From c1865445c5b88cae2df499e4d2047af156225177 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 19 Dec 2007 13:28:33 -0800 Subject: [PATCH] reorganized source tree --- {trunk/bench => bench}/mdtest/COPYRIGHT | 0 {trunk/bench => bench}/mdtest/Makefile | 0 {trunk/bench => bench}/mdtest/README | 0 {trunk/bench => bench}/mdtest/mdtest.c | 0 branches/marnberg/quota/Makefile | 257 - branches/marnberg/quota/TODO | 322 - branches/marnberg/quota/cfuse.cc | 81 - branches/marnberg/quota/client/Client.cc | 2757 ------- branches/marnberg/quota/client/Client.h | 597 -- branches/marnberg/quota/client/FileCache.cc | 180 - branches/marnberg/quota/client/FileCache.h | 70 - .../marnberg/quota/client/SyntheticClient.cc | 1283 ---- .../marnberg/quota/client/SyntheticClient.h | 201 - branches/marnberg/quota/client/Trace.cc | 125 - branches/marnberg/quota/client/Trace.h | 75 - branches/marnberg/quota/client/fuse.cc | 281 - branches/marnberg/quota/client/fuse.h | 23 - .../client/hadoop/CephClientInterface.cc | 217 - .../quota/client/hadoop/CephClientInterface.h | 115 - branches/marnberg/quota/client/ldceph.cc | 297 - branches/marnberg/quota/client/msgthread.h | 25 - branches/marnberg/quota/cmds.cc | 101 - branches/marnberg/quota/cmon.cc | 126 - branches/marnberg/quota/common/Clock.cc | 19 - branches/marnberg/quota/common/Clock.h | 203 - branches/marnberg/quota/common/Cond.h | 118 - branches/marnberg/quota/common/DecayCounter.h | 94 - branches/marnberg/quota/common/LogType.h | 119 - branches/marnberg/quota/common/Logger.cc | 214 - branches/marnberg/quota/common/Logger.h | 74 - branches/marnberg/quota/common/Mutex.h | 68 - branches/marnberg/quota/common/Semaphore.h | 52 - branches/marnberg/quota/common/Thread.h | 62 - branches/marnberg/quota/common/ThreadPool.h | 138 - branches/marnberg/quota/common/Timer.cc | 333 - branches/marnberg/quota/common/Timer.h | 177 - branches/marnberg/quota/config.cc | 834 --- branches/marnberg/quota/config.h | 326 - branches/marnberg/quota/cosd.cc | 124 - 
branches/marnberg/quota/crush/BinaryTree.h | 284 - branches/marnberg/quota/crush/Bucket.h | 631 -- branches/marnberg/quota/crush/Hash.h | 300 - branches/marnberg/quota/crush/crush.h | 534 -- branches/marnberg/quota/csyn.cc | 99 - branches/marnberg/quota/doc/Commitdir.txt | 22 - branches/marnberg/quota/doc/Replication.txt | 19 - branches/marnberg/quota/doc/caching.txt | 200 - branches/marnberg/quota/doc/dentries.txt | 4 - branches/marnberg/quota/doc/file_modes.txt | 66 - branches/marnberg/quota/doc/header.txt | 12 - branches/marnberg/quota/doc/journal.txt | 124 - branches/marnberg/quota/doc/osd_outline.txt | 37 - .../marnberg/quota/doc/osd_replication.txt | 226 - branches/marnberg/quota/doc/performance.txt | 36 - branches/marnberg/quota/doc/shutdown.txt | 13 - branches/marnberg/quota/ebofs/Allocator.cc | 692 -- branches/marnberg/quota/ebofs/Allocator.h | 85 - branches/marnberg/quota/ebofs/BlockDevice.cc | 777 -- branches/marnberg/quota/ebofs/BlockDevice.h | 338 - branches/marnberg/quota/ebofs/BufferCache.cc | 1147 --- branches/marnberg/quota/ebofs/BufferCache.h | 697 -- branches/marnberg/quota/ebofs/Cnode.h | 100 - branches/marnberg/quota/ebofs/Ebofs.cc | 3270 --------- branches/marnberg/quota/ebofs/Ebofs.h | 330 - branches/marnberg/quota/ebofs/Onode.h | 390 - branches/marnberg/quota/ebofs/Table.h | 898 --- branches/marnberg/quota/ebofs/mkfs.ebofs.cc | 299 - branches/marnberg/quota/ebofs/nodes.h | 583 -- branches/marnberg/quota/ebofs/test.ebofs.cc | 224 - branches/marnberg/quota/ebofs/types.h | 168 - branches/marnberg/quota/fakefuse.cc | 150 - branches/marnberg/quota/fakemon.cc | 178 - branches/marnberg/quota/fakesyn.cc | 190 - branches/marnberg/quota/include/Context.h | 136 - .../marnberg/quota/include/Distribution.h | 74 - branches/marnberg/quota/include/buffer.h | 1127 --- branches/marnberg/quota/include/error.h | 40 - branches/marnberg/quota/include/filepath.h | 206 - .../marnberg/quota/include/interval_set.h | 305 - branches/marnberg/quota/include/lru.h | 321 - 
branches/marnberg/quota/include/object.h | 97 - branches/marnberg/quota/include/oldbuffer.h | 357 - .../marnberg/quota/include/oldbufferlist.h | 681 -- branches/marnberg/quota/include/rangeset.h | 252 - branches/marnberg/quota/include/reqid.h | 64 - branches/marnberg/quota/include/statlite.h | 70 - branches/marnberg/quota/include/types.h | 367 - branches/marnberg/quota/include/uofs.h | 50 - branches/marnberg/quota/jobs/rados/wr_sizes | 50 - branches/marnberg/quota/mds/Anchor.h | 55 - branches/marnberg/quota/mds/AnchorClient.cc | 149 - branches/marnberg/quota/mds/AnchorClient.h | 55 - branches/marnberg/quota/mds/AnchorTable.cc | 358 - branches/marnberg/quota/mds/AnchorTable.h | 81 - branches/marnberg/quota/mds/CDentry.cc | 203 - branches/marnberg/quota/mds/CDentry.h | 288 - branches/marnberg/quota/mds/CDir.cc | 890 --- branches/marnberg/quota/mds/CDir.h | 617 -- branches/marnberg/quota/mds/CInode.cc | 506 -- branches/marnberg/quota/mds/CInode.h | 655 -- branches/marnberg/quota/mds/Capability.h | 214 - branches/marnberg/quota/mds/ClientMap.h | 85 - branches/marnberg/quota/mds/IdAllocator.cc | 200 - branches/marnberg/quota/mds/IdAllocator.h | 79 - branches/marnberg/quota/mds/Lock.h | 321 - branches/marnberg/quota/mds/Locker.cc | 2237 ------ branches/marnberg/quota/mds/Locker.h | 127 - branches/marnberg/quota/mds/LogEvent.cc | 67 - branches/marnberg/quota/mds/LogEvent.h | 106 - branches/marnberg/quota/mds/MDBalancer.cc | 878 --- branches/marnberg/quota/mds/MDBalancer.h | 109 - branches/marnberg/quota/mds/MDCache.cc | 3536 --------- branches/marnberg/quota/mds/MDCache.h | 364 - branches/marnberg/quota/mds/MDLog.cc | 431 -- branches/marnberg/quota/mds/MDLog.h | 127 - branches/marnberg/quota/mds/MDS.cc | 1021 --- branches/marnberg/quota/mds/MDS.h | 269 - branches/marnberg/quota/mds/MDSMap.h | 288 - branches/marnberg/quota/mds/MDStore.cc | 752 -- branches/marnberg/quota/mds/MDStore.h | 75 - branches/marnberg/quota/mds/Migrator.cc | 3616 --------- 
branches/marnberg/quota/mds/Migrator.h | 265 - branches/marnberg/quota/mds/Renamer.cc | 918 --- branches/marnberg/quota/mds/Renamer.h | 98 - branches/marnberg/quota/mds/Server.cc | 2376 ------ branches/marnberg/quota/mds/Server.h | 156 - branches/marnberg/quota/mds/events/EAlloc.h | 76 - .../marnberg/quota/mds/events/EExportFinish.h | 59 - .../marnberg/quota/mds/events/EExportStart.h | 68 - .../marnberg/quota/mds/events/EImportFinish.h | 59 - .../marnberg/quota/mds/events/EImportMap.h | 66 - .../marnberg/quota/mds/events/EImportStart.h | 60 - .../marnberg/quota/mds/events/EMetaBlob.h | 339 - .../marnberg/quota/mds/events/EPurgeFinish.h | 48 - branches/marnberg/quota/mds/events/EString.h | 56 - branches/marnberg/quota/mds/events/EUnlink.h | 71 - branches/marnberg/quota/mds/events/EUpdate.h | 49 - branches/marnberg/quota/mds/journal.cc | 589 -- branches/marnberg/quota/mds/mdstypes.h | 290 - .../marnberg/quota/messages/MAnchorReply.h | 74 - .../marnberg/quota/messages/MAnchorRequest.h | 76 - .../marnberg/quota/messages/MCacheExpire.h | 86 - .../marnberg/quota/messages/MClientBoot.h | 31 - .../marnberg/quota/messages/MClientFileCaps.h | 102 - .../quota/messages/MClientInodeAuthUpdate.h | 46 - .../marnberg/quota/messages/MClientMount.h | 34 - .../marnberg/quota/messages/MClientMountAck.h | 59 - .../marnberg/quota/messages/MClientReply.h | 302 - .../marnberg/quota/messages/MClientRequest.h | 202 - .../marnberg/quota/messages/MDentryUnlink.h | 45 - branches/marnberg/quota/messages/MDirExpire.h | 50 - .../marnberg/quota/messages/MDirExpireReq.h | 49 - branches/marnberg/quota/messages/MDirUpdate.h | 71 - branches/marnberg/quota/messages/MDiscover.h | 75 - .../marnberg/quota/messages/MDiscoverReply.h | 254 - branches/marnberg/quota/messages/MExportDir.h | 64 - .../marnberg/quota/messages/MExportDirAck.h | 42 - .../quota/messages/MExportDirDiscover.h | 51 - .../quota/messages/MExportDirDiscoverAck.h | 52 - .../quota/messages/MExportDirFinish.h | 43 - 
.../quota/messages/MExportDirNotify.h | 111 - .../quota/messages/MExportDirNotifyAck.h | 46 - .../marnberg/quota/messages/MExportDirPrep.h | 186 - .../quota/messages/MExportDirPrepAck.h | 44 - .../quota/messages/MExportDirWarning.h | 45 - branches/marnberg/quota/messages/MFailure.h | 49 - .../marnberg/quota/messages/MFailureAck.h | 42 - .../marnberg/quota/messages/MGenericMessage.h | 44 - branches/marnberg/quota/messages/MHashDir.h | 64 - .../marnberg/quota/messages/MHashDirAck.h | 42 - .../quota/messages/MHashDirDiscover.h | 52 - .../quota/messages/MHashDirDiscoverAck.h | 53 - .../marnberg/quota/messages/MHashDirNotify.h | 50 - .../marnberg/quota/messages/MHashDirPrep.h | 93 - .../marnberg/quota/messages/MHashDirPrepAck.h | 43 - .../marnberg/quota/messages/MHashReaddir.h | 44 - .../quota/messages/MHashReaddirReply.h | 80 - branches/marnberg/quota/messages/MHeartbeat.h | 81 - .../marnberg/quota/messages/MInodeExpire.h | 50 - .../marnberg/quota/messages/MInodeFileCaps.h | 55 - branches/marnberg/quota/messages/MInodeLink.h | 47 - .../marnberg/quota/messages/MInodeLinkAck.h | 47 - .../marnberg/quota/messages/MInodeUnlink.h | 47 - .../marnberg/quota/messages/MInodeUnlinkAck.h | 44 - .../marnberg/quota/messages/MInodeUpdate.h | 61 - branches/marnberg/quota/messages/MLock.h | 128 - branches/marnberg/quota/messages/MMDSBeacon.h | 54 - branches/marnberg/quota/messages/MMDSBoot.h | 38 - .../marnberg/quota/messages/MMDSCacheRejoin.h | 62 - .../quota/messages/MMDSCacheRejoinAck.h | 82 - branches/marnberg/quota/messages/MMDSGetMap.h | 38 - .../marnberg/quota/messages/MMDSImportMap.h | 59 - branches/marnberg/quota/messages/MMDSMap.h | 78 - .../marnberg/quota/messages/MMonElectionAck.h | 31 - .../quota/messages/MMonElectionCollect.h | 42 - .../quota/messages/MMonElectionPropose.h | 32 - .../quota/messages/MMonElectionRefresh.h | 51 - .../quota/messages/MMonElectionStatus.h | 50 - .../quota/messages/MMonElectionVictory.h | 40 - .../marnberg/quota/messages/MMonOSDMapInfo.h | 49 - 
.../marnberg/quota/messages/MMonOSDMapLease.h | 49 - .../quota/messages/MMonOSDMapLeaseAck.h | 44 - .../quota/messages/MMonOSDMapUpdateAck.h | 42 - .../quota/messages/MMonOSDMapUpdateCommit.h | 42 - .../quota/messages/MMonOSDMapUpdatePrepare.h | 52 - branches/marnberg/quota/messages/MMonPaxos.h | 80 - branches/marnberg/quota/messages/MNSConnect.h | 45 - .../marnberg/quota/messages/MNSConnectAck.h | 53 - branches/marnberg/quota/messages/MNSFailure.h | 52 - branches/marnberg/quota/messages/MNSLookup.h | 46 - .../marnberg/quota/messages/MNSLookupReply.h | 44 - .../marnberg/quota/messages/MNSRegister.h | 59 - .../marnberg/quota/messages/MNSRegisterAck.h | 53 - branches/marnberg/quota/messages/MOSDBoot.h | 44 - .../marnberg/quota/messages/MOSDFailure.h | 49 - branches/marnberg/quota/messages/MOSDGetMap.h | 45 - branches/marnberg/quota/messages/MOSDIn.h | 42 - branches/marnberg/quota/messages/MOSDMap.h | 69 - branches/marnberg/quota/messages/MOSDOp.h | 221 - .../marnberg/quota/messages/MOSDOpReply.h | 148 - branches/marnberg/quota/messages/MOSDOut.h | 42 - branches/marnberg/quota/messages/MOSDPGLog.h | 61 - .../marnberg/quota/messages/MOSDPGNotify.h | 54 - branches/marnberg/quota/messages/MOSDPGPeer.h | 57 - .../marnberg/quota/messages/MOSDPGPeerAck.h | 69 - .../quota/messages/MOSDPGPeerRequest.h | 50 - .../marnberg/quota/messages/MOSDPGQuery.h | 51 - .../marnberg/quota/messages/MOSDPGRemove.h | 51 - .../marnberg/quota/messages/MOSDPGSummary.h | 65 - .../marnberg/quota/messages/MOSDPGUpdate.h | 64 - branches/marnberg/quota/messages/MOSDPing.h | 50 - branches/marnberg/quota/messages/MPing.h | 41 - branches/marnberg/quota/messages/MPingAck.h | 40 - branches/marnberg/quota/messages/MRename.h | 80 - branches/marnberg/quota/messages/MRenameAck.h | 42 - .../marnberg/quota/messages/MRenameNotify.h | 80 - .../quota/messages/MRenameNotifyAck.h | 40 - .../marnberg/quota/messages/MRenamePrep.h | 85 - branches/marnberg/quota/messages/MRenameReq.h | 79 - 
.../marnberg/quota/messages/MRenameWarning.h | 40 - branches/marnberg/quota/messages/MUnhashDir.h | 42 - .../marnberg/quota/messages/MUnhashDirAck.h | 65 - .../quota/messages/MUnhashDirNotify.h | 50 - .../quota/messages/MUnhashDirNotifyAck.h | 42 - .../marnberg/quota/messages/MUnhashDirPrep.h | 42 - .../quota/messages/MUnhashDirPrepAck.h | 93 - branches/marnberg/quota/mkmonmap.cc | 67 - branches/marnberg/quota/mon/ClientMonitor.cc | 109 - branches/marnberg/quota/mon/ClientMonitor.h | 52 - branches/marnberg/quota/mon/Elector.cc | 219 - branches/marnberg/quota/mon/Elector.h | 72 - branches/marnberg/quota/mon/MDSMonitor.cc | 370 - branches/marnberg/quota/mon/MDSMonitor.h | 87 - branches/marnberg/quota/mon/MonMap.h | 103 - branches/marnberg/quota/mon/Monitor.cc | 303 - branches/marnberg/quota/mon/Monitor.h | 139 - branches/marnberg/quota/mon/MonitorStore.cc | 224 - branches/marnberg/quota/mon/MonitorStore.h | 81 - branches/marnberg/quota/mon/OSDMonitor.cc | 897 --- branches/marnberg/quota/mon/OSDMonitor.h | 110 - branches/marnberg/quota/mon/Paxos.cc | 182 - branches/marnberg/quota/mon/Paxos.h | 73 - branches/marnberg/quota/msg/Dispatcher.cc | 27 - branches/marnberg/quota/msg/Dispatcher.h | 33 - branches/marnberg/quota/msg/FakeMessenger.cc | 338 - branches/marnberg/quota/msg/FakeMessenger.h | 88 - branches/marnberg/quota/msg/HostMonitor.cc | 235 - branches/marnberg/quota/msg/HostMonitor.h | 97 - branches/marnberg/quota/msg/MPIMessenger.cc | 608 -- branches/marnberg/quota/msg/MPIMessenger.h | 56 - branches/marnberg/quota/msg/MTMessenger.cc | 197 - branches/marnberg/quota/msg/MTMessenger.h | 50 - branches/marnberg/quota/msg/Message.cc | 466 -- branches/marnberg/quota/msg/Message.h | 320 - branches/marnberg/quota/msg/Messenger.cc | 38 - branches/marnberg/quota/msg/Messenger.h | 86 - branches/marnberg/quota/msg/NewMessenger.cc | 1714 ----- branches/marnberg/quota/msg/NewMessenger.h | 305 - branches/marnberg/quota/msg/NewerMessenger.cc | 1791 ----- 
branches/marnberg/quota/msg/NewerMessenger.h | 343 - branches/marnberg/quota/msg/RWLock.h | 49 - branches/marnberg/quota/msg/SerialMessenger.h | 28 - .../marnberg/quota/msg/SimpleMessenger.cc | 1189 --- branches/marnberg/quota/msg/SimpleMessenger.h | 293 - branches/marnberg/quota/msg/TCPDirectory.cc | 178 - branches/marnberg/quota/msg/TCPDirectory.h | 110 - branches/marnberg/quota/msg/TCPMessenger.cc | 1454 ---- branches/marnberg/quota/msg/TCPMessenger.h | 115 - branches/marnberg/quota/msg/error.c | 77 - branches/marnberg/quota/msg/mpistarter.cc | 62 - branches/marnberg/quota/msg/msg_types.h | 186 - branches/marnberg/quota/msg/new_mpistarter.cc | 43 - branches/marnberg/quota/msg/tcp.cc | 87 - branches/marnberg/quota/msg/tcp.h | 37 - branches/marnberg/quota/newsyn.cc | 407 -- branches/marnberg/quota/osbdb/OSBDB.cc | 1905 ----- branches/marnberg/quota/osbdb/OSBDB.h | 478 -- branches/marnberg/quota/osd/Ager.cc | 331 - branches/marnberg/quota/osd/Ager.h | 42 - branches/marnberg/quota/osd/BDBMap.h | 136 - branches/marnberg/quota/osd/Fake.h | 249 - branches/marnberg/quota/osd/FakeStore.cc | 643 -- branches/marnberg/quota/osd/FakeStore.h | 110 - .../quota/osd/FakeStoreBDBCollections.h | 168 - branches/marnberg/quota/osd/OBFSStore.cc | 244 - branches/marnberg/quota/osd/OBFSStore.h | 56 - branches/marnberg/quota/osd/OSD.cc | 3494 --------- branches/marnberg/quota/osd/OSD.h | 273 - branches/marnberg/quota/osd/OSDMap.h | 519 -- branches/marnberg/quota/osd/ObjectStore.cc | 149 - branches/marnberg/quota/osd/ObjectStore.h | 505 -- branches/marnberg/quota/osd/PG.cc | 1333 ---- branches/marnberg/quota/osd/PG.h | 707 -- branches/marnberg/quota/osd/osd_types.h | 174 - branches/marnberg/quota/osd/rush.cc | 230 - branches/marnberg/quota/osd/rush.h | 60 - branches/marnberg/quota/osd/tp.cc | 80 - branches/marnberg/quota/osdc/Blinker.h | 91 - branches/marnberg/quota/osdc/Filer.cc | 235 - branches/marnberg/quota/osdc/Filer.h | 158 - branches/marnberg/quota/osdc/Journaler.cc | 610 -- 
branches/marnberg/quota/osdc/Journaler.h | 218 - branches/marnberg/quota/osdc/ObjectCacher.cc | 1499 ---- branches/marnberg/quota/osdc/ObjectCacher.h | 555 -- branches/marnberg/quota/osdc/Objecter.cc | 838 --- branches/marnberg/quota/osdc/Objecter.h | 197 - branches/marnberg/quota/script/add_header.pl | 29 - branches/marnberg/quota/script/comb.pl | 113 - .../marnberg/quota/script/find_auth_pins.pl | 46 - branches/marnberg/quota/tcpfuse.cc | 80 - branches/marnberg/quota/tcpsyn.cc | 292 - branches/marnberg/quota/valgrind.supp | 25 - branches/sage/crush/COPYING | 504 -- branches/sage/crush/Makefile | 311 - branches/sage/crush/README | 4 - branches/sage/crush/TODO | 292 - branches/sage/crush/active/README | 0 branches/sage/crush/active/activeslave.cc | 510 -- branches/sage/crush/active/activeslave.h | 23 - branches/sage/crush/active/common.h | 94 - branches/sage/crush/active/msgtestclient.cc | 418 -- branches/sage/crush/active/msgtestclient.h | 44 - branches/sage/crush/active/utility.h | 214 - branches/sage/crush/client/Client.cc | 3909 ---------- branches/sage/crush/client/Client.h | 847 --- branches/sage/crush/client/SyntheticClient.cc | 2882 -------- branches/sage/crush/client/SyntheticClient.h | 241 - branches/sage/crush/cmon.cc | 129 - branches/sage/crush/common/Clock.h | 104 - branches/sage/crush/common/Logger.cc | 320 - branches/sage/crush/config.cc | 1039 --- branches/sage/crush/config.h | 418 -- .../crush/crush.old/test/bucket_movement.cc | 166 - .../crush/crush.old/test/bucket_variance.cc | 199 - .../crush/crush.old/test/cluster_movement.cc | 217 - .../crush.old/test/cluster_movement_remove.cc | 229 - .../crush.old/test/cluster_movement_rush.cc | 218 - .../crush/crush.old/test/creeping_failure.cc | 276 - .../test/creeping_failure_variance.cc | 281 - .../crush/crush.old/test/depth_variance.cc | 185 - branches/sage/crush/crush.old/test/mixed.cc | 300 - .../sage/crush/crush.old/test/movement.cc | 223 - .../crush/crush.old/test/movement_failed.cc | 246 - 
.../sage/crush/crush.old/test/overload.cc | 335 - .../crush/crush.old/test/overload_variance.cc | 281 - branches/sage/crush/crush.old/test/sizes.cc | 131 - .../sage/crush/crush.old/test/smallbucket.cc | 138 - .../sage/crush/crush.old/test/speed_bucket.cc | 86 - .../sage/crush/crush.old/test/speed_depth.cc | 174 - .../sage/crush/crush.old/test/speed_rush.cc | 145 - branches/sage/crush/crush.old/test/t.cc | 25 - .../sage/crush/crush.old/test/testbucket.cc | 61 - .../sage/crush/crush.old/test/testnormal.cc | 51 - branches/sage/crush/crush/CrushWrapper.h | 227 - branches/sage/crush/crush/Makefile | 30 - branches/sage/crush/crush/builder.c | 375 - branches/sage/crush/crush/builder.h | 45 - branches/sage/crush/crush/crush.c | 81 - branches/sage/crush/crush/crush.h | 117 - branches/sage/crush/crush/mapper.c | 351 - branches/sage/crush/crush/test.c | 65 - branches/sage/crush/doc/bdb.txt | 48 - branches/sage/crush/doc/dentries.txt | 4 - branches/sage/crush/doc/file_modes.txt | 66 - branches/sage/crush/doc/inos.txt | 11 - branches/sage/crush/doc/journal.txt | 124 - branches/sage/crush/doc/lazy_posix.txt | 53 - branches/sage/crush/doc/osd_outline.txt | 37 - branches/sage/crush/doc/osd_replication.txt | 226 - .../crush/doc/shared_write_states_nogo.txt | 39 - branches/sage/crush/doc/shutdown.txt | 13 - branches/sage/crush/dupstore.cc | 102 - branches/sage/crush/ebofs/BlockDevice.cc | 846 --- branches/sage/crush/ebofs/BlockDevice.h | 351 - branches/sage/crush/ebofs/BufferCache.cc | 1228 ---- branches/sage/crush/ebofs/BufferCache.h | 723 -- branches/sage/crush/ebofs/Cnode.h | 101 - branches/sage/crush/ebofs/Ebofs.cc | 3628 ---------- branches/sage/crush/ebofs/Ebofs.h | 370 - branches/sage/crush/ebofs/FileJournal.h | 144 - branches/sage/crush/ebofs/Onode.h | 408 -- branches/sage/crush/ebofs/Table.h | 928 --- branches/sage/crush/ebofs/nodes.h | 568 -- branches/sage/crush/ebofs/test.ebofs.cc | 226 - branches/sage/crush/ebofs/types.h | 171 - branches/sage/crush/fakefuse.cc | 168 - 
branches/sage/crush/fakesyn.cc | 181 - branches/sage/crush/include/buffer.h | 1205 --- branches/sage/crush/include/ceph_fs.h | 163 - branches/sage/crush/include/filepath.h | 184 - branches/sage/crush/include/frag.h | 573 -- branches/sage/crush/include/hash.h | 70 - branches/sage/crush/include/interval_set.h | 315 - branches/sage/crush/include/object.h | 99 - branches/sage/crush/include/types.h | 303 - branches/sage/crush/include/utime.h | 149 - branches/sage/crush/jobs/alc.tp | 38 - branches/sage/crush/jobs/alcdat/makedirs | 45 - branches/sage/crush/jobs/alcdat/makedirs.big | 45 - branches/sage/crush/jobs/alcdat/makedirs.tput | 46 - .../sage/crush/jobs/alcdat/makefiles.shared | 32 - branches/sage/crush/jobs/alcdat/openshared | 32 - branches/sage/crush/jobs/alcdat/ossh.include | 45 - .../sage/crush/jobs/alcdat/ossh.include.big | 46 - branches/sage/crush/jobs/alcdat/ossh.lib | 45 - branches/sage/crush/jobs/alcdat/ossh.lib.big | 46 - branches/sage/crush/jobs/alcdat/striping | 48 - branches/sage/crush/jobs/example | 56 - branches/sage/crush/jobs/mds/log_striping | 36 - branches/sage/crush/jobs/mds/makedir_lat | 33 - branches/sage/crush/jobs/mds/makedirs | 40 - branches/sage/crush/jobs/mds/opensshlib | 44 - branches/sage/crush/jobs/meta1 | 19 - branches/sage/crush/jobs/meta1.proc.sh | 14 - branches/sage/crush/jobs/osd/ebofs | 51 - branches/sage/crush/jobs/osd/mds_log | 43 - branches/sage/crush/jobs/osd/osd_threads | 33 - branches/sage/crush/jobs/osd/striping | 78 - branches/sage/crush/jobs/osd/wr_lat2 | 44 - branches/sage/crush/jobs/osd/write_sizes | 60 - branches/sage/crush/jobs/rados/map_dist | 32 - branches/sage/crush/jobs/rados/rep_lat | 43 - branches/sage/crush/jobs/rados/wr_sizes | 50 - branches/sage/crush/kernel/Makefile | 7 - branches/sage/crush/kernel/bufferlist.h | 74 - branches/sage/crush/kernel/inode.c | 136 - branches/sage/crush/kernel/kmsg.h | 51 - branches/sage/crush/kernel/kmsgbits.h | 50 - branches/sage/crush/kernel/mds_client.h | 42 - 
branches/sage/crush/kernel/mdsmap.h | 42 - branches/sage/crush/kernel/monmap.h | 19 - branches/sage/crush/kernel/osd_client.h | 18 - branches/sage/crush/kernel/super.h | 75 - branches/sage/crush/mds/Anchor.h | 108 - branches/sage/crush/mds/AnchorClient.cc | 379 - branches/sage/crush/mds/AnchorTable.cc | 713 -- branches/sage/crush/mds/CDentry.cc | 365 - branches/sage/crush/mds/CDentry.h | 323 - branches/sage/crush/mds/CDir.cc | 1676 ----- branches/sage/crush/mds/CInode.cc | 838 --- branches/sage/crush/mds/CInode.h | 612 -- branches/sage/crush/mds/ClientMap.h | 194 - branches/sage/crush/mds/Locker.cc | 2905 -------- branches/sage/crush/mds/LogEvent.cc | 83 - branches/sage/crush/mds/LogEvent.h | 95 - branches/sage/crush/mds/LogSegment.h | 69 - branches/sage/crush/mds/MDBalancer.cc | 1050 --- branches/sage/crush/mds/MDCache.cc | 6281 ---------------- branches/sage/crush/mds/MDCache.h | 721 -- branches/sage/crush/mds/MDLog.cc | 505 -- branches/sage/crush/mds/MDLog.h | 195 - branches/sage/crush/mds/MDS.cc | 1269 ---- branches/sage/crush/mds/MDS.h | 297 - branches/sage/crush/mds/MDSMap.h | 357 - branches/sage/crush/mds/Migrator.cc | 2114 ------ branches/sage/crush/mds/Migrator.h | 260 - branches/sage/crush/mds/Server.cc | 3976 ---------- branches/sage/crush/mds/Server.h | 184 - branches/sage/crush/mds/events/EImportStart.h | 61 - branches/sage/crush/mds/events/ESession.h | 64 - branches/sage/crush/mds/events/EUpdate.h | 50 - branches/sage/crush/mds/journal.cc | 1084 --- branches/sage/crush/mds/mdstypes.h | 689 -- .../sage/crush/messages/MClientFileCaps.h | 109 - branches/sage/crush/messages/MClientMount.h | 43 - .../sage/crush/messages/MClientReconnect.h | 59 - branches/sage/crush/messages/MClientReply.h | 285 - branches/sage/crush/messages/MClientRequest.h | 325 - .../crush/messages/MClientRequestForward.h | 59 - branches/sage/crush/messages/MClientSession.h | 62 - branches/sage/crush/messages/MClientUnmount.h | 40 - branches/sage/crush/messages/MDirUpdate.h | 71 - 
.../sage/crush/messages/MExportDirDiscover.h | 59 - branches/sage/crush/messages/MLock.h | 126 - branches/sage/crush/messages/MMDSBeacon.h | 63 - branches/sage/crush/messages/MMDSGetMap.h | 39 - branches/sage/crush/messages/MMDSMap.h | 79 - .../sage/crush/messages/MMDSSlaveRequest.h | 148 - branches/sage/crush/messages/MOSDGetMap.h | 51 - branches/sage/crush/messages/MOSDMap.h | 71 - branches/sage/crush/messages/MOSDOp.h | 280 - branches/sage/crush/messages/MOSDOpReply.h | 164 - branches/sage/crush/messages/MOSDPGQuery.h | 52 - branches/sage/crush/messages/MPing.h | 43 - branches/sage/crush/messages/MPingAck.h | 42 - branches/sage/crush/messages/MStatfs.h | 42 - branches/sage/crush/messages/MStatfsReply.h | 45 - branches/sage/crush/mkmonmap.cc | 68 - branches/sage/crush/mon/ClientMonitor.cc | 256 - branches/sage/crush/mon/ClientMonitor.h | 178 - branches/sage/crush/mon/Elector.cc | 293 - branches/sage/crush/mon/MDSMonitor.cc | 625 -- branches/sage/crush/mon/MDSMonitor.h | 98 - branches/sage/crush/mon/MonMap.h | 101 - branches/sage/crush/mon/Monitor.cc | 405 -- branches/sage/crush/mon/Monitor.h | 154 - branches/sage/crush/mon/MonitorStore.cc | 222 - branches/sage/crush/mon/OSDMonitor.cc | 847 --- branches/sage/crush/mon/OSDMonitor.h | 131 - branches/sage/crush/mon/PGMonitor.cc | 219 - branches/sage/crush/mon/Paxos.cc | 784 -- branches/sage/crush/mon/Paxos.h | 251 - branches/sage/crush/mon/PaxosService.h | 107 - branches/sage/crush/msg/FakeMessenger.cc | 416 -- branches/sage/crush/msg/FakeMessenger.h | 98 - branches/sage/crush/msg/Message.cc | 372 - branches/sage/crush/msg/Message.h | 259 - branches/sage/crush/msg/Messenger.h | 99 - branches/sage/crush/msg/SimpleMessenger.cc | 1445 ---- branches/sage/crush/msg/SimpleMessenger.h | 314 - branches/sage/crush/msg/msg_types.h | 192 - branches/sage/crush/msg/tcp.cc | 93 - branches/sage/crush/msg/tcp.h | 69 - branches/sage/crush/newsyn.cc | 438 -- branches/sage/crush/osd/Fake.h | 262 - branches/sage/crush/osd/FakeStore.cc | 
742 -- branches/sage/crush/osd/FakeStore.h | 114 - branches/sage/crush/osd/OSD.cc | 2377 ------ branches/sage/crush/osd/OSD.h | 366 - branches/sage/crush/osd/OSDMap.h | 539 -- branches/sage/crush/osd/ObjectStore.h | 611 -- branches/sage/crush/osd/PG.cc | 1289 ---- branches/sage/crush/osd/PG.h | 754 -- branches/sage/crush/osd/RAID4PG.cc | 123 - branches/sage/crush/osd/RAID4PG.h | 74 - branches/sage/crush/osd/ReplicatedPG.cc | 1972 ----- branches/sage/crush/osd/ReplicatedPG.h | 170 - branches/sage/crush/osd/osd_types.h | 321 - branches/sage/crush/osdc/Journaler.h | 236 - branches/sage/crush/osdc/Objecter.cc | 913 --- branches/sage/crush/osdc/Objecter.h | 230 - branches/sage/crush/script/adjusttabs.pl | 24 - branches/sage/crush/script/clean_osd_cow.sh | 3 - branches/sage/crush/script/clean_trace.pl | 8 - .../sage/crush/script/find_bufferleaks.pl | 69 - .../sage/crush/script/find_lost_bdev_ops.pl | 34 - .../sage/crush/script/find_lost_commit.pl | 38 - .../sage/crush/script/find_lost_objecter.pl | 34 - branches/sage/crush/script/find_pathpins.pl | 41 - branches/sage/crush/script/find_requests.pl | 42 - branches/sage/crush/script/find_waiters.pl | 46 - branches/sage/crush/script/grepblock | 15 - branches/sage/crush/script/merge_trace_rw.pl | 42 - branches/sage/crush/script/profonly.pl | 12 - branches/sage/crush/script/runset.pl | 380 - branches/sage/crush/script/sum.pl | 148 - branches/sage/crush/test/fakemds.cc | 104 - branches/sage/crush/test/gprof-helper.c | 120 - branches/sage/crush/test/makedirs.cc | 38 - branches/sage/crush/test/mpitest.cc | 111 - branches/sage/crush/test/mttest.cc | 140 - branches/sage/crush/test/rushconfig | 7 - branches/sage/crush/test/rushtest.cc | 49 - branches/sage/crush/test/rushtest.cc~ | 49 - branches/sage/crush/test/test_seek_read.c | 53 - branches/sage/crush/test/testbucket.cc | 67 - branches/sage/crush/test/testbuffers.cc | 40 - branches/sage/crush/test/testcrush.cc | 266 - branches/sage/crush/test/testfilepath.cc | 22 - 
branches/sage/crush/test/testmpi.cc | 53 - branches/sage/crush/test/testnewbuffers.cc | 91 - branches/sage/crush/test/testos.cc | 343 - branches/sage/crush/test/testosbdb.cc | 347 - branches/sage/crush/test/testtree.cc | 46 - branches/sage/crush/test/testxattr.cc | 31 - branches/sage/crush/valgrind.supp | 62 - branches/sage/ebofs2/COPYING | 504 -- branches/sage/ebofs2/Makefile | 311 - branches/sage/ebofs2/README | 4 - branches/sage/ebofs2/TODO | 259 - branches/sage/ebofs2/active/README | 0 branches/sage/ebofs2/active/activemaster.cc | 115 - branches/sage/ebofs2/active/activemaster.h | 18 - branches/sage/ebofs2/active/activeslave.cc | 510 -- branches/sage/ebofs2/active/activeslave.h | 23 - branches/sage/ebofs2/active/activetaskd.cc | 241 - branches/sage/ebofs2/active/activetaskd.h | 14 - branches/sage/ebofs2/active/client_init.cc | 1 - branches/sage/ebofs2/active/client_init.h | 2 - branches/sage/ebofs2/active/common.h | 94 - branches/sage/ebofs2/active/echotestclient.cc | 74 - branches/sage/ebofs2/active/echotestclient.h | 10 - branches/sage/ebofs2/active/inet.h | 9 - branches/sage/ebofs2/active/msgtestclient.cc | 418 -- branches/sage/ebofs2/active/msgtestclient.h | 44 - branches/sage/ebofs2/active/trivial_task.cc | 50 - branches/sage/ebofs2/active/trivial_task.h | 12 - branches/sage/ebofs2/active/utility.h | 214 - branches/sage/ebofs2/cfuse.cc | 88 - branches/sage/ebofs2/client/Client.cc | 3909 ---------- branches/sage/ebofs2/client/Client.h | 847 --- branches/sage/ebofs2/client/FileCache.cc | 266 - branches/sage/ebofs2/client/FileCache.h | 85 - .../sage/ebofs2/client/SyntheticClient.cc | 2882 -------- branches/sage/ebofs2/client/SyntheticClient.h | 241 - branches/sage/ebofs2/client/Trace.cc | 83 - branches/sage/ebofs2/client/Trace.h | 63 - branches/sage/ebofs2/client/fuse.cc | 306 - branches/sage/ebofs2/client/fuse.h | 24 - branches/sage/ebofs2/client/fuse_ll.cc | 397 - branches/sage/ebofs2/client/fuse_ll.h | 15 - .../ebofs2/client/hadoop/CephFSInterface.cc | 789 
-- .../ebofs2/client/hadoop/CephFSInterface.h | 239 - branches/sage/ebofs2/client/ldceph.cc | 298 - branches/sage/ebofs2/cmds.cc | 108 - branches/sage/ebofs2/cmonctl.cc | 92 - branches/sage/ebofs2/common/Clock.cc | 20 - branches/sage/ebofs2/common/Clock.h | 104 - branches/sage/ebofs2/common/Cond.h | 119 - branches/sage/ebofs2/common/DecayCounter.h | 138 - branches/sage/ebofs2/common/LogType.h | 122 - branches/sage/ebofs2/common/Logger.cc | 320 - branches/sage/ebofs2/common/Logger.h | 77 - branches/sage/ebofs2/common/Mutex.h | 83 - branches/sage/ebofs2/common/RWLock.h | 50 - branches/sage/ebofs2/common/Semaphore.h | 53 - branches/sage/ebofs2/common/Thread.h | 81 - branches/sage/ebofs2/common/ThreadPool.h | 139 - branches/sage/ebofs2/common/Timer.cc | 335 - branches/sage/ebofs2/common/Timer.h | 175 - branches/sage/ebofs2/config.cc | 1029 --- branches/sage/ebofs2/config.h | 418 -- branches/sage/ebofs2/cosd.cc | 135 - branches/sage/ebofs2/crush.old/BinaryTree.h | 285 - branches/sage/ebofs2/crush.old/Bucket.h | 632 -- branches/sage/ebofs2/crush.old/Hash.h | 301 - branches/sage/ebofs2/crush.old/crush.h | 543 -- .../ebofs2/crush.old/test/bucket_movement.cc | 166 - .../ebofs2/crush.old/test/bucket_variance.cc | 199 - .../ebofs2/crush.old/test/cluster_movement.cc | 217 - .../crush.old/test/cluster_movement_remove.cc | 229 - .../crush.old/test/cluster_movement_rush.cc | 218 - .../ebofs2/crush.old/test/creeping_failure.cc | 276 - .../test/creeping_failure_variance.cc | 281 - .../ebofs2/crush.old/test/depth_variance.cc | 185 - branches/sage/ebofs2/crush.old/test/mixed.cc | 300 - .../sage/ebofs2/crush.old/test/movement.cc | 223 - .../ebofs2/crush.old/test/movement_failed.cc | 246 - .../sage/ebofs2/crush.old/test/overload.cc | 335 - .../crush.old/test/overload_variance.cc | 281 - branches/sage/ebofs2/crush.old/test/sizes.cc | 131 - .../sage/ebofs2/crush.old/test/smallbucket.cc | 138 - .../ebofs2/crush.old/test/speed_bucket.cc | 86 - .../sage/ebofs2/crush.old/test/speed_depth.cc 
| 174 - .../sage/ebofs2/crush.old/test/speed_rush.cc | 145 - branches/sage/ebofs2/crush.old/test/t.cc | 25 - .../sage/ebofs2/crush.old/test/testbucket.cc | 61 - .../sage/ebofs2/crush.old/test/testnormal.cc | 51 - branches/sage/ebofs2/crush/CrushWrapper.h | 227 - branches/sage/ebofs2/crush/Makefile | 30 - branches/sage/ebofs2/crush/buckets.c | 6 - branches/sage/ebofs2/crush/builder.c | 375 - branches/sage/ebofs2/crush/builder.h | 45 - branches/sage/ebofs2/crush/crush.c | 81 - branches/sage/ebofs2/crush/crush.h | 117 - branches/sage/ebofs2/crush/hash.h | 80 - branches/sage/ebofs2/crush/mapper.c | 351 - branches/sage/ebofs2/crush/mapper.h | 19 - branches/sage/ebofs2/crush/test.c | 65 - branches/sage/ebofs2/crush/types.h | 18 - branches/sage/ebofs2/csyn.cc | 87 - branches/sage/ebofs2/doc/Commitdir.txt | 24 - branches/sage/ebofs2/doc/anchortable.txt | 54 - branches/sage/ebofs2/doc/bdb.txt | 48 - branches/sage/ebofs2/doc/caching.txt | 303 - branches/sage/ebofs2/doc/dentries.txt | 4 - branches/sage/ebofs2/doc/exports.txt | 72 - branches/sage/ebofs2/doc/file_modes.txt | 66 - branches/sage/ebofs2/doc/header.txt | 13 - branches/sage/ebofs2/doc/inos.txt | 11 - branches/sage/ebofs2/doc/journal.txt | 124 - branches/sage/ebofs2/doc/lazy_posix.txt | 53 - branches/sage/ebofs2/doc/mds_locks.txt | 66 - branches/sage/ebofs2/doc/modeline.txt | 2 - branches/sage/ebofs2/doc/osd_outline.txt | 37 - branches/sage/ebofs2/doc/osd_replication.txt | 226 - .../ebofs2/doc/shared_write_states_nogo.txt | 39 - branches/sage/ebofs2/doc/shutdown.txt | 13 - branches/sage/ebofs2/dupstore.cc | 102 - branches/sage/ebofs2/ebofs/Allocator.cc | 693 -- branches/sage/ebofs2/ebofs/Allocator.h | 85 - branches/sage/ebofs2/ebofs/BlockDevice.cc | 846 --- branches/sage/ebofs2/ebofs/BlockDevice.h | 351 - branches/sage/ebofs2/ebofs/BufferCache.cc | 1228 ---- branches/sage/ebofs2/ebofs/BufferCache.h | 723 -- branches/sage/ebofs2/ebofs/Cnode.h | 101 - branches/sage/ebofs2/ebofs/Ebofs.cc | 3628 ---------- 
branches/sage/ebofs2/ebofs/Ebofs.h | 370 - branches/sage/ebofs2/ebofs/FileJournal.cc | 456 -- branches/sage/ebofs2/ebofs/FileJournal.h | 144 - branches/sage/ebofs2/ebofs/Journal.h | 47 - branches/sage/ebofs2/ebofs/Onode.h | 408 -- branches/sage/ebofs2/ebofs/Table.h | 928 --- branches/sage/ebofs2/ebofs/mkfs.ebofs.cc | 349 - branches/sage/ebofs2/ebofs/nodes.h | 568 -- branches/sage/ebofs2/ebofs/test.ebofs.cc | 226 - branches/sage/ebofs2/ebofs/types.h | 171 - branches/sage/ebofs2/extractosdmaps.cc | 64 - branches/sage/ebofs2/fakefuse.cc | 168 - branches/sage/ebofs2/fakesyn.cc | 181 - branches/sage/ebofs2/include/Context.h | 153 - branches/sage/ebofs2/include/Distribution.h | 75 - branches/sage/ebofs2/include/atomic.h | 56 - branches/sage/ebofs2/include/bitmapper.h | 48 - branches/sage/ebofs2/include/blobhash.h | 47 - branches/sage/ebofs2/include/buffer.h | 1161 --- branches/sage/ebofs2/include/ceph_fs.h | 179 - branches/sage/ebofs2/include/encodable.h | 424 -- branches/sage/ebofs2/include/error.h | 41 - branches/sage/ebofs2/include/filepath.h | 184 - branches/sage/ebofs2/include/frag.h | 573 -- branches/sage/ebofs2/include/hash.h | 70 - branches/sage/ebofs2/include/interval_set.h | 315 - branches/sage/ebofs2/include/lru.h | 341 - branches/sage/ebofs2/include/object.h | 99 - branches/sage/ebofs2/include/rangeset.h | 253 - branches/sage/ebofs2/include/statlite.h | 72 - branches/sage/ebofs2/include/triple.h | 28 - branches/sage/ebofs2/include/types.h | 303 - branches/sage/ebofs2/include/uofs.h | 51 - branches/sage/ebofs2/include/utime.h | 149 - branches/sage/ebofs2/include/xlist.h | 123 - branches/sage/ebofs2/jobs/alc.tp | 38 - branches/sage/ebofs2/jobs/alcdat/makedirs | 45 - branches/sage/ebofs2/jobs/alcdat/makedirs.big | 45 - .../sage/ebofs2/jobs/alcdat/makedirs.tput | 46 - .../sage/ebofs2/jobs/alcdat/makefiles.shared | 32 - branches/sage/ebofs2/jobs/alcdat/openshared | 32 - branches/sage/ebofs2/jobs/alcdat/ossh.include | 45 - 
.../sage/ebofs2/jobs/alcdat/ossh.include.big | 46 - branches/sage/ebofs2/jobs/alcdat/ossh.lib | 45 - branches/sage/ebofs2/jobs/alcdat/ossh.lib.big | 46 - branches/sage/ebofs2/jobs/alcdat/striping | 48 - branches/sage/ebofs2/jobs/example | 56 - branches/sage/ebofs2/jobs/mds/log_striping | 36 - branches/sage/ebofs2/jobs/mds/makedir_lat | 33 - branches/sage/ebofs2/jobs/mds/makedirs | 40 - branches/sage/ebofs2/jobs/mds/opensshlib | 44 - branches/sage/ebofs2/jobs/meta1 | 19 - branches/sage/ebofs2/jobs/meta1.proc.sh | 14 - branches/sage/ebofs2/jobs/osd/ebofs | 51 - branches/sage/ebofs2/jobs/osd/mds_log | 43 - branches/sage/ebofs2/jobs/osd/osd_threads | 33 - branches/sage/ebofs2/jobs/osd/striping | 78 - branches/sage/ebofs2/jobs/osd/wr_lat2 | 44 - branches/sage/ebofs2/jobs/osd/write_sizes | 60 - branches/sage/ebofs2/jobs/rados/map_dist | 32 - branches/sage/ebofs2/jobs/rados/rep_lat | 43 - branches/sage/ebofs2/jobs/rados/wr_sizes | 50 - branches/sage/ebofs2/jobs/runjobsample | 26 - branches/sage/ebofs2/kernel/Makefile | 7 - branches/sage/ebofs2/kernel/accepter.h | 21 - branches/sage/ebofs2/kernel/bufferlist.c | 147 - branches/sage/ebofs2/kernel/bufferlist.h | 29 - branches/sage/ebofs2/kernel/inode.c | 136 - branches/sage/ebofs2/kernel/kmsg.h | 68 - branches/sage/ebofs2/kernel/ktcp.c | 138 - branches/sage/ebofs2/kernel/ktcp.h | 8 - branches/sage/ebofs2/kernel/mds_client.c | 287 - branches/sage/ebofs2/kernel/mds_client.h | 67 - branches/sage/ebofs2/kernel/mdsmap.c | 96 - branches/sage/ebofs2/kernel/mdsmap.h | 46 - branches/sage/ebofs2/kernel/messenger.c | 60 - branches/sage/ebofs2/kernel/mon_client.h | 17 - branches/sage/ebofs2/kernel/monmap.h | 19 - branches/sage/ebofs2/kernel/osd_client.h | 17 - branches/sage/ebofs2/kernel/super.h | 75 - branches/sage/ebofs2/mds/AnchorClient.h | 107 - branches/sage/ebofs2/mds/AnchorTable.h | 127 - branches/sage/ebofs2/mds/CDentry.cc | 365 - branches/sage/ebofs2/mds/CDentry.h | 323 - branches/sage/ebofs2/mds/CDir.cc | 1676 ----- 
branches/sage/ebofs2/mds/CDir.h | 540 -- branches/sage/ebofs2/mds/CInode.cc | 838 --- branches/sage/ebofs2/mds/CInode.h | 612 -- branches/sage/ebofs2/mds/Capability.h | 245 - branches/sage/ebofs2/mds/ClientMap.cc | 126 - branches/sage/ebofs2/mds/ClientMap.h | 194 - branches/sage/ebofs2/mds/FileLock.h | 227 - branches/sage/ebofs2/mds/IdAllocator.cc | 205 - branches/sage/ebofs2/mds/IdAllocator.h | 78 - branches/sage/ebofs2/mds/LocalLock.h | 61 - branches/sage/ebofs2/mds/Locker.cc | 2900 -------- branches/sage/ebofs2/mds/Locker.h | 195 - branches/sage/ebofs2/mds/LogEvent.cc | 83 - branches/sage/ebofs2/mds/LogEvent.h | 95 - branches/sage/ebofs2/mds/LogSegment.h | 69 - branches/sage/ebofs2/mds/MDBalancer.cc | 1049 --- branches/sage/ebofs2/mds/MDBalancer.h | 118 - branches/sage/ebofs2/mds/MDCache.cc | 6278 ---------------- branches/sage/ebofs2/mds/MDCache.h | 721 -- branches/sage/ebofs2/mds/MDLog.cc | 505 -- branches/sage/ebofs2/mds/MDLog.h | 195 - branches/sage/ebofs2/mds/MDS.cc | 1295 ---- branches/sage/ebofs2/mds/MDS.h | 298 - branches/sage/ebofs2/mds/MDSMap.h | 380 - branches/sage/ebofs2/mds/Migrator.cc | 2109 ------ branches/sage/ebofs2/mds/Migrator.h | 260 - branches/sage/ebofs2/mds/ScatterLock.h | 183 - branches/sage/ebofs2/mds/Server.cc | 3975 ---------- branches/sage/ebofs2/mds/Server.h | 184 - branches/sage/ebofs2/mds/SimpleLock.h | 301 - branches/sage/ebofs2/mds/events/EAnchor.h | 80 - .../sage/ebofs2/mds/events/EAnchorClient.h | 56 - branches/sage/ebofs2/mds/events/EExport.h | 63 - branches/sage/ebofs2/mds/events/EFragment.h | 54 - .../sage/ebofs2/mds/events/EImportFinish.h | 60 - .../sage/ebofs2/mds/events/EImportStart.h | 61 - branches/sage/ebofs2/mds/events/EMetaBlob.h | 501 -- branches/sage/ebofs2/mds/events/EOpen.h | 53 - .../sage/ebofs2/mds/events/EPurgeFinish.h | 54 - branches/sage/ebofs2/mds/events/ESession.h | 64 - .../sage/ebofs2/mds/events/ESlaveUpdate.h | 79 - branches/sage/ebofs2/mds/events/EString.h | 56 - 
branches/sage/ebofs2/mds/events/ESubtreeMap.h | 47 - branches/sage/ebofs2/mds/events/EUpdate.h | 50 - branches/sage/ebofs2/mds/journal.cc | 1084 --- branches/sage/ebofs2/mds/mdstypes.h | 684 -- branches/sage/ebofs2/messages/MAnchor.h | 74 - branches/sage/ebofs2/messages/MCacheExpire.h | 127 - .../sage/ebofs2/messages/MClientFileCaps.h | 109 - branches/sage/ebofs2/messages/MClientMount.h | 43 - .../sage/ebofs2/messages/MClientReconnect.h | 59 - branches/sage/ebofs2/messages/MClientReply.h | 285 - .../sage/ebofs2/messages/MClientRequest.h | 325 - .../ebofs2/messages/MClientRequestForward.h | 59 - .../sage/ebofs2/messages/MClientSession.h | 62 - .../sage/ebofs2/messages/MClientUnmount.h | 40 - branches/sage/ebofs2/messages/MDentryUnlink.h | 82 - branches/sage/ebofs2/messages/MDirUpdate.h | 71 - branches/sage/ebofs2/messages/MDiscover.h | 108 - .../sage/ebofs2/messages/MDiscoverReply.h | 300 - branches/sage/ebofs2/messages/MExportDir.h | 65 - branches/sage/ebofs2/messages/MExportDirAck.h | 46 - .../sage/ebofs2/messages/MExportDirCancel.h | 49 - .../sage/ebofs2/messages/MExportDirDiscover.h | 59 - .../ebofs2/messages/MExportDirDiscoverAck.h | 60 - .../sage/ebofs2/messages/MExportDirFinish.h | 46 - .../sage/ebofs2/messages/MExportDirNotify.h | 85 - .../ebofs2/messages/MExportDirNotifyAck.h | 50 - .../sage/ebofs2/messages/MExportDirPrep.h | 205 - .../sage/ebofs2/messages/MExportDirPrepAck.h | 47 - .../sage/ebofs2/messages/MExportDirWarning.h | 50 - .../ebofs2/messages/MExportDirWarningAck.h | 45 - .../sage/ebofs2/messages/MGenericMessage.h | 45 - branches/sage/ebofs2/messages/MHeartbeat.h | 60 - .../sage/ebofs2/messages/MInodeFileCaps.h | 57 - branches/sage/ebofs2/messages/MLock.h | 126 - branches/sage/ebofs2/messages/MMDSBoot.h | 39 - .../sage/ebofs2/messages/MMDSCacheRejoin.h | 230 - .../sage/ebofs2/messages/MMDSFragmentNotify.h | 60 - branches/sage/ebofs2/messages/MMDSGetMap.h | 39 - branches/sage/ebofs2/messages/MMDSMap.h | 79 - 
branches/sage/ebofs2/messages/MMDSResolve.h | 66 - .../sage/ebofs2/messages/MMDSResolveAck.h | 56 - .../sage/ebofs2/messages/MMDSSlaveRequest.h | 148 - branches/sage/ebofs2/messages/MMonCommand.h | 54 - .../sage/ebofs2/messages/MMonCommandAck.h | 46 - branches/sage/ebofs2/messages/MMonElection.h | 63 - .../ebofs2/messages/MMonElectionCollect.h | 43 - .../ebofs2/messages/MMonElectionRefresh.h | 52 - .../sage/ebofs2/messages/MMonElectionStatus.h | 51 - .../sage/ebofs2/messages/MMonOSDMapInfo.h | 50 - .../sage/ebofs2/messages/MMonOSDMapLease.h | 50 - .../sage/ebofs2/messages/MMonOSDMapLeaseAck.h | 45 - .../ebofs2/messages/MMonOSDMapUpdateAck.h | 43 - .../ebofs2/messages/MMonOSDMapUpdateCommit.h | 43 - .../ebofs2/messages/MMonOSDMapUpdatePrepare.h | 53 - branches/sage/ebofs2/messages/MMonPaxos.h | 98 - branches/sage/ebofs2/messages/MOSDBoot.h | 51 - branches/sage/ebofs2/messages/MOSDFailure.h | 55 - branches/sage/ebofs2/messages/MOSDGetMap.h | 51 - branches/sage/ebofs2/messages/MOSDIn.h | 43 - branches/sage/ebofs2/messages/MOSDMap.h | 71 - branches/sage/ebofs2/messages/MOSDOp.h | 280 - branches/sage/ebofs2/messages/MOSDOpReply.h | 164 - branches/sage/ebofs2/messages/MOSDOut.h | 43 - .../sage/ebofs2/messages/MOSDPGActivateSet.h | 50 - branches/sage/ebofs2/messages/MOSDPGLog.h | 59 - branches/sage/ebofs2/messages/MOSDPGNotify.h | 55 - branches/sage/ebofs2/messages/MOSDPGPeer.h | 58 - branches/sage/ebofs2/messages/MOSDPGPeerAck.h | 70 - .../sage/ebofs2/messages/MOSDPGPeerRequest.h | 51 - branches/sage/ebofs2/messages/MOSDPGQuery.h | 52 - branches/sage/ebofs2/messages/MOSDPGRemove.h | 52 - branches/sage/ebofs2/messages/MOSDPGSummary.h | 69 - branches/sage/ebofs2/messages/MOSDPGUpdate.h | 71 - branches/sage/ebofs2/messages/MOSDPing.h | 49 - branches/sage/ebofs2/messages/MPGStats.h | 43 - branches/sage/ebofs2/messages/MPing.h | 43 - branches/sage/ebofs2/messages/MPingAck.h | 42 - branches/sage/ebofs2/messages/MStatfs.h | 42 - branches/sage/ebofs2/messages/MStatfsReply.h | 45 
- branches/sage/ebofs2/mkmonmap.cc | 68 - branches/sage/ebofs2/mon/ClientMonitor.cc | 256 - branches/sage/ebofs2/mon/ClientMonitor.h | 178 - branches/sage/ebofs2/mon/Elector.cc | 293 - branches/sage/ebofs2/mon/Elector.h | 92 - branches/sage/ebofs2/mon/MDSMonitor.cc | 667 -- branches/sage/ebofs2/mon/MDSMonitor.h | 100 - branches/sage/ebofs2/mon/MonMap.h | 101 - branches/sage/ebofs2/mon/Monitor.cc | 405 -- branches/sage/ebofs2/mon/Monitor.h | 154 - branches/sage/ebofs2/mon/MonitorStore.cc | 222 - branches/sage/ebofs2/mon/MonitorStore.h | 82 - branches/sage/ebofs2/mon/OSDMonitor.cc | 847 --- branches/sage/ebofs2/mon/OSDMonitor.h | 131 - branches/sage/ebofs2/mon/PGMap.h | 103 - branches/sage/ebofs2/mon/PGMonitor.cc | 219 - branches/sage/ebofs2/mon/PGMonitor.h | 58 - branches/sage/ebofs2/mon/Paxos.cc | 784 -- branches/sage/ebofs2/mon/Paxos.h | 251 - branches/sage/ebofs2/mon/PaxosService.cc | 172 - branches/sage/ebofs2/mon/PaxosService.h | 107 - branches/sage/ebofs2/mon/mon_types.h | 35 - branches/sage/ebofs2/msg/Dispatcher.cc | 28 - branches/sage/ebofs2/msg/Dispatcher.h | 34 - branches/sage/ebofs2/msg/FakeMessenger.cc | 414 -- branches/sage/ebofs2/msg/FakeMessenger.h | 95 - branches/sage/ebofs2/msg/Message.cc | 369 - branches/sage/ebofs2/msg/Message.h | 250 - branches/sage/ebofs2/msg/Messenger.cc | 39 - branches/sage/ebofs2/msg/Messenger.h | 89 - branches/sage/ebofs2/msg/SimpleMessenger.cc | 1495 ---- branches/sage/ebofs2/msg/SimpleMessenger.h | 320 - branches/sage/ebofs2/msg/msg_types.h | 191 - branches/sage/ebofs2/msg/tcp.h | 67 - branches/sage/ebofs2/newsyn.cc | 438 -- branches/sage/ebofs2/osbdb/OSBDB.cc | 2169 ------ branches/sage/ebofs2/osbdb/OSBDB.h | 482 -- branches/sage/ebofs2/osd/Ager.cc | 333 - branches/sage/ebofs2/osd/Ager.h | 44 - branches/sage/ebofs2/osd/BDBMap.h | 137 - branches/sage/ebofs2/osd/Fake.h | 262 - branches/sage/ebofs2/osd/FakeStore.cc | 742 -- branches/sage/ebofs2/osd/FakeStore.h | 114 - .../sage/ebofs2/osd/FakeStoreBDBCollections.h | 169 - 
branches/sage/ebofs2/osd/OSD.cc | 2377 ------ branches/sage/ebofs2/osd/OSD.h | 366 - branches/sage/ebofs2/osd/OSDMap.h | 539 -- branches/sage/ebofs2/osd/ObjectStore.cc | 152 - branches/sage/ebofs2/osd/ObjectStore.h | 611 -- branches/sage/ebofs2/osd/PG.cc | 1289 ---- branches/sage/ebofs2/osd/PG.h | 754 -- branches/sage/ebofs2/osd/RAID4PG.cc | 123 - branches/sage/ebofs2/osd/RAID4PG.h | 74 - branches/sage/ebofs2/osd/ReplicatedPG.cc | 1972 ----- branches/sage/ebofs2/osd/ReplicatedPG.h | 170 - branches/sage/ebofs2/osd/osd_types.h | 321 - branches/sage/ebofs2/osdc/Blinker.h | 92 - branches/sage/ebofs2/osdc/Filer.cc | 235 - branches/sage/ebofs2/osdc/Filer.h | 165 - branches/sage/ebofs2/osdc/Journaler.cc | 666 -- branches/sage/ebofs2/osdc/Journaler.h | 236 - branches/sage/ebofs2/osdc/ObjectCacher.cc | 1587 ---- branches/sage/ebofs2/osdc/ObjectCacher.h | 566 -- branches/sage/ebofs2/osdc/Objecter.cc | 913 --- branches/sage/ebofs2/osdc/Objecter.h | 230 - branches/sage/ebofs2/script/add_header.pl | 26 - branches/sage/ebofs2/script/adjusttabs.pl | 24 - .../sage/ebofs2/script/check_cache_dumps.pl | 56 - branches/sage/ebofs2/script/clean_osd_cow.sh | 3 - branches/sage/ebofs2/script/clean_trace.pl | 8 - branches/sage/ebofs2/script/comb.pl | 113 - .../sage/ebofs2/script/convert_soe_trace.pl | 39 - branches/sage/ebofs2/script/find_auth_pins.pl | 51 - .../sage/ebofs2/script/find_bufferleaks.pl | 69 - .../sage/ebofs2/script/find_lost_bdev_ops.pl | 34 - .../sage/ebofs2/script/find_lost_commit.pl | 38 - .../sage/ebofs2/script/find_lost_objecter.pl | 34 - branches/sage/ebofs2/script/find_pathpins.pl | 41 - branches/sage/ebofs2/script/find_requests.pl | 42 - branches/sage/ebofs2/script/find_waiters.pl | 46 - branches/sage/ebofs2/script/fix_modeline.pl | 29 - branches/sage/ebofs2/script/gprofnewsyn | 12 - branches/sage/ebofs2/script/grepblock | 15 - branches/sage/ebofs2/script/merge_cdfs.pl | 24 - branches/sage/ebofs2/script/merge_trace_rw.pl | 42 - branches/sage/ebofs2/script/plot.pl | 48 
- branches/sage/ebofs2/script/profonly.pl | 12 - branches/sage/ebofs2/script/runjob.pl | 341 - branches/sage/ebofs2/script/runset.pl | 380 - branches/sage/ebofs2/script/smooth.pl | 41 - branches/sage/ebofs2/script/study_find.pl | 224 - .../ebofs2/script/study_hardlink_lifetimes.pl | 131 - branches/sage/ebofs2/script/study_lookups.pl | 137 - branches/sage/ebofs2/script/sum.pl | 148 - branches/sage/ebofs2/test/fakemds.cc | 104 - branches/sage/ebofs2/test/fg.cc | 19 - branches/sage/ebofs2/test/gprof-helper.c | 120 - branches/sage/ebofs2/test/makedirs.cc | 38 - branches/sage/ebofs2/test/mpitest.cc | 111 - branches/sage/ebofs2/test/mttest.cc | 140 - branches/sage/ebofs2/test/rushconfig | 7 - branches/sage/ebofs2/test/rushtest.cc | 49 - branches/sage/ebofs2/test/rushtest.cc~ | 49 - branches/sage/ebofs2/test/test_disk_bw.cc | 59 - branches/sage/ebofs2/test/test_seek_read.c | 53 - branches/sage/ebofs2/test/testbucket.cc | 67 - branches/sage/ebofs2/test/testbuffers.cc | 40 - branches/sage/ebofs2/test/testcounter.cc | 70 - branches/sage/ebofs2/test/testcrush.cc | 266 - branches/sage/ebofs2/test/testfilepath.cc | 22 - branches/sage/ebofs2/test/testmpi.cc | 53 - branches/sage/ebofs2/test/testnewbuffers.cc | 91 - branches/sage/ebofs2/test/testos.cc | 343 - branches/sage/ebofs2/test/testosbdb.cc | 347 - branches/sage/ebofs2/test/testtree.cc | 46 - branches/sage/ebofs2/test/testxattr.cc | 31 - branches/sage/ebofs2/valgrind.supp | 62 - branches/sage/mds/COPYING | 504 -- branches/sage/mds/Makefile | 304 - branches/sage/mds/README | 4 - branches/sage/mds/TODO | 265 - branches/sage/mds/active/README | 0 branches/sage/mds/active/activemaster.cc | 115 - branches/sage/mds/active/activemaster.h | 18 - branches/sage/mds/active/activeslave.cc | 510 -- branches/sage/mds/active/activeslave.h | 23 - branches/sage/mds/active/activetaskd.cc | 241 - branches/sage/mds/active/activetaskd.h | 14 - branches/sage/mds/active/client_init.cc | 1 - branches/sage/mds/active/client_init.h | 2 - 
branches/sage/mds/active/common.h | 94 - branches/sage/mds/active/echotestclient.cc | 74 - branches/sage/mds/active/echotestclient.h | 10 - branches/sage/mds/active/inet.h | 9 - branches/sage/mds/active/msgtestclient.cc | 418 -- branches/sage/mds/active/msgtestclient.h | 44 - branches/sage/mds/active/trivial_task.cc | 50 - branches/sage/mds/active/trivial_task.h | 12 - branches/sage/mds/active/utility.h | 214 - branches/sage/mds/cfuse.cc | 88 - branches/sage/mds/client/Client.cc | 3923 ---------- branches/sage/mds/client/Client.h | 845 --- branches/sage/mds/client/FileCache.cc | 266 - branches/sage/mds/client/FileCache.h | 85 - branches/sage/mds/client/SyntheticClient.cc | 2893 -------- branches/sage/mds/client/SyntheticClient.h | 241 - branches/sage/mds/client/Trace.cc | 83 - branches/sage/mds/client/Trace.h | 63 - branches/sage/mds/client/fuse.cc | 306 - branches/sage/mds/client/fuse.h | 24 - branches/sage/mds/client/fuse_ll.cc | 397 - branches/sage/mds/client/fuse_ll.h | 15 - .../sage/mds/client/hadoop/CephFSInterface.cc | 789 -- .../sage/mds/client/hadoop/CephFSInterface.h | 239 - branches/sage/mds/client/ldceph.cc | 298 - branches/sage/mds/cmds.cc | 108 - branches/sage/mds/cmon.cc | 129 - branches/sage/mds/cmonctl.cc | 92 - branches/sage/mds/common/Clock.cc | 20 - branches/sage/mds/common/Clock.h | 104 - branches/sage/mds/common/Cond.h | 119 - branches/sage/mds/common/DecayCounter.h | 138 - branches/sage/mds/common/LogType.h | 122 - branches/sage/mds/common/Logger.cc | 320 - branches/sage/mds/common/Logger.h | 77 - branches/sage/mds/common/Mutex.h | 83 - branches/sage/mds/common/RWLock.h | 50 - branches/sage/mds/common/Semaphore.h | 53 - branches/sage/mds/common/Thread.h | 81 - branches/sage/mds/common/ThreadPool.h | 139 - branches/sage/mds/common/Timer.cc | 335 - branches/sage/mds/common/Timer.h | 175 - branches/sage/mds/config.cc | 1039 --- branches/sage/mds/config.h | 418 -- branches/sage/mds/cosd.cc | 135 - branches/sage/mds/crush/BinaryTree.h | 285 - 
branches/sage/mds/crush/Bucket.h | 632 -- branches/sage/mds/crush/Hash.h | 301 - branches/sage/mds/crush/crush.h | 543 -- .../sage/mds/crush/test/bucket_movement.cc | 166 - .../sage/mds/crush/test/bucket_variance.cc | 199 - .../sage/mds/crush/test/cluster_movement.cc | 217 - .../mds/crush/test/cluster_movement_remove.cc | 229 - .../mds/crush/test/cluster_movement_rush.cc | 218 - .../sage/mds/crush/test/creeping_failure.cc | 276 - .../crush/test/creeping_failure_variance.cc | 281 - .../sage/mds/crush/test/depth_variance.cc | 185 - branches/sage/mds/crush/test/mixed.cc | 300 - branches/sage/mds/crush/test/movement.cc | 223 - .../sage/mds/crush/test/movement_failed.cc | 246 - branches/sage/mds/crush/test/overload.cc | 335 - .../sage/mds/crush/test/overload_variance.cc | 281 - branches/sage/mds/crush/test/sizes.cc | 131 - branches/sage/mds/crush/test/smallbucket.cc | 138 - branches/sage/mds/crush/test/speed_bucket.cc | 86 - branches/sage/mds/crush/test/speed_depth.cc | 174 - branches/sage/mds/crush/test/speed_rush.cc | 145 - branches/sage/mds/crush/test/t.cc | 25 - branches/sage/mds/crush/test/testbucket.cc | 61 - branches/sage/mds/crush/test/testnormal.cc | 51 - branches/sage/mds/crush2/Makefile | 26 - branches/sage/mds/crush2/buckets.c | 56 - branches/sage/mds/crush2/buckets.h | 49 - branches/sage/mds/crush2/crush.c | 236 - branches/sage/mds/crush2/crush.h | 49 - branches/sage/mds/crush2/hash.h | 80 - branches/sage/mds/crush2/types.h | 11 - branches/sage/mds/csyn.cc | 87 - branches/sage/mds/doc/Commitdir.txt | 24 - branches/sage/mds/doc/anchortable.txt | 54 - branches/sage/mds/doc/bdb.txt | 48 - branches/sage/mds/doc/caching.txt | 303 - branches/sage/mds/doc/dentries.txt | 4 - branches/sage/mds/doc/exports.txt | 72 - branches/sage/mds/doc/file_modes.txt | 66 - branches/sage/mds/doc/header.txt | 13 - branches/sage/mds/doc/inos.txt | 11 - branches/sage/mds/doc/journal.txt | 124 - branches/sage/mds/doc/lazy_posix.txt | 53 - branches/sage/mds/doc/mds_locks.txt | 66 - 
branches/sage/mds/doc/modeline.txt | 2 - branches/sage/mds/doc/osd_outline.txt | 37 - branches/sage/mds/doc/osd_replication.txt | 226 - .../sage/mds/doc/shared_write_states_nogo.txt | 39 - branches/sage/mds/doc/shutdown.txt | 13 - branches/sage/mds/dupstore.cc | 102 - branches/sage/mds/ebofs/Allocator.cc | 693 -- branches/sage/mds/ebofs/Allocator.h | 85 - branches/sage/mds/ebofs/BlockDevice.cc | 846 --- branches/sage/mds/ebofs/BlockDevice.h | 351 - branches/sage/mds/ebofs/BufferCache.cc | 1228 ---- branches/sage/mds/ebofs/BufferCache.h | 723 -- branches/sage/mds/ebofs/Cnode.h | 101 - branches/sage/mds/ebofs/Ebofs.cc | 3628 ---------- branches/sage/mds/ebofs/Ebofs.h | 370 - branches/sage/mds/ebofs/FileJournal.cc | 456 -- branches/sage/mds/ebofs/FileJournal.h | 144 - branches/sage/mds/ebofs/Journal.h | 47 - branches/sage/mds/ebofs/Onode.h | 408 -- branches/sage/mds/ebofs/Table.h | 928 --- branches/sage/mds/ebofs/mkfs.ebofs.cc | 349 - branches/sage/mds/ebofs/nodes.h | 568 -- branches/sage/mds/ebofs/test.ebofs.cc | 226 - branches/sage/mds/ebofs/types.h | 171 - branches/sage/mds/extractosdmaps.cc | 64 - branches/sage/mds/fakefuse.cc | 168 - branches/sage/mds/fakesyn.cc | 181 - branches/sage/mds/include/Context.h | 153 - branches/sage/mds/include/Distribution.h | 75 - branches/sage/mds/include/bitmapper.h | 48 - branches/sage/mds/include/blobhash.h | 47 - branches/sage/mds/include/buffer.h | 1205 --- branches/sage/mds/include/ceph_fs.h | 163 - branches/sage/mds/include/encodable.h | 424 -- branches/sage/mds/include/error.h | 41 - branches/sage/mds/include/frag.h | 573 -- branches/sage/mds/include/hash.h | 70 - branches/sage/mds/include/interval_set.h | 315 - branches/sage/mds/include/lru.h | 341 - branches/sage/mds/include/object.h | 99 - branches/sage/mds/include/rangeset.h | 253 - branches/sage/mds/include/statlite.h | 72 - branches/sage/mds/include/triple.h | 28 - branches/sage/mds/include/types.h | 294 - branches/sage/mds/include/uofs.h | 51 - 
branches/sage/mds/include/utime.h | 149 - branches/sage/mds/include/xlist.h | 123 - branches/sage/mds/jobs/alc.tp | 38 - branches/sage/mds/jobs/alcdat/makedirs | 45 - branches/sage/mds/jobs/alcdat/makedirs.big | 45 - branches/sage/mds/jobs/alcdat/makedirs.tput | 46 - .../sage/mds/jobs/alcdat/makefiles.shared | 32 - branches/sage/mds/jobs/alcdat/openshared | 32 - branches/sage/mds/jobs/alcdat/ossh.include | 45 - .../sage/mds/jobs/alcdat/ossh.include.big | 46 - branches/sage/mds/jobs/alcdat/ossh.lib | 45 - branches/sage/mds/jobs/alcdat/ossh.lib.big | 46 - branches/sage/mds/jobs/alcdat/striping | 48 - branches/sage/mds/jobs/example | 56 - branches/sage/mds/jobs/mds/log_striping | 36 - branches/sage/mds/jobs/mds/makedir_lat | 33 - branches/sage/mds/jobs/mds/makedirs | 40 - branches/sage/mds/jobs/mds/opensshlib | 44 - branches/sage/mds/jobs/meta1 | 19 - branches/sage/mds/jobs/meta1.proc.sh | 14 - branches/sage/mds/jobs/osd/ebofs | 51 - branches/sage/mds/jobs/osd/mds_log | 43 - branches/sage/mds/jobs/osd/osd_threads | 33 - branches/sage/mds/jobs/osd/striping | 78 - branches/sage/mds/jobs/osd/wr_lat2 | 44 - branches/sage/mds/jobs/osd/write_sizes | 60 - branches/sage/mds/jobs/rados/map_dist | 32 - branches/sage/mds/jobs/rados/rep_lat | 43 - branches/sage/mds/jobs/rados/wr_sizes | 50 - branches/sage/mds/jobs/runjobsample | 26 - branches/sage/mds/kernel/Makefile | 7 - branches/sage/mds/kernel/bufferlist.h | 74 - branches/sage/mds/kernel/inode.c | 136 - branches/sage/mds/kernel/kmsg.h | 51 - branches/sage/mds/kernel/kmsgbits.h | 50 - branches/sage/mds/kernel/mds_client.h | 42 - branches/sage/mds/kernel/mdsmap.h | 42 - branches/sage/mds/kernel/monmap.h | 19 - branches/sage/mds/kernel/osd_client.h | 18 - branches/sage/mds/kernel/super.h | 75 - branches/sage/mds/mds/Anchor.h | 108 - branches/sage/mds/mds/AnchorClient.cc | 379 - branches/sage/mds/mds/AnchorClient.h | 107 - branches/sage/mds/mds/AnchorTable.cc | 713 -- branches/sage/mds/mds/AnchorTable.h | 127 - 
branches/sage/mds/mds/CDir.h | 540 -- branches/sage/mds/mds/Capability.h | 245 - branches/sage/mds/mds/ClientMap.cc | 126 - branches/sage/mds/mds/FileLock.h | 227 - branches/sage/mds/mds/IdAllocator.cc | 205 - branches/sage/mds/mds/IdAllocator.h | 78 - branches/sage/mds/mds/LocalLock.h | 61 - branches/sage/mds/mds/Locker.cc | 2918 -------- branches/sage/mds/mds/Locker.h | 195 - branches/sage/mds/mds/MDBalancer.cc | 1050 --- branches/sage/mds/mds/MDBalancer.h | 118 - branches/sage/mds/mds/MDCache.cc | 6444 ----------------- branches/sage/mds/mds/MDLog.cc | 511 -- branches/sage/mds/mds/MDS.cc | 1239 ---- branches/sage/mds/mds/MDS.h | 299 - branches/sage/mds/mds/MDSMap.h | 357 - branches/sage/mds/mds/Migrator.cc | 2315 ------ branches/sage/mds/mds/ScatterLock.h | 183 - branches/sage/mds/mds/Server.cc | 4057 ----------- branches/sage/mds/mds/SimpleLock.h | 301 - branches/sage/mds/mds/events/EAnchor.h | 80 - branches/sage/mds/mds/events/EAnchorClient.h | 56 - branches/sage/mds/mds/events/EExport.h | 63 - branches/sage/mds/mds/events/EFragment.h | 54 - branches/sage/mds/mds/events/EImportFinish.h | 60 - branches/sage/mds/mds/events/EMetaBlob.h | 501 -- branches/sage/mds/mds/events/EOpen.h | 53 - branches/sage/mds/mds/events/EPurgeFinish.h | 54 - branches/sage/mds/mds/events/ESlaveUpdate.h | 79 - branches/sage/mds/mds/events/EString.h | 56 - branches/sage/mds/mds/events/ESubtreeMap.h | 47 - branches/sage/mds/mds/mdstypes.h | 690 -- branches/sage/mds/messages/MAnchor.h | 74 - branches/sage/mds/messages/MCacheExpire.h | 127 - branches/sage/mds/messages/MClientFileCaps.h | 115 - branches/sage/mds/messages/MClientMount.h | 43 - branches/sage/mds/messages/MClientReconnect.h | 59 - branches/sage/mds/messages/MClientReply.h | 289 - branches/sage/mds/messages/MClientRequest.h | 331 - .../sage/mds/messages/MClientRequestForward.h | 59 - branches/sage/mds/messages/MClientSession.h | 62 - branches/sage/mds/messages/MClientUnmount.h | 40 - branches/sage/mds/messages/MDentryUnlink.h | 
82 - branches/sage/mds/messages/MDiscover.h | 108 - branches/sage/mds/messages/MDiscoverReply.h | 300 - branches/sage/mds/messages/MExportDir.h | 65 - branches/sage/mds/messages/MExportDirAck.h | 46 - branches/sage/mds/messages/MExportDirCancel.h | 49 - .../sage/mds/messages/MExportDirDiscoverAck.h | 60 - branches/sage/mds/messages/MExportDirFinish.h | 46 - branches/sage/mds/messages/MExportDirNotify.h | 85 - .../sage/mds/messages/MExportDirNotifyAck.h | 50 - branches/sage/mds/messages/MExportDirPrep.h | 205 - .../sage/mds/messages/MExportDirPrepAck.h | 47 - .../sage/mds/messages/MExportDirWarning.h | 50 - .../sage/mds/messages/MExportDirWarningAck.h | 45 - branches/sage/mds/messages/MGenericMessage.h | 45 - branches/sage/mds/messages/MHeartbeat.h | 60 - branches/sage/mds/messages/MInodeFileCaps.h | 57 - branches/sage/mds/messages/MLock.h | 126 - branches/sage/mds/messages/MMDSBeacon.h | 63 - branches/sage/mds/messages/MMDSBoot.h | 39 - branches/sage/mds/messages/MMDSCacheRejoin.h | 230 - .../sage/mds/messages/MMDSFragmentNotify.h | 60 - branches/sage/mds/messages/MMDSGetMap.h | 40 - branches/sage/mds/messages/MMDSMap.h | 79 - branches/sage/mds/messages/MMDSResolve.h | 66 - branches/sage/mds/messages/MMDSResolveAck.h | 56 - branches/sage/mds/messages/MMonCommand.h | 54 - branches/sage/mds/messages/MMonCommandAck.h | 46 - branches/sage/mds/messages/MMonElection.h | 63 - .../sage/mds/messages/MMonElectionCollect.h | 43 - .../sage/mds/messages/MMonElectionRefresh.h | 52 - .../sage/mds/messages/MMonElectionStatus.h | 51 - branches/sage/mds/messages/MMonOSDMapInfo.h | 50 - branches/sage/mds/messages/MMonOSDMapLease.h | 50 - .../sage/mds/messages/MMonOSDMapLeaseAck.h | 45 - .../sage/mds/messages/MMonOSDMapUpdateAck.h | 43 - .../mds/messages/MMonOSDMapUpdateCommit.h | 43 - .../mds/messages/MMonOSDMapUpdatePrepare.h | 53 - branches/sage/mds/messages/MMonPaxos.h | 98 - branches/sage/mds/messages/MOSDBoot.h | 51 - branches/sage/mds/messages/MOSDFailure.h | 55 - 
branches/sage/mds/messages/MOSDGetMap.h | 51 - branches/sage/mds/messages/MOSDIn.h | 43 - branches/sage/mds/messages/MOSDMap.h | 71 - branches/sage/mds/messages/MOSDOp.h | 280 - branches/sage/mds/messages/MOSDOpReply.h | 164 - branches/sage/mds/messages/MOSDOut.h | 43 - .../sage/mds/messages/MOSDPGActivateSet.h | 50 - branches/sage/mds/messages/MOSDPGLog.h | 59 - branches/sage/mds/messages/MOSDPGNotify.h | 55 - branches/sage/mds/messages/MOSDPGPeer.h | 58 - branches/sage/mds/messages/MOSDPGPeerAck.h | 70 - .../sage/mds/messages/MOSDPGPeerRequest.h | 51 - branches/sage/mds/messages/MOSDPGQuery.h | 52 - branches/sage/mds/messages/MOSDPGRemove.h | 52 - branches/sage/mds/messages/MOSDPGSummary.h | 69 - branches/sage/mds/messages/MOSDPGUpdate.h | 71 - branches/sage/mds/messages/MOSDPing.h | 49 - branches/sage/mds/messages/MPGStats.h | 43 - branches/sage/mds/messages/MPing.h | 43 - branches/sage/mds/messages/MPingAck.h | 42 - branches/sage/mds/messages/MStatfs.h | 42 - branches/sage/mds/messages/MStatfsReply.h | 45 - branches/sage/mds/mkmonmap.cc | 68 - branches/sage/mds/mon/ClientMonitor.cc | 256 - branches/sage/mds/mon/ClientMonitor.h | 178 - branches/sage/mds/mon/Elector.cc | 293 - branches/sage/mds/mon/Elector.h | 92 - branches/sage/mds/mon/MDSMonitor.cc | 633 -- branches/sage/mds/mon/MDSMonitor.h | 100 - branches/sage/mds/mon/MonMap.h | 101 - branches/sage/mds/mon/Monitor.cc | 405 -- branches/sage/mds/mon/Monitor.h | 154 - branches/sage/mds/mon/MonitorStore.cc | 222 - branches/sage/mds/mon/MonitorStore.h | 82 - branches/sage/mds/mon/OSDMonitor.cc | 829 --- branches/sage/mds/mon/OSDMonitor.h | 131 - branches/sage/mds/mon/PGMap.h | 103 - branches/sage/mds/mon/PGMonitor.cc | 219 - branches/sage/mds/mon/PGMonitor.h | 58 - branches/sage/mds/mon/Paxos.cc | 784 -- branches/sage/mds/mon/Paxos.h | 251 - branches/sage/mds/mon/PaxosService.cc | 172 - branches/sage/mds/mon/PaxosService.h | 107 - branches/sage/mds/mon/mon_types.h | 35 - branches/sage/mds/msg/Dispatcher.cc | 28 - 
branches/sage/mds/msg/Dispatcher.h | 34 - branches/sage/mds/msg/FakeMessenger.cc | 413 -- branches/sage/mds/msg/FakeMessenger.h | 89 - branches/sage/mds/msg/Message.cc | 383 - branches/sage/mds/msg/Message.h | 262 - branches/sage/mds/msg/Messenger.cc | 39 - branches/sage/mds/msg/Messenger.h | 101 - branches/sage/mds/msg/SimpleMessenger.cc | 1410 ---- branches/sage/mds/msg/SimpleMessenger.h | 312 - branches/sage/mds/msg/msg_types.h | 192 - branches/sage/mds/msg/tcp.cc | 93 - branches/sage/mds/msg/tcp.h | 69 - branches/sage/mds/newsyn.cc | 438 -- branches/sage/mds/osbdb/OSBDB.cc | 2169 ------ branches/sage/mds/osbdb/OSBDB.h | 482 -- branches/sage/mds/osd/Ager.cc | 333 - branches/sage/mds/osd/Ager.h | 44 - branches/sage/mds/osd/BDBMap.h | 137 - branches/sage/mds/osd/Fake.h | 262 - branches/sage/mds/osd/FakeStore.cc | 742 -- branches/sage/mds/osd/FakeStore.h | 114 - .../sage/mds/osd/FakeStoreBDBCollections.h | 169 - branches/sage/mds/osd/OSD.cc | 2377 ------ branches/sage/mds/osd/OSD.h | 366 - branches/sage/mds/osd/OSDMap.h | 531 -- branches/sage/mds/osd/ObjectStore.cc | 152 - branches/sage/mds/osd/ObjectStore.h | 611 -- branches/sage/mds/osd/PG.cc | 1289 ---- branches/sage/mds/osd/PG.h | 753 -- branches/sage/mds/osd/RAID4PG.cc | 123 - branches/sage/mds/osd/RAID4PG.h | 74 - branches/sage/mds/osd/ReplicatedPG.cc | 1972 ----- branches/sage/mds/osd/ReplicatedPG.h | 170 - branches/sage/mds/osd/osd_types.h | 321 - branches/sage/mds/osdc/Blinker.h | 92 - branches/sage/mds/osdc/Filer.cc | 235 - branches/sage/mds/osdc/Filer.h | 165 - branches/sage/mds/osdc/Journaler.cc | 666 -- branches/sage/mds/osdc/ObjectCacher.cc | 1587 ---- branches/sage/mds/osdc/ObjectCacher.h | 566 -- branches/sage/mds/osdc/Objecter.cc | 913 --- branches/sage/mds/osdc/Objecter.h | 230 - branches/sage/mds/script/add_header.pl | 26 - branches/sage/mds/script/adjusttabs.pl | 24 - branches/sage/mds/script/check_cache_dumps.pl | 56 - branches/sage/mds/script/clean_osd_cow.sh | 3 - 
branches/sage/mds/script/clean_trace.pl | 8 - branches/sage/mds/script/comb.pl | 113 - branches/sage/mds/script/convert_soe_trace.pl | 39 - branches/sage/mds/script/find_auth_pins.pl | 51 - branches/sage/mds/script/find_bufferleaks.pl | 69 - .../sage/mds/script/find_lost_bdev_ops.pl | 34 - branches/sage/mds/script/find_lost_commit.pl | 38 - .../sage/mds/script/find_lost_objecter.pl | 34 - branches/sage/mds/script/find_pathpins.pl | 41 - branches/sage/mds/script/find_requests.pl | 42 - branches/sage/mds/script/find_waiters.pl | 46 - branches/sage/mds/script/fix_modeline.pl | 29 - branches/sage/mds/script/gprofnewsyn | 12 - branches/sage/mds/script/grepblock | 15 - branches/sage/mds/script/merge_cdfs.pl | 24 - branches/sage/mds/script/merge_trace_rw.pl | 42 - branches/sage/mds/script/plot.pl | 48 - branches/sage/mds/script/profonly.pl | 12 - branches/sage/mds/script/runjob.pl | 341 - branches/sage/mds/script/runset.pl | 380 - branches/sage/mds/script/smooth.pl | 41 - branches/sage/mds/script/study_find.pl | 224 - .../mds/script/study_hardlink_lifetimes.pl | 131 - branches/sage/mds/script/study_lookups.pl | 137 - branches/sage/mds/script/sum.pl | 148 - branches/sage/mds/test/fakemds.cc | 104 - branches/sage/mds/test/fg.cc | 19 - branches/sage/mds/test/gprof-helper.c | 120 - branches/sage/mds/test/makedirs.cc | 38 - branches/sage/mds/test/mpitest.cc | 111 - branches/sage/mds/test/mttest.cc | 140 - branches/sage/mds/test/rushconfig | 7 - branches/sage/mds/test/rushtest.cc | 49 - branches/sage/mds/test/rushtest.cc~ | 49 - branches/sage/mds/test/test_disk_bw.cc | 59 - branches/sage/mds/test/test_seek_read.c | 53 - branches/sage/mds/test/testbucket.cc | 67 - branches/sage/mds/test/testbuffers.cc | 40 - branches/sage/mds/test/testcounter.cc | 70 - branches/sage/mds/test/testcrush.cc | 266 - branches/sage/mds/test/testfilepath.cc | 22 - branches/sage/mds/test/testmpi.cc | 53 - branches/sage/mds/test/testnewbuffers.cc | 91 - branches/sage/mds/test/testos.cc | 343 - 
branches/sage/mds/test/testosbdb.cc | 347 - branches/sage/mds/test/testtree.cc | 46 - branches/sage/mds/test/testxattr.cc | 31 - branches/sage/mds/valgrind.supp | 62 - {trunk/fusetrace => fusetrace}/Makefile | 0 .../fusetrace => fusetrace}/fusetrace_ll.cc | 0 {branches/marnberg/quota => src}/COPYING | 0 {trunk/ceph => src}/Makefile | 0 {branches/marnberg/quota => src}/README | 0 {trunk/ceph => src}/TODO | 0 {trunk/ceph => src}/active/README | 0 .../sage/crush => src}/active/activemaster.cc | 0 .../sage/crush => src}/active/activemaster.h | 0 {trunk/ceph => src}/active/activeslave.cc | 0 {trunk/ceph => src}/active/activeslave.h | 0 .../sage/crush => src}/active/activetaskd.cc | 0 .../sage/crush => src}/active/activetaskd.h | 0 .../sage/crush => src}/active/client_init.cc | 0 .../sage/crush => src}/active/client_init.h | 0 {trunk/ceph => src}/active/common.h | 0 .../crush => src}/active/echotestclient.cc | 0 .../crush => src}/active/echotestclient.h | 0 {branches/sage/crush => src}/active/inet.h | 0 {trunk/ceph => src}/active/msgtestclient.cc | 0 {trunk/ceph => src}/active/msgtestclient.h | 0 .../sage/crush => src}/active/trivial_task.cc | 0 .../sage/crush => src}/active/trivial_task.h | 0 {trunk/ceph => src}/active/utility.h | 0 {branches/sage/crush => src}/cfuse.cc | 0 {trunk/ceph => src}/client/Client.cc | 0 {trunk/ceph => src}/client/Client.h | 0 .../sage/crush => src}/client/FileCache.cc | 0 .../sage/crush => src}/client/FileCache.h | 0 {trunk/ceph => src}/client/SyntheticClient.cc | 0 {trunk/ceph => src}/client/SyntheticClient.h | 0 {branches/sage/crush => src}/client/Trace.cc | 0 {branches/sage/crush => src}/client/Trace.h | 0 {branches/sage/crush => src}/client/fuse.cc | 0 {branches/sage/crush => src}/client/fuse.h | 0 .../sage/crush => src}/client/fuse_ll.cc | 0 {branches/sage/crush => src}/client/fuse_ll.h | 0 .../client/hadoop/CephFSInterface.cc | 0 .../client/hadoop/CephFSInterface.h | 0 {branches/sage/crush => src}/client/ldceph.cc | 0 
{branches/sage/crush => src}/cmds.cc | 0 {branches/sage/ebofs2 => src}/cmon.cc | 0 {branches/sage/crush => src}/cmonctl.cc | 0 {branches/sage/crush => src}/common/Clock.cc | 0 {trunk/ceph => src}/common/Clock.h | 0 {branches/sage/crush => src}/common/Cond.h | 0 .../sage/crush => src}/common/DecayCounter.h | 0 {branches/sage/crush => src}/common/LogType.h | 0 {trunk/ceph => src}/common/Logger.cc | 0 {branches/sage/crush => src}/common/Logger.h | 0 {branches/sage/crush => src}/common/Mutex.h | 0 {branches/sage/crush => src}/common/RWLock.h | 0 .../sage/crush => src}/common/Semaphore.h | 0 {branches/sage/crush => src}/common/Thread.h | 0 .../sage/crush => src}/common/ThreadPool.h | 0 {branches/sage/crush => src}/common/Timer.cc | 0 {branches/sage/crush => src}/common/Timer.h | 0 {trunk/ceph => src}/config.cc | 0 {trunk/ceph => src}/config.h | 0 {branches/sage/crush => src}/cosd.cc | 0 .../sage/crush => src}/crush.old/BinaryTree.h | 0 .../sage/crush => src}/crush.old/Bucket.h | 0 {branches/sage/crush => src}/crush.old/Hash.h | 0 .../sage/crush => src}/crush.old/crush.h | 0 .../crush.old}/test/bucket_movement.cc | 0 .../crush.old}/test/bucket_variance.cc | 0 .../crush.old}/test/cluster_movement.cc | 0 .../test/cluster_movement_remove.cc | 0 .../crush.old}/test/cluster_movement_rush.cc | 0 .../crush.old}/test/creeping_failure.cc | 0 .../test/creeping_failure_variance.cc | 0 .../crush.old}/test/depth_variance.cc | 0 .../crush => src/crush.old}/test/mixed.cc | 0 .../crush => src/crush.old}/test/movement.cc | 0 .../crush.old}/test/movement_failed.cc | 0 .../crush => src/crush.old}/test/overload.cc | 0 .../crush.old}/test/overload_variance.cc | 0 .../crush => src/crush.old}/test/sizes.cc | 0 .../crush.old}/test/smallbucket.cc | 0 .../crush.old}/test/speed_bucket.cc | 0 .../crush.old}/test/speed_depth.cc | 0 .../crush.old}/test/speed_rush.cc | 0 .../quota/crush => src/crush.old}/test/t.cc | 0 .../crush.old}/test/testbucket.cc | 0 .../crush.old}/test/testnormal.cc | 0 
{trunk/ceph => src}/crush/CrushWrapper.h | 0 {trunk/ceph => src}/crush/Makefile | 0 {branches/sage/crush => src}/crush/buckets.c | 0 {trunk/ceph => src}/crush/builder.c | 0 {trunk/ceph => src}/crush/builder.h | 0 {trunk/ceph => src}/crush/crush.c | 0 {trunk/ceph => src}/crush/crush.h | 0 {branches/sage/crush => src}/crush/hash.h | 0 {trunk/ceph => src}/crush/mapper.c | 0 {branches/sage/crush => src}/crush/mapper.h | 0 {trunk/ceph => src}/crush/test.c | 0 {branches/sage/crush => src}/crush/types.h | 0 {branches/sage/crush => src}/csyn.cc | 0 .../sage/crush => src}/doc/Commitdir.txt | 0 .../sage/crush => src}/doc/anchortable.txt | 0 {branches/marnberg/quota => src}/doc/bdb.txt | 0 {branches/sage/crush => src}/doc/caching.txt | 0 {branches/sage/crush => src}/doc/exports.txt | 0 {branches/sage/crush => src}/doc/header.txt | 0 {branches/marnberg/quota => src}/doc/inos.txt | 0 .../marnberg/quota => src}/doc/lazy_posix.txt | 0 .../sage/crush => src}/doc/mds_locks.txt | 0 {branches/sage/crush => src}/doc/modeline.txt | 0 .../doc/shared_write_states_nogo.txt | 0 {trunk/ceph => src}/dupstore.cc | 0 .../sage/crush => src}/ebofs/Allocator.cc | 0 .../sage/crush => src}/ebofs/Allocator.h | 0 {trunk/ceph => src}/ebofs/BlockDevice.cc | 0 {trunk/ceph => src}/ebofs/BlockDevice.h | 0 {trunk/ceph => src}/ebofs/BufferCache.cc | 0 {trunk/ceph => src}/ebofs/BufferCache.h | 0 {trunk/ceph => src}/ebofs/Cnode.h | 0 {trunk/ceph => src}/ebofs/Ebofs.cc | 0 {trunk/ceph => src}/ebofs/Ebofs.h | 0 .../sage/crush => src}/ebofs/FileJournal.cc | 0 {trunk/ceph => src}/ebofs/FileJournal.h | 0 {branches/sage/crush => src}/ebofs/Journal.h | 0 {trunk/ceph => src}/ebofs/Onode.h | 0 {trunk/ceph => src}/ebofs/Table.h | 0 {trunk/ceph => src}/ebofs/csum.h | 0 .../sage/crush => src}/ebofs/mkfs.ebofs.cc | 0 {trunk/ceph => src}/ebofs/nodes.h | 0 {trunk/ceph => src}/ebofs/test.ebofs.cc | 0 {trunk/ceph => src}/ebofs/types.h | 0 .../sage/crush => src}/extractosdmaps.cc | 0 {trunk/ceph => src}/fakefuse.cc | 0 
{trunk/ceph => src}/fakesyn.cc | 0 .../sage/crush => src}/include/Context.h | 0 .../sage/crush => src}/include/Distribution.h | 0 {trunk/ceph => src}/include/atomic.h | 0 .../sage/crush => src}/include/bitmapper.h | 0 .../sage/crush => src}/include/blobhash.h | 0 {trunk/ceph => src}/include/buffer.h | 0 {trunk/ceph => src}/include/ceph_fs.h | 0 .../sage/crush => src}/include/encodable.h | 0 {branches/sage/crush => src}/include/error.h | 0 {branches/sage/mds => src}/include/filepath.h | 0 {trunk/ceph => src}/include/frag.h | 0 {trunk/ceph => src}/include/hash.h | 0 {trunk/ceph => src}/include/interval_set.h | 0 {branches/sage/crush => src}/include/lru.h | 0 {trunk/ceph => src}/include/object.h | 0 {trunk/ceph => src}/include/pobject.h | 0 .../sage/crush => src}/include/rangeset.h | 0 .../sage/crush => src}/include/statlite.h | 0 {branches/sage/crush => src}/include/triple.h | 0 {trunk/ceph => src}/include/types.h | 0 {branches/sage/crush => src}/include/uofs.h | 0 {trunk/ceph => src}/include/utime.h | 0 {branches/sage/crush => src}/include/xlist.h | 0 {branches/marnberg/quota => src}/jobs/alc.tp | 0 .../quota => src}/jobs/alcdat/makedirs | 0 .../quota => src}/jobs/alcdat/makedirs.big | 0 .../quota => src}/jobs/alcdat/makedirs.tput | 0 .../jobs/alcdat/makefiles.shared | 0 .../quota => src}/jobs/alcdat/openshared | 0 .../quota => src}/jobs/alcdat/ossh.include | 0 .../jobs/alcdat/ossh.include.big | 0 .../quota => src}/jobs/alcdat/ossh.lib | 0 .../quota => src}/jobs/alcdat/ossh.lib.big | 0 .../quota => src}/jobs/alcdat/striping | 0 {branches/marnberg/quota => src}/jobs/example | 0 .../quota => src}/jobs/mds/log_striping | 0 .../quota => src}/jobs/mds/makedir_lat | 0 .../marnberg/quota => src}/jobs/mds/makedirs | 0 .../quota => src}/jobs/mds/opensshlib | 0 {branches/marnberg/quota => src}/jobs/meta1 | 0 .../marnberg/quota => src}/jobs/meta1.proc.sh | 0 .../marnberg/quota => src}/jobs/osd/ebofs | 0 .../marnberg/quota => src}/jobs/osd/mds_log | 0 .../quota => 
src}/jobs/osd/osd_threads | 0 .../marnberg/quota => src}/jobs/osd/striping | 0 .../marnberg/quota => src}/jobs/osd/wr_lat2 | 0 .../quota => src}/jobs/osd/write_sizes | 0 .../quota => src}/jobs/rados/map_dist | 0 .../marnberg/quota => src}/jobs/rados/rep_lat | 0 {trunk/ceph => src}/jobs/rados/wr_sizes | 0 .../sage/crush => src}/jobs/runjobsample | 0 {trunk/ceph => src}/kernel/Makefile | 0 {trunk/ceph => src}/kernel/README | 0 {trunk/ceph => src}/kernel/addr.c | 0 {trunk/ceph => src}/kernel/client.c | 0 {trunk/ceph => src}/kernel/client.h | 0 {trunk/ceph => src}/kernel/crush/crush.c | 0 {trunk/ceph => src}/kernel/crush/crush.h | 0 {trunk/ceph => src}/kernel/crush/hash.h | 0 {trunk/ceph => src}/kernel/crush/mapper.c | 0 {trunk/ceph => src}/kernel/crush/mapper.h | 0 {trunk/ceph => src}/kernel/dir.c | 0 {trunk/ceph => src}/kernel/file.c | 0 {trunk/ceph => src}/kernel/inode.c | 0 {trunk/ceph => src}/kernel/kconfig.patch | 0 {trunk/ceph => src}/kernel/ktcp.c | 0 {trunk/ceph => src}/kernel/ktcp.h | 0 {trunk/ceph => src}/kernel/mds_client.c | 0 {trunk/ceph => src}/kernel/mds_client.h | 0 {trunk/ceph => src}/kernel/mdsmap.c | 0 {trunk/ceph => src}/kernel/mdsmap.h | 0 {trunk/ceph => src}/kernel/messenger.c | 0 {trunk/ceph => src}/kernel/messenger.h | 0 {trunk/ceph => src}/kernel/mon_client.c | 0 {trunk/ceph => src}/kernel/mon_client.h | 0 {trunk/ceph => src}/kernel/osd_client.c | 0 {trunk/ceph => src}/kernel/osd_client.h | 0 {trunk/ceph => src}/kernel/sample.uml.config | 0 {trunk/ceph => src}/kernel/super.c | 0 {trunk/ceph => src}/kernel/super.h | 0 {trunk/ceph => src}/kernel/test/Makefile | 0 {trunk/ceph => src}/kernel/test/kernclient.c | 0 {trunk/ceph => src}/kernel/test/kernserver.c | 0 {trunk/ceph => src}/kernel/test/ktcp.c | 0 {trunk/ceph => src}/kernel/test/ktcp.h | 0 {trunk/ceph => src}/kernel/test/messenger.h | 0 .../ceph => src}/kernel/test/messenger_mini.c | 0 {trunk/ceph => src}/kernel/test/threadtest.c | 0 {trunk/ceph => src}/kernel/test/userclient.c | 0 
{trunk/ceph => src}/kernel/test/userserver.c | 0 {branches/sage/ebofs2 => src}/mds/Anchor.h | 0 .../sage/ebofs2 => src}/mds/AnchorClient.cc | 0 .../sage/crush => src}/mds/AnchorClient.h | 0 .../sage/ebofs2 => src}/mds/AnchorTable.cc | 0 .../sage/crush => src}/mds/AnchorTable.h | 0 {branches/sage/mds => src}/mds/CDentry.cc | 0 {branches/sage/mds => src}/mds/CDentry.h | 0 {branches/sage/mds => src}/mds/CDir.cc | 0 {branches/sage/crush => src}/mds/CDir.h | 0 {branches/sage/mds => src}/mds/CInode.cc | 0 {branches/sage/mds => src}/mds/CInode.h | 0 {branches/sage/crush => src}/mds/Capability.h | 0 {branches/sage/crush => src}/mds/ClientMap.cc | 0 {branches/sage/mds => src}/mds/ClientMap.h | 0 {branches/sage/crush => src}/mds/FileLock.h | 0 .../sage/crush => src}/mds/IdAllocator.cc | 0 .../sage/crush => src}/mds/IdAllocator.h | 0 {branches/sage/crush => src}/mds/LocalLock.h | 0 {trunk/ceph => src}/mds/Locker.cc | 0 {branches/sage/crush => src}/mds/Locker.h | 0 {branches/sage/mds => src}/mds/LogEvent.cc | 0 {branches/sage/mds => src}/mds/LogEvent.h | 0 {branches/sage/mds => src}/mds/LogSegment.h | 0 {trunk/ceph => src}/mds/MDBalancer.cc | 0 {branches/sage/crush => src}/mds/MDBalancer.h | 0 {trunk/ceph => src}/mds/MDCache.cc | 0 {branches/sage/mds => src}/mds/MDCache.h | 0 {trunk/ceph => src}/mds/MDLog.cc | 0 {branches/sage/mds => src}/mds/MDLog.h | 0 {trunk/ceph => src}/mds/MDS.cc | 0 {trunk/ceph => src}/mds/MDS.h | 0 {trunk/ceph => src}/mds/MDSMap.h | 0 {trunk/ceph => src}/mds/Migrator.cc | 0 {branches/sage/mds => src}/mds/Migrator.h | 0 .../sage/crush => src}/mds/ScatterLock.h | 0 {trunk/ceph => src}/mds/Server.cc | 0 {branches/sage/mds => src}/mds/Server.h | 0 {branches/sage/crush => src}/mds/SimpleLock.h | 0 .../sage/crush => src}/mds/events/EAnchor.h | 0 .../crush => src}/mds/events/EAnchorClient.h | 0 .../sage/crush => src}/mds/events/EExport.h | 0 .../sage/crush => src}/mds/events/EFragment.h | 0 .../crush => src}/mds/events/EImportFinish.h | 0 .../mds => 
src}/mds/events/EImportStart.h | 0 .../sage/crush => src}/mds/events/EMetaBlob.h | 0 .../sage/crush => src}/mds/events/EOpen.h | 0 .../crush => src}/mds/events/EPurgeFinish.h | 0 .../sage/mds => src}/mds/events/ESession.h | 0 .../sage/mds => src}/mds/events/ESessions.h | 0 .../crush => src}/mds/events/ESlaveUpdate.h | 0 .../sage/crush => src}/mds/events/EString.h | 0 .../crush => src}/mds/events/ESubtreeMap.h | 0 .../sage/mds => src}/mds/events/EUpdate.h | 0 {branches/sage/mds => src}/mds/journal.cc | 0 {trunk/ceph => src}/mds/mdstypes.h | 0 .../sage/crush => src}/messages/MAnchor.h | 0 .../crush => src}/messages/MCacheExpire.h | 0 .../ceph => src}/messages/MClientFileCaps.h | 0 {trunk/ceph => src}/messages/MClientMount.h | 0 .../ceph => src}/messages/MClientReconnect.h | 0 {trunk/ceph => src}/messages/MClientReply.h | 0 {trunk/ceph => src}/messages/MClientRequest.h | 0 .../messages/MClientRequestForward.h | 0 {trunk/ceph => src}/messages/MClientSession.h | 0 {trunk/ceph => src}/messages/MClientUnmount.h | 0 .../crush => src}/messages/MDentryUnlink.h | 0 .../sage/mds => src}/messages/MDirUpdate.h | 0 .../sage/crush => src}/messages/MDiscover.h | 0 .../crush => src}/messages/MDiscoverReply.h | 0 .../sage/mds => src}/messages/MExportCaps.h | 0 .../mds => src}/messages/MExportCapsAck.h | 0 .../sage/crush => src}/messages/MExportDir.h | 0 .../crush => src}/messages/MExportDirAck.h | 0 .../crush => src}/messages/MExportDirCancel.h | 0 .../mds => src}/messages/MExportDirDiscover.h | 0 .../messages/MExportDirDiscoverAck.h | 0 .../crush => src}/messages/MExportDirFinish.h | 0 .../crush => src}/messages/MExportDirNotify.h | 0 .../messages/MExportDirNotifyAck.h | 0 .../crush => src}/messages/MExportDirPrep.h | 0 .../messages/MExportDirPrepAck.h | 0 .../messages/MExportDirWarning.h | 0 .../messages/MExportDirWarningAck.h | 0 .../crush => src}/messages/MGenericMessage.h | 0 .../sage/crush => src}/messages/MHeartbeat.h | 0 .../crush => src}/messages/MInodeFileCaps.h | 0 
{trunk/ceph => src}/messages/MLock.h | 0 .../sage/ebofs2 => src}/messages/MMDSBeacon.h | 0 .../sage/crush => src}/messages/MMDSBoot.h | 0 .../crush => src}/messages/MMDSCacheRejoin.h | 0 .../messages/MMDSFragmentNotify.h | 0 {trunk/ceph => src}/messages/MMDSGetMap.h | 0 {trunk/ceph => src}/messages/MMDSMap.h | 0 .../sage/crush => src}/messages/MMDSResolve.h | 0 .../crush => src}/messages/MMDSResolveAck.h | 0 .../mds => src}/messages/MMDSSlaveRequest.h | 0 .../sage/crush => src}/messages/MMonCommand.h | 0 .../crush => src}/messages/MMonCommandAck.h | 0 .../crush => src}/messages/MMonElection.h | 0 .../messages/MMonElectionCollect.h | 0 .../messages/MMonElectionRefresh.h | 0 .../messages/MMonElectionStatus.h | 0 {trunk/ceph => src}/messages/MMonMap.h | 0 .../crush => src}/messages/MMonOSDMapInfo.h | 0 .../crush => src}/messages/MMonOSDMapLease.h | 0 .../messages/MMonOSDMapLeaseAck.h | 0 .../messages/MMonOSDMapUpdateAck.h | 0 .../messages/MMonOSDMapUpdateCommit.h | 0 .../messages/MMonOSDMapUpdatePrepare.h | 0 .../sage/crush => src}/messages/MMonPaxos.h | 0 .../sage/crush => src}/messages/MOSDBoot.h | 0 .../sage/crush => src}/messages/MOSDFailure.h | 0 {trunk/ceph => src}/messages/MOSDGetMap.h | 0 .../sage/crush => src}/messages/MOSDIn.h | 0 {trunk/ceph => src}/messages/MOSDMap.h | 0 {trunk/ceph => src}/messages/MOSDOp.h | 0 {trunk/ceph => src}/messages/MOSDOpReply.h | 0 .../sage/crush => src}/messages/MOSDOut.h | 0 .../messages/MOSDPGActivateSet.h | 0 .../sage/crush => src}/messages/MOSDPGLog.h | 0 .../crush => src}/messages/MOSDPGNotify.h | 0 .../sage/crush => src}/messages/MOSDPGPeer.h | 0 .../crush => src}/messages/MOSDPGPeerAck.h | 0 .../messages/MOSDPGPeerRequest.h | 0 {trunk/ceph => src}/messages/MOSDPGQuery.h | 0 .../crush => src}/messages/MOSDPGRemove.h | 0 .../crush => src}/messages/MOSDPGSummary.h | 0 .../crush => src}/messages/MOSDPGUpdate.h | 0 .../sage/crush => src}/messages/MOSDPing.h | 0 .../sage/crush => src}/messages/MPGStats.h | 0 {trunk/ceph => 
src}/messages/MPing.h | 0 {trunk/ceph => src}/messages/MPingAck.h | 0 {trunk/ceph => src}/messages/MStatfs.h | 0 {trunk/ceph => src}/messages/MStatfsReply.h | 0 {trunk/ceph => src}/mkmonmap.cc | 0 {trunk/ceph => src}/mon/ClientMonitor.cc | 0 {trunk/ceph => src}/mon/ClientMonitor.h | 0 {trunk/ceph => src}/mon/Elector.cc | 0 {branches/sage/crush => src}/mon/Elector.h | 0 {trunk/ceph => src}/mon/MDSMonitor.cc | 0 {trunk/ceph => src}/mon/MDSMonitor.h | 0 {trunk/ceph => src}/mon/MonMap.h | 0 {trunk/ceph => src}/mon/Monitor.cc | 0 {trunk/ceph => src}/mon/Monitor.h | 0 {trunk/ceph => src}/mon/MonitorStore.cc | 0 .../sage/crush => src}/mon/MonitorStore.h | 0 {trunk/ceph => src}/mon/OSDMonitor.cc | 0 {trunk/ceph => src}/mon/OSDMonitor.h | 0 {branches/sage/crush => src}/mon/PGMap.h | 0 {trunk/ceph => src}/mon/PGMonitor.cc | 0 {branches/sage/crush => src}/mon/PGMonitor.h | 0 {trunk/ceph => src}/mon/Paxos.cc | 0 {trunk/ceph => src}/mon/Paxos.h | 0 .../sage/crush => src}/mon/PaxosService.cc | 0 {trunk/ceph => src}/mon/PaxosService.h | 0 {branches/sage/crush => src}/mon/mon_types.h | 0 .../sage/crush => src}/msg/Dispatcher.cc | 0 {branches/sage/crush => src}/msg/Dispatcher.h | 0 {trunk/ceph => src}/msg/FakeMessenger.cc | 0 {trunk/ceph => src}/msg/FakeMessenger.h | 0 {trunk/ceph => src}/msg/Message.cc | 0 {trunk/ceph => src}/msg/Message.h | 0 {branches/sage/crush => src}/msg/Messenger.cc | 0 {trunk/ceph => src}/msg/Messenger.h | 0 {trunk/ceph => src}/msg/SimpleMessenger.cc | 0 {trunk/ceph => src}/msg/SimpleMessenger.h | 0 {trunk/ceph => src}/msg/msg_types.h | 0 {branches/sage/ebofs2 => src}/msg/tcp.cc | 0 {trunk/ceph => src}/msg/tcp.h | 0 {trunk/ceph => src}/newsyn.cc | 0 {branches/sage/crush => src}/osbdb/OSBDB.cc | 0 {branches/sage/crush => src}/osbdb/OSBDB.h | 0 {branches/sage/crush => src}/osd/Ager.cc | 0 {branches/sage/crush => src}/osd/Ager.h | 0 {branches/sage/crush => src}/osd/BDBMap.h | 0 {trunk/ceph => src}/osd/Fake.h | 0 {trunk/ceph => src}/osd/FakeStore.cc | 0 
{trunk/ceph => src}/osd/FakeStore.h | 0 .../osd/FakeStoreBDBCollections.h | 0 {trunk/ceph => src}/osd/OSD.cc | 0 {trunk/ceph => src}/osd/OSD.h | 0 {trunk/ceph => src}/osd/OSDMap.h | 0 .../sage/crush => src}/osd/ObjectStore.cc | 0 {trunk/ceph => src}/osd/ObjectStore.h | 0 {trunk/ceph => src}/osd/PG.cc | 0 {trunk/ceph => src}/osd/PG.h | 0 {trunk/ceph => src}/osd/RAID4PG.cc | 0 {trunk/ceph => src}/osd/RAID4PG.h | 0 {trunk/ceph => src}/osd/ReplicatedPG.cc | 0 {trunk/ceph => src}/osd/ReplicatedPG.h | 0 {trunk/ceph => src}/osd/osd_types.h | 0 {branches/sage/crush => src}/osdc/Blinker.h | 0 {branches/sage/crush => src}/osdc/Filer.cc | 0 {branches/sage/crush => src}/osdc/Filer.h | 0 .../sage/crush => src}/osdc/Journaler.cc | 0 {branches/sage/mds => src}/osdc/Journaler.h | 0 .../sage/crush => src}/osdc/ObjectCacher.cc | 0 .../sage/crush => src}/osdc/ObjectCacher.h | 0 {trunk/ceph => src}/osdc/Objecter.cc | 0 {trunk/ceph => src}/osdc/Objecter.h | 0 .../sage/crush => src}/script/add_header.pl | 0 .../quota => src}/script/adjusttabs.pl | 0 .../crush => src}/script/check_cache_dumps.pl | 0 .../quota => src}/script/clean_osd_cow.sh | 0 .../quota => src}/script/clean_trace.pl | 0 {branches/sage/crush => src}/script/comb.pl | 0 .../crush => src}/script/convert_soe_trace.pl | 0 .../crush => src}/script/find_auth_pins.pl | 0 .../quota => src}/script/find_bufferleaks.pl | 0 .../script/find_lost_bdev_ops.pl | 0 .../quota => src}/script/find_lost_commit.pl | 0 .../script/find_lost_objecter.pl | 0 .../quota => src}/script/find_pathpins.pl | 0 .../quota => src}/script/find_requests.pl | 0 .../quota => src}/script/find_waiters.pl | 0 .../sage/crush => src}/script/fix_modeline.pl | 0 .../sage/crush => src}/script/gprofnewsyn | 0 .../marnberg/quota => src}/script/grepblock | 0 .../sage/crush => src}/script/merge_cdfs.pl | 0 .../quota => src}/script/merge_trace_rw.pl | 0 {branches/sage/crush => src}/script/plot.pl | 0 .../marnberg/quota => src}/script/profonly.pl | 0 {branches/sage/crush => 
src}/script/runjob.pl | 0 .../marnberg/quota => src}/script/runset.pl | 0 {branches/sage/crush => src}/script/smooth.pl | 0 .../sage/crush => src}/script/study_find.pl | 0 .../script/study_hardlink_lifetimes.pl | 0 .../crush => src}/script/study_lookups.pl | 0 .../marnberg/quota => src}/script/sum.pl | 0 .../marnberg/quota => src}/test/fakemds.cc | 0 {branches/sage/crush => src}/test/fg.cc | 0 .../quota => src}/test/gprof-helper.c | 0 .../marnberg/quota => src}/test/makedirs.cc | 0 .../marnberg/quota => src}/test/mpitest.cc | 0 .../marnberg/quota => src}/test/mttest.cc | 0 .../marnberg/quota => src}/test/rushconfig | 0 .../marnberg/quota => src}/test/rushtest.cc | 0 .../marnberg/quota => src}/test/rushtest.cc~ | 0 .../sage/crush => src}/test/test_disk_bw.cc | 0 {trunk/ceph => src}/test/test_seek_read.c | 0 .../ceph => src}/test/test_short_seek_read.c | 0 .../marnberg/quota => src}/test/testbucket.cc | 0 .../quota => src}/test/testbuffers.cc | 0 .../sage/crush => src}/test/testcounter.cc | 0 .../marnberg/quota => src}/test/testcrush.cc | 0 .../quota => src}/test/testfilepath.cc | 0 .../marnberg/quota => src}/test/testmpi.cc | 0 .../quota => src}/test/testnewbuffers.cc | 0 .../marnberg/quota => src}/test/testos.cc | 0 .../marnberg/quota => src}/test/testosbdb.cc | 0 .../marnberg/quota => src}/test/testtree.cc | 0 .../marnberg/quota => src}/test/testxattr.cc | 0 {trunk/ceph => src}/valgrind.supp | 0 tags/20070517_before_mds_merge/COPYING | 504 -- tags/20070517_before_mds_merge/Makefile | 264 - tags/20070517_before_mds_merge/README | 4 - tags/20070517_before_mds_merge/TODO | 322 - tags/20070517_before_mds_merge/cfuse.cc | 83 - .../client/Client.cc | 2766 ------- .../20070517_before_mds_merge/client/Client.h | 600 -- .../client/FileCache.cc | 263 - .../client/FileCache.h | 84 - .../client/SyntheticClient.cc | 1325 ---- .../client/SyntheticClient.h | 202 - .../20070517_before_mds_merge/client/Trace.cc | 125 - tags/20070517_before_mds_merge/client/Trace.h | 75 - 
tags/20070517_before_mds_merge/client/fuse.cc | 280 - tags/20070517_before_mds_merge/client/fuse.h | 23 - .../client/hadoop/CephFSInterface.cc | 824 --- .../client/hadoop/CephFSInterface.h | 237 - .../client/ldceph.cc | 297 - .../client/msgthread.h | 25 - tags/20070517_before_mds_merge/cmds.cc | 102 - tags/20070517_before_mds_merge/cmon.cc | 128 - .../20070517_before_mds_merge/common/Clock.cc | 19 - tags/20070517_before_mds_merge/common/Clock.h | 206 - tags/20070517_before_mds_merge/common/Cond.h | 118 - .../common/DecayCounter.h | 94 - .../common/LogType.h | 119 - .../common/Logger.cc | 216 - .../20070517_before_mds_merge/common/Logger.h | 74 - tags/20070517_before_mds_merge/common/Mutex.h | 82 - .../common/Semaphore.h | 52 - .../20070517_before_mds_merge/common/Thread.h | 66 - .../common/ThreadPool.h | 138 - .../20070517_before_mds_merge/common/Timer.cc | 333 - tags/20070517_before_mds_merge/common/Timer.h | 177 - tags/20070517_before_mds_merge/config.cc | 838 --- tags/20070517_before_mds_merge/config.h | 361 - tags/20070517_before_mds_merge/cosd.cc | 126 - .../crush/BinaryTree.h | 284 - tags/20070517_before_mds_merge/crush/Bucket.h | 631 -- tags/20070517_before_mds_merge/crush/Hash.h | 300 - tags/20070517_before_mds_merge/crush/crush.h | 534 -- .../crush/test/bucket_movement.cc | 166 - .../crush/test/bucket_variance.cc | 199 - .../crush/test/cluster_movement.cc | 217 - .../crush/test/cluster_movement_remove.cc | 229 - .../crush/test/cluster_movement_rush.cc | 218 - .../crush/test/creeping_failure.cc | 276 - .../crush/test/creeping_failure_variance.cc | 281 - .../crush/test/depth_variance.cc | 185 - .../crush/test/mixed.cc | 300 - .../crush/test/movement.cc | 223 - .../crush/test/movement_failed.cc | 246 - .../crush/test/overload.cc | 335 - .../crush/test/overload_variance.cc | 281 - .../crush/test/sizes.cc | 131 - .../crush/test/smallbucket.cc | 138 - .../crush/test/speed_bucket.cc | 86 - .../crush/test/speed_depth.cc | 174 - .../crush/test/speed_rush.cc | 145 - 
.../20070517_before_mds_merge/crush/test/t.cc | 25 - .../crush/test/testbucket.cc | 61 - .../crush/test/testnormal.cc | 51 - tags/20070517_before_mds_merge/csyn.cc | 101 - .../doc/Commitdir.txt | 22 - .../doc/Replication.txt | 19 - tags/20070517_before_mds_merge/doc/bdb.txt | 48 - .../20070517_before_mds_merge/doc/caching.txt | 200 - .../doc/dentries.txt | 4 - .../doc/file_modes.txt | 66 - tags/20070517_before_mds_merge/doc/header.txt | 12 - tags/20070517_before_mds_merge/doc/inos.txt | 11 - .../20070517_before_mds_merge/doc/journal.txt | 124 - .../doc/lazy_posix.txt | 53 - .../doc/osd_outline.txt | 37 - .../doc/osd_replication.txt | 226 - .../doc/performance.txt | 36 - .../doc/shared_write_states_nogo.txt | 39 - .../doc/shutdown.txt | 13 - .../ebofs/Allocator.cc | 692 -- .../ebofs/Allocator.h | 85 - .../ebofs/BlockDevice.cc | 777 -- .../ebofs/BlockDevice.h | 338 - .../ebofs/BufferCache.cc | 1147 --- .../ebofs/BufferCache.h | 697 -- tags/20070517_before_mds_merge/ebofs/Cnode.h | 100 - tags/20070517_before_mds_merge/ebofs/Ebofs.cc | 3270 --------- tags/20070517_before_mds_merge/ebofs/Ebofs.h | 330 - tags/20070517_before_mds_merge/ebofs/Onode.h | 390 - tags/20070517_before_mds_merge/ebofs/Table.h | 898 --- .../ebofs/mkfs.ebofs.cc | 299 - tags/20070517_before_mds_merge/ebofs/nodes.h | 583 -- .../ebofs/test.ebofs.cc | 224 - tags/20070517_before_mds_merge/ebofs/types.h | 168 - tags/20070517_before_mds_merge/fakefuse.cc | 156 - tags/20070517_before_mds_merge/fakesyn.cc | 189 - .../include/Context.h | 136 - .../include/Distribution.h | 74 - .../include/buffer.h | 1127 --- .../20070517_before_mds_merge/include/error.h | 40 - .../include/filepath.h | 206 - .../include/interval_set.h | 305 - tags/20070517_before_mds_merge/include/lru.h | 321 - .../include/object.h | 97 - .../include/oldbuffer.h | 357 - .../include/oldbufferlist.h | 681 -- .../include/rangeset.h | 252 - .../20070517_before_mds_merge/include/reqid.h | 64 - .../include/statlite.h | 70 - 
.../20070517_before_mds_merge/include/types.h | 367 - tags/20070517_before_mds_merge/include/uofs.h | 50 - tags/20070517_before_mds_merge/jobs/alc.tp | 38 - .../jobs/alcdat/makedirs | 45 - .../jobs/alcdat/makedirs.big | 45 - .../jobs/alcdat/makedirs.tput | 46 - .../jobs/alcdat/makefiles.shared | 32 - .../jobs/alcdat/openshared | 32 - .../jobs/alcdat/ossh.include | 45 - .../jobs/alcdat/ossh.include.big | 46 - .../jobs/alcdat/ossh.lib | 45 - .../jobs/alcdat/ossh.lib.big | 46 - .../jobs/alcdat/striping | 48 - tags/20070517_before_mds_merge/jobs/example | 56 - .../jobs/mds/log_striping | 36 - .../jobs/mds/makedir_lat | 33 - .../jobs/mds/makedirs | 40 - .../jobs/mds/opensshlib | 44 - tags/20070517_before_mds_merge/jobs/meta1 | 19 - .../jobs/meta1.proc.sh | 14 - tags/20070517_before_mds_merge/jobs/osd/ebofs | 51 - .../jobs/osd/mds_log | 43 - .../jobs/osd/osd_threads | 33 - .../jobs/osd/striping | 78 - .../jobs/osd/wr_lat2 | 44 - .../jobs/osd/write_sizes | 60 - .../jobs/rados/map_dist | 32 - .../jobs/rados/rep_lat | 43 - .../jobs/rados/wr_sizes | 50 - tags/20070517_before_mds_merge/mds/Anchor.h | 55 - .../mds/AnchorClient.cc | 149 - .../mds/AnchorClient.h | 55 - .../mds/AnchorTable.cc | 358 - .../mds/AnchorTable.h | 81 - tags/20070517_before_mds_merge/mds/CDentry.cc | 203 - tags/20070517_before_mds_merge/mds/CDentry.h | 288 - tags/20070517_before_mds_merge/mds/CDir.cc | 890 --- tags/20070517_before_mds_merge/mds/CDir.h | 617 -- tags/20070517_before_mds_merge/mds/CInode.cc | 506 -- tags/20070517_before_mds_merge/mds/CInode.h | 655 -- .../mds/Capability.h | 214 - .../20070517_before_mds_merge/mds/ClientMap.h | 85 - .../mds/IdAllocator.cc | 200 - .../mds/IdAllocator.h | 79 - tags/20070517_before_mds_merge/mds/Lock.h | 321 - tags/20070517_before_mds_merge/mds/Locker.cc | 2246 ------ tags/20070517_before_mds_merge/mds/Locker.h | 127 - .../20070517_before_mds_merge/mds/LogEvent.cc | 67 - tags/20070517_before_mds_merge/mds/LogEvent.h | 106 - .../mds/MDBalancer.cc | 878 --- 
.../mds/MDBalancer.h | 109 - tags/20070517_before_mds_merge/mds/MDCache.cc | 3536 --------- tags/20070517_before_mds_merge/mds/MDCache.h | 364 - tags/20070517_before_mds_merge/mds/MDLog.cc | 437 -- tags/20070517_before_mds_merge/mds/MDLog.h | 128 - tags/20070517_before_mds_merge/mds/MDS.cc | 1032 --- tags/20070517_before_mds_merge/mds/MDS.h | 269 - tags/20070517_before_mds_merge/mds/MDSMap.h | 288 - tags/20070517_before_mds_merge/mds/MDStore.cc | 752 -- tags/20070517_before_mds_merge/mds/MDStore.h | 75 - .../20070517_before_mds_merge/mds/Migrator.cc | 3616 --------- tags/20070517_before_mds_merge/mds/Migrator.h | 265 - tags/20070517_before_mds_merge/mds/Renamer.cc | 918 --- tags/20070517_before_mds_merge/mds/Renamer.h | 98 - tags/20070517_before_mds_merge/mds/Server.cc | 2389 ------ tags/20070517_before_mds_merge/mds/Server.h | 156 - .../mds/events/EAlloc.h | 76 - .../mds/events/EExportFinish.h | 59 - .../mds/events/EExportStart.h | 68 - .../mds/events/EImportFinish.h | 59 - .../mds/events/EImportMap.h | 66 - .../mds/events/EImportStart.h | 60 - .../mds/events/EMetaBlob.h | 339 - .../mds/events/EPurgeFinish.h | 48 - .../mds/events/EString.h | 56 - .../mds/events/EUnlink.h | 71 - .../mds/events/EUpdate.h | 49 - tags/20070517_before_mds_merge/mds/journal.cc | 589 -- tags/20070517_before_mds_merge/mds/mdstypes.h | 290 - .../messages/MAnchorReply.h | 74 - .../messages/MAnchorRequest.h | 76 - .../messages/MCacheExpire.h | 86 - .../messages/MClientBoot.h | 31 - .../messages/MClientFileCaps.h | 102 - .../messages/MClientInodeAuthUpdate.h | 46 - .../messages/MClientMount.h | 34 - .../messages/MClientMountAck.h | 59 - .../messages/MClientReply.h | 302 - .../messages/MClientRequest.h | 202 - .../messages/MDentryUnlink.h | 45 - .../messages/MDirExpire.h | 50 - .../messages/MDirExpireReq.h | 49 - .../messages/MDirUpdate.h | 71 - .../messages/MDiscover.h | 75 - .../messages/MDiscoverReply.h | 254 - .../messages/MExportDir.h | 64 - .../messages/MExportDirAck.h | 42 - 
.../messages/MExportDirDiscover.h | 51 - .../messages/MExportDirDiscoverAck.h | 52 - .../messages/MExportDirFinish.h | 43 - .../messages/MExportDirNotify.h | 111 - .../messages/MExportDirNotifyAck.h | 46 - .../messages/MExportDirPrep.h | 186 - .../messages/MExportDirPrepAck.h | 44 - .../messages/MExportDirWarning.h | 45 - .../messages/MFailure.h | 49 - .../messages/MFailureAck.h | 42 - .../messages/MGenericMessage.h | 44 - .../messages/MHashDir.h | 64 - .../messages/MHashDirAck.h | 42 - .../messages/MHashDirDiscover.h | 52 - .../messages/MHashDirDiscoverAck.h | 53 - .../messages/MHashDirNotify.h | 50 - .../messages/MHashDirPrep.h | 93 - .../messages/MHashDirPrepAck.h | 43 - .../messages/MHashReaddir.h | 44 - .../messages/MHashReaddirReply.h | 80 - .../messages/MHeartbeat.h | 81 - .../messages/MInodeExpire.h | 50 - .../messages/MInodeFileCaps.h | 55 - .../messages/MInodeLink.h | 47 - .../messages/MInodeLinkAck.h | 47 - .../messages/MInodeUnlink.h | 47 - .../messages/MInodeUnlinkAck.h | 44 - .../messages/MInodeUpdate.h | 61 - .../messages/MLock.h | 128 - .../messages/MMDSBeacon.h | 54 - .../messages/MMDSBoot.h | 38 - .../messages/MMDSCacheRejoin.h | 62 - .../messages/MMDSCacheRejoinAck.h | 82 - .../messages/MMDSGetMap.h | 38 - .../messages/MMDSImportMap.h | 59 - .../messages/MMDSMap.h | 78 - .../messages/MMonElectionAck.h | 31 - .../messages/MMonElectionCollect.h | 42 - .../messages/MMonElectionPropose.h | 32 - .../messages/MMonElectionRefresh.h | 51 - .../messages/MMonElectionStatus.h | 50 - .../messages/MMonElectionVictory.h | 40 - .../messages/MMonOSDMapInfo.h | 49 - .../messages/MMonOSDMapLease.h | 49 - .../messages/MMonOSDMapLeaseAck.h | 44 - .../messages/MMonOSDMapUpdateAck.h | 42 - .../messages/MMonOSDMapUpdateCommit.h | 42 - .../messages/MMonOSDMapUpdatePrepare.h | 52 - .../messages/MMonPaxos.h | 80 - .../messages/MNSConnect.h | 45 - .../messages/MNSConnectAck.h | 53 - .../messages/MNSFailure.h | 52 - .../messages/MNSLookup.h | 46 - 
.../messages/MNSLookupReply.h | 44 - .../messages/MNSRegister.h | 59 - .../messages/MNSRegisterAck.h | 53 - .../messages/MOSDBoot.h | 44 - .../messages/MOSDFailure.h | 49 - .../messages/MOSDGetMap.h | 45 - .../messages/MOSDIn.h | 42 - .../messages/MOSDMap.h | 69 - .../messages/MOSDOp.h | 221 - .../messages/MOSDOpReply.h | 148 - .../messages/MOSDOut.h | 42 - .../messages/MOSDPGLog.h | 61 - .../messages/MOSDPGNotify.h | 54 - .../messages/MOSDPGPeer.h | 57 - .../messages/MOSDPGPeerAck.h | 69 - .../messages/MOSDPGPeerRequest.h | 50 - .../messages/MOSDPGQuery.h | 51 - .../messages/MOSDPGRemove.h | 51 - .../messages/MOSDPGSummary.h | 65 - .../messages/MOSDPGUpdate.h | 64 - .../messages/MOSDPing.h | 50 - .../messages/MPing.h | 41 - .../messages/MPingAck.h | 40 - .../messages/MRename.h | 80 - .../messages/MRenameAck.h | 42 - .../messages/MRenameNotify.h | 80 - .../messages/MRenameNotifyAck.h | 40 - .../messages/MRenamePrep.h | 85 - .../messages/MRenameReq.h | 79 - .../messages/MRenameWarning.h | 40 - .../messages/MUnhashDir.h | 42 - .../messages/MUnhashDirAck.h | 65 - .../messages/MUnhashDirNotify.h | 50 - .../messages/MUnhashDirNotifyAck.h | 42 - .../messages/MUnhashDirPrep.h | 42 - .../messages/MUnhashDirPrepAck.h | 93 - tags/20070517_before_mds_merge/mkmonmap.cc | 67 - .../mon/ClientMonitor.cc | 109 - .../mon/ClientMonitor.h | 52 - tags/20070517_before_mds_merge/mon/Elector.cc | 219 - tags/20070517_before_mds_merge/mon/Elector.h | 72 - .../mon/MDSMonitor.cc | 370 - .../mon/MDSMonitor.h | 87 - tags/20070517_before_mds_merge/mon/MonMap.h | 103 - tags/20070517_before_mds_merge/mon/Monitor.cc | 303 - tags/20070517_before_mds_merge/mon/Monitor.h | 139 - .../mon/MonitorStore.cc | 224 - .../mon/MonitorStore.h | 81 - .../mon/OSDMonitor.cc | 902 --- .../mon/OSDMonitor.h | 110 - tags/20070517_before_mds_merge/mon/Paxos.cc | 182 - tags/20070517_before_mds_merge/mon/Paxos.h | 73 - .../msg/Dispatcher.cc | 27 - .../msg/Dispatcher.h | 33 - .../msg/FakeMessenger.cc | 338 - 
.../msg/FakeMessenger.h | 88 - .../msg/HostMonitor.cc | 235 - .../msg/HostMonitor.h | 97 - .../msg/MPIMessenger.cc | 608 -- .../msg/MPIMessenger.h | 56 - .../msg/MTMessenger.cc | 197 - .../msg/MTMessenger.h | 50 - tags/20070517_before_mds_merge/msg/Message.cc | 466 -- tags/20070517_before_mds_merge/msg/Message.h | 320 - .../msg/Messenger.cc | 38 - .../20070517_before_mds_merge/msg/Messenger.h | 86 - .../msg/NewMessenger.cc | 1714 ----- .../msg/NewMessenger.h | 305 - .../msg/NewerMessenger.cc | 1791 ----- .../msg/NewerMessenger.h | 343 - tags/20070517_before_mds_merge/msg/RWLock.h | 49 - .../msg/SerialMessenger.h | 28 - .../msg/SimpleMessenger.cc | 1197 --- .../msg/SimpleMessenger.h | 294 - .../msg/TCPDirectory.cc | 178 - .../msg/TCPDirectory.h | 110 - .../msg/TCPMessenger.cc | 1454 ---- .../msg/TCPMessenger.h | 115 - tags/20070517_before_mds_merge/msg/error.c | 77 - .../msg/mpistarter.cc | 62 - .../20070517_before_mds_merge/msg/msg_types.h | 186 - .../msg/new_mpistarter.cc | 43 - tags/20070517_before_mds_merge/msg/tcp.cc | 87 - tags/20070517_before_mds_merge/msg/tcp.h | 37 - tags/20070517_before_mds_merge/newsyn.cc | 419 -- tags/20070517_before_mds_merge/osbdb/OSBDB.cc | 2169 ------ tags/20070517_before_mds_merge/osbdb/OSBDB.h | 480 -- tags/20070517_before_mds_merge/osd/Ager.cc | 331 - tags/20070517_before_mds_merge/osd/Ager.h | 42 - tags/20070517_before_mds_merge/osd/BDBMap.h | 136 - tags/20070517_before_mds_merge/osd/Fake.h | 249 - .../osd/FakeStore.cc | 643 -- .../20070517_before_mds_merge/osd/FakeStore.h | 110 - .../osd/FakeStoreBDBCollections.h | 168 - .../osd/OBFSStore.cc | 244 - .../20070517_before_mds_merge/osd/OBFSStore.h | 56 - tags/20070517_before_mds_merge/osd/OSD.cc | 3494 --------- tags/20070517_before_mds_merge/osd/OSD.h | 273 - tags/20070517_before_mds_merge/osd/OSDMap.h | 519 -- .../osd/ObjectStore.cc | 149 - .../osd/ObjectStore.h | 505 -- tags/20070517_before_mds_merge/osd/PG.cc | 1333 ---- tags/20070517_before_mds_merge/osd/PG.h | 707 -- 
.../20070517_before_mds_merge/osd/osd_types.h | 174 - tags/20070517_before_mds_merge/osd/rush.cc | 230 - tags/20070517_before_mds_merge/osd/rush.h | 60 - tags/20070517_before_mds_merge/osd/tp.cc | 80 - tags/20070517_before_mds_merge/osdc/Blinker.h | 91 - tags/20070517_before_mds_merge/osdc/Filer.cc | 235 - tags/20070517_before_mds_merge/osdc/Filer.h | 158 - .../osdc/Journaler.cc | 610 -- .../osdc/Journaler.h | 218 - .../osdc/ObjectCacher.cc | 1555 ---- .../osdc/ObjectCacher.h | 558 -- .../osdc/Objecter.cc | 838 --- .../20070517_before_mds_merge/osdc/Objecter.h | 197 - .../script/add_header.pl | 29 - .../script/adjusttabs.pl | 24 - .../script/clean_osd_cow.sh | 3 - .../script/clean_trace.pl | 8 - tags/20070517_before_mds_merge/script/comb.pl | 113 - .../script/find_auth_pins.pl | 46 - .../script/find_bufferleaks.pl | 69 - .../script/find_lost_bdev_ops.pl | 34 - .../script/find_lost_commit.pl | 38 - .../script/find_lost_objecter.pl | 34 - .../script/find_pathpins.pl | 41 - .../script/find_requests.pl | 42 - .../script/find_waiters.pl | 46 - .../script/grepblock | 15 - .../script/merge_trace_rw.pl | 42 - .../script/profonly.pl | 12 - .../script/runset.pl | 380 - tags/20070517_before_mds_merge/script/sum.pl | 148 - .../20070517_before_mds_merge/test/fakemds.cc | 104 - .../test/gprof-helper.c | 120 - .../test/makedirs.cc | 38 - .../20070517_before_mds_merge/test/mpitest.cc | 111 - tags/20070517_before_mds_merge/test/mttest.cc | 140 - .../20070517_before_mds_merge/test/rushconfig | 7 - .../test/rushtest.cc | 49 - .../test/rushtest.cc~ | 49 - .../test/testbucket.cc | 67 - .../test/testbuffers.cc | 40 - .../test/testcrush.cc | 266 - .../test/testfilepath.cc | 22 - .../20070517_before_mds_merge/test/testmpi.cc | 53 - .../test/testnewbuffers.cc | 91 - tags/20070517_before_mds_merge/test/testos.cc | 343 - .../test/testosbdb.cc | 347 - .../test/testtree.cc | 46 - .../test/testxattr.cc | 31 - tags/20070517_before_mds_merge/valgrind.supp | 25 - trunk/ceph/COPYING | 504 -- 
trunk/ceph/README | 4 - trunk/ceph/active/activemaster.cc | 115 - trunk/ceph/active/activemaster.h | 18 - trunk/ceph/active/activetaskd.cc | 241 - trunk/ceph/active/activetaskd.h | 14 - trunk/ceph/active/client_init.cc | 1 - trunk/ceph/active/client_init.h | 2 - trunk/ceph/active/echotestclient.cc | 74 - trunk/ceph/active/echotestclient.h | 10 - trunk/ceph/active/inet.h | 9 - trunk/ceph/active/trivial_task.cc | 50 - trunk/ceph/active/trivial_task.h | 12 - trunk/ceph/cfuse.cc | 88 - trunk/ceph/client/FileCache.cc | 266 - trunk/ceph/client/FileCache.h | 85 - trunk/ceph/client/Trace.cc | 83 - trunk/ceph/client/Trace.h | 63 - trunk/ceph/client/fuse.cc | 306 - trunk/ceph/client/fuse.h | 24 - trunk/ceph/client/fuse_ll.cc | 397 - trunk/ceph/client/fuse_ll.h | 15 - trunk/ceph/client/hadoop/CephFSInterface.cc | 789 -- trunk/ceph/client/hadoop/CephFSInterface.h | 239 - trunk/ceph/client/ldceph.cc | 298 - trunk/ceph/cmds.cc | 108 - trunk/ceph/cmon.cc | 129 - trunk/ceph/cmonctl.cc | 92 - trunk/ceph/common/Clock.cc | 20 - trunk/ceph/common/Cond.h | 119 - trunk/ceph/common/DecayCounter.h | 138 - trunk/ceph/common/LogType.h | 122 - trunk/ceph/common/Logger.h | 77 - trunk/ceph/common/Mutex.h | 83 - trunk/ceph/common/RWLock.h | 50 - trunk/ceph/common/Semaphore.h | 53 - trunk/ceph/common/Thread.h | 81 - trunk/ceph/common/ThreadPool.h | 139 - trunk/ceph/common/Timer.cc | 335 - trunk/ceph/common/Timer.h | 175 - trunk/ceph/cosd.cc | 135 - trunk/ceph/crush.old/BinaryTree.h | 285 - trunk/ceph/crush.old/Bucket.h | 632 -- trunk/ceph/crush.old/Hash.h | 301 - trunk/ceph/crush.old/crush.h | 543 -- trunk/ceph/crush.old/test/bucket_movement.cc | 166 - trunk/ceph/crush.old/test/bucket_variance.cc | 199 - trunk/ceph/crush.old/test/cluster_movement.cc | 217 - .../crush.old/test/cluster_movement_remove.cc | 229 - .../crush.old/test/cluster_movement_rush.cc | 218 - trunk/ceph/crush.old/test/creeping_failure.cc | 276 - .../test/creeping_failure_variance.cc | 281 - 
trunk/ceph/crush.old/test/depth_variance.cc | 185 - trunk/ceph/crush.old/test/mixed.cc | 300 - trunk/ceph/crush.old/test/movement.cc | 223 - trunk/ceph/crush.old/test/movement_failed.cc | 246 - trunk/ceph/crush.old/test/overload.cc | 335 - .../ceph/crush.old/test/overload_variance.cc | 281 - trunk/ceph/crush.old/test/sizes.cc | 131 - trunk/ceph/crush.old/test/smallbucket.cc | 138 - trunk/ceph/crush.old/test/speed_bucket.cc | 86 - trunk/ceph/crush.old/test/speed_depth.cc | 174 - trunk/ceph/crush.old/test/speed_rush.cc | 145 - trunk/ceph/crush.old/test/t.cc | 25 - trunk/ceph/crush.old/test/testbucket.cc | 61 - trunk/ceph/crush.old/test/testnormal.cc | 51 - trunk/ceph/crush/buckets.c | 6 - trunk/ceph/crush/hash.h | 80 - trunk/ceph/crush/mapper.h | 19 - trunk/ceph/crush/types.h | 18 - trunk/ceph/csyn.cc | 87 - trunk/ceph/doc/Commitdir.txt | 24 - trunk/ceph/doc/anchortable.txt | 54 - trunk/ceph/doc/bdb.txt | 48 - trunk/ceph/doc/caching.txt | 303 - trunk/ceph/doc/exports.txt | 72 - trunk/ceph/doc/header.txt | 13 - trunk/ceph/doc/inos.txt | 11 - trunk/ceph/doc/lazy_posix.txt | 53 - trunk/ceph/doc/mds_locks.txt | 66 - trunk/ceph/doc/modeline.txt | 2 - trunk/ceph/doc/shared_write_states_nogo.txt | 39 - trunk/ceph/ebofs/Allocator.cc | 693 -- trunk/ceph/ebofs/Allocator.h | 85 - trunk/ceph/ebofs/FileJournal.cc | 456 -- trunk/ceph/ebofs/Journal.h | 47 - trunk/ceph/ebofs/mkfs.ebofs.cc | 349 - trunk/ceph/extractosdmaps.cc | 64 - trunk/ceph/include/Context.h | 153 - trunk/ceph/include/Distribution.h | 75 - trunk/ceph/include/bitmapper.h | 48 - trunk/ceph/include/blobhash.h | 47 - trunk/ceph/include/encodable.h | 424 -- trunk/ceph/include/error.h | 41 - trunk/ceph/include/filepath.h | 199 - trunk/ceph/include/lru.h | 341 - trunk/ceph/include/rangeset.h | 253 - trunk/ceph/include/statlite.h | 72 - trunk/ceph/include/triple.h | 28 - trunk/ceph/include/uofs.h | 51 - trunk/ceph/include/xlist.h | 123 - trunk/ceph/jobs/alc.tp | 38 - trunk/ceph/jobs/alcdat/makedirs | 45 - 
trunk/ceph/jobs/alcdat/makedirs.big | 45 - trunk/ceph/jobs/alcdat/makedirs.tput | 46 - trunk/ceph/jobs/alcdat/makefiles.shared | 32 - trunk/ceph/jobs/alcdat/openshared | 32 - trunk/ceph/jobs/alcdat/ossh.include | 45 - trunk/ceph/jobs/alcdat/ossh.include.big | 46 - trunk/ceph/jobs/alcdat/ossh.lib | 45 - trunk/ceph/jobs/alcdat/ossh.lib.big | 46 - trunk/ceph/jobs/alcdat/striping | 48 - trunk/ceph/jobs/example | 56 - trunk/ceph/jobs/mds/log_striping | 36 - trunk/ceph/jobs/mds/makedir_lat | 33 - trunk/ceph/jobs/mds/makedirs | 40 - trunk/ceph/jobs/mds/opensshlib | 44 - trunk/ceph/jobs/meta1 | 19 - trunk/ceph/jobs/meta1.proc.sh | 14 - trunk/ceph/jobs/osd/ebofs | 51 - trunk/ceph/jobs/osd/mds_log | 43 - trunk/ceph/jobs/osd/osd_threads | 33 - trunk/ceph/jobs/osd/striping | 78 - trunk/ceph/jobs/osd/wr_lat2 | 44 - trunk/ceph/jobs/osd/write_sizes | 60 - trunk/ceph/jobs/rados/map_dist | 32 - trunk/ceph/jobs/rados/rep_lat | 43 - trunk/ceph/jobs/runjobsample | 26 - trunk/ceph/mds/Anchor.h | 108 - trunk/ceph/mds/AnchorClient.cc | 365 - trunk/ceph/mds/AnchorClient.h | 107 - trunk/ceph/mds/AnchorTable.cc | 713 -- trunk/ceph/mds/AnchorTable.h | 127 - trunk/ceph/mds/CDentry.cc | 384 - trunk/ceph/mds/CDentry.h | 325 - trunk/ceph/mds/CDir.cc | 1676 ----- trunk/ceph/mds/CDir.h | 540 -- trunk/ceph/mds/CInode.cc | 844 --- trunk/ceph/mds/CInode.h | 615 -- trunk/ceph/mds/Capability.h | 245 - trunk/ceph/mds/ClientMap.cc | 126 - trunk/ceph/mds/ClientMap.h | 205 - trunk/ceph/mds/FileLock.h | 227 - trunk/ceph/mds/IdAllocator.cc | 205 - trunk/ceph/mds/IdAllocator.h | 78 - trunk/ceph/mds/LocalLock.h | 61 - trunk/ceph/mds/Locker.h | 195 - trunk/ceph/mds/LogEvent.cc | 87 - trunk/ceph/mds/LogEvent.h | 97 - trunk/ceph/mds/LogSegment.h | 69 - trunk/ceph/mds/MDBalancer.h | 118 - trunk/ceph/mds/MDCache.h | 726 -- trunk/ceph/mds/MDLog.h | 198 - trunk/ceph/mds/Migrator.h | 277 - trunk/ceph/mds/ScatterLock.h | 183 - trunk/ceph/mds/Server.h | 187 - trunk/ceph/mds/SimpleLock.h | 301 - 
trunk/ceph/mds/events/EAnchor.h | 80 - trunk/ceph/mds/events/EAnchorClient.h | 56 - trunk/ceph/mds/events/EExport.h | 63 - trunk/ceph/mds/events/EFragment.h | 54 - trunk/ceph/mds/events/EImportFinish.h | 60 - trunk/ceph/mds/events/EImportStart.h | 67 - trunk/ceph/mds/events/EMetaBlob.h | 501 -- trunk/ceph/mds/events/EOpen.h | 53 - trunk/ceph/mds/events/EPurgeFinish.h | 54 - trunk/ceph/mds/events/ESession.h | 64 - trunk/ceph/mds/events/ESessions.h | 55 - trunk/ceph/mds/events/ESlaveUpdate.h | 79 - trunk/ceph/mds/events/EString.h | 56 - trunk/ceph/mds/events/ESubtreeMap.h | 47 - trunk/ceph/mds/events/EUpdate.h | 53 - trunk/ceph/mds/journal.cc | 1126 --- trunk/ceph/messages/MAnchor.h | 74 - trunk/ceph/messages/MCacheExpire.h | 127 - trunk/ceph/messages/MDentryUnlink.h | 82 - trunk/ceph/messages/MDirUpdate.h | 74 - trunk/ceph/messages/MDiscover.h | 108 - trunk/ceph/messages/MDiscoverReply.h | 300 - trunk/ceph/messages/MExportCaps.h | 50 - trunk/ceph/messages/MExportCapsAck.h | 46 - trunk/ceph/messages/MExportDir.h | 65 - trunk/ceph/messages/MExportDirAck.h | 46 - trunk/ceph/messages/MExportDirCancel.h | 49 - trunk/ceph/messages/MExportDirDiscover.h | 59 - trunk/ceph/messages/MExportDirDiscoverAck.h | 60 - trunk/ceph/messages/MExportDirFinish.h | 46 - trunk/ceph/messages/MExportDirNotify.h | 85 - trunk/ceph/messages/MExportDirNotifyAck.h | 50 - trunk/ceph/messages/MExportDirPrep.h | 205 - trunk/ceph/messages/MExportDirPrepAck.h | 47 - trunk/ceph/messages/MExportDirWarning.h | 50 - trunk/ceph/messages/MExportDirWarningAck.h | 45 - trunk/ceph/messages/MGenericMessage.h | 45 - trunk/ceph/messages/MHeartbeat.h | 60 - trunk/ceph/messages/MInodeFileCaps.h | 57 - trunk/ceph/messages/MMDSBeacon.h | 67 - trunk/ceph/messages/MMDSBoot.h | 39 - trunk/ceph/messages/MMDSCacheRejoin.h | 230 - trunk/ceph/messages/MMDSFragmentNotify.h | 60 - trunk/ceph/messages/MMDSResolve.h | 66 - trunk/ceph/messages/MMDSResolveAck.h | 56 - trunk/ceph/messages/MMDSSlaveRequest.h | 148 - 
trunk/ceph/messages/MMonCommand.h | 54 - trunk/ceph/messages/MMonCommandAck.h | 46 - trunk/ceph/messages/MMonElection.h | 63 - trunk/ceph/messages/MMonElectionCollect.h | 43 - trunk/ceph/messages/MMonElectionRefresh.h | 52 - trunk/ceph/messages/MMonElectionStatus.h | 51 - trunk/ceph/messages/MMonOSDMapInfo.h | 50 - trunk/ceph/messages/MMonOSDMapLease.h | 50 - trunk/ceph/messages/MMonOSDMapLeaseAck.h | 45 - trunk/ceph/messages/MMonOSDMapUpdateAck.h | 43 - trunk/ceph/messages/MMonOSDMapUpdateCommit.h | 43 - trunk/ceph/messages/MMonOSDMapUpdatePrepare.h | 53 - trunk/ceph/messages/MMonPaxos.h | 98 - trunk/ceph/messages/MOSDBoot.h | 51 - trunk/ceph/messages/MOSDFailure.h | 55 - trunk/ceph/messages/MOSDIn.h | 43 - trunk/ceph/messages/MOSDOut.h | 43 - trunk/ceph/messages/MOSDPGActivateSet.h | 50 - trunk/ceph/messages/MOSDPGLog.h | 59 - trunk/ceph/messages/MOSDPGNotify.h | 55 - trunk/ceph/messages/MOSDPGPeer.h | 58 - trunk/ceph/messages/MOSDPGPeerAck.h | 70 - trunk/ceph/messages/MOSDPGPeerRequest.h | 51 - trunk/ceph/messages/MOSDPGRemove.h | 52 - trunk/ceph/messages/MOSDPGSummary.h | 69 - trunk/ceph/messages/MOSDPGUpdate.h | 71 - trunk/ceph/messages/MOSDPing.h | 49 - trunk/ceph/messages/MPGStats.h | 43 - trunk/ceph/mon/Elector.h | 92 - trunk/ceph/mon/MonitorStore.h | 82 - trunk/ceph/mon/PGMap.h | 103 - trunk/ceph/mon/PGMonitor.h | 58 - trunk/ceph/mon/PaxosService.cc | 172 - trunk/ceph/mon/mon_types.h | 35 - trunk/ceph/msg/Dispatcher.cc | 28 - trunk/ceph/msg/Dispatcher.h | 34 - trunk/ceph/msg/Messenger.cc | 39 - trunk/ceph/msg/tcp.cc | 93 - trunk/ceph/osbdb/OSBDB.cc | 2169 ------ trunk/ceph/osbdb/OSBDB.h | 482 -- trunk/ceph/osd/Ager.cc | 333 - trunk/ceph/osd/Ager.h | 44 - trunk/ceph/osd/BDBMap.h | 137 - trunk/ceph/osd/FakeStoreBDBCollections.h | 169 - trunk/ceph/osd/ObjectStore.cc | 152 - trunk/ceph/osdc/Blinker.h | 92 - trunk/ceph/osdc/Filer.cc | 235 - trunk/ceph/osdc/Filer.h | 165 - trunk/ceph/osdc/Journaler.cc | 666 -- trunk/ceph/osdc/Journaler.h | 237 - 
trunk/ceph/osdc/ObjectCacher.cc | 1587 ---- trunk/ceph/osdc/ObjectCacher.h | 566 -- trunk/ceph/script/add_header.pl | 26 - trunk/ceph/script/adjusttabs.pl | 24 - trunk/ceph/script/check_cache_dumps.pl | 56 - trunk/ceph/script/clean_osd_cow.sh | 3 - trunk/ceph/script/clean_trace.pl | 8 - trunk/ceph/script/comb.pl | 113 - trunk/ceph/script/convert_soe_trace.pl | 39 - trunk/ceph/script/find_auth_pins.pl | 51 - trunk/ceph/script/find_bufferleaks.pl | 69 - trunk/ceph/script/find_lost_bdev_ops.pl | 34 - trunk/ceph/script/find_lost_commit.pl | 38 - trunk/ceph/script/find_lost_objecter.pl | 34 - trunk/ceph/script/find_pathpins.pl | 41 - trunk/ceph/script/find_requests.pl | 42 - trunk/ceph/script/find_waiters.pl | 46 - trunk/ceph/script/fix_modeline.pl | 29 - trunk/ceph/script/gprofnewsyn | 12 - trunk/ceph/script/grepblock | 15 - trunk/ceph/script/merge_cdfs.pl | 24 - trunk/ceph/script/merge_trace_rw.pl | 42 - trunk/ceph/script/plot.pl | 48 - trunk/ceph/script/profonly.pl | 12 - trunk/ceph/script/runjob.pl | 341 - trunk/ceph/script/runset.pl | 380 - trunk/ceph/script/smooth.pl | 41 - trunk/ceph/script/study_find.pl | 224 - trunk/ceph/script/study_hardlink_lifetimes.pl | 131 - trunk/ceph/script/study_lookups.pl | 137 - trunk/ceph/script/sum.pl | 148 - trunk/ceph/test/fakemds.cc | 104 - trunk/ceph/test/fg.cc | 19 - trunk/ceph/test/gprof-helper.c | 120 - trunk/ceph/test/makedirs.cc | 38 - trunk/ceph/test/mpitest.cc | 111 - trunk/ceph/test/mttest.cc | 140 - trunk/ceph/test/rushconfig | 7 - trunk/ceph/test/rushtest.cc | 49 - trunk/ceph/test/rushtest.cc~ | 49 - trunk/ceph/test/test_disk_bw.cc | 59 - trunk/ceph/test/testbucket.cc | 67 - trunk/ceph/test/testbuffers.cc | 40 - trunk/ceph/test/testcounter.cc | 70 - trunk/ceph/test/testcrush.cc | 266 - trunk/ceph/test/testfilepath.cc | 22 - trunk/ceph/test/testmpi.cc | 53 - trunk/ceph/test/testnewbuffers.cc | 91 - trunk/ceph/test/testos.cc | 343 - trunk/ceph/test/testosbdb.cc | 347 - trunk/ceph/test/testtree.cc | 46 - 
trunk/ceph/test/testxattr.cc | 31 - {trunk/web => web}/Makefile | 0 {trunk/web => web}/ceph.css | 0 {trunk/web => web}/gen.pl | 0 .../web => web}/images/ceph-architecture.png | Bin {trunk/web => web}/images/ceph-logo1.jpg | Bin {trunk/web => web}/index.body | 0 {trunk/web => web}/overview.body | 0 {trunk/web => web}/publications.body | 0 {trunk/web => web}/source.body | 0 {trunk/web => web}/tasks.body | 0 {trunk/web => web}/template.html | 0 2647 files changed, 535217 deletions(-) rename {trunk/bench => bench}/mdtest/COPYRIGHT (100%) rename {trunk/bench => bench}/mdtest/Makefile (100%) rename {trunk/bench => bench}/mdtest/README (100%) rename {trunk/bench => bench}/mdtest/mdtest.c (100%) delete mode 100644 branches/marnberg/quota/Makefile delete mode 100644 branches/marnberg/quota/TODO delete mode 100644 branches/marnberg/quota/cfuse.cc delete mode 100644 branches/marnberg/quota/client/Client.cc delete mode 100644 branches/marnberg/quota/client/Client.h delete mode 100644 branches/marnberg/quota/client/FileCache.cc delete mode 100644 branches/marnberg/quota/client/FileCache.h delete mode 100644 branches/marnberg/quota/client/SyntheticClient.cc delete mode 100644 branches/marnberg/quota/client/SyntheticClient.h delete mode 100644 branches/marnberg/quota/client/Trace.cc delete mode 100644 branches/marnberg/quota/client/Trace.h delete mode 100644 branches/marnberg/quota/client/fuse.cc delete mode 100644 branches/marnberg/quota/client/fuse.h delete mode 100644 branches/marnberg/quota/client/hadoop/CephClientInterface.cc delete mode 100644 branches/marnberg/quota/client/hadoop/CephClientInterface.h delete mode 100644 branches/marnberg/quota/client/ldceph.cc delete mode 100644 branches/marnberg/quota/client/msgthread.h delete mode 100644 branches/marnberg/quota/cmds.cc delete mode 100644 branches/marnberg/quota/cmon.cc delete mode 100644 branches/marnberg/quota/common/Clock.cc delete mode 100644 branches/marnberg/quota/common/Clock.h delete mode 100644 
branches/marnberg/quota/common/Cond.h delete mode 100644 branches/marnberg/quota/common/DecayCounter.h delete mode 100644 branches/marnberg/quota/common/LogType.h delete mode 100644 branches/marnberg/quota/common/Logger.cc delete mode 100644 branches/marnberg/quota/common/Logger.h delete mode 100755 branches/marnberg/quota/common/Mutex.h delete mode 100644 branches/marnberg/quota/common/Semaphore.h delete mode 100644 branches/marnberg/quota/common/Thread.h delete mode 100644 branches/marnberg/quota/common/ThreadPool.h delete mode 100644 branches/marnberg/quota/common/Timer.cc delete mode 100644 branches/marnberg/quota/common/Timer.h delete mode 100644 branches/marnberg/quota/config.cc delete mode 100644 branches/marnberg/quota/config.h delete mode 100644 branches/marnberg/quota/cosd.cc delete mode 100644 branches/marnberg/quota/crush/BinaryTree.h delete mode 100644 branches/marnberg/quota/crush/Bucket.h delete mode 100644 branches/marnberg/quota/crush/Hash.h delete mode 100644 branches/marnberg/quota/crush/crush.h delete mode 100644 branches/marnberg/quota/csyn.cc delete mode 100644 branches/marnberg/quota/doc/Commitdir.txt delete mode 100644 branches/marnberg/quota/doc/Replication.txt delete mode 100644 branches/marnberg/quota/doc/caching.txt delete mode 100644 branches/marnberg/quota/doc/dentries.txt delete mode 100644 branches/marnberg/quota/doc/file_modes.txt delete mode 100644 branches/marnberg/quota/doc/header.txt delete mode 100644 branches/marnberg/quota/doc/journal.txt delete mode 100644 branches/marnberg/quota/doc/osd_outline.txt delete mode 100644 branches/marnberg/quota/doc/osd_replication.txt delete mode 100644 branches/marnberg/quota/doc/performance.txt delete mode 100644 branches/marnberg/quota/doc/shutdown.txt delete mode 100644 branches/marnberg/quota/ebofs/Allocator.cc delete mode 100644 branches/marnberg/quota/ebofs/Allocator.h delete mode 100644 branches/marnberg/quota/ebofs/BlockDevice.cc delete mode 100644 
branches/marnberg/quota/ebofs/BlockDevice.h delete mode 100644 branches/marnberg/quota/ebofs/BufferCache.cc delete mode 100644 branches/marnberg/quota/ebofs/BufferCache.h delete mode 100644 branches/marnberg/quota/ebofs/Cnode.h delete mode 100644 branches/marnberg/quota/ebofs/Ebofs.cc delete mode 100644 branches/marnberg/quota/ebofs/Ebofs.h delete mode 100644 branches/marnberg/quota/ebofs/Onode.h delete mode 100644 branches/marnberg/quota/ebofs/Table.h delete mode 100644 branches/marnberg/quota/ebofs/mkfs.ebofs.cc delete mode 100644 branches/marnberg/quota/ebofs/nodes.h delete mode 100644 branches/marnberg/quota/ebofs/test.ebofs.cc delete mode 100644 branches/marnberg/quota/ebofs/types.h delete mode 100644 branches/marnberg/quota/fakefuse.cc delete mode 100644 branches/marnberg/quota/fakemon.cc delete mode 100644 branches/marnberg/quota/fakesyn.cc delete mode 100644 branches/marnberg/quota/include/Context.h delete mode 100644 branches/marnberg/quota/include/Distribution.h delete mode 100644 branches/marnberg/quota/include/buffer.h delete mode 100644 branches/marnberg/quota/include/error.h delete mode 100644 branches/marnberg/quota/include/filepath.h delete mode 100644 branches/marnberg/quota/include/interval_set.h delete mode 100644 branches/marnberg/quota/include/lru.h delete mode 100644 branches/marnberg/quota/include/object.h delete mode 100644 branches/marnberg/quota/include/oldbuffer.h delete mode 100644 branches/marnberg/quota/include/oldbufferlist.h delete mode 100644 branches/marnberg/quota/include/rangeset.h delete mode 100644 branches/marnberg/quota/include/reqid.h delete mode 100644 branches/marnberg/quota/include/statlite.h delete mode 100644 branches/marnberg/quota/include/types.h delete mode 100644 branches/marnberg/quota/include/uofs.h delete mode 100644 branches/marnberg/quota/jobs/rados/wr_sizes delete mode 100644 branches/marnberg/quota/mds/Anchor.h delete mode 100644 branches/marnberg/quota/mds/AnchorClient.cc delete mode 100644 
branches/marnberg/quota/mds/AnchorClient.h delete mode 100644 branches/marnberg/quota/mds/AnchorTable.cc delete mode 100644 branches/marnberg/quota/mds/AnchorTable.h delete mode 100644 branches/marnberg/quota/mds/CDentry.cc delete mode 100644 branches/marnberg/quota/mds/CDentry.h delete mode 100644 branches/marnberg/quota/mds/CDir.cc delete mode 100644 branches/marnberg/quota/mds/CDir.h delete mode 100644 branches/marnberg/quota/mds/CInode.cc delete mode 100644 branches/marnberg/quota/mds/CInode.h delete mode 100644 branches/marnberg/quota/mds/Capability.h delete mode 100644 branches/marnberg/quota/mds/ClientMap.h delete mode 100644 branches/marnberg/quota/mds/IdAllocator.cc delete mode 100644 branches/marnberg/quota/mds/IdAllocator.h delete mode 100644 branches/marnberg/quota/mds/Lock.h delete mode 100644 branches/marnberg/quota/mds/Locker.cc delete mode 100644 branches/marnberg/quota/mds/Locker.h delete mode 100644 branches/marnberg/quota/mds/LogEvent.cc delete mode 100644 branches/marnberg/quota/mds/LogEvent.h delete mode 100644 branches/marnberg/quota/mds/MDBalancer.cc delete mode 100644 branches/marnberg/quota/mds/MDBalancer.h delete mode 100644 branches/marnberg/quota/mds/MDCache.cc delete mode 100644 branches/marnberg/quota/mds/MDCache.h delete mode 100644 branches/marnberg/quota/mds/MDLog.cc delete mode 100644 branches/marnberg/quota/mds/MDLog.h delete mode 100644 branches/marnberg/quota/mds/MDS.cc delete mode 100644 branches/marnberg/quota/mds/MDS.h delete mode 100644 branches/marnberg/quota/mds/MDSMap.h delete mode 100644 branches/marnberg/quota/mds/MDStore.cc delete mode 100644 branches/marnberg/quota/mds/MDStore.h delete mode 100644 branches/marnberg/quota/mds/Migrator.cc delete mode 100644 branches/marnberg/quota/mds/Migrator.h delete mode 100644 branches/marnberg/quota/mds/Renamer.cc delete mode 100644 branches/marnberg/quota/mds/Renamer.h delete mode 100644 branches/marnberg/quota/mds/Server.cc delete mode 100644 branches/marnberg/quota/mds/Server.h 
delete mode 100644 branches/marnberg/quota/mds/events/EAlloc.h delete mode 100644 branches/marnberg/quota/mds/events/EExportFinish.h delete mode 100644 branches/marnberg/quota/mds/events/EExportStart.h delete mode 100644 branches/marnberg/quota/mds/events/EImportFinish.h delete mode 100644 branches/marnberg/quota/mds/events/EImportMap.h delete mode 100644 branches/marnberg/quota/mds/events/EImportStart.h delete mode 100644 branches/marnberg/quota/mds/events/EMetaBlob.h delete mode 100644 branches/marnberg/quota/mds/events/EPurgeFinish.h delete mode 100644 branches/marnberg/quota/mds/events/EString.h delete mode 100644 branches/marnberg/quota/mds/events/EUnlink.h delete mode 100644 branches/marnberg/quota/mds/events/EUpdate.h delete mode 100644 branches/marnberg/quota/mds/journal.cc delete mode 100644 branches/marnberg/quota/mds/mdstypes.h delete mode 100644 branches/marnberg/quota/messages/MAnchorReply.h delete mode 100644 branches/marnberg/quota/messages/MAnchorRequest.h delete mode 100644 branches/marnberg/quota/messages/MCacheExpire.h delete mode 100644 branches/marnberg/quota/messages/MClientBoot.h delete mode 100644 branches/marnberg/quota/messages/MClientFileCaps.h delete mode 100644 branches/marnberg/quota/messages/MClientInodeAuthUpdate.h delete mode 100644 branches/marnberg/quota/messages/MClientMount.h delete mode 100644 branches/marnberg/quota/messages/MClientMountAck.h delete mode 100644 branches/marnberg/quota/messages/MClientReply.h delete mode 100644 branches/marnberg/quota/messages/MClientRequest.h delete mode 100644 branches/marnberg/quota/messages/MDentryUnlink.h delete mode 100644 branches/marnberg/quota/messages/MDirExpire.h delete mode 100644 branches/marnberg/quota/messages/MDirExpireReq.h delete mode 100644 branches/marnberg/quota/messages/MDirUpdate.h delete mode 100644 branches/marnberg/quota/messages/MDiscover.h delete mode 100644 branches/marnberg/quota/messages/MDiscoverReply.h delete mode 100644 
branches/marnberg/quota/messages/MExportDir.h delete mode 100644 branches/marnberg/quota/messages/MExportDirAck.h delete mode 100644 branches/marnberg/quota/messages/MExportDirDiscover.h delete mode 100644 branches/marnberg/quota/messages/MExportDirDiscoverAck.h delete mode 100644 branches/marnberg/quota/messages/MExportDirFinish.h delete mode 100644 branches/marnberg/quota/messages/MExportDirNotify.h delete mode 100644 branches/marnberg/quota/messages/MExportDirNotifyAck.h delete mode 100644 branches/marnberg/quota/messages/MExportDirPrep.h delete mode 100644 branches/marnberg/quota/messages/MExportDirPrepAck.h delete mode 100644 branches/marnberg/quota/messages/MExportDirWarning.h delete mode 100644 branches/marnberg/quota/messages/MFailure.h delete mode 100644 branches/marnberg/quota/messages/MFailureAck.h delete mode 100644 branches/marnberg/quota/messages/MGenericMessage.h delete mode 100644 branches/marnberg/quota/messages/MHashDir.h delete mode 100644 branches/marnberg/quota/messages/MHashDirAck.h delete mode 100644 branches/marnberg/quota/messages/MHashDirDiscover.h delete mode 100644 branches/marnberg/quota/messages/MHashDirDiscoverAck.h delete mode 100644 branches/marnberg/quota/messages/MHashDirNotify.h delete mode 100644 branches/marnberg/quota/messages/MHashDirPrep.h delete mode 100644 branches/marnberg/quota/messages/MHashDirPrepAck.h delete mode 100644 branches/marnberg/quota/messages/MHashReaddir.h delete mode 100644 branches/marnberg/quota/messages/MHashReaddirReply.h delete mode 100644 branches/marnberg/quota/messages/MHeartbeat.h delete mode 100644 branches/marnberg/quota/messages/MInodeExpire.h delete mode 100644 branches/marnberg/quota/messages/MInodeFileCaps.h delete mode 100644 branches/marnberg/quota/messages/MInodeLink.h delete mode 100644 branches/marnberg/quota/messages/MInodeLinkAck.h delete mode 100644 branches/marnberg/quota/messages/MInodeUnlink.h delete mode 100644 branches/marnberg/quota/messages/MInodeUnlinkAck.h delete mode 100644 
branches/marnberg/quota/messages/MInodeUpdate.h delete mode 100644 branches/marnberg/quota/messages/MLock.h delete mode 100644 branches/marnberg/quota/messages/MMDSBeacon.h delete mode 100644 branches/marnberg/quota/messages/MMDSBoot.h delete mode 100644 branches/marnberg/quota/messages/MMDSCacheRejoin.h delete mode 100644 branches/marnberg/quota/messages/MMDSCacheRejoinAck.h delete mode 100644 branches/marnberg/quota/messages/MMDSGetMap.h delete mode 100644 branches/marnberg/quota/messages/MMDSImportMap.h delete mode 100644 branches/marnberg/quota/messages/MMDSMap.h delete mode 100644 branches/marnberg/quota/messages/MMonElectionAck.h delete mode 100644 branches/marnberg/quota/messages/MMonElectionCollect.h delete mode 100644 branches/marnberg/quota/messages/MMonElectionPropose.h delete mode 100644 branches/marnberg/quota/messages/MMonElectionRefresh.h delete mode 100644 branches/marnberg/quota/messages/MMonElectionStatus.h delete mode 100644 branches/marnberg/quota/messages/MMonElectionVictory.h delete mode 100644 branches/marnberg/quota/messages/MMonOSDMapInfo.h delete mode 100644 branches/marnberg/quota/messages/MMonOSDMapLease.h delete mode 100644 branches/marnberg/quota/messages/MMonOSDMapLeaseAck.h delete mode 100644 branches/marnberg/quota/messages/MMonOSDMapUpdateAck.h delete mode 100644 branches/marnberg/quota/messages/MMonOSDMapUpdateCommit.h delete mode 100644 branches/marnberg/quota/messages/MMonOSDMapUpdatePrepare.h delete mode 100644 branches/marnberg/quota/messages/MMonPaxos.h delete mode 100644 branches/marnberg/quota/messages/MNSConnect.h delete mode 100644 branches/marnberg/quota/messages/MNSConnectAck.h delete mode 100644 branches/marnberg/quota/messages/MNSFailure.h delete mode 100644 branches/marnberg/quota/messages/MNSLookup.h delete mode 100644 branches/marnberg/quota/messages/MNSLookupReply.h delete mode 100644 branches/marnberg/quota/messages/MNSRegister.h delete mode 100644 branches/marnberg/quota/messages/MNSRegisterAck.h delete mode 
100644 branches/marnberg/quota/messages/MOSDBoot.h delete mode 100644 branches/marnberg/quota/messages/MOSDFailure.h delete mode 100644 branches/marnberg/quota/messages/MOSDGetMap.h delete mode 100644 branches/marnberg/quota/messages/MOSDIn.h delete mode 100644 branches/marnberg/quota/messages/MOSDMap.h delete mode 100644 branches/marnberg/quota/messages/MOSDOp.h delete mode 100644 branches/marnberg/quota/messages/MOSDOpReply.h delete mode 100644 branches/marnberg/quota/messages/MOSDOut.h delete mode 100644 branches/marnberg/quota/messages/MOSDPGLog.h delete mode 100644 branches/marnberg/quota/messages/MOSDPGNotify.h delete mode 100644 branches/marnberg/quota/messages/MOSDPGPeer.h delete mode 100644 branches/marnberg/quota/messages/MOSDPGPeerAck.h delete mode 100644 branches/marnberg/quota/messages/MOSDPGPeerRequest.h delete mode 100644 branches/marnberg/quota/messages/MOSDPGQuery.h delete mode 100644 branches/marnberg/quota/messages/MOSDPGRemove.h delete mode 100644 branches/marnberg/quota/messages/MOSDPGSummary.h delete mode 100644 branches/marnberg/quota/messages/MOSDPGUpdate.h delete mode 100644 branches/marnberg/quota/messages/MOSDPing.h delete mode 100644 branches/marnberg/quota/messages/MPing.h delete mode 100644 branches/marnberg/quota/messages/MPingAck.h delete mode 100644 branches/marnberg/quota/messages/MRename.h delete mode 100644 branches/marnberg/quota/messages/MRenameAck.h delete mode 100644 branches/marnberg/quota/messages/MRenameNotify.h delete mode 100644 branches/marnberg/quota/messages/MRenameNotifyAck.h delete mode 100644 branches/marnberg/quota/messages/MRenamePrep.h delete mode 100644 branches/marnberg/quota/messages/MRenameReq.h delete mode 100644 branches/marnberg/quota/messages/MRenameWarning.h delete mode 100644 branches/marnberg/quota/messages/MUnhashDir.h delete mode 100644 branches/marnberg/quota/messages/MUnhashDirAck.h delete mode 100644 branches/marnberg/quota/messages/MUnhashDirNotify.h delete mode 100644 
branches/marnberg/quota/messages/MUnhashDirNotifyAck.h delete mode 100644 branches/marnberg/quota/messages/MUnhashDirPrep.h delete mode 100644 branches/marnberg/quota/messages/MUnhashDirPrepAck.h delete mode 100644 branches/marnberg/quota/mkmonmap.cc delete mode 100644 branches/marnberg/quota/mon/ClientMonitor.cc delete mode 100644 branches/marnberg/quota/mon/ClientMonitor.h delete mode 100644 branches/marnberg/quota/mon/Elector.cc delete mode 100644 branches/marnberg/quota/mon/Elector.h delete mode 100644 branches/marnberg/quota/mon/MDSMonitor.cc delete mode 100644 branches/marnberg/quota/mon/MDSMonitor.h delete mode 100644 branches/marnberg/quota/mon/MonMap.h delete mode 100644 branches/marnberg/quota/mon/Monitor.cc delete mode 100644 branches/marnberg/quota/mon/Monitor.h delete mode 100644 branches/marnberg/quota/mon/MonitorStore.cc delete mode 100644 branches/marnberg/quota/mon/MonitorStore.h delete mode 100644 branches/marnberg/quota/mon/OSDMonitor.cc delete mode 100644 branches/marnberg/quota/mon/OSDMonitor.h delete mode 100644 branches/marnberg/quota/mon/Paxos.cc delete mode 100644 branches/marnberg/quota/mon/Paxos.h delete mode 100644 branches/marnberg/quota/msg/Dispatcher.cc delete mode 100644 branches/marnberg/quota/msg/Dispatcher.h delete mode 100644 branches/marnberg/quota/msg/FakeMessenger.cc delete mode 100644 branches/marnberg/quota/msg/FakeMessenger.h delete mode 100644 branches/marnberg/quota/msg/HostMonitor.cc delete mode 100644 branches/marnberg/quota/msg/HostMonitor.h delete mode 100644 branches/marnberg/quota/msg/MPIMessenger.cc delete mode 100644 branches/marnberg/quota/msg/MPIMessenger.h delete mode 100644 branches/marnberg/quota/msg/MTMessenger.cc delete mode 100644 branches/marnberg/quota/msg/MTMessenger.h delete mode 100644 branches/marnberg/quota/msg/Message.cc delete mode 100644 branches/marnberg/quota/msg/Message.h delete mode 100644 branches/marnberg/quota/msg/Messenger.cc delete mode 100644 branches/marnberg/quota/msg/Messenger.h 
delete mode 100644 branches/marnberg/quota/msg/NewMessenger.cc delete mode 100644 branches/marnberg/quota/msg/NewMessenger.h delete mode 100644 branches/marnberg/quota/msg/NewerMessenger.cc delete mode 100644 branches/marnberg/quota/msg/NewerMessenger.h delete mode 100644 branches/marnberg/quota/msg/RWLock.h delete mode 100644 branches/marnberg/quota/msg/SerialMessenger.h delete mode 100644 branches/marnberg/quota/msg/SimpleMessenger.cc delete mode 100644 branches/marnberg/quota/msg/SimpleMessenger.h delete mode 100644 branches/marnberg/quota/msg/TCPDirectory.cc delete mode 100644 branches/marnberg/quota/msg/TCPDirectory.h delete mode 100644 branches/marnberg/quota/msg/TCPMessenger.cc delete mode 100644 branches/marnberg/quota/msg/TCPMessenger.h delete mode 100644 branches/marnberg/quota/msg/error.c delete mode 100644 branches/marnberg/quota/msg/mpistarter.cc delete mode 100644 branches/marnberg/quota/msg/msg_types.h delete mode 100644 branches/marnberg/quota/msg/new_mpistarter.cc delete mode 100644 branches/marnberg/quota/msg/tcp.cc delete mode 100644 branches/marnberg/quota/msg/tcp.h delete mode 100644 branches/marnberg/quota/newsyn.cc delete mode 100644 branches/marnberg/quota/osbdb/OSBDB.cc delete mode 100644 branches/marnberg/quota/osbdb/OSBDB.h delete mode 100644 branches/marnberg/quota/osd/Ager.cc delete mode 100644 branches/marnberg/quota/osd/Ager.h delete mode 100644 branches/marnberg/quota/osd/BDBMap.h delete mode 100644 branches/marnberg/quota/osd/Fake.h delete mode 100644 branches/marnberg/quota/osd/FakeStore.cc delete mode 100644 branches/marnberg/quota/osd/FakeStore.h delete mode 100644 branches/marnberg/quota/osd/FakeStoreBDBCollections.h delete mode 100644 branches/marnberg/quota/osd/OBFSStore.cc delete mode 100644 branches/marnberg/quota/osd/OBFSStore.h delete mode 100644 branches/marnberg/quota/osd/OSD.cc delete mode 100644 branches/marnberg/quota/osd/OSD.h delete mode 100644 branches/marnberg/quota/osd/OSDMap.h delete mode 100644 
branches/marnberg/quota/osd/ObjectStore.cc delete mode 100644 branches/marnberg/quota/osd/ObjectStore.h delete mode 100644 branches/marnberg/quota/osd/PG.cc delete mode 100644 branches/marnberg/quota/osd/PG.h delete mode 100644 branches/marnberg/quota/osd/osd_types.h delete mode 100644 branches/marnberg/quota/osd/rush.cc delete mode 100644 branches/marnberg/quota/osd/rush.h delete mode 100644 branches/marnberg/quota/osd/tp.cc delete mode 100644 branches/marnberg/quota/osdc/Blinker.h delete mode 100644 branches/marnberg/quota/osdc/Filer.cc delete mode 100644 branches/marnberg/quota/osdc/Filer.h delete mode 100644 branches/marnberg/quota/osdc/Journaler.cc delete mode 100644 branches/marnberg/quota/osdc/Journaler.h delete mode 100644 branches/marnberg/quota/osdc/ObjectCacher.cc delete mode 100644 branches/marnberg/quota/osdc/ObjectCacher.h delete mode 100644 branches/marnberg/quota/osdc/Objecter.cc delete mode 100644 branches/marnberg/quota/osdc/Objecter.h delete mode 100755 branches/marnberg/quota/script/add_header.pl delete mode 100755 branches/marnberg/quota/script/comb.pl delete mode 100755 branches/marnberg/quota/script/find_auth_pins.pl delete mode 100644 branches/marnberg/quota/tcpfuse.cc delete mode 100644 branches/marnberg/quota/tcpsyn.cc delete mode 100644 branches/marnberg/quota/valgrind.supp delete mode 100644 branches/sage/crush/COPYING delete mode 100644 branches/sage/crush/Makefile delete mode 100644 branches/sage/crush/README delete mode 100644 branches/sage/crush/TODO delete mode 100644 branches/sage/crush/active/README delete mode 100644 branches/sage/crush/active/activeslave.cc delete mode 100644 branches/sage/crush/active/activeslave.h delete mode 100644 branches/sage/crush/active/common.h delete mode 100644 branches/sage/crush/active/msgtestclient.cc delete mode 100644 branches/sage/crush/active/msgtestclient.h delete mode 100644 branches/sage/crush/active/utility.h delete mode 100644 branches/sage/crush/client/Client.cc delete mode 100644 
branches/sage/crush/client/Client.h delete mode 100644 branches/sage/crush/client/SyntheticClient.cc delete mode 100644 branches/sage/crush/client/SyntheticClient.h delete mode 100644 branches/sage/crush/cmon.cc delete mode 100644 branches/sage/crush/common/Clock.h delete mode 100644 branches/sage/crush/common/Logger.cc delete mode 100644 branches/sage/crush/config.cc delete mode 100644 branches/sage/crush/config.h delete mode 100644 branches/sage/crush/crush.old/test/bucket_movement.cc delete mode 100644 branches/sage/crush/crush.old/test/bucket_variance.cc delete mode 100644 branches/sage/crush/crush.old/test/cluster_movement.cc delete mode 100644 branches/sage/crush/crush.old/test/cluster_movement_remove.cc delete mode 100644 branches/sage/crush/crush.old/test/cluster_movement_rush.cc delete mode 100644 branches/sage/crush/crush.old/test/creeping_failure.cc delete mode 100644 branches/sage/crush/crush.old/test/creeping_failure_variance.cc delete mode 100644 branches/sage/crush/crush.old/test/depth_variance.cc delete mode 100644 branches/sage/crush/crush.old/test/mixed.cc delete mode 100644 branches/sage/crush/crush.old/test/movement.cc delete mode 100644 branches/sage/crush/crush.old/test/movement_failed.cc delete mode 100644 branches/sage/crush/crush.old/test/overload.cc delete mode 100644 branches/sage/crush/crush.old/test/overload_variance.cc delete mode 100644 branches/sage/crush/crush.old/test/sizes.cc delete mode 100644 branches/sage/crush/crush.old/test/smallbucket.cc delete mode 100644 branches/sage/crush/crush.old/test/speed_bucket.cc delete mode 100644 branches/sage/crush/crush.old/test/speed_depth.cc delete mode 100644 branches/sage/crush/crush.old/test/speed_rush.cc delete mode 100644 branches/sage/crush/crush.old/test/t.cc delete mode 100644 branches/sage/crush/crush.old/test/testbucket.cc delete mode 100644 branches/sage/crush/crush.old/test/testnormal.cc delete mode 100644 branches/sage/crush/crush/CrushWrapper.h delete mode 100644 
branches/sage/crush/crush/Makefile delete mode 100644 branches/sage/crush/crush/builder.c delete mode 100644 branches/sage/crush/crush/builder.h delete mode 100644 branches/sage/crush/crush/crush.c delete mode 100644 branches/sage/crush/crush/crush.h delete mode 100644 branches/sage/crush/crush/mapper.c delete mode 100644 branches/sage/crush/crush/test.c delete mode 100644 branches/sage/crush/doc/bdb.txt delete mode 100644 branches/sage/crush/doc/dentries.txt delete mode 100644 branches/sage/crush/doc/file_modes.txt delete mode 100644 branches/sage/crush/doc/inos.txt delete mode 100644 branches/sage/crush/doc/journal.txt delete mode 100644 branches/sage/crush/doc/lazy_posix.txt delete mode 100644 branches/sage/crush/doc/osd_outline.txt delete mode 100644 branches/sage/crush/doc/osd_replication.txt delete mode 100644 branches/sage/crush/doc/shared_write_states_nogo.txt delete mode 100644 branches/sage/crush/doc/shutdown.txt delete mode 100644 branches/sage/crush/dupstore.cc delete mode 100644 branches/sage/crush/ebofs/BlockDevice.cc delete mode 100644 branches/sage/crush/ebofs/BlockDevice.h delete mode 100644 branches/sage/crush/ebofs/BufferCache.cc delete mode 100644 branches/sage/crush/ebofs/BufferCache.h delete mode 100644 branches/sage/crush/ebofs/Cnode.h delete mode 100644 branches/sage/crush/ebofs/Ebofs.cc delete mode 100644 branches/sage/crush/ebofs/Ebofs.h delete mode 100644 branches/sage/crush/ebofs/FileJournal.h delete mode 100644 branches/sage/crush/ebofs/Onode.h delete mode 100644 branches/sage/crush/ebofs/Table.h delete mode 100644 branches/sage/crush/ebofs/nodes.h delete mode 100644 branches/sage/crush/ebofs/test.ebofs.cc delete mode 100644 branches/sage/crush/ebofs/types.h delete mode 100644 branches/sage/crush/fakefuse.cc delete mode 100644 branches/sage/crush/fakesyn.cc delete mode 100644 branches/sage/crush/include/buffer.h delete mode 100644 branches/sage/crush/include/ceph_fs.h delete mode 100644 branches/sage/crush/include/filepath.h delete mode 
100644 branches/sage/crush/include/frag.h delete mode 100644 branches/sage/crush/include/hash.h delete mode 100644 branches/sage/crush/include/interval_set.h delete mode 100644 branches/sage/crush/include/object.h delete mode 100644 branches/sage/crush/include/types.h delete mode 100644 branches/sage/crush/include/utime.h delete mode 100644 branches/sage/crush/jobs/alc.tp delete mode 100644 branches/sage/crush/jobs/alcdat/makedirs delete mode 100644 branches/sage/crush/jobs/alcdat/makedirs.big delete mode 100644 branches/sage/crush/jobs/alcdat/makedirs.tput delete mode 100644 branches/sage/crush/jobs/alcdat/makefiles.shared delete mode 100644 branches/sage/crush/jobs/alcdat/openshared delete mode 100644 branches/sage/crush/jobs/alcdat/ossh.include delete mode 100644 branches/sage/crush/jobs/alcdat/ossh.include.big delete mode 100644 branches/sage/crush/jobs/alcdat/ossh.lib delete mode 100644 branches/sage/crush/jobs/alcdat/ossh.lib.big delete mode 100644 branches/sage/crush/jobs/alcdat/striping delete mode 100644 branches/sage/crush/jobs/example delete mode 100644 branches/sage/crush/jobs/mds/log_striping delete mode 100644 branches/sage/crush/jobs/mds/makedir_lat delete mode 100644 branches/sage/crush/jobs/mds/makedirs delete mode 100644 branches/sage/crush/jobs/mds/opensshlib delete mode 100644 branches/sage/crush/jobs/meta1 delete mode 100755 branches/sage/crush/jobs/meta1.proc.sh delete mode 100644 branches/sage/crush/jobs/osd/ebofs delete mode 100644 branches/sage/crush/jobs/osd/mds_log delete mode 100644 branches/sage/crush/jobs/osd/osd_threads delete mode 100644 branches/sage/crush/jobs/osd/striping delete mode 100644 branches/sage/crush/jobs/osd/wr_lat2 delete mode 100644 branches/sage/crush/jobs/osd/write_sizes delete mode 100644 branches/sage/crush/jobs/rados/map_dist delete mode 100644 branches/sage/crush/jobs/rados/rep_lat delete mode 100644 branches/sage/crush/jobs/rados/wr_sizes delete mode 100644 branches/sage/crush/kernel/Makefile delete mode 100644 
branches/sage/crush/kernel/bufferlist.h delete mode 100644 branches/sage/crush/kernel/inode.c delete mode 100644 branches/sage/crush/kernel/kmsg.h delete mode 100644 branches/sage/crush/kernel/kmsgbits.h delete mode 100644 branches/sage/crush/kernel/mds_client.h delete mode 100644 branches/sage/crush/kernel/mdsmap.h delete mode 100644 branches/sage/crush/kernel/monmap.h delete mode 100644 branches/sage/crush/kernel/osd_client.h delete mode 100644 branches/sage/crush/kernel/super.h delete mode 100644 branches/sage/crush/mds/Anchor.h delete mode 100644 branches/sage/crush/mds/AnchorClient.cc delete mode 100644 branches/sage/crush/mds/AnchorTable.cc delete mode 100644 branches/sage/crush/mds/CDentry.cc delete mode 100644 branches/sage/crush/mds/CDentry.h delete mode 100644 branches/sage/crush/mds/CDir.cc delete mode 100644 branches/sage/crush/mds/CInode.cc delete mode 100644 branches/sage/crush/mds/CInode.h delete mode 100644 branches/sage/crush/mds/ClientMap.h delete mode 100644 branches/sage/crush/mds/Locker.cc delete mode 100644 branches/sage/crush/mds/LogEvent.cc delete mode 100644 branches/sage/crush/mds/LogEvent.h delete mode 100644 branches/sage/crush/mds/LogSegment.h delete mode 100644 branches/sage/crush/mds/MDBalancer.cc delete mode 100644 branches/sage/crush/mds/MDCache.cc delete mode 100644 branches/sage/crush/mds/MDCache.h delete mode 100644 branches/sage/crush/mds/MDLog.cc delete mode 100644 branches/sage/crush/mds/MDLog.h delete mode 100644 branches/sage/crush/mds/MDS.cc delete mode 100644 branches/sage/crush/mds/MDS.h delete mode 100644 branches/sage/crush/mds/MDSMap.h delete mode 100644 branches/sage/crush/mds/Migrator.cc delete mode 100644 branches/sage/crush/mds/Migrator.h delete mode 100644 branches/sage/crush/mds/Server.cc delete mode 100644 branches/sage/crush/mds/Server.h delete mode 100644 branches/sage/crush/mds/events/EImportStart.h delete mode 100644 branches/sage/crush/mds/events/ESession.h delete mode 100644 
branches/sage/crush/mds/events/EUpdate.h delete mode 100644 branches/sage/crush/mds/journal.cc delete mode 100644 branches/sage/crush/mds/mdstypes.h delete mode 100644 branches/sage/crush/messages/MClientFileCaps.h delete mode 100644 branches/sage/crush/messages/MClientMount.h delete mode 100644 branches/sage/crush/messages/MClientReconnect.h delete mode 100644 branches/sage/crush/messages/MClientReply.h delete mode 100644 branches/sage/crush/messages/MClientRequest.h delete mode 100644 branches/sage/crush/messages/MClientRequestForward.h delete mode 100644 branches/sage/crush/messages/MClientSession.h delete mode 100644 branches/sage/crush/messages/MClientUnmount.h delete mode 100644 branches/sage/crush/messages/MDirUpdate.h delete mode 100644 branches/sage/crush/messages/MExportDirDiscover.h delete mode 100644 branches/sage/crush/messages/MLock.h delete mode 100644 branches/sage/crush/messages/MMDSBeacon.h delete mode 100644 branches/sage/crush/messages/MMDSGetMap.h delete mode 100644 branches/sage/crush/messages/MMDSMap.h delete mode 100644 branches/sage/crush/messages/MMDSSlaveRequest.h delete mode 100644 branches/sage/crush/messages/MOSDGetMap.h delete mode 100644 branches/sage/crush/messages/MOSDMap.h delete mode 100644 branches/sage/crush/messages/MOSDOp.h delete mode 100644 branches/sage/crush/messages/MOSDOpReply.h delete mode 100644 branches/sage/crush/messages/MOSDPGQuery.h delete mode 100644 branches/sage/crush/messages/MPing.h delete mode 100644 branches/sage/crush/messages/MPingAck.h delete mode 100644 branches/sage/crush/messages/MStatfs.h delete mode 100644 branches/sage/crush/messages/MStatfsReply.h delete mode 100644 branches/sage/crush/mkmonmap.cc delete mode 100644 branches/sage/crush/mon/ClientMonitor.cc delete mode 100644 branches/sage/crush/mon/ClientMonitor.h delete mode 100644 branches/sage/crush/mon/Elector.cc delete mode 100644 branches/sage/crush/mon/MDSMonitor.cc delete mode 100644 branches/sage/crush/mon/MDSMonitor.h delete mode 100644 
branches/sage/crush/mon/MonMap.h delete mode 100644 branches/sage/crush/mon/Monitor.cc delete mode 100644 branches/sage/crush/mon/Monitor.h delete mode 100644 branches/sage/crush/mon/MonitorStore.cc delete mode 100644 branches/sage/crush/mon/OSDMonitor.cc delete mode 100644 branches/sage/crush/mon/OSDMonitor.h delete mode 100644 branches/sage/crush/mon/PGMonitor.cc delete mode 100644 branches/sage/crush/mon/Paxos.cc delete mode 100644 branches/sage/crush/mon/Paxos.h delete mode 100644 branches/sage/crush/mon/PaxosService.h delete mode 100644 branches/sage/crush/msg/FakeMessenger.cc delete mode 100644 branches/sage/crush/msg/FakeMessenger.h delete mode 100644 branches/sage/crush/msg/Message.cc delete mode 100644 branches/sage/crush/msg/Message.h delete mode 100644 branches/sage/crush/msg/Messenger.h delete mode 100644 branches/sage/crush/msg/SimpleMessenger.cc delete mode 100644 branches/sage/crush/msg/SimpleMessenger.h delete mode 100644 branches/sage/crush/msg/msg_types.h delete mode 100644 branches/sage/crush/msg/tcp.cc delete mode 100644 branches/sage/crush/msg/tcp.h delete mode 100644 branches/sage/crush/newsyn.cc delete mode 100644 branches/sage/crush/osd/Fake.h delete mode 100644 branches/sage/crush/osd/FakeStore.cc delete mode 100644 branches/sage/crush/osd/FakeStore.h delete mode 100644 branches/sage/crush/osd/OSD.cc delete mode 100644 branches/sage/crush/osd/OSD.h delete mode 100644 branches/sage/crush/osd/OSDMap.h delete mode 100644 branches/sage/crush/osd/ObjectStore.h delete mode 100644 branches/sage/crush/osd/PG.cc delete mode 100644 branches/sage/crush/osd/PG.h delete mode 100644 branches/sage/crush/osd/RAID4PG.cc delete mode 100644 branches/sage/crush/osd/RAID4PG.h delete mode 100644 branches/sage/crush/osd/ReplicatedPG.cc delete mode 100644 branches/sage/crush/osd/ReplicatedPG.h delete mode 100644 branches/sage/crush/osd/osd_types.h delete mode 100644 branches/sage/crush/osdc/Journaler.h delete mode 100644 branches/sage/crush/osdc/Objecter.cc delete 
mode 100644 branches/sage/crush/osdc/Objecter.h delete mode 100755 branches/sage/crush/script/adjusttabs.pl delete mode 100755 branches/sage/crush/script/clean_osd_cow.sh delete mode 100755 branches/sage/crush/script/clean_trace.pl delete mode 100755 branches/sage/crush/script/find_bufferleaks.pl delete mode 100755 branches/sage/crush/script/find_lost_bdev_ops.pl delete mode 100755 branches/sage/crush/script/find_lost_commit.pl delete mode 100755 branches/sage/crush/script/find_lost_objecter.pl delete mode 100755 branches/sage/crush/script/find_pathpins.pl delete mode 100755 branches/sage/crush/script/find_requests.pl delete mode 100755 branches/sage/crush/script/find_waiters.pl delete mode 100755 branches/sage/crush/script/grepblock delete mode 100644 branches/sage/crush/script/merge_trace_rw.pl delete mode 100755 branches/sage/crush/script/profonly.pl delete mode 100755 branches/sage/crush/script/runset.pl delete mode 100755 branches/sage/crush/script/sum.pl delete mode 100644 branches/sage/crush/test/fakemds.cc delete mode 100644 branches/sage/crush/test/gprof-helper.c delete mode 100644 branches/sage/crush/test/makedirs.cc delete mode 100644 branches/sage/crush/test/mpitest.cc delete mode 100644 branches/sage/crush/test/mttest.cc delete mode 100644 branches/sage/crush/test/rushconfig delete mode 100644 branches/sage/crush/test/rushtest.cc delete mode 100644 branches/sage/crush/test/rushtest.cc~ delete mode 100644 branches/sage/crush/test/test_seek_read.c delete mode 100644 branches/sage/crush/test/testbucket.cc delete mode 100644 branches/sage/crush/test/testbuffers.cc delete mode 100644 branches/sage/crush/test/testcrush.cc delete mode 100644 branches/sage/crush/test/testfilepath.cc delete mode 100644 branches/sage/crush/test/testmpi.cc delete mode 100644 branches/sage/crush/test/testnewbuffers.cc delete mode 100644 branches/sage/crush/test/testos.cc delete mode 100644 branches/sage/crush/test/testosbdb.cc delete mode 100644 
branches/sage/crush/test/testtree.cc delete mode 100644 branches/sage/crush/test/testxattr.cc delete mode 100644 branches/sage/crush/valgrind.supp delete mode 100644 branches/sage/ebofs2/COPYING delete mode 100644 branches/sage/ebofs2/Makefile delete mode 100644 branches/sage/ebofs2/README delete mode 100644 branches/sage/ebofs2/TODO delete mode 100644 branches/sage/ebofs2/active/README delete mode 100644 branches/sage/ebofs2/active/activemaster.cc delete mode 100644 branches/sage/ebofs2/active/activemaster.h delete mode 100644 branches/sage/ebofs2/active/activeslave.cc delete mode 100644 branches/sage/ebofs2/active/activeslave.h delete mode 100644 branches/sage/ebofs2/active/activetaskd.cc delete mode 100644 branches/sage/ebofs2/active/activetaskd.h delete mode 100644 branches/sage/ebofs2/active/client_init.cc delete mode 100644 branches/sage/ebofs2/active/client_init.h delete mode 100644 branches/sage/ebofs2/active/common.h delete mode 100644 branches/sage/ebofs2/active/echotestclient.cc delete mode 100644 branches/sage/ebofs2/active/echotestclient.h delete mode 100644 branches/sage/ebofs2/active/inet.h delete mode 100644 branches/sage/ebofs2/active/msgtestclient.cc delete mode 100644 branches/sage/ebofs2/active/msgtestclient.h delete mode 100644 branches/sage/ebofs2/active/trivial_task.cc delete mode 100644 branches/sage/ebofs2/active/trivial_task.h delete mode 100644 branches/sage/ebofs2/active/utility.h delete mode 100644 branches/sage/ebofs2/cfuse.cc delete mode 100644 branches/sage/ebofs2/client/Client.cc delete mode 100644 branches/sage/ebofs2/client/Client.h delete mode 100644 branches/sage/ebofs2/client/FileCache.cc delete mode 100644 branches/sage/ebofs2/client/FileCache.h delete mode 100644 branches/sage/ebofs2/client/SyntheticClient.cc delete mode 100644 branches/sage/ebofs2/client/SyntheticClient.h delete mode 100644 branches/sage/ebofs2/client/Trace.cc delete mode 100644 branches/sage/ebofs2/client/Trace.h delete mode 100644 
branches/sage/ebofs2/client/fuse.cc delete mode 100644 branches/sage/ebofs2/client/fuse.h delete mode 100644 branches/sage/ebofs2/client/fuse_ll.cc delete mode 100644 branches/sage/ebofs2/client/fuse_ll.h delete mode 100644 branches/sage/ebofs2/client/hadoop/CephFSInterface.cc delete mode 100644 branches/sage/ebofs2/client/hadoop/CephFSInterface.h delete mode 100644 branches/sage/ebofs2/client/ldceph.cc delete mode 100644 branches/sage/ebofs2/cmds.cc delete mode 100644 branches/sage/ebofs2/cmonctl.cc delete mode 100644 branches/sage/ebofs2/common/Clock.cc delete mode 100644 branches/sage/ebofs2/common/Clock.h delete mode 100644 branches/sage/ebofs2/common/Cond.h delete mode 100644 branches/sage/ebofs2/common/DecayCounter.h delete mode 100644 branches/sage/ebofs2/common/LogType.h delete mode 100644 branches/sage/ebofs2/common/Logger.cc delete mode 100644 branches/sage/ebofs2/common/Logger.h delete mode 100755 branches/sage/ebofs2/common/Mutex.h delete mode 100644 branches/sage/ebofs2/common/RWLock.h delete mode 100644 branches/sage/ebofs2/common/Semaphore.h delete mode 100644 branches/sage/ebofs2/common/Thread.h delete mode 100644 branches/sage/ebofs2/common/ThreadPool.h delete mode 100644 branches/sage/ebofs2/common/Timer.cc delete mode 100644 branches/sage/ebofs2/common/Timer.h delete mode 100644 branches/sage/ebofs2/config.cc delete mode 100644 branches/sage/ebofs2/config.h delete mode 100644 branches/sage/ebofs2/cosd.cc delete mode 100644 branches/sage/ebofs2/crush.old/BinaryTree.h delete mode 100644 branches/sage/ebofs2/crush.old/Bucket.h delete mode 100644 branches/sage/ebofs2/crush.old/Hash.h delete mode 100644 branches/sage/ebofs2/crush.old/crush.h delete mode 100644 branches/sage/ebofs2/crush.old/test/bucket_movement.cc delete mode 100644 branches/sage/ebofs2/crush.old/test/bucket_variance.cc delete mode 100644 branches/sage/ebofs2/crush.old/test/cluster_movement.cc delete mode 100644 branches/sage/ebofs2/crush.old/test/cluster_movement_remove.cc delete 
mode 100644 branches/sage/ebofs2/crush.old/test/cluster_movement_rush.cc delete mode 100644 branches/sage/ebofs2/crush.old/test/creeping_failure.cc delete mode 100644 branches/sage/ebofs2/crush.old/test/creeping_failure_variance.cc delete mode 100644 branches/sage/ebofs2/crush.old/test/depth_variance.cc delete mode 100644 branches/sage/ebofs2/crush.old/test/mixed.cc delete mode 100644 branches/sage/ebofs2/crush.old/test/movement.cc delete mode 100644 branches/sage/ebofs2/crush.old/test/movement_failed.cc delete mode 100644 branches/sage/ebofs2/crush.old/test/overload.cc delete mode 100644 branches/sage/ebofs2/crush.old/test/overload_variance.cc delete mode 100644 branches/sage/ebofs2/crush.old/test/sizes.cc delete mode 100644 branches/sage/ebofs2/crush.old/test/smallbucket.cc delete mode 100644 branches/sage/ebofs2/crush.old/test/speed_bucket.cc delete mode 100644 branches/sage/ebofs2/crush.old/test/speed_depth.cc delete mode 100644 branches/sage/ebofs2/crush.old/test/speed_rush.cc delete mode 100644 branches/sage/ebofs2/crush.old/test/t.cc delete mode 100644 branches/sage/ebofs2/crush.old/test/testbucket.cc delete mode 100644 branches/sage/ebofs2/crush.old/test/testnormal.cc delete mode 100644 branches/sage/ebofs2/crush/CrushWrapper.h delete mode 100644 branches/sage/ebofs2/crush/Makefile delete mode 100644 branches/sage/ebofs2/crush/buckets.c delete mode 100644 branches/sage/ebofs2/crush/builder.c delete mode 100644 branches/sage/ebofs2/crush/builder.h delete mode 100644 branches/sage/ebofs2/crush/crush.c delete mode 100644 branches/sage/ebofs2/crush/crush.h delete mode 100644 branches/sage/ebofs2/crush/hash.h delete mode 100644 branches/sage/ebofs2/crush/mapper.c delete mode 100644 branches/sage/ebofs2/crush/mapper.h delete mode 100644 branches/sage/ebofs2/crush/test.c delete mode 100644 branches/sage/ebofs2/crush/types.h delete mode 100644 branches/sage/ebofs2/csyn.cc delete mode 100644 branches/sage/ebofs2/doc/Commitdir.txt delete mode 100644 
branches/sage/ebofs2/doc/anchortable.txt delete mode 100644 branches/sage/ebofs2/doc/bdb.txt delete mode 100644 branches/sage/ebofs2/doc/caching.txt delete mode 100644 branches/sage/ebofs2/doc/dentries.txt delete mode 100644 branches/sage/ebofs2/doc/exports.txt delete mode 100644 branches/sage/ebofs2/doc/file_modes.txt delete mode 100644 branches/sage/ebofs2/doc/header.txt delete mode 100644 branches/sage/ebofs2/doc/inos.txt delete mode 100644 branches/sage/ebofs2/doc/journal.txt delete mode 100644 branches/sage/ebofs2/doc/lazy_posix.txt delete mode 100644 branches/sage/ebofs2/doc/mds_locks.txt delete mode 100644 branches/sage/ebofs2/doc/modeline.txt delete mode 100644 branches/sage/ebofs2/doc/osd_outline.txt delete mode 100644 branches/sage/ebofs2/doc/osd_replication.txt delete mode 100644 branches/sage/ebofs2/doc/shared_write_states_nogo.txt delete mode 100644 branches/sage/ebofs2/doc/shutdown.txt delete mode 100644 branches/sage/ebofs2/dupstore.cc delete mode 100644 branches/sage/ebofs2/ebofs/Allocator.cc delete mode 100644 branches/sage/ebofs2/ebofs/Allocator.h delete mode 100644 branches/sage/ebofs2/ebofs/BlockDevice.cc delete mode 100644 branches/sage/ebofs2/ebofs/BlockDevice.h delete mode 100644 branches/sage/ebofs2/ebofs/BufferCache.cc delete mode 100644 branches/sage/ebofs2/ebofs/BufferCache.h delete mode 100644 branches/sage/ebofs2/ebofs/Cnode.h delete mode 100644 branches/sage/ebofs2/ebofs/Ebofs.cc delete mode 100644 branches/sage/ebofs2/ebofs/Ebofs.h delete mode 100644 branches/sage/ebofs2/ebofs/FileJournal.cc delete mode 100644 branches/sage/ebofs2/ebofs/FileJournal.h delete mode 100644 branches/sage/ebofs2/ebofs/Journal.h delete mode 100644 branches/sage/ebofs2/ebofs/Onode.h delete mode 100644 branches/sage/ebofs2/ebofs/Table.h delete mode 100644 branches/sage/ebofs2/ebofs/mkfs.ebofs.cc delete mode 100644 branches/sage/ebofs2/ebofs/nodes.h delete mode 100644 branches/sage/ebofs2/ebofs/test.ebofs.cc delete mode 100644 branches/sage/ebofs2/ebofs/types.h 
delete mode 100644 branches/sage/ebofs2/extractosdmaps.cc delete mode 100644 branches/sage/ebofs2/fakefuse.cc delete mode 100644 branches/sage/ebofs2/fakesyn.cc delete mode 100644 branches/sage/ebofs2/include/Context.h delete mode 100644 branches/sage/ebofs2/include/Distribution.h delete mode 100644 branches/sage/ebofs2/include/atomic.h delete mode 100644 branches/sage/ebofs2/include/bitmapper.h delete mode 100644 branches/sage/ebofs2/include/blobhash.h delete mode 100644 branches/sage/ebofs2/include/buffer.h delete mode 100644 branches/sage/ebofs2/include/ceph_fs.h delete mode 100644 branches/sage/ebofs2/include/encodable.h delete mode 100644 branches/sage/ebofs2/include/error.h delete mode 100644 branches/sage/ebofs2/include/filepath.h delete mode 100644 branches/sage/ebofs2/include/frag.h delete mode 100644 branches/sage/ebofs2/include/hash.h delete mode 100644 branches/sage/ebofs2/include/interval_set.h delete mode 100644 branches/sage/ebofs2/include/lru.h delete mode 100644 branches/sage/ebofs2/include/object.h delete mode 100644 branches/sage/ebofs2/include/rangeset.h delete mode 100644 branches/sage/ebofs2/include/statlite.h delete mode 100644 branches/sage/ebofs2/include/triple.h delete mode 100644 branches/sage/ebofs2/include/types.h delete mode 100644 branches/sage/ebofs2/include/uofs.h delete mode 100644 branches/sage/ebofs2/include/utime.h delete mode 100644 branches/sage/ebofs2/include/xlist.h delete mode 100644 branches/sage/ebofs2/jobs/alc.tp delete mode 100644 branches/sage/ebofs2/jobs/alcdat/makedirs delete mode 100644 branches/sage/ebofs2/jobs/alcdat/makedirs.big delete mode 100644 branches/sage/ebofs2/jobs/alcdat/makedirs.tput delete mode 100644 branches/sage/ebofs2/jobs/alcdat/makefiles.shared delete mode 100644 branches/sage/ebofs2/jobs/alcdat/openshared delete mode 100644 branches/sage/ebofs2/jobs/alcdat/ossh.include delete mode 100644 branches/sage/ebofs2/jobs/alcdat/ossh.include.big delete mode 100644 
branches/sage/ebofs2/jobs/alcdat/ossh.lib delete mode 100644 branches/sage/ebofs2/jobs/alcdat/ossh.lib.big delete mode 100644 branches/sage/ebofs2/jobs/alcdat/striping delete mode 100644 branches/sage/ebofs2/jobs/example delete mode 100644 branches/sage/ebofs2/jobs/mds/log_striping delete mode 100644 branches/sage/ebofs2/jobs/mds/makedir_lat delete mode 100644 branches/sage/ebofs2/jobs/mds/makedirs delete mode 100644 branches/sage/ebofs2/jobs/mds/opensshlib delete mode 100644 branches/sage/ebofs2/jobs/meta1 delete mode 100755 branches/sage/ebofs2/jobs/meta1.proc.sh delete mode 100644 branches/sage/ebofs2/jobs/osd/ebofs delete mode 100644 branches/sage/ebofs2/jobs/osd/mds_log delete mode 100644 branches/sage/ebofs2/jobs/osd/osd_threads delete mode 100644 branches/sage/ebofs2/jobs/osd/striping delete mode 100644 branches/sage/ebofs2/jobs/osd/wr_lat2 delete mode 100644 branches/sage/ebofs2/jobs/osd/write_sizes delete mode 100644 branches/sage/ebofs2/jobs/rados/map_dist delete mode 100644 branches/sage/ebofs2/jobs/rados/rep_lat delete mode 100644 branches/sage/ebofs2/jobs/rados/wr_sizes delete mode 100644 branches/sage/ebofs2/jobs/runjobsample delete mode 100644 branches/sage/ebofs2/kernel/Makefile delete mode 100644 branches/sage/ebofs2/kernel/accepter.h delete mode 100644 branches/sage/ebofs2/kernel/bufferlist.c delete mode 100644 branches/sage/ebofs2/kernel/bufferlist.h delete mode 100644 branches/sage/ebofs2/kernel/inode.c delete mode 100644 branches/sage/ebofs2/kernel/kmsg.h delete mode 100644 branches/sage/ebofs2/kernel/ktcp.c delete mode 100644 branches/sage/ebofs2/kernel/ktcp.h delete mode 100644 branches/sage/ebofs2/kernel/mds_client.c delete mode 100644 branches/sage/ebofs2/kernel/mds_client.h delete mode 100644 branches/sage/ebofs2/kernel/mdsmap.c delete mode 100644 branches/sage/ebofs2/kernel/mdsmap.h delete mode 100644 branches/sage/ebofs2/kernel/messenger.c delete mode 100644 branches/sage/ebofs2/kernel/mon_client.h delete mode 100644 
branches/sage/ebofs2/kernel/monmap.h delete mode 100644 branches/sage/ebofs2/kernel/osd_client.h delete mode 100644 branches/sage/ebofs2/kernel/super.h delete mode 100644 branches/sage/ebofs2/mds/AnchorClient.h delete mode 100644 branches/sage/ebofs2/mds/AnchorTable.h delete mode 100644 branches/sage/ebofs2/mds/CDentry.cc delete mode 100644 branches/sage/ebofs2/mds/CDentry.h delete mode 100644 branches/sage/ebofs2/mds/CDir.cc delete mode 100644 branches/sage/ebofs2/mds/CDir.h delete mode 100644 branches/sage/ebofs2/mds/CInode.cc delete mode 100644 branches/sage/ebofs2/mds/CInode.h delete mode 100644 branches/sage/ebofs2/mds/Capability.h delete mode 100644 branches/sage/ebofs2/mds/ClientMap.cc delete mode 100644 branches/sage/ebofs2/mds/ClientMap.h delete mode 100644 branches/sage/ebofs2/mds/FileLock.h delete mode 100644 branches/sage/ebofs2/mds/IdAllocator.cc delete mode 100644 branches/sage/ebofs2/mds/IdAllocator.h delete mode 100644 branches/sage/ebofs2/mds/LocalLock.h delete mode 100644 branches/sage/ebofs2/mds/Locker.cc delete mode 100644 branches/sage/ebofs2/mds/Locker.h delete mode 100644 branches/sage/ebofs2/mds/LogEvent.cc delete mode 100644 branches/sage/ebofs2/mds/LogEvent.h delete mode 100644 branches/sage/ebofs2/mds/LogSegment.h delete mode 100644 branches/sage/ebofs2/mds/MDBalancer.cc delete mode 100644 branches/sage/ebofs2/mds/MDBalancer.h delete mode 100644 branches/sage/ebofs2/mds/MDCache.cc delete mode 100644 branches/sage/ebofs2/mds/MDCache.h delete mode 100644 branches/sage/ebofs2/mds/MDLog.cc delete mode 100644 branches/sage/ebofs2/mds/MDLog.h delete mode 100644 branches/sage/ebofs2/mds/MDS.cc delete mode 100644 branches/sage/ebofs2/mds/MDS.h delete mode 100644 branches/sage/ebofs2/mds/MDSMap.h delete mode 100644 branches/sage/ebofs2/mds/Migrator.cc delete mode 100644 branches/sage/ebofs2/mds/Migrator.h delete mode 100644 branches/sage/ebofs2/mds/ScatterLock.h delete mode 100644 branches/sage/ebofs2/mds/Server.cc delete mode 100644 
branches/sage/ebofs2/mds/Server.h delete mode 100644 branches/sage/ebofs2/mds/SimpleLock.h delete mode 100644 branches/sage/ebofs2/mds/events/EAnchor.h delete mode 100644 branches/sage/ebofs2/mds/events/EAnchorClient.h delete mode 100644 branches/sage/ebofs2/mds/events/EExport.h delete mode 100644 branches/sage/ebofs2/mds/events/EFragment.h delete mode 100644 branches/sage/ebofs2/mds/events/EImportFinish.h delete mode 100644 branches/sage/ebofs2/mds/events/EImportStart.h delete mode 100644 branches/sage/ebofs2/mds/events/EMetaBlob.h delete mode 100644 branches/sage/ebofs2/mds/events/EOpen.h delete mode 100644 branches/sage/ebofs2/mds/events/EPurgeFinish.h delete mode 100644 branches/sage/ebofs2/mds/events/ESession.h delete mode 100644 branches/sage/ebofs2/mds/events/ESlaveUpdate.h delete mode 100644 branches/sage/ebofs2/mds/events/EString.h delete mode 100644 branches/sage/ebofs2/mds/events/ESubtreeMap.h delete mode 100644 branches/sage/ebofs2/mds/events/EUpdate.h delete mode 100644 branches/sage/ebofs2/mds/journal.cc delete mode 100644 branches/sage/ebofs2/mds/mdstypes.h delete mode 100644 branches/sage/ebofs2/messages/MAnchor.h delete mode 100644 branches/sage/ebofs2/messages/MCacheExpire.h delete mode 100644 branches/sage/ebofs2/messages/MClientFileCaps.h delete mode 100644 branches/sage/ebofs2/messages/MClientMount.h delete mode 100644 branches/sage/ebofs2/messages/MClientReconnect.h delete mode 100644 branches/sage/ebofs2/messages/MClientReply.h delete mode 100644 branches/sage/ebofs2/messages/MClientRequest.h delete mode 100644 branches/sage/ebofs2/messages/MClientRequestForward.h delete mode 100644 branches/sage/ebofs2/messages/MClientSession.h delete mode 100644 branches/sage/ebofs2/messages/MClientUnmount.h delete mode 100644 branches/sage/ebofs2/messages/MDentryUnlink.h delete mode 100644 branches/sage/ebofs2/messages/MDirUpdate.h delete mode 100644 branches/sage/ebofs2/messages/MDiscover.h delete mode 100644 branches/sage/ebofs2/messages/MDiscoverReply.h 
delete mode 100644 branches/sage/ebofs2/messages/MExportDir.h delete mode 100644 branches/sage/ebofs2/messages/MExportDirAck.h delete mode 100644 branches/sage/ebofs2/messages/MExportDirCancel.h delete mode 100644 branches/sage/ebofs2/messages/MExportDirDiscover.h delete mode 100644 branches/sage/ebofs2/messages/MExportDirDiscoverAck.h delete mode 100644 branches/sage/ebofs2/messages/MExportDirFinish.h delete mode 100644 branches/sage/ebofs2/messages/MExportDirNotify.h delete mode 100644 branches/sage/ebofs2/messages/MExportDirNotifyAck.h delete mode 100644 branches/sage/ebofs2/messages/MExportDirPrep.h delete mode 100644 branches/sage/ebofs2/messages/MExportDirPrepAck.h delete mode 100644 branches/sage/ebofs2/messages/MExportDirWarning.h delete mode 100644 branches/sage/ebofs2/messages/MExportDirWarningAck.h delete mode 100644 branches/sage/ebofs2/messages/MGenericMessage.h delete mode 100644 branches/sage/ebofs2/messages/MHeartbeat.h delete mode 100644 branches/sage/ebofs2/messages/MInodeFileCaps.h delete mode 100644 branches/sage/ebofs2/messages/MLock.h delete mode 100644 branches/sage/ebofs2/messages/MMDSBoot.h delete mode 100644 branches/sage/ebofs2/messages/MMDSCacheRejoin.h delete mode 100644 branches/sage/ebofs2/messages/MMDSFragmentNotify.h delete mode 100644 branches/sage/ebofs2/messages/MMDSGetMap.h delete mode 100644 branches/sage/ebofs2/messages/MMDSMap.h delete mode 100644 branches/sage/ebofs2/messages/MMDSResolve.h delete mode 100644 branches/sage/ebofs2/messages/MMDSResolveAck.h delete mode 100644 branches/sage/ebofs2/messages/MMDSSlaveRequest.h delete mode 100644 branches/sage/ebofs2/messages/MMonCommand.h delete mode 100644 branches/sage/ebofs2/messages/MMonCommandAck.h delete mode 100644 branches/sage/ebofs2/messages/MMonElection.h delete mode 100644 branches/sage/ebofs2/messages/MMonElectionCollect.h delete mode 100644 branches/sage/ebofs2/messages/MMonElectionRefresh.h delete mode 100644 branches/sage/ebofs2/messages/MMonElectionStatus.h delete 
mode 100644 branches/sage/ebofs2/messages/MMonOSDMapInfo.h delete mode 100644 branches/sage/ebofs2/messages/MMonOSDMapLease.h delete mode 100644 branches/sage/ebofs2/messages/MMonOSDMapLeaseAck.h delete mode 100644 branches/sage/ebofs2/messages/MMonOSDMapUpdateAck.h delete mode 100644 branches/sage/ebofs2/messages/MMonOSDMapUpdateCommit.h delete mode 100644 branches/sage/ebofs2/messages/MMonOSDMapUpdatePrepare.h delete mode 100644 branches/sage/ebofs2/messages/MMonPaxos.h delete mode 100644 branches/sage/ebofs2/messages/MOSDBoot.h delete mode 100644 branches/sage/ebofs2/messages/MOSDFailure.h delete mode 100644 branches/sage/ebofs2/messages/MOSDGetMap.h delete mode 100644 branches/sage/ebofs2/messages/MOSDIn.h delete mode 100644 branches/sage/ebofs2/messages/MOSDMap.h delete mode 100644 branches/sage/ebofs2/messages/MOSDOp.h delete mode 100644 branches/sage/ebofs2/messages/MOSDOpReply.h delete mode 100644 branches/sage/ebofs2/messages/MOSDOut.h delete mode 100644 branches/sage/ebofs2/messages/MOSDPGActivateSet.h delete mode 100644 branches/sage/ebofs2/messages/MOSDPGLog.h delete mode 100644 branches/sage/ebofs2/messages/MOSDPGNotify.h delete mode 100644 branches/sage/ebofs2/messages/MOSDPGPeer.h delete mode 100644 branches/sage/ebofs2/messages/MOSDPGPeerAck.h delete mode 100644 branches/sage/ebofs2/messages/MOSDPGPeerRequest.h delete mode 100644 branches/sage/ebofs2/messages/MOSDPGQuery.h delete mode 100644 branches/sage/ebofs2/messages/MOSDPGRemove.h delete mode 100644 branches/sage/ebofs2/messages/MOSDPGSummary.h delete mode 100644 branches/sage/ebofs2/messages/MOSDPGUpdate.h delete mode 100644 branches/sage/ebofs2/messages/MOSDPing.h delete mode 100644 branches/sage/ebofs2/messages/MPGStats.h delete mode 100644 branches/sage/ebofs2/messages/MPing.h delete mode 100644 branches/sage/ebofs2/messages/MPingAck.h delete mode 100644 branches/sage/ebofs2/messages/MStatfs.h delete mode 100644 branches/sage/ebofs2/messages/MStatfsReply.h delete mode 100644 
branches/sage/ebofs2/mkmonmap.cc delete mode 100644 branches/sage/ebofs2/mon/ClientMonitor.cc delete mode 100644 branches/sage/ebofs2/mon/ClientMonitor.h delete mode 100644 branches/sage/ebofs2/mon/Elector.cc delete mode 100644 branches/sage/ebofs2/mon/Elector.h delete mode 100644 branches/sage/ebofs2/mon/MDSMonitor.cc delete mode 100644 branches/sage/ebofs2/mon/MDSMonitor.h delete mode 100644 branches/sage/ebofs2/mon/MonMap.h delete mode 100644 branches/sage/ebofs2/mon/Monitor.cc delete mode 100644 branches/sage/ebofs2/mon/Monitor.h delete mode 100644 branches/sage/ebofs2/mon/MonitorStore.cc delete mode 100644 branches/sage/ebofs2/mon/MonitorStore.h delete mode 100644 branches/sage/ebofs2/mon/OSDMonitor.cc delete mode 100644 branches/sage/ebofs2/mon/OSDMonitor.h delete mode 100644 branches/sage/ebofs2/mon/PGMap.h delete mode 100644 branches/sage/ebofs2/mon/PGMonitor.cc delete mode 100644 branches/sage/ebofs2/mon/PGMonitor.h delete mode 100644 branches/sage/ebofs2/mon/Paxos.cc delete mode 100644 branches/sage/ebofs2/mon/Paxos.h delete mode 100644 branches/sage/ebofs2/mon/PaxosService.cc delete mode 100644 branches/sage/ebofs2/mon/PaxosService.h delete mode 100644 branches/sage/ebofs2/mon/mon_types.h delete mode 100644 branches/sage/ebofs2/msg/Dispatcher.cc delete mode 100644 branches/sage/ebofs2/msg/Dispatcher.h delete mode 100644 branches/sage/ebofs2/msg/FakeMessenger.cc delete mode 100644 branches/sage/ebofs2/msg/FakeMessenger.h delete mode 100644 branches/sage/ebofs2/msg/Message.cc delete mode 100644 branches/sage/ebofs2/msg/Message.h delete mode 100644 branches/sage/ebofs2/msg/Messenger.cc delete mode 100644 branches/sage/ebofs2/msg/Messenger.h delete mode 100644 branches/sage/ebofs2/msg/SimpleMessenger.cc delete mode 100644 branches/sage/ebofs2/msg/SimpleMessenger.h delete mode 100644 branches/sage/ebofs2/msg/msg_types.h delete mode 100644 branches/sage/ebofs2/msg/tcp.h delete mode 100644 branches/sage/ebofs2/newsyn.cc delete mode 100644 
branches/sage/ebofs2/osbdb/OSBDB.cc delete mode 100644 branches/sage/ebofs2/osbdb/OSBDB.h delete mode 100644 branches/sage/ebofs2/osd/Ager.cc delete mode 100644 branches/sage/ebofs2/osd/Ager.h delete mode 100644 branches/sage/ebofs2/osd/BDBMap.h delete mode 100644 branches/sage/ebofs2/osd/Fake.h delete mode 100644 branches/sage/ebofs2/osd/FakeStore.cc delete mode 100644 branches/sage/ebofs2/osd/FakeStore.h delete mode 100644 branches/sage/ebofs2/osd/FakeStoreBDBCollections.h delete mode 100644 branches/sage/ebofs2/osd/OSD.cc delete mode 100644 branches/sage/ebofs2/osd/OSD.h delete mode 100644 branches/sage/ebofs2/osd/OSDMap.h delete mode 100644 branches/sage/ebofs2/osd/ObjectStore.cc delete mode 100644 branches/sage/ebofs2/osd/ObjectStore.h delete mode 100644 branches/sage/ebofs2/osd/PG.cc delete mode 100644 branches/sage/ebofs2/osd/PG.h delete mode 100644 branches/sage/ebofs2/osd/RAID4PG.cc delete mode 100644 branches/sage/ebofs2/osd/RAID4PG.h delete mode 100644 branches/sage/ebofs2/osd/ReplicatedPG.cc delete mode 100644 branches/sage/ebofs2/osd/ReplicatedPG.h delete mode 100644 branches/sage/ebofs2/osd/osd_types.h delete mode 100644 branches/sage/ebofs2/osdc/Blinker.h delete mode 100644 branches/sage/ebofs2/osdc/Filer.cc delete mode 100644 branches/sage/ebofs2/osdc/Filer.h delete mode 100644 branches/sage/ebofs2/osdc/Journaler.cc delete mode 100644 branches/sage/ebofs2/osdc/Journaler.h delete mode 100644 branches/sage/ebofs2/osdc/ObjectCacher.cc delete mode 100644 branches/sage/ebofs2/osdc/ObjectCacher.h delete mode 100644 branches/sage/ebofs2/osdc/Objecter.cc delete mode 100644 branches/sage/ebofs2/osdc/Objecter.h delete mode 100755 branches/sage/ebofs2/script/add_header.pl delete mode 100755 branches/sage/ebofs2/script/adjusttabs.pl delete mode 100755 branches/sage/ebofs2/script/check_cache_dumps.pl delete mode 100755 branches/sage/ebofs2/script/clean_osd_cow.sh delete mode 100755 branches/sage/ebofs2/script/clean_trace.pl delete mode 100755 
branches/sage/ebofs2/script/comb.pl delete mode 100755 branches/sage/ebofs2/script/convert_soe_trace.pl delete mode 100755 branches/sage/ebofs2/script/find_auth_pins.pl delete mode 100755 branches/sage/ebofs2/script/find_bufferleaks.pl delete mode 100755 branches/sage/ebofs2/script/find_lost_bdev_ops.pl delete mode 100755 branches/sage/ebofs2/script/find_lost_commit.pl delete mode 100755 branches/sage/ebofs2/script/find_lost_objecter.pl delete mode 100755 branches/sage/ebofs2/script/find_pathpins.pl delete mode 100755 branches/sage/ebofs2/script/find_requests.pl delete mode 100755 branches/sage/ebofs2/script/find_waiters.pl delete mode 100755 branches/sage/ebofs2/script/fix_modeline.pl delete mode 100755 branches/sage/ebofs2/script/gprofnewsyn delete mode 100755 branches/sage/ebofs2/script/grepblock delete mode 100755 branches/sage/ebofs2/script/merge_cdfs.pl delete mode 100644 branches/sage/ebofs2/script/merge_trace_rw.pl delete mode 100755 branches/sage/ebofs2/script/plot.pl delete mode 100755 branches/sage/ebofs2/script/profonly.pl delete mode 100755 branches/sage/ebofs2/script/runjob.pl delete mode 100755 branches/sage/ebofs2/script/runset.pl delete mode 100755 branches/sage/ebofs2/script/smooth.pl delete mode 100755 branches/sage/ebofs2/script/study_find.pl delete mode 100755 branches/sage/ebofs2/script/study_hardlink_lifetimes.pl delete mode 100644 branches/sage/ebofs2/script/study_lookups.pl delete mode 100755 branches/sage/ebofs2/script/sum.pl delete mode 100644 branches/sage/ebofs2/test/fakemds.cc delete mode 100644 branches/sage/ebofs2/test/fg.cc delete mode 100644 branches/sage/ebofs2/test/gprof-helper.c delete mode 100644 branches/sage/ebofs2/test/makedirs.cc delete mode 100644 branches/sage/ebofs2/test/mpitest.cc delete mode 100644 branches/sage/ebofs2/test/mttest.cc delete mode 100644 branches/sage/ebofs2/test/rushconfig delete mode 100644 branches/sage/ebofs2/test/rushtest.cc delete mode 100644 branches/sage/ebofs2/test/rushtest.cc~ delete mode 
100644 branches/sage/ebofs2/test/test_disk_bw.cc delete mode 100644 branches/sage/ebofs2/test/test_seek_read.c delete mode 100644 branches/sage/ebofs2/test/testbucket.cc delete mode 100644 branches/sage/ebofs2/test/testbuffers.cc delete mode 100644 branches/sage/ebofs2/test/testcounter.cc delete mode 100644 branches/sage/ebofs2/test/testcrush.cc delete mode 100644 branches/sage/ebofs2/test/testfilepath.cc delete mode 100644 branches/sage/ebofs2/test/testmpi.cc delete mode 100644 branches/sage/ebofs2/test/testnewbuffers.cc delete mode 100644 branches/sage/ebofs2/test/testos.cc delete mode 100644 branches/sage/ebofs2/test/testosbdb.cc delete mode 100644 branches/sage/ebofs2/test/testtree.cc delete mode 100644 branches/sage/ebofs2/test/testxattr.cc delete mode 100644 branches/sage/ebofs2/valgrind.supp delete mode 100644 branches/sage/mds/COPYING delete mode 100644 branches/sage/mds/Makefile delete mode 100644 branches/sage/mds/README delete mode 100644 branches/sage/mds/TODO delete mode 100644 branches/sage/mds/active/README delete mode 100644 branches/sage/mds/active/activemaster.cc delete mode 100644 branches/sage/mds/active/activemaster.h delete mode 100644 branches/sage/mds/active/activeslave.cc delete mode 100644 branches/sage/mds/active/activeslave.h delete mode 100644 branches/sage/mds/active/activetaskd.cc delete mode 100644 branches/sage/mds/active/activetaskd.h delete mode 100644 branches/sage/mds/active/client_init.cc delete mode 100644 branches/sage/mds/active/client_init.h delete mode 100644 branches/sage/mds/active/common.h delete mode 100644 branches/sage/mds/active/echotestclient.cc delete mode 100644 branches/sage/mds/active/echotestclient.h delete mode 100644 branches/sage/mds/active/inet.h delete mode 100644 branches/sage/mds/active/msgtestclient.cc delete mode 100644 branches/sage/mds/active/msgtestclient.h delete mode 100644 branches/sage/mds/active/trivial_task.cc delete mode 100644 branches/sage/mds/active/trivial_task.h delete mode 100644 
branches/sage/mds/active/utility.h delete mode 100644 branches/sage/mds/cfuse.cc delete mode 100644 branches/sage/mds/client/Client.cc delete mode 100644 branches/sage/mds/client/Client.h delete mode 100644 branches/sage/mds/client/FileCache.cc delete mode 100644 branches/sage/mds/client/FileCache.h delete mode 100644 branches/sage/mds/client/SyntheticClient.cc delete mode 100644 branches/sage/mds/client/SyntheticClient.h delete mode 100644 branches/sage/mds/client/Trace.cc delete mode 100644 branches/sage/mds/client/Trace.h delete mode 100644 branches/sage/mds/client/fuse.cc delete mode 100644 branches/sage/mds/client/fuse.h delete mode 100644 branches/sage/mds/client/fuse_ll.cc delete mode 100644 branches/sage/mds/client/fuse_ll.h delete mode 100644 branches/sage/mds/client/hadoop/CephFSInterface.cc delete mode 100644 branches/sage/mds/client/hadoop/CephFSInterface.h delete mode 100644 branches/sage/mds/client/ldceph.cc delete mode 100644 branches/sage/mds/cmds.cc delete mode 100644 branches/sage/mds/cmon.cc delete mode 100644 branches/sage/mds/cmonctl.cc delete mode 100644 branches/sage/mds/common/Clock.cc delete mode 100644 branches/sage/mds/common/Clock.h delete mode 100644 branches/sage/mds/common/Cond.h delete mode 100644 branches/sage/mds/common/DecayCounter.h delete mode 100644 branches/sage/mds/common/LogType.h delete mode 100644 branches/sage/mds/common/Logger.cc delete mode 100644 branches/sage/mds/common/Logger.h delete mode 100755 branches/sage/mds/common/Mutex.h delete mode 100644 branches/sage/mds/common/RWLock.h delete mode 100644 branches/sage/mds/common/Semaphore.h delete mode 100644 branches/sage/mds/common/Thread.h delete mode 100644 branches/sage/mds/common/ThreadPool.h delete mode 100644 branches/sage/mds/common/Timer.cc delete mode 100644 branches/sage/mds/common/Timer.h delete mode 100644 branches/sage/mds/config.cc delete mode 100644 branches/sage/mds/config.h delete mode 100644 branches/sage/mds/cosd.cc delete mode 100644 
branches/sage/mds/crush/BinaryTree.h delete mode 100644 branches/sage/mds/crush/Bucket.h delete mode 100644 branches/sage/mds/crush/Hash.h delete mode 100644 branches/sage/mds/crush/crush.h delete mode 100644 branches/sage/mds/crush/test/bucket_movement.cc delete mode 100644 branches/sage/mds/crush/test/bucket_variance.cc delete mode 100644 branches/sage/mds/crush/test/cluster_movement.cc delete mode 100644 branches/sage/mds/crush/test/cluster_movement_remove.cc delete mode 100644 branches/sage/mds/crush/test/cluster_movement_rush.cc delete mode 100644 branches/sage/mds/crush/test/creeping_failure.cc delete mode 100644 branches/sage/mds/crush/test/creeping_failure_variance.cc delete mode 100644 branches/sage/mds/crush/test/depth_variance.cc delete mode 100644 branches/sage/mds/crush/test/mixed.cc delete mode 100644 branches/sage/mds/crush/test/movement.cc delete mode 100644 branches/sage/mds/crush/test/movement_failed.cc delete mode 100644 branches/sage/mds/crush/test/overload.cc delete mode 100644 branches/sage/mds/crush/test/overload_variance.cc delete mode 100644 branches/sage/mds/crush/test/sizes.cc delete mode 100644 branches/sage/mds/crush/test/smallbucket.cc delete mode 100644 branches/sage/mds/crush/test/speed_bucket.cc delete mode 100644 branches/sage/mds/crush/test/speed_depth.cc delete mode 100644 branches/sage/mds/crush/test/speed_rush.cc delete mode 100644 branches/sage/mds/crush/test/t.cc delete mode 100644 branches/sage/mds/crush/test/testbucket.cc delete mode 100644 branches/sage/mds/crush/test/testnormal.cc delete mode 100644 branches/sage/mds/crush2/Makefile delete mode 100644 branches/sage/mds/crush2/buckets.c delete mode 100644 branches/sage/mds/crush2/buckets.h delete mode 100644 branches/sage/mds/crush2/crush.c delete mode 100644 branches/sage/mds/crush2/crush.h delete mode 100644 branches/sage/mds/crush2/hash.h delete mode 100644 branches/sage/mds/crush2/types.h delete mode 100644 branches/sage/mds/csyn.cc delete mode 100644 
branches/sage/mds/doc/Commitdir.txt delete mode 100644 branches/sage/mds/doc/anchortable.txt delete mode 100644 branches/sage/mds/doc/bdb.txt delete mode 100644 branches/sage/mds/doc/caching.txt delete mode 100644 branches/sage/mds/doc/dentries.txt delete mode 100644 branches/sage/mds/doc/exports.txt delete mode 100644 branches/sage/mds/doc/file_modes.txt delete mode 100644 branches/sage/mds/doc/header.txt delete mode 100644 branches/sage/mds/doc/inos.txt delete mode 100644 branches/sage/mds/doc/journal.txt delete mode 100644 branches/sage/mds/doc/lazy_posix.txt delete mode 100644 branches/sage/mds/doc/mds_locks.txt delete mode 100644 branches/sage/mds/doc/modeline.txt delete mode 100644 branches/sage/mds/doc/osd_outline.txt delete mode 100644 branches/sage/mds/doc/osd_replication.txt delete mode 100644 branches/sage/mds/doc/shared_write_states_nogo.txt delete mode 100644 branches/sage/mds/doc/shutdown.txt delete mode 100644 branches/sage/mds/dupstore.cc delete mode 100644 branches/sage/mds/ebofs/Allocator.cc delete mode 100644 branches/sage/mds/ebofs/Allocator.h delete mode 100644 branches/sage/mds/ebofs/BlockDevice.cc delete mode 100644 branches/sage/mds/ebofs/BlockDevice.h delete mode 100644 branches/sage/mds/ebofs/BufferCache.cc delete mode 100644 branches/sage/mds/ebofs/BufferCache.h delete mode 100644 branches/sage/mds/ebofs/Cnode.h delete mode 100644 branches/sage/mds/ebofs/Ebofs.cc delete mode 100644 branches/sage/mds/ebofs/Ebofs.h delete mode 100644 branches/sage/mds/ebofs/FileJournal.cc delete mode 100644 branches/sage/mds/ebofs/FileJournal.h delete mode 100644 branches/sage/mds/ebofs/Journal.h delete mode 100644 branches/sage/mds/ebofs/Onode.h delete mode 100644 branches/sage/mds/ebofs/Table.h delete mode 100644 branches/sage/mds/ebofs/mkfs.ebofs.cc delete mode 100644 branches/sage/mds/ebofs/nodes.h delete mode 100644 branches/sage/mds/ebofs/test.ebofs.cc delete mode 100644 branches/sage/mds/ebofs/types.h delete mode 100644 
branches/sage/mds/extractosdmaps.cc delete mode 100644 branches/sage/mds/fakefuse.cc delete mode 100644 branches/sage/mds/fakesyn.cc delete mode 100644 branches/sage/mds/include/Context.h delete mode 100644 branches/sage/mds/include/Distribution.h delete mode 100644 branches/sage/mds/include/bitmapper.h delete mode 100644 branches/sage/mds/include/blobhash.h delete mode 100644 branches/sage/mds/include/buffer.h delete mode 100644 branches/sage/mds/include/ceph_fs.h delete mode 100644 branches/sage/mds/include/encodable.h delete mode 100644 branches/sage/mds/include/error.h delete mode 100644 branches/sage/mds/include/frag.h delete mode 100644 branches/sage/mds/include/hash.h delete mode 100644 branches/sage/mds/include/interval_set.h delete mode 100644 branches/sage/mds/include/lru.h delete mode 100644 branches/sage/mds/include/object.h delete mode 100644 branches/sage/mds/include/rangeset.h delete mode 100644 branches/sage/mds/include/statlite.h delete mode 100644 branches/sage/mds/include/triple.h delete mode 100644 branches/sage/mds/include/types.h delete mode 100644 branches/sage/mds/include/uofs.h delete mode 100644 branches/sage/mds/include/utime.h delete mode 100644 branches/sage/mds/include/xlist.h delete mode 100644 branches/sage/mds/jobs/alc.tp delete mode 100644 branches/sage/mds/jobs/alcdat/makedirs delete mode 100644 branches/sage/mds/jobs/alcdat/makedirs.big delete mode 100644 branches/sage/mds/jobs/alcdat/makedirs.tput delete mode 100644 branches/sage/mds/jobs/alcdat/makefiles.shared delete mode 100644 branches/sage/mds/jobs/alcdat/openshared delete mode 100644 branches/sage/mds/jobs/alcdat/ossh.include delete mode 100644 branches/sage/mds/jobs/alcdat/ossh.include.big delete mode 100644 branches/sage/mds/jobs/alcdat/ossh.lib delete mode 100644 branches/sage/mds/jobs/alcdat/ossh.lib.big delete mode 100644 branches/sage/mds/jobs/alcdat/striping delete mode 100644 branches/sage/mds/jobs/example delete mode 100644 branches/sage/mds/jobs/mds/log_striping 
delete mode 100644 branches/sage/mds/jobs/mds/makedir_lat delete mode 100644 branches/sage/mds/jobs/mds/makedirs delete mode 100644 branches/sage/mds/jobs/mds/opensshlib delete mode 100644 branches/sage/mds/jobs/meta1 delete mode 100755 branches/sage/mds/jobs/meta1.proc.sh delete mode 100644 branches/sage/mds/jobs/osd/ebofs delete mode 100644 branches/sage/mds/jobs/osd/mds_log delete mode 100644 branches/sage/mds/jobs/osd/osd_threads delete mode 100644 branches/sage/mds/jobs/osd/striping delete mode 100644 branches/sage/mds/jobs/osd/wr_lat2 delete mode 100644 branches/sage/mds/jobs/osd/write_sizes delete mode 100644 branches/sage/mds/jobs/rados/map_dist delete mode 100644 branches/sage/mds/jobs/rados/rep_lat delete mode 100644 branches/sage/mds/jobs/rados/wr_sizes delete mode 100644 branches/sage/mds/jobs/runjobsample delete mode 100644 branches/sage/mds/kernel/Makefile delete mode 100644 branches/sage/mds/kernel/bufferlist.h delete mode 100644 branches/sage/mds/kernel/inode.c delete mode 100644 branches/sage/mds/kernel/kmsg.h delete mode 100644 branches/sage/mds/kernel/kmsgbits.h delete mode 100644 branches/sage/mds/kernel/mds_client.h delete mode 100644 branches/sage/mds/kernel/mdsmap.h delete mode 100644 branches/sage/mds/kernel/monmap.h delete mode 100644 branches/sage/mds/kernel/osd_client.h delete mode 100644 branches/sage/mds/kernel/super.h delete mode 100644 branches/sage/mds/mds/Anchor.h delete mode 100644 branches/sage/mds/mds/AnchorClient.cc delete mode 100644 branches/sage/mds/mds/AnchorClient.h delete mode 100644 branches/sage/mds/mds/AnchorTable.cc delete mode 100644 branches/sage/mds/mds/AnchorTable.h delete mode 100644 branches/sage/mds/mds/CDir.h delete mode 100644 branches/sage/mds/mds/Capability.h delete mode 100644 branches/sage/mds/mds/ClientMap.cc delete mode 100644 branches/sage/mds/mds/FileLock.h delete mode 100644 branches/sage/mds/mds/IdAllocator.cc delete mode 100644 branches/sage/mds/mds/IdAllocator.h delete mode 100644 
branches/sage/mds/mds/LocalLock.h delete mode 100644 branches/sage/mds/mds/Locker.cc delete mode 100644 branches/sage/mds/mds/Locker.h delete mode 100644 branches/sage/mds/mds/MDBalancer.cc delete mode 100644 branches/sage/mds/mds/MDBalancer.h delete mode 100644 branches/sage/mds/mds/MDCache.cc delete mode 100644 branches/sage/mds/mds/MDLog.cc delete mode 100644 branches/sage/mds/mds/MDS.cc delete mode 100644 branches/sage/mds/mds/MDS.h delete mode 100644 branches/sage/mds/mds/MDSMap.h delete mode 100644 branches/sage/mds/mds/Migrator.cc delete mode 100644 branches/sage/mds/mds/ScatterLock.h delete mode 100644 branches/sage/mds/mds/Server.cc delete mode 100644 branches/sage/mds/mds/SimpleLock.h delete mode 100644 branches/sage/mds/mds/events/EAnchor.h delete mode 100644 branches/sage/mds/mds/events/EAnchorClient.h delete mode 100644 branches/sage/mds/mds/events/EExport.h delete mode 100644 branches/sage/mds/mds/events/EFragment.h delete mode 100644 branches/sage/mds/mds/events/EImportFinish.h delete mode 100644 branches/sage/mds/mds/events/EMetaBlob.h delete mode 100644 branches/sage/mds/mds/events/EOpen.h delete mode 100644 branches/sage/mds/mds/events/EPurgeFinish.h delete mode 100644 branches/sage/mds/mds/events/ESlaveUpdate.h delete mode 100644 branches/sage/mds/mds/events/EString.h delete mode 100644 branches/sage/mds/mds/events/ESubtreeMap.h delete mode 100644 branches/sage/mds/mds/mdstypes.h delete mode 100644 branches/sage/mds/messages/MAnchor.h delete mode 100644 branches/sage/mds/messages/MCacheExpire.h delete mode 100644 branches/sage/mds/messages/MClientFileCaps.h delete mode 100644 branches/sage/mds/messages/MClientMount.h delete mode 100644 branches/sage/mds/messages/MClientReconnect.h delete mode 100644 branches/sage/mds/messages/MClientReply.h delete mode 100644 branches/sage/mds/messages/MClientRequest.h delete mode 100644 branches/sage/mds/messages/MClientRequestForward.h delete mode 100644 branches/sage/mds/messages/MClientSession.h delete mode 
100644 branches/sage/mds/messages/MClientUnmount.h delete mode 100644 branches/sage/mds/messages/MDentryUnlink.h delete mode 100644 branches/sage/mds/messages/MDiscover.h delete mode 100644 branches/sage/mds/messages/MDiscoverReply.h delete mode 100644 branches/sage/mds/messages/MExportDir.h delete mode 100644 branches/sage/mds/messages/MExportDirAck.h delete mode 100644 branches/sage/mds/messages/MExportDirCancel.h delete mode 100644 branches/sage/mds/messages/MExportDirDiscoverAck.h delete mode 100644 branches/sage/mds/messages/MExportDirFinish.h delete mode 100644 branches/sage/mds/messages/MExportDirNotify.h delete mode 100644 branches/sage/mds/messages/MExportDirNotifyAck.h delete mode 100644 branches/sage/mds/messages/MExportDirPrep.h delete mode 100644 branches/sage/mds/messages/MExportDirPrepAck.h delete mode 100644 branches/sage/mds/messages/MExportDirWarning.h delete mode 100644 branches/sage/mds/messages/MExportDirWarningAck.h delete mode 100644 branches/sage/mds/messages/MGenericMessage.h delete mode 100644 branches/sage/mds/messages/MHeartbeat.h delete mode 100644 branches/sage/mds/messages/MInodeFileCaps.h delete mode 100644 branches/sage/mds/messages/MLock.h delete mode 100644 branches/sage/mds/messages/MMDSBeacon.h delete mode 100644 branches/sage/mds/messages/MMDSBoot.h delete mode 100644 branches/sage/mds/messages/MMDSCacheRejoin.h delete mode 100644 branches/sage/mds/messages/MMDSFragmentNotify.h delete mode 100644 branches/sage/mds/messages/MMDSGetMap.h delete mode 100644 branches/sage/mds/messages/MMDSMap.h delete mode 100644 branches/sage/mds/messages/MMDSResolve.h delete mode 100644 branches/sage/mds/messages/MMDSResolveAck.h delete mode 100644 branches/sage/mds/messages/MMonCommand.h delete mode 100644 branches/sage/mds/messages/MMonCommandAck.h delete mode 100644 branches/sage/mds/messages/MMonElection.h delete mode 100644 branches/sage/mds/messages/MMonElectionCollect.h delete mode 100644 branches/sage/mds/messages/MMonElectionRefresh.h 
delete mode 100644 branches/sage/mds/messages/MMonElectionStatus.h delete mode 100644 branches/sage/mds/messages/MMonOSDMapInfo.h delete mode 100644 branches/sage/mds/messages/MMonOSDMapLease.h delete mode 100644 branches/sage/mds/messages/MMonOSDMapLeaseAck.h delete mode 100644 branches/sage/mds/messages/MMonOSDMapUpdateAck.h delete mode 100644 branches/sage/mds/messages/MMonOSDMapUpdateCommit.h delete mode 100644 branches/sage/mds/messages/MMonOSDMapUpdatePrepare.h delete mode 100644 branches/sage/mds/messages/MMonPaxos.h delete mode 100644 branches/sage/mds/messages/MOSDBoot.h delete mode 100644 branches/sage/mds/messages/MOSDFailure.h delete mode 100644 branches/sage/mds/messages/MOSDGetMap.h delete mode 100644 branches/sage/mds/messages/MOSDIn.h delete mode 100644 branches/sage/mds/messages/MOSDMap.h delete mode 100644 branches/sage/mds/messages/MOSDOp.h delete mode 100644 branches/sage/mds/messages/MOSDOpReply.h delete mode 100644 branches/sage/mds/messages/MOSDOut.h delete mode 100644 branches/sage/mds/messages/MOSDPGActivateSet.h delete mode 100644 branches/sage/mds/messages/MOSDPGLog.h delete mode 100644 branches/sage/mds/messages/MOSDPGNotify.h delete mode 100644 branches/sage/mds/messages/MOSDPGPeer.h delete mode 100644 branches/sage/mds/messages/MOSDPGPeerAck.h delete mode 100644 branches/sage/mds/messages/MOSDPGPeerRequest.h delete mode 100644 branches/sage/mds/messages/MOSDPGQuery.h delete mode 100644 branches/sage/mds/messages/MOSDPGRemove.h delete mode 100644 branches/sage/mds/messages/MOSDPGSummary.h delete mode 100644 branches/sage/mds/messages/MOSDPGUpdate.h delete mode 100644 branches/sage/mds/messages/MOSDPing.h delete mode 100644 branches/sage/mds/messages/MPGStats.h delete mode 100644 branches/sage/mds/messages/MPing.h delete mode 100644 branches/sage/mds/messages/MPingAck.h delete mode 100644 branches/sage/mds/messages/MStatfs.h delete mode 100644 branches/sage/mds/messages/MStatfsReply.h delete mode 100644 branches/sage/mds/mkmonmap.cc 
delete mode 100644 branches/sage/mds/mon/ClientMonitor.cc delete mode 100644 branches/sage/mds/mon/ClientMonitor.h delete mode 100644 branches/sage/mds/mon/Elector.cc delete mode 100644 branches/sage/mds/mon/Elector.h delete mode 100644 branches/sage/mds/mon/MDSMonitor.cc delete mode 100644 branches/sage/mds/mon/MDSMonitor.h delete mode 100644 branches/sage/mds/mon/MonMap.h delete mode 100644 branches/sage/mds/mon/Monitor.cc delete mode 100644 branches/sage/mds/mon/Monitor.h delete mode 100644 branches/sage/mds/mon/MonitorStore.cc delete mode 100644 branches/sage/mds/mon/MonitorStore.h delete mode 100644 branches/sage/mds/mon/OSDMonitor.cc delete mode 100644 branches/sage/mds/mon/OSDMonitor.h delete mode 100644 branches/sage/mds/mon/PGMap.h delete mode 100644 branches/sage/mds/mon/PGMonitor.cc delete mode 100644 branches/sage/mds/mon/PGMonitor.h delete mode 100644 branches/sage/mds/mon/Paxos.cc delete mode 100644 branches/sage/mds/mon/Paxos.h delete mode 100644 branches/sage/mds/mon/PaxosService.cc delete mode 100644 branches/sage/mds/mon/PaxosService.h delete mode 100644 branches/sage/mds/mon/mon_types.h delete mode 100644 branches/sage/mds/msg/Dispatcher.cc delete mode 100644 branches/sage/mds/msg/Dispatcher.h delete mode 100644 branches/sage/mds/msg/FakeMessenger.cc delete mode 100644 branches/sage/mds/msg/FakeMessenger.h delete mode 100644 branches/sage/mds/msg/Message.cc delete mode 100644 branches/sage/mds/msg/Message.h delete mode 100644 branches/sage/mds/msg/Messenger.cc delete mode 100644 branches/sage/mds/msg/Messenger.h delete mode 100644 branches/sage/mds/msg/SimpleMessenger.cc delete mode 100644 branches/sage/mds/msg/SimpleMessenger.h delete mode 100644 branches/sage/mds/msg/msg_types.h delete mode 100644 branches/sage/mds/msg/tcp.cc delete mode 100644 branches/sage/mds/msg/tcp.h delete mode 100644 branches/sage/mds/newsyn.cc delete mode 100644 branches/sage/mds/osbdb/OSBDB.cc delete mode 100644 branches/sage/mds/osbdb/OSBDB.h delete mode 100644 
branches/sage/mds/osd/Ager.cc delete mode 100644 branches/sage/mds/osd/Ager.h delete mode 100644 branches/sage/mds/osd/BDBMap.h delete mode 100644 branches/sage/mds/osd/Fake.h delete mode 100644 branches/sage/mds/osd/FakeStore.cc delete mode 100644 branches/sage/mds/osd/FakeStore.h delete mode 100644 branches/sage/mds/osd/FakeStoreBDBCollections.h delete mode 100644 branches/sage/mds/osd/OSD.cc delete mode 100644 branches/sage/mds/osd/OSD.h delete mode 100644 branches/sage/mds/osd/OSDMap.h delete mode 100644 branches/sage/mds/osd/ObjectStore.cc delete mode 100644 branches/sage/mds/osd/ObjectStore.h delete mode 100644 branches/sage/mds/osd/PG.cc delete mode 100644 branches/sage/mds/osd/PG.h delete mode 100644 branches/sage/mds/osd/RAID4PG.cc delete mode 100644 branches/sage/mds/osd/RAID4PG.h delete mode 100644 branches/sage/mds/osd/ReplicatedPG.cc delete mode 100644 branches/sage/mds/osd/ReplicatedPG.h delete mode 100644 branches/sage/mds/osd/osd_types.h delete mode 100644 branches/sage/mds/osdc/Blinker.h delete mode 100644 branches/sage/mds/osdc/Filer.cc delete mode 100644 branches/sage/mds/osdc/Filer.h delete mode 100644 branches/sage/mds/osdc/Journaler.cc delete mode 100644 branches/sage/mds/osdc/ObjectCacher.cc delete mode 100644 branches/sage/mds/osdc/ObjectCacher.h delete mode 100644 branches/sage/mds/osdc/Objecter.cc delete mode 100644 branches/sage/mds/osdc/Objecter.h delete mode 100755 branches/sage/mds/script/add_header.pl delete mode 100755 branches/sage/mds/script/adjusttabs.pl delete mode 100755 branches/sage/mds/script/check_cache_dumps.pl delete mode 100755 branches/sage/mds/script/clean_osd_cow.sh delete mode 100755 branches/sage/mds/script/clean_trace.pl delete mode 100755 branches/sage/mds/script/comb.pl delete mode 100755 branches/sage/mds/script/convert_soe_trace.pl delete mode 100755 branches/sage/mds/script/find_auth_pins.pl delete mode 100755 branches/sage/mds/script/find_bufferleaks.pl delete mode 100755 
branches/sage/mds/script/find_lost_bdev_ops.pl delete mode 100755 branches/sage/mds/script/find_lost_commit.pl delete mode 100755 branches/sage/mds/script/find_lost_objecter.pl delete mode 100755 branches/sage/mds/script/find_pathpins.pl delete mode 100755 branches/sage/mds/script/find_requests.pl delete mode 100755 branches/sage/mds/script/find_waiters.pl delete mode 100755 branches/sage/mds/script/fix_modeline.pl delete mode 100755 branches/sage/mds/script/gprofnewsyn delete mode 100755 branches/sage/mds/script/grepblock delete mode 100755 branches/sage/mds/script/merge_cdfs.pl delete mode 100644 branches/sage/mds/script/merge_trace_rw.pl delete mode 100755 branches/sage/mds/script/plot.pl delete mode 100755 branches/sage/mds/script/profonly.pl delete mode 100755 branches/sage/mds/script/runjob.pl delete mode 100755 branches/sage/mds/script/runset.pl delete mode 100755 branches/sage/mds/script/smooth.pl delete mode 100755 branches/sage/mds/script/study_find.pl delete mode 100755 branches/sage/mds/script/study_hardlink_lifetimes.pl delete mode 100644 branches/sage/mds/script/study_lookups.pl delete mode 100755 branches/sage/mds/script/sum.pl delete mode 100644 branches/sage/mds/test/fakemds.cc delete mode 100644 branches/sage/mds/test/fg.cc delete mode 100644 branches/sage/mds/test/gprof-helper.c delete mode 100644 branches/sage/mds/test/makedirs.cc delete mode 100644 branches/sage/mds/test/mpitest.cc delete mode 100644 branches/sage/mds/test/mttest.cc delete mode 100644 branches/sage/mds/test/rushconfig delete mode 100644 branches/sage/mds/test/rushtest.cc delete mode 100644 branches/sage/mds/test/rushtest.cc~ delete mode 100644 branches/sage/mds/test/test_disk_bw.cc delete mode 100644 branches/sage/mds/test/test_seek_read.c delete mode 100644 branches/sage/mds/test/testbucket.cc delete mode 100644 branches/sage/mds/test/testbuffers.cc delete mode 100644 branches/sage/mds/test/testcounter.cc delete mode 100644 branches/sage/mds/test/testcrush.cc delete mode 
100644 branches/sage/mds/test/testfilepath.cc delete mode 100644 branches/sage/mds/test/testmpi.cc delete mode 100644 branches/sage/mds/test/testnewbuffers.cc delete mode 100644 branches/sage/mds/test/testos.cc delete mode 100644 branches/sage/mds/test/testosbdb.cc delete mode 100644 branches/sage/mds/test/testtree.cc delete mode 100644 branches/sage/mds/test/testxattr.cc delete mode 100644 branches/sage/mds/valgrind.supp rename {trunk/fusetrace => fusetrace}/Makefile (100%) rename {trunk/fusetrace => fusetrace}/fusetrace_ll.cc (100%) rename {branches/marnberg/quota => src}/COPYING (100%) rename {trunk/ceph => src}/Makefile (100%) rename {branches/marnberg/quota => src}/README (100%) rename {trunk/ceph => src}/TODO (100%) rename {trunk/ceph => src}/active/README (100%) rename {branches/sage/crush => src}/active/activemaster.cc (100%) rename {branches/sage/crush => src}/active/activemaster.h (100%) rename {trunk/ceph => src}/active/activeslave.cc (100%) rename {trunk/ceph => src}/active/activeslave.h (100%) rename {branches/sage/crush => src}/active/activetaskd.cc (100%) rename {branches/sage/crush => src}/active/activetaskd.h (100%) rename {branches/sage/crush => src}/active/client_init.cc (100%) rename {branches/sage/crush => src}/active/client_init.h (100%) rename {trunk/ceph => src}/active/common.h (100%) rename {branches/sage/crush => src}/active/echotestclient.cc (100%) rename {branches/sage/crush => src}/active/echotestclient.h (100%) rename {branches/sage/crush => src}/active/inet.h (100%) rename {trunk/ceph => src}/active/msgtestclient.cc (100%) rename {trunk/ceph => src}/active/msgtestclient.h (100%) rename {branches/sage/crush => src}/active/trivial_task.cc (100%) rename {branches/sage/crush => src}/active/trivial_task.h (100%) rename {trunk/ceph => src}/active/utility.h (100%) rename {branches/sage/crush => src}/cfuse.cc (100%) rename {trunk/ceph => src}/client/Client.cc (100%) rename {trunk/ceph => src}/client/Client.h (100%) rename {branches/sage/crush 
=> src}/client/FileCache.cc (100%) rename {branches/sage/crush => src}/client/FileCache.h (100%) rename {trunk/ceph => src}/client/SyntheticClient.cc (100%) rename {trunk/ceph => src}/client/SyntheticClient.h (100%) rename {branches/sage/crush => src}/client/Trace.cc (100%) rename {branches/sage/crush => src}/client/Trace.h (100%) rename {branches/sage/crush => src}/client/fuse.cc (100%) rename {branches/sage/crush => src}/client/fuse.h (100%) rename {branches/sage/crush => src}/client/fuse_ll.cc (100%) rename {branches/sage/crush => src}/client/fuse_ll.h (100%) rename {branches/sage/crush => src}/client/hadoop/CephFSInterface.cc (100%) rename {branches/sage/crush => src}/client/hadoop/CephFSInterface.h (100%) rename {branches/sage/crush => src}/client/ldceph.cc (100%) rename {branches/sage/crush => src}/cmds.cc (100%) rename {branches/sage/ebofs2 => src}/cmon.cc (100%) rename {branches/sage/crush => src}/cmonctl.cc (100%) rename {branches/sage/crush => src}/common/Clock.cc (100%) rename {trunk/ceph => src}/common/Clock.h (100%) rename {branches/sage/crush => src}/common/Cond.h (100%) rename {branches/sage/crush => src}/common/DecayCounter.h (100%) rename {branches/sage/crush => src}/common/LogType.h (100%) rename {trunk/ceph => src}/common/Logger.cc (100%) rename {branches/sage/crush => src}/common/Logger.h (100%) rename {branches/sage/crush => src}/common/Mutex.h (100%) rename {branches/sage/crush => src}/common/RWLock.h (100%) rename {branches/sage/crush => src}/common/Semaphore.h (100%) rename {branches/sage/crush => src}/common/Thread.h (100%) rename {branches/sage/crush => src}/common/ThreadPool.h (100%) rename {branches/sage/crush => src}/common/Timer.cc (100%) rename {branches/sage/crush => src}/common/Timer.h (100%) rename {trunk/ceph => src}/config.cc (100%) rename {trunk/ceph => src}/config.h (100%) rename {branches/sage/crush => src}/cosd.cc (100%) rename {branches/sage/crush => src}/crush.old/BinaryTree.h (100%) rename {branches/sage/crush => 
src}/crush.old/Bucket.h (100%) rename {branches/sage/crush => src}/crush.old/Hash.h (100%) rename {branches/sage/crush => src}/crush.old/crush.h (100%) rename {branches/marnberg/quota/crush => src/crush.old}/test/bucket_movement.cc (100%) rename {branches/marnberg/quota/crush => src/crush.old}/test/bucket_variance.cc (100%) rename {branches/marnberg/quota/crush => src/crush.old}/test/cluster_movement.cc (100%) rename {branches/marnberg/quota/crush => src/crush.old}/test/cluster_movement_remove.cc (100%) rename {branches/marnberg/quota/crush => src/crush.old}/test/cluster_movement_rush.cc (100%) rename {branches/marnberg/quota/crush => src/crush.old}/test/creeping_failure.cc (100%) rename {branches/marnberg/quota/crush => src/crush.old}/test/creeping_failure_variance.cc (100%) rename {branches/marnberg/quota/crush => src/crush.old}/test/depth_variance.cc (100%) rename {branches/marnberg/quota/crush => src/crush.old}/test/mixed.cc (100%) rename {branches/marnberg/quota/crush => src/crush.old}/test/movement.cc (100%) rename {branches/marnberg/quota/crush => src/crush.old}/test/movement_failed.cc (100%) rename {branches/marnberg/quota/crush => src/crush.old}/test/overload.cc (100%) rename {branches/marnberg/quota/crush => src/crush.old}/test/overload_variance.cc (100%) rename {branches/marnberg/quota/crush => src/crush.old}/test/sizes.cc (100%) rename {branches/marnberg/quota/crush => src/crush.old}/test/smallbucket.cc (100%) rename {branches/marnberg/quota/crush => src/crush.old}/test/speed_bucket.cc (100%) rename {branches/marnberg/quota/crush => src/crush.old}/test/speed_depth.cc (100%) rename {branches/marnberg/quota/crush => src/crush.old}/test/speed_rush.cc (100%) rename {branches/marnberg/quota/crush => src/crush.old}/test/t.cc (100%) rename {branches/marnberg/quota/crush => src/crush.old}/test/testbucket.cc (100%) rename {branches/marnberg/quota/crush => src/crush.old}/test/testnormal.cc (100%) rename {trunk/ceph => src}/crush/CrushWrapper.h (100%) rename 
{trunk/ceph => src}/crush/Makefile (100%) rename {branches/sage/crush => src}/crush/buckets.c (100%) rename {trunk/ceph => src}/crush/builder.c (100%) rename {trunk/ceph => src}/crush/builder.h (100%) rename {trunk/ceph => src}/crush/crush.c (100%) rename {trunk/ceph => src}/crush/crush.h (100%) rename {branches/sage/crush => src}/crush/hash.h (100%) rename {trunk/ceph => src}/crush/mapper.c (100%) rename {branches/sage/crush => src}/crush/mapper.h (100%) rename {trunk/ceph => src}/crush/test.c (100%) rename {branches/sage/crush => src}/crush/types.h (100%) rename {branches/sage/crush => src}/csyn.cc (100%) rename {branches/sage/crush => src}/doc/Commitdir.txt (100%) rename {branches/sage/crush => src}/doc/anchortable.txt (100%) rename {branches/marnberg/quota => src}/doc/bdb.txt (100%) rename {branches/sage/crush => src}/doc/caching.txt (100%) rename {branches/sage/crush => src}/doc/exports.txt (100%) rename {branches/sage/crush => src}/doc/header.txt (100%) rename {branches/marnberg/quota => src}/doc/inos.txt (100%) rename {branches/marnberg/quota => src}/doc/lazy_posix.txt (100%) rename {branches/sage/crush => src}/doc/mds_locks.txt (100%) rename {branches/sage/crush => src}/doc/modeline.txt (100%) rename {branches/marnberg/quota => src}/doc/shared_write_states_nogo.txt (100%) rename {trunk/ceph => src}/dupstore.cc (100%) rename {branches/sage/crush => src}/ebofs/Allocator.cc (100%) rename {branches/sage/crush => src}/ebofs/Allocator.h (100%) rename {trunk/ceph => src}/ebofs/BlockDevice.cc (100%) rename {trunk/ceph => src}/ebofs/BlockDevice.h (100%) rename {trunk/ceph => src}/ebofs/BufferCache.cc (100%) rename {trunk/ceph => src}/ebofs/BufferCache.h (100%) rename {trunk/ceph => src}/ebofs/Cnode.h (100%) rename {trunk/ceph => src}/ebofs/Ebofs.cc (100%) rename {trunk/ceph => src}/ebofs/Ebofs.h (100%) rename {branches/sage/crush => src}/ebofs/FileJournal.cc (100%) rename {trunk/ceph => src}/ebofs/FileJournal.h (100%) rename {branches/sage/crush => 
src}/ebofs/Journal.h (100%) rename {trunk/ceph => src}/ebofs/Onode.h (100%) rename {trunk/ceph => src}/ebofs/Table.h (100%) rename {trunk/ceph => src}/ebofs/csum.h (100%) rename {branches/sage/crush => src}/ebofs/mkfs.ebofs.cc (100%) rename {trunk/ceph => src}/ebofs/nodes.h (100%) rename {trunk/ceph => src}/ebofs/test.ebofs.cc (100%) rename {trunk/ceph => src}/ebofs/types.h (100%) rename {branches/sage/crush => src}/extractosdmaps.cc (100%) rename {trunk/ceph => src}/fakefuse.cc (100%) rename {trunk/ceph => src}/fakesyn.cc (100%) rename {branches/sage/crush => src}/include/Context.h (100%) rename {branches/sage/crush => src}/include/Distribution.h (100%) rename {trunk/ceph => src}/include/atomic.h (100%) rename {branches/sage/crush => src}/include/bitmapper.h (100%) rename {branches/sage/crush => src}/include/blobhash.h (100%) rename {trunk/ceph => src}/include/buffer.h (100%) rename {trunk/ceph => src}/include/ceph_fs.h (100%) rename {branches/sage/crush => src}/include/encodable.h (100%) rename {branches/sage/crush => src}/include/error.h (100%) rename {branches/sage/mds => src}/include/filepath.h (100%) rename {trunk/ceph => src}/include/frag.h (100%) rename {trunk/ceph => src}/include/hash.h (100%) rename {trunk/ceph => src}/include/interval_set.h (100%) rename {branches/sage/crush => src}/include/lru.h (100%) rename {trunk/ceph => src}/include/object.h (100%) rename {trunk/ceph => src}/include/pobject.h (100%) rename {branches/sage/crush => src}/include/rangeset.h (100%) rename {branches/sage/crush => src}/include/statlite.h (100%) rename {branches/sage/crush => src}/include/triple.h (100%) rename {trunk/ceph => src}/include/types.h (100%) rename {branches/sage/crush => src}/include/uofs.h (100%) rename {trunk/ceph => src}/include/utime.h (100%) rename {branches/sage/crush => src}/include/xlist.h (100%) rename {branches/marnberg/quota => src}/jobs/alc.tp (100%) rename {branches/marnberg/quota => src}/jobs/alcdat/makedirs (100%) rename {branches/marnberg/quota 
=> src}/jobs/alcdat/makedirs.big (100%) rename {branches/marnberg/quota => src}/jobs/alcdat/makedirs.tput (100%) rename {branches/marnberg/quota => src}/jobs/alcdat/makefiles.shared (100%) rename {branches/marnberg/quota => src}/jobs/alcdat/openshared (100%) rename {branches/marnberg/quota => src}/jobs/alcdat/ossh.include (100%) rename {branches/marnberg/quota => src}/jobs/alcdat/ossh.include.big (100%) rename {branches/marnberg/quota => src}/jobs/alcdat/ossh.lib (100%) rename {branches/marnberg/quota => src}/jobs/alcdat/ossh.lib.big (100%) rename {branches/marnberg/quota => src}/jobs/alcdat/striping (100%) rename {branches/marnberg/quota => src}/jobs/example (100%) rename {branches/marnberg/quota => src}/jobs/mds/log_striping (100%) rename {branches/marnberg/quota => src}/jobs/mds/makedir_lat (100%) rename {branches/marnberg/quota => src}/jobs/mds/makedirs (100%) rename {branches/marnberg/quota => src}/jobs/mds/opensshlib (100%) rename {branches/marnberg/quota => src}/jobs/meta1 (100%) rename {branches/marnberg/quota => src}/jobs/meta1.proc.sh (100%) rename {branches/marnberg/quota => src}/jobs/osd/ebofs (100%) rename {branches/marnberg/quota => src}/jobs/osd/mds_log (100%) rename {branches/marnberg/quota => src}/jobs/osd/osd_threads (100%) rename {branches/marnberg/quota => src}/jobs/osd/striping (100%) rename {branches/marnberg/quota => src}/jobs/osd/wr_lat2 (100%) rename {branches/marnberg/quota => src}/jobs/osd/write_sizes (100%) rename {branches/marnberg/quota => src}/jobs/rados/map_dist (100%) rename {branches/marnberg/quota => src}/jobs/rados/rep_lat (100%) rename {trunk/ceph => src}/jobs/rados/wr_sizes (100%) rename {branches/sage/crush => src}/jobs/runjobsample (100%) rename {trunk/ceph => src}/kernel/Makefile (100%) rename {trunk/ceph => src}/kernel/README (100%) rename {trunk/ceph => src}/kernel/addr.c (100%) rename {trunk/ceph => src}/kernel/client.c (100%) rename {trunk/ceph => src}/kernel/client.h (100%) rename {trunk/ceph => 
src}/kernel/crush/crush.c (100%) rename {trunk/ceph => src}/kernel/crush/crush.h (100%) rename {trunk/ceph => src}/kernel/crush/hash.h (100%) rename {trunk/ceph => src}/kernel/crush/mapper.c (100%) rename {trunk/ceph => src}/kernel/crush/mapper.h (100%) rename {trunk/ceph => src}/kernel/dir.c (100%) rename {trunk/ceph => src}/kernel/file.c (100%) rename {trunk/ceph => src}/kernel/inode.c (100%) rename {trunk/ceph => src}/kernel/kconfig.patch (100%) rename {trunk/ceph => src}/kernel/ktcp.c (100%) rename {trunk/ceph => src}/kernel/ktcp.h (100%) rename {trunk/ceph => src}/kernel/mds_client.c (100%) rename {trunk/ceph => src}/kernel/mds_client.h (100%) rename {trunk/ceph => src}/kernel/mdsmap.c (100%) rename {trunk/ceph => src}/kernel/mdsmap.h (100%) rename {trunk/ceph => src}/kernel/messenger.c (100%) rename {trunk/ceph => src}/kernel/messenger.h (100%) rename {trunk/ceph => src}/kernel/mon_client.c (100%) rename {trunk/ceph => src}/kernel/mon_client.h (100%) rename {trunk/ceph => src}/kernel/osd_client.c (100%) rename {trunk/ceph => src}/kernel/osd_client.h (100%) rename {trunk/ceph => src}/kernel/sample.uml.config (100%) rename {trunk/ceph => src}/kernel/super.c (100%) rename {trunk/ceph => src}/kernel/super.h (100%) rename {trunk/ceph => src}/kernel/test/Makefile (100%) rename {trunk/ceph => src}/kernel/test/kernclient.c (100%) rename {trunk/ceph => src}/kernel/test/kernserver.c (100%) rename {trunk/ceph => src}/kernel/test/ktcp.c (100%) rename {trunk/ceph => src}/kernel/test/ktcp.h (100%) rename {trunk/ceph => src}/kernel/test/messenger.h (100%) rename {trunk/ceph => src}/kernel/test/messenger_mini.c (100%) rename {trunk/ceph => src}/kernel/test/threadtest.c (100%) rename {trunk/ceph => src}/kernel/test/userclient.c (100%) rename {trunk/ceph => src}/kernel/test/userserver.c (100%) rename {branches/sage/ebofs2 => src}/mds/Anchor.h (100%) rename {branches/sage/ebofs2 => src}/mds/AnchorClient.cc (100%) rename {branches/sage/crush => src}/mds/AnchorClient.h (100%) 
rename {branches/sage/ebofs2 => src}/mds/AnchorTable.cc (100%) rename {branches/sage/crush => src}/mds/AnchorTable.h (100%) rename {branches/sage/mds => src}/mds/CDentry.cc (100%) rename {branches/sage/mds => src}/mds/CDentry.h (100%) rename {branches/sage/mds => src}/mds/CDir.cc (100%) rename {branches/sage/crush => src}/mds/CDir.h (100%) rename {branches/sage/mds => src}/mds/CInode.cc (100%) rename {branches/sage/mds => src}/mds/CInode.h (100%) rename {branches/sage/crush => src}/mds/Capability.h (100%) rename {branches/sage/crush => src}/mds/ClientMap.cc (100%) rename {branches/sage/mds => src}/mds/ClientMap.h (100%) rename {branches/sage/crush => src}/mds/FileLock.h (100%) rename {branches/sage/crush => src}/mds/IdAllocator.cc (100%) rename {branches/sage/crush => src}/mds/IdAllocator.h (100%) rename {branches/sage/crush => src}/mds/LocalLock.h (100%) rename {trunk/ceph => src}/mds/Locker.cc (100%) rename {branches/sage/crush => src}/mds/Locker.h (100%) rename {branches/sage/mds => src}/mds/LogEvent.cc (100%) rename {branches/sage/mds => src}/mds/LogEvent.h (100%) rename {branches/sage/mds => src}/mds/LogSegment.h (100%) rename {trunk/ceph => src}/mds/MDBalancer.cc (100%) rename {branches/sage/crush => src}/mds/MDBalancer.h (100%) rename {trunk/ceph => src}/mds/MDCache.cc (100%) rename {branches/sage/mds => src}/mds/MDCache.h (100%) rename {trunk/ceph => src}/mds/MDLog.cc (100%) rename {branches/sage/mds => src}/mds/MDLog.h (100%) rename {trunk/ceph => src}/mds/MDS.cc (100%) rename {trunk/ceph => src}/mds/MDS.h (100%) rename {trunk/ceph => src}/mds/MDSMap.h (100%) rename {trunk/ceph => src}/mds/Migrator.cc (100%) rename {branches/sage/mds => src}/mds/Migrator.h (100%) rename {branches/sage/crush => src}/mds/ScatterLock.h (100%) rename {trunk/ceph => src}/mds/Server.cc (100%) rename {branches/sage/mds => src}/mds/Server.h (100%) rename {branches/sage/crush => src}/mds/SimpleLock.h (100%) rename {branches/sage/crush => src}/mds/events/EAnchor.h (100%) rename 
{branches/sage/crush => src}/mds/events/EAnchorClient.h (100%) rename {branches/sage/crush => src}/mds/events/EExport.h (100%) rename {branches/sage/crush => src}/mds/events/EFragment.h (100%) rename {branches/sage/crush => src}/mds/events/EImportFinish.h (100%) rename {branches/sage/mds => src}/mds/events/EImportStart.h (100%) rename {branches/sage/crush => src}/mds/events/EMetaBlob.h (100%) rename {branches/sage/crush => src}/mds/events/EOpen.h (100%) rename {branches/sage/crush => src}/mds/events/EPurgeFinish.h (100%) rename {branches/sage/mds => src}/mds/events/ESession.h (100%) rename {branches/sage/mds => src}/mds/events/ESessions.h (100%) rename {branches/sage/crush => src}/mds/events/ESlaveUpdate.h (100%) rename {branches/sage/crush => src}/mds/events/EString.h (100%) rename {branches/sage/crush => src}/mds/events/ESubtreeMap.h (100%) rename {branches/sage/mds => src}/mds/events/EUpdate.h (100%) rename {branches/sage/mds => src}/mds/journal.cc (100%) rename {trunk/ceph => src}/mds/mdstypes.h (100%) rename {branches/sage/crush => src}/messages/MAnchor.h (100%) rename {branches/sage/crush => src}/messages/MCacheExpire.h (100%) rename {trunk/ceph => src}/messages/MClientFileCaps.h (100%) rename {trunk/ceph => src}/messages/MClientMount.h (100%) rename {trunk/ceph => src}/messages/MClientReconnect.h (100%) rename {trunk/ceph => src}/messages/MClientReply.h (100%) rename {trunk/ceph => src}/messages/MClientRequest.h (100%) rename {trunk/ceph => src}/messages/MClientRequestForward.h (100%) rename {trunk/ceph => src}/messages/MClientSession.h (100%) rename {trunk/ceph => src}/messages/MClientUnmount.h (100%) rename {branches/sage/crush => src}/messages/MDentryUnlink.h (100%) rename {branches/sage/mds => src}/messages/MDirUpdate.h (100%) rename {branches/sage/crush => src}/messages/MDiscover.h (100%) rename {branches/sage/crush => src}/messages/MDiscoverReply.h (100%) rename {branches/sage/mds => src}/messages/MExportCaps.h (100%) rename {branches/sage/mds => 
src}/messages/MExportCapsAck.h (100%) rename {branches/sage/crush => src}/messages/MExportDir.h (100%) rename {branches/sage/crush => src}/messages/MExportDirAck.h (100%) rename {branches/sage/crush => src}/messages/MExportDirCancel.h (100%) rename {branches/sage/mds => src}/messages/MExportDirDiscover.h (100%) rename {branches/sage/crush => src}/messages/MExportDirDiscoverAck.h (100%) rename {branches/sage/crush => src}/messages/MExportDirFinish.h (100%) rename {branches/sage/crush => src}/messages/MExportDirNotify.h (100%) rename {branches/sage/crush => src}/messages/MExportDirNotifyAck.h (100%) rename {branches/sage/crush => src}/messages/MExportDirPrep.h (100%) rename {branches/sage/crush => src}/messages/MExportDirPrepAck.h (100%) rename {branches/sage/crush => src}/messages/MExportDirWarning.h (100%) rename {branches/sage/crush => src}/messages/MExportDirWarningAck.h (100%) rename {branches/sage/crush => src}/messages/MGenericMessage.h (100%) rename {branches/sage/crush => src}/messages/MHeartbeat.h (100%) rename {branches/sage/crush => src}/messages/MInodeFileCaps.h (100%) rename {trunk/ceph => src}/messages/MLock.h (100%) rename {branches/sage/ebofs2 => src}/messages/MMDSBeacon.h (100%) rename {branches/sage/crush => src}/messages/MMDSBoot.h (100%) rename {branches/sage/crush => src}/messages/MMDSCacheRejoin.h (100%) rename {branches/sage/crush => src}/messages/MMDSFragmentNotify.h (100%) rename {trunk/ceph => src}/messages/MMDSGetMap.h (100%) rename {trunk/ceph => src}/messages/MMDSMap.h (100%) rename {branches/sage/crush => src}/messages/MMDSResolve.h (100%) rename {branches/sage/crush => src}/messages/MMDSResolveAck.h (100%) rename {branches/sage/mds => src}/messages/MMDSSlaveRequest.h (100%) rename {branches/sage/crush => src}/messages/MMonCommand.h (100%) rename {branches/sage/crush => src}/messages/MMonCommandAck.h (100%) rename {branches/sage/crush => src}/messages/MMonElection.h (100%) rename {branches/sage/crush => 
src}/messages/MMonElectionCollect.h (100%) rename {branches/sage/crush => src}/messages/MMonElectionRefresh.h (100%) rename {branches/sage/crush => src}/messages/MMonElectionStatus.h (100%) rename {trunk/ceph => src}/messages/MMonMap.h (100%) rename {branches/sage/crush => src}/messages/MMonOSDMapInfo.h (100%) rename {branches/sage/crush => src}/messages/MMonOSDMapLease.h (100%) rename {branches/sage/crush => src}/messages/MMonOSDMapLeaseAck.h (100%) rename {branches/sage/crush => src}/messages/MMonOSDMapUpdateAck.h (100%) rename {branches/sage/crush => src}/messages/MMonOSDMapUpdateCommit.h (100%) rename {branches/sage/crush => src}/messages/MMonOSDMapUpdatePrepare.h (100%) rename {branches/sage/crush => src}/messages/MMonPaxos.h (100%) rename {branches/sage/crush => src}/messages/MOSDBoot.h (100%) rename {branches/sage/crush => src}/messages/MOSDFailure.h (100%) rename {trunk/ceph => src}/messages/MOSDGetMap.h (100%) rename {branches/sage/crush => src}/messages/MOSDIn.h (100%) rename {trunk/ceph => src}/messages/MOSDMap.h (100%) rename {trunk/ceph => src}/messages/MOSDOp.h (100%) rename {trunk/ceph => src}/messages/MOSDOpReply.h (100%) rename {branches/sage/crush => src}/messages/MOSDOut.h (100%) rename {branches/sage/crush => src}/messages/MOSDPGActivateSet.h (100%) rename {branches/sage/crush => src}/messages/MOSDPGLog.h (100%) rename {branches/sage/crush => src}/messages/MOSDPGNotify.h (100%) rename {branches/sage/crush => src}/messages/MOSDPGPeer.h (100%) rename {branches/sage/crush => src}/messages/MOSDPGPeerAck.h (100%) rename {branches/sage/crush => src}/messages/MOSDPGPeerRequest.h (100%) rename {trunk/ceph => src}/messages/MOSDPGQuery.h (100%) rename {branches/sage/crush => src}/messages/MOSDPGRemove.h (100%) rename {branches/sage/crush => src}/messages/MOSDPGSummary.h (100%) rename {branches/sage/crush => src}/messages/MOSDPGUpdate.h (100%) rename {branches/sage/crush => src}/messages/MOSDPing.h (100%) rename {branches/sage/crush => 
src}/messages/MPGStats.h (100%) rename {trunk/ceph => src}/messages/MPing.h (100%) rename {trunk/ceph => src}/messages/MPingAck.h (100%) rename {trunk/ceph => src}/messages/MStatfs.h (100%) rename {trunk/ceph => src}/messages/MStatfsReply.h (100%) rename {trunk/ceph => src}/mkmonmap.cc (100%) rename {trunk/ceph => src}/mon/ClientMonitor.cc (100%) rename {trunk/ceph => src}/mon/ClientMonitor.h (100%) rename {trunk/ceph => src}/mon/Elector.cc (100%) rename {branches/sage/crush => src}/mon/Elector.h (100%) rename {trunk/ceph => src}/mon/MDSMonitor.cc (100%) rename {trunk/ceph => src}/mon/MDSMonitor.h (100%) rename {trunk/ceph => src}/mon/MonMap.h (100%) rename {trunk/ceph => src}/mon/Monitor.cc (100%) rename {trunk/ceph => src}/mon/Monitor.h (100%) rename {trunk/ceph => src}/mon/MonitorStore.cc (100%) rename {branches/sage/crush => src}/mon/MonitorStore.h (100%) rename {trunk/ceph => src}/mon/OSDMonitor.cc (100%) rename {trunk/ceph => src}/mon/OSDMonitor.h (100%) rename {branches/sage/crush => src}/mon/PGMap.h (100%) rename {trunk/ceph => src}/mon/PGMonitor.cc (100%) rename {branches/sage/crush => src}/mon/PGMonitor.h (100%) rename {trunk/ceph => src}/mon/Paxos.cc (100%) rename {trunk/ceph => src}/mon/Paxos.h (100%) rename {branches/sage/crush => src}/mon/PaxosService.cc (100%) rename {trunk/ceph => src}/mon/PaxosService.h (100%) rename {branches/sage/crush => src}/mon/mon_types.h (100%) rename {branches/sage/crush => src}/msg/Dispatcher.cc (100%) rename {branches/sage/crush => src}/msg/Dispatcher.h (100%) rename {trunk/ceph => src}/msg/FakeMessenger.cc (100%) rename {trunk/ceph => src}/msg/FakeMessenger.h (100%) rename {trunk/ceph => src}/msg/Message.cc (100%) rename {trunk/ceph => src}/msg/Message.h (100%) rename {branches/sage/crush => src}/msg/Messenger.cc (100%) rename {trunk/ceph => src}/msg/Messenger.h (100%) rename {trunk/ceph => src}/msg/SimpleMessenger.cc (100%) rename {trunk/ceph => src}/msg/SimpleMessenger.h (100%) rename {trunk/ceph => 
src}/msg/msg_types.h (100%) rename {branches/sage/ebofs2 => src}/msg/tcp.cc (100%) rename {trunk/ceph => src}/msg/tcp.h (100%) rename {trunk/ceph => src}/newsyn.cc (100%) rename {branches/sage/crush => src}/osbdb/OSBDB.cc (100%) rename {branches/sage/crush => src}/osbdb/OSBDB.h (100%) rename {branches/sage/crush => src}/osd/Ager.cc (100%) rename {branches/sage/crush => src}/osd/Ager.h (100%) rename {branches/sage/crush => src}/osd/BDBMap.h (100%) rename {trunk/ceph => src}/osd/Fake.h (100%) rename {trunk/ceph => src}/osd/FakeStore.cc (100%) rename {trunk/ceph => src}/osd/FakeStore.h (100%) rename {branches/sage/crush => src}/osd/FakeStoreBDBCollections.h (100%) rename {trunk/ceph => src}/osd/OSD.cc (100%) rename {trunk/ceph => src}/osd/OSD.h (100%) rename {trunk/ceph => src}/osd/OSDMap.h (100%) rename {branches/sage/crush => src}/osd/ObjectStore.cc (100%) rename {trunk/ceph => src}/osd/ObjectStore.h (100%) rename {trunk/ceph => src}/osd/PG.cc (100%) rename {trunk/ceph => src}/osd/PG.h (100%) rename {trunk/ceph => src}/osd/RAID4PG.cc (100%) rename {trunk/ceph => src}/osd/RAID4PG.h (100%) rename {trunk/ceph => src}/osd/ReplicatedPG.cc (100%) rename {trunk/ceph => src}/osd/ReplicatedPG.h (100%) rename {trunk/ceph => src}/osd/osd_types.h (100%) rename {branches/sage/crush => src}/osdc/Blinker.h (100%) rename {branches/sage/crush => src}/osdc/Filer.cc (100%) rename {branches/sage/crush => src}/osdc/Filer.h (100%) rename {branches/sage/crush => src}/osdc/Journaler.cc (100%) rename {branches/sage/mds => src}/osdc/Journaler.h (100%) rename {branches/sage/crush => src}/osdc/ObjectCacher.cc (100%) rename {branches/sage/crush => src}/osdc/ObjectCacher.h (100%) rename {trunk/ceph => src}/osdc/Objecter.cc (100%) rename {trunk/ceph => src}/osdc/Objecter.h (100%) rename {branches/sage/crush => src}/script/add_header.pl (100%) rename {branches/marnberg/quota => src}/script/adjusttabs.pl (100%) rename {branches/sage/crush => src}/script/check_cache_dumps.pl (100%) rename 
{branches/marnberg/quota => src}/script/clean_osd_cow.sh (100%) rename {branches/marnberg/quota => src}/script/clean_trace.pl (100%) rename {branches/sage/crush => src}/script/comb.pl (100%) rename {branches/sage/crush => src}/script/convert_soe_trace.pl (100%) rename {branches/sage/crush => src}/script/find_auth_pins.pl (100%) rename {branches/marnberg/quota => src}/script/find_bufferleaks.pl (100%) rename {branches/marnberg/quota => src}/script/find_lost_bdev_ops.pl (100%) rename {branches/marnberg/quota => src}/script/find_lost_commit.pl (100%) rename {branches/marnberg/quota => src}/script/find_lost_objecter.pl (100%) rename {branches/marnberg/quota => src}/script/find_pathpins.pl (100%) rename {branches/marnberg/quota => src}/script/find_requests.pl (100%) rename {branches/marnberg/quota => src}/script/find_waiters.pl (100%) rename {branches/sage/crush => src}/script/fix_modeline.pl (100%) rename {branches/sage/crush => src}/script/gprofnewsyn (100%) rename {branches/marnberg/quota => src}/script/grepblock (100%) rename {branches/sage/crush => src}/script/merge_cdfs.pl (100%) rename {branches/marnberg/quota => src}/script/merge_trace_rw.pl (100%) rename {branches/sage/crush => src}/script/plot.pl (100%) rename {branches/marnberg/quota => src}/script/profonly.pl (100%) rename {branches/sage/crush => src}/script/runjob.pl (100%) rename {branches/marnberg/quota => src}/script/runset.pl (100%) rename {branches/sage/crush => src}/script/smooth.pl (100%) rename {branches/sage/crush => src}/script/study_find.pl (100%) rename {branches/sage/crush => src}/script/study_hardlink_lifetimes.pl (100%) rename {branches/sage/crush => src}/script/study_lookups.pl (100%) rename {branches/marnberg/quota => src}/script/sum.pl (100%) rename {branches/marnberg/quota => src}/test/fakemds.cc (100%) rename {branches/sage/crush => src}/test/fg.cc (100%) rename {branches/marnberg/quota => src}/test/gprof-helper.c (100%) rename {branches/marnberg/quota => src}/test/makedirs.cc (100%) 
rename {branches/marnberg/quota => src}/test/mpitest.cc (100%) rename {branches/marnberg/quota => src}/test/mttest.cc (100%) rename {branches/marnberg/quota => src}/test/rushconfig (100%) rename {branches/marnberg/quota => src}/test/rushtest.cc (100%) rename {branches/marnberg/quota => src}/test/rushtest.cc~ (100%) rename {branches/sage/crush => src}/test/test_disk_bw.cc (100%) rename {trunk/ceph => src}/test/test_seek_read.c (100%) rename {trunk/ceph => src}/test/test_short_seek_read.c (100%) rename {branches/marnberg/quota => src}/test/testbucket.cc (100%) rename {branches/marnberg/quota => src}/test/testbuffers.cc (100%) rename {branches/sage/crush => src}/test/testcounter.cc (100%) rename {branches/marnberg/quota => src}/test/testcrush.cc (100%) rename {branches/marnberg/quota => src}/test/testfilepath.cc (100%) rename {branches/marnberg/quota => src}/test/testmpi.cc (100%) rename {branches/marnberg/quota => src}/test/testnewbuffers.cc (100%) rename {branches/marnberg/quota => src}/test/testos.cc (100%) rename {branches/marnberg/quota => src}/test/testosbdb.cc (100%) rename {branches/marnberg/quota => src}/test/testtree.cc (100%) rename {branches/marnberg/quota => src}/test/testxattr.cc (100%) rename {trunk/ceph => src}/valgrind.supp (100%) delete mode 100644 tags/20070517_before_mds_merge/COPYING delete mode 100644 tags/20070517_before_mds_merge/Makefile delete mode 100644 tags/20070517_before_mds_merge/README delete mode 100644 tags/20070517_before_mds_merge/TODO delete mode 100644 tags/20070517_before_mds_merge/cfuse.cc delete mode 100644 tags/20070517_before_mds_merge/client/Client.cc delete mode 100644 tags/20070517_before_mds_merge/client/Client.h delete mode 100644 tags/20070517_before_mds_merge/client/FileCache.cc delete mode 100644 tags/20070517_before_mds_merge/client/FileCache.h delete mode 100644 tags/20070517_before_mds_merge/client/SyntheticClient.cc delete mode 100644 tags/20070517_before_mds_merge/client/SyntheticClient.h delete mode 100644 
tags/20070517_before_mds_merge/client/Trace.cc delete mode 100644 tags/20070517_before_mds_merge/client/Trace.h delete mode 100644 tags/20070517_before_mds_merge/client/fuse.cc delete mode 100644 tags/20070517_before_mds_merge/client/fuse.h delete mode 100644 tags/20070517_before_mds_merge/client/hadoop/CephFSInterface.cc delete mode 100644 tags/20070517_before_mds_merge/client/hadoop/CephFSInterface.h delete mode 100644 tags/20070517_before_mds_merge/client/ldceph.cc delete mode 100644 tags/20070517_before_mds_merge/client/msgthread.h delete mode 100644 tags/20070517_before_mds_merge/cmds.cc delete mode 100644 tags/20070517_before_mds_merge/cmon.cc delete mode 100644 tags/20070517_before_mds_merge/common/Clock.cc delete mode 100644 tags/20070517_before_mds_merge/common/Clock.h delete mode 100644 tags/20070517_before_mds_merge/common/Cond.h delete mode 100644 tags/20070517_before_mds_merge/common/DecayCounter.h delete mode 100644 tags/20070517_before_mds_merge/common/LogType.h delete mode 100644 tags/20070517_before_mds_merge/common/Logger.cc delete mode 100644 tags/20070517_before_mds_merge/common/Logger.h delete mode 100755 tags/20070517_before_mds_merge/common/Mutex.h delete mode 100644 tags/20070517_before_mds_merge/common/Semaphore.h delete mode 100644 tags/20070517_before_mds_merge/common/Thread.h delete mode 100644 tags/20070517_before_mds_merge/common/ThreadPool.h delete mode 100644 tags/20070517_before_mds_merge/common/Timer.cc delete mode 100644 tags/20070517_before_mds_merge/common/Timer.h delete mode 100644 tags/20070517_before_mds_merge/config.cc delete mode 100644 tags/20070517_before_mds_merge/config.h delete mode 100644 tags/20070517_before_mds_merge/cosd.cc delete mode 100644 tags/20070517_before_mds_merge/crush/BinaryTree.h delete mode 100644 tags/20070517_before_mds_merge/crush/Bucket.h delete mode 100644 tags/20070517_before_mds_merge/crush/Hash.h delete mode 100644 tags/20070517_before_mds_merge/crush/crush.h delete mode 100644 
tags/20070517_before_mds_merge/crush/test/bucket_movement.cc delete mode 100644 tags/20070517_before_mds_merge/crush/test/bucket_variance.cc delete mode 100644 tags/20070517_before_mds_merge/crush/test/cluster_movement.cc delete mode 100644 tags/20070517_before_mds_merge/crush/test/cluster_movement_remove.cc delete mode 100644 tags/20070517_before_mds_merge/crush/test/cluster_movement_rush.cc delete mode 100644 tags/20070517_before_mds_merge/crush/test/creeping_failure.cc delete mode 100644 tags/20070517_before_mds_merge/crush/test/creeping_failure_variance.cc delete mode 100644 tags/20070517_before_mds_merge/crush/test/depth_variance.cc delete mode 100644 tags/20070517_before_mds_merge/crush/test/mixed.cc delete mode 100644 tags/20070517_before_mds_merge/crush/test/movement.cc delete mode 100644 tags/20070517_before_mds_merge/crush/test/movement_failed.cc delete mode 100644 tags/20070517_before_mds_merge/crush/test/overload.cc delete mode 100644 tags/20070517_before_mds_merge/crush/test/overload_variance.cc delete mode 100644 tags/20070517_before_mds_merge/crush/test/sizes.cc delete mode 100644 tags/20070517_before_mds_merge/crush/test/smallbucket.cc delete mode 100644 tags/20070517_before_mds_merge/crush/test/speed_bucket.cc delete mode 100644 tags/20070517_before_mds_merge/crush/test/speed_depth.cc delete mode 100644 tags/20070517_before_mds_merge/crush/test/speed_rush.cc delete mode 100644 tags/20070517_before_mds_merge/crush/test/t.cc delete mode 100644 tags/20070517_before_mds_merge/crush/test/testbucket.cc delete mode 100644 tags/20070517_before_mds_merge/crush/test/testnormal.cc delete mode 100644 tags/20070517_before_mds_merge/csyn.cc delete mode 100644 tags/20070517_before_mds_merge/doc/Commitdir.txt delete mode 100644 tags/20070517_before_mds_merge/doc/Replication.txt delete mode 100644 tags/20070517_before_mds_merge/doc/bdb.txt delete mode 100644 tags/20070517_before_mds_merge/doc/caching.txt delete mode 100644 
tags/20070517_before_mds_merge/doc/dentries.txt delete mode 100644 tags/20070517_before_mds_merge/doc/file_modes.txt delete mode 100644 tags/20070517_before_mds_merge/doc/header.txt delete mode 100644 tags/20070517_before_mds_merge/doc/inos.txt delete mode 100644 tags/20070517_before_mds_merge/doc/journal.txt delete mode 100644 tags/20070517_before_mds_merge/doc/lazy_posix.txt delete mode 100644 tags/20070517_before_mds_merge/doc/osd_outline.txt delete mode 100644 tags/20070517_before_mds_merge/doc/osd_replication.txt delete mode 100644 tags/20070517_before_mds_merge/doc/performance.txt delete mode 100644 tags/20070517_before_mds_merge/doc/shared_write_states_nogo.txt delete mode 100644 tags/20070517_before_mds_merge/doc/shutdown.txt delete mode 100644 tags/20070517_before_mds_merge/ebofs/Allocator.cc delete mode 100644 tags/20070517_before_mds_merge/ebofs/Allocator.h delete mode 100644 tags/20070517_before_mds_merge/ebofs/BlockDevice.cc delete mode 100644 tags/20070517_before_mds_merge/ebofs/BlockDevice.h delete mode 100644 tags/20070517_before_mds_merge/ebofs/BufferCache.cc delete mode 100644 tags/20070517_before_mds_merge/ebofs/BufferCache.h delete mode 100644 tags/20070517_before_mds_merge/ebofs/Cnode.h delete mode 100644 tags/20070517_before_mds_merge/ebofs/Ebofs.cc delete mode 100644 tags/20070517_before_mds_merge/ebofs/Ebofs.h delete mode 100644 tags/20070517_before_mds_merge/ebofs/Onode.h delete mode 100644 tags/20070517_before_mds_merge/ebofs/Table.h delete mode 100644 tags/20070517_before_mds_merge/ebofs/mkfs.ebofs.cc delete mode 100644 tags/20070517_before_mds_merge/ebofs/nodes.h delete mode 100644 tags/20070517_before_mds_merge/ebofs/test.ebofs.cc delete mode 100644 tags/20070517_before_mds_merge/ebofs/types.h delete mode 100644 tags/20070517_before_mds_merge/fakefuse.cc delete mode 100644 tags/20070517_before_mds_merge/fakesyn.cc delete mode 100644 tags/20070517_before_mds_merge/include/Context.h delete mode 100644 
tags/20070517_before_mds_merge/include/Distribution.h delete mode 100644 tags/20070517_before_mds_merge/include/buffer.h delete mode 100644 tags/20070517_before_mds_merge/include/error.h delete mode 100644 tags/20070517_before_mds_merge/include/filepath.h delete mode 100644 tags/20070517_before_mds_merge/include/interval_set.h delete mode 100644 tags/20070517_before_mds_merge/include/lru.h delete mode 100644 tags/20070517_before_mds_merge/include/object.h delete mode 100644 tags/20070517_before_mds_merge/include/oldbuffer.h delete mode 100644 tags/20070517_before_mds_merge/include/oldbufferlist.h delete mode 100644 tags/20070517_before_mds_merge/include/rangeset.h delete mode 100644 tags/20070517_before_mds_merge/include/reqid.h delete mode 100644 tags/20070517_before_mds_merge/include/statlite.h delete mode 100644 tags/20070517_before_mds_merge/include/types.h delete mode 100644 tags/20070517_before_mds_merge/include/uofs.h delete mode 100644 tags/20070517_before_mds_merge/jobs/alc.tp delete mode 100644 tags/20070517_before_mds_merge/jobs/alcdat/makedirs delete mode 100644 tags/20070517_before_mds_merge/jobs/alcdat/makedirs.big delete mode 100644 tags/20070517_before_mds_merge/jobs/alcdat/makedirs.tput delete mode 100644 tags/20070517_before_mds_merge/jobs/alcdat/makefiles.shared delete mode 100644 tags/20070517_before_mds_merge/jobs/alcdat/openshared delete mode 100644 tags/20070517_before_mds_merge/jobs/alcdat/ossh.include delete mode 100644 tags/20070517_before_mds_merge/jobs/alcdat/ossh.include.big delete mode 100644 tags/20070517_before_mds_merge/jobs/alcdat/ossh.lib delete mode 100644 tags/20070517_before_mds_merge/jobs/alcdat/ossh.lib.big delete mode 100644 tags/20070517_before_mds_merge/jobs/alcdat/striping delete mode 100644 tags/20070517_before_mds_merge/jobs/example delete mode 100644 tags/20070517_before_mds_merge/jobs/mds/log_striping delete mode 100644 tags/20070517_before_mds_merge/jobs/mds/makedir_lat delete mode 100644 
tags/20070517_before_mds_merge/jobs/mds/makedirs delete mode 100644 tags/20070517_before_mds_merge/jobs/mds/opensshlib delete mode 100644 tags/20070517_before_mds_merge/jobs/meta1 delete mode 100755 tags/20070517_before_mds_merge/jobs/meta1.proc.sh delete mode 100644 tags/20070517_before_mds_merge/jobs/osd/ebofs delete mode 100644 tags/20070517_before_mds_merge/jobs/osd/mds_log delete mode 100644 tags/20070517_before_mds_merge/jobs/osd/osd_threads delete mode 100644 tags/20070517_before_mds_merge/jobs/osd/striping delete mode 100644 tags/20070517_before_mds_merge/jobs/osd/wr_lat2 delete mode 100644 tags/20070517_before_mds_merge/jobs/osd/write_sizes delete mode 100644 tags/20070517_before_mds_merge/jobs/rados/map_dist delete mode 100644 tags/20070517_before_mds_merge/jobs/rados/rep_lat delete mode 100644 tags/20070517_before_mds_merge/jobs/rados/wr_sizes delete mode 100644 tags/20070517_before_mds_merge/mds/Anchor.h delete mode 100644 tags/20070517_before_mds_merge/mds/AnchorClient.cc delete mode 100644 tags/20070517_before_mds_merge/mds/AnchorClient.h delete mode 100644 tags/20070517_before_mds_merge/mds/AnchorTable.cc delete mode 100644 tags/20070517_before_mds_merge/mds/AnchorTable.h delete mode 100644 tags/20070517_before_mds_merge/mds/CDentry.cc delete mode 100644 tags/20070517_before_mds_merge/mds/CDentry.h delete mode 100644 tags/20070517_before_mds_merge/mds/CDir.cc delete mode 100644 tags/20070517_before_mds_merge/mds/CDir.h delete mode 100644 tags/20070517_before_mds_merge/mds/CInode.cc delete mode 100644 tags/20070517_before_mds_merge/mds/CInode.h delete mode 100644 tags/20070517_before_mds_merge/mds/Capability.h delete mode 100644 tags/20070517_before_mds_merge/mds/ClientMap.h delete mode 100644 tags/20070517_before_mds_merge/mds/IdAllocator.cc delete mode 100644 tags/20070517_before_mds_merge/mds/IdAllocator.h delete mode 100644 tags/20070517_before_mds_merge/mds/Lock.h delete mode 100644 tags/20070517_before_mds_merge/mds/Locker.cc delete mode 100644 
tags/20070517_before_mds_merge/mds/Locker.h delete mode 100644 tags/20070517_before_mds_merge/mds/LogEvent.cc delete mode 100644 tags/20070517_before_mds_merge/mds/LogEvent.h delete mode 100644 tags/20070517_before_mds_merge/mds/MDBalancer.cc delete mode 100644 tags/20070517_before_mds_merge/mds/MDBalancer.h delete mode 100644 tags/20070517_before_mds_merge/mds/MDCache.cc delete mode 100644 tags/20070517_before_mds_merge/mds/MDCache.h delete mode 100644 tags/20070517_before_mds_merge/mds/MDLog.cc delete mode 100644 tags/20070517_before_mds_merge/mds/MDLog.h delete mode 100644 tags/20070517_before_mds_merge/mds/MDS.cc delete mode 100644 tags/20070517_before_mds_merge/mds/MDS.h delete mode 100644 tags/20070517_before_mds_merge/mds/MDSMap.h delete mode 100644 tags/20070517_before_mds_merge/mds/MDStore.cc delete mode 100644 tags/20070517_before_mds_merge/mds/MDStore.h delete mode 100644 tags/20070517_before_mds_merge/mds/Migrator.cc delete mode 100644 tags/20070517_before_mds_merge/mds/Migrator.h delete mode 100644 tags/20070517_before_mds_merge/mds/Renamer.cc delete mode 100644 tags/20070517_before_mds_merge/mds/Renamer.h delete mode 100644 tags/20070517_before_mds_merge/mds/Server.cc delete mode 100644 tags/20070517_before_mds_merge/mds/Server.h delete mode 100644 tags/20070517_before_mds_merge/mds/events/EAlloc.h delete mode 100644 tags/20070517_before_mds_merge/mds/events/EExportFinish.h delete mode 100644 tags/20070517_before_mds_merge/mds/events/EExportStart.h delete mode 100644 tags/20070517_before_mds_merge/mds/events/EImportFinish.h delete mode 100644 tags/20070517_before_mds_merge/mds/events/EImportMap.h delete mode 100644 tags/20070517_before_mds_merge/mds/events/EImportStart.h delete mode 100644 tags/20070517_before_mds_merge/mds/events/EMetaBlob.h delete mode 100644 tags/20070517_before_mds_merge/mds/events/EPurgeFinish.h delete mode 100644 tags/20070517_before_mds_merge/mds/events/EString.h delete mode 100644 
tags/20070517_before_mds_merge/mds/events/EUnlink.h delete mode 100644 tags/20070517_before_mds_merge/mds/events/EUpdate.h delete mode 100644 tags/20070517_before_mds_merge/mds/journal.cc delete mode 100644 tags/20070517_before_mds_merge/mds/mdstypes.h delete mode 100644 tags/20070517_before_mds_merge/messages/MAnchorReply.h delete mode 100644 tags/20070517_before_mds_merge/messages/MAnchorRequest.h delete mode 100644 tags/20070517_before_mds_merge/messages/MCacheExpire.h delete mode 100644 tags/20070517_before_mds_merge/messages/MClientBoot.h delete mode 100644 tags/20070517_before_mds_merge/messages/MClientFileCaps.h delete mode 100644 tags/20070517_before_mds_merge/messages/MClientInodeAuthUpdate.h delete mode 100644 tags/20070517_before_mds_merge/messages/MClientMount.h delete mode 100644 tags/20070517_before_mds_merge/messages/MClientMountAck.h delete mode 100644 tags/20070517_before_mds_merge/messages/MClientReply.h delete mode 100644 tags/20070517_before_mds_merge/messages/MClientRequest.h delete mode 100644 tags/20070517_before_mds_merge/messages/MDentryUnlink.h delete mode 100644 tags/20070517_before_mds_merge/messages/MDirExpire.h delete mode 100644 tags/20070517_before_mds_merge/messages/MDirExpireReq.h delete mode 100644 tags/20070517_before_mds_merge/messages/MDirUpdate.h delete mode 100644 tags/20070517_before_mds_merge/messages/MDiscover.h delete mode 100644 tags/20070517_before_mds_merge/messages/MDiscoverReply.h delete mode 100644 tags/20070517_before_mds_merge/messages/MExportDir.h delete mode 100644 tags/20070517_before_mds_merge/messages/MExportDirAck.h delete mode 100644 tags/20070517_before_mds_merge/messages/MExportDirDiscover.h delete mode 100644 tags/20070517_before_mds_merge/messages/MExportDirDiscoverAck.h delete mode 100644 tags/20070517_before_mds_merge/messages/MExportDirFinish.h delete mode 100644 tags/20070517_before_mds_merge/messages/MExportDirNotify.h delete mode 100644 tags/20070517_before_mds_merge/messages/MExportDirNotifyAck.h 
delete mode 100644 tags/20070517_before_mds_merge/messages/MExportDirPrep.h delete mode 100644 tags/20070517_before_mds_merge/messages/MExportDirPrepAck.h delete mode 100644 tags/20070517_before_mds_merge/messages/MExportDirWarning.h delete mode 100644 tags/20070517_before_mds_merge/messages/MFailure.h delete mode 100644 tags/20070517_before_mds_merge/messages/MFailureAck.h delete mode 100644 tags/20070517_before_mds_merge/messages/MGenericMessage.h delete mode 100644 tags/20070517_before_mds_merge/messages/MHashDir.h delete mode 100644 tags/20070517_before_mds_merge/messages/MHashDirAck.h delete mode 100644 tags/20070517_before_mds_merge/messages/MHashDirDiscover.h delete mode 100644 tags/20070517_before_mds_merge/messages/MHashDirDiscoverAck.h delete mode 100644 tags/20070517_before_mds_merge/messages/MHashDirNotify.h delete mode 100644 tags/20070517_before_mds_merge/messages/MHashDirPrep.h delete mode 100644 tags/20070517_before_mds_merge/messages/MHashDirPrepAck.h delete mode 100644 tags/20070517_before_mds_merge/messages/MHashReaddir.h delete mode 100644 tags/20070517_before_mds_merge/messages/MHashReaddirReply.h delete mode 100644 tags/20070517_before_mds_merge/messages/MHeartbeat.h delete mode 100644 tags/20070517_before_mds_merge/messages/MInodeExpire.h delete mode 100644 tags/20070517_before_mds_merge/messages/MInodeFileCaps.h delete mode 100644 tags/20070517_before_mds_merge/messages/MInodeLink.h delete mode 100644 tags/20070517_before_mds_merge/messages/MInodeLinkAck.h delete mode 100644 tags/20070517_before_mds_merge/messages/MInodeUnlink.h delete mode 100644 tags/20070517_before_mds_merge/messages/MInodeUnlinkAck.h delete mode 100644 tags/20070517_before_mds_merge/messages/MInodeUpdate.h delete mode 100644 tags/20070517_before_mds_merge/messages/MLock.h delete mode 100644 tags/20070517_before_mds_merge/messages/MMDSBeacon.h delete mode 100644 tags/20070517_before_mds_merge/messages/MMDSBoot.h delete mode 100644 
tags/20070517_before_mds_merge/messages/MMDSCacheRejoin.h delete mode 100644 tags/20070517_before_mds_merge/messages/MMDSCacheRejoinAck.h delete mode 100644 tags/20070517_before_mds_merge/messages/MMDSGetMap.h delete mode 100644 tags/20070517_before_mds_merge/messages/MMDSImportMap.h delete mode 100644 tags/20070517_before_mds_merge/messages/MMDSMap.h delete mode 100644 tags/20070517_before_mds_merge/messages/MMonElectionAck.h delete mode 100644 tags/20070517_before_mds_merge/messages/MMonElectionCollect.h delete mode 100644 tags/20070517_before_mds_merge/messages/MMonElectionPropose.h delete mode 100644 tags/20070517_before_mds_merge/messages/MMonElectionRefresh.h delete mode 100644 tags/20070517_before_mds_merge/messages/MMonElectionStatus.h delete mode 100644 tags/20070517_before_mds_merge/messages/MMonElectionVictory.h delete mode 100644 tags/20070517_before_mds_merge/messages/MMonOSDMapInfo.h delete mode 100644 tags/20070517_before_mds_merge/messages/MMonOSDMapLease.h delete mode 100644 tags/20070517_before_mds_merge/messages/MMonOSDMapLeaseAck.h delete mode 100644 tags/20070517_before_mds_merge/messages/MMonOSDMapUpdateAck.h delete mode 100644 tags/20070517_before_mds_merge/messages/MMonOSDMapUpdateCommit.h delete mode 100644 tags/20070517_before_mds_merge/messages/MMonOSDMapUpdatePrepare.h delete mode 100644 tags/20070517_before_mds_merge/messages/MMonPaxos.h delete mode 100644 tags/20070517_before_mds_merge/messages/MNSConnect.h delete mode 100644 tags/20070517_before_mds_merge/messages/MNSConnectAck.h delete mode 100644 tags/20070517_before_mds_merge/messages/MNSFailure.h delete mode 100644 tags/20070517_before_mds_merge/messages/MNSLookup.h delete mode 100644 tags/20070517_before_mds_merge/messages/MNSLookupReply.h delete mode 100644 tags/20070517_before_mds_merge/messages/MNSRegister.h delete mode 100644 tags/20070517_before_mds_merge/messages/MNSRegisterAck.h delete mode 100644 tags/20070517_before_mds_merge/messages/MOSDBoot.h delete mode 100644 
tags/20070517_before_mds_merge/messages/MOSDFailure.h delete mode 100644 tags/20070517_before_mds_merge/messages/MOSDGetMap.h delete mode 100644 tags/20070517_before_mds_merge/messages/MOSDIn.h delete mode 100644 tags/20070517_before_mds_merge/messages/MOSDMap.h delete mode 100644 tags/20070517_before_mds_merge/messages/MOSDOp.h delete mode 100644 tags/20070517_before_mds_merge/messages/MOSDOpReply.h delete mode 100644 tags/20070517_before_mds_merge/messages/MOSDOut.h delete mode 100644 tags/20070517_before_mds_merge/messages/MOSDPGLog.h delete mode 100644 tags/20070517_before_mds_merge/messages/MOSDPGNotify.h delete mode 100644 tags/20070517_before_mds_merge/messages/MOSDPGPeer.h delete mode 100644 tags/20070517_before_mds_merge/messages/MOSDPGPeerAck.h delete mode 100644 tags/20070517_before_mds_merge/messages/MOSDPGPeerRequest.h delete mode 100644 tags/20070517_before_mds_merge/messages/MOSDPGQuery.h delete mode 100644 tags/20070517_before_mds_merge/messages/MOSDPGRemove.h delete mode 100644 tags/20070517_before_mds_merge/messages/MOSDPGSummary.h delete mode 100644 tags/20070517_before_mds_merge/messages/MOSDPGUpdate.h delete mode 100644 tags/20070517_before_mds_merge/messages/MOSDPing.h delete mode 100644 tags/20070517_before_mds_merge/messages/MPing.h delete mode 100644 tags/20070517_before_mds_merge/messages/MPingAck.h delete mode 100644 tags/20070517_before_mds_merge/messages/MRename.h delete mode 100644 tags/20070517_before_mds_merge/messages/MRenameAck.h delete mode 100644 tags/20070517_before_mds_merge/messages/MRenameNotify.h delete mode 100644 tags/20070517_before_mds_merge/messages/MRenameNotifyAck.h delete mode 100644 tags/20070517_before_mds_merge/messages/MRenamePrep.h delete mode 100644 tags/20070517_before_mds_merge/messages/MRenameReq.h delete mode 100644 tags/20070517_before_mds_merge/messages/MRenameWarning.h delete mode 100644 tags/20070517_before_mds_merge/messages/MUnhashDir.h delete mode 100644 
tags/20070517_before_mds_merge/messages/MUnhashDirAck.h delete mode 100644 tags/20070517_before_mds_merge/messages/MUnhashDirNotify.h delete mode 100644 tags/20070517_before_mds_merge/messages/MUnhashDirNotifyAck.h delete mode 100644 tags/20070517_before_mds_merge/messages/MUnhashDirPrep.h delete mode 100644 tags/20070517_before_mds_merge/messages/MUnhashDirPrepAck.h delete mode 100644 tags/20070517_before_mds_merge/mkmonmap.cc delete mode 100644 tags/20070517_before_mds_merge/mon/ClientMonitor.cc delete mode 100644 tags/20070517_before_mds_merge/mon/ClientMonitor.h delete mode 100644 tags/20070517_before_mds_merge/mon/Elector.cc delete mode 100644 tags/20070517_before_mds_merge/mon/Elector.h delete mode 100644 tags/20070517_before_mds_merge/mon/MDSMonitor.cc delete mode 100644 tags/20070517_before_mds_merge/mon/MDSMonitor.h delete mode 100644 tags/20070517_before_mds_merge/mon/MonMap.h delete mode 100644 tags/20070517_before_mds_merge/mon/Monitor.cc delete mode 100644 tags/20070517_before_mds_merge/mon/Monitor.h delete mode 100644 tags/20070517_before_mds_merge/mon/MonitorStore.cc delete mode 100644 tags/20070517_before_mds_merge/mon/MonitorStore.h delete mode 100644 tags/20070517_before_mds_merge/mon/OSDMonitor.cc delete mode 100644 tags/20070517_before_mds_merge/mon/OSDMonitor.h delete mode 100644 tags/20070517_before_mds_merge/mon/Paxos.cc delete mode 100644 tags/20070517_before_mds_merge/mon/Paxos.h delete mode 100644 tags/20070517_before_mds_merge/msg/Dispatcher.cc delete mode 100644 tags/20070517_before_mds_merge/msg/Dispatcher.h delete mode 100644 tags/20070517_before_mds_merge/msg/FakeMessenger.cc delete mode 100644 tags/20070517_before_mds_merge/msg/FakeMessenger.h delete mode 100644 tags/20070517_before_mds_merge/msg/HostMonitor.cc delete mode 100644 tags/20070517_before_mds_merge/msg/HostMonitor.h delete mode 100644 tags/20070517_before_mds_merge/msg/MPIMessenger.cc delete mode 100644 tags/20070517_before_mds_merge/msg/MPIMessenger.h delete mode 100644 
tags/20070517_before_mds_merge/msg/MTMessenger.cc delete mode 100644 tags/20070517_before_mds_merge/msg/MTMessenger.h delete mode 100644 tags/20070517_before_mds_merge/msg/Message.cc delete mode 100644 tags/20070517_before_mds_merge/msg/Message.h delete mode 100644 tags/20070517_before_mds_merge/msg/Messenger.cc delete mode 100644 tags/20070517_before_mds_merge/msg/Messenger.h delete mode 100644 tags/20070517_before_mds_merge/msg/NewMessenger.cc delete mode 100644 tags/20070517_before_mds_merge/msg/NewMessenger.h delete mode 100644 tags/20070517_before_mds_merge/msg/NewerMessenger.cc delete mode 100644 tags/20070517_before_mds_merge/msg/NewerMessenger.h delete mode 100644 tags/20070517_before_mds_merge/msg/RWLock.h delete mode 100644 tags/20070517_before_mds_merge/msg/SerialMessenger.h delete mode 100644 tags/20070517_before_mds_merge/msg/SimpleMessenger.cc delete mode 100644 tags/20070517_before_mds_merge/msg/SimpleMessenger.h delete mode 100644 tags/20070517_before_mds_merge/msg/TCPDirectory.cc delete mode 100644 tags/20070517_before_mds_merge/msg/TCPDirectory.h delete mode 100644 tags/20070517_before_mds_merge/msg/TCPMessenger.cc delete mode 100644 tags/20070517_before_mds_merge/msg/TCPMessenger.h delete mode 100644 tags/20070517_before_mds_merge/msg/error.c delete mode 100644 tags/20070517_before_mds_merge/msg/mpistarter.cc delete mode 100644 tags/20070517_before_mds_merge/msg/msg_types.h delete mode 100644 tags/20070517_before_mds_merge/msg/new_mpistarter.cc delete mode 100644 tags/20070517_before_mds_merge/msg/tcp.cc delete mode 100644 tags/20070517_before_mds_merge/msg/tcp.h delete mode 100644 tags/20070517_before_mds_merge/newsyn.cc delete mode 100644 tags/20070517_before_mds_merge/osbdb/OSBDB.cc delete mode 100644 tags/20070517_before_mds_merge/osbdb/OSBDB.h delete mode 100644 tags/20070517_before_mds_merge/osd/Ager.cc delete mode 100644 tags/20070517_before_mds_merge/osd/Ager.h delete mode 100644 tags/20070517_before_mds_merge/osd/BDBMap.h delete mode 
100644 tags/20070517_before_mds_merge/osd/Fake.h delete mode 100644 tags/20070517_before_mds_merge/osd/FakeStore.cc delete mode 100644 tags/20070517_before_mds_merge/osd/FakeStore.h delete mode 100644 tags/20070517_before_mds_merge/osd/FakeStoreBDBCollections.h delete mode 100644 tags/20070517_before_mds_merge/osd/OBFSStore.cc delete mode 100644 tags/20070517_before_mds_merge/osd/OBFSStore.h delete mode 100644 tags/20070517_before_mds_merge/osd/OSD.cc delete mode 100644 tags/20070517_before_mds_merge/osd/OSD.h delete mode 100644 tags/20070517_before_mds_merge/osd/OSDMap.h delete mode 100644 tags/20070517_before_mds_merge/osd/ObjectStore.cc delete mode 100644 tags/20070517_before_mds_merge/osd/ObjectStore.h delete mode 100644 tags/20070517_before_mds_merge/osd/PG.cc delete mode 100644 tags/20070517_before_mds_merge/osd/PG.h delete mode 100644 tags/20070517_before_mds_merge/osd/osd_types.h delete mode 100644 tags/20070517_before_mds_merge/osd/rush.cc delete mode 100644 tags/20070517_before_mds_merge/osd/rush.h delete mode 100644 tags/20070517_before_mds_merge/osd/tp.cc delete mode 100644 tags/20070517_before_mds_merge/osdc/Blinker.h delete mode 100644 tags/20070517_before_mds_merge/osdc/Filer.cc delete mode 100644 tags/20070517_before_mds_merge/osdc/Filer.h delete mode 100644 tags/20070517_before_mds_merge/osdc/Journaler.cc delete mode 100644 tags/20070517_before_mds_merge/osdc/Journaler.h delete mode 100644 tags/20070517_before_mds_merge/osdc/ObjectCacher.cc delete mode 100644 tags/20070517_before_mds_merge/osdc/ObjectCacher.h delete mode 100644 tags/20070517_before_mds_merge/osdc/Objecter.cc delete mode 100644 tags/20070517_before_mds_merge/osdc/Objecter.h delete mode 100755 tags/20070517_before_mds_merge/script/add_header.pl delete mode 100755 tags/20070517_before_mds_merge/script/adjusttabs.pl delete mode 100755 tags/20070517_before_mds_merge/script/clean_osd_cow.sh delete mode 100755 tags/20070517_before_mds_merge/script/clean_trace.pl delete mode 100755 
tags/20070517_before_mds_merge/script/comb.pl delete mode 100755 tags/20070517_before_mds_merge/script/find_auth_pins.pl delete mode 100755 tags/20070517_before_mds_merge/script/find_bufferleaks.pl delete mode 100755 tags/20070517_before_mds_merge/script/find_lost_bdev_ops.pl delete mode 100755 tags/20070517_before_mds_merge/script/find_lost_commit.pl delete mode 100755 tags/20070517_before_mds_merge/script/find_lost_objecter.pl delete mode 100755 tags/20070517_before_mds_merge/script/find_pathpins.pl delete mode 100755 tags/20070517_before_mds_merge/script/find_requests.pl delete mode 100755 tags/20070517_before_mds_merge/script/find_waiters.pl delete mode 100755 tags/20070517_before_mds_merge/script/grepblock delete mode 100644 tags/20070517_before_mds_merge/script/merge_trace_rw.pl delete mode 100755 tags/20070517_before_mds_merge/script/profonly.pl delete mode 100755 tags/20070517_before_mds_merge/script/runset.pl delete mode 100755 tags/20070517_before_mds_merge/script/sum.pl delete mode 100644 tags/20070517_before_mds_merge/test/fakemds.cc delete mode 100644 tags/20070517_before_mds_merge/test/gprof-helper.c delete mode 100644 tags/20070517_before_mds_merge/test/makedirs.cc delete mode 100644 tags/20070517_before_mds_merge/test/mpitest.cc delete mode 100644 tags/20070517_before_mds_merge/test/mttest.cc delete mode 100644 tags/20070517_before_mds_merge/test/rushconfig delete mode 100644 tags/20070517_before_mds_merge/test/rushtest.cc delete mode 100644 tags/20070517_before_mds_merge/test/rushtest.cc~ delete mode 100644 tags/20070517_before_mds_merge/test/testbucket.cc delete mode 100644 tags/20070517_before_mds_merge/test/testbuffers.cc delete mode 100644 tags/20070517_before_mds_merge/test/testcrush.cc delete mode 100644 tags/20070517_before_mds_merge/test/testfilepath.cc delete mode 100644 tags/20070517_before_mds_merge/test/testmpi.cc delete mode 100644 tags/20070517_before_mds_merge/test/testnewbuffers.cc delete mode 100644 
tags/20070517_before_mds_merge/test/testos.cc delete mode 100644 tags/20070517_before_mds_merge/test/testosbdb.cc delete mode 100644 tags/20070517_before_mds_merge/test/testtree.cc delete mode 100644 tags/20070517_before_mds_merge/test/testxattr.cc delete mode 100644 tags/20070517_before_mds_merge/valgrind.supp delete mode 100644 trunk/ceph/COPYING delete mode 100644 trunk/ceph/README delete mode 100644 trunk/ceph/active/activemaster.cc delete mode 100644 trunk/ceph/active/activemaster.h delete mode 100644 trunk/ceph/active/activetaskd.cc delete mode 100644 trunk/ceph/active/activetaskd.h delete mode 100644 trunk/ceph/active/client_init.cc delete mode 100644 trunk/ceph/active/client_init.h delete mode 100644 trunk/ceph/active/echotestclient.cc delete mode 100644 trunk/ceph/active/echotestclient.h delete mode 100644 trunk/ceph/active/inet.h delete mode 100644 trunk/ceph/active/trivial_task.cc delete mode 100644 trunk/ceph/active/trivial_task.h delete mode 100644 trunk/ceph/cfuse.cc delete mode 100644 trunk/ceph/client/FileCache.cc delete mode 100644 trunk/ceph/client/FileCache.h delete mode 100644 trunk/ceph/client/Trace.cc delete mode 100644 trunk/ceph/client/Trace.h delete mode 100644 trunk/ceph/client/fuse.cc delete mode 100644 trunk/ceph/client/fuse.h delete mode 100644 trunk/ceph/client/fuse_ll.cc delete mode 100644 trunk/ceph/client/fuse_ll.h delete mode 100644 trunk/ceph/client/hadoop/CephFSInterface.cc delete mode 100644 trunk/ceph/client/hadoop/CephFSInterface.h delete mode 100644 trunk/ceph/client/ldceph.cc delete mode 100644 trunk/ceph/cmds.cc delete mode 100644 trunk/ceph/cmon.cc delete mode 100644 trunk/ceph/cmonctl.cc delete mode 100644 trunk/ceph/common/Clock.cc delete mode 100644 trunk/ceph/common/Cond.h delete mode 100644 trunk/ceph/common/DecayCounter.h delete mode 100644 trunk/ceph/common/LogType.h delete mode 100644 trunk/ceph/common/Logger.h delete mode 100755 trunk/ceph/common/Mutex.h delete mode 100644 trunk/ceph/common/RWLock.h delete mode 
100644 trunk/ceph/common/Semaphore.h delete mode 100644 trunk/ceph/common/Thread.h delete mode 100644 trunk/ceph/common/ThreadPool.h delete mode 100644 trunk/ceph/common/Timer.cc delete mode 100644 trunk/ceph/common/Timer.h delete mode 100644 trunk/ceph/cosd.cc delete mode 100644 trunk/ceph/crush.old/BinaryTree.h delete mode 100644 trunk/ceph/crush.old/Bucket.h delete mode 100644 trunk/ceph/crush.old/Hash.h delete mode 100644 trunk/ceph/crush.old/crush.h delete mode 100644 trunk/ceph/crush.old/test/bucket_movement.cc delete mode 100644 trunk/ceph/crush.old/test/bucket_variance.cc delete mode 100644 trunk/ceph/crush.old/test/cluster_movement.cc delete mode 100644 trunk/ceph/crush.old/test/cluster_movement_remove.cc delete mode 100644 trunk/ceph/crush.old/test/cluster_movement_rush.cc delete mode 100644 trunk/ceph/crush.old/test/creeping_failure.cc delete mode 100644 trunk/ceph/crush.old/test/creeping_failure_variance.cc delete mode 100644 trunk/ceph/crush.old/test/depth_variance.cc delete mode 100644 trunk/ceph/crush.old/test/mixed.cc delete mode 100644 trunk/ceph/crush.old/test/movement.cc delete mode 100644 trunk/ceph/crush.old/test/movement_failed.cc delete mode 100644 trunk/ceph/crush.old/test/overload.cc delete mode 100644 trunk/ceph/crush.old/test/overload_variance.cc delete mode 100644 trunk/ceph/crush.old/test/sizes.cc delete mode 100644 trunk/ceph/crush.old/test/smallbucket.cc delete mode 100644 trunk/ceph/crush.old/test/speed_bucket.cc delete mode 100644 trunk/ceph/crush.old/test/speed_depth.cc delete mode 100644 trunk/ceph/crush.old/test/speed_rush.cc delete mode 100644 trunk/ceph/crush.old/test/t.cc delete mode 100644 trunk/ceph/crush.old/test/testbucket.cc delete mode 100644 trunk/ceph/crush.old/test/testnormal.cc delete mode 100644 trunk/ceph/crush/buckets.c delete mode 100644 trunk/ceph/crush/hash.h delete mode 100644 trunk/ceph/crush/mapper.h delete mode 100644 trunk/ceph/crush/types.h delete mode 100644 trunk/ceph/csyn.cc delete mode 100644 
trunk/ceph/doc/Commitdir.txt delete mode 100644 trunk/ceph/doc/anchortable.txt delete mode 100644 trunk/ceph/doc/bdb.txt delete mode 100644 trunk/ceph/doc/caching.txt delete mode 100644 trunk/ceph/doc/exports.txt delete mode 100644 trunk/ceph/doc/header.txt delete mode 100644 trunk/ceph/doc/inos.txt delete mode 100644 trunk/ceph/doc/lazy_posix.txt delete mode 100644 trunk/ceph/doc/mds_locks.txt delete mode 100644 trunk/ceph/doc/modeline.txt delete mode 100644 trunk/ceph/doc/shared_write_states_nogo.txt delete mode 100644 trunk/ceph/ebofs/Allocator.cc delete mode 100644 trunk/ceph/ebofs/Allocator.h delete mode 100644 trunk/ceph/ebofs/FileJournal.cc delete mode 100644 trunk/ceph/ebofs/Journal.h delete mode 100644 trunk/ceph/ebofs/mkfs.ebofs.cc delete mode 100644 trunk/ceph/extractosdmaps.cc delete mode 100644 trunk/ceph/include/Context.h delete mode 100644 trunk/ceph/include/Distribution.h delete mode 100644 trunk/ceph/include/bitmapper.h delete mode 100644 trunk/ceph/include/blobhash.h delete mode 100644 trunk/ceph/include/encodable.h delete mode 100644 trunk/ceph/include/error.h delete mode 100644 trunk/ceph/include/filepath.h delete mode 100644 trunk/ceph/include/lru.h delete mode 100644 trunk/ceph/include/rangeset.h delete mode 100644 trunk/ceph/include/statlite.h delete mode 100644 trunk/ceph/include/triple.h delete mode 100644 trunk/ceph/include/uofs.h delete mode 100644 trunk/ceph/include/xlist.h delete mode 100644 trunk/ceph/jobs/alc.tp delete mode 100644 trunk/ceph/jobs/alcdat/makedirs delete mode 100644 trunk/ceph/jobs/alcdat/makedirs.big delete mode 100644 trunk/ceph/jobs/alcdat/makedirs.tput delete mode 100644 trunk/ceph/jobs/alcdat/makefiles.shared delete mode 100644 trunk/ceph/jobs/alcdat/openshared delete mode 100644 trunk/ceph/jobs/alcdat/ossh.include delete mode 100644 trunk/ceph/jobs/alcdat/ossh.include.big delete mode 100644 trunk/ceph/jobs/alcdat/ossh.lib delete mode 100644 trunk/ceph/jobs/alcdat/ossh.lib.big delete mode 100644 
trunk/ceph/jobs/alcdat/striping delete mode 100644 trunk/ceph/jobs/example delete mode 100644 trunk/ceph/jobs/mds/log_striping delete mode 100644 trunk/ceph/jobs/mds/makedir_lat delete mode 100644 trunk/ceph/jobs/mds/makedirs delete mode 100644 trunk/ceph/jobs/mds/opensshlib delete mode 100644 trunk/ceph/jobs/meta1 delete mode 100755 trunk/ceph/jobs/meta1.proc.sh delete mode 100644 trunk/ceph/jobs/osd/ebofs delete mode 100644 trunk/ceph/jobs/osd/mds_log delete mode 100644 trunk/ceph/jobs/osd/osd_threads delete mode 100644 trunk/ceph/jobs/osd/striping delete mode 100644 trunk/ceph/jobs/osd/wr_lat2 delete mode 100644 trunk/ceph/jobs/osd/write_sizes delete mode 100644 trunk/ceph/jobs/rados/map_dist delete mode 100644 trunk/ceph/jobs/rados/rep_lat delete mode 100644 trunk/ceph/jobs/runjobsample delete mode 100644 trunk/ceph/mds/Anchor.h delete mode 100644 trunk/ceph/mds/AnchorClient.cc delete mode 100644 trunk/ceph/mds/AnchorClient.h delete mode 100644 trunk/ceph/mds/AnchorTable.cc delete mode 100644 trunk/ceph/mds/AnchorTable.h delete mode 100644 trunk/ceph/mds/CDentry.cc delete mode 100644 trunk/ceph/mds/CDentry.h delete mode 100644 trunk/ceph/mds/CDir.cc delete mode 100644 trunk/ceph/mds/CDir.h delete mode 100644 trunk/ceph/mds/CInode.cc delete mode 100644 trunk/ceph/mds/CInode.h delete mode 100644 trunk/ceph/mds/Capability.h delete mode 100644 trunk/ceph/mds/ClientMap.cc delete mode 100644 trunk/ceph/mds/ClientMap.h delete mode 100644 trunk/ceph/mds/FileLock.h delete mode 100644 trunk/ceph/mds/IdAllocator.cc delete mode 100644 trunk/ceph/mds/IdAllocator.h delete mode 100644 trunk/ceph/mds/LocalLock.h delete mode 100644 trunk/ceph/mds/Locker.h delete mode 100644 trunk/ceph/mds/LogEvent.cc delete mode 100644 trunk/ceph/mds/LogEvent.h delete mode 100644 trunk/ceph/mds/LogSegment.h delete mode 100644 trunk/ceph/mds/MDBalancer.h delete mode 100644 trunk/ceph/mds/MDCache.h delete mode 100644 trunk/ceph/mds/MDLog.h delete mode 100644 trunk/ceph/mds/Migrator.h delete mode 
100644 trunk/ceph/mds/ScatterLock.h delete mode 100644 trunk/ceph/mds/Server.h delete mode 100644 trunk/ceph/mds/SimpleLock.h delete mode 100644 trunk/ceph/mds/events/EAnchor.h delete mode 100644 trunk/ceph/mds/events/EAnchorClient.h delete mode 100644 trunk/ceph/mds/events/EExport.h delete mode 100644 trunk/ceph/mds/events/EFragment.h delete mode 100644 trunk/ceph/mds/events/EImportFinish.h delete mode 100644 trunk/ceph/mds/events/EImportStart.h delete mode 100644 trunk/ceph/mds/events/EMetaBlob.h delete mode 100644 trunk/ceph/mds/events/EOpen.h delete mode 100644 trunk/ceph/mds/events/EPurgeFinish.h delete mode 100644 trunk/ceph/mds/events/ESession.h delete mode 100644 trunk/ceph/mds/events/ESessions.h delete mode 100644 trunk/ceph/mds/events/ESlaveUpdate.h delete mode 100644 trunk/ceph/mds/events/EString.h delete mode 100644 trunk/ceph/mds/events/ESubtreeMap.h delete mode 100644 trunk/ceph/mds/events/EUpdate.h delete mode 100644 trunk/ceph/mds/journal.cc delete mode 100644 trunk/ceph/messages/MAnchor.h delete mode 100644 trunk/ceph/messages/MCacheExpire.h delete mode 100644 trunk/ceph/messages/MDentryUnlink.h delete mode 100644 trunk/ceph/messages/MDirUpdate.h delete mode 100644 trunk/ceph/messages/MDiscover.h delete mode 100644 trunk/ceph/messages/MDiscoverReply.h delete mode 100644 trunk/ceph/messages/MExportCaps.h delete mode 100644 trunk/ceph/messages/MExportCapsAck.h delete mode 100644 trunk/ceph/messages/MExportDir.h delete mode 100644 trunk/ceph/messages/MExportDirAck.h delete mode 100644 trunk/ceph/messages/MExportDirCancel.h delete mode 100644 trunk/ceph/messages/MExportDirDiscover.h delete mode 100644 trunk/ceph/messages/MExportDirDiscoverAck.h delete mode 100644 trunk/ceph/messages/MExportDirFinish.h delete mode 100644 trunk/ceph/messages/MExportDirNotify.h delete mode 100644 trunk/ceph/messages/MExportDirNotifyAck.h delete mode 100644 trunk/ceph/messages/MExportDirPrep.h delete mode 100644 trunk/ceph/messages/MExportDirPrepAck.h delete mode 100644 
trunk/ceph/messages/MExportDirWarning.h delete mode 100644 trunk/ceph/messages/MExportDirWarningAck.h delete mode 100644 trunk/ceph/messages/MGenericMessage.h delete mode 100644 trunk/ceph/messages/MHeartbeat.h delete mode 100644 trunk/ceph/messages/MInodeFileCaps.h delete mode 100644 trunk/ceph/messages/MMDSBeacon.h delete mode 100644 trunk/ceph/messages/MMDSBoot.h delete mode 100644 trunk/ceph/messages/MMDSCacheRejoin.h delete mode 100644 trunk/ceph/messages/MMDSFragmentNotify.h delete mode 100644 trunk/ceph/messages/MMDSResolve.h delete mode 100644 trunk/ceph/messages/MMDSResolveAck.h delete mode 100644 trunk/ceph/messages/MMDSSlaveRequest.h delete mode 100644 trunk/ceph/messages/MMonCommand.h delete mode 100644 trunk/ceph/messages/MMonCommandAck.h delete mode 100644 trunk/ceph/messages/MMonElection.h delete mode 100644 trunk/ceph/messages/MMonElectionCollect.h delete mode 100644 trunk/ceph/messages/MMonElectionRefresh.h delete mode 100644 trunk/ceph/messages/MMonElectionStatus.h delete mode 100644 trunk/ceph/messages/MMonOSDMapInfo.h delete mode 100644 trunk/ceph/messages/MMonOSDMapLease.h delete mode 100644 trunk/ceph/messages/MMonOSDMapLeaseAck.h delete mode 100644 trunk/ceph/messages/MMonOSDMapUpdateAck.h delete mode 100644 trunk/ceph/messages/MMonOSDMapUpdateCommit.h delete mode 100644 trunk/ceph/messages/MMonOSDMapUpdatePrepare.h delete mode 100644 trunk/ceph/messages/MMonPaxos.h delete mode 100644 trunk/ceph/messages/MOSDBoot.h delete mode 100644 trunk/ceph/messages/MOSDFailure.h delete mode 100644 trunk/ceph/messages/MOSDIn.h delete mode 100644 trunk/ceph/messages/MOSDOut.h delete mode 100644 trunk/ceph/messages/MOSDPGActivateSet.h delete mode 100644 trunk/ceph/messages/MOSDPGLog.h delete mode 100644 trunk/ceph/messages/MOSDPGNotify.h delete mode 100644 trunk/ceph/messages/MOSDPGPeer.h delete mode 100644 trunk/ceph/messages/MOSDPGPeerAck.h delete mode 100644 trunk/ceph/messages/MOSDPGPeerRequest.h delete mode 100644 trunk/ceph/messages/MOSDPGRemove.h 
delete mode 100644 trunk/ceph/messages/MOSDPGSummary.h delete mode 100644 trunk/ceph/messages/MOSDPGUpdate.h delete mode 100644 trunk/ceph/messages/MOSDPing.h delete mode 100644 trunk/ceph/messages/MPGStats.h delete mode 100644 trunk/ceph/mon/Elector.h delete mode 100644 trunk/ceph/mon/MonitorStore.h delete mode 100644 trunk/ceph/mon/PGMap.h delete mode 100644 trunk/ceph/mon/PGMonitor.h delete mode 100644 trunk/ceph/mon/PaxosService.cc delete mode 100644 trunk/ceph/mon/mon_types.h delete mode 100644 trunk/ceph/msg/Dispatcher.cc delete mode 100644 trunk/ceph/msg/Dispatcher.h delete mode 100644 trunk/ceph/msg/Messenger.cc delete mode 100644 trunk/ceph/msg/tcp.cc delete mode 100644 trunk/ceph/osbdb/OSBDB.cc delete mode 100644 trunk/ceph/osbdb/OSBDB.h delete mode 100644 trunk/ceph/osd/Ager.cc delete mode 100644 trunk/ceph/osd/Ager.h delete mode 100644 trunk/ceph/osd/BDBMap.h delete mode 100644 trunk/ceph/osd/FakeStoreBDBCollections.h delete mode 100644 trunk/ceph/osd/ObjectStore.cc delete mode 100644 trunk/ceph/osdc/Blinker.h delete mode 100644 trunk/ceph/osdc/Filer.cc delete mode 100644 trunk/ceph/osdc/Filer.h delete mode 100644 trunk/ceph/osdc/Journaler.cc delete mode 100644 trunk/ceph/osdc/Journaler.h delete mode 100644 trunk/ceph/osdc/ObjectCacher.cc delete mode 100644 trunk/ceph/osdc/ObjectCacher.h delete mode 100755 trunk/ceph/script/add_header.pl delete mode 100755 trunk/ceph/script/adjusttabs.pl delete mode 100755 trunk/ceph/script/check_cache_dumps.pl delete mode 100755 trunk/ceph/script/clean_osd_cow.sh delete mode 100755 trunk/ceph/script/clean_trace.pl delete mode 100755 trunk/ceph/script/comb.pl delete mode 100755 trunk/ceph/script/convert_soe_trace.pl delete mode 100755 trunk/ceph/script/find_auth_pins.pl delete mode 100755 trunk/ceph/script/find_bufferleaks.pl delete mode 100755 trunk/ceph/script/find_lost_bdev_ops.pl delete mode 100755 trunk/ceph/script/find_lost_commit.pl delete mode 100755 trunk/ceph/script/find_lost_objecter.pl delete mode 100755 
trunk/ceph/script/find_pathpins.pl delete mode 100755 trunk/ceph/script/find_requests.pl delete mode 100755 trunk/ceph/script/find_waiters.pl delete mode 100755 trunk/ceph/script/fix_modeline.pl delete mode 100755 trunk/ceph/script/gprofnewsyn delete mode 100755 trunk/ceph/script/grepblock delete mode 100755 trunk/ceph/script/merge_cdfs.pl delete mode 100644 trunk/ceph/script/merge_trace_rw.pl delete mode 100755 trunk/ceph/script/plot.pl delete mode 100755 trunk/ceph/script/profonly.pl delete mode 100755 trunk/ceph/script/runjob.pl delete mode 100755 trunk/ceph/script/runset.pl delete mode 100755 trunk/ceph/script/smooth.pl delete mode 100755 trunk/ceph/script/study_find.pl delete mode 100755 trunk/ceph/script/study_hardlink_lifetimes.pl delete mode 100644 trunk/ceph/script/study_lookups.pl delete mode 100755 trunk/ceph/script/sum.pl delete mode 100644 trunk/ceph/test/fakemds.cc delete mode 100644 trunk/ceph/test/fg.cc delete mode 100644 trunk/ceph/test/gprof-helper.c delete mode 100644 trunk/ceph/test/makedirs.cc delete mode 100644 trunk/ceph/test/mpitest.cc delete mode 100644 trunk/ceph/test/mttest.cc delete mode 100644 trunk/ceph/test/rushconfig delete mode 100644 trunk/ceph/test/rushtest.cc delete mode 100644 trunk/ceph/test/rushtest.cc~ delete mode 100644 trunk/ceph/test/test_disk_bw.cc delete mode 100644 trunk/ceph/test/testbucket.cc delete mode 100644 trunk/ceph/test/testbuffers.cc delete mode 100644 trunk/ceph/test/testcounter.cc delete mode 100644 trunk/ceph/test/testcrush.cc delete mode 100644 trunk/ceph/test/testfilepath.cc delete mode 100644 trunk/ceph/test/testmpi.cc delete mode 100644 trunk/ceph/test/testnewbuffers.cc delete mode 100644 trunk/ceph/test/testos.cc delete mode 100644 trunk/ceph/test/testosbdb.cc delete mode 100644 trunk/ceph/test/testtree.cc delete mode 100644 trunk/ceph/test/testxattr.cc rename {trunk/web => web}/Makefile (100%) rename {trunk/web => web}/ceph.css (100%) rename {trunk/web => web}/gen.pl (100%) rename {trunk/web => 
web}/images/ceph-architecture.png (100%) rename {trunk/web => web}/images/ceph-logo1.jpg (100%) rename {trunk/web => web}/index.body (100%) rename {trunk/web => web}/overview.body (100%) rename {trunk/web => web}/publications.body (100%) rename {trunk/web => web}/source.body (100%) rename {trunk/web => web}/tasks.body (100%) rename {trunk/web => web}/template.html (100%) diff --git a/trunk/bench/mdtest/COPYRIGHT b/bench/mdtest/COPYRIGHT similarity index 100% rename from trunk/bench/mdtest/COPYRIGHT rename to bench/mdtest/COPYRIGHT diff --git a/trunk/bench/mdtest/Makefile b/bench/mdtest/Makefile similarity index 100% rename from trunk/bench/mdtest/Makefile rename to bench/mdtest/Makefile diff --git a/trunk/bench/mdtest/README b/bench/mdtest/README similarity index 100% rename from trunk/bench/mdtest/README rename to bench/mdtest/README diff --git a/trunk/bench/mdtest/mdtest.c b/bench/mdtest/mdtest.c similarity index 100% rename from trunk/bench/mdtest/mdtest.c rename to bench/mdtest/mdtest.c diff --git a/branches/marnberg/quota/Makefile b/branches/marnberg/quota/Makefile deleted file mode 100644 index 8fa037a3984e2..0000000000000 --- a/branches/marnberg/quota/Makefile +++ /dev/null @@ -1,257 +0,0 @@ - -# mpicxx must be on your path to build newsyn. on googoo, this means -# that /usr/local/mpich2-1.0.2/bin must be on your path. - -# For now, use g++ most of the time. -# When compiling MPI stuff, specify myfile.cc instead of myfile.o so -# that ${MPICC} is invoked instead of the generic .o rule (or it'll -# use g++). This makes it less annoying to build on non-mpi hosts for -# dev work, and seems to behave just fine... change ${CC} back to -# mpicxx if you get paranoid. - -#CC = g++ -#CFLAGS = -g -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE -#LIBS = -lpthread - -# Hook for extra -I options, etc. -EXTRA_CFLAGS = - -ifeq ($(target),darwin) -# For Darwin -CFLAGS = -g -Wall -I. 
-D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE -DDARWIN -D__FreeBSD__=10 ${EXTRA_CFLAGS} -LDINC = ar -rc -else -# For linux -CFLAGS = -g -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE -LDINC = ld -i -o -endif - -CC = g++ -LIBS = -lpthread - -ifeq ($(want_bdb),yes) -CFLAGS += -DUSE_OSBDB -OSBDB_LIBS = -ldb_cxx -endif - -#for normal mpich2 machines -MPICC = mpicxx -MPICFLAGS = ${CFLAGS} -MPILIBS = ${LIBS} - -#for LLNL boxes without mpicxx -#MPICC = g++ -#MPICFLAGS = ${CFLAGS} -I/usr/lib/mpi/include -L/usr/lib/mpi/mpi_gnu/lib -#MPILIBS = ${LIBS} -lelan -lmpi - -EBOFS_OBJS= \ - ebofs/BlockDevice.o\ - ebofs/BufferCache.o\ - ebofs/Ebofs.o\ - ebofs/Allocator.o - -MDS_OBJS= \ - mds/MDS.o\ - mds/journal.o\ - mds/Server.o\ - mds/MDCache.o\ - mds/Locker.o\ - mds/Migrator.o\ - mds/Renamer.o\ - mds/MDBalancer.o\ - mds/CDentry.o\ - mds/CDir.o\ - mds/CInode.o\ - mds/AnchorTable.o\ - mds/AnchorClient.o\ - mds/MDStore.o\ - mds/LogEvent.o\ - mds/IdAllocator.o\ - mds/MDLog.o - -OSD_OBJS= \ - osd/PG.o\ - osd/Ager.o\ - osd/FakeStore.o\ - osd/OSD.o - -OSDC_OBJS= \ - osdc/Objecter.o\ - osdc/ObjectCacher.o\ - osdc/Filer.o\ - osdc/Journaler.o - -MON_OBJS= \ - mon/Monitor.o\ - mon/Paxos.o\ - mon/OSDMonitor.o\ - mon/MDSMonitor.o\ - mon/ClientMonitor.o\ - mon/Elector.o\ - mon/MonitorStore.o - -COMMON_OBJS= \ - msg/Message.o\ - common/Logger.o\ - common/Clock.o\ - common/Timer.o\ - config.o - - -CLIENT_OBJS= \ - client/FileCache.o\ - client/Client.o\ - client/SyntheticClient.o\ - client/Trace.o - -ifeq ($(want_bdb),yes) -OSBDB_OBJS = \ - osbdb/OSBDB.o - -OSBDB_OBJ = osbdb.o -endif - -TARGETS = cmon cosd cmds csyn newsyn fakesyn mkmonmap cfuse fakefuse - -SRCS=*.cc */*.cc *.h */*.h */*/*.h - -all: depend ${TARGETS} - -test: depend ${TEST_TARGETS} - -obfs: depend obfstest - - -# real bits -mkmonmap: mkmonmap.cc common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -cmon: cmon.cc mon.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} 
${LIBS} $^ -o $@ - -cosd: cosd.cc osd.o ebofs.o ${OSBDB_OBJ} msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} ${OSBDB_LIBS} $^ -o $@ - -cmds: cmds.cc mds.o osdc.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -csyn: csyn.cc client.o osdc.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -cfuse: cfuse.cc client.o osdc.o client/fuse.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@ - - -# misc -gprof-helper.so: test/gprof-helper.c - gcc -shared -fPIC test/gprof-helper.c -o gprof-helper.so -lpthread -ldl - - -# fake* -fakefuse: fakefuse.cc mon.o mds.o client.o osd.o osdc.o ebofs.o ${OSBDB_OBJ} client/fuse.o msg/FakeMessenger.o common.o - ${CC} -pg ${CFLAGS} ${LIBS} ${OSBDB_LIBS} -lfuse $^ -o $@ - -fakesyn: fakesyn.cc mon.o mds.o client.o osd.o ebofs.o ${OSBDB_OBJ} osdc.o msg/FakeMessenger.o common.o - ${CC} -pg ${CFLAGS} ${LIBS} ${OSBDB_LIBS} $^ -o $@ - - -# mpi startup -newsyn: newsyn.cc mon.o mds.o client.o osd.o ebofs.o ${OSBDB_OBJ} osdc.o msg/SimpleMessenger.o common.o - ${MPICC} -pg ${MPICFLAGS} ${MPILIBS} ${OSBDB_LIBS} $^ -o $@ - -newsyn.nopg: newsyn.cc mon.o mds.o client.o osd.o ebofs.o ${OSBDB_OBJ} osdc.o msg/SimpleMessenger.o common.o - ${MPICC} ${MPICFLAGS} ${MPILIBS} ${OSBDB_LIBS} $^ -o $@ - - -# ebofs -mkfs.ebofs: ebofs/mkfs.ebofs.cc config.cc common/Clock.o ebofs.o - ${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@ - -test.ebofs: ebofs/test.ebofs.cc config.cc common/Clock.o ebofs.o - ${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@ - - -# + obfs (old) -fakesynobfs: fakesyn.cc mds.o client.o osd_obfs.o msg/FakeMessenger.o common.o - ${CC} -DUSE_OBFS ${CFLAGS} ${LIBS} $^ -o $@ - -tcpsynobfs: tcpsyn.cc mds.o client.o osd_obfs.o ${TCP_OBJS} common.o - ${MPICC} -DUSE_OBFS ${MPICFLAGS} ${MPILIBS} $^ -o $@ - -osd_obfs.o: osd/OBFSStore.o osd/OSD.cc osd/PG.o osd/ObjectStore.o osd/FakeStore.o - ${MPICC} -DUSE_OBFS ${MPICFLAGS} ${MPILIBS} $^ -o $@ ../uofs/uofs.a - - - -# libceph -libceph.o: client/ldceph.o 
client/Client.o msg/SimpleMessenger.o ${COMMON_OBJS} ${SYN_OBJS} ${OSDC_OBJS} - ${LDINC} $^ -o $@ - -bench/mdtest/mdtest.o: bench/mdtest/mdtest.c - mpicc -c $^ -o $@ - -mdtest: bench/mdtest/mdtest.o - ${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@ - -mdtest.ceph: bench/mdtest/mdtest.o libceph.o - ${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@ - -# OSD test - -testos: test/testos.o ebofs.o osbdb.o common.o - ${CC} ${CFLAGS} ${LIBS} ${OSBDB_LIBS} -o $@ $^ - -# - -%.so: %.cc - ${CC} -shared -fPIC ${CFLAGS} $< -o $@ - -clean: - rm -f *.o */*.o ${TARGETS} ${TEST_TARGETS} - -common.o: ${COMMON_OBJS} - ${LDINC} $@ $^ - -ebofs.o: ${EBOFS_OBJS} - ${LDINC} $@ $^ - -client.o: ${CLIENT_OBJS} - ${LDINC} $@ $^ - -osd.o: ${OSD_OBJS} - ${LDINC} $@ $^ - -osdc.o: ${OSDC_OBJS} - ${LDINC} $@ $^ - -mds.o: ${MDS_OBJS} - ${LDINC} $@ $^ - -mon.o: ${MON_OBJS} - ${LDINC} $@ $^ - -osbdb.o: ${OSBDB_OBJS} - ${LDINC} $@ $^ - -%.o: %.cc - ${CC} ${CFLAGS} -c $< -o $@ - -%.po: %.cc - ${CC} -fPIC ${CFLAGS} -c $< -o $@ - -count: - cat ${SRCS} | wc -l - cat ${SRCS} | grep -c \; - -TAGS: - etags `find . -name "*.[h|cc]"` - -.depend: - touch .depend - -depend: - $(RM) .depend - makedepend -f- -- $(CFLAGS) -- $(SRCS) > .depend 2>/dev/null - -# now add a line to include the dependency list. -include .depend diff --git a/branches/marnberg/quota/TODO b/branches/marnberg/quota/TODO deleted file mode 100644 index 8a64da39dfc8a..0000000000000 --- a/branches/marnberg/quota/TODO +++ /dev/null @@ -1,322 +0,0 @@ - - -monitor -- finish generic paxos - -osdmon -- distribute w/ paxos framework -- allow fresh replacement osds. add osd_created in osdmap, probably -- monitor needs to monitor some osds... -- monitor pg states, notify on out? -- watch osd utilization; adjust overload in cluster map - -mdsmon -- distribute w/ paxos framework - -journaler -- fix up for large events (e.g. imports) -- use set_floor_and_read for safe takeover from possibly-not-quite-dead otherguy. 
-- should we pad with zeros to avoid splitting individual entries? - - make it a g_conf flag? - - have to fix reader to skip over zeros (either <4 bytes for size, or zeroed sizes) -- need to truncate at detected (valid) write_pos to clear out any other partial trailing writes - - -crush -- xml import/export? -- crush tools - - -rados+ebofs -- purge replicated writes from cache. (with exception of partial tail blocks.) - -rados paper todo? -- better experiments - - berkeleydb objectstore? -- flush log only in response to subsequent read or write? -- better behaving recovery -- justify use of splay. - - dynamic replication -- snapshots - -rados snapshots -- integrate revisions into ObjectCacher -- clean up oid.rev vs op.rev in osd+osdc - -- attr.crev is rev we were created in. -- oid.rev=0 is "live". defined for attr.crev <= rev. -- otherwise, defined for attr.crev <= rev < oid.rev (i.e. oid.rev is upper bound, non-inclusive.) - -- write|delete is tagged with op.rev - - if attr.crev < op.rev - - we clone to oid.rev=rev (clone keeps old crev) - - change live attr.crev=rev. - - apply update -- read is tagged with op.rev - - if 0, we read from 0 (if it exists). - - otherwise we choose object rev based on op.rev vs oid.rev, and then verifying attr.crev <= op.rev. - -- how to get usage feedback to monitor? - -- change messenger entity_inst_t - - no more rank! make it a uniquish nonce? - -- clean up mds caps release in exporter -- figure out client failure modes -- clean up messenger failure modes. -- add connection retry. - - -objecter -- read+floor_lockout - -osd/rados -- read+floor_lockout for clean STOGITH-like/fencing semantics after failover. -- separate out replication code into a PG class, to pave way for RAID - -- efficiently replicate clone() objects -- pg_num instead of pg_bits -- flag missing log entries on crash recovery --> WRNOOP? or WRLOST? 
-- consider implications of nvram writeahead logs -- fix heartbeat wrt new replication -- mark residual pgs obsolete ??? -- rdlocks -- optimize remove wrt recovery pushes -- pg_bit/pg_num changes -- report crashed pgs? - -simplemessenger -- close idle connections -- retry, timeout on connection or transmission failure - -objectcacher -- ocacher caps transitions vs locks -- test read locks - -reliability -- heartbeat vs ping? -- osdmonitor, filter - -ebofs -- verify proper behavior of conflicting/overlapping reads of clones -- test(fix) sync() -- combine inodes and/or cnodes into same blocks -- allow btree sets instead of maps -- eliminate nodepools -- nonblocking write on missing onodes? -- fix bug in node rotation on insert (and reenable) -- fix NEAR_LAST_FWD (?) -- journaling? in NVRAM? -- metadata in nvram? flash? - - -remaining hard problems -- how to cope with file size changes and read/write sharing - - -crush -- more efficient failure when all/too many osds are down -- allow forcefeed for more complicated rule structures. (e.g. make force_stack a list< set >) - - -mds -- distributed client management -- anchormgr - - 2pc - - independent journal? - - distributed? -- link count management - - also 2pc -- chdir (directory opens!) -- rewrite logstream - - clean up - - be smart about rados ack vs reread - - log locking? root log object - - trimming, rotation - -- efficient stat for single writers -- lstat vs stat -- add FILE_CAP_EXTEND capability bit -- only share osdmap updates with clients holding capabilities -- delayed replica caps release... we need to set a timer event? (and cancel it when appropriate?) -- finish hard links! - - reclaim danglers from inode file on discover... 
- - fix rename wrt hard links -- interactive hash/unhash interface -- test hashed readdir -- make logstream.flush align itself to stripes - -- carefully define/document frozen wrt dir_auth vs hashing - - - -client -- fstat -- make_request: cope with mds failure -- mixed lazy and non-lazy io will clobber each others' caps in the buffer cache.. how to isolate.. -- test client caps migration w/ mds exports -- some heuristic behavior to consolidate caps to inode auth? - - - -MDS TODO -- fix hashed readdir: should (optionally) do a lock on dir namespace? -- fix hard links - - they mostly work, but they're fragile -- sync clients on stat - - will need to ditch 10s client metadata caching before this is useful - - implement truncate -- implement hashed directories -- statfs? -- rewrite journal + recovery -- figure out online failure recovery -- more distributed fh management? -- btree directories (for efficient large directories) -- consistency points/snapshots - -- fix MExportAck and others to use dir+dentry, not inode - (otherwise this all breaks with hard links.. altho it probably needs reworking already?) - - - - - -why qsync could be wrong (for very strict POSIX) : varying mds -> client message transit or processing times. -- mds -> 1,2 : qsync -- client1 writes at byte 100 -- client1 -> mds : qsync reply (size=100) -- client1 writes at byte 300 -- client1 -> client2 (outside channel) -- client2 writes at byte 200 -- client2 -> mds : qsync reply (size=200) --> stat results in size 200, even though at no single point in time was the max size 500. --> for correct result, need to _stop_ client writers while gathering metadata. - - -SAGE: - -- string table? - -- hard links - - fix MExportAck and others to use dir+dentry, not inode - (otherwise this all breaks with hard links.. altho it probably needs reworking already!) - -- do real permission checks? 
- - - - - - -ISSUES - - -- discover - - soft: authority selectively repicates, or sets a 'forward' flag in reply - - hard: authority always replicates (eg. discover for export) - - forward flag (see soft) - - error flag (if file not found, etc.) - - [what was i talking about?] make sure waiters are properly triggered, either upon dir_rep update, or (empty!) discover reply - - - -DOCUMENT -- cache, distributed cache structure and invariants -- export process -- hash/unhash process - - -TEST -- hashing - - test hash/unhash operation - - hash+export: encode list of replicated dir inodes so they can be discovered before import is procesed. - - test nauthitems (wrt hashing?) - - -IMPLEMENT - -- smarter balancing - - popularity calculation and management is inconsistent/wrong. - - does it work? - -- dump active config in run output somewhere - - - - - - - - - - -==== MDS RECOVERY ==== - -- how to reliably deliver cache expire messages? - - how should proxy behave? - - exporter failure - - all cacheexpire info has been passed on up until point where export is permanent. no impact. - - importer failure - - exporter collects expire info, so that it can reverse. - - ??? - - maybe hosts should double-up expires until after export is known to have committed? ---> just send expires to both nodes. dir_auth+dir_auth2. clean up export ack/notify process. :) - -*** dar... no, separate bystander dir_auth updates from the prepare/ack/commit cycle! -- expire should go to both old and new auth -- set_dir_auth should take optional second auth, and authority() should optionally set/return a second possible auth -- does inode need it's own replica list? no! -- dirslices. - - -/- exporter recovery if importer fails during EXPORT_EXPORTING stage -- importer recovery if exporter fails - -/?- delay response to sending import_map if export in progress? -/?- finish export before sending import_map? -/- ambiguous imports on active node should include in-progress imports! 
-/- how to effectively trim cache after resolve but before rejoin -/ - we need to eliminate unneed non-auth metadata, without hosing potentially useful auth metadata - -- osd needs a set_floor_and_read op for safe failover/STOGITH-like semantics. - -- failures during recovery stages (resolve, rejoin)... make sure rejoin still works! - -- fix mds initial osdmap weirdness (which will currently screw up on standby -> almost anything) - - -importmap only sent after exports have completed. -failures update export ack waitlists, so exports will compelte if unrelated nodes fail. -importmap can be sent regardless of import status -- pending import is just flagged ambiguous. -failure of exporter induces some cleanup on importer. importer will disambiguate when it gets an importmap on exporter recovery. -failure of importer induces cleanup on exporter. no ambiguity. - - -/- no new mds may join if cluster is in a recovery state. starting -> standby (unless failed) -/ - make sure creating -> standby, and are not included in recovery set? - - -mdsmap notes -- mds don't care about intervening states, except rejoin > active, and - that transition requires active involvement. thus, no need worry - about delivering/processing the full sequence of maps. - -blech: -- EMetablob should return 'expired' if they have - higher versions (and are thus described by a newer journal entry) - -mds -- mds falure vs clients - - clean up client op redirection - - idempotent ops - -- journal+recovery - - unlink - - open(wr cap), open+create - - file capabilities i/o - - link - - rename - -- should auth_pins really go to the root? - - FIXME: auth_pins on importer versus import beneath an authpinned region? 
- diff --git a/branches/marnberg/quota/cfuse.cc b/branches/marnberg/quota/cfuse.cc deleted file mode 100644 index 4b7e490c26b76..0000000000000 --- a/branches/marnberg/quota/cfuse.cc +++ /dev/null @@ -1,81 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "client/Client.h" -#include "client/fuse.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - -#ifndef DARWIN -#include -#endif // DARWIN - -#include -#include -#include - -int main(int argc, char **argv, char *envp[]) { - - //cerr << "cfuse starting " << myrank << "/" << world << endl; - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - // args for fuse - vec_to_argv(args, argc, argv); - - // FUSE will chdir("/"); be ready. - g_conf.use_abspaths = true; - - // load monmap - MonMap monmap; - int r = monmap.read(".ceph_monmap"); - assert(r >= 0); - - // start up network - rank.start_rank(); - - // start client - Client *client = new Client(rank.register_entity(MSG_ADDR_CLIENT_NEW), &monmap); - client->init(); - - // start up fuse - // use my argc, argv (make sure you pass a mount point!) 
- cout << "mounting" << endl; - client->mount(); - - cerr << "starting fuse on pid " << getpid() << endl; - ceph_fuse_main(client, argc, argv); - cerr << "fuse finished on pid " << getpid() << endl; - - client->unmount(); - cout << "unmounted" << endl; - client->shutdown(); - - delete client; - - // wait for messenger to finish - rank.wait(); - - return 0; -} - diff --git a/branches/marnberg/quota/client/Client.cc b/branches/marnberg/quota/client/Client.cc deleted file mode 100644 index 4e4a6a1b6b737..0000000000000 --- a/branches/marnberg/quota/client/Client.cc +++ /dev/null @@ -1,2757 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -// unix-ey fs stuff -#include -#include -#include -#include -#include -#include - -#include - - -#include -using namespace std; - - -// ceph stuff -#include "Client.h" - - -#include "messages/MClientBoot.h" -#include "messages/MClientMount.h" -#include "messages/MClientMountAck.h" -#include "messages/MClientFileCaps.h" - -#include "messages/MGenericMessage.h" - -#include "messages/MMDSGetMap.h" -#include "messages/MMDSMap.h" - -#include "osdc/Filer.h" -#include "osdc/Objecter.h" -#include "osdc/ObjectCacher.h" - -#include "common/Cond.h" -#include "common/Mutex.h" -#include "common/Logger.h" - - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_client) cout << g_clock.now() << " client" << whoami << "." 
<< pthread_self() << " " - -#define tout if (g_conf.client_trace) cout << "trace: " - - -// static logger -LogType client_logtype; -Logger *client_logger = 0; - - - -class C_Client_CloseRelease : public Context { - Client *cl; - Inode *in; -public: - C_Client_CloseRelease(Client *c, Inode *i) : cl(c), in(i) {} - void finish(int) { - cl->close_release(in); - } -}; - -class C_Client_CloseSafe : public Context { - Client *cl; - Inode *in; -public: - C_Client_CloseSafe(Client *c, Inode *i) : cl(c), in(i) {} - void finish(int) { - cl->close_safe(in); - } -}; - - - - - - -// cons/des - -Client::Client(Messenger *m, MonMap *mm) -{ - // which client am i? - whoami = m->get_myname().num(); - monmap = mm; - - mounted = false; - unmounting = false; - - last_tid = 0; - unsafe_sync_write = 0; - - mdsmap = 0; - - // - root = 0; - - set_cache_size(g_conf.client_cache_size); - - // file handles - free_fh_set.insert(10, 1<<30); - - // set up messengers - messenger = m; - messenger->set_dispatcher(this); - - // osd interfaces - osdmap = new OSDMap(); // initially blank.. see mount() - objecter = new Objecter(messenger, monmap, osdmap); - objectcacher = new ObjectCacher(objecter, client_lock); - filer = new Filer(objecter); -} - - -Client::~Client() -{ - tear_down_cache(); - - if (objectcacher) { - delete objectcacher; - objectcacher = 0; - } - - if (filer) { delete filer; filer = 0; } - if (objecter) { delete objecter; objecter = 0; } - if (osdmap) { delete osdmap; osdmap = 0; } - if (mdsmap) { delete mdsmap; mdsmap = 0; } - - if (messenger) { delete messenger; messenger = 0; } -} - - -void Client::tear_down_cache() -{ - // fh's - for (hash_map::iterator it = fh_map.begin(); - it != fh_map.end(); - it++) { - Fh *fh = it->second; - dout(1) << "tear_down_cache forcing close of fh " << it->first << " ino " << fh->inode->inode.ino << endl; - put_inode(fh->inode); - delete fh; - } - fh_map.clear(); - - // caps! 
- // *** FIXME *** - - // empty lru - lru.lru_set_max(0); - trim_cache(); - assert(lru.lru_get_size() == 0); - - // close root ino - assert(inode_map.size() <= 1); - if (root && inode_map.size() == 1) { - delete root; - root = 0; - inode_map.clear(); - } - - assert(inode_map.empty()); -} - - - -// debug crapola - -void Client::dump_inode(Inode *in, set& did) -{ - dout(1) << "dump_inode: inode " << in->ino() << " ref " << in->ref << " dir " << in->dir << endl; - - if (in->dir) { - dout(1) << " dir size " << in->dir->dentries.size() << endl; - //for (hash_map, eqstr>::iterator it = in->dir->dentries.begin(); - for (hash_map::iterator it = in->dir->dentries.begin(); - it != in->dir->dentries.end(); - it++) { - dout(1) << " dn " << it->first << " ref " << it->second->ref << endl; - dump_inode(it->second->inode, did); - } - } -} - -void Client::dump_cache() -{ - set did; - - if (root) dump_inode(root, did); - - for (hash_map::iterator it = inode_map.begin(); - it != inode_map.end(); - it++) { - if (did.count(it->second)) continue; - - dout(1) << "dump_cache: inode " << it->first - << " ref " << it->second->ref - << " dir " << it->second->dir << endl; - if (it->second->dir) { - dout(1) << " dir size " << it->second->dir->dentries.size() << endl; - } - } - -} - - -void Client::init() { - -} - -void Client::shutdown() { - dout(1) << "shutdown" << endl; - messenger->shutdown(); -} - - - - -// =================== -// metadata cache stuff - -void Client::trim_cache() -{ - unsigned last = 0; - while (lru.lru_get_size() != last) { - last = lru.lru_get_size(); - - if (lru.lru_get_size() <= lru.lru_get_max()) break; - - // trim! - Dentry *dn = (Dentry*)lru.lru_expire(); - if (!dn) break; // done - - //dout(10) << "trim_cache unlinking dn " << dn->name << " in dir " << hex << dn->dir->inode->inode.ino << endl; - unlink(dn); - } - - // hose root? 
- if (lru.lru_get_size() == 0 && root && inode_map.size() == 1) { - delete root; - root = 0; - inode_map.clear(); - } -} - -/** insert_inode - * - * insert + link a single dentry + inode into the metadata cache. - */ -Inode* Client::insert_inode(Dir *dir, InodeStat *st, const string& dname) -{ - Dentry *dn = NULL; - if (dir->dentries.count(dname)) - dn = dir->dentries[dname]; - - dout(12) << "insert_inode " << dname << " ino " << st->inode.ino - << " size " << st->inode.size - << " mtime " << st->inode.mtime - << " hashed " << st->hashed - << endl; - - if (dn) { - if (dn->inode->inode.ino == st->inode.ino) { - touch_dn(dn); - dout(12) << " had dentry " << dname - << " with correct ino " << dn->inode->inode.ino - << endl; - } else { - dout(12) << " had dentry " << dname - << " with WRONG ino " << dn->inode->inode.ino - << endl; - unlink(dn); - dn = NULL; - } - } - - if (!dn) { - // have inode linked elsewhere? -> unlink and relink! - if (inode_map.count(st->inode.ino)) { - Inode *in = inode_map[st->inode.ino]; - assert(in); - - if (in->dn) { - dout(12) << " had ino " << in->inode.ino - << " linked at wrong position, unlinking" - << endl; - dn = relink(in->dn, dir, dname); - } else { - // link - dout(12) << " had ino " << in->inode.ino - << " unlinked, linking" << endl; - dn = link(dir, dname, in); - } - } - } - - if (!dn) { - Inode *in = new Inode(st->inode, objectcacher); - inode_map[st->inode.ino] = in; - dn = link(dir, dname, in); - dout(12) << " new dentry+node with ino " << st->inode.ino << endl; - } else { - // actually update info - dout(12) << " stat inode mask is " << st->inode.mask << endl; - dn->inode->inode = st->inode; - - // ...but don't clobber our mtime, size! 
- if ((dn->inode->inode.mask & INODE_MASK_SIZE) == 0 && - dn->inode->file_wr_size > dn->inode->inode.size) - dn->inode->inode.size = dn->inode->file_wr_size; - if ((dn->inode->inode.mask & INODE_MASK_MTIME) == 0 && - dn->inode->file_wr_mtime > dn->inode->inode.mtime) - dn->inode->inode.mtime = dn->inode->file_wr_mtime; - } - - // OK, we found it! - assert(dn && dn->inode); - - // or do we have newer size/mtime from writing? - if (dn->inode->file_caps() & CAP_FILE_WR) { - if (dn->inode->file_wr_size > dn->inode->inode.size) - dn->inode->inode.size = dn->inode->file_wr_size; - if (dn->inode->file_wr_mtime > dn->inode->inode.mtime) - dn->inode->inode.mtime = dn->inode->file_wr_mtime; - } - - // symlink? - if ((dn->inode->inode.mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK) { - if (!dn->inode->symlink) - dn->inode->symlink = new string; - *(dn->inode->symlink) = st->symlink; - } - - return dn->inode; -} - -/** update_inode_dist - * - * update MDS location cache for a single inode - */ -void Client::update_inode_dist(Inode *in, InodeStat *st) -{ - // dir info - in->dir_auth = st->dir_auth; - in->dir_hashed = st->hashed; - in->dir_replicated = st->replicated; - - // dir replication - if (st->spec_defined) { - if (st->dist.empty() && !in->dir_contacts.empty()) - dout(9) << "lost dist spec for " << in->inode.ino - << " " << st->dist << endl; - if (!st->dist.empty() && in->dir_contacts.empty()) - dout(9) << "got dist spec for " << in->inode.ino - << " " << st->dist << endl; - in->dir_contacts = st->dist; - } -} - - -/** insert_trace - * - * insert a trace from a MDS reply into the cache. 
- */ -Inode* Client::insert_trace(MClientReply *reply) -{ - Inode *cur = root; - time_t now = time(NULL); - - dout(10) << "insert_trace got " << reply->get_trace_in().size() << " inodes" << endl; - - list::const_iterator pdn = reply->get_trace_dn().begin(); - - for (list::const_iterator pin = reply->get_trace_in().begin(); - pin != reply->get_trace_in().end(); - ++pin) { - - if (pin == reply->get_trace_in().begin()) { - // root - dout(10) << "insert_trace root" << endl; - if (!root) { - // create - cur = root = new Inode((*pin)->inode, objectcacher); - inode_map[root->inode.ino] = root; - } - } else { - // not root. - dout(10) << "insert_trace dn " << *pdn << " ino " << (*pin)->inode.ino << endl; - Dir *dir = cur->open_dir(); - cur = this->insert_inode(dir, *pin, *pdn); - ++pdn; - - // move to top of lru! - if (cur->dn) - lru.lru_touch(cur->dn); - } - - // update dist info - update_inode_dist(cur, *pin); - - // set cache ttl - if (g_conf.client_cache_stat_ttl) - cur->valid_until = now + g_conf.client_cache_stat_ttl; - } - - return cur; -} - - - - -Dentry *Client::lookup(filepath& path) -{ - dout(14) << "lookup " << path << endl; - - Inode *cur = root; - if (!cur) return NULL; - - Dentry *dn = 0; - for (unsigned i=0; iinode << " valid_until " << dn->inode->valid_until << endl; - } else { - dout(14) << " dentry " << path[i] << " dne" << endl; - return NULL; - } - cur = dn->inode; - assert(cur); - } else { - return NULL; // not a dir - } - } - - if (dn) { - dout(11) << "lookup '" << path << "' found " << dn->name << " inode " << dn->inode->inode.ino << " valid_until " << dn->inode->valid_until<< endl; - } - - return dn; -} - -// ------- - -MClientReply *Client::make_request(MClientRequest *req, - bool auth_best, - int use_mds) // this param is purely for debug hacking -{ - // assign a unique tid - req->set_tid(++last_tid); - - // find deepest known prefix - Inode *diri = root; // the deepest known containing dir - Inode *item = 0; // the actual item... 
if we know it - int missing_dn = -1; // which dn we miss on (if we miss) - - unsigned depth = req->get_filepath().depth(); - for (unsigned i=0; iinode.mode & INODE_MODE_DIR && diri->dir) { - Dir *dir = diri->dir; - - // do we have the next dentry? - if (dir->dentries.count( req->get_filepath()[i] ) == 0) { - missing_dn = i; // no. - break; - } - - dout(7) << " have path seg " << i << " on " << diri->dir_auth << " ino " << diri->inode.ino << " " << req->get_filepath()[i] << endl; - - if (i == depth-1) { // last one! - item = dir->dentries[ req->get_filepath()[i] ]->inode; - break; - } - - // continue.. - diri = dir->dentries[ req->get_filepath()[i] ]->inode; - assert(diri); - } else { - missing_dn = i; - break; - } - } - - // choose an mds - int mds = 0; - if (!diri || g_conf.client_use_random_mds) { - // no root info, pick a random MDS - mds = rand() % mdsmap->get_num_mds(); - } else { - if (auth_best) { - // pick the actual auth (as best we can) - if (item) { - mds = item->authority(mdsmap); - } else if (diri->dir_hashed && missing_dn >= 0) { - mds = diri->dentry_authority(req->get_filepath()[missing_dn].c_str(), - mdsmap); - } else { - mds = diri->authority(mdsmap); - } - } else { - // balance our traffic! - if (diri->dir_hashed && missing_dn >= 0) - mds = diri->dentry_authority(req->get_filepath()[missing_dn].c_str(), - mdsmap); - else - mds = diri->pick_replica(mdsmap); - } - } - dout(20) << "mds is " << mds << endl; - - // force use of a particular mds? 
- if (use_mds >= 0) mds = use_mds; - - - // time the call - utime_t start = g_clock.now(); - - bool nojournal = false; - int op = req->get_op(); - if (op == MDS_OP_STAT || - op == MDS_OP_LSTAT || - op == MDS_OP_READDIR || - op == MDS_OP_OPEN || - op == MDS_OP_RELEASE) - nojournal = true; - - MClientReply *reply = sendrecv(req, mds); - - if (client_logger) { - utime_t lat = g_clock.now(); - lat -= start; - dout(20) << "lat " << lat << endl; - client_logger->finc("lsum",(double)lat); - client_logger->inc("lnum"); - - if (nojournal) { - client_logger->finc("lrsum",(double)lat); - client_logger->inc("lrnum"); - } else { - client_logger->finc("lwsum",(double)lat); - client_logger->inc("lwnum"); - } - - if (op == MDS_OP_STAT) { - client_logger->finc("lstatsum",(double)lat); - client_logger->inc("lstatnum"); - } - else if (op == MDS_OP_READDIR) { - client_logger->finc("ldirsum",(double)lat); - client_logger->inc("ldirnum"); - } - - } - - return reply; -} - - -MClientReply* Client::sendrecv(MClientRequest *req, int mds) -{ - // NEW way. - Cond cond; - tid_t tid = req->get_tid(); - mds_rpc_cond[tid] = &cond; - - messenger->send_message(req, mdsmap->get_inst(mds), MDS_PORT_SERVER); - - // wait - while (mds_rpc_reply.count(tid) == 0) { - dout(20) << "sendrecv awaiting reply kick on " << &cond << endl; - cond.Wait(client_lock); - } - - // got it! - MClientReply *reply = mds_rpc_reply[tid]; - - // kick dispatcher (we've got it!) - assert(mds_rpc_dispatch_cond.count(tid)); - mds_rpc_dispatch_cond[tid]->Signal(); - dout(20) << "sendrecv kickback on tid " << tid << " " << mds_rpc_dispatch_cond[tid] << endl; - - // clean up. 
- mds_rpc_cond.erase(tid); - mds_rpc_reply.erase(tid); - - return reply; -} - -void Client::handle_client_reply(MClientReply *reply) -{ - tid_t tid = reply->get_tid(); - - // store reply - mds_rpc_reply[tid] = reply; - - // wake up waiter - assert(mds_rpc_cond.count(tid)); - dout(20) << "handle_client_reply kicking caller on " << mds_rpc_cond[tid] << endl; - mds_rpc_cond[tid]->Signal(); - - // wake for kick back - assert(mds_rpc_dispatch_cond.count(tid) == 0); - Cond cond; - mds_rpc_dispatch_cond[tid] = &cond; - while (mds_rpc_cond.count(tid)) { - dout(20) << "handle_client_reply awaiting kickback on tid " << tid << " " << &cond << endl; - cond.Wait(client_lock); - } - - // ok, clean up! - mds_rpc_dispatch_cond.erase(tid); -} - - -// ------------------------ -// incoming messages - -void Client::dispatch(Message *m) -{ - client_lock.Lock(); - - switch (m->get_type()) { - // osd - case MSG_OSD_OPREPLY: - objecter->handle_osd_op_reply((MOSDOpReply*)m); - break; - - case MSG_OSD_MAP: - objecter->handle_osd_map((class MOSDMap*)m); - break; - - // client - case MSG_MDS_MAP: - handle_mds_map((MMDSMap*)m); - break; - - case MSG_CLIENT_REPLY: - handle_client_reply((MClientReply*)m); - break; - - case MSG_CLIENT_FILECAPS: - handle_file_caps((MClientFileCaps*)m); - break; - - case MSG_CLIENT_MOUNTACK: - handle_mount_ack((MClientMountAck*)m); - break; - case MSG_CLIENT_UNMOUNT: - handle_unmount_ack(m); - break; - - - default: - cout << "dispatch doesn't recognize message type " << m->get_type() << endl; - assert(0); // fail loudly - break; - } - - // unmounting? 
- if (unmounting) { - dout(10) << "unmounting: trim pass, size was " << lru.lru_get_size() - << "+" << inode_map.size() << endl; - trim_cache(); - if (lru.lru_get_size() == 0 && inode_map.empty()) { - dout(10) << "unmounting: trim pass, cache now empty, waking unmount()" << endl; - mount_cond.Signal(); - } else { - dout(10) << "unmounting: trim pass, size still " << lru.lru_get_size() - << "+" << inode_map.size() << endl; - dump_cache(); - } - } - - client_lock.Unlock(); -} - - -void Client::handle_mds_map(MMDSMap* m) -{ - if (mdsmap == 0) - mdsmap = new MDSMap; - - if (whoami < 0) { - whoami = m->get_dest().num(); - dout(1) << "handle_mds_map i am now " << m->get_dest() << endl; - messenger->reset_myname(m->get_dest()); - } - - dout(1) << "handle_mds_map epoch " << m->get_epoch() << endl; - mdsmap->decode(m->get_encoded()); - - delete m; - - // note our inc # - objecter->set_client_incarnation(0); // fixme - - mount_cond.Signal(); // mount might be waiting for this. -} - - -/**** - * caps - */ - - -class C_Client_ImplementedCaps : public Context { - Client *client; - MClientFileCaps *msg; - Inode *in; -public: - C_Client_ImplementedCaps(Client *c, MClientFileCaps *m, Inode *i) : client(c), msg(m), in(i) {} - void finish(int r) { - client->implemented_caps(msg,in); - } -}; - -/** handle_file_caps - * handle caps update from mds. including mds to mds caps transitions. - * do not block. - */ -void Client::handle_file_caps(MClientFileCaps *m) -{ - int mds = m->get_source().num(); - Inode *in = 0; - if (inode_map.count(m->get_ino())) in = inode_map[ m->get_ino() ]; - - m->clear_payload(); // for if/when we send back to MDS - - // reap? - if (m->get_special() == MClientFileCaps::FILECAP_REAP) { - int other = m->get_mds(); - - if (in && in->stale_caps.count(other)) { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " reap on mds" << other << endl; - - // fresh from new mds? 
- if (!in->caps.count(mds)) { - if (in->caps.empty()) in->get(); - in->caps[mds].seq = m->get_seq(); - in->caps[mds].caps = m->get_caps(); - } - - assert(in->stale_caps.count(other)); - in->stale_caps.erase(other); - if (in->stale_caps.empty()) put_inode(in); // note: this will never delete *in - - // fall-thru! - } else { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " premature (!!) reap on mds" << other << endl; - // delay! - cap_reap_queue[in->ino()][other] = m; - return; - } - } - - assert(in); - - // stale? - if (m->get_special() == MClientFileCaps::FILECAP_STALE) { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " seq " << m->get_seq() << " from mds" << mds << " now stale" << endl; - - // move to stale list - assert(in->caps.count(mds)); - if (in->stale_caps.empty()) in->get(); - in->stale_caps[mds] = in->caps[mds]; - - assert(in->caps.count(mds)); - in->caps.erase(mds); - if (in->caps.empty()) in->put(); - - // delayed reap? - if (cap_reap_queue.count(in->ino()) && - cap_reap_queue[in->ino()].count(mds)) { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " delayed reap on mds" << m->get_mds() << endl; - - // process delayed reap - handle_file_caps( cap_reap_queue[in->ino()][mds] ); - - cap_reap_queue[in->ino()].erase(mds); - if (cap_reap_queue[in->ino()].empty()) - cap_reap_queue.erase(in->ino()); - } - delete m; - return; - } - - // release? 
- if (m->get_special() == MClientFileCaps::FILECAP_RELEASE) { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " release" << endl; - assert(in->caps.count(mds)); - in->caps.erase(mds); - for (map::iterator p = in->caps.begin(); - p != in->caps.end(); - p++) - dout(20) << " left cap " << p->first << " " - << cap_string(p->second.caps) << " " - << p->second.seq << endl; - for (map::iterator p = in->stale_caps.begin(); - p != in->stale_caps.end(); - p++) - dout(20) << " left stale cap " << p->first << " " - << cap_string(p->second.caps) << " " - << p->second.seq << endl; - - if (in->caps.empty()) { - //dout(0) << "did put_inode" << endl; - put_inode(in); - } else { - //dout(0) << "didn't put_inode" << endl; - } - delete m; - return; - } - - - // don't want? - if (in->file_caps_wanted() == 0) { - dout(5) << "handle_file_caps on ino " << m->get_ino() - << " seq " << m->get_seq() - << " " << cap_string(m->get_caps()) - << ", which we don't want caps for, releasing." << endl; - m->set_caps(0); - m->set_wanted(0); - messenger->send_message(m, m->get_source_inst(), m->get_source_port()); - return; - } - - assert(in->caps.count(mds)); - - // update per-mds caps - const int old_caps = in->caps[mds].caps; - const int new_caps = m->get_caps(); - in->caps[mds].caps = new_caps; - in->caps[mds].seq = m->get_seq(); - dout(5) << "handle_file_caps on in " << m->get_ino() - << " mds" << mds << " seq " << m->get_seq() - << " caps now " << cap_string(new_caps) - << " was " << cap_string(old_caps) << endl; - - // did file size decrease? - if ((old_caps & new_caps & CAP_FILE_RDCACHE) && - in->inode.size > m->get_inode().size) { - dout(10) << "**** file size decreased from " << in->inode.size << " to " << m->get_inode().size << " FIXME" << endl; - // must have been a truncate() by someone. - // trim the buffer cache - // ***** fixme write me **** - - in->file_wr_size = m->get_inode().size; //?? 
- } - - // update inode - in->inode = m->get_inode(); // might have updated size... FIXME this is overkill! - - // preserve our (possibly newer) file size, mtime - if (in->file_wr_size > in->inode.size) - m->get_inode().size = in->inode.size = in->file_wr_size; - if (in->file_wr_mtime > in->inode.mtime) - m->get_inode().mtime = in->inode.mtime = in->file_wr_mtime; - - if (g_conf.client_oc) { - // caching on, use FileCache. - Context *onimplement = 0; - if (old_caps & ~new_caps) { // this mds is revoking caps - if (in->fc.get_caps() & ~(in->file_caps())) // net revocation - onimplement = new C_Client_ImplementedCaps(this, m, in); - else { - implemented_caps(m, in); // ack now. - } - } - in->fc.set_caps(new_caps, onimplement); - } else { - // caching off. - - // wake up waiters? - if (new_caps & CAP_FILE_RD) { - for (list::iterator it = in->waitfor_read.begin(); - it != in->waitfor_read.end(); - it++) { - dout(5) << "signaling read waiter " << *it << endl; - (*it)->Signal(); - } - in->waitfor_read.clear(); - } - if (new_caps & CAP_FILE_WR) { - for (list::iterator it = in->waitfor_write.begin(); - it != in->waitfor_write.end(); - it++) { - dout(5) << "signaling write waiter " << *it << endl; - (*it)->Signal(); - } - in->waitfor_write.clear(); - } - if (new_caps & CAP_FILE_LAZYIO) { - for (list::iterator it = in->waitfor_lazy.begin(); - it != in->waitfor_lazy.end(); - it++) { - dout(5) << "signaling lazy waiter " << *it << endl; - (*it)->Signal(); - } - in->waitfor_lazy.clear(); - } - - // ack? 
- if (old_caps & ~new_caps) { - if (in->sync_writes) { - // wait for sync writes to finish - dout(5) << "sync writes in progress, will ack on finish" << endl; - in->waitfor_no_write.push_back(new C_Client_ImplementedCaps(this, m, in)); - } else { - // ok now - implemented_caps(m, in); - } - } else { - // discard - delete m; - } - } -} - -void Client::implemented_caps(MClientFileCaps *m, Inode *in) -{ - dout(5) << "implemented_caps " << cap_string(m->get_caps()) - << ", acking to " << m->get_source() << endl; - - if (in->file_caps() == 0) { - in->file_wr_mtime = 0; - in->file_wr_size = 0; - } - - messenger->send_message(m, m->get_source_inst(), m->get_source_port()); -} - - -void Client::release_caps(Inode *in, - int retain) -{ - dout(5) << "releasing caps on ino " << in->inode.ino << dec - << " had " << cap_string(in->file_caps()) - << " retaining " << cap_string(retain) - << " want " << cap_string(in->file_caps_wanted()) - << endl; - - for (map::iterator it = in->caps.begin(); - it != in->caps.end(); - it++) { - //if (it->second.caps & ~retain) { - if (1) { - // release (some of?) these caps - it->second.caps = retain & it->second.caps; - // note: tell mds _full_ wanted; it'll filter/behave based on what it is allowed to do - MClientFileCaps *m = new MClientFileCaps(in->inode, - it->second.seq, - it->second.caps, - in->file_caps_wanted()); - messenger->send_message(m, mdsmap->get_inst(it->first), MDS_PORT_LOCKER); - } - } - - if (in->file_caps() == 0) { - in->file_wr_mtime = 0; - in->file_wr_size = 0; - } -} - -void Client::update_caps_wanted(Inode *in) -{ - dout(5) << "updating caps wanted on ino " << in->inode.ino - << " to " << cap_string(in->file_caps_wanted()) - << endl; - - // FIXME: pick a single mds and let the others off the hook.. 
- for (map::iterator it = in->caps.begin(); - it != in->caps.end(); - it++) { - MClientFileCaps *m = new MClientFileCaps(in->inode, - it->second.seq, - it->second.caps, - in->file_caps_wanted()); - messenger->send_message(m, - mdsmap->get_inst(it->first), MDS_PORT_LOCKER); - } -} - - - -// ------------------- -// fs ops - -int Client::mount() -{ - client_lock.Lock(); - - assert(!mounted); // caller is confused? - - // FIXME mds map update race with mount. - - dout(2) << "sending boot msg to monitor" << endl; - if (mdsmap) - delete mdsmap; - int mon = monmap->pick_mon(); - messenger->send_message(new MClientBoot(), - monmap->get_inst(mon)); - - while (!mdsmap) - mount_cond.Wait(client_lock); - - dout(2) << "mounting" << endl; - MClientMount *m = new MClientMount(); - - int who = 0; // mdsmap->get_root(); // mount at root, for now - messenger->send_message(m, - mdsmap->get_inst(who), - MDS_PORT_SERVER); - - while (!mounted) - mount_cond.Wait(client_lock); - - client_lock.Unlock(); - - /* - dout(3) << "op: // client trace data structs" << endl; - dout(3) << "op: struct stat st;" << endl; - dout(3) << "op: struct utimbuf utim;" << endl; - dout(3) << "op: int readlinkbuf_len = 1000;" << endl; - dout(3) << "op: char readlinkbuf[readlinkbuf_len];" << endl; - dout(3) << "op: map dir_contents;" << endl; - dout(3) << "op: map open_files;" << endl; - dout(3) << "op: fh_t fh;" << endl; - */ - return 0; -} - -void Client::handle_mount_ack(MClientMountAck *m) -{ - // mdsmap! - if (!mdsmap) mdsmap = new MDSMap; - mdsmap->decode(m->get_mds_map_state()); - - // we got osdmap! - osdmap->decode(m->get_osd_map_state()); - - dout(2) << "mounted" << endl; - mounted = true; - mount_cond.Signal(); - - delete m; -} - - -int Client::unmount() -{ - client_lock.Lock(); - - assert(mounted); // caller is confused? - - dout(2) << "unmounting" << endl; - unmounting = true; - - // NOTE: i'm assuming all caches are already flushing (because all files are closed). 
- assert(fh_map.empty()); - - // empty lru cache - lru.lru_set_max(0); - trim_cache(); - - if (g_conf.client_oc) { - // release any/all caps - for (hash_map::iterator p = inode_map.begin(); - p != inode_map.end(); - p++) { - Inode *in = p->second; - if (!in->caps.empty()) { - in->fc.release_clean(); - if (in->fc.is_dirty()) { - dout(10) << "unmount residual caps on " << in->ino() << ", flushing" << endl; - in->fc.empty(new C_Client_CloseRelease(this, in)); - } else { - dout(10) << "unmount residual caps on " << in->ino() << ", releasing" << endl; - release_caps(in); - } - } - } - } - - while (lru.lru_get_size() > 0 || - !inode_map.empty()) { - dout(2) << "cache still has " << lru.lru_get_size() - << "+" << inode_map.size() << " items" - << ", waiting (presumably for safe or for caps to be released?)" - << endl; - dump_cache(); - mount_cond.Wait(client_lock); - } - assert(lru.lru_get_size() == 0); - assert(inode_map.empty()); - - // unsafe writes - if (!g_conf.client_oc) { - while (unsafe_sync_write > 0) { - dout(0) << unsafe_sync_write << " unsafe_sync_writes, waiting" - << endl; - mount_cond.Wait(client_lock); - } - } - - // send unmount! 
- Message *req = new MGenericMessage(MSG_CLIENT_UNMOUNT); - messenger->send_message(req, mdsmap->get_inst(0), MDS_PORT_SERVER); - - while (mounted) - mount_cond.Wait(client_lock); - - dout(2) << "unmounted" << endl; - - client_lock.Unlock(); - return 0; -} - -void Client::handle_unmount_ack(Message* m) -{ - dout(1) << "got unmount ack" << endl; - mounted = false; - mount_cond.Signal(); - delete m; -} - - - -// namespace ops - -int Client::link(const char *existing, const char *newname) -{ - client_lock.Lock(); - dout(3) << "op: client->link(\"" << existing << "\", \"" << newname << "\");" << endl; - tout << "link" << endl; - tout << existing << endl; - tout << newname << endl; - - - // main path arg is new link name - // sarg is target (existing file) - - - MClientRequest *req = new MClientRequest(MDS_OP_LINK, whoami); - req->set_path(newname); - req->set_sarg(existing); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req, true); - int res = reply->get_result(); - - insert_trace(reply); - delete reply; - dout(10) << "link result is " << res << endl; - - trim_cache(); - client_lock.Unlock(); - return res; -} - - -int Client::unlink(const char *relpath) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->unlink\(\"" << path << "\");" << endl; - tout << "unlink" << endl; - tout << path << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_UNLINK, whoami); - req->set_path(path); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? 
- - MClientReply *reply = make_request(req, true); - int res = reply->get_result(); - if (res == 0) { - // remove from local cache - filepath fp(path); - Dentry *dn = lookup(fp); - if (dn) { - assert(dn->inode); - unlink(dn); - } - } - insert_trace(reply); - delete reply; - dout(10) << "unlink result is " << res << endl; - - trim_cache(); - client_lock.Unlock(); - return res; -} - -int Client::rename(const char *relfrom, const char *relto) -{ - client_lock.Lock(); - - string absfrom; - mkabspath(relfrom, absfrom); - const char *from = absfrom.c_str(); - string absto; - mkabspath(relto, absto); - const char *to = absto.c_str(); - - dout(3) << "op: client->rename(\"" << from << "\", \"" << to << "\");" << endl; - tout << "rename" << endl; - tout << from << endl; - tout << to << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_RENAME, whoami); - req->set_path(from); - req->set_sarg(to); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? - - MClientReply *reply = make_request(req, true); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - dout(10) << "rename result is " << res << endl; - - trim_cache(); - client_lock.Unlock(); - return res; -} - -// dirs - -int Client::mkdir(const char *relpath, mode_t mode) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->mkdir(\"" << path << "\", " << mode << ");" << endl; - tout << "mkdir" << endl; - tout << path << endl; - tout << mode << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_MKDIR, whoami); - req->set_path(path); - req->set_iarg( (int)mode ); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? 
- - MClientReply *reply = make_request(req, true); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - dout(10) << "mkdir result is " << res << endl; - - trim_cache(); - client_lock.Unlock(); - return res; -} - -int Client::rmdir(const char *relpath) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->rmdir(\"" << path << "\");" << endl; - tout << "rmdir" << endl; - tout << path << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_RMDIR, whoami); - req->set_path(path); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? - - MClientReply *reply = make_request(req, true); - int res = reply->get_result(); - if (res == 0) { - // remove from local cache - filepath fp(path); - Dentry *dn = lookup(fp); - if (dn) { - if (dn->inode->dir && dn->inode->dir->is_empty()) - close_dir(dn->inode->dir); // FIXME: maybe i shoudl proactively hose the whole subtree from cache? 
- unlink(dn); - } - } - insert_trace(reply); - delete reply; - dout(10) << "rmdir result is " << res << endl; - - trim_cache(); - client_lock.Unlock(); - return res; -} - -// symlinks - -int Client::symlink(const char *reltarget, const char *rellink) -{ - client_lock.Lock(); - - string abstarget; - mkabspath(reltarget, abstarget); - const char *target = abstarget.c_str(); - string abslink; - mkabspath(rellink, abslink); - const char *link = abslink.c_str(); - - dout(3) << "op: client->symlink(\"" << target << "\", \"" << link << "\");" << endl; - tout << "symlink" << endl; - tout << target << endl; - tout << link << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_SYMLINK, whoami); - req->set_path(link); - req->set_sarg(target); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? - - MClientReply *reply = make_request(req, true); - int res = reply->get_result(); - insert_trace(reply); //FIXME assuming trace of link, not of target - delete reply; - dout(10) << "symlink result is " << res << endl; - - trim_cache(); - client_lock.Unlock(); - return res; -} - -int Client::readlink(const char *relpath, char *buf, off_t size) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->readlink(\"" << path << "\", readlinkbuf, readlinkbuf_len);" << endl; - tout << "readlink" << endl; - tout << path << endl; - client_lock.Unlock(); - - // stat first (FIXME, PERF access cache directly) **** - struct stat stbuf; - int r = this->lstat(path, &stbuf); - if (r != 0) return r; - - client_lock.Lock(); - - // pull symlink content from cache - Inode *in = inode_map[stbuf.st_ino]; - assert(in); // i just did a stat - - // copy into buf (at most size bytes) - unsigned res = in->symlink->length(); - if (res > size) res = size; - memcpy(buf, in->symlink->c_str(), res); - - trim_cache(); - 
client_lock.Unlock(); - return res; // return length in bytes (to mimic the system call) -} - - - -// inode stuff - -int Client::_lstat(const char *path, int mask, Inode **in) -{ - MClientRequest *req = 0; - filepath fpath(path); - - // check whether cache content is fresh enough - int res = 0; - - Dentry *dn = lookup(fpath); - inode_t inode; - time_t now = time(NULL); - if (dn && - now <= dn->inode->valid_until && - ((dn->inode->inode.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT)) { - inode = dn->inode->inode; - dout(10) << "lstat cache hit w/ sufficient inode.mask, valid until " << dn->inode->valid_until << endl; - - if (g_conf.client_cache_stat_ttl == 0) - dn->inode->valid_until = 0; // only one stat allowed after each readdir - - *in = dn->inode; - } else { - // FIXME where does FUSE maintain user information - //struct fuse_context *fc = fuse_get_context(); - //req->set_caller_uid(fc->uid); - //req->set_caller_gid(fc->gid); - - req = new MClientRequest(MDS_OP_LSTAT, whoami); - req->set_iarg(mask); - req->set_path(fpath); - - MClientReply *reply = make_request(req); - res = reply->get_result(); - dout(10) << "lstat res is " << res << endl; - if (res == 0) { - //Transfer information from reply to stbuf - inode = reply->get_inode(); - - //Update metadata cache - *in = insert_trace(reply); - } - - delete reply; - - if (res != 0) - *in = 0; // not a success. - } - - return res; -} - - -void Client::fill_stat(inode_t& inode, struct stat *st) -{ - memset(st, 0, sizeof(struct stat)); - st->st_ino = inode.ino; - st->st_mode = inode.mode; - st->st_nlink = inode.nlink; - st->st_uid = inode.uid; - st->st_gid = inode.gid; - st->st_ctime = inode.ctime; - st->st_atime = inode.atime; - st->st_mtime = inode.mtime; - st->st_size = inode.size; - st->st_blocks = inode.size ? 
((inode.size - 1) / 4096 + 1):0; - st->st_blksize = 4096; -} - -void Client::fill_statlite(inode_t& inode, struct statlite *st) -{ - memset(st, 0, sizeof(struct stat)); - st->st_ino = inode.ino; - st->st_mode = inode.mode; - st->st_nlink = inode.nlink; - st->st_uid = inode.uid; - st->st_gid = inode.gid; -#ifndef DARWIN - // FIXME what's going on here with darwin? - st->st_ctime = inode.ctime; - st->st_atime = inode.atime; - st->st_mtime = inode.mtime; -#endif - st->st_size = inode.size; - st->st_blocks = inode.size ? ((inode.size - 1) / 4096 + 1):0; - st->st_blksize = 4096; - - /* - S_REQUIREBLKSIZE(st->st_litemask); - if (inode.mask & INODE_MASK_BASE) S_REQUIRECTIME(st->st_litemask); - if (inode.mask & INODE_MASK_SIZE) { - S_REQUIRESIZE(st->st_litemask); - S_REQUIREBLOCKS(st->st_litemask); - } - if (inode.mask & INODE_MASK_MTIME) S_REQUIREMTIME(st->st_litemask); - if (inode.mask & INODE_MASK_ATIME) S_REQUIREATIME(st->st_litemask); - */ -} - - -int Client::lstat(const char *relpath, struct stat *stbuf) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->lstat(\"" << path << "\", &st);" << endl; - tout << "lstat" << endl; - tout << path << endl; - - Inode *in = 0; - - int res = _lstat(path, INODE_MASK_ALL_STAT, &in); - if (res == 0) { - assert(in); - fill_stat(in->inode,stbuf); - dout(10) << "stat sez size = " << in->inode.size << " ino = " << stbuf->st_ino << endl; - } - - trim_cache(); - client_lock.Unlock(); - return res; -} - - -int Client::lstatlite(const char *relpath, struct statlite *stl) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->lstatlite(\"" << path << "\", &st);" << endl; - tout << "lstatlite" << endl; - tout << path << endl; - - // make mask - int mask = INODE_MASK_BASE | INODE_MASK_PERM; - if (S_ISVALIDSIZE(stl->st_litemask) || - S_ISVALIDBLOCKS(stl->st_litemask)) 
mask |= INODE_MASK_SIZE; - if (S_ISVALIDMTIME(stl->st_litemask)) mask |= INODE_MASK_MTIME; - if (S_ISVALIDATIME(stl->st_litemask)) mask |= INODE_MASK_ATIME; - - Inode *in = 0; - int res = _lstat(path, mask, &in); - - if (res == 0) { - fill_statlite(in->inode,stl); - dout(10) << "stat sez size = " << in->inode.size << " ino = " << in->inode.ino << endl; - } - - trim_cache(); - client_lock.Unlock(); - return res; -} - - - -int Client::chmod(const char *relpath, mode_t mode) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->chmod(\"" << path << "\", " << mode << ");" << endl; - tout << "chmod" << endl; - tout << path << endl; - tout << mode << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_CHMOD, whoami); - req->set_path(path); - req->set_iarg( (int)mode ); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req, true); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - dout(10) << "chmod result is " << res << endl; - - trim_cache(); - client_lock.Unlock(); - return res; -} - -int Client::chown(const char *relpath, uid_t uid, gid_t gid) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->chown(\"" << path << "\", " << uid << ", " << gid << ");" << endl; - tout << "chown" << endl; - tout << path << endl; - tout << uid << endl; - tout << gid << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_CHOWN, whoami); - req->set_path(path); - req->set_iarg( (int)uid ); - req->set_iarg2( (int)gid ); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? 
- - MClientReply *reply = make_request(req, true); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - dout(10) << "chown result is " << res << endl; - - trim_cache(); - client_lock.Unlock(); - return res; -} - -int Client::utime(const char *relpath, struct utimbuf *buf) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: utim.actime = " << buf->actime << "; utim.modtime = " << buf->modtime << ";" << endl; - dout(3) << "op: client->utime(\"" << path << "\", &utim);" << endl; - tout << "utime" << endl; - tout << path << endl; - tout << buf->actime << endl; - tout << buf->modtime << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_UTIME, whoami); - req->set_path(path); - req->set_targ( buf->modtime ); - req->set_targ2( buf->actime ); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? - - MClientReply *reply = make_request(req, true); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - dout(10) << "utime result is " << res << endl; - - trim_cache(); - client_lock.Unlock(); - return res; -} - - - -int Client::mknod(const char *relpath, mode_t mode) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->mknod(\"" << path << "\", " << mode << ");" << endl; - tout << "mknod" << endl; - tout << path << endl; - tout << mode << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_MKNOD, whoami); - req->set_path(path); - req->set_iarg( mode ); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? 
- - MClientReply *reply = make_request(req, true); - int res = reply->get_result(); - insert_trace(reply); - - dout(10) << "mknod result is " << res << endl; - - delete reply; - - trim_cache(); - client_lock.Unlock(); - return res; -} - - - - -//readdir usually include inode info for each entry except of locked entries - -// -// getdir - -// fyi: typedef int (*dirfillerfunc_t) (void *handle, const char *name, int type, inodeno_t ino); - -int Client::getdir(const char *relpath, map& contents) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->getdir(\"" << path << "\", dir_contents);" << endl; - tout << "getdir" << endl; - tout << path << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_READDIR, whoami); - req->set_path(path); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? - - MClientReply *reply = make_request(req, true); - int res = reply->get_result(); - insert_trace(reply); - - if (res == 0) { - - // dir contents to cache! - inodeno_t ino = reply->get_ino(); - Inode *diri = inode_map[ ino ]; - assert(diri); - assert(diri->inode.mode & INODE_MODE_DIR); - - // add . and ..? - string dot("."); - contents[dot] = diri->inode; - if (diri != root) { - string dotdot(".."); - contents[dotdot] = diri->dn->dir->parent_inode->inode; - } - - if (!reply->get_dir_in().empty()) { - // only open dir if we're actually adding stuff to it! 
- Dir *dir = diri->open_dir(); - assert(dir); - time_t now = time(NULL); - - list::const_iterator pdn = reply->get_dir_dn().begin(); - for (list::const_iterator pin = reply->get_dir_in().begin(); - pin != reply->get_dir_in().end(); - ++pin, ++pdn) { - if (*pdn == ".") - continue; - - // count entries - res++; - - // put in cache - Inode *in = this->insert_inode(dir, *pin, *pdn); - - if (g_conf.client_cache_stat_ttl) - in->valid_until = now + g_conf.client_cache_stat_ttl; - else if (g_conf.client_cache_readdir_ttl) - in->valid_until = now + g_conf.client_cache_readdir_ttl; - - // contents to caller too! - contents[*pdn] = in->inode; - } - if (dir->is_empty()) - close_dir(dir); - } - - - // FIXME: remove items in cache that weren't in my readdir? - // *** - } - - delete reply; //fix thing above first - - client_lock.Unlock(); - return res; -} - - -/** POSIX stubs **/ - -DIR *Client::opendir(const char *name) -{ - DirResult *d = new DirResult; - d->size = getdir(name, d->contents); - d->p = d->contents.begin(); - d->off = 0; - return (DIR*)d; -} - -int Client::closedir(DIR *dir) -{ - DirResult *d = (DirResult*)dir; - delete d; - return 0; -} - -//struct dirent { -// ino_t d_ino; /* inode number */ -// off_t d_off; /* offset to the next dirent */ -// unsigned short d_reclen; /* length of this record */ -// unsigned char d_type; /* type of file */ -// char d_name[256]; /* filename */ -//}; - -struct dirent *Client::readdir(DIR *dirp) -{ - DirResult *d = (DirResult*)dirp; - - // end of dir? 
- if (d->p == d->contents.end()) - return 0; - - // fill the dirent - d->dp.d_dirent.d_ino = d->p->second.ino; -#ifndef __CYGWIN__ -#ifndef DARWIN - if (d->p->second.is_symlink()) - d->dp.d_dirent.d_type = DT_LNK; - else if (d->p->second.is_dir()) - d->dp.d_dirent.d_type = DT_DIR; - else if (d->p->second.is_file()) - d->dp.d_dirent.d_type = DT_REG; - else - d->dp.d_dirent.d_type = DT_UNKNOWN; - - d->dp.d_dirent.d_off = d->off; - d->dp.d_dirent.d_reclen = 1; // all records are length 1 (wrt offset, seekdir, telldir, etc.) -#endif // DARWIN -#endif - - strncpy(d->dp.d_dirent.d_name, d->p->first.c_str(), 256); - - // move up - ++d->off; - ++d->p; - - return &d->dp.d_dirent; -} - -void Client::rewinddir(DIR *dirp) -{ - DirResult *d = (DirResult*)dirp; - d->p = d->contents.begin(); - d->off = 0; -} - -off_t Client::telldir(DIR *dirp) -{ - DirResult *d = (DirResult*)dirp; - return d->off; -} - -void Client::seekdir(DIR *dirp, off_t offset) -{ - DirResult *d = (DirResult*)dirp; - - d->p = d->contents.begin(); - d->off = 0; - - if (offset >= d->size) offset = d->size-1; - while (offset > 0) { - ++d->p; - ++d->off; - --offset; - } -} - -struct dirent_plus *Client::readdirplus(DIR *dirp) -{ - DirResult *d = (DirResult*)dirp; - - // end of dir? - if (d->p == d->contents.end()) - return 0; - - // fill the dirent - d->dp.d_dirent.d_ino = d->p->second.ino; -#ifndef __CYGWIN__ -#ifndef DARWIN - if (d->p->second.is_symlink()) - d->dp.d_dirent.d_type = DT_LNK; - else if (d->p->second.is_dir()) - d->dp.d_dirent.d_type = DT_DIR; - else if (d->p->second.is_file()) - d->dp.d_dirent.d_type = DT_REG; - else - d->dp.d_dirent.d_type = DT_UNKNOWN; - - d->dp.d_dirent.d_off = d->off; - d->dp.d_dirent.d_reclen = 1; // all records are length 1 (wrt offset, seekdir, telldir, etc.) 
-#endif // DARWIN -#endif - - strncpy(d->dp.d_dirent.d_name, d->p->first.c_str(), 256); - - // plus - if ((d->p->second.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT) { - // have it - fill_stat(d->p->second, &d->dp.d_stat); - d->dp.d_stat_err = 0; - } else { - // don't have it, stat it - string path = d->path; - path += "/"; - path += d->p->first; - d->dp.d_stat_err = lstat(path.c_str(), &d->dp.d_stat); - } - - // move up - ++d->off; - ++d->p; - - return &d->dp; -} - -/* -struct dirent_lite *Client::readdirlite(DIR *dirp) -{ - DirResult *d = (DirResult*)dirp; - - // end of dir? - if (d->p == d->contents.end()) - return 0; - - // fill the dirent - d->dp.d_dirent.d_ino = d->p->second.ino; - if (d->p->second.is_symlink()) - d->dp.d_dirent.d_type = DT_LNK; - else if (d->p->second.is_dir()) - d->dp.d_dirent.d_type = DT_DIR; - else if (d->p->second.is_file()) - d->dp.d_dirent.d_type = DT_REG; - else - d->dp.d_dirent.d_type = DT_UNKNOWN; - strncpy(d->dp.d_dirent.d_name, d->p->first.c_str(), 256); - - d->dp.d_dirent.d_off = d->off; - d->dp.d_dirent.d_reclen = 1; // all records are length 1 (wrt offset, seekdir, telldir, etc.) 
- - // plus - if ((d->p->second.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT) { - // have it - fill_statlite(d->p->second,d->dp.d_stat); - d->dp.d_stat_err = 0; - } else { - // don't have it, stat it - string path = p->path; - path += "/"; - path += p->first; - d->dp.d_statlite - d->dp.d_stat_err = lstatlite(path.c_str(), &d->dp.d_statlite); - } - - // move up - ++d->off; - ++d->p; - - return &d->dp; -} -*/ - - - - - - -/****** file i/o **********/ - -int Client::open(const char *relpath, int flags) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: fh = client->open(\"" << path << "\", " << flags << ");" << endl; - tout << "open" << endl; - tout << path << endl; - tout << flags << endl; - - int cmode = 0; - bool tryauth = false; - if (flags & O_LAZY) - cmode = FILE_MODE_LAZY; - else if (flags & O_WRONLY) { - cmode = FILE_MODE_W; - tryauth = true; - } else if (flags & O_RDWR) { - cmode = FILE_MODE_RW; - tryauth = true; - } else if (flags & O_APPEND) { - cmode = FILE_MODE_W; - tryauth = true; - } else - cmode = FILE_MODE_R; - - // go - MClientRequest *req = new MClientRequest(MDS_OP_OPEN, whoami); - req->set_path(path); - req->set_iarg(flags); - req->set_iarg2(cmode); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req, tryauth); // try auth if writer - - assert(reply); - dout(3) << "op: open_files[" << reply->get_result() << "] = fh; // fh = " << reply->get_result() << endl; - tout << reply->get_result() << endl; - - insert_trace(reply); - int result = reply->get_result(); - - // success? 
- fh_t fh = 0; - if (result >= 0) { - // yay - Fh *f = new Fh; - f->mode = cmode; - - // inode - f->inode = inode_map[reply->get_ino()]; - assert(f->inode); - f->inode->get(); - - if (cmode & FILE_MODE_R) f->inode->num_open_rd++; - if (cmode & FILE_MODE_W) f->inode->num_open_wr++; - if (cmode & FILE_MODE_LAZY) f->inode->num_open_lazy++; - - // caps included? - int mds = reply->get_source().num(); - - if (f->inode->caps.empty()) {// first caps? - dout(7) << " first caps on " << f->inode->inode.ino << endl; - f->inode->get(); - } - - int new_caps = reply->get_file_caps(); - - assert(reply->get_file_caps_seq() >= f->inode->caps[mds].seq); - if (reply->get_file_caps_seq() > f->inode->caps[mds].seq) { - dout(7) << "open got caps " << cap_string(new_caps) - << " for " << f->inode->ino() - << " seq " << reply->get_file_caps_seq() - << " from mds" << mds - << endl; - - int old_caps = f->inode->caps[mds].caps; - f->inode->caps[mds].caps = new_caps; - f->inode->caps[mds].seq = reply->get_file_caps_seq(); - - // we shouldn't ever lose caps at this point. - // actually, we might...? 
- assert((old_caps & ~f->inode->caps[mds].caps) == 0); - - if (g_conf.client_oc) - f->inode->fc.set_caps(new_caps); - - } else { - dout(7) << "open got SAME caps " << cap_string(new_caps) - << " for " << f->inode->ino() - << " seq " << reply->get_file_caps_seq() - << " from mds" << mds - << endl; - } - - // put in map - result = fh = get_fh(); - assert(fh_map.count(fh) == 0); - fh_map[fh] = f; - - dout(3) << "open success, fh is " << fh << " combined caps " << cap_string(f->inode->file_caps()) << endl; - } else { - dout(0) << "open failure result " << result << endl; - } - - delete reply; - - trim_cache(); - client_lock.Unlock(); - - return result; -} - - - - - -void Client::close_release(Inode *in) -{ - dout(10) << "close_release on " << in->ino() << endl; - dout(10) << " wr " << in->num_open_wr << " rd " << in->num_open_rd - << " dirty " << in->fc.is_dirty() << " cached " << in->fc.is_cached() << endl; - - if (!in->num_open_rd) - in->fc.release_clean(); - - int retain = 0; - if (in->num_open_wr || in->fc.is_dirty()) retain |= CAP_FILE_WR | CAP_FILE_WRBUFFER | CAP_FILE_WREXTEND; - if (in->num_open_rd || in->fc.is_cached()) retain |= CAP_FILE_RD | CAP_FILE_RDCACHE; - - release_caps(in, retain); // release caps now. -} - -void Client::close_safe(Inode *in) -{ - dout(10) << "close_safe on " << in->ino() << endl; - put_inode(in); - if (unmounting) - mount_cond.Signal(); -} - -int Client::close(fh_t fh) -{ - client_lock.Lock(); - dout(3) << "op: client->close(open_files[ " << fh << " ]);" << endl; - dout(3) << "op: open_files.erase( " << fh << " );" << endl; - tout << "close" << endl; - tout << fh << endl; - - // get Fh, Inode - assert(fh_map.count(fh)); - Fh *f = fh_map[fh]; - Inode *in = f->inode; - - // update inode rd/wr counts - int before = in->file_caps_wanted(); - if (f->mode & FILE_MODE_R) - in->num_open_rd--; - if (f->mode & FILE_MODE_W) - in->num_open_wr--; - int after = in->file_caps_wanted(); - - // does this change what caps we want? 
- if (before != after && after) - update_caps_wanted(in); - - // hose fh - fh_map.erase(fh); - delete f; - - // release caps right away? - dout(10) << "num_open_rd " << in->num_open_rd << " num_open_wr " << in->num_open_wr << endl; - - if (g_conf.client_oc) { - // caching on. - if (in->num_open_rd == 0 && in->num_open_wr == 0) { - in->fc.empty(new C_Client_CloseRelease(this, in)); - } - else if (in->num_open_rd == 0) { - in->fc.release_clean(); - close_release(in); - } - else if (in->num_open_wr == 0) { - in->fc.flush_dirty(new C_Client_CloseRelease(this,in)); - } - - // pin until safe? - if (in->num_open_wr == 0 && !in->fc.all_safe()) { - dout(10) << "pinning ino " << in->ino() << " until safe" << endl; - in->get(); - in->fc.add_safe_waiter(new C_Client_CloseSafe(this, in)); - } - } else { - // caching off. - if (in->num_open_rd == 0 && in->num_open_wr == 0) { - dout(10) << " releasing caps on " << in->ino() << endl; - release_caps(in); // release caps now. - } - } - - put_inode( in ); - int result = 0; - - client_lock.Unlock(); - return result; -} - - - -// ------------ -// read, write - - -off_t Client::lseek(fh_t fh, off_t offset, int whence) -{ - client_lock.Lock(); - dout(3) << "op: client->lseek(" << fh << ", " << offset << ", " << whence << ");" << endl; - - assert(fh_map.count(fh)); - Fh *f = fh_map[fh]; - Inode *in = f->inode; - - switch (whence) { - case SEEK_SET: - f->pos = offset; - break; - - case SEEK_CUR: - f->pos += offset; - break; - - case SEEK_END: - f->pos = in->inode.size + offset; - break; - - default: - assert(0); - } - - off_t pos = f->pos; - client_lock.Unlock(); - - return pos; -} - - -// blocking osd interface - -int Client::read(fh_t fh, char *buf, off_t size, off_t offset) -{ - client_lock.Lock(); - - dout(3) << "op: client->read(" << fh << ", buf, " << size << ", " << offset << "); // that's " << offset << "~" << size << endl; - tout << "read" << endl; - tout << fh << endl; - tout << size << endl; - tout << offset << endl; - - 
assert(offset >= 0); - assert(fh_map.count(fh)); - Fh *f = fh_map[fh]; - Inode *in = f->inode; - - if (offset < 0) - offset = f->pos; - - bool lazy = f->mode == FILE_MODE_LAZY; - - // do we have read file cap? - while (!lazy && (in->file_caps() & CAP_FILE_RD) == 0) { - dout(7) << " don't have read cap, waiting" << endl; - Cond cond; - in->waitfor_read.push_back(&cond); - cond.Wait(client_lock); - } - // lazy cap? - while (lazy && (in->file_caps() & CAP_FILE_LAZYIO) == 0) { - dout(7) << " don't have lazy cap, waiting" << endl; - Cond cond; - in->waitfor_lazy.push_back(&cond); - cond.Wait(client_lock); - } - - // determine whether read range overlaps with file - // ...ONLY if we're doing async io - if (!lazy && (in->file_caps() & (CAP_FILE_WRBUFFER|CAP_FILE_RDCACHE))) { - // we're doing buffered i/o. make sure we're inside the file. - // we can trust size info bc we get accurate info when buffering/caching caps are issued. - dout(-10) << "file size: " << in->inode.size << endl; - if (offset > 0 && offset >= in->inode.size) { - client_lock.Unlock(); - return 0; - } - if (offset + size > (off_t)in->inode.size) - size = (off_t)in->inode.size - offset; - - if (size == 0) { - dout(-10) << "read is size=0, returning 0" << endl; - client_lock.Unlock(); - return 0; - } - } else { - // unbuffered, synchronous file i/o. - // or lazy. - // defer to OSDs for file bounds. - } - - bufferlist blist; // data will go here - int r = 0; - int rvalue = 0; - - if (g_conf.client_oc) { - // object cache ON - rvalue = r = in->fc.read(offset, size, blist, client_lock); // may block. - } else { - // object cache OFF -- legacy inconsistent way. - Cond cond; - bool done = false; - C_Cond *onfinish = new C_Cond(&cond, &done, &rvalue); - - r = filer->read(in->inode, offset, size, &blist, onfinish); - - assert(r >= 0); - - // wait! 
- while (!done) - cond.Wait(client_lock); - } - - // adjust fd pos - f->pos = offset+blist.length(); - - // copy data into caller's char* buf - blist.copy(0, blist.length(), buf); - - //dout(10) << "i read '" << blist.c_str() << "'" << endl; - dout(10) << "read rvalue " << rvalue << ", r " << r << endl; - - // done! - client_lock.Unlock(); - return rvalue; -} - - - -/* - * hack -- - * until we properly implement synchronous writes wrt buffer cache, - * make sure we delay shutdown until they're all safe on disk! - */ -class C_Client_HackUnsafe : public Context { - Client *cl; -public: - C_Client_HackUnsafe(Client *c) : cl(c) {} - void finish(int) { - cl->hack_sync_write_safe(); - } -}; - -void Client::hack_sync_write_safe() -{ - client_lock.Lock(); - assert(unsafe_sync_write > 0); - unsafe_sync_write--; - if (unsafe_sync_write == 0 && unmounting) { - dout(10) << "hack_sync_write_safe -- no more unsafe writes, unmount can proceed" << endl; - mount_cond.Signal(); - } - client_lock.Unlock(); -} - -int Client::write(fh_t fh, const char *buf, off_t size, off_t offset) -{ - client_lock.Lock(); - - //dout(7) << "write fh " << fh << " size " << size << " offset " << offset << endl; - dout(3) << "op: client->write(" << fh << ", buf, " << size << ", " << offset << ");" << endl; - tout << "write" << endl; - tout << fh << endl; - tout << size << endl; - tout << offset << endl; - - assert(offset >= 0); - assert(fh_map.count(fh)); - Fh *f = fh_map[fh]; - Inode *in = f->inode; - - if (offset < 0) - offset = f->pos; - - bool lazy = f->mode == FILE_MODE_LAZY; - - dout(10) << "cur file size is " << in->inode.size << " wr size " << in->file_wr_size << endl; - - // do we have write file cap? 
- while (!lazy && (in->file_caps() & CAP_FILE_WR) == 0) { - dout(7) << " don't have write cap, waiting" << endl; - Cond cond; - in->waitfor_write.push_back(&cond); - cond.Wait(client_lock); - } - while (lazy && (in->file_caps() & CAP_FILE_LAZYIO) == 0) { - dout(7) << " don't have lazy cap, waiting" << endl; - Cond cond; - in->waitfor_lazy.push_back(&cond); - cond.Wait(client_lock); - } - - // adjust fd pos - f->pos = offset+size; - - // time it. - utime_t start = g_clock.now(); - - // copy into fresh buffer (since our write may be resub, async) - bufferptr bp = buffer::copy(buf, size); - bufferlist blist; - blist.push_back( bp ); - - if (g_conf.client_oc) { // buffer cache ON? - assert(objectcacher); - - // write (this may block!) - in->fc.write(offset, size, blist, client_lock); - - } else { - // legacy, inconsistent synchronous write. - dout(7) << "synchronous write" << endl; - - // prepare write - Cond cond; - bool done = false; - C_Cond *onfinish = new C_Cond(&cond, &done); - C_Client_HackUnsafe *onsafe = new C_Client_HackUnsafe(this); - unsafe_sync_write++; - in->sync_writes++; - - dout(20) << " sync write start " << onfinish << endl; - - filer->write(in->inode, offset, size, blist, 0, - onfinish, onsafe - //, 1+((int)g_clock.now()) / 10 //f->pos // hack hack test osd revision snapshots - ); - - while (!done) { - cond.Wait(client_lock); - dout(20) << " sync write bump " << onfinish << endl; - } - - in->sync_writes--; - if (in->sync_writes == 0 && - !in->waitfor_no_write.empty()) { - for (list::iterator i = in->waitfor_no_write.begin(); - i != in->waitfor_no_write.end(); - i++) - (*i)->finish(0); - in->waitfor_no_write.clear(); - } - - dout(20) << " sync write done " << onfinish << endl; - } - - // time - utime_t lat = g_clock.now(); - lat -= start; - if (client_logger) { - client_logger->finc("wrlsum",(double)lat); - client_logger->inc("wrlnum"); - } - - // assume success for now. FIXME. - off_t totalwritten = size; - - // extend file? 
- if (totalwritten + offset > in->inode.size) { - in->inode.size = in->file_wr_size = totalwritten + offset; - dout(7) << "wrote to " << totalwritten+offset << ", extending file size" << endl; - } else { - dout(7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->inode.size << endl; - } - - // mtime - in->file_wr_mtime = in->inode.mtime = g_clock.gettime(); - - // ok! - client_lock.Unlock(); - return totalwritten; -} - - -int Client::truncate(const char *file, off_t size) -{ - client_lock.Lock(); - dout(3) << "op: client->truncate(\"" << file << "\", " << size << ");" << endl; - tout << "truncate" << endl; - tout << file << endl; - tout << size << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_TRUNCATE, whoami); - req->set_path(file); - req->set_sizearg( size ); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req, true); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - - dout(10) << " truncate result is " << res << endl; - - client_lock.Unlock(); - return res; -} - - -int Client::fsync(fh_t fh, bool syncdataonly) -{ - client_lock.Lock(); - dout(3) << "op: client->fsync(open_files[ " << fh << " ], " << syncdataonly << ");" << endl; - tout << "fsync" << endl; - tout << fh << endl; - tout << syncdataonly << endl; - - int r = 0; - - assert(fh_map.count(fh)); - Fh *f = fh_map[fh]; - Inode *in = f->inode; - - dout(3) << "fsync fh " << fh << " ino " << in->inode.ino << " syncdataonly " << syncdataonly << endl; - - // metadata? - if (!syncdataonly) { - dout(0) << "fsync - not syncing metadata yet.. implement me" << endl; - } - - // data? - Cond cond; - bool done = false; - if (!objectcacher->commit_set(in->ino(), - new C_Cond(&cond, &done))) { - // wait for callback - while (!done) cond.Wait(client_lock); - } - - client_lock.Unlock(); - return r; -} - - -// not written yet, but i want to link! 
- -int Client::chdir(const char *path) -{ - // fake it for now! - string abs; - mkabspath(path, abs); - dout(3) << "chdir " << path << " -> cwd now " << abs << endl; - cwd = abs; - return 0; -} - -int Client::statfs(const char *path, struct statvfs *stbuf) -{ - bzero (stbuf, sizeof (struct statvfs)); - // FIXME - stbuf->f_bsize = 1024; - stbuf->f_frsize = 1024; - stbuf->f_blocks = 1024 * 1024; - stbuf->f_bfree = 1024 * 1024; - stbuf->f_bavail = 1024 * 1024; - stbuf->f_files = 1024 * 1024; - stbuf->f_ffree = 1024 * 1024; - stbuf->f_favail = 1024 * 1024; - stbuf->f_namemax = 1024; - - return 0; -} - - -int Client::lazyio_propogate(int fd, off_t offset, size_t count) -{ - client_lock.Lock(); - dout(3) << "op: client->lazyio_propogate(" << fd - << ", " << offset << ", " << count << ")" << endl; - - assert(fh_map.count(fd)); - Fh *f = fh_map[fd]; - Inode *in = f->inode; - - if (f->mode & FILE_MODE_LAZY) { - // wait for lazy cap - while ((in->file_caps() & CAP_FILE_LAZYIO) == 0) { - dout(7) << " don't have lazy cap, waiting" << endl; - Cond cond; - in->waitfor_lazy.push_back(&cond); - cond.Wait(client_lock); - } - - if (g_conf.client_oc) { - Cond cond; - bool done = false; - in->fc.flush_dirty(new C_SafeCond(&client_lock, &cond, &done)); - - while (!done) - cond.Wait(client_lock); - - } else { - // mmm, nothin to do. - } - } - - client_lock.Unlock(); - return 0; -} - -int Client::lazyio_synchronize(int fd, off_t offset, size_t count) -{ - client_lock.Lock(); - dout(3) << "op: client->lazyio_synchronize(" << fd - << ", " << offset << ", " << count << ")" << endl; - - assert(fh_map.count(fd)); - Fh *f = fh_map[fd]; - Inode *in = f->inode; - - if (f->mode & FILE_MODE_LAZY) { - // wait for lazy cap - while ((in->file_caps() & CAP_FILE_LAZYIO) == 0) { - dout(7) << " don't have lazy cap, waiting" << endl; - Cond cond; - in->waitfor_lazy.push_back(&cond); - cond.Wait(client_lock); - } - - if (g_conf.client_oc) { - in->fc.flush_dirty(0); // flush to invalidate. 
- in->fc.release_clean(); - } else { - // mm, nothin to do. - } - } - - client_lock.Unlock(); - return 0; -} - - -// ========================================= -// layout - - -int Client::describe_layout(int fh, FileLayout *lp) -{ - client_lock.Lock(); - dout(3) << "op: client->describe_layout(" << fh << ");" << endl; - - assert(fh_map.count(fh)); - Fh *f = fh_map[fh]; - Inode *in = f->inode; - - *lp = in->inode.layout; - - client_lock.Unlock(); - return 0; -} - -int Client::get_stripe_unit(int fd) -{ - FileLayout layout; - describe_layout(fd, &layout); - return layout.stripe_size; -} - -int Client::get_stripe_width(int fd) -{ - FileLayout layout; - describe_layout(fd, &layout); - return layout.stripe_size*layout.stripe_count; -} - -int Client::get_stripe_period(int fd) -{ - FileLayout layout; - describe_layout(fd, &layout); - return layout.period(); -} - -int Client::enumerate_layout(int fh, list& result, - off_t length, off_t offset) -{ - client_lock.Lock(); - dout(3) << "op: client->enumerate_layout(" << fh << ", " << length << ", " << offset << ");" << endl; - - assert(fh_map.count(fh)); - Fh *f = fh_map[fh]; - Inode *in = f->inode; - - // map to a list of extents - filer->file_to_extents(in->inode, offset, length, result); - - client_lock.Unlock(); - return 0; -} - - - -void Client::ms_handle_failure(Message *m, const entity_inst_t& inst) -{ - entity_name_t dest = inst.name; - - if (dest.is_mon()) { - // resend to a different monitor. - int mon = monmap->pick_mon(true); - dout(0) << "ms_handle_failure " << dest << " inst " << inst - << ", resending to mon" << mon - << endl; - messenger->send_message(m, monmap->get_inst(mon)); - } - else if (dest.is_osd()) { - objecter->ms_handle_failure(m, dest, inst); - } - else if (dest.is_mds()) { - dout(0) << "ms_handle_failure " << dest << " inst " << inst << endl; - // help! - assert(0); - } - else { - // client? 
- dout(0) << "ms_handle_failure " << dest << " inst " << inst - << ", dropping" << endl; - delete m; - } -} - diff --git a/branches/marnberg/quota/client/Client.h b/branches/marnberg/quota/client/Client.h deleted file mode 100644 index 4ee04f8003f23..0000000000000 --- a/branches/marnberg/quota/client/Client.h +++ /dev/null @@ -1,597 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __CLIENT_H -#define __CLIENT_H - - -#include "mds/MDSMap.h" -#include "osd/OSDMap.h" -#include "mon/MonMap.h" - -#include "msg/Message.h" -#include "msg/Dispatcher.h" -#include "msg/Messenger.h" -#include "msg/SerialMessenger.h" - -#include "messages/MClientRequest.h" -#include "messages/MClientReply.h" - -//#include "msgthread.h" - -#include "include/types.h" -#include "include/lru.h" -#include "include/filepath.h" -#include "include/interval_set.h" - -#include "common/Mutex.h" - -#include "FileCache.h" - -// stl -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#define O_LAZY 01000000 - - -class Filer; -class Objecter; -class ObjectCacher; - -extern class LogType client_logtype; -extern class Logger *client_logger; - - - -// ============================================ -// types for my local metadata cache -/* basic structure: - - - Dentries live in an LRU loop. they get expired based on last access. - see include/lru.h. items can be bumped to "mid" or "top" of list, etc. - - Inode has ref count for each Fh, Dir, or Dentry that points to it. - - when Inode ref goes to 0, it's expired. 
- - when Dir is empty, it's removed (and it's Inode ref--) - -*/ - -typedef int fh_t; - -class Dir; -class Inode; - -class Dentry : public LRUObject { - public: - string name; // sort of lame - //const char *name; - Dir *dir; - Inode *inode; - int ref; // 1 if there's a dir beneath me. - - void get() { assert(ref == 0); ref++; lru_pin(); } - void put() { assert(ref == 1); ref--; lru_unpin(); } - - Dentry() : dir(0), inode(0), ref(0) { } - - /*Dentry() : name(0), dir(0), inode(0), ref(0) { } - Dentry(string& n) : name(0), dir(0), inode(0), ref(0) { - name = new char[n.length()+1]; - strcpy((char*)name, n.c_str()); - } - ~Dentry() { - delete[] name; - }*/ -}; - -class Dir { - public: - Inode *parent_inode; // my inode - //hash_map, eqstr> dentries; - hash_map dentries; - - Dir(Inode* in) { parent_inode = in; } - - bool is_empty() { return dentries.empty(); } -}; - - -class InodeCap { - public: - int caps; - long seq; - InodeCap() : caps(0), seq(0) {} -}; - - -class Inode { - public: - inode_t inode; // the actual inode - time_t valid_until; - - // about the dir (if this is one!) - int dir_auth; - set dir_contacts; - bool dir_hashed, dir_replicated; - - // per-mds caps - map caps; // mds -> InodeCap - map stale_caps; // mds -> cap .. stale - - time_t file_wr_mtime; // [writers] time of last write - off_t file_wr_size; // [writers] largest offset we've written to - int num_open_rd, num_open_wr, num_open_lazy; // num readers, writers - - int ref; // ref count. 1 for each dentry, fh that links to me. - Dir *dir; // if i'm a dir. - Dentry *dn; // if i'm linked to a dentry. 
- string *symlink; // symlink content, if it's a symlink - - // for caching i/o mode - FileCache fc; - - // for sync i/o mode - int sync_reads; // sync reads in progress - int sync_writes; // sync writes in progress - - list waitfor_write; - list waitfor_read; - list waitfor_lazy; - list waitfor_no_read, waitfor_no_write; - - void get() { - ref++; - //cout << "inode.get on " << hex << inode.ino << dec << " now " << ref << endl; - } - void put() { - ref--; assert(ref >= 0); - //cout << "inode.put on " << hex << inode.ino << dec << " now " << ref << endl; - } - - Inode(inode_t _inode, ObjectCacher *_oc) : - inode(_inode), - valid_until(0), - dir_auth(-1), dir_hashed(false), dir_replicated(false), - file_wr_mtime(0), file_wr_size(0), - num_open_rd(0), num_open_wr(0), num_open_lazy(0), - ref(0), dir(0), dn(0), symlink(0), - fc(_oc, _inode), - sync_reads(0), sync_writes(0) - { } - ~Inode() { - if (symlink) { delete symlink; symlink = 0; } - } - - inodeno_t ino() { return inode.ino; } - - bool is_dir() { - return (inode.mode & INODE_TYPE_MASK) == INODE_MODE_DIR; - } - - int file_caps() { - int c = 0; - for (map::iterator it = caps.begin(); - it != caps.end(); - it++) - c |= it->second.caps; - for (map::iterator it = stale_caps.begin(); - it != stale_caps.end(); - it++) - c |= it->second.caps; - return c; - } - - int file_caps_wanted() { - int w = 0; - if (num_open_rd) w |= CAP_FILE_RD|CAP_FILE_RDCACHE; - if (num_open_wr) w |= CAP_FILE_WR|CAP_FILE_WRBUFFER; - if (num_open_lazy) w |= CAP_FILE_LAZYIO; - return w; - } - - int authority(MDSMap *mdsmap) { - //cout << "authority on " << inode.ino << " .. dir_auth is " << dir_auth<< endl; - // parent? - if (dn && dn->dir && dn->dir->parent_inode) { - // parent hashed? 
- if (dn->dir->parent_inode->dir_hashed) { - // hashed - assert(0); - // fixme - //return mdcluster->hash_dentry( dn->dir->parent_inode->ino(), - //dn->name ); - } - - if (dir_auth >= 0) - return dir_auth; - else - return dn->dir->parent_inode->authority(mdsmap); - } - - if (dir_auth >= 0) - return dir_auth; - - assert(0); // !!! - return 0; - } - int dentry_authority(const char *dn, - MDSMap *mdsmap) { - assert(0); - return 0; - //return ->hash_dentry( ino(), - //dn ); - } - int pick_replica(MDSMap *mdsmap) { - // replicas? - if (ino() > 1ULL && dir_contacts.size()) { - //cout << "dir_contacts if " << dir_contacts << endl; - set::iterator it = dir_contacts.begin(); - if (dir_contacts.size() == 1) - return *it; - else { - int r = rand() % dir_contacts.size(); - while (r--) it++; - return *it; - } - } - - if (dir_replicated || ino() == 1) { - //cout << "num_mds is " << mdcluster->get_num_mds() << endl; - return rand() % mdsmap->get_num_mds(); // huh.. pick a random mds! - } - else - return authority(mdsmap); - } - - - // open Dir for an inode. if it's not open, allocated it (and pin dentry in memory). 
- Dir *open_dir() { - if (!dir) { - if (dn) dn->get(); // pin dentry - get(); - dir = new Dir(this); - } - return dir; - } - -}; - - - - -// file handle for any open file state - -struct Fh { - Inode *inode; - off_t pos; - int mds; // have to talk to mds we opened with (for now) - int mode; // the mode i opened the file with - - bool is_lazy() { return mode & O_LAZY; } - - Fh() : inode(0), pos(0), mds(0), mode(0) {} -}; - - - - - -// ======================================================== -// client interface - -class Client : public Dispatcher { - public: - - /* getdir result */ - struct DirResult { - string path; - map contents; - map::iterator p; - int off; - int size; - struct dirent_plus dp; - struct dirent_lite dl; - DirResult() : p(contents.end()), off(-1), size(0) {} - }; - - - protected: - Messenger *messenger; - int whoami; - MonMap *monmap; - - // mds fake RPC - tid_t last_tid; - map mds_rpc_cond; - map mds_rpc_reply; - map mds_rpc_dispatch_cond; - - // cluster descriptors - MDSMap *mdsmap; - OSDMap *osdmap; - - bool mounted; - bool unmounting; - Cond mount_cond; - - int unsafe_sync_write; -public: - entity_name_t get_myname() { return messenger->get_myname(); } - void hack_sync_write_safe(); - -protected: - Filer *filer; - ObjectCacher *objectcacher; - Objecter *objecter; // (non-blocking) osd interface - - // cache - hash_map inode_map; - Inode* root; - LRU lru; // lru list of Dentry's in our local metadata cache. - - // cap weirdness - map > cap_reap_queue; // ino -> mds -> msg .. set of (would-be) stale caps to reap - - - // file handles, etc. 
- string cwd; - interval_set free_fh_set; // unused fh's - hash_map fh_map; - - fh_t get_fh() { - fh_t fh = free_fh_set.start(); - free_fh_set.erase(fh, 1); - return fh; - } - void put_fh(fh_t fh) { - free_fh_set.insert(fh, 1); - } - - void mkabspath(const char *rel, string& abs) { - if (rel[0] == '/') { - abs = rel; - } else { - abs = cwd; - abs += "/"; - abs += rel; - } - } - - - // global client lock - // - protects Client and buffer cache both! - Mutex client_lock; - - - // -- metadata cache stuff - - // decrease inode ref. delete if dangling. - void put_inode(Inode *in) { - in->put(); - if (in->ref == 0) { - inode_map.erase(in->inode.ino); - if (in == root) root = 0; - delete in; - } - } - - void close_dir(Dir *dir) { - assert(dir->is_empty()); - - Inode *in = dir->parent_inode; - if (in->dn) in->dn->put(); // unpin dentry - - delete in->dir; - in->dir = 0; - put_inode(in); - } - - int get_cache_size() { return lru.lru_get_size(); } - void set_cache_size(int m) { lru.lru_set_max(m); } - - Dentry* link(Dir *dir, const string& name, Inode *in) { - Dentry *dn = new Dentry; - dn->name = name; - - // link to dir - dn->dir = dir; - dir->dentries[dn->name] = dn; - - // link to inode - dn->inode = in; - in->dn = dn; - in->get(); - - lru.lru_insert_mid(dn); // mid or top? 
- return dn; - } - - void unlink(Dentry *dn) { - Inode *in = dn->inode; - - // unlink from inode - dn->inode = 0; - in->dn = 0; - put_inode(in); - - // unlink from dir - dn->dir->dentries.erase(dn->name); - if (dn->dir->is_empty()) - close_dir(dn->dir); - dn->dir = 0; - - // delete den - lru.lru_remove(dn); - delete dn; - } - - Dentry *relink(Dentry *dn, Dir *dir, const string& name) { - // first link new dn to dir - /* - char *oldname = (char*)dn->name; - dn->name = new char[name.length()+1]; - strcpy((char*)dn->name, name.c_str()); - dir->dentries[dn->name] = dn; - */ - dir->dentries[name] = dn; - - // unlink from old dir - dn->dir->dentries.erase(dn->name); - //delete[] oldname; - if (dn->dir->is_empty()) - close_dir(dn->dir); - - // fix up dn - dn->name = name; - dn->dir = dir; - - return dn; - } - - // move dentry to top of lru - void touch_dn(Dentry *dn) { lru.lru_touch(dn); } - - // trim cache. - void trim_cache(); - void dump_inode(Inode *in, set& did); - void dump_cache(); // debug - - // find dentry based on filepath - Dentry *lookup(filepath& path); - - // make blocking mds request - MClientReply *make_request(MClientRequest *req, bool auth_best=false, int use_auth=-1); - MClientReply* sendrecv(MClientRequest *req, int mds); - void handle_client_reply(MClientReply *reply); - - void fill_stat(inode_t& inode, struct stat *st); - void fill_statlite(inode_t& inode, struct statlite *st); - - - // friends - friend class SyntheticClient; - - public: - Client(Messenger *m, MonMap *mm); - ~Client(); - void tear_down_cache(); - - int get_nodeid() { return whoami; } - - void init(); - void shutdown(); - - // messaging - void dispatch(Message *m); - - void handle_mount_ack(class MClientMountAck*); - void handle_unmount_ack(Message*); - void handle_mds_map(class MMDSMap *m); - - // file caps - void handle_file_caps(class MClientFileCaps *m); - void implemented_caps(class MClientFileCaps *m, Inode *in); - void release_caps(Inode *in, int retain=0); - void 
update_caps_wanted(Inode *in); - - void close_release(Inode *in); - void close_safe(Inode *in); - - // metadata cache - Inode* insert_inode(Dir *dir, InodeStat *in_info, const string& dn); - void update_inode_dist(Inode *in, InodeStat *st); - Inode* insert_trace(MClientReply *reply); - - // ---------------------- - // fs ops. - int mount(); - int unmount(); - - // these shoud (more or less) mirror the actual system calls. - int statfs(const char *path, struct statvfs *stbuf); - - // crap - int chdir(const char *s); - - // namespace ops - int getdir(const char *path, list& contents); - int getdir(const char *path, map& contents); - - DIR *opendir(const char *name); - int closedir(DIR *dir); - struct dirent *readdir(DIR *dir); - void rewinddir(DIR *dir); - off_t telldir(DIR *dir); - void seekdir(DIR *dir, off_t offset); - - struct dirent_plus *readdirplus(DIR *dirp); - int readdirplus_r(DIR *dirp, struct dirent_plus *entry, struct dirent_plus **result); - struct dirent_lite *readdirlite(DIR *dirp); - int readdirlite_r(DIR *dirp, struct dirent_lite *entry, struct dirent_lite **result); - - - int link(const char *existing, const char *newname); - int unlink(const char *path); - int rename(const char *from, const char *to); - - // dirs - int mkdir(const char *path, mode_t mode); - int rmdir(const char *path); - - // symlinks - int readlink(const char *path, char *buf, off_t size); - int symlink(const char *existing, const char *newname); - - // inode stuff - int _lstat(const char *path, int mask, Inode **in); - int lstat(const char *path, struct stat *stbuf); - int lstatlite(const char *path, struct statlite *buf); - - int chmod(const char *path, mode_t mode); - int chown(const char *path, uid_t uid, gid_t gid); - int utime(const char *path, struct utimbuf *buf); - - // file ops - int mknod(const char *path, mode_t mode); - int open(const char *path, int mode); - int close(fh_t fh); - off_t lseek(fh_t fh, off_t offset, int whence); - int read(fh_t fh, char *buf, off_t 
size, off_t offset=-1); - int write(fh_t fh, const char *buf, off_t size, off_t offset=-1); - int truncate(const char *file, off_t size); - //int truncate(fh_t fh, long long size); - int fsync(fh_t fh, bool syncdataonly); - - - // hpc lazyio - int lazyio_propogate(int fd, off_t offset, size_t count); - int lazyio_synchronize(int fd, off_t offset, size_t count); - - // expose file layout - int describe_layout(int fd, FileLayout* layout); - int get_stripe_unit(int fd); - int get_stripe_width(int fd); - int get_stripe_period(int fd); - int enumerate_layout(int fd, list& result, - off_t length, off_t offset); - - // failure - void ms_handle_failure(Message*, const entity_inst_t& inst); -}; - -#endif diff --git a/branches/marnberg/quota/client/FileCache.cc b/branches/marnberg/quota/client/FileCache.cc deleted file mode 100644 index 2a1dd1576ae59..0000000000000 --- a/branches/marnberg/quota/client/FileCache.cc +++ /dev/null @@ -1,180 +0,0 @@ - -#include "config.h" -#include "include/types.h" - -#include "FileCache.h" -#include "osdc/ObjectCacher.h" - -#include "msg/Messenger.h" - -#undef dout -#define dout(x) if (x <= g_conf.debug_client) cout << g_clock.now() << " " << oc->objecter->messenger->get_myaddr() << ".filecache " -#define derr(x) if (x <= g_conf.debug_client) cout << g_clock.now() << " " << oc->objecter->messenger->get_myaddr() << ".filecache " - - -// flush/release/clean - -void FileCache::flush_dirty(Context *onflush) -{ - if (oc->flush_set(inode.ino, onflush)) { - onflush->finish(0); - delete onflush; - } -} - -off_t FileCache::release_clean() -{ - return oc->release_set(inode.ino); -} - -bool FileCache::is_cached() -{ - return oc->set_is_cached(inode.ino); -} - -bool FileCache::is_dirty() -{ - return oc->set_is_dirty_or_committing(inode.ino); -} - -void FileCache::empty(Context *onempty) -{ - off_t unclean = release_clean(); - bool clean = oc->flush_set(inode.ino, onempty); - assert(!unclean == clean); - - if (clean) { - onempty->finish(0); - delete 
onempty; - } -} - - -void FileCache::tear_down() -{ - off_t unclean = release_clean(); - if (unclean) { - dout(0) << "tear_down " << unclean << " unclean bytes, purging" << endl; - oc->purge_set(inode.ino); - } -} - -// caps - -void FileCache::set_caps(int caps, Context *onimplement) -{ - if (onimplement) { - assert(latest_caps & ~caps); // we should be losing caps. - caps_callbacks[caps].push_back(onimplement); - } - - latest_caps = caps; - check_caps(); -} - - -void FileCache::check_caps() -{ - int used = 0; - if (num_reading) used |= CAP_FILE_RD; - if (oc->set_is_cached(inode.ino)) used |= CAP_FILE_RDCACHE; - if (num_writing) used |= CAP_FILE_WR; - if (oc->set_is_dirty_or_committing(inode.ino)) used |= CAP_FILE_WRBUFFER; - dout(10) << "check_caps used " << cap_string(used) << endl; - - // check callbacks - map >::iterator p = caps_callbacks.begin(); - while (p != caps_callbacks.end()) { - if (used == 0 || (~(p->first) & used) == 0) { - // implemented. - dout(10) << "used is " << cap_string(used) - << ", caps " << cap_string(p->first) << " implemented, doing callback(s)" << endl; - finish_contexts(p->second); - map >::iterator o = p; - p++; - caps_callbacks.erase(o); - } else { - dout(10) << "used is " << cap_string(used) - << ", caps " << cap_string(p->first) << " not yet implemented" << endl; - p++; - } - } -} - - - -// read/write - -int FileCache::read(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock) -{ - int r = 0; - - // inc reading counter - num_reading++; - - if (latest_caps & CAP_FILE_RDCACHE) { - // read (and block) - Cond cond; - bool done = false; - int rvalue = 0; - C_Cond *onfinish = new C_Cond(&cond, &done, &rvalue); - - r = oc->file_read(inode, offset, size, &blist, onfinish); - - if (r == 0) { - // block - while (!done) - cond.Wait(client_lock); - r = rvalue; - } else { - // it was cached. 
- delete onfinish; - } - } else { - r = oc->file_atomic_sync_read(inode, offset, size, &blist, client_lock); - } - - // dec reading counter - num_reading--; - - if (num_reading == 0 && !caps_callbacks.empty()) - check_caps(); - - return r; -} - -void FileCache::write(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock) -{ - // inc writing counter - num_writing++; - - if (latest_caps & CAP_FILE_WRBUFFER) { // caps buffered write? - // wait? (this may block!) - oc->wait_for_write(size, client_lock); - - // async, caching, non-blocking. - oc->file_write(inode, offset, size, blist); - } else { - // atomic, synchronous, blocking. - oc->file_atomic_sync_write(inode, offset, size, blist, client_lock); - } - - // dec writing counter - num_writing--; - if (num_writing == 0 && !caps_callbacks.empty()) - check_caps(); -} - -bool FileCache::all_safe() -{ - return !oc->set_is_dirty_or_committing(inode.ino); -} - -void FileCache::add_safe_waiter(Context *c) -{ - bool safe = oc->commit_set(inode.ino, c); - if (safe) { - c->finish(0); - delete c; - } -} diff --git a/branches/marnberg/quota/client/FileCache.h b/branches/marnberg/quota/client/FileCache.h deleted file mode 100644 index 6bef22f4e0c6a..0000000000000 --- a/branches/marnberg/quota/client/FileCache.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef __FILECACHE_H -#define __FILECACHE_H - -#include -using namespace std; - -#include "common/Cond.h" -#include "mds/Capability.h" - -class ObjectCacher; - -class FileCache { - ObjectCacher *oc; - inode_t inode; - - // caps - int latest_caps; - map > caps_callbacks; - - int num_reading; - int num_writing; - //int num_unsafe; - - // waiters - list waitfor_read; - list waitfor_write; - //list waitfor_safe; - bool waitfor_release; - - public: - FileCache(ObjectCacher *_oc, inode_t _inode) : - oc(_oc), - inode(_inode), - latest_caps(0), - num_reading(0), num_writing(0),// num_unsafe(0), - waitfor_release(false) {} - ~FileCache() { - tear_down(); - } - - // waiters/waiting - bool 
can_read() { return latest_caps & CAP_FILE_RD; } - bool can_write() { return latest_caps & CAP_FILE_WR; } - bool all_safe();// { return num_unsafe == 0; } - - void add_read_waiter(Cond *c) { waitfor_read.push_back(c); } - void add_write_waiter(Cond *c) { waitfor_write.push_back(c); } - void add_safe_waiter(Context *c);// { waitfor_safe.push_back(c); } - - // ... - void flush_dirty(Context *onflush=0); - off_t release_clean(); - void empty(Context *onempty=0); - bool is_empty() { return !(is_cached() || is_dirty()); } - bool is_cached(); - bool is_dirty(); - - void tear_down(); - - int get_caps() { return latest_caps; } - void set_caps(int caps, Context *onimplement=0); - void check_caps(); - - int read(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock); // may block. - void write(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock); // may block. - -}; - - -#endif diff --git a/branches/marnberg/quota/client/SyntheticClient.cc b/branches/marnberg/quota/client/SyntheticClient.cc deleted file mode 100644 index 6f0ad60dc88ab..0000000000000 --- a/branches/marnberg/quota/client/SyntheticClient.cc +++ /dev/null @@ -1,1283 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include -using namespace std; - - - -#include "SyntheticClient.h" - -#include "include/filepath.h" -#include "mds/MDS.h" - -#include -#include -#include -#include -#include -#include - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_client) cout << g_clock.now() << " synthetic" << client->get_nodeid() << " " - -// traces -//void trace_include(SyntheticClient *syn, Client *cl, string& prefix); -//void trace_openssh(SyntheticClient *syn, Client *cl, string& prefix); - - -list syn_modes; -list syn_iargs; -list syn_sargs; - -void parse_syn_options(vector& args) -{ - vector nargs; - - for (unsigned i=0; iclient = client; - thread_id = 0; - - did_readdir = false; - - run_only = -1; - - this->modes = syn_modes; - this->iargs = syn_iargs; - this->sargs = syn_sargs; - - run_start = g_clock.now(); -} - - - - -#define DBL 2 - -void *synthetic_client_thread_entry(void *ptr) -{ - SyntheticClient *sc = (SyntheticClient*)ptr; - //int r = - sc->run(); - return 0;//(void*)r; -} - -string SyntheticClient::get_sarg(int seq) -{ - string a; - if (!sargs.empty()) { - a = sargs.front(); - sargs.pop_front(); - } - if (a.length() == 0 || a == "~") { - char s[20]; - sprintf(s,"syn.%d.%d", client->whoami, seq); - a = s; - } - //cout << "a is " << a << endl; - return a; -} - -int SyntheticClient::run() -{ - //run_start = g_clock.now(); - run_until = utime_t(0,0); - dout(5) << "run" << endl; - - for (list::iterator it = modes.begin(); - it != modes.end(); - it++) { - int mode = *it; - dout(3) << "mode " << mode << endl; - - switch (mode) { - case SYNCLIENT_MODE_RANDOMSLEEP: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (run_me()) { - srand(time(0) + getpid() + client->whoami); - sleep(rand() % iarg1); - } - } - break; - - case SYNCLIENT_MODE_SLEEP: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (run_me()) { - dout(2) << "sleep " << iarg1 << endl; - sleep(iarg1); - } - } - break; - - case 
SYNCLIENT_MODE_ONLY: - { - run_only = iargs.front(); - iargs.pop_front(); - if (run_only == client->get_nodeid()) - dout(2) << "only " << run_only << endl; - } - break; - - case SYNCLIENT_MODE_UNTIL: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (iarg1) { - dout(2) << "until " << iarg1 << endl; - utime_t dur(iarg1,0); - run_until = run_start + dur; - } else { - dout(2) << "until " << iarg1 << " (no limit)" << endl; - run_until = utime_t(0,0); - } - } - break; - - case SYNCLIENT_MODE_SLEEPUNTIL: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (iarg1) { - dout(2) << "sleepuntil " << iarg1 << endl; - utime_t at = g_clock.now() - run_start; - if (at.sec() < iarg1) - sleep(iarg1 - at.sec()); - } - } - break; - - case SYNCLIENT_MODE_RANDOMWALK: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (run_me()) { - dout(2) << "randomwalk " << iarg1 << endl; - random_walk(iarg1); - } - } - break; - - case SYNCLIENT_MODE_MAKEDIRMESS: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "makedirmess " << sarg1 << " " << iarg1 << endl; - make_dir_mess(sarg1.c_str(), iarg1); - } - } - break; - case SYNCLIENT_MODE_MAKEDIRS: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - int iarg3 = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "makedirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << endl; - make_dirs(sarg1.c_str(), iarg1, iarg2, iarg3); - } - } - break; - case SYNCLIENT_MODE_STATDIRS: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - int iarg3 = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "statdirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << endl; - stat_dirs(sarg1.c_str(), iarg1, iarg2, iarg3); - } - } - break; - case SYNCLIENT_MODE_READDIRS: - { - string 
sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - int iarg3 = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "readdirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << endl; - read_dirs(sarg1.c_str(), iarg1, iarg2, iarg3); - } - } - break; - - - case SYNCLIENT_MODE_MAKEFILES: - { - int num = iargs.front(); iargs.pop_front(); - int count = iargs.front(); iargs.pop_front(); - int priv = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "makefiles " << num << " " << count << " " << priv << endl; - make_files(num, count, priv, false); - } - } - break; - case SYNCLIENT_MODE_MAKEFILES2: - { - int num = iargs.front(); iargs.pop_front(); - int count = iargs.front(); iargs.pop_front(); - int priv = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "makefiles2 " << num << " " << count << " " << priv << endl; - make_files(num, count, priv, true); - } - } - break; - case SYNCLIENT_MODE_CREATESHARED: - { - string sarg1 = get_sarg(0); - int num = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "createshared " << num << endl; - create_shared(num); - } - } - break; - case SYNCLIENT_MODE_OPENSHARED: - { - string sarg1 = get_sarg(0); - int num = iargs.front(); iargs.pop_front(); - int count = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "openshared " << num << endl; - open_shared(num, count); - } - } - break; - - case SYNCLIENT_MODE_FULLWALK: - { - string sarg1;// = get_sarg(0); - if (run_me()) { - dout(2) << "fullwalk" << sarg1 << endl; - full_walk(sarg1); - } - } - break; - case SYNCLIENT_MODE_REPEATWALK: - { - string sarg1 = get_sarg(0); - if (run_me()) { - dout(2) << "repeatwalk " << sarg1 << endl; - while (full_walk(sarg1) == 0) ; - } - } - break; - - case SYNCLIENT_MODE_WRITEFILE: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - if 
(run_me()) - write_file(sarg1, iarg1, iarg2); - } - break; - case SYNCLIENT_MODE_WRSHARED: - { - string sarg1 = "shared"; - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - if (run_me()) - write_file(sarg1, iarg1, iarg2); - } - break; - case SYNCLIENT_MODE_WRITEBATCH: - { - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - int iarg3 = iargs.front(); iargs.pop_front(); - - if (run_me()) - write_batch(iarg1, iarg2, iarg3); - } - break; - - case SYNCLIENT_MODE_READFILE: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - if (run_me()) - read_file(sarg1, iarg1, iarg2); - } - break; - - case SYNCLIENT_MODE_TRACE: - { - string tfile = get_sarg(0); - sargs.push_front(string("~")); - int iarg1 = iargs.front(); iargs.pop_front(); - string prefix = get_sarg(0); - - if (run_me()) { - dout(2) << "trace " << tfile << " prefix " << prefix << " ... " << iarg1 << " times" << endl; - - Trace t(tfile.c_str()); - - client->mkdir(prefix.c_str(), 0755); - - for (int i=0; i 0 - && i < iarg1-1 - ) { - client_logger->finc("trsum", (double)lat); - client_logger->inc("trnum"); - } - } - } - } - break; - - - case SYNCLIENT_MODE_OPENTEST: - { - int count = iargs.front(); iargs.pop_front(); - if (run_me()) { - for (int i=0; iopen("test", rand()%2 ? 
(O_WRONLY|O_CREAT):O_RDONLY); - if (fd > 0) client->close(fd); - } - } - } - break; - - case SYNCLIENT_MODE_OPTEST: - { - int count = iargs.front(); iargs.pop_front(); - if (run_me()) { - client->mknod("test",0777); - struct stat st; - for (int i=0; ilstat("test", &st); - client->chmod("test", 0777); - } - } - } - break; - - default: - assert(0); - } - } - return 0; -} - - -int SyntheticClient::start_thread() -{ - assert(!thread_id); - - pthread_create(&thread_id, NULL, synthetic_client_thread_entry, this); - assert(thread_id); - return 0; -} - -int SyntheticClient::join_thread() -{ - assert(thread_id); - void *rv; - pthread_join(thread_id, &rv); - return 0; -} - - -bool roll_die(float p) -{ - float r = (float)(rand() % 100000) / 100000.0; - if (r < p) - return true; - else - return false; -} - -void SyntheticClient::init_op_dist() -{ - op_dist.clear(); - op_dist.add( MDS_OP_STAT, g_conf.fakeclient_op_stat ); - op_dist.add( MDS_OP_UTIME, g_conf.fakeclient_op_utime ); - op_dist.add( MDS_OP_CHMOD, g_conf.fakeclient_op_chmod ); - op_dist.add( MDS_OP_CHOWN, g_conf.fakeclient_op_chown ); - - op_dist.add( MDS_OP_READDIR, g_conf.fakeclient_op_readdir ); - op_dist.add( MDS_OP_MKNOD, g_conf.fakeclient_op_mknod ); - op_dist.add( MDS_OP_LINK, g_conf.fakeclient_op_link ); - op_dist.add( MDS_OP_UNLINK, g_conf.fakeclient_op_unlink ); - op_dist.add( MDS_OP_RENAME, g_conf.fakeclient_op_rename ); - - op_dist.add( MDS_OP_MKDIR, g_conf.fakeclient_op_mkdir ); - op_dist.add( MDS_OP_RMDIR, g_conf.fakeclient_op_rmdir ); - op_dist.add( MDS_OP_SYMLINK, g_conf.fakeclient_op_symlink ); - - op_dist.add( MDS_OP_OPEN, g_conf.fakeclient_op_openrd ); - //op_dist.add( MDS_OP_READ, g_conf.fakeclient_op_read ); - //op_dist.add( MDS_OP_WRITE, g_conf.fakeclient_op_write ); - op_dist.add( MDS_OP_TRUNCATE, g_conf.fakeclient_op_truncate ); - op_dist.add( MDS_OP_FSYNC, g_conf.fakeclient_op_fsync ); - op_dist.add( MDS_OP_RELEASE, g_conf.fakeclient_op_close ); // actually, close() - op_dist.normalize(); -} 
- -void SyntheticClient::up() -{ - cwd = cwd.prefixpath(cwd.depth()-1); - dout(DBL) << "cd .. -> " << cwd << endl; - clear_dir(); -} - - -int SyntheticClient::play_trace(Trace& t, string& prefix) -{ - dout(4) << "play trace" << endl; - t.start(); - - utime_t start = g_clock.now(); - - const char *p = prefix.c_str(); - - map<__int64_t, __int64_t> open_files; - - while (!t.end()) { - - if (time_to_stop()) break; - - // op - const char *op = t.get_string(); - dout(4) << "trace op " << op << endl; - if (strcmp(op, "link") == 0) { - const char *a = t.get_string(p); - const char *b = t.get_string(p); - client->link(a,b); - } else if (strcmp(op, "unlink") == 0) { - const char *a = t.get_string(p); - client->unlink(a); - } else if (strcmp(op, "rename") == 0) { - const char *a = t.get_string(p); - const char *b = t.get_string(p); - client->rename(a,b); - } else if (strcmp(op, "mkdir") == 0) { - const char *a = t.get_string(p); - __int64_t b = t.get_int(); - client->mkdir(a, b); - } else if (strcmp(op, "rmdir") == 0) { - const char *a = t.get_string(p); - client->rmdir(a); - } else if (strcmp(op, "symlink") == 0) { - const char *a = t.get_string(p); - const char *b = t.get_string(p); - client->symlink(a,b); - } else if (strcmp(op, "readlink") == 0) { - const char *a = t.get_string(p); - char buf[100]; - client->readlink(a, buf, 100); - } else if (strcmp(op, "lstat") == 0) { - struct stat st; - const char *a = t.get_string(p); - client->lstat(a, &st); - } else if (strcmp(op, "chmod") == 0) { - const char *a = t.get_string(p); - __int64_t b = t.get_int(); - client->chmod(a, b); - } else if (strcmp(op, "chown") == 0) { - const char *a = t.get_string(p); - __int64_t b = t.get_int(); - __int64_t c = t.get_int(); - client->chown(a, b, c); - } else if (strcmp(op, "utime") == 0) { - const char *a = t.get_string(p); - __int64_t b = t.get_int(); - __int64_t c = t.get_int(); - struct utimbuf u; - u.actime = b; - u.modtime = c; - client->utime(a, &u); - } else if (strcmp(op, "mknod") == 
0) { - const char *a = t.get_string(p); - __int64_t b = t.get_int(); - client->mknod(a, b); - } else if (strcmp(op, "getdir") == 0) { - const char *a = t.get_string(p); - map contents; - client->getdir(a, contents); - } else if (strcmp(op, "open") == 0) { - const char *a = t.get_string(p); - __int64_t b = t.get_int(); - __int64_t id = t.get_int(); - __int64_t fh = client->open(a, b); - open_files[id] = fh; - } else if (strcmp(op, "close") == 0) { - __int64_t id = t.get_int(); - __int64_t fh = open_files[id]; - if (fh > 0) client->close(fh); - open_files.erase(id); - } else if (strcmp(op, "truncate") == 0) { - const char *a = t.get_string(p); - __int64_t b = t.get_int(); - client->truncate(a,b); - } else if (strcmp(op, "read") == 0) { - __int64_t id = t.get_int(); - __int64_t fh = open_files[id]; - int size = t.get_int(); - int off = t.get_int(); - char *buf = new char[size]; - client->read(fh, buf, size, off); - delete[] buf; - } else if (strcmp(op, "write") == 0) { - __int64_t id = t.get_int(); - __int64_t fh = open_files[id]; - int size = t.get_int(); - int off = t.get_int(); - char *buf = new char[size]; - memset(buf, 1, size); // let's write 1's! 
- client->write(fh, buf, size, off); - delete[] buf; - } else if (strcmp(op, "fsync") == 0) { - assert(0); - } else - assert(0); - } - - // close open files - for (map<__int64_t, __int64_t>::iterator fi = open_files.begin(); - fi != open_files.end(); - fi++) { - dout(1) << "leftover close " << fi->second << endl; - if (fi->second > 0) client->close(fi->second); - } - - return 0; -} - - -int SyntheticClient::clean_dir(string& basedir) -{ - // read dir - map contents; - int r = client->getdir(basedir.c_str(), contents); - if (r < 0) { - dout(1) << "readdir on " << basedir << " returns " << r << endl; - return r; - } - - for (map::iterator it = contents.begin(); - it != contents.end(); - it++) { - if (it->first == ".") continue; - if (it->first == "..") continue; - string file = basedir + "/" + it->first; - - if (time_to_stop()) break; - - struct stat st; - int r = client->lstat(file.c_str(), &st); - if (r < 0) { - dout(1) << "stat error on " << file << " r=" << r << endl; - continue; - } - - if ((st.st_mode & INODE_TYPE_MASK) == INODE_MODE_DIR) { - clean_dir(file); - client->rmdir(file.c_str()); - } else { - client->unlink(file.c_str()); - } - } - - return 0; - -} - - -int SyntheticClient::full_walk(string& basedir) -{ - if (time_to_stop()) return -1; - - list dirq; - dirq.push_back(basedir); - - while (!dirq.empty()) { - string dir = dirq.front(); - dirq.pop_front(); - - // read dir - map contents; - int r = client->getdir(dir.c_str(), contents); - if (r < 0) { - dout(1) << "readdir on " << dir << " returns " << r << endl; - continue; - } - - for (map::iterator it = contents.begin(); - it != contents.end(); - it++) { - if (it->first == ".") continue; - if (it->first == "..") continue; - string file = dir + "/" + it->first; - - struct stat st; - int r = client->lstat(file.c_str(), &st); - if (r < 0) { - dout(1) << "stat error on " << file << " r=" << r << endl; - continue; - } - - if ((st.st_mode & INODE_TYPE_MASK) == INODE_MODE_DIR) { - dirq.push_back(file); - } - } 
- } - - return 0; -} - -int SyntheticClient::make_dirs(const char *basedir, int dirs, int files, int depth) -{ - if (time_to_stop()) return 0; - - // make sure base dir exists - int r = client->mkdir(basedir, 0755); - if (r != 0) { - dout(1) << "can't make base dir? " << basedir << endl; - return -1; - } - - // children - char d[500]; - dout(3) << "make_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << endl; - for (int i=0; imknod(d, 0644); - } - - if (depth == 0) return 0; - - for (int i=0; ilstat(basedir, &st); - if (r != 0) { - dout(1) << "can't make base dir? " << basedir << endl; - return -1; - } - - // children - char d[500]; - dout(3) << "stat_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << endl; - for (int i=0; ilstat(d, &st); - } - - if (depth == 0) return 0; - - for (int i=0; i contents; - utime_t s = g_clock.now(); - int r = client->getdir(basedir, contents); - utime_t e = g_clock.now(); - e -= s; - if (client_logger) client_logger->finc("readdir", e); - if (r < 0) { - dout(0) << "read_dirs couldn't readdir " << basedir << ", stopping" << endl; - return -1; - } - - for (int i=0; ilstat(d, &st) < 0) { - dout(2) << "read_dirs failed stat on " << d << ", stopping" << endl; - return -1; - } - utime_t e = g_clock.now(); - e -= s; - if (client_logger) client_logger->finc("stat", e); - } - - if (depth > 0) - for (int i=0; iget_nodeid(); - char d[255]; - - if (priv) { - for (int c=0; cmkdir(d, 0755); - } - } else { - // shared - if (whoami == 0) { - for (int c=0; cmkdir(d, 0755); - } - } else { - sleep(5); - } - } - - // files - struct stat st; - for (int c=0; cmknod(d, 0644); - - if (more) { - client->lstat(d, &st); - int fd = client->open(d, O_RDONLY); - client->unlink(d); - client->close(fd); - } - - if (time_to_stop()) return 0; - } - } - - return 0; -} - - -int SyntheticClient::create_shared(int num) -{ - // files - char d[255]; - for (int n=0; nmknod(d, 0644); - } - - return 0; -} - 
-int SyntheticClient::open_shared(int num, int count) -{ - // files - char d[255]; - for (int c=0; c fds; - for (int n=0; nopen(d,O_RDONLY); - fds.push_back(fd); - } - - while (!fds.empty()) { - int fd = fds.front(); - fds.pop_front(); - client->close(fd); - } - } - - return 0; -} - - -int SyntheticClient::write_file(string& fn, int size, int wrsize) // size is in MB, wrsize in bytes -{ - //__uint64_t wrsize = 1024*256; - char *buf = new char[wrsize+100]; // 1 MB - memset(buf, 7, wrsize); - __uint64_t chunks = (__uint64_t)size * (__uint64_t)(1024*1024) / (__uint64_t)wrsize; - - int fd = client->open(fn.c_str(), O_RDWR|O_CREAT); - dout(5) << "writing to " << fn << " fd " << fd << endl; - if (fd < 0) return fd; - - for (unsigned i=0; iget_nodeid(); - p++; - } - - client->write(fd, buf, wrsize, i*wrsize); - } - - client->close(fd); - delete[] buf; - - return 0; -} - -int SyntheticClient::write_batch(int nfile, int size, int wrsize) -{ - for (int i=0; iopen(fn.c_str(), O_RDONLY); - dout(5) << "reading from " << fn << " fd " << fd << endl; - if (fd < 0) return fd; - - for (unsigned i=0; iread(fd, buf, rdsize, i*rdsize); - if (r < rdsize) { - dout(1) << "read_file got r = " << r << ", probably end of file" << endl; - break; - } - - // verify fingerprint - int bad = 0; - __int64_t *p = (__int64_t*)buf; - __int64_t readoff, readclient; - while ((char*)p + 32 < buf + rdsize) { - readoff = *p; - __int64_t wantoff = i*rdsize + (__int64_t)((char*)p - buf); - p++; - readclient = *p; - p++; - if (readoff != wantoff || - readclient != client->get_nodeid()) { - if (!bad) - dout(0) << "WARNING: wrong data from OSD, block says fileoffset=" << readoff << " client=" << readclient - << ", should be offset " << wantoff << " clietn " << client->get_nodeid() - << endl; - bad++; - } - } - if (bad) - dout(0) << " + " << (bad-1) << " other bad 16-byte bits in this block" << endl; - } - - client->close(fd); - delete[] buf; - - return 0; -} - - - -int SyntheticClient::random_walk(int num_req) 
-{ - int left = num_req; - - //dout(1) << "random_walk() will do " << left << " ops" << endl; - - init_op_dist(); // set up metadata op distribution - - while (left > 0) { - left--; - - if (time_to_stop()) break; - - // ascend? - if (cwd.depth() && !roll_die(::pow((double).9, (double)cwd.depth()))) { - dout(DBL) << "die says up" << endl; - up(); - continue; - } - - // descend? - if (.9*roll_die(::pow((double).9,(double)cwd.depth())) && subdirs.size()) { - string s = get_random_subdir(); - cwd.add_dentry( s ); - dout(DBL) << "cd " << s << " -> " << cwd << endl; - clear_dir(); - continue; - } - - int op = 0; - filepath path; - - if (contents.empty() && roll_die(.3)) { - if (did_readdir) { - dout(DBL) << "empty dir, up" << endl; - up(); - } else - op = MDS_OP_READDIR; - } else { - op = op_dist.sample(); - } - //dout(DBL) << "op is " << op << endl; - - int r = 0; - - // do op - if (op == MDS_OP_UNLINK) { - if (contents.empty()) - op = MDS_OP_READDIR; - else - r = client->unlink( get_random_sub() ); // will fail on dirs - } - - if (op == MDS_OP_RENAME) { - if (contents.empty()) - op = MDS_OP_READDIR; - else { - r = client->rename( get_random_sub(), make_sub("ren") ); - } - } - - if (op == MDS_OP_MKDIR) { - r = client->mkdir( make_sub("mkdir"), 0755); - } - - if (op == MDS_OP_RMDIR) { - if (!subdirs.empty()) - r = client->rmdir( get_random_subdir() ); - else - r = client->rmdir( cwd.c_str() ); // will pbly fail - } - - if (op == MDS_OP_SYMLINK) { - } - - if (op == MDS_OP_CHMOD) { - if (contents.empty()) - op = MDS_OP_READDIR; - else - r = client->chmod( get_random_sub(), rand() & 0755 ); - } - - if (op == MDS_OP_CHOWN) { - if (contents.empty()) r = client->chown( cwd.c_str(), rand(), rand() ); - else - r = client->chown( get_random_sub(), rand(), rand() ); - } - - if (op == MDS_OP_LINK) { - } - - if (op == MDS_OP_UTIME) { - struct utimbuf b; - memset(&b, 1, sizeof(b)); - if (contents.empty()) - r = client->utime( cwd.c_str(), &b ); - else - r = client->utime( 
get_random_sub(), &b ); - } - - if (op == MDS_OP_MKNOD) { - r = client->mknod( make_sub("mknod"), 0644); - } - - if (op == MDS_OP_OPEN) { - if (contents.empty()) - op = MDS_OP_READDIR; - else { - r = client->open( get_random_sub(), O_RDONLY ); - if (r > 0) { - assert(open_files.count(r) == 0); - open_files.insert(r); - } - } - } - - if (op == MDS_OP_RELEASE) { // actually, close - if (open_files.empty()) - op = MDS_OP_STAT; - else { - int fh = get_random_fh(); - r = client->close( fh ); - if (r == 0) open_files.erase(fh); - } - } - - if (op == MDS_OP_STAT) { - struct stat st; - if (contents.empty()) { - if (did_readdir) { - if (roll_die(.1)) { - dout(DBL) << "stat in empty dir, up" << endl; - up(); - } else { - op = MDS_OP_MKNOD; - } - } else - op = MDS_OP_READDIR; - } else - r = client->lstat(get_random_sub(), &st); - } - - if (op == MDS_OP_READDIR) { - clear_dir(); - - map c; - r = client->getdir( cwd.c_str(), c ); - - for (map::iterator it = c.begin(); - it != c.end(); - it++) { - //dout(DBL) << " got " << it->first << endl; - contents[it->first] = it->second; - if (it->second.is_dir()) - subdirs.insert(it->first); - } - - did_readdir = true; - } - - // errors? - if (r < 0) { - // reevaluate cwd. - //while (cwd.depth()) { - //if (client->lookup(cwd)) break; // it's in the cache - - //dout(DBL) << "r = " << r << ", client doesn't have " << cwd << ", cd .." << endl; - dout(DBL) << "r = " << r << ", client may not have " << cwd << ", cd .." 
<< endl; - up(); - //} - } - } - - // close files - dout(DBL) << "closing files" << endl; - while (!open_files.empty()) { - int fh = get_random_fh(); - int r = client->close( fh ); - if (r == 0) open_files.erase(fh); - } - - dout(DBL) << "done" << endl; - return 0; -} - - - - -void SyntheticClient::make_dir_mess(const char *basedir, int n) -{ - vector dirs; - - dirs.push_back(basedir); - dirs.push_back(basedir); - - client->mkdir(basedir, 0755); - - // motivation: - // P(dir) ~ subdirs_of(dir) + 2 - // from 5-year metadata workload paper in fast'07 - - // create dirs - for (int i=0; i> dir; - - // update dirs - dirs.push_back(parent); - dirs.push_back(dir); - dirs.push_back(dir); - - // do it - client->mkdir(dir.c_str(), 0755); - } - - -} - diff --git a/branches/marnberg/quota/client/SyntheticClient.h b/branches/marnberg/quota/client/SyntheticClient.h deleted file mode 100644 index ebf96386be95c..0000000000000 --- a/branches/marnberg/quota/client/SyntheticClient.h +++ /dev/null @@ -1,201 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __SYNTHETICCLIENT_H -#define __SYNTHETICCLIENT_H - -#include - -#include "Client.h" -#include "include/Distribution.h" - -#include "Trace.h" - -#define SYNCLIENT_MODE_RANDOMWALK 1 -#define SYNCLIENT_MODE_FULLWALK 2 -#define SYNCLIENT_MODE_REPEATWALK 3 - -#define SYNCLIENT_MODE_MAKEDIRMESS 7 -#define SYNCLIENT_MODE_MAKEDIRS 8 // dirs files depth -#define SYNCLIENT_MODE_STATDIRS 9 // dirs files depth -#define SYNCLIENT_MODE_READDIRS 10 // dirs files depth - -#define SYNCLIENT_MODE_MAKEFILES 11 // num count private -#define SYNCLIENT_MODE_MAKEFILES2 12 // num count private -#define SYNCLIENT_MODE_CREATESHARED 13 // num -#define SYNCLIENT_MODE_OPENSHARED 14 // num count - -#define SYNCLIENT_MODE_WRITEFILE 20 -#define SYNCLIENT_MODE_READFILE 21 -#define SYNCLIENT_MODE_WRITEBATCH 22 -#define SYNCLIENT_MODE_WRSHARED 23 - -#define SYNCLIENT_MODE_TRACE 30 - -#define SYNCLIENT_MODE_OPENTEST 40 -#define SYNCLIENT_MODE_OPTEST 41 - -#define SYNCLIENT_MODE_ONLY 50 -#define SYNCLIENT_MODE_UNTIL 51 -#define SYNCLIENT_MODE_SLEEPUNTIL 52 - -#define SYNCLIENT_MODE_RANDOMSLEEP 61 -#define SYNCLIENT_MODE_SLEEP 62 - - - - -void parse_syn_options(vector& args); - -class SyntheticClient { - Client *client; - - pthread_t thread_id; - - Distribution op_dist; - - void init_op_dist(); - int get_op(); - - - filepath cwd; - map contents; - set subdirs; - bool did_readdir; - set open_files; - - void up(); - - void clear_dir() { - contents.clear(); - subdirs.clear(); - did_readdir = false; - } - - int get_random_fh() { - int r = rand() % open_files.size(); - set::iterator it = open_files.begin(); - while (r--) it++; - return *it; - } - - - filepath n1; - const char *get_random_subdir() { - assert(!subdirs.empty()); - int r = ((rand() % subdirs.size()) + (rand() % subdirs.size())) / 2; // non-uniform distn - set::iterator it = subdirs.begin(); - while (r--) it++; - - n1 = cwd; - n1.add_dentry( *it ); - return n1.get_path().c_str(); - } - filepath n2; - const char 
*get_random_sub() { - assert(!contents.empty()); - int r = ((rand() % contents.size()) + (rand() % contents.size())) / 2; // non-uniform distn - if (cwd.depth() && cwd.last_bit().length()) - r += cwd.last_bit().c_str()[0]; // slightly permuted - r %= contents.size(); - - map::iterator it = contents.begin(); - while (r--) it++; - - n2 = cwd; - n2.add_dentry( it->first ); - return n2.get_path().c_str(); - } - - filepath sub; - char sub_s[50]; - const char *make_sub(char *base) { - sprintf(sub_s, "%s.%d", base, rand() % 100); - string f = sub_s; - sub = cwd; - sub.add_dentry(f); - return sub.c_str(); - } - - public: - SyntheticClient(Client *client); - - int start_thread(); - int join_thread(); - - int run(); - - bool run_me() { - if (run_only >= 0) { - if (run_only == client->get_nodeid()) { - run_only = -1; - return true; - } - run_only = -1; - return false; - } - return true; - } - - // run() will do one of these things: - list modes; - list sargs; - list iargs; - utime_t run_start; - utime_t run_until; - - int run_only; - - string get_sarg(int seq); - - bool time_to_stop() { - utime_t now = g_clock.now(); - if (0) cout << "time_to_stop .. 
now " << now - << " until " << run_until - << " start " << run_start - << endl; - if (run_until.sec() && now > run_until) - return true; - else - return false; - } - - string compose_path(string& prefix, char *rest) { - return prefix + rest; - } - - int full_walk(string& fromdir); - int random_walk(int n); - - int make_dirs(const char *basedir, int dirs, int files, int depth); - int stat_dirs(const char *basedir, int dirs, int files, int depth); - int read_dirs(const char *basedir, int dirs, int files, int depth); - int make_files(int num, int count, int priv, bool more); - - int create_shared(int num); - int open_shared(int num, int count); - - int write_file(string& fn, int mb, int chunk); - int write_batch(int nfile, int mb, int chunk); - int read_file(string& fn, int mb, int chunk); - - int clean_dir(string& basedir); - - int play_trace(Trace& t, string& prefix); - - void make_dir_mess(const char *basedir, int n); - -}; - -#endif diff --git a/branches/marnberg/quota/client/Trace.cc b/branches/marnberg/quota/client/Trace.cc deleted file mode 100644 index 43459653011a1..0000000000000 --- a/branches/marnberg/quota/client/Trace.cc +++ /dev/null @@ -1,125 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "Trace.h" - -#include -#include -#include -#include -using namespace __gnu_cxx; - -#include "common/Mutex.h" - -#include "config.h" - -#include -#include -#include - - -Mutex trace_lock; - -class TokenList { -public: - string filename; - char *data; - int len; - list tokens; - - int ref; - - TokenList() : data(0), ref(0) {} - ~TokenList() { - delete[] data; - } -}; - -map traces; - - -// -Trace::Trace(const char* f) -{ - string filename = f; - - trace_lock.Lock(); - - if (traces.count(filename)) - tl = traces[filename]; - else { - tl = new TokenList; - tl->filename = filename; - - // open file - crope cr; - int fd = open(filename.c_str(), O_RDONLY); - assert(fd > 0); - char buf[100]; - while (1) { - int r = read(fd, buf, 100); - if (r == 0) break; - assert(r > 0); - cr.append(buf, r); - } - close(fd); - - // copy - tl->len = cr.length()+1; - tl->data = new char[tl->len]; - memcpy(tl->data, cr.c_str(), cr.length()); - tl->data[tl->len-1] = '\n'; - - // index! - int o = 0; - while (o < tl->len) { - char *n = tl->data + o; - - // find newline - while (tl->data[o] != '\n') o++; - assert(tl->data[o] == '\n'); - tl->data[o] = 0; - - if (tl->data + o > n) tl->tokens.push_back(n); - o++; - } - - dout(1) << "trace " << filename << " loaded with " << tl->tokens.size() << " tokens" << endl; - traces[filename] = tl; - } - - tl->ref++; - - trace_lock.Unlock(); -} - -Trace::~Trace() -{ - trace_lock.Lock(); - - tl->ref--; - if (tl->ref == 0) { - traces.erase(tl->filename); - delete tl; - } - - trace_lock.Unlock(); -} - - -list& Trace::get_list() -{ - return tl->tokens; -} diff --git a/branches/marnberg/quota/client/Trace.h b/branches/marnberg/quota/client/Trace.h deleted file mode 100644 index 08b1fa8ff2722..0000000000000 --- a/branches/marnberg/quota/client/Trace.h +++ /dev/null @@ -1,75 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil 
- * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __CLIENT_TRACE_H -#define __CLIENT_TRACE_H - -#include -#include -#include -using namespace std; - -/* - - this class is more like an iterator over a constant tokenlist (which - is protected by a mutex, see Trace.cc) - - */ - -class Trace { - class TokenList *tl; - - public: - Trace(const char* filename); - ~Trace(); - - list& get_list(); - - list::iterator _cur; - list::iterator _end; - - void start() { - _cur = get_list().begin(); - _end = get_list().end(); - ns = 0; - } - - char strings[10][200]; - int ns; - const char *get_string(const char *prefix = 0) { - assert(_cur != _end); - const char *s = *_cur; - _cur++; - if (prefix) { - if (strstr(s, "/prefix") == s || - strstr(s, "/prefix") == s+1) { - strcpy(strings[ns], prefix); - strcpy(strings[ns] + strlen(prefix), - s + strlen("/prefix")); - s = (const char*)strings[ns]; - ns++; - if (ns == 10) ns = 0; - } - } - return s; - } - __int64_t get_int() { - return atoll(get_string()); - } - bool end() { - return _cur == _end; - } -}; - -#endif diff --git a/branches/marnberg/quota/client/fuse.cc b/branches/marnberg/quota/client/fuse.cc deleted file mode 100644 index f4a1c2d3f7797..0000000000000 --- a/branches/marnberg/quota/client/fuse.cc +++ /dev/null @@ -1,281 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -/* - FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi - - This program can be distributed under the terms of the GNU GPL. - See the file COPYING. -*/ - - -// fuse crap -#ifdef linux -/* For pread()/pwrite() */ -#define _XOPEN_SOURCE 500 -#endif - -#define FUSE_USE_VERSION 25 - -#include -#include -#include -#include -#include -#include -#include -#include - - -// ceph stuff -#include "include/types.h" - -#include "Client.h" - -#include "config.h" - -// stl -#include -using namespace std; - - -// globals -Client *client; // the ceph client - - - -// ------ -// fuse hooks - -static int ceph_getattr(const char *path, struct stat *stbuf) -{ - return client->lstat(path, stbuf); -} - -static int ceph_readlink(const char *path, char *buf, size_t size) -{ - int res; - - res = client->readlink(path, buf, size - 1); - if (res < 0) return res; - - buf[res] = '\0'; - return 0; -} - - -static int ceph_getdir(const char *path, fuse_dirh_t h, fuse_dirfil_t filler) -{ - map contents; - - int res = client->getdir(path, contents); - if (res < 0) return res; - - // return contents to fuse via callback - for (map::iterator it = contents.begin(); - it != contents.end(); - it++) { - // (immutable) inode contents too. - res = filler(h, // fuse's handle - it->first.c_str(), // dentry as char* - it->second.mode & INODE_TYPE_MASK, // mask type bits from mode - it->second.ino); // ino.. 64->32 bit issue here? 
FIXME - if (res != 0) break; // fuse has had enough - } - return res; -} - -static int ceph_mknod(const char *path, mode_t mode, dev_t rdev) -{ - return client->mknod(path, mode); -} - -static int ceph_mkdir(const char *path, mode_t mode) -{ - return client->mkdir(path, mode); -} - -static int ceph_unlink(const char *path) -{ - return client->unlink(path); -} - -static int ceph_rmdir(const char *path) -{ - return client->rmdir(path); -} - -static int ceph_symlink(const char *from, const char *to) -{ - return client->symlink(from, to); -} - -static int ceph_rename(const char *from, const char *to) -{ - return client->rename(from, to); -} - -static int ceph_link(const char *from, const char *to) -{ - return client->link(from, to); -} - -static int ceph_chmod(const char *path, mode_t mode) -{ - return client->chmod(path, mode); -} - -static int ceph_chown(const char *path, uid_t uid, gid_t gid) -{ - return client->chown(path, uid, gid); -} - -static int ceph_truncate(const char *path, off_t size) -{ - return client->truncate(path, size); -} - -static int ceph_utime(const char *path, struct utimbuf *buf) -{ - return client->utime(path, buf); -} - - -static int ceph_open(const char *path, struct fuse_file_info *fi) -{ - int res; - - res = client->open(path, fi->flags); - if (res < 0) return res; - fi->fh = res; - return 0; // fuse wants 0 onsucess -} - -static int ceph_read(const char *path, char *buf, size_t size, off_t offset, - struct fuse_file_info *fi) -{ - fh_t fh = fi->fh; - return client->read(fh, buf, size, offset); -} - -static int ceph_write(const char *path, const char *buf, size_t size, - off_t offset, struct fuse_file_info *fi) -{ - fh_t fh = fi->fh; - return client->write(fh, buf, size, offset); -} - -/* -static int ceph_flush(const char *path, struct fuse_file_info *fi) -{ - fh_t fh = fi->fh; - return client->flush(fh); -} -*/ - - -static int ceph_statfs(const char *path, struct statvfs *stbuf) -{ - return client->statfs(path, stbuf); -} - - - -static 
int ceph_release(const char *path, struct fuse_file_info *fi) -{ - fh_t fh = fi->fh; - int r = client->close(fh); // close the file - return r; -} - -static int ceph_fsync(const char *path, int isdatasync, - struct fuse_file_info *fi) -{ - fh_t fh = fi->fh; - return client->fsync(fh, isdatasync ? true:false); -} - - -static struct fuse_operations ceph_oper = { - getattr: ceph_getattr, - readlink: ceph_readlink, - getdir: ceph_getdir, - mknod: ceph_mknod, - mkdir: ceph_mkdir, - unlink: ceph_unlink, - rmdir: ceph_rmdir, - symlink: ceph_symlink, - rename: ceph_rename, - link: ceph_link, - chmod: ceph_chmod, - chown: ceph_chown, - truncate: ceph_truncate, - utime: ceph_utime, - open: ceph_open, - read: ceph_read, - write: ceph_write, - statfs: ceph_statfs, - flush: 0, //ceph_flush, - release: ceph_release, - fsync: ceph_fsync -}; - - -int ceph_fuse_main(Client *c, int argc, char *argv[]) -{ - // init client - client = c; - - // set up fuse argc/argv - int newargc = 0; - char **newargv = (char **) malloc((argc + 10) * sizeof(char *)); - newargv[newargc++] = argv[0]; - - // allow other (all!) users to see my file system - // NOTE: echo user_allow_other >> /etc/fuse.conf - // NB: seems broken on Darwin -#ifndef DARWIN - newargv[newargc++] = "-o"; - newargv[newargc++] = "allow_other"; -#endif // DARWIN - - // use inos - newargv[newargc++] = "-o"; - newargv[newargc++] = "use_ino"; - - // large reads, direct_io (no kernel cachine) - //newargv[newargc++] = "-o"; - //newargv[newargc++] = "large_read"; - if (g_conf.fuse_direct_io) { - newargv[newargc++] = "-o"; - newargv[newargc++] = "direct_io"; - } - - // disable stupid fuse unlink hiding thing - newargv[newargc++] = "-o"; - newargv[newargc++] = "hard_remove"; - - // force into foreground - // -> we can watch stdout this way!! - newargv[newargc++] = "-f"; - - // copy rest of cmdline (hopefully, the mount point!) 
- for (int argctr = 1; argctr < argc; argctr++) newargv[newargc++] = argv[argctr]; - - // go fuse go - cout << "ok, calling fuse_main" << endl; - int r = fuse_main(newargc, newargv, &ceph_oper); - return r; -} diff --git a/branches/marnberg/quota/client/fuse.h b/branches/marnberg/quota/client/fuse.h deleted file mode 100644 index d0b8dcb1154f5..0000000000000 --- a/branches/marnberg/quota/client/fuse.h +++ /dev/null @@ -1,23 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -/* ceph_fuse_main - * - start up fuse glue, attached to Client* cl. - * - argc, argv should include a mount point, and - * any weird fuse options you want. by default, - * we will put fuse in the foreground so that it - * won't fork and we can see stdout. - */ -int ceph_fuse_main(Client *cl, int argc, char *argv[]); diff --git a/branches/marnberg/quota/client/hadoop/CephClientInterface.cc b/branches/marnberg/quota/client/hadoop/CephClientInterface.cc deleted file mode 100644 index 6466dd6300891..0000000000000 --- a/branches/marnberg/quota/client/hadoop/CephClientInterface.cc +++ /dev/null @@ -1,217 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -//#include - - -using namespace std; - -// globals -//Client *client; // the ceph client -//this has to go - the real client will have to hold the pointer. 
-//Every function will need to take a Client pointer. - -// ------ -// fuse hooks - -static int ceph_getattr(Client* client, const char *path, struct stat *stbuf) -{ - return client->lstat(path, stbuf); -} - -static int ceph_readlink(Client* client, const char *path, char *buf, size_t size) -{ - int res; - - res = client->readlink(path, buf, size - 1); - if (res < 0) return res; - - buf[res] = '\0'; - return 0; -} - -// get rid of the callback thing, perhaps? and return the answer some other way? -/* -static int ceph_getdir(Client* client, const char *path, fuse_dirh_t h, fuse_dirfil_t filler) -{ - map contents; - - int res = client->getdir(path, contents); - if (res < 0) return res; - - // return contents to fuse via callback - for (map::iterator it = contents.begin(); - it != contents.end(); - it++) { - // (immutable) inode contents too. - res = filler(h, // fuse's handle - it->first.c_str(), // dentry as char* - it->second.mode & INODE_TYPE_MASK, // mask type bits from mode - it->second.ino); // ino.. 64->32 bit issue here? 
FIXME - if (res != 0) break; // fuse has had enough - } - return res; -} -*/ - -static int ceph_mknod(Client* client, const char *path, mode_t mode, dev_t rdev) -{ - return client->mknod(path, mode); -} - -static int ceph_mkdir(Client* client, const char *path, mode_t mode) -{ - return client->mkdir(path, mode); -} - -static int ceph_unlink(Client* client, const char *path) -{ - return client->unlink(path); -} - -static int ceph_rmdir(Client* client, const char *path) -{ - return client->rmdir(path); -} - -static int ceph_symlink(Client* client, const char *from, const char *to) -{ - return client->symlink(from, to); -} - - -static int ceph_rename(Client* client, const char *from, const char *to) -{ - return client->rename(from, to); -} - -static int ceph_link(Client* client, const char *from, const char *to) -{ - return client->link(from, to); -} - -static int ceph_chmod(Client* client, const char *path, mode_t mode) -{ - return client->chmod(path, mode); -} - -static int ceph_chown(Client* client, const char *path, uid_t uid, gid_t gid) -{ - return client->chown(path, uid, gid); -} - -static int ceph_truncate(Client* client, const char *path, off_t size) -{ - return client->truncate(path, size); -} - -static int ceph_utime(Client* client, const char *path, struct utimbuf *buf) -{ - return client->utime(path, buf); -} - - -static int ceph_open(Client* client, const char *path, struct fuse_file_info *fi) -{ - int res; - - res = client->open(path, fi->flags); - if (res < 0) return res; - fi->fh = res; - return 0; // fuse wants 0 onsucess -} - -static int ceph_read(Client* client, const char *path, char *buf, size_t size, off_t offset, - struct fuse_file_info *fi) -{ - fh_t fh = fi->fh; - return client->read(fh, buf, size, offset); -} - -static int ceph_write(Client* client, const char *path, const char *buf, size_t size, - off_t offset, struct fuse_file_info *fi) -{ - fh_t fh = fi->fh; - return client->write(fh, buf, size, offset); -} - -/* -static int 
ceph_flush(const char *path, struct fuse_file_info *fi) -{ - fh_t fh = fi->fh; - return client->flush(fh); -} -*/ - - -#ifdef DARWIN -static int ceph_statfs(Client* client, const char *path, struct statvfs *stbuf) -{ - return client->statfs(path, stbuf); -} -#else -static int ceph_statfs(Client* client, const char *path, struct statfs *stbuf) -{ - return client->statfs(path, stbuf); -} -#endif - - -/* remove fuse stuff from these two -static int ceph_release(Client* client, const char *path, struct fuse_file_info *fi) -{ - fh_t fh = fi->fh; - int r = client->close(fh); // close the file - return r; -} - -static int ceph_fsync(Client* client, const char *path, int isdatasync, - struct fuse_file_info *fi) -{ - fh_t fh = fi->fh; - return client->fsync(fh, isdatasync ? true:false); -} -*/ - -/* -static struct fuse_operations ceph_oper = { - getattr: ceph_getattr, - readlink: ceph_readlink, - getdir: ceph_getdir, - mknod: ceph_mknod, - mkdir: ceph_mkdir, - unlink: ceph_unlink, - rmdir: ceph_rmdir, - symlink: ceph_symlink, - rename: ceph_rename, - link: ceph_link, - chmod: ceph_chmod, - chown: ceph_chown, - truncate: ceph_truncate, - utime: ceph_utime, - open: ceph_open, - read: ceph_read, - write: ceph_write, - statfs: ceph_statfs, - flush: 0, //ceph_flush, - release: ceph_release, - fsync: ceph_fsync -}; - -*/ - - -// Does this do anything we need? No. All it does is assemble a bunch of -// arguments and call fuse_main. 
- diff --git a/branches/marnberg/quota/client/hadoop/CephClientInterface.h b/branches/marnberg/quota/client/hadoop/CephClientInterface.h deleted file mode 100644 index e0b37c305029e..0000000000000 --- a/branches/marnberg/quota/client/hadoop/CephClientInterface.h +++ /dev/null @@ -1,115 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include -#include -#include -#include -#include -#include -#ifdef DARWIN -#include -#else -#include -#endif // DARWIN - -// ceph stuff -#include "include/types.h" - -#include "Client.h" - -#include "config.h" - -// stl -#include - - - - - - -// stbuf holds the attributes -static int ceph_getattr(Client* client, const char *path, struct stat *stbuf); - -// reads a symlink -static int ceph_readlink(Client* client, const char *path, char *buf, size_t size); - -// to do: remove fuse stuff from this one -//static int ceph_getdir(Client* client, const char *path, fuse_dirh_t h, fuse_dirfil_t filler); - -// looks irrelevant - it's for special device files -static int ceph_mknod(Client* client, const char *path, mode_t mode, dev_t rdev); - -// mode is the file permission bits -static int ceph_mkdir(Client* client, const char *path, mode_t mode); - -// delete! -static int ceph_unlink(Client* client, const char *path); - -// delete! 
if it's an empty directory -static int ceph_rmdir(Client* client, const char *path); - -// make a symlink -static int ceph_symlink(Client* client, const char *from, const char *to); - -// self-explanatory -static int ceph_rename(Client* client, const char *from, const char *to); - -static int ceph_link(Client* client, const char *from, const char *to); //hard link - -static int ceph_chmod(Client* client, const char *path, mode_t mode); //just chmod - -static int ceph_chown(Client* client, const char *path, uid_t uid, gid_t gid); //duh - -static int ceph_truncate(Client* client, const char *path, off_t size); //chop or zero-pad to size - -// set file access/modification times -static int ceph_utime(Client* client, const char *path, struct utimbuf *buf); - -// ok, gotta figure out what's in fuse_file_info and how to use it. Presumably it includes -// a file descriptor and the open flags? -static int ceph_open(Client* client, const char *path, struct fuse_file_info *fi); - -// read! -static int ceph_read(Client* client, const char *path, char *buf, size_t size, off_t offset, - struct fuse_file_info *fi); - -// write! -static int ceph_write(Client* client, const char *path, const char *buf, size_t size, - off_t offset, struct fuse_file_info *fi); - -/* was already commented out -static int ceph_flush(const char *path, struct fuse_file_info *fi); -*/ - - -// is this statvfs perhaps? we probably don't need it -#ifdef DARWIN -static int ceph_statfs(Client* client, const char *path, struct statvfs *stbuf); -#else -static int ceph_statfs(Client* client, const char *path, struct statfs *stbuf); -#endif - -// Remove fuse stuff from these two -//static int ceph_release(Client* client, const char *path, struct fuse_file_info *fi); - -//static int ceph_fsync(Client* client, const char *path, int isdatasync, struct fuse_file_info *fi); //kinda like flush? - -/* ceph_fuse_main - * - start up fuse glue, attached to Client* cl. 
- * - argc, argv should include a mount point, and - * any weird fuse options you want. by default, - * we will put fuse in the foreground so that it - * won't fork and we can see stdout. - */ -// int ceph_fuse_main(Client *cl, int argc, char *argv[]); diff --git a/branches/marnberg/quota/client/ldceph.cc b/branches/marnberg/quota/client/ldceph.cc deleted file mode 100644 index 9706fd49cad99..0000000000000 --- a/branches/marnberg/quota/client/ldceph.cc +++ /dev/null @@ -1,297 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include -using namespace std; - -// ceph stuff -#include "config.h" -#include "client/Client.h" -#include "msg/TCPMessenger.h" - -// syscall fun -#include -#include -#include -//#include - -#define _FCNTL_H -#include - -#define CEPH_FD_OFF 50000 - - -/****** startup etc *******/ - -class LdCeph { -public: - // globals - bool started; - char *mount_point; - char *mount_point_parent; - int mount_point_len; - - Client *client; - - filepath fp_mount_point; - filepath cwd; - bool cwd_above_mp, cwd_in_mp; - - const char *get_ceph_path(const char *orig, char *buf) { - if (!started) return 0; - - // relative path? BUG: this won't catch "blah/../../asdf" - if (orig[0] && - orig[0] != '/' && - !(orig[0] == '.' && orig[1] == '.')) { - - if (cwd_in_mp) return orig; // inside mount point, definitely ceph - if (!cwd_above_mp) return 0; // not above mount point, definitely not ceph - - // relative, above mp. 
- filepath o = orig; - filepath p = cwd; - for (unsigned b = 0; b < o.depth(); b++) { - if (o[b] == "..") - p.pop_dentry(); - else - p.add_dentry(o[b]); - } - - // FIXME rewrite - if (strncmp(p.c_str(), mount_point, mount_point_len) == 0) { - if (p.c_str()[mount_point_len] == 0) - return "/"; - if (p.c_str()[mount_point_len] == '/') { - strcpy(buf, p.c_str() + mount_point_len); - return buf; - } - } - return 0; - } else { - // absolute - if (strncmp(orig, mount_point, mount_point_len) == 0) { - if (orig[mount_point_len] == 0) - return "/"; - if (orig[mount_point_len] == '/') - return orig + mount_point_len; - } - return 0; - } - } - - void refresh_cwd() { - char buf[255]; - syscall(SYS_getcwd, buf, 255); - cwd = buf; - - if (strncmp(buf, mount_point, mount_point_len) == 0 && - (buf[mount_point_len] == 0 || - buf[mount_point_len] == '/')) - cwd_in_mp = true; - else { - if (cwd.depth() > fp_mount_point.depth()) - cwd_above_mp = false; - else { - cwd_above_mp = true; - for (unsigned i=0; iget_myaddr() << endl; - - refresh_cwd(); - } - } - ~LdCeph() { - cout << "ldceph fini" << endl; - if (false && client) { - client->unmount(); - client->shutdown(); - delete client; - client = 0; - tcpmessenger_wait(); - tcpmessenger_shutdown(); - } - } - -} ldceph; - - - -/****** original functions ****/ - - - -/****** captured functions ****/ - - -#define MYFD(f) ((fd) > CEPH_FD_OFF && ldceph.started) -#define TO_FD(fd) (fd > 0 ? 
fd+CEPH_FD_OFF:fd) -#define FROM_FD(fd) (fd - CEPH_FD_OFF) - -extern "C" { - - // open/close - //int open(const char *pathname, int flags) { - int open(const char *pathname, int flags, mode_t mode) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return TO_FD(ldceph.client->open(c, flags)); - else - return syscall(SYS_open, pathname, flags, mode); - } - - int creat(const char *pathname, mode_t mode) { - return open(pathname, O_CREAT|O_WRONLY|O_TRUNC, mode); - } - int close(int fd) { - if (MYFD(fd)) - return ldceph.client->close(FROM_FD(fd)); - else - return syscall(SYS_close, fd); - } - - - // read/write - ssize_t write(int fd, const void *buf, size_t count) { - if (MYFD(fd)) - return ldceph.client->write(FROM_FD(fd), (char*)buf, count); - else - return syscall(SYS_write, fd, buf, count); - } - - ssize_t read(int fd, void *buf, size_t count) { - if (MYFD(fd)) - return ldceph.client->read(FROM_FD(fd), (char*)buf, count); - else - return syscall(SYS_read, fd, buf, count); - } - - //int fsync(int fd); - //int fdatasync(int fd); - - - // namespace - int rmdir(const char *pathname) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return ldceph.client->rmdir(c); - else - return syscall(SYS_rmdir, pathname); - } - int mkdir(const char *pathname, mode_t mode) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return ldceph.client->mkdir(c, mode); - else - return syscall(SYS_mkdir, pathname, mode); - } - int unlink(const char *pathname) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return ldceph.client->unlink(c); - else - return syscall(SYS_unlink, pathname); - } - - int stat(const char *pathname, struct stat *st) { - //int __xstat64(int __ver, const char *pathname, struct stat64 *st64) { // stoopid GLIBC - //struct stat *st = (struct stat*)st64; - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return 
ldceph.client->lstat(c, st); // FIXME - else - return syscall(SYS_stat, pathname, st); - } - //int fstat(int filedes, struct stat *buf); - //int lstat(const char *file_name, struct stat *buf); - - int chdir(const char *pathname) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) { - int r = ldceph.client->chdir(c); - if (r == 0) { - if (!ldceph.cwd_in_mp) - syscall(SYS_chdir, ldceph.mount_point_parent); - ldceph.cwd_in_mp = true; - ldceph.cwd_above_mp = false; - ldceph.cwd = ldceph.mount_point; - filepath fpc = c; - ldceph.cwd.append(fpc); - } - return r; - } else { - int r = syscall(SYS_chdir, pathname); - if (r) { - ldceph.refresh_cwd(); - } - return r; - } - } - char *getcwd(char *buf, size_t size) { - strncpy(buf, ldceph.cwd.c_str(), size); - return buf; - } - //int fchdir(int fd); - - - - -} diff --git a/branches/marnberg/quota/client/msgthread.h b/branches/marnberg/quota/client/msgthread.h deleted file mode 100644 index 69d10be9f6a56..0000000000000 --- a/branches/marnberg/quota/client/msgthread.h +++ /dev/null @@ -1,25 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "msg/Message.h" - -// send the message, expecting no response. threads other than the -// MPI thread use this function; if the MPI thread uses this function -// it could deadlock: this function could wait for the out queue to be -// emptied, but only the MPI thread can empty it. -void obfsmpi_send(Message *m) - -// send the message to a server and wait for the response. threads -// other than the MPI thread use this function. 
-Message *obfsmpi_sendrecv(Message *m) diff --git a/branches/marnberg/quota/cmds.cc b/branches/marnberg/quota/cmds.cc deleted file mode 100644 index 8faf6a5bc6049..0000000000000 --- a/branches/marnberg/quota/cmds.cc +++ /dev/null @@ -1,101 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include -#include -#include - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" -#include "mds/MDS.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << endl; - exit(1); - } -}; - -class C_Debug : public Context { - public: - void finish(int) { - int size = &g_conf.debug_after - &g_conf.debug; - memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size); - dout(0) << "debug_after flipping debug settings" << endl; - } -}; - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - parse_config_options(args); - - if (g_conf.kill_after) - g_timer.add_event_after(g_conf.kill_after, new C_Die); - if (g_conf.debug_after) - g_timer.add_event_after(g_conf.debug_after, new C_Debug); - - // mds specific args - int whoami = -1; - bool standby = false; // by default, i'll start active. 
- for (unsigned i=0; i= 0); - - // start up network - rank.start_rank(); - - // start mds - Messenger *m = rank.register_entity(MSG_ADDR_MDS(whoami)); - assert(m); - - MDS *mds = new MDS(whoami, m, &monmap); - mds->init(standby); - - // wait - rank.wait(); - - // done - delete mds; - - return 0; -} - diff --git a/branches/marnberg/quota/cmon.cc b/branches/marnberg/quota/cmon.cc deleted file mode 100644 index 442b584f02abd..0000000000000 --- a/branches/marnberg/quota/cmon.cc +++ /dev/null @@ -1,126 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include -#include -#include - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" -#include "mon/Monitor.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << endl; - exit(1); - } -}; - -class C_Debug : public Context { - public: - void finish(int) { - int size = &g_conf.debug_after - &g_conf.debug; - memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size); - dout(0) << "debug_after flipping debug settings" << endl; - } -}; - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - parse_config_options(args); - - if (g_conf.kill_after) - g_timer.add_event_after(g_conf.kill_after, new C_Die); - if (g_conf.debug_after) - g_timer.add_event_after(g_conf.debug_after, new C_Debug); - - // args - int whoami = -1; - char *monmap_fn = ".ceph_monmap"; - for (unsigned i=0; i= 0); - } else { - // i am specific monitor. 
- - // read monmap - cout << "reading monmap from .ceph_monmap" << endl; - int r = monmap.read(monmap_fn); - assert(r >= 0); - - // bind to a specific port - cout << "starting mon" << whoami << " at " << monmap.get_inst(whoami) << endl; - g_my_addr = monmap.get_inst(whoami).addr; - rank.start_rank(); - } - - // start monitor - Messenger *m = rank.register_entity(MSG_ADDR_MON(whoami)); - Monitor *mon = new Monitor(whoami, m, &monmap); - mon->init(); - - // wait - cout << "waiting for shutdown ..." << endl; - rank.wait(); - - // done - delete mon; - - return 0; -} - diff --git a/branches/marnberg/quota/common/Clock.cc b/branches/marnberg/quota/common/Clock.cc deleted file mode 100644 index c970a337826b6..0000000000000 --- a/branches/marnberg/quota/common/Clock.cc +++ /dev/null @@ -1,19 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "Clock.h" - -// public -Clock g_clock; - diff --git a/branches/marnberg/quota/common/Clock.h b/branches/marnberg/quota/common/Clock.h deleted file mode 100644 index 92a2b2bddf6d0..0000000000000 --- a/branches/marnberg/quota/common/Clock.h +++ /dev/null @@ -1,203 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __CLOCK_H -#define __CLOCK_H - -#include -#include - -#include -#include -#include - -#include "Mutex.h" - - -// -------- -// utime_t - -class utime_t { - private: - struct timeval tv; - - struct timeval& timeval() { return tv; } - friend class Clock; - - - public: - void normalize() { - if (tv.tv_usec > 1000*1000) { - tv.tv_sec += tv.tv_usec / (1000*1000); - tv.tv_usec %= 1000*1000; - } - } - - // cons - utime_t() { tv.tv_sec = 0; tv.tv_usec = 0; normalize(); } - utime_t(time_t s, int u) { tv.tv_sec = s; tv.tv_usec = u; normalize(); } - - // accessors - time_t sec() const { return tv.tv_sec; } - long usec() const { return tv.tv_usec; } - int nsec() const { return tv.tv_usec*1000; } - - // ref accessors/modifiers - time_t& sec_ref() { return tv.tv_sec; } - // FIXME: tv.tv_usec is a __darwin_suseconds_t on Darwin. - // is just casting it to long& OK? - long& usec_ref() { return (long&) tv.tv_usec; } - - // cast to double - operator double() { - return (double)sec() + ((double)usec() / 1000000.0L); - } -}; - -// arithmetic operators -inline utime_t operator+(const utime_t& l, const utime_t& r) { - return utime_t( l.sec() + r.sec() + (l.usec()+r.usec())/1000000L, - (l.usec()+r.usec())%1000000L ); -} -inline utime_t& operator+=(utime_t& l, const utime_t& r) { - l.sec_ref() += r.sec() + (l.usec()+r.usec())/1000000L; - l.usec_ref() += r.usec(); - l.usec_ref() %= 1000000L; - return l; -} -inline utime_t& operator+=(utime_t& l, double f) { - double fs = trunc(f); - double us = (f - fs) / (double)1000000.0; - l.sec_ref() += (long)fs; - l.usec_ref() += (long)us; - l.normalize(); - return l; -} - -inline utime_t operator-(const utime_t& l, const utime_t& r) { - return utime_t( l.sec() - r.sec() - (l.usec()= r.usec()) - l.usec_ref() -= r.usec(); - else { - l.usec_ref() += 1000000L - r.usec(); - l.sec_ref()--; - } - return l; -} -inline utime_t& operator-=(utime_t& l, double f) { - l += -f; - return l; -} - -inline bool operator>(const utime_t& a, const 
utime_t& b) -{ - return (a.sec() > b.sec()) || (a.sec() == b.sec() && a.usec() > b.usec()); -} -inline bool operator<(const utime_t& a, const utime_t& b) -{ - return (a.sec() < b.sec()) || (a.sec() == b.sec() && a.usec() < b.usec()); -} - -// ostream -inline std::ostream& operator<<(std::ostream& out, const utime_t& t) -{ - //return out << t.sec() << "." << t.usec(); - out << (long)t.sec() << "."; - out.setf(std::ios::right); - out.fill('0'); - out << std::setw(6) << t.usec(); - out.unsetf(std::ios::right); - return out; - - //return out << (long)t.sec << "." << ios::setf(ios::right) << ios::fill('0') << t.usec() << ios::usetf(); -} - - - - -// -- clock -- -class Clock { - protected: - //utime_t start_offset; - //utime_t abs_last; - utime_t last; - utime_t zero; - - Mutex lock; - - public: - Clock() { - // set offset - tare(); - } - - // real time. - utime_t real_now() { - utime_t realnow = now(); - realnow += zero; - //gettimeofday(&realnow.timeval(), NULL); - return realnow; - } - - // relative time (from startup) - void tare() { - gettimeofday(&zero.timeval(), NULL); - } - utime_t now() { - //lock.Lock(); - utime_t n; - gettimeofday(&n.timeval(), NULL); - n -= zero; - if (n < last) { - //std::cerr << "WARNING: clock jumped backwards from " << last << " to " << n << std::endl; - n = last; // clock jumped backwards! 
- } else - last = n; - //lock.Unlock(); - return n; - } - utime_t recent_now() { - return last; - } - - void realify(utime_t& t) { - t += zero; - } - - void make_timespec(utime_t& t, struct timespec *ts) { - utime_t real = t; - realify(real); - - memset(ts, 0, sizeof(*ts)); - ts->tv_sec = real.sec(); - ts->tv_nsec = real.nsec(); - } - - - - // absolute time - time_t gettime() { - return real_now().sec(); - } - -}; - -extern Clock g_clock; - -#endif diff --git a/branches/marnberg/quota/common/Cond.h b/branches/marnberg/quota/common/Cond.h deleted file mode 100644 index ed465ce3762d6..0000000000000 --- a/branches/marnberg/quota/common/Cond.h +++ /dev/null @@ -1,118 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __COND_H -#define __COND_H - -#include - -#include "Mutex.h" -#include "Clock.h" - -#include "include/Context.h" - -#include -#include - -class Cond { - // my bits - pthread_cond_t _c; - - // don't allow copying. 
- void operator=(Cond &C) {} - Cond( const Cond &C ) {} - - public: - Cond() { - int r = pthread_cond_init(&_c,NULL); - assert(r == 0); - } - virtual ~Cond() { - pthread_cond_destroy(&_c); - } - - int Wait(Mutex &mutex) { - int r = pthread_cond_wait(&_c, &mutex._m); - return r; - } - - int Wait(Mutex &mutex, char* s) { - //cout << "Wait: " << s << endl; - int r = pthread_cond_wait(&_c, &mutex._m); - return r; - } - - int WaitUntil(Mutex &mutex, utime_t when) { - struct timespec ts; - g_clock.make_timespec(when, &ts); - //cout << "timedwait for " << ts.tv_sec << " sec " << ts.tv_nsec << " nsec" << endl; - int r = pthread_cond_timedwait(&_c, &mutex._m, &ts); - return r; - } - int WaitInterval(Mutex &mutex, utime_t interval) { - utime_t when = g_clock.now(); - when += interval; - return WaitUntil(mutex, when); - } - - int Signal() { - //int r = pthread_cond_signal(&_c); - int r = pthread_cond_broadcast(&_c); - return r; - } - int SignalOne() { - int r = pthread_cond_signal(&_c); - return r; - } - int SignalAll() { - //int r = pthread_cond_signal(&_c); - int r = pthread_cond_broadcast(&_c); - return r; - } -}; - -class C_Cond : public Context { - Cond *cond; - bool *done; - int *rval; -public: - C_Cond(Cond *c, bool *d, int *r=0) : cond(c), done(d), rval(r) { - *done = false; - } - void finish(int r) { - if (rval) *rval = r; - *done = true; - cond->Signal(); - } -}; - -class C_SafeCond : public Context { - Mutex *lock; - Cond *cond; - bool *done; - int *rval; -public: - C_SafeCond(Mutex *l, Cond *c, bool *d, int *r=0) : lock(l), cond(c), done(d), rval(r) { - *done = false; - } - void finish(int r) { - lock->Lock(); - if (rval) *rval = r; - *done = true; - cond->Signal(); - lock->Unlock(); - } -}; - -#endif diff --git a/branches/marnberg/quota/common/DecayCounter.h b/branches/marnberg/quota/common/DecayCounter.h deleted file mode 100644 index b95ebea815b7c..0000000000000 --- a/branches/marnberg/quota/common/DecayCounter.h +++ /dev/null @@ -1,94 +0,0 @@ -// -*- mode:C++; 
tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __DECAYCOUNTER_H -#define __DECAYCOUNTER_H - -#include -#include "Clock.h" - -#include "config.h" - -class DecayCounter { - protected: - double val; // value - - double half_life; // in seconds - double k; // k = ln(.5)/half_life - - utime_t last_decay; // time of last decay - - public: - DecayCounter() : val(0) { - set_halflife( g_conf.mds_decay_halflife ); - reset(); - } - /* - DecayCounter(double hl) : val(0) { - set_halflife(hl); - reset(); - } - */ - - void adjust(double a) { - decay(); - val += a; - } - void adjust_down(const DecayCounter& other) { - // assume other has same time stamp as us... - val -= other.val; - } - - void set_halflife(double hl) { - half_life = hl; - k = log(.5) / hl; - } - - void take(DecayCounter& other) { - *this = other; - other.reset(); - } - - void reset() { - last_decay.sec_ref() = 0; - last_decay.usec_ref() = 0; - val = 0; - } - - void decay() { - utime_t el = g_clock.recent_now(); - el -= last_decay; - if (el.sec() >= 1) { - val = val * exp((double)el * k); - if (val < .01) val = 0; - last_decay = g_clock.recent_now(); - } - } - - double get() { - decay(); - return val; - } - - double hit(double v = 1.0) { - decay(); - val += v; - return val; - } - -}; - - -#endif diff --git a/branches/marnberg/quota/common/LogType.h b/branches/marnberg/quota/common/LogType.h deleted file mode 100644 index 3de17751ec2f8..0000000000000 --- a/branches/marnberg/quota/common/LogType.h +++ /dev/null @@ -1,119 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage 
Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __LOGTYPE_H -#define __LOGTYPE_H - -#include "include/types.h" - -#include -#include -using namespace std; -#include -#include -using namespace __gnu_cxx; - -#include "Mutex.h" - - -class LogType { - protected: - hash_map<__uint64_t, int> keymap; - vector keys; - set inc_keys; - - int version; - - // HACK to avoid the hash table as often as possible... - // cache recent key name lookups in a small ring buffer - const static int cache_keys = 10; - __uint64_t kc_ptr[cache_keys]; - int kc_val[cache_keys]; - int kc_pos; - - friend class Logger; - - public: - LogType() { - version = 1; - - for (int i=0;i= 0) return i; - - i = keys.size(); - keys.push_back(key); - -#ifdef __LP64__ - __uint64_t p = (__uint64_t)key; -#else - __uint64_t p = (__uint32_t)key; -#endif - keymap[p] = i; - if (is_inc) inc_keys.insert(i); - - version++; - return i; - } - int add_inc(const char* key) { - return add_key(key, true); - } - int add_set(const char *key) { - return add_key(key, false); - } - - bool have_key(const char* key) { - return lookup_key(key) < 0; - } - - int lookup_key(const char* key) { -#ifdef __LP64__ - __uint64_t p = (__uint64_t)key; -#else - __uint64_t p = (__uint32_t)key; -#endif - - if (keymap.count(p)) - return keymap[p]; - - // try kc ringbuffer - int pos = kc_pos-1; - for (int j=0; j - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include - -#include "LogType.h" -#include "Logger.h" - -#include -#include "Clock.h" - -#include "config.h" - -#include -#include - - -// per-process lock. lame, but this way I protect LogType too! 
-Mutex logger_lock; - -Logger::Logger(string fn, LogType *type) -{ - logger_lock.Lock(); - { - filename = ""; - if (g_conf.use_abspaths) { - char *cwd = get_current_dir_name(); - filename = cwd; - delete cwd; - filename += "/"; - } - - filename = "log/"; - if (g_conf.log_name) { - filename += g_conf.log_name; - ::mkdir( filename.c_str(), 0755 ); // make sure dir exists - filename += "/"; - } - filename += fn; - //cout << "log " << filename << endl; - interval = g_conf.log_interval; - - //start = g_clock.now(); // time 0! - last_logged = 0; - wrote_header = -1; - open = false; - this->type = type; - wrote_header_last = 0; - - version = 0; - } - logger_lock.Unlock(); - flush(false); -} - -Logger::~Logger() -{ - flush(true); - out.close(); -} - -long Logger::inc(const char *key, long v) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - if (i < 0) i = type->add_inc(key); - flush(); - vals[i] += v; - long r = vals[i]; - logger_lock.Unlock(); - return r; -} - -double Logger::finc(const char *key, double v) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - if (i < 0) i = type->add_inc(key); - flush(); - fvals[i] += v; - double r = fvals[i]; - logger_lock.Unlock(); - return r; -} - -long Logger::set(const char *key, long v) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - if (i < 0) i = type->add_set(key); - flush(); - long r = vals[i] = v; - logger_lock.Unlock(); - return r; -} - - -double Logger::fset(const char *key, double v) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - if (i < 0) i = type->add_set(key); - flush(); - double r = fvals[i] = v; - logger_lock.Unlock(); - return r; -} - -long Logger::get(const char* key) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - long r = 0; - if (i >= 0 && (int)vals.size() > i) - r = vals[i]; - logger_lock.Unlock(); - return r; -} - 
-void Logger::flush(bool force) -{ - if (!g_conf.log) return; - logger_lock.Lock(); - - if (version != type->version) { - while (type->keys.size() > vals.size()) - vals.push_back(0); - while (type->keys.size() > fvals.size()) - fvals.push_back(0); - version = type->version; - } - - if (!open) { - out.open(filename.c_str(), ofstream::out); - open = true; - //cout << "opening log file " << filename << endl; - } - - utime_t fromstart = g_clock.now(); - if (fromstart < start) { - cerr << "logger time jumped backwards from " << start << " to " << fromstart << endl; - assert(0); - start = fromstart; - } - fromstart -= start; - - while (force || - ((fromstart.sec() > last_logged) && - (fromstart.sec() - last_logged >= interval))) { - last_logged += interval; - force = false; - - //cout << "logger " << this << " advancing from " << last_logged << " now " << now << endl; - - if (!open) { - out.open(filename.c_str(), ofstream::out); - open = true; - //cout << "opening log file " << filename << endl; - } - - // header? 
- wrote_header_last++; - if (wrote_header != type->version || - wrote_header_last > 10) { - out << "#" << type->keymap.size(); - for (unsigned i=0; ikeys.size(); i++) - out << "\t" << type->keys[i]; - out << endl; //out << "\t (" << type->keymap.size() << ")" << endl; - wrote_header = type->version; - wrote_header_last = 0; - } - - // write line to log - out << last_logged; - for (unsigned i=0; ikeys.size(); i++) { - if (fvals[i] > 0 && vals[i] == 0) - out << "\t" << fvals[i]; - else - out << "\t" << vals[i]; - } - out << endl; - - // reset the counters - for (unsigned i=0; ikeys.size(); i++) { - if (type->inc_keys.count(i)) { - this->vals[i] = 0; - this->fvals[i] = 0; - } - } - } - - logger_lock.Unlock(); -} - - - - diff --git a/branches/marnberg/quota/common/Logger.h b/branches/marnberg/quota/common/Logger.h deleted file mode 100644 index 85102acd90370..0000000000000 --- a/branches/marnberg/quota/common/Logger.h +++ /dev/null @@ -1,74 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __LOGGER_H -#define __LOGGER_H - -#include "include/types.h" -#include "Clock.h" -#include "Mutex.h" - -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "LogType.h" - - - - -class Logger { - protected: - //hash_map, eqstr> vals; - //hash_map, eqstr> fvals; - vector vals; - vector fvals; - - //Mutex lock; - LogType *type; - - utime_t start; - int last_logged; - int interval; - int wrote_header; - int wrote_header_last; - - string filename; - - int version; - - ofstream out; - bool open; - - public: - Logger(string fn, LogType *type); - ~Logger(); - - void set_start(const utime_t& a) { start = a; } - utime_t& get_start() { return start; } - - long inc(const char *s, long v = 1); - long set(const char *s, long v); - long get(const char *s); - - double fset(const char *s, double v); - double finc(const char *s, double v); - - void flush(bool force = false); -}; - -#endif diff --git a/branches/marnberg/quota/common/Mutex.h b/branches/marnberg/quota/common/Mutex.h deleted file mode 100755 index c4615a3ff4c6e..0000000000000 --- a/branches/marnberg/quota/common/Mutex.h +++ /dev/null @@ -1,68 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MUTEX_H -#define __MUTEX_H - -#include -#include - -class Mutex { -private: - pthread_mutex_t _m; - int nlock; - bool recursive; - - // don't allow copying. 
- void operator=(Mutex &M) {} - Mutex( const Mutex &M ) {} - -public: - Mutex(bool r = true) : nlock(0), recursive(r) { - if (recursive) { - pthread_mutexattr_t attr; - pthread_mutexattr_init(&attr); - pthread_mutexattr_settype(&attr,PTHREAD_MUTEX_RECURSIVE); - pthread_mutex_init(&_m,&attr); - pthread_mutexattr_destroy(&attr); - } else { - pthread_mutex_init(&_m,NULL); - } - } - virtual ~Mutex() { - assert(nlock == 0); - pthread_mutex_destroy(&_m); - } - - bool is_locked() { - return (nlock > 0); - } - - void Lock() { - int r = pthread_mutex_lock(&_m); - assert(r == 0); - nlock++; - assert(nlock == 1 || recursive); - } - - void Unlock() { - assert(nlock > 0); - --nlock; - int r = pthread_mutex_unlock(&_m); - assert(r == 0); - } - - friend class Cond; -}; - -#endif diff --git a/branches/marnberg/quota/common/Semaphore.h b/branches/marnberg/quota/common/Semaphore.h deleted file mode 100644 index 7526f5c1ec9c8..0000000000000 --- a/branches/marnberg/quota/common/Semaphore.h +++ /dev/null @@ -1,52 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef _Sem_Posix_ -#define _Sem_Posix_ - -#include - -class Semaphore -{ - Mutex m; - Cond c; - int count; - - public: - - Semaphore() - { - count = 0; - } - - void Put() - { - m.Lock(); - count++; - c.Signal(); - m.Unlock(); - } - - void Get() - { - m.Lock(); - while(count <= 0) { - c.Wait(m); - } - count--; - m.Unlock(); - } -}; - -#endif // !_Mutex_Posix_ diff --git a/branches/marnberg/quota/common/Thread.h b/branches/marnberg/quota/common/Thread.h deleted file mode 100644 index 8565ce9effd92..0000000000000 --- a/branches/marnberg/quota/common/Thread.h +++ /dev/null @@ -1,62 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __THREAD_H -#define __THREAD_H - -#include - -class Thread { - private: - pthread_t thread_id; - - public: - Thread() : thread_id(0) {} - virtual ~Thread() {} - - pthread_t &get_thread_id() { return thread_id; } - bool is_started() { return thread_id != 0; } - - virtual void *entry() = 0; - - private: - static void *_entry_func(void *arg) { - return ((Thread*)arg)->entry(); - } - - public: - int create() { - return pthread_create( &thread_id, NULL, _entry_func, (void*)this ); - } - - bool am_self() { - return (pthread_self() == thread_id); - } - - int join(void **prval = 0) { - assert(thread_id); - //if (thread_id == 0) return -1; // never started. 
- - int status = pthread_join(thread_id, prval); - if (status == 0) - thread_id = 0; - else { - cout << "join status = " << status << endl; - assert(0); - } - return status; - } -}; - -#endif diff --git a/branches/marnberg/quota/common/ThreadPool.h b/branches/marnberg/quota/common/ThreadPool.h deleted file mode 100644 index 674053bfe1087..0000000000000 --- a/branches/marnberg/quota/common/ThreadPool.h +++ /dev/null @@ -1,138 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef THREADPOOL -#define THREADPOOL - -#include -using namespace std; - - -#include -#include -#include -#include - - -// debug output -#include "config.h" -#define tpdout(x) if (x <= g_conf.debug) cout << myname -#define DBLVL 15 - - -using namespace std; - -#define MAX_THREADS 1000 - -template -class ThreadPool { - - private: - list q; - Mutex q_lock; - Semaphore q_sem; - - int num_ops; - int num_threads; - vector thread; - - U u; - void (*func)(U,T); - void (*prefunc)(U,T); - string myname; - - static void *foo(void *arg) - { - ThreadPool *t = (ThreadPool *)arg; - t->do_ops(arg); - return 0; - } - - void *do_ops(void *nothing) - { - tpdout(DBLVL) << ".do_ops thread " << pthread_self() << " starting" << endl; - while (1) { - q_sem.Get(); - if (q.empty()) break; - - T op = get_op(); - tpdout(DBLVL) << ".func thread "<< pthread_self() << " on " << op << endl; - func(u, op); - } - tpdout(DBLVL) << ".do_ops thread " << pthread_self() << " exiting" << endl; - return 0; - } - - - T get_op() - { - T op; - q_lock.Lock(); - { - op = q.front(); - q.pop_front(); - num_ops--; - - if (prefunc && op) { - tpdout(DBLVL) << ".prefunc thread "<< pthread_self() << 
" on " << op << endl; - prefunc(u, op); - } - } - q_lock.Unlock(); - - return op; - } - - public: - - ThreadPool(char *myname, int howmany, void (*f)(U,T), U obj, void (*pf)(U,T) = 0) : - num_ops(0), num_threads(howmany), - thread(num_threads), - u(obj), - func(f), prefunc(pf), - myname(myname) { - tpdout(DBLVL) << ".cons num_threads " << num_threads << endl; - - // start threads - int status; - for(int i = 0; i < howmany; i++) { - status = pthread_create(&thread[i], NULL, (void*(*)(void *))&ThreadPool::foo, this); - assert(status == 0); - } - } - - ~ThreadPool() { - // bump sem to make threads exit cleanly - for(int i = 0; i < num_threads; i++) - q_sem.Put(); - - // wait for them to die - for(int i = 0; i < num_threads; i++) { - tpdout(DBLVL) << ".des joining thread " << thread[i] << endl; - void *rval = 0; // we don't actually care - pthread_join(thread[i], &rval); - } - } - - void put_op(T op) { - tpdout(DBLVL) << ".put_op " << op << endl; - q_lock.Lock(); - q.push_back(op); - num_ops++; - q_sem.Put(); - q_lock.Unlock(); - } - -}; -#endif diff --git a/branches/marnberg/quota/common/Timer.cc b/branches/marnberg/quota/common/Timer.cc deleted file mode 100644 index 522a623d5ebac..0000000000000 --- a/branches/marnberg/quota/common/Timer.cc +++ /dev/null @@ -1,333 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - - -#include "Timer.h" -#include "Cond.h" - -#include "config.h" -#include "include/Context.h" - -#undef dout -#define dout(x) if (x <= g_conf.debug) cout << g_clock.now() << " TIMER " -#define derr(x) if (x <= g_conf.debug) cerr << g_clock.now() << " TIMER " - -#define DBL 10 - -#include -#include -#include - -// single global instance -Timer g_timer; - - - -/**** thread solution *****/ - -bool Timer::get_next_due(utime_t& when) -{ - if (scheduled.empty()) { - dout(10) << "get_next_due - nothing scheduled" << endl; - return false; - } else { - map< utime_t, set >::iterator it = scheduled.begin(); - when = it->first; - dout(10) << "get_next_due - " << when << endl; - return true; - } -} - - -void Timer::timer_entry() -{ - lock.Lock(); - - while (!thread_stop) { - - // now - utime_t now = g_clock.now(); - - // any events due? - utime_t next; - bool next_due = get_next_due(next); - - if (next_due && now >= next) { - // move to pending list - list pending; - - map< utime_t, set >::iterator it = scheduled.begin(); - while (it != scheduled.end()) { - if (it->first > now) break; - - utime_t t = it->first; - dout(DBL) << "queueing event(s) scheduled at " << t << endl; - - for (set::iterator cit = it->second.begin(); - cit != it->second.end(); - cit++) { - pending.push_back(*cit); - event_times.erase(*cit); - num_event--; - } - - map< utime_t, set >::iterator previt = it; - it++; - scheduled.erase(previt); - } - - if (!pending.empty()) { - sleeping = false; - lock.Unlock(); - { - // make sure we're not holding any locks while we do callbacks - // make the callbacks myself. 
- for (list::iterator cit = pending.begin(); - cit != pending.end(); - cit++) { - dout(DBL) << "start callback " << *cit << endl; - (*cit)->finish(0); - dout(DBL) << "finish callback " << *cit << endl; - delete *cit; - } - pending.clear(); - assert(pending.empty()); - } - lock.Lock(); - } - - } - else { - // sleep - if (next_due) { - dout(DBL) << "sleeping until " << next << endl; - timed_sleep = true; - sleeping = true; - timeout_cond.WaitUntil(lock, next); // wait for waker or time - utime_t now = g_clock.now(); - dout(DBL) << "kicked or timed out at " << now << endl; - } else { - dout(DBL) << "sleeping" << endl; - timed_sleep = false; - sleeping = true; - sleep_cond.Wait(lock); // wait for waker - utime_t now = g_clock.now(); - dout(DBL) << "kicked at " << now << endl; - } - } - } - - lock.Unlock(); -} - - - -/** - * Timer bits - */ - -void Timer::register_timer() -{ - if (timer_thread.is_started()) { - if (sleeping) { - dout(DBL) << "register_timer kicking thread" << endl; - if (timed_sleep) - timeout_cond.SignalAll(); - else - sleep_cond.SignalAll(); - } else { - dout(DBL) << "register_timer doing nothing; thread is awake" << endl; - // it's probably doing callbacks. 
- } - } else { - dout(DBL) << "register_timer starting thread" << endl; - timer_thread.create(); - } -} - -void Timer::cancel_timer() -{ - // clear my callback pointers - if (timer_thread.is_started()) { - dout(10) << "setting thread_stop flag" << endl; - lock.Lock(); - thread_stop = true; - if (timed_sleep) - timeout_cond.SignalAll(); - else - sleep_cond.SignalAll(); - lock.Unlock(); - - dout(10) << "waiting for thread to finish" << endl; - void *ptr; - timer_thread.join(&ptr); - - dout(10) << "thread finished, exit code " << ptr << endl; - } -} - - -/* - * schedule - */ - - -void Timer::add_event_after(float seconds, - Context *callback) -{ - utime_t when = g_clock.now(); - when.sec_ref() += (int)seconds; - add_event_at(when, callback); -} - -void Timer::add_event_at(utime_t when, - Context *callback) -{ - lock.Lock(); - - dout(DBL) << "add_event " << callback << " at " << when << endl; - - // insert - scheduled[when].insert(callback); - assert(event_times.count(callback) == 0); - event_times[callback] = when; - - num_event++; - - // make sure i wake up on time - register_timer(); - - lock.Unlock(); -} - -bool Timer::cancel_event(Context *callback) -{ - lock.Lock(); - - dout(DBL) << "cancel_event " << callback << endl; - - if (!event_times.count(callback)) { - dout(DBL) << "cancel_event " << callback << " isn't scheduled (probably executing)" << endl; - lock.Unlock(); - return false; // wasn't scheduled. - } - - utime_t tp = event_times[callback]; - event_times.erase(callback); - - assert(scheduled.count(tp)); - assert(scheduled[tp].count(callback)); - scheduled[tp].erase(callback); - if (scheduled[tp].empty()) - scheduled.erase(tp); - - lock.Unlock(); - - // delete the canceled event. 
- delete callback; - - return true; -} - - -// ------------------------------- - -void SafeTimer::add_event_after(float seconds, Context *c) -{ - assert(lock.is_locked()); - Context *w = new EventWrapper(this, c); - dout(DBL) << "SafeTimer.add_event_after wrapping " << c << " with " << w << endl; - scheduled[c] = w; - g_timer.add_event_after(seconds, w); -} - -void SafeTimer::add_event_at(utime_t when, Context *c) -{ - assert(lock.is_locked()); - Context *w = new EventWrapper(this, c); - dout(DBL) << "SafeTimer.add_event_at wrapping " << c << " with " << w << endl; - scheduled[c] = w; - g_timer.add_event_at(when, w); -} - -void SafeTimer::EventWrapper::finish(int r) -{ - timer->lock.Lock(); - if (timer->scheduled.count(actual)) { - // still scheduled. execute. - actual->finish(r); - timer->scheduled.erase(actual); - } else { - // i was canceled. - assert(timer->canceled.count(actual)); - } - - // did i get canceled? - // (this can happen even if i just executed above. e.g., i may have canceled myself.) - if (timer->canceled.count(actual)) { - timer->canceled.erase(actual); - timer->cond.Signal(); - } - - // delete the original event - delete actual; - - timer->lock.Unlock(); -} - -void SafeTimer::cancel_event(Context *c) -{ - assert(lock.is_locked()); - assert(scheduled.count(c)); - - if (g_timer.cancel_event(scheduled[c])) { - // hosed wrapper. hose original event too. - delete c; - } else { - // clean up later. 
- canceled[c] = scheduled[c]; - } - scheduled.erase(c); -} - -void SafeTimer::cancel_all() -{ - assert(lock.is_locked()); - - while (!scheduled.empty()) - cancel_event(scheduled.begin()->first); -} - -void SafeTimer::join() -{ - assert(lock.is_locked()); - assert(scheduled.empty()); - - while (!canceled.empty()) { - // wait - dout(-10) << "SafeTimer.join waiting for " << canceled.size() << " to join" << endl; - dout(-10) << canceled << endl; - cond.Wait(lock); - } -} - -SafeTimer::~SafeTimer() -{ - if (!scheduled.empty() && !canceled.empty()) { - derr(0) << "SafeTimer.~SafeTimer " << scheduled.size() << " events scheduled, " - << canceled.size() << " canceled but unflushed" - << endl; - } -} diff --git a/branches/marnberg/quota/common/Timer.h b/branches/marnberg/quota/common/Timer.h deleted file mode 100644 index 88d9929ac5ae1..0000000000000 --- a/branches/marnberg/quota/common/Timer.h +++ /dev/null @@ -1,177 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __TIMER_H -#define __TIMER_H - -#include "include/types.h" -#include "include/Context.h" -#include "Clock.h" - -#include "Mutex.h" -#include "Cond.h" -#include "Thread.h" - -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - - -/*** Timer - * schedule callbacks - */ - -//class Messenger; - - -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const Context *p) const { - static hash H; - return H((unsigned long)p); - } - }; -} - - -class Timer { - private: - map< utime_t, set > scheduled; // time -> (context ...) 
- hash_map< Context*, utime_t > event_times; // event -> time - - // get time of the next event - //Context* get_next_scheduled(utime_t& when); - - bool get_next_due(utime_t &when); - - void register_timer(); // make sure i get a callback - void cancel_timer(); // make sure i get a callback - - //pthread_t thread_id; - bool thread_stop; - Mutex lock; - bool timed_sleep; - bool sleeping; - Cond sleep_cond; - Cond timeout_cond; - - public: - void timer_entry(); // waiter thread (that wakes us up) - - class TimerThread : public Thread { - Timer *t; - public: - void *entry() { - t->timer_entry(); - return 0; - } - TimerThread(Timer *_t) : t(_t) {} - } timer_thread; - - - int num_event; - - - public: - Timer() : - thread_stop(false), - timed_sleep(false), - sleeping(false), - timer_thread(this), - num_event(0) - { - } - ~Timer() { - // stop. - cancel_timer(); - - // scheduled - for (map< utime_t, set >::iterator it = scheduled.begin(); - it != scheduled.end(); - it++) { - for (set::iterator sit = it->second.begin(); - sit != it->second.end(); - sit++) - delete *sit; - } - scheduled.clear(); - } - - void init() { - register_timer(); - } - void shutdown() { - cancel_timer(); - } - - // schedule events - void add_event_after(float seconds, - Context *callback); - void add_event_at(utime_t when, - Context *callback); - bool cancel_event(Context *callback); - - // execute pending events - void execute_pending(); - -}; - - -/* - * SafeTimer is a wrapper around the raw Timer (or rather, g_timer, it's global - * instantiation) that protects event execution with an existing mutex. It - * provides for, among other things, reliable event cancellation on class - * destruction. The caller just needs to cancel each event (or cancel_all()), - * and then call join() to ensure any concurrently exectuting events (in other - * threads) get flushed. 
- */ -class SafeTimer { - Mutex& lock; - Cond cond; - map scheduled; // actual -> wrapper - map canceled; - - class EventWrapper : public Context { - SafeTimer *timer; - Context *actual; - public: - EventWrapper(SafeTimer *st, Context *c) : timer(st), - actual(c) {} - void finish(int r); - }; - -public: - SafeTimer(Mutex& l) : lock(l) { } - ~SafeTimer(); - - void add_event_after(float seconds, Context *c); - void add_event_at(utime_t when, Context *c); - void cancel_event(Context *c); - void cancel_all(); - void join(); - - int get_num_scheduled() { return scheduled.size(); } - int get_num_canceled() { return canceled.size(); } -}; - - -// single global instance -extern Timer g_timer; - - - -#endif diff --git a/branches/marnberg/quota/config.cc b/branches/marnberg/quota/config.cc deleted file mode 100644 index 516a0e1c1ce29..0000000000000 --- a/branches/marnberg/quota/config.cc +++ /dev/null @@ -1,834 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "config.h" -#include "include/types.h" - -//#define MDS_CACHE_SIZE 4*10000 -> <20mb -//#define MDS_CACHE_SIZE 80000 62mb - -#define AVG_PER_INODE_SIZE 450 -#define MDS_CACHE_MB_TO_INODES(x) ((x)*1000000/AVG_PER_INODE_SIZE) - -//#define MDS_CACHE_SIZE MDS_CACHE_MB_TO_INODES( 50 ) -//#define MDS_CACHE_SIZE 1500000 -#define MDS_CACHE_SIZE 150000 - - -// hack hack hack ugly FIXME -#include "common/Mutex.h" -long buffer_total_alloc = 0; -Mutex bufferlock; - - - -FileLayout g_OSD_FileLayout( 1<<20, 1, 1<<20, 2 ); // stripe over 1M objects, 2x replication -//FileLayout g_OSD_FileLayout( 1<<17, 4, 1<<20 ); // 128k stripes over sets of 4 - -// ?? 
-//FileLayout g_OSD_MDDirLayout( 1<<8, 1<<2, 1<<19, 3 ); // this is stupid, but can bring out an ebofs table bug? -FileLayout g_OSD_MDDirLayout( 1<<20, 1, 1<<20, 2 ); // 1M objects, 2x replication - -// stripe mds log over 128 byte bits (see mds_log_pad_entry below to match!) -FileLayout g_OSD_MDLogLayout( 1<<20, 1, 1<<20, 2 ); // 1M objects -//FileLayout g_OSD_MDLogLayout( 1<<8, 1<<2, 1<<19, 3 ); // 256 byte bits -//FileLayout g_OSD_MDLogLayout( 1<<7, 32, 1<<20, 3 ); // 128 byte stripes over 32 1M objects -//FileLayout g_OSD_MDLogLayout( 57, 32, 1<<20 ); // pathological case to test striping buffer mapping -//FileLayout g_OSD_MDLogLayout( 1<<20, 1, 1<<20 ); // old way - -// fake osd failures: osd -> time -std::map g_fake_osd_down; -std::map g_fake_osd_out; - -entity_addr_t g_my_addr; - -md_config_t g_debug_after_conf; - -md_config_t g_conf = { - num_mon: 1, - num_mds: 1, - num_osd: 4, - num_client: 1, - - mkfs: false, - - // profiling and debugging - log: true, - log_interval: 1, - log_name: (char*)0, - - log_messages: true, - log_pins: true, - - fake_clock: false, - fakemessenger_serialize: true, - - fake_osdmap_expand: 0, - fake_osdmap_updates: 0, - fake_osd_mttf: 0, - fake_osd_mttr: 0, - - osd_remount_at: 0, - - kill_after: 0, - - tick: 0, - - debug: 0, - debug_mds: 1, - debug_mds_balancer: 1, - debug_mds_log: 1, - debug_buffer: 0, - debug_filer: 0, - debug_objecter: 0, - debug_objectcacher: 0, - debug_client: 0, - debug_osd: 0, - debug_ebofs: 1, - debug_bdev: 1, // block device - debug_ns: 0, - debug_ms: 0, - debug_mon: 0, - - debug_after: 0, - - // -- misc -- - use_abspaths: false, // make monitorstore et al use absolute path (to workaround FUSE chdir("/")) - - // --- clock --- - clock_lock: false, - - // --- messenger --- - ms_single_dispatch: false, - ms_requeue_on_sender_fail: false, - - ms_stripe_osds: false, - ms_skip_rank0: false, - ms_overlay_clients: false, - - ms_die_on_failure: false, - - /*tcp_skip_rank0: false, - tcp_overlay_clients: false, // 
over osds! - tcp_log: false, - tcp_serial_marshall: true, - tcp_serial_out: false, - tcp_multi_out: true, - tcp_multi_dispatch: false, // not fully implemented yet - */ - - // --- mon --- - mon_tick_interval: 5, - mon_osd_down_out_interval: 5, // seconds - mon_lease: 2.000, // seconds - mon_stop_with_last_mds: true, - - // --- client --- - client_cache_size: 300, - client_cache_mid: .5, - client_cache_stat_ttl: 0, // seconds until cached stat results become invalid - client_cache_readdir_ttl: 1, // 1 second only - client_use_random_mds: false, - - client_sync_writes: 0, - - client_oc: true, - client_oc_size: 1024*1024* 5, // MB * n - client_oc_max_dirty: 1024*1024* 5, // MB * n - client_oc_max_sync_write: 128*1024, // writes >= this use wrlock - - client_trace: 0, - fuse_direct_io: 0, - - // --- objecter --- - objecter_buffer_uncommitted: true, - - // --- journaler --- - journaler_allow_split_entries: true, - - // --- mds --- - mds_cache_size: MDS_CACHE_SIZE, - mds_cache_mid: .7, - - mds_decay_halflife: 30, - - mds_beacon_interval: 5.0, - mds_beacon_grace: 10.0, - - mds_log: true, - mds_log_max_len: MDS_CACHE_SIZE / 3, - mds_log_max_trimming: 10000, - mds_log_read_inc: 1<<20, - mds_log_pad_entry: 128,//256,//64, - mds_log_before_reply: true, - mds_log_flush_on_shutdown: true, - mds_log_import_map_interval: 1024*1024, // frequency (in bytes) of EImportMap in log - mds_bal_replicate_threshold: 2000, - mds_bal_unreplicate_threshold: 0,//500, - mds_bal_hash_rd: 10000, - mds_bal_unhash_rd: 1000, - mds_bal_hash_wr: 10000, - mds_bal_unhash_wr: 1000, - mds_bal_interval: 30, // seconds - mds_bal_hash_interval: 5, // seconds - mds_bal_idle_threshold: .1, - mds_bal_max: -1, - mds_bal_max_until: -1, - - mds_bal_mode: 0, - mds_bal_min_start: .2, // if we need less than this, we don't do anything - mds_bal_need_min: .8, // take within this range of what we need - mds_bal_need_max: 1.2, - mds_bal_midchunk: .3, // any sub bigger than this taken in full - mds_bal_minchunk: .001, // 
never take anything smaller than this - - mds_commit_on_shutdown: true, - mds_shutdown_check: 0, //30, - mds_shutdown_on_last_unmount: true, - - mds_verify_export_dirauth: true, - - mds_local_osd: false, - - - // --- osd --- - osd_rep: OSD_REP_PRIMARY, - osd_balance_reads: false, - osd_pg_bits: 0, // 0 == let osdmonitor decide - osd_object_layout: OBJECT_LAYOUT_HASHINO, - osd_pg_layout: PG_LAYOUT_CRUSH, - osd_max_rep: 4, - osd_maxthreads: 2, // 0 == no threading - osd_max_opq: 10, - osd_mkfs: false, - osd_age: .8, - osd_age_time: 0, - osd_heartbeat_interval: 5, // shut up while i'm debugging - osd_replay_window: 5, - osd_max_pull: 2, - osd_pad_pg_log: false, - - // --- fakestore --- - fakestore_fake_sync: 2, // 2 seconds - fakestore_fsync: false,//true, - fakestore_writesync: false, - fakestore_syncthreads: 4, - fakestore_fake_attrs: false, - fakestore_fake_collections: false, - fakestore_dev: 0, - - // --- ebofs --- - ebofs: 1, - ebofs_cloneable: false, - ebofs_verify: false, - ebofs_commit_ms: 2000, // 0 = no forced commit timeout (for debugging/tracing) - ebofs_idle_commit_ms: 100, // 0 = no idle detection. 
use this -or- bdev_idle_kick_after_ms - ebofs_oc_size: 10000, // onode cache - ebofs_cc_size: 10000, // cnode cache - ebofs_bc_size: (80 *256), // 4k blocks, *256 for MB - ebofs_bc_max_dirty: (60 *256), // before write() will block - ebofs_max_prefetch: 1000, // 4k blocks - ebofs_realloc: true, - - ebofs_abp_zero: false, // zero newly allocated buffers (may shut up valgrind) - ebofs_abp_max_alloc: 4096*16, // max size of new buffers (larger -> more memory fragmentation) - - // --- obfs --- - uofs: 0, - uofs_fake_sync: 2, // 2 seconds - uofs_cache_size: 1 << 28, //256MB - uofs_onode_size: (int)1024, - uofs_small_block_size: (int)4096, //4KB - uofs_large_block_size: (int)524288, //512KB - uofs_segment_size: (int)268435456, //256MB - uofs_block_meta_ratio: (int)10, - uofs_sync_write: (int)0, - uofs_nr_hash_buckets: (int)1023, - uofs_flush_interval: (int)5, //seconds - uofs_min_flush_pages: (int)1024, //4096 4k-pages - uofs_delay_allocation: (int)1, //true - - // --- block device --- - bdev_lock: true, - bdev_iothreads: 1, // number of ios to queue with kernel - bdev_idle_kick_after_ms: 0,//100, // ms ** FIXME ** this seems to break things, not sure why yet ** - bdev_el_fw_max_ms: 10000, // restart elevator at least once every 1000 ms - bdev_el_bw_max_ms: 3000, // restart elevator at least once every 300 ms - bdev_el_bidir: true, // bidirectional elevator? 
- bdev_iov_max: 512, // max # iov's to collect into a single readv()/writev() call - bdev_debug_check_io_overlap: true, // [DEBUG] check for any pending io overlaps - bdev_fake_mb: 0, - bdev_fake_max_mb: 0, - - // --- fakeclient (mds regression testing) (ancient history) --- - num_fakeclient: 100, - fakeclient_requests: 100, - fakeclient_deterministic: false, - - fakeclient_op_statfs: false, - - // loosely based on Roselli workload paper numbers - fakeclient_op_stat: 610, - fakeclient_op_lstat: false, - fakeclient_op_utime: 0, - fakeclient_op_chmod: 1, - fakeclient_op_chown: 1, - - fakeclient_op_readdir: 2, - fakeclient_op_mknod: 30, - fakeclient_op_link: false, - fakeclient_op_unlink: 20, - fakeclient_op_rename: 0,//40, - - fakeclient_op_mkdir: 10, - fakeclient_op_rmdir: 20, - fakeclient_op_symlink: 20, - - fakeclient_op_openrd: 200, - fakeclient_op_openwr: 0, - fakeclient_op_openwrc: 0, - fakeclient_op_read: false, // osd! - fakeclient_op_write: false, // osd! - fakeclient_op_truncate: false, - fakeclient_op_fsync: false, - fakeclient_op_close: 200 - -#ifdef USE_OSBDB - , - bdbstore: false, - debug_bdbstore: 1, - bdbstore_btree: false, - bdbstore_ffactor: 0, - bdbstore_nelem: 0, - bdbstore_pagesize: 0, - bdbstore_cachesize: 0, - bdbstore_transactional: false -#endif // USE_OSBDB -}; - - -#include -#include - - -void env_to_vec(std::vector& args) -{ - const char *p = getenv("CEPH_ARGS"); - if (!p) return; - - static char buf[1000]; - int len = strlen(p); - memcpy(buf, p, len); - buf[len] = 0; - //cout << "CEPH_ARGS " << buf << endl; - - int l = 0; - for (int i=0; i& args) -{ - for (int i=1; i& args, - int& argc, char **&argv) -{ - argv = (char**)malloc(sizeof(char*) * argc); - argc = 1; - argv[0] = "asdf"; - - for (unsigned i=0; i= '0' && *s <= '9') { - int digit = *s - '0'; - //cout << "digit " << digit << endl; - val *= 10; - val += digit; - numdigits++; - s++; off++; - } - //cout << "val " << val << endl; - - if (numdigits == 0) { - cerr << "no digits at off " 
<< off << endl; - return false; // no digits - } - if (count < 3 && *s != '.') { - cerr << "should period at " << off << endl; - return false; // should have 3 periods - } - if (count == 3 && *s != ':') { - cerr << "expected : at " << off << endl; - return false; // then a colon - } - s++; off++; - - if (count <= 3) - a.ipq[count] = val; - else - a.port = val; - - count++; - if (count == 5) break; - } - - return true; -} - - - -void parse_config_options(std::vector& args) -{ - std::vector nargs; - - for (unsigned i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __CONFIG_H -#define __CONFIG_H - -extern class FileLayout g_OSD_FileLayout; -extern class FileLayout g_OSD_MDDirLayout; -extern class FileLayout g_OSD_MDLogLayout; - -#include -#include - -extern std::map g_fake_osd_down; -extern std::map g_fake_osd_out; - -#define OSD_REP_PRIMARY 0 -#define OSD_REP_SPLAY 1 -#define OSD_REP_CHAIN 2 - - -#include "msg/msg_types.h" - -extern entity_addr_t g_my_addr; - -struct md_config_t { - int num_mon; - int num_mds; - int num_osd; - int num_client; - - bool mkfs; - - // profiling - bool log; - int log_interval; - char *log_name; - - bool log_messages; - bool log_pins; - - bool fake_clock; - bool fakemessenger_serialize; - - int fake_osdmap_expand; - int fake_osdmap_updates; - int fake_osd_mttf; - int fake_osd_mttr; - - int osd_remount_at; - - int kill_after; - - int tick; - - int debug; - int debug_mds; - int debug_mds_balancer; - int debug_mds_log; - int debug_buffer; - int debug_filer; - int debug_objecter; - int debug_objectcacher; - int debug_client; - int debug_osd; - int debug_ebofs; - int debug_bdev; - int debug_ns; - int debug_ms; - int debug_mon; - - int debug_after; - - // misc - bool use_abspaths; - - // clock - bool clock_lock; - - // messenger - - /*bool 
tcp_skip_rank0; - bool tcp_overlay_clients; - bool tcp_log; - bool tcp_serial_marshall; - bool tcp_serial_out; - bool tcp_multi_out; - bool tcp_multi_dispatch; - */ - - bool ms_single_dispatch; - bool ms_requeue_on_sender_fail; - - bool ms_stripe_osds; - bool ms_skip_rank0; - bool ms_overlay_clients; - bool ms_die_on_failure; - - // mon - int mon_tick_interval; - int mon_osd_down_out_interval; - float mon_lease; - bool mon_stop_with_last_mds; - - // client - int client_cache_size; - float client_cache_mid; - int client_cache_stat_ttl; - int client_cache_readdir_ttl; - bool client_use_random_mds; // debug flag - - bool client_sync_writes; - - bool client_oc; - int client_oc_size; - int client_oc_max_dirty; - size_t client_oc_max_sync_write; - - - - /* - bool client_bcache; - int client_bcache_alloc_minsize; - int client_bcache_alloc_maxsize; - int client_bcache_ttl; - off_t client_bcache_size; - int client_bcache_lowater; - int client_bcache_hiwater; - size_t client_bcache_align; - */ - - int client_trace; - int fuse_direct_io; - - // objecter - bool objecter_buffer_uncommitted; - - // journaler - bool journaler_allow_split_entries; - - // mds - int mds_cache_size; - float mds_cache_mid; - - float mds_decay_halflife; - - float mds_beacon_interval; - float mds_beacon_grace; - - bool mds_log; - int mds_log_max_len; - int mds_log_max_trimming; - int mds_log_read_inc; - int mds_log_pad_entry; - bool mds_log_before_reply; - bool mds_log_flush_on_shutdown; - off_t mds_log_import_map_interval; - - float mds_bal_replicate_threshold; - float mds_bal_unreplicate_threshold; - float mds_bal_hash_rd; - float mds_bal_unhash_rd; - float mds_bal_hash_wr; - float mds_bal_unhash_wr; - int mds_bal_interval; - int mds_bal_hash_interval; - float mds_bal_idle_threshold; - int mds_bal_max; - int mds_bal_max_until; - - int mds_bal_mode; - float mds_bal_min_start; - float mds_bal_need_min; - float mds_bal_need_max; - float mds_bal_midchunk; - float mds_bal_minchunk; - - bool 
mds_commit_on_shutdown; - int mds_shutdown_check; - bool mds_shutdown_on_last_unmount; - bool mds_verify_export_dirauth; // debug flag - - bool mds_local_osd; - - - // osd - int osd_rep; - bool osd_balance_reads; - int osd_pg_bits; - int osd_object_layout; - int osd_pg_layout; - int osd_max_rep; - int osd_maxthreads; - int osd_max_opq; - bool osd_mkfs; - float osd_age; - int osd_age_time; - int osd_heartbeat_interval; - int osd_replay_window; - int osd_max_pull; - bool osd_pad_pg_log; - - int fakestore_fake_sync; - bool fakestore_fsync; - bool fakestore_writesync; - int fakestore_syncthreads; // such crap - bool fakestore_fake_attrs; - bool fakestore_fake_collections; - char *fakestore_dev; - - // ebofs - int ebofs; - bool ebofs_cloneable; - bool ebofs_verify; - int ebofs_commit_ms; - int ebofs_idle_commit_ms; - int ebofs_oc_size; - int ebofs_cc_size; - off_t ebofs_bc_size; - off_t ebofs_bc_max_dirty; - unsigned ebofs_max_prefetch; - bool ebofs_realloc; - - bool ebofs_abp_zero; - size_t ebofs_abp_max_alloc; - - int uofs; - int uofs_fake_sync; - int uofs_cache_size; - int uofs_onode_size; - int uofs_small_block_size; - int uofs_large_block_size; - int uofs_segment_size; - int uofs_block_meta_ratio; - int uofs_sync_write; - - int uofs_nr_hash_buckets; - int uofs_flush_interval; - int uofs_min_flush_pages; - int uofs_delay_allocation; - - // block device - bool bdev_lock; - int bdev_iothreads; - int bdev_idle_kick_after_ms; - int bdev_el_fw_max_ms; - int bdev_el_bw_max_ms; - bool bdev_el_bidir; - int bdev_iov_max; - bool bdev_debug_check_io_overlap; - int bdev_fake_mb; - int bdev_fake_max_mb; - - // fake client - int num_fakeclient; - unsigned fakeclient_requests; - bool fakeclient_deterministic; // debug flag - - int fakeclient_op_statfs; - - int fakeclient_op_stat; - int fakeclient_op_lstat; - int fakeclient_op_utime; - int fakeclient_op_chmod; - int fakeclient_op_chown; - - int fakeclient_op_readdir; - int fakeclient_op_mknod; - int fakeclient_op_link; - int 
fakeclient_op_unlink; - int fakeclient_op_rename; - - int fakeclient_op_mkdir; - int fakeclient_op_rmdir; - int fakeclient_op_symlink; - - int fakeclient_op_openrd; - int fakeclient_op_openwr; - int fakeclient_op_openwrc; - int fakeclient_op_read; - int fakeclient_op_write; - int fakeclient_op_truncate; - int fakeclient_op_fsync; - int fakeclient_op_close; - -#ifdef USE_OSBDB - bool bdbstore; - int debug_bdbstore; - bool bdbstore_btree; - int bdbstore_ffactor; - int bdbstore_nelem; - int bdbstore_pagesize; - int bdbstore_cachesize; - bool bdbstore_transactional; -#endif // USE_OSBDB -}; - -extern md_config_t g_conf; -extern md_config_t g_debug_after_conf; - -#define dout(x) if ((x) <= g_conf.debug) std::cout -#define dout2(x) if ((x) <= g_conf.debug) std::cout - -void env_to_vec(std::vector& args); -void argv_to_vec(int argc, char **argv, - std::vector& args); -void vec_to_argv(std::vector& args, - int& argc, char **&argv); - -void parse_config_options(std::vector& args); - -extern bool parse_ip_port(const char *s, entity_addr_t& addr); - - - -#endif diff --git a/branches/marnberg/quota/cosd.cc b/branches/marnberg/quota/cosd.cc deleted file mode 100644 index ae23a667a32da..0000000000000 --- a/branches/marnberg/quota/cosd.cc +++ /dev/null @@ -1,124 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include -#include - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" - -#include "osd/OSD.h" -#include "ebofs/Ebofs.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << endl; - exit(1); - } -}; - -class C_Debug : public Context { - public: - void finish(int) { - int size = &g_conf.debug_after - &g_conf.debug; - memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size); - dout(0) << "debug_after flipping debug settings" << endl; - } -}; - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - parse_config_options(args); - - if (g_conf.kill_after) - g_timer.add_event_after(g_conf.kill_after, new C_Die); - if (g_conf.debug_after) - g_timer.add_event_after(g_conf.debug_after, new C_Debug); - - // osd specific args - char *dev; - int whoami = -1; - for (unsigned i=0; imount(); - int r = store->read(object_t(0,0), 0, sizeof(sb), bl); - if (r < 0) { - cerr << "couldn't read superblock object on " << dev << endl; - exit(0); - } - bl.copy(0, sizeof(sb), (char*)&sb); - store->umount(); - delete store; - whoami = sb.whoami; - - cout << "osd fs says i am osd" << whoami << endl; - } else { - cout << "command line arg says i am osd" << whoami << endl; - } - - // load monmap - MonMap monmap; - int r = monmap.read(".ceph_monmap"); - assert(r >= 0); - - // start up network - rank.start_rank(); - - // start osd - Messenger *m = rank.register_entity(MSG_ADDR_OSD(whoami)); - assert(m); - OSD *osd = new OSD(whoami, m, &monmap, dev); - osd->init(); - - // wait - rank.wait(); - - // done - delete osd; - - return 0; -} - diff --git a/branches/marnberg/quota/crush/BinaryTree.h b/branches/marnberg/quota/crush/BinaryTree.h deleted file mode 100644 index f13f3f1e565ef..0000000000000 --- a/branches/marnberg/quota/crush/BinaryTree.h +++ /dev/null @@ -1,284 +0,0 @@ -// -*- 
mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __crush_BINARYTREE_H -#define __crush_BINARYTREE_H - -#include -#include -#include -#include -//#include -using namespace std; - -#include "include/buffer.h" - -namespace crush { - - class BinaryTree { - private: - // tree def - int root_node; // 0 for empty tree. - int alloc; - vector node_nested; // all existing nodes in this map - vector node_weight; // and this one - vector node_complete; // only nodes with all possible children - - public: - BinaryTree() : root_node(0), alloc(0) {} - - void _encode(bufferlist& bl) { - bl.append((char*)&root_node, sizeof(root_node)); - bl.append((char*)&alloc, sizeof(alloc)); - ::_encode(node_nested, bl); - ::_encode(node_weight, bl); - ::_encode(node_complete, bl); - } - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(root_node), (char*)&root_node); - off += sizeof(root_node); - bl.copy(off, sizeof(alloc), (char*)&alloc); - off += sizeof(alloc); - ::_decode(node_nested, bl, off); - ::_decode(node_weight, bl, off); - ::_decode(node_complete, bl, off); - } - - // accessors - bool empty() const { return root_node == 0; } - bool exists(int n) const { return n < alloc && node_nested[n]; } - int nested(int n) const { return exists(n) ? node_nested[n]:0; } - float weight(int n) const { return exists(n) ? node_weight[n]:0; } - bool complete(int n) const { return exists(n) ? 
node_complete[n]:false; } - - int root() const { return root_node; } - - void realloc(int n) { - /* - while (alloc <= n) { - node_nested.push_back(0); - node_weight.push_back(0); - node_complete.push_back(0); - alloc++; - } - */ - if (alloc <= n) { - int add = n - alloc + 1; - node_nested.insert(node_nested.end(), add, 0); - node_weight.insert(node_weight.end(), add, 0); - node_complete.insert(node_complete.end(), add, 0); - alloc = n+1; - } - } - - // tree navigation - bool terminal(int n) const { return n & 1; } // odd nodes are leaves. - int height(int n) const { - assert(n); - int h = 0; - while ((n & 1) == 0) { - assert(n > 0); - h++; n = n >> 1; - } - return h; - } - int left(int n) const { - int h = height(n); - //cout << "left of " << n << " is " << (n - (1 << h)) << endl; - return n - (1 << (h-1)); - } - int right(int n) const { - int h = height(n); - //cout << "right of " << n << " is " << (n + (1 << h)) << endl; - return n + (1 << (h-1)); - } - bool on_right(int n, int h = -1) const { - if (h < 0) h = height(n); - return n & (1 << (h+1)); - } - bool on_left(int n) const { return !on_right(n); } - int parent(int n) const { - int h = height(n); - if (on_right(n, h)) - return n - (1<0; t--) out << " "; - if (tree.root() == n) - out << "root "; - else { - if (tree.on_left(n)) - out << "left "; - else - out << "right "; - } - out << n << " : nested " << tree.nested(n) << " weight " << tree.weight(n); - if (tree.complete(n)) out << " complete"; - out << endl; - if (!tree.terminal(n)) { - if (tree.exists(tree.left(n))) - print_binary_tree_node(out, tree, tree.left(n), i+2); - if (tree.exists(tree.right(n))) - print_binary_tree_node(out, tree, tree.right(n), i+2); - } - } - - inline ostream& operator<<(ostream& out, const BinaryTree& tree) { - if (tree.empty()) - return out << "tree is empty"; - print_binary_tree_node(out, tree, tree.root(), 0); - return out; - } - -} - -#endif diff --git a/branches/marnberg/quota/crush/Bucket.h 
b/branches/marnberg/quota/crush/Bucket.h deleted file mode 100644 index 5b2d3259e09f8..0000000000000 --- a/branches/marnberg/quota/crush/Bucket.h +++ /dev/null @@ -1,631 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __crush_BUCKET_H -#define __crush_BUCKET_H - -#include "BinaryTree.h" -#include "Hash.h" - -#include -#include -#include -#include -using namespace std; - -#include - -#include "include/buffer.h" - -namespace crush { - - - const int CRUSH_BUCKET_UNIFORM = 1; - const int CRUSH_BUCKET_TREE = 2; - const int CRUSH_BUCKET_LIST = 3; - const int CRUSH_BUCKET_STRAW = 4; - - /** abstract bucket **/ - class Bucket { - protected: - int id; - int parent; - int type; - float weight; - - public: - Bucket(int _type, - float _weight) : - id(0), parent(0), - type(_type), - weight(_weight) { } - - Bucket(bufferlist& bl, int& off) { - bl.copy(off, sizeof(id), (char*)&id); - off += sizeof(id); - bl.copy(off, sizeof(parent), (char*)&parent); - off += sizeof(parent); - bl.copy(off, sizeof(type), (char*)&type); - off += sizeof(type); - bl.copy(off, sizeof(weight), (char*)&weight); - off += sizeof(weight); - } - - virtual ~Bucket() { } - - virtual const char *get_bucket_type() const = 0; - virtual bool is_uniform() const = 0; - - int get_id() const { return id; } - int get_type() const { return type; } - float get_weight() const { return weight; } - int get_parent() const { return parent; } - virtual int get_size() const = 0; - - void set_id(int i) { id = i; } - void set_parent(int p) { parent = p; } - void set_weight(float w) { weight = w; } - - virtual void get_items(vector& i) const = 0; - virtual float 
get_item_weight(int item) const = 0; - virtual void add_item(int item, float w, bool back=false) = 0; - virtual void adjust_item_weight(int item, float w) = 0; - virtual void set_item_weight(int item, float w) { - adjust_item_weight(item, w - get_item_weight(item)); - } - - virtual int choose_r(int x, int r, Hash& h) const = 0; - - virtual void _encode(bufferlist& bl) = 0; - }; - - - - - /** uniform bucket **/ - class UniformBucket : public Bucket { - protected: - public: - vector items; - int item_type; - float item_weight; - - // primes - vector primes; - - int get_prime(int j) const { - return primes[ j % primes.size() ]; - } - void make_primes() { - if (items.empty()) return; - - //cout << "make_primes " << get_id() << " " << items.size() << endl; - Hash h(123+get_id()); - primes.clear(); - - // start with odd number > num_items - unsigned x = items.size() + 1; // this is the minimum! - x += h(items.size()) % (3*items.size()); // bump it up some - x |= 1; // make it odd - - while (primes.size() < items.size()) { - unsigned j; - for (j=2; j*j<=x; j++) - if (x % j == 0) break; - if (j*j > x) { - primes.push_back(x); - //cout << "prime " << x << endl; - } - x += 2; - } - } - - public: - UniformBucket(int _type, int _item_type) : - Bucket(_type, 0), - item_type(_item_type) { } - UniformBucket(int _type, int _item_type, - float _item_weight, vector& _items) : - Bucket(_type, _item_weight*_items.size()), - item_type(_item_type), - item_weight(_item_weight) { - items = _items; - make_primes(); - } - - UniformBucket(bufferlist& bl, int& off) : Bucket(bl, off) { - bl.copy(off, sizeof(item_type), (char*)&item_type); - off += sizeof(item_type); - bl.copy(off, sizeof(item_weight), (char*)&item_weight); - off += sizeof(item_weight); - ::_decode(items, bl, off); - make_primes(); - } - - void _encode(bufferlist& bl) { - char t = CRUSH_BUCKET_UNIFORM; - bl.append((char*)&t, sizeof(t)); - bl.append((char*)&id, sizeof(id)); - bl.append((char*)&parent, sizeof(parent)); - 
bl.append((char*)&type, sizeof(type)); - bl.append((char*)&weight, sizeof(weight)); - - bl.append((char*)&item_type, sizeof(item_type)); - bl.append((char*)&item_weight, sizeof(item_weight)); - - ::_encode(items, bl); - } - - const char *get_bucket_type() const { return "uniform"; } - bool is_uniform() const { return true; } - - int get_size() const { return items.size(); } - - // items - void get_items(vector& i) const { - i = items; - } - int get_item_type() const { return item_type; } - float get_item_weight(int item) const { return item_weight; } - - void add_item(int item, float w, bool back=false) { - if (items.empty()) - item_weight = w; - items.push_back(item); - weight += item_weight; - make_primes(); - } - - void adjust_item_weight(int item, float w) { - assert(0); - } - - int choose_r(int x, int r, Hash& hash) const { - //cout << "uniformbucket.choose_r(" << x << ", " << r << ")" << endl; - //if (r >= get_size()) cout << "warning: r " << r << " >= " << get_size() << " uniformbucket.size" << endl; - - unsigned v = hash(x, get_id());// % get_size(); - unsigned p = get_prime( hash(get_id(), x) ); // choose a prime based on hash(x, get_id(), 2) - unsigned s = (x + v + (r+1)*p) % get_size(); - return items[s]; - } - - }; - - - - - - // list bucket.. 
RUSH_P sorta - - class ListBucket : public Bucket { - protected: - list items; - list item_weight; - list sum_weight; - - public: - ListBucket(int _type) : Bucket(_type, 0) { } - - ListBucket(bufferlist& bl, int& off) : Bucket(bl, off) { - ::_decode(items, bl, off); - ::_decode(item_weight, bl, off); - ::_decode(sum_weight, bl, off); - } - - void _encode(bufferlist& bl) { - char t = CRUSH_BUCKET_LIST; - bl.append((char*)&t, sizeof(t)); - bl.append((char*)&id, sizeof(id)); - bl.append((char*)&parent, sizeof(parent)); - bl.append((char*)&type, sizeof(type)); - bl.append((char*)&weight, sizeof(weight)); - - ::_encode(items, bl); - ::_encode(item_weight, bl); - ::_encode(sum_weight, bl); - } - - const char *get_bucket_type() const { return "list"; } - bool is_uniform() const { return false; } - - int get_size() const { return items.size(); } - - void get_items(vector& i) const { - for (list::const_iterator it = items.begin(); - it != items.end(); - it++) - i.push_back(*it); - } - float get_item_weight(int item) const { - list::const_iterator i = items.begin(); - list::const_iterator w = item_weight.begin(); - while (i != items.end()) { - if (*i == item) return *w; - i++; w++; - } - assert(0); - return 0; - } - - void add_item(int item, float w, bool back=false) { - if (back) { - items.push_back(item); - item_weight.push_back(w); - sum_weight.clear(); - float s = 0.0; - for (list::reverse_iterator i = item_weight.rbegin(); - i != item_weight.rend(); - i++) { - s += *i; - sum_weight.push_front(s); - } - weight += w; - assert(weight == s); - } else { - items.push_front(item); - item_weight.push_front(w); - weight += w; - sum_weight.push_front(weight); - } - } - - void adjust_item_weight(int item, float dw) { - // find it - list::iterator p = items.begin(); - list::iterator pw = item_weight.begin(); - list::iterator ps = sum_weight.begin(); - - while (*p != item) { - *ps += dw; - p++; pw++; ps++; // next! 
- assert(p != items.end()); - } - - assert(*p == item); - *pw += dw; - *ps += dw; - } - - - int choose_r(int x, int r, Hash& h) const { - //cout << "linearbucket.choose_r(" << x << ", " << r << ")" << endl; - - list::const_iterator p = items.begin(); - list::const_iterator pw = item_weight.begin(); - list::const_iterator ps = sum_weight.begin(); - - while (p != items.end()) { - const int item = *p; - const float iw = *pw; - const float tw = *ps; - const float f = (float)(h(x, item, r, get_id()) % 10000) * tw / 10000.0; - //cout << "item " << item << " iw = " << iw << " tw = " << tw << " f = " << f << endl; - if (f < iw) { - //cout << "linearbucket.choose_r(" << x << ", " << r << ") = " << item << endl; - return item; - } - p++; pw++; ps++; // next! - } - assert(0); - return 0; - } - - - }; - - - - - // mixed bucket, based on RUSH_T type binary tree - - class TreeBucket : public Bucket { - protected: - //vector item_weight; - - // public: - BinaryTree tree; - map node_item; // node id -> item - vector node_item_vec; // fast version of above - map item_node; // item -> node id - map item_weight; - - public: - TreeBucket(int _type) : Bucket(_type, 0) { } - - TreeBucket(bufferlist& bl, int& off) : Bucket(bl, off) { - tree._decode(bl, off); - - ::_decode(node_item, bl, off); - ::_decode(node_item_vec, bl, off); - ::_decode(item_node, bl, off); - ::_decode(item_weight, bl, off); - } - - void _encode(bufferlist& bl) { - char t = CRUSH_BUCKET_TREE; - bl.append((char*)&t, sizeof(t)); - bl.append((char*)&id, sizeof(id)); - bl.append((char*)&parent, sizeof(parent)); - bl.append((char*)&type, sizeof(type)); - bl.append((char*)&weight, sizeof(weight)); - - tree._encode(bl); - - ::_encode(node_item, bl); - ::_encode(node_item_vec, bl); - ::_encode(item_node, bl); - ::_encode(item_weight, bl); - } - - const char *get_bucket_type() const { return "tree"; } - bool is_uniform() const { return false; } - - int get_size() const { return node_item.size(); } - - // items - void 
get_items(vector& i) const { - for (map::const_iterator it = node_item.begin(); - it != node_item.end(); - it++) - i.push_back(it->second); - } - float get_item_weight(int i) const { - assert(item_weight.count(i)); - return ((map)item_weight)[i]; - } - - - void add_item(int item, float w, bool back=false) { - item_weight[item] = w; - weight += w; - - unsigned n = tree.add_node(w); - node_item[n] = item; - item_node[item] = n; - - while (node_item_vec.size() <= n) - node_item_vec.push_back(0); - node_item_vec[n] = item; - } - - void adjust_item_weight(int item, float dw) { - // adjust my weight - weight += dw; - item_weight[item] += dw; - - // adjust tree weights - tree.adjust_node_weight(item_node[item], dw); - } - - int choose_r(int x, int r, Hash& h) const { - //cout << "mixedbucket.choose_r(" << x << ", " << r << ")" << endl; - int n = tree.root(); - while (!tree.terminal(n)) { - // pick a point in [0,w) - float w = tree.weight(n); - float f = (float)(h(x, n, r, get_id()) % 10000) * w / 10000.0; - - // left or right? - int l = tree.left(n); - if (tree.exists(l) && - f < tree.weight(l)) - n = l; - else - n = tree.right(n); - } - //assert(node_item.count(n)); - //return ((map)node_item)[n]; - return node_item_vec[n]; - } - }; - - - - - - // straw bucket.. new thing! 
- - class StrawBucket : public Bucket { - protected: - map item_weight; - map item_straw; - - list _items; - list _straws; - - public: - StrawBucket(int _type) : Bucket(_type, 0) { } - - StrawBucket(bufferlist& bl, int& off) : Bucket(bl, off) { - ::_decode(item_weight, bl, off); - calc_straws(); - } - - void _encode(bufferlist& bl) { - char t = CRUSH_BUCKET_TREE; - bl.append((char*)&t, sizeof(t)); - bl.append((char*)&id, sizeof(id)); - bl.append((char*)&parent, sizeof(parent)); - bl.append((char*)&type, sizeof(type)); - bl.append((char*)&weight, sizeof(weight)); - - ::_encode(item_weight, bl); - } - - const char *get_bucket_type() const { return "straw"; } - bool is_uniform() const { return false; } - - int get_size() const { return item_weight.size(); } - - - // items - void get_items(vector& i) const { - for (map::const_iterator it = item_weight.begin(); - it != item_weight.end(); - it++) - i.push_back(it->first); - } - float get_item_weight(int item) const { - assert(item_weight.count(item)); - return ((map)item_weight)[item]; - } - - void add_item(int item, float w, bool back=false) { - item_weight[item] = w; - weight += w; - calc_straws(); - } - - void adjust_item_weight(int item, float dw) { - //cout << "adjust " << item << " " << dw << endl; - weight += dw; - item_weight[item] += dw; - calc_straws(); - } - - - /* calculate straw lengths. - this is kind of ugly. not sure if there's a closed form way to calculate this or not! 
- */ - void calc_straws() { - //cout << get_id() << ": calc_straws ============" << endl; - - item_straw.clear(); - _items.clear(); - _straws.clear(); - - // reverse sort by weight; skip zero weight items - map > reverse; - for (map::iterator p = item_weight.begin(); - p != item_weight.end(); - p++) { - //cout << get_id() << ":" << p->first << " " << p->second << endl; - if (p->second > 0) { - //p->second /= minw; - reverse[p->second].insert(p->first); - } - } - - /* 1:2:7 - item_straw[0] = 1.0; - item_straw[1] = item_straw[0]*sqrt(1.0/.6); - item_straw[2] = item_straw[1]*2.0; - */ - - // work from low to high weights - float straw = 1.0; - float numleft = item_weight.size(); - float wbelow = 0.0; - float lastw = 0.0; - - map >::iterator next = reverse.begin(); - //while (next != reverse.end()) { - while (1) { - //cout << "hi " << next->first << endl; - map >::iterator cur = next; - - // set straw length for this set - for (set::iterator s = cur->second.begin(); - s != cur->second.end(); - s++) { - item_straw[*s] = straw; - //cout << "straw " << *s << " w " << item_weight[*s] << " -> " << straw << endl; - _items.push_back(*s); - _straws.push_back(straw); - } - - next++; - if (next == reverse.end()) break; - - wbelow += (cur->first-lastw) * numleft; - //cout << "wbelow " << wbelow << endl; - - numleft -= 1.0 * (float)cur->second.size(); - //cout << "numleft now " << numleft << endl; - - float wnext = numleft * (next->first - cur->first); - //cout << "wnext " << wnext << endl; - - float pbelow = wbelow / (wbelow+wnext); - //cout << "pbelow " << pbelow << endl; - - straw *= pow((double)(1.0/pbelow), (double)1.0/numleft); - - lastw = cur->first; - } - //cout << "============" << endl; - } - - int choose_r(int x, int r, Hash& h) const { - //cout << "strawbucket.choose_r(" << x << ", " << r << ")" << endl; - - float high_draw = -1; - int high = 0; - - list::const_iterator pi = _items.begin(); - list::const_iterator ps = _straws.begin(); - while (pi != _items.end()) { - 
const int item = *pi; - const float rnd = (float)(h(x, item, r) % 1000000) / 1000000.0; - const float straw = *ps * rnd; - - if (high_draw < 0 || - straw > high_draw) { - high = *pi; - high_draw = straw; - } - - pi++; - ps++; - } - return high; - } - }; - - - - - - inline Bucket* decode_bucket(bufferlist& bl, int& off) { - char t; - bl.copy(off, sizeof(t), (char*)&t); - off += sizeof(t); - - switch (t) { - case CRUSH_BUCKET_UNIFORM: - return new UniformBucket(bl, off); - case CRUSH_BUCKET_LIST: - return new ListBucket(bl, off); - case CRUSH_BUCKET_TREE: - return new TreeBucket(bl, off); - case CRUSH_BUCKET_STRAW: - return new StrawBucket(bl, off); - default: - assert(0); - } - return 0; - } - - - -} - - - - - - - - -#endif diff --git a/branches/marnberg/quota/crush/Hash.h b/branches/marnberg/quota/crush/Hash.h deleted file mode 100644 index a321624925d95..0000000000000 --- a/branches/marnberg/quota/crush/Hash.h +++ /dev/null @@ -1,300 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -// Robert Jenkins' function for mixing 32-bit values -// http://burtleburtle.net/bob/hash/evahash.html -// a, b = random bits, c = input and output -#define hashmix(a,b,c) \ - a=a-b; a=a-c; a=a^(c>>13); \ - b=b-c; b=b-a; b=b^(a<<8); \ - c=c-a; c=c-b; c=c^(b>>13); \ - a=a-b; a=a-c; a=a^(c>>12); \ - b=b-c; b=b-a; b=b^(a<<16); \ - c=c-a; c=c-b; c=c^(b>>5); \ - a=a-b; a=a-c; a=a^(c>>3); \ - b=b-c; b=b-a; b=b^(a<<10); \ - c=c-a; c=c-b; c=c^(b>>15); - -namespace crush { - - class Hash { - int seed; - - public: - int get_seed() { return seed; } - void set_seed(int s) { seed = s; } - - Hash(int s) { - unsigned int hash = 1315423911; - int x = 231232; - int y = 1232; - hashmix(s, x, hash); - hashmix(y, s, hash); - seed = s; - } - - inline int operator()(int a) { - unsigned int hash = seed ^ a; - int b = a; - int x = 231232; - int y = 1232; - hashmix(b, x, hash); - hashmix(y, a, hash); - return (hash & 0x7FFFFFFF); - } - - inline int operator()(int a, int b) { - unsigned int hash = seed ^ a ^ b; - int x = 231232; - int y = 1232; - hashmix(a, b, hash); - hashmix(x, a, hash); - hashmix(b, y, hash); - return (hash & 0x7FFFFFFF); - } - - inline int operator()(int a, int b, int c) { - unsigned int hash = seed ^ a ^ b ^ c; - int x = 231232; - int y = 1232; - hashmix(a, b, hash); - hashmix(c, x, hash); - hashmix(y, a, hash); - hashmix(b, x, hash); - hashmix(y, c, hash); - return (hash & 0x7FFFFFFF); - } - - inline int operator()(int a, int b, int c, int d) { - unsigned int hash = seed ^a ^ b ^ c ^ d; - int x = 231232; - int y = 1232; - hashmix(a, b, hash); - hashmix(c, d, hash); - hashmix(a, x, hash); - hashmix(y, b, hash); - hashmix(c, x, hash); - hashmix(y, d, hash); - return (hash & 0x7FFFFFFF); - } - - inline int operator()(int a, int b, int c, int d, int e) { - unsigned int hash = seed ^ a ^ b ^ c ^ d ^ e; - int x = 231232; - int y = 1232; - hashmix(a, b, hash); - hashmix(c, d, hash); - hashmix(e, x, hash); - hashmix(y, a, hash); - hashmix(b, x, hash); - 
hashmix(y, c, hash); - hashmix(d, x, hash); - hashmix(y, e, hash); - return (hash & 0x7FFFFFFF); - } - }; - -} - - - -#if 0 - - - //return myhash(a) ^ seed; - return myhash(a, seed); - } - int operator()(int a, int b) { - //return myhash( myhash(a) ^ myhash(b) ^ seed ); - return myhash(a, b, seed); - } - int operator()(int a, int b, int c) { - //return myhash( myhash(a ^ seed) ^ myhash(b ^ seed) ^ myhash(c ^ seed) ^ seed ); - return myhash(a, b, c, seed); - } - int operator()(int a, int b, int c, int d) { - //return myhash( myhash(a ^ seed) ^ myhash(b ^ seed) ^ myhash(c ^ seed) ^ myhash(d ^ seed) ^ seed ); - return myhash(a, b, c, d, seed); - } - - // ethan's rush hash? - if (0) - return (n ^ 0xdead1234) * (884811920 * 3 + 1); - - if (1) { - - // before - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - - //return hash; - return (hash & 0x7FFFFFFF); - } - - // JS - // a little better than RS - // + jenkin's mixing thing (which sucks on its own but helps tons here) - // best so far - if (1) { - unsigned int hash = 1315423911; - int a = 231232; - int b = 1232; - - for(unsigned int i = 0; i < 4; i++) - { - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - - // Robert jenkins' 96 bit mix - // sucks - if (0) { - int c = n; - int a = 12378912; - int b = 2982827; - a=a-b; a=a-c; a=a^(c>>13); - b=b-c; b=b-a; b=b^(a<<8); - c=c-a; c=c-b; c=c^(b>>13); - a=a-b; a=a-c; a=a^(c>>12); - b=b-c; b=b-a; b=b^(a<<16); - c=c-a; c=c-b; c=c^(b>>5); - a=a-b; a=a-c; a=a^(c>>3); - b=b-c; b=b-a; b=b^(a<<10); - c=c-a; c=c-b; c=c^(b>>15); - return c; - } - // robert jenkins 32-bit - // sucks - if (0) { - n += (n << 12); - n ^= 
(n >> 22); - n += (n << 4); - n ^= (n >> 9); - n += (n << 10); - n ^= (n >> 2); - n += (n << 7); - n ^= (n >> 12); - return n; - } - - // djb2 - if (0) { - unsigned int hash = 5381; - for (int i=0; i<4; i++) { - hash = ((hash << 5) + hash) + ((n&255) ^ 123); - n = n >> 8; - } - return hash; - } - - - // SDBM - if (1) { - unsigned int hash = 0; - - for(unsigned int i = 0; i < 4; i++) - { - hash = (n&255) + (hash << 6) + (hash << 16) - hash; - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - // PJW - // horrid - if (0) { - unsigned int BitsInUnsignedInt = (unsigned int)(sizeof(unsigned int) * 8); - unsigned int ThreeQuarters = (unsigned int)((BitsInUnsignedInt * 3) / 4); - unsigned int OneEighth = (unsigned int)(BitsInUnsignedInt / 8); - unsigned int HighBits = (unsigned int)(0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth); - unsigned int hash = 0; - unsigned int test = 0; - - for(unsigned int i = 0; i < 4; i++) - { - hash = (hash << OneEighth) + (n&255); - - if((test = hash & HighBits) != 0) - { - hash = (( hash ^ (test >> ThreeQuarters)) & (~HighBits)); - } - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - // RS Hash function, from Robert Sedgwicks Algorithms in C book, w/ some changes. - if (0) { - unsigned int b = 378551; - unsigned int a = 63689; - unsigned int hash = 0; - - for(unsigned int i=0; i<4; i++) - { - hash = hash * a + (n&0xff); - a = a * b; - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - // DJB - // worse than rs - if (0) { - unsigned int hash = 5381; - - for(unsigned int i = 0; i < 4; i++) - { - hash = ((hash << 5) + hash) + (n&255); - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - // AP - // even worse - if (1) { - unsigned int hash = 0; - - for(unsigned int i = 0; i < 4; i++) - { - hash ^= ((i & 1) == 0) ? 
( (hash << 7) ^ (n&255) ^ (hash >> 3)) : - (~((hash << 11) ^ (n&255) ^ (hash >> 5))); - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - -#endif diff --git a/branches/marnberg/quota/crush/crush.h b/branches/marnberg/quota/crush/crush.h deleted file mode 100644 index aa93031beb51e..0000000000000 --- a/branches/marnberg/quota/crush/crush.h +++ /dev/null @@ -1,534 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __crush_CRUSH_H -#define __crush_CRUSH_H - -#include -#include -#include -#include -#include -using namespace std; -#include -#include -using namespace __gnu_cxx; - - -#include "Bucket.h" - -#include "include/buffer.h" - - -namespace crush { - - - // *** RULES *** - - class RuleStep { - public: - int cmd; - vector args; - - RuleStep(int c) : cmd(c) {} - RuleStep(int c, int a) : cmd(c) { - args.push_back(a); - } - RuleStep(int c, int a, int b) : cmd(c) { - args.push_back(a); - args.push_back(b); - } - RuleStep(int o, int a, int b, int c) : cmd(o) { - args.push_back(a); - args.push_back(b); - args.push_back(c); - } - - void _encode(bufferlist& bl) { - bl.append((char*)&cmd, sizeof(cmd)); - ::_encode(args, bl); - } - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(cmd), (char*)&cmd); - off += sizeof(cmd); - ::_decode(args, bl, off); - } - }; - - - // Rule operations - const int CRUSH_RULE_TAKE = 0; - const int CRUSH_RULE_CHOOSE = 1; // first n by default - const int CRUSH_RULE_CHOOSE_FIRSTN = 1; - const int CRUSH_RULE_CHOOSE_INDEP = 2; - const int CRUSH_RULE_EMIT = 3; - - class Rule { - public: - vector< RuleStep > steps; - - void _encode(bufferlist& bl) { - int n = steps.size(); - 
bl.append((char*)&n, sizeof(n)); - for (int i=0; i buckets; - int bucketno; - Hash h; - - hash_map parent_map; // what bucket each leaf/bucket lives in - - public: - map rules; - - //map collisions; - //map bumps; - - void _encode(bufferlist& bl) { - // buckets - int n = buckets.size(); - bl.append((char*)&n, sizeof(n)); - for (map::const_iterator it = buckets.begin(); - it != buckets.end(); - it++) { - bl.append((char*)&it->first, sizeof(it->first)); - it->second->_encode(bl); - } - bl.append((char*)&bucketno, sizeof(bucketno)); - - // hash - int s = h.get_seed(); - bl.append((char*)&s, sizeof(s)); - - //::_encode(out, bl); - //::_encode(overload, bl); - - // rules - n = rules.size(); - bl.append((char*)&n, sizeof(n)); - for(map::iterator it = rules.begin(); - it != rules.end(); - it++) { - bl.append((char*)&it->first, sizeof(it->first)); - it->second._encode(bl); - } - - } - - void _decode(bufferlist& bl, int& off) { - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i::iterator bp = buckets.begin(); - bp != buckets.end(); - ++bp) { - // index bucket items - vector items; - bp->second->get_items(items); - for (vector::iterator ip = items.begin(); - ip != items.end(); - ++ip) - parent_map[*ip] = bp->first; - } - } - - - - public: - Crush(int seed=123) : bucketno(-1), h(seed) {} - ~Crush() { - // hose buckets - for (map::iterator it = buckets.begin(); - it != buckets.end(); - it++) { - delete it->second; - } - } - - int print(ostream& out, int root, int indent=0) { - for (int i=0; iget_weight() << "\t" << b->get_id() << "\t"; - for (int i=0; iget_bucket_type() << ": "; - - vector items; - b->get_items(items); - - if (buckets.count(items[0])) { - out << endl; - for (unsigned i=0; iset_id(n); - buckets[n] = b; - return n; - } - - void add_item(int parent, int item, float w, bool back=false) { - // add item - assert(!buckets[parent]->is_uniform()); - Bucket *p = buckets[parent]; - - p->add_item(item, w, back); - - // set item's parent - 
Bucket *n = buckets[item]; - if (n) - n->set_parent(parent); - - // update weights - while (buckets.count(p->get_parent())) { - int child = p->get_id(); - p = buckets[p->get_parent()]; - p->adjust_item_weight(child, w); - } - } - - - /* - this is a hack, fix me! weights should be consistent throughout hierarchy! - - */ - void set_bucket_weight(int item, float w) { - Bucket *b = buckets[item]; - float adj = w - b->get_weight(); - - while (buckets.count(b->get_parent())) { - Bucket *p = buckets[b->get_parent()]; - p->adjust_item_weight(b->get_id(), adj); - b = p; - } - } - - - /* - * choose numrep distinct items of type type - */ - void choose(int x, - int numrep, - int type, - Bucket *inbucket, - vector& outvec, - bool firstn, - set& outset, map& overloadmap, - bool forcefeed=false, - int forcefeedval=-1) { - int off = outvec.size(); - - // for each replica - for (int rep=0; repis_uniform()) { - // uniform bucket; be careful! - if (firstn || numrep >= in->get_size()) { - // uniform bucket is too small; just walk thru elements - r += ftotal; // r' = r + f_total (first n) - } else { - // make sure numrep is not a multple of bucket size - int add = numrep*flocal; // r' = r + n*f_local - if (in->get_size() % numrep == 0) { - add += add/in->get_size(); // shift seq once per pass through the bucket - } - r += add; - } - } else { - // mixed bucket; just make a distinct-ish r sequence - if (firstn) - r += ftotal; // r' = r + f_total - else - r += numrep * flocal; // r' = r + n*f_local - } - - // choose - outv = in->choose_r(x, r, h); - - // did we get the type we want? - int itemtype = 0; // 0 is terminal type - Bucket *newin = 0; // remember bucket we hit - if (in->is_uniform()) { - itemtype = ((UniformBucket*)in)->get_item_type(); - } else { - if (buckets.count(outv)) { // another bucket - newin = buckets[outv]; - itemtype = newin->get_type(); - } - } - if (itemtype == type) { // this is what we want! - // collision? 
- bool collide = false; - for (int prep=0; prep overloadmap[outv]) - bad = true; - } - - if (collide || bad) { - ftotal++; - flocal++; - - if (collide && flocal < 3) - continue; // try locally a few times! - - if (ftotal >= 10) { - // ok fine, just ignore dup. FIXME. - skip_rep = true; - break; - } - - retry_rep = true; - } - - break; // ok then! - } - - // next - in = newin; - } - - if (retry_rep) continue; // try again - - break; - } - - // skip this rep? (e.g. too many collisions, we give up) - if (skip_rep) continue; - - // output this value - outvec.push_back(outv); - } // for rep - - // double check! - if (0) { - for (unsigned i=1; i& result, - set& outset, map& overloadmap, - int forcefeed=-1) { - //int numresult = 0; - result.clear(); - - // determine hierarchical context for first. - list force_stack; - if (forcefeed >= 0) { - int t = forcefeed; - while (1) { - force_stack.push_front(t); - if (parent_map.count(t) == 0) break; // reached root, presumably. - //cout << " " << t << " parent is " << parent_map[t] << endl; - t = parent_map[t]; - } - } - - // working vector - vector w; // working variable - - // go through each statement - for (vector::iterator pc = rule.steps.begin(); - pc != rule.steps.end(); - pc++) { - // move input? - - // do it - switch (pc->cmd) { - case CRUSH_RULE_TAKE: - { - const int arg = pc->args[0]; - //cout << "take " << arg << endl; - - if (!force_stack.empty()) { - int forceval = force_stack.front(); - force_stack.pop_front(); - assert(arg == forceval); - } - - w.clear(); - w.push_back(arg); - } - break; - - case CRUSH_RULE_CHOOSE_FIRSTN: - case CRUSH_RULE_CHOOSE_INDEP: - { - const bool firstn = pc->cmd == CRUSH_RULE_CHOOSE_FIRSTN; - const int numrep = pc->args[0]; - const int type = pc->args[1]; - - //cout << "choose " << numrep << " of type " << type << endl; - - assert(!w.empty()); - - // reset output - vector out; - - // forcefeeding? 
- bool forcing = false; - int forceval; - if (!force_stack.empty()) { - forceval = force_stack.front(); - force_stack.pop_front(); - //cout << "priming out with " << forceval << endl; - forcing = true; - } - - // do each row independently - for (vector::iterator i = w.begin(); - i != w.end(); - i++) { - assert(buckets.count(*i)); - Bucket *b = buckets[*i]; - choose(x, numrep, type, b, out, firstn, - outset, overloadmap, - forcing, - forceval); - forcing = false; // only once - } // for inrow - - // put back into w - w.swap(out); - out.clear(); - } - break; - - case CRUSH_RULE_EMIT: - { - for (unsigned i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "client/SyntheticClient.h" -#include "client/Client.h" -#include "client/fuse.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - -#ifndef DARWIN -#include -#endif // DARWIN - -#include -#include -#include - -int main(int argc, char **argv, char *envp[]) { - - //cerr << "cfuse starting " << myrank << "/" << world << endl; - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - parse_syn_options(args); // for SyntheticClient - - // args for fuse - vec_to_argv(args, argc, argv); - - // load monmap - MonMap monmap; - int r = monmap.read(".ceph_monmap"); - assert(r >= 0); - - // start up network - rank.start_rank(); - - list clients; - list synclients; - - cout << "mounting and starting " << g_conf.num_client << " syn client(s)" << endl; - for (int i=0; iinit(); - - // start syntheticclient - SyntheticClient *syn = new SyntheticClient(client); - - client->mount(); - - syn->start_thread(); - - clients.push_back(client); - synclients.push_back(syn); - } - - cout << "waiting for client(s) to finish" << endl; - while 
(!clients.empty()) { - Client *client = clients.front(); - SyntheticClient *syn = synclients.front(); - clients.pop_front(); - synclients.pop_front(); - - // wait - syn->join_thread(); - - // unmount - client->unmount(); - client->shutdown(); - - delete syn; - delete client; - } - - // wait for messenger to finish - rank.wait(); - - return 0; -} - diff --git a/branches/marnberg/quota/doc/Commitdir.txt b/branches/marnberg/quota/doc/Commitdir.txt deleted file mode 100644 index 83c89bdcaef4a..0000000000000 --- a/branches/marnberg/quota/doc/Commitdir.txt +++ /dev/null @@ -1,22 +0,0 @@ - -How Directory Committing Works: - -Each CDir has: - version - current version of directory - committing_version - which version was sent to stable storage - last_committed_version - last version to be safely stored - -Each Inode has: - parent_dir_version - what dir version i was in when i was dirtied. (*) - - (*) note that if you change an inode, mark_dirty() again, even if it's already dirty! - - -How committing works: - -A call to commit_dir(dir, context) will ensure tha the _current_ version is stored safely on disk before the context is finished. - -When a commit completes, inodes in the directory are checked. If they are dirty and belonged to the _committed_ (or earlier) version, then they are marked clean. If they belong to a newer version, then they are _still dirty_. - - - diff --git a/branches/marnberg/quota/doc/Replication.txt b/branches/marnberg/quota/doc/Replication.txt deleted file mode 100644 index 0f8d4c9079e4d..0000000000000 --- a/branches/marnberg/quota/doc/Replication.txt +++ /dev/null @@ -1,19 +0,0 @@ - -Primary copy replication. - -Inodes: - -- The primary's list of replicas (cached_by) is inclusive at all times. -- The primary's list never includes the local node. -- The primary's list of replicas will only include non-replicas when the relevant CacheExpire notifications are in-flight. 
- -- Replicas can be created in two ways: - - via a Discover + DiscoverReply - - via an export and import. (The old auth keeps a copy, and adds itself to the replica list as it exports.) - - -Directories (and their dentries): - -- The primary has an open_by list that is inclusive at all times. -- ..Never includes local node -- No per-dentry replica lists. All dentry lock operations (for unlink, etc.) are sent to all nodes in open_by list. \ No newline at end of file diff --git a/branches/marnberg/quota/doc/caching.txt b/branches/marnberg/quota/doc/caching.txt deleted file mode 100644 index 77b02480bcd6e..0000000000000 --- a/branches/marnberg/quota/doc/caching.txt +++ /dev/null @@ -1,200 +0,0 @@ - - -AUTHORITY - -The authority maintains a list of what nodes cache each inode. -Additionally, each replica is assigned a serial (normally 0) to -disambiguate multiple replicas of the same item (see below). - - set cached_by; - map cached_by_serial; - -The cached_by set _always_ includes all nodes that cache the -particular inode, but may additionally include nodes that used to -cache it but no longer do. In those cases, an expire message should -be in transit. - - -REPLICA - -The replica maintains a notion of who it believes is the authority for -each replicated inode. There are two possibilities: - - - Ordinarily, this notion is correct. - - If the part of the file system in question was recently exported to - a new MDS, the inode's old authority is acting as a CACHEPROXY, - and will forward relevant messages on to the authority. - -When a replica is expired from cache, an expire is sent to the -authority. The expire includes the serial number issued when the -replica was originally created to disambiguate potentially concurrent -replication activity. - - -EXPORTS - -- The old authority suddenly becomes a replica. Its serial is well - defined. It also becomes a CACHEPROXY, which means its cached_by - remains defined (with an alternate meaning!). 
While a proxy, the - node will forward relevant messages from the replica to the - authority (but not the other way around--the authority knows all - replicas). - -- Once the export is acked, the old authority sends a - message to the replica notifying it of the new authority. As soon - as all replicas acknowedge receipt of this notice, the old authority - can cease CACHEPROXY responsibilities and become a regular replica. - At this point it's cached_by is no longer defined. - -- Replicas always know who the authority for the inode is, OR they - know prior owner acting as a CACHEPROXY. (They don't know which it - is.) - - -CACHED_BY - -The authority always has an inclusive list of nodes who cache an item. -As such it can confidently send updates to replicas for locking, -invalidating, etc. When a replica is expired from cache, an expire is -sent to the authority. If the serial matches, the node is removed -from the cached_by list. - - - - - -SUBTREE AUTHORITY DELEGATION: imports versus hashing - -Authority is generally defined recursively: an inode's authority -matches the containing directory, and a directory's authority matches -the directory inode's. Thus the authority delegation chain can be -broken/redefined in two ways: - - - Imports and exports redefine the directory inode -> directory - linkage, such that the directory authority is explicitly specified - via dir.dir_auth: - - dir.dir_auth == -1 -> directory matches its inode - dir.dir_auth >= 0 -> directory authority is dir.dir_auth - - - Hashed directories redefine the directory -> inode linkage. In - non-hashed directories, inodes match their containing directory. - In hashed directories, each dentry's authority is defined by a hash - function. 
- - inode.hash_seed == 0 -> inode matches containing directory - inode.hash_seed > 0 -> defined by hash(hash_seed, dentry) - -A directory's "containing_import" (bad name, FIXME) is either the -import or hashed directory that is responsible for delegating a -subtree. Note that the containing_import of a directory may be itself -because it is an import, but it cannot be itself because it is hashed. - -Thus: - - - Import and export operations' manipulation of dir_auth is - completely orthogonal to hashing operations. Hashing methods can - ignore dir_auth, except when they create imports/exports (and break - the inode<->dir auth linkage). - - - Hashdirs act sort of like imports in that they bound an - authoritative region. That is, either hashdirs or imports can be - the key for nested_exports. In some cases, a dir may be both an - import and a hash. - - - Export_dir won't export a hashdir. This is because it's tricky - (tho not necessarily impossible) due to the way nested_exports is - used with imports versus hashdirs. - - - - -FREEZING - -There are two types of freezing: - - - TREE: recursively freezes everything nested beneath a directory, - until an export of edge of cache is reached. - - DIR: freezes the contents of a single directory. - -Some notes: - - - Occurs on the authoritative node only. - - - Used for suspending critical operations while migrating authority - between nodes or hashing/unhashing directories. - - - Freezes the contents of the cache such that items may not be added, - items cannot be auth pinned, and/or subsequently reexported. The - namespace of the affected portions of the hierarchy may not change. - The content of inodes and other orthogonal operations - (e.g. replication, inode locking and modification) are unaffected. - -Two states are defined: freezing and frozen. The freezing state is -used while waiting for auth_pins to be removed. Once all auth_pins -are gone, the state is changed to frozen. 
New auth_pins cannot be -added while freezing or frozen. - - -AUTH PINS - -An auth pin keeps a given item on the authoritative node until it is -removed. The pins are tracked recursively, so that a subtree cannot -be frozen if it contains any auth pins. - -If a pin is placed on a non-authoritative item, the item is allowed to -become authoritative; the specific restriction is it cannot be frozen, -which only happens during export-type operations. - - -TYPES OF EXPORTS - -- Actual export of a subtree from one node to another -- A rename between directories on different nodes exports the renamed -_inode_. (If it is a directory, it becomes an export such that the -directory itself does not move.) -- A hash or unhash operation will migrate inodes within the directory -either to or from the directory's main authority. - -EXPORT PROCESS - - - - -HASHING - -- All nodes discover and open directory - -- Prep message distributes subdir inode replicas for exports so that - peers can open those dirs. This is necessary because subdirs are - converted into exports or imports as needed to avoid migrating - anything except the hashed dir itself. The prep is needed for the - same reasons its important with exports: the inode authority must - always have the exported dir open so that it gets accurate dir - authority updates, and can keep the inode->dir_auth up to date. - -- MHashDir messsage distributes the directory contents. - -- While auth is frozen_dir, we can't get_or_open_dir. Otherwise the - Prep messages won't be inclusive of all dirs, and the - imports/exports won't get set up properly. - -TODO -readdir - - -- subtrees stop at hashed dir. hashed dir's dir_auth follows parent - subtree, unless the dir is also an explicit import. thus a hashed - dir can also be an import dir. 
- - -bananas -apples -blueberries -green pepper -carrots -celery - - - - diff --git a/branches/marnberg/quota/doc/dentries.txt b/branches/marnberg/quota/doc/dentries.txt deleted file mode 100644 index ab14765998b2f..0000000000000 --- a/branches/marnberg/quota/doc/dentries.txt +++ /dev/null @@ -1,4 +0,0 @@ - -null dentires only exist - - on auth - - on replica, if they are xlock \ No newline at end of file diff --git a/branches/marnberg/quota/doc/file_modes.txt b/branches/marnberg/quota/doc/file_modes.txt deleted file mode 100644 index d4ceba4034e5f..0000000000000 --- a/branches/marnberg/quota/doc/file_modes.txt +++ /dev/null @@ -1,66 +0,0 @@ - -underlying client capabilities: - -- read + cache -- read sync -- write sync -- write + buffer - (...potentially eventually augmented by byte ranges) - -whatever system of modes, tokens, etc. has to satisfy the basic -constraint that no conflicting capabilities are ever in the -hands of clients. - - -questions: -- is there any use to clients writing to a replica? - - reading, yes.. 100,000 open same file.. - - ------- - -simplest approach: -- all readers, writers go through authority -- all open, close traffic at replicas forwarded to auth - -- fh state migrates with exports. - - - --------- - -less simple: -- all writers go through authority - - open, close traffic fw -- readers from any replica - - need token from auth -- weird auth <-> replica <-> client interactions ensue! - - --------- - -even more complex (and totally FLAWED, ignore this!) - -- clients can open a file with any replica (for read or write). -- replica gets a read or write token from the primary - - primary thus knows if it's all read, all write, mixed, or none. -- once replica has a token it can service as many clients (of given type(s)) as it wants. -- on export, tokens are moved too. - - primary give _itself_ a token too! much simpler. 
- -- clients maintain a mode for each open file: rdonly, wronly, rdwr, lock -- globally, the mode is controlled by the primary, based on the mixture of - read and write tokens issued - - - -- [optional] if a client has a file open rdwr and the mode is rdonly or wronly, it can - request to read or write from the mds (which might twiddle the mode for performance - reasons.. e.g. lots of ppl rdwr but no actual reading) - - - - --------- - - diff --git a/branches/marnberg/quota/doc/header.txt b/branches/marnberg/quota/doc/header.txt deleted file mode 100644 index 8a3c51280461d..0000000000000 --- a/branches/marnberg/quota/doc/header.txt +++ /dev/null @@ -1,12 +0,0 @@ -// -*- mode:C++; tab-width:4; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ diff --git a/branches/marnberg/quota/doc/journal.txt b/branches/marnberg/quota/doc/journal.txt deleted file mode 100644 index 22cb4fc9e21b2..0000000000000 --- a/branches/marnberg/quota/doc/journal.txt +++ /dev/null @@ -1,124 +0,0 @@ - - -- LogEvent.replay() is idempotent. we won't know whether the update is old or not. - - - - - - - - - - - - - - - -journal is distributed among different nodes. because authority changes over time, it's not immedicatley clear to a recoverying node relaying the journal whether the data is "real" or not (it might be exported later in the journal). - - -possibilities: - - -ONE.. bloat the journal! - -- journal entry includes full trace of dirty data (dentries, inodes) up until import point - - local renames implicit.. cache is reattached on replay - - exports are a list of exported dirs.. which are then dumped - ... 
- -recovery phase 1 -- each entry includes full trace (inodes + dentries) up until the import point -- cache during recovery is fragmetned/dangling beneath import points -- when export is encountered items are discarded (marked clean) - -recovery phase 2 -- import roots ping store to determine attachment points (if not already known) - - if it was imported during period, attachment point is already known. - - renames affecting imports are logged too -- import roots discovered from other nodes, attached to hierarchy - -then -- maybe resume normal operations -- if recovery is a background process on a takeover mds, "export" everything to that node. - - --> journal contains lots of clean data.. maybe 5+ times bigger as a result! - -possible fixes: - - collect dir traces into journal chunks so they aren't repeated as often - - each chunk summarizes traces in previous chunk - - hopefully next chunk will include many of the same traces - - if not, then the entry will include it - - - - -=== log entry types === -- all inode, dentry, dir items include a dirty flag. -- dirs are implicitly _never_ complete; even if they are, a fetch before commit is necessary to confirm - -ImportPath - log change in import path -Import - log import addition (w/ path, dirino) - -InoAlloc - allocate ino -InoRelease - release ino - -Inode - inode info, along with dentry+inode trace up to import point -Unlink - (null) dentry + trace, + flag (whether inode/dir is destroyed) -Link - (new) dentry + inode + trace - - ------------------------------ - -TWO.. -- directories in store contain path at time of commit (relative to import, and root) -- replay without attaching anything to heirarchy -- after replay, directories pinged in store to attach to hierarchy - --> phase 2 too slow! --> and nested dirs may reattach... that won't be apparent from journal. - - put just parent dir+dentry in dir store.. even worse on phase 2! 
- - -THREE -- - - - - - - - -metadata journal/log - - -event types: - -chown, chmod, utime - InodeUpdate - -mknod, mkdir, symlink - Mknod .. new inode + link - -unlink, rmdir - Unlink - -rename - Link + Unlink (foreign) -or Rename (local) - -link - Link .. link existing inode - - - - -InodeUpdate -DentryLink -DentryUnlink -InodeCreate -InodeDestroy -Mkdir? diff --git a/branches/marnberg/quota/doc/osd_outline.txt b/branches/marnberg/quota/doc/osd_outline.txt deleted file mode 100644 index 2c6f3287aac5f..0000000000000 --- a/branches/marnberg/quota/doc/osd_outline.txt +++ /dev/null @@ -1,37 +0,0 @@ - -intro - -osd cluster map - requirements - desireable properties - (c)rush - -failure detection - distributed ping or heartbeat - central filter, notifier - -design - placement seed, class/superset, groups - -normal operation - reads - writes - -recovery - triggers: failed disk, or total cluster reorganization - - notify - peering - pull - push - clean - -writes during recovery - -graceful data loss + recovery? - - - - - - diff --git a/branches/marnberg/quota/doc/osd_replication.txt b/branches/marnberg/quota/doc/osd_replication.txt deleted file mode 100644 index 907d00e2050a2..0000000000000 --- a/branches/marnberg/quota/doc/osd_replication.txt +++ /dev/null @@ -1,226 +0,0 @@ - - -SOME GENERAL REQUIREMENTS - -- cluster expansion: - - any or all of the replicas may move to new OSDs. - -- cluster map may change frequently - - map change should translate into pending replication/migration - state quickly (or better yet, instantly), so that we could push - through a series of (say, botched) maps quickly and be fine, so long - as the final map is correct. - -- ideally, unordered osd<->osd, client<->osd communication - (mds<->mds, client<->mds communication is ordered, but file i/o - would be too slow that way?) - - - - -PRIMARY ONLY PICTURE - -let's completely ignore replication for a while, and see how -complicated the picture needs to be to reliably support cluster expansion. 
- -typedef __uint64_t version_t; - - -per-Object metadata: -- version #. incremented when an object is modified. - e.g. version_t version; -- on primary, keep list of stray replicas - e.g. map stray_replicas; // osds w/ stray replicas - includes old primary osd(s), until deletion is confirmed. used while rg - is importing. - - -per-RG metadata -- object list. well, a method to fetch it by querying a collection or whatever. -- negative list - e.g. map deleted_objects; - - used to enumerate deleted objects, when in "importing" state. -- a RG "state" (enum/int) - - - - - - -Normal RG state: -- role=primary - clean - i am primary, all is well. no stray copies. i can - discard my negative object list, since my local - object store tells me everything. - - -After a map change: -- new primary - undef - initially; i don't know RG exists. -- old primary - homeless - i was primary, still have unmolested data. new primary is not yet migrating - (presumably it's state=undef.) i need to contact new primary and tell them - this RG exists. - -- new primary - importing - i am migrating data from old primary. keep negative dir entries for deletions. - write locally. proxy reads (force immediately migration). do whole objects - initially (on write, block until i migrate the object). later we can do - sub-object state (where "live" object data is spread across new/old primaries.. -- old primary - exporting - primary is migrating my data. - undef - when it finishes. (i will forget this RG existed.) - - -After a second map change (scenario 1): - as above, if we were clean again. - -After a second map change (scenario 2): - we weren't clean yet. -- new primary - undef - initially (until i learn RG exists) -- old primary - importing - i'm still migrating from old old primary -- old old primary - exporting - ... -- old primary -?? importing+exporting - proxy reads as before. continue migrating from old old primary. 
- - -After a second map change (scenario 3): - we weren't clean yet, and old old primary is also new primary -- new primary (and old old primary) - exporting - change state to importing. be sure to compare object versions, and neg dir - entries (as we always should do, really!). -- old primary - importing - note that the old import source matches new primary, and change - state to exporting, and stop importing. (unlike scenario 2) - --> this approach could mean that a series of fast map changes could - force data to migrate down a "chain" of old primaries to reach the - new one. maybe old primary should go from importing -> exporting, - and pass along old old primary id to new primary such that the - import is a many-to-one thing, instead of one-to-one. version - numbers and neg entries will make it easy to pick out correct versions. - - - -For the importing process on a given RG: - -- metadata for each source - - each source has a state: - 'starting' - don't know anything about source yet. query source! - this probaby induces the source to change from - 'homeless' or something similar to 'exporting'. - 'importing' - i've fetched the source's object list (and neg - object list). i'm busy reading them! these lists - will shrink as the process continues. after i fetch - an object, i will erase it from the source. - (object metadata will include stray copy info - until i confirm that its removed.) - 'finishing' - i've read all my data, and i'm telling the old person - to discard any remaining RG metadata (RG contents - should already be gone) - - unmigrated object list - - migrated but not deleted object list - - stray osd is also listed in per-object MD during this stage - - negative object list - - i can remove these items if i see a newer object version (say, - from a different import source or something). 
- - i can remove any local objects or ignore imported ones if it is - older than deleted version - -- the lists should be sets or otherwise queryable so that while i'm - importing and a real op comes through I can quickly determine if a - given object_id is pending migration etc or if my local store is to - be trusted. - - - - - -SOME CODE BITS - - -typedef __uint64_t version_t; -class Object { - version_t version; - map stray_replicas; -}; - - -class ReplicaGroup { - int enumerate_objects(list& ls); - - int state; - - // for unstable states, - map deleted_objects; // locally - map exporters; // importing from these guys. -}; - -// primary -#define RG_STATE_CLEAN 1 -#define RG_STATE_IMPORTING 2 // pulling data - -// non-primary -#define RG_STATE_HOMELESS 5 // old primary; new primary not yet - // notified; not yet exporting. -#define RG_STATE_EXPORTING 6 // a newer primary is extracting my - // data. - - -struct RGExporter_t { - int import_state; - - set remaining_objects; // remote object list - set stray_objects; // imported but not deleted. - -}; - - - - - ----- -all crap from here on down - - - - -REPLICAS -- - - - - -OSD STATES -- primary, up to date. -- replica, up to date. - -- primary, proxy to old primary (primaries?) - -- replica, not up to date. - - -REPLICATION STUFF - -Per-RG metadata -- primary - - per-replica state: clean, catching up? -- replica - -Per-object metadata -- primary and replica - - version number/mtime - - rg (reverse indexed) -- primary - - replication level and state. - - commited to memory and/or disk, on which replicas (#1, #2, etc.) -- replica - - - - - --> \ No newline at end of file diff --git a/branches/marnberg/quota/doc/performance.txt b/branches/marnberg/quota/doc/performance.txt deleted file mode 100644 index 7ca278bd284b1..0000000000000 --- a/branches/marnberg/quota/doc/performance.txt +++ /dev/null @@ -1,36 +0,0 @@ - - -quick performance test 2005-05-11. 
fakemds, 100x100, asdf/asdf, debug 13 - -g marshalling -real 3m8.697s -user 2m53.282s -sys 0m6.291s - -real 3m3.337s -user 2m49.467s -sys 0m6.243s - - -g no marshalling -real 2m1.464s -user 1m42.680s -sys 0m8.128s - -real 1m49.469s -user 1m34.523s -sys 0m6.410s - - -O3 marshalling -real 1m29.833s -user 1m11.474s -sys 0m7.588s - -real 1m9.439s -user 0m56.071s -sys 0m5.643s - - - -O3 no marshalling -real 1m2.739s -user 0m46.578s -sys 0m7.882s - diff --git a/branches/marnberg/quota/doc/shutdown.txt b/branches/marnberg/quota/doc/shutdown.txt deleted file mode 100644 index e5ccde3171004..0000000000000 --- a/branches/marnberg/quota/doc/shutdown.txt +++ /dev/null @@ -1,13 +0,0 @@ - -- mds0 triggers shutdown by sending a shutdown_start to all nodes. - -- from here on out, all client requests are discarded (unless they are a file close?) - -- each mds checks for outstanding inter-mds transations. e.g imports, discoveries, etc. once they're all done, send a shutdown_ready to mds0 - -- each mds successively disassembles its cache, flushing data to long-term storage, and sending inodeexpires, exporting imported dirs to parent (after they're clean + empty) - -- when the cache is empty, send shutdown_done to mds0 and exit. - -- mds0 exits when all mdss have finished. - diff --git a/branches/marnberg/quota/ebofs/Allocator.cc b/branches/marnberg/quota/ebofs/Allocator.cc deleted file mode 100644 index 805957f779a11..0000000000000 --- a/branches/marnberg/quota/ebofs/Allocator.cc +++ /dev/null @@ -1,692 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "Allocator.h" -#include "Ebofs.h" - - -#undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs(" << fs->dev.get_device_name() << ").allocator." - - -void Allocator::dump_freelist() -{ - if (1) { - interval_set free; // validate too - - block_t n = 0; - for (int b=0; b<=EBOFS_NUM_FREE_BUCKETS; b++) { - Table *tab; - if (b < EBOFS_NUM_FREE_BUCKETS) { - tab = fs->free_tab[b]; - dout(0) << "dump bucket " << b << " " << tab->get_num_keys() << endl; - } else { - tab = fs->limbo_tab; - dout(0) << "dump limbo " << tab->get_num_keys() << endl;; - } - - if (tab->get_num_keys() > 0) { - Table::Cursor cursor(tab); - assert(tab->find(0, cursor) >= 0); - while (1) { - dout(0) << "dump ex " << cursor.current().key << "~" << cursor.current().value << endl; - assert(cursor.current().value > 0); - - if (b < EBOFS_NUM_FREE_BUCKETS) - n += cursor.current().value; - - if (free.contains( cursor.current().key, cursor.current().value )) - dout(0) << "dump bad " << cursor.current().key << "~" << cursor.current().value << endl; - assert(!free.contains( cursor.current().key, cursor.current().value )); - free.insert( cursor.current().key, cursor.current().value ); - if (cursor.move_right() <= 0) break; - } - } else { - //cout << " empty" << endl; - } - } - - assert(n == fs->free_blocks); - dout(0) << "dump combined freelist is " << free << endl; - - - // alloc_tab - if (fs->alloc_tab->get_num_keys() > 0) { - Table >::Cursor cursor(fs->alloc_tab); - assert(fs->alloc_tab->find(0, cursor) >= 0); - while (1) { - dout(0) << "alloc ex " << cursor.current().key << "~" << cursor.current().value.first << " ref " - << cursor.current().value.second - << endl; - assert(cursor.current().value.first > 0); - - if (cursor.move_right() <= 0) break; - } - } - } -} - - -int Allocator::find(Extent& ex, int bucket, block_t num, block_t near, int dir) -{ - Table::Cursor cursor(fs->free_tab[bucket]); - bool found = false; - - if ((dir == DIR_ANY || dir == DIR_FWD) && - 
fs->free_tab[bucket]->find( near, cursor ) >= 0) { - // look to the right - do { - if (cursor.current().value >= num) - found = true; - } while (!found && cursor.move_right() > 0); - } - - if ((dir == DIR_ANY || dir == DIR_BACK) && - !found) { - // look to the left - fs->free_tab[bucket]->find( near, cursor ); - - while (!found && cursor.move_left() >= 0) - if (cursor.current().value >= num) - found = true; - } - - if (found) { - ex.start = cursor.current().key; - ex.length = cursor.current().value; - return 0; - } - - return -1; -} - -int Allocator::allocate(Extent& ex, block_t num, block_t near) -{ - //dump_freelist(); - - int dir = DIR_ANY; // no dir - if (near == NEAR_LAST_FWD) { - near = last_pos; - dir = DIR_FWD; // fwd - } - else if (near == NEAR_LAST) - near = last_pos; - - int bucket; - - while (1) { // try twice, if fwd = true - - // look for contiguous extent - for (bucket = pick_bucket(num); bucket < EBOFS_NUM_FREE_BUCKETS; bucket++) { - if (find(ex, bucket, num, near, dir) >= 0) { - // yay! - - // remove original - fs->free_tab[bucket]->remove( ex.start ); - fs->free_blocks -= ex.length; - - if (ex.length > num) { - if (ex.start < near) { - // to the left - if (ex.start + ex.length - num <= near) { - // by a lot. take right-most portion. - Extent left; - left.start = ex.start; - left.length = ex.length - num; - ex.start += left.length; - ex.length -= left.length; - assert(ex.length == num); - _release_loner(left); - } else { - // take middle part. - Extent left,right; - left.start = ex.start; - left.length = near - ex.start; - ex.start = near; - right.start = ex.start + num; - right.length = ex.length - left.length - num; - ex.length = num; - _release_loner(left); - _release_loner(right); - } - } - else { - // to the right. take left-most part. 
- Extent right; - right.start = ex.start + num; - right.length = ex.length - num; - ex.length = num; - _release_loner(right); - } - } - - dout(20) << "allocate " << ex << " near " << near << endl; - last_pos = ex.end(); - //dump_freelist(); - if (g_conf.ebofs_cloneable) - alloc_inc(ex); - return num; - } - } - - if (dir == DIR_BACK || dir == DIR_ANY) break; - dir = DIR_BACK; - } - - // ok, find partial extent instead. - for (block_t trysize = num/2; trysize >= 1; trysize /= 2) { - int bucket = pick_bucket(trysize); - if (find(ex, bucket, trysize, near) >= 0) { - // yay! - assert(ex.length < num); - - fs->free_tab[bucket]->remove(ex.start); - fs->free_blocks -= ex.length; - last_pos = ex.end(); - dout(20) << "allocate partial " << ex << " (wanted " << num << ") near " << near << endl; - //dump_freelist(); - if (g_conf.ebofs_cloneable) - alloc_inc(ex); - return ex.length; - } - } - - dout(1) << "allocate failed, fs completely full! " << fs->free_blocks << endl; - assert(0); - //dump_freelist(); - return -1; -} - -int Allocator::_release_into_limbo(Extent& ex) -{ - dout(10) << "_release_into_limbo " << ex << endl; - dout(10) << "limbo is " << limbo << endl; - assert(ex.length > 0); - limbo.insert(ex.start, ex.length); - fs->limbo_blocks += ex.length; - return 0; -} - -int Allocator::release(Extent& ex) -{ - if (g_conf.ebofs_cloneable) - return alloc_dec(ex); - - _release_into_limbo(ex); - return 0; -} - -int Allocator::commit_limbo() -{ - dout(20) << "commit_limbo" << endl; - for (map::iterator i = limbo.m.begin(); - i != limbo.m.end(); - i++) { - fs->limbo_tab->insert(i->first, i->second); - //fs->free_blocks += i->second; - } - limbo.clear(); - //fs->limbo_blocks = 0; - //dump_freelist(); - return 0; -} - -int Allocator::release_limbo() -{ - //dump_freelist(); - if (fs->limbo_tab->get_num_keys() > 0) { - Table::Cursor cursor(fs->limbo_tab); - fs->limbo_tab->find(0, cursor); - while (1) { - Extent ex(cursor.current().key, cursor.current().value); - dout(20) << 
"release_limbo ex " << ex << endl; - - fs->limbo_blocks -= ex.length; - _release_merge(ex); - - if (cursor.move_right() <= 0) break; - } - } - fs->limbo_tab->clear(); - //dump_freelist(); - return 0; -} - - - -/* -int Allocator::_alloc_loner_inc(Extent& ex) -{ - Table >::Cursor cursor(fs->alloc_tab); - - if (fs->alloc_tab->find( ex.start, cursor ) - == Table >::Cursor::MATCH) { - assert(cursor.current().value.first == ex.length); - pair& v = cursor.dirty_current_value(); - v.second++; - dout(10) << "_alloc_loner_inc " << ex << " " - << (v.second-1) << " -> " << v.second - << endl; - } else { - // insert it, @1 - fs->alloc_tab->insert(ex.start, pair(ex.length,1)); - dout(10) << "_alloc_loner_inc " << ex << " 0 -> 1" << endl; - } - return 0; -} - -int Allocator::_alloc_loner_dec(Extent& ex) -{ - Table >::Cursor cursor(fs->alloc_tab); - - if (fs->alloc_tab->find( ex.start, cursor ) - == Table >::Cursor::MATCH) { - assert(cursor.current().value.first == ex.length); - if (cursor.current().value.second == 1) { - dout(10) << "_alloc_loner_dec " << ex << " 1 -> 0" << endl; - fs->alloc_tab->remove( cursor.current().key ); - } else { - pair& v = cursor.dirty_current_value(); - --v.second; - dout(10) << "_alloc_loner_dec " << ex << " " - << (v.second+1) << " -> " << v.second - << endl; - } - } else { - assert(0); - } - return 0; -} -*/ - - -int Allocator::alloc_inc(Extent ex) -{ - dout(10) << "alloc_inc " << ex << endl; - - // empty table? - if (fs->alloc_tab->get_num_keys() == 0) { - // easy. 
- fs->alloc_tab->insert(ex.start, pair(ex.length,1)); - dout(10) << "alloc_inc + " << ex << " 0 -> 1 (first entry)" << endl; - return 0; - } - - Table >::Cursor cursor(fs->alloc_tab); - - // try to move to left (to check for overlap) - int r = fs->alloc_tab->find( ex.start, cursor ); - if (r == Table >::Cursor::OOB || - cursor.current().key > ex.start) { - r = cursor.move_left(); - dout(10) << "alloc_inc move_left r = " << r << endl; - } - - while (1) { - dout(10) << "alloc_inc loop at " << cursor.current().key - << "~" << cursor.current().value.first - << " ref " << cursor.current().value.second - << endl; - - // too far left? - if (cursor.current().key < ex.start && - cursor.current().key + cursor.current().value.first <= ex.start) { - // adjacent? - bool adjacent = false; - if (cursor.current().key + cursor.current().value.first == ex.start && - cursor.current().value.second == 1) - adjacent = true; - - // no overlap. - r = cursor.move_right(); - dout(10) << "alloc_inc move_right r = " << r << endl; - - // at end? - if (r <= 0) { - // hmm! - if (adjacent) { - // adjust previous entry - cursor.move_left(); - pair &v = cursor.dirty_current_value(); - v.first += ex.length; // yay! - dout(10) << "alloc_inc + " << ex << " 0 -> 1 (adjust at end)" << endl; - } else { - // insert at end, finish. - int r = fs->alloc_tab->insert(ex.start, pair(ex.length,1)); - dout(10) << "alloc_inc + " << ex << " 0 -> 1 (at end) .. r = " << r << endl; - //dump_freelist(); - } - return 0; - } - } - - if (cursor.current().key > ex.start) { - // gap. - // oooooo - // nnnnn..... 
- block_t l = MIN(ex.length, cursor.current().key - ex.start); - - fs->alloc_tab->insert(ex.start, pair(l,1)); - dout(10) << "alloc_inc + " << ex.start << "~" << l << " 0 -> 1" << endl; - ex.start += l; - ex.length -= l; - if (ex.length == 0) break; - fs->alloc_tab->find( ex.start, cursor ); - } - else if (cursor.current().key < ex.start) { - block_t end = cursor.current().value.first + cursor.current().key; - - if (end <= ex.end()) { - // single split - // oooooo - // nnnnn - pair& v = cursor.dirty_current_value(); - v.first = ex.start - cursor.current().key; - int ref = v.second; - - block_t l = end - ex.start; - fs->alloc_tab->insert(ex.start, pair(l, 1+ref)); - - dout(10) << "alloc_inc " << ex.start << "~" << l - << " " << ref << " -> " << ref+1 - << " (right split)" << endl; - - ex.start += l; - ex.length -= l; - if (ex.length == 0) break; - fs->alloc_tab->find( ex.start, cursor ); - - } else { - // double split, finish. - // ------------- - // ------ - pair& v = cursor.dirty_current_value(); - v.first = ex.start - cursor.current().key; - int ref = v.second; - - fs->alloc_tab->insert(ex.start, pair(ex.length, 1+ref)); - - int rl = end - ex.end(); - fs->alloc_tab->insert(ex.end(), pair(rl, ref)); - - dout(10) << "alloc_inc " << ex - << " " << ref << " -> " << ref+1 - << " (double split finish)" - << endl; - - break; - } - } - else { - assert(cursor.current().key == ex.start); - - if (cursor.current().value.first <= ex.length) { - // inc. - // oooooo - // nnnnnnnn - pair& v = cursor.dirty_current_value(); - v.second++; - dout(10) << "alloc_inc " << ex.start << "~" << cursor.current().value.first - << " " << cursor.current().value.second-1 << " -> " - << cursor.current().value.second - << " (left split)" << endl; - ex.start += v.first; - ex.length -= v.first; - if (ex.length == 0) break; - cursor.move_right(); - } else { - // single split, finish. 
- // oooooo - // nnn - block_t l = cursor.current().value.first - ex.length; - int ref = cursor.current().value.second; - - pair& v = cursor.dirty_current_value(); - v.first = ex.length; - v.second++; - - fs->alloc_tab->insert(ex.end(), pair(l, ref)); - - dout(10) << "alloc_inc " << ex - << " " << ref << " -> " << ref+1 - << " (left split finish)" - << endl; - - break; - } - } - } - - return 0; -} - - -int Allocator::alloc_dec(Extent ex) -{ - dout(10) << "alloc_dec " << ex << endl; - - assert(fs->alloc_tab->get_num_keys() >= 0); - - Table >::Cursor cursor(fs->alloc_tab); - - // try to move to left (to check for overlap) - int r = fs->alloc_tab->find( ex.start, cursor ); - dout(10) << "alloc_dec find r = " << r << endl; - - if (r == Table >::Cursor::OOB || - cursor.current().key > ex.start) { - r = cursor.move_left(); - dout(10) << "alloc_dec move_left r = " << r << endl; - - // too far left? - if (cursor.current().key < ex.start && - cursor.current().key + cursor.current().value.first <= ex.start) { - // no overlap. - dump_freelist(); - assert(0); - } - } - - while (1) { - dout(10) << "alloc_dec ? " << cursor.current().key - << "~" << cursor.current().value.first - << " " << cursor.current().value.second - << ", ex is " << ex - << endl; - - assert(cursor.current().key <= ex.start); // no gap allowed. - - if (cursor.current().key < ex.start) { - block_t end = cursor.current().value.first + cursor.current().key; - - if (end <= ex.end()) { - // single split - // oooooo - // ----- - pair& v = cursor.dirty_current_value(); - v.first = ex.start - cursor.current().key; - int ref = v.second; - dout(10) << "alloc_dec s " << cursor.current().key << "~" << cursor.current().value.first - << " " << ref - << " shortened left bit of single" << endl; - - block_t l = end - ex.start; - if (ref > 1) { - fs->alloc_tab->insert(ex.start, pair(l, ref-1)); - dout(10) << "alloc_dec . 
" << ex.start << "~" << l - << " " << ref << " -> " << ref-1 - << endl; - } else { - Extent r(ex.start, l); - _release_into_limbo(r); - } - - ex.start += l; - ex.length -= l; - if (ex.length == 0) break; - fs->alloc_tab->find( ex.start, cursor ); - - } else { - // double split, finish. - // ooooooooooooo - // ------ - pair& v = cursor.dirty_current_value(); - v.first = ex.start - cursor.current().key; - int ref = v.second; - dout(10) << "alloc_dec s " << cursor.current().key << "~" << cursor.current().value.first - << " " << ref - << " shorted left bit of double split" << endl; - - if (ref > 1) { - fs->alloc_tab->insert(ex.start, pair(ex.length, ref-1)); - dout(10) << "alloc_inc s " << ex - << " " << ref << " -> " << ref-1 - << " reinserted middle bit of double split" - << endl; - } else { - _release_into_limbo(ex); - } - - int rl = end - ex.end(); - fs->alloc_tab->insert(ex.end(), pair(rl, ref)); - dout(10) << "alloc_dec s " << ex.end() << "~" << rl - << " " << ref - << " reinserted right bit of double split" << endl; - break; - } - } - else { - assert(cursor.current().key == ex.start); - - if (cursor.current().value.first <= ex.length) { - // inc. - // oooooo - // nnnnnnnn - if (cursor.current().value.second > 1) { - pair& v = cursor.dirty_current_value(); - v.second--; - dout(10) << "alloc_dec s " << ex.start << "~" << cursor.current().value.first - << " " << cursor.current().value.second+1 << " -> " << cursor.current().value.second - << endl; - ex.start += v.first; - ex.length -= v.first; - if (ex.length == 0) break; - cursor.move_right(); - } else { - Extent r(cursor.current().key, cursor.current().value.first); - _release_into_limbo(r); - - ex.start += cursor.current().value.first; - ex.length -= cursor.current().value.first; - cursor.remove(); - - if (ex.length == 0) break; - fs->alloc_tab->find( ex.start, cursor ); - } - } else { - // single split, finish. 
- // oooooo - // nnn - block_t l = cursor.current().value.first - ex.length; - int ref = cursor.current().value.second; - - if (ref > 1) { - pair& v = cursor.dirty_current_value(); - v.first = ex.length; - v.second--; - dout(10) << "alloc_inc . " << ex - << " " << ref << " -> " << ref-1 - << endl; - } else { - _release_into_limbo(ex); - cursor.remove(); - } - - dout(10) << "alloc_dec s " << ex.end() << "~" << l - << " " << ref - << " reinserted right bit of single split" << endl; - fs->alloc_tab->insert(ex.end(), pair(l, ref)); - break; - } - } - - - } - - return 0; -} - - -/* - * release extent into freelist - * WARNING: *ONLY* use this if you _know_ there are no adjacent free extents - */ -int Allocator::_release_loner(Extent& ex) -{ - assert(ex.length > 0); - int b = pick_bucket(ex.length); - fs->free_tab[b]->insert(ex.start, ex.length); - fs->free_blocks += ex.length; - return 0; -} - -/* - * release extent into freelist - * look for any adjacent extents and merge with them! - */ -int Allocator::_release_merge(Extent& orig) -{ - dout(15) << "_release_merge " << orig << endl; - assert(orig.length > 0); - - Extent newex = orig; - - // one after us? - for (int b=0; b::Cursor cursor(fs->free_tab[b]); - - if (fs->free_tab[b]->find( newex.start+newex.length, cursor ) - == Table::Cursor::MATCH) { - // add following extent to ours - newex.length += cursor.current().value; - - // remove it - fs->free_blocks -= cursor.current().value; - fs->free_tab[b]->remove( cursor.current().key ); - break; - } - } - - // one before us? 
- for (int b=0; b::Cursor cursor(fs->free_tab[b]); - fs->free_tab[b]->find( newex.start+newex.length, cursor ); - if (cursor.move_left() >= 0 && - (cursor.current().key + cursor.current().value == newex.start)) { - // merge - newex.start = cursor.current().key; - newex.length += cursor.current().value; - - // remove it - fs->free_blocks -= cursor.current().value; - fs->free_tab[b]->remove( cursor.current().key ); - break; - } - } - - // ok, insert newex - _release_loner(newex); - return 0; -} diff --git a/branches/marnberg/quota/ebofs/Allocator.h b/branches/marnberg/quota/ebofs/Allocator.h deleted file mode 100644 index c53ff2a69fba1..0000000000000 --- a/branches/marnberg/quota/ebofs/Allocator.h +++ /dev/null @@ -1,85 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __EBOFS_ALLOCATOR_H -#define __EBOFS_ALLOCATOR_H - -#include "types.h" - -#include "include/interval_set.h" - -class Ebofs; - -class Allocator { -public: - const static block_t NEAR_LAST = 0; - const static block_t NEAR_LAST_FWD = 1; - - const static int DIR_ANY = 0; - const static int DIR_FWD = 2; - const static int DIR_BACK = 1; - -protected: - Ebofs *fs; - block_t last_pos; - - - interval_set limbo; - - static int pick_bucket(block_t num) { - int b = 0; - while (num > 1) { - b++; - num = num >> EBOFS_FREE_BUCKET_BITS; - } - if (b >= EBOFS_NUM_FREE_BUCKETS) - b = EBOFS_NUM_FREE_BUCKETS-1; - return b; - } - - int find(Extent& ex, int bucket, block_t num, block_t near, int dir = DIR_ANY); - - void dump_freelist(); - - public: - int _release_into_limbo(Extent& ex); - - int _release_loner(Extent& ex); // release loner extent - int _release_merge(Extent& ex); // release any extent (searches for adjacent) - - //int _alloc_loner_inc(Extent& ex); - //int _alloc_loner_dec(Extent& ex); - - - public: - Allocator(Ebofs *f) : fs(f), last_pos(0) {} - - int allocate(Extent& ex, block_t num, block_t near=NEAR_LAST); - int release(Extent& ex); // alias for alloc_dec - - int alloc_inc(Extent ex); - int alloc_dec(Extent ex); - - - /*int unallocate(Extent& ex) { // skip limbo - return _release_merge(ex); - } - */ - - int commit_limbo(); // limbo -> fs->limbo_tab - int release_limbo(); // fs->limbo_tab -> free_tabs - -}; - -#endif diff --git a/branches/marnberg/quota/ebofs/BlockDevice.cc b/branches/marnberg/quota/ebofs/BlockDevice.cc deleted file mode 100644 index 7044e4ca38f27..0000000000000 --- a/branches/marnberg/quota/ebofs/BlockDevice.cc +++ /dev/null @@ -1,777 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * 
License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "BlockDevice.h" - -#include "config.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -#ifndef __CYGWIN__ -#ifndef DARWIN -#include -#endif -#endif - - -/******************************************* - * biovec - */ - -inline ostream& operator<<(ostream& out, BlockDevice::biovec &bio) -{ - out << "bio("; - if (bio.type == BlockDevice::biovec::IO_READ) out << "rd "; - if (bio.type == BlockDevice::biovec::IO_WRITE) out << "wr "; - out << bio.start << "~" << bio.length; - if (bio.note) out << " " << bio.note; - out << " " << &bio; - out << ")"; - return out; -} - - - -/******************************************* - * ElevatorQueue - */ - -#undef dout -#define dout(x) if (x <= g_conf.debug_bdev) cout << "bdev(" << dev << ").elevatorq." -#define derr(x) if (x <= g_conf.debug_bdev) cerr << "bdev(" << dev << ").elevatorq." - - -int BlockDevice::ElevatorQueue::dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& block_lock) -{ - // queue empty? - assert(!io_map.empty()); - - dout(20) << "dequeue_io el_pos " << el_pos << " dir " << el_dir_forward << endl; - - // find our position: i >= pos - map::iterator i; - - int tries = g_conf.bdev_el_bidir + 1; - while (tries > 0) { - if (el_dir_forward) { - i = io_map.lower_bound(el_pos); - if (i != io_map.end()) { - break; // not at end. good. - } - } else { - i = io_map.upper_bound(el_pos); - if (i != io_map.begin()) { - i--; // and back down one (to get i <= pos). good. - break; - } - } - - // reverse (or initial startup)? 
- if (g_conf.bdev_el_bidir || !el_dir_forward) { - // dout(20) << "restart reversing" << endl; - el_dir_forward = !el_dir_forward; - } - - if (el_dir_forward) { - // forward - el_pos = 0; - - if (g_conf.bdev_el_fw_max_ms) { - el_stop = g_clock.now(); - utime_t max(0, 1000*g_conf.bdev_el_fw_max_ms); // (s,us), convert ms -> us! - el_stop += max; - // dout(20) << "restart forward sweep for " << max << endl; - } else { - // dout(20) << "restart fowrard sweep" << endl; - } - } else { - // reverse - el_pos = bdev->get_num_blocks(); - - if (g_conf.bdev_el_bw_max_ms) { - el_stop = g_clock.now(); - utime_t max(0, 1000*g_conf.bdev_el_bw_max_ms); // (s,us), convert ms -> us! - el_stop += max; - // dout(20) << "restart reverse sweep for " << max << endl; - } else { - // dout(20) << "restart reverse sweep" << endl; - } - } - - tries--; - } - - assert(tries > 0); // this shouldn't happen if the queue is non-empty. - - // get some biovecs - int num_bio = 0; - - dout(20) << "dequeue_io starting with " << i->first << " " << *i->second << endl; - - // merge contiguous ops - char type = i->second->type; // read or write - int num_iovs = 0; // count eventual iov's for readv/writev - - start = i->first; - length = 0; - - if (el_dir_forward) - el_pos = start; - else - el_pos = i->first + i->second->length; - - // while (contiguous) - while ((( el_dir_forward && el_pos == i->first) || - (!el_dir_forward && el_pos == i->first + i->second->length)) && - type == i->second->type) { - biovec *bio = i->second; - - // allowed? (not already submitted to kernel?) - if (block_lock.intersects(bio->start, bio->length)) { - // dout(20) << "dequeue_io " << bio->start << "~" << bio->length - // << " intersects block_lock " << block_lock << endl; - break; // stop, or go with what we've got so far - } - - // add to biols - int nv = bio->bl.buffers().size(); // how many iov's in this bio's bufferlist? - if (num_iovs + nv >= g_conf.bdev_iov_max) break; // too many! 
- num_iovs += nv; - - start = MIN(start, bio->start); - length += bio->length; - - if (el_dir_forward) { - //dout(20) << "dequeue_io fw dequeue io at " << el_pos << " " << *i->second << endl; - biols.push_back(bio); // add at back - } else { - // dout(20) << "dequeue_io bw dequeue io at " << el_pos << " " << *i->second << endl; - biols.push_front(bio); // add at front - } - num_bio++; - - // move elevator pointer - bool at_end = false; - map::iterator prev = i; - if (el_dir_forward) { - el_pos += bio->length; // cont. next would start right after us - i++; - if (i == io_map.end()) { - at_end = true; - } - } else { - el_pos -= bio->length; - if (i == io_map.begin()) { - at_end = true; - } else { - i--; - } - } - - // dequeue - io_map.erase(prev); - bio->in_queue = 0; - - if (at_end) break; - } - - return num_bio; -} - - - -/******************************************* - * BarrierQueue - */ -#undef dout -#define dout(x) if (x <= g_conf.debug_bdev) cout << "bdev(" << dev << ").barrierq." - -void BlockDevice::BarrierQueue::barrier() -{ - if (!qls.empty() && qls.front()->empty()) { - assert(qls.size() == 1); - dout(10) << "barrier not adding new queue, front is empty" << endl; - } else { - qls.push_back(new ElevatorQueue(bdev, dev)); - dout(10) << "barrier adding new elevator queue (now " << qls.size() << "), front queue has " - << qls.front()->size() << " ios left" << endl; - } -} - -bool BlockDevice::BarrierQueue::bump() -{ - assert(!qls.empty()); - - // is the front queue empty? 
- if (qls.front()->empty() && - qls.front() != qls.back()) { - delete qls.front(); - qls.pop_front(); - dout(10) << "dequeue_io front empty, moving to next queue (" << qls.front()->size() << ")" << endl; - return true; - } - - return false; -} - -int BlockDevice::BarrierQueue::dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& locked) -{ - assert(!qls.empty()); - int n = qls.front()->dequeue_io(biols, start, length, locked); - bump(); // in case we emptied the front queue - return n; -} - - - - -/******************************************* - * BlockDevice - */ - -#undef dout -#define dout(x) if (x <= g_conf.debug_bdev) cout << "bdev(" << dev << ")." - - - -block_t BlockDevice::get_num_blocks() -{ - if (!num_blocks) { - assert(fd > 0); - -#ifdef BLKGETSIZE64 - // ioctl block device? - ioctl(fd, BLKGETSIZE64, &num_blocks); -#endif - - if (!num_blocks) { - // hmm, try stat! - struct stat st; - fstat(fd, &st); - num_blocks = st.st_size; - } - - num_blocks /= (__uint64_t)EBOFS_BLOCK_SIZE; - - if (g_conf.bdev_fake_mb) { - num_blocks = g_conf.bdev_fake_mb * 256; - dout(0) << "faking dev size " << g_conf.bdev_fake_mb << " mb" << endl; - } - if (g_conf.bdev_fake_max_mb && - num_blocks > (block_t)g_conf.bdev_fake_max_mb * 256ULL) { - dout(0) << "faking dev size " << g_conf.bdev_fake_max_mb << " mb" << endl; - num_blocks = g_conf.bdev_fake_max_mb * 256; - } - - } - return num_blocks; -} - - - -/** io thread - * each worker thread dequeues ios from the root_queue and submits them to the kernel. - */ -void* BlockDevice::io_thread_entry() -{ - lock.Lock(); - - int whoami = io_threads_started++; - io_threads_running++; - assert(io_threads_running <= g_conf.bdev_iothreads); - dout(10) << "io_thread" << whoami << " start, " << io_threads_running << " now running" << endl; - - // get my own fd (and file position pointer) - int fd = open_fd(); - assert(fd > 0); - - while (!io_stop) { - bool do_sleep = false; - - // queue empty? 
- if (root_queue.empty()) { - // sleep - do_sleep = true; - } else { - dout(20) << "io_thread" << whoami << " going" << endl; - - block_t start, length; - list biols; - int n = root_queue.dequeue_io(biols, start, length, io_block_lock); - - if (n == 0) { - // failed to dequeue a do-able op, sleep for now - dout(20) << "io_thread" << whoami << " couldn't dequeue doable op, sleeping" << endl; - assert(io_threads_running > 1); // there must be someone else, if we couldn't dequeue something doable. - do_sleep = true; - } - else { - // lock blocks - assert(start == biols.front()->start); - io_block_lock.insert(start, length); - - // drop lock to do the io - lock.Unlock(); - do_io(fd, biols); - lock.Lock(); - - // unlock blocks - io_block_lock.erase(start, length); - - // someone might have blocked on our block_lock? - if (io_threads_running < g_conf.bdev_iothreads && - (int)root_queue.size() > io_threads_running) - io_wakeup.SignalAll(); - } - } - - if (do_sleep) { - do_sleep = false; - - // sleep - io_threads_running--; - dout(20) << "io_thread" << whoami << " sleeping, " << io_threads_running << " threads now running," - << " queue has " << root_queue.size() << endl; - - if (g_conf.bdev_idle_kick_after_ms > 0 && - io_threads_running == 0 && - idle_kicker) { - // first wait for signal | timeout - io_wakeup.WaitInterval(lock, utime_t(0, g_conf.bdev_idle_kick_after_ms*1000)); - - // should we still be sleeping? (did we get woken up, or did timer expire? - if (root_queue.empty() && io_threads_running == 0) { - idle_kicker->kick(); // kick - io_wakeup.Wait(lock); // and wait - } - } else { - // normal, just wait. 
- io_wakeup.Wait(lock); - } - - io_threads_running++; - assert(io_threads_running <= g_conf.bdev_iothreads); - dout(20) << "io_thread" << whoami << " woke up, " << io_threads_running << " threads now running" << endl; - } - } - - // clean up - ::close(fd); - io_threads_running--; - - lock.Unlock(); - - dout(10) << "io_thread" << whoami << " finish" << endl; - return 0; -} - - - -/** do_io - * do a single io operation - * (lock is NOT held, but we own the *biovec) - */ -void BlockDevice::do_io(int fd, list& biols) -{ - int r; - assert(!biols.empty()); - - // get full range, type, bl - bufferlist bl; - bl.claim(biols.front()->bl); - block_t start = biols.front()->start; - block_t length = biols.front()->length; - char type = biols.front()->type; - - list::iterator p = biols.begin(); - int numbio = 1; - for (p++; p != biols.end(); p++) { - length += (*p)->length; - bl.claim_append((*p)->bl); - numbio++; - } - - // do it - dout(20) << "do_io start " << (type==biovec::IO_WRITE?"write":"read") - << " " << start << "~" << length - << " " << numbio << " bits" << endl; - if (type == biovec::IO_WRITE) { - r = _write(fd, start, length, bl); - } else if (type == biovec::IO_READ) { - r = _read(fd, start, length, bl); - } else assert(0); - dout(20) << "do_io finish " << (type==biovec::IO_WRITE?"write":"read") - << " " << start << "~" << length << endl; - - // set rval - for (p = biols.begin(); p != biols.end(); p++) - (*p)->rval = r; - - if (1) { - // put in completion queue - complete_lock.Lock(); - complete_queue.splice( complete_queue.end(), biols ); - complete_queue_len += numbio; - complete_wakeup.Signal(); - complete_lock.Unlock(); - } else { - // be slow and finish synchronously - for (p = biols.begin(); p != biols.end(); p++) - finish_io(*p); - } -} - - -/** finish_io - * - * finish an io by signaling the cond or performing a callback. - * called by completion thread, unless that's disabled above. 
- */ -void BlockDevice::finish_io(biovec *bio) -{ - bio->done = true; - if (bio->cond) { - bio->cond->Signal(); - } - else if (bio->cb) { - bio->cb->finish((ioh_t)bio, bio->rval); - delete bio->cb; - delete bio; - } -} - -/*** completion_thread - * handle Cond signals or callbacks for completed ios - */ -void* BlockDevice::complete_thread_entry() -{ - complete_lock.Lock(); - dout(10) << "complete_thread start" << endl; - - while (!io_stop) { - - while (!complete_queue.empty()) { - list ls; - ls.swap(complete_queue); - dout(10) << "complete_thread grabbed " << complete_queue_len << " biovecs" << endl; - complete_queue_len = 0; - - complete_lock.Unlock(); - - // finish - for (list::iterator p = ls.begin(); - p != ls.end(); - p++) { - biovec *bio = *p; - dout(20) << "complete_thread finishing " << *bio << endl; - finish_io(bio); - } - - complete_lock.Lock(); - } - if (io_stop) break; - - /* - if (io_threads_running == 0 && idle_kicker) { - complete_lock.Unlock(); - idle_kicker->kick(); - complete_lock.Lock(); - if (!complete_queue.empty() || io_stop) - continue; - } - */ - - dout(25) << "complete_thread sleeping" << endl; - complete_wakeup.Wait(complete_lock); - } - - dout(10) << "complete_thread finish" << endl; - complete_lock.Unlock(); - return 0; -} - - - - -// io queue - -void BlockDevice::_submit_io(biovec *b) -{ - // NOTE: lock must be held - dout(15) << "_submit_io " << *b << endl; - - // wake up io_thread(s)? - if ((int)root_queue.size() == io_threads_running) - io_wakeup.SignalOne(); - else if ((int)root_queue.size() > io_threads_running) - io_wakeup.SignalAll(); - - // queue - root_queue.submit_io(b); - - /* - // [DEBUG] check for overlapping ios - // BUG: this doesn't detect all overlaps w/ the next queue thing. - if (g_conf.bdev_debug_check_io_overlap) { - // BUG: this doesn't catch everything! eg 1~10000000 will be missed.... 
- multimap::iterator p = io_queue.lower_bound(b->start); - if ((p != io_queue.end() && - p->first < b->start+b->length) || - (p != io_queue.begin() && - (p--, p->second->start + p->second->length > b->start))) { - dout(1) << "_submit_io new io " << *b - << " overlaps with existing " << *p->second << endl; - cerr << "_submit_io new io " << *b - << " overlaps with existing " << *p->second << endl; - } - } - */ - -} - -int BlockDevice::_cancel_io(biovec *bio) -{ - // NOTE: lock must be held - - if (bio->in_queue == 0) { - dout(15) << "_cancel_io " << *bio << " FAILED" << endl; - return -1; - } else { - dout(15) << "_cancel_io " << *bio << endl; - bio->in_queue->cancel_io(bio); - if (root_queue.bump()) - io_wakeup.SignalAll(); // something happened! - return 0; - } -} - - - -// low level io - -int BlockDevice::_read(int fd, block_t bno, unsigned num, bufferlist& bl) -{ - dout(10) << "_read " << bno << "~" << num << endl; - - assert(fd > 0); - - off_t offset = bno * EBOFS_BLOCK_SIZE; - off_t actual = lseek(fd, offset, SEEK_SET); - assert(actual == offset); - - size_t len = num*EBOFS_BLOCK_SIZE; - assert(bl.length() >= len); - - struct iovec iov[ bl.buffers().size() ]; - int n = 0; - size_t left = len; - for (list::const_iterator i = bl.buffers().begin(); - i != bl.buffers().end(); - i++) { - assert(i->length() % EBOFS_BLOCK_SIZE == 0); - - iov[n].iov_base = (void*)i->c_str(); - iov[n].iov_len = MIN(left, i->length()); - - left -= iov[n].iov_len; - n++; - if (left == 0) break; - } - - int got = ::readv(fd, iov, n); - assert(got <= (int)len); - - return 0; -} - -int BlockDevice::_write(int fd, unsigned bno, unsigned num, bufferlist& bl) -{ - dout(10) << "_write " << bno << "~" << num << endl; - - assert(fd > 0); - - off_t offset = (off_t)bno << EBOFS_BLOCK_BITS; - assert((off_t)bno * (off_t)EBOFS_BLOCK_SIZE == offset); - off_t actual = lseek(fd, offset, SEEK_SET); - assert(actual == offset); - - // write buffers - size_t len = num*EBOFS_BLOCK_SIZE; - - struct iovec iov[ 
bl.buffers().size() ]; - - int n = 0; - size_t left = len; - for (list::const_iterator i = bl.buffers().begin(); - i != bl.buffers().end(); - i++) { - assert(i->length() % EBOFS_BLOCK_SIZE == 0); - - iov[n].iov_base = (void*)i->c_str(); - iov[n].iov_len = MIN(left, i->length()); - - assert((((unsigned long long)iov[n].iov_base) & 4095ULL) == 0); - assert((iov[n].iov_len & 4095) == 0); - - left -= iov[n].iov_len; - n++; - if (left == 0) break; - } - - int r = ::writev(fd, iov, n); - - if (r < 0) { - dout(1) << "couldn't write bno " << bno << " num " << num - << " (" << len << " bytes) in " << n << " iovs, r=" << r - << " errno " << errno << " " << strerror(errno) << endl; - dout(1) << "bl is " << bl << endl; - assert(0); - } else { - assert(r == (int)len); - } - - return 0; -} - - - -// open/close - -int BlockDevice::open_fd() -{ -#ifdef DARWIN - int fd = ::open(dev.c_str(), O_RDWR|O_SYNC, 0); - ::fcntl(fd, F_NOCACHE); - return fd; -#else - return ::open(dev.c_str(), O_RDWR|O_SYNC|O_DIRECT, 0); -#endif -} - -int BlockDevice::open(kicker *idle) -{ - assert(fd == 0); - - // open? - fd = open_fd(); - if (fd < 0) { - dout(1) << "open failed, r = " << fd << " " << strerror(errno) << endl; - fd = 0; - return -1; - } - - // lock - if (g_conf.bdev_lock) { - int r = ::flock(fd, LOCK_EX|LOCK_NB); - if (r < 0) { - derr(1) << "open " << dev << " failed to get LOCK_EX" << endl; - assert(0); - return -1; - } - } - - // figure size - __uint64_t bsize = get_num_blocks(); - - dout(2) << "open " << bsize << " bytes, " << num_blocks << " blocks" << endl; - - // start thread - io_threads_started = 0; - io_threads.clear(); - for (int i=0; icreate(); - } - complete_thread.create(); - - // idle kicker? 
- idle_kicker = idle; - - return fd; -} - - -int BlockDevice::close() -{ - assert(fd>0); - - idle_kicker = 0; - - // shut down io thread - dout(10) << "close stopping io+complete threads" << endl; - lock.Lock(); - complete_lock.Lock(); - io_stop = true; - io_wakeup.SignalAll(); - complete_wakeup.SignalAll(); - complete_lock.Unlock(); - lock.Unlock(); - - - for (int i=0; ijoin(); - delete io_threads[i]; - } - io_threads.clear(); - - complete_thread.join(); - - io_stop = false; // in case we start again - - dout(2) << "close " << endl; - - if (g_conf.bdev_lock) - ::flock(fd, LOCK_UN); - - ::close(fd); - fd = 0; - - return 0; -} - -int BlockDevice::cancel_io(ioh_t ioh) -{ - biovec *pbio = (biovec*)ioh; - - lock.Lock(); - int r = _cancel_io(pbio); - lock.Unlock(); - - // FIXME? - if (r == 0 && pbio->cb) { - //pbio->cb->finish(ioh, 0); - delete pbio->cb; - delete pbio; - } - - return r; -} - diff --git a/branches/marnberg/quota/ebofs/BlockDevice.h b/branches/marnberg/quota/ebofs/BlockDevice.h deleted file mode 100644 index 18f639f7176b6..0000000000000 --- a/branches/marnberg/quota/ebofs/BlockDevice.h +++ /dev/null @@ -1,338 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_BLOCKDEVICE_H -#define __EBOFS_BLOCKDEVICE_H - -#include "include/buffer.h" -#include "include/interval_set.h" -#include "include/Context.h" -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/Thread.h" - -#include "types.h" - - -typedef void *ioh_t; // opaque handle to an io request. 
(in actuality, a biovec*) - - -class BlockDevice { - public: - // callback type for io completion notification - class callback { - public: - virtual ~callback() {} - virtual void finish(ioh_t ioh, int rval) = 0; - }; - - // kicker for idle notification - class kicker { - public: - virtual ~kicker() {} - virtual void kick() = 0; - }; - - - /********************************************************/ - - class Queue; - - // io item - // two variants: one with Cond*, one with callback*. - class biovec { - public: - static const char IO_WRITE = 1; - static const char IO_READ = 2; - - char type; - block_t start, length; - bufferlist bl; - callback *cb; - Cond *cond; - int rval; - char *note; - bool done; - - Queue *in_queue; - - biovec(char t, block_t s, block_t l, bufferlist& b, callback *c, char *n=0) : - type(t), start(s), length(l), bl(b), cb(c), cond(0), rval(0), note(n), done(false), in_queue(0) {} - biovec(char t, block_t s, block_t l, bufferlist& b, Cond *c, char *n=0) : - type(t), start(s), length(l), bl(b), cb(0), cond(c), rval(0), note(n), done(false), in_queue(0) {} - }; - friend ostream& operator<<(ostream& out, biovec &bio); - - - /********************************************************/ - - /* - * Queue -- abstract IO queue interface - */ - class Queue { - public: - virtual ~Queue() {} - virtual void submit_io(biovec *b) = 0; - virtual void cancel_io(biovec *b) = 0; - virtual int dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& locked) = 0; - virtual int size() = 0; - virtual bool empty() { return size() == 0; } - }; - - /* - * ElevatorQueue - simple elevator scheduler queue - */ - class ElevatorQueue : public Queue { - BlockDevice *bdev; - const char *dev; - map io_map; - bool el_dir_forward; - block_t el_pos; - utime_t el_stop; - - public: - ElevatorQueue(BlockDevice *bd, const char *d) : - bdev(bd), dev(d), - el_dir_forward(false), - el_pos(0) {} - void submit_io(biovec *b) { - b->in_queue = this; - 
assert(io_map.count(b->start) == 0); - io_map[b->start] = b; - } - void cancel_io(biovec *b) { - assert(b->in_queue == this); - assert(io_map.count(b->start) && - io_map[b->start] == b); - io_map.erase(b->start); - b->in_queue = 0; - } - int dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& locked); - int size() { - return io_map.size(); - } - }; - - /* - * BarrierQueue - lets you specify io "barriers" - * barrier() - force completion of all prior IOs before - * future ios are started. - * bump() - must be called after cancel_io to properly - * detect empty subqueue. - */ - class BarrierQueue : public Queue { - BlockDevice *bdev; - const char *dev; - list qls; - public: - BarrierQueue(BlockDevice *bd, const char *d) : bdev(bd), dev(d) { - barrier(); - } - ~BarrierQueue() { - for (list::iterator p = qls.begin(); - p != qls.end(); - ++p) - delete *p; - qls.clear(); - } - int size() { - // this isn't perfectly accurate. - if (!qls.empty()) - return qls.front()->size(); - return 0; - } - void submit_io(biovec *b) { - assert(!qls.empty()); - qls.back()->submit_io(b); - } - void cancel_io(biovec *b) { - assert(0); // shouldn't happen. - } - int dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& locked); - void barrier(); - bool bump(); - }; - - - private: - string dev; // my device file - int fd; - block_t num_blocks; - - Mutex lock; - - /** the root io queue. - * i current assumeit's a barrier queue,but this can be changed - * with some minor rearchitecting. - */ - BarrierQueue root_queue; - - kicker *idle_kicker; // not used.. - - /* io_block_lock - block ranges current dispatched to kernel - * once a bio is dispatched, it cannot be canceled, so an overlapping - * io and be submitted. the overlapping io cannot be dispatched - * to the kernel, however, until the original io finishes, or else - * there will be a race condition. 
- */ - interval_set io_block_lock; // blocks currently dispatched to kernel - - // io threads - Cond io_wakeup; - bool io_stop; - int io_threads_started, io_threads_running; - - void *io_thread_entry(); - - class IOThread : public Thread { - BlockDevice *dev; - public: - IOThread(BlockDevice *d) : dev(d) {} - void *entry() { return (void*)dev->io_thread_entry(); } - } ; - - vector io_threads; - - // private io interface - int open_fd(); // get an fd (for a thread) - - void _submit_io(biovec *b); - int _cancel_io(biovec *bio); - void do_io(int fd, list& biols); // called by an io thread - - // low level io - int _read(int fd, block_t bno, unsigned num, bufferlist& bl); - int _write(int fd, unsigned bno, unsigned num, bufferlist& bl); - - - // completion callback queue - Mutex complete_lock; - Cond complete_wakeup; - list complete_queue; - int complete_queue_len; - - void finish_io(biovec *bio); - - // complete thread - void *complete_thread_entry(); - class CompleteThread : public Thread { - BlockDevice *dev; - public: - CompleteThread(BlockDevice *d) : dev(d) {} - void *entry() { return (void*)dev->complete_thread_entry(); } - } complete_thread; - - - public: - BlockDevice(const char *d) : - dev(d), fd(0), num_blocks(0), - root_queue(this, dev.c_str()), - idle_kicker(0), - io_stop(false), io_threads_started(0), io_threads_running(0), - complete_queue_len(0), - complete_thread(this) { } - ~BlockDevice() { - if (fd > 0) close(); - } - - // get size in blocks - block_t get_num_blocks(); - const char *get_device_name() const { return dev.c_str(); } - - // open/close - int open(kicker *idle = 0); - int close(); - - // state stuff - bool is_idle() { - lock.Lock(); - bool idle = (io_threads_running == 0) && root_queue.empty(); - lock.Unlock(); - return idle; - } - void barrier() { - lock.Lock(); - root_queue.barrier(); - lock.Unlock(); - } - - // ** blocking interface ** - - // read - int read(block_t bno, unsigned num, bufferptr& bptr, char *n=0) { - bufferlist bl; - 
bl.push_back(bptr); - return read(bno, num, bl, n); - } - int read(block_t bno, unsigned num, bufferlist& bl, char *n=0) { - Cond c; - biovec bio(biovec::IO_READ, bno, num, bl, &c, n); - - lock.Lock(); - _submit_io(&bio); - barrier(); // need this, to prevent starvation! - while (!bio.done) - c.Wait(lock); - lock.Unlock(); - return bio.rval; - } - - // write - int write(unsigned bno, unsigned num, bufferptr& bptr, char *n=0) { - bufferlist bl; - bl.push_back(bptr); - return write(bno, num, bl, n); - } - int write(unsigned bno, unsigned num, bufferlist& bl, char *n=0) { - Cond c; - biovec bio(biovec::IO_WRITE, bno, num, bl, &c, n); - - lock.Lock(); - _submit_io(&bio); - barrier(); // need this, to prevent starvation! - while (!bio.done) - c.Wait(lock); - lock.Unlock(); - return bio.rval; - } - - // ** non-blocking interface ** - ioh_t read(block_t bno, unsigned num, bufferlist& bl, callback *fin, char *n=0) { - biovec *pbio = new biovec(biovec::IO_READ, bno, num, bl, fin, n); - lock.Lock(); - _submit_io(pbio); - lock.Unlock(); - return (ioh_t)pbio; - } - ioh_t write(block_t bno, unsigned num, bufferlist& bl, callback *fin, char *n=0) { - biovec *pbio = new biovec(biovec::IO_WRITE, bno, num, bl, fin, n); - lock.Lock(); - _submit_io(pbio); - lock.Unlock(); - return (ioh_t)pbio; - } - int cancel_io(ioh_t ioh); - -}; - - - - -#endif diff --git a/branches/marnberg/quota/ebofs/BufferCache.cc b/branches/marnberg/quota/ebofs/BufferCache.cc deleted file mode 100644 index 4ad22b3a5d0fb..0000000000000 --- a/branches/marnberg/quota/ebofs/BufferCache.cc +++ /dev/null @@ -1,1147 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "BufferCache.h" -#include "Onode.h" - - -/*********** BufferHead **************/ - - -#undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs.bh." - - - - - - -/************ ObjectCache **************/ - - -#undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs.oc." - - - -void ObjectCache::rx_finish(ioh_t ioh, block_t start, block_t length, bufferlist& bl) -{ - list waiters; - - dout(10) << "rx_finish " << start << "~" << length << endl; - for (map::iterator p = data.lower_bound(start); - p != data.end(); - p++) { - BufferHead *bh = p->second; - dout(10) << "rx_finish ?" << *bh << endl; - assert(p->first == bh->start()); - - // past? - if (p->first >= start+length) break; - if (bh->end() > start+length) break; // past - - assert(p->first >= start); - assert(bh->end() <= start+length); - - dout(10) << "rx_finish !" << *bh << endl; - - if (bh->rx_ioh == ioh) - bh->rx_ioh = 0; - - if (bh->is_rx()) { - assert(bh->get_version() == 0); - assert(bh->end() <= start+length); - assert(bh->start() >= start); - dout(10) << "rx_finish rx -> clean on " << *bh << endl; - bh->data.substr_of(bl, (bh->start()-start)*EBOFS_BLOCK_SIZE, bh->length()*EBOFS_BLOCK_SIZE); - bc->mark_clean(bh); - } - else if (bh->is_partial()) { - dout(10) << "rx_finish partial -> tx on " << *bh << endl; - - if (1) { - // double-check what block i am - vector exv; - on->map_extents(bh->start(), 1, exv); - assert(exv.size() == 1); - block_t cur_block = exv[0].start; - assert(cur_block == bh->partial_tx_to); - } - - // ok, cancel my low-level partial (since we're still here, and can bh_write ourselves) - bc->cancel_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial_tx_epoch ); - - // apply partial to myself - assert(bh->data.length() == 0); - bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - bh->data.push_back( bp ); - bh->data.copy_in(0, EBOFS_BLOCK_SIZE, bl); - bh->apply_partial(); - - // write "normally" - 
bc->mark_dirty(bh); - bc->bh_write(on, bh, bh->partial_tx_to);//cur_block); - - // clean up a bit - bh->partial_tx_to = 0; - bh->partial_tx_epoch = 0; - bh->partial.clear(); - } - else { - dout(10) << "rx_finish ignoring status on (dirty|tx|clean) " << *bh << endl; - assert(bh->is_dirty() || // was overwritten - bh->is_tx() || // was overwritten and queued - bh->is_clean()); // was overwritten, queued, _and_ flushed to disk - } - - // trigger waiters - for (map >::iterator p = bh->waitfor_read.begin(); - p != bh->waitfor_read.end(); - p++) { - assert(p->first >= bh->start() && p->first < bh->end()); - waiters.splice(waiters.begin(), p->second); - } - bh->waitfor_read.clear(); - } - - finish_contexts(waiters); -} - - -void ObjectCache::tx_finish(ioh_t ioh, block_t start, block_t length, - version_t version, version_t epoch) -{ - dout(10) << "tx_finish " << start << "~" << length << " v" << version << endl; - for (map::iterator p = data.lower_bound(start); - p != data.end(); - p++) { - BufferHead *bh = p->second; - dout(30) << "tx_finish ?bh " << *bh << endl; - assert(p->first == bh->start()); - - // past? - if (p->first >= start+length) break; - - if (bh->tx_ioh == ioh) - bh->tx_ioh = 0; - - if (!bh->is_tx()) { - dout(10) << "tx_finish bh not marked tx, skipping" << endl; - continue; - } - assert(bh->is_tx()); - - if (version == bh->version) { - dout(10) << "tx_finish tx -> clean on " << *bh << endl; - assert(bh->end() <= start+length); - bh->set_last_flushed(version); - bc->mark_clean(bh); - } else { - dout(10) << "tx_finish leaving tx, " << bh->version << " > " << version - << " on " << *bh << endl; - assert(bh->version > version); - } - } -} - - - -/* - * return any bh's that are (partially) in this range that are TX. - */ -int ObjectCache::find_tx(block_t start, block_t len, - list& tx) -{ - map::iterator p = data.lower_bound(start); - - block_t cur = start; - block_t left = len; - - /* don't care about overlap, we want things _fully_ in start~len. 
- if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - */ - - while (left > 0) { - assert(cur+left == start+len); - - // at end? - if (p == data.end()) - break; - - if (p->first <= cur) { - // have it (or part of it) - BufferHead *e = p->second; - - if (e->end() <= start+len && - e->is_tx()) - tx.push_back(e); - - block_t lenfromcur = MIN(e->end() - cur, left); - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; // more? - } else if (p->first > cur) { - // gap.. miss - block_t next = p->first; - left -= (next-cur); - cur = next; - continue; - } - else - assert(0); - } - - return 0; -} - - -int ObjectCache::try_map_read(block_t start, block_t len) -{ - map::iterator p = data.lower_bound(start); - - block_t cur = start; - block_t left = len; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - - int num_missing = 0; - - while (left > 0) { - // at end? - if (p == data.end()) { - // rest is a miss. - vector exv; - on->map_extents(cur, - left, // no prefetch here! - exv); - - num_missing += exv.size(); - left = 0; - cur = start+len; - break; - } - - if (p->first <= cur) { - // have it (or part of it) - BufferHead *e = p->second; - - if (e->is_clean() || - e->is_dirty() || - e->is_tx()) { - dout(20) << "try_map_read hit " << *e << endl; - } - else if (e->is_rx()) { - dout(20) << "try_map_read rx " << *e << endl; - num_missing++; - } - else if (e->is_partial()) { - dout(-20) << "try_map_read partial " << *e << endl; - num_missing++; - } - else { - dout(0) << "try_map_read got unexpected " << *e << endl; - assert(0); - } - - block_t lenfromcur = MIN(e->end() - cur, left); - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; // more? - } else if (p->first > cur) { - // gap.. 
miss - block_t next = p->first; - vector exv; - on->map_extents(cur, - MIN(next-cur, left), // no prefetch - exv); - - dout(-20) << "try_map_read gap of " << p->first-cur << " blocks, " - << exv.size() << " extents" << endl; - num_missing += exv.size(); - left -= (p->first - cur); - cur = p->first; - continue; // more? - } - else - assert(0); - } - - assert(left == 0); - assert(cur == start+len); - return num_missing; -} - - - - - -/* - * map a range of blocks into buffer_heads. - * - create missing buffer_heads as necessary. - * - fragment along disk extent boundaries - */ -int ObjectCache::map_read(block_t start, block_t len, - map& hits, - map& missing, - map& rx, - map& partial) { - - map::iterator p = data.lower_bound(start); - - block_t cur = start; - block_t left = len; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - - while (left > 0) { - // at end? - if (p == data.end()) { - // rest is a miss. - vector exv; - //on->map_extents(cur, left, exv); // we might consider some prefetch here. - on->map_extents(cur, - //MIN(left + g_conf.ebofs_max_prefetch, // prefetch - //on->object_blocks-cur), - left, // no prefetch - exv); - for (unsigned i=0; i 0; i++) { - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( exv[i].length ); - bc->add_bh(n); - missing[cur] = n; - dout(20) << "map_read miss " << left << " left, " << *n << endl; - cur += MIN(left,exv[i].length); - left -= MIN(left,exv[i].length); - } - assert(left == 0); - assert(cur == start+len); - break; - } - - if (p->first <= cur) { - // have it (or part of it) - BufferHead *e = p->second; - - if (e->is_clean() || - e->is_dirty() || - e->is_tx()) { - hits[cur] = e; // readable! - dout(20) << "map_read hit " << *e << endl; - bc->touch(e); - } - else if (e->is_rx()) { - rx[cur] = e; // missing, not readable. 
- dout(20) << "map_read rx " << *e << endl; - } - else if (e->is_partial()) { - partial[cur] = e; - dout(20) << "map_read partial " << *e << endl; - } - else { - dout(0) << "map_read ??? got unexpected " << *e << endl; - assert(0); - } - - block_t lenfromcur = MIN(e->end() - cur, left); - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; // more? - } else if (p->first > cur) { - // gap.. miss - block_t next = p->first; - vector exv; - on->map_extents(cur, - //MIN(next-cur, MIN(left + g_conf.ebofs_max_prefetch, // prefetch - // on->object_blocks-cur)), - MIN(next-cur, left), // no prefetch - exv); - - for (unsigned i=0; i0; i++) { - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( exv[i].length ); - bc->add_bh(n); - missing[cur] = n; - cur += MIN(left, n->length()); - left -= MIN(left, n->length()); - dout(20) << "map_read gap " << *n << endl; - } - continue; // more? - } - else - assert(0); - } - - assert(left == 0); - assert(cur == start+len); - return 0; -} - - -/* - * map a range of pages on an object's buffer cache. - * - * - break up bufferheads that don't fall completely within the range - * - cancel rx ops we obsolete. - * - resubmit rx ops if we split bufferheads - * - * - leave potentially obsoleted tx ops alone (for now) - * - don't worry about disk extent boundaries (yet) - */ -int ObjectCache::map_write(block_t start, block_t len, - interval_set& alloc, - map& hits, - version_t super_epoch) -{ - map::iterator p = data.lower_bound(start); - - dout(10) << "map_write " << *on << " " << start << "~" << len << " ... alloc " << alloc << endl; - // p->first >= start - - block_t cur = start; - block_t left = len; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. 
- } - - //dump(); - - while (left > 0) { - // max for this bh (bc of (re)alloc on disk) - block_t max = left; - bool newalloc = false; - - // based on alloc/no-alloc boundary ... - if (alloc.contains(cur, left)) { - if (alloc.contains(cur)) { - block_t ends = alloc.end_after(cur); - max = MIN(left, ends-cur); - newalloc = true; - } else { - if (alloc.starts_after(cur)) { - block_t st = alloc.start_after(cur); - max = MIN(left, st-cur); - } - } - } - - // based on disk extent boundary ... - vector exv; - on->map_extents(cur, max, exv); - if (exv.size() > 1) - max = exv[0].length; - - if (newalloc) { - dout(10) << "map_write " << cur << "~" << max << " is new alloc on disk" << endl; - } else { - dout(10) << "map_write " << cur << "~" << max << " keeps old alloc on disk" << endl; - } - - // at end? - if (p == data.end()) { - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( max ); - bc->add_bh(n); - hits[cur] = n; - left -= max; - cur += max; - continue; - } - - dout(10) << "p is " << *p->second << endl; - - - if (p->first <= cur) { - BufferHead *bh = p->second; - dout(10) << "map_write bh " << *bh << " intersected" << endl; - - if (p->first < cur) { - if (cur+max >= p->first+p->second->length()) { - // we want right bit (one splice) - if (bh->is_rx() && bc->bh_cancel_read(bh)) { - BufferHead *right = bc->split(bh, cur); - bc->bh_read(on, bh); // reread left bit - bh = right; - } else if (bh->is_tx() && !newalloc && bc->bh_cancel_write(bh, super_epoch)) { - BufferHead *right = bc->split(bh, cur); - bc->bh_write(on, bh); // rewrite left bit - bh = right; - } else { - bh = bc->split(bh, cur); // just split it - } - p++; - assert(p->second == bh); - } else { - // we want middle bit (two splices) - if (bh->is_rx() && bc->bh_cancel_read(bh)) { - BufferHead *middle = bc->split(bh, cur); - bc->bh_read(on, bh); // reread left - p++; - assert(p->second == middle); - BufferHead *right = bc->split(middle, cur+max); - bc->bh_read(on, right); // reread 
right - bh = middle; - } else if (bh->is_tx() && !newalloc && bc->bh_cancel_write(bh, super_epoch)) { - BufferHead *middle = bc->split(bh, cur); - bc->bh_write(on, bh); // redo left - p++; - assert(p->second == middle); - BufferHead *right = bc->split(middle, cur+max); - bc->bh_write(on, right); // redo right - bh = middle; - } else { - BufferHead *middle = bc->split(bh, cur); - p++; - assert(p->second == middle); - bc->split(middle, cur+max); - bh = middle; - } - } - } else if (p->first == cur) { - if (p->second->length() <= max) { - // whole bufferhead, piece of cake. - } else { - // we want left bit (one splice) - if (bh->is_rx() && bc->bh_cancel_read(bh)) { - BufferHead *right = bc->split(bh, cur+max); - bc->bh_read(on, right); // re-rx the right bit - } else if (bh->is_tx() && !newalloc && bc->bh_cancel_write(bh, super_epoch)) { - BufferHead *right = bc->split(bh, cur+max); - bc->bh_write(on, right); // re-tx the right bit - } else { - bc->split(bh, cur+max); // just split - } - } - } - - // try to cancel tx? - if (bh->is_tx() && !newalloc) bc->bh_cancel_write(bh, super_epoch); - - // put in our map - hits[cur] = bh; - - // keep going. - block_t lenfromcur = bh->end() - cur; - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; - } else { - // gap! - block_t next = p->first; - block_t glen = MIN(next-cur, max); - dout(10) << "map_write gap " << cur << "~" << glen << endl; - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( glen ); - bc->add_bh(n); - hits[cur] = n; - - cur += glen; - left -= glen; - continue; // more? - } - } - - assert(left == 0); - assert(cur == start+len); - return 0; -} - -/* don't need this. -int ObjectCache::scan_versions(block_t start, block_t len, - version_t& low, version_t& high) -{ - map::iterator p = data.lower_bound(start); - // p->first >= start - - if (p != data.begin() && p->first > start) { - p--; // might overlap? - if (p->first + p->second->length() <= start) - p++; // doesn't overlap. 
- } - if (p->first >= start+len) - return -1; // to the right. no hits. - - // start - low = high = p->second->get_version(); - - for (p++; p != data.end(); p++) { - // past? - if (p->first >= start+len) break; - - const version_t v = p->second->get_version(); - if (low > v) low = v; - if (high < v) high = v; - } - - return 0; -} -*/ - -void ObjectCache::touch_bottom(block_t bstart, block_t blast) -{ - for (map::iterator p = data.lower_bound(bstart); - p != data.end(); - ++p) { - BufferHead *bh = p->second; - - // don't trim unless it's entirely in our range - if (bh->start() < bstart) continue; - if (bh->end() > blast) break; - - dout(12) << "moving " << *bh << " to bottom of lru" << endl; - bc->touch_bottom(bh); // move to bottom of lru list - } -} - - -void ObjectCache::truncate(block_t blocks, version_t super_epoch) -{ - dout(7) << "truncate " << object_id - << " " << blocks << " blocks" - << endl; - - while (!data.empty()) { - block_t bhoff = data.rbegin()->first; - BufferHead *bh = data.rbegin()->second; - - if (bh->end() <= blocks) break; - - bool uncom = on->uncommitted.contains(bh->start(), bh->length()); - dout(10) << "truncate " << *bh << " uncom " << uncom - << " of " << on->uncommitted - << endl; - - if (bhoff < blocks) { - // we want right bit (one splice) - if (bh->is_rx() && bc->bh_cancel_read(bh)) { - BufferHead *right = bc->split(bh, blocks); - bc->bh_read(on, bh); // reread left bit - bh = right; - } else if (bh->is_tx() && uncom && bc->bh_cancel_write(bh, super_epoch)) { - BufferHead *right = bc->split(bh, blocks); - bc->bh_write(on, bh); // rewrite left bit - bh = right; - } else { - bh = bc->split(bh, blocks); // just split it - } - // no worries about partials up here, they're always 1 block (and thus never split) - } else { - // whole thing - // cancel any pending/queued io, if possible. 
- if (bh->is_rx()) - bc->bh_cancel_read(bh); - if (bh->is_tx() && uncom) - bc->bh_cancel_write(bh, super_epoch); - if (bh->shadow_of) { - dout(10) << "truncate " << *bh << " unshadowing " << *bh->shadow_of << endl; - // shadow - bh->shadow_of->remove_shadow(bh); - if (bh->is_partial()) - bc->cancel_shadow_partial(bh->rx_from.start, bh); - } else { - // normal - if (bh->is_partial() && uncom) - bc->bh_cancel_partial_write(bh); - } - } - - for (map >::iterator p = bh->waitfor_read.begin(); - p != bh->waitfor_read.end(); - p++) { - finish_contexts(p->second, -1); - } - - bc->remove_bh(bh); - delete bh; - } -} - - -void ObjectCache::clone_to(Onode *other) -{ - ObjectCache *ton = 0; - - for (map::iterator p = data.begin(); - p != data.end(); - p++) { - BufferHead *bh = p->second; - dout(10) << "clone_to ? " << *bh << endl; - if (bh->is_dirty() || bh->is_tx() || bh->is_partial()) { - // dup dirty or tx bh's - if (!ton) - ton = other->get_oc(bc); - BufferHead *nbh = new BufferHead(ton); - nbh->set_start( bh->start() ); - nbh->set_length( bh->length() ); - nbh->data = bh->data; // just copy refs to underlying buffers. - bc->add_bh(nbh); - - if (bh->is_partial()) { - dout(0) << "clone_to PARTIAL FIXME NOT FULLY IMPLEMENTED ******" << endl; - nbh->partial = bh->partial; - bc->mark_partial(nbh); - // register as shadow_partial - bc->add_shadow_partial(bh->rx_from.start, nbh); - } else { - // clean buffer will shadow - bh->add_shadow(nbh); - bc->mark_clean(nbh); - } - - dout(10) << "clone_to dup " << *bh << " -> " << *nbh << endl; - } - } -} - - - -/************** BufferCache ***************/ - -#undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs.bc." 
- - - -BufferHead *BufferCache::split(BufferHead *orig, block_t after) -{ - dout(20) << "split " << *orig << " at " << after << endl; - - // split off right - BufferHead *right = new BufferHead(orig->get_oc()); - right->set_version(orig->get_version()); - right->epoch_modified = orig->epoch_modified; - right->last_flushed = orig->last_flushed; - right->set_state(orig->get_state()); - - block_t newleftlen = after - orig->start(); - right->set_start( after ); - right->set_length( orig->length() - newleftlen ); - - // shorten left - stat_sub(orig); - orig->set_length( newleftlen ); - stat_add(orig); - - // add right - add_bh(right); - - // adjust rx_from - if (orig->is_rx()) { - right->rx_from = orig->rx_from; - orig->rx_from.length = newleftlen; - right->rx_from.length -= newleftlen; - right->rx_from.start += newleftlen; - } - - // dup shadows - for (set::iterator p = orig->shadows.begin(); - p != orig->shadows.end(); - ++p) - right->add_shadow(*p); - - // split buffers too - bufferlist bl; - bl.claim(orig->data); - if (bl.length()) { - assert(bl.length() == (orig->length()+right->length())*EBOFS_BLOCK_SIZE); - right->data.substr_of(bl, orig->length()*EBOFS_BLOCK_SIZE, right->length()*EBOFS_BLOCK_SIZE); - orig->data.substr_of(bl, 0, orig->length()*EBOFS_BLOCK_SIZE); - } - - // move read waiters - if (!orig->waitfor_read.empty()) { - map >::iterator o, p = orig->waitfor_read.end(); - p--; - while (p != orig->waitfor_read.begin()) { - if (p->first < right->start()) break; - dout(0) << "split moving waiters at block " << p->first << " to right bh" << endl; - right->waitfor_read[p->first].swap( p->second ); - o = p; - p--; - orig->waitfor_read.erase(o); - } - } - - dout(20) << "split left is " << *orig << endl; - dout(20) << "split right is " << *right << endl; - return right; -} - - -void BufferCache::bh_read(Onode *on, BufferHead *bh, block_t from) -{ - dout(10) << "bh_read " << *on << " on " << *bh << endl; - - if (bh->is_missing()) { - mark_rx(bh); - } else { - 
assert(bh->is_partial()); - } - - // get extent. there should be only one! - vector exv; - on->map_extents(bh->start(), bh->length(), exv); - assert(exv.size() == 1); - Extent ex = exv[0]; - - if (from) { // force behavior, used for reading partials - dout(10) << "bh_read forcing read from block " << from << " (for a partial)" << endl; - ex.start = from; - ex.length = 1; - } - - // this should be empty!! - assert(bh->rx_ioh == 0); - - dout(20) << "bh_read " << *on << " " << *bh << " from " << ex << endl; - - C_OC_RxFinish *fin = new C_OC_RxFinish(ebofs_lock, on->oc, - bh->start(), bh->length(), - ex.start); - - //bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), fin->bl); // new buffers! - fin->bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - - bh->rx_ioh = dev.read(ex.start, ex.length, fin->bl, - fin); - bh->rx_from = ex; - on->oc->get(); - -} - -bool BufferCache::bh_cancel_read(BufferHead *bh) -{ - if (bh->rx_ioh && dev.cancel_io(bh->rx_ioh) >= 0) { - dout(10) << "bh_cancel_read on " << *bh << endl; - bh->rx_ioh = 0; - mark_missing(bh); - int l = bh->oc->put(); - assert(l); - return true; - } - return false; -} - -void BufferCache::bh_write(Onode *on, BufferHead *bh, block_t shouldbe) -{ - dout(10) << "bh_write " << *on << " on " << *bh << " in epoch " << bh->epoch_modified << endl; - assert(bh->get_version() > 0); - - assert(bh->is_dirty()); - mark_tx(bh); - - // get extents - vector exv; - on->map_extents(bh->start(), bh->length(), exv); - assert(exv.size() == 1); - Extent ex = exv[0]; - - if (shouldbe) - assert(ex.length == 1 && ex.start == shouldbe); - - dout(20) << "bh_write " << *on << " " << *bh << " to " << ex << endl; - - //assert(bh->tx_ioh == 0); - - assert(bh->get_last_flushed() < bh->get_version()); - - bh->tx_block = ex.start; - bh->tx_ioh = dev.write(ex.start, ex.length, bh->data, - new C_OC_TxFinish(ebofs_lock, on->oc, - bh->start(), bh->length(), - bh->get_version(), - bh->epoch_modified), - "bh_write"); - - 
on->oc->get(); - inc_unflushed( EBOFS_BC_FLUSH_BHWRITE, bh->epoch_modified ); - - /* - // assert: no partials on the same block - // hose any partial on the same block - if (bh->partial_write.count(ex.start)) { - dout(10) << "bh_write hosing parital write on same block " << ex.start << " " << *bh << endl; - dec_unflushed( bh->partial_write[ex.start].epoch ); - bh->partial_write.erase(ex.start); - } - */ -} - - -bool BufferCache::bh_cancel_write(BufferHead *bh, version_t cur_epoch) -{ - if (bh->tx_ioh && dev.cancel_io(bh->tx_ioh) >= 0) { - dout(10) << "bh_cancel_write on " << *bh << endl; - bh->tx_ioh = 0; - mark_dirty(bh); - - assert(bh->epoch_modified == cur_epoch); - assert(bh->epoch_modified > 0); - dec_unflushed( EBOFS_BC_FLUSH_BHWRITE, bh->epoch_modified ); // assert.. this should be the same epoch! - - int l = bh->oc->put(); - assert(l); - return true; - } - return false; -} - -void BufferCache::tx_finish(ObjectCache *oc, - ioh_t ioh, block_t start, block_t length, - version_t version, version_t epoch) -{ - ebofs_lock.Lock(); - - // finish oc - if (oc->put() == 0) { - delete oc; - } else - oc->tx_finish(ioh, start, length, version, epoch); - - // update unflushed counter - assert(get_unflushed(EBOFS_BC_FLUSH_BHWRITE, epoch) > 0); - dec_unflushed(EBOFS_BC_FLUSH_BHWRITE, epoch); - - ebofs_lock.Unlock(); -} - -void BufferCache::rx_finish(ObjectCache *oc, - ioh_t ioh, block_t start, block_t length, - block_t diskstart, - bufferlist& bl) -{ - ebofs_lock.Lock(); - dout(10) << "rx_finish ioh " << ioh << " on " << start << "~" << length - << ", at device block " << diskstart << endl; - - // oc - if (oc->put() == 0) - delete oc; - else - oc->rx_finish(ioh, start, length, bl); - - // finish any partials? 
- // note: these are partials that were re-written after a commit, - // or for whom the OC was destroyed (eg truncated after a commit) - map >::iterator sp = partial_write.lower_bound(diskstart); - while (sp != partial_write.end()) { - if (sp->first >= diskstart+length) break; - assert(sp->first >= diskstart); - - block_t pblock = sp->first; - map writes; - writes.swap( sp->second ); - - map >::iterator t = sp; - sp++; - partial_write.erase(t); - - for (map::iterator p = writes.begin(); - p != writes.end(); - p++) { - dout(10) << "rx_finish partial from " << pblock << " -> " << p->first - << " for epoch " << p->second.epoch - //<< " (bh.epoch_modified is now " << bh->epoch_modified << ")" - << endl; - // this had better be a past epoch - //assert(p->epoch == epoch_modified - 1); // ?? - - // make the combined block - bufferlist combined; - bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - combined.push_back( bp ); - combined.copy_in((pblock-diskstart)*EBOFS_BLOCK_SIZE, (pblock-diskstart+1)*EBOFS_BLOCK_SIZE, bl); - BufferHead::apply_partial( combined, p->second.partial ); - - // write it! - dev.write( pblock, 1, combined, - new C_OC_PartialTxFinish( this, p->second.epoch ), - "finish_partials"); - } - } - - // shadow partials? 
- { - list waiters; - map >::iterator sp = shadow_partials.lower_bound(diskstart); - while (sp != shadow_partials.end()) { - if (sp->first >= diskstart+length) break; - assert(sp->first >= diskstart); - - block_t pblock = sp->first; - set ls; - ls.swap( sp->second ); - - map >::iterator t = sp; - sp++; - shadow_partials.erase(t); - - for (set::iterator p = ls.begin(); - p != ls.end(); - ++p) { - BufferHead *bh = *p; - dout(10) << "rx_finish applying shadow_partial for " << pblock - << " to " << *bh << endl; - bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - bh->data.clear(); - bh->data.push_back( bp ); - bh->data.copy_in((pblock-diskstart)*EBOFS_BLOCK_SIZE, - (pblock-diskstart+1)*EBOFS_BLOCK_SIZE, - bl); - bh->apply_partial(); - bh->set_state(BufferHead::STATE_CLEAN); - - // trigger waiters - for (map >::iterator p = bh->waitfor_read.begin(); - p != bh->waitfor_read.end(); - p++) { - assert(p->first >= bh->start() && p->first < bh->end()); - waiters.splice(waiters.begin(), p->second); - } - bh->waitfor_read.clear(); - } - } - - // kick waiters - finish_contexts(waiters); - } - - // done. 
- ebofs_lock.Unlock(); -} - -void BufferCache::partial_tx_finish(version_t epoch) -{ - ebofs_lock.Lock(); - - dout(10) << "partial_tx_finish in epoch " << epoch << endl; - - // update unflushed counter - assert(get_unflushed(EBOFS_BC_FLUSH_PARTIAL, epoch) > 0); - dec_unflushed(EBOFS_BC_FLUSH_PARTIAL, epoch); - - ebofs_lock.Unlock(); -} - - - - -void BufferCache::bh_queue_partial_write(Onode *on, BufferHead *bh) -{ - assert(bh->get_version() > 0); - - assert(bh->is_partial()); - assert(bh->length() == 1); - - // get the block no - vector exv; - on->map_extents(bh->start(), bh->length(), exv); - assert(exv.size() == 1); - block_t b = exv[0].start; - assert(exv[0].length == 1); - bh->partial_tx_to = exv[0].start; - bh->partial_tx_epoch = bh->epoch_modified; - - dout(10) << "bh_queue_partial_write " << *on << " on " << *bh << " block " << b << " epoch " << bh->epoch_modified << endl; - - - // copy map state, queue for this block - assert(bh->rx_from.length == 1); - queue_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial, bh->partial_tx_epoch ); -} - -void BufferCache::bh_cancel_partial_write(BufferHead *bh) -{ - assert(bh->is_partial()); - assert(bh->length() == 1); - - cancel_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial_tx_epoch ); -} - - -void BufferCache::queue_partial(block_t from, block_t to, - map& partial, version_t epoch) -{ - dout(10) << "queue_partial " << from << " -> " << to - << " in epoch " << epoch - << endl; - - if (partial_write[from].count(to)) { - // this should be in the same epoch. - assert( partial_write[from][to].epoch == epoch); - assert(0); // actually.. no! 
- } else { - inc_unflushed( EBOFS_BC_FLUSH_PARTIAL, epoch ); - } - - partial_write[from][to].partial = partial; - partial_write[from][to].epoch = epoch; -} - -void BufferCache::cancel_partial(block_t from, block_t to, version_t epoch) -{ - assert(partial_write.count(from)); - assert(partial_write[from].count(to)); - assert(partial_write[from][to].epoch == epoch); - - dout(10) << "cancel_partial " << from << " -> " << to - << " (was epoch " << partial_write[from][to].epoch << ")" - << endl; - - partial_write[from].erase(to); - if (partial_write[from].empty()) - partial_write.erase(from); - - dec_unflushed( EBOFS_BC_FLUSH_PARTIAL, epoch ); -} - - -void BufferCache::add_shadow_partial(block_t from, BufferHead *bh) -{ - dout(10) << "add_shadow_partial from " << from << " " << *bh << endl; - shadow_partials[from].insert(bh); -} - -void BufferCache::cancel_shadow_partial(block_t from, BufferHead *bh) -{ - dout(10) << "cancel_shadow_partial from " << from << " " << *bh << endl; - shadow_partials[from].erase(bh); -} diff --git a/branches/marnberg/quota/ebofs/BufferCache.h b/branches/marnberg/quota/ebofs/BufferCache.h deleted file mode 100644 index 563b3e5791c21..0000000000000 --- a/branches/marnberg/quota/ebofs/BufferCache.h +++ /dev/null @@ -1,697 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __EBOFS_BUFFERCACHE_H -#define __EBOFS_BUFFERCACHE_H - -#include "include/lru.h" -#include "include/Context.h" - -#include "common/Clock.h" - -#include "types.h" -#include "BlockDevice.h" - -#include "include/interval_set.h" - -class ObjectCache; -class BufferCache; -class Onode; - -class BufferHead : public LRUObject { - public: - /* - * - buffer_heads should always break across disk extent boundaries - * - partial buffer_heads are always 1 block. - */ - const static int STATE_MISSING = 0; // missing; data is on disk, but not loaded. - const static int STATE_CLEAN = 1; // Rw clean - const static int STATE_DIRTY = 2; // RW dirty - const static int STATE_TX = 3; // Rw flushing to disk - const static int STATE_RX = 4; // w reading from disk - const static int STATE_PARTIAL = 5; // reading from disk, + partial content map. always 1 block. - - public: - ObjectCache *oc; - - bufferlist data; - - ioh_t rx_ioh; // - Extent rx_from; - ioh_t tx_ioh; // - block_t tx_block; - block_t partial_tx_to; - version_t partial_tx_epoch; - - map partial; // partial dirty content overlayed onto incoming data - - map< block_t, list > waitfor_read; - - set shadows; // shadow bh's that clone()ed me. 
- BufferHead* shadow_of; - - private: - int ref; - int state; - - public: - version_t epoch_modified; - - version_t version; // current version in cache - version_t last_flushed; // last version flushed to disk - - Extent object_loc; // block position _in_object_ - - utime_t dirty_stamp; - - bool want_to_expire; // wants to be at bottom of lru - - public: - BufferHead(ObjectCache *o) : - oc(o), //cancellable_ioh(0), tx_epoch(0), - rx_ioh(0), tx_ioh(0), tx_block(0), partial_tx_to(0), partial_tx_epoch(0), - shadow_of(0), - ref(0), state(STATE_MISSING), epoch_modified(0), version(0), last_flushed(0), - want_to_expire(false) - {} - ~BufferHead() { - unpin_shadows(); - } - - ObjectCache *get_oc() { return oc; } - - int get() { - assert(ref >= 0); - if (ref == 0) lru_pin(); - return ++ref; - } - int put() { - assert(ref > 0); - if (ref == 1) lru_unpin(); - --ref; - return ref; - } - - block_t start() { return object_loc.start; } - void set_start(block_t s) { object_loc.start = s; } - block_t length() { return object_loc.length; } - void set_length(block_t l) { object_loc.length = l; } - block_t end() { return start() + length(); } - block_t last() { return end()-1; } - - version_t get_version() { return version; } - void set_version(version_t v) { version = v; } - version_t get_last_flushed() { return last_flushed; } - void set_last_flushed(version_t v) { - if (v <= last_flushed) cout << "last_flushed begin set to " << v << ", was " << last_flushed << endl; - assert(v > last_flushed); - last_flushed = v; - } - - utime_t get_dirty_stamp() { return dirty_stamp; } - void set_dirty_stamp(utime_t t) { dirty_stamp = t; } - - void set_state(int s) { - if (s == STATE_PARTIAL || s == STATE_RX || s == STATE_TX) get(); - if (state == STATE_PARTIAL || state == STATE_RX || state == STATE_TX) put(); - - if ((state == STATE_TX && s != STATE_TX) || - (state == STATE_PARTIAL && s != STATE_PARTIAL)) - unpin_shadows(); - - state = s; - } - int get_state() { return state; } - - bool 
is_missing() { return state == STATE_MISSING; } - bool is_dirty() { return state == STATE_DIRTY; } - bool is_clean() { return state == STATE_CLEAN; } - bool is_tx() { return state == STATE_TX; } - bool is_rx() { return state == STATE_RX; } - bool is_partial() { return state == STATE_PARTIAL; } - - //bool is_partial_writes() { return !partial_write.empty(); } - //void finish_partials(); - //void cancel_partials(); - //void queue_partial_write(block_t b); - - void add_shadow(BufferHead *dup) { - shadows.insert(dup); - dup->shadow_of = this; - dup->get(); - } - void remove_shadow(BufferHead *dup) { - shadows.erase(dup); - dup->shadow_of = 0; - dup->put(); - } - void unpin_shadows() { - for (set::iterator p = shadows.begin(); - p != shadows.end(); - ++p) { - //cout << "unpin shadow " << *p << endl; - (*p)->shadow_of = 0; - (*p)->put(); - } - shadows.clear(); - } - - void copy_partial_substr(off_t start, off_t end, bufferlist& bl) { - map::iterator i = partial.begin(); - - // skip first bits (fully to left) - while ((i->first + i->second.length() < start) && - i != partial.end()) - i++; - assert(i != partial.end()); - assert(i->first <= start); - - // first - unsigned bhoff = MAX(start, i->first) - i->first; - unsigned bhlen = MIN(end-start, i->second.length()); - bl.substr_of( i->second, bhoff, bhlen ); - - off_t pos = i->first + i->second.length(); - - // have continuous to end? - for (i++; i != partial.end(); i++) { - if (pos >= end) break; - assert(pos == i->first); - - pos = i->first + i->second.length(); - - if (pos <= end) { // this whole frag - bl.append( i->second ); - } else { // partial end - unsigned bhlen = end-start-bl.length(); - bufferlist frag; - frag.substr_of( i->second, 0, bhlen ); - bl.claim_append(frag); - break; // done. 
- } - } - - assert(pos >= end); - assert(bl.length() == (unsigned)(end-start)); - } - - bool have_partial_range(off_t start, off_t end) { - map::iterator i = partial.begin(); - - // skip first bits (fully to left) - while ((i->first + i->second.length() < start) && - i != partial.end()) - i++; - if (i == partial.end()) return false; - - // have start? - if (i->first > start) return false; - off_t pos = i->first + i->second.length(); - - // have continuous to end? - for (i++; i != partial.end(); i++) { - assert(pos <= i->first); - if (pos < i->first) return false; - assert(pos == i->first); - pos = i->first + i->second.length(); - if (pos >= end) break; // gone far enough - } - - if (pos >= end) return true; - return false; - } - - bool partial_is_complete(off_t size) { - return have_partial_range( 0, MIN(size, EBOFS_BLOCK_SIZE) ); - //(off_t)(start()*EBOFS_BLOCK_SIZE), - //MIN( size, (off_t)(end()*EBOFS_BLOCK_SIZE) ) ); - } - void apply_partial() { - apply_partial(data, partial); - partial.clear(); - } - static void apply_partial(bufferlist& bl, map& pm) { - assert(bl.length() == (unsigned)EBOFS_BLOCK_SIZE); - //assert(partial_is_complete()); - //cout << "apply_partial" << endl; - for (map::iterator i = pm.begin(); - i != pm.end(); - i++) { - int pos = i->first; - //cout << " frag at opos " << i->first << " bhpos " << pos << " len " << i->second.length() << endl; - bl.copy_in(pos, i->second.length(), i->second); - } - pm.clear(); - } - void add_partial(off_t off, bufferlist& p) { - unsigned len = p.length(); - assert(len <= (unsigned)EBOFS_BLOCK_SIZE); - //assert(off >= (off_t)(start()*EBOFS_BLOCK_SIZE)); - //assert(off + len <= (off_t)(end()*EBOFS_BLOCK_SIZE)); - assert(off >= 0); - assert(off + len <= EBOFS_BLOCK_SIZE); - - // trim any existing that overlaps - for (map::iterator i = partial.begin(); - i != partial.end(); - ) { - if (i->first + i->second.length() <= off) { // before - i++; - continue; - } - if (i->first >= off+len) break; // past affected area. 
- - // overlap all? - if (off <= i->first && i->first + i->second.length() <= off+len) { - // erase it and move on. - off_t dead = i->first; - i++; - partial.erase(dead); - continue; - } - // overlap tail? - else if (i->first < off && off < i->first + i->second.length()) { - // shorten. - unsigned newlen = off - i->first; - bufferlist o; - o.claim( i->second ); - i->second.substr_of(o, 0, newlen); - i++; - continue; - } - // overlap head? - else if (off < i->first && off+len < i->first + i->second.length()) { - // move. - off_t oldoff = i->first; - off_t newoff = off+len; - unsigned trim = newoff - oldoff; - partial[newoff].substr_of(i->second, trim, i->second.length()-trim); - i++; // should be at newoff! - partial.erase( oldoff ); - i++; - continue; - } else - assert(0); - } - - // insert - partial[off] = p; - } - - -}; - -inline ostream& operator<<(ostream& out, BufferHead& bh) -{ - out << "bufferhead(" << bh.start() << "~" << bh.length(); - out << " v" << bh.get_version() << "/" << bh.get_last_flushed(); - if (bh.is_missing()) out << " missing"; - if (bh.is_dirty()) out << " dirty"; - if (bh.is_clean()) out << " clean"; - if (bh.is_rx()) out << " rx"; - if (bh.is_tx()) out << " tx"; - if (bh.is_partial()) out << " partial"; - //out << " " << bh.data.length(); - out << " " << &bh; - out << ")"; - return out; -} - - -class ObjectCache { - public: - object_t object_id; - Onode *on; - BufferCache *bc; - - private: - map data; - int ref; - - public: - version_t write_count; - - - public: - ObjectCache(object_t o, Onode *_on, BufferCache *b) : - object_id(o), on(_on), bc(b), ref(0), - write_count(0) { } - ~ObjectCache() { - assert(data.empty()); - assert(ref == 0); - } - - int get() { - ++ref; - //cout << "oc.get " << object_id << " " << ref << endl; - return ref; - } - int put() { - assert(ref > 0); - --ref; - //cout << "oc.put " << object_id << " " << ref << endl; - return ref; - } - - object_t get_object_id() { return object_id; } - - void add_bh(BufferHead *bh) { 
- // add to my map - assert(data.count(bh->start()) == 0); - - if (0) { // sanity check FIXME DEBUG - //cout << "add_bh " << bh->start() << "~" << bh->length() << endl; - map::iterator p = data.lower_bound(bh->start()); - if (p != data.end()) { - //cout << " after " << *p->second << endl; - //cout << " after starts at " << p->first << endl; - assert(p->first >= bh->end()); - } - if (p != data.begin()) { - p--; - //cout << " before starts at " << p->second->start() - //<< " and ends at " << p->second->end() << endl; - //cout << " before " << *p->second << endl; - assert(p->second->end() <= bh->start()); - } - } - - data[bh->start()] = bh; - } - void remove_bh(BufferHead *bh) { - assert(data.count(bh->start())); - data.erase(bh->start()); - } - bool is_empty() { return data.empty(); } - - int find_tx(block_t start, block_t len, - list& tx); - - int map_read(block_t start, block_t len, - map& hits, // hits - map& missing, // read these from disk - map& rx, // wait for these to finish reading from disk - map& partial); // (maybe) wait for these to read from disk - int try_map_read(block_t start, block_t len); // just tell us how many extents we're missing. - - - int map_write(block_t start, block_t len, - interval_set& alloc, - map& hits, - version_t super_epoch); // can write to these. 
- void touch_bottom(block_t bstart, block_t blast); - - BufferHead *split(BufferHead *bh, block_t off); - - /*int scan_versions(block_t start, block_t len, - version_t& low, version_t& high); - */ - - void rx_finish(ioh_t ioh, block_t start, block_t length, bufferlist& bl); - void tx_finish(ioh_t ioh, block_t start, block_t length, version_t v, version_t epoch); - - void truncate(block_t blocks, version_t super_epoch); - // void tear_down(); - - void clone_to(Onode *other); - - void dump() { - for (map::iterator i = data.begin(); - i != data.end(); - i++) - cout << "dump: " << i->first << ": " << *i->second << endl; - } - -}; - - - -class BufferCache { - public: - Mutex &ebofs_lock; // hack: this is a ref to global ebofs_lock - BlockDevice &dev; - - set dirty_bh; - - LRU lru_dirty, lru_rest; - - private: - Cond stat_cond; - Cond flush_cond; - int stat_waiter; - - off_t stat_clean; - off_t stat_dirty; - off_t stat_rx; - off_t stat_tx; - off_t stat_partial; - off_t stat_missing; - -#define EBOFS_BC_FLUSH_BHWRITE 0 -#define EBOFS_BC_FLUSH_PARTIAL 1 - - map epoch_unflushed[2]; - - /* partial writes - incomplete blocks that can't be written until - * their prior content is read and overlayed with the new data. - * - * we put partial block management here because objects may be deleted - * before the read completes, but the write may have been committed in a - * prior epoch. - * - * we map: src block -> dest block -> PartialWrite - * - * really, at most there will only ever be two of these, for current+previous epochs. 
- */ - class PartialWrite { - public: - map partial; // partial dirty content overlayed onto incoming data - version_t epoch; - }; - - map > partial_write; // queued writes w/ partial content - map > shadow_partials; - - public: - BufferCache(BlockDevice& d, Mutex& el) : - ebofs_lock(el), dev(d), - stat_waiter(0), - stat_clean(0), stat_dirty(0), stat_rx(0), stat_tx(0), stat_partial(0), stat_missing(0) - {} - - - off_t get_size() { - return stat_clean+stat_dirty+stat_rx+stat_tx+stat_partial; - } - off_t get_trimmable() { - return stat_clean; - } - - - // bh's in cache - void add_bh(BufferHead *bh) { - bh->get_oc()->add_bh(bh); - if (bh->is_dirty()) { - lru_dirty.lru_insert_mid(bh); - dirty_bh.insert(bh); - } else - lru_rest.lru_insert_mid(bh); - stat_add(bh); - } - void touch(BufferHead *bh) { - if (bh->is_dirty()) { - lru_dirty.lru_touch(bh); - } else - lru_rest.lru_touch(bh); - } - void touch_bottom(BufferHead *bh) { - if (bh->is_dirty()) { - bh->want_to_expire = true; - lru_dirty.lru_bottouch(bh); - } else - lru_rest.lru_bottouch(bh); - } - void remove_bh(BufferHead *bh) { - bh->get_oc()->remove_bh(bh); - stat_sub(bh); - if (bh->is_dirty()) { - lru_dirty.lru_remove(bh); - dirty_bh.erase(bh); - } else - lru_rest.lru_remove(bh); - } - - // stats - void stat_add(BufferHead *bh) { - switch (bh->get_state()) { - case BufferHead::STATE_MISSING: stat_missing += bh->length(); break; - case BufferHead::STATE_CLEAN: stat_clean += bh->length(); break; - case BufferHead::STATE_DIRTY: stat_dirty += bh->length(); break; - case BufferHead::STATE_TX: stat_tx += bh->length(); break; - case BufferHead::STATE_RX: stat_rx += bh->length(); break; - case BufferHead::STATE_PARTIAL: stat_partial += bh->length(); break; - } - if (stat_waiter) stat_cond.Signal(); - } - void stat_sub(BufferHead *bh) { - switch (bh->get_state()) { - case BufferHead::STATE_MISSING: stat_missing -= bh->length(); break; - case BufferHead::STATE_CLEAN: stat_clean -= bh->length(); break; - case 
BufferHead::STATE_DIRTY: stat_dirty -= bh->length(); break; - case BufferHead::STATE_TX: stat_tx -= bh->length(); break; - case BufferHead::STATE_RX: stat_rx -= bh->length(); break; - case BufferHead::STATE_PARTIAL: stat_partial -= bh->length(); break; - } - } - off_t get_stat_tx() { return stat_tx; } - off_t get_stat_rx() { return stat_rx; } - off_t get_stat_dirty() { return stat_dirty; } - off_t get_stat_clean() { return stat_clean; } - off_t get_stat_partial() { return stat_partial; } - - - map &get_unflushed(int what) { - return epoch_unflushed[what]; - } - - int get_unflushed(int what, version_t epoch) { - return epoch_unflushed[what][epoch]; - } - void inc_unflushed(int what, version_t epoch) { - epoch_unflushed[what][epoch]++; - //cout << "inc_unflushed " << epoch << " now " << epoch_unflushed[epoch] << endl; - } - void dec_unflushed(int what, version_t epoch) { - epoch_unflushed[what][epoch]--; - //cout << "dec_unflushed " << epoch << " now " << epoch_unflushed[epoch] << endl; - if (epoch_unflushed[what][epoch] == 0) - flush_cond.Signal(); - } - - void waitfor_stat() { - stat_waiter++; - stat_cond.Wait(ebofs_lock); - stat_waiter--; - } - void waitfor_flush() { - flush_cond.Wait(ebofs_lock); - } - - - // bh state - void set_state(BufferHead *bh, int s) { - // move between lru lists? 
- if (s == BufferHead::STATE_DIRTY && bh->get_state() != BufferHead::STATE_DIRTY) { - lru_rest.lru_remove(bh); - lru_dirty.lru_insert_top(bh); - dirty_bh.insert(bh); - } - if (s != BufferHead::STATE_DIRTY && bh->get_state() == BufferHead::STATE_DIRTY) { - lru_dirty.lru_remove(bh); - if (bh->want_to_expire) - lru_rest.lru_insert_bot(bh); - else - lru_rest.lru_insert_mid(bh); - dirty_bh.erase(bh); - } - - // set state - stat_sub(bh); - bh->set_state(s); - stat_add(bh); - } - - void copy_state(BufferHead *bh1, BufferHead *bh2) { - set_state(bh2, bh1->get_state()); - } - - void mark_missing(BufferHead *bh) { set_state(bh, BufferHead::STATE_MISSING); }; - void mark_clean(BufferHead *bh) { set_state(bh, BufferHead::STATE_CLEAN); }; - void mark_rx(BufferHead *bh) { set_state(bh, BufferHead::STATE_RX); }; - void mark_partial(BufferHead *bh) { set_state(bh, BufferHead::STATE_PARTIAL); }; - void mark_tx(BufferHead *bh) { set_state(bh, BufferHead::STATE_TX); }; - void mark_dirty(BufferHead *bh) { - set_state(bh, BufferHead::STATE_DIRTY); - bh->set_dirty_stamp(g_clock.now()); - }; - - - // io - void bh_read(Onode *on, BufferHead *bh, block_t from=0); - void bh_write(Onode *on, BufferHead *bh, block_t shouldbe=0); - - bool bh_cancel_read(BufferHead *bh); - bool bh_cancel_write(BufferHead *bh, version_t cur_epoch); - - void bh_queue_partial_write(Onode *on, BufferHead *bh); - void bh_cancel_partial_write(BufferHead *bh); - - void queue_partial(block_t from, block_t to, map& partial, version_t epoch); - void cancel_partial(block_t from, block_t to, version_t epoch); - - void add_shadow_partial(block_t from, BufferHead *bh); - void cancel_shadow_partial(block_t from, BufferHead *bh); - - void rx_finish(ObjectCache *oc, ioh_t ioh, block_t start, block_t len, block_t diskstart, bufferlist& bl); - void tx_finish(ObjectCache *oc, ioh_t ioh, block_t start, block_t len, version_t v, version_t e); - void partial_tx_finish(version_t epoch); - - friend class C_E_FlushPartial; - - // bh fun 
- BufferHead *split(BufferHead *orig, block_t after); -}; - - -class C_OC_RxFinish : public BlockDevice::callback { - Mutex &lock; - ObjectCache *oc; - block_t start, length; - block_t diskstart; -public: - bufferlist bl; - C_OC_RxFinish(Mutex &m, ObjectCache *o, block_t s, block_t l, block_t ds) : - lock(m), oc(o), start(s), length(l), diskstart(ds) {} - void finish(ioh_t ioh, int r) { - oc->bc->rx_finish(oc, ioh, start, length, diskstart, bl); - } -}; - -class C_OC_TxFinish : public BlockDevice::callback { - Mutex &lock; - ObjectCache *oc; - block_t start, length; - version_t version; - version_t epoch; - public: - C_OC_TxFinish(Mutex &m, ObjectCache *o, block_t s, block_t l, version_t v, version_t e) : - lock(m), oc(o), start(s), length(l), version(v), epoch(e) {} - void finish(ioh_t ioh, int r) { - oc->bc->tx_finish(oc, ioh, start, length, version, epoch); - } -}; - -class C_OC_PartialTxFinish : public BlockDevice::callback { - BufferCache *bc; - version_t epoch; -public: - C_OC_PartialTxFinish(BufferCache *b, version_t e) : - bc(b), epoch(e) {} - void finish(ioh_t ioh, int r) { - bc->partial_tx_finish(epoch); - } -}; - - -#endif diff --git a/branches/marnberg/quota/ebofs/Cnode.h b/branches/marnberg/quota/ebofs/Cnode.h deleted file mode 100644 index b906a6db24c57..0000000000000 --- a/branches/marnberg/quota/ebofs/Cnode.h +++ /dev/null @@ -1,100 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_CNODE_H -#define __EBOFS_CNODE_H - -#include "Onode.h" - -/* - * collection node - * - * holds attribute metadata for collections. - * colletion membership is stored in b+tree tables, independent of tte cnode. 
- */ - -class Cnode : public LRUObject -{ - private: - int ref; - bool dirty; - - public: - coll_t coll_id; - Extent cnode_loc; - - map attr; - - public: - Cnode(coll_t cid) : ref(0), dirty(false), coll_id(cid) { - cnode_loc.length = 0; - } - ~Cnode() { - } - - block_t get_cnode_id() { return cnode_loc.start; } - int get_cnode_len() { return cnode_loc.length; } - - void get() { - if (ref == 0) lru_pin(); - ref++; - } - void put() { - ref--; - if (ref == 0) lru_unpin(); - } - int get_ref_count() { return ref; } - - void mark_dirty() { - if (!dirty) { - dirty = true; - get(); - } - } - void mark_clean() { - if (dirty) { - dirty = false; - put(); - } - } - bool is_dirty() { return dirty; } - - - int get_attr_bytes() { - int s = 0; - for (map::iterator i = attr.begin(); - i != attr.end(); - i++) { - s += i->first.length() + 1; - s += i->second.length() + sizeof(int); - } - return s; - } - - // - //???void clear(); - - -}; - -inline ostream& operator<<(ostream& out, Cnode& cn) -{ - out << "cnode(" << hex << cn.coll_id << dec; - if (cn.is_dirty()) out << " dirty"; - //out << " " << &cn; - out << ")"; - return out; -} - -#endif diff --git a/branches/marnberg/quota/ebofs/Ebofs.cc b/branches/marnberg/quota/ebofs/Ebofs.cc deleted file mode 100644 index 2008d1961bfae..0000000000000 --- a/branches/marnberg/quota/ebofs/Ebofs.cc +++ /dev/null @@ -1,3270 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "Ebofs.h" - -#include - -#ifndef DARWIN -#include -#else -#include -#include -#endif // DARWIN - -// ******************* - -#undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs(" << dev.get_device_name() << ")." -#define derr(x) if (x <= g_conf.debug_ebofs) cerr << "ebofs(" << dev.get_device_name() << ")." - - -char *nice_blocks(block_t b) -{ - static char s[20]; - float sz = b*4.0; - if (sz > (10 << 20)) - sprintf(s,"%.1f GB", sz / (1024.0*1024.0)); - else if (sz > (10 << 10)) - sprintf(s,"%.1f MB", sz / (1024.0)); - else - sprintf(s,"%llu KB", b*4ULL); - return s; -} - -int Ebofs::mount() -{ - ebofs_lock.Lock(); - assert(!mounted); - - int r = dev.open(&idle_kicker); - if (r < 0) { - ebofs_lock.Unlock(); - return r; - } - - dout(2) << "mounting " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << endl; - - // read super - bufferptr bp1 = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - bufferptr bp2 = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - dev.read(0, 1, bp1); - dev.read(1, 1, bp2); - - struct ebofs_super *sb1 = (struct ebofs_super*)bp1.c_str(); - struct ebofs_super *sb2 = (struct ebofs_super*)bp2.c_str(); - dout(3) << "mount super @0 epoch " << sb1->epoch << endl; - dout(3) << "mount super @1 epoch " << sb2->epoch << endl; - - // pick newest super - struct ebofs_super *sb = 0; - if (sb1->epoch > sb2->epoch) - sb = sb1; - else - sb = sb2; - super_epoch = sb->epoch; - dout(3) << "mount epoch " << super_epoch << endl; - assert(super_epoch == sb->epoch); - - free_blocks = sb->free_blocks; - limbo_blocks = sb->limbo_blocks; - - // init node pools - dout(3) << "mount nodepool" << endl; - nodepool.init( &sb->nodepool ); - nodepool.read_usemap( dev, super_epoch ); - nodepool.read_clean_nodes( dev ); - - // open tables - dout(3) << "mount opening tables" << endl; - object_tab = new Table( nodepool, sb->object_tab ); - for (int i=0; i( nodepool, 
sb->free_tab[i] ); - limbo_tab = new Table( nodepool, sb->limbo_tab ); - alloc_tab = new Table >( nodepool, sb->alloc_tab ); - - collection_tab = new Table( nodepool, sb->collection_tab ); - co_tab = new Table( nodepool, sb->co_tab ); - - allocator.release_limbo(); - - dout(3) << "mount starting commit+finisher threads" << endl; - commit_thread.create(); - finisher_thread.create(); - - dout(1) << "mounted " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << endl; - mounted = true; - - ebofs_lock.Unlock(); - return 0; -} - - -int Ebofs::mkfs() -{ - ebofs_lock.Lock(); - assert(!mounted); - - int r = dev.open(); - if (r < 0) { - ebofs_lock.Unlock(); - return r; - } - - block_t num_blocks = dev.get_num_blocks(); - - free_blocks = 0; - limbo_blocks = 0; - - // create first noderegion - Extent nr; - nr.start = 2; - nr.length = 20+ (num_blocks / 1000); - if (nr.length < 10) nr.length = 10; - nodepool.add_region(nr); - dout(10) << "mkfs: first node region at " << nr << endl; - - // allocate two usemaps - block_t usemap_len = nodepool.get_usemap_len(); - nodepool.usemap_even.start = nr.end(); - nodepool.usemap_even.length = usemap_len; - nodepool.usemap_odd.start = nodepool.usemap_even.end(); - nodepool.usemap_odd.length = usemap_len; - dout(10) << "mkfs: even usemap at " << nodepool.usemap_even << endl; - dout(10) << "mkfs: odd usemap at " << nodepool.usemap_odd << endl; - - // init tables - struct ebofs_table empty; - empty.num_keys = 0; - empty.root = -1; - empty.depth = 0; - - object_tab = new Table( nodepool, empty ); - collection_tab = new Table( nodepool, empty ); - - for (int i=0; i( nodepool, empty ); - limbo_tab = new Table( nodepool, empty ); - alloc_tab = new Table >( nodepool, empty ); - - co_tab = new Table( nodepool, empty ); - - // add free space - Extent left; - left.start = nodepool.usemap_odd.end(); - left.length = num_blocks - left.start; - dout(10) << "mkfs: free data blocks at " << left << 
endl; - allocator._release_into_limbo( left ); - if (g_conf.ebofs_cloneable) { - allocator.alloc_inc(nr); - allocator.alloc_inc(nodepool.usemap_even); - allocator.alloc_inc(nodepool.usemap_odd); - } - allocator.commit_limbo(); // -> limbo_tab - allocator.release_limbo(); // -> free_tab - - // write nodes, super, 2x - dout(10) << "mkfs: flushing nodepool and superblocks (2x)" << endl; - - nodepool.commit_start( dev, 0 ); - nodepool.commit_wait(); - bufferptr superbp0; - prepare_super(0, superbp0); - write_super(0, superbp0); - - nodepool.commit_start( dev, 1 ); - nodepool.commit_wait(); - bufferptr superbp1; - prepare_super(1, superbp1); - write_super(1, superbp1); - - // free memory - dout(10) << "mkfs: cleaning up" << endl; - close_tables(); - - dev.close(); - - dout(2) << "mkfs: " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << endl; - ebofs_lock.Unlock(); - return 0; -} - -void Ebofs::close_tables() -{ - // close tables - delete object_tab; - for (int i=0; i::iterator i = onode_map.begin(); - i != onode_map.end(); - i++) { - dout(0) << "umount *** leftover: " << i->first << " " << *(i->second) << endl; - } - - // free memory - dout(5) << "umount cleaning up" << endl; - close_tables(); - dev.close(); - readonly = unmounting = mounted = false; - - dout(1) << "umount done on " << dev.get_device_name() << endl; - ebofs_lock.Unlock(); - return 0; -} - - - -void Ebofs::prepare_super(version_t epoch, bufferptr& bp) -{ - struct ebofs_super sb; - - dout(10) << "prepare_super v" << epoch << endl; - - // fill in super - memset(&sb, 0, sizeof(sb)); - sb.s_magic = EBOFS_MAGIC; - sb.epoch = epoch; - sb.num_blocks = dev.get_num_blocks(); - - sb.free_blocks = free_blocks; - sb.limbo_blocks = limbo_blocks; - - - // tables - sb.object_tab.num_keys = object_tab->get_num_keys(); - sb.object_tab.root = object_tab->get_root(); - sb.object_tab.depth = object_tab->get_depth(); - - for (int i=0; iget_num_keys(); - 
sb.free_tab[i].root = free_tab[i]->get_root(); - sb.free_tab[i].depth = free_tab[i]->get_depth(); - } - sb.limbo_tab.num_keys = limbo_tab->get_num_keys(); - sb.limbo_tab.root = limbo_tab->get_root(); - sb.limbo_tab.depth = limbo_tab->get_depth(); - - sb.alloc_tab.num_keys = alloc_tab->get_num_keys(); - sb.alloc_tab.root = alloc_tab->get_root(); - sb.alloc_tab.depth = alloc_tab->get_depth(); - - sb.collection_tab.num_keys = collection_tab->get_num_keys(); - sb.collection_tab.root = collection_tab->get_root(); - sb.collection_tab.depth = collection_tab->get_depth(); - - sb.co_tab.num_keys = co_tab->get_num_keys(); - sb.co_tab.root = co_tab->get_root(); - sb.co_tab.depth = co_tab->get_depth(); - - // pools - sb.nodepool.num_regions = nodepool.region_loc.size(); - for (unsigned i=0; i 0) { - // periodically check for idle block device - dout(20) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms, " - << g_conf.ebofs_idle_commit_ms << " ms if idle" << endl; - long left = g_conf.ebofs_commit_ms; - while (left > 0) { - long next = MIN(left, g_conf.ebofs_idle_commit_ms); - if (commit_cond.WaitInterval(ebofs_lock, utime_t(0, next*1000)) != ETIMEDOUT) - break; // we got kicked - if (dev.is_idle()) { - dout(20) << "commit_thread bdev is idle, early commit" << endl; - break; // dev is idle - } - left -= next; - dout(20) << "commit_thread " << left << " ms left" << endl; - - // hack hack - //if (!left) g_conf.debug_ebofs = 10; - // /hack hack - } - } else { - // normal wait+timeout - dout(20) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms" << endl; - commit_cond.WaitInterval(ebofs_lock, utime_t(0, g_conf.ebofs_commit_ms*1000)); - } - - } else { - // DEBUG.. 
wait until kicked - dout(10) << "commit_thread no commit_ms, waiting until kicked" << endl; - commit_cond.Wait(ebofs_lock); - } - - if (unmounting) { - dout(10) << "commit_thread unmounting: final commit pass" << endl; - assert(readonly); - unmounting = false; - mounted = false; - dirty = true; - } - - if (!dirty && !limbo_blocks) { - dout(10) << "commit_thread not dirty" << endl; - } - else { - super_epoch++; - dirty = false; - - dout(10) << "commit_thread commit start, new epoch " << super_epoch << endl; - dout(2) << "commit_thread data: " - << 100*(dev.get_num_blocks()-get_free_blocks())/dev.get_num_blocks() << "% used, " - << get_free_blocks() << " (" << 100*get_free_blocks()/dev.get_num_blocks() - << "%) free in " << get_free_extents() - << ", " << get_limbo_blocks() << " (" << 100*get_limbo_blocks()/dev.get_num_blocks() - << "%) limbo in " << get_limbo_extents() - << endl; - dout(2) << "commit_thread nodes: " - << 100*nodepool.num_used()/nodepool.num_total() << "% used, " - << nodepool.num_free() << " (" << 100*nodepool.num_free()/nodepool.num_total() << "%) free, " - << nodepool.num_limbo() << " (" << 100*nodepool.num_limbo()/nodepool.num_total() << "%) limbo, " - << nodepool.num_total() << " total." << endl; - dout(2) << "commit_thread bc: " - << "size " << bc.get_size() - << ", trimmable " << bc.get_trimmable() - << ", max " << g_conf.ebofs_bc_size - << "; dirty " << bc.get_stat_dirty() - << ", tx " << bc.get_stat_tx() - << ", max dirty " << g_conf.ebofs_bc_max_dirty - << endl; - - - // (async) write onodes+condes (do this first; it currently involves inode reallocation) - commit_inodes_start(); - - allocator.commit_limbo(); // limbo -> limbo_tab - - // (async) write btree nodes - nodepool.commit_start( dev, super_epoch ); - - // blockdev barrier (prioritize our writes!) - dout(30) << "commit_thread barrier. flushing inodes " << inodes_flushing << endl; - dev.barrier(); - - // prepare super (before any changes get made!) 
- bufferptr superbp; - prepare_super(super_epoch, superbp); - - // wait for it all to flush (drops global lock) - commit_bc_wait(super_epoch-1); - dout(30) << "commit_thread bc flushed" << endl; - commit_inodes_wait(); - dout(30) << "commit_thread inodes flushed" << endl; - nodepool.commit_wait(); - dout(30) << "commit_thread btree nodes flushed" << endl; - - // ok, now (synchronously) write the prior super! - dout(10) << "commit_thread commit flushed, writing super for prior epoch" << endl; - ebofs_lock.Unlock(); - write_super(super_epoch, superbp); - ebofs_lock.Lock(); - - dout(10) << "commit_thread wrote super" << endl; - - // free limbo space now - // (since we're done allocating things, - // AND we've flushed all previous epoch data) - allocator.release_limbo(); // limbo_tab -> free_tabs - - // do we need more node space? - if (nodepool.num_free() < nodepool.num_total() / 3) { - dout(2) << "commit_thread running low on node space, allocating more." << endl; - alloc_more_node_space(); - } - - // kick waiters - dout(10) << "commit_thread queueing commit + kicking sync waiters" << endl; - - finisher_lock.Lock(); - finisher_queue.splice(finisher_queue.end(), commit_waiters[super_epoch-1]); - commit_waiters.erase(super_epoch-1); - finisher_cond.Signal(); - finisher_lock.Unlock(); - - sync_cond.Signal(); - - dout(10) << "commit_thread commit finish" << endl; - } - - // trim bc? 
- trim_bc(); - trim_inodes(); - - } - - dout(10) << "commit_thread finish" << endl; - commit_thread_started = false; - ebofs_lock.Unlock(); - return 0; -} - - -void Ebofs::alloc_more_node_space() -{ - dout(1) << "alloc_more_node_space free " << nodepool.num_free() << "/" << nodepool.num_total() << endl; - - if (nodepool.num_regions() < EBOFS_MAX_NODE_REGIONS) { - int want = nodepool.num_total(); - - Extent ex; - allocator.allocate(ex, want, 2); - dout(1) << "alloc_more_node_space wants " << want << " more, got " << ex << endl; - - Extent even, odd; - unsigned ulen = nodepool.get_usemap_len(nodepool.num_total() + ex.length); - allocator.allocate(even, ulen, 2); - allocator.allocate(odd, ulen, 2); - dout(1) << "alloc_more_node_space maps need " << ulen << " x2, got " << even << " " << odd << endl; - - if (even.length == ulen && odd.length == ulen) { - dout(1) << "alloc_more_node_space got " << ex << ", new usemaps at even " << even << " odd " << odd << endl; - allocator.release(nodepool.usemap_even); - allocator.release(nodepool.usemap_odd); - nodepool.add_region(ex); - nodepool.usemap_even = even; - nodepool.usemap_odd = odd; - } else { - dout (1) << "alloc_more_node_space failed to get space for new usemaps" << endl; - allocator.release(ex); - allocator.release(even); - allocator.release(odd); - //assert(0); - } - } else { - dout(1) << "alloc_more_node_space already have max node regions!" << endl; - assert(0); - } -} - - -void *Ebofs::finisher_thread_entry() -{ - finisher_lock.Lock(); - dout(10) << "finisher_thread start" << endl; - - while (!finisher_stop) { - while (!finisher_queue.empty()) { - list ls; - ls.swap(finisher_queue); - - finisher_lock.Unlock(); - - //ebofs_lock.Lock(); // um.. why lock this? 
-sage - finish_contexts(ls, 0); - //ebofs_lock.Unlock(); - - finisher_lock.Lock(); - } - if (finisher_stop) break; - - dout(30) << "finisher_thread sleeping" << endl; - finisher_cond.Wait(finisher_lock); - } - - dout(10) << "finisher_thread start" << endl; - finisher_lock.Unlock(); - return 0; -} - - -// *** onodes *** - -Onode* Ebofs::new_onode(object_t oid) -{ - Onode* on = new Onode(oid); - - assert(onode_map.count(oid) == 0); - onode_map[oid] = on; - onode_lru.lru_insert_top(on); - - assert(object_tab->lookup(oid) < 0); - object_tab->insert( oid, on->onode_loc ); // even tho i'm not placed yet - - on->get(); - on->onode_loc.start = 0; - on->onode_loc.length = 0; - - dirty_onode(on); - - dout(7) << "new_onode " << *on << endl; - return on; -} - - -Onode* Ebofs::get_onode(object_t oid) -{ - while (1) { - // in cache? - if (have_onode(oid)) { - // yay - Onode *on = onode_map[oid]; - on->get(); - //cout << "get_onode " << *on << endl; - return on; - } - - // on disk? - Extent onode_loc; - if (object_tab->lookup(oid, onode_loc) < 0) { - dout(10) << "onode lookup failed on " << oid << endl; - // object dne. - return 0; - } - - // already loading? - if (waitfor_onode.count(oid)) { - // yep, just wait. - Cond c; - waitfor_onode[oid].push_back(&c); - dout(10) << "get_onode " << oid << " already loading, waiting" << endl; - c.Wait(ebofs_lock); - continue; - } - - dout(10) << "get_onode reading " << oid << " from " << onode_loc << endl; - - assert(waitfor_onode.count(oid) == 0); - waitfor_onode[oid].clear(); // this should be empty initially. - - // read it! 
- bufferlist bl; - bl.push_back( buffer::create_page_aligned( EBOFS_BLOCK_SIZE*onode_loc.length ) ); - - ebofs_lock.Unlock(); - dev.read( onode_loc.start, onode_loc.length, bl ); - ebofs_lock.Lock(); - - // add onode - Onode *on = new Onode(oid); - onode_map[oid] = on; - onode_lru.lru_insert_top(on); - - // parse data block - struct ebofs_onode *eo = (struct ebofs_onode*)bl.c_str(); - if (eo->object_id != oid) { - cerr << " wrong oid in onode block: " << eo->object_id << " != " << oid << endl; - cerr << " onode_loc is " << eo->onode_loc << endl; - cerr << " object_size " << eo->object_size << endl; - cerr << " object_blocks " << eo->object_blocks << endl; - cerr << " " << eo->num_collections << " coll + " - << eo->num_attr << " attr + " - << eo->num_extents << " extents" << endl; - assert(eo->object_id == oid); - } - on->readonly = eo->readonly; - on->onode_loc = eo->onode_loc; - on->object_size = eo->object_size; - on->object_blocks = eo->object_blocks; - - // parse - char *p = bl.c_str() + sizeof(*eo); - - // parse collection list - for (int i=0; inum_collections; i++) { - coll_t c = *((coll_t*)p); - p += sizeof(c); - on->collections.insert(c); - } - - // parse attributes - for (int i=0; inum_attr; i++) { - string key = p; - p += key.length() + 1; - int len = *(int*)(p); - p += sizeof(len); - on->attr[key] = buffer::copy(p, len); - p += len; - dout(15) << "get_onode " << *on << " attr " << key << " len " << len << endl; - } - - // parse extents - on->extent_map.clear(); - block_t n = 0; - for (int i=0; inum_extents; i++) { - Extent ex = *((Extent*)p); - on->extent_map[n] = ex; - dout(15) << "get_onode " << *on << " ex " << i << ": " << ex << endl; - n += ex.length; - p += sizeof(Extent); - } - assert(n == on->object_blocks); - - // wake up other waiters - for (list::iterator i = waitfor_onode[oid].begin(); - i != waitfor_onode[oid].end(); - i++) - (*i)->Signal(); - waitfor_onode.erase(oid); // remove Cond list - - on->get(); - //cout << "get_onode " << *on << " 
(loaded)" << endl; - return on; - } -} - - -class C_E_InodeFlush : public BlockDevice::callback { - Ebofs *ebofs; -public: - C_E_InodeFlush(Ebofs *e) : ebofs(e) {} - void finish(ioh_t ioh, int r) { - ebofs->flush_inode_finish(); - } -}; - - -void Ebofs::encode_onode(Onode *on, bufferlist& bl, unsigned& off) -{ - // onode - struct ebofs_onode eo; - eo.readonly = on->readonly; - eo.onode_loc = on->onode_loc; - eo.object_id = on->object_id; - eo.object_size = on->object_size; - eo.object_blocks = on->object_blocks; - eo.num_collections = on->collections.size(); - eo.num_attr = on->attr.size(); - eo.num_extents = on->extent_map.size(); - bl.copy_in(off, sizeof(eo), (char*)&eo); - off += sizeof(eo); - - // collections - for (set::iterator i = on->collections.begin(); - i != on->collections.end(); - i++) { - bl.copy_in(off, sizeof(*i), (char*)&(*i)); - off += sizeof(*i); - } - - // attr - for (map::iterator i = on->attr.begin(); - i != on->attr.end(); - i++) { - bl.copy_in(off, i->first.length()+1, i->first.c_str()); - off += i->first.length()+1; - int l = i->second.length(); - bl.copy_in(off, sizeof(int), (char*)&l); - off += sizeof(int); - bl.copy_in(off, l, i->second.c_str()); - off += l; - dout(15) << "write_onode " << *on << " attr " << i->first << " len " << l << endl; - } - - // extents - for (map::iterator i = on->extent_map.begin(); - i != on->extent_map.end(); - i++) { - bl.copy_in(off, sizeof(Extent), (char*)&(i->second)); - off += sizeof(Extent); - dout(15) << "write_onode " << *on << " ex " << i->first << ": " << i->second << endl; - } -} - -void Ebofs::write_onode(Onode *on) -{ - // buffer - unsigned bytes = sizeof(ebofs_onode) + on->get_collection_bytes() + on->get_attr_bytes() + on->get_extent_bytes(); - unsigned blocks = (bytes-1)/EBOFS_BLOCK_SIZE + 1; - - bufferlist bl; - bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*blocks) ); - - // (always) relocate onode - if (1) { - if (on->onode_loc.length) - allocator.release(on->onode_loc); - - 
block_t first = 0; - if (on->extent_map.size()) - first = on->extent_map.begin()->second.start; - - allocator.allocate(on->onode_loc, blocks, first); - object_tab->remove( on->object_id ); - object_tab->insert( on->object_id, on->onode_loc ); - //object_tab->verify(); - } - - dout(10) << "write_onode " << *on << " to " << on->onode_loc << endl; - - unsigned off = 0; - encode_onode(on, bl, off); - assert(off == bytes); - - // write - dev.write( on->onode_loc.start, on->onode_loc.length, bl, - new C_E_InodeFlush(this), "write_onode" ); -} - -void Ebofs::remove_onode(Onode *on) -{ - dout(8) << "remove_onode " << *on << endl; - - assert(on->get_ref_count() >= 1); // caller - - // tear down buffer cache - if (on->oc) { - on->oc->truncate(0, super_epoch); // this will kick readers along the way. - on->close_oc(); - } - - // remove from onode map, mark dangling/deleted - onode_map.erase(on->object_id); - onode_lru.lru_remove(on); - on->deleted = true; - on->dangling = true; - - // remove from object table - //dout(0) << "remove_onode on " << *on << endl; - object_tab->remove(on->object_id); - - // free onode space - if (on->onode_loc.length) - allocator.release(on->onode_loc); - - // free data space - for (map::iterator i = on->extent_map.begin(); - i != on->extent_map.end(); - i++) - allocator.release(i->second); - on->extent_map.clear(); - - // remove from collections - for (set::iterator i = on->collections.begin(); - i != on->collections.end(); - i++) { - co_tab->remove(coll_object_t(*i,on->object_id)); - } - on->collections.clear(); - - // dirty -> clean? 
- if (on->is_dirty()) { - on->mark_clean(); // this unpins *on - dirty_onodes.erase(on); - } - - if (on->get_ref_count() > 1) cout << "remove_onode **** will survive " << *on << endl; - put_onode(on); - - dirty = true; -} - -void Ebofs::put_onode(Onode *on) -{ - on->put(); - //cout << "put_onode " << *on << endl; - - if (on->get_ref_count() == 0 && on->dangling) { - //cout << " *** hosing on " << *on << endl; - delete on; - } -} - -void Ebofs::dirty_onode(Onode *on) -{ - if (!on->is_dirty()) { - on->mark_dirty(); - dirty_onodes.insert(on); - } - dirty = true; -} - -void Ebofs::trim_inodes(int max) -{ - unsigned omax = onode_lru.lru_get_max(); - unsigned cmax = cnode_lru.lru_get_max(); - if (max >= 0) omax = cmax = max; - dout(10) << "trim_inodes start " << onode_lru.lru_get_size() << " / " << omax << " onodes, " - << cnode_lru.lru_get_size() << " / " << cmax << " cnodes" << endl; - - // onodes - while (onode_lru.lru_get_size() > omax) { - // expire an item - Onode *on = (Onode*)onode_lru.lru_expire(); - if (on == 0) break; // nothing to expire - - // expire - dout(20) << "trim_inodes removing onode " << *on << endl; - onode_map.erase(on->object_id); - on->dangling = true; - - if (on->get_ref_count() == 0) { - assert(on->oc == 0); // an open oc pins the onode! - delete on; - } else { - dout(-20) << "trim_inodes still active: " << *on << endl; - assert(0); // huh? 
- } - } - - - // cnodes - while (cnode_lru.lru_get_size() > cmax) { - // expire an item - Cnode *cn = (Cnode*)cnode_lru.lru_expire(); - if (cn == 0) break; // nothing to expire - - // expire - dout(20) << "trim_inodes removing cnode " << *cn << endl; - cnode_map.erase(cn->coll_id); - - delete cn; - } - - dout(10) << "trim_inodes finish " - << onode_lru.lru_get_size() << " / " << omax << " onodes, " - << cnode_lru.lru_get_size() << " / " << cmax << " cnodes" << endl; -} - - - -// *** cnodes **** - -Cnode* Ebofs::new_cnode(coll_t cid) -{ - Cnode* cn = new Cnode(cid); - - assert(cnode_map.count(cid) == 0); - cnode_map[cid] = cn; - cnode_lru.lru_insert_top(cn); - - assert(collection_tab->lookup(cid) < 0); - collection_tab->insert( cid, cn->cnode_loc ); // even tho i'm not placed yet - - cn->get(); - cn->cnode_loc.start = 0; - cn->cnode_loc.length = 0; - - dirty_cnode(cn); - - return cn; -} - -Cnode* Ebofs::get_cnode(coll_t cid) -{ - while (1) { - // in cache? - if (cnode_map.count(cid)) { - // yay - Cnode *cn = cnode_map[cid]; - cn->get(); - return cn; - } - - // on disk? - Extent cnode_loc; - if (collection_tab->lookup(cid, cnode_loc) < 0) { - // object dne. - return 0; - } - - // already loading? - if (waitfor_cnode.count(cid)) { - // yep, just wait. - Cond c; - waitfor_cnode[cid].push_back(&c); - dout(10) << "get_cnode " << cid << " already loading, waiting" << endl; - c.Wait(ebofs_lock); - continue; - } - - dout(10) << "get_cnode reading " << cid << " from " << cnode_loc << endl; - - assert(waitfor_cnode.count(cid) == 0); - waitfor_cnode[cid].clear(); // this should be empty initially. - - // read it! 
- bufferlist bl; - //bufferpool.alloc( EBOFS_BLOCK_SIZE*cnode_loc.length, bl ); - bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*cnode_loc.length) ); - - ebofs_lock.Unlock(); - dev.read( cnode_loc.start, cnode_loc.length, bl ); - ebofs_lock.Lock(); - - // parse data block - Cnode *cn = new Cnode(cid); - - cnode_map[cid] = cn; - cnode_lru.lru_insert_top(cn); - - struct ebofs_cnode *ec = (struct ebofs_cnode*)bl.c_str(); - cn->cnode_loc = ec->cnode_loc; - - // parse attributes - char *p = bl.c_str() + sizeof(*ec); - for (int i=0; inum_attr; i++) { - string key = p; - p += key.length() + 1; - int len = *(int*)(p); - p += sizeof(len); - cn->attr[key] = buffer::copy(p, len); - p += len; - dout(15) << "get_cnode " << *cn << " attr " << key << " len " << len << endl; - } - - // wake up other waiters - for (list::iterator i = waitfor_cnode[cid].begin(); - i != waitfor_cnode[cid].end(); - i++) - (*i)->Signal(); - waitfor_cnode.erase(cid); // remove Cond list - - cn->get(); - return cn; - } -} - -void Ebofs::encode_cnode(Cnode *cn, bufferlist& bl, unsigned& off) -{ - // cnode - struct ebofs_cnode ec; - ec.cnode_loc = cn->cnode_loc; - ec.coll_id = cn->coll_id; - ec.num_attr = cn->attr.size(); - bl.copy_in(off, sizeof(ec), (char*)&ec); - off += sizeof(ec); - - // attr - for (map::iterator i = cn->attr.begin(); - i != cn->attr.end(); - i++) { - bl.copy_in(off, i->first.length()+1, i->first.c_str()); - off += i->first.length()+1; - int len = i->second.length(); - bl.copy_in(off, sizeof(int), (char*)&len); - off += sizeof(int); - bl.copy_in(off, len, i->second.c_str()); - off += len; - - dout(15) << "write_cnode " << *cn << " attr " << i->first << " len " << len << endl; - } -} - -void Ebofs::write_cnode(Cnode *cn) -{ - // allocate buffer - unsigned bytes = sizeof(ebofs_cnode) + cn->get_attr_bytes(); - unsigned blocks = (bytes-1)/EBOFS_BLOCK_SIZE + 1; - - bufferlist bl; - //bufferpool.alloc( EBOFS_BLOCK_SIZE*blocks, bl ); - bl.push_back( 
buffer::create_page_aligned(EBOFS_BLOCK_SIZE*blocks) ); - - // (always) relocate cnode! - if (1) { - if (cn->cnode_loc.length) - allocator.release(cn->cnode_loc); - - allocator.allocate(cn->cnode_loc, blocks, Allocator::NEAR_LAST_FWD); - collection_tab->remove( cn->coll_id ); - collection_tab->insert( cn->coll_id, cn->cnode_loc ); - } - - dout(10) << "write_cnode " << *cn << " to " << cn->cnode_loc << endl; - - unsigned off = 0; - encode_cnode(cn, bl, off); - assert(off == bytes); - - // write - dev.write( cn->cnode_loc.start, cn->cnode_loc.length, bl, - new C_E_InodeFlush(this), "write_cnode" ); -} - -void Ebofs::remove_cnode(Cnode *cn) -{ - dout(10) << "remove_cnode " << *cn << endl; - - // remove from table - collection_tab->remove(cn->coll_id); - - // free cnode space - if (cn->cnode_loc.length) - allocator.release(cn->cnode_loc); - - // remove from dirty list? - if (cn->is_dirty()) - dirty_cnodes.erase(cn); - - // remove from map and lru - cnode_map.erase(cn->coll_id); - cnode_lru.lru_remove(cn); - - // count down refs - cn->mark_clean(); - cn->put(); - assert(cn->get_ref_count() == 0); - - // hose. 
- delete cn; - - dirty = true; -} - -void Ebofs::put_cnode(Cnode *cn) -{ - cn->put(); -} - -void Ebofs::dirty_cnode(Cnode *cn) -{ - if (!cn->is_dirty()) { - cn->mark_dirty(); - dirty_cnodes.insert(cn); - } - dirty = true; -} - - - - - -void Ebofs::flush_inode_finish() -{ - ebofs_lock.Lock(); - { - inodes_flushing--; - if (inodes_flushing < 1000) - dout(20) << "flush_inode_finish, " << inodes_flushing << " left" << endl; - if (inodes_flushing == 0) - inode_commit_cond.Signal(); - } - ebofs_lock.Unlock(); -} - -void Ebofs::commit_inodes_start() -{ - dout(10) << "commit_inodes_start" << endl; - - assert(inodes_flushing == 0); - - // onodes - for (set::iterator i = dirty_onodes.begin(); - i != dirty_onodes.end(); - i++) { - Onode *on = *i; - inodes_flushing++; - write_onode(on); - on->mark_clean(); - on->uncommitted.clear(); // commit allocated blocks - on->commit_waiters.clear(); // these guys are gonna get taken care of, bc we committed. - } - dirty_onodes.clear(); - - // cnodes - for (set::iterator i = dirty_cnodes.begin(); - i != dirty_cnodes.end(); - i++) { - Cnode *cn = *i; - inodes_flushing++; - write_cnode(cn); - cn->mark_clean(); - } - dirty_cnodes.clear(); - - dout(10) << "commit_inodes_start writing " << inodes_flushing << " onodes+cnodes" << endl; -} - -void Ebofs::commit_inodes_wait() -{ - // caller must hold ebofs_lock - while (inodes_flushing > 0) { - dout(10) << "commit_inodes_wait waiting for " << inodes_flushing << " onodes+cnodes to flush" << endl; - inode_commit_cond.Wait(ebofs_lock); - } - dout(10) << "commit_inodes_wait all flushed" << endl; -} - - - - - - - -// *** buffer cache *** - -void Ebofs::trim_buffer_cache() -{ - ebofs_lock.Lock(); - trim_bc(0); - ebofs_lock.Unlock(); -} - -void Ebofs::trim_bc(off_t max) -{ - if (max < 0) - max = g_conf.ebofs_bc_size; - dout(10) << "trim_bc start: size " << bc.get_size() << ", trimmable " << bc.get_trimmable() << ", max " << max << endl; - - while (bc.get_size() > max && - bc.get_trimmable()) { - 
BufferHead *bh = (BufferHead*) bc.lru_rest.lru_expire(); - if (!bh) break; - - dout(25) << "trim_bc trimming " << *bh << endl; - assert(bh->is_clean()); - - ObjectCache *oc = bh->oc; - bc.remove_bh(bh); - delete bh; - - if (oc->is_empty()) { - Onode *on = oc->on; - dout(10) << "trim_bc closing oc on " << *on << endl; - on->close_oc(); - } - } - - dout(10) << "trim_bc finish: size " << bc.get_size() << ", trimmable " << bc.get_trimmable() << ", max " << max << endl; -} - - -void Ebofs::kick_idle() -{ - dout(10) << "kick_idle" << endl; - commit_cond.Signal(); - - /* - ebofs_lock.Lock(); - if (mounted && !unmounting && dirty) { - dout(0) << "kick_idle dirty, doing commit" << endl; - commit_cond.Signal(); - } else { - dout(0) << "kick_idle !dirty or !mounted or unmounting, doing nothing" << endl; - } - ebofs_lock.Unlock(); - */ -} - -void Ebofs::sync(Context *onsafe) -{ - ebofs_lock.Lock(); - if (onsafe) { - dirty = true; - commit_waiters[super_epoch].push_back(onsafe); - } - ebofs_lock.Unlock(); -} - -void Ebofs::sync() -{ - ebofs_lock.Lock(); - if (!dirty) { - dout(7) << "sync in " << super_epoch << ", not dirty" << endl; - } else { - epoch_t start = super_epoch; - dout(7) << "sync start in " << start << endl; - while (super_epoch == start) { - dout(7) << "sync kicking commit in " << super_epoch << endl; - dirty = true; - commit_cond.Signal(); - sync_cond.Wait(ebofs_lock); - } - dout(10) << "sync finish in " << super_epoch << endl; - } - ebofs_lock.Unlock(); -} - - - -void Ebofs::commit_bc_wait(version_t epoch) -{ - dout(10) << "commit_bc_wait on epoch " << epoch << endl; - - while (bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE,epoch) > 0 || - bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL,epoch) > 0) { - //dout(10) << "commit_bc_wait " << bc.get_unflushed(epoch) << " unflushed in epoch " << epoch << endl; - dout(10) << "commit_bc_wait epoch " << epoch - << ", unflushed bhwrite " << bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE) - << ", unflushed partial " << 
bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL) - << endl; - bc.waitfor_flush(); - } - - bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE).erase(epoch); - bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL).erase(epoch); - - dout(10) << "commit_bc_wait all flushed for epoch " << epoch - << "; " << bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE) - << " " << bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL) - << endl; -} - - - -int Ebofs::statfs(struct statfs *buf) -{ - dout(7) << "statfs" << endl; - - buf->f_type = EBOFS_MAGIC; /* type of filesystem */ - buf->f_bsize = 4096; /* optimal transfer block size */ - buf->f_blocks = dev.get_num_blocks(); /* total data blocks in file system */ - buf->f_bfree = get_free_blocks() - + get_limbo_blocks(); /* free blocks in fs */ - buf->f_bavail = get_free_blocks(); /* free blocks avail to non-superuser -- actually, for writing. */ - buf->f_files = nodepool.num_total(); /* total file nodes in file system */ - buf->f_ffree = nodepool.num_free(); /* free file nodes in fs */ - //buf->f_fsid = 0; /* file system id */ -#ifndef DARWIN - buf->f_namelen = 8; /* maximum length of filenames */ -#endif // DARWIN - - return 0; -} - - - - -/* - * allocate a write to blocks on disk. - * - take care to not overwrite any "safe" data blocks. - * - allocate/map new extents on disk as necessary - */ -void Ebofs::alloc_write(Onode *on, - block_t start, block_t len, - interval_set& alloc, - block_t& old_bfirst, block_t& old_blast) -{ - // first decide what pages to (re)allocate - alloc.insert(start, len); // start with whole range - - // figure out what bits are already uncommitted - interval_set already_uncom; - already_uncom.intersection_of(alloc, on->uncommitted); - - // subtract those off, so we're left with the committed bits (that must be reallocated). 
- alloc.subtract(already_uncom); - - dout(10) << "alloc_write must (re)alloc " << alloc << " on " << *on << endl; - - // release it (into limbo) - for (map::iterator i = alloc.m.begin(); - i != alloc.m.end(); - i++) { - // get old region - vector old; - on->map_extents(i->first, i->second, old); - for (unsigned o=0; ofirst == start) { - old_bfirst = old[0].start; - dout(20) << "alloc_write old_bfirst " << old_bfirst << " of " << old[0] << endl; - } - if (i->first+i->second == start+len) { - old_blast = old[old.size()-1].last(); - dout(20) << "alloc_write old_blast " << old_blast << " of " << old[old.size()-1] << endl; - } - } - } - - // reallocate uncommitted too? - // ( --> yes. we can always make better allocation decisions later, with more information. ) - if (g_conf.ebofs_realloc) { - list tx; - - ObjectCache *oc = on->get_oc(&bc); - oc->find_tx(start, len, tx); - - for (list::reverse_iterator p = tx.rbegin(); - p != tx.rend(); - p++) { - BufferHead *bh = *p; - - // cancelable/moveable? - if (alloc.contains(bh->start(), bh->length())) { - dout(10) << "alloc_write " << *bh << " already in " << alloc << endl; - continue; - } - - vector old; - on->map_extents(bh->start(), bh->length(), old); - assert(old.size() == 1); - - if (bh->start() >= start && bh->end() <= start+len) { - assert(bh->epoch_modified == super_epoch); - if (bc.bh_cancel_write(bh, super_epoch)) { - if (bh->length() == 1) - dout(10) << "alloc_write unallocated tx " << old[0] << ", canceled " << *bh << endl; - // no, this isn't compatible with clone() and extent reference counting. 
- //allocator.unallocate(old[0]); // release (into free) - allocator.release(old[0]); - alloc.insert(bh->start(), bh->length()); - } else { - if (bh->length() == 1) - dout(10) << "alloc_write released tx " << old[0] << ", couldn't cancel " << *bh << endl; - allocator.release(old[0]); // release (into limbo) - alloc.insert(bh->start(), bh->length()); - } - } else { - if (bh->length() == 1) - dout(10) << "alloc_write skipped tx " << old[0] << ", not entirely within " - << start << "~" << len - << " bh " << *bh << endl; - } - } - - dout(10) << "alloc_write will (re)alloc " << alloc << " on " << *on << endl; - } - - if (alloc.empty()) return; // no need to dirty the onode below! - - - // merge alloc into onode uncommitted map - //dout(10) << " union of " << on->uncommitted << " and " << alloc << endl; - interval_set old = on->uncommitted; - on->uncommitted.union_of(alloc); - - dout(10) << "alloc_write onode.uncommitted is now " << on->uncommitted << endl; - - if (0) { - // verify - interval_set ta; - ta.intersection_of(on->uncommitted, alloc); - cout << " ta " << ta << endl; - assert(alloc == ta); - - interval_set tb; - tb.intersection_of(on->uncommitted, old); - cout << " tb " << tb << endl; - assert(old == tb); - } - - dirty_onode(on); - - // allocate the space - for (map::iterator i = alloc.m.begin(); - i != alloc.m.end(); - i++) { - dout(15) << "alloc_write alloc " << i->first << "~" << i->second << " (of " << start << "~" << len << ")" << endl; - - // allocate new space - block_t left = i->second; - block_t cur = i->first; - while (left > 0) { - Extent ex; - allocator.allocate(ex, left, Allocator::NEAR_LAST_FWD); - dout(10) << "alloc_write got " << ex << " for object offset " << cur << endl; - on->set_extent(cur, ex); // map object to new region - left -= ex.length; - cur += ex.length; - } - } -} - - - - -void Ebofs::apply_write(Onode *on, off_t off, size_t len, bufferlist& bl) -{ - ObjectCache *oc = on->get_oc(&bc); - - // map into blocks - off_t opos = off; // 
byte pos in object - size_t zleft = 0; // zeros left to write - size_t left = len; // bytes left - - block_t bstart = off / EBOFS_BLOCK_SIZE; - - if (off > on->object_size) { - zleft = off - on->object_size; - opos = on->object_size; - bstart = on->object_size / EBOFS_BLOCK_SIZE; - } - if (off+(off_t)len > on->object_size) { - dout(10) << "apply_write extending size on " << *on << ": " << on->object_size - << " -> " << off+len << endl; - on->object_size = off+len; - dirty_onode(on); - } - if (bl.length() == 0) { - zleft += len; - left = 0; - } - if (zleft) - dout(10) << "apply_write zeroing first " << zleft << " bytes of " << *on << endl; - - block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; - block_t blen = blast-bstart+1; - - // allocate write on disk. - interval_set alloc; - block_t old_bfirst = 0; // zero means not defined here (since we ultimately pass to bh_read) - block_t old_blast = 0; - alloc_write(on, bstart, blen, alloc, old_bfirst, old_blast); - dout(20) << "apply_write old_bfirst " << old_bfirst << ", old_blast " << old_blast << endl; - - if (fake_writes) { - on->uncommitted.clear(); // worst case! - return; - } - - // map b range onto buffer_heads - map hits; - oc->map_write(bstart, blen, alloc, hits, super_epoch); - - // get current versions - //version_t lowv, highv; - //oc->scan_versions(bstart, blen, lowv, highv); - //highv++; - version_t highv = ++oc->write_count; - - // copy from bl into buffer cache - unsigned blpos = 0; // byte pos in input buffer - - // write data into buffers - for (map::iterator i = hits.begin(); - i != hits.end(); - i++) { - BufferHead *bh = i->second; - bh->set_version(highv); - bh->epoch_modified = super_epoch; - - // old write in progress? 
- if (bh->is_tx()) { // copy the buffer to avoid munging up in-flight write - dout(10) << "apply_write tx pending, copying buffer on " << *bh << endl; - bufferlist temp; - temp.claim(bh->data); - //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); - bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - bh->data.copy_in(0, bh->length()*EBOFS_BLOCK_SIZE, temp); - } - - // need to split off partial? (partials can only be ONE block) - if ((bh->is_missing() || bh->is_rx()) && bh->length() > 1) { - if ((bh->start() == bstart && opos % EBOFS_BLOCK_SIZE != 0)) { - BufferHead *right = bc.split(bh, bh->start()+1); - hits[right->start()] = right; - dout(10) << "apply_write split off left block for partial write; rest is " << *right << endl; - } - if ((bh->last() == blast && (len+off) % EBOFS_BLOCK_SIZE != 0) && - ((off_t)len+off < on->object_size)) { - BufferHead *right = bc.split(bh, bh->last()); - hits[right->start()] = right; - dout(10) << "apply_write split off right block for upcoming partial write; rest is " << *right << endl; - } - } - - // partial at head or tail? - if ((bh->start() == bstart && opos % EBOFS_BLOCK_SIZE != 0) || // opos, not off, in case we're zeroing... 
- (bh->last() == blast && ((off_t)len+off) % EBOFS_BLOCK_SIZE != 0 && ((off_t)len+off) < on->object_size)) { - // locate ourselves in bh - unsigned off_in_bh = opos - bh->start()*EBOFS_BLOCK_SIZE; - assert(off_in_bh >= 0); - unsigned len_in_bh = MIN( (off_t)(zleft+left), - (off_t)(bh->end()*EBOFS_BLOCK_SIZE)-opos ); - - if (bh->is_partial() || bh->is_rx() || bh->is_missing()) { - assert(bh->is_partial() || bh->is_rx() || bh->is_missing()); - assert(bh->length() == 1); - - // add frag to partial - dout(10) << "apply_write writing into partial " << *bh << ":" - << " off_in_bh " << off_in_bh - << " len_in_bh " << len_in_bh - << endl; - unsigned z = MIN( zleft, len_in_bh ); - if (z) { - bufferptr zp(z); - zp.zero(); - bufferlist zb; - zb.push_back(zp); - bh->add_partial(off_in_bh, zb); - zleft -= z; - opos += z; - } - - bufferlist sb; - sb.substr_of(bl, blpos, len_in_bh-z); // substr in existing buffer - bufferlist cp; - cp.append(sb.c_str(), len_in_bh-z); // copy the partial bit! - bh->add_partial(off_in_bh, cp); - left -= len_in_bh-z; - blpos += len_in_bh-z; - opos += len_in_bh-z; - - if (bh->partial_is_complete(on->object_size - bh->start()*EBOFS_BLOCK_SIZE)) { - dout(10) << "apply_write completed partial " << *bh << endl; - //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); // new buffers! - bh->data.clear(); - bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - bh->data.zero(); - bh->apply_partial(); - bc.mark_dirty(bh); - bc.bh_write(on, bh); - } - else if (bh->is_rx()) { - dout(10) << "apply_write rx -> partial " << *bh << endl; - assert(bh->length() == 1); - bc.mark_partial(bh); - bc.bh_queue_partial_write(on, bh); // queue the eventual write - } - else if (bh->is_missing()) { - dout(10) << "apply_write missing -> partial " << *bh << endl; - assert(bh->length() == 1); - bc.mark_partial(bh); - - // take care to read from _old_ disk block locations! 
- if (bh->start() == bstart) - bc.bh_read(on, bh, old_bfirst); - else if (bh->start() == blast) - bc.bh_read(on, bh, old_blast); - else assert(0); - - bc.bh_queue_partial_write(on, bh); // queue the eventual write - } - else if (bh->is_partial()) { - dout(10) << "apply_write already partial, no need to submit rx on " << *bh << endl; - if (bh->partial_tx_epoch == super_epoch) - bc.bh_cancel_partial_write(bh); - bc.bh_queue_partial_write(on, bh); // queue the eventual write - } - - - } else { - assert(bh->is_clean() || bh->is_dirty() || bh->is_tx()); - - // just write into the bh! - dout(10) << "apply_write writing leading/tailing partial into " << *bh << ":" - << " off_in_bh " << off_in_bh - << " len_in_bh " << len_in_bh - << endl; - - // copy data into new buffers first (copy on write!) - // FIXME: only do the modified pages? this might be a big bh! - bufferlist temp; - temp.claim(bh->data); - //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); - bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - bh->data.copy_in(0, bh->length()*EBOFS_BLOCK_SIZE, temp); - - unsigned z = MIN( zleft, len_in_bh ); - if (z) { - bufferptr zp(z); - zp.zero(); - bufferlist zb; - zb.push_back(zp); - bh->data.copy_in(off_in_bh, z, zb); - zleft -= z; - opos += z; - } - - bufferlist sub; - sub.substr_of(bl, blpos, len_in_bh-z); - bh->data.copy_in(off_in_bh+z, len_in_bh-z, sub); - blpos += len_in_bh-z; - left -= len_in_bh-z; - opos += len_in_bh-z; - - if (!bh->is_dirty()) - bc.mark_dirty(bh); - - bc.bh_write(on, bh); - } - continue; - } - - // ok, we're talking full block(s) now (modulo last block of the object) - assert(opos % EBOFS_BLOCK_SIZE == 0); - assert((off_t)(zleft+left) >= (off_t)(EBOFS_BLOCK_SIZE*bh->length()) || - opos+(off_t)(zleft+left) == on->object_size); - - // alloc new buffers. 
- //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); - bh->data.clear(); - bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - - // copy! - unsigned len_in_bh = MIN(bh->length()*EBOFS_BLOCK_SIZE, zleft+left); - assert(len_in_bh <= zleft+left); - - dout(10) << "apply_write writing into " << *bh << ":" - << " len_in_bh " << len_in_bh - << endl; - - unsigned z = MIN(len_in_bh, zleft); - if (z) { - bufferptr zp(z); - zp.zero(); - bufferlist zb; - zb.push_back(zp); - bh->data.copy_in(0, z, zb); - zleft -= z; - } - - bufferlist sub; - sub.substr_of(bl, blpos, len_in_bh-z); - bh->data.copy_in(z, len_in_bh-z, sub); - - blpos += len_in_bh-z; - left -= len_in_bh-z; - opos += len_in_bh; - - // old partial? - if (bh->is_partial() && - bh->partial_tx_epoch == super_epoch) - bc.bh_cancel_partial_write(bh); - - // mark dirty - if (!bh->is_dirty()) - bc.mark_dirty(bh); - - bc.bh_write(on, bh); - } - - assert(zleft == 0); - assert(left == 0); - assert(opos == off+(off_t)len); - //assert(blpos == bl.length()); -} - - - - -// *** file i/o *** - -bool Ebofs::attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, - Cond *will_wait_on, bool *will_wait_on_bool) -{ - dout(10) << "attempt_read " << *on << " " << off << "~" << len << endl; - ObjectCache *oc = on->get_oc(&bc); - - // map - block_t bstart = off / EBOFS_BLOCK_SIZE; - block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; - block_t blen = blast-bstart+1; - - map hits; - map missing; // read these - map rx; // wait for these - map partials; // ?? - oc->map_read(bstart, blen, hits, missing, rx, partials); - - // missing buffers? 
- if (!missing.empty()) { - for (map::iterator i = missing.begin(); - i != missing.end(); - i++) { - dout(10) << "attempt_read missing buffer " << *(i->second) << endl; - bc.bh_read(on, i->second); - } - BufferHead *wait_on = missing.begin()->second; - block_t b = MAX(wait_on->start(), bstart); - wait_on->waitfor_read[b].push_back(new C_Cond(will_wait_on, will_wait_on_bool)); - return false; - } - - // are partials sufficient? - bool partials_ok = true; - for (map::iterator i = partials.begin(); - i != partials.end(); - i++) { - BufferHead *bh = i->second; - off_t bhstart = (off_t)(bh->start()*EBOFS_BLOCK_SIZE); - off_t bhend = (off_t)(bh->end()*EBOFS_BLOCK_SIZE); - off_t start = MAX( off, bhstart ); - off_t end = MIN( off+(off_t)len, bhend ); - - if (!i->second->have_partial_range(start-bhstart, end-bhend)) { - if (partials_ok) { - // wait on this one - Context *c = new C_Cond(will_wait_on, will_wait_on_bool); - dout(10) << "attempt_read insufficient partial buffer " << *(i->second) << " c " << c << endl; - i->second->waitfor_read[i->second->start()].push_back(c); - } - partials_ok = false; - } - } - if (!partials_ok) return false; - - // wait on rx? - if (!rx.empty()) { - BufferHead *wait_on = rx.begin()->second; - Context *c = new C_Cond(will_wait_on, will_wait_on_bool); - dout(1) << "attempt_read waiting for read to finish on " << *wait_on << " c " << c << endl; - block_t b = MAX(wait_on->start(), bstart); - wait_on->waitfor_read[b].push_back(c); - return false; - } - - // yay, we have it all! - // concurrently walk thru hits, partials. 
- map::iterator h = hits.begin(); - map::iterator p = partials.begin(); - - bl.clear(); - off_t pos = off; - block_t curblock = bstart; - while (curblock <= blast) { - BufferHead *bh = 0; - if (h->first == curblock) { - bh = h->second; - h++; - } else if (p->first == curblock) { - bh = p->second; - p++; - } else assert(0); - - off_t bhstart = (off_t)(bh->start()*EBOFS_BLOCK_SIZE); - off_t bhend = (off_t)(bh->end()*EBOFS_BLOCK_SIZE); - off_t start = MAX( pos, bhstart ); - off_t end = MIN( off+(off_t)len, bhend ); - - if (bh->is_partial()) { - // copy from a partial block. yuck! - bufferlist frag; - bh->copy_partial_substr( start-bhstart, end-bhstart, frag ); - bl.claim_append( frag ); - pos += frag.length(); - } else { - // copy from a full block. - if (bhstart == start && bhend == end) { - bl.append( bh->data ); - pos += bh->data.length(); - } else { - bufferlist frag; - dout(10) << "substr " << (start-bhstart) << "~" << (end-start) << " of " << bh->data.length() << " in " << *bh << endl; - frag.substr_of(bh->data, start-bhstart, end-start); - pos += frag.length(); - bl.claim_append( frag ); - } - } - - curblock = bh->end(); - /* this assert is more trouble than it's worth - assert((off_t)(curblock*EBOFS_BLOCK_SIZE) == pos || // should be aligned with next block - end != bhend || // or we ended midway through bh - (bh->last() == blast && end == bhend)); // ended last block ** FIXME WRONG??? - */ - } - - assert(bl.length() == len); - return true; -} - - -/* - * is_cached -- query whether a object extent is in our cache - * return value of -1 if onode isn't loaded. otherwise, the number - * of extents that need to be read (i.e. # of seeks) - */ -int Ebofs::is_cached(object_t oid, off_t off, size_t len) -{ - ebofs_lock.Lock(); - int r = _is_cached(oid, off, len); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_is_cached(object_t oid, off_t off, size_t len) -{ - if (!have_onode(oid)) { - dout(7) << "_is_cached " << oid << " " << off << "~" << len << " ... 
onode " << endl; - return -1; // object dne? - } - Onode *on = get_onode(oid); - - if (!on->have_oc()) { - // nothing is cached. return # of extents in file. - dout(10) << "_is_cached have onode but no object cache, returning extent count" << endl; - return on->extent_map.size(); - } - - // map - block_t bstart = off / EBOFS_BLOCK_SIZE; - block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; - block_t blen = blast-bstart+1; - - map hits; - map missing; // read these - map rx; // wait for these - map partials; // ?? - - int num_missing = on->get_oc(&bc)->try_map_read(bstart, blen); - dout(7) << "_is_cached try_map_read reports " << num_missing << " missing extents" << endl; - return num_missing; - - // FIXME: actually, we should calculate if these extents are contiguous. - // and not using map_read, probably... - /* hrmpf - block_t dpos = 0; - block_t opos = bstart; - while (opos < blen) { - if (hits.begin()->first == opos) { - } else { - block_t d; - if (missing.begin()->first == opos) d = missing.begin()->second. - - } - */ -} - -void Ebofs::trim_from_cache(object_t oid, off_t off, size_t len) -{ - ebofs_lock.Lock(); - _trim_from_cache(oid, off, len); - ebofs_lock.Unlock(); -} - -void Ebofs::_trim_from_cache(object_t oid, off_t off, size_t len) -{ - // be careful not to load it if we don't have it - if (!have_onode(oid)) { - dout(7) << "_trim_from_cache " << oid << " " << off << "~" << len << " ... onode not in cache " << endl; - return; - } - - // ok, we have it, get a pointer. - Onode *on = get_onode(oid); - - if (!on->have_oc()) - return; // nothing is cached. 
- - // map to blocks - block_t bstart = off / EBOFS_BLOCK_SIZE; - block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; - - ObjectCache *oc = on->get_oc(&bc); - oc->touch_bottom(bstart, blast); - - return; -} - - -int Ebofs::read(object_t oid, - off_t off, size_t len, - bufferlist& bl) -{ - ebofs_lock.Lock(); - int r = _read(oid, off, len, bl); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_read(object_t oid, off_t off, size_t len, bufferlist& bl) -{ - dout(7) << "_read " << oid << " " << off << "~" << len << endl; - - Onode *on = get_onode(oid); - if (!on) { - dout(7) << "_read " << oid << " " << off << "~" << len << " ... dne " << endl; - return -ENOENT; // object dne? - } - - // read data into bl. block as necessary. - Cond cond; - - int r = 0; - while (1) { - // check size bound - if (off >= on->object_size) { - dout(7) << "_read " << oid << " " << off << "~" << len << " ... off past eof " << on->object_size << endl; - r = -ESPIPE; // FIXME better errno? - break; - } - - size_t try_len = len ? len:on->object_size; - size_t will_read = MIN(off+(off_t)try_len, on->object_size) - off; - - bool done; - if (attempt_read(on, off, will_read, bl, &cond, &done)) - break; // yay - - // wait - while (!done) - cond.Wait(ebofs_lock); - - if (on->deleted) { - dout(7) << "_read " << oid << " " << off << "~" << len << " ... object deleted" << endl; - r = -ENOENT; - break; - } - } - - put_onode(on); - - trim_bc(); - - if (r < 0) return r; // return error, - dout(7) << "_read " << oid << " " << off << "~" << len << " ... got " << bl.length() << endl; - return bl.length(); // or bytes read. 
-} - - -bool Ebofs::_write_will_block() -{ - return (bc.get_stat_dirty()+bc.get_stat_tx() > g_conf.ebofs_bc_max_dirty); -} - -bool Ebofs::write_will_block() -{ - ebofs_lock.Lock(); - bool b = _write_will_block(); - ebofs_lock.Unlock(); - return b; -} - - -unsigned Ebofs::apply_transaction(Transaction& t, Context *onsafe) -{ - ebofs_lock.Lock(); - dout(7) << "apply_transaction start (" << t.ops.size() << " ops)" << endl; - - // do ops - unsigned r = 0; // bit fields indicate which ops failed. - int bit = 1; - for (list::iterator p = t.ops.begin(); - p != t.ops.end(); - p++) { - switch (*p) { - case Transaction::OP_READ: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t offset = t.offsets.front(); t.offsets.pop_front(); - size_t len = t.lengths.front(); t.lengths.pop_front(); - bufferlist *pbl = t.pbls.front(); t.pbls.pop_front(); - if (_read(oid, offset, len, *pbl) < 0) { - dout(7) << "apply_transaction fail on _read" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_STAT: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - struct stat *st = t.psts.front(); t.psts.pop_front(); - if (_stat(oid, st) < 0) { - dout(7) << "apply_transaction fail on _stat" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_GETATTR: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); - pair pattrval = t.pattrvals.front(); t.pattrvals.pop_front(); - if ((*(pattrval.second) = _getattr(oid, attrname, pattrval.first, *(pattrval.second))) < 0) { - dout(7) << "apply_transaction fail on _getattr" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_GETATTRS: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - map *pset = t.pattrsets.front(); t.pattrsets.pop_front(); - if (_getattrs(oid, *pset) < 0) { - dout(7) << "apply_transaction fail on _getattrs" << endl; - r &= bit; - } - } - break; - - - case Transaction::OP_WRITE: - { - object_t oid = t.oids.front(); 
t.oids.pop_front(); - off_t offset = t.offsets.front(); t.offsets.pop_front(); - size_t len = t.lengths.front(); t.lengths.pop_front(); - bufferlist bl = t.bls.front(); t.bls.pop_front(); - if (_write(oid, offset, len, bl) < 0) { - dout(7) << "apply_transaction fail on _write" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_TRIMCACHE: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t offset = t.offsets.front(); t.offsets.pop_front(); - size_t len = t.lengths.front(); t.lengths.pop_front(); - _trim_from_cache(oid, offset, len); - } - break; - - case Transaction::OP_TRUNCATE: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t len = t.offsets.front(); t.offsets.pop_front(); - if (_truncate(oid, len) < 0) { - dout(7) << "apply_transaction fail on _truncate" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_REMOVE: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - if (_remove(oid) < 0) { - dout(7) << "apply_transaction fail on _remove" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_SETATTR: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); - //pair attrval = t.attrvals.front(); t.attrvals.pop_front(); - bufferlist bl; - bl.claim( t.attrbls.front() ); - t.attrbls.pop_front(); - if (_setattr(oid, attrname, bl.c_str(), bl.length()) < 0) { - dout(7) << "apply_transaction fail on _setattr" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_SETATTRS: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - map *pattrset = t.pattrsets.front(); t.pattrsets.pop_front(); - if (_setattrs(oid, *pattrset) < 0) { - dout(7) << "apply_transaction fail on _setattrs" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_RMATTR: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); - if (_rmattr(oid, attrname) < 0) { - dout(7) << 
"apply_transaction fail on _rmattr" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_CLONE: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - object_t noid = t.oids.front(); t.oids.pop_front(); - if (_clone(oid, noid) < 0) { - dout(7) << "apply_transaction fail on _clone" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_MKCOLL: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - if (_create_collection(cid) < 0) { - dout(7) << "apply_transaction fail on _create_collection" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_RMCOLL: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - if (_destroy_collection(cid) < 0) { - dout(7) << "apply_transaction fail on _destroy_collection" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_COLL_ADD: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - object_t oid = t.oids.front(); t.oids.pop_front(); - if (_collection_add(cid, oid) < 0) { - //dout(7) << "apply_transaction fail on _collection_add" << endl; - //r &= bit; - } - } - break; - - case Transaction::OP_COLL_REMOVE: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - object_t oid = t.oids.front(); t.oids.pop_front(); - if (_collection_remove(cid, oid) < 0) { - dout(7) << "apply_transaction fail on _collection_remove" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_COLL_SETATTR: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); - //pair attrval = t.attrvals.front(); t.attrvals.pop_front(); - bufferlist bl; - bl.claim( t.attrbls.front() ); - t.attrbls.pop_front(); - if (_collection_setattr(cid, attrname, bl.c_str(), bl.length()) < 0) { - //if (_collection_setattr(cid, attrname, attrval.first, attrval.second) < 0) { - dout(7) << "apply_transaction fail on _collection_setattr" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_COLL_RMATTR: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - 
const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); - if (_collection_rmattr(cid, attrname) < 0) { - dout(7) << "apply_transaction fail on _collection_rmattr" << endl; - r &= bit; - } - } - break; - - default: - cerr << "bad op " << *p << endl; - assert(0); - } - - bit = bit << 1; - } - - dout(7) << "apply_transaction finish (r = " << r << ")" << endl; - - // set up commit waiter - //if (r == 0) { - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - //} else { - //if (onsafe) delete onsafe; - //} - - ebofs_lock.Unlock(); - return r; -} - - - -int Ebofs::_write(object_t oid, off_t offset, size_t length, bufferlist& bl) -{ - dout(7) << "_write " << oid << " " << offset << "~" << length << endl; - assert(bl.length() == length); - - // too much unflushed dirty data? (if so, block!) - if (_write_will_block()) { - dout(10) << "_write blocking " - << oid << " " << offset << "~" << length - << " bc: " - << "size " << bc.get_size() - << ", trimmable " << bc.get_trimmable() - << ", max " << g_conf.ebofs_bc_size - << "; dirty " << bc.get_stat_dirty() - << ", tx " << bc.get_stat_tx() - << ", max dirty " << g_conf.ebofs_bc_max_dirty - << endl; - - while (_write_will_block()) - bc.waitfor_stat(); // waits on ebofs_lock - - dout(10) << "_write unblocked " - << oid << " " << offset << "~" << length - << " bc: " - << "size " << bc.get_size() - << ", trimmable " << bc.get_trimmable() - << ", max " << g_conf.ebofs_bc_size - << "; dirty " << bc.get_stat_dirty() - << ", tx " << bc.get_stat_tx() - << ", max dirty " << g_conf.ebofs_bc_max_dirty - << endl; - } - - // out of space? 
- unsigned max = (length+offset) / EBOFS_BLOCK_SIZE + 10; // very conservative; assumes we have to rewrite - max += dirty_onodes.size() + dirty_cnodes.size(); - if (max >= free_blocks) { - dout(1) << "write failing, only " << free_blocks << " blocks free, may need up to " << max << endl; - return -ENOSPC; - } - - // get|create inode - Onode *on = get_onode(oid); - if (!on) on = new_onode(oid); // new inode! - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - dirty_onode(on); // dirty onode! - - // apply write to buffer cache - if (length > 0) - apply_write(on, offset, length, bl); - - // done. - put_onode(on); - trim_bc(); - - return length; -} - - -/*int Ebofs::write(object_t oid, - off_t off, size_t len, - bufferlist& bl, bool fsync) -{ - // wait? - if (fsync) { - // wait for flush. - Cond cond; - bool done; - int flush = 1; // write never returns positive - Context *c = new C_Cond(&cond, &done, &flush); - int r = write(oid, off, len, bl, c); - if (r < 0) return r; - - ebofs_lock.Lock(); - { - while (!done) - cond.Wait(ebofs_lock); - assert(flush <= 0); - } - ebofs_lock.Unlock(); - if (flush < 0) return flush; - return r; - } else { - // don't wait for flush. - return write(oid, off, len, bl, (Context*)0); - } -} -*/ - -int Ebofs::write(object_t oid, - off_t off, size_t len, - bufferlist& bl, Context *onsafe) -{ - ebofs_lock.Lock(); - assert(len > 0); - - // go - int r = _write(oid, off, len, bl); - - // commit waiter - if (r > 0) { - assert((size_t)r == len); - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - - -int Ebofs::_remove(object_t oid) -{ - dout(7) << "_remove " << oid << endl; - - // get inode - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - // ok remove it! 
- remove_onode(on); - - return 0; -} - - -int Ebofs::remove(object_t oid, Context *onsafe) -{ - ebofs_lock.Lock(); - - // do it - int r = _remove(oid); - - // set up commit waiter - if (r >= 0) { - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_truncate(object_t oid, off_t size) -{ - dout(7) << "_truncate " << oid << " size " << size << endl; - - Onode *on = get_onode(oid); - if (!on) - return -ENOENT; - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - int r = 0; - if (size > on->object_size) { - r = -EINVAL; // whatever - } - else if (size < on->object_size) { - // change size - on->object_size = size; - dirty_onode(on); - - // free blocks - block_t nblocks = 0; - if (size) nblocks = 1 + (size-1) / EBOFS_BLOCK_SIZE; - if (on->object_blocks > nblocks) { - vector extra; - on->truncate_extents(nblocks, extra); - for (unsigned i=0; ioc) { - on->oc->truncate(on->object_blocks, super_epoch); - if (on->oc->is_empty()) - on->close_oc(); - } - - // update uncommitted - interval_set uncom; - if (nblocks > 0) { - interval_set left; - left.insert(0, nblocks); - uncom.intersection_of(left, on->uncommitted); - } - dout(10) << "uncommitted was " << on->uncommitted << " now " << uncom << endl; - on->uncommitted = uncom; - - } - else { - assert(size == on->object_size); - } - - put_onode(on); - return r; -} - - -int Ebofs::truncate(object_t oid, off_t size, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _truncate(oid, size); - - // set up commit waiter - if (r >= 0) { - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - - - -int Ebofs::clone(object_t from, object_t to, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _clone(from, to); - - // set up commit waiter - if (r >= 0) { - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - } else { - if 
(onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_clone(object_t from, object_t to) -{ - dout(7) << "_clone " << from << " -> " << to << endl; - - if (!g_conf.ebofs_cloneable) - return -1; // no! - - Onode *fon = get_onode(from); - if (!fon) return -ENOENT; - Onode *ton = get_onode(to); - if (ton) { - put_onode(fon); - put_onode(ton); - return -EEXIST; - } - ton = new_onode(to); - assert(ton); - - // copy easy bits - ton->readonly = true; - ton->object_size = fon->object_size; - ton->object_blocks = fon->object_blocks; - ton->attr = fon->attr; - - // collections - for (set::iterator p = fon->collections.begin(); - p != fon->collections.end(); - p++) - _collection_add(*p, to); - - // extents - ton->extent_map = fon->extent_map; - for (map::iterator p = ton->extent_map.begin(); - p != ton->extent_map.end(); - ++p) { - allocator.alloc_inc(p->second); - } - - // clear uncommitted - fon->uncommitted.clear(); - - // muck with ObjectCache - if (fon->oc) - fon->oc->clone_to( ton ); - - // ok! - put_onode(ton); - put_onode(fon); - return 0; -} - - - - -/* - * pick object revision with rev < specified rev. - * (oid.rev is a noninclusive upper bound.) - * - */ -int Ebofs::pick_object_revision_lt(object_t& oid) -{ - assert(oid.rev > 0); // this is only useful for non-zero oid.rev - - int r = -EEXIST; // return code - ebofs_lock.Lock(); - { - object_t orig = oid; - object_t live = oid; - live.rev = 0; - - if (object_tab->get_num_keys() > 0) { - Table::Cursor cursor(object_tab); - - object_tab->find(oid, cursor); // this will be just _past_ highest eligible rev - if (cursor.move_left() > 0) { - bool firstpass = true; - while (1) { - object_t t = cursor.current().key; - if (t.ino != oid.ino || - t.bno != oid.bno) // passed to previous object - break; - if (oid.rev < t.rev) { // rev < desired. possible match. - r = 0; - oid = t; - break; - } - if (firstpass && oid.rev >= t.rev) { // there is no old rev < desired. try live. 
- r = 0; - oid = live; - break; - } - if (cursor.move_left() <= 0) break; - firstpass = false; - } - } - } - - dout(8) << "find_object_revision " << orig << " -> " << oid - << " r=" << r << endl; - } - ebofs_lock.Unlock(); - return r; -} - - - - -bool Ebofs::exists(object_t oid) -{ - ebofs_lock.Lock(); - dout(8) << "exists " << oid << endl; - bool e = (object_tab->lookup(oid) == 0); - ebofs_lock.Unlock(); - return e; -} - -int Ebofs::stat(object_t oid, struct stat *st) -{ - ebofs_lock.Lock(); - int r = _stat(oid,st); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_stat(object_t oid, struct stat *st) -{ - dout(7) << "_stat " << oid << endl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - // ?? - st->st_size = on->object_size; - - put_onode(on); - return 0; -} - - -int Ebofs::_setattr(object_t oid, const char *name, const void *value, size_t size) -{ - dout(8) << "setattr " << oid << " '" << name << "' len " << size << endl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - string n(name); - on->attr[n] = buffer::copy((char*)value, size); - dirty_onode(on); - put_onode(on); - - dout(8) << "setattr " << oid << " '" << name << "' len " << size << " success" << endl; - - return 0; -} - -int Ebofs::setattr(object_t oid, const char *name, const void *value, size_t size, Context *onsafe) -{ - ebofs_lock.Lock(); - int r = _setattr(oid, name, value, size); - - // set up commit waiter - if (r >= 0) { - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_setattrs(object_t oid, map& attrset) -{ - dout(8) << "setattrs " << oid << endl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - on->attr = attrset; - dirty_onode(on); - put_onode(on); - return 0; -} - -int Ebofs::setattrs(object_t oid, map& attrset, 
Context *onsafe) -{ - ebofs_lock.Lock(); - int r = _setattrs(oid, attrset); - - // set up commit waiter - if (r >= 0) { - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::getattr(object_t oid, const char *name, void *value, size_t size) -{ - ebofs_lock.Lock(); - int r = _getattr(oid, name, value, size); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_getattr(object_t oid, const char *name, void *value, size_t size) -{ - dout(8) << "_getattr " << oid << " '" << name << "' maxlen " << size << endl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - string n(name); - int r = 0; - if (on->attr.count(n) == 0) { - dout(10) << "_getattr " << oid << " '" << name << "' dne" << endl; - r = -1; - } else { - r = MIN( on->attr[n].length(), size ); - dout(10) << "_getattr " << oid << " '" << name << "' got len " << r << endl; - memcpy(value, on->attr[n].c_str(), r ); - } - put_onode(on); - return r; -} - -int Ebofs::getattrs(object_t oid, map &aset) -{ - ebofs_lock.Lock(); - int r = _getattrs(oid, aset); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_getattrs(object_t oid, map &aset) -{ - dout(8) << "_getattrs " << oid << endl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - aset = on->attr; - put_onode(on); - return 0; -} - - - -int Ebofs::_rmattr(object_t oid, const char *name) -{ - dout(8) << "_rmattr " << oid << " '" << name << "'" << endl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - string n(name); - on->attr.erase(n); - dirty_onode(on); - put_onode(on); - return 0; -} - -int Ebofs::rmattr(object_t oid, const char *name, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _rmattr(oid, name); - - // set up commit waiter - if (r >= 0) { - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - } else { - if (onsafe) delete onsafe; - } - - 
ebofs_lock.Unlock(); - return r; -} - -int Ebofs::listattr(object_t oid, vector& attrs) -{ - ebofs_lock.Lock(); - dout(8) << "listattr " << oid << endl; - - Onode *on = get_onode(oid); - if (!on) { - ebofs_lock.Unlock(); - return -ENOENT; - } - - attrs.clear(); - for (map::iterator i = on->attr.begin(); - i != on->attr.end(); - i++) { - attrs.push_back(i->first); - } - - put_onode(on); - ebofs_lock.Unlock(); - return 0; -} - - - -/***************** collections ******************/ - -int Ebofs::list_collections(list& ls) -{ - ebofs_lock.Lock(); - dout(9) << "list_collections " << endl; - - Table::Cursor cursor(collection_tab); - - int num = 0; - if (collection_tab->find(0, cursor) >= 0) { - while (1) { - ls.push_back(cursor.current().key); - num++; - if (cursor.move_right() <= 0) break; - } - } - - ebofs_lock.Unlock(); - return num; -} - -int Ebofs::_create_collection(coll_t cid) -{ - dout(9) << "_create_collection " << hex << cid << dec << endl; - - if (_collection_exists(cid)) - return -EEXIST; - - Cnode *cn = new_cnode(cid); - put_cnode(cn); - - return 0; -} - -int Ebofs::create_collection(coll_t cid, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _create_collection(cid); - - // set up commit waiter - if (r >= 0) { - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_destroy_collection(coll_t cid) -{ - dout(9) << "_destroy_collection " << hex << cid << dec << endl; - - if (!_collection_exists(cid)) - return -ENOENT; - - Cnode *cn = get_cnode(cid); - assert(cn); - - // hose mappings - list objects; - collection_list(cid, objects); - for (list::iterator i = objects.begin(); - i != objects.end(); - i++) { - co_tab->remove(coll_object_t(cid,*i)); - - Onode *on = get_onode(*i); - if (on) { - on->collections.erase(cid); - dirty_onode(on); - put_onode(on); - } - } - - remove_cnode(cn); - return 0; -} - -int Ebofs::destroy_collection(coll_t cid, Context 
*onsafe) -{ - ebofs_lock.Lock(); - - int r = _destroy_collection(cid); - - // set up commit waiter - if (r >= 0) { - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -bool Ebofs::collection_exists(coll_t cid) -{ - ebofs_lock.Lock(); - dout(10) << "collection_exists " << hex << cid << dec << endl; - bool r = _collection_exists(cid); - ebofs_lock.Unlock(); - return r; -} -bool Ebofs::_collection_exists(coll_t cid) -{ - return (collection_tab->lookup(cid) == 0); -} - -int Ebofs::_collection_add(coll_t cid, object_t oid) -{ - dout(9) << "_collection_add " << hex << cid << " object " << oid << dec << endl; - - if (!_collection_exists(cid)) - return -ENOENT; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - int r = 0; - - if (on->collections.count(cid) == 0) { - on->collections.insert(cid); - dirty_onode(on); - co_tab->insert(coll_object_t(cid,oid), true); - } else { - r = -ENOENT; // FIXME? already in collection. - } - - put_onode(on); - return r; -} - -int Ebofs::collection_add(coll_t cid, object_t oid, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _collection_add(cid, oid); - - // set up commit waiter - if (r >= 0) { - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return 0; -} - -int Ebofs::_collection_remove(coll_t cid, object_t oid) -{ - dout(9) << "_collection_remove " << hex << cid << " object " << oid << dec << endl; - - if (!_collection_exists(cid)) - return -ENOENT; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - int r = 0; - - if (on->collections.count(cid)) { - on->collections.erase(cid); - dirty_onode(on); - co_tab->remove(coll_object_t(cid,oid)); - } else { - r = -ENOENT; // FIXME? 
- } - - put_onode(on); - return r; -} - -int Ebofs::collection_remove(coll_t cid, object_t oid, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _collection_remove(cid, oid); - - // set up commit waiter - if (r >= 0) { - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return 0; -} - -int Ebofs::collection_list(coll_t cid, list& ls) -{ - ebofs_lock.Lock(); - dout(9) << "collection_list " << hex << cid << dec << endl; - - if (!_collection_exists(cid)) { - ebofs_lock.Unlock(); - return -ENOENT; - } - - Table::Cursor cursor(co_tab); - - int num = 0; - if (co_tab->find(coll_object_t(cid,object_t()), cursor) >= 0) { - while (1) { - const coll_t c = cursor.current().key.first; - const object_t o = cursor.current().key.second; - if (c != cid) break; // end! - dout(10) << "collection_list " << hex << cid << " includes " << o << dec << endl; - ls.push_back(o); - num++; - if (cursor.move_right() < 0) break; - } - } - - ebofs_lock.Unlock(); - return num; -} - - -int Ebofs::_collection_setattr(coll_t cid, const char *name, const void *value, size_t size) -{ - dout(10) << "_collection_setattr " << hex << cid << dec << " '" << name << "' len " << size << endl; - - Cnode *cn = get_cnode(cid); - if (!cn) return -ENOENT; - - string n(name); - cn->attr[n] = buffer::copy((char*)value, size); - dirty_cnode(cn); - put_cnode(cn); - - return 0; -} - -int Ebofs::collection_setattr(coll_t cid, const char *name, const void *value, size_t size, Context *onsafe) -{ - ebofs_lock.Lock(); - dout(10) << "collection_setattr " << hex << cid << dec << " '" << name << "' len " << size << endl; - - int r = _collection_setattr(cid, name, value, size); - - // set up commit waiter - if (r >= 0) { - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return 0; -} - -int Ebofs::collection_getattr(coll_t cid, const char *name, void *value, size_t 
size) -{ - ebofs_lock.Lock(); - dout(10) << "collection_setattr " << hex << cid << dec << " '" << name << "' maxlen " << size << endl; - - Cnode *cn = get_cnode(cid); - if (!cn) { - ebofs_lock.Unlock(); - return -ENOENT; - } - - string n(name); - int r; - if (cn->attr.count(n) == 0) { - r = -1; - } else { - r = MIN( cn->attr[n].length(), size ); - memcpy(value, cn->attr[n].c_str(), r); - } - - put_cnode(cn); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_collection_rmattr(coll_t cid, const char *name) -{ - dout(10) << "_collection_rmattr " << hex << cid << dec << " '" << name << "'" << endl; - - Cnode *cn = get_cnode(cid); - if (!cn) return -ENOENT; - - string n(name); - cn->attr.erase(n); - - dirty_cnode(cn); - put_cnode(cn); - - return 0; -} - -int Ebofs::collection_rmattr(coll_t cid, const char *name, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _collection_rmattr(cid, name); - - // set up commit waiter - if (r >= 0) { - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return 0; -} - -int Ebofs::collection_listattr(coll_t cid, vector& attrs) -{ - ebofs_lock.Lock(); - dout(10) << "collection_listattr " << hex << cid << dec << endl; - - Cnode *cn = get_cnode(cid); - if (!cn) { - ebofs_lock.Unlock(); - return -ENOENT; - } - - attrs.clear(); - for (map::iterator i = cn->attr.begin(); - i != cn->attr.end(); - i++) { - attrs.push_back(i->first); - } - - put_cnode(cn); - ebofs_lock.Unlock(); - return 0; -} - - - -void Ebofs::_export_freelist(bufferlist& bl) -{ - for (int b=0; b<=EBOFS_NUM_FREE_BUCKETS; b++) { - Table *tab; - if (b < EBOFS_NUM_FREE_BUCKETS) { - tab = free_tab[b]; - } else { - tab = limbo_tab; - } - - if (tab->get_num_keys() > 0) { - Table::Cursor cursor(tab); - assert(tab->find(0, cursor) >= 0); - while (1) { - assert(cursor.current().value > 0); - - Extent ex(cursor.current().key, cursor.current().value); - dout(10) << "_export_freelist " << ex << endl; - 
bl.append((char*)&ex, sizeof(ex)); - if (cursor.move_right() <= 0) break; - } - } - } -} - -void Ebofs::_import_freelist(bufferlist& bl) -{ - // clear - for (int b=0; bclear(); - limbo_tab->clear(); - - // import! - int num = bl.length() / sizeof(Extent); - Extent *p = (Extent*)bl.c_str(); - for (int i=0; i *tab; - if (b < EBOFS_NUM_FREE_BUCKETS) { - tab = free_tab[b]; - dout(30) << "dump bucket " << b << " " << tab->get_num_keys() << endl; - } else { - tab = limbo_tab; - dout(30) << "dump limbo " << tab->get_num_keys() << endl;; - } - - if (tab->get_num_keys() > 0) { - Table::Cursor cursor(tab); - assert(tab->find(0, cursor) >= 0); - while (1) { - assert(cursor.current().value > 0); - - block_t l = cursor.current().value; - tfree += l; - int b = 0; - do { - l = l >> 1; - b++; - } while (l); - st.free_extent_dist[b]++; - st.free_extent_dist_sum[b] += cursor.current().value; - st.num_free_extent++; - - if (cursor.move_right() <= 0) break; - } - } - } - st.avg_free_extent = tfree / st.num_free_extent; -*/ - - // used extents is harder. 
:( - st.num_extent = 0; - st.avg_extent = 0; - st.extent_dist.clear(); - st.extent_dist_sum.clear(); - st.avg_extent_per_object = 0; - st.avg_extent_jump = 0; - - Table::Cursor cursor(object_tab); - object_tab->find(object_t(), cursor); - int nobj = 0; - int njump = 0; - while (object_tab->get_num_keys() > 0) { - Onode *on = get_onode(cursor.current().key); - assert(on); - - nobj++; - st.avg_extent_per_object += on->extent_map.size(); - - for (map::iterator p = on->extent_map.begin(); - p != on->extent_map.end(); - p++) { - block_t l = p->second.length; - - st.num_extent++; - st.avg_extent += l; - if (p->first > 0) { - njump++; - st.avg_extent_jump += l; - } - - int b = 0; - do { - l = l >> 1; - b++; - } while (l); - st.extent_dist[b]++; - st.extent_dist_sum[b] += p->second.length; - } - put_onode(on); - if (cursor.move_right() <= 0) break; - } - if (njump) st.avg_extent_jump /= njump; - if (nobj) st.avg_extent_per_object /= (float)nobj; - if (st.num_extent) st.avg_extent /= st.num_extent; - - ebofs_lock.Unlock(); -} diff --git a/branches/marnberg/quota/ebofs/Ebofs.h b/branches/marnberg/quota/ebofs/Ebofs.h deleted file mode 100644 index 6d18b7a0204fa..0000000000000 --- a/branches/marnberg/quota/ebofs/Ebofs.h +++ /dev/null @@ -1,330 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "include/Context.h" -#include "include/buffer.h" - -template -inline ostream& operator<<(ostream& out, const pair& p) { - return out << p.first << "," << p.second; -} - -#include "types.h" -#include "Onode.h" -#include "Cnode.h" -#include "BlockDevice.h" -#include "nodes.h" -#include "Allocator.h" -#include "Table.h" - -#include "common/Mutex.h" -#include "common/Cond.h" - -#include "osd/ObjectStore.h" - -//typedef pair object_coll_t; -typedef pair coll_object_t; - - -class Ebofs : public ObjectStore { - protected: - Mutex ebofs_lock; // a beautiful global lock - - // ** debuggy ** - bool fake_writes; - - // ** super ** - BlockDevice dev; - bool mounted, unmounting, dirty; - bool readonly; - version_t super_epoch; - bool commit_thread_started, mid_commit; - Cond commit_cond; // to wake up the commit thread - Cond sync_cond; - - map > commit_waiters; - - void prepare_super(version_t epoch, bufferptr& bp); - void write_super(version_t epoch, bufferptr& bp); - int commit_thread_entry(); - - class CommitThread : public Thread { - Ebofs *ebofs; - public: - CommitThread(Ebofs *e) : ebofs(e) {} - void *entry() { - ebofs->commit_thread_entry(); - return 0; - } - } commit_thread; - - - - - // ** allocator ** - block_t free_blocks, limbo_blocks; - Allocator allocator; - friend class Allocator; - - block_t get_free_blocks() { return free_blocks; } - block_t get_limbo_blocks() { return limbo_blocks; } - block_t get_free_extents() { - int n = 0; - for (int i=0; iget_num_keys(); - return n; - } - block_t get_limbo_extents() { return limbo_tab->get_num_keys(); } - - - // ** tables and sets ** - // nodes - NodePool nodepool; // for all tables... 
- - // tables - Table *object_tab; - Table *free_tab[EBOFS_NUM_FREE_BUCKETS]; - Table *limbo_tab; - Table > *alloc_tab; - - // collections - Table *collection_tab; - Table *co_tab; - - void close_tables(); - - - // ** onodes ** - hash_map onode_map; // onode cache - LRU onode_lru; - set dirty_onodes; - map > waitfor_onode; - - Onode* new_onode(object_t oid); // make new onode. ref++. - bool have_onode(object_t oid) { - return onode_map.count(oid); - } - Onode* get_onode(object_t oid); // get cached onode, or read from disk. ref++. - void remove_onode(Onode *on); - void put_onode(Onode* o); // put it back down. ref--. - void dirty_onode(Onode* o); - void encode_onode(Onode *on, bufferlist& bl, unsigned& off); - void write_onode(Onode *on); - - // ** cnodes ** - hash_map cnode_map; - LRU cnode_lru; - set dirty_cnodes; - map > waitfor_cnode; - - Cnode* new_cnode(coll_t cid); - Cnode* get_cnode(coll_t cid); - void remove_cnode(Cnode *cn); - void put_cnode(Cnode *cn); - void dirty_cnode(Cnode *cn); - void encode_cnode(Cnode *cn, bufferlist& bl, unsigned& off); - void write_cnode(Cnode *cn); - - // ** onodes+cnodes = inodes ** - int inodes_flushing; - Cond inode_commit_cond; - - void flush_inode_finish(); - void commit_inodes_start(); - void commit_inodes_wait(); - friend class C_E_InodeFlush; - - void trim_inodes(int max = -1); - - // ** buffer cache ** - BufferCache bc; - pthread_t flushd_thread_id; - - version_t trigger_commit(); - void commit_bc_wait(version_t epoch); - void trim_bc(off_t max = -1); - - public: - void kick_idle(); - void sync(); - void sync(Context *onsafe); - void trim_buffer_cache(); - - class IdleKicker : public BlockDevice::kicker { - Ebofs *ebo; - public: - IdleKicker(Ebofs *t) : ebo(t) {} - void kick() { ebo->kick_idle(); } - } idle_kicker; - - - protected: - //void zero(Onode *on, size_t len, off_t off, off_t write_thru); - void alloc_write(Onode *on, - block_t start, block_t len, - interval_set& alloc, - block_t& old_bfirst, block_t& 
old_blast); - void apply_write(Onode *on, off_t off, size_t len, bufferlist& bl); - bool attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, - Cond *will_wait_on, bool *will_wait_on_bool); - - // ** finisher ** - // async write notification to users - Mutex finisher_lock; - Cond finisher_cond; - bool finisher_stop; - list finisher_queue; - - void *finisher_thread_entry(); - class FinisherThread : public Thread { - Ebofs *ebofs; - public: - FinisherThread(Ebofs *e) : ebofs(e) {} - void* entry() { return (void*)ebofs->finisher_thread_entry(); } - } finisher_thread; - - - void alloc_more_node_space(); - - void do_csetattrs(map > > &cmods); - void do_setattrs(Onode *on, map > &setattrs); - - - public: - Ebofs(char *devfn) : - fake_writes(false), - dev(devfn), - mounted(false), unmounting(false), dirty(false), readonly(false), - super_epoch(0), commit_thread_started(false), mid_commit(false), - commit_thread(this), - free_blocks(0), limbo_blocks(0), - allocator(this), - nodepool(ebofs_lock), - object_tab(0), limbo_tab(0), collection_tab(0), co_tab(0), - onode_lru(g_conf.ebofs_oc_size), - cnode_lru(g_conf.ebofs_cc_size), - inodes_flushing(0), - bc(dev, ebofs_lock), - idle_kicker(this), - finisher_stop(false), finisher_thread(this) { - for (int i=0; i& attrset, Context *onsafe=0); - int getattr(object_t oid, const char *name, void *value, size_t size); - int getattrs(object_t oid, map &aset); - int rmattr(object_t oid, const char *name, Context *onsafe=0); - int listattr(object_t oid, vector& attrs); - - // collections - int list_collections(list& ls); - bool collection_exists(coll_t c); - - int create_collection(coll_t c, Context *onsafe); - int destroy_collection(coll_t c, Context *onsafe); - int collection_add(coll_t c, object_t o, Context *onsafe); - int collection_remove(coll_t c, object_t o, Context *onsafe); - - int collection_list(coll_t c, list& o); - - int collection_setattr(coll_t oid, const char *name, const void *value, size_t size, Context 
*onsafe); - int collection_getattr(coll_t oid, const char *name, void *value, size_t size); - int collection_rmattr(coll_t cid, const char *name, Context *onsafe); - int collection_listattr(coll_t oid, vector& attrs); - - // maps - int map_lookup(object_t o, bufferlist& key, bufferlist& val); - int map_insert(object_t o, bufferlist& key, bufferlist& val); - int map_remove(object_t o, bufferlist& key); - int map_list(object_t o, list& keys); - int map_list(object_t o, map& vals); - int map_list(object_t o, - bufferlist& start, bufferlist& end, - map& vals); - - // crap - void _fake_writes(bool b) { fake_writes = b; } - void _get_frag_stat(FragmentationStat& st); - - void _import_freelist(bufferlist& bl); - void _export_freelist(bufferlist& bl); - - -private: - // private interface -- use if caller already holds lock - int _read(object_t oid, off_t off, size_t len, bufferlist& bl); - int _is_cached(object_t oid, off_t off, size_t len); - int _stat(object_t oid, struct stat *st); - int _getattr(object_t oid, const char *name, void *value, size_t size); - int _getattrs(object_t oid, map &aset); - - bool _write_will_block(); - int _write(object_t oid, off_t off, size_t len, bufferlist& bl); - void _trim_from_cache(object_t oid, off_t off, size_t len); - int _truncate(object_t oid, off_t size); - int _truncate_front(object_t oid, off_t size); - int _remove(object_t oid); - int _clone(object_t from, object_t to); - int _setattr(object_t oid, const char *name, const void *value, size_t size); - int _setattrs(object_t oid, map& attrset); - int _rmattr(object_t oid, const char *name); - bool _collection_exists(coll_t c); - int _create_collection(coll_t c); - int _destroy_collection(coll_t c); - int _collection_add(coll_t c, object_t o); - int _collection_remove(coll_t c, object_t o); - int _collection_setattr(coll_t oid, const char *name, const void *value, size_t size); - int _collection_rmattr(coll_t cid, const char *name); - - -}; diff --git 
a/branches/marnberg/quota/ebofs/Onode.h b/branches/marnberg/quota/ebofs/Onode.h deleted file mode 100644 index 233c97e7ae172..0000000000000 --- a/branches/marnberg/quota/ebofs/Onode.h +++ /dev/null @@ -1,390 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_ONODE_H -#define __EBOFS_ONODE_H - -#include "include/lru.h" - -#include "types.h" -#include "BufferCache.h" - -#include "include/interval_set.h" - - -/* - * object node (like an inode) - * - * holds object metadata, including - * size - * allocation (extent list) - * attributes - * - */ - -class Onode : public LRUObject { -private: - int ref; - -public: - object_t object_id; - version_t version; // incremented on each modify. 
- - // data - bool readonly; - Extent onode_loc; - off_t object_size; - unsigned object_blocks; - - // onode - set collections; - map attr; - //vector extents; - map extent_map; - - interval_set uncommitted; - - ObjectCache *oc; - - bool dirty; - bool dangling; // not in onode_map - bool deleted; // deleted - - list commit_waiters; - - public: - Onode(object_t oid) : ref(0), object_id(oid), version(0), - readonly(false), - object_size(0), object_blocks(0), oc(0), - dirty(false), dangling(false), deleted(false) { - onode_loc.length = 0; - } - ~Onode() { - if (oc) delete oc; - } - - block_t get_onode_id() { return onode_loc.start; } - int get_onode_len() { return onode_loc.length; } - - int get_ref_count() { return ref; } - void get() { - if (ref == 0) lru_pin(); - ref++; - //cout << "ebofs.onode.get " << hex << object_id << dec << " " << ref << endl; - } - void put() { - ref--; - if (ref == 0) lru_unpin(); - //cout << "ebofs.onode.put " << hex << object_id << dec << " " << ref << endl; - } - - void mark_dirty() { - if (!dirty) { - dirty = true; - get(); - } - } - void mark_clean() { - if (dirty) { - dirty = false; - put(); - } - } - bool is_dirty() { return dirty; } - bool is_deleted() { return deleted; } - bool is_dangling() { return dangling; } - - - bool have_oc() { - return oc != 0; - } - ObjectCache *get_oc(BufferCache *bc) { - if (!oc) { - oc = new ObjectCache(object_id, this, bc); - oc->get(); - get(); - } - return oc; - } - void close_oc() { - if (oc) { - //cout << "close_oc on " << object_id << endl; - assert(oc->is_empty()); - if (oc->put() == 0){ - //cout << "************************* hosing oc" << endl; - delete oc; - } - oc = 0; - put(); - } - } - - - // allocation - void verify_extents() { - if (0) { // do crazy stupid sanity checking - block_t count = 0; - interval_set is; - - set s; - cout << "verifying" << endl; - - for (map::iterator p = extent_map.begin(); - p != extent_map.end(); - p++) { - cout << " " << p->first << ": " << p->second << endl; - 
assert(count == p->first); - count += p->second.length; - for (unsigned j=0;jsecond.length;j++) { - assert(s.count(p->second.start+j) == 0); - s.insert(p->second.start+j); - } - } - - assert(s.size() == count); - assert(count == object_blocks); - } - } - void set_extent(block_t offset, Extent ex) { - //cout << "set_extent " << offset << " -> " << ex << " ... " << object_blocks << endl; - assert(offset <= object_blocks); - verify_extents(); - - // at the end? - if (offset == object_blocks) { - //cout << " appending " << ex << endl; - if (!extent_map.empty() && extent_map.rbegin()->second.end() == ex.start) { - //cout << "appending " << ex << " to " << extent_map.rbegin()->second << endl; - extent_map.rbegin()->second.length += ex.length; - } else - extent_map[object_blocks] = ex; - object_blocks += ex.length; - return; - } - - // removing any extent bits we overwrite - if (!extent_map.empty()) { - // preceeding extent? - map::iterator p = extent_map.lower_bound(offset); - if (p != extent_map.begin()) { - p--; - if (p->first + p->second.length > offset) { - //cout << " preceeding was " << p->second << endl; - if (p->first + p->second.length > offset+ex.length) { - // cutting chunk out of middle, add last bit - Extent &n = extent_map[offset+ex.length] = p->second; - n.start += offset+ex.length - p->first; - n.length -= offset+ex.length - p->first; - //cout << " tail frag is " << n << endl; - } - p->second.length = offset - p->first; // cut tail off preceeding extent - //cout << " preceeding now " << p->second << endl; - } - p++; - } - - // overlapping extents - while (p != extent_map.end() && - p->first < offset + ex.length) { - map::iterator next = p; - next++; - - // completely subsumed? 
- if (p->first + p->second.length <= offset+ex.length) { - //cout << " erasing " << p->second << endl; - extent_map.erase(p); - p = next; - continue; - } - - // spans new extent, cut off head - Extent &n = extent_map[ offset+ex.length ] = p->second; - //cout << " cut head off " << p->second; - n.start += offset+ex.length - p->first; - n.length -= offset+ex.length - p->first; - extent_map.erase(p); - //cout << ", now " << n << endl; - break; - } - } - - extent_map[ offset ] = ex; - - // extend object? - if (offset + ex.length > object_blocks) - object_blocks = offset+ex.length; - - verify_extents(); - } - - - /* map_extents(start, len, ls) - * map teh given page range into extents on disk. - */ - int map_extents(block_t start, block_t len, vector& ls) { - //cout << "map_extents " << start << " " << len << endl; - verify_extents(); - - //assert(start+len <= object_blocks); - - map::iterator p = extent_map.lower_bound(start); - if (p != extent_map.begin() && - (p == extent_map.end() || p->first > start && p->first)) { - p--; - if (p->second.length > start - p->first) { - Extent ex; - ex.start = p->second.start + (start - p->first); - ex.length = MIN(len, p->second.length - (start - p->first)); - ls.push_back(ex); - - //cout << " got (tail of?) " << p->second << " : " << ex << endl; - - start += ex.length; - len -= ex.length; - } - p++; - } - - while (len > 0 && - p != extent_map.end()) { - assert(p->first == start); - Extent ex = p->second; - ex.length = MIN(len, ex.length); - ls.push_back(ex); - //cout << " got (head of?) 
" << p->second << " : " << ex << endl; - start += ex.length; - len -= ex.length; - p++; - } - - return 0; - } - - int truncate_extents(block_t len, vector& extra) { - verify_extents(); - - map::iterator p = extent_map.lower_bound(len); - if (p != extent_map.begin() && - (p == extent_map.end() || p->first > len && p->first)) { - p--; - if (p->second.length > len - p->first) { - Extent ex; - ex.start = p->second.start + (len - p->first); - ex.length = p->second.length - (len - p->first); - extra.push_back(ex); - - p->second.length = len - p->first; - assert(p->second.length > 0); - - //cout << " got (tail of?) " << p->second << " : " << ex << endl; - } - p++; - } - - while (p != extent_map.end()) { - assert(p->first >= len); - extra.push_back(p->second); - map::iterator n = p; - n++; - extent_map.erase(p); - p = n; - } - - object_blocks = len; - verify_extents(); - return 0; - } - - int truncate_front_extents(block_t len, vector& extra) { - verify_extents(); - - while (len > 0) { - Extent& ex = extent_map.begin()->second; // look, this is a reference! - if (ex.length > len) { - // partial first extent - Extent frontbit( ex.start, len ); - extra.push_back(frontbit); - ex.length -= len; - ex.start += len; - break; - } - - // pull off entire first extent. 
- assert(ex.length <= len); - len -= ex.length; - extra.push_back(ex); - extent_map.erase(extent_map.begin()); - } - - object_blocks -= len; - verify_extents(); - return 0; - } - - - - /* map_alloc_regions(start, len, map) - * map range into regions that need to be (re)allocated on disk - * because they overlap "safe" (or unallocated) parts of the object - */ - /* - void map_alloc_regions(block_t start, block_t len, - interval_set& alloc) { - interval_set already_uncom; - - alloc.insert(start, len); // start with whole range - already_uncom.intersection_of(alloc, uncommitted); - alloc.subtract(already_uncom); // take out the bits that aren't yet committed - } - */ - - - - // pack/unpack - int get_collection_bytes() { - return sizeof(coll_t) * collections.size(); - } - int get_attr_bytes() { - int s = 0; - for (map::iterator i = attr.begin(); - i != attr.end(); - i++) { - s += i->first.length() + 1; - s += i->second.length() + sizeof(int); - } - return s; - } - int get_extent_bytes() { - return sizeof(Extent) * extent_map.size(); - } - -}; - - -inline ostream& operator<<(ostream& out, Onode& on) -{ - out << "onode(" << hex << on.object_id << dec << " len=" << on.object_size; - out << " ref=" << on.get_ref_count(); - if (on.is_dirty()) out << " dirty"; - if (on.is_dangling()) out << " dangling"; - if (on.is_deleted()) out << " deleted"; - out << " uncom=" << on.uncommitted; - // out << " " << &on; - out << ")"; - return out; -} - - - -#endif diff --git a/branches/marnberg/quota/ebofs/Table.h b/branches/marnberg/quota/ebofs/Table.h deleted file mode 100644 index f16e506a9dd63..0000000000000 --- a/branches/marnberg/quota/ebofs/Table.h +++ /dev/null @@ -1,898 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 
2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_TABLE_H -#define __EBOFS_TABLE_H - -#include "types.h" -#include "nodes.h" - -/** table **/ - -#define dbtout if (25 <= g_conf.debug_ebofs) cout << "ebofs.table(" << this << ")." - - -template -class Table { - private: - NodePool &pool; - - nodeid_t root; - int nkeys; - int depth; - - public: - Table(NodePool &p, - struct ebofs_table& bts) : - pool(p), - root(bts.root), nkeys(bts.num_keys), depth(bts.depth) { - dbtout << "cons" << endl; - } - - nodeid_t get_root() { return root; } - int get_num_keys() { return nkeys; } - int get_depth() { return depth; } - - - /* - */ - class _IndexItem { // i just need a struct size for below - K k; - nodeid_t n; - }; - class IndexItem { - public: - K key; - nodeid_t node; - static const int MAX = Node::ITEM_LEN / (sizeof(_IndexItem)); - static const int MIN = MAX/2; - }; - class _LeafItem { // i just need a struct size for below - K k; - V v; - }; - class LeafItem { - public: - K key; - V value; - static const int MAX = Node::ITEM_LEN / (sizeof(_LeafItem)); - static const int MIN = MAX/2; - }; - - class Nodeptr { - public: - Node *node; - - Nodeptr() : node(0) {} - Nodeptr(Node *n) : node(n) {} - Nodeptr& operator=(Node *n) { - node = n; - return *this; - } - - LeafItem& leaf_item(int i) { return (( LeafItem*)(node->item_ptr()))[i]; } - IndexItem& index_item(int i) { return ((IndexItem*)(node->item_ptr()))[i]; } - K key(int i) { - if (node->is_index()) - return index_item(i).key; - else - return leaf_item(i).key; - } - - bool is_leaf() { return node->is_leaf(); } - bool is_index() { return node->is_index(); } - void set_type(int t) { node->set_type(t); } - - int max_items() const { - if (node->is_leaf()) - return LeafItem::MAX; - else - return IndexItem::MAX; - } - int min_items() const { return max_items() / 2; } - - nodeid_t get_id() { return node->get_id(); } - - int size() { return node->size(); } - void set_size(int s) { 
node->set_size(s); } - - void remove_at_pos(int p) { - if (node->is_index()) { - for (int i=p; ip; i--) - leaf_item(i) = leaf_item(i-1); - leaf_item(p).key = key; - leaf_item(p).value = value; - set_size(size() + 1); - } - void insert_at_index_pos(int p, K key, nodeid_t node) { - assert(is_index()); - for (int i=size(); i>p; i--) - index_item(i) = index_item(i-1); - index_item(p).key = key; - index_item(p).node = node; - set_size(size() + 1); - } - - void append_item(LeafItem& i) { - leaf_item(size()) = i; - set_size(size() + 1); - } - void append_item(IndexItem& i) { - index_item(size()) = i; - set_size(size() + 1); - } - - void split(Nodeptr& right) { - if (node->is_index()) { - for (int i=min_items(); iis_index()) - for (int i=0; i open; // open nodes - vector pos; // position within the node - //Nodeptr open[20]; - //int pos[20]; - int level; - - Cursor(Table *t) : table(t), open(t->depth), pos(t->depth), level(0) {} - - public: - - const LeafItem& current() { - assert(open[level].is_leaf()); - return open[level].leaf_item(pos[level]); - } - V& dirty_current_value() { - assert(open[level].is_leaf()); - dirty(); - return open[level].leaf_item(pos[level]).value; - } - - // ** read-only bits ** - int move_left() { - if (table->depth == 0) return OOB; - - // work up around branch - int l; - for (l = level; l >= 0; l--) - if (pos[l] > 0) break; - if (l < 0) - return OOB; // we are the first item in the btree - - // move left one - pos[l]--; - - // work back down right side - for (; lpool.get_node( open[l].index_item(pos[l]).node ); - pos[l+1] = open[l+1].size() - 1; - } - return 1; - } - int move_right() { - if (table->depth == 0) return OOB; - - // work up branch - int l; - for (l=level; l>=0; l--) - if (pos[l] < open[l].size() - 1) break; - if (l < 0) { - /* we are at last item in btree. */ - if (pos[level] < open[level].size()) { - pos[level]++; /* move into add position! 
*/ - return 0; - } - return -1; - } - - /* move right one */ - assert( pos[l] < open[l].size() ); - pos[l]++; - - /* work back down */ - for (; lpool.get_node( open[l].index_item(pos[l]).node ); - pos[l+1] = 0; // furthest left - } - return 1; - } - - // ** modifications ** - void dirty() { - for (int l=level; l>=0; l--) { - if (open[l].node->is_dirty()) break; // already dirty! (and thus parents are too) - - table->pool.dirty_node(open[l].node); - if (l > 0) - open[l-1].index_item( pos[l-1] ).node = open[l].get_id(); - else - table->root = open[0].get_id(); - } - } - private: - void repair_parents() { - // did i make a change at the start of a node? - if (pos[level] == 0) { - K key = open[level].key(0); // new key parents should have - for (int j=level-1; j>=0; j--) { - if (open[j].index_item(pos[j]).key == key) - break; /* it's the same key, we can stop fixing */ - open[j].index_item(pos[j]).key = key; - if (pos[j] > 0) break; /* last in position 0.. */ - } - } - } - - public: - void remove() { - dirty(); - - // remove from node - open[level].remove_at_pos( pos[level] ); - repair_parents(); - - // was it a key? - if (level == table->depth-1) - table->nkeys--; - } - - void insert(K key, V value) { - dirty(); - - // insert - open[level].insert_at_leaf_pos(pos[level], key, value); - repair_parents(); - - // was it a key? 
- if (level == table->depth-1) - table->nkeys++; - } - - int rotate_left() { - if (level == 0) return -1; // i am root - if (pos[level-1] == 0) return -1; // nothing to left - - Nodeptr here = open[level]; - Nodeptr parent = open[level-1]; - Nodeptr left = table->pool.get_node( parent.index_item(pos[level-1] - 1).node ); - if (left.size() == left.max_items()) return -1; // it's full - - // make both dirty - dirty(); - if (!left.node->is_dirty()) { - table->pool.dirty_node(left.node); - parent.index_item(pos[level-1]-1).node = left.get_id(); - } - - dbtout << "rotating item " << here.key(0) << " left from " << here.get_id() << " to " << left.get_id() << endl; - - /* add */ - if (here.node->is_leaf()) - left.append_item(here.leaf_item(0)); - else - left.append_item(here.index_item(0)); - - /* remove */ - here.remove_at_pos(0); - - /* fix parent index for me */ - parent.index_item( pos[level-1] ).key = here.key(0); - // we never have to update past immediate parent, since we're not at pos 0 - - /* adjust cursor */ - if (pos[level] > 0) - pos[level]--; - //else - //assert(1); /* if we were positioned here, we're equal */ - /* if it was 0, then the shifted item == our key, and we can stay here safely. */ - return 0; - } - int rotate_right() { - if (level == 0) return -1; // i am root - if (pos[level-1] + 1 >= open[level-1].size()) return -1; // nothing to right - - Nodeptr here = open[level]; - Nodeptr parent = open[level-1]; - Nodeptr right = table->pool.get_node( parent.index_item( pos[level-1] + 1 ).node ); - if (right.size() == right.max_items()) return -1; // it's full - - // make both dirty - dirty(); - if (!right.node->is_dirty()) { - table->pool.dirty_node(right.node); - parent.index_item( pos[level-1]+1 ).node = right.get_id(); - } - - if (pos[level] == here.size()) { - /* let's just move the cursor over! 
*/ - //if (sizeof(K) == 8) - dbtout << "shifting cursor right from " << here.get_id() << " to less-full node " << right.get_id() << endl; - open[level] = right; - pos[level] = 0; - pos[level-1]++; - return 0; - } - - //if (sizeof(K) == 8) - dbtout << "rotating item " << hex << here.key(here.size()-1) << dec << " right from " - << here.get_id() << " to " << right.get_id() << endl; - - /* add */ - if (here.is_index()) - right.insert_at_index_pos(0, - here.index_item( here.size()-1 ).key, - here.index_item( here.size()-1 ).node); - else - right.insert_at_leaf_pos(0, - here.leaf_item( here.size()-1 ).key, - here.leaf_item( here.size()-1 ).value); - - /* remove */ - here.set_size(here.size() - 1); - - /* fix parent index for right */ - parent.index_item( pos[level-1] + 1 ).key = right.key(0); - - return 0; - } - }; - - - public: - bool almost_full() { - if (2*(depth+1) > pool.num_free()) // worst case, plus some. - return true; - return false; - } - - int find(K key, Cursor& cursor) { - dbtout << "find " << key << endl; - - if (depth == 0) - return Cursor::OOB; - - // init - cursor.level = 0; - - // start at root - Nodeptr curnode( pool.get_node(root) ); - cursor.open[0] = curnode; - - if (curnode.size() == 0) return -1; // empty! 
- - // find leaf - for (cursor.level = 0; cursor.level < depth-1; cursor.level++) { - /* if key=5, we want 2 3 [4] 6 7, or 3 4 [5] 5 6 (err to the left) */ - int left = 0; /* i >= left */ - int right = curnode.size()-1; /* i < right */ - while (left < right) { - int i = left + (right - left) / 2; - if (curnode.index_item(i).key < key) { - left = i + 1; - } else if (i && curnode.index_item(i-1).key >= key) { - right = i; - } else { - left = right = i; - break; - } - } - int i = left; - if (i && curnode.index_item(i).key > key) i--; - -#ifdef EBOFS_DEBUG_BTREE - int j; - for (j=0; j key) break; - } - if (i != j) { - dbtout << "btree binary search failed" << endl; - i = j; - } -#endif - - cursor.pos[cursor.level] = i; - - /* get child node */ - curnode = pool.get_node( cursor.open[cursor.level].index_item(i).node ); - cursor.open[cursor.level+1] = curnode; - } - - /* search leaf */ - /* if key=5, we want 2 3 4 [6] 7, or 3 4 [5] 5 6 (err to the right) */ - int left = 0; /* i >= left */ - int right = curnode.size(); /* i < right */ - while (left < right) { - int i = left + (right - left) / 2; - if (curnode.leaf_item(i).key < key) { - left = i + 1; - } else if (i && curnode.leaf_item(i-1).key >= key) { - right = i; - } else { - left = right = i; - break; - } - } - int i = left; - -#ifdef EBOFS_DEBUG_BTREE - int j; - for (j=0; j= key) break; - } - if (i != j) { - dbtout << "btree binary search failed" << endl; - i = j; - } -#endif - - cursor.pos[cursor.level] = i; /* first key in this node, or key insertion point */ - - if (curnode.size() >= i+1) { - if (curnode.leaf_item(i).key == key) { - return Cursor::MATCH; /* it's the actual key */ - } else { - return Cursor::INSERT; /* it's an insertion point */ - } - } - return Cursor::OOB; /* it's the end of the btree (also a valid insertion point) */ - } - - int lookup(K key) { - dbtout << "lookup" << endl; - Cursor cursor(this); - if (find(key, cursor) == Cursor::MATCH) - return 0; - return -1; - } - - int lookup(K key, V& 
value) { - dbtout << "lookup" << endl; - Cursor cursor(this); - if (find(key, cursor) == Cursor::MATCH) { - value = cursor.current().value; - return 0; - } - return -1; - } - - int insert(K key, V value) { - dbtout << "insert " << key << " -> " << value << endl; - if (almost_full()) return -1; - - // empty? - if (nkeys == 0) { - if (root == -1) { - // create a root node (leaf!) - assert(depth == 0); - Nodeptr newroot( pool.new_node(Node::TYPE_LEAF) ); - root = newroot.get_id(); - depth++; - } - assert(depth == 1); - assert(root >= 0); - } - - // start at/near key - Cursor cursor(this); - find(key, cursor); - - // insert loop - nodeid_t nodevalue = 0; - while (1) { - - /* room in this node? */ - if (cursor.open[cursor.level].size() < cursor.open[cursor.level].max_items()) { - if (cursor.open[cursor.level].is_leaf()) - cursor.insert( key, value ); // will dirty, etc. - else { - // indices are already dirty - cursor.open[cursor.level].insert_at_index_pos(cursor.pos[cursor.level], key, nodevalue); - } - verify("insert 1"); - return 0; - } - - /* this node is full. */ - assert( cursor.open[cursor.level].size() == cursor.open[cursor.level].max_items() ); - - /* can we rotate? */ - if (false) // NO! there's a bug in here somewhere, don't to it. - if (cursor.level > 0) { - if ((cursor.pos[cursor.level-1] > 0 - && cursor.rotate_left() >= 0) || - (cursor.pos[cursor.level-1] + 1 < cursor.open[cursor.level-1].size() - && cursor.rotate_right() >= 0)) { - - if (cursor.open[cursor.level].is_leaf()) - cursor.insert( key, value ); // will dirty, etc. 
- else { - // indices are already dirty - cursor.open[cursor.level].insert_at_index_pos(cursor.pos[cursor.level], key, nodevalue); - } - verify("insert 2"); - return 0; - } - } - - /** split node **/ - - if (cursor.level == depth-1) { - dbtout << "splitting leaf " << cursor.open[cursor.level].get_id() << endl; - } else { - dbtout << "splitting index " << cursor.open[cursor.level].get_id() << endl; - } - - cursor.dirty(); - - // split - Nodeptr leftnode = cursor.open[cursor.level]; - Nodeptr newnode( pool.new_node(leftnode.node->get_type()) ); - leftnode.split( newnode ); - - /* insert our item */ - if (cursor.pos[cursor.level] > leftnode.size()) { - // not with cursor, since this node isn't added yet! - if (newnode.is_leaf()) { - newnode.insert_at_leaf_pos( cursor.pos[cursor.level] - leftnode.size(), - key, value ); - nkeys++; - } else { - newnode.insert_at_index_pos( cursor.pos[cursor.level] - leftnode.size(), - key, nodevalue ); - } - } else { - // with cursor (if leaf) - if (leftnode.is_leaf()) - cursor.insert( key, value ); - else - leftnode.insert_at_index_pos( cursor.pos[cursor.level], - key, nodevalue ); - } - - /* are we at the root? */ - if (cursor.level == 0) { - /* split root. */ - dbtout << "that split was the root " << root << endl; - Nodeptr newroot( pool.new_node(Node::TYPE_INDEX) ); - - /* new root node */ - newroot.set_size(2); - newroot.index_item(0).key = leftnode.key(0); - newroot.index_item(0).node = root; - newroot.index_item(1).key = newnode.key(0); - newroot.index_item(1).node = newnode.get_id(); - - /* heighten tree */ - depth++; - root = newroot.get_id(); - verify("insert 3"); - return 0; - } - - /* now insert newindex in level-1 */ - nodevalue = newnode.get_id(); - key = newnode.key(0); - cursor.level--; - cursor.pos[cursor.level]++; // ...to the right of leftnode! 
- } - } - - - int remove(K key) { - dbtout << "remove " << key << endl; - - if (almost_full()) { - cout << "table almost full, failing" << endl; - assert(0); - return -1; - } - - Cursor cursor(this); - if (find(key, cursor) <= 0) { - cerr << "remove " << key << " 0x" << hex << key << dec << " .. dne" << endl; - g_conf.debug_ebofs = 33; - g_conf.ebofs_verify = true; - verify("remove dne"); - assert(0); - return -1; // key dne - } - - - while (1) { - cursor.remove(); - - // balance + adjust - - if (cursor.level == 0) { - // useless root index? - if (cursor.open[0].size() == 1 && - depth > 1) { - depth--; - root = cursor.open[0].index_item(0).node; - pool.release( cursor.open[0].node ); - } - - // note: root can be small, but not empty - else if (nkeys == 0) { - assert(cursor.open[cursor.level].size() == 0); - assert(depth == 1); - root = -1; - depth = 0; - if (cursor.open[0].node) - pool.release(cursor.open[0].node); - } - verify("remove 1"); - return 0; - } - - if (cursor.open[cursor.level].size() > cursor.open[cursor.level].min_items()) { - verify("remove 2"); - return 0; - } - - // borrow from siblings? - Nodeptr left; - Nodeptr right; - - // left? 
- if (cursor.pos[cursor.level-1] > 0) { - int left_loc = cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1] - 1).node; - left = pool.get_node( left_loc ); - - if (left.size() > left.min_items()) { - /* move cursor left, shift right */ - cursor.pos[cursor.level] = 0; - cursor.open[cursor.level] = left; - cursor.pos[cursor.level-1]--; - cursor.rotate_right(); - verify("remove 3"); - return 0; - } - - /* combine to left */ - right = cursor.open[cursor.level]; - } - else { - assert(cursor.pos[cursor.level-1] < cursor.open[cursor.level-1].size() - 1); - int right_loc = cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1] + 1 ).node; - right = pool.get_node( right_loc ); - - if (right.size() > right.min_items()) { - /* move cursor right, shift an item left */ - cursor.pos[cursor.level] = 1; - cursor.open[cursor.level] = right; - cursor.pos[cursor.level-1]++; - cursor.rotate_left(); - verify("remove 4"); - return 0; - } - - /* combine to left */ - left = cursor.open[cursor.level]; - cursor.pos[cursor.level-1]++; /* move cursor to (soon-to-be-empty) right side item */ - } - - // note: cursor now points to _right_ node. - - /* combine (towards left) - * (this makes it so our next delete will be in the index - * interior, which is less scary.) - */ - dbtout << "combining nodes " << left.get_id() << " and " << right.get_id() << endl; - - left.merge(right); - - // dirty left + right - cursor.dirty(); // right - if (!left.node->is_dirty()) { - pool.dirty_node(left.node); - cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1]-1 ).node = left.get_id(); - } - - pool.release(right.node); - - cursor.level--; // now point to the link to the obsolete (right-side) sib */ - } - - } - - void clear(Cursor& cursor, int node_loc, int level) { - dbtout << "clear" << endl; - - Nodeptr node = pool.get_node( node_loc ); - cursor.open[level] = node; - - // hose children? 
- if (level < depth-1) { - for (int i=0; i max) - max = node.key(i); - - if (level < depth-1) { - // index - cursor.pos[level] = i; - err += verify_sub( cursor, cursor.open[level].index_item(i).node, level+1, count, last, on ); - } else { - // leaf - count++; - last = node.key(i); - } - } - - if (level) { - // verify that parent's keys are appropriate - if (min != cursor.open[level-1].index_item(cursor.pos[level-1]).key) { - dbtout << ":: key in index node " << cursor.open[level-1].get_id() - << " != min in child " << node_loc - << "(key is " << hex << cursor.open[level-1].index_item(cursor.pos[level-1]).key - << ", min is " << min << ")" << dec << endl; - err++; - } - if (cursor.pos[level-1] < cursor.open[level-1].size()-1) { - if (max > cursor.open[level-1].index_item(1+cursor.pos[level-1]).key) { - dbtout << ":: next key in index node " << cursor.open[level-1].get_id() - << " < max in child " << node_loc - << "(key is " << hex << cursor.open[level-1].index_item(1+cursor.pos[level-1]).key - << ", max is " << max << ")" << dec << endl; - err++; - } - } - } - - //return err; - - // print it - char s[1000]; - strcpy(s," "); - s[level+1] = 0; - if (1) { - if (root == node_loc) { - dbtout << s << "root " << node_loc << ": " - << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << endl; - } else if (level == depth-1) { - dbtout << s << "leaf " << node_loc << ": " - << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << endl; - } else { - dbtout << s << "indx " << node_loc << ": " - << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << endl; - } - - if (0) { - for (int i=0; i " << node.leaf_item(i).value << dec << endl; - } - } - } - } - - return err; - } - - void verify(const char *on) { - if (!g_conf.ebofs_verify) - return; - - if (root == -1 && depth == 0) { - return; // empty! 
- } - - int count = 0; - Cursor cursor(this); - K last; - - int before = g_conf.debug_ebofs; - g_conf.debug_ebofs = 0; - - int err = verify_sub(cursor, root, 0, count, last, on); - if (count != nkeys) { - cerr << "** count " << count << " != nkeys " << nkeys << endl; - err++; - } - - g_conf.debug_ebofs = before; - - // ok? - if (err) { - cerr << "verify failure, called by '" << on << "'" << endl; - g_conf.debug_ebofs = 30; - // do it again, so we definitely get the dump. - int count = 0; - Cursor cursor(this); - K last; - verify_sub(cursor, root, 0, count, last, on); - assert(err == 0); - } - } - -}; - - -#endif diff --git a/branches/marnberg/quota/ebofs/mkfs.ebofs.cc b/branches/marnberg/quota/ebofs/mkfs.ebofs.cc deleted file mode 100644 index af5f57842068a..0000000000000 --- a/branches/marnberg/quota/ebofs/mkfs.ebofs.cc +++ /dev/null @@ -1,299 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include -#include "ebofs/Ebofs.h" - - -int main(int argc, char **argv) -{ - // args - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - if (args.size() < 1) { - cerr << "usage: mkfs.ebofs [options] " << endl; - return -1; - } - char *filename = args[0]; - - // mkfs - Ebofs mfs(filename); - int r = mfs.mkfs(); - if (r < 0) exit(r); - - if (args.size() > 1) { // pass an extra arg of some sort to trigger the test crapola - // test-o-rama! 
- Ebofs fs(filename); - fs.mount(); - - /* - if (1) { - // partial write tests - char crap[1024*1024]; - memset(crap, 0, 1024*1024); - - bufferlist small; - small.append(crap, 10); - bufferlist med; - med.append(crap, 1000); - bufferlist big; - big.append(crap, 1024*1024); - - cout << "0" << endl; - fs.write(10, 0, 1024*1024, big, (Context*)0); - fs.sync(); - fs.trim_buffer_cache(); - - cout << "1" << endl; - fs.write(10, 10, 10, small, 0); - fs.write(10, 1, 1000, med, 0); - fs.sync(); - fs.trim_buffer_cache(); - - cout << "2" << endl; - fs.write(10, 10, 10, small, 0); - //fs.sync(); - fs.write(10, 1, 1000, med, 0); - fs.sync(); - fs.trim_buffer_cache(); - - cout << "3" << endl; - fs.write(10, 1, 1000, med, 0); - fs.write(10, 10000, 10, small, 0); - fs.truncate(10, 100, 0); - fs.sync(); - fs.trim_buffer_cache(); - - cout << "4" << endl; - fs.remove(10); - fs.sync(); - fs.write(10, 10, 10, small, 0); - fs.sync(); - fs.write(10, 1, 1000, med, 0); - fs.sync(); - fs.truncate(10, 100, 0); - fs.write(10, 10, 10, small, 0); - fs.trim_buffer_cache(); - - - - } - - if (0) { // onode write+read test - bufferlist bl; - char crap[1024*1024]; - memset(crap, 0, 1024*1024); - bl.append(crap, 10); - - fs.write(10, 10, 0, bl, (Context*)0); - fs.umount(); - - Ebofs fs2(filename); - fs2.mount(); - fs2.read(10, 10, 0, bl); - fs2.umount(); - - return 0; - } - - - if (0) { // small write + read test - bufferlist bl; - char crap[1024*1024]; - memset(crap, 0, 1024*1024); - - object_t oid = 10; - int n = 10000; - int l = 128; - bl.append(crap, l); - - - char *p = bl.c_str(); - off_t o = 0; - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __EBOFS_NODES_H -#define __EBOFS_NODES_H - -/** nodes, node regions **/ - -#include "types.h" -#include "BlockDevice.h" - - -/* - - disk wire memory - - free free -> free can alloc - free used -> dirty can modify - - free used used -> tx - free used free -> limbo - - used used -> clean - used free -> limbo - - - // meaningless - used free free -> free can alloc - used free used __DNE__ - - -*/ - -#undef debofs -#define debofs(x) if (x < g_conf.debug_ebofs) cout << "ebofs.nodepool." - - -class Node { - public: - // bit fields - static const int STATE_CLEAN = 1; - static const int STATE_DIRTY = 2; - static const int STATE_TX = 3; - - static const int ITEM_LEN = EBOFS_NODE_BYTES - sizeof(int) - sizeof(int) - sizeof(int); - - static const int TYPE_INDEX = 1; - static const int TYPE_LEAF = 2; - - protected: - nodeid_t id; - int state; // use bit fields above! - - bufferptr bptr; - bufferptr shadow_bptr; - - // in disk buffer - int *type; - int *nrecs; - - public: - Node(nodeid_t i, bufferptr& b, int s) : id(i), state(s), bptr(b) { - nrecs = (int*)(bptr.c_str()); - type = (int*)(bptr.c_str() + sizeof(*nrecs)); - } - - - // id - nodeid_t get_id() const { return id; } - void set_id(nodeid_t n) { id = n; } - - // buffer - bufferptr& get_buffer() { return bptr; } - - char *item_ptr() { return bptr.c_str() + sizeof(*nrecs) + sizeof(*type); } - - // size - int size() { return *nrecs; } - void set_size(int s) { *nrecs = s; } - - // type - int& get_type() { return *type; } - void set_type(int t) { *type = t; } - bool is_index() { return *type == TYPE_INDEX; } - bool is_leaf() { return *type == TYPE_LEAF; } - - - // state - bool is_dirty() { return state == STATE_DIRTY; } - bool is_tx() { return state == STATE_TX; } - bool is_clean() { return state == STATE_CLEAN; } - - void set_state(int s) { state = s; } - - void make_shadow() { - assert(is_tx()); - - shadow_bptr = bptr; - - // new buffer - bptr = buffer::create_page_aligned(EBOFS_NODE_BYTES); - nrecs = 
(int*)(bptr.c_str()); - type = (int*)(bptr.c_str() + sizeof(*nrecs)); - - // copy contents! - memcpy(bptr.c_str(), shadow_bptr.c_str(), EBOFS_NODE_BYTES); - } - -}; - - - - - -class NodePool { - protected: - map node_map; // open node map - - public: - vector region_loc; // region locations - Extent usemap_even; - Extent usemap_odd; - - protected: - // on-disk block states - int num_nodes; - set free; - set dirty; - set tx; - set clean; // aka used - set limbo; - - Mutex &ebofs_lock; - Cond commit_cond; - int flushing; - - static int make_nodeid(int region, int offset) { - return (region << 24) | offset; - } - static int nodeid_region(nodeid_t nid) { - return nid >> 24; - } - static int nodeid_offset(nodeid_t nid) { - return nid & ((1 << 24) - 1); - } - - - public: - NodePool(Mutex &el) : - num_nodes(0), - ebofs_lock(el), - flushing(0) {} - ~NodePool() { - // nodes - release_all(); - } - - int num_free() { return free.size(); } - int num_dirty() { return dirty.size(); } - int num_limbo() { return limbo.size(); } - int num_tx() { return tx.size(); } - int num_clean() { return clean.size(); } - int num_total() { return num_nodes; } - int num_used() { return num_clean() + num_dirty() + num_tx(); } - - int get_usemap_len(int n=0) { - if (n == 0) n = num_nodes; - return ((n-1) / 8 / EBOFS_BLOCK_SIZE) + 1; - } - - int num_regions() { return region_loc.size(); } - - // the caller had better adjust usemap locations... 
- void add_region(Extent ex) { - int region = region_loc.size(); - assert(ex.length <= (1 << 24)); - region_loc.push_back(ex); - for (unsigned o = 0; o < ex.length; o++) { - free.insert( make_nodeid(region, o) ); - } - num_nodes += ex.length; - } - - int init(struct ebofs_nodepool *np) { - // regions - assert(region_loc.empty()); - num_nodes = 0; - for (int i=0; inum_regions; i++) { - debofs(3) << "init region " << i << " at " << np->region_loc[i] << endl; - region_loc.push_back( np->region_loc[i] ); - num_nodes += np->region_loc[i].length; - } - - // usemap - usemap_even = np->node_usemap_even; - usemap_odd = np->node_usemap_odd; - debofs(3) << "init even map at " << usemap_even << endl; - debofs(3) << "init odd map at " << usemap_odd << endl; - - return 0; - } - - void close() { - release_all(); - - region_loc.clear(); - free.clear(); - dirty.clear(); - tx.clear(); - clean.clear(); - limbo.clear(); - flushing = 0; - node_map.clear(); - } - - - // *** blocking i/o routines *** - - int read_usemap(BlockDevice& dev, version_t epoch) { - // read map - Extent loc; - if (epoch & 1) - loc = usemap_odd; - else - loc = usemap_even; - - bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE*loc.length); - dev.read(loc.start, loc.length, bp); - - // parse - unsigned region = 0; // current region - unsigned roff = 0; // offset in region - for (unsigned byte = 0; byte> 1; // move one bit right. - roff++; - if (roff == region_loc[region].length) { - // next region! - roff = 0; - region++; - break; - } - } - if (region == region_loc.size()) break; - } - return 0; - } - - int read_clean_nodes(BlockDevice& dev) { - /* - this relies on the clean set begin defined so that we know which nodes - to read. so it only really works when called from mount()! 
- */ - for (unsigned r=0; rflushed_usemap(); - } - }; - - void flushed_usemap() { - ebofs_lock.Lock(); - flushing--; - if (flushing == 0) - commit_cond.Signal(); - ebofs_lock.Unlock(); - } - - public: - int write_usemap(BlockDevice& dev, version_t version) { - // alloc - Extent loc; - if (version & 1) - loc = usemap_odd; - else - loc = usemap_even; - - bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE*loc.length); - - // fill in - unsigned region = 0; // current region - unsigned roff = 0; // offset in region - for (unsigned byte = 0; byte> 1; - if (roff == region_loc[region].length) { - // next region! - roff = 0; - region++; - break; - } - } - - *(unsigned char*)(bp.c_str() + byte) = x; - if (region == region_loc.size()) break; - } - - - // write - bufferlist bl; - bl.append(bp); - dev.write(loc.start, loc.length, bl, - new C_NP_FlushUsemap(this), "usemap"); - return 0; - } - - - - // *** node commit *** - private: - - class C_NP_FlushNode : public BlockDevice::callback { - NodePool *pool; - nodeid_t nid; - public: - C_NP_FlushNode(NodePool *p, nodeid_t n) : - pool(p), nid(n) {} - void finish(ioh_t ioh, int r) { - pool->flushed_node(nid); - } - }; - - void flushed_node(nodeid_t nid) { - ebofs_lock.Lock(); - - // mark nid clean|limbo - if (tx.count(nid)) { // tx -> clean - tx.erase(nid); - clean.insert(nid); - - // make node itself clean - node_map[nid]->set_state(Node::STATE_CLEAN); - } - else { // already limbo (was dirtied, or released) - assert(limbo.count(nid)); - } - - flushing--; - if (flushing == 0) - commit_cond.Signal(); - ebofs_lock.Unlock(); - } - - public: - void commit_start(BlockDevice& dev, version_t version) { - dout(20) << "ebofs.nodepool.commit_start start" << endl; - - assert(flushing == 0); - /*if (0) - for (unsigned i=0; i tx (write to disk) - assert(tx.empty()); - set didb; - for (set::iterator i = dirty.begin(); - i != dirty.end(); - i++) { - Node *n = get_node(*i); - assert(n); - assert(n->is_dirty()); - 
n->set_state(Node::STATE_TX); - - unsigned region = nodeid_region(*i); - block_t off = nodeid_offset(*i); - block_t b = region_loc[region].start + off; - - if (1) { // sanity check debug FIXME - assert(didb.count(b) == 0); - didb.insert(b); - } - - bufferlist bl; - bl.append(n->get_buffer()); - dev.write(b, EBOFS_NODE_BLOCKS, - bl, - new C_NP_FlushNode(this, *i), "node"); - flushing++; - - tx.insert(*i); - } - dirty.clear(); - - // limbo -> free - for (set::iterator i = limbo.begin(); - i != limbo.end(); - i++) { - free.insert(*i); - } - limbo.clear(); - - dout(20) << "ebofs.nodepool.commit_start finish" << endl; - } - - void commit_wait() { - while (flushing > 0) - commit_cond.Wait(ebofs_lock); - dout(20) << "ebofs.nodepool.commit_wait finish" << endl; - } - - - - - - - - - - // *** nodes *** - // opened node - Node* get_node(nodeid_t nid) { - //dbtout << "pool.get " << nid << endl; - assert(node_map.count(nid)); - return node_map[nid]; - } - - // unopened node - /* not implemented yet!! - Node* open_node(nodeid_t nid) { - Node *n = node_regions[ NodeRegion::nodeid_region(nid) ]->open_node(nid); - dbtout << "pool.open_node " << n->get_id() << endl; - node_map[n->get_id()] = n; - return n; - } - */ - - // allocate id/block on disk. always free -> dirty. 
- nodeid_t alloc_id() { - // pick node id - assert(!free.empty()); - nodeid_t nid = *(free.begin()); - free.erase(nid); - dirty.insert(nid); - return nid; - } - - // new node - Node* new_node(int type) { - nodeid_t nid = alloc_id(); - debofs(15) << "ebofs.nodepool.new_node " << nid << endl; - - // alloc node - bufferptr bp = buffer::create_page_aligned(EBOFS_NODE_BYTES); - Node *n = new Node(nid, bp, Node::STATE_DIRTY); - n->set_type(type); - n->set_size(0); - - assert(node_map.count(nid) == 0); - node_map[nid] = n; - return n; - } - - void release(Node *n) { - const nodeid_t nid = n->get_id(); - debofs(15) << "ebofs.nodepool.release on " << nid << endl; - node_map.erase(nid); - - if (n->is_dirty()) { - assert(dirty.count(nid)); - dirty.erase(nid); - free.insert(nid); - } else if (n->is_clean()) { - assert(clean.count(nid)); - clean.erase(nid); - limbo.insert(nid); - } else if (n->is_tx()) { - assert(tx.count(nid)); // i guess htis happens? -sage - tx.erase(nid); - limbo.insert(nid); - } - - delete n; - } - - void release_all() { - while (!node_map.empty()) { - map::iterator i = node_map.begin(); - debofs(2) << "ebofs.nodepool.release_all leftover " << i->first << " " << i->second << endl; - release( i->second ); - } - assert(node_map.empty()); - } - - void dirty_node(Node *n) { - // get new node id? - nodeid_t oldid = n->get_id(); - nodeid_t newid = alloc_id(); - debofs(15) << "ebofs.nodepool.dirty_node on " << oldid << " now " << newid << endl; - - // release old block - if (n->is_clean()) { - assert(clean.count(oldid)); - clean.erase(oldid); - } else { - assert(n->is_tx()); - assert(tx.count(oldid)); - tx.erase(oldid); - - // move/copy current -> shadow buffer as necessary - n->make_shadow(); - } - limbo.insert(oldid); - node_map.erase(oldid); - - n->set_state(Node::STATE_DIRTY); - - // move to new one! 
- n->set_id(newid); - node_map[newid] = n; - } - - - -}; - -#endif diff --git a/branches/marnberg/quota/ebofs/test.ebofs.cc b/branches/marnberg/quota/ebofs/test.ebofs.cc deleted file mode 100644 index 0e6a7625c502a..0000000000000 --- a/branches/marnberg/quota/ebofs/test.ebofs.cc +++ /dev/null @@ -1,224 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include -#include "ebofs/Ebofs.h" - -bool stop = false; - - -int nt = 0; -class Tester : public Thread { - Ebofs &fs; - int t; - - char b[1024*1024]; - -public: - Tester(Ebofs &e) : fs(e), t(nt) { nt++; } - void *entry() { - - while (!stop) { - object_t oid; - oid.ino = (rand() % 10) + 0x10000000; - coll_t cid = rand() % 50; - off_t off = rand() % 10000;//0;//rand() % 1000000; - off_t len = 1+rand() % 100000; - char *a = "one"; - if (rand() % 2) a = "two"; - int l = 3;//rand() % 10; - - switch (rand() % 10) { - case 0: - { - oid.rev = rand() % 10; - cout << t << " read " << hex << oid << dec << " at " << off << " len " << len << endl; - bufferlist bl; - fs.read(oid, off, len, bl); - int l = MIN(len,bl.length()); - if (l) { - cout << t << " got " << l << endl; - bl.copy(0, l, b); - char *p = b; - while (l--) { - assert(*p == 0 || - *p == (char)(off ^ oid.ino)); - off++; - p++; - } - } - } - break; - - case 1: - { - cout << t << " write " << hex << oid << dec << " at " << off << " len " << len << endl; - for (int j=0;j args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - // args - if (args.size() != 3) return -1; - char *filename = args[0]; - int seconds = atoi(args[1]); - int threads = atoi(args[2]); - - cout << "dev " << filename << " .. 
" << threads << " threads .. " << seconds << " seconds" << endl; - - Ebofs fs(filename); - if (fs.mount() < 0) return -1; - - - // explicit tests - if (1) { - // verify that clone() plays nice with partial writes - object_t oid(1,1); - bufferptr bp(10000); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - fs.write(oid, 0, 10000, bl, 0); - - fs.sync(); - fs.trim_buffer_cache(); - - // induce a partial write - bufferlist bl2; - bl2.substr_of(bl, 0, 100); - fs.write(oid, 100, 100, bl2, 0); - - // clone it - object_t oid2; - oid2 = oid; - oid2.rev = 1; - fs.clone(oid, oid2, 0); - - // ... - if (0) { - // make sure partial still behaves after orig is removed... - fs.remove(oid, 0); - - // or i read for oid2... - bufferlist rbl; - fs.read(oid2, 0, 200, rbl); - } - if (1) { - // make sure things behave if we remove the clone - fs.remove(oid2,0); - } - } - // /explicit tests - - list ls; - for (int i=0; icreate(); - ls.push_back(t); - } - - utime_t now = g_clock.now(); - utime_t dur(seconds,0); - utime_t end = now + dur; - cout << "stop at " << end << endl; - while (now < end) { - sleep(1); - now = g_clock.now(); - cout << now << endl; - } - - cout << "stopping" << endl; - stop = true; - - while (!ls.empty()) { - Tester *t = ls.front(); - ls.pop_front(); - t->join(); - delete t; - } - - fs.umount(); - return 0; -} - diff --git a/branches/marnberg/quota/ebofs/types.h b/branches/marnberg/quota/ebofs/types.h deleted file mode 100644 index 1b85d138ec342..0000000000000 --- a/branches/marnberg/quota/ebofs/types.h +++ /dev/null @@ -1,168 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __EBOFS_TYPES_H -#define __EBOFS_TYPES_H - -#include -#include "include/buffer.h" -#include "include/Context.h" -#include "common/Cond.h" - -#include -#include -#include -#include -using namespace std; -using namespace __gnu_cxx; - - -#include "include/object.h" - - -#ifndef MIN -# define MIN(a,b) ((a)<=(b) ? (a):(b)) -#endif -#ifndef MAX -# define MAX(a,b) ((a)>=(b) ? (a):(b)) -#endif - - -/* -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(unsigned long long __x) const { - static hash H; - return H((__x >> 32) ^ (__x & 0xffffffff)); - } - }; - - template<> struct hash< std::string > - { - size_t operator()( const std::string& x ) const - { - static hash H; - return H(x.c_str()); - } - }; -} -*/ - - -// disk -typedef __uint64_t block_t; // disk location/sector/block - -static const int EBOFS_BLOCK_SIZE = 4096; -static const int EBOFS_BLOCK_BITS = 12; // 1<<12 == 4096 - -class Extent { - public: - block_t start, length; - - Extent() : start(0), length(0) {} - Extent(block_t s, block_t l) : start(s), length(l) {} - - block_t last() const { return start + length - 1; } - block_t end() const { return start + length; } -}; - -inline ostream& operator<<(ostream& out, Extent& ex) -{ - return out << ex.start << "~" << ex.length; -} - - -// tree/set nodes -typedef int nodeid_t; - -static const int EBOFS_NODE_BLOCKS = 1; -static const int EBOFS_NODE_BYTES = EBOFS_NODE_BLOCKS * EBOFS_BLOCK_SIZE; -static const int EBOFS_MAX_NODE_REGIONS = 10; // pick a better value! - -struct ebofs_nodepool { - Extent node_usemap_even; // for even sb versions - Extent node_usemap_odd; // for odd sb versions - - int num_regions; - Extent region_loc[EBOFS_MAX_NODE_REGIONS]; -}; - - -// objects - -typedef __uint64_t coll_t; - -struct ebofs_onode { - Extent onode_loc; /* this is actually the block we live in */ - - object_t object_id; /* for kicks */ - off_t object_size; /* file size in bytes. should this be 64-bit? 
*/ - unsigned object_blocks; - bool readonly; - - int num_collections; - int num_attr; // num attr in onode - int num_extents; /* number of extents used. if 0, data is in the onode */ -}; - -struct ebofs_cnode { - Extent cnode_loc; /* this is actually the block we live in */ - coll_t coll_id; - int num_attr; // num attr in cnode -}; - - -// table -struct ebofs_table { - nodeid_t root; /* root node of btree */ - int num_keys; - int depth; -}; - - -// super -typedef __uint64_t version_t; - -static const unsigned EBOFS_MAGIC = 0x000EB0F5; - -static const int EBOFS_NUM_FREE_BUCKETS = 5; /* see alloc.h for bucket constraints */ -static const int EBOFS_FREE_BUCKET_BITS = 2; - - -struct ebofs_super { - unsigned s_magic; - - unsigned epoch; // version of this superblock. - - unsigned num_blocks; /* # blocks in filesystem */ - - // some basic stats, for kicks - unsigned free_blocks; /* unused blocks */ - unsigned limbo_blocks; /* limbo blocks */ - //unsigned num_objects; - //unsigned num_fragmented; - - struct ebofs_nodepool nodepool; - - // tables - struct ebofs_table free_tab[EBOFS_NUM_FREE_BUCKETS]; - struct ebofs_table limbo_tab; - struct ebofs_table alloc_tab; - struct ebofs_table object_tab; // object directory - struct ebofs_table collection_tab; // collection directory - struct ebofs_table co_tab; -}; - - -#endif diff --git a/branches/marnberg/quota/fakefuse.cc b/branches/marnberg/quota/fakefuse.cc deleted file mode 100644 index 2edf3c7930e7a..0000000000000 --- a/branches/marnberg/quota/fakefuse.cc +++ /dev/null @@ -1,150 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/Monitor.h" - -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "client/Client.h" -#include "client/fuse.h" - -#include "common/Timer.h" - -#include "msg/FakeMessenger.h" - - - - -#define NUMMDS g_conf.num_mds -#define NUMOSD g_conf.num_osd -#define NUMCLIENT g_conf.num_client - - -class C_Test : public Context { -public: - void finish(int r) { - cout << "C_Test->finish(" << r << ")" << endl; - } -}; -class C_Test2 : public Context { -public: - void finish(int r) { - cout << "C_Test2->finish(" << r << ")" << endl; - g_timer.add_event_after(2, new C_Test); - } -}; - - - -int main(int argc, char **argv) { - cerr << "fakefuse starting" << endl; - - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - // start messenger thread - fakemessenger_startthread(); - - //g_timer.add_event_after(5.0, new C_Test2); - //g_timer.add_event_after(10.0, new C_Test); - - vector nargs; - for (unsigned i=0; iinit(); - } - for (int i=0; iinit(); - } - - for (int i=0; iinit(); - } - - - // create client - Client *client[NUMCLIENT]; - for (int i=0; iinit(); - - - // start up fuse - // use my argc, argv (make sure you pass a mount point!) - cout << "starting fuse on pid " << getpid() << endl; - client[i]->mount(); - ceph_fuse_main(client[i], argc, argv); - client[i]->unmount(); - cout << "fuse finished on pid " << getpid() << endl; - client[i]->shutdown(); - } - - - - // wait for it to finish - cout << "DONE -----" << endl; - fakemessenger_wait(); // blocks until messenger stops - - - // cleanup - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mds/MDCluster.h" - -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "mon/Monitor.h" -#include "client/Client.h" - -#include "client/SyntheticClient.h" - -#include "msg/FakeMessenger.h" - -#include "common/Timer.h" - -#define NUMMDS g_conf.num_mds -#define NUMOSD g_conf.num_osd -#define NUMCLIENT g_conf.num_client - -class C_Test : public Context { -public: - void finish(int r) { - cout << "C_Test->finish(" << r << ")" << endl; - } -}; - - -int main(int argc, char **argv) -{ - cerr << "fakesyn start" << endl; - - //cerr << "inode_t " << sizeof(inode_t) << endl; - - vector args; - argv_to_vec(argc, argv, args); - - parse_config_options(args); - - int start = 0; - - parse_syn_options(args); - - vector nargs; - - for (unsigned i=0; iinit(); - } - for (int i=0; iinit(); - if (g_conf.mds_local_osd) - mdsosd[i]->init(); - } - - for (int i=0; iinit(); - } - - - // create client(s) - for (int i=0; iinit(); - - // use my argc, argv (make sure you pass a mount point!) - //cout << "mounting" << endl; - client[i]->mount(); - - //cout << "starting synthetic client " << endl; - syn[i] = new SyntheticClient(client[i]); - - syn[i]->start_thread(); - } - - - for (int i=0; ijoin_thread(); - delete syn[i]; - - client[i]->unmount(); - //cout << "unmounted" << endl; - client[i]->shutdown(); - } - - - // wait for it to finish - fakemessenger_wait(); - - // cleanup - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "mon/Monitor.h" -#include "client/Client.h" - -#include "client/SyntheticClient.h" - -#include "msg/FakeMessenger.h" - -#include "common/Timer.h" - - -class C_Test : public Context { -public: - void finish(int r) { - cout << "C_Test->finish(" << r << ")" << endl; - } -}; - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << endl; - exit(1); - } -}; - - -int main(int argc, char **argv) -{ - cerr << "fakesyn start" << endl; - - //cerr << "inode_t " << sizeof(inode_t) << endl; - - vector args; - argv_to_vec(argc, argv, args); - - parse_config_options(args); - - int start = 0; - - parse_syn_options(args); - - vector nargs; - - for (unsigned i=0; imon_inst[0] = entity_inst_t(MSG_ADDR_MON(0), a); // hack ; see FakeMessenger.cc - - char hostname[100]; - gethostname(hostname,100); - //int pid = getpid(); - - // create mon - Monitor *mon[g_conf.num_mon]; - for (int i=0; iinit(); - } - for (int i=0; iinit(); - if (g_conf.mds_local_osd) - mdsosd[i]->init(); - } - - for (int i=0; iinit(); - } - - - // create client(s) - for (int i=0; iinit(); - - // use my argc, argv (make sure you pass a mount point!) - //cout << "mounting" << endl; - client[i]->mount(); - - //cout << "starting synthetic client " << endl; - syn[i] = new SyntheticClient(client[i]); - - syn[i]->start_thread(); - } - - - for (int i=0; ijoin_thread(); - delete syn[i]; - - client[i]->unmount(); - //cout << "unmounted" << endl; - client[i]->shutdown(); - } - - - // wait for it to finish - fakemessenger_wait(); - - // cleanup - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __CONTEXT_H -#define __CONTEXT_H - -#include "config.h" - -#include -#include -#include - -#include - - -/* - * Context - abstract callback class - */ -class Context { - public: - virtual ~Context() {} // we want a virtual destructor!!! - virtual void finish(int r) = 0; -}; - - -/* - * finish and destroy a list of Contexts - */ -inline void finish_contexts(std::list& finished, - int result = 0) -{ - using std::cout; - using std::endl; - - if (finished.empty()) return; - - dout(10) << finished.size() << " contexts to finish with " << result << endl; - for (std::list::iterator it = finished.begin(); - it != finished.end(); - it++) { - Context *c = *it; - dout(10) << "---- " << c << endl; - c->finish(result); - delete c; - } -} - -/* - * C_Contexts - set of Contexts - */ -class C_Contexts : public Context { - std::list clist; - -public: - void add(Context* c) { - clist.push_back(c); - } - void take(std::list& ls) { - clist.splice(clist.end(), ls); - } - void finish(int r) { - finish_contexts(clist, r); - } -}; - - -/* - * C_Gather - * - * BUG: does not report errors. - */ -class C_Gather : public Context { -public: - bool sub_finish(int r) { - //cout << "C_Gather sub_finish " << this << endl; - assert(waitfor.count(r)); - waitfor.erase(r); - if (!waitfor.empty()) - return false; // more subs left - - // last one - onfinish->finish(0); - delete onfinish; - onfinish = 0; - return true; - } - - class C_GatherSub : public Context { - C_Gather *gather; - int num; - public: - C_GatherSub(C_Gather *g, int n) : gather(g), num(n) {} - void finish(int r) { - if (gather->sub_finish(num)) - delete gather; // last one! 
- } - }; - - Context *new_sub() { - num++; - waitfor.insert(num); - return new C_GatherSub(this, num); - } - -private: - Context *onfinish; - std::set waitfor; - int num; - -public: - C_Gather(Context *f) : onfinish(f), num(0) { - //cout << "C_Gather new " << this << endl; - } - ~C_Gather() { - //cout << "C_Gather delete " << this << endl; - assert(!onfinish); - } - void finish(int r) { - // nobody should ever call me. - assert(0); - } - -}; - -#endif diff --git a/branches/marnberg/quota/include/Distribution.h b/branches/marnberg/quota/include/Distribution.h deleted file mode 100644 index 00f352d59efab..0000000000000 --- a/branches/marnberg/quota/include/Distribution.h +++ /dev/null @@ -1,74 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __DISTRIBUTION_H -#define __DISTRIBUTION_H - -#include -#include -using namespace std; - -class Distribution { - vector p; - vector v; - - public: - //Distribution() { - //} - - unsigned get_width() { - return p.size(); - } - - void clear() { - p.clear(); - v.clear(); - } - void add(int val, float pr) { - p.push_back(pr); - v.push_back(val); - } - - void random() { - float sum = 0.0; - for (unsigned i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __BUFFER_H -#define __BUFFER_H - -#include "common/Mutex.h" - -#include -#include - -using std::cout; -using std::endl; - -#ifndef __CYGWIN__ -# include -#endif - -#define BUFFER_PAGE_SIZE 4096 // fixme. 
- -// -// these are in config.o -extern Mutex bufferlock; -extern long buffer_total_alloc; -// - -class buffer { -private: - - /* hack for memory utilization debugging. */ - static void inc_total_alloc(unsigned len) { - bufferlock.Lock(); - buffer_total_alloc += len; - bufferlock.Unlock(); - } - static void dec_total_alloc(unsigned len) { - bufferlock.Lock(); - buffer_total_alloc -= len; - bufferlock.Unlock(); - } - - /* - * an abstract raw buffer. with a reference count. - */ - class raw { - public: - char *data; - unsigned len; - int nref; - Mutex lock; // we'll make it non-recursive. - - raw(unsigned l) : len(l), nref(0), lock(false) {} - raw(char *c, unsigned l) : data(c), len(l), nref(0), lock(false) {} - virtual ~raw() {}; - - // no copying. - raw(const raw &other); - const raw& operator=(const raw &other); - - virtual raw* clone_empty() = 0; - raw *clone() { - raw *c = clone_empty(); - memcpy(c->data, data, len); - return c; - } - }; - - friend std::ostream& operator<<(std::ostream& out, const raw &r); - - /* - * primitive buffer types - */ - class raw_char : public raw { - public: - raw_char(unsigned l) : raw(l) { - data = new char[len]; - inc_total_alloc(len); - } - ~raw_char() { - delete[] data; - dec_total_alloc(len); - } - raw* clone_empty() { - return new raw_char(len); - } - }; - - class raw_static : public raw { - public: - raw_static(const char *d, unsigned l) : raw((char*)d, l) { } - ~raw_static() {} - raw* clone_empty() { - return new raw_char(len); - } - }; - -#ifndef __CYGWIN__ - class raw_mmap_pages : public raw { - public: - raw_mmap_pages(unsigned l) : raw(l) { - data = (char*)::mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0); - inc_total_alloc(len); - } - ~raw_mmap_pages() { - ::munmap(data, len); - dec_total_alloc(len); - } - raw* clone_empty() { - return new raw_mmap_pages(len); - } - }; - - class raw_posix_aligned : public raw { - public: - raw_posix_aligned(unsigned l) : raw(l) { -#ifdef DARWIN - data = (char *) valloc 
(len); -#else - ::posix_memalign((void**)&data, BUFFER_PAGE_SIZE, len); -#endif /* DARWIN */ - inc_total_alloc(len); - } - ~raw_posix_aligned() { - ::free((void*)data); - dec_total_alloc(len); - } - raw* clone_empty() { - return new raw_posix_aligned(len); - } - }; -#endif - -#ifdef __CYGWIN__ - class raw_hack_aligned : public raw { - char *realdata; - public: - raw_hack_aligned(unsigned l) : raw(l) { - realdata = new char[len+4095]; - unsigned off = ((unsigned)realdata) % 4096; - if (off) - data = realdata + 4096 - off; - else - data = realdata; - inc_total_alloc(len+4095); - //cout << "hack aligned " << (unsigned)data - //<< " in raw " << (unsigned)realdata - //<< " off " << off << endl; - assert(((unsigned)data & 4095) == 0); - } - ~raw_hack_aligned() { - delete[] realdata; - dec_total_alloc(len+4095); - } - raw* clone_empty() { - return new raw_hack_aligned(len); - } - }; -#endif - -public: - - /* - * named constructors - */ - - static raw* copy(const char *c, unsigned len) { - raw* r = new raw_char(len); - memcpy(r->data, c, len); - return r; - } - static raw* create(unsigned len) { - return new raw_char(len); - } - - static raw* create_page_aligned(unsigned len) { -#ifndef __CYGWIN__ - return new raw_mmap_pages(len); -#else - return new raw_hack_aligned(len); -#endif - } - - - /* - * a buffer pointer. references (a subsequence of) a raw buffer. - */ - class ptr { - raw *_raw; - unsigned _off, _len; - - public: - ptr() : _raw(0), _off(0), _len(0) {} - ptr(raw *r) : _raw(r), _off(0), _len(r->len) { // no lock needed; this is an unref raw. - ++r->nref; - } - ptr(unsigned l) : _off(0), _len(l) { - _raw = create(l); - ++_raw->nref; - } - ptr(char *d, unsigned l) : _off(0), _len(l) { // ditto. 
- _raw = copy(d, l); - ++_raw->nref; - } - ptr(const ptr& p) : _raw(p._raw), _off(p._off), _len(p._len) { - if (_raw) { - _raw->lock.Lock(); - ++_raw->nref; - _raw->lock.Unlock(); - } - } - ptr(const ptr& p, unsigned o, unsigned l) : _raw(p._raw), _off(p._off + o), _len(l) { - assert(o+l <= p._len); - assert(_raw); - _raw->lock.Lock(); - ++_raw->nref; - _raw->lock.Unlock(); - } - ptr& operator= (const ptr& p) { - // be careful -- we need to properly handle self-assignment. - if (p._raw) { - p._raw->lock.Lock(); - ++p._raw->nref; // inc new - p._raw->lock.Unlock(); - } - release(); // dec (+ dealloc) old (if any) - _raw = p._raw; // change my ref - _off = p._off; - _len = p._len; - return *this; - } - ~ptr() { - release(); - } - - void release() { - if (_raw) { - _raw->lock.Lock(); - if (--_raw->nref == 0) { - //cout << "hosing raw " << (void*)_raw << " len " << _raw->len << std::endl; - _raw->lock.Unlock(); - delete _raw; // dealloc old (if any) - } else - _raw->lock.Unlock(); - _raw = 0; - } - } - - // misc - bool at_buffer_head() const { return _off == 0; } - bool at_buffer_tail() const { return _off + _len == _raw->len; } - - // accessors - const char *c_str() const { assert(_raw); return _raw->data + _off; } - char *c_str() { assert(_raw); return _raw->data + _off; } - unsigned length() const { return _len; } - unsigned offset() const { return _off; } - unsigned unused_tail_length() const { return _raw->len - (_off+_len); } - const char& operator[](unsigned n) const { - assert(_raw); - assert(n < _len); - return _raw->data[_off + n]; - } - char& operator[](unsigned n) { - assert(_raw); - assert(n < _len); - return _raw->data[_off + n]; - } - - const char *raw_c_str() const { assert(_raw); return _raw->data; } - unsigned raw_length() const { assert(_raw); return _raw->len; } - int raw_nref() const { assert(_raw); return _raw->nref; } - - void copy_out(unsigned o, unsigned l, char *dest) const { - assert(_raw); - assert(o >= 0 && o <= _len); - assert(l >= 0 && 
o+l <= _len); - memcpy(dest, c_str()+o, l); - } - - unsigned wasted() { - assert(_raw); - return _raw->len - _len; - } - - // modifiers - void set_offset(unsigned o) { _off = o; } - void set_length(unsigned l) { _len = l; } - - void append(const char *p, unsigned l) { - assert(_raw); - assert(l <= unused_tail_length()); - memcpy(c_str() + _len, p, l); - _len += l; - } - - void copy_in(unsigned o, unsigned l, const char *src) { - assert(_raw); - assert(o >= 0 && o <= _len); - assert(l >= 0 && o+l <= _len); - memcpy(c_str()+o, src, l); - } - - void zero() { - memset(c_str(), 0, _len); - } - - void clean() { - //raw *newraw = _raw->makesib(_len); - } - }; - - friend std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp); - - /* - * list - the useful bit! - */ - - class list { - // my private bits - std::list _buffers; - unsigned _len; - - public: - // cons/des - list() : _len(0) {} - list(const list& other) : _buffers(other._buffers), _len(other._len) { } - list(unsigned l) : _len(0) { - ptr bp(l); - push_back(bp); - } - ~list() {} - - list& operator= (const list& other) { - _buffers = other._buffers; - _len = other._len; - return *this; - } - - const std::list& buffers() const { return _buffers; } - - unsigned length() const { -#if 0 - // DEBUG: verify _len - unsigned len = 0; - for (std::list::iterator it = _buffers.begin(); - it != _buffers.end(); - it++) { - len += (*it).length(); - } - assert(len == _len); -#endif - return _len; - } - - - // modifiers - void clear() { - _buffers.clear(); - _len = 0; - } - void push_front(ptr& bp) { - _buffers.push_front(bp); - _len += bp.length(); - } - void push_front(raw *r) { - ptr bp(r); - _buffers.push_front(bp); - _len += bp.length(); - } - void push_back(ptr& bp) { - _buffers.push_back(bp); - _len += bp.length(); - } - void push_back(raw *r) { - ptr bp(r); - _buffers.push_back(bp); - _len += bp.length(); - } - void zero() { - for (std::list::iterator it = _buffers.begin(); - it != _buffers.end(); - it++) - 
it->zero(); - } - - // sort-of-like-assignment-op - void claim(list& bl) { - // free my buffers - clear(); - claim_append(bl); - } - void claim_append(list& bl) { - // steal the other guy's buffers - _len += bl._len; - _buffers.splice( _buffers.end(), bl._buffers ); - bl._len = 0; - } - - // crope lookalikes - void copy(unsigned off, unsigned len, char *dest) { - assert(off >= 0); - assert(off + len <= length()); - /*assert(off < length()); - if (off + len > length()) - len = length() - off; - */ - // advance to off - std::list::iterator curbuf = _buffers.begin(); - - // skip off - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! - break; - } - } - - // copy - while (len > 0) { - // is the rest ALL in this buffer? - if (off + len <= (*curbuf).length()) { - (*curbuf).copy_out(off, len, dest); // yup, last bit! - break; - } - - // get as much as we can from this buffer. - unsigned howmuch = (*curbuf).length() - off; - (*curbuf).copy_out(off, howmuch, dest); - - dest += howmuch; - len -= howmuch; - off = 0; - curbuf++; - assert(curbuf != _buffers.end()); - } - } - - void copy_in(unsigned off, unsigned len, const char *src) { - assert(off >= 0); - assert(off + len <= length()); - - // advance to off - std::list::iterator curbuf = _buffers.begin(); - - // skip off - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! - break; - } - } - - // copy - while (len > 0) { - // is the rest ALL in this buffer? - if (off + len <= (*curbuf).length()) { - (*curbuf).copy_in(off, len, src); // yup, last bit! - break; - } - - // get as much as we can from this buffer. 
- unsigned howmuch = (*curbuf).length() - off; - (*curbuf).copy_in(off, howmuch, src); - - src += howmuch; - len -= howmuch; - off = 0; - curbuf++; - assert(curbuf != _buffers.end()); - } - } - void copy_in(unsigned off, unsigned len, const list& bl) { - unsigned left = len; - for (std::list::const_iterator i = bl._buffers.begin(); - i != bl._buffers.end(); - i++) { - unsigned l = (*i).length(); - if (left < l) l = left; - copy_in(off, l, (*i).c_str()); - left -= l; - if (left == 0) break; - off += l; - } - } - - - void append(const char *data, unsigned len) { - if (len == 0) return; - - unsigned alen = 0; - - // copy into the tail buffer? - if (!_buffers.empty()) { - unsigned avail = _buffers.back().unused_tail_length(); - if (avail > 0) { - //std::cout << "copying up to " << len << " into tail " << avail << " bytes of tail buf " << _buffers.back() << std::endl; - if (avail > len) - avail = len; - _buffers.back().append(data, avail); - _len += avail; - data += avail; - len -= avail; - } - alen = _buffers.back().length(); - } - if (len == 0) return; - - // just add another buffer. - // alloc a bit extra, in case we do a bunch of appends. FIXME be smarter! - if (alen < 4096) alen = 4096; - ptr bp = create(alen); - bp.set_length(len); - bp.copy_in(0, len, data); - push_back(bp); - } - void append(ptr& bp) { - push_back(bp); - } - void append(ptr& bp, unsigned off, unsigned len) { - assert(len+off <= bp.length()); - ptr tempbp(bp, off, len); - push_back(tempbp); - } - void append(const list& bl) { - list temp(bl); // copy list - claim_append(temp); // and append - } - - - /* - * get a char - */ - const char& operator[](unsigned n) { - assert(n < _len); - for (std::list::iterator p = _buffers.begin(); - p != _buffers.end(); - p++) { - if (n >= p->length()) { - n -= p->length(); - continue; - } - return (*p)[n]; - } - assert(0); - } - - /* - * return a contiguous ptr to whole bufferlist contents. 
- */ - char *c_str() { - if (_buffers.size() == 1) { - return _buffers.front().c_str(); // good, we're already contiguous. - } - else if (_buffers.size() == 0) { - return 0; // no buffers - } - else { - ptr newbuf = create(length()); // make one new contiguous buffer. - copy(0, length(), newbuf.c_str()); // copy myself into it. - clear(); - push_back(newbuf); - return newbuf.c_str(); // now it'll work. - } - } - - void substr_of(list& other, unsigned off, unsigned len) { - assert(off + len <= other.length()); - clear(); - - // skip off - std::list::iterator curbuf = other._buffers.begin(); - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - //cout << "skipping over " << *curbuf << endl; - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! - //cout << "somewhere in " << *curbuf << endl; - break; - } - } - - while (len > 0) { - // partial? - if (off + len < (*curbuf).length()) { - //cout << "copying partial of " << *curbuf << endl; - _buffers.push_back( ptr( *curbuf, off, len ) ); - _len += len; - break; - } - - // through end - //cout << "copying end (all?) of " << *curbuf << endl; - unsigned howmuch = (*curbuf).length() - off; - _buffers.push_back( ptr( *curbuf, off, howmuch ) ); - _len += howmuch; - len -= howmuch; - off = 0; - curbuf++; - } - } - - - // funky modifer - void splice(unsigned off, unsigned len, list *claim_by=0 /*, bufferlist& replace_with */) { // fixme? - assert(off < length()); - assert(len > 0); - //cout << "splice off " << off << " len " << len << " ... mylen = " << length() << endl; - - // skip off - std::list::iterator curbuf = _buffers.begin(); - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - //cout << "off = " << off << " skipping over " << *curbuf << endl; - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! 
- //cout << "off = " << off << " somewhere in " << *curbuf << endl; - break; - } - } - assert(off >= 0); - - if (off) { - // add a reference to the front bit - // insert it before curbuf (which we'll hose) - //cout << "keeping front " << off << " of " << *curbuf << endl; - _buffers.insert( curbuf, ptr( *curbuf, 0, off ) ); - _len += off; - } - - while (len > 0) { - // partial? - if (off + len < (*curbuf).length()) { - //cout << "keeping end of " << *curbuf << ", losing first " << off+len << endl; - if (claim_by) - claim_by->append( *curbuf, off, len ); - (*curbuf).set_offset( off+len + (*curbuf).offset() ); // ignore beginning big - (*curbuf).set_length( (*curbuf).length() - (len+off) ); - _len -= off+len; - //cout << " now " << *curbuf << endl; - break; - } - - // hose though the end - unsigned howmuch = (*curbuf).length() - off; - //cout << "discarding " << howmuch << " of " << *curbuf << endl; - if (claim_by) - claim_by->append( *curbuf, off, howmuch ); - _len -= (*curbuf).length(); - _buffers.erase( curbuf++ ); - len -= howmuch; - off = 0; - } - - // splice in *replace (implement me later?) 
- } - - }; - -}; - -typedef buffer::ptr bufferptr; -typedef buffer::list bufferlist; - - -inline bool operator>(bufferlist& l, bufferlist& r) { - for (unsigned p = 0; ; p++) { - if (l.length() > p && r.length() == p) return true; - if (l.length() == p) return false; - if (l[p] > r[p]) return true; - if (l[p] < r[p]) return false; - p++; - } -} -inline bool operator>=(bufferlist& l, bufferlist& r) { - for (unsigned p = 0; ; p++) { - if (l.length() > p && r.length() == p) return true; - if (r.length() == p && l.length() == p) return true; - if (l[p] > r[p]) return true; - if (l[p] < r[p]) return false; - p++; - } -} -inline bool operator<(bufferlist& l, bufferlist& r) { - return r > l; -} -inline bool operator<=(bufferlist& l, bufferlist& r) { - return r >= l; -} - - -inline std::ostream& operator<<(std::ostream& out, const buffer::raw &r) { - return out << "buffer::raw(" << (void*)r.data << " len " << r.len << " nref " << r.nref << ")"; -} - -inline std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp) { - out << "buffer::ptr(" << bp.offset() << "~" << bp.length() - << " " << (void*)bp.c_str() - << " in raw " << (void*)bp.raw_c_str() - << " len " << bp.raw_length() - << " nref " << bp.raw_nref() << ")"; - return out; -} - -inline std::ostream& operator<<(std::ostream& out, const buffer::list& bl) { - out << "buffer::list(len=" << bl.length() << "," << std::endl; - - std::list::const_iterator it = bl.buffers().begin(); - while (it != bl.buffers().end()) { - out << "\t" << *it; - if (++it == bl.buffers().end()) break; - out << "," << std::endl; - } - out << std::endl << ")"; - return out; -} - - - - -// encoder/decode helpers - -// -- basic types -- -// string -inline void _encode(const std::string& s, bufferlist& bl) -{ - bl.append(s.c_str(), s.length()+1); -} -inline void _decode(std::string& s, bufferlist& bl, int& off) -{ - s = bl.c_str() + off; - off += s.length() + 1; -} - -// bufferptr (encapsulated) -inline void _encode(bufferptr& bp, bufferlist& 
bl) -{ - size_t len = bp.length(); - bl.append((char*)&len, sizeof(len)); - bl.append(bp); -} -inline void _decode(bufferptr& bp, bufferlist& bl, int& off) -{ - size_t len; - bl.copy(off, sizeof(len), (char*)&len); - off += sizeof(len); - bufferlist s; - s.substr_of(bl, off, len); - off += len; - - if (s.buffers().size() == 1) - bp = s.buffers().front(); - else - bp = buffer::copy(s.c_str(), s.length()); -} - -// bufferlist (encapsulated) -inline void _encode(const bufferlist& s, bufferlist& bl) -{ - size_t len = s.length(); - bl.append((char*)&len, sizeof(len)); - bl.append(s); -} -inline void _decode(bufferlist& s, bufferlist& bl, int& off) -{ - size_t len; - bl.copy(off, sizeof(len), (char*)&len); - off += sizeof(len); - s.substr_of(bl, off, len); - off += len; -} - - -#include -#include -#include -#include - -// set -inline void _encode(const std::set& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (std::set::const_iterator it = s.begin(); - it != s.end(); - it++) { - ::_encode(*it, bl); - n--; - } - assert(n==0); -} -inline void _decode(std::set& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -inline void _encode(const std::list& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (std::list::const_iterator it = s.begin(); - it != s.end(); - it++) { - ::_encode(*it, bl); - n--; - } - assert(n==0); -} -inline void _decode(std::list& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -template -inline void _encode(const std::set& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename std::set::const_iterator it = s.begin(); - it != s.end(); - it++) { - T v = *it; - bl.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); -} -template -inline void _decode(std::set& s, bufferlist& bl, int& off) -{ - 
s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -template -inline void _encode(std::vector& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename std::vector::iterator it = s.begin(); - it != s.end(); - it++) { - T v = *it; - bl.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); -} -template -inline void _decode(std::vector& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - s = std::vector(n); - for (int i=0; i -template -inline void _encode(const std::list& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename std::list::const_iterator it = s.begin(); - it != s.end(); - it++) { - T v = *it; - bl.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); -} -template -inline void _decode(std::list& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -inline void _encode(std::map& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (std::map::iterator it = s.begin(); - it != s.end(); - it++) { - _encode(it->first, bl); - _encode(it->second, bl); - n--; - } - assert(n==0); -} -inline void _decode(std::map& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -template -inline void _encode(const std::map& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - //std::cout << "n = " << n << std::endl; - for (typename std::map::const_iterator it = s.begin(); - it != s.end(); - it++) { - T k = it->first; - bl.append((char*)&k, sizeof(k)); - _encode(it->second, bl); - n--; - //std::cout << "--n = " << n << " after k " << k << std::endl; - } - assert(n==0); -} -template -inline void _decode(std::map& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, 
sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -template -inline void _encode(const std::map& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename std::map::const_iterator it = s.begin(); - it != s.end(); - it++) { - ::_encode(it->first, bl); - U v = it->second; - bl.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); -} -template -inline void _decode(std::map& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i> -template -inline void _encode(const std::map >& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename std::map >::const_iterator it = s.begin(); - it != s.end(); - it++) { - T k = it->first; - bl.append((char*)&k, sizeof(k)); - ::_encode(it->second, bl); - n--; - } - assert(n==0); -} -template -inline void _decode(std::map >& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -template -inline void _encode(const std::map& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename std::map::const_iterator it = s.begin(); - it != s.end(); - it++) { - T k = it->first; - U v = it->second; - bl.append((char*)&k, sizeof(k)); - bl.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); -} -template -inline void _decode(std::map& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define SYSERROR() syserror("At %s:%d", __FILE__, __LINE__) - -#define ASSERT(c) \ - ((c) || (exiterror("Assertion failed at %s:%d", __FILE__, __LINE__), 1)) - -/* print usage error message and exit */ -extern void userror(const char *use, const char *fmt, ...); - -/* print system error message and exit */ -extern void syserror(const char *fmt, ...); - -/* print error message and exit */ -extern void exiterror(const char *fmt, ...); - -/* print error message */ -extern void error(const char *fmt, ...); - -#ifdef __cplusplus -} // extern "C" -#endif diff --git a/branches/marnberg/quota/include/filepath.h b/branches/marnberg/quota/include/filepath.h deleted file mode 100644 index 5585e536b42db..0000000000000 --- a/branches/marnberg/quota/include/filepath.h +++ /dev/null @@ -1,206 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __FILEPATH_H -#define __FILEPATH_H - - -/* - * BUG: /a/b/c is equivalent to a/b/c in dentry-breakdown, but not string. - * -> should it be different? how? should this[0] be "", with depth 4? 
- * - */ - - -#include -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "buffer.h" - - -class filepath { - string path; - vector bits; - - void rebuild() { - if (absolute()) - path = "/"; - else - path.clear(); - for (unsigned i=0; i::iterator it = bits.begin(); - it != bits.end(); - it++) { - r.append((*it).c_str(), (*it).length()+1); - } - } - - void _unrope(crope& r, int& off) { - clear(); - - char n; - r.copy(off, sizeof(char), (char*)&n); - off += sizeof(char); - for (int i=0; i::iterator it = bits.begin(); - it != bits.end(); - it++) { - bl.append((*it).c_str(), (*it).length()+1); - } - } - - void _decode(bufferlist& bl, int& off) { - clear(); - - char n; - bl.copy(off, sizeof(char), (char*)&n); - off += sizeof(char); - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __INTERVAL_SET_H -#define __INTERVAL_SET_H - -#include -#include -#include -using namespace std; - -#ifndef MIN -# define MIN(a,b) ((a)<=(b) ? (a):(b)) -#endif -#ifndef MAX -# define MAX(a,b) ((a)>=(b) ? (a):(b)) -#endif - - -template -class interval_set { - public: - map m; // map start -> len - - // helpers - private: - typename map::const_iterator find_inc(T start) const { - typename map::const_iterator p = m.lower_bound(start); // p->first >= start - if (p != m.begin() && - (p == m.end() || p->first > start)) { - p--; // might overlap? - if (p->first + p->second <= start) - p++; // it doesn't. - } - return p; - } - - typename map::iterator find_inc_m(T start) { - typename map::iterator p = m.lower_bound(start); - if (p != m.begin() && - (p == m.end() || p->first > start)) { - p--; // might overlap? - if (p->first + p->second <= start) - p++; // it doesn't. 
- } - return p; - } - - typename map::const_iterator find_adj(T start) const { - typename map::const_iterator p = m.lower_bound(start); - if (p != m.begin() && - (p == m.end() || p->first > start)) { - p--; // might touch? - if (p->first + p->second < start) - p++; // it doesn't. - } - return p; - } - - typename map::iterator find_adj_m(T start) { - typename map::iterator p = m.lower_bound(start); - if (p != m.begin() && - (p == m.end() || p->first > start)) { - p--; // might touch? - if (p->first + p->second < start) - p++; // it doesn't. - } - return p; - } - - public: - bool operator==(const interval_set& other) const { - return m == other.m; - } - - void clear() { - m.clear(); - } - - bool contains(T i) const { - typename map::const_iterator p = find_inc(i); - if (p == m.end()) return false; - if (p->first > i) return false; - if (p->first+p->second <= i) return false; - assert(p->first <= i && p->first+p->second > i); - return true; - } - bool contains(T start, T len) const { - typename map::const_iterator p = find_inc(start); - if (p == m.end()) return false; - if (p->first > start) return false; - if (p->first+p->second <= start) return false; - assert(p->first <= start && p->first+p->second > start); - if (p->first+p->second < start+len) return false; - return true; - } - bool intersects(T start, T len) const { - interval_set a; - a.insert(start, len); - interval_set i; - i.intersection_of( *this, a ); - if (i.empty()) return false; - return true; - } - - // outer range of set - bool empty() const { - return m.empty(); - } - T start() const { - assert(!empty()); - typename map::const_iterator p = m.begin(); - return p->first; - } - T end() const { - assert(!empty()); - typename map::const_iterator p = m.end(); - p--; - return p->first+p->second; - } - - // interval start after p (where p not in set) - bool starts_after(T i) const { - assert(!contains(i)); - typename map::const_iterator p = find_inc(i); - if (p == m.end()) return false; - return true; - } - 
T start_after(T i) const { - assert(!contains(i)); - typename map::const_iterator p = find_inc(i); - return p->first; - } - - // interval end that contains start - T end_after(T start) const { - assert(contains(start)); - typename map::const_iterator p = find_inc(start); - return p->first+p->second; - } - - void insert(T val) { - insert(val, 1); - } - - void insert(T start, T len) { - //cout << "insert " << start << "~" << len << endl; - assert(len > 0); - typename map::iterator p = find_adj_m(start); - if (p == m.end()) { - m[start] = len; // new interval - } else { - if (p->first < start) { - - if (p->first + p->second != start) { - //cout << "p is " << p->first << "~" << p->second << ", start is " << start << ", len is " << len << endl; - assert(0); - } - - assert(p->first + p->second == start); - p->second += len; // append to end - - typename map::iterator n = p; - n++; - if (n != m.end() && - start+len == n->first) { // combine with next, too! - p->second += n->second; - m.erase(n); - } - } else { - if (start+len == p->first) { - m[start] = len + p->second; // append to front - m.erase(p); - } else { - assert(p->first > start+len); - m[start] = len; // new interval - } - } - } - } - - void erase(T val) { - erase(val, 1); - } - - void erase(T start, T len) { - typename map::iterator p = find_inc_m(start); - - assert(p != m.end()); - assert(p->first <= start); - - T before = start - p->first; - assert(p->second >= before+len); - T after = p->second - before - len; - - if (before) - p->second = before; // shorten bit before - else - m.erase(p); - if (after) - m[start+len] = after; - } - - - void subtract(const interval_set &a) { - for (typename map::const_iterator p = a.m.begin(); - p != a.m.end(); - p++) - erase(p->first, p->second); - } - - void insert(const interval_set &a) { - for (typename map::const_iterator p = a.m.begin(); - p != a.m.end(); - p++) - insert(p->first, p->second); - } - - - void intersection_of(const interval_set &a, const interval_set &b) 
{ - assert(&a != this); - assert(&b != this); - clear(); - - typename map::const_iterator pa = a.m.begin(); - typename map::const_iterator pb = b.m.begin(); - - while (pa != a.m.end() && pb != b.m.end()) { - // passing? - if (pa->first + pa->second <= pb->first) - { pa++; continue; } - if (pb->first + pb->second <= pa->first) - { pb++; continue; } - T start = MAX(pa->first, pb->first); - T end = MIN(pa->first+pa->second, pb->first+pb->second); - assert(end > start); - insert(start, end-start); - if (pa->first+pa->second > pb->first+pb->second) - pb++; - else - pa++; - } - } - - void union_of(const interval_set &a, const interval_set &b) { - assert(&a != this); - assert(&b != this); - clear(); - - //cout << "union_of" << endl; - - // a - m = a.m; - - // - (a*b) - interval_set ab; - ab.intersection_of(a, b); - subtract(ab); - - // + b - insert(b); - return; - } - void union_of(const interval_set &b) { - interval_set a; - a.m.swap(m); - union_of(a, b); - } - - bool subset_of(const interval_set &big) const { - for (typename map::const_iterator i = m.begin(); - i != m.end(); - i++) - if (!big.contains(i->first, i->second)) return false; - return true; - } - -}; - -template -inline ostream& operator<<(ostream& out, const interval_set &s) { - out << "["; - for (typename map::const_iterator i = s.m.begin(); - i != s.m.end(); - i++) { - if (i != s.m.begin()) out << ","; - out << i->first << "~" << i->second; - } - out << "]"; - return out; -} - - -#endif diff --git a/branches/marnberg/quota/include/lru.h b/branches/marnberg/quota/include/lru.h deleted file mode 100644 index 63096d0e32079..0000000000000 --- a/branches/marnberg/quota/include/lru.h +++ /dev/null @@ -1,321 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License 
version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __LRU_H -#define __LRU_H - -#include -#include -using namespace std; - -#include "config.h" - - - -class LRUObject { - private: - LRUObject *lru_next, *lru_prev; - bool lru_pinned; - class LRU *lru; - class LRUList *lru_list; - - public: - LRUObject() { - lru_next = lru_prev = NULL; - lru_list = 0; - lru_pinned = false; - lru = 0; - } - - // pin/unpin item in cache - void lru_pin(); - void lru_unpin(); - bool lru_is_expireable() { return !lru_pinned; } - - friend class LRU; - friend class LRUList; -}; - - -class LRUList { - private: - LRUObject *head, *tail; - __uint32_t len; - - public: - LRUList() { - head = tail = 0; - len = 0; - } - - __uint32_t get_length() { return len; } - - LRUObject *get_head() { - return head; - } - LRUObject *get_tail() { - return tail; - } - - void insert_head(LRUObject *o) { - o->lru_next = head; - o->lru_prev = NULL; - if (head) { - head->lru_prev = o; - } else { - tail = o; - } - head = o; - o->lru_list = this; - len++; - } - void insert_tail(LRUObject *o) { - o->lru_next = NULL; - o->lru_prev = tail; - if (tail) { - tail->lru_next = o; - } else { - head = o; - } - tail = o; - o->lru_list = this; - len++; - } - - void remove(LRUObject *o) { - assert(o->lru_list == this); - if (o->lru_next) - o->lru_next->lru_prev = o->lru_prev; - else - tail = o->lru_prev; - if (o->lru_prev) - o->lru_prev->lru_next = o->lru_next; - else - head = o->lru_next; - o->lru_next = o->lru_prev = NULL; - o->lru_list = 0; - assert(len>0); - len--; - } - -}; - - -class LRU { - protected: - LRUList lru_top, lru_bot, lru_pintail; - __uint32_t lru_num, lru_num_pinned; - __uint32_t lru_max; // max items - double lru_midpoint; - - friend class LRUObject; - //friend class MDCache; // hack - - public: - LRU(int max = 0) { - lru_num = 0; - lru_num_pinned = 0; - lru_midpoint = .9; - lru_max = max; - } - - __uint32_t lru_get_size() { return lru_num; } - __uint32_t 
lru_get_top() { return lru_top.get_length(); } - __uint32_t lru_get_bot() { return lru_bot.get_length(); } - __uint32_t lru_get_pintail() { return lru_pintail.get_length(); } - __uint32_t lru_get_max() { return lru_max; } - __uint32_t lru_get_num_pinned() { return lru_num_pinned; } - - void lru_set_max(__uint32_t m) { lru_max = m; } - void lru_set_midpoint(float f) { lru_midpoint = f; } - - - // insert at top of lru - void lru_insert_top(LRUObject *o) { - //assert(!o->lru_in_lru); - //o->lru_in_lru = true; - assert(!o->lru); - o->lru = this; - lru_top.insert_head( o ); - lru_num++; - if (o->lru_pinned) lru_num_pinned++; - lru_adjust(); - } - - // insert at mid point in lru - void lru_insert_mid(LRUObject *o) { - //assert(!o->lru_in_lru); - //o->lru_in_lru = true; - assert(!o->lru); - o->lru = this; - lru_bot.insert_head(o); - lru_num++; - if (o->lru_pinned) lru_num_pinned++; - } - - // insert at bottom of lru - void lru_insert_bot(LRUObject *o) { - assert(!o->lru); - o->lru = this; - lru_bot.insert_tail(o); - lru_num++; - if (o->lru_pinned) lru_num_pinned++; - } - - /* - // insert at bottom of lru - void lru_insert_pintail(LRUObject *o) { - assert(!o->lru); - o->lru = this; - - assert(o->lru_pinned); - - lru_pintail.insert_head(o); - lru_num++; - lru_num_pinned += o->lru_pinned; - } - */ - - - - - // adjust top/bot balance, as necessary - void lru_adjust() { - if (!lru_max) return; - - unsigned toplen = lru_top.get_length(); - unsigned topwant = (unsigned)(lru_midpoint * (double)lru_max); - while (toplen > 0 && - toplen > topwant) { - // remove from tail of top, stick at head of bot - // FIXME: this could be way more efficient by moving a whole chain of items. - - LRUObject *o = lru_top.get_tail(); - lru_top.remove(o); - lru_bot.insert_head(o); - toplen--; - } - } - - - // remove an item - LRUObject *lru_remove(LRUObject *o) { - // not in list - //assert(o->lru_in_lru); - //if (!o->lru_in_lru) return o; // might have expired and been removed that way. 
- if (!o->lru) return o; - - - if (o->lru_list == &lru_top) - lru_top.remove(o); - else if (o->lru_list == &lru_bot) - lru_bot.remove(o); - else if (o->lru_list == &lru_pintail) - lru_pintail.remove(o); - else - assert(0); - - lru_num--; - if (o->lru_pinned) lru_num_pinned--; - o->lru = 0; - return o; - } - - // touch item -- move to head of lru - bool lru_touch(LRUObject *o) { - lru_remove(o); - lru_insert_top(o); - return true; - } - - // touch item -- move to midpoint (unless already higher) - bool lru_midtouch(LRUObject *o) { - if (o->lru_list == &lru_top) return false; - - lru_remove(o); - lru_insert_mid(o); - return true; - } - - // touch item -- move to bottom - bool lru_bottouch(LRUObject *o) { - lru_remove(o); - lru_insert_bot(o); - return true; - } - - - // expire -- expire a single item - LRUObject *lru_get_next_expire() { - LRUObject *p; - - // look through tail of bot - while (lru_bot.get_length()) { - p = lru_bot.get_tail(); - if (!p->lru_pinned) return p; - - // move to pintail - lru_bot.remove(p); - lru_pintail.insert_head(p); - } - - // ok, try head then - while (lru_top.get_length()) { - p = lru_top.get_tail(); - if (!p->lru_pinned) return p; - - // move to pintail - lru_top.remove(p); - lru_pintail.insert_head(p); - } - - // no luck! 
- return NULL; - } - - LRUObject *lru_expire() { - LRUObject *p = lru_get_next_expire(); - if (p) - return lru_remove(p); - return NULL; - } - - - void lru_status() { - dout(10) << "lru: " << lru_num << " items, " << lru_top.get_length() << " top, " << lru_bot.get_length() << " bot, " << lru_pintail.get_length() << " pintail" << endl; - } - -}; - - -inline void LRUObject::lru_pin() -{ - lru_pinned = true; - if (lru) lru->lru_num_pinned++; -} -inline void LRUObject::lru_unpin() { - lru_pinned = false; - if (lru) { - lru->lru_num_pinned--; - - // move from pintail -> bot - if (lru_list == &lru->lru_pintail) { - lru->lru_pintail.remove(this); - lru->lru_bot.insert_tail(this); - } - } -} - -#endif diff --git a/branches/marnberg/quota/include/object.h b/branches/marnberg/quota/include/object.h deleted file mode 100644 index 5d5a87727e5ad..0000000000000 --- a/branches/marnberg/quota/include/object.h +++ /dev/null @@ -1,97 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __OBJECT_H -#define __OBJECT_H - -#include -#include -using namespace std; - - -typedef __uint32_t objectrev_t; - -struct object_t { - static const __uint32_t MAXREV = 0xffffffffU; - - __uint64_t ino; // "file" identifier - __uint32_t bno; // "block" in that "file" - objectrev_t rev; // revision. normally ctime (as epoch). 
- - object_t() : ino(0), bno(0), rev(0) {} - object_t(__uint64_t i, __uint32_t b) : ino(i), bno(b), rev(0) {} - object_t(__uint64_t i, __uint32_t b, __uint32_t r) : ino(i), bno(b), rev(r) {} -}; - - -inline bool operator==(const object_t l, const object_t r) { - return (l.ino == r.ino) && (l.bno == r.bno) && (l.rev == r.rev); -} -inline bool operator!=(const object_t l, const object_t r) { - return (l.ino != r.ino) || (l.bno != r.bno) || (l.rev != r.rev); -} -inline bool operator>(const object_t l, const object_t r) { - if (l.ino > r.ino) return true; - if (l.ino < r.ino) return false; - if (l.bno > r.bno) return true; - if (l.bno < r.bno) return false; - if (l.rev > r.rev) return true; - return false; -} -inline bool operator<(const object_t l, const object_t r) { - if (l.ino < r.ino) return true; - if (l.ino > r.ino) return false; - if (l.bno < r.bno) return true; - if (l.bno > r.bno) return false; - if (l.rev < r.rev) return true; - return false; -} -inline bool operator>=(const object_t l, const object_t r) { - return !(l < r); -} -inline bool operator<=(const object_t l, const object_t r) { - return !(l > r); -} -inline ostream& operator<<(ostream& out, const object_t o) { - out << hex << o.ino << '.'; - out.setf(ios::right); - out.fill('0'); - out << setw(8) << o.bno << dec; - out.unsetf(ios::right); - if (o.rev) - out << '.' 
<< o.rev; - return out; -} - - -namespace __gnu_cxx { -#ifndef __LP64__ - template<> struct hash<__uint64_t> { - size_t operator()(__uint64_t __x) const { - static hash<__uint32_t> H; - return H((__x >> 32) ^ (__x & 0xffffffff)); - } - }; -#endif - - template<> struct hash { - size_t operator()(const object_t &r) const { - static hash<__uint64_t> H; - static hash<__uint32_t> I; - return H(r.ino) ^ I(r.bno); - } - }; -} - - -#endif diff --git a/branches/marnberg/quota/include/oldbuffer.h b/branches/marnberg/quota/include/oldbuffer.h deleted file mode 100644 index fda7336bc6461..0000000000000 --- a/branches/marnberg/quota/include/oldbuffer.h +++ /dev/null @@ -1,357 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __BUFFER_H -#define __BUFFER_H - -#include -#include - -#include -using namespace std; - -// bit masks -#define BUFFER_MODE_NOCOPY 0 -#define BUFFER_MODE_COPY 1 // copy on create, my buffer - -#define BUFFER_MODE_NOFREE 0 -#define BUFFER_MODE_FREE 2 - -#define BUFFER_MODE_CUSTOMFREE 4 - -#define BUFFER_MODE_DEFAULT 3//(BUFFER_MODE_COPY|BUFFER_MODE_FREE) - - -// debug crap -#include "config.h" -#define bdbout(x) if (x <= g_conf.debug_buffer) cout - -#include "common/Mutex.h" - -// HACK: in config.cc -/* - * WARNING: bufferlock placements are tricky for efficiency. note that only bufferptr and - * buffer ever use buffer._ref, and only bufferptr should call ~buffer(). - * - * So, I only need to protect: - * - buffer()'s modification of buffer_total_alloc - * - ~bufferptr() check of buffer._ref, and ~buffer's mod of buffer_total_alloc - * - * I don't protect - * - buffer._get() .. 
increment is atomic on any sane architecture - * - buffer._put() .. only called by ~bufferptr. - * - ~buffer .. only called by ~bufferptr *** I HOPE!! - */ -extern Mutex bufferlock; -extern long buffer_total_alloc; - - -typedef void (buffer_free_func_t)(void*,char*,unsigned); - - -/* - * buffer - the underlying buffer container. with a reference count. - * - * the buffer never shrinks. - * - * some invariants: - * _len never shrinks - * _len <= _alloc_len - */ -class buffer { - protected: - //wtf - //static Mutex bufferlock; - //static long buffer_total_alloc;// = 0; - - private: - // raw buffer alloc - char *_dataptr; - bool _myptr; - unsigned _len; - unsigned _alloc_len; - - // ref counts - unsigned _ref; - int _get() { - bdbout(1) << "buffer.get " << *this << " get " << _ref+1 << endl; - return ++_ref; - } - int _put() { - bdbout(1) << "buffer.put " << *this << " put " << _ref-1 << endl; - assert(_ref > 0); - return --_ref; - } - - // custom (de!)allocator - buffer_free_func_t *free_func; - void *free_func_arg; - - friend class bufferptr; - - public: - // constructors - buffer() : _dataptr(0), _myptr(true), _len(0), _alloc_len(0), _ref(0), free_func(0), free_func_arg(0) { - bdbout(1) << "buffer.cons " << *this << endl; - } - buffer(unsigned a) : _dataptr(0), _myptr(true), _len(a), _alloc_len(a), _ref(0), free_func(0), free_func_arg(0) { - bdbout(1) << "buffer.cons " << *this << endl; - _dataptr = new char[a]; - bufferlock.Lock(); - buffer_total_alloc += _alloc_len; - bufferlock.Unlock(); - bdbout(1) << "buffer.malloc " << (void*)_dataptr << endl; - } - ~buffer() { - bdbout(1) << "buffer.des " << *this << " " << (void*)free_func << endl; - if (free_func) { - bdbout(1) << "buffer.custom_free_func " << free_func_arg << " " << (void*)_dataptr << endl; - free_func( free_func_arg, _dataptr, _alloc_len ); - } - else if (_dataptr && _myptr) { - bdbout(1) << "buffer.free " << (void*)_dataptr << endl; - delete[] _dataptr; - buffer_total_alloc -= _alloc_len; - } - } - - 
buffer(const char *p, int l, int mode=BUFFER_MODE_DEFAULT, int alloc_len=0, - buffer_free_func_t free_func=0, void* free_func_arg=0) : - _dataptr(0), - _myptr(false), - _len(l), - _ref(0), - free_func(0), free_func_arg(0) { - - if (alloc_len) - _alloc_len = alloc_len; - else - _alloc_len = l; - - _myptr = mode & BUFFER_MODE_FREE ? true:false; - bdbout(1) << "buffer.cons " << *this << " mode = " << mode << ", myptr=" << _myptr << endl; - if (mode & BUFFER_MODE_COPY) { - _dataptr = new char[_alloc_len]; - bdbout(1) << "buffer.malloc " << (void*)_dataptr << endl; - bufferlock.Lock(); - buffer_total_alloc += _alloc_len; - bufferlock.Unlock(); - memcpy(_dataptr, p, l); - bdbout(1) << "buffer.copy " << *this << endl; - } else { - _dataptr = (char*)p; // ugly - bdbout(1) << "buffer.claim " << *this << " myptr=" << _myptr << endl; - } - - if (mode & BUFFER_MODE_CUSTOMFREE && free_func) { - this->free_func = free_func; - this->free_func_arg = free_func_arg; - } - } - - // operators - buffer& operator=(buffer& other) { - assert(0); // not implemented, no reasonable assignment semantics. - return *this; - } - - char *c_str() { - return _dataptr; - } - - bool has_free_func() { return free_func != 0; } - - // accessor - unsigned alloc_length() { - return _alloc_len; - } - void set_length(unsigned l) { - assert(l <= _alloc_len); - _len = l; - } - unsigned length() { return _len; } - unsigned unused_tail_length() { return _alloc_len - _len; } - - friend ostream& operator<<(ostream& out, buffer& b); -}; - -inline ostream& operator<<(ostream& out, buffer& b) { - return out << "buffer(this=" << &b << " len=" << b._len << ", alloc=" << b._alloc_len << ", data=" << (void*)b._dataptr << " ref=" << b._ref << ")"; -} - - -/* - * smart pointer class for buffer - * - * we reference count the actual buffer. - * we also let you refer to a subset of a buffer. - * we implement the high-level buffer accessor methods. 
- * - * some invariants: - * _off < _buffer->_len - * _off + _len <= _buffer->_len - */ -class bufferptr { - private: - buffer *_buffer; - unsigned _len, _off; - - public: - // empty cons - bufferptr() : - _buffer(0), - _len(0), - _off(0) { } - // main cons - the entire buffer - bufferptr(buffer *b) : - _buffer(b), - _len(b->_len), - _off(0) { - assert(_buffer->_ref == 0); - _buffer->_get(); // this is always the first one. - } - // subset cons - a subset of another bufferptr (subset) - bufferptr(const bufferptr& bp, unsigned len, unsigned off) { - bufferlock.Lock(); - _buffer = bp._buffer; - _len = len; - _off = bp._off + off; - _buffer->_get(); - assert(_off < _buffer->_len); // sanity checks - assert(_off + _len <= _buffer->_len); - bufferlock.Unlock(); - } - - // copy cons - bufferptr(const bufferptr &other) { - bufferlock.Lock(); - _buffer = other._buffer; - _len = other._len; - _off = other._off; - if (_buffer) _buffer->_get(); - bufferlock.Unlock(); - } - - // assignment operator - bufferptr& operator=(const bufferptr& other) { - //assert(0); - // discard old - discard_buffer(); - - // point to other - bufferlock.Lock(); - _buffer = other._buffer; - _len = other._len; - _off = other._off; - if (_buffer) _buffer->_get(); - bufferlock.Unlock(); - return *this; - } - - ~bufferptr() { - discard_buffer(); - } - - void discard_buffer() { - if (_buffer) { - bufferlock.Lock(); - if (_buffer->_put() == 0) - delete _buffer; - _buffer = 0; - bufferlock.Unlock(); - } - } - - - // dereference to get the actual buffer - buffer& operator*() { - return *_buffer; - } - - - bool at_buffer_head() const { - return _off == 0; - } - bool at_buffer_tail() const { - return _off + _len == _buffer->_len; - } - - // accessors for my subset - char *c_str() { - return _buffer->c_str() + _off; - } - unsigned length() const { - return _len; - } - unsigned offset() const { - return _off; - } - unsigned unused_tail_length() { - if (!at_buffer_tail()) return 0; - return 
_buffer->unused_tail_length(); - } - - - - // modifiers - void set_offset(unsigned off) { - assert(off <= _buffer->_alloc_len); - _off = off; - } - void set_length(unsigned len) { - assert(len >= 0 && _off + len <= _buffer->_alloc_len); - if (_buffer->_len < _off + len) - _buffer->_len = _off + len; // set new buffer len (_IF_ i'm expanding it) - _len = len; // my len too - } - void zero() { - //bzero((void*)c_str(), _len); - memset((void*)c_str(), 0, _len); - } - - - // crope lookalikes - void append(const char *p, unsigned len) { - assert(len + _len + _off <= _buffer->_alloc_len); // FIXME later for auto-expansion? - - // copy - memcpy(c_str() + _len, p, len); - _buffer->_len += len; - _len += len; - } - void copy_out(unsigned off, unsigned len, char *dest) { - assert(off >= 0 && off <= _len); - assert(len >= 0 && off + len <= _len); - memcpy(dest, c_str() + off, len); - } - void copy_in(unsigned off, unsigned len, const char *src) { - assert(off >= 0 && off <= _len); - assert(len >= 0 && off + len <= _len); - memcpy(c_str() + off, src, len); - } - - friend ostream& operator<<(ostream& out, bufferptr& bp); -}; - - -inline ostream& operator<<(ostream& out, bufferptr& bp) { - return out << "bufferptr(len=" << bp._len << " off=" << bp._off - << " cstr=" << (void*)bp.c_str() - << " buf=" << *bp._buffer - << ")"; -} - - - -#endif diff --git a/branches/marnberg/quota/include/oldbufferlist.h b/branches/marnberg/quota/include/oldbufferlist.h deleted file mode 100644 index 466a5ead25d77..0000000000000 --- a/branches/marnberg/quota/include/oldbufferlist.h +++ /dev/null @@ -1,681 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __BUFFERLIST_H -#define __BUFFERLIST_H - -#include "buffer.h" - -#include -#include -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - - -// debug crap -#include "config.h" -#define bdbout(x) if (x <= g_conf.debug_buffer) cout - - - -class bufferlist { - private: - /* local state limited to _buffers, and _len. - * we maintain _len ourselves, so we must be careful when fiddling with buffers! - */ - list _buffers; - unsigned _len; - - public: - // cons/des - bufferlist() : _len(0) { - bdbout(1) << "bufferlist.cons " << this << endl; - } - bufferlist(const bufferlist& bl) : _len(0) { - //assert(0); // o(n) and stupid - bdbout(1) << "bufferlist.cons " << this << endl; - _buffers = bl._buffers; - _len = bl._len; - } - ~bufferlist() { - bdbout(1) << "bufferlist.des " << this << endl; - } - - bufferlist& operator=(bufferlist& bl) { - //assert(0); // actually, this should be fine, just slow (O(n)) and stupid. - bdbout(1) << "bufferlist.= " << this << endl; - _buffers = bl._buffers; - _len = bl._len; - return *this; - } - - - // accessors - list& buffers() { - return _buffers; - } - //list::iterator begin() { return _buffers.begin(); } - //list::iterator end() { return _buffers.end(); } - - unsigned length() const { -#if 0 - { // DEBUG: verify _len - int len = 0; - for (list::iterator it = _buffers.begin(); - it != _buffers.end(); - it++) { - len += (*it).length(); - } - assert(len == _len); - } -#endif - return _len; - } - - void _rope(crope& r) { - for (list::iterator it = _buffers.begin(); - it != _buffers.end(); - it++) - r.append((*it).c_str(), (*it).length()); - } - - // modifiers - void clear() { - _buffers.clear(); - _len = 0; - } - void push_front(bufferptr& bp) { - _buffers.push_front(bp); - _len += bp.length(); - } - void push_front(buffer *b) { - bufferptr bp(b); - _buffers.push_front(bp); - _len += bp.length(); - } - void push_back(bufferptr& bp) { - _buffers.push_back(bp); - _len += bp.length(); - } - void 
push_back(buffer *b) { - bufferptr bp(b); - - _buffers.push_back(bp); - _len += bp.length(); - - } - void zero() { - for (list::iterator it = _buffers.begin(); - it != _buffers.end(); - it++) - it->zero(); - } - - // sort-of-like-assignment-op - void claim(bufferlist& bl) { - // free my buffers - clear(); - claim_append(bl); - } - void claim_append(bufferlist& bl) { - // steal the other guy's buffers - _len += bl._len; - _buffers.splice( _buffers.end(), bl._buffers ); - bl._len = 0; - } - - - - - // crope lookalikes - void copy(unsigned off, unsigned len, char *dest) { - assert(off >= 0); - assert(off + len <= length()); - /*assert(off < length()); - if (off + len > length()) - len = length() - off; - */ - // advance to off - list::iterator curbuf = _buffers.begin(); - - // skip off - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! - break; - } - } - - // copy - while (len > 0) { - // is the rest ALL in this buffer? - if (off + len <= (*curbuf).length()) { - (*curbuf).copy_out(off, len, dest); // yup, last bit! - break; - } - - // get as much as we can from this buffer. - unsigned howmuch = (*curbuf).length() - off; - (*curbuf).copy_out(off, howmuch, dest); - - dest += howmuch; - len -= howmuch; - off = 0; - curbuf++; - assert(curbuf != _buffers.end()); - } - } - - void copy_in(unsigned off, unsigned len, const char *src) { - assert(off >= 0); - assert(off + len <= length()); - - // advance to off - list::iterator curbuf = _buffers.begin(); - - // skip off - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! - break; - } - } - - // copy - while (len > 0) { - // is the rest ALL in this buffer? 
- if (off + len <= (*curbuf).length()) { - (*curbuf).copy_in(off, len, src); // yup, last bit! - break; - } - - // get as much as we can from this buffer. - unsigned howmuch = (*curbuf).length() - off; - (*curbuf).copy_in(off, howmuch, src); - - src += howmuch; - len -= howmuch; - off = 0; - curbuf++; - assert(curbuf != _buffers.end()); - } - } - void copy_in(unsigned off, unsigned len, bufferlist& bl) { - unsigned left = len; - for (list::iterator i = bl._buffers.begin(); - i != bl._buffers.end(); - i++) { - unsigned l = (*i).length(); - if (left < l) l = left; - copy_in(off, l, (*i).c_str()); - left -= l; - if (left == 0) break; - off += l; - } - } - - - void append(const char *data, unsigned len) { - if (len == 0) return; - - unsigned alen = 0; - - // copy into the tail buffer? - if (!_buffers.empty()) { - unsigned avail = _buffers.back().unused_tail_length(); - if (avail > 0) { - //cout << "copying up to " << len << " into tail " << avail << " bytes of tail buf" << endl; - if (avail > len) - avail = len; - unsigned blen = _buffers.back().length(); - memcpy(_buffers.back().c_str() + blen, data, avail); - blen += avail; - _buffers.back().set_length(blen); - _len += avail; - data += avail; - len -= avail; - } - alen = _buffers.back().length(); - } - if (len == 0) return; - - // just add another buffer. - // alloc a bit extra, in case we do a bunch of appends. FIXME be smarter! - if (alen < 1024) alen = 1024; - push_back(new buffer(data, len, BUFFER_MODE_DEFAULT, len+alen)); - } - void append(bufferptr& bp) { - push_back(bp); - } - void append(bufferptr& bp, unsigned len, unsigned off) { - bufferptr tempbp(bp, len, off); - push_back(tempbp); - } - void append(const bufferlist& bl) { - bufferlist temp = bl; // copy list - claim_append(temp); // and append - } - - - /* - * return a contiguous ptr to whole bufferlist contents. - */ - char *c_str() { - if (_buffers.size() == 1) { - return _buffers.front().c_str(); // good, we're already contiguous. 
- } - else if (_buffers.size() == 0) { - return 0; // no buffers - } - else { - // make one new contiguous buffer. - bufferptr newbuf = new buffer(length()); - unsigned off = 0; - - for (list::iterator it = _buffers.begin(); - it != _buffers.end(); - it++) { - //assert((*(*it)).has_free_func() == false); // not allowed if there's a funky free_func.. -sage ...for debugging at least! - memcpy(newbuf.c_str() + off, - (*it).c_str(), (*it).length()); - off += (*it).length(); - } - assert(off == newbuf.length()); - - _buffers.clear(); - _buffers.push_back( newbuf ); - - // now it'll work. - return c_str(); - } - } - - - void substr_of(bufferlist& other, unsigned off, unsigned len) { - assert(off + len <= other.length()); - clear(); - - // skip off - list::iterator curbuf = other._buffers.begin(); - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - //cout << "skipping over " << *curbuf << endl; - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! - //cout << "somewhere in " << *curbuf << endl; - break; - } - } - - while (len > 0) { - // partial? - if (off + len < (*curbuf).length()) { - //cout << "copying partial of " << *curbuf << endl; - _buffers.push_back( bufferptr( *curbuf, len, off ) ); - _len += len; - break; - } - - // through end - //cout << "copying end (all?) of " << *curbuf << endl; - unsigned howmuch = (*curbuf).length() - off; - _buffers.push_back( bufferptr( *curbuf, howmuch, off ) ); - _len += howmuch; - len -= howmuch; - off = 0; - curbuf++; - } - } - - // funky modifer - void splice(unsigned off, unsigned len, bufferlist *claim_by=0 /*, bufferlist& replace_with */) { // fixme? - assert(off < length()); - assert(len > 0); - //cout << "splice off " << off << " len " << len << " ... 
mylen = " << length() << endl; - - // skip off - list::iterator curbuf = _buffers.begin(); - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - //cout << "off = " << off << " skipping over " << *curbuf << endl; - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! - //cout << "off = " << off << " somewhere in " << *curbuf << endl; - break; - } - } - assert(off >= 0); - - if (off) { - // add a reference to the front bit - // insert it before curbuf (which we'll hose) - //cout << "keeping front " << off << " of " << *curbuf << endl; - _buffers.insert( curbuf, bufferptr( *curbuf, off, 0 ) ); - _len += off; - } - - while (len > 0) { - // partial? - if (off + len < (*curbuf).length()) { - //cout << "keeping end of " << *curbuf << ", losing first " << off+len << endl; - if (claim_by) - claim_by->append( *curbuf, len, off ); - (*curbuf).set_offset( off+len + (*curbuf).offset() ); // ignore beginning big - (*curbuf).set_length( (*curbuf).length() - (len+off) ); - _len -= off+len; - //cout << " now " << *curbuf << endl; - break; - } - - // hose though the end - unsigned howmuch = (*curbuf).length() - off; - //cout << "discarding " << howmuch << " of " << *curbuf << endl; - if (claim_by) - claim_by->append( *curbuf, howmuch, off ); - _len -= (*curbuf).length(); - _buffers.erase( curbuf++ ); - len -= howmuch; - off = 0; - } - - // splice in *replace (implement me later?) 
- } - - friend ostream& operator<<(ostream& out, bufferlist& bl); - -}; - -inline ostream& operator<<(ostream& out, bufferlist& bl) { - out << "bufferlist(len=" << bl.length() << endl; - for (list::iterator it = bl._buffers.begin(); - it != bl._buffers.end(); - it++) - out << "\t" << *it << endl; - out << ")" << endl; - return out; -} - - - -// encoder/decode helpers - -// string -inline void _encode(const string& s, bufferlist& bl) -{ - bl.append(s.c_str(), s.length()+1); -} -inline void _decode(string& s, bufferlist& bl, int& off) -{ - s = bl.c_str() + off; - off += s.length() + 1; -} - -// bufferptr (encapsulated) -inline void _encode(bufferptr& bp, bufferlist& bl) -{ - size_t len = bp.length(); - bl.append((char*)&len, sizeof(len)); - bl.append(bp); -} -inline void _decode(bufferptr& bp, bufferlist& bl, int& off) -{ - size_t len; - bl.copy(off, sizeof(len), (char*)&len); - off += sizeof(len); - bufferlist s; - s.substr_of(bl, off, len); - off += len; - - if (s.buffers().size() == 1) - bp = s.buffers().front(); - else - bp = new buffer(s.c_str(), s.length()); -} - -// bufferlist (encapsulated) -inline void _encode(const bufferlist& s, bufferlist& bl) -{ - size_t len = s.length(); - bl.append((char*)&len, sizeof(len)); - bl.append(s); -} -inline void _decode(bufferlist& s, bufferlist& bl, int& off) -{ - size_t len; - bl.copy(off, sizeof(len), (char*)&len); - off += sizeof(len); - s.substr_of(bl, off, len); - off += len; -} - - -// set -template -inline void _encode(set& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename set::iterator it = s.begin(); - it != s.end(); - it++) { - T v = *it; - bl.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); -} -template -inline void _decode(set& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -template -inline void _encode(vector& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, 
sizeof(n)); - for (typename vector::iterator it = s.begin(); - it != s.end(); - it++) { - T v = *it; - bl.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); -} -template -inline void _decode(vector& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - s = vector(n); - for (int i=0; i -template -inline void _encode(const list& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename list::const_iterator it = s.begin(); - it != s.end(); - it++) { - T v = *it; - bl.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); -} -template -inline void _decode(list& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -inline void _encode(map& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (map::iterator it = s.begin(); - it != s.end(); - it++) { - _encode(it->first, bl); - _encode(it->second, bl); - n--; - } - assert(n==0); -} -inline void _decode(map& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -template -inline void _encode(const map& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename map::const_iterator it = s.begin(); - it != s.end(); - it++) { - T k = it->first; - bl.append((char*)&k, sizeof(k)); - _encode(it->second, bl); - n--; - } - assert(n==0); -} -template -inline void _decode(map& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -template -inline void _encode(const map& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename map::const_iterator it = s.begin(); - it != s.end(); - it++) { - T k = it->first; - U v = it->second; - bl.append((char*)&k, sizeof(k)); - bl.append((char*)&v, sizeof(v)); - n--; - } - 
assert(n==0); -} -template -inline void _decode(map& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __RANGESET_H -#define __RANGESET_H - -/* - * - * my first container with iterator! it's pretty ugly. - * - */ - -#include -#include -#include -using namespace std; - -//typedef int T; - -template -struct _rangeset_base { - map ranges; // pair(first,last) (inclusive, e.g. [first,last]) - - typedef typename map::iterator mapit; - - // get iterator for range including val. or ranges.end(). - mapit get_range_for(T val) { - mapit it = ranges.lower_bound(val); - if (it == ranges.end()) { - // search backwards - typename map::reverse_iterator it = ranges.rbegin(); - if (it == ranges.rend()) return ranges.end(); - if (it->first <= val && it->second >= val) - return ranges.find(it->first); - return ranges.end(); - } else { - if (it->first == val) return - it--; - if (it->first <= val && it->second >= val) - return it; - return ranges.end(); - } - } - -}; - - -template -class rangeset_iterator : - public std::iterator -{ - //typedef typename map::iterator mapit; - - map ranges; - typename map::iterator it; - T current; - -public: - // cons - rangeset_iterator() {} - - rangeset_iterator(typename map::iterator& it, map& ranges) { - this->ranges = ranges; - this->it = it; - if (this->it != ranges.end()) - current = it->first; - } - - bool operator==(rangeset_iterator rit) { - return (it == rit.it && rit.current == current); - } - bool operator!=(rangeset_iterator rit) { - return (it != rit.it) || (rit.current != current); - } - - T& operator*() { - return current; - } - - rangeset_iterator operator++(int) { - if (current < it->second) - current++; - else { - 
it++; - if (it != ranges.end()) - current = it->first; - } - - return *this; - } -}; - - -template -class rangeset -{ - typedef typename map::iterator map_iterator; - - _rangeset_base theset; - inodeno_t _size; - -public: - rangeset() { _size = 0; } - typedef rangeset_iterator iterator; - - iterator begin() { - map_iterator it = theset.ranges.begin(); - return iterator(it, theset.ranges); - } - - iterator end() { - map_iterator it = theset.ranges.end(); - return iterator(it, theset.ranges); - } - - map_iterator map_begin() { - return theset.ranges.begin(); - } - map_iterator map_end() { - return theset.ranges.end(); - } - int map_size() { - return theset.ranges.size(); - } - - void map_insert(T v1, T v2) { - theset.ranges.insert(pair(v1,v2)); - _size += v2 - v1+1; - } - - - // ... - bool contains(T val) { - if (theset.get_range_for(val) == theset.ranges.end()) return false; - assert(!empty()); - return true; - } - - void insert(T val) { - assert(!contains(val)); - - map_iterator left = theset.get_range_for(val-1); - map_iterator right = theset.get_range_for(val+1); - - if (left != theset.ranges.end() && - right != theset.ranges.end()) { - // join! 
- left->second = right->second; - theset.ranges.erase(right); - _size++; - return; - } - - if (left != theset.ranges.end()) { - // add to left range - left->second = val; - _size++; - return; - } - - if (right != theset.ranges.end()) { - // add to right range - theset.ranges.insert(pair(val, right->second)); - theset.ranges.erase(val+1); - _size++; - return; - } - - // new range - theset.ranges.insert(pair(val,val)); - _size++; - return; - } - - unsigned size() { - return size(); - } - - bool empty() { - if (theset.ranges.empty()) { - assert(_size == 0); - return true; - } - assert(_size>0); - return false; - } - - - T first() { - assert(!empty()); - map_iterator it = theset.ranges.begin(); - return it->first; - } - - void erase(T val) { - assert(contains(val)); - map_iterator it = theset.get_range_for(val); - assert(it != theset.ranges.end()); - - // entire range - if (val == it->first && val == it->second) { - theset.ranges.erase(it); - _size--; - return; - } - - // beginning - if (val == it->first) { - theset.ranges.insert(pair(val+1, it->second)); - theset.ranges.erase(it); - _size--; - return; - } - - // end - if (val == it->second) { - it->second = val-1; - _size--; - return; - } - - // middle split - theset.ranges.insert(pair(it->first, val-1)); - theset.ranges.insert(pair(val+1, it->second)); - theset.ranges.erase(it); - _size--; - return; - } - - void dump() { - for (typename map::iterator it = theset.ranges.begin(); - it != theset.ranges.end(); - it++) { - cout << " " << it->first << "-" << it->second << endl; - } - } - -}; - - -#endif diff --git a/branches/marnberg/quota/include/reqid.h b/branches/marnberg/quota/include/reqid.h deleted file mode 100644 index 3c71fbae69ab6..0000000000000 --- a/branches/marnberg/quota/include/reqid.h +++ /dev/null @@ -1,64 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; 
you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __REQID_H -#define __REQID_H - - -#include "include/types.h" -#include "msg/msg_types.h" - -/* reqid_t - caller name + incarnation# + tid to unique identify this request - * use for metadata and osd ops. - */ -class reqid_t { -public: - entity_name_t name; // who - int inc; // incarnation - tid_t tid; - reqid_t() : inc(0), tid(0) {} - reqid_t(const entity_name_t& a, int i, tid_t t) : name(a), inc(i), tid(t) {} -}; - -inline ostream& operator<<(ostream& out, const reqid_t& r) { - return out << r.name << "." << r.inc << ":" << r.tid; -} - -inline bool operator==(const reqid_t& l, const reqid_t& r) { - return (l.name == r.name) && (l.inc == r.inc) && (l.tid == r.tid); -} -inline bool operator!=(const reqid_t& l, const reqid_t& r) { - return (l.name != r.name) || (l.inc != r.inc) || (l.tid != r.tid); -} -inline bool operator<(const reqid_t& l, const reqid_t& r) { - return (l.name < r.name) || (l.inc < r.inc) || - (l.name == r.name && l.inc == r.inc && l.tid < r.tid); -} -inline bool operator<=(const reqid_t& l, const reqid_t& r) { - return (l.name < r.name) || (l.inc < r.inc) || - (l.name == r.name && l.inc == r.inc && l.tid <= r.tid); -} -inline bool operator>(const reqid_t& l, const reqid_t& r) { return !(l <= r); } -inline bool operator>=(const reqid_t& l, const reqid_t& r) { return !(l < r); } - -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const reqid_t &r) const { - static blobhash H; - return H((const char*)&r, sizeof(r)); - } - }; -} - - -#endif diff --git a/branches/marnberg/quota/include/statlite.h b/branches/marnberg/quota/include/statlite.h deleted file mode 100644 index 60a977e49a499..0000000000000 --- a/branches/marnberg/quota/include/statlite.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef _STATLITE_H -#define _STATLITE_H - -extern 
"C" { - -#include -#include -#include -#include -#include - -struct statlite { - dev_t st_dev; /* device */ - ino_t st_ino; /* inode */ - mode_t st_mode; /* protection */ - nlink_t st_nlink; /* number of hard links */ - uid_t st_uid; /* user ID of owner */ - gid_t st_gid; /* group ID of owner */ - dev_t st_rdev; /* device type (if inode device)*/ - unsigned long st_litemask; /* bit mask for optional fields */ - /***************************************************************/ - /**** Remaining fields are optional according to st_litemask ***/ - off_t st_size; /* total size, in bytes */ - blksize_t st_blksize; /* blocksize for filesystem I/O */ - blkcnt_t st_blocks; /* number of blocks allocated */ - struct timespec st_atim; /* Time of last access. */ - struct timespec st_mtim; /* Time of last modification. */ - struct timespec st_ctim; /* Time of last status change. */ - //time_t st_atime; /* time of last access */ - //time_t st_mtime; /* time of last modification */ - //time_t st_ctime; /* time of last change */ -}; - -#define S_STATLITE_SIZE 1 -#define S_STATLITE_BLKSIZE 2 -#define S_STATLITE_BLOCKS 4 -#define S_STATLITE_ATIME 8 -#define S_STATLITE_MTIME 16 -#define S_STATLITE_CTIME 32 - -#define S_REQUIRESIZE(m) (m | S_STATLITE_SIZE) -#define S_REQUIREBLKSIZE(m) (m | S_STATLITE_BLKSIZE) -#define S_REQUIREBLOCKS(m) (m | S_STATLITE_BLOCKS) -#define S_REQUIREATIME(m) (m | S_STATLITE_ATIME) -#define S_REQUIREMTIME(m) (m | S_STATLITE_MTIME) -#define S_REQUIRECTIME(m) (m | S_STATLITE_CTIME) - -#define S_ISVALIDSIZE(m) (m & S_STATLITE_SIZE) -#define S_ISVALIDBLKSIZE(m) (m & S_STATLITE_BLKSIZE) -#define S_ISVALIDBLOCKS(m) (m & S_STATLITE_BLOCKS) -#define S_ISVALIDATIME(m) (m & S_STATLITE_ATIME) -#define S_ISVALIDMTIME(m) (m & S_STATLITE_MTIME) -#define S_ISVALIDCTIME(m) (m & S_STATLITE_CTIME) - - -// readdirplus etc. 
- -struct dirent_plus { - struct dirent d_dirent; /* dirent struct for this entry */ - struct stat d_stat; /* attributes for this entry */ - int d_stat_err;/* errno for d_stat, or 0 */ -}; -struct dirent_lite { - struct dirent d_dirent; /* dirent struct for this entry */ - struct statlite d_stat; /* attributes for this entry */ - int d_stat_err;/* errno for d_stat, or 0 */ -}; - -} -#endif diff --git a/branches/marnberg/quota/include/types.h b/branches/marnberg/quota/include/types.h deleted file mode 100644 index 72893cb62141b..0000000000000 --- a/branches/marnberg/quota/include/types.h +++ /dev/null @@ -1,367 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_TYPES_H -#define __MDS_TYPES_H - -extern "C" { -#include -#include -#include -#include "statlite.h" -} - -#include -#include -#include -#include -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "object.h" - -#ifndef MIN -# define MIN(a,b) ((a) < (b) ? (a):(b)) -#endif -#ifndef MAX -# define MAX(a,b) ((a) > (b) ? (a):(b)) -#endif - - -// -- stl crap -- - -/* -- this is to make some of the STL types work with 64 bit values, string hash keys, etc. -- added when i was using an old STL.. maybe try taking these out and see if things - compile now? 
-*/ - -class blobhash { -public: - size_t operator()(const char *p, unsigned len) { - static hash H; - long acc = 0; - while (len >= sizeof(long)) { - acc ^= *(long*)p; - p += sizeof(long); - len -= sizeof(long); - } - int sh = 0; - while (len) { - acc ^= (long)*p << sh; - sh += 8; - len--; - p++; - } - return H(acc); - } -}; - - -namespace __gnu_cxx { - template<> struct hash< std::string > - { - size_t operator()( const std::string& x ) const - { - static hash H; - return H(x.c_str()); - } - }; - -#ifndef __LP64__ - template<> struct hash<__int64_t> { - size_t operator()(__int64_t __x) const { - static hash<__int32_t> H; - return H((__x >> 32) ^ (__x & 0xffffffff)); - } - }; -#endif - -} - - -/* - * comparators for stl containers - */ -// for hash_map: -// hash_map, eqstr> vals; -struct eqstr -{ - bool operator()(const char* s1, const char* s2) const - { - return strcmp(s1, s2) == 0; - } -}; - -// for set, map -struct ltstr -{ - bool operator()(const char* s1, const char* s2) const - { - return strcmp(s1, s2) < 0; - } -}; - - - -// ---------------------- -// some basic types - -typedef __uint64_t tid_t; // transaction id -typedef __uint64_t version_t; -typedef __uint32_t epoch_t; // map epoch (32bits -> 13 epochs/second for 10 years) - - - - - -/** object layout - * how objects are mapped into PGs - */ -#define OBJECT_LAYOUT_DEFAULT 0 // see g_conf -#define OBJECT_LAYOUT_HASH 1 -#define OBJECT_LAYOUT_LINEAR 2 -#define OBJECT_LAYOUT_HASHINO 3 -#define OBJECT_LAYOUT_STARTOSD 4 - -/** pg layout - * how PGs are mapped into (sets of) OSDs - */ -#define PG_LAYOUT_CRUSH 0 -#define PG_LAYOUT_HASH 1 -#define PG_LAYOUT_LINEAR 2 -#define PG_LAYOUT_HYBRID 3 - -/** FileLayout - * specifies a striping and replication strategy - */ - -//#define FILE_LAYOUT_CRUSH 0 // stripe via crush -//#define FILE_LAYOUT_LINEAR 1 // stripe linearly across cluster - -struct FileLayout { - // layout - int object_layout; - - // FIXME: make this a union? 
- // rushstripe - int stripe_size; // stripe unit, in bytes - int stripe_count; // over this many objects - int object_size; // until objects are this big, then use a new set of objects. - - // period = bytes before i start on a new set of objects. - int period() { return object_size * stripe_count; } - - int osd; // osdlocal - - int num_rep; // replication - - FileLayout() { } - FileLayout(int ss, int sc, int os, int nr=2, int o=-1) : - object_layout(o < 0 ? OBJECT_LAYOUT_DEFAULT:OBJECT_LAYOUT_STARTOSD), - stripe_size(ss), stripe_count(sc), object_size(os), - osd(o), - num_rep(nr) { } - -}; - - - -// -- inode -- - -struct inodeno_t { - __uint64_t val; - inodeno_t() : val() {} - inodeno_t(__uint64_t v) : val(v) {} - inodeno_t operator+=(inodeno_t o) { val += o.val; return *this; } - operator __uint64_t() const { return val; } -}; - -inline ostream& operator<<(ostream& out, inodeno_t ino) { - return out << hex << ino.val << dec; -} - -namespace __gnu_cxx { - template<> struct hash< inodeno_t > - { - size_t operator()( const inodeno_t& x ) const - { - static hash<__uint64_t> H; - return H(x.val); - } - }; -} - - -#define INODE_MODE_FILE 0100000 // S_IFREG -#define INODE_MODE_SYMLINK 0120000 // S_IFLNK -#define INODE_MODE_DIR 0040000 // S_IFDIR -#define INODE_TYPE_MASK 0170000 - -#define FILE_MODE_R 1 -#define FILE_MODE_W 2 -#define FILE_MODE_RW (1|2) -#define FILE_MODE_LAZY 4 - -#define INODE_MASK_BASE 1 // ino, ctime, nlink -#define INODE_MASK_PERM 2 // uid, gid, mode -#define INODE_MASK_SIZE 4 // size, blksize, blocks -#define INODE_MASK_MTIME 8 // mtime -#define INODE_MASK_ATIME 16 // atime - -#define INODE_MASK_ALL_STAT (INODE_MASK_BASE|INODE_MASK_PERM|INODE_MASK_SIZE|INODE_MASK_MTIME) -//#define INODE_MASK_ALL_STAT (INODE_MASK_BASE|INODE_MASK_PERM|INODE_MASK_SIZE|INODE_MASK_MTIME|INODE_MASK_ATIME) - -struct inode_t { - // base (immutable) - inodeno_t ino; // NOTE: ino _must_ come first for MDStore.cc to behave!! 
- time_t ctime; - - // other - FileLayout layout; // ?immutable? - int nlink; // base, - - // hard/perm (namespace permissions) - mode_t mode; - uid_t uid; - gid_t gid; - - // file (data access) - off_t size; - time_t atime, mtime; // maybe atime different? "lazy"? - - int mask; - - // special stuff - version_t version; // auth only - unsigned char hash_seed; // only defined for dir; 0 if not hashed. - bool anchored; // auth only - version_t file_data_version; // auth only - - bool is_symlink() { return (mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK; } - bool is_dir() { return (mode & INODE_TYPE_MASK) == INODE_MODE_DIR; } - bool is_file() { return (mode & INODE_TYPE_MASK) == INODE_MODE_FILE; } -}; - - - - -// client types -typedef int fh_t; // file handle - - -// dentries -#define MAX_DENTRY_LEN 255 - - - - -// -- io helpers -- - -template -inline ostream& operator<<(ostream& out, vector& v) { - out << "["; - for (unsigned i=0; i -inline ostream& operator<<(ostream& out, const set& iset) { - for (typename set::const_iterator it = iset.begin(); - it != iset.end(); - it++) { - if (it != iset.begin()) out << ","; - out << *it; - } - return out; -} - -template -inline ostream& operator<<(ostream& out, const multiset& iset) { - for (typename multiset::const_iterator it = iset.begin(); - it != iset.end(); - it++) { - if (it != iset.begin()) out << ","; - out << *it; - } - return out; -} - -template -inline ostream& operator<<(ostream& out, const map& m) -{ - out << "{"; - for (typename map::const_iterator it = m.begin(); - it != m.end(); - it++) { - if (it != m.begin()) out << ","; - out << it->first << "=" << it->second; - } - out << "}"; - return out; -} - - - - -// -- rope helpers -- - -// string -inline void _rope(string& s, crope& r) -{ - r.append(s.c_str(), s.length()+1); -} -inline void _unrope(string& s, crope& r, int& off) -{ - s = r.c_str() + off; - off += s.length() + 1; -} - -// set -inline void _rope(set& s, crope& r) -{ - int n = s.size(); - 
r.append((char*)&n, sizeof(n)); - for (set::iterator it = s.begin(); - it != s.end(); - it++) { - int v = *it; - r.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); -} -inline void _unrope(set& s, crope& r, int& off) -{ - s.clear(); - int n; - r.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -/* - * uofs.h - * - * user-level object-based file system - */ - - #ifndef _UOFS_H_ - #define _UOFS_H_ - - #include - #include - #include - - - int device_open(char *path, int xflags); - void device_findsizes(int fd, long long *sz, int *bsz); - - int uofs_format(int bdev_id, int donode_size, int bd_ratio, int reg_size, int sb_size, int lb_size, - int nr_hash_table_buckets, int delay_allocation, int flush_interval); - - int uofs_mount(int bdev_id); - void uofs_shutdown(void); - - int uofs_read(long long oid, void *buf, off_t offset, size_t count); - int uofs_write(long long oid, void *buf, off_t offset, size_t count); - int uofs_del(long long oid); - int uofs_sync(long long oid); - int uofs_exist(long long oid); - - int uofs_get_size(long long oid); - - void uofs_superblock_printout(void); - int get_large_object_pages(void); - - int uofs_buffer_size(void); - #endif diff --git a/branches/marnberg/quota/jobs/rados/wr_sizes b/branches/marnberg/quota/jobs/rados/wr_sizes deleted file mode 100644 index 9b73477ea6142..0000000000000 --- a/branches/marnberg/quota/jobs/rados/wr_sizes +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => [8],#10,14,16], - 'numclient' => [10*16], - 'n' => 15, - - 'fs' => 'ebofs', - - 'start' => 60, - 'end' => 90, - 'until' => 90, - 'kill_after' => 190, - - 'writefile' => 1, - 'writefile_size' => [4096, - 8*1024, - 16*1024, - 32*1024, - 
64*1024, - 128*1024, - 256*1024, - # 512*1024, -# 4*1024*1024, -# 2*1024*1024, -# 1024*1024 -], - 'writefile_mb' => 10000, - - 'file_layout_num_rep' => 1, - 'file_layout_ssize' => 4*1024*1024, - 'file_layout_osize' => 4*1024*1024, - - 'osd_pg_bits' => 12, - -# 'ebofs_freelist' => [0, 1080, 65400], - - 'custom' => '--objecter_buffer_uncommitted 0', - - #'custom' => '--tcp_skip_rank0', - - 'comb' => { - 'x' => 'writefile_size', - 'vars' => [ 'osd.c_wrb', 'cl.wrlat' ] - } -}; diff --git a/branches/marnberg/quota/mds/Anchor.h b/branches/marnberg/quota/mds/Anchor.h deleted file mode 100644 index 8da2bbdb52cd5..0000000000000 --- a/branches/marnberg/quota/mds/Anchor.h +++ /dev/null @@ -1,55 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __ANCHOR_H -#define __ANCHOR_H - -#include -using std::string; - -#include "include/types.h" -#include "include/buffer.h" - -class Anchor { -public: - inodeno_t ino; // my ino - inodeno_t dirino; // containing dir - string ref_dn; // referring dentry - int nref; // reference count - - Anchor() {} - Anchor(inodeno_t ino, inodeno_t dirino, string& ref_dn, int nref=0) { - this->ino = ino; - this->dirino = dirino; - this->ref_dn = ref_dn; - this->nref = nref; - } - - void _encode(bufferlist &bl) { - bl.append((char*)&ino, sizeof(ino)); - bl.append((char*)&dirino, sizeof(dirino)); - bl.append((char*)&nref, sizeof(nref)); - ::_encode(ref_dn, bl); - } - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - bl.copy(off, sizeof(dirino), (char*)&dirino); - off += sizeof(dirino); - bl.copy(off, sizeof(nref), (char*)&nref); - off += sizeof(nref); - ::_decode(ref_dn, bl, off); - } -} ; - -#endif diff --git a/branches/marnberg/quota/mds/AnchorClient.cc b/branches/marnberg/quota/mds/AnchorClient.cc deleted file mode 100644 index af84eb6c2448a..0000000000000 --- a/branches/marnberg/quota/mds/AnchorClient.cc +++ /dev/null @@ -1,149 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -using std::cout; -using std::cerr; -using std::endl; - -#include "Anchor.h" -#include "AnchorClient.h" -#include "MDSMap.h" - -#include "include/Context.h" -#include "msg/Messenger.h" - -#include "MDS.h" - -#include "messages/MAnchorRequest.h" -#include "messages/MAnchorReply.h" - -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug) cout << g_clock.now() << " " << messenger->get_myaddr() << ".anchorclient " -#define derr(x) if (x <= g_conf.debug) cout << g_clock.now() << " " << messenger->get_myaddr() << ".anchorclient " - - -void AnchorClient::dispatch(Message *m) -{ - switch (m->get_type()) { - case MSG_MDS_ANCHORREPLY: - handle_anchor_reply((MAnchorReply*)m); - break; - - default: - assert(0); - } -} - -void AnchorClient::handle_anchor_reply(class MAnchorReply *m) -{ - switch (m->get_op()) { - - case ANCHOR_OP_LOOKUP: - { - assert(pending_lookup_trace.count(m->get_ino()) == 1); - - *(pending_lookup_trace[ m->get_ino() ]) = m->get_trace(); - Context *onfinish = pending_lookup_context[ m->get_ino() ]; - - pending_lookup_trace.erase(m->get_ino()); - pending_lookup_context.erase(m->get_ino()); - - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } - } - break; - - case ANCHOR_OP_UPDATE: - case ANCHOR_OP_CREATE: - case ANCHOR_OP_DESTROY: - { - assert(pending_op.count(m->get_ino()) == 1); - - Context *onfinish = pending_op[m->get_ino()]; - pending_op.erase(m->get_ino()); - - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } - } - break; - - default: - assert(0); - } - -} - - - -/* - * public async interface - */ - -void AnchorClient::lookup(inodeno_t ino, vector& trace, Context *onfinish) -{ - // send message - MAnchorRequest *req = new MAnchorRequest(ANCHOR_OP_LOOKUP, ino); - - pending_lookup_trace[ino] = &trace; - pending_lookup_context[ino] = onfinish; - - messenger->send_message(req, - mdsmap->get_inst(mdsmap->get_anchortable()), - MDS_PORT_ANCHORMGR, MDS_PORT_ANCHORCLIENT); -} - -void 
AnchorClient::create(inodeno_t ino, vector& trace, Context *onfinish) -{ - // send message - MAnchorRequest *req = new MAnchorRequest(ANCHOR_OP_CREATE, ino); - req->set_trace(trace); - - pending_op[ino] = onfinish; - - messenger->send_message(req, - mdsmap->get_inst(mdsmap->get_anchortable()), - MDS_PORT_ANCHORMGR, MDS_PORT_ANCHORCLIENT); -} - -void AnchorClient::update(inodeno_t ino, vector& trace, Context *onfinish) -{ - // send message - MAnchorRequest *req = new MAnchorRequest(ANCHOR_OP_UPDATE, ino); - req->set_trace(trace); - - pending_op[ino] = onfinish; - - messenger->send_message(req, - mdsmap->get_inst(mdsmap->get_anchortable()), - MDS_PORT_ANCHORMGR, MDS_PORT_ANCHORCLIENT); -} - -void AnchorClient::destroy(inodeno_t ino, Context *onfinish) -{ - // send message - MAnchorRequest *req = new MAnchorRequest(ANCHOR_OP_DESTROY, ino); - - pending_op[ino] = onfinish; - - messenger->send_message(req, - mdsmap->get_inst(mdsmap->get_anchortable()), - MDS_PORT_ANCHORMGR, MDS_PORT_ANCHORCLIENT); -} - - diff --git a/branches/marnberg/quota/mds/AnchorClient.h b/branches/marnberg/quota/mds/AnchorClient.h deleted file mode 100644 index 80b736a4b65c7..0000000000000 --- a/branches/marnberg/quota/mds/AnchorClient.h +++ /dev/null @@ -1,55 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __ANCHORCLIENT_H -#define __ANCHORCLIENT_H - -#include -using std::vector; -#include -using __gnu_cxx::hash_map; - -#include "include/types.h" -#include "msg/Dispatcher.h" - -#include "Anchor.h" - -class Messenger; -class MDSMap; -class Context; - -class AnchorClient : public Dispatcher { - Messenger *messenger; - MDSMap *mdsmap; - - // remote state - hash_map pending_op; - hash_map pending_lookup_context; - hash_map*> pending_lookup_trace; - - void handle_anchor_reply(class MAnchorReply *m); - - -public: - AnchorClient(Messenger *ms, MDSMap *mm) : messenger(ms), mdsmap(mm) {} - - // async user interface - void lookup(inodeno_t ino, vector& trace, Context *onfinish); - void create(inodeno_t ino, vector& trace, Context *onfinish); - void update(inodeno_t ino, vector& trace, Context *onfinish); - void destroy(inodeno_t ino, Context *onfinish); - - void dispatch(Message *m); -}; - -#endif diff --git a/branches/marnberg/quota/mds/AnchorTable.cc b/branches/marnberg/quota/mds/AnchorTable.cc deleted file mode 100644 index 6f380b0908d8d..0000000000000 --- a/branches/marnberg/quota/mds/AnchorTable.cc +++ /dev/null @@ -1,358 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "AnchorTable.h" -#include "MDS.h" - -#include "osdc/Filer.h" - -#include "msg/Messenger.h" -#include "messages/MAnchorRequest.h" -#include "messages/MAnchorReply.h" - -#include "common/Clock.h" - -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug_mds) cout << g_clock.now() << " " << mds->messenger->get_myaddr() << ".anchortable " -#define derr(x) if (x <= g_conf.debug_mds) cerr << g_clock.now() << " " << mds->messenger->get_myaddr() << ".anchortable " - -AnchorTable::AnchorTable(MDS *mds) -{ - this->mds = mds; - opening = false; - opened = false; -} - -void AnchorTable::init_inode() -{ - memset(&table_inode, 0, sizeof(table_inode)); - table_inode.ino = MDS_INO_ANCHORTABLE+mds->get_nodeid(); - table_inode.layout = g_OSD_FileLayout; -} - -void AnchorTable::reset() -{ - init_inode(); - opened = true; - anchor_map.clear(); -} - -/* - * basic updates - */ - -bool AnchorTable::add(inodeno_t ino, inodeno_t dirino, string& ref_dn) -{ - dout(7) << "add " << std::hex << ino << " dirino " << dirino << std::dec << " ref_dn " << ref_dn << endl; - - // parent should be there - assert(dirino < 1000 || // system dirino - anchor_map.count(dirino)); // have - - if (anchor_map.count(ino) == 0) { - // new item - anchor_map[ ino ] = new Anchor(ino, dirino, ref_dn); - dout(10) << " add: added " << std::hex << ino << std::dec << endl; - return true; - } else { - dout(10) << " add: had " << std::hex << ino << std::dec << endl; - return false; - } -} - -void AnchorTable::inc(inodeno_t ino) -{ - dout(7) << "inc " << std::hex << ino << std::dec << endl; - - assert(anchor_map.count(ino) != 0); - Anchor *anchor = anchor_map[ino]; - assert(anchor); - - while (1) { - anchor->nref++; - - dout(10) << " inc: record " << std::hex << ino << std::dec << " now " << anchor->nref << endl; - ino = anchor->dirino; - - if (ino == 0) break; - if (anchor_map.count(ino) == 0) break; - anchor = anchor_map[ino]; - assert(anchor); - } -} - -void 
AnchorTable::dec(inodeno_t ino) -{ - dout(7) << "dec " << std::hex << ino << std::dec << endl; - - assert(anchor_map.count(ino) != 0); - Anchor *anchor = anchor_map[ino]; - assert(anchor); - - while (true) { - anchor->nref--; - - if (anchor->nref == 0) { - dout(10) << " dec: record " << std::hex << ino << std::dec << " now 0, removing" << endl; - inodeno_t dirino = anchor->dirino; - anchor_map.erase(ino); - delete anchor; - ino = dirino; - } else { - dout(10) << " dec: record " << std::hex << ino << std::dec << " now " << anchor->nref << endl; - ino = anchor->dirino; - } - - if (ino == 0) break; - if (anchor_map.count(ino) == 0) break; - anchor = anchor_map[ino]; - assert(anchor); - } -} - - -/* - * high level - */ - -void AnchorTable::lookup(inodeno_t ino, vector& trace) -{ - dout(7) << "lookup " << std::hex << ino << std::dec << endl; - - assert(anchor_map.count(ino) == 1); - Anchor *anchor = anchor_map[ino]; - assert(anchor); - - while (true) { - dout(10) << " record " << std::hex << anchor->ino << " dirino " << anchor->dirino << std::dec << " ref_dn " << anchor->ref_dn << endl; - trace.insert(trace.begin(), anchor); // lame FIXME - - if (anchor->dirino < MDS_INO_BASE) break; - - assert(anchor_map.count(anchor->dirino) == 1); - anchor = anchor_map[anchor->dirino]; - assert(anchor); - } -} - -void AnchorTable::create(inodeno_t ino, vector& trace) -{ - dout(7) << "create " << std::hex << ino << std::dec << endl; - - // make sure trace is in table - for (unsigned i=0; iino, trace[i]->dirino, trace[i]->ref_dn); - - inc(ino); // ok! -} - -void AnchorTable::destroy(inodeno_t ino) -{ - dec(ino); -} - - - -/* - * messages - */ - -void AnchorTable::dispatch(Message *m) -{ - switch (m->get_type()) { - case MSG_MDS_ANCHORREQUEST: - handle_anchor_request((MAnchorRequest*)m); - break; - - default: - assert(0); - } -} - - - -void AnchorTable::handle_anchor_request(class MAnchorRequest *m) -{ - // make sure i'm open! 
- if (!opened) { - dout(7) << "not open yet" << endl; - - waiting_for_open.push_back(new C_MDS_RetryMessage(mds,m)); - - if (!opening) { - opening = true; - load(0); - } - return; - } - - // go - MAnchorReply *reply = new MAnchorReply(m); - - switch (m->get_op()) { - - case ANCHOR_OP_LOOKUP: - lookup( m->get_ino(), reply->get_trace() ); - break; - - case ANCHOR_OP_UPDATE: - destroy( m->get_ino() ); - create( m->get_ino(), m->get_trace() ); - break; - - case ANCHOR_OP_CREATE: - create( m->get_ino(), m->get_trace() ); - break; - - case ANCHOR_OP_DESTROY: - destroy( m->get_ino() ); - break; - - default: - assert(0); - } - - // send reply - mds->messenger->send_message(reply, m->get_source_inst(), m->get_source_port()); - delete m; -} - - - - -// primitive load/save for now! - -// load/save entire table for now! - -void AnchorTable::save(Context *onfinish) -{ - dout(7) << "save" << endl; - if (!opened) return; - - // build up write - bufferlist tabbl; - - int num = anchor_map.size(); - tabbl.append((char*)&num, sizeof(int)); - - for (hash_map::iterator it = anchor_map.begin(); - it != anchor_map.end(); - it++) { - dout(14) << " saving anchor for " << std::hex << it->first << std::dec << endl; - Anchor *a = it->second; - assert(a); - a->_encode(tabbl); - } - - bufferlist bl; - size_t size = tabbl.length(); - bl.append((char*)&size, sizeof(size)); - bl.claim_append(tabbl); - - dout(7) << " " << num << " anchors, " << size << " bytes" << endl; - - // write! 
- mds->filer->write(table_inode, - 0, bl.length(), - bl, 0, - NULL, onfinish); -} - - - -class C_AT_Load : public Context { - AnchorTable *at; -public: - size_t size; - bufferlist bl; - C_AT_Load(size_t size, AnchorTable *at) { - this->size = size; - this->at = at; - } - void finish(int result) { - assert(result > 0); - - at->load_2(size, bl); - } -}; - -class C_AT_LoadSize : public Context { - AnchorTable *at; - MDS *mds; -public: - bufferlist bl; - C_AT_LoadSize(AnchorTable *at, MDS *mds) { - this->at = at; - this->mds = mds; - } - void finish(int r) { - size_t size = 0; - assert(bl.length() >= sizeof(size)); - bl.copy(0, sizeof(size), (char*)&size); - cout << "r is " << r << " size is " << size << endl; - if (r > 0 && size > 0) { - C_AT_Load *c = new C_AT_Load(size, at); - mds->filer->read(at->table_inode, - sizeof(size), size, - &c->bl, - c); - } else { - // fail - bufferlist empty; - at->load_2(0, empty); - } - } -}; - -void AnchorTable::load(Context *onfinish) -{ - dout(7) << "load" << endl; - init_inode(); - - assert(!opened); - - waiting_for_open.push_back(onfinish); - - C_AT_LoadSize *c = new C_AT_LoadSize(this, mds); - mds->filer->read(table_inode, - 0, sizeof(size_t), - &c->bl, - c); -} - -void AnchorTable::load_2(size_t size, bufferlist& bl) -{ - // num - int off = 0; - int num; - bl.copy(0, sizeof(num), (char*)&num); - off += sizeof(num); - - // parse anchors - for (int i=0; i_decode(bl, off); - dout(10) << "load_2 decoded " << std::hex << a->ino << " dirino " << a->dirino << std::dec << " ref_dn " << a->ref_dn << endl; - anchor_map[a->ino] = a; - } - - dout(7) << "load_2 got " << num << " anchors" << endl; - - opened = true; - opening = false; - - // finish - finish_contexts(waiting_for_open); -} - diff --git a/branches/marnberg/quota/mds/AnchorTable.h b/branches/marnberg/quota/mds/AnchorTable.h deleted file mode 100644 index 0b0af03af5b68..0000000000000 --- a/branches/marnberg/quota/mds/AnchorTable.h +++ /dev/null @@ -1,81 +0,0 @@ -// -*- mode:C++; 
tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __ANCHORTABLE_H -#define __ANCHORTABLE_H - -#include "Anchor.h" -#include "include/Context.h" - -#include -using namespace __gnu_cxx; - -class MDS; - - -class AnchorTable { - MDS *mds; - hash_map anchor_map; - - bool opening, opened; - list waiting_for_open; - - public: - inode_t table_inode; - - public: - AnchorTable(MDS *mds); - - protected: - void init_inode(); // call this before doing anything. - - // - bool have_ino(inodeno_t ino) { - return true; // always in memory for now. - } - void fetch_ino(inodeno_t ino, Context *onfinish) { - assert(!opened); - load(onfinish); - } - - // adjust table - bool add(inodeno_t ino, inodeno_t dirino, string& ref_dn); - void inc(inodeno_t ino); - void dec(inodeno_t ino); - - - // high level interface - void lookup(inodeno_t ino, vector& trace); - void create(inodeno_t ino, vector& trace); - void destroy(inodeno_t ino); - - // messages - public: - void dispatch(class Message *m); - protected: - void handle_anchor_request(class MAnchorRequest *m); - - - public: - - // load/save entire table for now! 
- void reset(); - void save(Context *onfinish); - void load(Context *onfinish); - void load_2(size_t size, bufferlist& bl); - - -}; - -#endif diff --git a/branches/marnberg/quota/mds/CDentry.cc b/branches/marnberg/quota/mds/CDentry.cc deleted file mode 100644 index 22d292a001e33..0000000000000 --- a/branches/marnberg/quota/mds/CDentry.cc +++ /dev/null @@ -1,203 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "CDentry.h" -#include "CInode.h" -#include "CDir.h" - -#include "MDS.h" -#include "MDCache.h" - -#include - -#undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << dir->cache->mds->get_nodeid() << ".cache.den(" << dir->ino() << " " << name << ") " - - -// CDentry - -ostream& operator<<(ostream& out, CDentry& dn) -{ - string path; - dn.make_path(path); - - out << "[dentry " << path; - if (dn.is_auth()) { - out << " auth"; - if (dn.is_replicated()) - out << dn.get_replicas(); - } else { - out << " rep@" << dn.authority(); - out << "." 
<< dn.get_replica_nonce(); - assert(dn.get_replica_nonce() >= 0); - } - - if (dn.is_null()) out << " NULL"; - if (dn.is_remote()) out << " REMOTE"; - - if (dn.is_pinned()) out << " " << dn.num_pins() << " pathpins"; - - if (dn.get_lockstate() == DN_LOCK_UNPINNING) out << " unpinning"; - if (dn.get_lockstate() == DN_LOCK_PREXLOCK) out << " prexlock=" << dn.get_xlockedby() << " g=" << dn.get_gather_set(); - if (dn.get_lockstate() == DN_LOCK_XLOCK) out << " xlock=" << dn.get_xlockedby(); - - out << " v=" << dn.get_version(); - out << " pv=" << dn.get_projected_version(); - - out << " inode=" << dn.get_inode(); - - if (dn.get_num_ref()) { - out << " |"; - for(set::iterator it = dn.get_ref_set().begin(); - it != dn.get_ref_set().end(); - it++) - out << " " << CDentry::pin_name(*it); - } - - out << " " << &dn; - out << "]"; - return out; -} - -CDentry::CDentry(const CDentry& m) { - assert(0); //std::cerr << "copy cons called, implement me" << endl; -} - - -inodeno_t CDentry::get_ino() -{ - if (inode) - return inode->ino(); - return inodeno_t(); -} - - -int CDentry::authority() -{ - return dir->dentry_authority( name ); -} - - -version_t CDentry::pre_dirty() -{ - // NOTE: in the future, this will dirty a particular slice/subset of the dir. - projected_version = dir->pre_dirty(); - dout(10) << " pre_dirty " << *this << endl; - return projected_version; -} - - -void CDentry::_mark_dirty() -{ - // state+pin - if (!state_test(STATE_DIRTY)) { - state_set(STATE_DIRTY); - get(PIN_DIRTY); - } -} - -void CDentry::mark_dirty(version_t pv) -{ - dout(10) << " mark_dirty " << *this << endl; - - // i now live in this new dir version - assert(pv == projected_version); - version = pv; - _mark_dirty(); - - // mark dir too - dir->mark_dirty(pv); -} - -void CDentry::mark_clean() { - dout(10) << " mark_clean " << *this << endl; - assert(is_dirty()); - assert(version <= dir->get_version()); - - // this happens on export. 
- //assert(version <= dir->get_last_committed_version()); - - // state+pin - state_clear(STATE_DIRTY); - put(PIN_DIRTY); -} - - -void CDentry::make_path(string& s) -{ - if (dir) { - if (dir->inode->get_parent_dn()) - dir->inode->get_parent_dn()->make_path(s); - } else { - s = "???"; - } - s += "/"; - s += name; -} - - -void CDentry::link_remote(CInode *in) -{ - assert(is_remote()); - assert(in->ino() == remote_ino); - - inode = in; - in->add_remote_parent(this); -} - -void CDentry::unlink_remote() -{ - assert(is_remote()); - assert(inode); - - inode->remove_remote_parent(this); - inode = 0; -} - - -CDentryDiscover *CDentry::replicate_to(int who) -{ - int nonce = add_replica(who); - return new CDentryDiscover(this, nonce); -} - - - - -// = -const CDentry& CDentry::operator= (const CDentry& right) { - assert(0); //std::cerr << "copy op called, implement me" << endl; - return *this; -} - - // comparisons - bool CDentry::operator== (const CDentry& right) const { - return name == right.name; - } - bool CDentry::operator!= (const CDentry& right) const { - return name == right.name; - } - bool CDentry::operator< (const CDentry& right) const { - return name < right.name; - } - bool CDentry::operator> (const CDentry& right) const { - return name > right.name; - } - bool CDentry::operator>= (const CDentry& right) const { - return name >= right.name; - } - bool CDentry::operator<= (const CDentry& right) const { - return name <= right.name; - } diff --git a/branches/marnberg/quota/mds/CDentry.h b/branches/marnberg/quota/mds/CDentry.h deleted file mode 100644 index 65b9155ce69f9..0000000000000 --- a/branches/marnberg/quota/mds/CDentry.h +++ /dev/null @@ -1,288 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as 
published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __CDENTRY_H -#define __CDENTRY_H - -#include -#include -#include -using namespace std; - -#include "include/types.h" -#include "include/buffer.h" -#include "include/lru.h" -#include "mdstypes.h" - -class CInode; -class CDir; - -#define DN_LOCK_SYNC 0 -#define DN_LOCK_PREXLOCK 1 -#define DN_LOCK_XLOCK 2 -#define DN_LOCK_UNPINNING 3 // waiting for pins to go away .. FIXME REVIEW THIS CODE .. - -#define DN_XLOCK_FOREIGN ((Message*)0x1) // not 0, not a valid pointer. - -class Message; -class CDentryDiscover; - -// dentry -class CDentry : public MDSCacheObject, public LRUObject { - public: - // state - static const int STATE_AUTH = (1<<0); - static const int STATE_DIRTY = (1<<1); - - // pins - static const int PIN_INODEPIN = 0; // linked inode is pinned - static const int PIN_REPLICATED = 1; // replicated by another MDS - static const int PIN_DIRTY = 2; // - static const int PIN_PROXY = 3; // - static const char *pin_name(int p) { - switch (p) { - case PIN_INODEPIN: return "inodepin"; - case PIN_REPLICATED: return "replicated"; - case PIN_DIRTY: return "dirty"; - case PIN_PROXY: return "proxy"; - default: assert(0); - } - }; - - - protected: - string name; - CInode *inode; - CDir *dir; - - inodeno_t remote_ino; // if remote dentry - - version_t version; // dir version when last touched. - version_t projected_version; // what it will be when i unlock/commit. 
- - // locking - int lockstate; - Message *xlockedby; - set gather_set; - - // path pins - int npins; - multiset pinset; - - friend class Migrator; - friend class Locker; - friend class Renamer; - friend class Server; - friend class MDCache; - friend class MDS; - friend class CInode; - friend class C_MDC_XlockRequest; - - public: - // cons - CDentry() : - inode(0), - dir(0), - remote_ino(0), - version(0), - projected_version(0), - lockstate(DN_LOCK_SYNC), - xlockedby(0), - npins(0) { } - CDentry(const string& n, inodeno_t ino, CInode *in=0) : - name(n), - inode(in), - dir(0), - remote_ino(ino), - version(0), - projected_version(0), - lockstate(DN_LOCK_SYNC), - xlockedby(0), - npins(0) { } - CDentry(const string& n, CInode *in) : - name(n), - inode(in), - dir(0), - remote_ino(0), - version(0), - projected_version(0), - lockstate(DN_LOCK_SYNC), - xlockedby(0), - npins(0) { } - - CInode *get_inode() { return inode; } - CDir *get_dir() { return dir; } - const string& get_name() { return name; } - inodeno_t get_ino(); - inodeno_t get_remote_ino() { return remote_ino; } - - void set_remote_ino(inodeno_t ino) { remote_ino = ino; } - - - // ref counts: pin ourselves in the LRU when we're pinned. - void first_get() { - lru_pin(); - } - void last_put() { - lru_unpin(); - } - - - // dentry type is primary || remote || null - // inode ptr is required for primary, optional for remote, undefined for null - bool is_primary() { return remote_ino == 0 && inode != 0; } - bool is_remote() { return remote_ino > 0; } - bool is_null() { return (remote_ino == 0 && inode == 0) ? 
true:false; } - - // remote links - void link_remote(CInode *in); - void unlink_remote(); - - - // copy cons - CDentry(const CDentry& m); - const CDentry& operator= (const CDentry& right); - - // comparisons - bool operator== (const CDentry& right) const; - bool operator!= (const CDentry& right) const; - bool operator< (const CDentry& right) const; - bool operator> (const CDentry& right) const; - bool operator>= (const CDentry& right) const; - bool operator<= (const CDentry& right) const; - - // misc - void make_path(string& p); - - // -- state - version_t get_version() { return version; } - void set_version(version_t v) { projected_version = version = v; } - version_t get_projected_version() { return projected_version; } - void set_projected_version(version_t v) { projected_version = v; } - - int authority(); - - bool is_auth() { return state & STATE_AUTH; } - bool is_dirty() { return state & STATE_DIRTY; } - bool is_clean() { return !is_dirty(); } - - version_t pre_dirty(); - void _mark_dirty(); - void mark_dirty(version_t projected_dirv); - void mark_clean(); - - - // -- replication - CDentryDiscover *replicate_to(int rep); - - - // -- locking - int get_lockstate() { return lockstate; } - set& get_gather_set() { return gather_set; } - - bool is_sync() { return lockstate == DN_LOCK_SYNC; } - bool can_read() { return (lockstate == DN_LOCK_SYNC) || (lockstate == DN_LOCK_UNPINNING); } - bool can_read(Message *m) { return is_xlockedbyme(m) || can_read(); } - bool is_xlocked() { return lockstate == DN_LOCK_XLOCK; } - Message* get_xlockedby() { return xlockedby; } - bool is_xlockedbyother(Message *m) { return (lockstate == DN_LOCK_XLOCK) && m != xlockedby; } - bool is_xlockedbyme(Message *m) { return (lockstate == DN_LOCK_XLOCK) && m == xlockedby; } - bool is_prexlockbyother(Message *m) { - return (lockstate == DN_LOCK_PREXLOCK) && m != xlockedby; - } - - int get_replica_lockstate() { - switch (lockstate) { - case DN_LOCK_XLOCK: - case DN_LOCK_SYNC: - return lockstate; 
- case DN_LOCK_PREXLOCK: - return DN_LOCK_XLOCK; - case DN_LOCK_UNPINNING: - return DN_LOCK_SYNC; - } - assert(0); - return 0; - } - void set_lockstate(int s) { lockstate = s; } - - // path pins - void pin(Message *m) { - npins++; - pinset.insert(m); - assert(pinset.size() == (unsigned)npins); - } - void unpin(Message *m) { - npins--; - assert(npins >= 0); - assert(pinset.count(m) > 0); - pinset.erase(pinset.find(m)); - assert(pinset.size() == (unsigned)npins); - } - bool is_pinnable(Message *m) { - return (lockstate == DN_LOCK_SYNC) || - (lockstate == DN_LOCK_UNPINNING && pinset.count(m)); - } - bool is_pinned() { return npins>0; } - int num_pins() { return npins; } - - friend class CDir; -}; - -ostream& operator<<(ostream& out, CDentry& dn); - - -class CDentryDiscover { - string dname; - int replica_nonce; - int lockstate; - - inodeno_t ino; - inodeno_t remote_ino; - -public: - CDentryDiscover() {} - CDentryDiscover(CDentry *dn, int nonce) : - dname(dn->get_name()), replica_nonce(nonce), - lockstate(dn->get_replica_lockstate()), - ino(dn->get_ino()), - remote_ino(dn->get_remote_ino()) { } - - string& get_dname() { return dname; } - int get_nonce() { return replica_nonce; } - - void update_dentry(CDentry *dn) { - dn->set_replica_nonce( replica_nonce ); - dn->set_lockstate( lockstate ); - } - - void _encode(bufferlist& bl) { - ::_encode(dname, bl); - bl.append((char*)&replica_nonce, sizeof(replica_nonce)); - bl.append((char*)&lockstate, sizeof(lockstate)); - } - - void _decode(bufferlist& bl, int& off) { - ::_decode(dname, bl, off); - bl.copy(off, sizeof(replica_nonce), (char*)&replica_nonce); - off += sizeof(replica_nonce); - bl.copy(off, sizeof(lockstate), (char*)&lockstate); - off += sizeof(lockstate); - } - -}; - - -#endif diff --git a/branches/marnberg/quota/mds/CDir.cc b/branches/marnberg/quota/mds/CDir.cc deleted file mode 100644 index c9b9996d91c2d..0000000000000 --- a/branches/marnberg/quota/mds/CDir.cc +++ /dev/null @@ -1,890 +0,0 @@ -// -*- mode:C++; 
tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "CDir.h" -#include "CDentry.h" -#include "CInode.h" - -#include "MDS.h" -#include "MDCache.h" -#include "MDSMap.h" - -#include "include/Context.h" -#include "common/Clock.h" - -#include - -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache.dir(" << inode->inode.ino << ") " - - -// PINS -//int cdir_pins[CDIR_NUM_PINS] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; - - - -ostream& operator<<(ostream& out, CDir& dir) -{ - string path; - dir.get_inode()->make_path(path); - out << "[dir " << dir.ino() << " " << path << "/"; - if (dir.is_auth()) { - out << " auth"; - if (dir.is_replicated()) - out << dir.get_replicas(); - - out << " v=" << dir.get_version(); - out << " pv=" << dir.get_projected_version(); - out << " cv=" << dir.get_committing_version(); - out << " lastcv=" << dir.get_last_committed_version(); - } else { - out << " rep@" << dir.authority(); - if (dir.get_replica_nonce() > 1) - out << "." 
<< dir.get_replica_nonce(); - } - - if (dir.get_dir_auth() != CDIR_AUTH_PARENT) - out << " dir_auth=" << dir.get_dir_auth(); - - out << " state=" << dir.get_state(); - if (dir.state_test(CDIR_STATE_PROXY)) out << "|proxy"; - if (dir.state_test(CDIR_STATE_COMPLETE)) out << "|complete"; - if (dir.state_test(CDIR_STATE_FREEZINGTREE)) out << "|freezingtree"; - if (dir.state_test(CDIR_STATE_FROZENTREE)) out << "|frozentree"; - if (dir.state_test(CDIR_STATE_FROZENTREELEAF)) out << "|frozentreeleaf"; - if (dir.state_test(CDIR_STATE_FROZENDIR)) out << "|frozendir"; - if (dir.state_test(CDIR_STATE_FREEZINGDIR)) out << "|freezingdir"; - - out << " sz=" << dir.get_nitems() << "+" << dir.get_nnull(); - - if (dir.get_num_ref()) { - out << " |"; - for(set::iterator it = dir.get_ref_set().begin(); - it != dir.get_ref_set().end(); - it++) - out << " " << CDir::pin_name(*it); - } - - out << " " << &dir; - return out << "]"; -} - - -// ------------------------------------------------------------------- -// CDir - -CDir::CDir(CInode *in, MDCache *mdcache, bool auth) -{ - inode = in; - this->cache = mdcache; - - nitems = 0; - nnull = 0; - state = CDIR_STATE_INITIAL; - - projected_version = version = 0; - committing_version = 0; - last_committed_version = 0; - - ref = 0; - - // auth - dir_auth = -1; - assert(in->is_dir()); - if (auth) - state |= CDIR_STATE_AUTH; - /* - if (in->dir_is_hashed()) { - assert(0); // when does this happen? 
- state |= CDIR_STATE_HASHED; - } - */ - - auth_pins = 0; - nested_auth_pins = 0; - request_pins = 0; - - dir_rep = CDIR_REP_NONE; -} - - - - -/*** - * linking fun - */ - -CDentry* CDir::add_dentry( const string& dname, inodeno_t ino, bool auth) -{ - // foreign - assert(lookup(dname) == 0); - - // create dentry - CDentry* dn = new CDentry(dname, ino); - if (auth) - dn->state_set(CDentry::STATE_AUTH); - cache->lru.lru_insert_mid(dn); - - dn->dir = this; - dn->version = projected_version; - - // add to dir - assert(items.count(dn->name) == 0); - assert(null_items.count(dn->name) == 0); - - items[dn->name] = dn; - nitems++; - - dout(12) << "add_dentry " << *dn << endl; - - // pin? - if (nnull + nitems == 1) get(PIN_CHILD); - - assert(nnull + nitems == items.size()); - assert(nnull == null_items.size()); - return dn; -} - - -CDentry* CDir::add_dentry( const string& dname, CInode *in, bool auth ) -{ - // primary - assert(lookup(dname) == 0); - - // create dentry - CDentry* dn = new CDentry(dname, in); - if (auth) - dn->state_set(CDentry::STATE_AUTH); - cache->lru.lru_insert_mid(dn); - - dn->dir = this; - dn->version = projected_version; - - // add to dir - assert(items.count(dn->name) == 0); - assert(null_items.count(dn->name) == 0); - - items[dn->name] = dn; - - if (in) { - link_inode_work( dn, in ); - } else { - assert(dn->inode == 0); - null_items[dn->name] = dn; - nnull++; - } - - dout(12) << "add_dentry " << *dn << endl; - - // pin? 
- if (nnull + nitems == 1) get(PIN_CHILD); - - assert(nnull + nitems == items.size()); - assert(nnull == null_items.size()); - return dn; -} - - - -void CDir::remove_dentry(CDentry *dn) -{ - dout(12) << "remove_dentry " << *dn << endl; - - if (dn->inode) { - // detach inode and dentry - unlink_inode_work(dn); - } else { - // remove from null list - assert(null_items.count(dn->name) == 1); - null_items.erase(dn->name); - nnull--; - } - - // remove from list - assert(items.count(dn->name) == 1); - items.erase(dn->name); - - cache->lru.lru_remove(dn); - delete dn; - - // unpin? - if (nnull + nitems == 0) put(PIN_CHILD); - - assert(nnull + nitems == items.size()); - assert(nnull == null_items.size()); -} - -void CDir::link_inode( CDentry *dn, inodeno_t ino) -{ - dout(12) << "link_inode " << *dn << " remote " << ino << endl; - - assert(dn->is_null()); - dn->set_remote_ino(ino); - nitems++; - - assert(null_items.count(dn->name) == 1); - null_items.erase(dn->name); - nnull--; -} - -void CDir::link_inode( CDentry *dn, CInode *in ) -{ - dout(12) << "link_inode " << *dn << " " << *in << endl; - assert(!dn->is_remote()); - - link_inode_work(dn,in); - - // remove from null list - assert(null_items.count(dn->name) == 1); - null_items.erase(dn->name); - nnull--; - - assert(nnull + nitems == items.size()); - assert(nnull == null_items.size()); -} - -void CDir::link_inode_work( CDentry *dn, CInode *in ) -{ - dn->inode = in; - in->set_primary_parent(dn); - - nitems++; // adjust dir size - - // set dir version - in->inode.version = dn->get_version(); - - // clear dangling - in->state_clear(CInode::STATE_DANGLING); - - // pin dentry? 
- if (in->get_num_ref()) - dn->get(CDentry::PIN_INODEPIN); - - // adjust auth pin count - if (in->auth_pins + in->nested_auth_pins) - adjust_nested_auth_pins( in->auth_pins + in->nested_auth_pins ); -} - -void CDir::unlink_inode( CDentry *dn ) -{ - dout(12) << "unlink_inode " << *dn << " " << *dn->inode << endl; - - unlink_inode_work(dn); - - // add to null list - assert(null_items.count(dn->name) == 0); - null_items[dn->name] = dn; - nnull++; - - assert(nnull + nitems == items.size()); - assert(nnull == null_items.size()); -} - -void CDir::unlink_inode_work( CDentry *dn ) -{ - CInode *in = dn->inode; - - if (dn->is_remote()) { - // remote - if (in) - dn->unlink_remote(); - - dn->set_remote_ino(0); - } else { - // primary - assert(dn->is_primary()); - - // explicitly define auth - in->dangling_auth = in->authority(); - //dout(10) << "unlink_inode " << *in << " dangling_auth now " << in->dangling_auth << endl; - - // unpin dentry? - if (in->get_num_ref()) - dn->put(CDentry::PIN_INODEPIN); - - // unlink auth_pin count - if (in->auth_pins + in->nested_auth_pins) - adjust_nested_auth_pins( 0 - (in->auth_pins + in->nested_auth_pins) ); - - // set dangling flag - in->state_set(CInode::STATE_DANGLING); - - // detach inode - in->remove_primary_parent(dn); - dn->inode = 0; - } - - nitems--; // adjust dir size -} - -void CDir::remove_null_dentries() { - dout(12) << "remove_null_dentries " << *this << endl; - - list dns; - for (CDir_map_t::iterator it = null_items.begin(); - it != null_items.end(); - it++) { - dns.push_back(it->second); - } - - for (list::iterator it = dns.begin(); - it != dns.end(); - it++) { - CDentry *dn = *it; - assert(dn->is_sync()); - remove_dentry(dn); - } - assert(null_items.empty()); - assert(nnull == 0); - assert(nnull + nitems == items.size()); -} - - - -/**************************************** - * WAITING - */ - -bool CDir::waiting_for(int tag) -{ - return waiting.count(tag) > 0; -} - -bool CDir::waiting_for(int tag, const string& dn) -{ - if 
(!waiting_on_dentry.count(dn)) - return false; - return waiting_on_dentry[dn].count(tag) > 0; -} - -void CDir::add_waiter(int tag, - const string& dentry, - Context *c) { - if (waiting.empty() && waiting_on_dentry.size() == 0) - get(PIN_WAITER); - waiting_on_dentry[ dentry ].insert(pair(tag,c)); - dout(10) << "add_waiter dentry " << dentry << " tag " << tag << " " << c << " on " << *this << endl; -} - -void CDir::add_waiter(int tag, Context *c) { - // hierarchical? - if (tag & CDIR_WAIT_ATFREEZEROOT && (is_freezing() || is_frozen())) { - if (is_freezing_tree_root() || is_frozen_tree_root() || - is_freezing_dir() || is_frozen_dir()) { - // it's us, pin here. (fall thru) - } else { - // pin parent! - dout(10) << "add_waiter " << tag << " " << c << " should be ATFREEZEROOT, " << *this << " is not root, trying parent" << endl; - inode->parent->dir->add_waiter(tag, c); - return; - } - } - - // this dir. - if (waiting.empty() && waiting_on_dentry.size() == 0) - get(PIN_WAITER); - waiting.insert(pair(tag,c)); - dout(10) << "add_waiter " << tag << " " << c << " on " << *this << endl; -} - - -void CDir::take_waiting(int mask, - const string& dentry, - list& ls, - int num) -{ - if (waiting_on_dentry.empty()) return; - - multimap::iterator it = waiting_on_dentry[dentry].begin(); - while (it != waiting_on_dentry[dentry].end()) { - if (it->first & mask) { - ls.push_back(it->second); - dout(10) << "take_waiting dentry " << dentry << " mask " << mask << " took " << it->second << " tag " << it->first << " on " << *this << endl; - waiting_on_dentry[dentry].erase(it++); - - if (num) { - if (num == 1) break; - num--; - } - } else { - dout(10) << "take_waiting dentry " << dentry << " mask " << mask << " SKIPPING " << it->second << " tag " << it->first << " on " << *this << endl; - it++; - } - } - - // did we clear dentry? - if (waiting_on_dentry[dentry].empty()) - waiting_on_dentry.erase(dentry); - - // ...whole map? 
- if (waiting_on_dentry.size() == 0 && waiting.empty()) - put(PIN_WAITER); -} - -/* NOTE: this checks dentry waiters too */ -void CDir::take_waiting(int mask, - list& ls) -{ - if (waiting_on_dentry.size()) { - // try each dentry - hash_map >::iterator it = - waiting_on_dentry.begin(); - while (it != waiting_on_dentry.end()) { - take_waiting(mask, (it++)->first, ls); // not post-inc - } - } - - // waiting - if (!waiting.empty()) { - multimap::iterator it = waiting.begin(); - while (it != waiting.end()) { - if (it->first & mask) { - ls.push_back(it->second); - dout(10) << "take_waiting mask " << mask << " took " << it->second << " tag " << it->first << " on " << *this << endl; - waiting.erase(it++); - } else { - dout(10) << "take_waiting mask " << mask << " SKIPPING " << it->second << " tag " << it->first << " on " << *this<< endl; - it++; - } - } - - if (waiting_on_dentry.size() == 0 && waiting.empty()) - put(PIN_WAITER); - } -} - - -void CDir::finish_waiting(int mask, int result) -{ - dout(11) << "finish_waiting mask " << mask << " result " << result << " on " << *this << endl; - - list finished; - take_waiting(mask, finished); - finish_contexts(finished, result); -} - -void CDir::finish_waiting(int mask, const string& dn, int result) -{ - dout(11) << "finish_waiting mask " << mask << " dn " << dn << " result " << result << " on " << *this << endl; - - list finished; - take_waiting(mask, dn, finished); - finish_contexts(finished, result); -} - - -// dirty/clean - -version_t CDir::pre_dirty() -{ - ++projected_version; - dout(10) << "pre_dirty " << projected_version << endl; - return projected_version; -} - -void CDir::_mark_dirty() -{ - if (!state_test(CDIR_STATE_DIRTY)) { - state_set(CDIR_STATE_DIRTY); - dout(10) << "mark_dirty (was clean) " << *this << " version " << version << endl; - get(PIN_DIRTY); - } else { - dout(10) << "mark_dirty (already dirty) " << *this << " version " << version << endl; - } -} - -void CDir::mark_dirty(version_t pv) -{ - ++version; - 
assert(pv == version); - _mark_dirty(); -} - -void CDir::mark_clean() -{ - dout(10) << "mark_clean " << *this << " version " << version << endl; - if (state_test(CDIR_STATE_DIRTY)) { - state_clear(CDIR_STATE_DIRTY); - put(PIN_DIRTY); - } -} - - - - -void CDir::first_get() -{ - inode->get(CInode::PIN_DIR); -} - -void CDir::last_put() -{ - inode->put(CInode::PIN_DIR); -} - - - -/******************************** - * AUTHORITY - */ - -/* - * simple rule: if dir_auth isn't explicit, auth is the same as the inode. - */ -int CDir::authority() -{ - if (dir_auth == CDIR_AUTH_PARENT) - return inode->authority(); - return dir_auth; -} - -int CDir::dentry_authority(const string& dn ) -{ - // hashing -- subset of nodes have hashed the contents - if (is_hashing() && !hashed_subset.empty()) { - int hashauth = cache->hash_dentry( inode->ino(), dn ); // hashed - if (hashed_subset.count(hashauth)) - return hashauth; - } - - // hashed - if (is_hashed()) { - return cache->hash_dentry( inode->ino(), dn ); // hashed - } - - if (get_dir_auth() == CDIR_AUTH_PARENT) { - //dout(15) << "dir_auth = parent at " << *this << endl; - return inode->authority(); // same as my inode - } - - // it's explicit for this whole dir - //dout(15) << "dir_auth explicit " << dir_auth << " at " << *this << endl; - return get_dir_auth(); -} - -void CDir::set_dir_auth(int d) -{ - dout(10) << "setting dir_auth=" << d << " from " << dir_auth << " on " << *this << endl; - dir_auth = d; -} - - -/***************************************** - * AUTH PINS - */ - -void CDir::auth_pin() { - if (auth_pins == 0) - get(PIN_AUTHPIN); - auth_pins++; - - dout(7) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl; - - inode->nested_auth_pins++; - if (inode->parent) - inode->parent->dir->adjust_nested_auth_pins( 1 ); -} - -void CDir::auth_unpin() { - auth_pins--; - if (auth_pins == 0) - put(PIN_AUTHPIN); - - dout(7) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << 
nested_auth_pins << endl; - assert(auth_pins >= 0); - - // pending freeze? - if (auth_pins + nested_auth_pins == 0) - on_freezeable(); - - inode->nested_auth_pins--; - if (inode->parent) - inode->parent->dir->adjust_nested_auth_pins( -1 ); -} - -void CDir::adjust_nested_auth_pins(int inc) -{ - CDir *dir = this; - - while (1) { - // dir - dir->nested_auth_pins += inc; - - dout(10) << "adjust_nested_auth_pins on " << *dir << " count now " << dir->auth_pins << " + " << dir->nested_auth_pins << endl; - assert(dir->nested_auth_pins >= 0); - - // pending freeze? - if (dir->auth_pins + dir->nested_auth_pins == 0) - dir->on_freezeable(); - - // it's inode - dir->inode->nested_auth_pins += inc; - - if (dir->inode->parent) - dir = dir->inode->parent->dir; - else - break; - } -} - - - -/***************************************************************************** - * FREEZING - */ - -void CDir::on_freezeable() -{ - // check for anything pending freezeable - - /* NOTE: the first of these will likely freeze the dir, and unmark - FREEZING. additional ones will re-flag FREEZING. this isn't - particularly graceful, and might cause problems if the first one - needs to know about other waiters.... FIXME? 
*/ - - finish_waiting(CDIR_WAIT_FREEZEABLE); -} - -// FREEZE TREE - -class C_MDS_FreezeTree : public Context { - CDir *dir; - Context *con; -public: - C_MDS_FreezeTree(CDir *dir, Context *c) { - this->dir = dir; - this->con = c; - } - virtual void finish(int r) { - dir->freeze_tree_finish(con); - } -}; - -void CDir::freeze_tree(Context *c) -{ - assert(!is_frozen()); - assert(!is_freezing()); - - if (is_freezeable()) { - dout(10) << "freeze_tree " << *this << endl; - - state_set(CDIR_STATE_FROZENTREE); - inode->auth_pin(); // auth_pin for duration of freeze - - // easy, we're frozen - c->finish(0); - delete c; - - } else { - state_set(CDIR_STATE_FREEZINGTREE); - dout(10) << "freeze_tree + wait " << *this << endl; - - // need to wait for auth pins to expire - add_waiter(CDIR_WAIT_FREEZEABLE, new C_MDS_FreezeTree(this, c)); - } -} - -void CDir::freeze_tree_finish(Context *c) -{ - // freezeable now? - if (!is_freezeable()) { - // wait again! - dout(10) << "freeze_tree_finish still waiting " << *this << endl; - state_set(CDIR_STATE_FREEZINGTREE); - add_waiter(CDIR_WAIT_FREEZEABLE, new C_MDS_FreezeTree(this, c)); - return; - } - - dout(10) << "freeze_tree_finish " << *this << endl; - state_set(CDIR_STATE_FROZENTREE); - state_clear(CDIR_STATE_FREEZINGTREE); // actually, this may get set again by next context? - - inode->auth_pin(); // auth_pin for duration of freeze - - // continue to frozen land - if (c) { - c->finish(0); - delete c; - } -} - -void CDir::unfreeze_tree() -{ - dout(10) << "unfreeze_tree " << *this << endl; - - if (state_test(CDIR_STATE_FROZENTREE)) { - // frozen. unfreeze. - state_clear(CDIR_STATE_FROZENTREE); - - // unpin (may => FREEZEABLE) FIXME: is this order good? - inode->auth_unpin(); - - // waiters? - finish_waiting(CDIR_WAIT_UNFREEZE); - } else { - // freezing. stop it. 
- assert(state_test(CDIR_STATE_FREEZINGTREE)); - state_clear(CDIR_STATE_FREEZINGTREE); - - // cancel freeze waiters - finish_waiting(CDIR_WAIT_FREEZEABLE, -1); - } -} - -bool CDir::is_freezing_tree() -{ - CDir *dir = this; - while (1) { - if (dir->is_freezing_tree_root()) return true; - if (dir->is_import()) return false; - if (dir->is_hashed()) return false; - if (dir->inode->parent) - dir = dir->inode->parent->dir; - else - return false; // root on replica - } -} - -bool CDir::is_frozen_tree() -{ - CDir *dir = this; - while (1) { - if (dir->is_frozen_tree_root()) return true; - if (dir->is_import()) return false; - if (dir->is_hashed()) return false; - if (dir->is_frozen_tree_leaf()) return false; - if (dir->inode->parent) - dir = dir->inode->parent->dir; - else - return false; // root on replica - } -} - - - -// FREEZE DIR - -class C_MDS_FreezeDir : public Context { - CDir *dir; - Context *con; -public: - C_MDS_FreezeDir(CDir *dir, Context *c) { - this->dir = dir; - this->con = c; - } - virtual void finish(int r) { - dir->freeze_dir_finish(con); - } -}; - -void CDir::freeze_dir(Context *c) -{ - assert(!is_frozen()); - assert(!is_freezing()); - - if (is_freezeable_dir()) { - dout(10) << "freeze_dir " << *this << endl; - - state_set(CDIR_STATE_FROZENDIR); - inode->auth_pin(); // auth_pin for duration of freeze - - // easy, we're frozen - c->finish(0); - delete c; - - } else { - state_set(CDIR_STATE_FREEZINGDIR); - dout(10) << "freeze_dir + wait " << *this << endl; - - // need to wait for auth pins to expire - add_waiter(CDIR_WAIT_FREEZEABLE, new C_MDS_FreezeDir(this, c)); - } -} - -void CDir::freeze_dir_finish(Context *c) -{ - // freezeable now? - if (!is_freezeable_dir()) { - // wait again! 
- dout(10) << "freeze_dir_finish still waiting " << *this << endl; - state_set(CDIR_STATE_FREEZINGDIR); - add_waiter(CDIR_WAIT_FREEZEABLE, new C_MDS_FreezeDir(this, c)); - return; - } - - dout(10) << "freeze_dir_finish " << *this << endl; - state_set(CDIR_STATE_FROZENDIR); - state_clear(CDIR_STATE_FREEZINGDIR); // actually, this may get set again by next context? - - inode->auth_pin(); // auth_pin for duration of freeze - - // continue to frozen land - if (c) { - c->finish(0); - delete c; - } -} - -void CDir::unfreeze_dir() -{ - dout(10) << "unfreeze_dir " << *this << endl; - state_clear(CDIR_STATE_FROZENDIR); - - // unpin (may => FREEZEABLE) FIXME: is this order good? - inode->auth_unpin(); - - // waiters? - finish_waiting(CDIR_WAIT_UNFREEZE); -} - - - - - - - - - -// ----------------------------------------------------------------- -// debug shite - - -void CDir::dump(int depth) { - string ind(depth, '\t'); - - dout(10) << "dump:" << ind << *this << endl; - - map::iterator iter = items.begin(); - while (iter != items.end()) { - CDentry* d = iter->second; - if (d->inode) { - char isdir = ' '; - if (d->inode->dir != NULL) isdir = '/'; - dout(10) << "dump: " << ind << *d << " = " << *d->inode << endl; - d->inode->dump(depth+1); - } else { - dout(10) << "dump: " << ind << *d << " = [null]" << endl; - } - iter++; - } - - if (!(state_test(CDIR_STATE_COMPLETE))) - dout(10) << ind << "..." 
<< endl; - if (state_test(CDIR_STATE_DIRTY)) - dout(10) << ind << "[dirty]" << endl; - -} - diff --git a/branches/marnberg/quota/mds/CDir.h b/branches/marnberg/quota/mds/CDir.h deleted file mode 100644 index 6283bef7c0aff..0000000000000 --- a/branches/marnberg/quota/mds/CDir.h +++ /dev/null @@ -1,617 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __CDIR_H -#define __CDIR_H - -#include "include/types.h" -#include "include/buffer.h" -#include "mdstypes.h" -#include "config.h" -#include "common/DecayCounter.h" - -#include -#include - -#include -#include -#include -#include -using namespace std; - -#include -using __gnu_cxx::hash_map; - - -#include "CInode.h" - -class CDentry; -class MDCache; -class MDCluster; -class Context; - - -// directory authority types -// >= 0 is the auth mds -#define CDIR_AUTH_PARENT -1 // default -#define CDIR_AUTH_UNKNOWN -2 - - -#define CDIR_NONCE_EXPORT 1 - - -// state bits -#define CDIR_STATE_AUTH (1<<0) // auth for this dir (hashing doesn't count) -#define CDIR_STATE_PROXY (1<<1) // proxy auth - -#define CDIR_STATE_COMPLETE (1<<2) // the complete contents are in cache -#define CDIR_STATE_DIRTY (1<<3) // has been modified since last commit - -#define CDIR_STATE_FROZENTREE (1<<4) // root of tree (bounded by exports) -#define CDIR_STATE_FREEZINGTREE (1<<5) // in process of freezing -#define CDIR_STATE_FROZENTREELEAF (1<<6) // outer bound of frozen region (on import) -#define CDIR_STATE_FROZENDIR (1<<7) -#define CDIR_STATE_FREEZINGDIR (1<<8) - -#define CDIR_STATE_COMMITTING (1<<9) // mid-commit -#define CDIR_STATE_FETCHING (1<<10) // currenting fetching - -#define CDIR_STATE_DELETED 
(1<<11) - -#define CDIR_STATE_IMPORT (1<<12) // flag set if this is an import. -#define CDIR_STATE_EXPORT (1<<13) -#define CDIR_STATE_IMPORTINGEXPORT (1<<14) - -#define CDIR_STATE_HASHED (1<<15) // if hashed -#define CDIR_STATE_HASHING (1<<16) -#define CDIR_STATE_UNHASHING (1<<17) - - - - - -// these state bits are preserved by an import/export -// ...except if the directory is hashed, in which case none of them are! -#define CDIR_MASK_STATE_EXPORTED (CDIR_STATE_COMPLETE\ - |CDIR_STATE_DIRTY) -#define CDIR_MASK_STATE_IMPORT_KEPT (CDIR_STATE_IMPORT\ - |CDIR_STATE_EXPORT\ - |CDIR_STATE_IMPORTINGEXPORT\ - |CDIR_STATE_FROZENTREE\ - |CDIR_STATE_PROXY) - -#define CDIR_MASK_STATE_EXPORT_KEPT (CDIR_STATE_HASHED\ - |CDIR_STATE_FROZENTREE\ - |CDIR_STATE_FROZENDIR\ - |CDIR_STATE_EXPORT\ - |CDIR_STATE_PROXY) - -// common states -#define CDIR_STATE_CLEAN 0 -#define CDIR_STATE_INITIAL 0 - -// directory replication -#define CDIR_REP_ALL 1 -#define CDIR_REP_NONE 0 -#define CDIR_REP_LIST 2 - - - - - - -// wait reasons -#define CDIR_WAIT_DENTRY 1 // wait for item to be in cache - // waiters: path_traverse - // trigger: handle_discover, fetch_dir_2 -#define CDIR_WAIT_COMPLETE 2 // wait for complete dir contents - // waiters: fetch_dir, commit_dir - // trigger: fetch_dir_2 -#define CDIR_WAIT_FREEZEABLE 4 // hard_pins removed - // waiters: freeze, freeze_finish - // trigger: auth_unpin, adjust_nested_auth_pins -#define CDIR_WAIT_UNFREEZE 8 // unfreeze - // waiters: path_traverse, handle_discover, handle_inode_update, - // export_dir_frozen (mdcache) - // handle_client_readdir (mds) - // trigger: unfreeze -#define CDIR_WAIT_AUTHPINNABLE CDIR_WAIT_UNFREEZE - // waiters: commit_dir (mdstore) - // trigger: (see CDIR_WAIT_UNFREEZE) -#define CDIR_WAIT_COMMITTED 32 // did commit (who uses this?**) - // waiters: commit_dir (if already committing) - // trigger: commit_dir_2 -#define CDIR_WAIT_IMPORTED 64 // import finish - // waiters: import_dir_block - // triggers: handle_export_dir_finish - 
-#define CDIR_WAIT_EXPORTWARNING 8192 // on bystander. - // watiers: handle_export_dir_notify - // triggers: handle_export_dir_warning -#define CDIR_WAIT_EXPORTPREPACK 16384 - // waiter export_dir - // trigger handel_export_dir_prep_ack - -#define CDIR_WAIT_HASHED (1<<17) // hash finish -#define CDIR_WAIT_THISHASHEDREADDIR (1<<18) // current readdir lock -#define CDIR_WAIT_NEXTHASHEDREADDIR (1<<19) // after current readdir lock finishes - -#define CDIR_WAIT_DNREAD (1<<20) -#define CDIR_WAIT_DNLOCK (1<<21) -#define CDIR_WAIT_DNUNPINNED (1<<22) -#define CDIR_WAIT_DNPINNABLE (CDIR_WAIT_DNREAD|CDIR_WAIT_DNUNPINNED) - -#define CDIR_WAIT_DNREQXLOCK (1<<23) - -#define CDIR_WAIT_ANY (0xffffffff) - -#define CDIR_WAIT_ATFREEZEROOT (CDIR_WAIT_AUTHPINNABLE|\ - CDIR_WAIT_UNFREEZE) // hmm, same same - - -ostream& operator<<(ostream& out, class CDir& dir); - - -// CDir -typedef map CDir_map_t; - - -//extern int cdir_pins[CDIR_NUM_PINS]; - - -class CDir : public MDSCacheObject { - public: - // -- pins -- - static const int PIN_CHILD = 0; - static const int PIN_OPENED = 1; // open by another node - static const int PIN_WAITER = 2; // waiter(s) - static const int PIN_IMPORT = 3; - static const int PIN_EXPORT = 4; - //static const int PIN_FREEZE = 5; - static const int PIN_FREEZELEAF = 6; - static const int PIN_PROXY = 7; // auth just changed. 
- static const int PIN_AUTHPIN = 8; - static const int PIN_IMPORTING = 9; - static const int PIN_IMPORTINGEXPORT = 10; - static const int PIN_HASHED = 11; - static const int PIN_HASHING = 12; - static const int PIN_DIRTY = 13; - static const int PIN_REQUEST = 14; - static const char *pin_name(int p) { - switch (p) { - case PIN_CHILD: return "child"; - case PIN_OPENED: return "opened"; - case PIN_WAITER: return "waiter"; - case PIN_IMPORT: return "import"; - case PIN_EXPORT: return "export"; - //case PIN_FREEZE: return "freeze"; - case PIN_FREEZELEAF: return "freezeleaf"; - case PIN_PROXY: return "proxy"; - case PIN_AUTHPIN: return "authpin"; - case PIN_IMPORTING: return "importing"; - case PIN_IMPORTINGEXPORT: return "importingexport"; - case PIN_HASHED: return "hashed"; - case PIN_HASHING: return "hashing"; - case PIN_DIRTY: return "dirty"; - case PIN_REQUEST: return "request"; - default: assert(0); - } - } - - - public: - // context - MDCache *cache; - - // my inode - CInode *inode; - - protected: - // contents - CDir_map_t items; // non-null AND null - CDir_map_t null_items; // null and foreign - size_t nitems; // non-null - size_t nnull; // null - - // state - version_t version; - version_t committing_version; - version_t last_committed_version; // slight lie; we bump this on import. 
- version_t projected_version; - - // authority, replicas - int dir_auth; - - // lock nesting, freeze - int auth_pins; - int nested_auth_pins; - int request_pins; - - // hashed dirs - set hashed_subset; // HASHING: subset of mds's that are hashed - public: - // for class MDS - map, list > > hashed_readdir; - protected: - - - - // waiters - multimap waiting; // tag -> context - hash_map< string, multimap > - waiting_on_dentry; - - // cache control (defined for authority; hints for replicas) - int dir_rep; - set dir_rep_by; // if dir_rep == CDIR_REP_LIST - - // popularity - meta_load_t popularity[MDS_NPOP]; - - // friends - friend class Migrator; - friend class CInode; - friend class MDCache; - friend class MDiscover; - friend class MDBalancer; - - friend class CDirDiscover; - friend class CDirExport; - - public: - CDir(CInode *in, MDCache *mdcache, bool auth); - - - - // -- accessors -- - inodeno_t ino() { return inode->ino(); } - CInode *get_inode() { return inode; } - CDir *get_parent_dir() { return inode->get_parent_dir(); } - - CDir_map_t::iterator begin() { return items.begin(); } - CDir_map_t::iterator end() { return items.end(); } - size_t get_size() { - return nitems; - } - size_t get_nitems() { return nitems; } - size_t get_nnull() { return nnull; } - - /* - float get_popularity() { - return popularity[0].get(); - } - */ - - - // -- dentries and inodes -- - public: - CDentry* lookup(const string& n) { - map::iterator iter = items.find(n); - if (iter == items.end()) - return 0; - else - return iter->second; - } - - CDentry* add_dentry( const string& dname, CInode *in=0, bool auth=true ); - CDentry* add_dentry( const string& dname, inodeno_t ino, bool auth=true ); - void remove_dentry( CDentry *dn ); // delete dentry - void link_inode( CDentry *dn, inodeno_t ino ); - void link_inode( CDentry *dn, CInode *in ); - void unlink_inode( CDentry *dn ); - private: - void link_inode_work( CDentry *dn, CInode *in ); - void unlink_inode_work( CDentry *dn ); - - void 
remove_null_dentries(); // on empty, clean dir - - // -- authority -- - public: - int authority(); - int dentry_authority(const string& d); - int get_dir_auth() { return dir_auth; } - void set_dir_auth(int d); - - - - // for giving to clients - void get_dist_spec(set& ls, int auth) { - if (( popularity[MDS_POP_CURDOM].pop[META_POP_IRD].get() > g_conf.mds_bal_replicate_threshold)) { - //if (!cached_by.empty() && inode.ino > 1) dout(1) << "distributed spec for " << *this << endl; - for (map::iterator p = replicas_begin(); - p != replicas_end(); - ++p) - ls.insert(p->first); - if (!ls.empty()) - ls.insert(auth); - } - } - - - // -- state -- - bool is_complete() { return state & CDIR_STATE_COMPLETE; } - bool is_dirty() { return state_test(CDIR_STATE_DIRTY); } - - bool is_auth() { return state & CDIR_STATE_AUTH; } - bool is_proxy() { return state & CDIR_STATE_PROXY; } - bool is_import() { return state & CDIR_STATE_IMPORT; } - bool is_export() { return state & CDIR_STATE_EXPORT; } - - bool is_hashed() { return state & CDIR_STATE_HASHED; } - bool is_hashing() { return state & CDIR_STATE_HASHING; } - bool is_unhashing() { return state & CDIR_STATE_UNHASHING; } - - bool is_rep() { - if (dir_rep == CDIR_REP_NONE) return false; - return true; - } - - - - // -- dirtyness -- - version_t get_version() { return version; } - void set_version(version_t v) { projected_version = version = v; } - version_t get_projected_version() { return projected_version; } - - version_t get_committing_version() { return committing_version; } - version_t get_last_committed_version() { return last_committed_version; } - // as in, we're committing the current version. 
- void set_committing_version() { committing_version = version; } - void set_last_committed_version(version_t v) { last_committed_version = v; } - - version_t pre_dirty(); - void _mark_dirty(); - void mark_dirty(version_t pv); - void mark_clean(); - void mark_complete() { state_set(CDIR_STATE_COMPLETE); } - bool is_clean() { return !state_test(CDIR_STATE_DIRTY); } - - - - - // -- reference counting -- - void first_get(); - void last_put(); - - void request_pin_get() { - if (request_pins == 0) get(PIN_REQUEST); - request_pins++; - } - void request_pin_put() { - request_pins--; - if (request_pins == 0) put(PIN_REQUEST); - } - - - // -- waiters -- - bool waiting_for(int tag); - bool waiting_for(int tag, const string& dn); - void add_waiter(int tag, Context *c); - void add_waiter(int tag, - const string& dentry, - Context *c); - void take_waiting(int mask, list& ls); // includes dentry waiters - void take_waiting(int mask, - const string& dentry, - list& ls, - int num=0); - void finish_waiting(int mask, int result = 0); // ditto - void finish_waiting(int mask, const string& dn, int result = 0); // ditto - - - // -- auth pins -- - bool can_auth_pin() { return !(is_frozen() || is_freezing()); } - int is_auth_pinned() { return auth_pins; } - void auth_pin(); - void auth_unpin(); - void adjust_nested_auth_pins(int inc); - void on_freezeable(); - - // -- freezing -- - void freeze_tree(Context *c); - void freeze_tree_finish(Context *c); - void unfreeze_tree(); - - void freeze_dir(Context *c); - void freeze_dir_finish(Context *c); - void unfreeze_dir(); - - bool is_freezing() { return is_freezing_tree() || is_freezing_dir(); } - bool is_freezing_tree(); - bool is_freezing_tree_root() { return state & CDIR_STATE_FREEZINGTREE; } - bool is_freezing_dir() { return state & CDIR_STATE_FREEZINGDIR; } - - bool is_frozen() { return is_frozen_dir() || is_frozen_tree(); } - bool is_frozen_tree(); - bool is_frozen_tree_root() { return state & CDIR_STATE_FROZENTREE; } - bool 
is_frozen_tree_leaf() { return state & CDIR_STATE_FROZENTREELEAF; } - bool is_frozen_dir() { return state & CDIR_STATE_FROZENDIR; } - - bool is_freezeable() { - if (auth_pins == 0 && nested_auth_pins == 0) return true; - return false; - } - bool is_freezeable_dir() { - if (auth_pins == 0) return true; - return false; - } - - - - // debuggin bs - void dump(int d = 0); -}; - - - -// -- encoded state -- - -// discover - -class CDirDiscover { - inodeno_t ino; - int nonce; - int dir_auth; - int dir_rep; - set rep_by; - - public: - CDirDiscover() {} - CDirDiscover(CDir *dir, int nonce) { - ino = dir->ino(); - this->nonce = nonce; - dir_auth = dir->dir_auth; - dir_rep = dir->dir_rep; - rep_by = dir->dir_rep_by; - } - - void update_dir(CDir *dir) { - assert(dir->ino() == ino); - assert(!dir->is_auth()); - - dir->replica_nonce = nonce; - dir->dir_auth = dir_auth; - dir->dir_rep = dir_rep; - dir->dir_rep_by = rep_by; - } - - inodeno_t get_ino() { return ino; } - - - void _encode(bufferlist& bl) { - bl.append((char*)&ino, sizeof(ino)); - bl.append((char*)&nonce, sizeof(nonce)); - bl.append((char*)&dir_auth, sizeof(dir_auth)); - bl.append((char*)&dir_rep, sizeof(dir_rep)); - ::_encode(rep_by, bl); - } - - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - bl.copy(off, sizeof(nonce), (char*)&nonce); - off += sizeof(nonce); - bl.copy(off, sizeof(dir_auth), (char*)&dir_auth); - off += sizeof(dir_auth); - bl.copy(off, sizeof(dir_rep), (char*)&dir_rep); - off += sizeof(dir_rep); - ::_decode(rep_by, bl, off); - } - -}; - - -// export - -class CDirExport { - struct { - inodeno_t ino; - long nitems; // actual real entries - long nden; // num dentries (including null ones) - version_t version; - unsigned state; - meta_load_t popularity_justme; - meta_load_t popularity_curdom; - int dir_rep; - } st; - map replicas; - set rep_by; - - public: - CDirExport() {} - CDirExport(CDir *dir) { - memset(&st, 0, sizeof(st)); - - 
assert(dir->get_version() == dir->get_projected_version()); - - st.ino = dir->ino(); - st.nitems = dir->nitems; - st.nden = dir->items.size(); - st.version = dir->version; - st.state = dir->state; - st.dir_rep = dir->dir_rep; - - st.popularity_justme.take( dir->popularity[MDS_POP_JUSTME] ); - st.popularity_curdom.take( dir->popularity[MDS_POP_CURDOM] ); - dir->popularity[MDS_POP_ANYDOM] -= st.popularity_curdom; - dir->popularity[MDS_POP_NESTED] -= st.popularity_curdom; - - rep_by = dir->dir_rep_by; - replicas = dir->replicas; - } - - inodeno_t get_ino() { return st.ino; } - __uint64_t get_nden() { return st.nden; } - - void update_dir(CDir *dir) { - assert(dir->ino() == st.ino); - - //dir->nitems = st.nitems; - - // set last_committed_version at old version - dir->committing_version = dir->last_committed_version = st.version; - dir->projected_version = dir->version = st.version; // this is bumped, below, if dirty - - // twiddle state - if (dir->state & CDIR_STATE_HASHED) - dir->state_set( CDIR_STATE_AUTH ); // just inherit auth flag when hashed - else - dir->state = (dir->state & CDIR_MASK_STATE_IMPORT_KEPT) | // remember import flag, etc. 
- (st.state & CDIR_MASK_STATE_EXPORTED); - dir->dir_rep = st.dir_rep; - - dir->popularity[MDS_POP_JUSTME] += st.popularity_justme; - dir->popularity[MDS_POP_CURDOM] += st.popularity_curdom; - dir->popularity[MDS_POP_ANYDOM] += st.popularity_curdom; - dir->popularity[MDS_POP_NESTED] += st.popularity_curdom; - - dir->replica_nonce = 0; // no longer defined - - if (!dir->replicas.empty()) - dout(0) << "replicas not empty non import, " << *dir << ", " << dir->replicas << endl; - - dir->dir_rep_by = rep_by; - dir->replicas = replicas; - dout(12) << "replicas in export is " << replicas << ", dir now " << dir->replicas << endl; - if (!replicas.empty()) - dir->get(CDir::PIN_OPENED); - if (dir->is_dirty()) { - dir->get(CDir::PIN_DIRTY); - - // bump dir version + 1 if dirty - dir->projected_version = dir->version = st.version + 1; - } - } - - - void _encode(bufferlist& bl) { - bl.append((char*)&st, sizeof(st)); - ::_encode(replicas, bl); - ::_encode(rep_by, bl); - } - - int _decode(bufferlist& bl, int off = 0) { - bl.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - ::_decode(replicas, bl, off); - ::_decode(rep_by, bl, off); - return off; - } - -}; - - - -#endif diff --git a/branches/marnberg/quota/mds/CInode.cc b/branches/marnberg/quota/mds/CInode.cc deleted file mode 100644 index f431184fb199b..0000000000000 --- a/branches/marnberg/quota/mds/CInode.cc +++ /dev/null @@ -1,506 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "CInode.h" -#include "CDir.h" -#include "CDentry.h" - -#include "MDS.h" -#include "MDCache.h" -#include "AnchorTable.h" - -#include "common/Clock.h" - -#include - -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mdcache->mds->get_nodeid() << ".cache.ino(" << inode.ino << ") " - - -//int cinode_pins[CINODE_NUM_PINS]; // counts - - -ostream& operator<<(ostream& out, CInode& in) -{ - string path; - in.make_path(path); - out << "[inode " << in.inode.ino << " " << path << (in.is_dir() ? "/ ":" "); - if (in.is_auth()) { - out << "auth"; - if (in.is_replicated()) - out << in.get_replicas(); - } else { - out << "rep@" << in.authority(); - out << "." << in.get_replica_nonce(); - assert(in.get_replica_nonce() >= 0); - } - - if (in.is_symlink()) out << " symlink"; - - out << " v" << in.get_version(); - - out << " hard=" << in.hardlock; - out << " file=" << in.filelock; - - if (in.get_num_ref()) { - out << " |"; - for(set::iterator it = in.get_ref_set().begin(); - it != in.get_ref_set().end(); - it++) - out << " " << CInode::pin_name(*it); - } - - // hack: spit out crap on which clients have caps - if (!in.get_client_caps().empty()) { - out << " caps={"; - for (map::iterator it = in.get_client_caps().begin(); - it != in.get_client_caps().end(); - it++) { - if (it != in.get_client_caps().begin()) out << ","; - out << it->first; - } - out << "}"; - } - out << " " << ∈ - out << "]"; - return out; -} - - -// ====== CInode ======= -CInode::CInode(MDCache *c, bool auth) { - mdcache = c; - - ref = 0; - - num_parents = 0; - parent = NULL; - - dir = NULL; // CDir opened separately - - auth_pins = 0; - nested_auth_pins = 0; - num_request_pins = 0; - - state = 0; - - if (auth) state_set(STATE_AUTH); -} - -CInode::~CInode() { - if (dir) { delete dir; dir = 0; } -} - - -// pins - -void CInode::first_get() -{ - // pin my dentry? 
- if (parent) - parent->get(CDentry::PIN_INODEPIN); -} - -void CInode::last_put() -{ - // unpin my dentry? - if (parent) { - parent->put(CDentry::PIN_INODEPIN); - } - if (num_parents == 0 && get_num_ref() == 0) - mdcache->inode_expire_queue.push_back(this); // queue myself for garbage collection -} - -void CInode::get_parent() -{ - num_parents++; -} -void CInode::put_parent() -{ - num_parents--; - if (num_parents == 0 && get_num_ref() == 0) - mdcache->inode_expire_queue.push_back(this); // queue myself for garbage collection -} - - - -CDir *CInode::get_parent_dir() -{ - if (parent) - return parent->dir; - return NULL; -} -CInode *CInode::get_parent_inode() -{ - if (parent) - return parent->dir->inode; - return NULL; -} - -bool CInode::dir_is_auth() { - if (dir) - return dir->is_auth(); - else - return is_auth(); -} - -CDir *CInode::get_or_open_dir(MDCache *mdcache) -{ - assert(is_dir()); - - if (dir) return dir; - - // can't open a dir if we're frozen_dir, bc of hashing stuff. - assert(!is_frozen_dir()); - - // only auth can open dir alone. 
- assert(is_auth()); - set_dir( new CDir(this, mdcache, true) ); - dir->dir_auth = -1; - return dir; -} - -CDir *CInode::set_dir(CDir *newdir) -{ - assert(dir == 0); - dir = newdir; - return dir; -} - -void CInode::close_dir() -{ - assert(dir); - assert(dir->get_num_ref() == 0); - delete dir; - dir = 0; -} - - -void CInode::set_auth(bool a) -{ - if (!is_dangling() && !is_root() && - is_auth() != a) { - } - - if (a) state_set(STATE_AUTH); - else state_clear(STATE_AUTH); -} - - - -void CInode::make_path(string& s) -{ - if (parent) { - parent->make_path(s); - } - else if (is_root()) { - s = ""; // root - } - else { - s = "(dangling)"; // dangling - } -} - -void CInode::make_anchor_trace(vector& trace) -{ - if (parent) { - parent->dir->inode->make_anchor_trace(trace); - - dout(7) << "make_anchor_trace adding " << ino() << " dirino " << parent->dir->inode->ino() << " dn " << parent->name << endl; - trace.push_back( new Anchor(ino(), - parent->dir->inode->ino(), - parent->name) ); - } - else if (state_test(STATE_DANGLING)) { - dout(7) << "make_anchor_trace dangling " << ino() << " on mds " << dangling_auth << endl; - string ref_dn; - trace.push_back( new Anchor(ino(), - MDS_INO_INODEFILE_OFFSET+dangling_auth, - ref_dn) ); - } - else - assert(is_root()); -} - - - - -version_t CInode::pre_dirty() -{ - assert(parent); - return parent->pre_dirty(); -} - -void CInode::_mark_dirty() -{ - if (!state_test(STATE_DIRTY)) { - state_set(STATE_DIRTY); - get(PIN_DIRTY); - } -} - -void CInode::mark_dirty(version_t pv) { - - dout(10) << "mark_dirty " << *this << endl; - - assert(parent); - - /* - NOTE: I may already be dirty, but this fn _still_ needs to be called so that - the directory is (perhaps newly) dirtied, and so that parent_dir_version is - updated below. - */ - - // only auth can get dirty. "dirty" async data in replicas is relative to - // filelock state, not the dirty flag. 
- assert(is_auth()); - - // touch my private version - assert(inode.version < pv); - inode.version = pv; - _mark_dirty(); - - // mark dentry too - parent->mark_dirty(pv); -} - -void CInode::mark_clean() -{ - dout(10) << " mark_clean " << *this << endl; - if (state_test(STATE_DIRTY)) { - state_clear(STATE_DIRTY); - put(PIN_DIRTY); - } -} - -// state - - - - - -// new state encoders - -void CInode::encode_file_state(bufferlist& bl) -{ - bl.append((char*)&inode.size, sizeof(inode.size)); - bl.append((char*)&inode.mtime, sizeof(inode.mtime)); - bl.append((char*)&inode.atime, sizeof(inode.atime)); // ?? -} - -void CInode::decode_file_state(bufferlist& r, int& off) -{ - r.copy(off, sizeof(inode.size), (char*)&inode.size); - off += sizeof(inode.size); - r.copy(off, sizeof(inode.mtime), (char*)&inode.mtime); - off += sizeof(inode.mtime); - r.copy(off, sizeof(inode.atime), (char*)&inode.atime); - off += sizeof(inode.atime); -} - -/* not used currently -void CInode::decode_merge_file_state(crope& r, int& off) -{ - __uint64_t size; - r.copy(off, sizeof(size), (char*)&size); - off += sizeof(size); - if (size > inode.size) inode.size = size; - - time_t t; - r.copy(off, sizeof(t), (char*)&t); - off += sizeof(t); - if (t > inode.mtime) inode.mtime = t; - - r.copy(off, sizeof(t), (char*)&t); - off += sizeof(t); - if (t > inode.atime) inode.atime = t; -} -*/ - -void CInode::encode_hard_state(bufferlist& r) -{ - r.append((char*)&inode.mode, sizeof(inode.mode)); - r.append((char*)&inode.uid, sizeof(inode.uid)); - r.append((char*)&inode.gid, sizeof(inode.gid)); - r.append((char*)&inode.ctime, sizeof(inode.ctime)); -} - -void CInode::decode_hard_state(bufferlist& r, int& off) -{ - r.copy(off, sizeof(inode.mode), (char*)&inode.mode); - off += sizeof(inode.mode); - r.copy(off, sizeof(inode.uid), (char*)&inode.uid); - off += sizeof(inode.uid); - r.copy(off, sizeof(inode.gid), (char*)&inode.gid); - off += sizeof(inode.gid); - r.copy(off, sizeof(inode.ctime), (char*)&inode.ctime); - off += 
sizeof(inode.ctime); -} - - - -// waiting - -bool CInode::is_frozen() -{ - if (parent && parent->dir->is_frozen()) - return true; - return false; -} - -bool CInode::is_frozen_dir() -{ - if (parent && parent->dir->is_frozen_dir()) - return true; - return false; -} - -bool CInode::is_freezing() -{ - if (parent && parent->dir->is_freezing()) - return true; - return false; -} - -bool CInode::waiting_for(int tag) -{ - return waiting.count(tag) > 0; -} - -void CInode::add_waiter(int tag, Context *c) { - // waiting on hierarchy? - if (tag & CDIR_WAIT_ATFREEZEROOT && (is_freezing() || is_frozen())) { - parent->dir->add_waiter(tag, c); - return; - } - - // this inode. - if (waiting.size() == 0) - get(PIN_WAITER); - waiting.insert(pair(tag,c)); - dout(10) << "add_waiter " << tag << " " << c << " on " << *this << endl; - -} - -void CInode::take_waiting(int mask, list& ls) -{ - if (waiting.empty()) return; - - multimap::iterator it = waiting.begin(); - while (it != waiting.end()) { - if (it->first & mask) { - ls.push_back(it->second); - dout(10) << "take_waiting mask " << mask << " took " << it->second << " tag " << it->first << " on " << *this << endl; - - waiting.erase(it++); - } else { - dout(10) << "take_waiting mask " << mask << " SKIPPING " << it->second << " tag " << it->first << " on " << *this << endl; - it++; - } - } - - if (waiting.empty()) - put(PIN_WAITER); -} - -void CInode::finish_waiting(int mask, int result) -{ - dout(11) << "finish_waiting mask " << mask << " result " << result << " on " << *this << endl; - - list finished; - take_waiting(mask, finished); - finish_contexts(finished, result); -} - - -// auth_pins -bool CInode::can_auth_pin() { - if (parent) - return parent->dir->can_auth_pin(); - return true; -} - -void CInode::auth_pin() { - if (auth_pins == 0) - get(PIN_AUTHPIN); - auth_pins++; - - dout(7) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl; - - if (parent) - parent->dir->adjust_nested_auth_pins( 1 
); -} - -void CInode::auth_unpin() { - auth_pins--; - if (auth_pins == 0) - put(PIN_AUTHPIN); - - dout(7) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl; - - assert(auth_pins >= 0); - - if (parent) - parent->dir->adjust_nested_auth_pins( -1 ); -} - - - -// authority - -int CInode::authority() { - if (is_dangling()) - return dangling_auth; // explicit - - if (is_root()) { // i am root - if (dir) - return dir->get_dir_auth(); // bit of a chicken/egg issue here! - else - return CDIR_AUTH_UNKNOWN; - } - - if (parent) - return parent->dir->dentry_authority( parent->name ); - - return -1; // undefined (inode must not be linked yet!) -} - - -CInodeDiscover* CInode::replicate_to( int rep ) -{ - assert(is_auth()); - - // relax locks? - if (!is_replicated()) - replicate_relax_locks(); - - // return the thinger - int nonce = add_replica( rep ); - return new CInodeDiscover( this, nonce ); -} - - -// debug crap ----------------------------- - -void CInode::dump(int dep) -{ - string ind(dep, '\t'); - //cout << ind << "[inode " << this << "]" << endl; - - if (dir) - dir->dump(dep); -} - diff --git a/branches/marnberg/quota/mds/CInode.h b/branches/marnberg/quota/mds/CInode.h deleted file mode 100644 index d2292196a5ebc..0000000000000 --- a/branches/marnberg/quota/mds/CInode.h +++ /dev/null @@ -1,655 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __CINODE_H -#define __CINODE_H - -#include "config.h" -#include "include/types.h" -#include "include/lru.h" - -#include "mdstypes.h" - -#include "CDentry.h" -#include "Lock.h" -#include "Capability.h" - - -#include -#include -#include -#include -#include -#include -using namespace std; - - -// wait reasons -#define CINODE_WAIT_AUTHPINNABLE CDIR_WAIT_UNFREEZE - // waiters: write_hard_start, read_file_start, write_file_start (mdcache) - // handle_client_chmod, handle_client_touch (mds) - // trigger: (see CDIR_WAIT_UNFREEZE) -#define CINODE_WAIT_GETREPLICA (1<<11) // update/replicate individual inode - // waiters: import_dentry_inode - // trigger: handle_inode_replicate_ack - -#define CINODE_WAIT_DIR (1<<13) - // waiters: traverse_path - // triggers: handle_disocver_reply - -#define CINODE_WAIT_LINK (1<<14) // as in remotely nlink++ -#define CINODE_WAIT_ANCHORED (1<<15) -#define CINODE_WAIT_UNLINK (1<<16) // as in remotely nlink-- - -#define CINODE_WAIT_HARDR (1<<17) // 131072 -#define CINODE_WAIT_HARDW (1<<18) // 262... -#define CINODE_WAIT_HARDB (1<<19) -#define CINODE_WAIT_HARDRWB (CINODE_WAIT_HARDR|CINODE_WAIT_HARDW|CINODE_WAIT_HARDB) -#define CINODE_WAIT_HARDSTABLE (1<<20) -#define CINODE_WAIT_HARDNORD (1<<21) -#define CINODE_WAIT_FILER (1<<22) -#define CINODE_WAIT_FILEW (1<<23) -#define CINODE_WAIT_FILEB (1<<24) -#define CINODE_WAIT_FILERWB (CINODE_WAIT_FILER|CINODE_WAIT_FILEW|CINODE_WAIT_FILEB) -#define CINODE_WAIT_FILESTABLE (1<<25) -#define CINODE_WAIT_FILENORD (1<<26) -#define CINODE_WAIT_FILENOWR (1<<27) - -#define CINODE_WAIT_RENAMEACK (1<<28) -#define CINODE_WAIT_RENAMENOTIFYACK (1<<29) - -#define CINODE_WAIT_CAPS (1<<30) - -#define CINODE_WAIT_ANY 0xffffffff - - - -// misc -#define CINODE_EXPORT_NONCE 1 // nonce given to replicas created by export -#define CINODE_HASHREPLICA_NONCE 1 // hashed inodes that are duped ???FIXME??? 
- -class Context; -class CDentry; -class CDir; -class Message; -class CInode; -class CInodeDiscover; -class MDCache; - - -ostream& operator<<(ostream& out, CInode& in); - - -// cached inode wrapper -class CInode : public MDSCacheObject { - public: - // -- pins -- - static const int PIN_CACHED = 1; - static const int PIN_DIR = 2; - static const int PIN_DIRTY = 4; // must flush - static const int PIN_PROXY = 5; // can't expire yet - static const int PIN_WAITER = 6; // waiter - static const int PIN_CAPS = 7; // local fh's - static const int PIN_AUTHPIN = 8; - static const int PIN_IMPORTING = 9; // multipurpose, for importing - static const int PIN_REQUEST = 10; // request is logging, finishing - static const int PIN_RENAMESRC = 11; // pinned on dest for foreign rename - static const int PIN_ANCHORING = 12; - - static const int PIN_OPENINGDIR = 13; - - static const int PIN_DENTRYLOCK = 14; - - static const char *pin_name(int p) { - switch (p) { - case PIN_CACHED: return "cached"; - case PIN_DIR: return "dir"; - case PIN_DIRTY: return "dirty"; - case PIN_PROXY: return "proxy"; - case PIN_WAITER: return "waiter"; - case PIN_CAPS: return "caps"; - case PIN_AUTHPIN: return "authpin"; - case PIN_IMPORTING: return "importing"; - case PIN_REQUEST: return "request"; - case PIN_RENAMESRC: return "renamesrc"; - case PIN_ANCHORING: return "anchoring"; - case PIN_OPENINGDIR: return "openingdir"; - case PIN_DENTRYLOCK: return "dentrylock"; - default: assert(0); - } - } - - // state - static const int STATE_AUTH = (1<<0); - static const int STATE_ROOT = (1<<1); - static const int STATE_DIRTY = (1<<2); - static const int STATE_UNSAFE = (1<<3); // not logged yet - static const int STATE_DANGLING = (1<<4); // delete me when i expire; i have no dentry - static const int STATE_UNLINKING = (1<<5); - static const int STATE_PROXY = (1<<6); // can't expire yet - static const int STATE_EXPORTING = (1<<7); // on nonauth bystander. 
- static const int STATE_ANCHORING = (1<<8); - static const int STATE_OPENINGDIR = (1<<9); - //static const int STATE_RENAMING = (1<<8); // moving me - //static const int STATE_RENAMINGTO = (1<<9); // rename target (will be unlinked) - - - - - public: - MDCache *mdcache; - - inode_t inode; // the inode itself - - CDir *dir; // directory, if we have it opened. - string symlink; // symlink dest, if symlink - - protected: - // parent dentries in cache - int num_parents; - CDentry *parent; // primary link - set remote_parents; // if hard linked - - // -- distributed caching - int dangling_auth; // explicit auth, when dangling. - - int num_request_pins; - - // waiters - multimap waiting; - - - // -- distributed state -- -public: - // inode metadata locks - CLock hardlock; - CLock filelock; -protected: - // file capabilities - map client_caps; // client -> caps - map mds_caps_wanted; // [auth] mds -> caps wanted - int replica_caps_wanted; // [replica] what i've requested from auth - utime_t replica_caps_wanted_keep_until; - - - private: - // lock nesting - int auth_pins; - int nested_auth_pins; - - public: - meta_load_t popularity[MDS_NPOP]; - - // friends - friend class Server; - friend class Locker; - friend class Migrator; - friend class MDCache; - friend class CDir; - friend class CInodeExport; - friend class CInodeDiscover; - - public: - // --------------------------- - CInode(MDCache *c, bool auth=true); - ~CInode(); - - - // -- accessors -- - bool is_file() { return ((inode.mode & INODE_TYPE_MASK) == INODE_MODE_FILE) ? true:false; } - bool is_symlink() { return ((inode.mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK) ? true:false; } - bool is_dir() { return ((inode.mode & INODE_TYPE_MASK) == INODE_MODE_DIR) ? 
true:false; } - - bool is_anchored() { return inode.anchored; } - - bool is_root() { return state & STATE_ROOT; } - bool is_proxy() { return state & STATE_PROXY; } - - bool is_auth() { return state & STATE_AUTH; } - void set_auth(bool auth); - - inodeno_t ino() { return inode.ino; } - inode_t& get_inode() { return inode; } - CDentry* get_parent_dn() { return parent; } - CDir *get_parent_dir(); - CInode *get_parent_inode(); - CInode *get_realm_root(); // import, hash, or root - - CDir *get_or_open_dir(MDCache *mdcache); - CDir *set_dir(CDir *newdir); - void close_dir(); - - bool dir_is_auth(); - - - - // -- misc -- - void make_path(string& s); - void make_anchor_trace(vector& trace); - - - - // -- state -- - bool is_unsafe() { return state & STATE_UNSAFE; } - bool is_dangling() { return state & STATE_DANGLING; } - bool is_unlinking() { return state & STATE_UNLINKING; } - - void mark_unsafe() { state |= STATE_UNSAFE; } - void mark_safe() { state &= ~STATE_UNSAFE; } - - // -- state encoding -- - //void encode_basic_state(bufferlist& r); - //void decode_basic_state(bufferlist& r, int& off); - - - void encode_file_state(bufferlist& r); - void decode_file_state(bufferlist& r, int& off); - - void encode_hard_state(bufferlist& r); - void decode_hard_state(bufferlist& r, int& off); - - - // -- dirtyness -- - version_t get_version() { return inode.version; } - - bool is_dirty() { return state & STATE_DIRTY; } - bool is_clean() { return !is_dirty(); } - - version_t pre_dirty(); - void _mark_dirty(); - void mark_dirty(version_t projected_dirv); - void mark_clean(); - - - - - CInodeDiscover* replicate_to(int rep); - - - // -- waiting -- - bool waiting_for(int tag); - void add_waiter(int tag, Context *c); - void take_waiting(int tag, list& ls); - void finish_waiting(int mask, int result = 0); - - - bool is_hardlock_write_wanted() { - return waiting_for(CINODE_WAIT_HARDW); - } - bool is_filelock_write_wanted() { - return waiting_for(CINODE_WAIT_FILEW); - } - - // -- caps -- (new) 
- // client caps - map& get_client_caps() { return client_caps; } - void add_client_cap(int client, Capability& cap) { - if (client_caps.empty()) - get(PIN_CAPS); - assert(client_caps.count(client) == 0); - client_caps[client] = cap; - } - void remove_client_cap(int client) { - assert(client_caps.count(client) == 1); - client_caps.erase(client); - if (client_caps.empty()) - put(PIN_CAPS); - } - Capability* get_client_cap(int client) { - if (client_caps.count(client)) - return &client_caps[client]; - return 0; - } - /* - void set_client_caps(map& cl) { - if (client_caps.empty() && !cl.empty()) - get(PIN_CAPS); - client_caps.clear(); - client_caps = cl; - } - */ - void take_client_caps(map& cl) { - if (!client_caps.empty()) - put(PIN_CAPS); - cl = client_caps; - client_caps.clear(); - } - void merge_client_caps(map& cl, set& new_client_caps) { - if (client_caps.empty() && !cl.empty()) - get(PIN_CAPS); - for (map::iterator it = cl.begin(); - it != cl.end(); - it++) { - new_client_caps.insert(it->first); - if (client_caps.count(it->first)) { - // merge - client_caps[it->first].merge(it->second); - } else { - // new - client_caps[it->first] = it->second; - } - } - } - - // caps issued, wanted - int get_caps_issued() { - int c = 0; - for (map::iterator it = client_caps.begin(); - it != client_caps.end(); - it++) - c |= it->second.issued(); - return c; - } - int get_caps_wanted() { - int w = 0; - for (map::iterator it = client_caps.begin(); - it != client_caps.end(); - it++) { - w |= it->second.wanted(); - //cout << " get_caps_wanted client " << it->first << " " << cap_string(it->second.wanted()) << endl; - } - if (is_auth()) - for (map::iterator it = mds_caps_wanted.begin(); - it != mds_caps_wanted.end(); - it++) { - w |= it->second; - //cout << " get_caps_wanted mds " << it->first << " " << cap_string(it->second) << endl; - } - return w; - } - - - void replicate_relax_locks() { - assert(is_auth()); - assert(!is_replicated()); - dout(10) << " relaxing locks on " << *this 
<< endl; - - if (hardlock.get_state() == LOCK_LOCK && - !hardlock.is_used()) { - dout(10) << " hard now sync " << *this << endl; - hardlock.set_state(LOCK_SYNC); - } - if (filelock.get_state() == LOCK_LOCK) { - if (!filelock.is_used() && - (get_caps_issued() & CAP_FILE_WR) == 0) { - filelock.set_state(LOCK_SYNC); - dout(10) << " file now sync " << *this << endl; - } else { - dout(10) << " can't relax filelock on " << *this << endl; - } - } - } - - - // -- authority -- - int authority(); - - - // -- auth pins -- - int is_auth_pinned() { - return auth_pins; - } - int adjust_nested_auth_pins(int a); - bool can_auth_pin(); - void auth_pin(); - void auth_unpin(); - - - // -- freeze -- - bool is_frozen(); - bool is_frozen_dir(); - bool is_freezing(); - - - // -- reference counting -- - - /* these can be pinned any # of times, and are - linked to an active_request, so they're automatically cleaned - up when a request is finished. pin at will! */ - void request_pin_get() { - if (num_request_pins == 0) get(PIN_REQUEST); - num_request_pins++; - } - void request_pin_put() { - num_request_pins--; - if (num_request_pins == 0) put(PIN_REQUEST); - assert(num_request_pins >= 0); - } - - void bad_put(int by) { - dout(7) << " bad put " << *this << " by " << by << " " << pin_name(by) << " was " << ref << " (" << ref_set << ")" << endl; - assert(ref_set.count(by) == 1); - assert(ref > 0); - } - void bad_get(int by) { - dout(7) << " bad get " << *this << " by " << by << " " << pin_name(by) << " was " << ref << " (" << ref_set << ")" << endl; - assert(ref_set.count(by) == 0); - } - void first_get(); - void last_put(); - - - // -- hierarchy stuff -- -private: - void get_parent(); - void put_parent(); - -public: - void set_primary_parent(CDentry *p) { - assert(parent == 0); - parent = p; - get_parent(); - } - void remove_primary_parent(CDentry *dn) { - assert(dn == parent); - parent = 0; - put_parent(); - } - void add_remote_parent(CDentry *p) { - if (remote_parents.empty()) - 
get_parent(); - remote_parents.insert(p); - } - void remove_remote_parent(CDentry *p) { - remote_parents.erase(p); - if (remote_parents.empty()) - put_parent(); - } - int num_remote_parents() { - return remote_parents.size(); - } - - - /* - // for giving to clients - void get_dist_spec(set& ls, int auth, timepair_t& now) { - if (( is_dir() && popularity[MDS_POP_CURDOM].get(now) > g_conf.mds_bal_replicate_threshold) || - (!is_dir() && popularity[MDS_POP_JUSTME].get(now) > g_conf.mds_bal_replicate_threshold)) { - //if (!cached_by.empty() && inode.ino > 1) dout(1) << "distributed spec for " << *this << endl; - ls = cached_by; - } - } - */ - - // dbg - void dump(int d = 0); -}; - - - - -// -- encoded state - -// discover - -class CInodeDiscover { - - inode_t inode; - int replica_nonce; - - int hardlock_state; - int filelock_state; - - public: - CInodeDiscover() {} - CInodeDiscover(CInode *in, int nonce) { - inode = in->inode; - replica_nonce = nonce; - - hardlock_state = in->hardlock.get_replica_state(); - filelock_state = in->filelock.get_replica_state(); - } - - inodeno_t get_ino() { return inode.ino; } - int get_replica_nonce() { return replica_nonce; } - - void update_inode(CInode *in) { - in->inode = inode; - - in->replica_nonce = replica_nonce; - in->hardlock.set_state(hardlock_state); - in->filelock.set_state(filelock_state); - } - - void _encode(bufferlist& bl) { - bl.append((char*)&inode, sizeof(inode)); - bl.append((char*)&replica_nonce, sizeof(replica_nonce)); - bl.append((char*)&hardlock_state, sizeof(hardlock_state)); - bl.append((char*)&filelock_state, sizeof(filelock_state)); - } - - void _decode(bufferlist& bl, int& off) { - bl.copy(off,sizeof(inode_t), (char*)&inode); - off += sizeof(inode_t); - bl.copy(off, sizeof(int), (char*)&replica_nonce); - off += sizeof(int); - bl.copy(off, sizeof(hardlock_state), (char*)&hardlock_state); - off += sizeof(hardlock_state); - bl.copy(off, sizeof(filelock_state), (char*)&filelock_state); - off += 
sizeof(filelock_state); - } - -}; - - -// export - -class CInodeExport { - - struct { - inode_t inode; - meta_load_t popularity_justme; - meta_load_t popularity_curdom; - bool is_dirty; // dirty inode? - - int num_caps; - } st; - - map replicas; - map cap_map; - - CLock hardlock,filelock; - //int remaining_issued; - -public: - CInodeExport() {} - CInodeExport(CInode *in) { - st.inode = in->inode; - st.is_dirty = in->is_dirty(); - replicas = in->replicas; - - hardlock = in->hardlock; - filelock = in->filelock; - - st.popularity_justme.take( in->popularity[MDS_POP_JUSTME] ); - st.popularity_curdom.take( in->popularity[MDS_POP_CURDOM] ); - in->popularity[MDS_POP_ANYDOM] -= st.popularity_curdom; - in->popularity[MDS_POP_NESTED] -= st.popularity_curdom; - - // steal WRITER caps from inode - in->take_client_caps(cap_map); - //remaining_issued = in->get_caps_issued(); - } - ~CInodeExport() { - } - - inodeno_t get_ino() { return st.inode.ino; } - - void update_inode(CInode *in, set& new_client_caps) { - in->inode = st.inode; - - in->popularity[MDS_POP_JUSTME] += st.popularity_justme; - in->popularity[MDS_POP_CURDOM] += st.popularity_curdom; - in->popularity[MDS_POP_ANYDOM] += st.popularity_curdom; - in->popularity[MDS_POP_NESTED] += st.popularity_curdom; - - if (st.is_dirty) - in->_mark_dirty(); - - in->replicas = replicas; - if (!replicas.empty()) - in->get(CInode::PIN_CACHED); - - in->hardlock = hardlock; - in->filelock = filelock; - - // caps - in->merge_client_caps(cap_map, new_client_caps); - } - - void _encode(bufferlist& bl) { - st.num_caps = cap_map.size(); - bl.append((char*)&st, sizeof(st)); - - // cached_by + nonce - ::_encode(replicas, bl); - - hardlock.encode_state(bl); - filelock.encode_state(bl); - - // caps - for (map::iterator it = cap_map.begin(); - it != cap_map.end(); - it++) { - bl.append((char*)&it->first, sizeof(it->first)); - it->second._encode(bl); - } - } - - int _decode(bufferlist& bl, int off = 0) { - bl.copy(off, sizeof(st), (char*)&st); - off 
+= sizeof(st); - - ::_decode(replicas, bl, off); - - hardlock.decode_state(bl, off); - filelock.decode_state(bl, off); - - // caps - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __CAPABILITY_H -#define __CAPABILITY_H - -#include "include/buffer.h" - -#include -using namespace std; - -#include "config.h" - - -// definite caps -#define CAP_FILE_RDCACHE 1 // client can safely cache reads -#define CAP_FILE_RD 2 // client can read -#define CAP_FILE_WR 4 // client can write -#define CAP_FILE_WREXTEND 8 // client can extend file -#define CAP_FILE_WRBUFFER 16 // client can safely buffer writes -#define CAP_FILE_LAZYIO 32 // client can perform lazy io - - -// heuristics -//#define CAP_FILE_DELAYFLUSH 32 - -inline string cap_string(int cap) -{ - string s; - s = "["; - if (cap & CAP_FILE_RDCACHE) s += " rdcache"; - if (cap & CAP_FILE_RD) s += " rd"; - if (cap & CAP_FILE_WR) s += " wr"; - if (cap & CAP_FILE_WRBUFFER) s += " wrbuffer"; - if (cap & CAP_FILE_WRBUFFER) s += " wrextend"; - if (cap & CAP_FILE_LAZYIO) s += " lazyio"; - s += " ]"; - return s; -} - - -class Capability { - int wanted_caps; // what the client wants (ideally) - - map cap_history; // seq -> cap - long last_sent, last_recv; - - bool suppress; - -public: - Capability(int want=0) : - wanted_caps(want), - last_sent(0), - last_recv(0), - suppress(false) { - //cap_history[last_sent] = 0; - } - - - bool is_suppress() { return suppress; } - void set_suppress(bool b) { suppress = b; } - - bool is_null() { return cap_history.empty(); } - - // most recently issued caps. 
- int pending() { - if (cap_history.count(last_sent)) - return cap_history[ last_sent ]; - return 0; - } - - // caps client has confirmed receipt of - int confirmed() { - if (cap_history.count(last_recv)) - return cap_history[ last_recv ]; - return 0; - } - - // caps potentially issued - int issued() { - int c = 0; - for (long seq = last_recv; seq <= last_sent; seq++) { - if (cap_history.count(seq)) { - c |= cap_history[seq]; - dout(10) << " cap issued: seq " << seq << " " << cap_string(cap_history[seq]) << " -> " << cap_string(c) << endl; - } - } - return c; - } - - // caps this client wants to hold - int wanted() { return wanted_caps; } - void set_wanted(int w) { - wanted_caps = w; - } - - // needed - static int needed(int from) { - // strip out wrbuffer, rdcache - return from & (CAP_FILE_WR|CAP_FILE_RD); - } - int needed() { return needed(wanted_caps); } - - // conflicts - static int conflicts(int from) { - int c = 0; - if (from & CAP_FILE_WRBUFFER) c |= CAP_FILE_RDCACHE|CAP_FILE_RD; - if (from & CAP_FILE_WR) c |= CAP_FILE_RDCACHE; - if (from & CAP_FILE_RD) c |= CAP_FILE_WRBUFFER; - if (from & CAP_FILE_RDCACHE) c |= CAP_FILE_WRBUFFER|CAP_FILE_WR; - return c; - } - int wanted_conflicts() { return conflicts(wanted()); } - int needed_conflicts() { return conflicts(needed()); } - int issued_conflicts() { return conflicts(issued()); } - - // issue caps; return seq number. - long issue(int c) { - //int was = pending(); - //no! if (c == was && last_sent) return -1; // repeat of previous? - - ++last_sent; - cap_history[last_sent] = c; - - /* no! - // not recalling, just adding? 
- if (c & ~was && - cap_history.count(last_sent-1)) { - cap_history.erase(last_sent-1); - } - */ - return last_sent; - } - long get_last_seq() { return last_sent; } - - void merge(Capability& other) { - // issued + pending - int newpending = other.pending() | pending(); - if (other.issued() & ~newpending) - issue(other.issued() | newpending); - issue(newpending); - - // wanted - wanted_caps = wanted_caps | other.wanted(); - } - - // confirm receipt of a previous sent/issued seq. - int confirm_receipt(long seq, int caps) { - int r = 0; - - // old seqs - while (last_recv < seq) { - dout(10) << " cap.confirm_receipt forgetting seq " << last_recv << " " << cap_string(cap_history[last_recv]) << endl; - r |= cap_history[last_recv]; - cap_history.erase(last_recv); - ++last_recv; - } - - // release current? - if (cap_history.count(seq) && - cap_history[seq] != caps) { - dout(10) << " cap.confirm_receipt revising seq " << seq << " " << cap_string(cap_history[seq]) << " -> " << cap_string(caps) << endl; - // note what we're releasing.. - assert(cap_history[seq] & ~caps); - r |= cap_history[seq] & ~caps; - - cap_history[seq] = caps; // confirmed() now less than before.. - } - - // null? - if (caps == 0 && - cap_history.size() == 1 && - cap_history.count(seq)) { - cap_history.clear(); // viola, null! 
- } - - return r; - } - - // serializers - void _encode(bufferlist& bl) { - bl.append((char*)&wanted_caps, sizeof(wanted_caps)); - bl.append((char*)&last_sent, sizeof(last_sent)); - bl.append((char*)&last_recv, sizeof(last_recv)); - ::_encode(cap_history, bl); - } - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(wanted_caps), (char*)&wanted_caps); - off += sizeof(wanted_caps); - bl.copy(off, sizeof(last_sent), (char*)&last_sent); - off += sizeof(last_sent); - bl.copy(off, sizeof(last_recv), (char*)&last_recv); - off += sizeof(last_recv); - ::_decode(cap_history, bl, off); - } - -}; - - - - - -#endif diff --git a/branches/marnberg/quota/mds/ClientMap.h b/branches/marnberg/quota/mds/ClientMap.h deleted file mode 100644 index 7cd1e496debdd..0000000000000 --- a/branches/marnberg/quota/mds/ClientMap.h +++ /dev/null @@ -1,85 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __CLIENTMAP_H -#define __CLIENTMAP_H - -#include "msg/Message.h" - -#include -using namespace std; - -#include -using namespace __gnu_cxx; - - -/* - * this structure is used by the MDS purely so that - * it can remember client addresses (entity_inst_t) - * while processing request(s) on behalf of clients. - * as such it's only really a sort of short-term cache. - * - * it also remembers which clients mounted via this MDS, - * for the same reason (so that mounted clients can be - * contacted if necessary). 
- */ -class ClientMap { - hash_map client_inst; - set client_mount; - hash_map client_ref; - - void inc_ref(int client, const entity_inst_t& inst) { - if (client_inst.count(client)) { - assert(client_inst[client] == inst); - assert(client_ref.count(client)); - } else { - client_inst[client] = inst; - } - client_ref[client]++; - } - void dec_ref(int client) { - assert(client_ref.count(client)); - assert(client_ref[client] > 0); - client_ref[client]--; - if (client_ref[client] == 0) { - client_ref.erase(client); - client_inst.erase(client); - } - } - -public: - const entity_inst_t& get_inst(int client) { - assert(client_inst.count(client)); - return client_inst[client]; - } - const set& get_mount_set() { return client_mount; } - - void add_mount(int client, const entity_inst_t& inst) { - inc_ref(client, inst); - client_mount.insert(client); - } - void rem_mount(int client) { - dec_ref(client); - client_mount.erase(client); - } - - - void add_open(int client, const entity_inst_t& inst) { - inc_ref(client, inst); - } - void dec_open(int client) { - dec_ref(client); - } -}; - -#endif diff --git a/branches/marnberg/quota/mds/IdAllocator.cc b/branches/marnberg/quota/mds/IdAllocator.cc deleted file mode 100644 index 671bd70a77c27..0000000000000 --- a/branches/marnberg/quota/mds/IdAllocator.cc +++ /dev/null @@ -1,200 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#define DBLEVEL 20 - -#include "IdAllocator.h" -#include "MDS.h" -#include "MDLog.h" -#include "events/EAlloc.h" - -#include "osdc/Filer.h" - -#include "include/types.h" - -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".idalloc: " - - -void IdAllocator::init_inode() -{ - memset(&inode, 0, sizeof(inode)); - inode.ino = MDS_INO_IDS_OFFSET + mds->get_nodeid(); - inode.layout = g_OSD_FileLayout; -} - - -idno_t IdAllocator::alloc_id(bool replay) -{ - assert(is_active()); - - // pick one - idno_t id = free.start(); - free.erase(id); - dout(10) << "idalloc " << this << ": alloc id " << id << endl; - - version++; - - // log it - if (!replay) - mds->mdlog->submit_entry(new EAlloc(IDTYPE_INO, id, EALLOC_EV_ALLOC, version)); - - return id; -} - -void IdAllocator::reclaim_id(idno_t id, bool replay) -{ - assert(is_active()); - - dout(10) << "idalloc " << this << ": reclaim id " << id << endl; - free.insert(id); - - version++; - - if (!replay) - mds->mdlog->submit_entry(new EAlloc(IDTYPE_INO, id, EALLOC_EV_FREE, version)); -} - - - -class C_ID_Save : public Context { - IdAllocator *ida; - version_t version; -public: - C_ID_Save(IdAllocator *i, version_t v) : ida(i), version(v) {} - void finish(int r) { - ida->save_2(version); - } -}; - -void IdAllocator::save(Context *onfinish, version_t v) -{ - if (v > 0 && v <= committing_version) { - dout(10) << "save v " << version << " - already saving " - << committing_version << " >= needed " << v << endl; - waitfor_save[v].push_back(onfinish); - return; - } - - dout(10) << "save v " << version << endl; - assert(is_active()); - - bufferlist bl; - - bl.append((char*)&version, sizeof(version)); - ::_encode(free.m, bl); - - committing_version = version; - - if (onfinish) - waitfor_save[version].push_back(onfinish); - - // write (async) - mds->filer->write(inode, - 0, bl.length(), bl, - 0, - 0, new C_ID_Save(this, version)); -} - -void 
IdAllocator::save_2(version_t v) -{ - dout(10) << "save_2 v " << v << endl; - - committed_version = v; - - list ls; - while (!waitfor_save.empty()) { - if (waitfor_save.begin()->first > v) break; - ls.splice(ls.end(), waitfor_save.begin()->second); - waitfor_save.erase(waitfor_save.begin()); - } - finish_contexts(ls,0); -} - - -void IdAllocator::reset() -{ - init_inode(); - - free.clear(); - - // use generic range FIXME THIS IS CRAP - free.insert((long long)0x1000000 * (long long)(mds->get_nodeid()+1), - (long long)0x1000000 * (long long)(mds->get_nodeid()+2) - 1LL); - //free[ID_INO].dump(); - - //free[ID_FH].map_insert(10000000LL * (mds->get_nodeid()+1), - //10000000LL * (mds->get_nodeid()+2) - 1); - //free[ID_FH].dump(); - - state = STATE_ACTIVE; -} - - - -// ----------------------- - -class C_ID_Load : public Context { -public: - IdAllocator *ida; - Context *onfinish; - bufferlist bl; - C_ID_Load(IdAllocator *i, Context *o) : ida(i), onfinish(o) {} - void finish(int r) { - ida->load_2(r, bl, onfinish); - } -}; - -void IdAllocator::load(Context *onfinish) -{ - dout(10) << "load" << endl; - - init_inode(); - - assert(is_undef()); - state = STATE_OPENING; - - C_ID_Load *c = new C_ID_Load(this, onfinish); - mds->filer->read(inode, - 0, inode.layout.stripe_size, - &c->bl, - c); -} - -void IdAllocator::load_2(int r, bufferlist& bl, Context *onfinish) -{ - assert(is_opening()); - state = STATE_ACTIVE; - - if (r > 0) { - dout(10) << "load_2 got " << bl.length() << " bytes" << endl; - int off = 0; - bl.copy(off, sizeof(version), (char*)&version); - off += sizeof(version); - ::_decode(free.m, bl, off); - committed_version = version; - } - else { - dout(10) << "load_2 found no alloc file" << endl; - assert(0); // this shouldn't happen if mkfs finished. 
- reset(); - } - - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } -} diff --git a/branches/marnberg/quota/mds/IdAllocator.h b/branches/marnberg/quota/mds/IdAllocator.h deleted file mode 100644 index c79266d3e71b6..0000000000000 --- a/branches/marnberg/quota/mds/IdAllocator.h +++ /dev/null @@ -1,79 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __IDALLOCATOR_H -#define __IDALLOCATOR_H - -#include "include/types.h" -#include "include/interval_set.h" -#include "include/buffer.h" -#include "include/Context.h" - -class MDS; - -#define IDTYPE_INO 1 -typedef inodeno_t idno_t; - -class IdAllocator { - MDS *mds; - inode_t inode; - - static const int STATE_UNDEF = 0; - static const int STATE_OPENING = 1; - static const int STATE_ACTIVE = 2; - //static const int STATE_COMMITTING = 3; - int state; - - version_t version, committing_version, committed_version; - - interval_set free; // unused ids - - map > waitfor_save; - - public: - IdAllocator(MDS *m) : - mds(m), - state(STATE_UNDEF), - version(0), committing_version(0), committed_version(0) - { - } - - void init_inode(); - - // alloc or reclaim ids - idno_t alloc_id(bool replay=false); - void reclaim_id(idno_t id, bool replay=false); - - version_t get_version() { return version; } - version_t get_committed_version() { return committed_version; } - - // load/save from disk (hack) - bool is_undef() { return state == STATE_UNDEF; } - bool is_active() { return state == STATE_ACTIVE; } - bool is_opening() { return state == STATE_OPENING; } - - void reset(); - void save(Context *onfinish=0, version_t need=0); - void save_2(version_t v); - - void shutdown() { - 
if (is_active()) save(0); - } - - void load(Context *onfinish); - void load_2(int, bufferlist&, Context *onfinish); - -}; - -#endif diff --git a/branches/marnberg/quota/mds/Lock.h b/branches/marnberg/quota/mds/Lock.h deleted file mode 100644 index 0d9dabb61b669..0000000000000 --- a/branches/marnberg/quota/mds/Lock.h +++ /dev/null @@ -1,321 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __LOCK_H -#define __LOCK_H - -#include -#include -using namespace std; - -#include "include/buffer.h" - -#include "Capability.h" - -// states and such. -// C = cache reads, R = read, W = write, A = append, B = buffer writes, L = lazyio - -// basic lock -----auth-------- ---replica------- -#define LOCK_SYNC 0 // AR R . / C R . . . L R . / C R . . . L stat() -#define LOCK_LOCK 1 // AR R W / C . . . . . . . / C . . . . . truncate() -#define LOCK_GLOCKR 2 // AR R . / C . . . . . . . / C . . . . . - -// file lock states -#define LOCK_GLOCKL 3 // A . . / . . . . . . loner -> lock -#define LOCK_GLOCKM 4 // A . . / . . . . . . -#define LOCK_MIXED 5 // AR . . / . R W A . L . . / . R . . . L -#define LOCK_GMIXEDR 6 // AR R . / . R . . . L . . / . R . . . L -#define LOCK_GMIXEDL 7 // A . . / . . . . . L loner -> mixed - -#define LOCK_LONER 8 // A . . / C R W A B L (lock) -#define LOCK_GLONERR 9 // A . . / . R . . . L -#define LOCK_GLONERM 10 // A . . / . R W A . L - -#define LOCK_GSYNCL 11 // A . . / C ? . . . L loner -> sync (*) FIXME: let old loner keep R, somehow... -#define LOCK_GSYNCM 12 // A . . / . R . . . 
L - -// 4 stable -// +9 transition -// 13 total - -/* no append scenarios: - -loner + truncate(): - - loner needs to lose A (?unless it's the loner doing the truncate?) -loner + statlite(size): - - loner needs to lose A - -any + statlite(size) - - all lose A - -any + statlite(mtime) - - all lose W - --> we need to add lonerfixed and mixedfixed states (and associated transitions) - in order to efficiently support statlite(size) and truncate(). until then, - we have to LOCK. - - */ - -// -- lock... hard or file - -class Message; - -class CLock { - protected: - // lock state - char state; - set gather_set; // auth - - // local state - int nread; - Message *wrlock_by; - - - public: - CLock() : - state(LOCK_SYNC), - nread(0), - wrlock_by(0) { - } - - // encode/decode - void encode_state(bufferlist& bl) { - bl.append((char*)&state, sizeof(state)); - _encode(gather_set, bl); - - //bl.append((char*)&nread, sizeof(nread)); - //bl.append((char*)&nwrite, sizeof(nwrite)); - } - void decode_state(bufferlist& bl, int& off) { - bl.copy(off, sizeof(state), (char*)&state); - off += sizeof(state); - _decode(gather_set, bl, off); - - //bl.copy(off, sizeof(nread), (char*)&nread); - //off += sizeof(nread); - //bl.copy(off, sizeof(nwrite), (char*)&nwrite); - //off += sizeof(nwrite); - } - - char get_state() { return state; } - char set_state(char s) { - state = s; - assert(!is_stable() || gather_set.size() == 0); // gather should be empty in stable states. - return s; - }; - - char get_replica_state() { - switch (state) { - case LOCK_LOCK: - case LOCK_GLOCKM: - case LOCK_GLOCKL: - case LOCK_GLOCKR: - case LOCK_LONER: - case LOCK_GLONERR: - case LOCK_GLONERM: - return LOCK_LOCK; - case LOCK_MIXED: - case LOCK_GMIXEDR: - return LOCK_MIXED; - case LOCK_SYNC: - return LOCK_SYNC; - - // after gather auth will bc LOCK_AC_MIXED or whatever - case LOCK_GSYNCM: - return LOCK_MIXED; - case LOCK_GSYNCL: - case LOCK_GMIXEDL: // ** LOCK isn't exact right state, but works. 
- return LOCK_LOCK; - - default: - assert(0); - } - return 0; - } - - // gather set - set& get_gather_set() { return gather_set; } - void init_gather(const map& i) { - for (map::const_iterator p = i.begin(); p != i.end(); ++p) - gather_set.insert(p->first); - } - bool is_gathering(int i) { - return gather_set.count(i); - } - void clear_gather() { - gather_set.clear(); - } - - // ref counting - int get_read() { return ++nread; } - int put_read() { - assert(nread>0); - return --nread; - } - int get_nread() { return nread; } - - void get_write(Message *who) { - assert(wrlock_by == 0); - wrlock_by = who; - } - void put_write() { - assert(wrlock_by); - wrlock_by = 0; - } - bool is_wrlocked() { return wrlock_by ? true:false; } - Message *get_wrlocked_by() { return wrlock_by; } - bool is_used() { - return (is_wrlocked() || (nread>0)) ? true:false; - } - - - // stable - bool is_stable() { - return (state == LOCK_SYNC) || - (state == LOCK_LOCK) || - (state == LOCK_MIXED) || - (state == LOCK_LONER); - } - - // read/write access - bool can_read(bool auth) { - if (auth) - return (state == LOCK_SYNC) || (state == LOCK_GMIXEDR) - || (state == LOCK_GLOCKR) || (state == LOCK_LOCK); - else - return (state == LOCK_SYNC); - } - bool can_read_soon(bool auth) { - if (auth) - return (state == LOCK_GLOCKL); - else - return false; - } - - bool can_write(bool auth) { - if (auth) - return (state == LOCK_LOCK) && !is_wrlocked(); - else - return false; - } - bool can_write_soon(bool auth) { - if (auth) - return (state == LOCK_GLOCKR) || (state == LOCK_GLOCKL) - || (state == LOCK_GLOCKM); - else - return false; - } - - // client caps allowed - int caps_allowed_ever(bool auth) { - if (auth) - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_WRBUFFER | CAP_FILE_LAZYIO; - else - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO; - } - int caps_allowed(bool auth) { - if (auth) - switch (state) { - case LOCK_SYNC: - return CAP_FILE_RDCACHE | CAP_FILE_RD | 
CAP_FILE_LAZYIO; - case LOCK_LOCK: - case LOCK_GLOCKR: - return CAP_FILE_RDCACHE; - - case LOCK_GLOCKL: - case LOCK_GLOCKM: - return 0; - - case LOCK_MIXED: - return CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_LAZYIO; - case LOCK_GMIXEDR: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - case LOCK_GMIXEDL: - return 0; - - case LOCK_LONER: // single client writer, of course. - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_WRBUFFER | CAP_FILE_LAZYIO; - case LOCK_GLONERR: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - case LOCK_GLONERM: - return CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_LAZYIO; - - case LOCK_GSYNCL: - return CAP_FILE_RDCACHE | CAP_FILE_LAZYIO; - case LOCK_GSYNCM: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - } - else - switch (state) { - case LOCK_SYNC: - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO; - case LOCK_LOCK: - case LOCK_GLOCKR: - return CAP_FILE_RDCACHE; - case LOCK_GMIXEDR: - case LOCK_MIXED: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - } - assert(0); - return 0; - } - - friend class MDCache; - friend class Locker; - friend class Migrator; -}; - -//ostream& operator<<(ostream& out, CLock& l); -inline ostream& operator<<(ostream& out, CLock& l) -{ - static char* __lock_states[] = { - "sync", - "lock", - "glockr", - "glockl", - "glockm", - "mixed", - "gmixedr", - "gmixedl", - "loner", - "glonerr", - "glonerm", - "gsyncl", - "gsyncm" - }; - - out << "(" << __lock_states[(int)l.get_state()]; - - if (!l.get_gather_set().empty()) out << " g=" << l.get_gather_set(); - - if (l.get_nread()) - out << " r=" << l.get_nread(); - if (l.is_wrlocked()) - out << " w=" << l.get_wrlocked_by(); - - // rw? 
- /* - out << " "; - if (l.can_read(true)) out << "r[" << l.get_nread() << "]"; - if (l.can_write(true)) out << "w[" << l.get_nwrite() << "]"; - out << "/"; - if (l.can_read(false)) out << "r[" << l.get_nread() << "]"; - if (l.can_write(false)) out << "w[" << l.get_nwrite() << "]"; - */ - out << ")"; - return out; -} - -#endif diff --git a/branches/marnberg/quota/mds/Locker.cc b/branches/marnberg/quota/mds/Locker.cc deleted file mode 100644 index 7cecbfe785fbc..0000000000000 --- a/branches/marnberg/quota/mds/Locker.cc +++ /dev/null @@ -1,2237 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "MDS.h" -#include "MDCache.h" -#include "Locker.h" -#include "Server.h" -#include "CInode.h" -#include "CDir.h" -#include "CDentry.h" -#include "Migrator.h" - -#include "MDBalancer.h" -#include "MDLog.h" -#include "MDSMap.h" - -#include "include/filepath.h" - -#include "events/EString.h" -#include "events/EUpdate.h" -#include "events/EUnlink.h" - -#include "msg/Messenger.h" - -#include "messages/MGenericMessage.h" -#include "messages/MDiscover.h" -#include "messages/MDiscoverReply.h" - -#include "messages/MDirUpdate.h" - -#include "messages/MInodeFileCaps.h" - -#include "messages/MInodeLink.h" -#include "messages/MInodeLinkAck.h" -#include "messages/MInodeUnlink.h" -#include "messages/MInodeUnlinkAck.h" - -#include "messages/MLock.h" -#include "messages/MDentryUnlink.h" - -#include "messages/MClientRequest.h" -#include "messages/MClientFileCaps.h" - -#include -#include - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << 
".locker " - - - -void Locker::dispatch(Message *m) -{ - switch (m->get_type()) { - - // locking - case MSG_MDS_LOCK: - handle_lock((MLock*)m); - break; - - // cache fun - case MSG_MDS_INODEFILECAPS: - handle_inode_file_caps((MInodeFileCaps*)m); - break; - - case MSG_CLIENT_FILECAPS: - handle_client_file_caps((MClientFileCaps*)m); - break; - - - - default: - assert(0); - } -} - - -void Locker::send_lock_message(CInode *in, int msg, int type) -{ - for (map::iterator it = in->replicas_begin(); - it != in->replicas_end(); - it++) { - MLock *m = new MLock(msg, mds->get_nodeid()); - m->set_ino(in->ino(), type); - mds->send_message_mds(m, it->first, MDS_PORT_LOCKER); - } -} - - -void Locker::send_lock_message(CInode *in, int msg, int type, bufferlist& data) -{ - for (map::iterator it = in->replicas_begin(); - it != in->replicas_end(); - it++) { - MLock *m = new MLock(msg, mds->get_nodeid()); - m->set_ino(in->ino(), type); - m->set_data(data); - mds->send_message_mds(m, it->first, MDS_PORT_LOCKER); - } -} - -void Locker::send_lock_message(CDentry *dn, int msg) -{ - for (map::iterator it = dn->replicas_begin(); - it != dn->replicas_end(); - it++) { - MLock *m = new MLock(msg, mds->get_nodeid()); - m->set_dn(dn->dir->ino(), dn->name); - mds->send_message_mds(m, it->first, MDS_PORT_LOCKER); - } -} - - - -// file i/o ----------------------------------------- - -__uint64_t Locker::issue_file_data_version(CInode *in) -{ - dout(7) << "issue_file_data_version on " << *in << endl; - return in->inode.file_data_version; -} - - -Capability* Locker::issue_new_caps(CInode *in, - int mode, - MClientRequest *req) -{ - dout(7) << "issue_new_caps for mode " << mode << " on " << *in << endl; - - // my needs - int my_client = req->get_client(); - int my_want = 0; - if (mode & FILE_MODE_R) my_want |= CAP_FILE_RDCACHE | CAP_FILE_RD; - if (mode & FILE_MODE_W) my_want |= CAP_FILE_WRBUFFER | CAP_FILE_WR; - - // register a capability - Capability *cap = in->get_client_cap(my_client); - if (!cap) { 
- // new cap - Capability c(my_want); - in->add_client_cap(my_client, c); - cap = in->get_client_cap(my_client); - - // note client addr - mds->clientmap.add_open(my_client, req->get_client_inst()); - - } else { - // make sure it has sufficient caps - if (cap->wanted() & ~my_want) { - // augment wanted caps for this client - cap->set_wanted( cap->wanted() | my_want ); - } - } - - // suppress file cap messages for this guy for a few moments (we'll bundle with the open() reply) - cap->set_suppress(true); - int before = cap->pending(); - - if (in->is_auth()) { - // [auth] twiddle mode? - inode_file_eval(in); - } else { - // [replica] tell auth about any new caps wanted - request_inode_file_caps(in); - } - - // issue caps (pot. incl new one) - issue_caps(in); // note: _eval above may have done this already... - - // re-issue whatever we can - cap->issue(cap->pending()); - - // ok, stop suppressing. - cap->set_suppress(false); - - int now = cap->pending(); - if (before != now && - (before & CAP_FILE_WR) == 0 && - (now & CAP_FILE_WR)) { - // FIXME FIXME FIXME - } - - // twiddle file_data_version? - if ((before & CAP_FILE_WRBUFFER) == 0 && - (now & CAP_FILE_WRBUFFER)) { - in->inode.file_data_version++; - dout(7) << " incrementing file_data_version, now " << in->inode.file_data_version << " for " << *in << endl; - } - - return cap; -} - - - -bool Locker::issue_caps(CInode *in) -{ - // allowed caps are determined by the lock mode. 
- int allowed = in->filelock.caps_allowed(in->is_auth()); - dout(7) << "issue_caps filelock allows=" << cap_string(allowed) - << " on " << *in << endl; - - // count conflicts with - int nissued = 0; - - // client caps - for (map::iterator it = in->client_caps.begin(); - it != in->client_caps.end(); - it++) { - if (it->second.issued() != (it->second.wanted() & allowed)) { - // issue - nissued++; - - int before = it->second.pending(); - long seq = it->second.issue(it->second.wanted() & allowed); - int after = it->second.pending(); - - // twiddle file_data_version? - if (!(before & CAP_FILE_WRBUFFER) && - (after & CAP_FILE_WRBUFFER)) { - dout(7) << " incrementing file_data_version for " << *in << endl; - in->inode.file_data_version++; - } - - if (seq > 0 && - !it->second.is_suppress()) { - dout(7) << " sending MClientFileCaps to client" << it->first << " seq " << it->second.get_last_seq() << " new pending " << cap_string(it->second.pending()) << " was " << cap_string(before) << endl; - mds->messenger->send_message(new MClientFileCaps(in->inode, - it->second.get_last_seq(), - it->second.pending(), - it->second.wanted()), - mds->clientmap.get_inst(it->first), - 0, MDS_PORT_LOCKER); - } - } - } - - return (nissued == 0); // true if no re-issued, no callbacks -} - - - -void Locker::request_inode_file_caps(CInode *in) -{ - int wanted = in->get_caps_wanted(); - if (wanted != in->replica_caps_wanted) { - - if (wanted == 0) { - if (in->replica_caps_wanted_keep_until > g_clock.recent_now()) { - // ok, release them finally! 
- in->replica_caps_wanted_keep_until.sec_ref() = 0; - dout(7) << "request_inode_file_caps " << cap_string(wanted) - << " was " << cap_string(in->replica_caps_wanted) - << " no keeping anymore " - << " on " << *in - << endl; - } - else if (in->replica_caps_wanted_keep_until.sec() == 0) { - in->replica_caps_wanted_keep_until = g_clock.recent_now(); - in->replica_caps_wanted_keep_until.sec_ref() += 2; - - dout(7) << "request_inode_file_caps " << cap_string(wanted) - << " was " << cap_string(in->replica_caps_wanted) - << " keeping until " << in->replica_caps_wanted_keep_until - << " on " << *in - << endl; - return; - } else { - // wait longer - return; - } - } else { - in->replica_caps_wanted_keep_until.sec_ref() = 0; - } - assert(!in->is_auth()); - - int auth = in->authority(); - dout(7) << "request_inode_file_caps " << cap_string(wanted) - << " was " << cap_string(in->replica_caps_wanted) - << " on " << *in << " to mds" << auth << endl; - assert(!in->is_auth()); - - in->replica_caps_wanted = wanted; - mds->send_message_mds(new MInodeFileCaps(in->ino(), mds->get_nodeid(), - in->replica_caps_wanted), - auth, MDS_PORT_LOCKER); - } else { - in->replica_caps_wanted_keep_until.sec_ref() = 0; - } -} - -void Locker::handle_inode_file_caps(MInodeFileCaps *m) -{ - CInode *in = mdcache->get_inode(m->get_ino()); - assert(in); - assert(in->is_auth() || in->is_proxy()); - - dout(7) << "handle_inode_file_caps replica mds" << m->get_from() << " wants caps " << cap_string(m->get_caps()) << " on " << *in << endl; - - if (in->is_proxy()) { - dout(7) << "proxy, fw" << endl; - mds->send_message_mds(m, in->authority(), MDS_PORT_LOCKER); - return; - } - - if (m->get_caps()) - in->mds_caps_wanted[m->get_from()] = m->get_caps(); - else - in->mds_caps_wanted.erase(m->get_from()); - - inode_file_eval(in); - delete m; -} - - -/* - * note: we only get these from the client if - * - we are calling back previously issued caps (fewer than the client previously had) - * - or if the client releases 
(any of) its caps on its own - */ -void Locker::handle_client_file_caps(MClientFileCaps *m) -{ - int client = m->get_source().num(); - CInode *in = mdcache->get_inode(m->get_ino()); - Capability *cap = 0; - if (in) - cap = in->get_client_cap(client); - - if (!in || !cap) { - if (!in) { - dout(7) << "handle_client_file_caps on unknown ino " << m->get_ino() << ", dropping" << endl; - } else { - dout(7) << "handle_client_file_caps no cap for client" << client << " on " << *in << endl; - } - delete m; - return; - } - - assert(cap); - - // filter wanted based on what we could ever give out (given auth/replica status) - int wanted = m->get_wanted() & in->filelock.caps_allowed_ever(in->is_auth()); - - dout(7) << "handle_client_file_caps seq " << m->get_seq() - << " confirms caps " << cap_string(m->get_caps()) - << " wants " << cap_string(wanted) - << " from client" << client - << " on " << *in - << endl; - - // update wanted - if (cap->wanted() != wanted) - cap->set_wanted(wanted); - - // confirm caps - int had = cap->confirm_receipt(m->get_seq(), m->get_caps()); - int has = cap->confirmed(); - if (cap->is_null()) { - dout(7) << " cap for client" << client << " is now null, removing from " << *in << endl; - in->remove_client_cap(client); - if (!in->is_auth()) - request_inode_file_caps(in); - - // dec client addr counter - mds->clientmap.dec_open(client); - - // tell client. - MClientFileCaps *r = new MClientFileCaps(in->inode, - 0, 0, 0, - MClientFileCaps::FILECAP_RELEASE); - mds->messenger->send_message(r, m->get_source_inst(), 0, MDS_PORT_LOCKER); - } - - // merge in atime? 
- if (m->get_inode().atime > in->inode.atime) { - dout(7) << " taking atime " << m->get_inode().atime << " > " - << in->inode.atime << " for " << *in << endl; - in->inode.atime = m->get_inode().atime; - } - - if ((has|had) & CAP_FILE_WR) { - bool dirty = false; - - // mtime - if (m->get_inode().mtime > in->inode.mtime) { - dout(7) << " taking mtime " << m->get_inode().mtime << " > " - << in->inode.mtime << " for " << *in << endl; - in->inode.mtime = m->get_inode().mtime; - dirty = true; - } - // size - if (m->get_inode().size > in->inode.size) { - dout(7) << " taking size " << m->get_inode().size << " > " - << in->inode.size << " for " << *in << endl; - in->inode.size = m->get_inode().size; - dirty = true; - } - - if (dirty) - mds->mdlog->submit_entry(new EString("cap inode update dirty fixme")); - } - - // reevaluate, waiters - inode_file_eval(in); - in->finish_waiting(CINODE_WAIT_CAPS, 0); - - delete m; -} - - - - - - - - - - -// locks ---------------------------------------------------------------- - -/* - - -INODES: - -= two types of inode metadata: - hard - uid/gid, mode - file - mtime, size - ? atime - atime (*) <-- we want a lazy update strategy? - -= correspondingly, two types of inode locks: - hardlock - hard metadata - filelock - file metadata - - -> These locks are completely orthogonal! - -= metadata ops and how they affect inode metadata: - sma=size mtime atime - HARD FILE OP - files: - R RRR stat - RW chmod/chown - R W touch ?ctime - R openr - W read atime - R openw - Wc openwc ?ctime - WW write size mtime - close - - dirs: - R W readdir atime - RRR ( + implied stats on files) - Rc WW mkdir (ctime on new dir, size+mtime on parent dir) - R WW link/unlink/rename/rmdir (size+mtime on dir) - - - -= relationship to client (writers): - - - ops in question are - - stat ... need reasonable value for mtime (+ atime?) - - maybe we want a "quicksync" type operation instead of full lock - - truncate ... 
need to stop writers for the atomic truncate operation - - need a full lock - - - - -= modes - - SYNC - Rauth Rreplica Wauth Wreplica - sync - - - - - -ALSO: - - dirlock - no dir changes (prior to unhashing) - denlock - dentry lock (prior to unlink, rename) - - -*/ - - -void Locker::handle_lock(MLock *m) -{ - switch (m->get_otype()) { - case LOCK_OTYPE_IHARD: - handle_lock_inode_hard(m); - break; - - case LOCK_OTYPE_IFILE: - handle_lock_inode_file(m); - break; - - case LOCK_OTYPE_DIR: - handle_lock_dir(m); - break; - - case LOCK_OTYPE_DN: - handle_lock_dn(m); - break; - - default: - dout(7) << "handle_lock got otype " << m->get_otype() << endl; - assert(0); - break; - } -} - - - -// =============================== -// hard inode metadata - -bool Locker::inode_hard_read_try(CInode *in, Context *con) -{ - dout(7) << "inode_hard_read_try on " << *in << endl; - - // can read? grab ref. - if (in->hardlock.can_read(in->is_auth())) - return true; - - assert(!in->is_auth()); - - // wait! - dout(7) << "inode_hard_read_try waiting on " << *in << endl; - in->add_waiter(CINODE_WAIT_HARDR, con); - return false; -} - -bool Locker::inode_hard_read_start(CInode *in, MClientRequest *m) -{ - dout(7) << "inode_hard_read_start on " << *in << endl; - - // can read? grab ref. - if (in->hardlock.can_read(in->is_auth())) { - in->hardlock.get_read(); - return true; - } - - // can't read, and replicated. - assert(!in->is_auth()); - - // wait! 
- dout(7) << "inode_hard_read_start waiting on " << *in << endl; - in->add_waiter(CINODE_WAIT_HARDR, new C_MDS_RetryRequest(mds, m, in)); - return false; -} - - -void Locker::inode_hard_read_finish(CInode *in) -{ - // drop ref - assert(in->hardlock.can_read(in->is_auth())); - in->hardlock.put_read(); - - dout(7) << "inode_hard_read_finish on " << *in << endl; - - //if (in->hardlock.get_nread() == 0) in->finish_waiting(CINODE_WAIT_HARDNORD); -} - - -bool Locker::inode_hard_write_start(CInode *in, MClientRequest *m) -{ - dout(7) << "inode_hard_write_start on " << *in << endl; - - // if not replicated, i can twiddle lock at will - if (in->is_auth() && - !in->is_replicated() && - in->hardlock.get_state() != LOCK_LOCK) - in->hardlock.set_state(LOCK_LOCK); - - // can write? grab ref. - if (in->hardlock.can_write(in->is_auth())) { - assert(in->is_auth()); - if (!in->can_auth_pin()) { - dout(7) << "inode_hard_write_start waiting for authpinnable on " << *in << endl; - in->add_waiter(CINODE_WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, m, in)); - return false; - } - - in->auth_pin(); // ugh, can't condition this on nwrite==0 bc we twiddle that in handle_lock_* - in->hardlock.get_write(m); - return true; - } - - // can't write, replicated. 
- if (in->is_auth()) { - // auth - if (in->hardlock.can_write_soon(in->is_auth())) { - // just wait - } else { - // initiate lock - inode_hard_lock(in); - } - - dout(7) << "inode_hard_write_start waiting on " << *in << endl; - in->add_waiter(CINODE_WAIT_HARDW, new C_MDS_RetryRequest(mds, m, in)); - - return false; - } else { - // replica - // fw to auth - int auth = in->authority(); - dout(7) << "inode_hard_write_start " << *in << " on replica, fw to auth " << auth << endl; - assert(auth != mds->get_nodeid()); - mdcache->request_forward(m, auth); - return false; - } -} - - -void Locker::inode_hard_write_finish(CInode *in) -{ - // drop ref - //assert(in->hardlock.can_write(in->is_auth())); - in->hardlock.put_write(); - in->auth_unpin(); - dout(7) << "inode_hard_write_finish on " << *in << endl; - - // others waiting? - if (in->is_hardlock_write_wanted()) { - // wake 'em up - in->take_waiting(CINODE_WAIT_HARDW, mds->finished_queue); - } else { - // auto-sync if alone. - if (in->is_auth() && - !in->is_replicated() && - in->hardlock.get_state() != LOCK_SYNC) - in->hardlock.set_state(LOCK_SYNC); - - inode_hard_eval(in); - } -} - - -void Locker::inode_hard_eval(CInode *in) -{ - // finished gather? - if (in->is_auth() && - !in->hardlock.is_stable() && - in->hardlock.gather_set.empty()) { - dout(7) << "inode_hard_eval finished gather on " << *in << endl; - switch (in->hardlock.get_state()) { - case LOCK_GLOCKR: - in->hardlock.set_state(LOCK_LOCK); - - // waiters - //in->hardlock.get_write(); - in->finish_waiting(CINODE_WAIT_HARDRWB|CINODE_WAIT_HARDSTABLE); - //in->hardlock.put_write(); - break; - - default: - assert(0); - } - } - if (!in->hardlock.is_stable()) return; - - if (in->is_auth()) { - - // sync? 
- if (in->is_replicated() && - in->is_hardlock_write_wanted() && - in->hardlock.get_state() != LOCK_SYNC) { - dout(7) << "inode_hard_eval stable, syncing " << *in << endl; - inode_hard_sync(in); - } - - } else { - // replica - } -} - - -// mid - -void Locker::inode_hard_sync(CInode *in) -{ - dout(7) << "inode_hard_sync on " << *in << endl; - assert(in->is_auth()); - - // check state - if (in->hardlock.get_state() == LOCK_SYNC) - return; // already sync - if (in->hardlock.get_state() == LOCK_GLOCKR) - assert(0); // um... hmm! - assert(in->hardlock.get_state() == LOCK_LOCK); - - // hard data - bufferlist harddata; - in->encode_hard_state(harddata); - - // bcast to replicas - send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IHARD, harddata); - - // change lock - in->hardlock.set_state(LOCK_SYNC); - - // waiters? - in->finish_waiting(CINODE_WAIT_HARDSTABLE); -} - -void Locker::inode_hard_lock(CInode *in) -{ - dout(7) << "inode_hard_lock on " << *in << " hardlock=" << in->hardlock << endl; - assert(in->is_auth()); - - // check state - if (in->hardlock.get_state() == LOCK_LOCK || - in->hardlock.get_state() == LOCK_GLOCKR) - return; // already lock or locking - assert(in->hardlock.get_state() == LOCK_SYNC); - - // bcast to replicas - send_lock_message(in, LOCK_AC_LOCK, LOCK_OTYPE_IHARD); - - // change lock - in->hardlock.set_state(LOCK_GLOCKR); - in->hardlock.init_gather(in->get_replicas()); -} - - - - - -// messenger - -void Locker::handle_lock_inode_hard(MLock *m) -{ - assert(m->get_otype() == LOCK_OTYPE_IHARD); - - if (mds->logger) mds->logger->inc("lih"); - - int from = m->get_asker(); - CInode *in = mdcache->get_inode(m->get_ino()); - - if (LOCK_AC_FOR_AUTH(m->get_action())) { - // auth - assert(in); - assert(in->is_auth() || in->is_proxy()); - dout(7) << "handle_lock_inode_hard " << *in << " hardlock=" << in->hardlock << endl; - - if (in->is_proxy()) { - // fw - int newauth = in->authority(); - assert(newauth >= 0); - if (from == newauth) { - dout(7) << "handle_lock " 
<< m->get_ino() << " from " << from << ": proxy, but from new auth, dropping" << endl; - delete m; - } else { - dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, fw to " << newauth << endl; - mds->send_message_mds(m, newauth, MDS_PORT_LOCKER); - } - return; - } - } else { - // replica - if (!in) { - dout(7) << "handle_lock_inode_hard " << m->get_ino() << ": don't have it anymore" << endl; - /* do NOT nak.. if we go that route we need to duplicate all the nonce funkiness - to keep gather_set a proper/correct subset of cached_by. better to use the existing - cacheexpire mechanism instead! - */ - delete m; - return; - } - - assert(!in->is_auth()); - } - - dout(7) << "handle_lock_inode_hard a=" << m->get_action() << " from " << from << " " << *in << " hardlock=" << in->hardlock << endl; - - CLock *lock = &in->hardlock; - - switch (m->get_action()) { - // -- replica -- - case LOCK_AC_SYNC: - assert(lock->get_state() == LOCK_LOCK); - - { // assim data - int off = 0; - in->decode_hard_state(m->get_data(), off); - } - - // update lock - lock->set_state(LOCK_SYNC); - - // no need to reply - - // waiters - in->finish_waiting(CINODE_WAIT_HARDR|CINODE_WAIT_HARDSTABLE); - break; - - case LOCK_AC_LOCK: - assert(lock->get_state() == LOCK_SYNC); - //|| lock->get_state() == LOCK_GLOCKR); - - // wait for readers to finish? - if (lock->get_nread() > 0) { - dout(7) << "handle_lock_inode_hard readers, waiting before ack on " << *in << endl; - lock->set_state(LOCK_GLOCKR); - in->add_waiter(CINODE_WAIT_HARDNORD, - new C_MDS_RetryMessage(mds,m)); - assert(0); // does this ever happen? (if so, fix hard_read_finish, and CInodeExport.update_inode!) 
- return; - } else { - - // update lock and reply - lock->set_state(LOCK_LOCK); - - { - MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid()); - reply->set_ino(in->ino(), LOCK_OTYPE_IHARD); - mds->send_message_mds(reply, from, MDS_PORT_LOCKER); - } - } - break; - - - // -- auth -- - case LOCK_AC_LOCKACK: - assert(lock->state == LOCK_GLOCKR); - assert(lock->gather_set.count(from)); - lock->gather_set.erase(from); - - if (lock->gather_set.size()) { - dout(7) << "handle_lock_inode_hard " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; - } else { - dout(7) << "handle_lock_inode_hard " << *in << " from " << from << ", last one" << endl; - inode_hard_eval(in); - } - } - delete m; -} - - - - -// ===================== -// soft inode metadata - - -bool Locker::inode_file_read_start(CInode *in, MClientRequest *m) -{ - dout(7) << "inode_file_read_start " << *in << " filelock=" << in->filelock << endl; - - // can read? grab ref. - if (in->filelock.can_read(in->is_auth())) { - in->filelock.get_read(); - return true; - } - - // can't read, and replicated. - if (in->filelock.can_read_soon(in->is_auth())) { - // wait - dout(7) << "inode_file_read_start can_read_soon " << *in << endl; - } else { - if (in->is_auth()) { - // auth - - // FIXME or qsync? 
- - if (in->filelock.is_stable()) { - inode_file_lock(in); // lock, bc easiest to back off - - if (in->filelock.can_read(in->is_auth())) { - in->filelock.get_read(); - - //in->filelock.get_write(); - in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE); - //in->filelock.put_write(); - return true; - } - } else { - dout(7) << "inode_file_read_start waiting until stable on " << *in << ", filelock=" << in->filelock << endl; - in->add_waiter(CINODE_WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, in)); - return false; - } - } else { - // replica - if (in->filelock.is_stable()) { - - // fw to auth - int auth = in->authority(); - dout(7) << "inode_file_read_start " << *in << " on replica and async, fw to auth " << auth << endl; - assert(auth != mds->get_nodeid()); - mdcache->request_forward(m, auth); - return false; - - } else { - // wait until stable - dout(7) << "inode_file_read_start waiting until stable on " << *in << ", filelock=" << in->filelock << endl; - in->add_waiter(CINODE_WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, in)); - return false; - } - } - } - - // wait - dout(7) << "inode_file_read_start waiting on " << *in << ", filelock=" << in->filelock << endl; - in->add_waiter(CINODE_WAIT_FILER, new C_MDS_RetryRequest(mds, m, in)); - - return false; -} - - -void Locker::inode_file_read_finish(CInode *in) -{ - // drop ref - assert(in->filelock.can_read(in->is_auth())); - in->filelock.put_read(); - - dout(7) << "inode_file_read_finish on " << *in << ", filelock=" << in->filelock << endl; - - if (in->filelock.get_nread() == 0) { - in->finish_waiting(CINODE_WAIT_FILENORD); - inode_file_eval(in); - } -} - - -bool Locker::inode_file_write_start(CInode *in, MClientRequest *m) -{ - // can't write? - if (!in->filelock.can_write(in->is_auth())) { - - // can't write. 
- if (in->is_auth()) { - // auth - if (!in->filelock.can_write_soon(in->is_auth())) { - if (!in->filelock.is_stable()) { - dout(7) << "inode_file_write_start on auth, waiting for stable on " << *in << endl; - in->add_waiter(CINODE_WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, in)); - return false; - } - - // initiate lock - inode_file_lock(in); - - // fall-thru to below. - } - } else { - // replica - // fw to auth - int auth = in->authority(); - dout(7) << "inode_file_write_start " << *in << " on replica, fw to auth " << auth << endl; - assert(auth != mds->get_nodeid()); - mdcache->request_forward(m, auth); - return false; - } - } - - // check again - if (in->filelock.can_write(in->is_auth())) { - // can i auth pin? - assert(in->is_auth()); - if (!in->can_auth_pin()) { - dout(7) << "inode_file_write_start waiting for authpinnable on " << *in << endl; - in->add_waiter(CINODE_WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, m, in)); - return false; - } - - in->auth_pin(); - in->filelock.get_write(m); - return true; - } else { - dout(7) << "inode_file_write_start on auth, waiting for write on " << *in << endl; - in->add_waiter(CINODE_WAIT_FILEW, new C_MDS_RetryRequest(mds, m, in)); - return false; - } -} - - -void Locker::inode_file_write_finish(CInode *in) -{ - // drop ref - assert(in->filelock.can_write(in->is_auth())); - in->filelock.put_write(); - dout(7) << "inode_file_write_finish on " << *in << ", filelock=" << in->filelock << endl; - - // drop lock? - if (!in->is_filelock_write_wanted()) { - in->finish_waiting(CINODE_WAIT_FILENOWR); - inode_file_eval(in); - } -} - - -/* - * ... - * - * also called after client caps are acked to us - * - checks if we're in unstable sfot state and can now move on to next state - * - checks if soft state should change (eg bc last writer closed) - */ - -void Locker::inode_file_eval(CInode *in) -{ - int issued = in->get_caps_issued(); - - // [auth] finished gather? 
- if (in->is_auth() && - !in->filelock.is_stable() && - in->filelock.gather_set.size() == 0) { - dout(7) << "inode_file_eval finished mds gather on " << *in << endl; - - switch (in->filelock.get_state()) { - // to lock - case LOCK_GLOCKR: - case LOCK_GLOCKM: - case LOCK_GLOCKL: - if (issued == 0) { - in->filelock.set_state(LOCK_LOCK); - - // waiters - in->filelock.get_read(); - //in->filelock.get_write(); - in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE); - in->filelock.put_read(); - //in->filelock.put_write(); - } - break; - - // to mixed - case LOCK_GMIXEDR: - if ((issued & ~(CAP_FILE_RD)) == 0) { - in->filelock.set_state(LOCK_MIXED); - in->finish_waiting(CINODE_WAIT_FILESTABLE); - } - break; - - case LOCK_GMIXEDL: - if ((issued & ~(CAP_FILE_WR)) == 0) { - in->filelock.set_state(LOCK_MIXED); - - if (in->is_replicated()) { - // data - bufferlist softdata; - in->encode_file_state(softdata); - - // bcast to replicas - send_lock_message(in, LOCK_AC_MIXED, LOCK_OTYPE_IFILE, softdata); - } - - in->finish_waiting(CINODE_WAIT_FILESTABLE); - } - break; - - // to loner - case LOCK_GLONERR: - if (issued == 0) { - in->filelock.set_state(LOCK_LONER); - in->finish_waiting(CINODE_WAIT_FILESTABLE); - } - break; - - case LOCK_GLONERM: - if ((issued & ~CAP_FILE_WR) == 0) { - in->filelock.set_state(LOCK_LONER); - in->finish_waiting(CINODE_WAIT_FILESTABLE); - } - break; - - // to sync - case LOCK_GSYNCL: - case LOCK_GSYNCM: - if ((issued & ~(CAP_FILE_RD)) == 0) { - in->filelock.set_state(LOCK_SYNC); - - { // bcast data to replicas - bufferlist softdata; - in->encode_file_state(softdata); - - send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IFILE, softdata); - } - - // waiters - in->filelock.get_read(); - in->finish_waiting(CINODE_WAIT_FILER|CINODE_WAIT_FILESTABLE); - in->filelock.put_read(); - } - break; - - default: - assert(0); - } - - issue_caps(in); - } - - // [replica] finished caps gather? 
- if (!in->is_auth() && - !in->filelock.is_stable()) { - switch (in->filelock.get_state()) { - case LOCK_GMIXEDR: - if ((issued & ~(CAP_FILE_RD)) == 0) { - in->filelock.set_state(LOCK_MIXED); - - // ack - MLock *reply = new MLock(LOCK_AC_MIXEDACK, mds->get_nodeid()); - reply->set_ino(in->ino(), LOCK_OTYPE_IFILE); - mds->send_message_mds(reply, in->authority(), MDS_PORT_LOCKER); - } - break; - - case LOCK_GLOCKR: - if (issued == 0) { - in->filelock.set_state(LOCK_LOCK); - - // ack - MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid()); - reply->set_ino(in->ino(), LOCK_OTYPE_IFILE); - mds->send_message_mds(reply, in->authority(), MDS_PORT_LOCKER); - } - break; - - default: - assert(0); - } - } - - // !stable -> do nothing. - if (!in->filelock.is_stable()) return; - - - // stable. - assert(in->filelock.is_stable()); - - if (in->is_auth()) { - // [auth] - int wanted = in->get_caps_wanted(); - bool loner = (in->client_caps.size() == 1) && in->mds_caps_wanted.empty(); - dout(7) << "inode_file_eval wanted=" << cap_string(wanted) - << " filelock=" << in->filelock - << " loner=" << loner - << endl; - - // * -> loner? - if (in->filelock.get_nread() == 0 && - !in->is_filelock_write_wanted() && - (wanted & CAP_FILE_WR) && - loner && - in->filelock.get_state() != LOCK_LONER) { - dout(7) << "inode_file_eval stable, bump to loner " << *in << ", filelock=" << in->filelock << endl; - inode_file_loner(in); - } - - // * -> mixed? - else if (in->filelock.get_nread() == 0 && - !in->is_filelock_write_wanted() && - (wanted & CAP_FILE_RD) && - (wanted & CAP_FILE_WR) && - !(loner && in->filelock.get_state() == LOCK_LONER) && - in->filelock.get_state() != LOCK_MIXED) { - dout(7) << "inode_file_eval stable, bump to mixed " << *in << ", filelock=" << in->filelock << endl; - inode_file_mixed(in); - } - - // * -> sync? 
- else if (!in->is_filelock_write_wanted() && - !(wanted & CAP_FILE_WR) && - ((wanted & CAP_FILE_RD) || - in->is_replicated() || - (!loner && in->filelock.get_state() == LOCK_LONER)) && - in->filelock.get_state() != LOCK_SYNC) { - dout(7) << "inode_file_eval stable, bump to sync " << *in << ", filelock=" << in->filelock << endl; - inode_file_sync(in); - } - - // * -> lock? (if not replicated or open) - else if (!in->is_replicated() && - wanted == 0 && - in->filelock.get_state() != LOCK_LOCK) { - inode_file_lock(in); - } - - } else { - // replica - // recall? check wiaters? XXX - } -} - - -// mid - -bool Locker::inode_file_sync(CInode *in) -{ - dout(7) << "inode_file_sync " << *in << " filelock=" << in->filelock << endl; - - assert(in->is_auth()); - - // check state - if (in->filelock.get_state() == LOCK_SYNC || - in->filelock.get_state() == LOCK_GSYNCL || - in->filelock.get_state() == LOCK_GSYNCM) - return true; - - assert(in->filelock.is_stable()); - - int issued = in->get_caps_issued(); - - assert((in->get_caps_wanted() & CAP_FILE_WR) == 0); - - if (in->filelock.get_state() == LOCK_LOCK) { - if (in->is_replicated()) { - // soft data - bufferlist softdata; - in->encode_file_state(softdata); - - // bcast to replicas - send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IFILE, softdata); - } - - // change lock - in->filelock.set_state(LOCK_SYNC); - - // reissue caps - issue_caps(in); - return true; - } - - else if (in->filelock.get_state() == LOCK_MIXED) { - // writers? - if (issued & CAP_FILE_WR) { - // gather client write caps - in->filelock.set_state(LOCK_GSYNCM); - issue_caps(in); - } else { - // no writers, go straight to sync - - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IFILE); - } - - // change lock - in->filelock.set_state(LOCK_SYNC); - } - return false; - } - - else if (in->filelock.get_state() == LOCK_LONER) { - // writers? 
- if (issued & CAP_FILE_WR) { - // gather client write caps - in->filelock.set_state(LOCK_GSYNCL); - issue_caps(in); - } else { - // no writers, go straight to sync - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IFILE); - } - - // change lock - in->filelock.set_state(LOCK_SYNC); - } - return false; - } - else - assert(0); // wtf. - - return false; -} - - - -void Locker::inode_file_lock(CInode *in) -{ - dout(7) << "inode_file_lock " << *in << " filelock=" << in->filelock << endl; - - assert(in->is_auth()); - - // check state - if (in->filelock.get_state() == LOCK_LOCK || - in->filelock.get_state() == LOCK_GLOCKR || - in->filelock.get_state() == LOCK_GLOCKM || - in->filelock.get_state() == LOCK_GLOCKL) - return; // lock or locking - - assert(in->filelock.is_stable()); - - int issued = in->get_caps_issued(); - - if (in->filelock.get_state() == LOCK_SYNC) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(in, LOCK_AC_LOCK, LOCK_OTYPE_IFILE); - in->filelock.init_gather(in->get_replicas()); - - // change lock - in->filelock.set_state(LOCK_GLOCKR); - - // call back caps - if (issued) - issue_caps(in); - } else { - if (issued) { - // call back caps - in->filelock.set_state(LOCK_GLOCKR); - issue_caps(in); - } else { - in->filelock.set_state(LOCK_LOCK); - } - } - } - - else if (in->filelock.get_state() == LOCK_MIXED) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(in, LOCK_AC_LOCK, LOCK_OTYPE_IFILE); - in->filelock.init_gather(in->get_replicas()); - - // change lock - in->filelock.set_state(LOCK_GLOCKM); - - // call back caps - issue_caps(in); - } else { - //assert(issued); // ??? 
-sage 2/19/06 - if (issued) { - // change lock - in->filelock.set_state(LOCK_GLOCKM); - - // call back caps - issue_caps(in); - } else { - in->filelock.set_state(LOCK_LOCK); - } - } - - } - else if (in->filelock.get_state() == LOCK_LONER) { - if (issued & CAP_FILE_WR) { - // change lock - in->filelock.set_state(LOCK_GLOCKL); - - // call back caps - issue_caps(in); - } else { - in->filelock.set_state(LOCK_LOCK); - } - } - else - assert(0); // wtf. -} - - -void Locker::inode_file_mixed(CInode *in) -{ - dout(7) << "inode_file_mixed " << *in << " filelock=" << in->filelock << endl; - - assert(in->is_auth()); - - // check state - if (in->filelock.get_state() == LOCK_GMIXEDR || - in->filelock.get_state() == LOCK_GMIXEDL) - return; // mixed or mixing - - assert(in->filelock.is_stable()); - - int issued = in->get_caps_issued(); - - if (in->filelock.get_state() == LOCK_SYNC) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(in, LOCK_AC_MIXED, LOCK_OTYPE_IFILE); - in->filelock.init_gather(in->get_replicas()); - - in->filelock.set_state(LOCK_GMIXEDR); - issue_caps(in); - } else { - if (issued) { - in->filelock.set_state(LOCK_GMIXEDR); - issue_caps(in); - } else { - in->filelock.set_state(LOCK_MIXED); - } - } - } - - else if (in->filelock.get_state() == LOCK_LOCK) { - if (in->is_replicated()) { - // data - bufferlist softdata; - in->encode_file_state(softdata); - - // bcast to replicas - send_lock_message(in, LOCK_AC_MIXED, LOCK_OTYPE_IFILE, softdata); - } - - // change lock - in->filelock.set_state(LOCK_MIXED); - issue_caps(in); - } - - else if (in->filelock.get_state() == LOCK_LONER) { - if (issued & CAP_FILE_WRBUFFER) { - // gather up WRBUFFER caps - in->filelock.set_state(LOCK_GMIXEDL); - issue_caps(in); - } - else if (in->is_replicated()) { - // bcast to replicas - send_lock_message(in, LOCK_AC_MIXED, LOCK_OTYPE_IFILE); - in->filelock.set_state(LOCK_MIXED); - issue_caps(in); - } else { - in->filelock.set_state(LOCK_MIXED); - issue_caps(in); - } - } 
- - else - assert(0); // wtf. -} - - -void Locker::inode_file_loner(CInode *in) -{ - dout(7) << "inode_file_loner " << *in << " filelock=" << in->filelock << endl; - - assert(in->is_auth()); - - // check state - if (in->filelock.get_state() == LOCK_LONER || - in->filelock.get_state() == LOCK_GLONERR || - in->filelock.get_state() == LOCK_GLONERM) - return; - - assert(in->filelock.is_stable()); - assert((in->client_caps.size() == 1) && in->mds_caps_wanted.empty()); - - if (in->filelock.get_state() == LOCK_SYNC) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(in, LOCK_AC_LOCK, LOCK_OTYPE_IFILE); - in->filelock.init_gather(in->get_replicas()); - - // change lock - in->filelock.set_state(LOCK_GLONERR); - } else { - // only one guy with file open, who gets it all, so - in->filelock.set_state(LOCK_LONER); - issue_caps(in); - } - } - - else if (in->filelock.get_state() == LOCK_LOCK) { - // change lock. ignore replicas; they don't know about LONER. - in->filelock.set_state(LOCK_LONER); - issue_caps(in); - } - - else if (in->filelock.get_state() == LOCK_MIXED) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(in, LOCK_AC_LOCK, LOCK_OTYPE_IFILE); - in->filelock.init_gather(in->get_replicas()); - - // change lock - in->filelock.set_state(LOCK_GLONERM); - } else { - in->filelock.set_state(LOCK_LONER); - issue_caps(in); - } - } - - else - assert(0); -} - -// messenger - -void Locker::handle_lock_inode_file(MLock *m) -{ - assert(m->get_otype() == LOCK_OTYPE_IFILE); - - if (mds->logger) mds->logger->inc("lif"); - - CInode *in = mdcache->get_inode(m->get_ino()); - int from = m->get_asker(); - - if (LOCK_AC_FOR_AUTH(m->get_action())) { - // auth - assert(in); - assert(in->is_auth() || in->is_proxy()); - dout(7) << "handle_lock_inode_file " << *in << " hardlock=" << in->hardlock << endl; - - if (in->is_proxy()) { - // fw - int newauth = in->authority(); - assert(newauth >= 0); - if (from == newauth) { - dout(7) << "handle_lock " << 
m->get_ino() << " from " << from << ": proxy, but from new auth, dropping" << endl; - delete m; - } else { - dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, fw to " << newauth << endl; - mds->send_message_mds(m, newauth, MDS_PORT_LOCKER); - } - return; - } - } else { - // replica - if (!in) { - // drop it. don't nak. - dout(7) << "handle_lock " << m->get_ino() << ": don't have it anymore" << endl; - delete m; - return; - } - - assert(!in->is_auth()); - } - - dout(7) << "handle_lock_inode_file a=" << m->get_action() << " from " << from << " " << *in << " filelock=" << in->filelock << endl; - - CLock *lock = &in->filelock; - int issued = in->get_caps_issued(); - - switch (m->get_action()) { - // -- replica -- - case LOCK_AC_SYNC: - assert(lock->get_state() == LOCK_LOCK || - lock->get_state() == LOCK_MIXED); - - { // assim data - int off = 0; - in->decode_file_state(m->get_data(), off); - } - - // update lock - lock->set_state(LOCK_SYNC); - - // no need to reply. - - // waiters - in->filelock.get_read(); - in->finish_waiting(CINODE_WAIT_FILER|CINODE_WAIT_FILESTABLE); - in->filelock.put_read(); - inode_file_eval(in); - break; - - case LOCK_AC_LOCK: - assert(lock->get_state() == LOCK_SYNC || - lock->get_state() == LOCK_MIXED); - - // call back caps? - if (issued & CAP_FILE_RD) { - dout(7) << "handle_lock_inode_file client readers, gathering caps on " << *in << endl; - issue_caps(in); - } - if (lock->get_nread() > 0) { - dout(7) << "handle_lock_inode_file readers, waiting before ack on " << *in << endl; - in->add_waiter(CINODE_WAIT_FILENORD, - new C_MDS_RetryMessage(mds,m)); - lock->set_state(LOCK_GLOCKR); - assert(0);// i am broken.. why retry message when state captures all the info i need? - return; - } - if (issued & CAP_FILE_RD) { - lock->set_state(LOCK_GLOCKR); - break; - } - - // nothing to wait for, lock and ack. 
- { - lock->set_state(LOCK_LOCK); - - MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid()); - reply->set_ino(in->ino(), LOCK_OTYPE_IFILE); - mds->send_message_mds(reply, from, MDS_PORT_LOCKER); - } - break; - - case LOCK_AC_MIXED: - assert(lock->get_state() == LOCK_SYNC || - lock->get_state() == LOCK_LOCK); - - if (lock->get_state() == LOCK_SYNC) { - // MIXED - if (issued & CAP_FILE_RD) { - // call back client caps - lock->set_state(LOCK_GMIXEDR); - issue_caps(in); - break; - } else { - // no clients, go straight to mixed - lock->set_state(LOCK_MIXED); - - // ack - MLock *reply = new MLock(LOCK_AC_MIXEDACK, mds->get_nodeid()); - reply->set_ino(in->ino(), LOCK_OTYPE_IFILE); - mds->send_message_mds(reply, from, MDS_PORT_LOCKER); - } - } else { - // LOCK - lock->set_state(LOCK_MIXED); - - // no ack needed. - } - - issue_caps(in); - - // waiters - //in->filelock.get_write(); - in->finish_waiting(CINODE_WAIT_FILEW|CINODE_WAIT_FILESTABLE); - //in->filelock.put_write(); - inode_file_eval(in); - break; - - - - - // -- auth -- - case LOCK_AC_LOCKACK: - assert(lock->state == LOCK_GLOCKR || - lock->state == LOCK_GLOCKM || - lock->state == LOCK_GLONERM || - lock->state == LOCK_GLONERR); - assert(lock->gather_set.count(from)); - lock->gather_set.erase(from); - - if (lock->gather_set.size()) { - dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; - } else { - dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl; - inode_file_eval(in); - } - break; - - case LOCK_AC_SYNCACK: - assert(lock->state == LOCK_GSYNCM); - assert(lock->gather_set.count(from)); - lock->gather_set.erase(from); - - /* not used currently - { - // merge data (keep largest size, mtime, etc.) 
- int off = 0; - in->decode_merge_file_state(m->get_data(), off); - } - */ - - if (lock->gather_set.size()) { - dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; - } else { - dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl; - inode_file_eval(in); - } - break; - - case LOCK_AC_MIXEDACK: - assert(lock->state == LOCK_GMIXEDR); - assert(lock->gather_set.count(from)); - lock->gather_set.erase(from); - - if (lock->gather_set.size()) { - dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; - } else { - dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl; - inode_file_eval(in); - } - break; - - - default: - assert(0); - } - - delete m; -} - - - - - - - - - - - - - - -void Locker::handle_lock_dir(MLock *m) -{ - -} - - - -// DENTRY - -bool Locker::dentry_xlock_start(CDentry *dn, Message *m, CInode *ref) -{ - dout(7) << "dentry_xlock_start on " << *dn << endl; - - // locked? - if (dn->lockstate == DN_LOCK_XLOCK) { - if (dn->xlockedby == m) return true; // locked by me! - - // not by me, wait - dout(7) << "dentry " << *dn << " xlock by someone else" << endl; - dn->dir->add_waiter(CDIR_WAIT_DNREAD, dn->name, - new C_MDS_RetryRequest(mds,m,ref)); - return false; - } - - // prelock? - if (dn->lockstate == DN_LOCK_PREXLOCK) { - if (dn->xlockedby == m) { - dout(7) << "dentry " << *dn << " prexlock by me" << endl; - dn->dir->add_waiter(CDIR_WAIT_DNLOCK, dn->name, - new C_MDS_RetryRequest(mds,m,ref)); - } else { - dout(7) << "dentry " << *dn << " prexlock by someone else" << endl; - dn->dir->add_waiter(CDIR_WAIT_DNREAD, dn->name, - new C_MDS_RetryRequest(mds,m,ref)); - } - return false; - } - - - // lockable! - assert(dn->lockstate == DN_LOCK_SYNC || - dn->lockstate == DN_LOCK_UNPINNING); - - // dir auth pinnable? 
- if (!dn->dir->can_auth_pin()) { - dout(7) << "dentry " << *dn << " dir not pinnable, waiting" << endl; - dn->dir->add_waiter(CDIR_WAIT_AUTHPINNABLE, - new C_MDS_RetryRequest(mds,m,ref)); - return false; - } - - // is dentry path pinned? - if (dn->is_pinned()) { - dout(7) << "dentry " << *dn << " pinned, waiting" << endl; - dn->lockstate = DN_LOCK_UNPINNING; - dn->dir->add_waiter(CDIR_WAIT_DNUNPINNED, - dn->name, - new C_MDS_RetryRequest(mds,m,ref)); - return false; - } - - // pin path up to dentry! (if success, point of no return) - CDentry *pdn = dn->dir->inode->get_parent_dn(); - if (pdn) { - if (mdcache->active_requests[m].traces.count(pdn)) { - dout(7) << "already path pinned parent dentry " << *pdn << endl; - } else { - dout(7) << "pinning parent dentry " << *pdn << endl; - vector trace; - mdcache->make_trace(trace, pdn->inode); - assert(trace.size()); - - if (!mdcache->path_pin(trace, m, new C_MDS_RetryRequest(mds, m, ref))) return false; - - mdcache->active_requests[m].traces[trace[trace.size()-1]] = trace; - } - } - - // pin dir! - dn->dir->auth_pin(); - - // mine! - dn->xlockedby = m; - - if (dn->is_replicated()) { - dn->lockstate = DN_LOCK_PREXLOCK; - - // xlock with whom? 
- set who; - for (map::iterator p = dn->replicas_begin(); - p != dn->replicas_end(); - ++p) - who.insert(p->first); - dn->gather_set = who; - - // make path - string path; - dn->make_path(path); - dout(10) << "path is " << path << " for " << *dn << endl; - - for (set::iterator it = who.begin(); - it != who.end(); - it++) { - MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid()); - m->set_dn(dn->dir->ino(), dn->name); - m->set_path(path); - mds->send_message_mds(m, *it, MDS_PORT_LOCKER); - } - - // wait - dout(7) << "dentry_xlock_start locking, waiting for replicas " << endl; - dn->dir->add_waiter(CDIR_WAIT_DNLOCK, dn->name, - new C_MDS_RetryRequest(mds, m, ref)); - return false; - } else { - dn->lockstate = DN_LOCK_XLOCK; - mdcache->active_requests[dn->xlockedby].xlocks.insert(dn); - return true; - } -} - -void Locker::dentry_xlock_finish(CDentry *dn, bool quiet) -{ - dout(7) << "dentry_xlock_finish on " << *dn << endl; - - assert(dn->xlockedby); - if (dn->xlockedby == DN_XLOCK_FOREIGN) { - dout(7) << "this was a foreign xlock" << endl; - } else { - // remove from request record - assert(mdcache->active_requests[dn->xlockedby].xlocks.count(dn) == 1); - mdcache->active_requests[dn->xlockedby].xlocks.erase(dn); - } - - dn->xlockedby = 0; - dn->lockstate = DN_LOCK_SYNC; - - // unpin parent dir? - // -> no? because we might have xlocked 2 things in this dir. - // instead, we let request_finish clean up the mess. - - // tell replicas? - if (!quiet) { - // tell even if dn is null. 
- if (dn->is_replicated()) { - send_lock_message(dn, LOCK_AC_SYNC); - } - } - - // unpin dir - dn->dir->auth_unpin(); -} - -/* - * onfinish->finish() will be called with - * 0 on successful xlock, - * -1 on failure - */ - -class C_MDC_XlockRequest : public Context { - Locker *mdc; - CDir *dir; - string dname; - Message *req; - Context *finisher; -public: - C_MDC_XlockRequest(Locker *mdc, - CDir *dir, string& dname, - Message *req, - Context *finisher) { - this->mdc = mdc; - this->dir = dir; - this->dname = dname; - this->req = req; - this->finisher = finisher; - } - - void finish(int r) { - mdc->dentry_xlock_request_finish(r, dir, dname, req, finisher); - } -}; - -void Locker::dentry_xlock_request_finish(int r, - CDir *dir, string& dname, - Message *req, - Context *finisher) -{ - dout(10) << "dentry_xlock_request_finish r = " << r << endl; - if (r == 1) { // 1 for xlock request success - CDentry *dn = dir->lookup(dname); - if (dn && dn->xlockedby == 0) { - // success - dn->xlockedby = req; // our request was the winner - dout(10) << "xlock request success, now xlocked by req " << req << " dn " << *dn << endl; - - // remember! - mdcache->active_requests[req].foreign_xlocks.insert(dn); - } - } - - // retry request (or whatever) - finisher->finish(0); - delete finisher; -} - -void Locker::dentry_xlock_request(CDir *dir, string& dname, bool create, - Message *req, Context *onfinish) -{ - dout(10) << "dentry_xlock_request on dn " << dname << " create=" << create << " in " << *dir << endl; - // send request - int dauth = dir->dentry_authority(dname); - MLock *m = new MLock(create ? 
LOCK_AC_REQXLOCKC:LOCK_AC_REQXLOCK, mds->get_nodeid()); - m->set_dn(dir->ino(), dname); - mds->send_message_mds(m, dauth, MDS_PORT_LOCKER); - - // add waiter - dir->add_waiter(CDIR_WAIT_DNREQXLOCK, dname, - new C_MDC_XlockRequest(this, - dir, dname, req, - onfinish)); -} - - - - -void Locker::handle_lock_dn(MLock *m) -{ - assert(m->get_otype() == LOCK_OTYPE_DN); - - CInode *diri = mdcache->get_inode(m->get_ino()); // may be null - CDir *dir = 0; - if (diri) dir = diri->dir; // may be null - string dname = m->get_dn(); - int from = m->get_asker(); - CDentry *dn = 0; - - if (LOCK_AC_FOR_AUTH(m->get_action())) { - // auth - - // normally we have it always - if (diri && dir) { - int dauth = dir->dentry_authority(dname); - assert(dauth == mds->get_nodeid() || dir->is_proxy() || // mine or proxy, - m->get_action() == LOCK_AC_REQXLOCKACK || // or we did a REQXLOCK and this is our ack/nak - m->get_action() == LOCK_AC_REQXLOCKNAK); - - if (dir->is_proxy()) { - - assert(dauth >= 0); - - if (dauth == m->get_asker() && - (m->get_action() == LOCK_AC_REQXLOCK || - m->get_action() == LOCK_AC_REQXLOCKC)) { - dout(7) << "handle_lock_dn got reqxlock from " << dauth << " and they are auth.. dropping on floor (their import will have woken them up)" << endl; - if (mdcache->active_requests.count(m)) - mdcache->request_finish(m); - else - delete m; - return; - } - - dout(7) << "handle_lock_dn " << m << " " << m->get_ino() << " dname " << dname << " from " << from << ": proxy, fw to " << dauth << endl; - - // forward - if (mdcache->active_requests.count(m)) { - // xlock requests are requests, use request_* functions! 
- assert(m->get_action() == LOCK_AC_REQXLOCK || - m->get_action() == LOCK_AC_REQXLOCKC); - // forward as a request - mdcache->request_forward(m, dauth, MDS_PORT_LOCKER); - } else { - // not an xlock req, or it is and we just didn't register the request yet - // forward normally - mds->send_message_mds(m, dauth, MDS_PORT_LOCKER); - } - return; - } - - dn = dir->lookup(dname); - } - - // except with.. an xlock request? - if (!dn) { - assert(dir); // we should still have the dir, though! the requester has the dir open. - switch (m->get_action()) { - - case LOCK_AC_LOCK: - dout(7) << "handle_lock_dn xlock on " << dname << ", adding (null)" << endl; - dn = dir->add_dentry(dname); - break; - - case LOCK_AC_REQXLOCK: - // send nak - if (dir->state_test(CDIR_STATE_DELETED)) { - dout(7) << "handle_lock_dn reqxlock on deleted dir " << *dir << ", nak" << endl; - } else { - dout(7) << "handle_lock_dn reqxlock on " << dname << " in " << *dir << " dne, nak" << endl; - } - { - MLock *reply = new MLock(LOCK_AC_REQXLOCKNAK, mds->get_nodeid()); - reply->set_dn(dir->ino(), dname); - reply->set_path(m->get_path()); - mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER); - } - - // finish request (if we got that far) - if (mdcache->active_requests.count(m)) - mdcache->request_finish(m); - - delete m; - return; - - case LOCK_AC_REQXLOCKC: - dout(7) << "handle_lock_dn reqxlockc on " << dname << " in " << *dir << " dne (yet!)" << endl; - break; - - default: - assert(0); - } - } - } else { - // replica - if (dir) dn = dir->lookup(dname); - if (!dn) { - dout(7) << "handle_lock_dn " << m << " don't have " << m->get_ino() << " dname " << dname << endl; - - if (m->get_action() == LOCK_AC_REQXLOCKACK || - m->get_action() == LOCK_AC_REQXLOCKNAK) { - dout(7) << "handle_lock_dn got reqxlockack/nak, but don't have dn " << m->get_path() << ", discovering" << endl; - //assert(0); // how can this happen? tell me now! 
- - vector trace; - filepath path = m->get_path(); - int r = mdcache->path_traverse(path, trace, true, - m, new C_MDS_RetryMessage(mds,m), - MDS_TRAVERSE_DISCOVER); - assert(r>0); - return; - } - - if (m->get_action() == LOCK_AC_LOCK) { - if (0) { // not anymore - dout(7) << "handle_lock_dn don't have " << m->get_path() << ", discovering" << endl; - - vector trace; - filepath path = m->get_path(); - int r = mdcache->path_traverse(path, trace, true, - m, new C_MDS_RetryMessage(mds,m), - MDS_TRAVERSE_DISCOVER); - assert(r>0); - } - if (1) { - // NAK - MLock *reply = new MLock(LOCK_AC_LOCKNAK, mds->get_nodeid()); - reply->set_dn(m->get_ino(), dname); - mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER); - } - } else { - dout(7) << "safely ignoring." << endl; - delete m; - } - return; - } - - assert(dn); - } - - if (dn) { - dout(7) << "handle_lock_dn a=" << m->get_action() << " from " << from << " " << *dn << endl; - } else { - dout(7) << "handle_lock_dn a=" << m->get_action() << " from " << from << " " << dname << " in " << *dir << endl; - } - - switch (m->get_action()) { - // -- replica -- - case LOCK_AC_LOCK: - assert(dn->lockstate == DN_LOCK_SYNC || - dn->lockstate == DN_LOCK_UNPINNING || - dn->lockstate == DN_LOCK_XLOCK); // <-- bc the handle_lock_dn did the discover! - - if (dn->is_pinned()) { - dn->lockstate = DN_LOCK_UNPINNING; - - // wait - dout(7) << "dn pinned, waiting " << *dn << endl; - dn->dir->add_waiter(CDIR_WAIT_DNUNPINNED, - dn->name, - new C_MDS_RetryMessage(mds, m)); - return; - } else { - dn->lockstate = DN_LOCK_XLOCK; - dn->xlockedby = 0; - - // ack now - MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid()); - reply->set_dn(diri->ino(), dname); - mds->send_message_mds(reply, from, MDS_PORT_LOCKER); - } - - // wake up waiters - dir->finish_waiting(CDIR_WAIT_DNLOCK, dname); // ? will this happen on replica ? 
- break; - - case LOCK_AC_SYNC: - assert(dn->lockstate == DN_LOCK_XLOCK); - dn->lockstate = DN_LOCK_SYNC; - dn->xlockedby = 0; - - // null? hose it. - if (dn->is_null()) { - dout(7) << "hosing null (and now sync) dentry " << *dn << endl; - dir->remove_dentry(dn); - } - - // wake up waiters - dir->finish_waiting(CDIR_WAIT_DNREAD, dname); // will this happen either? YES: if a rename lock backs out - break; - - case LOCK_AC_REQXLOCKACK: - case LOCK_AC_REQXLOCKNAK: - { - dout(10) << "handle_lock_dn got ack/nak on a reqxlock for " << *dn << endl; - list finished; - dir->take_waiting(CDIR_WAIT_DNREQXLOCK, m->get_dn(), finished, 1); // TAKE ONE ONLY! - finish_contexts(finished, - (m->get_action() == LOCK_AC_REQXLOCKACK) ? 1:-1); - } - break; - - - // -- auth -- - case LOCK_AC_LOCKACK: - case LOCK_AC_LOCKNAK: - assert(dn->gather_set.count(from) == 1); - dn->gather_set.erase(from); - if (dn->gather_set.size() == 0) { - dout(7) << "handle_lock_dn finish gather, now xlock on " << *dn << endl; - dn->lockstate = DN_LOCK_XLOCK; - mdcache->active_requests[dn->xlockedby].xlocks.insert(dn); - dir->finish_waiting(CDIR_WAIT_DNLOCK, dname); - } - break; - - - case LOCK_AC_REQXLOCKC: - // make sure it's a _file_, if it exists. - if (dn && dn->inode && dn->inode->is_dir()) { - dout(7) << "handle_lock_dn failing, reqxlockc on dir " << *dn->inode << endl; - - // nak - string path; - dn->make_path(path); - - MLock *reply = new MLock(LOCK_AC_REQXLOCKNAK, mds->get_nodeid()); - reply->set_dn(dir->ino(), dname); - reply->set_path(path); - mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER); - - // done - if (mdcache->active_requests.count(m)) - mdcache->request_finish(m); - else - delete m; - return; - } - - case LOCK_AC_REQXLOCK: - if (dn) { - dout(7) << "handle_lock_dn reqxlock on " << *dn << endl; - } else { - dout(7) << "handle_lock_dn reqxlock on " << dname << " in " << *dir << endl; - } - - - // start request? 
- if (!mdcache->active_requests.count(m)) { - vector trace; - if (!mdcache->request_start(m, dir->inode, trace)) - return; // waiting for pin - } - - // try to xlock! - if (!dn) { - assert(m->get_action() == LOCK_AC_REQXLOCKC); - dn = dir->add_dentry(dname); - } - - if (dn->xlockedby != m) { - if (!dentry_xlock_start(dn, m, dir->inode)) { - // hose null dn if we're waiting on something - if (dn->is_clean() && dn->is_null() && dn->is_sync()) dir->remove_dentry(dn); - return; // waiting for xlock - } - } else { - // successfully xlocked! on behalf of requestor. - string path; - dn->make_path(path); - - dout(7) << "handle_lock_dn reqxlock success for " << m->get_asker() << " on " << *dn << ", acking" << endl; - - // ACK xlock request - MLock *reply = new MLock(LOCK_AC_REQXLOCKACK, mds->get_nodeid()); - reply->set_dn(dir->ino(), dname); - reply->set_path(path); - mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER); - - // note: keep request around in memory (to hold the xlock/pins on behalf of requester) - return; - } - break; - - case LOCK_AC_UNXLOCK: - dout(7) << "handle_lock_dn unxlock on " << *dn << endl; - { - string dname = dn->name; - Message *m = dn->xlockedby; - - // finish request - mdcache->request_finish(m); // this will drop the locks (and unpin paths!) - return; - } - break; - - default: - assert(0); - } - - delete m; -} - - - - - - - diff --git a/branches/marnberg/quota/mds/Locker.h b/branches/marnberg/quota/mds/Locker.h deleted file mode 100644 index d8dcb2c541a37..0000000000000 --- a/branches/marnberg/quota/mds/Locker.h +++ /dev/null @@ -1,127 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_LOCKER_H -#define __MDS_LOCKER_H - -#include "include/types.h" - -#include -#include -#include -using std::map; -using std::list; -using std::set; - -class MDS; -class CDir; -class CInode; -class CDentry; - -class Message; - -class MDiscover; -class MDiscoverReply; -class MCacheExpire; -class MDirUpdate; -class MDentryUnlink; -class MLock; - -class MClientRequest; - - -class Anchor; -class Capability; - - -class Locker { -private: - MDS *mds; - MDCache *mdcache; - - public: - Locker(MDS *m, MDCache *c) : mds(m), mdcache(c) {} - - void dispatch(Message *m); - - void send_lock_message(CInode *in, int msg, int type); - void send_lock_message(CInode *in, int msg, int type, bufferlist& data); - void send_lock_message(CDentry *dn, int msg); - - // -- locks -- - // high level interface - public: - bool inode_hard_read_try(CInode *in, Context *con); - bool inode_hard_read_start(CInode *in, MClientRequest *m); - void inode_hard_read_finish(CInode *in); - bool inode_hard_write_start(CInode *in, MClientRequest *m); - void inode_hard_write_finish(CInode *in); - bool inode_file_read_start(CInode *in, MClientRequest *m); - void inode_file_read_finish(CInode *in); - bool inode_file_write_start(CInode *in, MClientRequest *m); - void inode_file_write_finish(CInode *in); - - void inode_hard_eval(CInode *in); - void inode_file_eval(CInode *in); - - protected: - void inode_hard_mode(CInode *in, int mode); - void inode_file_mode(CInode *in, int mode); - - // low level triggers - void inode_hard_sync(CInode *in); - void inode_hard_lock(CInode *in); - bool inode_file_sync(CInode *in); - void inode_file_lock(CInode *in); - void inode_file_mixed(CInode *in); - void inode_file_loner(CInode *in); - - // messengers - void handle_lock(MLock *m); - void handle_lock_inode_hard(MLock *m); - void handle_lock_inode_file(MLock *m); - - // -- file i/o -- - public: - version_t issue_file_data_version(CInode *in); - Capability* issue_new_caps(CInode *in, int mode, 
MClientRequest *req); - bool issue_caps(CInode *in); - - protected: - void handle_client_file_caps(class MClientFileCaps *m); - - void request_inode_file_caps(CInode *in); - void handle_inode_file_caps(class MInodeFileCaps *m); - - - // dirs - void handle_lock_dir(MLock *m); - - // dentry locks - public: - bool dentry_xlock_start(CDentry *dn, - Message *m, CInode *ref); - void dentry_xlock_finish(CDentry *dn, bool quiet=false); - void handle_lock_dn(MLock *m); - void dentry_xlock_request(CDir *dir, string& dname, bool create, - Message *req, Context *onfinish); - void dentry_xlock_request_finish(int r, - CDir *dir, string& dname, - Message *req, - Context *finisher); - - -}; - - -#endif diff --git a/branches/marnberg/quota/mds/LogEvent.cc b/branches/marnberg/quota/mds/LogEvent.cc deleted file mode 100644 index 4a83902c5c6c4..0000000000000 --- a/branches/marnberg/quota/mds/LogEvent.cc +++ /dev/null @@ -1,67 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "LogEvent.h" - -#include "MDS.h" - -// events i know of -#include "events/EString.h" -#include "events/EImportMap.h" -#include "events/EMetaBlob.h" -#include "events/EUpdate.h" -#include "events/EUnlink.h" -#include "events/EAlloc.h" -#include "events/EPurgeFinish.h" -#include "events/EExportStart.h" -#include "events/EExportFinish.h" -#include "events/EImportStart.h" -#include "events/EImportFinish.h" - -LogEvent *LogEvent::decode(bufferlist& bl) -{ - // parse type, length - int off = 0; - int type; - bl.copy(off, sizeof(type), (char*)&type); - off += sizeof(type); - - int length = bl.length() - off; - dout(15) << "decode_log_event type " << type << ", size " << length << endl; - - assert(type > 0); - - // create event - LogEvent *le; - switch (type) { - case EVENT_STRING: le = new EString(); break; - case EVENT_IMPORTMAP: le = new EImportMap; break; - case EVENT_UPDATE: le = new EUpdate; break; - case EVENT_UNLINK: le = new EUnlink(); break; - case EVENT_PURGEFINISH: le = new EPurgeFinish(); break; - case EVENT_ALLOC: le = new EAlloc(); break; - case EVENT_EXPORTSTART: le = new EExportStart; break; - case EVENT_EXPORTFINISH: le = new EExportFinish; break; - case EVENT_IMPORTSTART: le = new EImportStart; break; - case EVENT_IMPORTFINISH: le = new EImportFinish; break; - default: - dout(1) << "uh oh, unknown log event type " << type << endl; - assert(0); - } - - // decode - le->decode_payload(bl, off); - - return le; -} - diff --git a/branches/marnberg/quota/mds/LogEvent.h b/branches/marnberg/quota/mds/LogEvent.h deleted file mode 100644 index 6895ed54074d4..0000000000000 --- a/branches/marnberg/quota/mds/LogEvent.h +++ /dev/null @@ -1,106 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License 
version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __LOGEVENT_H -#define __LOGEVENT_H - -#define EVENT_STRING 1 - -#define EVENT_INODEUPDATE 2 -#define EVENT_DIRUPDATE 3 - -#define EVENT_IMPORTMAP 4 -#define EVENT_UPDATE 5 - -#define EVENT_ALLOC 10 -#define EVENT_MKNOD 11 -#define EVENT_MKDIR 12 -#define EVENT_LINK 13 - -#define EVENT_UNLINK 20 -#define EVENT_RMDIR 21 -#define EVENT_PURGEFINISH 22 - -#define EVENT_EXPORTSTART 30 -#define EVENT_EXPORTFINISH 31 -#define EVENT_IMPORTSTART 32 -#define EVENT_IMPORTFINISH 33 - - - -#include -using namespace std; - -#include "include/buffer.h" -#include "include/Context.h" - -class MDS; - -// generic log event -class LogEvent { - private: - int _type; - off_t _start_off,_end_off; - friend class MDLog; - - public: - LogEvent(int t) : _type(t), _start_off(0), _end_off(0) { } - virtual ~LogEvent() { } - - int get_type() { return _type; } - off_t get_start_off() { return _start_off; } - off_t get_end_off() { return _end_off; } - - // encoding - virtual void encode_payload(bufferlist& bl) = 0; - virtual void decode_payload(bufferlist& bl, int& off) = 0; - static LogEvent *decode(bufferlist &bl); - - - virtual void print(ostream& out) { - out << "event(" << _type << ")"; - } - - - /*** live journal ***/ - - /* obsolete() - is this entry committed to primary store, such that - * we can expire it from the journal? - */ - virtual bool has_expired(MDS *m) { - return true; - } - - /* expire() - prod MDS into committing the relevant state so that this - * entry can be expired from the jorunal. - */ - virtual void expire(MDS *m, Context *c) { - assert(0); - c->finish(0); - delete c; - } - - - /*** recovery ***/ - /* replay() - replay given event. this is idempotent. 
- */ - virtual void replay(MDS *m) { assert(0); } - -}; - -inline ostream& operator<<(ostream& out, LogEvent& le) { - le.print(out); - return out; -} - -#endif diff --git a/branches/marnberg/quota/mds/MDBalancer.cc b/branches/marnberg/quota/mds/MDBalancer.cc deleted file mode 100644 index 57e79dcdf51fc..0000000000000 --- a/branches/marnberg/quota/mds/MDBalancer.cc +++ /dev/null @@ -1,878 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "mdstypes.h" - -#include "MDBalancer.h" -#include "MDS.h" -#include "MDSMap.h" -#include "CInode.h" -#include "CDir.h" -#include "MDCache.h" -#include "Migrator.h" - -#include "include/Context.h" -#include "msg/Messenger.h" -#include "messages/MHeartbeat.h" - -#include -#include -using namespace std; - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug_mds || l<=g_conf.debug_mds_balancer) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".bal " - -#define MIN_LOAD 50 // ?? -#define MIN_REEXPORT 5 // will automatically reexport -#define MIN_OFFLOAD 10 // point at which i stop trying, close enough - - - -int MDBalancer::proc_message(Message *m) -{ - switch (m->get_type()) { - - case MSG_MDS_HEARTBEAT: - handle_heartbeat((MHeartbeat*)m); - break; - - default: - dout(1) << " balancer unknown message " << m->get_type() << endl; - assert(0); - break; - } - - return 0; -} - - - - -void MDBalancer::tick() -{ - static int num_bal_times = g_conf.mds_bal_max; - static utime_t first = g_clock.now(); - utime_t now = g_clock.now(); - utime_t elapsed = now; - elapsed -= first; - - // balance? 
- if (true && - mds->get_nodeid() == 0 && - (num_bal_times || - (g_conf.mds_bal_max_until >= 0 && - elapsed.sec() > g_conf.mds_bal_max_until)) && - mds->is_active() && - now.sec() - last_heartbeat.sec() >= g_conf.mds_bal_interval) { - last_heartbeat = now; - send_heartbeat(); - num_bal_times--; - } - - // hash? - if (true && - g_conf.num_mds > 1 && - now.sec() - last_hash.sec() > g_conf.mds_bal_hash_interval) { - last_hash = now; - do_hashing(); - } -} - - - - -class C_Bal_SendHeartbeat : public Context { -public: - MDS *mds; - C_Bal_SendHeartbeat(MDS *mds) { - this->mds = mds; - } - virtual void finish(int f) { - mds->balancer->send_heartbeat(); - } -}; - -mds_load_t MDBalancer::get_load() -{ - mds_load_t load; - if (mds->mdcache->get_root()) - load.root = - mds->mdcache->get_root()->popularity[MDS_POP_ANYDOM]; - // + - // mds->mdcache->get_root()->popularity[MDS_POP_NESTED]; - - load.req_rate = mds->get_req_rate(); - load.queue_len = mds->messenger->get_dispatch_queue_len(); - return load; -} - -void MDBalancer::send_heartbeat() -{ - if (!mds->mdcache->get_root()) { - dout(5) << "no root on send_heartbeat" << endl; - mds->mdcache->open_root(new C_Bal_SendHeartbeat(mds)); - return; - } - - mds_load.clear(); - if (mds->get_nodeid() == 0) - beat_epoch++; - - // load - mds_load_t load = get_load(); - mds_load[ mds->get_nodeid() ] = load; - - // import_map - map import_map; - - for (set::iterator it = mds->mdcache->imports.begin(); - it != mds->mdcache->imports.end(); - it++) { - CDir *im = *it; - if (im->inode->is_root()) continue; - int from = im->inode->authority(); - import_map[from] += im->popularity[MDS_POP_CURDOM].meta_load(); - } - mds_import_map[ mds->get_nodeid() ] = import_map; - - - dout(5) << "mds" << mds->get_nodeid() << " sending heartbeat " << beat_epoch << " " << load << endl; - for (map::iterator it = import_map.begin(); - it != import_map.end(); - it++) { - dout(5) << " import_map from " << it->first << " -> " << it->second << endl; - } - - - set 
up; - mds->get_mds_map()->get_up_mds_set(up); - for (set::iterator p = up.begin(); p != up.end(); ++p) { - if (*p == mds->get_nodeid()) continue; - MHeartbeat *hb = new MHeartbeat(load, beat_epoch); - hb->get_import_map() = import_map; - mds->messenger->send_message(hb, - mds->mdsmap->get_inst(*p), - MDS_PORT_BALANCER, MDS_PORT_BALANCER); - } -} - -void MDBalancer::handle_heartbeat(MHeartbeat *m) -{ - dout(25) << "=== got heartbeat " << m->get_beat() << " from " << m->get_source().num() << " " << m->get_load() << endl; - - if (!mds->mdcache->get_root()) { - dout(10) << "no root on handle" << endl; - mds->mdcache->open_root(new C_MDS_RetryMessage(mds, m)); - return; - } - - int who = m->get_source().num(); - - if (who == 0) { - dout(20) << " from mds0, new epoch" << endl; - beat_epoch = m->get_beat(); - send_heartbeat(); - - show_imports(); - } - - mds_load[ who ] = m->get_load(); - mds_import_map[ who ] = m->get_import_map(); - - //cout << " load is " << load << " have " << mds_load.size() << endl; - - unsigned cluster_size = mds->get_mds_map()->get_num_mds(); - if (mds_load.size() == cluster_size) { - // let's go! - //export_empties(); // no! 
- do_rebalance(m->get_beat()); - } - - // done - delete m; -} - - -void MDBalancer::export_empties() -{ - dout(5) << "export_empties checking for empty imports" << endl; - - for (set::iterator it = mds->mdcache->imports.begin(); - it != mds->mdcache->imports.end(); - it++) { - CDir *dir = *it; - - if (!dir->inode->is_root() && dir->get_size() == 0) - mds->mdcache->migrator->export_empty_import(dir); - } -} - - - -double MDBalancer::try_match(int ex, double& maxex, - int im, double& maxim) -{ - if (maxex <= 0 || maxim <= 0) return 0.0; - - double howmuch = MIN(maxex, maxim); - if (howmuch <= 0) return 0.0; - - dout(5) << " - mds" << ex << " exports " << howmuch << " to mds" << im << endl; - - if (ex == mds->get_nodeid()) - my_targets[im] += howmuch; - - exported[ex] += howmuch; - imported[im] += howmuch; - - maxex -= howmuch; - maxim -= howmuch; - - return howmuch; -} - - - -void MDBalancer::do_hashing() -{ - if (hash_queue.empty()) { - dout(20) << "do_hashing has nothing to do" << endl; - return; - } - - dout(0) << "do_hashing " << hash_queue.size() << " dirs marked for possible hashing" << endl; - - for (set::iterator i = hash_queue.begin(); - i != hash_queue.end(); - i++) { - inodeno_t dirino = *i; - CInode *in = mds->mdcache->get_inode(dirino); - if (!in) continue; - CDir *dir = in->dir; - if (!dir) continue; - if (!dir->is_auth()) continue; - - dout(0) << "do_hashing hashing " << *dir << endl; - mds->mdcache->migrator->hash_dir(dir); - } - hash_queue.clear(); -} - - - -void MDBalancer::do_rebalance(int beat) -{ - int cluster_size = mds->get_mds_map()->get_num_mds(); - int whoami = mds->get_nodeid(); - - // reset - my_targets.clear(); - imported.clear(); - exported.clear(); - - dout(5) << " do_rebalance: cluster loads are" << endl; - - // rescale! 
turn my mds_load back into meta_load units - double load_fac = 1.0; - if (mds_load[whoami].mds_load() > 0) { - load_fac = mds_load[whoami].root.meta_load() / mds_load[whoami].mds_load(); - dout(7) << " load_fac is " << load_fac - << " <- " << mds_load[whoami].root.meta_load() << " / " << mds_load[whoami].mds_load() - << endl; - } - - double total_load = 0; - multimap load_map; - for (int i=0; i " << l << endl; - - if (whoami == i) my_load = l; - total_load += l; - - load_map.insert(pair( l, i )); - } - - // target load - target_load = total_load / (double)cluster_size; - dout(5) << "do_rebalance: my load " << my_load - << " target " << target_load - << " total " << total_load - << endl; - - // under or over? - if (my_load < target_load) { - dout(5) << " i am underloaded, doing nothing." << endl; - show_imports(); - return; - } - - dout(5) << " i am overloaded" << endl; - - - // first separate exporters and importers - multimap importers; - multimap exporters; - set importer_set; - set exporter_set; - - for (multimap::iterator it = load_map.begin(); - it != load_map.end(); - it++) { - if (it->first < target_load) { - dout(15) << " mds" << it->second << " is importer" << endl; - importers.insert(pair(it->first,it->second)); - importer_set.insert(it->second); - } else { - dout(15) << " mds" << it->second << " is exporter" << endl; - exporters.insert(pair(it->first,it->second)); - exporter_set.insert(it->second); - } - } - - - // determine load transfer mapping - - if (true) { - // analyze import_map; do any matches i can - - dout(5) << " matching exporters to import sources" << endl; - - // big -> small exporters - for (multimap::reverse_iterator ex = exporters.rbegin(); - ex != exporters.rend(); - ex++) { - double maxex = get_maxex(ex->second); - if (maxex <= .001) continue; - - // check importers. for now, just in arbitrary order (no intelligent matching). 
- for (map::iterator im = mds_import_map[ex->second].begin(); - im != mds_import_map[ex->second].end(); - im++) { - double maxim = get_maxim(im->first); - if (maxim <= .001) continue; - try_match(ex->second, maxex, - im->first, maxim); - if (maxex <= .001) break;; - } - } - } - - - if (1) { - if (beat % 2 == 1) { - // old way - dout(5) << " matching big exporters to big importers" << endl; - // big exporters to big importers - multimap::reverse_iterator ex = exporters.rbegin(); - multimap::iterator im = importers.begin(); - while (ex != exporters.rend() && - im != importers.end()) { - double maxex = get_maxex(ex->second); - double maxim = get_maxim(im->second); - if (maxex < .001 || maxim < .001) break; - try_match(ex->second, maxex, - im->second, maxim); - if (maxex <= .001) ex++; - if (maxim <= .001) im++; - } - } else { - // new way - dout(5) << " matching small exporters to big importers" << endl; - // small exporters to big importers - multimap::iterator ex = exporters.begin(); - multimap::iterator im = importers.begin(); - while (ex != exporters.end() && - im != importers.end()) { - double maxex = get_maxex(ex->second); - double maxim = get_maxim(im->second); - if (maxex < .001 || maxim < .001) break; - try_match(ex->second, maxex, - im->second, maxim); - if (maxex <= .001) ex++; - if (maxim <= .001) im++; - } - } - } - - - - // make a sorted list of my imports - map import_pop_map; - multimap import_from_map; - for (set::iterator it = mds->mdcache->imports.begin(); - it != mds->mdcache->imports.end(); - it++) { - if ((*it)->is_hashed()) continue; - double pop = (*it)->popularity[MDS_POP_CURDOM].meta_load(); - if (pop < g_conf.mds_bal_idle_threshold && - (*it)->inode != mds->mdcache->get_root()) { - dout(-5) << " exporting idle import " << **it - << " back to mds" << (*it)->inode->authority() - << endl; - mds->mdcache->migrator->export_dir(*it, (*it)->inode->authority()); - continue; - } - import_pop_map[ pop ] = *it; - int from = (*it)->inode->authority(); - 
dout(15) << " map: i imported " << **it << " from " << from << endl; - import_from_map.insert(pair(from, *it)); - } - - - - // do my exports! - set already_exporting; - double total_sent = 0; - double total_goal = 0; - - for (map::iterator it = my_targets.begin(); - it != my_targets.end(); - it++) { - - /* - double fac = 1.0; - if (false && total_goal > 0 && total_sent > 0) { - fac = total_goal / total_sent; - dout(-5) << " total sent is " << total_sent << " / " << total_goal << " -> fac 1/ " << fac << endl; - if (fac > 1.0) fac = 1.0; - } - fac = .9 - .4 * ((float)g_conf.num_mds / 128.0); // hack magic fixme - */ - - int target = (*it).first; - double amount = (*it).second;// * load_fac; - total_goal += amount; - - if (amount < MIN_OFFLOAD) continue; - - dout(-5) << " sending " << amount << " to mds" << target - //<< " .. " << (*it).second << " * " << load_fac - << " -> " << amount - << endl;//" .. fudge is " << fudge << endl; - double have = 0; - - show_imports(); - - // search imports from target - if (import_from_map.count(target)) { - dout(5) << " aha, looking through imports from target mds" << target << endl; - pair::iterator, multimap::iterator> p = - import_from_map.equal_range(target); - while (p.first != p.second) { - CDir *dir = (*p.first).second; - dout(5) << "considering " << *dir << " from " << (*p.first).first << endl; - multimap::iterator plast = p.first++; - - if (dir->inode->is_root()) continue; - if (dir->is_hashed()) continue; - if (dir->is_freezing() || dir->is_frozen()) continue; // export pbly already in progress - double pop = dir->popularity[MDS_POP_CURDOM].meta_load(); - assert(dir->inode->authority() == target); // cuz that's how i put it in the map, dummy - - if (pop <= amount-have) { - dout(-5) << "reexporting " << *dir - << " pop " << pop - << " back to mds" << target << endl; - mds->mdcache->migrator->export_dir(dir, target); - have += pop; - import_from_map.erase(plast); - import_pop_map.erase(pop); - } else { - dout(5) << "can't 
reexport " << *dir << ", too big " << pop << endl; - } - if (amount-have < MIN_OFFLOAD) break; - } - } - if (amount-have < MIN_OFFLOAD) { - total_sent += have; - continue; - } - - // any other imports - if (false) - for (map::iterator import = import_pop_map.begin(); - import != import_pop_map.end(); - import++) { - CDir *imp = (*import).second; - if (imp->inode->is_root()) continue; - - double pop = (*import).first; - if (pop < amount-have || pop < MIN_REEXPORT) { - dout(-5) << "reexporting " << *imp - << " pop " << pop - << " back to mds" << imp->inode->authority() - << endl; - have += pop; - mds->mdcache->migrator->export_dir(imp, imp->inode->authority()); - } - if (amount-have < MIN_OFFLOAD) break; - } - if (amount-have < MIN_OFFLOAD) { - //fudge = amount-have; - total_sent += have; - continue; - } - - // okay, search for fragments of my workload - set candidates = mds->mdcache->imports; - - list exports; - - for (set::iterator pot = candidates.begin(); - pot != candidates.end(); - pot++) { - find_exports(*pot, amount, exports, have, already_exporting); - if (have > amount-MIN_OFFLOAD) { - break; - } - } - //fudge = amount - have; - total_sent += have; - - for (list::iterator it = exports.begin(); it != exports.end(); it++) { - dout(-5) << " exporting to mds" << target - << " fragment " << **it - << " pop " << (*it)->popularity[MDS_POP_CURDOM].meta_load() - << endl; - mds->mdcache->migrator->export_dir(*it, target); - - // hack! only do one dir. - break; - } - } - - dout(5) << "rebalance done" << endl; - show_imports(); - -} - - - -void MDBalancer::find_exports(CDir *dir, - double amount, - list& exports, - double& have, - set& already_exporting) -{ - double need = amount - have; - if (need < amount * g_conf.mds_bal_min_start) - return; // good enough! 
- double needmax = need * g_conf.mds_bal_need_max; - double needmin = need * g_conf.mds_bal_need_min; - double midchunk = need * g_conf.mds_bal_midchunk; - double minchunk = need * g_conf.mds_bal_minchunk; - - list bigger; - multimap smaller; - - double dir_pop = dir->popularity[MDS_POP_CURDOM].meta_load(); - double dir_sum = 0; - dout(-7) << " find_exports in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << endl; - - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CInode *in = it->second->get_inode(); - if (!in) continue; - if (!in->is_dir()) continue; - if (!in->dir) continue; // clearly not popular - - if (in->dir->is_export()) continue; - if (in->dir->is_hashed()) continue; - if (already_exporting.count(in->dir)) continue; - - if (in->dir->is_frozen()) continue; // can't export this right now! - //if (in->dir->get_size() == 0) continue; // don't export empty dirs, even if they're not complete. for now! - - // how popular? - double pop = in->dir->popularity[MDS_POP_CURDOM].meta_load(); - dir_sum += pop; - dout(20) << " pop " << pop << " " << *in->dir << endl; - - if (pop < minchunk) continue; - - // lucky find? - if (pop > needmin && pop < needmax) { - exports.push_back(in->dir); - have += pop; - return; - } - - if (pop > need) - bigger.push_back(in->dir); - else - smaller.insert(pair(pop, in->dir)); - } - dout(7) << " .. 
sum " << dir_sum << " / " << dir_pop << endl; - - // grab some sufficiently big small items - multimap::reverse_iterator it; - for (it = smaller.rbegin(); - it != smaller.rend(); - it++) { - - if ((*it).first < midchunk) - break; // try later - - dout(7) << " taking smaller " << *(*it).second << endl; - - exports.push_back((*it).second); - already_exporting.insert((*it).second); - have += (*it).first; - if (have > needmin) - return; - } - - // apprently not enough; drill deeper into the hierarchy (if non-replicated) - for (list::iterator it = bigger.begin(); - it != bigger.end(); - it++) { - if ((*it)->is_rep()) continue; - dout(7) << " descending into " << **it << endl; - find_exports(*it, amount, exports, have, already_exporting); - if (have > needmin) - return; - } - - // ok fine, use smaller bits - for (; - it != smaller.rend(); - it++) { - - dout(7) << " taking (much) smaller " << it->first << " " << *(*it).second << endl; - - exports.push_back((*it).second); - already_exporting.insert((*it).second); - have += (*it).first; - if (have > needmin) - return; - } - - // ok fine, drill inot replicated dirs - for (list::iterator it = bigger.begin(); - it != bigger.end(); - it++) { - if (!(*it)->is_rep()) continue; - dout(7) << " descending into replicated " << **it << endl; - find_exports(*it, amount, exports, have, already_exporting); - if (have > needmin) - return; - } - -} - - - - -void MDBalancer::hit_inode(CInode *in, int type) -{ - // hit me - in->popularity[MDS_POP_JUSTME].pop[type].hit(); - in->popularity[MDS_POP_NESTED].pop[type].hit(); - if (in->is_auth()) { - in->popularity[MDS_POP_CURDOM].pop[type].hit(); - in->popularity[MDS_POP_ANYDOM].pop[type].hit(); - } - - // hit auth up to import - CDir *dir = in->get_parent_dir(); - if (dir) hit_dir(dir, type); -} - - -void MDBalancer::hit_dir(CDir *dir, int type) -{ - // hit me - float v = dir->popularity[MDS_POP_JUSTME].pop[type].hit(); - - // hit modify counter, if this was a modify - if (g_conf.num_mds > 2 && 
// FIXME >2 thing - !dir->inode->is_root() && // not root (for now at least) - dir->is_auth()) { - //dout(-20) << "hit_dir " << type << " pop is " << v << " " << *dir << endl; - - // hash this dir? (later?) - if (((v > g_conf.mds_bal_hash_rd && type == META_POP_IRD) || - //(v > g_conf.mds_bal_hash_wr && type == META_POP_IWR) || - (v > g_conf.mds_bal_hash_wr && type == META_POP_DWR)) && - !(dir->is_hashed() || dir->is_hashing()) && - hash_queue.count(dir->ino()) == 0) { - dout(0) << "hit_dir " << type << " pop is " << v << ", putting in hash_queue: " << *dir << endl; - hash_queue.insert(dir->ino()); - } - - } - - hit_recursive(dir, type); -} - - - -void MDBalancer::hit_recursive(CDir *dir, int type) -{ - bool anydom = dir->is_auth(); - bool curdom = dir->is_auth(); - - float rd_adj = 0.0; - - // replicate? - float dir_pop = dir->popularity[MDS_POP_CURDOM].pop[type].get(); // hmm?? - - if (dir->is_auth()) { - if (!dir->is_rep() && - dir_pop >= g_conf.mds_bal_replicate_threshold) { - // replicate - float rdp = dir->popularity[MDS_POP_JUSTME].pop[META_POP_IRD].get(); - rd_adj = rdp / mds->get_mds_map()->get_num_mds() - rdp; - rd_adj /= 2.0; // temper somewhat - - dout(1) << "replicating dir " << *dir << " pop " << dir_pop << " .. 
rdp " << rdp << " adj " << rd_adj << endl; - - dir->dir_rep = CDIR_REP_ALL; - mds->mdcache->send_dir_updates(dir, true); - - dir->popularity[MDS_POP_JUSTME].pop[META_POP_IRD].adjust(rd_adj); - dir->popularity[MDS_POP_CURDOM].pop[META_POP_IRD].adjust(rd_adj); - } - - if (!dir->ino() != 1 && - dir->is_rep() && - dir_pop < g_conf.mds_bal_unreplicate_threshold) { - // unreplicate - dout(1) << "unreplicating dir " << *dir << " pop " << dir_pop << endl; - - dir->dir_rep = CDIR_REP_NONE; - mds->mdcache->send_dir_updates(dir); - } - } - - - while (dir) { - CInode *in = dir->inode; - - dir->popularity[MDS_POP_NESTED].pop[type].hit(); - in->popularity[MDS_POP_NESTED].pop[type].hit(); - - if (rd_adj != 0.0) dir->popularity[MDS_POP_NESTED].pop[META_POP_IRD].adjust(rd_adj); - - if (anydom) { - dir->popularity[MDS_POP_ANYDOM].pop[type].hit(); - in->popularity[MDS_POP_ANYDOM].pop[type].hit(); - } - - if (curdom) { - dir->popularity[MDS_POP_CURDOM].pop[type].hit(); - in->popularity[MDS_POP_CURDOM].pop[type].hit(); - } - - if (dir->is_import()) - curdom = false; // end of auth domain, stop hitting auth counters. 
- dir = dir->inode->get_parent_dir(); - } -} - - -/* - * subtract off an exported chunk - */ -void MDBalancer::subtract_export(CDir *dir) -{ - meta_load_t curdom = dir->popularity[MDS_POP_CURDOM]; - - bool in_domain = !dir->is_import(); - - while (true) { - CInode *in = dir->inode; - - in->popularity[MDS_POP_ANYDOM] -= curdom; - if (in_domain) in->popularity[MDS_POP_CURDOM] -= curdom; - - dir = in->get_parent_dir(); - if (!dir) break; - - if (dir->is_import()) in_domain = false; - - dir->popularity[MDS_POP_ANYDOM] -= curdom; - if (in_domain) dir->popularity[MDS_POP_CURDOM] -= curdom; - } -} - - -void MDBalancer::add_import(CDir *dir) -{ - meta_load_t curdom = dir->popularity[MDS_POP_CURDOM]; - - bool in_domain = !dir->is_import(); - - while (true) { - CInode *in = dir->inode; - - in->popularity[MDS_POP_ANYDOM] += curdom; - if (in_domain) in->popularity[MDS_POP_CURDOM] += curdom; - - dir = in->get_parent_dir(); - if (!dir) break; - - if (dir->is_import()) in_domain = false; - - dir->popularity[MDS_POP_ANYDOM] += curdom; - if (in_domain) dir->popularity[MDS_POP_CURDOM] += curdom; - } - -} - - - - - - -void MDBalancer::show_imports(bool external) -{ - mds->mdcache->show_imports(); -} - - - -/* replicate? 
- - float dir_pop = dir->get_popularity(); - - if (dir->is_auth()) { - if (!dir->is_rep() && - dir_pop >= g_conf.mds_bal_replicate_threshold) { - // replicate - dout(5) << "replicating dir " << *in << " pop " << dir_pop << endl; - - dir->dir_rep = CDIR_REP_ALL; - mds->mdcache->send_dir_updates(dir); - } - - if (dir->is_rep() && - dir_pop < g_conf.mds_bal_unreplicate_threshold) { - // unreplicate - dout(5) << "unreplicating dir " << *in << " pop " << dir_pop << endl; - - dir->dir_rep = CDIR_REP_NONE; - mds->mdcache->send_dir_updates(dir); - } - } - -*/ diff --git a/branches/marnberg/quota/mds/MDBalancer.h b/branches/marnberg/quota/mds/MDBalancer.h deleted file mode 100644 index d84d6439dbccc..0000000000000 --- a/branches/marnberg/quota/mds/MDBalancer.h +++ /dev/null @@ -1,109 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __MDBALANCER_H -#define __MDBALANCER_H - -#include -#include -using namespace std; - -#include -#include -using namespace __gnu_cxx; - -#include "include/types.h" -#include "common/Clock.h" -#include "CInode.h" - - -class MDS; -class Message; -class MHeartbeat; -class CInode; -class Context; -class CDir; - -class MDBalancer { - protected: - MDS *mds; - int beat_epoch; - - utime_t last_heartbeat; - utime_t last_hash; - - // todo - set hash_queue; - - // per-epoch scatter/gathered info - hash_map mds_load; - hash_map mds_meta_load; - map > mds_import_map; - - // per-epoch state - double my_load, target_load; - map my_targets; - map imported; - map exported; - - double try_match(int ex, double& maxex, - int im, double& maxim); - double get_maxim(int im) { - return target_load - mds_meta_load[im] - imported[im]; - } - double get_maxex(int ex) { - return mds_meta_load[ex] - target_load - exported[ex]; - } - - public: - MDBalancer(MDS *m) : - mds(m), - beat_epoch(0) { } - - mds_load_t get_load(); - - int proc_message(Message *m); - - void send_heartbeat(); - void handle_heartbeat(MHeartbeat *m); - - void tick(); - - void do_hashing(); - - void export_empties(); - void do_rebalance(int beat); - void find_exports(CDir *dir, - double amount, - list& exports, - double& have, - set& already_exporting); - - - void subtract_export(class CDir *ex); - void add_import(class CDir *im); - - void hit_inode(class CInode *in, int type=0); - void hit_dir(class CDir *dir, int type=0); - void hit_recursive(class CDir *dir, int type=0); - - - void show_imports(bool external=false); - -}; - - - -#endif diff --git a/branches/marnberg/quota/mds/MDCache.cc b/branches/marnberg/quota/mds/MDCache.cc deleted file mode 100644 index eb8ad591d6a35..0000000000000 --- a/branches/marnberg/quota/mds/MDCache.cc +++ /dev/null @@ -1,3536 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright 
(C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "MDCache.h" -#include "MDStore.h" -#include "MDS.h" -#include "Server.h" -#include "Locker.h" -#include "MDLog.h" -#include "MDBalancer.h" -#include "AnchorClient.h" -#include "Migrator.h" -#include "Renamer.h" - -#include "MDSMap.h" - -#include "CInode.h" -#include "CDir.h" - -#include "include/filepath.h" - -#include "msg/Message.h" -#include "msg/Messenger.h" - -#include "common/Logger.h" - -#include "osdc/Filer.h" - -#include "events/EImportMap.h" -#include "events/EString.h" -#include "events/EUnlink.h" -#include "events/EPurgeFinish.h" - -#include "messages/MGenericMessage.h" - -#include "messages/MMDSImportMap.h" -#include "messages/MMDSCacheRejoin.h" -#include "messages/MMDSCacheRejoinAck.h" - -#include "messages/MDiscover.h" -#include "messages/MDiscoverReply.h" - -//#include "messages/MInodeUpdate.h" -#include "messages/MDirUpdate.h" -#include "messages/MCacheExpire.h" - -#include "messages/MInodeFileCaps.h" - -#include "messages/MInodeLink.h" -#include "messages/MInodeLinkAck.h" -#include "messages/MInodeUnlink.h" -#include "messages/MInodeUnlinkAck.h" - -#include "messages/MLock.h" -#include "messages/MDentryUnlink.h" - -#include "messages/MClientRequest.h" -#include "messages/MClientFileCaps.h" - -#include "IdAllocator.h" - -#include "common/Timer.h" - -#include -#include -#include -#include -#include -using namespace std; - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".cache " - - - - -MDCache::MDCache(MDS *m) -{ - mds = m; - migrator = new Migrator(mds, this); - renamer = new Renamer(mds, this); - root = NULL; - lru.lru_set_max(g_conf.mds_cache_size); - 
lru.lru_set_midpoint(g_conf.mds_cache_mid); - - did_shutdown_exports = false; - did_shutdown_log_cap = false; - shutdown_commits = 0; -} - -MDCache::~MDCache() -{ - delete migrator; - delete renamer; -} - - - -void MDCache::log_stat(Logger *logger) -{ - if (get_root()) { - logger->set("popanyd", (int)get_root()->popularity[MDS_POP_ANYDOM].meta_load()); - logger->set("popnest", (int)get_root()->popularity[MDS_POP_NESTED].meta_load()); - } - logger->set("c", lru.lru_get_size()); - logger->set("cpin", lru.lru_get_num_pinned()); - logger->set("ctop", lru.lru_get_top()); - logger->set("cbot", lru.lru_get_bot()); - logger->set("cptail", lru.lru_get_pintail()); -} - - -// - -bool MDCache::shutdown() -{ - if (lru.lru_get_size() > 0) { - dout(7) << "WARNING: mdcache shutodwn with non-empty cache" << endl; - //show_cache(); - show_imports(); - //dump(); - } - return true; -} - - -// MDCache - -CInode *MDCache::create_inode() -{ - CInode *in = new CInode(this); - - // zero - memset(&in->inode, 0, sizeof(inode_t)); - - // assign ino - in->inode.ino = mds->idalloc->alloc_id(); - - in->inode.nlink = 1; // FIXME - - in->inode.layout = g_OSD_FileLayout; - - add_inode(in); // add - return in; -} - -void MDCache::destroy_inode(CInode *in) -{ - mds->idalloc->reclaim_id(in->ino()); - remove_inode(in); -} - - -void MDCache::add_inode(CInode *in) -{ - // add to lru, inode map - assert(inode_map.count(in->ino()) == 0); // should be no dup inos! - inode_map[ in->ino() ] = in; -} - -void MDCache::remove_inode(CInode *o) -{ - dout(14) << "remove_inode " << *o << endl; - if (o->get_parent_dn()) { - // FIXME: multiple parents? 
- CDentry *dn = o->get_parent_dn(); - assert(!dn->is_dirty()); - if (dn->is_sync()) - dn->dir->remove_dentry(dn); // unlink inode AND hose dentry - else - dn->dir->unlink_inode(dn); // leave dentry - } - inode_map.erase(o->ino()); // remove from map -} - - -/* - * take note of where we write import_maps in the log, as we need - * to take care not to expire them until an updated map is safely flushed. - */ -class C_MDS_WroteImportMap : public Context { - MDLog *mdlog; - off_t end_off; -public: - C_MDS_WroteImportMap(MDLog *ml, off_t eo) : mdlog(ml), end_off(eo) { } - void finish(int r) { - // cout << "WroteImportMap at " << end_off << endl; - if (r >= 0) - mdlog->last_import_map = end_off; - mdlog->writing_import_map = false; - } -}; - - - -void MDCache::log_import_map(Context *onsync) -{ - dout(10) << "log_import_map " << imports.size() << " imports, " - << exports.size() << " exports" << endl; - - EImportMap *le = new EImportMap; - - // include import/export inodes, - // and a spanning tree to tie it to the root of the fs - for (set::iterator p = imports.begin(); - p != imports.end(); - p++) { - CDir *im = *p; - le->imports.insert(im->ino()); - le->metablob.add_dir_context(im, true); - le->metablob.add_dir(im, false); - - if (nested_exports.count(im)) { - for (set::iterator q = nested_exports[im].begin(); - q != nested_exports[im].end(); - ++q) { - CDir *ex = *q; - le->nested_exports[im->ino()].insert(ex->ino()); - le->exports.insert(ex->ino()); - le->metablob.add_dir_context(ex); - le->metablob.add_dir(ex, false); - } - } - } - - mds->mdlog->writing_import_map = true; - mds->mdlog->submit_entry(le); - mds->mdlog->wait_for_sync(new C_MDS_WroteImportMap(mds->mdlog, mds->mdlog->get_write_pos())); - if (onsync) - mds->mdlog->wait_for_sync(onsync); -} - - - - - -// ===================== -// recovery stuff - -void MDCache::send_pending_import_maps() -{ - if (wants_import_map.empty()) - return; // nothing to send. - - // only if it's appropriate! 
- if (migrator->is_exporting()) { - dout(7) << "send_pending_import_maps waiting, exports still in progress" << endl; - return; // not now - } - - // ok, send them. - for (set::iterator p = wants_import_map.begin(); - p != wants_import_map.end(); - p++) - send_import_map_now(*p); - wants_import_map.clear(); -} - -void MDCache::send_import_map(int who) -{ - if (migrator->is_exporting()) - send_import_map_later(who); - else - send_import_map_now(who); -} - -void MDCache::send_import_map_now(int who) -{ - dout(10) << "send_import_map to mds" << who << endl; - - MMDSImportMap *m = new MMDSImportMap; - - // known - for (set::iterator p = imports.begin(); - p != imports.end(); - p++) { - CDir *im = *p; - - if (migrator->is_importing(im->ino())) { - // ambiguous (mid-import) - m->add_ambiguous_import(im->ino(), - migrator->get_import_bounds(im->ino())); - } else { - // not ambiguous. - m->add_import(im->ino()); - - if (nested_exports.count(im)) { - for (set::iterator q = nested_exports[im].begin(); - q != nested_exports[im].end(); - ++q) { - CDir *ex = *q; - m->add_import_export(im->ino(), ex->ino()); - } - } - } - } - - // ambiguous - for (map >::iterator p = my_ambiguous_imports.begin(); - p != my_ambiguous_imports.end(); - ++p) - m->add_ambiguous_import(p->first, p->second); - - // second - mds->send_message_mds(m, who, MDS_PORT_CACHE); -} - - - -/* - * during resolve state, we share import_maps to determine who - * is authoritative for which trees. we expect to get an import_map - * from _everyone_ in the recovery_set (the mds cluster at the time of - * the first failure). 
- */ -void MDCache::handle_import_map(MMDSImportMap *m) -{ - dout(7) << "handle_import_map from " << m->get_source() << endl; - int from = m->get_source().num(); - - // FIXME: check if we are a surviving ambiguous importer - - // update my dir_auth values - for (map >::iterator pi = m->imap.begin(); - pi != m->imap.end(); - ++pi) { - CInode *imi = get_inode(pi->first); - if (!imi) continue; - CDir *im = imi->dir; - if (!im) continue; - - im->set_dir_auth(from); - - for (set::iterator pe = pi->second.begin(); - pe != pi->second.end(); - ++pe) { - CInode *exi = get_inode(*pe); - if (!exi) continue; - CDir *ex = exi->dir; - if (!ex) continue; - - if (ex->get_dir_auth() == CDIR_AUTH_PARENT) - ex->set_dir_auth(CDIR_AUTH_UNKNOWN); - } - } - - // note ambiguous imports too - for (map >::iterator pi = m->ambiguous_imap.begin(); - pi != m->ambiguous_imap.end(); - ++pi) - mds->mdcache->other_ambiguous_imports[from][pi->first].swap( pi->second ); - - // did i get them all? - got_import_map.insert(from); - - if (got_import_map == recovery_set) { - dout(10) << "got all import maps, ready to rejoin" << endl; - disambiguate_imports(); - recalc_auth_bits(); - trim_non_auth(); - - // move to rejoin state - mds->set_want_state(MDSMap::STATE_REJOIN); - - } else { - dout(10) << "still waiting for more importmaps, got " << got_import_map - << ", need " << recovery_set << endl; - } - - delete m; -} - - -void MDCache::disambiguate_imports() -{ - dout(10) << "disambiguate_imports" << endl; - - // other nodes' ambiguous imports - for (map > >::iterator p = other_ambiguous_imports.begin(); - p != other_ambiguous_imports.begin(); - ++p) { - int who = p->first; - - for (map >::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - CInode *diri = get_inode(q->first); - if (!diri) continue; - CDir *dir = diri->dir; - if (!dir) continue; - - if (dir->authority() >= CDIR_AUTH_UNKNOWN) { - dout(10) << "mds" << who << " did not import " << *dir << endl; - } else { - dout(10) << "mds" << 
who << " did import " << *dir << endl; - int was = dir->authority(); - dir->set_dir_auth(who); - - for (set::iterator r = q->second.begin(); - r != q->second.end(); - ++r) { - CInode *exi = get_inode(q->first); - if (!exi) continue; - CDir *ex = exi->dir; - if (!ex) continue; - if (ex->get_dir_auth() == CDIR_AUTH_PARENT) - ex->set_dir_auth(was); - dout(10) << " bound " << *ex << endl; - } - } - } - } - other_ambiguous_imports.clear(); - - // my ambiguous imports - while (!my_ambiguous_imports.empty()) { - map >::iterator q = my_ambiguous_imports.begin(); - - CInode *diri = get_inode(q->first); - if (!diri) continue; - CDir *dir = diri->dir; - if (!dir) continue; - - if (dir->authority() != CDIR_AUTH_UNKNOWN) { - dout(10) << "ambiguous import auth known, must not be me " << *dir << endl; - cancel_ambiguous_import(q->first); - } else { - dout(10) << "ambiguous import auth unknown, must be me " << *dir << endl; - finish_ambiguous_import(q->first); - } - } - assert(my_ambiguous_imports.empty()); - - show_imports(); -} - -void MDCache::cancel_ambiguous_import(inodeno_t dirino) -{ - assert(my_ambiguous_imports.count(dirino)); - dout(10) << "cancel_ambiguous_import " << dirino - << " bounds " << my_ambiguous_imports[dirino] - << endl; - my_ambiguous_imports.erase(dirino); -} - -void MDCache::finish_ambiguous_import(inodeno_t dirino) -{ - assert(my_ambiguous_imports.count(dirino)); - set bounds; - bounds.swap(my_ambiguous_imports[dirino]); - my_ambiguous_imports.erase(dirino); - - dout(10) << "finish_ambiguous_import " << dirino - << " bounds " << bounds - << endl; - - CInode *diri = get_inode(dirino); - assert(diri); - CDir *dir = diri->dir; - assert(dir); - - // adjust dir_auth - CDir *im = dir; - if (dir->get_inode()->authority() == mds->get_nodeid()) { - // parent is already me. adding to existing import. 
- im = get_auth_container(dir); - if (!im) im = dir; - nested_exports[im].erase(dir); - exports.erase(dir); - dir->set_dir_auth( CDIR_AUTH_PARENT ); - dir->state_clear(CDIR_STATE_EXPORT); - dir->put(CDir::PIN_EXPORT); - } else { - // parent isn't me. new import. - imports.insert(dir); - dir->set_dir_auth( mds->get_nodeid() ); - dir->state_set(CDIR_STATE_IMPORT); - dir->get(CDir::PIN_IMPORT); - } - - dout(10) << " base " << *dir << endl; - if (dir != im) - dout(10) << " under " << *im << endl; - - // bounds (exports, before) - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CInode *bi = get_inode(*p); - assert(bi); - CDir *bd = bi->dir; - assert(bd); - - if (bd->get_dir_auth() == mds->get_nodeid()) { - // still me. was an import. - imports.erase(bd); - bd->set_dir_auth( CDIR_AUTH_PARENT ); - bd->state_clear(CDIR_STATE_IMPORT); - bd->put(CDir::PIN_IMPORT); - // move nested exports. - for (set::iterator q = nested_exports[bd].begin(); - q != nested_exports[bd].end(); - ++q) - nested_exports[im].insert(*q); - nested_exports.erase(bd); - - } else { - // not me anymore. now an export. 
- exports.insert(bd); - nested_exports[im].insert(bd); - assert(bd->get_dir_auth() != CDIR_AUTH_PARENT); - bd->set_dir_auth( CDIR_AUTH_UNKNOWN ); - bd->state_set(CDIR_STATE_EXPORT); - bd->get(CDir::PIN_EXPORT); - } - - dout(10) << " bound " << *bd << endl; - } -} - -void MDCache::finish_ambiguous_export(inodeno_t dirino, set& bounds) -{ - CInode *diri = get_inode(dirino); - assert(diri); - CDir *dir = diri->dir; - assert(dir); - - dout(10) << "finish_ambiguous_export " << dirino - << " bounds " << bounds - << endl; - - // adjust dir_auth - CDir *im = get_auth_container(dir); - if (dir->get_inode()->authority() == CDIR_AUTH_UNKNOWN) { - // was an import, hose it - assert(im == dir); - assert(imports.count(dir)); - imports.erase(dir); - dir->set_dir_auth( CDIR_AUTH_PARENT ); - dir->state_clear(CDIR_STATE_IMPORT); - dir->put(CDir::PIN_IMPORT); - } else { - // i'm now an export - exports.insert(dir); - nested_exports[im].insert(dir); - dir->set_dir_auth( CDIR_AUTH_UNKNOWN ); // not me - dir->state_set(CDIR_STATE_EXPORT); - dir->get(CDir::PIN_EXPORT); - } - dout(10) << " base " << *dir << endl; - if (dir != im) - dout(10) << " under " << *im << endl; - - // bounds (there were exports, before) - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CInode *bi = get_inode(*p); - assert(bi); - CDir *bd = bi->dir; - assert(bd); - - // hose export - assert(exports.count(bd)); - exports.erase(bd); - nested_exports[im].erase(bd); - - // fix dir_auth - assert(bd->get_dir_auth() != CDIR_AUTH_PARENT); - bd->set_dir_auth( CDIR_AUTH_PARENT ); // not me - - bd->state_clear(CDIR_STATE_EXPORT); - bd->put(CDir::PIN_EXPORT); - - dout(10) << " bound " << *bd << endl; - } - - show_imports(); -} - - - - -/* - * rejoin phase! - * we start out by sending rejoins to everyone in the recovery set. - * - * if _were_ are rejoining, send for all regions in our cache. - * if we are active|stopping, send only to nodes that are are rejoining. 
- */ -void MDCache::send_cache_rejoins() -{ - dout(10) << "send_cache_rejoins " << endl; - - map rejoins; - - // if i am rejoining, send a rejoin to everyone. - // otherwise, just send to others who are rejoining. - for (set::iterator p = recovery_set.begin(); - p != recovery_set.end(); - ++p) { - if (*p == mds->get_nodeid()) continue; // nothing to myself! - if (mds->is_rejoin() || - mds->mdsmap->is_rejoin(*p)) - rejoins[*p] = new MMDSCacheRejoin; - } - - // build list of dir_auth regions - list dir_auth_regions; - for (hash_map::iterator p = inode_map.begin(); - p != inode_map.end(); - ++p) { - if (!p->second->is_dir()) continue; - if (!p->second->dir) continue; - if (p->second->dir->get_dir_auth() == CDIR_AUTH_PARENT) continue; - - int auth = p->second->dir->get_dir_auth(); - assert(auth >= 0); - - if (auth == mds->get_nodeid()) continue; // skip my own regions! - - if (rejoins.count(auth) == 0) - continue; // don't care about this node's regions - - // add to list - dout(10) << " on mds" << auth << " region " << *p->second << endl; - dir_auth_regions.push_back(p->second->dir); - } - - // walk the regions - for (list::iterator p = dir_auth_regions.begin(); - p != dir_auth_regions.end(); - ++p) { - CDir *dir = *p; - int to = dir->authority(); - cache_rejoin_walk(dir, rejoins[to]); - } - - // send the messages - assert(rejoin_ack_gather.empty()); - for (map::iterator p = rejoins.begin(); - p != rejoins.end(); - ++p) { - mds->send_message_mds(p->second, p->first, MDS_PORT_CACHE); - rejoin_ack_gather.insert(p->first); - } - - // nothing? 
- if (rejoins.empty()) { - dout(10) << "nothing to rejoin, going active" << endl; - mds->set_want_state(MDSMap::STATE_ACTIVE); - } -} - - - -void MDCache::cache_rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin) -{ - dout(10) << "cache_rejoin_walk " << *dir << endl; - rejoin->add_dir(dir->ino()); - - list nested; // finish this dir, then do nested items - - // walk dentries - for (map::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - // dentry - rejoin->add_dentry(dir->ino(), p->first); - - // inode? - if (p->second->is_primary() && p->second->get_inode()) { - CInode *in = p->second->get_inode(); - rejoin->add_inode(in->ino(), - in->get_caps_wanted()); - - // dir? - if (in->dir && - in->dir->get_dir_auth() == CDIR_AUTH_PARENT) - nested.push_back(in->dir); - } - } - - // recurse into nested dirs - for (list::iterator p = nested.begin(); - p != nested.end(); - ++p) - cache_rejoin_walk(*p, rejoin); -} - - -/* - * i got a rejoin. - * - * - reply with the lockstate - * - * if i am active|stopping, - * - remove source from replica list for everything not referenced here. 
- */ -void MDCache::handle_cache_rejoin(MMDSCacheRejoin *m) -{ - dout(7) << "handle_cache_rejoin from " << m->get_source() << endl; - int from = m->get_source().num(); - - MMDSCacheRejoinAck *ack = new MMDSCacheRejoinAck; - - if (mds->is_active() || mds->is_stopping()) { - dout(10) << "removing stale cache replicas" << endl; - // first, scour cache of replica references - for (hash_map::iterator p = inode_map.begin(); - p != inode_map.end(); - ++p) { - // inode - CInode *in = p->second; - if (in->is_replica(from) && m->inodes.count(p->first) == 0) { - inode_remove_replica(in, from); - dout(10) << " rem " << *in << endl; - } - - // dentry - if (in->parent) { - CDentry *dn = in->parent; - if (dn->is_replica(from) && - (m->dentries.count(dn->get_dir()->ino()) == 0 || - m->dentries[dn->get_dir()->ino()].count(dn->get_name()) == 0)) { - dn->remove_replica(from); - dout(10) << " rem " << *dn << endl; - } - } - - // dir - if (in->dir) { - CDir *dir = in->dir; - if (dir->is_replica(from) && m->dirs.count(p->first) == 0) { - dir->remove_replica(from); - dout(10) << " rem " << *dir << endl; - } - } - } - } else { - assert(mds->is_rejoin()); - } - - // dirs - for (set::iterator p = m->dirs.begin(); - p != m->dirs.end(); - ++p) { - CInode *diri = get_inode(*p); - assert(diri); - CDir *dir = diri->dir; - assert(dir); - int nonce = dir->add_replica(from); - dout(10) << " has " << *dir << endl; - ack->add_dir(*p, nonce); - - // dentries - for (set::iterator q = m->dentries[*p].begin(); - q != m->dentries[*p].end(); - ++q) { - CDentry *dn = dir->lookup(*q); - assert(dn); - int nonce = dn->add_replica(from); - dout(10) << " has " << *dn << endl; - ack->add_dentry(*p, *q, dn->get_lockstate(), nonce); - } - } - - // inodes - for (map::iterator p = m->inodes.begin(); - p != m->inodes.end(); - ++p) { - CInode *in = get_inode(p->first); - assert(in); - int nonce = in->add_replica(from); - if (p->second) - in->mds_caps_wanted[from] = p->second; - else - in->mds_caps_wanted.erase(from); - 
in->hardlock.gather_set.erase(from); // just in case - in->filelock.gather_set.erase(from); // just in case - dout(10) << " has " << *in << endl; - ack->add_inode(p->first, - in->hardlock.get_replica_state(), in->filelock.get_replica_state(), - nonce); - } - - // send ack - mds->send_message_mds(ack, from, MDS_PORT_CACHE); - - delete m; -} - - -void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoinAck *m) -{ - dout(7) << "handle_cache_rejoin from " << m->get_source() << endl; - int from = m->get_source().num(); - - // dirs - for (list::iterator p = m->dirs.begin(); - p != m->dirs.end(); - ++p) { - CInode *diri = get_inode(p->dirino); - CDir *dir = diri->dir; - assert(dir); - - dir->set_replica_nonce(p->nonce); - dout(10) << " got " << *dir << endl; - - // dentries - for (map::iterator q = m->dentries[p->dirino].begin(); - q != m->dentries[p->dirino].end(); - ++q) { - CDentry *dn = dir->lookup(q->first); - assert(dn); - dn->set_replica_nonce(q->second.nonce); - dn->set_lockstate(q->second.lock); - dout(10) << " got " << *dn << endl; - } - } - - // inodes - for (list::iterator p = m->inodes.begin(); - p != m->inodes.end(); - ++p) { - CInode *in = get_inode(p->ino); - assert(in); - in->set_replica_nonce(p->nonce); - in->hardlock.set_state(p->hardlock); - in->filelock.set_state(p->filelock); - dout(10) << " got " << *in << endl; - } - - delete m; - - // done? - rejoin_ack_gather.erase(from); - if (rejoin_ack_gather.empty()) { - dout(7) << "all done, going active!" << endl; - show_imports(); - show_cache(); - mds->set_want_state(MDSMap::STATE_ACTIVE); - } else { - dout(7) << "still need rejoin_ack from " << rejoin_ack_gather << endl; - } - -} - - - - - -// =============================================================================== - -void MDCache::rename_file(CDentry *srcdn, - CDentry *destdn) -{ - CInode *in = srcdn->inode; - - // unlink src - srcdn->dir->unlink_inode(srcdn); - - // unlink old inode? 
- if (destdn->inode) destdn->dir->unlink_inode(destdn); - - // link inode w/ dentry - destdn->dir->link_inode( destdn, in ); -} - - - -void MDCache::set_root(CInode *in) -{ - assert(root == 0); - root = in; - root->state_set(CInode::STATE_ROOT); -} - -void MDCache::add_import(CDir *dir) -{ - imports.insert(dir); - dir->state_set(CDIR_STATE_IMPORT); - dir->get(CDir::PIN_IMPORT); -} - - -void MDCache::recalc_auth_bits() -{ - dout(7) << "recalc_auth_bits" << endl; - - for (hash_map::iterator p = inode_map.begin(); - p != inode_map.end(); - ++p) { - CInode *in = p->second; - if (in->authority() == mds->get_nodeid()) - in->state_set(CInode::STATE_AUTH); - else { - in->state_clear(CInode::STATE_AUTH); - if (in->is_dirty()) - in->mark_clean(); - } - - if (in->parent) { - if (in->parent->authority() == mds->get_nodeid()) - in->parent->state_set(CDentry::STATE_AUTH); - else { - in->parent->state_clear(CDentry::STATE_AUTH); - if (in->parent->is_dirty()) - in->parent->mark_clean(); - } - } - - if (in->dir) { - if (in->dir->authority() == mds->get_nodeid()) - in->dir->state_set(CDIR_STATE_AUTH); - else { - in->dir->state_clear(CDIR_STATE_AUTH); - if (in->dir->is_dirty()) - in->dir->mark_clean(); - } - } - } - show_imports(); - show_cache(); -} - - - - - -// ************** -// Inode purging -- reliably removing deleted file's objects - -class C_MDC_PurgeFinish : public Context { - MDCache *mdc; - inodeno_t ino; -public: - C_MDC_PurgeFinish(MDCache *c, inodeno_t i) : mdc(c), ino(i) {} - void finish(int r) { - mdc->purge_inode_finish(ino); - } -}; -class C_MDC_PurgeFinish2 : public Context { - MDCache *mdc; - inodeno_t ino; -public: - C_MDC_PurgeFinish2(MDCache *c, inodeno_t i) : mdc(c), ino(i) {} - void finish(int r) { - mdc->purge_inode_finish_2(ino); - } -}; - -/* purge_inode in - * will be called by on unlink or rmdir - * caller responsible for journaling an appropriate EUnlink or ERmdir - */ -void MDCache::purge_inode(inode_t &inode) -{ - dout(10) << "purge_inode " << 
inode.ino << " size " << inode.size << endl; - - // take note - assert(purging.count(inode.ino) == 0); - purging[inode.ino] = inode; - - // remove - mds->filer->remove(inode, 0, inode.size, - 0, new C_MDC_PurgeFinish(this, inode.ino)); -} - -void MDCache::purge_inode_finish(inodeno_t ino) -{ - dout(10) << "purge_inode_finish " << ino << " - logging our completion" << endl; - - // log completion - mds->mdlog->submit_entry(new EPurgeFinish(ino), - new C_MDC_PurgeFinish2(this, ino)); -} - -void MDCache::purge_inode_finish_2(inodeno_t ino) -{ - dout(10) << "purge_inode_finish_2 " << ino << endl; - - // remove from purging list - purging.erase(ino); - - // tell anyone who cares (log flusher?) - list ls; - ls.swap(waiting_for_purge[ino]); - waiting_for_purge.erase(ino); - finish_contexts(ls, 0); - - // reclaim ino? - -} - -void MDCache::start_recovered_purges() -{ - for (map::iterator p = purging.begin(); - p != purging.end(); - ++p) { - dout(10) << "start_recovered_purges " << p->first << " size " << p->second.size << endl; - mds->filer->remove(p->second, 0, p->second.size, - 0, new C_MDC_PurgeFinish(this, p->first)); - } -} - - - -bool MDCache::trim(int max) -{ - // trim LRU - if (max < 0) { - max = lru.lru_get_max(); - if (!max) return false; - } - dout(7) << "trim max=" << max << " cur=" << lru.lru_get_size() << endl; - - map expiremap; - - while (lru.lru_get_size() > (unsigned)max) { - CDentry *dn = (CDentry*)lru.lru_expire(); - if (!dn) break; - - CDir *dir = dn->get_dir(); - assert(dir); - - // notify dentry authority? 
- if (!dn->is_auth()) { - int auth = dn->authority(); - dout(17) << "sending expire to mds" << auth << " on " << *dn << endl; - if (expiremap.count(auth) == 0) - expiremap[auth] = new MCacheExpire(mds->get_nodeid()); - expiremap[auth]->add_dentry(dir->ino(), dn->get_name(), dn->get_replica_nonce()); - } - - // unlink the dentry - dout(15) << "trim removing " << *dn << endl; - if (!dn->is_null()) - dir->unlink_inode(dn); - dir->remove_dentry(dn); - - // adjust the dir state - CInode *diri = dir->get_inode(); - diri->dir->state_clear(CDIR_STATE_COMPLETE); // dir incomplete! - - // reexport? - if (diri->dir->is_import() && // import - diri->dir->get_size() == 0 && // no children - !diri->is_root()) // not root - migrator->export_empty_import(diri->dir); - - if (mds->logger) mds->logger->inc("cex"); - } - - // inode expire_queue - while (!inode_expire_queue.empty()) { - CInode *in = inode_expire_queue.front(); - inode_expire_queue.pop_front(); - - assert(in->get_num_ref() == 0); - - int dirauth = -2; - if (in->dir) { - // notify dir authority? 
- dirauth = in->dir->authority(); - if (dirauth != mds->get_nodeid()) { - dout(17) << "sending expire to mds" << dirauth << " on " << *in->dir << endl; - if (expiremap.count(dirauth) == 0) - expiremap[dirauth] = new MCacheExpire(mds->get_nodeid()); - expiremap[dirauth]->add_dir(in->ino(), in->dir->replica_nonce); - } - - in->close_dir(); - } - - // notify inode authority - int auth = in->authority(); - if (auth == CDIR_AUTH_UNKNOWN) { - assert(in->ino() == 1); - assert(dirauth >= 0); - auth = dirauth; - } - if (auth != mds->get_nodeid()) { - assert(!in->is_auth()); - dout(17) << "sending expire to mds" << auth << " on " << *in << endl; - if (expiremap.count(auth) == 0) - expiremap[auth] = new MCacheExpire(mds->get_nodeid()); - expiremap[auth]->add_inode(in->ino(), in->get_replica_nonce()); - } else { - assert(in->is_auth()); - } - - dout(15) << "trim removing " << *in << endl; - if (in == root) root = 0; - remove_inode(in); - } - - // send expires - for (map::iterator it = expiremap.begin(); - it != expiremap.end(); - it++) { - dout(7) << "sending cache_expire to " << it->first << endl; - mds->send_message_mds(it->second, it->first, MDS_PORT_CACHE); - } - - - return true; -} - - -void MDCache::trim_non_auth() -{ - dout(7) << "trim_non_auth" << endl; - - CDentry *first_auth = 0; - - // trim non-auth items from the lru - while (lru.lru_get_size() > 0) { - CDentry *dn = (CDentry*)lru.lru_expire(); - if (!dn) break; - - if (dn->is_auth()) { - // add back into lru (at the top) - lru.lru_insert_top(dn); - - if (!first_auth) { - first_auth = dn; - } else { - if (first_auth == dn) - break; - } - } else { - // non-auth. expire. - CDir *dir = dn->get_dir(); - assert(dir); - - // unlink the dentry - dout(15) << "trim_non_auth removing " << *dn << endl; - if (!dn->is_null()) - dir->unlink_inode(dn); - dir->remove_dentry(dn); - - // adjust the dir state - CInode *diri = dir->get_inode(); - diri->dir->state_clear(CDIR_STATE_COMPLETE); // dir incomplete! 
- } - } - - // inode expire queue - while (!inode_expire_queue.empty()) { - CInode *in = inode_expire_queue.front(); - inode_expire_queue.pop_front(); - dout(15) << "trim_non_auth removing " << *in << endl; - if (in == root) root = 0; - remove_inode(in); - } -} - - - -class C_MDC_ShutdownCommit : public Context { - MDCache *mdc; -public: - C_MDC_ShutdownCommit(MDCache *mdc) { - this->mdc = mdc; - } - void finish(int r) { - mdc->shutdown_commits--; - } -}; - -class C_MDC_ShutdownCheck : public Context { - MDCache *mdc; -public: - C_MDC_ShutdownCheck(MDCache *m) : mdc(m) {} - void finish(int) { - mdc->shutdown_check(); - } -}; - -void MDCache::shutdown_check() -{ - dout(0) << "shutdown_check at " << g_clock.now() << endl; - - // cache - int o = g_conf.debug_mds; - g_conf.debug_mds = 10; - show_cache(); - g_conf.debug_mds = o; - mds->timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(this)); - - // this - dout(0) << "lru size now " << lru.lru_get_size() << endl; - dout(0) << "log len " << mds->mdlog->get_num_events() << endl; - - - if (exports.size()) - dout(0) << "still have " << exports.size() << " exports" << endl; - - if (mds->filer->is_active()) - dout(0) << "filer still active" << endl; -} - -void MDCache::shutdown_start() -{ - dout(1) << "shutdown_start" << endl; - - if (g_conf.mds_shutdown_check) - mds->timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(this)); -} - - - -bool MDCache::shutdown_pass() -{ - dout(7) << "shutdown_pass" << endl; - //assert(mds->is_shutting_down()); - if (mds->is_out()) { - dout(7) << " already shut down" << endl; - show_cache(); - show_imports(); - return true; - } - - // unhash dirs? - if (!hashdirs.empty()) { - // unhash any of my dirs? 
- for (set::iterator it = hashdirs.begin(); - it != hashdirs.end(); - it++) { - CDir *dir = *it; - if (!dir->is_auth()) continue; - if (dir->is_unhashing()) continue; - migrator->unhash_dir(dir); - } - - dout(7) << "waiting for dirs to unhash" << endl; - return false; - } - - // commit dirs? - if (g_conf.mds_commit_on_shutdown) { - - if (shutdown_commits < 0) { - dout(1) << "shutdown_pass committing all dirty dirs" << endl; - shutdown_commits = 0; - - for (hash_map::iterator it = inode_map.begin(); - it != inode_map.end(); - it++) { - CInode *in = it->second; - - // commit any dirty dir that's ours - if (in->is_dir() && in->dir && in->dir->is_auth() && in->dir->is_dirty()) { - mds->mdstore->commit_dir(in->dir, new C_MDC_ShutdownCommit(this)); - shutdown_commits++; - } - } - } - - // commits? - if (shutdown_commits > 0) { - dout(7) << "shutdown_commits still waiting for " << shutdown_commits << endl; - return false; - } - } - - // flush anything we can from the cache - trim(0); - dout(5) << "lru size now " << lru.lru_get_size() << endl; - - mds->mdlog->trim(0); - - // (wait for) flush log? - if (g_conf.mds_log_flush_on_shutdown) { - if (mds->mdlog->get_non_importmap_events()) { - dout(7) << "waiting for log to flush .. " << mds->mdlog->get_num_events() - << " (" << mds->mdlog->get_non_importmap_events() << ")" << endl; - return false; - } - } - - - // send all imports back to 0. - if (mds->get_nodeid() != 0 && !did_shutdown_exports) { - // flush what i can from the cache first.. - trim(0); - - // export to root - for (set::iterator it = imports.begin(); - it != imports.end(); - ) { - CDir *im = *it; - it++; - if (im->inode->is_root()) continue; - if (im->is_frozen() || im->is_freezing()) continue; - - dout(7) << "sending " << *im << " back to mds0" << endl; - migrator->export_dir(im,0); - } - did_shutdown_exports = true; - } - - - // waiting for imports? (e.g. root?) 
- if (exports.size()) { - dout(7) << "still have " << exports.size() << " exports" << endl; - //show_cache(); - return false; - } - - - // close root? - if (mds->get_nodeid() == 0 && - lru.lru_get_size() == 0 && - root && - root->dir && - root->dir->is_import() && - root->dir->get_num_ref() == 1) { // 1 is the import! - // un-import - dout(7) << "removing root import" << endl; - imports.erase(root->dir); - root->dir->state_clear(CDIR_STATE_IMPORT); - root->dir->put(CDir::PIN_IMPORT); - - if (root->is_pinned_by(CInode::PIN_DIRTY)) { - dout(7) << "clearing root inode dirty flag" << endl; - root->put(CInode::PIN_DIRTY); - } - - trim(0); - } - - // imports? - if (!imports.empty() || migrator->is_exporting()) { - dout(7) << "still have " << imports.size() << " imports, or still exporting" << endl; - show_cache(); - return false; - } - - // cap log? - if (g_conf.mds_log_flush_on_shutdown) { - - if (imports.empty() && exports.empty()) { - // (only do this once!) - if (!mds->mdlog->is_capped()) { - dout(7) << "capping the log" << endl; - mds->mdlog->cap(); - // note that this won't flush right away, so we'll make at least one more pass - } - } - - if (mds->mdlog->get_num_events()) { - dout(7) << "waiting for log to flush (including import_map, now) .. " << mds->mdlog->get_num_events() - << " (" << mds->mdlog->get_non_importmap_events() << ")" << endl; - return false; - } - - if (!did_shutdown_log_cap) { - // flush journal header - dout(7) << "writing header for (now-empty) journal" << endl; - assert(mds->mdlog->empty()); - mds->mdlog->write_head(0); - // NOTE: filer active checker below will block us until this completes. - did_shutdown_log_cap = true; - return false; - } - } - - // filer active? - if (mds->filer->is_active()) { - dout(7) << "filer still active" << endl; - return false; - } - - - // done? 
- if (lru.lru_get_size() > 0) { - dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << endl; - show_cache(); - //dump(); - return false; - } - - // done! - dout(1) << "shutdown done." << endl; - return true; -} - - - - - -CInode *MDCache::create_root_inode() -{ - CInode *root = new CInode(this); - memset(&root->inode, 0, sizeof(inode_t)); - root->inode.ino = 1; - root->inode.hash_seed = 0; // not hashed! - - // make it up (FIXME) - root->inode.mode = 0755 | INODE_MODE_DIR; - root->inode.size = 0; - root->inode.ctime = 0; - root->inode.mtime = g_clock.gettime(); - - root->inode.nlink = 1; - root->inode.layout = g_OSD_MDDirLayout; - - set_root( root ); - add_inode( root ); - - return root; -} - - -int MDCache::open_root(Context *c) -{ - int whoami = mds->get_nodeid(); - - // open root inode - if (whoami == 0) { - // i am root inode - CInode *root = create_root_inode(); - - // root directory too - assert(root->dir == NULL); - root->set_dir( new CDir(root, this, true) ); - root->dir->set_dir_auth( 0 ); // me! 
- root->dir->dir_rep = CDIR_REP_ALL; //NONE; - - // root is sort of technically an import (from a vacuum) - imports.insert( root->dir ); - root->dir->state_set(CDIR_STATE_IMPORT); - root->dir->get(CDir::PIN_IMPORT); - - if (c) { - c->finish(0); - delete c; - } - } else { - // request inode from root mds - if (waiting_for_root.empty()) { - dout(7) << "discovering root" << endl; - - filepath want; - MDiscover *req = new MDiscover(whoami, - 0, - want, - false); // there _is_ no base dir for the root inode - mds->send_message_mds(req, 0, MDS_PORT_CACHE); - } else { - dout(7) << "waiting for root" << endl; - } - - // wait - waiting_for_root.push_back(c); - - } - - return 0; -} - - - - - - - - -// ========= messaging ============== - - -void MDCache::dispatch(Message *m) -{ - switch (m->get_type()) { - - case MSG_MDS_IMPORTMAP: - handle_import_map((MMDSImportMap*)m); - break; - - case MSG_MDS_CACHEREJOIN: - handle_cache_rejoin((MMDSCacheRejoin*)m); - break; - case MSG_MDS_CACHEREJOINACK: - handle_cache_rejoin_ack((MMDSCacheRejoinAck*)m); - break; - - - case MSG_MDS_DISCOVER: - handle_discover((MDiscover*)m); - break; - case MSG_MDS_DISCOVERREPLY: - handle_discover_reply((MDiscoverReply*)m); - break; - - /* - case MSG_MDS_INODEUPDATE: - handle_inode_update((MInodeUpdate*)m); - break; - */ - - case MSG_MDS_INODELINK: - handle_inode_link((MInodeLink*)m); - break; - case MSG_MDS_INODELINKACK: - handle_inode_link_ack((MInodeLinkAck*)m); - break; - - case MSG_MDS_DIRUPDATE: - handle_dir_update((MDirUpdate*)m); - break; - - case MSG_MDS_CACHEEXPIRE: - handle_cache_expire((MCacheExpire*)m); - break; - - - - case MSG_MDS_DENTRYUNLINK: - handle_dentry_unlink((MDentryUnlink*)m); - break; - - - - - - default: - dout(7) << "cache unknown message " << m->get_type() << endl; - assert(0); - break; - } -} - - -/* path_traverse - * - * return values: - * <0 : traverse error (ENOTDIR, ENOENT) - * 0 : success - * >0 : delayed or forwarded - * - * Notes: - * onfinish context is only needed 
if you specify MDS_TRAVERSE_DISCOVER _and_ - * you aren't absolutely certain that the path actually exists. If it doesn't, - * the context is needed to pass a (failure) result code. - */ - -class C_MDC_TraverseDiscover : public Context { - Context *onfinish, *ondelay; - public: - C_MDC_TraverseDiscover(Context *onfinish, Context *ondelay) { - this->ondelay = ondelay; - this->onfinish = onfinish; - } - void finish(int r) { - //dout(10) << "TraverseDiscover r = " << r << endl; - if (r < 0 && onfinish) { // ENOENT on discover, pass back to caller. - onfinish->finish(r); - } else { - ondelay->finish(r); // retry as usual - } - delete onfinish; - delete ondelay; - } -}; - -int MDCache::path_traverse(filepath& origpath, - vector& trace, - bool follow_trailing_symlink, - Message *req, - Context *ondelay, - int onfail, - Context *onfinish, - bool is_client_req) // true if req is MClientRequest .. gross, FIXME -{ - int whoami = mds->get_nodeid(); - set< pair > symlinks_resolved; // keep a list of symlinks we touch to avoid loops - - bool noperm = false; - if (onfail == MDS_TRAVERSE_DISCOVER || - onfail == MDS_TRAVERSE_DISCOVERXLOCK) noperm = true; - - // root - CInode *cur = get_root(); - if (cur == NULL) { - dout(7) << "traverse: i don't have root" << endl; - open_root(ondelay); - if (onfinish) delete onfinish; - return 1; - } - - // start trace - trace.clear(); - - // make our own copy, since we'll modify when we hit symlinks - filepath path = origpath; - - unsigned depth = 0; - while (depth < path.depth()) { - dout(12) << "traverse: path seg depth " << depth << " = " << path[depth] << endl; - - // ENOTDIR? - if (!cur->is_dir()) { - dout(7) << "traverse: " << *cur << " not a dir " << endl; - delete ondelay; - if (onfinish) { - onfinish->finish(-ENOTDIR); - delete onfinish; - } - return -ENOTDIR; - } - - // open dir - if (!cur->dir) { - if (cur->dir_is_auth()) { - // parent dir frozen_dir? 
- if (cur->is_frozen_dir()) { - dout(7) << "traverse: " << *cur->get_parent_dir() << " is frozen_dir, waiting" << endl; - cur->get_parent_dir()->add_waiter(CDIR_WAIT_UNFREEZE, ondelay); - if (onfinish) delete onfinish; - return 1; - } - - cur->get_or_open_dir(this); - assert(cur->dir); - } else { - // discover dir from/via inode auth - assert(!cur->is_auth()); - if (cur->waiting_for(CINODE_WAIT_DIR)) { - dout(10) << "traverse: need dir for " << *cur << ", already doing discover" << endl; - } else { - filepath want = path.postfixpath(depth); - dout(10) << "traverse: need dir for " << *cur << ", doing discover, want " << want.get_path() << endl; - mds->send_message_mds(new MDiscover(mds->get_nodeid(), - cur->ino(), - want, - true), // need this dir too - cur->authority(), MDS_PORT_CACHE); - } - cur->add_waiter(CINODE_WAIT_DIR, ondelay); - if (onfinish) delete onfinish; - return 1; - } - } - - // frozen? - /* - if (cur->dir->is_frozen()) { - // doh! - // FIXME: traverse is allowed? - dout(7) << "traverse: " << *cur->dir << " is frozen, waiting" << endl; - cur->dir->add_waiter(CDIR_WAIT_UNFREEZE, ondelay); - if (onfinish) delete onfinish; - return 1; - } - */ - - // must read directory hard data (permissions, x bit) to traverse - if (!noperm && !mds->locker->inode_hard_read_try(cur, ondelay)) { - if (onfinish) delete onfinish; - return 1; - } - - // check permissions? - // XXX - - // ..? - if (path[depth] == "..") { - trace.pop_back(); - depth++; - cur = cur->get_parent_inode(); - dout(10) << "traverse: following .. back to " << *cur << endl; - continue; - } - - - // dentry - CDentry *dn = cur->dir->lookup(path[depth]); - - // null and last_bit and xlocked by me? - if (dn && dn->is_null() && - dn->is_xlockedbyme(req) && - depth == path.depth()-1) { - dout(10) << "traverse: hit (my) xlocked dentry at tail of traverse, succeeding" << endl; - trace.push_back(dn); - break; // done! - } - - if (dn && !dn->is_null()) { - // dentry exists. xlocked? 
- if (!noperm && dn->is_xlockedbyother(req)) { - dout(10) << "traverse: xlocked dentry at " << *dn << endl; - cur->dir->add_waiter(CDIR_WAIT_DNREAD, - path[depth], - ondelay); - if (onfinish) delete onfinish; - return 1; - } - - // do we have inode? - if (!dn->inode) { - assert(dn->is_remote()); - // do i have it? - CInode *in = get_inode(dn->get_remote_ino()); - if (in) { - dout(7) << "linking in remote in " << *in << endl; - dn->link_remote(in); - } else { - dout(7) << "remote link to " << dn->get_remote_ino() << ", which i don't have" << endl; - open_remote_ino(dn->get_remote_ino(), req, - ondelay); - return 1; - } - } - - // symlink? - if (dn->inode->is_symlink() && - (follow_trailing_symlink || depth < path.depth()-1)) { - // symlink, resolve! - filepath sym = dn->inode->symlink; - dout(10) << "traverse: hit symlink " << *dn->inode << " to " << sym << endl; - - // break up path components - // /head/symlink/tail - filepath head = path.prefixpath(depth); - filepath tail = path.postfixpath(depth+1); - dout(10) << "traverse: path head = " << head << endl; - dout(10) << "traverse: path tail = " << tail << endl; - - if (symlinks_resolved.count(pair(dn->inode, tail.get_path()))) { - dout(10) << "already hit this symlink, bailing to avoid the loop" << endl; - return -ELOOP; - } - symlinks_resolved.insert(pair(dn->inode, tail.get_path())); - - // start at root? - if (dn->inode->symlink[0] == '/') { - // absolute - trace.clear(); - depth = 0; - path = tail; - dout(10) << "traverse: absolute symlink, path now " << path << " depth " << depth << endl; - } else { - // relative - path = head; - path.append(sym); - path.append(tail); - dout(10) << "traverse: relative symlink, path now " << path << " depth " << depth << endl; - } - continue; - } else { - // keep going. - - // forwarder wants replicas? 
- if (is_client_req && ((MClientRequest*)req)->get_mds_wants_replica_in_dirino()) { - dout(30) << "traverse: REP is here, " << ((MClientRequest*)req)->get_mds_wants_replica_in_dirino() << " vs " << cur->dir->ino() << endl; - - if (((MClientRequest*)req)->get_mds_wants_replica_in_dirino() == cur->dir->ino() && - cur->dir->is_auth() && - cur->dir->is_rep() && - cur->dir->is_replica(req->get_source().num()) && - dn->get_inode()->is_auth() - ) { - assert(req->get_source().is_mds()); - int from = req->get_source().num(); - - if (dn->get_inode()->is_replica(from)) { - dout(15) << "traverse: REP would replicate to mds" << from << ", but already cached_by " - << req->get_source() << " dn " << *dn << endl; - } else { - dout(10) << "traverse: REP replicating to " << req->get_source() << " dn " << *dn << endl; - MDiscoverReply *reply = new MDiscoverReply(cur->dir->ino()); - reply->add_dentry( dn->replicate_to( from ) ); - reply->add_inode( dn->inode->replicate_to( from ) ); - mds->send_message_mds(reply, req->get_source().num(), MDS_PORT_CACHE); - } - } - } - - trace.push_back(dn); - cur = dn->inode; - touch_inode(cur); - depth++; - continue; - } - } - - // MISS. don't have it. - - int dauth = cur->dir->dentry_authority( path[depth] ); - dout(12) << "traverse: miss on dentry " << path[depth] << " dauth " << dauth << " in " << *cur->dir << endl; - - - if (dauth == whoami) { - // dentry is mine. - if (cur->dir->is_complete()) { - // file not found - delete ondelay; - if (onfinish) { - onfinish->finish(-ENOENT); - delete onfinish; - } - return -ENOENT; - } else { - - //wrong? - //if (onfail == MDS_TRAVERSE_DISCOVER) - // return -1; - - // directory isn't complete; reload - dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << endl; - touch_inode(cur); - mds->mdstore->fetch_dir(cur->dir, ondelay); - - if (mds->logger) mds->logger->inc("cmiss"); - - if (onfinish) delete onfinish; - return 1; - } - } else { - // dentry is not mine. 
- - /* no, let's let auth handle the discovery/replication .. - if (onfail == MDS_TRAVERSE_FORWARD && - onfinish == 0 && // no funnyness - cur->dir->is_rep()) { - dout(5) << "trying to discover in popular dir " << *cur->dir << endl; - onfail = MDS_TRAVERSE_DISCOVER; - } - */ - - if ((onfail == MDS_TRAVERSE_DISCOVER || - onfail == MDS_TRAVERSE_DISCOVERXLOCK)) { - // discover - - filepath want = path.postfixpath(depth); - if (cur->dir->waiting_for(CDIR_WAIT_DENTRY, path[depth])) { - dout(7) << "traverse: already waiting for discover on " << *cur << " for " << want.get_path() << " to mds" << dauth << endl; - } else { - dout(7) << "traverse: discover on " << *cur << " for " << want.get_path() << " to mds" << dauth << endl; - - touch_inode(cur); - - mds->send_message_mds(new MDiscover(mds->get_nodeid(), - cur->ino(), - want, - false), - dauth, MDS_PORT_CACHE); - if (mds->logger) mds->logger->inc("dis"); - } - - // delay processing of current request. - // delay finish vs ondelay until result of traverse, so that ENOENT can be - // passed to onfinish if necessary - cur->dir->add_waiter(CDIR_WAIT_DENTRY, - path[depth], - new C_MDC_TraverseDiscover(onfinish, ondelay)); - - if (mds->logger) mds->logger->inc("cmiss"); - return 1; - } - if (onfail == MDS_TRAVERSE_FORWARD) { - // forward - dout(7) << "traverse: not auth for " << path << " at " << path[depth] << ", fwd to mds" << dauth << endl; - - if (is_client_req && cur->dir->is_rep()) { - dout(15) << "traverse: REP fw to mds" << dauth << ", requesting rep under " << *cur->dir << " req " << *(MClientRequest*)req << endl; - ((MClientRequest*)req)->set_mds_wants_replica_in_dirino(cur->dir->ino()); - req->clear_payload(); // reencode! 
- } - - mds->send_message_mds(req, dauth, req->get_dest_port()); - //show_imports(); - - if (mds->logger) mds->logger->inc("cfw"); - if (onfinish) delete onfinish; - delete ondelay; - return 2; - } - if (onfail == MDS_TRAVERSE_FAIL) { - delete ondelay; - if (onfinish) { - onfinish->finish(-ENOENT); // -ENOENT, but only because i'm not the authority! - delete onfinish; - } - return -ENOENT; // not necessarily exactly true.... - } - } - - assert(0); // i shouldn't get here - } - - // success. - delete ondelay; - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } - return 0; -} - - - -void MDCache::open_remote_dir(CInode *diri, - Context *fin) -{ - dout(10) << "open_remote_dir on " << *diri << endl; - - assert(diri->is_dir()); - assert(!diri->dir_is_auth()); - assert(!diri->is_auth()); - assert(diri->dir == 0); - - filepath want; // no dentries, i just want the dir open - mds->send_message_mds(new MDiscover(mds->get_nodeid(), - diri->ino(), - want, - true), // need the dir open - diri->authority(), MDS_PORT_CACHE); - - diri->add_waiter(CINODE_WAIT_DIR, fin); -} - - - -class C_MDC_OpenRemoteInoLookup : public Context { - MDCache *mdc; - inodeno_t ino; - Message *req; - Context *onfinish; -public: - vector anchortrace; - C_MDC_OpenRemoteInoLookup(MDCache *mdc, inodeno_t ino, Message *req, Context *onfinish) { - this->mdc = mdc; - this->ino = ino; - this->req = req; - this->onfinish = onfinish; - } - void finish(int r) { - assert(r == 0); - if (r == 0) - mdc->open_remote_ino_2(ino, req, anchortrace, onfinish); - else { - onfinish->finish(r); - delete onfinish; - } - } -}; - -void MDCache::open_remote_ino(inodeno_t ino, - Message *req, - Context *onfinish) -{ - dout(7) << "open_remote_ino on " << ino << endl; - - C_MDC_OpenRemoteInoLookup *c = new C_MDC_OpenRemoteInoLookup(this, ino, req, onfinish); - mds->anchorclient->lookup(ino, c->anchortrace, c); -} - -void MDCache::open_remote_ino_2(inodeno_t ino, - Message *req, - vector& anchortrace, - Context 
*onfinish) -{ - dout(7) << "open_remote_ino_2 on " << ino << ", trace depth is " << anchortrace.size() << endl; - - // construct path - filepath path; - for (unsigned i=0; iref_dn); - - dout(7) << " path is " << path << endl; - - vector trace; - int r = path_traverse(path, trace, false, - req, - onfinish, // delay actually - MDS_TRAVERSE_DISCOVER); - if (r > 0) return; - - onfinish->finish(r); - delete onfinish; -} - - - - -// path pins - -bool MDCache::path_pin(vector& trace, - Message *m, - Context *c) -{ - // verify everything is pinnable - for (vector::iterator it = trace.begin(); - it != trace.end(); - it++) { - CDentry *dn = *it; - if (!dn->is_pinnable(m)) { - // wait - if (c) { - dout(10) << "path_pin can't pin " << *dn << ", waiting" << endl; - dn->dir->add_waiter(CDIR_WAIT_DNPINNABLE, - dn->name, - c); - } else { - dout(10) << "path_pin can't pin, no waiter, failing." << endl; - } - return false; - } - } - - // pin! - for (vector::iterator it = trace.begin(); - it != trace.end(); - it++) { - (*it)->pin(m); - dout(11) << "path_pinned " << *(*it) << endl; - } - - delete c; - return true; -} - - -void MDCache::path_unpin(vector& trace, - Message *m) -{ - for (vector::iterator it = trace.begin(); - it != trace.end(); - it++) { - CDentry *dn = *it; - dn->unpin(m); - dout(11) << "path_unpinned " << *dn << endl; - - // did we completely unpin a waiter? - if (dn->lockstate == DN_LOCK_UNPINNING && !dn->get_num_ref()) { - // return state to sync, in case the unpinner flails - dn->lockstate = DN_LOCK_SYNC; - - // run finisher right now to give them a fair shot. 
- dn->dir->finish_waiting(CDIR_WAIT_DNUNPINNED, dn->name); - } - } -} - - -void MDCache::make_trace(vector& trace, CInode *in) -{ - CInode *parent = in->get_parent_inode(); - if (parent) { - make_trace(trace, parent); - - CDentry *dn = in->get_parent_dn(); - dout(15) << "make_trace adding " << *dn << endl; - trace.push_back(dn); - } -} - - -bool MDCache::request_start(Message *req, - CInode *ref, - vector& trace) -{ - assert(active_requests.count(req) == 0); - - // pin path - if (trace.size()) { - if (!path_pin(trace, req, new C_MDS_RetryMessage(mds,req))) return false; - } - - dout(7) << "request_start " << *req << endl; - - // add to map - active_requests[req].ref = ref; - if (trace.size()) active_requests[req].traces[trace[trace.size()-1]] = trace; - - // request pins - request_pin_inode(req, ref); - - if (mds->logger) mds->logger->inc("req"); - - return true; -} - - -void MDCache::request_pin_inode(Message *req, CInode *in) -{ - if (active_requests[req].request_pins.count(in) == 0) { - in->request_pin_get(); - active_requests[req].request_pins.insert(in); - } -} - -void MDCache::request_pin_dir(Message *req, CDir *dir) -{ - if (active_requests[req].request_dir_pins.count(dir) == 0) { - dir->request_pin_get(); - active_requests[req].request_dir_pins.insert(dir); - } -} - - -void MDCache::request_cleanup(Message *req) -{ - assert(active_requests.count(req) == 1); - - // leftover xlocks? - if (active_requests[req].xlocks.size()) { - set dns = active_requests[req].xlocks; - - for (set::iterator it = dns.begin(); - it != dns.end(); - it++) { - CDentry *dn = *it; - - dout(7) << "request_cleanup leftover xlock " << *dn << endl; - - mds->locker->dentry_xlock_finish(dn); - - // queue finishers - dn->dir->take_waiting(CDIR_WAIT_ANY, dn->name, mds->finished_queue); - - // remove clean, null dentry? 
(from a failed rename or whatever) - if (dn->is_null() && dn->is_sync() && !dn->is_dirty()) { - dn->dir->remove_dentry(dn); - } - } - - assert(active_requests[req].xlocks.empty()); // we just finished finished them - } - - // foreign xlocks? - if (active_requests[req].foreign_xlocks.size()) { - set dns = active_requests[req].foreign_xlocks; - active_requests[req].foreign_xlocks.clear(); - - for (set::iterator it = dns.begin(); - it != dns.end(); - it++) { - CDentry *dn = *it; - - dout(7) << "request_cleanup sending unxlock for foreign xlock on " << *dn << endl; - assert(dn->is_xlocked()); - int dauth = dn->dir->dentry_authority(dn->name); - MLock *m = new MLock(LOCK_AC_UNXLOCK, mds->get_nodeid()); - m->set_dn(dn->dir->ino(), dn->name); - mds->send_message_mds(m, dauth, MDS_PORT_CACHE); - } - } - - // unpin paths - for (map< CDentry*, vector >::iterator it = active_requests[req].traces.begin(); - it != active_requests[req].traces.end(); - it++) { - path_unpin(it->second, req); - } - - // request pins - for (set::iterator it = active_requests[req].request_pins.begin(); - it != active_requests[req].request_pins.end(); - it++) { - (*it)->request_pin_put(); - } - for (set::iterator it = active_requests[req].request_dir_pins.begin(); - it != active_requests[req].request_dir_pins.end(); - it++) { - (*it)->request_pin_put(); - } - - // remove from map - active_requests.erase(req); - - - // log some stats ***** - if (mds->logger) { - mds->logger->set("c", lru.lru_get_size()); - mds->logger->set("cpin", lru.lru_get_num_pinned()); - mds->logger->set("ctop", lru.lru_get_top()); - mds->logger->set("cbot", lru.lru_get_bot()); - mds->logger->set("cptail", lru.lru_get_pintail()); - //mds->logger->set("buf",buffer_total_alloc); - } - - if (g_conf.log_pins) { - // pin - /* -for (int i=0; ilogger2) mds->logger2->set(cinode_pin_names[i], - cinode_pins[i]); - } - */ - /* - for (map::iterator it = cdir_pins.begin(); - it != cdir_pins.end(); - it++) { - //string s = "D"; - //s += 
cdir_pin_names[it->first]; - if (mds->logger2) mds->logger2->set(//s, - cdir_pin_names[it->first], - it->second); - } - */ - } - -} - -void MDCache::request_finish(Message *req) -{ - dout(7) << "request_finish " << *req << endl; - request_cleanup(req); - delete req; // delete req - - if (mds->logger) mds->logger->inc("reply"); - - - //dump(); -} - - -void MDCache::request_forward(Message *req, int who, int port) -{ - if (!port) port = MDS_PORT_SERVER; - - dout(7) << "request_forward to " << who << " req " << *req << endl; - request_cleanup(req); - mds->send_message_mds(req, who, port); - - if (mds->logger) mds->logger->inc("fw"); -} - - - -// ANCHORS - -class C_MDC_AnchorInode : public Context { - CInode *in; - -public: - C_MDC_AnchorInode(CInode *in) { - this->in = in; - } - void finish(int r) { - if (r == 0) { - assert(in->inode.anchored == false); - in->inode.anchored = true; - - in->state_clear(CInode::STATE_ANCHORING); - in->put(CInode::PIN_ANCHORING); - - in->_mark_dirty(); // fixme - } - - // trigger - in->finish_waiting(CINODE_WAIT_ANCHORED, r); - } -}; - -void MDCache::anchor_inode(CInode *in, Context *onfinish) -{ - assert(in->is_auth()); - - // already anchoring? 
- if (in->state_test(CInode::STATE_ANCHORING)) { - dout(7) << "anchor_inode already anchoring " << *in << endl; - - // wait - in->add_waiter(CINODE_WAIT_ANCHORED, - onfinish); - - } else { - dout(7) << "anchor_inode anchoring " << *in << endl; - - // auth: do it - in->state_set(CInode::STATE_ANCHORING); - in->get(CInode::PIN_ANCHORING); - - // wait - in->add_waiter(CINODE_WAIT_ANCHORED, - onfinish); - - // make trace - vector trace; - in->make_anchor_trace(trace); - - // do it - mds->anchorclient->create(in->ino(), trace, - new C_MDC_AnchorInode( in )); - } -} - - -void MDCache::handle_inode_link(MInodeLink *m) -{ - CInode *in = get_inode(m->get_ino()); - assert(in); - - if (!in->is_auth()) { - assert(in->is_proxy()); - dout(7) << "handle_inode_link not auth for " << *in << ", fw to auth" << endl; - mds->send_message_mds(m, in->authority(), MDS_PORT_CACHE); - return; - } - - dout(7) << "handle_inode_link on " << *in << endl; - - if (!in->is_anchored()) { - assert(in->inode.nlink == 1); - dout(7) << "needs anchor, nlink=" << in->inode.nlink << ", creating anchor" << endl; - - anchor_inode(in, - new C_MDS_RetryMessage(mds, m)); - return; - } - - in->inode.nlink++; - in->_mark_dirty(); // fixme - - // reply - dout(7) << " nlink++, now " << in->inode.nlink++ << endl; - - mds->send_message_mds(new MInodeLinkAck(m->get_ino(), true), m->get_from(), MDS_PORT_CACHE); - delete m; -} - - -void MDCache::handle_inode_link_ack(MInodeLinkAck *m) -{ - CInode *in = get_inode(m->get_ino()); - assert(in); - - dout(7) << "handle_inode_link_ack success = " << m->is_success() << " on " << *in << endl; - in->finish_waiting(CINODE_WAIT_LINK, - m->is_success() ? 1:-1); -} - - - -// REPLICAS - - -void MDCache::handle_discover(MDiscover *dis) -{ - int whoami = mds->get_nodeid(); - - // from me to me? - if (dis->get_asker() == whoami) { - dout(7) << "discover for " << dis->get_want().get_path() << " bounced back to me, dropping." 
<< endl; - delete dis; - return; - } - - CInode *cur = 0; - MDiscoverReply *reply = 0; - //filepath fullpath; - - // get started. - if (dis->get_base_ino() == 0) { - // wants root - dout(7) << "discover from mds" << dis->get_asker() << " wants root + " << dis->get_want().get_path() << endl; - - assert(mds->get_nodeid() == 0); - assert(root->is_auth()); - - //fullpath = dis->get_want(); - - - // add root - reply = new MDiscoverReply(0); - reply->add_inode( root->replicate_to( dis->get_asker() ) ); - dout(10) << "added root " << *root << endl; - - cur = root; - - } else { - // there's a base inode - cur = get_inode(dis->get_base_ino()); - assert(cur); - - if (dis->wants_base_dir()) { - dout(7) << "discover from mds" << dis->get_asker() << " has " << *cur << " wants dir+" << dis->get_want().get_path() << endl; - } else { - dout(7) << "discover from mds" << dis->get_asker() << " has " << *cur->dir << " wants " << dis->get_want().get_path() << endl; - } - - assert(cur->is_dir()); - - // crazyness? - if (!cur->dir && !cur->is_auth()) { - int iauth = cur->authority(); - dout(7) << "no dir and not inode auth; fwd to auth " << iauth << endl; - mds->send_message_mds( dis, iauth, MDS_PORT_CACHE); - return; - } - - // frozen_dir? 
- if (!cur->dir && cur->is_frozen_dir()) { - dout(7) << "is frozen_dir, waiting" << endl; - cur->get_parent_dir()->add_waiter(CDIR_WAIT_UNFREEZE, - new C_MDS_RetryMessage(mds, dis)); - return; - } - - if (!cur->dir) - cur->get_or_open_dir(this); - assert(cur->dir); - - dout(10) << "dir is " << *cur->dir << endl; - - // create reply - reply = new MDiscoverReply(cur->ino()); - } - - assert(reply); - assert(cur); - - /* - // first traverse and make sure we won't have to do any waiting - dout(10) << "traversing full discover path = " << fullpath << endl; - vector trav; - int r = path_traverse(fullpath, trav, dis, MDS_TRAVERSE_FAIL); - if (r > 0) - return; // fw or delay - dout(10) << "traverse finish w/o blocking, continuing" << endl; - // ok, now we know we won't block on dentry locks or readdir. - */ - - - // add content - // do some fidgeting to include a dir if they asked for the base dir, or just root. - for (unsigned i = 0; i < dis->get_want().depth() || dis->get_want().depth() == 0; i++) { - // add dir - if (reply->is_empty() && !dis->wants_base_dir()) { - dout(7) << "they don't want the base dir" << endl; - } else { - // is it actaully a dir at all? - if (!cur->is_dir()) { - dout(7) << "not a dir " << *cur << endl; - reply->set_flag_error_dir(); - break; - } - - // add dir - if (!cur->dir_is_auth()) { - dout(7) << *cur << " dir auth is someone else, i'm done" << endl; - break; - } - - // did we hit a frozen_dir? 
- if (!cur->dir && cur->is_frozen_dir()) { - dout(7) << *cur << " is frozen_dir, stopping" << endl; - break; - } - - if (!cur->dir) cur->get_or_open_dir(this); - - reply->add_dir( new CDirDiscover( cur->dir, - cur->dir->add_replica( dis->get_asker() ) ) ); - dout(7) << "added dir " << *cur->dir << endl; - } - if (dis->get_want().depth() == 0) break; - - // lookup dentry - int dentry_auth = cur->dir->dentry_authority( dis->get_dentry(i) ); - if (dentry_auth != mds->get_nodeid()) { - dout(7) << *cur->dir << "dentry " << dis->get_dentry(i) << " auth " << dentry_auth << ", i'm done." << endl; - break; // that's it for us! - } - - // get inode - CDentry *dn = cur->dir->lookup( dis->get_dentry(i) ); - - /* - if (dn && !dn->can_read()) { // xlocked? - dout(7) << "waiting on " << *dn << endl; - cur->dir->add_waiter(CDIR_WAIT_DNREAD, - dn->name, - new C_MDS_RetryMessage(mds, dis)); - return; - } - */ - - if (dn) { - if (!dn->inode && dn->is_sync()) { - dout(7) << "mds" << whoami << " dentry " << dis->get_dentry(i) << " null in " << *cur->dir << ", returning error" << endl; - reply->set_flag_error_dn( dis->get_dentry(i) ); - break; // don't replicate null but non-locked dentries. - } - - reply->add_dentry( dn->replicate_to( dis->get_asker() ) ); - dout(7) << "added dentry " << *dn << endl; - - if (!dn->inode) break; // we're done. - } - - if (dn && dn->inode) { - CInode *next = dn->inode; - assert(next->is_auth()); - - // add inode - //int nonce = next->cached_by_add(dis->get_asker()); - reply->add_inode( next->replicate_to( dis->get_asker() ) ); - dout(7) << "added inode " << *next << endl;// " nonce=" << nonce<< endl; - - // descend - cur = next; - } else { - // don't have inode? 
- if (cur->dir->is_complete()) { - // set error flag in reply - dout(7) << "mds" << whoami << " dentry " << dis->get_dentry(i) << " not found in " << *cur->dir << ", returning error" << endl; - reply->set_flag_error_dn( dis->get_dentry(i) ); - break; - } else { - // readdir - dout(7) << "mds" << whoami << " incomplete dir contents for " << *cur->dir << ", fetching" << endl; - - //mds->mdstore->fetch_dir(cur->dir, NULL); //new C_MDS_RetryMessage(mds, dis)); - //break; // send what we have so far - - mds->mdstore->fetch_dir(cur->dir, new C_MDS_RetryMessage(mds, dis)); - return; - } - } - } - - // how did we do. - if (reply->is_empty()) { - - // discard empty reply - delete reply; - - if ((cur->is_auth() || cur->is_proxy() || cur->dir->is_proxy()) && - !cur->dir->is_auth()) { - // fwd to dir auth - int dirauth = cur->dir->authority(); - if (dirauth == dis->get_asker()) { - dout(7) << "from (new?) dir auth, dropping (obsolete) discover on floor." << endl; // XXX FIXME is this right? - //assert(dis->get_asker() == dis->get_source()); //might be a weird other loop. either way, asker has it. - delete dis; - } else { - dout(7) << "fwd to dir auth " << dirauth << endl; - mds->send_message_mds( dis, dirauth, MDS_PORT_CACHE ); - } - return; - } - - dout(7) << "i'm not auth or proxy, dropping (this empty reply). i bet i just exported." << endl; - //assert(0); - - } else { - // send back to asker - dout(7) << "sending result back to asker mds" << dis->get_asker() << endl; - mds->send_message_mds(reply, dis->get_asker(), MDS_PORT_CACHE); - } - - // done. - delete dis; -} - - -void MDCache::handle_discover_reply(MDiscoverReply *m) -{ - // starting point - CInode *cur; - list finished, error; - - if (m->has_root()) { - // nowhere! 
- dout(7) << "discover_reply root + " << m->get_path() << " " << m->get_num_inodes() << " inodes" << endl; - assert(!root); - assert(m->get_base_ino() == 0); - assert(!m->has_base_dentry()); - assert(!m->has_base_dir()); - - // add in root - cur = new CInode(this, false); - - m->get_inode(0).update_inode(cur); - - // root - set_root( cur ); - add_inode( cur ); - dout(7) << " got root: " << *cur << endl; - - // take waiters - finished.swap(waiting_for_root); - } else { - // grab inode - cur = get_inode(m->get_base_ino()); - - if (!cur) { - dout(7) << "discover_reply don't have base ino " << m->get_base_ino() << ", dropping" << endl; - delete m; - return; - } - - dout(7) << "discover_reply " << *cur << " + " << m->get_path() << ", have " << m->get_num_inodes() << " inodes" << endl; - } - - // fyi - if (m->is_flag_error_dir()) dout(7) << " flag error, dir" << endl; - if (m->is_flag_error_dn()) dout(7) << " flag error, dentry = " << m->get_error_dentry() << endl; - dout(10) << "depth is " << m->get_depth() << ", has_root = " << m->has_root() << endl; - - // loop over discover results. - // indexese follow each ([[dir] dentry] inode) - // can start, end with any type. - - for (int i=m->has_root(); iget_depth(); i++) { - dout(10) << "discover_reply i=" << i << " cur " << *cur << endl; - - // dir - if ((i > 0) || - (i == 0 && m->has_base_dir())) { - if (cur->dir) { - // had it - /* this is strange, but it happens when: - we discover multiple dentries under a dir. - bc, no flag to indicate a dir discover is underway, (as there is w/ a dentry one). - this is actually good, since (dir aside) they're asking for different information. 
- */ - dout(7) << "had " << *cur->dir; - m->get_dir(i).update_dir(cur->dir); - dout2(7) << ", now " << *cur->dir << endl; - } else { - // add it (_replica_) - cur->set_dir( new CDir(cur, this, false) ); - m->get_dir(i).update_dir(cur->dir); - dout(7) << "added " << *cur->dir << " nonce " << cur->dir->replica_nonce << endl; - - // get waiters - cur->take_waiting(CINODE_WAIT_DIR, finished); - } - } - - // dentry error? - if (i == m->get_depth()-1 && - m->is_flag_error_dn()) { - // error! - assert(cur->is_dir()); - if (cur->dir) { - dout(7) << " flag_error on dentry " << m->get_error_dentry() << ", triggering dentry?" << endl; - cur->dir->take_waiting(CDIR_WAIT_DENTRY, - m->get_error_dentry(), - error); - } else { - dout(7) << " flag_error on dentry " << m->get_error_dentry() << ", triggering dir?" << endl; - cur->take_waiting(CINODE_WAIT_DIR, error); - } - break; - } - - if (i >= m->get_num_dentries()) break; - - // dentry - dout(7) << "i = " << i << " dentry is " << m->get_dentry(i).get_dname() << endl; - - CDentry *dn = 0; - if (i > 0 || - m->has_base_dentry()) { - dn = cur->dir->lookup( m->get_dentry(i).get_dname() ); - - if (dn) { - dout(7) << "had " << *dn << endl; - dn->replica_nonce = m->get_dentry(i).get_nonce(); // fix nonce. 
- } else { - dn = cur->dir->add_dentry( m->get_dentry(i).get_dname(), 0, false ); - m->get_dentry(i).update_dentry(dn); - dout(7) << "added " << *dn << endl; - } - - cur->dir->take_waiting(CDIR_WAIT_DENTRY, - m->get_dentry(i).get_dname(), - finished); - } - - if (i >= m->get_num_inodes()) break; - - // inode - dout(7) << "i = " << i << " ino is " << m->get_ino(i) << endl; - CInode *in = get_inode( m->get_inode(i).get_ino() ); - assert(dn); - - if (in) { - dout(7) << "had " << *in << endl; - - // fix nonce - dout(7) << " my nonce is " << in->replica_nonce << ", taking from discover, which has " << m->get_inode(i).get_replica_nonce() << endl; - in->replica_nonce = m->get_inode(i).get_replica_nonce(); - - if (dn && in != dn->inode) { - dout(7) << " but it's not linked via dentry " << *dn << endl; - // link - if (dn->inode) { - dout(7) << "dentry WAS linked to " << *dn->inode << endl; - assert(0); // WTF. - } - dn->dir->link_inode(dn, in); - } - } - else { - assert(dn->inode == 0); // better not be something else linked to this dentry... - - // didn't have it. - in = new CInode(this, false); - - m->get_inode(i).update_inode(in); - - // link in - add_inode( in ); - dn->dir->link_inode(dn, in); - - dout(7) << "added " << *in << " nonce " << in->replica_nonce << endl; - } - - // onward! - cur = in; - } - - // dir error at the end there? 
- if (m->is_flag_error_dir()) { - dout(7) << " flag_error on dir " << *cur << endl; - assert(!cur->is_dir()); - cur->take_waiting(CINODE_WAIT_DIR, error); - } - - // finish errors directly - finish_contexts(error, -ENOENT); - - mds->queue_finished(finished); - - // done - delete m; -} - - - - - - - - -/* -int MDCache::send_inode_updates(CInode *in) -{ - assert(in->is_auth()); - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - dout(7) << "sending inode_update on " << *in << " to " << *it << endl; - assert(*it != mds->get_nodeid()); - mds->send_message_mds(new MInodeUpdate(in, in->get_cached_by_nonce(*it)), *it, MDS_PORT_CACHE); - } - - return 0; -} - - -void MDCache::handle_inode_update(MInodeUpdate *m) -{ - inodeno_t ino = m->get_ino(); - CInode *in = get_inode(m->get_ino()); - if (!in) { - //dout(7) << "inode_update on " << m->get_ino() << ", don't have it, ignoring" << endl; - dout(7) << "inode_update on " << m->get_ino() << ", don't have it, sending expire" << endl; - MCacheExpire *expire = new MCacheExpire(mds->get_nodeid()); - expire->add_inode(m->get_ino(), m->get_nonce()); - mds->send_message_mds(expire, m->get_source().num(), MDS_PORT_CACHE); - goto out; - } - - if (in->is_auth()) { - dout(7) << "inode_update on " << *in << ", but i'm the authority!" << endl; - assert(0); // this should never happen - } - - dout(7) << "inode_update on " << *in << endl; - - // update! NOTE dir_auth is unaffected by this. 
- in->decode_basic_state(m->get_payload()); - - out: - // done - delete m; -} -*/ - - - -void MDCache::handle_cache_expire(MCacheExpire *m) -{ - int from = m->get_from(); - int source = m->get_source().num(); - map proxymap; - - if (m->get_from() == source) { - dout(7) << "cache_expire from mds" << from << endl; - } else { - dout(7) << "cache_expire from mds" << from << " via " << source << endl; - } - - // inodes - for (map::iterator it = m->get_inodes().begin(); - it != m->get_inodes().end(); - it++) { - CInode *in = get_inode(it->first); - int nonce = it->second; - - if (!in) { - dout(0) << "inode expire on " << it->first << " from " << from << ", don't have it" << endl; - assert(in); // i should be authority, or proxy .. and pinned - } - if (!in->is_auth()) { - int newauth = in->authority(); - dout(7) << "proxy inode expire on " << *in << " to " << newauth << endl; - assert(newauth >= 0); - if (!in->state_test(CInode::STATE_PROXY)) dout(0) << "missing proxy bit on " << *in << endl; - assert(in->state_test(CInode::STATE_PROXY)); - if (proxymap.count(newauth) == 0) proxymap[newauth] = new MCacheExpire(from); - proxymap[newauth]->add_inode(it->first, it->second); - continue; - } - - // check nonce - if (from == mds->get_nodeid()) { - // my cache_expire, and the export_dir giving auth back to me crossed paths! - // we can ignore this. no danger of confusion since the two parties are both me. - dout(7) << "inode expire on " << *in << " from mds" << from << " .. ME! ignoring." << endl; - } - else if (nonce == in->get_replica_nonce(from)) { - // remove from our cached_by - dout(7) << "inode expire on " << *in << " from mds" << from << " cached_by was " << in->get_replicas() << endl; - inode_remove_replica(in, from); - - } - else { - // this is an old nonce, ignore expire. 
- dout(7) << "inode expire on " << *in << " from mds" << from - << " with old nonce " << nonce << " (current " << in->get_replica_nonce(from) << "), dropping" - << endl; - assert(in->get_replica_nonce(from) > nonce); - } - } - - // dirs - for (map::iterator it = m->get_dirs().begin(); - it != m->get_dirs().end(); - it++) { - CInode *diri = get_inode(it->first); - assert(diri); - CDir *dir = diri->dir; - int nonce = it->second; - - if (!dir) { - dout(0) << "dir expire on " << it->first << " from " << from << ", don't have it" << endl; - assert(dir); // i should be authority, or proxy ... and pinned - } - if (!dir->is_auth()) { - int newauth = dir->authority(); - dout(7) << "proxy dir expire on " << *dir << " to " << newauth << endl; - if (!dir->is_proxy()) dout(0) << "nonproxy dir expire? " << *dir << " .. auth is " << newauth << " .. expire is from " << from << endl; - assert(dir->is_proxy()); - assert(newauth >= 0); - assert(dir->state_test(CDIR_STATE_PROXY)); - if (proxymap.count(newauth) == 0) proxymap[newauth] = new MCacheExpire(from); - proxymap[newauth]->add_dir(it->first, it->second); - continue; - } - - // check nonce - if (from == mds->get_nodeid()) { - dout(7) << "dir expire on " << *dir << " from mds" << from - << " .. ME! ignoring" << endl; - } - else if (nonce == dir->get_replica_nonce(from)) { - // remove from our cached_by - dout(7) << "dir expire on " << *dir << " from mds" << from - << " replicas was " << dir->replicas << endl; - dir->remove_replica(from); - } - else { - // this is an old nonce, ignore expire. 
- dout(7) << "dir expire on " << *dir << " from mds" << from - << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from) - << "), dropping" << endl; - assert(dir->get_replica_nonce(from) > nonce); - } - } - - // dentries - for (map >::iterator pd = m->get_dentries().begin(); - pd != m->get_dentries().end(); - ++pd) { - dout(0) << "dn expires in dir " << pd->first << endl; - CInode *diri = get_inode(pd->first); - CDir *dir = diri->dir; - assert(dir); - - if (!dir->is_auth()) { - int newauth = dir->authority(); - dout(7) << "proxy dentry expires on " << *dir << " to " << newauth << endl; - if (!dir->is_proxy()) - dout(0) << "nonproxy dentry expires? " << *dir << " .. auth is " << newauth - << " .. expire is from " << from << endl; - assert(dir->is_proxy()); - assert(newauth >= 0); - assert(dir->state_test(CDIR_STATE_PROXY)); - if (proxymap.count(newauth) == 0) proxymap[newauth] = new MCacheExpire(from); - proxymap[newauth]->add_dentries(pd->first, pd->second); - continue; - } - - for (map::iterator p = pd->second.begin(); - p != pd->second.end(); - ++p) { - int nonce = p->second; - - CDentry *dn = dir->lookup(p->first); - if (!dn) - dout(0) << "missing dentry for " << p->first << " in " << *dir << endl; - assert(dn); - - if (from == mds->get_nodeid()) { - dout(7) << "dentry_expire on " << *dn << " from mds" << from - << " .. ME! 
ignoring" << endl; - } - else if (nonce == dn->get_replica_nonce(from)) { - dout(7) << "dentry_expire on " << *dn << " from mds" << from << endl; - dn->remove_replica(from); - } - else { - dout(7) << "dentry_expire on " << *dn << " from mds" << from - << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from) - << "), dropping" << endl; - assert(dn->get_replica_nonce(from) > nonce); - } - } - } - - // send proxy forwards - for (map::iterator it = proxymap.begin(); - it != proxymap.end(); - it++) { - dout(7) << "sending proxy forward to " << it->first << endl; - mds->send_message_mds(it->second, it->first, MDS_PORT_CACHE); - } - - // done - delete m; -} - -void MDCache::inode_remove_replica(CInode *in, int from) -{ - in->remove_replica(from); - in->mds_caps_wanted.erase(from); - - // note: this code calls _eval more often than it needs to! - // fix lock - if (in->hardlock.is_gathering(from)) { - in->hardlock.gather_set.erase(from); - if (in->hardlock.gather_set.size() == 0) - mds->locker->inode_hard_eval(in); - } - if (in->filelock.is_gathering(from)) { - in->filelock.gather_set.erase(from); - if (in->filelock.gather_set.size() == 0) - mds->locker->inode_file_eval(in); - } - - // alone now? 
- if (!in->is_replicated()) { - mds->locker->inode_hard_eval(in); - mds->locker->inode_file_eval(in); - } -} - - -int MDCache::send_dir_updates(CDir *dir, bool bcast) -{ - // this is an FYI, re: replication - - set who; - if (bcast) { - mds->get_mds_map()->get_active_mds_set(who); - } else { - for (map::iterator p = dir->replicas_begin(); - p != dir->replicas_end(); - ++p) - who.insert(p->first); - } - - dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << endl; - - string path; - dir->inode->make_path(path); - - int whoami = mds->get_nodeid(); - for (set::iterator it = who.begin(); - it != who.end(); - it++) { - if (*it == whoami) continue; - //if (*it == except) continue; - dout(7) << "sending dir_update on " << *dir << " to " << *it << endl; - - mds->send_message_mds(new MDirUpdate(dir->ino(), - dir->dir_rep, - dir->dir_rep_by, - path, - bcast), - *it, MDS_PORT_CACHE); - } - - return 0; -} - - -void MDCache::handle_dir_update(MDirUpdate *m) -{ - CInode *in = get_inode(m->get_ino()); - if (!in || !in->dir) { - dout(5) << "dir_update on " << m->get_ino() << ", don't have it" << endl; - - // discover it? - if (m->should_discover()) { - m->tried_discover(); // only once! 
- vector trace; - filepath path = m->get_path(); - - dout(5) << "trying discover on dir_update for " << path << endl; - - int r = path_traverse(path, trace, true, - m, new C_MDS_RetryMessage(mds, m), - MDS_TRAVERSE_DISCOVER); - if (r > 0) - return; - if (r == 0) { - assert(in); - open_remote_dir(in, new C_MDS_RetryMessage(mds, m)); - return; - } - assert(0); - } - - goto out; - } - - // update - dout(5) << "dir_update on " << *in->dir << endl; - in->dir->dir_rep = m->get_dir_rep(); - in->dir->dir_rep_by = m->get_dir_rep_by(); - - // done - out: - delete m; -} - - - - - -class C_MDC_DentryUnlink : public Context { -public: - MDCache *mdc; - CDentry *dn; - CDir *dir; - Context *c; - C_MDC_DentryUnlink(MDCache *mdc, CDentry *dn, CDir *dir, Context *c) { - this->mdc = mdc; - this->dn = dn; - this->dir = dir; - this->c = c; - } - void finish(int r) { - assert(r == 0); - mdc->dentry_unlink_finish(dn, dir, c); - } -}; - - -// NAMESPACE FUN - -void MDCache::dentry_unlink(CDentry *dn, Context *c) -{ - CDir *dir = dn->dir; - string dname = dn->name; - - assert(dn->lockstate == DN_LOCK_XLOCK); - - // i need the inode to do any of this properly - assert(dn->inode); - - // log it - if (dn->inode) dn->inode->mark_unsafe(); // XXX ??? FIXME - mds->mdlog->submit_entry(new EString("unlink fixme fixme"),//EUnlink(dir, dn, dn->inode), - NULL); // FIXME FIXME FIXME - - // tell replicas - if (dir->is_replicated()) { - for (map::iterator it = dir->replicas_begin(); - it != dir->replicas_end(); - it++) { - dout(7) << "inode_unlink sending DentryUnlink to mds" << it->first << endl; - - mds->send_message_mds(new MDentryUnlink(dir->ino(), dn->name), it->first, MDS_PORT_CACHE); - } - - // don't need ack. - } - - - // inode deleted? - if (dn->is_primary()) { - assert(dn->inode->is_auth()); - dn->inode->inode.nlink--; - - if (dn->inode->is_dir()) assert(dn->inode->inode.nlink == 0); // no hard links on dirs - - // last link? 
- if (dn->inode->inode.nlink == 0) { - // truly dangling - if (dn->inode->dir) { - // mark dir clean too, since it now dne! - assert(dn->inode->dir->is_auth()); - dn->inode->dir->state_set(CDIR_STATE_DELETED); - dn->inode->dir->remove_null_dentries(); - dn->inode->dir->mark_clean(); - } - - // mark it clean, it's dead - if (dn->inode->is_dirty()) - dn->inode->mark_clean(); - - } else { - // migrate to inode file - dout(7) << "removed primary, but there are remote links, moving to inode file: " << *dn->inode << endl; - - // dangling but still linked. - assert(dn->inode->is_anchored()); - - // unlink locally - CInode *in = dn->inode; - dn->dir->unlink_inode( dn ); - dn->_mark_dirty(); // fixme - - // mark it dirty! - in->_mark_dirty(); // fixme - - // update anchor to point to inode file+mds - vector atrace; - in->make_anchor_trace(atrace); - assert(atrace.size() == 1); // it's dangling - mds->anchorclient->update(in->ino(), atrace, - new C_MDC_DentryUnlink(this, dn, dir, c)); - return; - } - } - else if (dn->is_remote()) { - // need to dec nlink on primary - if (dn->inode->is_auth()) { - // awesome, i can do it - dout(7) << "remote target is local, nlink--" << endl; - dn->inode->inode.nlink--; - dn->inode->_mark_dirty(); // fixme - - if (( dn->inode->state_test(CInode::STATE_DANGLING) && dn->inode->inode.nlink == 0) || - (!dn->inode->state_test(CInode::STATE_DANGLING) && dn->inode->inode.nlink == 1)) { - dout(7) << "nlink=1+primary or 0+dangling, removing anchor" << endl; - - // remove anchor (async) - mds->anchorclient->destroy(dn->inode->ino(), NULL); - } - } else { - int auth = dn->inode->authority(); - dout(7) << "remote target is remote, sending unlink request to " << auth << endl; - - mds->send_message_mds(new MInodeUnlink(dn->inode->ino(), mds->get_nodeid()), - auth, MDS_PORT_CACHE); - - // unlink locally - CInode *in = dn->inode; - dn->dir->unlink_inode( dn ); - dn->_mark_dirty(); // fixme - - // add waiter - in->add_waiter(CINODE_WAIT_UNLINK, c); - return; 
- } - } - else - assert(0); // unlink on null dentry?? - - // unlink locally - dn->dir->unlink_inode( dn ); - dn->_mark_dirty(); // fixme - - // finish! - dentry_unlink_finish(dn, dir, c); -} - - -void MDCache::dentry_unlink_finish(CDentry *dn, CDir *dir, Context *c) -{ - dout(7) << "dentry_unlink_finish on " << *dn << endl; - string dname = dn->name; - - // unpin dir / unxlock - mds->locker->dentry_xlock_finish(dn, true); // quiet, no need to bother replicas since they're already unlinking - - // did i empty out an imported dir? - if (dir->is_import() && !dir->inode->is_root() && dir->get_size() == 0) - migrator->export_empty_import(dir); - - // wake up any waiters - dir->take_waiting(CDIR_WAIT_ANY, dname, mds->finished_queue); - - c->finish(0); -} - - - - -void MDCache::handle_dentry_unlink(MDentryUnlink *m) -{ - CInode *diri = get_inode(m->get_dirino()); - CDir *dir = 0; - if (diri) dir = diri->dir; - - if (!diri || !dir) { - dout(7) << "handle_dentry_unlink don't have dir " << m->get_dirino() << endl; - } - else { - CDentry *dn = dir->lookup(m->get_dn()); - if (!dn) { - dout(7) << "handle_dentry_unlink don't have dentry " << *dir << " dn " << m->get_dn() << endl; - } else { - dout(7) << "handle_dentry_unlink on " << *dn << endl; - - // dir? - if (dn->inode) { - if (dn->inode->dir) { - dn->inode->dir->state_set(CDIR_STATE_DELETED); - dn->inode->dir->remove_null_dentries(); - } - } - - string dname = dn->name; - - // unlink - dn->dir->remove_dentry(dn); - - // wake up - //dir->finish_waiting(CDIR_WAIT_DNREAD, dname); - dir->take_waiting(CDIR_WAIT_DNREAD, dname, mds->finished_queue); - } - } - - delete m; - return; -} - - -void MDCache::handle_inode_unlink(MInodeUnlink *m) -{ - CInode *in = get_inode(m->get_ino()); - assert(in); - - // proxy? - if (in->is_proxy()) { - dout(7) << "handle_inode_unlink proxy on " << *in << endl; - mds->send_message_mds(m, in->authority(), MDS_PORT_CACHE); - return; - } - assert(in->is_auth()); - - // do it. 
- dout(7) << "handle_inode_unlink nlink=" << in->inode.nlink << " on " << *in << endl; - assert(in->inode.nlink > 0); - in->inode.nlink--; - - if (in->state_test(CInode::STATE_DANGLING)) { - // already dangling. - // last link? - if (in->inode.nlink == 0) { - dout(7) << "last link, marking clean and removing anchor" << endl; - - in->mark_clean(); // mark it clean. - - // remove anchor (async) - mds->anchorclient->destroy(in->ino(), NULL); - } - else { - in->_mark_dirty(); // fixme - } - } else { - // has primary link still. - assert(in->inode.nlink >= 1); - in->_mark_dirty(); // fixme - - if (in->inode.nlink == 1) { - dout(7) << "nlink=1, removing anchor" << endl; - - // remove anchor (async) - mds->anchorclient->destroy(in->ino(), NULL); - } - } - - // ack - mds->send_message_mds(new MInodeUnlinkAck(m->get_ino()), m->get_from(), MDS_PORT_CACHE); -} - -void MDCache::handle_inode_unlink_ack(MInodeUnlinkAck *m) -{ - CInode *in = get_inode(m->get_ino()); - assert(in); - - dout(7) << "handle_inode_unlink_ack on " << *in << endl; - in->finish_waiting(CINODE_WAIT_UNLINK, 0); -} - - - - - - - - - - -/* - * some import/export helpers - */ - -/** con = get_auth_container(dir) - * Returns the directory in which authority is delegated for *dir. - * This may be because a directory is an import, or because it is hashed - * and we are nested underneath an inode in that dir (that hashes to us). - * Thus do not assume result->is_auth()! It is_auth() || is_hashed(). 
- */ -CDir *MDCache::get_auth_container(CDir *dir) -{ - CDir *imp = dir; // might be *dir - - // find the underlying import or hash that delegates dir - while (true) { - if (imp->is_import()) break; // import - imp = imp->get_parent_dir(); - if (!imp) break; // none - if (imp->is_hashed()) break; // hash - } - - return imp; -} - -CDir *MDCache::get_export_container(CDir *dir) -{ - CDir *ex = dir; // might be *dir - assert(!ex->is_auth()); - - // find the underlying import or hash that delegates dir away - while (true) { - if (ex->is_export()) break; // import - ex = ex->get_parent_dir(); - assert(ex); - if (ex->is_hashed()) break; // hash - } - - return ex; -} - - -void MDCache::find_nested_exports(CDir *dir, set& s) -{ - CDir *import = get_auth_container(dir); - find_nested_exports_under(import, dir, s); -} - -void MDCache::find_nested_exports_under(CDir *import, CDir *dir, set& s) -{ - dout(10) << "find_nested_exports for " << *dir << endl; - dout(10) << "find_nested_exports_under import " << *import << endl; - - if (import == dir) { - // yay, my job is easy! - for (set::iterator p = nested_exports[import].begin(); - p != nested_exports[import].end(); - p++) { - CDir *nested = *p; - s.insert(nested); - dout(10) << "find_nested_exports " << *dir << " " << *nested << endl; - } - return; - } - - // ok, my job is annoying. 
- for (set::iterator p = nested_exports[import].begin(); - p != nested_exports[import].end(); - p++) { - CDir *nested = *p; - - dout(12) << "find_nested_exports checking " << *nested << endl; - - // trace back to import, or dir - CDir *cur = nested->get_parent_dir(); - while (!cur->is_import() || cur == dir) { - if (cur == dir) { - s.insert(nested); - dout(10) << "find_nested_exports " << *dir << " " << *nested << endl; - break; - } else { - cur = cur->get_parent_dir(); - } - } - } -} - - - - - - - - - - - - - - - - - - -// ============================================================== -// debug crap - - -void MDCache::show_imports() -{ - int db = 10; - - if (imports.empty() && - hashdirs.empty()) { - dout(db) << "show_imports: no imports/exports/hashdirs" << endl; - return; - } - dout(db) << "show_imports:" << endl; - - set ecopy = exports; - - set::iterator it = hashdirs.begin(); - while (1) { - if (it == hashdirs.end()) it = imports.begin(); - if (it == imports.end() ) break; - - CDir *im = *it; - - if (im->is_import()) { - dout(db) << " + import (" << im->popularity[MDS_POP_CURDOM] << "/" << im->popularity[MDS_POP_ANYDOM] << ") " << *im << endl; - //assert( im->is_auth() ); - } - else if (im->is_hashed()) { - if (im->is_import()) continue; // if import AND hash, list as import. 
- dout(db) << " + hash (" << im->popularity[MDS_POP_CURDOM] << "/" << im->popularity[MDS_POP_ANYDOM] << ") " << *im << endl; - } - - for (set::iterator p = nested_exports[im].begin(); - p != nested_exports[im].end(); - p++) { - CDir *exp = *p; - if (exp->is_hashed()) { - //assert(0); // we don't do it this way actually - dout(db) << " - hash (" << exp->popularity[MDS_POP_NESTED] << ", " << exp->popularity[MDS_POP_ANYDOM] << ") " << *exp << " to " << exp->dir_auth << endl; - //assert( !exp->is_auth() ); - } else { - dout(db) << " - ex (" << exp->popularity[MDS_POP_NESTED] << ", " << exp->popularity[MDS_POP_ANYDOM] << ") " << *exp << " to " << exp->dir_auth << endl; - assert( exp->is_export() ); - //assert( !exp->is_auth() ); - } - - if ( get_auth_container(exp) != im ) { - dout(1) << "uh oh, auth container is " << *get_auth_container(exp) << endl; - assert( get_auth_container(exp) == im ); - } - - if (ecopy.count(exp) != 1) { - dout(1) << "***** nested_export " << *exp << " not in exports" << endl; - assert(0); - } - ecopy.erase(exp); - } - - it++; - } - - if (ecopy.size()) { - for (set::iterator it = ecopy.begin(); - it != ecopy.end(); - it++) - dout(1) << "***** stray item in exports: " << **it << endl; - assert(ecopy.size() == 0); - } -} - - -void MDCache::show_cache() -{ - dout(7) << "show_cache" << endl; - - for (hash_map::iterator it = inode_map.begin(); - it != inode_map.end(); - it++) { - dout(7) << *((*it).second) << endl; - - CDentry *dn = (*it).second->get_parent_dn(); - if (dn) - dout(7) << " dn " << *dn << endl; - if ((*it).second->dir) - dout(7) << " subdir " << *(*it).second->dir << endl; - } -} - diff --git a/branches/marnberg/quota/mds/MDCache.h b/branches/marnberg/quota/mds/MDCache.h deleted file mode 100644 index 7b8825f073726..0000000000000 --- a/branches/marnberg/quota/mds/MDCache.h +++ /dev/null @@ -1,364 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * 
Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __MDCACHE_H -#define __MDCACHE_H - -#include -#include -#include -#include -#include - -#include "include/types.h" -#include "include/filepath.h" - -#include "CInode.h" -#include "CDentry.h" -#include "CDir.h" -#include "Lock.h" - - -class MDS; -class Migrator; -class Renamer; - -class Logger; - -class Message; - -class MMDSImportMap; -class MMDSCacheRejoin; -class MMDSCacheRejoinAck; -class MDiscover; -class MDiscoverReply; -class MCacheExpire; -class MDirUpdate; -class MDentryUnlink; -class MLock; - - -class MClientRequest; - - -// MDCache - -//typedef const char* pchar; - - - -/** active_request_t - * state we track for requests we are currently processing. - * mostly information about locks held, so that we can drop them all - * the request is finished or forwarded. see request_*(). - */ -typedef struct { - CInode *ref; // reference inode - set< CInode* > request_pins; - set< CDir* > request_dir_pins; - map< CDentry*, vector > traces; // path pins held - set< CDentry* > xlocks; // xlocks (local) - set< CDentry* > foreign_xlocks; // xlocks on foreign hosts -} active_request_t; - -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const Message *p) const { - static hash H; - return H((unsigned long)p); - } - }; -} - -class MDCache { - public: - // my master - MDS *mds; - - LRU lru; // dentry lru for expiring items from cache - - protected: - // the cache - CInode *root; // root inode - hash_map inode_map; // map of inodes by ino - - list inode_expire_queue; // inodes to delete - - - // root - list waiting_for_root; - - // imports, exports, and hashes. 
- set imports; // includes root (on mds0) - set exports; - set hashdirs; - map > nested_exports; // exports nested under imports _or_ hashdirs - - void adjust_export(int to, CDir *root, set& bounds); - void adjust_import(int from, CDir *root, set& bounds); - - - - // active MDS requests - hash_map active_requests; - - // inode purging - map purging; - map > waiting_for_purge; - - // shutdown crap - int shutdown_commits; - bool did_shutdown_exports; - bool did_shutdown_log_cap; - friend class C_MDC_ShutdownCommit; - - // recovery -protected: - // from EImportStart w/o EImportFinish during journal replay - map > my_ambiguous_imports; - // from MMDSImportMaps - map > > other_ambiguous_imports; - - set recovery_set; - set wants_import_map; // nodes i need to send my import map to - set got_import_map; // nodes i need to send my import map to (when exports finish) - set rejoin_ack_gather; // nodes i need a rejoin ack from - - void handle_import_map(MMDSImportMap *m); - void handle_cache_rejoin(MMDSCacheRejoin *m); - void handle_cache_rejoin_ack(MMDSCacheRejoinAck *m); - void disambiguate_imports(); - void cache_rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin); - void send_cache_rejoin_acks(); -public: - void send_import_map(int who); - void send_import_map_now(int who); - void send_import_map_later(int who) { - wants_import_map.insert(who); - } - void send_pending_import_maps(); // maybe. 
- void send_cache_rejoins(); - - void set_recovery_set(set& s) { - recovery_set = s; - } - - // ambiguous imports - void add_ambiguous_import(inodeno_t base, set& bounds) { - my_ambiguous_imports[base].swap(bounds); - } - void cancel_ambiguous_import(inodeno_t dirino); - void finish_ambiguous_import(inodeno_t dirino); - - void finish_ambiguous_export(inodeno_t dirino, set& bounds); - - - - - - friend class CInode; - friend class Locker; - friend class Migrator; - friend class Renamer; - friend class MDBalancer; - friend class EImportMap; - - - public: - - // subsystems - Migrator *migrator; - Renamer *renamer; - - public: - MDCache(MDS *m); - ~MDCache(); - - // debug - void log_stat(Logger *logger); - - // root inode - CInode *get_root() { return root; } - void set_root(CInode *r); - - int get_num_imports() { return imports.size(); } - void add_import(CDir *dir); - void remove_import(CDir *dir); - void recalc_auth_bits(); - - void log_import_map(Context *onsync=0); - - - // cache - void set_cache_size(size_t max) { lru.lru_set_max(max); } - size_t get_cache_size() { return lru.lru_get_size(); } - bool trim(int max = -1); // trim cache - void trim_non_auth(); // trim out trimmable non-auth items - - // shutdown - void shutdown_start(); - void shutdown_check(); - bool shutdown_pass(); - bool shutdown(); // clear cache (ie at shutodwn) - - // inode_map - bool have_inode( inodeno_t ino ) { return inode_map.count(ino) ? 
true:false; } - CInode* get_inode( inodeno_t ino ) { - if (have_inode(ino)) - return inode_map[ ino ]; - return NULL; - } - - - int hash_dentry(inodeno_t ino, const string& s) { - return 0; // fixme - } - - - public: - CInode *create_inode(); - void add_inode(CInode *in); - - protected: - void remove_inode(CInode *in); - void destroy_inode(CInode *in); - void touch_inode(CInode *in) { - if (in->get_parent_dn()) - touch_dentry(in->get_parent_dn()); - } - void touch_dentry(CDentry *dn) { - // touch ancestors - if (dn->get_dir()->get_inode()->get_parent_dn()) - touch_dentry(dn->get_dir()->get_inode()->get_parent_dn()); - - // touch me - if (dn->is_auth()) - lru.lru_touch(dn); - else - lru.lru_midtouch(dn); - } - - void inode_remove_replica(CInode *in, int rep); - - void rename_file(CDentry *srcdn, CDentry *destdn); - - public: - // inode purging - void purge_inode(inode_t& inode); - void purge_inode_finish(inodeno_t ino); - void purge_inode_finish_2(inodeno_t ino); - void waitfor_purge(inodeno_t ino, Context *c); - void start_recovered_purges(); - - - public: - CDir *get_auth_container(CDir *in); - CDir *get_export_container(CDir *dir); - void find_nested_exports(CDir *dir, set& s); - void find_nested_exports_under(CDir *import, CDir *dir, set& s); - - - public: - CInode *create_root_inode(); - int open_root(Context *c); - int path_traverse(filepath& path, vector& trace, bool follow_trailing_sym, - Message *req, Context *ondelay, - int onfail, - Context *onfinish=0, - bool is_client_req = false); - void open_remote_dir(CInode *diri, Context *fin); - void open_remote_ino(inodeno_t ino, Message *req, Context *fin); - void open_remote_ino_2(inodeno_t ino, Message *req, - vector& anchortrace, - Context *onfinish); - - bool path_pin(vector& trace, Message *m, Context *c); - void path_unpin(vector& trace, Message *m); - void make_trace(vector& trace, CInode *in); - - bool request_start(Message *req, - CInode *ref, - vector& trace); - void request_cleanup(Message *req); - 
void request_finish(Message *req); - void request_forward(Message *req, int mds, int port=0); - void request_pin_inode(Message *req, CInode *in); - void request_pin_dir(Message *req, CDir *dir); - - // anchors - void anchor_inode(CInode *in, Context *onfinish); - //void unanchor_inode(CInode *in, Context *c); - - void handle_inode_link(class MInodeLink *m); - void handle_inode_link_ack(class MInodeLinkAck *m); - - // == messages == - public: - void dispatch(Message *m); - - protected: - // -- replicas -- - void handle_discover(MDiscover *dis); - void handle_discover_reply(MDiscoverReply *m); - - - // -- namespace -- - // these handle logging, cache sync themselves. - // UNLINK - public: - void dentry_unlink(CDentry *in, Context *c); - protected: - void dentry_unlink_finish(CDentry *in, CDir *dir, Context *c); - void handle_dentry_unlink(MDentryUnlink *m); - void handle_inode_unlink(class MInodeUnlink *m); - void handle_inode_unlink_ack(class MInodeUnlinkAck *m); - friend class C_MDC_DentryUnlink; - - - - // -- misc auth -- - int ino_proxy_auth(inodeno_t ino, - int frommds, - map >& inomap); - void do_ino_proxy(CInode *in, Message *m); - void do_dir_proxy(CDir *dir, Message *m); - - - - - // -- updates -- - //int send_inode_updates(CInode *in); - //void handle_inode_update(MInodeUpdate *m); - - int send_dir_updates(CDir *in, bool bcast=false); - void handle_dir_update(MDirUpdate *m); - - void handle_cache_expire(MCacheExpire *m); - - - - // == crap fns == - public: - void dump() { - if (root) root->dump(); - } - - void show_imports(); - void show_cache(); - -}; - - -#endif diff --git a/branches/marnberg/quota/mds/MDLog.cc b/branches/marnberg/quota/mds/MDLog.cc deleted file mode 100644 index 182bd4d0333e1..0000000000000 --- a/branches/marnberg/quota/mds/MDLog.cc +++ /dev/null @@ -1,431 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This 
is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "MDLog.h" -#include "MDS.h" -#include "MDCache.h" -#include "LogEvent.h" - -#include "osdc/Journaler.h" - -#include "common/LogType.h" -#include "common/Logger.h" - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".log " -#define derr(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".log " - -// cons/des - -LogType mdlog_logtype; - -MDLog::MDLog(MDS *m) -{ - mds = m; - num_events = 0; - waiting_for_read = false; - - last_import_map = 0; - writing_import_map = false; - seen_import_map = false; - - max_events = g_conf.mds_log_max_len; - - capped = false; - - unflushed = 0; - - journaler = 0; - logger = 0; -} - - -MDLog::~MDLog() -{ - if (journaler) { delete journaler; journaler = 0; } - if (logger) { delete logger; logger = 0; } -} - - -void MDLog::init_journaler() -{ - // logger - char name[80]; - sprintf(name, "mds%d.log", mds->get_nodeid()); - logger = new Logger(name, &mdlog_logtype); - - static bool didit = false; - if (!didit) { - mdlog_logtype.add_inc("add"); - mdlog_logtype.add_inc("expire"); - mdlog_logtype.add_inc("obs"); - mdlog_logtype.add_inc("trim"); - mdlog_logtype.add_set("size"); - mdlog_logtype.add_set("read"); - mdlog_logtype.add_set("append"); - mdlog_logtype.add_inc("lsum"); - mdlog_logtype.add_inc("lnum"); - } - - // inode - memset(&log_inode, 0, sizeof(log_inode)); - log_inode.ino = MDS_INO_LOG_OFFSET + mds->get_nodeid(); - log_inode.layout = g_OSD_MDLogLayout; - - if (g_conf.mds_local_osd) { - log_inode.layout.object_layout = OBJECT_LAYOUT_STARTOSD; - log_inode.layout.osd = mds->get_nodeid() + 10000; // hack - } - - // log streamer - if 
(journaler) delete journaler; - journaler = new Journaler(log_inode, mds->objecter, logger); -} - - - -void MDLog::reset() -{ - dout(5) << "reset to empty log" << endl; - init_journaler(); - journaler->reset(); -} - -void MDLog::open(Context *c) -{ - dout(5) << "open discovering log bounds" << endl; - init_journaler(); - journaler->recover(c); -} - -void MDLog::write_head(Context *c) -{ - journaler->write_head(c); -} - - -off_t MDLog::get_read_pos() -{ - return journaler->get_read_pos(); -} - -off_t MDLog::get_write_pos() -{ - return journaler->get_write_pos(); -} - - - -void MDLog::submit_entry( LogEvent *le, - Context *c ) -{ - if (g_conf.mds_log) { - dout(5) << "submit_entry " << journaler->get_write_pos() << " : " << *le << endl; - - // encode it, with event type - bufferlist bl; - bl.append((char*)&le->_type, sizeof(le->_type)); - le->encode_payload(bl); - - // journal it. - journaler->append_entry(bl); - - assert(!capped); - - delete le; - num_events++; - - logger->inc("add"); - logger->set("size", num_events); - logger->set("append", journaler->get_write_pos()); - - if (c) { - unflushed = 0; - journaler->flush(c); - } - else - unflushed++; - - // should we log a new import_map? - // FIXME: should this go elsewhere? - if (last_import_map && !writing_import_map && - journaler->get_write_pos() - last_import_map >= g_conf.mds_log_import_map_interval) { - // log import map - mds->mdcache->log_import_map(); - } - - } else { - // hack: log is disabled. - if (c) { - c->finish(0); - delete c; - } - } -} - -void MDLog::wait_for_sync( Context *c ) -{ - if (g_conf.mds_log) { - // wait - journaler->flush(c); - } else { - // hack: bypass. 
- c->finish(0); - delete c; - } -} - -void MDLog::flush() -{ - if (unflushed) - journaler->flush(); - unflushed = 0; - - // trim - trim(NULL); -} - - - - -// trim - -class C_MDL_Trimmed : public Context { -public: - MDLog *mdl; - LogEvent *le; - - C_MDL_Trimmed(MDLog *mdl, LogEvent *le) { - this->mdl = mdl; - this->le = le; - } - void finish(int res) { - mdl->_trimmed(le); - } -}; - -class C_MDL_Reading : public Context { -public: - MDLog *mdl; - C_MDL_Reading(MDLog *m) { - mdl = m; - } - void finish(int res) { - mdl->_did_read(); - } -}; - - -void MDLog::_did_read() -{ - dout(5) << "_did_read()" << endl; - waiting_for_read = false; - trim(0); -} - -void MDLog::_trimmed(LogEvent *le) -{ - dout(7) << "trimmed : " << le->get_start_off() << " : " << *le << endl; - assert(le->has_expired(mds)); - - if (trimming.begin()->first == le->_end_off) { - // we trimmed off the front! - // we can expire the log a bit. - journaler->set_expire_pos(le->_end_off); - } - - trimming.erase(le->_end_off); - delete le; - - logger->set("trim", trimming.size()); - logger->set("read", journaler->get_read_pos()); - - trim(0); -} - - - -void MDLog::trim(Context *c) -{ - // add waiter - if (c) - trim_waiters.push_back(c); - - // trim! - dout(10) << "trim " << num_events << " events / " << max_events << " max" << endl; - - while (num_events > max_events) { - - off_t gap = journaler->get_write_pos() - journaler->get_read_pos(); - dout(5) << "trim num_events " << num_events << " > max " << max_events - << ", trimming " << trimming.size() - << ", byte gap " << gap - << endl; - - if ((int)trimming.size() >= g_conf.mds_log_max_trimming) { - dout(7) << "trim already trimming max, waiting" << endl; - return; - } - - bufferlist bl; - off_t so = journaler->get_read_pos(); - if (journaler->try_read_entry(bl)) { - // decode logevent - LogEvent *le = LogEvent::decode(bl); - le->_start_off = so; - le->_end_off = journaler->get_read_pos(); - num_events--; - - // we just read an event. 
- if (le->has_expired(mds)) { - // obsolete - dout(7) << "trim obsolete : " << le->get_start_off() << " : " << *le << endl; - delete le; - logger->inc("obs"); - } else { - assert ((int)trimming.size() < g_conf.mds_log_max_trimming); - - // trim! - dout(7) << "trim expiring : " << le->get_start_off() << " : " << *le << endl; - trimming[le->_end_off] = le; - le->expire(mds, new C_MDL_Trimmed(this, le)); - logger->inc("expire"); - logger->set("trim", trimming.size()); - } - logger->set("read", journaler->get_read_pos()); - logger->set("size", num_events); - } else { - // need to read! - if (!waiting_for_read) { - waiting_for_read = true; - dout(7) << "trim waiting for read" << endl; - journaler->wait_for_readable(new C_MDL_Reading(this)); - } else { - dout(7) << "trim already waiting for read" << endl; - } - return; - } - } - - dout(5) << "trim num_events " << num_events << " <= max " << max_events - << ", trimming " << trimming.size() - << ", done for now." - << endl; - - // trimmed! - std::list finished; - finished.swap(trim_waiters); - finish_contexts(finished, 0); - - // hmm, are we at the end? - /* - if (journaler->get_read_pos() == journaler->get_write_pos() && - trimming.size() == import_map_expire_waiters.size()) { - dout(5) << "trim log is empty, allowing import_map to expire" << endl; - list ls; - ls.swap(import_map_expire_waiters); - finish_contexts(ls); - } - */ -} - - -void MDLog::replay(Context *c) -{ - assert(journaler->is_active()); - - // start reading at the last known expire point. - journaler->set_read_pos( journaler->get_expire_pos() ); - - // empty? - if (journaler->get_read_pos() == journaler->get_write_pos()) { - dout(10) << "replay - journal empty, done." << endl; - if (c) { - c->finish(0); - delete c; - } - return; - } - - // add waiter - if (c) - waitfor_replay.push_back(c); - - // go! 
- dout(10) << "replay start, from " << journaler->get_read_pos() - << " to " << journaler->get_write_pos() << endl; - - assert(num_events == 0); - - _replay(); -} - -class C_MDL_Replay : public Context { - MDLog *mdlog; -public: - C_MDL_Replay(MDLog *l) : mdlog(l) {} - void finish(int r) { mdlog->_replay(); } -}; - -void MDLog::_replay() -{ - // read what's buffered - while (journaler->is_readable() && - journaler->get_read_pos() < journaler->get_write_pos()) { - // read it - off_t pos = journaler->get_read_pos(); - bufferlist bl; - bool r = journaler->try_read_entry(bl); - assert(r); - - // unpack event - LogEvent *le = LogEvent::decode(bl); - num_events++; - - // have we seen an import map yet? - if (!seen_import_map && - le->get_type() != EVENT_IMPORTMAP) { - dout(10) << "_replay " << pos << " / " << journaler->get_write_pos() - << " -- waiting for import_map. (skipping " << *le << ")" << endl; - } else { - dout(10) << "_replay " << pos << " / " << journaler->get_write_pos() - << " : " << *le << endl; - le->replay(mds); - - if (le->get_type() == EVENT_IMPORTMAP) - seen_import_map = true; - } - delete le; - } - - // wait for read? - if (journaler->get_read_pos() < journaler->get_write_pos()) { - journaler->wait_for_readable(new C_MDL_Replay(this)); - return; - } - - // done! 
- assert(journaler->get_read_pos() == journaler->get_write_pos()); - dout(10) << "_replay - complete" << endl; - - // move read pointer _back_ to expire pos, for eventual trimming - journaler->set_read_pos(journaler->get_expire_pos()); - - // kick waiter(s) - list ls; - ls.swap(waitfor_replay); - finish_contexts(ls,0); -} - - diff --git a/branches/marnberg/quota/mds/MDLog.h b/branches/marnberg/quota/mds/MDLog.h deleted file mode 100644 index 0d0248bde391d..0000000000000 --- a/branches/marnberg/quota/mds/MDLog.h +++ /dev/null @@ -1,127 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MDLOG_H -#define __MDLOG_H - -#include "include/types.h" -#include "include/Context.h" - -#include - -//#include -//using __gnu_cxx::hash_mapset; - -class Journaler; -class LogEvent; -class MDS; - -class Logger; - -/* -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const LogEvent *p) const { - static hash H; - return H((unsigned long)p); - } - }; -} -*/ - -class MDLog { - protected: - MDS *mds; - size_t num_events; // in events - size_t max_events; - - int unflushed; - - bool capped; - - inode_t log_inode; - Journaler *journaler; - - map trimming; - std::list trim_waiters; // contexts waiting for trim - bool trim_reading; - - bool waiting_for_read; - friend class C_MDL_Reading; - - Logger *logger; - - list waitfor_replay; - - // importmaps - off_t last_import_map; // offsets of last committed importmap. constrains trimming. 
- list import_map_expire_waiters; - bool writing_import_map; // one is being written now - bool seen_import_map; // for recovery - - friend class EImportMap; - friend class C_MDS_WroteImportMap; - friend class MDCache; - - void init_journaler(); - - - public: - // replay state - map > pending_exports; - - - - public: - MDLog(MDS *m); - ~MDLog(); - - - - void set_max_events(size_t max) { max_events = max; } - size_t get_max_events() { return max_events; } - size_t get_num_events() { return num_events + trimming.size(); } - size_t get_non_importmap_events() { return num_events + trimming.size() - import_map_expire_waiters.size(); } - - off_t get_read_pos(); - off_t get_write_pos(); - bool empty() { - return get_read_pos() == get_write_pos(); - } - - bool is_capped() { return capped; } - void cap() { - capped = true; - list ls; - ls.swap(import_map_expire_waiters); - finish_contexts(ls); - } - - void submit_entry( LogEvent *e, Context *c = 0 ); - void wait_for_sync( Context *c ); - void flush(); - - void trim(Context *c); - void _did_read(); - void _trimmed(LogEvent *le); - - void reset(); // fresh, empty log! - void open(Context *onopen); - void write_head(Context *onfinish); - - void replay(Context *onfinish); - void _replay(); -}; - -#endif diff --git a/branches/marnberg/quota/mds/MDS.cc b/branches/marnberg/quota/mds/MDS.cc deleted file mode 100644 index 2c0cf8973f06d..0000000000000 --- a/branches/marnberg/quota/mds/MDS.cc +++ /dev/null @@ -1,1021 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "include/types.h" -#include "common/Clock.h" - -#include "msg/Messenger.h" - -#include "osd/OSDMap.h" -#include "osdc/Objecter.h" -#include "osdc/Filer.h" - -#include "MDSMap.h" - -#include "MDS.h" -#include "Server.h" -#include "Locker.h" -#include "MDCache.h" -#include "MDStore.h" -#include "MDLog.h" -#include "MDBalancer.h" -#include "IdAllocator.h" -#include "Migrator.h" -#include "Renamer.h" - -#include "AnchorTable.h" -#include "AnchorClient.h" - -#include "common/Logger.h" -#include "common/LogType.h" - -#include "common/Timer.h" - -#include "messages/MMDSMap.h" -#include "messages/MMDSBeacon.h" - -#include "messages/MPing.h" -#include "messages/MPingAck.h" -#include "messages/MGenericMessage.h" - -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" - - -LogType mds_logtype, mds_cache_logtype; - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << whoami << " " -#define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << whoami << " " - - - - - -// cons/des -MDS::MDS(int whoami, Messenger *m, MonMap *mm) : timer(mds_lock) { - this->whoami = whoami; - - monmap = mm; - messenger = m; - - mdsmap = new MDSMap; - osdmap = new OSDMap; - - objecter = new Objecter(messenger, monmap, osdmap); - filer = new Filer(objecter); - - mdcache = new MDCache(this); - mdstore = new MDStore(this); - mdlog = new MDLog(this); - balancer = new MDBalancer(this); - - anchorclient = new AnchorClient(messenger, mdsmap); - idalloc = new IdAllocator(this); - - anchormgr = new AnchorTable(this); - - server = new Server(this); - locker = new Locker(this, mdcache); - - - // beacon - beacon_last_seq = 0; - beacon_sender = 0; - beacon_killer = 0; - - // tick - tick_event = 0; - - req_rate = 0; - - want_state = state = MDSMap::STATE_DNE; - - - logger = logger2 = 0; - - // i'm ready! 
- messenger->set_dispatcher(this); -} - -MDS::~MDS() { - if (mdcache) { delete mdcache; mdcache = NULL; } - if (mdstore) { delete mdstore; mdstore = NULL; } - if (mdlog) { delete mdlog; mdlog = NULL; } - if (balancer) { delete balancer; balancer = NULL; } - if (idalloc) { delete idalloc; idalloc = NULL; } - if (anchormgr) { delete anchormgr; anchormgr = NULL; } - if (anchorclient) { delete anchorclient; anchorclient = NULL; } - if (osdmap) { delete osdmap; osdmap = 0; } - if (mdsmap) { delete mdsmap; mdsmap = 0; } - - if (server) { delete server; server = 0; } - if (locker) { delete locker; locker = 0; } - - if (filer) { delete filer; filer = 0; } - if (objecter) { delete objecter; objecter = 0; } - if (messenger) { delete messenger; messenger = NULL; } - - if (logger) { delete logger; logger = 0; } - if (logger2) { delete logger2; logger2 = 0; } - -} - - -void MDS::reopen_logger() -{ - // flush+close old log - if (logger) { - logger->flush(true); - delete logger; - } - if (logger2) { - logger2->flush(true); - delete logger2; - } - - - // log - string name; - name = "mds"; - int w = whoami; - if (w >= 1000) name += ('0' + ((w/1000)%10)); - if (w >= 100) name += ('0' + ((w/100)%10)); - if (w >= 10) name += ('0' + ((w/10)%10)); - name += ('0' + ((w/1)%10)); - - logger = new Logger(name, (LogType*)&mds_logtype); - - mds_logtype.add_inc("req"); - mds_logtype.add_inc("reply"); - mds_logtype.add_inc("fw"); - mds_logtype.add_inc("cfw"); - - mds_logtype.add_set("l"); - mds_logtype.add_set("q"); - mds_logtype.add_set("popanyd"); - mds_logtype.add_set("popnest"); - - mds_logtype.add_inc("lih"); - mds_logtype.add_inc("lif"); - - mds_logtype.add_set("c"); - mds_logtype.add_set("ctop"); - mds_logtype.add_set("cbot"); - mds_logtype.add_set("cptail"); - mds_logtype.add_set("cpin"); - mds_logtype.add_inc("cex"); - mds_logtype.add_inc("dis"); - mds_logtype.add_inc("cmiss"); - - mds_logtype.add_set("buf"); - mds_logtype.add_inc("cdir"); - mds_logtype.add_inc("fdir"); - - 
mds_logtype.add_inc("iex"); - mds_logtype.add_inc("iim"); - mds_logtype.add_inc("ex"); - mds_logtype.add_inc("im"); - mds_logtype.add_inc("imex"); - mds_logtype.add_set("nex"); - mds_logtype.add_set("nim"); - - - char n[80]; - sprintf(n, "mds%d.cache", whoami); - logger2 = new Logger(n, (LogType*)&mds_cache_logtype); -} - -void MDS::send_message_mds(Message *m, int mds, int port, int fromport) -{ - // send mdsmap first? - if (peer_mdsmap_epoch[mds] < mdsmap->get_epoch()) { - messenger->send_message(new MMDSMap(mdsmap), - mdsmap->get_inst(mds)); - peer_mdsmap_epoch[mds] = mdsmap->get_epoch(); - } - - // send message - if (port && !fromport) - fromport = port; - messenger->send_message(m, mdsmap->get_inst(mds), port, fromport); -} - - -class C_MDS_Tick : public Context { - MDS *mds; -public: - C_MDS_Tick(MDS *m) : mds(m) {} - void finish(int r) { - mds->tick(); - } -}; - - - -int MDS::init(bool standby) -{ - mds_lock.Lock(); - - if (standby) - want_state = MDSMap::STATE_STANDBY; - else - want_state = MDSMap::STATE_STARTING; - - // starting beacon. this will induce an MDSMap from the monitor - beacon_start(); - - // schedule tick - reset_tick(); - - mds_lock.Unlock(); - return 0; -} - -void MDS::reset_tick() -{ - // cancel old - if (tick_event) timer.cancel_event(tick_event); - - // schedule - tick_event = new C_MDS_Tick(this); - timer.add_event_after(g_conf.mon_tick_interval, tick_event); -} - -void MDS::tick() -{ - // reschedule - reset_tick(); - - // log - mds_load_t load = balancer->get_load(); - - if (logger) { - req_rate = logger->get("req"); - - logger->set("l", (int)load.mds_load()); - logger->set("q", messenger->get_dispatch_queue_len()); - logger->set("buf", buffer_total_alloc); - - mdcache->log_stat(logger); - } - - // booted? 
- if (is_active()) { - - // balancer - balancer->tick(); - - // HACK to test hashing stuff - if (false) { - /* - static map didhash; - if (elapsed.sec() > 15 && !didhash[whoami]) { - CInode *in = mdcache->get_inode(100000010); - if (in && in->dir) { - if (in->dir->is_auth()) - mdcache->migrator->hash_dir(in->dir); - didhash[whoami] = 1; - } - } - if (0 && elapsed.sec() > 25 && didhash[whoami] == 1) { - CInode *in = mdcache->get_inode(100000010); - if (in && in->dir) { - if (in->dir->is_auth() && in->dir->is_hashed()) - mdcache->migrator->unhash_dir(in->dir); - didhash[whoami] = 2; - } - } - */ - } - } -} - - - - -// ----------------------- -// beacons - -void MDS::beacon_start() -{ - beacon_send(); // send first beacon - - //reset_beacon_killer(); // schedule killer -} - - -class C_MDS_BeaconSender : public Context { - MDS *mds; -public: - C_MDS_BeaconSender(MDS *m) : mds(m) {} - void finish(int r) { - mds->beacon_send(); - } -}; - -void MDS::beacon_send() -{ - ++beacon_last_seq; - dout(10) << "beacon_send " << MDSMap::get_state_name(want_state) - << " seq " << beacon_last_seq - << " (currently " << MDSMap::get_state_name(state) << ")" - << endl; - - beacon_seq_stamp[beacon_last_seq] = g_clock.now(); - - int mon = monmap->pick_mon(); - messenger->send_message(new MMDSBeacon(want_state, beacon_last_seq), - monmap->get_inst(mon)); - - // schedule next sender - if (beacon_sender) timer.cancel_event(beacon_sender); - beacon_sender = new C_MDS_BeaconSender(this); - timer.add_event_after(g_conf.mds_beacon_interval, beacon_sender); -} - -void MDS::handle_mds_beacon(MMDSBeacon *m) -{ - dout(10) << "handle_mds_beacon " << MDSMap::get_state_name(m->get_state()) - << " seq " << m->get_seq() << endl; - version_t seq = m->get_seq(); - - // update lab - if (beacon_seq_stamp.count(seq)) { - assert(beacon_seq_stamp[seq] > beacon_last_acked_stamp); - beacon_last_acked_stamp = beacon_seq_stamp[seq]; - - // clean up seq_stamp map - while (!beacon_seq_stamp.empty() && - 
beacon_seq_stamp.begin()->first <= seq) - beacon_seq_stamp.erase(beacon_seq_stamp.begin()); - - reset_beacon_killer(); - } - - delete m; -} - -class C_MDS_BeaconKiller : public Context { - MDS *mds; - utime_t lab; -public: - C_MDS_BeaconKiller(MDS *m, utime_t l) : mds(m), lab(l) {} - void finish(int r) { - mds->beacon_kill(lab); - } -}; - -void MDS::reset_beacon_killer() -{ - utime_t when = beacon_last_acked_stamp; - when += g_conf.mds_beacon_grace; - - dout(15) << "reset_beacon_killer last_acked_stamp at " << beacon_last_acked_stamp - << ", will die at " << when << endl; - - if (beacon_killer) timer.cancel_event(beacon_killer); - - beacon_killer = new C_MDS_BeaconKiller(this, beacon_last_acked_stamp); - timer.add_event_at(when, beacon_killer); -} - -void MDS::beacon_kill(utime_t lab) -{ - if (lab == beacon_last_acked_stamp) { - dout(0) << "beacon_kill last_acked_stamp " << lab - << ", killing myself." - << endl; - exit(0); - } else { - dout(20) << "beacon_kill last_acked_stamp " << beacon_last_acked_stamp - << " != my " << lab - << ", doing nothing." - << endl; - } -} - - - -void MDS::handle_mds_map(MMDSMap *m) -{ - version_t epoch = m->get_epoch(); - dout(1) << "handle_mds_map epoch " << epoch << " from " << m->get_source() << endl; - - // note source's map version - if (m->get_source().is_mds() && - peer_mdsmap_epoch[m->get_source().num()] < epoch) { - dout(15) << " peer " << m->get_source() - << " has mdsmap epoch >= " << epoch - << endl; - peer_mdsmap_epoch[m->get_source().num()] = epoch; - } - - // is it new? 
- if (epoch <= mdsmap->get_epoch()) { - dout(1) << " old map epoch " << epoch << " <= " << mdsmap->get_epoch() - << ", discarding" << endl; - delete m; - return; - } - - // note some old state - int oldwhoami = whoami; - int oldstate = state; - set oldresolve; - mdsmap->get_mds_set(oldresolve, MDSMap::STATE_RESOLVE); - bool wasrejoining = mdsmap->is_rejoining(); - set oldfailed; - mdsmap->get_mds_set(oldfailed, MDSMap::STATE_FAILED); - - // decode and process - mdsmap->decode(m->get_encoded()); - - // see who i am - whoami = mdsmap->get_inst_rank(messenger->get_myaddr()); - if (oldwhoami != whoami) { - // update messenger. - messenger->reset_myname(MSG_ADDR_MDS(whoami)); - - reopen_logger(); - dout(1) << "handle_mds_map i am now mds" << whoami - << " incarnation " << mdsmap->get_inc(whoami) - << endl; - - // do i need an osdmap? - if (oldwhoami < 0) { - // we need an osdmap too. - int mon = monmap->pick_mon(); - messenger->send_message(new MOSDGetMap(0), - monmap->get_inst(mon)); - } - } - - // tell objecter my incarnation - if (objecter->get_client_incarnation() < 0 && - mdsmap->have_inst(whoami)) { - assert(mdsmap->get_inc(whoami) > 0); - objecter->set_client_incarnation(mdsmap->get_inc(whoami)); - } - - // update my state - state = mdsmap->get_state(whoami); - - // did it change? - if (oldstate != state) { - if (state == want_state) { - dout(1) << "handle_mds_map new state " << mdsmap->get_state_name(state) << endl; - } else { - dout(1) << "handle_mds_map new state " << mdsmap->get_state_name(state) - << ", although i wanted " << mdsmap->get_state_name(want_state) - << endl; - want_state = state; - } - - // now active? - if (is_active()) { - dout(1) << "now active" << endl; - finish_contexts(waitfor_active); // kick waiters - } - - else if (is_replay()) { - // initialize gather sets - set rs; - mdsmap->get_recovery_mds_set(rs); - rs.erase(whoami); - dout(1) << "now replay. 
my recovery peers are " << rs << endl; - mdcache->set_recovery_set(rs); - } - - // now stopping? - else if (is_stopping()) { - assert(oldstate == MDSMap::STATE_ACTIVE); - dout(1) << "now stopping" << endl; - - mdcache->shutdown_start(); - - // save anchor table - if (mdsmap->get_anchortable() == whoami) - anchormgr->save(0); // FIXME? or detect completion via filer? - - if (idalloc) - idalloc->save(0); // FIXME? or detect completion via filer? - - // flush log - mdlog->set_max_events(0); - mdlog->trim(NULL); - } - - // now standby? - else if (is_stopped()) { - assert(oldstate == MDSMap::STATE_STOPPING); - dout(1) << "now stopped, sending down:out and exiting" << endl; - shutdown_final(); - } - } - - - // is anyone resolving? - if (is_resolve() || is_rejoin() || is_active() || is_stopping()) { - set resolve; - mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE); - if (oldresolve != resolve) - dout(10) << "resolve set is " << resolve << ", was " << oldresolve << endl; - for (set::iterator p = resolve.begin(); p != resolve.end(); ++p) { - if (*p == whoami) continue; - if (oldresolve.count(*p) == 0 || // if other guy newly resolve, or - oldstate == MDSMap::STATE_REPLAY) // if i'm newly resolve, - mdcache->send_import_map(*p); // share my import map (now or later) - } - } - - // is everybody finally rejoining? - if (is_rejoin() || is_active() || is_stopping()) { - if (!wasrejoining && mdsmap->is_rejoining()) { - mdcache->send_cache_rejoins(); - } - } - - // did anyone go down? - if (is_active() || is_stopping()) { - set failed; - mdsmap->get_mds_set(failed, MDSMap::STATE_FAILED); - for (set::iterator p = failed.begin(); p != failed.end(); ++p) { - // newly so? 
- if (oldfailed.count(*p)) continue; - - mdcache->migrator->handle_mds_failure(*p); - } - } - - delete m; -} - -void MDS::handle_osd_map(MOSDMap *m) -{ - version_t had = osdmap->get_epoch(); - - // process locally - objecter->handle_osd_map(m); - - if (had == 0) { - if (is_creating()) - boot_create(); // new tables, journal - else if (is_starting()) - boot_start(); // old tables, empty journal - else if (is_replay()) - boot_replay(); // replay, join - else - assert(is_standby()); - } - - // pass on to clients - for (set::iterator it = clientmap.get_mount_set().begin(); - it != clientmap.get_mount_set().end(); - it++) { - MOSDMap *n = new MOSDMap; - n->maps = m->maps; - n->incremental_maps = m->incremental_maps; - messenger->send_message(n, clientmap.get_inst(*it)); - } -} - - -class C_MDS_BootFinish : public Context { - MDS *mds; -public: - C_MDS_BootFinish(MDS *m) : mds(m) {} - void finish(int r) { mds->boot_finish(); } -}; - -void MDS::boot_create() -{ - dout(3) << "boot_create" << endl; - - C_Gather *fin = new C_Gather(new C_MDS_BootFinish(this)); - - if (whoami == 0) { - dout(3) << "boot_create since i am also mds0, creating root inode and dir" << endl; - - // create root inode. 
- mdcache->open_root(0); - CInode *root = mdcache->get_root(); - assert(root); - - // force empty root dir - CDir *dir = root->dir; - dir->mark_complete(); - dir->mark_dirty(dir->pre_dirty()); - - // save it - mdstore->commit_dir(dir, fin->new_sub()); - } - - // start with a fresh journal - dout(10) << "boot_create creating fresh journal" << endl; - mdlog->reset(); - mdlog->write_head(fin->new_sub()); - - // write our first importmap - mdcache->log_import_map(fin->new_sub()); - - // fixme: fake out idalloc (reset, pretend loaded) - dout(10) << "boot_create creating fresh idalloc table" << endl; - idalloc->reset(); - idalloc->save(fin->new_sub()); - - // fixme: fake out anchortable - if (mdsmap->get_anchortable() == whoami) { - dout(10) << "boot_create creating fresh anchortable" << endl; - anchormgr->reset(); - anchormgr->save(fin->new_sub()); - } -} - -void MDS::boot_start() -{ - dout(2) << "boot_start" << endl; - - C_Gather *fin = new C_Gather(new C_MDS_BootFinish(this)); - - dout(2) << "boot_start opening idalloc" << endl; - idalloc->load(fin->new_sub()); - - if (mdsmap->get_anchortable() == whoami) { - dout(2) << "boot_start opening anchor table" << endl; - anchormgr->load(fin->new_sub()); - } else { - dout(2) << "boot_start i have no anchor table" << endl; - } - - dout(2) << "boot_start opening mds log" << endl; - mdlog->open(fin->new_sub()); - - if (mdsmap->get_root() == whoami) { - dout(2) << "boot_start opening root directory" << endl; - mdcache->open_root(fin->new_sub()); - } -} - -void MDS::boot_finish() -{ - dout(3) << "boot_finish" << endl; - - if (is_starting()) { - // make sure mdslog is empty - assert(mdlog->get_read_pos() == mdlog->get_write_pos()); - } - - set_want_state(MDSMap::STATE_ACTIVE); -} - - -class C_MDS_BootRecover : public Context { - MDS *mds; - int nextstep; -public: - C_MDS_BootRecover(MDS *m, int n) : mds(m), nextstep(n) {} - void finish(int r) { mds->boot_replay(nextstep); } -}; - -void MDS::boot_replay(int step) -{ - switch (step) 
{ - case 0: - step = 1; // fall-thru. - - case 1: - dout(2) << "boot_replay " << step << ": opening idalloc" << endl; - idalloc->load(new C_MDS_BootRecover(this, 2)); - break; - - case 2: - if (mdsmap->get_anchortable() == whoami) { - dout(2) << "boot_replay " << step << ": opening anchor table" << endl; - anchormgr->load(new C_MDS_BootRecover(this, 3)); - break; - } - dout(2) << "boot_replay " << step << ": i have no anchor table" << endl; - step++; // fall-thru - - case 3: - dout(2) << "boot_replay " << step << ": opening mds log" << endl; - mdlog->open(new C_MDS_BootRecover(this, 4)); - break; - - case 4: - dout(2) << "boot_replay " << step << ": replaying mds log" << endl; - mdlog->replay(new C_MDS_BootRecover(this, 5)); - break; - - case 5: - dout(2) << "boot_replay " << step << ": restarting any recovered purges" << endl; - mdcache->start_recovered_purges(); - - step++; // fall-thru - - case 6: - // done with replay! - if (mdsmap->get_num_mds(MDSMap::STATE_ACTIVE) == 0 && - mdsmap->get_num_mds(MDSMap::STATE_STOPPING) == 0 && - mdsmap->get_num_mds(MDSMap::STATE_RESOLVE) == 0 && - mdsmap->get_num_mds(MDSMap::STATE_REJOIN) == 0 && - mdsmap->get_num_mds(MDSMap::STATE_REPLAY) == 1 && // me - mdsmap->get_num_mds(MDSMap::STATE_FAILED) == 0) { - dout(2) << "boot_replay " << step << ": i am alone, moving to state active" << endl; - set_want_state(MDSMap::STATE_ACTIVE); - } else { - dout(2) << "boot_replay " << step << ": i am not alone, moving to state resolve" << endl; - set_want_state(MDSMap::STATE_RESOLVE); - } - break; - - } -} - - -void MDS::set_want_state(int s) -{ - dout(3) << "set_want_state " << MDSMap::get_state_name(s) << endl; - want_state = s; - beacon_send(); -} - - - - -int MDS::shutdown_start() -{ - dout(1) << "shutdown_start" << endl; - derr(0) << "mds shutdown start" << endl; - - // tell everyone to stop. 
- set active; - mdsmap->get_active_mds_set(active); - for (set::iterator p = active.begin(); - p != active.end(); - p++) { - if (mdsmap->is_up(*p)) { - dout(1) << "sending MShutdownStart to mds" << *p << endl; - send_message_mds(new MGenericMessage(MSG_MDS_SHUTDOWNSTART), - *p, MDS_PORT_MAIN); - } - } - - // go - set_want_state(MDSMap::STATE_STOPPING); - return 0; -} - - -void MDS::handle_shutdown_start(Message *m) -{ - dout(1) << " handle_shutdown_start" << endl; - - // set flag - set_want_state(MDSMap::STATE_STOPPING); - - delete m; -} - - - -int MDS::shutdown_final() -{ - dout(1) << "shutdown_final" << endl; - - // send final down:out beacon (it doesn't matter if this arrives) - set_want_state(MDSMap::STATE_OUT); - - // stop timers - if (beacon_killer) { - timer.cancel_event(beacon_killer); - beacon_killer = 0; - } - if (beacon_sender) { - timer.cancel_event(beacon_sender); - beacon_sender = 0; - } - if (tick_event) { - timer.cancel_event(tick_event); - tick_event = 0; - } - timer.cancel_all(); - timer.join(); - - // shut down cache - mdcache->shutdown(); - - // shut down messenger - messenger->shutdown(); - - return 0; -} - - - - - -void MDS::dispatch(Message *m) -{ - mds_lock.Lock(); - my_dispatch(m); - mds_lock.Unlock(); -} - - - -void MDS::my_dispatch(Message *m) -{ - // from bad mds? - if (m->get_source().is_mds()) { - int from = m->get_source().num(); - if (!mdsmap->have_inst(from) || - mdsmap->get_inst(from) != m->get_source_inst()) { - // bogus mds? 
- if (m->get_type() != MSG_MDS_MAP) { - dout(5) << "got " << *m << " from old/bad/imposter mds " << m->get_source() - << ", dropping" << endl; - delete m; - return; - } else { - dout(5) << "got " << *m << " from old/bad/imposter mds " << m->get_source() - << ", but it's an mdsmap, looking at it" << endl; - } - } - } - - - switch (m->get_dest_port()) { - - case MDS_PORT_ANCHORMGR: - anchormgr->dispatch(m); - break; - case MDS_PORT_ANCHORCLIENT: - anchorclient->dispatch(m); - break; - - case MDS_PORT_CACHE: - mdcache->dispatch(m); - break; - case MDS_PORT_LOCKER: - locker->dispatch(m); - break; - - case MDS_PORT_MIGRATOR: - mdcache->migrator->dispatch(m); - break; - case MDS_PORT_RENAMER: - mdcache->renamer->dispatch(m); - break; - - case MDS_PORT_BALANCER: - balancer->proc_message(m); - break; - - case MDS_PORT_MAIN: - proc_message(m); - break; - - case MDS_PORT_SERVER: - server->dispatch(m); - break; - - default: - dout(1) << "MDS dispatch unknown message port" << m->get_dest_port() << endl; - assert(0); - } - - - // HACK FOR NOW - if (is_active()) { - // flush log to disk after every op. for now. - mdlog->flush(); - - // trim cache - mdcache->trim(); - } - - // finish any triggered contexts - if (finished_queue.size()) { - dout(7) << "mds has " << finished_queue.size() << " queued contexts" << endl; - list ls; - ls.splice(ls.begin(), finished_queue); - assert(finished_queue.empty()); - finish_contexts(ls); - } - - - - // hack: force hash root? 
- if (false && - mdcache->get_root() && - mdcache->get_root()->dir && - !(mdcache->get_root()->dir->is_hashed() || - mdcache->get_root()->dir->is_hashing())) { - dout(0) << "hashing root" << endl; - mdcache->migrator->hash_dir(mdcache->get_root()->dir); - } - - - - - // HACK to force export to test foreign renames - if (false && whoami == 0) { - static bool didit = false; - - // 7 to 1 - CInode *in = mdcache->get_inode(1001); - if (in && in->is_dir() && !didit) { - CDir *dir = in->get_or_open_dir(mdcache); - if (dir->is_auth()) { - dout(1) << "FORCING EXPORT" << endl; - mdcache->migrator->export_dir(dir,1); - didit = true; - } - } - } - - - - // shut down? - if (is_stopping()) { - if (mdcache->shutdown_pass()) { - dout(7) << "shutdown_pass=true, finished w/ shutdown, moving to up:stopped" << endl; - - // tell monitor we shut down cleanly. - set_want_state(MDSMap::STATE_STOPPED); - } - } - -} - - -void MDS::proc_message(Message *m) -{ - switch (m->get_type()) { - // OSD =============== - /* - case MSG_OSD_MKFS_ACK: - handle_osd_mkfs_ack(m); - return; - */ - case MSG_OSD_OPREPLY: - objecter->handle_osd_op_reply((class MOSDOpReply*)m); - return; - case MSG_OSD_MAP: - handle_osd_map((MOSDMap*)m); - return; - - - // MDS - case MSG_MDS_MAP: - handle_mds_map((MMDSMap*)m); - return; - - case MSG_MDS_BEACON: - handle_mds_beacon((MMDSBeacon*)m); - return; - - case MSG_MDS_SHUTDOWNSTART: // mds0 -> mds1+ - handle_shutdown_start(m); - return; - - case MSG_PING: - handle_ping((MPing*)m); - return; - - default: - assert(0); - } - -} - - - - - - -void MDS::handle_ping(MPing *m) -{ - dout(10) << " received ping from " << m->get_source() << " with seq " << m->seq << endl; - - messenger->send_message(new MPingAck(m), - m->get_source_inst()); - - delete m; -} - diff --git a/branches/marnberg/quota/mds/MDS.h b/branches/marnberg/quota/mds/MDS.h deleted file mode 100644 index 8b3ff1e4aa430..0000000000000 --- a/branches/marnberg/quota/mds/MDS.h +++ /dev/null @@ -1,269 +0,0 @@ -// -*- 
mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __MDS_H -#define __MDS_H - -#include -#include -#include -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "mdstypes.h" - -#include "msg/Dispatcher.h" -#include "include/types.h" -#include "include/Context.h" -#include "common/DecayCounter.h" -#include "common/Logger.h" -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/Timer.h" - -#include "mon/MonMap.h" -#include "MDSMap.h" - -#include "ClientMap.h" - - -#define MDS_PORT_MAIN 0 -#define MDS_PORT_SERVER 1 -#define MDS_PORT_CACHE 2 -#define MDS_PORT_LOCKER 3 -#define MDS_PORT_STORE 4 -#define MDS_PORT_BALANCER 5 -#define MDS_PORT_MIGRATOR 6 -#define MDS_PORT_RENAMER 7 - -#define MDS_PORT_ANCHORCLIENT 10 -#define MDS_PORT_ANCHORMGR 11 - - -#define MDS_INO_ROOT 1 -#define MDS_INO_PGTABLE 2 -#define MDS_INO_LOG_OFFSET 0x100 -#define MDS_INO_IDS_OFFSET 0x200 -#define MDS_INO_INODEFILE_OFFSET 0x300 -#define MDS_INO_ANCHORTABLE 0x400 -#define MDS_INO_BASE 0x1000 - -#define MDS_TRAVERSE_FORWARD 1 -#define MDS_TRAVERSE_DISCOVER 2 // skips permissions checks etc. -#define MDS_TRAVERSE_DISCOVERXLOCK 3 // succeeds on (foreign?) null, xlocked dentries. 
-#define MDS_TRAVERSE_FAIL 4 - - -class filepath; - -class OSDMap; -class Objecter; -class Filer; - -class Server; -class Locker; -class AnchorTable; -class AnchorClient; -class MDCache; -class MDStore; -class MDLog; -class MDBalancer; -class IdAllocator; - -class CInode; -class CDir; -class CDentry; - -class Messenger; -class Message; - -class MClientRequest; -class MClientReply; -class MHashReaddir; -class MHashReaddirReply; - -class MMDSBeacon; - - -class MDS : public Dispatcher { - public: - Mutex mds_lock; - - SafeTimer timer; - - protected: - int whoami; - - public: - Messenger *messenger; - MDSMap *mdsmap; - MonMap *monmap; - OSDMap *osdmap; - Objecter *objecter; - Filer *filer; // for reading/writing to/from osds - - ClientMap clientmap; - - // sub systems - Server *server; - MDCache *mdcache; - Locker *locker; - MDStore *mdstore; - MDLog *mdlog; - MDBalancer *balancer; - - IdAllocator *idalloc; - - AnchorTable *anchormgr; - AnchorClient *anchorclient; - - Logger *logger, *logger2; - - - protected: - // -- MDS state -- - int state; // my confirmed state - int want_state; // the state i want - list waitfor_active; - - map peer_mdsmap_epoch; - - public: - void queue_waitfor_active(Context *c) { waitfor_active.push_back(c); } - - bool is_dne() { return state == MDSMap::STATE_DNE; } - bool is_out() { return state == MDSMap::STATE_OUT; } - bool is_failed() { return state == MDSMap::STATE_FAILED; } - bool is_creating() { return state == MDSMap::STATE_CREATING; } - bool is_starting() { return state == MDSMap::STATE_STARTING; } - bool is_standby() { return state == MDSMap::STATE_STANDBY; } - bool is_replay() { return state == MDSMap::STATE_REPLAY; } - bool is_resolve() { return state == MDSMap::STATE_RESOLVE; } - bool is_rejoin() { return state == MDSMap::STATE_REJOIN; } - bool is_active() { return state == MDSMap::STATE_ACTIVE; } - bool is_stopping() { return state == MDSMap::STATE_STOPPING; } - bool is_stopped() { return state == MDSMap::STATE_STOPPED; } - - void 
set_want_state(int s); - - - // -- waiters -- - list finished_queue; - - void queue_finished(Context *c) { - finished_queue.push_back(c); - } - void queue_finished(list& ls) { - finished_queue.splice( finished_queue.end(), ls ); - } - - // -- keepalive beacon -- - version_t beacon_last_seq; // last seq sent to monitor - map beacon_seq_stamp; // seq # -> time sent - utime_t beacon_last_acked_stamp; // last time we sent a beacon that got acked - Context *beacon_sender; - Context *beacon_killer; // next scheduled time of death - - // tick and other timer fun - Context *tick_event; - void reset_tick(); - - - - // shutdown crap - int req_rate; - - // ino's and fh's - public: - - int get_req_rate() { return req_rate; } - - protected: - - friend class MDStore; - - - public: - MDS(int whoami, Messenger *m, MonMap *mm); - ~MDS(); - - // who am i etc - int get_nodeid() { return whoami; } - MDSMap *get_mds_map() { return mdsmap; } - OSDMap *get_osd_map() { return osdmap; } - - void send_message_mds(Message *m, int mds, int port=0, int fromport=0); - - // start up, shutdown - int init(bool standby=false); - void reopen_logger(); - - void boot_create(); // i am new mds. - void boot_start(); // i am old but empty (was down:out) mds. - void boot_replay(int step=0); // i am recovering existing (down:failed) mds. 
- void boot_finish(); - - int shutdown_start(); - int shutdown_final(); - - void tick(); - - void beacon_start(); - void beacon_send(); - void beacon_kill(utime_t lab); - void handle_mds_beacon(MMDSBeacon *m); - void reset_beacon_killer(); - - // messages - void proc_message(Message *m); - virtual void dispatch(Message *m); - void my_dispatch(Message *m); - - // special message types - void handle_ping(class MPing *m); - - void handle_mds_map(class MMDSMap *m); - - void handle_shutdown_start(Message *m); - - // osds - void handle_osd_getmap(Message *m); - void handle_osd_map(class MOSDMap *m); - -}; - - - -class C_MDS_RetryMessage : public Context { - Message *m; - MDS *mds; -public: - C_MDS_RetryMessage(MDS *mds, Message *m) { - assert(m); - this->m = m; - this->mds = mds; - } - virtual void finish(int r) { - mds->my_dispatch(m); - } -}; - - - -#endif diff --git a/branches/marnberg/quota/mds/MDSMap.h b/branches/marnberg/quota/mds/MDSMap.h deleted file mode 100644 index 66b086e5ea39f..0000000000000 --- a/branches/marnberg/quota/mds/MDSMap.h +++ /dev/null @@ -1,288 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MDSMAP_H -#define __MDSMAP_H - -#include "common/Clock.h" -#include "msg/Message.h" - -#include "include/types.h" - -#include -#include -#include -using namespace std; - -class MDSMap { - public: - // mds states - static const int STATE_DNE = 0; // down, never existed. - static const int STATE_OUT = 1; // down, once existed, but no imports, empty log. - static const int STATE_FAILED = 2; // down, holds (er, held) metadata; needs to be recovered. 
- - static const int STATE_STANDBY = 3; // up, but inactive. waiting for assignment by monitor. - static const int STATE_CREATING = 4; // up, creating MDS instance (new journal, idalloc..) - static const int STATE_STARTING = 5; // up, starting prior out MDS instance. - static const int STATE_REPLAY = 6; // up, scanning journal, recoverying any shared state - static const int STATE_RESOLVE = 7; // up, disambiguating partial distributed operations (import/export, ...rename?) - static const int STATE_REJOIN = 8; // up, replayed journal, rejoining distributed cache - static const int STATE_ACTIVE = 9; // up, active - static const int STATE_STOPPING = 10; // up, exporting metadata (-> standby or out) - static const int STATE_STOPPED = 11; // up, finished stopping. like standby, but not avail to takeover. - - static const char *get_state_name(int s) { - switch (s) { - // down - case STATE_DNE: return "down:dne"; - case STATE_OUT: return "down:out"; - case STATE_FAILED: return "down:failed"; - // up - case STATE_STANDBY: return "up:standby"; - case STATE_CREATING: return "up:creating"; - case STATE_STARTING: return "up:starting"; - case STATE_REPLAY: return "up:replay"; - case STATE_RESOLVE: return "up:resolve"; - case STATE_REJOIN: return "up:rejoin"; - case STATE_ACTIVE: return "up:active"; - case STATE_STOPPING: return "up:stopping"; - case STATE_STOPPED: return "up:stopped"; - default: assert(0); - } - return 0; - } - - protected: - epoch_t epoch; - utime_t ctime; - - int anchortable; // which MDS has anchortable (fixme someday) - int root; // which MDS has root directory - - set mds_created; // which mds ids have initialized journals and id tables. 
- map mds_state; // MDS state - map mds_state_seq; - map mds_inst; // up instances - map mds_inc; // incarnation count (monotonically increases) - - friend class MDSMonitor; - - public: - MDSMap() : epoch(0), anchortable(0), root(0) {} - - epoch_t get_epoch() const { return epoch; } - void inc_epoch() { epoch++; } - - const utime_t& get_ctime() const { return ctime; } - - int get_anchortable() const { return anchortable; } - int get_root() const { return root; } - - // counts - int get_num_mds() const { return mds_state.size(); } - int get_num_mds(int state) { - int n = 0; - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (p->second == state) ++n; - return n; - } - int get_num_up_mds() { - int n = 0; - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (is_up(p->first)) ++n; - return n; - } - int get_num_up_or_failed_mds() { - int n = 0; - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (is_up(p->first) || is_failed(p->first)) - ++n; - return n; - } - - // sets - void get_mds_set(set& s) { - s.clear(); - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - s.insert(p->first); - } - void get_up_mds_set(set& s) { - s.clear(); - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (is_up(p->first)) - s.insert(p->first); - } - void get_mds_set(set& s, int state) { - s.clear(); - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (p->second == state) - s.insert(p->first); - } - void get_active_mds_set(set& s) { - get_mds_set(s, MDSMap::STATE_ACTIVE); - } - void get_failed_mds_set(set& s) { - get_mds_set(s, MDSMap::STATE_FAILED); - } - void get_recovery_mds_set(set& s) { - s.clear(); - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (is_failed(p->first) || - is_replay(p->first) || is_resolve(p->first) || is_rejoin(p->first) || - 
is_active(p->first) || is_stopping(p->first)) - s.insert(p->first); - } - - - // mds states - bool is_down(int m) { return is_dne(m) || is_out(m) || is_failed(m); } - bool is_up(int m) { return !is_down(m); } - - bool is_dne(int m) { return mds_state.count(m) == 0 || mds_state[m] == STATE_DNE; } - bool is_out(int m) { return mds_state.count(m) && mds_state[m] == STATE_OUT; } - bool is_failed(int m) { return mds_state.count(m) && mds_state[m] == STATE_FAILED; } - - bool is_standby(int m) { return mds_state.count(m) && mds_state[m] == STATE_STANDBY; } - bool is_creating(int m) { return mds_state.count(m) && mds_state[m] == STATE_CREATING; } - bool is_starting(int m) { return mds_state.count(m) && mds_state[m] == STATE_STARTING; } - bool is_replay(int m) { return mds_state.count(m) && mds_state[m] == STATE_REPLAY; } - bool is_resolve(int m) { return mds_state.count(m) && mds_state[m] == STATE_RESOLVE; } - bool is_rejoin(int m) { return mds_state.count(m) && mds_state[m] == STATE_REJOIN; } - bool is_active(int m) { return mds_state.count(m) && mds_state[m] == STATE_ACTIVE; } - bool is_stopping(int m) { return mds_state.count(m) && mds_state[m] == STATE_STOPPING; } - bool is_stopped(int m) { return mds_state.count(m) && mds_state[m] == STATE_STOPPED; } - - bool has_created(int m) { return mds_created.count(m); } - - // cluster states - bool is_degraded() { // degraded = some recovery in process. fixes active membership and recovery_set. 
- return get_num_mds(STATE_REPLAY) + - get_num_mds(STATE_RESOLVE) + - get_num_mds(STATE_REJOIN) + - get_num_mds(STATE_FAILED); - } - /*bool is_resolving() { // nodes are resolving distributed ops - return get_num_mds(STATE_RESOLVE); - }*/ - bool is_rejoining() { - // nodes are rejoining cache state - return get_num_mds(STATE_REJOIN) > 0 && - get_num_mds(STATE_RESOLVE) == 0 && - get_num_mds(STATE_REPLAY) == 0 && - get_num_mds(STATE_FAILED) == 0; - } - - - int get_state(int m) { - if (mds_state.count(m)) return mds_state[m]; - return STATE_OUT; - } - - // inst - bool have_inst(int m) { - return mds_inst.count(m); - } - const entity_inst_t& get_inst(int m) { - assert(mds_inst.count(m)); - return mds_inst[m]; - } - bool get_inst(int m, entity_inst_t& inst) { - if (mds_inst.count(m)) { - inst = mds_inst[m]; - return true; - } - return false; - } - - int get_inst_rank(const entity_addr_t& addr) { - for (map::iterator p = mds_inst.begin(); - p != mds_inst.end(); - ++p) { - if (p->second.addr == addr) return p->first; - } - /*else - for (map::iterator p = mds_inst.begin(); - p != mds_inst.end(); - ++p) { - if (memcmp(&p->second.addr,&inst.addr, sizeof(inst.addr)) == 0) return p->first; - } - */ - - return -1; - } - - int get_inc(int m) { - assert(mds_inc.count(m)); - return mds_inc[m]; - } - - - void remove_mds(int m) { - mds_inst.erase(m); - mds_state.erase(m); - mds_state_seq.erase(m); - } - - - // serialize, unserialize - void encode(bufferlist& blist) { - blist.append((char*)&epoch, sizeof(epoch)); - blist.append((char*)&ctime, sizeof(ctime)); - blist.append((char*)&anchortable, sizeof(anchortable)); - blist.append((char*)&root, sizeof(root)); - - ::_encode(mds_state, blist); - ::_encode(mds_state_seq, blist); - ::_encode(mds_inst, blist); - ::_encode(mds_inc, blist); - } - - void decode(bufferlist& blist) { - int off = 0; - blist.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - blist.copy(off, sizeof(ctime), (char*)&ctime); - off += sizeof(ctime); - 
blist.copy(off, sizeof(anchortable), (char*)&anchortable); - off += sizeof(anchortable); - blist.copy(off, sizeof(root), (char*)&root); - off += sizeof(root); - - ::_decode(mds_state, blist, off); - ::_decode(mds_state_seq, blist, off); - ::_decode(mds_inst, blist, off); - ::_decode(mds_inc, blist, off); - } - - - /*** mapping functions ***/ - - int hash_dentry( inodeno_t dirino, const string& dn ); -}; - -#endif diff --git a/branches/marnberg/quota/mds/MDStore.cc b/branches/marnberg/quota/mds/MDStore.cc deleted file mode 100644 index 13aa270a2ee6c..0000000000000 --- a/branches/marnberg/quota/mds/MDStore.cc +++ /dev/null @@ -1,752 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "MDStore.h" -#include "MDS.h" -#include "MDCache.h" -#include "CInode.h" -#include "CDir.h" -#include "CDentry.h" -#include "MDSMap.h" - -#include "osd/OSDMap.h" -#include "osdc/Filer.h" - -#include "msg/Message.h" - -#include -#include -using namespace std; - - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".store " - - -/* - * separate hashed dir slices into "regions" - */ -size_t get_hash_offset(int hashcode) { - if (hashcode < 0) - return 0; // not hashed - else - return (size_t)(1<<30) * (size_t)(1+hashcode); -} - - - - -// ========================================================================== -// FETCH - - -class C_MDS_Fetch : public Context { - protected: - MDStore *ms; - inodeno_t ino; - - public: - C_MDS_Fetch(MDStore *ms, inodeno_t ino) : Context() { - this->ms = ms; - this->ino = ino; - } - - void finish(int result) { - ms->fetch_dir_2( result, ino ); - } -}; - -/** fetch_dir(dir, context) - * public call to fetch a dir. - */ -void MDStore::fetch_dir( CDir *dir, - Context *c ) -{ - dout(7) << "fetch_dir " << *dir << " context is " << c << endl; - assert(dir->is_auth() || - dir->is_hashed()); - - // wait - if (c) dir->add_waiter(CDIR_WAIT_COMPLETE, c); - - // already fetching? - if (dir->state_test(CDIR_STATE_FETCHING)) { - dout(7) << "already fetching " << *dir << "; waiting" << endl; - return; - } - - // state - dir->state_set(CDIR_STATE_FETCHING); - - // stats - if (mds->logger) mds->logger->inc("fdir"); - - // create return context - Context *fin = new C_MDS_Fetch( this, dir->ino() ); - if (dir->is_hashed()) - fetch_dir_hash( dir, fin, mds->get_nodeid()); // hashed - else - fetch_dir_hash( dir, fin ); // normal -} - -/* - * called by low level fn when it's fetched. - * fix up dir state. 
- */ -void MDStore::fetch_dir_2( int result, - inodeno_t ino) -{ - CInode *idir = mds->mdcache->get_inode(ino); - - if (!idir || result < 0) return; // hmm! nevermind i guess. - - assert(idir); - CDir *dir = idir->dir; - assert(dir); - - // dir is now complete - dir->state_set(CDIR_STATE_COMPLETE); - dir->state_clear(CDIR_STATE_FETCHING); - - // finish - list finished; - dir->take_waiting(CDIR_WAIT_COMPLETE|CDIR_WAIT_DENTRY, finished); - finish_contexts(finished, result); -} - - -/** low level methods **/ - -class C_MDS_FetchHash : public Context { -protected: - MDS *mds; - inode_t inode; - int hashcode; - Context *context; - -public: - bufferlist bl; - bufferlist bl2; - - C_MDS_FetchHash(MDS *mds, inode_t inode, Context *c, int hashcode) : Context() { - this->mds = mds; - this->inode = inode; - this->hashcode = hashcode; - this->context = c; - } - - void finish(int result) { - assert(result>0); - - // combine bufferlists bl + bl2 -> bl - bl.claim_append(bl2); - - // did i get the whole thing? - size_t size; - bl.copy(0, sizeof(size_t), (char*)&size); - size_t got = bl.length() - sizeof(size); - size_t left = size - got; - size_t from = bl.length(); - - // what part of dir are we getting? - from += get_hash_offset(hashcode); - - if (got >= size) { - // done. - mds->mdstore->fetch_dir_hash_2( bl, inode, context, hashcode ); - } - else { - // read the rest! - dout(12) << "fetch_dir_hash_2 dir size is " << size << ", got " << got << ", reading remaniing " << left << " from off " << from << endl; - - // create return context - C_MDS_FetchHash *fin = new C_MDS_FetchHash( mds, inode, context, hashcode ); - fin->bl.claim( bl ); - mds->filer->read(inode, - from, left, - &fin->bl2, - fin ); - return; - } - } -}; - -/** fetch_dir_hash - * low level method. - * fetch part of a dir. either the whole thing if hashcode is -1, or a specific - * hash segment. 
- */ -void MDStore::fetch_dir_hash( CDir *dir, - Context *c, - int hashcode) -{ - dout(11) << "fetch_dir_hash hashcode " << hashcode << " " << *dir << endl; - - // create return context - C_MDS_FetchHash *fin = new C_MDS_FetchHash( mds, dir->get_inode()->inode, c, hashcode ); - - // grab first stripe bit (which had better be more than 16 bytes!) - assert(dir->get_inode()->inode.layout.stripe_size >= 16); - mds->filer->read(dir->get_inode()->inode, - get_hash_offset(hashcode), dir->get_inode()->inode.layout.stripe_size, - &fin->bl, - fin ); -} - -void MDStore::fetch_dir_hash_2( bufferlist& bl, - inode_t& inode, - Context *c, - int hashcode) -{ - CInode *idir = mds->mdcache->get_inode(inode.ino); - if (!idir) { - dout(7) << "fetch_dir_hash_2 on ino " << inode.ino << " but no longer in our cache!" << endl; - c->finish(-1); - delete c; - return; - } - - if (!idir->dir_is_auth() || - !idir->dir) { - dout(7) << "fetch_dir_hash_2 on " << *idir << ", but i'm not auth, or dir not open" << endl; - c->finish(-1); - delete c; - return; - } - - // make sure we have a CDir - CDir *dir = idir->get_or_open_dir(mds->mdcache); - - // do it - dout(7) << "fetch_dir_hash_2 hashcode " << hashcode << " dir " << *dir << endl; - - // parse buffer contents into cache - dout(15) << "bl is " << bl << endl; - - int off = 0; - size_t size; - __uint32_t num; - version_t got_version; - int got_hashcode; - bl.copy(off, sizeof(size), (char*)&size); - off += sizeof(size); - assert(bl.length() >= size + sizeof(size)); - bl.copy(off, sizeof(num), (char*)&num); - off += sizeof(num); - bl.copy(off, sizeof(got_version), (char*)&got_version); - off += sizeof(got_version); - bl.copy(off, sizeof(got_hashcode), (char*)&got_hashcode); - off += sizeof(got_hashcode); - - assert(got_hashcode == hashcode); - - int buflen = bl.length(); - - dout(10) << " " << num << " items in " << size << " bytes" << endl; - - unsigned parsed = 0; - while (parsed < num) { - assert(off < buflen && num > 0); - parsed++; - - 
dout(24) << " " << parsed << "/" << num << " pos " << off << endl; - - // dentry - string dname; - ::_decode(dname, bl, off); - dout(24) << "parse filename '" << dname << "'" << endl; - - CDentry *dn = dir->lookup(dname); // existing dentry? - - char type = bl[off]; - ++off; - if (type == 'L') { - // hard link - inodeno_t ino; - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - - // what to do? - if (hashcode >= 0) { - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), dname ); - assert(dentryhashcode == hashcode); - } - - if (dn) { - if (dn->get_inode() == 0) { - // negative dentry? - dout(12) << "readdir had NEG dentry " << dname << endl; - } else { - // had dentry - dout(12) << "readdir had dentry " << dname << endl; - } - continue; - } - - // (remote) link - CDentry *dn = dir->add_dentry( dname, ino ); - - // link to inode? - CInode *in = mds->mdcache->get_inode(ino); // we may or may not have it. - if (in) { - dn->link_remote(in); - dout(12) << "readdir got remote link " << ino << " which we have " << *in << endl; - } else { - dout(12) << "readdir got remote link " << ino << " (dont' have it)" << endl; - } - } - else if (type == 'I') { - // inode - - // parse out inode - inode_t inode; - bl.copy(off, sizeof(inode), (char*)&inode); - off += sizeof(inode); - - string symlink; - if (inode.is_symlink()) - ::_decode(symlink, bl, off); - - // what to do? - if (hashcode >= 0) { - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), dname ); - assert(dentryhashcode == hashcode); - } - - if (dn) { - if (dn->get_inode() == 0) { - // negative dentry? - dout(12) << "readdir had NEG dentry " << dname << endl; - } else { - // had dentry - dout(12) << "readdir had dentry " << dname << endl; - - // under water? 
- if (dn->get_version() <= got_version) { - assert(dn->get_inode()->get_version() <= got_version); - dout(10) << "readdir had underwater dentry " << dname << " and inode, marking clean" << endl; - dn->mark_clean(); - dn->get_inode()->mark_clean(); - } - } - continue; - } - - // add inode - CInode *in = 0; - if (mds->mdcache->have_inode(inode.ino)) { - in = mds->mdcache->get_inode(inode.ino); - dout(12) << "readdir got (but i already had) " << *in - << " mode " << in->inode.mode - << " mtime " << in->inode.mtime << endl; - } else { - // inode - in = new CInode(mds->mdcache); - in->inode = inode; - - // symlink? - if (in->is_symlink()) { - in->symlink = symlink; - } - - // add - mds->mdcache->add_inode( in ); - } - - // link - dir->add_dentry( dname, in ); - dout(12) << "readdir got " << *in << " mode " << in->inode.mode << " mtime " << in->inode.mtime << endl; - } - else { - dout(1) << "corrupt directory, i got tag char '" << type << "' val " << (int)(type) - << " at pos " << off << endl; - assert(0); - } - } - dout(15) << "parsed " << parsed << endl; - - if (c) { - c->finish(0); - delete c; - } -} - - - - -// ================================================================== -// COMMIT - -class C_MDS_CommitDirVerify : public Context { -public: - MDS *mds; - inodeno_t ino; - version_t version; - Context *c; - - C_MDS_CommitDirVerify( MDS *mds, - inodeno_t ino, - version_t version, - Context *c) { - this->mds = mds; - this->c = c; - this->version = version; - this->ino = ino; - } - - virtual void finish(int r) { - - if (r >= 0) { - CInode *in = mds->mdcache->get_inode(ino); - assert(in && in->dir); - if (in && in->dir && in->dir->is_auth()) { - dout(7) << "CommitDirVerify: current = " << in->dir->get_version() - << ", last committed = " << in->dir->get_last_committed_version() - << ", required = " << version << endl; - - if (in->dir->get_last_committed_version() >= version) { - dout(7) << "my required version is safe, done." 
<< endl; - if (c) { - c->finish(0); - delete c; - } - } else { - dout(7) << "my required version is still not safe, committing again." << endl; - - // what was requested isn't committed yet. - mds->mdstore->commit_dir(in->dir, - version, - c); - } - return; - } - } - - // must have exported ors omethign! - dout(7) << "can't retry commit dir on " << ino << ", must have exported?" << endl; - - // finish. - if (c) { - c->finish(-1); - delete c; - } - } -}; - -class C_MDS_CommitDirFinish : public Context { - protected: - MDStore *ms; - CDir *dir; - version_t version; - - public: - - C_MDS_CommitDirFinish(MDStore *ms, CDir *dir) : Context() { - this->ms = ms; - this->dir = dir; - this->version = dir->get_version(); // just for sanity check later - } - - void finish(int result) { - ms->commit_dir_2( result, dir, version ); - } -}; - - -void MDStore::commit_dir( CDir *dir, - Context *c ) -{ - assert(dir->is_dirty()); - - // commit thru current version - commit_dir(dir, dir->get_version(), c); -} - -void MDStore::commit_dir( CDir *dir, - version_t version, - Context *c ) -{ - assert(dir->is_auth() || - dir->is_hashed()); - - // already committing? - if (dir->state_test(CDIR_STATE_COMMITTING)) { - // already mid-commit! - dout(7) << "commit_dir " << *dir << " mid-commit of " << dir->get_committing_version() << endl; - dout(7) << " current version = " << dir->get_version() << endl; - dout(7) << "requested version = " << version << endl; - - assert(version >= dir->get_last_committed_version()); // why would we request _old_ one? - - dir->add_waiter(CDIR_WAIT_COMMITTED, - new C_MDS_CommitDirVerify(mds, dir->ino(), version, c) ); - return; - } - - if (!dir->can_auth_pin()) { - // something must be frozen up the hiearchy! - dout(7) << "commit_dir " << *dir << " can't auth_pin, waiting" << endl; - dir->add_waiter(CDIR_WAIT_AUTHPINNABLE, - new C_MDS_CommitDirVerify(mds, dir->ino(), version, c) ); - return; - } - - - // is it complete? 
- if (!dir->is_complete()) { - dout(7) << "commit_dir " << *dir << " not complete, fetching first" << endl; - // fetch dir first - fetch_dir(dir, - new C_MDS_CommitDirVerify(mds, dir->ino(), version, c) ); - return; - } - - - // ok go - dout(7) << "commit_dir " << *dir << " version " << dir->get_version() << endl; - - // add waiter - if (c) dir->add_waiter(CDIR_WAIT_COMMITTED, c); - - // get continuation ready - Context *fin = new C_MDS_CommitDirFinish(this, dir); - - // state - dir->state_set(CDIR_STATE_COMMITTING); - dir->set_committing_version(); - - // stats - if (mds->logger) mds->logger->inc("cdir"); - - if (dir->is_hashed()) { - // hashed - commit_dir_slice( dir, fin, mds->get_nodeid() ); - } else { - // non-hashed - commit_dir_slice( dir, fin ); - } -} - -void MDStore::commit_dir_2( int result, - CDir *dir, - version_t committed_version) -{ - dout(5) << "commit_dir_2 " << *dir << " committed " << committed_version << ", current version " << dir->get_version() << endl; - assert(committed_version == dir->get_committing_version()); - - // remember which version is now safe - dir->set_last_committed_version(committed_version); - - // is the dir now clean? 
- if (committed_version == dir->get_version()) - dir->mark_clean(); - - dir->state_clear(CDIR_STATE_COMMITTING); - - // finish - dir->finish_waiting(CDIR_WAIT_COMMITTED); -} - - - - -// low-level committer (hashed or normal) - -class C_MDS_CommitSlice : public Context { - protected: - MDStore *ms; - CDir *dir; - Context *c; - int hashcode; - version_t version; - -public: - bufferlist bl; - - C_MDS_CommitSlice(MDStore *ms, CDir *dir, Context *c, int w) : Context() { - this->ms = ms; - this->dir = dir; - this->c = c; - this->hashcode = w; - version = dir->get_version(); - } - - void finish(int result) { - ms->commit_dir_slice_2( result, dir, c, version, hashcode ); - } -}; - - -void MDStore::commit_dir_slice( CDir *dir, - Context *c, - int hashcode) -{ - if (hashcode >= 0) { - assert(dir->is_hashed()); - dout(10) << "commit_dir_slice hashcode " << hashcode << " " << *dir << " version " << dir->get_version() << endl; - } else { - assert(dir->is_auth()); - dout(10) << "commit_dir_slice (whole dir) " << *dir << " version " << dir->get_version() << endl; - } - - // get continuation ready - C_MDS_CommitSlice *fin = new C_MDS_CommitSlice(this, dir, c, hashcode); - - // fill buffer - __uint32_t num = 0; - - bufferlist dirdata; - - version_t v = dir->get_version(); - dirdata.append((char*)&v, sizeof(v)); - dirdata.append((char*)&hashcode, sizeof(hashcode)); - - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - - if (hashcode >= 0) { - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); - if (dentryhashcode != hashcode) continue; - } - - if (dn->is_null()) continue; // skipping negative entry - - // primary or remote? 
- if (dn->is_remote()) { - - inodeno_t ino = dn->get_remote_ino(); - dout(14) << " pos " << dirdata.length() << " dn '" << it->first << "' remote ino " << ino << endl; - - // name, marker, ion - dirdata.append( it->first.c_str(), it->first.length() + 1); - dirdata.append( "L", 1 ); // remote link - dirdata.append((char*)&ino, sizeof(ino)); - - } else { - // primary link - CInode *in = dn->get_inode(); - assert(in); - - dout(14) << " pos " << dirdata.length() << " dn '" << it->first << "' inode " << *in << endl; - - // name, marker, inode, [symlink string] - dirdata.append( it->first.c_str(), it->first.length() + 1); - dirdata.append( "I", 1 ); // inode - dirdata.append( (char*) &in->inode, sizeof(inode_t)); - - if (in->is_symlink()) { - // include symlink destination! - dout(18) << " inlcuding symlink ptr " << in->symlink << endl; - dirdata.append( (char*) in->symlink.c_str(), in->symlink.length() + 1); - } - } - - num++; - } - dout(14) << "num " << num << endl; - - // put count in buffer - //bufferlist bl; - size_t size = sizeof(num) + dirdata.length(); - fin->bl.append((char*)&size, sizeof(size)); - fin->bl.append((char*)&num, sizeof(num)); - fin->bl.claim_append(dirdata); //.c_str(), dirdata.length()); - assert(fin->bl.length() == size + sizeof(size)); - - // pin inode - dir->auth_pin(); - - // submit to osd - mds->filer->write( dir->get_inode()->inode, - 0, fin->bl.length(), - fin->bl, - 0, //OSD_OP_FLAGS_TRUNCATE, // truncate file/object after end of this write - NULL, fin ); // on safe -} - - -void MDStore::commit_dir_slice_2( int result, - CDir *dir, - Context *c, - version_t committed_version, - int hashcode ) -{ - dout(11) << "commit_dir_slice_2 hashcode " << hashcode << " " << *dir << " v " << committed_version << endl; - - // mark inodes and dentries clean too (if we committed them!) 
- list null_clean; - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); ) { - CDentry *dn = it->second; - it++; - - if (hashcode >= 0) { - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), dn->get_name() ); - if (dentryhashcode != hashcode) continue; - } - - // dentry - if (committed_version >= dn->get_version()) { - if (dn->is_dirty()) { - dout(15) << " dir " << committed_version << " >= dn " << dn->get_version() << " now clean " << *dn << endl; - dn->mark_clean(); - } - } else { - dout(15) << " dir " << committed_version << " < dn " << dn->get_version() << " still dirty " << *dn << endl; - } - - // only do primary... - if (!dn->is_primary()) - continue; - - CInode *in = dn->get_inode(); - assert(in); - assert(in->is_auth()); - - if (committed_version >= in->get_version()) { - if (in->is_dirty()) { - dout(15) << " dir " << committed_version << " >= inode " << in->get_version() << " now clean " << *in << endl; - in->mark_clean(); - } - } else { - dout(15) << " dir " << committed_version << " < inode " << in->get_version() << " still dirty " << *in << endl; - assert(in->is_dirty()); - } - } - - // unpin - dir->auth_unpin(); - - // finish - if (c) { - c->finish(0); - delete c; - } -} - - - - - - - - - - - - diff --git a/branches/marnberg/quota/mds/MDStore.h b/branches/marnberg/quota/mds/MDStore.h deleted file mode 100644 index fe7553608a975..0000000000000 --- a/branches/marnberg/quota/mds/MDStore.h +++ /dev/null @@ -1,75 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __MDSTORE_H -#define __MDSTORE_H - -#include "include/types.h" -#include "include/buffer.h" - -class MDS; -class CDir; -class Context; - -class MDStore { - protected: - MDS *mds; - - - public: - MDStore(MDS *m) { - mds = m; - } - - - // fetch - public: - void fetch_dir( CDir *dir, Context *c ); - protected: - void fetch_dir_2( int result, inodeno_t ino ); - - void fetch_dir_hash( CDir *dir, - Context *c, - int hashcode = -1); - void fetch_dir_hash_2( bufferlist &bl, - inode_t& inode, - Context *c, - int which); - friend class C_MDS_Fetch; - friend class C_MDS_FetchHash; - - // commit - public: - void commit_dir( CDir *dir, Context *c ); // commit current dir version to disk. - void commit_dir( CDir *dir, __uint64_t version, Context *c ); // commit specified version to disk - protected: - void commit_dir_2( int result, CDir *dir, __uint64_t committed_version ); - - // low level committers - void commit_dir_slice( CDir *dir, - Context *c, - int hashcode = -1); - void commit_dir_slice_2( int result, - CDir *dir, - Context *c, - __uint64_t version, - int hashcode ); - - friend class C_MDS_CommitDirFinish; - friend class C_MDS_CommitSlice; -}; - - -#endif diff --git a/branches/marnberg/quota/mds/Migrator.cc b/branches/marnberg/quota/mds/Migrator.cc deleted file mode 100644 index 5d14bfbee4283..0000000000000 --- a/branches/marnberg/quota/mds/Migrator.cc +++ /dev/null @@ -1,3616 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#include "MDS.h" -#include "MDCache.h" -#include "CInode.h" -#include "CDir.h" -#include "CDentry.h" -#include "Migrator.h" -#include "Locker.h" -#include "MDStore.h" -#include "Migrator.h" - -#include "MDBalancer.h" -#include "MDLog.h" -#include "MDSMap.h" - -#include "include/filepath.h" - -#include "events/EString.h" -#include "events/EExportStart.h" -#include "events/EExportFinish.h" -#include "events/EImportStart.h" -#include "events/EImportFinish.h" - -#include "msg/Messenger.h" - -#include "messages/MClientFileCaps.h" - -#include "messages/MExportDirDiscover.h" -#include "messages/MExportDirDiscoverAck.h" -#include "messages/MExportDirPrep.h" -#include "messages/MExportDirPrepAck.h" -#include "messages/MExportDirWarning.h" -#include "messages/MExportDir.h" -#include "messages/MExportDirNotify.h" -#include "messages/MExportDirNotifyAck.h" -#include "messages/MExportDirFinish.h" - -#include "messages/MHashDirDiscover.h" -#include "messages/MHashDirDiscoverAck.h" -#include "messages/MHashDirPrep.h" -#include "messages/MHashDirPrepAck.h" -#include "messages/MHashDir.h" -#include "messages/MHashDirNotify.h" -#include "messages/MHashDirAck.h" - -#include "messages/MUnhashDirPrep.h" -#include "messages/MUnhashDirPrepAck.h" -#include "messages/MUnhashDir.h" -#include "messages/MUnhashDirAck.h" -#include "messages/MUnhashDirNotify.h" -#include "messages/MUnhashDirNotifyAck.h" - - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".migrator " - - - -void Migrator::dispatch(Message *m) -{ - switch (m->get_type()) { - // import - case MSG_MDS_EXPORTDIRDISCOVER: - handle_export_dir_discover((MExportDirDiscover*)m); - break; - case MSG_MDS_EXPORTDIRPREP: - handle_export_dir_prep((MExportDirPrep*)m); - break; - case MSG_MDS_EXPORTDIR: - handle_export_dir((MExportDir*)m); - break; - case MSG_MDS_EXPORTDIRFINISH: - 
handle_export_dir_finish((MExportDirFinish*)m); - break; - - // export - case MSG_MDS_EXPORTDIRDISCOVERACK: - handle_export_dir_discover_ack((MExportDirDiscoverAck*)m); - break; - case MSG_MDS_EXPORTDIRPREPACK: - handle_export_dir_prep_ack((MExportDirPrepAck*)m); - break; - case MSG_MDS_EXPORTDIRNOTIFYACK: - handle_export_dir_notify_ack((MExportDirNotifyAck*)m); - break; - - // export 3rd party (inode authority) - case MSG_MDS_EXPORTDIRWARNING: - handle_export_dir_warning((MExportDirWarning*)m); - break; - case MSG_MDS_EXPORTDIRNOTIFY: - handle_export_dir_notify((MExportDirNotify*)m); - break; - - - // hashing - case MSG_MDS_HASHDIRDISCOVER: - handle_hash_dir_discover((MHashDirDiscover*)m); - break; - case MSG_MDS_HASHDIRDISCOVERACK: - handle_hash_dir_discover_ack((MHashDirDiscoverAck*)m); - break; - case MSG_MDS_HASHDIRPREP: - handle_hash_dir_prep((MHashDirPrep*)m); - break; - case MSG_MDS_HASHDIRPREPACK: - handle_hash_dir_prep_ack((MHashDirPrepAck*)m); - break; - case MSG_MDS_HASHDIR: - handle_hash_dir((MHashDir*)m); - break; - case MSG_MDS_HASHDIRACK: - handle_hash_dir_ack((MHashDirAck*)m); - break; - case MSG_MDS_HASHDIRNOTIFY: - handle_hash_dir_notify((MHashDirNotify*)m); - break; - - // unhashing - case MSG_MDS_UNHASHDIRPREP: - handle_unhash_dir_prep((MUnhashDirPrep*)m); - break; - case MSG_MDS_UNHASHDIRPREPACK: - handle_unhash_dir_prep_ack((MUnhashDirPrepAck*)m); - break; - case MSG_MDS_UNHASHDIR: - handle_unhash_dir((MUnhashDir*)m); - break; - case MSG_MDS_UNHASHDIRACK: - handle_unhash_dir_ack((MUnhashDirAck*)m); - break; - case MSG_MDS_UNHASHDIRNOTIFY: - handle_unhash_dir_notify((MUnhashDirNotify*)m); - break; - case MSG_MDS_UNHASHDIRNOTIFYACK: - handle_unhash_dir_notify_ack((MUnhashDirNotifyAck*)m); - break; - - default: - assert(0); - } -} - - -class C_MDC_EmptyImport : public Context { - Migrator *mig; - CDir *dir; -public: - C_MDC_EmptyImport(Migrator *m, CDir *d) : mig(m), dir(d) {} - void finish(int r) { - mig->export_empty_import(dir); - } -}; - - 
-void Migrator::export_empty_import(CDir *dir) -{ - dout(7) << "export_empty_import " << *dir << endl; - - return; // hack fixme - - if (!dir->is_import()) { - dout(7) << "not import (anymore?)" << endl; - return; - } - if (dir->inode->is_root()) { - dout(7) << "root" << endl; - return; - } - - if (dir->get_size() > 0) { - dout(7) << "not actually empty" << endl; - return; - } - - // is it really empty? - if (!dir->is_complete()) { - dout(7) << "not complete, fetching." << endl; - mds->mdstore->fetch_dir(dir, - new C_MDC_EmptyImport(this,dir)); - return; - } - - int dest = dir->inode->authority(); - - // comment this out ot wreak havoc? - //if (mds->is_shutting_down()) dest = 0; // this is more efficient. - - dout(7) << "really empty, exporting to " << dest << endl; - assert (dest != mds->get_nodeid()); - - dout(-7) << "exporting to mds" << dest - << " empty import " << *dir << endl; - export_dir( dir, dest ); -} - - - - -// ========================================================== -// mds failure handling - -void Migrator::handle_mds_failure(int who) -{ - dout(5) << "handle_mds_failure mds" << who << endl; - - // check my exports - map::iterator p = export_state.begin(); - while (p != export_state.end()) { - map::iterator next = p; - next++; - CDir *dir = p->first; - - if (export_peer[dir] == who) { - // the guy i'm exporting to failed. - // clean up. 
- dout(10) << "cleaning up export state " << p->second << " of " << *dir << endl; - - switch (p->second) { - case EXPORT_DISCOVERING: - dout(10) << "state discovering : canceling freeze and removing auth_pin" << endl; - dir->unfreeze_tree(); // cancel the freeze - dir->auth_unpin(); // remove the auth_pin (that was holding up the freeze) - break; - - case EXPORT_FREEZING: - dout(10) << "state freezing : canceling freeze" << endl; - dir->unfreeze_tree(); // cancel the freeze - break; - - case EXPORT_LOGGINGSTART: - case EXPORT_PREPPING: - dout(10) << "state loggingstart|prepping : logging EExportFinish(false)" << endl; - mds->mdlog->submit_entry(new EExportFinish(dir,false)); - // logger will unfreeze. - break; - - case EXPORT_EXPORTING: - dout(10) << "state exporting : logging EExportFinish(false), reversing, and unfreezing" << endl; - mds->mdlog->submit_entry(new EExportFinish(dir,false)); - reverse_export(dir); - dir->unfreeze_tree(); - break; - - case EXPORT_LOGGINGFINISH: - dout(10) << "state loggingfinish : doing nothing, we were successful." << endl; - break; - - default: - assert(0); - } - - export_state.erase(dir); - export_peer.erase(dir); - - // unpin the path - vector trace; - cache->make_trace(trace, dir->inode); - cache->path_unpin(trace, 0); - - // wake up any waiters - mds->queue_finished(export_finish_waiters[dir]); - export_finish_waiters.erase(dir); - - // send pending import_maps? - mds->mdcache->send_pending_import_maps(); - - mds->mdcache->show_imports(); - mds->mdcache->show_cache(); - } else { - // third party failed. potential peripheral damage? - if (p->second == EXPORT_EXPORTING) { - // yeah, i'm waiting for acks, let's fake theirs. 
- if (export_notify_ack_waiting[dir].count(who)) { - dout(10) << "faking export_dir_notify_ack from mds" << who - << " on " << *dir << " to mds" << export_peer[dir] - << endl; - export_notify_ack_waiting[dir].erase(who); - if (export_notify_ack_waiting[dir].empty()) - export_dir_acked(dir); - } - } - } - - // next! - p = next; - } - - - // check my imports - map::iterator q = import_state.begin(); - while (q != import_state.end()) { - map::iterator next = q; - next++; - inodeno_t dirino = q->first; - CInode *diri = mds->mdcache->get_inode(dirino); - CDir *dir = 0; - if (diri) - dir = diri->dir; - - if (import_peer[dirino] == who) { - switch (import_peer[dirino]) { - case IMPORT_DISCOVERED: - - break; - - case IMPORT_PREPPING: - - break; - - case IMPORT_PREPPED: - - break; - - case IMPORT_LOGGINGSTART: - - break; - - case IMPORT_ACKING: - // hrm. make this an ambiguous import, and wait for exporter recovery to disambiguate - // ... - break; - - case IMPORT_LOGGINGFINISH: - // do nothing, exporter is no longer involved. - break; - } - } - - // next! - q = next; - } -} - - - - - - -// ========================================================== -// EXPORT - - -class C_MDC_ExportFreeze : public Context { - Migrator *mig; - CDir *ex; // dir i'm exporting - int dest; - -public: - C_MDC_ExportFreeze(Migrator *m, CDir *e, int d) : - mig(m), ex(e), dest(d) {} - virtual void finish(int r) { - if (r >= 0) - mig->export_dir_frozen(ex, dest); - } -}; - - - -/** export_dir(dir, dest) - * public method to initiate an export. - * will fail if the directory is freezing, frozen, unpinnable, or root. 
- */ -void Migrator::export_dir(CDir *dir, - int dest) -{ - dout(7) << "export_dir " << *dir << " to " << dest << endl; - assert(dest != mds->get_nodeid()); - assert(!dir->is_hashed()); - - if (mds->mdsmap->is_degraded()) { - dout(7) << "cluster degraded, no exports for now" << endl; - return; - } - - if (dir->inode->is_root()) { - dout(7) << "i won't export root" << endl; - assert(0); - return; - } - - if (dir->is_frozen() || - dir->is_freezing()) { - dout(7) << " can't export, freezing|frozen. wait for other exports to finish first." << endl; - return; - } - if (dir->is_hashed()) { - dout(7) << "can't export hashed dir right now. implement me carefully later." << endl; - return; - } - - - // pin path? - vector trace; - cache->make_trace(trace, dir->inode); - if (!cache->path_pin(trace, 0, 0)) { - dout(7) << "export_dir couldn't pin path, failing." << endl; - return; - } - - // ok, let's go. - assert(export_state.count(dir) == 0); - export_state[dir] = EXPORT_DISCOVERING; - export_peer[dir] = dest; - - // send ExportDirDiscover (ask target) - mds->send_message_mds(new MExportDirDiscover(dir->inode), dest, MDS_PORT_MIGRATOR); - dir->auth_pin(); // pin dir, to hang up our freeze (unpin on prep ack) - - // take away the popularity we're sending. FIXME: do this later? - mds->balancer->subtract_export(dir); - - // freeze the subtree - dir->freeze_tree(new C_MDC_ExportFreeze(this, dir, dest)); -} - - -/* - * called on receipt of MExportDirDiscoverAck - * the importer now has the directory's _inode_ in memory, and pinned. 
- */ -void Migrator::handle_export_dir_discover_ack(MExportDirDiscoverAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "export_dir_discover_ack from " << m->get_source() - << " on " << *dir << ", releasing auth_pin" << endl; - - export_state[dir] = EXPORT_FREEZING; - - dir->auth_unpin(); // unpin to allow freeze to complete - - delete m; // done -} - -class C_MDC_ExportStartLogged : public Context { - Migrator *mig; - CDir *ex; // dir i'm exporting - int dest; - MExportDirPrep *prep; - -public: - C_MDC_ExportStartLogged(Migrator *m, CDir *e, int d, MExportDirPrep *p) : - mig(m), ex(e), dest(d), prep(p) {} - virtual void finish(int r) { - mig->export_dir_frozen_logged(ex, prep, dest); - } -}; - -void Migrator::export_dir_frozen(CDir *dir, - int dest) -{ - // subtree is now frozen! - dout(7) << "export_dir_frozen on " << *dir << " to " << dest << endl; - export_state[dir] = EXPORT_LOGGINGSTART; - - show_imports(); - - EExportStart *le = new EExportStart(dir, dest); - MExportDirPrep *prep = new MExportDirPrep(dir->inode); - - // include spanning tree for all nested exports. - // these need to be on the destination _before_ the final export so that - // dir_auth updates on any nested exports are properly absorbed. - - set inodes_added; - - // include base dir - prep->add_dir( new CDirDiscover(dir, dir->add_replica(dest)) ); - le->metablob.add_dir( dir, false ); - - // also include traces to all nested exports. 
- set my_nested; - cache->find_nested_exports(dir, my_nested); - for (set::iterator it = my_nested.begin(); - it != my_nested.end(); - it++) { - CDir *exp = *it; - - dout(7) << " including nested export " << *exp << " in prep" << endl; - - prep->add_export( exp->ino() ); - le->get_bounds().insert(exp->ino()); - le->metablob.add_dir_context( exp ); - le->metablob.add_dir( exp, false ); - - /* first assemble each trace, in trace order, and put in message */ - list inode_trace; - - // trace to dir - CDir *cur = exp; - while (cur != dir) { - // don't repeat ourselves - if (inodes_added.count(cur->ino())) break; // did already! - inodes_added.insert(cur->ino()); - - CDir *parent_dir = cur->get_parent_dir(); - - // inode? - assert(cur->inode->is_auth()); - inode_trace.push_front(cur->inode); - dout(7) << " will add " << *cur->inode << endl; - - // include dir? note: this'll include everything except the nested exports themselves, - // since someone else is obviously auth. - if (cur->is_auth()) { - prep->add_dir( new CDirDiscover(cur, cur->add_replica(dest)) ); // yay! - dout(7) << " added " << *cur << endl; - } - - cur = parent_dir; - } - - for (list::iterator it = inode_trace.begin(); - it != inode_trace.end(); - it++) { - CInode *in = *it; - dout(7) << " added " << *in << endl; - prep->add_inode( in->parent->get_dir()->ino(), - in->parent->get_name(), - in->replicate_to(dest) ); - } - - } - - // log our intentions - dout(7) << " logging EExportStart" << endl; - mds->mdlog->submit_entry(le, new C_MDC_ExportStartLogged(this, dir, dest, prep)); -} - -void Migrator::export_dir_frozen_logged(CDir *dir, MExportDirPrep *prep, int dest) -{ - dout(7) << "export_dir_frozen_logged " << *dir << endl; - - if (export_state.count(dir) == 0 || - export_state[dir] != EXPORT_LOGGINGSTART) { - // export must have aborted. 
- dout(7) << "export must have aborted, unfreezing and deleting me old prep message" << endl; - delete prep; - dir->unfreeze_tree(); // cancel the freeze - return; - } - - export_state[dir] = EXPORT_PREPPING; - mds->send_message_mds(prep, dest, MDS_PORT_MIGRATOR); -} - -void Migrator::handle_export_dir_prep_ack(MExportDirPrepAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "export_dir_prep_ack " << *dir << ", starting export" << endl; - - if (export_state.count(dir) == 0 || - export_state[dir] != EXPORT_PREPPING) { - // export must have aborted. - dout(7) << "export must have aborted, unfreezing" << endl; - dir->unfreeze_tree(); - return; - } - - // start export. - export_state[dir] = EXPORT_EXPORTING; - export_dir_go(dir, m->get_source().num()); - - // done - delete m; -} - - -void Migrator::export_dir_go(CDir *dir, - int dest) -{ - dout(7) << "export_dir_go " << *dir << " to " << dest << endl; - - show_imports(); - - assert(export_bounds.count(dir) == 0); - assert(export_data.count(dir) == 0); - - // update imports/exports - CDir *containing_import = cache->get_auth_container(dir); - - if (containing_import == dir) { - dout(7) << " i'm rexporting a previous import" << endl; - assert(dir->is_import()); - cache->imports.erase(dir); - dir->state_clear(CDIR_STATE_IMPORT); - dir->put(CDir::PIN_IMPORT); // unpin, no longer an import - - // discard nested exports (that we're handing off - for (set::iterator p = cache->nested_exports[dir].begin(); - p != cache->nested_exports[dir].end(); ) { - CDir *nested = *p; - p++; - - // add to export message - export_bounds[dir].insert(nested); - - // nested beneath our new export *in; remove! 
- dout(7) << " export " << *nested << " was nested beneath us; removing from export list(s)" << endl; - assert(cache->exports.count(nested) == 1); - cache->nested_exports[dir].erase(nested); - } - - } else { - dout(7) << " i'm a subdir nested under import " << *containing_import << endl; - cache->exports.insert(dir); - cache->nested_exports[containing_import].insert(dir); - - dir->state_set(CDIR_STATE_EXPORT); - dir->get(CDir::PIN_EXPORT); // i must keep it pinned - - // discard nested exports (that we're handing off) - for (set::iterator p = cache->nested_exports[containing_import].begin(); - p != cache->nested_exports[containing_import].end(); ) { - CDir *nested = *p; - p++; - if (nested == dir) continue; // ignore myself - - // container of parent; otherwise we get ourselves. - CDir *containing_export = nested->get_parent_dir(); - while (containing_export && !containing_export->is_export()) - containing_export = containing_export->get_parent_dir(); - if (!containing_export) continue; - - if (containing_export == dir) { - // nested beneath our new export *in; remove! - dout(7) << " export " << *nested << " was nested beneath us; removing from nested_exports" << endl; - cache->nested_exports[containing_import].erase(nested); - // exports.erase(nested); _walk does this - - // add to msg - export_bounds[dir].insert(nested); - } else { - dout(12) << " export " << *nested << " is under other export " << *containing_export << ", which is unrelated" << endl; - assert(cache->get_auth_container(containing_export) != containing_import); - } - } - } - - // note new authority (locally) - if (dir->inode->authority() == dest) - dir->set_dir_auth( CDIR_AUTH_PARENT ); - else - dir->set_dir_auth( dest ); - - - // make list of nodes i expect an export_dir_notify_ack from - // (everyone w/ this dir open, but me!) 
- assert(export_notify_ack_waiting[dir].empty()); - for (map::iterator it = dir->replicas_begin(); - it != dir->replicas_end(); - it++) { - if (it->first == mds->get_nodeid()) continue; - export_notify_ack_waiting[dir].insert( it->first ); - - // send warning to all but dest - if (it->first != dest) { - dout(10) << " sending export_dir_warning to mds" << it->first << endl; - mds->send_message_mds(new MExportDirWarning( dir->ino() ), it->first, MDS_PORT_MIGRATOR); - } - } - assert(export_notify_ack_waiting[dir].count( dest )); - - // fill export message with cache data - C_Contexts *fin = new C_Contexts; // collect all the waiters - int num_exported_inodes = encode_export_dir( export_data[dir], - fin, - dir, // base - dir, // recur start point - dest ); - - // send the export data! - MExportDir *req = new MExportDir(dir->ino()); - - // export state - req->set_dirstate( export_data[dir] ); - - // add bounds - for (set::iterator p = export_bounds[dir].begin(); - p != export_bounds[dir].end(); - ++p) - req->add_export((*p)->ino()); - - //s end - mds->send_message_mds(req, dest, MDS_PORT_MIGRATOR); - - // queue up the finisher - dir->add_waiter( CDIR_WAIT_UNFREEZE, fin ); - - - // stats - if (mds->logger) mds->logger->inc("ex"); - if (mds->logger) mds->logger->inc("iex", num_exported_inodes); - - show_imports(); -} - - -/** encode_export_inode - * update our local state for this inode to export. - * encode relevant state to be sent over the wire. - * used by: export_dir_walk, file_rename (if foreign) - */ -void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, int new_auth) -{ - // tell (all) clients about migrating caps.. 
mark STALE - for (map::iterator it = in->client_caps.begin(); - it != in->client_caps.end(); - it++) { - dout(7) << "encode_export_inode " << *in << " telling client" << it->first << " stale caps" << endl; - MClientFileCaps *m = new MClientFileCaps(in->inode, - it->second.get_last_seq(), - it->second.pending(), - it->second.wanted(), - MClientFileCaps::FILECAP_STALE); - mds->messenger->send_message(m, mds->clientmap.get_inst(it->first), - 0, MDS_PORT_CACHE); - } - - // relax locks? - if (!in->is_replicated()) - in->replicate_relax_locks(); - - // add inode - assert(!in->is_replica(mds->get_nodeid())); - CInodeExport istate( in ); - istate._encode( enc_state ); - - // we're export this inode; fix inode state - dout(7) << "encode_export_inode " << *in << endl; - - if (in->is_dirty()) in->mark_clean(); - - // clear/unpin cached_by (we're no longer the authority) - in->clear_replicas(); - - // twiddle lock states for auth -> replica transition - // hard - in->hardlock.clear_gather(); - if (in->hardlock.get_state() == LOCK_GLOCKR) - in->hardlock.set_state(LOCK_LOCK); - - // file : we lost all our caps, so move to stable state! - in->filelock.clear_gather(); - if (in->filelock.get_state() == LOCK_GLOCKR || - in->filelock.get_state() == LOCK_GLOCKM || - in->filelock.get_state() == LOCK_GLOCKL || - in->filelock.get_state() == LOCK_GLONERR || - in->filelock.get_state() == LOCK_GLONERM || - in->filelock.get_state() == LOCK_LONER) - in->filelock.set_state(LOCK_LOCK); - if (in->filelock.get_state() == LOCK_GMIXEDR) - in->filelock.set_state(LOCK_MIXED); - // this looks like a step backwards, but it's what we want! 
- if (in->filelock.get_state() == LOCK_GSYNCM) - in->filelock.set_state(LOCK_MIXED); - if (in->filelock.get_state() == LOCK_GSYNCL) - in->filelock.set_state(LOCK_LOCK); - if (in->filelock.get_state() == LOCK_GMIXEDL) - in->filelock.set_state(LOCK_LOCK); - //in->filelock.set_state(LOCK_MIXED); - - // mark auth - assert(in->is_auth()); - in->set_auth(false); - in->replica_nonce = CINODE_EXPORT_NONCE; - - // *** other state too? - - // move to end of LRU so we drop out of cache quickly! - if (in->get_parent_dn()) - cache->lru.lru_bottouch(in->get_parent_dn()); -} - - -int Migrator::encode_export_dir(list& dirstatelist, - C_Contexts *fin, - CDir *basedir, - CDir *dir, - int newauth) -{ - int num_exported = 0; - - dout(7) << "export_dir_walk " << *dir << " " << dir->nitems << " items" << endl; - - assert(dir->get_projected_version() == dir->get_version()); - - // dir - bufferlist enc_dir; - - CDirExport dstate(dir); - dstate._encode( enc_dir ); - - // release open_by - dir->clear_replicas(); - - // mark - assert(dir->is_auth()); - dir->state_clear(CDIR_STATE_AUTH); - dir->replica_nonce = CDIR_NONCE_EXPORT; - - // proxy - dir->state_set(CDIR_STATE_PROXY); - dir->get(CDir::PIN_PROXY); - export_proxy_dirinos[basedir].push_back(dir->ino()); - - list subdirs; - - if (dir->is_hashed()) { - // fix state - dir->state_clear( CDIR_STATE_AUTH ); - - } else { - - if (dir->is_dirty()) - dir->mark_clean(); - - // discard most dir state - dir->state &= CDIR_MASK_STATE_EXPORT_KEPT; // i only retain a few things. 
- - // suck up all waiters - list waiting; - dir->take_waiting(CDIR_WAIT_ANY, waiting); // all dir waiters - fin->take(waiting); - - // inodes - - CDir_map_t::iterator it; - for (it = dir->begin(); it != dir->end(); it++) { - CDentry *dn = it->second; - CInode *in = dn->get_inode(); - - num_exported++; - - // -- dentry - dout(7) << "export_dir_walk exporting " << *dn << endl; - _encode(it->first, enc_dir); - - if (dn->is_dirty()) - enc_dir.append("D", 1); // dirty - else - enc_dir.append("C", 1); // clean - - version_t dnv = dn->get_version(); - enc_dir.append((char*)&dnv, sizeof(dnv)); - - // null dentry? - if (dn->is_null()) { - enc_dir.append("N", 1); // null dentry - assert(dn->is_sync()); - continue; - } - - if (dn->is_remote()) { - // remote link - enc_dir.append("L", 1); // remote link - - inodeno_t ino = dn->get_remote_ino(); - enc_dir.append((char*)&ino, sizeof(ino)); - continue; - } - - // primary link - // -- inode - enc_dir.append("I", 1); // inode dentry - - encode_export_inode(in, enc_dir, newauth); // encode, and (update state for) export - - // directory? - if (in->is_dir() && in->dir) { - if (in->dir->is_auth()) { - // nested subdir - assert(in->dir->get_dir_auth() == CDIR_AUTH_PARENT); - subdirs.push_back(in->dir); // it's ours, recurse (later) - - } else { - // nested export - assert(in->dir->get_dir_auth() >= 0); - dout(7) << " encountered nested export " << *in->dir << " dir_auth " << in->dir->get_dir_auth() << "; removing from exports" << endl; - assert(cache->exports.count(in->dir) == 1); - cache->exports.erase(in->dir); // discard nested export (nested_exports updated above) - - in->dir->state_clear(CDIR_STATE_EXPORT); - in->dir->put(CDir::PIN_EXPORT); - - // simplify dir_auth? 
- if (in->dir->get_dir_auth() == newauth) - in->dir->set_dir_auth( CDIR_AUTH_PARENT ); - } - } - - // add to proxy - export_proxy_inos[basedir].push_back(in->ino()); - in->state_set(CInode::STATE_PROXY); - in->get(CInode::PIN_PROXY); - - // waiters - list waiters; - in->take_waiting(CINODE_WAIT_ANY, waiters); - fin->take(waiters); - } - } - - // add to dirstatelist - bufferlist bl; - dirstatelist.push_back( bl ); - dirstatelist.back().claim( enc_dir ); - - // subdirs - for (list::iterator it = subdirs.begin(); it != subdirs.end(); it++) - num_exported += encode_export_dir(dirstatelist, fin, basedir, *it, newauth); - - return num_exported; -} - - -class C_MDS_ExportFinishLogged : public Context { - Migrator *migrator; - CDir *dir; -public: - C_MDS_ExportFinishLogged(Migrator *m, CDir *d) : migrator(m), dir(d) {} - void finish(int r) { - migrator->export_dir_finish(dir); - } -}; - - -/* - * i should get an export_dir_notify_ack from every mds that had me open, including the new auth (an ack) - */ -void Migrator::handle_export_dir_notify_ack(MExportDirNotifyAck *m) -{ - CInode *diri = cache->get_inode(m->get_ino()); - CDir *dir = diri->dir; - assert(dir); - assert(dir->is_frozen_tree_root()); // i'm exporting! - - // remove from waiting list - int from = m->get_source().num(); - assert(export_notify_ack_waiting[dir].count(from)); - export_notify_ack_waiting[dir].erase(from); - - dout(7) << "handle_export_dir_notify_ack on " << *dir << " from " << from - << ", still need (" << export_notify_ack_waiting[dir] << ")" << endl; - - // done? 
- if (export_notify_ack_waiting[dir].empty()) { - export_dir_acked(dir); - } else { - dout(7) << "handle_export_dir_notify_ack on " << *dir << " from " << from - << ", still waiting for " << export_notify_ack_waiting[dir] << endl; - } - - delete m; -} - - - -/* - * this happens if hte dest failes after i send teh export data but before it is acked - * that is, we don't know they safely received and logged it, so we reverse our changes - * and go on. - */ -void Migrator::reverse_export(CDir *dir) -{ - dout(7) << "reverse_export " << *dir << endl; - - assert(export_state[dir] == EXPORT_EXPORTING); - assert(export_bounds.count(dir)); - assert(export_data.count(dir)); - - // re-import it. - set bounds; - bounds.swap(export_bounds[dir]); - export_bounds.erase(dir); - - // -- adjust dir_auth -- - // base - CDir *im = dir; - if (dir->get_inode()->authority() == mds->get_nodeid()) { - // parent is already me. was export, adding back to existing import. - im = mds->mdcache->get_auth_container(dir); - assert(im); - mds->mdcache->nested_exports[im].erase(dir); - mds->mdcache->exports.erase(dir); - dir->set_dir_auth( CDIR_AUTH_PARENT ); - dir->state_clear(CDIR_STATE_EXPORT); - dir->put(CDir::PIN_EXPORT); - } else { - // parent isn't me. new import. - mds->mdcache->imports.insert(dir); - dir->set_dir_auth( mds->get_nodeid() ); - dir->state_set(CDIR_STATE_IMPORT); - dir->get(CDir::PIN_IMPORT); - } - - dout(10) << " base " << *dir << endl; - if (dir != im) - dout(10) << " under " << *im << endl; - - // bounds - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *bd = *p; - - if (bd->get_dir_auth() == mds->get_nodeid()) { - // still me. was an import. - mds->mdcache->imports.erase(bd); - bd->set_dir_auth( CDIR_AUTH_PARENT ); - bd->state_clear(CDIR_STATE_IMPORT); - bd->put(CDir::PIN_IMPORT); - // move nested exports. 
- for (set::iterator q = mds->mdcache->nested_exports[bd].begin(); - q != mds->mdcache->nested_exports[bd].end(); - ++q) - mds->mdcache->nested_exports[im].insert(*q); - mds->mdcache->nested_exports.erase(bd); - } else { - // not me anymore. now an export. - mds->mdcache->exports.insert(bd); - mds->mdcache->nested_exports[im].insert(bd); - assert(bd->get_dir_auth() != CDIR_AUTH_PARENT); - bd->set_dir_auth( CDIR_AUTH_UNKNOWN ); - bd->state_set(CDIR_STATE_EXPORT); - bd->get(CDir::PIN_EXPORT); - } - - dout(10) << " bound " << *bd << endl; - } - - - // reimport the dirs - list imported_subdirs; - int num_imported_inodes = 0; - - for (list::iterator p = export_data[dir].begin(); - p != export_data[dir].end(); - ++p) { - num_imported_inodes += - decode_import_dir(*p, - export_peer[dir], - dir, // import root - imported_subdirs, - 0); - } - - // remove proxy bits - clear_export_proxy_pins(dir); - - // some clean up - export_data.erase(dir); - export_bounds.erase(dir); - export_notify_ack_waiting.erase(dir); -} - - -void Migrator::export_dir_acked(CDir *dir) -{ - dout(7) << "export_dir_acked " << *dir << endl; - export_notify_ack_waiting.erase(dir); - - export_state[dir] = EXPORT_LOGGINGFINISH; - export_data.erase(dir); - export_bounds.erase(dir); - - // log export completion, then finish (unfreeze, trigger finish context, etc.) - mds->mdlog->submit_entry(new EExportFinish(dir, true), - new C_MDS_ExportFinishLogged(this, dir)); -} - - -/* - * once i get all teh notify_acks i can finish - */ -void Migrator::export_dir_finish(CDir *dir) -{ - dout(7) << "export_dir_finish " << *dir << endl; - - if (export_state.count(dir)) { - // send finish/commit to new auth - mds->send_message_mds(new MExportDirFinish(dir->ino()), dir->authority(), MDS_PORT_MIGRATOR); - - // remove from exporting list - export_state.erase(dir); - export_peer.erase(dir); - } else { - dout(7) << "target must have failed, not sending final commit message. export succeeded anyway." 
<< endl; - } - - // unfreeze - dout(7) << "export_dir_finish unfreezing" << endl; - dir->unfreeze_tree(); - - // unpin path - dout(7) << "export_dir_finish unpinning path" << endl; - vector trace; - cache->make_trace(trace, dir->inode); - cache->path_unpin(trace, 0); - - // unpin proxies - clear_export_proxy_pins(dir); - - // queue finishers - mds->queue_finished(export_finish_waiters[dir]); - export_finish_waiters.erase(dir); - - // stats - if (mds->logger) mds->logger->set("nex", cache->exports.size()); - - show_imports(); - - // send pending import_maps? - mds->mdcache->send_pending_import_maps(); -} - - -void Migrator::clear_export_proxy_pins(CDir *dir) -{ - dout(10) << "clear_export_proxy_pins " << *dir << endl; - - // inodes - for (list::iterator it = export_proxy_inos[dir].begin(); - it != export_proxy_inos[dir].end(); - it++) { - CInode *in = cache->get_inode(*it); - dout(15) << " " << *in << endl; - in->put(CInode::PIN_PROXY); - assert(in->state_test(CInode::STATE_PROXY)); - in->state_clear(CInode::STATE_PROXY); - } - export_proxy_inos.erase(dir); - - // dirs - for (list::iterator it = export_proxy_dirinos[dir].begin(); - it != export_proxy_dirinos[dir].end(); - it++) { - CDir *dir = cache->get_inode(*it)->dir; - dout(15) << " " << *dir << endl; - dir->put(CDir::PIN_PROXY); - assert(dir->state_test(CDIR_STATE_PROXY)); - dir->state_clear(CDIR_STATE_PROXY); - - // hose neg dentries, too, since we're no longer auth - CDir_map_t::iterator it; - for (it = dir->begin(); it != dir->end(); ) { - CDentry *dn = it->second; - it++; - if (dn->is_null()) { - assert(dn->is_sync()); - dir->remove_dentry(dn); - } else { - //dout(10) << "export_dir_notify_ack leaving xlocked neg " << *dn << endl; - if (dn->is_dirty()) - dn->mark_clean(); - } - } - } - export_proxy_dirinos.erase(dir); -} - - - - - - -// ========================================================== -// IMPORT - - -class C_MDC_ExportDirDiscover : public Context { - Migrator *mig; - MExportDirDiscover *m; 
-public: - vector trace; - C_MDC_ExportDirDiscover(Migrator *mig_, MExportDirDiscover *m_) : - mig(mig_), m(m_) {} - void finish(int r) { - CInode *in = 0; - if (r >= 0) in = trace[trace.size()-1]->get_inode(); - mig->handle_export_dir_discover_2(m, in, r); - } -}; - -void Migrator::handle_export_dir_discover(MExportDirDiscover *m) -{ - assert(m->get_source().num() != mds->get_nodeid()); - - dout(7) << "handle_export_dir_discover on " << m->get_path() << endl; - - // must discover it! - C_MDC_ExportDirDiscover *onfinish = new C_MDC_ExportDirDiscover(this, m); - filepath fpath(m->get_path()); - cache->path_traverse(fpath, onfinish->trace, true, - m, new C_MDS_RetryMessage(mds,m), // on delay/retry - MDS_TRAVERSE_DISCOVER, - onfinish); // on completion|error -} - -void Migrator::handle_export_dir_discover_2(MExportDirDiscover *m, CInode *in, int r) -{ - // yay! - if (in) { - dout(7) << "handle_export_dir_discover_2 has " << *in << endl; - } - - if (r < 0 || !in->is_dir()) { - dout(7) << "handle_export_dir_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << endl; - - assert(0); // this shouldn't happen if the auth pins his path properly!!!! - - mds->send_message_mds(new MExportDirDiscoverAck(m->get_ino(), false), - m->get_source().num(), MDS_PORT_MIGRATOR); - delete m; - return; - } - - assert(in->is_dir()); - - if (in->is_frozen()) { - dout(7) << "frozen, waiting." << endl; - in->add_waiter(CINODE_WAIT_AUTHPINNABLE, - new C_MDS_RetryMessage(mds,m)); - return; - } - - // pin inode in the cache (for now) - in->get(CInode::PIN_IMPORTING); - - // pin auth too, until the import completes. 
- in->auth_pin(); - - import_state[in->ino()] = IMPORT_DISCOVERED; - import_peer[in->ino()] = m->get_source().num(); - - - // reply - dout(7) << " sending export_dir_discover_ack on " << *in << endl; - mds->send_message_mds(new MExportDirDiscoverAck(in->ino()), - m->get_source().num(), MDS_PORT_MIGRATOR); - delete m; -} - - - -void Migrator::handle_export_dir_prep(MExportDirPrep *m) -{ - assert(m->get_source().num() != mds->get_nodeid()); - - CInode *diri = cache->get_inode(m->get_ino()); - assert(diri); - - list finished; - - // assimilate root dir. - CDir *dir = diri->dir; - if (dir) { - dout(7) << "handle_export_dir_prep on " << *dir << " (had dir)" << endl; - - if (!m->did_assim()) - m->get_dir(diri->ino())->update_dir(dir); - } else { - assert(!m->did_assim()); - - // open dir i'm importing. - diri->set_dir( new CDir(diri, mds->mdcache, false) ); - dir = diri->dir; - m->get_dir(diri->ino())->update_dir(dir); - - dout(7) << "handle_export_dir_prep on " << *dir << " (opening dir)" << endl; - - diri->take_waiting(CINODE_WAIT_DIR, finished); - } - assert(dir->is_auth() == false); - - show_imports(); - - // assimilate contents? - if (!m->did_assim()) { - dout(7) << "doing assim on " << *dir << endl; - m->mark_assim(); // only do this the first time! 
- - // move pin to dir - diri->put(CInode::PIN_IMPORTING); - dir->get(CDir::PIN_IMPORTING); - - // auth pin too - dir->auth_pin(); - diri->auth_unpin(); - - // change import state - import_state[diri->ino()] = IMPORT_PREPPING; - - // assimilate traces to exports - for (list::iterator it = m->get_inodes().begin(); - it != m->get_inodes().end(); - it++) { - // inode - CInode *in = cache->get_inode( (*it)->get_ino() ); - if (in) { - (*it)->update_inode(in); - dout(7) << " updated " << *in << endl; - } else { - in = new CInode(mds->mdcache, false); - (*it)->update_inode(in); - - // link to the containing dir - CInode *condiri = cache->get_inode( m->get_containing_dirino(in->ino()) ); - assert(condiri && condiri->dir); - cache->add_inode( in ); - condiri->dir->add_dentry( m->get_dentry(in->ino()), in ); - - dout(7) << " added " << *in << endl; - } - - assert( in->get_parent_dir()->ino() == m->get_containing_dirino(in->ino()) ); - - // dir - if (m->have_dir(in->ino())) { - if (in->dir) { - m->get_dir(in->ino())->update_dir(in->dir); - dout(7) << " updated " << *in->dir << endl; - } else { - in->set_dir( new CDir(in, mds->mdcache, false) ); - m->get_dir(in->ino())->update_dir(in->dir); - dout(7) << " added " << *in->dir << endl; - in->take_waiting(CINODE_WAIT_DIR, finished); - } - } - } - - // open export dirs? - for (list::iterator it = m->get_exports().begin(); - it != m->get_exports().end(); - it++) { - dout(7) << " checking dir " << hex << *it << dec << endl; - CInode *in = cache->get_inode(*it); - assert(in); - - // note bound. - import_bounds[dir->ino()].insert(*it); - - if (!in->dir) { - dout(7) << " opening nested export on " << *in << endl; - cache->open_remote_dir(in, - new C_MDS_RetryMessage(mds, m)); - - // pin it! 
- in->get(CInode::PIN_OPENINGDIR); - in->state_set(CInode::STATE_OPENINGDIR); - } - } - } else { - dout(7) << " not doing assim on " << *dir << endl; - } - - - // verify we have all exports - int waiting_for = 0; - for (list::iterator it = m->get_exports().begin(); - it != m->get_exports().end(); - it++) { - inodeno_t ino = *it; - CInode *in = cache->get_inode(ino); - if (!in) dout(0) << "** missing ino " << hex << ino << dec << endl; - assert(in); - if (in->dir) { - if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) { - dout(7) << " pinning nested export " << *in->dir << endl; - in->dir->get(CDir::PIN_IMPORTINGEXPORT); - in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT); - - if (in->state_test(CInode::STATE_OPENINGDIR)) { - in->put(CInode::PIN_OPENINGDIR); - in->state_clear(CInode::STATE_OPENINGDIR); - } - } else { - dout(7) << " already pinned nested export " << *in << endl; - } - } else { - dout(7) << " waiting for nested export dir on " << *in << endl; - waiting_for++; - } - } - if (waiting_for) { - dout(7) << " waiting for " << waiting_for << " nested export dir opens" << endl; - } else { - // ok! - dout(7) << " all ready, sending export_dir_prep_ack on " << *dir << endl; - mds->send_message_mds(new MExportDirPrepAck(dir->ino()), - m->get_source().num(), MDS_PORT_MIGRATOR); - - // note new state - import_state[diri->ino()] = IMPORT_PREPPED; - - // done - delete m; - } - - // finish waiters - finish_contexts(finished, 0); -} - - - - -/* this guy waits for the pre-import discovers on hashed directory dir inodes to finish. - * if it's the last one on the dir, it reprocessed the import. 
- */ -/* -class C_MDS_ImportPrediscover : public Context { -public: - MDS *mds; - MExportDir *m; - inodeno_t dir_ino; - string dentry; - C_MDS_ImportPrediscover(MDS *mds, MExportDir *m, inodeno_t dir_ino, const string& dentry) { - this->mds = mds; - this->m = m; - this->dir_ino = dir_ino; - this->dentry = dentry; - } - virtual void finish(int r) { - assert(r == 0); // should never fail! - - m->remove_prediscover(dir_ino, dentry); - - if (!m->any_prediscovers()) - mds->mdcache->handle_export_dir(m); - } -}; -*/ - -class C_MDS_ImportDirLoggedStart : public Context { - Migrator *migrator; - CDir *dir; - int from; - list imported_subdirs; - list exports; -public: - C_MDS_ImportDirLoggedStart(Migrator *m, CDir *d, int f, - list& is, list& e) : - migrator(m), dir(d), from(f) { - imported_subdirs.swap(is); - exports.swap(e); - } - void finish(int r) { - migrator->import_dir_logged_start(dir, from, imported_subdirs, exports); - } -}; - -void Migrator::handle_export_dir(MExportDir *m) -{ - CInode *diri = cache->get_inode(m->get_ino()); - assert(diri); - CDir *dir = diri->dir; - assert(dir); - - int oldauth = m->get_source().num(); - dout(7) << "handle_export_dir importing " << *dir << " from " << oldauth << endl; - assert(dir->is_auth() == false); - - show_imports(); - - // start the journal entry - EImportStart *le = new EImportStart(dir->ino(), m->get_exports()); - le->metablob.add_dir_context(dir); - - // note new authority (locally) - CDir *im = dir; - if (dir->inode->is_auth()) { - // parent is already me. was export, adding back to existing import. - im = mds->mdcache->get_auth_container(dir); - assert(im); - mds->mdcache->nested_exports[im].erase(dir); - mds->mdcache->exports.erase(dir); - dir->set_dir_auth( CDIR_AUTH_PARENT ); - dir->state_clear(CDIR_STATE_EXPORT); - dir->put(CDir::PIN_EXPORT); - } else { - // parent isn't me. new import. 
- mds->mdcache->imports.insert(dir); - dir->set_dir_auth( mds->get_nodeid() ); - dir->state_set(CDIR_STATE_IMPORT); - dir->get(CDir::PIN_IMPORT); - } - - // take out my temp pin - dir->put(CDir::PIN_IMPORTING); - - // mark import point frozen - // (note: this is a manual freeze.. hack hack hack!) - dir->get_inode()->auth_pin(); - dir->state_set(CDIR_STATE_FROZENTREE); - - dout(10) << " base " << *dir << endl; - if (dir != im) - dout(10) << " under " << *im << endl; - - // bounds - for (list::iterator it = m->get_exports().begin(); - it != m->get_exports().end(); - it++) { - CInode *bdi = cache->get_inode(*it); - CDir *bd = bdi->dir; - - if (bd->get_dir_auth() == mds->get_nodeid()) { - // still me. was an import. - assert(bd->is_import()); - mds->mdcache->imports.erase(bd); - bd->set_dir_auth( CDIR_AUTH_PARENT ); - bd->state_clear(CDIR_STATE_IMPORT); - bd->put(CDir::PIN_IMPORT); - // move nested exports. - for (set::iterator q = mds->mdcache->nested_exports[bd].begin(); - q != mds->mdcache->nested_exports[bd].end(); - ++q) - mds->mdcache->nested_exports[im].insert(*q); - mds->mdcache->nested_exports.erase(bd); - } else { - // not me anymore. now an export. 
- mds->mdcache->exports.insert(bd); - mds->mdcache->nested_exports[im].insert(bd); - assert(bd->get_dir_auth() != CDIR_AUTH_PARENT); - bd->set_dir_auth( CDIR_AUTH_UNKNOWN ); - bd->state_set(CDIR_STATE_EXPORT); - bd->get(CDir::PIN_EXPORT); - } - - // mark export point frozenleaf - bd->get(CDir::PIN_FREEZELEAF); - bd->state_set(CDIR_STATE_FROZENTREELEAF); - assert(import_bounds[dir->ino()].count(*it)); // we took note during prep stage - - // remove our pin - bd->put(CDir::PIN_IMPORTINGEXPORT); - bd->state_clear(CDIR_STATE_IMPORTINGEXPORT); - - dout(10) << " bound " << *bd << endl; - } - - // add this crap to my cache - list imported_subdirs; - int num_imported_inodes = 0; - - for (list::iterator p = m->get_dirstate().begin(); - p != m->get_dirstate().end(); - ++p) { - num_imported_inodes += - decode_import_dir(*p, - oldauth, - dir, // import root - imported_subdirs, - le); - } - dout(10) << " " << imported_subdirs.size() << " imported subdirs" << endl; - dout(10) << " " << m->get_exports().size() << " imported nested exports" << endl; - - - // adjust popularity - mds->balancer->add_import(dir); - - dout(7) << "handle_export_dir did " << *dir << endl; - - // log it - mds->mdlog->submit_entry(le, - new C_MDS_ImportDirLoggedStart(this, dir, m->get_source().num(), - imported_subdirs, m->get_exports())); - - // note state - import_state[dir->ino()] = IMPORT_LOGGINGSTART; - - // some stats - if (mds->logger) { - mds->logger->inc("im"); - mds->logger->inc("iim", num_imported_inodes); - mds->logger->set("nim", cache->imports.size()); - } - - delete m; -} - - -void Migrator::import_dir_logged_start(CDir *dir, int from, - list &imported_subdirs, - list &exports) -{ - dout(7) << "import_dir_logged " << *dir << endl; - - // note state - import_state[dir->ino()] = IMPORT_ACKING; - - // send notify's etc. 
- dout(7) << "sending notifyack for " << *dir << " to old auth mds" << from << endl; - mds->send_message_mds(new MExportDirNotifyAck(dir->inode->ino()), - from, MDS_PORT_MIGRATOR); - - dout(7) << "sending notify to others" << endl; - for (map::iterator it = dir->replicas_begin(); - it != dir->replicas_end(); - it++) { - assert( it->first != mds->get_nodeid() ); - if ( it->first == from ) continue; // not to old auth. - - MExportDirNotify *notify = new MExportDirNotify(dir->ino(), from, mds->get_nodeid()); - notify->copy_exports(exports); - - if (g_conf.mds_verify_export_dirauth) - notify->copy_subdirs(imported_subdirs); // copy subdir list (DEBUG) - - mds->send_message_mds(notify, it->first, MDS_PORT_MIGRATOR); - } - - show_imports(); -} - - -class C_MDS_ImportDirLoggedFinish : public Context { - Migrator *migrator; - CDir *dir; -public: - C_MDS_ImportDirLoggedFinish(Migrator *m, CDir *d) : migrator(m), dir(d) { } - void finish(int r) { - migrator->import_dir_logged_finish(dir); - } -}; - -void Migrator::handle_export_dir_finish(MExportDirFinish *m) -{ - CInode *diri = cache->get_inode(m->get_ino()); - CDir *dir = diri->dir; - assert(dir); - - dout(7) << "handle_export_dir_finish logging import_finish on " << *dir << endl; - assert(dir->is_auth()); - - // note state - import_state[dir->ino()] = IMPORT_LOGGINGFINISH; - - // log - mds->mdlog->submit_entry(new EImportFinish(dir, true), - new C_MDS_ImportDirLoggedFinish(this,dir)); - delete m; -} - -void Migrator::import_dir_logged_finish(CDir *dir) -{ - dout(7) << "import_dir_logged_finish " << *dir << endl; - - // un auth pin (other exports can now proceed) - dir->auth_unpin(); - - // unfreeze! 
- for (set::iterator p = import_bounds[dir->ino()].begin(); - p != import_bounds[dir->ino()].end(); - ++p) { - CInode *diri = mds->mdcache->get_inode(*p); - CDir *dir = diri->dir; - assert(dir->state_test(CDIR_STATE_FROZENTREELEAF)); - dir->put(CDir::PIN_FREEZELEAF); - dir->state_clear(CDIR_STATE_FROZENTREELEAF); - } - - dir->unfreeze_tree(); - - // clear import state (we're done!) - import_state.erase(dir->ino()); - import_peer.erase(dir->ino()); - import_bounds.erase(dir->ino()); - - // ok now finish contexts - dout(5) << "finishing any waiters on imported data" << endl; - dir->finish_waiting(CDIR_WAIT_IMPORTED); - - // log it - if (mds->logger) { - mds->logger->set("nex", cache->exports.size()); - mds->logger->set("nim", cache->imports.size()); - } - show_imports(); - - // is it empty? - if (dir->get_size() == 0 && - !dir->inode->is_auth()) { - // reexport! - export_empty_import(dir); - } -} - - -void Migrator::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int oldauth) -{ - CInodeExport istate; - off = istate._decode(bl, off); - dout(15) << "got a cinodeexport " << endl; - - bool added = false; - CInode *in = cache->get_inode(istate.get_ino()); - if (!in) { - in = new CInode(mds->mdcache); - added = true; - } else { - in->set_auth(true); - } - - // state after link -- or not! -sage - set merged_client_caps; - istate.update_inode(in, merged_client_caps); - - // link before state -- or not! -sage - if (dn->inode != in) { - assert(!dn->inode); - dn->dir->link_inode(dn, in); - } - - // add inode? 
- if (added) { - cache->add_inode(in); - dout(10) << "added " << *in << endl; - } else { - dout(10) << " had " << *in << endl; - } - - - // adjust replica list - //assert(!in->is_replica(oldauth)); // not true on failed export - in->add_replica( oldauth, CINODE_EXPORT_NONCE ); - if (in->is_replica(mds->get_nodeid())) - in->remove_replica(mds->get_nodeid()); - - // twiddle locks - // hard - if (in->hardlock.get_state() == LOCK_GLOCKR) { - in->hardlock.gather_set.erase(mds->get_nodeid()); - in->hardlock.gather_set.erase(oldauth); - if (in->hardlock.gather_set.empty()) - mds->locker->inode_hard_eval(in); - } - - // caps - for (set::iterator it = merged_client_caps.begin(); - it != merged_client_caps.end(); - it++) { - MClientFileCaps *caps = new MClientFileCaps(in->inode, - in->client_caps[*it].get_last_seq(), - in->client_caps[*it].pending(), - in->client_caps[*it].wanted(), - MClientFileCaps::FILECAP_REAP); - caps->set_mds( oldauth ); // reap from whom? - mds->messenger->send_message(caps, - mds->clientmap.get_inst(*it), - 0, MDS_PORT_CACHE); - } - - // filelock - if (!in->filelock.is_stable()) { - // take me and old auth out of gather set - in->filelock.gather_set.erase(mds->get_nodeid()); - in->filelock.gather_set.erase(oldauth); - if (in->filelock.gather_set.empty()) // necessary but not suffient... 
- mds->locker->inode_file_eval(in); - } -} - - -int Migrator::decode_import_dir(bufferlist& bl, - int oldauth, - CDir *import_root, - list& imported_subdirs, - EImportStart *le) -{ - int off = 0; - - // set up dir - CDirExport dstate; - off = dstate._decode(bl, off); - - CInode *diri = cache->get_inode(dstate.get_ino()); - assert(diri); - CDir *dir = diri->get_or_open_dir(mds->mdcache); - assert(dir); - - dout(7) << "decode_import_dir " << *dir << endl; - - // add to list - if (dir != import_root) - imported_subdirs.push_back(dir->ino()); - - // assimilate state - dstate.update_dir( dir ); - - // mark (may already be marked from get_or_open_dir() above) - if (!dir->is_auth()) - dir->state_set(CDIR_STATE_AUTH); - - // adjust replica list - //assert(!dir->is_replica(oldauth)); // not true on failed export - dir->add_replica(oldauth); - if (dir->is_replica(mds->get_nodeid())) - dir->remove_replica(mds->get_nodeid()); - - // add to journal entry - if (le) - le->metablob.add_dir(dir, true); // Hmm: false would be okay in some cases - - int num_imported = 0; - - if (dir->is_hashed()) { - - // do nothing; dir is hashed - } else { - // take all waiters on this dir - // NOTE: a pass of imported data is guaranteed to get all of my waiters because - // a replica's presense in my cache implies/forces it's presense in authority's. 
- list waiters; - - dir->take_waiting(CDIR_WAIT_ANY, waiters); - for (list::iterator it = waiters.begin(); - it != waiters.end(); - it++) - import_root->add_waiter(CDIR_WAIT_IMPORTED, *it); - - dout(15) << "doing contents" << endl; - - // contents - long nden = dstate.get_nden(); - - for (; nden>0; nden--) { - - num_imported++; - - // dentry - string dname; - _decode(dname, bl, off); - dout(15) << "dname is " << dname << endl; - - char dirty; - bl.copy(off, 1, &dirty); - off++; - - version_t dnv; - bl.copy(off, sizeof(dnv), (char*)&dnv); - off += sizeof(dnv); - - char icode; - bl.copy(off, 1, &icode); - off++; - - CDentry *dn = dir->lookup(dname); - if (!dn) - dn = dir->add_dentry(dname); // null - - // mark dentry dirty? - if (dirty == 'D') - dn->_mark_dirty(); - - dn->set_version( dnv ); - dn->set_projected_version( dnv ); - - if (icode == 'N') { - // null dentry - assert(dn->is_null()); - - // fall thru - } - else if (icode == 'L') { - // remote link - inodeno_t ino; - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - dir->link_inode(dn, ino); - } - else if (icode == 'I') { - // inode - decode_import_inode(dn, bl, off, oldauth); - } - - // add dentry to journal entry - if (le) - le->metablob.add_dentry(dn, true); // Hmm: might we do dn->is_dirty() here instead? - } - - } - - dout(7) << "decode_import_dir done " << *dir << endl; - return num_imported; -} - - - - - -// authority bystander - -void Migrator::handle_export_dir_warning(MExportDirWarning *m) -{ - // add to warning list - stray_export_warnings.insert( m->get_ino() ); - - // did i already see the notify? - if (stray_export_notifies.count(m->get_ino())) { - // i did, we're good. - dout(7) << "handle_export_dir_warning on " << m->get_ino() << ". already got notify." 
<< endl; - - // process the notify - map::iterator it = stray_export_notifies.find(m->get_ino()); - handle_export_dir_notify(it->second); - stray_export_notifies.erase(it); - } else { - dout(7) << "handle_export_dir_warning on " << m->get_ino() << ". waiting for notify." << endl; - } - - // done - delete m; -} - - -void Migrator::handle_export_dir_notify(MExportDirNotify *m) -{ - CDir *dir = 0; - CInode *in = cache->get_inode(m->get_ino()); - if (in) dir = in->dir; - - // did i see the warning yet? - if (!stray_export_warnings.count(m->get_ino())) { - // wait for it. - dout(7) << "export_dir_notify on " << m->get_ino() << ", waiting for warning." << endl; - stray_export_notifies.insert(pair( m->get_ino(), m )); - return; - } - - // i did, we're all good. - dout(7) << "export_dir_notify on " << m->get_ino() << ", already saw warning." << endl; - - // update dir_auth! - if (dir) { - dout(7) << "export_dir_notify on " << *dir << " new_auth " << m->get_new_auth() << " (old_auth " << m->get_old_auth() << ")" << endl; - - // update bounds first - for (list::iterator it = m->get_exports().begin(); - it != m->get_exports().end(); - it++) { - CInode *n = cache->get_inode(*it); - if (!n) continue; - CDir *ndir = n->dir; - if (!ndir) continue; - - int boundauth = ndir->authority(); - dout(7) << "export_dir_notify bound " << *ndir << " was dir_auth " << ndir->get_dir_auth() << " (" << boundauth << ")" << endl; - if (ndir->get_dir_auth() == CDIR_AUTH_PARENT) { - if (boundauth != m->get_new_auth()) - ndir->set_dir_auth( boundauth ); - else assert(dir->authority() == m->get_new_auth()); // apparently we already knew! 
- } else { - if (boundauth == m->get_new_auth()) - ndir->set_dir_auth( CDIR_AUTH_PARENT ); - } - } - - // update dir_auth - if (in->authority() == m->get_new_auth()) { - dout(7) << "handle_export_dir_notify on " << *in << ": inode auth is the same, setting dir_auth -1" << endl; - dir->set_dir_auth( CDIR_AUTH_PARENT ); - assert(!in->is_auth()); - assert(!dir->is_auth()); - } else { - dir->set_dir_auth( m->get_new_auth() ); - } - assert(dir->authority() != mds->get_nodeid()); - assert(!dir->is_auth()); - - // DEBUG: verify subdirs - if (g_conf.mds_verify_export_dirauth) { - - dout(7) << "handle_export_dir_notify on " << *dir << " checking " << m->num_subdirs() << " subdirs" << endl; - for (list::iterator it = m->subdirs_begin(); - it != m->subdirs_end(); - it++) { - CInode *diri = cache->get_inode(*it); - if (!diri) continue; // don't have it, don't care - if (!diri->dir) continue; - dout(10) << "handle_export_dir_notify checking subdir " << *diri->dir << " is auth " << diri->dir->get_dir_auth() << endl; - assert(diri->dir != dir); // base shouldn't be in subdir list - if (diri->dir->get_dir_auth() != CDIR_AUTH_PARENT) { - dout(7) << "*** weird value for dir_auth " << diri->dir->get_dir_auth() << " on " << *diri->dir << ", should have been -1 probably??? ******************" << endl; - assert(0); // bad news! 
- //dir->set_dir_auth( CDIR_AUTH_PARENT ); - } - assert(diri->dir->authority() == m->get_new_auth()); - } - } - } - - // send notify ack to old auth - dout(7) << "handle_export_dir_notify sending ack to old_auth " << m->get_old_auth() << endl; - mds->send_message_mds(new MExportDirNotifyAck(m->get_ino()), - m->get_old_auth(), MDS_PORT_MIGRATOR); - - - // done - stray_export_warnings.erase( m->get_ino() ); - delete m; -} - - - - - -// ======================================================================= -// HASHING - - -void Migrator::import_hashed_content(CDir *dir, bufferlist& bl, int nden, int oldauth) -{ - int off = 0; - - for (; nden>0; nden--) { - // dentry - string dname; - _decode(dname, bl, off); - dout(15) << "dname is " << dname << endl; - - char icode; - bl.copy(off, 1, &icode); - off++; - - CDentry *dn = dir->lookup(dname); - if (!dn) - dn = dir->add_dentry(dname); // null - - // mark dn dirty _after_ we link the inode (scroll down) - - if (icode == 'N') { - - // null dentry - assert(dn->is_null()); - - // fall thru - } - else if (icode == 'L') { - // remote link - inodeno_t ino; - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - dir->link_inode(dn, ino); - } - else if (icode == 'I') { - // inode - decode_import_inode(dn, bl, off, oldauth); - - // fix up subdir export? - if (dn->inode->dir) { - assert(dn->inode->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)); - dn->inode->dir->put(CDir::PIN_IMPORTINGEXPORT); - dn->inode->dir->state_clear(CDIR_STATE_IMPORTINGEXPORT); - - if (dn->inode->dir->is_auth()) { - // mine. must have been an import. 
- assert(dn->inode->dir->is_import()); - dout(7) << "unimporting subdir now that inode is mine " << *dn->inode->dir << endl; - dn->inode->dir->set_dir_auth( CDIR_AUTH_PARENT ); - cache->imports.erase(dn->inode->dir); - dn->inode->dir->put(CDir::PIN_IMPORT); - dn->inode->dir->state_clear(CDIR_STATE_IMPORT); - - // move nested under hashdir - for (set::iterator it = cache->nested_exports[dn->inode->dir].begin(); - it != cache->nested_exports[dn->inode->dir].end(); - it++) - cache->nested_exports[dir].insert(*it); - cache->nested_exports.erase(dn->inode->dir); - - // now it matches the inode - dn->inode->dir->set_dir_auth( CDIR_AUTH_PARENT ); - } - else { - // not mine. make it an export. - dout(7) << "making subdir into export " << *dn->inode->dir << endl; - dn->inode->dir->get(CDir::PIN_EXPORT); - dn->inode->dir->state_set(CDIR_STATE_EXPORT); - cache->exports.insert(dn->inode->dir); - cache->nested_exports[dir].insert(dn->inode->dir); - - if (dn->inode->dir->get_dir_auth() == CDIR_AUTH_PARENT) - dn->inode->dir->set_dir_auth( oldauth ); // no longer matches inode - assert(dn->inode->dir->get_dir_auth() >= 0); - } - } - } - - // mark dentry dirty? (only _after_ we link the inode!) - dn->_mark_dirty(); // fixme - } -} - -/* - - notes on interaction of hashing and export/import: - - - dir->is_auth() is completely independent of hashing. for a hashed dir, - - all nodes are partially authoritative - - all nodes dir->is_hashed() == true - - all nodes dir->inode->dir_is_hashed() == true - - one node dir->is_auth() == true, the rest == false - - dir_auth for all subdirs in a hashed dir will (likely?) be explicit. - - - remember simple rule: dir auth follows inode, unless dir_auth is explicit. - - - export_dir_walk and decode_import_dir take care with dir_auth: (for import/export) - - on export, -1 is changed to mds->get_nodeid() - - on import, nothing special, actually. 
- - - hashed dir files aren't included in export; subdirs are converted to imports - or exports as necessary. - - hashed dir subdirs are discovered on export. this is important - because dirs are needed to tie together auth hierarchy, for auth to know about - imports/exports, etc. - - - dir state is maintained on auth. - - COMPLETE and HASHED are transfered to importers. - - DIRTY is set everywhere. - - - hashed dir is like an import: hashed dir used for nested_exports map. - - nested_exports is updated appropriately on auth and replicas. - - a subtree terminates as a hashed dir, since the hashing explicitly - redelegates all inodes. thus export_dir_walk includes hashed dirs, but - not their inodes. -*/ - -// HASH on auth - -class C_MDC_HashFreeze : public Context { -public: - Migrator *mig; - CDir *dir; - C_MDC_HashFreeze(Migrator *m, CDir *d) : mig(m), dir(d) {} - virtual void finish(int r) { - mig->hash_dir_frozen(dir); - } -}; - -class C_MDC_HashComplete : public Context { -public: - Migrator *mig; - CDir *dir; - C_MDC_HashComplete(Migrator *mig, CDir *dir) { - this->mig = mig; - this->dir = dir; - } - virtual void finish(int r) { - mig->hash_dir_complete(dir); - } -}; - - -/** hash_dir(dir) - * start hashing a directory. - */ -void Migrator::hash_dir(CDir *dir) -{ - dout(-7) << "hash_dir " << *dir << endl; - - assert(!dir->is_hashed()); - assert(dir->is_auth()); - - if (dir->is_frozen() || - dir->is_freezing()) { - dout(7) << " can't hash, freezing|frozen." << endl; - return; - } - - // pin path? - vector trace; - cache->make_trace(trace, dir->inode); - if (!cache->path_pin(trace, 0, 0)) { - dout(7) << "hash_dir couldn't pin path, failing." 
<< endl; - return; - } - - // ok, go - dir->state_set(CDIR_STATE_HASHING); - dir->get(CDir::PIN_HASHING); - assert(dir->hashed_subset.empty()); - - // discover on all mds - assert(hash_gather.count(dir) == 0); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; // except me - hash_gather[dir].insert(i); - mds->send_message_mds(new MHashDirDiscover(dir->inode), i, MDS_PORT_MIGRATOR); - } - dir->auth_pin(); // pin until discovers are all acked. - - // start freeze - dir->freeze_dir(new C_MDC_HashFreeze(this, dir)); - - // make complete - if (!dir->is_complete()) { - dout(7) << "hash_dir " << *dir << " not complete, fetching" << endl; - mds->mdstore->fetch_dir(dir, - new C_MDC_HashComplete(this, dir)); - } else - hash_dir_complete(dir); -} - - -/* - * wait for everybody to discover and open the hashing dir - * then auth_unpin, to let the freeze happen - */ -void Migrator::handle_hash_dir_discover_ack(MHashDirDiscoverAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - int from = m->get_source().num(); - assert(hash_gather[dir].count(from)); - hash_gather[dir].erase(from); - - if (hash_gather[dir].empty()) { - hash_gather.erase(dir); - dout(7) << "hash_dir_discover_ack " << *dir << ", releasing auth_pin" << endl; - dir->auth_unpin(); // unpin to allow freeze to complete - } else { - dout(7) << "hash_dir_discover_ack " << *dir << ", still waiting for " << hash_gather[dir] << endl; - } - - delete m; // done -} - - - -/* - * once the dir is completely in memory, - * mark all migrating inodes dirty (to pin in cache) - */ -void Migrator::hash_dir_complete(CDir *dir) -{ - dout(7) << "hash_dir_complete " << *dir << ", dirtying inodes" << endl; - - assert(!dir->is_hashed()); - assert(dir->is_auth()); - - // mark dirty to pin in cache - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CInode *in = it->second->inode; - in->_mark_dirty(); // fixme - } 
- - if (dir->is_frozen_dir()) - hash_dir_go(dir); -} - - -/* - * once the dir is frozen, - * make sure it's complete - * send the prep messages! - */ -void Migrator::hash_dir_frozen(CDir *dir) -{ - dout(7) << "hash_dir_frozen " << *dir << endl; - - assert(!dir->is_hashed()); - assert(dir->is_auth()); - assert(dir->is_frozen_dir()); - - if (!dir->is_complete()) { - dout(7) << "hash_dir_frozen !complete, waiting still on " << *dir << endl; - return; - } - - // send prep messages w/ export directories to open - vector msgs(mds->get_mds_map()->get_num_mds()); - - // check for subdirs - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - CInode *in = dn->inode; - - if (!in->is_dir()) continue; - if (!in->dir) continue; - - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); - if (dentryhashcode == mds->get_nodeid()) continue; - - // msg? - if (msgs[dentryhashcode] == 0) { - msgs[dentryhashcode] = new MHashDirPrep(dir->ino()); - } - msgs[dentryhashcode]->add_inode(it->first, in->replicate_to(dentryhashcode)); - } - - // send them! - assert(hash_gather[dir].empty()); - for (unsigned i=0; isend_message_mds(msgs[i], i, MDS_PORT_MIGRATOR); - hash_gather[dir].insert(i); - } - } - - if (hash_gather[dir].empty()) { - // no subdirs! continue! - hash_gather.erase(dir); - hash_dir_go(dir); - } else { - // wait! 
- } -} - -/* - * wait for peers to open all subdirs - */ -void Migrator::handle_hash_dir_prep_ack(MHashDirPrepAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - int from = m->get_source().num(); - - assert(hash_gather[dir].count(from) == 1); - hash_gather[dir].erase(from); - - if (hash_gather[dir].empty()) { - hash_gather.erase(dir); - dout(7) << "handle_hash_dir_prep_ack on " << *dir << ", last one" << endl; - hash_dir_go(dir); - } else { - dout(7) << "handle_hash_dir_prep_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl; - } - - delete m; -} - - -/* - * once the dir is frozen, - * make sure it's complete - * do the hashing! - */ -void Migrator::hash_dir_go(CDir *dir) -{ - dout(7) << "hash_dir_go " << *dir << endl; - - assert(!dir->is_hashed()); - assert(dir->is_auth()); - assert(dir->is_frozen_dir()); - - // get messages to other nodes ready - vector msgs(mds->get_mds_map()->get_num_mds()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - msgs[i] = new MHashDir(dir->ino()); - } - - // pick a hash seed. - dir->inode->inode.hash_seed = 1;//dir->ino(); - - // suck up all waiters - C_Contexts *fin = new C_Contexts; - list waiting; - dir->take_waiting(CDIR_WAIT_ANY, waiting); // all dir waiters - fin->take(waiting); - - // get containing import. might be me. - CDir *containing_import = cache->get_auth_container(dir); - assert(containing_import != dir || dir->is_import()); - - // divy up contents - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - CInode *in = dn->inode; - - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); - if (dentryhashcode == mds->get_nodeid()) { - continue; // still mine! 
- } - - bufferlist *bl = msgs[dentryhashcode]->get_state_ptr(); - assert(bl); - - // -- dentry - dout(7) << "hash_dir_go sending to " << dentryhashcode << " dn " << *dn << endl; - _encode(it->first, *bl); - - // null dentry? - if (dn->is_null()) { - bl->append("N", 1); // null dentry - assert(dn->is_sync()); - continue; - } - - if (dn->is_remote()) { - // remote link - bl->append("L", 1); // remote link - - inodeno_t ino = dn->get_remote_ino(); - bl->append((char*)&ino, sizeof(ino)); - continue; - } - - // primary link - // -- inode - bl->append("I", 1); // inode dentry - - encode_export_inode(in, *bl, dentryhashcode); // encode, and (update state for) export - msgs[dentryhashcode]->inc_nden(); - - if (dn->is_dirty()) - dn->mark_clean(); - - // add to proxy - hash_proxy_inos[dir].push_back(in); - in->state_set(CInode::STATE_PROXY); - in->get(CInode::PIN_PROXY); - - // fix up subdirs - if (in->dir) { - if (in->dir->is_auth()) { - // mine. make it into an import. - dout(7) << "making subdir into import " << *in->dir << endl; - in->dir->set_dir_auth( mds->get_nodeid() ); - cache->imports.insert(in->dir); - in->dir->get(CDir::PIN_IMPORT); - in->dir->state_set(CDIR_STATE_IMPORT); - - // fix nested bits - for (set::iterator it = cache->nested_exports[containing_import].begin(); - it != cache->nested_exports[containing_import].end(); ) { - CDir *ex = *it; - it++; - if (cache->get_auth_container(ex) == in->dir) { - dout(10) << "moving nested export " << *ex << endl; - cache->nested_exports[containing_import].erase(ex); - cache->nested_exports[in->dir].insert(ex); - } - } - } - else { - // not mine. 
- dout(7) << "un-exporting subdir that's being hashed away " << *in->dir << endl; - assert(in->dir->is_export()); - in->dir->put(CDir::PIN_EXPORT); - in->dir->state_clear(CDIR_STATE_EXPORT); - cache->exports.erase(in->dir); - cache->nested_exports[containing_import].erase(in->dir); - if (in->dir->authority() == dentryhashcode) - in->dir->set_dir_auth( CDIR_AUTH_PARENT ); - else - in->dir->set_dir_auth( in->dir->authority() ); - } - } - - // waiters - list waiters; - in->take_waiting(CINODE_WAIT_ANY, waiters); - fin->take(waiters); - } - - // dir state - dir->state_set(CDIR_STATE_HASHED); - dir->get(CDir::PIN_HASHED); - cache->hashdirs.insert(dir); - dir->mark_dirty(dir->pre_dirty()); // fixme - mds->mdlog->submit_entry(new EString("dirty dir fixme")); - - // inode state - if (dir->inode->is_auth()) { - dir->inode->_mark_dirty(); // fixme - mds->mdlog->submit_entry(new EString("hash dirty fixme")); - } - - // fix up nested_exports? - if (containing_import != dir) { - dout(7) << "moving nested exports under hashed dir" << endl; - for (set::iterator it = cache->nested_exports[containing_import].begin(); - it != cache->nested_exports[containing_import].end(); ) { - CDir *ex = *it; - it++; - if (cache->get_auth_container(ex) == dir) { - dout(7) << " moving nested export under hashed dir: " << *ex << endl; - cache->nested_exports[containing_import].erase(ex); - cache->nested_exports[dir].insert(ex); - } else { - dout(7) << " NOT moving nested export under hashed dir: " << *ex << endl; - } - } - } - - // send hash messages - assert(hash_gather[dir].empty()); - assert(hash_notify_gather[dir].empty()); - assert(dir->hashed_subset.empty()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - // all nodes hashed locally.. 
- dir->hashed_subset.insert(i); - - if (i == mds->get_nodeid()) continue; - - // init hash_gather and hash_notify_gather sets - hash_gather[dir].insert(i); - - assert(hash_notify_gather[dir][i].empty()); - for (int j=0; jget_mds_map()->get_num_mds(); j++) { - if (j == mds->get_nodeid()) continue; - if (j == i) continue; - hash_notify_gather[dir][i].insert(j); - } - - mds->send_message_mds(msgs[i], i, MDS_PORT_MIGRATOR); - } - - // wait for all the acks. -} - - -void Migrator::handle_hash_dir_ack(MHashDirAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - assert(dir->is_hashed()); - assert(dir->is_hashing()); - - int from = m->get_source().num(); - assert(hash_gather[dir].count(from) == 1); - hash_gather[dir].erase(from); - - if (hash_gather[dir].empty()) { - dout(7) << "handle_hash_dir_ack on " << *dir << ", last one" << endl; - - if (hash_notify_gather[dir].empty()) { - dout(7) << "got notifies too, all done" << endl; - hash_dir_finish(dir); - } else { - dout(7) << "waiting on notifies " << endl; - } - - } else { - dout(7) << "handle_hash_dir_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl; - } - - delete m; -} - - -void Migrator::hash_dir_finish(CDir *dir) -{ - dout(7) << "hash_dir_finish finishing " << *dir << endl; - assert(dir->is_hashed()); - assert(dir->is_hashing()); - - // dir state - hash_gather.erase(dir); - dir->state_clear(CDIR_STATE_HASHING); - dir->put(CDir::PIN_HASHING); - dir->hashed_subset.clear(); - - // unproxy inodes - // this _could_ happen sooner, on a per-peer basis, but no harm in waiting a few more seconds. 
- for (list::iterator it = hash_proxy_inos[dir].begin(); - it != hash_proxy_inos[dir].end(); - it++) { - CInode *in = *it; - assert(in->state_test(CInode::STATE_PROXY)); - in->state_clear(CInode::STATE_PROXY); - in->put(CInode::PIN_PROXY); - } - hash_proxy_inos.erase(dir); - - // unpin path - vector trace; - cache->make_trace(trace, dir->inode); - cache->path_unpin(trace, 0); - - // unfreeze - dir->unfreeze_dir(); - - show_imports(); - assert(hash_gather.count(dir) == 0); - - // stats - //if (mds->logger) mds->logger->inc("nh", 1); - -} - - - - -// HASH on auth and non-auth - -void Migrator::handle_hash_dir_notify(MHashDirNotify *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - assert(dir->is_hashing()); - - dout(5) << "handle_hash_dir_notify " << *dir << endl; - int from = m->get_from(); - - int source = m->get_source().num(); - if (dir->is_auth()) { - // gather notifies - assert(dir->is_hashed()); - - assert( hash_notify_gather[dir][from].count(source) ); - hash_notify_gather[dir][from].erase(source); - - if (hash_notify_gather[dir][from].empty()) { - dout(7) << "last notify from " << from << endl; - hash_notify_gather[dir].erase(from); - - if (hash_notify_gather[dir].empty()) { - dout(7) << "last notify!" 
<< endl; - hash_notify_gather.erase(dir); - - if (hash_gather[dir].empty()) { - dout(7) << "got acks too, all done" << endl; - hash_dir_finish(dir); - } else { - dout(7) << "still waiting on acks from " << hash_gather[dir] << endl; - } - } else { - dout(7) << "still waiting for notify gathers from " << hash_notify_gather[dir].size() << " others" << endl; - } - } else { - dout(7) << "still waiting for notifies from " << from << " via " << hash_notify_gather[dir][from] << endl; - } - - // delete msg - delete m; - } else { - // update dir hashed_subset - assert(dir->hashed_subset.count(from) == 0); - dir->hashed_subset.insert(from); - - // update open subdirs - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - CInode *in = dn->get_inode(); - if (!in) continue; - if (!in->dir) continue; - - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); - if (dentryhashcode != from) continue; // we'll import these in a minute - - if (in->dir->authority() != dentryhashcode) - in->dir->set_dir_auth( in->dir->authority() ); - else - in->dir->set_dir_auth( CDIR_AUTH_PARENT ); - } - - // remove from notify gather set - assert(hash_gather[dir].count(from)); - hash_gather[dir].erase(from); - - // last notify? 
- if (hash_gather[dir].empty()) { - dout(7) << "gathered all the notifies, finishing hash of " << *dir << endl; - hash_gather.erase(dir); - - dir->state_clear(CDIR_STATE_HASHING); - dir->put(CDir::PIN_HASHING); - dir->hashed_subset.clear(); - } else { - dout(7) << "still waiting for notify from " << hash_gather[dir] << endl; - } - - // fw notify to auth - mds->send_message_mds(m, dir->authority(), MDS_PORT_MIGRATOR); - } -} - - - - -// HASH on non-auth - -/* - * discover step: - * each peer needs to open up the directory and pin it before we start - */ -class C_MDC_HashDirDiscover : public Context { - Migrator *mig; - MHashDirDiscover *m; -public: - vector trace; - C_MDC_HashDirDiscover(Migrator *mig, MHashDirDiscover *m) { - this->mig = mig; - this->m = m; - } - void finish(int r) { - CInode *in = 0; - if (r >= 0) { - if (trace.size()) - in = trace[trace.size()-1]->get_inode(); - else - in = mig->cache->get_root(); - } - mig->handle_hash_dir_discover_2(m, in, r); - } -}; - -void Migrator::handle_hash_dir_discover(MHashDirDiscover *m) -{ - assert(m->get_source().num() != mds->get_nodeid()); - - dout(7) << "handle_hash_dir_discover on " << m->get_path() << endl; - - // must discover it! - C_MDC_HashDirDiscover *onfinish = new C_MDC_HashDirDiscover(this, m); - filepath fpath(m->get_path()); - cache->path_traverse(fpath, onfinish->trace, true, - m, new C_MDS_RetryMessage(mds,m), // on delay/retry - MDS_TRAVERSE_DISCOVER, - onfinish); // on completion|error -} - -void Migrator::handle_hash_dir_discover_2(MHashDirDiscover *m, CInode *in, int r) -{ - // yay! - if (in) { - dout(7) << "handle_hash_dir_discover_2 has " << *in << endl; - } - - if (r < 0 || !in->is_dir()) { - dout(7) << "handle_hash_dir_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << endl; - assert(0); // this shouldn't happen if the auth pins his path properly!!!! - } - assert(in->is_dir()); - - // is dir open? 
- if (!in->dir) { - dout(7) << "handle_hash_dir_discover_2 opening dir " << *in << endl; - cache->open_remote_dir(in, - new C_MDS_RetryMessage(mds, m)); - return; - } - CDir *dir = in->dir; - - // pin dir, set hashing flag - dir->state_set(CDIR_STATE_HASHING); - dir->get(CDir::PIN_HASHING); - assert(dir->hashed_subset.empty()); - - // inode state - dir->inode->inode.hash_seed = 1;// dir->ino(); - if (dir->inode->is_auth()) { - dir->inode->_mark_dirty(); // fixme - mds->mdlog->submit_entry(new EString("hash dirty fixme")); - } - - // get gather set ready for notifies - assert(hash_gather[dir].empty()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - if (i == dir->authority()) continue; - hash_gather[dir].insert(i); - } - - // reply - dout(7) << " sending hash_dir_discover_ack on " << *dir << endl; - mds->send_message_mds(new MHashDirDiscoverAck(dir->ino()), - m->get_source().num(), MDS_PORT_MIGRATOR); - delete m; -} - -/* - * prep step: - * peers need to open up all subdirs of the hashed dir - */ - -void Migrator::handle_hash_dir_prep(MHashDirPrep *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_hash_dir_prep " << *dir << endl; - - if (!m->did_assim()) { - m->mark_assim(); // only do this the first time! - - // assimilate dentry+inodes for exports - for (map::iterator it = m->get_inodes().begin(); - it != m->get_inodes().end(); - it++) { - CInode *in = cache->get_inode( it->second->get_ino() ); - if (in) { - it->second->update_inode(in); - dout(5) << " updated " << *in << endl; - } else { - in = new CInode(mds->mdcache, false); - it->second->update_inode(in); - cache->add_inode(in); - - // link - dir->add_dentry( it->first, in ); - dout(5) << " added " << *in << endl; - } - - // open! 
- if (!in->dir) { - dout(5) << " opening nested export on " << *in << endl; - cache->open_remote_dir(in, - new C_MDS_RetryMessage(mds, m)); - } - } - } - - // verify! - int waiting_for = 0; - for (map::iterator it = m->get_inodes().begin(); - it != m->get_inodes().end(); - it++) { - CInode *in = cache->get_inode( it->second->get_ino() ); - assert(in); - - if (in->dir) { - if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) { - dout(5) << " pinning nested export " << *in->dir << endl; - in->dir->get(CDir::PIN_IMPORTINGEXPORT); - in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT); - } else { - dout(5) << " already pinned nested export " << *in << endl; - } - } else { - dout(5) << " waiting for nested export dir on " << *in << endl; - waiting_for++; - } - } - - if (waiting_for) { - dout(5) << "waiting for " << waiting_for << " dirs to open" << endl; - return; - } - - // ack! - mds->send_message_mds(new MHashDirPrepAck(dir->ino()), - m->get_source().num(), MDS_PORT_MIGRATOR); - - // done. - delete m; -} - - -/* - * hash step: - */ - -void Migrator::handle_hash_dir(MHashDir *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - assert(!dir->is_auth()); - assert(!dir->is_hashed()); - assert(dir->is_hashing()); - - dout(5) << "handle_hash_dir " << *dir << endl; - int oldauth = m->get_source().num(); - - // content - import_hashed_content(dir, m->get_state(), m->get_nden(), oldauth); - - // dir state - dir->state_set(CDIR_STATE_HASHED); - dir->get(CDir::PIN_HASHED); - cache->hashdirs.insert(dir); - dir->hashed_subset.insert(mds->get_nodeid()); - - // dir is complete - dir->mark_complete(); - dir->mark_dirty(dir->pre_dirty()); // fixme - mds->mdlog->submit_entry(new EString("dirty dir fixme")); - - // commit - mds->mdstore->commit_dir(dir, 0); - - // send notifies - dout(7) << "sending notifies" << endl; - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - if (i == 
m->get_source().num()) continue; - mds->send_message_mds(new MHashDirNotify(dir->ino(), mds->get_nodeid()), - i, MDS_PORT_MIGRATOR); - } - - // ack - dout(7) << "acking" << endl; - mds->send_message_mds(new MHashDirAck(dir->ino()), - m->get_source().num(), MDS_PORT_MIGRATOR); - - // done. - delete m; - - show_imports(); -} - - - - - -// UNHASH on auth - -class C_MDC_UnhashFreeze : public Context { -public: - Migrator *mig; - CDir *dir; - C_MDC_UnhashFreeze(Migrator *m, CDir *d) : mig(m), dir(d) {} - virtual void finish(int r) { - mig->unhash_dir_frozen(dir); - } -}; - -class C_MDC_UnhashComplete : public Context { -public: - Migrator *mig; - CDir *dir; - C_MDC_UnhashComplete(Migrator *m, CDir *d) : mig(m), dir(d) {} - virtual void finish(int r) { - mig->unhash_dir_complete(dir); - } -}; - - -void Migrator::unhash_dir(CDir *dir) -{ - dout(-7) << "unhash_dir " << *dir << endl; - - assert(dir->is_hashed()); - assert(!dir->is_unhashing()); - assert(dir->is_auth()); - assert(hash_gather.count(dir)==0); - - // pin path? - vector trace; - cache->make_trace(trace, dir->inode); - if (!cache->path_pin(trace, 0, 0)) { - dout(7) << "unhash_dir couldn't pin path, failing." << endl; - return; - } - - // twiddle state - dir->state_set(CDIR_STATE_UNHASHING); - - // first, freeze the dir. 
- dir->freeze_dir(new C_MDC_UnhashFreeze(this, dir)); - - // make complete - if (!dir->is_complete()) { - dout(7) << "unhash_dir " << *dir << " not complete, fetching" << endl; - mds->mdstore->fetch_dir(dir, - new C_MDC_UnhashComplete(this, dir)); - } else - unhash_dir_complete(dir); - -} - -void Migrator::unhash_dir_frozen(CDir *dir) -{ - dout(7) << "unhash_dir_frozen " << *dir << endl; - - assert(dir->is_hashed()); - assert(dir->is_auth()); - assert(dir->is_frozen_dir()); - - if (!dir->is_complete()) { - dout(7) << "unhash_dir_frozen !complete, waiting still on " << *dir << endl; - } else - unhash_dir_prep(dir); -} - - -/* - * ask peers to freeze and complete hashed dir - */ -void Migrator::unhash_dir_prep(CDir *dir) -{ - dout(7) << "unhash_dir_prep " << *dir << endl; - assert(dir->is_hashed()); - assert(dir->is_auth()); - assert(dir->is_frozen_dir()); - assert(dir->is_complete()); - - if (!hash_gather[dir].empty()) return; // already been here..freeze must have been instantaneous - - // send unhash prep to all peers - assert(hash_gather[dir].empty()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - hash_gather[dir].insert(i); - mds->send_message_mds(new MUnhashDirPrep(dir->ino()), - i, MDS_PORT_MIGRATOR); - } -} - -/* - * wait for peers to freeze and complete hashed dirs - */ -void Migrator::handle_unhash_dir_prep_ack(MUnhashDirPrepAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - int from = m->get_source().num(); - dout(7) << "handle_unhash_dir_prep_ack from " << from << " " << *dir << endl; - - if (!m->did_assim()) { - m->mark_assim(); // only do this the first time! 
- - // assimilate dentry+inodes for exports - for (map::iterator it = m->get_inodes().begin(); - it != m->get_inodes().end(); - it++) { - CInode *in = cache->get_inode( it->second->get_ino() ); - if (in) { - it->second->update_inode(in); - dout(5) << " updated " << *in << endl; - } else { - in = new CInode(mds->mdcache, false); - it->second->update_inode(in); - cache->add_inode(in); - - // link - dir->add_dentry( it->first, in ); - dout(5) << " added " << *in << endl; - } - - // open! - if (!in->dir) { - dout(5) << " opening nested export on " << *in << endl; - cache->open_remote_dir(in, - new C_MDS_RetryMessage(mds, m)); - } - } - } - - // verify! - int waiting_for = 0; - for (map::iterator it = m->get_inodes().begin(); - it != m->get_inodes().end(); - it++) { - CInode *in = cache->get_inode( it->second->get_ino() ); - assert(in); - - if (in->dir) { - if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) { - dout(5) << " pinning nested export " << *in->dir << endl; - in->dir->get(CDir::PIN_IMPORTINGEXPORT); - in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT); - } else { - dout(5) << " already pinned nested export " << *in << endl; - } - } else { - dout(5) << " waiting for nested export dir on " << *in << endl; - waiting_for++; - } - } - - if (waiting_for) { - dout(5) << "waiting for " << waiting_for << " dirs to open" << endl; - return; - } - - // ok, done with this PrepAck - assert(hash_gather[dir].count(from) == 1); - hash_gather[dir].erase(from); - - if (hash_gather[dir].empty()) { - hash_gather.erase(dir); - dout(7) << "handle_unhash_dir_prep_ack on " << *dir << ", last one" << endl; - unhash_dir_go(dir); - } else { - dout(7) << "handle_unhash_dir_prep_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl; - } - - delete m; -} - - -/* - * auth: - * send out MHashDir's to peers - */ -void Migrator::unhash_dir_go(CDir *dir) -{ - dout(7) << "unhash_dir_go " << *dir << endl; - assert(dir->is_hashed()); - assert(dir->is_auth()); - 
assert(dir->is_frozen_dir()); - assert(dir->is_complete()); - - // send unhash prep to all peers - assert(hash_gather[dir].empty()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - hash_gather[dir].insert(i); - mds->send_message_mds(new MUnhashDir(dir->ino()), - i, MDS_PORT_MIGRATOR); - } -} - -/* - * auth: - * assimilate unhashing content - */ -void Migrator::handle_unhash_dir_ack(MUnhashDirAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_unhash_dir_ack " << *dir << endl; - assert(dir->is_hashed()); - - // assimilate content - int from = m->get_source().num(); - import_hashed_content(dir, m->get_state(), m->get_nden(), from); - delete m; - - // done? - assert(hash_gather[dir].count(from)); - hash_gather[dir].erase(from); - - if (!hash_gather[dir].empty()) { - dout(7) << "still waiting for unhash acks from " << hash_gather[dir] << endl; - return; - } - - // done! - - // fix up nested_exports - CDir *containing_import = cache->get_auth_container(dir); - if (containing_import != dir) { - for (set::iterator it = cache->nested_exports[dir].begin(); - it != cache->nested_exports[dir].end(); - it++) { - dout(7) << "moving nested export out from under hashed dir : " << **it << endl; - cache->nested_exports[containing_import].insert(*it); - } - cache->nested_exports.erase(dir); - } - - // dir state - //dir->state_clear(CDIR_STATE_UNHASHING); //later - dir->state_clear(CDIR_STATE_HASHED); - dir->put(CDir::PIN_HASHED); - cache->hashdirs.erase(dir); - - // commit! 
- assert(dir->is_complete()); - //dir->mark_complete(); - dir->mark_dirty(dir->pre_dirty()); // fixme - mds->mdstore->commit_dir(dir, 0); - - // inode state - dir->inode->inode.hash_seed = 0; - if (dir->inode->is_auth()) { - dir->inode->_mark_dirty(); // fixme - mds->mdlog->submit_entry(new EString("hash inode dirty fixme")); - } - - // notify - assert(hash_gather[dir].empty()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - - hash_gather[dir].insert(i); - - mds->send_message_mds(new MUnhashDirNotify(dir->ino()), - i, MDS_PORT_MIGRATOR); - } -} - - -/* - * sent by peer to flush mds links. unfreeze when all gathered. - */ -void Migrator::handle_unhash_dir_notify_ack(MUnhashDirNotifyAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_unhash_dir_ack " << *dir << endl; - assert(!dir->is_hashed()); - assert(dir->is_unhashing()); - assert(dir->is_frozen_dir()); - - // done? - int from = m->get_source().num(); - assert(hash_gather[dir].count(from)); - hash_gather[dir].erase(from); - delete m; - - if (!hash_gather[dir].empty()) { - dout(7) << "still waiting for notifyack from " << hash_gather[dir] << " on " << *dir << endl; - } else { - unhash_dir_finish(dir); - } -} - - -/* - * all mds links are flushed. unfreeze dir! - */ -void Migrator::unhash_dir_finish(CDir *dir) -{ - dout(7) << "unhash_dir_finish " << *dir << endl; - hash_gather.erase(dir); - - // unpin path - vector trace; - cache->make_trace(trace, dir->inode); - cache->path_unpin(trace, 0); - - // state - dir->state_clear(CDIR_STATE_UNHASHING); - - // unfreeze - dir->unfreeze_dir(); - -} - - - -// UNHASH on all - -/* - * hashed dir is complete. 
- * mark all migrating inodes dirty (to pin in cache) - * if frozen too, then go to next step (depending on auth) - */ -void Migrator::unhash_dir_complete(CDir *dir) -{ - dout(7) << "unhash_dir_complete " << *dir << ", dirtying inodes" << endl; - - assert(dir->is_hashed()); - assert(dir->is_complete()); - - // mark dirty to pin in cache - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CInode *in = it->second->inode; - if (in->is_auth()) { - in->_mark_dirty(); // fixme - mds->mdlog->submit_entry(new EString("unhash dirty fixme")); - } - } - - if (!dir->is_frozen_dir()) { - dout(7) << "dir complete but !frozen, waiting " << *dir << endl; - } else { - if (dir->is_auth()) - unhash_dir_prep(dir); // auth - else - unhash_dir_prep_finish(dir); // nonauth - } -} - - -// UNHASH on non-auth - -class C_MDC_UnhashPrepFreeze : public Context { -public: - Migrator *mig; - CDir *dir; - C_MDC_UnhashPrepFreeze(Migrator *m, CDir *d) : mig(m), dir(d) {} - virtual void finish(int r) { - mig->unhash_dir_prep_frozen(dir); - } -}; - - -/* - * peers need to freeze their dir and make them complete - */ -void Migrator::handle_unhash_dir_prep(MUnhashDirPrep *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_unhash_dir_prep " << *dir << endl; - assert(dir->is_hashed()); - - // freeze - dir->freeze_dir(new C_MDC_UnhashPrepFreeze(this, dir)); - - // make complete - if (!dir->is_complete()) { - dout(7) << "unhash_dir " << *dir << " not complete, fetching" << endl; - mds->mdstore->fetch_dir(dir, - new C_MDC_UnhashComplete(this, dir)); - } else { - unhash_dir_complete(dir); - } - - delete m; -} - -/* - * peer has hashed dir frozen. - * complete too? 
- */ -void Migrator::unhash_dir_prep_frozen(CDir *dir) -{ - dout(7) << "unhash_dir_prep_frozen " << *dir << endl; - - assert(dir->is_hashed()); - assert(dir->is_frozen_dir()); - assert(!dir->is_auth()); - - if (!dir->is_complete()) { - dout(7) << "unhash_dir_prep_frozen !complete, waiting still on " << *dir << endl; - } else - unhash_dir_prep_finish(dir); -} - -/* - * peer has hashed dir complete and frozen. ack. - */ -void Migrator::unhash_dir_prep_finish(CDir *dir) -{ - dout(7) << "unhash_dir_prep_finish " << *dir << endl; - assert(dir->is_hashed()); - assert(!dir->is_auth()); - assert(dir->is_frozen()); - assert(dir->is_complete()); - - // twiddle state - if (dir->is_unhashing()) - return; // already replied. - dir->state_set(CDIR_STATE_UNHASHING); - - // send subdirs back to auth - MUnhashDirPrepAck *ack = new MUnhashDirPrepAck(dir->ino()); - int auth = dir->authority(); - - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - CInode *in = dn->inode; - - if (!in->is_dir()) continue; - if (!in->dir) continue; - - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); - if (dentryhashcode != mds->get_nodeid()) continue; - - // msg? - ack->add_inode(it->first, in->replicate_to(auth)); - } - - // ack - mds->send_message_mds(ack, auth, MDS_PORT_MIGRATOR); -} - - - -/* - * peer needs to send hashed dir content back to auth. - * unhash dir. - */ -void Migrator::handle_unhash_dir(MUnhashDir *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_unhash_dir " << *dir << endl;//" .. 
hash_seed is " << dir->inode->inode.hash_seed << endl; - assert(dir->is_hashed()); - assert(dir->is_unhashing()); - assert(!dir->is_auth()); - - // get message ready - bufferlist bl; - int nden = 0; - - // suck up all waiters - C_Contexts *fin = new C_Contexts; - list waiting; - dir->take_waiting(CDIR_WAIT_ANY, waiting); // all dir waiters - fin->take(waiting); - - // divy up contents - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - CInode *in = dn->inode; - - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); - if (dentryhashcode != mds->get_nodeid()) { - // not mine! - // twiddle dir_auth? - if (in->dir) { - if (in->dir->authority() != dir->authority()) - in->dir->set_dir_auth( in->dir->authority() ); - else - in->dir->set_dir_auth( CDIR_AUTH_PARENT ); - } - continue; - } - - // -- dentry - dout(7) << "unhash_dir_go sending to " << dentryhashcode << " dn " << *dn << endl; - _encode(it->first, bl); - - // null dentry? - if (dn->is_null()) { - bl.append("N", 1); // null dentry - assert(dn->is_sync()); - continue; - } - - if (dn->is_remote()) { - // remote link - bl.append("L", 1); // remote link - - inodeno_t ino = dn->get_remote_ino(); - bl.append((char*)&ino, sizeof(ino)); - continue; - } - - // primary link - // -- inode - bl.append("I", 1); // inode dentry - - encode_export_inode(in, bl, dentryhashcode); // encode, and (update state for) export - nden++; - - if (dn->is_dirty()) - dn->mark_clean(); - - // proxy - in->state_set(CInode::STATE_PROXY); - in->get(CInode::PIN_PROXY); - hash_proxy_inos[dir].push_back(in); - - if (in->dir) { - if (in->dir->is_auth()) { - // mine. make it into an import. - dout(7) << "making subdir into import " << *in->dir << endl; - in->dir->set_dir_auth( mds->get_nodeid() ); - cache->imports.insert(in->dir); - in->dir->get(CDir::PIN_IMPORT); - in->dir->state_set(CDIR_STATE_IMPORT); - } - else { - // not mine. 
- dout(7) << "un-exporting subdir that's being unhashed away " << *in->dir << endl; - assert(in->dir->is_export()); - in->dir->put(CDir::PIN_EXPORT); - in->dir->state_clear(CDIR_STATE_EXPORT); - cache->exports.erase(in->dir); - cache->nested_exports[dir].erase(in->dir); - } - } - - // waiters - list waiters; - in->take_waiting(CINODE_WAIT_ANY, waiters); - fin->take(waiters); - } - - // we should have no nested exports; we're not auth for the dir! - assert(cache->nested_exports[dir].empty()); - cache->nested_exports.erase(dir); - - // dir state - //dir->state_clear(CDIR_STATE_UNHASHING); // later - dir->state_clear(CDIR_STATE_HASHED); - dir->put(CDir::PIN_HASHED); - cache->hashdirs.erase(dir); - dir->mark_clean(); - - // inode state - dir->inode->inode.hash_seed = 0; - if (dir->inode->is_auth()) { - dir->inode->_mark_dirty(); // fixme - mds->mdlog->submit_entry(new EString("unhash inode dirty fixme")); - } - - // init gather set - mds->get_mds_map()->get_active_mds_set( hash_gather[dir] ); - hash_gather[dir].erase(mds->get_nodeid()); - - // send unhash message - mds->send_message_mds(new MUnhashDirAck(dir->ino(), bl, nden), - dir->authority(), MDS_PORT_MIGRATOR); -} - - -/* - * first notify comes from auth. - * send notifies to all other peers, with peer = self - * if we get notify from peer=other, remove from our gather list. - * when we've gotten notifies from everyone, - * unpin proxies, - * send notify_ack to auth. - * this ensures that all mds links are flushed of cache_expire type messages. - */ -void Migrator::handle_unhash_dir_notify(MUnhashDirNotify *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_unhash_dir_finish " << *dir << endl; - assert(!dir->is_hashed()); - assert(dir->is_unhashing()); - assert(!dir->is_auth()); - - int from = m->get_source().num(); - assert(hash_gather[dir].count(from) == 1); - hash_gather[dir].erase(from); - delete m; - - // did we send our shout out? 
- if (from == dir->authority()) { - // send notify to everyone else in weird chatter storm - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == from) continue; - if (i == mds->get_nodeid()) continue; - mds->send_message_mds(new MUnhashDirNotify(dir->ino()), i, MDS_PORT_MIGRATOR); - } - } - - // are we done? - if (!hash_gather[dir].empty()) { - dout(7) << "still waiting for notify from " << hash_gather[dir] << endl; - return; - } - hash_gather.erase(dir); - - // all done! - dout(7) << "all mds links flushed, unpinning unhash proxies" << endl; - - // unpin proxies - for (list::iterator it = hash_proxy_inos[dir].begin(); - it != hash_proxy_inos[dir].end(); - it++) { - CInode *in = *it; - assert(in->state_test(CInode::STATE_PROXY)); - in->state_clear(CInode::STATE_PROXY); - in->put(CInode::PIN_PROXY); - } - - // unfreeze - dir->unfreeze_dir(); - - // ack - dout(7) << "sending notify_ack to auth for unhash of " << *dir << endl; - mds->send_message_mds(new MUnhashDirNotifyAck(dir->ino()), dir->authority(), MDS_PORT_MIGRATOR); - -} - - - - -void Migrator::show_imports() -{ - mds->balancer->show_imports(); -} diff --git a/branches/marnberg/quota/mds/Migrator.h b/branches/marnberg/quota/mds/Migrator.h deleted file mode 100644 index dd2886008d163..0000000000000 --- a/branches/marnberg/quota/mds/Migrator.h +++ /dev/null @@ -1,265 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_MIGRATOR_H -#define __MDS_MIGRATOR_H - -#include "include/types.h" - -#include -#include -#include -using std::map; -using std::list; -using std::set; - - -class MDS; -class CDir; -class CInode; -class CDentry; - -class MExportDir; -class MExportDirDiscover; -class MExportDirDiscoverAck; -class MExportDirPrep; -class MExportDirPrepAck; -class MExportDirWarning; -class MExportDirNotify; -class MExportDirNotifyAck; -class MExportDirFinish; - -class MHashDirDiscover; -class MHashDirDiscoverAck; -class MHashDirPrep; -class MHashDirPrepAck; -class MHashDir; -class MHashDirAck; -class MHashDirNotify; - -class MUnhashDirPrep; -class MUnhashDirPrepAck; -class MUnhashDir; -class MUnhashDirAck; -class MUnhashDirNotify; -class MUnhashDirNotifyAck; - -class EImportStart; - -class Migrator { -private: - MDS *mds; - MDCache *cache; - - // -- exports -- - // export stages. used to clean up intelligently if there's a failure. - const static int EXPORT_DISCOVERING = 1; // dest is disovering export dir - const static int EXPORT_FREEZING = 2; // we're freezing the dir tree - const static int EXPORT_LOGGINGSTART = 3; // we're logging EExportStart - const static int EXPORT_PREPPING = 4; // sending dest spanning tree to export bounds - const static int EXPORT_EXPORTING = 5; // sent actual export, waiting for acks - const static int EXPORT_LOGGINGFINISH = 6; // logging EExportFinish - - // export fun - map export_state; - map export_peer; - map > export_bounds; - map > export_data; // only during EXPORTING state - map > export_notify_ack_waiting; // nodes i am waiting to get export_notify_ack's from - map > export_proxy_inos; - map > export_proxy_dirinos; - - map > export_finish_waiters; - - set stray_export_warnings; // notifies i haven't seen - map stray_export_notifies; - - - // -- imports -- - const static int IMPORT_DISCOVERED = 1; // waiting for prep - const static int IMPORT_PREPPING = 2; // opening dirs on bounds - const static int IMPORT_PREPPED = 3; // 
opened bounds, waiting for import - const static int IMPORT_LOGGINGSTART = 4; // got import, logging EImportStart - const static int IMPORT_ACKING = 5; // logged, sent acks - const static int IMPORT_LOGGINGFINISH = 6; - - map import_state; - map import_peer; - map > import_bounds; - - - // -- hashing madness -- - multimap unhash_waiting; // nodes i am waiting for UnhashDirAck's from - multimap import_hashed_replicate_waiting; // nodes i am waiting to discover to complete my import of a hashed dir - // maps frozen_dir_ino's to waiting-for-discover ino's. - multimap import_hashed_frozen_waiting; // dirs i froze (for the above) - - - -public: - // -- cons -- - Migrator(MDS *m, MDCache *c) : mds(m), cache(c) {} - - void dispatch(Message*); - - - // -- status -- - int is_exporting(CDir *dir) { - if (export_state.count(dir)) return export_state[dir]; - return 0; - } - bool is_exporting() { return !export_state.empty(); } - int is_importing(inodeno_t dirino) { - if (import_state.count(dirino)) return import_state[dirino]; - return 0; - } - bool is_importing() { return !import_state.empty(); } - const set& get_import_bounds(inodeno_t base) { - assert(import_bounds.count(base)); - return import_bounds[base]; - } - - - // -- misc -- - void handle_mds_failure(int who); - void show_imports(); - - - // -- import/export -- - // exporter - public: - void export_dir(CDir *dir, - int mds); - void export_empty_import(CDir *dir); - - void encode_export_inode(CInode *in, bufferlist& enc_state, int newauth); - void decode_import_inode(CDentry *dn, bufferlist& bl, int &off, int oldauth); - - void add_export_finish_waiter(CDir *dir, Context *c) { - export_finish_waiters[dir].push_back(c); - } - void clear_export_proxy_pins(CDir *dir); - - protected: - void handle_export_dir_discover_ack(MExportDirDiscoverAck *m); - void export_dir_frozen(CDir *dir, int dest); - void export_dir_frozen_logged(CDir *dir, MExportDirPrep *prep, int dest); - void handle_export_dir_prep_ack(MExportDirPrepAck 
*m); - void export_dir_go(CDir *dir, - int dest); - int encode_export_dir(list& dirstatelist, - class C_Contexts *fin, - CDir *basedir, - CDir *dir, - int newauth); - void handle_export_dir_notify_ack(MExportDirNotifyAck *m); - void reverse_export(CDir *dir); - void export_dir_acked(CDir *dir); - void export_dir_finish(CDir *dir); - - friend class C_MDC_ExportFreeze; - friend class C_MDC_ExportStartLogged; - friend class C_MDS_ExportFinishLogged; - // importer - void handle_export_dir_discover(MExportDirDiscover *m); - void handle_export_dir_discover_2(MExportDirDiscover *m, CInode *in, int r); - void handle_export_dir_prep(MExportDirPrep *m); - void handle_export_dir(MExportDir *m); - void import_dir_logged_start(CDir *dir, int from, - list &imported_subdirs, - list &exports); - void import_dir_logged_finish(CDir *dir); - void handle_export_dir_finish(MExportDirFinish *m); - int decode_import_dir(bufferlist& bl, - int oldauth, - CDir *import_root, - list& imported_subdirs, - EImportStart *le); - void got_hashed_replica(CDir *import, - inodeno_t dir_ino, - inodeno_t replica_ino); - - friend class C_MDC_ExportDirDiscover; - friend class C_MDS_ImportDirLoggedStart; - friend class C_MDS_ImportDirLoggedFinish; - - // bystander - void handle_export_dir_warning(MExportDirWarning *m); - void handle_export_dir_notify(MExportDirNotify *m); - - - // -- hashed directories -- - - // HASH - public: - void hash_dir(CDir *dir); // on auth - protected: - map< CDir*, set > hash_gather; - map< CDir*, map< int, set > > hash_notify_gather; - map< CDir*, list > hash_proxy_inos; - - // hash on auth - void handle_hash_dir_discover_ack(MHashDirDiscoverAck *m); - void hash_dir_complete(CDir *dir); - void hash_dir_frozen(CDir *dir); - void handle_hash_dir_prep_ack(MHashDirPrepAck *m); - void hash_dir_go(CDir *dir); - void handle_hash_dir_ack(MHashDirAck *m); - void hash_dir_finish(CDir *dir); - friend class C_MDC_HashFreeze; - friend class C_MDC_HashComplete; - - // auth and non-auth - void 
handle_hash_dir_notify(MHashDirNotify *m); - - // hash on non-auth - void handle_hash_dir_discover(MHashDirDiscover *m); - void handle_hash_dir_discover_2(MHashDirDiscover *m, CInode *in, int r); - void handle_hash_dir_prep(MHashDirPrep *m); - void handle_hash_dir(MHashDir *m); - friend class C_MDC_HashDirDiscover; - - // UNHASH - public: - void unhash_dir(CDir *dir); // on auth - protected: - map< CDir*, list > unhash_content; - void import_hashed_content(CDir *dir, bufferlist& bl, int nden, int oldauth); - - // unhash on auth - void unhash_dir_frozen(CDir *dir); - void unhash_dir_prep(CDir *dir); - void handle_unhash_dir_prep_ack(MUnhashDirPrepAck *m); - void unhash_dir_go(CDir *dir); - void handle_unhash_dir_ack(MUnhashDirAck *m); - void handle_unhash_dir_notify_ack(MUnhashDirNotifyAck *m); - void unhash_dir_finish(CDir *dir); - friend class C_MDC_UnhashFreeze; - friend class C_MDC_UnhashComplete; - - // unhash on all - void unhash_dir_complete(CDir *dir); - - // unhash on non-auth - void handle_unhash_dir_prep(MUnhashDirPrep *m); - void unhash_dir_prep_frozen(CDir *dir); - void unhash_dir_prep_finish(CDir *dir); - void handle_unhash_dir(MUnhashDir *m); - void handle_unhash_dir_notify(MUnhashDirNotify *m); - friend class C_MDC_UnhashPrepFreeze; - - -}; - - -#endif diff --git a/branches/marnberg/quota/mds/Renamer.cc b/branches/marnberg/quota/mds/Renamer.cc deleted file mode 100644 index cf7d79170f479..0000000000000 --- a/branches/marnberg/quota/mds/Renamer.cc +++ /dev/null @@ -1,918 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "MDCache.h" -#include "MDStore.h" -#include "CInode.h" -#include "CDir.h" -#include "MDS.h" -#include "MDSMap.h" -#include "MDLog.h" -#include "AnchorClient.h" -#include "Migrator.h" -#include "Renamer.h" - -#include "include/filepath.h" - -#include "msg/Message.h" -#include "msg/Messenger.h" - -#include "events/EString.h" -#include "events/EUnlink.h" - -#include "messages/MRenameWarning.h" -#include "messages/MRenameNotify.h" -#include "messages/MRenameNotifyAck.h" -#include "messages/MRename.h" -#include "messages/MRenameAck.h" -#include "messages/MRenameReq.h" -#include "messages/MRenamePrep.h" - - - -void Renamer::dispatch(Message *m) -{ - switch (m->get_type()) { - case MSG_MDS_RENAMEWARNING: - handle_rename_warning((MRenameWarning*)m); - break; - case MSG_MDS_RENAMENOTIFY: - handle_rename_notify((MRenameNotify*)m); - break; - case MSG_MDS_RENAMENOTIFYACK: - handle_rename_notify_ack((MRenameNotifyAck*)m); - break; - case MSG_MDS_RENAME: - handle_rename((MRename*)m); - break; - case MSG_MDS_RENAMEREQ: - handle_rename_req((MRenameReq*)m); - break; - case MSG_MDS_RENAMEPREP: - handle_rename_prep((MRenamePrep*)m); - break; - case MSG_MDS_RENAMEACK: - handle_rename_ack((MRenameAck*)m); - break; - - default: - assert(0); - } -} - - -// renaming! 
- - -/* - fix_renamed_dir(): - - caller has already: - - relinked inode in new location - - fixed in->is_auth() - - set dir_auth, if appropriate - - caller has not: - - touched in->dir - - updated import/export tables -*/ -void Renamer::fix_renamed_dir(CDir *srcdir, - CInode *in, - CDir *destdir, - bool authchanged, // _inode_ auth - int dir_auth) // dir auth (for certain cases) -{ - dout(7) << "fix_renamed_dir on " << *in << endl; - dout(7) << "fix_renamed_dir on " << *in->dir << endl; - - if (in->dir->is_auth()) { - // dir ours - dout(7) << "dir is auth" << endl; - assert(!in->dir->is_export()); - - if (in->is_auth()) { - // inode now ours - - if (authchanged) { - // inode _was_ replica, now ours - dout(7) << "inode was replica, now ours. removing from import list." << endl; - assert(in->dir->is_import()); - - // not import anymore! - cache->imports.erase(in->dir); - in->dir->state_clear(CDIR_STATE_IMPORT); - in->dir->put(CDir::PIN_IMPORT); - - in->dir->set_dir_auth( CDIR_AUTH_PARENT ); - dout(7) << " fixing dir_auth to be " << in->dir->get_dir_auth() << endl; - - // move my nested imports to in's containing import - CDir *con = cache->get_auth_container(in->dir); - assert(con); - for (set::iterator p = cache->nested_exports[in->dir].begin(); - p != cache->nested_exports[in->dir].end(); - p++) { - dout(7) << "moving nested export under new container " << *con << endl; - cache->nested_exports[con].insert(*p); - } - cache->nested_exports.erase(in->dir); - - } else { - // inode was ours, still ours. - dout(7) << "inode was ours, still ours." << endl; - assert(!in->dir->is_import()); - assert(in->dir->get_dir_auth() == CDIR_AUTH_PARENT); - - // move any exports nested beneath me? 
- CDir *newcon = cache->get_auth_container(in->dir); - assert(newcon); - CDir *oldcon = cache->get_auth_container(srcdir); - assert(oldcon); - if (newcon != oldcon) { - dout(7) << "moving nested exports under new container" << endl; - set nested; - cache->find_nested_exports_under(oldcon, in->dir, nested); - for (set::iterator it = nested.begin(); - it != nested.end(); - it++) { - dout(7) << "moving nested export " << *it << " under new container" << endl; - cache->nested_exports[oldcon].erase(*it); - cache->nested_exports[newcon].insert(*it); - } - } - } - - } else { - // inode now replica - - if (authchanged) { - // inode was ours, but now replica - dout(7) << "inode was ours, now replica. adding to import list." << endl; - - // i am now an import - cache->imports.insert(in->dir); - in->dir->state_set(CDIR_STATE_IMPORT); - in->dir->get(CDir::PIN_IMPORT); - - in->dir->set_dir_auth( mds->get_nodeid() ); - dout(7) << " fixing dir_auth to be " << in->dir->get_dir_auth() << endl; - - // find old import - CDir *oldcon = cache->get_auth_container(srcdir); - assert(oldcon); - dout(7) << " oldcon is " << *oldcon << endl; - - // move nested exports under me - set nested; - cache->find_nested_exports_under(oldcon, in->dir, nested); - for (set::iterator it = nested.begin(); - it != nested.end(); - it++) { - dout(7) << "moving nested export " << *it << " under me" << endl; - cache->nested_exports[oldcon].erase(*it); - cache->nested_exports[in->dir].insert(*it); - } - - } else { - // inode was replica, still replica - dout(7) << "inode was replica, still replica. doing nothing." << endl; - assert(in->dir->is_import()); - - // verify dir_auth - assert(in->dir->get_dir_auth() == mds->get_nodeid()); // me, because i'm auth for dir. - assert(in->authority() != in->dir->get_dir_auth()); // inode not me. 
- } - - assert(in->dir->is_import()); - } - - } else { - // dir is not ours - dout(7) << "dir is not auth" << endl; - - if (in->is_auth()) { - // inode now ours - - if (authchanged) { - // inode was replica, now ours - dout(7) << "inode was replica, now ours. now an export." << endl; - assert(!in->dir->is_export()); - - // now export - cache->exports.insert(in->dir); - in->dir->state_set(CDIR_STATE_EXPORT); - in->dir->get(CDir::PIN_EXPORT); - - assert(dir_auth >= 0); // better be defined - in->dir->set_dir_auth( dir_auth ); - dout(7) << " fixing dir_auth to be " << in->dir->get_dir_auth() << endl; - - CDir *newcon = cache->get_auth_container(in->dir); - assert(newcon); - cache->nested_exports[newcon].insert(in->dir); - - } else { - // inode was ours, still ours - dout(7) << "inode was ours, still ours. did my import change?" << endl; - - // sanity - assert(in->dir->is_export()); - assert(in->dir->get_dir_auth() >= 0); - assert(in->dir->get_dir_auth() != in->authority()); - - // moved under new import? - CDir *oldcon = cache->get_auth_container(srcdir); - CDir *newcon = cache->get_auth_container(in->dir); - if (oldcon != newcon) { - dout(7) << "moving myself under new import " << *newcon << endl; - cache->nested_exports[oldcon].erase(in->dir); - cache->nested_exports[newcon].insert(in->dir); - } - } - - assert(in->dir->is_export()); - } else { - // inode now replica - - if (authchanged) { - // inode was ours, now replica - dout(7) << "inode was ours, now replica. removing from export list." 
<< endl; - assert(in->dir->is_export()); - - // remove from export list - cache->exports.erase(in->dir); - in->dir->state_clear(CDIR_STATE_EXPORT); - in->dir->put(CDir::PIN_EXPORT); - - CDir *oldcon = cache->get_auth_container(srcdir); - assert(oldcon); - assert(cache->nested_exports[oldcon].count(in->dir) == 1); - cache->nested_exports[oldcon].erase(in->dir); - - // simplify dir_auth - if (in->authority() == in->dir->authority()) { - in->dir->set_dir_auth( CDIR_AUTH_PARENT ); - dout(7) << "simplified dir_auth to -1, inode auth is (also) " << in->authority() << endl; - } else { - assert(in->dir->get_dir_auth() >= 0); // someone else's export, - } - - } else { - // inode was replica, still replica - dout(7) << "inode was replica, still replica. do nothing." << endl; - - // fix dir_auth? - if (in->authority() == dir_auth) - in->dir->set_dir_auth( CDIR_AUTH_PARENT ); - else - in->dir->set_dir_auth( dir_auth ); - dout(7) << " fixing dir_auth to be " << dir_auth << endl; - - // do nothing. - } - - assert(!in->dir->is_export()); - } - } - - cache->show_imports(); -} - -/* - * when initiator gets an ack back for a foreign rename - */ - -class C_MDC_RenameNotifyAck : public Context { - Renamer *rn; - CInode *in; - int initiator; - -public: - C_MDC_RenameNotifyAck(Renamer *r, - CInode *i, int init) : rn(r), in(i), initiator(init) {} - void finish(int r) { - rn->file_rename_ack(in, initiator); - } -}; - - - -/************** initiator ****************/ - -/* - * when we get MRenameAck (and rename is done, notifies gone out+acked, etc.) 
- */ -class C_MDC_RenameAck : public Context { - Renamer *mdc; - CDir *srcdir; - CInode *in; - Context *c; -public: - C_MDC_RenameAck(Renamer *mdc, CDir *srcdir, CInode *in, Context *c) { - this->mdc = mdc; - this->srcdir = srcdir; - this->in = in; - this->c = c; - } - void finish(int r) { - mdc->file_rename_finish(srcdir, in, c); - } -}; - - -void Renamer::file_rename(CDentry *srcdn, CDentry *destdn, Context *onfinish) -{ - assert(srcdn->is_xlocked()); // by me - assert(destdn->is_xlocked()); // by me - - CDir *srcdir = srcdn->dir; - string srcname = srcdn->name; - - CDir *destdir = destdn->dir; - string destname = destdn->name; - - CInode *in = srcdn->inode; - //Message *req = srcdn->xlockedby; - - - // determine the players - int srcauth = srcdir->dentry_authority(srcdn->name); - int destauth = destdir->dentry_authority(destname); - - - // FOREIGN rename? - if (srcauth != mds->get_nodeid() || - destauth != mds->get_nodeid()) { - dout(7) << "foreign rename. srcauth " << srcauth << ", destauth " << destauth << ", isdir " << srcdn->inode->is_dir() << endl; - - string destpath; - destdn->make_path(destpath); - - if (destauth != mds->get_nodeid()) { - // make sure dest has dir open. - dout(7) << "file_rename i'm not dest auth. sending MRenamePrep to " << destauth << endl; - - // prep dest first, they must have the dir open! rest will follow. - string srcpath; - srcdn->make_path(srcpath); - - MRenamePrep *m = new MRenamePrep(mds->get_nodeid(), // i'm the initiator - srcdir->ino(), srcname, srcpath, - destdir->ino(), destname, destpath, - srcauth); // tell dest who src is (maybe even me) - mds->send_message_mds(m, destauth, MDS_PORT_CACHE); - - cache->show_imports(); - - } - - else if (srcauth != mds->get_nodeid()) { - if (destauth == mds->get_nodeid()) { - dout(7) << "file_rename dest auth, not src auth. sending MRenameReq" << endl; - } else { - dout(7) << "file_rename neither src auth nor dest auth. 
sending MRenameReq" << endl; - } - - // srcdn not important on destauth, just request - MRenameReq *m = new MRenameReq(mds->get_nodeid(), // i'm the initiator - srcdir->ino(), srcname, - destdir->ino(), destname, destpath, destauth); // tell src who dest is (they may not know) - mds->send_message_mds(m, srcauth, MDS_PORT_CACHE); - } - - else - assert(0); - - // set waiter on the inode (is this the best place?) - in->add_waiter(CINODE_WAIT_RENAMEACK, - new C_MDC_RenameAck(this, - srcdir, in, onfinish)); - return; - } - - // LOCAL rename! - assert(srcauth == mds->get_nodeid() && destauth == mds->get_nodeid()); - dout(7) << "file_rename src and dest auth, renaming locally (easy!)" << endl; - - // update our cache - if (destdn->inode && destdn->inode->is_dirty()) - destdn->inode->mark_clean(); - - cache->rename_file(srcdn, destdn); - - // update imports/exports? - if (in->is_dir() && in->dir) - fix_renamed_dir(srcdir, in, destdir, false); // auth didnt change - - // mark dentries dirty - srcdn->_mark_dirty(); // fixme - destdn->_mark_dirty(); // fixme - in->_mark_dirty(); // fixme - - - // local, restrict notify to ppl with open dirs - set notify; - for (map::iterator it = srcdir->replicas_begin(); - it != srcdir->replicas_end(); - ++it) - notify.insert(it->first); - for (map::iterator it = destdir->replicas_begin(); - it != destdir->replicas_end(); - it++) - if (notify.count(it->first) == 0) notify.insert(it->first); - - if (notify.size()) { - // warn + notify - file_rename_warn(in, notify); - file_rename_notify(in, srcdir, srcname, destdir, destname, notify, mds->get_nodeid()); - - // wait for MRenameNotifyAck's - in->add_waiter(CINODE_WAIT_RENAMENOTIFYACK, - new C_MDC_RenameNotifyAck(this, in, mds->get_nodeid())); // i am initiator - - // wait for finish - in->add_waiter(CINODE_WAIT_RENAMEACK, - new C_MDC_RenameAck(this, srcdir, in, onfinish)); - } else { - // sweet, no notify necessary, we're done! 
- file_rename_finish(srcdir, in, onfinish); - } -} - -void Renamer::handle_rename_ack(MRenameAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - - dout(7) << "handle_rename_ack on " << *in << endl; - - // all done! - in->finish_waiting(CINODE_WAIT_RENAMEACK); - - delete m; -} - -void Renamer::file_rename_finish(CDir *srcdir, CInode *in, Context *c) -{ - dout(10) << "file_rename_finish on " << *in << endl; - - // did i empty out an imported dir? FIXME this check should go somewhere else??? - if (srcdir->is_import() && !srcdir->inode->is_root() && srcdir->get_size() == 0) - cache->migrator->export_empty_import(srcdir); - - // finish our caller - if (c) { - c->finish(0); - delete c; - } -} - - -/************* src **************/ - - -/** handle_rename_req - * received by auth of src dentry (from init, or destauth if dir). - * src may not have dest dir open. - * src will export inode, unlink|rename, and send MRename to dest. - */ -void Renamer::handle_rename_req(MRenameReq *m) -{ - // i am auth, i will have it. 
- CInode *srcdiri = cache->get_inode(m->get_srcdirino()); - CDir *srcdir = srcdiri->dir; - CDentry *srcdn = srcdir->lookup(m->get_srcname()); - assert(srcdn); - - // do it - file_rename_foreign_src(srcdn, - m->get_destdirino(), m->get_destname(), m->get_destpath(), m->get_destauth(), - m->get_initiator()); - delete m; -} - - -void Renamer::file_rename_foreign_src(CDentry *srcdn, - inodeno_t destdirino, string& destname, string& destpath, int destauth, - int initiator) -{ - dout(7) << "file_rename_foreign_src " << *srcdn << endl; - - CDir *srcdir = srcdn->dir; - string srcname = srcdn->name; - - // (we're basically exporting this inode) - CInode *in = srcdn->inode; - assert(in); - assert(in->is_auth()); - - if (in->is_dir()) cache->show_imports(); - - // encode and export inode state - bufferlist inode_state; - cache->migrator->encode_export_inode(in, inode_state, destauth); - - // send - MRename *m = new MRename(initiator, - srcdir->ino(), srcdn->name, destdirino, destname, - inode_state); - mds->send_message_mds(m, destauth, MDS_PORT_CACHE); - - // have dest? - CInode *destdiri = cache->get_inode(m->get_destdirino()); - CDir *destdir = 0; - if (destdiri) destdir = destdiri->dir; - CDentry *destdn = 0; - if (destdir) destdn = destdir->lookup(m->get_destname()); - - // discover src - if (!destdn) { - dout(7) << "file_rename_foreign_src doesn't have destdn, discovering " << destpath << endl; - - filepath destfilepath = destpath; - vector trace; - int r = cache->path_traverse(destfilepath, trace, true, - m, new C_MDS_RetryMessage(mds, m), - MDS_TRAVERSE_DISCOVER); - assert(r>0); - return; - } - - assert(destdn); - - // update our cache - cache->rename_file(srcdn, destdn); - - // update imports/exports? - if (in->is_dir() && in->dir) - fix_renamed_dir(srcdir, in, destdir, true); // auth changed - - srcdn->_mark_dirty(); // fixme - - // proxy! 
- in->state_set(CInode::STATE_PROXY); - in->get(CInode::PIN_PROXY); - - // generate notify list (everybody but src|dst) and send warnings - set notify; - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i != mds->get_nodeid() && // except the source - i != destauth) // and the dest - notify.insert(i); - } - file_rename_warn(in, notify); - - - // wait for MRenameNotifyAck's - in->add_waiter(CINODE_WAIT_RENAMENOTIFYACK, - new C_MDC_RenameNotifyAck(this, in, initiator)); -} - -void Renamer::file_rename_warn(CInode *in, - set& notify) -{ - // note gather list - rename_waiting_for_ack[in->ino()] = notify; - - // send - for (set::iterator it = notify.begin(); - it != notify.end(); - it++) { - dout(10) << "file_rename_warn to " << *it << " for " << *in << endl; - mds->send_message_mds(new MRenameWarning(in->ino()), *it, MDS_PORT_CACHE); - } -} - - -void Renamer::handle_rename_notify_ack(MRenameNotifyAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - dout(7) << "handle_rename_notify_ack on " << *in << endl; - - int source = m->get_source().num(); - rename_waiting_for_ack[in->ino()].erase(source); - if (rename_waiting_for_ack[in->ino()].empty()) { - // last one! - rename_waiting_for_ack.erase(in->ino()); - in->finish_waiting(CINODE_WAIT_RENAMENOTIFYACK, 0); - } else { - dout(7) << "still waiting for " << rename_waiting_for_ack[in->ino()] << endl; - } -} - - -void Renamer::file_rename_ack(CInode *in, int initiator) -{ - // we got all our MNotifyAck's. - - // was i proxy (if not, it's cuz this was a local rename) - if (in->state_test(CInode::STATE_PROXY)) { - dout(10) << "file_rename_ack clearing proxy bit on " << *in << endl; - in->state_clear(CInode::STATE_PROXY); - in->put(CInode::PIN_PROXY); - } - - // done! 
- if (initiator == mds->get_nodeid()) { - // it's me, finish - dout(7) << "file_rename_ack i am initiator, finishing" << endl; - in->finish_waiting(CINODE_WAIT_RENAMEACK); - } else { - // send ack - dout(7) << "file_rename_ack sending MRenameAck to initiator " << initiator << endl; - mds->send_message_mds(new MRenameAck(in->ino()), initiator, MDS_PORT_CACHE); - } -} - - - - -/************ dest *************/ - -/** handle_rename_prep - * received by auth of dest dentry to make sure they have src + dir open. - * this is so that when they get the inode and dir, they can update exports etc properly. - * will send MRenameReq to src. - */ -void Renamer::handle_rename_prep(MRenamePrep *m) -{ - // open src - filepath srcpath = m->get_srcpath(); - vector trace; - int r = cache->path_traverse(srcpath, trace, false, - m, new C_MDS_RetryMessage(mds, m), - MDS_TRAVERSE_DISCOVER); - - if (r>0) return; - - // ok! - CInode *srcin = trace[trace.size()-1]->inode; - assert(srcin); - - dout(7) << "handle_rename_prep have srcin " << *srcin << endl; - - if (srcin->is_dir()) { - if (!srcin->dir) { - dout(7) << "handle_rename_prep need to open dir" << endl; - cache->open_remote_dir(srcin, - new C_MDS_RetryMessage(mds,m)); - return; - } - - dout(7) << "handle_rename_prep have dir " << *srcin->dir << endl; - } - - // pin - srcin->get(CInode::PIN_RENAMESRC); - - // send rename request - MRenameReq *req = new MRenameReq(m->get_initiator(), // i'm the initiator - m->get_srcdirino(), m->get_srcname(), - m->get_destdirino(), m->get_destname(), m->get_destpath(), - mds->get_nodeid()); // i am dest - mds->send_message_mds(req, m->get_srcauth(), MDS_PORT_CACHE); - delete m; - return; -} - - - -/** handle_rename - * received by auth of dest dentry. includes exported inode info. - * dest may not have srcdir open. 
- */ -void Renamer::handle_rename(MRename *m) -{ - // srcdn (required) - CInode *srcdiri = cache->get_inode(m->get_srcdirino()); - CDir *srcdir = srcdiri->dir; - CDentry *srcdn = srcdir->lookup(m->get_srcname()); - string srcname = srcdn->name; - assert(srcdn && srcdn->inode); - - dout(7) << "handle_rename srcdn " << *srcdn << endl; - - // destdn (required). i am auth, so i will have it. - CInode *destdiri = cache->get_inode(m->get_destdirino()); - CDir *destdir = destdiri->dir; - CDentry *destdn = destdir->lookup(m->get_destname()); - string destname = destdn->name; - assert(destdn); - - dout(7) << "handle_rename destdn " << *destdn << endl; - - // note old dir auth - int old_dir_auth = -1; - if (srcdn->inode->dir) old_dir_auth = srcdn->inode->dir->authority(); - - // rename replica into position - if (destdn->inode && destdn->inode->is_dirty()) - destdn->inode->mark_clean(); - - cache->rename_file(srcdn, destdn); - - // decode + import inode (into new location start) - int off = 0; - // HACK - bufferlist bufstate; - bufstate.claim_append(m->get_inode_state()); - cache->migrator->decode_import_inode(destdn, bufstate, off, m->get_source().num()); - - CInode *in = destdn->inode; - assert(in); - - // update imports/exports? - if (in->is_dir()) { - assert(in->dir); // i had better already ahve it open.. see MRenamePrep - fix_renamed_dir(srcdir, in, destdir, true, // auth changed - old_dir_auth); // src is possibly new dir auth. - } - - // mark dirty - destdn->_mark_dirty(); // fixme - in->_mark_dirty(); // fixme - - // unpin - in->put(CInode::PIN_RENAMESRC); - - // ok, send notifies. 
- set notify; - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i != m->get_source().num() && // except the source - i != mds->get_nodeid()) // and the dest - notify.insert(i); - } - file_rename_notify(in, srcdir, srcname, destdir, destname, notify, m->get_source().num()); - - delete m; -} - - -void Renamer::file_rename_notify(CInode *in, - CDir *srcdir, string& srcname, CDir *destdir, string& destname, - set& notify, - int srcauth) -{ - /* NOTE: notify list might include myself */ - - // tell - string destdirpath; - destdir->inode->make_path(destdirpath); - - for (set::iterator it = notify.begin(); - it != notify.end(); - it++) { - dout(10) << "file_rename_notify to " << *it << " for " << *in << endl; - mds->send_message_mds(new MRenameNotify(in->ino(), - srcdir->ino(), - srcname, - destdir->ino(), - destdirpath, - destname, - srcauth), - *it, MDS_PORT_CACHE); - } -} - - - -/************** bystanders ****************/ - -void Renamer::handle_rename_warning(MRenameWarning *m) -{ - // add to warning list - stray_rename_warnings.insert( m->get_ino() ); - - // did i already see the notify? - if (stray_rename_notifies.count(m->get_ino())) { - // i did, we're good. - dout(7) << "handle_rename_warning on " << m->get_ino() << ". already got notify." << endl; - - handle_rename_notify(stray_rename_notifies[m->get_ino()]); - stray_rename_notifies.erase(m->get_ino()); - } else { - dout(7) << "handle_rename_warning on " << m->get_ino() << ". waiting for notify." << endl; - } - - // done - delete m; -} - - -void Renamer::handle_rename_notify(MRenameNotify *m) -{ - // FIXME: when we do hard links, i think we need to - // have srcdn and destdn both, or neither, always! - - // did i see the warning yet? - if (!stray_rename_warnings.count(m->get_ino())) { - // wait for it. - dout(7) << "handle_rename_notify on " << m->get_ino() << ", waiting for warning." 
<< endl; - stray_rename_notifies[m->get_ino()] = m; - return; - } - - dout(7) << "handle_rename_notify dir " << m->get_srcdirino() << " dn " << m->get_srcname() << " to dir " << m->get_destdirino() << " dname " << m->get_destname() << endl; - - // src - CInode *srcdiri = cache->get_inode(m->get_srcdirino()); - CDir *srcdir = 0; - if (srcdiri) srcdir = srcdiri->dir; - CDentry *srcdn = 0; - if (srcdir) srcdn = srcdir->lookup(m->get_srcname()); - - // dest - CInode *destdiri = cache->get_inode(m->get_destdirino()); - CDir *destdir = 0; - if (destdiri) destdir = destdiri->dir; - CDentry *destdn = 0; - if (destdir) destdn = destdir->lookup(m->get_destname()); - - // have both? - list finished; - if (srcdn && destdir) { - CInode *in = srcdn->inode; - - int old_dir_auth = -1; - if (in && in->dir) old_dir_auth = in->dir->authority(); - - if (!destdn) { - destdn = destdir->add_dentry(m->get_destname()); // create null dentry - destdn->lockstate = DN_LOCK_XLOCK; // that's xlocked! - } - - dout(7) << "handle_rename_notify renaming " << *srcdn << " to " << *destdn << endl; - - if (in) { - cache->rename_file(srcdn, destdn); - - // update imports/exports? 
- if (in && in->is_dir() && in->dir) { - fix_renamed_dir(srcdir, in, destdir, false, old_dir_auth); // auth didnt change - } - } else { - dout(7) << " i don't have the inode (just null dentries)" << endl; - } - - } - - else if (srcdn) { - dout(7) << "handle_rename_notify no dest, but have src" << endl; - dout(7) << "srcdn is " << *srcdn << endl; - - if (destdiri) { - dout(7) << "have destdiri, opening dir " << *destdiri << endl; - cache->open_remote_dir(destdiri, - new C_MDS_RetryMessage(mds,m)); - } else { - filepath destdirpath = m->get_destdirpath(); - dout(7) << "don't have destdiri even, doing traverse+discover on " << destdirpath << endl; - - vector trace; - int r = cache->path_traverse(destdirpath, trace, true, - m, new C_MDS_RetryMessage(mds, m), - MDS_TRAVERSE_DISCOVER); - assert(r>0); - } - return; - } - - else if (destdn) { - dout(7) << "handle_rename_notify unlinking dst only " << *destdn << endl; - if (destdn->inode) { - destdir->unlink_inode(destdn); - } - } - - else { - dout(7) << "handle_rename_notify didn't have srcdn or destdn" << endl; - assert(srcdn == 0 && destdn == 0); - } - - mds->queue_finished(finished); - - - // ack - dout(10) << "sending RenameNotifyAck back to srcauth " << m->get_srcauth() << endl; - MRenameNotifyAck *ack = new MRenameNotifyAck(m->get_ino()); - mds->send_message_mds(ack, m->get_srcauth(), MDS_PORT_CACHE); - - - stray_rename_warnings.erase( m->get_ino() ); - delete m; -} - - - - diff --git a/branches/marnberg/quota/mds/Renamer.h b/branches/marnberg/quota/mds/Renamer.h deleted file mode 100644 index 1005971df986f..0000000000000 --- a/branches/marnberg/quota/mds/Renamer.h +++ /dev/null @@ -1,98 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by 
the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_RENAMER_H -#define __MDS_RENAMER_H - -#include "include/types.h" - -#include -#include -using std::map; -using std::set; - -class MDS; -class MDCache; -class CDentry; -class CInode; -class CDir; - -class Message; -class MRenameWarning; -class MRenameNotify; -class MRenameNotifyAck; -class MRename; -class MRenamePrep; -class MRenameReq; -class MRenameAck; - -class Renamer { - MDS *mds; - MDCache *cache; - - // rename fun - set stray_rename_warnings; // notifies i haven't seen - map stray_rename_notifies; - - map > rename_waiting_for_ack; - - - - void fix_renamed_dir(CDir *srcdir, - CInode *in, - CDir *destdir, - bool authchanged, // _inode_ auth changed - int dirauth=-1); // dirauth (for certain cases) - - -public: - Renamer(MDS *m, MDCache *c) : mds(m), cache(c) {} - - void dispatch(Message *m); - - // RENAME - // initiator - public: - void file_rename(CDentry *srcdn, CDentry *destdn, Context *c); - protected: - void handle_rename_ack(MRenameAck *m); // dest -> init (almost always) - void file_rename_finish(CDir *srcdir, CInode *in, Context *c); - friend class C_MDC_RenameAck; - - // src - void handle_rename_req(MRenameReq *m); // dest -> src - void file_rename_foreign_src(CDentry *srcdn, - inodeno_t destdirino, string& destname, string& destpath, int destauth, - int initiator); - void file_rename_warn(CInode *in, set& notify); - void handle_rename_notify_ack(MRenameNotifyAck *m); // bystanders -> src - void file_rename_ack(CInode *in, int initiator); - friend class C_MDC_RenameNotifyAck; - - // dest - void handle_rename_prep(MRenamePrep *m); // init -> dest - void handle_rename(MRename *m); // src -> dest - void file_rename_notify(CInode *in, - CDir *srcdir, string& srcname, CDir *destdir, string& destname, - set& notify, int srcauth); - - // bystander - void handle_rename_warning(MRenameWarning *m); // src -> bystanders - void handle_rename_notify(MRenameNotify *m); // dest -> 
bystanders - - -}; - -#endif - - diff --git a/branches/marnberg/quota/mds/Server.cc b/branches/marnberg/quota/mds/Server.cc deleted file mode 100644 index b7178e31a4d5a..0000000000000 --- a/branches/marnberg/quota/mds/Server.cc +++ /dev/null @@ -1,2376 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "MDS.h" -#include "Server.h" -#include "Locker.h" -#include "MDCache.h" -#include "MDLog.h" -#include "Migrator.h" -#include "MDBalancer.h" -#include "Renamer.h" -#include "MDStore.h" - -#include "msg/Messenger.h" - -#include "messages/MClientMount.h" -#include "messages/MClientMountAck.h" -#include "messages/MClientRequest.h" -#include "messages/MClientReply.h" -#include "messages/MHashReaddir.h" -#include "messages/MHashReaddirReply.h" - -#include "messages/MLock.h" - -#include "messages/MInodeLink.h" - -#include "events/EString.h" -#include "events/EUpdate.h" - -#include "include/filepath.h" -#include "common/Timer.h" -#include "common/Logger.h" -#include "common/LogType.h" - -#include -#include - -#include -#include -using namespace std; - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".server " -#define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".server " - - -void Server::dispatch(Message *m) -{ - // active? 
- if (!mds->is_active()) { - dout(3) << "not active yet, waiting" << endl; - mds->queue_waitfor_active(new C_MDS_RetryMessage(mds, m)); - return; - } - - switch (m->get_type()) { - case MSG_CLIENT_MOUNT: - handle_client_mount((MClientMount*)m); - return; - case MSG_CLIENT_UNMOUNT: - handle_client_unmount(m); - return; - } - - - switch (m->get_type()) { - case MSG_CLIENT_REQUEST: - handle_client_request((MClientRequest*)m); - return; - - case MSG_MDS_HASHREADDIR: - handle_hash_readdir((MHashReaddir*)m); - return; - case MSG_MDS_HASHREADDIRREPLY: - handle_hash_readdir_reply((MHashReaddirReply*)m); - return; - - } - - dout(1) << " main unknown message " << m->get_type() << endl; - assert(0); -} - - - - - -void Server::handle_client_mount(MClientMount *m) -{ - int n = m->get_source().num(); - dout(3) << "mount by client" << n << endl; - mds->clientmap.add_mount(n, m->get_source_inst()); - - assert(mds->get_nodeid() == 0); // mds0 mounts/unmounts - - // ack - messenger->send_message(new MClientMountAck(m, mds->mdsmap, mds->osdmap), - m->get_source_inst()); - delete m; -} - -void Server::handle_client_unmount(Message *m) -{ - int n = m->get_source().num(); - dout(3) << "unmount by client" << n << endl; - - assert(mds->get_nodeid() == 0); // mds0 mounts/unmounts - - mds->clientmap.rem_mount(n); - - if (g_conf.mds_shutdown_on_last_unmount && - mds->clientmap.get_mount_set().empty()) { - dout(3) << "all clients done, initiating shutdown" << endl; - mds->shutdown_start(); - } - - // ack by sending back to client - messenger->send_message(m, m->get_source_inst()); -} - - - -/******* - * some generic stuff for finishing off requests - */ - -/** C_MDS_CommitRequest - */ - -class C_MDS_CommitRequest : public Context { - Server *server; - MClientRequest *req; - MClientReply *reply; - CInode *tracei; // inode to include a trace for - LogEvent *event; - -public: - C_MDS_CommitRequest(Server *server, - MClientRequest *req, MClientReply *reply, CInode *tracei, - LogEvent *event=0) { 
- this->server = server; - this->req = req; - this->tracei = tracei; - this->reply = reply; - this->event = event; - } - void finish(int r) { - if (r != 0) { - // failure. set failure code and reply. - reply->set_result(r); - } - if (event) { - server->commit_request(req, reply, tracei, event); - } else { - // reply. - server->reply_request(req, reply, tracei); - } - } -}; - - -/* - * send generic response (just and error code) - */ -void Server::reply_request(MClientRequest *req, int r, CInode *tracei) -{ - reply_request(req, new MClientReply(req, r), tracei); -} - - -/* - * send given reply - * include a trace to tracei - */ -void Server::reply_request(MClientRequest *req, MClientReply *reply, CInode *tracei) { - dout(10) << "reply_request r=" << reply->get_result() << " " << *req << endl; - - // include trace - if (tracei) { - reply->set_trace_dist( tracei, mds->get_nodeid() ); - } - - // send reply - messenger->send_message(reply, - req->get_client_inst()); - - // discard request - mdcache->request_finish(req); - - // stupid stats crap (FIXME) - stat_ops++; -} - - -void Server::submit_update(MClientRequest *req, - CInode *wrlockedi, - LogEvent *event, - Context *oncommit) -{ - // log - mdlog->submit_entry(event); - - // pin - mdcache->request_pin_inode(req, wrlockedi); - - // wait - mdlog->wait_for_sync(oncommit); -} - - -/* - * commit event(s) to the metadata journal, then reply. - * or, be sloppy and do it concurrently (see g_conf.mds_log_before_reply) - * - * NOTE: this is old and bad (write-behind!) - */ -void Server::commit_request(MClientRequest *req, - MClientReply *reply, - CInode *tracei, - LogEvent *event, - LogEvent *event2) -{ - // log - if (event) mdlog->submit_entry(event); - if (event2) mdlog->submit_entry(event2); - - if (g_conf.mds_log_before_reply && g_conf.mds_log && event) { - // SAFE mode! - - // pin inode so it doesn't go away! 
- if (tracei) mdcache->request_pin_inode(req, tracei); - - // wait for log sync - mdlog->wait_for_sync(new C_MDS_CommitRequest(this, req, reply, tracei)); - return; - } - else { - // just reply - reply_request(req, reply, tracei); - } -} - - - -/*** - * process a client request - */ - -void Server::handle_client_request(MClientRequest *req) -{ - dout(4) << "req " << *req << endl; - - // note original client addr - if (req->get_source().is_client()) { - req->set_client_inst( req->get_source_inst() ); - req->clear_payload(); - } - - if (!mds->is_active()) { - dout(5) << " not active, discarding client request." << endl; - delete req; - return; - } - - if (!mdcache->get_root()) { - dout(5) << "need to open root" << endl; - mdcache->open_root(new C_MDS_RetryMessage(mds, req)); - return; - } - - // okay, i want - CInode *ref = 0; - vector trace; // might be blank, for fh guys - - bool follow_trailing_symlink = false; - - // operations on fh's or other non-files - switch (req->get_op()) { - /* - case MDS_OP_FSTAT: - reply = handle_client_fstat(req, cur); - break; ****** fiX ME *** - */ - - case MDS_OP_TRUNCATE: - if (!req->get_ino()) break; // can be called w/ either fh OR path - - case MDS_OP_RELEASE: - case MDS_OP_FSYNC: - ref = mdcache->get_inode(req->get_ino()); // fixme someday no ino needed? - - if (!ref) { - int next = mds->get_nodeid() + 1; - if (next >= mds->mdsmap->get_num_mds()) next = 0; - dout(10) << "got request on ino we don't have, passing buck to " << next << endl; - mds->send_message_mds(req, next, MDS_PORT_SERVER); - return; - } - } - - if (!ref) { - // we need to traverse a path - filepath refpath = req->get_filepath(); - - // ops on non-existing files --> directory paths - switch (req->get_op()) { - case MDS_OP_OPEN: - if (!(req->get_iarg() & O_CREAT)) break; - - case MDS_OP_MKNOD: - case MDS_OP_MKDIR: - case MDS_OP_SYMLINK: - case MDS_OP_LINK: - case MDS_OP_UNLINK: // also wrt parent dir, NOT the unlinked inode!! 
- case MDS_OP_RMDIR: - case MDS_OP_RENAME: - // remove last bit of path - refpath = refpath.prefixpath(refpath.depth()-1); - break; - } - dout(10) << "refpath = " << refpath << endl; - - Context *ondelay = new C_MDS_RetryMessage(mds, req); - - if (req->get_op() == MDS_OP_LSTAT) { - follow_trailing_symlink = false; - } - - // do trace - int r = mdcache->path_traverse(refpath, trace, follow_trailing_symlink, - req, ondelay, - MDS_TRAVERSE_FORWARD, - 0, - true); // is MClientRequest - - if (r > 0) return; // delayed - if (r == -ENOENT || - r == -ENOTDIR || - r == -EISDIR) { - // error! - dout(10) << " path traverse error " << r << ", replying" << endl; - - // send error - messenger->send_message(new MClientReply(req, r), - req->get_client_inst()); - - // - // is this a special debug command? - if (refpath.depth() - 1 == trace.size() && - refpath.last_bit().find(".ceph.") == 0) { - CDir *dir = 0; - if (trace.empty()) - dir = mdcache->get_root()->dir; - else - dir = trace[trace.size()-1]->get_inode()->dir; - - dout(1) << "** POSSIBLE CEPH DEBUG COMMAND '" << refpath.last_bit() << "' in " << *dir << endl; - - if (refpath.last_bit() == ".ceph.hash" && - refpath.depth() > 1) { - dout(1) << "got explicit hash command " << refpath << endl; - CDir *dir = trace[trace.size()-1]->get_inode()->dir; - if (!dir->is_hashed() && - !dir->is_hashing() && - dir->is_auth()) - mdcache->migrator->hash_dir(dir); - } - else if (refpath.last_bit() == ".ceph.commit") { - dout(1) << "got explicit commit command on " << *dir << endl; - mds->mdstore->commit_dir(dir, 0); - } - } - // - - - delete req; - return; - } - - if (trace.size()) - ref = trace[trace.size()-1]->inode; - else - ref = mdcache->get_root(); - } - - dout(10) << "ref is " << *ref << endl; - - // rename doesn't pin src path (initially) - if (req->get_op() == MDS_OP_RENAME) trace.clear(); - - // register - if (!mdcache->request_start(req, ref, trace)) - return; - - // process - dispatch_request(req, ref); -} - - - -void 
Server::dispatch_request(Message *m, CInode *ref) -{ - MClientRequest *req = 0; - - // MLock or MClientRequest? - /* this is a little weird. - client requests and mlocks both initial dentry xlocks, path pins, etc., - and thus both make use of the context C_MDS_RetryRequest. - */ - switch (m->get_type()) { - case MSG_CLIENT_REQUEST: - req = (MClientRequest*)m; - break; // continue below! - - case MSG_MDS_LOCK: - mds->locker->handle_lock_dn((MLock*)m); - return; // done - - default: - assert(0); // shouldn't get here - } - - // MClientRequest. - - switch(req->get_op()) { - - // files - case MDS_OP_OPEN: - if (req->get_iarg() & O_CREAT) - handle_client_openc(req, ref); - else - handle_client_open(req, ref); - break; - case MDS_OP_TRUNCATE: - handle_client_truncate(req, ref); - break; - /* - case MDS_OP_FSYNC: - handle_client_fsync(req, ref); - break; - */ - /* - case MDS_OP_RELEASE: - handle_client_release(req, ref); - break; - */ - - // inodes - case MDS_OP_STAT: - case MDS_OP_LSTAT: - handle_client_stat(req, ref); - break; - case MDS_OP_UTIME: - handle_client_utime(req, ref); - break; - case MDS_OP_CHMOD: - handle_client_chmod(req, ref); - break; - case MDS_OP_CHOWN: - handle_client_chown(req, ref); - break; - - // namespace - case MDS_OP_READDIR: - handle_client_readdir(req, ref); - break; - case MDS_OP_MKNOD: - handle_client_mknod(req, ref); - break; - case MDS_OP_LINK: - handle_client_link(req, ref); - break; - case MDS_OP_UNLINK: - handle_client_unlink(req, ref); - break; - case MDS_OP_RENAME: - handle_client_rename(req, ref); - break; - case MDS_OP_RMDIR: - handle_client_unlink(req, ref); - break; - case MDS_OP_MKDIR: - handle_client_mkdir(req, ref); - break; - case MDS_OP_SYMLINK: - handle_client_symlink(req, ref); - break; - - - - default: - dout(1) << " unknown client op " << req->get_op() << endl; - assert(0); - } - - return; -} - - -// FIXME: this probably should go somewhere else. 
- -bool Server::try_open_dir(CInode *in, MClientRequest *req) -{ - if (!in->dir && in->is_frozen_dir()) { - // doh! - dout(10) << " dir inode is frozen, can't open dir, waiting " << *in << endl; - assert(in->get_parent_dir()); - in->get_parent_dir()->add_waiter(CDIR_WAIT_UNFREEZE, - new C_MDS_RetryRequest(mds, req, in)); - return false; - } - - in->get_or_open_dir(mds->mdcache); - return true; -} - - - - - -// =============================================================================== -// STAT - -void Server::handle_client_stat(MClientRequest *req, - CInode *ref) -{ - // FIXME: this is really not the way to handle the statlite mask. - - // do I need file info? - int mask = req->get_iarg(); - if (mask & (INODE_MASK_SIZE|INODE_MASK_MTIME)) { - // yes. do a full stat. - if (!mds->locker->inode_file_read_start(ref, req)) - return; // syncing - mds->locker->inode_file_read_finish(ref); - } else { - // nope! easy peasy. - } - - mds->balancer->hit_inode(ref, META_POP_IRD); - - // reply - //dout(10) << "reply to " << *req << " stat " << ref->inode.mtime << endl; - MClientReply *reply = new MClientReply(req); - reply_request(req, reply, ref); -} - - - - -// =============================================================================== -// INODE UPDATES - - -/* - * finisher: do a inode_file_write_finish and reply. 
- */ -class C_MDS_utime_finish : public Context { - MDS *mds; - MClientRequest *req; - CInode *in; - version_t pv; - time_t mtime, atime; -public: - C_MDS_utime_finish(MDS *m, MClientRequest *r, CInode *i, version_t pdv, time_t mt, time_t at) : - mds(m), req(r), in(i), - pv(pdv), - mtime(mt), atime(at) { } - void finish(int r) { - assert(r == 0); - - // apply - in->inode.mtime = mtime; - in->inode.atime = atime; - in->mark_dirty(pv); - - // unlock - mds->locker->inode_file_write_finish(in); - - // reply - MClientReply *reply = new MClientReply(req, 0); - reply->set_result(0); - mds->server->reply_request(req, reply, in); - } -}; - - -// utime - -void Server::handle_client_utime(MClientRequest *req, - CInode *cur) -{ - // write - if (!mds->locker->inode_file_write_start(cur, req)) - return; // fw or (wait for) sync - - mds->balancer->hit_inode(cur, META_POP_IWR); - - // prepare - version_t pdv = cur->pre_dirty(); - time_t mtime = req->get_targ(); - time_t atime = req->get_targ2(); - C_MDS_utime_finish *fin = new C_MDS_utime_finish(mds, req, cur, pdv, - mtime, atime); - - // log + wait - EUpdate *le = new EUpdate("utime"); - le->metablob.add_dir_context(cur->get_parent_dir()); - inode_t *pi = le->metablob.add_dentry(cur->parent, true); - pi->mtime = mtime; - pi->atime = mtime; - pi->version = pdv; - - mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); -} - - -// -------------- - -/* - * finisher: do a inode_hard_write_finish and reply. 
- */ -class C_MDS_chmod_finish : public Context { - MDS *mds; - MClientRequest *req; - CInode *in; - version_t pv; - int mode; -public: - C_MDS_chmod_finish(MDS *m, MClientRequest *r, CInode *i, version_t pdv, int mo) : - mds(m), req(r), in(i), pv(pdv), mode(mo) { } - void finish(int r) { - assert(r == 0); - - // apply - in->inode.mode &= ~04777; - in->inode.mode |= (mode & 04777); - in->mark_dirty(pv); - - // unlock - mds->locker->inode_hard_write_finish(in); - - // reply - MClientReply *reply = new MClientReply(req, 0); - reply->set_result(0); - mds->server->reply_request(req, reply, in); - } -}; - - -// chmod - -void Server::handle_client_chmod(MClientRequest *req, - CInode *cur) -{ - // write - if (!mds->locker->inode_hard_write_start(cur, req)) - return; // fw or (wait for) lock - - mds->balancer->hit_inode(cur, META_POP_IWR); - - // prepare - version_t pdv = cur->pre_dirty(); - int mode = req->get_iarg(); - C_MDS_chmod_finish *fin = new C_MDS_chmod_finish(mds, req, cur, pdv, - mode); - - // log + wait - EUpdate *le = new EUpdate("chmod"); - le->metablob.add_dir_context(cur->get_parent_dir()); - inode_t *pi = le->metablob.add_dentry(cur->parent, true); - pi->mode = mode; - pi->version = pdv; - - mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); -} - - -// chown - -class C_MDS_chown_finish : public Context { - MDS *mds; - MClientRequest *req; - CInode *in; - version_t pv; - int uid, gid; -public: - C_MDS_chown_finish(MDS *m, MClientRequest *r, CInode *i, version_t pdv, int u, int g) : - mds(m), req(r), in(i), pv(pdv), uid(u), gid(g) { } - void finish(int r) { - assert(r == 0); - - // apply - if (uid >= 0) in->inode.uid = uid; - if (gid >= 0) in->inode.gid = gid; - in->mark_dirty(pv); - - // unlock - mds->locker->inode_hard_write_finish(in); - - // reply - MClientReply *reply = new MClientReply(req, 0); - reply->set_result(0); - mds->server->reply_request(req, reply, in); - } -}; - - -void Server::handle_client_chown(MClientRequest *req, - CInode *cur) -{ - 
// write - if (!mds->locker->inode_hard_write_start(cur, req)) - return; // fw or (wait for) lock - - mds->balancer->hit_inode(cur, META_POP_IWR); - - // prepare - version_t pdv = cur->pre_dirty(); - int uid = req->get_iarg(); - int gid = req->get_iarg2(); - C_MDS_chown_finish *fin = new C_MDS_chown_finish(mds, req, cur, pdv, - uid, gid); - - // log + wait - EUpdate *le = new EUpdate("chown"); - le->metablob.add_dir_context(cur->get_parent_dir()); - inode_t *pi = le->metablob.add_dentry(cur->parent, true); - if (uid >= 0) pi->uid = uid; - if (gid >= 0) pi->gid = gid; - pi->version = pdv; - - mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); -} - - - - - - - -// ================================================================= -// DIRECTORY and NAMESPACE OPS - -// READDIR - -int Server::encode_dir_contents(CDir *dir, - list& inls, - list& dnls) -{ - int numfiles = 0; - - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - - // hashed? - if (dir->is_hashed() && - mds->get_nodeid() != mds->mdcache->hash_dentry( dir->ino(), it->first )) - continue; - - if (dn->is_null()) continue; - - CInode *in = dn->inode; - if (!in) - continue; // hmm, fixme!, what about REMOTE links? - - dout(12) << "including inode " << *in << endl; - - // add this item - // note: InodeStat makes note of whether inode data is readable. - dnls.push_back( it->first ); - inls.push_back( new InodeStat(in, mds->get_nodeid()) ); - numfiles++; - } - return numfiles; -} - - -/* - * note: this is pretty sloppy, but should work just fine i think... - */ -void Server::handle_hash_readdir(MHashReaddir *m) -{ - CInode *cur = mdcache->get_inode(m->get_ino()); - assert(cur); - - if (!cur->dir || - !cur->dir->is_hashed()) { - assert(0); - dout(7) << "handle_hash_readdir don't have dir open, or not hashed. giving up!" << endl; - delete m; - return; - } - CDir *dir = cur->dir; - assert(dir); - assert(dir->is_hashed()); - - // complete? 
- if (!dir->is_complete()) { - dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << endl; - mds->mdstore->fetch_dir(dir, new C_MDS_RetryMessage(mds, m)); - return; - } - - // get content - list inls; - list dnls; - int num = encode_dir_contents(dir, inls, dnls); - - // sent it back! - messenger->send_message(new MHashReaddirReply(dir->ino(), inls, dnls, num), - m->get_source_inst(), MDS_PORT_CACHE); -} - - -void Server::handle_hash_readdir_reply(MHashReaddirReply *m) -{ - CInode *cur = mdcache->get_inode(m->get_ino()); - assert(cur); - - if (!cur->dir || - !cur->dir->is_hashed()) { - assert(0); - dout(7) << "handle_hash_readdir don't have dir open, or not hashed. giving up!" << endl; - delete m; - return; - } - CDir *dir = cur->dir; - assert(dir); - assert(dir->is_hashed()); - - // move items to hashed_readdir gather - int from = m->get_source().num(); - assert(dir->hashed_readdir.count(from) == 0); - dir->hashed_readdir[from].first.splice(dir->hashed_readdir[from].first.begin(), - m->get_in()); - dir->hashed_readdir[from].second.splice(dir->hashed_readdir[from].second.begin(), - m->get_dn()); - delete m; - - // gather finished? - if (dir->hashed_readdir.size() < (unsigned)mds->mdsmap->get_num_mds()) { - dout(7) << "still waiting for more hashed readdir bits" << endl; - return; - } - - dout(7) << "got last bit! finishing waiters" << endl; - - // do these finishers. they'll copy the results. - list finished; - dir->take_waiting(CDIR_WAIT_THISHASHEDREADDIR, finished); - finish_contexts(finished); - - // now discard these results - for (map, list > >::iterator it = dir->hashed_readdir.begin(); - it != dir->hashed_readdir.end(); - it++) { - for (list::iterator ci = it->second.first.begin(); - ci != it->second.first.end(); - ci++) - delete *ci; - } - dir->hashed_readdir.clear(); - - // unpin dir (we're done!) 
- dir->auth_unpin(); - - // trigger any waiters for next hashed readdir cycle - dir->take_waiting(CDIR_WAIT_NEXTHASHEDREADDIR, mds->finished_queue); -} - - -class C_MDS_HashReaddir : public Context { - Server *server; - MClientRequest *req; - CDir *dir; -public: - C_MDS_HashReaddir(Server *server, MClientRequest *req, CDir *dir) { - this->server = server; - this->req = req; - this->dir = dir; - } - void finish(int r) { - server->finish_hash_readdir(req, dir); - } -}; - -void Server::finish_hash_readdir(MClientRequest *req, CDir *dir) -{ - dout(7) << "finish_hash_readdir on " << *dir << endl; - - assert(dir->is_hashed()); - assert(dir->hashed_readdir.size() == (unsigned)mds->mdsmap->get_num_mds()); - - // reply! - MClientReply *reply = new MClientReply(req); - reply->set_result(0); - - for (int i=0; imdsmap->get_num_mds(); i++) { - reply->copy_dir_items(dir->hashed_readdir[i].first, - dir->hashed_readdir[i].second); - } - - // ok! - reply_request(req, reply, dir->inode); -} - - -void Server::handle_client_readdir(MClientRequest *req, - CInode *cur) -{ - // it's a directory, right? - if (!cur->is_dir()) { - // not a dir - dout(10) << "reply to " << *req << " readdir -ENOTDIR" << endl; - reply_request(req, -ENOTDIR); - return; - } - - // auth? - if (!cur->dir_is_auth()) { - int dirauth = cur->authority(); - if (cur->dir) - dirauth = cur->dir->authority(); - assert(dirauth >= 0); - assert(dirauth != mds->get_nodeid()); - - // forward to authority - dout(10) << " forwarding readdir to authority " << dirauth << endl; - mdcache->request_forward(req, dirauth); - return; - } - - if (!try_open_dir(cur, req)) - return; - assert(cur->dir->is_auth()); - - // unhashing? wait! 
- if (cur->dir->is_hashed() && - cur->dir->is_unhashing()) { - dout(10) << "unhashing, waiting" << endl; - cur->dir->add_waiter(CDIR_WAIT_UNFREEZE, - new C_MDS_RetryRequest(mds, req, cur)); - return; - } - - // check perm - if (!mds->locker->inode_hard_read_start(cur,req)) - return; - mds->locker->inode_hard_read_finish(cur); - - CDir *dir = cur->dir; - assert(dir); - - if (!dir->is_complete()) { - // fetch - dout(10) << " incomplete dir contents for readdir on " << *cur->dir << ", fetching" << endl; - mds->mdstore->fetch_dir(dir, new C_MDS_RetryRequest(mds, req, cur)); - return; - } - - if (dir->is_hashed()) { - // HASHED - dout(7) << "hashed dir" << endl; - if (!dir->can_auth_pin()) { - dout(7) << "can't auth_pin dir " << *dir << " waiting" << endl; - dir->add_waiter(CDIR_WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, req, cur)); - return; - } - - if (!dir->hashed_readdir.empty()) { - dout(7) << "another readdir gather in progres, waiting" << endl; - dir->add_waiter(CDIR_WAIT_NEXTHASHEDREADDIR, new C_MDS_RetryRequest(mds, req, cur)); - return; - } - - // start new readdir gather - dout(7) << "staring new hashed readdir gather" << endl; - - // pin auth for process! - dir->auth_pin(); - - // get local bits - encode_dir_contents(cur->dir, - dir->hashed_readdir[mds->get_nodeid()].first, - dir->hashed_readdir[mds->get_nodeid()].second); - - // request other bits - for (int i=0; imdsmap->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - mds->send_message_mds(new MHashReaddir(dir->ino()), i, MDS_PORT_SERVER); - } - - // wait - dir->add_waiter(CDIR_WAIT_THISHASHEDREADDIR, - new C_MDS_HashReaddir(this, req, dir)); - } else { - // NON-HASHED - // build dir contents - list inls; - list dnls; - int numfiles = encode_dir_contents(cur->dir, inls, dnls); - - // . 
too - dnls.push_back("."); - inls.push_back(new InodeStat(cur, mds->get_nodeid())); - ++numfiles; - - // yay, reply - MClientReply *reply = new MClientReply(req); - reply->take_dir_items(inls, dnls, numfiles); - - dout(10) << "reply to " << *req << " readdir " << numfiles << " files" << endl; - reply->set_result(0); - - //balancer->hit_dir(cur->dir); - - // reply - reply_request(req, reply, cur); - } -} - - - -// ------------------------------------------------ - -// MKNOD - -class C_MDS_mknod_finish : public Context { - MDS *mds; - MClientRequest *req; - CDentry *dn; - CInode *newi; - version_t pv; -public: - C_MDS_mknod_finish(MDS *m, MClientRequest *r, CDentry *d, CInode *ni) : - mds(m), req(r), dn(d), newi(ni), - pv(d->get_projected_version()) {} - void finish(int r) { - assert(r == 0); - - // link the inode - dn->get_dir()->link_inode(dn, newi); - - // dirty inode, dn, dir - newi->mark_dirty(pv); - - // unlock - mds->locker->dentry_xlock_finish(dn); - - // hit pop - mds->balancer->hit_inode(newi, META_POP_IWR); - - // reply - MClientReply *reply = new MClientReply(req, 0); - reply->set_result(0); - mds->server->reply_request(req, reply, newi); - } -}; - -void Server::handle_client_mknod(MClientRequest *req, CInode *diri) -{ - CInode *newi = 0; - CDentry *dn = 0; - - // make dentry and inode, xlock dentry. - if (!prepare_mknod(req, diri, &newi, &dn)) - return; - assert(newi); - assert(dn); - - // it's a file. - newi->inode.mode = req->get_iarg(); - newi->inode.mode &= ~INODE_TYPE_MASK; - newi->inode.mode |= INODE_MODE_FILE; - - // prepare finisher - C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, req, dn, newi); - EUpdate *le = new EUpdate("mknod"); - le->metablob.add_dir_context(diri->dir); - inode_t *pi = le->metablob.add_dentry(dn, true, newi); - pi->version = dn->get_projected_version(); - - // log + wait - mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); -} - - - -/* - * verify that the dir exists and would own the dname. 
- * do not check if the dentry exists. - */ -CDir *Server::validate_new_dentry_dir(MClientRequest *req, CInode *diri, string& name) -{ - // make sure parent is a dir? - if (!diri->is_dir()) { - dout(7) << "validate_new_dentry_dir: not a dir" << endl; - reply_request(req, -ENOTDIR); - return false; - } - - // am i not open, not auth? - if (!diri->dir && !diri->is_auth()) { - int dirauth = diri->authority(); - dout(7) << "validate_new_dentry_dir: don't know dir auth, not open, auth is i think mds" << dirauth << endl; - mdcache->request_forward(req, dirauth); - return false; - } - - if (!try_open_dir(diri, req)) - return false; - CDir *dir = diri->dir; - - // make sure it's my dentry - int dnauth = dir->dentry_authority(name); - if (dnauth != mds->get_nodeid()) { - // fw - dout(7) << "mknod on " << req->get_path() << ", dentry " << *dir - << " dn " << name - << " not mine, fw to " << dnauth << endl; - mdcache->request_forward(req, dnauth); - return false; - } - - // dir auth pinnable? - if (!dir->can_auth_pin()) { - dout(7) << "validate_new_dentry_dir: dir " << *dir << " not pinnable, waiting" << endl; - dir->add_waiter(CDIR_WAIT_AUTHPINNABLE, - new C_MDS_RetryRequest(mds, req, diri)); - return false; - } - - // frozen? - if (dir->is_frozen()) { - dout(7) << "dir is frozen " << *dir << endl; - dir->add_waiter(CDIR_WAIT_UNFREEZE, - new C_MDS_RetryRequest(mds, req, diri)); - return false; - } - - return dir; -} - -/* - * prepare a mknod-type operation (mknod, mkdir, symlink, open+create). - * create the inode and dentry, but do not link them. - * pre_dirty the dentry+dir. - * xlock the dentry. 
- * - * return val - * 0 - wait for something - * 1 - created - * 2 - already exists (only if okexist=true) - */ -int Server::prepare_mknod(MClientRequest *req, CInode *diri, - CInode **pin, CDentry **pdn, - bool okexist) -{ - dout(10) << "prepare_mknod " << req->get_filepath() << " in " << *diri << endl; - - // get containing directory (without last bit) - filepath dirpath = req->get_filepath().prefixpath(req->get_filepath().depth() - 1); - string name = req->get_filepath().last_bit(); - - CDir *dir = validate_new_dentry_dir(req, diri, name); - if (!dir) return 0; - - // make sure name doesn't already exist - *pdn = dir->lookup(name); - if (*pdn) { - if (!(*pdn)->can_read(req)) { - dout(10) << "waiting on (existing!) dentry " << **pdn << endl; - dir->add_waiter(CDIR_WAIT_DNREAD, name, new C_MDS_RetryRequest(mds, req, diri)); - return 0; - } - - if (!(*pdn)->is_null()) { - // name already exists - if (okexist) { - dout(10) << "dentry " << name << " exists in " << *dir << endl; - *pin = (*pdn)->inode; - return 2; - } else { - dout(10) << "dentry " << name << " exists in " << *dir << endl; - reply_request(req, -EEXIST); - return 0; - } - } - } - - // make sure dir is complete - if (!dir->is_complete()) { - dout(7) << " incomplete dir contents for " << *dir << ", fetching" << endl; - mds->mdstore->fetch_dir(dir, new C_MDS_RetryRequest(mds, req, diri)); - return 0; - } - - // make sure dir is pinnable - - - // create inode - *pin = mdcache->create_inode(); - (*pin)->inode.uid = req->get_caller_uid(); - (*pin)->inode.gid = req->get_caller_gid(); - (*pin)->inode.ctime = (*pin)->inode.mtime = (*pin)->inode.atime = g_clock.gettime(); // now - // note: inode.version will get set by finisher's mark_dirty. 
- - // create dentry - if (!*pdn) - *pdn = dir->add_dentry(name, 0); - - (*pdn)->pre_dirty(); - - // xlock dentry - bool res = mds->locker->dentry_xlock_start(*pdn, req, diri); - assert(res == true); - - // bump modify pop - mds->balancer->hit_dir(dir, META_POP_DWR); - - return 1; -} - - - - - -// MKDIR - -void Server::handle_client_mkdir(MClientRequest *req, CInode *diri) -{ - CInode *newi = 0; - CDentry *dn = 0; - - // make dentry and inode, xlock dentry. - if (!prepare_mknod(req, diri, &newi, &dn)) - return; - assert(newi); - assert(dn); - - // it's a directory. - newi->inode.mode = req->get_iarg(); - newi->inode.mode &= ~INODE_TYPE_MASK; - newi->inode.mode |= INODE_MODE_DIR; - newi->inode.layout = g_OSD_MDDirLayout; - - // ...and that new dir is empty. - CDir *newdir = newi->get_or_open_dir(mds->mdcache); - newdir->mark_complete(); - newdir->mark_dirty(newdir->pre_dirty()); - - // prepare finisher - C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, req, dn, newi); - EUpdate *le = new EUpdate("mkdir"); - le->metablob.add_dir_context(diri->dir); - inode_t *pi = le->metablob.add_dentry(dn, true, newi); - pi->version = dn->get_projected_version(); - le->metablob.add_dir(newi->dir, true); - - // log + wait - mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); - - - /* old export heuristic. pbly need to reimplement this at some point. - if ( - diri->dir->is_auth() && - diri->dir->is_rep() && - newdir->is_auth() && - !newdir->is_hashing()) { - int dest = rand() % mds->mdsmap->get_num_mds(); - if (dest != whoami) { - dout(10) << "exporting new dir " << *newdir << " in replicated parent " << *diri->dir << endl; - mdcache->migrator->export_dir(newdir, dest); - } - } - */ -} - - - -// SYMLINK - -void Server::handle_client_symlink(MClientRequest *req, CInode *diri) -{ - CInode *newi = 0; - CDentry *dn = 0; - - // make dentry and inode, xlock dentry. 
- if (!prepare_mknod(req, diri, &newi, &dn)) - return; - assert(newi); - assert(dn); - - // it's a symlink - newi->inode.mode &= ~INODE_TYPE_MASK; - newi->inode.mode |= INODE_MODE_SYMLINK; - newi->symlink = req->get_sarg(); - - // prepare finisher - C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, req, dn, newi); - EUpdate *le = new EUpdate("symlink"); - le->metablob.add_dir_context(diri->dir); - inode_t *pi = le->metablob.add_dentry(dn, true, newi); - pi->version = dn->get_projected_version(); - - // log + wait - mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); -} - - - - - -// LINK - -class C_MDS_LinkTraverse : public Context { - Server *server; - MClientRequest *req; - CInode *ref; -public: - vector trace; - C_MDS_LinkTraverse(Server *server, MClientRequest *req, CInode *ref) { - this->server = server; - this->req = req; - this->ref = ref; - } - void finish(int r) { - server->handle_client_link_2(r, req, ref, trace); - } -}; - -void Server::handle_client_link(MClientRequest *req, CInode *ref) -{ - // figure out name - string dname = req->get_filepath().last_bit(); - dout(7) << "handle_client_link dname is " << dname << endl; - - // validate dir - CDir *dir = validate_new_dentry_dir(req, ref, dname); - if (!dir) return; - - // dentry exists? 
- CDentry *dn = dir->lookup(dname); - if (dn && (!dn->is_null() || dn->is_xlockedbyother(req))) { - dout(7) << "handle_client_link dn exists " << *dn << endl; - reply_request(req, -EEXIST); - return; - } - - // xlock dentry - if (!dn->is_xlockedbyme(req)) { - if (!mds->locker->dentry_xlock_start(dn, req, ref)) - return; - } - - // discover link target - filepath target = req->get_sarg(); - dout(7) << "handle_client_link discovering target " << target << endl; - C_MDS_LinkTraverse *onfinish = new C_MDS_LinkTraverse(this, req, ref); - Context *ondelay = new C_MDS_RetryRequest(mds, req, ref); - - mdcache->path_traverse(target, onfinish->trace, false, - req, ondelay, - MDS_TRAVERSE_DISCOVER, //XLOCK, - onfinish); -} - - -class C_MDS_RemoteLink : public Context { - Server *server; - MClientRequest *req; - CInode *ref; - CDentry *dn; - CInode *targeti; -public: - C_MDS_RemoteLink(Server *server, MClientRequest *req, CInode *ref, CDentry *dn, CInode *targeti) { - this->server = server; - this->req = req; - this->ref = ref; - this->dn = dn; - this->targeti = targeti; - } - void finish(int r) { - if (r > 0) { // success - // yay - server->handle_client_link_finish(req, ref, dn, targeti); - } - else if (r == 0) { - // huh? retry! - assert(0); - server->dispatch_request(req, ref); - } else { - // link failed - server->reply_request(req, r); - } - } -}; - -void Server::handle_client_link_2(int r, MClientRequest *req, CInode *diri, vector& trace) -{ - // target dne? - if (r < 0) { - dout(7) << "target " << req->get_sarg() << " dne" << endl; - reply_request(req, r); - return; - } - assert(r == 0); - - CInode *targeti = mdcache->get_root(); - if (trace.size()) targeti = trace[trace.size()-1]->inode; - assert(targeti); - - // dir? - dout(7) << "target is " << *targeti << endl; - if (targeti->is_dir()) { - dout(7) << "target is a dir, failing" << endl; - reply_request(req, -EINVAL); - return; - } - - // what was the new dentry again? 
- CDir *dir = diri->dir; - assert(dir); - string dname = req->get_filepath().last_bit(); - CDentry *dn = dir->lookup(dname); - assert(dn); - assert(dn->is_xlockedbyme(req)); - - - // ok! - if (targeti->is_auth()) { - // mine - - // same dir? - if (targeti->get_parent_dir() == dn->get_dir()) { - dout(7) << "target is in the same dir, sweet" << endl; - } - else if (targeti->is_anchored()) { - dout(7) << "target anchored already (nlink=" << targeti->inode.nlink << "), sweet" << endl; - } else { - assert(targeti->inode.nlink == 1); - dout(7) << "target needs anchor, nlink=" << targeti->inode.nlink << ", creating anchor" << endl; - - mdcache->anchor_inode(targeti, - new C_MDS_RetryRequest(mds, req, diri)); - return; - } - - // ok, inc link! - targeti->inode.nlink++; - dout(7) << "nlink++, now " << targeti->inode.nlink << " on " << *targeti << endl; - targeti->_mark_dirty(); // fixme - - } else { - // remote: send nlink++ request, wait - dout(7) << "target is remote, sending InodeLink" << endl; - mds->send_message_mds(new MInodeLink(targeti->ino(), mds->get_nodeid()), targeti->authority(), MDS_PORT_CACHE); - - // wait - targeti->add_waiter(CINODE_WAIT_LINK, - new C_MDS_RemoteLink(this, req, diri, dn, targeti)); - return; - } - - handle_client_link_finish(req, diri, dn, targeti); -} - -void Server::handle_client_link_finish(MClientRequest *req, CInode *ref, - CDentry *dn, CInode *targeti) -{ - // create remote link - dn->dir->link_inode(dn, targeti->ino()); - dn->link_remote( targeti ); // since we have it - dn->_mark_dirty(); // fixme - - mds->balancer->hit_dir(dn->dir, META_POP_DWR); - - // done! 
- commit_request(req, new MClientReply(req, 0), ref, - 0); // FIXME i should log something -} - - -// UNLINK - -void Server::handle_client_unlink(MClientRequest *req, - CInode *diri) -{ - // rmdir or unlink - bool rmdir = false; - if (req->get_op() == MDS_OP_RMDIR) rmdir = true; - - // find it - if (req->get_filepath().depth() == 0) { - dout(7) << "can't rmdir root" << endl; - reply_request(req, -EINVAL); - return; - } - string name = req->get_filepath().last_bit(); - - // make sure parent is a dir? - if (!diri->is_dir()) { - dout(7) << "not a dir" << endl; - reply_request(req, -ENOTDIR); - return; - } - - // am i not open, not auth? - if (!diri->dir && !diri->is_auth()) { - int dirauth = diri->authority(); - dout(7) << "don't know dir auth, not open, auth is i think " << dirauth << endl; - mdcache->request_forward(req, dirauth); - return; - } - - if (!try_open_dir(diri, req)) return; - CDir *dir = diri->dir; - int dnauth = dir->dentry_authority(name); - - // does it exist? - CDentry *dn = dir->lookup(name); - if (!dn) { - if (dnauth == mds->get_nodeid()) { - dout(7) << "handle_client_rmdir/unlink dne " << name << " in " << *dir << endl; - reply_request(req, -ENOENT); - } else { - // send to authority! - dout(7) << "handle_client_rmdir/unlink fw, don't have " << name << " in " << *dir << endl; - mdcache->request_forward(req, dnauth); - } - return; - } - - // have it. locked? - if (!dn->can_read(req)) { - dout(10) << " waiting on " << *dn << endl; - dir->add_waiter(CDIR_WAIT_DNREAD, - name, - new C_MDS_RetryRequest(mds, req, diri)); - return; - } - - // null? - if (dn->is_null()) { - dout(10) << "unlink on null dn " << *dn << endl; - reply_request(req, -ENOENT); - return; - } - - // ok! - CInode *in = dn->inode; - assert(in); - if (rmdir) { - dout(7) << "handle_client_rmdir on dir " << *in << endl; - } else { - dout(7) << "handle_client_unlink on non-dir " << *in << endl; - } - - // dir stuff - if (in->is_dir()) { - if (rmdir) { - // rmdir - - // open dir? 
- if (in->is_auth() && !in->dir) { - if (!try_open_dir(in, req)) return; - } - - // not dir auth? (or not open, which implies the same!) - if (!in->dir) { - dout(7) << "handle_client_rmdir dir not open for " << *in << ", sending to dn auth " << dnauth << endl; - mdcache->request_forward(req, dnauth); - return; - } - if (!in->dir->is_auth()) { - int dirauth = in->dir->authority(); - dout(7) << "handle_client_rmdir not auth for dir " << *in->dir << ", sending to dir auth " << dnauth << endl; - mdcache->request_forward(req, dirauth); - return; - } - - assert(in->dir); - assert(in->dir->is_auth()); - - // dir size check on dir auth (but not necessarily dentry auth)? - - // should be empty - if (in->dir->get_size() == 0 && !in->dir->is_complete()) { - dout(7) << "handle_client_rmdir on dir " << *in->dir << ", empty but not complete, fetching" << endl; - mds->mdstore->fetch_dir(in->dir, - new C_MDS_RetryRequest(mds, req, diri)); - return; - } - if (in->dir->get_size() > 0) { - dout(7) << "handle_client_rmdir on dir " << *in->dir << ", not empty" << endl; - reply_request(req, -ENOTEMPTY); - return; - } - - dout(7) << "handle_client_rmdir dir is empty!" << endl; - - // export sanity check - if (!in->is_auth()) { - // i should be exporting this now/soon, since the dir is empty. - dout(7) << "handle_client_rmdir dir is auth, but not inode." 
<< endl; - if (!in->dir->is_freezing() && in->dir->is_frozen()) { - assert(in->dir->is_import()); - mdcache->migrator->export_empty_import(in->dir); - } else { - dout(7) << "apparently already exporting" << endl; - } - in->dir->add_waiter(CDIR_WAIT_UNFREEZE, - new C_MDS_RetryRequest(mds, req, diri)); - return; - } - - } else { - // unlink - dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << endl; - reply_request(req, -EISDIR); - return; - } - } else { - if (rmdir) { - // unlink - dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << endl; - reply_request(req, -ENOTDIR); - return; - } - } - - // am i dentry auth? - if (dnauth != mds->get_nodeid()) { - // not auth; forward! - dout(7) << "handle_client_unlink not auth for " << *dir << " dn " << dn->name << ", fwd to " << dnauth << endl; - mdcache->request_forward(req, dnauth); - return; - } - - dout(7) << "handle_client_unlink/rmdir on " << *in << endl; - - // xlock dentry - if (!mds->locker->dentry_xlock_start(dn, req, diri)) - return; - - // is this a remote link? - if (dn->is_remote() && !dn->inode) { - CInode *in = mdcache->get_inode(dn->get_remote_ino()); - if (in) { - dn->link_remote(in); - } else { - // open inode - dout(7) << "opening target inode first, ino is " << dn->get_remote_ino() << endl; - mdcache->open_remote_ino(dn->get_remote_ino(), req, - new C_MDS_RetryRequest(mds, req, diri)); - return; - } - } - - - mds->balancer->hit_dir(dn->dir, META_POP_DWR); - - // it's locked, unlink! 
- MClientReply *reply = new MClientReply(req,0); - mdcache->dentry_unlink(dn, - new C_MDS_CommitRequest(this, req, reply, diri, - new EString("unlink fixme"))); - return; -} - - - - - - -// RENAME - -class C_MDS_RenameTraverseDst : public Context { - Server *server; - MClientRequest *req; - CInode *ref; - CInode *srcdiri; - CDir *srcdir; - CDentry *srcdn; - filepath destpath; -public: - vector trace; - - C_MDS_RenameTraverseDst(Server *server, - MClientRequest *req, - CInode *ref, - CInode *srcdiri, - CDir *srcdir, - CDentry *srcdn, - filepath& destpath) { - this->server = server; - this->req = req; - this->ref = ref; - this->srcdiri = srcdiri; - this->srcdir = srcdir; - this->srcdn = srcdn; - this->destpath = destpath; - } - void finish(int r) { - server->handle_client_rename_2(req, ref, - srcdiri, srcdir, srcdn, destpath, - trace, r); - } -}; - - -/* - - weirdness iwith rename: - - ref inode is what was originally srcdiri, but that may change by the tiem - the rename actually happens. for all practical purpose, ref is useless except - for C_MDS_RetryRequest - - */ -void Server::handle_client_rename(MClientRequest *req, - CInode *ref) -{ - dout(7) << "handle_client_rename on " << *req << endl; - - // sanity checks - if (req->get_filepath().depth() == 0) { - dout(7) << "can't rename root" << endl; - reply_request(req, -EINVAL); - return; - } - // mv a/b a/b/c -- meaningless - if (req->get_sarg().compare( 0, req->get_path().length(), req->get_path()) == 0 && - req->get_sarg().c_str()[ req->get_path().length() ] == '/') { - dout(7) << "can't rename to underneath myself" << endl; - reply_request(req, -EINVAL); - return; - } - - // mv blah blah -- also meaningless - if (req->get_sarg() == req->get_path()) { - dout(7) << "can't rename something to itself (or into itself)" << endl; - reply_request(req, -EINVAL); - return; - } - - // traverse to source - /* - this is abnoraml, just for rename. 
since we don't pin source path - (because we don't want to screw up the lock ordering) the ref inode - (normally/initially srcdiri) may move, and this may fail. - -> so, re-traverse path. and make sure we request_finish in the case of a forward! - */ - filepath refpath = req->get_filepath(); - string srcname = refpath.last_bit(); - refpath = refpath.prefixpath(refpath.depth()-1); - - dout(7) << "handle_client_rename src traversing to srcdir " << refpath << endl; - vector trace; - int r = mdcache->path_traverse(refpath, trace, true, - req, new C_MDS_RetryRequest(mds, req, ref), - MDS_TRAVERSE_FORWARD); - if (r == 2) { - dout(7) << "path traverse forwarded, ending request, doing manual request_cleanup" << endl; - dout(7) << "(pseudo) request_forward to 9999 req " << *req << endl; - mdcache->request_cleanup(req); // not _finish (deletes) or _forward (path_traverse did that) - return; - } - if (r > 0) return; - if (r < 0) { // dne or something. got renamed out from under us, probably! - dout(7) << "traverse r=" << r << endl; - reply_request(req, r); - return; - } - - CInode *srcdiri; - if (trace.size()) - srcdiri = trace[trace.size()-1]->inode; - else - srcdiri = mdcache->get_root(); - - dout(7) << "handle_client_rename srcdiri is " << *srcdiri << endl; - - dout(7) << "handle_client_rename srcname is " << srcname << endl; - - // make sure parent is a dir? - if (!srcdiri->is_dir()) { - dout(7) << "srcdiri not a dir " << *srcdiri << endl; - reply_request(req, -EINVAL); - return; - } - - // am i not open, not auth? 
- if (!srcdiri->dir && !srcdiri->is_auth()) { - int dirauth = srcdiri->authority(); - dout(7) << "don't know dir auth, not open, srcdir auth is probably " << dirauth << endl; - mdcache->request_forward(req, dirauth); - return; - } - - if (!try_open_dir(srcdiri, req)) return; - CDir *srcdir = srcdiri->dir; - dout(7) << "handle_client_rename srcdir is " << *srcdir << endl; - - // make sure it's my dentry - int srcauth = srcdir->dentry_authority(srcname); - if (srcauth != mds->get_nodeid()) { - // fw - dout(7) << "rename on " << req->get_path() << ", dentry " << *srcdir << " dn " << srcname << " not mine, fw to " << srcauth << endl; - mdcache->request_forward(req, srcauth); - return; - } - // ok, done passing buck. - - // src dentry - CDentry *srcdn = srcdir->lookup(srcname); - - // xlocked? - if (srcdn && !srcdn->can_read(req)) { - dout(10) << " waiting on " << *srcdn << endl; - srcdir->add_waiter(CDIR_WAIT_DNREAD, - srcname, - new C_MDS_RetryRequest(mds, req, srcdiri)); - return; - } - - if ((srcdn && !srcdn->inode) || - (!srcdn && srcdir->is_complete())) { - dout(10) << "handle_client_rename src dne " << endl; - reply_request(req, -EEXIST); - return; - } - - if (!srcdn && !srcdir->is_complete()) { - dout(10) << "readding incomplete dir" << endl; - mds->mdstore->fetch_dir(srcdir, - new C_MDS_RetryRequest(mds, req, srcdiri)); - return; - } - assert(srcdn && srcdn->inode); - - - dout(10) << "handle_client_rename srcdn is " << *srcdn << endl; - dout(10) << "handle_client_rename srci is " << *srcdn->inode << endl; - - // pin src in cache (so it won't expire) - mdcache->request_pin_inode(req, srcdn->inode); - - // find the destination, normalize - // discover, etc. on the way... just get it on the local node. 
- filepath destpath = req->get_sarg(); - - C_MDS_RenameTraverseDst *onfinish = new C_MDS_RenameTraverseDst(this, req, ref, srcdiri, srcdir, srcdn, destpath); - Context *ondelay = new C_MDS_RetryRequest(mds, req, ref); - - /* - * use DISCOVERXLOCK mode: - * the dest may not exist, and may be xlocked from a remote host - * we want to succeed if we find the xlocked dentry - * ?? - */ - mdcache->path_traverse(destpath, onfinish->trace, false, - req, ondelay, - MDS_TRAVERSE_DISCOVER, //XLOCK, - onfinish); -} - -void Server::handle_client_rename_2(MClientRequest *req, - CInode *ref, - CInode *srcdiri, - CDir *srcdir, - CDentry *srcdn, - filepath& destpath, - vector& trace, - int r) -{ - dout(7) << "handle_client_rename_2 on " << *req << endl; - dout(12) << " r = " << r << " trace depth " << trace.size() << " destpath depth " << destpath.depth() << endl; - - CInode *srci = srcdn->inode; - assert(srci); - CDir* destdir = 0; - string destname; - - // what is the dest? (dir or file or complete filename) - // note: trace includes root, destpath doesn't (include leading /) - if (trace.size() && trace[trace.size()-1]->inode == 0) { - dout(10) << "dropping null dentry from tail of trace" << endl; - trace.pop_back(); // drop it! - } - - CInode *d; - if (trace.size()) - d = trace[trace.size()-1]->inode; - else - d = mdcache->get_root(); - assert(d); - dout(10) << "handle_client_rename_2 traced to " << *d << ", trace size = " << trace.size() << ", destpath = " << destpath.depth() << endl; - - // make sure i can open the dir? 
- if (d->is_dir() && !d->dir_is_auth() && !d->dir) { - // discover it - mdcache->open_remote_dir(d, - new C_MDS_RetryRequest(mds, req, ref)); - return; - } - - if (trace.size() == destpath.depth()) { - if (d->is_dir()) { - // mv /some/thing /to/some/dir - if (!try_open_dir(d, req)) return; - destdir = d->dir; // /to/some/dir - destname = req->get_filepath().last_bit(); // thing - destpath.add_dentry(destname); - } else { - // mv /some/thing /to/some/existing_filename - destdir = trace[trace.size()-1]->dir; // /to/some - destname = destpath.last_bit(); // existing_filename - } - } - else if (trace.size() == destpath.depth()-1) { - if (d->is_dir()) { - // mv /some/thing /to/some/place_that_maybe_dne (we might be replica) - if (!try_open_dir(d, req)) return; - destdir = d->dir; // /to/some - destname = destpath.last_bit(); // place_that_MAYBE_dne - } else { - dout(7) << "dest dne" << endl; - reply_request(req, -EINVAL); - return; - } - } - else { - assert(trace.size() < destpath.depth()-1); - // check traverse return value - if (r > 0) { - return; // discover, readdir, etc. - } - - // ?? - assert(r < 0 || trace.size() == 0); // musta been an error - - // error out - dout(7) << " rename dest " << destpath << " dne" << endl; - reply_request(req, -EINVAL); - return; - } - - string srcpath = req->get_path(); - dout(10) << "handle_client_rename_2 srcpath " << srcpath << endl; - dout(10) << "handle_client_rename_2 destpath " << destpath << endl; - - // src == dest? - if (srcdn->get_dir() == destdir && srcdn->name == destname) { - dout(7) << "rename src=dest, same file " << endl; - reply_request(req, -EINVAL); - return; - } - - // does destination exist? (is this an overwrite?) - CDentry *destdn = destdir->lookup(destname); - CInode *oldin = 0; - if (destdn) { - oldin = destdn->get_inode(); - - if (oldin) { - // make sure it's also a file! - // this can happen, e.g. "mv /some/thing /a/dir" where /a/dir/thing exists and is a dir. - if (oldin->is_dir()) { - // fail! 
- dout(7) << "dest exists and is dir" << endl; - reply_request(req, -EISDIR); - return; - } - - if (srcdn->inode->is_dir() && - !oldin->is_dir()) { - dout(7) << "cannot overwrite non-directory with directory" << endl; - reply_request(req, -EISDIR); - return; - } - } - - dout(7) << "dest exists " << *destdn << endl; - if (destdn->get_inode()) { - dout(7) << "destino is " << *destdn->get_inode() << endl; - } else { - dout(7) << "dest dn is a NULL stub" << endl; - } - } else { - dout(7) << "dest dn dne (yet)" << endl; - } - - - // local or remote? - int srcauth = srcdir->dentry_authority(srcdn->name); - int destauth = destdir->dentry_authority(destname); - dout(7) << "handle_client_rename_2 destname " << destname << " destdir " << *destdir << " auth " << destauth << endl; - - // - if (srcauth != mds->get_nodeid() || - destauth != mds->get_nodeid()) { - dout(7) << "rename has remote dest " << destauth << endl; - dout(7) << "FOREIGN RENAME" << endl; - - // punt? - if (false && srcdn->inode->is_dir()) { - reply_request(req, -EINVAL); - return; - } - - } else { - dout(7) << "rename is local" << endl; - } - - handle_client_rename_local(req, ref, - srcpath, srcdiri, srcdn, - destpath.get_path(), destdir, destdn, destname); - return; -} - - - - -void Server::handle_client_rename_local(MClientRequest *req, - CInode *ref, - string& srcpath, - CInode *srcdiri, - CDentry *srcdn, - string& destpath, - CDir *destdir, - CDentry *destdn, - string& destname) -{ - //bool everybody = false; - //if (true || srcdn->inode->is_dir()) { - /* overkill warning: lock w/ everyone for simplicity. FIXME someday! along with the foreign rename crap! - i could limit this to cases where something beneath me is exported. - could possibly limit the list. (maybe.) 
- Underlying constraint is that, regardless of the order i do the xlocks, and whatever - imports/exports might happen in the process, the destdir _must_ exist on any node - importing something beneath me when rename finishes, or else mayhem ensues when - their import is dangling in the cache. - */ - /* - having made a proper mess of this on the first pass, here is my plan: - - - xlocks of src, dest are done in lex order - - xlock is optional.. if you have the dentry, lock it, if not, don't. - - if you discover an xlocked dentry, you get the xlock. - - possible trouble: - - you have an import beneath the source, and don't have the dest dir. - - when the actual rename happens, you discover the dest - - actually, do this on any open dir, so we don't detach whole swaths - of our cache. - - notes: - - xlocks are initiated from authority, as are discover_replies, so replicas are - guaranteed to either not have dentry, or to have it xlocked. - - - - foreign xlocks are eventually unraveled by the initiator on success or failure. - - todo to make this work: - - hose bool everybody param crap - /- make handle_lock_dn not discover, clean up cases - /- put dest path in MRenameNotify - /- make rename_notify discover if its a dir - / - this will catch nested imports too, obviously - /- notify goes to merged list on local rename - /- notify goes to everybody on a foreign rename - /- handle_notify needs to gracefully ignore spurious notifies - */ - //dout(7) << "handle_client_rename_local: overkill? 
doing xlocks with _all_ nodes" << endl; - //everybody = true; - //} - - bool srclocal = srcdn->dir->dentry_authority(srcdn->name) == mds->get_nodeid(); - bool destlocal = destdir->dentry_authority(destname) == mds->get_nodeid(); - - dout(7) << "handle_client_rename_local: src local=" << srclocal << " " << *srcdn << endl; - if (destdn) { - dout(7) << "handle_client_rename_local: dest local=" << destlocal << " " << *destdn << endl; - } else { - dout(7) << "handle_client_rename_local: dest local=" << destlocal << " dn dne yet" << endl; - } - - /* lock source and dest dentries, in lexicographic order. - */ - bool dosrc = srcpath < destpath; - for (int i=0; i<2; i++) { - if (dosrc) { - - // src - if (srclocal) { - if (!srcdn->is_xlockedbyme(req) && - !mds->locker->dentry_xlock_start(srcdn, req, ref)) - return; - } else { - if (!srcdn || srcdn->xlockedby != req) { - mds->locker->dentry_xlock_request(srcdn->dir, srcdn->name, false, req, new C_MDS_RetryRequest(mds, req, ref)); - return; - } - } - dout(7) << "handle_client_rename_local: srcdn is xlock " << *srcdn << endl; - - } else { - - if (destlocal) { - // dest - if (!destdn) destdn = destdir->add_dentry(destname); - if (!destdn->is_xlockedbyme(req) && - !mds->locker->dentry_xlock_start(destdn, req, ref)) { - if (destdn->is_clean() && destdn->is_null() && destdn->is_sync()) destdir->remove_dentry(destdn); - return; - } - } else { - if (!destdn || destdn->xlockedby != req) { - /* NOTE: require that my xlocked item be a leaf/file, NOT a dir. in case - * my traverse and determination of dest vs dest/srcfilename was out of date. - */ - mds->locker->dentry_xlock_request(destdir, destname, true, req, new C_MDS_RetryRequest(mds, req, ref)); - return; - } - } - dout(7) << "handle_client_rename_local: destdn is xlock " << *destdn << endl; - - } - - dosrc = !dosrc; - } - - - // final check: verify if dest exists that src is a file - - // FIXME: is this necessary? 
- - if (destdn->inode) { - if (destdn->inode->is_dir()) { - dout(7) << "handle_client_rename_local failing, dest exists and is a dir: " << *destdn->inode << endl; - assert(0); - reply_request(req, -EINVAL); - return; - } - if (srcdn->inode->is_dir()) { - dout(7) << "handle_client_rename_local failing, dest exists and src is a dir: " << *destdn->inode << endl; - assert(0); - reply_request(req, -EINVAL); - return; - } - } else { - // if destdn->inode is null, then we know it's a non-existent dest, - // why? because if it's local, it dne. and if it's remote, we xlocked with - // REQXLOCKC, which will only allow you to lock a file. - // so we know dest is a file, or non-existent - if (!destlocal) { - if (srcdn->inode->is_dir()) { - // help: maybe the dest exists and is a file? ..... FIXME - } else { - // we're fine, src is file, dest is file|dne - } - } - } - - mds->balancer->hit_dir(srcdn->dir, META_POP_DWR); - mds->balancer->hit_dir(destdn->dir, META_POP_DWR); - - // we're golden. - // everything is xlocked by us, we rule, etc. 
- MClientReply *reply = new MClientReply(req, 0); - mdcache->renamer->file_rename( srcdn, destdn, - new C_MDS_CommitRequest(this, req, reply, srcdn->inode, - new EString("file rename fixme")) ); -} - - - - - - - - - - - -// =================================== -// TRUNCATE, FSYNC - -/* - * FIXME: this truncate implemention is WRONG WRONG WRONG - */ - -void Server::handle_client_truncate(MClientRequest *req, CInode *cur) -{ - // write - if (!mds->locker->inode_file_write_start(cur, req)) - return; // fw or (wait for) lock - - // check permissions - - // do update - cur->inode.size = req->get_sizearg(); - cur->_mark_dirty(); // fixme - - mds->locker->inode_file_write_finish(cur); - - mds->balancer->hit_inode(cur, META_POP_IWR); - - // start reply - MClientReply *reply = new MClientReply(req, 0); - - // commit - commit_request(req, reply, cur, - new EString("truncate fixme")); -} - - - -// =========================== -// open, openc, close - -void Server::handle_client_open(MClientRequest *req, - CInode *cur) -{ - int flags = req->get_iarg(); - int mode = req->get_iarg2(); - - dout(7) << "open " << flags << " on " << *cur << endl; - dout(10) << "open flags = " << flags << " mode = " << mode << endl; - - // is it a file? - if (!(cur->inode.mode & INODE_MODE_FILE)) { - dout(7) << "not a regular file" << endl; - reply_request(req, -EINVAL); // FIXME what error do we want? - return; - } - - // auth for write access - if (mode != FILE_MODE_R && mode != FILE_MODE_LAZY && - !cur->is_auth()) { - int auth = cur->authority(); - assert(auth != mds->get_nodeid()); - dout(9) << "open writeable on replica for " << *cur << " fw to auth " << auth << endl; - - mdcache->request_forward(req, auth); - return; - } - - - // hmm, check permissions or something. - - - // can we issue the caps they want? - version_t fdv = mds->locker->issue_file_data_version(cur); - Capability *cap = mds->locker->issue_new_caps(cur, mode, req); - if (!cap) return; // can't issue (yet), so wait! 
- - dout(12) << "open gets caps " << cap_string(cap->pending()) << " for " << req->get_source() << " on " << *cur << endl; - - mds->balancer->hit_inode(cur, META_POP_IRD); - - // reply - MClientReply *reply = new MClientReply(req, 0); - reply->set_file_caps(cap->pending()); - reply->set_file_caps_seq(cap->get_last_seq()); - reply->set_file_data_version(fdv); - reply_request(req, reply, cur); -} - - -class C_MDS_openc_finish : public Context { - MDS *mds; - MClientRequest *req; - CDentry *dn; - CInode *newi; - version_t pv; -public: - C_MDS_openc_finish(MDS *m, MClientRequest *r, CDentry *d, CInode *ni) : - mds(m), req(r), dn(d), newi(ni), - pv(d->get_projected_version()) {} - void finish(int r) { - assert(r == 0); - - // link the inode - dn->get_dir()->link_inode(dn, newi); - - // dirty inode, dn, dir - newi->mark_dirty(pv); - - // unlock - mds->locker->dentry_xlock_finish(dn); - - // hit pop - mds->balancer->hit_inode(newi, META_POP_IWR); - - // ok, do the open. - mds->server->handle_client_open(req, newi); - } -}; - - -void Server::handle_client_openc(MClientRequest *req, CInode *diri) -{ - dout(7) << "open w/ O_CREAT on " << req->get_filepath() << endl; - - CInode *in = 0; - CDentry *dn = 0; - - // make dentry and inode, xlock dentry. - bool excl = req->get_iarg() & O_EXCL; - int r = prepare_mknod(req, diri, &in, &dn, !excl); - if (!r) - return; // wait on something - assert(in); - assert(dn); - - if (r == 1) { - // created. - // it's a file. - in->inode.mode = 0644; // FIXME req should have a umask - in->inode.mode |= INODE_MODE_FILE; - - // prepare finisher - C_MDS_openc_finish *fin = new C_MDS_openc_finish(mds, req, dn, in); - EUpdate *le = new EUpdate("openc"); - le->metablob.add_dir_context(diri->dir); - inode_t *pi = le->metablob.add_dentry(dn, true, in); - pi->version = dn->get_projected_version(); - - // log + wait - mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); - - /* - FIXME. 
this needs to be rewritten when the write capability stuff starts - getting journaled. - */ - } else { - // exists! - // FIXME: do i need to repin path based existant inode? hmm. - handle_client_open(req, in); - } -} - - - - - - - - - - - - - - diff --git a/branches/marnberg/quota/mds/Server.h b/branches/marnberg/quota/mds/Server.h deleted file mode 100644 index d4509f1418e07..0000000000000 --- a/branches/marnberg/quota/mds/Server.h +++ /dev/null @@ -1,156 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_SERVER_H -#define __MDS_SERVER_H - -#include "MDS.h" - -class LogEvent; - -class Server { - MDS *mds; - MDCache *mdcache; - MDLog *mdlog; - Messenger *messenger; - - __uint64_t stat_ops; - - -public: - Server(MDS *m) : - mds(m), - mdcache(mds->mdcache), mdlog(mds->mdlog), - messenger(mds->messenger), - stat_ops(0) { - } - - void dispatch(Message *m); - - // generic request helpers - void reply_request(MClientRequest *req, int r = 0, CInode *tracei = 0); - void reply_request(MClientRequest *req, MClientReply *reply, CInode *tracei); - - void submit_update(MClientRequest *req, CInode *wrlockedi, - LogEvent *event, - Context *oncommit); - - void commit_request(MClientRequest *req, - MClientReply *reply, - CInode *tracei, - LogEvent *event, - LogEvent *event2 = 0); - - bool try_open_dir(CInode *in, MClientRequest *req); - - - // clients - void handle_client_mount(class MClientMount *m); - void handle_client_unmount(Message *m); - - void handle_client_request(MClientRequest *m); - void handle_client_request_2(MClientRequest *req, - vector& trace, - int r); - - // fs ops - void handle_client_fstat(MClientRequest 
*req); - - // requests - void dispatch_request(Message *m, CInode *ref); - - // inode request *req, CInode *ref; - void handle_client_stat(MClientRequest *req, CInode *ref); - void handle_client_utime(MClientRequest *req, CInode *ref); - void handle_client_inode_soft_update_2(MClientRequest *req, - MClientReply *reply, - CInode *ref); - void handle_client_chmod(MClientRequest *req, CInode *ref); - void handle_client_chown(MClientRequest *req, CInode *ref); - void handle_client_inode_hard_update_2(MClientRequest *req, - MClientReply *reply, - CInode *ref); - - // readdir - void handle_client_readdir(MClientRequest *req, CInode *ref); - int encode_dir_contents(CDir *dir, - list& inls, - list& dnls); - void handle_hash_readdir(MHashReaddir *m); - void handle_hash_readdir_reply(MHashReaddirReply *m); - void finish_hash_readdir(MClientRequest *req, CDir *dir); - - // namespace changes - void handle_client_mknod(MClientRequest *req, CInode *ref); - void handle_client_link(MClientRequest *req, CInode *ref); - void handle_client_link_2(int r, MClientRequest *req, CInode *ref, vector& trace); - void handle_client_link_finish(MClientRequest *req, CInode *ref, - CDentry *dn, CInode *targeti); - - void handle_client_unlink(MClientRequest *req, CInode *ref); - void handle_client_rename(MClientRequest *req, CInode *ref); - void handle_client_rename_2(MClientRequest *req, - CInode *ref, - CInode *srcdiri, - CDir *srcdir, - CDentry *srcdn, - filepath& destpath, - vector& trace, - int r); - void handle_client_rename_local(MClientRequest *req, CInode *ref, - string& srcpath, CInode *srcdiri, CDentry *srcdn, - string& destpath, CDir *destdir, CDentry *destdn, string& name); - - void handle_client_mkdir(MClientRequest *req, CInode *ref); - void handle_client_rmdir(MClientRequest *req, CInode *ref); - void handle_client_symlink(MClientRequest *req, CInode *ref); - - // file - void handle_client_open(MClientRequest *req, CInode *ref); - void handle_client_openc(MClientRequest *req, 
CInode *ref); - void handle_client_release(MClientRequest *req, CInode *in); - void handle_client_truncate(MClientRequest *req, CInode *in); - void handle_client_fsync(MClientRequest *req, CInode *in); - - - // some helpers - CInode *mknod(MClientRequest *req, CInode *ref, bool okexist=false); // used by mknod, symlink, mkdir, openc - - CDir *validate_new_dentry_dir(MClientRequest *req, CInode *diri, string& dname); - int prepare_mknod(MClientRequest *req, CInode *diri, - CInode **pin, CDentry **pdn, - bool okexist=false); - - - -}; - -class C_MDS_RetryRequest : public Context { - MDS *mds; - Message *req; // MClientRequest or MLock - CInode *ref; - public: - C_MDS_RetryRequest(MDS *mds, Message *req, CInode *ref) { - assert(ref); - this->mds = mds; - this->req = req; - this->ref = ref; - } - virtual void finish(int r) { - mds->server->dispatch_request(req, ref); - } -}; - - - -#endif diff --git a/branches/marnberg/quota/mds/events/EAlloc.h b/branches/marnberg/quota/mds/events/EAlloc.h deleted file mode 100644 index 9360db4ab49bb..0000000000000 --- a/branches/marnberg/quota/mds/events/EAlloc.h +++ /dev/null @@ -1,76 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_EALLOC_H -#define __MDS_EALLOC_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../LogEvent.h" -#include "../IdAllocator.h" - -#define EALLOC_EV_ALLOC 1 -#define EALLOC_EV_FREE 2 - -class EAlloc : public LogEvent { - protected: - int idtype; - idno_t id; - int what; // alloc or dealloc - version_t table_version; - - public: - EAlloc() : LogEvent(EVENT_ALLOC) { } - EAlloc(int idtype, idno_t id, int what, version_t v) : - LogEvent(EVENT_ALLOC) { - this->idtype = idtype; - this->id = id; - this->what = what; - this->table_version = v; - } - - void encode_payload(bufferlist& bl) { - bl.append((char*)&idtype, sizeof(idtype)); - bl.append((char*)&id, sizeof(id)); - bl.append((char*)&what, sizeof(what)); - bl.append((char*)&table_version, sizeof(table_version)); - } - void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(idtype), (char*)&idtype); - off += sizeof(idtype); - bl.copy(off, sizeof(id), (char*)&id); - off += sizeof(id); - bl.copy(off, sizeof(what), (char*)&what); - off += sizeof(what); - bl.copy(off, sizeof(table_version), (char*)&table_version); - off += sizeof(table_version); - } - - - void print(ostream& out) { - if (what == EALLOC_EV_ALLOC) - out << "EAlloc alloc " << hex << id << dec << " tablev " << table_version; - else - out << "EAlloc dealloc " << hex << id << dec << " tablev " << table_version; - } - - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/branches/marnberg/quota/mds/events/EExportFinish.h b/branches/marnberg/quota/mds/events/EExportFinish.h deleted file mode 100644 index 114d580b6a499..0000000000000 --- a/branches/marnberg/quota/mds/events/EExportFinish.h +++ /dev/null @@ -1,59 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it 
and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __EEXPORTFINISH_H -#define __EEXPORTFINISH_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../MDS.h" - -class EExportFinish : public LogEvent { - protected: - inodeno_t dirino; // exported dir - bool success; - - public: - EExportFinish(CDir *dir, bool s) : LogEvent(EVENT_EXPORTFINISH), - dirino(dir->ino()), - success(s) { } - EExportFinish() : LogEvent(EVENT_EXPORTFINISH) { } - - void print(ostream& out) { - out << "export_finish " << dirino; - if (success) - out << " success"; - else - out << " failure"; - } - - virtual void encode_payload(bufferlist& bl) { - bl.append((char*)&dirino, sizeof(dirino)); - bl.append((char*)&success, sizeof(success)); - } - void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(dirino), (char*)&dirino); - off += sizeof(dirino); - bl.copy(off, sizeof(success), (char*)&success); - off += sizeof(success); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/branches/marnberg/quota/mds/events/EExportStart.h b/branches/marnberg/quota/mds/events/EExportStart.h deleted file mode 100644 index 37ed92a7239c2..0000000000000 --- a/branches/marnberg/quota/mds/events/EExportStart.h +++ /dev/null @@ -1,68 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __EEXPORTSTART_H -#define __EEXPORTSTART_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../MDS.h" - -#include "EMetaBlob.h" - -class EExportStart : public LogEvent { - public: - EMetaBlob metablob; // exported dir - protected: - inodeno_t dirino; - int dest; // dest mds - set bounds; - - public: - EExportStart(CDir *dir, int d) : LogEvent(EVENT_EXPORTSTART), - dirino(dir->ino()), - dest(d) { - metablob.add_dir_context(dir); - } - EExportStart() : LogEvent(EVENT_EXPORTSTART) { } - - set &get_bounds() { return bounds; } - - void print(ostream& out) { - out << "export_start " << dirino << " -> " << dest; - } - - virtual void encode_payload(bufferlist& bl) { - metablob._encode(bl); - bl.append((char*)&dirino, sizeof(dirino)); - bl.append((char*)&dest, sizeof(dest)); - ::_encode(bounds, bl); - } - void decode_payload(bufferlist& bl, int& off) { - metablob._decode(bl, off); - bl.copy(off, sizeof(dirino), (char*)&dirino); - off += sizeof(dirino); - bl.copy(off, sizeof(dest), (char*)&dest); - off += sizeof(dest); - ::_decode(bounds, bl, off); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/branches/marnberg/quota/mds/events/EImportFinish.h b/branches/marnberg/quota/mds/events/EImportFinish.h deleted file mode 100644 index 14a9ab6403af6..0000000000000 --- a/branches/marnberg/quota/mds/events/EImportFinish.h +++ /dev/null @@ -1,59 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __EIMPORTFINISH_H -#define __EIMPORTFINISH_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../MDS.h" - -class EImportFinish : public LogEvent { - protected: - inodeno_t dirino; // imported dir - bool success; - - public: - EImportFinish(CDir *dir, bool s) : LogEvent(EVENT_IMPORTFINISH), - dirino(dir->ino()), - success(s) { } - EImportFinish() : LogEvent(EVENT_IMPORTFINISH) { } - - void print(ostream& out) { - out << "import_finish " << dirino; - if (success) - out << " success"; - else - out << " failed"; - } - - virtual void encode_payload(bufferlist& bl) { - bl.append((char*)&dirino, sizeof(dirino)); - bl.append((char*)&success, sizeof(success)); - } - void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(dirino), (char*)&dirino); - off += sizeof(dirino); - bl.copy(off, sizeof(success), (char*)&success); - off += sizeof(success); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/branches/marnberg/quota/mds/events/EImportMap.h b/branches/marnberg/quota/mds/events/EImportMap.h deleted file mode 100644 index 50f366faaa9fa..0000000000000 --- a/branches/marnberg/quota/mds/events/EImportMap.h +++ /dev/null @@ -1,66 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_EIMPORTMAP_H -#define __MDS_EIMPORTMAP_H - -#include "../LogEvent.h" -#include "EMetaBlob.h" - -class EImportMap : public LogEvent { -public: - EMetaBlob metablob; - set imports; - set exports; - //set hashdirs; - map > nested_exports; - - EImportMap() : LogEvent(EVENT_IMPORTMAP) { } - - void print(ostream& out) { - out << "import_map " << imports.size() << " imports, " - << exports.size() << " exports" - << " " << metablob; - } - - void encode_payload(bufferlist& bl) { - metablob._encode(bl); - ::_encode(imports, bl); - ::_encode(exports, bl); - for (set::iterator p = imports.begin(); - p != imports.end(); - ++p) { - ::_encode(nested_exports[*p], bl); - if (nested_exports[*p].empty()) - nested_exports.erase(*p); - } - } - void decode_payload(bufferlist& bl, int& off) { - metablob._decode(bl, off); - ::_decode(imports, bl, off); - ::_decode(exports, bl, off); - for (set::iterator p = imports.begin(); - p != imports.end(); - ++p) { - ::_decode(nested_exports[*p], bl, off); - if (nested_exports[*p].empty()) - nested_exports.erase(*p); - } - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); -}; - -#endif diff --git a/branches/marnberg/quota/mds/events/EImportStart.h b/branches/marnberg/quota/mds/events/EImportStart.h deleted file mode 100644 index 59c074dec6f4f..0000000000000 --- a/branches/marnberg/quota/mds/events/EImportStart.h +++ /dev/null @@ -1,60 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __EIMPORTSTART_H -#define __EIMPORTSTART_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../MDS.h" - -#include "EMetaBlob.h" - -class EImportStart : public LogEvent { -protected: - inodeno_t dirino; - list bounds; - - public: - EMetaBlob metablob; - - EImportStart(inodeno_t di, - list& b) : LogEvent(EVENT_IMPORTSTART), - dirino(di), bounds(b) { } - EImportStart() : LogEvent(EVENT_IMPORTSTART) { } - - void print(ostream& out) { - out << "EImportStart " << metablob; - } - - virtual void encode_payload(bufferlist& bl) { - bl.append((char*)&dirino, sizeof(dirino)); - metablob._encode(bl); - ::_encode(bounds, bl); - } - void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(dirino), (char*)&dirino); - off += sizeof(dirino); - metablob._decode(bl, off); - ::_decode(bounds, bl, off); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/branches/marnberg/quota/mds/events/EMetaBlob.h b/branches/marnberg/quota/mds/events/EMetaBlob.h deleted file mode 100644 index 800c6674c91a8..0000000000000 --- a/branches/marnberg/quota/mds/events/EMetaBlob.h +++ /dev/null @@ -1,339 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_EMETABLOB_H -#define __MDS_EMETABLOB_H - -#include -#include -using namespace std; - -#include "../CInode.h" -#include "../CDir.h" -#include "../CDentry.h" - - -class MDS; - -/* - * a bunch of metadata in the journal - */ - -/* notes: - * - * - make sure you adjust the inode.version for any modified inode you - * journal. 
CDir and CDentry maintain a projected_version, but CInode - * doesn't, since the journaled inode usually has to be modifed - * manually anyway (to delay the change in the MDS's cache until after - * it is journaled). - * - */ - - -class EMetaBlob { - - /* fullbit - a regular dentry + inode - */ - struct fullbit { - string dn; // dentry - version_t dnv; - inode_t inode; // if it's not - string symlink; - bool dirty; - - fullbit(const string& d, version_t v, inode_t& i, bool dr) : dn(d), dnv(v), inode(i), dirty(dr) { } - fullbit(const string& d, version_t v, inode_t& i, string& sym, bool dr) : dn(d), dnv(v), inode(i), symlink(sym), dirty(dr) { } - fullbit(bufferlist& bl, int& off) { _decode(bl, off); } - void _encode(bufferlist& bl) { - ::_encode(dn, bl); - bl.append((char*)&dnv, sizeof(dnv)); - bl.append((char*)&inode, sizeof(inode)); - if (inode.is_symlink()) - ::_encode(symlink, bl); - bl.append((char*)&dirty, sizeof(dirty)); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(dn, bl, off); - bl.copy(off, sizeof(dnv), (char*)&dnv); - off += sizeof(dnv); - bl.copy(off, sizeof(inode), (char*)&inode); - off += sizeof(inode); - if (inode.is_symlink()) - ::_decode(symlink, bl, off); - bl.copy(off, sizeof(dirty), (char*)&dirty); - off += sizeof(dirty); - } - }; - - /* remotebit - a dentry + remote inode link (i.e. 
just an ino) - */ - struct remotebit { - string dn; - version_t dnv; - inodeno_t ino; - bool dirty; - - remotebit(const string& d, version_t v, inodeno_t i, bool dr) : dn(d), dnv(v), ino(i), dirty(dr) { } - remotebit(bufferlist& bl, int& off) { _decode(bl, off); } - void _encode(bufferlist& bl) { - ::_encode(dn, bl); - bl.append((char*)&dnv, sizeof(dnv)); - bl.append((char*)&ino, sizeof(ino)); - bl.append((char*)&dirty, sizeof(dirty)); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(dn, bl, off); - bl.copy(off, sizeof(dnv), (char*)&dnv); - off += sizeof(dnv); - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - bl.copy(off, sizeof(dirty), (char*)&dirty); - off += sizeof(dirty); - } - }; - - /* - * nullbit - a null dentry - */ - struct nullbit { - string dn; - version_t dnv; - bool dirty; - nullbit(const string& d, version_t v, bool dr) : dn(d), dnv(v), dirty(dr) { } - nullbit(bufferlist& bl, int& off) { _decode(bl, off); } - void _encode(bufferlist& bl) { - ::_encode(dn, bl); - bl.append((char*)&dnv, sizeof(dnv)); - bl.append((char*)&dirty, sizeof(dirty)); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(dn, bl, off); - bl.copy(off, sizeof(dnv), (char*)&dnv); - off += sizeof(dnv); - bl.copy(off, sizeof(dirty), (char*)&dirty); - off += sizeof(dirty); - } - }; - - - /* dirlump - contains metadata for any dir we have contents for. - */ - struct dirlump { - static const int STATE_IMPORT = (1<<0); - static const int STATE_COMPLETE = (1<<1); - static const int STATE_DIRTY = (1<<2); // dirty due to THIS journal item, that is! 
- - dirslice_t dirslice; - version_t dirv; - int state; - int nfull, nremote, nnull; - bufferlist bfull, bremote, bnull; - - private: - bool dn_decoded; - list dfull; - list dremote; - list dnull; - - public: - dirlump() : state(0), nfull(0), nremote(0), nnull(0), dn_decoded(true) { } - - bool is_import() { return state & STATE_IMPORT; } - void mark_import() { state |= STATE_IMPORT; } - bool is_complete() { return state & STATE_COMPLETE; } - void mark_complete() { state |= STATE_COMPLETE; } - bool is_dirty() { return state & STATE_DIRTY; } - void mark_dirty() { state |= STATE_DIRTY; } - - list &get_dfull() { return dfull; } - list &get_dremote() { return dremote; } - list &get_dnull() { return dnull; } - - void _encode_bits() { - for (list::iterator p = dfull.begin(); p != dfull.end(); ++p) - p->_encode(bfull); - for (list::iterator p = dremote.begin(); p != dremote.end(); ++p) - p->_encode(bremote); - for (list::iterator p = dnull.begin(); p != dnull.end(); ++p) - p->_encode(bnull); - } - void _decode_bits() { - if (dn_decoded) return; - int off = 0; - for (int i=0; i lump_order; - map lump_map; - - public: - - // remote pointer to to-be-journaled inode iff it's a normal (non-remote) dentry - inode_t *add_dentry(CDentry *dn, bool dirty, CInode *in=0) { - CDir *dir = dn->get_dir(); - if (!in) in = dn->get_inode(); - - // add the dir - dirlump& lump = add_dir(dir, false); - - // add the dirbit - if (dn->is_remote()) { - lump.nremote++; - if (dirty) - lump.get_dremote().push_front(remotebit(dn->get_name(), - dn->get_projected_version(), - dn->get_remote_ino(), - dirty)); - else - lump.get_dremote().push_back(remotebit(dn->get_name(), - dn->get_projected_version(), - dn->get_remote_ino(), - dirty)); - } - else if (!in) { - lump.nnull++; - if (dirty) - lump.get_dnull().push_front(nullbit(dn->get_name(), - dn->get_projected_version(), - dirty)); - else - lump.get_dnull().push_back(nullbit(dn->get_name(), - dn->get_projected_version(), - dirty)); - } - else { - 
lump.nfull++; - if (dirty) { - lump.get_dfull().push_front(fullbit(dn->get_name(), - dn->get_projected_version(), - in->inode, in->symlink, - dirty)); - return &lump.get_dfull().front().inode; - } else { - lump.get_dfull().push_back(fullbit(dn->get_name(), - dn->get_projected_version(), - in->inode, in->symlink, - dirty)); - return &lump.get_dfull().back().inode; - } - } - return 0; - } - - dirlump& add_dir(CDir *dir, bool dirty) { - if (lump_map.count(dir->ino()) == 0) { - lump_order.push_back(dir->ino()); - lump_map[dir->ino()].dirv = dir->get_projected_version(); - } - dirlump& l = lump_map[dir->ino()]; - if (dir->is_complete()) l.mark_complete(); - if (dir->is_import()) l.mark_import(); - if (dirty) l.mark_dirty(); - return l; - } - - void add_dir_context(CDir *dir, bool toroot=false) { - // already have this dir? (we must always add in order) - if (lump_map.count(dir->ino())) - return; - - CInode *diri = dir->get_inode(); - if (!toroot && - (dir->is_import() || dir->is_hashed())) - return; // stop at import point - if (!dir->get_inode()->get_parent_dn()) - return; - - CDentry *parent = diri->get_parent_dn(); - add_dir_context(parent->get_dir(), toroot); - add_dentry(parent, false); - } - - - // encoding - - void _encode(bufferlist& bl) { - int n = lump_map.size(); - bl.append((char*)&n, sizeof(n)); - for (list::iterator i = lump_order.begin(); - i != lump_order.end(); - ++i) { - bl.append((char*)&(*i), sizeof(*i)); - lump_map[*i]._encode(bl); - } - } - void _decode(bufferlist& bl, int& off) { - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __EPURGE_H -#define __EPURGE_H - -#include -#include "config.h" -#include "include/types.h" - -class EPurgeFinish : public LogEvent { - protected: - inodeno_t ino; - - public: - EPurgeFinish(inodeno_t i) : - LogEvent(EVENT_PURGEFINISH), - ino(i) { } - EPurgeFinish() : LogEvent(EVENT_PURGEFINISH) { } - - void print(ostream& out) { - out << "purgefinish " << ino; - } - - virtual void encode_payload(bufferlist& bl) { - bl.append((char*)&ino, sizeof(ino)); - } - void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(ino), (char*)&ino); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/branches/marnberg/quota/mds/events/EString.h b/branches/marnberg/quota/mds/events/EString.h deleted file mode 100644 index 0ef7577406454..0000000000000 --- a/branches/marnberg/quota/mds/events/EString.h +++ /dev/null @@ -1,56 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __ESTRING_H -#define __ESTRING_H - -#include -#include -using namespace std; - -#include "../LogEvent.h" - -// generic log event -class EString : public LogEvent { - protected: - string event; - - public: - EString(string e) : - LogEvent(EVENT_STRING) { - event = e; - } - EString() : - LogEvent(EVENT_STRING) { - } - - void decode_payload(bufferlist& bl, int& off) { - event = bl.c_str() + off; - off += event.length() + 1; - } - void encode_payload(bufferlist& bl) { - bl.append(event.c_str(), event.length()+1); - } - - void print(ostream& out) { - out << '"' << event << '"'; - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/branches/marnberg/quota/mds/events/EUnlink.h b/branches/marnberg/quota/mds/events/EUnlink.h deleted file mode 100644 index 7d972488dab1b..0000000000000 --- a/branches/marnberg/quota/mds/events/EUnlink.h +++ /dev/null @@ -1,71 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __EUNLINK_H -#define __EUNLINK_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../LogEvent.h" -#include "EMetaBlob.h" - -#include "../CInode.h" -#include "../CDentry.h" -#include "../CDir.h" - -/// help rewrite me - -class EUnlink : public LogEvent { - protected: - version_t dirv; - string dname; - - public: - EMetaBlob metaglob; - - /* - EUnlink(CDir *dir, CDentry* dn, CInode *in) : - LogEvent(EVENT_UNLINK), - diritrace(dir->inode), - dirv(dir->get_version()), - dname(dn->get_name()), - inodetrace(in) {} - */ - EUnlink() : LogEvent(EVENT_UNLINK) { } - - virtual void encode_payload(bufferlist& bl) { - /* - diritrace.encode(bl); - bl.append((char*)&dirv, sizeof(dirv)); - ::_encode(dname, bl); - inodetrace.encode(bl); - */ - } - void decode_payload(bufferlist& bl, int& off) { - /* - diritrace.decode(bl,off); - bl.copy(off, sizeof(dirv), (char*)&dirv); - off += sizeof(dirv); - ::_decode(dname, bl, off); - inodetrace.decode(bl, off); - */ - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); -}; - -#endif diff --git a/branches/marnberg/quota/mds/events/EUpdate.h b/branches/marnberg/quota/mds/events/EUpdate.h deleted file mode 100644 index 4a8dad5876a62..0000000000000 --- a/branches/marnberg/quota/mds/events/EUpdate.h +++ /dev/null @@ -1,49 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_EUPDATE_H -#define __MDS_EUPDATE_H - -#include "../LogEvent.h" -#include "EMetaBlob.h" - -class EUpdate : public LogEvent { -public: - EMetaBlob metablob; - string type; - - EUpdate() : LogEvent(EVENT_UPDATE) { } - EUpdate(const char *s) : LogEvent(EVENT_UPDATE), - type(s) { } - - void print(ostream& out) { - if (type.length()) - out << type << " "; - out << metablob; - } - - void encode_payload(bufferlist& bl) { - ::_encode(type, bl); - metablob._encode(bl); - } - void decode_payload(bufferlist& bl, int& off) { - ::_decode(type, bl, off); - metablob._decode(bl, off); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); -}; - -#endif diff --git a/branches/marnberg/quota/mds/journal.cc b/branches/marnberg/quota/mds/journal.cc deleted file mode 100644 index 2182d33ffc878..0000000000000 --- a/branches/marnberg/quota/mds/journal.cc +++ /dev/null @@ -1,589 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "events/EString.h" - -#include "events/EMetaBlob.h" -#include "events/EAlloc.h" -#include "events/EUpdate.h" -#include "events/EImportMap.h" - -#include "events/EPurgeFinish.h" -#include "events/EUnlink.h" -#include "events/EExportStart.h" -#include "events/EExportFinish.h" -#include "events/EImportStart.h" -#include "events/EImportFinish.h" - -#include "MDS.h" -#include "MDLog.h" -#include "MDCache.h" -#include "MDStore.h" -#include "Migrator.h" - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " -#define derr(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " - - -// ----------------------- -// EString - -bool EString::has_expired(MDS *mds) { - dout(10) << "EString.has_expired " << event << endl; - return true; -} -void EString::expire(MDS *mds, Context *c) -{ - dout(10) << "EString.expire " << event << endl; -} -void EString::replay(MDS *mds) -{ - dout(10) << "EString.replay " << event << endl; -} - - - -// ----------------------- -// EMetaBlob - -/* - * we need to ensure that a journaled item has either - * - * - been safely committed to its dirslice. - * - * - has been safely exported. note that !is_auth() && !is_proxy() - * implies safely exported. if !is_auth() && is_proxy(), we need to - * add a waiter for the export to complete. 
- * - */ -bool EMetaBlob::has_expired(MDS *mds) -{ - // examine dirv's for my lumps - for (map::iterator lp = lump_map.begin(); - lp != lump_map.end(); - ++lp) { - CInode *diri = mds->mdcache->get_inode(lp->first); - if (!diri) - continue; // we expired it - CDir *dir = diri->dir; - if (!dir) - continue; // we expired it - - // FIXME: check the slice only - - if (dir->is_proxy()) { - dout(10) << "EMetaBlob.has_expired am proxy, needed dirv " << lp->second.dirv - << " for " << *dir << endl; - return false; // we need to wait until the export flushes! - } - if (!dir->is_auth()) { - dout(10) << "EMetaBlob.has_expired not auth, needed dirv " << lp->second.dirv - << " for " << *dir << endl; - continue; // not our problem - } - - if (dir->get_last_committed_version() < lp->second.dirv) { - dout(10) << "EMetaBlob.has_expired need dirv " << lp->second.dirv - << " for " << *dir << endl; - return false; // not committed. - } else { - dout(10) << "EMetaBlob.has_expired have dirv " << lp->second.dirv - << " for " << *dir << endl; - } - } - - return true; // all dirlumps expired. 
-} - -void EMetaBlob::expire(MDS *mds, Context *c) -{ - list commit; - list waitfor_export; - int ncommit = 0; - - // examine dirv's for my lumps - // make list of dir slices i need to commit - for (map::iterator lp = lump_map.begin(); - lp != lump_map.end(); - ++lp) { - CInode *diri = mds->mdcache->get_inode(lp->first); - if (!diri) - continue; // we expired it - CDir *dir = diri->dir; - if (!dir) - continue; // we expired it - - // FIXME: check the slice only - - if (dir->is_proxy()) { - // wait until export is acked (logged on remote) and committed (logged locally) - CDir *ex = mds->mdcache->get_export_container(dir); - dout(10) << "EMetaBlob.expire proxy for " << *dir - << ", waiting for export finish on " << *ex << endl; - waitfor_export.push_back(ex); - continue; - } - if (!dir->is_auth()) { - dout(10) << "EMetaBlob.expire not auth, needed dirv " << lp->second.dirv - << " for " << *dir << endl; - continue; // not our problem - } - if (dir->get_last_committed_version() < lp->second.dirv) { - dout(10) << "EMetaBlob.expire need dirv " << lp->second.dirv - << ", committing " << *dir << endl; - commit.push_back(dir); - ncommit++; - } else { - dout(10) << "EMetaBlob.expire have dirv " << lp->second.dirv - << " on " << *dir << endl; - } - } - - // commit - assert(!commit.empty()); - - if (ncommit == 1) { - mds->mdstore->commit_dir(commit.front(), c); - } else { - C_Gather *gather = new C_Gather(c); - for (list::iterator p = commit.begin(); - p != commit.end(); - ++p) - mds->mdstore->commit_dir(*p, gather->new_sub()); - for (list::iterator p = waitfor_export.begin(); - p != waitfor_export.end(); - ++p) - mds->mdcache->migrator->add_export_finish_waiter(*p, gather->new_sub()); - } -} - -void EMetaBlob::replay(MDS *mds) -{ - dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps" << endl; - - // walk through my dirs (in order!) 
- for (list::iterator lp = lump_order.begin(); - lp != lump_order.end(); - ++lp) { - dout(10) << "EMetaBlob.replay dir " << *lp << endl; - dirlump &lump = lump_map[*lp]; - - // the dir - CInode *diri = mds->mdcache->get_inode(*lp); - CDir *dir; - if (!diri) { - assert(*lp == 1); - diri = mds->mdcache->create_root_inode(); - dout(10) << "EMetaBlob.replay created root " << *diri << endl; - } - if (diri->dir) { - dir = diri->dir; - dout(20) << "EMetaBlob.replay had dir " << *dir << endl; - } else { - dir = diri->get_or_open_dir(mds->mdcache); - if (*lp == 1) - dir->set_dir_auth(CDIR_AUTH_UNKNOWN); - dout(10) << "EMetaBlob.replay added dir " << *dir << endl; - } - dir->set_version( lump.dirv ); - if (lump.is_dirty()) - dir->_mark_dirty(); - if (lump.is_complete()) - dir->mark_complete(); - - // decode bits - lump._decode_bits(); - - // full dentry+inode pairs - for (list::iterator p = lump.get_dfull().begin(); - p != lump.get_dfull().end(); - p++) { - CInode *in = mds->mdcache->get_inode(p->inode.ino); - if (!in) { - // inode - in = new CInode(mds->mdcache); - in->inode = p->inode; - if (in->inode.is_symlink()) in->symlink = p->symlink; - mds->mdcache->add_inode(in); - // dentry - CDentry *dn = dir->add_dentry( p->dn, in ); - dn->set_version(p->dnv); - dn->_mark_dirty(); - dout(10) << "EMetaBlob.replay added " << *dn << " " << *in << endl; - } else { - // inode - in->inode = p->inode; - if (in->inode.is_symlink()) in->symlink = p->symlink; - // dentry - CDentry *dn = in->get_parent_dn(); - dn->set_version(p->dnv); - dn->_mark_dirty(); - dout(10) << "EMetaBlob.replay had " << *in->get_parent_dn() << " " << *in << endl; - } - } - - // remote dentries - for (list::iterator p = lump.get_dremote().begin(); - p != lump.get_dremote().end(); - p++) { - CDentry *dn = dir->lookup(p->dn); - if (!dn) { - dn = dir->add_dentry(p->dn, p->ino); - dn->set_remote_ino(p->ino); - dn->set_version(p->dnv); - dn->_mark_dirty(); - dout(10) << "EMetaBlob.replay added " << *dn << endl; - } else 
{ - dn->set_remote_ino(p->ino); - dn->set_version(p->dnv); - dn->_mark_dirty(); - dout(10) << "EMetaBlob.replay had " << *dn << endl; - } - } - - // null dentries - for (list::iterator p = lump.get_dnull().begin(); - p != lump.get_dnull().end(); - p++) { - CDentry *dn = dir->lookup(p->dn); - if (!dn) { - dn = dir->add_dentry(p->dn); - dn->set_version(p->dnv); - dn->_mark_dirty(); - dout(10) << "EMetaBlob.replay added " << *dn << endl; - } else { - dn->set_version(p->dnv); - dn->_mark_dirty(); - dout(10) << "EMetaBlob.replay had " << *dn << endl; - } - } - } -} - - - -// ----------------------- -// EAlloc - -bool EAlloc::has_expired(MDS *mds) -{ - version_t cv = mds->idalloc->get_committed_version(); - if (cv < table_version) { - dout(10) << "EAlloc.has_expired v " << table_version << " > " << cv - << ", still dirty" << endl; - return false; // still dirty - } else { - dout(10) << "EAlloc.has_expired v " << table_version << " <= " << cv - << ", already flushed" << endl; - return true; // already flushed - } -} - -void EAlloc::expire(MDS *mds, Context *c) -{ - dout(10) << "EAlloc.expire saving idalloc table" << endl; - mds->idalloc->save(c, table_version); -} - -void EAlloc::replay(MDS *mds) -{ - if (mds->idalloc->get_version() >= table_version) { - dout(10) << "EAlloc.replay event " << table_version - << " <= table " << mds->idalloc->get_version() << endl; - } else { - dout(10) << " EAlloc.replay event " << table_version - << " - 1 == table " << mds->idalloc->get_version() << endl; - assert(table_version-1 == mds->idalloc->get_version()); - - if (what == EALLOC_EV_ALLOC) { - idno_t nid = mds->idalloc->alloc_id(true); - assert(nid == id); // this should match. 
- } - else if (what == EALLOC_EV_FREE) { - mds->idalloc->reclaim_id(id, true); - } - else - assert(0); - - assert(table_version == mds->idalloc->get_version()); - } -} - - -// ----------------------- -// EUpdate - -bool EUpdate::has_expired(MDS *mds) -{ - return metablob.has_expired(mds); -} - -void EUpdate::expire(MDS *mds, Context *c) -{ - metablob.expire(mds, c); -} - -void EUpdate::replay(MDS *mds) -{ - metablob.replay(mds); -} - - -// ----------------------- -// EImportMap - -bool EImportMap::has_expired(MDS *mds) -{ - if (mds->mdlog->last_import_map > get_end_off()) { - dout(10) << "EImportMap.has_expired -- there's a newer map" << endl; - return true; - } - else if (mds->mdlog->is_capped()) { - dout(10) << "EImportMap.has_expired -- log is capped, allowing map to expire" << endl; - return true; - } else { - dout(10) << "EImportMap.has_expired -- not until there's a newer map written" << endl; - return false; - } -} - -/* -class C_MDS_ImportMapFlush : public Context { - MDS *mds; - off_t end_off; -public: - C_MDS_ImportMapFlush(MDS *m, off_t eo) : mds(m), end_off(eo) { } - void finish(int r) { - // am i the last thing in the log? - if (mds->mdlog->get_write_pos() == end_off) { - // yes. we're good. - } else { - // no. submit another import_map so that we can go away. 
- } - } -}; -*/ - -void EImportMap::expire(MDS *mds, Context *c) -{ - dout(10) << "EImportMap.has_expire -- waiting for a newer map to be written (or for shutdown)" << endl; - mds->mdlog->import_map_expire_waiters.push_back(c); -} - -void EImportMap::replay(MDS *mds) -{ - dout(10) << "EImportMap.replay -- reconstructing import/export spanning tree" << endl; - assert(mds->mdcache->imports.empty()); - - // first, stick the spanning tree in my cache - metablob.replay(mds); - - // restore import/export maps - for (set::iterator p = imports.begin(); - p != imports.end(); - ++p) { - mds->mdcache->add_ambiguous_import(*p, nested_exports[*p]); - mds->mdcache->finish_ambiguous_import(*p); - } - - mds->mdcache->show_imports(); -} - - - -// ----------------------- -// EUnlink - -bool EUnlink::has_expired(MDS *mds) -{ - /* - // dir - CInode *diri = mds->mdcache->get_inode( diritrace.back().inode.ino ); - CDir *dir = 0; - if (diri) dir = diri->dir; - - if (dir && dir->get_last_committed_version() < dirv) return false; - - if (!inodetrace.trace.empty()) { - // inode - CInode *in = mds->mdcache->get_inode( inodetrace.back().inode.ino ); - if (in && in->get_last_committed_version() < inodetrace.back().inode.version) - return false; - } - */ - return true; -} - -void EUnlink::expire(MDS *mds, Context *c) -{ - /* - CInode *diri = mds->mdcache->get_inode( diritrace.back().inode.ino ); - CDir *dir = diri->dir; - assert(dir); - - // okay! 
- dout(7) << "commiting dirty (from unlink) dir " << *dir << endl; - mds->mdstore->commit_dir(dir, dirv, c); - */ -} - -void EUnlink::replay(MDS *mds) -{ -} - - - - -// ----------------------- -// EPurgeFinish - - -bool EPurgeFinish::has_expired(MDS *mds) -{ - return true; -} - -void EPurgeFinish::expire(MDS *mds, Context *c) -{ -} - -void EPurgeFinish::replay(MDS *mds) -{ -} - - - - - -// ========================================================================= - -// ----------------------- -// EExportStart - -bool EExportStart::has_expired(MDS *mds) -{ - CInode *diri = mds->mdcache->get_inode(dirino); - if (!diri) return true; - CDir *dir = diri->dir; - if (!dir) return true; - if (!mds->mdcache->migrator->is_exporting(dir)) - return true; - dout(10) << "EExportStart.has_expired still exporting " << *dir << endl; - return false; -} - -void EExportStart::expire(MDS *mds, Context *c) -{ - CInode *diri = mds->mdcache->get_inode(dirino); - assert(diri); - CDir *dir = diri->dir; - assert(dir); - assert(mds->mdcache->migrator->is_exporting(dir)); - - dout(10) << "EExportStart.expire waiting for export of " << *dir << endl; - mds->mdcache->migrator->add_export_finish_waiter(dir, c); -} - -void EExportStart::replay(MDS *mds) -{ - dout(10) << "EExportStart.replay " << dirino << " -> " << dest << endl; - metablob.replay(mds); - - // put in pending_exports lists - mds->mdlog->pending_exports[dirino] = bounds; -} - -// ----------------------- -// EExportFinish - -bool EExportFinish::has_expired(MDS *mds) -{ - // we can always expire. - return true; -} - -void EExportFinish::expire(MDS *mds, Context *c) -{ - assert(0); // should never happen. -} - -void EExportFinish::replay(MDS *mds) -{ - dout(10) << "EExportFinish.replay " << dirino << " success=" << success << endl; - - assert(mds->mdlog->pending_exports.count(dirino)); - - // finish? 
- if (success) - mds->mdcache->finish_ambiguous_export(dirino, mds->mdlog->pending_exports[dirino]); - - // remove from pending_exports list - mds->mdlog->pending_exports.erase(dirino); -} - - -// ----------------------- -// EImportStart - -bool EImportStart::has_expired(MDS *mds) -{ - return metablob.has_expired(mds); -} - -void EImportStart::expire(MDS *mds, Context *c) -{ - dout(10) << "EImportStart.expire " << dirino << endl; - metablob.expire(mds, c); -} - -void EImportStart::replay(MDS *mds) -{ - dout(10) << "EImportStart.replay " << dirino << endl; - metablob.replay(mds); - - // convert list -> set - set b; - for (list::iterator p = bounds.begin(); p != bounds.end(); ++p) - b.insert(*p); - - // put in ambiguous import list - mds->mdcache->add_ambiguous_import(dirino, b); -} - -// ----------------------- -// EImportFinish - -bool EImportFinish::has_expired(MDS *mds) -{ - return true; -} -void EImportFinish::expire(MDS *mds, Context *c) -{ - assert(0); // shouldn't ever happen -} - -void EImportFinish::replay(MDS *mds) -{ - dout(10) << "EImportFinish.replay " << dirino << " success=" << success << endl; - if (success) - mds->mdcache->finish_ambiguous_import(dirino); - else - mds->mdcache->cancel_ambiguous_import(dirino); -} - - - - - diff --git a/branches/marnberg/quota/mds/mdstypes.h b/branches/marnberg/quota/mds/mdstypes.h deleted file mode 100644 index 1ac4525e76559..0000000000000 --- a/branches/marnberg/quota/mds/mdstypes.h +++ /dev/null @@ -1,290 +0,0 @@ -#ifndef __MDSTYPES_H -#define __MDSTYPES_H - - -#include -#include -#include -#include -using namespace std; - -#include "config.h" -#include "common/DecayCounter.h" - -#include - - - -// md ops -#define MDS_OP_STATFS 1 - -#define MDS_OP_STAT 100 -#define MDS_OP_LSTAT 101 -#define MDS_OP_UTIME 102 -#define MDS_OP_CHMOD 103 -#define MDS_OP_CHOWN 104 - - -#define MDS_OP_READDIR 200 -#define MDS_OP_MKNOD 201 -#define MDS_OP_LINK 202 -#define MDS_OP_UNLINK 203 -#define MDS_OP_RENAME 204 - -#define 
MDS_OP_MKDIR 220 -#define MDS_OP_RMDIR 221 -#define MDS_OP_SYMLINK 222 - -#define MDS_OP_OPEN 301 -#define MDS_OP_TRUNCATE 306 -#define MDS_OP_FSYNC 307 -//#define MDS_OP_CLOSE 310 -#define MDS_OP_RELEASE 308 - - - -// ================================================================ - -/* meta_load_t - * hierarchical load for an inode/dir and it's children - */ -#define META_POP_IRD 0 -#define META_POP_IWR 1 -#define META_POP_DWR 2 -//#define META_POP_LOG 3 -//#define META_POP_FDIR 4 -//#define META_POP_CDIR 4 -#define META_NPOP 3 - -class meta_load_t { - public: - DecayCounter pop[META_NPOP]; - - double meta_load() { - return pop[META_POP_IRD].get() + 2*pop[META_POP_IWR].get(); - } - - void take(meta_load_t& other) { - for (int i=0; i"; -} - - -inline meta_load_t& operator-=(meta_load_t& l, meta_load_t& r) -{ - for (int i=0; i"; -} - -/* -inline mds_load_t& operator+=( mds_load_t& l, mds_load_t& r ) -{ - l.root_pop += r.root_pop; - l.req_rate += r.req_rate; - l.queue_len += r.queue_len; - return l; -} - -inline mds_load_t operator/( mds_load_t& a, double d ) -{ - mds_load_t r; - r.root_pop = a.root_pop / d; - r.req_rate = a.req_rate / d; - r.queue_len = a.queue_len / d; - return r; -} -*/ - - -// ================================================================ -// dir slices - -struct dirslice_t { - short hash_mask; - short hash_val; -}; - - - -// ================================================================ - -#define MDS_PIN_REPLICATED 1 - -class MDSCacheObject { - protected: - unsigned state; // state bits - - int ref; // reference count - set ref_set; - - map replicas; // [auth] mds -> nonce - int replica_nonce; // [replica] defined on replica - - public: - MDSCacheObject() : - state(0), - ref(0), - replica_nonce(0) {} - virtual ~MDSCacheObject() {} - - // -------------------------------------------- - // state - unsigned get_state() { return state; } - void state_clear(unsigned mask) { state &= ~mask; } - void state_set(unsigned mask) { state |= mask; } - 
unsigned state_test(unsigned mask) { return state & mask; } - void state_reset(unsigned s) { state = s; } - - // -------------------------------------------- - // pins - int get_num_ref() { return ref; } - bool is_pinned_by(int by) { return ref_set.count(by); } - set& get_ref_set() { return ref_set; } - - virtual void last_put() {} - virtual void bad_put(int by) { - assert(ref_set.count(by) == 1); - assert(ref > 0); - } - void put(int by) { - if (ref == 0 || ref_set.count(by) != 1) { - bad_put(by); - } else { - ref--; - ref_set.erase(by); - assert(ref == (int)ref_set.size()); - if (ref == 0) - last_put(); - } - } - - virtual void first_get() {} - virtual void bad_get(int by) { - assert(ref_set.count(by) == 0); - assert(0); - } - void get(int by) { - if (ref_set.count(by)) { - bad_get(by); - } else { - if (ref == 0) - first_get(); - ref++; - ref_set.insert(by); - assert(ref == (int)ref_set.size()); - } - } - - - - // -------------------------------------------- - // replication - bool is_replicated() { return !replicas.empty(); } - bool is_replica(int mds) { return replicas.count(mds); } - int num_replicas() { return replicas.size(); } - int add_replica(int mds) { - if (replicas.count(mds)) - return ++replicas[mds]; // inc nonce - if (replicas.empty()) - get(MDS_PIN_REPLICATED); - return replicas[mds] = 1; - } - void add_replica(int mds, int nonce) { - if (replicas.empty()) - get(MDS_PIN_REPLICATED); - replicas[mds] = nonce; - } - int get_replica_nonce(int mds) { - assert(replicas.count(mds)); - return replicas[mds]; - } - void remove_replica(int mds) { - assert(replicas.count(mds)); - replicas.erase(mds); - if (replicas.empty()) - put(MDS_PIN_REPLICATED); - } - void clear_replicas() { - if (!replicas.empty()) - put(MDS_PIN_REPLICATED); - replicas.clear(); - } - map::iterator replicas_begin() { return replicas.begin(); } - map::iterator replicas_end() { return replicas.end(); } - const map& get_replicas() { return replicas; } - - int get_replica_nonce() { return 
replica_nonce;} - void set_replica_nonce(int n) { replica_nonce = n; } -}; - - -#endif diff --git a/branches/marnberg/quota/messages/MAnchorReply.h b/branches/marnberg/quota/messages/MAnchorReply.h deleted file mode 100644 index 0186118f53260..0000000000000 --- a/branches/marnberg/quota/messages/MAnchorReply.h +++ /dev/null @@ -1,74 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MANCHORREPLY_H -#define __MANCHORREPLY_H - -#include - -#include "msg/Message.h" -#include "mds/AnchorTable.h" - -#include "MAnchorRequest.h" - - -class MAnchorReply : public Message { - int op; - inodeno_t ino; - vector trace; - - public: - MAnchorReply() {} - MAnchorReply(MAnchorRequest *req) : Message(MSG_MDS_ANCHORREPLY) { - this->op = req->get_op(); - this->ino = req->get_ino(); - } - ~MAnchorReply() { - for (unsigned i=0; i& trace) { this->trace = trace; } - - int get_op() { return op; } - inodeno_t get_ino() { return ino; } - vector& get_trace() { return trace; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(op), (char*)&op); - off += sizeof(op); - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - int n; - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - trace.push_back(a); - } - } - - virtual void encode_payload() { - payload.append((char*)&op, sizeof(op)); - payload.append((char*)&ino, sizeof(ino)); - int n = trace.size(); - payload.append((char*)&n, sizeof(int)); - for (int i=0; i_encode(payload); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MAnchorRequest.h 
b/branches/marnberg/quota/messages/MAnchorRequest.h deleted file mode 100644 index 2a2d0088978b4..0000000000000 --- a/branches/marnberg/quota/messages/MAnchorRequest.h +++ /dev/null @@ -1,76 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MANCHORREQUEST_H -#define __MANCHORREQUEST_H - -#include - -#include "msg/Message.h" -#include "mds/AnchorTable.h" - -#define ANCHOR_OP_CREATE 1 -#define ANCHOR_OP_DESTROY 2 -#define ANCHOR_OP_LOOKUP 3 -#define ANCHOR_OP_UPDATE 4 - -class MAnchorRequest : public Message { - int op; - inodeno_t ino; - vector trace; - - public: - MAnchorRequest() {} - MAnchorRequest(int op, inodeno_t ino) : Message(MSG_MDS_ANCHORREQUEST) { - this->op = op; - this->ino = ino; - } - ~MAnchorRequest() { - for (unsigned i=0; i& trace) { this->trace = trace; } - - int get_op() { return op; } - inodeno_t get_ino() { return ino; } - vector& get_trace() { return trace; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(op), (char*)&op); - off += sizeof(op); - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - int n; - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - trace.push_back(a); - } - } - - virtual void encode_payload() { - payload.append((char*)&op, sizeof(op)); - payload.append((char*)&ino, sizeof(ino)); - int n = trace.size(); - payload.append((char*)&n, sizeof(int)); - for (int i=0; i_encode(payload); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MCacheExpire.h b/branches/marnberg/quota/messages/MCacheExpire.h deleted file mode 100644 index 461d283c23072..0000000000000 
--- a/branches/marnberg/quota/messages/MCacheExpire.h +++ /dev/null @@ -1,86 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MCACHEEXPIRE_H -#define __MCACHEEXPIRE_H - -class MCacheExpire : public Message { - int from; - map inodes; - map dirs; - map > dentries; - - public: - int get_from() { return from; } - map& get_inodes() { return inodes; } - map& get_dirs() { return dirs; } - map >& get_dentries() { return dentries; } - - MCacheExpire() {} - MCacheExpire(int f) : - Message(MSG_MDS_CACHEEXPIRE), - from(f) { } - - virtual char *get_type_name() { return "CEx";} - - void add_inode(inodeno_t ino, int nonce) { - inodes[ino] = nonce; - } - void add_dir(inodeno_t ino, int nonce) { - dirs[ino] = nonce; - } - void add_dentry(inodeno_t dirino, const string& dn, int nonce) { - dentries[dirino][dn] = nonce; - } - void add_dentries(inodeno_t dirino, map& dmap) { - dentries[dirino] = dmap; - } - - void decode_payload() { - int off = 0; - - payload.copy(off, sizeof(from), (char*)&from); - off += sizeof(from); - - ::_decode(inodes, payload, off); - ::_decode(dirs, payload, off); - - int n; - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i >::iterator p = dentries.begin(); - p != dentries.end(); - ++p) { - payload.append((char*)&p->first, sizeof(p->first)); - ::_encode(p->second, payload); - } - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MClientBoot.h b/branches/marnberg/quota/messages/MClientBoot.h deleted file mode 100644 index 460f9f02e27f4..0000000000000 --- a/branches/marnberg/quota/messages/MClientBoot.h +++ /dev/null @@ -1,31 +0,0 @@ -// -*- mode:C++; 
tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MCLIENTBOOT_H -#define __MCLIENTBOOT_H - -#include "msg/Message.h" - -class MClientBoot : public Message { - - public: - MClientBoot() : Message(MSG_CLIENT_BOOT) { } - - char *get_type_name() { return "ClientBoot"; } - - void encode_payload() { } - void decode_payload() { } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MClientFileCaps.h b/branches/marnberg/quota/messages/MClientFileCaps.h deleted file mode 100644 index 7fde047b02655..0000000000000 --- a/branches/marnberg/quota/messages/MClientFileCaps.h +++ /dev/null @@ -1,102 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MCLIENTFILECAPS_H -#define __MCLIENTFILECAPS_H - -#define CLIENT_FILECAP_RELEASE 1 // mds closed the cap -#define CLIENT_FILECAP_STALE 2 // mds has exported the cap -#define CLIENT_FILECAP_REAP 3 // mds has imported the cap from get_mds() - -class MClientFileCaps : public Message { - public: - static const int FILECAP_RELEASE = 1; - static const int FILECAP_STALE = 2; - static const int FILECAP_REAP = 3; - - - private: - inode_t inode; - int caps; - long seq; - int wanted; - //int client; - - int special; // stale || reap; in conjunction w/ mds value - int mds; - - public: - inodeno_t get_ino() { return inode.ino; } - inode_t& get_inode() { return inode; } - int get_caps() { return caps; } - int get_wanted() { return wanted; } - long get_seq() { return seq; } - //int get_client() { return client; } - - // for cap migration - int get_mds() { return mds; } - int get_special() { return special; } - - //void set_client(int c) { client = c; } - void set_caps(int c) { caps = c; } - void set_wanted(int w) { wanted = w; } - - void set_mds(int m) { mds = m; } - void set_special(int s) { special = s; } - - MClientFileCaps() {} - MClientFileCaps(inode_t& inode, - long seq, - int caps, - int wanted, - int special=0, - int mds=0) : - Message(MSG_CLIENT_FILECAPS) { - this->inode = inode; - this->seq = seq; - this->caps = caps; - this->wanted = wanted; - this->special = special; - this->mds = mds; - } - virtual char *get_type_name() { return "Cfcap";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(seq), (char*)&seq); - off += sizeof(seq); - s.copy(off, sizeof(inode), (char*)&inode); - off += sizeof(inode); - s.copy(off, sizeof(caps), (char*)&caps); - off += sizeof(caps); - s.copy(off, sizeof(wanted), (char*)&wanted); - off += sizeof(wanted); - //s.copy(off, sizeof(client), (char*)&client); - //off += sizeof(client); - s.copy(off, sizeof(mds), (char*)&mds); - off += sizeof(mds); - s.copy(off, sizeof(special), (char*)&special); - 
off += sizeof(special); - } - virtual void encode_payload(crope& s) { - s.append((char*)&seq, sizeof(seq)); - s.append((char*)&inode, sizeof(inode)); - s.append((char*)&caps, sizeof(caps)); - s.append((char*)&wanted, sizeof(wanted)); - //s.append((char*)&client, sizeof(client)); - s.append((char*)&mds,sizeof(mds)); - s.append((char*)&special,sizeof(special)); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MClientInodeAuthUpdate.h b/branches/marnberg/quota/messages/MClientInodeAuthUpdate.h deleted file mode 100644 index e9083f6abc575..0000000000000 --- a/branches/marnberg/quota/messages/MClientInodeAuthUpdate.h +++ /dev/null @@ -1,46 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MCLIENTINODEAUTHUPDATE_H -#define __MCLIENTINODEAUTHUPDATE_H - -class MClientInodeAuthUpdate : public Message { - inodeno_t ino; - int newauth; - - public: - inodeno_t get_ino() { return ino; } - int get_auth() { return newauth; } - - MClientInodeAuthUpdate() {} - MClientInodeAuthUpdate(inodeno_t ino, int newauth) : - Message(MSG_CLIENT_INODEAUTHUPDATE) { - this->ino = ino; - this->newauth = newauth; - } - virtual char *get_type_name() { return "Ciau";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - s.copy(off, sizeof(newauth), (char*)&newauth); - off += sizeof(newauth); - } - virtual void encode_payload(crope& s) { - s.append((char*)&ino,sizeof(ino)); - s.append((char*)&newauth,sizeof(newauth)); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MClientMount.h b/branches/marnberg/quota/messages/MClientMount.h deleted file mode 100644 index 0684cea8d95c2..0000000000000 --- a/branches/marnberg/quota/messages/MClientMount.h +++ /dev/null @@ -1,34 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MCLIENTMOUNT_H -#define __MCLIENTMOUNT_H - -#include "msg/Message.h" - -class MClientMount : public Message { - - public: - MClientMount() : Message(MSG_CLIENT_MOUNT) { - } - - char *get_type_name() { return "Cmnt"; } - - virtual void decode_payload(crope& s, int& off) { - } - virtual void encode_payload(crope& s) { - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MClientMountAck.h b/branches/marnberg/quota/messages/MClientMountAck.h deleted file mode 100644 index 6b1b7cb2a901b..0000000000000 --- a/branches/marnberg/quota/messages/MClientMountAck.h +++ /dev/null @@ -1,59 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MCLIENTMOUNTACK_H -#define __MCLIENTMOUNTACK_H - -#include "msg/Message.h" -#include "MClientMount.h" -#include "mds/MDSMap.h" -#include "osd/OSDMap.h" - - -class MClientMountAck : public Message { - long pcid; - bufferlist osd_map_state; - bufferlist mds_map_state; - - public: - MClientMountAck() {} - MClientMountAck(MClientMount *mnt, MDSMap *mdsmap, OSDMap *osdmap) : Message(MSG_CLIENT_MOUNTACK) { - this->pcid = mnt->get_pcid(); - mdsmap->encode( mds_map_state ); - osdmap->encode( osd_map_state ); - } - - bufferlist& get_mds_map_state() { return mds_map_state; } - bufferlist& get_osd_map_state() { return osd_map_state; } - - void set_pcid(long pcid) { this->pcid = pcid; } - long get_pcid() { return pcid; } - - char *get_type_name() { return "CmntA"; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(pcid), (char*)&pcid); - off += sizeof(pcid); - ::_decode( mds_map_state, payload, off); - ::_decode( osd_map_state, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&pcid, sizeof(pcid)); - ::_encode( mds_map_state, payload ); - ::_encode( osd_map_state, payload ); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MClientReply.h b/branches/marnberg/quota/messages/MClientReply.h deleted file mode 100644 index 6206b909b0c05..0000000000000 --- a/branches/marnberg/quota/messages/MClientReply.h +++ /dev/null @@ -1,302 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MCLIENTREPLY_H -#define __MCLIENTREPLY_H - -#include "include/types.h" - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "mds/CDir.h" -#include "mds/CDentry.h" - -#include -using namespace std; - -class CInode; - -/*** - * - * MClientReply - container message for MDS reply to a client's MClientRequest - * - * key fields: - * long tid - transaction id, so the client can match up with pending request - * int result - error code, or fh if it was open - * - * for most requests: - * trace is a vector of c_inoe_info's tracing from root to the file/dir/whatever - * the operation referred to, so that the client can update it's info about what - * metadata lives on what MDS. - * - * for readdir replies: - * dir_contents is a vector c_inode_info*'s. - * - * that's mostly it, i think! - * - */ - -class InodeStat { - - public: - inode_t inode; - string symlink; // symlink content (if symlink) - - - // mds distribution hints - int dir_auth; - bool hashed, replicated; - bool spec_defined; - set dist; // where am i replicated? - - public: - InodeStat() {} - InodeStat(CInode *in, int whoami) : - inode(in->inode) - { - // inode.mask - inode.mask = INODE_MASK_BASE; - if (in->filelock.can_read(in->is_auth())) - inode.mask |= INODE_MASK_PERM; - if (in->hardlock.can_read(in->is_auth())) - inode.mask |= INODE_MASK_SIZE | INODE_MASK_MTIME; // fixme when we separate this out. - - // symlink content? - if (in->is_symlink()) - symlink = in->symlink; - - // replicated where? - if (in->dir && in->dir->is_auth()) { - spec_defined = true; - in->dir->get_dist_spec(this->dist, whoami); - } else - spec_defined = false; - - if (in->dir) - dir_auth = in->dir->get_dir_auth(); - else - dir_auth = -1; - - // dir info - hashed = (in->dir && in->dir->is_hashed()); // FIXME not quite right. 
- replicated = (in->dir && in->dir->is_rep()); - } - - void _encode(bufferlist &bl) { - bl.append((char*)&inode, sizeof(inode)); - bl.append((char*)&spec_defined, sizeof(spec_defined)); - bl.append((char*)&dir_auth, sizeof(dir_auth)); - bl.append((char*)&hashed, sizeof(hashed)); - bl.append((char*)&replicated, sizeof(replicated)); - - ::_encode(symlink, bl); - ::_encode(dist, bl); // distn - } - - void _decode(bufferlist &bl, int& off) { - bl.copy(off, sizeof(inode), (char*)&inode); - off += sizeof(inode); - bl.copy(off, sizeof(spec_defined), (char*)&spec_defined); - off += sizeof(spec_defined); - bl.copy(off, sizeof(dir_auth), (char*)&dir_auth); - off += sizeof(dir_auth); - bl.copy(off, sizeof(hashed), (char*)&hashed); - off += sizeof(hashed); - bl.copy(off, sizeof(replicated), (char*)&replicated); - off += sizeof(replicated); - - ::_decode(symlink, bl, off); - ::_decode(dist, bl, off); - } -}; - - -typedef struct { - long pcid; - long tid; - int op; - int result; // error code - unsigned char file_caps; // for open - long file_caps_seq; - __uint64_t file_data_version; // for client buffercache consistency - - int _num_trace_in; - int _dir_size; -} MClientReply_st; - -class MClientReply : public Message { - // reply data - MClientReply_st st; - - string path; - list trace_in; - list trace_dn; - - list dir_in; - list dir_dn; - - public: - void set_pcid(long pcid) { this->st.pcid = pcid; } - long get_pcid() { return st.pcid; } - - long get_tid() { return st.tid; } - int get_op() { return st.op; } - - int get_result() { return st.result; } - const string& get_path() { return path; } - - inodeno_t get_ino() { return trace_in.back()->inode.ino; } - const inode_t& get_inode() { return trace_in.back()->inode; } - - const list& get_trace_in() { return trace_in; } - const list& get_trace_dn() { return trace_dn; } - - const list& get_dir_in() { return dir_in; } - const list& get_dir_dn() { return dir_dn; } - - unsigned char get_file_caps() { return st.file_caps; } - long 
get_file_caps_seq() { return st.file_caps_seq; } - __uint64_t get_file_data_version() { return st.file_data_version; } - - void set_result(int r) { st.result = r; } - void set_file_caps(unsigned char c) { st.file_caps = c; } - void set_file_caps_seq(long s) { st.file_caps_seq = s; } - void set_file_data_version(__uint64_t v) { st.file_data_version = v; } - - MClientReply() {}; - MClientReply(MClientRequest *req, int result = 0) : - Message(MSG_CLIENT_REPLY) { - memset(&st, 0, sizeof(st)); - this->st.pcid = req->get_pcid(); // match up procedure call id!!! - this->st.tid = req->get_tid(); - this->st.op = req->get_op(); - this->path = req->get_path(); - - this->st.result = result; - - st._dir_size = 0; - st._num_trace_in = 0; - } - virtual ~MClientReply() { - list::iterator it; - - for (it = trace_in.begin(); it != trace_in.end(); ++it) - delete *it; - for (it = dir_in.begin(); it != dir_in.end(); ++it) - delete *it; - } - virtual char *get_type_name() { return "creply"; } - - - // serialization - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - - _decode(path, payload, off); - - for (int i=0; i_decode(payload, off); - trace_in.push_back(ci); - } - - for (int i=0; i_decode(payload, off); - dir_in.push_back(ci); - string dn; - ::_decode(dn, payload, off); - dir_dn.push_back(dn); - } - } - virtual void encode_payload() { - payload.append((char*)&st, sizeof(st)); - _encode(path, payload); - - // trace - list::iterator pdn = trace_dn.begin(); - list::iterator pin; - for (pin = trace_in.begin(); - pin != trace_in.end(); - ++pin) { - if (pin != trace_in.begin()) { - ::_encode(*pdn, payload); - ++pdn; - } - (*pin)->_encode(payload); - } - - // dir contents - pdn = dir_dn.begin(); - for (pin = dir_in.begin(); - pin != dir_in.end(); - ++pin, ++pdn) { - (*pin)->_encode(payload); - ::_encode(*pdn, payload); - } - } - - // builders - /* - void add_dir_item(string& dn, InodeStat *in) { - dir_dn.push_back(dn); - 
dir_in.push_back(in); - ++st._dir_size; - }*/ - void take_dir_items(list& inls, - list& dnls, - int num) { - dir_in.swap(inls); - dir_dn.swap(dnls); - st._dir_size = num; - } - void copy_dir_items(const list& inls, - const list& dnls) { - list::const_iterator pdn = dnls.begin(); - list::const_iterator pin = inls.begin(); - while (pin != inls.end()) { - // copy! - InodeStat *i = new InodeStat; - *i = **pin; - dir_in.push_back(i); - dir_dn.push_back(*pdn); - ++pin; - ++pdn; - ++st._dir_size; - } - } - - void set_trace_dist(CInode *in, int whoami) { - st._num_trace_in = 0; - while (in) { - // add this inode to trace, along with referring dentry name - if (in->get_parent_dn()) - trace_dn.push_front(in->get_parent_dn()->get_name()); - trace_in.push_front(new InodeStat(in, whoami)); - ++st._num_trace_in; - - in = in->get_parent_inode(); - } - } - -}; - -#endif diff --git a/branches/marnberg/quota/messages/MClientRequest.h b/branches/marnberg/quota/messages/MClientRequest.h deleted file mode 100644 index 9b9ac4e115cac..0000000000000 --- a/branches/marnberg/quota/messages/MClientRequest.h +++ /dev/null @@ -1,202 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MCLIENTREQUEST_H -#define __MCLIENTREQUEST_H - -#include - -#include "msg/Message.h" -#include "include/filepath.h" -#include "mds/mdstypes.h" -#include "mds/MDS.h" - -/** - * - * MClientRequest - container for a client METADATA request. created/sent by clients. - * can be forwarded around between MDS's. - * - * int client - the originating client - * long pcid - procedure call id, used to match request+response. 
- * long tid - transaction id, unique among requests for that client. probably just a counter! - * -> the MDS passes the Request to the Reply constructor, so this always matches. - * - * int op - the metadata op code. MDS_OP_RENAME, etc. - * int caller_uid, _gid - guess - * - * arguments: one or more of these are defined, depending on the metadata op: - * inodeno ino - used by close(), along with fh. not strictly necessary except MDS is currently coded lame. - * filepath path - main file argument (almost everything) - * string sarg - string argument (if a second arg is needed, e.g. rename, symlink) - * int iarg - int arg... file mode for open, fh for close, mode for mkdir, etc. - * int iarg2 - second int arg... gid for chown (iarg is uid) - * time_t targ, targ2 - time args, used by utime - * - * That's basically it! - * - */ - - -typedef struct { - long tid; - int client; - int op; - - entity_inst_t client_inst; - - int caller_uid, caller_gid; - inodeno_t ino; - - int iarg, iarg2; - time_t targ, targ2; - - inodeno_t mds_wants_replica_in_dirino; - - size_t sizearg; -} MClientRequest_st; - - -class MClientRequest : public Message { - MClientRequest_st st; - filepath path; - string sarg; - string sarg2; - - - public: - MClientRequest() {} - MClientRequest(int op, int client) : Message(MSG_CLIENT_REQUEST) { - memset(&st, 0, sizeof(st)); - this->st.op = op; - this->st.client = client; - this->st.iarg = 0; - } - virtual char *get_type_name() { return "creq"; } - - // keep a pcid (procedure call id) to match up request+reply - //void set_pcid(long pcid) { this->st.pcid = pcid; } - //long get_pcid() { return st.pcid; } - - // normal fields - void set_tid(long t) { st.tid = t; } - void set_path(string& p) { path.set_path(p); } - void set_path(const char *p) { path.set_path(p); } - void set_path(const filepath& fp) { path = fp; } - void set_caller_uid(int u) { st.caller_uid = u; } - void set_caller_gid(int g) { st.caller_gid = g; } - void set_ino(inodeno_t ino) { st.ino = 
ino; } - void set_iarg(int i) { st.iarg = i; } - void set_iarg2(int i) { st.iarg2 = i; } - void set_targ(time_t& t) { st.targ = t; } - void set_targ2(time_t& t) { st.targ2 = t; } - void set_sarg(string& arg) { this->sarg = arg; } - void set_sarg(const char *arg) { this->sarg = arg; } - void set_sarg2(string& arg) { this->sarg2 = arg; } - void set_sizearg(size_t s) { st.sizearg = s; } - void set_mds_wants_replica_in_dirino(inodeno_t dirino) { - st.mds_wants_replica_in_dirino = dirino; } - - void set_client_inst(const entity_inst_t& i) { st.client_inst = i; } - const entity_inst_t& get_client_inst() { return st.client_inst; } - - int get_client() { return st.client; } - long get_tid() { return st.tid; } - int get_op() { return st.op; } - int get_caller_uid() { return st.caller_uid; } - int get_caller_gid() { return st.caller_gid; } - inodeno_t get_ino() { return st.ino; } - string& get_path() { return path.get_path(); } - filepath& get_filepath() { return path; } - int get_iarg() { return st.iarg; } - int get_iarg2() { return st.iarg2; } - time_t get_targ() { return st.targ; } - time_t get_targ2() { return st.targ2; } - string& get_sarg() { return sarg; } - string& get_sarg2() { return sarg2; } - size_t get_sizearg() { return st.sizearg; } - inodeno_t get_mds_wants_replica_in_dirino() { - return st.mds_wants_replica_in_dirino; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - path._decode(payload, off); - _decode(sarg, payload, off); - _decode(sarg2, payload, off); - } - - virtual void encode_payload() { - payload.append((char*)&st, sizeof(st)); - path._encode(payload); - _encode(sarg, payload); - _encode(sarg2, payload); - } - - void print(ostream& out) { - out << "clientreq(client" << get_client() - << "." 
<< get_tid() - //<< ".pcid=" << get_pcid() - << ":"; - switch(get_op()) { - case MDS_OP_STAT: - out << "stat"; break; - case MDS_OP_LSTAT: - out << "lstat"; break; - case MDS_OP_UTIME: - out << "utime"; break; - case MDS_OP_CHMOD: - out << "chmod"; break; - case MDS_OP_CHOWN: - out << "chown"; break; - - case MDS_OP_READDIR: - out << "readdir"; break; - case MDS_OP_MKNOD: - out << "mknod"; break; - case MDS_OP_LINK: - out << "link"; break; - case MDS_OP_UNLINK: - out << "unlink"; break; - case MDS_OP_RENAME: - out << "rename"; break; - - case MDS_OP_MKDIR: - out << "mkdir"; break; - case MDS_OP_RMDIR: - out << "rmdir"; break; - case MDS_OP_SYMLINK: - out << "symlink"; break; - - case MDS_OP_OPEN: - out << "open"; break; - case MDS_OP_TRUNCATE: - out << "truncate"; break; - case MDS_OP_FSYNC: - out << "fsync"; break; - case MDS_OP_RELEASE: - out << "release"; break; - default: - out << "unknown=" << get_op(); - } - if (get_path().length()) - out << "=" << get_path(); - if (get_sarg().length()) - out << " " << get_sarg(); - out << ")"; - } - -}; - -#endif diff --git a/branches/marnberg/quota/messages/MDentryUnlink.h b/branches/marnberg/quota/messages/MDentryUnlink.h deleted file mode 100644 index ec1503eeadf00..0000000000000 --- a/branches/marnberg/quota/messages/MDentryUnlink.h +++ /dev/null @@ -1,45 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MDENTRYUNLINK_H -#define __MDENTRYUNLINK_H - -class MDentryUnlink : public Message { - inodeno_t dirino; - string dn; - - public: - inodeno_t get_dirino() { return dirino; } - string& get_dn() { return dn; } - - MDentryUnlink() {} - MDentryUnlink(inodeno_t dirino, string& dn) : - Message(MSG_MDS_DENTRYUNLINK) { - this->dirino = dirino; - this->dn = dn; - } - virtual char *get_type_name() { return "Dun";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(dirino), (char*)&dirino); - off += sizeof(dirino); - _unrope(dn, s, off); - } - virtual void encode_payload(crope& s) { - s.append((char*)&dirino,sizeof(dirino)); - _rope(dn, s); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MDirExpire.h b/branches/marnberg/quota/messages/MDirExpire.h deleted file mode 100644 index a81de3d538365..0000000000000 --- a/branches/marnberg/quota/messages/MDirExpire.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MDIREXPIRE_H -#define __MDIREXPIRE_H - -typedef struct { - inodeno_t ino; - int nonce; - int from; -} MDirExpire_st; - -class MDirExpire : public Message { - MDirExpire_st st; - - public: - inodeno_t get_ino() { return st.ino; } - int get_from() { return st.from; } - int get_nonce() { return st.nonce; } - - MDirExpire() {} - MDirExpire(inodeno_t ino, int from, int nonce) : - Message(MSG_MDS_DIREXPIRE) { - st.ino = ino; - st.from = from; - st.nonce = nonce; - } - virtual char *get_type_name() { return "DirEx";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - } - virtual void encode_payload(crope& s) { - s.append((char*)&st,sizeof(st)); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MDirExpireReq.h b/branches/marnberg/quota/messages/MDirExpireReq.h deleted file mode 100644 index 604a55265c723..0000000000000 --- a/branches/marnberg/quota/messages/MDirExpireReq.h +++ /dev/null @@ -1,49 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MDIREXPIREREQ_H -#define __MDIREXPIREREQ_H - -typedef struct { - inodeno_t ino; - int nonce; - int from; -} MDirExpireReq_st; - -class MDirExpire : public Message { - MDirExpireReq_st st; - - public: - inodeno_t get_ino() { return st.ino; } - int get_from() { return st.from; } - int get_nonce() { return st.nonce; } - - MDirExpire() {} - MDirExpire(inodeno_t ino, int from, int nonce) : - Message(MSG_MDS_DIREXPIREREQ) { - st.ino = ino; - st.from = from; - st.nonce = nonce; - } - virtual char *get_type_name() { return "DirExR";} - - virtual void decode_payload(crope& s) { - s.copy(0, sizeof(st), (char*)&st); - } - virtual void encode_payload(crope& s) { - s.append((char*)&st,sizeof(st)); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MDirUpdate.h b/branches/marnberg/quota/messages/MDirUpdate.h deleted file mode 100644 index 9bac721654c22..0000000000000 --- a/branches/marnberg/quota/messages/MDirUpdate.h +++ /dev/null @@ -1,71 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MDIRUPDATE_H -#define __MDIRUPDATE_H - -#include "msg/Message.h" - -typedef struct { - inodeno_t ino; - int dir_rep; - int discover; -} MDirUpdate_st; - -class MDirUpdate : public Message { - MDirUpdate_st st; - set dir_rep_by; - string path; - - public: - inodeno_t get_ino() { return st.ino; } - int get_dir_rep() { return st.dir_rep; } - set& get_dir_rep_by() { return dir_rep_by; } - bool should_discover() { return st.discover > 0; } - string& get_path() { return path; } - - void tried_discover() { - if (st.discover) st.discover--; - } - - MDirUpdate() {} - MDirUpdate(inodeno_t ino, - int dir_rep, - set& dir_rep_by, - string& path, - bool discover = false) : - Message(MSG_MDS_DIRUPDATE) { - this->st.ino = ino; - this->st.dir_rep = dir_rep; - this->dir_rep_by = dir_rep_by; - if (discover) this->st.discover = 5; - this->path = path; - } - virtual char *get_type_name() { return "dup"; } - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - _unrope(dir_rep_by, s, off); - _unrope(path, s, off); - } - - virtual void encode_payload(crope& r) { - r.append((char*)&st, sizeof(st)); - _rope(dir_rep_by, r); - _rope(path, r); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MDiscover.h b/branches/marnberg/quota/messages/MDiscover.h deleted file mode 100644 index d207ab28cc143..0000000000000 --- a/branches/marnberg/quota/messages/MDiscover.h +++ /dev/null @@ -1,75 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MDISCOVER_H -#define __MDISCOVER_H - -#include "msg/Message.h" -#include "mds/CDir.h" -#include "include/filepath.h" - -#include -#include -using namespace std; - - -class MDiscover : public Message { - int asker; - inodeno_t base_ino; // 0 -> none, want root - bool want_base_dir; - bool want_root_inode; - - filepath want; // ... [/]need/this/stuff - - public: - int get_asker() { return asker; } - inodeno_t get_base_ino() { return base_ino; } - filepath& get_want() { return want; } - const string& get_dentry(int n) { return want[n]; } - bool wants_base_dir() { return want_base_dir; } - - MDiscover() { } - MDiscover(int asker, - inodeno_t base_ino, - filepath& want, - bool want_base_dir = true, - bool want_root_inode = false) : - Message(MSG_MDS_DISCOVER) { - this->asker = asker; - this->base_ino = base_ino; - this->want = want; - this->want_base_dir = want_base_dir; - this->want_root_inode = want_root_inode; - } - virtual char *get_type_name() { return "Dis"; } - - virtual void decode_payload(crope& r, int& off) { - r.copy(off, sizeof(asker), (char*)&asker); - off += sizeof(asker); - r.copy(off, sizeof(base_ino), (char*)&base_ino); - off += sizeof(base_ino); - r.copy(off, sizeof(bool), (char*)&want_base_dir); - off += sizeof(bool); - want._unrope(r, off); - } - virtual void encode_payload(crope& r) { - r.append((char*)&asker, sizeof(asker)); - r.append((char*)&base_ino, sizeof(base_ino)); - r.append((char*)&want_base_dir, sizeof(want_base_dir)); - want._rope(r); - } - -}; - -#endif diff --git a/branches/marnberg/quota/messages/MDiscoverReply.h b/branches/marnberg/quota/messages/MDiscoverReply.h deleted file mode 100644 index c759bc9a76bd1..0000000000000 --- a/branches/marnberg/quota/messages/MDiscoverReply.h +++ /dev/null @@ -1,254 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can 
redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MDISCOVERREPLY_H -#define __MDISCOVERREPLY_H - -#include "msg/Message.h" -#include "mds/CDir.h" -#include "mds/CInode.h" -#include "include/filepath.h" - -#include -#include -using namespace std; - -#define max(a,b) ((a)>(b) ? (a):(b)) - - -/** - * MDiscoverReply - return new replicas (of inodes, dirs, dentries) - * - * we group returned items by (dir, dentry, inode). each - * item in each set shares an index (it's "depth"). - * - * we can start and end with any type. - * no_base_dir = true if the first group has an inode but no dir - * no_base_dentry = true if the first group has an inode but no dentry - * they are false if there is no returned data, ie the first group is empty. - * - * we also return errors: - * error_flag_dn(string) - the specified dentry dne - * error_flag_dir - the last item wasn't a dir, so we couldn't continue. - * - * depth() gives us the number of depth units/indices for which we have - * information. this INCLUDES those for which we have errors but no data. - * - * see MDCache::handle_discover, handle_discover_reply. - * - - old crap, maybe not accurate: - - // dir [ + ... ] : discover want_base_dir=true - - // dentry [ + inode [ + ... ] ] : discover want_base_dir=false - // no_base_dir=true - // -> we only exclude inode if dentry is null+xlock - - // inode [ + ... ], base_ino = 0 : discover base_ino=0, start w/ root ino, - // no_base_dir=no_base_dentry=true - - * - */ - -class MDiscoverReply : public Message { - inodeno_t base_ino; - bool no_base_dir; // no base dir (but IS dentry+inode) - bool no_base_dentry; // no base dentry (but IS inode) - bool flag_error_dn; - bool flag_error_dir; - string error_dentry; // dentry that was not found (to trigger waiters on asker) - - - vector dirs; // not inode-aligned if no_base_dir = true. 
- vector dentries; // not inode-aligned if no_base_dentry = true - vector inodes; - - string path; - - public: - // accessors - inodeno_t get_base_ino() { return base_ino; } - int get_num_inodes() { return inodes.size(); } - int get_num_dentries() { return dentries.size(); } - int get_num_dirs() { return dirs.size(); } - - int get_depth() { // return depth of deepest object (in dir/dentry/inode units) - return max( inodes.size(), // at least this many - max( no_base_dentry + dentries.size() + flag_error_dn, // inode start + path + possible error - dirs.size() + no_base_dir )); // dn/inode + dirs - } - - bool has_base_dir() { return !no_base_dir && dirs.size(); } - bool has_base_dentry() { return !no_base_dentry && dentries.size(); } - bool has_root() { - if (base_ino == 0) { - assert(no_base_dir && no_base_dentry); - return true; - } - return false; - } - - const string& get_path() { return path; } - - // bool is_flag_forward() { return flag_forward; } - bool is_flag_error_dn() { return flag_error_dn; } - bool is_flag_error_dir() { return flag_error_dir; } - string& get_error_dentry() { return error_dentry; } - - // these index _arguments_ are aligned to each ([[dir, ] dentry, ] inode) set. 
- CDirDiscover& get_dir(int n) { return *(dirs[n - no_base_dir]); } - CDentryDiscover& get_dentry(int n) { return *(dentries[n - no_base_dentry]); } - CInodeDiscover& get_inode(int n) { return *(inodes[n]); } - inodeno_t get_ino(int n) { return inodes[n]->get_ino(); } - - // cons - MDiscoverReply() {} - MDiscoverReply(inodeno_t base_ino) : - Message(MSG_MDS_DISCOVERREPLY) { - this->base_ino = base_ino; - flag_error_dn = false; - flag_error_dir = false; - no_base_dir = no_base_dentry = false; - } - ~MDiscoverReply() { - for (vector::iterator it = dirs.begin(); - it != dirs.end(); - it++) - delete *it; - for (vector::iterator it = inodes.begin(); - it != inodes.end(); - it++) - delete *it; - } - virtual char *get_type_name() { return "DisR"; } - - // builders - bool is_empty() { - return dirs.empty() && dentries.empty() && inodes.empty() && - !flag_error_dn && - !flag_error_dir; - } - void add_dentry(CDentryDiscover* ddis) { - if (dentries.empty() && dirs.empty()) no_base_dir = true; - dentries.push_back(ddis); - if (path.length()) path += "/"; - path += ddis->get_dname(); - } - - void add_inode(CInodeDiscover* din) { - if (inodes.empty() && dentries.empty()) no_base_dir = no_base_dentry = true; - inodes.push_back( din ); - } - - void add_dir(CDirDiscover* dir) { - dirs.push_back( dir ); - } - - // void set_flag_forward() { flag_forward = true; } - void set_flag_error_dn(const string& dn) { - flag_error_dn = true; - error_dentry = dn; - } - void set_flag_error_dir() { - flag_error_dir = true; - } - - - // ... 
- virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(base_ino), (char*)&base_ino); - off += sizeof(base_ino); - payload.copy(off, sizeof(bool), (char*)&no_base_dir); - off += sizeof(bool); - payload.copy(off, sizeof(bool), (char*)&no_base_dentry); - off += sizeof(bool); - // payload.copy(off, sizeof(bool), (char*)&flag_forward); - //off += sizeof(bool); - payload.copy(off, sizeof(bool), (char*)&flag_error_dn); - off += sizeof(bool); - - _decode(error_dentry, payload, off); - payload.copy(off, sizeof(bool), (char*)&flag_error_dir); - off += sizeof(bool); - - // dirs - int n; - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - } - //dout(12) << n << " dirs out" << endl; - - // inodes - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - } - //dout(12) << n << " inodes out" << endl; - - // dentries - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - } - } - void encode_payload() { - payload.append((char*)&base_ino, sizeof(base_ino)); - payload.append((char*)&no_base_dir, sizeof(bool)); - payload.append((char*)&no_base_dentry, sizeof(bool)); - // payload.append((char*)&flag_forward, sizeof(bool)); - payload.append((char*)&flag_error_dn, sizeof(bool)); - - _encode(error_dentry, payload); - payload.append((char*)&flag_error_dir, sizeof(bool)); - - // dirs - int n = dirs.size(); - payload.append((char*)&n, sizeof(int)); - for (vector::iterator it = dirs.begin(); - it != dirs.end(); - it++) - (*it)->_encode( payload ); - //dout(12) << n << " dirs in" << endl; - - // inodes - n = inodes.size(); - payload.append((char*)&n, sizeof(int)); - for (vector::iterator it = inodes.begin(); - it != inodes.end(); - it++) - (*it)->_encode( payload ); - //dout(12) << n << " inodes in" << endl; - - // dentries - n = dentries.size(); - payload.append((char*)&n, sizeof(int)); - for (vector::iterator it 
= dentries.begin(); - it != dentries.end(); - it++) - (*it)->_encode( payload ); - //dout(12) << n << " dentries in" << endl; - } - -}; - -#endif diff --git a/branches/marnberg/quota/messages/MExportDir.h b/branches/marnberg/quota/messages/MExportDir.h deleted file mode 100644 index 8fdda89466b1e..0000000000000 --- a/branches/marnberg/quota/messages/MExportDir.h +++ /dev/null @@ -1,64 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MEXPORTDIR_H -#define __MEXPORTDIR_H - -#include "msg/Message.h" - - -class MExportDir : public Message { - inodeno_t ino; - - list dirstate; // a bl for reach dir - list exports; - - public: - MExportDir() {} - MExportDir(inodeno_t dirino) : - Message(MSG_MDS_EXPORTDIR), - ino(dirino) { - } - virtual char *get_type_name() { return "Ex"; } - - inodeno_t get_ino() { return ino; } - list& get_dirstate() { return dirstate; } - list& get_exports() { return exports; } - - void add_dir(bufferlist& dir) { - dirstate.push_back(dir); - } - void set_dirstate(const list& ls) { - dirstate = ls; - } - void add_export(inodeno_t dirino) { - exports.push_back(dirino); - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - ::_decode(exports, payload, off); - ::_decode(dirstate, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - ::_encode(exports, payload); - ::_encode(dirstate, payload); - } - -}; - -#endif diff --git a/branches/marnberg/quota/messages/MExportDirAck.h b/branches/marnberg/quota/messages/MExportDirAck.h deleted file mode 100644 index 35691bf94e2a7..0000000000000 --- 
a/branches/marnberg/quota/messages/MExportDirAck.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MEXPORTDIRACK_H -#define __MEXPORTDIRACK_H - -#include "MExportDir.h" - -class MExportDirAck : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MExportDirAck() {} - MExportDirAck(MExportDir *req) : - Message(MSG_MDS_EXPORTDIRACK) { - ino = req->get_ino(); - } - virtual char *get_type_name() { return "ExAck"; } - - virtual void decode_payload(crope& s) { - s.copy(0, sizeof(ino), (char*)&ino); - } - virtual void encode_payload(crope& s) { - s.append((char*)&ino, sizeof(ino)); - } - -}; - -#endif diff --git a/branches/marnberg/quota/messages/MExportDirDiscover.h b/branches/marnberg/quota/messages/MExportDirDiscover.h deleted file mode 100644 index 24f77036455f4..0000000000000 --- a/branches/marnberg/quota/messages/MExportDirDiscover.h +++ /dev/null @@ -1,51 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MEXPORTDIRDISCOVER_H -#define __MEXPORTDIRDISCOVER_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirDiscover : public Message { - inodeno_t ino; - string path; - - public: - inodeno_t get_ino() { return ino; } - string& get_path() { return path; } - - MExportDirDiscover() {} - MExportDirDiscover(CInode *in) : - Message(MSG_MDS_EXPORTDIRDISCOVER) { - in->make_path(path); - ino = in->ino(); - } - virtual char *get_type_name() { return "ExDis"; } - - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - _unrope(path, s, off); - } - - virtual void encode_payload(crope& s) { - s.append((char*)&ino, sizeof(ino)); - _rope(path, s); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MExportDirDiscoverAck.h b/branches/marnberg/quota/messages/MExportDirDiscoverAck.h deleted file mode 100644 index a25e3b46672e3..0000000000000 --- a/branches/marnberg/quota/messages/MExportDirDiscoverAck.h +++ /dev/null @@ -1,52 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MEXPORTDIRDISCOVERACK_H -#define __MEXPORTDIRDISCOVERACK_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirDiscoverAck : public Message { - inodeno_t ino; - bool success; - - public: - inodeno_t get_ino() { return ino; } - bool is_success() { return success; } - - MExportDirDiscoverAck() {} - MExportDirDiscoverAck(inodeno_t ino, bool success=true) : - Message(MSG_MDS_EXPORTDIRDISCOVERACK) { - this->ino = ino; - this->success = false; - } - virtual char *get_type_name() { return "ExDisA"; } - - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - s.copy(off, sizeof(success), (char*)&success); - off += sizeof(success); - } - - virtual void encode_payload(crope& s) { - s.append((char*)&ino, sizeof(ino)); - s.append((char*)&success, sizeof(success)); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MExportDirFinish.h b/branches/marnberg/quota/messages/MExportDirFinish.h deleted file mode 100644 index 89c9e5290c4b2..0000000000000 --- a/branches/marnberg/quota/messages/MExportDirFinish.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MEXPORTDIRFINISH_H -#define __MEXPORTDIRFINISH_H - -#include "MExportDir.h" - -class MExportDirFinish : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MExportDirFinish() {} - MExportDirFinish(inodeno_t ino) : - Message(MSG_MDS_EXPORTDIRFINISH) { - this->ino = ino; - } - virtual char *get_type_name() { return "ExFin"; } - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - } - virtual void encode_payload(crope& s) { - s.append((char*)&ino, sizeof(ino)); - } - -}; - -#endif diff --git a/branches/marnberg/quota/messages/MExportDirNotify.h b/branches/marnberg/quota/messages/MExportDirNotify.h deleted file mode 100644 index 9d6532cad478c..0000000000000 --- a/branches/marnberg/quota/messages/MExportDirNotify.h +++ /dev/null @@ -1,111 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MEXPORTDIRNOTIFY_H -#define __MEXPORTDIRNOTIFY_H - -#include "msg/Message.h" -#include -using namespace std; - -class MExportDirNotify : public Message { - int new_auth; - int old_auth; - inodeno_t ino; - - list exports; // bounds; these dirs are _not_ included (tho the inodes are) - list subdirs; - - public: - inodeno_t get_ino() { return ino; } - int get_new_auth() { return new_auth; } - int get_old_auth() { return old_auth; } - list& get_exports() { return exports; } - list::iterator subdirs_begin() { return subdirs.begin(); } - list::iterator subdirs_end() { return subdirs.end(); } - int num_subdirs() { return subdirs.size(); } - - MExportDirNotify() {} - MExportDirNotify(inodeno_t ino, int old_auth, int new_auth) : - Message(MSG_MDS_EXPORTDIRNOTIFY) { - this->ino = ino; - this->old_auth = old_auth; - this->new_auth = new_auth; - } - virtual char *get_type_name() { return "ExNot"; } - - void copy_subdirs(list& s) { - this->subdirs = s; - } - void copy_exports(list& ex) { - this->exports = ex; - } - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(int), (char*)&new_auth); - off += sizeof(int); - s.copy(off, sizeof(int), (char*)&old_auth); - off += sizeof(int); - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - - // notify - int n; - s.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i::iterator it = exports.begin(); - it != exports.end(); - it++) { - inodeno_t ino = *it; - s.append((char*)&ino, sizeof(ino)); - } - - // subdirs - n = subdirs.size(); - s.append((char*)&n, sizeof(int)); - for (list::iterator it = subdirs.begin(); - it != subdirs.end(); - it++) { - inodeno_t ino = *it; - s.append((char*)&ino, sizeof(ino)); - } - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MExportDirNotifyAck.h b/branches/marnberg/quota/messages/MExportDirNotifyAck.h deleted file mode 100644 index 3179fd4f544f1..0000000000000 --- 
a/branches/marnberg/quota/messages/MExportDirNotifyAck.h +++ /dev/null @@ -1,46 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MEXPORTDIRNOTIFYACK_H -#define __MEXPORTDIRNOTIFYACK_H - -#include "msg/Message.h" -#include -using namespace std; - -class MExportDirNotifyAck : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MExportDirNotifyAck() {} - MExportDirNotifyAck(inodeno_t ino) : - Message(MSG_MDS_EXPORTDIRNOTIFYACK) { - this->ino = ino; - } - virtual char *get_type_name() { return "ExNotA"; } - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - } - - virtual void encode_payload(crope& s) { - s.append((char*)&ino, sizeof(ino)); - } - -}; - -#endif diff --git a/branches/marnberg/quota/messages/MExportDirPrep.h b/branches/marnberg/quota/messages/MExportDirPrep.h deleted file mode 100644 index 6967d950afad9..0000000000000 --- a/branches/marnberg/quota/messages/MExportDirPrep.h +++ /dev/null @@ -1,186 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MEXPORTDIRPREP_H -#define __MEXPORTDIRPREP_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirPrep : public Message { - inodeno_t ino; - - /* nested export discover payload. - not all inodes will have dirs; they may require a separate discover. - dentries are the links to each inode. - dirs map includes base dir (ino) - */ - list exports; - - list inodes; - map inode_dirino; - map inode_dentry; - - map dirs; - - bool b_did_assim; - - public: - inodeno_t get_ino() { return ino; } - list& get_exports() { return exports; } - list& get_inodes() { return inodes; } - inodeno_t get_containing_dirino(inodeno_t ino) { - return inode_dirino[ino]; - } - string& get_dentry(inodeno_t ino) { - return inode_dentry[ino]; - } - bool have_dir(inodeno_t ino) { - return dirs.count(ino); - } - CDirDiscover* get_dir(inodeno_t ino) { - return dirs[ino]; - } - - bool did_assim() { return b_did_assim; } - void mark_assim() { b_did_assim = true; } - - MExportDirPrep() { - b_did_assim = false; - } - MExportDirPrep(CInode *in) : - Message(MSG_MDS_EXPORTDIRPREP) { - ino = in->ino(); - b_did_assim = false; - } - ~MExportDirPrep() { - for (list::iterator iit = inodes.begin(); - iit != inodes.end(); - iit++) - delete *iit; - for (map::iterator dit = dirs.begin(); - dit != dirs.end(); - dit++) - delete dit->second; - } - - - virtual char *get_type_name() { return "ExP"; } - - - - - void add_export(inodeno_t dirino) { - exports.push_back( dirino ); - } - void add_inode(inodeno_t dirino, const string& dentry, CInodeDiscover *in) { - inodes.push_back(in); - inode_dirino.insert(pair(in->get_ino(), dirino)); - inode_dentry.insert(pair(in->get_ino(), dentry)); - } - void add_dir(CDirDiscover *dir) { - dirs.insert(pair(dir->get_ino(), dir)); - } - - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - - // exports - int ne; - payload.copy(off, sizeof(int), 
(char*)&ne); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - inodes.push_back(in); - - // dentry - string d; - _decode(d, payload, off); - inode_dentry[in->get_ino()] = d; - - // dir ino - inodeno_t dino; - payload.copy(off, sizeof(dino), (char*)&dino); - off += sizeof(dino); - inode_dirino[in->get_ino()] = dino; - } - - // dirs - int nd; - payload.copy(off, sizeof(int), (char*)&nd); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - dirs[dir->get_ino()] = dir; - } - } - - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - - // exports - int ne = exports.size(); - payload.append((char*)&ne, sizeof(int)); - for (list::iterator it = exports.begin(); - it != exports.end(); - it++) { - inodeno_t ino = *it; - payload.append((char*)&ino, sizeof(ino)); - } - - // inodes - int ni = inodes.size(); - payload.append((char*)&ni, sizeof(int)); - for (list::iterator iit = inodes.begin(); - iit != inodes.end(); - iit++) { - (*iit)->_encode(payload); - - // dentry - _encode(inode_dentry[(*iit)->get_ino()], payload); - - // dir ino - inodeno_t ino = inode_dirino[(*iit)->get_ino()]; - payload.append((char*)&ino, sizeof(ino)); - } - - // dirs - int nd = dirs.size(); - payload.append((char*)&nd, sizeof(int)); - for (map::iterator dit = dirs.begin(); - dit != dirs.end(); - dit++) - dit->second->_encode(payload); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MExportDirPrepAck.h b/branches/marnberg/quota/messages/MExportDirPrepAck.h deleted file mode 100644 index c32d7255c5074..0000000000000 --- a/branches/marnberg/quota/messages/MExportDirPrepAck.h +++ /dev/null @@ -1,44 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free 
Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MEXPORTDIRPREPACK_H -#define __MEXPORTDIRPREPACK_H - -#include "msg/Message.h" -#include "include/types.h" - -class MExportDirPrepAck : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MExportDirPrepAck() {} - MExportDirPrepAck(inodeno_t ino) : - Message(MSG_MDS_EXPORTDIRPREPACK) { - this->ino = ino; - } - - virtual char *get_type_name() { return "ExPAck"; } - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - } - virtual void encode_payload(crope& s) { - s.append((char*)&ino, sizeof(ino)); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MExportDirWarning.h b/branches/marnberg/quota/messages/MExportDirWarning.h deleted file mode 100644 index 6f2fdf55dde4f..0000000000000 --- a/branches/marnberg/quota/messages/MExportDirWarning.h +++ /dev/null @@ -1,45 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MEXPORTDIRWARNING_H -#define __MEXPORTDIRWARNING_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirWarning : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MExportDirWarning() {} - MExportDirWarning(inodeno_t ino) : - Message(MSG_MDS_EXPORTDIRWARNING) { - this->ino = ino; - } - - virtual char *get_type_name() { return "ExW"; } - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - } - virtual void encode_payload(crope& s) { - s.append((char*)&ino, sizeof(ino)); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MFailure.h b/branches/marnberg/quota/messages/MFailure.h deleted file mode 100644 index 0ec53f6e36b18..0000000000000 --- a/branches/marnberg/quota/messages/MFailure.h +++ /dev/null @@ -1,49 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MFAILURE_H -#define __MFAILURE_H - -#include "msg/Message.h" - - -class MFailure : public Message { - public: - entity_name_t failed; - entity_inst_t inst; - - MFailure() {} - MFailure(entity_name_t f, entity_inst_t& i) : - Message(MSG_FAILURE), - failed(f), inst(i) {} - - entity_name_t get_failed() { return failed; } - entity_inst_t& get_inst() { return inst; } - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(failed), (char*)&failed); - off += sizeof(failed); - payload.copy(off, sizeof(inst), (char*)&inst); - off += sizeof(inst); - } - void encode_payload() { - payload.append((char*)&failed, sizeof(failed)); - payload.append((char*)&inst, sizeof(inst)); - } - - virtual char *get_type_name() { return "fail"; } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MFailureAck.h b/branches/marnberg/quota/messages/MFailureAck.h deleted file mode 100644 index ec0036dcdac55..0000000000000 --- a/branches/marnberg/quota/messages/MFailureAck.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MFAILUREACK_H -#define __MFAILUREACK_H - -#include "MFailure.h" - - -class MFailureAck : public Message { - public: - entity_name_t failed; - MFailureAck(MFailure *m) : Message(MSG_FAILURE_ACK) { - this->failed = m->get_failed(); - } - MFailureAck() {} - - entity_name_t get_failed() { return failed; } - - virtual void decode_payload(crope& s, int& off) { - s.copy(0, sizeof(failed), (char*)&failed); - off += sizeof(failed); - } - virtual void encode_payload(crope& s) { - s.append((char*)&failed, sizeof(failed)); - } - - virtual char *get_type_name() { return "faila"; } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MGenericMessage.h b/branches/marnberg/quota/messages/MGenericMessage.h deleted file mode 100644 index b2f39534e6e23..0000000000000 --- a/branches/marnberg/quota/messages/MGenericMessage.h +++ /dev/null @@ -1,44 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MGENERICMESSAGE_H -#define __MGENERICMESSAGE_H - -#include "msg/Message.h" - -class MGenericMessage : public Message { - char tname[20]; - //long pcid; - - public: - MGenericMessage(int t) : Message(t) { - sprintf(tname, "generic%d", get_type()); - } - - //void set_pcid(long pcid) { this->pcid = pcid; } - //long get_pcid() { return pcid; } - - char *get_type_name() { return tname; } - - virtual void decode_payload() { - //int off = 0; - //payload.copy(off, sizeof(pcid), (char*)&pcid); - //off += sizeof(pcid); - } - virtual void encode_payload() { - //payload.append((char*)&pcid, sizeof(pcid)); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MHashDir.h b/branches/marnberg/quota/messages/MHashDir.h deleted file mode 100644 index ddf7e3ac2bbce..0000000000000 --- a/branches/marnberg/quota/messages/MHashDir.h +++ /dev/null @@ -1,64 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MHASHDIR_H -#define __MHASHDIR_H - -#include "msg/Message.h" - -class MHashDir : public Message { - inodeno_t ino; - bufferlist state; - int nden; - - public: - MHashDir() {} - MHashDir(inodeno_t ino) : - Message(MSG_MDS_HASHDIR) { - this->ino = ino; - nden = 0; - } - virtual char *get_type_name() { return "Ha"; } - - inodeno_t get_ino() { return ino; } - bufferlist& get_state() { return state; } - bufferlist* get_state_ptr() { return &state; } - int get_nden() { return nden; } - - void set_nden(int n) { nden = n; } - void inc_nden() { nden++; } - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - payload.copy(off, sizeof(nden), (char*)&nden); - off += sizeof(nden); - - size_t len; - payload.copy(off, sizeof(len), (char*)&len); - off += sizeof(len); - state.substr_of(payload, off, len); - } - void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - payload.append((char*)&nden, sizeof(nden)); - size_t size = state.length(); - payload.append((char*)&size, sizeof(size)); - payload.claim_append(state); - } - -}; - -#endif diff --git a/branches/marnberg/quota/messages/MHashDirAck.h b/branches/marnberg/quota/messages/MHashDirAck.h deleted file mode 100644 index cd6d4da8cf34f..0000000000000 --- a/branches/marnberg/quota/messages/MHashDirAck.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MHASHDIRACK_H -#define __MHASHDIRACK_H - -#include "MHashDir.h" - -class MHashDirAck : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MHashDirAck() {} - MHashDirAck(inodeno_t ino) : - Message(MSG_MDS_HASHDIRACK) { - this->ino = ino; - } - virtual char *get_type_name() { return "HAck"; } - - virtual void decode_payload() { - payload.copy(0, sizeof(ino), (char*)&ino); - } - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - } - -}; - -#endif diff --git a/branches/marnberg/quota/messages/MHashDirDiscover.h b/branches/marnberg/quota/messages/MHashDirDiscover.h deleted file mode 100644 index 0ea1ff8b79990..0000000000000 --- a/branches/marnberg/quota/messages/MHashDirDiscover.h +++ /dev/null @@ -1,52 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MHASHDIRDISCOVER_H -#define __MHASHDIRDISCOVER_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MHashDirDiscover : public Message { - inodeno_t ino; - string path; - - public: - inodeno_t get_ino() { return ino; } - string& get_path() { return path; } - - MHashDirDiscover() {} - MHashDirDiscover(CInode *in) : - Message(MSG_MDS_HASHDIRDISCOVER) { - in->make_path(path); - ino = in->ino(); - } - virtual char *get_type_name() { return "HDis"; } - - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - _decode(path, payload, off); - } - - void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - _encode(path, payload); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MHashDirDiscoverAck.h b/branches/marnberg/quota/messages/MHashDirDiscoverAck.h deleted file mode 100644 index 34734af0f97ad..0000000000000 --- a/branches/marnberg/quota/messages/MHashDirDiscoverAck.h +++ /dev/null @@ -1,53 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MHASHDIRDISCOVERACK_H -#define __MHASHDIRDISCOVERACK_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MHashDirDiscoverAck : public Message { - inodeno_t ino; - bool success; - - public: - inodeno_t get_ino() { return ino; } - bool is_success() { return success; } - - MHashDirDiscoverAck() {} - MHashDirDiscoverAck(inodeno_t ino, bool success=true) : - Message(MSG_MDS_HASHDIRDISCOVERACK) { - this->ino = ino; - this->success = false; - } - virtual char *get_type_name() { return "HDisA"; } - - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - payload.copy(off, sizeof(success), (char*)&success); - off += sizeof(success); - } - - void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - payload.append((char*)&success, sizeof(success)); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MHashDirNotify.h b/branches/marnberg/quota/messages/MHashDirNotify.h deleted file mode 100644 index ececc3ec2cc65..0000000000000 --- a/branches/marnberg/quota/messages/MHashDirNotify.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MHASHDIRNOTIFY_H -#define __MHASHDIRNOTIFY_H - -#include "msg/Message.h" - -class MHashDirNotify : public Message { - inodeno_t ino; - int from; - - public: - inodeno_t get_ino() { return ino; } - int get_from() { return from; } - - MHashDirNotify() {} - MHashDirNotify(inodeno_t ino, int from) : - Message(MSG_MDS_HASHDIRNOTIFY) { - this->ino = ino; - this->from = from; - } - virtual char *get_type_name() { return "HN"; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - payload.copy(off, sizeof(from), (char*)&from); - off += sizeof(from); - } - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - payload.append((char*)&from, sizeof(from)); - } - -}; - -#endif diff --git a/branches/marnberg/quota/messages/MHashDirPrep.h b/branches/marnberg/quota/messages/MHashDirPrep.h deleted file mode 100644 index 29a42217d6a4b..0000000000000 --- a/branches/marnberg/quota/messages/MHashDirPrep.h +++ /dev/null @@ -1,93 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MHASHDIRPREP_H -#define __MHASHDIRPREP_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MHashDirPrep : public Message { - inodeno_t ino; - bool assim; - - // subdir dentry names + inodes - map inodes; - - public: - inodeno_t get_ino() { return ino; } - map& get_inodes() { return inodes; } - - bool did_assim() { return assim; } - void mark_assim() { assert(!assim); assim = true; } - - MHashDirPrep() : assim(false) { } - MHashDirPrep(inodeno_t ino) : - Message(MSG_MDS_HASHDIRPREP), - assim(false) { - this->ino = ino; - } - ~MHashDirPrep() { - for (map::iterator it = inodes.begin(); - it != inodes.end(); - it++) - delete it->second; - } - - - virtual char *get_type_name() { return "HP"; } - - void add_inode(const string& dentry, CInodeDiscover *in) { - inodes[dentry] = in; - } - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - - // inodes - int ni; - payload.copy(off, sizeof(int), (char*)&ni); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - - inodes[dname] = in; - } - } - - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - - // inodes - int ni = inodes.size(); - payload.append((char*)&ni, sizeof(int)); - for (map::iterator iit = inodes.begin(); - iit != inodes.end(); - iit++) { - _encode(iit->first, payload); // dentry - iit->second->_encode(payload); // inode - } - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MHashDirPrepAck.h b/branches/marnberg/quota/messages/MHashDirPrepAck.h deleted file mode 100644 index 1d0db35c10f88..0000000000000 --- a/branches/marnberg/quota/messages/MHashDirPrepAck.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the 
GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MHASHDIRPREPACK_H -#define __MHASHDIRPREPACK_H - -#include "msg/Message.h" -#include "include/types.h" - -class MHashDirPrepAck : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MHashDirPrepAck() {} - MHashDirPrepAck(inodeno_t ino) : - Message(MSG_MDS_HASHDIRPREPACK) { - this->ino = ino; - } - - virtual char *get_type_name() { return "HPAck"; } - - void decode_payload() { - payload.copy(0, sizeof(ino), (char*)&ino); - } - void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MHashReaddir.h b/branches/marnberg/quota/messages/MHashReaddir.h deleted file mode 100644 index 864cb6944aeda..0000000000000 --- a/branches/marnberg/quota/messages/MHashReaddir.h +++ /dev/null @@ -1,44 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MHASHREADDIR_H -#define __MHASHREADDIR_H - -#include "include/types.h" -#include "msg/Message.h" - -class MHashReaddir : public Message { - inodeno_t ino; - - public: - MHashReaddir() { } - MHashReaddir(inodeno_t ino) : - Message(MSG_MDS_HASHREADDIR) { - this->ino = ino; - } - - inodeno_t get_ino() { return ino; } - - virtual char *get_type_name() { return "Hls"; } - - virtual void decode_payload() { - payload.copy(0, sizeof(ino), (char*)&ino); - } - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - } - -}; - -#endif diff --git a/branches/marnberg/quota/messages/MHashReaddirReply.h b/branches/marnberg/quota/messages/MHashReaddirReply.h deleted file mode 100644 index d9d73d8528f00..0000000000000 --- a/branches/marnberg/quota/messages/MHashReaddirReply.h +++ /dev/null @@ -1,80 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MHASHREADDIRREPLY_H -#define __MHASHREADDIRREPLY_H - -#include "MClientReply.h" - -class MHashReaddirReply : public Message { - inodeno_t ino; - - list dir_in; - list dir_dn; - - int num; - - public: - MHashReaddirReply() { } - MHashReaddirReply(inodeno_t _ino, list& inls, list& dnls, int n) : - Message(MSG_MDS_HASHREADDIRREPLY), - ino(_ino), - num(n) { - dir_in.swap(inls); - dir_dn.swap(dnls); - } - ~MHashReaddirReply() { - for (list::iterator it = dir_in.begin(); it != dir_in.end(); it++) - delete *it; - } - - inodeno_t get_ino() { return ino; } - list& get_in() { return dir_in; } - list& get_dn() { return dir_dn; } - - virtual char *get_type_name() { return "Hls"; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - int n; - payload.copy(n, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i_decode(payload, off); - dir_in.push_back(ci); - } - } - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - int n = dir_in.size(); // FIXME? - payload.append((char*)&n, sizeof(n)); - list::iterator pdn = dir_dn.begin(); - for (list::iterator pin = dir_in.begin(); - pin != dir_in.end(); - ++pin, ++pdn) { - ::_encode(*pdn, payload); - (*pin)->_encode(payload); - } - } - -}; - -#endif diff --git a/branches/marnberg/quota/messages/MHeartbeat.h b/branches/marnberg/quota/messages/MHeartbeat.h deleted file mode 100644 index 55455f406ef18..0000000000000 --- a/branches/marnberg/quota/messages/MHeartbeat.h +++ /dev/null @@ -1,81 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MHEARTBEAT_H -#define __MHEARTBEAT_H - -#include "include/types.h" -#include "msg/Message.h" - -class MHeartbeat : public Message { - mds_load_t load; - int beat; - map import_map; - - public: - mds_load_t& get_load() { return load; } - int get_beat() { return beat; } - - map& get_import_map() { - return import_map; - } - - MHeartbeat() {} - MHeartbeat(mds_load_t& load, int beat) : - Message(MSG_MDS_HEARTBEAT) { - this->load = load; - this->beat = beat; - } - - virtual char *get_type_name() { return "HB"; } - - virtual void decode_payload(crope& s, int& off) { - s.copy(off,sizeof(load), (char*)&load); - off += sizeof(load); - s.copy(off, sizeof(beat), (char*)&beat); - off += sizeof(beat); - - int n; - s.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - while (n--) { - int f; - s.copy(off, sizeof(f), (char*)&f); - off += sizeof(f); - float v; - s.copy(off, sizeof(v), (char*)&v); - off += sizeof(v); - import_map[f] = v; - } - } - virtual void encode_payload(crope& s) { - s.append((char*)&load, sizeof(load)); - s.append((char*)&beat, sizeof(beat)); - - int n = import_map.size(); - s.append((char*)&n, sizeof(n)); - for (map::iterator it = import_map.begin(); - it != import_map.end(); - it++) { - int f = it->first; - s.append((char*)&f, sizeof(f)); - float v = it->second; - s.append((char*)&v, sizeof(v)); - } - - } - -}; - -#endif diff --git a/branches/marnberg/quota/messages/MInodeExpire.h b/branches/marnberg/quota/messages/MInodeExpire.h deleted file mode 100644 index 637f378324022..0000000000000 --- a/branches/marnberg/quota/messages/MInodeExpire.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. 
See file COPYING. - * - */ - - -#ifndef __MINODEEXPIRE_H -#define __MINODEEXPIRE_H - -typedef struct { - inodeno_t ino; - int nonce; - int from; -} MInodeExpire_st; - -class MInodeExpire : public Message { - MInodeExpire_st st; - - public: - inodeno_t get_ino() { return st.ino; } - int get_from() { return st.from; } - int get_nonce() { return st.nonce; } - - MInodeExpire() {} - MInodeExpire(inodeno_t ino, int from, int nonce) : - Message(MSG_MDS_INODEEXPIRE) { - st.ino = ino; - st.from = from; - st.nonce = nonce; - } - virtual char *get_type_name() { return "InEx";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - } - virtual void encode_payload(crope& s) { - s.append((char*)&st,sizeof(st)); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MInodeFileCaps.h b/branches/marnberg/quota/messages/MInodeFileCaps.h deleted file mode 100644 index 5bd51be0e347b..0000000000000 --- a/branches/marnberg/quota/messages/MInodeFileCaps.h +++ /dev/null @@ -1,55 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MINODEFILECAPS_H -#define __MINODEFILECAPS_H - -class MInodeFileCaps : public Message { - inodeno_t ino; - int from; - int caps; - - public: - inodeno_t get_ino() { return ino; } - int get_from() { return from; } - int get_caps() { return caps; } - - MInodeFileCaps() {} - // from auth - MInodeFileCaps(inodeno_t ino, int from, int caps) : - Message(MSG_MDS_INODEFILECAPS) { - - this->ino = ino; - this->from = from; - this->caps = caps; - } - - virtual char *get_type_name() { return "Icap";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(from), (char*)&from); - off += sizeof(from); - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - s.copy(off, sizeof(caps), (char*)&caps); - off += sizeof(caps); - } - virtual void encode_payload(crope& s) { - s.append((char*)&from, sizeof(from)); - s.append((char*)&ino, sizeof(ino)); - s.append((char*)&caps, sizeof(caps)); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MInodeLink.h b/branches/marnberg/quota/messages/MInodeLink.h deleted file mode 100644 index feefc4ea21c7b..0000000000000 --- a/branches/marnberg/quota/messages/MInodeLink.h +++ /dev/null @@ -1,47 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MINODELINK_H -#define __MINODELINK_H - -typedef struct { - inodeno_t ino; - int from; -} MInodeLink_st; - -class MInodeLink : public Message { - MInodeLink_st st; - - public: - inodeno_t get_ino() { return st.ino; } - int get_from() { return st.from; } - - MInodeLink() {} - MInodeLink(inodeno_t ino, int from) : - Message(MSG_MDS_INODELINK) { - st.ino = ino; - st.from = from; - } - virtual char *get_type_name() { return "InL";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - } - virtual void encode_payload(crope& s) { - s.append((char*)&st,sizeof(st)); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MInodeLinkAck.h b/branches/marnberg/quota/messages/MInodeLinkAck.h deleted file mode 100644 index 987b70741edcb..0000000000000 --- a/branches/marnberg/quota/messages/MInodeLinkAck.h +++ /dev/null @@ -1,47 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MINODELINKACK_H -#define __MINODELINKACK_H - -typedef struct { - inodeno_t ino; - bool success; -} MInodeLinkAck_st; - -class MInodeLinkAck : public Message { - MInodeLinkAck_st st; - - public: - inodeno_t get_ino() { return st.ino; } - bool is_success() { return st.success; } - - MInodeLinkAck() {} - MInodeLinkAck(inodeno_t ino, bool success) : - Message(MSG_MDS_INODELINKACK) { - st.ino = ino; - st.success = success; - } - virtual char *get_type_name() { return "InLA";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - } - virtual void encode_payload(crope& s) { - s.append((char*)&st,sizeof(st)); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MInodeUnlink.h b/branches/marnberg/quota/messages/MInodeUnlink.h deleted file mode 100644 index e1aa463153c26..0000000000000 --- a/branches/marnberg/quota/messages/MInodeUnlink.h +++ /dev/null @@ -1,47 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MINODEUNLINK_H -#define __MINODEUNLINK_H - -typedef struct { - inodeno_t ino; - int from; -} MInodeUnlink_st; - -class MInodeUnlink : public Message { - MInodeUnlink_st st; - - public: - inodeno_t get_ino() { return st.ino; } - int get_from() { return st.from; } - - MInodeUnlink() {} - MInodeUnlink(inodeno_t ino, int from) : - Message(MSG_MDS_INODEUNLINK) { - st.ino = ino; - st.from = from; - } - virtual char *get_type_name() { return "InUl";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - } - virtual void encode_payload(crope& s) { - s.append((char*)&st,sizeof(st)); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MInodeUnlinkAck.h b/branches/marnberg/quota/messages/MInodeUnlinkAck.h deleted file mode 100644 index 283c016f2bec9..0000000000000 --- a/branches/marnberg/quota/messages/MInodeUnlinkAck.h +++ /dev/null @@ -1,44 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MINODEUNLINKACK_H -#define __MINODEUNLINKACK_H - -typedef struct { - inodeno_t ino; -} MInodeUnlinkAck_st; - -class MInodeUnlinkAck : public Message { - MInodeUnlinkAck_st st; - - public: - inodeno_t get_ino() { return st.ino; } - - MInodeUnlinkAck() {} - MInodeUnlinkAck(inodeno_t ino) : - Message(MSG_MDS_INODEUNLINKACK) { - st.ino = ino; - } - virtual char *get_type_name() { return "InUlA";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - } - virtual void encode_payload(crope& s) { - s.append((char*)&st,sizeof(st)); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MInodeUpdate.h b/branches/marnberg/quota/messages/MInodeUpdate.h deleted file mode 100644 index bbab924089aa5..0000000000000 --- a/branches/marnberg/quota/messages/MInodeUpdate.h +++ /dev/null @@ -1,61 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MINODEUPDATE_H -#define __MINODEUPDATE_H - -#include "msg/Message.h" - -#include -using namespace std; - -class MInodeUpdate : public Message { - int nonce; - crope inode_basic_state; - - public: - inodeno_t get_ino() { - inodeno_t ino; - inode_basic_state.copy(0, sizeof(inodeno_t), (char*)&ino); - return ino; - } - int get_nonce() { return nonce; } - - MInodeUpdate() {} - MInodeUpdate(CInode *in, int nonce) : - Message(MSG_MDS_INODEUPDATE) { - inode_basic_state = in->encode_basic_state(); - this->nonce = nonce; - } - virtual char *get_type_name() { return "Iup"; } - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(int), (char*)&nonce); - off += sizeof(int); - size_t len; - s.copy(off, sizeof(len), (char*)&len); - off += sizeof(len); - inode_basic_state = s.substr(off, len); - off += len; - } - virtual void encode_payload(crope& s) { - s.append((char*)&nonce, sizeof(int)); - size_t len = inode_basic_state.length(); - s.append((char*)&len, sizeof(len)); - s.append(inode_basic_state); - } - -}; - -#endif diff --git a/branches/marnberg/quota/messages/MLock.h b/branches/marnberg/quota/messages/MLock.h deleted file mode 100644 index 1d22d297d79d4..0000000000000 --- a/branches/marnberg/quota/messages/MLock.h +++ /dev/null @@ -1,128 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MLOCK_H -#define __MLOCK_H - -#include "msg/Message.h" - -#define LOCK_OTYPE_IHARD 1 -#define LOCK_OTYPE_IFILE 2 -#define LOCK_OTYPE_DIR 3 -#define LOCK_OTYPE_DN 4 - -// for replicas -#define LOCK_AC_SYNC 0 -#define LOCK_AC_MIXED 1 -#define LOCK_AC_LOCK 2 - -#define LOCK_AC_REQXLOCKACK 9 // req dentry xlock -#define LOCK_AC_REQXLOCKNAK 10 // req dentry xlock -#define LOCK_AC_LOCKNAK 12 // for dentry xlock - - -#define LOCK_AC_FOR_REPLICA(a) ((a) <= 10) -#define LOCK_AC_FOR_AUTH(a) ((a) >= 11) - -// for auth - -#define LOCK_AC_SYNCACK 13 -#define LOCK_AC_MIXEDACK 14 -#define LOCK_AC_LOCKACK 15 - - -#define LOCK_AC_REQREAD 19 -#define LOCK_AC_REQWRITE 20 - -#define LOCK_AC_REQXLOCK 21 -#define LOCK_AC_REQXLOCKC 22 // create if necessary -#define LOCK_AC_UNXLOCK 23 - -#define lock_ac_name(x) - - -class MLock : public Message { - int asker; // who is initiating this request - int action; // action type - - char otype; // lock object type - inodeno_t ino; // ino ref, or possibly - string dn; // dentry name - bufferlist data; // and possibly some data - string path; // possibly a path too (for dentry lock discovers) - - public: - inodeno_t get_ino() { return ino; } - string& get_dn() { return dn; } - bufferlist& get_data() { return data; } - int get_asker() { return asker; } - int get_action() { return action; } - int get_otype() { return otype; } - string& get_path() { return path; } - - MLock() {} - MLock(int action, int asker) : - Message(MSG_MDS_LOCK) { - this->action = action; - this->asker = asker; - } - virtual char *get_type_name() { return "ILock"; } - - void set_ino(inodeno_t ino, char ot) { - otype = ot; - this->ino = ino; - } - void set_dirino(inodeno_t dirino) { - otype = LOCK_OTYPE_DIR; - this->ino = ino; - } - void set_dn(inodeno_t dirino, string& dn) { - otype = LOCK_OTYPE_DN; - this->ino = dirino; - this->dn = dn; - } - void set_data(bufferlist& data) { - this->data.claim( data ); - } - void set_path(const string& p) { - path = p; 
- } - - void decode_payload() { - int off = 0; - payload.copy(off,sizeof(action), (char*)&action); - off += sizeof(action); - payload.copy(off,sizeof(asker), (char*)&asker); - off += sizeof(asker); - payload.copy(off,sizeof(otype), (char*)&otype); - off += sizeof(otype); - payload.copy(off,sizeof(ino), (char*)&ino); - off += sizeof(ino); - ::_decode(dn, payload, off); - ::_decode(path, payload, off); - ::_decode(data, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&action, sizeof(action)); - payload.append((char*)&asker, sizeof(asker)); - payload.append((char*)&otype, sizeof(otype)); - payload.append((char*)&ino, sizeof(inodeno_t)); - ::_encode(dn, payload); - ::_encode(path, payload); - ::_encode(data, payload); - } - -}; - -#endif diff --git a/branches/marnberg/quota/messages/MMDSBeacon.h b/branches/marnberg/quota/messages/MMDSBeacon.h deleted file mode 100644 index 86eccc689d396..0000000000000 --- a/branches/marnberg/quota/messages/MMDSBeacon.h +++ /dev/null @@ -1,54 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMDSBEACON_H -#define __MMDSBEACON_H - -#include "msg/Message.h" - -#include "include/types.h" - -#include "mds/MDSMap.h" - -class MMDSBeacon : public Message { - int state; - version_t seq; - - public: - MMDSBeacon() : Message(MSG_MDS_BEACON) {} - MMDSBeacon(int st, version_t se) : Message(MSG_MDS_BEACON), - state(st), seq(se) { } - - int get_state() { return state; } - version_t get_seq() { return seq; } - char *get_type_name() { return "mdsbeacon"; } - - void print(ostream& out) { - out << "mdsbeacon(" << MDSMap::get_state_name(state) - << " seq " << seq << ")"; - } - - void encode_payload() { - payload.append((char*)&state, sizeof(state)); - payload.append((char*)&seq, sizeof(seq)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(state), (char*)&state); - off += sizeof(state); - payload.copy(off, sizeof(seq), (char*)&seq); - off += sizeof(seq); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MMDSBoot.h b/branches/marnberg/quota/messages/MMDSBoot.h deleted file mode 100644 index c0c554152cc87..0000000000000 --- a/branches/marnberg/quota/messages/MMDSBoot.h +++ /dev/null @@ -1,38 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMDSBOOT_H -#define __MMDSBOOT_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMDSBoot : public Message { - public: - MMDSBoot() : Message(MSG_MDS_BOOT) { - } - - char *get_type_name() { return "mdsboot"; } - - void encode_payload() { - //payload.append((char*)&sb, sizeof(sb)); - } - void decode_payload() { - //int off = 0; - //payload.copy(off, sizeof(sb), (char*)&sb); - //off += sizeof(sb); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MMDSCacheRejoin.h b/branches/marnberg/quota/messages/MMDSCacheRejoin.h deleted file mode 100644 index 2789e30844743..0000000000000 --- a/branches/marnberg/quota/messages/MMDSCacheRejoin.h +++ /dev/null @@ -1,62 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MMDSCACHEREJOIN_H -#define __MMDSCACHEREJOIN_H - -#include "msg/Message.h" - -#include "include/types.h" - -// sent from replica to auth - -class MMDSCacheRejoin : public Message { - public: - map inodes; // ino -> caps_wanted - set dirs; - map > dentries; // dir -> (dentries...) 
- - MMDSCacheRejoin() : Message(MSG_MDS_CACHEREJOIN) {} - - char *get_type_name() { return "cache_rejoin"; } - - void print(ostream& out) { - out << "cache_rejoin" << endl; - } - - void add_dir(inodeno_t dirino) { - dirs.insert(dirino); - } - void add_dentry(inodeno_t dirino, const string& dn) { - dentries[dirino].insert(dn); - } - void add_inode(inodeno_t ino, int cw) { - inodes[ino] = cw; - } - - void encode_payload() { - ::_encode(inodes, payload); - ::_encode(dirs, payload); - for (set::iterator p = dirs.begin(); p != dirs.end(); ++p) - ::_encode(dentries[*p], payload); - } - void decode_payload() { - int off = 0; - ::_decode(inodes, payload, off); - ::_decode(dirs, payload, off); - for (set::iterator p = dirs.begin(); p != dirs.end(); ++p) - ::_decode(dentries[*p], payload, off); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MMDSCacheRejoinAck.h b/branches/marnberg/quota/messages/MMDSCacheRejoinAck.h deleted file mode 100644 index b8f0d23ebbba0..0000000000000 --- a/branches/marnberg/quota/messages/MMDSCacheRejoinAck.h +++ /dev/null @@ -1,82 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMDSCACHEREJOINACK_H -#define __MMDSCACHEREJOINACK_H - -#include "msg/Message.h" - -#include "include/types.h" - -// sent from auth back to replica - -class MMDSCacheRejoinAck : public Message { - public: - struct inodeinfo { - inodeno_t ino; - int hardlock; - int filelock; - int nonce; - inodeinfo() {} - inodeinfo(inodeno_t i, int h, int f, int n) : ino(i), hardlock(h), filelock(f), nonce(n) {} - }; - struct dninfo { - int lock; - int nonce; - dninfo() {} - dninfo(int l, int n) : lock(l), nonce(n) {} - }; - struct dirinfo { - inodeno_t dirino; - int nonce; - dirinfo() {} - dirinfo(inodeno_t i, int n) : dirino(i), nonce(n) {} - }; - list inodes; - map > dentries; - list dirs; - - MMDSCacheRejoinAck() : Message(MSG_MDS_CACHEREJOINACK) {} - - char *get_type_name() { return "cache_rejoin_ack"; } - - void print(ostream& out) { - out << "cache_rejoin" << endl; - } - - void add_dir(inodeno_t dirino, int nonce) { - dirs.push_back(dirinfo(dirino,nonce)); - } - void add_dentry(inodeno_t dirino, const string& dn, int ls, int nonce) { - dentries[dirino][dn] = dninfo(ls, nonce); - } - void add_inode(inodeno_t ino, int hl, int fl, int nonce) { - inodes.push_back(inodeinfo(ino, hl, fl, nonce)); - } - - void encode_payload() { - ::_encode(inodes, payload); - ::_encode(dirs, payload); - for (list::iterator p = dirs.begin(); p != dirs.end(); ++p) - ::_encode(dentries[p->dirino], payload); - } - void decode_payload() { - int off = 0; - ::_decode(inodes, payload, off); - ::_decode(dirs, payload, off); - for (list::iterator p = dirs.begin(); p != dirs.end(); ++p) - ::_decode(dentries[p->dirino], payload, off); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MMDSGetMap.h b/branches/marnberg/quota/messages/MMDSGetMap.h deleted file mode 100644 index 6bb6b92c00ccd..0000000000000 --- a/branches/marnberg/quota/messages/MMDSGetMap.h +++ /dev/null @@ -1,38 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - 
scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MMDSGETMAP_H -#define __MMDSGETMAP_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMDSGetMap : public Message { - public: - MMDSGetMap() : Message(MSG_MDS_GETMAP) { - } - - char *get_type_name() { return "mdsgetmap"; } - - void encode_payload() { - //payload.append((char*)&sb, sizeof(sb)); - } - void decode_payload() { - //int off = 0; - //payload.copy(off, sizeof(sb), (char*)&sb); - //off += sizeof(sb); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MMDSImportMap.h b/branches/marnberg/quota/messages/MMDSImportMap.h deleted file mode 100644 index 22774cdabc2ec..0000000000000 --- a/branches/marnberg/quota/messages/MMDSImportMap.h +++ /dev/null @@ -1,59 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMDSIMPORTMAP_H -#define __MMDSIMPORTMAP_H - -#include "msg/Message.h" - -#include "include/types.h" - - -class MMDSImportMap : public Message { - public: - map > imap; - map > ambiguous_imap; - - MMDSImportMap() : Message(MSG_MDS_IMPORTMAP) {} - - char *get_type_name() { return "mdsimportmap"; } - - void print(ostream& out) { - out << "mdsimportmap(" << imap.size() - << "+" << ambiguous_imap.size() - << " imports)"; - } - - void add_import(inodeno_t im) { - imap[im].clear(); - } - void add_import_export(inodeno_t im, inodeno_t ex) { - imap[im].insert(ex); - } - - void add_ambiguous_import(inodeno_t im, const set& m) { - ambiguous_imap[im] = m; - } - - void encode_payload() { - ::_encode(imap, payload); - ::_encode(ambiguous_imap, payload); - } - void decode_payload() { - int off = 0; - ::_decode(imap, payload, off); - ::_decode(ambiguous_imap, payload, off); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MMDSMap.h b/branches/marnberg/quota/messages/MMDSMap.h deleted file mode 100644 index 701ba9a050cc3..0000000000000 --- a/branches/marnberg/quota/messages/MMDSMap.h +++ /dev/null @@ -1,78 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMDSMAP_H -#define __MMDSMAP_H - -#include "msg/Message.h" -#include "mds/MDSMap.h" - - -class MMDSMap : public Message { - public: - /* - map maps; - map incremental_maps; - - epoch_t get_first() { - epoch_t e = 0; - map::iterator i = maps.begin(); - if (i != maps.end()) e = i->first; - i = incremental_maps.begin(); - if (i != incremental_maps.end() && - (e == 0 || i->first < e)) e = i->first; - return e; - } - epoch_t get_last() { - epoch_t e = 0; - map::reverse_iterator i = maps.rbegin(); - if (i != maps.rend()) e = i->first; - i = incremental_maps.rbegin(); - if (i != incremental_maps.rend() && - (e == 0 || i->first > e)) e = i->first; - return e; - } - */ - - version_t epoch; - bufferlist encoded; - - version_t get_epoch() const { return epoch; } - bufferlist& get_encoded() { return encoded; } - - MMDSMap() : - Message(MSG_MDS_MAP) {} - MMDSMap(MDSMap *mm) : - Message(MSG_MDS_MAP) { - epoch = mm->get_epoch(); - mm->encode(encoded); - } - - - // marshalling - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - ::_decode(encoded, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - ::_encode(encoded, payload); - } - - virtual char *get_type_name() { return "mdsmap"; } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MMonElectionAck.h b/branches/marnberg/quota/messages/MMonElectionAck.h deleted file mode 100644 index 2399cca73d60c..0000000000000 --- a/branches/marnberg/quota/messages/MMonElectionAck.h +++ /dev/null @@ -1,31 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMONELECTIONACK_H -#define __MMONELECTIONACK_H - -#include "msg/Message.h" - - -class MMonElectionAck : public Message { - public: - MMonElectionAck() : Message(MSG_MON_ELECTION_ACK) {} - - virtual char *get_type_name() { return "election_ack"; } - - void encode_payload() {} - void decode_payload() {} -}; - -#endif diff --git a/branches/marnberg/quota/messages/MMonElectionCollect.h b/branches/marnberg/quota/messages/MMonElectionCollect.h deleted file mode 100644 index d91870dfce5c6..0000000000000 --- a/branches/marnberg/quota/messages/MMonElectionCollect.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MMONELECTIONCOLLECT_H -#define __MMONELECTIONCOLLECT_H - -#include "msg/Message.h" - - -class MMonElectionCollect : public Message { - public: - int read_num; - - MMonElectionCollect() {} - MMonElectionCollect(int n) : - Message(MSG_MON_ELECTION_COLLECT), - read_num(n) {} - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(read_num), (char*)&read_num); - off += sizeof(read_num); - } - void encode_payload() { - payload.append((char*)&read_num, sizeof(read_num)); - } - - virtual char *get_type_name() { return "MonElCollect"; } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MMonElectionPropose.h b/branches/marnberg/quota/messages/MMonElectionPropose.h deleted file mode 100644 index d9310f222bc7b..0000000000000 --- a/branches/marnberg/quota/messages/MMonElectionPropose.h +++ /dev/null @@ -1,32 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright 
(C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MMONELECTIONPROPOSE_H -#define __MMONELECTIONPROPOSE_H - -#include "msg/Message.h" - - -class MMonElectionPropose : public Message { - public: - MMonElectionPropose() : Message(MSG_MON_ELECTION_PROPOSE) {} - - virtual char *get_type_name() { return "election_propose"; } - - void encode_payload() {} - void decode_payload() {} - -}; - -#endif diff --git a/branches/marnberg/quota/messages/MMonElectionRefresh.h b/branches/marnberg/quota/messages/MMonElectionRefresh.h deleted file mode 100644 index 497276f06b12f..0000000000000 --- a/branches/marnberg/quota/messages/MMonElectionRefresh.h +++ /dev/null @@ -1,51 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMONELECTIONREFRESH_H -#define __MMONELECTIONREFRESH_H - -#include "msg/Message.h" - -#include "mon/Elector.h" - -class MMonElectionRefresh : public Message { - public: - int p; - Elector::State state; - int refresh_num; - - MMonElectionRefresh() {} - MMonElectionRefresh(int _p, Elector::State& s, int r) : - Message(MSG_MON_ELECTION_REFRESH), - p(_p), state(s), refresh_num(r) {} - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(p), (char*)&p); - off += sizeof(p); - payload.copy(off, sizeof(state), (char*)&state); - off += sizeof(state); - payload.copy(off, sizeof(refresh_num), (char*)&refresh_num); - off += sizeof(refresh_num); - } - void encode_payload() { - payload.append((char*)&p, sizeof(p)); - payload.append((char*)&state, sizeof(state)); - payload.append((char*)&refresh_num, sizeof(refresh_num)); - } - - virtual char *get_type_name() { return "MonElRefresh"; } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MMonElectionStatus.h b/branches/marnberg/quota/messages/MMonElectionStatus.h deleted file mode 100644 index 071d0fcc82e0a..0000000000000 --- a/branches/marnberg/quota/messages/MMonElectionStatus.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMONELECTIONSTATUS_H -#define __MMONELECTIONSTATUS_H - -#include "msg/Message.h" - -#include "mon/Elector.h" - -class MMonElectionStatus : public Message { - public: - int q; - int read_num; - map registry; - - MMonElectionStatus() {} - MMonElectionStatus(int _q, int r, map reg) : - Message(MSG_MON_ELECTION_STATUS), - q(_q), read_num(r), registry(reg) {} - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(q), (char*)&q); - off += sizeof(q); - payload.copy(off, sizeof(read_num), (char*)&read_num); - off += sizeof(read_num); - ::_decode(registry, payload, off); - } - void encode_payload() { - payload.append((char*)&q, sizeof(q)); - payload.append((char*)&read_num, sizeof(read_num)); - ::_encode(registry, payload); - } - - virtual char *get_type_name() { return "MonElStatus"; } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MMonElectionVictory.h b/branches/marnberg/quota/messages/MMonElectionVictory.h deleted file mode 100644 index 8bdbf2f85a3aa..0000000000000 --- a/branches/marnberg/quota/messages/MMonElectionVictory.h +++ /dev/null @@ -1,40 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMONELECTIONVICTORY_H -#define __MMONELECTIONVICTORY_H - -#include "msg/Message.h" - - -class MMonElectionVictory : public Message { - public: - //set active_set; - - MMonElectionVictory(/*set& as*/) : Message(MSG_MON_ELECTION_VICTORY)//, - //active_set(as) - {} - - virtual char *get_type_name() { return "election_victory"; } - - void encode_payload() { - //::_encode(active_set, payload); - } - void decode_payload() { - //int off = 0; - //::_decode(active_set, payload, off); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MMonOSDMapInfo.h b/branches/marnberg/quota/messages/MMonOSDMapInfo.h deleted file mode 100644 index 182b36f0a57cf..0000000000000 --- a/branches/marnberg/quota/messages/MMonOSDMapInfo.h +++ /dev/null @@ -1,49 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPINFO_H -#define __MMONOSDMAPINFO_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapInfo : public Message { - public: - epoch_t epoch; - epoch_t mon_epoch; - - epoch_t get_epoch() { return epoch; } - epoch_t get_mon_epoch() { return mon_epoch; } - - MMonOSDMapInfo(epoch_t e, epoch_t me) : - Message(MSG_MON_OSDMAP_UPDATE_PREPARE), - epoch(e), mon_epoch(me) { - } - - char *get_type_name() { return "omap_info"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - payload.append((char*)&mon_epoch, sizeof(mon_epoch)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - payload.copy(off, sizeof(mon_epoch), (char*)&mon_epoch); - off += sizeof(mon_epoch); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MMonOSDMapLease.h b/branches/marnberg/quota/messages/MMonOSDMapLease.h deleted file mode 100644 index c6112bd898cae..0000000000000 --- a/branches/marnberg/quota/messages/MMonOSDMapLease.h +++ /dev/null @@ -1,49 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPLEASE_H -#define __MMONOSDMAPLEASE_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapLease : public Message { - epoch_t epoch; - utime_t lease_expire; - - public: - epoch_t get_epoch() { return epoch; } - const utime_t& get_lease_expire() { return lease_expire; } - - MMonOSDMapLease(epoch_t e, utime_t le) : - Message(MSG_MON_OSDMAP_LEASE), - epoch(e), lease_expire(le) { - } - - char *get_type_name() { return "omap_lease"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - payload.append((char*)&lease_expire, sizeof(lease_expire)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - payload.copy(off, sizeof(lease_expire), (char*)&lease_expire); - off += sizeof(lease_expire); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MMonOSDMapLeaseAck.h b/branches/marnberg/quota/messages/MMonOSDMapLeaseAck.h deleted file mode 100644 index 85d5ea7c02809..0000000000000 --- a/branches/marnberg/quota/messages/MMonOSDMapLeaseAck.h +++ /dev/null @@ -1,44 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPLEASEACK_H -#define __MMONOSDMAPLEASEACK_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapLeaseAck : public Message { - epoch_t epoch; - -public: - epoch_t get_epoch() { return epoch; } - - MMonOSDMapLeaseAck(epoch_t e) : - Message(MSG_MON_OSDMAP_LEASE_ACK), - epoch(e) { - } - - char *get_type_name() { return "omap_lease_ack"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MMonOSDMapUpdateAck.h b/branches/marnberg/quota/messages/MMonOSDMapUpdateAck.h deleted file mode 100644 index 8673788f0632f..0000000000000 --- a/branches/marnberg/quota/messages/MMonOSDMapUpdateAck.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPUPDATEACK_H -#define __MMONOSDMAPUPDATEACK_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapUpdateAck : public Message { -public: - epoch_t epoch; - - MMonOSDMapUpdateAck(epoch_t e) : - Message(MSG_MON_OSDMAP_UPDATE_ACK), - epoch(e) { - } - - char *get_type_name() { return "omap_update_ack"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MMonOSDMapUpdateCommit.h b/branches/marnberg/quota/messages/MMonOSDMapUpdateCommit.h deleted file mode 100644 index 6f12a8e3c784d..0000000000000 --- a/branches/marnberg/quota/messages/MMonOSDMapUpdateCommit.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPUPDATECOMMIT_H -#define __MMONOSDMAPUPDATECOMMIT_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapUpdateCommit : public Message { - public: - epoch_t epoch; - - MMonOSDMapUpdateCommit(epoch_t e) : - Message(MSG_MON_OSDMAP_UPDATE_COMMIT), - epoch(e) { - } - - char *get_type_name() { return "omap_update_commit"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MMonOSDMapUpdatePrepare.h b/branches/marnberg/quota/messages/MMonOSDMapUpdatePrepare.h deleted file mode 100644 index bc962ea2b3eb2..0000000000000 --- a/branches/marnberg/quota/messages/MMonOSDMapUpdatePrepare.h +++ /dev/null @@ -1,52 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPUPDATEPREPARE_H -#define __MMONOSDMAPUPDATEPREPARE_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapUpdatePrepare : public Message { - public: - epoch_t epoch; - bufferlist map_bl; - bufferlist inc_map_bl; - - epoch_t get_epoch() { return epoch; } - - MMonOSDMapUpdatePrepare(epoch_t e, - bufferlist& mbl, bufferlist& incmbl) : - Message(MSG_MON_OSDMAP_UPDATE_PREPARE), - epoch(e), - map_bl(mbl), inc_map_bl(incmbl) { - } - - char *get_type_name() { return "omap_update_prepare"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - ::_encode(map_bl, payload); - ::_encode(inc_map_bl, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - ::_decode(map_bl, payload, off); - ::_decode(inc_map_bl, payload, off); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MMonPaxos.h b/branches/marnberg/quota/messages/MMonPaxos.h deleted file mode 100644 index b3f6e850a9c5d..0000000000000 --- a/branches/marnberg/quota/messages/MMonPaxos.h +++ /dev/null @@ -1,80 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMONPAXOS_H -#define __MMONPAXOS_H - -#include "msg/Message.h" - -class MMonPaxos : public Message { - public: - // op types - const static int OP_COLLECT = 1; // proposer: propose round - const static int OP_LAST = 2; // voter: accept proposed round - const static int OP_OLDROUND = 3; // voter: notify proposer he proposed an old round - const static int OP_BEGIN = 4; // proposer: value proposed for this round - const static int OP_ACCEPT = 5; // voter: accept propsed value - const static int OP_SUCCESS = 7; // proposer: notify learners of agreed value - const static int OP_ACK = 8; // learner: notify proposer that new value has been saved - - int op; - int machine_id; - version_t proposal; - version_t n; - bufferlist value; - - MMonPaxos() : Message(MSG_MON_PAXOS) {} - MMonPaxos(int o, int mid, - version_t pn, version_t v) : Message(MSG_MON_PAXOS), - op(o), machine_id(mid), - proposal(pn), n(v) {} - MMonPaxos(int o, int mid, - version_t pn, version_t v, - bufferlist& b) : Message(MSG_MON_PAXOS), - op(o), machine_id(mid), - proposal(pn), n(v), - value(b) {} - - virtual char *get_type_name() { return "paxos"; } - - void print(ostream& out) { - out << "paxos(op " << op - << ", machine " << machine_id - << ", proposal " << proposal - << ", state " << n - << ", " << value.length() << " bytes)"; - } - - void encode_payload() { - payload.append((char*)&op, sizeof(op)); - payload.append((char*)&machine_id, sizeof(machine_id)); - payload.append((char*)&proposal, sizeof(proposal)); - payload.append((char*)&n, sizeof(n)); - ::_encode(value, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(op), (char*)&op); - off += sizeof(op); - payload.copy(off, sizeof(machine_id), (char*)&machine_id); - off += sizeof(machine_id); - payload.copy(off, sizeof(proposal), (char*)&proposal); - off += sizeof(proposal); - payload.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - ::_decode(value, payload, off); - } -}; - -#endif diff 
--git a/branches/marnberg/quota/messages/MNSConnect.h b/branches/marnberg/quota/messages/MNSConnect.h deleted file mode 100644 index 28150f79d8476..0000000000000 --- a/branches/marnberg/quota/messages/MNSConnect.h +++ /dev/null @@ -1,45 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MNSCONNECT_H -#define __MNSCONNECT_H - -#include "msg/Message.h" -#include "msg/tcp.h" - -class MNSConnect : public Message { - tcpaddr_t tcpaddr; - - public: - MNSConnect() {} - MNSConnect(tcpaddr_t t) : - Message(MSG_NS_CONNECT) { - tcpaddr = t; - } - - char *get_type_name() { return "NSCon"; } - - tcpaddr_t& get_addr() { return tcpaddr; } - - void encode_payload() { - payload.append((char*)&tcpaddr, sizeof(tcpaddr)); - } - void decode_payload() { - payload.copy(0, sizeof(tcpaddr), (char*)&tcpaddr); - } -}; - - -#endif - diff --git a/branches/marnberg/quota/messages/MNSConnectAck.h b/branches/marnberg/quota/messages/MNSConnectAck.h deleted file mode 100644 index 696b13f2a41e6..0000000000000 --- a/branches/marnberg/quota/messages/MNSConnectAck.h +++ /dev/null @@ -1,53 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MNSCONNECTACK_H -#define __MNSCONNECTACK_H - -#include "msg/Message.h" -#include "msg/TCPMessenger.h" - -class MNSConnectAck : public Message { - int rank; - int inst; - - public: - MNSConnectAck() {} - MNSConnectAck(int r, int g=0) : - Message(MSG_NS_CONNECTACK) { - rank = r; - inst = g; - } - - char *get_type_name() { return "NSConA"; } - - int get_rank() { return rank; } - int get_inst() { return inst; } - - void encode_payload() { - payload.append((char*)&rank, sizeof(rank)); - payload.append((char*)&inst, sizeof(inst)); - } - void decode_payload() { - unsigned off = 0; - payload.copy(off, sizeof(rank), (char*)&rank); - off += sizeof(rank); - payload.copy(off, sizeof(inst), (char*)&inst); - off += sizeof(inst); - } -}; - - -#endif - diff --git a/branches/marnberg/quota/messages/MNSFailure.h b/branches/marnberg/quota/messages/MNSFailure.h deleted file mode 100644 index 405bfcfd2dacb..0000000000000 --- a/branches/marnberg/quota/messages/MNSFailure.h +++ /dev/null @@ -1,52 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MNSFAILURE_H -#define __MNSFAILURE_H - -#include "msg/Message.h" -#include "msg/tcp.h" - -class MNSFailure : public Message { - //msg_addr_t entity; - entity_inst_t inst; - - public: - MNSFailure() {} - MNSFailure(entity_inst_t& i) : - Message(MSG_NS_FAILURE), - //entity(w), - inst(i) {} - - char *get_type_name() { return "NSFail"; } - - //msg_addr_t &get_entity() { return entity; } - entity_inst_t &get_inst() { return inst; } - - void encode_payload() { - //payload.append((char*)&entity, sizeof(entity)); - payload.append((char*)&inst, sizeof(inst)); - } - void decode_payload() { - unsigned off = 0; - //payload.copy(off, sizeof(entity), (char*)&entity); - //off += sizeof(entity); - payload.copy(off, sizeof(inst), (char*)&inst); - off += sizeof(inst); - } -}; - - -#endif - diff --git a/branches/marnberg/quota/messages/MNSLookup.h b/branches/marnberg/quota/messages/MNSLookup.h deleted file mode 100644 index b6df663a15a88..0000000000000 --- a/branches/marnberg/quota/messages/MNSLookup.h +++ /dev/null @@ -1,46 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MNSLOOKUP_H -#define __MNSLOOKUP_H - -#include "msg/Message.h" - -class MNSLookup : public Message { - entity_name_t entity; - - public: - MNSLookup() {} - MNSLookup(entity_name_t e) : - Message(MSG_NS_LOOKUP) { - entity = e; - } - - char *get_type_name() { return "NSLook"; } - - entity_name_t get_entity() { return entity; } - - void encode_payload() { - payload.append((char*)&entity, sizeof(entity)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(entity), (char*)&entity); - off += sizeof(entity); - } -}; - - -#endif - diff --git a/branches/marnberg/quota/messages/MNSLookupReply.h b/branches/marnberg/quota/messages/MNSLookupReply.h deleted file mode 100644 index e6720eba397d8..0000000000000 --- a/branches/marnberg/quota/messages/MNSLookupReply.h +++ /dev/null @@ -1,44 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MNSLOOKUPREPLY_H -#define __MNSLOOKUPREPLY_H - -#include "msg/Message.h" -#include "msg/TCPMessenger.h" - -class MNSLookupReply : public Message { - public: - map entity_map; - - public: - MNSLookupReply() {} - MNSLookupReply(MNSLookup *m) : - Message(MSG_NS_LOOKUPREPLY) { - } - - char *get_type_name() { return "NSLookR"; } - - void encode_payload() { - ::_encode(entity_map, payload); - } - void decode_payload() { - int off = 0; - ::_decode(entity_map, payload, off); - } -}; - - -#endif - diff --git a/branches/marnberg/quota/messages/MNSRegister.h b/branches/marnberg/quota/messages/MNSRegister.h deleted file mode 100644 index 01d29a2315fa9..0000000000000 --- a/branches/marnberg/quota/messages/MNSRegister.h +++ /dev/null @@ -1,59 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MNSREGISTER_H -#define __MNSREGISTER_H - -#include "msg/Message.h" -#include "msg/TCPMessenger.h" - -class MNSRegister : public Message { - entity_name_t addr; - int rank; - long tid; - - public: - MNSRegister() {} - MNSRegister(entity_name_t a, int r, int ti) : - Message(MSG_NS_REGISTER) { - addr = a; - rank = r; - tid = ti; - } - - char *get_type_name() { return "NSReg"; } - - entity_name_t get_entity() { return addr; } - int get_rank() { return rank; } - long get_tid() { return tid; } - - void encode_payload() { - payload.append((char*)&addr, sizeof(addr)); - payload.append((char*)&rank, sizeof(rank)); - payload.append((char*)&tid, sizeof(tid)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(addr), (char*)&addr); - off += sizeof(addr); - payload.copy(off, sizeof(rank), (char*)&rank); - off += sizeof(rank); - payload.copy(off, sizeof(tid), (char*)&tid); - off += sizeof(tid); - } -}; - - -#endif - diff --git a/branches/marnberg/quota/messages/MNSRegisterAck.h b/branches/marnberg/quota/messages/MNSRegisterAck.h deleted file mode 100644 index fa2f88ac10e82..0000000000000 --- a/branches/marnberg/quota/messages/MNSRegisterAck.h +++ /dev/null @@ -1,53 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MNSREGISTERACK_H -#define __MNSREGISTERACK_H - -#include "msg/Message.h" -#include "msg/TCPMessenger.h" - -class MNSRegisterAck : public Message { - entity_name_t entity; - long tid; - - public: - MNSRegisterAck() {} - MNSRegisterAck(long t, entity_name_t e) : - Message(MSG_NS_REGISTERACK) { - entity = e; - tid = t; - } - - char *get_type_name() { return "NSRegA"; } - - entity_name_t get_entity() { return entity; } - long get_tid() { return tid; } - - void encode_payload() { - payload.append((char*)&entity, sizeof(entity)); - payload.append((char*)&tid, sizeof(tid)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(entity), (char*)&entity); - off += sizeof(entity); - payload.copy(off, sizeof(tid), (char*)&tid); - off += sizeof(tid); - } -}; - - -#endif - diff --git a/branches/marnberg/quota/messages/MOSDBoot.h b/branches/marnberg/quota/messages/MOSDBoot.h deleted file mode 100644 index cfff1869fbe51..0000000000000 --- a/branches/marnberg/quota/messages/MOSDBoot.h +++ /dev/null @@ -1,44 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MOSDBOOT_H -#define __MOSDBOOT_H - -#include "msg/Message.h" - -#include "include/types.h" -#include "osd/osd_types.h" - -class MOSDBoot : public Message { - public: - OSDSuperblock sb; - - MOSDBoot() {} - MOSDBoot(OSDSuperblock& s) : - Message(MSG_OSD_BOOT), - sb(s) { - } - - char *get_type_name() { return "oboot"; } - - void encode_payload() { - payload.append((char*)&sb, sizeof(sb)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(sb), (char*)&sb); - off += sizeof(sb); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MOSDFailure.h b/branches/marnberg/quota/messages/MOSDFailure.h deleted file mode 100644 index c4a557856594a..0000000000000 --- a/branches/marnberg/quota/messages/MOSDFailure.h +++ /dev/null @@ -1,49 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDFAILURE_H -#define __MOSDFAILURE_H - -#include "msg/Message.h" - - -class MOSDFailure : public Message { - public: - entity_inst_t failed; - epoch_t epoch; - - MOSDFailure() {} - MOSDFailure(entity_inst_t f, epoch_t e) : - Message(MSG_OSD_FAILURE), - failed(f), epoch(e) {} - - entity_inst_t get_failed() { return failed; } - epoch_t get_epoch() { return epoch; } - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(failed), (char*)&failed); - off += sizeof(failed); - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - } - void encode_payload() { - payload.append((char*)&failed, sizeof(failed)); - payload.append((char*)&epoch, sizeof(epoch)); - } - - virtual char *get_type_name() { return "osdfail"; } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MOSDGetMap.h b/branches/marnberg/quota/messages/MOSDGetMap.h deleted file mode 100644 index 58afd527bda93..0000000000000 --- a/branches/marnberg/quota/messages/MOSDGetMap.h +++ /dev/null @@ -1,45 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MOSDGETMAP_H -#define __MOSDGETMAP_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MOSDGetMap : public Message { - public: - epoch_t since; - - //MOSDGetMap() : since(0) {} - MOSDGetMap(epoch_t s=0) : - Message(MSG_OSD_GETMAP), - since(s) { - } - - epoch_t get_since() { return since; } - - char *get_type_name() { return "getomap"; } - - void encode_payload() { - payload.append((char*)&since, sizeof(since)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(since), (char*)&since); - off += sizeof(since); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MOSDIn.h b/branches/marnberg/quota/messages/MOSDIn.h deleted file mode 100644 index 276a930d2e00b..0000000000000 --- a/branches/marnberg/quota/messages/MOSDIn.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __MOSDIN_H -#define __MOSDIN_H - -#include "msg/Message.h" - - -class MOSDIn : public Message { - public: - epoch_t map_epoch; - - MOSDIn(epoch_t e) : Message(MSG_OSD_IN), map_epoch(e) { - } - MOSDIn() {} - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_epoch), (char*)&map_epoch); - off += sizeof(map_epoch); - } - virtual void encode_payload() { - payload.append((char*)&map_epoch, sizeof(map_epoch)); - } - - virtual char *get_type_name() { return "oin"; } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MOSDMap.h b/branches/marnberg/quota/messages/MOSDMap.h deleted file mode 100644 index dd231a831d63d..0000000000000 --- a/branches/marnberg/quota/messages/MOSDMap.h +++ /dev/null @@ -1,69 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDGETMAPACK_H -#define __MOSDGETMAPACK_H - -#include "msg/Message.h" -#include "osd/OSDMap.h" - - -class MOSDMap : public Message { - public: - map maps; - map incremental_maps; - - epoch_t get_first() { - epoch_t e = 0; - map::iterator i = maps.begin(); - if (i != maps.end()) e = i->first; - i = incremental_maps.begin(); - if (i != incremental_maps.end() && - (e == 0 || i->first < e)) e = i->first; - return e; - } - epoch_t get_last() { - epoch_t e = 0; - map::reverse_iterator i = maps.rbegin(); - if (i != maps.rend()) e = i->first; - i = incremental_maps.rbegin(); - if (i != incremental_maps.rend() && - (e == 0 || i->first > e)) e = i->first; - return e; - } - - - MOSDMap() : - Message(MSG_OSD_MAP) {} - MOSDMap(OSDMap *oc) : - Message(MSG_OSD_MAP) { - oc->encode(maps[oc->get_epoch()]); - } - - - // marshalling - virtual void decode_payload() { - int off = 0; - ::_decode(maps, payload, off); - ::_decode(incremental_maps, payload, off); - } - virtual void encode_payload() { - ::_encode(maps, payload); - ::_encode(incremental_maps, payload); - } - - virtual char *get_type_name() { return "omap"; } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MOSDOp.h b/branches/marnberg/quota/messages/MOSDOp.h deleted file mode 100644 index d16b02e8aad51..0000000000000 --- a/branches/marnberg/quota/messages/MOSDOp.h +++ /dev/null @@ -1,221 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MOSDOP_H -#define __MOSDOP_H - -#include "msg/Message.h" -#include "osd/osd_types.h" - -/* - * OSD op - * - * oid - object id - * op - OSD_OP_DELETE, etc. 
- * - */ - -// osd client ops -#define OSD_OP_READ 1 -#define OSD_OP_STAT 2 - -#define OSD_OP_WRNOOP 10 -#define OSD_OP_WRITE 11 -#define OSD_OP_DELETE 12 -#define OSD_OP_TRUNCATE 13 -#define OSD_OP_ZERO 14 - -#define OSD_OP_WRLOCK 20 -#define OSD_OP_WRUNLOCK 21 -#define OSD_OP_RDLOCK 22 -#define OSD_OP_RDUNLOCK 23 -#define OSD_OP_UPLOCK 24 -#define OSD_OP_DNLOCK 25 - -#define OSD_OP_PULL 30 -#define OSD_OP_PUSH 31 - - -class MOSDOp : public Message { -public: - static const char* get_opname(int op) { - switch (op) { - case OSD_OP_READ: return "read"; - case OSD_OP_STAT: return "stat"; - - case OSD_OP_WRNOOP: return "wrnoop"; - case OSD_OP_WRITE: return "write"; - case OSD_OP_ZERO: return "zero"; - case OSD_OP_DELETE: return "delete"; - case OSD_OP_TRUNCATE: return "truncate"; - case OSD_OP_WRLOCK: return "wrlock"; - case OSD_OP_WRUNLOCK: return "wrunlock"; - case OSD_OP_RDLOCK: return "rdlock"; - case OSD_OP_RDUNLOCK: return "rdunlock"; - case OSD_OP_UPLOCK: return "uplock"; - case OSD_OP_DNLOCK: return "dnlock"; - - case OSD_OP_PULL: return "pull"; - case OSD_OP_PUSH: return "push"; - default: assert(0); - } - return 0; - } - -private: - struct { - long pcid; - - // who's asking? - entity_inst_t client; - reqid_t reqid; // minor weirdness: entity_name_t is in reqid_t too. 
- - // for replication - tid_t rep_tid; - - object_t oid; - objectrev_t rev; - pg_t pg; - - epoch_t map_epoch; - - eversion_t pg_trim_to; // primary->replica: trim to here - - int op; - size_t length; - off_t offset; - - eversion_t version; - eversion_t old_version; - - bool want_ack; - bool want_commit; - } st; - - bufferlist data; - map attrset; - - friend class MOSDOpReply; - - public: - const reqid_t& get_reqid() { return st.reqid; } - const tid_t get_client_tid() { return st.reqid.tid; } - int get_client_inc() { return st.reqid.inc; } - - const entity_name_t& get_client() { return st.client.name; } - const entity_inst_t& get_client_inst() { return st.client; } - void set_client_inst(const entity_inst_t& i) { st.client = i; } - - const tid_t get_rep_tid() { return st.rep_tid; } - void set_rep_tid(tid_t t) { st.rep_tid = t; } - - const object_t get_oid() { return st.oid; } - const pg_t get_pg() { return st.pg; } - const epoch_t get_map_epoch() { return st.map_epoch; } - - //const int get_pg_role() { return st.pg_role; } // who am i asking for? 
- const eversion_t get_version() { return st.version; } - //const eversion_t get_old_version() { return st.old_version; } - - void set_rev(objectrev_t r) { st.rev = r; } - objectrev_t get_rev() { return st.rev; } - - const eversion_t get_pg_trim_to() { return st.pg_trim_to; } - void set_pg_trim_to(eversion_t v) { st.pg_trim_to = v; } - - const int get_op() { return st.op; } - void set_op(int o) { st.op = o; } - - const size_t get_length() { return st.length; } - const off_t get_offset() { return st.offset; } - - map& get_attrset() { return attrset; } - void set_attrset(map &as) { attrset = as; } - - const bool wants_ack() { return st.want_ack; } - const bool wants_commit() { return st.want_commit; } - - - void set_data(bufferlist &d) { - data.claim(d); - } - bufferlist& get_data() { - return data; - } - size_t get_data_len() { return data.length(); } - - - // keep a pcid (procedure call id) to match up request+reply - void set_pcid(long pcid) { this->st.pcid = pcid; } - long get_pcid() { return st.pcid; } - - MOSDOp(entity_inst_t asker, int inc, long tid, - object_t oid, pg_t pg, epoch_t mapepoch, int op) : - Message(MSG_OSD_OP) { - memset(&st, 0, sizeof(st)); - this->st.client = asker; - this->st.reqid.name = asker.name; - this->st.reqid.inc = inc; - this->st.reqid.tid = tid; - - this->st.oid = oid; - this->st.pg = pg; - this->st.map_epoch = mapepoch; - this->st.op = op; - - this->st.rep_tid = 0; - - this->st.want_ack = true; - this->st.want_commit = true; - } - MOSDOp() {} - - //void set_pg_role(int r) { st.pg_role = r; } - //void set_rg_nrep(int n) { st.rg_nrep = n; } - - void set_length(size_t l) { st.length = l; } - void set_offset(off_t o) { st.offset = o; } - void set_version(eversion_t v) { st.version = v; } - void set_old_version(eversion_t ov) { st.old_version = ov; } - - void set_want_ack(bool b) { st.want_ack = b; } - void set_want_commit(bool b) { st.want_commit = b; } - - // marshalling - virtual void decode_payload() { - int off = 0; - 
payload.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - ::_decode(attrset, payload, off); - ::_decode(data, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&st, sizeof(st)); - ::_encode(attrset, payload); - ::_encode(data, payload); - } - - virtual char *get_type_name() { return "oop"; } - - void print(ostream& out) { - out << "osd_op(" << st.reqid - << " " << get_opname(st.op) - << " " << st.oid - //<< " " << this - << ")"; - } -}; - - -#endif diff --git a/branches/marnberg/quota/messages/MOSDOpReply.h b/branches/marnberg/quota/messages/MOSDOpReply.h deleted file mode 100644 index 05106e096d176..0000000000000 --- a/branches/marnberg/quota/messages/MOSDOpReply.h +++ /dev/null @@ -1,148 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MOSDOPREPLY_H -#define __MOSDOPREPLY_H - -#include "msg/Message.h" - -#include "MOSDOp.h" -#include "osd/ObjectStore.h" - -/* - * OSD op reply - * - * oid - object id - * op - OSD_OP_DELETE, etc. 
- * - */ - -class MOSDOpReply : public Message { - struct { - // req - reqid_t reqid; - - tid_t rep_tid; - - object_t oid; - pg_t pg; - - int op; - - // reply - int result; - bool commit; - size_t length, offset; - size_t object_size; - eversion_t version; - - eversion_t pg_complete_thru; - - epoch_t map_epoch; - } st; - - bufferlist data; - map attrset; - - public: - const reqid_t& get_reqid() { return st.reqid; } - long get_tid() { return st.reqid.tid; } - long get_rep_tid() { return st.rep_tid; } - object_t get_oid() { return st.oid; } - pg_t get_pg() { return st.pg; } - int get_op() { return st.op; } - bool get_commit() { return st.commit; } - - int get_result() { return st.result; } - size_t get_length() { return st.length; } - size_t get_offset() { return st.offset; } - size_t get_object_size() { return st.object_size; } - eversion_t get_version() { return st.version; } - map& get_attrset() { return attrset; } - - eversion_t get_pg_complete_thru() { return st.pg_complete_thru; } - void set_pg_complete_thru(eversion_t v) { st.pg_complete_thru = v; } - - void set_result(int r) { st.result = r; } - void set_length(size_t s) { st.length = s; } - void set_offset(size_t o) { st.offset = o; } - void set_object_size(size_t s) { st.object_size = s; } - void set_version(eversion_t v) { st.version = v; } - void set_attrset(map &as) { attrset = as; } - - void set_op(int op) { st.op = op; } - void set_rep_tid(tid_t t) { st.rep_tid = t; } - - // data payload - void set_data(bufferlist &d) { - data.claim(d); - } - bufferlist& get_data() { - return data; - } - - // osdmap - epoch_t get_map_epoch() { return st.map_epoch; } - - -public: - MOSDOpReply(MOSDOp *req, int result, epoch_t e, bool commit) : - Message(MSG_OSD_OPREPLY) { - memset(&st, 0, sizeof(st)); - this->st.reqid = req->st.reqid; - this->st.op = req->st.op; - this->st.rep_tid = req->st.rep_tid; - - this->st.oid = req->st.oid; - this->st.pg = req->st.pg; - this->st.result = result; - this->st.commit = commit; - - 
this->st.length = req->st.length; // speculative... OSD should ensure these are correct - this->st.offset = req->st.offset; - this->st.version = req->st.version; - - this->st.map_epoch = e; - } - MOSDOpReply() {} - - - // marshalling - virtual void decode_payload() { - payload.copy(0, sizeof(st), (char*)&st); - payload.splice(0, sizeof(st)); - int off = 0; - ::_decode(attrset, payload, off); - ::_decode(data, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&st, sizeof(st)); - ::_encode(attrset, payload); - ::_encode(data, payload); - } - - virtual char *get_type_name() { return "oopr"; } - - void print(ostream& out) { - out << "osd_op_reply(" << st.reqid - << " " << MOSDOp::get_opname(st.op) - << " " << st.oid << " = " << st.result - //<< " " << this - << ")"; - } - -}; - - -#endif diff --git a/branches/marnberg/quota/messages/MOSDOut.h b/branches/marnberg/quota/messages/MOSDOut.h deleted file mode 100644 index 61a594de3294a..0000000000000 --- a/branches/marnberg/quota/messages/MOSDOut.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __MOSDOUT_H -#define __MOSDOUT_H - -#include "msg/Message.h" - - -class MOSDOut : public Message { - public: - epoch_t map_epoch; - - MOSDOut(epoch_t e) : Message(MSG_OSD_OUT), map_epoch(e) { - } - MOSDOut() {} - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_epoch), (char*)&map_epoch); - off += sizeof(map_epoch); - } - virtual void encode_payload() { - payload.append((char*)&map_epoch, sizeof(map_epoch)); - } - - virtual char *get_type_name() { return "oout"; } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MOSDPGLog.h b/branches/marnberg/quota/messages/MOSDPGLog.h deleted file mode 100644 index e4731c6037107..0000000000000 --- a/branches/marnberg/quota/messages/MOSDPGLog.h +++ /dev/null @@ -1,61 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGLOG_H -#define __MOSDPGLOG_H - -#include "msg/Message.h" - -class MOSDPGLog : public Message { - epoch_t epoch; - pg_t pgid; - -public: - PG::Info info; - PG::Log log; - PG::Missing missing; - - epoch_t get_epoch() { return epoch; } - pg_t get_pgid() { return pgid; } - - MOSDPGLog() {} - MOSDPGLog(version_t mv, pg_t pgid) : - Message(MSG_OSD_PG_LOG) { - this->epoch = mv; - this->pgid = pgid; - } - - char *get_type_name() { return "PGlog"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - payload.append((char*)&pgid, sizeof(pgid)); - payload.append((char*)&info, sizeof(info)); - log._encode(payload); - missing._encode(payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - payload.copy(off, sizeof(pgid), (char*)&pgid); - off += sizeof(pgid); - payload.copy(off, sizeof(info), (char*)&info); - off += sizeof(info); - log._decode(payload, off); - missing._decode(payload, off); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MOSDPGNotify.h b/branches/marnberg/quota/messages/MOSDPGNotify.h deleted file mode 100644 index f6fe8ee88c170..0000000000000 --- a/branches/marnberg/quota/messages/MOSDPGNotify.h +++ /dev/null @@ -1,54 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MOSDPGPEERNOTIFY_H -#define __MOSDPGPEERNOTIFY_H - -#include "msg/Message.h" - -#include "osd/PG.h" - -/* - * PGNotify - notify primary of my PGs and versions. 
- */ - -class MOSDPGNotify : public Message { - epoch_t epoch; - list pg_list; // pgid -> version - - public: - version_t get_epoch() { return epoch; } - list& get_pg_list() { return pg_list; } - - MOSDPGNotify() {} - MOSDPGNotify(epoch_t e, list& l) : - Message(MSG_OSD_PG_NOTIFY) { - this->epoch = e; - pg_list.splice(pg_list.begin(),l); - } - - char *get_type_name() { return "PGnot"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - _encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - _decode(pg_list, payload, off); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MOSDPGPeer.h b/branches/marnberg/quota/messages/MOSDPGPeer.h deleted file mode 100644 index ebe1cda485c4c..0000000000000 --- a/branches/marnberg/quota/messages/MOSDPGPeer.h +++ /dev/null @@ -1,57 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGPEER_H -#define __MOSDPGPEER_H - -#include "msg/Message.h" - - -class MOSDPGPeer : public Message { - __uint64_t map_version; - list pg_list; - - bool complete; - - public: - __uint64_t get_version() { return map_version; } - list& get_pg_list() { return pg_list; } - bool get_complete() { return complete; } - - MOSDPGPeer() {} - MOSDPGPeer(__uint64_t v, list& l, bool c=false) : - Message(MSG_OSD_PG_PEER) { - this->map_version = v; - this->complete = c; - pg_list.splice(pg_list.begin(), l); - } - - char *get_type_name() { return "PGPeer"; } - - void encode_payload() { - payload.append((char*)&map_version, sizeof(map_version)); - payload.append((char*)&complete, sizeof(complete)); - _encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_version), (char*)&map_version); - off += sizeof(map_version); - payload.copy(off, sizeof(complete), (char*)&complete); - off += sizeof(complete); - _decode(pg_list, payload, off); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MOSDPGPeerAck.h b/branches/marnberg/quota/messages/MOSDPGPeerAck.h deleted file mode 100644 index e21a2607bb573..0000000000000 --- a/branches/marnberg/quota/messages/MOSDPGPeerAck.h +++ /dev/null @@ -1,69 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MOSDPGPEERACK_H -#define __MOSDPGPEERACK_H - -#include "msg/Message.h" -#include "osd/OSD.h" - -class MOSDPGPeerAck : public Message { - __uint64_t map_version; - - public: - list pg_dne; // pg dne - map pg_state; // state, lists, etc. 
- - __uint64_t get_version() { return map_version; } - - MOSDPGPeerAck() {} - MOSDPGPeerAck(__uint64_t v) : - Message(MSG_OSD_PG_PEERACK) { - this->map_version = v; - } - - char *get_type_name() { return "PGPeer"; } - - void encode_payload() { - payload.append((char*)&map_version, sizeof(map_version)); - _encode(pg_dne, payload); - - int n = pg_state.size(); - payload.append((char*)&n, sizeof(n)); - for (map::iterator it = pg_state.begin(); - it != pg_state.end(); - it++) { - payload.append((char*)&it->first, sizeof(it->first)); - it->second._encode(payload); - } - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_version), (char*)&map_version); - off += sizeof(map_version); - _decode(pg_dne, payload, off); - - int n; - payload.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPEERREQUEST_H -#define __MOSDPEERREQUEST_H - -#include "msg/Message.h" - - -class MOSDPGPeerRequest : public Message { - __uint64_t map_version; - list pg_list; - - public: - __uint64_t get_version() { return map_version; } - list& get_pg_list() { return pg_list; } - - MOSDPGPeerRequest() {} - MOSDPGPeerRequest(__uint64_t v, list& l) : - Message(MSG_OSD_PG_PEERREQUEST) { - this->map_version = v; - pg_list.splice(pg_list.begin(), l); - } - - char *get_type_name() { return "PGPR"; } - - void encode_payload() { - payload.append((char*)&map_version, sizeof(map_version)); - _encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_version), (char*)&map_version); - off += sizeof(map_version); - _decode(pg_list, payload, off); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MOSDPGQuery.h b/branches/marnberg/quota/messages/MOSDPGQuery.h deleted file mode 100644 index 926acce81349d..0000000000000 --- a/branches/marnberg/quota/messages/MOSDPGQuery.h +++ /dev/null @@ -1,51 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGQUERY_H -#define __MOSDPGQUERY_H - -#include "msg/Message.h" - -/* - * PGQuery - query another OSD as to the contents of their PGs - */ - -class MOSDPGQuery : public Message { - version_t epoch; - - public: - version_t get_epoch() { return epoch; } - map pg_list; - - MOSDPGQuery() {} - MOSDPGQuery(epoch_t e, map& ls) : - Message(MSG_OSD_PG_QUERY), - epoch(e), pg_list(ls) { - } - - char *get_type_name() { return "PGq"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - ::_encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - ::_decode(pg_list, payload, off); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MOSDPGRemove.h b/branches/marnberg/quota/messages/MOSDPGRemove.h deleted file mode 100644 index 9629a3782764b..0000000000000 --- a/branches/marnberg/quota/messages/MOSDPGRemove.h +++ /dev/null @@ -1,51 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGREMOVE_H -#define __MOSDPGREMOVE_H - -#include "msg/Message.h" - - -class MOSDPGRemove : public Message { - epoch_t epoch; - - public: - set pg_list; - - epoch_t get_epoch() { return epoch; } - - MOSDPGRemove() {} - MOSDPGRemove(epoch_t e, set& l) : - Message(MSG_OSD_PG_REMOVE) { - this->epoch = e; - pg_list = l; - } - - char *get_type_name() { return "PGrm"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - _encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - _decode(pg_list, payload, off); - } - -}; - -#endif diff --git a/branches/marnberg/quota/messages/MOSDPGSummary.h b/branches/marnberg/quota/messages/MOSDPGSummary.h deleted file mode 100644 index dc4af837209bb..0000000000000 --- a/branches/marnberg/quota/messages/MOSDPGSummary.h +++ /dev/null @@ -1,65 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGQUERYREPLY_H -#define __MOSDPGQUERYREPLY_H - -#include "msg/Message.h" - -class MOSDPGSummary : public Message { - epoch_t epoch; - pg_t pgid; - -public: - PG::PGInfo info; - bufferlist sumbl; - - epoch_t get_epoch() { return epoch; } - - MOSDPGSummary() {} - MOSDPGSummary(version_t mv, pg_t pgid, PG::PGSummary &summary) : - Message(MSG_OSD_PG_SUMMARY) { - this->epoch = mv; - this->pgid = pgid; - summary._encode(sumbl); - } - - pg_t get_pgid() { return pgid; } - bufferlist& get_summary_bl() { - return sumbl; - } - - char *get_type_name() { return "PGsum"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - payload.append((char*)&pgid, sizeof(pgid)); - payload.append((char*)&info, sizeof(info)); - payload.claim_append(sumbl); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - payload.copy(off, sizeof(pgid), (char*)&pgid); - off += sizeof(pgid); - payload.copy(off, sizeof(info), (char*)&info); - off += sizeof(info); - - payload.splice(0, off); - sumbl.claim(payload); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MOSDPGUpdate.h b/branches/marnberg/quota/messages/MOSDPGUpdate.h deleted file mode 100644 index 93809d6820d21..0000000000000 --- a/branches/marnberg/quota/messages/MOSDPGUpdate.h +++ /dev/null @@ -1,64 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGUPDATE_H -#define __MOSDPGUPDATE_H - -#include "msg/Message.h" - -class MOSDPGUpdate : public Message { - version_t map_version; - pg_t pgid; - //pginfo_t info; - bool complete; - version_t last_any_complete; - - public: - version_t get_version() { return map_version; } - pg_t get_pgid() { return pgid; } - //pginfo_t& get_pginfo() { return info; } - bool is_complete() { return complete; } - version_t get_last_any_complete() { return last_any_complete; } - - MOSDPGUpdate() {} - MOSDPGUpdate(version_t mv, pg_t pgid, bool complete, version_t last_any_complete) : - Message(MSG_OSD_PG_UPDATE) { - this->map_version = mv; - this->pgid = pgid; - this->complete = complete; - this->last_any_complete = last_any_complete; - } - - char *get_type_name() { return "PGUp"; } - - void encode_payload() { - payload.append((char*)&map_version, sizeof(map_version)); - payload.append((char*)&pgid, sizeof(pgid)); - payload.append((char*)&complete, sizeof(complete)); - payload.append((char*)&last_any_complete, sizeof(last_any_complete)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_version), (char*)&map_version); - off += sizeof(map_version); - payload.copy(off, sizeof(pgid), (char*)&pgid); - off += sizeof(pgid); - payload.copy(off, sizeof(complete), (char*)&complete); - off += sizeof(complete); - payload.copy(off, sizeof(last_any_complete), (char*)&last_any_complete); - off += sizeof(last_any_complete); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MOSDPing.h b/branches/marnberg/quota/messages/MOSDPing.h deleted file mode 100644 index fae80edd91cfc..0000000000000 --- a/branches/marnberg/quota/messages/MOSDPing.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser 
General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MOSDPING_H -#define __MOSDPING_H - -#include "msg/Message.h" - - -class MOSDPing : public Message { - public: - epoch_t map_epoch; - bool ack; - float avg_qlen; - - MOSDPing(epoch_t e, - float aq, - bool a=false) : Message(MSG_OSD_PING), map_epoch(e), ack(a), avg_qlen(aq) { - } - MOSDPing() {} - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_epoch), (char*)&map_epoch); - off += sizeof(map_epoch); - payload.copy(off, sizeof(ack), (char*)&ack); - off += sizeof(ack); - payload.copy(off, sizeof(avg_qlen), (char*)&avg_qlen); - off += sizeof(avg_qlen); - } - virtual void encode_payload() { - payload.append((char*)&map_epoch, sizeof(map_epoch)); - payload.append((char*)&ack, sizeof(ack)); - payload.append((char*)&avg_qlen, sizeof(avg_qlen)); - } - - virtual char *get_type_name() { return "oping"; } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MPing.h b/branches/marnberg/quota/messages/MPing.h deleted file mode 100644 index 65b65a738cd66..0000000000000 --- a/branches/marnberg/quota/messages/MPing.h +++ /dev/null @@ -1,41 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __MPING_H -#define __MPING_H - -#include "msg/Message.h" - - -class MPing : public Message { - public: - int seq; - MPing(int s) : Message(MSG_PING) { - seq = s; - } - MPing() : Message(MSG_PING) {} - - virtual void decode_payload(crope& s, int& off) { - s.copy(0, sizeof(seq), (char*)&seq); - off += sizeof(seq); - } - virtual void encode_payload(crope& s) { - s.append((char*)&seq, sizeof(seq)); - } - - virtual char *get_type_name() { return "ping"; } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MPingAck.h b/branches/marnberg/quota/messages/MPingAck.h deleted file mode 100644 index 0ee385b7a2b80..0000000000000 --- a/branches/marnberg/quota/messages/MPingAck.h +++ /dev/null @@ -1,40 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MPINGACK_H -#define __MPINGACK_H - -#include "MPing.h" - - -class MPingAck : public Message { - public: - int seq; - MPingAck() {} - MPingAck(MPing *p) : Message(MSG_PING_ACK) { - this->seq = p->seq; - } - - virtual void decode_payload(crope& s, int& off) { - s.copy(0, sizeof(seq), (char*)&seq); - off += sizeof(seq); - } - virtual void encode_payload(crope& s) { - s.append((char*)&seq, sizeof(seq)); - } - - virtual char *get_type_name() { return "pinga"; } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MRename.h b/branches/marnberg/quota/messages/MRename.h deleted file mode 100644 index e648f3e652fc7..0000000000000 --- a/branches/marnberg/quota/messages/MRename.h +++ /dev/null @@ -1,80 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MRENAME_H -#define __MRENAME_H - -class MRename : public Message { - inodeno_t srcdirino; - string srcname; - inodeno_t destdirino; - string destname; - int initiator; - - bufferlist inode_state; - - public: - int get_initiator() { return initiator; } - inodeno_t get_srcdirino() { return srcdirino; } - string& get_srcname() { return srcname; } - inodeno_t get_destdirino() { return destdirino; } - string& get_destname() { return destname; } - bufferlist& get_inode_state() { return inode_state; } - - MRename() {} - MRename(int initiator, - inodeno_t srcdirino, - const string& srcname, - inodeno_t destdirino, - const string& destname, - bufferlist& inode_state) : - Message(MSG_MDS_RENAME) { - this->initiator = initiator; - this->srcdirino = srcdirino; - this->srcname = srcname; - this->destdirino = destdirino; - this->destname = destname; - this->inode_state.claim( inode_state ); - } - virtual char *get_type_name() { return "Rn";} - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(initiator), (char*)&initiator); - off += sizeof(initiator); - payload.copy(off, sizeof(srcdirino), (char*)&srcdirino); - off += sizeof(srcdirino); - payload.copy(off, sizeof(destdirino), (char*)&destdirino); - off += sizeof(destdirino); - _decode(srcname, payload, off); - _decode(destname, payload, off); - size_t len; - payload.copy(off, sizeof(len), (char*)&len); - off += sizeof(len); - inode_state.substr_of(payload, off, len); - off += len; - } - virtual void encode_payload() { - payload.append((char*)&initiator,sizeof(initiator)); - payload.append((char*)&srcdirino,sizeof(srcdirino)); - payload.append((char*)&destdirino,sizeof(destdirino)); - _encode(srcname, payload); - _encode(destname, payload); - size_t len = inode_state.length(); - payload.append((char*)&len, sizeof(len)); - payload.claim_append(inode_state); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MRenameAck.h b/branches/marnberg/quota/messages/MRenameAck.h 
deleted file mode 100644 index 14843cef5f616..0000000000000 --- a/branches/marnberg/quota/messages/MRenameAck.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MRENAMEACK_H -#define __MRENAMEACK_H - -/* FIXME: relateive to dn, not inode */ - -class MRenameAck : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MRenameAck() {} - MRenameAck(inodeno_t ino) : - Message(MSG_MDS_RENAMEACK) { - this->ino = ino; - } - virtual char *get_type_name() { return "RnAck";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - } - virtual void encode_payload(crope& s) { - s.append((char*)&ino,sizeof(ino)); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MRenameNotify.h b/branches/marnberg/quota/messages/MRenameNotify.h deleted file mode 100644 index bc32300b82e3a..0000000000000 --- a/branches/marnberg/quota/messages/MRenameNotify.h +++ /dev/null @@ -1,80 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MRENAMENOTIFY_H -#define __MRENAMENOTIFY_H - -class MRenameNotify : public Message { - inodeno_t ino; - inodeno_t srcdirino; - string srcname; - inodeno_t destdirino; - string destname; - string destdirpath; - int srcauth; - - public: - inodeno_t get_ino() { return ino; } - inodeno_t get_srcdirino() { return srcdirino; } - string& get_srcname() { return srcname; } - inodeno_t get_destdirino() { return destdirino; } - string& get_destname() { return destname; } - string& get_destdirpath() { return destdirpath; } - int get_srcauth() { return srcauth; } - - MRenameNotify() {} - MRenameNotify(inodeno_t ino, - inodeno_t srcdirino, - const string& srcname, - inodeno_t destdirino, - const string& destdirpath, - const string& destname, - int srcauth - ) : - Message(MSG_MDS_RENAMENOTIFY) { - this->ino = ino; - this->srcdirino = srcdirino; - this->srcname = srcname; - this->destdirino = destdirino; - this->destname = destname; - this->destdirpath = destdirpath; - this->srcauth = srcauth; - } - virtual char *get_type_name() { return "Rnot";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - s.copy(off, sizeof(srcdirino), (char*)&srcdirino); - off += sizeof(srcdirino); - s.copy(off, sizeof(destdirino), (char*)&destdirino); - off += sizeof(destdirino); - _unrope(srcname, s, off); - _unrope(destname, s, off); - _unrope(destdirpath, s, off); - s.copy(off, sizeof(srcauth), (char*)&srcauth); - off += sizeof(srcauth); - } - virtual void encode_payload(crope& s) { - s.append((char*)&ino,sizeof(ino)); - s.append((char*)&srcdirino,sizeof(srcdirino)); - s.append((char*)&destdirino,sizeof(destdirino)); - _rope(srcname, s); - _rope(destname, s); - _rope(destdirpath, s); - s.append((char*)&srcauth, sizeof(srcauth)); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MRenameNotifyAck.h b/branches/marnberg/quota/messages/MRenameNotifyAck.h deleted file mode 100644 index 
d1a01339cd97a..0000000000000 --- a/branches/marnberg/quota/messages/MRenameNotifyAck.h +++ /dev/null @@ -1,40 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MRENAMENOTIFYACK_H -#define __MRENAMENOTIFYACK_H - -class MRenameNotifyAck : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MRenameNotifyAck() {} - MRenameNotifyAck(inodeno_t ino) : - Message(MSG_MDS_RENAMENOTIFYACK) { - this->ino = ino; - } - virtual char *get_type_name() { return "RnotA";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - } - virtual void encode_payload(crope& s) { - s.append((char*)&ino,sizeof(ino)); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MRenamePrep.h b/branches/marnberg/quota/messages/MRenamePrep.h deleted file mode 100644 index 1af798c674489..0000000000000 --- a/branches/marnberg/quota/messages/MRenamePrep.h +++ /dev/null @@ -1,85 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MRENAMEPREP_H -#define __MRENAMEPREP_H - -class MRenamePrep : public Message { - inodeno_t srcdirino; - string srcname; - string srcpath; - inodeno_t destdirino; - string destname; - string destpath; - int initiator; - int srcauth; - - public: - int get_initiator() { return initiator; } - inodeno_t get_srcdirino() { return srcdirino; } - string& get_srcname() { return srcname; } - string& get_srcpath() { return srcpath; } - int get_srcauth() { return srcauth; } - inodeno_t get_destdirino() { return destdirino; } - string& get_destname() { return destname; } - string& get_destpath() { return destpath; } - - MRenamePrep() {} - MRenamePrep(int initiator, - inodeno_t srcdirino, - const string& srcname, - const string& srcpath, - inodeno_t destdirino, - const string& destname, - const string& destpath, - int srcauth) : - Message(MSG_MDS_RENAMEPREP) { - this->initiator = initiator; - this->srcdirino = srcdirino; - this->srcname = srcname; - this->srcpath = srcpath; - this->destdirino = destdirino; - this->destname = destname; - this->destpath = destpath; - this->srcauth = srcauth; - } - virtual char *get_type_name() { return "RnP";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(initiator), (char*)&initiator); - off += sizeof(initiator); - s.copy(off, sizeof(srcdirino), (char*)&srcdirino); - off += sizeof(srcdirino); - s.copy(off, sizeof(destdirino), (char*)&destdirino); - off += sizeof(destdirino); - _unrope(srcname, s, off); - _unrope(srcpath, s, off); - _unrope(destname, s, off); - _unrope(destpath, s, off); - s.copy(off, sizeof(srcauth), (char*)&srcauth); - off += sizeof(srcauth); - } - virtual void encode_payload(crope& s) { - s.append((char*)&initiator,sizeof(initiator)); - s.append((char*)&srcdirino,sizeof(srcdirino)); - s.append((char*)&destdirino,sizeof(destdirino)); - _rope(srcname, s); - _rope(srcpath, s); - _rope(destname, s); - _rope(destpath, s); - s.append((char*)&srcauth, sizeof(srcauth)); - } -}; - 
-#endif diff --git a/branches/marnberg/quota/messages/MRenameReq.h b/branches/marnberg/quota/messages/MRenameReq.h deleted file mode 100644 index b70e96a38203b..0000000000000 --- a/branches/marnberg/quota/messages/MRenameReq.h +++ /dev/null @@ -1,79 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MRENAMEREQ_H -#define __MRENAMEREQ_H - -class MRenameReq : public Message { - int initiator; - inodeno_t srcdirino; - string srcname; - inodeno_t destdirino; - string destname; - string destpath; - int destauth; - - public: - int get_initiator() { return initiator; } - inodeno_t get_srcdirino() { return srcdirino; } - string& get_srcname() { return srcname; } - inodeno_t get_destdirino() { return destdirino; } - string& get_destname() { return destname; } - string& get_destpath() { return destpath; } - int get_destauth() { return destauth; } - - MRenameReq() {} - MRenameReq(int initiator, - inodeno_t srcdirino, - const string& srcname, - inodeno_t destdirino, - const string& destname, - const string& destpath, - int destauth) : - Message(MSG_MDS_RENAMEREQ) { - this->initiator = initiator; - this->srcdirino = srcdirino; - this->srcname = srcname; - this->destdirino = destdirino; - this->destname = destname; - this->destpath = destpath; - this->destauth = destauth; - } - virtual char *get_type_name() { return "RnReq";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(initiator), (char*)&initiator); - off += sizeof(initiator); - s.copy(off, sizeof(srcdirino), (char*)&srcdirino); - off += sizeof(srcdirino); - s.copy(off, sizeof(destdirino), (char*)&destdirino); - off += sizeof(destdirino); - 
_unrope(srcname, s, off); - _unrope(destname, s, off); - _unrope(destpath, s, off); - s.copy(off, sizeof(destauth), (char*)&destauth); - off += sizeof(destauth); - } - virtual void encode_payload(crope& s) { - s.append((char*)&initiator,sizeof(initiator)); - s.append((char*)&srcdirino,sizeof(srcdirino)); - s.append((char*)&destdirino,sizeof(destdirino)); - _rope(srcname, s); - _rope(destname, s); - _rope(destpath, s); - s.append((char*)&destauth, sizeof(destauth)); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MRenameWarning.h b/branches/marnberg/quota/messages/MRenameWarning.h deleted file mode 100644 index 85463dfd2c179..0000000000000 --- a/branches/marnberg/quota/messages/MRenameWarning.h +++ /dev/null @@ -1,40 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MRENAMEWARNING_H -#define __MRENAMEWARNING_H - -class MRenameWarning : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MRenameWarning() {} - MRenameWarning(inodeno_t ino) : - Message(MSG_MDS_RENAMEWARNING) { - this->ino = ino; - } - virtual char *get_type_name() { return "RnW";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - } - virtual void encode_payload(crope& s) { - s.append((char*)&ino,sizeof(ino)); - } -}; - -#endif diff --git a/branches/marnberg/quota/messages/MUnhashDir.h b/branches/marnberg/quota/messages/MUnhashDir.h deleted file mode 100644 index 911a14d9c9592..0000000000000 --- a/branches/marnberg/quota/messages/MUnhashDir.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MUNHASHDIR_H -#define __MUNHASHDIR_H - -#include "msg/Message.h" - -class MUnhashDir : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MUnhashDir() {} - MUnhashDir(inodeno_t ino) : - Message(MSG_MDS_UNHASHDIR) { - this->ino = ino; - } - virtual char *get_type_name() { return "UH"; } - - virtual void decode_payload() { - payload.copy(0, sizeof(ino), (char*)&ino); - } - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - } - -}; - -#endif diff --git a/branches/marnberg/quota/messages/MUnhashDirAck.h b/branches/marnberg/quota/messages/MUnhashDirAck.h deleted file mode 100644 index e052683e736c3..0000000000000 --- a/branches/marnberg/quota/messages/MUnhashDirAck.h +++ /dev/null @@ -1,65 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MUNHASHDIRACK_H -#define __MUNHASHDIRACK_H - -#include "msg/Message.h" - -class MUnhashDirAck : public Message { - inodeno_t ino; - bufferlist state; - int nden; - - public: - MUnhashDirAck() {} - MUnhashDirAck(inodeno_t ino, bufferlist& bl, int nden) : - Message(MSG_MDS_UNHASHDIRACK) { - this->ino = ino; - state.claim(bl); - this->nden = nden; - } - virtual char *get_type_name() { return "UHaA"; } - - inodeno_t get_ino() { return ino; } - bufferlist& get_state() { return state; } - bufferlist* get_state_ptr() { return &state; } - int get_nden() { return nden; } - - //void set_nden(int n) { nden = n; } - //void inc_nden() { nden++; } - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - payload.copy(off, sizeof(nden), (char*)&nden); - off += sizeof(nden); - - size_t len; - payload.copy(off, sizeof(len), (char*)&len); - off += sizeof(len); - state.substr_of(payload, off, len); - } - void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - payload.append((char*)&nden, sizeof(nden)); - size_t size = state.length(); - payload.append((char*)&size, sizeof(size)); - payload.claim_append(state); - } - -}; - -#endif diff --git a/branches/marnberg/quota/messages/MUnhashDirNotify.h b/branches/marnberg/quota/messages/MUnhashDirNotify.h deleted file mode 100644 index a9d6707a3aa25..0000000000000 --- a/branches/marnberg/quota/messages/MUnhashDirNotify.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MUNHASHDIRNOTIFY_H -#define __MUNHASHDIRNOTIFY_H - -#include "msg/Message.h" - -class MUnhashDirNotify : public Message { - inodeno_t ino; - //int peer; - - public: - inodeno_t get_ino() { return ino; } - //int get_peer() { return peer; } - - MUnhashDirNotify() {} - MUnhashDirNotify(inodeno_t ino/*, int peer*/) : - Message(MSG_MDS_UNHASHDIRNOTIFY) { - this->ino = ino; - //this->peer = peer; - } - virtual char *get_type_name() { return "UHN"; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - //payload.copy(off, sizeof(peer), (char*)&peer); - //off += sizeof(peer); - } - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - //payload.append((char*)&peer, sizeof(peer)); - } - -}; - -#endif diff --git a/branches/marnberg/quota/messages/MUnhashDirNotifyAck.h b/branches/marnberg/quota/messages/MUnhashDirNotifyAck.h deleted file mode 100644 index ad4843676f0fb..0000000000000 --- a/branches/marnberg/quota/messages/MUnhashDirNotifyAck.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MUNHASHDIRNOTIFYACK_H -#define __MUNHASHDIRNOTIFYACK_H - -#include "msg/Message.h" - -class MUnhashDirNotifyAck : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MUnhashDirNotifyAck() {} - MUnhashDirNotifyAck(inodeno_t ino) : - Message(MSG_MDS_UNHASHDIRNOTIFYACK) { - this->ino = ino; - } - virtual char *get_type_name() { return "UHNa"; } - - virtual void decode_payload() { - payload.copy(0, sizeof(ino), (char*)&ino); - } - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - } - -}; - -#endif diff --git a/branches/marnberg/quota/messages/MUnhashDirPrep.h b/branches/marnberg/quota/messages/MUnhashDirPrep.h deleted file mode 100644 index c4dc2ea422cd9..0000000000000 --- a/branches/marnberg/quota/messages/MUnhashDirPrep.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MUNHASHDIRPREP_H -#define __MUNHASHDIRPREP_H - -#include "msg/Message.h" - -class MUnhashDirPrep : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MUnhashDirPrep() {} - MUnhashDirPrep(inodeno_t ino) : - Message(MSG_MDS_UNHASHDIRPREP) { - this->ino = ino; - } - virtual char *get_type_name() { return "UHP"; } - - virtual void decode_payload() { - payload.copy(0, sizeof(ino), (char*)&ino); - } - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - } - -}; - -#endif diff --git a/branches/marnberg/quota/messages/MUnhashDirPrepAck.h b/branches/marnberg/quota/messages/MUnhashDirPrepAck.h deleted file mode 100644 index bd7e93981964b..0000000000000 --- a/branches/marnberg/quota/messages/MUnhashDirPrepAck.h +++ /dev/null @@ -1,93 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MUNHASHDIRPREPACK_H -#define __MUNHASHDIRPREPACK_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MUnhashDirPrepAck : public Message { - inodeno_t ino; - bool assim; - - // subdir dentry names + inodes - map inodes; - - public: - inodeno_t get_ino() { return ino; } - map& get_inodes() { return inodes; } - - bool did_assim() { return assim; } - void mark_assim() { assert(!assim); assim = true; } - - MUnhashDirPrepAck() : assim(false) { } - MUnhashDirPrepAck(inodeno_t ino) : - Message(MSG_MDS_UNHASHDIRPREPACK), - assim(false) { - this->ino = ino; - } - ~MUnhashDirPrepAck() { - for (map::iterator it = inodes.begin(); - it != inodes.end(); - it++) - delete it->second; - } - - - virtual char *get_type_name() { return "HP"; } - - void add_inode(const string& dentry, CInodeDiscover *in) { - inodes[dentry] = in; - } - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - - // inodes - int ni; - payload.copy(off, sizeof(int), (char*)&ni); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - - inodes[dname] = in; - } - } - - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - - // inodes - int ni = inodes.size(); - payload.append((char*)&ni, sizeof(int)); - for (map::iterator iit = inodes.begin(); - iit != inodes.end(); - iit++) { - _encode(iit->first, payload); // dentry - iit->second->_encode(payload); // inode - } - } -}; - -#endif diff --git a/branches/marnberg/quota/mkmonmap.cc b/branches/marnberg/quota/mkmonmap.cc deleted file mode 100644 index 1ec4c808d6204..0000000000000 --- a/branches/marnberg/quota/mkmonmap.cc +++ /dev/null @@ -1,67 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU 
Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include -#include -#include - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" - - - - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - MonMap monmap; - - char *outfn = ".ceph_monmap"; - - for (unsigned i=0; i= 0); - - return 0; -} diff --git a/branches/marnberg/quota/mon/ClientMonitor.cc b/branches/marnberg/quota/mon/ClientMonitor.cc deleted file mode 100644 index 8ab59504d4bae..0000000000000 --- a/branches/marnberg/quota/mon/ClientMonitor.cc +++ /dev/null @@ -1,109 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "ClientMonitor.h" -#include "Monitor.h" -#include "MDSMonitor.h" - -#include "messages/MClientBoot.h" -#include "messages/MMDSMap.h" -//#include "messages/MMDSFailure.h" - -#include "common/Timer.h" - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".client " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? 
(const char*)"(peon)":(const char*)"(?\?)"))) << ".client " - - - - -void ClientMonitor::dispatch(Message *m) -{ - switch (m->get_type()) { - - case MSG_CLIENT_BOOT: - handle_client_boot((MClientBoot*)m); - break; - - /* - case MSG_client_FAILURE: - handle_client_failure((MClientFailure*)m); - break; - */ - - default: - assert(0); - } -} - -void ClientMonitor::handle_client_boot(MClientBoot *m) -{ - dout(7) << "client_boot from " << m->get_source() << " at " << m->get_source_inst() << endl; - assert(m->get_source().is_client()); - int from = m->get_source().num(); - - // choose a client id - if (from < 0 || - (client_map.count(m->get_source()) && client_map[m->get_source()] != m->get_source_addr())) { - from = ++num_clients; - dout(10) << "client_boot assigned client" << from << endl; - } - - client_map[MSG_ADDR_CLIENT(from)] = m->get_source_addr(); - - // reply with latest mds map - entity_inst_t to = m->get_source_inst(); - to.name = MSG_ADDR_CLIENT(from); - mon->mdsmon->send_latest(to); - delete m; -} - -/* -void ClientMonitor::handle_mds_shutdown(Message *m) -{ - assert(m->get_source().is_mds()); - int from = m->get_source().num(); - - mdsmap.mds_inst.erase(from); - mdsmap.all_mds.erase(from); - - dout(7) << "mds_shutdown from " << m->get_source() - << ", still have " << mdsmap.all_mds - << endl; - - // tell someone? 
- // fixme - - delete m; -} - -*/ - -/* -void ClientMonitor::bcast_latest_mds() -{ - dout(10) << "bcast_latest_mds " << mdsmap.get_epoch() << endl; - - // tell mds - for (set::iterator p = mdsmap.get_mds().begin(); - p != mdsmap.get_mds().end(); - p++) { - if (mdsmap.is_down(*p)) continue; - send_full(MSG_ADDR_MDS(*p), mdsmap.get_inst(*p)); - } -} - -*/ diff --git a/branches/marnberg/quota/mon/ClientMonitor.h b/branches/marnberg/quota/mon/ClientMonitor.h deleted file mode 100644 index c3ea253bafc48..0000000000000 --- a/branches/marnberg/quota/mon/ClientMonitor.h +++ /dev/null @@ -1,52 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __CLIENTMONITOR_H -#define __CLIENTMONITOR_H - -#include -#include -using namespace std; - -#include "include/types.h" -#include "msg/Messenger.h" - -#include "mds/MDSMap.h" - -class Monitor; - -class ClientMonitor : public Dispatcher { - Monitor *mon; - Messenger *messenger; - Mutex &lock; - - private: - int num_clients; - map client_map; - - void bcast_latest_mds(); - - //void accept_pending(); // accept pending, new map. 
- //void send_incremental(epoch_t since, msg_addr_t dest); - - void handle_client_boot(class MClientBoot *m); - - public: - ClientMonitor(Monitor *mn, Messenger *m, Mutex& l) : mon(mn), messenger(m), lock(l), - num_clients(0) { } - - void dispatch(Message *m); - void tick(); // check state, take actions -}; - -#endif diff --git a/branches/marnberg/quota/mon/Elector.cc b/branches/marnberg/quota/mon/Elector.cc deleted file mode 100644 index d3098ba065a47..0000000000000 --- a/branches/marnberg/quota/mon/Elector.cc +++ /dev/null @@ -1,219 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "Elector.h" -#include "Monitor.h" - -#include "common/Timer.h" - -#include "messages/MMonElectionPropose.h" -#include "messages/MMonElectionAck.h" -#include "messages/MMonElectionVictory.h" - -#include "config.h" -#undef dout -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".elector " -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".elector " - - -void Elector::start() -{ - dout(5) << "start -- can i be leader?" 
<< endl; - - leader_acked = -1; - - // start by trying to elect me - start_stamp = g_clock.now(); - acked_me.clear(); - acked_me.insert(whoami); - electing_me = true; - - // bcast to everyone else - for (int i=0; imonmap->num_mon; ++i) { - if (i == whoami) continue; - mon->messenger->send_message(new MMonElectionPropose, - mon->monmap->get_inst(i)); - } - - reset_timer(); -} - -void Elector::defer(int who) -{ - dout(5) << "defer to " << who << endl; - - if (electing_me) { - acked_me.clear(); - electing_me = false; - } - - // ack them - leader_acked = who; - ack_stamp = g_clock.now(); - mon->messenger->send_message(new MMonElectionAck, - mon->monmap->get_inst(who)); - - // set a timer - reset_timer(1.0); // give the leader some extra time to declare victory -} - - -class C_Mon_ElectionExpire : public Context { - Elector *elector; -public: - C_Mon_ElectionExpire(Elector *e) : elector(e) { } - void finish(int r) { - elector->expire(); - } -}; - -void Elector::reset_timer(double plus) -{ - // set the timer - cancel_timer(); - expire_event = new C_Mon_ElectionExpire(this); - g_timer.add_event_after(g_conf.mon_lease + plus, - expire_event); -} - - -void Elector::cancel_timer() -{ - if (expire_event) - g_timer.cancel_event(expire_event); -} - -void Elector::expire() -{ - dout(5) << "election timer expired" << endl; - - // did i win? - if (electing_me && - acked_me.size() > (unsigned)(mon->monmap->num_mon / 2)) { - // i win - victory(); - } else { - // whoever i deferred to didn't declare victory quickly enough. 
- start(); - } -} - - -void Elector::victory() -{ - leader_acked = -1; - electing_me = false; - - cancel_timer(); - - // tell everyone - for (int i=0; imonmap->num_mon; ++i) { - if (i == whoami) continue; - mon->messenger->send_message(new MMonElectionVictory, - mon->monmap->get_inst(i)); - } - - // tell monitor - mon->win_election(acked_me); -} - - -void Elector::handle_propose(MMonElectionPropose *m) -{ - dout(5) << "handle_propose from " << m->get_source() << endl; - int from = m->get_source().num(); - - if (from > whoami) { - // wait, i should win! - if (!electing_me) - start(); - } else { - // they would win over me - if (leader_acked < 0 || // haven't acked anyone yet, or - leader_acked > from || // they would win over who you did ack, or - leader_acked == from) { // this is the guy we're already deferring to - defer(from); - } else { - // ignore them! - dout(5) << "no, we already acked " << leader_acked << endl; - } - } - - delete m; -} - -void Elector::handle_ack(MMonElectionAck *m) -{ - dout(5) << "handle_ack from " << m->get_source() << endl; - int from = m->get_source().num(); - - if (electing_me) { - // thanks - acked_me.insert(from); - dout(5) << " so far i have " << acked_me << endl; - - // is that _everyone_? - if (acked_me.size() == (unsigned)mon->monmap->num_mon) { - // if yes, shortcut to election finish - victory(); - } - } else { - // ignore, i'm deferring already. - } - - delete m; -} - -void Elector::handle_victory(MMonElectionVictory *m) -{ - dout(5) << "handle_victory from " << m->get_source() << endl; - int from = m->get_source().num(); - - if (from < whoami) { - // ok, fine, they win - mon->lose_election(from); - - // cancel my timer - cancel_timer(); - } else { - // no, that makes no sense, i should win. start over! 
- start(); - } -} - - - - -void Elector::dispatch(Message *m) -{ - switch (m->get_type()) { - case MSG_MON_ELECTION_ACK: - handle_ack((MMonElectionAck*)m); - break; - - case MSG_MON_ELECTION_PROPOSE: - handle_propose((MMonElectionPropose*)m); - break; - - case MSG_MON_ELECTION_VICTORY: - handle_victory((MMonElectionVictory*)m); - break; - - default: - assert(0); - } -} - - - - diff --git a/branches/marnberg/quota/mon/Elector.h b/branches/marnberg/quota/mon/Elector.h deleted file mode 100644 index 67ed59945c46b..0000000000000 --- a/branches/marnberg/quota/mon/Elector.h +++ /dev/null @@ -1,72 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MON_ELECTOR_H -#define __MON_ELECTOR_H - -#include -using namespace std; - -#include "include/types.h" -#include "msg/Message.h" - -#include "include/Context.h" - -#include "common/Timer.h" - -class Monitor; - - -class Elector { - private: - Monitor *mon; - int whoami; - - Context *expire_event; - - void reset_timer(double plus=0.0); - void cancel_timer(); - - // electing me - bool electing_me; - utime_t start_stamp; - set acked_me; - - // electing them - int leader_acked; // who i've acked - utime_t ack_stamp; // and when - - public: - - void start(); // start an electing me - void defer(int who); - void expire(); // timer goes off - void victory(); - - void handle_propose(class MMonElectionPropose *m); - void handle_ack(class MMonElectionAck *m); - void handle_victory(class MMonElectionVictory *m); - - - public: - Elector(Monitor *m, int w) : mon(m), whoami(w) { - // initialize all those values! - // ... 
- } - - void dispatch(Message *m); -}; - - -#endif diff --git a/branches/marnberg/quota/mon/MDSMonitor.cc b/branches/marnberg/quota/mon/MDSMonitor.cc deleted file mode 100644 index 24beadf85e9f0..0000000000000 --- a/branches/marnberg/quota/mon/MDSMonitor.cc +++ /dev/null @@ -1,370 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "MDSMonitor.h" -#include "Monitor.h" -#include "MonitorStore.h" - -#include "messages/MMDSMap.h" -#include "messages/MMDSGetMap.h" -#include "messages/MMDSBeacon.h" - -#include "common/Timer.h" - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".mds e" << mdsmap.get_epoch() << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".mds e" << mdsmap.get_epoch() << " " - - - -/********* MDS map **************/ - -void MDSMonitor::dispatch(Message *m) -{ - switch (m->get_type()) { - - case MSG_MDS_BEACON: - handle_mds_beacon((MMDSBeacon*)m); - break; - - case MSG_MDS_GETMAP: - handle_mds_getmap((MMDSGetMap*)m); - break; - - default: - assert(0); - } -} - - - -void MDSMonitor::election_finished() -{ - if (mon->is_leader()) { - - // FIXME be smarter later. 
- - if (g_conf.mkfs) { - create_initial(); - save_map(); - } else { - load_map(); - } - } -} - - -void MDSMonitor::create_initial() -{ - mdsmap.epoch = 0; // until everyone boots - mdsmap.ctime = g_clock.now(); - - mdsmap.encode(encoded_map); - - print_map(); -} - -void MDSMonitor::load_map() -{ - int r = mon->store->get_bl_ss(encoded_map, "mdsmap", "current"); - assert(r > 0); - mdsmap.decode(encoded_map); - dout(7) << "load_map epoch " << mdsmap.get_epoch() << endl; -} - -void MDSMonitor::save_map() -{ - dout(7) << "save_map epoch " << mdsmap.get_epoch() << endl; - - int r = mon->store->put_bl_ss(encoded_map, "mdsmap", "current"); - assert(r>=0); -} - -void MDSMonitor::print_map() -{ - dout(7) << "print_map epoch " << mdsmap.get_epoch() << endl; - entity_inst_t blank; - set all; - mdsmap.get_mds_set(all); - for (set::iterator p = all.begin(); - p != all.end(); - ++p) { - dout(7) << " mds" << *p << "." << mdsmap.mds_inc[*p] - << " : " << MDSMap::get_state_name(mdsmap.get_state(*p)) - << " : " << (mdsmap.have_inst(*p) ? mdsmap.get_inst(*p) : blank) - << endl; - } -} - - -void MDSMonitor::issue_map() -{ - mdsmap.inc_epoch(); - encoded_map.clear(); - mdsmap.encode(encoded_map); - - dout(7) << "issue_map epoch " << mdsmap.get_epoch() << endl; - - save_map(); - print_map(); - - // bcast map - bcast_latest_mds(); - send_current(); -} - - -void MDSMonitor::handle_mds_beacon(MMDSBeacon *m) -{ - dout(7) << "mds_beacon " << *m - << " from " << m->get_source() - << " " << m->get_source_inst() - << endl; - int from = m->get_source().num(); - int state = m->get_state(); - version_t seq = m->get_seq(); - - // initial boot? - bool booted = false; - - // choose an MDS id - if (from >= 0) { - // wants to be (or already is) a specific MDS. 
- if (mdsmap.is_down(from)) { - dout(10) << "mds_beacon assigning requested mds" << from << endl; - booted = true; - } else if (mdsmap.get_inst(from) != m->get_source_inst()) { - dout(10) << "mds_beacon not assigning requested mds" << from - << ", that mds is up and someone else" << endl; - from = -1; - } - } - if (from < 0) { - // pick a failed mds? - set failed; - mdsmap.get_failed_mds_set(failed); - if (!failed.empty()) { - from = *failed.begin(); - dout(10) << "mds_beacon assigned failed mds" << from << endl; - booted = true; - } - } - if (from < 0) { - // ok, just pick any unused mds id. - for (from=0; ; ++from) { - if (mdsmap.is_dne(from) || - mdsmap.is_out(from)) { - dout(10) << "mds_beacon assigned out|dne mds" << from << endl; - booted = true; - break; - } - } - } - - - // old beacon? - if (mdsmap.mds_state_seq[from] > seq) { - dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << endl; - delete m; - return; - } - - // reply to beacon? - if (state != MDSMap::STATE_OUT) { - last_beacon[from] = g_clock.now(); // note time - messenger->send_message(new MMDSBeacon(state, seq), - m->get_source_inst()); - } - - - // make sure it's in the map - if (booted) { - mdsmap.mds_inst[from].addr = m->get_source_addr(); - mdsmap.mds_inst[from].name = MSG_ADDR_MDS(from); - mdsmap.mds_inc[from]++; - - // starting -> creating|starting|replay - if (mdsmap.is_degraded() && - !mdsmap.is_failed(from)) { - dout(10) << "mds_beacon currently degraded, mds" << from << " will be standby" << endl; - state = MDSMap::STATE_STANDBY; - } - else if (state == MDSMap::STATE_STARTING) { - if (mdsmap.is_failed(from)) { - dout(10) << "mds_beacon will recover mds" << from << endl; - state = MDSMap::STATE_REPLAY; - } - else if (mdsmap.is_out(from)) { - dout(10) << "mds_beacon will start mds" << from << endl; - state = MDSMap::STATE_STARTING; - } - else { - dout(10) << "mds_beacon will create mds" << from << endl; - state = MDSMap::STATE_CREATING; - } - } - } - - // if creating -> active, 
go to standby instead - if (state == MDSMap::STATE_ACTIVE && mdsmap.is_creating(from)) { - mdsmap.mds_created.insert(from); - dout(10) << "mds_beacon created mds" << from << endl; - - if (mdsmap.is_degraded()) { - dout(10) << "mds_beacon current degraded, marking mds" << from << " as standby" << endl; - state = MDSMap::STATE_STANDBY; - } - } - - - // did we update the map? - if (mdsmap.mds_state.count(from) == 0 || - mdsmap.mds_state[from] != state) { - // update mds state - dout(10) << "mds_beacon mds" << from << " " << MDSMap::get_state_name(mdsmap.mds_state[from]) - << " -> " << MDSMap::get_state_name(state) - << endl; - mdsmap.mds_state[from] = state; - if (mdsmap.is_up(from)) - mdsmap.mds_state_seq[from] = seq; - else - mdsmap.mds_state_seq.erase(from); - - issue_map(); - } - - delete m; -} - - -void MDSMonitor::handle_mds_getmap(MMDSGetMap *m) -{ - dout(7) << "mds_getmap from " << m->get_source() << " " << m->get_source_inst() << endl; - if (mdsmap.get_epoch() > 0) - send_full(m->get_source_inst()); - else - awaiting_map.push_back( m->get_source_inst() ); -} - - -void MDSMonitor::bcast_latest_mds() -{ - dout(10) << "bcast_latest_mds " << mdsmap.get_epoch() << endl; - - // tell mds - set up; - mdsmap.get_up_mds_set(up); - for (set::iterator p = up.begin(); - p != up.end(); - p++) - send_full(mdsmap.get_inst(*p)); -} - -void MDSMonitor::send_full(entity_inst_t dest) -{ - dout(11) << "send_full to " << dest << endl; - messenger->send_message(new MMDSMap(&mdsmap), dest); -} - -void MDSMonitor::send_current() -{ - dout(10) << "mds_send_current " << mdsmap.get_epoch() << endl; - for (list::iterator i = awaiting_map.begin(); - i != awaiting_map.end(); - i++) - send_full(*i); - awaiting_map.clear(); -} - -void MDSMonitor::send_latest(entity_inst_t dest) -{ - // FIXME: check if we're locked, etc. 
- if (mdsmap.get_epoch() > 0) - send_full(dest); - else - awaiting_map.push_back(dest); -} - - -void MDSMonitor::tick() -{ - // make sure mds's are still alive - utime_t now = g_clock.now(); - if (now > g_conf.mds_beacon_grace) { - utime_t cutoff = now; - cutoff -= g_conf.mds_beacon_grace; - - bool changed = false; - - set up; - mdsmap.get_up_mds_set(up); - - for (set::iterator p = up.begin(); - p != up.end(); - ++p) { - if (last_beacon.count(*p)) { - if (last_beacon[*p] < cutoff) { - - // failure! - int newstate; - switch (mdsmap.get_state(*p)) { - case MDSMap::STATE_CREATING: - // didn't finish creating - newstate = MDSMap::STATE_DNE; - break; - - case MDSMap::STATE_STANDBY: - if (mdsmap.has_created(*p)) - newstate = MDSMap::STATE_OUT; - else - newstate = MDSMap::STATE_DNE; - break; - - case MDSMap::STATE_REPLAY: - case MDSMap::STATE_REJOIN: - case MDSMap::STATE_ACTIVE: - case MDSMap::STATE_STOPPING: - newstate = MDSMap::STATE_FAILED; - break; - - case MDSMap::STATE_STARTING: - case MDSMap::STATE_STOPPED: - newstate = MDSMap::STATE_OUT; - break; - - default: - assert(0); - } - - dout(10) << "no beacon from mds" << *p << " since " << last_beacon[*p] - << ", marking " << mdsmap.get_state_name(newstate) - << endl; - - // update map - mdsmap.mds_state[*p] = newstate; - mdsmap.mds_state_seq.erase(*p); - changed = true; - } - } else { - dout(10) << "no beacons from mds" << *p << ", assuming one " << now << endl; - last_beacon[*p] = now; - } - } - - if (changed) { - issue_map(); - } - } -} diff --git a/branches/marnberg/quota/mon/MDSMonitor.h b/branches/marnberg/quota/mon/MDSMonitor.h deleted file mode 100644 index c3bc3d165883c..0000000000000 --- a/branches/marnberg/quota/mon/MDSMonitor.h +++ /dev/null @@ -1,87 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms 
of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDSMONITOR_H -#define __MDSMONITOR_H - -#include -#include -using namespace std; - -#include "include/types.h" -#include "msg/Messenger.h" - -#include "mds/MDSMap.h" - -class Monitor; - -class MDSMonitor : public Dispatcher { - Monitor *mon; - Messenger *messenger; - Mutex &lock; - - // mds maps - public: - MDSMap mdsmap; - - private: - bufferlist encoded_map; - - //map inc_maps; - //MDSMap::Incremental pending_inc; - - list awaiting_map; - - // beacons - map last_beacon; - - bool is_alive(int mds); - - - // maps - void create_initial(); - void send_current(); // send current map to waiters. - void send_full(entity_inst_t dest); - void bcast_latest_mds(); - - void issue_map(); - - void save_map(); - void load_map(); - void print_map(); - - //void accept_pending(); // accept pending, new map. - //void send_incremental(epoch_t since, msg_addr_t dest); - - void handle_mds_state(class MMDSState *m); - void handle_mds_beacon(class MMDSBeacon *m); - //void handle_mds_failure(class MMDSFailure *m); - void handle_mds_getmap(class MMDSGetMap *m); - - - - public: - MDSMonitor(Monitor *mn, Messenger *m, Mutex& l) : mon(mn), messenger(m), lock(l) { - } - - void dispatch(Message *m); - void tick(); // check state, take actions - - void election_starting(); - void election_finished(); - - void send_latest(entity_inst_t dest); - -}; - -#endif diff --git a/branches/marnberg/quota/mon/MonMap.h b/branches/marnberg/quota/mon/MonMap.h deleted file mode 100644 index d8e66c51b589e..0000000000000 --- a/branches/marnberg/quota/mon/MonMap.h +++ /dev/null @@ -1,103 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser 
General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MONMAP_H -#define __MONMAP_H - -#include -#include -#include - -#include "msg/Message.h" -#include "include/types.h" - -class MonMap { - public: - epoch_t epoch; // what epoch of the osd cluster descriptor is this - int num_mon; - vector mon_inst; - - int last_mon; // last mon i talked to - - MonMap(int s=0) : epoch(0), num_mon(s), mon_inst(s), last_mon(-1) {} - - void add_mon(entity_inst_t inst) { - mon_inst.push_back(inst); - num_mon++; - } - - // pick a mon. - // choice should be stable, unless we explicitly ask for a new one. - int pick_mon(bool newmon=false) { - if (newmon || (last_mon < 0)) { - last_mon = 0; //last_mon = rand() % num_mon; - } - return last_mon; - } - - const entity_inst_t &get_inst(int m) { - assert(m < num_mon); - return mon_inst[m]; - } - - void encode(bufferlist& blist) { - blist.append((char*)&epoch, sizeof(epoch)); - blist.append((char*)&num_mon, sizeof(num_mon)); - - _encode(mon_inst, blist); - } - - void decode(bufferlist& blist) { - int off = 0; - blist.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - blist.copy(off, sizeof(num_mon), (char*)&num_mon); - off += sizeof(num_mon); - - _decode(mon_inst, blist, off); - } - - int write(char *fn) { - // encode - bufferlist bl; - encode(bl); - - // write - int fd = ::open(fn, O_RDWR|O_CREAT); - if (fd < 0) return fd; - ::fchmod(fd, 0644); - ::write(fd, (void*)bl.c_str(), bl.length()); - ::close(fd); - return 0; - } - - int read(char *fn) { - // read - bufferlist bl; - int fd = ::open(fn, O_RDONLY); - if (fd < 0) return fd; - struct stat st; - ::fstat(fd, &st); - bufferptr bp(st.st_size); - bl.append(bp); - ::read(fd, (void*)bl.c_str(), bl.length()); - ::close(fd); - - // decode - decode(bl); - return 0; - } - -}; - -#endif diff --git a/branches/marnberg/quota/mon/Monitor.cc b/branches/marnberg/quota/mon/Monitor.cc deleted file mode 100644 index 
8bf1d2f0cfe21..0000000000000 --- a/branches/marnberg/quota/mon/Monitor.cc +++ /dev/null @@ -1,303 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -// TODO: missing run() method, which creates the two main timers, refreshTimer and readTimer - -#include "Monitor.h" - -#include "osd/OSDMap.h" - -#include "MonitorStore.h" - -#include "msg/Message.h" -#include "msg/Messenger.h" - -#include "messages/MPing.h" -#include "messages/MPingAck.h" -#include "messages/MGenericMessage.h" - -#include "messages/MMonPaxos.h" - -#include "common/Timer.h" -#include "common/Clock.h" - -#include "OSDMonitor.h" -#include "MDSMonitor.h" -#include "ClientMonitor.h" - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << whoami << (is_starting() ? (const char*)"(starting)":(is_leader() ? (const char*)"(leader)":(is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << whoami << (is_starting() ? (const char*)"(starting)":(is_leader() ? (const char*)"(leader)":(is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << " " - - - -void Monitor::init() -{ - lock.Lock(); - - dout(1) << "init" << endl; - - // store - char s[80]; - sprintf(s, "mondata/mon%d", whoami); - store = new MonitorStore(s); - - if (g_conf.mkfs) - store->mkfs(); - - store->mount(); - - // create - osdmon = new OSDMonitor(this, messenger, lock); - mdsmon = new MDSMonitor(this, messenger, lock); - clientmon = new ClientMonitor(this, messenger, lock); - - // i'm ready! 
- messenger->set_dispatcher(this); - - // start ticker - reset_tick(); - - // call election? - if (monmap->num_mon > 1) { - assert(monmap->num_mon != 2); - call_election(); - } else { - // we're standalone. - set q; - q.insert(whoami); - win_election(q); - } - - lock.Unlock(); -} - -void Monitor::shutdown() -{ - dout(1) << "shutdown" << endl; - - // cancel all events - cancel_tick(); - timer.cancel_all(); - timer.join(); - - // stop osds. - for (set::iterator it = osdmon->osdmap.get_osds().begin(); - it != osdmon->osdmap.get_osds().end(); - it++) { - if (osdmon->osdmap.is_down(*it)) continue; - dout(10) << "sending shutdown to osd" << *it << endl; - messenger->send_message(new MGenericMessage(MSG_SHUTDOWN), - osdmon->osdmap.get_inst(*it)); - } - osdmon->mark_all_down(); - - // monitors too. - for (int i=0; inum_mon; i++) - if (i != whoami) - messenger->send_message(new MGenericMessage(MSG_SHUTDOWN), - monmap->get_inst(i)); - - // unmount my local storage - if (store) - delete store; - - // clean up - if (monmap) delete monmap; - if (osdmon) delete osdmon; - if (mdsmon) delete mdsmon; - if (clientmon) delete clientmon; - - // die. 
- messenger->shutdown(); - delete messenger; -} - - -void Monitor::call_election() -{ - if (monmap->num_mon == 1) return; - - dout(10) << "call_election" << endl; - state = STATE_STARTING; - - elector.start(); - - osdmon->election_starting(); - //mdsmon->election_starting(); -} - -void Monitor::win_election(set& active) -{ - state = STATE_LEADER; - leader = whoami; - quorum = active; - dout(10) << "win_election, quorum is " << quorum << endl; - - // init - osdmon->election_finished(); - mdsmon->election_finished(); - - // init paxos - test_paxos.leader_start(); -} - -void Monitor::lose_election(int l) -{ - state = STATE_PEON; - leader = l; - dout(10) << "lose_election, leader is mon" << leader << endl; -} - - - -void Monitor::dispatch(Message *m) -{ - lock.Lock(); - { - switch (m->get_type()) { - - // misc - case MSG_PING_ACK: - handle_ping_ack((MPingAck*)m); - break; - - case MSG_SHUTDOWN: - assert(m->get_source().is_osd()); - osdmon->dispatch(m); - break; - - - // OSDs - case MSG_OSD_GETMAP: - case MSG_OSD_FAILURE: - case MSG_OSD_BOOT: - case MSG_OSD_IN: - case MSG_OSD_OUT: - osdmon->dispatch(m); - break; - - - // MDSs - case MSG_MDS_BEACON: - case MSG_MDS_GETMAP: - mdsmon->dispatch(m); - - // hackish: did all mds's shut down? - if (g_conf.mon_stop_with_last_mds && - mdsmon->mdsmap.get_num_up_or_failed_mds() == 0) - shutdown(); - - break; - - // clients - case MSG_CLIENT_BOOT: - clientmon->dispatch(m); - break; - - - // paxos - case MSG_MON_PAXOS: - // send it to the right paxos instance - switch (((MMonPaxos*)m)->machine_id) { - case PAXOS_TEST: - test_paxos.dispatch(m); - break; - case PAXOS_OSDMAP: - //... 
- - default: - assert(0); - } - break; - - // elector messages - case MSG_MON_ELECTION_PROPOSE: - case MSG_MON_ELECTION_ACK: - case MSG_MON_ELECTION_VICTORY: - elector.dispatch(m); - break; - - - default: - dout(0) << "unknown message " << *m << endl; - assert(0); - } - } - lock.Unlock(); -} - - -void Monitor::handle_shutdown(Message *m) -{ - dout(1) << "shutdown from " << m->get_source() << endl; - - shutdown(); - delete m; -} - -void Monitor::handle_ping_ack(MPingAck *m) -{ - // ... - - delete m; -} - - - - -/************ TICK ***************/ - -class C_Mon_Tick : public Context { - Monitor *mon; -public: - C_Mon_Tick(Monitor *m) : mon(m) {} - void finish(int r) { - mon->tick(); - } -}; - -void Monitor::cancel_tick() -{ - if (tick_timer) timer.cancel_event(tick_timer); -} - -void Monitor::reset_tick() -{ - cancel_tick(); - tick_timer = new C_Mon_Tick(this); - timer.add_event_after(g_conf.mon_tick_interval, tick_timer); -} - - -void Monitor::tick() -{ - tick_timer = 0; - - // ok go. - dout(11) << "tick" << endl; - - osdmon->tick(); - mdsmon->tick(); - - // next tick! - reset_tick(); -} - - - - - - - diff --git a/branches/marnberg/quota/mon/Monitor.h b/branches/marnberg/quota/mon/Monitor.h deleted file mode 100644 index 6554ad36239b1..0000000000000 --- a/branches/marnberg/quota/mon/Monitor.h +++ /dev/null @@ -1,139 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MONITOR_H -#define __MONITOR_H - -#include "include/types.h" -#include "msg/Messenger.h" - -#include "common/Timer.h" - -#include "MonMap.h" -#include "Elector.h" -#include "Paxos.h" - - -class MonitorStore; -class OSDMonitor; -class MDSMonitor; -class ClientMonitor; - -#define PAXOS_TEST 0 -#define PAXOS_OSDMAP 1 -#define PAXOS_MDSMAP 2 -#define PAXOS_CLIENTMAP 3 - -class Monitor : public Dispatcher { -protected: - // me - int whoami; - Messenger *messenger; - Mutex lock; - - MonMap *monmap; - - // timer. - SafeTimer timer; - Context *tick_timer; - void cancel_tick(); - void reset_tick(); - friend class C_Mon_Tick; - - // my local store - //ObjectStore *store; - MonitorStore *store; - - const static int INO_ELECTOR = 1; - const static int INO_MON_MAP = 2; - const static int INO_OSD_MAP = 10; - const static int INO_OSD_INC_MAP = 11; - const static int INO_MDS_MAP = 20; - - // elector - Elector elector; - friend class Elector; - - epoch_t mon_epoch; // monitor epoch (election instance) - set quorum; // current active set of monitors (if !starting) - - //void call_election(); - - // paxos - Paxos test_paxos; - friend class Paxos; - - - // monitor state - const static int STATE_STARTING = 0; // electing - const static int STATE_LEADER = 1; - const static int STATE_PEON = 2; - int state; - - int leader; // current leader (to best of knowledge) - utime_t last_called_election; // [starting] last time i called an election - - bool is_starting() { return state == STATE_STARTING; } - bool is_leader() { return state == STATE_LEADER; } - bool is_peon() { return state == STATE_PEON; } - - // my public services - OSDMonitor *osdmon; - MDSMonitor *mdsmon; - ClientMonitor *clientmon; - - // messages - void handle_shutdown(Message *m); - void handle_ping_ack(class MPingAck *m); - - friend class OSDMonitor; - friend class MDSMonitor; - friend class ClientMonitor; - - // initiate election - void call_election(); - - // end election (called by Elector) - void 
win_election(set& q); - void lose_election(int l); - - - - public: - Monitor(int w, Messenger *m, MonMap *mm) : - whoami(w), - messenger(m), - monmap(mm), - timer(lock), tick_timer(0), - store(0), - elector(this, w), - mon_epoch(0), - - test_paxos(this, w, PAXOS_TEST, "tester"), // tester state machine - - state(STATE_STARTING), - leader(0), - osdmon(0), mdsmon(0), clientmon(0) - { - } - - - void init(); - void shutdown(); - void dispatch(Message *m); - void tick(); - -}; - -#endif diff --git a/branches/marnberg/quota/mon/MonitorStore.cc b/branches/marnberg/quota/mon/MonitorStore.cc deleted file mode 100644 index f5a10696c7ada..0000000000000 --- a/branches/marnberg/quota/mon/MonitorStore.cc +++ /dev/null @@ -1,224 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "MonitorStore.h" -#include "common/Clock.h" - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " store(" << dir <<") " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " store(" << dir <<") " - -#include -#include -#include -#include -#include - -void MonitorStore::mount() -{ - dout(1) << "mount" << endl; - // verify dir exists - DIR *d = ::opendir(dir.c_str()); - if (!d) { - derr(1) << "basedir " << dir << " dne" << endl; - assert(0); - } - ::closedir(d); - - if (g_conf.use_abspaths) { - // combine it with the cwd, in case fuse screws things up (i.e. 
fakefuse) - string old = dir; - char *cwd = get_current_dir_name(); - dir = cwd; - delete cwd; - dir += "/"; - dir += old; - } -} - - -void MonitorStore::mkfs() -{ - dout(1) << "mkfs" << endl; - - char cmd[200]; - sprintf(cmd, "test -d %s && /bin/rm -r %s ; mkdir -p %s", dir.c_str(), dir.c_str(), dir.c_str()); - dout(1) << cmd << endl; - system(cmd); -} - - -version_t MonitorStore::get_int(const char *a, const char *b) -{ - char fn[200]; - if (b) - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - else - sprintf(fn, "%s/%s", dir.c_str(), a); - - FILE *f = ::fopen(fn, "r"); - if (!f) - return 0; - - char buf[20]; - ::fgets(buf, 20, f); - ::fclose(f); - - version_t val = atoi(buf); - - if (b) { - dout(15) << "get_int " << a << "/" << b << " = " << val << endl; - } else { - dout(15) << "get_int " << a << " = " << val << endl; - } - return val; -} - - -void MonitorStore::put_int(version_t val, const char *a, const char *b) -{ - char fn[200]; - sprintf(fn, "%s/%s", dir.c_str(), a); - if (b) { - ::mkdir(fn, 0755); - dout(15) << "set_int " << a << "/" << b << " = " << val << endl; - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - } else { - dout(15) << "set_int " << a << " = " << val << endl; - } - - char vs[30]; -#ifdef __LP64__ - sprintf(vs, "%ld\n", val); -#else - sprintf(vs, "%lld\n", val); -#endif - - char tfn[200]; - sprintf(tfn, "%s.new", fn); - - int fd = ::open(tfn, O_WRONLY|O_CREAT); - assert(fd > 0); - ::fchmod(fd, 0644); - ::write(fd, vs, strlen(vs)); - ::close(fd); - ::rename(tfn, fn); -} - - -// ---------------------------------------- -// buffers - -bool MonitorStore::exists_bl_ss(const char *a, const char *b) -{ - char fn[200]; - if (b) { - dout(15) << "exists_bl " << a << "/" << b << endl; - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - } else { - dout(15) << "exists_bl " << a << endl; - sprintf(fn, "%s/%s", dir.c_str(), a); - } - - struct stat st; - int r = ::stat(fn, &st); - return r == 0; -} - - -int MonitorStore::get_bl_ss(bufferlist& bl, const char *a, 
const char *b) -{ - char fn[200]; - if (b) { - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - } else { - sprintf(fn, "%s/%s", dir.c_str(), a); - } - - int fd = ::open(fn, O_RDONLY); - if (!fd) { - if (b) { - dout(15) << "get_bl " << a << "/" << b << " DNE" << endl; - } else { - dout(15) << "get_bl " << a << " DNE" << endl; - } - return 0; - } - - // get size - struct stat st; - int rc = ::fstat(fd, &st); - assert(rc == 0); - __int32_t len = st.st_size; - - // read buffer - bl.clear(); - bufferptr bp(len); - int off = 0; - while (off < len) { - dout(20) << "reading at off " << off << " of " << len << endl; - int r = ::read(fd, bp.c_str()+off, len-off); - if (r < 0) derr(0) << "errno on read " << strerror(errno) << endl; - assert(r>0); - off += r; - } - bl.append(bp); - ::close(fd); - - if (b) { - dout(15) << "get_bl " << a << "/" << b << " = " << bl.length() << " bytes" << endl; - } else { - dout(15) << "get_bl " << a << " = " << bl.length() << " bytes" << endl; - } - - return len; -} - -int MonitorStore::put_bl_ss(bufferlist& bl, const char *a, const char *b) -{ - char fn[200]; - sprintf(fn, "%s/%s", dir.c_str(), a); - if (b) { - ::mkdir(fn, 0755); - dout(15) << "put_bl " << a << "/" << b << " = " << bl.length() << " bytes" << endl; - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - } else { - dout(15) << "put_bl " << a << " = " << bl.length() << " bytes" << endl; - } - - char tfn[200]; - sprintf(tfn, "%s.new", fn); - int fd = ::open(tfn, O_WRONLY|O_CREAT); - assert(fd); - - // chmod - ::fchmod(fd, 0644); - - // write data - for (list::const_iterator it = bl.buffers().begin(); - it != bl.buffers().end(); - it++) { - int r = ::write(fd, it->c_str(), it->length()); - if (r != (int)it->length()) - derr(0) << "put_bl_ss ::write() returned " << r << " not " << it->length() << endl; - if (r < 0) - derr(0) << "put_bl_ss ::write() errored out, errno is " << strerror(errno) << endl; - } - - ::fsync(fd); - ::close(fd); - ::rename(tfn, fn); - - return 0; -} diff --git 
a/branches/marnberg/quota/mon/MonitorStore.h b/branches/marnberg/quota/mon/MonitorStore.h deleted file mode 100644 index 122118f33f556..0000000000000 --- a/branches/marnberg/quota/mon/MonitorStore.h +++ /dev/null @@ -1,81 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MON_MONITORSTORE_H -#define __MON_MONITORSTORE_H - -#include "include/types.h" -#include "include/buffer.h" - -#include - -class MonitorStore { - string dir; - -public: - MonitorStore(char *d) : dir(d) { - } - ~MonitorStore() { - } - - void mkfs(); // wipe - void mount(); - - // ints (stored as ascii) - version_t get_int(const char *a, const char *b=0); - void put_int(version_t v, const char *a, const char *b=0); - - // buffers - // ss and sn varieties. 
- bool exists_bl_ss(const char *a, const char *b=0); - int get_bl_ss(bufferlist& bl, const char *a, const char *b); - int put_bl_ss(bufferlist& bl, const char *a, const char *b); - bool exists_bl_sn(const char *a, version_t b) { - char bs[20]; -#ifdef __LP64__ - sprintf(bs, "%lu", b); -#else - sprintf(bs, "%llu", b); -#endif - return exists_bl_ss(a, bs); - } - int get_bl_sn(bufferlist& bl, const char *a, version_t b) { - char bs[20]; -#ifdef __LP64__ - sprintf(bs, "%lu", b); -#else - sprintf(bs, "%llu", b); -#endif - return get_bl_ss(bl, a, bs); - } - int put_bl_sn(bufferlist& bl, const char *a, version_t b) { - char bs[20]; -#ifdef __LP64__ - sprintf(bs, "%lu", b); -#else - sprintf(bs, "%llu", b); -#endif - return put_bl_ss(bl, a, bs); - } - - /* - version_t get_incarnation() { return get_int("incarnation"); } - void set_incarnation(version_t i) { set_int(i, "incarnation"); } - - version_t get_last_proposal() { return get_int("last_proposal"); } - void set_last_proposal(version_t i) { set_int(i, "last_proposal"); } - */ -}; - - -#endif diff --git a/branches/marnberg/quota/mon/OSDMonitor.cc b/branches/marnberg/quota/mon/OSDMonitor.cc deleted file mode 100644 index fe9d54b189de6..0000000000000 --- a/branches/marnberg/quota/mon/OSDMonitor.cc +++ /dev/null @@ -1,897 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "OSDMonitor.h" -#include "Monitor.h" -#include "MDSMonitor.h" - -#include "MonitorStore.h" - -#include "messages/MOSDFailure.h" -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" -#include "messages/MOSDBoot.h" -#include "messages/MOSDIn.h" -#include "messages/MOSDOut.h" - -#include "messages/MMonOSDMapInfo.h" -#include "messages/MMonOSDMapLease.h" -#include "messages/MMonOSDMapLeaseAck.h" -#include "messages/MMonOSDMapUpdatePrepare.h" -#include "messages/MMonOSDMapUpdateAck.h" -#include "messages/MMonOSDMapUpdateCommit.h" - -#include "common/Timer.h" - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".osd(" << (state == STATE_INIT ? (const char*)"init":(state == STATE_SYNC ? (const char*)"sync":(state == STATE_LOCK ? (const char*)"lock":(state == STATE_UPDATING ? (const char*)"updating":(const char*)"?\?")))) << ") e" << osdmap.get_epoch() << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".osd(" << (state == STATE_INIT ? (const char*)"init":(state == STATE_SYNC ? (const char*)"sync":(state == STATE_LOCK ? (const char*)"lock":(state == STATE_UPDATING ? 
(const char*)"updating":(const char*)"?\?")))) << ") e" << osdmap.get_epoch() << " " - - -class C_Mon_FakeOSDFailure : public Context { - OSDMonitor *mon; - int osd; - bool down; -public: - C_Mon_FakeOSDFailure(OSDMonitor *m, int o, bool d) : mon(m), osd(o), down(d) {} - void finish(int r) { - mon->fake_osd_failure(osd,down); - } -}; - - -void OSDMonitor::fake_osdmap_update() -{ - dout(1) << "fake_osdmap_update" << endl; - accept_pending(); - - // tell a random osd - int osd = rand() % g_conf.num_osd; - send_incremental(osdmap.get_epoch()-1, // ick! FIXME - osdmap.get_inst(osd)); -} - - -void OSDMonitor::fake_reorg() -{ - int r = rand() % g_conf.num_osd; - - if (osdmap.is_out(r)) { - dout(1) << "fake_reorg marking osd" << r << " in" << endl; - pending_inc.new_in.push_back(r); - } else { - dout(1) << "fake_reorg marking osd" << r << " out" << endl; - pending_inc.new_out.push_back(r); - } - - accept_pending(); - - // tell him! - send_incremental(osdmap.get_epoch()-1, osdmap.get_inst(r)); - - // do it again? - /* - if (g_conf.num_osd - d > 4 && - g_conf.num_osd - d > g_conf.num_osd/2) - mon->timer.add_event_after(g_conf.fake_osdmap_expand, - new C_Mon_Faker(this)); - */ -} - - - -/* -void OSDMonitor::init() -{ - // start with blank map - - // load my last state from the store - bufferlist bl; - if (get_map_bl(0, bl)) { // FIXME - // yay! - osdmap.decode(bl); - dout(1) << "init got epoch " << osdmap.get_epoch() << " from store" << endl; - - // set up pending_inc - pending_inc.epoch = osdmap.get_epoch()+1; - } -} -*/ - - - - -/************ MAPS ****************/ - - -void OSDMonitor::create_initial() -{ - dout(1) << "create_initial generating osdmap from g_conf" << endl; - - // - osdmap.mon_epoch = mon->mon_epoch; - osdmap.ctime = g_clock.now(); - - if (g_conf.osd_pg_bits) { - osdmap.set_pg_bits(g_conf.osd_pg_bits); - } else { - int osdbits = 1; - int n = g_conf.num_osd; - while (n) { - n = n >> 1; - osdbits++; - } - - // 2 bits per osd. 
- osdmap.set_pg_bits(osdbits + 2); - } - - // start at epoch 0 until all osds boot - //osdmap.inc_epoch(); // = 1 - //assert(osdmap.get_epoch() == 1); - - if (g_conf.num_osd >= 12) { - int ndom = g_conf.osd_max_rep; - UniformBucket *domain[ndom]; - int domid[ndom]; - for (int i=0; iadd_item(i, 1.0); - //cerr << "osd" << i << " in domain " << dom << endl; - i++; - if (i == g_conf.num_osd) break; - } - } - - // root - Bucket *root = new ListBucket(2); - for (int i=0; iget_weight() << endl; - root->add_item(domid[i], domain[i]->get_weight()); - } - int nroot = osdmap.crush.add_bucket(root); - - // rules - for (int i=1; i<=ndom; i++) { - osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_TAKE, nroot)); - osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 1)); - osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 1, 0)); - osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - } - - // test - //vector out; - //osdmap.pg_to_osds(0x40200000110ULL, out); - - } else { - // one bucket - Bucket *b = new UniformBucket(1, 0); - int root = osdmap.crush.add_bucket(b); - for (int i=0; iadd_item(i, 1.0); - } - - for (int i=1; i<=g_conf.osd_max_rep; i++) { - osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 0)); - osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - } - } - - if (g_conf.mds_local_osd) { - // add mds osds, but don't put them in the crush mapping func - for (int i=0; i - - // fake osd failures - for (map::iterator i = g_fake_osd_down.begin(); - i != g_fake_osd_down.end(); - i++) { - dout(0) << "will fake osd" << i->first << " DOWN after " << i->second << endl; - mon->timer.add_event_after(i->second, new C_Mon_FakeOSDFailure(this, i->first, 1)); - } - for (map::iterator i = g_fake_osd_out.begin(); - i != g_fake_osd_out.end(); - i++) { - dout(0) << "will fake osd" << i->first << " OUT after " << i->second << 
endl; - mon->timer.add_event_after(i->second, new C_Mon_FakeOSDFailure(this, i->first, 0)); - } -} - - -bool OSDMonitor::get_map_bl(epoch_t epoch, bufferlist& bl) -{ - if (!mon->store->exists_bl_sn("osdmap", epoch)) - return false; - int r = mon->store->get_bl_sn(bl, "osdmap", epoch); - assert(r > 0); - return true; -} - -bool OSDMonitor::get_inc_map_bl(epoch_t epoch, bufferlist& bl) -{ - if (!mon->store->exists_bl_sn("osdincmap", epoch)) - return false; - int r = mon->store->get_bl_sn(bl, "osdincmap", epoch); - assert(r > 0); - return true; -} - - -void OSDMonitor::save_map() -{ - bufferlist bl; - osdmap.encode(bl); - - mon->store->put_bl_sn(bl, "osdmap", osdmap.get_epoch()); - mon->store->put_int(osdmap.get_epoch(), "osd_epoch"); -} - -void OSDMonitor::save_inc_map(OSDMap::Incremental &inc) -{ - bufferlist bl; - osdmap.encode(bl); - - bufferlist incbl; - inc.encode(incbl); - - mon->store->put_bl_sn(bl, "osdmap", osdmap.get_epoch()); - mon->store->put_bl_sn(incbl, "osdincmap", osdmap.get_epoch()); - mon->store->put_int(osdmap.get_epoch(), "osd_epoch"); -} - - - -void OSDMonitor::dispatch(Message *m) -{ - switch (m->get_type()) { - - // services - case MSG_OSD_GETMAP: - handle_osd_getmap((MOSDGetMap*)m); - break; - case MSG_OSD_FAILURE: - handle_osd_failure((MOSDFailure*)m); - break; - case MSG_OSD_BOOT: - handle_osd_boot((MOSDBoot*)m); - break; - case MSG_OSD_IN: - handle_osd_in((MOSDIn*)m); - break; - case MSG_OSD_OUT: - handle_osd_out((MOSDOut*)m); - break; - - // replication - case MSG_MON_OSDMAP_INFO: - handle_info((MMonOSDMapInfo*)m); - break; - case MSG_MON_OSDMAP_LEASE: - handle_lease((MMonOSDMapLease*)m); - break; - case MSG_MON_OSDMAP_LEASE_ACK: - handle_lease_ack((MMonOSDMapLeaseAck*)m); - break; - case MSG_MON_OSDMAP_UPDATE_PREPARE: - handle_update_prepare((MMonOSDMapUpdatePrepare*)m); - break; - case MSG_MON_OSDMAP_UPDATE_ACK: - handle_update_ack((MMonOSDMapUpdateAck*)m); - break; - case MSG_MON_OSDMAP_UPDATE_COMMIT: - 
handle_update_commit((MMonOSDMapUpdateCommit*)m); - break; - - default: - assert(0); - } -} - - - -void OSDMonitor::handle_osd_failure(MOSDFailure *m) -{ - dout(1) << "osd failure: " << m->get_failed() << " from " << m->get_source() << endl; - - // FIXME - // take their word for it - int from = m->get_failed().name.num(); - if (osdmap.is_up(from) && - (osdmap.osd_inst.count(from) == 0 || - osdmap.osd_inst[from] == m->get_failed())) { - pending_inc.new_down[from] = m->get_failed(); - - if (osdmap.is_in(from)) - down_pending_out[from] = g_clock.now(); - - //awaiting_maps[pending_inc.epoch][m->get_source()] = - - accept_pending(); - - send_incremental(m->get_epoch(), m->get_source_inst()); - - send_waiting(); - bcast_latest_mds(); - } - - delete m; -} - - -void OSDMonitor::fake_osd_failure(int osd, bool down) -{ - if (down) { - dout(1) << "fake_osd_failure DOWN osd" << osd << endl; - pending_inc.new_down[osd] = osdmap.osd_inst[osd]; - } else { - dout(1) << "fake_osd_failure OUT osd" << osd << endl; - pending_inc.new_out.push_back(osd); - } - accept_pending(); - bcast_latest_osd(); - bcast_latest_mds(); -} - -void OSDMonitor::mark_all_down() -{ - dout(7) << "mark_all_down" << endl; - - for (set::iterator it = osdmap.get_osds().begin(); - it != osdmap.get_osds().end(); - it++) { - if (osdmap.is_down(*it)) continue; - pending_inc.new_down[*it] = osdmap.get_inst(*it); - } - accept_pending(); -} - - - - -void OSDMonitor::handle_osd_boot(MOSDBoot *m) -{ - dout(7) << "osd_boot from " << m->get_source() << endl; - assert(m->get_source().is_osd()); - int from = m->get_source().num(); - - if (osdmap.get_epoch() == 0) { - // waiting for boot! - osdmap.osd_inst[from] = m->get_source_inst(); - - if (osdmap.osd_inst.size() == osdmap.osds.size()) { - dout(-7) << "osd_boot all osds booted." 
<< endl; - osdmap.inc_epoch(); - - save_map(); - - pending_inc.epoch = osdmap.get_epoch()+1; // 2 - - bcast_latest_osd(); - bcast_latest_mds(); - } else { - dout(7) << "osd_boot waiting for " - << (osdmap.osds.size() - osdmap.osd_inst.size()) - << " osds to boot" << endl; - } - - delete m; - return; - } - - // already up? mark down first? - if (osdmap.is_up(from)) { - pending_inc.new_down[from] = osdmap.osd_inst[from]; - accept_pending(); - } - - // mark up. - down_pending_out.erase(from); - assert(osdmap.is_down(from)); - pending_inc.new_up[from] = m->get_source_inst(); - - // mark in? - if (osdmap.out_osds.count(from)) - pending_inc.new_in.push_back(from); - - accept_pending(); - - // the booting osd will spread word - send_incremental(m->sb.current_epoch, m->get_source_inst()); - delete m; - - // tell mds - bcast_latest_mds(); -} - -void OSDMonitor::handle_osd_in(MOSDIn *m) -{ - dout(7) << "osd_in from " << m->get_source() << endl; - int from = m->get_source().num(); - - if (osdmap.is_out(from)) - pending_inc.new_in.push_back(from); - accept_pending(); - send_incremental(m->map_epoch, m->get_source_inst()); -} - -void OSDMonitor::handle_osd_out(MOSDOut *m) -{ - dout(7) << "osd_out from " << m->get_source() << endl; - int from = m->get_source().num(); - if (osdmap.is_in(from)) { - pending_inc.new_out.push_back(from); - accept_pending(); - send_incremental(m->map_epoch, m->get_source_inst()); - } -} - -void OSDMonitor::handle_osd_getmap(MOSDGetMap *m) -{ - dout(7) << "osd_getmap from " << m->get_source() << " since " << m->get_since() << endl; - - if (osdmap.get_epoch() == 0) { - awaiting_map[m->get_source()].first = m->get_source_inst(); - awaiting_map[m->get_source()].second = m->get_since(); - } else { - //if (m->get_since()) - send_incremental(m->get_since(), m->get_source_inst()); - //else - //send_full(m->get_source(), m->get_source_inst()); - } - delete m; -} - - - -void OSDMonitor::accept_pending() -{ - dout(-10) << "accept_pending " << osdmap.get_epoch() 
<< " -> " << pending_inc.epoch << endl; - - // accept pending into a new map! - pending_inc.ctime = g_clock.now(); - pending_inc.mon_epoch = mon->mon_epoch; - - // advance! - osdmap.apply_incremental(pending_inc); - - // save it. - save_inc_map( pending_inc ); - - // tell me about it - for (map::iterator i = pending_inc.new_up.begin(); - i != pending_inc.new_up.end(); - i++) { - dout(0) << "osd" << i->first << " UP " << i->second << endl; - derr(0) << "osd" << i->first << " UP " << i->second << endl; - } - for (map::iterator i = pending_inc.new_down.begin(); - i != pending_inc.new_down.end(); - i++) { - dout(0) << "osd" << i->first << " DOWN " << i->second << endl; - derr(0) << "osd" << i->first << " DOWN " << i->second << endl; - messenger->mark_down(i->second.addr); - } - for (list::iterator i = pending_inc.new_in.begin(); - i != pending_inc.new_in.end(); - i++) { - dout(0) << "osd" << *i << " IN" << endl; - derr(0) << "osd" << *i << " IN" << endl; - } - for (list::iterator i = pending_inc.new_out.begin(); - i != pending_inc.new_out.end(); - i++) { - dout(0) << "osd" << *i << " OUT" << endl; - derr(0) << "osd" << *i << " OUT" << endl; - } - - // clear new pending - OSDMap::Incremental next(osdmap.get_epoch() + 1); - pending_inc = next; -} - -void OSDMonitor::send_waiting() -{ - dout(10) << "send_waiting " << osdmap.get_epoch() << endl; - - for (map >::iterator i = awaiting_map.begin(); - i != awaiting_map.end(); - i++) - send_incremental(i->second.second, i->second.first); -} - - -void OSDMonitor::send_full(entity_inst_t who) -{ - messenger->send_message(new MOSDMap(&osdmap), who); -} - -void OSDMonitor::send_incremental(epoch_t since, entity_inst_t dest) -{ - dout(5) << "osd_send_incremental " << since << " -> " << osdmap.get_epoch() - << " to " << dest << endl; - - MOSDMap *m = new MOSDMap; - - for (epoch_t e = osdmap.get_epoch(); - e > since; - e--) { - bufferlist bl; - if (get_inc_map_bl(e, bl)) { - dout(10) << "osd_send_incremental inc " << e << endl; - 
m->incremental_maps[e] = bl; - } - else if (get_map_bl(e, bl)) { - dout(10) << "osd_send_incremental full " << e << endl; - m->maps[e] = bl; - } - else { - assert(0); // we should have all maps. - } - } - - messenger->send_message(m, dest); -} - - - -void OSDMonitor::bcast_latest_mds() -{ - epoch_t e = osdmap.get_epoch(); - dout(1) << "bcast_latest_mds epoch " << e << endl; - - // tell mds - set up; - mon->mdsmon->mdsmap.get_up_mds_set(up); - for (set::iterator i = up.begin(); - i != up.end(); - i++) { - send_incremental(osdmap.get_epoch()-1, mon->mdsmon->mdsmap.get_inst(*i)); - } -} - -void OSDMonitor::bcast_latest_osd() -{ - epoch_t e = osdmap.get_epoch(); - dout(1) << "bcast_latest_osd epoch " << e << endl; - - // tell osds - set osds; - osdmap.get_all_osds(osds); - for (set::iterator it = osds.begin(); - it != osds.end(); - it++) { - if (osdmap.is_down(*it)) continue; - - send_incremental(osdmap.get_epoch()-1, osdmap.get_inst(*it)); - } -} - - - -void OSDMonitor::tick() -{ - // mark down osds out? - utime_t now = g_clock.now(); - list mark_out; - for (map::iterator i = down_pending_out.begin(); - i != down_pending_out.end(); - i++) { - utime_t down = now; - down -= i->second; - - if (down.sec() >= g_conf.mon_osd_down_out_interval) { - dout(10) << "tick marking osd" << i->first << " OUT after " << down << " sec" << endl; - mark_out.push_back(i->first); - } - } - for (list::iterator i = mark_out.begin(); - i != mark_out.end(); - i++) { - down_pending_out.erase(*i); - pending_inc.new_out.push_back( *i ); - } - if (!mark_out.empty()) { - accept_pending(); - - // hrmpf. bcast map for now. FIXME FIXME. 
- bcast_latest_osd(); - } -} - -void OSDMonitor::election_starting() -{ - dout(10) << "election_starting" << endl; -} - -void OSDMonitor::election_finished() -{ - dout(10) << "election_finished" << endl; - - if (mon->is_leader()) { - if (g_conf.mkfs) { - create_initial(); - save_map(); - } else { - // - epoch_t epoch = mon->store->get_int("osd_epoch"); - dout(10) << " last epoch was " << epoch << endl; - bufferlist bl, blinc; - int r = mon->store->get_bl_sn(bl, "osdmap", epoch); - assert(r>0); - osdmap.decode(bl); - - // pending_inc - pending_inc.epoch = epoch+1; - } - - } - - /* - state = STATE_INIT; - - // map? - if (osdmap.get_epoch() == 0 && - mon->is_leader()) { - create_initial(); - } - - - - if (mon->is_leader()) { - // leader. - if (mon->monmap->num_mon == 1) { - // hmm, it's just me! - state = STATE_SYNC; - } - } - else if (mon->is_peon()) { - // peon. send info - //messenger->send_message(new MMonOSDMapInfo(osdmap.epoch, osdmap.mon_epoch), - // mon->monmap->get_inst(mon->leader)); - } - */ -} - - - -void OSDMonitor::handle_info(MMonOSDMapInfo *m) -{ - dout(10) << "handle_info from " << m->get_source() - << " epoch " << m->get_epoch() << " in mon_epoch " << m->get_mon_epoch() - << endl; - - epoch_t epoch = m->get_epoch(); - - // did they have anything? - if (epoch > 0) { - // make sure it's current. 
- if (epoch == osdmap.get_epoch()) { - if (osdmap.mon_epoch != m->get_mon_epoch()) { - dout(10) << "handle_info had divergent epoch " << m->get_epoch() - << ", mon_epoch " << m->get_mon_epoch() << " != " << osdmap.mon_epoch << endl; - epoch--; - } - } else { - bufferlist bl; - get_map_bl(epoch, bl); - - OSDMap old; - old.decode(bl); - - if (old.mon_epoch != m->get_mon_epoch()) { - dout(10) << "handle_info had divergent epoch " << m->get_epoch() - << ", mon_epoch " << m->get_mon_epoch() << " != " << old.mon_epoch << endl; - epoch--; - } - } - } - - // bring up to date - if (epoch < osdmap.get_epoch()) - send_incremental(epoch, m->get_source_inst()); - - delete m; -} - - -void OSDMonitor::issue_leases() -{ - dout(10) << "issue_leases" << endl; - assert(mon->is_leader()); - - // set lease endpoint - lease_expire = g_clock.now(); - lease_expire += g_conf.mon_lease; - - pending_ack.clear(); - - for (set::iterator i = mon->quorum.begin(); - i != mon->quorum.end(); - i++) { - if (*i == mon->whoami) continue; - messenger->send_message(new MMonOSDMapLease(osdmap.get_epoch(), lease_expire), - mon->monmap->get_inst(*i)); - pending_ack.insert(*i); - } -} - -void OSDMonitor::handle_lease(MMonOSDMapLease *m) -{ - if (m->get_epoch() != osdmap.get_epoch() + 1) { - dout(10) << "map_lease from " << m->get_source() - << " on epoch " << m->get_epoch() << ", but i am " << osdmap.get_epoch() << endl; - assert(0); - delete m; - return; - } - - dout(10) << "map_lease from " << m->get_source() << " expires " << lease_expire << endl; - lease_expire = m->get_lease_expire(); - - delete m; -} - -void OSDMonitor::handle_lease_ack(MMonOSDMapLeaseAck *m) -{ - // right epoch? - if (m->get_epoch() != osdmap.get_epoch()) { - dout(10) << "map_lease_ack from " << m->get_source() - << " on old epoch " << m->get_epoch() << ", dropping" << endl; - delete m; - return; - } - - // within time limit? 
- if (g_clock.now() >= lease_expire) { - dout(10) << "map_lease_ack from " << m->get_source() - << ", but lease expired, calling election" << endl; - mon->call_election(); - delete m; - return; - } - - assert(m->get_source().is_mon()); - int from = m->get_source().num(); - - assert(pending_ack.count(from)); - pending_ack.erase(from); - - if (pending_ack.empty()) { - dout(10) << "map_lease_ack from " << m->get_source() - << ", last one" << endl; - } else { - dout(10) << "map_lease_ack from " << m->get_source() - << ", still waiting on " << pending_ack << endl; - } - - delete m; -} - - -void OSDMonitor::update_map() -{ - // lock map - state = STATE_UPDATING; - pending_ack.clear(); - - // set lease endpoint - lease_expire += g_conf.mon_lease; - - // send prepare - epoch_t epoch = osdmap.get_epoch(); - bufferlist map_bl, inc_map_bl; - if (!get_inc_map_bl(epoch, inc_map_bl)) - get_map_bl(epoch, map_bl); - - for (set::iterator i = mon->quorum.begin(); - i != mon->quorum.end(); - i++) { - if (*i == mon->whoami) continue; - messenger->send_message(new MMonOSDMapUpdatePrepare(epoch, - map_bl, inc_map_bl), - mon->monmap->get_inst(*i)); - pending_ack.insert(*i); - } -} - - - -void OSDMonitor::handle_update_prepare(MMonOSDMapUpdatePrepare *m) -{ - dout(10) << "map_update_prepare from " << m->get_source() << " epoch " << m->get_epoch() << endl; - // accept map - assert(m->get_epoch() == osdmap.get_epoch() + 1); - - if (m->inc_map_bl.length()) { - int off = 0; - pending_inc.decode(m->inc_map_bl, off); - accept_pending(); - } else { - osdmap.decode(m->map_bl); - } - - // state - state = STATE_LOCK; - //lease_expire = m->lease_expire; - - // ack - messenger->send_message(new MMonOSDMapUpdateAck(osdmap.get_epoch()), - m->get_source_inst()); - delete m; -} - -void OSDMonitor::handle_update_ack(MMonOSDMapUpdateAck *m) -{ - /* - // right epoch? 
- if (m->get_epoch() != osdmap.get_epoch()) { - dout(10) << "map_update_ack from " << m->get_source() - << " on old epoch " << m->get_epoch() << ", dropping" << endl; - delete m; - return; - } - - // within time limit? - if (g_clock.now() >= lease_expire) { - dout(10) << "map_update_ack from " << m->get_source() - << ", but lease expired, calling election" << endl; - state = STATE_SYNC; - mon->call_election(); - return; - } - - assert(m->get_source().is_mon()); - int from = m->get_source().num(); - - assert(pending_lease_ack.count(from)); - pending_lease_ack.erase(from); - - if (pending_lease_ack.empty()) { - dout(10) << "map_update_ack from " << m->get_source() - << ", last one" << endl; - state = STATE_SYNC; - - // send lease commit - for (map::iterator i = mon->quorum.begin(); - i != mon->quorum.end(); - i++) { - if (i == mon->whoami) continue; - messenger->send_message(new MMonOSDMapLeaseCommit(osdmap), - MSG_ADDR_MON(*i), mon->monmap->get_inst(*i)); - } - } else { - dout(10) << "map_update_ack from " << m->get_source() - << ", still waiting on " << pending_lease_ack << endl; - } -*/ -} - -void OSDMonitor::handle_update_commit(MMonOSDMapUpdateCommit *m) -{ -} diff --git a/branches/marnberg/quota/mon/OSDMonitor.h b/branches/marnberg/quota/mon/OSDMonitor.h deleted file mode 100644 index bf393f17d9f7a..0000000000000 --- a/branches/marnberg/quota/mon/OSDMonitor.h +++ /dev/null @@ -1,110 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __OSDMONITOR_H -#define __OSDMONITOR_H - -#include -#include -using namespace std; - -#include "include/types.h" -#include "msg/Messenger.h" - -#include "osd/OSDMap.h" - -class Monitor; - -class OSDMonitor : public Dispatcher { - Monitor *mon; - Messenger *messenger; - Mutex &lock; - - // osd maps -public: - OSDMap osdmap; - -private: - map > awaiting_map; - - void create_initial(); - bool get_map_bl(epoch_t epoch, bufferlist &bl); - bool get_inc_map_bl(epoch_t epoch, bufferlist &bl); - - void save_map(); - void save_inc_map(OSDMap::Incremental &inc); - - // [leader] - OSDMap::Incremental pending_inc; - map down_pending_out; // osd down -> out - - set pending_ack; - - // we are distributed - const static int STATE_INIT = 0; // startup - const static int STATE_SYNC = 1; // sync map copy (readonly) - const static int STATE_LOCK = 2; // [peon] map locked - const static int STATE_UPDATING = 3; // [leader] map locked, waiting for peon ack - - int state; - utime_t lease_expire; // when lease expires - - //void init(); - - // maps - void accept_pending(); // accept pending, new map. - void send_waiting(); // send current map to waiters. 
- void send_full(entity_inst_t dest); - void send_incremental(epoch_t since, entity_inst_t dest); - void bcast_latest_mds(); - void bcast_latest_osd(); - - void update_map(); - - void handle_osd_boot(class MOSDBoot *m); - void handle_osd_in(class MOSDIn *m); - void handle_osd_out(class MOSDOut *m); - void handle_osd_failure(class MOSDFailure *m); - void handle_osd_getmap(class MOSDGetMap *m); - - void handle_info(class MMonOSDMapInfo*); - void handle_lease(class MMonOSDMapLease*); - void handle_lease_ack(class MMonOSDMapLeaseAck*); - void handle_update_prepare(class MMonOSDMapUpdatePrepare*); - void handle_update_ack(class MMonOSDMapUpdateAck*); - void handle_update_commit(class MMonOSDMapUpdateCommit*); - - public: - OSDMonitor(Monitor *mn, Messenger *m, Mutex& l) : - mon(mn), messenger(m), lock(l), - state(STATE_SYNC) { - //init(); - } - - void dispatch(Message *m); - void tick(); // check state, take actions - - void election_starting(); // abort whatever. - void election_finished(); // reinitialize whatever. - - void issue_leases(); - - void mark_all_down(); - - void fake_osd_failure(int osd, bool down); - void fake_osdmap_update(); - void fake_reorg(); -}; - -#endif diff --git a/branches/marnberg/quota/mon/Paxos.cc b/branches/marnberg/quota/mon/Paxos.cc deleted file mode 100644 index 67c4e2e99e179..0000000000000 --- a/branches/marnberg/quota/mon/Paxos.cc +++ /dev/null @@ -1,182 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "Paxos.h" -#include "Monitor.h" -#include "MonitorStore.h" - -#include "messages/MMonPaxos.h" - -#include "config.h" -#undef dout -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".paxos(" << machine_name << ") " -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".paxos(" << machine_name << ") " - - -// --------------------------------- -// proposer -void Paxos::propose(version_t v, bufferlist& value) -{ -//todo high rf -} - -void Paxos::handle_last(MMonPaxos *m) -{ -//todo high rf - dout(10) << "handle_last " << *m << endl; - delete m; -} - -void Paxos::handle_accept(MMonPaxos *m) -{ -//todo high rf - dout(10) << "handle_accept " << *m << endl; - delete m; - -} - -void Paxos::handle_ack(MMonPaxos *m) -{ -//todo high rf - dout(10) << "handle_ack " << *m << endl; - delete m; -} - -void Paxos::handle_old_round(MMonPaxos *m) -{ -//todo high rf - dout(10) << "handle_old_round " << *m << endl; - delete m; -} - - -/* - * return a globally unique, monotonically increasing proposal number - */ -version_t Paxos::get_new_proposal_number(version_t gt) -{ - // read last - version_t last = mon->store->get_int("last_paxos_proposal"); - if (last < gt) - last = gt; - - // update - last /= 100; - last++; - - // make it unique among all monitors. 
- version_t pn = last*100 + (version_t)whoami; - - // write - mon->store->put_int(pn, "last_paxos_proposal"); - - dout(10) << "get_new_proposal_number = " << pn << endl; - return pn; -} - - -// --------------------------------- -// accepter -void Paxos::handle_collect(MMonPaxos *m) -{ -//todo high rf - // ... - - delete m; -} - - - - -// --------------------------------- -// learner -void Paxos::handle_success(MMonPaxos *m) -{ - //todo high rf - delete m; -} - -void Paxos::handle_begin(MMonPaxos *m) -{ - //todo high rf - delete m; -} - -// --------------------------------- - -void Paxos::leader_start() -{ - dout(10) << "i am the leader" << endl; - - // .. do something else too - version_t pn = get_new_proposal_number(); - for (int i=0; imonmap->num_mon; ++i) { - if (i == whoami) continue; - // todo high rf I pass the pn twice... what is the last parameter for? - mon->messenger->send_message(new MMonPaxos(MMonPaxos::OP_COLLECT, whoami, pn, pn), - mon->monmap->get_inst(i)); - } -} - - - -void Paxos::dispatch(Message *m) -{ - switch (m->get_type()) { - - case MSG_MON_PAXOS: - { - MMonPaxos *pm = (MMonPaxos*)m; - - // NOTE: these ops are defined in messages/MMonPaxos.h - switch (pm->op) { - // learner - case MMonPaxos::OP_COLLECT: - handle_collect(pm); - break; - - case MMonPaxos::OP_LAST: - handle_last(pm); - break; - - case MMonPaxos::OP_OLDROUND: - handle_old_round(pm); - break; - - case MMonPaxos::OP_BEGIN: - handle_begin(pm); - break; - - case MMonPaxos::OP_ACCEPT: - handle_accept(pm); - break; - - case MMonPaxos::OP_SUCCESS: - handle_success(pm); - break; - - case MMonPaxos::OP_ACK: - handle_ack(pm); - break; - - default: - assert(0); - } - } - break; - - default: - assert(0); - } -} - diff --git a/branches/marnberg/quota/mon/Paxos.h b/branches/marnberg/quota/mon/Paxos.h deleted file mode 100644 index 52a509d25aa76..0000000000000 --- a/branches/marnberg/quota/mon/Paxos.h +++ /dev/null @@ -1,73 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; 
indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MON_PAXOS_H -#define __MON_PAXOS_H - -#include "include/types.h" -#include "include/buffer.h" -#include "msg/Message.h" - -#include "include/Context.h" - -#include "common/Timer.h" - -class Monitor; -class MMonPaxos; - -// i am one state machine. -class Paxos { - Monitor *mon; - int whoami; - - // my state machine info - int machine_id; - const char *machine_name; - map accepted_values; - map accepted_proposal_number; - - // proposer - void propose(version_t v, bufferlist& value); - - void handle_last(MMonPaxos*); - void handle_accept(MMonPaxos*); - void handle_ack(MMonPaxos*); - void handle_old_round(MMonPaxos*); - - version_t get_new_proposal_number(version_t gt=0); - - // accepter - void handle_collect(MMonPaxos*); - - // learner - void handle_success(MMonPaxos*); - void handle_begin(MMonPaxos*); - - -public: - Paxos(Monitor *m, int w, - int mid,const char *mnm) : mon(m), whoami(w), - machine_id(mid), machine_name(mnm) { - } - - void dispatch(Message *m); - - void leader_start(); - -}; - - - -#endif - diff --git a/branches/marnberg/quota/msg/Dispatcher.cc b/branches/marnberg/quota/msg/Dispatcher.cc deleted file mode 100644 index edee54a2c631f..0000000000000 --- a/branches/marnberg/quota/msg/Dispatcher.cc +++ /dev/null @@ -1,27 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "Dispatcher.h" -#include "Messenger.h" - -#include "mds/MDS.h" - -/* -int Dispatcher::send_message(Message *m, msg_addr_t dest, int dest_port) -{ - assert(0); - //return dis_messenger->send_message(m, dest, dest_port, MDS_PORT_SERVER); // on my port! -} -*/ diff --git a/branches/marnberg/quota/msg/Dispatcher.h b/branches/marnberg/quota/msg/Dispatcher.h deleted file mode 100644 index 8b6fe92381427..0000000000000 --- a/branches/marnberg/quota/msg/Dispatcher.h +++ /dev/null @@ -1,33 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __DISPATCHER_H -#define __DISPATCHER_H - -#include "Message.h" - -class Messenger; - -class Dispatcher { - public: - virtual ~Dispatcher() { } - - // how i receive messages - virtual void dispatch(Message *m) = 0; - - // how i deal with transmission failures. - virtual void ms_handle_failure(Message *m, const entity_inst_t& inst) { delete m; } -}; - -#endif diff --git a/branches/marnberg/quota/msg/FakeMessenger.cc b/branches/marnberg/quota/msg/FakeMessenger.cc deleted file mode 100644 index 2aa6c6b06b75b..0000000000000 --- a/branches/marnberg/quota/msg/FakeMessenger.cc +++ /dev/null @@ -1,338 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "Message.h" -#include "FakeMessenger.h" -#include "mds/MDS.h" - -#include "common/Timer.h" - -#include "common/LogType.h" -#include "common/Logger.h" - -#include "config.h" - -#undef dout -#define dout(x) if ((x) <= g_conf.debug_ms) cout << g_clock.now() << " " - - - -#include -#include -#include -#include -#include - -using namespace std; - -#include -using namespace __gnu_cxx; - - -#include "common/Cond.h" -#include "common/Mutex.h" -#include - - -// global queue. - -int nranks = 0; // this identify each entity_inst_t - -map directory; -hash_map loggers; -LogType fakemsg_logtype; - -set shutdown_set; - -Mutex lock; -Cond cond; - -bool awake = false; -bool fm_shutdown = false; -pthread_t thread_id; - - - - -void *fakemessenger_thread(void *ptr) -{ - lock.Lock(); - while (1) { - dout(20) << "thread waiting" << endl; - if (fm_shutdown) break; - awake = false; - cond.Wait(lock); - awake = true; - dout(20) << "thread woke up" << endl; - if (fm_shutdown) break; - - fakemessenger_do_loop_2(); - - if (directory.empty()) break; - } - lock.Unlock(); - - dout(1) << "thread finish (i woke up but no messages, bye)" << endl; - return 0; -} - - -void fakemessenger_startthread() { - pthread_create(&thread_id, NULL, fakemessenger_thread, 0); -} - -void fakemessenger_stopthread() { - cout << "fakemessenger_stopthread setting stop flag" << endl; - lock.Lock(); - fm_shutdown = true; - lock.Unlock(); - cond.Signal(); - - fakemessenger_wait(); -} - -void fakemessenger_wait() -{ - cout << "fakemessenger_wait waiting" << endl; - void *ptr; - pthread_join(thread_id, &ptr); -} - - - - -// lame main looper - -int fakemessenger_do_loop() -{ - lock.Lock(); - fakemessenger_do_loop_2(); - lock.Unlock(); - - g_timer.shutdown(); - return 0; -} - - -int fakemessenger_do_loop_2() -{ - //lock.Lock(); - dout(18) << "do_loop begin." 
<< endl; - - while (1) { - bool didone = false; - - dout(18) << "do_loop top" << endl; - - // messages - map::iterator it = directory.begin(); - while (it != directory.end()) { - FakeMessenger *mgr = it->second; - - dout(18) << "messenger " << mgr << " at " << mgr->get_myname() << " has " << mgr->num_incoming() << " queued" << endl; - - if (!mgr->is_ready()) { - dout(18) << "messenger " << mgr << " at " << mgr->get_myname() << " has no dispatcher, skipping" << endl; - it++; - continue; - } - - Message *m = mgr->get_message(); - it++; - - if (m) { - //dout(18) << "got " << m << endl; - dout(1) << "---- " << m->get_dest() - << " <- " << m->get_source() - << " ---- " << *m - << endl; - - if (g_conf.fakemessenger_serialize) { - // encode - if (m->empty_payload()) - m->encode_payload(); - msg_envelope_t env = m->get_envelope(); - bufferlist bl; - bl.claim( m->get_payload() ); - //bl.c_str(); // condense into 1 buffer - - delete m; - - // decode - m = decode_message(env, bl); - assert(m); - } - - didone = true; - - lock.Unlock(); - mgr->dispatch(m); - lock.Lock(); - } - } - - // deal with shutdowns.. dleayed to avoid concurrent directory modification - if (!shutdown_set.empty()) { - for (set::iterator it = shutdown_set.begin(); - it != shutdown_set.end(); - it++) { - dout(7) << "fakemessenger: removing " << *it << " from directory" << endl; - assert(directory.count(*it)); - directory.erase(*it); - if (directory.empty()) { - dout(1) << "fakemessenger: last shutdown" << endl; - ::fm_shutdown = true; - } - } - shutdown_set.clear(); - } - - if (!didone) - break; - } - - - dout(18) << "do_loop end (no more messages)." 
<< endl; - //lock.Unlock(); - return 0; -} - - -FakeMessenger::FakeMessenger(entity_name_t me) : Messenger(me) -{ - lock.Lock(); - { - // assign rank - _myinst.name = me; - _myinst.addr.port = nranks++; - //if (!me.is_mon()) - //_myinst.addr.nonce = getpid(); - - // add to directory - directory[ _myinst.addr ] = this; - } - lock.Unlock(); - - - cout << "fakemessenger " << get_myname() << " messenger is " << this << " at " << _myinst << endl; - - qlen = 0; - - /* - string name; - name = "m."; - name += MSG_ADDR_TYPE(myaddr); - int w = MSG_ADDR_NUM(myaddr); - if (w >= 1000) name += ('0' + ((w/1000)%10)); - if (w >= 100) name += ('0' + ((w/100)%10)); - if (w >= 10) name += ('0' + ((w/10)%10)); - name += ('0' + ((w/1)%10)); - - loggers[ myaddr ] = new Logger(name, (LogType*)&fakemsg_logtype); - */ -} - -FakeMessenger::~FakeMessenger() -{ - // hose any undelivered messages - for (list::iterator p = incoming.begin(); - p != incoming.end(); - ++p) - delete *p; -} - - -int FakeMessenger::shutdown() -{ - //cout << "shutdown on messenger " << this << " has " << num_incoming() << " queued" << endl; - lock.Lock(); - assert(directory.count(_myinst.addr) == 1); - shutdown_set.insert(_myinst.addr); - - /* - if (loggers[myaddr]) { - delete loggers[myaddr]; - loggers.erase(myaddr); - } - */ - - lock.Unlock(); - return 0; -} - - -void FakeMessenger::reset_myname(entity_name_t m) -{ - dout(1) << "reset_myname from " << get_myname() << " to " << m << endl; - _set_myname(m); - - directory.erase(_myinst.addr); - _myinst.name = m; - directory[_myinst.addr] = this; - -} - - -int FakeMessenger::send_message(Message *m, entity_inst_t inst, int port, int fromport) -{ - entity_name_t dest = inst.name; - - m->set_source(get_myname(), fromport); - m->set_source_addr(get_myaddr()); - - m->set_dest(inst.name, port); - - lock.Lock(); - -#ifdef LOG_MESSAGES - // stats - loggers[get_myaddr()]->inc("+send",1); - loggers[dest]->inc("-recv",1); - - char s[20]; - sprintf(s,"+%s", m->get_type_name()); - 
loggers[get_myaddr()]->inc(s); - sprintf(s,"-%s", m->get_type_name()); - loggers[dest]->inc(s); -#endif - - // queue - if (directory.count(inst.addr)) { - dout(1) << "--> " << get_myname() << " -> " << inst.name << " " << *m << endl; - directory[inst.addr]->queue_incoming(m); - } else { - dout(0) << "--> " << get_myname() << " -> " << inst.name << " " << *m - << " *** destination DNE ***" << endl; - for (map::iterator p = directory.begin(); - p != directory.end(); - ++p) { - dout(0) << "** have " << p->first << " to " << p->second << endl; - } - //assert(dm); - delete m; - } - - // wake up loop? - if (!awake) { - dout(10) << "waking up fakemessenger thread" << endl; - cond.Signal(); - lock.Unlock(); - } else - lock.Unlock(); - - return 0; -} - - diff --git a/branches/marnberg/quota/msg/FakeMessenger.h b/branches/marnberg/quota/msg/FakeMessenger.h deleted file mode 100644 index 13cd6f95326d1..0000000000000 --- a/branches/marnberg/quota/msg/FakeMessenger.h +++ /dev/null @@ -1,88 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __FAKEMESSENGER_H -#define __FAKEMESSENGER_H - -#include "Messenger.h" -#include "Dispatcher.h" - -#include -#include - -class Timer; - -class FakeMessenger : public Messenger { - protected: - class Logger *logger; - - int qlen; - list incoming; // incoming queue - - entity_inst_t _myinst; - - public: - FakeMessenger(entity_name_t me); - ~FakeMessenger(); - - virtual int shutdown(); - - const entity_inst_t& get_myinst() { - return _myinst; - }; - const entity_addr_t& get_myaddr() { - return _myinst.addr; - } - - void reset_myname(entity_name_t m); - - // msg interface - virtual int send_message(Message *m, entity_inst_t dest, int port=0, int fromport=0); - - // events - //virtual void trigger_timer(Timer *t); - - int get_dispatch_queue_len() { return qlen; } - - // -- incoming queue -- - // (that nothing uses) - Message *get_message() { - if (!incoming.empty()) { - Message *m = incoming.front(); - incoming.pop_front(); - qlen--; - return m; - } - return NULL; - } - bool queue_incoming(Message *m) { - incoming.push_back(m); - qlen++; - return true; - } - int num_incoming() { - //return incoming.size(); - return qlen; - } - -}; - -int fakemessenger_do_loop(); -int fakemessenger_do_loop_2(); -void fakemessenger_startthread(); -void fakemessenger_stopthread(); -void fakemessenger_wait(); - -#endif diff --git a/branches/marnberg/quota/msg/HostMonitor.cc b/branches/marnberg/quota/msg/HostMonitor.cc deleted file mode 100644 index 44ab35a9fcc10..0000000000000 --- a/branches/marnberg/quota/msg/HostMonitor.cc +++ /dev/null @@ -1,235 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "HostMonitor.h" - -#include "msg/Message.h" -#include "msg/Messenger.h" - -#include "messages/MPing.h" -#include "messages/MPingAck.h" -#include "messages/MFailure.h" -#include "messages/MFailureAck.h" - -#include "common/Timer.h" -#include "common/Clock.h" - -#define DBL 10 - - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug) cout << whoami << " hostmon: " - - -// timer contexts - -class C_HM_InitiateHeartbeat : public Context { - HostMonitor *hm; -public: - C_HM_InitiateHeartbeat(HostMonitor *hm) { - this->hm = hm; - } - void finish(int r) { - //cout << "HEARTBEAT" << endl; - hm->pending_events.erase(this); - hm->initiate_heartbeat(); - } -}; - -class C_HM_CheckHeartbeat : public Context { - HostMonitor *hm; -public: - C_HM_CheckHeartbeat(HostMonitor *hm) { - this->hm = hm; - } - void finish(int r) { - //cout << "CHECK" << endl; - hm->pending_events.erase(this); - hm->check_heartbeat(); - } -}; - - - -// startup/shutdown - -void HostMonitor::init() -{ - dout(DBL) << "init" << endl; - - // hack params for now - heartbeat_interval = 10; - max_ping_time = 2; - max_heartbeat_misses = 3; - notify_retry_interval = 10; - - // schedule first hb - schedule_heartbeat(); -} - - -void HostMonitor::shutdown() -{ - // cancel any events - for (set::iterator it = pending_events.begin(); - it != pending_events.end(); - it++) { - g_timer.cancel_event(*it); - delete *it; - } - pending_events.clear(); -} - - -// schedule next heartbeat - -void HostMonitor::schedule_heartbeat() -{ - dout(DBL) << "schedule_heartbeat" << endl; - Context *e = new C_HM_InitiateHeartbeat(this); - pending_events.insert(e); - g_timer.add_event_after(heartbeat_interval, e); -} - - -// take note of a live host - -void HostMonitor::host_is_alive(entity_name_t host) -{ - if (hosts.count(host)) - status[host].last_heard_from = g_clock.gettime(); -} - - -// do heartbeat - -void HostMonitor::initiate_heartbeat() -{ - time_t now = g_clock.gettime(); - - // send out 
pings - inflight_pings.clear(); - for (set::iterator it = hosts.begin(); - it != hosts.end(); - it++) { - // have i heard from them recently? - if (now - status[*it].last_heard_from < heartbeat_interval) { - dout(DBL) << "skipping " << *it << ", i heard from them recently" << endl; - } else { - dout(DBL) << "pinging " << *it << endl; - status[*it].last_pinged = now; - inflight_pings.insert(*it); - - messenger->send_message(new MPing(1), *it, 0); - } - } - - // set timer to check results - Context *e = new C_HM_CheckHeartbeat(this); - pending_events.insert(e); - g_timer.add_event_after(max_ping_time, e); - dout(10) << "scheduled check " << e << endl; - - schedule_heartbeat(); // schedule next heartbeat -} - - -// check results - -void HostMonitor::check_heartbeat() -{ - dout(DBL) << "check_heartbeat()" << endl; - - // check inflight pings - for (set::iterator it = inflight_pings.begin(); - it != inflight_pings.end(); - it++) { - status[*it].num_heartbeats_missed++; - - dout(DBL) << "no response from " << *it << " for " << status[*it].num_heartbeats_missed << " beats" << endl; - - if (status[*it].num_heartbeats_missed >= max_heartbeat_misses) { - if (acked_failures.count(*it)) { - dout(DBL) << *it << " is already failed" << endl; - } else { - if (unacked_failures.count(*it)) { - dout(DBL) << *it << " is already failed, but unacked, sending another failure message" << endl; - } else { - dout(DBL) << "failing " << *it << endl; - unacked_failures.insert(*it); - } - - /*if (false) // do this in NewMessenger for now! FIXME - for (set::iterator nit = notify.begin(); - nit != notify.end(); - nit++) { - messenger->send_message(new MFailure(*it, messenger->get_inst(*it)), - *nit, notify_port, 0); - } - */ - } - } - } - - // forget about the pings. 
- inflight_pings.clear(); -} - - -// incoming messages - -void HostMonitor::proc_message(Message *m) -{ - switch (m->get_type()) { - - case MSG_PING_ACK: - handle_ping_ack((MPingAck*)m); - break; - - case MSG_FAILURE_ACK: - handle_failure_ack((MFailureAck*)m); - break; - - } -} - -void HostMonitor::handle_ping_ack(MPingAck *m) -{ - entity_name_t from = m->get_source(); - - dout(DBL) << "ping ack from " << from << endl; - status[from].last_pinged = g_clock.gettime(); - status[from].num_heartbeats_missed = 0; - inflight_pings.erase(from); - - delete m; -} - -void HostMonitor::handle_failure_ack(MFailureAck *m) -{ - - // FIXME: this doesn't handle failed -> alive transitions gracefully at all.. - - // the higher-up's acknowledged our failure notification, we can stop resending it. - entity_name_t failed = m->get_failed(); - dout(DBL) << "handle_failure_ack " << failed << endl; - unacked_failures.erase(failed); - acked_failures.insert(failed); - - delete m; -} - - diff --git a/branches/marnberg/quota/msg/HostMonitor.h b/branches/marnberg/quota/msg/HostMonitor.h deleted file mode 100644 index fffe798b71450..0000000000000 --- a/branches/marnberg/quota/msg/HostMonitor.h +++ /dev/null @@ -1,97 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __HOSTMONITOR_H -#define __HOSTMONITOR_H - -#include - -#include -#include -using namespace std; - -#include "include/Context.h" -#include "msg/Message.h" - -class Message; -class Messenger; - -typedef struct { - time_t last_heard_from; - time_t last_pinged; - int num_heartbeats_missed; -} monitor_rec_t; - -class HostMonitor { - Messenger *messenger; - string whoami; - - // hosts i monitor - set hosts; - - // who i tell when they fail - set notify; - int notify_port; - - // their status - map status; - - set inflight_pings; // pings we sent that haven't replied yet - - set unacked_failures; // failed hosts that haven't been acked yet. - set acked_failures; // these failures have been acked. - - float heartbeat_interval; // how often to do a heartbeat - float max_ping_time; // how long before it's a miss - int max_heartbeat_misses; // how many misses before i tell - float notify_retry_interval; // how often to retry failure notification - - public: - set pending_events; - - private: - void schedule_heartbeat(); - - public: - HostMonitor(Messenger *m, string& whoami) { - this->messenger = m; - this->whoami = whoami; - notify_port = 0; - } - set& get_hosts() { return hosts; } - set& get_notify() { return notify; } - void set_notify_port(int p) { notify_port = p; } - - void remove_host(entity_name_t h) { - hosts.erase(h); - status.erase(h); - unacked_failures.erase(h); - acked_failures.erase(h); - } - - void init(); - void shutdown(); - - void host_is_alive(entity_name_t who); - - void proc_message(Message *m); - void handle_ping_ack(class MPingAck *m); - void handle_failure_ack(class MFailureAck *m); - - void initiate_heartbeat(); - void check_heartbeat(); - -}; - -#endif diff --git a/branches/marnberg/quota/msg/MPIMessenger.cc b/branches/marnberg/quota/msg/MPIMessenger.cc deleted file mode 100644 index 6c4e65d063fc9..0000000000000 --- a/branches/marnberg/quota/msg/MPIMessenger.cc +++ /dev/null @@ -1,608 +0,0 @@ -// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "config.h" -#include "include/error.h" - -#include "common/Timer.h" -#include "common/Mutex.h" - -#include "MPIMessenger.h" -#include "Message.h" - -#include -#include -using namespace std; -#include -using namespace __gnu_cxx; - -#include -#include - -/* - * We make a directory, so that we can have multiple Messengers in the - * same process (rank). This is useful for benchmarking and creating lots of - * simulated clients, e.g. - */ - -hash_map directory; -list outgoing, incoming; -list unfinished_sends; -map unfinished_send_message; - -/* this process */ -int mpi_world; -int mpi_rank; -bool mpi_done = false; // set this flag to stop the event loop - - -#define FUNNEL_MPI // if we want to funnel mpi through a single thread -#define TAG_UNSOLICITED 0 -#define DBLVL 18 - -// the key used to fetch the tag for the current thread. -pthread_key_t tag_key; -pthread_t thread_id = 0; // thread id of the event loop. init value == nobody - -Mutex sender_lock; -Mutex out_queue_lock; - -bool pending_timer; - - -// our lock for any common data; it's okay to have only the one global mutex -// because our common data isn't a whole lot. -//static pthread_mutex_t mutex; - -// the number of distinct threads we've seen so far; used to generate -// a unique tag for each thread. -//static int nthreads = 10; - -//#define TAG_UNSOLICITED 0 - -// debug -#undef dout -#define dout(l) if (l<=g_conf.debug) cout << "[MPI " << mpi_rank << "/" << mpi_world << " " << getpid() << "." << pthread_self() << "] " - - - -/***** - * MPI global methods for process-wide startup, shutdown. 
- */ - -int mpimessenger_init(int& argc, char**& argv) -{ - MPI_Init(&argc, &argv); - - MPI_Comm_size(MPI_COMM_WORLD, &mpi_world); - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - - char hostname[100]; - gethostname(hostname,100); - int pid = getpid(); - - dout(12) << "init: i am " << hostname << " pid " << pid << endl; - - assert(mpi_world > g_conf.num_osd+g_conf.num_mds); - - return mpi_rank; -} - -int mpimessenger_shutdown() -{ - dout(5) << "mpimessenger_shutdown barrier waiting for all to finish" << endl; - MPI_Barrier (MPI_COMM_WORLD); - dout(1) << "mpimessenger_shutdown all done, MPI_Finalize()" << endl; - MPI_Finalize(); - return 0; -} - -int mpimessenger_world() -{ - return mpi_world; -} - - - -/*** - * internal send/recv - */ - - -/* - * get fresh MPI_Request* (on heap) for a new async MPI_Isend - */ - -MPI_Request *mpi_prep_send_req() { - MPI_Request *req = new MPI_Request; - unfinished_sends.push_back(req); - dout(DBLVL) << "prep_send_req " << req << endl; - return req; -} - - -/* - * clean up MPI_Request*'s for Isends that have completed. - * also, hose any associated Message*'s for Messages that are completely sent. - * - * if wait=true, block and wait for sends to finish. 
- */ - -void mpi_reap_sends(bool wait=false) { - sender_lock.Lock(); - - list::iterator it = unfinished_sends.begin(); - while (it != unfinished_sends.end()) { - MPI_Status status; - int flag; - - if (wait) { - MPI_Wait(*it, &status); - } else { - MPI_Test(*it, &flag, &status); - if (!flag) break; // not finished yet - } - - dout(DBLVL) << "send " << *it << " completed" << endl; - - if (unfinished_send_message.count(*it)) { - dout(DBLVL) << "send message " << unfinished_send_message[*it] << " completed" << endl; - delete unfinished_send_message[*it]; - unfinished_send_message.erase(*it); - } - - delete *it; - it++; - unfinished_sends.pop_front(); - } - - dout(DBLVL) << "reap has " << unfinished_sends.size() << " Isends outstanding, " << unfinished_send_message.size() << " messages" << endl; - - sender_lock.Unlock(); -} - - -void mpi_finish_sends() { - mpi_reap_sends(true); -} - - -/* - * recv a Message* - */ -Message *mpi_recv(int tag) -{ - // envelope - dout(DBLVL) << "mpi_recv waiting for message tag " << tag << endl; - - MPI_Status status; - msg_envelope_t env; - - ASSERT(MPI_Recv((void*)&env, - sizeof(env), - MPI_CHAR, - MPI_ANY_SOURCE,// status.MPI_SOURCE,//MPI_ANY_SOURCE, - tag, - MPI_COMM_WORLD, - &status/*, - &recv_env_req*/) == MPI_SUCCESS); - assert(status.count == MSG_ENVELOPE_LEN); - - if (env.type == 0) { - dout(DBLVL) << "mpi_recv got type 0 message, kicked!" << endl; - return 0; - } - - dout(DBLVL) << "mpi_recv got envelope " << status.count << ", type=" << env.type << " src " << env.source << " dst " << env.dest << " nchunks=" << env.nchunks << " from " << status.MPI_SOURCE << endl; - - // payload - bufferlist blist; - for (int i=0; iget_dest(), mpi_world); - - // local? 
- if (rank == mpi_rank) { - dout(DBLVL) << "queuing local delivery" << endl; - incoming.push_back(m); - return 0; - } - - // marshall - if (m->empty_payload()) - m->encode_payload(); - msg_envelope_t *env = &m->get_envelope(); - env->nchunks = m->get_payload().buffers().size(); - - dout(7) << "sending " << *m << " to " << MSG_ADDR_NICE(env->dest) << " (rank " << rank << ")" << endl; - -#ifndef FUNNEL_MPI - sender_lock.Lock(); -#endif - - // send envelope - ASSERT(MPI_Isend((void*)env, - sizeof(*env), - MPI_CHAR, - rank, - tag, - MPI_COMM_WORLD, - mpi_prep_send_req()) == MPI_SUCCESS); - - // payload - int i = 0; - for (list::iterator it = m->get_payload().buffers().begin(); - it != m->get_payload().buffers().end(); - it++) { - dout(DBLVL) << "mpi_sending frag " << i << " len " << (*it).length() << endl; - //MPI_Request *req = new MPI_Request; - ASSERT(MPI_Isend((void*)(*it).c_str(), - (*it).length(), - MPI_CHAR, - rank, - tag, - MPI_COMM_WORLD, - mpi_prep_send_req()) == MPI_SUCCESS); - i++; - } - - // attach message to last send, so we can free it later - MPI_Request *req = unfinished_sends.back(); - unfinished_send_message[req] = m; - - dout(DBLVL) << "mpi_send done, attached message to Isend " << req << endl; - -#ifndef FUNNEL_MPI - sender_lock.Unlock(); -#endif - return 0; -} - - - -// get the tag for this thread - -#ifndef FUNNEL_MPI -static int get_thread_tag() -{ - int tag = (int)pthread_getspecific(tag_key); - - if (tag == 0) { - // first time this thread has performed MPI messaging - - if (pthread_mutex_lock(&mutex) < 0) - SYSERROR(); - - tag = ++nthreads; - - if (pthread_mutex_unlock(&mutex) < 0) - SYSERROR(); - - if (pthread_setspecific(tag_key, (void*)tag) < 0) - SYSERROR(); - } - - return tag; -} -#endif - - - -// recv event loop, for unsolicited messages. 
- -void* mpimessenger_loop(void*) -{ - dout(5) << "mpimessenger_loop start pid " << getpid() << endl; - - while (1) { - - // outgoing - mpi_reap_sends(); - -#ifdef FUNNEL_MPI - // check outgoing queue - out_queue_lock.Lock(); - if (outgoing.size()) { - dout(10) << outgoing.size() << " outgoing messages" << endl; - for (list::iterator it = outgoing.begin(); - it != outgoing.end(); - it++) { - mpi_send(*it, TAG_UNSOLICITED); - } - } - outgoing.clear(); - out_queue_lock.Unlock(); -#endif - - - // timer events? - if (pending_timer) { - dout(DBLVL) << "pending timer" << endl; - g_timer.execute_pending(); - } - - // done? - if (mpi_done && - incoming.empty() && - outgoing.empty() && - !pending_timer) break; - - - // incoming - Message *m = 0; - - if (incoming.size()) { - dout(12) << "loop pulling message off incoming" << endl; - m = incoming.front(); - incoming.pop_front(); - } - else { - // check mpi - dout(12) << "loop waiting for incoming messages" << endl; - - // get message - m = mpi_recv(TAG_UNSOLICITED); - } - - // dispatch? - if (m) { - int dest = m->get_dest(); - if (directory.count(dest)) { - Messenger *who = directory[ dest ]; - - dout(4) << "---- '" << m->get_type_name() << - "' from " << MSG_ADDR_NICE(m->get_source()) << ':' << m->get_source_port() << - " to " << MSG_ADDR_NICE(m->get_dest()) << ':' << m->get_dest_port() << " ---- " - << m - << endl; - - who->dispatch(m); - } else { - dout (1) << "---- i don't know who " << dest << " is." 
<< endl; - assert(0); - break; - } - } - - } - - dout(5) << "finishing async sends" << endl; - mpi_finish_sends(); - - g_timer.shutdown(); - - dout(5) << "mpimessenger_loop exiting loop" << endl; - return 0; -} - - -// start/stop mpi receiver thread (for unsolicited messages) -int mpimessenger_start() -{ - dout(5) << "starting thread" << endl; - - // start a thread - pthread_create(&thread_id, - NULL, - mpimessenger_loop, - 0); - return 0; -} - - -/* - * kick and wake up _loop (to pick up new outgoing message, or quit) - */ - -MPI_Request kick_req; -msg_envelope_t kick_env; - -void mpimessenger_kick_loop() -{ - // if we're same thread as the loop, no kicking necessary - if (pthread_self() == thread_id) return; - - kick_env.type = 0; - - sender_lock.Lock(); - ASSERT(MPI_Isend(&kick_env, // kick sync for now, but ONLY because it makes me feel safer. - sizeof(kick_env), - MPI_CHAR, - mpi_rank, - TAG_UNSOLICITED, - MPI_COMM_WORLD, - mpi_prep_send_req()) == MPI_SUCCESS); - sender_lock.Unlock(); -} - - -// stop thread - -void mpimessenger_stop() -{ - dout(5) << "mpimessenger_stop stopping thread" << endl; - - if (mpi_done) { - dout(1) << "mpimessenger_stop called, but already done!" << endl; - assert(!mpi_done); - } - - // set finish flag - mpi_done = true; - mpimessenger_kick_loop(); - - // wait for thread to stop - mpimessenger_wait(); -} - - -// wait for thread to finish - -void mpimessenger_wait() -{ - void *returnval; - dout(10) << "mpimessenger_wait waiting for thread to finished." << endl; - pthread_join(thread_id, &returnval); - dout(10) << "mpimessenger_wait thread finished." 
<< endl; -} - - - - -/*********** - * MPIMessenger class implementation - */ - -class C_MPIKicker : public Context { - void finish(int r) { - dout(DBLVL) << "timer kick" << endl; - mpimessenger_kick_loop(); - } -}; - -MPIMessenger::MPIMessenger(entity_name_t myaddr) : Messenger(myaddr) -{ - // my address - this->myaddr = myaddr; - - // register myself in the messenger directory - directory[myaddr] = this; - - // register to execute timer events - g_timer.set_messenger_kicker(new C_MPIKicker()); - - // logger - /* - string name; - name = "m."; - name += MSG_ADDR_TYPE(whoami); - int w = MSG_ADDR_NUM(whoami); - if (w >= 1000) name += ('0' + ((w/1000)%10)); - if (w >= 100) name += ('0' + ((w/100)%10)); - if (w >= 10) name += ('0' + ((w/10)%10)); - name += ('0' + ((w/1)%10)); - - logger = new Logger(name, (LogType*)&mpimsg_logtype); - loggers[ whoami ] = logger; - */ -} - -MPIMessenger::~MPIMessenger() -{ - //delete logger; -} - - -int MPIMessenger::shutdown() -{ - // remove me from the directory - directory.erase(myaddr); - - // no more timer events - g_timer.unset_messenger_kicker(); - - // last one? - if (directory.empty()) { - dout(10) << "shutdown last mpimessenger on rank " << mpi_rank << " shut down" << endl; - pthread_t whoami = pthread_self(); - - dout(15) << "whoami = " << whoami << ", thread = " << thread_id << endl; - if (whoami == thread_id) { - // i am the event loop thread, just set flag! - dout(15) << " set mpi_done=true" << endl; - mpi_done = true; - } else { - // i am a different thread, tell the event loop to stop. 
- dout(15) << " calling mpimessenger_stop()" << endl; - mpimessenger_stop(); - } - } else { - dout(10) << "shutdown still " << directory.size() << " other messengers on rank " << mpi_rank << endl; - } - return 0; -} - - - - -/*** - * public messaging interface - */ - - -/* note: send_message _MUST_ be non-blocking */ -int MPIMessenger::send_message(Message *m, entity_name_t dest, int port, int fromport) -{ - // set envelope - m->set_source(myaddr, fromport); - m->set_dest(dest, port); - -#ifdef FUNNEL_MPI - - // queue up - out_queue_lock.Lock(); - dout(DBLVL) << "queuing outgoing message " << *m << endl; - outgoing.push_back(m); - out_queue_lock.Unlock(); - - mpimessenger_kick_loop(); - -#else - - // send in this thread - mpi_send(m, m->get_pcid()); - -#endif - return 0; -} - - - - - - diff --git a/branches/marnberg/quota/msg/MPIMessenger.h b/branches/marnberg/quota/msg/MPIMessenger.h deleted file mode 100644 index 88e753de89749..0000000000000 --- a/branches/marnberg/quota/msg/MPIMessenger.h +++ /dev/null @@ -1,56 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MPIMESSENGER_H -#define __MPIMESSENGER_H - -#include "Messenger.h" -#include "Dispatcher.h" - -#define NUMMDS g_conf.num_mds -#define NUMOSD g_conf.num_osd -#define MPI_DEST_TO_RANK(dest,world) ((dest)<(NUMMDS+NUMOSD) ? 
\ - (dest) : \ - ((NUMMDS+NUMOSD)+(((dest)-NUMMDS-NUMOSD) % ((world)-NUMMDS-NUMOSD)))) - -class Timer; - -class MPIMessenger : public Messenger { - protected: - entity_name_t myaddr; // my address - //class Logger *logger; // for logging - - public: - MPIMessenger(entity_name_t myaddr); - ~MPIMessenger(); - - // init, shutdown MPI and associated event loop thread. - virtual int shutdown(); - - // message interface - virtual int send_message(Message *m, entity_name_t dest, int port=0, int fromport=0); -}; - -/** - * these are all ONE per process. - */ -extern int mpimessenger_world(); // get world size -extern int mpimessenger_init(int& argc, char**& argv); // init mpi -extern int mpimessenger_start(); // start thread -extern void mpimessenger_stop(); // stop thread. -extern void mpimessenger_wait(); // wait for thread to finish. -extern int mpimessenger_shutdown(); // finalize MPI - - -#endif diff --git a/branches/marnberg/quota/msg/MTMessenger.cc b/branches/marnberg/quota/msg/MTMessenger.cc deleted file mode 100644 index 02ab9981ff353..0000000000000 --- a/branches/marnberg/quota/msg/MTMessenger.cc +++ /dev/null @@ -1,197 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include -#include "mpi.h" - -#include "include/config.h" -#include "include/error.h" -#include "Messenger.h" -#include "MTMessenger.h" - -// This module uses MPI to implement a blocking sendrecv function that -// feels more like a procedure call and less like event processesing. -// -// Threads are not independently addressable in MPI, only processes -// are. 
However, MPI does include a user defined tag in the message -// envelope, and a reader may selectively read only messages with a -// matching tag. The modules assign an integer to each thread to use -// as the tag. -// - -// our lock for any common data; it's okay to have only the one global mutex -// because our common data isn't a whole lot. -static pthread_mutex_t mutex; - -// the key used to fetch the tag for the current thread. -pthread_key_t tag_key; - -// the number of distinct threads we've seen so far; used to generate -// a unique tag for each thread. -static int nthreads; - -// the MPI identity of this process -static int mpi_rank; - - -// get the tag for this thread -static int get_tag() -{ - int tag = (int)pthread_getspecific(tag_key); - - if (tag == 0) { - // first time this thread has performed MPI messaging - - if (pthread_mutex_lock(&mutex) < 0) - SYSERROR(); - - tag = ++nthreads; - - if (pthread_mutex_unlock(&mutex) < 0) - SYSERROR(); - - if (pthread_setspecific(tag_key, (void*)tag) < 0) - SYSERROR(); - } - - return tag; -} - - -// marshall a message and send it over MPI -static void send(Message *m, int rank, int tag) -{ - // marshall the message - crope r; - m->encode(r); - int size = r.length(); - - char *buf = (char*)r.c_str(); - ASSERT(MPI_Send(buf, - size, - MPI_CHAR, - rank, - tag, - MPI_COMM_WORLD) == MPI_SUCCESS); -} - -// read a message from MPI and unmarshall it -static Message *receive(int tag) -{ - MPI_Status status; - - // get message size - ASSERT(MPI_Probe(MPI_ANY_SOURCE, - tag, - MPI_COMM_WORLD, - &status) == MPI_SUCCESS); - - // get message; there may be multiple messages on the queue, we - // need to be sure to read the one which corresponds to size - // obtained above. 
- char *buf = new char[status.count]; - ASSERT(MPI_Recv(buf, - status.count, - MPI_CHAR, - status.MPI_SOURCE, - status.MPI_TAG, - MPI_COMM_WORLD, - &status) == MPI_SUCCESS); - - // unmarshall message - crope r(buf, status.count); - delete[] buf; - Message *m = decode_message(r); - - return m; -} - -MTMessenger::MTMessenger(int& argc, char**& argv) -{ - // setup MPI; MPI errors will probably invoke the default MPI error - // handler, which aborts the program with a friendly message rather - // than returning from a function; just in case, we abort the - // program if we get an MPI error. - - int provided; - ASSERT(MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided) - == MPI_SUCCESS); - - ASSERT(MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank) == MPI_SUCCESS); - - if (pthread_mutex_init(&mutex, NULL) < 0) - SYSERROR(); - - if (pthread_key_create(&tag_key, NULL) < 0) - SYSERROR(); - - nthreads = 0; -} - -MTMessenger::~MTMessenger() -{ - // ignore shutdown errors - - pthread_key_delete(tag_key); - - pthread_mutex_destroy(&mutex); - - MPI_Finalize(); -} - -// send a request and wait for the response -Message *MTMessenger::sendrecv(Message *m, entity_name_t dest) -{ - int dest_tag = 0; // servers listen for any tag - int my_tag = get_tag(); - - // set our envelope (not to be confused with the MPI envelope) - m->set_source(mpi_rank, my_tag); - m->set_dest(dest, dest_tag); - - send(m, dest, dest_tag); - - return receive(my_tag); -} - -// receive a request from anyone -Message *MTMessenger::recvreq() -{ - return receive(MPI_ANY_TAG); -} - -// forward request, masquerading as original source -void MTMessenger::fwdreq(Message *req, int dest) -{ - int dest_tag = 0; // servers listen for any tag - - // set our envelope (not to be confused with the MPI envelope) - req->set_dest(dest, dest_tag); - - send(req, dest, dest_tag); -} - -// send a response to the originator of the request -void MTMessenger::sendresp(Message *req, Message *resp) -{ - int req_rank = req->get_source(); 
- int req_tag = req->get_source_port(); - int my_tag = get_tag(); - - // set our envelope (not to be confused with the MPI envelope) - resp->set_source(mpi_rank, my_tag); - resp->set_dest(req_rank, req_tag); - - send(resp, req_rank, req_tag); -} diff --git a/branches/marnberg/quota/msg/MTMessenger.h b/branches/marnberg/quota/msg/MTMessenger.h deleted file mode 100644 index 477a39c60561d..0000000000000 --- a/branches/marnberg/quota/msg/MTMessenger.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MTMESSENGER_H -#define __MTMESSENGER_H - -#include "Message.h" -#include "SerialMessenger.h" - -// Marshall and unmarshall OBFS messages, send and receive them over -// MPI. - -class MTMessenger -{ -public: - // sets up the queues and internal thread; the MPI initialization - // will scan argc/argv for MPI specific flags and remove them from - // argc/argv. 
- MTMessenger(int &argc, char **&argv); - - // tears it all down - ~MTMessenger(); - - // send a request to a server and wait (block) for the response; - virtual Message *sendrecv(Message *m, entity_name_t dest); - - // wait (block) for a request from anyone - Message *recvreq(); - - // forward request, masquerading as original source - void fwdreq(Message *req, int dest); - - // send the response to the originator of the request - virtual void sendresp(Message *req, Message *resp); - - -}; // class MTMessenger - -#endif // __MTMESSENGER_H diff --git a/branches/marnberg/quota/msg/Message.cc b/branches/marnberg/quota/msg/Message.cc deleted file mode 100644 index ae01d9106ddaf..0000000000000 --- a/branches/marnberg/quota/msg/Message.cc +++ /dev/null @@ -1,466 +0,0 @@ - -#include -#include -using namespace std; - -#include "include/types.h" - -#include "Message.h" - -#include "messages/MGenericMessage.h" - -/* -#include "messages/MNSConnect.h" -#include "messages/MNSConnectAck.h" -#include "messages/MNSRegister.h" -#include "messages/MNSRegisterAck.h" -#include "messages/MNSLookup.h" -#include "messages/MNSLookupReply.h" -#include "messages/MNSFailure.h" -*/ - -#include "messages/MMonPaxos.h" - -#include "messages/MMonElectionAck.h" -#include "messages/MMonElectionPropose.h" -#include "messages/MMonElectionVictory.h" - -#include "messages/MPing.h" -#include "messages/MPingAck.h" -//#include "messages/MFailure.h" -//#include "messages/MFailureAck.h" - -#include "messages/MOSDBoot.h" -#include "messages/MOSDIn.h" -#include "messages/MOSDOut.h" -#include "messages/MOSDFailure.h" -#include "messages/MOSDPing.h" -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" -#include "messages/MOSDPGNotify.h" -#include "messages/MOSDPGQuery.h" -#include "messages/MOSDPGLog.h" -#include "messages/MOSDPGRemove.h" - -#include "messages/MClientBoot.h" -#include "messages/MClientMount.h" -#include 
"messages/MClientMountAck.h" -#include "messages/MClientRequest.h" -#include "messages/MClientReply.h" -#include "messages/MClientFileCaps.h" - -#include "messages/MMDSGetMap.h" -#include "messages/MMDSMap.h" -#include "messages/MMDSBeacon.h" -#include "messages/MMDSImportMap.h" -#include "messages/MMDSCacheRejoin.h" -#include "messages/MMDSCacheRejoinAck.h" - -#include "messages/MDirUpdate.h" -#include "messages/MDiscover.h" -#include "messages/MDiscoverReply.h" - -#include "messages/MExportDirDiscover.h" -#include "messages/MExportDirDiscoverAck.h" -#include "messages/MExportDirPrep.h" -#include "messages/MExportDirPrepAck.h" -#include "messages/MExportDirWarning.h" -#include "messages/MExportDir.h" -#include "messages/MExportDirNotify.h" -#include "messages/MExportDirNotifyAck.h" -#include "messages/MExportDirFinish.h" - -#include "messages/MHashReaddir.h" -#include "messages/MHashReaddirReply.h" - -#include "messages/MHashDirDiscover.h" -#include "messages/MHashDirDiscoverAck.h" -#include "messages/MHashDirPrep.h" -#include "messages/MHashDirPrepAck.h" -#include "messages/MHashDir.h" -#include "messages/MHashDirAck.h" -#include "messages/MHashDirNotify.h" - -#include "messages/MUnhashDirPrep.h" -#include "messages/MUnhashDirPrepAck.h" -#include "messages/MUnhashDir.h" -#include "messages/MUnhashDirAck.h" -#include "messages/MUnhashDirNotify.h" -#include "messages/MUnhashDirNotifyAck.h" - -#include "messages/MRenameWarning.h" -#include "messages/MRenameNotify.h" -#include "messages/MRenameNotifyAck.h" -#include "messages/MRename.h" -#include "messages/MRenamePrep.h" -#include "messages/MRenameReq.h" -#include "messages/MRenameAck.h" -#include "messages/MDentryUnlink.h" - -#include "messages/MHeartbeat.h" - -#include "messages/MAnchorRequest.h" -#include "messages/MAnchorReply.h" -#include "messages/MInodeLink.h" -#include "messages/MInodeLinkAck.h" - -//#include "messages/MInodeUpdate.h" -#include "messages/MInodeExpire.h" -#include "messages/MDirExpire.h" 
-#include "messages/MCacheExpire.h" -#include "messages/MInodeFileCaps.h" - -#include "messages/MLock.h" - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug) cout << "messenger: " -#define DEBUGLVL 10 // debug level of output - - - - - - - -Message * -decode_message(msg_envelope_t& env, bufferlist& payload) -{ - // make message - Message *m = 0; - switch(env.type) { - - // -- with payload -- - - /* - case MSG_NS_CONNECT: - m = new MNSConnect(); - break; - case MSG_NS_CONNECTACK: - m = new MNSConnectAck(); - break; - case MSG_NS_REGISTER: - m = new MNSRegister(); - break; - case MSG_NS_REGISTERACK: - m = new MNSRegisterAck(); - break; - case MSG_NS_LOOKUP: - m = new MNSLookup(); - break; - case MSG_NS_LOOKUPREPLY: - m = new MNSLookupReply(); - break; - case MSG_NS_FAILURE: - m = new MNSFailure(); - break; - */ - - case MSG_MON_PAXOS: - m = new MMonPaxos; - break; - - case MSG_MON_ELECTION_PROPOSE: - m = new MMonElectionPropose; - break; - case MSG_MON_ELECTION_ACK: - m = new MMonElectionAck; - break; - case MSG_MON_ELECTION_VICTORY: - m = new MMonElectionVictory; - break; - - case MSG_PING: - m = new MPing(); - break; - case MSG_PING_ACK: - m = new MPingAck(); - break; - /* - case MSG_FAILURE: - m = new MFailure(); - break; - case MSG_FAILURE_ACK: - m = new MFailureAck(); - break; - */ - - case MSG_OSD_BOOT: - m = new MOSDBoot(); - break; - case MSG_OSD_IN: - m = new MOSDIn(); - break; - case MSG_OSD_OUT: - m = new MOSDOut(); - break; - case MSG_OSD_FAILURE: - m = new MOSDFailure(); - break; - case MSG_OSD_PING: - m = new MOSDPing(); - break; - case MSG_OSD_OP: - m = new MOSDOp(); - break; - case MSG_OSD_OPREPLY: - m = new MOSDOpReply(); - break; - - case MSG_OSD_MAP: - m = new MOSDMap(); - break; - case MSG_OSD_GETMAP: - m = new MOSDGetMap(); - break; - - case MSG_OSD_PG_NOTIFY: - m = new MOSDPGNotify(); - break; - case MSG_OSD_PG_QUERY: - m = new MOSDPGQuery(); - break; - case MSG_OSD_PG_LOG: - m = new MOSDPGLog(); - break; - case 
MSG_OSD_PG_REMOVE: - m = new MOSDPGRemove(); - break; - - // clients - case MSG_CLIENT_BOOT: - m = new MClientBoot(); - break; - case MSG_CLIENT_MOUNT: - m = new MClientMount(); - break; - case MSG_CLIENT_MOUNTACK: - m = new MClientMountAck(); - break; - case MSG_CLIENT_REQUEST: - m = new MClientRequest(); - break; - case MSG_CLIENT_REPLY: - m = new MClientReply(); - break; - case MSG_CLIENT_FILECAPS: - m = new MClientFileCaps(); - break; - - // mds - case MSG_MDS_GETMAP: - m = new MMDSGetMap(); - break; - case MSG_MDS_MAP: - m = new MMDSMap(); - break; - case MSG_MDS_BEACON: - m = new MMDSBeacon; - break; - case MSG_MDS_IMPORTMAP: - m = new MMDSImportMap; - break; - case MSG_MDS_CACHEREJOIN: - m = new MMDSCacheRejoin; - break; - case MSG_MDS_CACHEREJOINACK: - m = new MMDSCacheRejoinAck; - break; - - case MSG_MDS_DIRUPDATE: - m = new MDirUpdate(); - break; - - case MSG_MDS_DISCOVER: - m = new MDiscover(); - break; - case MSG_MDS_DISCOVERREPLY: - m = new MDiscoverReply(); - break; - - case MSG_MDS_EXPORTDIRDISCOVER: - m = new MExportDirDiscover(); - break; - case MSG_MDS_EXPORTDIRDISCOVERACK: - m = new MExportDirDiscoverAck(); - break; - - case MSG_MDS_EXPORTDIR: - m = new MExportDir(); - break; - - case MSG_MDS_EXPORTDIRFINISH: - m = new MExportDirFinish(); - break; - - case MSG_MDS_EXPORTDIRNOTIFY: - m = new MExportDirNotify(); - break; - - case MSG_MDS_EXPORTDIRNOTIFYACK: - m = new MExportDirNotifyAck(); - break; - - case MSG_MDS_EXPORTDIRPREP: - m = new MExportDirPrep(); - break; - - case MSG_MDS_EXPORTDIRPREPACK: - m = new MExportDirPrepAck(); - break; - - case MSG_MDS_EXPORTDIRWARNING: - m = new MExportDirWarning(); - break; - - - case MSG_MDS_HASHREADDIR: - m = new MHashReaddir(); - break; - case MSG_MDS_HASHREADDIRREPLY: - m = new MHashReaddirReply(); - break; - - case MSG_MDS_HASHDIRDISCOVER: - m = new MHashDirDiscover(); - break; - case MSG_MDS_HASHDIRDISCOVERACK: - m = new MHashDirDiscoverAck(); - break; - case MSG_MDS_HASHDIRPREP: - m = new 
MHashDirPrep(); - break; - case MSG_MDS_HASHDIRPREPACK: - m = new MHashDirPrepAck(); - break; - case MSG_MDS_HASHDIR: - m = new MHashDir(); - break; - case MSG_MDS_HASHDIRACK: - m = new MHashDirAck(); - break; - case MSG_MDS_HASHDIRNOTIFY: - m = new MHashDirNotify(); - break; - - case MSG_MDS_UNHASHDIRPREP: - m = new MUnhashDirPrep(); - break; - case MSG_MDS_UNHASHDIRPREPACK: - m = new MUnhashDirPrepAck(); - break; - case MSG_MDS_UNHASHDIR: - m = new MUnhashDir(); - break; - case MSG_MDS_UNHASHDIRACK: - m = new MUnhashDirAck(); - break; - case MSG_MDS_UNHASHDIRNOTIFY: - m = new MUnhashDirNotify(); - break; - case MSG_MDS_UNHASHDIRNOTIFYACK: - m = new MUnhashDirNotifyAck(); - break; - - case MSG_MDS_RENAMEWARNING: - m = new MRenameWarning(); - break; - case MSG_MDS_RENAMENOTIFY: - m = new MRenameNotify(); - break; - case MSG_MDS_RENAMENOTIFYACK: - m = new MRenameNotifyAck(); - break; - case MSG_MDS_RENAME: - m = new MRename(); - break; - case MSG_MDS_RENAMEPREP: - m = new MRenamePrep(); - break; - case MSG_MDS_RENAMEREQ: - m = new MRenameReq(); - break; - case MSG_MDS_RENAMEACK: - m = new MRenameAck(); - break; - - case MSG_MDS_DENTRYUNLINK: - m = new MDentryUnlink(); - break; - - case MSG_MDS_HEARTBEAT: - m = new MHeartbeat(); - break; - - case MSG_MDS_CACHEEXPIRE: - m = new MCacheExpire(); - break; - - case MSG_MDS_ANCHORREQUEST: - m = new MAnchorRequest(); - break; - case MSG_MDS_ANCHORREPLY: - m = new MAnchorReply(); - break; - - case MSG_MDS_INODELINK: - m = new MInodeLink(); - break; - case MSG_MDS_INODELINKACK: - m = new MInodeLinkAck(); - break; - - /* case MSG_MDS_INODEUPDATE: - m = new MInodeUpdate(); - break; - */ - - case MSG_MDS_INODEEXPIRE: - m = new MInodeExpire(); - break; - - case MSG_MDS_INODEFILECAPS: - m = new MInodeFileCaps(); - break; - - case MSG_MDS_DIREXPIRE: - m = new MDirExpire(); - break; - - case MSG_MDS_LOCK: - m = new MLock(); - break; - - - // -- simple messages without payload -- - - case MSG_CLOSE: - case MSG_NS_STARTED: - case 
MSG_NS_UNREGISTER: - case MSG_SHUTDOWN: - case MSG_MDS_SHUTDOWNSTART: - case MSG_MDS_SHUTDOWNFINISH: - case MSG_CLIENT_UNMOUNT: - case MSG_OSD_MKFS_ACK: - m = new MGenericMessage(env.type); - break; - - default: - dout(1) << "can't decode unknown message type " << env.type << endl; - assert(0); - } - - // env - m->set_envelope(env); - - // decode - m->set_payload(payload); - m->decode_payload(); - - // done! - return m; -} - - diff --git a/branches/marnberg/quota/msg/Message.h b/branches/marnberg/quota/msg/Message.h deleted file mode 100644 index 80e1b9feaac28..0000000000000 --- a/branches/marnberg/quota/msg/Message.h +++ /dev/null @@ -1,320 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MESSAGE_H -#define __MESSAGE_H - -#define MSG_CLOSE 0 - -#define MSG_NS_CONNECT 1 -#define MSG_NS_CONNECTACK 2 -#define MSG_NS_REGISTER 3 -#define MSG_NS_REGISTERACK 4 -#define MSG_NS_STARTED 5 -#define MSG_NS_UNREGISTER 6 -#define MSG_NS_LOOKUP 7 -#define MSG_NS_LOOKUPREPLY 8 -#define MSG_NS_FAILURE 9 - - -#define MSG_PING 10 -#define MSG_PING_ACK 11 - -#define MSG_FAILURE 12 -#define MSG_FAILURE_ACK 13 - -#define MSG_SHUTDOWN 99999 - - - -#define MSG_MON_ELECTION_ACK 15 -#define MSG_MON_ELECTION_PROPOSE 16 -#define MSG_MON_ELECTION_VICTORY 17 - -#define MSG_MON_OSDMAP_INFO 20 -#define MSG_MON_OSDMAP_LEASE 21 -#define MSG_MON_OSDMAP_LEASE_ACK 22 -#define MSG_MON_OSDMAP_UPDATE_PREPARE 23 -#define MSG_MON_OSDMAP_UPDATE_ACK 24 -#define MSG_MON_OSDMAP_UPDATE_COMMIT 25 - -#define MSG_MON_PAXOS 30 - -#define MSG_OSD_OP 40 // delete, etc. -#define MSG_OSD_OPREPLY 41 // delete, etc. 
-#define MSG_OSD_PING 42 - -#define MSG_OSD_GETMAP 43 -#define MSG_OSD_MAP 44 - -#define MSG_OSD_BOOT 45 -#define MSG_OSD_MKFS_ACK 46 - -#define MSG_OSD_FAILURE 47 - -#define MSG_OSD_IN 48 -#define MSG_OSD_OUT 49 - - - -#define MSG_OSD_PG_NOTIFY 50 -#define MSG_OSD_PG_QUERY 51 -#define MSG_OSD_PG_SUMMARY 52 -#define MSG_OSD_PG_LOG 53 -#define MSG_OSD_PG_REMOVE 54 - -#define MSG_CLIENT_REQUEST 60 -#define MSG_CLIENT_REPLY 61 -//#define MSG_CLIENT_DONE 62 -#define MSG_CLIENT_FILECAPS 63 -#define MSG_CLIENT_INODEAUTHUPDATE 64 - -#define MSG_CLIENT_BOOT 70 -#define MSG_CLIENT_MOUNT 71 -#define MSG_CLIENT_MOUNTACK 72 -#define MSG_CLIENT_UNMOUNT 73 - - -// *** MDS *** - -#define MSG_MDS_GETMAP 102 -#define MSG_MDS_MAP 103 -#define MSG_MDS_HEARTBEAT 104 // for mds load balancer -#define MSG_MDS_BEACON 105 // to monitor - -#define MSG_MDS_IMPORTMAP 106 -#define MSG_MDS_CACHEREJOIN 107 -#define MSG_MDS_CACHEREJOINACK 108 - -#define MSG_MDS_DISCOVER 110 -#define MSG_MDS_DISCOVERREPLY 111 - -#define MSG_MDS_INODEGETREPLICA 112 -#define MSG_MDS_INODEGETREPLICAACK 113 - -#define MSG_MDS_INODEFILECAPS 115 - -#define MSG_MDS_INODEUPDATE 120 -#define MSG_MDS_DIRUPDATE 121 -#define MSG_MDS_INODEEXPIRE 122 -#define MSG_MDS_DIREXPIRE 123 - -#define MSG_MDS_DIREXPIREREQ 124 - -#define MSG_MDS_CACHEEXPIRE 125 - -#define MSG_MDS_ANCHORREQUEST 130 -#define MSG_MDS_ANCHORREPLY 131 - -#define MSG_MDS_INODELINK 140 -#define MSG_MDS_INODELINKACK 141 -#define MSG_MDS_INODEUNLINK 142 -#define MSG_MDS_INODEUNLINKACK 143 - -#define MSG_MDS_EXPORTDIRDISCOVER 150 -#define MSG_MDS_EXPORTDIRDISCOVERACK 151 -#define MSG_MDS_EXPORTDIRPREP 152 -#define MSG_MDS_EXPORTDIRPREPACK 153 -#define MSG_MDS_EXPORTDIRWARNING 154 -#define MSG_MDS_EXPORTDIR 155 -#define MSG_MDS_EXPORTDIRNOTIFY 156 -#define MSG_MDS_EXPORTDIRNOTIFYACK 157 -#define MSG_MDS_EXPORTDIRFINISH 158 - - -#define MSG_MDS_HASHDIRDISCOVER 160 -#define MSG_MDS_HASHDIRDISCOVERACK 161 -#define MSG_MDS_HASHDIRPREP 162 -#define 
MSG_MDS_HASHDIRPREPACK 163 -#define MSG_MDS_HASHDIR 164 -#define MSG_MDS_HASHDIRACK 165 -#define MSG_MDS_HASHDIRNOTIFY 166 - -#define MSG_MDS_HASHREADDIR 168 -#define MSG_MDS_HASHREADDIRREPLY 169 - -#define MSG_MDS_UNHASHDIRPREP 170 -#define MSG_MDS_UNHASHDIRPREPACK 171 -#define MSG_MDS_UNHASHDIR 172 -#define MSG_MDS_UNHASHDIRACK 173 -#define MSG_MDS_UNHASHDIRNOTIFY 174 -#define MSG_MDS_UNHASHDIRNOTIFYACK 175 - -#define MSG_MDS_DENTRYUNLINK 200 - -#define MSG_MDS_RENAMEWARNING 300 // sent from src to bystanders -#define MSG_MDS_RENAMENOTIFY 301 // sent from dest to bystanders -#define MSG_MDS_RENAMENOTIFYACK 302 // sent back to src -#define MSG_MDS_RENAMEACK 303 // sent from src to initiator, to xlock_finish - -#define MSG_MDS_RENAMEPREP 304 // sent from initiator to dest auth (if dir) -#define MSG_MDS_RENAMEREQ 305 // sent from initiator (or dest if dir) to src auth -#define MSG_MDS_RENAME 306 // sent from src to dest, includes inode - -#define MSG_MDS_LOCK 500 - -#define MSG_MDS_SHUTDOWNSTART 900 -#define MSG_MDS_SHUTDOWNFINISH 901 - - -#include -#include - -#include -#include -using std::list; - -#include -#include - -using __gnu_cxx::crope; - -#include "include/types.h" -#include "include/buffer.h" -#include "msg_types.h" - - - - -// ====================================================== - -// abstract Message class - - - -typedef struct { - int type; - entity_inst_t src, dst; - int source_port, dest_port; - int nchunks; -} msg_envelope_t; - -#define MSG_ENVELOPE_LEN sizeof(msg_envelope_t) - - -class Message { - private: - - protected: - msg_envelope_t env; // envelope - bufferlist payload; // payload - - friend class Messenger; -public: - - public: - Message() { - env.source_port = env.dest_port = -1; - env.nchunks = 0; - }; - Message(int t) { - env.source_port = env.dest_port = -1; - env.nchunks = 0; - env.type = t; - } - virtual ~Message() { - } - - - // for rpc-type procedural messages (pcid = procedure call id) - virtual long get_pcid() { return 0; } - 
virtual void set_pcid(long t) { assert(0); } // overload me - - void clear_payload() { payload.clear(); } - bool empty_payload() { return payload.length() == 0; } - bufferlist& get_payload() { - return payload; - } - void set_payload(bufferlist& bl) { - payload.claim(bl); - } - msg_envelope_t& get_envelope() { - return env; - } - void set_envelope(msg_envelope_t& env) { - this->env = env; - } - - - // ENVELOPE ---- - - // type - int get_type() { return env.type; } - void set_type(int t) { env.type = t; } - virtual char *get_type_name() = 0; - - // source/dest - entity_inst_t& get_dest_inst() { return env.dst; } - void set_dest_inst(entity_inst_t& inst) { env.dst = inst; } - - entity_inst_t& get_source_inst() { return env.src; } - void set_source_inst(entity_inst_t& inst) { env.src = inst; } - - entity_name_t& get_dest() { return env.dst.name; } - void set_dest(entity_name_t a, int p) { env.dst.name = a; env.dest_port = p; } - int get_dest_port() { return env.dest_port; } - void set_dest_port(int p) { env.dest_port = p; } - - entity_name_t& get_source() { return env.src.name; } - void set_source(entity_name_t a, int p) { env.src.name = a; env.source_port = p; } - int get_source_port() { return env.source_port; } - - entity_addr_t& get_source_addr() { return env.src.addr; } - void set_source_addr(const entity_addr_t &i) { env.src.addr = i; } - - // PAYLOAD ---- - void reset_payload() { - payload.clear(); - } - - // overload either the rope version (easier!) - virtual void encode_payload(crope& s) { assert(0); } - virtual void decode_payload(crope& s, int& off) { assert(0); } - - // of the bufferlist versions (faster!) - virtual void decode_payload() { - // use a crope for convenience, small messages, etc. FIXME someday. 
- crope ser; - for (list::const_iterator it = payload.buffers().begin(); - it != payload.buffers().end(); - it++) - ser.append((*it).c_str(), (*it).length()); - - int off = 0; - decode_payload(ser, off); - assert((unsigned)off == payload.length()); - } - virtual void encode_payload() { - assert(payload.length() == 0); // caller should reset payload - - // use crope for convenience, small messages. FIXME someday. - crope r; - encode_payload(r); - - // copy payload - payload.push_back( buffer::copy(r.c_str(), r.length()) ); - } - - virtual void print(ostream& out) { - out << get_type_name(); - } - -}; - -extern Message *decode_message(msg_envelope_t &env, bufferlist& bl); -inline ostream& operator<<(ostream& out, Message& m) { - m.print(out); - return out; -} - -#endif diff --git a/branches/marnberg/quota/msg/Messenger.cc b/branches/marnberg/quota/msg/Messenger.cc deleted file mode 100644 index a6133260e9b9e..0000000000000 --- a/branches/marnberg/quota/msg/Messenger.cc +++ /dev/null @@ -1,38 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include -#include "include/types.h" - -#include "Message.h" -#include "Messenger.h" -#include "messages/MGenericMessage.h" - -#include -#include -using namespace std; - - -// --------- -// incoming messages - -void Messenger::dispatch(Message *m) -{ - assert(dispatcher); - dispatcher->dispatch(m); -} - - - diff --git a/branches/marnberg/quota/msg/Messenger.h b/branches/marnberg/quota/msg/Messenger.h deleted file mode 100644 index 991e80c839112..0000000000000 --- a/branches/marnberg/quota/msg/Messenger.h +++ /dev/null @@ -1,86 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __MESSENGER_H -#define __MESSENGER_H - -#include -using namespace std; - -#include "Message.h" -#include "Dispatcher.h" -#include "common/Mutex.h" -#include "common/Cond.h" -#include "include/Context.h" - - - -class MDS; -class Timer; - -class Messenger { - private: - Dispatcher *dispatcher; - entity_name_t _myname; - - public: - Messenger(entity_name_t w) : dispatcher(0), _myname(w) { } - virtual ~Messenger() { } - - // accessors - entity_name_t get_myname() { return _myname; } - void _set_myname(entity_name_t m) { _myname = m; } - - virtual void reset_myname(entity_name_t m) = 0; - - virtual const entity_addr_t &get_myaddr() = 0; - - entity_inst_t get_myinst() { return entity_inst_t(_myname, get_myaddr()); } - - // hrmpf. 
- virtual int get_dispatch_queue_len() { return 0; }; - - // setup - void set_dispatcher(Dispatcher *d) { dispatcher = d; ready(); } - Dispatcher *get_dispatcher() { return dispatcher; } - virtual void ready() { } - bool is_ready() { return dispatcher != 0; } - - // dispatch incoming messages - virtual void dispatch(Message *m) { - assert(dispatcher); - dispatcher->dispatch(m); - } - - // shutdown - virtual int shutdown() = 0; - - // send message - virtual void prepare_dest(const entity_addr_t& addr) {} - virtual int send_message(Message *m, entity_inst_t dest, - int port=0, int fromport=0) = 0; - - // make a procedure call - //virtual Message* sendrecv(Message *m, msg_name_t dest, int port=0); - - virtual void mark_down(entity_addr_t a) {} - -}; - - - - - -#endif diff --git a/branches/marnberg/quota/msg/NewMessenger.cc b/branches/marnberg/quota/msg/NewMessenger.cc deleted file mode 100644 index 1455c31724c68..0000000000000 --- a/branches/marnberg/quota/msg/NewMessenger.cc +++ /dev/null @@ -1,1714 +0,0 @@ - -#include "NewMessenger.h" - -#include -#include -#include -#include - -#include "config.h" - -#include "messages/MGenericMessage.h" -#include "messages/MNSConnect.h" -#include "messages/MNSConnectAck.h" -#include "messages/MNSRegister.h" -#include "messages/MNSRegisterAck.h" -#include "messages/MNSLookup.h" -#include "messages/MNSLookupReply.h" -#include "messages/MNSFailure.h" - -//#include "messages/MFailure.h" - -#include - - -#undef dout -#define dout(l) if (l<=g_conf.debug_ms) cout << g_clock.now() << " -- rank" << rank.my_rank << " " -#define derr(l) if (l<=g_conf.debug_ms) cerr << g_clock.now() << " -- rank" << rank.my_rank << " " - - - -#include "tcp.cc" - - -Rank rank; - - -/******************************************** - * Namer - */ - -Rank::Namer::Namer(EntityMessenger *msgr) : - messenger(msgr), - nrank(0), nclient(0), nmds(0), nosd(0), nmon(0) -{ - assert(rank.my_rank == 0); - nrank = g_conf.num_mon; - - // announce myself - /* - cerr << "ceph ns 
is " << rank.accepter.listen_addr << endl; - cout << "export CEPH_NAMESERVER=" << rank.accepter.listen_addr << endl; - int fd = ::open(".ceph_ns", O_WRONLY|O_CREAT); - ::write(fd, (void*)&rank.accepter.listen_addr, sizeof(tcpaddr_t)); - ::fchmod(fd, 0755); - ::close(fd); - */ - - // ok - messenger->set_dispatcher(this); -} - -Rank::Namer::~Namer() -{ - //::unlink(".ceph_ns"); -} - - -void Rank::Namer::dispatch(Message *m) -{ - rank.lock.Lock(); - int type = m->get_type(); - switch (type) { - case MSG_NS_CONNECT: - handle_connect((class MNSConnect*)m); - break; - case MSG_NS_REGISTER: - handle_register((class MNSRegister*)m); - break; - case MSG_NS_STARTED: - handle_started(m); - break; - case MSG_NS_UNREGISTER: - handle_unregister(m); - break; - case MSG_NS_LOOKUP: - handle_lookup((class MNSLookup*)m); - break; - case MSG_NS_FAILURE: - handle_failure((class MNSFailure*)m); - break; - - case MSG_FAILURE_ACK: - delete m; - break; - - default: - assert(0); - } - rank.lock.Unlock(); -} - -void Rank::Namer::handle_connect(MNSConnect *m) -{ - int newrank = nrank++; - dout(2) << "namer.handle_connect from new rank" << newrank << " " << m->get_addr() << endl; - - rank.entity_map[MSG_ADDR_RANK(newrank)].addr = m->get_addr(); - rank.entity_map[MSG_ADDR_RANK(newrank)].rank = newrank; - rank.entity_unstarted.insert(MSG_ADDR_RANK(newrank)); - - messenger->send_message(new MNSConnectAck(newrank), - MSG_ADDR_RANK(newrank), rank.entity_map[MSG_ADDR_RANK(newrank)]); - delete m; -} - -void Rank::Namer::manual_insert_inst(const entity_inst_t &inst) -{ - rank.entity_map[MSG_ADDR_RANK(inst.rank)] = inst; -} - -void Rank::Namer::handle_register(MNSRegister *m) -{ - dout(10) << "namer.handle_register from rank " << m->get_rank() - << " addr " << m->get_entity() << endl; - - // pick id - entity_name_t entity = m->get_entity(); - - if (entity.is_new()) { - // make up a new address! 
- switch (entity.type()) { - case MSG_ADDR_MDS_BASE: - entity = MSG_ADDR_MDS(nmds++); - break; - - case MSG_ADDR_OSD_BASE: - entity = MSG_ADDR_OSD(nosd++); - break; - - case MSG_ADDR_CLIENT_BASE: - entity = MSG_ADDR_CLIENT(nclient++); - break; - - default: - assert(0); - } - } else { - // specific address! - } - - - // register - if (rank.entity_map.count(entity)) { - dout(1) << "namer.handle_register re-registering " << entity - << " inst " << m->get_source_inst() - << " (was " << rank.entity_map[entity] << ")" - << endl; - } else { - dout(1) << "namer.handle_register registering " << entity - << " inst " << m->get_source_inst() - << endl; - } - rank.entity_map[entity] = m->get_source_inst(); - rank.entity_unstarted.insert(entity); - - // reply w/ new id - messenger->send_message(new MNSRegisterAck(m->get_tid(), entity), - m->get_source(), rank.entity_map[entity]); - - delete m; -} - -void Rank::Namer::handle_started(Message *m) -{ - entity_name_t who = m->get_source(); - dout(10) << "namer.handle_started from entity " << who << endl; - - assert(rank.entity_unstarted.count(who)); - rank.entity_unstarted.erase(who); - - // anybody waiting? - if (waiting.count(who)) { - list ls; - ls.swap(waiting[who]); - waiting.erase(who); - - dout(10) << "doing waiters on " << who << endl; - for (list::iterator it = ls.begin(); - it != ls.end(); - it++) - dispatch(*it); - } - -} - -void Rank::Namer::handle_unregister(Message *m) -{ - entity_name_t who = m->get_source(); - dout(1) << "namer.handle_unregister entity " << who << endl; - - rank.show_dir(); - - assert(rank.entity_map.count(who)); - rank.entity_map.erase(who); - - rank.show_dir(); - - // shut myself down? kick watcher. - if (rank.entity_map.size() == 2) { - dout(10) << "namer.handle_unregister stopping namer" << endl; - rank.lock.Unlock(); - messenger->shutdown(); - delete messenger; - rank.lock.Lock(); - } - - delete m; -} - - -void Rank::Namer::handle_lookup(MNSLookup *m) -{ - // have it? 
- if (rank.entity_map.count(m->get_entity()) == 0) { - dout(10) << "namer " << m->get_source() << " lookup '" << m->get_entity() << "' -> dne" << endl; - waiting[m->get_entity()].push_back(m); - return; - } - - if (rank.entity_unstarted.count(m->get_entity())) { - dout(10) << "namer " << m->get_source() << " lookup '" << m->get_entity() << "' -> unstarted" << endl; - waiting[m->get_entity()].push_back(m); - return; - } - - // look it up! - MNSLookupReply *reply = new MNSLookupReply(m); - - reply->entity_map[m->get_entity()] = rank.entity_map[m->get_entity()]; - - dout(10) << "namer " << m->get_source() - << " lookup '" << m->get_entity() - << "' -> " << rank.entity_map[m->get_entity()] << endl; - - messenger->send_message(reply, m->get_source(), m->get_source_inst()); - delete m; -} - -void Rank::Namer::handle_failure(MNSFailure *m) -{ - dout(10) << "namer.handle_failure inst " << m->get_inst() - << endl; - - // search for entities on this instance - list rm; - for (hash_map::iterator i = rank.entity_map.begin(); - i != rank.entity_map.end(); - i++) { - if (i->second != m->get_inst()) continue; - rm.push_back(i->first); - } - for (list::iterator i = rm.begin(); - i != rm.end(); - i++) { - dout(10) << "namer.handle_failure inst " << m->get_inst() - << ", removing " << *i << endl; - - rank.entity_map.erase(*i); - rank.entity_unstarted.erase(*i); - - /* - if ((*i).is_osd()) { - // tell the monitor - messenger->send_message(new MFailure(*i, m->get_inst()), MSG_ADDR_MON(0)); - } - */ - } - - delete m; -} - - - -/******************************************** - * Accepter - */ - -int Rank::Accepter::start() -{ - // bind to a socket - dout(10) << "accepter.start binding to listen " << endl; - - /* socket creation */ - listen_sd = socket(AF_INET,SOCK_STREAM,0); - assert(listen_sd > 0); - - /* bind to port */ - memset((char*)&listen_addr, 0, sizeof(listen_addr)); - listen_addr.sin_family = AF_INET; - listen_addr.sin_addr.s_addr = htonl(INADDR_ANY); - listen_addr.sin_port = 0; 
- - int rc = bind(listen_sd, (struct sockaddr *) &listen_addr, sizeof(listen_addr)); - assert(rc >= 0); - - socklen_t llen = sizeof(listen_addr); - getsockname(listen_sd, (sockaddr*)&listen_addr, &llen); - - int myport = listen_addr.sin_port; - - // listen! - rc = ::listen(listen_sd, 1000); - assert(rc >= 0); - - //dout(10) << "accepter.start listening on " << myport << endl; - - // my address is... - char host[100]; - bzero(host, 100); - gethostname(host, 100); - //dout(10) << "accepter.start my hostname is " << host << endl; - - struct hostent *myhostname = gethostbyname( host ); - - struct sockaddr_in my_addr; - memset(&my_addr, 0, sizeof(my_addr)); - - my_addr.sin_family = myhostname->h_addrtype; - memcpy((char *) &my_addr.sin_addr.s_addr, - myhostname->h_addr_list[0], - myhostname->h_length); - my_addr.sin_port = myport; - - listen_addr = my_addr; - - dout(10) << "accepter.start listen addr is " << listen_addr << endl; - - // start thread - create(); - - return 0; -} - -void *Rank::Accepter::entry() -{ - dout(10) << "accepter starting" << endl; - - while (!done) { - // accept - struct sockaddr_in addr; - socklen_t slen = sizeof(addr); - int sd = ::accept(listen_sd, (sockaddr*)&addr, &slen); - if (sd > 0) { - dout(10) << "accepted incoming on sd " << sd << endl; - - Receiver *r = new Receiver(sd); - r->create(); - - rank.lock.Lock(); - rank.receivers.insert(r); - rank.lock.Unlock(); - } else { - dout(10) << "no incoming connection?" 
<< endl; - break; - } - } - - return 0; -} - - -/************************************** - * Receiver - */ - -void *Rank::Receiver::entry() -{ - while (!done) { - Message *m = read_message(); - if (!m) { - ::close(sd); - break; - } - - dout(10) << "receiver.entry got message for " << m->get_dest() << endl; - - EntityMessenger *entity = 0; - - rank.lock.Lock(); - { - if (rank.down.count(m->get_dest())) { - dout(0) << "receiver.entry dest " << m->get_dest() << " down, dropping " << *m << endl; - delete m; - - if (rank.looking_up.count(m->get_dest()) == 0) - rank.lookup(m->get_dest()); - } - else if (rank.entity_map.count(m->get_source()) && - rank.entity_map[m->get_source()] > m->get_source_inst()) { - derr(0) << "receiver.entry source " << m->get_source() - << " inst " << m->get_source_inst() - << " < " << rank.entity_map[m->get_source()] - << ", dropping " << *m << endl; - delete m; - } - else { - if (rank.entity_map.count(m->get_source()) && - rank.entity_map[m->get_source()] > m->get_source_inst()) { - derr(0) << "receiver.entry source " << m->get_source() - << " inst " << m->get_source_inst() - << " > " << rank.entity_map[m->get_source()] - << ", WATCH OUT " << *m << endl; - rank.entity_map[m->get_source()] = m->get_source_inst(); - } - - if (m->get_dest().type() == MSG_ADDR_RANK_BASE) { - // ours. 
- rank.dispatch(m); - } else { - if (g_conf.ms_single_dispatch) { - // submit to single dispatch queue - rank._submit_single_dispatch(m); - } else { - if (rank.local.count(m->get_dest())) { - // find entity - entity = rank.local[m->get_dest()]; - } else { - derr(0) << "got message " << *m << " for " << m->get_dest() << ", which isn't local" << endl; - rank.waiting_for_lookup[m->get_dest()].push_back(m); - } - } - } - } - } - rank.lock.Unlock(); - - if (entity) - entity->queue_message(m); // queue - } - - // add to reap queue - rank.lock.Lock(); - rank.receiver_reap_queue.push_back(this); - rank.wait_cond.Signal(); - rank.lock.Unlock(); - - return 0; -} - -Message *Rank::Receiver::read_message() -{ - // envelope - //dout(10) << "receiver.read_message from sd " << sd << endl; - - msg_envelope_t env; - if (!tcp_read( sd, (char*)&env, sizeof(env) )) - return 0; - - if (env.type == 0) { - dout(10) << "receiver got dummy env, bailing" << endl; - return 0; - } - - dout(20) << "receiver got envelope type=" << env.type - << " src " << env.source << " dst " << env.dest - << " nchunks=" << env.nchunks - << endl; - - // payload - bufferlist blist; - for (int i=0; iget_source() << endl; - - return m; -} - - -/************************************** - * Sender - */ - -int Rank::Sender::connect() -{ - dout(10) << "sender(" << inst << ").connect" << endl; - - // create socket? - sd = socket(AF_INET,SOCK_STREAM,0); - assert(sd > 0); - - // bind any port - struct sockaddr_in myAddr; - myAddr.sin_family = AF_INET; - myAddr.sin_addr.s_addr = htonl(INADDR_ANY); - myAddr.sin_port = htons( 0 ); - - int rc = bind(sd, (struct sockaddr *) &myAddr, sizeof(myAddr)); - assert(rc>=0); - - // connect! - int r = ::connect(sd, (sockaddr*)&inst.addr, sizeof(myAddr)); - if (r < 0) return r; - - // identify myself - // FIXME - - return 0; -} - - -void Rank::Sender::finish() -{ - dout(10) << "sender(" << inst << ").finish" << endl; - - // make sure i get reaped. 
- rank.lock.Lock(); - rank.sender_reap_queue.push_back(this); - rank.wait_cond.Signal(); - rank.lock.Unlock(); -} - -void Rank::Sender::fail_and_requeue(list& out) -{ - dout(10) << "sender(" << inst << ").fail" << endl;// and requeue" << endl; - - // tell namer - if (!rank.messenger) { - derr(0) << "FATAL error: can't send failure to namer0, not connected yet" << endl; - assert(0); - } - - // old and unnecessary? - if (0) - rank.messenger->send_message(new MNSFailure(inst), - MSG_ADDR_NAMER(0)); - - - // FIXME: possible race before i reclaim lock here? - - Dispatcher *dis = 0; - entity_name_t dis_dest; - - list lost; - - // requeue my messages - rank.lock.Lock(); - lock.Lock(); - { - // include out at front of queue - q.splice(q.begin(), out); - dout(10) << "sender(" << inst << ").fail " - << q.size() << " messages" << endl; - - if (0) { - lost.swap(q); - } else { - - while (!q.empty()) { - // don't keep reconnecting.. - if (rank.entity_map.count(q.front()->get_dest()) && - rank.entity_map[q.front()->get_dest()] == inst) - rank.down.insert(q.front()->get_dest()); - //rank.entity_map.erase(q.front()->get_dest()); - - if (!dis && - rank.local.count(q.front()->get_source())) { - dis_dest = q.front()->get_dest(); - dis = rank.local[q.front()->get_source()]->get_dispatcher(); - } - - if (g_conf.ms_requeue_on_sender_fail) - rank.submit_message( q.front() ); - else - lost.push_back( q.front() ); - q.pop_front(); - } - } - - // deactivate myself - if (rank.rank_sender.count(inst.rank) && - rank.rank_sender[inst.rank] == this) - rank.rank_sender.erase(inst.rank); - - // stop sender loop - done = true; - } - lock.Unlock(); - - - // send special failure msg? 
- if (dis) { - for (list::iterator p = lost.begin(); - p != lost.end(); - p++) - dis->ms_handle_failure(*p, dis_dest, inst); - } - - rank.lock.Unlock(); -} - -void *Rank::Sender::entry() -{ - // connect - if (sd == 0) { - int rc = connect(); - if (rc < 0) { - list out; - derr(0) << "error connecting to " << inst << endl; - fail_and_requeue(out); - finish(); - return 0; - } - } - - lock.Lock(); - while (!q.empty() || !done) { - - if (!q.empty()) { - dout(20) << "sender(" << inst << ") grabbing message(s)" << endl; - - // grab outgoing list - list out; - out.swap(q); - - // drop lock while i send these - lock.Unlock(); - - while (!out.empty()) { - Message *m = out.front(); - out.pop_front(); - - dout(20) << "sender(" << inst << ") sending " << *m << endl; - - // stamp. - m->set_source_inst(rank.my_inst); - - // marshall - if (m->empty_payload()) - m->encode_payload(); - - if (write_message(m) < 0) { - // failed! - derr(0) << "error sending to " << m->get_dest() << " on " << inst << endl; - out.push_front(m); - fail_and_requeue(out); - break; - } - } - - lock.Lock(); - continue; - } - - // wait - dout(20) << "sender(" << inst << ") sleeping" << endl; - cond.Wait(lock); - } - lock.Unlock(); - - finish(); - return 0; -} - - -int Rank::Sender::write_message(Message *m) -{ - // get envelope, buffers - msg_envelope_t *env = &m->get_envelope(); - bufferlist blist; - blist.claim( m->get_payload() ); - -#ifdef TCP_KEEP_CHUNKS - env->nchunks = blist.buffers().size(); -#else - env->nchunks = 1; -#endif - - dout(20)// << g_clock.now() - << " sending " << m << " " << *m - << " to " << m->get_dest() - << endl; - - // send envelope - int r = tcp_write( sd, (char*)env, sizeof(*env) ); - if (r < 0) { - derr(20) << "error sending envelope for " << *m - << " to " << m->get_dest() << endl; - return -1; - } - - // payload -#ifdef TCP_KEEP_CHUNKS - // send chunk-wise - int i = 0; - for (list::iterator it = blist.buffers().begin(); - it != blist.buffers().end(); - it++) { - dout(10) << 
"tcp_sending frag " << i << " len " << (*it).length() << endl; - int size = (*it).length(); - r = tcp_write( sd, (char*)&size, sizeof(size) ); - if (r < 0) { - derr(20) << "error sending chunk len for " << *m << " to " << m->get_dest() << endl; - return -1; - } - r = tcp_write( sd, (*it).c_str(), size ); - if (r < 0) { - derr(20) << "error sending data chunk for " << *m << " to " << m->get_dest() << endl; - return -1; - } - i++; - } -#else - // one big chunk - int size = blist.length(); - r = tcp_write( sd, (char*)&size, sizeof(size) ); - if (r < 0) { - derr(20) << "error sending data len for " << *m << " to " << m->get_dest() << endl; - return -1; - } - for (list::iterator it = blist.buffers().begin(); - it != blist.buffers().end(); - it++) { - r = tcp_write( sd, (*it).c_str(), (*it).length() ); - if (r < 0) { - derr(20) << "error sending data megachunk for " << *m << " to " << m->get_dest() << " : len " << (*it).length() << endl; - return -1; - } - } -#endif - - // delete message - delete m; - return 0; -} - - - -/******************************************** - * Rank - */ - -Rank::Rank(int r) : - single_dispatcher(this), - my_rank(r), - namer(0) { -} -Rank::~Rank() -{ - //FIXME - if (namer) delete namer; -} - - -void Rank::_submit_single_dispatch(Message *m) -{ - assert(lock.is_locked()); - - if (local.count(m->get_dest()) && - local[m->get_dest()]->is_ready()) { - rank.single_dispatch_queue.push_back(m); - rank.single_dispatch_cond.Signal(); - } else { - waiting_for_ready[m->get_dest()].push_back(m); - } -} - - -void Rank::single_dispatcher_entry() -{ - lock.Lock(); - while (!single_dispatch_stop || !single_dispatch_queue.empty()) { - if (!single_dispatch_queue.empty()) { - list ls; - ls.swap(single_dispatch_queue); - - lock.Unlock(); - { - while (!ls.empty()) { - Message *m = ls.front(); - ls.pop_front(); - - dout(1) //<< g_clock.now() - << "---- " - << m->get_source() << ':' << m->get_source_port() - << " to " << m->get_dest() << ':' << m->get_dest_port() - << 
" ---- " << m->get_type_name() - << " ---- " << m - << endl; - - if (m->get_dest().type() == MSG_ADDR_RANK_BASE) - rank.dispatch(m); - else { - assert(local.count(m->get_dest())); - local[m->get_dest()]->dispatch(m); - } - } - } - lock.Lock(); - continue; - } - single_dispatch_cond.Wait(lock); - } - lock.Unlock(); -} - - -/* - * note: assumes lock is held - */ -void Rank::reaper() -{ - assert(lock.is_locked()); - - while (!receiver_reap_queue.empty()) { - Receiver *r = receiver_reap_queue.front(); - receiver_reap_queue.pop_front(); - //dout(10) << "reaper reaping receiver sd " << r->sd << endl; - receivers.erase(r); - r->join(); - dout(10) << "reaper reaped receiver sd " << r->sd << endl; - delete r; - } - - while (!sender_reap_queue.empty()) { - Sender *s = sender_reap_queue.front(); - sender_reap_queue.pop_front(); - //dout(10) << "reaper reaping sender rank " << s->dest_rank << " at " << s->tcpaddr << endl; - if (rank_sender.count(s->inst.rank) && - rank_sender[s->inst.rank] == s) - rank_sender.erase(s->inst.rank); - s->join(); - dout(10) << "reaper reaped sender " << s->inst << endl; - delete s; - } -} - - -int Rank::start_rank() -{ - dout(10) << "start_rank" << endl; - - // bind to a socket - if (accepter.start() < 0) - return -1; - - // start single thread dispatcher? 
- if (g_conf.ms_single_dispatch) { - single_dispatch_stop = false; - single_dispatcher.create(); - } - - lock.Lock(); - - if (my_rank < 0) { - dout(10) << "start_rank connecting to namer0" << endl; - - // connect to namer - assert(entity_map.count(MSG_ADDR_NAMER(0))); - Sender *sender = connect_rank(entity_map[MSG_ADDR_NAMER(0)]); - - // send - Message *m = new MNSConnect(accepter.listen_addr); - m->set_dest(MSG_ADDR_NAMER(0), 0); - sender->send(m); - - // wait - while (my_rank < 0) - waiting_for_rank.Wait(lock); - assert(my_rank >= 0); - - dout(10) << "start_rank got rank " << my_rank << endl; - - // create rank entity - entity_map[MSG_ADDR_RANK(my_rank)] = my_inst; - local[MSG_ADDR_RANK(my_rank)] = messenger = new EntityMessenger(MSG_ADDR_RANK(my_rank)); - messenger->set_dispatcher(this); - } else { - // my_inst - my_inst.addr = accepter.listen_addr; - my_inst.rank = my_rank; - - // create my rank - entity_name_t raddr = MSG_ADDR_RANK(my_rank); - entity_map[raddr] = my_inst; - entity_unstarted.insert(raddr); - local[raddr] = messenger = new EntityMessenger(raddr); - messenger->set_dispatcher(this); - - dout(1) << "start_rank " << my_rank << " at " << my_inst << endl; - } - - lock.Unlock(); - return 0; -} - -void Rank::start_namer() -{ - // create namer0 - entity_name_t naddr = MSG_ADDR_NAMER(0); - entity_map[naddr] = my_inst; - local[naddr] = new EntityMessenger(naddr); - namer = new Namer(local[naddr]); -} - -void Rank::set_namer(const tcpaddr_t& ns) -{ - entity_map[MSG_ADDR_NAMER(0)].addr = ns; - entity_map[MSG_ADDR_NAMER(0)].rank = 0; -} - -/* connect_rank - * NOTE: assumes rank.lock held. - */ -Rank::Sender *Rank::connect_rank(const entity_inst_t& inst) -{ - assert(rank.lock.is_locked()); - assert(inst != rank.my_inst); - - dout(10) << "connect_rank to " << inst << endl; - - // create sender - Sender *sender = new Sender(inst); - //int rc = sender->connect(); - //assert(rc >= 0); - - // start thread. - sender->create(); - - // old sender? 
- assert(rank.rank_sender.count(inst.rank) == 0); - //if (rank.rank_sender.count(r)) - //rank.rank_sender[r]->stop(); - - // ok! - rank.rank_sender[inst.rank] = sender; - return sender; -} - - - - - -void Rank::show_dir() -{ - dout(10) << "show_dir ---" << endl; - - for (hash_map::iterator i = entity_map.begin(); - i != entity_map.end(); - i++) { - if (local.count(i->first)) { - dout(10) << "show_dir entity_map " << i->first << " -> " << i->second << " local " << endl; - } else { - dout(10) << "show_dir entity_map " << i->first << " -> " << i->second << endl; - } - } -} - - -/* lookup - * NOTE: assumes directory.lock held - */ -void Rank::lookup(entity_name_t addr) -{ - dout(10) << "lookup " << addr << endl; - assert(lock.is_locked()); - - assert(looking_up.count(addr) == 0); - looking_up.insert(addr); - - MNSLookup *r = new MNSLookup(addr); - messenger->send_message(r, MSG_ADDR_DIRECTORY); -} - - - -/* register_entity - */ -Rank::EntityMessenger *Rank::register_entity(entity_name_t addr) -{ - dout(10) << "register_entity " << addr << endl; - lock.Lock(); - - // register with namer - static long reg_attempt = 0; - long id = ++reg_attempt; - - Message *reg = new MNSRegister(addr, my_rank, id); - reg->set_source(MSG_ADDR_RANK(my_rank), 0); - reg->set_source_inst(my_inst); - reg->set_dest(MSG_ADDR_DIRECTORY, 0); - - // prepare cond - Cond cond; - waiting_for_register_cond[id] = &cond; - - // send request - lock.Unlock(); - submit_message(reg); - lock.Lock(); - - // wait - while (!waiting_for_register_result.count(id)) - cond.Wait(lock); - - // grab result - addr = waiting_for_register_result[id]; - dout(10) << "register_entity got " << addr << endl; - - // clean up - waiting_for_register_cond.erase(id); - waiting_for_register_result.erase(id); - - // create messenger - EntityMessenger *msgr = new EntityMessenger(addr); - - // add to directory - entity_map[addr] = my_inst; - local[addr] = msgr; - - // was anyone waiting? 
- if (waiting_for_lookup.count(addr)) { - submit_messages(waiting_for_lookup[addr]); - waiting_for_lookup.erase(addr); - } - - lock.Unlock(); - return msgr; -} - -void Rank::unregister_entity(EntityMessenger *msgr) -{ - lock.Lock(); - dout(10) << "unregister_entity " << msgr->get_myaddr() << endl; - - // remove from local directory. - assert(local.count(msgr->get_myaddr())); - local.erase(msgr->get_myaddr()); - - if (my_rank > 0) { - assert(entity_map.count(msgr->get_myaddr())); - entity_map.erase(msgr->get_myaddr()); - } // else namer will do it. - - // tell namer. - if (msgr->get_myaddr() != MSG_ADDR_NAMER(0) && - msgr->get_myaddr() != MSG_ADDR_RANK(0)) - msgr->send_message(new MGenericMessage(MSG_NS_UNREGISTER), - MSG_ADDR_NAMER(0)); - - // kick wait()? - if (local.size() <= 2) - wait_cond.Signal(); - - lock.Unlock(); -} - - -void Rank::submit_messages(list& ls) -{ - for (list::iterator i = ls.begin(); i != ls.end(); i++) - submit_message(*i); - ls.clear(); -} - - -void Rank::prepare_dest(entity_name_t dest) -{ - lock.Lock(); - - if (entity_map.count( dest )) { - // remote, known rank addr. - entity_inst_t inst = entity_map[dest]; - - if (inst == my_inst) { - //dout(20) << "submit_message " << *m << " dest " << dest << " local but mid-register, waiting." << endl; - //waiting_for_lookup[dest].push_back(m); - } - else if (rank_sender.count( inst.rank ) && - rank_sender[inst.rank]->inst == inst) { - //dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connected." << endl; - // connected. - //sender = rank_sender[ inst.rank ]; - } else { - //dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connecting." << endl; - // not connected. - connect_rank( inst ); - } - } else { - // unknown dest rank or rank addr. 
- if (looking_up.count(dest) == 0) { - //dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, looking up" << endl; - lookup(dest); - } else { - //dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, already looking up" << endl; - } - } - - lock.Unlock(); -} - -void Rank::submit_message(Message *m, const entity_inst_t& dest_inst) -{ - const entity_name_t dest = m->get_dest(); - - // lookup - EntityMessenger *entity = 0; - Sender *sender = 0; - - lock.Lock(); - { - // local? - if (dest_inst.rank == my_inst.rank) { - if (local.count(dest)) { - // local - dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl; - if (g_conf.ms_single_dispatch) { - _submit_single_dispatch(m); - } else { - entity = local[dest]; - } - } else { - // mid-register - dout(20) << "submit_message " << *m << " dest " << dest << " local but mid-register, waiting." << endl; - assert(0); - waiting_for_lookup[dest].push_back(m); - } - } - else { - // remote. - if (rank_sender.count( dest_inst.rank )) { - //&& - //rank_sender[dest_inst.rank]->inst == dest_inst) { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", connected." << endl; - // connected. - sender = rank_sender[ dest_inst.rank ]; - } else { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", connecting." << endl; - // not connected. - sender = connect_rank( dest_inst ); - } - } - } - lock.Unlock(); - - // do it - if (entity) { - // local! - dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl; - entity->queue_message(m); - } - else if (sender) { - // remote! 
- dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl; - sender->send(m); - } -} - - -void Rank::submit_message(Message *m) -{ - const entity_name_t dest = m->get_dest(); - - // lookup - EntityMessenger *entity = 0; - Sender *sender = 0; - - lock.Lock(); - { - if (down.count(dest)) { - // black hole. - dout(0) << "submit_message " << *m << " dest " << dest << " down, dropping" << endl; - delete m; - - if (looking_up.count(dest) == 0) - lookup(dest); - - } else if (local.count(dest)) { - dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl; - - // local - if (g_conf.ms_single_dispatch) { - _submit_single_dispatch(m); - } else { - entity = local[dest]; - } - } else if (entity_map.count( dest )) { - // remote, known rank addr. - entity_inst_t inst = entity_map[dest]; - - if (inst == my_inst) { - dout(20) << "submit_message " << *m << " dest " << dest << " local but mid-register, waiting." << endl; - waiting_for_lookup[dest].push_back(m); - } - else if (rank_sender.count( inst.rank ) && - rank_sender[inst.rank]->inst == inst) { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connected." << endl; - // connected. - sender = rank_sender[ inst.rank ]; - } else { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connecting." << endl; - // not connected. - sender = connect_rank( inst ); - } - } else { - // unknown dest rank or rank addr. - if (looking_up.count(dest) == 0) { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, looking up" << endl; - lookup(dest); - } else { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, already looking up" << endl; - } - waiting_for_lookup[dest].push_back(m); - } - } - lock.Unlock(); - - // do it - if (entity) { - // local! 
- dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl; - entity->queue_message(m); - } - else if (sender) { - // remote! - dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl; - sender->send(m); - } -} - - - - -void Rank::dispatch(Message *m) -{ - lock.Lock(); - - dout(10) << "dispatching " << *m << endl; - - switch (m->get_type()) { - case MSG_NS_CONNECTACK: - handle_connect_ack((MNSConnectAck*)m); - break; - - case MSG_NS_REGISTERACK: - handle_register_ack((MNSRegisterAck*)m); - break; - - case MSG_NS_LOOKUPREPLY: - handle_lookup_reply((MNSLookupReply*)m); - break; - - default: - assert(0); - } - - lock.Unlock(); -} - -void Rank::handle_connect_ack(MNSConnectAck *m) -{ - dout(10) << "handle_connect_ack, my rank is " << m->get_rank() << endl; - my_rank = m->get_rank(); - - my_inst.addr = accepter.listen_addr; - my_inst.rank = my_rank; - - waiting_for_rank.SignalAll(); - delete m; - - // logger! - /*dout(10) << "logger" << endl; - char names[100]; - sprintf(names, "rank%d", my_rank); - string name = names; - - if (g_conf.tcp_log) { - logger = new Logger(name, (LogType*)&rank_logtype); - rank_logtype.add_set("num"); - rank_logtype.add_inc("in"); - rank_logtype.add_inc("inb"); - rank_logtype.add_inc("dis"); - rank_logtype.add_set("inq"); - rank_logtype.add_set("inqb"); - rank_logtype.add_set("outq"); - rank_logtype.add_set("outqb"); - } - */ -} - - -void Rank::handle_register_ack(MNSRegisterAck *m) -{ - dout(10) << "handle_register_ack " << m->get_entity() << endl; - - long tid = m->get_tid(); - waiting_for_register_result[tid] = m->get_entity(); - waiting_for_register_cond[tid]->Signal(); - delete m; -} - -void Rank::handle_lookup_reply(MNSLookupReply *m) -{ - list waiting; - dout(10) << "got lookup reply" << endl; - - for (map::iterator it = m->entity_map.begin(); - it != m->entity_map.end(); - it++) { - dout(10) << "lookup got " << it->first << " at " << it->second << endl; - entity_name_t addr = 
it->first; - entity_inst_t inst = it->second; - - if (down.count(addr)) { - // ignore - dout(10) << "ignoring lookup results for " << addr << ", who is down" << endl; - //assert(entity_map.count(addr) == 0); - continue; - } - - if (entity_map.count(addr) && - entity_map[addr] > inst) { - dout(10) << "ignoring lookup results for " << addr << ", " \ - << entity_map[addr] << " > " << inst << endl; - continue; - } - - // update map. - entity_map[addr] = inst; - - if (inst.rank == my_rank) { - // local - dout(10) << "delivering lookup results locally" << endl; - if (local.count(addr)) { - if (g_conf.ms_single_dispatch) { - single_dispatch_queue.splice(single_dispatch_queue.end(), - waiting_for_lookup[addr]); - } else { - local[addr]->queue_messages(waiting_for_lookup[addr]); - } - waiting_for_lookup.erase(addr); - } else - lookup(addr); // try again! - - } else { - // remote - if (rank_sender.count(inst.rank) == 0) - connect_rank(inst); - else if (rank_sender[inst.rank]->inst != inst) { - dout(0) << "lookup got rank addr change, WATCH OUT" << endl; - // FIXME BUG possible message loss weirdness? - rank_sender[inst.rank]->stop(); - rank_sender.erase(inst.rank); - connect_rank(inst); - } - - // take waiters - Sender *sender = rank_sender[inst.rank]; - assert(sender); - - if (waiting_for_lookup.count(addr)) { - sender->send(waiting_for_lookup[addr]); - waiting_for_lookup.erase(addr); - } - } - } - - delete m; -} - - -void Rank::wait() -{ - lock.Lock(); - while (1) { - // reap dead senders, receivers. - reaper(); - - if (local.size() == 0) { - dout(10) << "wait: everything stopped" << endl; - break; // everything stopped. - } - - if (local.size() == 1 && - !messenger->is_stopped()) { - dout(10) << "wait: stopping rank" << endl; - lock.Unlock(); - messenger->shutdown(); - delete messenger; - lock.Lock(); - continue; - } - - wait_cond.Wait(lock); - } - lock.Unlock(); - - // done! clean up. 
- - // stop dispatch thread - if (g_conf.ms_single_dispatch) { - dout(10) << "wait: stopping dispatch thread" << endl; - lock.Lock(); - single_dispatch_stop = true; - single_dispatch_cond.Signal(); - lock.Unlock(); - single_dispatcher.join(); - } - - // reap senders and receivers - lock.Lock(); - { - dout(10) << "wait: stopping senders" << endl; - for (hash_map::iterator i = rank_sender.begin(); - i != rank_sender.end(); - i++) - i->second->stop(); - while (!rank_sender.empty()) { - wait_cond.Wait(lock); - reaper(); - } - - if (0) { // stop() no worky on receivers! we leak, but who cares. - dout(10) << "wait: stopping receivers" << endl; - for (set::iterator i = receivers.begin(); - i != receivers.end(); - i++) - (*i)->stop(); - while (!receivers.empty()) { - wait_cond.Wait(lock); - reaper(); - } - } - - } - lock.Unlock(); - - dout(10) << "wait: done." << endl; -} - - - -int Rank::find_ns_addr(tcpaddr_t &nsa) -{ - // file? - int fd = ::open(".ceph_ns",O_RDONLY); - if (fd > 0) { - ::read(fd, (void*)&nsa, sizeof(nsa)); - ::close(fd); - cout << "ceph ns is " << nsa << endl; - return 0; - } - - // env var? 
- char *nsaddr = getenv("CEPH_NAMESERVER");////envz_entry(*envp, e_len, "CEPH_NAMESERVER"); - if (nsaddr) { - while (nsaddr[0] != '=') nsaddr++; - nsaddr++; - - if (tcp_hostlookup(nsaddr, nsa) < 0) { - cout << "can't resolve " << nsaddr << endl; - return -1; - } - - cout << "ceph ns is " << nsa << endl; - return 0; - } - - cerr << "i can't find ceph ns addr in .ceph_ns or CEPH_NAMESERVER" << endl; - return -1; -} - - - -/********************************** - * EntityMessenger - */ - -Rank::EntityMessenger::EntityMessenger(entity_name_t myaddr) : - Messenger(myaddr), - stop(false), - dispatch_thread(this) -{ -} -Rank::EntityMessenger::~EntityMessenger() -{ -} - -void Rank::EntityMessenger::dispatch_entry() -{ - lock.Lock(); - while (!stop) { - if (!dispatch_queue.empty()) { - list ls; - ls.swap(dispatch_queue); - - lock.Unlock(); - { - // deliver - while (!ls.empty()) { - Message *m = ls.front(); - ls.pop_front(); - dout(1) //<< g_clock.now() - << "---- " - << m->get_source() << ':' << m->get_source_port() - << " to " << m->get_dest() << ':' << m->get_dest_port() - << " ---- " << m->get_type_name() - << " ---- " << m->get_source_inst() - << " ---- " << m - << endl; - dispatch(m); - } - } - lock.Lock(); - continue; - } - cond.Wait(lock); - } - lock.Unlock(); -} - -void Rank::EntityMessenger::ready() -{ - dout(10) << "ready " << get_myaddr() << endl; - - if (g_conf.ms_single_dispatch) { - rank.lock.Lock(); - if (rank.waiting_for_ready.count(get_myaddr())) { - rank.single_dispatch_queue.splice(rank.single_dispatch_queue.end(), - rank.waiting_for_ready[get_myaddr()]); - rank.waiting_for_ready.erase(get_myaddr()); - rank.single_dispatch_cond.Signal(); - } - rank.lock.Unlock(); - } else { - // start my dispatch thread - dispatch_thread.create(); - } - - // tell namer - if (get_myaddr() != MSG_ADDR_NAMER(0)) - send_message(new MGenericMessage(MSG_NS_STARTED), MSG_ADDR_NAMER(0)); -} - - -int Rank::EntityMessenger::shutdown() -{ - dout(10) << "shutdown " << get_myaddr() << 
endl; - - // deregister - rank.unregister_entity(this); - - // stop my dispatch thread - if (dispatch_thread.am_self()) { - dout(1) << "shutdown i am dispatch, setting stop flag" << endl; - stop = true; - } else { - dout(1) << "shutdown i am not dispatch, setting stop flag and joining thread." << endl; - lock.Lock(); - stop = true; - cond.Signal(); - lock.Unlock(); - dispatch_thread.join(); - } - - return 0; -} - - -void Rank::EntityMessenger::prepare_send_message(entity_name_t dest) -{ - rank.prepare_dest(dest); -} - -int Rank::EntityMessenger::send_message(Message *m, entity_name_t dest, const entity_inst_t& inst) -{ - // set envelope - m->set_source(get_myaddr(), 0); - m->set_dest(dest, 0); - - m->set_source_inst(rank.my_inst); - - dout(1) << "--> " - << m->get_source() //<< ':' << m->get_source_port() - << " to " << m->get_dest() //<< ':' << m->get_dest_port() - << " ---- " << m->get_type_name() - << " ---- " << rank.my_inst << " --> " << inst - << " ---- " << m - << endl; - - rank.submit_message(m, inst); - - return 0; -} - - -int Rank::EntityMessenger::send_message(Message *m, entity_name_t dest, int port, int fromport) -{ - // set envelope - m->set_source(get_myaddr(), fromport); - m->set_dest(dest, port); - - m->set_source_inst(rank.my_inst); - - dout(1) << "--> " - << m->get_source() //<< ':' << m->get_source_port() - << " to " << m->get_dest() //<< ':' << m->get_dest_port() - << " ---- " << m->get_type_name() - << " ---- " << rank.my_inst << " --> ?" - << " ---- " << m - << endl; - - rank.submit_message(m); - - return 0; -} - - -void Rank::EntityMessenger::mark_down(entity_name_t a, entity_inst_t& i) -{ - assert(a != get_myaddr()); - rank.mark_down(a,i); -} - -void Rank::mark_down(entity_name_t a, entity_inst_t& inst) -{ - if (my_rank == 0) return; // ugh.. 
rank0 already handles this stuff in the namer - lock.Lock(); - if (down.count(a) == 0) { - if (entity_map.count(a) && - entity_map[a] > inst) { - dout(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl; - derr(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl; - // do nothing! - } else { - down.insert(a); - - if (entity_map.count(a) == 0) { - // don't know it - dout(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl; - derr(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl; - - waiting_for_lookup.erase(a); - looking_up.erase(a); - } else { - // know it - assert(entity_map[a] <= inst); - dout(10) << "mark_down " << a << " inst " << inst << endl; - derr(10) << "mark_down " << a << " inst " << inst << endl; - - entity_map.erase(a); - - if (rank_sender.count(inst.rank)) { - rank_sender[inst.rank]->stop(); - rank_sender.erase(inst.rank); - } - } - } - } - lock.Unlock(); -} - -void Rank::EntityMessenger::mark_up(entity_name_t a, entity_inst_t& i) -{ - assert(a != get_myaddr()); - rank.mark_up(a, i); -} - -void Rank::mark_up(entity_name_t a, entity_inst_t& i) -{ - if (my_rank == 0) return; - lock.Lock(); - { - dout(10) << "mark_up " << a << " inst " << i << endl; - derr(10) << "mark_up " << a << " inst " << i << endl; - - down.erase(a); - - assert(i.rank != my_rank); // hrm? - - if (entity_map.count(a) == 0 || - entity_map[a] < i) { - entity_map[a] = i; - connect_rank(i); - } else if (entity_map[a] == i) { - dout(10) << "mark_up " << a << " inst " << i << " ... knew it" << endl; - derr(10) << "mark_up " << a << " inst " << i << " ... 
knew it" << endl; - } else { - dout(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl; - derr(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl; - } - - //if (waiting_for_lookup.count(a)) - //lookup(a); - } - lock.Unlock(); -} - diff --git a/branches/marnberg/quota/msg/NewMessenger.h b/branches/marnberg/quota/msg/NewMessenger.h deleted file mode 100644 index 0e04315a10883..0000000000000 --- a/branches/marnberg/quota/msg/NewMessenger.h +++ /dev/null @@ -1,305 +0,0 @@ -#ifndef __NEWMESSENGER_H -#define __NEWMESSENGER_H - - -#include -#include -using namespace std; -#include -#include -using namespace __gnu_cxx; - - -#include "include/types.h" - -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/Thread.h" - -#include "Messenger.h" -#include "Message.h" -#include "tcp.h" - - - - -/* Rank - per-process - */ -class Rank : public Dispatcher { - - class EntityMessenger; - class Sender; - class Receiver; - - // namer - class Namer : public Dispatcher { - public: - EntityMessenger *messenger; // namerN - - int nrank; - int nclient, nmds, nosd, nmon; - - map > waiting; - - Namer(EntityMessenger *msgr); - ~Namer(); - - void handle_connect(class MNSConnect*); - void handle_register(class MNSRegister *m); - void handle_started(Message *m); - void handle_lookup(class MNSLookup *m); - void handle_unregister(Message *m); - void handle_failure(class MNSFailure *m); - - void dispatch(Message *m); - - void manual_insert_inst(const entity_inst_t &inst); - - }; - - // incoming - class Accepter : public Thread { - public: - bool done; - - tcpaddr_t listen_addr; - int listen_sd; - - Accepter() : done(false) {} - - void *entry(); - void stop() { - done = true; - ::close(listen_sd); - join(); - } - int start(); - } accepter; - - - class Receiver : public Thread { - public: - int sd; - bool done; - - Receiver(int _sd) : sd(_sd), done(false) {} - - void *entry(); - void stop() { - done = true; - ::close(sd); - //join(); - } 
- Message *read_message(); - }; - - - // outgoing - class Sender : public Thread { - public: - entity_inst_t inst; - bool done; - int sd; - - set entities; - list q; - - Mutex lock; - Cond cond; - - Sender(const entity_inst_t& i, int s=0) : inst(i), done(false), sd(s) {} - virtual ~Sender() {} - - void *entry(); - - int connect(); - void fail_and_requeue(list& ls); - void finish(); - - void stop() { - lock.Lock(); - done = true; - cond.Signal(); - lock.Unlock(); - } - - void send(Message *m) { - lock.Lock(); - q.push_back(m); - cond.Signal(); - lock.Unlock(); - } - void send(list& ls) { - lock.Lock(); - q.splice(q.end(), ls); - cond.Signal(); - lock.Unlock(); - } - - int write_message(Message *m); - }; - - - - // messenger interface - class EntityMessenger : public Messenger { - Mutex lock; - Cond cond; - list dispatch_queue; - bool stop; - - class DispatchThread : public Thread { - EntityMessenger *m; - public: - DispatchThread(EntityMessenger *_m) : m(_m) {} - void *entry() { - m->dispatch_entry(); - return 0; - } - } dispatch_thread; - void dispatch_entry(); - - public: - void queue_message(Message *m) { - lock.Lock(); - dispatch_queue.push_back(m); - cond.Signal(); - lock.Unlock(); - } - void queue_messages(list ls) { - lock.Lock(); - dispatch_queue.splice(dispatch_queue.end(), ls); - cond.Signal(); - lock.Unlock(); - } - - public: - EntityMessenger(entity_name_t myaddr); - ~EntityMessenger(); - - void ready(); - bool is_stopped() { return stop; } - - void wait() { - dispatch_thread.join(); - } - - virtual void callback_kick() {} - virtual int shutdown(); - virtual void prepare_send_message(entity_name_t dest); - virtual int send_message(Message *m, entity_name_t dest, int port=0, int fromport=0); - virtual int send_message(Message *m, entity_name_t dest, const entity_inst_t& inst); - - virtual void mark_down(entity_name_t a, entity_inst_t& i); - virtual void mark_up(entity_name_t a, entity_inst_t& i); - //virtual void reset(msg_addr_t a); - }; - - - class 
SingleDispatcher : public Thread { - Rank *rank; - public: - SingleDispatcher(Rank *r) : rank(r) {} - void *entry() { - rank->single_dispatcher_entry(); - return 0; - } - } single_dispatcher; - - Cond single_dispatch_cond; - bool single_dispatch_stop; - list single_dispatch_queue; - - map > waiting_for_ready; - - void single_dispatcher_entry(); - void _submit_single_dispatch(Message *m); - - - // Rank stuff - public: - Mutex lock; - Cond wait_cond; // for wait() - - // my rank - int my_rank; - Cond waiting_for_rank; - - // my instance - entity_inst_t my_inst; - - // lookup - hash_map entity_map; - hash_set entity_unstarted; - - map > waiting_for_lookup; - set looking_up; - - hash_set down; - - // register - map waiting_for_register_cond; - map waiting_for_register_result; - - // local - map local; - - // remote - hash_map rank_sender; - - set receivers; - - list sender_reap_queue; - list receiver_reap_queue; - - EntityMessenger *messenger; // rankN - Namer *namer; - - - void show_dir(); - - void lookup(entity_name_t addr); - - void dispatch(Message *m); - void handle_connect_ack(class MNSConnectAck *m); - void handle_register_ack(class MNSRegisterAck *m); - void handle_lookup_reply(class MNSLookupReply *m); - - Sender *connect_rank(const entity_inst_t& inst); - - void mark_down(entity_name_t addr, entity_inst_t& i); - void mark_up(entity_name_t addr, entity_inst_t& i); - - tcpaddr_t get_listen_addr() { return accepter.listen_addr; } - - void reaper(); - - -public: - Rank(int r=-1); - ~Rank(); - - int find_ns_addr(tcpaddr_t &tcpaddr); - - void set_namer(const tcpaddr_t& ns); - void start_namer(); - - int start_rank(); - void wait(); - - EntityMessenger *register_entity(entity_name_t addr); - void unregister_entity(EntityMessenger *ms); - - void submit_message(Message *m, const entity_inst_t& inst); - void prepare_dest(entity_name_t dest); - void submit_message(Message *m); - void submit_messages(list& ls); - - // create a new messenger - EntityMessenger 
*new_entity(entity_name_t addr); - -} ; - -extern Rank rank; - -#endif diff --git a/branches/marnberg/quota/msg/NewerMessenger.cc b/branches/marnberg/quota/msg/NewerMessenger.cc deleted file mode 100644 index c277eea4b409b..0000000000000 --- a/branches/marnberg/quota/msg/NewerMessenger.cc +++ /dev/null @@ -1,1791 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "NewerMessenger.h" - -#include -#include -#include -#include - -#include "config.h" - -#include "messages/MGenericMessage.h" -#include "messages/MNSConnect.h" -#include "messages/MNSConnectAck.h" -#include "messages/MNSRegister.h" -#include "messages/MNSRegisterAck.h" -#include "messages/MNSLookup.h" -#include "messages/MNSLookupReply.h" -#include "messages/MNSFailure.h" - -//#include "messages/MFailure.h" - -#include - - -#undef dout -#define dout(l) if (l<=g_conf.debug_ms) cout << g_clock.now() << " -- rank" << rank.my_rank << " " -#define derr(l) if (l<=g_conf.debug_ms) cerr << g_clock.now() << " -- rank" << rank.my_rank << " " - - - -#include "tcp.cc" - - -Rank rank; - - -/******************************************** - * Namer - */ - -Rank::Namer::Namer(EntityMessenger *msgr) : - messenger(msgr), - nrank(0), nclient(0), nmds(0), nosd(0), nmon(0) -{ - assert(rank.my_rank == 0); - nrank = g_conf.num_mon; - - // announce myself - /* - cerr << "ceph ns is " << rank.accepter.listen_addr << endl; - cout << "export CEPH_NAMESERVER=" << rank.accepter.listen_addr << endl; - int fd = ::open(".ceph_ns", O_WRONLY|O_CREAT); - ::write(fd, (void*)&rank.accepter.listen_addr, sizeof(tcpaddr_t)); - ::fchmod(fd, 0755); - ::close(fd); - */ - - // ok - 
messenger->set_dispatcher(this); -} - -Rank::Namer::~Namer() -{ - //::unlink(".ceph_ns"); -} - - -void Rank::Namer::dispatch(Message *m) -{ - rank.lock.Lock(); - int type = m->get_type(); - switch (type) { - case MSG_NS_CONNECT: - handle_connect((class MNSConnect*)m); - break; - case MSG_NS_REGISTER: - handle_register((class MNSRegister*)m); - break; - case MSG_NS_STARTED: - handle_started(m); - break; - case MSG_NS_UNREGISTER: - handle_unregister(m); - break; - case MSG_NS_LOOKUP: - handle_lookup((class MNSLookup*)m); - break; - case MSG_NS_FAILURE: - handle_failure((class MNSFailure*)m); - break; - - case MSG_FAILURE_ACK: - delete m; - break; - - default: - assert(0); - } - rank.lock.Unlock(); -} - -void Rank::Namer::handle_connect(MNSConnect *m) -{ - int newrank = nrank++; - dout(2) << "namer.handle_connect from new rank" << newrank << " " << m->get_addr() << endl; - - rank.entity_map[MSG_ADDR_RANK(newrank)].addr = m->get_addr(); - rank.entity_map[MSG_ADDR_RANK(newrank)].rank = newrank; - rank.entity_unstarted.insert(MSG_ADDR_RANK(newrank)); - - messenger->send_message(new MNSConnectAck(newrank), - MSG_ADDR_RANK(newrank), rank.entity_map[MSG_ADDR_RANK(newrank)]); - delete m; -} - -void Rank::Namer::manual_insert_inst(const entity_inst_t &inst) -{ - rank.entity_map[MSG_ADDR_RANK(inst.rank)] = inst; -} - -void Rank::Namer::handle_register(MNSRegister *m) -{ - dout(10) << "namer.handle_register from rank " << m->get_rank() - << " addr " << m->get_entity() << endl; - - // pick id - entity_name_t entity = m->get_entity(); - - if (entity.is_new()) { - // make up a new address! - switch (entity.type()) { - case MSG_ADDR_MDS_BASE: - entity = MSG_ADDR_MDS(nmds++); - break; - - case MSG_ADDR_OSD_BASE: - entity = MSG_ADDR_OSD(nosd++); - break; - - case MSG_ADDR_CLIENT_BASE: - entity = MSG_ADDR_CLIENT(nclient++); - break; - - default: - assert(0); - } - } else { - // specific address! 
- } - - - // register - if (rank.entity_map.count(entity)) { - dout(1) << "namer.handle_register re-registering " << entity - << " inst " << m->get_source_inst() - << " (was " << rank.entity_map[entity] << ")" - << endl; - } else { - dout(1) << "namer.handle_register registering " << entity - << " inst " << m->get_source_inst() - << endl; - } - rank.entity_map[entity] = m->get_source_inst(); - rank.entity_unstarted.insert(entity); - - // reply w/ new id - messenger->send_message(new MNSRegisterAck(m->get_tid(), entity), - m->get_source(), rank.entity_map[entity]); - - delete m; -} - -void Rank::Namer::handle_started(Message *m) -{ - entity_name_t who = m->get_source(); - dout(10) << "namer.handle_started from entity " << who << endl; - - assert(rank.entity_unstarted.count(who)); - rank.entity_unstarted.erase(who); - - // anybody waiting? - if (waiting.count(who)) { - list ls; - ls.swap(waiting[who]); - waiting.erase(who); - - dout(10) << "doing waiters on " << who << endl; - for (list::iterator it = ls.begin(); - it != ls.end(); - it++) - dispatch(*it); - } - -} - -void Rank::Namer::handle_unregister(Message *m) -{ - entity_name_t who = m->get_source(); - dout(1) << "namer.handle_unregister entity " << who << endl; - - rank.show_dir(); - - assert(rank.entity_map.count(who)); - rank.entity_map.erase(who); - - rank.show_dir(); - - // shut myself down? kick watcher. - if (rank.entity_map.size() == 2) { - dout(10) << "namer.handle_unregister stopping namer" << endl; - rank.lock.Unlock(); - messenger->shutdown(); - delete messenger; - rank.lock.Lock(); - } - - delete m; -} - - -void Rank::Namer::handle_lookup(MNSLookup *m) -{ - // have it? 
- if (rank.entity_map.count(m->get_entity()) == 0) { - dout(10) << "namer " << m->get_source() << " lookup '" << m->get_entity() << "' -> dne" << endl; - waiting[m->get_entity()].push_back(m); - return; - } - - if (rank.entity_unstarted.count(m->get_entity())) { - dout(10) << "namer " << m->get_source() << " lookup '" << m->get_entity() << "' -> unstarted" << endl; - waiting[m->get_entity()].push_back(m); - return; - } - - // look it up! - MNSLookupReply *reply = new MNSLookupReply(m); - - reply->entity_map[m->get_entity()] = rank.entity_map[m->get_entity()]; - - dout(10) << "namer " << m->get_source() - << " lookup '" << m->get_entity() - << "' -> " << rank.entity_map[m->get_entity()] << endl; - - messenger->send_message(reply, m->get_source(), m->get_source_inst()); - delete m; -} - -void Rank::Namer::handle_failure(MNSFailure *m) -{ - dout(10) << "namer.handle_failure inst " << m->get_inst() - << endl; - - // search for entities on this instance - list rm; - for (hash_map::iterator i = rank.entity_map.begin(); - i != rank.entity_map.end(); - i++) { - if (i->second != m->get_inst()) continue; - rm.push_back(i->first); - } - for (list::iterator i = rm.begin(); - i != rm.end(); - i++) { - dout(10) << "namer.handle_failure inst " << m->get_inst() - << ", removing " << *i << endl; - - rank.entity_map.erase(*i); - rank.entity_unstarted.erase(*i); - - /* - if ((*i).is_osd()) { - // tell the monitor - messenger->send_message(new MFailure(*i, m->get_inst()), MSG_ADDR_MON(0)); - } - */ - } - - delete m; -} - - - -/******************************************** - * Accepter - */ - -int Rank::Accepter::start() -{ - // bind to a socket - dout(10) << "accepter.start binding to listen " << endl; - - /* socket creation */ - listen_sd = socket(AF_INET,SOCK_STREAM,0); - assert(listen_sd > 0); - - /* bind to port */ - memset((char*)&listen_addr, 0, sizeof(listen_addr)); - listen_addr.sin_family = AF_INET; - listen_addr.sin_addr.s_addr = htonl(INADDR_ANY); - listen_addr.sin_port = 0; 
- - int rc = bind(listen_sd, (struct sockaddr *) &listen_addr, sizeof(listen_addr)); - assert(rc >= 0); - - socklen_t llen = sizeof(listen_addr); - getsockname(listen_sd, (sockaddr*)&listen_addr, &llen); - - int myport = listen_addr.sin_port; - - // listen! - rc = ::listen(listen_sd, 1000); - assert(rc >= 0); - - //dout(10) << "accepter.start listening on " << myport << endl; - - // my address is... - char host[100]; - bzero(host, 100); - gethostname(host, 100); - //dout(10) << "accepter.start my hostname is " << host << endl; - - struct hostent *myhostname = gethostbyname( host ); - - struct sockaddr_in my_addr; - memset(&my_addr, 0, sizeof(my_addr)); - - my_addr.sin_family = myhostname->h_addrtype; - memcpy((char *) &my_addr.sin_addr.s_addr, - myhostname->h_addr_list[0], - myhostname->h_length); - my_addr.sin_port = myport; - - listen_addr = my_addr; - - dout(10) << "accepter.start listen addr is " << listen_addr << endl; - - // start thread - create(); - - return 0; -} - -void *Rank::Accepter::entry() -{ - dout(10) << "accepter starting" << endl; - - while (!done) { - // accept - struct sockaddr_in addr; - socklen_t slen = sizeof(addr); - int sd = ::accept(listen_sd, (sockaddr*)&addr, &slen); - if (sd > 0) { - dout(10) << "accepted incoming on sd " << sd << endl; - - rank.lock.Lock(); - Pipe *p = new Pipe(sd); - rank.pipes.insert(p); - rank.lock.Unlock(); - } else { - dout(10) << "no incoming connection?" << endl; - break; - } - } - - return 0; -} - - - -/************************************** - * Pipe - */ - -int Rank::Pipe::accept() -{ - // my creater gave me sd via accept() - - // announce myself. - int rc = tcp_write(sd, (char*)&rank.my_inst, sizeof(rank.my_inst)); - if (rc < 0) { - ::close(sd); - done = true; - return -1; - } - - // identify peer - rc = tcp_read(sd, (char*)&peer_inst, sizeof(peer_inst)); - if (rc < 0) { - dout(10) << "pipe(? 
" << this << ").accept couldn't read peer inst" << endl; - ::close(sd); - done = true; - return -1; - } - - // create writer thread. - writer_running = true; - writer_thread.create(); - - // register pipe. - if (peer_inst.rank >= 0) { - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_inst.rank) == 0) { - // install a pipe! - dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst << endl; - rank.rank_pipe[peer_inst.rank] = this; - } else { - // low ranks' Pipes "win" - if (peer_inst.rank < rank.my_inst.rank || - rank.my_inst.rank < 0) { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst - << ", already had pipe, but switching to this new one" << endl; - // switch to this new Pipe - rank.rank_pipe[peer_inst.rank]->close(); // close old one - rank.rank_pipe[peer_inst.rank] = this; - } else { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst - << ", already had pipe, sticking with it" << endl; - } - } - } - rank.lock.Unlock(); - } else { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is unranked " << peer_inst << endl; - } - - return 0; // success. -} - -int Rank::Pipe::connect() -{ - dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect" << endl; - - // create socket? - sd = socket(AF_INET,SOCK_STREAM,0); - assert(sd > 0); - - // bind any port - struct sockaddr_in myAddr; - myAddr.sin_family = AF_INET; - myAddr.sin_addr.s_addr = htonl(INADDR_ANY); - myAddr.sin_port = htons( 0 ); - - int rc = bind(sd, (struct sockaddr *) &myAddr, sizeof(myAddr)); - assert(rc>=0); - - // connect! - rc = ::connect(sd, (sockaddr*)&peer_inst.addr, sizeof(myAddr)); - if (rc < 0) return rc; - - // identify peer - entity_inst_t inst; - rc = tcp_read(sd, (char*)&inst, sizeof(inst)); - if (inst.rank < 0) - inst = peer_inst; // i know better than they do. 
- if (peer_inst != inst && inst.rank > 0) { - derr(0) << "pipe(" << peer_inst << ' ' << this << ").connect peer is " << inst << ", wtf" << endl; - assert(0); - return -1; - } - - // identify myself - rc = tcp_write(sd, (char*)&rank.my_inst, sizeof(rank.my_inst)); - if (rc < 0) - return -1; - - // register pipe - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_inst.rank) == 0) { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect registering pipe" << endl; - rank.rank_pipe[peer_inst.rank] = this; - } else { - // this is normal. - dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect pipe already registered." << endl; - } - } - rank.lock.Unlock(); - - // start reader - reader_running = true; - reader_thread.create(); - - return 0; -} - - -void Rank::Pipe::close() -{ - if (sent_close) { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").close already closing" << endl; - return; - } - dout(10) << "pipe(" << peer_inst << ' ' << this << ").close" << endl; - - // unreg ourselves - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_inst.rank) && - rank.rank_pipe[peer_inst.rank] == this) { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").close unregistering pipe" << endl; - rank.rank_pipe.erase(peer_inst.rank); - } - } - rank.lock.Unlock(); - - // queue close message. - dout(10) << "pipe(" << peer_inst << ' ' << this << ").close queueing MSG_CLOSE" << endl; - lock.Lock(); - q.push_back(new MGenericMessage(MSG_CLOSE)); - cond.Signal(); - sent_close = true; - lock.Unlock(); -} - - -/* read msgs from socket. - * also, server. - * - */ -void Rank::Pipe::reader() -{ - if (server) - accept(); - - // loop. 
- while (!done) { - Message *m = read_message(); - if (!m || m->get_type() == 0) { - if (m) { - delete m; - dout(10) << "pipe(" << peer_inst << ' ' << this << ").reader read MSG_CLOSE message" << endl; - } else { - derr(10) << "pipe(" << peer_inst << ' ' << this << ").reader read null message" << endl; - } - - if (!sent_close) - close(); - - done = true; - cond.Signal(); // wake up writer too. - break; - } - - dout(10) << "pipe(" << peer_inst << ' ' << this << ").reader got message for " << m->get_dest() << endl; - - EntityMessenger *entity = 0; - - rank.lock.Lock(); - { - if (rank.entity_map.count(m->get_source()) && - rank.entity_map[m->get_source()] > m->get_source_inst()) { - derr(0) << "pipe(" << peer_inst << ' ' << this << ").reader source " << m->get_source() - << " inst " << m->get_source_inst() - << " > " << rank.entity_map[m->get_source()] - << ", WATCH OUT " << *m << endl; - assert(0); - } - - if (m->get_dest().type() == MSG_ADDR_RANK_BASE) { - // ours. - rank.dispatch(m); - } else { - if (g_conf.ms_single_dispatch) { - // submit to single dispatch queue - rank._submit_single_dispatch(m); - } else { - if (rank.local.count(m->get_dest())) { - // find entity - entity = rank.local[m->get_dest()]; - } else { - derr(0) << "pipe(" << peer_inst << ' ' << this << ").reader got message " << *m << " for " << m->get_dest() << ", which isn't local" << endl; - assert(0); // FIXME do this differently - //rank.waiting_for_lookup[m->get_dest()].push_back(m); - } - } - } - } - rank.lock.Unlock(); - - if (entity) - entity->queue_message(m); // queue - } - - - // reap? - bool reap = false; - lock.Lock(); - { - reader_running = false; - if (!writer_running) reap = true; - } - lock.Unlock(); - - if (reap) { - dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader queueing for reap" << endl; - ::close(sd); - rank.lock.Lock(); - { - rank.pipe_reap_queue.push_back(this); - rank.wait_cond.Signal(); - } - rank.lock.Unlock(); - } -} - - -/* write msgs to socket. 
- * also, client. - */ -void Rank::Pipe::writer() -{ - if (!server) { - int rc = connect(); - if (rc < 0) { - derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error connecting" << endl; - done = true; - list out; - fail(out); - } - } - - // loop. - lock.Lock(); - while (!q.empty() || !done) { - - if (!q.empty()) { - dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer grabbing message(s)" << endl; - - // grab outgoing list - list out; - out.swap(q); - - // drop lock while i send these - lock.Unlock(); - - while (!out.empty()) { - Message *m = out.front(); - out.pop_front(); - - dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer sending " << *m << endl; - - // stamp. - m->set_source_inst(rank.my_inst); - - // marshall - if (m->empty_payload()) - m->encode_payload(); - - if (write_message(m) < 0) { - // failed! - derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error sending " << *m << " to " << m->get_dest() << endl; - out.push_front(m); - fail(out); - done = true; - break; - } - - // did i just send a close? - if (m->get_type() == MSG_CLOSE) - done = true; - - // clean up - delete m; - } - - lock.Lock(); - continue; - } - - // wait - dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer sleeping" << endl; - cond.Wait(lock); - } - lock.Unlock(); - - dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer finishing" << endl; - - // reap? 
- bool reap = false; - lock.Lock(); - { - writer_running = false; - if (!reader_running) reap = true; - } - lock.Unlock(); - - if (reap) { - dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer queueing for reap" << endl; - ::close(sd); - rank.lock.Lock(); - { - rank.pipe_reap_queue.push_back(this); - rank.wait_cond.Signal(); - } - rank.lock.Unlock(); - } -} - - -Message *Rank::Pipe::read_message() -{ - // envelope - //dout(10) << "receiver.read_message from sd " << sd << endl; - - msg_envelope_t env; - if (!tcp_read( sd, (char*)&env, sizeof(env) )) - return 0; - - dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader got envelope type=" << env.type - << " src " << env.source << " dst " << env.dest - << " nchunks=" << env.nchunks - << endl; - - // payload - bufferlist blist; - for (int i=0; iget_source() << endl; - - return m; -} - - - -int Rank::Pipe::write_message(Message *m) -{ - // get envelope, buffers - msg_envelope_t *env = &m->get_envelope(); - bufferlist blist; - blist.claim( m->get_payload() ); - -#ifdef TCP_KEEP_CHUNKS - env->nchunks = blist.buffers().size(); -#else - env->nchunks = 1; -#endif - - dout(20)// << g_clock.now() - << "pipe(" << peer_inst << ' ' << this << ").writer sending " << m << " " << *m - << " to " << m->get_dest() - << endl; - - // send envelope - int r = tcp_write( sd, (char*)env, sizeof(*env) ); - if (r < 0) { - derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error sending envelope for " << *m - << " to " << m->get_dest() << endl; - return -1; - } - - // payload -#ifdef TCP_KEEP_CHUNKS - // send chunk-wise - int i = 0; - for (list::const_iterator it = blist.buffers().begin(); - it != blist.buffers().end(); - it++) { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").writer tcp_sending frag " << i << " len " << (*it).length() << endl; - int size = (*it).length(); - r = tcp_write( sd, (char*)&size, sizeof(size) ); - if (r < 0) { - derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error 
sending chunk len for " << *m << " to " << m->get_dest() << endl; - return -1; - } - r = tcp_write( sd, (*it).c_str(), size ); - if (r < 0) { - derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data chunk for " << *m << " to " << m->get_dest() << endl; - return -1; - } - i++; - } -#else - // one big chunk - int size = blist.length(); - r = tcp_write( sd, (char*)&size, sizeof(size) ); - if (r < 0) { - derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data len for " << *m << " to " << m->get_dest() << endl; - return -1; - } - dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer data len is " << size << " in " << blist.buffers().size() << " buffers" << endl; - - for (list::const_iterator it = blist.buffers().begin(); - it != blist.buffers().end(); - it++) { - if ((*it).length() == 0) continue; // blank buffer. - r = tcp_write( sd, (char*)(*it).c_str(), (*it).length() ); - if (r < 0) { - derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data megachunk for " << *m << " to " << m->get_dest() << " : len " << (*it).length() << endl; - return -1; - } - } -#endif - - return 0; -} - - -void Rank::Pipe::fail(list& out) -{ - derr(10) << "pipe(" << peer_inst << ' ' << this << ").fail" << endl; - - // tell namer - if (!rank.messenger) { - derr(0) << "FATAL error: can't send failure to namer0, not connected yet" << endl; - assert(0); - } - - // FIXME: possible race before i reclaim lock here? - - // deactivate myself - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_inst.rank) && - rank.rank_pipe[peer_inst.rank] == this) - rank.rank_pipe.erase(peer_inst.rank); - } - rank.lock.Unlock(); - - // what do i do about reader()? FIXME - - // sort my messages by (source) dispatcher, dest. 
- map > > by_dis; - lock.Lock(); - { - // include out at front of queue - q.splice(q.begin(), out); - - // sort - while (!q.empty()) { - if (q.front()->get_type() == MSG_CLOSE) { - delete q.front(); - } - else if (rank.local.count(q.front()->get_source())) { - Dispatcher *dis = rank.local[q.front()->get_source()]->get_dispatcher(); - by_dis[dis][q.front()->get_dest()].push_back(q.front()); - } - else { - // oh well. sending entity musta just shut down? - assert(0); - delete q.front(); - } - q.pop_front(); - } - } - lock.Unlock(); - - // report failure(s) to dispatcher(s) - for (map > >::iterator i = by_dis.begin(); - i != by_dis.end(); - ++i) - for (map >::iterator j = i->second.begin(); - j != i->second.end(); - ++j) - for (list::iterator k = j->second.begin(); - k != j->second.end(); - ++k) { - derr(1) << "pipe(" << peer_inst << ' ' << this << ").fail on " << **k << " to " << j->first << " inst " << peer_inst << endl; - i->first->ms_handle_failure(*k, j->first, peer_inst); - } -} - - - - - - -/******************************************** - * Rank - */ - -Rank::Rank(int r) : - single_dispatcher(this), - my_rank(r), - namer(0) { -} -Rank::~Rank() -{ - //FIXME - if (namer) delete namer; -} - - -void Rank::_submit_single_dispatch(Message *m) -{ - assert(lock.is_locked()); - - if (local.count(m->get_dest()) && - local[m->get_dest()]->is_ready()) { - rank.single_dispatch_queue.push_back(m); - rank.single_dispatch_cond.Signal(); - } else { - waiting_for_ready[m->get_dest()].push_back(m); - } -} - - -void Rank::single_dispatcher_entry() -{ - lock.Lock(); - while (!single_dispatch_stop || !single_dispatch_queue.empty()) { - if (!single_dispatch_queue.empty()) { - list ls; - ls.swap(single_dispatch_queue); - - lock.Unlock(); - { - while (!ls.empty()) { - Message *m = ls.front(); - ls.pop_front(); - - dout(1) //<< g_clock.now() - << "---- " - << m->get_source()// << ':' << m->get_source_port() - << " to " << m->get_dest()// << ':' << m->get_dest_port() - << " ---- " << 
m->get_type_name() - << " ---- " << m - << endl; - - if (m->get_dest().type() == MSG_ADDR_RANK_BASE) - rank.dispatch(m); - else { - assert(local.count(m->get_dest())); - local[m->get_dest()]->dispatch(m); - } - } - } - lock.Lock(); - continue; - } - single_dispatch_cond.Wait(lock); - } - lock.Unlock(); -} - - -/* - * note: assumes lock is held - */ -void Rank::reaper() -{ - dout(10) << "reaper" << endl; - assert(lock.is_locked()); - - while (!pipe_reap_queue.empty()) { - Pipe *p = pipe_reap_queue.front(); - dout(10) << "reaper reaping pipe " << p->get_peer_inst() << endl; - pipe_reap_queue.pop_front(); - assert(pipes.count(p)); - pipes.erase(p); - p->join(); - dout(10) << "reaper reaped pipe " << p->get_peer_inst() << endl; - delete p; - } -} - - -int Rank::start_rank() -{ - dout(10) << "start_rank" << endl; - - // bind to a socket - if (accepter.start() < 0) - return -1; - - // start single thread dispatcher? - if (g_conf.ms_single_dispatch) { - single_dispatch_stop = false; - single_dispatcher.create(); - } - - lock.Lock(); - - // my_inst - my_inst.addr = accepter.listen_addr; - my_inst.rank = my_rank; - - if (my_rank < 0) { - dout(10) << "start_rank connecting to namer0" << endl; - - // connect to namer - assert(entity_map.count(MSG_ADDR_NAMER(0))); - Pipe *pipe = connect_rank(entity_map[MSG_ADDR_NAMER(0)]); - - // send - Message *m = new MNSConnect(accepter.listen_addr); - m->set_dest(MSG_ADDR_NAMER(0), 0); - pipe->send(m); - - // wait - while (my_rank < 0) - waiting_for_rank.Wait(lock); - assert(my_rank >= 0); - - dout(10) << "start_rank got rank " << my_rank << endl; - - // create rank entity - entity_map[MSG_ADDR_RANK(my_rank)] = my_inst; - local[MSG_ADDR_RANK(my_rank)] = messenger = new EntityMessenger(MSG_ADDR_RANK(my_rank)); - messenger->set_dispatcher(this); - } else { - // create my rank - entity_name_t raddr = MSG_ADDR_RANK(my_rank); - entity_map[raddr] = my_inst; - entity_unstarted.insert(raddr); - local[raddr] = messenger = new 
EntityMessenger(raddr); - messenger->set_dispatcher(this); - - dout(1) << "start_rank " << my_rank << " at " << my_inst << endl; - } - - lock.Unlock(); - return 0; -} - -void Rank::start_namer() -{ - // create namer0 - entity_name_t naddr = MSG_ADDR_NAMER(0); - entity_map[naddr] = my_inst; - local[naddr] = new EntityMessenger(naddr); - namer = new Namer(local[naddr]); - namer_inst = my_inst; -} - -void Rank::set_namer(const tcpaddr_t& ns) -{ - namer_inst.addr = entity_map[MSG_ADDR_NAMER(0)].addr = ns; - namer_inst.rank = entity_map[MSG_ADDR_NAMER(0)].rank = 0; -} - -/* connect_rank - * NOTE: assumes rank.lock held. - */ -Rank::Pipe *Rank::connect_rank(const entity_inst_t& inst) -{ - assert(rank.lock.is_locked()); - assert(inst != rank.my_inst); - - dout(10) << "connect_rank to " << inst << endl; - - // create pipe - Pipe *pipe = new Pipe(inst); - rank.rank_pipe[inst.rank] = pipe; - pipes.insert(pipe); - - return pipe; -} - - - - - -void Rank::show_dir() -{ - dout(10) << "show_dir ---" << endl; - - for (hash_map::iterator i = entity_map.begin(); - i != entity_map.end(); - i++) { - if (local.count(i->first)) { - dout(10) << "show_dir entity_map " << i->first << " -> " << i->second << " local " << endl; - } else { - dout(10) << "show_dir entity_map " << i->first << " -> " << i->second << endl; - } - } -} - - -/* lookup - * NOTE: assumes directory.lock held - */ -void Rank::lookup(entity_name_t addr) -{ - dout(10) << "lookup " << addr << endl; - assert(lock.is_locked()); - - assert(looking_up.count(addr) == 0); - looking_up.insert(addr); - - MNSLookup *r = new MNSLookup(addr); - messenger->send_message(r, MSG_ADDR_NAMER(0), namer_inst); -} - - - -/* register_entity - */ -Rank::EntityMessenger *Rank::register_entity(entity_name_t addr) -{ - dout(10) << "register_entity " << addr << endl; - lock.Lock(); - - // register with namer - static long reg_attempt = 0; - long id = ++reg_attempt; - - Message *reg = new MNSRegister(addr, my_rank, id); - 
reg->set_source(MSG_ADDR_RANK(my_rank), 0); - reg->set_source_inst(my_inst); - reg->set_dest(MSG_ADDR_DIRECTORY, 0); - - // prepare cond - Cond cond; - waiting_for_register_cond[id] = &cond; - - // send request - lock.Unlock(); - submit_message(reg); - lock.Lock(); - - // wait - while (!waiting_for_register_result.count(id)) - cond.Wait(lock); - - // grab result - addr = waiting_for_register_result[id]; - dout(10) << "register_entity got " << addr << endl; - - // clean up - waiting_for_register_cond.erase(id); - waiting_for_register_result.erase(id); - - // create messenger - EntityMessenger *msgr = new EntityMessenger(addr); - - // add to directory - entity_map[addr] = my_inst; - local[addr] = msgr; - - // was anyone waiting? - if (waiting_for_lookup.count(addr)) { - submit_messages(waiting_for_lookup[addr]); - waiting_for_lookup.erase(addr); - } - - lock.Unlock(); - return msgr; -} - -void Rank::unregister_entity(EntityMessenger *msgr) -{ - lock.Lock(); - dout(10) << "unregister_entity " << msgr->get_myaddr() << endl; - - // remove from local directory. - assert(local.count(msgr->get_myaddr())); - local.erase(msgr->get_myaddr()); - - if (my_rank > 0) { - assert(entity_map.count(msgr->get_myaddr())); - entity_map.erase(msgr->get_myaddr()); - } // else namer will do it. - - // tell namer. - if (msgr->get_myaddr() != MSG_ADDR_NAMER(0) && - msgr->get_myaddr() != MSG_ADDR_RANK(0)) - msgr->send_message(new MGenericMessage(MSG_NS_UNREGISTER), - MSG_ADDR_NAMER(0), namer_inst); - - // kick wait()? - if (local.size() <= 2) - wait_cond.Signal(); - - lock.Unlock(); -} - - -void Rank::submit_messages(list& ls) -{ - for (list::iterator i = ls.begin(); i != ls.end(); i++) - submit_message(*i); - ls.clear(); -} - - - -void Rank::submit_message(Message *m, const entity_inst_t& dest_inst) -{ - const entity_name_t dest = m->get_dest(); - - // lookup - EntityMessenger *entity = 0; - Pipe *pipe = 0; - - lock.Lock(); - { - // local? 
- if (dest_inst.rank == my_inst.rank) { - if (local.count(dest)) { - // local - dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl; - if (g_conf.ms_single_dispatch) { - _submit_single_dispatch(m); - } else { - entity = local[dest]; - } - } else { - // mid-register - dout(20) << "submit_message " << *m << " dest " << dest << " " << dest_inst << " local but mid-register, waiting." << endl; - assert(0); // hmpf - waiting_for_lookup[dest].push_back(m); - } - } - else { - // remote. - if (rank_pipe.count( dest_inst.rank )) { - //&& - //rank_pipe[dest_inst.rank]->inst == dest_inst) { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", already connected." << endl; - // connected. - pipe = rank_pipe[ dest_inst.rank ]; - } else { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", connecting." << endl; - // not connected. - pipe = connect_rank( dest_inst ); - } - } - } - lock.Unlock(); - - // do it - if (entity) { - // local! - dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl; - entity->queue_message(m); - } - else if (pipe) { - // remote! - dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl; - pipe->send(m); - } -} - - -void Rank::submit_message(Message *m) -{ - const entity_name_t dest = m->get_dest(); - - // lookup - EntityMessenger *entity = 0; - Pipe *pipe = 0; - - lock.Lock(); - { - if (local.count(dest)) { - dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl; - - // local - if (g_conf.ms_single_dispatch) { - _submit_single_dispatch(m); - } else { - entity = local[dest]; - } - } else if (entity_map.count( dest )) { - // remote, known rank addr. - entity_inst_t inst = entity_map[dest]; - - if (inst == my_inst) { - dout(20) << "submit_message " << *m << " dest " << dest << " local but mid-register, waiting." 
<< endl; - waiting_for_lookup[dest].push_back(m); - } - else if (rank_pipe.count( inst.rank ) && - rank_pipe[inst.rank]->get_peer_inst() == inst) { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connected." << endl; - // connected. - pipe = rank_pipe[ inst.rank ]; - } else { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connecting." << endl; - // not connected. - pipe = connect_rank( inst ); - } - } else { - // unknown dest rank or rank addr. - if (looking_up.count(dest) == 0) { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, looking up" << endl; - lookup(dest); - } else { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, already looking up" << endl; - } - waiting_for_lookup[dest].push_back(m); - } - } - lock.Unlock(); - - // do it - if (entity) { - // local! - dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl; - entity->queue_message(m); - } - else if (pipe) { - // remote! - dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl; - pipe->send(m); - } -} - - - - -void Rank::dispatch(Message *m) -{ - lock.Lock(); - - dout(10) << "dispatching " << *m << endl; - - switch (m->get_type()) { - case MSG_NS_CONNECTACK: - handle_connect_ack((MNSConnectAck*)m); - break; - - case MSG_NS_REGISTERACK: - handle_register_ack((MNSRegisterAck*)m); - break; - - case MSG_NS_LOOKUPREPLY: - handle_lookup_reply((MNSLookupReply*)m); - break; - - default: - assert(0); - } - - lock.Unlock(); -} - -void Rank::handle_connect_ack(MNSConnectAck *m) -{ - dout(10) << "handle_connect_ack, my rank is " << m->get_rank() << endl; - my_rank = m->get_rank(); - - my_inst.addr = accepter.listen_addr; - my_inst.rank = my_rank; - - waiting_for_rank.SignalAll(); - delete m; - - // logger! 
- /*dout(10) << "logger" << endl; - char names[100]; - sprintf(names, "rank%d", my_rank); - string name = names; - - if (g_conf.tcp_log) { - logger = new Logger(name, (LogType*)&rank_logtype); - rank_logtype.add_set("num"); - rank_logtype.add_inc("in"); - rank_logtype.add_inc("inb"); - rank_logtype.add_inc("dis"); - rank_logtype.add_set("inq"); - rank_logtype.add_set("inqb"); - rank_logtype.add_set("outq"); - rank_logtype.add_set("outqb"); - } - */ -} - - -void Rank::handle_register_ack(MNSRegisterAck *m) -{ - dout(10) << "handle_register_ack " << m->get_entity() << endl; - - long tid = m->get_tid(); - waiting_for_register_result[tid] = m->get_entity(); - waiting_for_register_cond[tid]->Signal(); - delete m; -} - -void Rank::handle_lookup_reply(MNSLookupReply *m) -{ - list waiting; - dout(10) << "got lookup reply" << endl; - - for (map::iterator it = m->entity_map.begin(); - it != m->entity_map.end(); - it++) { - dout(10) << "lookup got " << it->first << " at " << it->second << endl; - entity_name_t addr = it->first; - entity_inst_t inst = it->second; - - if (entity_map.count(addr) && - entity_map[addr] > inst) { - dout(10) << "ignoring lookup results for " << addr << ", " \ - << entity_map[addr] << " > " << inst << endl; - continue; - } - - // update map. - entity_map[addr] = inst; - - if (inst.rank == my_rank) { - // local - dout(10) << "delivering lookup results locally" << endl; - if (local.count(addr)) { - if (g_conf.ms_single_dispatch) { - single_dispatch_queue.splice(single_dispatch_queue.end(), - waiting_for_lookup[addr]); - } else { - local[addr]->queue_messages(waiting_for_lookup[addr]); - } - waiting_for_lookup.erase(addr); - } else - lookup(addr); // try again! - - } else { - // remote - if (rank_pipe.count(inst.rank) == 0) - connect_rank(inst); - else if (rank_pipe[inst.rank]->get_peer_inst() != inst) { - dout(0) << "lookup got rank addr change, WATCH OUT" << endl; - // FIXME BUG possible message loss weirdness? 
- rank_pipe[inst.rank]->close(); - rank_pipe.erase(inst.rank); - connect_rank(inst); - } - - // take waiters - Pipe *pipe = rank_pipe[inst.rank]; - assert(pipe); - - if (waiting_for_lookup.count(addr)) { - pipe->send(waiting_for_lookup[addr]); - waiting_for_lookup.erase(addr); - } - } - } - - delete m; -} - - -void Rank::wait() -{ - lock.Lock(); - while (1) { - // reap dead pipes - reaper(); - - if (local.size() == 0) { - dout(10) << "wait: everything stopped" << endl; - break; // everything stopped. - } - - if (local.size() == 1 && - !messenger->is_stopped()) { - dout(10) << "wait: stopping rank" << endl; - lock.Unlock(); - messenger->shutdown(); - delete messenger; - lock.Lock(); - continue; - } - - wait_cond.Wait(lock); - } - lock.Unlock(); - - // done! clean up. - - // stop dispatch thread - if (g_conf.ms_single_dispatch) { - dout(10) << "wait: stopping dispatch thread" << endl; - lock.Lock(); - single_dispatch_stop = true; - single_dispatch_cond.Signal(); - lock.Unlock(); - single_dispatcher.join(); - } - - // reap pipes - lock.Lock(); - { - dout(10) << "wait: closing pipes" << endl; - list toclose; - for (hash_map::iterator i = rank_pipe.begin(); - i != rank_pipe.end(); - i++) - toclose.push_back(i->second); - for (list::iterator i = toclose.begin(); - i != toclose.end(); - i++) - (*i)->close(); - - dout(10) << "wait: waiting for pipes " << pipes << " to close" << endl; - while (!pipes.empty()) { - wait_cond.Wait(lock); - reaper(); - } - } - lock.Unlock(); - - dout(10) << "wait: done." << endl; -} - - - -int Rank::find_ns_addr(tcpaddr_t &nsa) -{ - // file? - int fd = ::open(".ceph_ns",O_RDONLY); - if (fd > 0) { - ::read(fd, (void*)&nsa, sizeof(nsa)); - ::close(fd); - cout << "ceph ns is " << nsa << endl; - return 0; - } - - // env var? 
- char *nsaddr = getenv("CEPH_NAMESERVER");////envz_entry(*envp, e_len, "CEPH_NAMESERVER"); - if (nsaddr) { - while (nsaddr[0] != '=') nsaddr++; - nsaddr++; - - if (tcp_hostlookup(nsaddr, nsa) < 0) { - cout << "can't resolve " << nsaddr << endl; - return -1; - } - - cout << "ceph ns is " << nsa << endl; - return 0; - } - - cerr << "i can't find ceph ns addr in .ceph_ns or CEPH_NAMESERVER" << endl; - return -1; -} - - - -/********************************** - * EntityMessenger - */ - -Rank::EntityMessenger::EntityMessenger(entity_name_t myaddr) : - Messenger(myaddr), - stop(false), - dispatch_thread(this) -{ -} -Rank::EntityMessenger::~EntityMessenger() -{ -} - -void Rank::EntityMessenger::dispatch_entry() -{ - lock.Lock(); - while (!stop) { - if (!dispatch_queue.empty()) { - list ls; - ls.swap(dispatch_queue); - - lock.Unlock(); - { - // deliver - while (!ls.empty()) { - Message *m = ls.front(); - ls.pop_front(); - dout(1) //<< g_clock.now() - << "---- " - << m->get_source()// << ':' << m->get_source_port() - << " to " << m->get_dest()// << ':' << m->get_dest_port() - << " ---- " << m->get_type_name() - << " ---- " << m->get_source_inst() - << " ---- " << m - << endl; - dispatch(m); - } - } - lock.Lock(); - continue; - } - cond.Wait(lock); - } - lock.Unlock(); -} - -void Rank::EntityMessenger::ready() -{ - dout(10) << "ready " << get_myaddr() << endl; - - if (g_conf.ms_single_dispatch) { - rank.lock.Lock(); - if (rank.waiting_for_ready.count(get_myaddr())) { - rank.single_dispatch_queue.splice(rank.single_dispatch_queue.end(), - rank.waiting_for_ready[get_myaddr()]); - rank.waiting_for_ready.erase(get_myaddr()); - rank.single_dispatch_cond.Signal(); - } - rank.lock.Unlock(); - } else { - // start my dispatch thread - dispatch_thread.create(); - } - - // tell namer - if (get_myaddr() != MSG_ADDR_NAMER(0) && - get_myaddr() != MSG_ADDR_RANK(0)) - send_message(new MGenericMessage(MSG_NS_STARTED), MSG_ADDR_NAMER(0), rank.namer_inst); -} - - -int 
Rank::EntityMessenger::shutdown() -{ - dout(10) << "shutdown " << get_myaddr() << endl; - - // deregister - rank.unregister_entity(this); - - // stop my dispatch thread - if (dispatch_thread.am_self()) { - dout(1) << "shutdown i am dispatch, setting stop flag" << endl; - stop = true; - } else { - dout(1) << "shutdown i am not dispatch, setting stop flag and joining thread." << endl; - lock.Lock(); - stop = true; - cond.Signal(); - lock.Unlock(); - dispatch_thread.join(); - } - - return 0; -} - - -void Rank::EntityMessenger::prepare_dest(const entity_inst_t& inst) -{ - rank.lock.Lock(); - { - if (rank.rank_pipe.count(inst.rank) == 0) - rank.connect_rank(inst); - } - rank.lock.Unlock(); -} - -int Rank::EntityMessenger::send_message(Message *m, entity_name_t dest, const entity_inst_t& inst, - int port, int fromport) -{ - // set envelope - m->set_source(get_myaddr(), fromport); - m->set_dest(dest, port); - - m->set_source_inst(rank.my_inst); - - dout(1) << "--> " - << m->get_source() //<< ':' << m->get_source_port() - << " to " << m->get_dest() //<< ':' << m->get_dest_port() - << " ---- " << m->get_type_name() - << " ---- " << rank.my_inst << " --> " << inst - << " ---- " << m - << endl; - - rank.submit_message(m, inst); - - return 0; -} - - -int Rank::EntityMessenger::send_message(Message *m, entity_name_t dest, int port, int fromport) -{ - // set envelope - m->set_source(get_myaddr(), fromport); - m->set_dest(dest, port); - - m->set_source_inst(rank.my_inst); - - dout(1) << "--> " - << m->get_source() //<< ':' << m->get_source_port() - << " to " << m->get_dest() //<< ':' << m->get_dest_port() - << " ---- " << m->get_type_name() - << " ---- " << rank.my_inst << " --> ? 
(DEPRECATED)" - << " ---- " << m - << endl; - - rank.submit_message(m); - - return 0; -} - - -void Rank::EntityMessenger::mark_down(entity_name_t a, entity_inst_t& i) -{ - assert(a != get_myaddr()); - rank.mark_down(a,i); -} - -void Rank::mark_down(entity_name_t a, entity_inst_t& inst) -{ - //if (my_rank == 0) return; // ugh.. rank0 already handles this stuff in the namer - lock.Lock(); - if (entity_map.count(a) && - entity_map[a] > inst) { - dout(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl; - derr(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl; - // do nothing! - } else { - if (entity_map.count(a) == 0) { - // don't know it - dout(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl; - derr(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl; - - waiting_for_lookup.erase(a); - looking_up.erase(a); - } else { - // know it - assert(entity_map[a] <= inst); - dout(10) << "mark_down " << a << " inst " << inst << endl; - derr(10) << "mark_down " << a << " inst " << inst << endl; - - entity_map.erase(a); - - if (rank_pipe.count(inst.rank)) { - rank_pipe[inst.rank]->close(); - rank_pipe.erase(inst.rank); - } - - // kill rank# too? only if i'm the namer. - if (my_rank == 0) { - entity_map.erase(MSG_ADDR_RANK(inst.rank)); - } - } - } - lock.Unlock(); -} - -void Rank::EntityMessenger::mark_up(entity_name_t a, entity_inst_t& i) -{ - assert(a != get_myaddr()); - rank.mark_up(a, i); -} - -void Rank::mark_up(entity_name_t a, entity_inst_t& i) -{ - if (my_rank == 0) return; - lock.Lock(); - { - dout(10) << "mark_up " << a << " inst " << i << endl; - derr(10) << "mark_up " << a << " inst " << i << endl; - - assert(i.rank != my_rank); // hrm? - - if (entity_map.count(a) == 0 || - entity_map[a] < i) { - entity_map[a] = i; - connect_rank(i); - } else if (entity_map[a] == i) { - dout(10) << "mark_up " << a << " inst " << i << " ... 
knew it" << endl; - derr(10) << "mark_up " << a << " inst " << i << " ... knew it" << endl; - } else { - dout(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl; - derr(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl; - } - - //if (waiting_for_lookup.count(a)) - //lookup(a); - } - lock.Unlock(); -} - diff --git a/branches/marnberg/quota/msg/NewerMessenger.h b/branches/marnberg/quota/msg/NewerMessenger.h deleted file mode 100644 index 29b885745df48..0000000000000 --- a/branches/marnberg/quota/msg/NewerMessenger.h +++ /dev/null @@ -1,343 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __NEWMESSENGER_H -#define __NEWMESSENGER_H - - -#include -#include -using namespace std; -#include -#include -using namespace __gnu_cxx; - - -#include "include/types.h" - -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/Thread.h" - -#include "Messenger.h" -#include "Message.h" -#include "tcp.h" - - - - -/* Rank - per-process - */ -class Rank : public Dispatcher { - - class EntityMessenger; - class Pipe; - - // namer - class Namer : public Dispatcher { - public: - EntityMessenger *messenger; // namerN - - int nrank; - int nclient, nmds, nosd, nmon; - - map > waiting; - - Namer(EntityMessenger *msgr); - ~Namer(); - - void handle_connect(class MNSConnect*); - void handle_register(class MNSRegister *m); - void handle_started(Message *m); - void handle_lookup(class MNSLookup *m); - void handle_unregister(Message *m); - void handle_failure(class MNSFailure *m); - - void dispatch(Message *m); - - void manual_insert_inst(const entity_inst_t &inst); - - }; - - // 
incoming - class Accepter : public Thread { - public: - bool done; - - tcpaddr_t listen_addr; - int listen_sd; - - Accepter() : done(false) {} - - void *entry(); - void stop() { - done = true; - ::close(listen_sd); - join(); - } - int start(); - } accepter; - - - - class Pipe { - protected: - int sd; - bool done; - entity_inst_t peer_inst; - bool server; - bool sent_close; - - bool reader_running; - bool writer_running; - - list q; - Mutex lock; - Cond cond; - - int accept(); // server handshake - int connect(); // client handshake - void reader(); - void writer(); - - Message *read_message(); - int write_message(Message *m); - void fail(list& ls); - - // threads - class Reader : public Thread { - Pipe *pipe; - public: - Reader(Pipe *p) : pipe(p) {} - void *entry() { pipe->reader(); return 0; } - } reader_thread; - friend class Reader; - - class Writer : public Thread { - Pipe *pipe; - public: - Writer(Pipe *p) : pipe(p) {} - void *entry() { pipe->writer(); return 0; } - } writer_thread; - friend class Writer; - - public: - Pipe(int s) : sd(s), - done(false), server(true), - sent_close(false), - reader_running(false), writer_running(false), - reader_thread(this), writer_thread(this) { - // server - reader_running = true; - reader_thread.create(); - } - Pipe(const entity_inst_t &pi) : sd(0), - done(false), peer_inst(pi), server(false), - sent_close(false), - reader_running(false), writer_running(false), - reader_thread(this), writer_thread(this) { - // client - writer_running = true; - writer_thread.create(); - } - - // public constructors - static const Pipe& Server(int s); - static const Pipe& Client(const entity_inst_t& pi); - - entity_inst_t& get_peer_inst() { return peer_inst; } - - void close(); - void join() { - writer_thread.join(); - reader_thread.join(); - } - - void send(Message *m) { - lock.Lock(); - q.push_back(m); - cond.Signal(); - lock.Unlock(); - } - void send(list& ls) { - lock.Lock(); - q.splice(q.end(), ls); - cond.Signal(); - lock.Unlock(); - } 
- }; - - - - // messenger interface - class EntityMessenger : public Messenger { - Mutex lock; - Cond cond; - list dispatch_queue; - bool stop; - - class DispatchThread : public Thread { - EntityMessenger *m; - public: - DispatchThread(EntityMessenger *_m) : m(_m) {} - void *entry() { - m->dispatch_entry(); - return 0; - } - } dispatch_thread; - void dispatch_entry(); - - public: - void queue_message(Message *m) { - lock.Lock(); - dispatch_queue.push_back(m); - cond.Signal(); - lock.Unlock(); - } - void queue_messages(list ls) { - lock.Lock(); - dispatch_queue.splice(dispatch_queue.end(), ls); - cond.Signal(); - lock.Unlock(); - } - - public: - EntityMessenger(entity_name_t myaddr); - ~EntityMessenger(); - - void ready(); - bool is_stopped() { return stop; } - - void wait() { - dispatch_thread.join(); - } - - virtual void callback_kick() {} - virtual int shutdown(); - virtual void prepare_dest(const entity_inst_t& inst); - virtual int send_message(Message *m, entity_name_t dest, int port=0, int fromport=0); - virtual int send_message(Message *m, entity_name_t dest, const entity_inst_t& inst, - int port=0, int fromport=0); - - virtual void mark_down(entity_name_t a, entity_inst_t& i); - virtual void mark_up(entity_name_t a, entity_inst_t& i); - //virtual void reset(msg_addr_t a); - }; - - - class SingleDispatcher : public Thread { - Rank *rank; - public: - SingleDispatcher(Rank *r) : rank(r) {} - void *entry() { - rank->single_dispatcher_entry(); - return 0; - } - } single_dispatcher; - - Cond single_dispatch_cond; - bool single_dispatch_stop; - list single_dispatch_queue; - - map > waiting_for_ready; - - void single_dispatcher_entry(); - void _submit_single_dispatch(Message *m); - - - // Rank stuff - public: - Mutex lock; - Cond wait_cond; // for wait() - - // my rank - int my_rank; - Cond waiting_for_rank; - - // my instance - entity_inst_t my_inst; - - // lookup - hash_map entity_map; - hash_set entity_unstarted; - - map > waiting_for_lookup; - set looking_up; - 
- // register - map waiting_for_register_cond; - map waiting_for_register_result; - - // local - map local; - - // remote - hash_map rank_pipe; - - set pipes; - list pipe_reap_queue; - - EntityMessenger *messenger; // rankN - Namer *namer; - - entity_inst_t namer_inst; - - void show_dir(); - - void lookup(entity_name_t addr); - - void dispatch(Message *m); - void handle_connect_ack(class MNSConnectAck *m); - void handle_register_ack(class MNSRegisterAck *m); - void handle_lookup_reply(class MNSLookupReply *m); - - Pipe *connect_rank(const entity_inst_t& inst); - - void mark_down(entity_name_t addr, entity_inst_t& i); - void mark_up(entity_name_t addr, entity_inst_t& i); - - tcpaddr_t get_listen_addr() { return accepter.listen_addr; } - - void reaper(); - - -public: - Rank(int r=-1); - ~Rank(); - - int find_ns_addr(tcpaddr_t &tcpaddr); - - void set_namer(const tcpaddr_t& ns); - void start_namer(); - - int start_rank(); - void wait(); - - EntityMessenger *register_entity(entity_name_t addr); - void unregister_entity(EntityMessenger *ms); - - void submit_message(Message *m, const entity_inst_t& inst); - void prepare_dest(const entity_inst_t& inst); - void submit_message(Message *m); - void submit_messages(list& ls); - - // create a new messenger - EntityMessenger *new_entity(entity_name_t addr); - -} ; - - - -extern Rank rank; - -#endif diff --git a/branches/marnberg/quota/msg/RWLock.h b/branches/marnberg/quota/msg/RWLock.h deleted file mode 100644 index 83b84c6faf370..0000000000000 --- a/branches/marnberg/quota/msg/RWLock.h +++ /dev/null @@ -1,49 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef _RWLock_Posix_ -#define _RWLock_Posix_ - -#include - -class RWLock -{ - mutable pthread_rwlock_t L; - - public: - - RWLock() { - pthread_rwlock_init(&L, NULL); - } - - virtual ~RWLock() { - pthread_rwlock_unlock(&L); - pthread_rwlock_destroy(&L); - } - - void unlock() { - pthread_rwlock_unlock(&L); - } - void get_read() { - pthread_rwlock_rdlock(&L); - } - void put_read() { unlock(); } - void get_write() { - pthread_rwlock_wrlock(&L); - } - void put_write() { unlock(); } -}; - -#endif // !_Mutex_Posix_ diff --git a/branches/marnberg/quota/msg/SerialMessenger.h b/branches/marnberg/quota/msg/SerialMessenger.h deleted file mode 100644 index 1c5c9e9c3961a..0000000000000 --- a/branches/marnberg/quota/msg/SerialMessenger.h +++ /dev/null @@ -1,28 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __SERIAL_MESSENGER_H -#define __SERIAL_MESSENGER_H - -#include "Dispatcher.h" -#include "Message.h" - -class SerialMessenger : public Dispatcher { - public: - virtual void dispatch(Message *m) = 0; // i receive my messages here - virtual void send(Message *m, entity_name_t dest, int port=0, int fromport=0) = 0; // doesn't block - virtual Message *sendrecv(Message *m, entity_name_t dest, int port=0, int fromport=0) = 0; // blocks for matching reply -}; - -#endif diff --git a/branches/marnberg/quota/msg/SimpleMessenger.cc b/branches/marnberg/quota/msg/SimpleMessenger.cc deleted file mode 100644 index 5bb9e84d188d6..0000000000000 --- a/branches/marnberg/quota/msg/SimpleMessenger.cc +++ /dev/null @@ -1,1189 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "SimpleMessenger.h" - -#include -#include -#include -#include -#include - -#include "config.h" - -#include "messages/MGenericMessage.h" - -//#include "messages/MFailure.h" - -#include - - -#undef dout -#define dout(l) if (l<=g_conf.debug_ms) cout << g_clock.now() << " -- " << rank.my_addr << " " -#define derr(l) if (l<=g_conf.debug_ms) cerr << g_clock.now() << " -- " << rank.my_addr << " " - - - -#include "tcp.cc" - - -Rank rank; - - - -/******************************************** - * Accepter - */ - -void simplemessenger_sigint(int r) -{ - rank.sigint(); -} - -void Rank::sigint() -{ - lock.Lock(); - derr(0) << "got control-c, exiting" << endl; - ::close(accepter.listen_sd); - _exit(-1); - lock.Unlock(); -} - - - - -int Rank::Accepter::start() -{ - // bind to a socket - dout(10) << "accepter.start binding to listen " << endl; - - // use whatever user specified.. - g_my_addr.make_addr(rank.listen_addr); - - /* socket creation */ - listen_sd = socket(AF_INET,SOCK_STREAM,0); - assert(listen_sd > 0); - - /* bind to port */ - int rc = bind(listen_sd, (struct sockaddr *) &rank.listen_addr, sizeof(rank.listen_addr)); - if (rc < 0) - derr(0) << "accepter.start unable to bind to " << rank.listen_addr << endl; - assert(rc >= 0); - - // what port did we get? - socklen_t llen = sizeof(rank.listen_addr); - getsockname(listen_sd, (sockaddr*)&rank.listen_addr, &llen); - - dout(10) << "accepter.start bound to " << rank.listen_addr << endl; - - // listen! - rc = ::listen(listen_sd, 1000); - assert(rc >= 0); - - // my address is... HELP HELP HELP! - char host[100]; - bzero(host, 100); - gethostname(host, 100); - //dout(10) << "accepter.start my hostname is " << host << endl; - - struct hostent *myhostname = gethostbyname( host ); - - // figure out my_addr - if (g_my_addr.port > 0) { - // user specified it, easy peasy. - rank.my_addr = g_my_addr; - } else { - // look up my hostname. blech! this sucks. 
- rank.listen_addr.sin_family = myhostname->h_addrtype; - memcpy((char *) &rank.listen_addr.sin_addr.s_addr, - myhostname->h_addr_list[0], - myhostname->h_length); - - // set up my_addr with a nonce - rank.my_addr.set_addr(rank.listen_addr); - rank.my_addr.nonce = getpid(); // FIXME: pid might not be best choice here. - } - - dout(10) << "accepter.start my addr is " << rank.my_addr << endl; - - // set up signal handler - signal(SIGINT, simplemessenger_sigint); - - // start thread - create(); - - return 0; -} - -void *Rank::Accepter::entry() -{ - dout(10) << "accepter starting" << endl; - - while (!done) { - // accept - struct sockaddr_in addr; - socklen_t slen = sizeof(addr); - int sd = ::accept(listen_sd, (sockaddr*)&addr, &slen); - if (sd > 0) { - dout(10) << "accepted incoming on sd " << sd << endl; - - rank.lock.Lock(); - if (!rank.local.empty()) { - Pipe *p = new Pipe(sd); - rank.pipes.insert(p); - } - rank.lock.Unlock(); - } else { - dout(10) << "no incoming connection?" << endl; - break; - } - } - - return 0; -} - - - -/************************************** - * Pipe - */ - -int Rank::Pipe::accept() -{ - // my creater gave me sd via accept() - - // announce myself. - int rc = tcp_write(sd, (char*)&rank.my_addr, sizeof(rank.my_addr)); - if (rc < 0) { - ::close(sd); - done = true; - return -1; - } - - // identify peer - rc = tcp_read(sd, (char*)&peer_addr, sizeof(peer_addr)); - if (rc < 0) { - dout(10) << "pipe(? " << this << ").accept couldn't read peer inst" << endl; - ::close(sd); - done = true; - return -1; - } - - // create writer thread. - writer_running = true; - writer_thread.create(); - - // register pipe. - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_addr) == 0) { - // install a pipe! 
- dout(10) << "pipe(" << peer_addr << ' ' << this << ").accept peer is " << peer_addr << endl; - rank.rank_pipe[peer_addr] = this; - } else { - // low ranks' Pipes "win" - if (peer_addr < rank.my_addr) { - dout(10) << "pipe(" << peer_addr << ' ' << this << ").accept peer is " << peer_addr - << ", already had pipe, but switching to this new one" << endl; - // switch to this new Pipe - rank.rank_pipe[peer_addr]->close(); // close old one - rank.rank_pipe[peer_addr] = this; - } else { - dout(10) << "pipe(" << peer_addr << ' ' << this << ").accept peer is " << peer_addr - << ", already had pipe, sticking with it" << endl; - } - } - } - rank.lock.Unlock(); - - return 0; // success. -} - -int Rank::Pipe::connect() -{ - dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect" << endl; - - // create socket? - sd = socket(AF_INET,SOCK_STREAM,0); - assert(sd > 0); - - // bind any port - struct sockaddr_in myAddr; - myAddr.sin_family = AF_INET; - myAddr.sin_addr.s_addr = htonl(INADDR_ANY); - myAddr.sin_port = htons( 0 ); - - int rc = bind(sd, (struct sockaddr *) &myAddr, sizeof(myAddr)); - assert(rc>=0); - - // connect! 
- tcpaddr_t tcpaddr; - peer_addr.make_addr(tcpaddr); - rc = ::connect(sd, (sockaddr*)&tcpaddr, sizeof(myAddr)); - if (rc < 0) { - dout(10) << "connect error " << peer_addr - << ", " << errno << ": " << strerror(errno) << endl; - return rc; - } - - // identify peer - entity_addr_t paddr; - rc = tcp_read(sd, (char*)&paddr, sizeof(paddr)); - if (!rc) { // bool - dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect couldn't read peer addr" << endl; - return -1; - } - if (peer_addr != paddr) { - derr(0) << "pipe(" << peer_addr << ' ' << this << ").connect peer is " << paddr << ", wtf" << endl; - assert(0); - return -1; - } - - // identify myself - rc = tcp_write(sd, (char*)&rank.my_addr, sizeof(rank.my_addr)); - if (rc < 0) - return -1; - - // register pipe - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_addr) == 0) { - dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect registering pipe" << endl; - rank.rank_pipe[peer_addr] = this; - } else { - // this is normal. - dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect pipe already registered." << endl; - } - } - rank.lock.Unlock(); - - // start reader - reader_running = true; - reader_thread.create(); - - return 0; -} - - -void Rank::Pipe::close() -{ - dout(10) << "pipe(" << peer_addr << ' ' << this << ").close" << endl; - - // unreg ourselves - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_addr) && - rank.rank_pipe[peer_addr] == this) { - dout(10) << "pipe(" << peer_addr << ' ' << this - << ").close unregistering pipe" << endl; - rank.rank_pipe.erase(peer_addr); - } - } - rank.lock.Unlock(); - - // queue close message? 
- if (!need_to_send_close) { - dout(10) << "pipe(" << peer_addr << ' ' << this - << ").close already closing/closed" << endl; - return; - } - - if (!writer_running) { - dout(10) << "pipe(" << peer_addr << ' ' << this - << ").close not queueing MSG_CLOSE, no writer running" << endl; - } else { - dout(10) << "pipe(" << peer_addr << ' ' << this - << ").close queueing MSG_CLOSE" << endl; - lock.Lock(); - q.push_back(new MGenericMessage(MSG_CLOSE)); - cond.Signal(); - need_to_send_close = false; - lock.Unlock(); - } -} - - -/* read msgs from socket. - * also, server. - * - */ -void Rank::Pipe::reader() -{ - if (server) - accept(); - - // loop. - while (!done) { - Message *m = read_message(); - if (!m || m->get_type() == 0) { - if (m) { - delete m; - dout(10) << "pipe(" << peer_addr << ' ' << this << ").reader read MSG_CLOSE message" << endl; - need_to_send_close = false; - } else { - derr(10) << "pipe(" << peer_addr << ' ' << this << ").reader read null message" << endl; - } - - close(); - - done = true; - cond.Signal(); // wake up writer too. - break; - } - - dout(10) << "pipe(" << peer_addr << ' ' << this << ").reader got message for " << m->get_dest() << endl; - - EntityMessenger *entity = 0; - - rank.lock.Lock(); - { - if (g_conf.ms_single_dispatch) { - // submit to single dispatch queue - rank._submit_single_dispatch(m); - } else { - if (rank.local.count(m->get_dest())) { - // find entity - entity = rank.local[m->get_dest()]; - } else { - entity = rank.find_unnamed(m->get_dest()); - if (!entity) { - if (rank.stopped.count(m->get_dest())) { - // ignore it - } else { - derr(0) << "pipe(" << peer_addr << ' ' << this << ").reader got message " << *m << " for " << m->get_dest() << ", which isn't local" << endl; - assert(0); // FIXME do this differently - } - } - } - } - } - rank.lock.Unlock(); - - if (entity) - entity->queue_message(m); // queue - } - - - // reap? 
- bool reap = false; - lock.Lock(); - { - reader_running = false; - if (!writer_running) reap = true; - } - lock.Unlock(); - - if (reap) { - dout(20) << "pipe(" << peer_addr << ' ' << this << ").reader queueing for reap" << endl; - ::close(sd); - rank.lock.Lock(); - { - rank.pipe_reap_queue.push_back(this); - rank.wait_cond.Signal(); - } - rank.lock.Unlock(); - } -} - - -/* write msgs to socket. - * also, client. - */ -void Rank::Pipe::writer() -{ - if (!server) { - int rc = connect(); - if (rc < 0) { - derr(1) << "pipe(" << peer_addr << ' ' << this << ").writer error connecting, " - << errno << ": " << strerror(errno) - << endl; - done = true; - list out; - fail(out); - } - } - - // loop. - lock.Lock(); - while (!q.empty() || !done) { - - if (!q.empty()) { - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer grabbing message(s)" << endl; - - // grab outgoing list - list out; - out.swap(q); - - // drop lock while i send these - lock.Unlock(); - - while (!out.empty()) { - Message *m = out.front(); - out.pop_front(); - - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer sending " << *m << endl; - - // stamp. - m->set_source_addr(rank.my_addr); - - // marshall - if (m->empty_payload()) - m->encode_payload(); - - if (write_message(m) < 0) { - // failed! - derr(1) << "pipe(" << peer_addr << ' ' << this << ").writer error sending " << *m << " to " << m->get_dest() - << ", " << errno << ": " << strerror(errno) - << endl; - out.push_front(m); - fail(out); - done = true; - break; - } - - // did i just send a close? - if (m->get_type() == MSG_CLOSE) - done = true; - - // clean up - delete m; - } - - lock.Lock(); - continue; - } - - // wait - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer sleeping" << endl; - cond.Wait(lock); - } - lock.Unlock(); - - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer finishing" << endl; - - // reap? 
- bool reap = false; - lock.Lock(); - { - writer_running = false; - if (!reader_running) reap = true; - } - lock.Unlock(); - - if (reap) { - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer queueing for reap" << endl; - ::close(sd); - rank.lock.Lock(); - { - rank.pipe_reap_queue.push_back(this); - rank.wait_cond.Signal(); - } - rank.lock.Unlock(); - } -} - - -Message *Rank::Pipe::read_message() -{ - // envelope - //dout(10) << "receiver.read_message from sd " << sd << endl; - - msg_envelope_t env; - if (!tcp_read( sd, (char*)&env, sizeof(env) )) { - need_to_send_close = false; - return 0; - } - - dout(20) << "pipe(" << peer_addr << ' ' << this << ").reader got envelope type=" << env.type - << " src " << env.src << " dst " << env.dst - << " nchunks=" << env.nchunks - << endl; - - // payload - bufferlist blist; - for (int i=0; iget_source() << endl; - - return m; -} - - - -int Rank::Pipe::write_message(Message *m) -{ - // get envelope, buffers - msg_envelope_t *env = &m->get_envelope(); - bufferlist blist; - blist.claim( m->get_payload() ); - -#ifdef TCP_KEEP_CHUNKS - env->nchunks = blist.buffers().size(); -#else - env->nchunks = 1; -#endif - - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer sending " << m << " " << *m - << " to " << m->get_dest() - << endl; - - // send envelope - int r = tcp_write( sd, (char*)env, sizeof(*env) ); - if (r < 0) { - derr(1) << "pipe(" << peer_addr << ' ' << this << ").writer error sending envelope for " << *m - << " to " << m->get_dest() << endl; - need_to_send_close = false; - return -1; - } - - // payload -#ifdef TCP_KEEP_CHUNKS - // send chunk-wise - int i = 0; - for (list::const_iterator it = blist.buffers().begin(); - it != blist.buffers().end(); - it++) { - dout(10) << "pipe(" << peer_addr << ' ' << this << ").writer tcp_sending frag " << i << " len " << (*it).length() << endl; - int size = (*it).length(); - r = tcp_write( sd, (char*)&size, sizeof(size) ); - if (r < 0) { - derr(10) << "pipe(" << peer_addr 
<< ' ' << this << ").writer error sending chunk len for " << *m << " to " << m->get_dest() << endl; - need_to_send_close = false; - return -1; - } - r = tcp_write( sd, (*it).c_str(), size ); - if (r < 0) { - derr(10) << "pipe(" << peer_addr << ' ' << this << ").writer error sending data chunk for " << *m << " to " << m->get_dest() << endl; - need_to_send_close = false; - return -1; - } - i++; - } -#else - // one big chunk - int size = blist.length(); - r = tcp_write( sd, (char*)&size, sizeof(size) ); - if (r < 0) { - derr(10) << "pipe(" << peer_addr << ' ' << this << ").writer error sending data len for " << *m << " to " << m->get_dest() << endl; - need_to_send_close = false; - return -1; - } - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer data len is " << size << " in " << blist.buffers().size() << " buffers" << endl; - - for (list::const_iterator it = blist.buffers().begin(); - it != blist.buffers().end(); - it++) { - if ((*it).length() == 0) continue; // blank buffer. - r = tcp_write( sd, (char*)(*it).c_str(), (*it).length() ); - if (r < 0) { - derr(10) << "pipe(" << peer_addr << ' ' << this << ").writer error sending data megachunk for " << *m << " to " << m->get_dest() << " : len " << (*it).length() << endl; - need_to_send_close = false; - return -1; - } - } -#endif - - return 0; -} - - -void Rank::Pipe::fail(list& out) -{ - derr(10) << "pipe(" << peer_addr << ' ' << this << ").fail" << endl; - - // FIXME: possible race before i reclaim lock here? - - // deactivate myself - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_addr) && - rank.rank_pipe[peer_addr] == this) - rank.rank_pipe.erase(peer_addr); - } - rank.lock.Unlock(); - - // what do i do about reader()? FIXME - - // sort my messages by (source) dispatcher, dest. 
- map > > by_dis; - lock.Lock(); - { - // include out at front of queue - q.splice(q.begin(), out); - - // sort - while (!q.empty()) { - if (q.front()->get_type() == MSG_CLOSE) { - delete q.front(); - } - else if (rank.local.count(q.front()->get_source())) { - EntityMessenger *mgr = rank.local[q.front()->get_source()]; - Dispatcher *dis = mgr->get_dispatcher(); - if (mgr->is_stopped()) { - // ignore. - dout(1) << "pipe(" << peer_addr << ' ' << this << ").fail on " << *q.front() << ", dispatcher stopping, ignoring." << endl; - delete q.front(); - } else { - by_dis[dis][q.front()->get_dest()].push_back(q.front()); - } - } - else { - // oh well. sending entity musta just shut down? - assert(0); - delete q.front(); - } - q.pop_front(); - } - } - lock.Unlock(); - - // report failure(s) to dispatcher(s) - for (map > >::iterator i = by_dis.begin(); - i != by_dis.end(); - ++i) - for (map >::iterator j = i->second.begin(); - j != i->second.end(); - ++j) - for (list::iterator k = j->second.begin(); - k != j->second.end(); - ++k) { - derr(1) << "pipe(" << peer_addr << ' ' << this << ").fail on " << **k << " to " << (*k)->get_dest_inst() << endl; - i->first->ms_handle_failure(*k, (*k)->get_dest_inst()); - } -} - - - - - - -/******************************************** - * Rank - */ - -Rank::Rank() : - single_dispatcher(this) { - // default to any listen_addr - memset((char*)&listen_addr, 0, sizeof(listen_addr)); - listen_addr.sin_family = AF_INET; - listen_addr.sin_addr.s_addr = htonl(INADDR_ANY); - listen_addr.sin_port = 0; -} -Rank::~Rank() -{ -} - -/* -void Rank::set_listen_addr(tcpaddr_t& a) -{ - dout(10) << "set_listen_addr " << a << endl; - memcpy((char*)&listen_addr.sin_addr.s_addr, (char*)&a.sin_addr.s_addr, 4); - listen_addr.sin_port = a.sin_port; -} -*/ - -void Rank::_submit_single_dispatch(Message *m) -{ - assert(lock.is_locked()); - - if (local.count(m->get_dest()) && - local[m->get_dest()]->is_ready()) { - rank.single_dispatch_queue.push_back(m); - 
rank.single_dispatch_cond.Signal(); - } else { - waiting_for_ready[m->get_dest()].push_back(m); - } -} - - -void Rank::single_dispatcher_entry() -{ - lock.Lock(); - while (!single_dispatch_stop || !single_dispatch_queue.empty()) { - if (!single_dispatch_queue.empty()) { - list ls; - ls.swap(single_dispatch_queue); - - lock.Unlock(); - { - while (!ls.empty()) { - Message *m = ls.front(); - ls.pop_front(); - - dout(1) << m->get_dest() - << " <-- " << m->get_source_inst() - << " ---- " << *m - << " -- " << m - << endl; - - assert(local.count(m->get_dest())); - local[m->get_dest()]->dispatch(m); - } - } - lock.Lock(); - continue; - } - single_dispatch_cond.Wait(lock); - } - lock.Unlock(); -} - - -/* - * note: assumes lock is held - */ -void Rank::reaper() -{ - dout(10) << "reaper" << endl; - assert(lock.is_locked()); - - while (!pipe_reap_queue.empty()) { - Pipe *p = pipe_reap_queue.front(); - dout(10) << "reaper reaping pipe " << p->get_peer_addr() << endl; - pipe_reap_queue.pop_front(); - assert(pipes.count(p)); - pipes.erase(p); - p->join(); - dout(10) << "reaper reaped pipe " << p->get_peer_addr() << endl; - delete p; - } -} - - -int Rank::start_rank() -{ - dout(10) << "start_rank" << endl; - - // bind to a socket - if (accepter.start() < 0) - return -1; - - // start single thread dispatcher? - if (g_conf.ms_single_dispatch) { - single_dispatch_stop = false; - single_dispatcher.create(); - } - - lock.Lock(); - - dout(1) << "start_rank at " << listen_addr << endl; - - lock.Unlock(); - return 0; -} - - - -/* connect_rank - * NOTE: assumes rank.lock held. 
- */ -Rank::Pipe *Rank::connect_rank(const entity_addr_t& addr) -{ - assert(rank.lock.is_locked()); - assert(addr != rank.my_addr); - - dout(10) << "connect_rank to " << addr << endl; - - // create pipe - Pipe *pipe = new Pipe(addr); - rank.rank_pipe[addr] = pipe; - pipes.insert(pipe); - - return pipe; -} - - - - - - -Rank::EntityMessenger *Rank::find_unnamed(entity_name_t a) -{ - // find an unnamed local entity of the right type - for (map::iterator p = local.begin(); - p != local.end(); - ++p) { - if (p->first.type() == a.type() && p->first.is_new()) - return p->second; - } - return 0; -} - - - - -/* register_entity - */ -Rank::EntityMessenger *Rank::register_entity(entity_name_t name) -{ - dout(10) << "register_entity " << name << endl; - lock.Lock(); - - // create messenger - EntityMessenger *msgr = new EntityMessenger(name); - - // add to directory - assert(local.count(name) == 0); - local[name] = msgr; - - lock.Unlock(); - return msgr; -} - - -void Rank::unregister_entity(EntityMessenger *msgr) -{ - lock.Lock(); - dout(10) << "unregister_entity " << msgr->get_myname() << endl; - - // remove from local directory. - entity_name_t name = msgr->get_myname(); - assert(local.count(name)); - local.erase(name); - - stopped.insert(name); - wait_cond.Signal(); - - lock.Unlock(); -} - - -void Rank::submit_message(Message *m, const entity_addr_t& dest_addr) -{ - const entity_name_t dest = m->get_dest(); - - // lookup - EntityMessenger *entity = 0; - Pipe *pipe = 0; - - lock.Lock(); - { - // local? - if (dest_addr == my_addr) { - if (local.count(dest)) { - // local - dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl; - if (g_conf.ms_single_dispatch) { - _submit_single_dispatch(m); - } else { - entity = local[dest]; - } - } else { - derr(0) << "submit_message " << *m << " dest " << dest << " " << dest_addr << " local but not in local map?" << endl; - //assert(0); // hmpf, this is probably mds->mon beacon from newsyn. - } - } - else { - // remote. 
- if (rank_pipe.count( dest_addr )) { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_addr << ", already connected." << endl; - // connected. - pipe = rank_pipe[ dest_addr ]; - } else { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_addr << ", connecting." << endl; - // not connected. - pipe = connect_rank( dest_addr ); - } - } - } - lock.Unlock(); - - // do it - if (entity) { - // local! - dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl; - entity->queue_message(m); - } - else if (pipe) { - // remote! - dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl; - pipe->send(m); - } -} - - - - - -void Rank::wait() -{ - lock.Lock(); - while (1) { - // reap dead pipes - reaper(); - - if (local.empty()) { - dout(10) << "wait: everything stopped" << endl; - break; // everything stopped. - } else { - dout(10) << "wait: local still has " << local.size() << " items, waiting" << endl; - } - - wait_cond.Wait(lock); - } - lock.Unlock(); - - // done! clean up. - - //dout(10) << "wait: stopping accepter thread" << endl; - //accepter.stop(); - - // stop dispatch thread - if (g_conf.ms_single_dispatch) { - dout(10) << "wait: stopping dispatch thread" << endl; - lock.Lock(); - single_dispatch_stop = true; - single_dispatch_cond.Signal(); - lock.Unlock(); - single_dispatcher.join(); - } - - // reap pipes - lock.Lock(); - { - dout(10) << "wait: closing pipes" << endl; - list toclose; - for (hash_map::iterator i = rank_pipe.begin(); - i != rank_pipe.end(); - i++) - toclose.push_back(i->second); - for (list::iterator i = toclose.begin(); - i != toclose.end(); - i++) - (*i)->close(); - - dout(10) << "wait: waiting for pipes " << pipes << " to close" << endl; - while (!pipes.empty()) { - wait_cond.Wait(lock); - reaper(); - } - } - lock.Unlock(); - - dout(10) << "wait: done." 
<< endl; -} - - - - - - -/********************************** - * EntityMessenger - */ - -Rank::EntityMessenger::EntityMessenger(entity_name_t myaddr) : - Messenger(myaddr), - stop(false), - dispatch_thread(this) -{ -} -Rank::EntityMessenger::~EntityMessenger() -{ -} - -void Rank::EntityMessenger::dispatch_entry() -{ - lock.Lock(); - while (!stop) { - if (!dispatch_queue.empty()) { - list ls; - ls.swap(dispatch_queue); - - lock.Unlock(); - { - // deliver - while (!ls.empty()) { - if (stop) { - dout(1) << "dispatch: stop=true, discarding " << ls.size() - << " messages in dispatch queue" << endl; - break; - } - Message *m = ls.front(); - ls.pop_front(); - dout(1) << m->get_dest() - << " <-- " << m->get_source_inst() - << " ---- " << *m - << " -- " << m - << endl; - dispatch(m); - } - } - lock.Lock(); - continue; - } - cond.Wait(lock); - } - lock.Unlock(); - - // deregister - rank.unregister_entity(this); -} - -void Rank::EntityMessenger::ready() -{ - dout(10) << "ready " << get_myaddr() << endl; - - if (g_conf.ms_single_dispatch) { - rank.lock.Lock(); - if (rank.waiting_for_ready.count(get_myname())) { - rank.single_dispatch_queue.splice(rank.single_dispatch_queue.end(), - rank.waiting_for_ready[get_myname()]); - rank.waiting_for_ready.erase(get_myname()); - rank.single_dispatch_cond.Signal(); - } - rank.lock.Unlock(); - } else { - // start my dispatch thread - dispatch_thread.create(); - } -} - - -int Rank::EntityMessenger::shutdown() -{ - dout(10) << "shutdown " << get_myaddr() << endl; - - // stop my dispatch thread - if (dispatch_thread.am_self()) { - dout(1) << "shutdown i am dispatch, setting stop flag" << endl; - stop = true; - } else { - dout(1) << "shutdown i am not dispatch, setting stop flag and joining thread." 
<< endl; - lock.Lock(); - stop = true; - cond.Signal(); - lock.Unlock(); - dispatch_thread.join(); - } - - return 0; -} - - -void Rank::EntityMessenger::prepare_dest(const entity_addr_t& addr) -{ - rank.lock.Lock(); - { - if (rank.rank_pipe.count(addr) == 0) - rank.connect_rank(addr); - } - rank.lock.Unlock(); -} - -int Rank::EntityMessenger::send_message(Message *m, entity_inst_t dest, - int port, int fromport) -{ - // set envelope - m->set_source(get_myname(), fromport); - m->set_source_addr(rank.my_addr); - m->set_dest_inst(dest); - m->set_dest_port(port); - - dout(1) << m->get_source() - << " --> " << dest.name << " " << dest.addr - << " -- " << *m - << " -- " << m - << endl; - - rank.submit_message(m, dest.addr); - - return 0; -} - - - -const entity_addr_t &Rank::EntityMessenger::get_myaddr() -{ - return rank.my_addr; -} - - -void Rank::EntityMessenger::reset_myname(entity_name_t newname) -{ - entity_name_t oldname = get_myname(); - dout(10) << "reset_myname " << oldname << " to " << newname << endl; - - rank.local.erase(oldname); - rank.local[newname] = this; - - _set_myname(newname); -} - - - - -void Rank::EntityMessenger::mark_down(entity_addr_t a) -{ - rank.mark_down(a); -} - -void Rank::mark_down(entity_addr_t addr) -{ - //if (my_rank == 0) return; // ugh.. rank0 already handles this stuff in the namer - lock.Lock(); - /* - if (entity_map.count(a) && - entity_map[a] > inst) { - dout(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl; - derr(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl; - // do nothing! - } else { - if (entity_map.count(a) == 0) { - // don't know it - dout(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl; - derr(10) << "mark_down " << a << " inst " << inst << " ... 
unknown by me" << endl; - } else { - // know it - assert(entity_map[a] <= inst); - dout(10) << "mark_down " << a << " inst " << inst << endl; - derr(10) << "mark_down " << a << " inst " << inst << endl; - - entity_map.erase(a); - - if (rank_pipe.count(inst)) { - rank_pipe[inst]->close(); - rank_pipe.erase(inst); - } - } - } - */ - lock.Unlock(); -} - - diff --git a/branches/marnberg/quota/msg/SimpleMessenger.h b/branches/marnberg/quota/msg/SimpleMessenger.h deleted file mode 100644 index e1265423edb13..0000000000000 --- a/branches/marnberg/quota/msg/SimpleMessenger.h +++ /dev/null @@ -1,293 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __SIMPLEMESSENGER_H -#define __SIMPLEMESSENGER_H - - -#include -#include -using namespace std; -#include -#include -using namespace __gnu_cxx; - - -#include "include/types.h" - -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/Thread.h" - -#include "Messenger.h" -#include "Message.h" -#include "tcp.h" - - - - -/* Rank - per-process - */ -class Rank { -public: - void sigint(); - -private: - class EntityMessenger; - class Pipe; - - // incoming - class Accepter : public Thread { - public: - bool done; - - int listen_sd; - - Accepter() : done(false) {} - - void *entry(); - void stop() { - done = true; - ::close(listen_sd); - join(); - } - int start(); - } accepter; - - void sigint(int r); - - - // pipe - class Pipe { - protected: - int sd; - bool done; - entity_addr_t peer_addr; - bool server; - bool need_to_send_close; - - bool reader_running; - bool writer_running; - - list q; - Mutex lock; - Cond cond; - - int accept(); // server handshake - int connect(); // 
client handshake - void reader(); - void writer(); - - Message *read_message(); - int write_message(Message *m); - void fail(list& ls); - - // threads - class Reader : public Thread { - Pipe *pipe; - public: - Reader(Pipe *p) : pipe(p) {} - void *entry() { pipe->reader(); return 0; } - } reader_thread; - friend class Reader; - - class Writer : public Thread { - Pipe *pipe; - public: - Writer(Pipe *p) : pipe(p) {} - void *entry() { pipe->writer(); return 0; } - } writer_thread; - friend class Writer; - - public: - Pipe(int s) : sd(s), - done(false), server(true), - need_to_send_close(true), - reader_running(false), writer_running(false), - reader_thread(this), writer_thread(this) { - // server - reader_running = true; - reader_thread.create(); - } - Pipe(const entity_addr_t &pi) : sd(0), - done(false), peer_addr(pi), server(false), - need_to_send_close(true), - reader_running(false), writer_running(false), - reader_thread(this), writer_thread(this) { - // client - writer_running = true; - writer_thread.create(); - } - - // public constructors - static const Pipe& Server(int s); - static const Pipe& Client(const entity_addr_t& pi); - - entity_addr_t& get_peer_addr() { return peer_addr; } - - void close(); - void join() { - writer_thread.join(); - reader_thread.join(); - } - - void send(Message *m) { - lock.Lock(); - q.push_back(m); - cond.Signal(); - lock.Unlock(); - } - void send(list& ls) { - lock.Lock(); - q.splice(q.end(), ls); - cond.Signal(); - lock.Unlock(); - } - }; - - - - // messenger interface - class EntityMessenger : public Messenger { - Mutex lock; - Cond cond; - list dispatch_queue; - bool stop; - - class DispatchThread : public Thread { - EntityMessenger *m; - public: - DispatchThread(EntityMessenger *_m) : m(_m) {} - void *entry() { - m->dispatch_entry(); - return 0; - } - } dispatch_thread; - void dispatch_entry(); - - public: - void queue_message(Message *m) { - lock.Lock(); - dispatch_queue.push_back(m); - cond.Signal(); - lock.Unlock(); - } - 
void queue_messages(list ls) { - lock.Lock(); - dispatch_queue.splice(dispatch_queue.end(), ls); - cond.Signal(); - lock.Unlock(); - } - - public: - EntityMessenger(entity_name_t myaddr); - ~EntityMessenger(); - - void ready(); - bool is_stopped() { return stop; } - - void wait() { - dispatch_thread.join(); - } - - const entity_addr_t &get_myaddr(); - - void reset_myname(entity_name_t m); - - int shutdown(); - void prepare_dest(const entity_addr_t& addr); - int send_message(Message *m, entity_inst_t dest, - int port=0, int fromport=0); - - void mark_down(entity_addr_t a); - void mark_up(entity_name_t a, entity_addr_t& i); - }; - - - class SingleDispatcher : public Thread { - Rank *rank; - public: - SingleDispatcher(Rank *r) : rank(r) {} - void *entry() { - rank->single_dispatcher_entry(); - return 0; - } - } single_dispatcher; - - Cond single_dispatch_cond; - bool single_dispatch_stop; - list single_dispatch_queue; - - map > waiting_for_ready; - - void single_dispatcher_entry(); - void _submit_single_dispatch(Message *m); - - - // Rank stuff - public: - Mutex lock; - Cond wait_cond; // for wait() - - // where i listen - tcpaddr_t listen_addr; - entity_addr_t my_addr; - - // local - map local; - set stopped; - //hash_set entity_unstarted; - - // remote - hash_map rank_pipe; - - set pipes; - list pipe_reap_queue; - - Pipe *connect_rank(const entity_addr_t& addr); - - void mark_down(entity_addr_t addr); - //void mark_up(entity_name_t addr, entity_addr_t& i); - - tcpaddr_t get_listen_addr() { return listen_addr; } - - void reaper(); - - EntityMessenger *find_unnamed(entity_name_t a); - -public: - Rank(); - ~Rank(); - - //void set_listen_addr(tcpaddr_t& a); - - int start_rank(); - void wait(); - - EntityMessenger *register_entity(entity_name_t addr); - void rename_entity(EntityMessenger *ms, entity_name_t newaddr); - void unregister_entity(EntityMessenger *ms); - - void submit_message(Message *m, const entity_addr_t& addr); - void prepare_dest(const entity_addr_t& 
addr); - - // create a new messenger - EntityMessenger *new_entity(entity_name_t addr); - -} ; - - - -extern Rank rank; - -#endif diff --git a/branches/marnberg/quota/msg/TCPDirectory.cc b/branches/marnberg/quota/msg/TCPDirectory.cc deleted file mode 100644 index 57000ac30d74c..0000000000000 --- a/branches/marnberg/quota/msg/TCPDirectory.cc +++ /dev/null @@ -1,178 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "TCPDirectory.h" - -#include "messages/MNSConnect.h" -#include "messages/MNSConnectAck.h" -#include "messages/MNSRegister.h" -#include "messages/MNSRegisterAck.h" -#include "messages/MNSLookup.h" -#include "messages/MNSLookupReply.h" -//#include "messages/MNSUnregister.h" - -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_ns) cout << "nameserver: " - -void tcp_open(int rank); - - -void TCPDirectory::handle_connect(MNSConnect *m) -{ - int rank = nrank++; - dout(2) << "connect from new rank " << rank << " " << m->get_addr() << endl; - - dir[MSG_ADDR_RANK(rank)] = rank; - messenger->map_entity_rank(MSG_ADDR_RANK(rank), rank); - - rank_addr[rank] = m->get_addr(); - messenger->map_rank_addr(rank, m->get_addr()); - - messenger->send_message(new MNSConnectAck(rank), - MSG_ADDR_RANK(rank)); - delete m; -} - - - -void TCPDirectory::handle_register(MNSRegister *m) -{ - dout(10) << "register from rank " << m->get_rank() << " addr " << MSG_ADDR_NICE(m->get_entity()) << endl; - - // pick id - int rank = m->get_rank(); - entity_name_t entity = m->get_entity(); - - if (entity.is_new()) { - // make up a new address! 
- switch (entity.type()) { - - case MSG_ADDR_RANK_BASE: // stupid client should be able to figure this out - entity = MSG_ADDR_RANK(rank); - break; - - case MSG_ADDR_MDS_BASE: - entity = MSG_ADDR_MDS(nmds++); - break; - - case MSG_ADDR_OSD_BASE: - entity = MSG_ADDR_OSD(nosd++); - break; - - case MSG_ADDR_CLIENT_BASE: - entity = MSG_ADDR_CLIENT(nclient++); - break; - - default: - assert(0); - } - } else { - // specific address! - assert(dir.count(entity) == 0); // make sure it doesn't exist yet. - } - - dout(2) << "registered " << MSG_ADDR_NICE(entity) << endl; - - // register - dir[entity] = rank; - - if (entity == MSG_ADDR_RANK(rank)) // map this locally now so we can reply - messenger->map_entity_rank(entity, rank); // otherwise wait until they send STARTED msg - - hold.insert(entity); - - ++version; - update_log[version] = entity; - - // reply w/ new id - messenger->send_message(new MNSRegisterAck(m->get_tid(), entity), - MSG_ADDR_RANK(rank)); - delete m; -} - -void TCPDirectory::handle_started(Message *m) -{ - entity_name_t entity = m->get_source(); - - dout(3) << "start signal from " << MSG_ADDR_NICE(entity) << endl; - hold.erase(entity); - messenger->map_entity_rank(entity, dir[entity]); - - // waiters? - if (waiting.count(entity)) { - list ls; - ls.splice(ls.begin(), waiting[entity]); - waiting.erase(entity); - - dout(10) << "doing waiter on " << MSG_ADDR_NICE(entity) << endl; - for (list::iterator it = ls.begin(); - it != ls.end(); - it++) { - dispatch(*it); - } - } -} - -void TCPDirectory::handle_unregister(Message *m) -{ - entity_name_t who = m->get_source(); - dout(2) << "unregister from entity " << MSG_ADDR_NICE(who) << endl; - - assert(dir.count(who)); - dir.erase(who); - - // shutdown? 
- if (dir.size() <= 2) { - dout(2) << "dir is empty except for me, shutting down" << endl; - tcpmessenger_stop_nameserver(); - } - else { - if (0) { - dout(10) << "dir size now " << dir.size() << endl; - for (hash_map::iterator it = dir.begin(); - it != dir.end(); - it++) { - dout(10) << " dir: " << MSG_ADDR_NICE(it->first) << " on rank " << it->second << endl; - } - } - } - -} - - -void TCPDirectory::handle_lookup(MNSLookup *m) -{ - // have it? - if (dir.count(m->get_entity()) == 0 || - hold.count(m->get_entity())) { - dout(2) << MSG_ADDR_NICE(m->get_source()) << " lookup '" << MSG_ADDR_NICE(m->get_entity()) << "' -> dne or on hold" << endl; - waiting[m->get_entity()].push_back(m); - return; - } - - // look it up! - MNSLookupReply *reply = new MNSLookupReply(m); - - int rank = dir[m->get_entity()]; - reply->entity_map[m->get_entity()] = rank; - reply->rank_addr[rank] = rank_addr[rank]; - - dout(2) << MSG_ADDR_NICE(m->get_source()) << " lookup '" << MSG_ADDR_NICE(m->get_entity()) << "' -> rank " << rank << endl; - - messenger->send_message(reply, - m->get_source(), m->get_source_port()); - delete m; -} diff --git a/branches/marnberg/quota/msg/TCPDirectory.h b/branches/marnberg/quota/msg/TCPDirectory.h deleted file mode 100644 index 7f450e9a64be5..0000000000000 --- a/branches/marnberg/quota/msg/TCPDirectory.h +++ /dev/null @@ -1,110 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __TCPDIRECTORY_H -#define __TCPDIRECTORY_H - -/* - * rank -- a process (listening on some host:port) - * entity -- a logical entity (osd123, mds3, client3245, etc.) - * - * multiple entities can coexist on a single rank. 
- */ - -#include "Dispatcher.h" -#include "TCPMessenger.h" - -#include -using namespace std; -#include -using namespace __gnu_cxx; - -#include -//#include -#include - -class TCPDirectory : public Dispatcher { - protected: - // how i communicate - TCPMessenger *messenger; - - // directory - hash_map dir; // entity -> rank - hash_map rank_addr; // rank -> ADDR (e.g. host:port) - - __uint64_t version; - map<__uint64_t, entity_name_t> update_log; - - int nrank; - int nclient, nmds, nosd; - - set hold; - map > waiting; - - // messages - void handle_connect(class MNSConnect*); - void handle_register(class MNSRegister *m); - void handle_started(Message *m); - void handle_lookup(class MNSLookup *m); - void handle_unregister(Message *m); - - public: - TCPDirectory(TCPMessenger *m) : - messenger(m), - version(0), - nrank(0), nclient(0), nmds(0), nosd(0) { - messenger->set_dispatcher(this); - - // i am rank 0! - dir[MSG_ADDR_DIRECTORY] = 0; - rank_addr[0] = m->get_tcpaddr(); - ++nrank; - - // announce nameserver - cout << "export CEPH_NAMESERVER=" << m->get_tcpaddr() << endl; - - int fd = ::open(".ceph_ns", O_WRONLY|O_CREAT); - ::write(fd, (void*)&m->get_tcpaddr(), sizeof(tcpaddr_t)); - ::fchmod(fd, 0755); - ::close(fd); - } - ~TCPDirectory() { - ::unlink(".ceph_ns"); - } - - void dispatch(Message *m) { - switch (m->get_type()) { - case MSG_NS_CONNECT: - handle_connect((class MNSConnect*)m); - break; - case MSG_NS_REGISTER: - handle_register((class MNSRegister*)m); - break; - case MSG_NS_STARTED: - handle_started(m); - break; - case MSG_NS_UNREGISTER: - handle_unregister(m); - break; - case MSG_NS_LOOKUP: - handle_lookup((class MNSLookup*)m); - break; - - default: - assert(0); - } - } -}; - -#endif diff --git a/branches/marnberg/quota/msg/TCPMessenger.cc b/branches/marnberg/quota/msg/TCPMessenger.cc deleted file mode 100644 index f40ea9b162e6b..0000000000000 --- a/branches/marnberg/quota/msg/TCPMessenger.cc +++ /dev/null @@ -1,1454 +0,0 @@ -// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "config.h" -#include "include/error.h" - -#include "common/Timer.h" -#include "common/Mutex.h" - -#include "TCPMessenger.h" -#include "Message.h" - -#include -#include -using namespace std; -#include -using namespace __gnu_cxx; - -#include -# include -# include -# include -# include -#include -#include -#include -#include - -#include - -#include "messages/MGenericMessage.h" -#include "messages/MNSConnect.h" -#include "messages/MNSConnectAck.h" -#include "messages/MNSRegister.h" -#include "messages/MNSRegisterAck.h" -#include "messages/MNSLookup.h" -#include "messages/MNSLookupReply.h" - -#include "TCPDirectory.h" - -#include "common/Logger.h" - -#define DBL 18 - -//#define TCP_SERIALMARSHALL // do NOT turn this off until you check messages/* encode_payload methods -//#define TCP_SERIALOUT // be paranoid/annoying and send messages in same thread - - -TCPMessenger *rankmessenger = 0; // - -TCPDirectory *nameserver = 0; // only defined on rank 0 -TCPMessenger *nsmessenger = 0; - - -/***************************/ -LogType rank_logtype; -Logger *logger; - -int stat_num = 0; -off_t stat_inq = 0, stat_inqb = 0; -off_t stat_disq = 0, stat_disqb = 0; -off_t stat_outq = 0, stat_outqb = 0; -/***************************/ - - -// local directory -hash_map directory; // local -hash_set directory_ready; -Mutex directory_lock; - -// connecting -struct sockaddr_in listen_addr; // my listen addr -int listen_sd = 0; -int my_rank = -1; -Cond waiting_for_rank; - -// register -long regid = 0; -map waiting_for_register_cond; -map waiting_for_register_result; - -// incoming messages -list incoming; -Mutex incoming_lock; 
-Cond incoming_cond; - -// outgoing messages -/* -list outgoing; -Mutex outgoing_lock; -Cond outgoing_cond; -*/ - -class OutThread : public Thread { -public: - Mutex lock; - Cond cond; - list q; - bool done; - - OutThread() : done(false) {} - virtual ~OutThread() {} - - void *entry(); - - void stop() { - lock.Lock(); - done = true; - cond.Signal(); - lock.Unlock(); - join(); - } - - void send(Message *m) { - lock.Lock(); - q.push_back(m); - cond.Signal(); - lock.Unlock(); - } -} single_out_thread; - -Mutex lookup_lock; // -hash_map entity_rank; // entity -> rank -hash_map rank_sd; // outgoing sockets, rank -> sd -hash_map rank_out; -hash_map rank_addr; // rank -> tcpaddr -map > waiting_for_lookup; - - -/* this process */ -bool tcp_done = false; // set this flag to stop the event loop - - -// threads -pthread_t dispatch_thread_id = 0; // thread id of the event loop. init value == nobody -pthread_t out_thread_id = 0; // thread id of the event loop. init value == nobody -pthread_t listen_thread_id = 0; -map in_threads; // sd -> threadid - -//bool pending_timer = false; - -// per-rank fun - - -// debug -#undef dout -#define dout(l) if (l<=g_conf.debug) cout << "[TCP " << my_rank /*(<< " " << getpid() << "." << pthread_self() */ << "] " - - -#include "tcp.cc" - -// some declarations -void tcp_open(int rank); -int tcp_send(Message *m); -void tcpmessenger_kick_dispatch_loop(); -OutThread *tcp_lookup(Message *m); - -int tcpmessenger_get_rank() -{ - return my_rank; -} - - -int tcpmessenger_findns(tcpaddr_t &nsa) -{ - char *nsaddr = 0; - bool have_nsa = false; - - // env var? - /*int e_len = 0; - for (int i=0; envp[i]; i++) - e_len += strlen(envp[i]) + 1; - */ - nsaddr = getenv("CEPH_NAMESERVER");////envz_entry(*envp, e_len, "CEPH_NAMESERVER"); - if (nsaddr) { - while (nsaddr[0] != '=') nsaddr++; - nsaddr++; - } - - else { - // file? 
- int fd = ::open(".ceph_ns",O_RDONLY); - if (fd > 0) { - ::read(fd, (void*)&nsa, sizeof(nsa)); - ::close(fd); - have_nsa = true; - nsaddr = "from .ceph_ns"; - } - } - - if (!nsaddr && !have_nsa) { - cerr << "i need ceph ns addr.. either CEPH_NAMESERVER env var or --ns blah" << endl; - return -1; - //exit(-1); - } - - // look up nsaddr? - if (!have_nsa && tcpmessenger_lookup(nsaddr, nsa) < 0) { - return -1; - } - - dout(2) << "ceph ns is " << nsaddr << " or " << nsa << endl; - return 0; -} - - - -/** rankserver - * - * one per rank. handles entity->rank lookup replies. - */ - -class RankServer : public Dispatcher { -public: - void dispatch(Message *m) { - lookup_lock.Lock(); - - dout(DBL) << "rankserver dispatching " << *m << endl; - - switch (m->get_type()) { - case MSG_NS_CONNECTACK: - handle_connect_ack((MNSConnectAck*)m); - break; - - case MSG_NS_REGISTERACK: - handle_register_ack((MNSRegisterAck*)m); - break; - - case MSG_NS_LOOKUPREPLY: - handle_lookup_reply((MNSLookupReply*)m); - break; - - default: - assert(0); - } - - lookup_lock.Unlock(); - } - - void handle_connect_ack(MNSConnectAck *m) { - dout(DBL) << "my rank is " << m->get_rank(); - my_rank = m->get_rank(); - - // now that i know my rank, - entity_rank[MSG_ADDR_RANK(my_rank)] = my_rank; - rank_addr[my_rank] = listen_addr; - - waiting_for_rank.SignalAll(); - - delete m; - - // logger! 
- dout(DBL) << "logger" << endl; - char names[100]; - sprintf(names, "rank%d", my_rank); - string name = names; - - if (g_conf.tcp_log) { - logger = new Logger(name, (LogType*)&rank_logtype); - rank_logtype.add_set("num"); - rank_logtype.add_inc("in"); - rank_logtype.add_inc("inb"); - rank_logtype.add_inc("dis"); - rank_logtype.add_set("inq"); - rank_logtype.add_set("inqb"); - rank_logtype.add_set("outq"); - rank_logtype.add_set("outqb"); - } - - } - - void handle_register_ack(MNSRegisterAck *m) { - long tid = m->get_tid(); - waiting_for_register_result[tid] = m->get_entity(); - waiting_for_register_cond[tid]->Signal(); - delete m; - } - - void handle_lookup_reply(MNSLookupReply *m) { - list waiting; - dout(DBL) << "got lookup reply" << endl; - - for (map::iterator it = m->entity_rank.begin(); - it != m->entity_rank.end(); - it++) { - dout(DBL) << "lookup got " << MSG_ADDR_NICE(it->first) << " on rank " << it->second << endl; - entity_rank[it->first] = it->second; - - if (it->second == my_rank) { - // deliver locally - dout(-DBL) << "delivering lookup results locally" << endl; - incoming_lock.Lock(); - - for (list::iterator i = waiting_for_lookup[it->first].begin(); - i != waiting_for_lookup[it->first].end(); - i++) { - stat_inq++; - stat_inqb += (*i)->get_payload().length(); - (*i)->decode_payload(); - incoming.push_back(*i); - } - incoming_cond.Signal(); - incoming_lock.Unlock(); - } else { - // take waiters - waiting.splice(waiting.begin(), waiting_for_lookup[it->first]); - } - waiting_for_lookup.erase(it->first); - - } - - for (map::iterator it = m->rank_addr.begin(); - it != m->rank_addr.end(); - it++) { - dout(DBL) << "lookup got rank " << it->first << " addr " << it->second << endl; - rank_addr[it->first] = it->second; - - // open it now - if (rank_sd.count(it->first) == 0) - tcp_open(it->first); - } - - // send waiting messages -#ifdef TCP_SERIALOUT - for (list::iterator it = waiting.begin(); - it != waiting.end(); - it++) { - OutThread *outt = 
tcp_lookup(*it); - assert(outt); - tcp_send(*it); - } -#else - for (list::iterator it = waiting.begin(); - it != waiting.end(); - it++) { - OutThread *outt = tcp_lookup(*it); - assert(outt); - outt->send(*it); -// dout(0) << "lookup done, splicing in " << *it << endl; - } -#endif - - delete m; - } - -} rankserver; - - -class C_TCPKicker : public Context { - void finish(int r) { - dout(DBL) << "timer kick" << endl; - tcpmessenger_kick_dispatch_loop(); - } -}; - -void TCPMessenger::callback_kick() -{ - tcpmessenger_kick_dispatch_loop(); -} - - -extern int tcpmessenger_lookup(char *str, tcpaddr_t& ta) -{ - char *host = str; - char *port = 0; - - for (int i=0; str[i]; i++) { - if (str[i] == ':') { - port = str+i+1; - str[i] = 0; - break; - } - } - if (!port) { - cerr << "addr '" << str << "' doesn't look like 'host:port'" << endl; - return -1; - } - //cout << "host '" << host << "' port '" << port << "'" << endl; - - int iport = atoi(port); - - struct hostent *myhostname = gethostbyname( host ); - if (!myhostname) { - cerr << "host " << host << " not found" << endl; - return -1; - } - - memset(&ta, 0, sizeof(ta)); - - //cout << "addrtype " << myhostname->h_addrtype << " len " << myhostname->h_length << endl; - - ta.sin_family = myhostname->h_addrtype; - memcpy((char *)&ta.sin_addr, - myhostname->h_addr, - myhostname->h_length); - ta.sin_port = iport; - - cout << "lookup '" << host << ":" << port << "' -> " << ta << endl; - - return 0; -} - - - -/***** - * global methods for process-wide startup, shutdown. 
- */ - -int tcpmessenger_init() -{ - // LISTEN - dout(DBL) << "binding to listen " << endl; - - /* socket creation */ - listen_sd = socket(AF_INET,SOCK_STREAM,0); - assert(listen_sd > 0); - - /* bind to port */ - memset((char*)&listen_addr, 0, sizeof(listen_addr)); - listen_addr.sin_family = AF_INET; - listen_addr.sin_addr.s_addr = htonl(INADDR_ANY); - listen_addr.sin_port = 0; - - int rc = bind(listen_sd, (struct sockaddr *) &listen_addr, sizeof(listen_addr)); - assert(rc >= 0); - - socklen_t llen = sizeof(listen_addr); - getsockname(listen_sd, (sockaddr*)&listen_addr, &llen); - - int myport = listen_addr.sin_port; - - // listen! - rc = ::listen(listen_sd, 1000); - assert(rc >= 0); - - dout(DBL) << "listening on " << myport << endl; - - // my address is... - char host[100]; - gethostname(host, 100); - dout(DBL) << "my hostname is " << host << endl; - - struct hostent *myhostname = gethostbyname( host ); - - struct sockaddr_in my_addr; - memset(&my_addr, 0, sizeof(my_addr)); - - my_addr.sin_family = myhostname->h_addrtype; - memcpy((char *) &my_addr.sin_addr.s_addr, - myhostname->h_addr_list[0], - myhostname->h_length); - my_addr.sin_port = myport; - - listen_addr = my_addr; - - dout(DBL) << "listen addr is " << listen_addr << endl; - - // register to execute timer events - //g_timer.set_messenger_kicker(new C_TCPKicker()); - - - dout(DBL) << "init done" << endl; - return 0; -} - - -// on first rank only -void tcpmessenger_start_nameserver(tcpaddr_t& diraddr) -{ - dout(DBL) << "starting nameserver on " << MSG_ADDR_NICE(MSG_ADDR_DIRECTORY) << endl; - - // i am rank 0. - nsmessenger = new TCPMessenger(MSG_ADDR_DIRECTORY); - - // start name server - nameserver = new TCPDirectory(nsmessenger); - - // diraddr is my addr! 
- diraddr = rank_addr[0] = listen_addr; - my_rank = 0; - entity_rank[MSG_ADDR_DIRECTORY] = 0; -} -void tcpmessenger_stop_nameserver() -{ - if (nsmessenger) { - dout(DBL) << "shutting down nsmessenger" << endl; - TCPMessenger *m = nsmessenger; - nsmessenger = 0; - m->shutdown(); - delete m; - } -} - -// on all ranks -void tcpmessenger_start_rankserver(tcpaddr_t& ns) -{ - // connect to nameserver - entity_rank[MSG_ADDR_DIRECTORY] = 0; - rank_addr[0] = ns; - tcp_open(0); - - if (my_rank >= 0) { - // i know my rank - rankmessenger = new TCPMessenger(MSG_ADDR_RANK(my_rank)); - } else { - // start rank messenger, and discover my rank. - rankmessenger = new TCPMessenger(MSG_ADDR_RANK_NEW); - } -} -void tcpmessenger_stop_rankserver() -{ - if (rankmessenger) { - dout(DBL) << "shutting down rankmessenger" << endl; - rankmessenger->shutdown(); - delete rankmessenger; - rankmessenger = 0; - } -} - - - - - - -int tcpmessenger_shutdown() -{ - dout(DBL) << "tcpmessenger_shutdown barrier" << endl; - - - dout(2) << "tcpmessenger_shutdown closing all sockets etc" << endl; - - // bleh - for (hash_map::iterator it = rank_sd.begin(); - it != rank_sd.end(); - it++) { - ::close(it->second); - } - - return 0; -} - - - - -/*** - * internal send/recv - */ - - - - -/* - * recv a Message* - */ - - - -Message *tcp_recv(int sd) -{ - // envelope - dout(DBL) << "tcp_recv receiving message from sd " << sd << endl; - - msg_envelope_t env; - if (!tcp_read( sd, (char*)&env, sizeof(env) )) - return 0; - - if (env.type == 0) { - dout(DBL) << "got dummy env, bailing" << endl; - return 0; - } - - dout(DBL) << "tcp_recv got envelope type=" << env.type << " src " << MSG_ADDR_NICE(env.source) << " dst " << MSG_ADDR_NICE(env.dest) << " nchunks=" << env.nchunks << endl; - - // payload - bufferlist blist; - for (int i=0; iinc("in"); - logger->inc("inb", s+sizeof(env)); - } - - dout(DBL) << "tcp_recv got " << s << " byte message from " << MSG_ADDR_NICE(m->get_source()) << endl; - - return m; -} - - - - -void 
tcp_open(int rank) -{ - dout(DBL) << "tcp_open to rank " << rank << " at " << rank_addr[rank] << endl; - - // create socket? - int sd = socket(AF_INET,SOCK_STREAM,0); - assert(sd > 0); - - // bind any port - struct sockaddr_in myAddr; - myAddr.sin_family = AF_INET; - myAddr.sin_addr.s_addr = htonl(INADDR_ANY); - myAddr.sin_port = htons( 0 ); - - int rc = bind(sd, (struct sockaddr *) &myAddr, sizeof(myAddr)); - assert(rc>=0); - - // connect! - int r = connect(sd, (sockaddr*)&rank_addr[rank], sizeof(myAddr)); - assert(r >= 0); - - //dout(DBL) << "tcp_open connected to " << who << endl; - assert(rank_sd.count(rank) == 0); - rank_sd[rank] = sd; - - if (g_conf.tcp_multi_out) { - rank_out[rank] = new OutThread(); - rank_out[rank]->create(); - } else { - rank_out[rank] = &single_out_thread; - if (!single_out_thread.is_started()) - single_out_thread.create(); - } -} - - -void tcp_marshall(Message *m) -{ - // marshall - if (m->empty_payload()) - m->encode_payload(); -} - -OutThread *tcp_lookup(Message *m) -{ - entity_name_t addr = m->get_dest(); - - if (!entity_rank.count(m->get_dest())) { - // lookup and wait. - if (waiting_for_lookup.count(addr)) { - dout(DBL) << "already looking up " << MSG_ADDR_NICE(addr) << endl; - } else { - dout(DBL) << "lookup on " << MSG_ADDR_NICE(addr) << " for " << m << endl; - MNSLookup *r = new MNSLookup(addr); - rankmessenger->send_message(r, MSG_ADDR_DIRECTORY); - } - - // add waiter - waiting_for_lookup[addr].push_back(m); - return 0; - } - - int rank = entity_rank[m->get_dest()]; - - if (rank_sd.count(rank) == 0) { // should only happen on rank0? - tcp_open(rank); - } - assert(rank_sd.count(rank)); - m->set_tcp_sd( rank_sd[rank] ); - return rank_out[rank]; -} - - -/* - * send a Message* over the wire. ** do not block **. 
- */ -int tcp_send(Message *m) -{ - /*int rank = entity_rank[m->get_dest()]; - //if (rank_sd.count(rank) == 0) tcp_open(rank); - assert(rank_sd.count(rank)); - - int sd = rank_sd[rank]; - assert(sd); - */ - int sd = m->get_tcp_sd(); - assert(sd); - - // get envelope, buffers - msg_envelope_t *env = &m->get_envelope(); - bufferlist blist; - blist.claim( m->get_payload() ); - -#ifdef TCP_KEEP_CHUNKS - env->nchunks = blist.buffers().size(); -#else - env->nchunks = 1; -#endif - - // HACK osd -> client only - //if (m->get_source() >= MSG_ADDR_OSD(0) && m->get_source() < MSG_ADDR_CLIENT(0) && - // m->get_dest() >= MSG_ADDR_CLIENT(0)) - dout(DBL) << g_clock.now() << " sending " << m << " " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) - //<< " rank " << rank - << " sd " << sd << endl; - - // send envelope - int r = tcp_write( sd, (char*)env, sizeof(*env) ); - if (r < 0) { cerr << "error sending envelope for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << endl; assert(0); } - - // payload -#ifdef TCP_KEEP_CHUNKS - // send chunk-wise - int i = 0; - for (list::iterator it = blist.buffers().begin(); - it != blist.buffers().end(); - it++) { - dout(DBL) << "tcp_sending frag " << i << " len " << (*it).length() << endl; - int size = (*it).length(); - r = tcp_write( sd, (char*)&size, sizeof(size) ); - if (r < 0) { cerr << "error sending chunk len for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << endl; assert(0); } - r = tcp_write( sd, (*it).c_str(), size ); - if (r < 0) { cerr << "error sending data chunk for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << endl; assert(0); } - i++; - } -#else - // one big chunk - int size = blist.length(); - r = tcp_write( sd, (char*)&size, sizeof(size) ); - if (r < 0) { cerr << "error sending data len for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << endl; assert(0); } - for (list::iterator it = blist.buffers().begin(); - it != blist.buffers().end(); - it++) { - r = tcp_write( sd, (*it).c_str(), (*it).length() ); - if (r 
< 0) { cerr << "error sending data megachunk for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << " : len " << (*it).length() << endl; assert(0); } - } -#endif - - // hose message - delete m; - return 0; -} - - - - - -/** tcp_outthread - * this thread watching the outgoing queue, and encodes+sends any queued messages - */ - -void* OutThread::entry() -{ - lock.Lock(); - while (!q.empty() || !done) { - - if (!q.empty()) { - dout(DBL) << "outthread grabbing message(s)" << endl; - - // grab outgoing list - list out; - out.splice(out.begin(), q); - - // drop lock while i send these - lock.Unlock(); - - while (!out.empty()) { - Message *m = out.front(); - out.pop_front(); - - dout(DBL) << "outthread sending " << m << endl; - - if (!g_conf.tcp_serial_marshall) - tcp_marshall(m); - - tcp_send(m); - } - - lock.Lock(); - continue; - } - - // wait - dout(DBL) << "outthread sleeping" << endl; - cond.Wait(lock); - } - dout(DBL) << "outthread done" << endl; - - lock.Unlock(); - return 0; -} - - - -/** tcp_inthread - * read incoming messages from a given peer. - * give received and decoded messages to dispatch loop. 
- */ -void *tcp_inthread(void *r) -{ - int sd = (int)r; - - dout(DBL) << "tcp_inthread reading on sd " << sd << endl; - - while (!tcp_done) { - Message *m = tcp_recv(sd); - if (!m) break; - entity_name_t who = m->get_source(); - - dout(20) << g_clock.now() << " inthread got " << m << " from sd " << sd << " who is " << who << endl; - - // give to dispatch loop - size_t sz = m->get_payload().length(); - - if (g_conf.tcp_multi_dispatch) { - const entity_name_t dest = m->get_dest(); - directory_lock.Lock(); - TCPMessenger *messenger = directory[ dest ]; - directory_lock.Unlock(); - - if (messenger) - messenger->dispatch_queue(m); - else - dout(0) << "dest " << dest << " dne" << endl; - - } else { - // single dispatch queue - incoming_lock.Lock(); - { - //dout(-20) << "in1 stat_inq " << stat_inq << ", incoming " << incoming.size() << endl; - //assert(stat_inq == incoming.size()); - incoming.push_back(m); - incoming_cond.Signal(); - - stat_inq++; - //assert(stat_inq == incoming.size()); - //dout(-20) << "in2 stat_inq " << stat_inq << ", incoming " << incoming.size() << endl; - stat_inqb += sz; - } - incoming_lock.Unlock(); - } - - if (logger) { - //logger->inc("in"); - //logger->inc("inb", sz); - } - } - - dout(DBL) << "tcp_inthread closing " << sd << endl; - - //::close(sd); - return 0; -} - -/** tcp_accepthread - * accept incoming connections from peers. - * start a tcp_inthread for each. - */ -void *tcp_acceptthread(void *) -{ - dout(DBL) << "tcp_acceptthread starting" << endl; - - while (!tcp_done) { - //dout(DBL) << "accepting, left = " << left << endl; - - struct sockaddr_in addr; - socklen_t slen = sizeof(addr); - int sd = ::accept(listen_sd, (sockaddr*)&addr, &slen); - if (sd > 0) { - dout(DBL) << "accepted incoming on sd " << sd << endl; - - pthread_t th; - pthread_create(&th, - NULL, - tcp_inthread, - (void*)sd); - in_threads[sd] = th; - } else { - dout(DBL) << "no incoming connection?" 
<< endl; - break; - } - } - return 0; -} - - - - -/** tcp_dispatchthread - * wait for pending timers, incoming messages. dispatch them. - */ -void TCPMessenger::dispatch_entry() -{ - incoming_lock.Lock(); - while (!incoming.empty() || !incoming_stop) { - if (!incoming.empty()) { - // grab incoming messages - list in; - in.splice(in.begin(), incoming); - - assert(stat_disq == 0); - stat_disq = stat_inq; - stat_disqb = stat_inqb; - stat_inq = 0; - stat_inqb = 0; - - // drop lock while we deliver - //assert(stat_inq == incoming.size()); - incoming_lock.Unlock(); - - // dispatch! - while (!in.empty()) { - Message *m = in.front(); - in.pop_front(); - - stat_disq--; - stat_disqb -= m->get_payload().length(); - if (logger) { - logger->set("inq", stat_inq+stat_disq); - logger->set("inqb", stat_inqb+stat_disq); - logger->inc("dis"); - } - - dout(4) << g_clock.now() << " ---- '" << m->get_type_name() << - "' from " << MSG_ADDR_NICE(m->get_source()) << ':' << m->get_source_port() << - " to " << MSG_ADDR_NICE(m->get_dest()) << ':' << m->get_dest_port() << " ---- " - << m - << endl; - - dispatch(m); - } - - continue; - } - - // sleep - dout(DBL) << "dispatch: waiting for incoming messages" << endl; - incoming_cond.Wait(incoming_lock); - dout(DBL) << "dispatch: woke up" << endl; - } - incoming_lock.Unlock(); -} - - -void* tcp_dispatchthread(void*) -{ - dout(5) << "tcp_dispatchthread start pid " << getpid() << endl; - - while (1) { - // inq? - incoming_lock.Lock(); - - // done? - if (tcp_done && incoming.empty()) { - incoming_lock.Unlock(); - break; - } - - // wait? 
- if (incoming.empty()) { - // wait - dout(DBL) << "dispatch: incoming empty, waiting for incoming messages" << endl; - incoming_cond.Wait(incoming_lock); - dout(DBL) << "dispatch: woke up" << endl; - } - - // grab incoming messages - //dout(-20) << "dis1 stat_inq " << stat_inq << ", incoming " << incoming.size() << endl; - //assert(stat_inq == incoming.size()); - - list in; - in.splice(in.begin(), incoming); - - assert(stat_disq == 0); - stat_disq = stat_inq; - stat_disqb = stat_inqb; - stat_inq = 0; - stat_inqb = 0; - //assert(stat_inq == incoming.size()); - //dout(-20) << "dis2 stat_inq " << stat_inq << ", incoming " << incoming.size() << endl; - - // drop lock while we deliver - incoming_lock.Unlock(); - - // dispatch! - while (!in.empty()) { - Message *m = in.front(); - in.pop_front(); - - stat_disq--; - stat_disqb -= m->get_payload().length(); - if (logger) { - logger->set("inq", stat_inq+stat_disq); - logger->set("inqb", stat_inqb+stat_disq); - logger->inc("dis"); - } - - dout(DBL) << "dispatch doing " << *m << endl; - - // for rankserver? - if (m->get_type() == MSG_NS_CONNECTACK || // i just connected - m->get_dest() == MSG_ADDR_RANK(my_rank)) { - dout(DBL) << " giving to rankserver" << endl; - rankserver.dispatch(m); - continue; - } - - // ok - entity_name_t dest = m->get_dest(); - directory_lock.Lock(); - if (directory.count(dest)) { - Messenger *who = directory[ dest ]; - directory_lock.Unlock(); - - dout(4) << g_clock.now() << " ---- '" << m->get_type_name() << - "' from " << MSG_ADDR_NICE(m->get_source()) << ':' << m->get_source_port() << - " to " << MSG_ADDR_NICE(m->get_dest()) << ':' << m->get_dest_port() << " ---- " - << *m - << endl; - - who->dispatch(m); - } else { - directory_lock.Unlock(); - dout (1) << "---- i don't know who " << MSG_ADDR_NICE(dest) << " " << dest << " is." 
<< endl; - assert(0); - } - } - assert(stat_disq == 0); - - } - - - g_timer.shutdown(); - - dout(5) << "tcp_dispatchthread exiting loop" << endl; - return 0; -} - - -// start/stop mpi receiver thread (for unsolicited messages) -int tcpmessenger_start() -{ - dout(5) << "starting accept thread" << endl; - pthread_create(&listen_thread_id, - NULL, - tcp_acceptthread, - 0); - - dout(5) << "starting dispatch thread" << endl; - - // start a thread - pthread_create(&dispatch_thread_id, - NULL, - tcp_dispatchthread, - 0); - - - /* - dout(5) << "starting outgoing thread" << endl; - pthread_create(&out_thread_id, - NULL, - tcp_outthread, - 0); - */ - if (!g_conf.tcp_multi_out) - single_out_thread.create(); - return 0; -} - - -/* - * kick and wake up _loop (to pick up new outgoing message, or quit) - */ - -void tcpmessenger_kick_dispatch_loop() -{ - if (g_conf.tcp_multi_dispatch) { - assert(0); - // all of them - /*for (hash_map::iterator i = directory.begin(); - i != directory.end(); - i++) - i->second->dispatch_kick(); - */ - } else { - // just one - dout(DBL) << "kicking" << endl; - incoming_lock.Lock(); - dout(DBL) << "prekick" << endl; - incoming_cond.Signal(); - incoming_lock.Unlock(); - dout(DBL) << "kicked" << endl; - } -} - -/* -void tcpmessenger_kick_outgoing_loop() -{ - outgoing_lock.Lock(); - outgoing_cond.Signal(); - outgoing_lock.Unlock(); -} -*/ - - -// wait for thread to finish - -void tcpmessenger_wait() -{ - if (g_conf.tcp_multi_dispatch) { - // new way - incoming_lock.Lock(); - while (!tcp_done) - incoming_cond.Wait(incoming_lock); - incoming_lock.Unlock(); - } else { - // old way - dout(10) << "tcpmessenger_wait waking up dispatch loop" << endl; - tcpmessenger_kick_dispatch_loop(); - - void *returnval; - dout(10) << "tcpmessenger_wait waiting for thread to finished." << endl; - pthread_join(dispatch_thread_id, &returnval); - dout(10) << "tcpmessenger_wait thread finished." 
<< endl; - } -} - - - - -entity_name_t register_entity(entity_name_t addr) -{ - lookup_lock.Lock(); - - // prepare to wait - long id = ++regid; - Cond cond; - waiting_for_register_cond[id] = &cond; - - if (my_rank < 0) { - dout(DBL) << "register_entity don't know my rank, connecting" << endl; - - // connect to nameserver; discover my rank. - Message *m = new MNSConnect(listen_addr); - m->set_dest(MSG_ADDR_DIRECTORY, 0); - tcp_marshall(m); - OutThread *outt = tcp_lookup(m); - assert(outt); - tcp_send(m); - - // wait for reply - while (my_rank < 0) - waiting_for_rank.Wait(lookup_lock); - assert(my_rank > 0); - } - - // send req - dout(DBL) << "register_entity " << MSG_ADDR_NICE(addr) << endl; - Message *m = new MNSRegister(addr, my_rank, id); - m->set_dest(MSG_ADDR_DIRECTORY, 0); - tcp_marshall(m); - OutThread *outt = tcp_lookup(m); - assert(outt); - tcp_send(m); - - // wait? - while (!waiting_for_register_result.count(id)) - cond.Wait(lookup_lock); - - // get result, clean up - entity_name_t entity = waiting_for_register_result[id]; - waiting_for_register_result.erase(id); - waiting_for_register_cond.erase(id); - - dout(DBL) << "register_entity got " << MSG_ADDR_NICE(entity) << endl; - - lookup_lock.Unlock(); - - // ok! - return entity; -} - - - -/*********** - * Tcpmessenger class implementation - */ - - -TCPMessenger::TCPMessenger(entity_name_t myaddr) : - Messenger(myaddr), - dispatch_thread(this) -{ - if (myaddr != MSG_ADDR_DIRECTORY) { - // register! 
- myaddr = register_entity(myaddr); - } - - - // my address - set_myaddr( myaddr ); - - // register myself in the messenger directory - directory_lock.Lock(); - { - directory[myaddr] = this; - - stat_num++; - if (logger) logger->set("num", stat_num); - } - directory_lock.Unlock(); - - // register to execute timer events - //g_timer.set_messenger_kicker(new C_TCPKicker()); - // g_timer.set_messenger(this); -} - - -void TCPMessenger::ready() -{ - directory_lock.Lock(); - directory_ready.insert(get_myaddr()); - directory_lock.Unlock(); - - if (get_myaddr() != MSG_ADDR_DIRECTORY) { - // started! tell namer we are up and running. - lookup_lock.Lock(); - { - Message *m = new MGenericMessage(MSG_NS_STARTED); - m->set_source(get_myaddr(), 0); - m->set_dest(MSG_ADDR_DIRECTORY, 0); - tcp_marshall(m); - OutThread *outt = tcp_lookup(m); - assert(outt); - tcp_send(m); - } - lookup_lock.Unlock(); - } -} - - -TCPMessenger::~TCPMessenger() -{ - //delete logger; -} - -tcpaddr_t& TCPMessenger::get_tcpaddr() -{ - return listen_addr; -} - -void TCPMessenger::map_entity_rank(entity_name_t e, int r) -{ - lookup_lock.Lock(); - entity_rank[e] = r; - lookup_lock.Unlock(); -} - -void TCPMessenger::map_rank_addr(int r, tcpaddr_t a) -{ - lookup_lock.Lock(); - rank_addr[r] = a; - lookup_lock.Unlock(); -} - - -int TCPMessenger::get_dispatch_queue_len() -{ - return stat_inq+stat_disq; -} - - -int TCPMessenger::shutdown() -{ - dout(DBL) << "shutdown by " << MSG_ADDR_NICE(get_myaddr()) << endl; - - // dont' send unregistery from nsmessenger shutdown! - if (this != nsmessenger && - (my_rank > 0 || nsmessenger)) { - dout(DBL) << "sending unregister from " << MSG_ADDR_NICE(get_myaddr()) << " to ns" << endl; - send_message(new MGenericMessage(MSG_NS_UNREGISTER), - MSG_ADDR_DIRECTORY); - } - - // remove me from the directory - directory_lock.Lock(); - directory.erase(get_myaddr()); - - // last one? - bool lastone = directory.empty(); - //dout(1) << "lastone = " << lastone << " .. 
" << directory.size() << endl; - - - // or almost last one? - if (rankmessenger && directory.size() == 1) { - directory_lock.Unlock(); - tcpmessenger_stop_rankserver(); - directory_lock.Lock(); - } - - stat_num--; - if (logger) logger->set("num", stat_num); - - directory_lock.Unlock(); - - // last one? - if (lastone) { - dout(2) << "shutdown last tcpmessenger on rank " << my_rank << " shut down" << endl; - //pthread_t whoami = pthread_self(); - - // no more timer events - //g_timer.unset_messenger(); - - // close incoming sockets - //void *r; - for (map::iterator it = in_threads.begin(); - it != in_threads.end(); - it++) { - dout(DBL) << "closing reader on sd " << it->first << endl; - ::close(it->first); - //pthread_join(it->second, &r); - } - - if (g_conf.tcp_multi_dispatch) { - // kill off dispatch threads - dout(DBL) << "killing dispatch threads" << endl; - for (hash_map::iterator it = directory.begin(); - it != directory.end(); - it++) - it->second->dispatch_stop(); - } - - dout(DBL) << "setting tcp_done" << endl; - - // kick/kill incoming thread - incoming_lock.Lock(); - tcp_done = true; - incoming_cond.Signal(); - incoming_lock.Unlock(); - - // finish off outgoing thread - dout(10) << "waiting for outgoing to finish" << endl; - if (g_conf.tcp_multi_out) { - for (hash_map::iterator it = rank_out.begin(); - it != rank_out.end(); - it++) { - it->second->stop(); - delete it->second; - } - } else { - single_out_thread.stop(); - } - - - /* - - dout(15) << "whoami = " << whoami << ", thread = " << dispatch_thread_id << endl; - if (whoami == thread_id) { - // i am the event loop thread, just set flag! 
- dout(15) << " set tcp_done=true" << endl; - tcp_done = true; - } - */ - } - return 0; -} - - - - -/*** - * public messaging interface - */ - - -/* note: send_message _MUST_ be non-blocking */ -int TCPMessenger::send_message(Message *m, entity_name_t dest, int port, int fromport) -{ - // set envelope - m->set_source(get_myaddr(), fromport); - m->set_dest(dest, port); - m->set_lamport_send_stamp( get_lamport() ); - - dout(4) << "--> " << m->get_type_name() - << " from " << MSG_ADDR_NICE(m->get_source()) << ':' << m->get_source_port() - << " to " << MSG_ADDR_NICE(m->get_dest()) << ':' << m->get_dest_port() - << " ---- " << m - << endl; - - // local? - TCPMessenger *entity = 0; - directory_lock.Lock(); - if (directory.count(dest) && - directory_ready.count(dest)) entity = directory[dest]; - directory_lock.Unlock(); - - if (entity) { - // local! - ::incoming_lock.Lock(); - { - dout(20) << " queueing locally for " << dest << " " << m << endl; //", stat_inq " << stat_inq << ", incomign " << ::incoming.size() << endl; - //assert(stat_inq == ::incoming.size()); - ::incoming.push_back(m); - ::incoming_cond.Signal(); - stat_inq++; - //assert(stat_inq == ::incoming.size()); - //dout(-20) << " stat_inq " << stat_inq << ", incoming " << ::incoming.size() << endl; - stat_inqb += m->get_payload().length(); - } - ::incoming_lock.Unlock(); - } else { - // remote! 
- - if (g_conf.tcp_serial_marshall) - tcp_marshall(m); - - if (g_conf.tcp_serial_out) { - lookup_lock.Lock(); - // send in this thread - if (tcp_lookup(m)) - tcp_send(m); - lookup_lock.Unlock(); - } else { - lookup_lock.Lock(); - OutThread *outt = tcp_lookup(m); - lookup_lock.Unlock(); - - if (outt) outt->send(m); - } - } - - return 0; -} - - - - diff --git a/branches/marnberg/quota/msg/TCPMessenger.h b/branches/marnberg/quota/msg/TCPMessenger.h deleted file mode 100644 index 414e50f5fef87..0000000000000 --- a/branches/marnberg/quota/msg/TCPMessenger.h +++ /dev/null @@ -1,115 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __TCPMESSENGER_H -#define __TCPMESSENGER_H - -#include "Messenger.h" -#include "Dispatcher.h" -#include "common/Thread.h" - -#include "tcp.h" - -class Timer; - - -class TCPMessenger : public Messenger { - protected: - - //class Logger *logger; // for logging - - bool incoming_stop; - Mutex incoming_lock; - list incoming; - Cond incoming_cond; - - class DispatchThread : public Thread { - TCPMessenger *m; - public: - DispatchThread(TCPMessenger *_m) : m(_m) {} - void *entry() { - m->dispatch_entry(); - return 0; - } - } dispatch_thread; - - void dispatch_entry(); - -public: - void dispatch_start() { - incoming_stop = false; - dispatch_thread.create(); - } - /* void dispatch_kick() { - incoming_lock.Lock(); - incoming_cond.Signal(); - incoming_lock.Unlock(); - }*/ - void dispatch_stop() { - incoming_lock.Lock(); - incoming_stop = true; - incoming_cond.Signal(); - incoming_lock.Unlock(); - dispatch_thread.join(); - } - void dispatch_queue(Message *m) { - incoming_lock.Lock(); - 
incoming.push_back(m); - incoming_cond.Signal(); - incoming_lock.Unlock(); - } - - public: - TCPMessenger(entity_name_t myaddr); - ~TCPMessenger(); - - void ready(); - - tcpaddr_t& get_tcpaddr(); - void map_entity_rank(entity_name_t e, int r); - void map_rank_addr(int r, tcpaddr_t a); - - int get_dispatch_queue_len(); - - void callback_kick(); - - // init, shutdown MPI and associated event loop thread. - virtual int shutdown(); - - // message interface - virtual int send_message(Message *m, entity_name_t dest, int port=0, int fromport=0); -}; - -/** - * these are all ONE per process. - */ - -extern int tcpmessenger_lookup(char *str, tcpaddr_t& ta); - -extern int tcpmessenger_findns(tcpaddr_t &nsa); - -extern int tcpmessenger_init(); -extern int tcpmessenger_start(); // start thread -extern void tcpmessenger_wait(); // wait for thread to finish. -extern int tcpmessenger_shutdown(); // finalize MPI - -extern void tcpmessenger_start_nameserver(tcpaddr_t& ta); // on rank 0 -extern void tcpmessenger_stop_nameserver(); // on rank 0 -extern void tcpmessenger_start_rankserver(tcpaddr_t& ta); // on all ranks -extern void tcpmessenger_stop_rankserver(); // on all ranks - -extern int tcpmessenger_get_rank(); - - -#endif diff --git a/branches/marnberg/quota/msg/error.c b/branches/marnberg/quota/msg/error.c deleted file mode 100644 index 15cd16a2ca9da..0000000000000 --- a/branches/marnberg/quota/msg/error.c +++ /dev/null @@ -1,77 +0,0 @@ -#include -#include -#include -#include - -#include "include/error.h" - -#define EXIT_USAGE_ERROR -1 /* error codes for program exit */ -#define EXIT_SYSTEM_ERROR -2 -#define EXIT_GENERIC_ERROR -3 -#define MSGSIZ 1024 /* maximum error message length */ - -/* print usage error message and exit */ -void userror(const char *use, const char *fmt, ...) 
-{ - char msg[MSGSIZ]; - int len; - - va_list ap; - va_start(ap, fmt); - - len = vsnprintf(msg+len, MSGSIZ-len, fmt, ap); - len += snprintf(msg+len, MSGSIZ-len, "\n"); - len += snprintf(msg+len, MSGSIZ-len, use); - fprintf(stderr, "%s\n", msg); - exit(EXIT_USAGE_ERROR); - - va_end(ap); -} - -/* print system error message and exit */ -void syserror(const char *fmt, ...) -{ - char msg[MSGSIZ]; - int len; - - va_list ap; - va_start(ap, fmt); - - len = vsnprintf(msg+len, MSGSIZ-len, fmt, ap); - len += snprintf(msg+len, MSGSIZ-len, ": %s\n", strerror(errno)); - fprintf(stderr, "%s", msg); - exit(EXIT_SYSTEM_ERROR); - - va_end(ap); -} - -/* print error message and exit */ -void exiterror(const char *fmt, ...) -{ - char msg[MSGSIZ]; - int len; - - va_list ap; - va_start(ap, fmt); - - len = vsnprintf(msg+len, MSGSIZ-len, fmt, ap); - fprintf(stderr, "%s\n", msg); - exit(EXIT_GENERIC_ERROR); - - va_end(ap); -} - -/* print error message */ -void error(const char *fmt, ...) -{ - char msg[MSGSIZ]; - int len; - - va_list ap; - va_start(ap, fmt); - - len = vsnprintf(msg+len, MSGSIZ-len, fmt, ap); - fprintf(stderr, "%s\n", msg); - - va_end(ap); -} diff --git a/branches/marnberg/quota/msg/mpistarter.cc b/branches/marnberg/quota/msg/mpistarter.cc deleted file mode 100644 index 79391f78210d2..0000000000000 --- a/branches/marnberg/quota/msg/mpistarter.cc +++ /dev/null @@ -1,62 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include - -#include "TCPMessenger.h" - -/* - * start up TCPMessenger via MPI. 
- */ - -pair mpi_bootstrap_tcp(int& argc, char**& argv) -{ - tcpmessenger_init(); - tcpmessenger_start(); - - // exchnage addresses with other nodes - MPI_Init(&argc, &argv); - - int mpi_world; - int mpi_rank; - MPI_Comm_size(MPI_COMM_WORLD, &mpi_world); - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - - //dout(1) << "i am " << mpi_rank << " of " << mpi_world << endl; - - // start up directory? - tcpaddr_t ta; - if (mpi_rank == 0) { - dout(30) << "i am rank 0, starting ns directory" << endl; - tcpmessenger_start_nameserver(ta); - } else { - memset(&ta, 0, sizeof(ta)); - } - - // distribute tcpaddr - int r = MPI_Bcast(&ta, sizeof(ta), MPI_CHAR, - 0, MPI_COMM_WORLD); - - dout(30) << "r = " << r << " ns tcpaddr is " << ta << endl; - tcpmessenger_start_rankserver(ta); - - MPI_Barrier(MPI_COMM_WORLD); - //g_clock.tare(); - MPI_Finalize(); - - return pair(mpi_rank, mpi_world); -} - - diff --git a/branches/marnberg/quota/msg/msg_types.h b/branches/marnberg/quota/msg/msg_types.h deleted file mode 100644 index 0b92df47020d0..0000000000000 --- a/branches/marnberg/quota/msg/msg_types.h +++ /dev/null @@ -1,186 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MSG_TYPES_H -#define __MSG_TYPES_H - -#include "include/types.h" -#include "tcp.h" - -// new typed msg_addr_t way! 
-class entity_name_t { - int _type; - int _num; - -public: - static const int TYPE_MON = 1; - static const int TYPE_MDS = 2; - static const int TYPE_OSD = 3; - static const int TYPE_CLIENT = 4; - - static const int NEW = -1; - - // cons - entity_name_t() : _type(0), _num(0) {} - entity_name_t(int t, int n) : _type(t), _num(n) {} - - int num() const { return _num; } - int type() const { return _type; } - const char *type_str() const { - switch (type()) { - case TYPE_MDS: return "mds"; - case TYPE_OSD: return "osd"; - case TYPE_MON: return "mon"; - case TYPE_CLIENT: return "client"; - default: return "unknown"; - } - } - - bool is_new() const { return num() == NEW; } - - bool is_client() const { return type() == TYPE_CLIENT; } - bool is_mds() const { return type() == TYPE_MDS; } - bool is_osd() const { return type() == TYPE_OSD; } - bool is_mon() const { return type() == TYPE_MON; } -}; - -inline bool operator== (const entity_name_t& l, const entity_name_t& r) { - return (l.type() == r.type()) && (l.num() == r.num()); } -inline bool operator!= (const entity_name_t& l, const entity_name_t& r) { - return (l.type() != r.type()) || (l.num() != r.num()); } -inline bool operator< (const entity_name_t& l, const entity_name_t& r) { - return (l.type() < r.type()) || (l.type() == r.type() && l.num() < r.num()); } - -inline std::ostream& operator<<(std::ostream& out, const entity_name_t& addr) { - //if (addr.is_namer()) return out << "namer"; - if (addr.is_new() || addr.num() < 0) - return out << addr.type_str() << "?"; - else - return out << addr.type_str() << addr.num(); -} - -namespace __gnu_cxx { - template<> struct hash< entity_name_t > - { - size_t operator()( const entity_name_t m ) const - { - static blobhash H; - return H((const char*)&m, sizeof(m)); - } - }; -} - -// get rid of these -#define MSG_ADDR_MDS(x) entity_name_t(entity_name_t::TYPE_MDS,x) -#define MSG_ADDR_OSD(x) entity_name_t(entity_name_t::TYPE_OSD,x) -#define MSG_ADDR_MON(x) 
entity_name_t(entity_name_t::TYPE_MON,x) -#define MSG_ADDR_CLIENT(x) entity_name_t(entity_name_t::TYPE_CLIENT,x) - -#define MSG_ADDR_RANK_NEW MSG_ADDR_RANK(entity_name_t::NEW) -#define MSG_ADDR_MDS_NEW MSG_ADDR_MDS(entity_name_t::NEW) -#define MSG_ADDR_OSD_NEW MSG_ADDR_OSD(entity_name_t::NEW) -#define MSG_ADDR_CLIENT_NEW MSG_ADDR_CLIENT(entity_name_t::NEW) - - -/* - * an entity's network address. - * includes a random value that prevents it from being reused. - * thus identifies a particular process instance. - * ipv4 for now. - */ -struct entity_addr_t { - __uint8_t ipq[4]; - __uint32_t port; - __uint32_t nonce; // bind time, or pid, or something unique! - - entity_addr_t() : port(0), nonce(0) { - ipq[0] = ipq[1] = ipq[2] = ipq[3] = 0; - } - - void set_addr(tcpaddr_t a) { - memcpy((char*)ipq, (char*)&a.sin_addr.s_addr, 4); - port = ntohs(a.sin_port); - } - void make_addr(tcpaddr_t& a) const { - memset(&a, 0, sizeof(a)); - a.sin_family = AF_INET; - memcpy((char*)&a.sin_addr.s_addr, (char*)ipq, 4); - a.sin_port = htons(port); - } -}; - -inline ostream& operator<<(ostream& out, const entity_addr_t &addr) -{ - return out << (int)addr.ipq[0] - << '.' << (int)addr.ipq[1] - << '.' << (int)addr.ipq[2] - << '.' << (int)addr.ipq[3] - << ':' << addr.port - << '.' 
<< addr.nonce; -} - -inline bool operator==(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) == 0; } -inline bool operator!=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) != 0; } -inline bool operator<(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) < 0; } -inline bool operator<=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) <= 0; } -inline bool operator>(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) > 0; } -inline bool operator>=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) >= 0; } - -namespace __gnu_cxx { - template<> struct hash< entity_addr_t > - { - size_t operator()( const entity_addr_t& x ) const - { - static blobhash H; - return H((const char*)&x, sizeof(x)); - } - }; -} - - -/* - * a particular entity instance - */ -struct entity_inst_t { - entity_name_t name; - entity_addr_t addr; - entity_inst_t() {} - entity_inst_t(entity_name_t n, const entity_addr_t& a) : name(n), addr(a) {} -}; - - -inline bool operator==(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) == 0; } -inline bool operator!=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) != 0; } -inline bool operator<(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) < 0; } -inline bool operator<=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) <= 0; } -inline bool operator>(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) > 0; } -inline bool operator>=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) >= 0; } - -namespace __gnu_cxx { - template<> struct hash< entity_inst_t > - { - size_t operator()( const entity_inst_t& x ) const - { - static blobhash H; - return H((const char*)&x, 
sizeof(x)); - } - }; -} - -inline ostream& operator<<(ostream& out, const entity_inst_t &i) -{ - return out << i.name << " " << i.addr; -} - - -#endif diff --git a/branches/marnberg/quota/msg/new_mpistarter.cc b/branches/marnberg/quota/msg/new_mpistarter.cc deleted file mode 100644 index fc9da720f19ee..0000000000000 --- a/branches/marnberg/quota/msg/new_mpistarter.cc +++ /dev/null @@ -1,43 +0,0 @@ -#include -#include "NewMessenger.h" - -/* - * start up NewMessenger via MPI. - */ - -pair mpi_bootstrap_new(int& argc, char**& argv) -{ - MPI_Init(&argc, &argv); - - int mpi_world; - int mpi_rank; - MPI_Comm_size(MPI_COMM_WORLD, &mpi_world); - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - - tcpaddr_t nsaddr; - memset(&nsaddr, 0, sizeof(nsaddr)); - - if (mpi_rank == 0) { - // i am root. - rank.my_rank = 0; - rank.start_rank(nsaddr); - nsaddr = rank.get_listen_addr(); - } - - int r = MPI_Bcast(&nsaddr, sizeof(nsaddr), MPI_CHAR, - 0, MPI_COMM_WORLD); - - dout(30) << "r = " << r << " ns tcpaddr is " << nsaddr << endl; - - if (mpi_rank != 0) { - rank.start_rank(nsaddr); - } - - MPI_Barrier(MPI_COMM_WORLD); - - //g_clock.tare(); - - MPI_Finalize(); - - return pair(mpi_rank, mpi_world); -} diff --git a/branches/marnberg/quota/msg/tcp.cc b/branches/marnberg/quota/msg/tcp.cc deleted file mode 100644 index 1a448a91cb2c6..0000000000000 --- a/branches/marnberg/quota/msg/tcp.cc +++ /dev/null @@ -1,87 +0,0 @@ - -#include "tcp.h" - -/****************** - * tcp crap - */ - -bool tcp_read(int sd, char *buf, int len) -{ - while (len > 0) { - int got = ::recv( sd, buf, len, 0 ); - if (got == 0) { - dout(18) << "tcp_read socket " << sd << " closed" << endl; - return false; - } - if (got < 0) { - dout(18) << "tcp_read bailing with " << got << endl; - return false; - } - assert(got >= 0); - len -= got; - buf += got; - //dout(DBL) << "tcp_read got " << got << ", " << len << " left" << endl; - } - return true; -} - -int tcp_write(int sd, char *buf, int len) -{ - //dout(DBL) << "tcp_write writing 
" << len << endl; - assert(len > 0); - while (len > 0) { - int did = ::send( sd, buf, len, 0 ); - if (did < 0) { - dout(1) << "tcp_write error did = " << did << " errno " << errno << " " << strerror(errno) << endl; - //cerr << "tcp_write error did = " << did << " errno " << errno << " " << strerror(errno) << endl; - } - //assert(did >= 0); - if (did < 0) return did; - len -= did; - buf += did; - //dout(DBL) << "tcp_write did " << did << ", " << len << " left" << endl; - } - return 0; -} - - -int tcp_hostlookup(char *str, tcpaddr_t& ta) -{ - char *host = str; - char *port = 0; - - for (int i=0; str[i]; i++) { - if (str[i] == ':') { - port = str+i+1; - str[i] = 0; - break; - } - } - if (!port) { - cerr << "addr '" << str << "' doesn't look like 'host:port'" << endl; - return -1; - } - //cout << "host '" << host << "' port '" << port << "'" << endl; - - int iport = atoi(port); - - struct hostent *myhostname = gethostbyname( host ); - if (!myhostname) { - cerr << "host " << host << " not found" << endl; - return -1; - } - - memset(&ta, 0, sizeof(ta)); - - //cout << "addrtype " << myhostname->h_addrtype << " len " << myhostname->h_length << endl; - - ta.sin_family = myhostname->h_addrtype; - memcpy((char *)&ta.sin_addr, - myhostname->h_addr, - myhostname->h_length); - ta.sin_port = iport; - - cout << "lookup '" << host << ":" << port << "' -> " << ta << endl; - - return 0; -} diff --git a/branches/marnberg/quota/msg/tcp.h b/branches/marnberg/quota/msg/tcp.h deleted file mode 100644 index 65043cda8e2ac..0000000000000 --- a/branches/marnberg/quota/msg/tcp.h +++ /dev/null @@ -1,37 +0,0 @@ -#ifndef __TCP_H -#define __TCP_H - -#include -#include -#include -#include - -typedef struct sockaddr_in tcpaddr_t; - -using std::ostream; - -inline ostream& operator<<(ostream& out, const tcpaddr_t &a) -{ - unsigned char addr[4]; - memcpy((char*)addr, (char*)&a.sin_addr.s_addr, 4); - out << (unsigned)addr[0] << "." - << (unsigned)addr[1] << "." - << (unsigned)addr[2] << "." 
- << (unsigned)addr[3] << ":" - << ntohs(a.sin_port); - return out; -} - -extern bool tcp_read(int sd, char *buf, int len); -extern int tcp_write(int sd, char *buf, int len); -extern int tcp_hostlookup(char *str, tcpaddr_t& ta); - -inline bool operator==(const tcpaddr_t& a, const tcpaddr_t& b) { - return strncmp((const char*)&a, (const char*)&b, sizeof(a)) == 0; -} -inline bool operator!=(const tcpaddr_t& a, const tcpaddr_t& b) { - return strncmp((const char*)&a, (const char*)&b, sizeof(a)) != 0; -} - - -#endif diff --git a/branches/marnberg/quota/newsyn.cc b/branches/marnberg/quota/newsyn.cc deleted file mode 100644 index 82292033fa062..0000000000000 --- a/branches/marnberg/quota/newsyn.cc +++ /dev/null @@ -1,407 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include -#include -#include -using namespace std; - -#include - -#include "config.h" - -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "mon/Monitor.h" -#include "client/Client.h" -#include "client/SyntheticClient.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - -#define NUMMDS g_conf.num_mds -#define NUMOSD g_conf.num_osd -#define NUMCLIENT g_conf.num_client - -class C_Test : public Context { -public: - void finish(int r) { - cout << "C_Test->finish(" << r << ")" << endl; - } -}; - - -/* - * start up NewMessenger via MPI. - */ -#include - -pair mpi_bootstrap_new(int& argc, char**& argv, MonMap *monmap) -{ - MPI_Init(&argc, &argv); - - int mpi_world; - int mpi_rank; - MPI_Comm_size(MPI_COMM_WORLD, &mpi_world); - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - - // first, synchronize clocks. 
- MPI_Barrier(MPI_COMM_WORLD); - //dout(-10) << "tare" << endl; - g_clock.tare(); - - // start up all monitors at known addresses. - entity_inst_t moninst[mpi_world]; // only care about first g_conf.num_mon of these. - - rank.start_rank(); // bind and listen - - if (mpi_rank < g_conf.num_mon) { - moninst[mpi_rank].addr = rank.my_addr; - moninst[mpi_rank].name = MSG_ADDR_MON(mpi_rank); - - //cerr << mpi_rank << " at " << rank.get_listen_addr() << endl; - } - - MPI_Gather( &moninst[mpi_rank], sizeof(entity_inst_t), MPI_CHAR, - moninst, sizeof(entity_inst_t), MPI_CHAR, - 0, MPI_COMM_WORLD); - - if (mpi_rank == 0) { - for (int i=0; imon_inst[i] = moninst[i]; - } - } - - - // distribute monmap - bufferlist bl; - if (mpi_rank == 0) { - monmap->encode(bl); - monmap->write(".ceph_monmap"); - } else { - int l = g_conf.num_mon * 1000; // nice'n big. - bufferptr bp(l); - bl.append(bp); - } - - MPI_Bcast(bl.c_str(), bl.length(), MPI_CHAR, - 0, MPI_COMM_WORLD); - - if (mpi_rank > 0) { - monmap->decode(bl); - } - - // wait for everyone! 
- MPI_Barrier(MPI_COMM_WORLD); - - return pair(mpi_rank, mpi_world); -} - -utime_t tick_start; -int tick_count = 0; - -class C_Tick : public Context { -public: - void finish(int) { - utime_t now = g_clock.now() - tick_start; - dout(0) << "tick +" << g_conf.tick << " -> " << now << " (" << tick_count << ")" << endl; - tick_count += g_conf.tick; - utime_t next = tick_start; - next.sec_ref() += tick_count; - g_timer.add_event_at(next, new C_Tick); - } -}; - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << endl; - exit(1); - } -}; - -class C_Debug : public Context { - public: - void finish(int) { - int size = &g_conf.debug_after - &g_conf.debug; - memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size); - dout(0) << "debug_after flipping debug settings" << endl; - } -}; - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - map kill_osd_after; - if (1) { - vector nargs; - for (unsigned i=0; i nargs; - for (unsigned i=0; i mpiwho = mpi_bootstrap_new(argc, argv, monmap); - int myrank = mpiwho.first; - int world = mpiwho.second; - - int need = 0; - if (g_conf.ms_skip_rank0) need++; - need += NUMMDS; - if (g_conf.ms_stripe_osds) - need++; - else - need += NUMOSD; - if (NUMCLIENT) { - if (!g_conf.ms_overlay_clients) - need += 1; - } - assert(need <= world); - - if (myrank == 0) - cerr << "nummds " << NUMMDS << " numosd " << NUMOSD << " numclient " << NUMCLIENT << " .. need " << need << ", have " << world << endl; - - - char hostname[100]; - gethostname(hostname,100); - int pid = getpid(); - - int started = 0; - - //if (myrank == 0) g_conf.debug = 20; - - // create mon - if (myrank < g_conf.num_mon) { - Monitor *mon = new Monitor(myrank, rank.register_entity(MSG_ADDR_MON(myrank)), monmap); - mon->init(); - } - - - // wait for monitors to start. - MPI_Barrier(MPI_COMM_WORLD); - - // okay, home free! 
- MPI_Finalize(); - - - // create mds - map mds; - map mdsosd; - for (int i=0; iinit(); - started++; - - if (g_conf.mds_local_osd) { - mdsosd[i] = new OSD(i+10000, rank.register_entity(MSG_ADDR_OSD(i+10000)), monmap); - mdsosd[i]->init(); - } - } - - // create osd - map osd; - int max_osd_nodes = world - NUMMDS - g_conf.ms_skip_rank0; // assumes 0 clients, if we stripe. - int osds_per_node = (NUMOSD-1)/max_osd_nodes + 1; - for (int i=0; iinit(); - started++; - } - - if (g_conf.ms_overlay_clients) sleep(5); - - // create client - int skip_osd = NUMOSD; - if (g_conf.ms_overlay_clients) - skip_osd = 0; // put clients with osds too! - int client_nodes = world - NUMMDS - skip_osd - g_conf.ms_skip_rank0; - int clients_per_node = 1; - if (NUMCLIENT && client_nodes > 0) clients_per_node = (NUMCLIENT-1) / client_nodes + 1; - set clientlist; - map client;//[NUMCLIENT]; - map syn;//[NUMCLIENT]; - int nclients = 0; - for (int i=0; iinit(); - started++; - - syn[i] = new SyntheticClient(client[i]); - - client[i]->mount(); - nclients++; - } - - if (!clientlist.empty()) dout(2) << "i have " << clientlist << endl; - - for (set::iterator it = clientlist.begin(); - it != clientlist.end(); - it++) { - int i = *it; - - //cerr << "starting synthetic client" << i << " on rank " << myrank << endl; - syn[i]->start_thread(); - - } - if (nclients) { - cerr << nclients << " clients at " << rank.my_addr << " " << hostname << "." << pid << endl; - } - - for (set::iterator it = clientlist.begin(); - it != clientlist.end(); - it++) { - int i = *it; - - // cout << "waiting for synthetic client" << i << " to finish" << endl; - syn[i]->join_thread(); - delete syn[i]; - - client[i]->unmount(); - //cout << "client" << i << " unmounted" << endl; - client[i]->shutdown(); - - delete client[i]; - } - - - if (myrank && !started) { - //dout(1) << "IDLE" << endl; - cerr << "idle at " << rank.my_addr << " " << hostname << "." 
<< pid << endl; - //rank.stop_rank(); - } - - // wait for everything to finish - rank.wait(); - - if (started) cerr << "newsyn finishing" << endl; - - return 0; // whatever, cleanup hangs sometimes (stopping ebofs threads?). - - - // cleanup - for (map::iterator i = mds.begin(); i != mds.end(); i++) - delete i->second; - for (map::iterator i = mdsosd.begin(); i != mdsosd.end(); i++) - delete i->second; - for (map::iterator i = osd.begin(); i != osd.end(); i++) - delete i->second; - /* - for (map::iterator i = client.begin(); i != client.end(); i++) - delete i->second; - for (map::iterator i = syn.begin(); i != syn.end(); i++) - delete i->second; - */ - /* - for (int i=0; i - -Ceph - scalable distributed file system - -This is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License version 2.1, as published by the Free Software -Foundation. See file COPYING. */ - - -#include -#include "OSBDB.h" - -using namespace std; - -#undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_bdbstore) cout << "bdbstore(" << device << ")@" << __LINE__ << "." -#undef derr -#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_bdbstore) cerr << "bdbstore(" << device << ")@" << __LINE__ << "." - - // Utilities. - -// Starting off with my own bsearch; mail reader to follow... - -// Perform a binary search on a sorted array, returning the insertion -// point for key, or key if it is exactly found. In other words, this -// will return a pointer to the element that will come after key if -// key were to be inserted into the sorted array. -// -// Requires that T have < and > operators defined. 
-template -uint32_t binary_search (T *array, size_t size, T key) -{ - int low = 0; - int high = size; - int p = (low + high) / 2; - - while (low < high - 1) - { - if (array[p] > key) - { - high = p; - } - else if (array[p] < key) - { - low = p; - } - else - return p; - - p = (low + high) / 2; - } - - if (array[p] < key) - p++; - else if (array[p] > key && p > 0) - p--; - return p; -} - - // Management. - -int OSBDB::opendb(DBTYPE type, int flags, bool new_env) -{ - // BDB transactions require an environment. - if (g_conf.bdbstore_transactional) - { - env = new DbEnv (DB_CXX_NO_EXCEPTIONS); - env->set_error_stream (&std::cerr); - env->set_message_stream (&std::cout); - env->set_flags (DB_LOG_INMEMORY, 1); - //env->set_flags (DB_DIRECT_DB, 1); - int env_flags = (DB_CREATE - | DB_THREAD - | DB_INIT_LOCK - | DB_INIT_MPOOL - | DB_INIT_TXN - | DB_INIT_LOG - | DB_PRIVATE); - //if (new_env) - // env->remove (env_dir.c_str(), 0); - if (env->open (NULL, env_flags, 0) != 0) - { - std::cerr << "failed to open environment " << std::endl; - return -EIO; - } - - } - - db = new Db(env, 0); - db->set_error_stream (&std::cerr); - db->set_message_stream (&std::cout); - db->set_flags (0); - if (!g_conf.bdbstore_btree) - { - if (g_conf.bdbstore_pagesize > 0) - db->set_pagesize (g_conf.bdbstore_pagesize); - if (g_conf.bdbstore_ffactor > 0 && g_conf.bdbstore_nelem > 0) - { - db->set_h_ffactor (g_conf.bdbstore_ffactor); - db->set_h_nelem (g_conf.bdbstore_nelem); - } - } - if (g_conf.bdbstore_cachesize > 0) - { - db->set_cachesize (0, g_conf.bdbstore_cachesize, 0); - } - - flags = flags | DB_THREAD; - if (transactional) - flags = flags | DB_AUTO_COMMIT; - - int ret; - if ((ret = db->open (NULL, device.c_str(), NULL, type, flags, 0)) != 0) - { - derr(1) << "failed to open database: " << device << ": " - << strerror(ret) << std::endl; - return -EINVAL; - } - opened = true; - return 0; -} - -int OSBDB::mount() -{ - dout(2) << "mount " << device << endl; - - if (mounted) - { - dout(4) << 
"..already mounted" << endl; - return 0; - } - - if (!opened) - { - int ret; - if ((ret = opendb ()) != 0) - { - dout(4) << "..returns " << ret << endl; - return ret; - } - } - - // XXX Do we want anything else in the superblock? - - Dbt key (OSBDB_SUPERBLOCK_KEY, 1); - stored_superblock super; - Dbt value (&super, sizeof (super)); - value.set_dlen (sizeof (super)); - value.set_ulen (sizeof (super)); - value.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - - if (db->get (NULL, &key, &value, 0) != 0) - { - dout(4) << "..get superblock fails" << endl; - return -EINVAL; // XXX how to say "badly formed fs?" - } - - dout(3) << ".mount " << super << endl; - - if (super.version != OSBDB_THIS_VERSION) - { - dout(4) << "version mismatch (" << super.version << ")" << endl; - return -EINVAL; - } - - DBTYPE t; - db->get_type (&t); - - if (t == DB_BTREE) - { - u_int32_t minkey; - u_int32_t flags; - db->get_bt_minkey (&minkey); - db->get_flags (&flags); - dout(1) << "mounted version " << OSBDB_THIS_VERSION << "; Btree; " - << "min keys per page: " << minkey << "; flags: " - << hex << flags << dec << endl; - cout << dec; - } - else - { - u_int32_t ffactor; - u_int32_t nelem; - u_int32_t flags; - db->get_h_ffactor (&ffactor); - db->get_h_nelem (&nelem); - db->get_flags (&flags); - dout(1) << "mounted version " << OSBDB_THIS_VERSION << "; Hash; " - << "fill factor: " << ffactor - << " table size: " << nelem << "; flags: " - << hex << flags << dec << endl; - cout << dec; - } - - mounted = true; - dout(4) << "..mounted" << endl; - return 0; -} - -int OSBDB::umount() -{ - if (!mounted) - return -EINVAL; - - dout(2) << "umount" << endl; - - int ret; - if (opened) - { - if (transactional) - { - env->log_flush (NULL); - if ((ret = env->lsn_reset (device.c_str(), 0)) != 0) - { - derr(1) << "lsn_reset: " << db_strerror (ret) << endl; - } - } - - db->sync (0); - - if ((ret = db->close (0)) != 0) - { - derr(1) << "close: " << db_strerror(ret) << endl; - return -EINVAL; - } - delete db; - db = 
NULL; - - if (env) - { - env->close (0); - delete env; - env = NULL; - } - } - mounted = false; - opened = false; - dout(4) << "..unmounted" << endl; - return 0; -} - -int OSBDB::mkfs() -{ - if (mounted) - return -EINVAL; - - dout(2) << "mkfs" << endl; - - unlink (device.c_str()); - - int ret; - if ((ret = opendb((g_conf.bdbstore_btree ? DB_BTREE : DB_HASH), - DB_CREATE, true)) != 0) - { - derr(1) << "failed to open database: " << device << ": " - << db_strerror(ret) << std::endl; - return -EINVAL; - } - opened = true; - dout(3) << "..opened " << device << endl; - - uint32_t c; - ret = db->truncate (NULL, &c, 0); - if (ret != 0) - { - derr(1) << "db truncate failed: " << db_strerror (ret) << endl; - return -EIO; // ??? - } - - Dbt key (OSBDB_SUPERBLOCK_KEY, 1); - struct stored_superblock sb; - sb.version = OSBDB_THIS_VERSION; - Dbt value (&sb, sizeof (sb)); - - dout(3) << "..writing superblock" << endl; - if ((ret = db->put (NULL, &key, &value, 0)) != 0) - { - derr(1) << "failed to write superblock: " << db_strerror (ret) - << endl; - return -EIO; - } - dout(3) << "..wrote superblock" << endl; - dout(4) << "..mkfs done" << endl; - return 0; -} - - // Objects. - -int OSBDB::pick_object_revision_lt(object_t& oid) -{ - // Not really needed. - dout(0) << "pick_object_revision_lt " << oid << endl; - return -ENOSYS; -} - -bool OSBDB::exists(object_t oid) -{ - dout(2) << "exists " << oid << endl; - struct stat st; - bool ret = (stat (oid, &st) == 0); - dout(4) << "..returns " << ret << endl; - return ret; -} - -int OSBDB::statfs (struct statfs *st) -{ - // Hacky? - if (::statfs (device.c_str(), st) != 0) - { - int ret = -errno; - derr(1) << "statfs returns " << ret << endl; - return ret; - } - st->f_type = OSBDB_MAGIC; - dout(4) << "..statfs OK" << endl; - return 0; -} - -int OSBDB::stat(object_t oid, struct stat *st) -{ - if (!mounted) - { - dout(4) << "not mounted!" 
<< endl; - return -EINVAL; - } - - dout(2) << "stat " << oid << endl; - - object_inode_key ikey = new_object_inode_key(oid); - stored_object obj; - Dbt key (&ikey, sizeof_object_inode_key()); - Dbt value (&obj, sizeof (obj)); - value.set_flags (DB_DBT_USERMEM); - value.set_ulen (sizeof (obj)); - - dout(3) << " lookup " << ikey << endl; - int ret; - if ((ret = db->get (NULL, &key, &value, 0)) != 0) - { - derr(1) << " get returned " << ret << endl; - return -ENOENT; - } - - st->st_size = obj.length; - dout(3) << "stat length:" << obj.length << endl; - dout(4) << "..stat OK" << endl; - return 0; -} - -int OSBDB::remove(object_t oid, Context *onsafe) -{ - if (!mounted) - { - derr(1) << "not mounted!" << endl; - return -EINVAL; - } - - dout(2) << "remove " << oid << endl; - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - oid_t id; - mkoid(id, oid); - Dbt key (&id, sizeof (oid_t)); - db->del (NULL, &key, 0); - - object_inode_key _ikey = new_object_inode_key (oid); - Dbt ikey (&_ikey, sizeof_object_inode_key()); - db->del (txn, &ikey, 0); - - attrs_id aids = new_attrs_id (oid); - Dbt askey (&aids, sizeof_attrs_id()); - Dbt asval; - asval.set_flags (DB_DBT_MALLOC); - if (db->get (txn, &askey, &asval, 0) == 0) - { - // We have attributes; remove them. - stored_attrs *sap = (stored_attrs *) asval.get_data(); - auto_ptr sa (sap); - for (unsigned i = 0; i < sap->count; i++) - { - attr_id aid = new_attr_id (oid, sap->names[i].name); - Dbt akey (&aid, sizeof (aid)); - db->del (txn, &akey, 0); - } - db->del (txn, &askey, 0); - } - - // XXX check del return value - - if (txn) - txn->commit (0); - dout(4) << "..remove OK" << endl; - return 0; -} - -int OSBDB::truncate(object_t oid, off_t size, Context *onsafe) -{ - if (!mounted) - { - derr(1) << "not mounted!" << endl; - return -EINVAL; - } - - dout(2) << "truncate " << size << endl; - - if (size > 0xFFFFFFFF) - { - derr(1) << "object size too big!" 
<< endl; - return -ENOSPC; - } - - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - object_inode_key ikey = new_object_inode_key (oid); - stored_object obj; - Dbt key (&ikey, sizeof_object_inode_key()); - Dbt value (&obj, sizeof (obj)); - value.set_dlen (sizeof (obj)); - value.set_ulen (sizeof (obj)); - value.set_flags (DB_DBT_USERMEM); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - dout(4) << "..returns -ENOENT" << endl; - return -ENOENT; - } - - if (obj.length < size) - { - oid_t id; - mkoid (id, oid); - Dbt okey (&id, sizeof (oid_t)); - char b[] = { '\0' }; - Dbt newVal (b, 1); - newVal.set_doff ((size_t) size); - newVal.set_dlen (1); - newVal.set_ulen (1); - newVal.set_flags (DB_DBT_PARTIAL); - if (db->put (txn, &okey, &newVal, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << ".updating object failed" << endl; - return -EIO; - } - - obj.length = size; - value.set_ulen (sizeof (obj)); - if (db->put (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << ".updating object info failed" << endl; - return -EIO; - } - } - else if (obj.length > size) - { - obj.length = size; - Dbt tval (&obj, sizeof (obj)); - tval.set_ulen (sizeof (obj)); - tval.set_flags (DB_DBT_USERMEM); - if (db->put (txn, &key, &tval, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << ".updating object info failed" << endl; - return -EIO; - } - if (size == 0) - { - char x[1]; - oid_t id; - mkoid (id, oid); - Dbt okey (&id, sizeof (oid_t)); - Dbt oval (&x, 0); - if (db->put (txn, &okey, &oval, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << ".updating object failed" << endl; - return -EIO; - } - } - else - { - oid_t id; - mkoid (id, oid); - Dbt okey (&id, sizeof (oid_t)); - Dbt oval; - oval.set_flags (DB_DBT_MALLOC); - if (db->get (txn, &okey, &oval, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << ".getting old object failed" << endl; - return -EIO; - } - auto_ptr ovalPtr ((char *) oval.get_data()); - 
oval.set_size ((size_t) size); - oval.set_ulen ((size_t) size); - if (db->put (txn, &okey, &oval, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << ".putting new object failed" << endl; - return -EIO; - } - } - } - - if (txn) - txn->commit (0); - - dout(4) << "..truncate OK" << endl; - return 0; -} - -int OSBDB::read(object_t oid, off_t offset, size_t len, bufferlist& bl) -{ - if (!mounted) - { - derr(1) << "not mounted!" << endl; - return -EINVAL; - } - - dout(2) << "read " << oid << " " << offset << " " - << len << endl; - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - object_inode_key _ikey = new_object_inode_key (oid); - stored_object obj; - Dbt ikey (&_ikey, sizeof_object_inode_key()); - Dbt ival (&obj, sizeof (obj)); - ival.set_flags (DB_DBT_USERMEM); - ival.set_ulen (sizeof(obj)); - - dout(3) << " get " << _ikey << endl; - int ret; - if ((ret = db->get (txn, &ikey, &ival, 0)) != 0) - { - if (txn) - txn->abort(); - derr(1) << "get returned " << db_strerror (ret) << endl; - return -ENOENT; - } - - if (offset == 0 && len >= obj.length) - { - len = obj.length; - dout(3) << " doing full read of " << len << endl; - oid_t id; - mkoid (id, oid); - Dbt key (&id, sizeof (oid_t)); - Dbt value (bl.c_str(), len); - value.set_ulen (len); - value.set_flags (DB_DBT_USERMEM); - dout(3) << " getting " << oid << endl; - if ((ret = db->get (txn, &key, &value, 0)) != 0) - { - derr(1) << " get returned " << db_strerror (ret) << endl; - if (txn) - txn->abort(); - return -EIO; - } - } - else - { - if (offset > obj.length) - { - dout(2) << "..offset out of range" << endl; - return 0; - } - if (offset + len > obj.length) - len = obj.length - (size_t) offset; - dout(3) << " doing partial read of " << len << endl; - oid_t id; - mkoid (id, oid); - Dbt key (&id, sizeof (oid)); - Dbt value (bl.c_str(), len); - value.set_doff ((size_t) offset); - value.set_dlen (len); - value.set_ulen (len); - value.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - dout(3) 
<< " getting " << oid << endl; - if ((ret = db->get (txn, &key, &value, 0)) != 0) - { - derr(1) << "get returned " << db_strerror (ret) << endl; - if (txn) - txn->abort(); - return -EIO; - } - } - - if (txn) - txn->commit (0); - dout(4) << "..read OK, returning " << len << endl; - return len; -} - -int OSBDB::write(object_t oid, off_t offset, size_t len, - bufferlist& bl, Context *onsafe) -{ - if (!mounted) - { - derr(1) << "not mounted!" << endl; - return -EINVAL; - } - - dout(2) << "write " << oid << " " << offset << " " - << len << endl; - - if (offset > 0xFFFFFFFFL || offset + len > 0xFFFFFFFFL) - { - derr(1) << "object too big" << endl; - return -ENOSPC; - } - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (txn, &txn, 0); - - object_inode_key _ikey = new_object_inode_key (oid); - stored_object obj; - Dbt ikey (&_ikey, sizeof_object_inode_key()); - Dbt ival (&obj, sizeof (obj)); - ival.set_ulen (sizeof (obj)); - ival.set_flags (DB_DBT_USERMEM); - - int ret; - dout(3) << "..getting " << _ikey << endl; - if (db->get (txn, &ikey, &ival, 0) != 0) - { - dout(3) << "..writing new object" << endl; - - // New object. 
- obj.length = (size_t) offset + len; - dout(3) << "..mapping " << _ikey << " => " - << obj << endl; - if ((ret = db->put (txn, &ikey, &ival, 0)) != 0) - { - derr(1) << "..put returned " << db_strerror (ret) << endl; - if (txn) - txn->abort(); - return -EIO; - } - - oid_t id; - mkoid (id, oid); - Dbt key (&id, sizeof (oid_t)); - Dbt value (bl.c_str(), len); - if (offset == 0) // whole object - { - value.set_flags (DB_DBT_USERMEM); - value.set_ulen (len); - } - else - { - value.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - value.set_ulen (len); - value.set_doff ((size_t) offset); - value.set_dlen (len); - } - dout(3) << "..mapping " << oid << " => (" - << obj.length << " bytes)" << endl; - if ((ret = db->put (txn, &key, &value, 0)) != 0) - { - derr(1) << "..put returned " << db_strerror (ret) << endl; - if (txn) - txn->abort(); - return -EIO; - } - - if (txn) - txn->commit (0); - - dout(4) << "..write OK, returning " << len << endl; - return len; - } - - if (offset == 0 && len >= obj.length) - { - if (len != obj.length) - { - obj.length = len; - if ((ret = db->put (txn, &ikey, &ival, 0)) != 0) - { - derr(1) << " put returned " << db_strerror (ret) << endl; - if (txn) - txn->abort(); - return -EIO; - } - } - oid_t id; - mkoid(id, oid); - Dbt key (&id, sizeof (oid_t)); - Dbt value (bl.c_str(), len); - if (db->put (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << "..writing object failed!" << endl; - return -EIO; - } - } - else - { - if (offset + len > obj.length) - { - obj.length = (size_t) offset + len; - if (db->put (txn, &ikey, &ival, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << "..writing object info failed!" 
<< endl; - return -EIO; - } - } - oid_t id; - mkoid(id, oid); - Dbt key (&id, sizeof (oid_t)); - Dbt value (bl.c_str(), len); - value.set_doff ((size_t) offset); - value.set_dlen (len); - value.set_ulen (len); - value.set_flags (DB_DBT_PARTIAL); - if (db->put (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << "..writing object failed!" << endl; - return -EIO; - } - } - - if (txn) - txn->commit (0); - - dout(4) << "..write OK, returning " << len << endl; - return len; -} - -int OSBDB::clone(object_t oid, object_t noid) -{ - if (!mounted) - { - derr(1) << "not mounted!" << endl; - return -EINVAL; - } - - dout(2) << "clone " << oid << ", " << noid << endl; - - if (exists (noid)) - { - dout(4) << "..target exists; returning -EEXIST" << endl; - return -EEXIST; - } - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - object_inode_key _ikey = new_object_inode_key (oid); - object_inode_key _nikey = new_object_inode_key (noid); - stored_object obj; - Dbt ikey (&_ikey, sizeof_object_inode_key()); - Dbt ival (&obj, sizeof (obj)); - Dbt nikey (&_nikey, sizeof_object_inode_key()); - ival.set_ulen (sizeof (obj)); - ival.set_flags (DB_DBT_USERMEM); - - oid_t id, nid; - mkoid(id, oid); - mkoid(nid, noid); - Dbt key (&id, sizeof (oid_t)); - Dbt nkey (&oid, sizeof (oid_t)); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - if (db->get (txn, &ikey, &ival, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << "..getting object info failed!" 
<< endl; - return -ENOENT; - } - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << "..getting original object failed" << endl; - return -ENOENT; - } - auto_ptr valueptr ((char *) value.get_data()); - - if (db->put (txn, &nikey, &ival, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << "..putting object info failed" << endl; - return -EIO; - } - if (db->put (txn, &nkey, &value, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << "..putting new object failed" << endl; - return -EIO; - } - - if (txn) - txn->commit (0); - - dout(4) << "..clone OK" << endl; - return 0; -} - - // Collections - -int OSBDB::list_collections(list& ls) -{ - if (!mounted) - { - derr(1) << "not mounted!" << endl; - return -EINVAL; - } - - dout(2) << "list_collections" << endl; - - Dbt key (COLLECTIONS_KEY, 1); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - if (db->get (NULL, &key, &value, 0) != 0) - { - dout(4) << "..no collections" << endl; - return 0; // no collections. - } - - auto_ptr sc ((stored_colls *) value.get_data()); - stored_colls *scp = sc.get(); - for (uint32_t i = 0; i < sc->count; i++) - ls.push_back (scp->colls[i]); - - dout(4) << "..list_collections returns " << scp->count << endl; - return scp->count; -} - -int OSBDB::create_collection(coll_t c, Context *onsafe) -{ - if (!mounted) - { - derr(1) << "not mounted" << endl; - return -EINVAL; - } - - dout(2) << "create_collection " << hex << c << dec << endl; - - Dbt key (COLLECTIONS_KEY, 1); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - stored_colls *scp = NULL; - size_t sz = 0; - bool created = false; - if (db->get (txn, &key, &value, 0) != 0) - { - sz = sizeof (stored_colls) + sizeof (coll_t); - scp = (stored_colls *) malloc (sz); - scp->count = 0; - created = true; - } - else - { - scp = (stored_colls *) value.get_data(); - sz = value.get_size(); - } - - auto_ptr sc (scp); - int ins = 0; - if 
(scp->count > 0) - ins = binary_search (scp->colls, scp->count, c); - if (scp->colls[ins] == c) - { - if (txn != NULL) - txn->abort(); - derr(1) << ".collection " << c << " already exists " << endl; - return -EEXIST; - } - - dout(3) << "..insertion point: " << ins << endl; - - // Make room for a new collection ID. - if (!created) - { - sz += sizeof (coll_t); - dout(3) << "..increase size to " << sz << endl; - stored_colls *scp2 = (stored_colls *) realloc (scp, sz); - sc.release (); - sc.reset (scp2); - scp = scp2; - } - - int n = (scp->count - ins) * sizeof (coll_t); - if (n > 0) - { - dout(3) << "..moving " << n << " bytes up" << endl; - memmove (&scp->colls[ins + 1], &scp->colls[ins], n); - } - scp->count++; - scp->colls[ins] = c; - - dout(3) << "..collections: " << scp << endl; - - // Put the modified collection list back. - { - Dbt value2 (scp, sz); - if (db->put (txn, &key, &value2, 0) != 0) - { - if (txn != NULL) - txn->abort(); - derr(1) << ".writing new collections list failed" << endl; - return -EIO; - } - } - - // Create the new collection. 
- { - stored_coll new_coll; - new_coll.count = 0; - Dbt coll_key (&c, sizeof (coll_t)); - Dbt coll_value (&new_coll, sizeof (stored_coll)); - if (db->put (txn, &coll_key, &coll_value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - derr(1) << ".writing new collection failed" << endl; - return -EIO; - } - } - - if (txn != NULL) - txn->commit (0); - - dout(4) << "..create_collection OK" << endl; - return 0; -} - -int OSBDB::destroy_collection(coll_t c, Context *onsafe) -{ - if (!mounted) - { - derr(1) << "not mounted" << endl; - return -EINVAL; - } - - dout(2) << "destroy_collection " << hex << c << dec << endl; - - Dbt key (COLLECTIONS_KEY, 1); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - derr(1) << ".collection list doesn't exist" << endl; - return -ENOENT; // XXX - } - - stored_colls *scp = (stored_colls *) value.get_data(); - auto_ptr valueBuf (scp); - if (scp->count == 0) - { - if (txn != NULL) - txn->abort(); - derr(1) << ".collection " << c << " not listed" << endl; - return -ENOENT; - } - uint32_t ins = binary_search (scp->colls, scp->count, c); - dout(4) << "..insertion point is " << ins << endl; - if (scp->colls[ins] != c) - { - if (txn != NULL) - txn->abort(); - derr(1) << ".collection " << c << " not listed" << endl; - return -ENOENT; - } - - dout(4) << "..collections list is " << scp << endl; - - // Move the rest of the list down in memory, if needed. - if (ins < scp->count) - { - size_t n = scp->count - ins - 1; - dout(4) << "..shift list down " << n << endl; - memmove (&scp->colls[ins], &scp->colls[ins + 1], n); - } - - dout(4) << "..collections list is " << scp << endl; - - // Modify the record size to be one less. 
- Dbt nvalue (scp, value.get_size() - sizeof (coll_t)); - nvalue.set_flags (DB_DBT_USERMEM); - if (db->put (txn, &key, &nvalue, 0) != 0) - { - if (txn != NULL) - txn->abort(); - derr(1) << ".putting modified collection list failed" << endl; - return -EIO; - } - - // Delete the collection. - Dbt collKey (&c, sizeof (coll_t)); - if (db->del (txn, &collKey, 0) != 0) - { - if (txn != NULL) - txn->abort(); - derr(1) << ".deleting collection failed" << endl; - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - dout(4) << "..destroy_collection OK" << endl; - return 0; -} - -bool OSBDB::collection_exists(coll_t c) -{ - if (!mounted) - { - derr(1) << "not mounted" << endl; - return -EINVAL; - } - - dout(2) << "collection_exists " << hex << c << dec << endl; - - /*Dbt key (COLLECTIONS_KEY, 1); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - if (db->get (NULL, &key, &value, 0) != 0) - { - dout(4) << "..no collection list; return false" << endl; - return false; - } - - stored_colls *scp = (stored_colls *) value.get_data(); - auto_ptr sc (scp); - dout(5) << "..collection list is " << scp << endl; - if (scp->count == 0) - { - dout(4) << "..empty collection list; return false" << endl; - return false; - } - uint32_t ins = binary_search (scp->colls, scp->count, c); - dout(4) << "..insertion point is " << ins << endl; - - int ret = (scp->colls[ins] == c); - dout(4) << "..returns " << ret << endl; - return ret;*/ - - Dbt key (&c, sizeof (coll_t)); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - if (db->get (NULL, &key, &value, 0) != 0) - { - dout(4) << "..no collection, return false" << endl; - return false; - } - void *val = value.get_data(); - free (val); - dout(4) << "..collection exists; return true" << endl; - return true; -} - -int OSBDB::collection_stat(coll_t c, struct stat *st) -{ - if (!mounted) - { - derr(1) << "not mounted" << endl; - return -EINVAL; - } - - dout(2) << "collection_stat " << c << endl; - // XXX is this needed? 
- return -ENOSYS; -} - -int OSBDB::collection_add(coll_t c, object_t o, Context *onsafe) -{ - if (!mounted) - { - dout(2) << "not mounted" << endl; - return -EINVAL; - } - - dout(2) << "collection_add " << hex << c << dec << " " << o << endl; - - Dbt key (&c, sizeof (coll_t)); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - derr(1) << "failed to find collection" << endl; - return -ENOENT; - } - - size_t sz = value.get_size(); - stored_coll *scp = (stored_coll *) value.get_data(); - auto_ptr sc (scp); - - // Find the insertion point for the new object ID. - uint32_t ins = 0; - if (scp->count > 0) - { - ins = binary_search (scp->objects, scp->count, o); - // Already there? - if (scp->objects[ins] == o) - { - if (txn != NULL) - txn->abort(); - derr(1) << "collection already has object" << endl; - return -EEXIST; - } - } - - // Make room for the new value, and add it. 
- sz += sizeof (object_t); - scp = (stored_coll *) realloc (scp, sz); - sc.release(); - sc.reset (scp); - dout(3) << "..current collection: " << scp << endl; - if (ins < scp->count - 1) - { - size_t n = (scp->count - ins) * sizeof (object_t); - dout(3) << "..move up " << n << " bytes" << endl; - memmove (&scp->objects[ins + 1], &scp->objects[ins], n); - } - scp->count++; - scp->objects[ins] = o; - - dout(3) << "..collection: " << scp << endl; - - Dbt nvalue (scp, sz); - if (db->put (txn, &key, &nvalue, 0) != 0) - { - if (txn != NULL) - txn->abort(); - derr(1) << "..putting modified collection failed" << endl; - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - dout(4) << "..collection add OK" << endl; - return 0; -} - -int OSBDB::collection_remove(coll_t c, object_t o, Context *onsafe) -{ - if (!mounted) - { - derr(1) << "not mounted" << endl; - return -EINVAL; - } - - dout(2) << "collection_remove " << hex << c << dec << " " << o << endl; - - Dbt key (&c, sizeof (coll_t)); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - dout(1) << "..collection doesn't exist" << endl; - return -ENOENT; - } - - stored_coll *scp = (stored_coll *) value.get_data(); - auto_ptr sc (scp); - - dout(5) << "..collection is " << scp << endl; - if (scp->count == 0) - { - if (txn != NULL) - txn->abort(); - dout(1) << "..collection is empty" << endl; - return -ENOENT; - } - uint32_t ins = binary_search (scp->objects, scp->count, o); - dout(4) << "..insertion point is " << ins << endl; - if (scp->objects[ins] != o) - { - if (txn != NULL) - txn->abort(); - dout(1) << "..object not in collection" << endl; - return -ENOENT; - } - - if (ins < scp->count - 1) - { - size_t n = (scp->count - ins - 1) * sizeof (object_t); - dout(5) << "..moving " << n << " bytes down" << endl; - memmove (&scp->objects[ins], &scp->objects[ins + 1], 
n); - } - scp->count--; - - dout(3) << "..collection " << scp << endl; - - Dbt nval (scp, value.get_size() - sizeof (object_t)); - if (db->put (txn, &key, &nval, 0) != 0) - { - if (txn != NULL) - txn->abort(); - derr(1) << "..putting modified collection failed" << endl; - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - dout(4) << "..collection remove OK" << endl; - return 0; -} - -int OSBDB::collection_list(coll_t c, list& o) -{ - if (!mounted) - { - derr(1) << "not mounted" << endl; - return -EINVAL; - } - - Dbt key (&c, sizeof (coll_t)); - Dbt value; - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - return -ENOENT; - } - - stored_coll *scp = (stored_coll *) value.get_data(); - auto_ptr sc (scp); - for (uint32_t i = 0; i < scp->count; i++) - o.push_back (scp->objects[i]); - - if (txn != NULL) - txn->commit (0); - return 0; -} - - // Attributes - -int OSBDB::_setattr(object_t oid, const char *name, - const void *value, size_t size, Context *onsafe, - DbTxn *txn) -{ - if (!mounted) - return -EINVAL; - - if (strlen (name) >= OSBDB_MAX_ATTR_LEN) - return -ENAMETOOLONG; - - // Add name to attribute list, if needed. 
- attrs_id aids = new_attrs_id (oid); - Dbt attrs_key (&aids, sizeof_attrs_id()); - Dbt attrs_val; - attrs_val.set_flags (DB_DBT_MALLOC); - stored_attrs *sap = NULL; - size_t sz = 0; - - dout(3) << " getting " << aids << endl; - if (db->get (txn, &attrs_key, &attrs_val, 0) != 0) - { - dout(2) << " first attribute" << endl; - sz = sizeof (stored_attrs); - sap = (stored_attrs *) malloc(sz); - sap->count = 0; - } - else - { - sz = attrs_val.get_size(); - sap = (stored_attrs *) attrs_val.get_data(); - dout(2) << "..add to list of " << sap->count << " attrs" << endl; - } - auto_ptr sa (sap); - - attr_name _name; - strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN); - - int ins = 0; - if (sap->count > 0) - ins = binary_search (sap->names, sap->count, _name); - dout(3) << "..insertion point is " << ins << endl; - if (sap->count == 0 || strcmp (sap->names[ins].name, name) != 0) - { - sz += sizeof (attr_name); - dout(3) << "..realloc " << ((void *) sap) << " to " - << dec << sz << endl; - sap = (stored_attrs *) realloc (sap, sz); - dout(3) << "..returns " << ((void *) sap) << endl; - sa.release (); - sa.reset (sap); - int n = (sap->count - ins) * sizeof (attr_name); - if (n > 0) - { - dout(3) << "..move " << n << " bytes from 0x" - << hex << (&sap->names[ins]) << " to 0x" - << hex << (&sap->names[ins+1]) << dec << endl; - memmove (&sap->names[ins+1], &sap->names[ins], n); - } - memset (&sap->names[ins], 0, sizeof (attr_name)); - strncpy (sap->names[ins].name, name, OSBDB_MAX_ATTR_LEN); - sap->count++; - - Dbt newAttrs_val (sap, sz); - newAttrs_val.set_ulen (sz); - newAttrs_val.set_flags (DB_DBT_USERMEM); - dout(3) << "..putting " << aids << endl; - if (db->put (txn, &attrs_key, &newAttrs_val, 0) != 0) - { - derr(1) << ".writing attributes list failed" << endl; - return -EIO; - } - } - else - { - dout(3) << "..attribute " << name << " already exists" << endl; - } - - dout(5) << "..attributes list: " << sap << endl; - - // Add the attribute. 
- attr_id aid = new_attr_id (oid, name); - Dbt attr_key (&aid, sizeof (aid)); - Dbt attr_val ((void *) value, size); - dout(3) << "..writing attribute key " << aid << endl; - if (db->put (txn, &attr_key, &attr_val, 0) != 0) - { - derr(1) << ".writing attribute key failed" << endl; - return -EIO; - } - - dout(4) << "..setattr OK" << endl; - return 0; -} - -int OSBDB::setattr(object_t oid, const char *name, - const void *value, size_t size, - Context *onsafe) -{ - if (!mounted) - return -EINVAL; - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - dout(2) << "setattr " << oid << ":" << name << " => (" - << size << " bytes)" << endl; - int ret = _setattr (oid, name, value, size, onsafe, txn); - if (ret == 0) - { - if (txn != NULL) - txn->commit (0); - } - else - { - if (txn != NULL) - txn->abort(); - } - return ret; -} - -int OSBDB::setattrs(object_t oid, map& aset, - Context *onsafe) -{ - if (!mounted) - return -EINVAL; - - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - map::iterator it; - for (it = aset.begin(); it != aset.end(); it++) - { - string name = it->first; - bufferptr value = it->second; - int ret = _setattr (oid, name.c_str(), value.c_str(), - value.length(), onsafe, txn); - if (ret != 0) - { - if (txn != NULL) - txn->abort(); - return ret; - } - } - - if (txn != NULL) - txn->commit (0); - return 0; -} - -int OSBDB::_getattr (object_t oid, const char *name, void *value, size_t size) -{ - if (!mounted) - return -EINVAL; - - dout(2) << "_getattr " << oid << " " << name << " " << size << endl; - - attr_id aid = new_attr_id (oid, name); - Dbt key (&aid, sizeof (aid)); - Dbt val (value, size); - val.set_ulen (size); - val.set_doff (0); - val.set_dlen (size); - val.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - - if (db->get (NULL, &key, &val, 0) != 0) - { - derr(1) << ".getting value failed" << endl; - return -ENOENT; - } - - dout(4) << ".._getattr OK; returns " << val.get_size() << endl; - return 
val.get_size(); -} - -int OSBDB::getattr(object_t oid, const char *name, void *value, size_t size) -{ - if (!mounted) - return -EINVAL; - - return _getattr (oid, name, value, size); -} - -int OSBDB::getattrs(object_t oid, map& aset) -{ - if (!mounted) - return -EINVAL; - - for (map::iterator it = aset.begin(); - it != aset.end(); it++) - { - int ret = _getattr (oid, (*it).first.c_str(), - (*it).second.c_str(), - (*it).second.length()); - if (ret < 0) - return ret; - } - return 0; -} - -int OSBDB::rmattr(object_t oid, const char *name, Context *onsafe) -{ - if (!mounted) - return -EINVAL; - - dout(2) << "rmattr " << oid << " " << name << endl; - - attrs_id aids = new_attrs_id (oid); - Dbt askey (&aids, sizeof_attrs_id()); - Dbt asvalue; - asvalue.set_flags (DB_DBT_MALLOC); - - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &askey, &asvalue, 0) != 0) - { - if (txn != NULL) - txn->abort(); - return -ENOENT; - } - - stored_attrs *sap = (stored_attrs *) asvalue.get_data(); - auto_ptr sa (sap); - - dout(5) << "..attributes list " << sap << endl; - - if (sap->count == 0) - { - if (txn != NULL) - txn->abort(); - derr(1) << ".empty attribute list" << endl; - return -ENOENT; - } - - attr_name _name; - memset(&_name, 0, sizeof (_name)); - strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN); - int ins = binary_search (sap->names, sap->count, _name); - dout(4) << "..insertion point is " << ins << endl; - if (strcmp (sap->names[ins].name, name) != 0) - { - if (txn != NULL) - txn->abort(); - derr(1) << ".attribute not found in list" << endl; - return -ENOENT; - } - - // Shift the later elements down by one, if needed. 
- int n = (sap->count - ins) * sizeof (attr_name); - if (n > 0) - { - dout(4) << "..shift down by " << n << endl; - memmove (&(sap->names[ins]), &(sap->names[ins + 1]), n); - } - sap->count--; - dout(5) << "..attributes list now " << sap << endl; - - asvalue.set_size(asvalue.get_size() - sizeof (attr_name)); - int ret; - if ((ret = db->put (txn, &askey, &asvalue, 0)) != 0) - { - derr(1) << "put stored_attrs " << db_strerror (ret) << endl; - if (txn != NULL) - txn->abort(); - return -EIO; - } - - // Remove the attribute. - attr_id aid = new_attr_id (oid, name); - Dbt key (&aid, sizeof (aid)); - if ((ret = db->del (txn, &key, 0)) != 0) - { - derr(1) << "deleting " << aid << ": " << db_strerror(ret) << endl; - if (txn != NULL) - txn->abort(); - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - dout(4) << "..rmattr OK" << endl; - return 0; -} - -int OSBDB::listattr(object_t oid, char *attrs, size_t size) -{ - if (!mounted) - return -EINVAL; - - dout(2) << "listattr " << oid << endl; - - attrs_id aids = new_attrs_id (oid); - Dbt key (&aids, sizeof_attrs_id()); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - // XXX Transactions for read atomicity??? - - int ret; - if ((ret = db->get (NULL, &key, &value, 0)) != 0) - { - derr(1) << "fetching " << aids << ": " << db_strerror (ret) - << endl; - return -ENOENT; - } - - stored_attrs *attrsp = (stored_attrs *) value.get_data(); - auto_ptr _attrs (attrsp); - size_t s = 0; - char *p = attrs; - for (unsigned i = 0; i < attrsp->count && s < size; i++) - { - int n = MIN (OSBDB_MAX_ATTR_LEN, - MIN (strlen (attrsp->names[i].name), size - s - 1)); - strncpy (p, attrsp->names[i].name, n); - p[n] = '\0'; - p = p + n + 1; - } - - dout(4) << "listattr OK" << endl; - return 0; -} - - // Collection attributes. 
- -int OSBDB::collection_setattr(coll_t cid, const char *name, - const void *value, size_t size, - Context *onsafe) -{ - if (!mounted) - return -EINVAL; - - dout(2) << "collection_setattr " << hex << cid << dec << " " << name - << " (" << size << " bytes)" << endl; - if (strlen (name) >= OSBDB_MAX_ATTR_LEN) - { - derr(1) << "name too long" << endl; - return -ENAMETOOLONG; - } - - // Add name to attribute list, if needed. - coll_attrs_id aids = new_coll_attrs_id (cid); - Dbt attrs_key (&aids, sizeof_coll_attrs_id()); - Dbt attrs_val; - attrs_val.set_flags (DB_DBT_MALLOC); - stored_attrs *sap = NULL; - size_t sz = 0; - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - dout(3) << " getting " << aids << endl; - if (db->get (txn, &attrs_key, &attrs_val, 0) != 0) - { - dout(2) << " first attribute" << endl; - sz = sizeof (stored_attrs); - sap = (stored_attrs *) malloc(sz); - sap->count = 0; - } - else - { - sz = attrs_val.get_size(); - sap = (stored_attrs *) attrs_val.get_data(); - dout(2) << " add to list of " << sap->count << " attrs" << endl; - } - auto_ptr sa (sap); - - attr_name _name; - strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN); - - int ins = 0; - if (sap->count > 0) - ins = binary_search (sap->names, sap->count, _name); - dout(3) << " insertion point is " << ins << endl; - if (sap->count == 0 || strcmp (sap->names[ins].name, name) != 0) - { - sz += sizeof (attr_name); - dout(3) << " realloc " << hex << ((void *) sap) << " to " - << dec << sz << endl; - sap = (stored_attrs *) realloc (sap, sz); - dout(3) << " returns " << hex << ((void *) sap) << dec << endl; - sa.release (); - sa.reset (sap); - int n = (sap->count - ins) * sizeof (attr_name); - if (n > 0) - { - dout(3) << " move " << n << " bytes from 0x" - << hex << (&sap->names[ins]) << " to 0x" - << hex << (&sap->names[ins+1]) << dec << endl; - memmove (&sap->names[ins+1], &sap->names[ins], n); - } - memset (&sap->names[ins], 0, sizeof (attr_name)); - strncpy 
(sap->names[ins].name, name, OSBDB_MAX_ATTR_LEN); - sap->count++; - - Dbt newAttrs_val (sap, sz); - newAttrs_val.set_ulen (sz); - newAttrs_val.set_flags (DB_DBT_USERMEM); - dout(3) << " putting " << aids << endl; - if (db->put (txn, &attrs_key, &newAttrs_val, 0) != 0) - { - if (txn != NULL) - txn->abort(); - derr(1) << ".putting new attributes failed" << endl; - return -EIO; - } - } - else - { - dout(3) << "..attribute " << name << " already exists" << endl; - } - - dout(3) << "..attributes list: " << sap << endl; - - // Add the attribute. - coll_attr_id aid = new_coll_attr_id (cid, name); - Dbt attr_key (&aid, sizeof (aid)); - Dbt attr_val ((void *) value, size); - dout(3) << " writing attribute key " << aid << endl; - if (db->put (txn, &attr_key, &attr_val, 0) != 0) - { - if (txn != NULL) - txn->abort(); - derr(1) << ".putting attribute failed" << endl; - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - - dout(4) << "..collection setattr OK" << endl; - return 0; -} - -int OSBDB::collection_rmattr(coll_t cid, const char *name, - Context *onsafe) -{ - if (!mounted) - return -EINVAL; - - dout(2) << "collection_rmattr " << hex << cid << dec - << " " << name << endl; - - coll_attrs_id aids = new_coll_attrs_id (cid); - Dbt askey (&aids, sizeof_coll_attrs_id()); - Dbt asvalue; - asvalue.set_flags (DB_DBT_MALLOC); - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &askey, &asvalue, 0) != 0) - { - if (txn != NULL) - txn->abort(); - derr(1) << ".no attributes list" << endl; - return -ENOENT; - } - - stored_attrs *sap = (stored_attrs *) asvalue.get_data(); - auto_ptr sa (sap); - - dout(5) << "..attributes list " << sap << endl; - if (sap->count == 0) - { - if (txn != NULL) - txn->abort(); - derr(1) << ".empty attributes list" << endl; - return -ENOENT; - } - - attr_name _name; - memset(&_name, 0, sizeof (_name)); - strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN); - int ins = binary_search (sap->names, sap->count, 
_name); - if (strcmp (sap->names[ins].name, name) != 0) - { - if (txn != NULL) - txn->abort(); - derr(1) << ".attribute not listed" << endl; - return -ENOENT; - } - - // Shift the later elements down by one, if needed. - int n = (sap->count - ins) * sizeof (attr_name); - if (n > 0) - { - dout(4) << "..shift down by " << n << endl; - memmove (&(sap->names[ins]), &(sap->names[ins + 1]), n); - } - sap->count--; - dout(5) << "..attributes list now " << sap << endl; - - asvalue.set_size(asvalue.get_size() - sizeof (attr_name)); - int ret; - if ((ret = db->put (txn, &askey, &asvalue, 0)) != 0) - { - derr(1) << "put stored_attrs " << db_strerror (ret) << endl; - if (txn != NULL) - txn->abort(); - return -EIO; - } - - // Remove the attribute. - coll_attr_id aid = new_coll_attr_id (cid, name); - Dbt key (&aid, sizeof (aid)); - if ((ret = db->del (txn, &key, 0)) != 0) - { - derr(1) << "deleting " << aid << ": " << db_strerror(ret) << endl; - if (txn != NULL) - txn->abort(); - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - - dout(4) << "..collection rmattr OK" << endl; - return 0; -} - -int OSBDB::collection_getattr(coll_t cid, const char *name, - void *value, size_t size) -{ - if (!mounted) - return -EINVAL; - - dout(2) << "collection_getattr " << hex << cid << dec - << " " << name << endl; - - // XXX transactions/read isolation? 
- - coll_attr_id caid = new_coll_attr_id (cid, name); - Dbt key (&caid, sizeof (caid)); - Dbt val (value, size); - val.set_ulen (size); - val.set_dlen (size); - val.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - - if (db->get (NULL, &key, &val, 0) != 0) - { - derr(1) << ".no attribute entry" << endl; - return -ENOENT; - } - - dout(4) << "..collection getattr OK; returns " << val.get_size() << endl; - return val.get_size(); -} - -int OSBDB::collection_listattr(coll_t cid, char *attrs, size_t size) -{ - if (!mounted) - return -EINVAL; - - dout(2) << "collection_listattr " << hex << cid << dec << endl; - - // XXX transactions/read isolation? - - coll_attrs_id caids = new_coll_attrs_id (cid); - Dbt key (&caids, sizeof_coll_attrs_id()); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - int ret; - if ((ret = db->get (NULL, &key, &value, 0)) != 0) - { - derr(1) << "fetching " << caids << ": " << db_strerror (ret) - << endl; - return -ENOENT; - } - - stored_attrs *attrsp = (stored_attrs *) value.get_data(); - auto_ptr _attrs (attrsp); - size_t s = 0; - char *p = attrs; - for (unsigned i = 0; i < attrsp->count && s < size; i++) - { - int n = MIN (OSBDB_MAX_ATTR_LEN, - MIN (strlen (attrsp->names[i].name), size - s - 1)); - strncpy (p, attrsp->names[i].name, n); - p[n] = '\0'; - p = p + n + 1; - } - return 0; -} - - // Sync. - -void OSBDB::sync (Context *onsync) -{ - if (!mounted) - return; - - sync(); - // huh? -} - -void OSBDB::sync() -{ - if (!mounted) - return; - - db->sync(0); -} diff --git a/branches/marnberg/quota/osbdb/OSBDB.h b/branches/marnberg/quota/osbdb/OSBDB.h deleted file mode 100644 index 61cf4b16c48b5..0000000000000 --- a/branches/marnberg/quota/osbdb/OSBDB.h +++ /dev/null @@ -1,478 +0,0 @@ -/* OSBDB.h -- ObjectStore on Berkeley DB. 
-*- c++ -*- - Copyright (C) 2007 Casey Marshall - -Ceph - scalable distributed file system - -This is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License version 2.1, as published by the Free Software -Foundation. See file COPYING. */ - - -#include -#include "osd/ObjectStore.h" - -#define OSBDB_MAGIC 0x05BDB - -/* - * Maximum length of an attribute name. - */ -#define OSBDB_MAX_ATTR_LEN 256 - -#define OSBDB_THIS_VERSION 1 - -#define OSBDB_SUPERBLOCK_KEY ((void *) "s") - -/* - * The "superblock" of the BDB object store. We store one of these in - * the DB, to store version and other information. We don't record - * anything special here, just the version number the database was - * written with. - * - * In principle, this structure is variable-length, depending on the - * software version writing the superblock. - */ -struct stored_superblock -{ - uint32_t version; -}; - -inline ostream& operator<<(ostream& out, const stored_superblock sb) -{ - out << "osbdb.super(" << sb.version << ")" << endl; - return out; -} - -/** - * An object identifier; we define this so we can have a POD object to - * work with. - */ -struct oid_t // POD -{ - char id[16]; -}; - -inline void mkoid (oid_t& id, object_t& oid) -{ - // XXX byte order? - memcpy (id.id, &oid, sizeof (oid_t)); -} - -inline ostream& operator<<(ostream& out, const oid_t id) -{ - for (int i = 0; i < 16; i++) - { - out.fill('0'); - out << setw(2) << hex << (id.id[i] & 0xFF); - if ((i & 3) == 3) - out << ':'; - } - out.unsetf(ios::right); - out << dec; - return out; -} - -/** - * An "inode" key. We map a 'stored_object' struct to this key for - * every object. - */ -struct object_inode_key // POD -{ - oid_t oid; - char tag; -}; - -/** - * "Constructor" for an object_inode_key. 
- */ -inline object_inode_key new_object_inode_key (object_t& oid) -{ - object_inode_key key; - memset(&key, 0, sizeof (object_inode_key)); - mkoid (key.oid, oid); - key.tag = 'i'; - return key; -} - -/* - * We use this, instead of sizeof(), to try and guarantee that we - * don't include the structure padding, if any. - * - * This *should* return 17: sizeof (oid_t) == 16; sizeof (char) == 1. - */ -inline size_t sizeof_object_inode_key() -{ - return offsetof(object_inode_key, tag) + sizeof (char); -} - - // Frank Poole: Unfortunately, that sounds a little - // like famous last words. - // -- 2001: A Space Odyssey - -inline ostream& operator<<(ostream& out, const object_inode_key o) -{ - out << o.tag << "/" << o.oid; - return out; -} - -/** - * A stored object. This is essentially the "inode" of the object, - * containing things like the object's length. The object itself is - * stored as-is, mapped by the 128-bit object ID. - */ -struct stored_object -{ - uint32_t length; -}; - -inline ostream& operator<<(ostream& out, const stored_object s) -{ - out << "inode(l:" << s.length << ")"; - return out; -} - -/* - * Key referencing the list of attribute names for an object. This is - * simply the object's ID, with an additional character 'a' appended. - */ -struct attrs_id // POD -{ - oid_t oid; - char tag; -}; - -/* - * "Construtor" for attrs_id. - */ -inline struct attrs_id new_attrs_id (object_t& oid) -{ - attrs_id aid; - memset (&aid, 0, sizeof (attrs_id)); - mkoid(aid.oid, oid); - aid.tag = 'a'; - return aid; -} - -/* - * See explanation for sizeof_object_inode_id. - */ -inline size_t sizeof_attrs_id() -{ - return offsetof(struct attrs_id, tag) + sizeof (char); -} - -inline ostream& operator<<(ostream& out, const attrs_id id) -{ - out << id.tag << "/" << id.oid; - return out; -} - -/* - * Encapsulation of a single attribute name. 
- */ -struct attr_name // POD -{ - char name[OSBDB_MAX_ATTR_LEN]; -}; - -inline ostream& operator<<(ostream& out, const attr_name n) -{ - out << n.name; - return out; -} - -inline bool operator<(const attr_name n1, const attr_name n2) -{ - return (strncmp (n1.name, n2.name, OSBDB_MAX_ATTR_LEN) < 0); -} - -inline bool operator>(const attr_name n1, const attr_name n2) -{ - return (strncmp (n1.name, n2.name, OSBDB_MAX_ATTR_LEN) > 0); -} - -inline bool operator==(const attr_name n1, const attr_name n2) -{ - std::cerr << n1.name << " == " << n2.name << "?" << endl; - return (strncmp (n1.name, n2.name, OSBDB_MAX_ATTR_LEN) == 0); -} - -inline bool operator!=(const attr_name n1, const attr_name n2) -{ - return !(n1 == n2); -} - -inline bool operator>=(const attr_name n1, const attr_name n2) -{ - return !(n1 < n2); -} - -inline bool operator<=(const attr_name n1, const attr_name n2) -{ - return !(n1 > n2); -} - -/* - * A list of an object or collection's attribute names. - */ -struct stored_attrs -{ - uint32_t count; - attr_name names[0]; // actually variable-length -}; - -inline ostream& operator<<(ostream& out, const stored_attrs *sa) -{ - out << sa->count << " [ "; - for (unsigned i = 0; i < sa->count; i++) - out << sa->names[i] << (i == sa->count - 1 ? " " : ", "); - out << "]"; - return out; -} - -/* - * An object attribute key. An object attribute is mapped simply by - * the object ID appended with the attribute name. Attribute names - * may not be empty, and must be less than 256 characters, in this - * implementation. - */ -struct attr_id // POD -{ - oid_t oid; - attr_name name; -}; - -inline attr_id new_attr_id (object_t& oid, const char *name) -{ - attr_id aid; - memset(&aid, 0, sizeof (attr_id)); - mkoid (aid.oid, oid); - strncpy (aid.name.name, name, OSBDB_MAX_ATTR_LEN); - return aid; -} - -inline ostream& operator<<(ostream &out, const attr_id id) -{ - out << id.oid << ":" << id.name; - return out; -} - -/* - * A key for a collection attributes list. 
- */ -struct coll_attrs_id // POD -{ - coll_t cid; - char tag; -}; - -inline coll_attrs_id new_coll_attrs_id (coll_t cid) -{ - coll_attrs_id catts; - memset(&catts, 0, sizeof (coll_attrs_id)); - catts.cid = cid; - catts.tag = 'C'; - return catts; -} - -inline size_t sizeof_coll_attrs_id() -{ - return offsetof(coll_attrs_id, tag) + sizeof (char); -} - -inline ostream& operator<<(ostream& out, coll_attrs_id id) -{ - out << id.tag << "/" << id.cid; - return out; -} - -/* - * A collection attribute key. Similar to - */ -struct coll_attr_id // POD -{ - coll_t cid; - attr_name name; -}; - -inline coll_attr_id new_coll_attr_id (coll_t cid, const char *name) -{ - coll_attr_id catt; - memset(&catt, 0, sizeof (coll_attr_id)); - catt.cid = cid; - strncpy (catt.name.name, name, OSBDB_MAX_ATTR_LEN); - return catt; -} - -inline ostream& operator<<(ostream& out, coll_attr_id id) -{ - out << id.cid << ":" << id.name; - return out; -} - -/* - * This is the key we store the master collections list under. - */ -#define COLLECTIONS_KEY ((void *) "c") - -/* - * The master list of collections. There should be one of these per - * OSD. The sole reason for this structure is to have the ability - * to enumerate all collections stored on this OSD. - */ -struct stored_colls -{ - // The number of collections. - uint32_t count; - - // The collection identifiers. This is a sorted list of coll_t - // values. - coll_t colls[0]; // actually variable-length -}; - -inline ostream& operator<<(ostream& out, stored_colls *c) -{ - out << c->count << " [ "; - for (unsigned i = 0; i < c->count; i++) - { - out << hex << c->colls[i]; - if (i < c->count - 1) - out << ", "; - } - out << " ]" << dec; - return out; -} - -/* - * A stored collection (a bag of object IDs). These are referenced by - * the bare collection identifier type, a coll_t (thus, a 32-bit - * integer). Internally this is stored as a sorted list of object IDs. 
- * - * Note, this structure places all collection items in a single - * record; this may be a memory burden for large collections. - */ -struct stored_coll -{ - // The size of this collection. - uint32_t count; - - // The object IDs in this collection. This is a sorted list of all - // object ID's in this collection. - object_t objects[0]; // actually variable-length -}; - -inline ostream& operator<<(ostream& out, stored_coll *c) -{ - out << c->count << " [ "; - for (unsigned i = 0; i < c->count; i++) - { - out << c->objects[i]; - if (i < c->count - 1) - out << ", "; - } - out << " ]"; - return out; -} - -class OSBDBException : public std::exception -{ - const char *msg; - -public: - OSBDBException(const char *msg) : msg(msg) { } - const char *what() { return msg; } -}; - -/* - * The object store interface for Berkeley DB. - */ -class OSBDB : public ObjectStore -{ - private: - DbEnv *env; - Db *db; - string device; - string env_dir; - bool mounted; - bool opened; - bool transactional; - - public: - - OSBDB(const char *dev) throw(OSBDBException) - : env(0), db (0), device (dev), mounted(false), opened(false), - transactional(g_conf.bdbstore_transactional) - { - } - - ~OSBDB() - { - if (mounted) - { - umount(); - } - } - - int mount(); - int umount(); - int mkfs(); - - int statfs(struct statfs *buf); - - int pick_object_revision_lt(object_t& oid); - - bool exists(object_t oid); - int stat(object_t oid, struct stat *st); - - int remove(object_t oid, Context *onsafe=0); - - int truncate(object_t oid, off_t size, Context *onsafe=0); - - int read(object_t oid, off_t offset, size_t len, - bufferlist& bl); - int write(object_t oid, off_t offset, size_t len, - bufferlist& bl, Context *onsafe); - - int setattr(object_t oid, const char *name, - const void *value, size_t size, Context *onsafe=0); - int setattrs(object_t oid, map& aset, - Context *onsafe=0); - int getattr(object_t oid, const char *name, - void *value, size_t size); - int getattrs(object_t oid, map& aset); - 
int rmattr(object_t oid, const char *name, - Context *onsafe=0); - int listattr(object_t oid, char *attrs, size_t size); - - int clone(object_t oid, object_t noid); - - // Collections. - - int list_collections(list& ls); - int create_collection(coll_t c, Context *onsafe=0); - int destroy_collection(coll_t c, Context *onsafe=0); - bool collection_exists(coll_t c); - int collection_stat(coll_t c, struct stat *st); - int collection_add(coll_t c, object_t o, Context *onsafe=0); - int collection_remove(coll_t c, object_t o, Context *onsafe=0); - int collection_list(coll_t c, list& o); - - int collection_setattr(coll_t cid, const char *name, - const void *value, size_t size, - Context *onsafe=0); - int collection_rmattr(coll_t cid, const char *name, - Context *onsafe=0); - int collection_getattr(coll_t cid, const char *name, - void *value, size_t size); - int collection_listattr(coll_t cid, char *attrs, size_t size); - - void sync(Context *onsync); - void sync(); - -private: - int opendb (DBTYPE type=DB_UNKNOWN, int flags=0, bool new_env=false); - - int _setattr(object_t oid, const char *name, const void *value, - size_t size, Context *onsync, DbTxn *txn); - int _getattr(object_t oid, const char *name, void *value, size_t size); -}; diff --git a/branches/marnberg/quota/osd/Ager.cc b/branches/marnberg/quota/osd/Ager.cc deleted file mode 100644 index 038688c5cdfd5..0000000000000 --- a/branches/marnberg/quota/osd/Ager.cc +++ /dev/null @@ -1,331 +0,0 @@ - -#include "include/types.h" - -#include "Ager.h" -#include "ObjectStore.h" - -#include "config.h" -#include "common/Clock.h" - -// ick -#include "ebofs/Ebofs.h" -#include -#include -#include - -#ifdef DARWIN -#include -#include -#endif // DARWIN - - -int myrand() -{ - if (0) - return rand(); - else { - static int n = 0; - srand(n++); - return rand(); - } -} - - -object_t Ager::age_get_oid() { - if (!age_free_oids.empty()) { - object_t o = age_free_oids.front(); - age_free_oids.pop_front(); - return o; - } - object_t last = 
age_cur_oid; - ++age_cur_oid.bno; - return last; -} - -ssize_t Ager::age_pick_size() { - ssize_t max = file_size_distn.sample() * 1024; - return max/2 + (myrand() % 100) * max/200 + 1; -} - -bool start_debug = false; - -__uint64_t Ager::age_fill(float pc, utime_t until) { - int max = 1024*1024; - bufferptr bp(max); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - __uint64_t wrote = 0; - while (1) { - if (g_clock.now() > until) break; - - struct statfs st; - store->statfs(&st); - float free = 1.0 - ((float)(st.f_bfree) / (float)st.f_blocks); - float avail = 1.0 - ((float)(st.f_bavail) / (float)st.f_blocks); // to write to - //float a = (float)(st.f_bfree) / (float)st.f_blocks; - //dout(10) << "age_fill at " << a << " / " << pc << " .. " << st.f_blocks << " " << st.f_bavail << endl; - if (free >= pc) { - dout(2) << "age_fill at " << free << " / " << avail << " / " << " / " << pc << " stopping" << endl; - break; - } - - // make sure we can write to it.. - if (avail > .98 || - avail - free > .02) - store->sync(); - - object_t oid = age_get_oid(); - - int b = myrand() % 10; - age_objects[b].push_back(oid); - - ssize_t s = age_pick_size(); - wrote += (s + 4095) / 4096; - - - - - dout(2) << "age_fill at " << free << " / " << avail << " / " << pc << " creating " << hex << oid << dec << " sz " << s << endl; - - - if (false && !g_conf.ebofs_verify && start_debug && wrote > 1000000ULL) { - /* - - - 1005700 -? -1005000 -1005700 - 1005710 - 1005725ULL - 1005750ULL - 1005800 - 1006000 - -// 99 1000500 ? 
1000750 1006000 -*/ - g_conf.debug_ebofs = 30; - g_conf.ebofs_verify = true; - } - - off_t off = 0; - while (s) { - ssize_t t = MIN(s, max); - bufferlist sbl; - sbl.substr_of(bl, 0, t); - store->write(oid, off, t, sbl, false); - off += t; - s -= t; - } - oid.bno++; - } - - return wrote*4; // KB -} - -void Ager::age_empty(float pc) { - int nper = 20; - int n = nper; - - //g_conf.ebofs_verify = true; - - while (1) { - struct statfs st; - store->statfs(&st); - float free = 1.0 - ((float)(st.f_bfree) / (float)st.f_blocks); - float avail = 1.0 - ((float)(st.f_bavail) / (float)st.f_blocks); // to write to - dout(2) << "age_empty at " << free << " / " << avail << " / " << pc << endl;//" stopping" << endl; - if (free <= pc) { - dout(2) << "age_empty at " << free << " / " << avail << " / " << pc << " stopping" << endl; - break; - } - - int b = myrand() % 10; - n--; - if (n == 0 || age_objects[b].empty()) { - dout(2) << "age_empty sync" << endl; - //sync(); - //sync(); - n = nper; - continue; - } - object_t oid = age_objects[b].front(); - age_objects[b].pop_front(); - - dout(2) << "age_empty at " << free << " / " << avail << " / " << pc << " removing " << hex << oid << dec << endl; - - store->remove(oid); - age_free_oids.push_back(oid); - } - - g_conf.ebofs_verify = false; -} - -void pfrag(__uint64_t written, ObjectStore::FragmentationStat &st) -{ - cout << "#gb wr\ttotal\tn x\tavg x\tavg per\tavg j\tfree\tn fr\tavg fr\tnum<2\tsum<2\tnum<4\tsum<4\t..." - << endl; - cout << written - << "\t" << st.total - << "\t" << st.num_extent - << "\t" << st.avg_extent - << "\t" << st.avg_extent_per_object - << "\t" << st.avg_extent_jump - << "\t" << st.total_free - << "\t" << st.num_free_extent - << "\t" << st.avg_free_extent; - - int n = st.num_extent; - for (__uint64_t i=1; i <= 30; i += 1) { - cout << "\t" << st.extent_dist[i]; - cout << "\t" << st.extent_dist_sum[i]; - //cout << "\ta " << (st.extent_dist[i] ? 
(st.extent_dist_sum[i] / st.extent_dist[i]):0); - n -= st.extent_dist[i]; - if (n == 0) break; - } - cout << endl; -} - - -void Ager::age(int time, - float high_water, // fill to this % - float low_water, // then empty to this % - int count, // this many times - float final_water, // and end here ( <= low_water) - int fake_size_mb) { - - store->_fake_writes(true); - srand(0); - - utime_t start = g_clock.now(); - utime_t until = start; - until.sec_ref() += time; - - int elapsed = 0; - int freelist_inc = 60; - utime_t nextfl = start; - nextfl.sec_ref() += freelist_inc; - - while (age_objects.size() < 10) age_objects.push_back( list() ); - - if (fake_size_mb) { - int fake_bl = fake_size_mb * 256; - struct statfs st; - store->statfs(&st); - float f = (float)fake_bl / (float)st.f_blocks; - high_water = (float)high_water * f; - low_water = (float)low_water * f; - final_water = (float)final_water * f; - dout(2) << "fake " << fake_bl << " / " << st.f_blocks << " is " << f << ", high " << high_water << " low " << low_water << " final " << final_water << endl; - } - - // init size distn (once) - if (!did_distn) { - did_distn = true; - age_cur_oid = object_t(0,1); - file_size_distn.add(1, 19.0758125+0.65434375); - file_size_distn.add(512, 35.6566); - file_size_distn.add(1024, 27.7271875); - file_size_distn.add(2*1024, 16.63503125); - //file_size_distn.add(4*1024, 106.82384375); - //file_size_distn.add(8*1024, 81.493375); - //file_size_distn.add(16*1024, 14.13553125); - //file_size_distn.add(32*1024, 2.176); - //file_size_distn.add(256*1024, 0.655938); - //file_size_distn.add(512*1024, 0.1480625); - //file_size_distn.add(1*1024*1024, 0.020125); // actually 2, but 32bit - file_size_distn.normalize(); - } - - // clear - for (int i=0; i<10; i++) - age_objects[i].clear(); - - ObjectStore::FragmentationStat st; - - __uint64_t wrote = 0; - - for (int c=1; c<=count; c++) { - if (g_clock.now() > until) break; - - //if (c == 7) start_debug = true; - - dout(1) << "#age " << c << "/" << 
count << " filling to " << high_water << endl; - __uint64_t w = age_fill(high_water, until); - //dout(1) << "age wrote " << w << endl; - wrote += w; - //store->sync(); - //store->_get_frag_stat(st); - //pfrag(st); - - - if (c == count) { - dout(1) << "#age final empty to " << final_water << endl; - age_empty(final_water); - } else { - dout(1) << "#age " << c << "/" << count << " emptying to " << low_water << endl; - age_empty(low_water); - } - //store->sync(); - //store->sync(); - - // show frag state - store->_get_frag_stat(st); - pfrag(wrote / (1024ULL*1024ULL) , // GB - st); - - // dump freelist? - if (g_clock.now() > nextfl) { - elapsed += freelist_inc; - save_freelist(elapsed); - nextfl.sec_ref() += freelist_inc; - } - } - - // dump the freelist - save_freelist(0); - exit(0); // hack - - // ok! - store->_fake_writes(false); - store->sync(); - store->sync(); - dout(1) << "age finished" << endl; -} - - -void Ager::load_freelist() -{ - dout(1) << "load_freelist" << endl; - - struct stat st; - - int r = ::stat("ebofs.freelist", &st); - assert(r == 0); - - bufferptr bp(st.st_size); - bufferlist bl; - bl.push_back(bp); - int fd = ::open("ebofs.freelist", O_RDONLY); - ::read(fd, bl.c_str(), st.st_size); - ::close(fd); - - ((Ebofs*)store)->_import_freelist(bl); - store->sync(); - store->sync(); -} - -void Ager::save_freelist(int el) -{ - dout(1) << "save_freelist " << el << endl; - char s[100]; - sprintf(s, "ebofs.freelist.%d", el); - bufferlist bl; - ((Ebofs*)store)->_export_freelist(bl); - ::unlink(s); - int fd = ::open(s, O_CREAT|O_WRONLY); - ::fchmod(fd, 0644); - ::write(fd, bl.c_str(), bl.length()); - ::close(fd); -} diff --git a/branches/marnberg/quota/osd/Ager.h b/branches/marnberg/quota/osd/Ager.h deleted file mode 100644 index 864c23fce8e14..0000000000000 --- a/branches/marnberg/quota/osd/Ager.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef __AGER_H -#define __AGER_H - -#include "include/types.h" -#include "include/Distribution.h" -#include "ObjectStore.h" 
-#include "common/Clock.h" - -#include -#include -using namespace std; - -class Ager { - ObjectStore *store; - - private: - list age_free_oids; - object_t age_cur_oid; - vector< list > age_objects; - Distribution file_size_distn; //kb - bool did_distn; - - void age_empty(float pc); - __uint64_t age_fill(float pc, utime_t until); - ssize_t age_pick_size(); - object_t age_get_oid(); - - public: - Ager(ObjectStore *s) : store(s), did_distn(false) {} - - void age(int time, - float high_water, // fill to this % - float low_water, // then empty to this % - int count, // this many times - float final_water, // and end here ( <= low_water) - int fake_size_mb=0); - - void save_freelist(int); - void load_freelist(); -}; - -#endif diff --git a/branches/marnberg/quota/osd/BDBMap.h b/branches/marnberg/quota/osd/BDBMap.h deleted file mode 100644 index 203a4ca9dce8f..0000000000000 --- a/branches/marnberg/quota/osd/BDBMap.h +++ /dev/null @@ -1,136 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __BERKELEYDB_H -#define __BERKELEYDB_H - -#include -#include - -#include -using namespace std; - - -template -class BDBMap { - private: - DB *dbp; - - public: - BDBMap() : dbp(0) {} - ~BDBMap() { - close(); - } - - bool is_open() { return dbp ? 
true:false; } - - // open/close - int open(const char *fn) { - //cout << "open " << fn << endl; - - int r; - if ((r = db_create(&dbp, NULL, 0)) != 0) { - cerr << "db_create: " << db_strerror(r) << endl; - assert(0); - } - - dbp->set_errfile(dbp, stderr); - dbp->set_errpfx(dbp, "bdbmap"); - - r = dbp->open(dbp, NULL, fn, NULL, DB_BTREE, DB_CREATE, 0644); - if (r != 0) { - dbp->err(dbp, r, "%s", fn); - } - assert(r == 0); - return 0; - } - void close() { - if (dbp) { - dbp->close(dbp,0); - dbp = 0; - } - } - void remove(const char *fn) { - if (!dbp) open(fn); - if (dbp) { - dbp->remove(dbp, fn, 0, 0); - dbp = 0; - } else { - ::unlink(fn); - } - } - - // accessors - int put(K key, - D data) { - DBT k; - memset(&k, 0, sizeof(k)); - k.data = &key; - k.size = sizeof(K); - DBT d; - memset(&d, 0, sizeof(d)); - d.data = &data; - d.size = sizeof(data); - return dbp->put(dbp, NULL, &k, &d, 0); - } - - int get(K key, - D& data) { - DBT k; - memset(&k, 0, sizeof(k)); - k.data = &key; - k.size = sizeof(key); - DBT d; - memset(&d, 0, sizeof(d)); - d.data = &data; - d.size = sizeof(data); - int r = dbp->get(dbp, NULL, &k, &d, 0); - return r; - } - - int del(K key) { - DBT k; - memset(&k, 0, sizeof(k)); - k.data = &key; - k.size = sizeof(key); - return dbp->del(dbp, NULL, &k, 0); - } - - int list_keys(list& ls) { - DBC *cursor = 0; - int r = dbp->cursor(dbp, NULL, &cursor, 0); - assert(r == 0); - - DBT k,d; - memset(&k, 0, sizeof(k)); - memset(&d, 0, sizeof(d)); - - while ((r = cursor->c_get(cursor, &k, &d, DB_NEXT)) == 0) { - K key; - assert(k.size == sizeof(key)); - memcpy(&key, k.data, k.size); - ls.push_back(key); - } - if (r != DB_NOTFOUND) { - dbp->err(dbp, r, "DBcursor->get"); - assert(r == DB_NOTFOUND); - } - - cursor->c_close(cursor); - return 0; - } - -}; - -#endif diff --git a/branches/marnberg/quota/osd/Fake.h b/branches/marnberg/quota/osd/Fake.h deleted file mode 100644 index 01fa4afcf3cb8..0000000000000 --- a/branches/marnberg/quota/osd/Fake.h +++ /dev/null @@ -1,249 
+0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __FAKE_H -#define __FAKE_H - -#include "include/types.h" - -#include -#include -#include -using namespace std; -using namespace __gnu_cxx; - -class FakeStoreCollections { - private: - Mutex faker_lock; - ObjectStore *store; - hash_map > fakecollections; - - public: - FakeStoreCollections(ObjectStore *s) : store(s) {} - - // faked collections - int list_collections(list& ls) { - faker_lock.Lock(); - int r = 0; - for (hash_map< coll_t, set >::iterator p = fakecollections.begin(); - p != fakecollections.end(); - p++) { - r++; - ls.push_back(p->first); - } - faker_lock.Unlock(); - return r; - } - - int create_collection(coll_t c, - Context *onsafe=0) { - faker_lock.Lock(); - fakecollections[c].size(); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return 0; - } - - int destroy_collection(coll_t c, - Context *onsafe=0) { - int r = 0; - faker_lock.Lock(); - if (fakecollections.count(c)) { - fakecollections.erase(c); - //fakecattr.erase(c); - if (onsafe) store->sync(onsafe); - } else - r = -1; - faker_lock.Unlock(); - return r; - } - - int collection_stat(coll_t c, struct stat *st) { - return collection_exists(c) ? 
0:-1; - } - - bool collection_exists(coll_t c) { - faker_lock.Lock(); - int r = fakecollections.count(c); - faker_lock.Unlock(); - return r; - } - - int collection_add(coll_t c, object_t o, - Context *onsafe=0) { - faker_lock.Lock(); - fakecollections[c].insert(o); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return 0; - } - - int collection_remove(coll_t c, object_t o, - Context *onsafe=0) { - faker_lock.Lock(); - fakecollections[c].erase(o); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return 0; - } - - int collection_list(coll_t c, list& o) { - faker_lock.Lock(); - int r = 0; - for (set::iterator p = fakecollections[c].begin(); - p != fakecollections[c].end(); - p++) { - o.push_back(*p); - r++; - } - faker_lock.Unlock(); - return r; - } - -}; - -class FakeStoreAttrs { - private: - - class FakeAttrSet { - public: - map attrs; - - int getattr(const char *name, void *value, size_t size) { - string n = name; - if (attrs.count(n)) { - size_t l = MIN( attrs[n].length(), size ); - bufferlist bl; - bl.append(attrs[n]); - bl.copy(0, l, (char*)value); - return l; - } - return -1; - } - int getattrs(map& aset) { - aset = attrs; - return 0; - } - int setattrs(map& aset) { - attrs = aset; - return 0; - } - - int setattr(const char *name, const void *value, size_t size) { - string n = name; - bufferptr bp = buffer::copy((char*)value, size); - attrs[n] = bp; - return 0; - } - - int listattr(char *attrs, size_t size) { - assert(0); - return 0; - } - - int rmattr(const char *name) { - string n = name; - attrs.erase(n); - return 0; - } - - bool empty() { return attrs.empty(); } - }; - - Mutex faker_lock; - ObjectStore *store; - hash_map fakeoattrs; - hash_map fakecattrs; - - public: - FakeStoreAttrs(ObjectStore *s) : store(s) {} - - int setattr(object_t oid, const char *name, - const void *value, size_t size, - Context *onsafe=0) { - faker_lock.Lock(); - int r = fakeoattrs[oid].setattr(name, value, size); - if (onsafe) store->sync(onsafe); - 
faker_lock.Unlock(); - return r; - } - int setattrs(object_t oid, map& aset) { - faker_lock.Lock(); - int r = fakeoattrs[oid].setattrs(aset); - faker_lock.Unlock(); - return r; - } - int getattr(object_t oid, const char *name, - void *value, size_t size) { - faker_lock.Lock(); - int r = fakeoattrs[oid].getattr(name, value, size); - faker_lock.Unlock(); - return r; - } - int getattrs(object_t oid, map& aset) { - faker_lock.Lock(); - int r = fakeoattrs[oid].getattrs(aset); - faker_lock.Unlock(); - return r; - } - int rmattr(object_t oid, const char *name, - Context *onsafe=0) { - faker_lock.Lock(); - int r = fakeoattrs[oid].rmattr(name); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return r; - } - - int listattr(object_t oid, char *attrs, size_t size) { - faker_lock.Lock(); - int r = fakeoattrs[oid].listattr(attrs,size); - faker_lock.Unlock(); - return r; - } - - int collection_setattr(coll_t c, const char *name, - void *value, size_t size, - Context *onsafe=0) { - faker_lock.Lock(); - int r = fakecattrs[c].setattr(name, value, size); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return r; - } - int collection_rmattr(coll_t c, const char *name, - Context *onsafe=0) { - faker_lock.Lock(); - int r = fakecattrs[c].rmattr(name); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return r; - } - int collection_getattr(coll_t c, const char *name, - void *value, size_t size) { - faker_lock.Lock(); - int r = fakecattrs[c].getattr(name, value, size); - faker_lock.Unlock(); - return r; - } - int collection_listattr(coll_t c, char *attrs, size_t size) { - faker_lock.Lock(); - int r = fakecattrs[c].listattr(attrs,size); - faker_lock.Unlock(); - return r; - } - -}; - -#endif diff --git a/branches/marnberg/quota/osd/FakeStore.cc b/branches/marnberg/quota/osd/FakeStore.cc deleted file mode 100644 index 1360711f3b417..0000000000000 --- a/branches/marnberg/quota/osd/FakeStore.cc +++ /dev/null @@ -1,643 +0,0 @@ -// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "FakeStore.h" -#include "include/types.h" - -#include "common/Timer.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -//#include - -#ifdef DARWIN -#include -#include -#endif // DARWIN - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug) cout << g_clock.now() << " osd" << whoami << ".fakestore " -#define derr(l) if (l<=g_conf.debug) cerr << g_clock.now() << " osd" << whoami << ".fakestore " - -#include "include/buffer.h" - -#include -#include -using namespace __gnu_cxx; - -// crap-a-crap hash -#define HASH_DIRS 0x80 -#define HASH_MASK 0x7f -// end crap hash - - - - -int FakeStore::statfs(struct statfs *buf) -{ - return ::statfs(basedir.c_str(), buf); -} - - -/* - * sorry, these are sentitive to the object_t and coll_t typing. 
- */ -void FakeStore::get_oname(object_t oid, char *s) -{ - static hash H; - assert(sizeof(oid) == 16); -#ifdef __LP64__ - sprintf(s, "%s/objects/%02lx/%016lx.%016lx", basedir.c_str(), H(oid) & HASH_MASK, - *((__uint64_t*)&oid), - *(((__uint64_t*)&oid) + 1)); -#else - sprintf(s, "%s/objects/%02x/%016llx.%016llx", basedir.c_str(), H(oid) & HASH_MASK, - *((__uint64_t*)&oid), - *(((__uint64_t*)&oid) + 1)); -#endif -} - -void FakeStore::get_cdir(coll_t cid, char *s) -{ - assert(sizeof(cid) == 8); -#ifdef __LP64__ - sprintf(s, "%s/collections/%016lx", basedir.c_str(), - cid); -#else - sprintf(s, "%s/collections/%016llx", basedir.c_str(), - cid); -#endif -} - -void FakeStore::get_coname(coll_t cid, object_t oid, char *s) -{ - assert(sizeof(oid) == 16); -#ifdef __LP64__ - sprintf(s, "%s/collections/%016lx/%016lx.%016lx", basedir.c_str(), cid, - *((__uint64_t*)&oid), - *(((__uint64_t*)&oid) + 1)); -#else - sprintf(s, "%s/collections/%016llx/%016llx.%016llx", basedir.c_str(), cid, - *((__uint64_t*)&oid), - *(((__uint64_t*)&oid) + 1)); -#endif -} - - - - -int FakeStore::mkfs() -{ - char cmd[200]; - if (g_conf.fakestore_dev) { - dout(0) << "mounting" << endl; - sprintf(cmd,"mount %s", g_conf.fakestore_dev); - system(cmd); - } - - dout(1) << "mkfs in " << basedir << endl; - - // wipe - sprintf(cmd, "test -d %s && rm -r %s ; mkdir -p %s/collections && mkdir -p %s/objects", - basedir.c_str(), basedir.c_str(), basedir.c_str(), basedir.c_str()); - - dout(5) << "wipe: " << cmd << endl; - system(cmd); - - // hashed bits too - for (int i=0; i 0) bl.push_back( bptr ); // put it in the target bufferlist - } - ::flock(fd, LOCK_UN); - ::close(fd); - return got; -} - - -int FakeStore::write(object_t oid, - off_t offset, size_t len, - bufferlist& bl, - Context *onsafe) -{ - dout(20) << "write " << oid << " len " << len << " off " << offset << endl; - - char fn[200]; - get_oname(oid,fn); - - ::mknod(fn, 0644, 0); // in case it doesn't exist yet. 
- - int flags = O_WRONLY;//|O_CREAT; - int fd = ::open(fn, flags); - if (fd < 0) { - derr(0) << "write couldn't open " << fn << " flags " << flags << " errno " << errno << " " << strerror(errno) << endl; - return fd; - } - ::fchmod(fd, 0664); - ::flock(fd, LOCK_EX); // lock for safety - - // seek - off_t actual = ::lseek(fd, offset, SEEK_SET); - int did = 0; - assert(actual == offset); - - // write buffers - for (list::const_iterator it = bl.buffers().begin(); - it != bl.buffers().end(); - it++) { - int r = ::write(fd, (char*)(*it).c_str(), (*it).length()); - if (r > 0) - did += r; - else { - derr(0) << "couldn't write to " << fn << " len " << len << " off " << offset << " errno " << errno << " " << strerror(errno) << endl; - } - } - - if (did < 0) { - derr(0) << "couldn't write to " << fn << " len " << len << " off " << offset << " errno " << errno << " " << strerror(errno) << endl; - } - - ::flock(fd, LOCK_UN); - - // schedule sync - if (onsafe) sync(onsafe); - - ::close(fd); - - return did; -} - - -class C_FakeSync : public Context { - Context *c; - int *n; - Mutex *lock; - Cond *cond; - -public: - C_FakeSync(Context *c_, int *n_, Mutex *lo, Cond *co) : - c(c_), n(n_), - lock(lo), cond(co) { - lock->Lock(); - ++*n; - lock->Unlock(); - } - void finish(int r) { - c->finish(r); - - lock->Lock(); - --(*n); - if (*n == 0) cond->Signal(); - lock->Unlock(); - } -}; - -void FakeStore::sync() -{ - synclock.Lock(); - while (unsync > 0) { - dout(0) << "sync waiting for " << unsync << " items to (fake) sync" << endl; - synccond.Wait(synclock); - } - synclock.Unlock(); -} - -void FakeStore::sync(Context *onsafe) -{ - if (g_conf.fakestore_fake_sync) { - g_timer.add_event_after((float)g_conf.fakestore_fake_sync, - new C_FakeSync(onsafe, &unsync, &synclock, &synccond)); - - } else { - assert(0); // der..no implemented anymore - } -} - - -// ------------------------------- -// attributes - -// objects - -int FakeStore::setattr(object_t oid, const char *name, - const void *value, 
size_t size, - Context *onsafe) -{ - if (fake_attrs) return attrs.setattr(oid, name, value, size, onsafe); - - char fn[100]; - get_oname(oid, fn); - int r = ::setxattr(fn, name, value, size, 0); - return r; -} - -int FakeStore::setattrs(object_t oid, map& aset) -{ - if (fake_attrs) return attrs.setattrs(oid, aset); - - char fn[100]; - get_oname(oid, fn); - int r = 0; - for (map::iterator p = aset.begin(); - p != aset.end(); - ++p) { - r = ::setxattr(fn, p->first.c_str(), p->second.c_str(), p->second.length(), 0); - if (r < 0) break; - } - return r; -} - -int FakeStore::getattr(object_t oid, const char *name, - void *value, size_t size) -{ - if (fake_attrs) return attrs.getattr(oid, name, value, size); - char fn[100]; - get_oname(oid, fn); - int r = ::getxattr(fn, name, value, size); - return r; -} - -int FakeStore::getattrs(object_t oid, map& aset) -{ - if (fake_attrs) return attrs.getattrs(oid, aset); - - char fn[100]; - get_oname(oid, fn); - - char val[1000]; - char names[1000]; - int num = ::listxattr(fn, names, 1000); - - char *name = names; - for (int i=0; i& ls) -{ - if (fake_collections) return collections.list_collections(ls); - - char fn[200]; - sprintf(fn, "%s/collections", basedir.c_str()); - - DIR *dir = ::opendir(fn); - assert(dir); - - struct dirent *de; - while ((de = ::readdir(dir)) != 0) { - // parse - coll_t c = strtoll(de->d_name, 0, 16); - dout(0) << " got " << c << " errno " << errno << " on " << de->d_name << endl; - if (errno) continue; - ls.push_back(c); - } - - ::closedir(dir); - return 0; -} - -int FakeStore::create_collection(coll_t c, - Context *onsafe) -{ - if (fake_collections) return collections.create_collection(c, onsafe); - - char fn[200]; - get_cdir(c, fn); - - int r = ::mkdir(fn, 0755); - - if (onsafe) sync(onsafe); - return r; -} - -int FakeStore::destroy_collection(coll_t c, - Context *onsafe) -{ - if (fake_collections) return collections.destroy_collection(c, onsafe); - - char fn[200]; - get_cdir(c, fn); - char cmd[200]; - 
sprintf(cmd, "test -d %s && rm -r %s", fn, fn); - system(cmd); - - if (onsafe) sync(onsafe); - return 0; -} - -int FakeStore::collection_stat(coll_t c, struct stat *st) -{ - if (fake_collections) return collections.collection_stat(c, st); - - char fn[200]; - get_cdir(c, fn); - return ::lstat(fn, st); -} - -bool FakeStore::collection_exists(coll_t c) -{ - if (fake_collections) return collections.collection_exists(c); - - struct stat st; - return collection_stat(c, &st) == 0; -} - - -int FakeStore::collection_add(coll_t c, object_t o, - Context *onsafe) -{ - if (fake_collections) return collections.collection_add(c, o, onsafe); - - char cof[200]; - get_coname(c, o, cof); - char of[200]; - get_oname(o, of); - - int r = ::link(of, cof); - if (onsafe) sync(onsafe); - return r; -} - -int FakeStore::collection_remove(coll_t c, object_t o, - Context *onsafe) -{ - if (fake_collections) return collections.collection_remove(c, o, onsafe); - - char cof[200]; - get_coname(c, o, cof); - - int r = ::unlink(cof); - if (onsafe) sync(onsafe); - return r; -} - -int FakeStore::collection_list(coll_t c, list& ls) -{ - if (fake_collections) return collections.collection_list(c, ls); - - char fn[200]; - get_cdir(c, fn); - - DIR *dir = ::opendir(fn); - assert(dir); - - struct dirent *de; - while ((de = ::readdir(dir)) != 0) { - // parse - object_t o; - assert(sizeof(o) == 16); - *(((__uint64_t*)&o) + 0) = strtoll(de->d_name, 0, 16); - assert(de->d_name[16] == '.'); - *(((__uint64_t*)&o) + 1) = strtoll(de->d_name+17, 0, 16); - dout(0) << " got " << o << " errno " << errno << " on " << de->d_name << endl; - if (errno) continue; - ls.push_back(o); - } - - ::closedir(dir); - return 0; -} - -// eof. 
diff --git a/branches/marnberg/quota/osd/FakeStore.h b/branches/marnberg/quota/osd/FakeStore.h deleted file mode 100644 index 4ad2cb4a054e8..0000000000000 --- a/branches/marnberg/quota/osd/FakeStore.h +++ /dev/null @@ -1,110 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __FAKESTORE_H -#define __FAKESTORE_H - -#include "ObjectStore.h" -#include "common/ThreadPool.h" -#include "common/Mutex.h" - -#include "Fake.h" -//#include "FakeStoreBDBCollections.h" - - -#include -using namespace std; - -#include -using namespace __gnu_cxx; - - -// fake attributes in memory, if we need to. - -class FakeStore : public ObjectStore { - string basedir; - int whoami; - - Mutex synclock; - Cond synccond; - int unsync; - - // fake attrs? - FakeStoreAttrs attrs; - bool fake_attrs; - - // fake collections? 
- FakeStoreCollections collections; - bool fake_collections; - - // helper fns - void get_oname(object_t oid, char *s); - void get_cdir(coll_t cid, char *s); - void get_coname(coll_t cid, object_t oid, char *s); - - public: - FakeStore(char *base, int w) : - basedir(base), - whoami(w), - unsync(0), - attrs(this), fake_attrs(false), - collections(this), fake_collections(false) { } - - int mount(); - int umount(); - int mkfs(); - - int statfs(struct statfs *buf); - - // ------------------ - // objects - int pick_object_revision_lt(object_t& oid) { - return 0; - } - bool exists(object_t oid); - int stat(object_t oid, struct stat *st); - int remove(object_t oid, Context *onsafe); - int truncate(object_t oid, off_t size, Context *onsafe); - int read(object_t oid, off_t offset, size_t len, bufferlist& bl); - int write(object_t oid, off_t offset, size_t len, bufferlist& bl, Context *onsafe); - - void sync(); - void sync(Context *onsafe); - - // attrs - int setattr(object_t oid, const char *name, const void *value, size_t size, Context *onsafe=0); - int setattrs(object_t oid, map& aset); - int getattr(object_t oid, const char *name, void *value, size_t size); - int getattrs(object_t oid, map& aset); - int rmattr(object_t oid, const char *name, Context *onsafe=0); - //int listattr(object_t oid, char *attrs, size_t size); - int collection_setattr(coll_t c, const char *name, void *value, size_t size, Context *onsafe=0); - int collection_rmattr(coll_t c, const char *name, Context *onsafe=0); - int collection_getattr(coll_t c, const char *name, void *value, size_t size); - //int collection_listattr(coll_t c, char *attrs, size_t size); - - - // collections - int list_collections(list& ls); - int create_collection(coll_t c, Context *onsafe=0); - int destroy_collection(coll_t c, Context *onsafe=0); - int collection_stat(coll_t c, struct stat *st); - bool collection_exists(coll_t c); - int collection_add(coll_t c, object_t o, Context *onsafe=0); - int collection_remove(coll_t c, 
object_t o, Context *onsafe=0); - int collection_list(coll_t c, list& o); - -}; - -#endif diff --git a/branches/marnberg/quota/osd/FakeStoreBDBCollections.h b/branches/marnberg/quota/osd/FakeStoreBDBCollections.h deleted file mode 100644 index 97316d2642674..0000000000000 --- a/branches/marnberg/quota/osd/FakeStoreBDBCollections.h +++ /dev/null @@ -1,168 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __FAKESTOREBDBCOLLECTIONS_H -#define __FAKESTOREBDBCOLLECTIONS_H - -#include "BDBMap.h" -#include "ObjectStore.h" -#include "common/Mutex.h" - -#define BDBHASH_DIRS 128LL -#define BDBHASH_FUNC(x) (((x) ^ ((x)>>30) ^ ((x)>>18) ^ ((x)>>45) ^ 0xdead1234) * 884811 % BDBHASH_DIRS) - -class FakeStoreBDBCollections { - private: - int whoami; - string basedir; - - Mutex bdblock; - - // collection dbs - BDBMap collections; - map*> collection_map; - - // dirs - void get_dir(string& dir) { - char s[30]; - sprintf(s, "%d", whoami); - dir = basedir + "/" + s; - } - void get_collfn(coll_t c, string &fn) { - char s[100]; - sprintf(s, "%d/%02llx/%016llx.co", whoami, BDBHASH_FUNC(c), c); - fn = basedir + "/" + s; - } - - void open_collections() { - string cfn; - get_dir(cfn); - cfn += "/collections"; - collections.open(cfn.c_str()); - list ls; - collections.list_keys(ls); - } - void close_collections() { - if (collections.is_open()) - collections.close(); - - for (map*>::iterator it = collection_map.begin(); - it != collection_map.end(); - it++) { - it->second->close(); - } - collection_map.clear(); - } - - int open_collection(coll_t c) { - if (collection_map.count(c)) - return 0; // already open. 
- - string fn; - get_collfn(c,fn); - collection_map[c] = new BDBMap; - int r = collection_map[c]->open(fn.c_str()); - if (r != 0) - collection_map.erase(c); // failed - return r; - } - - public: - FakeStoreBDBCollections(int w, string& bd) : whoami(w), basedir(bd) {} - ~FakeStoreBDBCollections() { - close_collections(); - } - - int list_collections(list& ls) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - ls.clear(); - collections.list_keys(ls); - bdblock.Unlock(); - return 0; - } - int create_collection(coll_t c) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - collections.put(c, 1); - open_collection(c); - bdblock.Unlock(); - return 0; - } - int destroy_collection(coll_t c) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - collections.del(c); - - open_collection(c); - collection_map[c]->close(); - - string fn; - get_collfn(c,fn); - collection_map[c]->remove(fn.c_str()); - delete collection_map[c]; - collection_map.erase(c); - bdblock.Unlock(); - return 0; - } - int collection_stat(coll_t c, struct stat *st) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - string fn; - get_collfn(c,fn); - int r = ::stat(fn.c_str(), st); - bdblock.Unlock(); - return r; - } - bool collection_exists(coll_t c) { - bdblock.Lock(); - struct stat st; - int r = collection_stat(c, &st) == 0; - bdblock.Unlock(); - return r; - } - int collection_add(coll_t c, object_t o) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - open_collection(c); - collection_map[c]->put(o,1); - bdblock.Unlock(); - return 0; - } - int collection_remove(coll_t c, object_t o) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - open_collection(c); - collection_map[c]->del(o); - bdblock.Unlock(); - return 0; - } - int collection_list(coll_t c, list& o) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - open_collection(c); - 
collection_map[c]->list_keys(o); - bdblock.Unlock(); - return 0; - } -}; - -#endif diff --git a/branches/marnberg/quota/osd/OBFSStore.cc b/branches/marnberg/quota/osd/OBFSStore.cc deleted file mode 100644 index e82c6f804721d..0000000000000 --- a/branches/marnberg/quota/osd/OBFSStore.cc +++ /dev/null @@ -1,244 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "OBFSStore.h" - -extern "C" { -#include "../../uofs/uofs.h" -} - -#include "common/Timer.h" - -#include "include/types.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug) cout << "osd" << whoami << ".obfs " - -OBFSStore::OBFSStore(int whoami, char *param, char *dev) -{ - this->whoami = whoami; - this->mounted = -1; - this->bdev_id = -1; - this->param[0] = 0; - this->dev[0] = 0; - if (dev) - strcpy(this->dev, dev); - if (param) - strcpy(this->param, param); -} - -int OBFSStore::mount(void) -{ - dout(0) << "OBFS init!" << endl; - if ((this->bdev_id = device_open(this->dev, O_RDWR)) < 0) { - dout(0) << "device open FAILED on " << this->dev << ", errno " << errno << endl; - return -1; - } - - this->mkfs(); - this->mounted = uofs_mount(this->bdev_id, - g_conf.uofs_cache_size, - g_conf.uofs_min_flush_pages, - this->whoami); - switch (this->mounted) { - case -1: - this->mkfs(); - //retry to mount - dout(0) << "remount the OBFS" << endl; - this->mounted = uofs_mount(this->bdev_id, - g_conf.uofs_cache_size, - g_conf.uofs_min_flush_pages, - this->whoami); - assert(this->mounted >= 0); - break; - case -2: - //fsck - dout(0) << "Need fsck! 
Simply formatted for now!" << endl; - this->mkfs(); - this->mounted = uofs_mount(this->bdev_id, - g_conf.uofs_cache_size, - g_conf.uofs_min_flush_pages, - this->whoami); - assert(this->mounted >= 0); - break; - case 0: - //success - break; - default: - break; - } - - if (this->mounted >= 0) - dout(0) << "successfully mounted!" << endl; - else - dout(0) << "error in mounting obfsstore!" << endl; - - return 0; -} - -int OBFSStore::mkfs(void) -{ - /*int donode_size_byte = 1024, - bd_ratio = 10, - reg_size_mb = 256, - sb_size_kb = 4, - lb_size_kb = 1024, - nr_hash_table_buckets = 1023, - delay_allocation = 1, - flush_interval = 5; - FILE *param; - */ - - - if (this->mounted >= 0) - return 0; - - dout(0) << "OBFS.mkfs!" << endl; - /* - if (strlen(this->param) > 0) { - param = fopen(this->param, "r"); - if (param) { - //fscanf(param, "Block Device: %s\n", this->dev); - fscanf(param, "Donode Size: %d\n", &donode_size_byte); - fscanf(param, "Block vs Donode Ratio: %d\n", &bd_ratio); - fscanf(param, "Region Size: %d MB\n", ®_size_mb); - fscanf(param, "Small Block Size: %d KB\n", &sb_size_kb); - fscanf(param, "Large Block Size: %d KB\n", &lb_size_kb); - fscanf(param, "Hash Table Buckets: %d\n", &nr_hash_table_buckets); - fscanf(param, "Delayed Allocation: %d\n", &delay_allocation); - } else { - dout(0) << "read open FAILED on "<< this->param <<", errno " << errno << endl; - dout(0) << "use default parameters" << endl; - } - } else - dout(0) << "use default parameters" << endl; - */ - - if (this->bdev_id <= 0) - if ((this->bdev_id = device_open(this->dev, O_RDWR)) < 0) { - dout(0) << "device open FAILED on "<< this->dev <<", errno " << errno << endl; - return -1; - } - - dout(0) << "start formating!" 
<< endl; - - uofs_format(this->bdev_id, - g_conf.uofs_onode_size, - g_conf.uofs_block_meta_ratio, - g_conf.uofs_segment_size, - g_conf.uofs_small_block_size, - g_conf.uofs_large_block_size, - g_conf.uofs_nr_hash_buckets, - g_conf.uofs_delay_allocation, - 0,//g_conf.uofs_dev_force_size, - g_conf.uofs_flush_interval, - 0); - - dout(0) << "formatting complete!" << endl; - return 0; -} - -int OBFSStore::umount(void) -{ - uofs_shutdown(); - close(this->bdev_id); - - return 0; -} - -int OBFSStore::statfs(struct statfs *sfs) -{ - return 0; -} - -bool OBFSStore::exists(object_t oid) -{ - //dout(0) << "calling function exists!" << endl; - return uofs_exist(oid); -} - -int OBFSStore::stat(object_t oid, struct stat *st) -{ - dout(0) << "calling function stat!" << endl; - if (uofs_exist(oid)) return 0; - return -1; -} - -int OBFSStore::remove(object_t oid) -{ - dout(0) << "calling remove function!" << endl; - return uofs_del(oid); -} - -int OBFSStore::truncate(object_t oid, off_t size) -{ - dout(0) << "calling truncate function!" << endl; - //return uofs_truncate(oid, size); - return -1; -} - -int OBFSStore::read(object_t oid, size_t len, - off_t offset, bufferlist &bl) -{ - //dout(0) << "calling read function!" << endl; - //dout(0) << oid << " 0 " << len << " " << offset << " 100" << endl; - - // FIXME: page-align this and we can avoid a memcpy... - bl.push_back(new buffer(len)); - return uofs_read(oid, bl.c_str(), offset, len); -} - -int OBFSStore::write(object_t oid, size_t len, - off_t offset, bufferlist& bl, bool fsync) -{ - int ret = 0; - - //dout(0) << "calling write function!" 
<< endl; - //if (whoami == 0) - // dout(0) << oid << " 0 " << len << " " << offset << " 101" << endl; - - for (list::iterator p = bl.buffers().begin(); - p != bl.buffers().end(); - p++) { - ret += uofs_write(oid, (*p).c_str(), offset, len, 0); - } - - if (fsync) - ret += uofs_sync(oid); - - return ret; -} - - -int OBFSStore::write(object_t oid, size_t len, - off_t offset, bufferlist& bl, Context *onflush) -{ - int r = write(oid, len, offset, bl, false); - g_timer.add_event_after((float)g_conf.uofs_fake_sync, onflush); - return r; -} diff --git a/branches/marnberg/quota/osd/OBFSStore.h b/branches/marnberg/quota/osd/OBFSStore.h deleted file mode 100644 index cb4a6afc815d7..0000000000000 --- a/branches/marnberg/quota/osd/OBFSStore.h +++ /dev/null @@ -1,56 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef _OBFSSTORE_H_ -#define _OBFSSTORE_H_ - -#include "ObjectStore.h" -#include "Fake.h" - -class OBFSStore : public ObjectStore, - public FakeStoreAttrs, - public FakeStoreCollections { - int whoami; - int bdev_id; - int mounted; - char dev[128]; - char param[128]; - - public: - OBFSStore(int whoami, char *param, char *dev); - - int mount(void); - int umount(void); - int mkfs(void); - - int statfs(struct statfs *); - - bool exists(object_t oid); - int stat(object_t oid, struct stat *st); - - int remove(object_t oid); - int truncate(object_t oid, off_t size); - - int read(object_t oid, size_t len, - off_t offset, bufferlist& bl); - int write(object_t oid, size_t len, - off_t offset, bufferlist& bl, - bool fsync); - int write(object_t oid, size_t len, - off_t offset, bufferlist& bl, - Context *onflush); - -}; - -#endif diff --git a/branches/marnberg/quota/osd/OSD.cc b/branches/marnberg/quota/osd/OSD.cc deleted file mode 100644 index 058692fab3fc0..0000000000000 --- a/branches/marnberg/quota/osd/OSD.cc +++ /dev/null @@ -1,3494 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "include/types.h" - -#include "OSD.h" -#include "OSDMap.h" - -#ifdef USE_OBFS -# include "OBFSStore.h" -#else -# include "FakeStore.h" -#endif - -#include "ebofs/Ebofs.h" - -#ifdef USE_OSBDB -#include "osbdb/OSBDB.h" -#endif // USE_OSBDB - -#include "Ager.h" - - -#include "msg/Messenger.h" -#include "msg/Message.h" - -#include "messages/MGenericMessage.h" -#include "messages/MPing.h" -#include "messages/MPingAck.h" -#include "messages/MOSDPing.h" -#include "messages/MOSDFailure.h" -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" -#include "messages/MOSDBoot.h" -#include "messages/MOSDIn.h" -#include "messages/MOSDOut.h" - -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" -#include "messages/MOSDPGNotify.h" -#include "messages/MOSDPGQuery.h" -#include "messages/MOSDPGLog.h" -#include "messages/MOSDPGRemove.h" - -#include "common/Logger.h" -#include "common/LogType.h" -#include "common/Timer.h" -#include "common/ThreadPool.h" - -#include -#include -#include -#include - - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cout << g_clock.now() << " osd" << whoami << " " << (osdmap ? osdmap->get_epoch():0) << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cerr << g_clock.now() << " osd" << whoami << " " << (osdmap ? 
osdmap->get_epoch():0) << " " - -char *osd_base_path = "./osddata"; -char *ebofs_base_path = "./dev"; - - -object_t SUPERBLOCK_OBJECT(0,0); - - -// force remount hack for performance testing FakeStore -class C_Remount : public Context { - OSD *osd; -public: - C_Remount(OSD *o) : osd(o) {} - void finish(int) { - osd->force_remount(); - } -}; - -void OSD::force_remount() -{ - dout(0) << "forcing remount" << endl; - osd_lock.Lock(); - { - store->umount(); - store->mount(); - } - osd_lock.Unlock(); - dout(0) << "finished remount" << endl; -} -// - - -// cons/des - -LogType osd_logtype; - -OSD::OSD(int id, Messenger *m, MonMap *mm, char *dev) : timer(osd_lock) -{ - whoami = id; - messenger = m; - monmap = mm; - - osdmap = 0; - boot_epoch = 0; - - last_tid = 0; - num_pulling = 0; - - state = STATE_BOOTING; - - hb_stat_ops = 0; - hb_stat_qlen = 0; - - pending_ops = 0; - waiting_for_no_ops = false; - - if (g_conf.osd_remount_at) - timer.add_event_after(g_conf.osd_remount_at, new C_Remount(this)); - - - // init object store - // try in this order: - // dev/osd$num - // dev/osd.$hostname - // dev/osd.all - - if (dev) { - strcpy(dev_path,dev); - } else { - char hostname[100]; - hostname[0] = 0; - gethostname(hostname,100); - - sprintf(dev_path, "%s/osd%d", ebofs_base_path, whoami); - - struct stat sta; - if (::lstat(dev_path, &sta) != 0) - sprintf(dev_path, "%s/osd.%s", ebofs_base_path, hostname); - - if (::lstat(dev_path, &sta) != 0) - sprintf(dev_path, "%s/osd.all", ebofs_base_path); - } - - if (g_conf.ebofs) { - store = new Ebofs(dev_path); - //store->_fake_writes(true); - } -#ifdef USE_OBFS - else if (g_conf.uofs) { - store = new OBFSStore(whoami, NULL, dev_path); - } -#endif -#ifdef USE_OSBDB - else if (g_conf.bdbstore) { - store = new OSBDB(dev_path); - } -#endif // USE_OSBDB - else { - sprintf(dev_path, "osddata/osd%d", whoami); - store = new FakeStore(dev_path, whoami); - } - -} - -OSD::~OSD() -{ - if (threadpool) { delete threadpool; threadpool = 0; } - if (osdmap) { 
delete osdmap; osdmap = 0; } - //if (monitor) { delete monitor; monitor = 0; } - if (messenger) { delete messenger; messenger = 0; } - if (logger) { delete logger; logger = 0; } - if (store) { delete store; store = 0; } -} - -int OSD::init() -{ - osd_lock.Lock(); - { - // mkfs? - if (g_conf.osd_mkfs) { - dout(2) << "mkfs" << endl; - store->mkfs(); - - // make up a superblock - //superblock.fsid = ???; - superblock.whoami = whoami; - } - - // mount. - dout(2) << "mounting " << dev_path << endl; - int r = store->mount(); - assert(r>=0); - - if (g_conf.osd_mkfs) { - // age? - if (g_conf.osd_age_time != 0) { - dout(2) << "age" << endl; - Ager ager(store); - if (g_conf.osd_age_time < 0) - ager.load_freelist(); - else - ager.age(g_conf.osd_age_time, - g_conf.osd_age, - g_conf.osd_age - .05, - 50000, - g_conf.osd_age - .05); - } - } - else { - dout(2) << "boot" << endl; - - // read superblock - read_superblock(); - - // load up pgs (as they previously existed) - load_pgs(); - - dout(2) << "superblock: i am osd" << superblock.whoami << endl; - assert(whoami == superblock.whoami); - } - - - // log - char name[80]; - sprintf(name, "osd%02d", whoami); - logger = new Logger(name, (LogType*)&osd_logtype); - osd_logtype.add_set("opq"); - osd_logtype.add_inc("op"); - osd_logtype.add_inc("c_rd"); - osd_logtype.add_inc("c_rdb"); - osd_logtype.add_inc("c_wr"); - osd_logtype.add_inc("c_wrb"); - - osd_logtype.add_inc("r_push"); - osd_logtype.add_inc("r_pushb"); - osd_logtype.add_inc("r_wr"); - osd_logtype.add_inc("r_wrb"); - - osd_logtype.add_inc("rlnum"); - - osd_logtype.add_set("numpg"); - osd_logtype.add_set("pingset"); - - osd_logtype.add_set("buf"); - - osd_logtype.add_inc("map"); - osd_logtype.add_inc("mapi"); - osd_logtype.add_inc("mapidup"); - osd_logtype.add_inc("mapf"); - osd_logtype.add_inc("mapfdup"); - - // request thread pool - { - char name[80]; - sprintf(name,"osd%d.threadpool", whoami); - threadpool = new ThreadPool(name, g_conf.osd_maxthreads, - static_dequeueop, - 
this); - } - - // i'm ready! - messenger->set_dispatcher(this); - - // announce to monitor i exist and have booted. - int mon = monmap->pick_mon(); - messenger->send_message(new MOSDBoot(superblock), monmap->get_inst(mon)); - - // start the heart - timer.add_event_after(g_conf.osd_heartbeat_interval, new C_Heartbeat(this)); - } - osd_lock.Unlock(); - - //dout(0) << "osd_rep " << g_conf.osd_rep << endl; - - return 0; -} - -int OSD::shutdown() -{ - dout(1) << "shutdown" << endl; - - state = STATE_STOPPING; - - // cancel timers - timer.cancel_all(); - timer.join(); - - // finish ops - wait_for_no_ops(); - - // stop threads - delete threadpool; - threadpool = 0; - - // close pgs - for (hash_map::iterator p = pg_map.begin(); - p != pg_map.end(); - p++) { - delete p->second; - } - pg_map.clear(); - - // shut everything else down - //monitor->shutdown(); - messenger->shutdown(); - - osd_lock.Unlock(); - int r = store->umount(); - osd_lock.Lock(); - return r; -} - - - -void OSD::write_superblock(ObjectStore::Transaction& t) -{ - dout(10) << "write_superblock " << superblock << endl; - - bufferlist bl; - bl.append((char*)&superblock, sizeof(superblock)); - t.write(SUPERBLOCK_OBJECT, 0, sizeof(superblock), bl); -} - -int OSD::read_superblock() -{ - bufferlist bl; - int r = store->read(SUPERBLOCK_OBJECT, 0, sizeof(superblock), bl); - if (bl.length() != sizeof(superblock)) { - dout(10) << "read_superblock failed, r = " << r << ", i got " << bl.length() << " bytes, not " << sizeof(superblock) << endl; - return -1; - } - - bl.copy(0, sizeof(superblock), (char*)&superblock); - - dout(10) << "read_superblock " << superblock << endl; - - // load up "current" osdmap - assert(!osdmap); - osdmap = new OSDMap; - bl.clear(); - get_map_bl(superblock.current_epoch, bl); - osdmap->decode(bl); - - assert(whoami == superblock.whoami); // fixme! 
- return 0; -} - - -// object locks - -PG *OSD::lock_pg(pg_t pgid) -{ - osd_lock.Lock(); - PG *pg = _lock_pg(pgid); - osd_lock.Unlock(); - return pg; -} - -PG *OSD::_lock_pg(pg_t pgid) -{ - assert(pg_map.count(pgid)); - - if (pg_lock.count(pgid)) { - Cond c; - dout(15) << "lock_pg " << pgid << " waiting as " << &c << endl; - //cerr << "lock_pg " << pgid << " waiting as " << &c << endl; - - list& ls = pg_lock_waiters[pgid]; // this is commit, right? - ls.push_back(&c); - - while (pg_lock.count(pgid) || - ls.front() != &c) - c.Wait(osd_lock); - - assert(ls.front() == &c); - ls.pop_front(); - if (ls.empty()) - pg_lock_waiters.erase(pgid); - } - - dout(15) << "lock_pg " << pgid << endl; - pg_lock.insert(pgid); - - return pg_map[pgid]; -} - -void OSD::unlock_pg(pg_t pgid) -{ - osd_lock.Lock(); - _unlock_pg(pgid); - osd_lock.Unlock(); -} - -void OSD::_unlock_pg(pg_t pgid) -{ - // unlock - assert(pg_lock.count(pgid)); - pg_lock.erase(pgid); - - if (pg_lock_waiters.count(pgid)) { - // someone is in line - Cond *c = pg_lock_waiters[pgid].front(); - assert(c); - dout(15) << "unlock_pg " << pgid << " waking up next guy " << c << endl; - c->Signal(); - } else { - // nobody waiting - dout(15) << "unlock_pg " << pgid << endl; - } -} - -void OSD::_remove_pg(pg_t pgid) -{ - dout(10) << "_remove_pg " << pgid << endl; - - // remove from store - list olist; - store->collection_list(pgid, olist); - - ObjectStore::Transaction t; - { - for (list::iterator p = olist.begin(); - p != olist.end(); - p++) - t.remove(*p); - t.remove_collection(pgid); - t.remove(pgid.to_object()); // log too - } - store->apply_transaction(t); - - // hose from memory - delete pg_map[pgid]; - pg_map.erase(pgid); -} - - -void OSD::activate_pg(pg_t pgid, epoch_t epoch) -{ - osd_lock.Lock(); - { - if (pg_map.count(pgid)) { - PG *pg = _lock_pg(pgid); - if (pg->is_crashed() && - pg->is_replay() && - pg->get_role() == 0 && - pg->info.history.same_primary_since <= epoch) { - ObjectStore::Transaction t; - 
pg->activate(t); - store->apply_transaction(t); - } - _unlock_pg(pgid); - } - } - - // finishers? - if (finished.empty()) { - osd_lock.Unlock(); - } else { - list waiting; - waiting.splice(waiting.begin(), finished); - - osd_lock.Unlock(); - - for (list::iterator it = waiting.begin(); - it != waiting.end(); - it++) { - dispatch(*it); - } - } -} - - -// ------------------------------------- - -void OSD::heartbeat() -{ - utime_t now = g_clock.now(); - utime_t since = now; - since.sec_ref() -= g_conf.osd_heartbeat_interval; - - // calc my stats - float avg_qlen = 0; - if (hb_stat_ops) avg_qlen = (float)hb_stat_qlen / (float)hb_stat_ops; - - dout(5) << "heartbeat " << now - << ": ops " << hb_stat_ops - << ", avg qlen " << avg_qlen - << endl; - - // reset until next time around - hb_stat_ops = 0; - hb_stat_qlen = 0; - - // send pings - set pingset; - for (hash_map::iterator i = pg_map.begin(); - i != pg_map.end(); - i++) { - PG *pg = i->second; - - // we want to ping the primary. - if (pg->get_role() <= 0) continue; - if (pg->acting.size() < 1) continue; - - if (pg->last_heartbeat < since) { - pg->last_heartbeat = now; - pingset.insert(pg->acting[0]); - } - } - for (set::iterator i = pingset.begin(); - i != pingset.end(); - i++) { - _share_map_outgoing( osdmap->get_inst(*i) ); - messenger->send_message(new MOSDPing(osdmap->get_epoch(), avg_qlen), - osdmap->get_inst(*i)); - } - - if (logger) logger->set("pingset", pingset.size()); - - // hack: fake reorg? 
- if (osdmap && g_conf.fake_osdmap_updates) { - int mon = monmap->pick_mon(); - if ((rand() % g_conf.fake_osdmap_updates) == 0) { - //if ((rand() % (g_conf.num_osd / g_conf.fake_osdmap_updates)) == whoami / g_conf.fake_osdmap_updates) { - messenger->send_message(new MOSDIn(osdmap->get_epoch()), - monmap->get_inst(mon)); - } - /* - if (osdmap->is_out(whoami)) { - messenger->send_message(new MOSDIn(osdmap->get_epoch()), - MSG_ADDR_MON(mon), monmap->get_inst(mon)); - } - else if ((rand() % g_conf.fake_osdmap_updates) == 0) { - //messenger->send_message(new MOSDOut(osdmap->get_epoch()), - //MSG_ADDR_MON(mon), monmap->get_inst(mon)); - } - } - */ - } - - // schedule next! randomly. - float wait = .5 + ((float)(rand() % 10)/10.0) * (float)g_conf.osd_heartbeat_interval; - timer.add_event_after(wait, new C_Heartbeat(this)); -} - - - -// -------------------------------------- -// dispatch - -bool OSD::_share_map_incoming(const entity_inst_t& inst, epoch_t epoch) -{ - bool shared = false; - - // does client have old map? - if (inst.name.is_client()) { - if (epoch < osdmap->get_epoch()) { - dout(10) << inst.name << " has old map " << epoch << " < " << osdmap->get_epoch() << endl; - send_incremental_map(epoch, inst, true); - shared = true; - } - } - - // does peer have old map? - if (inst.name.is_osd()) { - // remember - if (peer_map_epoch[inst.name] < epoch) - peer_map_epoch[inst.name] = epoch; - - // older? - if (peer_map_epoch[inst.name] < osdmap->get_epoch()) { - dout(10) << inst.name << " has old map " << epoch << " < " << osdmap->get_epoch() << endl; - send_incremental_map(epoch, inst, true); - peer_map_epoch[inst.name] = osdmap->get_epoch(); // so we don't send it again. - shared = true; - } - } - - return shared; -} - - -void OSD::_share_map_outgoing(const entity_inst_t& inst) -{ - assert(inst.name.is_osd()); - - if (inst.name.is_osd()) { - // send map? 
- if (peer_map_epoch.count(inst.name)) { - epoch_t pe = peer_map_epoch[inst.name]; - if (pe < osdmap->get_epoch()) { - send_incremental_map(pe, inst, true); - peer_map_epoch[inst.name] = osdmap->get_epoch(); - } - } else { - // no idea about peer's epoch. - // ??? send recent ??? - // do nothing. - } - } -} - - - -void OSD::dispatch(Message *m) -{ - // lock! - osd_lock.Lock(); - - switch (m->get_type()) { - - // -- don't need lock -- - case MSG_PING: - dout(10) << "ping from " << m->get_source() << endl; - delete m; - break; - - // -- don't need OSDMap -- - - /* - // host monitor - case MSG_PING_ACK: - case MSG_FAILURE_ACK: - monitor->proc_message(m); - break; - */ - - // map and replication - case MSG_OSD_MAP: - handle_osd_map((MOSDMap*)m); - break; - - // osd - case MSG_SHUTDOWN: - shutdown(); - delete m; - break; - - - - // -- need OSDMap -- - - default: - { - // no map? starting up? - if (!osdmap) { - dout(7) << "no OSDMap, not booted" << endl; - waiting_for_osdmap.push_back(m); - break; - } - - // down? - if (osdmap->is_down(whoami)) { - dout(7) << "i am marked down, dropping " << *m << endl; - delete m; - break; - } - - - - - // need OSDMap - switch (m->get_type()) { - - case MSG_OSD_PING: - // take note. - handle_osd_ping((MOSDPing*)m); - break; - - case MSG_OSD_PG_NOTIFY: - handle_pg_notify((MOSDPGNotify*)m); - break; - case MSG_OSD_PG_QUERY: - handle_pg_query((MOSDPGQuery*)m); - break; - case MSG_OSD_PG_LOG: - handle_pg_log((MOSDPGLog*)m); - break; - case MSG_OSD_PG_REMOVE: - handle_pg_remove((MOSDPGRemove*)m); - break; - - case MSG_OSD_OP: - handle_op((MOSDOp*)m); - break; - - // for replication etc. - case MSG_OSD_OPREPLY: - handle_op_reply((MOSDOpReply*)m); - break; - - - default: - dout(1) << " got unknown message " << m->get_type() << endl; - assert(0); - } - } - } - - // finishers? 
- if (!finished.empty()) { - list waiting; - waiting.splice(waiting.begin(), finished); - - osd_lock.Unlock(); - - for (list::iterator it = waiting.begin(); - it != waiting.end(); - it++) { - dispatch(*it); - } - return; - } - - osd_lock.Unlock(); -} - - -void OSD::ms_handle_failure(Message *m, const entity_inst_t& inst) -{ - entity_name_t dest = inst.name; - - if (g_conf.ms_die_on_failure) { - dout(0) << "ms_handle_failure " << inst << " on " << *m << endl; - exit(0); - } - - if (dest.is_osd()) { - // failed osd. drop message, report to mon. - int mon = monmap->pick_mon(); - dout(0) << "ms_handle_failure " << inst - << ", dropping and reporting to mon" << mon - << " " << *m - << endl; - messenger->send_message(new MOSDFailure(inst, osdmap->get_epoch()), - monmap->get_inst(mon)); - delete m; - } else if (dest.is_mon()) { - // resend to a different monitor. - int mon = monmap->pick_mon(true); - dout(0) << "ms_handle_failure " << inst - << ", resending to mon" << mon - << " " << *m - << endl; - messenger->send_message(m, monmap->get_inst(mon)); - } - else { - // client? - dout(0) << "ms_handle_failure " << inst - << ", dropping " << *m << endl; - delete m; - } -} - - - - -void OSD::handle_osd_ping(MOSDPing *m) -{ - dout(20) << "osdping from " << m->get_source() << endl; - _share_map_incoming(m->get_source_inst(), ((MOSDPing*)m)->map_epoch); - - int from = m->get_source().num(); - peer_qlen[from] = m->avg_qlen; - - //if (!m->ack) - //messenger->send_message(new MOSDPing(osdmap->get_epoch(), true), - //m->get_source()); - - delete m; -} - - - - -// ===================================================== -// MAP - -void OSD::wait_for_new_map(Message *m) -{ - // ask - if (waiting_for_osdmap.empty()) { - int mon = monmap->pick_mon(); - messenger->send_message(new MOSDGetMap(osdmap->get_epoch()), - monmap->get_inst(mon)); - } - - waiting_for_osdmap.push_back(m); -} - - -/** update_map - * assimilate new OSDMap(s). scan pgs, etc. 
- */ -void OSD::handle_osd_map(MOSDMap *m) -{ - wait_for_no_ops(); - - assert(osd_lock.is_locked()); - - ObjectStore::Transaction t; - - if (osdmap) { - dout(3) << "handle_osd_map epochs [" - << m->get_first() << "," << m->get_last() - << "], i have " << osdmap->get_epoch() - << endl; - } else { - dout(3) << "handle_osd_map epochs [" - << m->get_first() << "," << m->get_last() - << "], i have none" - << endl; - osdmap = new OSDMap; - boot_epoch = m->get_last(); // hrm...? - } - - logger->inc("mapmsg"); - - // store them? - for (map::iterator p = m->maps.begin(); - p != m->maps.end(); - p++) { - object_t oid = get_osdmap_object_name(p->first); - if (store->exists(oid)) { - dout(10) << "handle_osd_map already had full map epoch " << p->first << endl; - logger->inc("mapfdup"); - bufferlist bl; - get_map_bl(p->first, bl); - dout(10) << " .. it is " << bl.length() << " bytes" << endl; - continue; - } - - dout(10) << "handle_osd_map got full map epoch " << p->first << endl; - //t.write(oid, 0, p->second.length(), p->second); - store->write(oid, 0, p->second.length(), p->second, 0); - - if (p->first > superblock.newest_map) - superblock.newest_map = p->first; - if (p->first < superblock.oldest_map || - superblock.oldest_map == 0) - superblock.oldest_map = p->first; - - logger->inc("mapf"); - } - for (map::iterator p = m->incremental_maps.begin(); - p != m->incremental_maps.end(); - p++) { - object_t oid = get_inc_osdmap_object_name(p->first); - if (store->exists(oid)) { - dout(10) << "handle_osd_map already had incremental map epoch " << p->first << endl; - logger->inc("mapidup"); - bufferlist bl; - get_inc_map_bl(p->first, bl); - dout(10) << " .. 
it is " << bl.length() << " bytes" << endl; - continue; - } - - dout(10) << "handle_osd_map got incremental map epoch " << p->first << endl; - //t.write(oid, 0, p->second.length(), p->second); - store->write(oid, 0, p->second.length(), p->second, 0); - - if (p->first > superblock.newest_map) - superblock.newest_map = p->first; - if (p->first < superblock.oldest_map || - superblock.oldest_map == 0) - superblock.oldest_map = p->first; - - logger->inc("mapi"); - } - - // advance if we can - bool advanced = false; - - if (m->get_source().is_mon() && is_booting()) - advanced = true; - - epoch_t cur = superblock.current_epoch; - while (cur < superblock.newest_map) { - bufferlist bl; - if (m->incremental_maps.count(cur+1) || - store->exists(get_inc_osdmap_object_name(cur+1))) { - dout(10) << "handle_osd_map decoding inc map epoch " << cur+1 << endl; - - bufferlist bl; - if (m->incremental_maps.count(cur+1)) - bl = m->incremental_maps[cur+1]; - else - get_inc_map_bl(cur+1, bl); - - OSDMap::Incremental inc; - int off = 0; - inc.decode(bl, off); - - osdmap->apply_incremental(inc); - - // archive the full map - bl.clear(); - osdmap->encode(bl); - t.write( get_osdmap_object_name(cur+1), 0, bl.length(), bl); - - // notify messenger - for (map::iterator i = inc.new_down.begin(); - i != inc.new_down.end(); - i++) { - int osd = i->first; - if (osd == whoami) continue; - messenger->mark_down(i->second.addr); - peer_map_epoch.erase(MSG_ADDR_OSD(osd)); - - // kick any replica ops - for (hash_map::iterator it = pg_map.begin(); - it != pg_map.end(); - it++) { - PG *pg = it->second; - - _lock_pg(pg->info.pgid); - { - list ls; // do async; repop_ack() may modify pg->repop_gather - for (map::iterator p = pg->repop_gather.begin(); - p != pg->repop_gather.end(); - p++) { - //dout(-1) << "checking repop tid " << p->first << endl; - if (p->second->waitfor_ack.count(osd) || - p->second->waitfor_commit.count(osd)) - ls.push_back(p->second); - } - for (list::iterator p = ls.begin(); - p != 
ls.end(); - p++) - repop_ack(pg, *p, -1, true, osd); - } - _unlock_pg(pg->info.pgid); - } - } - for (map::iterator i = inc.new_up.begin(); - i != inc.new_up.end(); - i++) { - if (i->first == whoami) continue; - peer_map_epoch.erase(MSG_ADDR_OSD(i->first)); - } - } - else if (m->maps.count(cur+1) || - store->exists(get_osdmap_object_name(cur+1))) { - dout(10) << "handle_osd_map decoding full map epoch " << cur+1 << endl; - bufferlist bl; - if (m->maps.count(cur+1)) - bl = m->maps[cur+1]; - else - get_map_bl(cur+1, bl); - osdmap->decode(bl); - - // FIXME BUG: need to notify messenger of ups/downs!! - } - else { - dout(10) << "handle_osd_map missing epoch " << cur+1 << endl; - int mon = monmap->pick_mon(); - messenger->send_message(new MOSDGetMap(cur), monmap->get_inst(mon)); - break; - } - - cur++; - superblock.current_epoch = cur; - advance_map(t); - advanced = true; - } - - // all the way? - if (advanced && cur == superblock.newest_map) { - // yay! - activate_map(t); - - // process waiters - take_waiters(waiting_for_osdmap); - } - - // write updated pg state to store - for (hash_map::iterator i = pg_map.begin(); - i != pg_map.end(); - i++) { - pg_t pgid = i->first; - PG *pg = i->second; - t.collection_setattr( pgid, "info", &pg->info, sizeof(pg->info)); - } - - // superblock and commit - write_superblock(t); - store->apply_transaction(t); - - //if (osdmap->get_epoch() == 1) store->sync(); // in case of early death, blah - - delete m; -} - - -/** - * scan placement groups, initiate any replication - * activities. 
- */ -void OSD::advance_map(ObjectStore::Transaction& t) -{ - dout(7) << "advance_map epoch " << osdmap->get_epoch() - << " " << pg_map.size() << " pgs" - << endl; - - if (osdmap->is_mkfs()) { - ps_t maxps = 1ULL << osdmap->get_pg_bits(); - ps_t maxlps = 1ULL << osdmap->get_localized_pg_bits(); - dout(1) << "mkfs on " << osdmap->get_pg_bits() << " bits, " << maxps << " pgs" << endl; - assert(osdmap->get_epoch() == 1); - - //cerr << "osdmap " << osdmap->get_ctime() << " logger start " << logger->get_start() << endl; - logger->set_start( osdmap->get_ctime() ); - - assert(g_conf.osd_mkfs); // make sure we did a mkfs! - - // create PGs - for (int nrep = 1; - nrep <= MIN(g_conf.num_osd, g_conf.osd_max_rep); // for low osd counts.. hackish bleh - nrep++) { - for (ps_t ps = 0; ps < maxps; ++ps) { - vector acting; - pg_t pgid = osdmap->ps_nrep_to_pg(ps, nrep); - int nrep = osdmap->pg_to_acting_osds(pgid, acting); - int role = osdmap->calc_pg_role(whoami, acting, nrep); - if (role < 0) continue; - - PG *pg = create_pg(pgid, t); - pg->set_role(role); - pg->acting.swap(acting); - pg->last_epoch_started_any = - pg->info.last_epoch_started = - pg->info.history.same_since = - pg->info.history.same_primary_since = - pg->info.history.same_acker_since = osdmap->get_epoch(); - pg->activate(t); - - dout(7) << "created " << *pg << endl; - } - - for (ps_t ps = 0; ps < maxlps; ++ps) { - // local PG too - vector acting; - pg_t pgid = osdmap->ps_osd_nrep_to_pg(ps, whoami, nrep); - int nrep = osdmap->pg_to_acting_osds(pgid, acting); - int role = osdmap->calc_pg_role(whoami, acting, nrep); - - PG *pg = create_pg(pgid, t); - pg->acting.swap(acting); - pg->set_role(role); - pg->last_epoch_started_any = - pg->info.last_epoch_started = - pg->info.history.same_primary_since = - pg->info.history.same_acker_since = - pg->info.history.same_since = osdmap->get_epoch(); - pg->activate(t); - - dout(7) << "created " << *pg << endl; - } - } - - dout(1) << "mkfs done, created " << pg_map.size() << " pgs" 
<< endl; - - } else { - // scan existing pg's - for (hash_map::iterator it = pg_map.begin(); - it != pg_map.end(); - it++) { - pg_t pgid = it->first; - PG *pg = it->second; - - // did i finish this epoch? - if (pg->is_active()) { - pg->info.last_epoch_finished = osdmap->get_epoch()-1; - } - - // get new acting set - vector tacting; - int nrep = osdmap->pg_to_acting_osds(pgid, tacting); - int role = osdmap->calc_pg_role(whoami, tacting, nrep); - - // no change? - if (tacting == pg->acting) - continue; - - // -- there was a change! -- - _lock_pg(pgid); - - int oldrole = pg->get_role(); - int oldprimary = pg->get_primary(); - int oldacker = pg->get_acker(); - vector oldacting = pg->acting; - - // update PG - pg->acting.swap(tacting); - pg->set_role(role); - - // did primary|acker change? - pg->info.history.same_since = osdmap->get_epoch(); - if (oldprimary != pg->get_primary()) { - pg->info.history.same_primary_since = osdmap->get_epoch(); - pg->cancel_recovery(); - } - if (oldacker != pg->get_acker()) { - pg->info.history.same_acker_since = osdmap->get_epoch(); - } - - // deactivate. - pg->state_clear(PG::STATE_ACTIVE); - - // reset primary state? - if (oldrole == 0 || pg->get_role() == 0) - pg->clear_primary_state(); - - // apply any repops in progress. - if (oldacker == whoami) { - // apply repops - for (map::iterator p = pg->repop_gather.begin(); - p != pg->repop_gather.end(); - p++) { - if (!p->second->applied) - apply_repop(pg, p->second); - delete p->second->op; - delete p->second; - } - pg->repop_gather.clear(); - - // and repop waiters - for (map >::iterator p = pg->waiting_for_repop.begin(); - p != pg->waiting_for_repop.end(); - p++) - for (list::iterator pm = p->second.begin(); - pm != p->second.end(); - pm++) - delete *pm; - pg->waiting_for_repop.clear(); - } - - if (role != oldrole) { - // old primary? 
- if (oldrole == 0) { - pg->state_clear(PG::STATE_CLEAN); - - // take replay queue waiters - list ls; - for (map::iterator it = pg->replay_queue.begin(); - it != pg->replay_queue.end(); - it++) - ls.push_back(it->second); - pg->replay_queue.clear(); - take_waiters(ls); - - // take active waiters - take_waiters(pg->waiting_for_active); - - // take object waiters - for (hash_map >::iterator it = pg->waiting_for_missing_object.begin(); - it != pg->waiting_for_missing_object.end(); - it++) - take_waiters(it->second); - pg->waiting_for_missing_object.clear(); - } - - // new primary? - if (role == 0) { - // i am new primary - pg->state_clear(PG::STATE_STRAY); - } else { - // i am now replica|stray. we need to send a notify. - pg->state_set(PG::STATE_STRAY); - - if (nrep == 0) { - pg->state_set(PG::STATE_CRASHED); - dout(1) << *pg << " is crashed" << endl; - } - } - - // my role changed. - dout(10) << *pg << " " << oldacting << " -> " << pg->acting - << ", role " << oldrole << " -> " << role << endl; - - } else { - // no role change. - // did primary change? - if (pg->get_primary() != oldprimary) { - // we need to announce - pg->state_set(PG::STATE_STRAY); - - dout(10) << *pg << " " << oldacting << " -> " << pg->acting - << ", acting primary " - << oldprimary << " -> " << pg->get_primary() - << endl; - } else { - // primary is the same. - if (role == 0) { - // i am (still) primary. but my replica set changed. 
- pg->state_clear(PG::STATE_CLEAN); - pg->state_clear(PG::STATE_REPLAY); - - dout(10) << *pg << " " << oldacting << " -> " << pg->acting - << ", replicas changed" << endl; - } - } - } - - - _unlock_pg(pgid); - } - } -} - -void OSD::activate_map(ObjectStore::Transaction& t) -{ - dout(7) << "activate_map version " << osdmap->get_epoch() << endl; - - map< int, list > notify_list; // primary -> list - map< int, map > query_map; // peer -> PG -> get_summary_since - - // scan pg's - for (hash_map::iterator it = pg_map.begin(); - it != pg_map.end(); - it++) { - //pg_t pgid = it->first; - PG *pg = it->second; - - if (pg->is_active()) { - // update started counter - pg->info.last_epoch_started = osdmap->get_epoch(); - } - else if (pg->get_role() == 0 && !pg->is_active()) { - // i am (inactive) primary - pg->build_prior(); - pg->peer(t, query_map); - } - else if (pg->is_stray() && - pg->get_primary() >= 0) { - // i am residual|replica - notify_list[pg->get_primary()].push_back(pg->info); - } - - } - - if (osdmap->is_mkfs()) // hack: skip the queries/summaries if it's a mkfs - return; - - // notify? (residual|replica) - do_notifies(notify_list); - - // do queries. - do_queries(query_map); - - logger->set("numpg", pg_map.size()); -} - - -void OSD::send_incremental_map(epoch_t since, const entity_inst_t& inst, bool full) -{ - dout(10) << "send_incremental_map " << since << " -> " << osdmap->get_epoch() - << " to " << inst << endl; - - MOSDMap *m = new MOSDMap; - - for (epoch_t e = osdmap->get_epoch(); - e > since; - e--) { - bufferlist bl; - if (get_inc_map_bl(e,bl)) { - m->incremental_maps[e].claim(bl); - } else if (get_map_bl(e,bl)) { - m->maps[e].claim(bl); - if (!full) break; - } - else { - assert(0); // we should have all maps. 
- } - } - - messenger->send_message(m, inst); -} - -bool OSD::get_map_bl(epoch_t e, bufferlist& bl) -{ - return store->read(get_osdmap_object_name(e), 0, 0, bl) >= 0; -} - -bool OSD::get_inc_map_bl(epoch_t e, bufferlist& bl) -{ - return store->read(get_inc_osdmap_object_name(e), 0, 0, bl) >= 0; -} - -void OSD::get_map(epoch_t epoch, OSDMap &m) -{ - // find a complete map - list incs; - epoch_t e; - for (e = epoch; e > 0; e--) { - bufferlist bl; - if (get_map_bl(e, bl)) { - //dout(10) << "get_map " << epoch << " full " << e << endl; - m.decode(bl); - break; - } else { - OSDMap::Incremental inc; - bool got = get_inc_map(e, inc); - assert(got); - incs.push_front(inc); - } - } - assert(e > 0); - - // apply incrementals - for (e++; e <= epoch; e++) { - //dout(10) << "get_map " << epoch << " inc " << e << endl; - m.apply_incremental( incs.front() ); - incs.pop_front(); - } -} - - -bool OSD::get_inc_map(epoch_t e, OSDMap::Incremental &inc) -{ - bufferlist bl; - if (!get_inc_map_bl(e, bl)) - return false; - int off = 0; - inc.decode(bl, off); - return true; -} - - - - - -bool OSD::require_current_map(Message *m, epoch_t ep) -{ - // older map? - if (ep < osdmap->get_epoch()) { - dout(7) << "require_current_map epoch " << ep << " < " << osdmap->get_epoch() << endl; - delete m; // discard and ignore. - return false; - } - - // newer map? - if (ep > osdmap->get_epoch()) { - dout(7) << "require_current_map epoch " << ep << " > " << osdmap->get_epoch() << endl; - wait_for_new_map(m); - return false; - } - - assert(ep == osdmap->get_epoch()); - return true; -} - - -/* - * require that we have same (or newer) map, and that - * the source is the pg primary. - */ -bool OSD::require_same_or_newer_map(Message *m, epoch_t epoch) -{ - dout(10) << "require_same_or_newer_map " << epoch << " (i am " << osdmap->get_epoch() << ")" << endl; - - // newer map? 
- if (epoch > osdmap->get_epoch()) { - dout(7) << " from newer map epoch " << epoch << " > " << osdmap->get_epoch() << endl; - wait_for_new_map(m); - return false; - } - - if (epoch < boot_epoch) { - dout(7) << " from pre-boot epoch " << epoch << " < " << boot_epoch << endl; - delete m; - return false; - } - - return true; -} - - - - -// ====================================================== -// REPLICATION - -// PG - -bool OSD::pg_exists(pg_t pgid) -{ - return store->collection_exists(pgid); -} - -PG *OSD::create_pg(pg_t pgid, ObjectStore::Transaction& t) -{ - if (pg_map.count(pgid)) { - dout(0) << "create_pg on " << pgid << ", already have " << *pg_map[pgid] << endl; - } - assert(pg_map.count(pgid) == 0); - assert(!pg_exists(pgid)); - - PG *pg = new PG(this, pgid); - pg_map[pgid] = pg; - - t.create_collection(pgid); - - return pg; -} - - - - -PG *OSD::get_pg(pg_t pgid) -{ - if (pg_map.count(pgid)) - return pg_map[pgid]; - return 0; -} - -void OSD::load_pgs() -{ - dout(10) << "load_pgs" << endl; - assert(pg_map.empty()); - - list ls; - store->list_collections(ls); - - for (list::iterator it = ls.begin(); - it != ls.end(); - it++) { - pg_t pgid = *it; - - PG *pg = new PG(this, pgid); - pg_map[pgid] = pg; - - // read pg info - store->collection_getattr(pgid, "info", &pg->info, sizeof(pg->info)); - - // read pg log - pg->read_log(store); - - // generate state for current mapping - int nrep = osdmap->pg_to_acting_osds(pgid, pg->acting); - int role = osdmap->calc_pg_role(whoami, pg->acting, nrep); - pg->set_role(role); - - dout(10) << "load_pgs loaded " << *pg << " " << pg->log << endl; - } -} - -/** - * check epochs starting from start to verify the pg acting set hasn't changed - * up until now - */ -void OSD::project_pg_history(pg_t pgid, PG::Info::History& h, epoch_t from) -{ - dout(15) << "project_pg_history " << pgid - << " from " << from << " to " << osdmap->get_epoch() - << ", start " << h - << endl; - - vector last; - osdmap->pg_to_acting_osds(pgid, last); - - 
for (epoch_t e = osdmap->get_epoch()-1; - e >= from; - e--) { - // verify during intermediate epoch - OSDMap oldmap; - get_map(e, oldmap); - - vector acting; - oldmap.pg_to_acting_osds(pgid, acting); - - // acting set change? - if (acting != last && - e <= h.same_since) { - dout(15) << "project_pg_history " << pgid << " changed in " << e+1 - << " from " << acting << " -> " << last << endl; - h.same_since = e+1; - } - - // primary change? - if (!(!acting.empty() && !last.empty() && acting[0] == last[0]) && - e <= h.same_primary_since) { - dout(15) << "project_pg_history " << pgid << " primary changed in " << e+1 << endl; - h.same_primary_since = e+1; - - if (g_conf.osd_rep == OSD_REP_PRIMARY) - h.same_acker_since = h.same_primary_since; - } - - // acker change? - if (g_conf.osd_rep != OSD_REP_PRIMARY) { - if (!(!acting.empty() && !last.empty() && acting[acting.size()-1] == last[last.size()-1]) && - e <= h.same_acker_since) { - dout(15) << "project_pg_history " << pgid << " acker changed in " << e+1 << endl; - h.same_acker_since = e+1; - } - } - - if (h.same_since > e && - h.same_primary_since > e && - h.same_acker_since > e) break; - } - - dout(15) << "project_pg_history end " << h << endl; -} - - -/** do_notifies - * Send an MOSDPGNotify to a primary, with a list of PGs that I have - * content for, and they are primary for. 
- */ - -void OSD::do_notifies(map< int, list >& notify_list) -{ - for (map< int, list >::iterator it = notify_list.begin(); - it != notify_list.end(); - it++) { - if (it->first == whoami) { - dout(7) << "do_notify osd" << it->first << " is self, skipping" << endl; - continue; - } - dout(7) << "do_notify osd" << it->first << " on " << it->second.size() << " PGs" << endl; - MOSDPGNotify *m = new MOSDPGNotify(osdmap->get_epoch(), it->second); - _share_map_outgoing(osdmap->get_inst(it->first)); - messenger->send_message(m, osdmap->get_inst(it->first)); - } -} - - -/** do_queries - * send out pending queries for info | summaries - */ -void OSD::do_queries(map< int, map >& query_map) -{ - for (map< int, map >::iterator pit = query_map.begin(); - pit != query_map.end(); - pit++) { - int who = pit->first; - dout(7) << "do_queries querying osd" << who - << " on " << pit->second.size() << " PGs" << endl; - - MOSDPGQuery *m = new MOSDPGQuery(osdmap->get_epoch(), - pit->second); - _share_map_outgoing(osdmap->get_inst(who)); - messenger->send_message(m, osdmap->get_inst(who)); - } -} - - - - -/** PGNotify - * from non-primary to primary - * includes PG::Info. - * NOTE: called with opqueue active. - */ -void OSD::handle_pg_notify(MOSDPGNotify *m) -{ - dout(7) << "handle_pg_notify from " << m->get_source() << endl; - int from = m->get_source().num(); - - if (!require_same_or_newer_map(m, m->get_epoch())) return; - - ObjectStore::Transaction t; - - // look for unknown PGs i'm primary for - map< int, map > query_map; - - for (list::iterator it = m->get_pg_list().begin(); - it != m->get_pg_list().end(); - it++) { - pg_t pgid = it->pgid; - PG *pg; - - if (pg_map.count(pgid) == 0) { - // same primary? 
- PG::Info::History history = it->history; - project_pg_history(pgid, history, m->get_epoch()); - - if (m->get_epoch() < history.same_primary_since) { - dout(10) << "handle_pg_notify pg " << pgid << " dne, and primary changed in " - << history.same_primary_since << " (msg from " << m->get_epoch() << ")" << endl; - continue; - } - - // ok, create PG! - pg = create_pg(pgid, t); - osdmap->pg_to_acting_osds(pgid, pg->acting); - pg->set_role(0); - pg->info.history = history; - - pg->last_epoch_started_any = it->last_epoch_started; - pg->build_prior(); - - t.collection_setattr(pgid, "info", (char*)&pg->info, sizeof(pg->info)); - - dout(10) << *pg << " is new" << endl; - - // kick any waiters - if (waiting_for_pg.count(pgid)) { - take_waiters(waiting_for_pg[pgid]); - waiting_for_pg.erase(pgid); - } - - _lock_pg(pgid); - } else { - // already had it. am i (still) the primary? - pg = _lock_pg(pgid); - if (m->get_epoch() < pg->info.history.same_primary_since) { - dout(10) << *pg << " handle_pg_notify primary changed in " - << pg->info.history.same_primary_since - << " (msg from " << m->get_epoch() << ")" << endl; - _unlock_pg(pgid); - continue; - } - } - - // ok! - - // stray? - bool acting = pg->is_acting(from); - if (!acting && (*it).last_epoch_started > 0) { - dout(10) << *pg << " osd" << from << " has stray content: " << *it << endl; - pg->stray_set.insert(from); - pg->state_clear(PG::STATE_CLEAN); - } - - // save info. - bool had = pg->peer_info.count(from); - pg->peer_info[from] = *it; - - if (had) { - if (pg->is_active() && - (*it).is_clean() && acting) { - pg->clean_set.insert(from); - dout(10) << *pg << " osd" << from << " now clean (" << pg->clean_set - << "): " << *it << endl; - if (pg->is_all_clean()) { - dout(-10) << *pg << " now clean on all replicas" << endl; - pg->state_set(PG::STATE_CLEAN); - pg->clean_replicas(); - } - } else { - // hmm, maybe keep an eye out for cases where we see this, but peer should happen. 
- dout(10) << *pg << " already had notify info from osd" << from << ": " << *it << endl; - } - } else { - // adjust prior? - if (it->last_epoch_started > pg->last_epoch_started_any) - pg->adjust_prior(); - - // peer - pg->peer(t, query_map); - } - - _unlock_pg(pgid); - } - - unsigned tr = store->apply_transaction(t); - assert(tr == 0); - - do_queries(query_map); - - delete m; -} - - - -/** PGLog - * from non-primary to primary - * includes log and info - * from primary to non-primary - * includes log for use in recovery - * NOTE: called with opqueue active. - */ - -void OSD::handle_pg_log(MOSDPGLog *m) -{ - int from = m->get_source().num(); - const pg_t pgid = m->get_pgid(); - - if (!require_same_or_newer_map(m, m->get_epoch())) return; - if (pg_map.count(pgid) == 0) { - dout(10) << "handle_pg_log don't have pg " << pgid << ", dropping" << endl; - assert(m->get_epoch() < osdmap->get_epoch()); - delete m; - return; - } - - PG *pg = _lock_pg(pgid); - assert(pg); - - if (m->get_epoch() < pg->info.history.same_since) { - dout(10) << "handle_pg_log " << *pg - << " from " << m->get_source() - << " is old, discarding" - << endl; - delete m; - return; - } - - dout(7) << "handle_pg_log " << *pg - << " got " << m->log << " " << m->missing - << " from " << m->get_source() << endl; - - //m->log.print(cout); - - ObjectStore::Transaction t; - - if (pg->is_primary()) { - // i am PRIMARY - assert(pg->peer_log_requested.count(from) || - pg->peer_summary_requested.count(from)); - - pg->proc_replica_log(m->log, m->missing, from); - - // peer - map< int, map > query_map; - pg->peer(t, query_map); - do_queries(query_map); - - } else { - // i am REPLICA - dout(10) << *pg << " got " << m->log << " " << m->missing << endl; - - // merge log - pg->merge_log(m->log, m->missing, from); - pg->proc_missing(m->log, m->missing, from); - assert(pg->missing.num_lost() == 0); - - // ok activate! 
- pg->activate(t); - } - - unsigned tr = store->apply_transaction(t); - assert(tr == 0); - - _unlock_pg(pgid); - - delete m; -} - - -/** PGQuery - * from primary to replica | stray - * NOTE: called with opqueue active. - */ -void OSD::handle_pg_query(MOSDPGQuery *m) -{ - dout(7) << "handle_pg_query from " << m->get_source() << " epoch " << m->get_epoch() << endl; - int from = m->get_source().num(); - - if (!require_same_or_newer_map(m, m->get_epoch())) return; - - map< int, list > notify_list; - - for (map::iterator it = m->pg_list.begin(); - it != m->pg_list.end(); - it++) { - pg_t pgid = it->first; - PG *pg = 0; - - if (pg_map.count(pgid) == 0) { - // same primary? - PG::Info::History history = it->second.history; - project_pg_history(pgid, history, m->get_epoch()); - - if (m->get_epoch() < history.same_since) { - dout(10) << " pg " << pgid << " dne, and pg has changed in " - << history.same_primary_since << " (msg from " << m->get_epoch() << ")" << endl; - continue; - } - - // get active rush mapping - vector acting; - int nrep = osdmap->pg_to_acting_osds(pgid, acting); - int role = osdmap->calc_pg_role(whoami, acting, nrep); - - if (role < 0) { - dout(10) << " pg " << pgid << " dne, and i am not an active replica" << endl; - PG::Info empty(pgid); - notify_list[from].push_back(empty); - continue; - } - assert(role > 0); - - ObjectStore::Transaction t; - pg = create_pg(pgid, t); - pg->acting.swap( acting ); - pg->set_role(role); - pg->info.history = history; - - t.collection_setattr(pgid, "info", (char*)&pg->info, sizeof(pg->info)); - store->apply_transaction(t); - - dout(10) << *pg << " dne (before), but i am role " << role << endl; - _lock_pg(pgid); - } else { - pg = _lock_pg(pgid); - - // same primary? 
- if (m->get_epoch() < pg->info.history.same_since) { - dout(10) << *pg << " handle_pg_query primary changed in " - << pg->info.history.same_since - << " (msg from " << m->get_epoch() << ")" << endl; - _unlock_pg(pgid); - continue; - } - } - - // ok, process query! - assert(!pg->acting.empty()); - assert(from == pg->acting[0]); - - if (it->second.type == PG::Query::INFO) { - // info - dout(10) << *pg << " sending info" << endl; - notify_list[from].push_back(pg->info); - } else { - MOSDPGLog *m = new MOSDPGLog(osdmap->get_epoch(), pg->get_pgid()); - m->info = pg->info; - m->missing = pg->missing; - - if (it->second.type == PG::Query::LOG) { - dout(10) << *pg << " sending info+missing+log since split " << it->second.split - << " from floor " << it->second.floor - << endl; - if (!m->log.copy_after_unless_divergent(pg->log, it->second.split, it->second.floor)) { - dout(10) << *pg << " divergent, sending backlog" << endl; - it->second.type = PG::Query::BACKLOG; - } - } - - if (it->second.type == PG::Query::BACKLOG) { - dout(10) << *pg << " sending info+missing+backlog" << endl; - if (pg->log.backlog) { - m->log = pg->log; - } else { - pg->generate_backlog(); - m->log = pg->log; - pg->drop_backlog(); - } - } - else if (it->second.type == PG::Query::FULLLOG) { - dout(10) << *pg << " sending info+missing+full log" << endl; - m->log.copy_non_backlog(pg->log); - } - - dout(10) << *pg << " sending " << m->log << " " << m->missing << endl; - //m->log.print(cout); - - _share_map_outgoing(osdmap->get_inst(from)); - messenger->send_message(m, osdmap->get_inst(from)); - } - - _unlock_pg(pgid); - } - - do_notifies(notify_list); - - delete m; -} - - -void OSD::handle_pg_remove(MOSDPGRemove *m) -{ - dout(7) << "handle_pg_remove from " << m->get_source() << endl; - - if (!require_same_or_newer_map(m, m->get_epoch())) return; - - for (set::iterator it = m->pg_list.begin(); - it != m->pg_list.end(); - it++) { - pg_t pgid = *it; - PG *pg; - - if (pg_map.count(pgid) == 0) { - dout(10) << 
" don't have pg " << pgid << endl; - continue; - } - - pg = _lock_pg(pgid); - - dout(10) << *pg << " removing." << endl; - assert(pg->get_role() == -1); - - _remove_pg(pgid); - - // unlock. there shouldn't be any waiters, since we're a stray, and pg is presumably clean0. - assert(pg_lock_waiters.count(pgid) == 0); - _unlock_pg(pgid); - } - - delete m; -} - - - - - - -/*** RECOVERY ***/ - -/** pull - request object from a peer - */ -void OSD::pull(PG *pg, object_t oid) -{ - assert(pg->missing.loc.count(oid)); - eversion_t v = pg->missing.missing[oid]; - int osd = pg->missing.loc[oid]; - - dout(7) << *pg << " pull " << oid - << " v " << v - << " from osd" << osd - << endl; - - // send op - tid_t tid = ++last_tid; - MOSDOp *op = new MOSDOp(messenger->get_myinst(), 0, tid, - oid, pg->get_pgid(), - osdmap->get_epoch(), - OSD_OP_PULL); - op->set_version(v); - messenger->send_message(op, osdmap->get_inst(osd)); - - // take note - assert(pg->objects_pulling.count(oid) == 0); - num_pulling++; - pg->objects_pulling[oid] = v; -} - - -/** push - send object to a peer - */ -void OSD::push(PG *pg, object_t oid, int dest) -{ - // read data+attrs - bufferlist bl; - eversion_t v; - int vlen = sizeof(v); - map attrset; - - ObjectStore::Transaction t; - t.read(oid, 0, 0, &bl); - t.getattr(oid, "version", &v, &vlen); - t.getattrs(oid, attrset); - unsigned tr = store->apply_transaction(t); - - assert(tr == 0); // !!! - - // ok - dout(7) << *pg << " push " << oid << " v " << v - << " size " << bl.length() - << " to osd" << dest - << endl; - - logger->inc("r_push"); - logger->inc("r_pushb", bl.length()); - - // send - MOSDOp *op = new MOSDOp(messenger->get_myinst(), 0, ++last_tid, - oid, pg->info.pgid, osdmap->get_epoch(), - OSD_OP_PUSH); - op->set_offset(0); - op->set_length(bl.length()); - op->set_data(bl); // note: claims bl, set length above here! 
- op->set_version(v); - op->set_attrset(attrset); - - messenger->send_message(op, osdmap->get_inst(dest)); -} - - -/** op_pull - * process request to pull an entire object. - * NOTE: called from opqueue. - */ -void OSD::op_pull(MOSDOp *op, PG *pg) -{ - const object_t oid = op->get_oid(); - const eversion_t v = op->get_version(); - int from = op->get_source().num(); - - dout(7) << *pg << " op_pull " << oid << " v " << op->get_version() - << " from " << op->get_source() - << endl; - - // is a replica asking? are they missing it? - if (pg->is_primary()) { - // primary - assert(pg->peer_missing.count(from)); // we had better know this, from the peering process. - - if (!pg->peer_missing[from].is_missing(oid)) { - dout(7) << *pg << " op_pull replica isn't actually missing it, we must have already pushed to them" << endl; - delete op; - return; - } - - // do we have it yet? - if (waitfor_missing_object(op, pg)) - return; - } else { - // non-primary - if (pg->missing.is_missing(oid)) { - dout(7) << *pg << " op_pull not primary, and missing " << oid << ", ignoring" << endl; - delete op; - return; - } - } - - // push it back! - push(pg, oid, op->get_source().num()); -} - - -/** op_push - * NOTE: called from opqueue. - */ -void OSD::op_push(MOSDOp *op, PG *pg) -{ - object_t oid = op->get_oid(); - eversion_t v = op->get_version(); - - if (!pg->missing.is_missing(oid)) { - dout(7) << *pg << " op_push not missing " << oid << endl; - return; - } - - dout(7) << *pg << " op_push " - << oid - << " v " << v - << " size " << op->get_length() << " " << op->get_data().length() - << endl; - - assert(op->get_data().length() == op->get_length()); - - // write object and add it to the PG - ObjectStore::Transaction t; - t.remove(oid); // in case old version exists - t.write(oid, 0, op->get_length(), op->get_data()); - t.setattrs(oid, op->get_attrset()); - t.collection_add(pg->info.pgid, oid); - - // close out pull op? 
- num_pulling--; - if (pg->objects_pulling.count(oid)) - pg->objects_pulling.erase(oid); - pg->missing.got(oid, v); - - - // raise last_complete? - assert(pg->log.complete_to != pg->log.log.end()); - while (pg->log.complete_to != pg->log.log.end()) { - if (pg->missing.missing.count(pg->log.complete_to->oid)) break; - if (pg->info.last_complete < pg->log.complete_to->version) - pg->info.last_complete = pg->log.complete_to->version; - pg->log.complete_to++; - } - dout(10) << *pg << " last_complete now " << pg->info.last_complete << endl; - - - // apply to disk! - t.collection_setattr(pg->info.pgid, "info", &pg->info, sizeof(pg->info)); - unsigned r = store->apply_transaction(t); - assert(r == 0); - - - - // am i primary? are others missing this too? - if (pg->is_primary()) { - for (unsigned i=1; iacting.size(); i++) { - int peer = pg->acting[i]; - assert(pg->peer_missing.count(peer)); - if (pg->peer_missing[peer].is_missing(oid)) { - // ok, push it, and they (will) have it now. - pg->peer_missing[peer].got(oid, v); - push(pg, oid, peer); - } - } - } - - // continue recovery - pg->do_recovery(); - - // kick waiters - if (pg->waiting_for_missing_object.count(oid)) - take_waiters(pg->waiting_for_missing_object[oid]); - - delete op; -} - - - - -// op_rep_modify - -// commit (to disk) callback -class C_OSD_RepModifyCommit : public Context { -public: - OSD *osd; - MOSDOp *op; - int destosd; - - eversion_t pg_last_complete; - - Mutex lock; - Cond cond; - bool acked; - bool waiting; - - C_OSD_RepModifyCommit(OSD *o, MOSDOp *oo, int dosd, eversion_t lc) : - osd(o), op(oo), destosd(dosd), pg_last_complete(lc), - acked(false), waiting(false) { } - void finish(int r) { - lock.Lock(); - assert(!waiting); - while (!acked) { - waiting = true; - cond.Wait(lock); - } - assert(acked); - lock.Unlock(); - osd->op_rep_modify_commit(op, destosd, pg_last_complete); - } - void ack() { - lock.Lock(); - assert(!acked); - acked = true; - if (waiting) cond.Signal(); - - // discard my reference 
to buffer - op->get_data().clear(); - - lock.Unlock(); - } -}; - -void OSD::op_rep_modify_commit(MOSDOp *op, int ackerosd, eversion_t last_complete) -{ - // send commit. - dout(10) << "rep_modify_commit on op " << *op - << ", sending commit to osd" << ackerosd - << endl; - MOSDOpReply *commit = new MOSDOpReply(op, 0, osdmap->get_epoch(), true); - commit->set_pg_complete_thru(last_complete); - messenger->send_message(commit, osdmap->get_inst(ackerosd)); - delete op; -} - -// process a modification operation - -class C_OSD_WriteCommit : public Context { -public: - OSD *osd; - pg_t pgid; - tid_t rep_tid; - eversion_t pg_last_complete; - C_OSD_WriteCommit(OSD *o, pg_t p, tid_t rt, eversion_t lc) : osd(o), pgid(p), rep_tid(rt), pg_last_complete(lc) {} - void finish(int r) { - osd->op_modify_commit(pgid, rep_tid, pg_last_complete); - } -}; - - -/** op_rep_modify - * process a replicated modify. - * NOTE: called from opqueue. - */ -void OSD::op_rep_modify(MOSDOp *op, PG *pg) -{ - object_t oid = op->get_oid(); - eversion_t nv = op->get_version(); - - const char *opname = MOSDOp::get_opname(op->get_op()); - - // check crev - objectrev_t crev = 0; - store->getattr(oid, "crev", (char*)&crev, sizeof(crev)); - - dout(10) << "op_rep_modify " << opname - << " " << oid - << " v " << nv - << " " << op->get_offset() << "~" << op->get_length() - << " in " << *pg - << endl; - - // we better not be missing this. - assert(!pg->missing.is_missing(oid)); - - // prepare our transaction - ObjectStore::Transaction t; - - // am i acker? - PG::RepOpGather *repop = 0; - int ackerosd = pg->acting[0]; - - if ((g_conf.osd_rep == OSD_REP_CHAIN || g_conf.osd_rep == OSD_REP_SPLAY)) { - ackerosd = pg->get_acker(); - - if (pg->is_acker()) { - // i am tail acker. 
- if (pg->repop_gather.count(op->get_rep_tid())) { - repop = pg->repop_gather[ op->get_rep_tid() ]; - } else { - repop = new_repop_gather(pg, op); - } - - // infer ack from source - int fromosd = op->get_source().num(); - get_repop_gather(repop); - { - //assert(repop->waitfor_ack.count(fromosd)); // no, we may come thru here twice. - repop->waitfor_ack.erase(fromosd); - } - put_repop_gather(pg, repop); - - // prepare dest socket - //messenger->prepare_send_message(op->get_client()); - } - - // chain? forward? - if (g_conf.osd_rep == OSD_REP_CHAIN && !pg->is_acker()) { - // chain rep, not at the tail yet. - int myrank = osdmap->calc_pg_rank(whoami, pg->acting); - int next = myrank+1; - if (next == (int)pg->acting.size()) - next = 1; - issue_repop(pg, op, pg->acting[next]); - } - } - - // do op? - C_OSD_RepModifyCommit *oncommit = 0; - - logger->inc("r_wr"); - logger->inc("r_wrb", op->get_length()); - - if (repop) { - // acker. we'll apply later. - if (op->get_op() != OSD_OP_WRNOOP) { - prepare_log_transaction(repop->t, op, nv, crev, op->get_rev(), pg, op->get_pg_trim_to()); - prepare_op_transaction(repop->t, op, nv, crev, op->get_rev(), pg); - } - } else { - // middle|replica. - if (op->get_op() != OSD_OP_WRNOOP) { - prepare_log_transaction(t, op, nv, crev, op->get_rev(), pg, op->get_pg_trim_to()); - prepare_op_transaction(t, op, nv, crev, op->get_rev(), pg); - } - - oncommit = new C_OSD_RepModifyCommit(this, op, ackerosd, pg->info.last_complete); - - // apply log update. and possibly update itself. - unsigned tr = store->apply_transaction(t, oncommit); - if (tr != 0 && // no errors - tr != 2) { // or error on collection_add - cerr << "error applying transaction: r = " << tr << endl; - assert(tr == 0); - } - } - - // ack? - if (repop) { - // (logical) local ack. this may induce the actual update. 
- get_repop_gather(repop); - { - assert(repop->waitfor_ack.count(whoami)); - repop->waitfor_ack.erase(whoami); - } - put_repop_gather(pg, repop); - } - else { - // send ack to acker? - if (g_conf.osd_rep != OSD_REP_CHAIN) { - MOSDOpReply *ack = new MOSDOpReply(op, 0, osdmap->get_epoch(), false); - messenger->send_message(ack, osdmap->get_inst(ackerosd)); - } - - // ack myself. - assert(oncommit); - oncommit->ack(); - } -} - - -// ========================================================= -// OPS - -void OSD::handle_op(MOSDOp *op) -{ - const pg_t pgid = op->get_pg(); - PG *pg = get_pg(pgid); - - - logger->set("buf", buffer_total_alloc); - - // update qlen stats - hb_stat_ops++; - hb_stat_qlen += pending_ops; - - - // require same or newer map - if (!require_same_or_newer_map(op, op->get_map_epoch())) return; - - // share our map with sender, if they're old - _share_map_incoming(op->get_source_inst(), op->get_map_epoch()); - - // what kind of op? - bool read = op->get_op() < 10; // read, stat. but not pull. - - if (!op->get_source().is_osd()) { - // REGULAR OP (non-replication) - - // note original source - op->set_client_inst( op->get_source_inst() ); - op->clear_payload(); // and hose encoded payload (in case we forward) - - // have pg? - if (!pg) { - dout(7) << "hit non-existent pg " - << pgid - << ", waiting" << endl; - waiting_for_pg[pgid].push_back(op); - return; - } - - if (read) { - // read. am i the (same) acker? - if (//pg->get_acker() != whoami || - op->get_map_epoch() < pg->info.history.same_acker_since) { - dout(7) << "acting acker is osd" << pg->get_acker() - << " since " << pg->info.history.same_acker_since - << ", dropping" << endl; - assert(op->get_map_epoch() < osdmap->get_epoch()); - delete op; - return; - } - } else { - // write. am i the (same) primary? 
- if (pg->get_primary() != whoami || - op->get_map_epoch() < pg->info.history.same_primary_since) { - dout(7) << "acting primary is osd" << pg->get_primary() - << " since " << pg->info.history.same_primary_since - << ", dropping" << endl; - assert(op->get_map_epoch() < osdmap->get_epoch()); - delete op; - return; - } - } - - // must be active. - if (!pg->is_active()) { - // replay? - if (op->get_version().version > 0) { - if (op->get_version() > pg->info.last_update) { - dout(7) << *pg << " queueing replay at " << op->get_version() - << " for " << *op << endl; - pg->replay_queue[op->get_version()] = op; - return; - } else { - dout(7) << *pg << " replay at " << op->get_version() << " <= " << pg->info.last_update - << " for " << *op - << ", will queue for WRNOOP" << endl; - } - } - - dout(7) << *pg << " not active (yet)" << endl; - pg->waiting_for_active.push_back(op); - return; - } - - // missing object? - if (read && op->get_oid().rev > 0) { - // versioned read. hrm. - // are we missing a revision that we might need? - object_t moid = op->get_oid(); - if (pick_missing_object_rev(moid, pg)) { - // is there a local revision we might use instead? - object_t loid = op->get_oid(); - if (store->pick_object_revision_lt(loid) && - moid <= loid) { - // we need moid. pull it. - dout(10) << "handle_op read on " << op->get_oid() - << ", have " << loid - << ", but need missing " << moid - << ", pulling" << endl; - pull(pg, moid); - pg->waiting_for_missing_object[moid].push_back(op); - return; - } - - dout(10) << "handle_op read on " << op->get_oid() - << ", have " << loid - << ", don't need missing " << moid - << endl; - } - } else { - // live revision. easy. - if (op->get_op() != OSD_OP_PUSH && - waitfor_missing_object(op, pg)) return; - } - - dout(7) << "handle_op " << *op << " in " << *pg << endl; - - - // balance reads? 
- if (read && - g_conf.osd_balance_reads && - pg->get_acker() == whoami) { - // test - if (false) { - if (pg->acting.size() > 1) { - int peer = pg->acting[1]; - dout(-10) << "fwd client read op to osd" << peer << " for " << op->get_client() << " " << op->get_client_inst() << endl; - messenger->send_message(op, osdmap->get_inst(peer)); - return; - } - } - - // am i above my average? - float my_avg = hb_stat_qlen / hb_stat_ops; - if (pending_ops > my_avg) { - // is there a peer who is below my average? - for (unsigned i=1; iacting.size(); ++i) { - int peer = pg->acting[i]; - if (peer_qlen.count(peer) && - peer_qlen[peer] < my_avg) { - // calculate a probability that we should redirect - float p = (my_avg - peer_qlen[peer]) / my_avg; // this is dumb. - - if (drand48() <= p) { - // take the first one - dout(-10) << "my qlen " << pending_ops << " > my_avg " << my_avg - << ", p=" << p - << ", fwd to peer w/ qlen " << peer_qlen[peer] - << " osd" << peer - << endl; - messenger->send_message(op, osdmap->get_inst(peer)); - return; - } - } - } - } - } - - } else { - // REPLICATION OP (it's from another OSD) - - // have pg? - if (!pg) { - derr(-7) << "handle_rep_op " << *op - << " pgid " << pgid << " dne" << endl; - delete op; - //assert(0); // wtf, shouldn't happen. - return; - } - - // check osd map: same set, or primary+acker? 
- if (g_conf.osd_rep == OSD_REP_CHAIN && - op->get_map_epoch() < pg->info.history.same_since) { - dout(10) << "handle_rep_op pg changed " << pg->info.history - << " after " << op->get_map_epoch() - << ", dropping" << endl; - delete op; - return; - } - if (g_conf.osd_rep != OSD_REP_CHAIN && - (op->get_map_epoch() < pg->info.history.same_primary_since || - op->get_map_epoch() < pg->info.history.same_acker_since)) { - dout(10) << "handle_rep_op pg primary|acker changed " << pg->info.history - << " after " << op->get_map_epoch() - << ", dropping" << endl; - delete op; - return; - } - - assert(pg->get_role() >= 0); - dout(7) << "handle_rep_op " << op << " in " << *pg << endl; - } - - if (g_conf.osd_maxthreads < 1) { - _lock_pg(pgid); - do_op(op, pg); // do it now - _unlock_pg(pgid); - } else { - // queue for worker threads - if (read) - enqueue_op(0, op); // no locking needed for reads - else - enqueue_op(pgid, op); - } -} - -void OSD::handle_op_reply(MOSDOpReply *op) -{ - if (op->get_map_epoch() < boot_epoch) { - dout(3) << "replica op reply from before boot" << endl; - delete op; - return; - } - - // must be a rep op. - assert(op->get_source().is_osd()); - - // make sure we have the pg - const pg_t pgid = op->get_pg(); - PG *pg = get_pg(pgid); - - // require same or newer map - if (!require_same_or_newer_map(op, op->get_map_epoch())) return; - - // share our map with sender, if they're old - _share_map_incoming(op->get_source_inst(), op->get_map_epoch()); - - if (!pg) { - // hmm. 
- delete op; - } - - if (g_conf.osd_maxthreads < 1) { - _lock_pg(pgid); - do_op(op, pg); // do it now - _unlock_pg(pgid); - } else { - enqueue_op(pgid, op); // queue for worker threads - } -} - - -/* - * enqueue called with osd_lock held - */ -void OSD::enqueue_op(pg_t pgid, Message *op) -{ - while (pending_ops > g_conf.osd_max_opq) { - dout(10) << "enqueue_op waiting for pending_ops " << pending_ops << " to drop to " << g_conf.osd_max_opq << endl; - op_queue_cond.Wait(osd_lock); - } - - op_queue[pgid].push_back(op); - pending_ops++; - logger->set("opq", pending_ops); - - threadpool->put_op(pgid); -} - -/* - * NOTE: dequeue called in worker thread, without osd_lock - */ -void OSD::dequeue_op(pg_t pgid) -{ - Message *op = 0; - PG *pg = 0; - - osd_lock.Lock(); - { - if (pgid) { - // lock pg - pg = _lock_pg(pgid); - } - - // get pending op - list &ls = op_queue[pgid]; - assert(!ls.empty()); - op = ls.front(); - ls.pop_front(); - - if (pgid) { - dout(10) << "dequeue_op " << op << " write pg " << pgid - << ls.size() << " / " << (pending_ops-1) << " more pending" << endl; - } else { - dout(10) << "dequeue_op " << op << " read " - << ls.size() << " / " << (pending_ops-1) << " more pending" << endl; - } - - if (ls.empty()) - op_queue.erase(pgid); - } - osd_lock.Unlock(); - - // do it - do_op(op, pg); - - // finish - osd_lock.Lock(); - { - if (pgid) { - // unlock pg - _unlock_pg(pgid); - } - - dout(10) << "dequeue_op " << op << " finish" << endl; - assert(pending_ops > 0); - - if (pending_ops > g_conf.osd_max_opq) - op_queue_cond.Signal(); - - pending_ops--; - logger->set("opq", pending_ops); - if (pending_ops == 0 && waiting_for_no_ops) - no_pending_ops.Signal(); - } - osd_lock.Unlock(); -} - - - -/** do_op - do an op - * object lock will be held (if multithreaded) - * osd_lock NOT held. 
- */ -void OSD::do_op(Message *m, PG *pg) -{ - //dout(15) << "do_op " << *m << endl; - - if (m->get_type() == MSG_OSD_OP) { - MOSDOp *op = (MOSDOp*)m; - - logger->inc("op"); - - switch (op->get_op()) { - - // reads - case OSD_OP_READ: - op_read(op);//, pg); - break; - case OSD_OP_STAT: - op_stat(op);//, pg); - break; - - // rep stuff - case OSD_OP_PULL: - op_pull(op, pg); - break; - case OSD_OP_PUSH: - op_push(op, pg); - break; - - // writes - case OSD_OP_WRNOOP: - case OSD_OP_WRITE: - case OSD_OP_ZERO: - case OSD_OP_DELETE: - case OSD_OP_TRUNCATE: - case OSD_OP_WRLOCK: - case OSD_OP_WRUNLOCK: - case OSD_OP_RDLOCK: - case OSD_OP_RDUNLOCK: - case OSD_OP_UPLOCK: - case OSD_OP_DNLOCK: - if (op->get_source().is_osd()) - op_rep_modify(op, pg); - else - op_modify(op, pg); - break; - - default: - assert(0); - } - } - else if (m->get_type() == MSG_OSD_OPREPLY) { - // must be replication. - MOSDOpReply *r = (MOSDOpReply*)m; - tid_t rep_tid = r->get_rep_tid(); - - if (pg->repop_gather.count(rep_tid)) { - // oh, good. - int fromosd = r->get_source().num(); - repop_ack(pg, pg->repop_gather[rep_tid], - r->get_result(), r->get_commit(), - fromosd, - r->get_pg_complete_thru()); - delete m; - } else { - // early ack. 
- pg->waiting_for_repop[rep_tid].push_back(r); - } - - } else - assert(0); -} - - - -void OSD::wait_for_no_ops() -{ - if (pending_ops > 0) { - dout(7) << "wait_for_no_ops - waiting for " << pending_ops << endl; - waiting_for_no_ops = true; - while (pending_ops > 0) - no_pending_ops.Wait(osd_lock); - waiting_for_no_ops = false; - assert(pending_ops == 0); - } - dout(7) << "wait_for_no_ops - none" << endl; -} - - -// ============================== -// Object locking - -// -// If the target object of the operation op is locked for writing by another client, the function puts op to the waiting queue waiting_for_wr_unlock -// returns true if object was locked, otherwise returns false -// -bool OSD::block_if_wrlocked(MOSDOp* op) -{ - object_t oid = op->get_oid(); - - entity_name_t source; - int len = store->getattr(oid, "wrlock", &source, sizeof(entity_name_t)); - //cout << "getattr returns " << len << " on " << oid << endl; - - if (len == sizeof(source) && - source != op->get_client()) { - //the object is locked for writing by someone else -- add the op to the waiting queue - waiting_for_wr_unlock[oid].push_back(op); - return true; - } - - return false; //the object wasn't locked, so the operation can be handled right away -} - - - -// =============================== -// OPS - -/* -int OSD::list_missing_revs(object_t oid, set& revs, PG *pg) -{ - int c = 0; - oid.rev = 0; - - map::iterator p = pg->missing.missing.lower_bound(oid); - if (p == pg->missing.missing.end()) - return 0; // clearly not - - while (p->first.ino == oid.ino && - p->first.bno == oid.bno) { - revs.insert(p->first); - c++; - } - return c; -}*/ - -bool OSD::pick_missing_object_rev(object_t& oid, PG *pg) -{ - map::iterator p = pg->missing.missing.upper_bound(oid); - if (p == pg->missing.missing.end()) - return false; // clearly no candidate - - if (p->first.ino == oid.ino && p->first.bno == oid.bno) { - oid = p->first; // yes! it's an upper bound revision for me. 
- return true; - } - return false; -} - -bool OSD::pick_object_rev(object_t& oid) -{ - object_t t = oid; - - if (!store->pick_object_revision_lt(t)) - return false; // we have no revisions of this object! - - objectrev_t crev; - int r = store->getattr(t, "crev", &crev, sizeof(crev)); - assert(r >= 0); - if (crev <= oid.rev) { - dout(10) << "pick_object_rev choosing " << t << " crev " << crev << " for " << oid << endl; - oid = t; - return true; - } - - return false; -} - -bool OSD::waitfor_missing_object(MOSDOp *op, PG *pg) -{ - const object_t oid = op->get_oid(); - - // are we missing the object? - if (pg->missing.missing.count(oid)) { - // we don't have it (yet). - eversion_t v = pg->missing.missing[oid]; - if (pg->objects_pulling.count(oid)) { - dout(7) << "missing " - << oid - << " v " << v - << " in " << *pg - << ", already pulling" - << endl; - } else { - dout(7) << "missing " - << oid - << " v " << v - << " in " << *pg - << ", pulling" - << endl; - pull(pg, oid); - } - pg->waiting_for_missing_object[oid].push_back(op); - return true; - } - - return false; -} - - - - -// READ OPS - -/** op_read - * client read op - * NOTE: called from opqueue. - */ -void OSD::op_read(MOSDOp *op)//, PG *pg) -{ - object_t oid = op->get_oid(); - - // if the target object is locked for writing by another client, put 'op' to the waiting queue - // for _any_ op type -- eg only the locker can unlock! - if (block_if_wrlocked(op)) return; // op will be handled later, after the object unlocks - - dout(10) << "op_read " << oid - << " " << op->get_offset() << "~" << op->get_length() - //<< " in " << *pg - << endl; - - long r = 0; - bufferlist bl; - - if (oid.rev && !pick_object_rev(oid)) { - // we have no revision for this request. 
- r = -EEXIST; - } else { - // read into a buffer - r = store->read(oid, - op->get_offset(), op->get_length(), - bl); - } - - // set up reply - MOSDOpReply *reply = new MOSDOpReply(op, 0, osdmap->get_epoch(), true); - if (r >= 0) { - reply->set_result(0); - reply->set_data(bl); - reply->set_length(r); - - logger->inc("c_rd"); - logger->inc("c_rdb", r); - - } else { - reply->set_result(r); // error - reply->set_length(0); - } - - dout(10) << " read got " << r << " / " << op->get_length() << " bytes from obj " << oid << endl; - - logger->inc("rd"); - if (r >= 0) logger->inc("rdb", r); - - // send it - messenger->send_message(reply, op->get_client_inst()); - - delete op; -} - - -/** op_stat - * client stat - * NOTE: called from opqueue - */ -void OSD::op_stat(MOSDOp *op)//, PG *pg) -{ - object_t oid = op->get_oid(); - - // if the target object is locked for writing by another client, put 'op' to the waiting queue - if (block_if_wrlocked(op)) return; //read will be handled later, after the object unlocks - - struct stat st; - memset(&st, sizeof(st), 0); - int r = 0; - - if (oid.rev && !pick_object_rev(oid)) { - // we have no revision for this request. 
- r = -EEXIST; - } else { - r = store->stat(oid, &st); - } - - dout(3) << "op_stat on " << oid - << " r = " << r - << " size = " << st.st_size - //<< " in " << *pg - << endl; - - MOSDOpReply *reply = new MOSDOpReply(op, r, osdmap->get_epoch(), true); - reply->set_object_size(st.st_size); - messenger->send_message(reply, op->get_client_inst()); - - logger->inc("stat"); - - delete op; -} - - - -/********* - * new repops - */ - -void OSD::get_repop_gather(PG::RepOpGather *repop) -{ - //repop->lock.Lock(); - dout(10) << "get_repop " << *repop << endl; -} - -void OSD::apply_repop(PG *pg, PG::RepOpGather *repop) -{ - dout(10) << "apply_repop applying update on " << *repop << endl; - assert(!repop->applied); - - Context *oncommit = new C_OSD_WriteCommit(this, pg->info.pgid, repop->rep_tid, repop->pg_local_last_complete); - unsigned r = store->apply_transaction(repop->t, oncommit); - if (r) - dout(-10) << "apply_repop apply transaction return " << r << " on " << *repop << endl; - - // discard my reference to buffer - repop->op->get_data().clear(); - - repop->applied = true; -} - -void OSD::put_repop_gather(PG *pg, PG::RepOpGather *repop) -{ - dout(10) << "put_repop " << *repop << endl; - - // commit? - if (repop->can_send_commit() && - repop->op->wants_commit()) { - // send commit. - MOSDOpReply *reply = new MOSDOpReply(repop->op, 0, osdmap->get_epoch(), true); - dout(10) << "put_repop sending commit on " << *repop << " " << reply << endl; - messenger->send_message(reply, repop->op->get_client_inst()); - repop->sent_commit = true; - } - - // ack? 
- else if (repop->can_send_ack() && - repop->op->wants_ack()) { - // apply - apply_repop(pg, repop); - - // send ack - MOSDOpReply *reply = new MOSDOpReply(repop->op, 0, osdmap->get_epoch(), false); - dout(10) << "put_repop sending ack on " << *repop << " " << reply << endl; - messenger->send_message(reply, repop->op->get_client_inst()); - repop->sent_ack = true; - - utime_t now = g_clock.now(); - now -= repop->start; - logger->finc("rlsum", now); - logger->inc("rlnum", 1); - } - - // done. - if (repop->can_delete()) { - // adjust peers_complete_thru - if (!repop->pg_complete_thru.empty()) { - eversion_t min = pg->info.last_complete; // hrm.... - for (unsigned i=0; iacting.size(); i++) { - if (repop->pg_complete_thru[pg->acting[i]] < min) // note: if we haven't heard, it'll be zero, which is what we want. - min = repop->pg_complete_thru[pg->acting[i]]; - } - - if (min > pg->peers_complete_thru) { - dout(10) << "put_repop peers_complete_thru " << pg->peers_complete_thru << " -> " << min << " in " << *pg << endl; - pg->peers_complete_thru = min; - } - } - - dout(10) << "put_repop deleting " << *repop << endl; - //repop->lock.Unlock(); - - assert(pg->repop_gather.count(repop->rep_tid)); - pg->repop_gather.erase(repop->rep_tid); - - delete repop->op; - delete repop; - - } else { - //repop->lock.Unlock(); - } -} - - -void OSD::issue_repop(PG *pg, MOSDOp *op, int osd) -{ - object_t oid = op->get_oid(); - - dout(7) << " issue_repop rep_tid " << op->get_rep_tid() - << " in " << *pg - << " o " << oid - << " to osd" << osd - << endl; - - // forward the write/update/whatever - MOSDOp *wr = new MOSDOp(op->get_client_inst(), op->get_client_inc(), op->get_reqid().tid, - oid, - pg->get_pgid(), - osdmap->get_epoch(), - op->get_op()); - wr->get_data() = op->get_data(); // _copy_ bufferlist - wr->set_length(op->get_length()); - wr->set_offset(op->get_offset()); - wr->set_version(op->get_version()); - - wr->set_rep_tid(op->get_rep_tid()); - 
wr->set_pg_trim_to(pg->peers_complete_thru); - - messenger->send_message(wr, osdmap->get_inst(osd)); -} - -PG::RepOpGather *OSD::new_repop_gather(PG *pg, - MOSDOp *op) -{ - dout(10) << "new_repop_gather rep_tid " << op->get_rep_tid() << " on " << *op << " in " << *pg << endl; - - PG::RepOpGather *repop = new PG::RepOpGather(op, op->get_rep_tid(), - op->get_version(), - pg->info.last_complete); - - // osds. commits all come to me. - for (unsigned i=0; iacting.size(); i++) { - int osd = pg->acting[i]; - repop->osds.insert(osd); - repop->waitfor_commit.insert(osd); - } - - // acks vary: - if (g_conf.osd_rep == OSD_REP_CHAIN) { - // chain rep. - // there's my local ack... - repop->osds.insert(whoami); - repop->waitfor_ack.insert(whoami); - repop->waitfor_commit.insert(whoami); - - // also, the previous guy will ack to me - int myrank = osdmap->calc_pg_rank(whoami, pg->acting); - if (myrank > 0) { - int osd = pg->acting[ myrank-1 ]; - repop->osds.insert(osd); - repop->waitfor_ack.insert(osd); - repop->waitfor_commit.insert(osd); - } - } else { - // primary, splay. all osds ack to me. - for (unsigned i=0; iacting.size(); i++) { - int osd = pg->acting[i]; - repop->waitfor_ack.insert(osd); - } - } - - repop->start = g_clock.now(); - - pg->repop_gather[ repop->rep_tid ] = repop; - - // anyone waiting? 
(acks that got here before the op did) - if (pg->waiting_for_repop.count(repop->rep_tid)) { - take_waiters(pg->waiting_for_repop[repop->rep_tid]); - pg->waiting_for_repop.erase(repop->rep_tid); - } - - return repop; -} - - -void OSD::repop_ack(PG *pg, PG::RepOpGather *repop, - int result, bool commit, - int fromosd, eversion_t pg_complete_thru) -{ - MOSDOp *op = repop->op; - - dout(7) << "repop_ack rep_tid " << repop->rep_tid << " op " << *op - << " result " << result << " commit " << commit << " from osd" << fromosd - << " in " << *pg - << endl; - - get_repop_gather(repop); - { - if (commit) { - // commit - assert(repop->waitfor_commit.count(fromosd)); - repop->waitfor_commit.erase(fromosd); - repop->waitfor_ack.erase(fromosd); - repop->pg_complete_thru[fromosd] = pg_complete_thru; - } else { - // ack - repop->waitfor_ack.erase(fromosd); - } - } - put_repop_gather(pg, repop); -} - - - - - -/** op_modify_commit - * transaction commit on the acker. - */ -void OSD::op_modify_commit(pg_t pgid, tid_t rep_tid, eversion_t pg_complete_thru) -{ - PG *pg = lock_pg(pgid); - if (pg) { - if (pg->repop_gather.count(rep_tid)) { - PG::RepOpGather *repop = pg->repop_gather[rep_tid]; - - dout(10) << "op_modify_commit " << *repop->op << endl; - get_repop_gather(repop); - { - assert(repop->waitfor_commit.count(whoami)); - repop->waitfor_commit.erase(whoami); - repop->pg_complete_thru[whoami] = pg_complete_thru; - } - put_repop_gather(pg, repop); - dout(10) << "op_modify_commit done on " << repop << endl; - } else { - dout(10) << "op_modify_commit pg " << pgid << " rep_tid " << rep_tid << " dne" << endl; - } - - unlock_pg(pgid); - } else { - dout(10) << "op_modify_commit pg " << pgid << " dne" << endl; - } -} - - -/** op_modify - * process client modify op - * NOTE: called from opqueue. - */ -void OSD::op_modify(MOSDOp *op, PG *pg) -{ - object_t oid = op->get_oid(); - - const char *opname = MOSDOp::get_opname(op->get_op()); - - // are any peers missing this? 
- for (unsigned i=1; iacting.size(); i++) { - int peer = pg->acting[i]; - if (pg->peer_missing.count(peer) && - pg->peer_missing[peer].is_missing(oid)) { - // push it before this update. - // FIXME, this is probably extra much work (eg if we're about to overwrite) - pg->peer_missing[peer].got(oid); - push(pg, oid, peer); - } - } - - // dup op? - if (pg->log.logged_req(op->get_reqid())) { - dout(-3) << "op_modify " << opname << " dup op " << op->get_reqid() - << ", doing WRNOOP" << endl; - op->set_op(OSD_OP_WRNOOP); - opname = MOSDOp::get_opname(op->get_op()); - } - - // locked by someone else? - // for _any_ op type -- eg only the locker can unlock! - if (op->get_op() != OSD_OP_WRNOOP && // except WRNOOP; we just want to flush - block_if_wrlocked(op)) - return; // op will be handled later, after the object unlocks - - - // check crev - objectrev_t crev = 0; - store->getattr(oid, "crev", (char*)&crev, sizeof(crev)); - - // assign version - eversion_t clone_version; - eversion_t nv = pg->log.top; - if (op->get_op() != OSD_OP_WRNOOP) { - nv.epoch = osdmap->get_epoch(); - nv.version++; - assert(nv > pg->info.last_update); - assert(nv > pg->log.top); - - // will clone? - if (crev && op->get_rev() && op->get_rev() > crev) { - clone_version = nv; - nv.version++; - } - - if (op->get_version().version) { - // replay! - if (nv.version < op->get_version().version) { - nv.version = op->get_version().version; - - // clone? - if (crev && op->get_rev() && op->get_rev() > crev) { - // backstep clone - clone_version = nv; - clone_version.version--; - } - } - } - } - - // set version in op, for benefit of client and our eventual reply - op->set_version(nv); - - dout(10) << "op_modify " << opname - << " " << oid - << " v " << nv - << " crev " << crev - << " rev " << op->get_rev() - << " " << op->get_offset() << "~" << op->get_length() - << endl; - - if (op->get_op() == OSD_OP_WRITE) { - logger->inc("c_wr"); - logger->inc("c_wrb", op->get_length()); - } - - // share latest osd map? 
- osd_lock.Lock(); - { - for (unsigned i=1; iacting.size(); i++) { - int osd = pg->acting[i]; - _share_map_outgoing( osdmap->get_inst(osd) ); - } - } - osd_lock.Unlock(); - - // issue replica writes - PG::RepOpGather *repop = 0; - bool alone = (pg->acting.size() == 1); - tid_t rep_tid = ++last_tid; - op->set_rep_tid(rep_tid); - - if (g_conf.osd_rep == OSD_REP_CHAIN && !alone) { - // chain rep. send to #2 only. - int next = pg->acting[1]; - if (pg->acting.size() > 2) - next = pg->acting[2]; - issue_repop(pg, op, next); - } - else if (g_conf.osd_rep == OSD_REP_SPLAY && !alone) { - // splay rep. send to rest. - for (unsigned i=1; iacting.size(); ++i) - //for (unsigned i=pg->acting.size()-1; i>=1; --i) - issue_repop(pg, op, pg->acting[i]); - } else { - // primary rep, or alone. - repop = new_repop_gather(pg, op); - - // send to rest. - if (!alone) - for (unsigned i=1; iacting.size(); i++) - issue_repop(pg, op, pg->acting[i]); - } - - if (repop) { - // we are acker. - if (op->get_op() != OSD_OP_WRNOOP) { - // log and update later. - prepare_log_transaction(repop->t, op, nv, crev, op->get_rev(), pg, pg->peers_complete_thru); - prepare_op_transaction(repop->t, op, nv, crev, op->get_rev(), pg); - } - - // (logical) local ack. - // (if alone, this will apply the update.) - get_repop_gather(repop); - { - assert(repop->waitfor_ack.count(whoami)); - repop->waitfor_ack.erase(whoami); - } - put_repop_gather(pg, repop); - - } else { - // chain or splay. apply. 
- ObjectStore::Transaction t; - prepare_log_transaction(t, op, nv, crev, op->get_rev(), pg, pg->peers_complete_thru); - prepare_op_transaction(t, op, nv, crev, op->get_rev(), pg); - - C_OSD_RepModifyCommit *oncommit = new C_OSD_RepModifyCommit(this, op, pg->get_acker(), - pg->info.last_complete); - unsigned r = store->apply_transaction(t, oncommit); - if (r != 0 && // no errors - r != 2) { // or error on collection_add - cerr << "error applying transaction: r = " << r << endl; - assert(r == 0); - } - - oncommit->ack(); - } -} - - - -void OSD::prepare_log_transaction(ObjectStore::Transaction& t, - MOSDOp *op, eversion_t& version, - objectrev_t crev, objectrev_t rev, - PG *pg, - eversion_t trim_to) -{ - const object_t oid = op->get_oid(); - - // clone entry? - if (crev && rev && rev > crev) { - eversion_t cv = version; - cv.version--; - PG::Log::Entry cloneentry(PG::Log::Entry::CLONE, oid, cv, op->get_reqid()); - pg->log.add(cloneentry); - - dout(10) << "prepare_log_transaction " << op->get_op() - << " " << cloneentry - << " in " << *pg << endl; - } - - // actual op - int opcode = PG::Log::Entry::MODIFY; - if (op->get_op() == OSD_OP_DELETE) opcode = PG::Log::Entry::DELETE; - PG::Log::Entry logentry(opcode, oid, version, op->get_reqid()); - - dout(10) << "prepare_log_transaction " << op->get_op() - << " " << logentry - << " in " << *pg << endl; - - // append to log - assert(version > pg->log.top); - pg->log.add(logentry); - assert(pg->log.top == version); - dout(10) << "prepare_log_transaction appended to " << *pg << endl; - - // write to pg log on disk - pg->append_log(t, logentry, trim_to); -} - - -/** prepare_op_transaction - * apply an op to the store wrapped in a transaction. 
- */ -void OSD::prepare_op_transaction(ObjectStore::Transaction& t, - MOSDOp *op, eversion_t& version, - objectrev_t crev, objectrev_t rev, - PG *pg) -{ - const object_t oid = op->get_oid(); - const pg_t pgid = op->get_pg(); - - bool did_clone = false; - - dout(10) << "prepare_op_transaction " << MOSDOp::get_opname( op->get_op() ) - << " " << oid - << " v " << version - << " crev " << crev - << " rev " << rev - << " in " << *pg << endl; - - // WRNOOP does nothing. - if (op->get_op() == OSD_OP_WRNOOP) - return; - - // raise last_complete? - if (pg->info.last_complete == pg->info.last_update) - pg->info.last_complete = version; - - // raise last_update. - assert(version > pg->info.last_update); - pg->info.last_update = version; - - // write pg info - t.collection_setattr(pgid, "info", &pg->info, sizeof(pg->info)); - - // clone? - if (crev && rev && rev > crev) { - object_t noid = oid; - noid.rev = rev; - dout(10) << "prepare_op_transaction cloning " << oid << " crev " << crev << " to " << noid << endl; - t.clone(oid, noid); - did_clone = true; - } - - // apply the op - switch (op->get_op()) { - case OSD_OP_WRLOCK: - { // lock object - //r = store->setattr(oid, "wrlock", &op->get_asker(), sizeof(msg_addr_t), oncommit); - t.setattr(oid, "wrlock", &op->get_client(), sizeof(entity_name_t)); - } - break; - - case OSD_OP_WRUNLOCK: - { // unlock objects - //r = store->rmattr(oid, "wrlock", oncommit); - t.rmattr(oid, "wrlock"); - - // unblock all operations that were waiting for this object to become unlocked - if (waiting_for_wr_unlock.count(oid)) { - take_waiters(waiting_for_wr_unlock[oid]); - waiting_for_wr_unlock.erase(oid); - } - } - break; - - case OSD_OP_WRITE: - { // write - assert(op->get_data().length() == op->get_length()); - bufferlist bl; - bl.claim( op->get_data() ); // give buffers to store; we keep *op in memory for a long time! 
- - //if (oid < 100000000000000ULL) // hack hack-- don't write client data - t.write( oid, op->get_offset(), op->get_length(), bl ); - } - break; - - case OSD_OP_ZERO: - { - assert(0); // are you sure this is what you want? - // zero, remove, or truncate? - struct stat st; - int r = store->stat(oid, &st); - if (r >= 0) { - if (op->get_offset() + (off_t)op->get_length() >= (off_t)st.st_size) { - if (op->get_offset()) - t.truncate(oid, op->get_length() + op->get_offset()); - else - t.remove(oid); - } else { - // zero. the dumb way. FIXME. - bufferptr bp(op->get_length()); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - t.write(oid, op->get_offset(), op->get_length(), bl); - } - } else { - // noop? - dout(10) << "apply_transaction zero on " << oid << ", but dne? stat returns " << r << endl; - } - } - break; - - case OSD_OP_TRUNCATE: - { // truncate - //r = store->truncate(oid, op->get_offset()); - t.truncate(oid, op->get_length() ); - } - break; - - case OSD_OP_DELETE: - { // delete - //r = store->remove(oid); - t.remove(oid); - } - break; - - default: - assert(0); - } - - // object collection, version - if (op->get_op() == OSD_OP_DELETE) { - // remove object from c - t.collection_remove(pgid, oid); - } else { - // add object to c - t.collection_add(pgid, oid); - - // object version - t.setattr(oid, "version", &version, sizeof(version)); - - // set object crev - if (crev == 0 || // new object - did_clone) // we cloned - t.setattr(oid, "crev", &rev, sizeof(rev)); - } -} diff --git a/branches/marnberg/quota/osd/OSD.h b/branches/marnberg/quota/osd/OSD.h deleted file mode 100644 index 5c5205a8c1aac..0000000000000 --- a/branches/marnberg/quota/osd/OSD.h +++ /dev/null @@ -1,273 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - 
* License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __OSD_H -#define __OSD_H - -#include "msg/Dispatcher.h" - -#include "common/Mutex.h" -#include "common/ThreadPool.h" -#include "common/Timer.h" - -#include "mon/MonMap.h" - -#include "ObjectStore.h" -#include "PG.h" - -#include -using namespace std; -#include -#include -using namespace __gnu_cxx; - -#include "messages/MOSDOp.h" - -class Messenger; -class Message; - - - - -class OSD : public Dispatcher { -public: - - /** superblock - */ - OSDSuperblock superblock; - epoch_t boot_epoch; - - object_t get_osdmap_object_name(epoch_t epoch) { return object_t(0,epoch << 1); } - object_t get_inc_osdmap_object_name(epoch_t epoch) { return object_t(0, (epoch << 1) + 1); } - - void write_superblock(); - void write_superblock(ObjectStore::Transaction& t); - int read_superblock(); - - - /** OSD **/ - protected: - Messenger *messenger; - int whoami; - - static const int STATE_BOOTING = 1; - static const int STATE_ACTIVE = 2; - static const int STATE_STOPPING = 3; - - int state; - - bool is_booting() { return state == STATE_BOOTING; } - bool is_active() { return state == STATE_ACTIVE; } - bool is_stopping() { return state == STATE_STOPPING; } - - - MonMap *monmap; - - class Logger *logger; - - // local store - char dev_path[100]; - class ObjectStore *store; - - // heartbeat - void heartbeat(); - - class C_Heartbeat : public Context { - OSD *osd; - public: - C_Heartbeat(OSD *o) : osd(o) {} - void finish(int r) { - osd->heartbeat(); - } - }; - - // global lock - Mutex osd_lock; - SafeTimer timer; - - // -- stats -- - int hb_stat_ops; // ops since last heartbeat - int hb_stat_qlen; // cumulative queue length since last hb - - hash_map peer_qlen; - - // per-pg locking (serializing) - hash_set pg_lock; - hash_map > pg_lock_waiters; - PG *lock_pg(pg_t pgid); - PG *_lock_pg(pg_t pgid); - void unlock_pg(pg_t pgid); - void _unlock_pg(pg_t pgid); - - // finished waiting messages, 
that will go at tail of dispatch() - list finished; - void take_waiters(list& ls) { - finished.splice(finished.end(), ls); - } - - // object locking - hash_map > waiting_for_wr_unlock; /** list of operations for each object waiting for 'wrunlock' */ - - bool block_if_wrlocked(MOSDOp* op); - - // -- ops -- - class ThreadPool *threadpool; - hash_map > op_queue; - int pending_ops; - bool waiting_for_no_ops; - Cond no_pending_ops; - Cond op_queue_cond; - - void wait_for_no_ops(); - - void enqueue_op(pg_t pgid, Message *op); - void dequeue_op(pg_t pgid); - static void static_dequeueop(OSD *o, pg_t pgid) { - o->dequeue_op(pgid); - }; - - void do_op(Message *m, PG *pg); // actually do it - - void prepare_log_transaction(ObjectStore::Transaction& t, MOSDOp* op, eversion_t& version, - objectrev_t crev, objectrev_t rev, PG *pg, eversion_t trim_to); - void prepare_op_transaction(ObjectStore::Transaction& t, MOSDOp* op, eversion_t& version, - objectrev_t crev, objectrev_t rev, PG *pg); - - bool waitfor_missing_object(MOSDOp *op, PG *pg); - bool pick_missing_object_rev(object_t& oid, PG *pg); - bool pick_object_rev(object_t& oid); - - - - friend class PG; - - protected: - - // -- osd map -- - class OSDMap *osdmap; - list waiting_for_osdmap; - - hash_map peer_map_epoch; // FIXME types - bool _share_map_incoming(const entity_inst_t& inst, epoch_t epoch); - void _share_map_outgoing(const entity_inst_t& inst); - - void wait_for_new_map(Message *m); - void handle_osd_map(class MOSDMap *m); - - void advance_map(ObjectStore::Transaction& t); - void activate_map(ObjectStore::Transaction& t); - - void get_map(epoch_t e, OSDMap &m); - bool get_map_bl(epoch_t e, bufferlist& bl); - bool get_inc_map_bl(epoch_t e, bufferlist& bl); - bool get_inc_map(epoch_t e, OSDMap::Incremental &inc); - - void send_incremental_map(epoch_t since, const entity_inst_t& inst, bool full); - - - - // -- replication -- - - // PG - hash_map pg_map; - void load_pgs(); - bool pg_exists(pg_t pg); - PG *create_pg(pg_t 
pg, ObjectStore::Transaction& t); // create new PG - PG *get_pg(pg_t pg); // return existing PG, or null - void _remove_pg(pg_t pg); // remove from store and memory - - void project_pg_history(pg_t pgid, PG::Info::History& h, epoch_t from); - - void activate_pg(pg_t pgid, epoch_t epoch); - - class C_Activate : public Context { - OSD *osd; - pg_t pgid; - epoch_t epoch; - public: - C_Activate(OSD *o, pg_t p, epoch_t e) : osd(o), pgid(p), epoch(e) {} - void finish(int r) { - osd->activate_pg(pgid, epoch); - } - }; - - - tid_t last_tid; - int num_pulling; - - hash_map > waiting_for_pg; - - // replica ops - void get_repop_gather(PG::RepOpGather*); - void apply_repop(PG *pg, PG::RepOpGather *repop); - void put_repop_gather(PG *pg, PG::RepOpGather*); - void issue_repop(PG *pg, MOSDOp *op, int osd); - PG::RepOpGather *new_repop_gather(PG *pg, MOSDOp *op); - void repop_ack(PG *pg, PG::RepOpGather *repop, - int result, bool commit, - int fromosd, eversion_t pg_complete_thru=0); - - void handle_rep_op_ack(MOSDOpReply *m); - - // recovery - void do_notifies(map< int, list >& notify_list); - void do_queries(map< int, map >& query_map); - void repeer(PG *pg, map< int, map >& query_map); - - void pull(PG *pg, object_t oid); - void push(PG *pg, object_t oid, int dest); - - bool require_current_map(Message *m, epoch_t v); - bool require_same_or_newer_map(Message *m, epoch_t e); - - void handle_pg_query(class MOSDPGQuery *m); - void handle_pg_notify(class MOSDPGNotify *m); - void handle_pg_log(class MOSDPGLog *m); - void handle_pg_remove(class MOSDPGRemove *m); - - void op_pull(class MOSDOp *op, PG *pg); - void op_push(class MOSDOp *op, PG *pg); - - void op_rep_modify(class MOSDOp *op, PG *pg); // write, trucnate, delete - void op_rep_modify_commit(class MOSDOp *op, int ackerosd, - eversion_t last_complete); - friend class C_OSD_RepModifyCommit; - - - public: - OSD(int id, Messenger *m, MonMap *mm, char *dev = 0); - ~OSD(); - - // startup/shutdown - int init(); - int shutdown(); - - 
// messages - virtual void dispatch(Message *m); - virtual void ms_handle_failure(Message *m, const entity_inst_t& inst); - - void handle_osd_ping(class MOSDPing *m); - void handle_op(class MOSDOp *m); - - void op_read(class MOSDOp *m);//, PG *pg); - void op_stat(class MOSDOp *m);//, PG *pg); - void op_modify(class MOSDOp *m, PG *pg); - void op_modify_commit(pg_t pgid, tid_t rep_tid, eversion_t pg_complete_thru); - - // for replication - void handle_op_reply(class MOSDOpReply *m); - - void force_remount(); -}; - -#endif diff --git a/branches/marnberg/quota/osd/OSDMap.h b/branches/marnberg/quota/osd/OSDMap.h deleted file mode 100644 index 163c14e65ed24..0000000000000 --- a/branches/marnberg/quota/osd/OSDMap.h +++ /dev/null @@ -1,519 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __OSDMAP_H -#define __OSDMAP_H - -/* - * describe properties of the OSD cluster. 
- * disks, disk groups, total # osds, - * - */ -#include "config.h" -#include "include/types.h" -#include "osd_types.h" -#include "msg/Message.h" -#include "common/Mutex.h" -#include "common/Clock.h" - -#include "crush/crush.h" -using namespace crush; - -#include -#include -#include -#include -using namespace std; - - -/* - * some system constants - */ - -// from LSB to MSB, -#define PG_PS_BITS 16 // max bits for placement seed/group portion of PG -#define PG_REP_BITS 6 // up to 64 replicas -#define PG_TYPE_BITS 2 -#define PG_PS_MASK ((1LL< new_up; - map new_down; - list new_in; - list new_out; - map new_overload; // updated overload value - list old_overload; // no longer overload - - void encode(bufferlist& bl) { - bl.append((char*)&epoch, sizeof(epoch)); - bl.append((char*)&mon_epoch, sizeof(mon_epoch)); - bl.append((char*)&ctime, sizeof(ctime)); - ::_encode(new_up, bl); - ::_encode(new_down, bl); - ::_encode(new_in, bl); - ::_encode(new_out, bl); - ::_encode(new_overload, bl); - } - void decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - bl.copy(off, sizeof(mon_epoch), (char*)&mon_epoch); - off += sizeof(mon_epoch); - bl.copy(off, sizeof(ctime), (char*)&ctime); - off += sizeof(ctime); - ::_decode(new_up, bl, off); - ::_decode(new_down, bl, off); - ::_decode(new_in, bl, off); - ::_decode(new_out, bl, off); - ::_decode(new_overload, bl, off); - } - - Incremental(epoch_t e=0) : epoch(e), mon_epoch(0) {} - }; - -private: - epoch_t epoch; // what epoch of the osd cluster descriptor is this - epoch_t mon_epoch; // monitor epoch (election iteration) - utime_t ctime; // epoch start time - int pg_bits; // placement group bits - int localized_pg_bits; // bits for localized pgs - - set osds; // all osds - set down_osds; // list of down disks - set out_osds; // list of unmapped disks - map overload_osds; - map osd_inst; - - public: - Crush crush; // hierarchical map - - friend class OSDMonitor; - friend class MDS; - - 
public: - OSDMap() : epoch(0), mon_epoch(0), pg_bits(5), localized_pg_bits(3) {} - - // map info - epoch_t get_epoch() const { return epoch; } - void inc_epoch() { epoch++; } - - int get_pg_bits() const { return pg_bits; } - void set_pg_bits(int b) { pg_bits = b; } - int get_localized_pg_bits() const { return localized_pg_bits; } - - const utime_t& get_ctime() const { return ctime; } - - bool is_mkfs() const { return epoch == 1; } - //void set_mkfs() { assert(epoch == 1); } - - /***** cluster state *****/ - int num_osds() { return osds.size(); } - void get_all_osds(set& ls) { ls = osds; } - - const set& get_osds() { return osds; } - const set& get_down_osds() { return down_osds; } - const set& get_out_osds() { return out_osds; } - const map& get_overload_osds() { return overload_osds; } - - bool is_down(int osd) { return down_osds.count(osd); } - bool is_up(int osd) { return !is_down(osd); } - bool is_out(int osd) { return out_osds.count(osd); } - bool is_in(int osd) { return !is_out(osd); } - - const entity_inst_t& get_inst(int osd) { - assert(osd_inst.count(osd)); - return osd_inst[osd]; - } - bool get_inst(int osd, entity_inst_t& inst) { - if (osd_inst.count(osd)) { - inst = osd_inst[osd]; - return true; - } - return false; - } - - void mark_down(int o) { down_osds.insert(o); } - void mark_up(int o) { down_osds.erase(o); } - void mark_out(int o) { out_osds.insert(o); } - void mark_in(int o) { out_osds.erase(o); } - - - void apply_incremental(Incremental &inc) { - assert(inc.epoch == epoch+1); - epoch++; - mon_epoch = inc.mon_epoch; - ctime = inc.ctime; - - for (map::iterator i = inc.new_up.begin(); - i != inc.new_up.end(); - i++) { - assert(down_osds.count(i->first)); - down_osds.erase(i->first); - assert(osd_inst.count(i->first) == 0); - osd_inst[i->first] = i->second; - //cout << "epoch " << epoch << " up osd" << i->first << endl; - } - for (map::iterator i = inc.new_down.begin(); - i != inc.new_down.end(); - i++) { - assert(down_osds.count(i->first) == 0); - 
down_osds.insert(i->first); - assert(osd_inst.count(i->first) == 0 || - osd_inst[i->first] == i->second); - osd_inst.erase(i->first); - //cout << "epoch " << epoch << " down osd" << i->first << endl; - } - for (list::iterator i = inc.new_in.begin(); - i != inc.new_in.end(); - i++) { - assert(out_osds.count(*i)); - out_osds.erase(*i); - //cout << "epoch " << epoch << " in osd" << *i << endl; - } - for (list::iterator i = inc.new_out.begin(); - i != inc.new_out.end(); - i++) { - assert(out_osds.count(*i) == 0); - out_osds.insert(*i); - //cout << "epoch " << epoch << " out osd" << *i << endl; - } - for (map::iterator i = inc.new_overload.begin(); - i != inc.new_overload.end(); - i++) { - overload_osds[i->first] = i->second; - } - for (list::iterator i = inc.old_overload.begin(); - i != inc.old_overload.end(); - i++) { - assert(overload_osds.count(*i)); - overload_osds.erase(*i); - } - } - - // serialize, unserialize - void encode(bufferlist& blist) { - blist.append((char*)&epoch, sizeof(epoch)); - blist.append((char*)&mon_epoch, sizeof(mon_epoch)); - blist.append((char*)&ctime, sizeof(ctime)); - blist.append((char*)&pg_bits, sizeof(pg_bits)); - - _encode(osds, blist); - _encode(down_osds, blist); - _encode(out_osds, blist); - _encode(overload_osds, blist); - _encode(osd_inst, blist); - - crush._encode(blist); - } - - void decode(bufferlist& blist) { - int off = 0; - blist.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - blist.copy(off, sizeof(mon_epoch), (char*)&mon_epoch); - off += sizeof(mon_epoch); - blist.copy(off, sizeof(ctime), (char*)&ctime); - off += sizeof(ctime); - blist.copy(off, sizeof(pg_bits), (char*)&pg_bits); - off += sizeof(pg_bits); - - _decode(osds, blist, off); - _decode(down_osds, blist, off); - _decode(out_osds, blist, off); - _decode(overload_osds, blist, off); - _decode(osd_inst, blist, off); - - crush._decode(blist, off); - } - - - - - /**** mapping facilities ****/ - - // oid -> pg - pg_t object_to_pg(object_t oid, 
FileLayout& layout) { - static crush::Hash H(777); - - int policy = layout.object_layout; - if (policy == 0) - policy = g_conf.osd_object_layout; - - int type = PG_TYPE_RAND; - ps_t ps; - - switch (policy) { - case OBJECT_LAYOUT_LINEAR: - { - //const object_t ono = oid.bno; - //const inodeno_t ino = oid >> OID_ONO_BITS; - ps = (oid.bno + oid.ino) & PG_PS_MASK; - ps &= ((1ULL<> OID_ONO_BITS; - ps = (oid.bno + H(oid.ino)) & PG_PS_MASK; - ps &= ((1ULL<> 32) ) & PG_PS_MASK; - ps &= ((1ULL< pg - pg_t ps_nrep_to_pg(ps_t ps, int nrep) { - /*return ((pg_t)ps & ((1ULL< nrep - int pg_to_nrep(pg_t pg) { - return pg.u.fields.nrep; - //return (pg >> PG_PS_BITS) & ((1ULL << PG_REP_BITS)-1); - } - - // pg -> ps - int pg_to_ps(pg_t pg) { - //return pg & PG_PS_MASK; - return pg.u.fields.ps; - } - - // pg -> (osd list) - int pg_to_osds(pg_t pg, - vector& osds) { // list of osd addr's - pg_t ps = pg_to_ps(pg); - int num_rep = pg_to_nrep(pg); - assert(num_rep > 0); - - // map to osds[] - switch (g_conf.osd_pg_layout) { - case PG_LAYOUT_CRUSH: - { - int forcefeed = -1; - if (pg.u.fields.preferred > 0 && - out_osds.count(pg.u.fields.preferred-1) == 0) - forcefeed = pg.u.fields.preferred-1; - crush.do_rule(crush.rules[num_rep], // FIXME rule thing. - ps, - osds, - out_osds, overload_osds, - forcefeed); - } - break; - - case PG_LAYOUT_LINEAR: - for (int i=0; i 0 && - g_conf.osd_pg_layout != PG_LAYOUT_CRUSH) { - int osd = pg.u.fields.preferred-1; - - // already in there? - if (osds.empty()) { - osds.push_back(osd); - } else { - assert(num_rep > 0); - for (int i=1; i (up osd list) - int pg_to_acting_osds(pg_t pg, - vector& osds) { // list of osd addr's - // get rush list - vector raw; - pg_to_osds(pg, raw); - - osds.clear(); - for (unsigned i=0; i primary osd - int get_pg_primary(pg_t pg) { - vector group; - int nrep = pg_to_osds(pg, group); - if (nrep) - return group[0]; - return -1; // we fail! 
- } - - // pg -> acting primary osd - int get_pg_acting_primary(pg_t pg) { - vector group; - int nrep = pg_to_acting_osds(pg, group); - if (nrep > 0) - return group[0]; - return -1; // we fail! - } - int get_pg_acting_tail(pg_t pg) { - vector group; - int nrep = pg_to_acting_osds(pg, group); - if (nrep > 0) - return group[group.size()-1]; - return -1; // we fail! - } - - - /* what replica # is a given osd? 0 primary, -1 for none. */ - int calc_pg_rank(int osd, vector& acting, int nrep=0) { - if (!nrep) nrep = acting.size(); - for (int i=0; i& acting, int nrep=0) { - if (!nrep) nrep = acting.size(); - int rank = calc_pg_rank(osd, acting, nrep); - - if (rank < 0) return PG_ROLE_STRAY; - else if (rank == 0) return PG_ROLE_HEAD; - else if (rank == 1) return PG_ROLE_ACKER; - else return PG_ROLE_MIDDLE; - } - - int get_pg_role(pg_t pg, int osd) { - vector group; - int nrep = pg_to_osds(pg, group); - return calc_pg_role(osd, group, nrep); - } - - /* rank is -1 (stray), 0 (primary), 1,2,3,... (replica) */ - int get_pg_acting_rank(pg_t pg, int osd) { - vector group; - int nrep = pg_to_acting_osds(pg, group); - return calc_pg_rank(osd, group, nrep); - } - /* role is -1 (stray), 0 (primary), 1 (replica) */ - int get_pg_acting_role(pg_t pg, int osd) { - vector group; - int nrep = pg_to_acting_osds(pg, group); - return calc_pg_role(osd, group, nrep); - } - - - - -}; - - -#endif diff --git a/branches/marnberg/quota/osd/ObjectStore.cc b/branches/marnberg/quota/osd/ObjectStore.cc deleted file mode 100644 index 82af869e93775..0000000000000 --- a/branches/marnberg/quota/osd/ObjectStore.cc +++ /dev/null @@ -1,149 +0,0 @@ - -#include "ObjectStore.h" - -#include "config.h" -#include "common/Clock.h" - - -object_t ObjectStore::age_get_oid() { - if (!age_free_oids.empty()) { - object_t o = age_free_oids.front(); - age_free_oids.pop_front(); - return o; - } - return age_cur_oid++; - } - - ssize_t ObjectStore::age_pick_size() { - ssize_t max = file_size_distn.sample() * 1024; - return 
max/2 + (rand() % 100) * max/200 + 1; - } - - void ObjectStore::age_fill(float pc, utime_t until) { - bufferptr bp(1024*1024); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - while (1) { - if (g_clock.now() > until) break; - - struct statfs st; - statfs(&st); - float a = (float)(st.f_blocks-st.f_bavail) / (float)st.f_blocks; - if (a >= pc) { - dout(10) << "age_fill at " << a << " / " << pc << " stopping" << endl; - break; - } - - object_t oid = age_get_oid(); - - int b = rand() % 10; - age_objects[b].push_back(oid); - - ssize_t s = age_pick_size(); - - dout(10) << "age_fill at " << a << " / " << pc << " creating " << hex << oid << dec << " sz " << s << endl; - - off_t off = 0; - while (s) { - ssize_t t = MIN(s, 1024*1024); - write(oid, t, off, bl, false); - off += t; - s -= t; - } - oid++; - } - } - - void ObjectStore::age_empty(float pc) { - int nper = 20; - int n = nper; - while (1) { - struct statfs st; - statfs(&st); - float a = (float)(st.f_blocks-st.f_bavail) / (float)st.f_blocks; - if (a <= pc) { - dout(10) << "age_empty at " << a << " / " << pc << " stopping" << endl; - break; - } - - int b = rand() % 10; - n--; - if (n == 0 || age_objects[b].empty()) { - dout(10) << "age_empty sync" << endl; - //sync(); - sync(); - n = nper; - continue; - } - object_t oid = age_objects[b].front(); - age_objects[b].pop_front(); - - dout(10) << "age_empty at " << a << " / " << pc << " removing " << hex << oid << dec << endl; - - remove(oid); - age_free_oids.push_back(oid); - } - } - - - void ObjectStore::age(int time, - float high_water, // fill to this % - float low_water, // then empty to this % - int count, // this many times - float final_water, // and end here ( <= low_water) - int fake_size_mb) { - utime_t until = g_clock.now(); - until.sec_ref() += time; - - while (age_objects.size() < 10) age_objects.push_back( list() ); - - if (fake_size_mb) { - int fake_bl = fake_size_mb * 256; - struct statfs st; - statfs(&st); - float f = (float)fake_bl / (float)st.f_blocks; 
- high_water = (float)high_water * f; - low_water = (float)low_water * f; - final_water = (float)final_water * f; - dout(10) << "fake " << fake_bl << " / " << st.f_blocks << " is " << f << ", high " << high_water << " low " << low_water << " final " << final_water << endl; - } - - // init size distn (once) - if (!did_distn) { - did_distn = true; - age_cur_oid = 1; - file_size_distn.add(1, 19.0758125+0.65434375); - file_size_distn.add(512, 35.6566); - file_size_distn.add(1024, 27.7271875); - file_size_distn.add(2*1024, 16.63503125); - //file_size_distn.add(4*1024, 106.82384375); - //file_size_distn.add(8*1024, 81.493375); - //file_size_distn.add(16*1024, 14.13553125); - //file_size_distn.add(32*1024, 2.176); - //file_size_distn.add(256*1024, 0.655938); - //file_size_distn.add(512*1024, 0.1480625); - //file_size_distn.add(1*1024*1024, 0.020125); // actually 2, but 32bit - file_size_distn.normalize(); - } - - // clear - for (int i=0; i<10; i++) - age_objects[i].clear(); - - for (int c=1; c<=count; c++) { - if (g_clock.now() > until) break; - - dout(1) << "age " << c << "/" << count << " filling to " << high_water << endl; - age_fill(high_water, until); - if (c == count) { - dout(1) << "age final empty to " << final_water << endl; - age_empty(final_water); - } else { - dout(1) << "age " << c << "/" << count << " emptying to " << low_water << endl; - age_empty(low_water); - } - } - dout(1) << "age finished" << endl; - } - diff --git a/branches/marnberg/quota/osd/ObjectStore.h b/branches/marnberg/quota/osd/ObjectStore.h deleted file mode 100644 index 9ff94adfcae99..0000000000000 --- a/branches/marnberg/quota/osd/ObjectStore.h +++ /dev/null @@ -1,505 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as 
published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __OBJECTSTORE_H -#define __OBJECTSTORE_H - -#include "include/types.h" -#include "osd_types.h" -#include "include/Context.h" -#include "include/buffer.h" - -#include "include/Distribution.h" - -#include - -#ifdef DARWIN -#include -#else -#include /* or */ -#endif /* DARWIN */ - -#include -using namespace std; - -#ifndef MIN -# define MIN(a,b) ((a) < (b) ? (a):(b)) -#endif - -/* - * low-level interface to the local OSD file system - */ - - - -class ObjectStore { -public: - - - class FragmentationStat { - public: - int total; - int num_extent; - int avg_extent; - map extent_dist; // powers of two - map extent_dist_sum; // powers of two - - float avg_extent_per_object; - int avg_extent_jump; // avg distance bweteen consecutive extents - - int total_free; - int num_free_extent; - int avg_free_extent; - map free_extent_dist; // powers of two - map free_extent_dist_sum; // powers of two - }; - - - - /********************************* - * transaction - */ - class Transaction { - public: - static const int OP_READ = 1; // oid, offset, len, pbl - static const int OP_STAT = 2; // oid, pstat - static const int OP_GETATTR = 3; // oid, attrname, pattrval - static const int OP_GETATTRS = 4; // oid, pattrset - - static const int OP_WRITE = 10; // oid, offset, len, bl - static const int OP_TRUNCATE = 11; // oid, len - static const int OP_REMOVE = 13; // oid - static const int OP_SETATTR = 14; // oid, attrname, attrval - static const int OP_SETATTRS = 15; // oid, attrset - static const int OP_RMATTR = 16; // oid, attrname - static const int OP_CLONE = 17; // oid, newoid - - static const int OP_TRIMCACHE = 18; // oid, offset, len - - static const int OP_MKCOLL = 20; // cid - static const int OP_RMCOLL = 21; // cid - static const int OP_COLL_ADD = 22; // cid, oid - static const int OP_COLL_REMOVE = 23; // cid, oid - static const int OP_COLL_SETATTR = 24; // cid, attrname, attrval - static const int 
OP_COLL_RMATTR = 25; // cid, attrname - - list ops; - list bls; - list oids; - list cids; - list offsets; - list lengths; - list attrnames; - //list< pair > attrvals; - list attrbls; - - list pbls; - list psts; - list< pair > pattrvals; - list< map* > pattrsets; - - void read(object_t oid, off_t off, size_t len, bufferlist *pbl) { - int op = OP_READ; - ops.push_back(op); - oids.push_back(oid); - offsets.push_back(off); - lengths.push_back(len); - pbls.push_back(pbl); - } - void stat(object_t oid, struct stat *st) { - int op = OP_STAT; - ops.push_back(op); - oids.push_back(oid); - psts.push_back(st); - } - void getattr(object_t oid, const char* name, void* val, int *plen) { - int op = OP_GETATTR; - ops.push_back(op); - oids.push_back(oid); - attrnames.push_back(name); - pattrvals.push_back(pair(val,plen)); - } - void getattrs(object_t oid, map& aset) { - int op = OP_GETATTRS; - ops.push_back(op); - oids.push_back(oid); - pattrsets.push_back(&aset); - } - - void write(object_t oid, off_t off, size_t len, bufferlist& bl) { - int op = OP_WRITE; - ops.push_back(op); - oids.push_back(oid); - offsets.push_back(off); - lengths.push_back(len); - bls.push_back(bl); - } - void trim_from_cache(object_t oid, off_t off, size_t len) { - int op = OP_TRIMCACHE; - ops.push_back(op); - oids.push_back(oid); - offsets.push_back(off); - lengths.push_back(len); - } - void truncate(object_t oid, off_t off) { - int op = OP_TRUNCATE; - ops.push_back(op); - oids.push_back(oid); - offsets.push_back(off); - } - void remove(object_t oid) { - int op = OP_REMOVE; - ops.push_back(op); - oids.push_back(oid); - } - void setattr(object_t oid, const char* name, const void* val, int len) { - int op = OP_SETATTR; - ops.push_back(op); - oids.push_back(oid); - attrnames.push_back(name); - //attrvals.push_back(pair(val,len)); - bufferlist bl; - bl.append((char*)val,len); - attrbls.push_back(bl); - } - void setattrs(object_t oid, map& attrset) { - int op = OP_SETATTRS; - ops.push_back(op); - 
oids.push_back(oid); - pattrsets.push_back(&attrset); - } - void rmattr(object_t oid, const char* name) { - int op = OP_RMATTR; - ops.push_back(op); - oids.push_back(oid); - attrnames.push_back(name); - } - void clone(object_t oid, object_t noid) { - int op = OP_CLONE; - ops.push_back(op); - oids.push_back(oid); - oids.push_back(noid); - } - void create_collection(coll_t cid) { - int op = OP_MKCOLL; - ops.push_back(op); - cids.push_back(cid); - } - void remove_collection(coll_t cid) { - int op = OP_RMCOLL; - ops.push_back(op); - cids.push_back(cid); - } - void collection_add(coll_t cid, object_t oid) { - int op = OP_COLL_ADD; - ops.push_back(op); - cids.push_back(cid); - oids.push_back(oid); - } - void collection_remove(coll_t cid, object_t oid) { - int op = OP_COLL_REMOVE; - ops.push_back(op); - cids.push_back(cid); - oids.push_back(oid); - } - void collection_setattr(coll_t cid, const char* name, const void* val, int len) { - int op = OP_COLL_SETATTR; - ops.push_back(op); - cids.push_back(cid); - attrnames.push_back(name); - //attrvals.push_back(pair(val,len)); - bufferlist bl; - bl.append((char*)val, len); - attrbls.push_back(bl); - } - void collection_rmattr(coll_t cid, const char* name) { - int op = OP_COLL_RMATTR; - ops.push_back(op); - cids.push_back(cid); - attrnames.push_back(name); - } - - // etc. - }; - - - - /* this implementation is here only for naive ObjectStores that - * do not do atomic transactions natively. it is not atomic. 
- */ - virtual unsigned apply_transaction(Transaction& t, Context *onsafe=0) { - // non-atomic implementation - for (list::iterator p = t.ops.begin(); - p != t.ops.end(); - p++) { - switch (*p) { - case Transaction::OP_READ: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t offset = t.offsets.front(); t.offsets.pop_front(); - size_t len = t.lengths.front(); t.lengths.pop_front(); - bufferlist *pbl = t.pbls.front(); t.pbls.pop_front(); - read(oid, offset, len, *pbl); - } - break; - case Transaction::OP_STAT: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - struct stat *st = t.psts.front(); t.psts.pop_front(); - stat(oid, st); - } - break; - case Transaction::OP_GETATTR: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); - pair pattrval = t.pattrvals.front(); t.pattrvals.pop_front(); - *pattrval.second = getattr(oid, attrname, pattrval.first, *pattrval.second); - } - break; - case Transaction::OP_GETATTRS: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - map *pset = t.pattrsets.front(); t.pattrsets.pop_front(); - getattrs(oid, *pset); - } - break; - - case Transaction::OP_WRITE: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t offset = t.offsets.front(); t.offsets.pop_front(); - size_t len = t.lengths.front(); t.lengths.pop_front(); - bufferlist bl = t.bls.front(); t.bls.pop_front(); - write(oid, offset, len, bl, 0); - } - break; - - case Transaction::OP_TRIMCACHE: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t offset = t.offsets.front(); t.offsets.pop_front(); - size_t len = t.lengths.front(); t.lengths.pop_front(); - trim_from_cache(oid, offset, len); - } - break; - - case Transaction::OP_TRUNCATE: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t len = t.offsets.front(); t.offsets.pop_front(); - truncate(oid, len, 0); - } - break; - - case Transaction::OP_REMOVE: - { - object_t oid = t.oids.front(); 
t.oids.pop_front(); - remove(oid, 0); - } - break; - - case Transaction::OP_SETATTR: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); - //pair attrval = t.attrvals.front(); t.attrvals.pop_front(); - bufferlist bl; - bl.claim( t.attrbls.front() ); - t.attrbls.pop_front(); - setattr(oid, attrname, bl.c_str(), bl.length(), 0); - } - break; - case Transaction::OP_SETATTRS: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - map *pattrset = t.pattrsets.front(); t.pattrsets.pop_front(); - setattrs(oid, *pattrset, 0); - } - break; - - case Transaction::OP_RMATTR: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); - rmattr(oid, attrname, 0); - } - break; - - case Transaction::OP_CLONE: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - object_t noid = t.oids.front(); t.oids.pop_front(); - clone(oid, noid); - } - break; - - case Transaction::OP_MKCOLL: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - create_collection(cid, 0); - } - break; - - case Transaction::OP_RMCOLL: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - destroy_collection(cid, 0); - } - break; - - case Transaction::OP_COLL_ADD: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - object_t oid = t.oids.front(); t.oids.pop_front(); - collection_add(cid, oid, 0); - } - break; - - case Transaction::OP_COLL_REMOVE: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - object_t oid = t.oids.front(); t.oids.pop_front(); - collection_remove(cid, oid, 0); - } - break; - - case Transaction::OP_COLL_SETATTR: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); - //pair attrval = t.attrvals.front(); t.attrvals.pop_front(); - bufferlist bl; - bl.claim( t.attrbls.front() ); - t.attrbls.pop_front(); - collection_setattr(cid, attrname, bl.c_str(), bl.length(), 
0); - } - break; - - case Transaction::OP_COLL_RMATTR: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); - collection_rmattr(cid, attrname, 0); - } - break; - - - default: - cerr << "bad op " << *p << endl; - assert(0); - } - } - - if (onsafe) sync(onsafe); - - return 0; // FIXME count errors - } - - /*********************************************/ - - - - public: - ObjectStore() {} - virtual ~ObjectStore() {} - - // mgmt - virtual int mount() = 0; - virtual int umount() = 0; - virtual int mkfs() = 0; // wipe - - virtual int statfs(struct statfs *buf) = 0; - - // objects - virtual int pick_object_revision_lt(object_t& oid) = 0; - - virtual bool exists(object_t oid) = 0; // useful? - virtual int stat(object_t oid, struct stat *st) = 0; // struct stat? - - virtual int remove(object_t oid, - Context *onsafe=0) = 0; - - virtual int truncate(object_t oid, off_t size, - Context *onsafe=0) = 0; - - virtual int read(object_t oid, - off_t offset, size_t len, - bufferlist& bl) = 0; - - /*virtual int write(object_t oid, - off_t offset, size_t len, - bufferlist& bl, - bool fsync=true) = 0; - */ - virtual int write(object_t oid, - off_t offset, size_t len, - bufferlist& bl, - Context *onsafe) = 0;//{ return -1; } - virtual void trim_from_cache(object_t oid, - off_t offset, size_t len) { } - - virtual int setattr(object_t oid, const char *name, - const void *value, size_t size, - Context *onsafe=0) {return 0;} //= 0; - virtual int setattrs(object_t oid, map& aset, - Context *onsafe=0) {return 0;} //= 0; - virtual int getattr(object_t oid, const char *name, - void *value, size_t size) {return 0;} //= 0; - virtual int getattrs(object_t oid, map& aset) {return 0;}; - - virtual int rmattr(object_t oid, const char *name, - Context *onsafe=0) {return 0;} - - virtual int clone(object_t oid, object_t noid) { - return -1; - } - - //virtual int listattr(object_t oid, char *attrs, size_t size) {return 0;} //= 0; - - 
// collections - virtual int list_collections(list& ls) {return 0;}//= 0; - virtual int create_collection(coll_t c, - Context *onsafe=0) {return 0;}//= 0; - virtual int destroy_collection(coll_t c, - Context *onsafe=0) {return 0;}//= 0; - virtual bool collection_exists(coll_t c) {return 0;} - virtual int collection_stat(coll_t c, struct stat *st) {return 0;}//= 0; - virtual int collection_add(coll_t c, object_t o, - Context *onsafe=0) {return 0;}//= 0; - virtual int collection_remove(coll_t c, object_t o, - Context *onsafe=0) {return 0;}// = 0; - virtual int collection_list(coll_t c, list& o) {return 0;}//= 0; - - virtual int collection_setattr(coll_t cid, const char *name, - const void *value, size_t size, - Context *onsafe=0) {return 0;} //= 0; - virtual int collection_rmattr(coll_t cid, const char *name, - Context *onsafe=0) {return 0;} //= 0; - virtual int collection_getattr(coll_t cid, const char *name, - void *value, size_t size) {return 0;} //= 0; - //virtual int collection_listattr(coll_t cid, char *attrs, size_t size) {return 0;} //= 0; - - virtual void sync(Context *onsync) {} - virtual void sync() {} - - - virtual void _fake_writes(bool b) {}; - - virtual void _get_frag_stat(FragmentationStat& st) {}; - -}; - - -#endif diff --git a/branches/marnberg/quota/osd/PG.cc b/branches/marnberg/quota/osd/PG.cc deleted file mode 100644 index 218f9eac36aae..0000000000000 --- a/branches/marnberg/quota/osd/PG.cc +++ /dev/null @@ -1,1333 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "PG.h" -#include "config.h" -#include "OSD.h" - -#include "common/Timer.h" - -#include "messages/MOSDPGNotify.h" -#include "messages/MOSDPGLog.h" -#include "messages/MOSDPGRemove.h" - -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cout << g_clock.now() << " osd" << osd->whoami << " " << (osd->osdmap ? osd->osdmap->get_epoch():0) << " " << *this << " " - - -/******* PGLog ********/ - -void PG::Log::copy_after(const Log &other, eversion_t v) -{ - assert(v >= other.bottom); - top = bottom = other.top; - for (list::const_reverse_iterator i = other.log.rbegin(); - i != other.log.rend(); - i++) { - if (i->version == v) break; - assert(i->version > v); - log.push_front(*i); - } - bottom = v; -} - -bool PG::Log::copy_after_unless_divergent(const Log &other, eversion_t split, eversion_t floor) -{ - assert(split >= other.bottom); - assert(floor >= other.bottom); - assert(floor <= split); - top = bottom = other.top; - - /* runs on replica. split is primary's log.top. floor is how much they want. - split tell us if the primary is divergent.. e.g.: - -> i am A, B is primary, split is 2'6, floor is 2'2. -A B C -2'2 2'2 -2'3 2'3 2'3 -2'4 2'4 2'4 -3'5 | 2'5 2'5 -3'6 | 2'6 -3'7 | -3'8 | -3'9 | - -> i return full backlog. - */ - - for (list::const_reverse_iterator i = other.log.rbegin(); - i != other.log.rend(); - i++) { - // is primary divergent? - // e.g. my 3'6 vs their 2'6 split - if (i->version.version == split.version && i->version.epoch > split.epoch) { - clear(); - return false; // divergent! - } - if (i->version == floor) break; - assert(i->version > floor); - - // e.g. 
my 2'23 > '12 - log.push_front(*i); - } - bottom = floor; - return true; -} - -void PG::Log::copy_non_backlog(const Log &other) -{ - if (other.backlog) { - top = other.top; - bottom = other.bottom; - for (list::const_reverse_iterator i = other.log.rbegin(); - i != other.log.rend(); - i++) - if (i->version > bottom) - log.push_front(*i); - else - break; - } else { - *this = other; - } -} - - - -void PG::IndexedLog::trim(ObjectStore::Transaction& t, eversion_t s) -{ - if (backlog && s < bottom) - s = bottom; - - while (!log.empty()) { - Entry &e = *log.begin(); - - if (e.version > s) break; - - assert(complete_to != log.begin()); - assert(requested_to != log.begin()); - - // remove from index, - unindex(e); - - // from log - log.pop_front(); - } - - // raise bottom? - if (backlog) backlog = false; - if (bottom < s) bottom = s; -} - - -void PG::IndexedLog::trim_write_ahead(eversion_t last_update) -{ - while (!log.empty() && - log.rbegin()->version > last_update) { - // remove from index - unindex(*log.rbegin()); - - // remove - log.pop_back(); - } -} - -void PG::trim_write_ahead() -{ - if (info.last_update < log.top) { - dout(10) << "trim_write_ahead (" << info.last_update << "," << log.top << "]" << endl; - log.trim_write_ahead(info.last_update); - } else { - assert(info.last_update == log.top); - dout(10) << "trim_write_ahead last_update=top=" << info.last_update << endl; - } - -} - -void PG::proc_replica_log(Log &olog, Missing& omissing, int from) -{ - dout(10) << "proc_replica_log for osd" << from << ": " << olog << " " << omissing << endl; - assert(!is_active()); - - if (!have_master_log) { - // i'm building master log. - // note peer's missing. - peer_missing[from] = omissing; - - // merge log into our own log - merge_log(olog, omissing, from); - proc_missing(olog, omissing, from); - } else { - // i'm just building missing lists. - peer_missing[from] = omissing; - - // iterate over peer log. in reverse. 
- list::reverse_iterator pp = olog.log.rbegin(); - eversion_t lu = peer_info[from].last_update; - while (pp != olog.log.rend()) { - if (!log.objects.count(pp->oid)) { - dout(10) << " divergent " << *pp << " not in our log, generating backlog" << endl; - generate_backlog(); - } - - if (!log.objects.count(pp->oid)) { - dout(10) << " divergent " << *pp << " dne, must have been new, ignoring" << endl; - ++pp; - continue; - } - - if (log.objects[pp->oid]->version == pp->version) { - break; // we're no longer divergent. - //++pp; - //continue; - } - - if (log.objects[pp->oid]->version > pp->version) { - dout(10) << " divergent " << *pp - << " superceded by " << log.objects[pp->oid] - << ", ignoring" << endl; - } else { - dout(10) << " divergent " << *pp << ", adding to missing" << endl; - peer_missing[from].add(pp->oid, pp->version); - } - - ++pp; - if (pp != olog.log.rend()) - lu = pp->version; - else - lu = olog.bottom; - } - - if (lu < peer_info[from].last_update) { - dout(10) << " peer osd" << from << " last_update now " << lu << endl; - peer_info[from].last_update = lu; - if (lu < oldest_update) { - dout(10) << " oldest_update now " << lu << endl; - oldest_update = lu; - } - } - - proc_missing(olog, peer_missing[from], from); - } -} - -void PG::merge_log(Log &olog, Missing &omissing, int fromosd) -{ - dout(10) << "merge_log " << olog << " from osd" << fromosd - << " into " << log << endl; - - //cout << "log" << endl; - //log.print(cout); - //cout << "olog" << endl; - //olog.print(cout); - - if (log.empty() || - (olog.bottom > log.top && olog.backlog)) { // e.g. log=(0,20] olog=(40,50]+backlog) - - // swap and index - log.log.swap(olog.log); - log.index(); - - // find split point (old log.top) in new log - // add new items to missing along the way. - for (list::reverse_iterator p = log.log.rbegin(); - p != log.log.rend(); - p++) { - if (p->version <= log.top) { - // ok, p is at split point. - - // was our old log divergent? 
- if (log.top > p->version) { - dout(10) << "merge_log i was (possibly) divergent for (" << p->version << "," << log.top << "]" << endl; - if (p->version < oldest_update) - oldest_update = p->version; - - while (!olog.log.empty() && - olog.log.rbegin()->version > p->version) { - Log::Entry &oe = *olog.log.rbegin(); // old entry (possibly divergent) - if (log.objects.count(oe.oid)) { - if (log.objects[oe.oid]->version < oe.version) { - dout(10) << "merge_log divergent entry " << oe - << " not superceded by " << *log.objects[oe.oid] - << ", adding to missing" << endl; - missing.add(oe.oid, oe.version); - } else { - dout(10) << "merge_log divergent entry " << oe - << " superceded by " << *log.objects[oe.oid] - << ", ignoring" << endl; - } - } else { - dout(10) << "merge_log divergent entry " << oe << ", adding to missing" << endl; - missing.add(oe.oid, oe.version); - } - olog.log.pop_back(); // discard divergent entry - } - } - break; - } - - if (p->is_delete()) { - dout(10) << "merge_log merging " << *p << ", not missing" << endl; - missing.rm(p->oid, p->version); - } else { - dout(10) << "merge_log merging " << *p << ", now missing" << endl; - missing.add(p->oid, p->version); - } - } - - info.last_update = log.top = olog.top; - info.log_bottom = log.bottom = olog.bottom; - info.log_backlog = log.backlog = olog.backlog; - } - - else { - // i can merge the two logs! - - // extend on bottom? - // FIXME: what if we have backlog, but they have lower bottom? - if (olog.bottom < log.bottom && olog.top >= log.bottom && !log.backlog) { - dout(10) << "merge_log extending bottom to " << olog.bottom - << (olog.backlog ? " +backlog":"") - << endl; - - // ok - list::iterator from = olog.log.begin(); - list::iterator to; - for (to = from; - to != olog.log.end(); - to++) { - if (to->version > log.bottom) break; - - // update our index while we're here - log.index(*to); - - dout(15) << *to << endl; - - // new missing object? 
- if (to->version > info.last_complete) { - if (to->is_update()) - missing.add(to->oid, to->version); - else - missing.rm(to->oid, to->version); - } - } - assert(to != olog.log.end()); - - // splice into our log. - log.log.splice(log.log.begin(), - olog.log, from, to); - - info.log_bottom = log.bottom = olog.bottom; - info.log_backlog = log.backlog = olog.backlog; - } - - // extend on top? - if (olog.top > log.top && - olog.bottom <= log.top) { - dout(10) << "merge_log extending top to " << olog.top << endl; - - list::iterator to = olog.log.end(); - list::iterator from = olog.log.end(); - while (1) { - if (from == olog.log.begin()) break; - from--; - //dout(0) << "? " << *from << endl; - if (from->version < log.top) { - from++; - break; - } - - log.index(*from); - dout(10) << "merge_log " << *from << endl; - - // add to missing - if (from->is_update()) { - missing.add(from->oid, from->version); - } else - missing.rm(from->oid, from->version); - } - - // remove divergent items - while (1) { - Log::Entry *oldtail = &(*log.log.rbegin()); - if (oldtail->version.version+1 == from->version.version) break; - - // divergent! - assert(oldtail->version.version >= from->version.version); - - if (log.objects[oldtail->oid]->version == oldtail->version) { - // and significant. - dout(10) << "merge_log had divergent " << *oldtail << ", adding to missing" << endl; - //missing.add(oldtail->oid); - assert(0); - } else { - dout(10) << "merge_log had divergent " << *oldtail << ", already missing" << endl; - assert(missing.is_missing(oldtail->oid)); - } - log.log.pop_back(); - } - - // splice - log.log.splice(log.log.end(), - olog.log, from, to); - - info.last_update = log.top = olog.top; - } - } - - dout(10) << "merge_log result " << log << " " << missing << endl; - //log.print(cout); - -} - -void PG::proc_missing(Log &olog, Missing &omissing, int fromosd) -{ - // found items? 
- for (map::iterator p = missing.missing.begin(); - p != missing.missing.end(); - p++) { - if (omissing.is_missing(p->first)) { - assert(omissing.is_missing(p->first, p->second)); - if (omissing.loc.count(p->first)) { - dout(10) << "proc_missing missing " << p->first << " " << p->second - << " on osd" << omissing.loc[p->first] << endl; - missing.loc[p->first] = omissing.loc[p->first]; - } else { - dout(10) << "proc_missing missing " << p->first << " " << p->second - << " also LOST on source, osd" << fromosd << endl; - } - } - else if (p->second <= olog.top) { - dout(10) << "proc_missing missing " << p->first << " " << p->second - << " on source, osd" << fromosd << endl; - missing.loc[p->first] = fromosd; - } else { - dout(10) << "proc_missing " << p->first << " " << p->second - << " > olog.top " << olog.top << ", not found...." - << endl; - } - } - - dout(10) << "proc_missing missing " << missing.missing << endl; -} - - - -void PG::generate_backlog() -{ - dout(10) << "generate_backlog to " << log << endl; - assert(!log.backlog); - log.backlog = true; - - list olist; - osd->store->collection_list(info.pgid, olist); - - int local = 0; - map add; - for (list::iterator it = olist.begin(); - it != olist.end(); - it++) { - local++; - - if (log.logged_object(*it)) continue; // already have it logged. - - // add entry - Log::Entry e; - e.op = Log::Entry::MODIFY; // FIXME when we do smarter op codes! 
- e.oid = *it; - osd->store->getattr(*it, - "version", - &e.version, sizeof(e.version)); - add[e.version] = e; - dout(10) << "generate_backlog found " << e << endl; - } - - for (map::reverse_iterator i = add.rbegin(); - i != add.rend(); - i++) { - log.log.push_front(i->second); - log.index( *log.log.begin() ); // index - } - - dout(10) << local << " local objects, " - << add.size() << " objects added to backlog, " - << log.objects.size() << " in pg" << endl; - - //log.print(cout); -} - -void PG::drop_backlog() -{ - dout(10) << "drop_backlog for " << log << endl; - //log.print(cout); - - assert(log.backlog); - log.backlog = false; - - while (!log.log.empty()) { - Log::Entry &e = *log.log.begin(); - if (e.version > log.bottom) break; - - dout(15) << "drop_backlog trimming " << e.version << endl; - log.unindex(e); - log.log.pop_front(); - } -} - - - - - -ostream& PG::Log::print(ostream& out) const -{ - out << *this << endl; - for (list::const_iterator p = log.begin(); - p != log.end(); - p++) - out << *p << endl; - return out; -} - - - - - -/******* PG ***********/ -void PG::build_prior() -{ - // build prior set. 
- prior_set.clear(); - - // current - for (unsigned i=1; iosdmap->get_epoch(); - epoch++) { - OSDMap omap; - osd->get_map(epoch, omap); - - vector acting; - omap.pg_to_acting_osds(get_pgid(), acting); - - for (unsigned i=0; iosdmap->is_up(acting[i]) && // is up now - acting[i] != osd->whoami) // and is not me - prior_set.insert(acting[i]); - } - } - - dout(10) << "build_prior built " << prior_set << endl; -} - -void PG::adjust_prior() -{ - assert(!prior_set.empty()); - - // raise last_epoch_started_any - epoch_t max = 0; - for (map::iterator it = peer_info.begin(); - it != peer_info.end(); - it++) { - if (it->second.last_epoch_started > max) - max = it->second.last_epoch_started; - } - - dout(10) << "adjust_prior last_epoch_started_any " - << last_epoch_started_any << " -> " << max << endl; - assert(max > last_epoch_started_any); - last_epoch_started_any = max; - - // rebuild prior set - build_prior(); -} - - -void PG::clear_primary_state() -{ - dout(10) << "clear_primary_state" << endl; - - // clear peering state - have_master_log = false; - prior_set.clear(); - stray_set.clear(); - clean_set.clear(); - peer_info_requested.clear(); - peer_log_requested.clear(); - peer_info.clear(); - peer_missing.clear(); - - last_epoch_started_any = info.last_epoch_started; -} - -void PG::peer(ObjectStore::Transaction& t, - map< int, map >& query_map) -{ - dout(10) << "peer. acting is " << acting - << ", prior_set is " << prior_set << endl; - - - /** GET ALL PG::Info *********/ - - // -- query info from everyone in prior_set. 
- bool missing_info = false; - for (set::iterator it = prior_set.begin(); - it != prior_set.end(); - it++) { - if (peer_info.count(*it)) { - dout(10) << " have info from osd" << *it - << ": " << peer_info[*it] - << endl; - continue; - } - missing_info = true; - - if (peer_info_requested.count(*it)) { - dout(10) << " waiting for osd" << *it << endl; - continue; - } - - dout(10) << " querying info from osd" << *it << endl; - query_map[*it][info.pgid] = Query(Query::INFO, info.history); - peer_info_requested.insert(*it); - } - if (missing_info) return; - - - // -- ok, we have all (prior_set) info. (and maybe others.) - - // did we crash? - dout(10) << " last_epoch_started_any " << last_epoch_started_any << endl; - if (last_epoch_started_any) { - OSDMap omap; - osd->get_map(last_epoch_started_any, omap); - - // start with the last active set of replicas - set last_started; - vector acting; - omap.pg_to_acting_osds(get_pgid(), acting); - for (unsigned i=0; iosdmap->get_epoch(); - e++) { - OSDMap omap; - osd->get_map(e, omap); - - set still_up; - - for (set::iterator i = last_started.begin(); - i != last_started.end(); - i++) { - //dout(10) << " down in epoch " << e << " is " << omap.get_down_osds() << endl; - if (omap.is_up(*i)) - still_up.insert(*i); - } - - last_started.swap(still_up); - //dout(10) << " still active as of epoch " << e << ": " << last_started << endl; - } - - if (last_started.empty()) { - dout(10) << " crashed since epoch " << last_epoch_started_any << endl; - state_set(STATE_CRASHED); - } else { - dout(10) << " still active from last started: " << last_started << endl; - } - } else if (osd->osdmap->get_epoch() > 1) { - dout(10) << " crashed since epoch " << last_epoch_started_any << endl; - state_set(STATE_CRASHED); - } - - dout(10) << " peers_complete_thru " << peers_complete_thru << endl; - - - - - /** CREATE THE MASTER PG::Log *********/ - - // who (of all priors and active) has the latest PG version? 
- eversion_t newest_update = info.last_update; - int newest_update_osd = osd->whoami; - - oldest_update = info.last_update; // only of acting (current) osd set. - peers_complete_thru = info.last_complete; - - for (map::iterator it = peer_info.begin(); - it != peer_info.end(); - it++) { - if (it->second.last_update > newest_update) { - newest_update = it->second.last_update; - newest_update_osd = it->first; - } - if (is_acting(it->first)) { - if (it->second.last_update < oldest_update) - oldest_update = it->second.last_update; - if (it->second.last_complete < peers_complete_thru) - peers_complete_thru = it->second.last_complete; - } - } - - // gather log(+missing) from that person! - if (newest_update_osd != osd->whoami) { - if (peer_log_requested.count(newest_update_osd) || - peer_summary_requested.count(newest_update_osd)) { - dout(10) << " newest update on osd" << newest_update_osd - << " v " << newest_update - << ", already queried" - << endl; - } else { - // we'd like it back to oldest_update, but will settle for log_bottom - eversion_t since = MAX(peer_info[newest_update_osd].log_bottom, - oldest_update); - if (peer_info[newest_update_osd].log_bottom < log.top) { - dout(10) << " newest update on osd" << newest_update_osd - << " v " << newest_update - << ", querying since " << since - << endl; - query_map[newest_update_osd][info.pgid] = Query(Query::LOG, log.top, since, info.history); - peer_log_requested.insert(newest_update_osd); - } else { - dout(10) << " newest update on osd" << newest_update_osd - << " v " << newest_update - << ", querying entire summary/backlog" - << endl; - assert((peer_info[newest_update_osd].last_complete >= - peer_info[newest_update_osd].log_bottom) || - peer_info[newest_update_osd].log_backlog); // or else we're in trouble. 
- query_map[newest_update_osd][info.pgid] = Query(Query::BACKLOG, info.history); - peer_summary_requested.insert(newest_update_osd); - } - } - return; - } else { - dout(10) << " newest_update " << info.last_update << " (me)" << endl; - } - - dout(10) << " oldest_update " << oldest_update << endl; - - have_master_log = true; - - - // -- do i need to generate backlog for any of my peers? - if (oldest_update < log.bottom && !log.backlog) { - dout(10) << "generating backlog for some peers, bottom " - << log.bottom << " > " << oldest_update - << endl; - generate_backlog(); - } - - - /** COLLECT MISSING+LOG FROM PEERS **********/ - /* - we also detect divergent replicas here by pulling the full log - from everyone. - */ - - // gather missing from peers - for (unsigned i=1; i 0) { - dout(10) << "there are still " << missing.num_lost() << " lost objects" << endl; - - // ***** - // FIXME: i don't think this actually accomplishes anything! - // ***** - - // ok, let's get more summaries! - bool waiting = false; - for (map::iterator it = peer_info.begin(); - it != peer_info.end(); - it++) { - int peer = it->first; - - if (peer_summary_requested.count(peer)) { - dout(10) << " already requested summary/backlog from osd" << peer << endl; - waiting = true; - continue; - } - - dout(10) << " requesting summary/backlog from osd" << peer << endl; - query_map[peer][info.pgid] = Query(Query::BACKLOG, info.history); - peer_summary_requested.insert(peer); - waiting = true; - } - - if (!waiting) { - dout(10) << missing.num_lost() << " objects are still lost, waiting+hoping for a notify from someone else!" << endl; - } - return; - } - - // sanity check - assert(missing.num_lost() == 0); - assert(info.last_complete >= log.bottom || log.backlog); - - - // -- crash recovery? 
- if (is_crashed()) { - dout(10) << "crashed, allowing op replay for " << g_conf.osd_replay_window << endl; - state_set(STATE_REPLAY); - osd->timer.add_event_after(g_conf.osd_replay_window, - new OSD::C_Activate(osd, info.pgid, osd->osdmap->get_epoch())); - } - else if (!is_active()) { - // -- ok, activate! - activate(t); - } -} - - -void PG::activate(ObjectStore::Transaction& t) -{ - assert(!is_active()); - - // twiddle pg state - state_set(STATE_ACTIVE); - state_clear(STATE_STRAY); - if (is_crashed()) { - //assert(is_replay()); // HELP.. not on replica? - state_clear(STATE_CRASHED); - state_clear(STATE_REPLAY); - } - info.last_epoch_started = osd->osdmap->get_epoch(); - - if (role == 0) { // primary state - peers_complete_thru = 0; // we don't know (yet)! - } - - assert(info.last_complete >= log.bottom || log.backlog); - - // write pg info - t.collection_setattr(info.pgid, "info", (char*)&info, sizeof(info)); - - // write log - write_log(t); - - // clean up stray objects - clean_up_local(t); - - // init complete pointer - if (info.last_complete == info.last_update) { - dout(10) << "activate - complete" << endl; - log.complete_to == log.log.end(); - log.requested_to = log.log.end(); - } - //else if (is_primary()) { - else if (true) { - dout(10) << "activate - not complete, " << missing << ", starting recovery" << endl; - - // init complete_to - log.complete_to = log.log.begin(); - while (log.complete_to->version < info.last_complete) { - log.complete_to++; - assert(log.complete_to != log.log.end()); - } - - // start recovery - log.requested_to = log.complete_to; - do_recovery(); - } else { - dout(10) << "activate - not complete, " << missing << endl; - } - - - // if primary.. - if (role == 0 && - osd->osdmap->get_epoch() > 1) { - // who is clean? 
- clean_set.clear(); - if (info.is_clean()) - clean_set.insert(osd->whoami); - - // start up replicas - for (unsigned i=1; iosdmap->get_epoch(), - info.pgid); - m->info = info; - - if (peer_info[peer].last_update == info.last_update) { - // empty log - } - else if (peer_info[peer].last_update < log.bottom) { - // summary/backlog - assert(log.backlog); - m->log = log; - } - else { - // incremental log - assert(peer_info[peer].last_update < info.last_update); - m->log.copy_after(log, peer_info[peer].last_update); - } - - // update local version of peer's missing list! - { - eversion_t plu = peer_info[peer].last_update; - Missing& pm = peer_missing[peer]; - for (list::iterator p = m->log.log.begin(); - p != m->log.log.end(); - p++) - if (p->version > plu) - pm.add(p->oid, p->version); - } - - dout(10) << "activate sending " << m->log << " " << m->missing - << " to osd" << peer << endl; - //m->log.print(cout); - osd->messenger->send_message(m, osd->osdmap->get_inst(peer)); - - // update our missing - if (peer_missing[peer].num_missing() == 0) { - dout(10) << "activate peer osd" << peer << " already clean, " << peer_info[peer] << endl; - assert(peer_info[peer].last_complete == info.last_update); - clean_set.insert(peer); - } else { - dout(10) << "activate peer osd" << peer << " " << peer_info[peer] - << " missing " << peer_missing[peer] << endl; - } - - } - - // discard unneeded peering state - //peer_log.clear(); // actually, do this carefully, in case peer() is called again. - - // all clean? - if (is_all_clean()) { - state_set(STATE_CLEAN); - dout(10) << "activate all replicas clean" << endl; - clean_replicas(); - } - } - - - // replay (queue them _before_ other waiting ops!) 
- if (!replay_queue.empty()) { - eversion_t c = info.last_update; - list replay; - for (map::iterator p = replay_queue.begin(); - p != replay_queue.end(); - p++) { - if (p->first <= info.last_update) { - dout(10) << "activate will WRNOOP " << p->first << " " << *p->second << endl; - replay.push_back(p->second); - continue; - } - if (p->first.version != c.version+1) { - dout(10) << "activate replay " << p->first - << " skipping " << c.version+1 - p->first.version - << " ops" - << endl; - } - dout(10) << "activate replay " << p->first << " " << *p->second << endl; - replay.push_back(p->second); - c = p->first; - } - replay_queue.clear(); - osd->take_waiters(replay); - } - - // waiters - osd->take_waiters(waiting_for_active); -} - -/** clean_up_local - * remove any objects that we're storing but shouldn't. - * as determined by log. - */ -void PG::clean_up_local(ObjectStore::Transaction& t) -{ - dout(10) << "clean_up_local" << endl; - - assert(info.last_update >= log.bottom); // otherwise we need some help! - - if (log.backlog) { - // be thorough. - list ls; - osd->store->collection_list(info.pgid, ls); - set s; - - for (list::iterator i = ls.begin(); - i != ls.end(); - i++) - s.insert(*i); - - set did; - for (list::reverse_iterator p = log.log.rbegin(); - p != log.log.rend(); - p++) { - if (did.count(p->oid)) continue; - did.insert(p->oid); - - if (p->is_delete()) { - if (s.count(p->oid)) { - dout(10) << " deleting " << p->oid - << " when " << p->version << endl; - t.remove(p->oid); - } - s.erase(p->oid); - } else { - // just leave old objects.. they're missing or whatever - s.erase(p->oid); - } - } - - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - dout(10) << " deleting stray " << *i << endl; - t.remove(*i); - } - - } else { - // just scan the log. 
- set did; - for (list::reverse_iterator p = log.log.rbegin(); - p != log.log.rend(); - p++) { - if (did.count(p->oid)) continue; - did.insert(p->oid); - - if (p->is_delete()) { - dout(10) << " deleting " << p->oid - << " when " << p->version << endl; - t.remove(p->oid); - } else { - // keep old(+missing) objects, just for kicks. - } - } - } -} - - - -void PG::cancel_recovery() -{ - // forget about where missing items are, or anything we're pulling - missing.loc.clear(); - osd->num_pulling -= objects_pulling.size(); - objects_pulling.clear(); -} - -/** - * do one recovery op. - * return true if done, false if nothing left to do. - */ -bool PG::do_recovery() -{ - dout(-10) << "do_recovery pulling " << objects_pulling.size() << " in pg, " - << osd->num_pulling << "/" << g_conf.osd_max_pull << " total" - << endl; - dout(10) << "do_recovery " << missing << endl; - - // can we slow down on this PG? - if (osd->num_pulling >= g_conf.osd_max_pull && !objects_pulling.empty()) { - dout(-10) << "do_recovery already pulling max, waiting" << endl; - return true; - } - - // look at log! - Log::Entry *latest = 0; - - while (log.requested_to != log.log.end()) { - assert(log.objects.count(log.requested_to->oid)); - latest = log.objects[log.requested_to->oid]; - assert(latest); - - dout(10) << "do_recovery " - << *log.requested_to - << (objects_pulling.count(latest->oid) ? " (pulling)":"") - << endl; - - if (latest->is_update() && - !objects_pulling.count(latest->oid) && - missing.is_missing(latest->oid)) { - osd->pull(this, latest->oid); - return true; - } - - log.requested_to++; - } - - if (!objects_pulling.empty()) { - dout(7) << "do_recovery requested everything, still waiting" << endl; - return false; - } - - // done? 
- assert(missing.num_missing() == 0); - assert(info.last_complete == info.last_update); - - if (is_primary()) { - // i am primary - dout(7) << "do_recovery complete, cleaning strays" << endl; - clean_set.insert(osd->whoami); - if (is_all_clean()) { - state_set(PG::STATE_CLEAN); - clean_replicas(); - } - } else { - // tell primary - dout(7) << "do_recovery complete, telling primary" << endl; - list ls; - ls.push_back(info); - osd->messenger->send_message(new MOSDPGNotify(osd->osdmap->get_epoch(), - ls), - osd->osdmap->get_inst(get_primary())); - } - - return false; -} - -void PG::do_peer_recovery() -{ - dout(10) << "do_peer_recovery" << endl; - - for (unsigned i=0; isecond; - eversion_t v = peer_missing[peer].rmissing.begin()->first; - - osd->push(this, oid, peer); - - // do other peers need it too? - for (i++; ipush(this, oid, peer); - } - - return; - } - - // nothing to do! -} - - - -void PG::clean_replicas() -{ - dout(10) << "clean_replicas. strays are " << stray_set << endl; - - for (set::iterator p = stray_set.begin(); - p != stray_set.end(); - p++) { - dout(10) << "sending PGRemove to osd" << *p << endl; - set ls; - ls.insert(info.pgid); - MOSDPGRemove *m = new MOSDPGRemove(osd->osdmap->get_epoch(), ls); - osd->messenger->send_message(m, osd->osdmap->get_inst(*p)); - } - - stray_set.clear(); -} - - - -void PG::write_log(ObjectStore::Transaction& t) -{ - dout(10) << "write_log" << endl; - - // assemble buffer - bufferlist bl; - - // build buffer - ondisklog.bottom = 0; - ondisklog.block_map.clear(); - for (list::iterator p = log.log.begin(); - p != log.log.end(); - p++) { - if (bl.length() % 4096 == 0) - ondisklog.block_map[bl.length()] = p->version; - bl.append((char*)&(*p), sizeof(*p)); - if (g_conf.osd_pad_pg_log) { // pad to 4k, until i fix ebofs reallocation crap. FIXME. 
- bufferptr bp(4096 - sizeof(*p)); - bl.push_back(bp); - } - } - ondisklog.top = bl.length(); - - // write it - t.remove( info.pgid.to_object() ); - t.write( info.pgid.to_object() , 0, bl.length(), bl); - t.collection_setattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom)); - t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); - - t.collection_setattr(info.pgid, "info", &info, sizeof(info)); -} - -void PG::trim_ondisklog_to(ObjectStore::Transaction& t, eversion_t v) -{ - dout(15) << " trim_ondisk_log_to v " << v << endl; - - map::iterator p = ondisklog.block_map.begin(); - while (p != ondisklog.block_map.end()) { - dout(15) << " " << p->first << " -> " << p->second << endl; - p++; - if (p == ondisklog.block_map.end() || - p->second > v) { // too far! - p--; // back up - break; - } - } - dout(15) << " * " << p->first << " -> " << p->second << endl; - if (p == ondisklog.block_map.begin()) - return; // can't trim anything! - - // we can trim! - off_t trim = p->first; - dout(10) << " trimming ondisklog to [" << ondisklog.bottom << "," << ondisklog.top << ")" << endl; - - ondisklog.bottom = trim; - - // adjust block_map - while (p != ondisklog.block_map.begin()) - ondisklog.block_map.erase(ondisklog.block_map.begin()); - - t.collection_setattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom)); - t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); -} - - -void PG::append_log(ObjectStore::Transaction& t, PG::Log::Entry& logentry, - eversion_t trim_to) -{ - dout(10) << "append_log " << ondisklog.top << " " << logentry << endl; - - // write entry on disk - bufferlist bl; - bl.append( (char*)&logentry, sizeof(logentry) ); - if (g_conf.osd_pad_pg_log) { // pad to 4k, until i fix ebofs reallocation crap. FIXME. 
- bufferptr bp(4096 - sizeof(logentry)); - bl.push_back(bp); - } - t.write( info.pgid.to_object(), ondisklog.top, bl.length(), bl ); - - // update block map? - if (ondisklog.top % 4096 == 0) - ondisklog.block_map[ondisklog.top] = logentry.version; - - ondisklog.top += bl.length(); - t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); - - // trim? - if (trim_to > log.bottom) { - dout(10) << " trimming " << log << " to " << trim_to << endl; - log.trim(t, trim_to); - info.log_bottom = log.bottom; - info.log_backlog = log.backlog; - trim_ondisklog_to(t, trim_to); - } - dout(10) << " ondisklog [" << ondisklog.bottom << "," << ondisklog.top << ")" << endl; -} - -void PG::read_log(ObjectStore *store) -{ - int r; - // load bounds - ondisklog.bottom = ondisklog.top = 0; - r = store->collection_getattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom)); - assert(r == sizeof(ondisklog.bottom)); - r = store->collection_getattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); - assert(r == sizeof(ondisklog.top)); - - dout(10) << "read_log [" << ondisklog.bottom << "," << ondisklog.top << ")" << endl; - - log.backlog = info.log_backlog; - log.bottom = info.log_bottom; - - if (ondisklog.top > 0) { - // read - bufferlist bl; - store->read(info.pgid.to_object(), ondisklog.bottom, ondisklog.top-ondisklog.bottom, bl); - - PG::Log::Entry e; - off_t pos = ondisklog.bottom; - assert(log.log.empty()); - while (pos < ondisklog.top) { - bl.copy(pos-ondisklog.bottom, sizeof(e), (char*)&e); - dout(10) << "read_log " << pos << " " << e << endl; - - if (e.version > log.bottom || log.backlog) { // ignore items below log.bottom - if (pos % 4096 == 0) - ondisklog.block_map[pos] = e.version; - log.log.push_back(e); - } else { - dout(10) << "read_log ignoring entry at " << pos << endl; - } - - if (g_conf.osd_pad_pg_log) // pad to 4k, until i fix ebofs reallocation crap. FIXME. 
- pos += 4096; - else - pos += sizeof(e); - } - } - log.top = info.last_update; - log.index(); - - // build missing - set did; - for (list::reverse_iterator i = log.log.rbegin(); - i != log.log.rend(); - i++) { - if (i->version <= info.last_complete) break; - if (did.count(i->oid)) continue; - did.insert(i->oid); - - if (i->is_delete()) continue; - - eversion_t v; - int r = osd->store->getattr(i->oid, "version", &v, sizeof(v)); - if (r < 0 || v < i->version) - missing.add(i->oid, i->version); - } -} - diff --git a/branches/marnberg/quota/osd/PG.h b/branches/marnberg/quota/osd/PG.h deleted file mode 100644 index f3b00cf935f91..0000000000000 --- a/branches/marnberg/quota/osd/PG.h +++ /dev/null @@ -1,707 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __PG_H -#define __PG_H - - -#include "include/types.h" -#include "include/buffer.h" - -#include "OSDMap.h" -#include "ObjectStore.h" -#include "msg/Messenger.h" -#include "messages/MOSDOpReply.h" - -#include "include/types.h" - -#include -using namespace std; - -#include -using namespace __gnu_cxx; - - -class OSD; - - - -/** PG - Replica Placement Group - * - */ - -class PG { -public: - - /* - * PG::Info - summary of PG statistics. - * - * some notes: - * - last_complete implies we have all objects that existed as of that - * stamp, OR a newer object, OR have already applied a later delete. - * - if last_complete >= log.bottom, then we know pg contents thru log.top. - * otherwise, we have no idea what the pg is supposed to contain. - */ - struct Info { - pg_t pgid; - eversion_t last_update; // last object version applied to store. 
- eversion_t last_complete; // last version pg was complete through. - - eversion_t log_bottom; // oldest log entry. - bool log_backlog; // do we store a complete log? - - epoch_t last_epoch_started; // last epoch started. - epoch_t last_epoch_finished; // last epoch finished. - - struct History { - epoch_t same_since; // same acting set since - epoch_t same_primary_since; // same primary at least back through this epoch. - epoch_t same_acker_since; // same acker at least back through this epoch. - History() : same_since(0), same_primary_since(0), same_acker_since(0) {} - } history; - - Info(pg_t p=0) : pgid(p), - log_backlog(false), - last_epoch_started(0), last_epoch_finished(0) {} - bool is_clean() const { return last_update == last_complete; } - bool is_empty() const { return last_update.version == 0; } - }; - - - /** - * Query - used to ask a peer for information about a pg. - * - * note: if version=0, type=LOG, then we just provide our full log. - * only if type=BACKLOG do we generate a backlog and provide that too. - */ - struct Query { - const static int INFO = 0; - const static int LOG = 1; - const static int BACKLOG = 2; - const static int FULLLOG = 3; - - int type; - eversion_t split, floor; - Info::History history; - - Query() : type(-1) {} - Query(int t, Info::History& h) : - type(t), history(h) { assert(t != LOG); } - Query(int t, eversion_t s, eversion_t f, Info::History& h) : - type(t), split(s), floor(f), history(h) { assert(t == LOG); } - }; - - - /* - * Missing - summary of missing objects. - * kept in memory, as a supplement to Log. - * also used to pass missing info in messages. - */ - class Missing { - public: - map missing; // oid -> v - map rmissing; // v -> oid - - map loc; // where i think i can get them. 
- - int num_lost() const { return missing.size() - loc.size(); } - int num_missing() const { return missing.size(); } - - bool is_missing(object_t oid) { - return missing.count(oid); - } - bool is_missing(object_t oid, eversion_t v) { - return missing.count(oid) && missing[oid] <= v; - } - void add(object_t oid) { - eversion_t z; - add(oid,z); - } - void add(object_t oid, eversion_t v) { - if (missing.count(oid)) { - if (missing[oid] > v) return; // already missing newer. - rmissing.erase(missing[oid]); - } - missing[oid] = v; - rmissing[v] = oid; - } - void rm(object_t oid, eversion_t when) { - if (missing.count(oid) && missing[oid] < when) { - rmissing.erase(missing[oid]); - missing.erase(oid); - loc.erase(oid); - } - } - void got(object_t oid, eversion_t v) { - assert(missing.count(oid)); - assert(missing[oid] <= v); - loc.erase(oid); - rmissing.erase(missing[oid]); - missing.erase(oid); - } - void got(object_t oid) { - assert(missing.count(oid)); - loc.erase(oid); - rmissing.erase(missing[oid]); - missing.erase(oid); - } - - void _encode(bufferlist& blist) { - ::_encode(missing, blist); - ::_encode(loc, blist); - } - void _decode(bufferlist& blist, int& off) { - ::_decode(missing, blist, off); - ::_decode(loc, blist, off); - - for (map::iterator it = missing.begin(); - it != missing.end(); - it++) - rmissing[it->second] = it->first; - } - }; - - - /* - * Log - incremental log of recent pg changes. - * also, serves as a recovery queue. - * - * when backlog is true, - * objects with versions <= bottom are in log. - * we do not have any deletion info before that time, however. - * log is a "summary" in that it contains all objects in the PG. - */ - class Log { - public: - /** top, bottom - * top - newest entry (update|delete) - * bottom - entry previous to oldest (update|delete) for which we have - * complete negative information. - * i.e. we can infer pg contents for any store whose last_update >= bottom. 
- */ - eversion_t top; // newest entry (update|delete) - eversion_t bottom; // version prior to oldest (update|delete) - - /** backlog - true if log is a complete summary of pg contents. - * updated will include all items in pg, but deleted will not include - * negative entries for items deleted prior to 'bottom'. - */ - bool backlog; - - /** Entry - * mapped from the eversion_t, so don't include that. - */ - class Entry { - public: - const static int LOST = 0; - const static int MODIFY = 1; - const static int CLONE = 2; - const static int DELETE = 3; - - int op; // write, zero, trunc, remove - object_t oid; - eversion_t version; - objectrev_t rev; - - reqid_t reqid; // caller+tid to uniquely identify request - - Entry() : op(0) {} - Entry(int _op, object_t _oid, const eversion_t& v, - const reqid_t& rid) : - op(_op), oid(_oid), version(v), reqid(rid) {} - - bool is_delete() const { return op == DELETE; } - bool is_clone() const { return op == CLONE; } - bool is_modify() const { return op == MODIFY; } - bool is_update() const { return is_clone() || is_modify(); } - }; - - list log; // the actual log. 
- - Log() : backlog(false) {} - - void clear() { - eversion_t z; - top = bottom = z; - backlog = false; - log.clear(); - } - bool empty() const { - return top.version == 0 && top.epoch == 0; - } - - void _encode(bufferlist& blist) const { - blist.append((char*)&top, sizeof(top)); - blist.append((char*)&bottom, sizeof(bottom)); - blist.append((char*)&backlog, sizeof(backlog)); - ::_encode(log, blist); - } - void _decode(bufferlist& blist, int& off) { - blist.copy(off, sizeof(top), (char*)&top); - off += sizeof(top); - blist.copy(off, sizeof(bottom), (char*)&bottom); - off += sizeof(bottom); - blist.copy(off, sizeof(backlog), (char*)&backlog); - off += sizeof(backlog); - - ::_decode(log, blist, off); - } - - void copy_after(const Log &other, eversion_t v); - bool copy_after_unless_divergent(const Log &other, eversion_t split, eversion_t floor); - void copy_non_backlog(const Log &other); - ostream& print(ostream& out) const; - }; - - /** - * IndexLog - adds in-memory index of the log, by oid. - * plus some methods to manipulate it all. - */ - class IndexedLog : public Log { - public: - hash_map objects; // ptrs into log. be careful! 
- hash_set caller_ops; - - // recovery pointers - list::iterator requested_to; // not inclusive of referenced item - list::iterator complete_to; // not inclusive of referenced item - - /****/ - IndexedLog() {} - - void clear() { - assert(0); - unindex(); - Log::clear(); - } - - bool logged_object(object_t oid) { - return objects.count(oid); - } - bool logged_req(const reqid_t &r) { - return caller_ops.count(r); - } - - void index() { - objects.clear(); - caller_ops.clear(); - for (list::iterator i = log.begin(); - i != log.end(); - i++) { - objects[i->oid] = &(*i); - caller_ops.insert(i->reqid); - } - } - - void index(Entry& e) { - if (objects.count(e.oid) == 0 || - objects[e.oid]->version < e.version) - objects[e.oid] = &e; - caller_ops.insert(e.reqid); - } - void unindex() { - objects.clear(); - caller_ops.clear(); - } - void unindex(Entry& e) { - // NOTE: this only works if we remove from the _bottom_ of the log! - assert(objects.count(e.oid)); - if (objects[e.oid]->version == e.version) - objects.erase(e.oid); - caller_ops.erase(e.reqid); - } - - - // accessors - Entry *is_updated(object_t oid) { - if (objects.count(oid) && objects[oid]->is_update()) return objects[oid]; - return 0; - } - Entry *is_deleted(object_t oid) { - if (objects.count(oid) && objects[oid]->is_delete()) return objects[oid]; - return 0; - } - - // actors - void add(Entry& e) { - // add to log - log.push_back(e); - assert(e.version > top); - assert(top.version == 0 || e.version.version > top.version); - top = e.version; - - // to our index - objects[e.oid] = &(log.back()); - caller_ops.insert(e.reqid); - } - - void trim(ObjectStore::Transaction &t, eversion_t s); - void trim_write_ahead(eversion_t last_update); - }; - - - /** - * OndiskLog - some info about how we store the log on disk. - */ - class OndiskLog { - public: - // ok - off_t bottom; // first byte of log. - off_t top; // byte following end of log. 
- map block_map; // block -> first stamp logged there - - OndiskLog() : bottom(0), top(0) {} - - bool trim_to(eversion_t v, ObjectStore::Transaction& t); - }; - - - /*** - */ - - class RepOpGather { - public: - class MOSDOp *op; - tid_t rep_tid; - - ObjectStore::Transaction t; - bool applied; - - set waitfor_ack; - set waitfor_commit; - - utime_t start; - - bool sent_ack, sent_commit; - - set osds; - eversion_t new_version; - - eversion_t pg_local_last_complete; - map pg_complete_thru; - - RepOpGather(MOSDOp *o, tid_t rt, eversion_t nv, eversion_t lc) : - op(o), rep_tid(rt), - applied(false), - sent_ack(false), sent_commit(false), - new_version(nv), - pg_local_last_complete(lc) { } - - bool can_send_ack() { - return !sent_ack && !sent_commit && - waitfor_ack.empty(); - } - bool can_send_commit() { - return !sent_commit && - waitfor_ack.empty() && waitfor_commit.empty(); - } - bool can_delete() { - return waitfor_ack.empty() && waitfor_commit.empty(); - } - }; - - - /*** PG ****/ -public: - // any - static const int STATE_ACTIVE = 1; // i am active. (primary: replicas too) - - // primary - static const int STATE_CLEAN = 2; // peers are complete, clean of stray replicas. - static const int STATE_CRASHED = 4; // all replicas went down. - static const int STATE_REPLAY = 8; // crashed, waiting for replay - - // non-primary - static const int STATE_STRAY = 16; // i must notify the primary i exist. - - - protected: - OSD *osd; - -public: - // pg state - Info info; - IndexedLog log; - OndiskLog ondisklog; - Missing missing; - utime_t last_heartbeat; // - -protected: - int role; // 0 = primary, 1 = replica, -1=none. - int state; // see bit defns above - - // primary state - public: - vector acting; - epoch_t last_epoch_started_any; - eversion_t last_complete_commit; - - // [primary only] content recovery state - eversion_t peers_complete_thru; - bool have_master_log; - protected: - set prior_set; // current+prior OSDs, as defined by last_epoch_started_any. 
- set stray_set; // non-acting osds that have PG data. - set clean_set; // current OSDs that are clean - eversion_t oldest_update; // lowest (valid) last_update in active set - map peer_info; // info from peers (stray or prior) - set peer_info_requested; - map peer_missing; - set peer_log_requested; // logs i've requested (and start stamps) - set peer_summary_requested; - friend class OSD; - - - // [primary|tail] - // old way - map replica_ops; - map > replica_tids_by_osd; // osd -> (tid,...) - - // new way - map repop_gather; - map > waiting_for_repop; - - - // [primary|replica] - // pg waiters - list waiting_for_active; - hash_map > waiting_for_missing_object; - map replay_queue; - - // recovery - map objects_pulling; // which objects are currently being pulled - -public: - void clear_primary_state(); - - public: - bool is_acting(int osd) const { - for (unsigned i=0; i peers_complete_thru) { - peers_complete_thru = t; - return true; - } - return false; - } - - void proc_replica_log(Log &olog, Missing& omissing, int from); - void merge_log(Log &olog, Missing& omissing, int from); - void proc_missing(Log &olog, Missing &omissing, int fromosd); - - void generate_backlog(); - void drop_backlog(); - - void trim_write_ahead(); - - void peer(ObjectStore::Transaction& t, map< int, map >& query_map); - - void activate(ObjectStore::Transaction& t); - - void cancel_recovery(); - bool do_recovery(); - void do_peer_recovery(); - - void clean_replicas(); - - off_t get_log_write_pos() { - return 0; - } - - public: - PG(OSD *o, pg_t p) : - osd(o), - info(p), - role(0), - state(0), - last_epoch_started_any(0), - last_complete_commit(0), - peers_complete_thru(0), - have_master_log(true) - { } - - pg_t get_pgid() const { return info.pgid; } - int get_nrep() const { return acting.size(); } - - int get_primary() { return acting.empty() ? -1:acting[0]; } - //int get_tail() { return acting.empty() ? 
-1:acting[ acting.size()-1 ]; } - //int get_acker() { return g_conf.osd_rep == OSD_REP_PRIMARY ? get_primary():get_tail(); } - int get_acker() { - if (g_conf.osd_rep == OSD_REP_PRIMARY || - acting.size() <= 1) - return get_primary(); - return acting[1]; - } - - int get_role() const { return role; } - void set_role(int r) { role = r; } - - bool is_primary() const { return role == PG_ROLE_HEAD; } - bool is_acker() const { return role == PG_ROLE_ACKER; } - bool is_head() const { return role == PG_ROLE_HEAD; } - bool is_middle() const { return role == PG_ROLE_MIDDLE; } - bool is_residual() const { return role == PG_ROLE_STRAY; } - - //int get_state() const { return state; } - bool state_test(int m) const { return (state & m) != 0; } - void state_set(int m) { state |= m; } - void state_clear(int m) { state &= ~m; } - - bool is_complete() const { return info.last_complete == info.last_update; } - - bool is_active() const { return state_test(STATE_ACTIVE); } - bool is_crashed() const { return state_test(STATE_CRASHED); } - bool is_replay() const { return state_test(STATE_REPLAY); } - //bool is_complete() { return state_test(STATE_COMPLETE); } - bool is_clean() const { return state_test(STATE_CLEAN); } - bool is_stray() const { return state_test(STATE_STRAY); } - - bool is_empty() const { return info.last_update == 0; } - - int num_active_ops() const { - return objects_pulling.size(); - } - - - // pg on-disk content - void clean_up_local(ObjectStore::Transaction& t); - - // pg on-disk state - void write_log(ObjectStore::Transaction& t); - void append_log(ObjectStore::Transaction& t, - PG::Log::Entry& logentry, - eversion_t trim_to); - void read_log(ObjectStore *store); - void trim_ondisklog_to(ObjectStore::Transaction& t, eversion_t v); - - - -}; - - - -inline ostream& operator<<(ostream& out, const PG::Info::History& h) -{ - return out << h.same_since << "/" << h.same_primary_since << "/" << h.same_acker_since; -} - -inline ostream& operator<<(ostream& out, const 
PG::Info& pgi) -{ - out << "pginfo(" << pgi.pgid; - if (pgi.is_empty()) - out << " empty"; - else - out << " v " << pgi.last_update << "/" << pgi.last_complete - << " (" << pgi.log_bottom << "," << pgi.last_update << "]" - << (pgi.log_backlog ? "+backlog":""); - out << " e " << pgi.last_epoch_started << "/" << pgi.last_epoch_finished - << " " << pgi.history - << ")"; - return out; -} - -inline ostream& operator<<(ostream& out, const PG::Log::Entry& e) -{ - return out << " " << e.version - << (e.is_delete() ? " - ": - (e.is_clone() ? " c ": - (e.is_modify() ? " m ": - " ? "))) - << e.oid << " by " << e.reqid; -} - -inline ostream& operator<<(ostream& out, const PG::Log& log) -{ - out << "log(" << log.bottom << "," << log.top << "]"; - if (log.backlog) out << "+backlog"; - return out; -} - -inline ostream& operator<<(ostream& out, const PG::Missing& missing) -{ - out << "missing(" << missing.num_missing(); - if (missing.num_lost()) out << ", " << missing.num_lost() << " lost"; - out << ")"; - return out; -} - -inline ostream& operator<<(ostream& out, const PG& pg) -{ - out << "pg[" << pg.info - << " r=" << pg.get_role(); - - if (pg.log.bottom != pg.info.log_bottom) - out << " (info mismatch, " << pg.log << ")"; - - if (pg.log.log.empty()) { - // shoudl it be? 
- if (pg.log.top.version - pg.log.bottom.version != 0) { - out << " (log bound mismatch, empty)"; - } - } else { - if (((pg.log.log.begin()->version.version - 1 != pg.log.bottom.version) && - !pg.log.backlog) || - (pg.log.log.rbegin()->version.version != pg.log.top.version)) { - out << " (log bound mismatch, actual=[" - << pg.log.log.begin()->version << "," - << pg.log.log.rbegin()->version << "] len=" << pg.log.log.size() << ")"; - } - } - - if (pg.get_role() == 0) out << " pct " << pg.peers_complete_thru; - if (!pg.have_master_log) out << " !hml"; - if (pg.is_active()) out << " active"; - if (pg.is_crashed()) out << " crashed"; - if (pg.is_replay()) out << " replay"; - if (pg.is_clean()) out << " clean"; - if (pg.is_stray()) out << " stray"; - //out << " (" << pg.log.bottom << "," << pg.log.top << "]"; - if (pg.missing.num_missing()) out << " m=" << pg.missing.num_missing(); - if (pg.missing.num_lost()) out << " l=" << pg.missing.num_lost(); - out << "]"; - - - return out; -} - - -inline ostream& operator<<(ostream& out, PG::RepOpGather& repop) -{ - out << "repop(" << &repop << " rep_tid=" << repop.rep_tid - << " wfack=" << repop.waitfor_ack - << " wfcommit=" << repop.waitfor_commit; - out << " pct=" << repop.pg_complete_thru; - out << " op=" << *(repop.op); - out << " repop=" << &repop; - out << ")"; - return out; -} - - -#endif diff --git a/branches/marnberg/quota/osd/osd_types.h b/branches/marnberg/quota/osd/osd_types.h deleted file mode 100644 index f8656e1f3e178..0000000000000 --- a/branches/marnberg/quota/osd/osd_types.h +++ /dev/null @@ -1,174 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __OSD_TYPES_H -#define __OSD_TYPES_H - -#include "include/reqid.h" - -#define PG_INO 1 - - -// osd types -typedef __uint64_t coll_t; // collection id - -// pg stuff -typedef __uint16_t ps_t; -typedef __uint8_t pruleset_t; - -// placement group id -struct pg_t { - union { - struct { - __uint32_t preferred:32; // 32 - ps_t ps:16; // 16 - __uint8_t nrep:8; // 8 - pruleset_t ruleset:8; // 8 - } fields; - __uint64_t val; // 64 - } u; - - pg_t() { u.val = 0; } - pg_t(const pg_t& o) { u.val = o.u.val; } - pg_t(ps_t s, int p, unsigned char n, pruleset_t r=0) { - u.fields.ps = s; - u.fields.preferred = p; - u.fields.nrep = n; - u.fields.ruleset = r; - } - pg_t(__uint64_t v) { u.val = v; } - /* - pg_t operator=(__uint64_t v) { u.val = v; return *this; } - pg_t operator&=(__uint64_t v) { u.val &= v; return *this; } - pg_t operator+=(pg_t o) { u.val += o.val; return *this; } - pg_t operator-=(pg_t o) { u.val -= o.val; return *this; } - pg_t operator++() { ++u.val; return *this; } - */ - operator __uint64_t() const { return u.val; } - - object_t to_object() const { return object_t(PG_INO, u.val >> 32, u.val & 0xffffffff); } -}; - -inline ostream& operator<<(ostream& out, pg_t pg) { - //return out << hex << pg.val << dec; - if (pg.u.fields.ruleset) - out << (int)pg.u.fields.ruleset << '.'; - out << (int)pg.u.fields.nrep << '.'; - if (pg.u.fields.preferred) - out << pg.u.fields.preferred << '.'; - out << hex << pg.u.fields.ps << dec; - out << "=" << hex << pg.u.val << dec; - out << "=" << hex << (__uint64_t)pg << dec; - return out; -} - -namespace __gnu_cxx { - template<> struct hash< pg_t > - { - size_t operator()( const pg_t& x ) const - { - static hash<__uint64_t> H; - return H(x); - } - }; -} - - - -// compound rados version type -class eversion_t { -public: - epoch_t epoch; - version_t version; - eversion_t(epoch_t e=0, version_t v=0) : epoch(e), version(v) {} -}; - -inline bool operator==(const eversion_t& l, const eversion_t& r) { - return (l.epoch == 
r.epoch) && (l.version == r.version); -} -inline bool operator!=(const eversion_t& l, const eversion_t& r) { - return (l.epoch != r.epoch) || (l.version != r.version); -} -inline bool operator<(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version < r.version):(l.epoch < r.epoch); -} -inline bool operator<=(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version <= r.version):(l.epoch <= r.epoch); -} -inline bool operator>(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version > r.version):(l.epoch > r.epoch); -} -inline bool operator>=(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version >= r.version):(l.epoch >= r.epoch); -} -inline ostream& operator<<(ostream& out, const eversion_t e) { - return out << e.epoch << "'" << e.version; -} - - - - - -// ----------------------------------------- - -class ObjectExtent { - public: - object_t oid; // object id - off_t start; // in object - size_t length; // in object - - objectrev_t rev; // which revision? - pg_t pgid; // where to find the object - - map buffer_extents; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!) - - ObjectExtent() : start(0), length(0), rev(0), pgid(0) {} - ObjectExtent(object_t o, off_t s=0, size_t l=0) : oid(o), start(s), length(l), rev(0), pgid(0) { } -}; - -inline ostream& operator<<(ostream& out, ObjectExtent &ex) -{ - return out << "extent(" - << ex.oid << " in " << hex << ex.pgid << dec - << " " << ex.start << "~" << ex.length - << ")"; -} - - - -// --------------------------------------- - -class OSDSuperblock { -public: - const static __uint64_t MAGIC = 0xeb0f505dULL; - __uint64_t magic; - __uint64_t fsid; // unique fs id (random number) - int whoami; // my role in this fs. - epoch_t current_epoch; // most recent epoch - epoch_t oldest_map, newest_map; // oldest/newest maps we have. 
- OSDSuperblock(__uint64_t f=0, int w=0) : - magic(MAGIC), fsid(f), whoami(w), - current_epoch(0), oldest_map(0), newest_map(0) {} -}; - -inline ostream& operator<<(ostream& out, OSDSuperblock& sb) -{ - return out << "sb(fsid " << sb.fsid - << " osd" << sb.whoami - << " e" << sb.current_epoch - << " [" << sb.oldest_map << "," << sb.newest_map - << "])"; -} - - -#endif diff --git a/branches/marnberg/quota/osd/rush.cc b/branches/marnberg/quota/osd/rush.cc deleted file mode 100644 index aebca7ac1a351..0000000000000 --- a/branches/marnberg/quota/osd/rush.cc +++ /dev/null @@ -1,230 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -// -// -// rush.cc -// -// $Id$ -// - -#include -#include -#include -#include "rush.h" - - -static -unsigned int -myhash (unsigned int n) -{ - unsigned int v = (n ^ 0xdead1234) * (884811920 * 3 + 1); - return (v); -} - -Rush::Rush () -{ - nClusters = 0; - totalServers = 0; -} - -//---------------------------------------------------------------------- -// -// Rush::AddCluster -// -// Add a cluster. The number of servers in the cluster and -// the weight of each server is passed. The current number of -// clusters is returned. 
-// -//---------------------------------------------------------------------- -int -Rush::AddCluster (int nServers, double weight) -{ - clusterSize[nClusters] = nServers; - clusterWeight[nClusters] = weight; - if (nClusters == 0) { - serversInPrevious[0] = 0; - totalWeightBefore[0] = 0.0; - } else { - serversInPrevious[nClusters] = serversInPrevious[nClusters-1] + - clusterSize[nClusters-1]; - totalWeightBefore[nClusters] = - totalWeightBefore[nClusters-1] + (double)clusterSize[nClusters-1] * - clusterWeight[nClusters-1]; - } - nClusters += 1; - totalServers += nServers; -#if 0 - for (int i = 0; i < nClusters; i++) { - fprintf (stderr, "size=%-3d prev=%-3d weight=%-6.2f prevWeight=%-8.2f\n", - clusterSize[i], serversInPrevious[i], clusterWeight[i], - totalWeightBefore[i]); - } -#endif - return (nClusters); -} - - -//---------------------------------------------------------------------- -// -// Rush::GetServersByKey -// -// This function returns a list of servers on which an object -// should be placed. The servers array must be large enough to -// contain the list. -// -//---------------------------------------------------------------------- -void -Rush::GetServersByKey (int key, int nReplicas, int servers[]) -{ - int replicasLeft = nReplicas; - int cluster; - int mustAssign, numberAssigned; - int i, toDraw; - int *srv = servers; - double myWeight; - RushRNG rng; - - // There may not be more replicas than servers! 
- assert (nReplicas <= totalServers); - - for (cluster = nClusters-1; (cluster >= 0) && (replicasLeft > 0); cluster--) { - if (serversInPrevious[cluster] < replicasLeft) { - mustAssign = replicasLeft - serversInPrevious[cluster]; - } else { - mustAssign = 0; - } - toDraw = replicasLeft - mustAssign; - if (toDraw > (clusterSize[cluster] - mustAssign)) { - toDraw = clusterSize[cluster] - mustAssign; - } - myWeight = (double)clusterSize[cluster] * clusterWeight[cluster]; - rng.Seed (myhash (key)^cluster, cluster^0xb90738); - numberAssigned = mustAssign + - rng.HyperGeometricWeighted (toDraw, myWeight, - totalWeightBefore[cluster] + myWeight, - clusterWeight[cluster]); - if (numberAssigned > 0) { - rng.Seed (myhash (key)^cluster ^ 11, cluster^0xfea937); - rng.DrawKofN (srv, numberAssigned, clusterSize[cluster]); - for (i = 0; i < numberAssigned; i++) { - srv[i] += serversInPrevious[cluster]; - } - replicasLeft -= numberAssigned; - srv += numberAssigned; - } - } -} - - - -//---------------------------------------------------------------------- -// -// RushRNG::HyperGeometricWeighted -// -// Use an iterative method to generate a hypergeometric random -// variable. This approach guarantees that, if the number of draws -// is reduced, the number of successes must be as well as long as -// the seed for the RNG is the same. -// -//---------------------------------------------------------------------- -int -RushRNG::HyperGeometricWeighted (int nDraws, double yesWeighted, - double totalWeighted, double weightOne) -{ - int positives = 0, i; - double curRand; - - // If the weight is too small (or is negative), choose zero objects. - if (weightOne <= 1e-9 || nDraws == 0) { - return (0); - } - - // Draw nDraws items from the "bag". For each positive, subtract off - // the weight of an object from the weight of positives remaining. For - // each draw, subtract off the weight of an object from the total weight - // remaining. 
- for (i = 0; i < nDraws; i++) { - curRand = RandomDouble (); - if (curRand < (yesWeighted / totalWeighted)) { - positives += 1; - yesWeighted -= weightOne; - } - totalWeighted -= weightOne; - } - return (positives); -} - -//---------------------------------------------------------------------- -// -// RushRNG::DrawKofN -// -//---------------------------------------------------------------------- -void -RushRNG::DrawKofN (int vals[], int nToDraw, int setSize) -{ - int deck[setSize]; - int i, pick; - - assert(nToDraw <= setSize); - - for (i = 0; i < setSize; i++) { - deck[i] = i; - } - - for (i = 0; i < nToDraw; i++) { - pick = (int)(RandomDouble () * (double)(setSize - i)); - if (pick >= setSize-i) pick = setSize-i-1; // in case - // assert(i >= 0 && i < nToDraw); - // assert(pick >= 0 && pick < setSize); - vals[i] = deck[pick]; - deck[pick] = deck[setSize-i-1]; - } -} - -#define SEED_X 521288629 -#define SEED_Y 362436069 -RushRNG::RushRNG () -{ - Seed (0, 0); -} - -void -RushRNG::Seed (unsigned int seed1, unsigned int seed2) -{ - state1 = ((seed1 == 0) ? SEED_X : seed1); - state2 = ((seed2 == 0) ? 
SEED_Y : seed2); -} - -unsigned int -RushRNG::RandomInt () -{ - const unsigned int a = 18000; - const unsigned int b = 18879; - unsigned int rndValue; - - state1 = a * (state1 & 0xffff) + (state1 >> 16); - state2 = b * (state2 & 0xffff) + (state2 >> 16); - rndValue = (state1 << 16) + (state2 & 0xffff); - return (rndValue); -} - -double -RushRNG::RandomDouble () -{ - double v; - - v = (double)RandomInt() / (65536.0*65536.0); - return (v); -} diff --git a/branches/marnberg/quota/osd/rush.h b/branches/marnberg/quota/osd/rush.h deleted file mode 100644 index 3d880a32415e0..0000000000000 --- a/branches/marnberg/quota/osd/rush.h +++ /dev/null @@ -1,60 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -// -// -// rush.h -// -// Classes and definitions for the RUSH algorithm. 
-// -// $Id$ -// -// - -#ifndef _rush_h_ -#define _rush_h_ - -#define RUSH_MAX_CLUSTERS 100 - -class RushRNG { -public: - unsigned int RandomInt (); - double RandomDouble (); - void Seed (unsigned int a, unsigned int b); - int HyperGeometricWeighted (int nDraws, double yesWeighted, - double totalWeighted, double weightOne); - void DrawKofN (int vals[], int nToDraw, int setSize); - RushRNG(); -private: - unsigned int state1, state2; -}; - -class Rush { -public: - void GetServersByKey (int key, int nReplicas, int servers[]); - int AddCluster (int nServers, double weight); - int Clusters () {return (nClusters);} - int Servers () {return (totalServers);} - Rush (); -private: - int DrawKofN (int *servers, int n, int clusterSize, RushRNG *g); - int nClusters; - int totalServers; - int clusterSize[RUSH_MAX_CLUSTERS]; - int serversInPrevious[RUSH_MAX_CLUSTERS]; - double clusterWeight[RUSH_MAX_CLUSTERS]; - double totalWeightBefore[RUSH_MAX_CLUSTERS]; -}; - -#endif /* _rush_h_ */ diff --git a/branches/marnberg/quota/osd/tp.cc b/branches/marnberg/quota/osd/tp.cc deleted file mode 100644 index c8171895beef0..0000000000000 --- a/branches/marnberg/quota/osd/tp.cc +++ /dev/null @@ -1,80 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include -#include - -using namespace std; - -#include "common/Mutex.h" -#include "common/ThreadPool.h" -// #include - -class Op { - int i; - -public: - - Op(int i) - { - this->i = i; - } - - int get() - { - return i; - } -}; - -void foop(class TP *t, class Op *o); - -class TP { -public: - - void foo(Op *o) - { - cout << "Thread "<< pthread_self() << ": " << o->get() << "\n"; - usleep(1); - - // sched_yield(); - } - - int main(int argc, char *argv) - { - ThreadPool *t = new ThreadPool(10, (void (*)(TP*, Op*))foop, this); - - for(int i = 0; i < 100; i++) { - Op *o = new Op(i); - t->put_op(o); - } - - sleep(1); - - delete(t); - - return 0; - } -}; - -void foop(class TP *t, class Op *o) { - t->foo(o); -} - -int main(int argc, char *argv) { - TP t; - - t.main(argc,argv); -} - diff --git a/branches/marnberg/quota/osdc/Blinker.h b/branches/marnberg/quota/osdc/Blinker.h deleted file mode 100644 index 231fe47fb1e31..0000000000000 --- a/branches/marnberg/quota/osdc/Blinker.h +++ /dev/null @@ -1,91 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __BLINKER_H -#define __BLINKER_H - -class Blinker { - - public: - - class Op { - int op; - static const int LOOKUP = 1; - static const int INSERT = 2; - static const int REMOVE = 3; - static const int CLEAR = 4; - Op(int o) : op(o) {} - }; - - class OpLookup : public Op { - public: - bufferptr key; - OpLookup(bufferptr& k) : Op(Op::LOOKUP), key(k) {} - }; - - class OpInsert : public Op { - bufferptr key; - bufferlist val; - OpInsert(bufferptr& k, bufferlist& v) : Op(Op::INSERT), key(k), val(v) {} - }; - - class OpRemove : public Op { - public: - bufferptr key; - OpRemove(bufferptr& k) : Op(Op::REMOVE), key(k) {} - }; - - class OpClear : public Op { - public: - OpClear() : Op(Op::CLEAR) {} - }; - - - -private: - Objecter *objecter; - - // in-flight operations. - - - // cache information about tree structure. - - - -public: - // public interface - - // simple accessors - void lookup(inode_t& inode, bufferptr& key, bufferlist *pval, Context *onfinish); - - // simple modifiers - void insert(inode_t& inode, bufferptr& key, bufferlist& val, Context *onack, Context *onsafe); - void remove(inode_t& inode, bufferptr& key, Context *onack, Context *onsafe); - void clear(inode_t& inode, Context *onack, Context *onsafe); - - // these are dangerous: the table may be large. - void listkeys(inode_t& inode, list* pkeys, Context *onfinish); - void listvals(inode_t& inode, list* pkeys, list* pvals, Context *onfinish); - - // fetch *at least* key, but also anything else that is convenient. - // include lexical bounds for which this is a complete result. 
- // (if *start and *end are empty, it's the entire table) - void prefetch(inode_t& inode, bufferptr& key, - list* pkeys, list* pvals, - bufferptr *start, bufferptr *end, - Context *onfinish); - - -}; - -#endif diff --git a/branches/marnberg/quota/osdc/Filer.cc b/branches/marnberg/quota/osdc/Filer.cc deleted file mode 100644 index 2a2871e5b9e37..0000000000000 --- a/branches/marnberg/quota/osdc/Filer.cc +++ /dev/null @@ -1,235 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include - -#include "Filer.h" -#include "osd/OSDMap.h" - -//#include "messages/MOSDRead.h" -//#include "messages/MOSDReadReply.h" -//#include "messages/MOSDWrite.h" -//#include "messages/MOSDWriteReply.h" -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" -#include "messages/MOSDMap.h" - -#include "msg/Messenger.h" - -#include "include/Context.h" - -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_filer) cout << g_clock.now() << " " << objecter->messenger->get_myname() << ".filer " - - -class Filer::C_Probe : public Context { -public: - Filer *filer; - Probe *probe; - object_t oid; - off_t size; - C_Probe(Filer *f, Probe *p, object_t o) : filer(f), probe(p), oid(o), size(0) {} - void finish(int r) { - filer->_probed(probe, oid, size); - } -}; - -int Filer::probe_fwd(inode_t& inode, - off_t start_from, - off_t *end, - Context *onfinish) -{ - dout(10) << "probe_fwd " << hex << inode.ino << dec << " starting from " << start_from << endl; - - Probe *probe = new Probe(inode, start_from, end, onfinish); - - // period (bytes before we jump unto a new set of object(s)) - off_t period = 
inode.layout.period(); - - // start with 1+ periods. - probe->probing_len = period; - if (start_from % period) - probe->probing_len += period - (start_from % period); - - _probe(probe); - return 0; -} - -void Filer::_probe(Probe *probe) -{ - dout(10) << "_probe " << hex << probe->inode.ino << dec << " " << probe->from << "~" << probe->probing_len << endl; - - // map range onto objects - file_to_extents(probe->inode, probe->from, probe->probing_len, probe->probing); - - for (list::iterator p = probe->probing.begin(); - p != probe->probing.end(); - p++) { - dout(10) << "_probe probing " << p->oid << endl; - C_Probe *c = new C_Probe(this, probe, p->oid); - probe->ops[p->oid] = objecter->stat(p->oid, &c->size, c); - } -} - -void Filer::_probed(Probe *probe, object_t oid, off_t size) -{ - dout(10) << "_probed " << probe->inode.ino << " object " << hex << oid << dec << " has size " << size << endl; - - probe->known[oid] = size; - assert(probe->ops.count(oid)); - probe->ops.erase(oid); - - if (!probe->ops.empty()) - return; // waiting for more! - - // analyze! - off_t end = 0; - for (list::iterator p = probe->probing.begin(); - p != probe->probing.end(); - p++) { - off_t shouldbe = p->length+p->start; - dout(10) << "_probed " << probe->inode.ino << " object " << hex << p->oid << dec - << " should be " << shouldbe - << ", actual is " << probe->known[p->oid] - << endl; - - if (probe->known[p->oid] < 0) { end = -1; break; } // error! - - assert(probe->known[p->oid] <= shouldbe); - if (shouldbe == probe->known[p->oid]) continue; // keep going - - // aha, we found the end! - // calc offset into buffer_extent to get distance from probe->from. 
- off_t oleft = probe->known[p->oid] - p->start; - for (map::iterator i = p->buffer_extents.begin(); - i != p->buffer_extents.end(); - i++) { - if (oleft <= (off_t)i->second) { - end = probe->from + i->first + oleft; - dout(10) << "_probed end is in buffer_extent " << i->first << "~" << i->second << " off " << oleft - << ", from was " << probe->from << ", end is " << end - << endl; - break; - } - oleft -= i->second; - } - break; - } - - if (end == 0) { - // keep probing! - dout(10) << "_probed didn't find end, probing further" << endl; - off_t period = probe->inode.layout.object_size * probe->inode.layout.stripe_count; - probe->from += probe->probing_len; - probe->probing_len = period; - _probe(probe); - return; - } - - if (end < 0) { - dout(10) << "_probed encountered an error while probing" << endl; - *probe->end = -1; - } else { - // hooray! - dout(10) << "_probed found end at " << end << endl; - *probe->end = end; - } - - // done! finish and clean up. - probe->onfinish->finish(end > 0 ? 0:-1); - delete probe->onfinish; - delete probe; -} - - -void Filer::file_to_extents(inode_t inode, - off_t offset, size_t len, - list& extents, - objectrev_t rev) -{ - dout(10) << "file_to_extents " << offset << "~" << len - << " on " << hex << inode.ino << dec - << endl; - - /* we want only one extent per object! - * this means that each extent we read may map into different bits of the - * final read buffer.. 
hence OSDExtent.buffer_extents - */ - map< object_t, ObjectExtent > object_extents; - - assert(inode.layout.object_size >= inode.layout.stripe_size); - off_t stripes_per_object = inode.layout.object_size / inode.layout.stripe_size; - dout(20) << " stripes_per_object " << stripes_per_object << endl; - - off_t cur = offset; - off_t left = len; - while (left > 0) { - // layout into objects - off_t blockno = cur / inode.layout.stripe_size; - off_t stripeno = blockno / inode.layout.stripe_count; - off_t stripepos = blockno % inode.layout.stripe_count; - off_t objectsetno = stripeno / stripes_per_object; - off_t objectno = objectsetno * inode.layout.stripe_count + stripepos; - - // find oid, extent - ObjectExtent *ex = 0; - object_t oid( inode.ino, objectno ); - if (object_extents.count(oid)) - ex = &object_extents[oid]; - else { - ex = &object_extents[oid]; - ex->oid = oid; - ex->rev = rev; - ex->pgid = objecter->osdmap->object_to_pg( oid, inode.layout ); - } - - // map range into object - off_t block_start = (stripeno % stripes_per_object)*inode.layout.stripe_size; - off_t block_off = cur % inode.layout.stripe_size; - off_t max = inode.layout.stripe_size - block_off; - - off_t x_offset = block_start + block_off; - off_t x_len; - if (left > max) - x_len = max; - else - x_len = left; - - if (ex->start + (off_t)ex->length == x_offset) { - // add to extent - ex->length += x_len; - } else { - // new extent - assert(ex->length == 0); - assert(ex->start == 0); - ex->start = x_offset; - ex->length = x_len; - } - ex->buffer_extents[cur-offset] = x_len; - - dout(15) << "file_to_extents " << *ex << " in " << ex->pgid << endl; - //cout << "map: ino " << ino << " oid " << ex.oid << " osd " << ex.osd << " offset " << ex.offset << " len " << ex.len << " ... 
left " << left << endl; - - left -= x_len; - cur += x_len; - } - - // make final list - for (map::iterator it = object_extents.begin(); - it != object_extents.end(); - it++) { - extents.push_back(it->second); - } -} diff --git a/branches/marnberg/quota/osdc/Filer.h b/branches/marnberg/quota/osdc/Filer.h deleted file mode 100644 index 161bfec304531..0000000000000 --- a/branches/marnberg/quota/osdc/Filer.h +++ /dev/null @@ -1,158 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __FILER_H -#define __FILER_H - -/*** Filer - * - * stripe file ranges onto objects. - * build list for the objecter or objectcacher. - * - * also, provide convenience methods that call objecter for you. - * - * "files" are identified by ino. 
- */ - -#include -#include -using namespace std; - -#include -#include -using namespace __gnu_cxx; - -#include "include/types.h" - -#include "osd/OSDMap.h" -#include "Objecter.h" - -class Context; -class Messenger; -class OSDMap; - - -/**** Filer interface ***/ - -class Filer { - Objecter *objecter; - - // probes - struct Probe { - inode_t inode; - off_t from; - off_t *end; - Context *onfinish; - - list probing; - off_t probing_len; - - map known; - map ops; - - Probe(inode_t &i, off_t f, off_t *e, Context *c) : - inode(i), from(f), end(e), onfinish(c), probing_len(0) {} - }; - - class C_Probe; - //friend class C_Probe; - - void _probe(Probe *p); - void _probed(Probe *p, object_t oid, off_t size); - - public: - Filer(Objecter *o) : objecter(o) {} - ~Filer() {} - - bool is_active() { - return objecter->is_active(); // || (oc && oc->is_active()); - } - - /*** async file interface ***/ - int read(inode_t& inode, - off_t offset, - size_t len, - bufferlist *bl, // ptr to data - Context *onfinish) { - Objecter::OSDRead *rd = new Objecter::OSDRead(bl); - file_to_extents(inode, offset, len, rd->extents); - return objecter->readx(rd, onfinish) > 0 ? 0:-1; - } - - int write(inode_t& inode, - off_t offset, - size_t len, - bufferlist& bl, - int flags, - Context *onack, - Context *oncommit, - objectrev_t rev=0) { - Objecter::OSDWrite *wr = new Objecter::OSDWrite(bl); - file_to_extents(inode, offset, len, wr->extents, rev); - return objecter->modifyx(wr, onack, oncommit) > 0 ? 0:-1; - } - - int zero(inode_t& inode, - off_t offset, - size_t len, - Context *onack, - Context *oncommit) { - Objecter::OSDModify *z = new Objecter::OSDModify(OSD_OP_ZERO); - file_to_extents(inode, offset, len, z->extents); - return objecter->modifyx(z, onack, oncommit) > 0 ? 
0:-1; - } - - int remove(inode_t& inode, - off_t offset, - size_t len, - Context *onack, - Context *oncommit) { - Objecter::OSDModify *z = new Objecter::OSDModify(OSD_OP_DELETE); - file_to_extents(inode, offset, len, z->extents); - return objecter->modifyx(z, onack, oncommit) > 0 ? 0:-1; - } - - int probe_fwd(inode_t& inode, - off_t start_from, - off_t *end, - Context *onfinish); - - - /***** mapping *****/ - - /* map (ino, ono) to an object name - (to be used on any osd in the proper replica group) */ - /*object_t file_to_object(inodeno_t ino, - size_t _ono) { - __uint64_t ono = _ono; - assert(ino < (1ULL<& extents, - objectrev_t rev=0); - -}; - - - -#endif diff --git a/branches/marnberg/quota/osdc/Journaler.cc b/branches/marnberg/quota/osdc/Journaler.cc deleted file mode 100644 index dee6448b3494d..0000000000000 --- a/branches/marnberg/quota/osdc/Journaler.cc +++ /dev/null @@ -1,610 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "Journaler.h" - -#include "include/Context.h" -#include "common/Logger.h" -#include "msg/Messenger.h" - -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cout << g_clock.now() << " " << objecter->messenger->get_myname() << ".journaler " -#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cerr << g_clock.now() << " " << objecter->messenger->get_myname() << ".journaler " - - - -void Journaler::reset() -{ - dout(1) << "reset to blank journal" << endl; - state = STATE_ACTIVE; - write_pos = flush_pos = ack_pos = - read_pos = requested_pos = received_pos = - expire_pos = trimming_pos = trimmed_pos = inode.layout.period(); -} - - -/***************** HEADER *******************/ - -ostream& operator<<(ostream& out, Journaler::Header &h) -{ - return out << "loghead(trim " << h.trimmed_pos - << ", expire " << h.expire_pos - << ", read " << h.read_pos - << ", write " << h.write_pos - << ")"; -} - -class Journaler::C_ReadHead : public Context { - Journaler *ls; -public: - bufferlist bl; - C_ReadHead(Journaler *l) : ls(l) {} - void finish(int r) { - ls->_finish_read_head(r, bl); - } -}; - -class Journaler::C_ProbeEnd : public Context { - Journaler *ls; -public: - off_t end; - C_ProbeEnd(Journaler *l) : ls(l), end(-1) {} - void finish(int r) { - ls->_finish_probe_end(r, end); - } -}; - -void Journaler::recover(Context *onread) -{ - assert(state != STATE_ACTIVE); - - if (onread) - waitfor_recover.push_back(onread); - - if (state != STATE_UNDEF) { - dout(1) << "recover - already recoverying" << endl; - return; - } - - dout(1) << "read_head" << endl; - state = STATE_READHEAD; - C_ReadHead *fin = new C_ReadHead(this); - filer.read(inode, 0, sizeof(Header), &fin->bl, fin); -} - -void Journaler::_finish_read_head(int r, bufferlist& bl) -{ - assert(state == STATE_READHEAD); - - if (bl.length() == 0) { - dout(1) << "_finish_read_head r=" << r << " read 0 bytes, assuming empty log" << endl; - 
state = STATE_ACTIVE; - list ls; - ls.swap(waitfor_recover); - finish_contexts(ls, 0); - return; - } - - // unpack header - Header h; - assert(bl.length() == sizeof(h)); - bl.copy(0, sizeof(h), (char*)&h); - - write_pos = flush_pos = ack_pos = h.write_pos; - read_pos = requested_pos = received_pos = h.read_pos; - expire_pos = h.expire_pos; - trimmed_pos = trimming_pos = h.trimmed_pos; - - dout(1) << "_finish_read_head " << h << ". probing for end of log (from " << write_pos << ")..." << endl; - - // probe the log - state = STATE_PROBING; - C_ProbeEnd *fin = new C_ProbeEnd(this); - filer.probe_fwd(inode, h.write_pos, &fin->end, fin); -} - -void Journaler::_finish_probe_end(int r, off_t end) -{ - assert(state == STATE_PROBING); - - if (end == -1) { - end = write_pos; - dout(1) << "_finish_probe_end write_pos = " << end - << " (header had " << write_pos << "). log was empty. recovered." - << endl; - assert(0); // hrm. - } else { - assert(end >= write_pos); - assert(r >= 0); - dout(1) << "_finish_probe_end write_pos = " << end - << " (header had " << write_pos << "). recovered." - << endl; - } - - write_pos = flush_pos = ack_pos = end; - - // done. 
- list ls; - ls.swap(waitfor_recover); - finish_contexts(ls, 0); -} - - -// WRITING - -class Journaler::C_WriteHead : public Context { -public: - Journaler *ls; - Header h; - Context *oncommit; - C_WriteHead(Journaler *l, Header& h_, Context *c) : ls(l), h(h_), oncommit(c) {} - void finish(int r) { - ls->_finish_write_head(h, oncommit); - } -}; - -void Journaler::write_head(Context *oncommit) -{ - assert(state == STATE_ACTIVE); - last_written.trimmed_pos = trimmed_pos; - last_written.expire_pos = expire_pos; - last_written.read_pos = read_pos; - last_written.write_pos = ack_pos; //write_pos; - dout(10) << "write_head " << last_written << endl; - - last_wrote_head = g_clock.now(); - - bufferlist bl; - bl.append((char*)&last_written, sizeof(last_written)); - filer.write(inode, 0, bl.length(), bl, 0, - 0, new C_WriteHead(this, last_written, oncommit)); -} - -void Journaler::_finish_write_head(Header &wrote, Context *oncommit) -{ - dout(10) << "_finish_write_head " << wrote << endl; - last_committed = wrote; - if (oncommit) { - oncommit->finish(0); - delete oncommit; - } - - trim(); // trim? -} - - -/***************** WRITING *******************/ - -class Journaler::C_Flush : public Context { - Journaler *ls; - off_t start; -public: - C_Flush(Journaler *l, off_t s) : ls(l), start(s) {} - void finish(int r) { ls->_finish_flush(r, start); } -}; - -void Journaler::_finish_flush(int r, off_t start) -{ - assert(r>=0); - - assert(start >= ack_pos); - assert(start < flush_pos); - assert(pending_flush.count(start)); - - // calc latency? 
- if (logger) { - utime_t lat = g_clock.now(); - lat -= pending_flush[start]; - logger->finc("lsum", lat); - logger->inc("lnum"); - } - - pending_flush.erase(start); - - // adjust ack_pos - if (pending_flush.empty()) - ack_pos = flush_pos; - else - ack_pos = pending_flush.begin()->first; - - dout(10) << "_finish_flush from " << start - << ", pending_flush now " << pending_flush - << ", write positions now " << write_pos << "/" << flush_pos << "/" << ack_pos - << endl; - - // kick waiters <= ack_pos - while (!waitfor_flush.empty()) { - if (waitfor_flush.begin()->first > ack_pos) break; - finish_contexts(waitfor_flush.begin()->second); - waitfor_flush.erase(waitfor_flush.begin()); - } -} - - -off_t Journaler::append_entry(bufferlist& bl, Context *onsync) -{ - size_t s = bl.length(); - - if (!g_conf.journaler_allow_split_entries) { - // will we span a stripe boundary? - int p = inode.layout.stripe_size; - if (write_pos / p != (write_pos + (off_t)(bl.length() + sizeof(s))) / p) { - // yes. - // move write_pos forward. - off_t owp = write_pos; - write_pos += p; - write_pos -= (write_pos % p); - - // pad with zeros. - bufferptr bp(write_pos - owp); - bp.zero(); - assert(bp.length() >= 4); - write_buf.push_back(bp); - - // now flush. - flush(); - - dout(12) << "append_entry skipped " << (write_pos-owp) << " bytes to " << write_pos << " to avoid spanning stripe boundary" << endl; - } - } - - dout(10) << "append_entry len " << bl.length() << " to " << write_pos << "~" << (bl.length() + sizeof(size_t)) << endl; - - // append - write_buf.append((char*)&s, sizeof(s)); - write_buf.append(bl); - write_pos += sizeof(s) + s; - - // flush now? 
- if (onsync) - flush(onsync); - - return write_pos; -} - - -void Journaler::flush(Context *onsync) -{ - if (write_pos == flush_pos) { - assert(write_buf.length() == 0); - dout(10) << "flush nothing to flush, write pointers at " << write_pos << "/" << flush_pos << "/" << ack_pos << endl; - - if (onsync) { - onsync->finish(0); - delete onsync; - } - return; - } - - unsigned len = write_pos - flush_pos; - assert(len == write_buf.length()); - dout(10) << "flush flushing " << flush_pos << "~" << len << endl; - - // submit write for anything pending - filer.write(inode, flush_pos, len, write_buf, 0, - new C_Flush(this, flush_pos), 0); // flush _start_ pos to _finish_flush - pending_flush[flush_pos] = g_clock.now(); - - // adjust pointers - flush_pos = write_pos; - write_buf.clear(); - - dout(10) << "flush write pointers now at " << write_pos << "/" << flush_pos << "/" << ack_pos << endl; - - // queue waiter (at _new_ write_pos; will go when reached by ack_pos) - if (onsync) - waitfor_flush[write_pos].push_back(onsync); - - // write head? - if (last_wrote_head.sec() + 30 < g_clock.now().sec()) { - write_head(); - } -} - - - -/***************** READING *******************/ - - -class Journaler::C_Read : public Context { - Journaler *ls; -public: - C_Read(Journaler *l) : ls(l) {} - void finish(int r) { ls->_finish_read(r); } -}; - -class Journaler::C_RetryRead : public Context { - Journaler *ls; -public: - C_RetryRead(Journaler *l) : ls(l) {} - void finish(int r) { ls->is_readable(); } // this'll kickstart. 
-}; - -void Journaler::_finish_read(int r) -{ - assert(r>=0); - - dout(10) << "_finish_read got " << received_pos << "~" << reading_buf.length() << endl; - received_pos += reading_buf.length(); - read_buf.claim_append(reading_buf); - assert(received_pos <= requested_pos); - dout(10) << "_finish_read read_buf now " << read_pos << "~" << read_buf.length() - << ", read pointers " << read_pos << "/" << received_pos << "/" << requested_pos - << endl; - - if (is_readable()) { // NOTE: this check may read more - // readable! - dout(10) << "_finish_read now readable" << endl; - if (on_readable) { - Context *f = on_readable; - on_readable = 0; - f->finish(0); - delete f; - } - - if (read_bl) { - bool r = try_read_entry(*read_bl); - assert(r); // this should have worked. - - // clear state - Context *f = on_read_finish; - on_read_finish = 0; - read_bl = 0; - - // do callback - f->finish(0); - delete f; - } - } - - // prefetch? - _prefetch(); -} - -/* NOTE: this could be slightly smarter... we could allow - * multiple reads to be in progress. e.g., if we prefetch, but - * then discover we need even more for an especially large entry. - * i don't think that circumstance will arise particularly often. - */ -void Journaler::_issue_read(off_t len) -{ - if (_is_reading()) { - dout(10) << "_issue_read " << len << " waiting, already reading " - << received_pos << "~" << (requested_pos-received_pos) << endl; - return; - } - assert(requested_pos == received_pos); - - // stuck at ack_pos? 
- assert(requested_pos <= ack_pos); - if (requested_pos == ack_pos) { - dout(10) << "_issue_read requested_pos = ack_pos = " << ack_pos << ", waiting" << endl; - assert(write_pos > requested_pos); - if (flush_pos == ack_pos) - flush(); - assert(flush_pos > ack_pos); - waitfor_flush[flush_pos].push_back(new C_RetryRead(this)); - return; - } - - // don't read too much - if (requested_pos + len > ack_pos) { - len = ack_pos - requested_pos; - dout(10) << "_issue_read reading only up to ack_pos " << ack_pos << endl; - } - - // go. - dout(10) << "_issue_read reading " << requested_pos << "~" << len - << ", read pointers " << read_pos << "/" << received_pos << "/" << (requested_pos+len) - << endl; - - filer.read(inode, requested_pos, len, &reading_buf, - new C_Read(this)); - requested_pos += len; -} - -void Journaler::_prefetch() -{ - // prefetch? - off_t left = requested_pos - read_pos; - if (left <= prefetch_from && // should read more, - !_is_reading() && // and not reading anything right now - write_pos > requested_pos) { // there's something more to read... - dout(10) << "_prefetch only " << left << " < " << prefetch_from - << ", prefetching " << endl; - _issue_read(fetch_len); - } -} - - -void Journaler::read_entry(bufferlist *bl, Context *onfinish) -{ - // only one read at a time! - assert(read_bl == 0); - assert(on_read_finish == 0); - - if (is_readable()) { - dout(10) << "read_entry at " << read_pos << ", read_buf is " - << read_pos << "~" << read_buf.length() - << ", readable now" << endl; - - // nice, just do it now. 
- bool r = try_read_entry(*bl); - assert(r); - - // callback - onfinish->finish(0); - delete onfinish; - } else { - dout(10) << "read_entry at " << read_pos << ", read_buf is " - << read_pos << "~" << read_buf.length() - << ", not readable now" << endl; - - bl->clear(); - - // set it up - read_bl = bl; - on_read_finish = onfinish; - - // is_readable() will have already initiated a read (if it was possible) - } -} - - -/* is_readable() - * return true if next entry is ready. - * kickstart read as necessary. - */ -bool Journaler::is_readable() -{ - // anything to read? - if (read_pos == write_pos) return false; - - // have enough for entry size? - size_t s = 0; - if (read_buf.length() >= sizeof(s)) - read_buf.copy(0, sizeof(s), (char*)&s); - - // entry and payload? - if (read_buf.length() >= sizeof(s) && - read_buf.length() >= sizeof(s) + s) - return true; // yep, next entry is ready. - - // darn it! - - // partial fragment at the end? - if (received_pos == write_pos) { - dout(10) << "is_readable() detected partial entry at tail, adjusting write_pos to " << read_pos << endl; - write_pos = flush_pos = ack_pos = read_pos; - assert(write_buf.length() == 0); - - // truncate? - // FIXME: how much? - - return false; - } - - // start reading some more? - if (!_is_reading()) { - if (s) - fetch_len = MAX(fetch_len, (off_t)(sizeof(s)+s-read_buf.length())); - _issue_read(fetch_len); - } - - return false; -} - - -/* try_read_entry(bl) - * read entry into bl if it's ready. - * otherwise, do nothing. (well, we'll start fetching it for good measure.) - */ -bool Journaler::try_read_entry(bufferlist& bl) -{ - if (!is_readable()) { // this may start a read. 
- dout(10) << "try_read_entry at " << read_pos << " not readable" << endl; - return false; - } - - size_t s; - assert(read_buf.length() >= sizeof(s)); - read_buf.copy(0, sizeof(s), (char*)&s); - assert(read_buf.length() >= sizeof(s) + s); - - dout(10) << "try_read_entry at " << read_pos << " reading " - << read_pos << "~" << (sizeof(s)+s) << endl; - - // do it - assert(bl.length() == 0); - read_buf.splice(0, sizeof(s)); - read_buf.splice(0, s, &bl); - read_pos += sizeof(s) + s; - - // prefetch? - _prefetch(); - return true; -} - -void Journaler::wait_for_readable(Context *onreadable) -{ - dout(10) << "wait_for_readable at " << read_pos << " onreadable " << onreadable << endl; - assert(!is_readable()); - assert(on_readable == 0); - on_readable = onreadable; -} - - - - -/***************** TRIMMING *******************/ - - -class Journaler::C_Trim : public Context { - Journaler *ls; - off_t to; -public: - C_Trim(Journaler *l, off_t t) : ls(l), to(t) {} - void finish(int r) { - ls->_trim_finish(r, to); - } -}; - -void Journaler::trim() -{ - off_t trim_to = last_committed.expire_pos; - trim_to -= trim_to % inode.layout.period(); - dout(10) << "trim last_commited head was " << last_committed - << ", can trim to " << trim_to - << endl; - if (trim_to == 0 || trim_to == trimming_pos) { - dout(10) << "trim already trimmed/trimming to " - << trimmed_pos << "/" << trimming_pos << endl; - return; - } - - // trim - assert(trim_to <= write_pos); - assert(trim_to > trimming_pos); - dout(10) << "trim trimming to " << trim_to - << ", trimmed/trimming/expire are " - << trimmed_pos << "/" << trimming_pos << "/" << expire_pos - << endl; - - filer.remove(inode, trimming_pos, trim_to-trimming_pos, - 0, new C_Trim(this, trim_to)); - trimming_pos = trim_to; -} - -void Journaler::_trim_finish(int r, off_t to) -{ - dout(10) << "_trim_finish trimmed_pos was " << trimmed_pos - << ", trimmed/trimming/expire now " - << to << "/" << trimming_pos << "/" << expire_pos - << endl; - assert(r >= 0); - 
- assert(to <= trimming_pos); - assert(to > trimmed_pos); - trimmed_pos = to; - - // finishers? - while (!waitfor_trim.empty() && - waitfor_trim.begin()->first <= trimmed_pos) { - finish_contexts(waitfor_trim.begin()->second, 0); - waitfor_trim.erase(waitfor_trim.begin()); - } -} - - -// eof. diff --git a/branches/marnberg/quota/osdc/Journaler.h b/branches/marnberg/quota/osdc/Journaler.h deleted file mode 100644 index 0b8d7061330e8..0000000000000 --- a/branches/marnberg/quota/osdc/Journaler.h +++ /dev/null @@ -1,218 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -/* Journaler - * - * This class stripes a serial log over objects on the store. Four logical pointers: - * - * write_pos - where we're writing new entries - * read_pos - where we're reading old entires - * expire_pos - what is deemed "old" by user - * trimmed_pos - where we're expiring old items - * - * trimmed_pos <= expire_pos <= read_pos <= write_pos. - * - * Often, read_pos <= write_pos (as with MDS log). During recovery, write_pos is undefined - * until the end of the log is discovered. - * - * A "head" struct at the beginning of the log is used to store metadata at - * regular intervals. The basic invariants include: - * - * head.read_pos <= read_pos -- the head may "lag", since it's updated lazily. - * head.write_pos <= write_pos - * head.expire_pos <= expire_pos - * head.trimmed_pos <= trimmed_pos - * - * More significantly, - * - * head.expire_pos >= trimmed_pos -- this ensures we can find the "beginning" of the log - * as last recorded, before it is trimmed. trimming will - * block until a sufficiently current expire_pos is committed. 
- * - * To recover log state, we simply start at the last write_pos in the head, and probe the - * object sequence sizes until we read the end. - * - * Head struct is stored in the first object. Actual journal starts after layout.period() bytes. - * - */ - -#ifndef __JOURNALER_H -#define __JOURNALER_H - -#include "Objecter.h" -#include "Filer.h" - -#include -#include - -class Context; -class Logger; - -class Journaler { - - // this goes at the head of the log "file". - struct Header { - off_t trimmed_pos; - off_t expire_pos; - off_t read_pos; - off_t write_pos; - Header() : trimmed_pos(0), expire_pos(0), read_pos(0), write_pos(0) {} - } last_written, last_committed; - - friend ostream& operator<<(ostream& out, Header &h); - - - // me - inode_t inode; - Objecter *objecter; - Filer filer; - - Logger *logger; - - // my state - static const int STATE_UNDEF = 0; - static const int STATE_READHEAD = 1; - static const int STATE_PROBING = 2; - static const int STATE_ACTIVE = 2; - - int state; - - // header - utime_t last_wrote_head; - void _finish_write_head(Header &wrote, Context *oncommit); - class C_WriteHead; - friend class C_WriteHead; - - list waitfor_recover; - void _finish_read_head(int r, bufferlist& bl); - void _finish_probe_end(int r, off_t end); - class C_ReadHead; - friend class C_ReadHead; - class C_ProbeEnd; - friend class C_ProbeEnd; - - - - // writer - off_t write_pos; // logical write position, where next entry will go - off_t flush_pos; // where we will flush. if write_pos>flush_pos, we're buffering writes. - off_t ack_pos; // what has been acked. - bufferlist write_buf; // write buffer. flush_pos + write_buf.length() == write_pos. - - std::map pending_flush; // start offsets and times for pending flushes - std::map > waitfor_flush; // when flushed through given offset - - void _finish_flush(int r, off_t start); - class C_Flush; - friend class C_Flush; - - // reader - off_t read_pos; // logical read position, where next entry starts. 
- off_t requested_pos; // what we've requested from OSD. - off_t received_pos; // what we've received from OSD. - bufferlist read_buf; // read buffer. read_pos + read_buf.length() == prefetch_pos. - bufferlist reading_buf; // what i'm reading into - - off_t fetch_len; // how much to read at a time - off_t prefetch_from; // how far from end do we read next chunk - - // for read_entry() in-progress read - bufferlist *read_bl; - Context *on_read_finish; - // for wait_for_readable() - Context *on_readable; - - bool _is_reading() { - return requested_pos > received_pos; - } - void _finish_read(int r); // we just read some (read completion callback) - void _issue_read(off_t len); // read some more - void _prefetch(); // maybe read ahead - class C_Read; - friend class C_Read; - class C_RetryRead; - friend class C_RetryRead; - - // trimmer - off_t expire_pos; // what we're allowed to trim to - off_t trimming_pos; // what we've requested to trim through - off_t trimmed_pos; // what has been trimmed - map > waitfor_trim; - - void _trim_finish(int r, off_t to); - class C_Trim; - friend class C_Trim; - -public: - Journaler(inode_t& inode_, Objecter *obj, Logger *l, off_t fl=0, off_t pff=0) : - inode(inode_), objecter(obj), filer(objecter), logger(l), - state(STATE_UNDEF), - write_pos(0), flush_pos(0), ack_pos(0), - read_pos(0), requested_pos(0), received_pos(0), - fetch_len(fl), prefetch_from(pff), - read_bl(0), on_read_finish(0), on_readable(0), - expire_pos(0), trimming_pos(0), trimmed_pos(0) - { - // prefetch intelligently. - // (watch out, this is big if you use big objects or weird striping) - if (!fetch_len) - fetch_len = inode.layout.object_size*inode.layout.stripe_count; - if (!prefetch_from) - prefetch_from = fetch_len / 2; - } - - // me - //void open(Context *onopen); - //void claim(Context *onclaim, msg_addr_t from); - - /* reset - * NOTE: we assume the caller knows/has ensured that any objects - * in our sequence do not exist.. e.g. after a MKFS. 
this is _not_ - * an "erase" method. - */ - void reset(); - void recover(Context *onfinish); - void write_head(Context *onsave=0); - - bool is_active() { return state == STATE_ACTIVE; } - - off_t get_write_pos() const { return write_pos; } - off_t get_read_pos() const { return read_pos; } - off_t get_expire_pos() const { return expire_pos; } - off_t get_trimmed_pos() const { return trimmed_pos; } - - // write - off_t append_entry(bufferlist& bl, Context *onsync = 0); - void flush(Context *onsync = 0); - - // read - void set_read_pos(off_t p) { - assert(requested_pos == received_pos); // we can't cope w/ in-progress read right now. - assert(read_bl == 0); // ... - read_pos = requested_pos = received_pos = p; - read_buf.clear(); - } - bool is_readable(); - bool try_read_entry(bufferlist& bl); - void wait_for_readable(Context *onfinish); - void read_entry(bufferlist* bl, Context *onfinish); - - // trim - void set_expire_pos(off_t ep) { expire_pos = ep; } - void trim(); - //bool is_trimmable() { return trimming_pos < expire_pos; } - //void trim(off_t trim_to=0, Context *c=0); -}; - - -#endif diff --git a/branches/marnberg/quota/osdc/ObjectCacher.cc b/branches/marnberg/quota/osdc/ObjectCacher.cc deleted file mode 100644 index 0933675ae2880..0000000000000 --- a/branches/marnberg/quota/osdc/ObjectCacher.cc +++ /dev/null @@ -1,1499 +0,0 @@ - -#include "msg/Messenger.h" -#include "ObjectCacher.h" -#include "Objecter.h" - - - -/*** ObjectCacher::BufferHead ***/ - - -/*** ObjectCacher::Object ***/ - -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_objectcacher) cout << g_clock.now() << " " << oc->objecter->messenger->get_myname() << ".objectcacher.object(" << oid << ") " - - -ObjectCacher::BufferHead *ObjectCacher::Object::split(BufferHead *bh, off_t off) -{ - dout(20) << "split " << *bh << " at " << off << endl; - - // split off right - ObjectCacher::BufferHead *right = new BufferHead(this); - right->last_write_tid = bh->last_write_tid; - 
right->set_state(bh->get_state()); - - off_t newleftlen = off - bh->start(); - right->set_start( off ); - right->set_length( bh->length() - newleftlen ); - - // shorten left - oc->bh_stat_sub(bh); - bh->set_length( newleftlen ); - oc->bh_stat_add(bh); - - // add right - oc->bh_add(this, right); - - // split buffers too - bufferlist bl; - bl.claim(bh->bl); - if (bl.length()) { - assert(bl.length() == (bh->length() + right->length())); - right->bl.substr_of(bl, bh->length(), right->length()); - bh->bl.substr_of(bl, 0, bh->length()); - } - - // move read waiters - if (!bh->waitfor_read.empty()) { - map >::iterator o, p = bh->waitfor_read.end(); - p--; - while (p != bh->waitfor_read.begin()) { - if (p->first < right->start()) break; - dout(0) << "split moving waiters at byte " << p->first << " to right bh" << endl; - right->waitfor_read[p->first].swap( p->second ); - o = p; - p--; - bh->waitfor_read.erase(o); - } - } - - dout(20) << "split left is " << *bh << endl; - dout(20) << "split right is " << *right << endl; - return right; -} - - -void ObjectCacher::Object::merge_left(BufferHead *left, BufferHead *right) -{ - assert(left->end() == right->start()); - assert(left->get_state() == right->get_state()); - - dout(10) << "merge_left " << *left << " + " << *right << endl; - oc->bh_remove(this, right); - oc->bh_stat_sub(left); - left->set_length( left->length() + right->length()); - oc->bh_stat_add(left); - - // data - left->bl.claim_append(right->bl); - - // version - // note: this is sorta busted, but should only be used for dirty buffers - left->last_write_tid = MAX( left->last_write_tid, right->last_write_tid ); - left->last_write = MAX( left->last_write, right->last_write ); - - // waiters - for (map >::iterator p = right->waitfor_read.begin(); - p != right->waitfor_read.end(); - p++) - left->waitfor_read[p->first].splice( left->waitfor_read[p->first].begin(), - p->second ); - - // hose right - delete right; - - dout(10) << "merge_left result " << *left << endl; -} 
- - - -/* - * map a range of bytes into buffer_heads. - * - create missing buffer_heads as necessary. - */ -int ObjectCacher::Object::map_read(Objecter::OSDRead *rd, - map& hits, - map& missing, - map& rx) -{ - for (list::iterator ex_it = rd->extents.begin(); - ex_it != rd->extents.end(); - ex_it++) { - - if (ex_it->oid != oid) continue; - - dout(10) << "map_read " << ex_it->oid - << " " << ex_it->start << "~" << ex_it->length << endl; - - map::iterator p = data.lower_bound(ex_it->start); - // p->first >= start - - off_t cur = ex_it->start; - off_t left = ex_it->length; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - - while (left > 0) { - // at end? - if (p == data.end()) { - // rest is a miss. - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( left ); - oc->bh_add(this, n); - missing[cur] = n; - dout(20) << "map_read miss " << left << " left, " << *n << endl; - cur += left; - left -= left; - assert(left == 0); - assert(cur == ex_it->start + (off_t)ex_it->length); - break; // no more. - } - - if (p->first <= cur) { - // have it (or part of it) - BufferHead *e = p->second; - - if (e->is_clean() || - e->is_dirty() || - e->is_tx()) { - hits[cur] = e; // readable! - dout(20) << "map_read hit " << *e << endl; - } - else if (e->is_rx()) { - rx[cur] = e; // missing, not readable. - dout(20) << "map_read rx " << *e << endl; - } - else assert(0); - - off_t lenfromcur = MIN(e->end() - cur, left); - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; // more? - - } else if (p->first > cur) { - // gap.. miss - off_t next = p->first; - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( MIN(next - cur, left) ); - oc->bh_add(this,n); - missing[cur] = n; - cur += MIN(left, n->length()); - left -= MIN(left, n->length()); - dout(20) << "map_read gap " << *n << endl; - continue; // more? 
- } - else - assert(0); - } - } - return(0); -} - -/* - * map a range of extents on an object's buffer cache. - * - combine any bh's we're writing into one - * - break up bufferheads that don't fall completely within the range - * //no! - return a bh that includes the write. may also include other dirty data to left and/or right. - */ -ObjectCacher::BufferHead *ObjectCacher::Object::map_write(Objecter::OSDWrite *wr) -{ - BufferHead *final = 0; - - for (list::iterator ex_it = wr->extents.begin(); - ex_it != wr->extents.end(); - ex_it++) { - - if (ex_it->oid != oid) continue; - - dout(10) << "map_write oex " << ex_it->oid - << " " << ex_it->start << "~" << ex_it->length << endl; - - map::iterator p = data.lower_bound(ex_it->start); - // p->first >= start - - off_t cur = ex_it->start; - off_t left = ex_it->length; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap or butt up! - - /*// dirty and butts up? - if (p->first + p->second->length() == cur && - p->second->is_dirty()) { - dout(10) << "map_write will append to tail of " << *p->second << endl; - final = p->second; - } - */ - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - - while (left > 0) { - off_t max = left; - - // at end ? - if (p == data.end()) { - if (final == NULL) { - final = new BufferHead(this); - final->set_start( cur ); - final->set_length( max ); - oc->bh_add(this, final); - dout(10) << "map_write adding trailing bh " << *final << endl; - } else { - final->set_length( final->length() + max ); - } - left -= max; - cur += max; - continue; - } - - dout(10) << "p is " << *p->second << endl; - - if (p->first <= cur) { - BufferHead *bh = p->second; - dout(10) << "map_write bh " << *bh << " intersected" << endl; - - /*if (bh->is_dirty()) { - // already dirty, let's use it. 
- final = bh; - } else { - */ - if (p->first < cur) { - assert(final == 0); - if (cur + max >= p->first + p->second->length()) { - // we want right bit (one splice) - final = split(bh, cur); // just split it, take right half. - p++; - assert(p->second == final); - } else { - // we want middle bit (two splices) - final = split(bh, cur); - p++; - assert(p->second == final); - split(final, cur+max); - } - } else if (p->first == cur) { - /*if (bh->is_dirty()) { - // already dirty, use it. - } - else*/ - if (p->second->length() <= max) { - // whole bufferhead, piece of cake. - } else { - // we want left bit (one splice) - split(bh, cur + max); // just split - } - if (final) - merge_left(final,bh); - else - final = bh; - } - - // keep going. - off_t lenfromcur = final->end() - cur; - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; - } else { - // gap! - off_t next = p->first; - off_t glen = MIN(next - cur, max); - dout(10) << "map_write gap " << cur << "~" << glen << endl; - if (final) { - final->set_length( final->length() + glen ); - } else { - final = new BufferHead(this); - final->set_start( cur ); - final->set_length( glen ); - oc->bh_add(this, final); - } - - cur += glen; - left -= glen; - continue; // more? - } - } - } - - // set versoin - assert(final); - dout(10) << "map_write final is " << *final << endl; - - return final; -} - - - -/*** ObjectCacher ***/ - -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_objectcacher) cout << g_clock.now() << " " << objecter->messenger->get_myname() << ".objectcacher " - - - -/* private */ - -void ObjectCacher::close_object(Object *ob) -{ - dout(10) << "close_object " << *ob << endl; - assert(ob->can_close()); - - // ok! 
- objects.erase(ob->get_oid()); - objects_by_ino[ob->get_ino()].erase(ob); - if (objects_by_ino[ob->get_ino()].empty()) - objects_by_ino.erase(ob->get_ino()); - delete ob; -} - - - - -void ObjectCacher::bh_read(BufferHead *bh) -{ - dout(7) << "bh_read on " << *bh << endl; - - mark_rx(bh); - - // finisher - C_ReadFinish *onfinish = new C_ReadFinish(this, bh->ob->get_oid(), bh->start(), bh->length()); - - // go - objecter->read(bh->ob->get_oid(), bh->start(), bh->length(), &onfinish->bl, - onfinish); -} - -void ObjectCacher::bh_read_finish(object_t oid, off_t start, size_t length, bufferlist &bl) -{ - //lock.Lock(); - dout(7) << "bh_read_finish " - << oid - << " " << start << "~" << length - << endl; - - if (objects.count(oid) == 0) { - dout(7) << "bh_read_finish no object cache" << endl; - } else { - Object *ob = objects[oid]; - - // apply to bh's! - off_t opos = start; - map::iterator p = ob->data.lower_bound(opos); - - while (p != ob->data.end() && - opos < start+(off_t)length) { - BufferHead *bh = p->second; - - if (bh->start() > opos) { - dout(1) << "weirdness: gap when applying read results, " - << opos << "~" << bh->start() - opos - << endl; - opos = bh->start(); - continue; - } - - if (!bh->is_rx()) { - dout(10) << "bh_read_finish skipping non-rx " << *bh << endl; - opos = bh->end(); - p++; - continue; - } - - assert(opos >= bh->start()); - assert(bh->start() == opos); // we don't merge rx bh's... yet! - assert(bh->length() <= start+(off_t)length-opos); - - bh->bl.substr_of(bl, - opos-bh->start(), - bh->length()); - mark_clean(bh); - dout(10) << "bh_read_finish read " << *bh << endl; - - opos = bh->end(); - p++; - - // finishers? - // called with lock held. 
- list ls; - for (map >::iterator p = bh->waitfor_read.begin(); - p != bh->waitfor_read.end(); - p++) - ls.splice(ls.end(), p->second); - bh->waitfor_read.clear(); - finish_contexts(ls); - } - } - //lock.Unlock(); -} - - -void ObjectCacher::bh_write(BufferHead *bh) -{ - dout(7) << "bh_write " << *bh << endl; - - // finishers - C_WriteAck *onack = new C_WriteAck(this, bh->ob->get_oid(), bh->start(), bh->length()); - C_WriteCommit *oncommit = new C_WriteCommit(this, bh->ob->get_oid(), bh->start(), bh->length()); - - // go - tid_t tid = objecter->write(bh->ob->get_oid(), bh->start(), bh->length(), bh->bl, - onack, oncommit); - - // set bh last_write_tid - onack->tid = tid; - oncommit->tid = tid; - bh->ob->last_write_tid = tid; - bh->last_write_tid = tid; - - mark_tx(bh); -} - -void ObjectCacher::lock_ack(list& oids, tid_t tid) -{ - for (list::iterator i = oids.begin(); - i != oids.end(); - i++) { - object_t oid = *i; - - if (objects.count(oid) == 0) { - dout(7) << "lock_ack no object cache" << endl; - assert(0); - } - - Object *ob = objects[oid]; - - list ls; - - assert(tid <= ob->last_write_tid); - if (ob->last_write_tid == tid) { - dout(10) << "lock_ack " << *ob - << " tid " << tid << endl; - - switch (ob->lock_state) { - case Object::LOCK_RDUNLOCKING: - case Object::LOCK_WRUNLOCKING: - ob->lock_state = Object::LOCK_NONE; - break; - case Object::LOCK_RDLOCKING: - case Object::LOCK_DOWNGRADING: - ob->lock_state = Object::LOCK_RDLOCK; - ls.splice(ls.begin(), ob->waitfor_rd); - break; - case Object::LOCK_UPGRADING: - case Object::LOCK_WRLOCKING: - ob->lock_state = Object::LOCK_WRLOCK; - ls.splice(ls.begin(), ob->waitfor_wr); - ls.splice(ls.begin(), ob->waitfor_rd); - break; - - default: - assert(0); - } - - ob->last_ack_tid = tid; - - if (ob->can_close()) - close_object(ob); - } else { - dout(10) << "lock_ack " << *ob - << " tid " << tid << " obsolete" << endl; - } - - // waiters? 
- if (ob->waitfor_ack.count(tid)) { - ls.splice(ls.end(), ob->waitfor_ack[tid]); - ob->waitfor_ack.erase(tid); - } - - finish_contexts(ls); - - } -} - -void ObjectCacher::bh_write_ack(object_t oid, off_t start, size_t length, tid_t tid) -{ - //lock.Lock(); - - dout(7) << "bh_write_ack " - << oid - << " tid " << tid - << " " << start << "~" << length - << endl; - if (objects.count(oid) == 0) { - dout(7) << "bh_write_ack no object cache" << endl; - assert(0); - } else { - Object *ob = objects[oid]; - - // apply to bh's! - for (map::iterator p = ob->data.lower_bound(start); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - - if (bh->start() > start+(off_t)length) break; - - if (bh->start() < start && - bh->end() > start+(off_t)length) { - dout(20) << "bh_write_ack skipping " << *bh << endl; - continue; - } - - // make sure bh is tx - if (!bh->is_tx()) { - dout(10) << "bh_write_ack skipping non-tx " << *bh << endl; - continue; - } - - // make sure bh tid matches - if (bh->last_write_tid != tid) { - assert(bh->last_write_tid > tid); - dout(10) << "bh_write_ack newer tid on " << *bh << endl; - continue; - } - - // ok! mark bh clean. - mark_clean(bh); - dout(10) << "bh_write_ack clean " << *bh << endl; - } - - // update object last_ack. - assert(ob->last_ack_tid < tid); - ob->last_ack_tid = tid; - - // waiters? - if (ob->waitfor_ack.count(tid)) { - list ls; - ls.splice(ls.begin(), ob->waitfor_ack[tid]); - ob->waitfor_ack.erase(tid); - finish_contexts(ls); - } - } - //lock.Unlock(); -} - -void ObjectCacher::bh_write_commit(object_t oid, off_t start, size_t length, tid_t tid) -{ - //lock.Lock(); - - // update object last_commit - dout(7) << "bh_write_commit " - << oid - << " tid " << tid - << " " << start << "~" << length - << endl; - if (objects.count(oid) == 0) { - dout(7) << "bh_write_commit no object cache" << endl; - //assert(0); - } else { - Object *ob = objects[oid]; - - // update last_commit. - ob->last_commit_tid = tid; - - // waiters? 
- if (ob->waitfor_commit.count(tid)) { - list ls; - ls.splice(ls.begin(), ob->waitfor_commit[tid]); - ob->waitfor_commit.erase(tid); - finish_contexts(ls); - } - } - - // lock.Unlock(); -} - - -void ObjectCacher::flush(off_t amount) -{ - utime_t cutoff = g_clock.now(); - //cutoff.sec_ref() -= g_conf.client_oc_max_dirty_age; - - dout(10) << "flush " << amount << endl; - - /* - * NOTE: we aren't actually pulling things off the LRU here, just looking at the - * tail item. Then we call bh_write, which moves it to the other LRU, so that we - * can call lru_dirty.lru_get_next_expire() again. - */ - off_t did = 0; - while (amount == 0 || did < amount) { - BufferHead *bh = (BufferHead*) lru_dirty.lru_get_next_expire(); - if (!bh) break; - if (bh->last_write > cutoff) break; - - did += bh->length(); - bh_write(bh); - } -} - - -void ObjectCacher::trim(off_t max) -{ - if (max < 0) - max = g_conf.client_oc_size; - - dout(10) << "trim start: max " << max - << " clean " << get_stat_clean() - << endl; - - while (get_stat_clean() > max) { - BufferHead *bh = (BufferHead*) lru_rest.lru_expire(); - if (!bh) break; - - dout(10) << "trim trimming " << *bh << endl; - assert(bh->is_clean()); - - Object *ob = bh->ob; - bh_remove(ob, bh); - delete bh; - - if (ob->can_close()) { - dout(10) << "trim trimming " << *ob << endl; - close_object(ob); - } - } - - dout(10) << "trim finish: max " << max - << " clean " << get_stat_clean() - << endl; -} - - - -/* public */ - -/* - * returns # bytes read (if in cache). 
onfinish is untouched (caller must delete it) - * returns 0 if doing async read - */ -int ObjectCacher::readx(Objecter::OSDRead *rd, inodeno_t ino, Context *onfinish) -{ - bool success = true; - list hit_ls; - map stripe_map; // final buffer offset -> substring - - for (list::iterator ex_it = rd->extents.begin(); - ex_it != rd->extents.end(); - ex_it++) { - dout(10) << "readx " << *ex_it << endl; - - // get Object cache - Object *o = get_object(ex_it->oid, ino); - - // map extent into bufferheads - map hits, missing, rx; - o->map_read(rd, hits, missing, rx); - - if (!missing.empty() || !rx.empty()) { - // read missing - for (map::iterator bh_it = missing.begin(); - bh_it != missing.end(); - bh_it++) { - bh_read(bh_it->second); - if (success) { - dout(10) << "readx missed, waiting on " << *bh_it->second - << " off " << bh_it->first << endl; - success = false; - bh_it->second->waitfor_read[bh_it->first].push_back( new C_RetryRead(this, rd, ino, onfinish) ); - } - } - - // bump rx - for (map::iterator bh_it = rx.begin(); - bh_it != rx.end(); - bh_it++) { - touch_bh(bh_it->second); // bump in lru, so we don't lose it. - if (success) { - dout(10) << "readx missed, waiting on " << *bh_it->second - << " off " << bh_it->first << endl; - success = false; - bh_it->second->waitfor_read[bh_it->first].push_back( new C_RetryRead(this, rd, ino, onfinish) ); - } - } - } else { - assert(!hits.empty()); - - // make a plain list - for (map::iterator bh_it = hits.begin(); - bh_it != hits.end(); - bh_it++) { - dout(10) << "readx hit bh " << *bh_it->second << endl; - hit_ls.push_back(bh_it->second); - } - - // create reverse map of buffer offset -> object for the eventual result. 
- // this is over a single ObjectExtent, so we know that - // - the bh's are contiguous - // - the buffer frags need not be (and almost certainly aren't) - off_t opos = ex_it->start; - map::iterator bh_it = hits.begin(); - assert(bh_it->second->start() <= opos); - size_t bhoff = opos - bh_it->second->start(); - map::iterator f_it = ex_it->buffer_extents.begin(); - size_t foff = 0; - while (1) { - BufferHead *bh = bh_it->second; - assert(opos == (off_t)(bh->start() + bhoff)); - - dout(10) << "readx rmap opos " << opos - << ": " << *bh << " +" << bhoff - << " frag " << f_it->first << "~" << f_it->second << " +" << foff - << endl; - - size_t len = MIN(f_it->second - foff, - bh->length() - bhoff); - stripe_map[f_it->first].substr_of(bh->bl, - opos - bh->start(), - len); - opos += len; - bhoff += len; - foff += len; - if (opos == bh->end()) { - bh_it++; - bhoff = 0; - } - if (foff == f_it->second) { - f_it++; - foff = 0; - } - if (bh_it == hits.end()) break; - if (f_it == ex_it->buffer_extents.end()) break; - } - assert(f_it == ex_it->buffer_extents.end()); - assert(opos == ex_it->start + (off_t)ex_it->length); - } - } - - // bump hits in lru - for (list::iterator bhit = hit_ls.begin(); - bhit != hit_ls.end(); - bhit++) - touch_bh(*bhit); - - if (!success) return 0; // wait! - - // no misses... success! do the read. - assert(!hit_ls.empty()); - dout(10) << "readx has all buffers" << endl; - - // ok, assemble into result buffer. - rd->bl->clear(); - size_t pos = 0; - for (map::iterator i = stripe_map.begin(); - i != stripe_map.end(); - i++) { - assert(pos == i->first); - dout(10) << "readx adding buffer len " << i->second.length() << " at " << pos << endl; - pos += i->second.length(); - rd->bl->claim_append(i->second); - } - dout(10) << "readx result is " << rd->bl->length() << endl; - - // done with read. 
- delete rd; - - trim(); - - return pos; -} - - -int ObjectCacher::writex(Objecter::OSDWrite *wr, inodeno_t ino) -{ - utime_t now = g_clock.now(); - - for (list::iterator ex_it = wr->extents.begin(); - ex_it != wr->extents.end(); - ex_it++) { - // get object cache - Object *o = get_object(ex_it->oid, ino); - - // map it all into a single bufferhead. - BufferHead *bh = o->map_write(wr); - - // adjust buffer pointers (ie "copy" data into my cache) - // this is over a single ObjectExtent, so we know that - // - there is one contiguous bh - // - the buffer frags need not be (and almost certainly aren't) - // note: i assume striping is monotonic... no jumps backwards, ever! - off_t opos = ex_it->start; - for (map::iterator f_it = ex_it->buffer_extents.begin(); - f_it != ex_it->buffer_extents.end(); - f_it++) { - dout(10) << "writex writing " << f_it->first << "~" << f_it->second << " into " << *bh << " at " << opos << endl; - size_t bhoff = bh->start() - opos; - assert(f_it->second <= bh->length() - bhoff); - - bufferlist frag; - frag.substr_of(wr->bl, - f_it->first, f_it->second); - - bh->bl.claim_append(frag); - opos += f_it->second; - } - - // it's dirty. - mark_dirty(bh); - touch_bh(bh); - bh->last_write = now; - - // recombine with left? - map::iterator p = o->data.find(bh->start()); - if (p != o->data.begin()) { - p--; - if (p->second->is_dirty()) { - o->merge_left(p->second,bh); - bh = p->second; - } - } - // right? - p = o->data.find(bh->start()); - p++; - if (p != o->data.end() && - p->second->is_dirty()) - o->merge_left(p->second,bh); - } - - delete wr; - - trim(); - return 0; -} - - -// blocking wait for write. 
-void ObjectCacher::wait_for_write(size_t len, Mutex& lock) -{ - while (get_stat_dirty() > g_conf.client_oc_max_dirty) { - dout(10) << "wait_for_write waiting" << endl; - flusher_cond.Signal(); - stat_waiter++; - stat_cond.Wait(lock); - stat_waiter--; - dout(10) << "wait_for_write woke up" << endl; - } -} - -void ObjectCacher::flusher_entry() -{ - dout(10) << "flusher start" << endl; - lock.Lock(); - while (!flusher_stop) { - while (!flusher_stop) { - off_t all = get_stat_tx() + get_stat_rx() + get_stat_clean() + get_stat_dirty(); - dout(11) << "flusher " - << all << " / " << g_conf.client_oc_size << ": " - << get_stat_tx() << " tx, " - << get_stat_rx() << " rx, " - << get_stat_clean() << " clean, " - << get_stat_dirty() << " / " << g_conf.client_oc_max_dirty << " dirty" - << endl; - if (get_stat_dirty() > g_conf.client_oc_max_dirty) { - // flush some dirty pages - dout(10) << "flusher " - << get_stat_dirty() << " / " << g_conf.client_oc_max_dirty << " dirty," - << " flushing some dirty bhs" << endl; - flush(get_stat_dirty() - g_conf.client_oc_max_dirty); - } - else { - // check tail of lru for old dirty items - utime_t cutoff = g_clock.now(); - cutoff.sec_ref()--; - BufferHead *bh = 0; - while ((bh = (BufferHead*)lru_dirty.lru_get_next_expire()) != 0 && - bh->last_write < cutoff) { - dout(10) << "flusher flushing aged dirty bh " << *bh << endl; - bh_write(bh); - } - break; - } - } - if (flusher_stop) break; - flusher_cond.WaitInterval(lock, utime_t(1,0)); - } - lock.Unlock(); - dout(10) << "flusher finish" << endl; -} - - - -// blocking. atomic+sync. -int ObjectCacher::atomic_sync_readx(Objecter::OSDRead *rd, inodeno_t ino, Mutex& lock) -{ - dout(10) << "atomic_sync_readx " << rd - << " in " << ino - << endl; - - if (rd->extents.size() == 1) { - // single object. - // just write synchronously. 
- Cond cond; - bool done = false; - objecter->readx(rd, new C_SafeCond(&lock, &cond, &done)); - - // block - while (!done) cond.Wait(lock); - } else { - // spans multiple objects, or is big. - - // sort by object... - map by_oid; - for (list::iterator ex_it = rd->extents.begin(); - ex_it != rd->extents.end(); - ex_it++) - by_oid[ex_it->oid] = *ex_it; - - // lock - for (map::iterator i = by_oid.begin(); - i != by_oid.end(); - i++) { - Object *o = get_object(i->first, ino); - rdlock(o); - } - - // readx will hose rd - list extents = rd->extents; - - // do the read, into our cache - Cond cond; - bool done = false; - readx(rd, ino, new C_SafeCond(&lock, &cond, &done)); - - // block - while (!done) cond.Wait(lock); - - // release the locks - for (list::iterator ex_it = extents.begin(); - ex_it != extents.end(); - ex_it++) { - assert(objects.count(ex_it->oid)); - Object *o = objects[ex_it->oid]; - rdunlock(o); - } - } - - return 0; -} - -int ObjectCacher::atomic_sync_writex(Objecter::OSDWrite *wr, inodeno_t ino, Mutex& lock) -{ - dout(10) << "atomic_sync_writex " << wr - << " in " << ino - << endl; - - if (wr->extents.size() == 1 && - wr->extents.front().length <= g_conf.client_oc_max_sync_write) { - // single object. - - // make sure we aren't already locking/locked... - object_t oid = wr->extents.front().oid; - Object *o = 0; - if (objects.count(oid)) o = get_object(oid, ino); - if (!o || - (o->lock_state != Object::LOCK_WRLOCK && - o->lock_state != Object::LOCK_WRLOCKING && - o->lock_state != Object::LOCK_UPGRADING)) { - // just write synchronously. - dout(10) << "atomic_sync_writex " << wr - << " in " << ino - << " doing sync write" - << endl; - - Cond cond; - bool done = false; - objecter->modifyx(wr, new C_SafeCond(&lock, &cond, &done), 0); - - // block - while (!done) cond.Wait(lock); - return 0; - } - } - - // spans multiple objects, or is big. - // sort by object... 
- map by_oid; - for (list::iterator ex_it = wr->extents.begin(); - ex_it != wr->extents.end(); - ex_it++) - by_oid[ex_it->oid] = *ex_it; - - // wrlock - for (map::iterator i = by_oid.begin(); - i != by_oid.end(); - i++) { - Object *o = get_object(i->first, ino); - wrlock(o); - } - - // writex will hose wr - list extents = wr->extents; - - // do the write, into our cache - writex(wr, ino); - - // flush - // ...and release the locks? - for (list::iterator ex_it = extents.begin(); - ex_it != extents.end(); - ex_it++) { - assert(objects.count(ex_it->oid)); - Object *o = objects[ex_it->oid]; - - wrunlock(o); - } - - return 0; -} - - - -// locking ----------------------------- - -void ObjectCacher::rdlock(Object *o) -{ - // lock? - if (o->lock_state == Object::LOCK_NONE || - o->lock_state == Object::LOCK_RDUNLOCKING || - o->lock_state == Object::LOCK_WRUNLOCKING) { - dout(10) << "rdlock rdlock " << *o << endl; - - o->lock_state = Object::LOCK_RDLOCKING; - - C_LockAck *ack = new C_LockAck(this, o->get_oid()); - C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); - - commit->tid = - ack->tid = - o->last_write_tid = - objecter->lock(OSD_OP_RDLOCK, o->get_oid(), ack, commit); - } - - // stake our claim. - o->rdlock_ref++; - - // wait? - if (o->lock_state == Object::LOCK_RDLOCKING || - o->lock_state == Object::LOCK_WRLOCKING) { - dout(10) << "rdlock waiting for rdlock|wrlock on " << *o << endl; - Cond cond; - bool done = false; - o->waitfor_rd.push_back(new C_SafeCond(&lock, &cond, &done)); - while (!done) cond.Wait(lock); - } - assert(o->lock_state == Object::LOCK_RDLOCK || - o->lock_state == Object::LOCK_WRLOCK || - o->lock_state == Object::LOCK_UPGRADING || - o->lock_state == Object::LOCK_DOWNGRADING); -} - -void ObjectCacher::wrlock(Object *o) -{ - // lock? 
- if (o->lock_state != Object::LOCK_WRLOCK && - o->lock_state != Object::LOCK_WRLOCKING && - o->lock_state != Object::LOCK_UPGRADING) { - dout(10) << "wrlock wrlock " << *o << endl; - - int op = 0; - if (o->lock_state == Object::LOCK_RDLOCK) { - o->lock_state = Object::LOCK_UPGRADING; - op = OSD_OP_UPLOCK; - } else { - o->lock_state = Object::LOCK_WRLOCKING; - op = OSD_OP_WRLOCK; - } - - C_LockAck *ack = new C_LockAck(this, o->get_oid()); - C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); - - commit->tid = - ack->tid = - o->last_write_tid = - objecter->lock(op, o->get_oid(), ack, commit); - } - - // stake our claim. - o->wrlock_ref++; - - // wait? - if (o->lock_state == Object::LOCK_WRLOCKING || - o->lock_state == Object::LOCK_UPGRADING) { - dout(10) << "wrlock waiting for wrlock on " << *o << endl; - Cond cond; - bool done = false; - o->waitfor_wr.push_back(new C_SafeCond(&lock, &cond, &done)); - while (!done) cond.Wait(lock); - } - assert(o->lock_state == Object::LOCK_WRLOCK); -} - - -void ObjectCacher::rdunlock(Object *o) -{ - dout(10) << "rdunlock " << *o << endl; - assert(o->lock_state == Object::LOCK_RDLOCK || - o->lock_state == Object::LOCK_WRLOCK || - o->lock_state == Object::LOCK_UPGRADING || - o->lock_state == Object::LOCK_DOWNGRADING); - - assert(o->rdlock_ref > 0); - o->rdlock_ref--; - if (o->rdlock_ref > 0 || - o->wrlock_ref > 0) { - dout(10) << "rdunlock " << *o << " still has rdlock|wrlock refs" << endl; - return; - } - - release(o); // release first - - o->lock_state = Object::LOCK_RDUNLOCKING; - - C_LockAck *lockack = new C_LockAck(this, o->get_oid()); - C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); - commit->tid = - lockack->tid = - o->last_write_tid = - objecter->lock(OSD_OP_RDUNLOCK, o->get_oid(), lockack, commit); -} - -void ObjectCacher::wrunlock(Object *o) -{ - dout(10) << "wrunlock " << *o << endl; - assert(o->lock_state == Object::LOCK_WRLOCK); - - assert(o->wrlock_ref > 0); - o->wrlock_ref--; - if 
(o->wrlock_ref > 0) { - dout(10) << "wrunlock " << *o << " still has wrlock refs" << endl; - return; - } - - flush(o); // flush first - - int op = 0; - if (o->rdlock_ref > 0) { - dout(10) << "wrunlock rdlock " << *o << endl; - op = OSD_OP_DNLOCK; - o->lock_state = Object::LOCK_DOWNGRADING; - } else { - dout(10) << "wrunlock wrunlock " << *o << endl; - op = OSD_OP_WRUNLOCK; - o->lock_state = Object::LOCK_WRUNLOCKING; - } - - C_LockAck *lockack = new C_LockAck(this, o->get_oid()); - C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); - commit->tid = - lockack->tid = - o->last_write_tid = - objecter->lock(op, o->get_oid(), lockack, commit); -} - - -// ------------------------------------------------- - - -bool ObjectCacher::set_is_cached(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) - return false; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - if (!ob->data.empty()) return true; - } - - return false; -} - -bool ObjectCacher::set_is_dirty_or_committing(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) - return false; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - for (map::iterator p = ob->data.begin(); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - if (bh->is_dirty() || bh->is_tx()) - return true; - } - } - - return false; -} - - -// purge. non-blocking. violently removes dirty buffers from cache. -void ObjectCacher::purge(Object *ob) -{ - dout(10) << "purge " << *ob << endl; - - for (map::iterator p = ob->data.begin(); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - dout(0) << "purge forcibly removing " << *bh << endl; - bh_remove(ob, bh); - delete bh; - } - - if (ob->can_close()) { - dout(10) << "trim trimming " << *ob << endl; - close_object(ob); - } -} - -// flush. non-blocking. no callback. -// true if clean, already flushed. -// false if we wrote something. 
-bool ObjectCacher::flush(Object *ob) -{ - bool clean = true; - for (map::iterator p = ob->data.begin(); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - if (bh->is_tx()) { - clean = false; - continue; - } - if (!bh->is_dirty()) continue; - - bh_write(bh); - clean = false; - } - return clean; -} - -// flush. non-blocking, takes callback. -// returns true if already flushed -bool ObjectCacher::flush_set(inodeno_t ino, Context *onfinish) -{ - if (objects_by_ino.count(ino) == 0) { - dout(10) << "flush_set on " << ino << " dne" << endl; - return true; - } - - dout(10) << "flush_set " << ino << endl; - - C_Gather *gather = 0; // we'll need to wait for all objects to flush! - - set& s = objects_by_ino[ino]; - bool safe = true; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - if (!flush(ob)) { - // we'll need to gather... - if (!gather && onfinish) - gather = new C_Gather(onfinish); - safe = false; - - dout(10) << "flush_set " << ino << " will wait for ack tid " - << ob->last_write_tid - << " on " << *ob - << endl; - if (gather) - ob->waitfor_ack[ob->last_write_tid].push_back(gather->new_sub()); - } - } - - if (safe) { - dout(10) << "flush_set " << ino << " has no dirty|tx bhs" << endl; - return true; - } - return false; -} - - -// commit. non-blocking, takes callback. -// return true if already flushed. -bool ObjectCacher::commit_set(inodeno_t ino, Context *onfinish) -{ - assert(onfinish); // doesn't make any sense otherwise. - - if (objects_by_ino.count(ino) == 0) { - dout(10) << "commit_set on " << ino << " dne" << endl; - return true; - } - - dout(10) << "commit_set " << ino << endl; - - C_Gather *gather = 0; // we'll need to wait for all objects to commit - - set& s = objects_by_ino[ino]; - bool safe = true; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - // make sure it's flushing. 
- flush_set(ino); - - if (ob->last_write_tid > ob->last_commit_tid) { - dout(10) << "commit_set " << ino << " " << *ob - << " will finish on commit tid " << ob->last_write_tid - << endl; - if (!gather && onfinish) gather = new C_Gather(onfinish); - safe = false; - if (gather) - ob->waitfor_commit[ob->last_write_tid].push_back( gather->new_sub() ); - } - } - - if (safe) { - dout(10) << "commit_set " << ino << " all committed" << endl; - return true; - } - return false; -} - -void ObjectCacher::purge_set(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) { - dout(10) << "purge_set on " << ino << " dne" << endl; - return; - } - - dout(10) << "purge_set " << ino << endl; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - purge(ob); - } -} - - -off_t ObjectCacher::release(Object *ob) -{ - list clean; - off_t o_unclean = 0; - - for (map::iterator p = ob->data.begin(); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - if (bh->is_clean()) - clean.push_back(bh); - else - o_unclean += bh->length(); - } - - for (list::iterator p = clean.begin(); - p != clean.end(); - p++) { - bh_remove(ob, *p); - delete *p; - } - - if (ob->can_close()) { - dout(10) << "trim trimming " << *ob << endl; - close_object(ob); - } - - return o_unclean; -} - -off_t ObjectCacher::release_set(inodeno_t ino) -{ - // return # bytes not clean (and thus not released). 
- off_t unclean = 0; - - if (objects_by_ino.count(ino) == 0) { - dout(10) << "release_set on " << ino << " dne" << endl; - return 0; - } - - dout(10) << "release_set " << ino << endl; - - set s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - off_t o_unclean = release(ob); - unclean += o_unclean; - - if (o_unclean) - dout(10) << "release_set " << ino << " " << *ob - << " has " << o_unclean << " bytes left" - << endl; - - } - - if (unclean) { - dout(10) << "release_set " << ino - << ", " << unclean << " bytes left" << endl; - } - - return unclean; -} - - -void ObjectCacher::kick_sync_writers(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) { - dout(10) << "kick_sync_writers on " << ino << " dne" << endl; - return; - } - - dout(10) << "kick_sync_writers on " << ino << endl; - - list ls; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - ls.splice(ls.begin(), ob->waitfor_wr); - } - - finish_contexts(ls); -} - -void ObjectCacher::kick_sync_readers(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) { - dout(10) << "kick_sync_readers on " << ino << " dne" << endl; - return; - } - - dout(10) << "kick_sync_readers on " << ino << endl; - - list ls; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - ls.splice(ls.begin(), ob->waitfor_rd); - } - - finish_contexts(ls); -} - - - diff --git a/branches/marnberg/quota/osdc/ObjectCacher.h b/branches/marnberg/quota/osdc/ObjectCacher.h deleted file mode 100644 index e9a4041008666..0000000000000 --- a/branches/marnberg/quota/osdc/ObjectCacher.h +++ /dev/null @@ -1,555 +0,0 @@ -#ifndef __OBJECTCACHER_H_ -#define __OBJECTCACHER_H_ - -#include "include/types.h" -#include "include/lru.h" -#include "include/Context.h" - -#include "common/Cond.h" -#include "common/Thread.h" - -#include "Objecter.h" -#include "Filer.h" - -class Objecter; 
-class Objecter::OSDRead; -class Objecter::OSDWrite; - -class ObjectCacher { - public: - - class Object; - - // ******* BufferHead ********* - class BufferHead : public LRUObject { - public: - // states - static const int STATE_MISSING = 0; - static const int STATE_CLEAN = 1; - static const int STATE_DIRTY = 2; - static const int STATE_RX = 3; - static const int STATE_TX = 4; - - private: - // my fields - int state; - int ref; - struct { - off_t start, length; // bh extent in object - } ex; - - public: - Object *ob; - bufferlist bl; - tid_t last_write_tid; // version of bh (if non-zero) - utime_t last_write; - - map< off_t, list > waitfor_read; - - public: - // cons - BufferHead(Object *o) : - state(STATE_MISSING), - ref(0), - ob(o), - last_write_tid(0) {} - - // extent - off_t start() { return ex.start; } - void set_start(off_t s) { ex.start = s; } - off_t length() { return ex.length; } - void set_length(off_t l) { ex.length = l; } - off_t end() { return ex.start + ex.length; } - off_t last() { return end() - 1; } - - // states - void set_state(int s) { - if (s == STATE_RX || s == STATE_TX) get(); - if (state == STATE_RX || state == STATE_TX) put(); - state = s; - } - int get_state() { return state; } - - bool is_missing() { return state == STATE_MISSING; } - bool is_dirty() { return state == STATE_DIRTY; } - bool is_clean() { return state == STATE_CLEAN; } - bool is_tx() { return state == STATE_TX; } - bool is_rx() { return state == STATE_RX; } - - // reference counting - int get() { - assert(ref >= 0); - if (ref == 0) lru_pin(); - return ++ref; - } - int put() { - assert(ref > 0); - if (ref == 1) lru_unpin(); - --ref; - return ref; - } - }; - - - // ******* Object ********* - class Object { - private: - // ObjectCacher::Object fields - ObjectCacher *oc; - object_t oid; // this _always_ is oid.rev=0 - inodeno_t ino; - objectrev_t rev; // last rev we're written - - public: - map data; - - tid_t last_write_tid; // version of bh (if non-zero) - tid_t last_ack_tid; 
// last update acked. - tid_t last_commit_tid; // last update commited. - - map< tid_t, list > waitfor_ack; - map< tid_t, list > waitfor_commit; - list waitfor_rd; - list waitfor_wr; - - // lock - static const int LOCK_NONE = 0; - static const int LOCK_WRLOCKING = 1; - static const int LOCK_WRLOCK = 2; - static const int LOCK_WRUNLOCKING = 3; - static const int LOCK_RDLOCKING = 4; - static const int LOCK_RDLOCK = 5; - static const int LOCK_RDUNLOCKING = 6; - static const int LOCK_UPGRADING = 7; // rd -> wr - static const int LOCK_DOWNGRADING = 8; // wr -> rd - int lock_state; - int wrlock_ref; // how many ppl want or are using a WRITE lock - int rdlock_ref; // how many ppl want or are using a READ lock - - public: - Object(ObjectCacher *_oc, object_t o, inodeno_t i) : - oc(_oc), - oid(o), ino(i), - last_write_tid(0), last_ack_tid(0), last_commit_tid(0), - lock_state(LOCK_NONE), wrlock_ref(0), rdlock_ref(0) - {} - ~Object() { - assert(data.empty()); - } - - object_t get_oid() { return oid; } - inodeno_t get_ino() { return ino; } - - bool can_close() { - return data.empty() && lock_state == LOCK_NONE && - waitfor_ack.empty() && waitfor_commit.empty() && - waitfor_rd.empty() && waitfor_wr.empty(); - } - - // bh - void add_bh(BufferHead *bh) { - // add to my map - assert(data.count(bh->start()) == 0); - - if (0) { // sanity check FIXME DEBUG - //cout << "add_bh " << bh->start() << "~" << bh->length() << endl; - map::iterator p = data.lower_bound(bh->start()); - if (p != data.end()) { - //cout << " after " << *p->second << endl; - //cout << " after starts at " << p->first << endl; - assert(p->first >= bh->end()); - } - if (p != data.begin()) { - p--; - //cout << " before starts at " << p->second->start() - //<< " and ends at " << p->second->end() << endl; - //cout << " before " << *p->second << endl; - assert(p->second->end() <= bh->start()); - } - } - - data[bh->start()] = bh; - } - void remove_bh(BufferHead *bh) { - assert(data.count(bh->start())); - 
data.erase(bh->start()); - } - bool is_empty() { return data.empty(); } - - // mid-level - BufferHead *split(BufferHead *bh, off_t off); - void merge_left(BufferHead *left, BufferHead *right); - void merge_right(BufferHead *left, BufferHead *right); - - int map_read(Objecter::OSDRead *rd, - map& hits, - map& missing, - map& rx); - BufferHead *map_write(Objecter::OSDWrite *wr); - - }; - - // ******* ObjectCacher ********* - // ObjectCacher fields - public: - Objecter *objecter; - Filer filer; - - private: - Mutex& lock; - - hash_map objects; - hash_map > objects_by_ino; - - set dirty_bh; - LRU lru_dirty, lru_rest; - - Cond flusher_cond; - bool flusher_stop; - void flusher_entry(); - class FlusherThread : public Thread { - ObjectCacher *oc; - public: - FlusherThread(ObjectCacher *o) : oc(o) {} - void *entry() { - oc->flusher_entry(); - return 0; - } - } flusher_thread; - - - // objects - Object *get_object(object_t oid, inodeno_t ino) { - // have it? - if (objects.count(oid)) - return objects[oid]; - - // create it. 
- Object *o = new Object(this, oid, ino); - objects[oid] = o; - objects_by_ino[ino].insert(o); - return o; - } - void close_object(Object *ob); - - // bh stats - Cond stat_cond; - int stat_waiter; - - off_t stat_clean; - off_t stat_dirty; - off_t stat_rx; - off_t stat_tx; - off_t stat_missing; - - void bh_stat_add(BufferHead *bh) { - switch (bh->get_state()) { - case BufferHead::STATE_MISSING: stat_missing += bh->length(); break; - case BufferHead::STATE_CLEAN: stat_clean += bh->length(); break; - case BufferHead::STATE_DIRTY: stat_dirty += bh->length(); break; - case BufferHead::STATE_TX: stat_tx += bh->length(); break; - case BufferHead::STATE_RX: stat_rx += bh->length(); break; - } - if (stat_waiter) stat_cond.Signal(); - } - void bh_stat_sub(BufferHead *bh) { - switch (bh->get_state()) { - case BufferHead::STATE_MISSING: stat_missing -= bh->length(); break; - case BufferHead::STATE_CLEAN: stat_clean -= bh->length(); break; - case BufferHead::STATE_DIRTY: stat_dirty -= bh->length(); break; - case BufferHead::STATE_TX: stat_tx -= bh->length(); break; - case BufferHead::STATE_RX: stat_rx -= bh->length(); break; - } - } - off_t get_stat_tx() { return stat_tx; } - off_t get_stat_rx() { return stat_rx; } - off_t get_stat_dirty() { return stat_dirty; } - off_t get_stat_clean() { return stat_clean; } - - void touch_bh(BufferHead *bh) { - if (bh->is_dirty()) - lru_dirty.lru_touch(bh); - else - lru_rest.lru_touch(bh); - } - - // bh states - void bh_set_state(BufferHead *bh, int s) { - // move between lru lists? 
- if (s == BufferHead::STATE_DIRTY && bh->get_state() != BufferHead::STATE_DIRTY) { - lru_rest.lru_remove(bh); - lru_dirty.lru_insert_top(bh); - dirty_bh.insert(bh); - } - if (s != BufferHead::STATE_DIRTY && bh->get_state() == BufferHead::STATE_DIRTY) { - lru_dirty.lru_remove(bh); - lru_rest.lru_insert_mid(bh); - dirty_bh.erase(bh); - } - - // set state - bh_stat_sub(bh); - bh->set_state(s); - bh_stat_add(bh); - } - - void copy_bh_state(BufferHead *bh1, BufferHead *bh2) { - bh_set_state(bh2, bh1->get_state()); - } - - void mark_missing(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_MISSING); }; - void mark_clean(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_CLEAN); }; - void mark_rx(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_RX); }; - void mark_tx(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_TX); }; - void mark_dirty(BufferHead *bh) { - bh_set_state(bh, BufferHead::STATE_DIRTY); - lru_dirty.lru_touch(bh); - //bh->set_dirty_stamp(g_clock.now()); - }; - - void bh_add(Object *ob, BufferHead *bh) { - ob->add_bh(bh); - if (bh->is_dirty()) { - lru_dirty.lru_insert_top(bh); - dirty_bh.insert(bh); - } else { - lru_rest.lru_insert_top(bh); - } - bh_stat_add(bh); - } - void bh_remove(Object *ob, BufferHead *bh) { - ob->remove_bh(bh); - if (bh->is_dirty()) { - lru_dirty.lru_remove(bh); - dirty_bh.erase(bh); - } else { - lru_rest.lru_remove(bh); - } - bh_stat_sub(bh); - } - - // io - void bh_read(BufferHead *bh); - void bh_write(BufferHead *bh); - - void trim(off_t max=-1); - void flush(off_t amount=0); - - bool flush(Object *o); - off_t release(Object *o); - void purge(Object *o); - - void rdlock(Object *o); - void rdunlock(Object *o); - void wrlock(Object *o); - void wrunlock(Object *o); - - public: - void bh_read_finish(object_t oid, off_t offset, size_t length, bufferlist &bl); - void bh_write_ack(object_t oid, off_t offset, size_t length, tid_t t); - void bh_write_commit(object_t oid, off_t offset, size_t length, tid_t t); - void 
lock_ack(list& oids, tid_t tid); - - class C_ReadFinish : public Context { - ObjectCacher *oc; - object_t oid; - off_t start; - size_t length; - public: - bufferlist bl; - C_ReadFinish(ObjectCacher *c, object_t o, off_t s, size_t l) : oc(c), oid(o), start(s), length(l) {} - void finish(int r) { - oc->bh_read_finish(oid, start, length, bl); - } - }; - - class C_WriteAck : public Context { - ObjectCacher *oc; - object_t oid; - off_t start; - size_t length; - public: - tid_t tid; - C_WriteAck(ObjectCacher *c, object_t o, off_t s, size_t l) : oc(c), oid(o), start(s), length(l) {} - void finish(int r) { - oc->bh_write_ack(oid, start, length, tid); - } - }; - class C_WriteCommit : public Context { - ObjectCacher *oc; - object_t oid; - off_t start; - size_t length; - public: - tid_t tid; - C_WriteCommit(ObjectCacher *c, object_t o, off_t s, size_t l) : oc(c), oid(o), start(s), length(l) {} - void finish(int r) { - oc->bh_write_commit(oid, start, length, tid); - } - }; - - class C_LockAck : public Context { - ObjectCacher *oc; - public: - list oids; - tid_t tid; - C_LockAck(ObjectCacher *c, object_t o) : oc(c) { - oids.push_back(o); - } - void finish(int r) { - oc->lock_ack(oids, tid); - } - }; - - - - public: - ObjectCacher(Objecter *o, Mutex& l) : - objecter(o), filer(o), lock(l), - flusher_stop(false), flusher_thread(this), - stat_waiter(0), - stat_clean(0), stat_dirty(0), stat_rx(0), stat_tx(0), stat_missing(0) { - flusher_thread.create(); - } - ~ObjectCacher() { - // we should be empty. - assert(objects.empty()); - assert(lru_rest.lru_get_size() == 0); - assert(lru_dirty.lru_get_size() == 0); - assert(dirty_bh.empty()); - - assert(flusher_thread.is_started()); - lock.Lock(); // hmm.. watch out for deadlock! 
- flusher_stop = true; - flusher_cond.Signal(); - lock.Unlock(); - flusher_thread.join(); - } - - - class C_RetryRead : public Context { - ObjectCacher *oc; - Objecter::OSDRead *rd; - inodeno_t ino; - Context *onfinish; - public: - C_RetryRead(ObjectCacher *_oc, Objecter::OSDRead *r, inodeno_t i, Context *c) : oc(_oc), rd(r), ino(i), onfinish(c) {} - void finish(int) { - int r = oc->readx(rd, ino, onfinish); - if (r > 0) { - onfinish->finish(r); - delete onfinish; - } - } - }; - - // non-blocking. async. - int readx(Objecter::OSDRead *rd, inodeno_t ino, Context *onfinish); - int writex(Objecter::OSDWrite *wr, inodeno_t ino); - - // write blocking - void wait_for_write(size_t len, Mutex& lock); - - // blocking. atomic+sync. - int atomic_sync_readx(Objecter::OSDRead *rd, inodeno_t ino, Mutex& lock); - int atomic_sync_writex(Objecter::OSDWrite *wr, inodeno_t ino, Mutex& lock); - - bool set_is_cached(inodeno_t ino); - bool set_is_dirty_or_committing(inodeno_t ino); - - bool flush_set(inodeno_t ino, Context *onfinish=0); - void flush_all(Context *onfinish=0); - - bool commit_set(inodeno_t ino, Context *oncommit); - void commit_all(Context *oncommit=0); - - void purge_set(inodeno_t ino); - - off_t release_set(inodeno_t ino); // returns # of bytes not released (ie non-clean) - - void kick_sync_writers(inodeno_t ino); - void kick_sync_readers(inodeno_t ino); - - - // file functions - - /*** async+caching (non-blocking) file interface ***/ - int file_read(inode_t& inode, - off_t offset, size_t len, - bufferlist *bl, - Context *onfinish) { - Objecter::OSDRead *rd = new Objecter::OSDRead(bl); - filer.file_to_extents(inode, offset, len, rd->extents); - return readx(rd, inode.ino, onfinish); - } - - int file_write(inode_t& inode, - off_t offset, size_t len, - bufferlist& bl, - objectrev_t rev=0) { - Objecter::OSDWrite *wr = new Objecter::OSDWrite(bl); - filer.file_to_extents(inode, offset, len, wr->extents); - return writex(wr, inode.ino); - } - - - - /*** sync+blocking file 
interface ***/ - - int file_atomic_sync_read(inode_t& inode, - off_t offset, size_t len, - bufferlist *bl, - Mutex &lock) { - Objecter::OSDRead *rd = new Objecter::OSDRead(bl); - filer.file_to_extents(inode, offset, len, rd->extents); - return atomic_sync_readx(rd, inode.ino, lock); - } - - int file_atomic_sync_write(inode_t& inode, - off_t offset, size_t len, - bufferlist& bl, - Mutex &lock, - objectrev_t rev=0) { - Objecter::OSDWrite *wr = new Objecter::OSDWrite(bl); - filer.file_to_extents(inode, offset, len, wr->extents); - return atomic_sync_writex(wr, inode.ino, lock); - } - -}; - - -inline ostream& operator<<(ostream& out, ObjectCacher::BufferHead &bh) -{ - out << "bh[" - << bh.start() << "~" << bh.length() - << " (" << bh.bl.length() << ")" - << " v " << bh.last_write_tid; - if (bh.is_tx()) out << " tx"; - if (bh.is_rx()) out << " rx"; - if (bh.is_dirty()) out << " dirty"; - if (bh.is_clean()) out << " clean"; - if (bh.is_missing()) out << " missing"; - out << "]"; - return out; -} - -inline ostream& operator<<(ostream& out, ObjectCacher::Object &ob) -{ - out << "object[" - << hex << ob.get_oid() << " ino " << ob.get_ino() << dec - << " wr " << ob.last_write_tid << "/" << ob.last_ack_tid << "/" << ob.last_commit_tid; - - switch (ob.lock_state) { - case ObjectCacher::Object::LOCK_WRLOCKING: out << " wrlocking"; break; - case ObjectCacher::Object::LOCK_WRLOCK: out << " wrlock"; break; - case ObjectCacher::Object::LOCK_WRUNLOCKING: out << " wrunlocking"; break; - case ObjectCacher::Object::LOCK_RDLOCKING: out << " rdlocking"; break; - case ObjectCacher::Object::LOCK_RDLOCK: out << " rdlock"; break; - case ObjectCacher::Object::LOCK_RDUNLOCKING: out << " rdunlocking"; break; - } - - out << "]"; - return out; -} - -#endif diff --git a/branches/marnberg/quota/osdc/Objecter.cc b/branches/marnberg/quota/osdc/Objecter.cc deleted file mode 100644 index 9e49a43ace89b..0000000000000 --- a/branches/marnberg/quota/osdc/Objecter.cc +++ /dev/null @@ -1,838 +0,0 @@ - 
-#include "Objecter.h" -#include "osd/OSDMap.h" -#include "mon/MonMap.h" - -#include "msg/Messenger.h" -#include "msg/Message.h" - -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" - -#include "messages/MOSDFailure.h" - -#include - -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cout << g_clock.now() << " " << messenger->get_myname() << ".objecter " -#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cerr << g_clock.now() << " " << messenger->get_myname() << ".objecter " - - -// messages ------------------------------ - -void Objecter::dispatch(Message *m) -{ - switch (m->get_type()) { - case MSG_OSD_OPREPLY: - handle_osd_op_reply((MOSDOpReply*)m); - break; - - case MSG_OSD_MAP: - handle_osd_map((MOSDMap*)m); - break; - - default: - dout(1) << "don't know message type " << m->get_type() << endl; - assert(0); - } -} - -void Objecter::handle_osd_map(MOSDMap *m) -{ - assert(osdmap); - - if (m->get_last() <= osdmap->get_epoch()) { - dout(3) << "handle_osd_map ignoring epochs [" - << m->get_first() << "," << m->get_last() - << "] <= " << osdmap->get_epoch() << endl; - } - else { - dout(3) << "handle_osd_map got epochs [" - << m->get_first() << "," << m->get_last() - << "] > " << osdmap->get_epoch() - << endl; - - set changed_pgs; - - for (epoch_t e = osdmap->get_epoch() + 1; - e <= m->get_last(); - e++) { - if (m->incremental_maps.count(e)) { - dout(3) << "handle_osd_map decoding incremental epoch " << e << endl; - OSDMap::Incremental inc; - int off = 0; - inc.decode(m->incremental_maps[e], off); - osdmap->apply_incremental(inc); - - // notify messenger - for (map::iterator i = inc.new_down.begin(); - i != inc.new_down.end(); - i++) - messenger->mark_down(i->second.addr); - - } - else if (m->maps.count(e)) { - dout(3) << "handle_osd_map decoding full epoch " << e << endl; - osdmap->decode(m->maps[e]); - } - else { - 
dout(3) << "handle_osd_map requesting missing epoch " << osdmap->get_epoch()+1 << endl; - int mon = monmap->pick_mon(); - messenger->send_message(new MOSDGetMap(osdmap->get_epoch()), - monmap->get_inst(mon)); - break; - } - - // scan pgs for changes - scan_pgs(changed_pgs); - - assert(e == osdmap->get_epoch()); - } - - // kick requests who might be timing out on the wrong osds - if (!changed_pgs.empty()) - kick_requests(changed_pgs); - } - - delete m; -} - -void Objecter::scan_pgs(set& changed_pgs) -{ - dout(10) << "scan_pgs" << endl; - - for (hash_map::iterator i = pg_map.begin(); - i != pg_map.end(); - i++) { - pg_t pgid = i->first; - PG& pg = i->second; - - // calc new. - vector other; - osdmap->pg_to_acting_osds(pgid, other); - - if (other == pg.acting) - continue; // no change. - - other.swap(pg.acting); - - if (g_conf.osd_rep == OSD_REP_PRIMARY) { - // same primary? - if (!other.empty() && - !pg.acting.empty() && - other[0] == pg.acting[0]) - continue; - } - else if (g_conf.osd_rep == OSD_REP_SPLAY) { - // same primary and acker? - if (!other.empty() && - !pg.acting.empty() && - other[0] == pg.acting[0] && - other[other.size() > 1 ? 1:0] == pg.acting[pg.acting.size() > 1 ? 1:0]) - continue; - } - else if (g_conf.osd_rep == OSD_REP_CHAIN) { - // any change is significant. - } - - // changed significantly. - dout(10) << "scan_pgs pg " << pgid - << " (" << pg.active_tids << ")" - << " " << other << " -> " << pg.acting - << endl; - changed_pgs.insert(pgid); - } -} - -void Objecter::kick_requests(set& changed_pgs) -{ - dout(10) << "kick_requests in pgs " << changed_pgs << endl; - - for (set::iterator i = changed_pgs.begin(); - i != changed_pgs.end(); - i++) { - pg_t pgid = *i; - PG& pg = pg_map[pgid]; - - // resubmit ops! 
- set tids; - tids.swap( pg.active_tids ); - close_pg( pgid ); // will pbly reopen, unless it's just commits we're missing - - for (set::iterator p = tids.begin(); - p != tids.end(); - p++) { - tid_t tid = *p; - - if (op_modify.count(tid)) { - OSDModify *wr = op_modify[tid]; - op_modify.erase(tid); - - // WRITE - if (wr->tid_version.count(tid)) { - if (wr->op == OSD_OP_WRITE && - !g_conf.objecter_buffer_uncommitted) { - dout(0) << "kick_requests missing commit, cannot replay: objecter_buffer_uncommitted == FALSE" << endl; - } else { - dout(0) << "kick_requests missing commit, replay write " << tid - << " v " << wr->tid_version[tid] << endl; - modifyx_submit(wr, wr->waitfor_commit[tid], tid); - } - } - else if (wr->waitfor_ack.count(tid)) { - dout(0) << "kick_requests missing ack, resub write " << tid << endl; - modifyx_submit(wr, wr->waitfor_ack[tid], tid); - } - } - - else if (op_read.count(tid)) { - // READ - OSDRead *rd = op_read[tid]; - op_read.erase(tid); - dout(0) << "kick_requests resub read " << tid << endl; - - // resubmit - readx_submit(rd, rd->ops[tid]); - rd->ops.erase(tid); - } - - else if (op_stat.count(tid)) { - OSDStat *st = op_stat[tid]; - op_stat.erase(tid); - - dout(0) << "kick_requests resub stat " << tid << endl; - - // resubmit - stat_submit(st); - } - - else - assert(0); - } - } -} - - - -void Objecter::handle_osd_op_reply(MOSDOpReply *m) -{ - // read or modify? 
- switch (m->get_op()) { - case OSD_OP_READ: - handle_osd_read_reply(m); - break; - - case OSD_OP_STAT: - handle_osd_stat_reply(m); - break; - - case OSD_OP_WRNOOP: - case OSD_OP_WRITE: - case OSD_OP_ZERO: - case OSD_OP_DELETE: - case OSD_OP_WRUNLOCK: - case OSD_OP_WRLOCK: - case OSD_OP_RDLOCK: - case OSD_OP_RDUNLOCK: - case OSD_OP_UPLOCK: - case OSD_OP_DNLOCK: - handle_osd_modify_reply(m); - break; - - default: - assert(0); - } -} - - - -// stat ----------------------------------- - -tid_t Objecter::stat(object_t oid, off_t *size, Context *onfinish, - objectrev_t rev) -{ - OSDStat *st = new OSDStat(size); - st->extents.push_back(ObjectExtent(oid, 0, 0)); - st->extents.front().pgid = osdmap->object_to_pg( oid, g_OSD_FileLayout ); - st->extents.front().rev = rev; - st->onfinish = onfinish; - - return stat_submit(st); -} - -tid_t Objecter::stat_submit(OSDStat *st) -{ - // find OSD - ObjectExtent &ex = st->extents.front(); - PG &pg = get_pg( ex.pgid ); - - // pick tid - last_tid++; - assert(client_inc >= 0); - - // add to gather set - st->tid = last_tid; - op_stat[last_tid] = st; - - pg.active_tids.insert(last_tid); - - // send? - dout(10) << "stat_submit " << st << " tid " << last_tid - << " oid " << ex.oid - << " pg " << ex.pgid - << " osd" << pg.acker() - << endl; - - if (pg.acker() >= 0) { - MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, last_tid, - ex.oid, ex.pgid, osdmap->get_epoch(), - OSD_OP_STAT); - - messenger->send_message(m, osdmap->get_inst(pg.acker())); - } - - return last_tid; -} - -void Objecter::handle_osd_stat_reply(MOSDOpReply *m) -{ - // get pio - tid_t tid = m->get_tid(); - - if (op_stat.count(tid) == 0) { - dout(7) << "handle_osd_stat_reply " << tid << " ... 
stray" << endl; - delete m; - return; - } - - dout(7) << "handle_osd_stat_reply " << tid - << " r=" << m->get_result() - << " size=" << m->get_object_size() - << endl; - OSDStat *st = op_stat[ tid ]; - op_stat.erase( tid ); - - // remove from osd/tid maps - PG& pg = get_pg( m->get_pg() ); - assert(pg.active_tids.count(tid)); - pg.active_tids.erase(tid); - if (pg.active_tids.empty()) close_pg( m->get_pg() ); - - // success? - if (m->get_result() == -EAGAIN) { - dout(7) << " got -EAGAIN, resubmitting" << endl; - stat_submit(st); - delete m; - return; - } - //assert(m->get_result() >= 0); - - // ok! - if (m->get_result() < 0) { - *st->size = -1; - } else { - *st->size = m->get_object_size(); - } - - // finish, clean up - Context *onfinish = st->onfinish; - - // done - delete st; - if (onfinish) { - onfinish->finish(m->get_result()); - delete onfinish; - } - - delete m; -} - - -// read ----------------------------------- - - -tid_t Objecter::read(object_t oid, off_t off, size_t len, bufferlist *bl, - Context *onfinish, - objectrev_t rev) -{ - OSDRead *rd = new OSDRead(bl); - rd->extents.push_back(ObjectExtent(oid, off, len)); - rd->extents.front().pgid = osdmap->object_to_pg( oid, g_OSD_FileLayout ); - rd->extents.front().rev = rev; - readx(rd, onfinish); - return last_tid; -} - - -tid_t Objecter::readx(OSDRead *rd, Context *onfinish) -{ - rd->onfinish = onfinish; - - // issue reads - for (list::iterator it = rd->extents.begin(); - it != rd->extents.end(); - it++) - readx_submit(rd, *it); - - return last_tid; -} - -tid_t Objecter::readx_submit(OSDRead *rd, ObjectExtent &ex) -{ - // find OSD - PG &pg = get_pg( ex.pgid ); - - // pick tid - last_tid++; - assert(client_inc >= 0); - - // add to gather set - rd->ops[last_tid] = ex; - op_read[last_tid] = rd; - - pg.active_tids.insert(last_tid); - - // send? 
- dout(10) << "readx_submit " << rd << " tid " << last_tid - << " oid " << ex.oid << " " << ex.start << "~" << ex.length - << " (" << ex.buffer_extents.size() << " buffer fragments)" - << " pg " << ex.pgid - << " osd" << pg.acker() - << endl; - - if (pg.acker() >= 0) { - MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, last_tid, - ex.oid, ex.pgid, osdmap->get_epoch(), - OSD_OP_READ); - m->set_length(ex.length); - m->set_offset(ex.start); - - messenger->send_message(m, osdmap->get_inst(pg.acker())); - } - - return last_tid; -} - - -void Objecter::handle_osd_read_reply(MOSDOpReply *m) -{ - // get pio - tid_t tid = m->get_tid(); - - if (op_read.count(tid) == 0) { - dout(7) << "handle_osd_read_reply " << tid << " ... stray" << endl; - delete m; - return; - } - - dout(7) << "handle_osd_read_reply " << tid << endl; - OSDRead *rd = op_read[ tid ]; - op_read.erase( tid ); - - // remove from osd/tid maps - PG& pg = get_pg( m->get_pg() ); - assert(pg.active_tids.count(tid)); - pg.active_tids.erase(tid); - if (pg.active_tids.empty()) close_pg( m->get_pg() ); - - // our op finished - rd->ops.erase(tid); - - // success? - if (m->get_result() == -EAGAIN) { - dout(7) << " got -EAGAIN, resubmitting" << endl; - readx_submit(rd, rd->ops[tid]); - delete m; - return; - } - //assert(m->get_result() >= 0); - - // what buffer offset are we? - dout(7) << " got frag from " << m->get_oid() << " " - << m->get_offset() << "~" << m->get_length() - << ", still have " << rd->ops.size() << " more ops" << endl; - - if (rd->ops.empty()) { - // all done - size_t bytes_read = 0; - - if (rd->read_data.size()) { - dout(15) << " assembling frags" << endl; - - /** FIXME This doesn't handle holes efficiently. - * It allocates zero buffers to fill whole buffer, and - * then discards trailing ones at the end. - * - * Actually, this whole thing is pretty messy with temporary bufferlist*'s all over - * the heap. - */ - - // we have other fragments, assemble them all... blech! 
- rd->read_data[m->get_oid()] = new bufferlist; - rd->read_data[m->get_oid()]->claim( m->get_data() ); - - // map extents back into buffer - map by_off; // buffer offset -> bufferlist - - // for each object extent... - for (list::iterator eit = rd->extents.begin(); - eit != rd->extents.end(); - eit++) { - bufferlist *ox_buf = rd->read_data[eit->oid]; - unsigned ox_len = ox_buf->length(); - unsigned ox_off = 0; - assert(ox_len <= eit->length); - - // for each buffer extent we're mapping into... - for (map::iterator bit = eit->buffer_extents.begin(); - bit != eit->buffer_extents.end(); - bit++) { - dout(21) << " object " << eit->oid << " extent " << eit->start << "~" << eit->length << " : ox offset " << ox_off << " -> buffer extent " << bit->first << "~" << bit->second << endl; - by_off[bit->first] = new bufferlist; - - if (ox_off + bit->second <= ox_len) { - // we got the whole bx - by_off[bit->first]->substr_of(*ox_buf, ox_off, bit->second); - if (bytes_read < bit->first + bit->second) - bytes_read = bit->first + bit->second; - } else if (ox_off + bit->second > ox_len && ox_off < ox_len) { - // we got part of this bx - by_off[bit->first]->substr_of(*ox_buf, ox_off, (ox_len-ox_off)); - if (bytes_read < bit->first + ox_len-ox_off) - bytes_read = bit->first + ox_len-ox_off; - - // zero end of bx - dout(21) << " adding some zeros to the end " << ox_off + bit->second-ox_len << endl; - bufferptr z(ox_off + bit->second - ox_len); - z.zero(); - by_off[bit->first]->append( z ); - } else { - // we got none of this bx. zero whole thing. 
- assert(ox_off >= ox_len); - dout(21) << " adding all zeros for this bit " << bit->second << endl; - bufferptr z(bit->second); - z.zero(); - by_off[bit->first]->append( z ); - } - ox_off += bit->second; - } - assert(ox_off == eit->length); - } - - // sort and string bits together - for (map::iterator it = by_off.begin(); - it != by_off.end(); - it++) { - assert(it->second->length()); - if (it->first < (off_t)bytes_read) { - dout(21) << " concat buffer frag off " << it->first << " len " << it->second->length() << endl; - rd->bl->claim_append(*(it->second)); - } else { - dout(21) << " NO concat zero buffer frag off " << it->first << " len " << it->second->length() << endl; - } - delete it->second; - } - - // trim trailing zeros? - if (rd->bl->length() > bytes_read) { - dout(10) << " trimming off trailing zeros . bytes_read=" << bytes_read - << " len=" << rd->bl->length() << endl; - rd->bl->splice(bytes_read, rd->bl->length() - bytes_read); - assert(bytes_read == rd->bl->length()); - } - - // hose p->read_data bufferlist*'s - for (map::iterator it = rd->read_data.begin(); - it != rd->read_data.end(); - it++) { - delete it->second; - } - } else { - dout(15) << " only one frag" << endl; - - // only one fragment, easy - rd->bl->claim( m->get_data() ); - bytes_read = rd->bl->length(); - } - - // finish, clean up - Context *onfinish = rd->onfinish; - - dout(7) << " " << bytes_read << " bytes " - << rd->bl->length() - << endl; - - // done - delete rd; - if (onfinish) { - onfinish->finish(bytes_read);// > 0 ? 
bytes_read:m->get_result()); - delete onfinish; - } - } else { - // store my bufferlist for later assembling - rd->read_data[m->get_oid()] = new bufferlist; - rd->read_data[m->get_oid()]->claim( m->get_data() ); - } - - delete m; -} - - - -// write ------------------------------------ - -tid_t Objecter::write(object_t oid, off_t off, size_t len, bufferlist &bl, - Context *onack, Context *oncommit, - objectrev_t rev) -{ - OSDWrite *wr = new OSDWrite(bl); - wr->extents.push_back(ObjectExtent(oid, off, len)); - wr->extents.front().pgid = osdmap->object_to_pg( oid, g_OSD_FileLayout ); - wr->extents.front().buffer_extents[0] = len; - wr->extents.front().rev = rev; - modifyx(wr, onack, oncommit); - return last_tid; -} - - -// zero - -tid_t Objecter::zero(object_t oid, off_t off, size_t len, - Context *onack, Context *oncommit, - objectrev_t rev) -{ - OSDModify *z = new OSDModify(OSD_OP_ZERO); - z->extents.push_back(ObjectExtent(oid, off, len)); - z->extents.front().pgid = osdmap->object_to_pg( oid, g_OSD_FileLayout ); - z->extents.front().rev = rev; - modifyx(z, onack, oncommit); - return last_tid; -} - - -// lock ops - -tid_t Objecter::lock(int op, object_t oid, - Context *onack, Context *oncommit) -{ - OSDModify *l = new OSDModify(op); - l->extents.push_back(ObjectExtent(oid, 0, 0)); - l->extents.front().pgid = osdmap->object_to_pg( oid, g_OSD_FileLayout ); - modifyx(l, onack, oncommit); - return last_tid; -} - - - -// generic modify ----------------------------------- - -tid_t Objecter::modifyx(OSDModify *wr, Context *onack, Context *oncommit) -{ - wr->onack = onack; - wr->oncommit = oncommit; - - // issue writes/whatevers - for (list::iterator it = wr->extents.begin(); - it != wr->extents.end(); - it++) - modifyx_submit(wr, *it); - - return last_tid; -} - - -tid_t Objecter::modifyx_submit(OSDModify *wr, ObjectExtent &ex, tid_t usetid) -{ - // find - PG &pg = get_pg( ex.pgid ); - - // pick tid - tid_t tid; - if (usetid > 0) - tid = usetid; - else - tid = ++last_tid; - 
- // add to gather set - wr->waitfor_ack[tid] = ex; - wr->waitfor_commit[tid] = ex; - op_modify[tid] = wr; - pg.active_tids.insert(tid); - - ++num_unacked; - ++num_uncommitted; - - // send? - dout(10) << "modifyx_submit " << MOSDOp::get_opname(wr->op) << " tid " << tid - << " oid " << ex.oid - << " " << ex.start << "~" << ex.length - << " pg " << ex.pgid - << " osd" << pg.primary() - << endl; - if (pg.primary() >= 0) { - MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, tid, - ex.oid, ex.pgid, osdmap->get_epoch(), - wr->op); - m->set_length(ex.length); - m->set_offset(ex.start); - m->set_rev(ex.rev); - - if (wr->tid_version.count(tid)) - m->set_version(wr->tid_version[tid]); // we're replaying this op! - - // what type of op? - switch (wr->op) { - case OSD_OP_WRITE: - { - // map buffer segments into this extent - // (may be fragmented bc of striping) - bufferlist cur; - for (map::iterator bit = ex.buffer_extents.begin(); - bit != ex.buffer_extents.end(); - bit++) { - bufferlist thisbit; - thisbit.substr_of(((OSDWrite*)wr)->bl, bit->first, bit->second); - cur.claim_append(thisbit); - } - assert(cur.length() == ex.length); - m->set_data(cur);//.claim(cur); - } - break; - } - - messenger->send_message(m, osdmap->get_inst(pg.primary())); - } - - dout(5) << num_unacked << " unacked, " << num_uncommitted << " uncommitted" << endl; - - return tid; -} - - - -void Objecter::handle_osd_modify_reply(MOSDOpReply *m) -{ - // get pio - tid_t tid = m->get_tid(); - - if (op_modify.count(tid) == 0) { - dout(7) << "handle_osd_modify_reply " << tid - << (m->get_commit() ? " commit":" ack") - << " ... stray" << endl; - delete m; - return; - } - - dout(7) << "handle_osd_modify_reply " << tid - << (m->get_commit() ? " commit":" ack") - << " v " << m->get_version() - << endl; - OSDModify *wr = op_modify[ tid ]; - - Context *onack = 0; - Context *oncommit = 0; - - PG &pg = get_pg( m->get_pg() ); - - // ignore? 
- if (pg.acker() != m->get_source().num()) { - dout(7) << " ignoring ack|commit from non-acker" << endl; - delete m; - return; - } - - assert(m->get_result() >= 0); - - // ack or commit? - if (m->get_commit()) { - //dout(15) << " handle_osd_write_reply commit on " << tid << endl; - assert(wr->tid_version.count(tid) == 0 || - m->get_version() == wr->tid_version[tid]); - - // remove from tid/osd maps - assert(pg.active_tids.count(tid)); - pg.active_tids.erase(tid); - if (pg.active_tids.empty()) close_pg( m->get_pg() ); - - // commit. - op_modify.erase( tid ); - wr->waitfor_ack.erase(tid); - wr->waitfor_commit.erase(tid); - - num_uncommitted--; - - if (wr->waitfor_commit.empty()) { - onack = wr->onack; - oncommit = wr->oncommit; - delete wr; - } - } else { - // ack. - //dout(15) << " handle_osd_write_reply ack on " << tid << endl; - assert(wr->waitfor_ack.count(tid)); - wr->waitfor_ack.erase(tid); - - num_unacked--; - - if (wr->tid_version.count(tid) && - wr->tid_version[tid].version != m->get_version().version) { - dout(-10) << "handle_osd_modify_reply WARNING: replay of tid " << tid - << " did not achieve previous ordering" << endl; - } - wr->tid_version[tid] = m->get_version(); - - if (wr->waitfor_ack.empty()) { - onack = wr->onack; - wr->onack = 0; // only do callback once - - // buffer uncommitted? - if (!g_conf.objecter_buffer_uncommitted && - wr->op == OSD_OP_WRITE) { - // discard buffer! 
- ((OSDWrite*)wr)->bl.clear(); - } - } - } - - // do callbacks - if (onack) { - onack->finish(0); - delete onack; - } - if (oncommit) { - oncommit->finish(0); - delete oncommit; - } - - delete m; -} - - - -void Objecter::ms_handle_failure(Message *m, entity_name_t dest, const entity_inst_t& inst) -{ - if (dest.is_mon()) { - // try a new mon - int mon = monmap->pick_mon(true); - dout(0) << "ms_handle_failure " << dest << " inst " << inst - << ", resending to mon" << mon - << endl; - messenger->send_message(m, monmap->get_inst(mon)); - } - else if (dest.is_osd()) { - int mon = monmap->pick_mon(); - dout(0) << "ms_handle_failure " << dest << " inst " << inst - << ", dropping and reporting to mon" << mon - << endl; - messenger->send_message(new MOSDFailure(inst, osdmap->get_epoch()), - monmap->get_inst(mon)); - delete m; - } else { - dout(0) << "ms_handle_failure " << dest << " inst " << inst - << ", dropping" << endl; - delete m; - } -} diff --git a/branches/marnberg/quota/osdc/Objecter.h b/branches/marnberg/quota/osdc/Objecter.h deleted file mode 100644 index 741db052a21ea..0000000000000 --- a/branches/marnberg/quota/osdc/Objecter.h +++ /dev/null @@ -1,197 +0,0 @@ -#ifndef __OBJECTER_H -#define __OBJECTER_H - -#include "include/types.h" -#include "include/buffer.h" - -#include "osd/OSDMap.h" -#include "messages/MOSDOp.h" - -#include -#include -#include -using namespace std; -using namespace __gnu_cxx; - -class Context; -class Messenger; -class OSDMap; -class MonMap; -class Message; - -class Objecter { - public: - Messenger *messenger; - MonMap *monmap; - OSDMap *osdmap; - - private: - tid_t last_tid; - int client_inc; - int num_unacked; - int num_uncommitted; - - /*** track pending operations ***/ - // read - public: - class OSDOp { - public: - list extents; - virtual ~OSDOp() {} - }; - - class OSDRead : public OSDOp { - public: - bufferlist *bl; - Context *onfinish; - map ops; - map read_data; // bits of data as they come back - - OSDRead(bufferlist *b) : bl(b), 
onfinish(0) { - bl->clear(); - } - }; - - class OSDStat : public OSDOp { - public: - tid_t tid; - off_t *size; // where the size goes. - Context *onfinish; - OSDStat(off_t *s) : tid(0), size(s), onfinish(0) { } - }; - - // generic modify - class OSDModify : public OSDOp { - public: - int op; - list extents; - Context *onack; - Context *oncommit; - map waitfor_ack; - map tid_version; - map waitfor_commit; - - OSDModify(int o) : op(o), onack(0), oncommit(0) {} - }; - - // write (includes the bufferlist) - class OSDWrite : public OSDModify { - public: - bufferlist bl; - OSDWrite(bufferlist &b) : OSDModify(OSD_OP_WRITE), bl(b) {} - }; - - - - private: - // pending ops - hash_map op_stat; - hash_map op_read; - hash_map op_modify; - - /** - * track pending ops by pg - * ...so we can cope with failures, map changes - */ - class PG { - public: - vector acting; - set active_tids; // active ops - - PG() {} - - // primary - where i write - int primary() { - if (acting.empty()) return -1; - return acting[0]; - } - // acker - where i read, and receive acks from - int acker() { - if (acting.empty()) return -1; - if (g_conf.osd_rep == OSD_REP_PRIMARY) - return acting[0]; - else - return acting[acting.size() > 1 ? 
1:0]; - } - }; - - hash_map pg_map; - - - PG &get_pg(pg_t pgid) { - if (!pg_map.count(pgid)) - osdmap->pg_to_acting_osds(pgid, pg_map[pgid].acting); - return pg_map[pgid]; - } - void close_pg(pg_t pgid) { - assert(pg_map.count(pgid)); - assert(pg_map[pgid].active_tids.empty()); - pg_map.erase(pgid); - } - void scan_pgs(set& chnaged_pgs); - void kick_requests(set& changed_pgs); - - - public: - Objecter(Messenger *m, MonMap *mm, OSDMap *om) : - messenger(m), monmap(mm), osdmap(om), - last_tid(0), client_inc(-1), - num_unacked(0), num_uncommitted(0) - {} - ~Objecter() { - // clean up op_* - // *** - } - - // messages - public: - void dispatch(Message *m); - void handle_osd_op_reply(class MOSDOpReply *m); - void handle_osd_stat_reply(class MOSDOpReply *m); - void handle_osd_read_reply(class MOSDOpReply *m); - void handle_osd_modify_reply(class MOSDOpReply *m); - void handle_osd_lock_reply(class MOSDOpReply *m); - void handle_osd_map(class MOSDMap *m); - - private: - tid_t readx_submit(OSDRead *rd, ObjectExtent& ex); - tid_t modifyx_submit(OSDModify *wr, ObjectExtent& ex, tid_t tid=0); - tid_t stat_submit(OSDStat *st); - - // public interface - public: - bool is_active() { - return !(op_read.empty() && op_modify.empty()); - } - - int get_client_incarnation() { return client_inc; } - void set_client_incarnation(int inc) { - client_inc = inc; - } - - // med level - tid_t readx(OSDRead *read, Context *onfinish); - tid_t modifyx(OSDModify *wr, Context *onack, Context *oncommit); - //tid_t lockx(OSDLock *l, Context *onack, Context *oncommit); - - // even lazier - tid_t read(object_t oid, off_t off, size_t len, bufferlist *bl, - Context *onfinish, - objectrev_t rev=0); - tid_t write(object_t oid, off_t off, size_t len, bufferlist &bl, - Context *onack, Context *oncommit, - objectrev_t rev=0); - tid_t zero(object_t oid, off_t off, size_t len, - Context *onack, Context *oncommit, - objectrev_t rev=0); - tid_t stat(object_t oid, off_t *size, Context *onfinish, - objectrev_t 
rev=0); - - tid_t lock(int op, object_t oid, Context *onack, Context *oncommit); - - - void ms_handle_failure(Message *m, entity_name_t dest, const entity_inst_t& inst); - -}; - -#endif diff --git a/branches/marnberg/quota/script/add_header.pl b/branches/marnberg/quota/script/add_header.pl deleted file mode 100755 index f5891cc668c45..0000000000000 --- a/branches/marnberg/quota/script/add_header.pl +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/perl - -use strict; -my $fn = shift @ARGV; -my $f = `cat $fn`; - -my $header = '// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -'; - -unless ($f =~ /Ceph - scalable distributed file system/) { - open(O, ">$fn.new"); - print O $header; - print O $f; - close O; - rename "$fn.new", $fn; -} - diff --git a/branches/marnberg/quota/script/comb.pl b/branches/marnberg/quota/script/comb.pl deleted file mode 100755 index 88a4bb72a7970..0000000000000 --- a/branches/marnberg/quota/script/comb.pl +++ /dev/null @@ -1,113 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my $xaxis = shift @ARGV; -my @vars; -while (@ARGV) { - $_ = shift @ARGV; - last if ($_ eq '-'); - push(@vars, $_); -} -my @dirs; -while (@ARGV) { - $_ = shift @ARGV; - last if ($_ eq '-'); - push(@dirs, $_) if -d $_; -} -my @filt = @ARGV; -push( @filt, '.' ) unless @filt; - -print "#xaxis $xaxis -#vars @vars -#dirs @dirs -#filt @filt -"; - -sub load_sum { - my $fn = shift @_; - - open(I, "$fn"); - my $k = ; - chomp($k); - my @k = split(/\s+/,$k); - shift @k; - - my $s; - while () { - chomp; - s/^\#//; - next unless $_; - my @l = split(/\s+/,$_); - my $k = shift @l; - for my $f (@k) { - $s->{$k}->{$f} = shift @l; - } - - # clnode latency? 
- if ($fn =~ /cl/) { - $s->{$k}->{'wrlat'} = $s->{$k}->{'wrlsum'} / $s->{$k}->{'wrlnum'} if $s->{$k}->{'wrlnum'} > 0; - $s->{$k}->{'rlat'} = $s->{$k}->{'rlsum'} / $s->{$k}->{'rlnum'} if $s->{$k}->{'rlnum'} > 0; - $s->{$k}->{'lat'} = $s->{$k}->{'lsum'} / $s->{$k}->{'lnum'} if $s->{$k}->{'lnum'} > 0; - $s->{$k}->{'latw'} = $s->{$k}->{'lwsum'} / $s->{$k}->{'lwnum'} if $s->{$k}->{'lwnum'} > 0; - $s->{$k}->{'latr'} = $s->{$k}->{'lrsum'} / $s->{$k}->{'lrnum'} if $s->{$k}->{'lrnum'} > 0; - $s->{$k}->{'statlat'} = $s->{$k}->{'lstatsum'} / $s->{$k}->{'lstatnum'} if $s->{$k}->{'lstatnum'} > 0; - $s->{$k}->{'dirlat'} = $s->{$k}->{'ldirsum'} / $s->{$k}->{'ldirnum'} if $s->{$k}->{'ldirnum'} > 0; - } - } - return $s; -} - - -my %res; -my @key; -my %didkey; -for my $f (@filt) { - my @reg = split(/,/, $f); - #print "reg @reg\n"; - for my $d (@dirs) { - if ($f ne '.') { - my $r = (split(/\//,$d))[-1]; - my @db = split(/,/, $r); - #print "db @db\n"; - my $ok = 1; - for my $r (@reg) { - - $ok = 0 unless grep {$_ eq $r} @db; - } - next unless $ok; - } - #next if ($f ne '.' && $d !~ /$reg/); - #print "$d\n"; - my ($x) = $d =~ /$xaxis=(\d+)/; - - for my $v (@vars) { - my ($what, $field) = $v =~ /^(.+)\.([^\.]+)$/; - #print "$what $field .. $v .. $f.$field\n"; - my $s = &load_sum("$d/sum.$what"); - - #print "\t$v"; - if ($field =~ /^sum=/) { - #warn "SUM field $field\n"; - push( @{$res{$x}}, $s->{'sum'}->{$'} ); #'}); - } else { - #warn "avg field $field\n"; - push( @{$res{$x}}, $s->{'avgval'}->{$field} ); - } - - push( @key, "$f.$field" ) unless $didkey{"$f.$field"}; - $didkey{"$f.$field"} = 1; - - if (0 && exists $s->{'avgvaldevt'}) { - push( @{$res{$x}}, $s->{'avgvaldevt'}->{$field} ); - push( @key, "$f.$field.dev" ) unless $didkey{"$f.$field.dev"}; - $didkey{"$f.$field.dev"} = 1; - } - } - } -} - -print join("\t", "#", @key) . "\n"; -for my $x (sort {$a <=> $b} keys %res) { - print join("\t", $x, @{$res{$x}}) . 
"\n"; -} diff --git a/branches/marnberg/quota/script/find_auth_pins.pl b/branches/marnberg/quota/script/find_auth_pins.pl deleted file mode 100755 index c02c12922ed7b..0000000000000 --- a/branches/marnberg/quota/script/find_auth_pins.pl +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/perl - -my %pin; -my %hist; -my $l = 1; -my @pins; -while (<>) { - - #cdir:adjust_nested_auth_pins on [dir 163 /foo/ rep@13 | child] count now 0 + 1 - - if (/adjust_nested_auth_pins/) { - my ($what) = /\[(\w+ \d+) /; - $hist{$what} .= "$l: $_" - if defined $pin{$what}; - } - - # cinode:auth_pin on inode [1000000002625 /gnu/blah_client_created. 0x89b7700] count now 1 + 0 - - if (/auth_pin /) { - my ($what) = /\[(\w+ \d+) /; -# print "add_waiter $c $what\n"; - $pin{$what}++; - $hist{$what} .= "$l: $_"; - push( @pins, $what ) unless grep {$_ eq $what} @pins; - } - - # cinode:auth_unpin on inode [1000000002625 (dangling) 0x89b7700] count now 0 + 0 - - if (/auth_unpin/) { - my ($what) = /\[(\w+ \d+) /;# / on (.*\])/; - $pin{$what}--; - $hist{$what} .= "$l: $_"; - unless ($pin{$what}) { - delete $hist{$what}; - delete $pin{$what}; - @pins = grep {$_ ne $what} @pins; - } - } - $l++; -} - -for my $what (@pins) { - print "---- count $pin{$what} on $what -$hist{$what} -"; -} diff --git a/branches/marnberg/quota/tcpfuse.cc b/branches/marnberg/quota/tcpfuse.cc deleted file mode 100644 index 3d7be50d377d6..0000000000000 --- a/branches/marnberg/quota/tcpfuse.cc +++ /dev/null @@ -1,80 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mds/MDCluster.h" -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "client/Client.h" -#include "client/fuse.h" - -#include "msg/TCPMessenger.h" - -#include "common/Timer.h" - -#include - -#include -#include -#include - -int main(int argc, char **argv, char *envp[]) { - - //cerr << "tcpfuse starting " << myrank << "/" << world << endl; - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - // args for fuse - vec_to_argv(args, argc, argv); - - // start up tcpmessenger - tcpaddr_t nsa; - if (tcpmessenger_findns(nsa) < 0) exit(1); - tcpmessenger_init(); - tcpmessenger_start(); - tcpmessenger_start_rankserver(nsa); - - Client *client = new Client(new TCPMessenger(MSG_ADDR_CLIENT_NEW)); - client->init(); - - // start up fuse - // use my argc, argv (make sure you pass a mount point!) - cout << "mounting" << endl; - client->mount(); - - cerr << "starting fuse on pid " << getpid() << endl; - ceph_fuse_main(client, argc, argv); - cerr << "fuse finished on pid " << getpid() << endl; - - client->unmount(); - cout << "unmounted" << endl; - client->shutdown(); - - delete client; - - // wait for it to finish - tcpmessenger_wait(); - tcpmessenger_shutdown(); // shutdown MPI - - return 0; -} - diff --git a/branches/marnberg/quota/tcpsyn.cc b/branches/marnberg/quota/tcpsyn.cc deleted file mode 100644 index cc9f470640c36..0000000000000 --- a/branches/marnberg/quota/tcpsyn.cc +++ /dev/null @@ -1,292 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mds/MDCluster.h" -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "mon/Monitor.h" -#include "client/Client.h" -#include "client/SyntheticClient.h" - -#include "msg/TCPMessenger.h" - -#include "common/Timer.h" - -#define NUMMDS g_conf.num_mds -#define NUMOSD g_conf.num_osd -#define NUMCLIENT g_conf.num_client - -class C_Test : public Context { -public: - void finish(int r) { - cout << "C_Test->finish(" << r << ")" << endl; - } -}; - - -#include "msg/mpistarter.cc" - -utime_t tick_start; -int tick_count = 0; - -class C_Tick : public Context { -public: - void finish(int) { - utime_t now = g_clock.now() - tick_start; - dout(0) << "tick +" << g_conf.tick << " -> " << now << " (" << tick_count << ")" << endl; - tick_count += g_conf.tick; - utime_t next = tick_start; - next.sec_ref() += tick_count; - g_timer.add_event_at(next, new C_Tick); - } -}; - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << endl; - exit(1); - } -}; - -class C_Debug : public Context { - public: - void finish(int) { - int size = &g_conf.debug_after - &g_conf.debug; - memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size); - dout(0) << "debug_after flipping debug settings" << endl; - } -}; - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - parse_config_options(args); - - parse_syn_options(args); - - if (g_conf.kill_after) - g_timer.add_event_after(g_conf.kill_after, new C_Die); - if (g_conf.debug_after) - g_timer.add_event_after(g_conf.debug_after, new C_Debug); - - if (g_conf.tick) { - tick_start = g_clock.now(); - g_timer.add_event_after(g_conf.tick, new C_Tick); - } - - vector nargs; - for (unsigned i=0; i mpiwho = mpi_bootstrap_tcp(argc, argv); - int myrank = mpiwho.first; - int world = mpiwho.second; - - int need = 0; - if (g_conf.tcp_skip_rank0) need++; - need += NUMMDS; - need += NUMOSD; - if (NUMCLIENT) 
{ - if (!g_conf.tcp_overlay_clients) - need += 1; - } - assert(need <= world); - - if (myrank == 0) - cerr << "nummds " << NUMMDS << " numosd " << NUMOSD << " numclient " << NUMCLIENT << " .. need " << need << ", have " << world << endl; - - MDCluster *mdc = new MDCluster(NUMMDS, NUMOSD); - - - char hostname[100]; - gethostname(hostname,100); - int pid = getpid(); - - int started = 0; - - //if (myrank == 0) g_conf.debug = 20; - - // create mon - if (myrank == 0) { - Monitor *mon = new Monitor(0, new TCPMessenger(MSG_ADDR_MON(0))); - mon->init(); - } - - // create mds - MDS *mds[NUMMDS]; - OSD *mdsosd[NUMMDS]; - for (int i=0; iinit(); - started++; - - if (g_conf.mds_local_osd) { - mdsosd[i] = new OSD(i+10000, new TCPMessenger(MSG_ADDR_OSD(i+10000))); - mdsosd[i]->init(); - } - } - - // create osd - OSD *osd[NUMOSD]; - for (int i=0; iinit(); - started++; - } - - if (g_conf.tcp_overlay_clients) sleep(5); - - // create client - int skip_osd = NUMOSD; - if (g_conf.tcp_overlay_clients) - skip_osd = 0; // put clients with osds too! - int client_nodes = world - NUMMDS - skip_osd - g_conf.tcp_skip_rank0; - int clients_per_node = 1; - if (NUMCLIENT) clients_per_node = (NUMCLIENT-1) / client_nodes + 1; - set clientlist; - Client *client[NUMCLIENT]; - SyntheticClient *syn[NUMCLIENT]; - for (int i=0; iinit(); - started++; - - syn[i] = new SyntheticClient(client[i]); - } - - if (!clientlist.empty()) dout(2) << "i have " << clientlist << endl; - - int nclients = 0; - for (set::iterator it = clientlist.begin(); - it != clientlist.end(); - it++) { - int i = *it; - - //cerr << "starting synthetic client" << i << " on rank " << myrank << endl; - client[i]->mount(); - syn[i]->start_thread(); - - nclients++; - } - if (nclients) { - cerr << nclients << " clients on tcprank " << tcpmessenger_get_rank() << " " << hostname << "." 
<< pid << endl; - } - - for (set::iterator it = clientlist.begin(); - it != clientlist.end(); - it++) { - int i = *it; - - // cout << "waiting for synthetic client" << i << " to finish" << endl; - syn[i]->join_thread(); - delete syn[i]; - - client[i]->unmount(); - //cout << "client" << i << " unmounted" << endl; - client[i]->shutdown(); - } - - - if (myrank && !started) { - //dout(1) << "IDLE" << endl; - cerr << "idle on tcprank " << tcpmessenger_get_rank() << " " << hostname << "." << pid << endl; - tcpmessenger_stop_rankserver(); - } - - // wait for everything to finish - tcpmessenger_wait(); - - if (started) cerr << "tcpsyn finishing" << endl; - - tcpmessenger_shutdown(); - - - /* - // cleanup - for (int i=0; i ~/.valgrindrc -# --suppressions=valgrind.supp -# - - -# this one makes valgrind shut up about what appears to be a bug in libc's writev. -{ - writev uninit bytes thing -sage - Memcheck:Param - writev(vector[...]) - fun:writev - fun:_ZN11BlockDevice6_writeEijjRN6buffer4listE - fun:_ZN11BlockDevice5do_ioEiRSt4listIPNS_6biovecESaIS2_EE - fun:_ZN11BlockDevice15io_thread_entryEv - fun:_ZN11BlockDevice8IOThread5entryEv - fun:_ZN6Thread11_entry_funcEPv - fun:start_thread - fun:clone - obj:* - obj:* - obj:* - obj:* -} diff --git a/branches/sage/crush/COPYING b/branches/sage/crush/COPYING deleted file mode 100644 index 5ab7695ab8cab..0000000000000 --- a/branches/sage/crush/COPYING +++ /dev/null @@ -1,504 +0,0 @@ - GNU LESSER GENERAL PUBLIC LICENSE - Version 2.1, February 1999 - - Copyright (C) 1991, 1999 Free Software Foundation, Inc. - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - -[This is the first released version of the Lesser GPL. It also counts - as the successor of the GNU Library Public License, version 2, hence - the version number 2.1.] 
- - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -Licenses are intended to guarantee your freedom to share and change -free software--to make sure the software is free for all its users. - - This license, the Lesser General Public License, applies to some -specially designated software packages--typically libraries--of the -Free Software Foundation and other authors who decide to use it. You -can use it too, but we suggest you first think carefully about whether -this license or the ordinary General Public License is the better -strategy to use in any particular case, based on the explanations below. - - When we speak of free software, we are referring to freedom of use, -not price. Our General Public Licenses are designed to make sure that -you have the freedom to distribute copies of free software (and charge -for this service if you wish); that you receive source code or can get -it if you want it; that you can change the software and use pieces of -it in new free programs; and that you are informed that you can do -these things. - - To protect your rights, we need to make restrictions that forbid -distributors to deny you these rights or to ask you to surrender these -rights. These restrictions translate to certain responsibilities for -you if you distribute copies of the library or if you modify it. - - For example, if you distribute copies of the library, whether gratis -or for a fee, you must give the recipients all the rights that we gave -you. You must make sure that they, too, receive or can get the source -code. If you link other code with the library, you must provide -complete object files to the recipients, so that they can relink them -with the library after making changes to the library and recompiling -it. And you must show them these terms so they know their rights. 
- - We protect your rights with a two-step method: (1) we copyright the -library, and (2) we offer you this license, which gives you legal -permission to copy, distribute and/or modify the library. - - To protect each distributor, we want to make it very clear that -there is no warranty for the free library. Also, if the library is -modified by someone else and passed on, the recipients should know -that what they have is not the original version, so that the original -author's reputation will not be affected by problems that might be -introduced by others. - - Finally, software patents pose a constant threat to the existence of -any free program. We wish to make sure that a company cannot -effectively restrict the users of a free program by obtaining a -restrictive license from a patent holder. Therefore, we insist that -any patent license obtained for a version of the library must be -consistent with the full freedom of use specified in this license. - - Most GNU software, including some libraries, is covered by the -ordinary GNU General Public License. This license, the GNU Lesser -General Public License, applies to certain designated libraries, and -is quite different from the ordinary General Public License. We use -this license for certain libraries in order to permit linking those -libraries into non-free programs. - - When a program is linked with a library, whether statically or using -a shared library, the combination of the two is legally speaking a -combined work, a derivative of the original library. The ordinary -General Public License therefore permits such linking only if the -entire combination fits its criteria of freedom. The Lesser General -Public License permits more lax criteria for linking other code with -the library. - - We call this license the "Lesser" General Public License because it -does Less to protect the user's freedom than the ordinary General -Public License. 
It also provides other free software developers Less -of an advantage over competing non-free programs. These disadvantages -are the reason we use the ordinary General Public License for many -libraries. However, the Lesser license provides advantages in certain -special circumstances. - - For example, on rare occasions, there may be a special need to -encourage the widest possible use of a certain library, so that it becomes -a de-facto standard. To achieve this, non-free programs must be -allowed to use the library. A more frequent case is that a free -library does the same job as widely used non-free libraries. In this -case, there is little to gain by limiting the free library to free -software only, so we use the Lesser General Public License. - - In other cases, permission to use a particular library in non-free -programs enables a greater number of people to use a large body of -free software. For example, permission to use the GNU C Library in -non-free programs enables many more people to use the whole GNU -operating system, as well as its variant, the GNU/Linux operating -system. - - Although the Lesser General Public License is Less protective of the -users' freedom, it does ensure that the user of a program that is -linked with the Library has the freedom and the wherewithal to run -that program using a modified version of the Library. - - The precise terms and conditions for copying, distribution and -modification follow. Pay close attention to the difference between a -"work based on the library" and a "work that uses the library". The -former contains code derived from the library, whereas the latter must -be combined with the library in order to run. - - GNU LESSER GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. 
This License Agreement applies to any software library or other -program which contains a notice placed by the copyright holder or -other authorized party saying it may be distributed under the terms of -this Lesser General Public License (also called "this License"). -Each licensee is addressed as "you". - - A "library" means a collection of software functions and/or data -prepared so as to be conveniently linked with application programs -(which use some of those functions and data) to form executables. - - The "Library", below, refers to any such software library or work -which has been distributed under these terms. A "work based on the -Library" means either the Library or any derivative work under -copyright law: that is to say, a work containing the Library or a -portion of it, either verbatim or with modifications and/or translated -straightforwardly into another language. (Hereinafter, translation is -included without limitation in the term "modification".) - - "Source code" for a work means the preferred form of the work for -making modifications to it. For a library, complete source code means -all the source code for all modules it contains, plus any associated -interface definition files, plus the scripts used to control compilation -and installation of the library. - - Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. The act of -running a program using the Library is not restricted, and output from -such a program is covered only if its contents constitute a work based -on the Library (independent of the use of the Library in a tool for -writing it). Whether that is true depends on what the Library does -and what the program that uses the Library does. - - 1. 
You may copy and distribute verbatim copies of the Library's -complete source code as you receive it, in any medium, provided that -you conspicuously and appropriately publish on each copy an -appropriate copyright notice and disclaimer of warranty; keep intact -all the notices that refer to this License and to the absence of any -warranty; and distribute a copy of this License along with the -Library. - - You may charge a fee for the physical act of transferring a copy, -and you may at your option offer warranty protection in exchange for a -fee. - - 2. You may modify your copy or copies of the Library or any portion -of it, thus forming a work based on the Library, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) The modified work must itself be a software library. - - b) You must cause the files modified to carry prominent notices - stating that you changed the files and the date of any change. - - c) You must cause the whole of the work to be licensed at no - charge to all third parties under the terms of this License. - - d) If a facility in the modified Library refers to a function or a - table of data to be supplied by an application program that uses - the facility, other than as an argument passed when the facility - is invoked, then you must make a good faith effort to ensure that, - in the event an application does not supply such function or - table, the facility still operates, and performs whatever part of - its purpose remains meaningful. - - (For example, a function in a library to compute square roots has - a purpose that is entirely well-defined independent of the - application. Therefore, Subsection 2d requires that any - application-supplied function or table used by this function must - be optional: if the application does not supply it, the square - root function must still compute square roots.) 
- -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Library, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Library, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote -it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Library. - -In addition, mere aggregation of another work not based on the Library -with the Library (or with a work based on the Library) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. You may opt to apply the terms of the ordinary GNU General Public -License instead of this License to a given copy of the Library. To do -this, you must alter all the notices that refer to this License, so -that they refer to the ordinary GNU General Public License, version 2, -instead of to this License. (If a newer version than version 2 of the -ordinary GNU General Public License has appeared, then you can specify -that version instead if you wish.) Do not make any other change in -these notices. - - Once this change is made in a given copy, it is irreversible for -that copy, so the ordinary GNU General Public License applies to all -subsequent copies and derivative works made from that copy. - - This option is useful when you wish to copy part of the code of -the Library into a program that is not a library. - - 4. 
You may copy and distribute the Library (or a portion or -derivative of it, under Section 2) in object code or executable form -under the terms of Sections 1 and 2 above provided that you accompany -it with the complete corresponding machine-readable source code, which -must be distributed under the terms of Sections 1 and 2 above on a -medium customarily used for software interchange. - - If distribution of object code is made by offering access to copy -from a designated place, then offering equivalent access to copy the -source code from the same place satisfies the requirement to -distribute the source code, even though third parties are not -compelled to copy the source along with the object code. - - 5. A program that contains no derivative of any portion of the -Library, but is designed to work with the Library by being compiled or -linked with it, is called a "work that uses the Library". Such a -work, in isolation, is not a derivative work of the Library, and -therefore falls outside the scope of this License. - - However, linking a "work that uses the Library" with the Library -creates an executable that is a derivative of the Library (because it -contains portions of the Library), rather than a "work that uses the -library". The executable is therefore covered by this License. -Section 6 states terms for distribution of such executables. - - When a "work that uses the Library" uses material from a header file -that is part of the Library, the object code for the work may be a -derivative work of the Library even though the source code is not. -Whether this is true is especially significant if the work can be -linked without the Library, or if the work is itself a library. The -threshold for this to be true is not precisely defined by law. 
- - If such an object file uses only numerical parameters, data -structure layouts and accessors, and small macros and small inline -functions (ten lines or less in length), then the use of the object -file is unrestricted, regardless of whether it is legally a derivative -work. (Executables containing this object code plus portions of the -Library will still fall under Section 6.) - - Otherwise, if the work is a derivative of the Library, you may -distribute the object code for the work under the terms of Section 6. -Any executables containing that work also fall under Section 6, -whether or not they are linked directly with the Library itself. - - 6. As an exception to the Sections above, you may also combine or -link a "work that uses the Library" with the Library to produce a -work containing portions of the Library, and distribute that work -under terms of your choice, provided that the terms permit -modification of the work for the customer's own use and reverse -engineering for debugging such modifications. - - You must give prominent notice with each copy of the work that the -Library is used in it and that the Library and its use are covered by -this License. You must supply a copy of this License. If the work -during execution displays copyright notices, you must include the -copyright notice for the Library among them, as well as a reference -directing the user to the copy of this License. Also, you must do one -of these things: - - a) Accompany the work with the complete corresponding - machine-readable source code for the Library including whatever - changes were used in the work (which must be distributed under - Sections 1 and 2 above); and, if the work is an executable linked - with the Library, with the complete machine-readable "work that - uses the Library", as object code and/or source code, so that the - user can modify the Library and then relink to produce a modified - executable containing the modified Library. 
(It is understood - that the user who changes the contents of definitions files in the - Library will not necessarily be able to recompile the application - to use the modified definitions.) - - b) Use a suitable shared library mechanism for linking with the - Library. A suitable mechanism is one that (1) uses at run time a - copy of the library already present on the user's computer system, - rather than copying library functions into the executable, and (2) - will operate properly with a modified version of the library, if - the user installs one, as long as the modified version is - interface-compatible with the version that the work was made with. - - c) Accompany the work with a written offer, valid for at - least three years, to give the same user the materials - specified in Subsection 6a, above, for a charge no more - than the cost of performing this distribution. - - d) If distribution of the work is made by offering access to copy - from a designated place, offer equivalent access to copy the above - specified materials from the same place. - - e) Verify that the user has already received a copy of these - materials or that you have already sent this user a copy. - - For an executable, the required form of the "work that uses the -Library" must include any data and utility programs needed for -reproducing the executable from it. However, as a special exception, -the materials to be distributed need not include anything that is -normally distributed (in either source or binary form) with the major -components (compiler, kernel, and so on) of the operating system on -which the executable runs, unless that component itself accompanies -the executable. - - It may happen that this requirement contradicts the license -restrictions of other proprietary libraries that do not normally -accompany the operating system. Such a contradiction means you cannot -use both them and the Library together in an executable that you -distribute. - - 7. 
You may place library facilities that are a work based on the -Library side-by-side in a single library together with other library -facilities not covered by this License, and distribute such a combined -library, provided that the separate distribution of the work based on -the Library and of the other library facilities is otherwise -permitted, and provided that you do these two things: - - a) Accompany the combined library with a copy of the same work - based on the Library, uncombined with any other library - facilities. This must be distributed under the terms of the - Sections above. - - b) Give prominent notice with the combined library of the fact - that part of it is a work based on the Library, and explaining - where to find the accompanying uncombined form of the same work. - - 8. You may not copy, modify, sublicense, link with, or distribute -the Library except as expressly provided under this License. Any -attempt otherwise to copy, modify, sublicense, link with, or -distribute the Library is void, and will automatically terminate your -rights under this License. However, parties who have received copies, -or rights, from you under this License will not have their licenses -terminated so long as such parties remain in full compliance. - - 9. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Library or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Library (or any work based on the -Library), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Library or works based on it. - - 10. 
Each time you redistribute the Library (or any work based on the -Library), the recipient automatically receives a license from the -original licensor to copy, distribute, link with or modify the Library -subject to these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. -You are not responsible for enforcing compliance by third parties with -this License. - - 11. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Library at all. For example, if a patent -license would not permit royalty-free redistribution of the Library by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Library. - -If any portion of this section is held invalid or unenforceable under any -particular circumstance, the balance of the section is intended to apply, -and the section as a whole is intended to apply in other circumstances. - -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system which is -implemented by public license practices. 
Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 12. If the distribution and/or use of the Library is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Library under this License may add -an explicit geographical distribution limitation excluding those countries, -so that distribution is permitted only in or among countries not thus -excluded. In such case, this License incorporates the limitation as if -written in the body of this License. - - 13. The Free Software Foundation may publish revised and/or new -versions of the Lesser General Public License from time to time. -Such new versions will be similar in spirit to the present version, -but may differ in detail to address new problems or concerns. - -Each version is given a distinguishing version number. If the Library -specifies a version number of this License which applies to it and -"any later version", you have the option of following the terms and -conditions either of that version or of any later version published by -the Free Software Foundation. If the Library does not specify a -license version number, you may choose any version ever published by -the Free Software Foundation. - - 14. If you wish to incorporate parts of the Library into other free -programs whose distribution conditions are incompatible with these, -write to the author to ask for permission. For software which is -copyrighted by the Free Software Foundation, write to the Free -Software Foundation; we sometimes make exceptions for this. 
Our -decision will be guided by the two goals of preserving the free status -of all derivatives of our free software and of promoting the sharing -and reuse of software generally. - - NO WARRANTY - - 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO -WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. -EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR -OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY -KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE -LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME -THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN -WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY -AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU -FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR -CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE -LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING -RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A -FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF -SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH -DAMAGES. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Libraries - - If you develop a new library, and you want it to be of the greatest -possible use to the public, we recommend making it free software that -everyone can redistribute and change. You can do so by permitting -redistribution under these terms (or, alternatively, under the terms of the -ordinary General Public License). - - To apply these terms, attach the following notices to the library. 
It is -safest to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least the -"copyright" line and a pointer to where the full notice is found. - - - Copyright (C) - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - -Also add information on how to contact you by electronic and paper mail. - -You should also get your employer (if you work as a programmer) or your -school, if any, to sign a "copyright disclaimer" for the library, if -necessary. Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the - library `Frob' (a library for tweaking knobs) written by James Random Hacker. - - , 1 April 1990 - Ty Coon, President of Vice - -That's all there is to it! 
- - diff --git a/branches/sage/crush/Makefile b/branches/sage/crush/Makefile deleted file mode 100644 index 52fc13494c3c6..0000000000000 --- a/branches/sage/crush/Makefile +++ /dev/null @@ -1,311 +0,0 @@ -# -# until autoconf is set up, here are the options i understand: -# -# darwin=yes -- build on darwin -# fuse=no -- don't build anything requiring FUSE -# mpi=no -- don't build newsyn (require MPI) -# use_ccpp=yes -- use Common C++ for buffer.h reference counting -# want_bdb=yes -- build berkelydb objectstore -# - -# mpicxx must be on your path to build newsyn. -# on googoo, this means that /usr/local/mpich2-1.0.2/bin must be on your path. -# on issdm, it's /usr/local/mpich2/bin. - -# Hook for extra -I options, etc. -EXTRA_CFLAGS = #-I${HOME}/include -L${HOME}/lib -EXTRA_CFLAGS += -g -EXTRA_CFLAGS += -pg -#EXTRA_CFLAGS += -O3 - -# base -CFLAGS = -Wall -I. -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE ${EXTRA_CFLAGS} -LDINC = ld -i -o -CC = g++ -LIBS = -pthread - -# darwin? -ifeq ($(target),darwin) -CFLAGS += -DDARWIN -D__FreeBSD__=10 -LDINC = ar -rc -endif - -# use Common C++ (for buffer.h)? 
-ifeq ($(use_ccpp),yes) -CFLAGS += -D_GNU_SOURCE -DBUFFER_USE_CCPP -LIBS += -lccgnu2 -ldl -endif - - -#for normal mpich2 machines -MPICC = mpicxx -MPICFLAGS = -DMPICH_IGNORE_CXX_SEEK ${CFLAGS} -MPILIBS = ${LIBS} - -#for LLNL boxes without mpicxx -#MPICC = g++ -#MPICFLAGS = ${CFLAGS} -I/usr/lib/mpi/mpi_gnu/include -L/usr/lib/mpi/mpi_gnu/lib -#MPILIBS = ${LIBS} -lelan -lmpi - -EBOFS_OBJS= \ - ebofs/BlockDevice.o\ - ebofs/BufferCache.o\ - ebofs/Ebofs.o\ - ebofs/Allocator.o\ - ebofs/FileJournal.o - -MDS_OBJS= \ - mds/MDS.o\ - mds/journal.o\ - mds/Server.o\ - mds/MDCache.o\ - mds/Locker.o\ - mds/Migrator.o\ - mds/MDBalancer.o\ - mds/CDentry.o\ - mds/CDir.o\ - mds/CInode.o\ - mds/AnchorTable.o\ - mds/AnchorClient.o\ - mds/LogEvent.o\ - mds/IdAllocator.o\ - mds/ClientMap.o\ - mds/MDLog.o - -OSD_OBJS= \ - osd/PG.o\ - osd/ReplicatedPG.o\ - osd/Ager.o\ - osd/FakeStore.o\ - osd/OSD.o -# osd/RAID4PG.o\ - -OSDC_OBJS= \ - osdc/Objecter.o\ - osdc/ObjectCacher.o\ - osdc/Filer.o\ - osdc/Journaler.o - -MON_OBJS= \ - mon/Monitor.o\ - mon/Paxos.o\ - mon/PaxosService.o\ - mon/OSDMonitor.o\ - mon/MDSMonitor.o\ - mon/ClientMonitor.o\ - mon/PGMonitor.o\ - mon/Elector.o\ - mon/MonitorStore.o - -COMMON_OBJS= \ - msg/Message.o\ - common/Logger.o\ - common/Clock.o\ - common/Timer.o\ - config.o - -CLIENT_OBJS= \ - client/FileCache.o\ - client/Client.o\ - client/SyntheticClient.o\ - client/Trace.o - - -# bdbstore? 
-ifeq ($(want_bdb),yes) -CFLAGS += -DUSE_OSBDB -LIBS = -ldb_cxx -OSD_OBJS += osbdb/OSBDB.o -OSBDB_OBJS = \ - osbdb/OSBDB.o -endif - - -# targets -TARGETS = cmon cosd cmds csyn mkmonmap cmonctl fakesyn dupstore -SRCS=*.cc */*.cc *.h */*.h */*/*.h - -ifneq ($(fuse),no) -TARGETS += cfuse fakefuse -endif - -ifneq ($(mpi),no) -TARGETS += newsyn -endif - -all: depend ${TARGETS} - -test: depend ${TEST_TARGETS} - - -# real bits -mkmonmap: mkmonmap.cc common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -extractosdmaps: extractosdmaps.cc common.o osd.o mon.o ebofs.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -cmon: cmon.o mon.o msg/SimpleMessenger.o common.o crush/libcrush.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -cmonctl: cmonctl.cc msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -cosd: cosd.o osd.o ebofs.o msg/SimpleMessenger.o common.o crush/libcrush.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -cmds: cmds.o mds.o osdc.o msg/SimpleMessenger.o common.o crush/libcrush.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -csyn: csyn.o client.o osdc.o msg/SimpleMessenger.o common.o crush/libcrush.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -cfuse: cfuse.o client.o osdc.o client/fuse.o client/fuse_ll.o msg/SimpleMessenger.o common.o crush/libcrush.o - ${CC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@ - - -# code shipping experiments -activemaster: active/activemaster.cc client.o osdc.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -activeslave: active/activeslave.cc client.o osdc.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -echotestclient: active/echotestclient.cc - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -msgtestclient: active/msgtestclient.o client.o osdc.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -libtrivialtask.so: active/trivial_task.cc client.o osdc.o msg/SimpleMessenger.o common.o - ${CC} -fPIC -shared -Wl,-soname,$@.1 ${CFLAGS} ${LIBS} $^ -o $@ - - - -# IPC interface -ipc_server: ceph_ipc/ipc_server.cc client.o osdc.o 
msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -ipc_testclient: ceph_ipc/ipc_testclient.cc ceph_ipc/ipc_client.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - - -# fake* -fakefuse: fakefuse.o mon.o mds.o client.o osd.o osdc.o ebofs.o client/fuse.o client/fuse_ll.o msg/FakeMessenger.o common.o crush/libcrush.o - ${CC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@ - -fakesyn: fakesyn.o mon.o mds.o client.o osd.o ebofs.o osdc.o msg/FakeMessenger.o common.o crush/libcrush.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - - -# mpi startup -newsyn: newsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/SimpleMessenger.o common.o crush/libcrush.o - ${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@ - - -# ebofs -mkfs.ebofs: ebofs/mkfs.ebofs.cc config.cc common/Clock.o ebofs.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -test.ebofs: ebofs/test.ebofs.cc config.cc common/Clock.o ebofs.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -dupstore: dupstore.cc config.cc ebofs.o common/Clock.o common/Timer.o osd/FakeStore.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - - -# hadoop -libhadoopcephfs.so: client/hadoop/CephFSInterface.cc client.o osdc.o msg/SimpleMessenger.o common.o - ${CC} -fPIC -shared -Wl,-soname,$@.1 ${CFLAGS} -I/usr/local/java/include -I/usr/local/java/include/linux ${LIBS} $^ -o $@ - -# libceph -libceph.o: client/ldceph.o client/Client.o msg/SimpleMessenger.o ${COMMON_OBJS} ${SYN_OBJS} ${OSDC_OBJS} - ${LDINC} $^ -o $@ - -# some benchmarking tools -bench/mdtest/mdtest.o: bench/mdtest/mdtest.c - mpicc -c $^ -o $@ - -mdtest: bench/mdtest/mdtest.o - ${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@ - -mdtest.ceph: bench/mdtest/mdtest.o libceph.o - ${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@ - -testos: test/testos.o ebofs.o osbdb.o common.o - ${CC} ${CFLAGS} ${LIBS} ${OSBDB_LIBS} -o $@ $^ - - -# misc -gprof-helper.so: test/gprof-helper.c - gcc -shared -fPIC test/gprof-helper.c -o gprof-helper.so -lpthread -ldl - -test_disk_bw: test/test_disk_bw.cc common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -# crush - 
-crush/libcrush.o: force_look - cd crush ; make - -force_look: - true - -# bits -common.o: ${COMMON_OBJS} - ${LDINC} $@ $^ - -ebofs.o: ${EBOFS_OBJS} - ${LDINC} $@ $^ - -client.o: ${CLIENT_OBJS} - ${LDINC} $@ $^ - -osd.o: ${OSD_OBJS} - ${LDINC} $@ $^ - -osdc.o: ${OSDC_OBJS} - ${LDINC} $@ $^ - -mds.o: ${MDS_OBJS} - ${LDINC} $@ $^ - -mon.o: ${MON_OBJS} - ${LDINC} $@ $^ - -osbdb.o: ${OSBDB_OBJS} - ${LDINC} $@ $^ - -# generic rules -%.so: %.cc - ${CC} -shared -fPIC ${CFLAGS} $< -o $@ - -%.o: %.cc - ${CC} ${CFLAGS} -c $< -o $@ - -%.po: %.cc - ${CC} -fPIC ${CFLAGS} -c $< -o $@ - - -# handy -clean: - rm -f *.o */*.o ${TARGETS} ${TEST_TARGETS} - -count: - cat ${SRCS} | wc -l - cat ${SRCS} | grep -c \; - -TAGS: - etags `find . -name "*.[h|cc]"` - -.depend: - touch .depend - -depend: - $(RM) .depend - makedepend -f- -- $(CFLAGS) -- $(SRCS) > .depend 2>/dev/null -# for f in $(SRCS) ; do cpp -MM $(CFLAGS) $$f 2> /dev/null >> .depend ; done - - -# now add a line to include the dependency list. -include .depend diff --git a/branches/sage/crush/README b/branches/sage/crush/README deleted file mode 100644 index aa016817cebf0..0000000000000 --- a/branches/sage/crush/README +++ /dev/null @@ -1,4 +0,0 @@ -Ceph - a scalable distributed file system ------------------------------------------ - -Please see http://ceph.sourceforge.net/ for current info. diff --git a/branches/sage/crush/TODO b/branches/sage/crush/TODO deleted file mode 100644 index ed581ab0b350b..0000000000000 --- a/branches/sage/crush/TODO +++ /dev/null @@ -1,292 +0,0 @@ - -some smallish projects: - -- crush rewrite in C - - generalize any memory management etc. to allow use in kernel and userspace -- userspace crush tools - - xml import/export? - - ? - -- pg monitor service - - to support statfs? - - general pg health - - some sort of (throttled) osd status reporting - - dynamic pg creation (eventually!) 
- -- SimpleMessenger - - clean up/merge Messenger/Dispatcher interfaces - - auto close idle connections - - delivery ack and buffering, and then reconnect - - take a look at RDS? http://oss.oracle.com/projects/rds/ - -- generalize monitor client? - - throttle message resend attempts - -- ENOSPC on client, OSD - - - - -code cleanup -- endian portability -- word size - - clean up all encoded structures - -general kernel planning -- soft consistency on (kernel) lookup? -- accurate reconstruction of (syscall) path? - - - -sage doc -- mdsmonitor beacon semantics -- cache expiration, cache invariants - - including dual expire states, transition, vs subtree grouping of expire messages -- recovery states, implicit barrier are rejoin -- journal content - - importmaps and up:resolve -- metablob version semantics - - -mdsmon -- per-mds, shared standby queues - - - -mds bugs -- open file rejournaling vs capped log... - - open files vs shutdown in general! need to export any caps on replicated metadata -- export caps to auth on unlinked inodes -- stray purge on shutdown - -- rename slave in-memory rollback on failure - -- fix purge_stray bug -- try_remove_unlinked_dn thing - -- client session open from locker.. doesn't work properly with delays - -> journal the session open _with_ the import(start) - -- proper handling of cache expire messages during rejoin phase? - -- verify once-per-segment jouranl context is working... - -mds -- extend/clean up filepath to allow paths relative to an ino - - fix path_traverse - - fix reconnect/rejoin open file weirdness - -- get rid of replicate objects for replicate_to .. encode to bufferlists directly - -- stray reintegration -- verify stray is empty on shutdown - -- real chdir (directory "open") - - relative metadata ops - -- consistency points/snapshots - - dentry versions vs dirfrags... - -- detect and deal with client failure - - failure during reconnect vs clientmap. although probalby the whole thing needs a larger overhaul... 
- -- inode.max_size -- inode.allocated_size - -- osd needs a set_floor_and_read op for safe failover/STOGITH-like semantics. - -- could mark dir complete in EMetaBlob by counting how many dentries are dirtied in the current log epoch in CDir... - -- fix rmdir empty exported dirfrag race - - export all frags <= 1 item? then we ensure freezing before empty, avoiding any last unlink + export vs rmdir race. - - how to know full dir size (when trimming)? - - put frag size/mtime in fragmap in inode? we will need that anyway for stat on dirs - - will need to make inode discover/import_decode smart about dirfrag auth - - or, only put frag size/mtime in inode when frag is closed. otherwise, soft (journaled) state, possibly on another mds. - - need to move state from replicas to auth. simplelock doesn't currently support that. - - ScatterLock or something? hrm. - -- FIXME how to journal/store root and stray inode content? - - in particular, i care about dirfragtree.. get it on rejoin? - - and dir sizes, if i add that... also on rejoin? - -- efficient stat for single writers -- lstat vs stat? -- add FILE_CAP_EXTEND capability bit -- only share osdmap updates with clients holding capabilities -- delayed replica caps release... we need to set a timer event? (and cancel it when appropriate?) - - - -osdmon -- allow fresh replacement osds. add osd_created in osdmap, probably -- monitor needs to monitor some osds... -- monitor pg states, notify on out? -- watch osd utilization; adjust overload in cluster map - -journaler -- fix up for large events (e.g. imports) -- use set_floor_and_read for safe takeover from possibly-not-quite-dead otherguy. -- should we pad with zeros to avoid splitting individual entries? - - make it a g_conf flag? - - have to fix reader to skip over zeros (either <4 bytes for size, or zeroed sizes) -- need to truncate at detected (valid) write_pos to clear out any other partial trailing writes - - -crush -- xml import/export? 
-- crush tools - - -rados snapshots -- integrate revisions into ObjectCacher -- clean up oid.rev vs op.rev in osd+osdc - -- attr.crev is rev we were created in. -- oid.rev=0 is "live". defined for attr.crev <= rev. -- otherwise, defined for attr.crev <= rev < oid.rev (i.e. oid.rev is upper bound, non-inclusive.) - -- write|delete is tagged with op.rev - - if attr.crev < op.rev - - we clone to oid.rev=rev (clone keeps old crev) - - change live attr.crev=rev. - - apply update -- read is tagged with op.rev - - if 0, we read from 0 (if it exists). - - otherwise we choose object rev based on op.rev vs oid.rev, and then verifying attr.crev <= op.rev. - -- how to get usage feedback to monitor? - -- clean up mds caps release in exporter -- figure out client failure modes -- add connection retry. - - -objecter -- maybe_request_map should set a timer event to periodically re-request. -- transaction prepare/commit -- read+floor_lockout - -osd/rados -- transaction prepare/commit - - rollback - - rollback logging (to fix slow prepare vs rollback race) -- read+floor_lockout for clean STOGITH-like/fencing semantics after failover. - -- consider implications of nvram writeahead logs -- clean shutdown? -- pgmonitor should supplement failure detection - -- flag missing log entries on crash recovery --> WRNOOP? or WRLOST? - -- efficiently replicate clone() objects -- fix heartbeat wrt new replication -- mark residual pgs obsolete ??? -- rdlocks -- optimize remove wrt recovery pushes -- report crashed pgs? - -messenger -- fix messenger shutdown.. we shouldn't delete messenger, since the caller may be referencing it, etc. - -simplemessenger -- close idle connections -- buffer sent messages until a receive is acknowledged (handshake!) - - retry, timeout on connection or transmission failure -- exponential backoff on monitor resend attempts (actually, this should go outside the messenger!) 
- -objectcacher -- merge clean bh's -- ocacher caps transitions vs locks -- test read locks - -reliability -- heartbeat vs ping? -- osdmonitor, filter - -ebofs -- allow holes - -- verify proper behavior of conflicting/overlapping reads of clones -- combine inodes and/or cnodes into same blocks -- allow btree sets instead of maps -- eliminate nodepools -- nonblocking write on missing onodes? -- fix bug in node rotation on insert (and reenable) -- fix NEAR_LAST_FWD (?) - -- awareness of underlying software/hardware raid in allocator so that we - write full stripes _only_. - - hmm, that's basically just a large block size. - -- rewrite the btree code! - - multithreaded - - eliminate nodepools - - allow btree sets - - allow arbitrary embedded data? - - allow arbitrary btrees - - allow root node(s?) to be embedded in onode, or whereever. - - keys and values can be uniform (fixed-size) or non-uniform. - - fixed size (if any) is a value in the btree struct. - - negative indicates bytes of length value? (1 -> 255bytes, 2 -> 65535 bytes, etc.?) - - non-uniform records preceeded by length. - - keys sorted via a comparator defined in btree root. - - lexicographically, by default. - -- goal - - object btree key->value payload, not just a data blob payload. - - better threading behavior. - - with transactional goodness! - -- onode - - object attributes.. as a btree? - - blob stream - - map stream. - - allow blob values. - - - - - - -remaining hard problems -- how to cope with file size changes and read/write sharing - - -crush -- more efficient failure when all/too many osds are down -- allow forcefeed for more complicated rule structures. (e.g. make force_stack a list< set >) - - - - -client -- fstat -- mixed lazy and non-lazy io will clobber each others' caps in the buffer cache.. how to isolate.. -- test client caps migration w/ mds exports -- some heuristic behavior to consolidate caps to inode auth? 
- - - - - - - -why qsync could be wrong (for very strict POSIX) : varying mds -> client message transit or processing times. -- mds -> 1,2 : qsync -- client1 writes at byte 100 -- client1 -> mds : qsync reply (size=100) -- client1 writes at byte 300 -- client1 -> client2 (outside channel) -- client2 writes at byte 200 -- client2 -> mds : qsync reply (size=200) --> stat results in size 200, even though at no single point in time was the max size 500. --> for correct result, need to _stop_ client writers while gathering metadata. - - -- dump active config in run output somewhere - - - - - - diff --git a/branches/sage/crush/active/README b/branches/sage/crush/active/README deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/branches/sage/crush/active/activeslave.cc b/branches/sage/crush/active/activeslave.cc deleted file mode 100644 index d2953490f9d69..0000000000000 --- a/branches/sage/crush/active/activeslave.cc +++ /dev/null @@ -1,510 +0,0 @@ -/* - * This is a slave for receiving and executing commands for - * compute tasks on an OSD. This supersedes the daemon - * version in activetaskd.h/cc, because it's easier to debug - * if it's not a daemon. - * - * Networking code is based off examples from Stevens' UNIX Network Programming. - */ - -#include "activeslave.h" - -int main(int argc, const char* argv[]) { - - /* Set up TCP server */ - int sockfd, newsockfd, childpid; - socklen_t clilen; - struct sockaddr_in cli_addr, serv_addr; - - //const char *pname = argv[0]; // process name - - // Open a TCP socket - if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { - cerr << "slave: can't open TCP socket. Exiting." << endl; - exit(-1); - } - cerr << "slave: opened TCP socket." 
<< endl; - - // set up the port - bzero((char*) &serv_addr, sizeof(serv_addr)); - serv_addr.sin_family = AF_INET; - serv_addr.sin_addr.s_addr = htonl(INADDR_ANY); - serv_addr.sin_port = htons(SERV_TCP_PORT); - - if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0) { - cerr << "slave: can't bind local address. Exiting." << endl; - exit(-1); - } - - if(listen(sockfd, SOMAXCONN) < 0) { - cerr << "slave: listening error. Exiting." << endl; - exit(-1); - } - - - /* The Big Loop */ - while (1) { - - // wait for a message and fork off a child process to handle it - clilen = sizeof(cli_addr); - newsockfd = accept(sockfd, - (struct sockaddr *) &cli_addr, - &clilen); - - if (newsockfd < 0) { - cerr << "slave: accept error. Exiting." << endl; - exit(-1); - } - - if ((childpid = fork()) < 0) { - cerr << "slave: fork error. Exiting." << endl; - exit(-1); - } - - else if (childpid == 0) { // child process - cerr << "Forked child process for incoming socket" << endl; - close(sockfd); - process_request(newsockfd); - cerr << "Finished processing request. Exiting child." << endl; - exit(0); - } - - close (newsockfd); // parent - - //sleep(30); /* wait 30 seconds */ - } - exit(EXIT_SUCCESS); -} - - -/* This will process requests from the master. - * The protocol in a nutshell: - * Master opens a socket to slave, and sends - * one message. - * Slave replies with one message. - * Socket is closed. - */ - -void process_request(int newsockfd) { - - // first, read the message type. - int msg_type = readmsgtype(newsockfd); - - - // Second, call some function based on the message type to process - // the rest of the message. The function is responsible for the rest - // of the message; this includes checking the message footer. 
- - switch(msg_type) { - - case PING: // ping - process_ping(newsockfd); - break; - case STARTTASK: // start_task - process_start_task(newsockfd); - break; - case RETRIEVELOCALFILE: // get_local - process_get_local(newsockfd); - break; - case SHIPCODE: - process_shipcode(newsockfd); - break; - - case PINGREPLY: - case FINISHEDTASK: - case TASKFAILED: - case SENDLOCALFILE: - case LOCALFILENOTFOUND: - case CODESAVED: - case SHIPFAILED: - cerr << "activeslave: BUG received message " << CMD_LIST[msg_type] << - " from master; master should never send this message." << endl; - exit(-1); - break; - - - case -1: - cerr << "activeslave: message had an unidentifiable type. " << - "Closing socket and discarding rest of message." << endl; - default: - cerr << "activeslave: BUG! received unexpected return value of" << msg_type << - "from readmsgtype(). Closing socket and discarding rest of message." << endl; - - exit(-1); - } -} - - -// Just write a ping_reply to the socket. -void process_ping(int fd) { - - // make sure the footer is valid - if (!check_footer(fd)) { - cerr << "process_ping warning: ping message has invalid or missing footer." - << endl; - } - // Even if the footer's invalid, send the reply. - cerr << "Replying to ping..." << endl; - send_msg_header(fd, PINGREPLY); - send_msg_footer(fd); - cerr << "Ping processing completed." << endl; -} - - - -// Process a start_task message. This reads the incoming message and -// starts the corresponding task. - -// Parameter format: taskID(int) command(string) -// cephinputfile(string) offset(long) length(long) localoutputfile - -// WARNING: currently has the trivial task hardwired. It -// ignores the command and the output file. 
-void process_start_task(int fd) { - - char command[MAX_STRING_SIZE + 1]; - char cephinputfile[MAX_STRING_SIZE + 1]; - char localoutputfile[MAX_STRING_SIZE + 1]; - - cout << "in process_start_task: "; - int taskID = read_positive_int(fd); - cout << "read taskID " << taskID; - - read_string(fd, command); - cout << ", command " << command; - - read_string(fd, cephinputfile); - cout << ", cephinputfile " << cephinputfile; - off_t offset = read_off_t(fd); - cout << ", offset " << offset; - off_t length = read_off_t(fd); - cout << ", length " << length; - - read_string(fd, localoutputfile); - cout << ", localoutputfile " << localoutputfile << endl; - - // make sure the footer is valid - if (!check_footer(fd)) { - cerr << "process_start_task warning: message has invalid or missing footer. " - << "Discarding message." << endl; - exit(-1); - } - - - // To do: modify to load the task from a library instead of just - // using the hardwired one. - - void (*task)(const char*, const char*, off_t, off_t) = 0; - task = start_trivial_task; - - - - // start a task; create an output filename that uses the task ID, 'cause we might - // end up with multiple pieces of a file on each OSD. - // WARNING: always does the trivial task; prints answer to stdout but - // does not write it to disk. - cerr << "starting task: " << endl; - //start_trivial_task(cephinputfile, localoutputfile, offset, length); - task(cephinputfile, localoutputfile, offset, length); - cerr << "returned from task! Sending reply:" << endl; - - - - // send the reply - send_msg_header(fd, FINISHEDTASK); - write_positive_int(fd, taskID); - send_msg_footer(fd); - - // done - cout << "Done sending reply for taskID " << taskID << endl; - return; -} - - - -void start_trivial_task (const char* ceph_filename, const char* local_filename, - off_t offset, off_t length) { - // Don't bother to copy the file to disk. Read the file directly from Ceph, - // and add up all the bytes. - // Write the total to the local file as a string. 
- Client * client = startCephClient(); - - bufferptr bp(CHUNK); - - // get the source file's size. Sanity-check the request range. - struct stat st; - int r = client->lstat(ceph_filename, &st); - assert (r == 0); - - off_t src_total_size = st.st_size; - if (src_total_size < offset + length) { - cerr << "Error in copy ExtentToLocalFile: offset + length = " << offset << " + " << length - << " = " + (offset + length) << ", source file size is only " << src_total_size << endl; - exit(-1); - } - off_t remaining = length; - - // open the file and seek to the start position - cerr << "start_trivial_task: opening the source file and seeking " << endl; - - int fh_ceph = client->open(ceph_filename, O_RDONLY); - assert (fh_ceph > -1); - r = client->lseek(fh_ceph, offset, SEEK_SET); - assert (r == offset); - - int counter = 0; - // read through the extent and add up the bytes - cerr << "start_trivial_task: counting up bytes" << endl; - char* bp_c = bp.c_str(); - while (remaining > 0) { - off_t got = client->read(fh_ceph, bp_c, MIN(remaining,CHUNK), -1); - assert(got > 0); - remaining -= got; - for (off_t i = 0; i < got; ++i) { - counter += (unsigned int)(bp_c[i]); - } - } - cerr << "start_trivial_task: Done! Answer is " << counter << endl; - client->close(fh_ceph); - - //assert(0); -} - - -// Starts a sloppy grep count of the hardwired search string over the -// given Ceph file extent. It's sloppy because it copies the given -// extent to a local file and runs "grep" on it, with no effort to take -// care of boundary issues. -void start_sloppy_grepcount (const char* ceph_filename, const char* local_filename, - long offset, long size) { - - Client* client = startCephClient(); - char* search_string = "the"; - // copy the file to a local file. - - copyExtentToLocalFile (client, ceph_filename, offset, size, local_filename); - // we want: grep -c search_string local_filename - // to get the number of occurrences of the string. 
- string command = ""; - command.append("grep -c "); - command.append(search_string); - command.append(local_filename); - - assert(0); - -} - - -// Processes a SHIPCODE message. The message will have a shared -// library attached to it, which must be stored locally. - -void process_shipcode(int fd) { - - - // get the size of the shared library - size_t library_size = read_size_t(fd); - - - // save the library to a file - cerr << "saving library..." << endl; - const char* libfile = "/tmp/libslavetask.so"; - int local_fd = ::open(libfile, O_WRONLY | O_CREAT | O_TRUNC); - if (local_fd < 0) { - cerr << "Error opening " << libfile << " for writing." << endl; - exit(-1); - } - - off_t remaining = library_size; - - bufferptr bp(CHUNK); - char* bp_c = bp.c_str(); - while (remaining > 0) { - off_t got = readn(fd, bp_c, MIN(remaining, CHUNK)); - assert(got > 0); - remaining -= got; - ssize_t written = ::write(local_fd, bp_c, got); - assert (written == got); - } - cerr << "Received shared library and stored as " << libfile << endl; - -} - - -// Processes a get_local message. The message -// specifies the filename of a local file to -// return to the sender. - -// Parameter format: requestID(int) localfilename(string) - -// INCOMPLETE: currently just reads the message. - - -void process_get_local(int fd) { - cout << "in process_get_local: "; - int taskID = read_positive_int(fd); - cout << "read taskID " << taskID; - - char localfilename[MAX_STRING_SIZE+1]; - read_string(fd, localfilename); - cout << ", localfilename " << localfilename << endl; - - - // make sure the footer is valid - if (!check_footer(fd)) { - cerr << "process_get_local warning: message has invalid or missing footer." - << endl; - } - - // not implemented - cerr << "Error: get_local command unimplemented." << endl; - assert(0); -} - - -// Retrieves a formatted message from the socket. -// At the moment, this just reads and prints a fixed- -// length message type. -// DEPRECATED. 
-void str_getmsg(int sockfd) { - - int n; - - // read message types until the connection dies - while(true) { - n = readmsgtype(sockfd); - if (n != 0) { - cerr << "from getmsg: some sort of error" << endl; - exit(-1); - } - } -} - -// Echo a stream socket message back to the sender. -// DEPRECATED. -void str_echo(int sockfd) { - - int n; - char line[MAXLINE]; - - while(true) { - - // read from the stream - cerr << "str_echo: waiting for a line" << endl; - n = readline(sockfd, line, MAXLINE); - cerr << "str_echo: read a line" << endl; - if (0 == n) { - cerr << "str_echo: connection terminated" << endl; - return; // connection is terminated - } - else if (n < 0) { - cerr << "str_echo: readline error" << endl; - exit(-1); - } - - // write back to the stream - if (n != writen(sockfd, line, n)) { - cerr << "str_echo: writen error" << endl; - exit(-1); - } - else - cerr << "Echoed line " << endl; - } -} - - -void str_ack(int sockfd) { - - int n; - char line[MAXLINE]; - //char *ack = "ack"; - - while(true) { - - // read from the stream - n = readline(sockfd, line, MAXLINE); - - if (0 == n) - return; // connection is terminated - else if (n < 0) - //err_dump("str_echo: readline error"); - exit(-1); - - // write back to the stream - if (4 != writen(sockfd, "ack\n", 4)) - //err_dump("str_echo: writen error"); - exit(-1); - } -} - - - -// Read command lines from the socket and execute them - -void str_run(int sockfd) { - - int n; - char line[MAXLINE]; - char* error_msg = "str_run: No command interpreter found\n"; - char* ack_msg = "Running command... 
"; - char* commit_msg = "Command executed!\n"; - - while(true) { - - // read from the stream - n = readline(sockfd, line, MAXLINE); - - if (0 == n) - return; // connection is terminated - else if (n < 0) - //err_dump("str_echo: readline error"); - exit(-1); - - if (system(NULL)) { - writen(sockfd, ack_msg, strlen(ack_msg)); - system(line); - writen(sockfd, commit_msg, strlen(commit_msg)); - } - else if ((int)strlen(error_msg) != writen(sockfd, error_msg, strlen(error_msg))) - //err_dump("str_echo: writen error"); - exit(-1); - } -} - - -// take a filename and copy it from Ceph to a local directory. -// Not completed. - -void str_copytolocal(int sockfd) { - - int n; - char line[MAXLINE]; - char* error_msg = "str_copy: No command interpreter found\n"; - char* ack_msg = "Running command... "; - char* commit_msg = "Command executed!\n"; - //char* temp_dir = "/tmp"; - - - while(true) { - - // read from the stream - n = readline(sockfd, line, MAXLINE); - - if (0 == n) - return; // connection is terminated - else if (n < 0) - //err_dump("str_echo: readline error"); - exit(-1); - - if (system(NULL)) { - writen(sockfd, ack_msg, strlen(ack_msg)); - system(line); - writen(sockfd, commit_msg, strlen(commit_msg)); - } - else if ((int)strlen(error_msg) != writen(sockfd, error_msg, strlen(error_msg))) - //err_dump("str_echo: writen error"); - exit(-1); - } -} - - - diff --git a/branches/sage/crush/active/activeslave.h b/branches/sage/crush/active/activeslave.h deleted file mode 100644 index 574824b0478f6..0000000000000 --- a/branches/sage/crush/active/activeslave.h +++ /dev/null @@ -1,23 +0,0 @@ -#include "inet.h" -#include "common.h" -#include "utility.h" -#include "client/Client.h" - - -// The port number is "osdd" on a telephone keypad. 
-#define SERV_TCP_PORT 6733 - -#define MAXLINE 512 - -void str_echo(int sockfd); -void str_ack(int sockfd); -void str_run(int sockfd); -void str_getmsg(int sockfd); -void process_request(int newsockfd); -void process_ping(int fd); -void process_start_task(int fd); -void process_get_local(int fd); -void process_shipcode(int fd); - -void start_trivial_task(const char* ceph_filename, const char* local_filename, - long offset, long length); diff --git a/branches/sage/crush/active/common.h b/branches/sage/crush/active/common.h deleted file mode 100644 index bf2c73ca4052a..0000000000000 --- a/branches/sage/crush/active/common.h +++ /dev/null @@ -1,94 +0,0 @@ -#ifndef COMMON_H -#define COMMON_H - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// a bunch of string constants -// for commands - - - -#define CMDLENGTH 10 -#define CMDCOUNT 11 - -#define MAX_STRING_SIZE 255 - -/* - * These are the various messages that can be sent between the master - * and slave. The slave sends one reply to each message from the master. - - * PING/PINGREPLY: just what it sounds like. - - * STARTTASK: starts a task. Needs to be reworked to allow code - * shipping. The slave attempts to perform the task, and replies with - * FINISHEDTASK or TASKFAILED. - * - * RETRIEVELOCALFILE: requests a file that the slave has stored - * locally. Slave replies with SENDLOCALFILE and the file, or with - * LOCALFILENOTFOUND. - * - * SHIPCODE: sends a shared library to the slave, containing a - * function that is to be executed later by the STARTTASK - * command. Slave replies with CODESAVED or SHIPFAILED. 
- * - */ - - -const off_t CHUNK = 1024 * 1024 * 4; - -#define PING 0 -#define STARTTASK 1 -#define RETRIEVELOCALFILE 2 -#define PINGREPLY 3 -#define FINISHEDTASK 4 -#define TASKFAILED 5 -#define SENDLOCALFILE 6 -#define LOCALFILENOTFOUND 7 -#define SHIPCODE 8 -#define CODESAVED 9 -#define SHIPFAILED 10 - - -#define FOOTER_LENGTH 7 - -const char* CMD_LIST[CMDCOUNT] = {"______PING", - "START_TASK", - "_GET_LOCAL", - "PING_REPLY", - "_TASK_DONE", - "TASKFAILED", - "SEND_LOCAL", - "LOCAL_GONE", - "_SHIP_CODE", - "CODE_SAVED", - "SHIPFAILED"}; - -const char FOOTER[FOOTER_LENGTH + 1] = "MSG_END"; - - -// const char* strArray[] = {"string1", "string2", "string3"}; -//const char commands[2][4] = {"foo", "bar"}; - - - -// error codes -#define ARGUMENTSINVALID 1001 -#define CEPHCLIENTSTARTUPFAILED 1002 -#define INPUTFILEREADFAILED 1003 - - -// const char* name = "Njal"; - - - -#endif //COMMON_H diff --git a/branches/sage/crush/active/msgtestclient.cc b/branches/sage/crush/active/msgtestclient.cc deleted file mode 100644 index 53650e730b387..0000000000000 --- a/branches/sage/crush/active/msgtestclient.cc +++ /dev/null @@ -1,418 +0,0 @@ -/* - * This test client tests the sending of message headers to the slave. - * - * Code is based off examples in Stevens' "Unix Network Programming". - */ -#include "msgtestclient.h" -#define REQUIRED_ARGS 2 - -int main(int argc, char* argv[]) { - - - // make sure we have all the arguments we need - if (argc < REQUIRED_ARGS) { usage(argv[0]); exit(-1); } - - // This file is rewired for running tests from a - // shell script. The first parameter specifies the - // name of the Ceph file that the test will be - // run on; the second parameter specifies which of - // four different tests will be run. - const char* input_filename = argv[1]; - int test_number = atoi(argv[2]); - assert (test_number > 0); - assert (test_number < 4); - - //const char* map_command = argv[2]; - // These two variables aren't really used yet. 
- const char* map_command = "map_foo"; - const char* output_filename = "out_foo"; - //const char* output_filename = argv[3]; - //const char* reduce_command = argv[4]; // not implemented yet - - // start up a Ceph client - Client* client = startCephClient(); - - // open the input file as read_only - int fh = client->open(input_filename, O_RDONLY); - if (fh < 0) { - cerr << "The input file " << input_filename << " could not be opened." << endl; - exit(-1); - } - - // How big is the file? - off_t filesize; - struct stat stbuf; - if (0 > client->lstat(input_filename, &stbuf)) { - cerr << "Error: could not retrieve size of input file " << input_filename << endl; - exit(-1); - } - filesize = stbuf.st_size; - if (filesize < 1) { - cerr << "Error: input file size is " << filesize << endl; - exit(-1); - } - - // retrieve all the object extents and close the file - list extents; - off_t offset = 0; - client->enumerate_layout(fh, extents, filesize, offset); - client->close(fh); - - list::iterator i; - map::iterator j; - int osd; - int taskID = 0; - - // Pull out all the extents, and make a vector of - // (ip_address, start, length). 
- - vector original_splits; - - for (i = extents.begin(); i != extents.end(); i++) { - - request_split split; - // find the primary and get its IP address - osd = client->osdmap->get_pg_primary(i->layout.pgid); - entity_inst_t inst = client->osdmap->get_inst(osd); - entity_addr_t entity_addr = inst.addr; - entity_addr.make_addr(split.ip_address); - - // iterate through each buffer_extent in the ObjectExtent - for (j = i->buffer_extents.begin(); - j != i->buffer_extents.end(); j++) { - - // get the range of the buffer_extent - split.start = (*j).first; - split.length = (*j).second; - // throw the split onto the vector - original_splits.push_back(split); - } - } - - // close the client - we're done with it - kill_client(client); - - // sanity check: display the splits - cerr << "Listing original splits:" << endl; - for (vector::iterator i = original_splits.begin(); - i != original_splits.end(); i++) { - cerr << "Split: IP " << i->ip_address << ", start " - << i->start << ", length " << i->length << endl; - } - - vector test_splits; - // Now, modify the splits as needed for the test type. - // There are three types of tests. - // Test 1: regular test. - // Test 2: put all the tasks on the "wrong" OSD. - // Test 3: do the entire job off one node. - - if (1 == test_number) { - cerr << "Test type 1: using original splits." << endl; - test_splits = original_splits; - } - else if (2 == test_number) { - cerr << "Test type 2: rotating split IP addresses. " << endl; - int split_count = original_splits.size(); - for (int i = 0; i < split_count; ++i) { - request_split s; - s.start = original_splits.at(i).start; - s.length = original_splits.at(i).length; - s.ip_address = original_splits.at((i+1)%split_count).ip_address; - test_splits.push_back(s); - } - } - else if (3 == test_number) { - cerr << "Test type 3: one giant split." 
<< endl; - request_split s; - s.start = 0; - s.length = filesize; - s.ip_address = original_splits.at(0).ip_address; - test_splits.push_back(s); - } - else { - cerr << "Error: received invalid test type " << test_number << endl; - exit(-1); - } - - cerr << "Listing test splits:" << endl; - for (vector::iterator i = test_splits.begin(); - i != test_splits.end(); i++) { - cerr << "Split: IP " << i->ip_address << ", start " - << i->start << ", length " << i->length << endl; - } - - // start the timer - utime_t start_time = g_clock.now(); - int pending_tasks = 0; - - // start up the tasks - for (vector::iterator i = test_splits.begin(); - i != test_splits.end(); i++) { - start_map_task(i->ip_address, taskID++, map_command, input_filename, - i->start, i->length, output_filename); - ++pending_tasks; - } - - cerr << "Waiting for " << pending_tasks << " tasks to return..." << endl; - - // wait for all the tasks to finish - while (pending_tasks > 0) { - int exit_status; - cerr << "Waiting for " << pending_tasks << " tasks to return..." << endl; - pid_t pid = wait(&exit_status); - if (pid < 0) { - cerr << "ERROR on wait(): result was " << pid << endl; - exit(-1); - } - --pending_tasks; - if (WIFEXITED(exit_status)) { - cerr << "Task with pid " << pid << " returned with exit status " << - WEXITSTATUS(exit_status) << endl; - } - else { cerr << "WARNING: Task with pid " << pid << " exited abnormally" << endl; } - } - - cerr << "All tasks have returned." << endl; - // report the time - double elapsed_time; - elapsed_time = (g_clock.now() - start_time); - cerr << "Elapsed time: " << elapsed_time << endl; - cerr << elapsed_time << " " << endl; - // send the time to stdout for the shell script - cout << elapsed_time << " "; - exit(0); -} - - -// sends a complete ping message -// through the file descriptor -// and waits for a reply. This -// will hang if there's no reply. - -void ping_test(int fd) { - - // send the message header and footer. - // A ping message has no body. 
- send_msg_header(fd, PING); - send_msg_footer(fd); - - // receive the reply. - int msg_type = readmsgtype(fd); - if (msg_type < 0) { - cerr << "ping_test: Failed reading the ping reply. Exiting." << endl; - exit(-1); - } - if (PINGREPLY != msg_type) { - assert((msg_type <= 0) && (msg_type < CMDCOUNT) && - "readmsgtype return value out of range"); - cerr << "ping_test: slave sent invalid reply: replied to ping with message type" << - msg_type << ": " << CMD_LIST[msg_type] << ". Exiting. " << endl; - exit(-1); - } - else { - cerr << "Received valid ping reply!" << endl; - } - - if(!check_footer(fd)) { - cerr << "ping_test: message footer not found. Exiting." << endl; - exit(-1); - } -} - - - - -// send a test message for starting a task -void start_task_test(int fd) { - - // The test: - // TaskID 42 - // command: "Burninate" - // input file: "countryside" - // offset: 8764 (TROG) - // length: 367 (DOR) - - send_start_task_msg(fd, 42, strlen("Burninate"), "Burninate", - strlen("countryside"), "countryside", - 8764, 367, - strlen("toast"), "toast"); -} - - -// sends a message to the fd telling it to start a task. -// Remember: the message format requires any string to be -// prefixed by its (unterminated) length. -void send_start_task_msg(int fd, - int taskID, - int command_size, const char* command, - int inputfilenamesize, const char* inputfilename, - off_t offset, - off_t length, - int outputfilenamesize, const char* outputfilename) { - - // write the header and the message to the file descriptor. 
- - send_msg_header(fd, STARTTASK); - - write_positive_int(fd, taskID); - write_positive_int(fd, command_size); - write_string(fd, command); - write_positive_int(fd, inputfilenamesize); - write_string(fd, inputfilename); - //write_long(fd, offset); - write_off_t (fd, offset); - //write_long(fd, length); - write_off_t (fd, length); - write_positive_int(fd, outputfilenamesize); - write_string(fd, outputfilename); - - // terminate the message - send_msg_footer(fd); -} - - - - -// creates a new connection to the slave -// at the given IP address and port. -// Overloaded to take an IP address as a -// string or as an in_addr_t. - -int create_new_connection(const char* ip_address, uint16_t port) -{ - in_addr_t ip = inet_addr(ip_address); - if ((in_addr_t)-1 == ip) { - cerr << "Error creating new connection: \"" << ip_address << - "\" is not a valid IP address." << endl; - return -1; - } - else - //cerr << "Opening connection to " << ip_address << ":" << endl; - return create_new_connection(ip, port); -} - - -int create_new_connection(in_addr_t ip_address, uint16_t port) { - - struct sockaddr_in serv_addr; - int sockfd; - - bzero((char *) &serv_addr, sizeof(serv_addr)); - serv_addr.sin_family = AF_INET; - //serv_addr.sin_addr.s_addr = inet_addr(SERV_HOST_ADDR); - serv_addr.sin_addr.s_addr = ip_address; - serv_addr.sin_port = htons(SERV_TCP_PORT); - - // open a TCP socket - if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { - cerr << "msgtestclient: can't open stream socket. Exiting." << endl; - exit (-1); - } - - // connect to the server. - if (connect(sockfd, (struct sockaddr *) &serv_addr, - sizeof(serv_addr)) < 0) { - cerr << "msgtestclient: can't connect to server." << endl; - exit (-1); - } - //cerr << "opened connection!" << endl; - return sockfd; -} - -void msg_type_sender(int sockfd) { - - for (int i = 0; i < CMDCOUNT; ++i) { - send_msg_header(sockfd, i); - } - -} - -// Fires up the map task. -// For the moment, all it does is echo the command line, not run it. 
-int start_map_task(sockaddr_in ip_address, int taskID, - const char* command, const char* input_filename, - off_t start, off_t length, - const char* output_filename) -{ - int childpid; - // fork off a child process to do the work, and return - if ((childpid = fork()) < 0) { - cerr << "start_map_task: fork error. Exiting." << endl; - exit(-1); - } - - else if (childpid != 0) { // parent - cerr << "start_map_task: forked child process " - << childpid << " to start task. " << endl; - return 0; - } - - - string ip_addr_string(inet_ntoa(ip_address.sin_addr)); - // cerr << "command: " << ip_addr_string << " taskID " - // << taskID << ": " << command - // << " " << input_filename << " " << start << " " << length - // << " " << output_filename << endl; - - // open a socket to the slave, and send the message - //cerr << "Sending message: " << endl; - int sockfd = create_new_connection(ip_addr_string.c_str(), SERV_TCP_PORT); - send_start_task_msg(sockfd, taskID, strlen(command), command, - strlen(input_filename), input_filename, - start, length, - strlen(output_filename), output_filename); - - // wait for a reply - cerr << "Sent message for taskID " << taskID << ". Waiting for reply..." << endl; - - // receive the reply. - int msg_type = readmsgtype(sockfd); - if (msg_type < 0) { - cerr << "start_map_task: Failed reading the reply. Exiting." << endl; - exit(-1); - } - if (FINISHEDTASK != msg_type) { - assert((msg_type <= 0) && (msg_type < CMDCOUNT)); - cerr << "start_map_task: slave sent invalid reply: replied with message type" << - msg_type << ": " << CMD_LIST[msg_type] << ". Exiting. " << endl; - exit(-1); - } - // read the taskID of the reply - - int reply_taskID = read_positive_int(sockfd); - - if(!check_footer(sockfd)) { - cerr << "ping_test: message footer not found. Exiting." << endl; - exit(-1); - } - - // done! - close(sockfd); - cerr << "Task " << taskID << "/" << reply_taskID << - " complete! Ending child process." 
<< endl; - exit(0); - //_exit(0); - cerr << "exit(0) returned. Strange things are afoot." << endl; -} - - - - -void usage(const char* name) { - //cout << "usage: " << name << " inputfile map_task outputfile" << endl; - //cout << "inputfile must be a valid path in the running Ceph filesystem." << endl; - //cout << "map_task should be given with an absolute path, and be present on "; - //cout << "the REGULAR filesystem every node." << endl; - //cout << "output_file will be written locally to the node." << endl; - - cout << "usage: " << name << " inputfile test_number" << endl; - cout << "inputfile must be a valid path in the running Ceph filesystem." << endl; - cout << "test_number must be 1, 2, or 3." << endl; - cout << " 1: run the test task normally (one slave per OSD)" << endl; - cout << " 2: run the test task on the \"wrong\" OSDs" << endl; - cout << " 3: run the entire task in a single process" << endl; -} - - - diff --git a/branches/sage/crush/active/msgtestclient.h b/branches/sage/crush/active/msgtestclient.h deleted file mode 100644 index 568c9057be250..0000000000000 --- a/branches/sage/crush/active/msgtestclient.h +++ /dev/null @@ -1,44 +0,0 @@ -#include "inet.h" -#include "common.h" -#include "utility.h" -#include "client/Client.h" - -// wait.h MUST NOT be #included before client/Client.h -#include -#include - - struct request_split { - tcpaddr_t ip_address; - off_t start; - off_t length; - }; - - -//#define SERV_HOST_ADDR "128.114.57.143" //issdm-8 -#define SERV_HOST_ADDR "128.114.57.166" //issdm-31 - -#define SERV_TCP_PORT 6733 -#define MAXLINE 512 - -void msg_type_sender(int sockfd); - - -int create_new_connection(const char* ip_address, uint16_t port); -int create_new_connection(in_addr_t ip_address, uint16_t port); -void usage(const char* name); -void ping_test(int fd); -void start_task_test(int fd); - -int start_map_task(sockaddr_in ip_address, int taskID, - const char* map_command, - const char* input_filename, - off_t start, off_t length, - const 
char* output_filename); - -void send_start_task_msg(int fd, - int taskID, - int command_size, const char* command, - int inputfilenamesize, const char* inputfilename, - off_t offset, - off_t length, - int outputfilenamesize, const char* outputfilename); diff --git a/branches/sage/crush/active/utility.h b/branches/sage/crush/active/utility.h deleted file mode 100644 index 789398c0f4527..0000000000000 --- a/branches/sage/crush/active/utility.h +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Miscellaneous Active OSD helper functions. - * - */ - -//#include -#include "client/Client.h" -#include "common.h" -#include "config.h" -#include "common/Timer.h" -#include "msg/SimpleMessenger.h" -#include "socket_utility.h" - -Client* startCephClient(); -void kill_client(Client* client); - - -int send_msg_header(int fd, int header_ID); -int readmsgtype(int fd); -bool check_footer(int fd); -int send_msg_header(int fd, int header_ID); -int send_msg_footer(int fd); - -/* - * Fires up a Ceph client and returns a pointer to it. - */ - -Client* startCephClient() -{ - cout << "ActiveMaster: Initializing Ceph client:" << endl; - - // parse args from CEPH_ARGS, not command line - vector args; - env_to_vec(args); - parse_config_options(args); - - if (g_conf.clock_tare) g_clock.tare(); - - // be safe - g_conf.use_abspaths = true; - - // load monmap - MonMap* monmap = new MonMap(); - int r = monmap->read(".ceph_monmap"); - if (r < 0) { - cout << "ActiveMaster: could not find .ceph_monmap" << endl; - return 0; - } - assert(r >= 0); - - // start up network - rank.start_rank(); - - // start client - Client *client; - client = new Client(rank.register_entity(MSG_ADDR_CLIENT_NEW), monmap); - client->init(); - - // mount - client->mount(); - - return client; -} - -void kill_client (Client * client) -{ - client->unmount(); - client->shutdown(); - delete client; - - // wait for messenger to finish - rank.wait(); -} - - - -// read a message type from the socket, and print it. 
- -int readmsgtype(int fd) { - int rc; - char typebuf[CMDLENGTH + 1]; - - rc = read(fd, &typebuf, CMDLENGTH); - - // read a fixed-length text command - if (rc != CMDLENGTH) { - cerr << "in readmsgtype: read error: result is " << rc << endl; - return -1; - } - - // null-terminate the string - typebuf[CMDLENGTH] = 0; - - // print the command - //cerr << "readmsgtype: text type is " << typebuf << ", " ; - - // figure out which one it is, by number - for (int i = 0; i < CMDCOUNT; ++i) { - if (!strcmp(typebuf, CMD_LIST[i])) { - //cerr << "which is identified as type " << i << endl; - return i; - } - } - - // if we get here the type was invalid - cerr << "readmsgtype: unrecognized message type " << typebuf << endl; - return -1; -} - -// Attempt to read the message footer off -// the given stream. -bool check_footer(int fd) { - - // leave space for null termination - char footer_buf[FOOTER_LENGTH+1]; - - // read the footer - int rc = read(fd, &footer_buf, FOOTER_LENGTH); - if (rc != FOOTER_LENGTH) { - cerr << "in check_footer: read error: result is " << rc << endl; - return false; - } - - // null-terminate the string - footer_buf[FOOTER_LENGTH] = 0; - - // Is the footer correct? - if (0 == strcmp(footer_buf, FOOTER)) - return true; - else - return false; -} - - -// send a fixed-length message header -// given the header's ID. -int send_msg_header(int fd, int header_ID) { - if ((header_ID < 0) || (header_ID >= CMDCOUNT)) { - cerr << "In send_msg_header: received out-of-range header ID " << header_ID << - ". Exiting process." << endl; - exit(-1); - } - - //cerr << "attempting to send message " << CMD_LIST[header_ID] << - // " with ID " << header_ID << endl; - - if (CMDLENGTH != writen(fd, CMD_LIST[header_ID], CMDLENGTH)) { - cerr << "In send_msg_header: error writing header ID " << header_ID << - "to file descriptor " << fd << ". Exiting process." << endl; - exit(-1); - } - - return 0; -} - -// send the fixed-length message footer. 
-int send_msg_footer(int fd) { - //cerr << "attempting to send message footer: " << endl; - if (FOOTER_LENGTH != writen(fd, FOOTER, FOOTER_LENGTH)) { - cerr << "in send_msg_footer: error writing footer to file descriptor " << - fd << ". Exiting process." << endl; - exit(-1); - } else { - //cerr << "Sent message footer!" << endl; - } - return 0; -} - - - -// Copy a given extent of a Ceph file to the local disk. -// Requires a running Ceph client. -void copyExtentToLocalFile (Client* client, const char* ceph_source, - long offset, long length, - const char* local_destination) { - - // get the source file's size. Sanity-check the request range. - struct stat st; - int r = client->lstat(ceph_source, &st); - assert (r == 0); - - off_t src_total_size = st.st_size; - if (src_total_size < offset + length) { - cerr << "Error in copy ExtentToLocalFile: offset + size = " << offset << " + " << length - << " = " + (offset + length) << ", source file size is only " << src_total_size << endl; - exit(-1); - } - off_t remaining = length; - - // open the source and destination files. Advance the source - // file to the desired offset. 
- int fh_ceph = client->open(ceph_source, O_RDONLY); - assert (fh_ceph > -1); - r = client->lseek(fh_ceph, offset, SEEK_SET); - assert (r == offset); - - int fh_local = ::open(local_destination, O_WRONLY|O_CREAT|O_TRUNC, 0644); - assert (fh_local > -1); - - // copy the file 4 MB at a time - const int chunk = 4*1024*1024; - bufferptr bp(chunk); - - while (remaining > 0) { - off_t got = client->read(fh_ceph, bp.c_str(), MIN(remaining,chunk), -1); - assert(got > 0); - remaining -= got; - off_t wrote = ::write(fh_local, bp.c_str(), got); - assert (got == wrote); - } - // close the files - client->close(fh_ceph); - ::close(fh_local); -} diff --git a/branches/sage/crush/client/Client.cc b/branches/sage/crush/client/Client.cc deleted file mode 100644 index 67c5af7101ed5..0000000000000 --- a/branches/sage/crush/client/Client.cc +++ /dev/null @@ -1,3909 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -// unix-ey fs stuff -#include -#include -#include -#include -#include -#include - -#include - - -#include -using namespace std; - - -// ceph stuff -#include "Client.h" - -#include "messages/MStatfs.h" -#include "messages/MStatfsReply.h" - -#include "messages/MClientMount.h" -#include "messages/MClientUnmount.h" -#include "messages/MClientSession.h" -#include "messages/MClientReconnect.h" -#include "messages/MClientRequest.h" -#include "messages/MClientRequestForward.h" -#include "messages/MClientReply.h" -#include "messages/MClientFileCaps.h" - -#include "messages/MGenericMessage.h" - -#include "messages/MMDSGetMap.h" -#include "messages/MMDSMap.h" - -#include "osdc/Filer.h" -#include "osdc/Objecter.h" -#include "osdc/ObjectCacher.h" - -#include "common/Cond.h" -#include "common/Mutex.h" -#include "common/Logger.h" - - - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_client) *_dout << dbeginl << g_clock.now() << " client" << whoami /*<< "." << pthread_self() */ << " " - -#define tout if (g_conf.client_trace) traceout - - -// static logger -Mutex client_logger_lock; -LogType client_logtype; -Logger *client_logger = 0; - - - - -class C_Client_CloseRelease : public Context { - Client *cl; - Inode *in; -public: - C_Client_CloseRelease(Client *c, Inode *i) : cl(c), in(i) {} - void finish(int) { - cl->close_release(in); - } -}; - -class C_Client_CloseSafe : public Context { - Client *cl; - Inode *in; -public: - C_Client_CloseSafe(Client *c, Inode *i) : cl(c), in(i) {} - void finish(int) { - cl->close_safe(in); - } -}; - - - - - - -// cons/des - -Client::Client(Messenger *m, MonMap *mm, int in) : timer(client_lock) -{ - // which client am i? 
- whoami = m->get_myname().num(); - my_instance = in; - monmap = mm; - - mounted = false; - mount_timeout_event = 0; - unmounting = false; - - last_tid = 0; - unsafe_sync_write = 0; - - mdsmap = 0; - - // - root = 0; - - lru.lru_set_max(g_conf.client_cache_size); - - // file handles - free_fd_set.insert(10, 1<<30); - - // set up messengers - messenger = m; - - // osd interfaces - osdmap = new OSDMap(); // initially blank.. see mount() - objecter = new Objecter(messenger, monmap, osdmap, client_lock); - objecter->set_client_incarnation(0); // client always 0, for now. - objectcacher = new ObjectCacher(objecter, client_lock); - filer = new Filer(objecter); -} - - -Client::~Client() -{ - tear_down_cache(); - - if (objectcacher) { - delete objectcacher; - objectcacher = 0; - } - - if (filer) { delete filer; filer = 0; } - if (objecter) { delete objecter; objecter = 0; } - if (osdmap) { delete osdmap; osdmap = 0; } - if (mdsmap) { delete mdsmap; mdsmap = 0; } - - if (messenger) { delete messenger; messenger = 0; } -} - - -void Client::tear_down_cache() -{ - // fd's - for (hash_map::iterator it = fd_map.begin(); - it != fd_map.end(); - it++) { - Fh *fh = it->second; - dout(1) << "tear_down_cache forcing close of fh " << it->first << " ino " << fh->inode->inode.ino << dendl; - put_inode(fh->inode); - delete fh; - } - fd_map.clear(); - - // caps! 
- // *** FIXME *** - - // empty lru - lru.lru_set_max(0); - trim_cache(); - assert(lru.lru_get_size() == 0); - - // close root ino - assert(inode_map.size() <= 1); - if (root && inode_map.size() == 1) { - delete root; - root = 0; - inode_map.clear(); - } - - assert(inode_map.empty()); -} - - - -// debug crapola - -void Client::dump_inode(Inode *in, set& did) -{ - dout(1) << "dump_inode: inode " << in->ino() << " ref " << in->ref << " dir " << in->dir << dendl; - - if (in->dir) { - dout(1) << " dir size " << in->dir->dentries.size() << dendl; - //for (hash_map, eqstr>::iterator it = in->dir->dentries.begin(); - for (hash_map::iterator it = in->dir->dentries.begin(); - it != in->dir->dentries.end(); - it++) { - dout(1) << " dn " << it->first << " ref " << it->second->ref << dendl; - dump_inode(it->second->inode, did); - } - } -} - -void Client::dump_cache() -{ - set did; - - if (root) dump_inode(root, did); - - for (hash_map::iterator it = inode_map.begin(); - it != inode_map.end(); - it++) { - if (did.count(it->second)) continue; - - dout(1) << "dump_cache: inode " << it->first - << " ref " << it->second->ref - << " dir " << it->second->dir << dendl; - if (it->second->dir) { - dout(1) << " dir size " << it->second->dir->dentries.size() << dendl; - } - } - -} - - -void Client::init() -{ - - // logger? 
- client_logger_lock.Lock(); - if (client_logger == 0) { - client_logtype.add_inc("lsum"); - client_logtype.add_inc("lnum"); - client_logtype.add_inc("lwsum"); - client_logtype.add_inc("lwnum"); - client_logtype.add_inc("lrsum"); - client_logtype.add_inc("lrnum"); - client_logtype.add_inc("trsum"); - client_logtype.add_inc("trnum"); - client_logtype.add_inc("wrlsum"); - client_logtype.add_inc("wrlnum"); - client_logtype.add_inc("lstatsum"); - client_logtype.add_inc("lstatnum"); - client_logtype.add_inc("ldirsum"); - client_logtype.add_inc("ldirnum"); - client_logtype.add_inc("readdir"); - client_logtype.add_inc("stat"); - client_logtype.add_avg("owrlat"); - client_logtype.add_avg("ordlat"); - client_logtype.add_inc("owr"); - client_logtype.add_inc("ord"); - - char s[80]; - char hostname[80]; - gethostname(hostname, 79); - sprintf(s,"clients.%s.%d", hostname, getpid()); - client_logger = new Logger(s, &client_logtype); - } - client_logger_lock.Unlock(); - -} - -void Client::shutdown() -{ - dout(1) << "shutdown" << dendl; - messenger->shutdown(); -} - - - - -// =================== -// metadata cache stuff - -void Client::trim_cache() -{ - unsigned last = 0; - while (lru.lru_get_size() != last) { - last = lru.lru_get_size(); - - if (lru.lru_get_size() <= lru.lru_get_max()) break; - - // trim! - Dentry *dn = (Dentry*)lru.lru_expire(); - if (!dn) break; // done - - dout(15) << "trim_cache unlinking dn " << dn->name - << " in dir " << hex << dn->dir->parent_inode->inode.ino - << dendl; - unlink(dn); - } - - // hose root? - if (lru.lru_get_size() == 0 && root && root->ref == 0 && inode_map.size() == 1) { - dout(15) << "trim_cache trimmed root " << root << dendl; - delete root; - root = 0; - inode_map.clear(); - } -} - -/** insert_inode - * - * insert + link a single dentry + inode into the metadata cache. 
- */ -Inode* Client::insert_inode(Dir *dir, InodeStat *st, const string& dname) -{ - Dentry *dn = NULL; - if (dir->dentries.count(dname)) - dn = dir->dentries[dname]; - - dout(12) << "insert_inode " << dname << " ino " << st->inode.ino - << " size " << st->inode.size - << " mtime " << st->inode.mtime - << " mask " << st->mask - << " in dir " << dir->parent_inode->inode.ino - << dendl; - - if (dn) { - if (dn->inode->inode.ino == st->inode.ino) { - touch_dn(dn); - dout(12) << " had dentry " << dname - << " with correct ino " << dn->inode->inode.ino - << dendl; - } else { - dout(12) << " had dentry " << dname - << " with WRONG ino " << dn->inode->inode.ino - << dendl; - unlink(dn); - dn = NULL; - } - } - - if (!dn) { - // have inode linked elsewhere? -> unlink and relink! - if (inode_map.count(st->inode.ino)) { - Inode *in = inode_map[st->inode.ino]; - assert(in); - - if (in->dn) { - dout(12) << " had ino " << in->inode.ino - << " not linked or linked at the right position, relinking" - << dendl; - dn = relink(dir, dname, in); - } else { - // link - dout(12) << " had ino " << in->inode.ino - << " unlinked, linking" << dendl; - dn = link(dir, dname, in); - } - } - } - - if (!dn) { - Inode *in = new Inode(st->inode, objectcacher); - inode_map[st->inode.ino] = in; - dn = link(dir, dname, in); - dout(12) << " new dentry+node with ino " << st->inode.ino << dendl; - } else { - // actually update info - dout(12) << " stat inode mask is " << st->mask << dendl; - if (st->mask & STAT_MASK_BASE) { - dn->inode->inode = st->inode; - dn->inode->dirfragtree = st->dirfragtree; // FIXME look at the mask! - } - - // ...but don't clobber our mtime, size! - /* isn't this handled below? 
- if ((dn->inode->mask & STAT_MASK_SIZE) == 0 && - dn->inode->file_wr_size > dn->inode->inode.size) - dn->inode->inode.size = dn->inode->file_wr_size; - if ((dn->inode->mask & STAT_MASK_MTIME) == 0 && - dn->inode->file_wr_mtime > dn->inode->inode.mtime) - dn->inode->inode.mtime = dn->inode->file_wr_mtime; - */ - } - - // OK, we found it! - assert(dn && dn->inode); - - // save the mask - dn->inode->mask = st->mask; - - // or do we have newer size/mtime from writing? - if (dn->inode->file_caps() & CAP_FILE_WR) { - if (dn->inode->file_wr_size > dn->inode->inode.size) - dn->inode->inode.size = dn->inode->file_wr_size; - if (dn->inode->file_wr_mtime > dn->inode->inode.mtime) - dn->inode->inode.mtime = dn->inode->file_wr_mtime; - } - - // symlink? - if (dn->inode->inode.is_symlink()) { - if (!dn->inode->symlink) - dn->inode->symlink = new string; - *(dn->inode->symlink) = st->symlink; - } - - return dn->inode; -} - -/** update_inode_dist - * - * update MDS location cache for a single inode - */ -void Client::update_dir_dist(Inode *in, DirStat *dst) -{ - // auth - in->dir_auth = -1; - if (dst->frag == frag_t()) { - in->dir_auth = dst->auth; - } else { - dout(20) << "got dirfrag map for " << in->inode.ino << " frag " << dst->frag << " to mds " << dst->auth << dendl; - in->fragmap[dst->frag] = dst->auth; - } - - // replicated - in->dir_replicated = dst->is_rep; // FIXME that's just one frag! - - // dist - /* - if (!st->dirfrag_dist.empty()) { // FIXME - set dist = st->dirfrag_dist.begin()->second; - if (dist.empty() && !in->dir_contacts.empty()) - dout(9) << "lost dist spec for " << in->inode.ino - << " " << dist << dendl; - if (!dist.empty() && in->dir_contacts.empty()) - dout(9) << "got dist spec for " << in->inode.ino - << " " << dist << dendl; - in->dir_contacts = dist; - } - */ -} - - -/** insert_trace - * - * insert a trace from a MDS reply into the cache. 
- */ -Inode* Client::insert_trace(MClientReply *reply) -{ - Inode *cur = root; - utime_t now = g_clock.real_now(); - - dout(10) << "insert_trace got " << reply->get_trace_in().size() << " inodes" << dendl; - - list::const_iterator pdn = reply->get_trace_dn().begin(); - list::const_iterator pdir = reply->get_trace_dir().begin(); - - for (list::const_iterator pin = reply->get_trace_in().begin(); - pin != reply->get_trace_in().end(); - ++pin) { - - if (pin == reply->get_trace_in().begin()) { - // root - dout(10) << "insert_trace root" << dendl; - if (!root) { - // create - cur = root = new Inode((*pin)->inode, objectcacher); - dout(10) << "insert_trace new root is " << root << dendl; - inode_map[root->inode.ino] = root; - root->dir_auth = 0; - } - } else { - // not root. - Dir *dir = cur->open_dir(); - assert(pdn != reply->get_trace_dn().end()); - cur = this->insert_inode(dir, *pin, *pdn); - dout(10) << "insert_trace dn " << *pdn << " ino " << (*pin)->inode.ino << " -> " << cur << dendl; - ++pdn; - - // move to top of lru! 
- if (cur->dn) - lru.lru_touch(cur->dn); - } - - // set cache ttl - if (g_conf.client_cache_stat_ttl) { - cur->valid_until = now; - cur->valid_until += g_conf.client_cache_stat_ttl; - } - - // update dir dist info - if (pdir == reply->get_trace_dir().end()) break; - update_dir_dist(cur, *pdir); - ++pdir; - } - - return cur; -} - - - - -Dentry *Client::lookup(filepath& path) -{ - dout(14) << "lookup " << path << dendl; - - Inode *cur = root; - if (!cur) return NULL; - - Dentry *dn = 0; - for (unsigned i=0; iinode << " valid_until " << dn->inode->valid_until << dendl; - } else { - dout(14) << " dentry " << path[i] << " dne" << dendl; - return NULL; - } - cur = dn->inode; - assert(cur); - } else { - return NULL; // not a dir - } - } - - if (dn) { - dout(11) << "lookup '" << path << "' found " << dn->name << " inode " << dn->inode->inode.ino << " valid_until " << dn->inode->valid_until<< dendl; - } - - return dn; -} - -// ------- - -int Client::choose_target_mds(MClientRequest *req) -{ - int mds = 0; - - // find deepest known prefix - Inode *diri = root; // the deepest known containing dir - Inode *item = 0; // the actual item... if we know it - int missing_dn = -1; // which dn we miss on (if we miss) - - unsigned depth = req->get_filepath().depth(); - unsigned i; - for (i=0; iinode.mode & INODE_MODE_DIR && diri->dir) { - Dir *dir = diri->dir; - - // do we have the next dentry? - if (dir->dentries.count( req->get_filepath()[i] ) == 0) { - missing_dn = i; // no. - break; - } - - dout(7) << " have path seg " << i << " on " << diri->dir_auth << " ino " << diri->inode.ino << " " << req->get_filepath()[i] << dendl; - - if (i == depth-1) { // last one! - item = dir->dentries[ req->get_filepath()[i] ]->inode; - break; - } - - // continue.. 
- diri = dir->dentries[ req->get_filepath()[i] ]->inode; - assert(diri); - } else { - missing_dn = i; - break; - } - } - - // pick mds - if (!diri || g_conf.client_use_random_mds) { - // no root info, pick a random MDS - mds = mdsmap->get_random_in_mds(); - if (mds < 0) mds = 0; - - if (0) { - mds = 0; - dout(0) << "hack: sending all requests to mds" << mds << dendl; - } - } else { - if (req->auth_is_best()) { - // pick the actual auth (as best we can) - if (item) { - mds = item->authority(); - } else { - mds = diri->authority(req->get_filepath()[missing_dn]); - } - } else { - // balance our traffic! - mds = diri->pick_replica(mdsmap); // for the _inode_ - dout(20) << "for " << req->get_filepath() << " diri " << diri->inode.ino << " rep " - << diri->dir_contacts - << " mds" << mds << dendl; - } - } - dout(20) << "mds is " << mds << dendl; - - return mds; -} - - - -MClientReply *Client::make_request(MClientRequest *req, - int use_mds) // this param is purely for debug hacking -{ - // time the call - utime_t start = g_clock.real_now(); - - bool nojournal = false; - int op = req->get_op(); - if (op == MDS_OP_STAT || - op == MDS_OP_LSTAT || - op == MDS_OP_READDIR || - op == MDS_OP_OPEN) - nojournal = true; - - - // -- request -- - // assign a unique tid - tid_t tid = ++last_tid; - req->set_tid(tid); - - if (!mds_requests.empty()) - req->set_oldest_client_tid(mds_requests.begin()->first); - else - req->set_oldest_client_tid(tid); // this one is the oldest. - - // make note - MetaRequest request(req, tid); - mds_requests[tid] = &request; - - // encode payload now, in case we have to resend (in case of mds failure) - req->encode_payload(); - request.request_payload = req->get_payload(); - - // note idempotency - request.idempotent = req->is_idempotent(); - - // hack target mds? - if (use_mds) - request.resend_mds = use_mds; - - // set up wait cond - Cond cond; - request.caller_cond = &cond; - - while (1) { - // choose mds - int mds; - // force use of a particular mds? 
- if (request.resend_mds >= 0) { - mds = request.resend_mds; - request.resend_mds = -1; - dout(10) << "target resend_mds specified as mds" << mds << dendl; - } else { - mds = choose_target_mds(req); - if (mds >= 0) { - dout(10) << "chose target mds" << mds << " based on hierarchy" << dendl; - } else { - mds = mdsmap->get_random_in_mds(); - if (mds < 0) mds = 0; // hrm. - dout(10) << "chose random target mds" << mds << " for lack of anything better" << dendl; - } - } - - // open a session? - if (mds_sessions.count(mds) == 0) { - Cond cond; - - if (!mdsmap->have_inst(mds)) { - dout(10) << "no address for mds" << mds << ", requesting new mdsmap" << dendl; - int mon = monmap->pick_mon(); - messenger->send_message(new MMDSGetMap(), - monmap->get_inst(mon)); - waiting_for_mdsmap.push_back(&cond); - cond.Wait(client_lock); - - if (!mdsmap->have_inst(mds)) { - dout(10) << "hmm, still have no address for mds" << mds << ", trying a random mds" << dendl; - request.resend_mds = mdsmap->get_random_in_mds(); - continue; - } - } - - if (waiting_for_session.count(mds) == 0) { - dout(10) << "opening session to mds" << mds << dendl; - messenger->send_message(new MClientSession(MClientSession::OP_REQUEST_OPEN), - mdsmap->get_inst(mds), MDS_PORT_SERVER); - } - - // wait - waiting_for_session[mds].push_back(&cond); - while (waiting_for_session.count(mds)) { - dout(10) << "waiting for session to mds" << mds << " to open" << dendl; - cond.Wait(client_lock); - } - } - - // send request. - send_request(&request, mds); - - // wait for signal - dout(20) << "awaiting kick on " << &cond << dendl; - cond.Wait(client_lock); - - // did we get a reply? - if (request.reply) - break; - } - - // got it! - MClientReply *reply = request.reply; - - // kick dispatcher (we've got it!) - assert(request.dispatch_cond); - request.dispatch_cond->Signal(); - dout(20) << "sendrecv kickback on tid " << tid << " " << request.dispatch_cond << dendl; - - // clean up. 
- mds_requests.erase(tid); - - - // -- log times -- - if (client_logger) { - utime_t lat = g_clock.real_now(); - lat -= start; - dout(20) << "lat " << lat << dendl; - client_logger->finc("lsum",(double)lat); - client_logger->inc("lnum"); - - if (nojournal) { - client_logger->finc("lrsum",(double)lat); - client_logger->inc("lrnum"); - } else { - client_logger->finc("lwsum",(double)lat); - client_logger->inc("lwnum"); - } - - if (op == MDS_OP_STAT) { - client_logger->finc("lstatsum",(double)lat); - client_logger->inc("lstatnum"); - } - else if (op == MDS_OP_READDIR) { - client_logger->finc("ldirsum",(double)lat); - client_logger->inc("ldirnum"); - } - - } - - return reply; -} - - -void Client::handle_client_session(MClientSession *m) -{ - dout(10) << "handle_client_session " << *m << dendl; - int from = m->get_source().num(); - - switch (m->op) { - case MClientSession::OP_OPEN: - assert(mds_sessions.count(from) == 0); - mds_sessions[from] = 0; - break; - - case MClientSession::OP_CLOSE: - mds_sessions.erase(from); - // FIXME: kick requests (hard) so that they are redirected. or fail. 
- break; - - default: - assert(0); - } - - // kick waiting threads - for (list::iterator p = waiting_for_session[from].begin(); - p != waiting_for_session[from].end(); - ++p) - (*p)->Signal(); - waiting_for_session.erase(from); - - delete m; -} - - -void Client::send_request(MetaRequest *request, int mds) -{ - MClientRequest *r = request->request; - if (!r) { - // make a new one - dout(10) << "send_request rebuilding request " << request->tid - << " for mds" << mds << dendl; - r = new MClientRequest; - r->copy_payload(request->request_payload); - r->decode_payload(); - r->set_retry_attempt(request->retry_attempt); - } - request->request = 0; - - dout(10) << "send_request " << *r << " to mds" << mds << dendl; - messenger->send_message(r, mdsmap->get_inst(mds), MDS_PORT_SERVER); - - request->mds.insert(mds); -} - -void Client::handle_client_request_forward(MClientRequestForward *fwd) -{ - tid_t tid = fwd->get_tid(); - - if (mds_requests.count(tid) == 0) { - dout(10) << "handle_client_request_forward no pending request on tid " << tid << dendl; - delete fwd; - return; - } - - MetaRequest *request = mds_requests[tid]; - assert(request); - - // reset retry counter - request->retry_attempt = 0; - - if (request->idempotent && - mds_sessions.count(fwd->get_dest_mds())) { - // dest mds has a session, and request was forwarded for us. - - // note new mds set. - if (request->num_fwd < fwd->get_num_fwd()) { - // there are now exactly two mds's whose failure should trigger a resend - // of this request. 
- request->mds.clear(); - request->mds.insert(fwd->get_source().num()); - request->mds.insert(fwd->get_dest_mds()); - request->num_fwd = fwd->get_num_fwd(); - dout(10) << "handle_client_request tid " << tid - << " fwd " << fwd->get_num_fwd() - << " to mds" << fwd->get_dest_mds() - << ", mds set now " << request->mds - << dendl; - } else { - dout(10) << "handle_client_request tid " << tid - << " previously forwarded to mds" << fwd->get_dest_mds() - << ", mds still " << request->mds - << dendl; - } - } else { - // request not forwarded, or dest mds has no session. - // resend. - dout(10) << "handle_client_request tid " << tid - << " fwd " << fwd->get_num_fwd() - << " to mds" << fwd->get_dest_mds() - << ", non-idempotent, resending to " << fwd->get_dest_mds() - << dendl; - - request->mds.clear(); - request->num_fwd = fwd->get_num_fwd(); - request->resend_mds = fwd->get_dest_mds(); - request->caller_cond->Signal(); - } - - delete fwd; -} - -void Client::handle_client_reply(MClientReply *reply) -{ - tid_t tid = reply->get_tid(); - - if (mds_requests.count(tid) == 0) { - dout(10) << "handle_client_reply no pending request on tid " << tid << dendl; - delete reply; - return; - } - MetaRequest *request = mds_requests[tid]; - assert(request); - - // store reply - request->reply = reply; - - // wake up waiter - request->caller_cond->Signal(); - - // wake for kick back - Cond cond; - request->dispatch_cond = &cond; - while (mds_requests.count(tid)) { - dout(20) << "handle_client_reply awaiting kickback on tid " << tid << " " << &cond << dendl; - cond.Wait(client_lock); - } -} - - -// ------------------------ -// incoming messages - -void Client::dispatch(Message *m) -{ - client_lock.Lock(); - - switch (m->get_type()) { - // osd - case MSG_OSD_OPREPLY: - objecter->handle_osd_op_reply((MOSDOpReply*)m); - break; - - case MSG_OSD_MAP: - objecter->handle_osd_map((class MOSDMap*)m); - if (!mounted) mount_cond.Signal(); - break; - - // mounting and mds sessions - case MSG_MDS_MAP: - 
handle_mds_map((MMDSMap*)m); - break; - case MSG_CLIENT_UNMOUNT: - handle_unmount(m); - break; - case MSG_CLIENT_SESSION: - handle_client_session((MClientSession*)m); - break; - - // requests - case MSG_CLIENT_REQUEST_FORWARD: - handle_client_request_forward((MClientRequestForward*)m); - break; - case MSG_CLIENT_REPLY: - handle_client_reply((MClientReply*)m); - break; - - case MSG_CLIENT_FILECAPS: - handle_file_caps((MClientFileCaps*)m); - break; - - case MSG_STATFS_REPLY: - handle_statfs_reply((MStatfsReply*)m); - break; - - default: - dout(10) << "dispatch doesn't recognize message type " << m->get_type() << dendl; - assert(0); // fail loudly - break; - } - - // unmounting? - if (unmounting) { - dout(10) << "unmounting: trim pass, size was " << lru.lru_get_size() - << "+" << inode_map.size() << dendl; - trim_cache(); - if (lru.lru_get_size() == 0 && inode_map.empty()) { - dout(10) << "unmounting: trim pass, cache now empty, waking unmount()" << dendl; - mount_cond.Signal(); - } else { - dout(10) << "unmounting: trim pass, size still " << lru.lru_get_size() - << "+" << inode_map.size() << dendl; - dump_cache(); - } - } - - client_lock.Unlock(); -} - - -void Client::handle_mds_map(MMDSMap* m) -{ - int frommds = -1; - if (m->get_source().is_mds()) - frommds = m->get_source().num(); - - if (mdsmap == 0) { - mdsmap = new MDSMap; - - assert(m->get_source().is_mon()); - whoami = m->get_dest().num(); - dout(1) << "handle_mds_map i am now " << m->get_dest() << dendl; - - mount_cond.Signal(); // mount might be waiting for this. - } - - if (m->get_epoch() < mdsmap->get_epoch()) { - dout(1) << "handle_mds_map epoch " << m->get_epoch() << " is older than our " - << mdsmap->get_epoch() << dendl; - delete m; - return; - } - - dout(1) << "handle_mds_map epoch " << m->get_epoch() << dendl; - mdsmap->decode(m->get_encoded()); - - // send reconnect? - if (frommds >= 0 && - mdsmap->get_state(frommds) == MDSMap::STATE_RECONNECT) { - send_reconnect(frommds); - } - - // kick requests? 
- if (frommds >= 0 && - mdsmap->get_state(frommds) == MDSMap::STATE_ACTIVE) { - kick_requests(frommds); - //failed_mds.erase(from); - } - - // kick any waiting threads - list ls; - ls.swap(waiting_for_mdsmap); - for (list::iterator p = ls.begin(); p != ls.end(); ++p) - (*p)->Signal(); - - delete m; -} - -void Client::send_reconnect(int mds) -{ - dout(10) << "send_reconnect to mds" << mds << dendl; - - MClientReconnect *m = new MClientReconnect; - - if (mds_sessions.count(mds)) { - // i have an open session. - for (hash_map::iterator p = inode_map.begin(); - p != inode_map.end(); - p++) { - if (p->second->caps.count(mds)) { - dout(10) << " caps on " << p->first - << " " << cap_string(p->second->caps[mds].caps) - << " wants " << cap_string(p->second->file_caps_wanted()) - << dendl; - p->second->caps[mds].seq = 0; // reset seq. - m->add_inode_caps(p->first, // ino - p->second->file_caps_wanted(), // wanted - p->second->caps[mds].caps, // issued - p->second->inode.size, p->second->inode.mtime, p->second->inode.atime); - string path; - p->second->make_path(path); - dout(10) << " path on " << p->first << " is " << path << dendl; - m->add_inode_path(p->first, path); - } - if (p->second->stale_caps.count(mds)) { - dout(10) << " clearing stale caps on " << p->first << dendl; - p->second->stale_caps.erase(mds); // hrm, is this right? 
- } - } - - // reset my cap seq number - mds_sessions[mds] = 0; - } else { - dout(10) << " i had no session with this mds"; - m->closed = true; - } - - messenger->send_message(m, mdsmap->get_inst(mds), MDS_PORT_SERVER); -} - - -void Client::kick_requests(int mds) -{ - dout(10) << "kick_requests for mds" << mds << dendl; - - for (map::iterator p = mds_requests.begin(); - p != mds_requests.end(); - ++p) - if (p->second->mds.count(mds)) { - p->second->retry_attempt++; // inc retry counter - send_request(p->second, mds); - } -} - - -/**** - * caps - */ - - -class C_Client_ImplementedCaps : public Context { - Client *client; - MClientFileCaps *msg; - Inode *in; -public: - C_Client_ImplementedCaps(Client *c, MClientFileCaps *m, Inode *i) : client(c), msg(m), in(i) {} - void finish(int r) { - client->implemented_caps(msg,in); - } -}; - -/** handle_file_caps - * handle caps update from mds. including mds to mds caps transitions. - * do not block. - */ -void Client::handle_file_caps(MClientFileCaps *m) -{ - int mds = m->get_source().num(); - Inode *in = 0; - if (inode_map.count(m->get_ino())) in = inode_map[ m->get_ino() ]; - - m->clear_payload(); // for if/when we send back to MDS - - // note push seq increment - if (mds_sessions.count(mds) == 0) - dout(0) << "got file_caps without session from mds" << mds << " msg " << *m << dendl; - //assert(mds_sessions.count(mds)); // HACK FIXME SOON - mds_sessions[mds]++; - - // reap? - if (m->get_op() == MClientFileCaps::OP_REAP) { - int other = m->get_mds(); - - if (in && in->stale_caps.count(other)) { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " reap on mds" << other << dendl; - - // fresh from new mds? 
- if (!in->caps.count(mds)) { - if (in->caps.empty()) in->get(); - in->caps[mds].seq = m->get_seq(); - in->caps[mds].caps = m->get_caps(); - } - - assert(in->stale_caps.count(other)); - in->stale_caps.erase(other); - if (in->stale_caps.empty()) put_inode(in); // note: this will never delete *in - - // fall-thru! - } else { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " premature (!!) reap on mds" << other << dendl; - // delay! - cap_reap_queue[in->ino()][other] = m; - return; - } - } - - assert(in); - - // stale? - if (m->get_op() == MClientFileCaps::OP_STALE) { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " seq " << m->get_seq() << " from mds" << mds << " now stale" << dendl; - - // move to stale list - assert(in->caps.count(mds)); - if (in->stale_caps.empty()) in->get(); - in->stale_caps[mds] = in->caps[mds]; - - assert(in->caps.count(mds)); - in->caps.erase(mds); - if (in->caps.empty()) in->put(); - - // delayed reap? - if (cap_reap_queue.count(in->ino()) && - cap_reap_queue[in->ino()].count(mds)) { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " delayed reap on mds" << m->get_mds() << dendl; - - // process delayed reap - handle_file_caps( cap_reap_queue[in->ino()][mds] ); - - cap_reap_queue[in->ino()].erase(mds); - if (cap_reap_queue[in->ino()].empty()) - cap_reap_queue.erase(in->ino()); - } - delete m; - return; - } - - // release? 
- if (m->get_op() == MClientFileCaps::OP_RELEASE) { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " release" << dendl; - assert(in->caps.count(mds)); - in->caps.erase(mds); - for (map::iterator p = in->caps.begin(); - p != in->caps.end(); - p++) - dout(20) << " left cap " << p->first << " " - << cap_string(p->second.caps) << " " - << p->second.seq << dendl; - for (map::iterator p = in->stale_caps.begin(); - p != in->stale_caps.end(); - p++) - dout(20) << " left stale cap " << p->first << " " - << cap_string(p->second.caps) << " " - << p->second.seq << dendl; - - if (in->caps.empty()) { - //dout(0) << "did put_inode" << dendl; - put_inode(in); - } else { - //dout(0) << "didn't put_inode" << dendl; - } - delete m; - return; - } - - - // don't want? - if (in->file_caps_wanted() == 0) { - dout(5) << "handle_file_caps on ino " << m->get_ino() - << " seq " << m->get_seq() - << " " << cap_string(m->get_caps()) - << ", which we don't want caps for, releasing." << dendl; - m->set_caps(0); - m->set_wanted(0); - messenger->send_message(m, m->get_source_inst(), MDS_PORT_LOCKER); - return; - } - - assert(in->caps.count(mds)); - - // update per-mds caps - const int old_caps = in->caps[mds].caps; - const int new_caps = m->get_caps(); - in->caps[mds].caps = new_caps; - in->caps[mds].seq = m->get_seq(); - dout(5) << "handle_file_caps on in " << m->get_ino() - << " mds" << mds << " seq " << m->get_seq() - << " caps now " << cap_string(new_caps) - << " was " << cap_string(old_caps) << dendl; - - // did file size decrease? - if ((old_caps & (CAP_FILE_RD|CAP_FILE_WR)) == 0 && - (new_caps & (CAP_FILE_RD|CAP_FILE_WR)) != 0 && - in->inode.size > m->get_inode().size) { - dout(10) << "*** file size decreased from " << in->inode.size << " to " << m->get_inode().size << dendl; - - // trim filecache? 
- if (g_conf.client_oc) - in->fc.truncate(in->inode.size, m->get_inode().size); - - in->inode.size = in->file_wr_size = m->get_inode().size; - } - - // update inode - in->inode = m->get_inode(); // might have updated size... FIXME this is overkill! - - // preserve our (possibly newer) file size, mtime - if (in->file_wr_size > in->inode.size) - m->get_inode().size = in->inode.size = in->file_wr_size; - if (in->file_wr_mtime > in->inode.mtime) - m->get_inode().mtime = in->inode.mtime = in->file_wr_mtime; - - - - if (g_conf.client_oc) { - // caching on, use FileCache. - Context *onimplement = 0; - if (old_caps & ~new_caps) { // this mds is revoking caps - if (in->fc.get_caps() & ~(in->file_caps())) // net revocation - onimplement = new C_Client_ImplementedCaps(this, m, in); - else { - implemented_caps(m, in); // ack now. - } - } - in->fc.set_caps(new_caps, onimplement); - } else { - // caching off. - - // wake up waiters? - if (new_caps & CAP_FILE_RD) { - for (list::iterator it = in->waitfor_read.begin(); - it != in->waitfor_read.end(); - it++) { - dout(5) << "signaling read waiter " << *it << dendl; - (*it)->Signal(); - } - in->waitfor_read.clear(); - } - if (new_caps & CAP_FILE_WR) { - for (list::iterator it = in->waitfor_write.begin(); - it != in->waitfor_write.end(); - it++) { - dout(5) << "signaling write waiter " << *it << dendl; - (*it)->Signal(); - } - in->waitfor_write.clear(); - } - if (new_caps & CAP_FILE_LAZYIO) { - for (list::iterator it = in->waitfor_lazy.begin(); - it != in->waitfor_lazy.end(); - it++) { - dout(5) << "signaling lazy waiter " << *it << dendl; - (*it)->Signal(); - } - in->waitfor_lazy.clear(); - } - - // ack? 
- if (old_caps & ~new_caps) { - if (in->sync_writes) { - // wait for sync writes to finish - dout(5) << "sync writes in progress, will ack on finish" << dendl; - in->waitfor_no_write.push_back(new C_Client_ImplementedCaps(this, m, in)); - } else { - // ok now - implemented_caps(m, in); - } - } else { - // discard - delete m; - } - } -} - -void Client::implemented_caps(MClientFileCaps *m, Inode *in) -{ - dout(5) << "implemented_caps " << cap_string(m->get_caps()) - << ", acking to " << m->get_source() << dendl; - - if (in->file_caps() == 0) { - in->file_wr_mtime = utime_t(); - in->file_wr_size = 0; - } - - messenger->send_message(m, m->get_source_inst(), MDS_PORT_LOCKER); -} - - -void Client::release_caps(Inode *in, - int retain) -{ - dout(5) << "releasing caps on ino " << in->inode.ino << dec - << " had " << cap_string(in->file_caps()) - << " retaining " << cap_string(retain) - << " want " << cap_string(in->file_caps_wanted()) - << dendl; - - for (map::iterator it = in->caps.begin(); - it != in->caps.end(); - it++) { - //if (it->second.caps & ~retain) { - if (1) { - // release (some of?) these caps - it->second.caps = retain & it->second.caps; - // note: tell mds _full_ wanted; it'll filter/behave based on what it is allowed to do - MClientFileCaps *m = new MClientFileCaps(MClientFileCaps::OP_ACK, - in->inode, - it->second.seq, - it->second.caps, - in->file_caps_wanted()); - messenger->send_message(m, mdsmap->get_inst(it->first), MDS_PORT_LOCKER); - } - } - - if (in->file_caps() == 0) { - in->file_wr_mtime = utime_t(); - in->file_wr_size = 0; - } -} - -void Client::update_caps_wanted(Inode *in) -{ - dout(5) << "updating caps wanted on ino " << in->inode.ino - << " to " << cap_string(in->file_caps_wanted()) - << dendl; - - // FIXME: pick a single mds and let the others off the hook.. 
- for (map::iterator it = in->caps.begin(); - it != in->caps.end(); - it++) { - MClientFileCaps *m = new MClientFileCaps(MClientFileCaps::OP_ACK, - in->inode, - it->second.seq, - it->second.caps, - in->file_caps_wanted()); - messenger->send_message(m, - mdsmap->get_inst(it->first), MDS_PORT_LOCKER); - } -} - - - -// ------------------- -// MOUNT - -void Client::_try_mount() -{ - dout(10) << "_try_mount" << dendl; - int mon = monmap->pick_mon(); - dout(2) << "sending client_mount to mon" << mon << " as instance " << my_instance << dendl; - messenger->send_first_message(this, // simultaneously go active (if we haven't already) - new MClientMount(messenger->get_myaddr(), my_instance), - monmap->get_inst(mon)); - - // schedule timeout? - assert(mount_timeout_event == 0); - mount_timeout_event = new C_MountTimeout(this); - timer.add_event_after(g_conf.client_mount_timeout, mount_timeout_event); -} - -void Client::_mount_timeout() -{ - dout(10) << "_mount_timeout" << dendl; - mount_timeout_event = 0; - _try_mount(); -} - -int Client::mount() -{ - client_lock.Lock(); - assert(!mounted); // caller is confused? - - objecter->init(); - - _try_mount(); - //messenger->set_dispatcher(this); // FIXME: there is still a race condition here! - - while (!mdsmap || - !osdmap || - osdmap->get_epoch() == 0) - mount_cond.Wait(client_lock); - - timer.cancel_event(mount_timeout_event); - mount_timeout_event = 0; - - mounted = true; - - dout(2) << "mounted: have osdmap " << osdmap->get_epoch() - << " and mdsmap " << mdsmap->get_epoch() - << dendl; - - // hack: get+pin root inode. - // fuse assumes it's always there. - Inode *root; - _do_lstat("/", STAT_MASK_ALL, &root); - _ll_get(root); - - // trace? 
- if (g_conf.client_trace) { - traceout.open(g_conf.client_trace); - if (traceout.is_open()) { - dout(1) << "opened trace file '" << g_conf.client_trace << "'" << dendl; - } else { - dout(1) << "FAILED to open trace file '" << g_conf.client_trace << "'" << dendl; - } - } - - client_lock.Unlock(); - - /* - dout(3) << "op: // client trace data structs" << dendl; - dout(3) << "op: struct stat st;" << dendl; - dout(3) << "op: struct utimbuf utim;" << dendl; - dout(3) << "op: int readlinkbuf_len = 1000;" << dendl; - dout(3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl; - dout(3) << "op: map dir_contents;" << dendl; - dout(3) << "op: map open_files;" << dendl; - dout(3) << "op: int fd;" << dendl; - */ - return 0; -} - - -// UNMOUNT - - -int Client::unmount() -{ - client_lock.Lock(); - - assert(mounted); // caller is confused? - - dout(2) << "unmounting" << dendl; - unmounting = true; - - // NOTE: i'm assuming all caches are already flushing (because all files are closed). - assert(fd_map.empty()); - - dout(10) << "a" << dendl; - - _ll_drop_pins(); - - dout(10) << "b" << dendl; - - // empty lru cache - lru.lru_set_max(0); - trim_cache(); - - if (g_conf.client_oc) { - // release any/all caps - for (hash_map::iterator p = inode_map.begin(); - p != inode_map.end(); - p++) { - Inode *in = p->second; - if (!in->caps.empty()) { - in->fc.release_clean(); - if (in->fc.is_dirty()) { - dout(10) << "unmount residual caps on " << in->ino() << ", flushing" << dendl; - in->fc.empty(new C_Client_CloseRelease(this, in)); - } else { - dout(10) << "unmount residual caps on " << in->ino() << ", releasing" << dendl; - release_caps(in); - } - } - } - } - - //if (0) {// hack - while (lru.lru_get_size() > 0 || - !inode_map.empty()) { - dout(2) << "cache still has " << lru.lru_get_size() - << "+" << inode_map.size() << " items" - << ", waiting (for caps to release?)" - << dendl; - dump_cache(); - mount_cond.Wait(client_lock); - } - assert(lru.lru_get_size() == 0); - 
assert(inode_map.empty()); - //} - - // unsafe writes - if (!g_conf.client_oc) { - while (unsafe_sync_write > 0) { - dout(0) << unsafe_sync_write << " unsafe_sync_writes, waiting" - << dendl; - mount_cond.Wait(client_lock); - } - } - - // stop tracing - if (g_conf.client_trace) { - dout(1) << "closing trace file '" << g_conf.client_trace << "'" << dendl; - traceout.close(); - } - - - // send session closes! - for (map::iterator p = mds_sessions.begin(); - p != mds_sessions.end(); - ++p) { - dout(2) << "sending client_session close to mds" << p->first << " seq " << p->second << dendl; - messenger->send_message(new MClientSession(MClientSession::OP_REQUEST_CLOSE, - p->second), - mdsmap->get_inst(p->first), MDS_PORT_SERVER); - } - - // send unmount! - int mon = monmap->pick_mon(); - dout(2) << "sending client_unmount to mon" << mon << dendl; - messenger->send_message(new MClientUnmount(messenger->get_myinst()), - monmap->get_inst(mon)); - - while (mounted) - mount_cond.Wait(client_lock); - - dout(2) << "unmounted." 
<< dendl; - - objecter->shutdown(); - - client_lock.Unlock(); - return 0; -} - -void Client::handle_unmount(Message* m) -{ - dout(1) << "handle_unmount got ack" << dendl; - - mounted = false; - - delete mdsmap; - mdsmap = 0; - - mount_cond.Signal(); - - delete m; -} - - -// =============================================================== -// high level (POSIXy) interface - - -// namespace ops - -int Client::link(const char *existing, const char *newname) -{ - Mutex::Locker lock(client_lock); - tout << "link" << std::endl; - tout << existing << std::endl; - tout << newname << std::endl; - return _link(existing, newname); -} - -int Client::_link(const char *existing, const char *newname) -{ - // main path arg is new link name - // sarg is target (existing file) - - MClientRequest *req = new MClientRequest(MDS_OP_LINK, messenger->get_myinst()); - req->set_path(newname); - req->set_sarg(existing); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - - insert_trace(reply); - delete reply; - dout(10) << "link result is " << res << dendl; - - trim_cache(); - dout(3) << "link(\"" << existing << "\", \"" << newname << "\") = " << res << dendl; - return res; -} - - -int Client::unlink(const char *relpath) -{ - Mutex::Locker lock(client_lock); - tout << "unlink" << std::endl; - tout << relpath << std::endl; - - string abspath; - mkabspath(relpath, abspath); - return _unlink(abspath.c_str()); -} - -int Client::_unlink(const char *path) -{ - - MClientRequest *req = new MClientRequest(MDS_OP_UNLINK, messenger->get_myinst()); - req->set_path(path); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? 
- - MClientReply *reply = make_request(req); - int res = reply->get_result(); - if (res == 0) { - // remove from local cache - filepath fp(path); - Dentry *dn = lookup(fp); - if (dn) { - assert(dn->inode); - unlink(dn); - } - } - insert_trace(reply); - delete reply; - dout(10) << "unlink result is " << res << dendl; - - trim_cache(); - dout(3) << "unlink(\"" << path << "\") = " << res << dendl; - return res; -} - -int Client::rename(const char *relfrom, const char *relto) -{ - Mutex::Locker lock(client_lock); - tout << "rename" << std::endl; - tout << relfrom << std::endl; - tout << relto << std::endl; - - string absfrom, absto; - mkabspath(relfrom, absfrom); - mkabspath(relto, absto); - return _rename(absfrom.c_str(), absto.c_str()); -} - -int Client::_rename(const char *from, const char *to) -{ - MClientRequest *req = new MClientRequest(MDS_OP_RENAME, messenger->get_myinst()); - req->set_path(from); - req->set_sarg(to); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? 
- - MClientReply *reply = make_request(req); - int res = reply->get_result(); - if (res == 0) { - // remove from local cache - filepath fp(to); - Dentry *dn = lookup(fp); - if (dn) { - assert(dn->inode); - unlink(dn); - } - } - insert_trace(reply); - delete reply; - dout(10) << "rename result is " << res << dendl; - - // renamed item from our cache - - trim_cache(); - dout(3) << "rename(\"" << from << "\", \"" << to << "\") = " << res << dendl; - return res; -} - -// dirs - -int Client::mkdir(const char *relpath, mode_t mode) -{ - Mutex::Locker lock(client_lock); - tout << "mkdir" << std::endl; - tout << relpath << std::endl; - tout << mode << std::endl; - - string abspath; - mkabspath(relpath, abspath); - return _mkdir(abspath.c_str(), mode); -} - -int Client::_mkdir(const char *path, mode_t mode) -{ - MClientRequest *req = new MClientRequest(MDS_OP_MKDIR, messenger->get_myinst()); - req->set_path(path); - req->args.mkdir.mode = mode; - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - dout(10) << "mkdir result is " << res << dendl; - - trim_cache(); - - dout(3) << "mkdir(\"" << path << "\", 0" << oct << mode << dec << ") = " << res << dendl; - return res; -} - -int Client::rmdir(const char *relpath) -{ - Mutex::Locker lock(client_lock); - tout << "rmdir" << std::endl; - tout << relpath << std::endl; - - string abspath; - mkabspath(relpath, abspath); - return _rmdir(abspath.c_str()); -} - -int Client::_rmdir(const char *path) -{ - MClientRequest *req = new MClientRequest(MDS_OP_RMDIR, messenger->get_myinst()); - req->set_path(path); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? 
- - MClientReply *reply = make_request(req); - int res = reply->get_result(); - if (res == 0) { - // remove from local cache - filepath fp(path); - Dentry *dn = lookup(fp); - if (dn) { - if (dn->inode->dir && dn->inode->dir->is_empty()) - close_dir(dn->inode->dir); // FIXME: maybe i shoudl proactively hose the whole subtree from cache? - unlink(dn); - } - } - insert_trace(reply); - delete reply; - - trim_cache(); - dout(3) << "rmdir(\"" << path << "\") = " << res << dendl; - return res; -} - -// symlinks - -int Client::symlink(const char *target, const char *rellink) -{ - Mutex::Locker lock(client_lock); - tout << "symlink" << std::endl; - tout << target << std::endl; - tout << rellink << std::endl; - - string link; - mkabspath(rellink, link); - return _symlink(target, link.c_str()); -} - -int Client::_symlink(const char *target, const char *link) -{ - MClientRequest *req = new MClientRequest(MDS_OP_SYMLINK, messenger->get_myinst()); - req->set_path(link); - req->set_sarg(target); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? 
- - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); //FIXME assuming trace of link, not of target - delete reply; - - trim_cache(); - dout(3) << "symlink(\"" << target << "\", \"" << link << "\") = " << res << dendl; - return res; -} - -int Client::readlink(const char *path, char *buf, off_t size) -{ - Mutex::Locker lock(client_lock); - tout << "readlink" << std::endl; - tout << path << std::endl; - - string abspath; - mkabspath(path, abspath); - return _readlink(abspath.c_str(), buf, size); -} - -int Client::_readlink(const char *path, char *buf, off_t size) -{ - Inode *in; - int r = _do_lstat(path, STAT_MASK_BASE, &in); - if (r == 0 && !in->inode.is_symlink()) r = -EINVAL; - if (r == 0) { - // copy into buf (at most size bytes) - r = in->symlink->length(); - if (r > size) r = size; - memcpy(buf, in->symlink->c_str(), r); - } else { - buf[0] = 0; - } - trim_cache(); - - dout(3) << "readlink(\"" << path << "\", \"" << buf << "\", " << size << ") = " << r << dendl; - return r; -} - - - -// inode stuff - -int Client::_do_lstat(const char *path, int mask, Inode **in) -{ - MClientRequest *req = 0; - filepath fpath(path); - - // check whether cache content is fresh enough - int res = 0; - - Dentry *dn = lookup(fpath); - inode_t inode; - utime_t now = g_clock.real_now(); - - if (dn && - now <= dn->inode->valid_until) - dout(10) << "_lstat has inode " << path << " with mask " << dn->inode->mask << ", want " << mask << dendl; - - if (dn && dn->inode && - now <= dn->inode->valid_until && - ((mask & ~STAT_MASK_BASE) || now <= dn->inode->valid_until) && - ((dn->inode->mask & mask) == mask)) { - inode = dn->inode->inode; - dout(10) << "lstat cache hit w/ sufficient mask, valid until " << dn->inode->valid_until << dendl; - - if (g_conf.client_cache_stat_ttl == 0) - dn->inode->valid_until = utime_t(); // only one stat allowed after each readdir - - *in = dn->inode; - } else { - // FIXME where does FUSE maintain user information - 
//struct fuse_context *fc = fuse_get_context(); - //req->set_caller_uid(fc->uid); - //req->set_caller_gid(fc->gid); - - req = new MClientRequest(MDS_OP_LSTAT, messenger->get_myinst()); - req->args.stat.mask = mask; - req->set_path(fpath); - - MClientReply *reply = make_request(req); - res = reply->get_result(); - dout(10) << "lstat res is " << res << dendl; - if (res == 0) { - //Transfer information from reply to stbuf - inode = reply->get_inode(); - - //Update metadata cache - *in = insert_trace(reply); - } - - delete reply; - - if (res != 0) - *in = 0; // not a success. - } - - return res; -} - - -int Client::fill_stat(Inode *in, struct stat *st) -{ - dout(10) << "fill_stat on " << in->inode.ino << " mode 0" << oct << in->inode.mode << dec - << " mtime " << in->inode.mtime << " ctime " << in->inode.ctime << dendl; - memset(st, 0, sizeof(struct stat)); - st->st_ino = in->inode.ino; - st->st_mode = in->inode.mode; - st->st_rdev = in->inode.rdev; - st->st_nlink = in->inode.nlink; - st->st_uid = in->inode.uid; - st->st_gid = in->inode.gid; - st->st_ctime = MAX(in->inode.ctime, in->inode.mtime); - st->st_atime = in->inode.atime; - st->st_mtime = in->inode.mtime; - st->st_size = in->inode.size; - st->st_blocks = in->inode.size ? 
((in->inode.size - 1) / 4096 + 1):0; - st->st_blksize = 4096; - return in->mask; -} - - /* - S_REQUIREBLKSIZE(st->st_litemask); - if (inode.mask & INODE_MASK_BASE) S_REQUIRECTIME(st->st_litemask); - if (inode.mask & INODE_MASK_SIZE) { - S_REQUIRESIZE(st->st_litemask); - S_REQUIREBLOCKS(st->st_litemask); - } - if (inode.mask & INODE_MASK_MTIME) S_REQUIREMTIME(st->st_litemask); - if (inode.mask & INODE_MASK_ATIME) S_REQUIREATIME(st->st_litemask); - */ - - -int Client::lstat(const char *relpath, struct stat *stbuf) -{ - Mutex::Locker lock(client_lock); - tout << "lstat" << std::endl; - tout << relpath << std::endl; - - string abspath; - mkabspath(relpath, abspath); - return _lstat(abspath.c_str(), stbuf); -} - -int Client::_lstat(const char *path, struct stat *stbuf) -{ - Inode *in = 0; - int res = _do_lstat(path, STAT_MASK_ALL, &in); - if (res == 0) { - assert(in); - fill_stat(in, stbuf); - dout(10) << "stat sez size = " << in->inode.size << " mode = 0" << oct << stbuf->st_mode << dec << " ino = " << stbuf->st_ino << dendl; - } - - trim_cache(); - dout(3) << "lstat(\"" << path << "\", " << stbuf << ") = " << res << dendl; - return res; -} - - -/* -int Client::lstatlite(const char *relpath, struct statlite *stl) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->lstatlite(\"" << path << "\", &st);" << dendl; - tout << "lstatlite" << std::endl; - tout << path << std::endl; - - // make mask - // FIXME. 
- int mask = INODE_MASK_BASE | INODE_MASK_AUTH; - if (S_ISVALIDSIZE(stl->st_litemask) || - S_ISVALIDBLOCKS(stl->st_litemask)) mask |= INODE_MASK_SIZE; - if (S_ISVALIDMTIME(stl->st_litemask)) mask |= INODE_MASK_FILE; - if (S_ISVALIDATIME(stl->st_litemask)) mask |= INODE_MASK_FILE; - - Inode *in = 0; - int res = _lstat(path, mask, &in); - - if (res == 0) { - fill_statlite(in->inode,stl); - dout(10) << "stat sez size = " << in->inode.size << " ino = " << in->inode.ino << dendl; - } - - trim_cache(); - client_lock.Unlock(); - return res; -} -*/ - - -int Client::chmod(const char *relpath, mode_t mode) -{ - Mutex::Locker lock(client_lock); - tout << "chmod" << std::endl; - tout << relpath << std::endl; - tout << mode << std::endl; - - string abspath; - mkabspath(relpath, abspath); - return _chmod(abspath.c_str(), mode); -} - -int Client::_chmod(const char *path, mode_t mode) -{ - dout(3) << "_chmod(" << path << ", 0" << oct << mode << dec << ")" << dendl; - MClientRequest *req = new MClientRequest(MDS_OP_CHMOD, messenger->get_myinst()); - req->set_path(path); - req->args.chmod.mode = mode; - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - - trim_cache(); - dout(3) << "_chmod(\"" << path << "\", 0" << oct << mode << dec << ") = " << res << dendl; - return res; -} - -int Client::chown(const char *relpath, uid_t uid, gid_t gid) -{ - Mutex::Locker lock(client_lock); - tout << "chown" << std::endl; - tout << relpath << std::endl; - tout << uid << std::endl; - tout << gid << std::endl; - - string abspath; - mkabspath(relpath, abspath); - return _chown(abspath.c_str(), uid, gid); -} - -int Client::_chown(const char *path, uid_t uid, gid_t gid) -{ - dout(3) << "_chown(" << path << ", " << uid << ", " << gid << ")" << dendl; - MClientRequest *req = new MClientRequest(MDS_OP_CHOWN, 
messenger->get_myinst()); - req->set_path(path); - req->args.chown.uid = uid; - req->args.chown.gid = gid; - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - dout(10) << "chown result is " << res << dendl; - - trim_cache(); - dout(3) << "chown(\"" << path << "\", " << uid << ", " << gid << ") = " << res << dendl; - return res; -} - -int Client::utime(const char *relpath, struct utimbuf *buf) -{ - Mutex::Locker lock(client_lock); - tout << "utime" << std::endl; - tout << relpath << std::endl; - tout << buf->modtime << std::endl; - tout << buf->actime << std::endl; - - string abspath; - mkabspath(relpath, abspath); - return _utimes(abspath.c_str(), utime_t(buf->modtime,0), utime_t(buf->actime,0)); -} - -int Client::_utimes(const char *path, utime_t mtime, utime_t atime) -{ - dout(3) << "_utimes(" << path << ", " << mtime << ", " << atime << ")" << dendl; - MClientRequest *req = new MClientRequest(MDS_OP_UTIME, messenger->get_myinst()); - req->set_path(path); - req->args.utime.mtime = mtime.tv_ref(); - req->args.utime.atime = atime.tv_ref(); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - - dout(3) << "utimes(\"" << path << "\", " << mtime << ", " << atime << ") = " << res << dendl; - trim_cache(); - return res; -} - - - -int Client::mknod(const char *relpath, mode_t mode, dev_t rdev) -{ - Mutex::Locker lock(client_lock); - tout << "mknod" << std::endl; - tout << relpath << std::endl; - tout << mode << std::endl; - tout << rdev << std::endl; - - string abspath; - mkabspath(relpath, abspath); - return _mknod(abspath.c_str(), mode, rdev); -} - -int 
Client::_mknod(const char *path, mode_t mode, dev_t rdev) -{ - dout(3) << "_mknod(" << path << ", 0" << oct << mode << dec << ", " << rdev << ")" << dendl; - - MClientRequest *req = new MClientRequest(MDS_OP_MKNOD, messenger->get_myinst()); - req->set_path(path); - req->args.mknod.mode = mode; - req->args.mknod.rdev = rdev; - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); - - delete reply; - - trim_cache(); - - dout(3) << "mknod(\"" << path << "\", 0" << oct << mode << dec << ") = " << res << dendl; - return res; -} - - - - -int Client::getdir(const char *relpath, list& contents) -{ - dout(3) << "getdir(" << relpath << ")" << dendl; - { - Mutex::Locker lock(client_lock); - tout << "getdir" << std::endl; - tout << relpath << std::endl; - } - - DIR *d; - int r = opendir(relpath, &d); - if (r < 0) return r; - - struct dirent de; - int n = 0; - while (readdir_r(d, &de) == 0) { - contents.push_back(de.d_name); - n++; - } - closedir(d); - - return n; -} - -int Client::opendir(const char *name, DIR **dirpp) -{ - Mutex::Locker lock(client_lock); - tout << "opendir" << std::endl; - tout << name << std::endl; - - int r = _opendir(name, (DirResult**)dirpp); - tout << (unsigned long)*dirpp << std::endl; - return r; -} - -int Client::_opendir(const char *name, DirResult **dirpp) -{ - *dirpp = new DirResult(name); - - // do we have the inode in our cache? - // if so, should be we ask for a different dirfrag? 
- filepath path(name); - Dentry *dn = lookup(path); - if (dn && dn->inode) { - (*dirpp)->inode = dn->inode; - (*dirpp)->inode->get(); - dout(10) << "had inode " << dn->inode << " " << dn->inode->inode.ino << " ref now " << dn->inode->ref << dendl; - (*dirpp)->set_frag(dn->inode->dirfragtree[0]); - dout(10) << "_opendir " << name << ", our cache says the first dirfrag is " << (*dirpp)->frag() << dendl; - } - - // get the first frag - int r = _readdir_get_frag(*dirpp); - if (r < 0) { - _closedir(*dirpp); - *dirpp = 0; - } else { - r = 0; - } - dout(3) << "_opendir(" << name << ") = " << r << " (" << *dirpp << ")" << dendl; - - return r; -} - -void Client::_readdir_add_dirent(DirResult *dirp, const string& name, Inode *in) -{ - struct stat st; - int stmask = fill_stat(in, &st); - frag_t fg = dirp->frag(); - dirp->buffer[fg].push_back(DirEntry(name, st, stmask)); - dout(10) << "_readdir_add_dirent " << dirp << " added '" << name << "' -> " << in->inode.ino - << ", size now " << dirp->buffer[fg].size() << dendl; -} - -//struct dirent { -// ino_t d_ino; /* inode number */ -// off_t d_off; /* offset to the next dirent */ -// unsigned short d_reclen; /* length of this record */ -// unsigned char d_type; /* type of file */ -// char d_name[256]; /* filename */ -//}; -void Client::_readdir_fill_dirent(struct dirent *de, DirEntry *entry, off_t off) -{ - de->d_ino = entry->st.st_ino; - de->d_off = off + 1; - de->d_reclen = 1; - de->d_type = MODE_TO_DT(entry->st.st_mode); - strncpy(de->d_name, entry->d_name.c_str(), 256); - dout(10) << "_readdir_fill_dirent '" << de->d_name << "' -> " << de->d_ino - << " type " << (int)de->d_type << " at off " << off << dendl; -} - -void Client::_readdir_next_frag(DirResult *dirp) -{ - frag_t fg = dirp->frag(); - - // hose old data - assert(dirp->buffer.count(fg)); - dirp->buffer.erase(fg); - - // advance - dirp->next_frag(); - if (dirp->at_end()) { - dout(10) << "_readdir_next_frag advance from " << fg << " to END" << dendl; - } else { - 
dout(10) << "_readdir_next_frag advance from " << fg << " to " << dirp->frag() << dendl; - _readdir_rechoose_frag(dirp); - } -} - -void Client::_readdir_rechoose_frag(DirResult *dirp) -{ - assert(dirp->inode); - frag_t cur = dirp->frag(); - frag_t f = dirp->inode->dirfragtree[cur.value()]; - if (f != cur) { - dout(10) << "_readdir_rechoose_frag frag " << cur << " maps to " << f << dendl; - dirp->set_frag(f); - } -} - -int Client::_readdir_get_frag(DirResult *dirp) -{ - // get the current frag. - frag_t fg = dirp->frag(); - assert(dirp->buffer.count(fg) == 0); - - dout(10) << "_readdir_get_frag " << dirp << " on " << dirp->path << " fg " << fg << dendl; - - MClientRequest *req = new MClientRequest(MDS_OP_READDIR, messenger->get_myinst()); - req->set_path(dirp->path); - req->args.readdir.frag = fg; - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); - inodeno_t ino = reply->get_ino(); - - // did i get directory inode? - Inode *diri = 0; - if ((res == -EAGAIN || res == 0) && - inode_map.count(ino)) { - diri = inode_map[ino]; - dout(10) << "_readdir_get_frag got diri " << diri << " " << diri->inode.ino << dendl; - assert(diri); - assert(diri->inode.mode & INODE_MODE_DIR); - } - - if (!dirp->inode && diri) { - dout(10) << "_readdir_get_frag attaching inode" << dendl; - dirp->inode = inode_map[ino]; - diri->get(); - } - - if (res == -EAGAIN) { - dout(10) << "_readdir_get_frag got EAGAIN, retrying" << dendl; - _readdir_rechoose_frag(dirp); - return _readdir_get_frag(dirp); - } - - if (res == 0) { - // stuff dir contents to cache, DirResult - assert(diri); - - // create empty result vector - dirp->buffer[fg].clear(); - - if (fg.is_leftmost()) { - // add . and ..? 
- string dot("."); - _readdir_add_dirent(dirp, dot, diri); - string dotdot(".."); - if (diri->dn) - _readdir_add_dirent(dirp, dotdot, diri->dn->dir->parent_inode); - //else - //_readdir_add_dirent(dirp, dotdot, DT_DIR); - } - - // the rest? - if (!reply->get_dir_dn().empty()) { - // only open dir if we're actually adding stuff to it! - Dir *dir = diri->open_dir(); - assert(dir); - utime_t now = g_clock.real_now(); - - list::const_iterator pin = reply->get_dir_in().begin(); - for (list::const_iterator pdn = reply->get_dir_dn().begin(); - pdn != reply->get_dir_dn().end(); - ++pdn, ++pin) { - // count entries - res++; - - // put in cache - Inode *in = this->insert_inode(dir, *pin, *pdn); - - if (g_conf.client_cache_stat_ttl) { - in->valid_until = now; - in->valid_until += g_conf.client_cache_stat_ttl; - } - else if (g_conf.client_cache_readdir_ttl) { - in->valid_until = now; - in->valid_until += g_conf.client_cache_readdir_ttl; - } else - in->valid_until = utime_t(); - - // contents to caller too! - dout(15) << "_readdir_get_frag got " << *pdn << " to " << in->inode.ino << dendl; - _readdir_add_dirent(dirp, *pdn, in); - } - - if (dir->is_empty()) - close_dir(dir); - } - - // FIXME: remove items in cache that weren't in my readdir? 
- // *** - } else { - dout(10) << "_readdir_get_frag got error " << res << ", setting end flag" << dendl; - dirp->set_end(); - } - - delete reply; - - return res; -} - -int Client::readdir_r(DIR *d, struct dirent *de) -{ - return readdirplus_r(d, de, 0, 0); -} - -int Client::readdirplus_r(DIR *d, struct dirent *de, struct stat *st, int *stmask) -{ - DirResult *dirp = (DirResult*)d; - - while (1) { - if (dirp->at_end()) return -1; - - if (dirp->buffer.count(dirp->frag()) == 0) { - Mutex::Locker lock(client_lock); - _readdir_get_frag(dirp); - if (dirp->at_end()) return -1; - } - - frag_t fg = dirp->frag(); - uint32_t pos = dirp->fragpos(); - assert(dirp->buffer.count(fg)); - vector &ent = dirp->buffer[fg]; - - if (ent.empty()) { - dout(10) << "empty frag " << fg << ", moving on to next" << dendl; - _readdir_next_frag(dirp); - continue; - } - - assert(pos < ent.size()); - _readdir_fill_dirent(de, &ent[pos], dirp->offset); - if (st) *st = ent[pos].st; - if (stmask) *stmask = ent[pos].stmask; - pos++; - dirp->offset++; - - if (pos == ent.size()) - _readdir_next_frag(dirp); - - break; - } - - return 0; -} - - -int Client::closedir(DIR *dir) -{ - Mutex::Locker lock(client_lock); - tout << "closedir" << std::endl; - tout << (unsigned long)dir << std::endl; - - dout(3) << "closedir(" << dir << ") = 0" << dendl; - _closedir((DirResult*)dir); - return 0; -} - -void Client::_closedir(DirResult *dirp) -{ - dout(10) << "_closedir(" << dirp << ")" << dendl; - if (dirp->inode) { - dout(10) << "_closedir detaching inode " << dirp->inode << dendl; - put_inode(dirp->inode); - dirp->inode = 0; - } - delete dirp; -} - -void Client::rewinddir(DIR *dirp) -{ - dout(3) << "rewinddir(" << dirp << ")" << dendl; - DirResult *d = (DirResult*)dirp; - d->offset = 0; - d->buffer.clear(); -} - -off_t Client::telldir(DIR *dirp) -{ - DirResult *d = (DirResult*)dirp; - dout(3) << "telldir(" << dirp << ") = " << d->offset << dendl; - return d->offset; -} - -void Client::seekdir(DIR *dirp, off_t 
offset) -{ - dout(3) << "seekdir(" << dirp << ", " << offset << ")" << dendl; - DirResult *d = (DirResult*)dirp; - d->offset = offset; -} - - - - - - - -/****** file i/o **********/ - -int Client::open(const char *relpath, int flags, mode_t mode) -{ - Mutex::Locker lock(client_lock); - tout << "open" << std::endl; - tout << relpath << std::endl; - tout << flags << std::endl; - - string abspath; - mkabspath(relpath, abspath); - - Fh *fh; - int r = _open(abspath.c_str(), flags, mode, &fh); - if (r >= 0) { - // allocate a integer file descriptor - assert(fh); - r = get_fd(); - assert(fd_map.count(r) == 0); - fd_map[r] = fh; - } - - tout << r << std::endl; - dout(3) << "open(" << relpath << ", " << flags << ") = " << r << dendl; - return r; -} - -int Client::_open(const char *path, int flags, mode_t mode, Fh **fhp) -{ - // go - MClientRequest *req = new MClientRequest(MDS_OP_OPEN, messenger->get_myinst()); - req->set_path(path); - req->args.open.flags = flags; - req->args.open.mode = mode; - - int cmode = req->get_open_file_mode(); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - // do i have the inode? - Dentry *dn = lookup(req->get_filepath()); - Inode *in = 0; - if (dn) { - in = dn->inode; - in->add_open(cmode); // make note of pending open, since it effects _wanted_ caps. - } - - MClientReply *reply = make_request(req); - assert(reply); - - insert_trace(reply); - int result = reply->get_result(); - - // success? - if (result >= 0) { - // yay - Fh *f = new Fh; - if (fhp) *fhp = f; - f->mode = cmode; - - // inode - f->inode = inode_map[reply->get_ino()]; - assert(f->inode); - f->inode->get(); - - if (!in) { - in = f->inode; - in->add_open(f->mode); - } - - // caps included? - int mds = reply->get_source().num(); - - if (in->caps.empty()) {// first caps? 
- dout(7) << " first caps on " << in->inode.ino << dendl; - in->get(); - } - - int new_caps = reply->get_file_caps(); - - assert(reply->get_file_caps_seq() >= in->caps[mds].seq); - if (reply->get_file_caps_seq() > in->caps[mds].seq) { - int old_caps = in->caps[mds].caps; - - dout(7) << "open got caps " << cap_string(new_caps) - << " (had " << cap_string(old_caps) << ")" - << " for " << in->ino() - << " seq " << reply->get_file_caps_seq() - << " from mds" << mds - << dendl; - - in->caps[mds].caps = new_caps; - in->caps[mds].seq = reply->get_file_caps_seq(); - - // we shouldn't ever lose caps at this point. - // actually, we might...? - assert((old_caps & ~in->caps[mds].caps) == 0); - - if (g_conf.client_oc) - in->fc.set_caps(new_caps); - - } else { - dout(7) << "open got SAME caps " << cap_string(new_caps) - << " for " << in->ino() - << " seq " << reply->get_file_caps_seq() - << " from mds" << mds - << dendl; - } - - dout(5) << "open success, fh is " << f << " combined caps " << cap_string(in->file_caps()) << dendl; - } - - delete reply; - - trim_cache(); - - return result; -} - - - - - -void Client::close_release(Inode *in) -{ - dout(10) << "close_release on " << in->ino() << dendl; - dout(10) << " wr " << in->num_open_wr << " rd " << in->num_open_rd - << " dirty " << in->fc.is_dirty() << " cached " << in->fc.is_cached() << dendl; - - if (!in->num_open_rd) - in->fc.release_clean(); - - int retain = 0; - if (in->num_open_wr || in->fc.is_dirty()) retain |= CAP_FILE_WR | CAP_FILE_WRBUFFER | CAP_FILE_WREXTEND; - if (in->num_open_rd || in->fc.is_cached()) retain |= CAP_FILE_RD | CAP_FILE_RDCACHE; - - release_caps(in, retain); // release caps now. 
-} - -void Client::close_safe(Inode *in) -{ - dout(10) << "close_safe on " << in->ino() << dendl; - put_inode(in); - if (unmounting) - mount_cond.Signal(); -} - - -int Client::close(int fd) -{ - Mutex::Locker lock(client_lock); - tout << "close" << std::endl; - tout << fd << std::endl; - - dout(3) << "close(" << fd << ")" << dendl; - assert(fd_map.count(fd)); - Fh *fh = fd_map[fd]; - _release(fh); - fd_map.erase(fd); - return 0; -} - -int Client::_release(Fh *f) -{ - //dout(3) << "op: client->close(open_files[ " << fh << " ]);" << dendl; - //dout(3) << "op: open_files.erase( " << fh << " );" << dendl; - dout(5) << "_release " << f << dendl; - Inode *in = f->inode; - - // update inode rd/wr counts - int before = in->file_caps_wanted(); - in->sub_open(f->mode); - int after = in->file_caps_wanted(); - - // does this change what caps we want? - if (before != after && after) - update_caps_wanted(in); - - // release caps right away? - dout(10) << "num_open_rd " << in->num_open_rd << " num_open_wr " << in->num_open_wr << dendl; - - if (g_conf.client_oc) { - // caching on. - if (in->num_open_rd == 0 && in->num_open_wr == 0) { - dout(20) << "calling empty" << dendl; - in->fc.empty(new C_Client_CloseRelease(this, in)); - } - else if (in->num_open_rd == 0) { - dout(20) << "calling release" << dendl; - in->fc.release_clean(); - close_release(in); - } - else if (in->num_open_wr == 0) { - dout(20) << "calling flush dirty" << dendl; - in->fc.flush_dirty(new C_Client_CloseRelease(this,in)); - } - - // pin until safe? - if (in->num_open_wr == 0 && !in->fc.all_safe()) { - dout(10) << "pinning ino " << in->ino() << " until safe" << dendl; - in->get(); - in->fc.add_safe_waiter(new C_Client_CloseSafe(this, in)); - } - } else { - // caching off. - if (in->num_open_rd == 0 && in->num_open_wr == 0) { - dout(10) << " releasing caps on " << in->ino() << dendl; - release_caps(in); // release caps now. 
- } - } - - put_inode( in ); - return 0; -} - - - -// ------------ -// read, write - - -off_t Client::lseek(int fd, off_t offset, int whence) -{ - Mutex::Locker lock(client_lock); - tout << "lseek" << std::endl; - tout << fd << std::endl; - tout << offset << std::endl; - tout << whence << std::endl; - - assert(fd_map.count(fd)); - Fh *f = fd_map[fd]; - Inode *in = f->inode; - - switch (whence) { - case SEEK_SET: - f->pos = offset; - break; - - case SEEK_CUR: - f->pos += offset; - break; - - case SEEK_END: - f->pos = in->inode.size + offset; - break; - - default: - assert(0); - } - - off_t pos = f->pos; - - dout(3) << "lseek(" << fd << ", " << offset << ", " << whence << ") = " << pos << dendl; - return pos; -} - - - -void Client::lock_fh_pos(Fh *f) -{ - dout(10) << "lock_fh_pos " << f << dendl; - - if (f->pos_locked || !f->pos_waiters.empty()) { - Cond cond; - f->pos_waiters.push_back(&cond); - dout(10) << "lock_fh_pos BLOCKING on " << f << dendl; - while (f->pos_locked || f->pos_waiters.front() != &cond) - cond.Wait(client_lock); - dout(10) << "lock_fh_pos UNBLOCKING on " << f << dendl; - assert(f->pos_waiters.front() == &cond); - f->pos_waiters.pop_front(); - } - - f->pos_locked = true; -} - -void Client::unlock_fh_pos(Fh *f) -{ - dout(10) << "unlock_fh_pos " << f << dendl; - f->pos_locked = false; -} - - - -//char *hackbuf = 0; - - -// blocking osd interface - -int Client::read(int fd, char *buf, off_t size, off_t offset) -{ - Mutex::Locker lock(client_lock); - tout << "read" << std::endl; - tout << fd << std::endl; - tout << size << std::endl; - tout << offset << std::endl; - - assert(fd_map.count(fd)); - Fh *f = fd_map[fd]; - bufferlist bl; - int r = _read(f, offset, size, &bl); - dout(3) << "read(" << fd << ", " << buf << ", " << size << ", " << offset << ") = " << r << dendl; - if (r >= 0) { - bl.copy(0, bl.length(), buf); - r = bl.length(); - } - return r; -} - -int Client::_read(Fh *f, off_t offset, off_t size, bufferlist *bl) -{ - Inode *in = f->inode; - 
- bool movepos = false; - if (offset < 0) { - lock_fh_pos(f); - offset = f->pos; - movepos = true; - } - - bool lazy = f->mode == FILE_MODE_LAZY; - - // determine whether read range overlaps with file - // ...ONLY if we're doing async io - if (!lazy && (in->file_caps() & (CAP_FILE_WRBUFFER|CAP_FILE_RDCACHE))) { - // we're doing buffered i/o. make sure we're inside the file. - // we can trust size info bc we get accurate info when buffering/caching caps are issued. - dout(10) << "file size: " << in->inode.size << dendl; - if (offset > 0 && offset >= in->inode.size) { - if (movepos) unlock_fh_pos(f); - return 0; - } - if (offset + size > (off_t)in->inode.size) - size = (off_t)in->inode.size - offset; - - if (size == 0) { - dout(10) << "read is size=0, returning 0" << dendl; - if (movepos) unlock_fh_pos(f); - return 0; - } - } else { - // unbuffered, synchronous file i/o. - // or lazy. - // defer to OSDs for file bounds. - } - - int r = 0; - int rvalue = 0; - - if (g_conf.client_oc) { - // object cache ON - rvalue = r = in->fc.read(offset, size, *bl, client_lock); // may block. - - /* - if (in->inode.ino == 0x10000000075 && hackbuf) { - int s = MIN(size, bl->length()); - char *v = bl->c_str(); - for (int a=0; afile_caps() & CAP_FILE_RD) == 0) { - dout(7) << " don't have read cap, waiting" << dendl; - Cond cond; - in->waitfor_read.push_back(&cond); - cond.Wait(client_lock); - } - // lazy cap? - while (lazy && (in->file_caps() & CAP_FILE_LAZYIO) == 0) { - dout(7) << " don't have lazy cap, waiting" << dendl; - Cond cond; - in->waitfor_lazy.push_back(&cond); - cond.Wait(client_lock); - } - - // do sync read - Cond cond; - bool done = false; - C_Cond *onfinish = new C_Cond(&cond, &done, &rvalue); - - Objecter::OSDRead *rd = filer->prepare_read(in->inode, offset, size, bl); - if (in->hack_balance_reads || - g_conf.client_hack_balance_reads) - rd->balance_reads = true; - r = objecter->readx(rd, onfinish); - assert(r >= 0); - - // wait! 
- while (!done) - cond.Wait(client_lock); - } - - if (movepos) { - // adjust fd pos - f->pos = offset+bl->length(); - unlock_fh_pos(f); - } - - // done! - return rvalue; -} - - - -/* - * hack -- - * until we properly implement synchronous writes wrt buffer cache, - * make sure we delay shutdown until they're all safe on disk! - */ -class C_Client_HackUnsafe : public Context { - Client *cl; -public: - C_Client_HackUnsafe(Client *c) : cl(c) {} - void finish(int) { - cl->hack_sync_write_safe(); - } -}; - -void Client::hack_sync_write_safe() -{ - client_lock.Lock(); - assert(unsafe_sync_write > 0); - unsafe_sync_write--; - dout(15) << "hack_sync_write_safe unsafe_sync_write = " << unsafe_sync_write << dendl; - if (unsafe_sync_write == 0 && unmounting) { - dout(10) << "hack_sync_write_safe -- no more unsafe writes, unmount can proceed" << dendl; - mount_cond.Signal(); - } - client_lock.Unlock(); -} - -int Client::write(int fd, const char *buf, off_t size, off_t offset) -{ - Mutex::Locker lock(client_lock); - tout << "write" << std::endl; - tout << fd << std::endl; - tout << size << std::endl; - tout << offset << std::endl; - - assert(fd_map.count(fd)); - Fh *fh = fd_map[fd]; - int r = _write(fh, offset, size, buf); - dout(3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl; - return r; -} - - -int Client::_write(Fh *f, off_t offset, off_t size, const char *buf) -{ - //dout(7) << "write fh " << fh << " size " << size << " offset " << offset << dendl; - Inode *in = f->inode; - - // use/adjust fd pos? - if (offset < 0) { - lock_fh_pos(f); - offset = f->pos; - f->pos = offset+size; - unlock_fh_pos(f); - } - - bool lazy = f->mode == FILE_MODE_LAZY; - - dout(10) << "cur file size is " << in->inode.size << " wr size " << in->file_wr_size << dendl; - - // time it. 
- utime_t start = g_clock.real_now(); - - // copy into fresh buffer (since our write may be resub, async) - bufferptr bp; - if (size > 0) bp = buffer::copy(buf, size); - bufferlist blist; - blist.push_back( bp ); - - if (g_conf.client_oc) { // buffer cache ON? - assert(objectcacher); - - // write (this may block!) - in->fc.write(offset, size, blist, client_lock); - - } else { - // legacy, inconsistent synchronous write. - dout(7) << "synchronous write" << dendl; - - // do we have write file cap? - while (!lazy && (in->file_caps() & CAP_FILE_WR) == 0) { - dout(7) << " don't have write cap, waiting" << dendl; - Cond cond; - in->waitfor_write.push_back(&cond); - cond.Wait(client_lock); - } - while (lazy && (in->file_caps() & CAP_FILE_LAZYIO) == 0) { - dout(7) << " don't have lazy cap, waiting" << dendl; - Cond cond; - in->waitfor_lazy.push_back(&cond); - cond.Wait(client_lock); - } - - // prepare write - Cond cond; - bool done = false; - C_Cond *onfinish = new C_Cond(&cond, &done); - C_Client_HackUnsafe *onsafe = new C_Client_HackUnsafe(this); - unsafe_sync_write++; - in->sync_writes++; - - dout(20) << " sync write start " << onfinish << dendl; - - filer->write(in->inode, offset, size, blist, 0, - onfinish, onsafe - //, 1+((int)g_clock.now()) / 10 //f->pos // hack hack test osd revision snapshots - ); - - while (!done) { - cond.Wait(client_lock); - dout(20) << " sync write bump " << onfinish << dendl; - } - - in->sync_writes--; - if (in->sync_writes == 0 && - !in->waitfor_no_write.empty()) { - for (list::iterator i = in->waitfor_no_write.begin(); - i != in->waitfor_no_write.end(); - i++) - (*i)->finish(0); - in->waitfor_no_write.clear(); - } - - dout(20) << " sync write done " << onfinish << dendl; - } - - // time - utime_t lat = g_clock.real_now(); - lat -= start; - if (client_logger) { - client_logger->finc("wrlsum",(double)lat); - client_logger->inc("wrlnum"); - } - - // assume success for now. FIXME. - off_t totalwritten = size; - - // extend file? 
- if (totalwritten + offset > in->inode.size) { - in->inode.size = in->file_wr_size = totalwritten + offset; - dout(7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl; - } else { - dout(7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->inode.size << dendl; - } - - // mtime - in->file_wr_mtime = in->inode.mtime = g_clock.real_now(); - - // ok! - return totalwritten; -} - -int Client::_flush(Fh *f) -{ - // no-op, for now. hrm. - return 0; -} - - -int Client::truncate(const char *relpath, off_t length) -{ - Mutex::Locker lock(client_lock); - tout << "truncate" << std::endl; - tout << relpath << std::endl; - tout << length << std::endl; - - string path; - mkabspath(relpath, path); - return _truncate(path.c_str(), length); -} - -int Client::_truncate(const char *file, off_t length) -{ - MClientRequest *req = new MClientRequest(MDS_OP_TRUNCATE, messenger->get_myinst()); - req->set_path(file); - req->args.truncate.length = length; - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - - dout(3) << "truncate(\"" << file << "\", " << length << ") = " << res << dendl; - return res; -} - -int Client::ftruncate(int fd, off_t length) -{ - Mutex::Locker lock(client_lock); - tout << "ftruncate" << std::endl; - tout << fd << std::endl; - tout << length << std::endl; - - assert(fd_map.count(fd)); - Fh *f = fd_map[fd]; - return _ftruncate(f, length); -} - -int Client::_ftruncate(Fh *fh, off_t length) -{ - MClientRequest *req = new MClientRequest(MDS_OP_TRUNCATE, messenger->get_myinst()); - req->args.truncate.ino = fh->inode->inode.ino; - req->args.truncate.length = length; - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req); - int res = 
reply->get_result(); - insert_trace(reply); - delete reply; - - dout(3) << "ftruncate(\"" << fh << "\", " << length << ") = " << res << dendl; - return res; -} - - -int Client::fsync(int fd, bool syncdataonly) -{ - Mutex::Locker lock(client_lock); - tout << "fsync" << std::endl; - tout << fd << std::endl; - tout << syncdataonly << std::endl; - - assert(fd_map.count(fd)); - Fh *f = fd_map[fd]; - int r = _fsync(f, syncdataonly); - dout(3) << "fsync(" << fd << ", " << syncdataonly << ") = " << r << dendl; - return r; -} - -int Client::_fsync(Fh *f, bool syncdataonly) -{ - int r = 0; - - Inode *in = f->inode; - - // metadata? - if (!syncdataonly) { - dout(0) << "fsync - not syncing metadata yet.. implement me" << dendl; - } - - // data? - Cond cond; - bool done = false; - if (!objectcacher->commit_set(in->ino(), - new C_Cond(&cond, &done))) { - // wait for callback - while (!done) cond.Wait(client_lock); - } - return r; -} - - -// not written yet, but i want to link! - -int Client::chdir(const char *path) -{ - Mutex::Locker lock(client_lock); - tout << "chdir" << std::endl; - tout << path << std::endl; - - // fake it for now! 
- string abs; - mkabspath(path, abs); - dout(3) << "chdir " << path << " -> cwd now " << abs << dendl; - cwd = abs; - return 0; -} - -int Client::statfs(const char *path, struct statvfs *stbuf) -{ - Mutex::Locker lock(client_lock); - tout << "statfs" << std::endl; - return _statfs(stbuf); -} - -int Client::ll_statfs(inodeno_t ino, struct statvfs *stbuf) -{ - Mutex::Locker lock(client_lock); - tout << "ll_statfs" << std::endl; - return _statfs(stbuf); -} - -int Client::_statfs(struct statvfs *stbuf) -{ - dout(3) << "_statfs" << dendl; - - Cond cond; - tid_t tid = ++last_tid; - StatfsRequest *req = new StatfsRequest(tid, &cond); - statfs_requests[tid] = req; - - int mon = monmap->pick_mon(); - messenger->send_message(new MStatfs(req->tid), monmap->get_inst(mon)); - - while (req->reply == 0) - cond.Wait(client_lock); - - // yay - memcpy(stbuf, &req->reply->stfs, sizeof(*stbuf)); - - statfs_requests.erase(req->tid); - delete req->reply; - delete req; - - int r = 0; - dout(3) << "_statfs = " << r << dendl; - return r; -} - -void Client::handle_statfs_reply(MStatfsReply *reply) -{ - if (statfs_requests.count(reply->tid) && - statfs_requests[reply->tid]->reply == 0) { - dout(10) << "handle_statfs_reply " << *reply << ", kicking waiter" << dendl; - statfs_requests[reply->tid]->reply = reply; - statfs_requests[reply->tid]->caller_cond->Signal(); - } else { - dout(10) << "handle_statfs_reply " << *reply << ", dup or old, dropping" << dendl; - delete reply; - } -} - - -int Client::lazyio_propogate(int fd, off_t offset, size_t count) -{ - client_lock.Lock(); - dout(3) << "op: client->lazyio_propogate(" << fd - << ", " << offset << ", " << count << ")" << dendl; - - assert(fd_map.count(fd)); - Fh *f = fd_map[fd]; - Inode *in = f->inode; - - if (f->mode & FILE_MODE_LAZY) { - // wait for lazy cap - while ((in->file_caps() & CAP_FILE_LAZYIO) == 0) { - dout(7) << " don't have lazy cap, waiting" << dendl; - Cond cond; - in->waitfor_lazy.push_back(&cond); - cond.Wait(client_lock); - 
} - - if (g_conf.client_oc) { - Cond cond; - bool done = false; - in->fc.flush_dirty(new C_SafeCond(&client_lock, &cond, &done)); - - while (!done) - cond.Wait(client_lock); - - } else { - // mmm, nothin to do. - } - } - - client_lock.Unlock(); - return 0; -} - -int Client::lazyio_synchronize(int fd, off_t offset, size_t count) -{ - client_lock.Lock(); - dout(3) << "op: client->lazyio_synchronize(" << fd - << ", " << offset << ", " << count << ")" << dendl; - - assert(fd_map.count(fd)); - Fh *f = fd_map[fd]; - Inode *in = f->inode; - - if (f->mode & FILE_MODE_LAZY) { - // wait for lazy cap - while ((in->file_caps() & CAP_FILE_LAZYIO) == 0) { - dout(7) << " don't have lazy cap, waiting" << dendl; - Cond cond; - in->waitfor_lazy.push_back(&cond); - cond.Wait(client_lock); - } - - if (g_conf.client_oc) { - in->fc.flush_dirty(0); // flush to invalidate. - in->fc.release_clean(); - } else { - // mm, nothin to do. - } - } - - client_lock.Unlock(); - return 0; -} - - - - -// ========================================= -// low level - -// ugly hack for ll -#define FUSE_SET_ATTR_MODE (1 << 0) -#define FUSE_SET_ATTR_UID (1 << 1) -#define FUSE_SET_ATTR_GID (1 << 2) -#define FUSE_SET_ATTR_SIZE (1 << 3) -#define FUSE_SET_ATTR_ATIME (1 << 4) -#define FUSE_SET_ATTR_MTIME (1 << 5) - -int Client::ll_lookup(inodeno_t parent, const char *name, struct stat *attr) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_lookup " << parent << " " << name << dendl; - tout << "ll_lookup" << std::endl; - tout << parent.val << std::endl; - tout << name << std::endl; - - string dname = name; - Inode *diri = 0; - Inode *in = 0; - int r = 0; - - if (inode_map.count(parent) == 0) { - dout(1) << "ll_lookup " << parent << " " << name << " -> ENOENT (parent DNE... WTF)" << dendl; - r = -ENOENT; - attr->st_ino = 0; - goto out; - } - diri = inode_map[parent]; - if (!diri->inode.is_dir()) { - dout(1) << "ll_lookup " << parent << " " << name << " -> ENOTDIR (parent not a dir... 
WTF)" << dendl; - r = -ENOTDIR; - attr->st_ino = 0; - goto out; - } - - // get the inode - if (diri->dir && - diri->dir->dentries.count(dname)) { - Dentry *dn = diri->dir->dentries[dname]; - touch_dn(dn); - in = dn->inode; - } else { - string path; - diri->make_path(path); - path += "/"; - path += name; - _do_lstat(path.c_str(), 0, &in); - } - if (in) { - fill_stat(in, attr); - _ll_get(in); - } else { - r = -ENOENT; - attr->st_ino = 0; - } - - out: - dout(3) << "ll_lookup " << parent << " " << name - << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl; - tout << attr->st_ino << std::endl; - return r; -} - -void Client::_ll_get(Inode *in) -{ - if (in->ll_ref == 0) - in->get(); - in->ll_get(); - dout(20) << "_ll_get " << in << " " << in->inode.ino << " -> " << in->ll_ref << dendl; -} - -int Client::_ll_put(Inode *in, int num) -{ - in->ll_put(num); - dout(20) << "_ll_put " << in << " " << in->inode.ino << " " << num << " -> " << in->ll_ref << dendl; - if (in->ll_ref == 0) { - put_inode(in); - return 0; - } else { - return in->ll_ref; - } -} - -void Client::_ll_drop_pins() -{ - dout(10) << "_ll_drop_pins" << dendl; - hash_map::iterator next; - for (hash_map::iterator it = inode_map.begin(); - it != inode_map.end(); - it = next) { - Inode *in = it->second; - next = it; - next++; - if (in->ll_ref) - _ll_put(in, in->ll_ref); - } -} - -bool Client::ll_forget(inodeno_t ino, int num) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_forget " << ino << " " << num << dendl; - tout << "ll_forget" << std::endl; - tout << ino.val << std::endl; - tout << num << std::endl; - - if (ino == 1) return true; // ignore forget on root. 
- - bool last = false; - if (inode_map.count(ino) == 0) { - dout(1) << "WARNING: ll_forget on " << ino << " " << num - << ", which I don't have" << dendl; - } else { - Inode *in = inode_map[ino]; - assert(in); - if (in->ll_ref < num) { - dout(1) << "WARNING: ll_forget on " << ino << " " << num << ", which only has ll_ref=" << in->ll_ref << dendl; - _ll_put(in, in->ll_ref); - last = true; - } else { - if (_ll_put(in, num) == 0) - last = true; - } - } - return last; -} - -Inode *Client::_ll_get_inode(inodeno_t ino) -{ - if (inode_map.count(ino) == 0) { - assert(ino == 1); // must be the root inode. - Inode *in; - int r = _do_lstat("/", 0, &in); - assert(r >= 0); - return in; - } else { - return inode_map[ino]; - } -} - - -int Client::ll_getattr(inodeno_t ino, struct stat *attr) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_getattr " << ino << dendl; - tout << "ll_getattr" << std::endl; - tout << ino.val << std::endl; - - Inode *in = _ll_get_inode(ino); - fill_stat(in, attr); - return 0; -} - -int Client::ll_setattr(inodeno_t ino, struct stat *attr, int mask) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_setattr " << ino << " mask " << hex << mask << dec << dendl; - tout << "ll_setattr" << std::endl; - tout << ino.val << std::endl; - tout << attr->st_mode << std::endl; - tout << attr->st_uid << std::endl; - tout << attr->st_gid << std::endl; - tout << attr->st_size << std::endl; - tout << attr->st_mtime << std::endl; - tout << attr->st_atime << std::endl; - tout << mask << std::endl; - - Inode *in = _ll_get_inode(ino); - - string path; - in->make_path(path); - - int r = 0; - if ((mask & FUSE_SET_ATTR_MODE) && - ((r = _chmod(path.c_str(), attr->st_mode)) < 0)) return r; - - if ((mask & FUSE_SET_ATTR_UID) && (mask & FUSE_SET_ATTR_GID) && - ((r = _chown(path.c_str(), attr->st_uid, attr->st_gid)) < 0)) return r; - //if ((mask & FUSE_SET_ATTR_GID) && - //(r = client->_chgrp(path.c_str(), attr->st_gid) < 0)) return r; - - if ((mask & FUSE_SET_ATTR_SIZE) && 
- ((r = _truncate(path.c_str(), attr->st_size)) < 0)) return r; - - if ((mask & FUSE_SET_ATTR_MTIME) && (mask & FUSE_SET_ATTR_ATIME)) { - if ((r = _utimes(path.c_str(), utime_t(attr->st_mtime,0), utime_t(attr->st_atime,0))) < 0) return r; - } else if (mask & FUSE_SET_ATTR_MTIME) { - if ((r = _utimes(path.c_str(), utime_t(attr->st_mtime,0), utime_t())) < 0) return r; - } else if (mask & FUSE_SET_ATTR_ATIME) { - if ((r = _utimes(path.c_str(), utime_t(), utime_t(attr->st_atime,0))) < 0) return r; - } - - assert(r == 0); - fill_stat(in, attr); - - dout(3) << "ll_setattr " << ino << " = " << r << dendl; - return 0; -} - -int Client::ll_readlink(inodeno_t ino, const char **value) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_readlink " << ino << dendl; - tout << "ll_readlink" << std::endl; - tout << ino.val << std::endl; - - Inode *in = _ll_get_inode(ino); - if (in->dn) touch_dn(in->dn); - - int r = 0; - if (in->inode.is_symlink()) { - *value = in->symlink->c_str(); - } else { - *value = ""; - r = -EINVAL; - } - dout(3) << "ll_readlink " << ino << " = " << r << " (" << *value << ")" << dendl; - return r; -} - -int Client::ll_mknod(inodeno_t parent, const char *name, mode_t mode, dev_t rdev, struct stat *attr) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_mknod " << parent << " " << name << dendl; - tout << "ll_mknod" << std::endl; - tout << parent.val << std::endl; - tout << name << std::endl; - tout << mode << std::endl; - tout << rdev << std::endl; - - Inode *diri = _ll_get_inode(parent); - - string path; - diri->make_path(path); - path += "/"; - path += name; - int r = _mknod(path.c_str(), mode, rdev); - if (r == 0) { - string dname(name); - Inode *in = diri->dir->dentries[dname]->inode; - fill_stat(in, attr); - _ll_get(in); - } - tout << attr->st_ino << std::endl; - dout(3) << "ll_mknod " << parent << " " << name - << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl; - return r; -} - -int Client::ll_mkdir(inodeno_t parent, const char 
*name, mode_t mode, struct stat *attr) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_mkdir " << parent << " " << name << dendl; - tout << "ll_mkdir" << std::endl; - tout << parent.val << std::endl; - tout << name << std::endl; - tout << mode << std::endl; - - Inode *diri = _ll_get_inode(parent); - - string path; - diri->make_path(path); - path += "/"; - path += name; - int r = _mkdir(path.c_str(), mode); - if (r == 0) { - string dname(name); - Inode *in = diri->dir->dentries[dname]->inode; - fill_stat(in, attr); - _ll_get(in); - } - tout << attr->st_ino << std::endl; - dout(3) << "ll_mkdir " << parent << " " << name - << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl; - return r; -} - -int Client::ll_symlink(inodeno_t parent, const char *name, const char *value, struct stat *attr) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_symlink " << parent << " " << name << " -> " << value << dendl; - tout << "ll_symlink" << std::endl; - tout << parent.val << std::endl; - tout << name << std::endl; - tout << value << std::endl; - - Inode *diri = _ll_get_inode(parent); - - string path; - diri->make_path(path); - path += "/"; - path += name; - int r = _symlink(value, path.c_str()); - if (r == 0) { - string dname(name); - Inode *in = diri->dir->dentries[dname]->inode; - fill_stat(in, attr); - _ll_get(in); - } - tout << attr->st_ino << std::endl; - dout(3) << "ll_symlink " << parent << " " << name - << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl; - return r; -} - -int Client::ll_unlink(inodeno_t ino, const char *name) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_unlink " << ino << " " << name << dendl; - tout << "ll_unlink" << std::endl; - tout << ino.val << std::endl; - tout << name << std::endl; - - Inode *diri = _ll_get_inode(ino); - - string path; - diri->make_path(path); - path += "/"; - path += name; - return _unlink(path.c_str()); -} - -int Client::ll_rmdir(inodeno_t ino, const char *name) -{ - 
Mutex::Locker lock(client_lock); - dout(3) << "ll_rmdir " << ino << " " << name << dendl; - tout << "ll_rmdir" << std::endl; - tout << ino.val << std::endl; - tout << name << std::endl; - - Inode *diri = _ll_get_inode(ino); - - string path; - diri->make_path(path); - path += "/"; - path += name; - return _rmdir(path.c_str()); -} - -int Client::ll_rename(inodeno_t parent, const char *name, inodeno_t newparent, const char *newname) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_rename " << parent << " " << name << " to " - << newparent << " " << newname << dendl; - tout << "ll_rename" << std::endl; - tout << parent.val << std::endl; - tout << name << std::endl; - tout << newparent.val << std::endl; - tout << newname << std::endl; - - Inode *diri = _ll_get_inode(parent); - string path; - diri->make_path(path); - path += "/"; - path += name; - - Inode *newdiri = _ll_get_inode(newparent); - string newpath; - newdiri->make_path(newpath); - newpath += "/"; - newpath += newname; - - return _rename(path.c_str(), newpath.c_str()); -} - -int Client::ll_link(inodeno_t ino, inodeno_t newparent, const char *newname, struct stat *attr) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_link " << ino << " to " << newparent << " " << newname << dendl; - tout << "ll_link" << std::endl; - tout << ino.val << std::endl; - tout << newparent << std::endl; - tout << newname << std::endl; - - Inode *old = _ll_get_inode(ino); - Inode *diri = _ll_get_inode(newparent); - - string path; - old->make_path(path); - - string newpath; - diri->make_path(newpath); - newpath += "/"; - newpath += newname; - - int r = _link(path.c_str(), newpath.c_str()); - if (r == 0) { - Inode *in = _ll_get_inode(ino); - fill_stat(in, attr); - _ll_get(in); - } - return r; -} - -int Client::ll_opendir(inodeno_t ino, void **dirpp) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_opendir " << ino << dendl; - tout << "ll_opendir" << std::endl; - tout << ino.val << std::endl; - - Inode *diri = 
inode_map[ino]; - assert(diri); - string path; - diri->make_path(path); - - int r = _opendir(path.c_str(), (DirResult**)dirpp); - - tout << (unsigned long)*dirpp << std::endl; - - dout(3) << "ll_opendir " << ino << " = " << r << " (" << *dirpp << ")" << dendl; - return r; -} - -void Client::ll_releasedir(void *dirp) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_releasedir " << dirp << dendl; - tout << "ll_releasedir" << std::endl; - tout << (unsigned long)dirp << std::endl; - _closedir((DirResult*)dirp); -} - -int Client::ll_open(inodeno_t ino, int flags, Fh **fhp) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_open " << ino << " " << flags << dendl; - tout << "ll_open" << std::endl; - tout << ino.val << std::endl; - tout << flags << std::endl; - - Inode *in = _ll_get_inode(ino); - string path; - in->make_path(path); - - int r = _open(path.c_str(), flags, 0, fhp); - - tout << (unsigned long)*fhp << std::endl; - dout(3) << "ll_open " << ino << " " << flags << " = " << r << " (" << *fhp << ")" << dendl; - return r; -} - -int Client::ll_create(inodeno_t parent, const char *name, mode_t mode, int flags, - struct stat *attr, Fh **fhp) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_create " << parent << " " << name << " 0" << oct << mode << dec << " " << flags << dendl; - tout << "ll_create" << std::endl; - tout << parent.val << std::endl; - tout << name << std::endl; - tout << mode << std::endl; - tout << flags << std::endl; - - Inode *pin = _ll_get_inode(parent); - string path; - pin->make_path(path); - path += "/"; - path += name; - - int r = _open(path.c_str(), flags|O_CREAT, mode, fhp); - if (r >= 0) { - Inode *in = (*fhp)->inode; - fill_stat(in, attr); - _ll_get(in); - } else { - attr->st_ino = 0; - } - tout << (unsigned long)*fhp << std::endl; - tout << attr->st_ino << std::endl; - dout(3) << "ll_create " << parent << " " << name << " 0" << oct << mode << dec << " " << flags - << " = " << r << " (" << *fhp << " " << hex << attr->st_ino << 
dec << ")" << dendl; - return 0; -} - -int Client::ll_read(Fh *fh, off_t off, off_t len, bufferlist *bl) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_read " << fh << " " << off << "~" << len << dendl; - tout << "ll_read" << std::endl; - tout << (unsigned long)fh << std::endl; - tout << off << std::endl; - tout << len << std::endl; - - return _read(fh, off, len, bl); -} - -int Client::ll_write(Fh *fh, off_t off, off_t len, const char *data) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_write " << fh << " " << off << "~" << len << dendl; - tout << "ll_write" << std::endl; - tout << (unsigned long)fh << std::endl; - tout << off << std::endl; - tout << len << std::endl; - - return _write(fh, off, len, data); -} - -int Client::ll_flush(Fh *fh) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_flush " << fh << dendl; - tout << "ll_flush" << std::endl; - tout << (unsigned long)fh << std::endl; - - return _flush(fh); -} - -int Client::ll_fsync(Fh *fh, bool syncdataonly) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_fsync " << fh << dendl; - tout << "ll_fsync" << std::endl; - tout << (unsigned long)fh << std::endl; - - return _fsync(fh, syncdataonly); -} - - -int Client::ll_release(Fh *fh) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_release " << fh << dendl; - tout << "ll_release" << std::endl; - tout << (unsigned long)fh << std::endl; - - _release(fh); - return 0; -} - - - - - - -// ========================================= -// layout - - -int Client::describe_layout(int fd, FileLayout *lp) -{ - Mutex::Locker lock(client_lock); - - assert(fd_map.count(fd)); - Fh *f = fd_map[fd]; - Inode *in = f->inode; - - *lp = in->inode.layout; - - dout(3) << "describe_layout(" << fd << ") = 0" << dendl; - return 0; -} - -int Client::get_stripe_unit(int fd) -{ - FileLayout layout; - describe_layout(fd, &layout); - return layout.fl_stripe_unit; -} - -int Client::get_stripe_width(int fd) -{ - FileLayout layout; - describe_layout(fd, &layout); - 
return ceph_file_layout_stripe_width(layout); -} - -int Client::get_stripe_period(int fd) -{ - FileLayout layout; - describe_layout(fd, &layout); - return ceph_file_layout_period(layout); -} - -int Client::enumerate_layout(int fd, list& result, - off_t length, off_t offset) -{ - Mutex::Locker lock(client_lock); - - assert(fd_map.count(fd)); - Fh *f = fd_map[fd]; - Inode *in = f->inode; - - // map to a list of extents - filer->file_to_extents(in->inode, offset, length, result); - - dout(3) << "enumerate_layout(" << fd << ", " << length << ", " << offset << ") = 0" << dendl; - return 0; -} - - -void Client::ms_handle_failure(Message *m, const entity_inst_t& inst) -{ - entity_name_t dest = inst.name; - - if (dest.is_mon()) { - // resend to a different monitor. - int mon = monmap->pick_mon(true); - dout(0) << "ms_handle_failure " << *m << " to " << inst - << ", resending to mon" << mon - << dendl; - messenger->send_message(m, monmap->get_inst(mon)); - } - else if (dest.is_osd()) { - objecter->ms_handle_failure(m, dest, inst); - } - else if (dest.is_mds()) { - dout(0) << "ms_handle_failure " << *m << " to " << inst << dendl; - //failed_mds.insert(dest.num()); - } - else { - // client? - dout(0) << "ms_handle_failure " << *m << " to " << inst << ", dropping" << dendl; - delete m; - } -} - diff --git a/branches/sage/crush/client/Client.h b/branches/sage/crush/client/Client.h deleted file mode 100644 index 727098906c617..0000000000000 --- a/branches/sage/crush/client/Client.h +++ /dev/null @@ -1,847 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __CLIENT_H -#define __CLIENT_H - - -#include "mds/MDSMap.h" -#include "osd/OSDMap.h" -#include "mon/MonMap.h" - -#include "msg/Message.h" -#include "msg/Dispatcher.h" -#include "msg/Messenger.h" - -#include "messages/MClientReply.h" - -#include "include/types.h" -#include "include/lru.h" -#include "include/filepath.h" -#include "include/interval_set.h" - -#include "common/Mutex.h" -#include "common/Timer.h" - -#include "FileCache.h" - - -// stl -#include -#include -#include -using std::set; -using std::map; -using std::fstream; - -#include -using namespace __gnu_cxx; - - - -class MStatfsReply; -class MClientSession; -class MClientRequest; -class MClientRequestForward; - -class Filer; -class Objecter; -class ObjectCacher; - -extern class LogType client_logtype; -extern class Logger *client_logger; - - - -// ============================================ -// types for my local metadata cache -/* basic structure: - - - Dentries live in an LRU loop. they get expired based on last access. - see include/lru.h. items can be bumped to "mid" or "top" of list, etc. - - Inode has ref count for each Fh, Dir, or Dentry that points to it. - - when Inode ref goes to 0, it's expired. - - when Dir is empty, it's removed (and it's Inode ref--) - -*/ - -class Dir; -class Inode; - -class Dentry : public LRUObject { - public: - string name; // sort of lame - //const char *name; - Dir *dir; - Inode *inode; - int ref; // 1 if there's a dir beneath me. 
- - void get() { assert(ref == 0); ref++; lru_pin(); } - void put() { assert(ref == 1); ref--; lru_unpin(); } - - Dentry() : dir(0), inode(0), ref(0) { } - - /*Dentry() : name(0), dir(0), inode(0), ref(0) { } - Dentry(string& n) : name(0), dir(0), inode(0), ref(0) { - name = new char[n.length()+1]; - strcpy((char*)name, n.c_str()); - } - ~Dentry() { - delete[] name; - }*/ -}; - -class Dir { - public: - Inode *parent_inode; // my inode - //hash_map, eqstr> dentries; - hash_map dentries; - - Dir(Inode* in) { parent_inode = in; } - - bool is_empty() { return dentries.empty(); } -}; - - -class InodeCap { - public: - int caps; - long seq; - InodeCap() : caps(0), seq(0) {} -}; - - -class Inode { - public: - inode_t inode; // the actual inode - utime_t valid_until; - int mask; - - // about the dir (if this is one!) - int dir_auth; - set dir_contacts; - bool dir_hashed, dir_replicated; - - // per-mds caps - map caps; // mds -> InodeCap - map stale_caps; // mds -> cap .. stale - - utime_t file_wr_mtime; // [writers] time of last write - off_t file_wr_size; // [writers] largest offset we've written to - int num_open_rd, num_open_wr, num_open_lazy; // num readers, writers - - int ref; // ref count. 1 for each dentry, fh that links to me. - int ll_ref; // separate ref count for ll client - Dir *dir; // if i'm a dir. - Dentry *dn; // if i'm linked to a dentry. 
- string *symlink; // symlink content, if it's a symlink - fragtree_t dirfragtree; - map fragmap; // known frag -> mds mappings - - // for caching i/o mode - FileCache fc; - - // for sync i/o mode - int sync_reads; // sync reads in progress - int sync_writes; // sync writes in progress - - list waitfor_write; - list waitfor_read; - list waitfor_lazy; - list waitfor_no_read, waitfor_no_write; - - // - bool hack_balance_reads; - // - - void make_path(string& p) { - if (dn) { - if (dn->dir && dn->dir->parent_inode) - dn->dir->parent_inode->make_path(p); - p += "/"; - p += dn->name; - } - } - - void get() { - ref++; - //cout << "inode.get on " << this << " " << hex << inode.ino << dec << " now " << ref << endl; - } - void put(int n=1) { - ref -= n; assert(ref >= 0); - //cout << "inode.put on " << this << " " << hex << inode.ino << dec << " now " << ref << endl; - } - - void ll_get() { - ll_ref++; - } - void ll_put(int n=1) { - assert(ll_ref >= n); - ll_ref -= n; - } - - Inode(inode_t _inode, ObjectCacher *_oc) : - inode(_inode), - valid_until(0, 0), - dir_auth(-1), dir_hashed(false), dir_replicated(false), - file_wr_mtime(0, 0), file_wr_size(0), - num_open_rd(0), num_open_wr(0), num_open_lazy(0), - ref(0), ll_ref(0), - dir(0), dn(0), symlink(0), - fc(_oc, _inode), - sync_reads(0), sync_writes(0), - hack_balance_reads(false) - { } - ~Inode() { - if (symlink) { delete symlink; symlink = 0; } - } - - inodeno_t ino() { return inode.ino; } - - bool is_dir() { - return (inode.mode & INODE_TYPE_MASK) == INODE_MODE_DIR; - } - - int file_caps() { - int c = 0; - for (map::iterator it = caps.begin(); - it != caps.end(); - it++) - c |= it->second.caps; - for (map::iterator it = stale_caps.begin(); - it != stale_caps.end(); - it++) - c |= it->second.caps; - return c; - } - - int file_caps_wanted() { - int w = 0; - if (num_open_rd) w |= CAP_FILE_RD|CAP_FILE_RDCACHE; - if (num_open_wr) w |= CAP_FILE_WR|CAP_FILE_WRBUFFER; - if (num_open_lazy) w |= CAP_FILE_LAZYIO; - if (fc.is_dirty()) 
w |= CAP_FILE_WRBUFFER; - if (fc.is_cached()) w |= CAP_FILE_RDCACHE; - return w; - } - - void add_open(int cmode) { - if (cmode & FILE_MODE_R) num_open_rd++; - if (cmode & FILE_MODE_W) num_open_wr++; - if (cmode & FILE_MODE_LAZY) num_open_lazy++; - } - void sub_open(int cmode) { - if (cmode & FILE_MODE_R) num_open_rd--; - if (cmode & FILE_MODE_W) num_open_wr--; - if (cmode & FILE_MODE_LAZY) num_open_lazy--; - } - - int authority(const string& dname) { - if (!dirfragtree.empty()) { - __gnu_cxx::hash H; - frag_t fg = dirfragtree[H(dname)]; - while (fg != frag_t()) { - if (fragmap.count(fg) && - fragmap[fg] >= 0) { - //cout << "picked frag ino " << inode.ino << " dname " << dname << " fg " << fg << " mds" << fragmap[fg] << std::endl; - return fragmap[fg]; - } - fg = frag_t(fg.value(), fg.bits()-1); // try more general... - } - } - return authority(); - } - - int authority() { - if (dir_auth >= 0) - return dir_auth; - - assert(dn); - return dn->dir->parent_inode->authority(dn->name); - } - - - int pick_replica(MDSMap *mdsmap) { - // replicas? 
- /* fixme - if (//ino() > 1ULL && - dir_contacts.size()) { - set::iterator it = dir_contacts.begin(); - if (dir_contacts.size() == 1) - return *it; - else { - //cout << "dir_contacts on " << inode.ino << " is " << dir_contacts << std::endl; - int r = 1 + (rand() % dir_contacts.size()); - int a = authority(); - while (r--) { - it++; - if (mdsmap->is_down(*it)) it++; - if (it == dir_contacts.end()) it = dir_contacts.begin(); - if (*it == a) it++; // skip the authority - if (it == dir_contacts.end()) it = dir_contacts.begin(); - } - return *it; - } - } - */ - - if (dir_replicated) {// || ino() == 1) { - // pick a random mds that isn't the auth - set s; - mdsmap->get_in_mds_set(s); - set::iterator it = s.begin(); - if (s.empty()) - return 0; - if (s.size() == 1) - return *it; - else { - //cout << "dir_contacts on " << inode.ino << " is " << dir_contacts << std::endl; - int r = 1 + (rand() % s.size()); - int a = authority(); - while (r--) { - it++; - if (mdsmap->is_down(*it)) it++; - if (it == s.end()) it = s.begin(); - if (*it == a) it++; // skip the authority - if (it == s.end()) it = s.begin(); - } - //if (inode.ino == 1) cout << "chose " << *it << " from " << s << std::endl; - return *it; - } - //cout << "num_mds is " << mdcluster->get_num_mds() << endl; - //return mdsmap->get_random_in_mds(); - //return rand() % mdsmap->get_num_mds(); // huh.. pick a random mds! - } - else - return authority(); - } - - - // open Dir for an inode. if it's not open, allocated it (and pin dentry in memory). 
- Dir *open_dir() { - if (!dir) { - if (dn) dn->get(); // pin dentry - get(); // pin inode - dir = new Dir(this); - } - return dir; - } - -}; - - - - -// file handle for any open file state - -struct Fh { - Inode *inode; - off_t pos; - int mds; // have to talk to mds we opened with (for now) - int mode; // the mode i opened the file with - - bool is_lazy() { return mode & O_LAZY; } - - bool pos_locked; // pos is currently in use - list pos_waiters; // waiters for pos - - Fh() : inode(0), pos(0), mds(0), mode(0), pos_locked(false) {} -}; - - - - - -// ======================================================== -// client interface - -class Client : public Dispatcher { - public: - - /* getdir result */ - struct DirEntry { - string d_name; - struct stat st; - int stmask; - DirEntry(const string &s) : d_name(s), stmask(0) {} - DirEntry(const string &n, struct stat& s, int stm) : d_name(n), st(s), stmask(stm) {} - }; - - struct DirResult { - static const int SHIFT = 28; - static const int64_t MASK = (1 << SHIFT) - 1; - static const off_t END = 1ULL << (SHIFT + 32); - - string path; - Inode *inode; - int64_t offset; // high bits: frag_t, low bits: an offset - map > buffer; - - DirResult(const char *p, Inode *in=0) : path(p), inode(in), offset(0) { - if (inode) inode->get(); - } - DirResult(const string &p, Inode *in=0) : path(p), inode(in), offset(0) { - if (inode) inode->get(); - } - - frag_t frag() { return frag_t(offset >> SHIFT); } - unsigned fragpos() { return offset & MASK; } - - void next_frag() { - frag_t fg = offset >> SHIFT; - if (fg.is_rightmost()) - set_end(); - else - set_frag(fg.next()); - } - void set_frag(frag_t f) { - offset = (uint64_t)f << SHIFT; - assert(sizeof(offset) == 8); - } - void set_end() { offset = END; } - bool at_end() { return (offset == END); } - }; - - - // cluster descriptors - MDSMap *mdsmap; - OSDMap *osdmap; - - SafeTimer timer; - - protected: - Messenger *messenger; - int whoami; - MonMap *monmap; - - // mds sessions - map 
mds_sessions; // mds -> push seq - map > waiting_for_session; - list waiting_for_mdsmap; - - void handle_client_session(MClientSession *m); - void send_reconnect(int mds); - - // mds requests - struct MetaRequest { - tid_t tid; - MClientRequest *request; - bufferlist request_payload; // in case i have to retry - - bool idempotent; // is request idempotent? - set mds; // who i am asking - int resend_mds; // someone wants you to (re)send the request here - int num_fwd; // # of times i've been forwarded - int retry_attempt; - - MClientReply *reply; // the reply - - Cond *caller_cond; // who to take up - Cond *dispatch_cond; // who to kick back - - MetaRequest(MClientRequest *req, tid_t t) : - tid(t), request(req), - idempotent(false), resend_mds(-1), num_fwd(0), retry_attempt(0), - reply(0), - caller_cond(0), dispatch_cond(0) { } - }; - tid_t last_tid; - map mds_requests; - set failed_mds; - - struct StatfsRequest { - tid_t tid; - MStatfsReply *reply; - Cond *caller_cond; - StatfsRequest(tid_t t, Cond *cc) : tid(t), reply(0), caller_cond(cc) {} - }; - map statfs_requests; - - MClientReply *make_request(MClientRequest *req, int use_auth=-1); - int choose_target_mds(MClientRequest *req); - void send_request(MetaRequest *request, int mds); - void kick_requests(int mds); - void handle_client_request_forward(MClientRequestForward *reply); - void handle_client_reply(MClientReply *reply); - void handle_statfs_reply(MStatfsReply *reply); - - bool mounted; - bool unmounting; - Cond mount_cond; - int my_instance; - - int unsafe_sync_write; -public: - entity_name_t get_myname() { return messenger->get_myname(); } - void hack_sync_write_safe(); - -protected: - Filer *filer; - ObjectCacher *objectcacher; - Objecter *objecter; // (non-blocking) osd interface - - // cache - hash_map inode_map; - Inode* root; - LRU lru; // lru list of Dentry's in our local metadata cache. - - // cap weirdness - map > cap_reap_queue; // ino -> mds -> msg .. 
set of (would-be) stale caps to reap - - - // file handles, etc. - string cwd; - interval_set free_fd_set; // unused fds - hash_map fd_map; - - int get_fd() { - int fd = free_fd_set.start(); - free_fd_set.erase(fd, 1); - return fd; - } - void put_fd(int fd) { - free_fd_set.insert(fd, 1); - } - - void mkabspath(const char *rel, string& abs) { - if (rel[0] == '/') { - abs = rel; - } else { - abs = cwd; - abs += "/"; - abs += rel; - } - } - - - // global client lock - // - protects Client and buffer cache both! - Mutex client_lock; - - - // -- metadata cache stuff - - // decrease inode ref. delete if dangling. - void put_inode(Inode *in, int n=1) { - //cout << "put_inode on " << in << " " << in->inode.ino << endl; - in->put(n); - if (in->ref == 0) { - //cout << "put_inode deleting " << in->inode.ino << endl; - inode_map.erase(in->inode.ino); - if (in == root) root = 0; - delete in; - } - } - - void close_dir(Dir *dir) { - assert(dir->is_empty()); - - Inode *in = dir->parent_inode; - if (in->dn) in->dn->put(); // unpin dentry - - delete in->dir; - in->dir = 0; - put_inode(in); // unpin inode - } - - //int get_cache_size() { return lru.lru_get_size(); } - //void set_cache_size(int m) { lru.lru_set_max(m); } - - Dentry* link(Dir *dir, const string& name, Inode *in) { - Dentry *dn = new Dentry; - dn->name = name; - - // link to dir - dn->dir = dir; - //cout << "link dir " << dir->parent_inode->inode.ino << " '" << name << "' -> inode " << in->inode.ino << endl; - dir->dentries[dn->name] = dn; - - // link to inode - dn->inode = in; - assert(in->dn == 0); - in->dn = dn; - in->get(); - - if (in->dir) dn->get(); // dir -> dn pin - - lru.lru_insert_mid(dn); // mid or top? 
- return dn; - } - - void unlink(Dentry *dn) { - Inode *in = dn->inode; - assert(in->dn == dn); - - // unlink from inode - if (dn->inode->dir) dn->put(); // dir -> dn pin - dn->inode = 0; - in->dn = 0; - put_inode(in); - - // unlink from dir - dn->dir->dentries.erase(dn->name); - if (dn->dir->is_empty()) - close_dir(dn->dir); - dn->dir = 0; - - // delete den - lru.lru_remove(dn); - delete dn; - } - - Dentry *relink(Dir *dir, const string& name, Inode *in) { - Dentry *olddn = in->dn; - Dir *olddir = olddn->dir; // note: might == dir! - - // newdn, attach to inode. don't touch inode ref. - Dentry *newdn = new Dentry; - newdn->dir = dir; - newdn->name = name; - newdn->inode = in; - in->dn = newdn; - - if (in->dir) { // dir -> dn pin - newdn->get(); - olddn->put(); - } - - // unlink old dn from dir - olddir->dentries.erase(olddn->name); - olddn->inode = 0; - olddn->dir = 0; - lru.lru_remove(olddn); - - // link new dn to dir - dir->dentries[name] = newdn; - lru.lru_insert_mid(newdn); - - // olddir now empty? (remember, olddir might == dir) - if (olddir->is_empty()) - close_dir(olddir); - - return newdn; - } - - // move dentry to top of lru - void touch_dn(Dentry *dn) { lru.lru_touch(dn); } - - // trim cache. 
- void trim_cache(); - void dump_inode(Inode *in, set& did); - void dump_cache(); // debug - - // find dentry based on filepath - Dentry *lookup(filepath& path); - - int fill_stat(Inode *in, struct stat *st); - - - // trace generation - ofstream traceout; - - - // friends - friend class SyntheticClient; - - public: - Client(Messenger *m, MonMap *mm, int i=0); - ~Client(); - void tear_down_cache(); - - int get_nodeid() { return whoami; } - - void init(); - void shutdown(); - - // messaging - void dispatch(Message *m); - - void handle_unmount(Message*); - void handle_mds_map(class MMDSMap *m); - - // file caps - void handle_file_caps(class MClientFileCaps *m); - void implemented_caps(class MClientFileCaps *m, Inode *in); - void release_caps(Inode *in, int retain=0); - void update_caps_wanted(Inode *in); - - void close_release(Inode *in); - void close_safe(Inode *in); - - void lock_fh_pos(Fh *f); - void unlock_fh_pos(Fh *f); - - // metadata cache - Inode* insert_inode(Dir *dir, InodeStat *in_info, const string& dn); - void update_dir_dist(Inode *in, DirStat *st); - Inode* insert_trace(MClientReply *reply); - - // ---------------------- - // fs ops. 
-private: - void _try_mount(); - void _mount_timeout(); - Context *mount_timeout_event; - - class C_MountTimeout : public Context { - Client *client; - public: - C_MountTimeout(Client *c) : client(c) { } - void finish(int r) { - if (r >= 0) client->_mount_timeout(); - } - }; - - // some helpers - int _do_lstat(const char *path, int mask, Inode **in); - int _opendir(const char *name, DirResult **dirpp); - void _readdir_add_dirent(DirResult *dirp, const string& name, Inode *in); - void _readdir_fill_dirent(struct dirent *de, DirEntry *entry, off_t); - bool _readdir_have_frag(DirResult *dirp); - void _readdir_next_frag(DirResult *dirp); - void _readdir_rechoose_frag(DirResult *dirp); - int _readdir_get_frag(DirResult *dirp); - void _closedir(DirResult *dirp); - void _ll_get(Inode *in); - int _ll_put(Inode *in, int num); - void _ll_drop_pins(); - - // internal interface - // call these with client_lock held! - int _link(const char *existing, const char *newname); - int _unlink(const char *path); - int _rename(const char *from, const char *to); - int _mkdir(const char *path, mode_t mode); - int _rmdir(const char *path); - int _readlink(const char *path, char *buf, off_t size); - int _symlink(const char *existing, const char *newname); - int _lstat(const char *path, struct stat *stbuf); - int _chmod(const char *relpath, mode_t mode); - int _chown(const char *relpath, uid_t uid, gid_t gid); - int _utimes(const char *relpath, utime_t mtime, utime_t atime); - int _mknod(const char *path, mode_t mode, dev_t rdev); - int _open(const char *path, int flags, mode_t mode, Fh **fhp); - int _release(Fh *fh); - int _read(Fh *fh, off_t offset, off_t size, bufferlist *bl); - int _write(Fh *fh, off_t offset, off_t size, const char *buf); - int _flush(Fh *fh); - int _truncate(const char *file, off_t length); - int _ftruncate(Fh *fh, off_t length); - int _fsync(Fh *fh, bool syncdataonly); - int _statfs(struct statvfs *stbuf); - - -public: - int mount(); - int unmount(); - - // these 
shoud (more or less) mirror the actual system calls. - int statfs(const char *path, struct statvfs *stbuf); - - // crap - int chdir(const char *s); - const string getcwd() { return cwd; } - - // namespace ops - int getdir(const char *relpath, list& names); // get the whole dir at once. - - int opendir(const char *name, DIR **dirpp); - int closedir(DIR *dirp); - int readdir_r(DIR *dirp, struct dirent *de); - int readdirplus_r(DIR *dirp, struct dirent *de, struct stat *st, int *stmask); - void rewinddir(DIR *dirp); - off_t telldir(DIR *dirp); - void seekdir(DIR *dirp, off_t offset); - - struct dirent_plus *readdirplus(DIR *dirp); - int readdirplus_r(DIR *dirp, struct dirent_plus *entry, struct dirent_plus **result); - struct dirent_lite *readdirlite(DIR *dirp); - int readdirlite_r(DIR *dirp, struct dirent_lite *entry, struct dirent_lite **result); - - int link(const char *existing, const char *newname); - int unlink(const char *path); - int rename(const char *from, const char *to); - - // dirs - int mkdir(const char *path, mode_t mode); - int rmdir(const char *path); - - // symlinks - int readlink(const char *path, char *buf, off_t size); - int symlink(const char *existing, const char *newname); - - // inode stuff - int lstat(const char *path, struct stat *stbuf); - int lstatlite(const char *path, struct statlite *buf); - - int chmod(const char *path, mode_t mode); - int chown(const char *path, uid_t uid, gid_t gid); - int utime(const char *path, struct utimbuf *buf); - - // file ops - int mknod(const char *path, mode_t mode, dev_t rdev=0); - int open(const char *path, int flags, mode_t mode=0); - int close(int fd); - off_t lseek(int fd, off_t offset, int whence); - int read(int fd, char *buf, off_t size, off_t offset=-1); - int write(int fd, const char *buf, off_t size, off_t offset=-1); - int fake_write_size(int fd, off_t size); - int truncate(const char *file, off_t size); - int ftruncate(int fd, off_t size); - int fsync(int fd, bool syncdataonly); - - // hpc 
lazyio - int lazyio_propogate(int fd, off_t offset, size_t count); - int lazyio_synchronize(int fd, off_t offset, size_t count); - - // expose file layout - int describe_layout(int fd, FileLayout* layout); - int get_stripe_unit(int fd); - int get_stripe_width(int fd); - int get_stripe_period(int fd); - int enumerate_layout(int fd, list& result, - off_t length, off_t offset); - - // low-level interface - int ll_lookup(inodeno_t parent, const char *name, struct stat *attr); - bool ll_forget(inodeno_t ino, int count); - Inode *_ll_get_inode(inodeno_t ino); - int ll_getattr(inodeno_t ino, struct stat *st); - int ll_setattr(inodeno_t ino, struct stat *st, int mask); - int ll_opendir(inodeno_t ino, void **dirpp); - void ll_releasedir(void *dirp); - int ll_readlink(inodeno_t ino, const char **value); - int ll_mknod(inodeno_t ino, const char *name, mode_t mode, dev_t rdev, struct stat *attr); - int ll_mkdir(inodeno_t ino, const char *name, mode_t mode, struct stat *attr); - int ll_symlink(inodeno_t ino, const char *name, const char *value, struct stat *attr); - int ll_unlink(inodeno_t ino, const char *name); - int ll_rmdir(inodeno_t ino, const char *name); - int ll_rename(inodeno_t parent, const char *name, inodeno_t newparent, const char *newname); - int ll_link(inodeno_t ino, inodeno_t newparent, const char *newname, struct stat *attr); - int ll_open(inodeno_t ino, int flags, Fh **fh); - int ll_create(inodeno_t parent, const char *name, mode_t mode, int flags, struct stat *attr, Fh **fh); - int ll_read(Fh *fh, off_t off, off_t len, bufferlist *bl); - int ll_write(Fh *fh, off_t off, off_t len, const char *data); - int ll_flush(Fh *fh); - int ll_fsync(Fh *fh, bool syncdataonly); - int ll_release(Fh *fh); - int ll_statfs(inodeno_t, struct statvfs *stbuf); - - - // failure - void ms_handle_failure(Message*, const entity_inst_t& inst); -}; - -#endif diff --git a/branches/sage/crush/client/SyntheticClient.cc b/branches/sage/crush/client/SyntheticClient.cc deleted file mode 
100644 index 1695631b8b8cb..0000000000000 --- a/branches/sage/crush/client/SyntheticClient.cc +++ /dev/null @@ -1,2882 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include -#include -using namespace std; - - - -#include "SyntheticClient.h" -#include "osdc/Objecter.h" - -#include "include/filepath.h" -#include "mds/mdstypes.h" -#include "common/Logger.h" - -#include -#include -#include -#include -#include -#include -#include - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_client) *_dout << dbeginl << g_clock.now() << " synthetic" << client->get_nodeid() << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_client) *_derr << dbeginl << g_clock.now() << " synthetic" << client->get_nodeid() << " " - -// traces -//void trace_include(SyntheticClient *syn, Client *cl, string& prefix); -//void trace_openssh(SyntheticClient *syn, Client *cl, string& prefix); - - -list syn_modes; -list syn_iargs; -list syn_sargs; - -void parse_syn_options(vector& args) -{ - vector nargs; - - for (unsigned i=0; iclient = client; - thread_id = 0; - - did_readdir = false; - - run_only = -1; - exclude = -1; - - this->modes = syn_modes; - this->iargs = syn_iargs; - this->sargs = syn_sargs; - - run_start = g_clock.now(); -} - - - - -#define DBL 2 - -void *synthetic_client_thread_entry(void *ptr) -{ - SyntheticClient *sc = (SyntheticClient*)ptr; - //int r = - sc->run(); - return 0;//(void*)r; -} - -string SyntheticClient::get_sarg(int seq) -{ - string a; - if (!sargs.empty()) { - a = sargs.front(); - sargs.pop_front(); - } - if (a.length() == 0 || a == "~") { - char s[20]; - 
sprintf(s,"syn.%d.%d", client->whoami, seq); - a = s; - } - return a; -} - -int SyntheticClient::run() -{ - client->init(); - client->mount(); - - //run_start = g_clock.now(); - run_until = utime_t(0,0); - dout(5) << "run" << dendl; - - int seq = 0; - - for (list::iterator it = modes.begin(); - it != modes.end(); - it++) { - int mode = *it; - dout(3) << "mode " << mode << dendl; - - switch (mode) { - - - // WHO? - - case SYNCLIENT_MODE_ONLY: - { - run_only = iargs.front(); - iargs.pop_front(); - if (run_only == client->get_nodeid()) - dout(2) << "only " << run_only << dendl; - } - break; - case SYNCLIENT_MODE_ONLYRANGE: - { - int first = iargs.front(); - iargs.pop_front(); - int last = iargs.front(); - iargs.pop_front(); - if (first <= client->get_nodeid() && - last > client->get_nodeid()) { - run_only = client->get_nodeid(); - dout(2) << "onlyrange [" << first << ", " << last << ") includes me" << dendl; - } else - run_only = client->get_nodeid()+1; // not me - } - break; - case SYNCLIENT_MODE_EXCLUDE: - { - exclude = iargs.front(); - iargs.pop_front(); - if (exclude == client->get_nodeid()) { - run_only = client->get_nodeid() + 1; - dout(2) << "not running " << exclude << dendl; - } else - run_only = -1; - } - break; - - // HOW LONG? - - case SYNCLIENT_MODE_UNTIL: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (run_me()) { - if (iarg1) { - dout(2) << "until " << iarg1 << dendl; - utime_t dur(iarg1,0); - run_until = run_start + dur; - } else { - dout(2) << "until " << iarg1 << " (no limit)" << dendl; - run_until = utime_t(0,0); - } - } - } - break; - - - // ... 
- - case SYNCLIENT_MODE_FOO: - if (run_me()) { - foo(); - } - did_run_me(); - break; - - case SYNCLIENT_MODE_RANDOMSLEEP: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (run_me()) { - srand(time(0) + getpid() + client->whoami); - sleep(rand() % iarg1); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_SLEEP: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (run_me()) { - dout(2) << "sleep " << iarg1 << dendl; - sleep(iarg1); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_SLEEPUNTIL: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (iarg1 && run_me()) { - dout(2) << "sleepuntil " << iarg1 << dendl; - utime_t at = g_clock.now() - run_start; - if (at.sec() < iarg1) - sleep(iarg1 - at.sec()); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_RANDOMWALK: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (run_me()) { - dout(2) << "randomwalk " << iarg1 << dendl; - random_walk(iarg1); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_MAKEDIRMESS: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "makedirmess " << sarg1 << " " << iarg1 << dendl; - make_dir_mess(sarg1.c_str(), iarg1); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_MAKEDIRS: - { - string sarg1 = get_sarg(seq++); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - int iarg3 = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "makedirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << dendl; - make_dirs(sarg1.c_str(), iarg1, iarg2, iarg3); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_STATDIRS: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - int iarg3 = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "statdirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << dendl; - 
stat_dirs(sarg1.c_str(), iarg1, iarg2, iarg3); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_READDIRS: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - int iarg3 = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "readdirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << dendl; - read_dirs(sarg1.c_str(), iarg1, iarg2, iarg3); - } - did_run_me(); - } - break; - - - case SYNCLIENT_MODE_THRASHLINKS: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - int iarg3 = iargs.front(); iargs.pop_front(); - int iarg4 = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "thrashlinks " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << dendl; - thrash_links(sarg1.c_str(), iarg1, iarg2, iarg3, iarg4); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_LINKTEST: - { - if (run_me()) { - link_test(); - } - did_run_me(); - } - break; - - - case SYNCLIENT_MODE_MAKEFILES: - { - int num = iargs.front(); iargs.pop_front(); - int count = iargs.front(); iargs.pop_front(); - int priv = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "makefiles " << num << " " << count << " " << priv << dendl; - make_files(num, count, priv, false); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_MAKEFILES2: - { - int num = iargs.front(); iargs.pop_front(); - int count = iargs.front(); iargs.pop_front(); - int priv = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "makefiles2 " << num << " " << count << " " << priv << dendl; - make_files(num, count, priv, true); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_CREATESHARED: - { - string sarg1 = get_sarg(0); - int num = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "createshared " << num << dendl; - create_shared(num); - } - did_run_me(); - } - break; - case 
SYNCLIENT_MODE_OPENSHARED: - { - string sarg1 = get_sarg(0); - int num = iargs.front(); iargs.pop_front(); - int count = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "openshared " << num << dendl; - open_shared(num, count); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_CREATEOBJECTS: - { - int count = iargs.front(); iargs.pop_front(); - int size = iargs.front(); iargs.pop_front(); - int inflight = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "createobjects " << cout << " of " << size << " bytes" - << ", " << inflight << " in flight" << dendl; - create_objects(count, size, inflight); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_OBJECTRW: - { - int count = iargs.front(); iargs.pop_front(); - int size = iargs.front(); iargs.pop_front(); - int wrpc = iargs.front(); iargs.pop_front(); - int overlap = iargs.front(); iargs.pop_front(); - int rskew = iargs.front(); iargs.pop_front(); - int wskew = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "objectrw " << cout << " " << size << " " << wrpc - << " " << overlap << " " << rskew << " " << wskew << dendl; - object_rw(count, size, wrpc, overlap, rskew, wskew); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_FULLWALK: - { - string sarg1;// = get_sarg(0); - if (run_me()) { - dout(2) << "fullwalk" << sarg1 << dendl; - full_walk(sarg1); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_REPEATWALK: - { - string sarg1 = get_sarg(0); - if (run_me()) { - dout(2) << "repeatwalk " << sarg1 << dendl; - while (full_walk(sarg1) == 0) ; - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_WRITEFILE: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - dout(1) << "WRITING SYN CLIENT" << dendl; - if (run_me()) { - write_file(sarg1, iarg1, iarg2); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_WRSHARED: - { - string sarg1 = "shared"; - int iarg1 = iargs.front(); 
iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - if (run_me()) { - write_file(sarg1, iarg1, iarg2); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_READSHARED: - { - string sarg1 = "shared"; - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - if (run_me()) { - read_file(sarg1, iarg1, iarg2, true); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_WRITEBATCH: - { - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - int iarg3 = iargs.front(); iargs.pop_front(); - - if (run_me()) { - write_batch(iarg1, iarg2, iarg3); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_READFILE: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - - dout(1) << "READING SYN CLIENT" << dendl; - if (run_me()) { - read_file(sarg1, iarg1, iarg2); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_RDWRRANDOM: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - - dout(1) << "RANDOM READ WRITE SYN CLIENT" << dendl; - if (run_me()) { - read_random(sarg1, iarg1, iarg2); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_RDWRRANDOM_EX: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - - dout(1) << "RANDOM READ WRITE SYN CLIENT" << dendl; - if (run_me()) { - read_random_ex(sarg1, iarg1, iarg2); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_TRACE: - { - string tfile = get_sarg(0); - sargs.push_front(string("~")); - int iarg1 = iargs.front(); iargs.pop_front(); - int playdata = iargs.front(); iargs.pop_front(); - string prefix = get_sarg(0); - char realtfile[100]; - sprintf(realtfile, tfile.c_str(), client->get_nodeid()); - - if (run_me()) { - dout(-2) << "trace " << tfile << " prefix=" << prefix << " count=" << 
iarg1 << " data=" << playdata << dendl; - - Trace t(realtfile); - - if (iarg1 == 0) iarg1 = 1; // play trace at least once! - - for (int i=0; i 1) clean_dir(prefix); // clean only if repeat - - utime_t lat = g_clock.now(); - lat -= start; - - dout(0) << " trace " << tfile << " loop " << (i+1) << "/" << iarg1 << " done in " << (double)lat << " seconds" << dendl; - if (client_logger - && i > 0 - && i < iarg1-1 - ) { - client_logger->finc("trsum", (double)lat); - client_logger->inc("trnum"); - } - } - dout(1) << "done " << dendl; - } - did_run_me(); - } - break; - - - case SYNCLIENT_MODE_OPENTEST: - { - int count = iargs.front(); iargs.pop_front(); - if (run_me()) { - for (int i=0; iopen("test", rand()%2 ? (O_WRONLY|O_CREAT):O_RDONLY); - if (fd > 0) client->close(fd); - } - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_OPTEST: - { - int count = iargs.front(); iargs.pop_front(); - if (run_me()) { - client->mknod("test", 0777); - struct stat st; - for (int i=0; ilstat("test", &st); - client->chmod("test", 0777); - } - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_TRUNCATE: - { - string file = get_sarg(0); - sargs.push_front(file); - int iarg1 = iargs.front(); iargs.pop_front(); - if (run_me()) { - client->truncate(file.c_str(), iarg1); - } - did_run_me(); - } - break; - - - case SYNCLIENT_MODE_IMPORTFIND: - { - string base = get_sarg(0); - string find = get_sarg(0); - int data = get_iarg(); - if (run_me()) { - import_find(base.c_str(), find.c_str(), data); - } - did_run_me(); - } - break; - - default: - assert(0); - } - } - dout(1) << "syn done, unmounting " << dendl; - - client->unmount(); - client->shutdown(); - return 0; -} - - -int SyntheticClient::start_thread() -{ - assert(!thread_id); - - pthread_create(&thread_id, NULL, synthetic_client_thread_entry, this); - assert(thread_id); - return 0; -} - -int SyntheticClient::join_thread() -{ - assert(thread_id); - void *rv; - pthread_join(thread_id, &rv); - return 0; -} - - -bool roll_die(float p) -{ - 
float r = (float)(rand() % 100000) / 100000.0; - if (r < p) - return true; - else - return false; -} - -void SyntheticClient::init_op_dist() -{ - op_dist.clear(); - op_dist.add( MDS_OP_STAT, g_conf.fakeclient_op_stat ); - op_dist.add( MDS_OP_UTIME, g_conf.fakeclient_op_utime ); - op_dist.add( MDS_OP_CHMOD, g_conf.fakeclient_op_chmod ); - op_dist.add( MDS_OP_CHOWN, g_conf.fakeclient_op_chown ); - - op_dist.add( MDS_OP_READDIR, g_conf.fakeclient_op_readdir ); - op_dist.add( MDS_OP_MKNOD, g_conf.fakeclient_op_mknod ); - op_dist.add( MDS_OP_LINK, g_conf.fakeclient_op_link ); - op_dist.add( MDS_OP_UNLINK, g_conf.fakeclient_op_unlink ); - op_dist.add( MDS_OP_RENAME, g_conf.fakeclient_op_rename ); - - op_dist.add( MDS_OP_MKDIR, g_conf.fakeclient_op_mkdir ); - op_dist.add( MDS_OP_RMDIR, g_conf.fakeclient_op_rmdir ); - op_dist.add( MDS_OP_SYMLINK, g_conf.fakeclient_op_symlink ); - - op_dist.add( MDS_OP_OPEN, g_conf.fakeclient_op_openrd ); - //op_dist.add( MDS_OP_READ, g_conf.fakeclient_op_read ); - //op_dist.add( MDS_OP_WRITE, g_conf.fakeclient_op_write ); - op_dist.add( MDS_OP_TRUNCATE, g_conf.fakeclient_op_truncate ); - op_dist.add( MDS_OP_FSYNC, g_conf.fakeclient_op_fsync ); - op_dist.add( MDS_OP_RELEASE, g_conf.fakeclient_op_close ); // actually, close() - op_dist.normalize(); -} - -void SyntheticClient::up() -{ - cwd = cwd.prefixpath(cwd.depth()-1); - dout(DBL) << "cd .. -> " << cwd << dendl; - clear_dir(); -} - - -int SyntheticClient::play_trace(Trace& t, string& prefix, bool metadata_only) -{ - dout(4) << "play trace prefix '" << prefix << "'" << dendl; - t.start(); - - char buf[1024]; - char buf2[1024]; - - utime_t start = g_clock.now(); - - hash_map open_files; - hash_map open_dirs; - - hash_map ll_files; - hash_map ll_dirs; - hash_map ll_inos; - - ll_inos[1] = 1; // root inode is known. - - // prefix? 
- const char *p = prefix.c_str(); - if (prefix.length()) { - client->mkdir(prefix.c_str(), 0755); - struct stat attr; - if (client->ll_lookup(1, prefix.c_str(), &attr) == 0) { - ll_inos[1] = attr.st_ino; - dout(5) << "'root' ino is " << inodeno_t(attr.st_ino) << dendl; - } else { - dout(0) << "warning: play_trace coudln't lookup up my per-client directory" << dendl; - } - } - - - utime_t last_status = start; - - int n = 0; - - // for object traces - Mutex &lock = client->client_lock; - Cond cond; - bool ack; - bool safe; - C_Gather *safeg = new C_Gather(new C_SafeCond(&lock, &cond, &safe)); - Context *safegref = safeg->new_sub(); // take a ref - - while (!t.end()) { - - if (++n == 100) { - n = 00; - utime_t now = last_status; - if (now - last_status > 1.0) { - last_status = now; - dout(1) << "play_trace at line " << t.get_line() << dendl; - } - } - - if (time_to_stop()) break; - - // op - const char *op = t.get_string(buf, 0); - dout(4) << (t.get_line()-1) << ": trace op " << op << dendl; - - if (op[0] == '@') { - // timestamp... ignore it! 
- t.get_int(); // sec - t.get_int(); // usec - op = t.get_string(buf, 0); - } - - // high level ops --------------------- - if (strcmp(op, "link") == 0) { - const char *a = t.get_string(buf, p); - const char *b = t.get_string(buf2, p); - client->link(a,b); - } else if (strcmp(op, "unlink") == 0) { - const char *a = t.get_string(buf, p); - client->unlink(a); - } else if (strcmp(op, "rename") == 0) { - const char *a = t.get_string(buf, p); - const char *b = t.get_string(buf2, p); - client->rename(a,b); - } else if (strcmp(op, "mkdir") == 0) { - const char *a = t.get_string(buf, p); - int64_t b = t.get_int(); - client->mkdir(a, b); - } else if (strcmp(op, "rmdir") == 0) { - const char *a = t.get_string(buf, p); - client->rmdir(a); - } else if (strcmp(op, "symlink") == 0) { - const char *a = t.get_string(buf, p); - const char *b = t.get_string(buf2, p); - client->symlink(a,b); - } else if (strcmp(op, "readlink") == 0) { - const char *a = t.get_string(buf, p); - char buf[100]; - client->readlink(a, buf, 100); - } else if (strcmp(op, "lstat") == 0) { - struct stat st; - const char *a = t.get_string(buf, p); - if (strcmp(a, p) != 0 && - strcmp(a, "/") != 0 && - strcmp(a, "/lib") != 0 && // or /lib.. that would be a lookup. hack. 
- a[0] != 0) // stop stating the root directory already - client->lstat(a, &st); - } else if (strcmp(op, "chmod") == 0) { - const char *a = t.get_string(buf, p); - int64_t b = t.get_int(); - client->chmod(a, b); - } else if (strcmp(op, "chown") == 0) { - const char *a = t.get_string(buf, p); - int64_t b = t.get_int(); - int64_t c = t.get_int(); - client->chown(a, b, c); - } else if (strcmp(op, "utime") == 0) { - const char *a = t.get_string(buf, p); - int64_t b = t.get_int(); - int64_t c = t.get_int(); - struct utimbuf u; - u.actime = b; - u.modtime = c; - client->utime(a, &u); - } else if (strcmp(op, "mknod") == 0) { - const char *a = t.get_string(buf, p); - int64_t b = t.get_int(); - int64_t c = t.get_int(); - client->mknod(a, b, c); - } else if (strcmp(op, "oldmknod") == 0) { - const char *a = t.get_string(buf, p); - int64_t b = t.get_int(); - client->mknod(a, b, 0); - } else if (strcmp(op, "getdir") == 0) { - const char *a = t.get_string(buf, p); - list contents; - client->getdir(a, contents); - } else if (strcmp(op, "getdir") == 0) { - const char *a = t.get_string(buf, p); - list contents; - client->getdir(a, contents); - } else if (strcmp(op, "opendir") == 0) { - const char *a = t.get_string(buf, p); - int64_t b = t.get_int(); - DIR *dirp; - client->opendir(a, &dirp); - if (dirp) open_dirs[b] = dirp; - } else if (strcmp(op, "closedir") == 0) { - int64_t a = t.get_int(); - client->closedir(open_dirs[a]); - open_dirs.erase(a); - } else if (strcmp(op, "open") == 0) { - const char *a = t.get_string(buf, p); - int64_t b = t.get_int(); - int64_t c = t.get_int(); - int64_t d = t.get_int(); - int64_t fd = client->open(a, b, c); - if (fd > 0) open_files[d] = fd; - } else if (strcmp(op, "oldopen") == 0) { - const char *a = t.get_string(buf, p); - int64_t b = t.get_int(); - int64_t d = t.get_int(); - int64_t fd = client->open(a, b, 0755); - if (fd > 0) open_files[d] = fd; - } else if (strcmp(op, "close") == 0) { - int64_t id = t.get_int(); - int64_t fh = open_files[id]; 
- if (fh > 0) client->close(fh); - open_files.erase(id); - } else if (strcmp(op, "lseek") == 0) { - int64_t f = t.get_int(); - int fd = open_files[f]; - int64_t off = t.get_int(); - int64_t whence = t.get_int(); - client->lseek(fd, off, whence); - } else if (strcmp(op, "read") == 0) { - int64_t f = t.get_int(); - int64_t size = t.get_int(); - int64_t off = t.get_int(); - int64_t fd = open_files[f]; - if (!metadata_only) { - char *b = new char[size]; - client->read(fd, b, size, off); - delete[] b; - } - } else if (strcmp(op, "write") == 0) { - int64_t f = t.get_int(); - int64_t fd = open_files[f]; - int64_t size = t.get_int(); - int64_t off = t.get_int(); - if (!metadata_only) { - char *b = new char[size]; - memset(b, 1, size); // let's write 1's! - client->write(fd, b, size, off); - delete[] b; - } else { - client->write(fd, NULL, 0, size+off); - } - } else if (strcmp(op, "truncate") == 0) { - const char *a = t.get_string(buf, p); - int64_t l = t.get_int(); - client->truncate(a, l); - } else if (strcmp(op, "ftruncate") == 0) { - int64_t f = t.get_int(); - int fd = open_files[f]; - int64_t l = t.get_int(); - client->ftruncate(fd, l); - } else if (strcmp(op, "fsync") == 0) { - int64_t f = t.get_int(); - int64_t b = t.get_int(); - int fd = open_files[f]; - client->fsync(fd, b); - } else if (strcmp(op, "chdir") == 0) { - const char *a = t.get_string(buf, p); - client->chdir(a); - } else if (strcmp(op, "statfs") == 0) { - struct statvfs stbuf; - client->statfs("/", &stbuf); - } - - // low level ops --------------------- - else if (strcmp(op, "ll_lookup") == 0) { - int64_t i = t.get_int(); - const char *name = t.get_string(buf, p); - int64_t r = t.get_int(); - struct stat attr; - if (ll_inos.count(i) && - client->ll_lookup(ll_inos[i], name, &attr) == 0) - ll_inos[r] = attr.st_ino; - } else if (strcmp(op, "ll_forget") == 0) { - int64_t i = t.get_int(); - int64_t n = t.get_int(); - if (ll_inos.count(i) && - client->ll_forget(ll_inos[i], n)) - ll_inos.erase(i); - } else if 
(strcmp(op, "ll_getattr") == 0) { - int64_t i = t.get_int(); - struct stat attr; - if (ll_inos.count(i)) - client->ll_getattr(ll_inos[i], &attr); - } else if (strcmp(op, "ll_setattr") == 0) { - int64_t i = t.get_int(); - struct stat attr; - memset(&attr, 0, sizeof(attr)); - attr.st_mode = t.get_int(); - attr.st_uid = t.get_int(); - attr.st_gid = t.get_int(); - attr.st_size = t.get_int(); - attr.st_mtime = t.get_int(); - attr.st_atime = t.get_int(); - int mask = t.get_int(); - if (ll_inos.count(i)) - client->ll_setattr(ll_inos[i], &attr, mask); - } else if (strcmp(op, "ll_readlink") == 0) { - int64_t i = t.get_int(); - const char *value; - if (ll_inos.count(i)) - client->ll_readlink(ll_inos[i], &value); - } else if (strcmp(op, "ll_mknod") == 0) { - int64_t i = t.get_int(); - const char *n = t.get_string(buf, p); - int m = t.get_int(); - int r = t.get_int(); - int64_t ri = t.get_int(); - struct stat attr; - if (ll_inos.count(i) && - client->ll_mknod(ll_inos[i], n, m, r, &attr) == 0) - ll_inos[ri] = attr.st_ino; - } else if (strcmp(op, "ll_mkdir") == 0) { - int64_t i = t.get_int(); - const char *n = t.get_string(buf, p); - int m = t.get_int(); - int64_t ri = t.get_int(); - struct stat attr; - if (ll_inos.count(i) && - client->ll_mkdir(ll_inos[i], n, m, &attr) == 0) - ll_inos[ri] = attr.st_ino; - } else if (strcmp(op, "ll_symlink") == 0) { - int64_t i = t.get_int(); - const char *n = t.get_string(buf, p); - const char *v = t.get_string(buf2, p); - int64_t ri = t.get_int(); - struct stat attr; - if (ll_inos.count(i) && - client->ll_symlink(ll_inos[i], n, v, &attr) == 0) - ll_inos[ri] = attr.st_ino; - } else if (strcmp(op, "ll_unlink") == 0) { - int64_t i = t.get_int(); - const char *n = t.get_string(buf, p); - if (ll_inos.count(i)) - client->ll_unlink(ll_inos[i], n); - } else if (strcmp(op, "ll_rmdir") == 0) { - int64_t i = t.get_int(); - const char *n = t.get_string(buf, p); - if (ll_inos.count(i)) - client->ll_rmdir(ll_inos[i], n); - } else if (strcmp(op, "ll_rename") 
== 0) { - int64_t i = t.get_int(); - const char *n = t.get_string(buf, p); - int64_t ni = t.get_int(); - const char *nn = t.get_string(buf2, p); - if (ll_inos.count(i) && - ll_inos.count(ni)) - client->ll_rename(ll_inos[i], n, ll_inos[ni], nn); - } else if (strcmp(op, "ll_link") == 0) { - int64_t i = t.get_int(); - int64_t ni = t.get_int(); - const char *nn = t.get_string(buf, p); - struct stat attr; - if (ll_inos.count(i) && - ll_inos.count(ni)) - client->ll_link(ll_inos[i], ll_inos[ni], nn, &attr); - } else if (strcmp(op, "ll_opendir") == 0) { - int64_t i = t.get_int(); - int64_t r = t.get_int(); - void *dirp; - if (ll_inos.count(i) && - client->ll_opendir(ll_inos[i], &dirp) == 0) - ll_dirs[r] = dirp; - } else if (strcmp(op, "ll_releasedir") == 0) { - int64_t f = t.get_int(); - if (ll_dirs.count(f)) { - client->ll_releasedir(ll_dirs[f]); - ll_dirs.erase(f); - } - } else if (strcmp(op, "ll_open") == 0) { - int64_t i = t.get_int(); - int64_t f = t.get_int(); - int64_t r = t.get_int(); - Fh *fhp; - if (ll_inos.count(i) && - client->ll_open(ll_inos[i], f, &fhp) == 0) - ll_files[r] = fhp; - } else if (strcmp(op, "ll_create") == 0) { - int64_t i = t.get_int(); - const char *n = t.get_string(buf, p); - int64_t m = t.get_int(); - int64_t f = t.get_int(); - int64_t r = t.get_int(); - int64_t ri = t.get_int(); - Fh *fhp; - struct stat attr; - if (ll_inos.count(i) && - client->ll_create(ll_inos[i], n, m, f, &attr, &fhp) == 0) { - ll_inos[ri] = attr.st_ino; - ll_files[r] = fhp; - } - } else if (strcmp(op, "ll_read") == 0) { - int64_t f = t.get_int(); - int64_t off = t.get_int(); - int64_t size = t.get_int(); - if (ll_files.count(f) && - !metadata_only) { - bufferlist bl; - client->ll_read(ll_files[f], off, size, &bl); - } - } else if (strcmp(op, "ll_write") == 0) { - int64_t f = t.get_int(); - int64_t off = t.get_int(); - int64_t size = t.get_int(); - if (ll_files.count(f)) { - if (!metadata_only) { - bufferlist bl; - bufferptr bp(size); - bl.push_back(bp); - bp.zero(); - 
client->ll_write(ll_files[f], off, size, bl.c_str()); - } else { - client->ll_write(ll_files[f], off+size, 0, NULL); - } - } - } else if (strcmp(op, "ll_flush") == 0) { - int64_t f = t.get_int(); - if (!metadata_only && - ll_files.count(f)) - client->ll_flush(ll_files[f]); - } else if (strcmp(op, "ll_fsync") == 0) { - int64_t f = t.get_int(); - if (!metadata_only && - ll_files.count(f)) - client->ll_fsync(ll_files[f], false); // FIXME dataonly param - } else if (strcmp(op, "ll_release") == 0) { - int64_t f = t.get_int(); - if (ll_files.count(f)) { - client->ll_release(ll_files[f]); - ll_files.erase(f); - } - } else if (strcmp(op, "ll_statfs") == 0) { - int64_t i = t.get_int(); - if (ll_inos.count(i)) - {} //client->ll_statfs(ll_inos[i]); - } - - - // object-level traces - - else if (strcmp(op, "o_stat") == 0) { - int64_t oh = t.get_int(); - int64_t ol = t.get_int(); - object_t oid(oh, ol); - lock.Lock(); - ObjectLayout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, 2); - off_t size; - client->objecter->stat(oid, &size, layout, new C_SafeCond(&lock, &cond, &ack)); - while (!ack) cond.Wait(lock); - lock.Unlock(); - } - else if (strcmp(op, "o_read") == 0) { - int64_t oh = t.get_int(); - int64_t ol = t.get_int(); - int64_t off = t.get_int(); - int64_t len = t.get_int(); - object_t oid(oh, ol); - lock.Lock(); - ObjectLayout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, 2); - bufferlist bl; - client->objecter->read(oid, off, len, layout, &bl, new C_SafeCond(&lock, &cond, &ack)); - while (!ack) cond.Wait(lock); - lock.Unlock(); - } - else if (strcmp(op, "o_write") == 0) { - int64_t oh = t.get_int(); - int64_t ol = t.get_int(); - int64_t off = t.get_int(); - int64_t len = t.get_int(); - object_t oid(oh, ol); - lock.Lock(); - ObjectLayout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, 2); - bufferptr bp(len); - bufferlist bl; - bl.push_back(bp); - client->objecter->write(oid, off, len, layout, bl, - new 
C_SafeCond(&lock, &cond, &ack), - safeg->new_sub()); - while (!ack) cond.Wait(lock); - lock.Unlock(); - } - else if (strcmp(op, "o_zero") == 0) { - int64_t oh = t.get_int(); - int64_t ol = t.get_int(); - int64_t off = t.get_int(); - int64_t len = t.get_int(); - object_t oid(oh, ol); - lock.Lock(); - ObjectLayout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, 2); - client->objecter->zero(oid, off, len, layout, - new C_SafeCond(&lock, &cond, &ack), - safeg->new_sub()); - while (!ack) cond.Wait(lock); - lock.Unlock(); - } - - - else { - dout(0) << (t.get_line()-1) << ": *** trace hit unrecognized symbol '" << op << "' " << dendl; - assert(0); - } - } - - dout(10) << "trace finished on line " << t.get_line() << dendl; - - // wait for safe after an object trace - safegref->finish(0); - delete safegref; - lock.Lock(); - while (!safe) { - dout(10) << "waiting for safe" << dendl; - cond.Wait(lock); - } - lock.Unlock(); - - // close open files - for (hash_map::iterator fi = open_files.begin(); - fi != open_files.end(); - fi++) { - dout(1) << "leftover close " << fi->second << dendl; - if (fi->second > 0) client->close(fi->second); - } - for (hash_map::iterator fi = open_dirs.begin(); - fi != open_dirs.end(); - fi++) { - dout(1) << "leftover closedir " << fi->second << dendl; - if (fi->second != 0) client->closedir(fi->second); - } - for (hash_map::iterator fi = ll_files.begin(); - fi != ll_files.end(); - fi++) { - dout(1) << "leftover ll_release " << fi->second << dendl; - if (fi->second > 0) client->ll_release(fi->second); - } - for (hash_map::iterator fi = ll_dirs.begin(); - fi != ll_dirs.end(); - fi++) { - dout(1) << "leftover ll_releasedir " << fi->second << dendl; - if (fi->second > 0) client->ll_releasedir(fi->second); - } - - return 0; -} - - - -int SyntheticClient::clean_dir(string& basedir) -{ - // read dir - list contents; - int r = client->getdir(basedir.c_str(), contents); - if (r < 0) { - dout(1) << "readdir on " << basedir << " returns " << r 
<< dendl; - return r; - } - - for (list::iterator it = contents.begin(); - it != contents.end(); - it++) { - if (*it == ".") continue; - if (*it == "..") continue; - string file = basedir + "/" + *it; - - if (time_to_stop()) break; - - struct stat st; - int r = client->lstat(file.c_str(), &st); - if (r < 0) { - dout(1) << "stat error on " << file << " r=" << r << dendl; - continue; - } - - if ((st.st_mode & INODE_TYPE_MASK) == INODE_MODE_DIR) { - clean_dir(file); - client->rmdir(file.c_str()); - } else { - client->unlink(file.c_str()); - } - } - - return 0; - -} - - -int SyntheticClient::full_walk(string& basedir) -{ - if (time_to_stop()) return -1; - - list dirq; - dirq.push_back(basedir); - - while (!dirq.empty()) { - string dir = dirq.front(); - dirq.pop_front(); - - // read dir - list contents; - int r = client->getdir(dir.c_str(), contents); - if (r < 0) { - dout(1) << "readdir on " << dir << " returns " << r << dendl; - continue; - } - - for (list::iterator it = contents.begin(); - it != contents.end(); - it++) { - if (*it == "." || - *it == "..") - continue; - string file = dir + "/" + *it; - - struct stat st; - int r = client->lstat(file.c_str(), &st); - if (r < 0) { - dout(1) << "stat error on " << file << " r=" << r << dendl; - continue; - } - - // print - char *tm = ctime(&st.st_mtime); - tm[strlen(tm)-1] = 0; - printf("%llx %c%c%c%c%c%c%c%c%c%c %2d %5d %5d %8d %12s %s\n", - (long long)st.st_ino, - S_ISDIR(st.st_mode) ? 'd':'-', - (st.st_mode & 0400) ? 'r':'-', - (st.st_mode & 0200) ? 'w':'-', - (st.st_mode & 0100) ? 'x':'-', - (st.st_mode & 040) ? 'r':'-', - (st.st_mode & 020) ? 'w':'-', - (st.st_mode & 010) ? 'x':'-', - (st.st_mode & 04) ? 'r':'-', - (st.st_mode & 02) ? 'w':'-', - (st.st_mode & 01) ? 
'x':'-', - (int)st.st_nlink, - st.st_uid, st.st_gid, - (int)st.st_size, - tm, - file.c_str()); - - - if ((st.st_mode & INODE_TYPE_MASK) == INODE_MODE_DIR) { - dirq.push_back(file); - } - } - } - - return 0; -} - -int SyntheticClient::make_dirs(const char *basedir, int dirs, int files, int depth) -{ - if (time_to_stop()) return 0; - - // make sure base dir exists - int r = client->mkdir(basedir, 0755); - if (r != 0) { - dout(1) << "can't make base dir? " << basedir << dendl; - return -1; - } - - // children - char d[500]; - dout(3) << "make_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << dendl; - for (int i=0; imknod(d, 0644); - } - - if (depth == 0) return 0; - - for (int i=0; ilstat(basedir, &st); - if (r != 0) { - dout(1) << "can't make base dir? " << basedir << dendl; - return -1; - } - - // children - char d[500]; - dout(3) << "stat_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << dendl; - for (int i=0; ilstat(d, &st); - } - - if (depth == 0) return 0; - - for (int i=0; i contents; - utime_t s = g_clock.now(); - int r = client->getdir(basedir, contents); - utime_t e = g_clock.now(); - e -= s; - if (client_logger) client_logger->finc("readdir", e); - if (r < 0) { - dout(0) << "read_dirs couldn't readdir " << basedir << ", stopping" << dendl; - return -1; - } - - for (int i=0; ilstat(d, &st) < 0) { - dout(2) << "read_dirs failed stat on " << d << ", stopping" << dendl; - return -1; - } - utime_t e = g_clock.now(); - e -= s; - if (client_logger) client_logger->finc("stat", e); - } - - if (depth > 0) - for (int i=0; iget_nodeid(); - char d[255]; - - if (priv) { - for (int c=0; cmkdir(d, 0755); - } - } else { - // shared - if (true || whoami == 0) { - for (int c=0; cmkdir(d, 0755); - } - } else { - sleep(2); - } - } - - // files - struct stat st; - utime_t start = g_clock.now(); - for (int c=0; cmknod(d, 0644); - - if (more) { - client->lstat(d, &st); - int fd = client->open(d, O_RDONLY); - 
client->unlink(d); - client->close(fd); - } - - if (time_to_stop()) return 0; - } - } - utime_t end = g_clock.now(); - end -= start; - dout(0) << "makefiles time is " << end << " or " << ((double)end / (double)num) <<" per file" << dendl; - - return 0; -} - -int SyntheticClient::link_test() -{ - char d[255]; - char e[255]; - - // create files - int num = 200; - - client->mkdir("orig", 0755); - client->mkdir("copy", 0755); - - utime_t start = g_clock.now(); - for (int i=0; imknod(d, 0755); - } - utime_t end = g_clock.now(); - end -= start; - - dout(0) << "orig " << end << dendl; - - // link - start = g_clock.now(); - for (int i=0; ilink(d, e); - } - end = g_clock.now(); - end -= start; - dout(0) << "copy " << end << dendl; - - return 0; -} - - -int SyntheticClient::create_shared(int num) -{ - // files - char d[255]; - for (int n=0; nmknod(d, 0644); - } - - return 0; -} - -int SyntheticClient::open_shared(int num, int count) -{ - // files - char d[255]; - for (int c=0; c fds; - for (int n=0; nopen(d,O_RDONLY); - fds.push_back(fd); - } - - while (!fds.empty()) { - int fd = fds.front(); - fds.pop_front(); - client->close(fd); - } - } - - return 0; -} - - -int SyntheticClient::write_file(string& fn, int size, int wrsize) // size is in MB, wrsize in bytes -{ - //uint64_t wrsize = 1024*256; - char *buf = new char[wrsize+100]; // 1 MB - memset(buf, 7, wrsize); - uint64_t chunks = (uint64_t)size * (uint64_t)(1024*1024) / (uint64_t)wrsize; - - int fd = client->open(fn.c_str(), O_RDWR|O_CREAT); - dout(5) << "writing to " << fn << " fd " << fd << dendl; - if (fd < 0) return fd; - - for (unsigned i=0; iget_nodeid(); - p++; - } - - client->write(fd, buf, wrsize, i*wrsize); - } - - client->close(fd); - delete[] buf; - - return 0; -} - -int SyntheticClient::write_batch(int nfile, int size, int wrsize) -{ - for (int i=0; iopen(fn.c_str(), O_RDONLY); - dout(5) << "reading from " << fn << " fd " << fd << dendl; - if (fd < 0) return fd; - - for (unsigned i=0; iread(fd, buf, rdsize, 
i*rdsize); - if (r < rdsize) { - dout(1) << "read_file got r = " << r << ", probably end of file" << dendl; - break; - } - - // verify fingerprint - int bad = 0; - uint64_t *p = (uint64_t*)buf; - uint64_t readoff; - int64_t readclient; - while ((char*)p + 32 < buf + rdsize) { - readoff = *p; - uint64_t wantoff = (uint64_t)i*(uint64_t)rdsize + (uint64_t)((char*)p - buf); - p++; - readclient = *p; - p++; - if (readoff != wantoff || - readclient != client->get_nodeid()) { - if (!bad && !ignoreprint) - dout(0) << "WARNING: wrong data from OSD, block says fileoffset=" << readoff << " client=" << readclient - << ", should be offset " << wantoff << " clietn " << client->get_nodeid() - << dendl; - bad++; - } - } - if (bad && !ignoreprint) - dout(0) << " + " << (bad-1) << " other bad 16-byte bits in this block" << dendl; - } - - client->close(fd); - delete[] buf; - - return 0; -} - - - - -class C_Ref : public Context { - Mutex& lock; - Cond& cond; - int *ref; -public: - C_Ref(Mutex &l, Cond &c, int *r) : lock(l), cond(c), ref(r) { - lock.Lock(); - (*ref)++; - lock.Unlock(); - } - void finish(int) { - lock.Lock(); - (*ref)--; - cond.Signal(); - lock.Unlock(); - } -}; - -int SyntheticClient::create_objects(int nobj, int osize, int inflight) -{ - // divy up - int numc = g_conf.num_client ? g_conf.num_client : 1; - - int start, inc, end; - - if (1) { - // strided - start = client->get_nodeid(); //nobjs % numc; - inc = numc; - end = start + nobj; - } else { - // segments - start = nobj * client->get_nodeid() / numc; - inc = 1; - end = nobj * (client->get_nodeid()+1) / numc; - } - - dout(5) << "create_objects " << nobj << " size=" << osize - << " .. 
doing [" << start << "," << end << ") inc " << inc - << dendl; - - bufferptr bp(osize); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - - Mutex lock; - Cond cond; - - int unack = 0; - int unsafe = 0; - - list starts; - - for (int i=start; iosdmap->make_object_layout(oid, pg_t::TYPE_REP, g_OSD_FileLayout.fl_pg_size); - - if (i % inflight == 0) { - dout(6) << "create_objects " << i << "/" << (nobj+1) << dendl; - } - dout(10) << "writing " << oid << dendl; - - starts.push_back(g_clock.now()); - client->client_lock.Lock(); - client->objecter->write(oid, 0, osize, layout, bl, - new C_Ref(lock, cond, &unack), - new C_Ref(lock, cond, &unsafe)); - client->client_lock.Unlock(); - - lock.Lock(); - while (unack > inflight) { - dout(20) << "waiting for " << unack << " unack" << dendl; - cond.Wait(lock); - } - lock.Unlock(); - - utime_t lat = g_clock.now(); - lat -= starts.front(); - starts.pop_front(); - if (client_logger) - client_logger->favg("owrlat", lat); - } - - lock.Lock(); - while (unack > 0) { - dout(20) << "waiting for " << unack << " unack" << dendl; - cond.Wait(lock); - } - while (unsafe > 0) { - dout(10) << "waiting for " << unsafe << " unsafe" << dendl; - cond.Wait(lock); - } - lock.Unlock(); - - dout(5) << "create_objects done" << dendl; - derr(0) << "create_objects done" << dendl; - return 0; -} - -int SyntheticClient::object_rw(int nobj, int osize, int wrpc, - int overlappc, - double rskew, double wskew) -{ - dout(5) << "object_rw " << nobj << " size=" << osize << " with " - << wrpc << "% writes" - << ", " << overlappc << "% overlap" - << ", rskew = " << rskew - << ", wskew = " << wskew - << dendl; - - bufferptr bp(osize); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - - // start with odd number > nobj - rjhash h; - unsigned prime = nobj + 1; // this is the minimum! 
- prime += h(nobj) % (3*nobj); // bump it up some - prime |= 1; // make it odd - - while (true) { - unsigned j; - for (j=2; j*j<=prime; j++) - if (prime % j == 0) break; - if (j*j > prime) { - break; - //cout << "prime " << prime << endl; - } - prime += 2; - } - - Mutex lock; - Cond cond; - - int unack = 0; - int unsafe = 0; - - while (1) { - if (time_to_stop()) break; - - // read or write? - bool write = (rand() % 100) < wrpc; - - // choose object - double r = drand48(); // [0..1) - long o; - if (write) { - o = (long)trunc(pow(r, wskew) * (double)nobj); // exponentially skew towards 0 - int pnoremap = (long)(r * 100.0); - if (pnoremap >= overlappc) - o = (o*prime) % nobj; // remap - } else { - o = (long)trunc(pow(r, rskew) * (double)nobj); // exponentially skew towards 0 - } - object_t oid(0x1000, o); - - ObjectLayout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, g_OSD_FileLayout.fl_pg_size); - - client->client_lock.Lock(); - utime_t start = g_clock.now(); - if (write) { - dout(10) << "write to " << oid << dendl; - client->objecter->write(oid, 0, osize, layout, bl, - new C_Ref(lock, cond, &unack), - new C_Ref(lock, cond, &unsafe)); - } else { - dout(10) << "read from " << oid << dendl; - bufferlist inbl; - client->objecter->read(oid, 0, osize, layout, &inbl, - new C_Ref(lock, cond, &unack)); - } - client->client_lock.Unlock(); - - lock.Lock(); - while (unack > 0) { - dout(20) << "waiting for " << unack << " unack" << dendl; - cond.Wait(lock); - } - lock.Unlock(); - - utime_t lat = g_clock.now(); - lat -= start; - if (client_logger) { - if (write) - client_logger->favg("owrlat", lat); - else - client_logger->favg("ordlat", lat); - } - } - - - lock.Lock(); - while (unsafe > 0) { - dout(10) << "waiting for " << unsafe << " unsafe" << dendl; - cond.Wait(lock); - } - lock.Unlock(); - return 0; -} - - - - - -int SyntheticClient::read_random(string& fn, int size, int rdsize) // size is in MB, wrsize in bytes -{ - __uint64_t chunks = (__uint64_t)size * 
(__uint64_t)(1024*1024) / (__uint64_t)rdsize; - - int fd = client->open(fn.c_str(), O_RDWR); - dout(5) << "reading from " << fn << " fd " << fd << dendl; - - // dout(0) << "READING FROM " << fn << " fd " << fd << dendl; - - // dout(0) << "filename " << fn << " size:" << size << " read size|" << rdsize << "|" << "\ chunks: |" << chunks <<"|" << dendl; - - if (fd < 0) return fd; - int offset = 0; - char * buf = NULL; - - for (unsigned i=0; i<2000; i++) { - if (time_to_stop()) break; - - bool read=false; - - time_t seconds; - time( &seconds); - srand(seconds); - - // use rand instead ?? - double x = drand48(); - - //dout(0) << "RANDOM NUMBER RETURN |" << x << "|" << dendl; - - if ( x < 0.5) - { - //dout(0) << "DECIDED TO READ " << x << dendl; - buf = new char[rdsize]; - memset(buf, 1, rdsize); - read=true; - } - else - { - // dout(0) << "DECIDED TO WRITE " << x << dendl; - buf = new char[rdsize+100]; // 1 MB - memset(buf, 7, rdsize); - } - - //double y = drand48() ; - - //dout(0) << "OFFSET is |" << offset << "| chunks |" << chunks<< dendl; - - if ( read) - { - offset=(rand())%(chunks+1); - dout(2) << "reading block " << offset << "/" << chunks << dendl; - - int r = client->read(fd, buf, rdsize, - offset*rdsize); - if (r < rdsize) { - dout(1) << "read_file got r = " << r << ", probably end of file" << dendl; - } - } - else - { - dout(2) << "writing block " << offset << "/" << chunks << dendl; - - // fill buf with a 16 byte fingerprint - // 64 bits : file offset - // 64 bits : client id - // = 128 bits (16 bytes) - - //if (true ) - //{ - //int count = rand()%10; - - //for ( int j=0;jget_nodeid(); - p++; - } - - client->write(fd, buf, rdsize, - offset*rdsize); - //} - //} - } - - // verify fingerprint - if ( read ) - { - int bad = 0; - __int64_t *p = (__int64_t*)buf; - __int64_t readoff, readclient; - while ((char*)p + 32 < buf + rdsize) { - readoff = *p; - __int64_t wantoff = offset*rdsize + (__int64_t)((char*)p - buf); - p++; - readclient = *p; - p++; - if (readoff != 
wantoff || - readclient != client->get_nodeid()) { - if (!bad) - dout(0) << "WARNING: wrong data from OSD, block says fileoffset=" << readoff << " client=" << readclient - << ", should be offset " << wantoff << " clietn " << client->get_nodeid() - << dendl; - bad++; - } - } - if (bad) - dout(0) << " + " << (bad-1) << " other bad 16-byte bits in this block" << dendl; - } - } - - client->close(fd); - delete[] buf; - - return 0; -} - - -//#include -//#include - -int normdist(int min, int max, int stdev) /* specifies input values */; -//main() -//{ - // for ( int i=0; i < 10; i++ ) - // normdist ( 0 , 10, 1 ); - -//} - - -int normdist(int min, int max, int stdev) /* specifies input values */ -{ - /* min: Minimum value; max: Maximum value; stdev: degree of deviation */ - - //int min, max, stdev; { - time_t seconds; - time( &seconds); - srand(seconds); - - int range, iterate, result; - /* declare range, iterate and result as integers, to avoid the need for - floating point math*/ - - result = 0; - /* ensure result is initialized to 0 */ - - range = max -min; - /* calculate range of possible values between the max and min values */ - - iterate = range / stdev; - /* this number of iterations ensures the proper shape of the resulting - curve */ - - stdev += 1; /* compensation for integer vs. 
floating point math */ - for (int c = iterate; c != 0; c--) /* loop through iterations */ - { - // result += (uniform (1, 100) * stdev) / 100; /* calculate and - result += ( (rand()%100 + 1) * stdev) / 100; - // printf("result=%d\n", result ); - } - printf("\n final result=%d\n", result ); - return result + min; /* send final result back */ -} - -int SyntheticClient::read_random_ex(string& fn, int size, int rdsize) // size is in MB, wrsize in bytes -{ - __uint64_t chunks = (__uint64_t)size * (__uint64_t)(1024*1024) / (__uint64_t)rdsize; - - int fd = client->open(fn.c_str(), O_RDWR); - dout(5) << "reading from " << fn << " fd " << fd << dendl; - - // dout(0) << "READING FROM " << fn << " fd " << fd << dendl; - - // dout(0) << "filename " << fn << " size:" << size << " read size|" << rdsize << "|" << "\ chunks: |" << chunks <<"|" << dendl; - - if (fd < 0) return fd; - int offset = 0; - char * buf = NULL; - - for (unsigned i=0; i<2000; i++) { - if (time_to_stop()) break; - - bool read=false; - - time_t seconds; - time( &seconds); - srand(seconds); - - // use rand instead ?? 
- double x = drand48(); - - //dout(0) << "RANDOM NUMBER RETURN |" << x << "|" << dendl; - - if ( x < 0.5) - { - //dout(0) << "DECIDED TO READ " << x << dendl; - buf = new char[rdsize]; - memset(buf, 1, rdsize); - read=true; - } - else - { - // dout(0) << "DECIDED TO WRITE " << x << dendl; - buf = new char[rdsize+100]; // 1 MB - memset(buf, 7, rdsize); - } - - //double y = drand48() ; - - //dout(0) << "OFFSET is |" << offset << "| chunks |" << chunks<< dendl; - - if ( read) - { - //offset=(rand())%(chunks+1); - - /* if ( chunks > 10000 ) - offset= normdist( 0 , chunks/1000 , 5 )*1000; - else if ( chunks > 1000 ) - offset= normdist( 0 , chunks/100 , 5 )*100; - else if ( chunks > 100 ) - offset= normdist( 0 , chunks/20 , 5 )*20;*/ - - - dout(2) << "reading block " << offset << "/" << chunks << dendl; - - int r = client->read(fd, buf, rdsize, - offset*rdsize); - if (r < rdsize) { - dout(1) << "read_file got r = " << r << ", probably end of file" << dendl; - } - } - else - { - dout(2) << "writing block " << offset << "/" << chunks << dendl; - - // fill buf with a 16 byte fingerprint - // 64 bits : file offset - // 64 bits : client id - // = 128 bits (16 bytes) - - //if (true ) - //{ - int count = rand()%10; - - for ( int j=0;jget_nodeid(); - p++; - } - - client->write(fd, buf, rdsize, - offset*rdsize); - } - //} - } - - // verify fingerprint - if ( read ) - { - int bad = 0; - __int64_t *p = (__int64_t*)buf; - __int64_t readoff, readclient; - while ((char*)p + 32 < buf + rdsize) { - readoff = *p; - __int64_t wantoff = offset*rdsize + (__int64_t)((char*)p - buf); - p++; - readclient = *p; - p++; - if (readoff != wantoff || - readclient != client->get_nodeid()) { - if (!bad) - dout(0) << "WARNING: wrong data from OSD, block says fileoffset=" << readoff << " client=" << readclient - << ", should be offset " << wantoff << " clietn " << client->get_nodeid() - << dendl; - bad++; - } - } - if (bad) - dout(0) << " + " << (bad-1) << " other bad 16-byte bits in this block" << 
dendl; - } - } - - client->close(fd); - delete[] buf; - - return 0; -} - - -int SyntheticClient::random_walk(int num_req) -{ - int left = num_req; - - //dout(1) << "random_walk() will do " << left << " ops" << dendl; - - init_op_dist(); // set up metadata op distribution - - while (left > 0) { - left--; - - if (time_to_stop()) break; - - // ascend? - if (cwd.depth() && !roll_die(::pow((double).9, (double)cwd.depth()))) { - dout(DBL) << "die says up" << dendl; - up(); - continue; - } - - // descend? - if (.9*roll_die(::pow((double).9,(double)cwd.depth())) && subdirs.size()) { - string s = get_random_subdir(); - cwd.push_dentry( s ); - dout(DBL) << "cd " << s << " -> " << cwd << dendl; - clear_dir(); - continue; - } - - int op = 0; - filepath path; - - if (contents.empty() && roll_die(.3)) { - if (did_readdir) { - dout(DBL) << "empty dir, up" << dendl; - up(); - } else - op = MDS_OP_READDIR; - } else { - op = op_dist.sample(); - } - //dout(DBL) << "op is " << op << dendl; - - int r = 0; - - // do op - if (op == MDS_OP_UNLINK) { - if (contents.empty()) - op = MDS_OP_READDIR; - else - r = client->unlink( get_random_sub() ); // will fail on dirs - } - - if (op == MDS_OP_RENAME) { - if (contents.empty()) - op = MDS_OP_READDIR; - else { - r = client->rename( get_random_sub(), make_sub("ren") ); - } - } - - if (op == MDS_OP_MKDIR) { - r = client->mkdir( make_sub("mkdir"), 0755); - } - - if (op == MDS_OP_RMDIR) { - if (!subdirs.empty()) - r = client->rmdir( get_random_subdir() ); - else - r = client->rmdir( cwd.c_str() ); // will pbly fail - } - - if (op == MDS_OP_SYMLINK) { - } - - if (op == MDS_OP_CHMOD) { - if (contents.empty()) - op = MDS_OP_READDIR; - else - r = client->chmod( get_random_sub(), rand() & 0755 ); - } - - if (op == MDS_OP_CHOWN) { - if (contents.empty()) r = client->chown( cwd.c_str(), rand(), rand() ); - else - r = client->chown( get_random_sub(), rand(), rand() ); - } - - if (op == MDS_OP_LINK) { - } - - if (op == MDS_OP_UTIME) { - struct utimbuf b; - 
memset(&b, 1, sizeof(b)); - if (contents.empty()) - r = client->utime( cwd.c_str(), &b ); - else - r = client->utime( get_random_sub(), &b ); - } - - if (op == MDS_OP_MKNOD) { - r = client->mknod( make_sub("mknod"), 0644); - } - - if (op == MDS_OP_OPEN) { - if (contents.empty()) - op = MDS_OP_READDIR; - else { - r = client->open( get_random_sub(), O_RDONLY ); - if (r > 0) { - assert(open_files.count(r) == 0); - open_files.insert(r); - } - } - } - - if (op == MDS_OP_RELEASE) { // actually, close - if (open_files.empty()) - op = MDS_OP_STAT; - else { - int fh = get_random_fh(); - r = client->close( fh ); - if (r == 0) open_files.erase(fh); - } - } - - if (op == MDS_OP_STAT) { - struct stat st; - if (contents.empty()) { - if (did_readdir) { - if (roll_die(.1)) { - dout(DBL) << "stat in empty dir, up" << dendl; - up(); - } else { - op = MDS_OP_MKNOD; - } - } else - op = MDS_OP_READDIR; - } else - r = client->lstat(get_random_sub(), &st); - } - - if (op == MDS_OP_READDIR) { - clear_dir(); - - list c; - r = client->getdir( cwd.c_str(), c ); - - for (list::iterator it = c.begin(); - it != c.end(); - it++) { - //dout(DBL) << " got " << *it << dendl; - assert(0); - /*contents[*it] = it->second; - if (it->second && - S_ISDIR(it->second->st_mode)) - subdirs.insert(*it); - */ - } - - did_readdir = true; - } - - // errors? - if (r < 0) { - // reevaluate cwd. - //while (cwd.depth()) { - //if (client->lookup(cwd)) break; // it's in the cache - - //dout(DBL) << "r = " << r << ", client doesn't have " << cwd << ", cd .." << dendl; - dout(DBL) << "r = " << r << ", client may not have " << cwd << ", cd .." 
<< dendl; - up(); - //} - } - } - - // close files - dout(DBL) << "closing files" << dendl; - while (!open_files.empty()) { - int fh = get_random_fh(); - int r = client->close( fh ); - if (r == 0) open_files.erase(fh); - } - - dout(DBL) << "done" << dendl; - return 0; -} - - - - -void SyntheticClient::make_dir_mess(const char *basedir, int n) -{ - vector dirs; - - dirs.push_back(basedir); - dirs.push_back(basedir); - - client->mkdir(basedir, 0755); - - // motivation: - // P(dir) ~ subdirs_of(dir) + 2 - // from 5-year metadata workload paper in fast'07 - - // create dirs - for (int i=0; imkdir(dir.c_str(), 0755); - } - - -} - - - -void SyntheticClient::foo() -{ - if (1) { - // open some files - srand(0); - for (int i=0; i<20; i++) { - int s = 5; - int a = rand() % s; - int b = rand() % s; - int c = rand() % s; - char src[80]; - sprintf(src, "syn.0.0/dir.%d/dir.%d/file.%d", a, b, c); - //int fd = - client->open(src, O_RDONLY); - } - - return; - } - - if (0) { - // rename fun - for (int i=0; i<100; i++) { - int s = 5; - int a = rand() % s; - int b = rand() % s; - int c = rand() % s; - int d = rand() % s; - int e = rand() % s; - int f = rand() % s; - char src[80]; - char dst[80]; - sprintf(src, "syn.0.0/dir.%d/dir.%d/file.%d", a, b, c); - sprintf(dst, "syn.0.0/dir.%d/dir.%d/file.%d", d, e, f); - client->rename(src, dst); - } - return; - } - - if (1) { - // link fun - srand(0); - for (int i=0; i<100; i++) { - int s = 5; - int a = rand() % s; - int b = rand() % s; - int c = rand() % s; - int d = rand() % s; - int e = rand() % s; - int f = rand() % s; - char src[80]; - char dst[80]; - sprintf(src, "syn.0.0/dir.%d/dir.%d/file.%d", a, b, c); - sprintf(dst, "syn.0.0/dir.%d/dir.%d/newlink.%d", d, e, f); - client->link(src, dst); - } - srand(0); - for (int i=0; i<100; i++) { - int s = 5; - int a = rand() % s; - int b = rand() % s; - int c = rand() % s; - int d = rand() % s; - int e = rand() % s; - int f = rand() % s; - char src[80]; - char dst[80]; - sprintf(src, 
"syn.0.0/dir.%d/dir.%d/file.%d", a, b, c); - sprintf(dst, "syn.0.0/dir.%d/dir.%d/newlink.%d", d, e, f); - client->unlink(dst); - } - - - return; - } - - // link fun - client->mknod("one", 0755); - client->mknod("two", 0755); - client->link("one", "three"); - client->mkdir("dir", 0755); - client->link("two", "/dir/twolink"); - client->link("dir/twolink", "four"); - - // unlink fun - client->mknod("a", 0644); - client->unlink("a"); - client->mknod("b", 0644); - client->link("b", "c"); - client->unlink("c"); - client->mkdir("d", 0755); - client->unlink("d"); - client->rmdir("d"); - - // rename fun - client->mknod("p1", 0644); - client->mknod("p2", 0644); - client->rename("p1","p2"); - client->mknod("p3", 0644); - client->rename("p3","p4"); - - // check dest dir ambiguity thing - client->mkdir("dir1", 0755); - client->mkdir("dir2", 0755); - client->rename("p2","dir1/p2"); - client->rename("dir1/p2","dir2/p2"); - client->rename("dir2/p2","/p2"); - - // check primary+remote link merging - client->link("p2","p2.l"); - client->link("p4","p4.l"); - client->rename("p2.l","p2"); - client->rename("p4","p4.l"); - - // check anchor updates - client->mknod("dir1/a", 0644); - client->link("dir1/a", "da1"); - client->link("dir1/a", "da2"); - client->link("da2","da3"); - client->rename("dir1/a","dir2/a"); - client->rename("dir2/a","da2"); - client->rename("da1","da2"); - client->rename("da2","da3"); - - // check directory renames - client->mkdir("dir3", 0755); - client->mknod("dir3/asdf", 0644); - client->mkdir("dir4", 0755); - client->mkdir("dir5", 0755); - client->mknod("dir5/asdf", 0644); - client->rename("dir3","dir4"); // ok - client->rename("dir4","dir5"); // fail -} - -int SyntheticClient::thrash_links(const char *basedir, int dirs, int files, int depth, int n) -{ - dout(1) << "thrash_links " << basedir << " " << dirs << " " << files << " " << depth - << " links " << n - << dendl; - - if (time_to_stop()) return 0; - - for (int k=0; krename(dst.c_str(), "/tmp") == 0) { - 
client->rename(src.c_str(), dst.c_str()); - client->rename("/tmp", src.c_str()); - } - continue; - } - - // pick a dest dir - string src = basedir; - { - char t[80]; - for (int d=0; dmknod(src.c_str(), 0755); - client->rename(src.c_str(), dst.c_str()); - break; - case 1: - client->mknod(src.c_str(), 0755); - client->unlink(dst.c_str()); - client->link(src.c_str(), dst.c_str()); - break; - case 2: client->unlink(src.c_str()); break; - case 3: client->unlink(dst.c_str()); break; - //case 4: client->mknod(src.c_str(), 0755); break; - //case 5: client->mknod(dst.c_str(), 0755); break; - } - } - return 0; - - // now link shit up - for (int i=0; ilink(file.c_str(), ln.c_str()); - } - - return 0; -} - - - - -void SyntheticClient::import_find(const char *base, const char *find, bool data) -{ - dout(1) << "import_find " << base << " from " << find << " data=" << data << dendl; - - /* use this to gather the static trace: - * - * find . -exec ls -dilsn --time-style=+%s \{\} \; - * or if it's wafl, - * find . -path ./.snapshot -prune -o -exec ls -dilsn --time-style=+%s \{\} \; - * - */ - - if (base[0] != '-') - client->mkdir(base, 0755); - - ifstream f(find); - assert(f.is_open()); - - int dirnum = 0; - - while (!f.eof()) { - uint64_t ino; - int dunno, nlink; - string modestring; - int uid, gid; - off_t size; - time_t mtime; - string filename; - f >> ino; - if (f.eof()) break; - f >> dunno; - f >> modestring; - f >> nlink; - f >> uid; - f >> gid; - f >> size; - f >> mtime; - f.seekg(1, ios::cur); - getline(f, filename); - - // ignore "." - if (filename == ".") continue; - - // remove leading ./ - assert(filename[0] == '.' && filename[1] == '/'); - filename = filename.substr(2); - - // new leading dir? 
- int sp = filename.find("/"); - if (sp < 0) dirnum++; - - //dout(0) << "leading dir " << filename << " " << dirnum << dendl; - if (dirnum % g_conf.num_client != client->get_nodeid()) { - dout(20) << "skipping leading dir " << dirnum << " " << filename << dendl; - continue; - } - - // parse the mode - assert(modestring.length() == 10); - mode_t mode = 0; - switch (modestring[0]) { - case 'd': mode |= INODE_MODE_DIR; break; - case 'l': mode |= INODE_MODE_SYMLINK; break; - default: - case '-': mode |= INODE_MODE_FILE; break; - } - if (modestring[1] == 'r') mode |= 0400; - if (modestring[2] == 'w') mode |= 0200; - if (modestring[3] == 'x') mode |= 0100; - if (modestring[4] == 'r') mode |= 040; - if (modestring[5] == 'w') mode |= 020; - if (modestring[6] == 'x') mode |= 010; - if (modestring[7] == 'r') mode |= 04; - if (modestring[8] == 'w') mode |= 02; - if (modestring[9] == 'x') mode |= 01; - - dout(20) << " mode " << modestring << " to " << oct << mode << dec << dendl; - - if (S_ISLNK(mode)) { - // target vs destination - int pos = filename.find(" -> "); - assert(pos > 0); - string link; - if (base[0] != '-') { - link = base; - link += "/"; - } - link += filename.substr(0, pos); - string target; - if (filename[pos+4] == '/') { - if (base[0] != '-') - target = base; - target += filename.substr(pos + 4); - } else { - target = filename.substr(pos + 4); - } - dout(10) << "symlink from '" << link << "' -> '" << target << "'" << dendl; - client->symlink(target.c_str(), link.c_str()); - } else { - string f; - if (base[0] != '-') { - f = base; - f += "/"; - } - f += filename; - if (S_ISDIR(mode)) { - client->mkdir(f.c_str(), mode); - } else { - int fd = client->open(f.c_str(), O_WRONLY|O_CREAT, mode & 0777); - assert(fd > 0); - client->write(fd, "", 0, size); - client->close(fd); - - //client->chmod(f.c_str(), mode & 0777); - client->chown(f.c_str(), uid, gid); - - struct utimbuf ut; - ut.modtime = mtime; - ut.actime = mtime; - client->utime(f.c_str(), &ut); - } - } - } - - 
-} - diff --git a/branches/sage/crush/client/SyntheticClient.h b/branches/sage/crush/client/SyntheticClient.h deleted file mode 100644 index ce09b18addfb2..0000000000000 --- a/branches/sage/crush/client/SyntheticClient.h +++ /dev/null @@ -1,241 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __SYNTHETICCLIENT_H -#define __SYNTHETICCLIENT_H - -#include - -#include "Client.h" -#include "include/Distribution.h" - -#include "Trace.h" - -#define SYNCLIENT_MODE_RANDOMWALK 1 -#define SYNCLIENT_MODE_FULLWALK 2 -#define SYNCLIENT_MODE_REPEATWALK 3 - -#define SYNCLIENT_MODE_MAKEDIRMESS 7 -#define SYNCLIENT_MODE_MAKEDIRS 8 // dirs files depth -#define SYNCLIENT_MODE_STATDIRS 9 // dirs files depth -#define SYNCLIENT_MODE_READDIRS 10 // dirs files depth - -#define SYNCLIENT_MODE_MAKEFILES 11 // num count private -#define SYNCLIENT_MODE_MAKEFILES2 12 // num count private -#define SYNCLIENT_MODE_CREATESHARED 13 // num -#define SYNCLIENT_MODE_OPENSHARED 14 // num count - -#define SYNCLIENT_MODE_WRITEFILE 20 -#define SYNCLIENT_MODE_READFILE 21 -#define SYNCLIENT_MODE_WRITEBATCH 22 -#define SYNCLIENT_MODE_WRSHARED 23 -#define SYNCLIENT_MODE_READSHARED 24 -#define SYNCLIENT_MODE_RDWRRANDOM 25 -#define SYNCLIENT_MODE_RDWRRANDOM_EX 26 - -#define SYNCLIENT_MODE_LINKTEST 27 - -#define SYNCLIENT_MODE_TRACE 30 - -#define SYNCLIENT_MODE_CREATEOBJECTS 35 -#define SYNCLIENT_MODE_OBJECTRW 36 - -#define SYNCLIENT_MODE_OPENTEST 40 -#define SYNCLIENT_MODE_OPTEST 41 - -#define SYNCLIENT_MODE_ONLY 50 -#define SYNCLIENT_MODE_ONLYRANGE 51 -#define SYNCLIENT_MODE_EXCLUDE 52 -#define SYNCLIENT_MODE_EXCLUDERANGE 53 - 
-#define SYNCLIENT_MODE_UNTIL 55 -#define SYNCLIENT_MODE_SLEEPUNTIL 56 - -#define SYNCLIENT_MODE_RANDOMSLEEP 61 -#define SYNCLIENT_MODE_SLEEP 62 - -#define SYNCLIENT_MODE_TRUNCATE 200 - -#define SYNCLIENT_MODE_FOO 100 -#define SYNCLIENT_MODE_THRASHLINKS 101 - -#define SYNCLIENT_MODE_IMPORTFIND 300 - - - -void parse_syn_options(vector& args); - -class SyntheticClient { - Client *client; - - pthread_t thread_id; - - Distribution op_dist; - - void init_op_dist(); - int get_op(); - - - filepath cwd; - map contents; - set subdirs; - bool did_readdir; - set open_files; - - void up(); - - void clear_dir() { - contents.clear(); - subdirs.clear(); - did_readdir = false; - } - - int get_random_fh() { - int r = rand() % open_files.size(); - set::iterator it = open_files.begin(); - while (r--) it++; - return *it; - } - - - filepath n1; - const char *get_random_subdir() { - assert(!subdirs.empty()); - int r = ((rand() % subdirs.size()) + (rand() % subdirs.size())) / 2; // non-uniform distn - set::iterator it = subdirs.begin(); - while (r--) it++; - - n1 = cwd; - n1.push_dentry( *it ); - return n1.get_path().c_str(); - } - filepath n2; - const char *get_random_sub() { - assert(!contents.empty()); - int r = ((rand() % contents.size()) + (rand() % contents.size())) / 2; // non-uniform distn - if (cwd.depth() && cwd.last_dentry().length()) - r += cwd.last_dentry().c_str()[0]; // slightly permuted - r %= contents.size(); - - map::iterator it = contents.begin(); - while (r--) it++; - - n2 = cwd; - n2.push_dentry( it->first ); - return n2.get_path().c_str(); - } - - filepath sub; - char sub_s[50]; - const char *make_sub(char *base) { - sprintf(sub_s, "%s.%d", base, rand() % 100); - string f = sub_s; - sub = cwd; - sub.push_dentry(f); - return sub.c_str(); - } - - public: - SyntheticClient(Client *client); - - int start_thread(); - int join_thread(); - - int run(); - - bool run_me() { - if (run_only >= 0) { - if (run_only == client->get_nodeid()) - return true; - else - return false; - 
} - return true; - } - void did_run_me() { - run_only = -1; - run_until = utime_t(); - } - - // run() will do one of these things: - list modes; - list sargs; - list iargs; - utime_t run_start; - utime_t run_until; - - int run_only; - int exclude; - - string get_sarg(int seq); - int get_iarg() { - int i = iargs.front(); - iargs.pop_front(); - return i; - } - - bool time_to_stop() { - utime_t now = g_clock.now(); - if (0) cout << "time_to_stop .. now " << now - << " until " << run_until - << " start " << run_start - << std::endl; - if (run_until.sec() && now > run_until) - return true; - else - return false; - } - - string compose_path(string& prefix, char *rest) { - return prefix + rest; - } - - int full_walk(string& fromdir); - int random_walk(int n); - - int make_dirs(const char *basedir, int dirs, int files, int depth); - int stat_dirs(const char *basedir, int dirs, int files, int depth); - int read_dirs(const char *basedir, int dirs, int files, int depth); - int make_files(int num, int count, int priv, bool more); - int link_test(); - - int create_shared(int num); - int open_shared(int num, int count); - - int write_file(string& fn, int mb, int chunk); - int write_batch(int nfile, int mb, int chunk); - int read_file(string& fn, int mb, int chunk, bool ignoreprint=false); - - int create_objects(int nobj, int osize, int inflight); - int object_rw(int nobj, int osize, int wrpc, int overlap, - double rskew, double wskew); - - int read_random(string& fn, int mb, int chunk); - int read_random_ex(string& fn, int mb, int chunk); - - int clean_dir(string& basedir); - - int play_trace(Trace& t, string& prefix, bool metadata_only=false); - - void make_dir_mess(const char *basedir, int n); - void foo(); - - int thrash_links(const char *basedir, int dirs, int files, int depth, int n); - - void import_find(const char *basedir, const char *find, bool writedata); - -}; - -#endif diff --git a/branches/sage/crush/cmon.cc b/branches/sage/crush/cmon.cc deleted file mode 100644 
index f9ada45f7ef99..0000000000000 --- a/branches/sage/crush/cmon.cc +++ /dev/null @@ -1,129 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include -#include -#include - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" -#include "mon/Monitor.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << std::endl; - exit(1); - } -}; - -class C_Debug : public Context { - public: - void finish(int) { - int size = &g_conf.debug_after - &g_conf.debug; - memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size); - generic_dout(0) << "debug_after flipping debug settings" << dendl; - } -}; - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - parse_config_options(args); - - if (g_conf.kill_after) - g_timer.add_event_after(g_conf.kill_after, new C_Die); - if (g_conf.debug_after) - g_timer.add_event_after(g_conf.debug_after, new C_Debug); - - // args - int whoami = -1; - char *monmap_fn = ".ceph_monmap"; - for (unsigned i=0; i= 0); - } else { - // i am specific monitor. 
- - // read monmap - cout << "reading monmap from .ceph_monmap" << std::endl; - int r = monmap.read(monmap_fn); - assert(r >= 0); - - // bind to a specific port - cout << "starting mon" << whoami << " at " << monmap.get_inst(whoami) << std::endl; - g_my_addr = monmap.get_inst(whoami).addr; - rank.start_rank(); - } - - // start monitor - Messenger *m = rank.register_entity(entity_name_t::MON(whoami)); - Monitor *mon = new Monitor(whoami, m, &monmap); - mon->init(); - - // wait - cout << "waiting for shutdown ..." << std::endl; - rank.wait(); - - // done - delete mon; - - return 0; -} - diff --git a/branches/sage/crush/common/Clock.h b/branches/sage/crush/common/Clock.h deleted file mode 100644 index 1ea7227adebd4..0000000000000 --- a/branches/sage/crush/common/Clock.h +++ /dev/null @@ -1,104 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __CLOCK_H -#define __CLOCK_H - -#include -#include - -#include -#include - -#include "Mutex.h" - -#include "include/utime.h" - - - -// -- clock -- -class Clock { - protected: - //utime_t start_offset; - //utime_t abs_last; - utime_t last; - utime_t zero; - - Mutex lock; - - public: - Clock() { - // set offset - //tare(); - } - - // real time. 
- utime_t real_now() { - utime_t realnow = now(); - realnow += zero; - //gettimeofday(&realnow.timeval(), NULL); - return realnow; - } - - // relative time (from startup) - void tare() { - gettimeofday(&zero.timeval(), NULL); - } - void tare(utime_t z) { - zero = z; - } - utime_t now() { - //lock.Lock(); - utime_t n; - gettimeofday(&n.timeval(), NULL); - n -= zero; - if (n < last) { - //std::cerr << "WARNING: clock jumped backwards from " << last << " to " << n << std::endl; - n = last; // clock jumped backwards! - } else - last = n; - //lock.Unlock(); - return n; - } - utime_t recent_now() { - return last; - } - - void realify(utime_t& t) { - t += zero; - } - - void make_timespec(utime_t& t, struct timespec *ts) { - utime_t real = t; - realify(real); - - memset(ts, 0, sizeof(*ts)); - ts->tv_sec = real.sec(); - ts->tv_nsec = real.nsec(); - } - - - - // absolute time - time_t gettime() { - return real_now().sec(); - } - -}; - -extern Clock g_clock; - -#endif diff --git a/branches/sage/crush/common/Logger.cc b/branches/sage/crush/common/Logger.cc deleted file mode 100644 index 2e7db26027a57..0000000000000 --- a/branches/sage/crush/common/Logger.cc +++ /dev/null @@ -1,320 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include - -#include "LogType.h" -#include "Logger.h" - -#include -#include "Clock.h" - -#include "config.h" - -#include -#include - -#include "common/Timer.h" - -// per-process lock. lame, but this way I protect LogType too! 
-Mutex logger_lock; -SafeTimer logger_timer(logger_lock); -Context *logger_event = 0; -list logger_list; -utime_t start; -int last_flush; // in seconds since start - -static void flush_all_loggers(); - -class C_FlushLoggers : public Context { -public: - void finish(int r) { - if (logger_event == this) { - logger_event = 0; - flush_all_loggers(); - } - } -}; - -void Logger::set_start(utime_t s) -{ - logger_lock.Lock(); - - start = s; - - utime_t fromstart = g_clock.now(); - if (fromstart < start) { - cerr << "set_start: logger time jumped backwards from " << start << " to " << fromstart << std::endl; - fromstart = start; - } - fromstart -= start; - last_flush = fromstart.sec(); - - logger_lock.Unlock(); -} - -static void flush_all_loggers() -{ - generic_dout(20) << "flush_all_loggers" << dendl; - - utime_t now = g_clock.now(); - utime_t fromstart = now; - if (fromstart < start) { - cerr << "logger time jumped backwards from " << start << " to " << fromstart << std::endl; - //assert(0); - start = fromstart; - } - fromstart -= start; - int now_sec = fromstart.sec(); - - // do any catching up we need to - while (now_sec - last_flush >= g_conf.log_interval) { - generic_dout(20) << "fromstart " << fromstart << " last_flush " << last_flush << " flushign" << dendl; - for (list::iterator p = logger_list.begin(); - p != logger_list.end(); - ++p) - (*p)->_flush(); - last_flush += g_conf.log_interval; - } - - // schedule next flush event - utime_t next; - next.sec_ref() = start.sec() + last_flush + g_conf.log_interval; - next.usec_ref() = start.usec(); - generic_dout(20) << "logger now=" << now - << " start=" << start - << " next=" << next - << dendl; - logger_event = new C_FlushLoggers; - logger_timer.add_event_at(next, logger_event); -} - - - -// --------- - -Logger::Logger(string fn, LogType *type, bool append) -{ - logger_lock.Lock(); - { - filename = ""; - if (g_conf.use_abspaths) { - char *cwd = get_current_dir_name(); - filename = cwd; - free(cwd); - filename += "/"; - 
} - - filename = "log/"; - if (g_conf.log_name) { - filename += g_conf.log_name; - ::mkdir( filename.c_str(), 0755 ); // make sure dir exists - filename += "/"; - } - filename += fn; - - if (append) - out.open(filename.c_str(), ofstream::out|ofstream::app); - else - out.open(filename.c_str(), ofstream::out); - - this->type = type; - wrote_header = -1; - wrote_header_last = 0; - - version = 0; - - if (logger_list.empty()) { - // init logger - if (!g_conf.clock_tare) - start = g_clock.now(); // time 0! otherwise g_clock does it for us. - - last_flush = 0; - - // call manually the first time; then it'll schedule itself. - flush_all_loggers(); - } - logger_list.push_back(this); - } - logger_lock.Unlock(); -} - -Logger::~Logger() -{ - logger_lock.Lock(); - { - _flush(); - out.close(); - logger_list.remove(this); // slow, but rare. - if (logger_list.empty()) - logger_event = 0; // stop the timer events. - } - logger_lock.Unlock(); -} - - -/* -void Logger::flush() -{ - logger_lock.Lock(); - _flush(); - logger_lock.Unlock(); -} -*/ - -void Logger::_flush() -{ - // header? 
- wrote_header_last++; - if (wrote_header != type->version || - wrote_header_last > 10) { - out << "#" << type->keymap.size(); - for (unsigned i=0; ikeys.size(); i++) { - out << "\t" << type->keys[i]; - if (type->avg[i]) - out << "\t" << type->keys[i] << "*\t" << type->keys[i] << "~"; - } - out << std::endl; //out << "\t (" << type->keymap.size() << ")" << endl; - wrote_header = type->version; - wrote_header_last = 0; - } - - maybe_resize(type->keys.size()); - - // write line to log - out << last_flush; - for (unsigned i=0; ikeys.size(); i++) { - if (type->avg[i]) { - if (vals[i] > 0) { - double avg = (fvals[i] / (double)vals[i]); - double var = 0.0; - if (g_conf.logger_calc_variance) { - int n = vals[i]; - for (vector::iterator p = vals_to_avg[i].begin(); n--; ++p) - var += (avg - *p) * (avg - *p); - } - out << "\t" << avg << "\t" << vals[i] << "\t" << var; - } else - out << "\t0\t0\t0"; - } else { - if (fvals[i] > 0 && vals[i] == 0) - out << "\t" << fvals[i]; - else { - //cout << this << " p " << i << " and size is " << vals.size() << std::endl; - out << "\t" << vals[i]; - } - } - } - out << std::endl; - - // reset the counters - for (unsigned i=0; ikeys.size(); i++) { - if (type->inc_keys.count(i)) { - this->vals[i] = 0; - this->fvals[i] = 0; - } - } -} - - - -long Logger::inc(const char *key, long v) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - if (i < 0) i = type->add_inc(key); - maybe_resize(i+1); - - vals[i] += v; - long r = vals[i]; - logger_lock.Unlock(); - return r; -} - -double Logger::finc(const char *key, double v) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - if (i < 0) i = type->add_inc(key); - maybe_resize(i+1); - - fvals[i] += v; - double r = fvals[i]; - logger_lock.Unlock(); - return r; -} - -long Logger::set(const char *key, long v) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - if (i < 0) i = type->add_set(key); - 
maybe_resize(i+1); - - //cout << this << " set " << i << " to " << v << std::endl; - long r = vals[i] = v; - logger_lock.Unlock(); - return r; -} - - -double Logger::fset(const char *key, double v) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - if (i < 0) i = type->add_set(key); - maybe_resize(i+1); - - //cout << this << " fset " << i << " to " << v << std::endl; - double r = fvals[i] = v; - logger_lock.Unlock(); - return r; -} - -double Logger::favg(const char *key, double v) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - if (i < 0) i = type->add_avg(key); - maybe_resize(i+1); - - vals[i]++; - double r = fvals[i] = v; - if (g_conf.logger_calc_variance) - vals_to_avg[i].push_back(v); - logger_lock.Unlock(); - return r; -} - -long Logger::get(const char* key) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - maybe_resize(i+1); - - long r = 0; - if (i >= 0 && i < (int)vals.size()) - r = vals[i]; - logger_lock.Unlock(); - return r; -} - diff --git a/branches/sage/crush/config.cc b/branches/sage/crush/config.cc deleted file mode 100644 index f037fe728dfe4..0000000000000 --- a/branches/sage/crush/config.cc +++ /dev/null @@ -1,1039 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#include "config.h" -#include "include/types.h" -#include - -//#define MDS_CACHE_SIZE 4*10000 -> <20mb -//#define MDS_CACHE_SIZE 80000 62mb - -#define AVG_PER_INODE_SIZE 450 -#define MDS_CACHE_MB_TO_INODES(x) ((x)*1000000/AVG_PER_INODE_SIZE) - -//#define MDS_CACHE_SIZE MDS_CACHE_MB_TO_INODES( 50 ) -//#define MDS_CACHE_SIZE 1500000 -#define MDS_CACHE_SIZE 150000 - - -// hack hack hack ugly FIXME -#include "common/Mutex.h" -long buffer_total_alloc = 0; -Mutex bufferlock; - -#include "osd/osd_types.h" - -// debug output -Mutex _dout_lock; -ostream *_dout = &std::cout; -ostream *_derr = &std::cerr; - -// file layouts -struct ceph_file_layout g_OSD_FileLayout = { - fl_stripe_unit: 1<<22, - fl_stripe_count: 1, - fl_object_size: 1<<22, - fl_object_stripe_unit: 0, - fl_pg_preferred: -1, - fl_pg_type: CEPH_PG_TYPE_REP, - fl_pg_size: 2 -}; - -struct ceph_file_layout g_OSD_MDDirLayout = { - fl_stripe_unit: 1<<22, - fl_stripe_count: 1, - fl_object_size: 1<<22, - fl_object_stripe_unit: 0, - fl_pg_preferred: -1, - fl_pg_type: CEPH_PG_TYPE_REP, - fl_pg_size: 2 -}; - -struct ceph_file_layout g_OSD_MDLogLayout = { - fl_stripe_unit: 1<<20, - fl_stripe_count: 1, - fl_object_size: 1<<20, - fl_object_stripe_unit: 0, - fl_pg_preferred: -1, - fl_pg_type: CEPH_PG_TYPE_REP, - fl_pg_size: 2 -}; - -struct ceph_file_layout g_OSD_MDAnchorTableLayout = { - fl_stripe_unit: 1<<20, - fl_stripe_count: 1, - fl_object_size: 1<<20, - fl_object_stripe_unit: 0, - fl_pg_preferred: -1, - fl_pg_type: CEPH_PG_TYPE_REP, - fl_pg_size: 2 -}; - -#include - -// fake osd failures: osd -> time -std::map g_fake_kill_after; -std::map g_fake_osd_down; -std::map g_fake_osd_out; - -entity_addr_t g_my_addr; - -md_config_t g_debug_after_conf; - -md_config_t g_conf = { - num_mon: 1, - num_mds: 1, - num_osd: 4, - num_client: 1, - - mkfs: false, - - // profiling and debugging - log: true, - log_interval: 1, - log_name: (char*)0, - - log_messages: true, - log_pins: true, - - logger_calc_variance: true, - - 
dout_dir: 0, - - fake_clock: false, - fakemessenger_serialize: true, - - fake_osdmap_expand: 0, - fake_osdmap_updates: 0, - fake_osd_mttf: 0, - fake_osd_mttr: 0, - - osd_remount_at: 0, - - kill_after: 0, - - tick: 0, - - debug: 0, - debug_mds: 1, - debug_mds_balancer: 1, - debug_mds_log: 1, - debug_mds_log_expire: 1, - debug_mds_migrator: 1, - debug_buffer: 0, - debug_timer: 0, - debug_filer: 0, - debug_objecter: 0, - debug_journaler: 0, - debug_objectcacher: 0, - debug_client: 0, - debug_osd: 0, - debug_ebofs: 1, - debug_bdev: 1, // block device - debug_ns: 0, - debug_ms: 0, - debug_mon: 1, - debug_paxos: 0, - - debug_after: 0, - - // -- misc -- - use_abspaths: false, // make monitorstore et al use absolute path (to workaround FUSE chdir("/")) - - // --- clock --- - clock_lock: false, - clock_tare: false, - - // --- messenger --- - ms_tcp_nodelay: true, - ms_single_dispatch: false, - ms_requeue_on_sender_fail: false, - - ms_stripe_osds: false, - ms_skip_rank0: false, - ms_overlay_clients: false, - - ms_die_on_failure: false, - - /*tcp_skip_rank0: false, - tcp_overlay_clients: false, // over osds! 
- tcp_log: false, - tcp_serial_marshall: true, - tcp_serial_out: false, - tcp_multi_out: true, - tcp_multi_dispatch: false, // not fully implemented yet - */ - - // --- mon --- - mon_tick_interval: 5, - mon_osd_down_out_interval: 5, // seconds - mon_lease: 5, // seconds // lease interval - mon_lease_renew_interval: 3, // on leader, to renew the lease - mon_lease_ack_timeout: 10.0, // on leader, if lease isn't acked by all peons - mon_lease_timeout: 10.0, // on peon, if lease isn't extended - mon_accept_timeout: 10.0, // on leader, if paxos update isn't accepted - mon_stop_on_last_unmount: false, - mon_stop_with_last_mds: false, - mon_allow_mds_bully: true, // allow a booting mds to (forcibly) claim an mds # - - paxos_propose_interval: 1.0, // gather updates for this long before proposing a map update - - // --- client --- - client_cache_size: 1000, - client_cache_mid: .5, - client_cache_stat_ttl: 0, // seconds until cached stat results become invalid - client_cache_readdir_ttl: 1, // 1 second only - client_use_random_mds: false, - - client_sync_writes: 0, - - client_mount_timeout: 10.0, // retry every N seconds - - client_hack_balance_reads: false, - - client_trace: 0, - fuse_direct_io: 0, - fuse_ll: true, - - // --- objectcacher --- - client_oc: true, - client_oc_size: 1024*1024* 10, // MB * n - client_oc_max_dirty: 1024*1024* 10, // MB * n (dirty OR tx) - client_oc_max_sync_write: 128*1024, // synx writes >= this use wrlock - - // --- objecter --- - objecter_buffer_uncommitted: true, // this must be true for proper failure handling - objecter_map_request_interval: 15.0, // request a new map every N seconds, if we have pending io - objecter_tick_interval: 5.0, - objecter_timeout: 10.0, // before we ask for a map - - // --- journaler --- - journaler_allow_split_entries: true, - journaler_safe: false, // wait for COMMIT on journal writes - journaler_write_head_interval: 15, - journaler_cache: false, // cache writes for later readback - journaler_prefetch_periods: 
50, // * journal object size (1~MB? see above) - journaler_batch_interval: .001, // seconds.. max add'l latency we artificially incur - journaler_batch_max: 16384, // max bytes we'll delay flushing - - // --- mds --- - mds_cache_size: 300000, //MDS_CACHE_SIZE, - mds_cache_mid: .7, - - mds_decay_halflife: 5, - - mds_beacon_interval: 4, //30.0, - mds_beacon_grace: 15, //60*60.0, - - mds_log: true, - mds_log_max_events: -1, //MDS_CACHE_SIZE / 3, - mds_log_max_segments: 100, - mds_log_max_expiring: 20, - mds_log_pad_entry: 128,//256,//64, - mds_log_eopen_size: 100, // # open inodes per log entry - - mds_bal_sample_interval: 3.0, // every 5 seconds - mds_bal_replicate_threshold: 8000, - mds_bal_unreplicate_threshold: 0,//500, - mds_bal_split_size: 10000, - mds_bal_split_rd: 25000, - mds_bal_split_wr: 10000, - mds_bal_merge_size: 50, - mds_bal_merge_rd: 1000, - mds_bal_merge_wr: 1000, - mds_bal_interval: 10, // seconds - mds_bal_fragment_interval: 2, // seconds - mds_bal_idle_threshold: 0, //.1, - mds_bal_max: -1, - mds_bal_max_until: -1, - - mds_bal_mode: 0, - mds_bal_min_rebalance: .1, // must be this much above average before we export anything - mds_bal_min_start: .2, // if we need less than this, we don't do anything - mds_bal_need_min: .8, // take within this range of what we need - mds_bal_need_max: 1.2, - mds_bal_midchunk: .3, // any sub bigger than this taken in full - mds_bal_minchunk: .001, // never take anything smaller than this - - mds_trim_on_rejoin: true, - mds_shutdown_check: 0, //30, - - mds_verify_export_dirauth: true, - - mds_local_osd: false, - mds_local_osd_offset: 1000, - - mds_thrash_exports: 0, - mds_thrash_fragments: 0, - mds_dump_cache_on_map: false, - mds_dump_cache_after_rejoin: true, - - mds_hack_log_expire_for_better_stats: false, - - // --- osd --- - osd_rep: OSD_REP_PRIMARY, - - osd_balance_reads: false, // send from client to replica - osd_flash_crowd_iat_threshold: 0,//100, - osd_flash_crowd_iat_alpha: 0.125, - osd_balance_reads_temp: 
100, - - osd_shed_reads: false, // forward from primary to replica - osd_shed_reads_min_latency: .01, // min local latency - osd_shed_reads_min_latency_diff: .01, // min latency difference - osd_shed_reads_min_latency_ratio: 1.5, // 1.2 == 20% higher than peer - - osd_immediate_read_from_cache: false,//true, // osds to read from the cache immediately? - osd_exclusive_caching: true, // replicas evict replicated writes - - osd_stat_refresh_interval: .5, - - osd_pg_bits: 4, // bits per osd - osd_object_layout: CEPH_OBJECT_LAYOUT_HASHINO,//LINEAR,//HASHINO, - osd_pg_layout: CEPH_PG_LAYOUT_CRUSH,//LINEAR,//CRUSH, - osd_max_rep: 4, - osd_min_raid_width: 4, - osd_max_raid_width: 3, //6, - - osd_maxthreads: 2, // 0 == no threading - osd_max_opq: 10, - osd_mkfs: false, - osd_age: .8, - osd_age_time: 0, - osd_heartbeat_interval: 1, - osd_pg_stats_interval: 5, - osd_replay_window: 5, - osd_max_pull: 2, - osd_pad_pg_log: false, - - osd_auto_weight: false, - - osd_hack_fast_startup: false, // this breaks localized pgs. - - - // --- fakestore --- - fakestore_fake_sync: .2, // seconds - fakestore_fsync: false,//true, - fakestore_writesync: false, - fakestore_syncthreads: 4, - fakestore_fake_attrs: false, - fakestore_fake_collections: false, - fakestore_dev: 0, - - // --- ebofs --- - ebofs: 1, - ebofs_cloneable: false, - ebofs_verify: false, - ebofs_commit_ms: 1000, // 0 = no forced commit timeout (for debugging/tracing) - ebofs_idle_commit_ms: 0, // 0 = no idle detection. UGLY HACK. use bdev_idle_kick_after_ms instead. - ebofs_oc_size: 10000, // onode cache - ebofs_cc_size: 10000, // cnode cache - ebofs_bc_size: (50 *256), // 4k blocks, *256 for MB - ebofs_bc_max_dirty: (30 *256), // before write() will block - ebofs_max_prefetch: 1000, // 4k blocks - ebofs_realloc: false, // hrm, this can cause bad fragmentation, don't use! 
- - ebofs_abp_zero: false, // zero newly allocated buffers (may shut up valgrind) - ebofs_abp_max_alloc: 4096*16, // max size of new buffers (larger -> more memory fragmentation) - - // --- block device --- - bdev_lock: true, - bdev_iothreads: 1, // number of ios to queue with kernel - bdev_idle_kick_after_ms: 100, // ms - bdev_el_fw_max_ms: 10000, // restart elevator at least once every 1000 ms - bdev_el_bw_max_ms: 3000, // restart elevator at least once every 300 ms - bdev_el_bidir: false, // bidirectional elevator? - bdev_iov_max: 512, // max # iov's to collect into a single readv()/writev() call - bdev_debug_check_io_overlap: true, // [DEBUG] check for any pending io overlaps - bdev_fake_mb: 0, - bdev_fake_max_mb: 0, - - // --- fakeclient (mds regression testing) (ancient history) --- - num_fakeclient: 100, - fakeclient_requests: 100, - fakeclient_deterministic: false, - - fakeclient_op_statfs: false, - - // loosely based on Roselli workload paper numbers - fakeclient_op_stat: 610, - fakeclient_op_lstat: false, - fakeclient_op_utime: 0, - fakeclient_op_chmod: 1, - fakeclient_op_chown: 1, - - fakeclient_op_readdir: 2, - fakeclient_op_mknod: 30, - fakeclient_op_link: false, - fakeclient_op_unlink: 20, - fakeclient_op_rename: 0,//40, - - fakeclient_op_mkdir: 10, - fakeclient_op_rmdir: 20, - fakeclient_op_symlink: 20, - - fakeclient_op_openrd: 200, - fakeclient_op_openwr: 0, - fakeclient_op_openwrc: 0, - fakeclient_op_read: false, // osd! - fakeclient_op_write: false, // osd! 
- fakeclient_op_truncate: false, - fakeclient_op_fsync: false, - fakeclient_op_close: 200 - -#ifdef USE_OSBDB - , - bdbstore: false, - debug_bdbstore: 1, - bdbstore_btree: false, - bdbstore_ffactor: 0, - bdbstore_nelem: 0, - bdbstore_pagesize: 0, - bdbstore_cachesize: 0, - bdbstore_transactional: false -#endif // USE_OSBDB -}; - - -#include -#include - - -void env_to_vec(std::vector& args) -{ - const char *p = getenv("CEPH_ARGS"); - if (!p) return; - - static char buf[1000]; - int len = strlen(p); - memcpy(buf, p, len); - buf[len] = 0; - //cout << "CEPH_ARGS " << buf << endl; - - int l = 0; - for (int i=0; i& args) -{ - for (int i=1; i& args, - int& argc, char **&argv) -{ - argv = (char**)malloc(sizeof(char*) * argc); - argc = 1; - argv[0] = "asdf"; - - for (unsigned i=0; i= '0' && *s <= '9') { - int digit = *s - '0'; - //cout << "digit " << digit << endl; - val *= 10; - val += digit; - numdigits++; - s++; off++; - } - //cout << "val " << val << endl; - - if (numdigits == 0) { - cerr << "no digits at off " << off << std::endl; - return false; // no digits - } - if (count < 3 && *s != '.') { - cerr << "should period at " << off << std::endl; - return false; // should have 3 periods - } - s++; off++; - - if (count <= 3) - a.v.ipq[count] = val; - else - a.v.port = val; - - count++; - if (count == 4 && *s != ':') break; - if (count == 5) break; - } - - return true; -} - - - -void parse_config_options(std::vector& args) -{ - std::vector nargs; - - for (unsigned i=0; iis_open()) { - std::cerr << "error opening output file " << fn << std::endl; - delete out; - } else { - _dout = out; - } - } - - args = nargs; -} diff --git a/branches/sage/crush/config.h b/branches/sage/crush/config.h deleted file mode 100644 index b5cdf6cbd586d..0000000000000 --- a/branches/sage/crush/config.h +++ /dev/null @@ -1,418 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright 
(C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __CONFIG_H -#define __CONFIG_H - -extern struct ceph_file_layout g_OSD_FileLayout; -extern struct ceph_file_layout g_OSD_MDDirLayout; -extern struct ceph_file_layout g_OSD_MDLogLayout; -extern struct ceph_file_layout g_OSD_MDAnchorTableLayout; - -#include -#include - -#include "common/Mutex.h" - -extern std::map g_fake_osd_down; -extern std::map g_fake_osd_out; - -#define OSD_REP_PRIMARY 0 -#define OSD_REP_SPLAY 1 -#define OSD_REP_CHAIN 2 - - -#include "msg/msg_types.h" - -extern entity_addr_t g_my_addr; - -struct md_config_t { - int num_mon; - int num_mds; - int num_osd; - int num_client; - - bool mkfs; - - // profiling - bool log; - int log_interval; - char *log_name; - - bool log_messages; - bool log_pins; - - bool logger_calc_variance; - - char *dout_dir; - - bool fake_clock; - bool fakemessenger_serialize; - - int fake_osdmap_expand; - int fake_osdmap_updates; - int fake_osd_mttf; - int fake_osd_mttr; - - int osd_remount_at; - - int kill_after; - - int tick; - - int debug; - int debug_mds; - int debug_mds_balancer; - int debug_mds_log; - int debug_mds_log_expire; - int debug_mds_migrator; - int debug_buffer; - int debug_timer; - int debug_filer; - int debug_objecter; - int debug_journaler; - int debug_objectcacher; - int debug_client; - int debug_osd; - int debug_ebofs; - int debug_bdev; - int debug_ns; - int debug_ms; - int debug_mon; - int debug_paxos; - - int debug_after; - - // misc - bool use_abspaths; - - // clock - bool clock_lock; - bool clock_tare; - - // messenger - - /*bool tcp_skip_rank0; - bool tcp_overlay_clients; - bool tcp_log; - bool tcp_serial_marshall; - bool tcp_serial_out; - bool tcp_multi_out; - bool tcp_multi_dispatch; - */ - - bool ms_tcp_nodelay; - bool ms_single_dispatch; - 
bool ms_requeue_on_sender_fail; - - bool ms_stripe_osds; - bool ms_skip_rank0; - bool ms_overlay_clients; - bool ms_die_on_failure; - - // mon - int mon_tick_interval; - int mon_osd_down_out_interval; - float mon_lease; - float mon_lease_renew_interval; - float mon_lease_ack_timeout; - float mon_lease_timeout; - float mon_accept_timeout; - bool mon_stop_on_last_unmount; - bool mon_stop_with_last_mds; - bool mon_allow_mds_bully; - - double paxos_propose_interval; - - // client - int client_cache_size; - float client_cache_mid; - int client_cache_stat_ttl; - int client_cache_readdir_ttl; - bool client_use_random_mds; // debug flag - - bool client_sync_writes; - - double client_mount_timeout; - - // hack - bool client_hack_balance_reads; - - - /* - bool client_bcache; - int client_bcache_alloc_minsize; - int client_bcache_alloc_maxsize; - int client_bcache_ttl; - off_t client_bcache_size; - int client_bcache_lowater; - int client_bcache_hiwater; - size_t client_bcache_align; - */ - - char *client_trace; - int fuse_direct_io; - bool fuse_ll; - - // objectcacher - bool client_oc; - int client_oc_size; - int client_oc_max_dirty; - size_t client_oc_max_sync_write; - - // objecter - bool objecter_buffer_uncommitted; - double objecter_map_request_interval; - double objecter_tick_interval; - double objecter_timeout; - - // journaler - bool journaler_allow_split_entries; - bool journaler_safe; - int journaler_write_head_interval; - bool journaler_cache; - int journaler_prefetch_periods; - double journaler_batch_interval; - size_t journaler_batch_max; - - // mds - int mds_cache_size; - float mds_cache_mid; - - float mds_decay_halflife; - - float mds_beacon_interval; - float mds_beacon_grace; - - bool mds_log; - int mds_log_max_events; - int mds_log_max_segments; - int mds_log_max_expiring; - int mds_log_pad_entry; - int mds_log_eopen_size; - - float mds_bal_sample_interval; - float mds_bal_replicate_threshold; - float mds_bal_unreplicate_threshold; - int mds_bal_split_size; - 
float mds_bal_split_rd; - float mds_bal_split_wr; - int mds_bal_merge_size; - float mds_bal_merge_rd; - float mds_bal_merge_wr; - int mds_bal_interval; - int mds_bal_fragment_interval; - float mds_bal_idle_threshold; - int mds_bal_max; - int mds_bal_max_until; - - int mds_bal_mode; - float mds_bal_min_rebalance; - float mds_bal_min_start; - float mds_bal_need_min; - float mds_bal_need_max; - float mds_bal_midchunk; - float mds_bal_minchunk; - - bool mds_trim_on_rejoin; - int mds_shutdown_check; - - bool mds_verify_export_dirauth; // debug flag - - bool mds_local_osd; - int mds_local_osd_offset; - - int mds_thrash_exports; - int mds_thrash_fragments; - bool mds_dump_cache_on_map; - bool mds_dump_cache_after_rejoin; - - bool mds_hack_log_expire_for_better_stats; - - // osd - int osd_rep; - - bool osd_balance_reads; - int osd_flash_crowd_iat_threshold; // flash crowd interarrival time threshold in ms - double osd_flash_crowd_iat_alpha; - double osd_balance_reads_temp; - - int osd_shed_reads; - double osd_shed_reads_min_latency; - double osd_shed_reads_min_latency_diff; - double osd_shed_reads_min_latency_ratio; - - bool osd_immediate_read_from_cache; - bool osd_exclusive_caching; - double osd_stat_refresh_interval; - - int osd_pg_bits; - int osd_object_layout; - int osd_pg_layout; - int osd_max_rep; - int osd_min_raid_width; - int osd_max_raid_width; - int osd_maxthreads; - int osd_max_opq; - bool osd_mkfs; - float osd_age; - int osd_age_time; - int osd_heartbeat_interval; - int osd_pg_stats_interval; - int osd_replay_window; - int osd_max_pull; - bool osd_pad_pg_log; - - bool osd_auto_weight; - - bool osd_hack_fast_startup; - - double fakestore_fake_sync; - bool fakestore_fsync; - bool fakestore_writesync; - int fakestore_syncthreads; // such crap - bool fakestore_fake_attrs; - bool fakestore_fake_collections; - char *fakestore_dev; - - // ebofs - int ebofs; - bool ebofs_cloneable; - bool ebofs_verify; - int ebofs_commit_ms; - int ebofs_idle_commit_ms; - int 
ebofs_oc_size; - int ebofs_cc_size; - off_t ebofs_bc_size; - off_t ebofs_bc_max_dirty; - unsigned ebofs_max_prefetch; - bool ebofs_realloc; - - bool ebofs_abp_zero; - size_t ebofs_abp_max_alloc; - - // block device - bool bdev_lock; - int bdev_iothreads; - int bdev_idle_kick_after_ms; - int bdev_el_fw_max_ms; - int bdev_el_bw_max_ms; - bool bdev_el_bidir; - int bdev_iov_max; - bool bdev_debug_check_io_overlap; - int bdev_fake_mb; - int bdev_fake_max_mb; - - // fake client - int num_fakeclient; - unsigned fakeclient_requests; - bool fakeclient_deterministic; // debug flag - - int fakeclient_op_statfs; - - int fakeclient_op_stat; - int fakeclient_op_lstat; - int fakeclient_op_utime; - int fakeclient_op_chmod; - int fakeclient_op_chown; - - int fakeclient_op_readdir; - int fakeclient_op_mknod; - int fakeclient_op_link; - int fakeclient_op_unlink; - int fakeclient_op_rename; - - int fakeclient_op_mkdir; - int fakeclient_op_rmdir; - int fakeclient_op_symlink; - - int fakeclient_op_openrd; - int fakeclient_op_openwr; - int fakeclient_op_openwrc; - int fakeclient_op_read; - int fakeclient_op_write; - int fakeclient_op_truncate; - int fakeclient_op_fsync; - int fakeclient_op_close; - -#ifdef USE_OSBDB - bool bdbstore; - int debug_bdbstore; - bool bdbstore_btree; - int bdbstore_ffactor; - int bdbstore_nelem; - int bdbstore_pagesize; - int bdbstore_cachesize; - bool bdbstore_transactional; -#endif // USE_OSBDB -}; - -extern md_config_t g_conf; -extern md_config_t g_debug_after_conf; - - -/** - * command line / environment argument parsing - */ -void env_to_vec(std::vector& args); -void argv_to_vec(int argc, char **argv, - std::vector& args); -void vec_to_argv(std::vector& args, - int& argc, char **&argv); - -void parse_config_options(std::vector& args); - -extern bool parse_ip_port(const char *s, entity_addr_t& addr); - - -/** - * for cleaner output, bracket each line with - * dbeginl (in the dout macro) and dendl (in place of endl). 
- */ -extern Mutex _dout_lock; -struct _dbeginl_t { _dbeginl_t(int) {} }; -struct _dendl_t { _dendl_t(int) {} }; -static const _dbeginl_t dbeginl = 0; -static const _dendl_t dendl = 0; - -// intentionally conflict with endl -class _bad_endl_use_dendl_t { public: _bad_endl_use_dendl_t(int) {} }; -static const _bad_endl_use_dendl_t endl = 0; - -inline ostream& operator<<(ostream& out, _dbeginl_t) { - _dout_lock.Lock(); - return out; -} -inline ostream& operator<<(ostream& out, _dendl_t) { - out << std::endl; - _dout_lock.Unlock(); - return out; -} -inline ostream& operator<<(ostream& out, _bad_endl_use_dendl_t) { - assert(0 && "you are using the wrong endl.. use std::endl or dendl"); - return out; -} - -// the streams -extern ostream *_dout; -extern ostream *_derr; - -// generic macros -#define generic_dout(x) if ((x) <= g_conf.debug) *_dout << dbeginl -#define generic_derr(x) if ((x) <= g_conf.debug) *_derr << dbeginl - -#define pdout(x,p) if ((x) <= (p)) *_dout << dbeginl - - -#endif diff --git a/branches/sage/crush/crush.old/test/bucket_movement.cc b/branches/sage/crush/crush.old/test/bucket_movement.cc deleted file mode 100644 index 6be17356cb64c..0000000000000 --- a/branches/sage/crush/crush.old/test/bucket_movement.cc +++ /dev/null @@ -1,166 +0,0 @@ - - -#include "../crush.h" -using namespace crush; - -#include - -#include -#include -using namespace std; - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int n, float f, int buckettype) -{ - Hash h(73232313); - - // crush - Crush c; - - int ndisks = 0; - - // bucket - Bucket *b; - if 
(buckettype == 0) - b = new TreeBucket(1); - else if (buckettype == 1 || buckettype == 2) - b = new ListBucket(1); - else if (buckettype == 3) - b = new StrawBucket(1); - else if (buckettype == 4) - b = new UniformBucket(0,0); - - for (int i=0; iadd_item(ndisks++,1); - - c.add_bucket(b); - int root = b->get_id(); - - //c.print(cout,root); - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 1000; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - - // ORIGINAL - place(c, rule, numpg, numrep, placement1); - - int olddisks = ndisks; - - // add item - if (buckettype == 2) { - // start over! 
- ndisks = 0; - b = new ListBucket(1); - for (int i=0; i<=n; i++) - b->add_item(ndisks++,1); - c.add_bucket(b); - root = b->get_id(); - - rule.steps.clear(); - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - } - else - b->add_item(ndisks++, 1); - - - // ADDED - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; x<=numpg; x++) - if (placement1[x] != placement2[x]) - for (int j=0; j - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - //Bucket *b = new MixedBucket(h+1); - Bucket *b = new StrawBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -float go(int dep) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - if (0) { - for (int d=0; dadd_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 10000, disks); - Hash h(123); - b->make_primes(h); - root = c.add_bucket(b); - } - - - - // rule - int numrep = 1; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = 
pg_per*ndisks/numrep; - - vector ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 100000; - int times = place / numpg; - if (!times) times = 1; - - cout << "#looping " << times << " times" << endl; - - float tvar = 0; - int tvarnum = 0; - - int x = 0; - for (int t=0; t v(numrep); - - for (int z=0; z - -#include -#include -using namespace std; - -int buckettype = 0; - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - //Bucket *b = new TreeBucket(h+1); - //Bucket *b = new ListBucket(h+1); - //Bucket *b = new StrawBucket(h+1); - Bucket *b; - if (buckettype == 0) - b = new TreeBucket(h+1); - else if (buckettype == 1 || buckettype == 2) - b = new ListBucket(h+1); - else if (buckettype == 3) - b = new StrawBucket(h+1); - - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != 
ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks, int add, int modifydepth) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); - for (int d=1; d > buckets; - - root = make_hierarchy(c, wid, buckets, ndisks); - - //c.print(cout,root); - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - - // ORIGINAL - place(c, rule, numpg, numrep, placement1); - - int olddisks = ndisks; - - // add disks - //cout << " adding " << add << " disks" << endl; - vector disks; - for (int i=0; imake_primes(h); - - //Bucket *o = buckets[2].back(); - Bucket *o; - if (buckettype == 2) - o = buckets[modifydepth].front(); - else - o = buckets[modifydepth].back(); - - c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - c.add_item(o->get_id(), b->get_id(), b->get_weight()); - //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); - //newbucket = b; - - - // ADDED - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; x<=numpg; x++) - if (placement1[x] != placement2[x]) - for (int j=0; j - -#include -#include -using namespace std; - - -int buckettype = 2; // 0 = mixed, 1 = linear, 2 = straw - -int big_one_skip = 255; -int big_one_size; -Bucket *big_one = 0; - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, 
int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - - int s = wid[h]; - if (big_one_skip > 0) - big_one_skip--; - if (!big_one_skip && !big_one) - s = big_one_size; - - - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks " << disks.size()<< endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - Bucket *b; - if (buckettype == 0) - b = new TreeBucket(h+1); - else if (buckettype == 1) - b = new ListBucket(h+1); - else if (buckettype == 2) - b = new StrawBucket(h+1); - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks, int add) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); - for (int d=1; d > buckets; - - big_one_size = add; - big_one = 0; - - //cout << "making tree" << endl; - root = make_hierarchy(c, wid, buckets, ndisks); - - //c.print(cout, root); - - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - 
rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - int olddisks = ndisks; - - - place(c, rule, numpg, numrep, placement1); - - if (1) { - // remove disks - assert(big_one); - c.adjust_item(big_one->get_id(), 0); - } - - int newdisks = ndisks - add; - - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; x<=numpg; x++) - if (placement1[x] != placement2[x]) - for (int j=0; j >::iterator i = r.begin(); - i != r.end(); - i++) { - cout << i->first; - for (map::iterator j = i->second.begin(); - j != i->second.end(); - j++) - cout << "\t" << j->first << "\t" << j->second; - cout << endl; - } - */ -} - diff --git a/branches/sage/crush/crush.old/test/cluster_movement_rush.cc b/branches/sage/crush/crush.old/test/cluster_movement_rush.cc deleted file mode 100644 index 90cc197c24f65..0000000000000 --- a/branches/sage/crush/crush.old/test/cluster_movement_rush.cc +++ /dev/null @@ -1,218 +0,0 @@ - - -#include "../crush.h" -using namespace crush; - -#include - -#include -#include -using namespace std; - -int buckettype = 0; - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - //Bucket *b = new TreeBucket(h+1); - //Bucket *b = new ListBucket(h+1); - //Bucket *b = new StrawBucket(h+1); - Bucket *b; - if (buckettype == 0) - b = 
new TreeBucket(h+1); - else if (buckettype == 1 || buckettype == 2) - b = new ListBucket(h+1); - else if (buckettype == 3) - b = new StrawBucket(h+1); - - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks, int add, int modifydepth) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); - for (int d=1; d > buckets; - - root = make_hierarchy(c, wid, buckets, ndisks); - - //c.print(cout,root); - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - - // ORIGINAL - place(c, rule, numpg, numrep, 
placement1); - - int olddisks = ndisks; - - // add disks - //cout << " adding " << add << " disks" << endl; - vector disks; - for (int i=0; imake_primes(h); - - //Bucket *o = buckets[2].back(); - Bucket *o; - if (buckettype == 2) - o = buckets[modifydepth].front(); - else - o = buckets[modifydepth].back(); - - c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - c.add_item(o->get_id(), b->get_id(), b->get_weight(), buckettype == 2); - //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); - //newbucket = b; - - - // ADDED - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; x<=numpg; x++) - if (placement1[x] != placement2[x]) - for (int j=0; j - -#include -#include -using namespace std; - - -Clock g_clock; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks weight " << w << endl; - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - - -float go(int dep, int failpc) -{ - Hash h(73232313); - - //int overloadcutoff = (int)((float)10000.0 / (float)utilization); - - //cout << "util " << utilization << " cutoff " << overloadcutoff << endl; - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - for (int d=0; d ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 1000000; - int times = place / 
numpg; - if (!times) times = 1; - - - //cout << "looping " << times << " times" << endl; - - float tavg[10]; - float tvar[10]; - for (int j=0;j<10;j++) { - tvar[j] = 0; - tavg[j] = 0; - } - int tvarnum = 0; - float trvar = 0.0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - float aslowdown = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.out.clear(); - - for (int z=0; z= ndisks) cout << "v[i] " << i << " is " << v[i] << " .. x = " << x << endl; - //assert(v[i] < ndisks); - ocount[v[i]]++; - } - } - utime_t t1b = g_clock.now(); - - // add in numf failed disks - for (int f = 0; f < numf; f++) { - int d = rand() % ndisks; - while (c.out.count(d)) d = rand() % ndisks; - c.out.insert(d); - } - - utime_t t3a = g_clock.now(); - for (int x=xs; x - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - MixedBucket *b = new MixedBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -Bucket *make_random(Crush& c, int wid, int height, int& ndisks) -{ - int w = rand() % (wid-1) + 2; - - if (height == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - int h = rand() % height + 1; - MixedBucket *b = new MixedBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " 
mixedbucket with " << wid[h] << endl; - return b; - } - -} - - -float go(int dep, int overloadcutoff) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - for (int d=0; dget_id(); - //c.print(cout, root); - } - if (0) { - MixedBucket *b = new MixedBucket(1); - for (int i=0; i<10000; i++) - b->add_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 10000, disks); - Hash h(123); - b->make_primes(h); - root = c.add_bucket(b); - } - //cout << ndisks << " disks" << endl; - - - - // rule - int numrep = 1; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 1000000; - int times = place / numpg; - if (!times) times = 1; - - - //cout << "looping " << times << " times" << endl; - - float tvar = 0; - int tvarnum = 0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.overload.clear(); - - for (int z=0; z overloadcutoff) - overloaded++; - - if (ocount[i] > 100+(overloadcutoff-100)/2) { - adjusted++; - c.overload[i] = 100.0 / (float)ocount[i]; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - //cout << overloaded << " overloaded" << endl; - overloadsum += (float)overloaded / (float)ndisks; - adjustsum += (float)adjusted / (float)ndisks; - - - for (int x=xs; x overloadcutoff) { - still++; - //c.overload[ocount[i]] = 
100.0 / (float)ocount[i]; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - } - //if (still) cout << "overload was " << overloaded << " now " << still << endl; - afteroverloadsum += (float)still / (float)ndisks; - - //cout << "collisions: " << c.collisions << endl; - //cout << "r bumps: " << c.bumps << endl; - - float avg = 0.0; - for (int i=0; i100; d -= 5) { - float var = go(3,d); - //cout << "## depth = " << d << endl; - //cout << d << "\t" << var << endl; - } -} diff --git a/branches/sage/crush/crush.old/test/depth_variance.cc b/branches/sage/crush/crush.old/test/depth_variance.cc deleted file mode 100644 index 7d60ebaae9501..0000000000000 --- a/branches/sage/crush/crush.old/test/depth_variance.cc +++ /dev/null @@ -1,185 +0,0 @@ - - -#include "../crush.h" -using namespace crush; - -#include - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -float go(int dep) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - if (1) { - for (int d=0; d ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 100000; - int times = place / numpg; - if (!times) times = 1; - - cout << "#looping " << times << " times" << endl; - - float 
tvar = 0; - int tvarnum = 0; - float tavg = 0; - - int x = 0; - for (int t=0; t v(numrep); - - for (int z=0; z - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks weight " << w << endl; - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - //Bucket *b = new StrawBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - - -float go(int dep, int overloadcutoff) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - for (int d=0; d ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 100000; - int times = place / numpg; - if (!times) times = 1; - - - //cout << "looping " << times << " times" << endl; - - float tavg[10]; - float tvar[10]; - for (int j=0;j<10;j++) { - tvar[j] = 0; - tavg[j] = 0; - } - int tvarnum = 0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.overload.clear(); - - for (int z=0; z cutoff) - overloaded++; - - if (ocount[i] > adjoff) { - adjusted++; - c.overload[i] = (float)target / (float)ocount[i]; - //cout << "setting overload " << i << " to " << c.overload[i] << endl; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - //cout << overloaded << " overloaded" << endl; - overloadsum += 
(float)overloaded / (float)ndisks; - adjustsum += (float)adjusted / (float)ndisks; - - - - if (1) { - // second pass - for (int x=xs; x= adjoff) { - adjusted++; - if (c.overload.count(i) == 0) { - c.overload[i] = 1.0; - adjusted++; - } - //else cout << "(re)adjusting " << i << endl; - c.overload[i] *= (float)target / (float)ocount[i]; - //cout << "setting overload " << i << " to " << c.overload[i] << endl; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - } - - for (int x=xs; x cutoff) { - still++; - //c.overload[ocount[i]] = 100.0 / (float)ocount[i]; - if (c.overload.count(i)) cout << "[adjusted] "; - cout << "disk " << i << " has " << ocount[i] << endl; - } - } - //if (still) cout << "overload was " << overloaded << " now " << still << endl; - afteroverloadsum += (float)still / (float)ndisks; - - //cout << "collisions: " << c.collisions << endl; - //cout << "r bumps: " << c.bumps << endl; - - int n = ndisks/10; - float avg[10]; - float var[10]; - for (int i=0;i<10;i++) { - int s = n*i; - avg[i] = 0.0; - for (int j=0; j100; d -= 5) { - float var = go(3,d); - //cout << "## depth = " << d << endl; - //cout << d << "\t" << var << endl; - } -} diff --git a/branches/sage/crush/crush.old/test/movement.cc b/branches/sage/crush/crush.old/test/movement.cc deleted file mode 100644 index 2621f09457fe6..0000000000000 --- a/branches/sage/crush/crush.old/test/movement.cc +++ /dev/null @@ -1,223 +0,0 @@ - - -#include "../crush.h" -using namespace crush; - -#include - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - MixedBucket *b = new MixedBucket(h+1); - c.add_bucket(b); - for (int i=0; 
iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); - for (int d=1; d > buckets; - - if (1) { - root = make_hierarchy(c, wid, buckets, ndisks); - } - if (0) { - MixedBucket *b = new MixedBucket(1); - for (int i=0; i<10000; i++) - b->add_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 1, disks); - Hash h(123); - b->make_primes(h); - root = c.add_bucket(b); - } - - - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; 
- */ - map > placement1, placement2; - - //c.print(cout, root); - - place(c, rule, numpg, numrep, placement1); - - if (1) { - // failed - - //for (int i=500; i<1000; i++) - //c.failed.insert(i); - c.failed.insert(0); - } - - int olddisks = ndisks; - - if (1) { - int n = udisks; - //cout << " adding " << n << " disks" << endl; - vector disks; - for (int i=0; imake_primes(h); - Bucket *o = buckets[1].back(); - c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - c.add_item(o->get_id(), b->get_id(), b->get_weight()); - //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); - } - - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; x<=numpg; x++) { - if (placement1[x] != placement2[x]) { - for (int j=0; j v; - cout << depth; - for (int branching = 3; branching < 16; branching += 1) { - float fac = testmovement(depth, branching, udisks); - v.push_back(fac); - int n = udisks * pow((float)branching, (float)depth-1); - cout << "\t" << n; - cout << "\t" << fac; - } - //for (int i=0; i - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - MixedBucket *b = new MixedBucket(h+1); - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ 
- vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); - for (int d=1; d > buckets; - - if (1) { - root = make_hierarchy(c, wid, buckets, ndisks); - } - if (0) { - MixedBucket *b = new MixedBucket(1); - for (int i=0; i<10000; i++) - b->add_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 1, disks); - Hash h(123); - b->make_primes(h); - root = c.add_bucket(b); - } - - - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - place(c, rule, numpg, numrep, placement1); - - float over = .5; - - if (1) { - // failed - - //for (int i=500; i<1000; i++) - //c.failed.insert(i); - //c.failed.insert(0); - c.overload[0] = over; - } - - int olddisks = ndisks; - - - - if (0) { - int n = udisks; - //cout << " adding " << n << " disks" << endl; - vector disks; - for (int i=0; imake_primes(h); - Bucket *o = buckets[1].back(); - 
c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - c.add_item(o->get_id(), b->get_id(), b->get_weight()); - //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); - } - - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - vector moved(ndisks); - - //int moved = 0; - for (int d=0; d::iterator it = placement1[d].begin(); - it != placement1[d].end(); - it++) { - placement2[d].erase(*it); - } - } - - float avg = 0; - for (int d=0; d v; - cout << depth; - for (int branching = 3; branching < 16; branching += 1) { - float fac = testmovement(depth, branching, udisks); - v.push_back(fac); - int n = udisks * pow((float)branching, (float)depth-1); - //cout << "\t" << n; - //cout << "\t" << fac; - } - //for (int i=0; i - -#include -#include -using namespace std; - - -Clock g_clock; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks weight " << w << endl; - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - - -float go(int dep, int utilization ) -{ - Hash h(73232313); - - int overloadcutoff = (int)((float)10000.0 / (float)utilization); - - //cout << "util " << utilization << " cutoff " << overloadcutoff << endl; - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - for (int d=0; d ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int 
place = 100000; - int times = place / numpg; - if (!times) times = 1; - - - //cout << "looping " << times << " times" << endl; - - float tavg[10]; - float tvar[10]; - for (int j=0;j<10;j++) { - tvar[j] = 0; - tavg[j] = 0; - } - int tvarnum = 0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - float aslowdown = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.overload.clear(); - - for (int z=0; z cutoff) - overloaded++; - - if (ocount[i] > adjoff) { - adjusted++; - c.overload[i] = (float)target / (float)ocount[i]; - //cout << "setting overload " << i << " to " << c.overload[i] << endl; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - //cout << overloaded << " overloaded" << endl; - overloadsum += (float)overloaded / (float)ndisks; - adjustsum += (float)adjusted / (float)ndisks; - - - - // keep adjusting! - for (int bla=0; bla<5; bla++) { - utime_t t2a = g_clock.now(); - - // second pass - for (int x=xs; x= adjoff) { - numover++; - if (c.overload.count(i) == 0) { - c.overload[i] = 1.0; - adjusted++; - } - //else cout << "(re)adjusting " << i << endl; - c.overload[i] *= (float)target / (float)ocount[i]; - //cout << "setting overload " << i << " to " << c.overload[i] << endl; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - if (!numover) break; - cout << "readjusting" << endl; - } - - utime_t t3a = g_clock.now(); - - for (int x=xs; x cutoff) { - still++; - //c.overload[ocount[i]] = 100.0 / (float)ocount[i]; - if (c.overload.count(i)) cout << "[adjusted] "; - cout << "disk " << i << " has " << ocount[i] << endl; - } - } - //if (still) cout << "overload was " << overloaded << " now " << still << endl; - afteroverloadsum += (float)still / (float)ndisks; - - //cout << "collisions: " << c.collisions << endl; - //cout << "r bumps: " << c.bumps << endl; - - int n = ndisks/10; - float avg[10]; - float var[10]; - for (int i=0;i<10;i++) { - int s 
= n*i; - avg[i] = 0.0; - for (int j=0; j - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - MixedBucket *b = new MixedBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -Bucket *make_random(Crush& c, int wid, int height, int& ndisks) -{ - int w = rand() % (wid-1) + 2; - - if (height == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - int h = rand() % height + 1; - MixedBucket *b = new MixedBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } - -} - - -float go(int dep, int overloadcutoff) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - for (int d=0; dget_id(); - //c.print(cout, root); - } - if (0) { - MixedBucket *b = new MixedBucket(1); - for (int i=0; i<10000; i++) - b->add_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 10000, disks); - Hash h(123); - b->make_primes(h); - root = c.add_bucket(b); - } - //cout << ndisks << " disks" << endl; - - - - // rule - int numrep = 1; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - 
rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 1000000; - int times = place / numpg; - if (!times) times = 1; - - - //cout << "looping " << times << " times" << endl; - - float tvar = 0; - int tvarnum = 0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.overload.clear(); - - for (int z=0; z overloadcutoff) - overloaded++; - - if (ocount[i] > 100+(overloadcutoff-100)/2) { - adjusted++; - c.overload[i] = 100.0 / (float)ocount[i]; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - //cout << overloaded << " overloaded" << endl; - overloadsum += (float)overloaded / (float)ndisks; - adjustsum += (float)adjusted / (float)ndisks; - - - for (int x=xs; x overloadcutoff) { - still++; - //c.overload[ocount[i]] = 100.0 / (float)ocount[i]; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - } - //if (still) cout << "overload was " << overloaded << " now " << still << endl; - afteroverloadsum += (float)still / (float)ndisks; - - //cout << "collisions: " << c.collisions << endl; - //cout << "r bumps: " << c.bumps << endl; - - float avg = 0.0; - for (int i=0; i100; d -= 5) { - float var = go(3,d); - //cout << "## depth = " << d << endl; - //cout << d << "\t" << var << endl; - } -} diff --git a/branches/sage/crush/crush.old/test/sizes.cc b/branches/sage/crush/crush.old/test/sizes.cc deleted file mode 100644 index cc5780218210a..0000000000000 --- a/branches/sage/crush/crush.old/test/sizes.cc +++ /dev/null @@ -1,131 +0,0 @@ - -#include "include/types.h" -#include 
"include/Distribution.h" -#include "osd/OSDMap.h" - - -Distribution file_size_distn; //kb - - -list object_queue; -int max_object_size = 1024*1024*100; //kb - -off_t no; - -int get_object() //kb -{ - if (object_queue.empty()) { - int max = file_size_distn.sample(); - no++; - int filesize = max/2 + (rand() % 100) * max/200 + 1; - //cout << "file " << filesize << endl; - while (filesize > max_object_size) { - object_queue.push_back(max_object_size); - filesize -= max_object_size; - } - object_queue.push_back(filesize); - } - int s = object_queue.front(); - object_queue.pop_front(); - //cout << "object " << s << endl; - return s; -} - -void getdist(vector& v, float& avg, float& var) -{ - avg = 0.0; - for (int i=0; i pgs(n); - off_t did = 0; - - no = 0; - while (did < dist) { - off_t s = get_object(); - pgs[rand()%n] += s; - did += s; - } - while (!object_queue.empty()) - pgs[rand()%n] += get_object(); - - numo = no; - //cout << did/n << endl; - - //for (int i=0; i - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, vector& ocount) -{ - vector v(numrep); - //map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" 
<< h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i wid; - wid.push_back(10); - wid.push_back(2); - - map< int, list > buckets; - root = make_hierarchy(c, wid, buckets, ndisks); - - // add small bucket - vector disks; - for (int i=0; i<3; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 1, disks); - b->make_primes(h); - Bucket *o = buckets[1].back(); - c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - c.add_item(o->get_id(), b->get_id(), b->get_weight()); - - - // rule - int numrep = 6; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - int pg_per = 10000; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - c.print(cout, root); - - place(c, rule, numpg, numrep, ocount); - - for (int i=0; i - -#include -#include -using namespace std; - - -int numrep = 1; - - -double go(int n, int bucket) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - Bucket *b; - vector items; - if (bucket == 0) b = new UniformBucket(1,0,10,items); - if (bucket == 1) b = new TreeBucket(1); - if (bucket == 2) b = new ListBucket(1); - if (bucket == 3) b = new StrawBucket(1); - - for (int d=0; dadd_item(ndisks++, 1); - - //if (!bucket) ((UniformBucket*)b)->make_primes(h); - - root = c.add_bucket(b); - - // rule - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - - int place = 1000000; - - - vector v(numrep); - set out; - map overload; - - utime_t start = g_clock.now(); - - for (int x=1; x <= place; x++) - c.do_rule(rule, x, v, out, overload); - - utime_t end = 
g_clock.now(); - - end -= start; - double el = (double)end; - - //cout << "\t" << ndisks; - - return el; -} - - -int main() -{ - - for (int n=4; n<=50; n += 4) { - cout << n; - for (int b=0; b<4; b++) { - double el = go(n,b); - cout << "\t" << el; - } - cout << endl; - } -} diff --git a/branches/sage/crush/crush.old/test/speed_depth.cc b/branches/sage/crush/crush.old/test/speed_depth.cc deleted file mode 100644 index 32275d16d2b31..0000000000000 --- a/branches/sage/crush/crush.old/test/speed_depth.cc +++ /dev/null @@ -1,174 +0,0 @@ - -#include "../../common/Clock.h" -#include "../crush.h" -using namespace crush; - - -Clock g_clock; - -#include - -#include -#include -using namespace std; - - -int uniform = 10; -int branching = 10; -int buckettype = 0; -int numrep = 1; - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - Bucket *b; - if (buckettype == 0) - b = new TreeBucket(h+1); - else if (buckettype == 1 || buckettype == 2) - b = new ListBucket(h+1); - else if (buckettype == 3) - b = new StrawBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -double go(int dep, int per) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - if (1) { - wid.push_back(uniform); - for (int d=1; d v(numrep); - - utime_t start = g_clock.now(); - - set out; - map overload; - - for (int x=1; x <= place; x++) - c.do_rule(rule, x, v, out, overload); - - utime_t end = g_clock.now(); - - end -= start; - double el = (double)end; - - 
//cout << "\t" << ndisks; - - return el; -} - - -int main() -{ - uniform = branching = 8; - - cout << "// dep\tuniform\tbranch\tndisks" << endl; - - for (int d=2; d<=5; d++) { - cout << d;// << "\t" << branching; - cout << "\t" << uniform; - cout << "\t" << branching; - - int n = 1; - for (int i=0; i - -#include -#include -using namespace std; - - -int branching = 10; -bool linear = false; -int numrep = 1; - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - Bucket *b; - if (linear) - b = new ListBucket(h+1); - else - b = new TreeBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -double go(int s) -{ - int dep = 2; - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - if (1) { - //for (int d=0; d v(numrep); - - utime_t start = g_clock.now(); - - for (int x=1; x <= place; x++) - c.do_rule(rule, x, v); - - utime_t end = g_clock.now(); - - end -= start; - double el = (double)end; - - cout << "\t" << ndisks; - - return el; -} - - -int main() -{ - branching = 8; - - int d = 2; - numrep = 2; - - for (int s = 64; s <= 32768; s *= 8) { - cout << "t"; - linear = false; - double el = go(s, d); - cout << "\t" << el; - - cout << "\tp"; - linear = true; - el = go(s, d); - cout << "\t" << el; - - cout << endl; - } -} diff --git a/branches/sage/crush/crush.old/test/t.cc b/branches/sage/crush/crush.old/test/t.cc deleted file mode 100644 index 0785ef47d6c04..0000000000000 --- a/branches/sage/crush/crush.old/test/t.cc +++ 
/dev/null @@ -1,25 +0,0 @@ - -#include "../../common/Clock.h" -#include "../crush.h" -using namespace crush; - - -Clock g_clock; - -#include - -#include -#include -using namespace std; - - -int branching = 10; -bool linear = false; -int numrep = 1; - -int main() { - - Bucket *b = new UniformBucket(1, 0); - //b = new TreeBucket(1); -} - diff --git a/branches/sage/crush/crush.old/test/testbucket.cc b/branches/sage/crush/crush.old/test/testbucket.cc deleted file mode 100644 index 065721c2c1967..0000000000000 --- a/branches/sage/crush/crush.old/test/testbucket.cc +++ /dev/null @@ -1,61 +0,0 @@ - - -#include "../Bucket.h" -using namespace crush; - -#include -#include -using namespace std; - - -ostream& operator<<(ostream& out, vector& v) -{ - out << "["; - for (int i=0; i ocount(ndisks); - - vector v(numrep); - int nplace = 0; - for (int x=1; x<1000000; x++) { - //cout << H(x) << "\t" << h(x) << endl; - for (int i=0; i -#include -using namespace std; - - -void getdist(vector& v, float& avg, float& var) -{ - avg = 0.0; - for (int i=0; i a(n); - vector b(n); - - for (int i=0; i c(n); - for (int i=0; i -#include - -class CrushWrapper { -public: - struct crush_map *map; - - CrushWrapper() : map(0) {} - ~CrushWrapper() { - if (map) crush_destroy(map); - } - - void create() { - if (map) crush_destroy(map); - map = crush_create(); - } - void finalize() { - assert(map); - crush_finalize(map); - } - - void update_offload_map(std::set& out_osds, - std::map& overload_osds) { - for (int i=0; imax_devices; i++) { - if (out_osds.count(i)) - map->device_offload[i] = 0x10000; - else if (overload_osds.count(i)) - map->device_offload[i] = (int)(0x10000 * overload_osds[i]); // FIXME: reverse? - else - map->device_offload[i] = 0; // normal. 
- } - } - - void do_rule(int rule, int x, vector& out, int maxout, int forcefeed) { - int rawout[maxout]; - - int numrep = crush_do_rule(map, rule, x, rawout, maxout, forcefeed); - - out.resize(numrep); - for (int i=0; imax_buckets, bl); - ::_encode_simple(map->max_rules, bl); - ::_encode_simple(map->max_devices, bl); - - // simple arrays - bl.append((char*)map->device_offload, sizeof(map->device_offload[0]) * map->max_devices); - - // buckets - for (unsigned i=0; imax_buckets; i++) { - __u32 type = 0; - if (map->buckets[i]) type = map->buckets[i]->bucket_type; - ::_encode_simple(type, bl); - if (!type) continue; - - ::_encode_simple(map->buckets[i]->id, bl); - ::_encode_simple(map->buckets[i]->type, bl); - ::_encode_simple(map->buckets[i]->bucket_type, bl); - ::_encode_simple(map->buckets[i]->weight, bl); - ::_encode_simple(map->buckets[i]->size, bl); - for (unsigned j=0; jbuckets[i]->size; j++) - ::_encode_simple(map->buckets[i]->items[j], bl); - - switch (map->buckets[i]->type) { - case CRUSH_BUCKET_UNIFORM: - for (unsigned j=0; jbuckets[i]->size; j++) - ::_encode_simple(((crush_bucket_uniform*)map->buckets[i])->primes[j], bl); - ::_encode_simple(((crush_bucket_uniform*)map->buckets[i])->item_weight, bl); - break; - - case CRUSH_BUCKET_LIST: - for (unsigned j=0; jbuckets[i]->size; j++) { - ::_encode_simple(((crush_bucket_list*)map->buckets[i])->item_weights[j], bl); - ::_encode_simple(((crush_bucket_list*)map->buckets[i])->sum_weights[j], bl); - } - break; - - case CRUSH_BUCKET_TREE: - for (unsigned j=0; jbuckets[i]->size; j++) - ::_encode_simple(((crush_bucket_tree*)map->buckets[i])->node_weights[j], bl); - break; - - case CRUSH_BUCKET_STRAW: - for (unsigned j=0; jbuckets[i]->size; j++) - ::_encode_simple(((crush_bucket_straw*)map->buckets[i])->straws[j], bl); - break; - } - } - - // rules - for (unsigned i=0; imax_rules; i++) { - __u32 yes = map->rules[i] ? 
1:0; - ::_encode_simple(yes, bl); - if (!yes) continue; - - ::_encode_simple(map->rules[i]->len, bl); - for (unsigned j=0; jrules[i]->len; j++) - ::_encode_simple(map->rules[i]->steps[j], bl); - } - } - - void _decode(bufferlist::iterator &blp) { - create(); - ::_decode_simple(map->max_buckets, blp); - ::_decode_simple(map->max_rules, blp); - ::_decode_simple(map->max_devices, blp); - - map->device_offload = (__u32*)malloc(sizeof(map->device_offload[0])*map->max_devices); - blp.copy(sizeof(map->device_offload[0])*map->max_devices, (char*)map->device_offload); - - // buckets - map->buckets = (crush_bucket**)malloc(sizeof(crush_bucket*)*map->max_buckets); - for (unsigned i=0; imax_buckets; i++) { - __u32 type; - ::_decode_simple(type, blp); - if (!type) { - map->buckets[i] = 0; - continue; - } - - int size = 0; - switch (type) { - case CRUSH_BUCKET_UNIFORM: - size = sizeof(crush_bucket_uniform); - break; - case CRUSH_BUCKET_LIST: - size = sizeof(crush_bucket_list); - break; - case CRUSH_BUCKET_TREE: - size = sizeof(crush_bucket_tree); - break; - case CRUSH_BUCKET_STRAW: - size = sizeof(crush_bucket_straw); - break; - default: - assert(0); - } - map->buckets[i] = (crush_bucket*)malloc(size); - memset(map->buckets[i], 0, size); - - ::_decode_simple(map->buckets[i]->id, blp); - ::_decode_simple(map->buckets[i]->type, blp); - ::_decode_simple(map->buckets[i]->bucket_type, blp); - ::_decode_simple(map->buckets[i]->weight, blp); - ::_decode_simple(map->buckets[i]->size, blp); - - map->buckets[i]->items = (__s32*)malloc(sizeof(__s32)*map->buckets[i]->size); - for (unsigned j=0; jbuckets[i]->size; j++) - ::_decode_simple(map->buckets[i]->items[j], blp); - - switch (map->buckets[i]->type) { - case CRUSH_BUCKET_UNIFORM: - ((crush_bucket_uniform*)map->buckets[i])->primes = - (__u32*)malloc(map->buckets[i]->size * sizeof(__u32)); - for (unsigned j=0; jbuckets[i]->size; j++) - ::_decode_simple(((crush_bucket_uniform*)map->buckets[i])->primes[j], blp); - 
::_decode_simple(((crush_bucket_uniform*)map->buckets[i])->item_weight, blp); - break; - - case CRUSH_BUCKET_LIST: - ((crush_bucket_list*)map->buckets[i])->item_weights = - (__u32*)malloc(map->buckets[i]->size * sizeof(__u32)); - ((crush_bucket_list*)map->buckets[i])->sum_weights = - (__u32*)malloc(map->buckets[i]->size * sizeof(__u32)); - - for (unsigned j=0; jbuckets[i]->size; j++) { - ::_decode_simple(((crush_bucket_list*)map->buckets[i])->item_weights[j], blp); - ::_decode_simple(((crush_bucket_list*)map->buckets[i])->sum_weights[j], blp); - } - break; - - case CRUSH_BUCKET_TREE: - ((crush_bucket_tree*)map->buckets[i])->node_weights = - (__u32*)malloc(map->buckets[i]->size * sizeof(__u32)); - for (unsigned j=0; jbuckets[i]->size; j++) - ::_decode_simple(((crush_bucket_tree*)map->buckets[i])->node_weights[j], blp); - break; - - case CRUSH_BUCKET_STRAW: - ((crush_bucket_straw*)map->buckets[i])->straws = - (__u32*)malloc(map->buckets[i]->size * sizeof(__u32)); - for (unsigned j=0; jbuckets[i]->size; j++) - ::_decode_simple(((crush_bucket_straw*)map->buckets[i])->straws[j], blp); - break; - } - } - - // rules - map->rules = (crush_rule**)malloc(sizeof(crush_rule*)*map->max_rules); - for (unsigned i=0; imax_rules; i++) { - __u32 yes; - ::_decode_simple(yes, blp); - if (!yes) { - map->rules[i] = 0; - continue; - } - - map->rules[i] = (crush_rule*)malloc(sizeof(crush_rule)); - memset(map->rules[i], 0, sizeof(crush_rule)); - - ::_decode_simple(map->rules[i]->len, blp); - map->rules[i]->steps = (crush_rule_step*)malloc(sizeof(crush_rule_step) * map->rules[i]->len); - for (unsigned j=0; jrules[i]->len; j++) - ::_decode_simple(map->rules[i]->steps[j], blp); - } - - finalize(); - } -}; - -#endif diff --git a/branches/sage/crush/crush/Makefile b/branches/sage/crush/crush/Makefile deleted file mode 100644 index 72d1b676bdb32..0000000000000 --- a/branches/sage/crush/crush/Makefile +++ /dev/null @@ -1,30 +0,0 @@ - -CC = gcc -CFLAGS = -Wall -CFLAGS += -g -CFLAGS += -O3 -LD = ld 
-RM = rm - -all: depend libcrush.o test - -clean: - rm -f *.o libcrush.o - -%.o: %.c - ${CC} ${CFLAGS} -c $< -o $@ - -libcrush.o: builder.o crush.o mapper.o - $(LD) -i -o $@ $^ - -test: test.c libcrush.o - $(CC) ${CFLAGS} -lm $^ -o $@ - -.depend: - touch .depend - -depend: - $(RM) .depend - makedepend -f- -- $(CFLAGS) -- *.c > .depend 2>/dev/null - -include .depend diff --git a/branches/sage/crush/crush/builder.c b/branches/sage/crush/crush/builder.c deleted file mode 100644 index a430dbd5c6284..0000000000000 --- a/branches/sage/crush/crush/builder.c +++ /dev/null @@ -1,375 +0,0 @@ - -#include -#include -#include -#include - -#include "builder.h" -#include "hash.h" - -struct crush_map *crush_create() -{ - struct crush_map *m; - m = malloc(sizeof(*m)); - memset(m, 0, sizeof(*m)); - return m; -} - -/* - * finalize should be called _after_ all buckets are added to the map. - */ -void crush_finalize(struct crush_map *map) -{ - int b, i, c; - - /* calc max_devices */ - for (b=0; bmax_buckets; b++) { - if (map->buckets[b] == 0) continue; - for (i=0; ibuckets[b]->size; i++) - if (map->buckets[b]->items[i] >= map->max_devices) - map->max_devices = map->buckets[b]->items[i] + 1; - } - - /* allocate arrays */ - map->device_parents = malloc(sizeof(map->device_parents[0]) * map->max_devices); - memset(map->device_parents, 0, sizeof(map->device_parents[0]) * map->max_devices); - map->bucket_parents = malloc(sizeof(map->bucket_parents[0]) * map->max_buckets); - memset(map->bucket_parents, 0, sizeof(map->bucket_parents[0]) * map->max_buckets); - - /* build parent maps */ - for (b=0; bmax_buckets; b++) { - if (map->buckets[b] == 0) continue; - for (i=0; ibuckets[b]->size; i++) { - c = map->buckets[b]->items[i]; - BUG_ON(c >= map->max_devices); - if (c >= 0) - map->device_parents[c] = map->buckets[b]->id; - else - map->bucket_parents[-1-c] = map->buckets[b]->id; - } - } - - /* new device offload map? 
*/ - if (!map->device_offload) { - map->device_offload = malloc(sizeof(map->device_offload[0]) * map->max_devices); - memset(map->device_offload, 0, sizeof(map->device_offload[0]) * map->max_devices); - } -} - - - - - -/** rules **/ - -int crush_add_rule(struct crush_map *map, - int ruleno, - struct crush_rule *rule) -{ - int oldsize; - - if (ruleno < 0) { - for (ruleno=0; ruleno < map->max_rules; ruleno++) - if (map->rules[ruleno] == 0) break; - } - if (ruleno >= map->max_rules) { - /* expand array */ - oldsize = map->max_rules; - map->max_rules = ruleno+1; - map->rules = realloc(map->rules, map->max_rules * sizeof(map->rules[0])); - memset(map->rules + oldsize, 0, (map->max_rules-oldsize) * sizeof(map->rules[0])); - } - - /* add it */ - map->rules[ruleno] = rule; - return ruleno; -} - -struct crush_rule *crush_make_rule() -{ - struct crush_rule *rule; - - rule = malloc(sizeof(struct crush_rule)); - memset(rule, 0, sizeof(*rule)); - return rule; -} - -void crush_rule_add_step(struct crush_rule *rule, int op, int arg1, int arg2) -{ - rule->len++; - if (rule->steps) - rule->steps = realloc(rule->steps, sizeof(rule->steps[0])*rule->len); - else - rule->steps = malloc(sizeof(rule->steps[0])*rule->len); - rule->steps[rule->len-1].op = op; - rule->steps[rule->len-1].arg1 = arg1; - rule->steps[rule->len-1].arg2 = arg2; -} - - -/** buckets **/ - -int crush_add_bucket(struct crush_map *map, - struct crush_bucket *bucket) -{ - int id; - int oldsize; - - /* find a bucket id */ - for (id=0; id < map->max_buckets; id++) - if (map->buckets[id] == 0) break; - if (id == map->max_buckets) { - /* expand array */ - oldsize = map->max_buckets; - if (map->max_buckets) - map->max_buckets *= 2; - else - map->max_buckets = 8; - map->buckets = realloc(map->buckets, map->max_buckets * sizeof(map->buckets[0])); - memset(map->buckets + oldsize, 0, (map->max_buckets-oldsize) * sizeof(map->buckets[0])); - } - - /* add it */ - bucket->id = -1 - id; - map->buckets[id] = bucket; - return -1 - id; 
-} - - -/* uniform bucket */ - -struct crush_bucket_uniform * -crush_make_uniform_bucket(int type, int size, - int *items, - int item_weight) -{ - int i, j, x; - struct crush_bucket_uniform *bucket; - - bucket = malloc(sizeof(*bucket)); - memset(bucket, 0, sizeof(*bucket)); - bucket->h.bucket_type = CRUSH_BUCKET_UNIFORM; - bucket->h.type = type; - bucket->h.size = size; - bucket->h.weight = size * item_weight; - - bucket->item_weight = item_weight; - - bucket->h.items = malloc(sizeof(__u32)*size); - for (i=0; ih.items[i] = items[i]; - - /* generate some primes */ - bucket->primes = malloc(sizeof(__u32)*size); - - x = size + 1; - x += crush_hash32(size) % (3*size); /* make it big */ - x |= 1; /* and odd */ - - i=0; - while (i < size) { - for (j=2; j*j <= x; j++) - if (x % j == 0) break; - if (j*j > x) - bucket->primes[i++] = x; - x += 2; - } - - return bucket; -} - - -/* list bucket */ - -struct crush_bucket_list* -crush_make_list_bucket(int type, int size, - int *items, - int *weights) -{ - int i; - int w; - struct crush_bucket_list *bucket; - - bucket = malloc(sizeof(*bucket)); - memset(bucket, 0, sizeof(*bucket)); - bucket->h.bucket_type = CRUSH_BUCKET_LIST; - bucket->h.type = type; - bucket->h.size = size; - - bucket->h.items = malloc(sizeof(__u32)*size); - bucket->item_weights = malloc(sizeof(__u32)*size); - bucket->sum_weights = malloc(sizeof(__u32)*size); - w = 0; - for (i=size-1; i>=0; i--) { - bucket->h.items[i] = items[i]; - bucket->item_weights[i] = weights[i]; - w += weights[i]; - bucket->sum_weights[i] = w; - /*printf("%d item %d weight %d sum %d\n", - i, items[i], weights[i], bucket->sum_weights[i]);*/ - } - - bucket->h.weight = w; - - return bucket; -} - - -/* tree bucket */ - -static int height(int n) { - int h = 0; - while ((n & 1) == 0) { - h++; - n = n >> 1; - } - return h; -} -static int on_right(int n, int h) { - return n & (1 << (h+1)); -} -static int parent(int n) -{ - int h = height(n); - if (on_right(n, h)) - return n - (1<h.bucket_type = 
CRUSH_BUCKET_TREE; - bucket->h.type = type; - bucket->h.size = size; - - /* calc tree depth */ - depth = 1; - t = size - 1; - while (t) { - t = t >> 1; - depth++; - } - bucket->h.size = 1 << depth; - - bucket->h.items = malloc(sizeof(__u32)*bucket->h.size); - bucket->node_weights = malloc(sizeof(__u32)*bucket->h.size); - - memset(bucket->h.items, 0, sizeof(__u32)*bucket->h.size); - memset(bucket->node_weights, 0, sizeof(__u32)*bucket->h.size); - - for (i=0; ih.items[node] = items[i]; - bucket->node_weights[node] = weights[i]; - bucket->h.weight += weights[i]; - for (j=1; jnode_weights[node] += weights[i]; - } - } - BUG_ON(bucket->node_weights[bucket->h.size/2] != bucket->h.weight); - - return bucket; -} - - -/* straw bucket */ - -struct crush_bucket_straw * -crush_make_straw_bucket(int type, - int size, - int *items, - int *weights) -{ - struct crush_bucket_straw *bucket; - int *reverse; - int i, j, k; - - double straw, wbelow, lastw, wnext, pbelow; - int numleft; - - bucket = malloc(sizeof(*bucket)); - memset(bucket, 0, sizeof(*bucket)); - bucket->h.bucket_type = CRUSH_BUCKET_STRAW; - bucket->h.type = type; - bucket->h.size = size; - - bucket->h.items = malloc(sizeof(__u32)*size); - bucket->straws = malloc(sizeof(__u32)*size); - - bucket->h.weight = 0; - for (i=0; ih.items[i] = items[i]; - bucket->h.weight += weights[i]; - } - - /* reverse sort by weight (simple insertion sort) */ - reverse = malloc(sizeof(int) * size); - reverse[0] = 0; - for (i=1; ij; k--) - reverse[k] = reverse[k-1]; - reverse[j] = i; - break; - } - } - if (j == i) - reverse[i] = i; - } - - numleft = size; - straw = 1.0; - wbelow = 0; - lastw = 0; - - i=0; - while (i < size) { - /* set this item's straw */ - bucket->straws[reverse[i]] = straw * 0x10000; - /*printf("item %d at %d weight %d straw %d (%lf)\n", - items[reverse[i]], - reverse[i], weights[reverse[i]], bucket->straws[reverse[i]], straw);*/ - i++; - if (i == size) break; - - /* same weight as previous? 
*/ - if (weights[reverse[i]] == weights[reverse[i-1]]) { - /*printf("same as previous\n");*/ - continue; - } - - /* adjust straw for next guy */ - wbelow += ((double)weights[reverse[i-1]] - lastw) * numleft; - for (j=i; j -#endif - -#include "crush.h" - -void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) -{ - free(b->primes); - free(b->h.items); - free(b); -} - -void crush_destroy_bucket_list(struct crush_bucket_list *b) -{ - free(b->item_weights); - free(b->sum_weights); - free(b->h.items); - free(b); -} - -void crush_destroy_bucket_tree(struct crush_bucket_tree *b) -{ - free(b->node_weights); - free(b); -} - -void crush_destroy_bucket_straw(struct crush_bucket_straw *b) -{ - free(b->straws); - free(b->h.items); - free(b); -} - - -/* - * deallocate - */ -void crush_destroy(struct crush_map *map) -{ - int b; - - /* buckets */ - for (b=0; bmax_buckets; b++) { - if (map->buckets[b] == 0) continue; - switch (map->buckets[b]->type) { - case CRUSH_BUCKET_UNIFORM: - crush_destroy_bucket_uniform((struct crush_bucket_uniform*)map->buckets[b]); - break; - case CRUSH_BUCKET_LIST: - crush_destroy_bucket_list((struct crush_bucket_list*)map->buckets[b]); - break; - case CRUSH_BUCKET_TREE: - crush_destroy_bucket_tree((struct crush_bucket_tree*)map->buckets[b]); - break; - case CRUSH_BUCKET_STRAW: - crush_destroy_bucket_straw((struct crush_bucket_straw*)map->buckets[b]); - break; - } - } - free(map->buckets); - - /* rules */ - for (b=0; bmax_rules; b++) { - if (map->rules[b] == 0) continue; - if (map->rules[b]->steps) - free(map->rules[b]->steps); - free(map->rules[b]); - } - free(map->rules); - - free(map->bucket_parents); - free(map->device_parents); - free(map->device_offload); - free(map); -} - - diff --git a/branches/sage/crush/crush/crush.h b/branches/sage/crush/crush/crush.h deleted file mode 100644 index 5cf6cff498f13..0000000000000 --- a/branches/sage/crush/crush/crush.h +++ /dev/null @@ -1,117 +0,0 @@ -#ifndef _CRUSH_CRUSH_H -#define _CRUSH_CRUSH_H - 
-#ifdef __cplusplus -extern "C" { -#endif - -#include /* just for int types */ - -#ifndef BUG_ON -# include -# define BUG_ON(x) assert(!(x)) -#endif - - -/*** RULES ***/ -enum { - CRUSH_RULE_TAKE, - CRUSH_RULE_CHOOSE_FIRSTN, - CRUSH_RULE_CHOOSE_INDEP, - CRUSH_RULE_EMIT -}; - -#define CRUSH_MAX_DEPTH 10 -#define CRUSH_MAX_SET 10 - -struct crush_rule_step { - __u32 op; - __s32 arg1; - __s32 arg2; -}; - -struct crush_rule { - __u32 len; - struct crush_rule_step *steps; -}; - - - -/*** BUCKETS ***/ - -enum { - CRUSH_BUCKET_UNIFORM = 1, - CRUSH_BUCKET_LIST = 2, - CRUSH_BUCKET_TREE = 3, - CRUSH_BUCKET_STRAW = 4 -}; - -struct crush_bucket { - __s32 id; /* this'll be negative */ - __u16 type; - __u16 bucket_type; - __u32 weight; /* 16-bit fixed point */ - __u32 size; /* num items */ - __s32 *items; -}; - -struct crush_bucket_uniform { - struct crush_bucket h; - __u32 *primes; - __u32 item_weight; /* 16-bit fixed point */ -}; - -struct crush_bucket_list { - struct crush_bucket h; - __u32 *item_weights; /* 16-bit fixed point */ - __u32 *sum_weights; /* 16-bit fixed point. element i is sum of weights 0..i, inclusive */ -}; - -struct crush_bucket_tree { - struct crush_bucket h; /* note: h.size is tree size, not number of actual items */ - __u32 *node_weights; -}; - -struct crush_bucket_straw { - struct crush_bucket h; - __u32 *straws; /* 16-bit fixed point */ -}; - - - -/*** CRUSH ***/ - -struct crush_map { - struct crush_bucket **buckets; - struct crush_rule **rules; - - /* parent pointers */ - __u32 *bucket_parents; - __u32 *device_parents; - - /* offload - * size max_devices, values 0...0xffff - * 0 == normal - * 0x10000 == 100% offload (i.e. 
failed) - */ - __u32 *device_offload; - - __u32 max_buckets; - __u32 max_rules; - __s32 max_devices; -}; - - -/* common destructors */ -extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *); -extern void crush_destroy_bucket_list(struct crush_bucket_list *); -extern void crush_destroy_bucket_tree(struct crush_bucket_tree *); -extern void crush_destroy_bucket_straw(struct crush_bucket_straw *); -extern void crush_destroy(struct crush_map *map); - - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/branches/sage/crush/crush/mapper.c b/branches/sage/crush/crush/mapper.c deleted file mode 100644 index e0a71f85631ff..0000000000000 --- a/branches/sage/crush/crush/mapper.c +++ /dev/null @@ -1,351 +0,0 @@ - -#include "crush.h" -#include "hash.h" - -#include -#include - -/** bucket choose methods **/ - -/* uniform */ - -static int -crush_bucket_uniform_choose(struct crush_bucket_uniform *bucket, int x, int r) -{ - unsigned o, p, s; - o = crush_hash32_2(x, bucket->h.id) & 0xffff; - p = bucket->primes[crush_hash32_2(bucket->h.id, x) % bucket->h.size]; - s = (x + o + (r+1)*p) % bucket->h.size; - /*printf("%d %d %d %d\n", x, o, r, p);*/ - return bucket->h.items[s]; -} - - -/* list */ - -static int -crush_bucket_list_choose(struct crush_bucket_list *bucket, int x, int r) -{ - int i; - __u64 w; - - for (i=0; ih.size; i++) { - w = crush_hash32_4(x, bucket->h.items[i], r, bucket->h.id); - w &= 0xffff; - /*printf("%d item %d weight %d sum_weight %d r %lld", - i, bucket->h.items[i], bucket->item_weights[i], bucket->sum_weights[i], w);*/ - w *= bucket->sum_weights[i]; - w = w >> 16; - /*printf(" scaled %lld\n", w);*/ - if (w < bucket->item_weights[i]) - return bucket->h.items[i]; - } - - BUG_ON(1); - return 0; -} - - -/* tree */ - -static int height(int n) { - int h = 0; - while ((n & 1) == 0) { - h++; - n = n >> 1; - } - return h; -} -static int left(int x) { - int h = height(x); - return x - (1 << (h-1)); -} -static int right(int x) { - int h = height(x); - 
return x + (1 << (h-1)); -} -static int terminal(int x) { - return x & 1; -} - -static int -crush_bucket_tree_choose(struct crush_bucket_tree *bucket, int x, int r) -{ - int n, l; - __u32 w; - __u64 t; - - /* start at root */ - n = bucket->h.size >> 1; - - while (!terminal(n)) { - /* pick point in [0, w) */ - w = bucket->node_weights[n]; - t = (__u64)crush_hash32_4(x, n, r, bucket->h.id) * (__u64)w; - t = t >> 32; - - /* left or right? */ - l = left(n); - if (t < bucket->node_weights[l]) - n = l; - else - n = right(n); - } - - return bucket->h.items[n]; -} - - -/* straw */ - -static int -crush_bucket_straw_choose(struct crush_bucket_straw *bucket, int x, int r) -{ - int i; - int high = 0; - __u64 high_draw = 0; - __u64 draw; - - for (i=0; ih.size; i++) { - draw = crush_hash32_3(x, bucket->h.items[i], r); - draw &= 0xffff; - draw *= bucket->straws[i]; - if (i == 0 || draw > high_draw) { - high = i; - high_draw = draw; - } - } - - return bucket->h.items[high]; -} - - - - -/** crush proper **/ - - -/* - * choose numrep distinct items of given type - */ -static int crush_choose(struct crush_map *map, - struct crush_bucket *bucket, - int x, int numrep, int type, - int *out, int firstn) -{ - int rep; - int ftotal, flocal; - int retry_descent, retry_bucket, skip_rep; - struct crush_bucket *in = bucket; - int r; - int i; - int item; - int itemtype; - int outpos; - int collide, bad; - - outpos = 0; - - for (rep = 0; rep < numrep; rep++) { - /* keep trying until we get a non-out, non-colliding item */ - ftotal = 0; - skip_rep = 0; - do { - retry_descent = 0; - in = bucket; /* initial bucket */ - - /* choose through intervening buckets */ - flocal = 0; - do { - retry_bucket = 0; - r = rep; - if (in->bucket_type == CRUSH_BUCKET_UNIFORM) { - /* be careful */ - if (firstn || numrep >= in->size) - r += ftotal; /* r' = r + f_total */ - else if (in->size % numrep == 0) - r += (numrep+1) * flocal; /* r'=r+(n+1)*f_local */ - else - r += numrep * flocal; /* r' = r + n*f_local */ - } 
else { - if (firstn) - r += ftotal; /* r' = r + f_total */ - else - r += numrep * flocal; /* r' = r + n*f_local */ - } - - /* bucket choose */ - switch (in->bucket_type) { - case CRUSH_BUCKET_UNIFORM: - item = crush_bucket_uniform_choose((struct crush_bucket_uniform*)in, x, r); - break; - case CRUSH_BUCKET_LIST: - item = crush_bucket_list_choose((struct crush_bucket_list*)in, x, r); - break; - case CRUSH_BUCKET_TREE: - item = crush_bucket_tree_choose((struct crush_bucket_tree*)in, x, r); - break; - case CRUSH_BUCKET_STRAW: - item = crush_bucket_straw_choose((struct crush_bucket_straw*)in, x, r); - break; - default: - BUG_ON(1); - } - - /* desired type? */ - if (item < 0) - itemtype = map->buckets[-1-item]->type; - else - itemtype = 0; - - /* keep going? */ - if (itemtype != type) { - BUG_ON(item >= 0 || (-1-item) >= map->max_buckets); - in = map->buckets[-1-item]; - continue; - } - - /* collision? */ - collide = 0; - for (i=0; idevice_offload[item]) { - if (map->device_offload[item] >= 0x10000) - bad = 1; - else if ((crush_hash32_2(x, item) & 0xffff) < map->device_offload[item]) - bad = 1; - } - - if (bad || collide) { - ftotal++; - flocal++; - - if (collide && flocal < 3) - retry_bucket = 1; /* retry locally a few times */ - else if (ftotal < 10) - retry_descent = 1; /* then retry descent */ - else - skip_rep = 1; /* else give up */ - } - } while (retry_bucket); - } while (retry_descent); - - if (skip_rep) continue; - - out[outpos] = item; - outpos++; - } - - return outpos; -} - - -int crush_do_rule(struct crush_map *map, - int ruleno, - int x, int *result, int result_max, - int forcefeed) /* -1 for none */ -{ - int result_len; - int force_stack[CRUSH_MAX_DEPTH]; - int force_pos = -1; - int a[CRUSH_MAX_SET]; - int b[CRUSH_MAX_SET]; - int *w; - int wsize = 0; - int *o; - int osize; - int *tmp; - struct crush_rule *rule; - int step; - int i; - int numrep; - - rule = map->rules[ruleno]; - result_len = 0; - w = a; - o = b; - - /* determine hierarchical context of 
forcefeed, if any */ - if (forcefeed >= 0) { - if (map->device_parents[forcefeed] == 0) { - /*printf("CRUSH: forcefed device dne\n");*/ - return -1; /* force fed device dne */ - } - while (1) { - force_stack[++force_pos] = forcefeed; - /*printf("force_stack[%d] = %d\n", force_pos, forcefeed);*/ - if (forcefeed >= 0) - forcefeed = map->device_parents[forcefeed]; - else - forcefeed = map->bucket_parents[-1-forcefeed]; - if (forcefeed == 0) break; - } - } - - for (step = 0; step < rule->len; step++) { - switch (rule->steps[step].op) { - case CRUSH_RULE_TAKE: - if (force_pos >= 0) { - w[0] = force_stack[force_pos]; - force_pos--; - BUG_ON(w[0] != rule->steps[step].arg1); - } else { - w[0] = rule->steps[step].arg1; - } - wsize = 1; - break; - - case CRUSH_RULE_CHOOSE_FIRSTN: - case CRUSH_RULE_CHOOSE_INDEP: - BUG_ON(wsize == 0); - - /* reset output */ - osize = 0; - - for (i = 0; i < wsize; i++) { - numrep = rule->steps[step].arg1; - if (force_pos >= 0) { - o[osize++] = force_stack[force_pos]; - force_pos--; - numrep--; - } - if (!numrep) continue; - osize += crush_choose(map, - map->buckets[-1-w[i]], - x, numrep, rule->steps[step].arg2, - o+osize, rule->steps[step].op == CRUSH_RULE_CHOOSE_FIRSTN); - } - - /* swap t and w arrays */ - tmp = o; - o = w; - w = tmp; - wsize = osize; - break; - - - case CRUSH_RULE_EMIT: - for (i=0; i -#include -#include -#include - -#include "crush.h" -#include "mapper.h" -#include "builder.h" - - -int main() -{ - int sub[10]; - int subw[10]; - int i, j; - int d; - int o[100]; - int root; - int ruleno; - int r[10]; - - int uw[10] = { 1000, 1000, 500, 1000, 2000, 1000, 1000, 3000, 1000, 500 }; - - struct crush_bucket *b; - struct crush_rule *rule; - - struct crush_map *map = crush_create(); - - d = 0; - for (i=0; i<10; i++) { - for (j=0; j<10; j++) - o[j] = d++; - b = (struct crush_bucket*)crush_make_uniform_bucket(1, 10, o, uw[i]); - sub[i] = crush_add_bucket(map, b); - subw[i] = b->weight; - printf("make bucket %d weight %d\n", sub[i], 
subw[i]); - } - - root = crush_add_bucket(map, (struct crush_bucket*)crush_make_tree_bucket(2, 10, sub, subw)); - - rule = crush_make_rule(); - crush_rule_add_step(rule, CRUSH_RULE_TAKE, root, 0); - crush_rule_add_step(rule, CRUSH_RULE_CHOOSE_FIRSTN, 3, 1); - crush_rule_add_step(rule, CRUSH_RULE_CHOOSE_FIRSTN, 1, 0); - crush_rule_add_step(rule, CRUSH_RULE_EMIT, 0, 0); - ruleno = crush_add_rule(map, -1, rule); - - crush_finalize(map); - printf("built\n"); - - /* test */ - memset(o, 0, 100*sizeof(o[0])); - for (i=0; i<1000000; i++) { - crush_do_rule(map, ruleno, i, r, 3, -1); - /*printf("%d %d %d\n", r[0], r[1], r[2]);*/ - for (j=0; j<3; j++) - o[r[j]]++; - } - - for (i=0; i<100; i += 10) - printf("%2d : %d\n", i, o[i]); - - return 0; -} diff --git a/branches/sage/crush/doc/bdb.txt b/branches/sage/crush/doc/bdb.txt deleted file mode 100644 index 63e647f5bb3cc..0000000000000 --- a/branches/sage/crush/doc/bdb.txt +++ /dev/null @@ -1,48 +0,0 @@ -OBJECT STORE ON BERKELEY DB ---------------------------- - -OSBDB is an implementation of an object store that uses Berkeley DB as -the underlying storage. It is meant to be an alternative to EBOFS. - -BUILDING --------- - -You will need to have Berkeley DB installed, including the developent -packages. We've tested this with Berkeley DB 4.4.20 on Ubuntu 6.10. - -To compile OSBDB support, you need to pass the argument "want_bdb=yes" -to "make." If you don't specify this, OSBDB and all its associated -support is not included in the executables. - -RUNNING -------- - -To use OSBDB in Ceph, simply pass the --bdbstore flag to programs. You -don't need to create a "device" for OSBDB ahead of time; Berkeley DB -will take care of creating the files. You also *cannot* use a raw -device as your store -- it must be regular file. - -OSBDB additionally accepts the following flags: - - --bdbstore-btree Configures OSBDB to use the "Btree" - database type for Berkeley DB. The default - database type is "Hash". 
- - --bdbstore-hash-ffactor Sets the "fill factor" for the hash - database type. Takes an integer argument. - - --bdbstore-hash-nelem Sets the "nelem" parameter for the hash - database type. Takes an integer argument. - - --bdbstore-hash-pagesize Sets the page size for the hash database - type. Takes an integer argument. - - --bdbstore-cachesize Sets the cache size. Takes an integer - argument, which must be a power of two, and - no less than 20 KiB. - - --bdbstore-transactional Enable (in-memory-only) transactions for - all operations in the OSBDB store. - - --debug-bdbstore Set the debug level. Takes an integer - argument. diff --git a/branches/sage/crush/doc/dentries.txt b/branches/sage/crush/doc/dentries.txt deleted file mode 100644 index ab14765998b2f..0000000000000 --- a/branches/sage/crush/doc/dentries.txt +++ /dev/null @@ -1,4 +0,0 @@ - -null dentires only exist - - on auth - - on replica, if they are xlock \ No newline at end of file diff --git a/branches/sage/crush/doc/file_modes.txt b/branches/sage/crush/doc/file_modes.txt deleted file mode 100644 index d4ceba4034e5f..0000000000000 --- a/branches/sage/crush/doc/file_modes.txt +++ /dev/null @@ -1,66 +0,0 @@ - -underlying client capabilities: - -- read + cache -- read sync -- write sync -- write + buffer - (...potentially eventually augmented by byte ranges) - -whatever system of modes, tokens, etc. has to satisfy the basic -constraint that no conflicting capabilities are ever in the -hands of clients. - - -questions: -- is there any use to clients writing to a replica? - - reading, yes.. 100,000 open same file.. - - ------- - -simplest approach: -- all readers, writers go through authority -- all open, close traffic at replicas forwarded to auth - -- fh state migrates with exports. - - - --------- - -less simple: -- all writers go through authority - - open, close traffic fw -- readers from any replica - - need token from auth -- weird auth <-> replica <-> client interactions ensue! 
- - --------- - -even more complex (and totally FLAWED, ignore this!) - -- clients can open a file with any replica (for read or write). -- replica gets a read or write token from the primary - - primary thus knows if it's all read, all write, mixed, or none. -- once replica has a token it can service as many clients (of given type(s)) as it wants. -- on export, tokens are moved too. - - primary give _itself_ a token too! much simpler. - -- clients maintain a mode for each open file: rdonly, wronly, rdwr, lock -- globally, the mode is controlled by the primary, based on the mixture of - read and write tokens issued - - - -- [optional] if a client has a file open rdwr and the mode is rdonly or wronly, it can - request to read or write from the mds (which might twiddle the mode for performance - reasons.. e.g. lots of ppl rdwr but no actual reading) - - - - --------- - - diff --git a/branches/sage/crush/doc/inos.txt b/branches/sage/crush/doc/inos.txt deleted file mode 100644 index b5ab1db25ca60..0000000000000 --- a/branches/sage/crush/doc/inos.txt +++ /dev/null @@ -1,11 +0,0 @@ - -inodeno_t namespace - - relevant both for ino's, and for the (ino) input for Filer and object storage namespace... - -1 - root inode - -100+mds - mds log/journal -200+mds - mds ino, fh allocation tables -300+mds - mds inode files (for non-embedded inodes) - -1000+ - regular files and directories \ No newline at end of file diff --git a/branches/sage/crush/doc/journal.txt b/branches/sage/crush/doc/journal.txt deleted file mode 100644 index 22cb4fc9e21b2..0000000000000 --- a/branches/sage/crush/doc/journal.txt +++ /dev/null @@ -1,124 +0,0 @@ - - -- LogEvent.replay() is idempotent. we won't know whether the update is old or not. - - - - - - - - - - - - - - - -journal is distributed among different nodes. because authority changes over time, it's not immedicatley clear to a recoverying node relaying the journal whether the data is "real" or not (it might be exported later in the journal). 
- - -possibilities: - - -ONE.. bloat the journal! - -- journal entry includes full trace of dirty data (dentries, inodes) up until import point - - local renames implicit.. cache is reattached on replay - - exports are a list of exported dirs.. which are then dumped - ... - -recovery phase 1 -- each entry includes full trace (inodes + dentries) up until the import point -- cache during recovery is fragmetned/dangling beneath import points -- when export is encountered items are discarded (marked clean) - -recovery phase 2 -- import roots ping store to determine attachment points (if not already known) - - if it was imported during period, attachment point is already known. - - renames affecting imports are logged too -- import roots discovered from other nodes, attached to hierarchy - -then -- maybe resume normal operations -- if recovery is a background process on a takeover mds, "export" everything to that node. - - --> journal contains lots of clean data.. maybe 5+ times bigger as a result! - -possible fixes: - - collect dir traces into journal chunks so they aren't repeated as often - - each chunk summarizes traces in previous chunk - - hopefully next chunk will include many of the same traces - - if not, then the entry will include it - - - - -=== log entry types === -- all inode, dentry, dir items include a dirty flag. -- dirs are implicitly _never_ complete; even if they are, a fetch before commit is necessary to confirm - -ImportPath - log change in import path -Import - log import addition (w/ path, dirino) - -InoAlloc - allocate ino -InoRelease - release ino - -Inode - inode info, along with dentry+inode trace up to import point -Unlink - (null) dentry + trace, + flag (whether inode/dir is destroyed) -Link - (new) dentry + inode + trace - - ------------------------------ - -TWO.. 
-- directories in store contain path at time of commit (relative to import, and root) -- replay without attaching anything to heirarchy -- after replay, directories pinged in store to attach to hierarchy - --> phase 2 too slow! --> and nested dirs may reattach... that won't be apparent from journal. - - put just parent dir+dentry in dir store.. even worse on phase 2! - - -THREE -- - - - - - - - -metadata journal/log - - -event types: - -chown, chmod, utime - InodeUpdate - -mknod, mkdir, symlink - Mknod .. new inode + link - -unlink, rmdir - Unlink - -rename - Link + Unlink (foreign) -or Rename (local) - -link - Link .. link existing inode - - - - -InodeUpdate -DentryLink -DentryUnlink -InodeCreate -InodeDestroy -Mkdir? diff --git a/branches/sage/crush/doc/lazy_posix.txt b/branches/sage/crush/doc/lazy_posix.txt deleted file mode 100644 index 1d226cd03d8e4..0000000000000 --- a/branches/sage/crush/doc/lazy_posix.txt +++ /dev/null @@ -1,53 +0,0 @@ - -http://www.usenix.org/events/fast05/wips/slides/welch.pdf - - - --- STATLITE - statlite(const char *filename, struct statlite *buf); - fstatlite(int fd, struct statlite *buf); - lstatlite(const char *filename, struct statlite *buf); - - * file size, mtime are optionally not guaranteed to be correct - * mask field to specify which fields you need to be correct - - --- READDIR+ - - struct dirent_plus *readdirplus(DIR *dirp); - int readdirplus_r(DIR *dirp, struct dirent_plus *entry, struct dirent_plus **result); - struct dirent_lite *readdirlite(DIR *dirp); - int readdirlite_r(DIR *dirp, struct dirent_lite *entry, struct dirent_lite **result); - - * plus returns lstat - * lite returns lstatlite - - --- lazy i/o integrity - - O_LAZY to open(2) - - * relax data coherency - * writes may not be visible until lazyio_propagate, fsync, close - - lazyio_propagate(int fd, off_t offset, size_t count); - * my writes are safe - - lazyio_synchronize(int fd, off_t offset, size_t count); - * i will see everyone else's propagated writes - 
--- read/write non-serial vectors - - ssize_t readx(int fd, const struct iovec *iov, size_t iov_count, struct xtvec *xtv, size_t xtv_count); - ssize_t writex(int fd, const struct iovec *iov, size_t iov_count, struct xtvec *xtv, size_t xtv_count); - - * like readv/writev, but serial - * - - -int lockg(int fd, int cmd, lgid_t *lgid) - group locks - -int openg(char *path, int mode, fh_t *handle); - portable file handle -int sutoc(fh_t *fh); \ No newline at end of file diff --git a/branches/sage/crush/doc/osd_outline.txt b/branches/sage/crush/doc/osd_outline.txt deleted file mode 100644 index 2c6f3287aac5f..0000000000000 --- a/branches/sage/crush/doc/osd_outline.txt +++ /dev/null @@ -1,37 +0,0 @@ - -intro - -osd cluster map - requirements - desireable properties - (c)rush - -failure detection - distributed ping or heartbeat - central filter, notifier - -design - placement seed, class/superset, groups - -normal operation - reads - writes - -recovery - triggers: failed disk, or total cluster reorganization - - notify - peering - pull - push - clean - -writes during recovery - -graceful data loss + recovery? - - - - - - diff --git a/branches/sage/crush/doc/osd_replication.txt b/branches/sage/crush/doc/osd_replication.txt deleted file mode 100644 index 907d00e2050a2..0000000000000 --- a/branches/sage/crush/doc/osd_replication.txt +++ /dev/null @@ -1,226 +0,0 @@ - - -SOME GENERAL REQUIREMENTS - -- cluster expansion: - - any or all of the replicas may move to new OSDs. - -- cluster map may change frequently - - map change should translate into pending replication/migration - state quickly (or better yet, instantly), so that we could push - through a series of (say, botched) maps quickly and be fine, so long - as the final map is correct. - -- ideally, unordered osd<->osd, client<->osd communication - (mds<->mds, client<->mds communication is ordered, but file i/o - would be too slow that way?) 
- - - - -PRIMARY ONLY PICTURE - -let's completely ignore replication for a while, and see how -complicated the picture needs to be to reliably support cluster expansion. - -typedef __uint64_t version_t; - - -per-Object metadata: -- version #. incremented when an object is modified. - e.g. version_t version; -- on primary, keep list of stray replicas - e.g. map stray_replicas; // osds w/ stray replicas - includes old primary osd(s), until deletion is confirmed. used while rg - is importing. - - -per-RG metadata -- object list. well, a method to fetch it by querying a collection or whatever. -- negative list - e.g. map deleted_objects; - - used to enumerate deleted objects, when in "importing" state. -- a RG "state" (enum/int) - - - - - - -Normal RG state: -- role=primary - clean - i am primary, all is well. no stray copies. i can - discard my negative object list, since my local - object store tells me everything. - - -After a map change: -- new primary - undef - initially; i don't know RG exists. -- old primary - homeless - i was primary, still have unmolested data. new primary is not yet migrating - (presumably it's state=undef.) i need to contact new primary and tell them - this RG exists. - -- new primary - importing - i am migrating data from old primary. keep negative dir entries for deletions. - write locally. proxy reads (force immediately migration). do whole objects - initially (on write, block until i migrate the object). later we can do - sub-object state (where "live" object data is spread across new/old primaries.. -- old primary - exporting - primary is migrating my data. - undef - when it finishes. (i will forget this RG existed.) - - -After a second map change (scenario 1): - as above, if we were clean again. - -After a second map change (scenario 2): - we weren't clean yet. -- new primary - undef - initially (until i learn RG exists) -- old primary - importing - i'm still migrating from old old primary -- old old primary - exporting - ... 
-- old primary -?? importing+exporting - proxy reads as before. continue migrating from old old primary. - - -After a second map change (scenario 3): - we weren't clean yet, and old old primary is also new primary -- new primary (and old old primary) - exporting - change state to importing. be sure to compare object versions, and neg dir - entries (as we always should do, really!). -- old primary - importing - note that the old import source matches new primary, and change - state to exporting, and stop importing. (unlike scenario 2) - --> this approach could mean that a series of fast map changes could - force data to migrate down a "chain" of old primaries to reach the - new one. maybe old primary should go from importing -> exporting, - and pass along old old primary id to new primary such that the - import is a many-to-one thing, instead of one-to-one. version - numbers and neg entries will make it easy to pick out correct versions. - - - -For the importing process on a given RG: - -- metadata for each source - - each source has a state: - 'starting' - don't know anything about source yet. query source! - this probaby induces the source to change from - 'homeless' or something similar to 'exporting'. - 'importing' - i've fetched the source's object list (and neg - object list). i'm busy reading them! these lists - will shrink as the process continues. after i fetch - an object, i will erase it from the source. - (object metadata will include stray copy info - until i confirm that its removed.) - 'finishing' - i've read all my data, and i'm telling the old person - to discard any remaining RG metadata (RG contents - should already be gone) - - unmigrated object list - - migrated but not deleted object list - - stray osd is also listed in per-object MD during this stage - - negative object list - - i can remove these items if i see a newer object version (say, - from a different import source or something). 
- - i can remove any local objects or ignore imported ones if it is - older than deleted version - -- the lists should be sets or otherwise queryable so that while i'm - importing and a real op comes through I can quickly determine if a - given object_id is pending migration etc or if my local store is to - be trusted. - - - - - -SOME CODE BITS - - -typedef __uint64_t version_t; -class Object { - version_t version; - map stray_replicas; -}; - - -class ReplicaGroup { - int enumerate_objects(list& ls); - - int state; - - // for unstable states, - map deleted_objects; // locally - map exporters; // importing from these guys. -}; - -// primary -#define RG_STATE_CLEAN 1 -#define RG_STATE_IMPORTING 2 // pulling data - -// non-primary -#define RG_STATE_HOMELESS 5 // old primary; new primary not yet - // notified; not yet exporting. -#define RG_STATE_EXPORTING 6 // a newer primary is extracting my - // data. - - -struct RGExporter_t { - int import_state; - - set remaining_objects; // remote object list - set stray_objects; // imported but not deleted. - -}; - - - - - ----- -all crap from here on down - - - - -REPLICAS -- - - - - -OSD STATES -- primary, up to date. -- replica, up to date. - -- primary, proxy to old primary (primaries?) - -- replica, not up to date. - - -REPLICATION STUFF - -Per-RG metadata -- primary - - per-replica state: clean, catching up? -- replica - -Per-object metadata -- primary and replica - - version number/mtime - - rg (reverse indexed) -- primary - - replication level and state. - - commited to memory and/or disk, on which replicas (#1, #2, etc.) 
-- replica - - - - - --> \ No newline at end of file diff --git a/branches/sage/crush/doc/shared_write_states_nogo.txt b/branches/sage/crush/doc/shared_write_states_nogo.txt deleted file mode 100644 index f409617d82681..0000000000000 --- a/branches/sage/crush/doc/shared_write_states_nogo.txt +++ /dev/null @@ -1,39 +0,0 @@ - -// stable states // ------auth----- -----replica----- -#define LOCK_SYNC 0 // R . / . . . WB same ... for stat() -#define LOCK_LOCK 1 // R W / RC . . . . . / RC . . . ... for truncate(), fsync() -#define LOCK_RDONLY 2 // R . / RC R . . same -#define LOCK_MIXED 3 // . . / . R W . same -#define LOCK_WRONLY 4 // . . / . . W WB same - -// transition states -#define LOCK_GSYNCR 8 // R . / RC . . . same -#define LOCK_GSYNCMW 9 // . . / RC . . WB same -#define LOCK_GSYNCMW2 9 // . . / RC . . WB same - -#define LOCK_GLOCKSR 5 // R . / RC . . . . . / RC . . . -#define LOCK_GLOCKMW 7 // . . / RC . . . same - -#define LOCK_GRDONLYM 10 // . . / . R . . same -#define LOCK_GRDONLYM2 10 // --- . . / . R . . -#define LOCK_GRDONLYW 11 // . . / . . . . same -#define LOCK_GRDONLYW2 11 // --- . . / . . . . -#define LOCK_GRDONLYS 12 // R . / RC . . . same -#define LOCK_GRDONLYL 13 // R . / RC . . . --- - -#define LOCK_GMIXEDR 14 // R . / . R . . . . / . R . . -#define LOCK_GMIXEDR2 15 // --- . . / . R . . -#define LOCK_GMIXEDW 16 // . . / . . W . same -#define LOCK_GMIXEDW2 16 // --- . . / . . W . -#define LOCK_GMIXEDS 16 // R . / . . . . . . / . . . . -#define LOCK_GMIXEDS2 16 // --- . . / . . . . -#define LOCK_GMIXEDL 17 // R . / . . . . --- - -#define LOCK_GWRONLYR 18 // R . / . . . . same -#define LOCK_GWRONLYR2 18 // --- . . / . . . . -#define LOCK_GWRONLYM 19 // . . / . . . . same -#define LOCK_GWRONLYM2 19 // --- . . / . . . . -#define LOCK_GWRONLYS 20 // R . / . . . WB same -#define LOCK_GWRONLYS2 20 // --- . . / . . . . 
-#define LOCK_GWRONLYL 21 - diff --git a/branches/sage/crush/doc/shutdown.txt b/branches/sage/crush/doc/shutdown.txt deleted file mode 100644 index e5ccde3171004..0000000000000 --- a/branches/sage/crush/doc/shutdown.txt +++ /dev/null @@ -1,13 +0,0 @@ - -- mds0 triggers shutdown by sending a shutdown_start to all nodes. - -- from here on out, all client requests are discarded (unless they are a file close?) - -- each mds checks for outstanding inter-mds transations. e.g imports, discoveries, etc. once they're all done, send a shutdown_ready to mds0 - -- each mds successively disassembles its cache, flushing data to long-term storage, and sending inodeexpires, exporting imported dirs to parent (after they're clean + empty) - -- when the cache is empty, send shutdown_done to mds0 and exit. - -- mds0 exits when all mdss have finished. - diff --git a/branches/sage/crush/dupstore.cc b/branches/sage/crush/dupstore.cc deleted file mode 100644 index d43f935cb50cc..0000000000000 --- a/branches/sage/crush/dupstore.cc +++ /dev/null @@ -1,102 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include "ebofs/Ebofs.h" -#include "osd/FakeStore.h" - - -int dupstore(ObjectStore* src, ObjectStore* dst) -{ - if (src->mount() < 0) return 1; - if (dst->mkfs() < 0) return 1; - if (dst->mount() < 0) return 1; - - // objects - list objects; - src->list_objects(objects); - int num = objects.size(); - cout << num << " objects" << std::endl; - int i = 1; - for (list::iterator p = objects.begin(); p != objects.end(); ++p) { - bufferlist bl; - src->read(*p, 0, 0, bl); - cout << "object " << i++ << "/" << num << " " << *p << " = " << bl.length() << " bytes" << std::endl; - dst->write(*p, 0, bl.length(), bl, 0); - map attrs; - src->getattrs(*p, attrs); - dst->setattrs(*p, attrs); - } - - // collections - list collections; - src->list_collections(collections); - num = collections.size(); - cout << num << " collections" << std::endl; - i = 1; - for (list::iterator p = collections.begin(); - p != collections.end(); - ++p) { - dst->create_collection(*p, 0); - map attrs; - src->collection_getattrs(*p, attrs); - dst->collection_setattrs(*p, attrs); - list o; - src->collection_list(*p, o); - int numo = 0; - for (list::iterator q = o.begin(); q != o.end(); q++) { - dst->collection_add(*p, *q, 0); - numo++; - } - cout << "collection " << i++ << "/" << num << " " << hex << *p << dec << " = " << numo << " objects" << std::endl; - } - - - src->umount(); - dst->umount(); - return 0; -} - -void usage() -{ - cerr << "usage: dup.ebofs (ebofs|fakestore) src (ebofs|fakestore) dst" << std::endl; - exit(0); -} - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - // args - if (args.size() != 4) - usage(); - - ObjectStore *src, *dst; - - if (strcmp(args[0], "ebofs") == 0) - src = new Ebofs(args[1]); - else if (strcmp(args[0], "fakestore") == 0) - src = new FakeStore(args[1]); - else usage(); - - if (strcmp(args[2], "ebofs") == 0) - dst = new Ebofs(args[3]); - else if (strcmp(args[2], "fakestore") == 0) - 
dst = new FakeStore(args[3]); - else usage(); - - return dupstore(src, dst); -} diff --git a/branches/sage/crush/ebofs/BlockDevice.cc b/branches/sage/crush/ebofs/BlockDevice.cc deleted file mode 100644 index 94c108db2612c..0000000000000 --- a/branches/sage/crush/ebofs/BlockDevice.cc +++ /dev/null @@ -1,846 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "BlockDevice.h" - -#include "config.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -#ifndef __CYGWIN__ -#ifndef DARWIN -#include -#endif -#endif - - - -/******************************************* - * biovec - */ - -inline ostream& operator<<(ostream& out, BlockDevice::biovec &bio) -{ - out << "bio("; - if (bio.type == BlockDevice::biovec::IO_READ) out << "rd "; - if (bio.type == BlockDevice::biovec::IO_WRITE) out << "wr "; - out << bio.start << "~" << bio.length; - if (bio.note) out << " " << bio.note; - out << " " << &bio; - out << ")"; - return out; -} - - - -/******************************************* - * ElevatorQueue - */ - -#define dout(x) if (x <= g_conf.debug_bdev) *_dout << dbeginl << g_clock.now() << " bdev(" << dev << ").elevatorq." -#define derr(x) if (x <= g_conf.debug_bdev) *_derr << dbeginl << g_clock.now() << " bdev(" << dev << ").elevatorq." - - -int BlockDevice::ElevatorQueue::dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& block_lock) -{ - // queue empty? 
- assert(!io_map.empty()); - - dout(20) << "dequeue_io el_pos " << el_pos << " dir " << el_dir_forward << dendl; - - // find our position: i >= pos - map::iterator i; - - int tries = 2; - while (tries > 0) { - if (el_dir_forward) { - i = io_map.lower_bound(el_pos); - if (i != io_map.end()) { - break; // not at end. good. - } - } else { - i = io_map.upper_bound(el_pos); - if (i != io_map.begin()) { - i--; // and back down one (to get i <= pos). good. - break; - } - } - - // reverse (or initial startup)? - if (g_conf.bdev_el_bidir || !el_dir_forward) { - // dout(20) << "restart reversing" << dendl; - el_dir_forward = !el_dir_forward; - } - - if (el_dir_forward) { - // forward - el_pos = 0; - - if (g_conf.bdev_el_fw_max_ms) { - el_stop = g_clock.now(); - utime_t max(0, 1000*g_conf.bdev_el_fw_max_ms); // (s,us), convert ms -> us! - el_stop += max; - // dout(20) << "restart forward sweep for " << max << dendl; - } else { - // dout(20) << "restart fowrard sweep" << dendl; - } - } else { - // reverse - el_pos = bdev->get_num_blocks(); - - if (g_conf.bdev_el_bw_max_ms) { - el_stop = g_clock.now(); - utime_t max(0, 1000*g_conf.bdev_el_bw_max_ms); // (s,us), convert ms -> us! - el_stop += max; - // dout(20) << "restart reverse sweep for " << max << dendl; - } else { - // dout(20) << "restart reverse sweep" << dendl; - } - } - - tries--; - } - - assert(tries > 0); // this shouldn't happen if the queue is non-empty. 
- - // get some biovecs - int num_bio = 0; - - dout(20) << "dequeue_io starting with " << i->first << " " << *i->second << dendl; - - // merge contiguous ops - char type = i->second->type; // read or write - int num_iovs = 0; // count eventual iov's for readv/writev - - start = i->first; - length = 0; - - if (el_dir_forward) - el_pos = start; - else - el_pos = i->first + i->second->length; - - // while (contiguous) - while ((( el_dir_forward && el_pos == i->first) || - (!el_dir_forward && el_pos == i->first + i->second->length)) && - type == i->second->type) { - biovec *bio = i->second; - - // allowed? (not already submitted to kernel?) - if (block_lock.intersects(bio->start, bio->length)) { - dout(20) << "dequeue_io " << bio->start << "~" << bio->length - << " intersects block_lock " << block_lock << dendl; - break; // stop, or go with what we've got so far - } - - // add to biols - int nv = bio->bl.buffers().size(); // how many iov's in this bio's bufferlist? - if (num_bio && - num_iovs + nv >= IOV_MAX) break; // to many //g_conf.bdev_iov_max) break; // too many! - num_iovs += nv; - - start = MIN(start, bio->start); - length += bio->length; - - if (el_dir_forward) { - dout(20) << "dequeue_io fw dequeue io at " << el_pos << " " << *i->second << dendl; - biols.push_back(bio); // add at back - } else { - dout(20) << "dequeue_io bw dequeue io at " << el_pos << " " << *i->second << dendl; - biols.push_front(bio); // add at front - } - num_bio++; - - // move elevator pointer - bool at_end = false; - map::iterator prev = i; - if (el_dir_forward) { - el_pos += bio->length; // cont. 
next would start right after us - i++; - if (i == io_map.end()) { - at_end = true; - } - } else { - el_pos -= bio->length; - if (i == io_map.begin()) { - at_end = true; - } else { - i--; - } - } - - // dequeue - io_map.erase(prev); - bio->in_queue = 0; - - if (at_end) break; - } - - return num_bio; -} - - - -/******************************************* - * BarrierQueue - */ -#undef dout -#define dout(x) if (x <= g_conf.debug_bdev) *_dout << dbeginl << g_clock.now() << " bdev(" << dev << ").barrierq." - -void BlockDevice::BarrierQueue::barrier() -{ - if (!qls.empty() && qls.front()->empty()) { - assert(qls.size() == 1); - dout(10) << "barrier not adding new queue, front is empty" << dendl; - } else { - qls.push_back(new ElevatorQueue(bdev, dev)); - dout(10) << "barrier adding new elevator queue (now " << qls.size() << "), front queue has " - << qls.front()->size() << " ios left" << dendl; - } -} - -bool BlockDevice::BarrierQueue::bump() -{ - assert(!qls.empty()); - - // is the front queue empty? - if (qls.front()->empty() && - qls.front() != qls.back()) { - delete qls.front(); - qls.pop_front(); - dout(10) << "dequeue_io front empty, moving to next queue (" << qls.front()->size() << ")" << dendl; - return true; - } - - return false; -} - -int BlockDevice::BarrierQueue::dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& locked) -{ - assert(!qls.empty()); - int n = qls.front()->dequeue_io(biols, start, length, locked); - bump(); // in case we emptied the front queue - return n; -} - - - - -/******************************************* - * BlockDevice - */ - -#undef dout -#define dout(x) if (x <= g_conf.debug_bdev) *_dout << dbeginl << g_clock.now() << " bdev(" << dev << ")." 
- - - -block_t BlockDevice::get_num_blocks() -{ - if (!num_blocks) { - assert(fd > 0); - - int r; -#ifdef BLKGETSIZE64 - // ioctl block device - uint64_t bytes = 0; - r = ioctl(fd, BLKGETSIZE64, &bytes); - num_blocks = bytes / (uint64_t)EBOFS_BLOCK_SIZE; - if (r == 0) { - dout(10) << "get_num_blocks ioctl BLKGETSIZE64 reports " - << num_blocks << " 4k blocks, " - << bytes << " bytes" - << dendl; -#else - // hrm, try the 32 bit ioctl? - unsigned long sectors = 0; - r = ioctl(fd, BLKGETSIZE, §ors); - num_blocks = sectors/8ULL; - bytes = sectors*512ULL; - if (r == 0) { - dout(10) << "get_num_blocks ioctl BLKGETSIZE reports " << sectors << " sectors, " - << num_blocks << " 4k blocks, " << bytes << " bytes" << dendl; -#endif - } else { - // hmm, try stat! - dout(10) << "get_num_blocks ioctl(2) failed with " << errno << " " << strerror(errno) << ", using stat(2)" << dendl; - struct stat st; - fstat(fd, &st); - uint64_t bytes = st.st_size; - num_blocks = bytes / EBOFS_BLOCK_SIZE; - dout(10) << "get_num_blocks stat reports " << num_blocks << " 4k blocks, " << bytes << " bytes" << dendl; - } - - if (g_conf.bdev_fake_mb) { - num_blocks = g_conf.bdev_fake_mb * 256; - dout(0) << "faking dev size " << g_conf.bdev_fake_mb << " mb" << dendl; - } - if (g_conf.bdev_fake_max_mb && - num_blocks > (block_t)g_conf.bdev_fake_max_mb * 256ULL) { - dout(0) << "faking dev size " << g_conf.bdev_fake_max_mb << " mb" << dendl; - num_blocks = g_conf.bdev_fake_max_mb * 256; - } - - } - return num_blocks; -} - - - -/** io thread - * each worker thread dequeues ios from the root_queue and submits them to the kernel. 
- */ -void* BlockDevice::io_thread_entry() -{ - lock.Lock(); - - int whoami = io_threads_started++; - io_threads_running++; - assert(io_threads_running <= g_conf.bdev_iothreads); - dout(10) << "io_thread" << whoami << " start, " << io_threads_running << " now running" << dendl; - - // get my own fd (and file position pointer) - int fd = open_fd(); - assert(fd > 0); - - while (!io_stop) { - if (!root_queue.empty()) { - dout(20) << "io_thread" << whoami << "/" << io_threads_running << " going" << dendl; - - block_t start, length; - list biols; - int n = root_queue.dequeue_io(biols, start, length, io_block_lock); - - if (n == 0) { - // failed to dequeue a do-able op, sleep for now - dout(20) << "io_thread" << whoami << "/" << io_threads_running << " couldn't dequeue doable op, sleeping" << dendl; - assert(io_threads_running > 1); // there must be someone else, if we couldn't dequeue something doable. - } - else { - // lock blocks - assert(start == biols.front()->start); - io_block_lock.insert(start, length); - - // drop lock to do the io - lock.Unlock(); - do_io(fd, biols); - lock.Lock(); - - // unlock blocks - io_block_lock.erase(start, length); - - // someone might have blocked on our block_lock? - if (io_threads_running < g_conf.bdev_iothreads && - (int)root_queue.size() > io_threads_running) - io_wakeup.SignalAll(); - - // loop again (don't sleep) - continue; - } - } - - // sleep - io_threads_running--; - dout(20) << "io_thread" << whoami << " sleeping, " - << io_threads_running << " threads now running," - << " queue has " << root_queue.size() - << dendl; - - // first wait for signal | timeout? - if (g_conf.bdev_idle_kick_after_ms > 0 && - idle_kicker && - io_threads_running == 0 && !is_idle_waiting) { // only the last thread asleep needs to kick. - // sleep, but just briefly. 
- dout(20) << "io_thread" << whoami << " doing short wait, to see if i stay idle" << dendl; - is_idle_waiting = true; - int r = io_wakeup.WaitInterval(lock, utime_t(0, g_conf.bdev_idle_kick_after_ms*1000)); - is_idle_waiting = false; - - if (r == ETIMEDOUT) { - dout(20) << "io_thread" << whoami << " timeout expired, kicking ebofs" << dendl; - kicker_cond.Signal(); // signal kicker thread - } else { - dout(20) << "io_thread" << whoami << " signaled during short sleep, waking up" << dendl; - goto wake_up; - } - } - - // sleeeep - io_wakeup.Wait(lock); // and wait (if condition still holds) - - wake_up: - io_threads_running++; - assert(io_threads_running <= g_conf.bdev_iothreads); - dout(20) << "io_thread" << whoami << "/" << io_threads_running << " woke up, " << io_threads_running << " threads now running" << dendl; - } - - // clean up - ::close(fd); - io_threads_running--; - - lock.Unlock(); - - dout(10) << "io_thread" << whoami << " finish" << dendl; - return 0; -} - - - -/** do_io - * do a single io operation - * (lock is NOT held, but we own the *biovec) - */ -void BlockDevice::do_io(int fd, list& biols) -{ - int r; - assert(!biols.empty()); - - // get full range, type, bl - bufferlist bl; - bl.claim(biols.front()->bl); - block_t start = biols.front()->start; - block_t length = biols.front()->length; - char type = biols.front()->type; - - list::iterator p = biols.begin(); - int numbio = 1; - for (p++; p != biols.end(); p++) { - length += (*p)->length; - bl.claim_append((*p)->bl); - numbio++; - } - - // do it - dout(20) << "do_io start " << (type==biovec::IO_WRITE?"write":"read") - << " " << start << "~" << length - << " " << numbio << " bits" << dendl; - if (type == biovec::IO_WRITE) { - r = _write(fd, start, length, bl); - } else if (type == biovec::IO_READ) { - r = _read(fd, start, length, bl); - } else assert(0); - dout(20) << "do_io finish " << (type==biovec::IO_WRITE?"write":"read") - << " " << start << "~" << length << dendl; - - // set rval - for (p = 
biols.begin(); p != biols.end(); p++) - (*p)->rval = r; - - if (1) { - // put in completion queue - complete_lock.Lock(); - complete_queue.splice( complete_queue.end(), biols ); - complete_queue_len += numbio; - complete_wakeup.Signal(); - complete_lock.Unlock(); - dout(20) << "do_io kicked completer on " << (type==biovec::IO_WRITE?"write":"read") - << " " << start << "~" << length << dendl; - - } else { - // be slow and finish synchronously - for (p = biols.begin(); p != biols.end(); p++) - finish_io(*p); - } -} - - -/** finish_io - * - * finish an io by signaling the cond or performing a callback. - * called by completion thread, unless that's disabled above. - */ -void BlockDevice::finish_io(biovec *bio) -{ - bio->done = true; - if (bio->cond) { - lock.Lock(); // hmm? - bio->cond->Signal(); - lock.Unlock(); - } - else if (bio->cb) { - bio->cb->finish((ioh_t)bio, bio->rval); - delete bio->cb; - delete bio; - } -} - -/*** completion_thread - * handle Cond signals or callbacks for completed ios - */ -void* BlockDevice::complete_thread_entry() -{ - complete_lock.Lock(); - dout(10) << "complete_thread start" << dendl; - - while (!io_stop) { - - while (!complete_queue.empty()) { - list ls; - ls.swap(complete_queue); - dout(10) << "complete_thread grabbed " << complete_queue_len << " biovecs" << dendl; - complete_queue_len = 0; - - complete_lock.Unlock(); - - // finish - for (list::iterator p = ls.begin(); - p != ls.end(); - p++) { - biovec *bio = *p; - dout(20) << "complete_thread finishing " << *bio << dendl; - finish_io(bio); - } - - complete_lock.Lock(); - } - if (io_stop) break; - - dout(25) << "complete_thread sleeping" << dendl; - complete_wakeup.Wait(complete_lock); - } - - dout(10) << "complete_thread finish" << dendl; - complete_lock.Unlock(); - return 0; -} - - -/*** idle kicker thread - * kick ebofs when we're idle. we're a separate thread (yuck) - * because ebofs may be holding it's lock _and_ waiting for us - * to do useful work. 
that rules out io_thread and complete_thread! - */ -void* BlockDevice::kicker_thread_entry() -{ - lock.Lock(); - dout(10) << "kicker_thread start" << dendl; - - while (!io_stop) { - - if (io_threads_running == 0 && idle_kicker) { - dout(25) << "kicker_thread kicking ebofs" << dendl; - lock.Unlock(); - idle_kicker->kick(); - lock.Lock(); - dout(25) << "kicker_thread done kicking ebofs" << dendl; - } - if (io_stop) break; - - dout(25) << "kicker_thread sleeping" << dendl; - kicker_cond.Wait(lock); - } - - dout(10) << "kicker_thread finish" << dendl; - lock.Unlock(); - return 0; -} - - - - -// io queue - -void BlockDevice::_submit_io(biovec *b) -{ - // NOTE: lock must be held - dout(15) << "_submit_io " << *b << dendl; - - // wake up io_thread(s)? - if ((int)root_queue.size() == io_threads_running) - io_wakeup.SignalOne(); - else if ((int)root_queue.size() > io_threads_running) - io_wakeup.SignalAll(); - - // queue - root_queue.submit_io(b); - - /* - // [DEBUG] check for overlapping ios - // BUG: this doesn't detect all overlaps w/ the next queue thing. - if (g_conf.bdev_debug_check_io_overlap) { - // BUG: this doesn't catch everything! eg 1~10000000 will be missed.... - multimap::iterator p = io_queue.lower_bound(b->start); - if ((p != io_queue.end() && - p->first < b->start+b->length) || - (p != io_queue.begin() && - (p--, p->second->start + p->second->length > b->start))) { - dout(1) << "_submit_io new io " << *b - << " overlaps with existing " << *p->second << dendl; - cerr << "_submit_io new io " << *b - << " overlaps with existing " << *p->second << dendl; - } - } - */ - -} - -int BlockDevice::_cancel_io(biovec *bio) -{ - // NOTE: lock must be held - - if (bio->in_queue == 0) { - dout(15) << "_cancel_io " << *bio << " FAILED" << dendl; - return -1; - } else { - dout(15) << "_cancel_io " << *bio << dendl; - bio->in_queue->cancel_io(bio); - if (root_queue.bump()) - io_wakeup.SignalAll(); // something happened! 
- return 0; - } -} - - - -// low level io - -int BlockDevice::_read(int fd, block_t bno, unsigned num, bufferlist& bl) -{ - dout(10) << "_read " << bno << "~" << num << dendl; - - assert(fd > 0); - - off_t offset = bno * EBOFS_BLOCK_SIZE; - off_t actual = lseek(fd, offset, SEEK_SET); - assert(actual == offset); - - size_t len = num*EBOFS_BLOCK_SIZE; - assert(bl.length() >= len); - - struct iovec iov[ bl.buffers().size() ]; - int n = 0; - size_t left = len; - for (list::const_iterator i = bl.buffers().begin(); - i != bl.buffers().end(); - i++) { - assert(i->length() % EBOFS_BLOCK_SIZE == 0); - - iov[n].iov_base = (void*)i->c_str(); - iov[n].iov_len = MIN(left, i->length()); - - left -= iov[n].iov_len; - n++; - if (left == 0) break; - } - - int got = ::readv(fd, iov, n); - assert(got <= (int)len); - - return 0; -} - -int BlockDevice::_write(int fd, unsigned bno, unsigned num, bufferlist& bl) -{ - dout(10) << "_write " << bno << "~" << num << dendl; - - assert(fd > 0); - - while (1) { - off_t offset = (off_t)bno << EBOFS_BLOCK_BITS; - assert((off_t)bno * (off_t)EBOFS_BLOCK_SIZE == offset); - off_t actual = lseek(fd, offset, SEEK_SET); - assert(actual == offset); - - // write buffers - size_t len = num*EBOFS_BLOCK_SIZE; - - struct iovec iov[ bl.buffers().size() ]; - - int n = 0; - size_t left = len; - for (list::const_iterator i = bl.buffers().begin(); - i != bl.buffers().end(); - i++) { - assert(i->length() % EBOFS_BLOCK_SIZE == 0); - - iov[n].iov_base = (void*)i->c_str(); - iov[n].iov_len = MIN(left, i->length()); - - assert((((intptr_t)iov[n].iov_base) & ((intptr_t)4095ULL)) == 0); - assert((iov[n].iov_len & 4095) == 0); - - left -= iov[n].iov_len; - n++; - if (left == 0 || - n == IOV_MAX) break; - } - - int r = ::writev(fd, iov, n); - - if (r < 0) { - dout(1) << "couldn't write bno " << bno << " num " << num - << " (" << len << " bytes) in " << n << " iovs, r=" << r - << " errno " << errno << " " << strerror(errno) << dendl; - dout(1) << "bl is " << bl << dendl; - 
assert(0); - } else if (r < (int)len) { - // hrm, we didn't write _all_ of our data. WTF kind of FS is this? - dout(-1) << "bloody hell, writev only wrote " << r << " of " << len << " bytes, looping" << dendl; - assert(r % 4096 == 0); - int wrote = r / 4096; - bno += wrote; - num -= wrote; - bufferlist tail; - tail.substr_of(bl, r, len-r); - bl.claim(tail); - continue; - } else { - // yay - assert(r == (int)len); - break; - } - } - return 0; -} - - - -// open/close - -int BlockDevice::open_fd() -{ -#ifdef DARWIN - int fd = ::open(dev.c_str(), O_RDWR|O_SYNC, 0); - ::fcntl(fd, F_NOCACHE); - return fd; -#else - return ::open(dev.c_str(), O_RDWR|O_SYNC|O_DIRECT, 0); -#endif -} - -int BlockDevice::open(kicker *idle) -{ - assert(fd == 0); - - // open? - fd = open_fd(); - if (fd < 0) { - dout(1) << "open failed, r = " << fd << " " << strerror(errno) << dendl; - fd = 0; - return -1; - } - - // lock - if (g_conf.bdev_lock) { - int r = ::flock(fd, LOCK_EX|LOCK_NB); - if (r < 0) { - derr(1) << "open " << dev << " failed to get LOCK_EX" << dendl; - return -1; - } - } - - // figure size - block_t b = get_num_blocks(); - if (!b) { - dout(0) << "open can't determine size of device" << dendl; - assert(0); - } - dout(2) << "open " << b << " blocks, " << b*4096 << " bytes" << dendl; - - // start thread - io_threads_started = 0; - io_threads.clear(); - for (int i=0; icreate(); - } - complete_thread.create(); - kicker_thread.create(); - - // idle kicker? 
- idle_kicker = idle; - - return fd; -} - - -/* - * warning: ebofs shoudl drop it's lock before calling close(), - * or else deadlock against the idle kicker - */ -int BlockDevice::close() -{ - assert(fd>0); - - idle_kicker = 0; - - // shut down io thread - dout(10) << "close stopping io+complete threads" << dendl; - lock.Lock(); - complete_lock.Lock(); - io_stop = true; - io_wakeup.SignalAll(); - complete_wakeup.SignalAll(); - kicker_cond.Signal(); - complete_lock.Unlock(); - lock.Unlock(); - - for (int i=0; ijoin(); - delete io_threads[i]; - } - io_threads.clear(); - - complete_thread.join(); - kicker_thread.join(); - - io_stop = false; // in case we start again - - dout(2) << "close " << dendl; - - if (g_conf.bdev_lock) - ::flock(fd, LOCK_UN); - - ::close(fd); - fd = 0; - - return 0; -} - -int BlockDevice::cancel_io(ioh_t ioh) -{ - biovec *pbio = (biovec*)ioh; - - lock.Lock(); - int r = _cancel_io(pbio); - lock.Unlock(); - - // FIXME? - if (r == 0 && pbio->cb) { - //pbio->cb->finish(ioh, 0); - delete pbio->cb; - delete pbio; - } - - return r; -} - diff --git a/branches/sage/crush/ebofs/BlockDevice.h b/branches/sage/crush/ebofs/BlockDevice.h deleted file mode 100644 index 295ea6b55b75f..0000000000000 --- a/branches/sage/crush/ebofs/BlockDevice.h +++ /dev/null @@ -1,351 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __EBOFS_BLOCKDEVICE_H -#define __EBOFS_BLOCKDEVICE_H - -#include "include/buffer.h" -#include "include/interval_set.h" -#include "include/Context.h" -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/Thread.h" - -#include "types.h" - - -typedef void *ioh_t; // opaque handle to an io request. (in actuality, a biovec*) - - -class BlockDevice { - public: - // callback type for io completion notification - class callback { - public: - virtual ~callback() {} - virtual void finish(ioh_t ioh, int rval) = 0; - }; - - // kicker for idle notification - class kicker { - public: - virtual ~kicker() {} - virtual void kick() = 0; - }; - - - /********************************************************/ - - class Queue; - - // io item - // two variants: one with Cond*, one with callback*. - class biovec { - public: - static const char IO_WRITE = 1; - static const char IO_READ = 2; - - char type; - block_t start, length; - bufferlist bl; - callback *cb; - Cond *cond; - int rval; - char *note; - bool done; - - Queue *in_queue; - - biovec(char t, block_t s, block_t l, bufferlist& b, callback *c, char *n=0) : - type(t), start(s), length(l), bl(b), cb(c), cond(0), rval(0), note(n), done(false), in_queue(0) {} - biovec(char t, block_t s, block_t l, bufferlist& b, Cond *c, char *n=0) : - type(t), start(s), length(l), bl(b), cb(0), cond(c), rval(0), note(n), done(false), in_queue(0) {} - }; - friend ostream& operator<<(ostream& out, biovec &bio); - - - /********************************************************/ - - /* - * Queue -- abstract IO queue interface - */ - class Queue { - public: - virtual ~Queue() {} - virtual void submit_io(biovec *b) = 0; - virtual void cancel_io(biovec *b) = 0; - virtual int dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& locked) = 0; - virtual int size() = 0; - virtual bool empty() { return size() == 0; } - }; - - /* - * ElevatorQueue - simple elevator scheduler queue - */ - class ElevatorQueue 
: public Queue { - BlockDevice *bdev; - const char *dev; - map io_map; - bool el_dir_forward; - block_t el_pos; - utime_t el_stop; - - public: - ElevatorQueue(BlockDevice *bd, const char *d) : - bdev(bd), dev(d), - el_dir_forward(false), - el_pos(0) {} - void submit_io(biovec *b) { - b->in_queue = this; - assert(io_map.count(b->start) == 0); - io_map[b->start] = b; - } - void cancel_io(biovec *b) { - assert(b->in_queue == this); - assert(io_map.count(b->start) && - io_map[b->start] == b); - io_map.erase(b->start); - b->in_queue = 0; - } - int dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& locked); - int size() { - return io_map.size(); - } - }; - - /* - * BarrierQueue - lets you specify io "barriers" - * barrier() - force completion of all prior IOs before - * future ios are started. - * bump() - must be called after cancel_io to properly - * detect empty subqueue. - */ - class BarrierQueue : public Queue { - BlockDevice *bdev; - const char *dev; - list qls; - public: - BarrierQueue(BlockDevice *bd, const char *d) : bdev(bd), dev(d) { - barrier(); - } - ~BarrierQueue() { - for (list::iterator p = qls.begin(); - p != qls.end(); - ++p) - delete *p; - qls.clear(); - } - int size() { - // this isn't perfectly accurate. - if (!qls.empty()) - return qls.front()->size(); - return 0; - } - void submit_io(biovec *b) { - assert(!qls.empty()); - qls.back()->submit_io(b); - } - void cancel_io(biovec *b) { - assert(0); // shouldn't happen. - } - int dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& locked); - void barrier(); - bool bump(); - }; - - - private: - string dev; // my device file - int fd; - block_t num_blocks; - - Mutex lock; - - /** the root io queue. - * i current assumeit's a barrier queue,but this can be changed - * with some minor rearchitecting. 
- */ - BarrierQueue root_queue; - - /* io_block_lock - block ranges current dispatched to kernel - * once a bio is dispatched, it cannot be canceled, so an overlapping - * io and be submitted. the overlapping io cannot be dispatched - * to the kernel, however, until the original io finishes, or else - * there will be a race condition. - */ - interval_set io_block_lock; // blocks currently dispatched to kernel - - // io threads - Cond io_wakeup; - bool io_stop; - int io_threads_started, io_threads_running; - bool is_idle_waiting; - - void *io_thread_entry(); - - class IOThread : public Thread { - BlockDevice *dev; - public: - IOThread(BlockDevice *d) : dev(d) {} - void *entry() { return (void*)dev->io_thread_entry(); } - } ; - - vector io_threads; - - // private io interface - int open_fd(); // get an fd (for a thread) - - void _submit_io(biovec *b); - int _cancel_io(biovec *bio); - void do_io(int fd, list& biols); // called by an io thread - - // low level io - int _read(int fd, block_t bno, unsigned num, bufferlist& bl); - int _write(int fd, unsigned bno, unsigned num, bufferlist& bl); - - - // completion callback queue - Mutex complete_lock; - Cond complete_wakeup; - list complete_queue; - int complete_queue_len; - - void finish_io(biovec *bio); - - // complete thread - void *complete_thread_entry(); - class CompleteThread : public Thread { - BlockDevice *dev; - public: - CompleteThread(BlockDevice *d) : dev(d) {} - void *entry() { return (void*)dev->complete_thread_entry(); } - } complete_thread; - - // kicker - kicker *idle_kicker; // not used.. 
- Mutex kicker_lock; - Cond kicker_cond; - void *kicker_thread_entry(); - class KickerThread : public Thread { - BlockDevice *dev; - public: - KickerThread(BlockDevice *d) : dev(d) {} - void *entry() { return (void*)dev->complete_thread_entry(); } - } kicker_thread; - - - - public: - BlockDevice(const char *d) : - dev(d), fd(0), num_blocks(0), - root_queue(this, dev.c_str()), - io_stop(false), io_threads_started(0), io_threads_running(0), is_idle_waiting(false), - complete_queue_len(0), - complete_thread(this), - idle_kicker(0), kicker_thread(this) { } - ~BlockDevice() { - if (fd > 0) close(); - } - - // get size in blocks - block_t get_num_blocks(); - const char *get_device_name() const { return dev.c_str(); } - - // open/close - int open(kicker *idle = 0); - int close(); - - // state stuff - bool is_idle() { - lock.Lock(); - bool idle = (io_threads_running == 0) && root_queue.empty(); - lock.Unlock(); - return idle; - } - void barrier() { - lock.Lock(); - root_queue.barrier(); - lock.Unlock(); - } - - // ** blocking interface ** - - // read - int read(block_t bno, unsigned num, bufferptr& bptr, char *n=0) { - bufferlist bl; - bl.push_back(bptr); - return read(bno, num, bl, n); - } - int read(block_t bno, unsigned num, bufferlist& bl, char *n=0) { - Cond c; - biovec bio(biovec::IO_READ, bno, num, bl, &c, n); - - lock.Lock(); - _submit_io(&bio); - barrier(); // need this, to prevent starvation! - while (!bio.done) - c.Wait(lock); - lock.Unlock(); - return bio.rval; - } - - // write - int write(unsigned bno, unsigned num, bufferptr& bptr, char *n=0) { - bufferlist bl; - bl.push_back(bptr); - return write(bno, num, bl, n); - } - int write(unsigned bno, unsigned num, bufferlist& bl, char *n=0) { - Cond c; - biovec bio(biovec::IO_WRITE, bno, num, bl, &c, n); - - lock.Lock(); - _submit_io(&bio); - barrier(); // need this, to prevent starvation! 
- while (!bio.done) - c.Wait(lock); - lock.Unlock(); - return bio.rval; - } - - // ** non-blocking interface ** - ioh_t read(block_t bno, unsigned num, bufferlist& bl, callback *fin, char *n=0) { - biovec *pbio = new biovec(biovec::IO_READ, bno, num, bl, fin, n); - lock.Lock(); - _submit_io(pbio); - lock.Unlock(); - return (ioh_t)pbio; - } - ioh_t write(block_t bno, unsigned num, bufferlist& bl, callback *fin, char *n=0) { - biovec *pbio = new biovec(biovec::IO_WRITE, bno, num, bl, fin, n); - lock.Lock(); - _submit_io(pbio); - lock.Unlock(); - return (ioh_t)pbio; - } - int cancel_io(ioh_t ioh); - -}; - - - - -#endif diff --git a/branches/sage/crush/ebofs/BufferCache.cc b/branches/sage/crush/ebofs/BufferCache.cc deleted file mode 100644 index b1c98455f8278..0000000000000 --- a/branches/sage/crush/ebofs/BufferCache.cc +++ /dev/null @@ -1,1228 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "BufferCache.h" -#include "Onode.h" - - -/*********** BufferHead **************/ - - -#undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) *_dout << dbeginl << g_clock.now() << " ebofs.bh." - - - - - - -/************ ObjectCache **************/ - - -#undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) *_dout << dbeginl << g_clock.now() << " ebofs.oc." - - - -void ObjectCache::rx_finish(ioh_t ioh, block_t start, block_t length, bufferlist& bl) -{ - list waiters; - - dout(10) << "rx_finish " << start << "~" << length << dendl; - for (map::iterator p = data.lower_bound(start); - p != data.end(); - p++) { - BufferHead *bh = p->second; - dout(10) << "rx_finish ?" 
<< *bh << dendl; - assert(p->first == bh->start()); - - // past? - if (p->first >= start+length) break; - if (bh->end() > start+length) break; // past - - assert(p->first >= start); - assert(bh->end() <= start+length); - - dout(10) << "rx_finish !" << *bh << dendl; - - if (bh->rx_ioh == ioh) - bh->rx_ioh = 0; - - if (bh->is_rx()) { - assert(bh->get_version() == 0); - assert(bh->end() <= start+length); - assert(bh->start() >= start); - dout(10) << "rx_finish rx -> clean on " << *bh << dendl; - bh->data.substr_of(bl, (bh->start()-start)*EBOFS_BLOCK_SIZE, bh->length()*EBOFS_BLOCK_SIZE); - bc->mark_clean(bh); - } - else if (bh->is_partial()) { - dout(10) << "rx_finish partial -> tx on " << *bh << dendl; - - if (1) { - // double-check what block i am - vector exv; - on->map_extents(bh->start(), 1, exv); - assert(exv.size() == 1); - block_t cur_block = exv[0].start; - assert(cur_block == bh->partial_tx_to); - } - - // ok, cancel my low-level partial (since we're still here, and can bh_write ourselves) - bc->cancel_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial_tx_epoch ); - - // apply partial to myself - assert(bh->data.length() == 0); - bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - bh->data.push_back( bp ); - bh->data.copy_in(0, EBOFS_BLOCK_SIZE, bl); - bh->apply_partial(); - - // write "normally" - bc->mark_dirty(bh); - bc->bh_write(on, bh, bh->partial_tx_to);//cur_block); - - // clean up a bit - bh->partial_tx_to = 0; - bh->partial_tx_epoch = 0; - bh->partial.clear(); - } - else { - dout(10) << "rx_finish ignoring status on (dirty|tx|clean) " << *bh << dendl; - assert(bh->is_dirty() || // was overwritten - bh->is_tx() || // was overwritten and queued - bh->is_clean()); // was overwritten, queued, _and_ flushed to disk - } - - // trigger waiters - for (map >::iterator p = bh->waitfor_read.begin(); - p != bh->waitfor_read.end(); - p++) { - assert(p->first >= bh->start() && p->first < bh->end()); - waiters.splice(waiters.begin(), p->second); 
- } - bh->waitfor_read.clear(); - } - - finish_contexts(waiters); -} - - -void ObjectCache::tx_finish(ioh_t ioh, block_t start, block_t length, - version_t version, version_t epoch) -{ - dout(10) << "tx_finish " << start << "~" << length << " v" << version << dendl; - for (map::iterator p = data.lower_bound(start); - p != data.end(); - p++) { - BufferHead *bh = p->second; - dout(30) << "tx_finish ?bh " << *bh << dendl; - assert(p->first == bh->start()); - - // past? - if (p->first >= start+length) { - bh->oc->try_merge_bh_right(p); - break; - } - - if (bh->tx_ioh == ioh) - bh->tx_ioh = 0; - - if (!bh->is_tx()) { - dout(10) << "tx_finish bh not marked tx, skipping" << dendl; - continue; - } - assert(bh->is_tx()); - - if (version == bh->version) { - dout(10) << "tx_finish tx -> clean on " << *bh << dendl; - assert(bh->end() <= start+length); - bh->set_last_flushed(version); - bc->mark_clean(bh); - bh->oc->try_merge_bh_left(p); - } else { - dout(10) << "tx_finish leaving tx, " << bh->version << " > " << version - << " on " << *bh << dendl; - assert(bh->version > version); - } - } -} - - - -/* - * return any bh's that are (partially) in this range that are TX. - */ -int ObjectCache::find_tx(block_t start, block_t len, - list& tx) -{ - map::iterator p = data.lower_bound(start); - - block_t cur = start; - block_t left = len; - - /* don't care about overlap, we want things _fully_ in start~len. - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - */ - - while (left > 0) { - assert(cur+left == start+len); - - // at end? - if (p == data.end()) - break; - - if (p->first <= cur) { - // have it (or part of it) - BufferHead *e = p->second; - - if (e->end() <= start+len && - e->is_tx()) - tx.push_back(e); - - block_t lenfromcur = MIN(e->end() - cur, left); - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; // more? 
- } else if (p->first > cur) { - // gap.. miss - block_t next = p->first; - left -= (next-cur); - cur = next; - continue; - } - else - assert(0); - } - - return 0; -} - - -int ObjectCache::try_map_read(block_t start, block_t len) -{ - map::iterator p = data.lower_bound(start); - - block_t cur = start; - block_t left = len; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - - int num_missing = 0; - - while (left > 0) { - // at end? - if (p == data.end()) { - // rest is a miss. - vector exv; - on->map_extents(cur, - left, // no prefetch here! - exv); - - num_missing += exv.size(); - left = 0; - cur = start+len; - break; - } - - if (p->first <= cur) { - // have it (or part of it) - BufferHead *e = p->second; - - if (e->is_clean() || - e->is_dirty() || - e->is_tx()) { - dout(20) << "try_map_read hit " << *e << dendl; - } - else if (e->is_rx()) { - dout(20) << "try_map_read rx " << *e << dendl; - num_missing++; - } - else if (e->is_partial()) { - dout(-20) << "try_map_read partial " << *e << dendl; - num_missing++; - } - else { - dout(0) << "try_map_read got unexpected " << *e << dendl; - assert(0); - } - - block_t lenfromcur = MIN(e->end() - cur, left); - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; // more? - } else if (p->first > cur) { - // gap.. miss - block_t next = p->first; - vector exv; - on->map_extents(cur, - MIN(next-cur, left), // no prefetch - exv); - - dout(20) << "try_map_read gap of " << p->first-cur << " blocks, " - << exv.size() << " extents" << dendl; - num_missing += exv.size(); - left -= (p->first - cur); - cur = p->first; - continue; // more? - } - else - assert(0); - } - - assert(left == 0); - assert(cur == start+len); - return num_missing; -} - - - - - -/* - * map a range of blocks into buffer_heads. - * - create missing buffer_heads as necessary. 
- * - fragment along disk extent boundaries - */ -int ObjectCache::map_read(block_t start, block_t len, - map& hits, - map& missing, - map& rx, - map& partial) { - - map::iterator p = data.lower_bound(start); - - block_t cur = start; - block_t left = len; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - - while (left > 0) { - // at end? - if (p == data.end()) { - // rest is a miss. - vector exv; - //on->map_extents(cur, left, exv); // we might consider some prefetch here. - on->map_extents(cur, - //MIN(left + g_conf.ebofs_max_prefetch, // prefetch - //on->object_blocks-cur), - left, // no prefetch - exv); - for (unsigned i=0; i 0; i++) { - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( exv[i].length ); - bc->add_bh(n); - missing[cur] = n; - dout(20) << "map_read miss " << left << " left, " << *n << dendl; - cur += MIN(left,exv[i].length); - left -= MIN(left,exv[i].length); - } - assert(left == 0); - assert(cur == start+len); - break; - } - - if (p->first <= cur) { - // have it (or part of it) - BufferHead *e = p->second; - - if (e->is_clean() || - e->is_dirty() || - e->is_tx()) { - hits[cur] = e; // readable! - dout(20) << "map_read hit " << *e << dendl; - bc->touch(e); - } - else if (e->is_rx()) { - rx[cur] = e; // missing, not readable. - dout(20) << "map_read rx " << *e << dendl; - } - else if (e->is_partial()) { - partial[cur] = e; - dout(20) << "map_read partial " << *e << dendl; - } - else { - dout(0) << "map_read ??? got unexpected " << *e << dendl; - assert(0); - } - - block_t lenfromcur = MIN(e->end() - cur, left); - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; // more? - } else if (p->first > cur) { - // gap.. 
miss - block_t next = p->first; - vector exv; - on->map_extents(cur, - //MIN(next-cur, MIN(left + g_conf.ebofs_max_prefetch, // prefetch - // on->object_blocks-cur)), - MIN(next-cur, left), // no prefetch - exv); - - for (unsigned i=0; i0; i++) { - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( exv[i].length ); - bc->add_bh(n); - missing[cur] = n; - cur += MIN(left, n->length()); - left -= MIN(left, n->length()); - dout(20) << "map_read gap " << *n << dendl; - } - continue; // more? - } - else - assert(0); - } - - assert(left == 0); - assert(cur == start+len); - return 0; -} - - -/* - * map a range of pages on an object's buffer cache. - * - * - break up bufferheads that don't fall completely within the range - * - cancel rx ops we obsolete. - * - resubmit rx ops if we split bufferheads - * - * - leave potentially obsoleted tx ops alone (for now) - * - don't worry about disk extent boundaries (yet) - */ -int ObjectCache::map_write(block_t start, block_t len, - map& hits, - version_t super_epoch) -{ - map::iterator p; - - // hack speed up common cases - if (start == 0) { - p = data.begin(); - } else if (start + len == on->object_blocks && len == 1 && !data.empty()) { - // append hack. - p = data.end(); - p--; - if (p->first < start) p++; - } else { - p = data.lower_bound(start); - } - - dout(10) << "map_write " << *on << " " << start << "~" << len << dendl; - // p->first >= start - - block_t cur = start; - block_t left = len; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - - //dump(); - - while (left > 0) { - // max for this bh (bc of (re)alloc on disk) - block_t max = left; - - // based on disk extent boundary ... - vector exv; - on->map_extents(cur, max, exv); - if (exv.size() > 1) - max = exv[0].length; - - dout(10) << "map_write " << cur << "~" << max << dendl; - - // at end? 
- if (p == data.end()) { - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( max ); - bc->add_bh(n); - hits[cur] = n; - left -= max; - cur += max; - continue; - } - - dout(10) << "p is " << *p->second << dendl; - - - if (p->first <= cur) { - BufferHead *bh = p->second; - dout(10) << "map_write bh " << *bh << " intersected" << dendl; - - if (p->first < cur) { - if (cur+max >= p->first+p->second->length()) { - // we want right bit (one splice) - if (bh->is_rx() && bc->bh_cancel_read(bh)) { - BufferHead *right = bc->split(bh, cur); - bc->bh_read(on, bh); // reread left bit - bh = right; - } else if (bh->is_tx() && bh->epoch_modified == super_epoch && bc->bh_cancel_write(bh, super_epoch)) { - BufferHead *right = bc->split(bh, cur); - bc->bh_write(on, bh); // rewrite left bit - bh = right; - } else { - bh = bc->split(bh, cur); // just split it - } - p++; - assert(p->second == bh); - } else { - // we want middle bit (two splices) - if (bh->is_rx() && bc->bh_cancel_read(bh)) { - BufferHead *middle = bc->split(bh, cur); - bc->bh_read(on, bh); // reread left - p++; - assert(p->second == middle); - BufferHead *right = bc->split(middle, cur+max); - bc->bh_read(on, right); // reread right - bh = middle; - } else if (bh->is_tx() && bh->epoch_modified == super_epoch && bc->bh_cancel_write(bh, super_epoch)) { - BufferHead *middle = bc->split(bh, cur); - bc->bh_write(on, bh); // redo left - p++; - assert(p->second == middle); - BufferHead *right = bc->split(middle, cur+max); - bc->bh_write(on, right); // redo right - bh = middle; - } else { - BufferHead *middle = bc->split(bh, cur); - p++; - assert(p->second == middle); - bc->split(middle, cur+max); - bh = middle; - } - } - } else if (p->first == cur) { - if (p->second->length() <= max) { - // whole bufferhead, piece of cake. 
- } else { - // we want left bit (one splice) - if (bh->is_rx() && bc->bh_cancel_read(bh)) { - BufferHead *right = bc->split(bh, cur+max); - bc->bh_read(on, right); // re-rx the right bit - } else if (bh->is_tx() && bh->epoch_modified == super_epoch && bc->bh_cancel_write(bh, super_epoch)) { - BufferHead *right = bc->split(bh, cur+max); - bc->bh_write(on, right); // re-tx the right bit - } else { - bc->split(bh, cur+max); // just split - } - } - } - - // try to cancel tx? - if (bh->is_tx() && bh->epoch_modified == super_epoch) bc->bh_cancel_write(bh, super_epoch); - - // put in our map - hits[cur] = bh; - - // keep going. - block_t lenfromcur = bh->end() - cur; - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; - } else { - // gap! - block_t next = p->first; - block_t glen = MIN(next-cur, max); - dout(10) << "map_write gap " << cur << "~" << glen << dendl; - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( glen ); - bc->add_bh(n); - hits[cur] = n; - - cur += glen; - left -= glen; - continue; // more? - } - } - - assert(left == 0); - assert(cur == start+len); - return 0; -} - -/* don't need this. -int ObjectCache::scan_versions(block_t start, block_t len, - version_t& low, version_t& high) -{ - map::iterator p = data.lower_bound(start); - // p->first >= start - - if (p != data.begin() && p->first > start) { - p--; // might overlap? - if (p->first + p->second->length() <= start) - p++; // doesn't overlap. - } - if (p->first >= start+len) - return -1; // to the right. no hits. - - // start - low = high = p->second->get_version(); - - for (p++; p != data.end(); p++) { - // past? 
- if (p->first >= start+len) break; - - const version_t v = p->second->get_version(); - if (low > v) low = v; - if (high < v) high = v; - } - - return 0; -} -*/ - -void ObjectCache::touch_bottom(block_t bstart, block_t blast) -{ - for (map::iterator p = data.lower_bound(bstart); - p != data.end(); - ++p) { - BufferHead *bh = p->second; - - // don't trim unless it's entirely in our range - if (bh->start() < bstart) continue; - if (bh->end() > blast) break; - - dout(12) << "moving " << *bh << " to bottom of lru" << dendl; - bc->touch_bottom(bh); // move to bottom of lru list - } -} - - -void ObjectCache::truncate(block_t blocks, version_t super_epoch) -{ - dout(7) << "truncate " << object_id - << " " << blocks << " blocks" - << dendl; - - while (!data.empty()) { - block_t bhoff = data.rbegin()->first; - BufferHead *bh = data.rbegin()->second; - - if (bh->end() <= blocks) break; - - bool uncom = on->uncommitted.contains(bh->start(), bh->length()); - dout(10) << "truncate " << *bh << " uncom " << uncom - << " of " << on->uncommitted - << dendl; - - if (bhoff < blocks) { - // we want right bit (one splice) - if (bh->is_rx() && bc->bh_cancel_read(bh)) { - BufferHead *right = bc->split(bh, blocks); - bc->bh_read(on, bh); // reread left bit - bh = right; - } else if (bh->is_tx() && uncom && bh->epoch_modified == super_epoch && bc->bh_cancel_write(bh, super_epoch)) { - BufferHead *right = bc->split(bh, blocks); - bc->bh_write(on, bh); // rewrite left bit - bh = right; - } else { - bh = bc->split(bh, blocks); // just split it - } - // no worries about partials up here, they're always 1 block (and thus never split) - } else { - // whole thing - // cancel any pending/queued io, if possible. 
- if (bh->is_rx()) - bc->bh_cancel_read(bh); - if (bh->is_tx() && uncom && bh->epoch_modified == super_epoch) - bc->bh_cancel_write(bh, super_epoch); - if (bh->shadow_of) { - dout(10) << "truncate " << *bh << " unshadowing " << *bh->shadow_of << dendl; - // shadow - bh->shadow_of->remove_shadow(bh); - if (bh->is_partial()) - bc->cancel_shadow_partial(bh->rx_from.start, bh); - } else { - // normal - if (bh->is_partial() && uncom) - bc->bh_cancel_partial_write(bh); - } - } - - for (map >::iterator p = bh->waitfor_read.begin(); - p != bh->waitfor_read.end(); - p++) { - finish_contexts(p->second, -1); - } - - bc->remove_bh(bh); - delete bh; - } -} - - -void ObjectCache::clone_to(Onode *other) -{ - ObjectCache *ton = 0; - - for (map::iterator p = data.begin(); - p != data.end(); - p++) { - BufferHead *bh = p->second; - dout(10) << "clone_to ? " << *bh << dendl; - if (bh->is_dirty() || bh->is_tx() || bh->is_partial()) { - // dup dirty or tx bh's - if (!ton) - ton = other->get_oc(bc); - BufferHead *nbh = new BufferHead(ton); - nbh->set_start( bh->start() ); - nbh->set_length( bh->length() ); - nbh->data = bh->data; // just copy refs to underlying buffers. - bc->add_bh(nbh); - - if (bh->is_partial()) { - dout(0) << "clone_to PARTIAL FIXME NOT FULLY IMPLEMENTED ******" << dendl; - nbh->partial = bh->partial; - bc->mark_partial(nbh); - // register as shadow_partial - bc->add_shadow_partial(bh->rx_from.start, nbh); - } else { - // clean buffer will shadow - bh->add_shadow(nbh); - bc->mark_clean(nbh); - } - - dout(10) << "clone_to dup " << *bh << " -> " << *nbh << dendl; - } - } -} - - - -BufferHead *ObjectCache::merge_bh_left(BufferHead *left, BufferHead *right) -{ - dout(10) << "merge_bh_left " << *left << " " << *right << dendl; - assert(left->end() == right->start()); - assert(left->is_clean()); - assert(right->is_clean()); - assert(right->get_num_ref() == 0); - - // hrm, is this right? 
- if (right->version > left->version) left->version = right->version; - if (right->last_flushed > left->last_flushed) left->last_flushed = right->last_flushed; - - left->set_length(left->length() + right->length()); - left->data.claim_append(right->data); - - // remove right - remove_bh(right); - bc->lru_rest.lru_remove(right); - delete right; - dout(10) << "merge_bh_left result " << *left << dendl; - return left; -} - -/* wait until this has a user -void ObjectCache::try_merge_bh(BufferHead *bh) -{ - dout(-10) << "try_merge_bh " << *bh << dendl; - - map::iterator p = data.lower_bound(bh->start()); - assert(p->second == bh); - - try_merge_bh_left(p); - try_merge_bh_right(p); -} -*/ - - -void ObjectCache::try_merge_bh_left(map::iterator& p) -{ - BufferHead *bh = p->second; - dout(10) << "try_merge_bh_left " << *bh << dendl; - - // left? - if (p != data.begin()) { - p--; - if (p->second->end() == bh->start() && - p->second->is_clean() && - bh->is_clean() && - bh->get_num_ref() == 0 && - bh->data.buffers().size() < 8 && - p->second->data.buffers().size() < 8) - bh = merge_bh_left(p->second, bh); // yay! - else - p++; // nope. - } -} - -void ObjectCache::try_merge_bh_right(map::iterator& p) -{ - BufferHead *bh = p->second; - dout(10) << "try_merge_bh_right " << *bh << dendl; - - // right? - map::iterator o = p; - p++; - if (p != data.end() && - bh->end() == p->second->start() && - p->second->is_clean() && - bh->is_clean() && - p->second->get_num_ref() == 0 && - bh->data.buffers().size() < 8 && - p->second->data.buffers().size() < 8) { - BufferHead *right = p->second; - p--; - merge_bh_left(bh, right); - } else - p = o; -} - - - -/************** BufferCache ***************/ - -#undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) *_dout << dbeginl << g_clock.now() << " ebofs.bc." 
- - - -BufferHead *BufferCache::split(BufferHead *orig, block_t after) -{ - dout(20) << "split " << *orig << " at " << after << dendl; - - // split off right - BufferHead *right = new BufferHead(orig->get_oc()); - right->set_version(orig->get_version()); - right->epoch_modified = orig->epoch_modified; - right->last_flushed = orig->last_flushed; - right->set_state(orig->get_state()); - - block_t newleftlen = after - orig->start(); - right->set_start( after ); - right->set_length( orig->length() - newleftlen ); - - // shorten left - stat_sub(orig); - orig->set_length( newleftlen ); - stat_add(orig); - - // add right - add_bh(right); - - // adjust rx_from - if (orig->is_rx()) { - right->rx_from = orig->rx_from; - orig->rx_from.length = newleftlen; - right->rx_from.length -= newleftlen; - right->rx_from.start += newleftlen; - } - - // dup shadows - for (set::iterator p = orig->shadows.begin(); - p != orig->shadows.end(); - ++p) - right->add_shadow(*p); - - // split buffers too - bufferlist bl; - bl.claim(orig->data); - if (bl.length()) { - assert(bl.length() == (orig->length()+right->length())*EBOFS_BLOCK_SIZE); - right->data.substr_of(bl, orig->length()*EBOFS_BLOCK_SIZE, right->length()*EBOFS_BLOCK_SIZE); - orig->data.substr_of(bl, 0, orig->length()*EBOFS_BLOCK_SIZE); - } - - // move read waiters - if (!orig->waitfor_read.empty()) { - map >::iterator o, p = orig->waitfor_read.end(); - p--; - while (p != orig->waitfor_read.begin()) { - if (p->first < right->start()) break; - dout(0) << "split moving waiters at block " << p->first << " to right bh" << dendl; - right->waitfor_read[p->first].swap( p->second ); - o = p; - p--; - orig->waitfor_read.erase(o); - } - } - - dout(20) << "split left is " << *orig << dendl; - dout(20) << "split right is " << *right << dendl; - return right; -} - - - - -void BufferCache::bh_read(Onode *on, BufferHead *bh, block_t from) -{ - dout(10) << "bh_read " << *on << " on " << *bh << dendl; - - if (bh->is_missing()) { - mark_rx(bh); - } else 
{ - assert(bh->is_partial()); - } - - // get extent. there should be only one! - vector exv; - on->map_extents(bh->start(), bh->length(), exv); - assert(exv.size() == 1); - Extent ex = exv[0]; - - if (from) { // force behavior, used for reading partials - dout(10) << "bh_read forcing read from block " << from << " (for a partial)" << dendl; - ex.start = from; - ex.length = 1; - } - - // this should be empty!! - assert(bh->rx_ioh == 0); - - dout(20) << "bh_read " << *on << " " << *bh << " from " << ex << dendl; - - C_OC_RxFinish *fin = new C_OC_RxFinish(ebofs_lock, on->oc, - bh->start(), bh->length(), - ex.start); - - //bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), fin->bl); // new buffers! - fin->bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - - bh->rx_ioh = dev.read(ex.start, ex.length, fin->bl, - fin); - bh->rx_from = ex; - on->oc->get(); - -} - -bool BufferCache::bh_cancel_read(BufferHead *bh) -{ - if (bh->rx_ioh && dev.cancel_io(bh->rx_ioh) >= 0) { - dout(10) << "bh_cancel_read on " << *bh << dendl; - bh->rx_ioh = 0; - mark_missing(bh); - int l = bh->oc->put(); - assert(l); - return true; - } - return false; -} - -void BufferCache::bh_write(Onode *on, BufferHead *bh, block_t shouldbe) -{ - dout(10) << "bh_write " << *on << " on " << *bh << " in epoch " << bh->epoch_modified << dendl; - assert(bh->get_version() > 0); - - assert(bh->is_dirty()); - mark_tx(bh); - - // get extents - vector exv; - on->map_extents(bh->start(), bh->length(), exv); - assert(exv.size() == 1); - Extent ex = exv[0]; - - if (shouldbe) - assert(ex.length == 1 && ex.start == shouldbe); - - dout(20) << "bh_write " << *on << " " << *bh << " to " << ex << dendl; - - //assert(bh->tx_ioh == 0); - - assert(bh->get_last_flushed() < bh->get_version()); - - bh->tx_block = ex.start; - bh->tx_ioh = dev.write(ex.start, ex.length, bh->data, - new C_OC_TxFinish(ebofs_lock, on->oc, - bh->start(), bh->length(), - bh->get_version(), - bh->epoch_modified), - "bh_write"); - - 
on->oc->get(); - inc_unflushed( EBOFS_BC_FLUSH_BHWRITE, bh->epoch_modified ); - - /* - // assert: no partials on the same block - // hose any partial on the same block - if (bh->partial_write.count(ex.start)) { - dout(10) << "bh_write hosing parital write on same block " << ex.start << " " << *bh << dendl; - dec_unflushed( bh->partial_write[ex.start].epoch ); - bh->partial_write.erase(ex.start); - } - */ -} - - -bool BufferCache::bh_cancel_write(BufferHead *bh, version_t cur_epoch) -{ - assert(bh->is_tx()); - assert(bh->epoch_modified == cur_epoch); - assert(bh->epoch_modified > 0); - if (bh->tx_ioh && dev.cancel_io(bh->tx_ioh) >= 0) { - dout(10) << "bh_cancel_write on " << *bh << dendl; - bh->tx_ioh = 0; - mark_dirty(bh); - - dec_unflushed( EBOFS_BC_FLUSH_BHWRITE, bh->epoch_modified ); // assert.. this should be the same epoch! - - int l = bh->oc->put(); - assert(l); - return true; - } - return false; -} - -void BufferCache::tx_finish(ObjectCache *oc, - ioh_t ioh, block_t start, block_t length, - version_t version, version_t epoch) -{ - ebofs_lock.Lock(); - - // finish oc - if (oc->put() == 0) { - delete oc; - } else - oc->tx_finish(ioh, start, length, version, epoch); - - // update unflushed counter - assert(get_unflushed(EBOFS_BC_FLUSH_BHWRITE, epoch) > 0); - dec_unflushed(EBOFS_BC_FLUSH_BHWRITE, epoch); - - ebofs_lock.Unlock(); -} - -void BufferCache::rx_finish(ObjectCache *oc, - ioh_t ioh, block_t start, block_t length, - block_t diskstart, - bufferlist& bl) -{ - ebofs_lock.Lock(); - dout(10) << "rx_finish ioh " << ioh << " on " << start << "~" << length - << ", at device block " << diskstart << dendl; - - // oc - if (oc->put() == 0) - delete oc; - else - oc->rx_finish(ioh, start, length, bl); - - // finish any partials? 
- // note: these are partials that were re-written after a commit, - // or for whom the OC was destroyed (eg truncated after a commit) - map >::iterator sp = partial_write.lower_bound(diskstart); - while (sp != partial_write.end()) { - if (sp->first >= diskstart+length) break; - assert(sp->first >= diskstart); - - block_t pblock = sp->first; - map writes; - writes.swap( sp->second ); - - map >::iterator t = sp; - sp++; - partial_write.erase(t); - - for (map::iterator p = writes.begin(); - p != writes.end(); - p++) { - dout(10) << "rx_finish partial from " << pblock << " -> " << p->first - << " for epoch " << p->second.epoch - //<< " (bh.epoch_modified is now " << bh->epoch_modified << ")" - << dendl; - // this had better be a past epoch - //assert(p->epoch == epoch_modified - 1); // ?? - - // make the combined block - bufferlist combined; - bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - combined.push_back( bp ); - combined.copy_in((pblock-diskstart)*EBOFS_BLOCK_SIZE, (pblock-diskstart+1)*EBOFS_BLOCK_SIZE, bl); - BufferHead::apply_partial( combined, p->second.partial ); - - // write it! - dev.write( pblock, 1, combined, - new C_OC_PartialTxFinish( this, p->second.epoch ), - "finish_partials"); - } - } - - // shadow partials? 
- { - list waiters; - map >::iterator sp = shadow_partials.lower_bound(diskstart); - while (sp != shadow_partials.end()) { - if (sp->first >= diskstart+length) break; - assert(sp->first >= diskstart); - - block_t pblock = sp->first; - set ls; - ls.swap( sp->second ); - - map >::iterator t = sp; - sp++; - shadow_partials.erase(t); - - for (set::iterator p = ls.begin(); - p != ls.end(); - ++p) { - BufferHead *bh = *p; - dout(10) << "rx_finish applying shadow_partial for " << pblock - << " to " << *bh << dendl; - bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - bh->data.clear(); - bh->data.push_back( bp ); - bh->data.copy_in((pblock-diskstart)*EBOFS_BLOCK_SIZE, - (pblock-diskstart+1)*EBOFS_BLOCK_SIZE, - bl); - bh->apply_partial(); - bh->set_state(BufferHead::STATE_CLEAN); - - // trigger waiters - for (map >::iterator p = bh->waitfor_read.begin(); - p != bh->waitfor_read.end(); - p++) { - assert(p->first >= bh->start() && p->first < bh->end()); - waiters.splice(waiters.begin(), p->second); - } - bh->waitfor_read.clear(); - } - } - - // kick waiters - finish_contexts(waiters); - } - - // done. 
- ebofs_lock.Unlock(); -} - -void BufferCache::partial_tx_finish(version_t epoch) -{ - ebofs_lock.Lock(); - - dout(10) << "partial_tx_finish in epoch " << epoch << dendl; - - // update unflushed counter - assert(get_unflushed(EBOFS_BC_FLUSH_PARTIAL, epoch) > 0); - dec_unflushed(EBOFS_BC_FLUSH_PARTIAL, epoch); - - ebofs_lock.Unlock(); -} - - - - -void BufferCache::bh_queue_partial_write(Onode *on, BufferHead *bh) -{ - assert(bh->get_version() > 0); - - assert(bh->is_partial()); - assert(bh->length() == 1); - - // get the block no - vector exv; - on->map_extents(bh->start(), bh->length(), exv); - assert(exv.size() == 1); - block_t b = exv[0].start; - assert(exv[0].length == 1); - bh->partial_tx_to = exv[0].start; - bh->partial_tx_epoch = bh->epoch_modified; - - dout(10) << "bh_queue_partial_write " << *on << " on " << *bh << " block " << b << " epoch " << bh->epoch_modified << dendl; - - - // copy map state, queue for this block - assert(bh->rx_from.length == 1); - queue_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial, bh->partial_tx_epoch ); -} - -void BufferCache::bh_cancel_partial_write(BufferHead *bh) -{ - assert(bh->is_partial()); - assert(bh->length() == 1); - - cancel_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial_tx_epoch ); -} - - -void BufferCache::queue_partial(block_t from, block_t to, - map& partial, version_t epoch) -{ - dout(10) << "queue_partial " << from << " -> " << to - << " in epoch " << epoch - << dendl; - - if (partial_write[from].count(to)) { - // this should be in the same epoch. - assert( partial_write[from][to].epoch == epoch); - assert(0); // actually.. no! 
- } else { - inc_unflushed( EBOFS_BC_FLUSH_PARTIAL, epoch ); - } - - partial_write[from][to].partial = partial; - partial_write[from][to].epoch = epoch; -} - -void BufferCache::cancel_partial(block_t from, block_t to, version_t epoch) -{ - assert(partial_write.count(from)); - assert(partial_write[from].count(to)); - assert(partial_write[from][to].epoch == epoch); - - dout(10) << "cancel_partial " << from << " -> " << to - << " (was epoch " << partial_write[from][to].epoch << ")" - << dendl; - - partial_write[from].erase(to); - if (partial_write[from].empty()) - partial_write.erase(from); - - dec_unflushed( EBOFS_BC_FLUSH_PARTIAL, epoch ); -} - - -void BufferCache::add_shadow_partial(block_t from, BufferHead *bh) -{ - dout(10) << "add_shadow_partial from " << from << " " << *bh << dendl; - shadow_partials[from].insert(bh); -} - -void BufferCache::cancel_shadow_partial(block_t from, BufferHead *bh) -{ - dout(10) << "cancel_shadow_partial from " << from << " " << *bh << dendl; - shadow_partials[from].erase(bh); -} diff --git a/branches/sage/crush/ebofs/BufferCache.h b/branches/sage/crush/ebofs/BufferCache.h deleted file mode 100644 index 346a5cc785618..0000000000000 --- a/branches/sage/crush/ebofs/BufferCache.h +++ /dev/null @@ -1,723 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __EBOFS_BUFFERCACHE_H -#define __EBOFS_BUFFERCACHE_H - -#include "include/lru.h" -#include "include/Context.h" - -#include "common/Clock.h" - -#include "types.h" -#include "BlockDevice.h" - -#include "include/interval_set.h" -#include "include/xlist.h" - -class ObjectCache; -class BufferCache; -class Onode; - -class BufferHead : public LRUObject { - public: - /* - * - buffer_heads should always break across disk extent boundaries - * - partial buffer_heads are always 1 block. - */ - const static int STATE_MISSING = 0; // missing; data is on disk, but not loaded. - const static int STATE_CLEAN = 1; // Rw clean - const static int STATE_DIRTY = 2; // RW dirty - const static int STATE_TX = 3; // Rw flushing to disk - const static int STATE_RX = 4; // w reading from disk - const static int STATE_PARTIAL = 5; // reading from disk, + partial content map. always 1 block. - - public: - ObjectCache *oc; - - bufferlist data; - - ioh_t rx_ioh; // - Extent rx_from; - ioh_t tx_ioh; // - block_t tx_block; - block_t partial_tx_to; - version_t partial_tx_epoch; - - map partial; // partial dirty content overlayed onto incoming data - - map< block_t, list > waitfor_read; - - set shadows; // shadow bh's that clone()ed me. 
- BufferHead* shadow_of; - - - private: - int ref; - int state; - - public: - version_t epoch_modified; - - version_t version; // current version in cache - version_t last_flushed; // last version flushed to disk - - Extent object_loc; // block position _in_object_ - - utime_t dirty_stamp; - //xlist::item xlist_dirty; - - bool want_to_expire; // wants to be at bottom of lru - - public: - BufferHead(ObjectCache *o) : - oc(o), //cancellable_ioh(0), tx_epoch(0), - rx_ioh(0), tx_ioh(0), tx_block(0), partial_tx_to(0), partial_tx_epoch(0), - shadow_of(0), - ref(0), state(STATE_MISSING), epoch_modified(0), version(0), last_flushed(0), - //xlist_dirty(this), - want_to_expire(false) - {} - ~BufferHead() { - unpin_shadows(); - } - - ObjectCache *get_oc() { return oc; } - - int get() { - assert(ref >= 0); - if (ref == 0) lru_pin(); - return ++ref; - } - int put() { - assert(ref > 0); - if (ref == 1) lru_unpin(); - --ref; - return ref; - } - int get_num_ref() { return ref; } - - block_t start() { return object_loc.start; } - void set_start(block_t s) { object_loc.start = s; } - block_t length() { return object_loc.length; } - void set_length(block_t l) { object_loc.length = l; } - block_t end() { return start() + length(); } - block_t last() { return end()-1; } - - version_t get_version() { return version; } - void set_version(version_t v) { version = v; } - version_t get_last_flushed() { return last_flushed; } - void set_last_flushed(version_t v) { - if (v <= last_flushed) cout << "last_flushed begin set to " << v << ", was " << last_flushed << std::endl; - assert(v > last_flushed); - last_flushed = v; - } - - utime_t get_dirty_stamp() { return dirty_stamp; } - void set_dirty_stamp(utime_t t) { dirty_stamp = t; } - - void set_state(int s) { - if (s == STATE_PARTIAL || s == STATE_RX || s == STATE_TX) get(); - if (state == STATE_PARTIAL || state == STATE_RX || state == STATE_TX) put(); - - if ((state == STATE_TX && s != STATE_TX) || - (state == STATE_PARTIAL && s != 
STATE_PARTIAL)) - unpin_shadows(); - - state = s; - } - int get_state() { return state; } - - bool is_missing() { return state == STATE_MISSING; } - bool is_dirty() { return state == STATE_DIRTY; } - bool is_clean() { return state == STATE_CLEAN; } - bool is_tx() { return state == STATE_TX; } - bool is_rx() { return state == STATE_RX; } - bool is_partial() { return state == STATE_PARTIAL; } - - //bool is_partial_writes() { return !partial_write.empty(); } - //void finish_partials(); - //void cancel_partials(); - //void queue_partial_write(block_t b); - - void add_shadow(BufferHead *dup) { - shadows.insert(dup); - dup->shadow_of = this; - dup->get(); - } - void remove_shadow(BufferHead *dup) { - shadows.erase(dup); - dup->shadow_of = 0; - dup->put(); - } - void unpin_shadows() { - for (set::iterator p = shadows.begin(); - p != shadows.end(); - ++p) { - //cout << "unpin shadow " << *p << std::endl; - (*p)->shadow_of = 0; - (*p)->put(); - } - shadows.clear(); - } - - void copy_partial_substr(off_t start, off_t end, bufferlist& bl) { - map::iterator i = partial.begin(); - - // skip first bits (fully to left) - while ((i->first + i->second.length() < start) && - i != partial.end()) - i++; - assert(i != partial.end()); - assert(i->first <= start); - - // first - unsigned bhoff = MAX(start, i->first) - i->first; - unsigned bhlen = MIN(end-start, i->second.length()); - bl.substr_of( i->second, bhoff, bhlen ); - - off_t pos = i->first + i->second.length(); - - // have continuous to end? - for (i++; i != partial.end(); i++) { - if (pos >= end) break; - assert(pos == i->first); - - pos = i->first + i->second.length(); - - if (pos <= end) { // this whole frag - bl.append( i->second ); - } else { // partial end - unsigned bhlen = end-start-bl.length(); - bufferlist frag; - frag.substr_of( i->second, 0, bhlen ); - bl.claim_append(frag); - break; // done. 
- } - } - - assert(pos >= end); - assert(bl.length() == (unsigned)(end-start)); - } - - bool have_partial_range(off_t start, off_t end) { - map::iterator i = partial.begin(); - - // skip first bits (fully to left) - while ((i->first + i->second.length() < start) && - i != partial.end()) - i++; - if (i == partial.end()) return false; - - // have start? - if (i->first > start) return false; - off_t pos = i->first + i->second.length(); - - // have continuous to end? - for (i++; i != partial.end(); i++) { - assert(pos <= i->first); - if (pos < i->first) return false; - assert(pos == i->first); - pos = i->first + i->second.length(); - if (pos >= end) break; // gone far enough - } - - if (pos >= end) return true; - return false; - } - - bool partial_is_complete(off_t size) { - return have_partial_range( 0, MIN(size, EBOFS_BLOCK_SIZE) ); - //(off_t)(start()*EBOFS_BLOCK_SIZE), - //MIN( size, (off_t)(end()*EBOFS_BLOCK_SIZE) ) ); - } - void apply_partial() { - apply_partial(data, partial); - partial.clear(); - } - static void apply_partial(bufferlist& bl, map& pm) { - assert(bl.length() == (unsigned)EBOFS_BLOCK_SIZE); - //assert(partial_is_complete()); - //cout << "apply_partial" << std::endl; - for (map::iterator i = pm.begin(); - i != pm.end(); - i++) { - int pos = i->first; - //cout << " frag at opos " << i->first << " bhpos " << pos << " len " << i->second.length() << std::endl; - bl.copy_in(pos, i->second.length(), i->second); - } - pm.clear(); - } - void add_partial(off_t off, bufferlist& p) { - unsigned len = p.length(); - assert(len <= (unsigned)EBOFS_BLOCK_SIZE); - //assert(off >= (off_t)(start()*EBOFS_BLOCK_SIZE)); - //assert(off + len <= (off_t)(end()*EBOFS_BLOCK_SIZE)); - assert(off >= 0); - assert(off + len <= EBOFS_BLOCK_SIZE); - - // trim any existing that overlaps - map::iterator i = partial.begin(); - while (i != partial.end()) { - // is [off,off+len)... - // past i? - if (off >= i->first + i->second.length()) { - i++; - continue; - } - // before i? 
- if (i->first >= off+len) break; - - // does [off,off+len)... - // overlap all of i? - if (off <= i->first && off+len >= i->first + i->second.length()) { - // erase it and move on. - partial.erase(i++); - continue; - } - // overlap tail of i? - if (off > i->first && off+len >= i->first + i->second.length()) { - // shorten i. - bufferlist o; - o.claim( i->second ); - unsigned taillen = off - i->first; - i->second.substr_of(o, 0, taillen); - i++; - continue; - } - // overlap head of i? - if (off <= i->first && off+len < i->first + i->second.length()) { - // move i (make new tail). - off_t tailoff = off+len; - unsigned trim = tailoff - i->first; - partial[tailoff].substr_of(i->second, trim, i->second.length()-trim); - partial.erase(i++); // should now be at tailoff - i++; - continue; - } - // split i? - if (off > i->first && off+len < i->first + i->second.length()) { - bufferlist o; - o.claim( i->second ); - // shorten head - unsigned headlen = off - i->first; - i->second.substr_of(o, 0, headlen); - // new tail - unsigned tailoff = off+len - i->first; - unsigned taillen = o.length() - len - headlen; - partial[off+len].substr_of(o, tailoff, taillen); - break; - } - assert(0); - } - - // insert - partial[off] = p; - } - -}; - -inline ostream& operator<<(ostream& out, BufferHead& bh) -{ - out << "bufferhead(" << bh.start() << "~" << bh.length(); - out << " v" << bh.get_version() << "/" << bh.get_last_flushed(); - if (bh.is_missing()) out << " missing"; - if (bh.is_dirty()) out << " dirty"; - if (bh.is_clean()) out << " clean"; - if (bh.is_rx()) out << " rx"; - if (bh.is_tx()) out << " tx"; - if (bh.is_partial()) out << " partial"; - - // include epoch modified? 
- if (bh.is_dirty() || bh.is_tx() || bh.is_partial()) - out << "(e" << bh.epoch_modified << ")"; - - //out << " " << bh.data.length(); - out << " " << &bh; - out << ")"; - return out; -} - - -class ObjectCache { - public: - object_t object_id; - Onode *on; - BufferCache *bc; - - private: - map data; - int ref; - - public: - version_t write_count; - - - public: - ObjectCache(object_t o, Onode *_on, BufferCache *b) : - object_id(o), on(_on), bc(b), ref(0), - write_count(0) { } - ~ObjectCache() { - assert(data.empty()); - assert(ref == 0); - } - - int get() { - ++ref; - //cout << "oc.get " << object_id << " " << ref << std::endl; - return ref; - } - int put() { - assert(ref > 0); - --ref; - //cout << "oc.put " << object_id << " " << ref << std::endl; - return ref; - } - - object_t get_object_id() { return object_id; } - - void add_bh(BufferHead *bh) { - // add to my map - assert(data.count(bh->start()) == 0); - - if (0) { // sanity check FIXME DEBUG - //cout << "add_bh " << bh->start() << "~" << bh->length() << std::endl; - map::iterator p = data.lower_bound(bh->start()); - if (p != data.end()) { - //cout << " after " << *p->second << std::endl; - //cout << " after starts at " << p->first << std::endl; - assert(p->first >= bh->end()); - } - if (p != data.begin()) { - p--; - //cout << " before starts at " << p->second->start() - //<< " and ends at " << p->second->end() << std::endl; - //cout << " before " << *p->second << std::endl; - assert(p->second->end() <= bh->start()); - } - } - - data[bh->start()] = bh; - } - void remove_bh(BufferHead *bh) { - assert(data.count(bh->start())); - data.erase(bh->start()); - } - bool is_empty() { return data.empty(); } - - void try_merge_bh(BufferHead *bh); - void try_merge_bh_left(map::iterator& p); - void try_merge_bh_right(map::iterator& p); - BufferHead* merge_bh_left(BufferHead *left, BufferHead *right); - - int find_tx(block_t start, block_t len, - list& tx); - - int map_read(block_t start, block_t len, - map& hits, // hits - 
map& missing, // read these from disk - map& rx, // wait for these to finish reading from disk - map& partial); // (maybe) wait for these to read from disk - int try_map_read(block_t start, block_t len); // just tell us how many extents we're missing. - - - int map_write(block_t start, block_t len, - map& hits, - version_t super_epoch); // can write to these. - void touch_bottom(block_t bstart, block_t blast); - - BufferHead *split(BufferHead *bh, block_t off); - - /*int scan_versions(block_t start, block_t len, - version_t& low, version_t& high); - */ - - void rx_finish(ioh_t ioh, block_t start, block_t length, bufferlist& bl); - void tx_finish(ioh_t ioh, block_t start, block_t length, version_t v, version_t epoch); - - void truncate(block_t blocks, version_t super_epoch); - // void tear_down(); - - void clone_to(Onode *other); - - void dump() { - for (map::iterator i = data.begin(); - i != data.end(); - i++) - cout << "dump: " << i->first << ": " << *i->second << std::endl; - } - -}; - - - -class BufferCache { - public: - Mutex &ebofs_lock; // hack: this is a ref to global ebofs_lock - BlockDevice &dev; - - //xlist dirty_bh; - - LRU lru_dirty, lru_rest; - - private: - Cond stat_cond; - Cond flush_cond; - int stat_waiter; - - off_t stat_clean; - off_t stat_dirty; - off_t stat_rx; - off_t stat_tx; - off_t stat_partial; - off_t stat_missing; - -#define EBOFS_BC_FLUSH_BHWRITE 0 -#define EBOFS_BC_FLUSH_PARTIAL 1 - - map epoch_unflushed[2]; - - /* partial writes - incomplete blocks that can't be written until - * their prior content is read and overlayed with the new data. - * - * we put partial block management here because objects may be deleted - * before the read completes, but the write may have been committed in a - * prior epoch. - * - * we map: src block -> dest block -> PartialWrite - * - * really, at most there will only ever be two of these, for current+previous epochs. 
- */ - class PartialWrite { - public: - map partial; // partial dirty content overlayed onto incoming data - version_t epoch; - }; - - map > partial_write; // queued writes w/ partial content - map > shadow_partials; - - public: - BufferCache(BlockDevice& d, Mutex& el) : - ebofs_lock(el), dev(d), - stat_waiter(0), - stat_clean(0), stat_dirty(0), stat_rx(0), stat_tx(0), stat_partial(0), stat_missing(0) - {} - - - off_t get_size() { - return stat_clean+stat_dirty+stat_rx+stat_tx+stat_partial; - } - off_t get_trimmable() { - return stat_clean; - } - - - // bh's in cache - void add_bh(BufferHead *bh) { - bh->get_oc()->add_bh(bh); - if (bh->is_dirty()) { - lru_dirty.lru_insert_mid(bh); - //dirty_bh.push_back(&bh->xlist_dirty); - } else - lru_rest.lru_insert_mid(bh); - stat_add(bh); - } - void touch(BufferHead *bh) { - if (bh->is_dirty()) { - lru_dirty.lru_touch(bh); - } else - lru_rest.lru_touch(bh); - } - void touch_bottom(BufferHead *bh) { - if (bh->is_dirty()) { - bh->want_to_expire = true; - lru_dirty.lru_bottouch(bh); - } else - lru_rest.lru_bottouch(bh); - } - void remove_bh(BufferHead *bh) { - bh->get_oc()->remove_bh(bh); - stat_sub(bh); - if (bh->is_dirty()) { - lru_dirty.lru_remove(bh); - //dirty_bh.push_back(&bh->xlist_dirty); - } else - lru_rest.lru_remove(bh); - } - - // stats - void stat_add(BufferHead *bh) { - switch (bh->get_state()) { - case BufferHead::STATE_MISSING: stat_missing += bh->length(); break; - case BufferHead::STATE_CLEAN: stat_clean += bh->length(); break; - case BufferHead::STATE_DIRTY: stat_dirty += bh->length(); break; - case BufferHead::STATE_TX: stat_tx += bh->length(); break; - case BufferHead::STATE_RX: stat_rx += bh->length(); break; - case BufferHead::STATE_PARTIAL: stat_partial += bh->length(); break; - } - if (stat_waiter) stat_cond.Signal(); - } - void stat_sub(BufferHead *bh) { - switch (bh->get_state()) { - case BufferHead::STATE_MISSING: stat_missing -= bh->length(); break; - case BufferHead::STATE_CLEAN: stat_clean -= 
bh->length(); break; - case BufferHead::STATE_DIRTY: stat_dirty -= bh->length(); break; - case BufferHead::STATE_TX: stat_tx -= bh->length(); break; - case BufferHead::STATE_RX: stat_rx -= bh->length(); break; - case BufferHead::STATE_PARTIAL: stat_partial -= bh->length(); break; - } - } - off_t get_stat_tx() { return stat_tx; } - off_t get_stat_rx() { return stat_rx; } - off_t get_stat_dirty() { return stat_dirty; } - off_t get_stat_clean() { return stat_clean; } - off_t get_stat_partial() { return stat_partial; } - - - map &get_unflushed(int what) { - return epoch_unflushed[what]; - } - - int get_unflushed(int what, version_t epoch) { - return epoch_unflushed[what][epoch]; - } - void inc_unflushed(int what, version_t epoch) { - epoch_unflushed[what][epoch]++; - //cout << "inc_unflushed " << epoch << " now " << epoch_unflushed[epoch] << std::endl; - } - void dec_unflushed(int what, version_t epoch) { - epoch_unflushed[what][epoch]--; - //cout << "dec_unflushed " << epoch << " now " << epoch_unflushed[epoch] << std::endl; - if (epoch_unflushed[what][epoch] == 0) - flush_cond.Signal(); - } - - void waitfor_stat() { - stat_waiter++; - stat_cond.Wait(ebofs_lock); - stat_waiter--; - } - void waitfor_flush() { - flush_cond.Wait(ebofs_lock); - } - - - // bh state - void set_state(BufferHead *bh, int s) { - // move between lru lists? 
- if (s == BufferHead::STATE_DIRTY && bh->get_state() != BufferHead::STATE_DIRTY) { - lru_rest.lru_remove(bh); - lru_dirty.lru_insert_top(bh); - //dirty_bh.push_back(&bh->xlist_dirty); - } - if (s != BufferHead::STATE_DIRTY && bh->get_state() == BufferHead::STATE_DIRTY) { - lru_dirty.lru_remove(bh); - if (bh->want_to_expire) - lru_rest.lru_insert_bot(bh); - else - lru_rest.lru_insert_mid(bh); - //dirty_bh.remove(&bh->xlist_dirty); - } - - // set state - stat_sub(bh); - bh->set_state(s); - stat_add(bh); - } - - void copy_state(BufferHead *bh1, BufferHead *bh2) { - set_state(bh2, bh1->get_state()); - } - - void mark_missing(BufferHead *bh) { set_state(bh, BufferHead::STATE_MISSING); }; - void mark_clean(BufferHead *bh) { set_state(bh, BufferHead::STATE_CLEAN); }; - void mark_rx(BufferHead *bh) { set_state(bh, BufferHead::STATE_RX); }; - void mark_partial(BufferHead *bh) { set_state(bh, BufferHead::STATE_PARTIAL); }; - void mark_tx(BufferHead *bh) { set_state(bh, BufferHead::STATE_TX); }; - void mark_dirty(BufferHead *bh) { - set_state(bh, BufferHead::STATE_DIRTY); - bh->set_dirty_stamp(g_clock.now()); - }; - - - // io - void bh_read(Onode *on, BufferHead *bh, block_t from=0); - void bh_write(Onode *on, BufferHead *bh, block_t shouldbe=0); - - bool bh_cancel_read(BufferHead *bh); - bool bh_cancel_write(BufferHead *bh, version_t cur_epoch); - - void bh_queue_partial_write(Onode *on, BufferHead *bh); - void bh_cancel_partial_write(BufferHead *bh); - - void queue_partial(block_t from, block_t to, map& partial, version_t epoch); - void cancel_partial(block_t from, block_t to, version_t epoch); - - void add_shadow_partial(block_t from, BufferHead *bh); - void cancel_shadow_partial(block_t from, BufferHead *bh); - - void rx_finish(ObjectCache *oc, ioh_t ioh, block_t start, block_t len, block_t diskstart, bufferlist& bl); - void tx_finish(ObjectCache *oc, ioh_t ioh, block_t start, block_t len, version_t v, version_t e); - void partial_tx_finish(version_t epoch); - - friend 
class C_E_FlushPartial; - - // bh fun - BufferHead *split(BufferHead *orig, block_t after); -}; - - -class C_OC_RxFinish : public BlockDevice::callback { - Mutex &lock; - ObjectCache *oc; - block_t start, length; - block_t diskstart; -public: - bufferlist bl; - C_OC_RxFinish(Mutex &m, ObjectCache *o, block_t s, block_t l, block_t ds) : - lock(m), oc(o), start(s), length(l), diskstart(ds) {} - void finish(ioh_t ioh, int r) { - oc->bc->rx_finish(oc, ioh, start, length, diskstart, bl); - } -}; - -class C_OC_TxFinish : public BlockDevice::callback { - Mutex &lock; - ObjectCache *oc; - block_t start, length; - version_t version; - version_t epoch; - public: - C_OC_TxFinish(Mutex &m, ObjectCache *o, block_t s, block_t l, version_t v, version_t e) : - lock(m), oc(o), start(s), length(l), version(v), epoch(e) {} - void finish(ioh_t ioh, int r) { - oc->bc->tx_finish(oc, ioh, start, length, version, epoch); - } -}; - -class C_OC_PartialTxFinish : public BlockDevice::callback { - BufferCache *bc; - version_t epoch; -public: - C_OC_PartialTxFinish(BufferCache *b, version_t e) : - bc(b), epoch(e) {} - void finish(ioh_t ioh, int r) { - bc->partial_tx_finish(epoch); - } -}; - - -#endif diff --git a/branches/sage/crush/ebofs/Cnode.h b/branches/sage/crush/ebofs/Cnode.h deleted file mode 100644 index 8415978893fb5..0000000000000 --- a/branches/sage/crush/ebofs/Cnode.h +++ /dev/null @@ -1,101 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_CNODE_H -#define __EBOFS_CNODE_H - -#include "Onode.h" - -/* - * collection node - * - * holds attribute metadata for collections. 
- * colletion membership is stored in b+tree tables, independent of tte cnode. - */ - -class Cnode : public LRUObject -{ - private: - int ref; - bool dirty; - - public: - coll_t coll_id; - Extent cnode_loc; - - map attr; - - public: - Cnode(coll_t cid) : ref(0), dirty(false), coll_id(cid) { - cnode_loc.length = 0; - } - ~Cnode() { - } - - block_t get_cnode_id() { return cnode_loc.start; } - int get_cnode_len() { return cnode_loc.length; } - - void get() { - if (ref == 0) lru_pin(); - ref++; - } - void put() { - ref--; - if (ref == 0) lru_unpin(); - } - int get_ref_count() { return ref; } - - void mark_dirty() { - if (!dirty) { - dirty = true; - get(); - } - } - void mark_clean() { - if (dirty) { - dirty = false; - put(); - } - } - bool is_dirty() { return dirty; } - - - int get_attr_bytes() { - int s = 0; - for (map::iterator i = attr.begin(); - i != attr.end(); - i++) { - s += i->first.length() + 1; - s += i->second.length() + sizeof(int); - } - return s; - } - - // - //???void clear(); - - -}; - -inline ostream& operator<<(ostream& out, Cnode& cn) -{ - out << "cnode(" << hex << cn.coll_id << dec; - if (cn.is_dirty()) out << " dirty"; - //out << " " << &cn; - out << ")"; - return out; -} - -#endif diff --git a/branches/sage/crush/ebofs/Ebofs.cc b/branches/sage/crush/ebofs/Ebofs.cc deleted file mode 100644 index b1f6ab7539467..0000000000000 --- a/branches/sage/crush/ebofs/Ebofs.cc +++ /dev/null @@ -1,3628 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "Ebofs.h" - -#include "FileJournal.h" - -#include - -#ifndef DARWIN -#include -#else -#include -#include -#endif // DARWIN - -// ******************* - -#define dout(x) if (x <= g_conf.debug_ebofs) *_dout << dbeginl << g_clock.now() << " ebofs(" << dev.get_device_name() << ")." -#define derr(x) if (x <= g_conf.debug_ebofs) *_derr << dbeginl << g_clock.now() << " ebofs(" << dev.get_device_name() << ")." - - -char *nice_blocks(block_t b) -{ - static char s[20]; - float sz = b*4.0; - if (sz > (10 << 20)) - sprintf(s,"%.1f GB", sz / (1024.0*1024.0)); - else if (sz > (10 << 10)) - sprintf(s,"%.1f MB", sz / (1024.0)); - else - sprintf(s,"%llu KB", b*4ULL); - return s; -} - -int Ebofs::mount() -{ - ebofs_lock.Lock(); - assert(!mounted); - - // open dev - int r = dev.open(&idle_kicker); - if (r < 0) { - ebofs_lock.Unlock(); - return r; - } - - dout(2) << "mounting " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << dendl; - - // read super - bufferptr bp1 = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - bufferptr bp2 = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - dev.read(0, 1, bp1); - dev.read(1, 1, bp2); - - struct ebofs_super *sb1 = (struct ebofs_super*)bp1.c_str(); - struct ebofs_super *sb2 = (struct ebofs_super*)bp2.c_str(); - - // valid superblocks? 
- if (sb1->s_magic != EBOFS_MAGIC || - sb2->s_magic != EBOFS_MAGIC) { - derr(0) << "mount bad magic, not a valid EBOFS file system" << dendl; - return -EINVAL; - } - if (sb1->num_blocks > dev.get_num_blocks() || - sb2->num_blocks > dev.get_num_blocks()) { - derr(0) << "mount superblock size exceeds actual device size" << dendl; - return -EINVAL; - } - - dout(3) << "mount super @0 epoch " << sb1->epoch << dendl; - dout(3) << "mount super @1 epoch " << sb2->epoch << dendl; - - // pick newest super - struct ebofs_super *sb = 0; - if (sb1->epoch > sb2->epoch) - sb = sb1; - else - sb = sb2; - super_epoch = sb->epoch; - dout(3) << "mount epoch " << super_epoch << dendl; - assert(super_epoch == sb->epoch); - - super_fsid = sb->fsid; - - free_blocks = sb->free_blocks; - limbo_blocks = sb->limbo_blocks; - - // init node pools - dout(3) << "mount nodepool" << dendl; - nodepool.init( &sb->nodepool ); - nodepool.read_usemap_and_clean_nodes( dev, super_epoch ); - - // open tables - dout(3) << "mount opening tables" << dendl; - object_tab = new Table( nodepool, sb->object_tab ); - for (int i=0; i( nodepool, sb->free_tab[i] ); - limbo_tab = new Table( nodepool, sb->limbo_tab ); - alloc_tab = new Table >( nodepool, sb->alloc_tab ); - - collection_tab = new Table( nodepool, sb->collection_tab ); - co_tab = new Table( nodepool, sb->co_tab ); - - verify_tables(); - - allocator.release_limbo(); - - - // open journal? - if (journalfn) { - journal = new FileJournal(this, journalfn); - if (journal->open() < 0) { - dout(3) << "mount journal " << journalfn << " open failed" << dendl; - delete journal; - journal = 0; - } else { - dout(3) << "mount journal " << journalfn << " opened, replaying" << dendl; - - while (1) { - bufferlist bl; - epoch_t e; - if (!journal->read_entry(bl, e)) { - dout(3) << "mount replay: end of journal, done." 
<< dendl; - break; - } - - if (e < super_epoch) { - dout(3) << "mount replay: skipping old entry in epoch " << e << " < " << super_epoch << dendl; - continue; - } - if (e == super_epoch+1) { - super_epoch++; - dout(3) << "mount replay: jumped to next epoch " << super_epoch << dendl; - } - assert(e == super_epoch); - - dout(3) << "mount replay: applying transaction in epoch " << e << dendl; - Transaction t; - int off = 0; - t._decode(bl, off); - _apply_transaction(t); - } - - // done reading, make writeable. - journal->make_writeable(); - } - } - - dout(3) << "mount starting commit+finisher threads" << dendl; - commit_thread.create(); - finisher_thread.create(); - - dout(1) << "mounted " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) - << (journal ? ", with journal":", no journal") - << dendl; - mounted = true; - - - ebofs_lock.Unlock(); - return 0; -} - - -int Ebofs::mkfs() -{ - ebofs_lock.Lock(); - assert(!mounted); - - int r = dev.open(); - if (r < 0) { - ebofs_lock.Unlock(); - return r; - } - - block_t num_blocks = dev.get_num_blocks(); - - // make a super-random fsid - srand48(time(0) ^ getpid()); - super_fsid = ((uint64_t)lrand48() << 32) ^ mrand48(); - srand(time(0) ^ getpid()); - super_fsid ^= rand(); - super_fsid ^= (uint64_t)rand() << 32; - - free_blocks = 0; - limbo_blocks = 0; - - // create first noderegion - Extent nr; - nr.start = 2; - nr.length = 20+ (num_blocks / 1000); - if (nr.length < 10) nr.length = 10; - nodepool.add_region(nr); - dout(10) << "mkfs: first node region at " << nr << dendl; - - // allocate two usemaps - block_t usemap_len = nodepool.get_usemap_len(); - nodepool.usemap_even.start = nr.end(); - nodepool.usemap_even.length = usemap_len; - nodepool.usemap_odd.start = nodepool.usemap_even.end(); - nodepool.usemap_odd.length = usemap_len; - dout(10) << "mkfs: even usemap at " << nodepool.usemap_even << dendl; - dout(10) << "mkfs: odd usemap at " << nodepool.usemap_odd << dendl; 
- nodepool.init_usemap(); - - // init tables - struct ebofs_table empty; - empty.num_keys = 0; - empty.root = -1; - empty.depth = 0; - - object_tab = new Table( nodepool, empty ); - collection_tab = new Table( nodepool, empty ); - - for (int i=0; i( nodepool, empty ); - limbo_tab = new Table( nodepool, empty ); - alloc_tab = new Table >( nodepool, empty ); - - co_tab = new Table( nodepool, empty ); - - // add free space - Extent left; - left.start = nodepool.usemap_odd.end(); - left.length = num_blocks - left.start; - dout(10) << "mkfs: free data blocks at " << left << dendl; - allocator._release_into_limbo( left ); - if (g_conf.ebofs_cloneable) { - allocator.alloc_inc(nr); - allocator.alloc_inc(nodepool.usemap_even); - allocator.alloc_inc(nodepool.usemap_odd); - } - allocator.commit_limbo(); // -> limbo_tab - allocator.release_limbo(); // -> free_tab - - // write nodes, super, 2x - dout(10) << "mkfs: flushing nodepool and superblocks (2x)" << dendl; - - for (epoch_t e=0; e<2; e++) { - nodepool.commit_start(dev, e); - nodepool.commit_wait(); - bufferptr superbp; - prepare_super(e, superbp); - write_super(e, superbp); - } - - // free memory - dout(10) << "mkfs: cleaning up" << dendl; - close_tables(); - - dev.close(); - - - // create journal? 
- if (journalfn) { - Journal *journal = new FileJournal(this, journalfn); - if (journal->create() < 0) { - dout(3) << "mount journal " << journalfn << " created failed" << dendl; - } else { - dout(3) << "mount journal " << journalfn << " created" << dendl; - } - delete journal; - } - - dout(2) << "mkfs: " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << dendl; - ebofs_lock.Unlock(); - return 0; -} - -void Ebofs::close_tables() -{ - // close tables - delete object_tab; - for (int i=0; iverify("onmount"); - limbo_tab->verify("onmount"); - alloc_tab->verify("onmount"); - collection_tab->verify("onmount"); - co_tab->verify("onmount"); - for (int i=0; iverify("onmount"); - - g_conf.ebofs_verify = o; -} - -int Ebofs::umount() -{ - ebofs_lock.Lock(); - - // mark unmounting - dout(2) << "umount start" << dendl; - readonly = true; - unmounting = true; - - // kick commit thread - dout(5) << "umount stopping commit thread" << dendl; - commit_cond.Signal(); - ebofs_lock.Unlock(); - commit_thread.join(); - ebofs_lock.Lock(); - - // kick finisher thread - dout(5) << "umount stopping finisher thread" << dendl; - finisher_lock.Lock(); - finisher_stop = true; - finisher_cond.Signal(); - finisher_lock.Unlock(); - - finisher_thread.join(); - - trim_bc(0); - trim_inodes(0); - - for (hash_map::iterator i = onode_map.begin(); - i != onode_map.end(); - i++) { - dout(0) << "umount *** leftover: " << i->first << " " << *(i->second) << dendl; - } - - // free memory - dout(5) << "umount cleaning up" << dendl; - close_tables(); - dev.close(); - readonly = unmounting = mounted = false; - - dout(2) << "umount done on " << dev.get_device_name() << dendl; - ebofs_lock.Unlock(); - return 0; -} - - - -void Ebofs::prepare_super(version_t epoch, bufferptr& bp) -{ - struct ebofs_super sb; - - dout(10) << "prepare_super v" << epoch << dendl; - - // fill in super - memset(&sb, 0, sizeof(sb)); - sb.s_magic = EBOFS_MAGIC; - sb.fsid = 
super_fsid; - sb.epoch = epoch; - sb.num_blocks = dev.get_num_blocks(); - - sb.free_blocks = free_blocks; - sb.limbo_blocks = limbo_blocks; - - - // tables - sb.object_tab.num_keys = object_tab->get_num_keys(); - sb.object_tab.root = object_tab->get_root(); - sb.object_tab.depth = object_tab->get_depth(); - - for (int i=0; iget_num_keys(); - sb.free_tab[i].root = free_tab[i]->get_root(); - sb.free_tab[i].depth = free_tab[i]->get_depth(); - } - sb.limbo_tab.num_keys = limbo_tab->get_num_keys(); - sb.limbo_tab.root = limbo_tab->get_root(); - sb.limbo_tab.depth = limbo_tab->get_depth(); - - sb.alloc_tab.num_keys = alloc_tab->get_num_keys(); - sb.alloc_tab.root = alloc_tab->get_root(); - sb.alloc_tab.depth = alloc_tab->get_depth(); - - sb.collection_tab.num_keys = collection_tab->get_num_keys(); - sb.collection_tab.root = collection_tab->get_root(); - sb.collection_tab.depth = collection_tab->get_depth(); - - sb.co_tab.num_keys = co_tab->get_num_keys(); - sb.co_tab.root = co_tab->get_root(); - sb.co_tab.depth = co_tab->get_depth(); - - // pools - sb.nodepool.num_regions = nodepool.region_loc.size(); - for (unsigned i=0; i 0) { - // *** this is an ugly ugly hack **** - // do not use - // periodically check for idle block device - utime_t idle_wait(0, g_conf.ebofs_idle_commit_ms*1000); - dout(20) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms, " - << idle_wait << " ms if idle" << dendl; - utime_t now = g_clock.now(); - utime_t stop = now; - stop += (double)g_conf.ebofs_commit_ms / 1000.0; - do { - utime_t wait = MIN(stop - now, idle_wait); - if (commit_cond.WaitInterval(ebofs_lock, wait) != ETIMEDOUT) { - dout(20) << "commit_thread i got kicked" << dendl; - break; // we got kicked - } - if (dev.is_idle()) { - dout(20) << "commit_thread bdev is idle, early commit" << dendl; - break; // dev is idle - } - now = g_clock.now(); - dout(20) << "commit_thread now=" << now << ", stop at " << stop << dendl; - - // hack hack - //if (!left) g_conf.debug_ebofs 
= 10; - // /hack hack - } while (now < stop); - dout(20) << "commit_thread done with idle loop" << dendl; - - } else { - // normal wait+timeout - dout(20) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms" << dendl; - commit_cond.WaitInterval(ebofs_lock, utime_t(0, g_conf.ebofs_commit_ms*1000)); - } - - } else { - // DEBUG.. wait until kicked - dout(10) << "commit_thread no commit_ms, waiting until kicked" << dendl; - commit_cond.Wait(ebofs_lock); - } - - if (unmounting) { - dout(10) << "commit_thread unmounting: final commit pass" << dendl; - assert(readonly); - unmounting = false; - mounted = false; - dirty = true; - } - - if (!dirty && !limbo_blocks) { - dout(10) << "commit_thread not dirty" << dendl; - } - else { - super_epoch++; - dirty = false; - - derr(10) << "commit_thread commit start, new epoch " << super_epoch << dendl; - dout(10) << "commit_thread commit start, new epoch " << super_epoch << dendl; - dout(2) << "commit_thread data: " - << 100*(dev.get_num_blocks()-get_free_blocks())/dev.get_num_blocks() << "% used, " - << get_free_blocks() << " (" << 100*get_free_blocks()/dev.get_num_blocks() - << "%) free in " << get_free_extents() - << ", " << get_limbo_blocks() << " (" << 100*get_limbo_blocks()/dev.get_num_blocks() - << "%) limbo in " << get_limbo_extents() - << dendl; - dout(2) << "commit_thread nodes: " - << 100*nodepool.get_num_used()/nodepool.get_num_total() << "% used, " - << nodepool.get_num_free() << " (" << 100*nodepool.get_num_free()/nodepool.get_num_total() << "%) free, " - << nodepool.get_num_limbo() << " (" << 100*nodepool.get_num_limbo()/nodepool.get_num_total() << "%) limbo, " - << nodepool.get_num_total() << " total." 
<< dendl; - dout(2) << "commit_thread bc: " - << "size " << bc.get_size() - << ", trimmable " << bc.get_trimmable() - << ", max " << g_conf.ebofs_bc_size - << "; dirty " << bc.get_stat_dirty() - << ", tx " << bc.get_stat_tx() - << ", max dirty " << g_conf.ebofs_bc_max_dirty - << dendl; - - if (journal) journal->commit_epoch_start(); - - // (async) write onodes+condes (do this first; it currently involves inode reallocation) - commit_inodes_start(); - - allocator.commit_limbo(); // limbo -> limbo_tab - - // (async) write btree nodes - nodepool.commit_start( dev, super_epoch ); - - // blockdev barrier (prioritize our writes!) - dout(30) << "commit_thread barrier. flushing inodes " << inodes_flushing << dendl; - dev.barrier(); - - // prepare super (before any changes get made!) - bufferptr superbp; - prepare_super(super_epoch, superbp); - - // wait for it all to flush (drops global lock) - commit_bc_wait(super_epoch-1); - dout(30) << "commit_thread bc flushed" << dendl; - commit_inodes_wait(); - dout(30) << "commit_thread inodes flushed" << dendl; - nodepool.commit_wait(); - dout(30) << "commit_thread btree nodes flushed" << dendl; - - // ok, now (synchronously) write the prior super! - dout(10) << "commit_thread commit flushed, writing super for prior epoch" << dendl; - ebofs_lock.Unlock(); - write_super(super_epoch, superbp); - ebofs_lock.Lock(); - - dout(10) << "commit_thread wrote super" << dendl; - - // free limbo space now - // (since we're done allocating things, - // AND we've flushed all previous epoch data) - allocator.release_limbo(); // limbo_tab -> free_tabs - - // do we need more node space? - if (nodepool.get_num_free() < nodepool.get_num_total() / 3) { - dout(2) << "commit_thread running low on node space, allocating more." 
<< dendl; - alloc_more_node_space(); - } - - // signal journal - if (journal) journal->commit_epoch_finish(); - - // kick waiters - dout(10) << "commit_thread queueing commit + kicking sync waiters" << dendl; - - queue_finishers(commit_waiters[super_epoch-1]); - commit_waiters.erase(super_epoch-1); - - sync_cond.Signal(); - - dout(10) << "commit_thread commit finish" << dendl; - } - - // trim bc? - trim_bc(); - trim_inodes(); - - } - - dout(10) << "commit_thread finish" << dendl; - commit_thread_started = false; - ebofs_lock.Unlock(); - return 0; -} - - -void Ebofs::alloc_more_node_space() -{ - dout(1) << "alloc_more_node_space free " << nodepool.get_num_free() << "/" << nodepool.get_num_total() << dendl; - - if (nodepool.num_regions() < EBOFS_MAX_NODE_REGIONS) { - int want = nodepool.get_num_total(); - - Extent ex; - allocator.allocate(ex, want, 2); - dout(1) << "alloc_more_node_space wants " << want << " more, got " << ex << dendl; - - Extent even, odd; - unsigned ulen = nodepool.get_usemap_len(nodepool.get_num_total() + ex.length); - allocator.allocate(even, ulen, 2); - allocator.allocate(odd, ulen, 2); - dout(1) << "alloc_more_node_space maps need " << ulen << " x2, got " << even << " " << odd << dendl; - - if (even.length == ulen && odd.length == ulen) { - dout(1) << "alloc_more_node_space got " << ex << ", new usemaps at even " << even << " odd " << odd << dendl; - allocator.release(nodepool.usemap_even); - allocator.release(nodepool.usemap_odd); - nodepool.add_region(ex); - - // expand usemap? - nodepool.usemap_even = even; - nodepool.usemap_odd = odd; - nodepool.expand_usemap(); - } else { - dout (1) << "alloc_more_node_space failed to get space for new usemaps" << dendl; - allocator.release(ex); - allocator.release(even); - allocator.release(odd); - //assert(0); - } - } else { - dout(1) << "alloc_more_node_space already have max node regions!" 
<< dendl; - assert(0); - } -} - - -void *Ebofs::finisher_thread_entry() -{ - finisher_lock.Lock(); - dout(10) << "finisher_thread start" << dendl; - - while (!finisher_stop) { - while (!finisher_queue.empty()) { - list ls; - ls.swap(finisher_queue); - - finisher_lock.Unlock(); - - //ebofs_lock.Lock(); // um.. why lock this? -sage - finish_contexts(ls, 0); - //ebofs_lock.Unlock(); - - finisher_lock.Lock(); - } - if (finisher_stop) break; - - dout(30) << "finisher_thread sleeping" << dendl; - finisher_cond.Wait(finisher_lock); - } - - dout(10) << "finisher_thread start" << dendl; - finisher_lock.Unlock(); - return 0; -} - - -// *** onodes *** - -Onode* Ebofs::new_onode(object_t oid) -{ - Onode* on = new Onode(oid); - - assert(onode_map.count(oid) == 0); - onode_map[oid] = on; - onode_lru.lru_insert_top(on); - - assert(object_tab->lookup(oid) < 0); - object_tab->insert( oid, on->onode_loc ); // even tho i'm not placed yet - - on->get(); - on->onode_loc.start = 0; - on->onode_loc.length = 0; - - dirty_onode(on); - - dout(7) << "new_onode " << *on << dendl; - return on; -} - - -Onode* Ebofs::get_onode(object_t oid) -{ - while (1) { - // in cache? - if (have_onode(oid)) { - // yay - Onode *on = onode_map[oid]; - on->get(); - //dout(0) << "get_onode " << *on << dendl; - return on; - } - - // on disk? - Extent onode_loc; - if (object_tab->lookup(oid, onode_loc) < 0) { - dout(10) << "onode lookup failed on " << oid << dendl; - // object dne. - return 0; - } - - // already loading? - if (waitfor_onode.count(oid)) { - // yep, just wait. - Cond c; - waitfor_onode[oid].push_back(&c); - dout(10) << "get_onode " << oid << " already loading, waiting" << dendl; - c.Wait(ebofs_lock); - continue; - } - - dout(10) << "get_onode reading " << oid << " from " << onode_loc << dendl; - - assert(waitfor_onode.count(oid) == 0); - waitfor_onode[oid].clear(); // this should be empty initially. - - // read it! 
- bufferlist bl; - bl.push_back( buffer::create_page_aligned( EBOFS_BLOCK_SIZE*onode_loc.length ) ); - - ebofs_lock.Unlock(); - dev.read( onode_loc.start, onode_loc.length, bl ); - ebofs_lock.Lock(); - - // add onode - Onode *on = new Onode(oid); - onode_map[oid] = on; - onode_lru.lru_insert_top(on); - - // parse data block - struct ebofs_onode *eo = (struct ebofs_onode*)bl.c_str(); - if (eo->object_id != oid) { - dout(0) << " wrong oid in onode block: " << eo->object_id << " != " << oid << dendl; - dout(0) << " onode_loc is " << eo->onode_loc << dendl; - dout(0) << " object_size " << eo->object_size << dendl; - dout(0) << " object_blocks " << eo->object_blocks << dendl; - dout(0) << " " << eo->num_collections << " coll + " - << eo->num_attr << " attr + " - << eo->num_extents << " extents" << dendl; - assert(eo->object_id == oid); - } - on->readonly = eo->readonly; - on->onode_loc = eo->onode_loc; - on->object_size = eo->object_size; - on->object_blocks = eo->object_blocks; - - // parse - char *p = bl.c_str() + sizeof(*eo); - - // parse collection list - for (int i=0; inum_collections; i++) { - coll_t c = *((coll_t*)p); - p += sizeof(c); - on->collections.insert(c); - } - - // parse attributes - for (int i=0; inum_attr; i++) { - string key = p; - p += key.length() + 1; - int len = *(int*)(p); - p += sizeof(len); - on->attr[key] = buffer::copy(p, len); - p += len; - dout(15) << "get_onode " << *on << " attr " << key << " len " << len << dendl; - } - - // parse extents - on->extent_map.clear(); - block_t n = 0; - for (int i=0; inum_extents; i++) { - Extent ex = *((Extent*)p); - on->extent_map[n] = ex; - dout(15) << "get_onode " << *on << " ex " << i << ": " << ex << dendl; - n += ex.length; - p += sizeof(Extent); - } - assert(n == on->object_blocks); - - // wake up other waiters - for (list::iterator i = waitfor_onode[oid].begin(); - i != waitfor_onode[oid].end(); - i++) - (*i)->Signal(); - waitfor_onode.erase(oid); // remove Cond list - - on->get(); - //dout(0) << 
"get_onode " << *on << " (loaded)" << dendl; - return on; - } -} - - -class C_E_InodeFlush : public BlockDevice::callback { - Ebofs *ebofs; -public: - C_E_InodeFlush(Ebofs *e) : ebofs(e) {} - void finish(ioh_t ioh, int r) { - ebofs->flush_inode_finish(); - } -}; - - -void Ebofs::encode_onode(Onode *on, bufferlist& bl, unsigned& off) -{ - // onode - struct ebofs_onode eo; - eo.readonly = on->readonly; - eo.onode_loc = on->onode_loc; - eo.object_id = on->object_id; - eo.object_size = on->object_size; - eo.object_blocks = on->object_blocks; - eo.num_collections = on->collections.size(); - eo.num_attr = on->attr.size(); - eo.num_extents = on->extent_map.size(); - bl.copy_in(off, sizeof(eo), (char*)&eo); - off += sizeof(eo); - - // collections - for (set::iterator i = on->collections.begin(); - i != on->collections.end(); - i++) { - bl.copy_in(off, sizeof(*i), (char*)&(*i)); - off += sizeof(*i); - } - - // attr - for (map::iterator i = on->attr.begin(); - i != on->attr.end(); - i++) { - bl.copy_in(off, i->first.length()+1, i->first.c_str()); - off += i->first.length()+1; - int l = i->second.length(); - bl.copy_in(off, sizeof(int), (char*)&l); - off += sizeof(int); - bl.copy_in(off, l, i->second.c_str()); - off += l; - dout(15) << "write_onode " << *on << " attr " << i->first << " len " << l << dendl; - } - - // extents - for (map::iterator i = on->extent_map.begin(); - i != on->extent_map.end(); - i++) { - bl.copy_in(off, sizeof(Extent), (char*)&(i->second)); - off += sizeof(Extent); - dout(15) << "write_onode " << *on << " ex " << i->first << ": " << i->second << dendl; - } -} - -void Ebofs::write_onode(Onode *on) -{ - // buffer - unsigned bytes = sizeof(ebofs_onode) + on->get_collection_bytes() + on->get_attr_bytes() + on->get_extent_bytes(); - unsigned blocks = (bytes-1)/EBOFS_BLOCK_SIZE + 1; - - bufferlist bl; - bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*blocks) ); - - // (always) relocate onode - if (1) { - if (on->onode_loc.length) - 
allocator.release(on->onode_loc); - - block_t first = 0; - if (on->extent_map.size()) - first = on->extent_map.begin()->second.start; - - allocator.allocate(on->onode_loc, blocks, first); - object_tab->remove( on->object_id ); - object_tab->insert( on->object_id, on->onode_loc ); - //object_tab->verify(); - } - - dout(10) << "write_onode " << *on << " to " << on->onode_loc << dendl; - - unsigned off = 0; - encode_onode(on, bl, off); - assert(off == bytes); - - // write - dev.write( on->onode_loc.start, on->onode_loc.length, bl, - new C_E_InodeFlush(this), "write_onode" ); -} - -void Ebofs::remove_onode(Onode *on) -{ - dout(8) << "remove_onode " << *on << dendl; - - assert(on->get_ref_count() >= 1); // caller - - // tear down buffer cache - if (on->oc) { - on->oc->truncate(0, super_epoch); // this will kick readers along the way. - on->close_oc(); - } - - // remove from onode map, mark dangling/deleted - onode_map.erase(on->object_id); - onode_lru.lru_remove(on); - on->deleted = true; - on->dangling = true; - - // remove from object table - //dout(0) << "remove_onode on " << *on << dendl; - object_tab->remove(on->object_id); - - // free onode space - if (on->onode_loc.length) - allocator.release(on->onode_loc); - - // free data space - for (map::iterator i = on->extent_map.begin(); - i != on->extent_map.end(); - i++) - allocator.release(i->second); - on->extent_map.clear(); - - // remove from collections - for (set::iterator i = on->collections.begin(); - i != on->collections.end(); - i++) { - co_tab->remove(coll_object_t(*i,on->object_id)); - } - on->collections.clear(); - - // dirty -> clean? 
- if (on->is_dirty()) { - on->mark_clean(); // this unpins *on - dirty_onodes.erase(on); - } - - if (on->get_ref_count() > 1) dout(10) << "remove_onode **** will survive " << *on << dendl; - put_onode(on); - - dirty = true; -} - -void Ebofs::put_onode(Onode *on) -{ - on->put(); - //dout(0) << "put_onode " << *on << dendl; - - if (on->get_ref_count() == 0 && on->dangling) { - //dot(0) << " *** hosing on " << *on << dendl; - delete on; - } -} - -void Ebofs::dirty_onode(Onode *on) -{ - if (!on->is_dirty()) { - on->mark_dirty(); - dirty_onodes.insert(on); - } - dirty = true; -} - -void Ebofs::trim_inodes(int max) -{ - unsigned omax = onode_lru.lru_get_max(); - unsigned cmax = cnode_lru.lru_get_max(); - if (max >= 0) omax = cmax = max; - dout(10) << "trim_inodes start " << onode_lru.lru_get_size() << " / " << omax << " onodes, " - << cnode_lru.lru_get_size() << " / " << cmax << " cnodes" << dendl; - - // onodes - while (onode_lru.lru_get_size() > omax) { - // expire an item - Onode *on = (Onode*)onode_lru.lru_expire(); - if (on == 0) break; // nothing to expire - - // expire - dout(20) << "trim_inodes removing onode " << *on << dendl; - onode_map.erase(on->object_id); - on->dangling = true; - - if (on->get_ref_count() == 0) { - assert(on->oc == 0); // an open oc pins the onode! - delete on; - } else { - dout(-20) << "trim_inodes still active: " << *on << dendl; - assert(0); // huh? 
- } - } - - - // cnodes - while (cnode_lru.lru_get_size() > cmax) { - // expire an item - Cnode *cn = (Cnode*)cnode_lru.lru_expire(); - if (cn == 0) break; // nothing to expire - - // expire - dout(20) << "trim_inodes removing cnode " << *cn << dendl; - cnode_map.erase(cn->coll_id); - - delete cn; - } - - dout(10) << "trim_inodes finish " - << onode_lru.lru_get_size() << " / " << omax << " onodes, " - << cnode_lru.lru_get_size() << " / " << cmax << " cnodes" << dendl; -} - - - -// *** cnodes **** - -Cnode* Ebofs::new_cnode(coll_t cid) -{ - Cnode* cn = new Cnode(cid); - - assert(cnode_map.count(cid) == 0); - cnode_map[cid] = cn; - cnode_lru.lru_insert_top(cn); - - assert(collection_tab->lookup(cid) < 0); - collection_tab->insert( cid, cn->cnode_loc ); // even tho i'm not placed yet - - cn->get(); - cn->cnode_loc.start = 0; - cn->cnode_loc.length = 0; - - dirty_cnode(cn); - - return cn; -} - -Cnode* Ebofs::get_cnode(coll_t cid) -{ - while (1) { - // in cache? - if (cnode_map.count(cid)) { - // yay - Cnode *cn = cnode_map[cid]; - cn->get(); - return cn; - } - - // on disk? - Extent cnode_loc; - if (collection_tab->lookup(cid, cnode_loc) < 0) { - // object dne. - return 0; - } - - // already loading? - if (waitfor_cnode.count(cid)) { - // yep, just wait. - Cond c; - waitfor_cnode[cid].push_back(&c); - dout(10) << "get_cnode " << cid << " already loading, waiting" << dendl; - c.Wait(ebofs_lock); - continue; - } - - dout(10) << "get_cnode reading " << cid << " from " << cnode_loc << dendl; - - assert(waitfor_cnode.count(cid) == 0); - waitfor_cnode[cid].clear(); // this should be empty initially. - - // read it! 
- bufferlist bl; - //bufferpool.alloc( EBOFS_BLOCK_SIZE*cnode_loc.length, bl ); - bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*cnode_loc.length) ); - - ebofs_lock.Unlock(); - dev.read( cnode_loc.start, cnode_loc.length, bl ); - ebofs_lock.Lock(); - - // parse data block - Cnode *cn = new Cnode(cid); - - cnode_map[cid] = cn; - cnode_lru.lru_insert_top(cn); - - struct ebofs_cnode *ec = (struct ebofs_cnode*)bl.c_str(); - cn->cnode_loc = ec->cnode_loc; - - // parse attributes - char *p = bl.c_str() + sizeof(*ec); - for (int i=0; inum_attr; i++) { - string key = p; - p += key.length() + 1; - int len = *(int*)(p); - p += sizeof(len); - cn->attr[key] = buffer::copy(p, len); - p += len; - dout(15) << "get_cnode " << *cn << " attr " << key << " len " << len << dendl; - } - - // wake up other waiters - for (list::iterator i = waitfor_cnode[cid].begin(); - i != waitfor_cnode[cid].end(); - i++) - (*i)->Signal(); - waitfor_cnode.erase(cid); // remove Cond list - - cn->get(); - return cn; - } -} - -void Ebofs::encode_cnode(Cnode *cn, bufferlist& bl, unsigned& off) -{ - // cnode - struct ebofs_cnode ec; - ec.cnode_loc = cn->cnode_loc; - ec.coll_id = cn->coll_id; - ec.num_attr = cn->attr.size(); - bl.copy_in(off, sizeof(ec), (char*)&ec); - off += sizeof(ec); - - // attr - for (map::iterator i = cn->attr.begin(); - i != cn->attr.end(); - i++) { - bl.copy_in(off, i->first.length()+1, i->first.c_str()); - off += i->first.length()+1; - int len = i->second.length(); - bl.copy_in(off, sizeof(int), (char*)&len); - off += sizeof(int); - bl.copy_in(off, len, i->second.c_str()); - off += len; - - dout(15) << "write_cnode " << *cn << " attr " << i->first << " len " << len << dendl; - } -} - -void Ebofs::write_cnode(Cnode *cn) -{ - // allocate buffer - unsigned bytes = sizeof(ebofs_cnode) + cn->get_attr_bytes(); - unsigned blocks = (bytes-1)/EBOFS_BLOCK_SIZE + 1; - - bufferlist bl; - //bufferpool.alloc( EBOFS_BLOCK_SIZE*blocks, bl ); - bl.push_back( 
buffer::create_page_aligned(EBOFS_BLOCK_SIZE*blocks) ); - - // (always) relocate cnode! - if (1) { - if (cn->cnode_loc.length) - allocator.release(cn->cnode_loc); - - allocator.allocate(cn->cnode_loc, blocks, Allocator::NEAR_LAST_FWD); - collection_tab->remove( cn->coll_id ); - collection_tab->insert( cn->coll_id, cn->cnode_loc ); - } - - dout(10) << "write_cnode " << *cn << " to " << cn->cnode_loc << dendl; - - unsigned off = 0; - encode_cnode(cn, bl, off); - assert(off == bytes); - - // write - dev.write( cn->cnode_loc.start, cn->cnode_loc.length, bl, - new C_E_InodeFlush(this), "write_cnode" ); -} - -void Ebofs::remove_cnode(Cnode *cn) -{ - dout(10) << "remove_cnode " << *cn << dendl; - - // remove from table - collection_tab->remove(cn->coll_id); - - // free cnode space - if (cn->cnode_loc.length) - allocator.release(cn->cnode_loc); - - // remove from dirty list? - if (cn->is_dirty()) - dirty_cnodes.erase(cn); - - // remove from map and lru - cnode_map.erase(cn->coll_id); - cnode_lru.lru_remove(cn); - - // count down refs - cn->mark_clean(); - cn->put(); - assert(cn->get_ref_count() == 0); - - // hose. 
- delete cn; - - dirty = true; -} - -void Ebofs::put_cnode(Cnode *cn) -{ - cn->put(); -} - -void Ebofs::dirty_cnode(Cnode *cn) -{ - if (!cn->is_dirty()) { - cn->mark_dirty(); - dirty_cnodes.insert(cn); - } - dirty = true; -} - - - - - -void Ebofs::flush_inode_finish() -{ - ebofs_lock.Lock(); - { - inodes_flushing--; - if (inodes_flushing < 1000) - dout(20) << "flush_inode_finish, " << inodes_flushing << " left" << dendl; - if (inodes_flushing == 0) - inode_commit_cond.Signal(); - } - ebofs_lock.Unlock(); -} - -void Ebofs::commit_inodes_start() -{ - dout(10) << "commit_inodes_start" << dendl; - - assert(inodes_flushing == 0); - - // onodes - for (set::iterator i = dirty_onodes.begin(); - i != dirty_onodes.end(); - i++) { - Onode *on = *i; - inodes_flushing++; - write_onode(on); - on->mark_clean(); - on->uncommitted.clear(); // commit allocated blocks - on->commit_waiters.clear(); // these guys are gonna get taken care of, bc we committed. - } - dirty_onodes.clear(); - - // cnodes - for (set::iterator i = dirty_cnodes.begin(); - i != dirty_cnodes.end(); - i++) { - Cnode *cn = *i; - inodes_flushing++; - write_cnode(cn); - cn->mark_clean(); - } - dirty_cnodes.clear(); - - dout(10) << "commit_inodes_start writing " << inodes_flushing << " onodes+cnodes" << dendl; -} - -void Ebofs::commit_inodes_wait() -{ - // caller must hold ebofs_lock - while (inodes_flushing > 0) { - dout(10) << "commit_inodes_wait waiting for " << inodes_flushing << " onodes+cnodes to flush" << dendl; - inode_commit_cond.Wait(ebofs_lock); - } - dout(10) << "commit_inodes_wait all flushed" << dendl; -} - - - - - - - -// *** buffer cache *** - -void Ebofs::trim_buffer_cache() -{ - ebofs_lock.Lock(); - trim_bc(0); - ebofs_lock.Unlock(); -} - -void Ebofs::trim_bc(off_t max) -{ - if (max < 0) - max = g_conf.ebofs_bc_size; - dout(10) << "trim_bc start: size " << bc.get_size() << ", trimmable " << bc.get_trimmable() << ", max " << max << dendl; - - while (bc.get_size() > max && - bc.get_trimmable()) { - 
BufferHead *bh = (BufferHead*) bc.lru_rest.lru_expire(); - if (!bh) break; - - dout(25) << "trim_bc trimming " << *bh << dendl; - assert(bh->is_clean()); - - ObjectCache *oc = bh->oc; - bc.remove_bh(bh); - delete bh; - - if (oc->is_empty()) { - Onode *on = oc->on; - dout(10) << "trim_bc closing oc on " << *on << dendl; - on->close_oc(); - } - } - - dout(10) << "trim_bc finish: size " << bc.get_size() << ", trimmable " << bc.get_trimmable() << ", max " << max << dendl; -} - - -void Ebofs::kick_idle() -{ - dout(10) << "kick_idle" << dendl; - //commit_cond.Signal(); - - ebofs_lock.Lock(); - if (mounted && !unmounting && dirty) { - dout(10) << "kick_idle dirty, doing commit" << dendl; - commit_cond.Signal(); - } else { - dout(10) << "kick_idle !dirty or !mounted or unmounting, doing nothing" << dendl; - } - ebofs_lock.Unlock(); -} - -void Ebofs::sync(Context *onsafe) -{ - ebofs_lock.Lock(); - if (onsafe) { - dirty = true; - - while (1) { - if (journal) { - // journal empty transaction - Transaction t; - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - commit_waiters[super_epoch].push_back(onsafe); - break; - } - } - ebofs_lock.Unlock(); -} - -void Ebofs::sync() -{ - ebofs_lock.Lock(); - if (!dirty) { - dout(7) << "sync in " << super_epoch << ", not dirty" << dendl; - } else { - epoch_t start = super_epoch; - dout(7) << "sync start in " << start << dendl; - while (super_epoch == start) { - dout(7) << "sync kicking commit in " << super_epoch << dendl; - dirty = true; - commit_cond.Signal(); - sync_cond.Wait(ebofs_lock); - } - dout(10) << "sync finish in " << super_epoch << dendl; - } - ebofs_lock.Unlock(); -} - - - -void Ebofs::commit_bc_wait(version_t epoch) -{ - dout(10) << "commit_bc_wait on epoch " << epoch << dendl; - - while (bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE,epoch) > 0 || - bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL,epoch) > 0) { - //dout(10) << "commit_bc_wait " << bc.get_unflushed(epoch) << " unflushed in epoch " << epoch 
<< dendl; - dout(10) << "commit_bc_wait epoch " << epoch - << ", unflushed bhwrite " << bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE) - << ", unflushed partial " << bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL) - << dendl; - bc.waitfor_flush(); - } - - bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE).erase(epoch); - bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL).erase(epoch); - - dout(10) << "commit_bc_wait all flushed for epoch " << epoch - << "; " << bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE) - << " " << bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL) - << dendl; -} - - - -int Ebofs::statfs(struct statfs *buf) -{ - dout(7) << "statfs" << dendl; - - buf->f_type = EBOFS_MAGIC; /* type of filesystem */ - buf->f_bsize = 4096; /* optimal transfer block size */ - buf->f_blocks = dev.get_num_blocks(); /* total data blocks in file system */ - buf->f_bfree = get_free_blocks() - + get_limbo_blocks(); /* free blocks in fs */ - buf->f_bavail = get_free_blocks(); /* free blocks avail to non-superuser -- actually, for writing. */ - buf->f_files = nodepool.get_num_total(); /* total file nodes in file system */ - buf->f_ffree = nodepool.get_num_free(); /* free file nodes in fs */ - //buf->f_fsid = 0; /* file system id */ -#ifndef DARWIN - buf->f_namelen = 8; /* maximum length of filenames */ -#endif // DARWIN - - return 0; -} - - - - -/* - * allocate a write to blocks on disk. - * - take care to not overwrite any "safe" data blocks. - * - allocate/map new extents on disk as necessary - */ -void Ebofs::alloc_write(Onode *on, - block_t start, block_t len, - interval_set& alloc, - block_t& old_bfirst, block_t& old_blast) -{ - // first decide what pages to (re)allocate - alloc.insert(start, len); // start with whole range - - // figure out what bits are already uncommitted - interval_set already_uncom; - already_uncom.intersection_of(alloc, on->uncommitted); - - // subtract those off, so we're left with the committed bits (that must be reallocated). 
- alloc.subtract(already_uncom); - - dout(10) << "alloc_write must (re)alloc " << alloc << " on " << *on << dendl; - - // release it (into limbo) - for (map::iterator i = alloc.m.begin(); - i != alloc.m.end(); - i++) { - // get old region - vector old; - on->map_extents(i->first, i->second, old); - for (unsigned o=0; ofirst == start) { - old_bfirst = old[0].start; - dout(20) << "alloc_write old_bfirst " << old_bfirst << " of " << old[0] << dendl; - } - if (i->first+i->second == start+len) { - old_blast = old[old.size()-1].last(); - dout(20) << "alloc_write old_blast " << old_blast << " of " << old[old.size()-1] << dendl; - } - } - } - - // reallocate uncommitted too? - // ( --> yes. we can always make better allocation decisions later, with more information. ) - if (g_conf.ebofs_realloc) { - list tx; - - ObjectCache *oc = on->get_oc(&bc); - oc->find_tx(start, len, tx); - - for (list::reverse_iterator p = tx.rbegin(); - p != tx.rend(); - p++) { - BufferHead *bh = *p; - - // cancelable/moveable? - if (alloc.contains(bh->start(), bh->length())) { - dout(10) << "alloc_write " << *bh << " already in " << alloc << dendl; - continue; - } - - vector old; - on->map_extents(bh->start(), bh->length(), old); - assert(old.size() == 1); - - if (bh->start() >= start && bh->end() <= start+len) { - assert(bh->epoch_modified == super_epoch); - if (bc.bh_cancel_write(bh, super_epoch)) { - if (bh->length() == 1) - dout(10) << "alloc_write unallocated tx " << old[0] << ", canceled " << *bh << dendl; - // no, this isn't compatible with clone() and extent reference counting. - //allocator.unallocate(old[0]); // release (into free) - allocator.release(old[0]); // **FIXME** no cloning yet, my friend! 
- alloc.insert(bh->start(), bh->length()); - } else { - if (bh->length() == 1) - dout(10) << "alloc_write released tx " << old[0] << ", couldn't cancel " << *bh << dendl; - allocator.release(old[0]); // release (into limbo) - alloc.insert(bh->start(), bh->length()); - } - } else { - if (bh->length() == 1) - dout(10) << "alloc_write skipped tx " << old[0] << ", not entirely within " - << start << "~" << len - << " bh " << *bh << dendl; - } - } - - dout(10) << "alloc_write will (re)alloc " << alloc << " on " << *on << dendl; - } - - if (alloc.empty()) return; // no need to dirty the onode below! - - - // merge alloc into onode uncommitted map - //dout(10) << " union of " << on->uncommitted << " and " << alloc << dendl; - interval_set old = on->uncommitted; - on->uncommitted.union_of(alloc); - - dout(10) << "alloc_write onode.uncommitted is now " << on->uncommitted << dendl; - - if (0) { - // verify - interval_set ta; - ta.intersection_of(on->uncommitted, alloc); - dout(0) << " ta " << ta << dendl; - assert(alloc == ta); - - interval_set tb; - tb.intersection_of(on->uncommitted, old); - dout(0) << " tb " << tb << dendl; - assert(old == tb); - } - - dirty_onode(on); - - // allocate the space - for (map::iterator i = alloc.m.begin(); - i != alloc.m.end(); - i++) { - dout(15) << "alloc_write alloc " << i->first << "~" << i->second << " (of " << start << "~" << len << ")" << dendl; - - // allocate new space - block_t left = i->second; - block_t cur = i->first; - while (left > 0) { - Extent ex; - allocator.allocate(ex, left, Allocator::NEAR_LAST_FWD); - dout(10) << "alloc_write got " << ex << " for object offset " << cur << dendl; - on->set_extent(cur, ex); // map object to new region - left -= ex.length; - cur += ex.length; - } - } -} - - - - -void Ebofs::apply_write(Onode *on, off_t off, size_t len, const bufferlist& bl) -{ - ObjectCache *oc = on->get_oc(&bc); - - // map into blocks - off_t opos = off; // byte pos in object - size_t zleft = 0; // zeros left to write - 
size_t left = len; // bytes left - - block_t bstart = off / EBOFS_BLOCK_SIZE; - - if (off > on->object_size) { - zleft = off - on->object_size; - opos = on->object_size; - bstart = on->object_size / EBOFS_BLOCK_SIZE; - } - if (off+(off_t)len > on->object_size) { - dout(10) << "apply_write extending size on " << *on << ": " << on->object_size - << " -> " << off+len << dendl; - on->object_size = off+len; - dirty_onode(on); - } - if (bl.length() == 0) { - zleft += len; - left = 0; - } else { - assert(bl.length() == len); - } - if (zleft) - dout(10) << "apply_write zeroing " << zleft << " bytes before " << off << "~" << len - << " in " << *on << dendl; - - block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; - block_t blen = blast-bstart+1; - - // allocate write on disk. - interval_set alloc; - block_t old_bfirst = 0; // zero means not defined here (since we ultimately pass to bh_read) - block_t old_blast = 0; - alloc_write(on, bstart, blen, alloc, old_bfirst, old_blast); - dout(20) << "apply_write old_bfirst " << old_bfirst << ", old_blast " << old_blast << dendl; - - if (fake_writes) { - on->uncommitted.clear(); // worst case! - return; - } - - // map b range onto buffer_heads - map hits; - oc->map_write(bstart, blen, hits, super_epoch); - - // get current versions - //version_t lowv, highv; - //oc->scan_versions(bstart, blen, lowv, highv); - //highv++; - version_t highv = ++oc->write_count; - - // copy from bl into buffer cache - unsigned blpos = 0; // byte pos in input buffer - - // write data into buffers - for (map::iterator i = hits.begin(); - i != hits.end(); - i++) { - BufferHead *bh = i->second; - bh->set_version(highv); - bh->epoch_modified = super_epoch; - - // old write in progress? 
- if (bh->is_tx()) { // copy the buffer to avoid munging up in-flight write - dout(10) << "apply_write tx pending, copying buffer on " << *bh << dendl; - bufferlist temp; - temp.claim(bh->data); - //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); - bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - bh->data.copy_in(0, bh->length()*EBOFS_BLOCK_SIZE, temp); - } - - // need to split off partial? (partials can only be ONE block) - if ((bh->is_missing() || bh->is_rx()) && bh->length() > 1) { - if ((bh->start() == bstart && opos % EBOFS_BLOCK_SIZE != 0)) { - BufferHead *right = bc.split(bh, bh->start()+1); - hits[right->start()] = right; - dout(10) << "apply_write split off left block for partial write; rest is " << *right << dendl; - } - if ((bh->last() == blast && (len+off) % EBOFS_BLOCK_SIZE != 0) && - ((off_t)len+off < on->object_size)) { - BufferHead *right = bc.split(bh, bh->last()); - hits[right->start()] = right; - dout(10) << "apply_write split off right block for upcoming partial write; rest is " << *right << dendl; - } - } - - // partial at head or tail? - if ((bh->start() == bstart && opos % EBOFS_BLOCK_SIZE != 0) || // opos, not off, in case we're zeroing... 
- (bh->last() == blast && ((off_t)len+off) % EBOFS_BLOCK_SIZE != 0 && ((off_t)len+off) < on->object_size)) { - // locate ourselves in bh - unsigned off_in_bh = opos - bh->start()*EBOFS_BLOCK_SIZE; - assert(off_in_bh >= 0); - unsigned len_in_bh = MIN( (off_t)(zleft+left), - (off_t)(bh->end()*EBOFS_BLOCK_SIZE)-opos ); - - if (bh->is_partial() || bh->is_rx() || bh->is_missing()) { - assert(bh->is_partial() || bh->is_rx() || bh->is_missing()); - assert(bh->length() == 1); - - // add frag to partial - dout(10) << "apply_write writing into partial " << *bh << ":" - << " off_in_bh " << off_in_bh - << " len_in_bh " << len_in_bh - << dendl; - unsigned z = MIN( zleft, len_in_bh ); - if (z) { - bufferptr zp(z); - zp.zero(); - bufferlist zb; - zb.push_back(zp); - bh->add_partial(off_in_bh, zb); - zleft -= z; - opos += z; - } - - bufferlist sb; - sb.substr_of(bl, blpos, len_in_bh-z); // substr in existing buffer - bufferlist cp; - cp.append(sb.c_str(), len_in_bh-z); // copy the partial bit! - bh->add_partial(off_in_bh, cp); - left -= len_in_bh-z; - blpos += len_in_bh-z; - opos += len_in_bh-z; - - if (bh->partial_is_complete(on->object_size - bh->start()*EBOFS_BLOCK_SIZE)) { - dout(10) << "apply_write completed partial " << *bh << dendl; - //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); // new buffers! - bh->data.clear(); - bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - bh->data.zero(); - bh->apply_partial(); - bc.mark_dirty(bh); - bc.bh_write(on, bh); - } - else if (bh->is_rx()) { - dout(10) << "apply_write rx -> partial " << *bh << dendl; - assert(bh->length() == 1); - bc.mark_partial(bh); - bc.bh_queue_partial_write(on, bh); // queue the eventual write - } - else if (bh->is_missing()) { - dout(10) << "apply_write missing -> partial " << *bh << dendl; - assert(bh->length() == 1); - bc.mark_partial(bh); - - // take care to read from _old_ disk block locations! 
- if (bh->start() == bstart) - bc.bh_read(on, bh, old_bfirst); - else if (bh->start() == blast) - bc.bh_read(on, bh, old_blast); - else assert(0); - - bc.bh_queue_partial_write(on, bh); // queue the eventual write - } - else if (bh->is_partial()) { - dout(10) << "apply_write already partial, no need to submit rx on " << *bh << dendl; - if (bh->partial_tx_epoch == super_epoch) - bc.bh_cancel_partial_write(bh); - bc.bh_queue_partial_write(on, bh); // queue the eventual write - } - - - } else { - assert(bh->is_clean() || bh->is_dirty() || bh->is_tx()); - - // just write into the bh! - dout(10) << "apply_write writing leading/tailing partial into " << *bh << ":" - << " off_in_bh " << off_in_bh - << " len_in_bh " << len_in_bh - << dendl; - - // copy data into new buffers first (copy on write!) - // FIXME: only do the modified pages? this might be a big bh! - bufferlist temp; - temp.claim(bh->data); - //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); - bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - bh->data.copy_in(0, bh->length()*EBOFS_BLOCK_SIZE, temp); - - unsigned z = MIN( zleft, len_in_bh ); - if (z) { - bufferptr zp(z); - zp.zero(); - bufferlist zb; - zb.push_back(zp); - bh->data.copy_in(off_in_bh, z, zb); - zleft -= z; - opos += z; - } - - bufferlist sub; - sub.substr_of(bl, blpos, len_in_bh-z); - bh->data.copy_in(off_in_bh+z, len_in_bh-z, sub); - - blpos += len_in_bh-z; - left -= len_in_bh-z; - opos += len_in_bh-z; - - if (!bh->is_dirty()) - bc.mark_dirty(bh); - - bc.bh_write(on, bh); - } - continue; - } - - // ok, we're talking full block(s) now (modulo last block of the object) - assert(opos % EBOFS_BLOCK_SIZE == 0); - assert((off_t)(zleft+left) >= (off_t)(EBOFS_BLOCK_SIZE*bh->length()) || - opos+(off_t)(zleft+left) == on->object_size); - - unsigned len_in_bh = MIN(bh->length()*EBOFS_BLOCK_SIZE, zleft+left); - assert(len_in_bh <= zleft+left); - - dout(10) << "apply_write writing into " << *bh << ":" - << " 
len_in_bh " << len_in_bh - << dendl; - - // i will write: - unsigned z = MIN(len_in_bh, zleft); - bufferlist sub; - sub.substr_of(bl, blpos, len_in_bh-z); - - if (!z && - sub.is_page_aligned() && - sub.is_n_page_sized()) { - // assume caller isn't going to modify written buffers. - // just refrence them! - dout(10) << "apply_write yippee, written buffer already page aligned" << dendl; - bh->data.claim(sub); - } else { - // alloc new buffers. - bh->data.clear(); - bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - - if (z) { - bufferptr zp(z); - zp.zero(); - bufferlist zb; - zb.push_back(zp); - bh->data.copy_in(0, z, zb); - zleft -= z; - } - - bufferlist sub; - sub.substr_of(bl, blpos, len_in_bh-z); - bh->data.copy_in(z, len_in_bh-z, sub); - } - - blpos += len_in_bh-z; - left -= len_in_bh-z; - opos += len_in_bh; - - // old partial? - if (bh->is_partial() && - bh->partial_tx_epoch == super_epoch) - bc.bh_cancel_partial_write(bh); - - // mark dirty - if (!bh->is_dirty()) - bc.mark_dirty(bh); - - bc.bh_write(on, bh); - } - - assert(zleft == 0); - assert(left == 0); - assert(opos == off+(off_t)len); - //assert(blpos == bl.length()); -} - - - - -// *** file i/o *** - -bool Ebofs::attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, - Cond *will_wait_on, bool *will_wait_on_bool) -{ - dout(10) << "attempt_read " << *on << " " << off << "~" << len << dendl; - ObjectCache *oc = on->get_oc(&bc); - - // map - block_t bstart = off / EBOFS_BLOCK_SIZE; - block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; - block_t blen = blast-bstart+1; - - map hits; - map missing; // read these - map rx; // wait for these - map partials; // ?? - oc->map_read(bstart, blen, hits, missing, rx, partials); - - // missing buffers? 
- if (!missing.empty()) { - for (map::iterator i = missing.begin(); - i != missing.end(); - i++) { - dout(10) << "attempt_read missing buffer " << *(i->second) << dendl; - bc.bh_read(on, i->second); - } - BufferHead *wait_on = missing.begin()->second; - block_t b = MAX(wait_on->start(), bstart); - wait_on->waitfor_read[b].push_back(new C_Cond(will_wait_on, will_wait_on_bool)); - return false; - } - - // are partials sufficient? - bool partials_ok = true; - for (map::iterator i = partials.begin(); - i != partials.end(); - i++) { - BufferHead *bh = i->second; - off_t bhstart = (off_t)(bh->start()*EBOFS_BLOCK_SIZE); - off_t bhend = (off_t)(bh->end()*EBOFS_BLOCK_SIZE); - off_t start = MAX( off, bhstart ); - off_t end = MIN( off+(off_t)len, bhend ); - - if (!i->second->have_partial_range(start-bhstart, end-bhend)) { - if (partials_ok) { - // wait on this one - Context *c = new C_Cond(will_wait_on, will_wait_on_bool); - dout(10) << "attempt_read insufficient partial buffer " << *(i->second) << " c " << c << dendl; - i->second->waitfor_read[i->second->start()].push_back(c); - } - partials_ok = false; - } - } - if (!partials_ok) return false; - - // wait on rx? - if (!rx.empty()) { - BufferHead *wait_on = rx.begin()->second; - Context *c = new C_Cond(will_wait_on, will_wait_on_bool); - dout(20) << "attempt_read waiting for read to finish on " << *wait_on << " c " << c << dendl; - block_t b = MAX(wait_on->start(), bstart); - wait_on->waitfor_read[b].push_back(c); - return false; - } - - // yay, we have it all! - // concurrently walk thru hits, partials. 
- map::iterator h = hits.begin(); - map::iterator p = partials.begin(); - - bl.clear(); - off_t pos = off; - block_t curblock = bstart; - while (curblock <= blast) { - BufferHead *bh = 0; - if (h->first == curblock) { - bh = h->second; - h++; - } else if (p->first == curblock) { - bh = p->second; - p++; - } else assert(0); - - off_t bhstart = (off_t)(bh->start()*EBOFS_BLOCK_SIZE); - off_t bhend = (off_t)(bh->end()*EBOFS_BLOCK_SIZE); - off_t start = MAX( pos, bhstart ); - off_t end = MIN( off+(off_t)len, bhend ); - - if (bh->is_partial()) { - // copy from a partial block. yuck! - bufferlist frag; - bh->copy_partial_substr( start-bhstart, end-bhstart, frag ); - bl.claim_append( frag ); - pos += frag.length(); - } else { - // copy from a full block. - if (bhstart == start && bhend == end) { - bl.append( bh->data ); - pos += bh->data.length(); - } else { - bufferlist frag; - dout(10) << "substr " << (start-bhstart) << "~" << (end-start) << " of " << bh->data.length() << " in " << *bh << dendl; - frag.substr_of(bh->data, start-bhstart, end-start); - pos += frag.length(); - bl.claim_append( frag ); - } - } - - curblock = bh->end(); - /* this assert is more trouble than it's worth - assert((off_t)(curblock*EBOFS_BLOCK_SIZE) == pos || // should be aligned with next block - end != bhend || // or we ended midway through bh - (bh->last() == blast && end == bhend)); // ended last block ** FIXME WRONG??? - */ - } - - assert(bl.length() == len); - return true; -} - - -/* - * is_cached -- query whether a object extent is in our cache - * return value of -1 if onode isn't loaded. otherwise, the number - * of extents that need to be read (i.e. # of seeks) - */ -int Ebofs::is_cached(object_t oid, off_t off, size_t len) -{ - ebofs_lock.Lock(); - int r = _is_cached(oid, off, len); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_is_cached(object_t oid, off_t off, size_t len) -{ - if (!have_onode(oid)) { - dout(7) << "_is_cached " << oid << " " << off << "~" << len << " ... 
onode " << dendl; - return -1; // object dne? - } - Onode *on = get_onode(oid); - - if (!on->have_oc()) { - // nothing is cached. return # of extents in file. - dout(10) << "_is_cached have onode but no object cache, returning extent count" << dendl; - return on->extent_map.size(); - } - - // map - block_t bstart = off / EBOFS_BLOCK_SIZE; - block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; - block_t blen = blast-bstart+1; - - map hits; - map missing; // read these - map rx; // wait for these - map partials; // ?? - - int num_missing = on->get_oc(&bc)->try_map_read(bstart, blen); - dout(7) << "_is_cached try_map_read reports " << num_missing << " missing extents" << dendl; - return num_missing; - - // FIXME: actually, we should calculate if these extents are contiguous. - // and not using map_read, probably... - /* hrmpf - block_t dpos = 0; - block_t opos = bstart; - while (opos < blen) { - if (hits.begin()->first == opos) { - } else { - block_t d; - if (missing.begin()->first == opos) d = missing.begin()->second. - - } - */ -} - -void Ebofs::trim_from_cache(object_t oid, off_t off, size_t len) -{ - ebofs_lock.Lock(); - _trim_from_cache(oid, off, len); - ebofs_lock.Unlock(); -} - -void Ebofs::_trim_from_cache(object_t oid, off_t off, size_t len) -{ - // be careful not to load it if we don't have it - if (!have_onode(oid)) { - dout(7) << "_trim_from_cache " << oid << " " << off << "~" << len << " ... onode not in cache " << dendl; - return; - } - - // ok, we have it, get a pointer. - Onode *on = get_onode(oid); - - if (!on->have_oc()) - return; // nothing is cached. 
- - // map to blocks - block_t bstart = off / EBOFS_BLOCK_SIZE; - block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; - - ObjectCache *oc = on->get_oc(&bc); - oc->touch_bottom(bstart, blast); - - return; -} - - -int Ebofs::read(object_t oid, - off_t off, size_t len, - bufferlist& bl) -{ - ebofs_lock.Lock(); - int r = _read(oid, off, len, bl); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_read(object_t oid, off_t off, size_t len, bufferlist& bl) -{ - dout(7) << "_read " << oid << " " << off << "~" << len << dendl; - - Onode *on = get_onode(oid); - if (!on) { - dout(7) << "_read " << oid << " " << off << "~" << len << " ... dne " << dendl; - return -ENOENT; // object dne? - } - - // read data into bl. block as necessary. - Cond cond; - - int r = 0; - while (1) { - // check size bound - if (off >= on->object_size) { - dout(7) << "_read " << oid << " " << off << "~" << len << " ... off past eof " << on->object_size << dendl; - r = -ESPIPE; // FIXME better errno? - break; - } - - size_t try_len = len ? len:on->object_size; - size_t will_read = MIN(off+(off_t)try_len, on->object_size) - off; - - bool done; - if (attempt_read(on, off, will_read, bl, &cond, &done)) - break; // yay - - // wait - while (!done) - cond.Wait(ebofs_lock); - - if (on->deleted) { - dout(7) << "_read " << oid << " " << off << "~" << len << " ... object deleted" << dendl; - r = -ENOENT; - break; - } - } - - put_onode(on); - - trim_bc(); - - if (r < 0) return r; // return error, - dout(7) << "_read " << oid << " " << off << "~" << len << " ... got " << bl.length() << dendl; - return bl.length(); // or bytes read. 
-} - - -bool Ebofs::_write_will_block() -{ - return (bc.get_stat_dirty()+bc.get_stat_tx() > g_conf.ebofs_bc_max_dirty); -} - -bool Ebofs::write_will_block() -{ - ebofs_lock.Lock(); - bool b = _write_will_block(); - ebofs_lock.Unlock(); - return b; -} - - -unsigned Ebofs::apply_transaction(Transaction& t, Context *onsafe) -{ - ebofs_lock.Lock(); - dout(7) << "apply_transaction start (" << t.get_num_ops() << " ops)" << dendl; - - unsigned r = _apply_transaction(t); - - // journal, wait for commit - if (r != 0 && onsafe) { - delete onsafe; // kill callback, but still journal below (in case transaction had side effects) - onsafe = 0; - } - while (1) { - if (journal) { - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - - ebofs_lock.Unlock(); - return r; -} - -unsigned Ebofs::_apply_transaction(Transaction& t) -{ - // do ops - unsigned r = 0; // bit fields indicate which ops failed. - int bit = 1; - while (t.have_op()) { - int op = t.get_op(); - switch (op) { - case Transaction::OP_READ: - { - object_t oid; - t.get_oid(oid); - off_t offset, len; - t.get_length(offset); - t.get_length(len); - bufferlist *pbl; - t.get_pbl(pbl); - if (_read(oid, offset, len, *pbl) < 0) { - dout(7) << "apply_transaction fail on _read" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_STAT: - { - object_t oid; - t.get_oid(oid); - struct stat *st; - t.get_pstat(st); - if (_stat(oid, st) < 0) { - dout(7) << "apply_transaction fail on _stat" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_GETATTR: - { - object_t oid; - t.get_oid(oid); - const char *attrname; - t.get_attrname(attrname); - pair pattrval; - t.get_pattrval(pattrval); - if ((*(pattrval.second) = _getattr(oid, attrname, pattrval.first, *(pattrval.second))) < 0) { - dout(7) << "apply_transaction fail on _getattr" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_GETATTRS: - { - 
object_t oid; - t.get_oid(oid); - map *pset; - t.get_pattrset(pset); - if (_getattrs(oid, *pset) < 0) { - dout(7) << "apply_transaction fail on _getattrs" << dendl; - r &= bit; - } - } - break; - - - case Transaction::OP_WRITE: - { - object_t oid; - t.get_oid(oid); - off_t offset, len; - t.get_length(offset); - t.get_length(len); - bufferlist bl; - t.get_bl(bl); - if (_write(oid, offset, len, bl) < 0) { - dout(7) << "apply_transaction fail on _write" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_TRIMCACHE: - { - object_t oid; - t.get_oid(oid); - off_t offset, len; - t.get_length(offset); - t.get_length(len); - _trim_from_cache(oid, offset, len); - } - break; - - case Transaction::OP_TRUNCATE: - { - object_t oid; - t.get_oid(oid); - off_t len; - t.get_length(len); - if (_truncate(oid, len) < 0) { - dout(7) << "apply_transaction fail on _truncate" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_REMOVE: - { - object_t oid; - t.get_oid(oid); - if (_remove(oid) < 0) { - dout(7) << "apply_transaction fail on _remove" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_SETATTR: - { - object_t oid; - t.get_oid(oid); - const char *attrname; - t.get_attrname(attrname); - bufferlist bl; - t.get_bl(bl); - if (_setattr(oid, attrname, bl.c_str(), bl.length()) < 0) { - dout(7) << "apply_transaction fail on _setattr" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_SETATTRS: - { - object_t oid; - t.get_oid(oid); - map *pattrset; - t.get_pattrset(pattrset); - if (_setattrs(oid, *pattrset) < 0) { - dout(7) << "apply_transaction fail on _setattrs" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_RMATTR: - { - object_t oid; - t.get_oid(oid); - const char *attrname; - t.get_attrname(attrname); - if (_rmattr(oid, attrname) < 0) { - dout(7) << "apply_transaction fail on _rmattr" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_CLONE: - { - object_t oid; - t.get_oid(oid); - object_t noid; - 
t.get_oid(noid); - if (_clone(oid, noid) < 0) { - dout(7) << "apply_transaction fail on _clone" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_MKCOLL: - { - coll_t cid; - t.get_cid(cid); - if (_create_collection(cid) < 0) { - dout(7) << "apply_transaction fail on _create_collection" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_RMCOLL: - { - coll_t cid; - t.get_cid(cid); - if (_destroy_collection(cid) < 0) { - dout(7) << "apply_transaction fail on _destroy_collection" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_COLL_ADD: - { - coll_t cid; - t.get_cid(cid); - object_t oid; - t.get_oid(oid); - if (_collection_add(cid, oid) < 0) { - //dout(7) << "apply_transaction fail on _collection_add" << dendl; - //r &= bit; - } - } - break; - - case Transaction::OP_COLL_REMOVE: - { - coll_t cid; - t.get_cid(cid); - object_t oid; - t.get_oid(oid); - if (_collection_remove(cid, oid) < 0) { - dout(7) << "apply_transaction fail on _collection_remove" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_COLL_SETATTR: - { - coll_t cid; - t.get_cid(cid); - const char *attrname; - t.get_attrname(attrname); - bufferlist bl; - t.get_bl(bl); - if (_collection_setattr(cid, attrname, bl.c_str(), bl.length()) < 0) { - //if (_collection_setattr(cid, attrname, attrval.first, attrval.second) < 0) { - dout(7) << "apply_transaction fail on _collection_setattr" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_COLL_RMATTR: - { - coll_t cid; - t.get_cid(cid); - const char *attrname; - t.get_attrname(attrname); - if (_collection_rmattr(cid, attrname) < 0) { - dout(7) << "apply_transaction fail on _collection_rmattr" << dendl; - r &= bit; - } - } - break; - - default: - dout(0) << "bad op " << op << dendl; - assert(0); - } - - bit = bit << 1; - } - - dout(7) << "_apply_transaction finish (r = " << r << ")" << dendl; - return r; -} - - - -int Ebofs::_write(object_t oid, off_t offset, size_t length, const bufferlist& bl) -{ - 
dout(7) << "_write " << oid << " " << offset << "~" << length << dendl; - assert(bl.length() == length); - - // too much unflushed dirty data? (if so, block!) - if (_write_will_block()) { - dout(10) << "_write blocking " - << oid << " " << offset << "~" << length - << " bc: " - << "size " << bc.get_size() - << ", trimmable " << bc.get_trimmable() - << ", max " << g_conf.ebofs_bc_size - << "; dirty " << bc.get_stat_dirty() - << ", tx " << bc.get_stat_tx() - << ", max dirty " << g_conf.ebofs_bc_max_dirty - << dendl; - - while (_write_will_block()) - bc.waitfor_stat(); // waits on ebofs_lock - - dout(10) << "_write unblocked " - << oid << " " << offset << "~" << length - << " bc: " - << "size " << bc.get_size() - << ", trimmable " << bc.get_trimmable() - << ", max " << g_conf.ebofs_bc_size - << "; dirty " << bc.get_stat_dirty() - << ", tx " << bc.get_stat_tx() - << ", max dirty " << g_conf.ebofs_bc_max_dirty - << dendl; - } - - // out of space? - unsigned max = (length+offset) / EBOFS_BLOCK_SIZE + 10; // very conservative; assumes we have to rewrite - max += dirty_onodes.size() + dirty_cnodes.size(); - if (max >= free_blocks) { - dout(1) << "write failing, only " << free_blocks << " blocks free, may need up to " << max << dendl; - return -ENOSPC; - } - - // get|create inode - Onode *on = get_onode(oid); - if (!on) on = new_onode(oid); // new inode! - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - dirty_onode(on); // dirty onode! - - // apply write to buffer cache - if (length > 0) - apply_write(on, offset, length, bl); - - // done. 
- put_onode(on); - trim_bc(); - - return length; -} - - -int Ebofs::write(object_t oid, - off_t off, size_t len, - const bufferlist& bl, Context *onsafe) -{ - ebofs_lock.Lock(); - - // go - int r = _write(oid, off, len, bl); - - // commit waiter - if (r > 0) { - assert((size_t)r == len); - while (1) { - if (journal) { - Transaction t; - t.write(oid, off, len, bl); - bufferlist tbl; - t._encode(tbl); - if (journal->submit_entry(tbl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - - -int Ebofs::_remove(object_t oid) -{ - dout(7) << "_remove " << oid << dendl; - - // get inode - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - // ok remove it! - remove_onode(on); - - return 0; -} - - -int Ebofs::remove(object_t oid, Context *onsafe) -{ - ebofs_lock.Lock(); - - // do it - int r = _remove(oid); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.remove(oid); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_truncate(object_t oid, off_t size) -{ - dout(7) << "_truncate " << oid << " size " << size << dendl; - - Onode *on = get_onode(oid); - if (!on) - return -ENOENT; - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - int r = 0; - if (size > on->object_size) { - r = -EINVAL; // whatever - } - else if (size < on->object_size) { - // change size - on->object_size = size; - dirty_onode(on); - - // free blocks - block_t nblocks = 0; - if (size) nblocks = 1 + (size-1) / EBOFS_BLOCK_SIZE; - if (on->object_blocks > nblocks) { - vector extra; - on->truncate_extents(nblocks, extra); - for (unsigned i=0; ioc) { - on->oc->truncate(on->object_blocks, super_epoch); - if 
(on->oc->is_empty()) - on->close_oc(); - } - - // update uncommitted - interval_set uncom; - if (nblocks > 0) { - interval_set left; - left.insert(0, nblocks); - uncom.intersection_of(left, on->uncommitted); - } - dout(10) << "uncommitted was " << on->uncommitted << " now " << uncom << dendl; - on->uncommitted = uncom; - - } - else { - assert(size == on->object_size); - } - - put_onode(on); - return r; -} - - -int Ebofs::truncate(object_t oid, off_t size, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _truncate(oid, size); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.truncate(oid, size); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - - - -int Ebofs::clone(object_t from, object_t to, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _clone(from, to); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.clone(from, to); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_clone(object_t from, object_t to) -{ - dout(7) << "_clone " << from << " -> " << to << dendl; - - if (!g_conf.ebofs_cloneable) - return -1; // no! 
- - Onode *fon = get_onode(from); - if (!fon) return -ENOENT; - Onode *ton = get_onode(to); - if (ton) { - put_onode(fon); - put_onode(ton); - return -EEXIST; - } - ton = new_onode(to); - assert(ton); - - // copy easy bits - ton->readonly = true; - ton->object_size = fon->object_size; - ton->object_blocks = fon->object_blocks; - ton->attr = fon->attr; - - // collections - for (set::iterator p = fon->collections.begin(); - p != fon->collections.end(); - p++) - _collection_add(*p, to); - - // extents - ton->extent_map = fon->extent_map; - for (map::iterator p = ton->extent_map.begin(); - p != ton->extent_map.end(); - ++p) { - allocator.alloc_inc(p->second); - } - - // clear uncommitted - fon->uncommitted.clear(); - - // muck with ObjectCache - if (fon->oc) - fon->oc->clone_to( ton ); - - // ok! - put_onode(ton); - put_onode(fon); - return 0; -} - - - - -/* - * pick object revision with rev < specified rev. - * (oid.rev is a noninclusive upper bound.) - * - */ -int Ebofs::pick_object_revision_lt(object_t& oid) -{ - assert(oid.rev > 0); // this is only useful for non-zero oid.rev - - int r = -EEXIST; // return code - ebofs_lock.Lock(); - { - object_t orig = oid; - object_t live = oid; - live.rev = 0; - - if (object_tab->get_num_keys() > 0) { - Table::Cursor cursor(object_tab); - - object_tab->find(oid, cursor); // this will be just _past_ highest eligible rev - if (cursor.move_left() > 0) { - bool firstpass = true; - while (1) { - object_t t = cursor.current().key; - if (t.ino != oid.ino || - t.bno != oid.bno) // passed to previous object - break; - if (oid.rev < t.rev) { // rev < desired. possible match. - r = 0; - oid = t; - break; - } - if (firstpass && oid.rev >= t.rev) { // there is no old rev < desired. try live. 
- r = 0; - oid = live; - break; - } - if (cursor.move_left() <= 0) break; - firstpass = false; - } - } - } - - dout(8) << "find_object_revision " << orig << " -> " << oid - << " r=" << r << dendl; - } - ebofs_lock.Unlock(); - return r; -} - - - - -bool Ebofs::exists(object_t oid) -{ - ebofs_lock.Lock(); - dout(8) << "exists " << oid << dendl; - bool e = (object_tab->lookup(oid) == 0); - ebofs_lock.Unlock(); - return e; -} - -int Ebofs::stat(object_t oid, struct stat *st) -{ - ebofs_lock.Lock(); - int r = _stat(oid,st); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_stat(object_t oid, struct stat *st) -{ - dout(7) << "_stat " << oid << dendl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - // ?? - st->st_size = on->object_size; - - put_onode(on); - return 0; -} - - -int Ebofs::_setattr(object_t oid, const char *name, const void *value, size_t size) -{ - dout(8) << "setattr " << oid << " '" << name << "' len " << size << dendl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - string n(name); - on->attr[n] = buffer::copy((char*)value, size); - dirty_onode(on); - put_onode(on); - - dout(8) << "setattr " << oid << " '" << name << "' len " << size << " success" << dendl; - - return 0; -} - -int Ebofs::setattr(object_t oid, const char *name, const void *value, size_t size, Context *onsafe) -{ - ebofs_lock.Lock(); - int r = _setattr(oid, name, value, size); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.setattr(oid, name, value, size); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_setattrs(object_t oid, map& attrset) -{ - dout(8) << "setattrs " << oid << dendl; - - Onode *on = get_onode(oid); - if (!on) return 
-ENOENT; - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - on->attr = attrset; - dirty_onode(on); - put_onode(on); - return 0; -} - -int Ebofs::setattrs(object_t oid, map& attrset, Context *onsafe) -{ - ebofs_lock.Lock(); - int r = _setattrs(oid, attrset); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.setattrs(oid, attrset); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - - -int Ebofs::get_object_collections(object_t oid, set& ls) -{ - ebofs_lock.Lock(); - int r = _get_object_collections(oid, ls); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_get_object_collections(object_t oid, set& ls) -{ - dout(8) << "_get_object_collections " << oid << dendl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - ls = on->collections; - put_onode(on); - return 0; -} - -int Ebofs::getattr(object_t oid, const char *name, void *value, size_t size) -{ - ebofs_lock.Lock(); - int r = _getattr(oid, name, value, size); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_getattr(object_t oid, const char *name, void *value, size_t size) -{ - dout(8) << "_getattr " << oid << " '" << name << "' maxlen " << size << dendl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - string n(name); - int r = 0; - if (on->attr.count(n) == 0) { - dout(10) << "_getattr " << oid << " '" << name << "' dne" << dendl; - r = -1; - } else { - r = MIN( on->attr[n].length(), size ); - dout(10) << "_getattr " << oid << " '" << name << "' got len " << r << dendl; - memcpy(value, on->attr[n].c_str(), r ); - } - put_onode(on); - return r; -} - -int Ebofs::getattrs(object_t oid, map &aset) -{ - ebofs_lock.Lock(); - int r = _getattrs(oid, aset); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_getattrs(object_t oid, map &aset) 
-{ - dout(8) << "_getattrs " << oid << dendl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - aset = on->attr; - put_onode(on); - return 0; -} - - - -int Ebofs::_rmattr(object_t oid, const char *name) -{ - dout(8) << "_rmattr " << oid << " '" << name << "'" << dendl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - string n(name); - on->attr.erase(n); - dirty_onode(on); - put_onode(on); - return 0; -} - -int Ebofs::rmattr(object_t oid, const char *name, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _rmattr(oid, name); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.rmattr(oid, name); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::listattr(object_t oid, vector& attrs) -{ - ebofs_lock.Lock(); - dout(8) << "listattr " << oid << dendl; - - Onode *on = get_onode(oid); - if (!on) { - ebofs_lock.Unlock(); - return -ENOENT; - } - - attrs.clear(); - for (map::iterator i = on->attr.begin(); - i != on->attr.end(); - i++) { - attrs.push_back(i->first); - } - - put_onode(on); - ebofs_lock.Unlock(); - return 0; -} - -int Ebofs::list_objects(list& ls) -{ - ebofs_lock.Lock(); - dout(9) << "list_objects " << dendl; - - Table::Cursor cursor(object_tab); - - int num = 0; - if (object_tab->find(object_t(), cursor) >= 0) { - while (1) { - ls.push_back(cursor.current().key); - num++; - if (cursor.move_right() <= 0) break; - } - } - - ebofs_lock.Unlock(); - return num; -} - - -/***************** collections ******************/ - -int Ebofs::list_collections(list& ls) -{ - ebofs_lock.Lock(); - dout(9) << "list_collections " << dendl; - - Table::Cursor cursor(collection_tab); - - int num = 0; - if (collection_tab->find(0, cursor) >= 0) { 
- while (1) { - ls.push_back(cursor.current().key); - num++; - if (cursor.move_right() <= 0) break; - } - } - - ebofs_lock.Unlock(); - return num; -} - -int Ebofs::_create_collection(coll_t cid) -{ - dout(9) << "_create_collection " << hex << cid << dec << dendl; - - if (_collection_exists(cid)) - return -EEXIST; - - Cnode *cn = new_cnode(cid); - put_cnode(cn); - - return 0; -} - -int Ebofs::create_collection(coll_t cid, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _create_collection(cid); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.create_collection(cid); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_destroy_collection(coll_t cid) -{ - dout(9) << "_destroy_collection " << hex << cid << dec << dendl; - - if (!_collection_exists(cid)) - return -ENOENT; - - Cnode *cn = get_cnode(cid); - assert(cn); - - // hose mappings - list objects; - collection_list(cid, objects); - for (list::iterator i = objects.begin(); - i != objects.end(); - i++) { - co_tab->remove(coll_object_t(cid,*i)); - - Onode *on = get_onode(*i); - if (on) { - on->collections.erase(cid); - dirty_onode(on); - put_onode(on); - } - } - - remove_cnode(cn); - return 0; -} - -int Ebofs::destroy_collection(coll_t cid, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _destroy_collection(cid); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.remove_collection(cid); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -bool Ebofs::collection_exists(coll_t cid) -{ - ebofs_lock.Lock(); - dout(10) << 
"collection_exists " << hex << cid << dec << dendl; - bool r = _collection_exists(cid); - ebofs_lock.Unlock(); - return r; -} -bool Ebofs::_collection_exists(coll_t cid) -{ - return (collection_tab->lookup(cid) == 0); -} - -int Ebofs::_collection_add(coll_t cid, object_t oid) -{ - dout(9) << "_collection_add " << hex << cid << " object " << oid << dec << dendl; - - if (!_collection_exists(cid)) - return -ENOENT; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - int r = 0; - - if (on->collections.count(cid) == 0) { - on->collections.insert(cid); - dirty_onode(on); - co_tab->insert(coll_object_t(cid,oid), true); - } else { - r = -ENOENT; // FIXME? already in collection. - } - - put_onode(on); - return r; -} - -int Ebofs::collection_add(coll_t cid, object_t oid, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _collection_add(cid, oid); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.collection_add(cid, oid); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return 0; -} - -int Ebofs::_collection_remove(coll_t cid, object_t oid) -{ - dout(9) << "_collection_remove " << hex << cid << " object " << oid << dec << dendl; - - if (!_collection_exists(cid)) - return -ENOENT; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - int r = 0; - - if (on->collections.count(cid)) { - on->collections.erase(cid); - dirty_onode(on); - co_tab->remove(coll_object_t(cid,oid)); - } else { - r = -ENOENT; // FIXME? 
- } - - put_onode(on); - return r; -} - -int Ebofs::collection_remove(coll_t cid, object_t oid, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _collection_remove(cid, oid); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.collection_remove(cid, oid); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return 0; -} - -int Ebofs::collection_list(coll_t cid, list& ls) -{ - ebofs_lock.Lock(); - dout(9) << "collection_list " << hex << cid << dec << dendl; - - if (!_collection_exists(cid)) { - ebofs_lock.Unlock(); - return -ENOENT; - } - - Table::Cursor cursor(co_tab); - - int num = 0; - if (co_tab->find(coll_object_t(cid,object_t()), cursor) >= 0) { - while (1) { - const coll_t c = cursor.current().key.first; - const object_t o = cursor.current().key.second; - if (c != cid) break; // end! 
- dout(10) << "collection_list " << hex << cid << " includes " << o << dec << dendl; - ls.push_back(o); - num++; - if (cursor.move_right() < 0) break; - } - } - - ebofs_lock.Unlock(); - return num; -} - - -int Ebofs::_collection_setattr(coll_t cid, const char *name, const void *value, size_t size) -{ - dout(10) << "_collection_setattr " << hex << cid << dec << " '" << name << "' len " << size << dendl; - - Cnode *cn = get_cnode(cid); - if (!cn) return -ENOENT; - - string n(name); - cn->attr[n] = buffer::copy((char*)value, size); - dirty_cnode(cn); - put_cnode(cn); - - return 0; -} - -int Ebofs::collection_setattr(coll_t cid, const char *name, const void *value, size_t size, Context *onsafe) -{ - ebofs_lock.Lock(); - dout(10) << "collection_setattr " << hex << cid << dec << " '" << name << "' len " << size << dendl; - - int r = _collection_setattr(cid, name, value, size); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.collection_setattr(cid, name, value, size); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return 0; -} - -int Ebofs::collection_getattr(coll_t cid, const char *name, void *value, size_t size) -{ - ebofs_lock.Lock(); - dout(10) << "collection_setattr " << hex << cid << dec << " '" << name << "' maxlen " << size << dendl; - - Cnode *cn = get_cnode(cid); - if (!cn) { - ebofs_lock.Unlock(); - return -ENOENT; - } - - string n(name); - int r; - if (cn->attr.count(n) == 0) { - r = -1; - } else { - r = MIN( cn->attr[n].length(), size ); - memcpy(value, cn->attr[n].c_str(), r); - } - - put_cnode(cn); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::collection_getattrs(coll_t cid, map &aset) -{ - ebofs_lock.Lock(); - int r = _collection_getattrs(cid, aset); - ebofs_lock.Unlock(); - return r; -} - -int 
Ebofs::_collection_getattrs(coll_t cid, map &aset) -{ - dout(8) << "_collection_getattrs " << cid << dendl; - - Cnode *cn = get_cnode(cid); - if (!cn) return -ENOENT; - aset = cn->attr; - put_cnode(cn); - return 0; -} - -int Ebofs::collection_setattrs(coll_t cid, map &aset) -{ - ebofs_lock.Lock(); - int r = _collection_setattrs(cid, aset); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_collection_setattrs(coll_t cid, map &aset) -{ - dout(8) << "_collection_setattrs " << cid << dendl; - - Cnode *cn = get_cnode(cid); - if (!cn) return -ENOENT; - cn->attr = aset; - dirty_cnode(cn); - put_cnode(cn); - return 0; -} - - -int Ebofs::_collection_rmattr(coll_t cid, const char *name) -{ - dout(10) << "_collection_rmattr " << hex << cid << dec << " '" << name << "'" << dendl; - - Cnode *cn = get_cnode(cid); - if (!cn) return -ENOENT; - - string n(name); - cn->attr.erase(n); - - dirty_cnode(cn); - put_cnode(cn); - - return 0; -} - -int Ebofs::collection_rmattr(coll_t cid, const char *name, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _collection_rmattr(cid, name); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.collection_rmattr(cid, name); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return 0; -} - -int Ebofs::collection_listattr(coll_t cid, vector& attrs) -{ - ebofs_lock.Lock(); - dout(10) << "collection_listattr " << hex << cid << dec << dendl; - - Cnode *cn = get_cnode(cid); - if (!cn) { - ebofs_lock.Unlock(); - return -ENOENT; - } - - attrs.clear(); - for (map::iterator i = cn->attr.begin(); - i != cn->attr.end(); - i++) { - attrs.push_back(i->first); - } - - put_cnode(cn); - ebofs_lock.Unlock(); - return 0; -} - - - -void Ebofs::_export_freelist(bufferlist& bl) -{ - for (int b=0; b<=EBOFS_NUM_FREE_BUCKETS; b++) { - Table 
*tab; - if (b < EBOFS_NUM_FREE_BUCKETS) { - tab = free_tab[b]; - } else { - tab = limbo_tab; - } - - if (tab->get_num_keys() > 0) { - Table::Cursor cursor(tab); - assert(tab->find(0, cursor) >= 0); - while (1) { - assert(cursor.current().value > 0); - - Extent ex(cursor.current().key, cursor.current().value); - dout(10) << "_export_freelist " << ex << dendl; - bl.append((char*)&ex, sizeof(ex)); - if (cursor.move_right() <= 0) break; - } - } - } -} - -void Ebofs::_import_freelist(bufferlist& bl) -{ - // clear - for (int b=0; bclear(); - limbo_tab->clear(); - - // import! - int num = bl.length() / sizeof(Extent); - Extent *p = (Extent*)bl.c_str(); - for (int i=0; i *tab; - if (b < EBOFS_NUM_FREE_BUCKETS) { - tab = free_tab[b]; - dout(30) << "dump bucket " << b << " " << tab->get_num_keys() << dendl; - } else { - tab = limbo_tab; - dout(30) << "dump limbo " << tab->get_num_keys() << dendl;; - } - - if (tab->get_num_keys() > 0) { - Table::Cursor cursor(tab); - assert(tab->find(0, cursor) >= 0); - while (1) { - assert(cursor.current().value > 0); - - block_t l = cursor.current().value; - tfree += l; - int b = 0; - do { - l = l >> 1; - b++; - } while (l); - st.free_extent_dist[b]++; - st.free_extent_dist_sum[b] += cursor.current().value; - st.num_free_extent++; - - if (cursor.move_right() <= 0) break; - } - } - } - st.avg_free_extent = tfree / st.num_free_extent; -*/ - - // used extents is harder. 
:( - st.num_extent = 0; - st.avg_extent = 0; - st.extent_dist.clear(); - st.extent_dist_sum.clear(); - st.avg_extent_per_object = 0; - st.avg_extent_jump = 0; - - Table::Cursor cursor(object_tab); - object_tab->find(object_t(), cursor); - int nobj = 0; - int njump = 0; - while (object_tab->get_num_keys() > 0) { - Onode *on = get_onode(cursor.current().key); - assert(on); - - nobj++; - st.avg_extent_per_object += on->extent_map.size(); - - for (map::iterator p = on->extent_map.begin(); - p != on->extent_map.end(); - p++) { - block_t l = p->second.length; - - st.num_extent++; - st.avg_extent += l; - if (p->first > 0) { - njump++; - st.avg_extent_jump += l; - } - - int b = 0; - do { - l = l >> 1; - b++; - } while (l); - st.extent_dist[b]++; - st.extent_dist_sum[b] += p->second.length; - } - put_onode(on); - if (cursor.move_right() <= 0) break; - } - if (njump) st.avg_extent_jump /= njump; - if (nobj) st.avg_extent_per_object /= (float)nobj; - if (st.num_extent) st.avg_extent /= st.num_extent; - - ebofs_lock.Unlock(); -} diff --git a/branches/sage/crush/ebofs/Ebofs.h b/branches/sage/crush/ebofs/Ebofs.h deleted file mode 100644 index 13eebabd93aad..0000000000000 --- a/branches/sage/crush/ebofs/Ebofs.h +++ /dev/null @@ -1,370 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "include/Context.h" -#include "include/buffer.h" -#include "include/hash.h" - -#include "types.h" -#include "Onode.h" -#include "Cnode.h" -#include "BlockDevice.h" -#include "nodes.h" -#include "Allocator.h" -#include "Table.h" -#include "Journal.h" - -#include "common/Mutex.h" -#include "common/Cond.h" - -#include "osd/ObjectStore.h" - -//typedef pair object_coll_t; -typedef pair coll_object_t; - - -class Ebofs : public ObjectStore { -protected: - Mutex ebofs_lock; // a beautiful global lock - - // ** debuggy ** - bool fake_writes; - - // ** super ** -public: - BlockDevice dev; -protected: - bool mounted, unmounting, dirty; - bool readonly; - version_t super_epoch; - bool commit_thread_started, mid_commit; - Cond commit_cond; // to wake up the commit thread - Cond sync_cond; - uint64_t super_fsid; - - map > commit_waiters; - - void prepare_super(version_t epoch, bufferptr& bp); - void write_super(version_t epoch, bufferptr& bp); - int commit_thread_entry(); - - class CommitThread : public Thread { - Ebofs *ebofs; - public: - CommitThread(Ebofs *e) : ebofs(e) {} - void *entry() { - ebofs->commit_thread_entry(); - return 0; - } - } commit_thread; - -public: - uint64_t get_fsid() { return super_fsid; } - epoch_t get_super_epoch() { return super_epoch; } -protected: - - - // ** journal ** - char *journalfn; - Journal *journal; - - // ** allocator ** - block_t free_blocks, limbo_blocks; - Allocator allocator; - friend class Allocator; - - block_t get_free_blocks() { return free_blocks; } - block_t get_limbo_blocks() { return limbo_blocks; } - block_t get_free_extents() { - int n = 0; - for (int i=0; iget_num_keys(); - return n; - } - block_t get_limbo_extents() { return limbo_tab->get_num_keys(); } - - - // ** tables and sets ** - // nodes - NodePool nodepool; // for all tables... 
- - // tables - Table *object_tab; - Table *free_tab[EBOFS_NUM_FREE_BUCKETS]; - Table *limbo_tab; - Table > *alloc_tab; - - // collections - Table *collection_tab; - Table *co_tab; - - void close_tables(); - void verify_tables(); - - - // ** onodes ** - hash_map onode_map; // onode cache - LRU onode_lru; - set dirty_onodes; - map > waitfor_onode; - - Onode* new_onode(object_t oid); // make new onode. ref++. - bool have_onode(object_t oid) { - return onode_map.count(oid); - } - Onode* get_onode(object_t oid); // get cached onode, or read from disk. ref++. - void remove_onode(Onode *on); - void put_onode(Onode* o); // put it back down. ref--. - void dirty_onode(Onode* o); - void encode_onode(Onode *on, bufferlist& bl, unsigned& off); - void write_onode(Onode *on); - - // ** cnodes ** - hash_map > cnode_map; - LRU cnode_lru; - set dirty_cnodes; - map > waitfor_cnode; - - Cnode* new_cnode(coll_t cid); - Cnode* get_cnode(coll_t cid); - void remove_cnode(Cnode *cn); - void put_cnode(Cnode *cn); - void dirty_cnode(Cnode *cn); - void encode_cnode(Cnode *cn, bufferlist& bl, unsigned& off); - void write_cnode(Cnode *cn); - - // ** onodes+cnodes = inodes ** - int inodes_flushing; - Cond inode_commit_cond; - - void flush_inode_finish(); - void commit_inodes_start(); - void commit_inodes_wait(); - friend class C_E_InodeFlush; - - void trim_inodes(int max = -1); - - // ** buffer cache ** - BufferCache bc; - pthread_t flushd_thread_id; - - version_t trigger_commit(); - void commit_bc_wait(version_t epoch); - void trim_bc(off_t max = -1); - - public: - void kick_idle(); - void sync(); - void sync(Context *onsafe); - void trim_buffer_cache(); - - class IdleKicker : public BlockDevice::kicker { - Ebofs *ebo; - public: - IdleKicker(Ebofs *t) : ebo(t) {} - void kick() { ebo->kick_idle(); } - } idle_kicker; - - - protected: - //void zero(Onode *on, size_t len, off_t off, off_t write_thru); - void alloc_write(Onode *on, - block_t start, block_t len, - interval_set& alloc, - block_t& 
old_bfirst, block_t& old_blast); - void apply_write(Onode *on, off_t off, size_t len, const bufferlist& bl); - bool attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, - Cond *will_wait_on, bool *will_wait_on_bool); - - // ** finisher ** - // async write notification to users - Mutex finisher_lock; - Cond finisher_cond; - bool finisher_stop; - list finisher_queue; - -public: - void queue_finisher(Context *c) { - finisher_lock.Lock(); - finisher_queue.push_back(c); - finisher_cond.Signal(); - finisher_lock.Unlock(); - } - void queue_finishers(list& ls) { - finisher_lock.Lock(); - finisher_queue.splice(finisher_queue.end(), ls); - finisher_cond.Signal(); - finisher_lock.Unlock(); - } -protected: - - void *finisher_thread_entry(); - class FinisherThread : public Thread { - Ebofs *ebofs; - public: - FinisherThread(Ebofs *e) : ebofs(e) {} - void* entry() { return (void*)ebofs->finisher_thread_entry(); } - } finisher_thread; - - - void alloc_more_node_space(); - - void do_csetattrs(map > > &cmods); - void do_setattrs(Onode *on, map > &setattrs); - - - public: - Ebofs(char *devfn, char *jfn=0) : - fake_writes(false), - dev(devfn), - mounted(false), unmounting(false), dirty(false), readonly(false), - super_epoch(0), commit_thread_started(false), mid_commit(false), - commit_thread(this), - journalfn(jfn), journal(0), - free_blocks(0), limbo_blocks(0), - allocator(this), - nodepool(ebofs_lock), - object_tab(0), limbo_tab(0), collection_tab(0), co_tab(0), - onode_lru(g_conf.ebofs_oc_size), - cnode_lru(g_conf.ebofs_cc_size), - inodes_flushing(0), - bc(dev, ebofs_lock), - idle_kicker(this), - finisher_stop(false), finisher_thread(this) { - for (int i=0; i& ls); - - // object attr - int setattr(object_t oid, const char *name, const void *value, size_t size, Context *onsafe=0); - int setattrs(object_t oid, map& attrset, Context *onsafe=0); - int getattr(object_t oid, const char *name, void *value, size_t size); - int getattrs(object_t oid, map &aset); - int 
rmattr(object_t oid, const char *name, Context *onsafe=0); - int listattr(object_t oid, vector& attrs); - - int get_object_collections(object_t oid, set& ls); - - // collections - int list_collections(list& ls); - bool collection_exists(coll_t c); - - int create_collection(coll_t c, Context *onsafe); - int destroy_collection(coll_t c, Context *onsafe); - int collection_add(coll_t c, object_t o, Context *onsafe); - int collection_remove(coll_t c, object_t o, Context *onsafe); - - int collection_list(coll_t c, list& o); - - int collection_setattr(coll_t cid, const char *name, const void *value, size_t size, Context *onsafe); - int collection_setattrs(coll_t cid, map &aset); - int collection_getattr(coll_t cid, const char *name, void *value, size_t size); - int collection_getattrs(coll_t cid, map &aset); - int collection_rmattr(coll_t cid, const char *name, Context *onsafe); - int collection_listattr(coll_t oid, vector& attrs); - - // maps - int map_lookup(object_t o, bufferlist& key, bufferlist& val); - int map_insert(object_t o, bufferlist& key, bufferlist& val); - int map_remove(object_t o, bufferlist& key); - int map_list(object_t o, list& keys); - int map_list(object_t o, map& vals); - int map_list(object_t o, - bufferlist& start, bufferlist& end, - map& vals); - - // crap - void _fake_writes(bool b) { fake_writes = b; } - void _get_frag_stat(FragmentationStat& st); - - void _import_freelist(bufferlist& bl); - void _export_freelist(bufferlist& bl); - - -private: - // private interface -- use if caller already holds lock - unsigned _apply_transaction(Transaction& t); - - int _read(object_t oid, off_t off, size_t len, bufferlist& bl); - int _is_cached(object_t oid, off_t off, size_t len); - int _stat(object_t oid, struct stat *st); - int _getattr(object_t oid, const char *name, void *value, size_t size); - int _getattrs(object_t oid, map &aset); - int _get_object_collections(object_t oid, set& ls); - - bool _write_will_block(); - int _write(object_t oid, off_t off, 
size_t len, const bufferlist& bl); - void _trim_from_cache(object_t oid, off_t off, size_t len); - int _truncate(object_t oid, off_t size); - int _truncate_front(object_t oid, off_t size); - int _remove(object_t oid); - int _clone(object_t from, object_t to); - int _setattr(object_t oid, const char *name, const void *value, size_t size); - int _setattrs(object_t oid, map& attrset); - int _rmattr(object_t oid, const char *name); - bool _collection_exists(coll_t c); - int _create_collection(coll_t c); - int _destroy_collection(coll_t c); - int _collection_add(coll_t c, object_t o); - int _collection_remove(coll_t c, object_t o); - int _collection_getattrs(coll_t oid, map &aset); - int _collection_setattr(coll_t oid, const char *name, const void *value, size_t size); - int _collection_setattrs(coll_t oid, map &aset); - int _collection_rmattr(coll_t cid, const char *name); - - -}; diff --git a/branches/sage/crush/ebofs/FileJournal.h b/branches/sage/crush/ebofs/FileJournal.h deleted file mode 100644 index 7c9a67ccbd25f..0000000000000 --- a/branches/sage/crush/ebofs/FileJournal.h +++ /dev/null @@ -1,144 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_FILEJOURNAL_H -#define __EBOFS_FILEJOURNAL_H - - -#include "Journal.h" -#include "common/Cond.h" -#include "common/Mutex.h" -#include "common/Thread.h" - -class FileJournal : public Journal { -public: - /** log header - * we allow 3 pointers: - * top/initial, - * one for an epoch boundary, - * and one for a wrap in the ring buffer/journal file. - * the epoch boundary one is useful only for speedier recovery in certain cases - * (i.e. 
when ebofs committed, but the journal didn't rollover ... very small window!) - */ - struct header_t { - uint64_t fsid; - int num; - off_t wrap; - off_t max_size; - epoch_t epoch[3]; - off_t offset[3]; - - header_t() : fsid(0), num(0), wrap(0), max_size(0) {} - - void clear() { - num = 0; - wrap = 0; - } - void pop() { - if (num >= 2 && offset[0] > offset[1]) - wrap = 0; // we're eliminating a wrap - num--; - for (int i=0; i > writeq; // currently journaling - list commitq; // currently journaling - - // write thread - Mutex write_lock; - Cond write_cond; - bool write_stop; - - void print_header(); - void read_header(); - void write_header(); - void start_writer(); - void stop_writer(); - void write_thread_entry(); - - class Writer : public Thread { - FileJournal *journal; - public: - Writer(FileJournal *fj) : journal(fj) {} - void *entry() { - journal->write_thread_entry(); - return 0; - } - } write_thread; - - public: - FileJournal(Ebofs *e, char *f) : - Journal(e), fn(f), - full(false), - write_pos(0), queue_pos(0), read_pos(0), - fd(0), - write_stop(false), write_thread(this) { } - ~FileJournal() {} - - int create(); - int open(); - void close(); - - void make_writeable(); - - // writes - bool submit_entry(bufferlist& e, Context *oncommit); // submit an item - void commit_epoch_start(); // mark epoch boundary - void commit_epoch_finish(); // mark prior epoch as committed (we can expire) - - bool read_entry(bufferlist& bl, epoch_t& e); - - // reads -}; - -#endif diff --git a/branches/sage/crush/ebofs/Onode.h b/branches/sage/crush/ebofs/Onode.h deleted file mode 100644 index 1d79d317dd96a..0000000000000 --- a/branches/sage/crush/ebofs/Onode.h +++ /dev/null @@ -1,408 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU 
Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_ONODE_H -#define __EBOFS_ONODE_H - -#include "include/lru.h" - -#include "types.h" -#include "BufferCache.h" - -#include "include/interval_set.h" - - -/* - * object node (like an inode) - * - * holds object metadata, including - * size - * allocation (extent list) - * attributes - * - */ - -class Onode : public LRUObject { -private: - int ref; - -public: - object_t object_id; - version_t version; // incremented on each modify. - - // data - bool readonly; - Extent onode_loc; - off_t object_size; - unsigned object_blocks; - - // onode - set collections; - map attr; - //vector extents; - map extent_map; - - interval_set uncommitted; - - ObjectCache *oc; - - bool dirty; - bool dangling; // not in onode_map - bool deleted; // deleted - - list commit_waiters; - - public: - Onode(object_t oid) : ref(0), object_id(oid), version(0), - readonly(false), - object_size(0), object_blocks(0), - oc(0), - dirty(false), dangling(false), deleted(false) { - onode_loc.length = 0; - } - ~Onode() { - if (oc) delete oc; - } - - block_t get_onode_id() { return onode_loc.start; } - int get_onode_len() { return onode_loc.length; } - - int get_ref_count() { return ref; } - void get() { - if (ref == 0) lru_pin(); - ref++; - //cout << "ebofs.onode.get " << hex << object_id << dec << " " << ref << std::endl; - } - void put() { - ref--; - if (ref == 0) lru_unpin(); - //cout << "ebofs.onode.put " << hex << object_id << dec << " " << ref << std::endl; - } - - void mark_dirty() { - if (!dirty) { - dirty = true; - get(); - } - } - void mark_clean() { - if (dirty) { - dirty = false; - put(); - } - } - bool is_dirty() { return dirty; } - bool is_deleted() { return deleted; } - bool is_dangling() { return dangling; } - - - bool have_oc() { - return oc != 0; - } - ObjectCache *get_oc(BufferCache *bc) { - if (!oc) { - oc = new ObjectCache(object_id, this, bc); - 
oc->get(); - get(); - } - return oc; - } - void close_oc() { - if (oc) { - //cout << "close_oc on " << object_id << std::endl; - assert(oc->is_empty()); - if (oc->put() == 0){ - //cout << "************************* hosing oc" << std::endl; - delete oc; - } - oc = 0; - put(); - } - } - - - // allocation - void verify_extents() { - if (0) { // do crazy stupid sanity checking - block_t count = 0; - interval_set is; - - set s; - cout << "verifying" << std::endl; - - for (map::iterator p = extent_map.begin(); - p != extent_map.end(); - p++) { - cout << " " << p->first << ": " << p->second << std::endl; - assert(count == p->first); - count += p->second.length; - for (unsigned j=0;jsecond.length;j++) { - assert(s.count(p->second.start+j) == 0); - s.insert(p->second.start+j); - } - } - - assert(s.size() == count); - assert(count == object_blocks); - } - } - void set_extent(block_t offset, Extent ex) { - //cout << "set_extent " << offset << " -> " << ex << " ... " << object_blocks << std::endl; - assert(offset <= object_blocks); - verify_extents(); - - // at the end? - if (offset == object_blocks) { - //cout << " appending " << ex << std::endl; - if (!extent_map.empty() && extent_map.rbegin()->second.end() == ex.start) { - //cout << "appending " << ex << " to " << extent_map.rbegin()->second << std::endl; - extent_map.rbegin()->second.length += ex.length; - } else - extent_map[object_blocks] = ex; - object_blocks += ex.length; - return; - } - - // removing any extent bits we overwrite - if (!extent_map.empty()) { - // preceeding extent? 
- map::iterator p = extent_map.lower_bound(offset); - if (p != extent_map.begin()) { - p--; - if (p->first + p->second.length > offset) { - //cout << " preceeding was " << p->second << std::endl; - if (p->first + p->second.length > offset+ex.length) { - // cutting chunk out of middle, add last bit - Extent &n = extent_map[offset+ex.length] = p->second; - n.start += offset+ex.length - p->first; - n.length -= offset+ex.length - p->first; - //cout << " tail frag is " << n << std::endl; - } - p->second.length = offset - p->first; // cut tail off preceeding extent - //cout << " preceeding now " << p->second << std::endl; - } - p++; - } - - // overlapping extents - while (p != extent_map.end() && - p->first < offset + ex.length) { - map::iterator next = p; - next++; - - // completely subsumed? - if (p->first + p->second.length <= offset+ex.length) { - //cout << " erasing " << p->second << std::endl; - extent_map.erase(p); - p = next; - continue; - } - - // spans new extent, cut off head - Extent &n = extent_map[ offset+ex.length ] = p->second; - //cout << " cut head off " << p->second; - n.start += offset+ex.length - p->first; - n.length -= offset+ex.length - p->first; - extent_map.erase(p); - //cout << ", now " << n << std::endl; - break; - } - } - - extent_map[ offset ] = ex; - - // extend object? - if (offset + ex.length > object_blocks) - object_blocks = offset+ex.length; - - verify_extents(); - } - - - /* map_extents(start, len, ls) - * map teh given page range into extents on disk. - */ - int map_extents(block_t start, block_t len, vector& ls) { - //cout << "map_extents " << start << " " << len << std::endl; - verify_extents(); - - //assert(start+len <= object_blocks); - - map::iterator p; - - // hack hack speed up common cases! - if (start == 0) { - p = extent_map.begin(); - } else if (start+len == object_blocks && len == 1 && !extent_map.empty()) { - // append hack. 
- p = extent_map.end(); - p--; - if (p->first < start) p++; - //while (p->first >= start) p--; - //p++; - } else { - // normal - p = extent_map.lower_bound(start); - } - - if (p != extent_map.begin() && - (p == extent_map.end() || p->first > start && p->first)) { - p--; - if (p->second.length > start - p->first) { - Extent ex; - ex.start = p->second.start + (start - p->first); - ex.length = MIN(len, p->second.length - (start - p->first)); - ls.push_back(ex); - - //cout << " got (tail of?) " << p->second << " : " << ex << std::endl; - - start += ex.length; - len -= ex.length; - } - p++; - } - - while (len > 0 && - p != extent_map.end()) { - assert(p->first == start); - Extent ex = p->second; - ex.length = MIN(len, ex.length); - ls.push_back(ex); - //cout << " got (head of?) " << p->second << " : " << ex << std::endl; - start += ex.length; - len -= ex.length; - p++; - } - - return 0; - } - - int truncate_extents(block_t len, vector& extra) { - verify_extents(); - - map::iterator p = extent_map.lower_bound(len); - if (p != extent_map.begin() && - (p == extent_map.end() || p->first > len && p->first)) { - p--; - if (p->second.length > len - p->first) { - Extent ex; - ex.start = p->second.start + (len - p->first); - ex.length = p->second.length - (len - p->first); - extra.push_back(ex); - - p->second.length = len - p->first; - assert(p->second.length > 0); - - //cout << " got (tail of?) " << p->second << " : " << ex << std::endl; - } - p++; - } - - while (p != extent_map.end()) { - assert(p->first >= len); - extra.push_back(p->second); - map::iterator n = p; - n++; - extent_map.erase(p); - p = n; - } - - object_blocks = len; - verify_extents(); - return 0; - } - - int truncate_front_extents(block_t len, vector& extra) { - verify_extents(); - - while (len > 0) { - Extent& ex = extent_map.begin()->second; // look, this is a reference! 
- if (ex.length > len) { - // partial first extent - Extent frontbit( ex.start, len ); - extra.push_back(frontbit); - ex.length -= len; - ex.start += len; - break; - } - - // pull off entire first extent. - assert(ex.length <= len); - len -= ex.length; - extra.push_back(ex); - extent_map.erase(extent_map.begin()); - } - - object_blocks -= len; - verify_extents(); - return 0; - } - - - - /* map_alloc_regions(start, len, map) - * map range into regions that need to be (re)allocated on disk - * because they overlap "safe" (or unallocated) parts of the object - */ - /* - void map_alloc_regions(block_t start, block_t len, - interval_set& alloc) { - interval_set already_uncom; - - alloc.insert(start, len); // start with whole range - already_uncom.intersection_of(alloc, uncommitted); - alloc.subtract(already_uncom); // take out the bits that aren't yet committed - } - */ - - - - // pack/unpack - int get_collection_bytes() { - return sizeof(coll_t) * collections.size(); - } - int get_attr_bytes() { - int s = 0; - for (map::iterator i = attr.begin(); - i != attr.end(); - i++) { - s += i->first.length() + 1; - s += i->second.length() + sizeof(int); - } - return s; - } - int get_extent_bytes() { - return sizeof(Extent) * extent_map.size(); - } - -}; - - -inline ostream& operator<<(ostream& out, Onode& on) -{ - out << "onode(" << hex << on.object_id << dec << " len=" << on.object_size; - out << " ref=" << on.get_ref_count(); - if (on.is_dirty()) out << " dirty"; - if (on.is_dangling()) out << " dangling"; - if (on.is_deleted()) out << " deleted"; - out << " uncom=" << on.uncommitted; - // out << " " << &on; - out << ")"; - return out; -} - - - -#endif diff --git a/branches/sage/crush/ebofs/Table.h b/branches/sage/crush/ebofs/Table.h deleted file mode 100644 index 041a55afa0c68..0000000000000 --- a/branches/sage/crush/ebofs/Table.h +++ /dev/null @@ -1,928 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - 
scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_TABLE_H -#define __EBOFS_TABLE_H - -#include "types.h" -#include "nodes.h" - -/** table **/ - -#define dbtout if (25 <= g_conf.debug_ebofs) cout << "ebofs.table(" << this << ")." - - -template -class Table { - private: - NodePool &pool; - - nodeid_t root; - int nkeys; - int depth; - - public: - Table(NodePool &p, - struct ebofs_table& bts) : - pool(p), - root(bts.root), nkeys(bts.num_keys), depth(bts.depth) { - dbtout << "cons" << std::endl; - } - - nodeid_t get_root() { return root; } - int get_num_keys() { return nkeys; } - int get_depth() { return depth; } - - - /* - */ - class _IndexItem { // i just need a struct size for below - K k; - nodeid_t n; - }; - class IndexItem { - public: - K key; - nodeid_t node; - static const int MAX = Node::ITEM_LEN / (sizeof(_IndexItem)); - static const int MIN = MAX/2; - }; - class _LeafItem { // i just need a struct size for below - K k; - V v; - }; - class LeafItem { - public: - K key; - V value; - static const int MAX = Node::ITEM_LEN / (sizeof(_LeafItem)); - static const int MIN = MAX/2; - }; - - class Nodeptr { - public: - Node *node; - - Nodeptr() : node(0) {} - Nodeptr(Node *n) : node(n) {} - Nodeptr(NodePool& pool, nodeid_t nid) { - open(pool, nid); - } - Nodeptr& operator=(Node *n) { - node = n; - return *this; - } - - void open(NodePool& pool, nodeid_t nid) { - node = pool.get_node(nid); - if (is_index() && node->children.empty()) init_index(pool); - } - - LeafItem& leaf_item(int i) { return (( LeafItem*)(node->item_ptr()))[i]; } - IndexItem& index_item(int i) { return ((IndexItem*)(node->item_ptr()))[i]; } - K key(int i) { - if (node->is_index()) - return index_item(i).key; - else - return 
leaf_item(i).key; - } - - bool is_leaf() { return node->is_leaf(); } - bool is_index() { return node->is_index(); } - void set_type(int t) { node->set_type(t); } - - int max_items() const { - if (node->is_leaf()) - return LeafItem::MAX; - else - return IndexItem::MAX; - } - int min_items() const { return max_items() / 2; } - - nodeid_t get_id() { return node->get_id(); } - - int size() { return node->size(); } - void set_size(int s) { node->set_size(s); } - - void init_index(NodePool& nodepool) { - /* - node->children = vector(max_items()); - for (int i=0; ichildren[i] = nodepool.get_node(index_item(i).node); - else - node->children[i] = 0; - */ - } - - - void remove_at_pos(int p) { - if (node->is_index()) { - for (int i=p; ichildren[i] = node->children[i+1]; - } - } else { - for (int i=p; iis_index() ? "index":"leaf") << std::endl; - } - void insert_at_leaf_pos(int p, K key, V value) { - assert(is_leaf()); - for (int i=size(); i>p; i--) - leaf_item(i) = leaf_item(i-1); - leaf_item(p).key = key; - leaf_item(p).value = value; - set_size(size() + 1); - } - void insert_at_index_pos(int p, K key, nodeid_t nid) { - assert(is_index()); - for (int i=size(); i>p; i--) { - index_item(i) = index_item(i-1); - //node->children[i] = node->children[i-1]; - } - index_item(p).key = key; - index_item(p).node = nid; - set_size(size() + 1); - } - - void append_item(LeafItem& i) { - leaf_item(size()) = i; - set_size(size() + 1); - } - void append_item(IndexItem& i) { - index_item(size()) = i; - set_size(size() + 1); - } - - void split(Nodeptr& right) { - if (node->is_index()) { - for (int i=min_items(); iis_index()) - for (int i=0; i open; // open nodes - vector pos; // position within the node - //Nodeptr open[20]; - //int pos[20]; - int level; - - Cursor(Table *t) : table(t), open(t->depth), pos(t->depth), level(0) {} - - public: - - const LeafItem& current() { - assert(open[level].is_leaf()); - return open[level].leaf_item(pos[level]); - } - V& dirty_current_value() { - 
assert(open[level].is_leaf()); - dirty(); - return open[level].leaf_item(pos[level]).value; - } - - // ** read-only bits ** - int move_left() { - if (table->depth == 0) return OOB; - - // work up around branch - int l; - for (l = level; l >= 0; l--) - if (pos[l] > 0) break; - if (l < 0) - return OOB; // we are the first item in the btree - - // move left one - pos[l]--; - - // work back down right side - for (; lpool, open[l].index_item(pos[l]).node); - pos[l+1] = open[l+1].size() - 1; - } - return 1; - } - int move_right() { - if (table->depth == 0) return OOB; - - // work up branch - int l; - for (l=level; l>=0; l--) - if (pos[l] < open[l].size() - 1) break; - if (l < 0) { - /* we are at last item in btree. */ - if (pos[level] < open[level].size()) { - pos[level]++; /* move into add position! */ - return 0; - } - return -1; - } - - /* move right one */ - assert( pos[l] < open[l].size() ); - pos[l]++; - - /* work back down */ - for (; lpool, open[l].index_item(pos[l]).node ); - pos[l+1] = 0; // furthest left - } - return 1; - } - - // ** modifications ** - void dirty() { - for (int l=level; l>=0; l--) { - if (open[l].node->is_dirty()) break; // already dirty! (and thus parents are too) - - table->pool.dirty_node(open[l].node); - if (l > 0) - open[l-1].index_item( pos[l-1] ).node = open[l].get_id(); - else - table->root = open[0].get_id(); - } - } - private: - void repair_parents() { - // did i make a change at the start of a node? - if (pos[level] == 0) { - K key = open[level].key(0); // new key parents should have - for (int j=level-1; j>=0; j--) { - if (open[j].index_item(pos[j]).key == key) - break; /* it's the same key, we can stop fixing */ - open[j].index_item(pos[j]).key = key; - if (pos[j] > 0) break; /* last in position 0.. */ - } - } - } - - public: - void remove() { - dirty(); - - // remove from node - open[level].remove_at_pos( pos[level] ); - repair_parents(); - - // was it a key? 
- if (level == table->depth-1) - table->nkeys--; - } - - void insert(K key, V value) { - dirty(); - - // insert - open[level].insert_at_leaf_pos(pos[level], key, value); - repair_parents(); - - // was it a key? - if (level == table->depth-1) - table->nkeys++; - } - - int rotate_left() { - if (level == 0) return -1; // i am root - if (pos[level-1] == 0) return -1; // nothing to left - - Nodeptr here = open[level]; - Nodeptr parent = open[level-1]; - Nodeptr left(table->pool, parent.index_item(pos[level-1] - 1).node ); - if (left.size() == left.max_items()) return -1; // it's full - - // make both dirty - dirty(); - if (!left.node->is_dirty()) { - table->pool.dirty_node(left.node); - parent.index_item(pos[level-1]-1).node = left.get_id(); - } - - dbtout << "rotating item " << here.key(0) << " left from " << here.get_id() << " to " << left.get_id() << std::endl; - - /* add */ - if (here.node->is_leaf()) - left.append_item(here.leaf_item(0)); - else - left.append_item(here.index_item(0)); - - /* remove */ - here.remove_at_pos(0); - - /* fix parent index for me */ - parent.index_item( pos[level-1] ).key = here.key(0); - // we never have to update past immediate parent, since we're not at pos 0 - - /* adjust cursor */ - if (pos[level] > 0) - pos[level]--; - //else - //assert(1); /* if we were positioned here, we're equal */ - /* if it was 0, then the shifted item == our key, and we can stay here safely. 
*/ - return 0; - } - int rotate_right() { - if (level == 0) return -1; // i am root - if (pos[level-1] + 1 >= open[level-1].size()) return -1; // nothing to right - - Nodeptr here = open[level]; - Nodeptr parent = open[level-1]; - Nodeptr right(table->pool, parent.index_item( pos[level-1] + 1 ).node ); - if (right.size() == right.max_items()) return -1; // it's full - - // make both dirty - dirty(); - if (!right.node->is_dirty()) { - table->pool.dirty_node(right.node); - parent.index_item( pos[level-1]+1 ).node = right.get_id(); - } - - if (pos[level] == here.size()) { - /* let's just move the cursor over! */ - //if (sizeof(K) == 8) - dbtout << "shifting cursor right from " << here.get_id() << " to less-full node " << right.get_id() << std::endl; - open[level] = right; - pos[level] = 0; - pos[level-1]++; - return 0; - } - - //if (sizeof(K) == 8) - dbtout << "rotating item " << hex << here.key(here.size()-1) << dec << " right from " - << here.get_id() << " to " << right.get_id() << std::endl; - - /* add */ - if (here.is_index()) - right.insert_at_index_pos(0, - here.index_item( here.size()-1 ).key, - here.index_item( here.size()-1 ).node); - else - right.insert_at_leaf_pos(0, - here.leaf_item( here.size()-1 ).key, - here.leaf_item( here.size()-1 ).value); - - /* remove */ - here.set_size(here.size() - 1); - - /* fix parent index for right */ - parent.index_item( pos[level-1] + 1 ).key = right.key(0); - - return 0; - } - }; - - - public: - bool almost_full() { - if (2*(depth+1) > pool.get_num_free()) // worst case, plus some. - return true; - return false; - } - - int find(K key, Cursor& cursor) { - dbtout << "find " << key << std::endl; - verify("find"); - - if (depth == 0) - return Cursor::OOB; - - // init - cursor.level = 0; - - // start at root - Nodeptr curnode(pool, root); - cursor.open[0] = curnode; - - if (curnode.size() == 0) return -1; // empty! 
- - // find leaf - for (cursor.level = 0; cursor.level < depth-1; cursor.level++) { - /* if key=5, we want 2 3 [4] 6 7, or 3 4 [5] 5 6 (err to the left) */ - int left = 0; /* i >= left */ - int right = curnode.size()-1; /* i < right */ - while (left < right) { - int i = left + (right - left) / 2; - if (curnode.index_item(i).key < key) { - left = i + 1; - } else if (i && curnode.index_item(i-1).key >= key) { - right = i; - } else { - left = right = i; - break; - } - } - int i = left; - if (i && curnode.index_item(i).key > key) i--; - -#ifdef EBOFS_DEBUG_BTREE - int j; - for (j=0; j key) break; - } - if (i != j) { - dbtout << "btree binary search failed" << std::endl; - i = j; - } -#endif - - cursor.pos[cursor.level] = i; - - /* get child node */ - curnode.open(pool, cursor.open[cursor.level].index_item(i).node ); - cursor.open[cursor.level+1] = curnode; - } - - /* search leaf */ - /* if key=5, we want 2 3 4 [6] 7, or 3 4 [5] 5 6 (err to the right) */ - int left = 0; /* i >= left */ - int right = curnode.size(); /* i < right */ - while (left < right) { - int i = left + (right - left) / 2; - if (curnode.leaf_item(i).key < key) { - left = i + 1; - } else if (i && curnode.leaf_item(i-1).key >= key) { - right = i; - } else { - left = right = i; - break; - } - } - int i = left; - -#ifdef EBOFS_DEBUG_BTREE - int j; - for (j=0; j= key) break; - } - if (i != j) { - dbtout << "btree binary search failed" << std::endl; - i = j; - } -#endif - - cursor.pos[cursor.level] = i; /* first key in this node, or key insertion point */ - - if (curnode.size() >= i+1) { - if (curnode.leaf_item(i).key == key) { - return Cursor::MATCH; /* it's the actual key */ - } else { - return Cursor::INSERT; /* it's an insertion point */ - } - } - return Cursor::OOB; /* it's the end of the btree (also a valid insertion point) */ - } - - int lookup(K key) { - dbtout << "lookup" << std::endl; - Cursor cursor(this); - if (find(key, cursor) == Cursor::MATCH) - return 0; - return -1; - } - - int lookup(K 
key, V& value) { - dbtout << "lookup" << std::endl; - Cursor cursor(this); - if (find(key, cursor) == Cursor::MATCH) { - value = cursor.current().value; - return 0; - } - return -1; - } - - int insert(K key, V value) { - verify("pre-insert"); - dbtout << "insert " << key << " -> " << value << std::endl; - if (almost_full()) return -1; - - // empty? - if (nkeys == 0) { - if (root == -1) { - // create a root node (leaf!) - assert(depth == 0); - Nodeptr newroot( pool.new_node(Node::TYPE_LEAF) ); - root = newroot.get_id(); - depth++; - } - assert(depth == 1); - assert(root >= 0); - } - - // start at/near key - Cursor cursor(this); - find(key, cursor); - - // insert loop - nodeid_t nodevalue = 0; - while (1) { - - /* room in this node? */ - if (cursor.open[cursor.level].size() < cursor.open[cursor.level].max_items()) { - if (cursor.open[cursor.level].is_leaf()) - cursor.insert( key, value ); // will dirty, etc. - else { - // indices are already dirty - cursor.open[cursor.level].insert_at_index_pos(cursor.pos[cursor.level], key, nodevalue); - } - verify("insert 1"); - return 0; - } - - /* this node is full. */ - assert( cursor.open[cursor.level].size() == cursor.open[cursor.level].max_items() ); - - /* can we rotate? */ - if (false) // NO! there's a bug in here somewhere, don't to it. - if (cursor.level > 0) { - if ((cursor.pos[cursor.level-1] > 0 - && cursor.rotate_left() >= 0) || - (cursor.pos[cursor.level-1] + 1 < cursor.open[cursor.level-1].size() - && cursor.rotate_right() >= 0)) { - - if (cursor.open[cursor.level].is_leaf()) - cursor.insert( key, value ); // will dirty, etc. 
- else { - // indices are already dirty - cursor.open[cursor.level].insert_at_index_pos(cursor.pos[cursor.level], key, nodevalue); - } - verify("insert 2"); - return 0; - } - } - - /** split node **/ - - if (cursor.level == depth-1) { - dbtout << "splitting leaf " << cursor.open[cursor.level].get_id() << std::endl; - } else { - dbtout << "splitting index " << cursor.open[cursor.level].get_id() << std::endl; - } - - cursor.dirty(); - - // split - Nodeptr leftnode = cursor.open[cursor.level]; - Nodeptr newnode( pool.new_node(leftnode.node->get_type()) ); - leftnode.split( newnode ); - - /* insert our item */ - if (cursor.pos[cursor.level] > leftnode.size()) { - // not with cursor, since this node isn't added yet! - if (newnode.is_leaf()) { - newnode.insert_at_leaf_pos( cursor.pos[cursor.level] - leftnode.size(), - key, value ); - nkeys++; - } else { - newnode.insert_at_index_pos( cursor.pos[cursor.level] - leftnode.size(), - key, nodevalue ); - } - } else { - // with cursor (if leaf) - if (leftnode.is_leaf()) - cursor.insert( key, value ); - else - leftnode.insert_at_index_pos( cursor.pos[cursor.level], - key, nodevalue ); - } - - /* are we at the root? */ - if (cursor.level == 0) { - /* split root. */ - dbtout << "that split was the root " << root << std::endl; - Nodeptr newroot( pool.new_node(Node::TYPE_INDEX) ); - - /* new root node */ - newroot.set_size(2); - newroot.index_item(0).key = leftnode.key(0); - newroot.index_item(0).node = root; - newroot.index_item(1).key = newnode.key(0); - newroot.index_item(1).node = newnode.get_id(); - - /* heighten tree */ - depth++; - root = newroot.get_id(); - verify("insert 3"); - return 0; - } - - /* now insert newindex in level-1 */ - nodevalue = newnode.get_id(); - key = newnode.key(0); - cursor.level--; - cursor.pos[cursor.level]++; // ...to the right of leftnode! 
- } - } - - - int remove(K key) { - verify("pre-remove"); - dbtout << "remove " << key << std::endl; - - if (almost_full()) { - cout << "table almost full, failing" << std::endl; - assert(0); - return -1; - } - - Cursor cursor(this); - if (find(key, cursor) <= 0) { - cerr << "remove " << key << " 0x" << hex << key << dec << " .. dne" << std::endl; - g_conf.debug_ebofs = 33; - g_conf.ebofs_verify = true; - verify("remove dne"); - assert(0); - return -1; // key dne - } - - - while (1) { - cursor.remove(); - verify("post-remove"); - - // balance + adjust - - if (cursor.level == 0) { - // useless root index? - if (cursor.open[0].size() == 1 && - depth > 1) { - depth--; - root = cursor.open[0].index_item(0).node; - pool.release( cursor.open[0].node ); - } - - // note: root can be small, but not empty - else if (nkeys == 0) { - assert(cursor.open[cursor.level].size() == 0); - assert(depth == 1); - root = -1; - depth = 0; - if (cursor.open[0].node) - pool.release(cursor.open[0].node); - } - verify("remove 1"); - return 0; - } - - if (cursor.open[cursor.level].size() > cursor.open[cursor.level].min_items()) { - verify("remove 2"); - return 0; - } - - // borrow from siblings? - Nodeptr left; - Nodeptr right; - - // left? 
- if (cursor.pos[cursor.level-1] > 0) { - int left_loc = cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1] - 1).node; - left.open(pool, left_loc); - - if (left.size() > left.min_items()) { - /* move cursor left, shift right */ - cursor.pos[cursor.level] = 0; - cursor.open[cursor.level] = left; - cursor.pos[cursor.level-1]--; - cursor.rotate_right(); - verify("remove 3"); - return 0; - } - - /* combine to left */ - right = cursor.open[cursor.level]; - } - else { - assert(cursor.pos[cursor.level-1] < cursor.open[cursor.level-1].size() - 1); - int right_loc = cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1] + 1 ).node; - right.open(pool, right_loc ); - - if (right.size() > right.min_items()) { - /* move cursor right, shift an item left */ - cursor.pos[cursor.level] = 1; - cursor.open[cursor.level] = right; - cursor.pos[cursor.level-1]++; - cursor.rotate_left(); - verify("remove 4"); - return 0; - } - - /* combine to left */ - left = cursor.open[cursor.level]; - cursor.pos[cursor.level-1]++; /* move cursor to (soon-to-be-empty) right side item */ - } - - // note: cursor now points to _right_ node. - - /* combine (towards left) - * (this makes it so our next delete will be in the index - * interior, which is less scary.) - */ - dbtout << "combining nodes " << left.get_id() << " and " << right.get_id() << std::endl; - - left.merge(right); - - // dirty left + right - cursor.dirty(); // right - if (!left.node->is_dirty()) { - pool.dirty_node(left.node); - cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1]-1 ).node = left.get_id(); - } - - pool.release(right.node); - - cursor.level--; // now point to the link to the obsolete (right-side) sib */ - } - - } - - void clear(Cursor& cursor, int node_loc, int level) { - dbtout << "clear" << std::endl; - - Nodeptr node(pool, node_loc); - cursor.open[level] = node; - - // hose children? 
- if (level < depth-1) { - for (int i=0; i max) - max = node.key(i); - - if (level < depth-1) { - // index - cursor.pos[level] = i; - err += verify_sub( cursor, cursor.open[level].index_item(i).node, level+1, count, last, on ); - } else { - // leaf - count++; - last = node.key(i); - } - } - - if (level) { - // verify that parent's keys are appropriate - if (min != cursor.open[level-1].index_item(cursor.pos[level-1]).key) { - dbtout << ":: key in index node " << cursor.open[level-1].get_id() - << " != min in child " << node_loc - << "(key is " << hex << cursor.open[level-1].index_item(cursor.pos[level-1]).key - << ", min is " << min << ")" << dec << std::endl; - err++; - } - if (cursor.pos[level-1] < cursor.open[level-1].size()-1) { - if (max > cursor.open[level-1].index_item(1+cursor.pos[level-1]).key) { - dbtout << ":: next key in index node " << cursor.open[level-1].get_id() - << " < max in child " << node_loc - << "(key is " << hex << cursor.open[level-1].index_item(1+cursor.pos[level-1]).key - << ", max is " << max << ")" << dec << std::endl; - err++; - } - } - } - - if (err == 0) return err; - - // print it - char s[1000]; - strcpy(s," "); - s[level+1] = 0; - if (1) { - if (root == node_loc) { - dbtout << s << "root " << node_loc << ": " - << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << std::endl; - } else if (level == depth-1) { - dbtout << s << "leaf " << node_loc << ": " - << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << std::endl; - } else { - dbtout << s << "indx " << node_loc << ": " - << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << std::endl; - } - - if (1) { - for (int i=0; i " << node.leaf_item(i).value << dec << std::endl; - } - } - } - } - - return err; - } - - void verify(const char *on) { - if (!g_conf.ebofs_verify) - return; - - if (root == -1 && depth == 0) { - return; // empty! 
- } - - int count = 0; - Cursor cursor(this); - K last; - - int before = g_conf.debug_ebofs; - g_conf.debug_ebofs = 0; - - int err = verify_sub(cursor, root, 0, count, last, on); - if (count != nkeys) { - cerr << "** count " << count << " != nkeys " << nkeys << std::endl; - err++; - } - - g_conf.debug_ebofs = before; - - // ok? - if (err) { - cerr << "verify failure, called by '" << on << "'" << std::endl; - g_conf.debug_ebofs = 30; - // do it again, so we definitely get the dump. - int count = 0; - Cursor cursor(this); - K last; - verify_sub(cursor, root, 0, count, last, on); - assert(err == 0); - } - } - -}; - - -#endif diff --git a/branches/sage/crush/ebofs/nodes.h b/branches/sage/crush/ebofs/nodes.h deleted file mode 100644 index 60fb5d3640441..0000000000000 --- a/branches/sage/crush/ebofs/nodes.h +++ /dev/null @@ -1,568 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_NODES_H -#define __EBOFS_NODES_H - -/** nodes, node regions **/ - -#include "types.h" -#include "BlockDevice.h" -#include "include/xlist.h" -#include "include/bitmapper.h" - -/* - - disk wire memory - - free free -> free can alloc - free used -> dirty can modify - - free used used -> clean - free used free -> limbo - - used used -> clean - used free -> limbo - - - // meaningless - used free free -> free can alloc - used free used __DNE__ - - -*/ - -#undef debofs -#define debofs(x) if (x < g_conf.debug_ebofs) cout << "ebofs.nodepool." 
- - -class Node { - public: - // bit fields - static const int STATE_CLEAN = 1; - static const int STATE_DIRTY = 2; - - static const int ITEM_LEN = EBOFS_NODE_BYTES - sizeof(int) - sizeof(int) - sizeof(int); - - static const int TYPE_INDEX = 1; - static const int TYPE_LEAF = 2; - - protected: - nodeid_t id; - int pos_in_bitmap; // position in bitmap - int state; // use bit fields above! - - bufferptr bptr; - - // in disk buffer - int *type; - int *nrecs; - - public: - xlist::item xlist; // dirty - - vector children; - - Node(nodeid_t i, int pib, bufferptr& b, int s) : - id(i), pos_in_bitmap(pib), - state(s), bptr(b), xlist(this) { - setup_pointers(); - } - - void setup_pointers() { - nrecs = (int*)(bptr.c_str()); - type = (int*)(bptr.c_str() + sizeof(*nrecs)); - } - - bool do_cow() { - if (bptr.do_cow()) { - setup_pointers(); - return true; - } - return false; - } - - - // id - nodeid_t get_id() const { return id; } - void set_id(nodeid_t n) { id = n; } - int get_pos_in_bitmap() const { return pos_in_bitmap; } - void set_pos_in_bitmap(int i) { pos_in_bitmap = i; } - - // buffer - bufferptr& get_buffer() { return bptr; } - - char *item_ptr() { return bptr.c_str() + sizeof(*nrecs) + sizeof(*type); } - - // size - int size() { return *nrecs; } - void set_size(int s) { *nrecs = s; } - - // type - int& get_type() { return *type; } - void set_type(int t) { *type = t; } - bool is_index() { return *type == TYPE_INDEX; } - bool is_leaf() { return *type == TYPE_LEAF; } - - - // state - bool is_dirty() { return state == STATE_DIRTY; } - bool is_clean() { return state == STATE_CLEAN; } - - void set_state(int s) { state = s; } - -}; - - - - - -class NodePool { - protected: - hash_map > node_map; // open node map - - public: - vector region_loc; // region locations - Extent usemap_even; - Extent usemap_odd; - - buffer::ptr usemap_data; - bitmapper usemap_bits; - - protected: - // on-disk block states - int num_nodes; - int num_dirty; - int num_clean; - int num_free; - int 
num_limbo; - - xlist dirty_ls; - interval_set free; - interval_set limbo; - - Mutex &ebofs_lock; - Cond commit_cond; - int flushing; - - nodeid_t make_nodeid(int region, int offset) { - return region_loc[region].start + (block_t)offset; - } - int nodeid_pos_in_bitmap(nodeid_t nid) { - unsigned region; - int num = 0; - for (region = 0; - (block_t)nid < region_loc[region].start || (block_t)nid > region_loc[region].end(); - region++) { - //generic_dout(-20) << "node " << nid << " not in " << region << " " << region_loc[region] << dendl; - num += region_loc[region].length; - } - num += nid - region_loc[region].start; - //generic_dout(-20) << "node " << nid << " is in " << region << ", overall bitmap pos is " << num << dendl; - return num; - } - - - public: - NodePool(Mutex &el) : - num_nodes(0), - num_dirty(0), num_clean(0), num_free(0), num_limbo(0), - ebofs_lock(el), - flushing(0) {} - ~NodePool() { - // nodes - release_all(); - } - - int get_num_free() { return num_free; } - int get_num_dirty() { return num_dirty; } - int get_num_limbo() { return num_limbo; } - int get_num_clean() { return num_clean; } - int get_num_total() { return num_nodes; } - int get_num_used() { return num_clean + num_dirty; } - - int get_usemap_len(int n=0) { - if (n == 0) n = num_nodes; - return ((n-1) / 8 / EBOFS_BLOCK_SIZE) + 1; - } - - unsigned num_regions() { return region_loc.size(); } - - // the caller had better adjust usemap locations... 
- void add_region(Extent ex) { - assert(region_loc.size() < EBOFS_MAX_NODE_REGIONS); - region_loc.push_back(ex); - free.insert(ex.start, ex.length); - num_free += ex.length; - num_nodes += ex.length; - } - - void init_usemap() { - usemap_data = buffer::create_page_aligned(EBOFS_BLOCK_SIZE*usemap_even.length); - usemap_data.zero(); - usemap_bits.set_data(usemap_data.c_str(), usemap_data.length()); - } - - void expand_usemap() { - block_t have = usemap_data.length() / EBOFS_BLOCK_SIZE; - if (have < usemap_even.length) { - // use bufferlist to copy/merge two chunks - bufferlist bl; - bl.push_back(usemap_data); - bufferptr newbit = buffer::create_page_aligned(EBOFS_BLOCK_SIZE*(usemap_even.length - have)); - newbit.zero(); - bl.push_back(newbit); - assert(bl.buffers().size() == 1); - usemap_data = bl.buffers().front(); - usemap_bits.set_data(usemap_data.c_str(), usemap_data.length()); - } - } - - - - int init(struct ebofs_nodepool *np) { - // regions - assert(region_loc.empty()); - num_nodes = 0; - for (int i=0; inum_regions; i++) { - debofs(3) << "init region " << i << " at " << np->region_loc[i] << std::endl; - region_loc.push_back( np->region_loc[i] ); - num_nodes += np->region_loc[i].length; - } - - // usemap - usemap_even = np->node_usemap_even; - usemap_odd = np->node_usemap_odd; - debofs(3) << "init even map at " << usemap_even << std::endl; - debofs(3) << "init odd map at " << usemap_odd << std::endl; - - init_usemap(); - return 0; - } - - void close() { - release_all(); - - region_loc.clear(); - - num_free = 0; - num_dirty = 0; - num_clean = 0; - num_limbo = 0; - dirty_ls.clear(); - - free.clear(); - limbo.clear(); - - flushing = 0; - node_map.clear(); - } - - - // *** blocking i/o routines *** - - int read_usemap_and_clean_nodes(BlockDevice& dev, version_t epoch) { - // read map - Extent loc; - if (epoch & 1) - loc = usemap_odd; - else - loc = usemap_even; - - // usemap - dev.read(loc.start, loc.length, usemap_data); - - // nodes - unsigned region = 0; - 
unsigned region_pos = 0; - for (int i=0; iflushed_usemap(); - } - }; - - void flushed_usemap() { - ebofs_lock.Lock(); - flushing--; - if (flushing == 0) - commit_cond.Signal(); - ebofs_lock.Unlock(); - } - - public: - int write_usemap(BlockDevice& dev, version_t version) { - // alloc - Extent loc; - if (version & 1) - loc = usemap_odd; - else - loc = usemap_even; - - // write - bufferlist bl; - bufferptr bp = usemap_data.clone(); - bl.append(bp); - dev.write(loc.start, loc.length, bl, - new C_NP_FlushUsemap(this), "usemap"); - return 0; - } - - - - // *** node commit *** - private: - - class C_NP_FlushNode : public BlockDevice::callback { - NodePool *pool; - nodeid_t nid; - public: - C_NP_FlushNode(NodePool *p, nodeid_t n) : - pool(p), nid(n) {} - void finish(ioh_t ioh, int r) { - pool->flushed_node(nid); - } - }; - - void flushed_node(nodeid_t nid) { - ebofs_lock.Lock(); - flushing--; - if (flushing == 0) - commit_cond.Signal(); - ebofs_lock.Unlock(); - } - - public: - void commit_start(BlockDevice& dev, version_t version) { - debofs(20) << "ebofs.nodepool.commit_start start dirty=" << dirty_ls.size() << std::endl; - - assert(flushing == 0); - /*if (0) - for (unsigned i=0; i clean (write to disk) - while (!dirty_ls.empty()) { - Node *n = dirty_ls.front(); - assert(n); - assert(n->is_dirty()); - n->set_state(Node::STATE_CLEAN); - dirty_ls.remove(&n->xlist); - num_dirty--; - num_clean++; - - debofs(20) << "ebofs.nodepool.commit_start writing node " << n->get_id() << std::endl; - - bufferlist bl; - if (0) { - bufferptr bp = n->get_buffer().clone(); // dup it now - bl.append(bp); - } else { - bl.append(n->get_buffer()); - } - dev.write(n->get_id(), EBOFS_NODE_BLOCKS, - bl, - new C_NP_FlushNode(this, n->get_id()), "node"); - flushing++; - } - - // limbo -> free - for (map::iterator i = limbo.m.begin(); - i != limbo.m.end(); - i++) { - num_free += i->second; - num_limbo -= i->second; - free.insert(i->first, i->second); - } - limbo.clear(); - - debofs(20) << 
"ebofs.nodepool.commit_start finish" << std::endl; - } - - void commit_wait() { - while (flushing > 0) - commit_cond.Wait(ebofs_lock); - debofs(20) << "ebofs.nodepool.commit_wait finish" << std::endl; - } - - - - - - - - - - // *** nodes *** - // opened node - Node* get_node(nodeid_t nid) { - //dbtout << "pool.get " << nid << std::endl; - assert(node_map.count(nid)); - return node_map[nid]; - } - - // allocate id/block on disk. always free -> dirty. - nodeid_t alloc_id() { - // pick node id - assert(!free.empty()); - nodeid_t nid = free.start(); - free.erase(nid); - num_free--; - return nid; - } - - // new node - Node* new_node(int type) { - nodeid_t nid = alloc_id(); - debofs(15) << "ebofs.nodepool.new_node " << nid << std::endl; - - // alloc node - bufferptr bp = buffer::create_page_aligned(EBOFS_NODE_BYTES); - bp.zero(); - Node *n = new Node(nid, nodeid_pos_in_bitmap(nid), bp, Node::STATE_DIRTY); - n->set_type(type); - n->set_size(0); - - usemap_bits.set(n->get_pos_in_bitmap()); - - n->set_state(Node::STATE_DIRTY); - dirty_ls.push_back(&n->xlist); - num_dirty++; - - assert(node_map.count(nid) == 0); - node_map[nid] = n; - - return n; - } - - void release(Node *n) { - const nodeid_t nid = n->get_id(); - debofs(15) << "ebofs.nodepool.release on " << nid << std::endl; - node_map.erase(nid); - - if (n->is_dirty()) { - dirty_ls.remove(&n->xlist); - num_dirty--; - free.insert(nid); - num_free++; - usemap_bits.clear(n->get_pos_in_bitmap()); - } else if (n->is_clean()) { - limbo.insert(nid); - num_limbo++; - num_clean--; - usemap_bits.clear(n->get_pos_in_bitmap()); - } - - delete n; - assert(num_clean + num_dirty + num_limbo + num_free == num_nodes); - } - - void release_all() { - while (!node_map.empty()) { - hash_map >::iterator i = node_map.begin(); - debofs(2) << "ebofs.nodepool.release_all leftover " << i->first << " " << i->second << std::endl; - release( i->second ); - } - assert(node_map.empty()); - } - - void dirty_node(Node *n) { - // get new node id? 
- nodeid_t oldid = n->get_id(); - nodeid_t newid = alloc_id(); - debofs(15) << "ebofs.nodepool.dirty_node on " << oldid << " now " << newid << std::endl; - - // dup data? - // this only does a memcpy if there are multiple references.. - // i.e. if we are still writing the old data - if (n->do_cow()) { - //assert(0); //i'm duping on write - debofs(15) << "ebofs.nodepool.dirty_node did cow on " << oldid << " now " << newid << std::endl; - //cerr << "ebofs.nodepool.dirty_node did cow on " << oldid << " now " << newid << std::endl; - } - - // release old block - assert(n->is_clean()); - num_clean--; - limbo.insert(oldid); - num_limbo++; - usemap_bits.clear(n->get_pos_in_bitmap()); - - // rename node - node_map.erase(oldid); - n->set_id(newid); - n->set_pos_in_bitmap(nodeid_pos_in_bitmap(newid)); - node_map[newid] = n; - - // new block - n->set_state(Node::STATE_DIRTY); - dirty_ls.push_back(&n->xlist); - debofs(15) << "ebofs.nodepool.dirty_node added to dirty list, len now " << dirty_ls.size() << std::endl; - num_dirty++; - usemap_bits.set(n->get_pos_in_bitmap()); - - assert(num_clean + num_dirty + num_limbo + num_free == num_nodes); - } - - -}; - -#endif diff --git a/branches/sage/crush/ebofs/test.ebofs.cc b/branches/sage/crush/ebofs/test.ebofs.cc deleted file mode 100644 index 9a8913a52d80d..0000000000000 --- a/branches/sage/crush/ebofs/test.ebofs.cc +++ /dev/null @@ -1,226 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include -#include "ebofs/Ebofs.h" - -bool stop = false; - - -int nt = 0; -class Tester : public Thread { - Ebofs &fs; - int t; - - char b[1024*1024]; - -public: - Tester(Ebofs &e) : fs(e), t(nt) { nt++; } - void *entry() { - - while (!stop) { - object_t oid; - oid.ino = (rand() % 10) + 0x10000000; - coll_t cid = rand() % 50; - off_t off = rand() % 10000;//0;//rand() % 1000000; - off_t len = 1+rand() % 100000; - char *a = "one"; - if (rand() % 2) a = "two"; - int l = 3;//rand() % 10; - - switch (rand() % 10) { - case 0: - { - oid.rev = rand() % 10; - cout << t << " read " << hex << oid << dec << " at " << off << " len " << len << std::endl; - bufferlist bl; - fs.read(oid, off, len, bl); - int l = MIN(len,bl.length()); - if (l) { - cout << t << " got " << l << std::endl; - bl.copy(0, l, b); - char *p = b; - while (l--) { - assert(*p == 0 || - *p == (char)(off ^ oid.ino)); - off++; - p++; - } - } - } - break; - - case 1: - { - cout << t << " write " << hex << oid << dec << " at " << off << " len " << len << std::endl; - for (int j=0;j args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - // args - if (args.size() != 3) return -1; - char *filename = args[0]; - int seconds = atoi(args[1]); - int threads = atoi(args[2]); - if (!threads) threads = 1; - - cout << "dev " << filename << " .. " << threads << " threads .. " << seconds << " seconds" << std::endl; - - Ebofs fs(filename); - if (fs.mount() < 0) return -1; - - - // explicit tests - if (0) { - // verify that clone() plays nice with partial writes - object_t oid(1,1); - bufferptr bp(10000); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - fs.write(oid, 0, 10000, bl, 0); - - fs.sync(); - fs.trim_buffer_cache(); - - // induce a partial write - bufferlist bl2; - bl2.substr_of(bl, 0, 100); - fs.write(oid, 100, 100, bl2, 0); - - // clone it - object_t oid2; - oid2 = oid; - oid2.rev = 1; - fs.clone(oid, oid2, 0); - - // ... 
- if (0) { - // make sure partial still behaves after orig is removed... - fs.remove(oid, 0); - - // or i read for oid2... - bufferlist rbl; - fs.read(oid2, 0, 200, rbl); - } - if (1) { - // make sure things behave if we remove the clone - fs.remove(oid2,0); - } - } - // /explicit tests - - list ls; - for (int i=0; icreate(); - ls.push_back(t); - } - - utime_t now = g_clock.now(); - utime_t dur(seconds,0); - utime_t end = now + dur; - cout << "stop at " << end << std::endl; - while (now < end) { - sleep(1); - now = g_clock.now(); - cout << now << std::endl; - } - - cout << "stopping" << std::endl; - stop = true; - - while (!ls.empty()) { - Tester *t = ls.front(); - ls.pop_front(); - t->join(); - delete t; - } - - fs.umount(); - return 0; -} - diff --git a/branches/sage/crush/ebofs/types.h b/branches/sage/crush/ebofs/types.h deleted file mode 100644 index 749ebddb3ccec..0000000000000 --- a/branches/sage/crush/ebofs/types.h +++ /dev/null @@ -1,171 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_TYPES_H -#define __EBOFS_TYPES_H - -#include -#include "include/buffer.h" -#include "include/Context.h" -#include "common/Cond.h" - -#include -#include -#include -#include -using namespace std; -using namespace __gnu_cxx; - - -#include "include/object.h" - - -#ifndef MIN -# define MIN(a,b) ((a)<=(b) ? (a):(b)) -#endif -#ifndef MAX -# define MAX(a,b) ((a)>=(b) ? 
(a):(b)) -#endif - - -/* -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(unsigned long long __x) const { - static hash H; - return H((__x >> 32) ^ (__x & 0xffffffff)); - } - }; - - template<> struct hash< std::string > - { - size_t operator()( const std::string& x ) const - { - static hash H; - return H(x.c_str()); - } - }; -} -*/ - - -// disk -typedef uint64_t block_t; // disk location/sector/block - -static const int EBOFS_BLOCK_SIZE = 4096; -static const int EBOFS_BLOCK_BITS = 12; // 1<<12 == 4096 - -class Extent { - public: - block_t start, length; - - Extent() : start(0), length(0) {} - Extent(block_t s, block_t l) : start(s), length(l) {} - - block_t last() const { return start + length - 1; } - block_t end() const { return start + length; } -}; - -inline ostream& operator<<(ostream& out, Extent& ex) -{ - return out << ex.start << "~" << ex.length; -} - - -// tree/set nodes -//typedef int nodeid_t; -typedef int64_t nodeid_t; // actually, a block number. FIXME. - -static const unsigned EBOFS_NODE_BLOCKS = 1; -static const unsigned EBOFS_NODE_BYTES = EBOFS_NODE_BLOCKS * EBOFS_BLOCK_SIZE; -static const unsigned EBOFS_MAX_NODE_REGIONS = 10; // pick a better value! - -struct ebofs_nodepool { - Extent node_usemap_even; // for even sb versions - Extent node_usemap_odd; // for odd sb versions - - int num_regions; - Extent region_loc[EBOFS_MAX_NODE_REGIONS]; -}; - - -// objects - -typedef uint64_t coll_t; - -struct ebofs_onode { - Extent onode_loc; /* this is actually the block we live in */ - - object_t object_id; /* for kicks */ - off_t object_size; /* file size in bytes. should this be 64-bit? */ - unsigned object_blocks; - bool readonly; - - int num_collections; - int num_attr; // num attr in onode - int num_extents; /* number of extents used. 
if 0, data is in the onode */ -}; - -struct ebofs_cnode { - Extent cnode_loc; /* this is actually the block we live in */ - coll_t coll_id; - int num_attr; // num attr in cnode -}; - - -// table -struct ebofs_table { - nodeid_t root; /* root node of btree */ - int num_keys; - int depth; -}; - - -// super -typedef uint64_t version_t; - -static const unsigned EBOFS_MAGIC = 0x000EB0F5; - -static const int EBOFS_NUM_FREE_BUCKETS = 5; /* see alloc.h for bucket constraints */ -static const int EBOFS_FREE_BUCKET_BITS = 2; - - -struct ebofs_super { - uint64_t s_magic; - uint64_t fsid; - - epoch_t epoch; // version of this superblock. - - uint64_t num_blocks; /* # blocks in filesystem */ - - // some basic stats, for kicks - uint64_t free_blocks; /* unused blocks */ - uint64_t limbo_blocks; /* limbo blocks */ - //unsigned num_objects; - //unsigned num_fragmented; - - struct ebofs_nodepool nodepool; - - // tables - struct ebofs_table free_tab[EBOFS_NUM_FREE_BUCKETS]; - struct ebofs_table limbo_tab; - struct ebofs_table alloc_tab; - struct ebofs_table object_tab; // object directory - struct ebofs_table collection_tab; // collection directory - struct ebofs_table co_tab; -}; - - -#endif diff --git a/branches/sage/crush/fakefuse.cc b/branches/sage/crush/fakefuse.cc deleted file mode 100644 index b08d00d11a5d6..0000000000000 --- a/branches/sage/crush/fakefuse.cc +++ /dev/null @@ -1,168 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/Monitor.h" - -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "client/Client.h" -#include "client/fuse.h" -#include "client/fuse_ll.h" - -#include "common/Timer.h" - -#include "msg/FakeMessenger.h" - - - - -#define NUMMDS g_conf.num_mds -#define NUMOSD g_conf.num_osd -#define NUMCLIENT g_conf.num_client - - -class C_Test : public Context { -public: - void finish(int r) { - cout << "C_Test->finish(" << r << ")" << std::endl; - } -}; -class C_Test2 : public Context { -public: - void finish(int r) { - cout << "C_Test2->finish(" << r << ")" << std::endl; - g_timer.add_event_after(2, new C_Test); - } -}; - - - -int main(int argc, char **argv) { - cerr << "fakefuse starting" << std::endl; - - // stop on our own (by default) - g_conf.mon_stop_on_last_unmount = true; - g_conf.mon_stop_with_last_mds = true; - - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - // start messenger thread - fakemessenger_startthread(); - - //g_timer.add_event_after(5.0, new C_Test2); - //g_timer.add_event_after(10.0, new C_Test); - - vector nargs; - for (unsigned i=0; imon_inst[i] = entity_inst_t(entity_name_t::MON(i), a); // hack ; see FakeMessenger.cc - } - - Monitor *mon[g_conf.num_mon]; - for (int i=0; iinit(); - for (int i=0; iinit(); - for (int i=0; iinit(); - - - // create client - Client *client[NUMCLIENT]; - for (int i=0; iinit(); - - - // start up fuse - // use my argc, argv (make sure you pass a mount point!) 
- client[i]->mount(); - - char *oldcwd = get_current_dir_name(); // note previous wd - cout << "starting fuse on pid " << getpid() << std::endl; - if (g_conf.fuse_ll) - ceph_fuse_ll_main(client[i], argc, argv); - else - ceph_fuse_main(client[i], argc, argv); - cout << "fuse finished on pid " << getpid() << std::endl; - ::chdir(oldcwd); // return to previous wd - free(oldcwd); - - client[i]->unmount(); - client[i]->shutdown(); - } - - - - // wait for it to finish - cout << "DONE -----" << std::endl; - fakemessenger_wait(); // blocks until messenger stops - - - // cleanup - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "mon/Monitor.h" -#include "client/Client.h" - -#include "client/SyntheticClient.h" - -#include "msg/FakeMessenger.h" - -#include "common/Timer.h" - - -class C_Test : public Context { -public: - void finish(int r) { - cout << "C_Test->finish(" << r << ")" << std::endl; - } -}; - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << std::endl; - exit(1); - } -}; - - -int main(int argc, char **argv) -{ - cerr << "fakesyn start" << std::endl; - - //cerr << "inode_t " << sizeof(inode_t) << std::endl; - - vector args; - argv_to_vec(argc, argv, args); - - // stop on our own (by default) - g_conf.mon_stop_on_last_unmount = true; - g_conf.mon_stop_with_last_mds = true; - - parse_config_options(args); - - int start = 0; - - parse_syn_options(args); - - vector nargs; - - for (unsigned i=0; imon_inst[i] = entity_inst_t(entity_name_t::MON(i), a); // hack ; see FakeMessenger.cc - } - - char hostname[100]; - gethostname(hostname,100); - //int pid = getpid(); - - // create mon - Monitor *mon[g_conf.num_mon]; - for 
(int i=0; iinit(); - } - for (int i=0; iinit(); - } - for (int i=0; iinit(); - if (g_conf.mds_local_osd) - mdsosd[i]->init(); - } - - - // create client(s) - Client *client[g_conf.num_client]; - SyntheticClient *syn[g_conf.num_client]; - for (int i=0; istart_thread(); - start++; - } - - - for (int i=0; ijoin_thread(); - delete syn[i]; - } - - - // wait for it to finish - fakemessenger_wait(); - - // cleanup - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __BUFFER_H -#define __BUFFER_H - -#include - -#include "common/Mutex.h" - - -//#define BUFFER_USE_CCPP - -#ifdef BUFFER_USE_CCPP -# include "cc++/thread.h" -#endif - -#include -#include - -using std::cout; - -#ifndef __CYGWIN__ -# include -#endif - -#define BUFFER_PAGE_SIZE 4096 // FIXME - -// -// these are in config.o -extern Mutex bufferlock; -extern long buffer_total_alloc; -// - - - - -class buffer { -private: - - /* hack for memory utilization debugging. */ - static void inc_total_alloc(unsigned len) { - bufferlock.Lock(); - buffer_total_alloc += len; - bufferlock.Unlock(); - } - static void dec_total_alloc(unsigned len) { - bufferlock.Lock(); - buffer_total_alloc -= len; - bufferlock.Unlock(); - } - - /* - * an abstract raw buffer. with a reference count. - */ - class raw { - public: - char *data; - unsigned len; -#ifdef BUFFER_USE_CCPP - mutable ost::AtomicCounter nref; // mutable for const-ness of operator<< -#else - int nref; - Mutex lock; // we'll make it non-recursive. -#endif - - raw(unsigned l) : len(l), nref(0) -#ifndef BUFFER_USE_CCPP - , lock(false) -#endif - { } - raw(char *c, unsigned l) : data(c), len(l), nref(0) -#ifndef BUFFER_USE_CCPP - , lock(false) -#endif - { } - virtual ~raw() {}; - - // no copying. 
- raw(const raw &other); - const raw& operator=(const raw &other); - - virtual raw* clone_empty() = 0; - raw *clone() { - raw *c = clone_empty(); - memcpy(c->data, data, len); - return c; - } - - bool is_page_aligned() { - return (long)data % BUFFER_PAGE_SIZE == 0; - } - }; - - friend std::ostream& operator<<(std::ostream& out, const raw &r); - - /* - * primitive buffer types - */ - class raw_char : public raw { - public: - raw_char(unsigned l) : raw(l) { - data = new char[len]; - inc_total_alloc(len); - } - ~raw_char() { - delete[] data; - dec_total_alloc(len); - } - raw* clone_empty() { - return new raw_char(len); - } - }; - - class raw_static : public raw { - public: - raw_static(const char *d, unsigned l) : raw((char*)d, l) { } - ~raw_static() {} - raw* clone_empty() { - return new raw_char(len); - } - }; - -#ifndef __CYGWIN__ - class raw_mmap_pages : public raw { - public: - raw_mmap_pages(unsigned l) : raw(l) { - data = (char*)::mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0); - inc_total_alloc(len); - } - ~raw_mmap_pages() { - ::munmap(data, len); - dec_total_alloc(len); - } - raw* clone_empty() { - return new raw_mmap_pages(len); - } - }; - - class raw_posix_aligned : public raw { - public: - raw_posix_aligned(unsigned l) : raw(l) { -#ifdef DARWIN - data = (char *) valloc (len); -#else - ::posix_memalign((void**)(void*)&data, BUFFER_PAGE_SIZE, len); -#endif /* DARWIN */ - inc_total_alloc(len); - } - ~raw_posix_aligned() { - ::free((void*)data); - dec_total_alloc(len); - } - raw* clone_empty() { - return new raw_posix_aligned(len); - } - }; -#endif - -#ifdef __CYGWIN__ - class raw_hack_aligned : public raw { - char *realdata; - public: - raw_hack_aligned(unsigned l) : raw(l) { - realdata = new char[len+BUFFER_PAGE_SIZE-1]; - unsigned off = ((unsigned)realdata) % BUFFER_PAGE_SIZE; - if (off) - data = realdata + BUFFER_PAGE_SIZE - off; - else - data = realdata; - inc_total_alloc(len+BUFFER_PAGE_SIZE-1); - //cout << "hack aligned " << 
(unsigned)data - //<< " in raw " << (unsigned)realdata - //<< " off " << off << std::endl; - assert(((unsigned)data & (BUFFER_PAGE_SIZE-1)) == 0); - } - ~raw_hack_aligned() { - delete[] realdata; - dec_total_alloc(len+BUFFER_PAGE_SIZE-1); - } - raw* clone_empty() { - return new raw_hack_aligned(len); - } - }; -#endif - -public: - - /* - * named constructors - */ - - static raw* copy(const char *c, unsigned len) { - raw* r = new raw_char(len); - memcpy(r->data, c, len); - return r; - } - static raw* create(unsigned len) { - return new raw_char(len); - } - - static raw* create_page_aligned(unsigned len) { -#ifndef __CYGWIN__ - //return new raw_mmap_pages(len); - return new raw_posix_aligned(len); -#else - return new raw_hack_aligned(len); -#endif - } - - - /* - * a buffer pointer. references (a subsequence of) a raw buffer. - */ - class ptr { - raw *_raw; - unsigned _off, _len; - - public: - ptr() : _raw(0), _off(0), _len(0) {} - ptr(raw *r) : _raw(r), _off(0), _len(r->len) { // no lock needed; this is an unref raw. - ++r->nref; - } - ptr(unsigned l) : _off(0), _len(l) { - _raw = create(l); - ++_raw->nref; - } - ptr(char *d, unsigned l) : _off(0), _len(l) { // ditto. - _raw = copy(d, l); - ++_raw->nref; - } - ptr(const ptr& p) : _raw(p._raw), _off(p._off), _len(p._len) { - if (_raw) { -#ifdef BUFFER_USE_CCPP - ++_raw->nref; -#else - _raw->lock.Lock(); - ++_raw->nref; - _raw->lock.Unlock(); -#endif - } - } - ptr(const ptr& p, unsigned o, unsigned l) : _raw(p._raw), _off(p._off + o), _len(l) { - assert(o+l <= p._len); - assert(_raw); -#ifdef BUFFER_USE_CCPP - ++_raw->nref; -#else - _raw->lock.Lock(); - ++_raw->nref; - _raw->lock.Unlock(); -#endif - } - ptr& operator= (const ptr& p) { - // be careful -- we need to properly handle self-assignment. 
- if (p._raw) { -#ifdef BUFFER_USE_CCPP - ++p._raw->nref; // inc new -#else - p._raw->lock.Lock(); - ++p._raw->nref; // inc new - p._raw->lock.Unlock(); -#endif - } - release(); // dec (+ dealloc) old (if any) - _raw = p._raw; // change my ref - _off = p._off; - _len = p._len; - return *this; - } - ~ptr() { - release(); - } - - raw *clone() { - return _raw->clone(); - } - - bool do_cow() { - if (_raw->nref > 1) { - //std::cout << "doing cow on " << _raw << " len " << _len << std::endl; - raw *newraw = _raw->clone(); - release(); - newraw->nref++; - _raw = newraw; - return true; - } else - return false; - } - - void swap(ptr& other) { - raw *r = _raw; - unsigned o = _off; - unsigned l = _len; - _raw = other._raw; - _off = other._off; - _len = other._len; - other._raw = r; - other._off = o; - other._len = l; - } - - void release() { - if (_raw) { -#ifndef BUFFER_USE_CCPP - _raw->lock.Lock(); -#endif - if (--_raw->nref == 0) { - //cout << "hosing raw " << (void*)_raw << " len " << _raw->len << std::endl; -#ifndef BUFFER_USE_CCPP - _raw->lock.Unlock(); -#endif - delete _raw; // dealloc old (if any) - } else { -#ifndef BUFFER_USE_CCPP - _raw->lock.Unlock(); -#endif - } - _raw = 0; - } - } - - // misc - bool at_buffer_head() const { return _off == 0; } - bool at_buffer_tail() const { return _off + _len == _raw->len; } - - bool is_page_aligned() const { return (long)c_str() % BUFFER_PAGE_SIZE == 0; } - - // accessors - raw *get_raw() const { return _raw; } - const char *c_str() const { assert(_raw); return _raw->data + _off; } - char *c_str() { assert(_raw); return _raw->data + _off; } - unsigned length() const { return _len; } - unsigned offset() const { return _off; } - unsigned start() const { return _off; } - unsigned end() const { return _off + _len; } - unsigned unused_tail_length() const { - if (_raw) - return _raw->len - (_off+_len); - else - return 0; - } - const char& operator[](unsigned n) const { - assert(_raw); - assert(n < _len); - return _raw->data[_off + 
n]; - } - char& operator[](unsigned n) { - assert(_raw); - assert(n < _len); - return _raw->data[_off + n]; - } - - const char *raw_c_str() const { assert(_raw); return _raw->data; } - unsigned raw_length() const { assert(_raw); return _raw->len; } - int raw_nref() const { assert(_raw); return _raw->nref; } - - void copy_out(unsigned o, unsigned l, char *dest) const { - assert(_raw); - assert(o >= 0 && o <= _len); - assert(l >= 0 && o+l <= _len); - memcpy(dest, c_str()+o, l); - } - - unsigned wasted() { - assert(_raw); - return _raw->len - _len; - } - - // modifiers - void set_offset(unsigned o) { _off = o; } - void set_length(unsigned l) { _len = l; } - - void append(const char *p, unsigned l) { - assert(_raw); - assert(l <= unused_tail_length()); - memcpy(c_str() + _len, p, l); - _len += l; - } - - void copy_in(unsigned o, unsigned l, const char *src) { - assert(_raw); - assert(o >= 0 && o <= _len); - assert(l >= 0 && o+l <= _len); - memcpy(c_str()+o, src, l); - } - - void zero() { - memset(c_str(), 0, _len); - } - - void clean() { - //raw *newraw = _raw->makesib(_len); - } - }; - - friend std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp); - - /* - * list - the useful bit! - */ - - class list { - // my private bits - list *bl; - std::list _buffers; - unsigned _len; - - ptr append_buffer; // where i put small appends. - - public: - class iterator { - list *bl; - std::list &ls; - unsigned off; // in bl - std::list::iterator p; - unsigned p_off; // in *p - public: - // constructor. position. 
- iterator(list *l, unsigned o=0) : - bl(l), ls(bl->_buffers), off(0), p(ls.begin()), p_off(0) { - advance(o); - } - iterator(list *l, unsigned o, std::list::iterator ip, unsigned po) : - bl(l), ls(bl->_buffers), off(0), p(ip), p_off(po) { } - - iterator operator=(const iterator& other) { - return iterator(bl, off, p, p_off); - } - - unsigned get_off() { return off; } - - bool end() { - return p == ls.end(); - } - - void advance(unsigned o) { - //cout << this << " advance " << o << " from " << off << " (p_off " << p_off << " in " << p->length() << ")" << std::endl; - p_off += o; - while (p_off > 0) { - assert(p != ls.end()); - if (p_off >= p->length()) { - // skip this buffer - p_off -= p->length(); - p++; - } else { - // somewhere in this buffer! - break; - } - } - off += o; - } - - void seek(unsigned o) { - //cout << this << " seek " << o << std::endl; - p = ls.begin(); - off = p_off = 0; - advance(o); - } - - char operator*() { - assert(p != ls.end()); - return (*p)[p_off]; - } - iterator& operator++() { - assert(p != ls.end()); - advance(1); - return *this; - } - - // copy data out. - // note that these all _append_ to dest! 
- - void copy(unsigned len, char *dest) { - if (p == ls.end()) seek(off); - while (len > 0) { - assert(p != ls.end()); - - unsigned howmuch = p->length() - p_off; - if (len < howmuch) howmuch = len; - p->copy_out(p_off, howmuch, dest); - dest += howmuch; - - len -= howmuch; - advance(howmuch); - } - } - - void copy(unsigned len, list &dest) { - if (p == ls.end()) seek(off); - while (len > 0) { - assert(p != ls.end()); - - unsigned howmuch = p->length() - p_off; - if (len < howmuch) howmuch = len; - dest.append(*p, p_off, howmuch); - - len -= howmuch; - advance(howmuch); - } - } - - void copy(unsigned len, std::string &dest) { - if (p == ls.end()) seek(off); - while (len > 0) { - assert(p != ls.end()); - - unsigned howmuch = p->length() - p_off; - if (len < howmuch) howmuch = len; - dest.append(p->c_str() + p_off, howmuch); - - len -= howmuch; - advance(howmuch); - } - } - - // copy data in - - void copy_in(unsigned len, const char *src) { - // copy - if (p == ls.end()) seek(off); - while (len > 0) { - assert(p != ls.end()); - - unsigned howmuch = p->length() - p_off; - if (len < howmuch) howmuch = len; - p->copy_in(p_off, howmuch, src); - - src += howmuch; - len -= howmuch; - advance(howmuch); - } - } - - void copy_in(unsigned len, const list& otherl) { - if (p == ls.end()) seek(off); - unsigned left = len; - for (std::list::const_iterator i = otherl._buffers.begin(); - i != otherl._buffers.end(); - i++) { - unsigned l = (*i).length(); - if (left < l) l = left; - copy_in(l, i->c_str()); - left -= l; - if (left == 0) break; - } - } - - }; - - private: - mutable iterator last_p; - - public: - // cons/des - list() : _len(0), last_p(this) {} - list(const list& other) : _buffers(other._buffers), _len(other._len), last_p(this) { } - list(unsigned l) : _len(0), last_p(this) { - ptr bp(l); - push_back(bp); - } - ~list() {} - - list& operator= (const list& other) { - _buffers = other._buffers; - _len = other._len; - return *this; - } - - const std::list& buffers() const { 
return _buffers; } - - void swap(list& other) { - unsigned t = _len; - _len = other._len; - other._len = t; - _buffers.swap(other._buffers); - append_buffer.swap(other.append_buffer); - } - - unsigned length() const { -#if 0 - // DEBUG: verify _len - unsigned len = 0; - for (std::list::const_iterator it = _buffers.begin(); - it != _buffers.end(); - it++) { - len += (*it).length(); - } - assert(len == _len); -#endif - return _len; - } - - bool is_page_aligned() const { - for (std::list::const_iterator it = _buffers.begin(); - it != _buffers.end(); - it++) - if (!it->is_page_aligned()) return false; - return true; - } - bool is_n_page_sized() const { - return length() % BUFFER_PAGE_SIZE == 0; - } - - // modifiers - void clear() { - _buffers.clear(); - _len = 0; - } - void push_front(ptr& bp) { - _buffers.push_front(bp); - _len += bp.length(); - } - void push_front(raw *r) { - ptr bp(r); - _buffers.push_front(bp); - _len += bp.length(); - } - void push_back(const ptr& bp) { - _buffers.push_back(bp); - _len += bp.length(); - } - void push_back(raw *r) { - ptr bp(r); - _buffers.push_back(bp); - _len += bp.length(); - } - void zero() { - for (std::list::iterator it = _buffers.begin(); - it != _buffers.end(); - it++) - it->zero(); - } - - // sort-of-like-assignment-op - void claim(list& bl) { - // free my buffers - clear(); - claim_append(bl); - } - void claim_append(list& bl) { - // steal the other guy's buffers - _len += bl._len; - _buffers.splice( _buffers.end(), bl._buffers ); - bl._len = 0; - } - - - - - iterator begin() { - return iterator(this, 0); - } - iterator end() { - return iterator(this, _len, _buffers.end(), 0); - } - - - // crope lookalikes. - // **** WARNING: this are horribly inefficient for large bufferlists. 
**** - - // data OUT - - void copy(unsigned off, unsigned len, char *dest) const { - assert(off >= 0); - assert(off + len <= length()); - if (last_p.get_off() != off) - last_p.seek(off); - last_p.copy(len, dest); - } - - void copy(unsigned off, unsigned len, list &dest) const { - assert(off >= 0); - assert(off + len <= length()); - if (last_p.get_off() != off) - last_p.seek(off); - last_p.copy(len, dest); - } - - void copy(unsigned off, unsigned len, std::string& dest) const { - if (last_p.get_off() != off) - last_p.seek(off); - return last_p.copy(len, dest); - } - - void copy_in(unsigned off, unsigned len, const char *src) { - assert(off >= 0); - assert(off + len <= length()); - - if (last_p.get_off() != off) - last_p.seek(off); - last_p.copy_in(len, src); - } - - void copy_in(unsigned off, unsigned len, const list& src) { - if (last_p.get_off() != off) - last_p.seek(off); - last_p.copy_in(len, src); - } - - - void append(const char *data, unsigned len) { - while (len > 0) { - // put what we can into the existing append_buffer. - unsigned gap = append_buffer.unused_tail_length(); - if (gap > 0) { - if (gap > len) gap = len; - //cout << "append first char is " << data[0] << ", last char is " << data[len-1] << std::endl; - append_buffer.append(data, gap); - append(append_buffer, append_buffer.end() - gap, gap); // add segment to the list - len -= gap; - data += gap; - } - if (len == 0) break; // done! - - // make a new append_buffer! - unsigned alen = BUFFER_PAGE_SIZE * (((len-1) / BUFFER_PAGE_SIZE) + 1); - append_buffer = create_page_aligned(alen); - append_buffer.set_length(0); // unused, so far. - } - } - void append(const ptr& bp) { - push_back(bp); - } - void append(const ptr& bp, unsigned off, unsigned len) { - assert(len+off <= bp.length()); - if (!_buffers.empty() && - _buffers.back().get_raw() == bp.get_raw() && - _buffers.back().end() == bp.start() + off) { - // yay contiguous with tail bp! 
- _buffers.back().set_length(_buffers.back().length()+len); - _len += len; - } else { - // add new item to list - ptr tempbp(bp, off, len); - push_back(tempbp); - } - } - void append(const list& bl) { - _len += bl._len; - for (std::list::const_iterator p = bl._buffers.begin(); - p != bl._buffers.end(); - ++p) - _buffers.push_back(*p); - } - - - /* - * get a char - */ - const char& operator[](unsigned n) { - assert(n < _len); - for (std::list::iterator p = _buffers.begin(); - p != _buffers.end(); - p++) { - if (n >= p->length()) { - n -= p->length(); - continue; - } - return (*p)[n]; - } - assert(0); - } - - /* - * return a contiguous ptr to whole bufferlist contents. - */ - char *c_str() { - if (_buffers.size() == 1) { - return _buffers.front().c_str(); // good, we're already contiguous. - } - else if (_buffers.size() == 0) { - return 0; // no buffers - } - else { - ptr newbuf = create(length()); // make one new contiguous buffer. - copy(0, length(), newbuf.c_str()); // copy myself into it. - clear(); - push_back(newbuf); - return newbuf.c_str(); // now it'll work. - } - } - - void substr_of(const list& other, unsigned off, unsigned len) { - assert(off + len <= other.length()); - clear(); - - // skip off - std::list::const_iterator curbuf = other._buffers.begin(); - while (off > 0 && - off >= curbuf->length()) { - // skip this buffer - //cout << "skipping over " << *curbuf << std::endl; - off -= (*curbuf).length(); - curbuf++; - } - assert(len == 0 || curbuf != other._buffers.end()); - - while (len > 0) { - // partial? - if (off + len < curbuf->length()) { - //cout << "copying partial of " << *curbuf << std::endl; - _buffers.push_back( ptr( *curbuf, off, len ) ); - _len += len; - break; - } - - // through end - //cout << "copying end (all?) 
of " << *curbuf << std::endl; - unsigned howmuch = curbuf->length() - off; - _buffers.push_back( ptr( *curbuf, off, howmuch ) ); - _len += howmuch; - len -= howmuch; - off = 0; - curbuf++; - } - } - - - - // funky modifer - void splice(unsigned off, unsigned len, list *claim_by=0 /*, bufferlist& replace_with */) { // fixme? - assert(off < length()); - assert(len > 0); - //cout << "splice off " << off << " len " << len << " ... mylen = " << length() << std::endl; - - // skip off - std::list::iterator curbuf = _buffers.begin(); - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - //cout << "off = " << off << " skipping over " << *curbuf << std::endl; - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! - //cout << "off = " << off << " somewhere in " << *curbuf << std::endl; - break; - } - } - assert(off >= 0); - - if (off) { - // add a reference to the front bit - // insert it before curbuf (which we'll hose) - //cout << "keeping front " << off << " of " << *curbuf << std::endl; - _buffers.insert( curbuf, ptr( *curbuf, 0, off ) ); - _len += off; - } - - while (len > 0) { - // partial? - if (off + len < (*curbuf).length()) { - //cout << "keeping end of " << *curbuf << ", losing first " << off+len << std::endl; - if (claim_by) - claim_by->append( *curbuf, off, len ); - (*curbuf).set_offset( off+len + (*curbuf).offset() ); // ignore beginning big - (*curbuf).set_length( (*curbuf).length() - (len+off) ); - _len -= off+len; - //cout << " now " << *curbuf << std::endl; - break; - } - - // hose though the end - unsigned howmuch = (*curbuf).length() - off; - //cout << "discarding " << howmuch << " of " << *curbuf << std::endl; - if (claim_by) - claim_by->append( *curbuf, off, howmuch ); - _len -= (*curbuf).length(); - _buffers.erase( curbuf++ ); - len -= howmuch; - off = 0; - } - - // splice in *replace (implement me later?) 
- } - - }; - -}; - -typedef buffer::ptr bufferptr; -typedef buffer::list bufferlist; - - -inline bool operator>(bufferlist& l, bufferlist& r) { - for (unsigned p = 0; ; p++) { - if (l.length() > p && r.length() == p) return true; - if (l.length() == p) return false; - if (l[p] > r[p]) return true; - if (l[p] < r[p]) return false; - p++; - } -} -inline bool operator>=(bufferlist& l, bufferlist& r) { - for (unsigned p = 0; ; p++) { - if (l.length() > p && r.length() == p) return true; - if (r.length() == p && l.length() == p) return true; - if (l[p] > r[p]) return true; - if (l[p] < r[p]) return false; - p++; - } -} -inline bool operator<(bufferlist& l, bufferlist& r) { - return r > l; -} -inline bool operator<=(bufferlist& l, bufferlist& r) { - return r >= l; -} - - -inline std::ostream& operator<<(std::ostream& out, const buffer::raw &r) { - return out << "buffer::raw(" << (void*)r.data << " len " << r.len << " nref " << r.nref << ")"; -} - -inline std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp) { - out << "buffer::ptr(" << bp.offset() << "~" << bp.length() - << " " << (void*)bp.c_str() - << " in raw " << (void*)bp.raw_c_str() - << " len " << bp.raw_length() - << " nref " << bp.raw_nref() << ")"; - return out; -} - -inline std::ostream& operator<<(std::ostream& out, const buffer::list& bl) { - out << "buffer::list(len=" << bl.length() << "," << std::endl; - - std::list::const_iterator it = bl.buffers().begin(); - while (it != bl.buffers().end()) { - out << "\t" << *it; - if (++it == bl.buffers().end()) break; - out << "," << std::endl; - } - out << std::endl << ")"; - return out; -} - - - - -// ---------------------------------------------------------- -// encoders - -// DEPRECATED, please use _(en|de)code_(simple|complex) - -// raw -template -inline void _encoderaw(const T& t, bufferlist& bl) -{ - bl.append((char*)&t, sizeof(t)); -} -template -inline void _decoderaw(T& t, bufferlist& bl, int& off) -{ - bl.copy(off, sizeof(t), (char*)&t); - off 
+= sizeof(t); -} - -#include -#include -#include -#include -#include -#include - -// list -template -inline void _encode(const std::list& ls, bufferlist& bl) -{ - // should i pre- or post- count? - if (!ls.empty()) { - unsigned pos = bl.length(); - uint32_t n = 0; - _encoderaw(n, bl); - for (typename std::list::const_iterator p = ls.begin(); p != ls.end(); ++p) { - n++; - _encode(*p, bl); - } - bl.copy_in(pos, sizeof(n), (char*)&n); - } else { - uint32_t n = ls.size(); // FIXME: this is slow on a list. - _encoderaw(n, bl); - for (typename std::list::const_iterator p = ls.begin(); p != ls.end(); ++p) - _encode(*p, bl); - } -} -template -inline void _decode(std::list& ls, bufferlist& bl, int& off) -{ - uint32_t n; - _decoderaw(n, bl, off); - ls.clear(); - while (n--) { - T v; - _decode(v, bl, off); - ls.push_back(v); - } -} - -// deque -template -inline void _encode(const std::deque& ls, bufferlist& bl) -{ - uint32_t n = ls.size(); - _encoderaw(n, bl); - for (typename std::deque::const_iterator p = ls.begin(); p != ls.end(); ++p) - _encode(*p, bl); -} -template -inline void _decode(std::deque& ls, bufferlist& bl, int& off) -{ - uint32_t n; - _decoderaw(n, bl, off); - ls.clear(); - while (n--) { - T v; - _decode(v, bl, off); - ls.push_back(v); - } -} - -// set -template -inline void _encode(const std::set& s, bufferlist& bl) -{ - uint32_t n = s.size(); - _encoderaw(n, bl); - for (typename std::set::const_iterator p = s.begin(); p != s.end(); ++p) - _encode(*p, bl); -} -template -inline void _decode(std::set& s, bufferlist& bl, int& off) -{ - uint32_t n; - _decoderaw(n, bl, off); - s.clear(); - while (n--) { - T v; - _decode(v, bl, off); - s.insert(v); - } -} - -// vector -template -inline void _encode(const std::vector& v, bufferlist& bl) -{ - uint32_t n = v.size(); - _encoderaw(n, bl); - for (typename std::vector::const_iterator p = v.begin(); p != v.end(); ++p) - _encode(*p, bl); -} -template -inline void _decode(std::vector& v, bufferlist& bl, int& off) -{ - 
uint32_t n; - _decoderaw(n, bl, off); - v.resize(n); - for (uint32_t i=0; i -inline void _encode(const std::map& m, bufferlist& bl) -{ - uint32_t n = m.size(); - _encoderaw(n, bl); - for (typename std::map::const_iterator p = m.begin(); p != m.end(); ++p) { - _encode(p->first, bl); - _encode(p->second, bl); - } -} -template -inline void _decode(std::map& m, bufferlist& bl, int& off) -{ - uint32_t n; - _decoderaw(n, bl, off); - m.clear(); - while (n--) { - T k; - _decode(k, bl, off); - _decode(m[k], bl, off); - } -} - -// hash_map -template -inline void _encode(const __gnu_cxx::hash_map& m, bufferlist& bl) -{ - uint32_t n = m.size(); - _encoderaw(n, bl); - for (typename __gnu_cxx::hash_map::const_iterator p = m.begin(); p != m.end(); ++p) { - _encode(p->first, bl); - _encode(p->second, bl); - } -} -template -inline void _decode(__gnu_cxx::hash_map& m, bufferlist& bl, int& off) -{ - uint32_t n; - _decoderaw(n, bl, off); - m.clear(); - while (n--) { - T k; - _decode(k, bl, off); - _decode(m[k], bl, off); - } -} - -// string -inline void _encode(const std::string& s, bufferlist& bl) -{ - uint32_t len = s.length(); - _encoderaw(len, bl); - bl.append(s.data(), len); -} -inline void _decode(std::string& s, bufferlist& bl, int& off) -{ - uint32_t len; - _decoderaw(len, bl, off); - s.clear(); - bl.copy(off, len, s); - off += len; -} - -// const char* (encode only, string compatible) -inline void _encode(const char *s, bufferlist& bl) -{ - uint32_t len = strlen(s); - _encoderaw(len, bl); - bl.append(s, len); -} - -// bufferptr (encapsulated) -inline void _encode(const buffer::ptr& bp, bufferlist& bl) -{ - uint32_t len = bp.length(); - _encoderaw(len, bl); - bl.append(bp); -} -inline void _decode(buffer::ptr& bp, bufferlist& bl, int& off) -{ - uint32_t len; - _decoderaw(len, bl, off); - - bufferlist s; - bl.copy(off, len, s); - off += len; - - if (s.buffers().size() == 1) - bp = s.buffers().front(); - else - bp = buffer::copy(s.c_str(), s.length()); -} - -// bufferlist 
(encapsulated) -inline void _encode(const bufferlist& s, bufferlist& bl) -{ - uint32_t len = s.length(); - _encoderaw(len, bl); - bl.append(s); -} -inline void _encode_destructively(bufferlist& s, bufferlist& bl) -{ - uint32_t len = s.length(); - _encoderaw(len, bl); - bl.claim_append(s); -} -inline void _decode(bufferlist& s, bufferlist& bl, int& off) -{ - uint32_t len; - _decoderaw(len, bl, off); - s.clear(); - bl.copy(off, len, s); - off += len; -} - -// base -template -inline void _encode(const T& t, bufferlist& bl) -{ - _encoderaw(t, bl); -} -template -inline void _decode(T& t, bufferlist& bl, int& off) -{ - _decoderaw(t, bl, off); -} - - - -#endif diff --git a/branches/sage/crush/include/ceph_fs.h b/branches/sage/crush/include/ceph_fs.h deleted file mode 100644 index ede0663a79158..0000000000000 --- a/branches/sage/crush/include/ceph_fs.h +++ /dev/null @@ -1,163 +0,0 @@ -/* ceph_fs.h - * - * C data types to share between kernel and userspace - */ - -#ifndef _FS_CEPH_CEPH_FS_H -#define _FS_CEPH_CEPH_FS_H - -#include - - -typedef __u64 ceph_ino_t; - - -/** - * object id - */ -struct ceph_object { - ceph_ino_t ino; /* inode "file" identifier */ - __u32 bno; /* "block" (object) in that "file" */ - __u32 rev; /* revision. normally ctime (as epoch). */ -}; -typedef struct ceph_object ceph_object_t; - - - - -/** object layout - * how objects are mapped into PGs - */ -#define CEPH_OBJECT_LAYOUT_HASH 1 -#define CEPH_OBJECT_LAYOUT_LINEAR 2 -#define CEPH_OBJECT_LAYOUT_HASHINO 3 - -/** - * pg layout -- how PGs are mapped into (sets of) OSDs - */ -#define CEPH_PG_LAYOUT_CRUSH 0 -#define CEPH_PG_LAYOUT_HASH 1 -#define CEPH_PG_LAYOUT_LINEAR 2 -#define CEPH_PG_LAYOUT_HYBRID 3 - - -/** - * ceph_file_layout - describe data layout for a file/inode - */ -struct ceph_file_layout { - /* file -> object mapping */ - __u32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple of page size. 
*/ - __u32 fl_stripe_count; /* over this many objects */ - __u32 fl_object_size; /* until objects are this big, then move to new objects */ - - /* pg -> disk layout */ - __u32 fl_object_stripe_unit; /* for per-object raid */ - - /* object -> pg layout */ - __s32 fl_pg_preferred; /* preferred primary for pg */ - __u8 fl_pg_type; /* pg type; see PG_TYPE_* */ - __u8 fl_pg_size; /* pg size (num replicas, raid stripe width, etc. */ -}; - -#define ceph_file_layout_stripe_width(l) (l.fl_stripe_unit * l.fl_stripe_count) - -/* period = bytes before i start on a new set of objects */ -#define ceph_file_layout_period(l) (l.fl_object_size * l.fl_stripe_count) - - - -/** - * placement group id - */ -#define CEPH_PG_TYPE_REP 1 -#define CEPH_PG_TYPE_RAID4 2 - -union ceph_pg { - __u64 pg64; - struct { - __s32 preferred; /* preferred primary osd */ - __u16 ps; /* placement seed */ - __u8 type; - __u8 size; - } pg; -}; -typedef union ceph_pg ceph_pg_t; - -#define ceph_pg_is_rep(pg) (pg.pg.type == CEPH_PG_TYPE_REP) -#define ceph_pg_is_raid4(pg) (pg.pg.type == CEPH_PG_TYPE_RAID4) - -/** - * object layout - * - * describe how a given object should be stored. - */ -struct ceph_object_layout { - ceph_pg_t ol_pgid; - __u32 ol_stripe_unit; -}; - - - -/** - * object extent - */ -struct ceph_object_extent { - ceph_object_t oe_oid; - __u64 oe_start; - __u64 oe_length; - struct ceph_object_layout oe_object_layout; - - /* buffer extent reverse mapping? 
*/ -}; - - - - - -/********************************************* - * message types - */ - -/* - * entity_name - */ -struct ceph_entity_name { - __u32 type; - __u32 num; -}; - -#define CEPH_ENTITY_TYPE_MON 1 -#define CEPH_ENTITY_TYPE_MDS 2 -#define CEPH_ENTITY_TYPE_OSD 3 -#define CEPH_ENTITY_TYPE_CLIENT 4 -#define CEPH_ENTITY_TYPE_ADMIN 5 - - -/* - * entity_addr - * ipv4 only for now - */ -struct ceph_entity_addr { - __u64 nonce; - __u32 port; - __u8 ipq[4]; -}; - - -struct ceph_entity_inst { - struct ceph_entity_name name; - struct ceph_entity_addr addr; -}; - - -/* - * message header - */ -struct ceph_message_header { - __u32 type; - struct ceph_entity_inst src, dst; - __u32 source_port, dest_port; - __u32 nchunks; -}; - -#endif diff --git a/branches/sage/crush/include/filepath.h b/branches/sage/crush/include/filepath.h deleted file mode 100644 index 4425e1d7c5b3a..0000000000000 --- a/branches/sage/crush/include/filepath.h +++ /dev/null @@ -1,184 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __FILEPATH_H -#define __FILEPATH_H - - -/* - * BUG: /a/b/c is equivalent to a/b/c in dentry-breakdown, but not string. - * -> should it be different? how? should this[0] be "", with depth 4? - * - */ - - -#include -#include -#include -using namespace std; - -#include "buffer.h" - - -class filepath { - /** path - * can be relative "a/b/c" or absolute "/a/b/c". - */ - string path; - - /** bits - path segemtns - * this is ['a', 'b', 'c'] for both the aboslute and relative case. - * - * NOTE: this value is LAZILY maintained... i.e. 
it's a cache - */ - mutable vector bits; - - void rebuild_path() { - if (absolute()) - path = "/"; - else - path.clear(); - for (unsigned i=0; i 0) parse_bits(); - return bits.size(); - } - bool empty() const { - return path.length() == 0; - } - - // FIXME: const-edness - bool absolute() { return path.length() && path[0] == '/'; } - bool relative() { return !absolute(); } - - const string& operator[](int i) const { - if (bits.empty() && path.length() > 0) parse_bits(); - return bits[i]; - } - - const string& last_dentry() const { - if (bits.empty() && path.length() > 0) parse_bits(); - return bits[ bits.size()-1 ]; - } - - filepath prefixpath(int s) const { - filepath t; - for (int i=0; i 0) parse_bits(); - bits.pop_back(); - rebuild_path(); - } - void push_dentry(const string& s) { - if (bits.empty() && path.length() > 0) parse_bits(); - bits.push_back(s); - if (path.length() && path[path.length()-1] != '/') - path += "/"; - path += s; - } - void append(const filepath& a) { - for (unsigned i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __FRAG_H -#define __FRAG_H - -#include -#include -#include -#include -#include "buffer.h" -#include "encodable.h" - -/* - * - * the goal here is to use a binary split strategy to partition a namespace. - * frag_t represents a particular fragment. bits() tells you the size of the - * fragment, and value() it's name. this is roughly analogous to an ip address - * and netmask. - * - * fragtree_t represents an entire namespace and it's partition. it essentially - * tells you where fragments are split into other fragments, and by how much - * (i.e. by how many bits, resulting in a power of 2 number of child fragments). 
- * - * this vaguely resembles a btree, in that when a fragment becomes large or small - * we can split or merge, except that there is no guarantee of being balanced. - * - * presumably we are partitioning the output of a (perhaps specialized) hash - * function. - */ - -/** - * frag_t - * - * description of an individual fragment. that is, a particular piece - * of the overall namespace. - * - * this is conceptually analogous to an ip address and netmask. - * - * a value v falls "within" fragment f iff (v & f.mask()) == f.value(). - * - * we write it as v/b, where v is a value and b is the number of bits. - * 0/0 (bits==0) corresponds to the entire namespace. if we bisect that, - * we get 0/1 and 1/1. quartering gives us 0/2, 1/2, 2/2, 3/2. and so on. - * - * this makes the right most bit of v the "most significant", which is the - * opposite of what we usually see. - */ - -/* - * TODO: - * - get_first_child(), next_sibling(int parent_bits) to make (possibly partial) - * iteration efficient (see, e.g., try_assimilate_children() - * - rework frag_t so that we mask the left-most (most significant) bits instead of - * the right-most (least significant) bits. just because it's more intutive, and - * matches the network/netmask concept. - */ - -typedef uint32_t _frag_t; - -class frag_t { - /* encoded value. 
- * 8 upper bits = "bits" - * 24 lower bits = "value" - */ - _frag_t _enc; - - public: - frag_t() : _enc(0) { } - frag_t(unsigned v, unsigned b) : _enc((b << 24) + - (v & (0xffffffffULL >> (32-b)))) { } - frag_t(_frag_t e) : _enc(e) { } - - // constructors - void from_unsigned(unsigned e) { _enc = e; } - - // accessors - unsigned value() const { return _enc & 0xffffff; } - unsigned bits() const { return _enc >> 24; } - unsigned mask() const { return 0xffffffffULL >> (32-bits()); } - - operator _frag_t() const { return _enc; } - - // tests - bool contains(unsigned v) const { - return (v & mask()) == value(); - } - bool contains(frag_t sub) const { - return (sub.bits() >= bits() && // they at least as specific as us, - (sub.value() & mask()) == value()); // and they are contained by us. - } - bool is_root() const { - return bits() == 0; - } - frag_t parent() const { - assert(bits() > 0); - return frag_t(value() & (mask() >> 1), bits()-1); - } - - // splitting - void split(int nb, std::list& fragments) const { - assert(nb > 0); - unsigned nway = 1 << nb; - for (unsigned i=0; i 0 && - (value() & (1 << (bits()-1)) == 0); - } - bool is_right() const { - return - bits() > 0 && - (value() & (1 << (bits()-1)) == 1); - } - frag_t left_child() const { - return frag_t(value(), bits()+1); - } - frag_t right_child() const { - return frag_t(value() | (1<: - // frag_t f is split by b bits. - // if child frag_t does not appear, it is not split. 
- std::map _splits; - - public: - // ------------- - // basics - void swap(fragtree_t& other) { - _splits.swap(other._splits); - } - - // ------------- - // accessors - bool empty() { - return _splits.empty(); - } - int get_split(const frag_t hb) const { - std::map::const_iterator p = _splits.find(hb); - if (p == _splits.end()) - return 0; - else - return p->second; - } - - - bool is_leaf(frag_t x) const { - std::list ls; - get_leaves_under(x, ls); - //cout << "is_leaf(" << x << ") -> " << ls << std::endl; - if (!ls.empty() && - ls.front() == x && - ls.size() == 1) - return true; - return false; - } - - /** - * get_leaves -- list all leaves - */ - void get_leaves(std::list& ls) const { - return get_leaves_under_split(frag_t(), ls); - } - - /** - * get_leaves_under_split -- list all leaves under a known split point (or root) - */ - void get_leaves_under_split(frag_t under, std::list& ls) const { - std::list q; - q.push_back(under); - while (!q.empty()) { - frag_t t = q.front(); - q.pop_front(); - int nb = get_split(t); - if (nb) - t.split(nb, q); // queue up children - else - ls.push_back(t); // not spit, it's a leaf. - } - } - - /** - * get_branch -- get branch point at OR above frag @x - * - may be @x itself, if @x is a split - * - may be root (frag_t()) - */ - frag_t get_branch(frag_t x) const { - while (1) { - if (x == frag_t()) return x; // root - if (get_split(x)) return x; // found it! - x = x.parent(); - } - } - - /** - * get_branch_above -- get a branch point above frag @x - * - may be root (frag_t()) - * - may NOT be @x, even if @x is a split. - */ - frag_t get_branch_above(frag_t x) const { - while (1) { - if (x == frag_t()) return x; // root - x = x.parent(); - if (get_split(x)) return x; // found it! 
- } - } - - - /** - * get_branch_or_leaf -- get branch or leaf point parent for frag @x - * - may be @x itself, if @x is a split or leaf - * - may be root (frag_t()) - */ - frag_t get_branch_or_leaf(frag_t x) const { - frag_t branch = get_branch(x); - int nb = get_split(branch); - if (nb > 0 && // if branch is a split, and - branch.bits() + nb <= x.bits()) // one of the children is or contains x - return frag_t(x.value(), branch.bits()+nb); // then return that child (it's a leaf) - else - return branch; - } - - /** - * get_leaves_under(x, ls) -- search for any leaves fully contained by x - */ - void get_leaves_under(frag_t x, std::list& ls) const { - std::list q; - q.push_back(get_branch(x)); - while (!q.empty()) { - frag_t t = q.front(); - q.pop_front(); - if (t.bits() >= x.bits() && // if t is more specific than x, and - !x.contains(t)) // x does not contain t, - continue; // then skip - int nb = get_split(t); - if (nb) - t.split(nb, q); // queue up children - else - ls.push_back(t); // not spit, it's a leaf. - } - } - - /** - * contains(fg) -- does fragtree contain the specific frag @x - */ - bool contains(frag_t x) const { - std::list q; - q.push_back(get_branch(x)); - while (!q.empty()) { - frag_t t = q.front(); - q.pop_front(); - if (t.bits() >= x.bits() && // if t is more specific than x, and - !x.contains(t)) // x does not contain t, - continue; // then skip - int nb = get_split(t); - if (nb) { - if (t == x) return false; // it's split. - t.split(nb, q); // queue up children - } else { - if (t == x) return true; // it's there. - } - } - return false; - } - - /** - * operator[] -- map a (hash?) value to a frag - */ - frag_t operator[](unsigned v) const { - frag_t t; - while (1) { - assert(t.contains(v)); - int nb = get_split(t); - - // is this a leaf? - if (nb == 0) return t; // done. - - // pick appropriate child fragment. 
- unsigned nway = 1 << nb; - unsigned i; - for (i=0; i children; - x.split(nb, children); - int childbits = 0; - for (std::list::iterator p = children.begin(); - p != children.end(); - ++p) { - int cb = get_split(*p); - if (!cb) return; // nope. - if (childbits && cb != childbits) return; // not the same - childbits = cb; - } - // all children are split with childbits! - for (std::list::iterator p = children.begin(); - p != children.end(); - ++p) - _splits.erase(*p); - _splits[x] += childbits; - } - - bool force_to_leaf(frag_t x) { - if (is_leaf(x)) - return false; - - cout << "force_to_leaf " << x << " on " << _splits << std::endl; - - frag_t parent = get_branch_or_leaf(x); - assert(parent.bits() <= x.bits()); - cout << "parent is " << parent << std::endl; - - // do we need to split from parent to x? - if (parent.bits() < x.bits()) { - int spread = x.bits() - parent.bits(); - int nb = get_split(parent); - cout << "spread " << spread << ", parent splits by " << nb << std::endl; - if (nb == 0) { - // easy: split parent (a leaf) by the difference - cout << "splitting parent " << parent << " by spread " << spread << std::endl; - split(parent, spread); - assert(is_leaf(x)); - return true; - } - assert(nb > spread); - - // add an intermediary split - merge(parent, nb); - split(parent, spread); - - std::list subs; - parent.split(spread, subs); - for (std::list::iterator p = subs.begin(); - p != subs.end(); - ++p) { - cout << "splitting intermediate " << *p << " by " << (nb-spread) << std::endl; - split(*p, nb - spread); - } - } - - // x is now a leaf or split. - // hoover up any children. 
- std::list q; - q.push_back(x); - while (!q.empty()) { - frag_t t = q.front(); - q.pop_front(); - int nb = get_split(t); - if (nb) { - cout << "merging child " << t << " by " << nb << std::endl; - merge(t, nb); // merge this point, and - t.split(nb, q); // queue up children - } - } - - cout << "force_to_leaf done" << std::endl; - assert(is_leaf(x)); - return true; - } - - // verify that we describe a legal partition of the namespace. - void verify() const { - std::map copy; - std::list q; - q.push_back(frag_t()); - - while (1) { - frag_t cur = q.front(); - q.pop_front(); - int b = get_split(cur); - if (!b) continue; - copy[cur] = b; - cur.split(b, q); - } - - assert(copy == _splits); - } - - // encoding - void _encode(bufferlist& bl) const { - ::_encode(_splits, bl); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(_splits, bl, off); - } - void _decode(bufferlist::iterator& p) { - ::_decode_simple(_splits, p); - } - - void print(std::ostream& out) { - out << "fragtree_t("; - std::list q; - q.push_back(frag_t()); - while (!q.empty()) { - frag_t t = q.front(); - q.pop_front(); - // newline + indent? 
- if (t.bits()) { - out << std::endl; - for (unsigned i=0; i q; - q.push_back(frag_t()); - while (!q.empty()) { - frag_t t = q.front(); - q.pop_front(); - int nb = ft.get_split(t); - if (nb) { - if (t.bits()) out << ' '; - out << t << '%' << nb; - t.split(nb, q); // queue up children - } - } - } - if (1) { - std::list leaves; - ft.get_leaves(leaves); - out << leaves; - } - return out << ")"; -} - - -/** - * fragset_t -- a set of fragments - */ -class fragset_t { - std::set _set; - -public: - std::set &get() { return _set; } - std::set::iterator begin() { return _set.begin(); } - std::set::iterator end() { return _set.end(); } - - bool empty() const { return _set.empty(); } - - bool contains(frag_t f) const { - while (1) { - if (_set.count(f)) return true; - if (f.bits() == 0) return false; - f = f.parent(); - } - } - - void insert(frag_t f) { - _set.insert(f); - simplify(); - } - - void simplify() { - while (1) { - bool clean = true; - std::set::iterator p = _set.begin(); - while (p != _set.end()) { - if (!p->is_root() && - _set.count(p->get_sibling())) { - _set.erase(p->get_sibling()); - _set.insert(p->parent()); - _set.erase(p++); - clean = false; - } else { - p++; - } - } - if (clean) - break; - } - } -}; - -inline std::ostream& operator<<(std::ostream& out, fragset_t& fs) -{ - return out << "fragset_t(" << fs.get() << ")"; -} - -#endif diff --git a/branches/sage/crush/include/hash.h b/branches/sage/crush/include/hash.h deleted file mode 100644 index 0c27d3535174f..0000000000000 --- a/branches/sage/crush/include/hash.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef __CEPHHASH_H -#define __CEPHHASH_H - -// Robert Jenkins' function for mixing 32-bit values -// http://burtleburtle.net/bob/hash/evahash.html -// a, b = random bits, c = input and output - -#define hashmix(a,b,c) \ - a=a-b; a=a-c; a=a^(c>>13); \ - b=b-c; b=b-a; b=b^(a<<8); \ - c=c-a; c=c-b; c=c^(b>>13); \ - a=a-b; a=a-c; a=a^(c>>12); \ - b=b-c; b=b-a; b=b^(a<<16); \ - c=c-a; c=c-b; c=c^(b>>5); \ - a=a-b; 
a=a-c; a=a^(c>>3); \ - b=b-c; b=b-a; b=b^(a<<10); \ - c=c-a; c=c-b; c=c^(b>>15); - - -//namespace ceph { - -template struct rjhash { }; - -inline uint64_t rjhash64(uint64_t key) { - key = (~key) + (key << 21); // key = (key << 21) - key - 1; - key = key ^ (key >> 24); - key = (key + (key << 3)) + (key << 8); // key * 265 - key = key ^ (key >> 14); - key = (key + (key << 2)) + (key << 4); // key * 21 - key = key ^ (key >> 28); - key = key + (key << 31); - return key; -} - -inline uint32_t rjhash32(uint32_t a) { - a = (a+0x7ed55d16) + (a<<12); - a = (a^0xc761c23c) ^ (a>>19); - a = (a+0x165667b1) + (a<<5); - a = (a+0xd3a2646c) ^ (a<<9); - a = (a+0xfd7046c5) + (a<<3); - a = (a^0xb55a4f09) ^ (a>>16); - return a; -} - - -template<> struct rjhash { - inline size_t operator()(const uint32_t x) const { -#ifdef __LP64__ - return rjhash64(x); -#else - return rjhash32(x); -#endif - } -}; - -template<> struct rjhash { - inline size_t operator()(const uint64_t x) const { -#ifdef __LP64__ - return rjhash64(x); -#else - return rjhash32(x) ^ rjhash32(x >> 32); -#endif - } -}; - -//} - - - -#endif diff --git a/branches/sage/crush/include/interval_set.h b/branches/sage/crush/include/interval_set.h deleted file mode 100644 index bc5edbc29441d..0000000000000 --- a/branches/sage/crush/include/interval_set.h +++ /dev/null @@ -1,315 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __INTERVAL_SET_H -#define __INTERVAL_SET_H - -#include -#include -#include -using namespace std; - -#ifndef MIN -# define MIN(a,b) ((a)<=(b) ? (a):(b)) -#endif -#ifndef MAX -# define MAX(a,b) ((a)>=(b) ? 
(a):(b)) -#endif - - -template -class interval_set { - public: - map m; // map start -> len - int _size; - - // helpers - private: - typename map::const_iterator find_inc(T start) const { - typename map::const_iterator p = m.lower_bound(start); // p->first >= start - if (p != m.begin() && - (p == m.end() || p->first > start)) { - p--; // might overlap? - if (p->first + p->second <= start) - p++; // it doesn't. - } - return p; - } - - typename map::iterator find_inc_m(T start) { - typename map::iterator p = m.lower_bound(start); - if (p != m.begin() && - (p == m.end() || p->first > start)) { - p--; // might overlap? - if (p->first + p->second <= start) - p++; // it doesn't. - } - return p; - } - - typename map::const_iterator find_adj(T start) const { - typename map::const_iterator p = m.lower_bound(start); - if (p != m.begin() && - (p == m.end() || p->first > start)) { - p--; // might touch? - if (p->first + p->second < start) - p++; // it doesn't. - } - return p; - } - - typename map::iterator find_adj_m(T start) { - typename map::iterator p = m.lower_bound(start); - if (p != m.begin() && - (p == m.end() || p->first > start)) { - p--; // might touch? - if (p->first + p->second < start) - p++; // it doesn't. 
- } - return p; - } - - public: - bool operator==(const interval_set& other) const { - return m == other.m; - } - - int size() { - return _size; - } - - void clear() { - m.clear(); - _size = 0; - } - - bool contains(T i) const { - typename map::const_iterator p = find_inc(i); - if (p == m.end()) return false; - if (p->first > i) return false; - if (p->first+p->second <= i) return false; - assert(p->first <= i && p->first+p->second > i); - return true; - } - bool contains(T start, T len) const { - typename map::const_iterator p = find_inc(start); - if (p == m.end()) return false; - if (p->first > start) return false; - if (p->first+p->second <= start) return false; - assert(p->first <= start && p->first+p->second > start); - if (p->first+p->second < start+len) return false; - return true; - } - bool intersects(T start, T len) const { - interval_set a; - a.insert(start, len); - interval_set i; - i.intersection_of( *this, a ); - if (i.empty()) return false; - return true; - } - - // outer range of set - bool empty() const { - return m.empty(); - } - T start() const { - assert(!empty()); - typename map::const_iterator p = m.begin(); - return p->first; - } - T end() const { - assert(!empty()); - typename map::const_iterator p = m.end(); - p--; - return p->first+p->second; - } - - // interval start after p (where p not in set) - bool starts_after(T i) const { - assert(!contains(i)); - typename map::const_iterator p = find_inc(i); - if (p == m.end()) return false; - return true; - } - T start_after(T i) const { - assert(!contains(i)); - typename map::const_iterator p = find_inc(i); - return p->first; - } - - // interval end that contains start - T end_after(T start) const { - assert(contains(start)); - typename map::const_iterator p = find_inc(start); - return p->first+p->second; - } - - void insert(T val) { - insert(val, 1); - } - - void insert(T start, T len) { - //cout << "insert " << start << "~" << len << endl; - assert(len > 0); - _size += len; - typename 
map::iterator p = find_adj_m(start); - if (p == m.end()) { - m[start] = len; // new interval - } else { - if (p->first < start) { - - if (p->first + p->second != start) { - //cout << "p is " << p->first << "~" << p->second << ", start is " << start << ", len is " << len << endl; - assert(0); - } - - assert(p->first + p->second == start); - p->second += len; // append to end - - typename map::iterator n = p; - n++; - if (n != m.end() && - start+len == n->first) { // combine with next, too! - p->second += n->second; - m.erase(n); - } - } else { - if (start+len == p->first) { - m[start] = len + p->second; // append to front - m.erase(p); - } else { - assert(p->first > start+len); - m[start] = len; // new interval - } - } - } - } - - void erase(T val) { - erase(val, 1); - } - - void erase(T start, T len) { - typename map::iterator p = find_inc_m(start); - - _size -= len; - - assert(p != m.end()); - assert(p->first <= start); - - T before = start - p->first; - assert(p->second >= before+len); - T after = p->second - before - len; - - if (before) - p->second = before; // shorten bit before - else - m.erase(p); - if (after) - m[start+len] = after; - } - - - void subtract(const interval_set &a) { - for (typename map::const_iterator p = a.m.begin(); - p != a.m.end(); - p++) - erase(p->first, p->second); - } - - void insert(const interval_set &a) { - for (typename map::const_iterator p = a.m.begin(); - p != a.m.end(); - p++) - insert(p->first, p->second); - } - - - void intersection_of(const interval_set &a, const interval_set &b) { - assert(&a != this); - assert(&b != this); - clear(); - - typename map::const_iterator pa = a.m.begin(); - typename map::const_iterator pb = b.m.begin(); - - while (pa != a.m.end() && pb != b.m.end()) { - // passing? 
- if (pa->first + pa->second <= pb->first) - { pa++; continue; } - if (pb->first + pb->second <= pa->first) - { pb++; continue; } - T start = MAX(pa->first, pb->first); - T end = MIN(pa->first+pa->second, pb->first+pb->second); - assert(end > start); - insert(start, end-start); - if (pa->first+pa->second > pb->first+pb->second) - pb++; - else - pa++; - } - } - - void union_of(const interval_set &a, const interval_set &b) { - assert(&a != this); - assert(&b != this); - clear(); - - //cout << "union_of" << endl; - - // a - m = a.m; - - // - (a*b) - interval_set ab; - ab.intersection_of(a, b); - subtract(ab); - - // + b - insert(b); - return; - } - void union_of(const interval_set &b) { - interval_set a; - a.m.swap(m); - union_of(a, b); - } - - bool subset_of(const interval_set &big) const { - for (typename map::const_iterator i = m.begin(); - i != m.end(); - i++) - if (!big.contains(i->first, i->second)) return false; - return true; - } - -}; - -template -inline ostream& operator<<(ostream& out, const interval_set &s) { - out << "["; - for (typename map::const_iterator i = s.m.begin(); - i != s.m.end(); - i++) { - if (i != s.m.begin()) out << ","; - out << i->first << "~" << i->second; - } - out << "]"; - return out; -} - - -#endif diff --git a/branches/sage/crush/include/object.h b/branches/sage/crush/include/object.h deleted file mode 100644 index 3b8ac05a86b38..0000000000000 --- a/branches/sage/crush/include/object.h +++ /dev/null @@ -1,99 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __OBJECT_H -#define __OBJECT_H - -#include - -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "hash.h" - -typedef uint32_t objectrev_t; - -struct object_t { - static const uint32_t MAXREV = 0xffffffffU; - - uint64_t ino; // "file" identifier - uint32_t bno; // "block" in that "file" - objectrev_t rev; // revision. normally ctime (as epoch). - - object_t() : ino(0), bno(0), rev(0) {} - object_t(uint64_t i, uint32_t b) : ino(i), bno(b), rev(0) {} - object_t(uint64_t i, uint32_t b, uint32_t r) : ino(i), bno(b), rev(r) {} -}; - - -inline bool operator==(const object_t l, const object_t r) { - return (l.ino == r.ino) && (l.bno == r.bno) && (l.rev == r.rev); -} -inline bool operator!=(const object_t l, const object_t r) { - return (l.ino != r.ino) || (l.bno != r.bno) || (l.rev != r.rev); -} -inline bool operator>(const object_t l, const object_t r) { - if (l.ino > r.ino) return true; - if (l.ino < r.ino) return false; - if (l.bno > r.bno) return true; - if (l.bno < r.bno) return false; - if (l.rev > r.rev) return true; - return false; -} -inline bool operator<(const object_t l, const object_t r) { - if (l.ino < r.ino) return true; - if (l.ino > r.ino) return false; - if (l.bno < r.bno) return true; - if (l.bno > r.bno) return false; - if (l.rev < r.rev) return true; - return false; -} -inline bool operator>=(const object_t l, const object_t r) { - return !(l < r); -} -inline bool operator<=(const object_t l, const object_t r) { - return !(l > r); -} -inline ostream& operator<<(ostream& out, const object_t o) { - out << hex << o.ino << '.'; - out.setf(ios::right); - out.fill('0'); - out << setw(8) << o.bno << dec; - out.unsetf(ios::right); - if (o.rev) - out << '.' 
<< o.rev; - return out; -} - - - - - -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const object_t &r) const { - static rjhash H; - static rjhash I; - //static hash H; - //static hash I; - return H(r.ino) ^ I(r.bno) ^ I(r.rev); - } - }; - -} -#endif diff --git a/branches/sage/crush/include/types.h b/branches/sage/crush/include/types.h deleted file mode 100644 index cf8374d329a77..0000000000000 --- a/branches/sage/crush/include/types.h +++ /dev/null @@ -1,303 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_TYPES_H -#define __MDS_TYPES_H - -extern "C" { -#include -#include -#include -#include -#include "statlite.h" -} - -#include -#include -#include -#include -#include -#include -#include - -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "ceph_fs.h" - - -#include "object.h" -#include "utime.h" - - -#ifndef MIN -# define MIN(a,b) ((a) < (b) ? (a):(b)) -#endif -#ifndef MAX -# define MAX(a,b) ((a) > (b) ? 
(a):(b)) -#endif - - -// -- stl crap -- - -namespace __gnu_cxx { - template<> struct hash< std::string > - { - size_t operator()( const std::string& x ) const - { - static hash H; - return H(x.c_str()); - } - }; - -#ifndef __LP64__ - template<> struct hash { - size_t operator()(int64_t __x) const { - static hash H; - return H((__x >> 32) ^ (__x & 0xffffffff)); - } - }; - template<> struct hash { - size_t operator()(uint64_t __x) const { - static hash H; - return H((__x >> 32) ^ (__x & 0xffffffff)); - } - }; -#endif - -} - - -/* - * comparators for stl containers - */ -// for hash_map: -// hash_map, eqstr> vals; -struct eqstr -{ - bool operator()(const char* s1, const char* s2) const - { - return strcmp(s1, s2) == 0; - } -}; - -// for set, map -struct ltstr -{ - bool operator()(const char* s1, const char* s2) const - { - return strcmp(s1, s2) < 0; - } -}; - - - -// ---------------------- -// some basic types - -typedef uint64_t tid_t; // transaction id -typedef uint64_t version_t; -typedef uint32_t epoch_t; // map epoch (32bits -> 13 epochs/second for 10 years) - - - -#define O_LAZY 01000000 - - - -typedef ceph_file_layout FileLayout; - - -// -------------------------------------- -// inode - -typedef __uint64_t _inodeno_t; - -struct inodeno_t { - _inodeno_t val; - inodeno_t() : val(0) {} - inodeno_t(_inodeno_t v) : val(v) {} - inodeno_t operator+=(inodeno_t o) { val += o.val; return *this; } - operator _inodeno_t() const { return val; } -}; - -inline ostream& operator<<(ostream& out, inodeno_t ino) { - return out << hex << ino.val << dec; -} - -namespace __gnu_cxx { - template<> struct hash< inodeno_t > - { - size_t operator()( const inodeno_t& x ) const - { - static rjhash H; - return H(x.val); - } - }; -} - - -#define INODE_MODE_FILE 0100000 // S_IFREG -#define INODE_MODE_SYMLINK 0120000 // S_IFLNK -#define INODE_MODE_DIR 0040000 // S_IFDIR -#define INODE_TYPE_MASK 0170000 - -#define FILE_MODE_R 1 -#define FILE_MODE_W 2 -#define FILE_MODE_RW (1|2) -#define 
FILE_MODE_LAZY 4 - -/** stat masks - */ -#define STAT_MASK_INO 1 // inode nmber -#define STAT_MASK_TYPE 2 // file type bits of the mode -#define STAT_MASK_BASE 4 // layout, symlink value -#define STAT_MASK_AUTH 8 // uid, gid, mode -#define STAT_MASK_LINK 16 // nlink, anchored -#define STAT_MASK_FILE 32 // mtime, size. - -#define STAT_MASK_ALL 63 - -#define STAT_MASK_SIZE STAT_MASK_FILE // size, blksize, blocks -#define STAT_MASK_MTIME STAT_MASK_FILE // mtime -#define STAT_MASK_ATIME STAT_MASK_FILE // atime -#define STAT_MASK_CTIME (STAT_MASK_FILE|STAT_MASK_AUTH|STAT_MASK_LINK) // ctime - -inline int DT_TO_MODE(int dt) { - return dt << 12; -} - -struct inode_t { - // base (immutable) - inodeno_t ino; - FileLayout layout; // ?immutable? - uint32_t rdev; // if special file - - // affected by any inode change... - utime_t ctime; // inode change time - - // perm (namespace permissions) - uint32_t mode; - uid_t uid; - gid_t gid; - - // nlink - int32_t nlink; - bool anchored; // auth only? - - // file (data access) - int64_t size, max_size, allocated_size; - utime_t mtime; // file data modify time. - utime_t atime; // file data access time. 
- - // special stuff - version_t version; // auth only - version_t file_data_version; // auth only - - // file type - bool is_symlink() { return (mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK; } - bool is_dir() { return (mode & INODE_TYPE_MASK) == INODE_MODE_DIR; } - bool is_file() { return (mode & INODE_TYPE_MASK) == INODE_MODE_FILE; } - - // corresponding d_types - static const unsigned char DT_REG = 8; - static const unsigned char DT_DIR = 4; - static const unsigned char DT_LNK = 10; -}; - -inline unsigned char MODE_TO_DT(int mode) { - return mode >> 12; -} - - - - - - -// dentries -#define MAX_DENTRY_LEN 255 - - - - -// -- io helpers -- - -template -inline ostream& operator<<(ostream& out, pair v) { - return out << v.first << "," << v.second; -} - -template -inline ostream& operator<<(ostream& out, vector& v) { - out << "["; - for (unsigned i=0; i -inline ostream& operator<<(ostream& out, const list& ilist) { - for (typename list::const_iterator it = ilist.begin(); - it != ilist.end(); - it++) { - if (it != ilist.begin()) out << ","; - out << *it; - } - return out; -} - -template -inline ostream& operator<<(ostream& out, const set& iset) { - for (typename set::const_iterator it = iset.begin(); - it != iset.end(); - it++) { - if (it != iset.begin()) out << ","; - out << *it; - } - return out; -} - -template -inline ostream& operator<<(ostream& out, const multiset& iset) { - for (typename multiset::const_iterator it = iset.begin(); - it != iset.end(); - it++) { - if (it != iset.begin()) out << ","; - out << *it; - } - return out; -} - -template -inline ostream& operator<<(ostream& out, const map& m) -{ - out << "{"; - for (typename map::const_iterator it = m.begin(); - it != m.end(); - it++) { - if (it != m.begin()) out << ","; - out << it->first << "=" << it->second; - } - out << "}"; - return out; -} - - - -#endif diff --git a/branches/sage/crush/include/utime.h b/branches/sage/crush/include/utime.h deleted file mode 100644 index 7fef5a7f930d2..0000000000000 
--- a/branches/sage/crush/include/utime.h +++ /dev/null @@ -1,149 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __UTIME_H -#define __UTIME_H - -#include -#include -#include - -// -------- -// utime_t - -typedef struct timeval _utime_t; - -class utime_t { - private: - struct timeval tv; - - struct timeval& timeval() { return tv; } - friend class Clock; - - - public: - void normalize() { - if (tv.tv_usec > 1000*1000) { - tv.tv_sec += tv.tv_usec / (1000*1000); - tv.tv_usec %= 1000*1000; - } - } - - // cons - utime_t() { tv.tv_sec = 0; tv.tv_usec = 0; normalize(); } - //utime_t(time_t s) { tv.tv_sec = s; tv.tv_usec = 0; } - utime_t(time_t s, int u) { tv.tv_sec = s; tv.tv_usec = u; normalize(); } - utime_t(const _utime_t &v) : tv(v) {} - /* - utime_t(double d) { - tv.tv_sec = (time_t)trunc(d); - tv.tv_usec = (__suseconds_t)((d - tv.tv_sec) / (double)1000000.0); - } - */ - - // accessors - time_t sec() const { return tv.tv_sec; } - long usec() const { return tv.tv_usec; } - int nsec() const { return tv.tv_usec*1000; } - - // ref accessors/modifiers - time_t& sec_ref() { return tv.tv_sec; } - // FIXME: tv.tv_usec is a __darwin_suseconds_t on Darwin. - // is just casting it to long& OK? 
- long& usec_ref() { return (long&) tv.tv_usec; } - - struct timeval& tv_ref() { return tv; } - - // cast to double - operator double() { - return (double)sec() + ((double)usec() / 1000000.0L); - } -}; - -// arithmetic operators -inline utime_t operator+(const utime_t& l, const utime_t& r) { - return utime_t( l.sec() + r.sec() + (l.usec()+r.usec())/1000000L, - (l.usec()+r.usec())%1000000L ); -} -inline utime_t& operator+=(utime_t& l, const utime_t& r) { - l.sec_ref() += r.sec() + (l.usec()+r.usec())/1000000L; - l.usec_ref() += r.usec(); - l.usec_ref() %= 1000000L; - return l; -} -inline utime_t& operator+=(utime_t& l, double f) { - double fs = trunc(f); - double us = (f - fs) * (double)1000000.0; - l.sec_ref() += (long)fs; - l.usec_ref() += (long)us; - l.normalize(); - return l; -} - -inline utime_t operator-(const utime_t& l, const utime_t& r) { - return utime_t( l.sec() - r.sec() - (l.usec()= r.usec()) - l.usec_ref() -= r.usec(); - else { - l.usec_ref() += 1000000L - r.usec(); - l.sec_ref()--; - } - return l; -} -inline utime_t& operator-=(utime_t& l, double f) { - l += -f; - return l; -} - -inline bool operator>(const utime_t& a, const utime_t& b) -{ - return (a.sec() > b.sec()) || (a.sec() == b.sec() && a.usec() > b.usec()); -} -inline bool operator<(const utime_t& a, const utime_t& b) -{ - return (a.sec() < b.sec()) || (a.sec() == b.sec() && a.usec() < b.usec()); -} - -// ostream -inline std::ostream& operator<<(std::ostream& out, const utime_t& t) -{ - out.setf(std::ios::right); - out.fill('0'); - if (t.sec() < ((time_t)(60*60*24*365*10))) { - // raw seconds. this looks like a relative time. - out << (long)t.sec(); - } else { - // localtime. this looks like an absolute time. - struct tm bdt; - time_t tt = t.sec(); - localtime_r(&tt, &bdt); - out << std::setw(2) << (bdt.tm_year-100) // 2007 -> '07' - << std::setw(2) << (bdt.tm_mon+1) - << std::setw(2) << bdt.tm_mday - << "." 
- << std::setw(2) << bdt.tm_hour - << std::setw(2) << bdt.tm_min - << std::setw(2) << bdt.tm_sec; - } - out << "."; - out << std::setw(6) << t.usec(); - out.unsetf(std::ios::right); - return out; -} - -#endif diff --git a/branches/sage/crush/jobs/alc.tp b/branches/sage/crush/jobs/alc.tp deleted file mode 100644 index c600850c54be0..0000000000000 --- a/branches/sage/crush/jobs/alc.tp +++ /dev/null @@ -1,38 +0,0 @@ -#PSUB -s /bin/bash # Sets your shell in batch -#PSUB -c alc # Where to run the job - -#PSUB -eo # Send std error & std out to the same file - -#PSUB -ln $NUM # Number of nodes to use -#PSUB -g $NUM # Total Number of tasks to use -#PSUB -cpn 1 # cpus per node - -####PSUB -c 1024Mb # memory limit -#PSUB -lc 1500 # Core file size per process -#PSUB -nr # Do not automatically resubmit job -#PSUB -tM 20m # Select time limit. The default time limit - # is only 30 minutes! Time can be HH:MM:SS or HH:MM - -#PSUB -o $CWD/$OUT # filename for output - -# Put your commands here. Remember to 'cd' to the appropriate -# directory, because the job will initially be in your home directory. -# To run a parallel job, you need to use the srun. 
- - - -echo job $PSUB_JOBID nodes $NUM name $NAME - -# environment -cd $CWD -export LD_LIBRARY_PATH=/usr/lib/mpi/mpi_gnu/lib - -# create fakestore dirs -srun -l -N $NUM -ppbatch bash -c "test -d tmp/osddata || mkdir tmp/osddata || echo cant make osddata ; uptime" - -# go -srun -l -N $NUM -ppbatch $CMD && touch $DONE - -# clean up fakestore -srun -l -N $NUM -ppbatch bash -c 'uptime ; rm -r tmp/osddata/*' - diff --git a/branches/sage/crush/jobs/alcdat/makedirs b/branches/sage/crush/jobs/alcdat/makedirs deleted file mode 100644 index af5a098a254c9..0000000000000 --- a/branches/sage/crush/jobs/alcdat/makedirs +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192], - 'nummds' => [1, 2, 8, 16, 32, 48, 64, 80, 96, 112, 128],#144, 160, 192, 208], - - 'cper' => [15,20], - '_dep' => [ 'cnode' => '$nummds',# / 4 + 1', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds > 1 ? $nummds:2', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'mds_bal_rep' => 10000, # none of that! - 'mds_decay_halflife' => 30, - - 'mds_bal_interval' => 45, - 'mds_bal_max' => [2], - - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 100, - 'end' => 300, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 4, - - # --meta_log_layout_scount 32 --meta_log_layout_ssize 256 - # --osd_pg_layout linear - 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/crush/jobs/alcdat/makedirs.big b/branches/sage/crush/jobs/alcdat/makedirs.big deleted file mode 100644 index c67b2b93dd742..0000000000000 --- a/branches/sage/crush/jobs/alcdat/makedirs.big +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192], - 'nummds' => [160, 200],#[1, 2, 8, 16, 32, 48, 64, 80, 96, 112, 128],#144, 160, 192, 208], - - 'cper' => [15,20], - '_dep' => [ 'cnode' => '40',#$nummds',# / 4 + 1', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds * .8', - 'n' => '415'],#1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'mds_bal_rep' => 10000, # none of that! - 'mds_decay_halflife' => 30, - - 'mds_bal_interval' => 45, - 'mds_bal_max' => 2, - - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 100, - 'end' => 300, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 4, - - # --meta_log_layout_scount 32 --meta_log_layout_ssize 256 - # --osd_pg_layout linear - 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/crush/jobs/alcdat/makedirs.tput b/branches/sage/crush/jobs/alcdat/makedirs.tput deleted file mode 100644 index 8dd5ae4c47d8c..0000000000000 --- a/branches/sage/crush/jobs/alcdat/makedirs.tput +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192], - 'nummds' => [4, 16, 64],#[1, 16, 64, 128],#144, 160, 192, 208], - - #'cper' => [2, 5, 7, 10, 13, 16, 20, 30, 40, 50, 100, 150], - 'cper' => [13, 30, 40], # just for final run... - '_dep' => [ 'cnode' => '$nummds',# / 4 + 1', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'mds_bal_rep' => 10000, # none of that! - 'mds_decay_halflife' => 30, - - 'mds_bal_interval' => 45, - 'mds_bal_max' => 2, - - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 100, - 'end' => 300, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 4, - - # --meta_log_layout_scount 32 --meta_log_layout_ssize 256 - # --osd_pg_layout linear - 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - 'comb' => { - 'x' => 'cper',#nummds', - 'vars' => [ 'mds.req', 'cl.lat' ] - } -}; diff --git a/branches/sage/crush/jobs/alcdat/makefiles.shared b/branches/sage/crush/jobs/alcdat/makefiles.shared deleted file mode 100644 index ab96702c73289..0000000000000 --- a/branches/sage/crush/jobs/alcdat/makefiles.shared +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => [1, 2, 4, 8, 16, 32, 64, 96, 128], #2, 4, 8, 16, 32, 48, 64, 80, 96], - - 'cper' => [25, 50, 100, 150],# 100, 150, 200], - - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - - 'mds_bal_hash_wr' => 1000, - - 'until' => 180, # --syn until $n ... when to stop clients - 'kill_after' => 250, - 'start' => 30, - 'end' => 180, - - 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60 --syn makefiles 100000 1000 0', - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req', 'cl.lat' ] - } -}; diff --git a/branches/sage/crush/jobs/alcdat/openshared b/branches/sage/crush/jobs/alcdat/openshared deleted file mode 100644 index 5ed7ba95894b3..0000000000000 --- a/branches/sage/crush/jobs/alcdat/openshared +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => [1, 4, 16, 64, 128, 192 ], - - 'cper' => [10, 50, 100, 150], - '_dep' => [ 'cnode' => '$nummds',# > 30 ? 
30:$nummds', - 'numclient' => '$nummds*$cper', - 'numosd' => '$nummds > 30 ? 30:$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - - 'mds_bal_interval' => 10000, - 'mds_bal_hash_wr' => 1000, - - 'until' => 120, # --syn until $n ... when to stop clients - 'kill_after' => 180, - 'start' => 10, - 'end' => 120, - - 'custom' => '--tcp_skip_rank0 --debug_mds_balancer 10 --mds_shutdown_check 60 --syn only 0 --syn createshared 10 --syn sleep 5 --syn openshared 10 10000', - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req', 'cl.lat' ] - } -}; diff --git a/branches/sage/crush/jobs/alcdat/ossh.include b/branches/sage/crush/jobs/alcdat/ossh.include deleted file mode 100644 index c9a368ba5c60f..0000000000000 --- a/branches/sage/crush/jobs/alcdat/ossh.include +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 10, - - #'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - #'nummds' => [1, 2, 4, 6, 7], # googoo - 'nummds' => [ 2, 4, 8, 16, 32, 48, 64, 80, 96, 128 ], - - #'trace' => ['make.lib', 'make.include'], - - 'mds_bal_interval' => 45, - 'mds_bal_max' => 2,#6, #[ 2,4,6 ], - 'mds_decay_halflife' => 30, - 'mds_bal_rep' => 1500, - 'mds_bal_hash_rd' => 100000, - - 'cper' => [15, 20],#25, 50, 100], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ], - #'cper' => 125, #[30, 50, 75, 100, 125, 150], #50, #[10,50,100],# [ 50, 75, 100 ], - - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - 'custom' => '--tcp_skip_rank0 --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.include 1 --syn sleep 30 --syn trace traces/openssh/make.include 1000', - - # parameters - 'fs' => 'ebofs', - - #'until' => 500, - #'kill_after' => 600, - #'start' => 200, - #'end' => 500, - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 200, - 'end' => 300, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/crush/jobs/alcdat/ossh.include.big b/branches/sage/crush/jobs/alcdat/ossh.include.big deleted file mode 100644 index b92895a53a763..0000000000000 --- a/branches/sage/crush/jobs/alcdat/ossh.include.big +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 10, - - #'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - #'nummds' => [1, 2, 4, 6, 7], # googoo - #'nummds' => [ 2, 4, 8, 16, 32, 48, 64, 80, 96, 128 ], - 'nummds' => [160,200], - - #'trace' => ['make.lib', 'make.include'], - - 'mds_bal_interval' => 45, - 'mds_bal_max' => 2,#6, #[ 2,4,6 ], - 'mds_decay_halflife' => 30, - 'mds_bal_rep' => 1500, - 'mds_bal_hash_rd' => 100000, - - 'cper' => [25, 50], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ], - #'cper' => 125, #[30, 50, 75, 100, 125, 150], #50, #[10,50,100],# [ 50, 75, 100 ], - - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds * .6', - 'n' => '415'],#1 + $cnode + $nummds + $numosd' ], - - 'custom' => '--tcp_skip_rank0 --tcp_overlay_clients --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.include 1 --syn sleep 30 --syn trace traces/openssh/make.include 1000', - - # parameters - 'fs' => 'ebofs', - - #'until' => 500, - #'kill_after' => 600, - #'start' => 200, - #'end' => 500, - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 200, - 'end' => 300, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/crush/jobs/alcdat/ossh.lib b/branches/sage/crush/jobs/alcdat/ossh.lib deleted file mode 100644 index 73372866f051f..0000000000000 --- a/branches/sage/crush/jobs/alcdat/ossh.lib +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 10, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - 'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - - #'nummds' => [1, 2, 4, 6, 7], # googoo - #'trace' => ['make.lib', 'make.include'], - - 'mds_bal_interval' => 90, #[30, 60, 90], #$[60,90], #[60,90],#[30, 60, 90], - #'mds_bal_max' => [4, 10],#6,#[2,4,6,8], - - 'mds_decay_halflife' => 30, - 'mds_bal_rep' => 1500, - 'cper' => [10, 16], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ], - - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - - 'custom' => '--tcp_skip_rank0 --debug_mds_balancer 1 --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.lib 1 --syn sleep 10 --syn trace traces/openssh/make.lib 1000', - - # parameters - #'fs' => ['fakestore'], - 'fs' => 'ebofs', - - #'until' => 500, - #'kill_after' => 600, - #'start' => 200, - #'end' => 500, - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 150, - 'end' => 300, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/crush/jobs/alcdat/ossh.lib.big b/branches/sage/crush/jobs/alcdat/ossh.lib.big deleted file mode 100644 index b9e0dd1ff68cd..0000000000000 --- a/branches/sage/crush/jobs/alcdat/ossh.lib.big +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 10, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - #'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - 'nummds' => [160,200], - - #'nummds' => [1, 2, 4, 6, 7], # googoo - #'trace' => ['make.lib', 'make.include'], - - 'mds_bal_interval' => 90, #[30, 60, 90], #$[60,90], #[60,90],#[30, 60, 90], - #'mds_bal_max' => [4, 10],#6,#[2,4,6,8], - - 'mds_decay_halflife' => 30, - 'mds_bal_rep' => 1500, - 'cper' => [25, 50, 100], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ], - - '_dep' => [ 'cnode' => 0,#'30', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds * .6', - 'n' => '415'],#'1 + $cnode + $nummds + $numosd' ], - - - 'custom' => '--tcp_skip_rank0 --debug_mds_balancer 1 --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.lib 1 --syn sleep 10 --syn trace traces/openssh/make.lib 1000', - - # parameters - #'fs' => ['fakestore'], - 'fs' => 'ebofs', - - #'until' => 500, - #'kill_after' => 600, - #'start' => 200, - #'end' => 500, - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 150, - 'end' => 300, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/crush/jobs/alcdat/striping b/branches/sage/crush/jobs/alcdat/striping deleted file mode 100644 index de71828d12bde..0000000000000 --- a/branches/sage/crush/jobs/alcdat/striping +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => 10, - - 'cnode' => 10, - 'cper' => [ 10, 25, 50, 100 ], - - '_dep' => [ 'numclient' => '$cper * $cnode', - 'n' => '1 + $cnode + $nummds + $numosd', - 'file_layout_osize' => '$writefile_size' ], - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'until' => 160, # --syn until $n ... when to stop clients - 'kill_after' => 200, - 'start' => 100, - 'end' => 160, - - 'writefile' => 1, - 'writefile_size' => [ -# 4*1024*1024, - 1024*1024 ], -# 256*1024, -# 64*1024 - 'writefile_mb' => 100000, - - 'osd_pg_bits' => 10,#16, - #'osd_pg_bits' => [ 16, 20 ], - - #'osd_object_layout' => [ 'hash', 'hashino', 'linear' ], - 'osd_pg_layout' => [ 'crush', -# 'hash', - 'linear' ], - - 'custom' => '--tcp_skip_rank0 --file_layout_num_rep 1 --mds_shutdown_check 60', - - 'comb' => { - 'x' => 'cper',#writefile_size', - 'vars' => [ 'osd.c_wrb', 'osd.c_wr' ], - } -}; diff --git a/branches/sage/crush/jobs/example b/branches/sage/crush/jobs/example deleted file mode 100644 index 802a8b66e6332..0000000000000 --- a/branches/sage/crush/jobs/example +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/perl -# hi there -{ - # startup - 'n' => 30, # number of mpi nodes - 'sleep' => 3, # seconds to sleep between runs (so you have time to control-c out) - 'nummds' => 1, - 'numosd' => 6, - 'numclient' => 100, - - 'until' => 100, # --syn until $n ... synthetic client will stop itself after this many seconds. 
- 'kill_after' => 300, # seconds before everything commits suicide (in case something hangs) - - # stuff i want to vary - # here's a simple example: - - # do --syn writefile command - 'writefile' => 1, - # and very the write size - 'writefile_size' => [ # vary -# 2048*1024, - 1024*1024, - 512*1024, - 256*1024, - 128*1024, - 64*1024, - 48*1024, - 32*1024, - 28*1024, - 24*1024, - 16*1024, - 12*1024, - 8*1024, - 4096, -# 256, -# 16, -# 1 - ], - 'writefile_mb' => 1000, # each client shoudl write 1GB (or more likely, keep going until time runs out) - - 'file_layout_num_rep'=> [1,2], # also vary the replication level - - # pass some other random things to newsyn - 'custom' => '--', - - # for final summation (script/sum.pl) - # specify time period to look at the results - 'start' => 30, # skip first 30 seconds, so that caches are full etc. - 'end' => 90, # go for 60 seconds - - # what should i parse/plot? - 'comb' => { - 'x' => 'writefile_size', - 'vars' => [ 'osd.c_wrb', 'osd.r_wrb' ], - } -}; diff --git a/branches/sage/crush/jobs/mds/log_striping b/branches/sage/crush/jobs/mds/log_striping deleted file mode 100644 index 46242cdda4f00..0000000000000 --- a/branches/sage/crush/jobs/mds/log_striping +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - 'kill_after' => 300, - - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => 100, - 'n' => 16, - - # parameters - 'fs' => ['ebofs','fakestore'], - 'meta_log_ssize' => [ 128, 256, 1024, 1 << 15, 1 << 20 ], - 'meta_log_scount' => 4,#[ 1, 2, 4, 8 ], - - 'until' => 200, # --syn until $n ... 
when to stop clients - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 4, - - 'custom' => '--tcp_skip_rank0', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - 'start' => 100, - 'end' => 550, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/crush/jobs/mds/makedir_lat b/branches/sage/crush/jobs/mds/makedir_lat deleted file mode 100644 index 63374f52a36c0..0000000000000 --- a/branches/sage/crush/jobs/mds/makedir_lat +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => [1],#, 40, 80, 160 ], - 'n' => 20, - - 'fs' => 'ebofs', - - 'start' => 20, - 'end' => 40, - 'until' => 40, - 'kill_after' => 60, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 5, - - 'mds_local_osd' => [ 0, 1 ], - 'meta_log_layout_num_rep' => [ 0, 1, 2, 3, 4], - - 'custom' => '--tcp_skip_rank0', - - 'comb' => { - 'x' => 'meta_log_layout_num_rep', - 'vars' => [ 'mds.log.lat', 'cl.lat', 'osd.rlat' ] - } -}; diff --git a/branches/sage/crush/jobs/mds/makedirs b/branches/sage/crush/jobs/mds/makedirs deleted file mode 100644 index 4ca42d72fa37e..0000000000000 --- a/branches/sage/crush/jobs/mds/makedirs +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - '_psub' => 'jobs/alc.tp', - - 'sleep' => 3, - - 'nummds' => [1, 2, 4, 6, 8, 12, 16, 24, 32, 40, 48, 64], - - 'cper' => 50, - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$cnode * $cper', - 'numosd' => '$nummds * 2', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - #'fs' => 'ebofs', - 'fs' => 'fakestore', - - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 3, - - 'custom' => '--tcp_skip_rank0', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - 'start' => 100, - 'end' => 550, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/crush/jobs/mds/opensshlib b/branches/sage/crush/jobs/mds/opensshlib deleted file mode 100644 index d8b61ae52c655..0000000000000 --- a/branches/sage/crush/jobs/mds/opensshlib +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => [1, 2, 4, 7], # googoo - #'nummds' => [1, 2, 4, 6, 8, 12, 16, 24, 32, 40, 48, 64], # alc - - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'until' => 300, # --syn until $n ... when to stop clients - 'kill_after' => 400, - 'start' => 150, - 'end' => 300, - - 'mds_bal_interval' => 90,#[60, 90], - #'mds_bal_max' => [3,4,5], - 'mds_bal_max' => 4, - 'mds_decay_halflife' => 30,#[15, 25, 30, 45, 60], - 'mds_bal_rep' => 1500,#[1000, 1500, 2000], - - 'decay_hl' => 100,#[ 25, 50, 100, 150 ], - - 'cper' => 100, #[50, 75, 100, 125, 150, 200], - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds * 2', - 'n' => '1 + $cnode + $nummds + $numosd', - 'mds_bal_rep' => '$mds_decay_halflife * $decay_hl'], - - 'custom' => '--tcp_skip_rank0 --syn only 0 --syn trace traces/openssh/untar.lib 1 --syn sleep 10 --syn randomsleep 30 --syn trace traces/openssh/make.lib 100 --debug_mds_balancer 1 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - - 'comb' => { - 'x' => 'nummds',#decay_hl',#'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/crush/jobs/meta1 b/branches/sage/crush/jobs/meta1 deleted file mode 100644 index 
743212f1c3009..0000000000000 --- a/branches/sage/crush/jobs/meta1 +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/sh - -# makedirs for 300 seconds -# first bit in memory -# second bit is commiting from journal too -# then walk fs for 300 seconds -# this should all be in memory. - -JOB="meta1" -ARGS="--numosd 10 --fullmkfs --syn until 180 --syn makedirs 10 10 4 --syn until 360 --syn repeatwalk --mds_bal_max 1 --osd_fsync 0 --mds_log_max_len 200000 --mds_cache_size 500000" - -#rm core* ; make tcpsyn && mpiexec -l -n 17 ./tcpsyn $ARGS --nummds 1 --log_name $JOB/1 --numclient 25 > log/$JOB/o.1 -#rm core* ; make tcpsyn && mpiexec -l -n 18 ./tcpsyn $ARGS --nummds 2 --log_name $JOB/2 --numclient 50 > log/$JOB/o.2 -#rm core* ; make tcpsyn && mpiexec -l -n 20 ./tcpsyn $ARGS --nummds 4 --log_name $JOB/4 --numclient 100 > log/$JOB/o.4 -#rm core* ; make tcpsyn && mpiexec -l -n 24 ./tcpsyn $ARGS --nummds 8 --log_name $JOB/8 --numclient 200 > log/$JOB/o.8 -#rm core* ; make tcpsyn && mpiexec -l -n 28 ./tcpsyn $ARGS --nummds 12 --log_name $JOB/12 --numclient 300 > log/$JOB/o.12 -rm core* ; make tcpsyn && mpiexec -l -n 32 ./tcpsyn $ARGS --nummds 16 --log_name $JOB/16 --numclient 300 > log/$JOB/o.16 - - diff --git a/branches/sage/crush/jobs/meta1.proc.sh b/branches/sage/crush/jobs/meta1.proc.sh deleted file mode 100755 index 616acbefff619..0000000000000 --- a/branches/sage/crush/jobs/meta1.proc.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/sh - -for d in 1 2 4 8 12 -do - echo $d - cd $d - ../../../script/sum.pl mds? mds?? > mds.sum - ../../../script/sum.pl -avg mds? mds?? > mds.avg - - ../../../script/sum.pl -start 90 -end 180 mds? mds?? > mds.sum.makedirs - ../../../script/sum.pl -start 200 -end 300 mds? mds?? > mds.sum.walk - - cd .. 
-done diff --git a/branches/sage/crush/jobs/osd/ebofs b/branches/sage/crush/jobs/osd/ebofs deleted file mode 100644 index 5d11523f6f832..0000000000000 --- a/branches/sage/crush/jobs/osd/ebofs +++ /dev/null @@ -1,51 +0,0 @@ -# hi there -{ - # startup - 'n' => 30, # mpi nodes - 'sleep' => 3, # seconds between runs - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => 100,#[10, 50, 100, 200, 400], - -'kill_after' => 200, - - # parameters - 'fs' => 'ebofs',#[ -# 'obfs', -# 'fakestore', -# 'ebofs' -# ], - 'until' => 100, # --syn until $n ... when to stop clients - 'writefile' => 1, - 'writefile_size' => [ -# 2560000, - 1024000, - 262144, -# 131072, -# 98304, - 65536, -# 16384, -# 4096, - 256, -# 16, -# 1 - ], - 'writefile_mb' => 1000, - - 'ebofs_idle_commit_ms' => [ 100, 500 ], - 'ebofs_abp_max_alloc' => [ 4096*16, 4096*64 ], - -# 'custom' => '--tcp_skip_rank0',# --osd_maxthreads 0', - 'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - 'start' => 30, - 'end' => 90, - -'comb' => { - 'x' => 'writefile_size', - 'vars' => [ 'osd.c_wrb' ], -# 'maptitle' => { 'osd_object_layout=' => '', -# ',osd_pg_layout=' => ' + '} - } -}; diff --git a/branches/sage/crush/jobs/osd/mds_log b/branches/sage/crush/jobs/osd/mds_log deleted file mode 100644 index 0f99f6998dcfc..0000000000000 --- a/branches/sage/crush/jobs/osd/mds_log +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - #'_psub' => 'jobs/alc.tp', - 'sleep' => 3, - - 'nummds' => 1, - 'numclient' => [5, 10, 15, 25, 50, 100, 200, 300, 400], - #'numclient' => [ 50, 100, 200 ], - 'numosd' => [2,4],#[ 4, 8, 12, 16, 20, 24 ], - 'n' => 12, - - # parameters - 'fs' => 'fakestore',#['ebofs', 'fakestore','obfs'], - #'fs' => 'ebofs', - #'ebofs_commit_ms' => [ 1000, 5000 ], - #'osd_maxthreads' => [ 0, 1, 2, 4, 8 ], - - 'until' => 100, # --syn until $n ... 
when to stop clients - 'kill_after' => 300, - 'start' => 20, - 'end' => 90, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 3, - - - #'meta_log_layout_ssize' => [256, 512, 1024, 4096, 16384, 65536, 262400], - #'meta_log_layout_scount' => [2, 4, 8], - #'meta_log_layout_num_rep' => [1, 2], - #'meta_log_layout_num_rep' => 1, - - 'custom' => '--tcp_skip_rank0 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - 'comb' => { - 'x' => 'numclient',#'meta_log_layout_ssize', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/crush/jobs/osd/osd_threads b/branches/sage/crush/jobs/osd/osd_threads deleted file mode 100644 index ef271f9e88710..0000000000000 --- a/branches/sage/crush/jobs/osd/osd_threads +++ /dev/null @@ -1,33 +0,0 @@ -# hi there -{ - # startup - 'n' => 30, # mpi nodes - 'sleep' => 10, # seconds between runs - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => 50, - - # parameters - 'fs' => [ -# 'obfs', - 'fakestore', - 'ebofs' - ], - 'until' => 100, # --syn until $n ... 
when to stop clients - 'writefile' => 1, - 'writefile_size' => [ - 1024000, - 131072, - 65536, - 16 - ], - 'writefile_mb' => 1000, - - 'osd_maxthreads' => [0, 1, 2, 4, 8], - - 'custom' => '--tcp_skip_rank0', - - # for final summation (script/sum.pl) - 'start' => 30, - 'end' => 90 -}; diff --git a/branches/sage/crush/jobs/osd/striping b/branches/sage/crush/jobs/osd/striping deleted file mode 100644 index ea8cabe643274..0000000000000 --- a/branches/sage/crush/jobs/osd/striping +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/perl -# hi there -{ - # startup - #'n' => 28, # mpi nodes - - 'sleep' => 3, # seconds between runs - 'nummds' => 1, - - 'numosd' => [2,3,4,5,6,7,8,10,12], #[6, 8, 10, 12, 16], - 'numosd' => [14], - #'cper' => [4, 5, 6, 7, 8, 9, 10, 11, 12], #[1, 4, 6, 8, 16, 32, 64], - #'cper' => [4, 6, 8, 10, 12, 16, 24, 32 ], #[1, 4, 6, 8, 16, 32, 64], - 'cper' => [30], - - '_dep' => [ 'cnode' => '$numosd', - 'numclient' => '$cnode * $cper', - 'n' => 38],#'$nummds + $numosd + $cnode'], - #'numclient' => [5, 10, 20, 50, 75, 100, 150 ], - - 'start' => 30, - 'end' => 90, - 'until' => 100, # --syn until $n ... 
when to stop clients - 'kill_after' => 260, - - # parameters - 'fs' => 'ebofs', - 'writefile' => 1, - - 'writefile_size' => [# 4096, - # 16*1024, - # 64*1024, - # 256*1024, - 1024*1024 ], -# 'writefile_size' => [ -# 2048*1024, -# 1048576, -# 512*1024, -# 262144, -# 65536, -# 16384 -# ], - 'writefile_mb' => 1000, - - 'file_layout_num_rep'=> [1,2,3], - - 'osd_pg_bits' => 12,#[6, 8, 10, 12, 14], - - 'osd_object_layout' => [ 'hashino' ],#'hash', 'hashino', 'linear' ], - 'osd_pg_layout' => [ 'crush', 'linear' ],#, 'linear'],#, 'hash' ],#, 'linear' ],#, 'hash' ], - - #'custom' => '--tcp_skip_rank0', # --osd_maxthreads 0', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - - 'comb' => { - 'x' => 'numosd',#'writefile_size', - 'vars' => [ 'osd.c_wrb', 'cl.wrlat' ], -# 'maptitle' => { 'osd_object_layout=' => '', -# ',osd_pg_layout=' => ' + '} - } -}; - - -=item some googoo notes - -for 1mb 1x writes, - - with numosd=6, min cper=6 to saturate (cper_saturate) - googoo saturates at numosd=8. (osd_saturate) - - -> so, numosd=6 or 7 is a safe size! 
- - - - -=cut diff --git a/branches/sage/crush/jobs/osd/wr_lat2 b/branches/sage/crush/jobs/osd/wr_lat2 deleted file mode 100644 index 47053dd61f3ab..0000000000000 --- a/branches/sage/crush/jobs/osd/wr_lat2 +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => [12], - 'numclient' => [1],#, 40, 80, 160 ], - 'n' => 16, - - 'fs' => 'ebofs', - - 'start' => 10, - 'end' => 40, - 'until' => 40, - 'kill_after' => 90, - - 'writefile' => 1, - 'writefile_size' => [4096, - 8*1024, - 16*1024, - 32*1024, - 64*1024, - 128*1024, - 256*1024, - 512*1024, - 1024*1024], - 'writefile_mb' => 10000, - - #'tcp_multi_out' => [0,1], - -# 'mds_local_osd' => [ 0, 1 ], - 'file_layout_num_rep' => [1,2,3],#, 2, 3, 4], - - 'client_oc' => [0,1], - - 'custom' => '--tcp_skip_rank0', - - 'comb' => { - 'x' => 'writefile_size',#'file_layout_num_rep', - 'vars' => [ 'osd.c_wrb','cl.wrlat' ] - } -}; diff --git a/branches/sage/crush/jobs/osd/write_sizes b/branches/sage/crush/jobs/osd/write_sizes deleted file mode 100644 index 57369f3a97c50..0000000000000 --- a/branches/sage/crush/jobs/osd/write_sizes +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/perl -# hi there -{ - # startup - 'n' => 30, # mpi nodes - 'sleep' => 3, # seconds between runs - 'nummds' => 1, - 'numosd' => 6, - 'numclient' => 100,#[25,50,100,300],#100,#[10, 50, 100, 200, 400], - - 'until' => 100, # --syn until $n ... 
when to stop clients - 'kill_after' => 300, - - # parameters - 'fs' => [ -# 'obfs', - 'fakestore', -# 'ebofs' - ], - 'writefile' => 1, - 'writefile_size' => [ -# 2048*1024, - 1024*1024, - 512*1024, - 256*1024, - 128*1024, - 64*1024, - 48*1024, - 32*1024, - 28*1024, - 24*1024, - 16*1024, - 12*1024, - 8*1024, - 4096, -# 256, -# 16, -# 1 - ], - 'writefile_mb' => 1000, - - 'file_layout_num_rep'=> 1,#[1,2], - - -# 'ebofs_idle_commit_ms' => [ 100, 500 ], -# 'ebofs_abp_max_alloc' => [ 4096*16, 4096*64 ], - - 'custom' => '--debug_after 110 --debug_mds 15 --debug 5 --mds_shutdown_check 60', - - # for final summation (script/sum.pl) - 'start' => 30, - 'end' => 90, - - 'comb' => { - 'x' => 'writefile_size', - 'vars' => [ 'osd.c_wrb' ], -# 'maptitle' => { 'osd_object_layout=' => '', -# ',osd_pg_layout=' => ' + '} - } -}; diff --git a/branches/sage/crush/jobs/rados/map_dist b/branches/sage/crush/jobs/rados/map_dist deleted file mode 100644 index 39f16daa1cdc2..0000000000000 --- a/branches/sage/crush/jobs/rados/map_dist +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'osdbits' => [6,7,8],#,9],10,11], - 'pgperbits' => [3],#,4,5],#[4,6,8], - - 'nummds' => 1, - - '_dep' => [ 'numosd' => '1 << $osdbits', - 'osd_pg_bits' => '$pgperbits + $osdbits', - 'n' => '3 + $numosd / 32'], - 'numclient' => 0, - - 'fake_osdmap_updates' => [30], - - 'fs' => 'ebofs', - - 'start' => 30, - 'end' => 300, - 'kill_after' => 300, - - 'custom' => '--bdev_lock 0 --ms_stripe_osds --osd_maxthreads 0', - #'custom' => '--tcp_skip_rank0', - - 'comb' => { - 'x' => 'osdbits', - 'vars' => [ 'osd.sum=mapi', 'osd.sum=mapidup', 'osd.numpg', 'osd.pingset' ] - } -}; diff --git a/branches/sage/crush/jobs/rados/rep_lat b/branches/sage/crush/jobs/rados/rep_lat deleted file mode 100644 index 3f5ab0c8a7d87..0000000000000 --- a/branches/sage/crush/jobs/rados/rep_lat +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => 8, #[6], - 
'numclient' => 1,#, 40, 80, 160 ], - 'n' => 10, - - 'fs' => 'ebofs', - - 'start' => 10, - 'end' => 40, - 'until' => 40, - 'kill_after' => 45, - - 'writefile' => 1, - 'writefile_size' => [4096, -# 8*1024, -# 16*1024, -# 32*1024, - 64*1024, -# 128*1024, -# 256*1024, -# 512*1024, -# 1024*1024 -], - 'writefile_mb' => 10000, - - 'osd_rep' => [0,1,2], - - 'file_layout_num_rep' => [1,2,3,4,5,6],#, 2, 3, 4], - - 'osd_pg_bits' => 4, - 'custom' => '--osd_max_rep 8', - - 'comb' => { - 'x' => 'file_layout_num_rep', - 'vars' => [ 'cl.wrlat' ] - } -}; diff --git a/branches/sage/crush/jobs/rados/wr_sizes b/branches/sage/crush/jobs/rados/wr_sizes deleted file mode 100644 index 9b73477ea6142..0000000000000 --- a/branches/sage/crush/jobs/rados/wr_sizes +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => [8],#10,14,16], - 'numclient' => [10*16], - 'n' => 15, - - 'fs' => 'ebofs', - - 'start' => 60, - 'end' => 90, - 'until' => 90, - 'kill_after' => 190, - - 'writefile' => 1, - 'writefile_size' => [4096, - 8*1024, - 16*1024, - 32*1024, - 64*1024, - 128*1024, - 256*1024, - # 512*1024, -# 4*1024*1024, -# 2*1024*1024, -# 1024*1024 -], - 'writefile_mb' => 10000, - - 'file_layout_num_rep' => 1, - 'file_layout_ssize' => 4*1024*1024, - 'file_layout_osize' => 4*1024*1024, - - 'osd_pg_bits' => 12, - -# 'ebofs_freelist' => [0, 1080, 65400], - - 'custom' => '--objecter_buffer_uncommitted 0', - - #'custom' => '--tcp_skip_rank0', - - 'comb' => { - 'x' => 'writefile_size', - 'vars' => [ 'osd.c_wrb', 'cl.wrlat' ] - } -}; diff --git a/branches/sage/crush/kernel/Makefile b/branches/sage/crush/kernel/Makefile deleted file mode 100644 index 2ad658b5566c3..0000000000000 --- a/branches/sage/crush/kernel/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -# -# Makefile for CEPH filesystem. 
-# - -obj-$(CONFIG_CEPH_FS) += ceph.o - -ceph-objs := inode.o diff --git a/branches/sage/crush/kernel/bufferlist.h b/branches/sage/crush/kernel/bufferlist.h deleted file mode 100644 index 78e4c6f95216b..0000000000000 --- a/branches/sage/crush/kernel/bufferlist.h +++ /dev/null @@ -1,74 +0,0 @@ -#ifndef _FS_CEPH_BUFFERLIST_H -#define _FS_CEPH_BUFFERLIST_H - - - -#define CEPH_BUFFERLIST_START_IOVLEN 8 /* embed some statically, for fast normal case */ - -struct ceph_bufferlist { - struct iovec *b_iov; /* data payload */ - struct iovec b_iov_array[CEPH_BUFFERLIST_START_IOVLEN]; - int b_iovlen; /* used/defined elements in b_iov */ - int b_iovmax; /* allocated size of b_iov array */ - struct iovec b_append; /* preallocated memory for appending data to this bufferlist */ -}; - -struct ceph_bufferlist_iterator { - int i_iov; /* which iov */ - int i_off; /* offset in that iov */ -}; - -/* - * add referenced memory to the bufferlist. - * expand b_iov array if necessary. - * extend tail iovec if the added region is contiguous. - */ -void ceph_bufferlist_append_ref(struct ceph_bufferlist *bl, void *p, int len) -{ - struct iovec *tmpvec; - if (bl->b_iovlen == bl->b_iovmax) { - if (bl->b_iovmax) { - bl->b_iovmax *= 2; - tmpvec = kmalloc(bl->b_iovmax); - memcpy(tmpvec, bl->b_iov, sizeof(iovec)*bl->b_iovlen); - if (bl->b_iovlen > CEPH_BUFFERLIST_START_IOVLEN) - kfree(bl->b_iov); - bl->b_iov = tmpvec; - memset(tmpvec + bl->b_iovlen, 0, - sizeof(iovec)*(bl->b_iovmax - bl->b_iovlen)); - } else { - bl->b_iovmax = CEPH_BUFFERLIST_START_IOVLEN; - bl->b_iov = bl->b_iov_array; - } - } - - if (bl->b_iovlen && - p == bl->b_iov[bl->b_iovlen-1].iov_base + bl->b_iov[bl->b_iovlen-1].iov_base) { - bl->b_iov[bl->b_iovlen-1].iov_len += len; - } else { - bl->b_iov[bl->b_iovlen].iov_base = p; - bl->b_iov[bl->b_iovlen].iov_len = len; - bl->b_iovlen++; - } -} - -void ceph_bufferlist_append_copy(struct ceph_bufferlist *bl, void *p, int len) -{ - int s; - while (len > 0) { - /* allocate more space? 
*/ - if (!bl->b_append.iov_len) { - bl->b_append.iov_len = (len + PAGE_SIZE - 1) & ~(PAGE_SIZE-1); - bl->b_append.iov_base = kmalloc(bl->b_append.iov_len, GFP_KERNEL); - } - - /* copy what we can */ - s = min(bl->b_append.iov_len, len); - memcpy(bl->b_append.iov_base, s); - ceph_bufferlist_append_ref(bl, b_append.iov_base, b_append.iov_len); - len -= s; - bl->b_append.iov_len -= s; - } -} - -#endif diff --git a/branches/sage/crush/kernel/inode.c b/branches/sage/crush/kernel/inode.c deleted file mode 100644 index f21fa58386935..0000000000000 --- a/branches/sage/crush/kernel/inode.c +++ /dev/null @@ -1,136 +0,0 @@ -#include -#include -#include -#include -#include "ceph_fs.h" - -MODULE_AUTHOR("Patience Warnick "); -MODULE_DESCRIPTION("Ceph filesystem for Linux"); -MODULE_LICENSE("GPL"); - - -static void ceph_read_inode(struct inode * inode) -{ - return; -} - -static int ceph_write_inode(struct inode * inode, int unused) -{ - lock_kernel(); - unlock_kernel(); - return 0; -} - -static void ceph_delete_inode(struct inode * inode) -{ - return; -} - -static void ceph_put_super(struct super_block *s) -{ - return; -} - -static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - return 0; -} - -static void ceph_write_super(struct super_block *s) -{ - lock_kernel(); - unlock_kernel(); - return; -} - -static struct kmem_cache *ceph_inode_cachep; - -static struct inode *ceph_alloc_inode(struct super_block *sb) -{ - struct ceph_inode_info *ci; - ci = kmem_cache_alloc(ceph_inode_cachep, GFP_KERNEL); - if (!ci) - return NULL; - return &ci->vfs_inode; -} - -static void ceph_destroy_inode(struct inode *inode) -{ - kmem_cache_free(ceph_inode_cachep, CEPH_I(inode)); -} - -static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags) -{ - struct ceph_inode_info *ci = (struct ceph_inode_info *) foo; - - if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == - SLAB_CTOR_CONSTRUCTOR) - inode_init_once(&ci->vfs_inode); -} - -static int 
init_inodecache(void) -{ - ceph_inode_cachep = kmem_cache_create("ceph_inode_cache", - sizeof(struct ceph_inode_info), - 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), - init_once, NULL); - if (ceph_inode_cachep == NULL) - return -ENOMEM; - return 0; -} - -static void destroy_inodecache(void) -{ - kmem_cache_destroy(ceph_inode_cachep); -} - -static const struct super_operations ceph_sops = { - .alloc_inode = ceph_alloc_inode, - .destroy_inode = ceph_destroy_inode, - .read_inode = ceph_read_inode, - .write_inode = ceph_write_inode, - .delete_inode = ceph_delete_inode, - .put_super = ceph_put_super, - .write_super = ceph_write_super, - .statfs = ceph_statfs, -}; - -static int ceph_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) -{ - printk(KERN_INFO "entered ceph_get_sb\n"); - return 0; -} - -static struct file_system_type ceph_fs_type = { - .owner = THIS_MODULE, - .name = "ceph", - .get_sb = ceph_get_sb, - .kill_sb = kill_block_super, -/* .fs_flags = */ -}; - -static int __init init_ceph(void) -{ - int ret = 0; - - printk(KERN_INFO "ceph init\n"); - if (!(ret = init_inodecache())) { - if ((ret = register_filesystem(&ceph_fs_type))) { - destroy_inodecache(); - } - } - return ret; -} - -static void __exit exit_ceph(void) -{ - printk(KERN_INFO "ceph exit\n"); - - unregister_filesystem(&ceph_fs_type); -} - - -module_init(init_ceph); -module_exit(exit_ceph); diff --git a/branches/sage/crush/kernel/kmsg.h b/branches/sage/crush/kernel/kmsg.h deleted file mode 100644 index cc44b9fd291e5..0000000000000 --- a/branches/sage/crush/kernel/kmsg.h +++ /dev/null @@ -1,51 +0,0 @@ -#ifndef __FS_CEPH_KMSG_H -#define __FS_CEPH_KMSG_H - -#include -#include -#include -#include "ceph_kthread.h" - - -struct ceph_kthreadpool *msg_threadpool; /* thread pool */ - -struct ceph_kmsgr { - void *m_parent; - struct radix_tree_root mpipes; /* other nodes talk to */ - struct client_thread_info cthread; /* listener thread info */ -}; - 
-struct ceph_message { - struct ceph_message_header *msghdr; /* header */ - struct kvec *m_iov; /* data storage */ - size_t m_iovlen; /* is this kvec.iov_len why need it in kvec? */ - struct list_head m_list_head; -}; - -struct ceph_kmsg_pipe { - int p_sd; /* socket descriptor */ - __u64 p_out_seq; /* last message sent */ - __u64 p_in_seq; /* last message received */ - - /* out queue */ - struct list_head p_out_queue; - struct ceph_message *p_out_partial; /* partially sent message */ - int p_out_partial_pos; - struct list_head p_out_sent; /* sent but unacked; may need resend if connection drops */ - - /* partially read message contents */ - struct kvec *p_in_partial_iov; /* hrm, this probably isn't what we want */ - size_t p_in_partial_iovlen; - size_t p_in_parital_iovmax; /* size of currently allocated m_iov array */ - /* .. or something like that? .. */ - -}; - -/* - * function prototypes - */ -extern void ceph_read_message(struct ceph_message *message); -extern void ceph_write_message(struct ceph_message *message); -extern void ceph_client_dispatch(void *fs_client, struct ceph_message *message ); -extern void queue_message(struct ceph_message *message); -#endif diff --git a/branches/sage/crush/kernel/kmsgbits.h b/branches/sage/crush/kernel/kmsgbits.h deleted file mode 100644 index 730ff7f74f53b..0000000000000 --- a/branches/sage/crush/kernel/kmsgbits.h +++ /dev/null @@ -1,50 +0,0 @@ - - - -struct ceph_message { - struct ceph_message_header m_hdr; /* header */ - struct iovec *m_iov; /* payload */ - int m_iovlen; - struct list_head m_list_head; /* i'll sit in a queue */ -}; - - - -/* dispatch method type */ -typedef void (*ceph_kmsg_dispatch_t)(void *h, struct ceph_message *message); - -struct ceph_kmsg { - ceph_kmsg_dispatch_t m_dispatch; /* where incoming messages go */ - void *m_parent; /* passed to dispatch method */ - - struct ceph_kmsg_threadpool *m_threadpool; /* pool of threads */ - /* possibly shared among multiple kmsg instances? 
*/ - - /* other nodes i talk to */ - struct radix_tree_root m_pipes; /* key: dest addr, value: ceph_kmsg_pipe */ - - /* ... */ -}; - - -struct ceph_kmsg_pipe { - int p_sd; /* socket descriptor */ - __u64 p_out_seq; /* last message sent */ - __u64 p_in_seq; /* last message received */ - - /* out queue */ - struct list_head p_out_queue; - struct ceph_message *p_out_partial; /* partially sent message */ - int p_out_partial_pos; - struct list_head p_out_sent; /* sent but unacked; may need resend if connection drops */ - - /* partially read message contents */ - struct iovec *p_in_partial_iov; /* hrm, this probably isn't what we want */ - int p_in_partial_iovlen; - int p_in_parital_iovmax; /* size of currently allocated m_iov array */ - /* .. or something like that? .. */ - -}; - - - diff --git a/branches/sage/crush/kernel/mds_client.h b/branches/sage/crush/kernel/mds_client.h deleted file mode 100644 index 764d7ccd6bdf6..0000000000000 --- a/branches/sage/crush/kernel/mds_client.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef _FS_CEPH_MDS_CLIENT_H -#define _FS_CEPH_MDS_CLIENT_H - -#include -#include "kmsg.h" - -/* - * state associated with an individual MDS<->client session - */ -struct ceph_mds_session { - __u64 s_push_seq; - /* wait queue? */ -}; - -struct ceph_mds_request { - __u64 r_tid; - struct ceph_message *r_msg; - __u8 r_idempotent; - - __u32 r_mds[4]; /* set of mds's with whom request may be outstanding */ - __u32 r_num_mds; /* items in r_mds */ - - __u32 r_num_fwd; /* number of forward attempts */ - __s32 r_resend_mds; /* mds to resend to next, if any*/ - - /* waiter/callback? 
*/ -}; - - -struct ceph_mds_client { - struct ceph_mdsmap *s_mdsmap; /* mds map */ - - /* mds sessions */ - struct ceph_mds_session **s_mds_sessions; /* sparse array; elements NULL if no session */ - int s_max_mds_sessions; /* size of s_mds_sessions array */ - - __u64 s_last_mds_tid; /* id of last mds request */ - struct radix_tree_root s_mds_requests; /* in-flight mds requests */ - -}; - -#endif diff --git a/branches/sage/crush/kernel/mdsmap.h b/branches/sage/crush/kernel/mdsmap.h deleted file mode 100644 index c5a970992c36c..0000000000000 --- a/branches/sage/crush/kernel/mdsmap.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef _FS_CEPH_MDSMAP_H -#define _FS_CEPH_MDSMAP_H - -/* see mds/MDSMap.h */ -#define CEPH_MDS_STATE_DNE 0 /* down, never existed. */ -#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees. empty log. */ -#define CEPH_MDS_STATE_FAILED 2 /* down, active subtrees needs to be recovered. */ - -#define CEPH_MDS_STATE_BOOT -3 /* up, boot announcement. destiny unknown. */ -#define CEPH_MDS_STATE_STANDBY -4 /* up, idle. waiting for assignment by monitor. */ -#define CEPH_MDS_STATE_CREATING -5 /* up, creating MDS instance (new journal, idalloc..). */ -#define CEPH_MDS_STATE_STARTING -6 /* up, starting prior stopped MDS instance. */ - -#define CEPH_MDS_STATE_REPLAY 7 /* up, starting prior failed instance. scanning journal. */ -#define CEPH_MDS_STATE_RESOLVE 8 /* up, disambiguating distributed operations (import, rename, etc.) 
*/ -#define CEPH_MDS_STATE_RECONNECT 9 /* up, reconnect to clients */ -#define CEPH_MDS_STATE_REJOIN 10 /* up, replayed journal, rejoining distributed cache */ -#define CEPH_MDS_STATE_ACTIVE 11 /* up, active */ -#define CEPH_MDS_STATE_STOPPING 12 /* up, exporting metadata (-> standby or out) */ - -/* - * mds map - * - * fields limited to those the client cares about - */ -struct ceph_mdsmap { - __u64 m_epoch; - __u64 m_same_in_set_since; - struct timeval m_created; - __u32 m_anchortable; - __u32 m_root; - struct ceph_entity_addr *m_addr; /* array of addresses */ - __u8 *m_state; /* array of states */ - __u32 m_max_mds; /* size of m_addr, m_state arrays */ -}; - -extern int ceph_mdsmap_get_random_mds(ceph_mdsmap *m); -extern int ceph_mdsmap_get_state(ceph_mdsmap *m, int w); -extern struct ceph_entity_addr *ceph_mdsmap_get_addr(ceph_mdsmap *m, int w); -extern int ceph_mdsmap_decode(ceph_mdsmap *m, iovec *v); - -#endif diff --git a/branches/sage/crush/kernel/monmap.h b/branches/sage/crush/kernel/monmap.h deleted file mode 100644 index 2f60c8a0c3436..0000000000000 --- a/branches/sage/crush/kernel/monmap.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef _FS_CEPH_MONMAP_H -#define _FS_CEPH_MONMAP_H - -#include - -/* - * monitor map - */ -struct ceph_monmap { - __u64 m_epoch; - __u32 m_num_mon; - __u32 m_last_mon; - struct ceph_entity_inst m_mon_inst; -}; - -extern int ceph_monmap_pick_mon(struct ceph_monmap *m); -extern int ceph_monmap_decode(struct ceph_monmap *m, struct kvec *v); - -#endif diff --git a/branches/sage/crush/kernel/osd_client.h b/branches/sage/crush/kernel/osd_client.h deleted file mode 100644 index 6efa3b8f2ab25..0000000000000 --- a/branches/sage/crush/kernel/osd_client.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef _FS_CEPH_OSD_CLIENT_H -#define _FS_CEPH_OSD_CLIENT_H - -/* this will be equivalent to osdc/Objecter.h */ - - -/* do these later -#include "osdmap.h" -*/ -struct ceph_osdmap; - - -struct ceph_osd_client { - struct ceph_osdmap *s_osdmap; /* osd map */ - -}; 
- -#endif diff --git a/branches/sage/crush/kernel/super.h b/branches/sage/crush/kernel/super.h deleted file mode 100644 index 94418511ffa53..0000000000000 --- a/branches/sage/crush/kernel/super.h +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef _FS_CEPH_CEPH_H -#define _FS_CEPH_CEPH_H - -/* #include */ - -#include "kmsg.h" -#include "monmap.h" -#include "mds_client.h" -#include "osd_client.h" - - - -/* - * CEPH per-filesystem client state - * - * possibly shared by multiple mount points, if they are - * mounting the same ceph filesystem/cluster. - */ -struct ceph_fs_client { - __u64 s_fsid; /* hmm this should be part of the monmap? */ - - __u32 s_whoami; /* my client number */ - struct ceph_kmsg *s_kmsg; /* messenger instance */ - - struct ceph_monmap *s_monmap; /* monitor map */ - - struct ceph_mds_client *s_mds_client; - struct ceph_osd_client *s_osd_client; - - int s_ref; /* reference count (for each sb_info that points to me) */ -}; - -/* - * directory of filesystems mounted by this host - * - * key: fsid? ipquad of monitor? hmm! - * value: struct ceph_fs_client* - */ -extern struct radix_tree ceph_fs_clients; - - -/* - * CEPH per-mount superblock info - */ -struct ceph_sb_info { - struct ceph_fs_client *sb_client; - - /* FIXME: add my relative offset into the filesystem, - so we can appropriately mangle/adjust path names in requests, etc. 
*/ -}; - -/* - * CEPH file system in-core inode info - */ -struct ceph_inode_info { - struct ceph_file_layout i_layout; - struct inode vfs_inode; -}; - -static inline struct ceph_inode_info *CEPH_I(struct inode *inode) -{ - return list_entry(inode, struct ceph_inode_info, vfs_inode); -} - - -/* file.c */ -extern const struct inode_operations ceph_file_inops; -extern const struct file_operations ceph_file_operations; -extern const struct address_space_operations ceph_aops; - -/* dir.c */ -extern const struct inode_operations ceph_dir_inops; -extern const struct file_operations ceph_dir_operations; - -#endif /* _FS_CEPH_CEPH_H */ diff --git a/branches/sage/crush/mds/Anchor.h b/branches/sage/crush/mds/Anchor.h deleted file mode 100644 index 748091306a44d..0000000000000 --- a/branches/sage/crush/mds/Anchor.h +++ /dev/null @@ -1,108 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __ANCHOR_H -#define __ANCHOR_H - -#include -using std::string; - -#include "include/types.h" -#include "mdstypes.h" -#include "include/buffer.h" - - -// anchor ops -#define ANCHOR_OP_LOOKUP 1 -#define ANCHOR_OP_LOOKUP_REPLY 2 - -#define ANCHOR_OP_CREATE_PREPARE 11 -#define ANCHOR_OP_CREATE_AGREE 12 - -#define ANCHOR_OP_DESTROY_PREPARE 21 -#define ANCHOR_OP_DESTROY_AGREE 22 - -#define ANCHOR_OP_UPDATE_PREPARE 31 -#define ANCHOR_OP_UPDATE_AGREE 32 - -#define ANCHOR_OP_COMMIT 41 -#define ANCHOR_OP_ACK 42 -#define ANCHOR_OP_ROLLBACK 43 - - - -inline const char* get_anchor_opname(int o) { - switch (o) { - case ANCHOR_OP_LOOKUP: return "lookup"; - case ANCHOR_OP_LOOKUP_REPLY: return "lookup_reply"; - - case ANCHOR_OP_CREATE_PREPARE: return "create_prepare"; - case ANCHOR_OP_CREATE_AGREE: return "create_agree"; - case ANCHOR_OP_DESTROY_PREPARE: return "destroy_prepare"; - case ANCHOR_OP_DESTROY_AGREE: return "destroy_agree"; - case ANCHOR_OP_UPDATE_PREPARE: return "update_prepare"; - case ANCHOR_OP_UPDATE_AGREE: return "update_agree"; - - case ANCHOR_OP_COMMIT: return "commit"; - case ANCHOR_OP_ACK: return "ack"; - case ANCHOR_OP_ROLLBACK: return "rollback"; - default: assert(0); return 0; - } -} - - -// identifies a anchor table mutation - - - -// anchor type - -class Anchor { -public: - inodeno_t ino; // anchored ino - dirfrag_t dirfrag; // containing dirfrag - //string ref_dn; // referring dentry - int nref; // reference count - - Anchor() {} - Anchor(inodeno_t i, dirfrag_t df, - //string& rd, - int nr=0) : - ino(i), dirfrag(df), - //ref_dn(rd), - nref(nr) { } - - void _encode(bufferlist &bl) { - bl.append((char*)&ino, sizeof(ino)); - bl.append((char*)&dirfrag, sizeof(dirfrag)); - bl.append((char*)&nref, sizeof(nref)); - //::_encode(ref_dn, bl); - } - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - bl.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - bl.copy(off, 
sizeof(nref), (char*)&nref); - off += sizeof(nref); - //::_decode(ref_dn, bl, off); - } -}; - -inline ostream& operator<<(ostream& out, Anchor& a) -{ - return out << "a(" << a.ino << " " << a.dirfrag << " " << a.nref << ")"; -} - -#endif diff --git a/branches/sage/crush/mds/AnchorClient.cc b/branches/sage/crush/mds/AnchorClient.cc deleted file mode 100644 index b2fb1fb50d7bd..0000000000000 --- a/branches/sage/crush/mds/AnchorClient.cc +++ /dev/null @@ -1,379 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include -using std::cout; -using std::cerr; - -#include "Anchor.h" -#include "AnchorClient.h" -#include "MDSMap.h" - -#include "include/Context.h" -#include "msg/Messenger.h" - -#include "MDS.h" -#include "MDLog.h" -#include "LogSegment.h" - -#include "events/EAnchorClient.h" -#include "messages/MAnchor.h" - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " " << mds->messenger->get_myname() << ".anchorclient " -#define derr(x) if (x <= g_conf.debug_mds) *_derr << dbeginl << g_clock.now() << " " << mds->messenger->get_myname() << ".anchorclient " - - -void AnchorClient::dispatch(Message *m) -{ - switch (m->get_type()) { - case MSG_MDS_ANCHOR: - handle_anchor_reply((MAnchor*)m); - break; - - default: - assert(0); - } -} - -void AnchorClient::handle_anchor_reply(class MAnchor *m) -{ - inodeno_t ino = m->get_ino(); - version_t atid = m->get_atid(); - - dout(10) << "handle_anchor_reply " << *m << dendl; - - switch (m->get_op()) { - - // lookup - case ANCHOR_OP_LOOKUP_REPLY: - assert(pending_lookup.count(ino)); - { - 
*pending_lookup[ino].trace = m->get_trace(); - Context *onfinish = pending_lookup[ino].onfinish; - pending_lookup.erase(ino); - - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } - } - break; - - // prepare -> agree - case ANCHOR_OP_CREATE_AGREE: - if (pending_create_prepare.count(ino)) { - dout(10) << "got create_agree on " << ino << " atid " << atid << dendl; - Context *onfinish = pending_create_prepare[ino].onfinish; - *pending_create_prepare[ino].patid = atid; - pending_create_prepare.erase(ino); - - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } - } - else if (pending_commit.count(atid)) { - dout(10) << "stray create_agree on " << ino - << " atid " << atid - << ", already committing, resending COMMIT" - << dendl; - MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, atid); - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); - } - else { - dout(10) << "stray create_agree on " << ino - << " atid " << atid - << ", sending ROLLBACK" - << dendl; - MAnchor *req = new MAnchor(ANCHOR_OP_ROLLBACK, 0, atid); - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); - } - break; - - case ANCHOR_OP_DESTROY_AGREE: - if (pending_destroy_prepare.count(ino)) { - dout(10) << "got destroy_agree on " << ino << " atid " << atid << dendl; - Context *onfinish = pending_destroy_prepare[ino].onfinish; - *pending_destroy_prepare[ino].patid = atid; - pending_destroy_prepare.erase(ino); - - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } - } - else if (pending_commit.count(atid)) { - dout(10) << "stray destroy_agree on " << ino - << " atid " << atid - << ", already committing, resending COMMIT" - << dendl; - MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, atid); - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), - MDS_PORT_ANCHORTABLE, 
MDS_PORT_ANCHORCLIENT); - } - else { - dout(10) << "stray destroy_agree on " << ino - << " atid " << atid - << ", sending ROLLBACK" - << dendl; - MAnchor *req = new MAnchor(ANCHOR_OP_ROLLBACK, 0, atid); - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); - } - break; - - case ANCHOR_OP_UPDATE_AGREE: - if (pending_update_prepare.count(ino)) { - dout(10) << "got update_agree on " << ino << " atid " << atid << dendl; - Context *onfinish = pending_update_prepare[ino].onfinish; - *pending_update_prepare[ino].patid = atid; - pending_update_prepare.erase(ino); - - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } - } - else if (pending_commit.count(atid)) { - dout(10) << "stray update_agree on " << ino - << " atid " << atid - << ", already committing, resending COMMIT" - << dendl; - MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, atid); - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); - } - else { - dout(10) << "stray update_agree on " << ino - << " atid " << atid - << ", sending ROLLBACK" - << dendl; - MAnchor *req = new MAnchor(ANCHOR_OP_ROLLBACK, 0, atid); - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); - } - break; - - // commit -> ack - case ANCHOR_OP_ACK: - { - dout(10) << "got ack on atid " << atid << ", logging" << dendl; - - // remove from committing list - assert(pending_commit.count(atid)); - assert(pending_commit[atid]->pending_commit_atids.count(atid)); - - // log ACK. 
- mds->mdlog->submit_entry(new EAnchorClient(ANCHOR_OP_ACK, atid), - new C_LoggedAck(this, atid)); - } - break; - - default: - assert(0); - } - - delete m; -} - - -void AnchorClient::_logged_ack(version_t atid) -{ - dout(10) << "_logged_ack" << dendl; - - assert(pending_commit.count(atid)); - assert(pending_commit[atid]->pending_commit_atids.count(atid)); - - pending_commit[atid]->pending_commit_atids.erase(atid); - pending_commit.erase(atid); - - // kick any waiters (LogSegment trim) - if (ack_waiters.count(atid)) { - dout(15) << "kicking ack waiters on atid " << atid << dendl; - mds->queue_waiters(ack_waiters[atid]); - ack_waiters.erase(atid); - } -} - - -/* - * public async interface - */ - - -/* - * FIXME: we need to be able to resubmit messages if the anchortable mds fails. - */ - - -void AnchorClient::lookup(inodeno_t ino, vector& trace, Context *onfinish) -{ - // send message - MAnchor *req = new MAnchor(ANCHOR_OP_LOOKUP, ino); - - assert(pending_lookup.count(ino) == 0); - pending_lookup[ino].onfinish = onfinish; - pending_lookup[ino].trace = &trace; - - mds->send_message_mds(req, - mds->mdsmap->get_anchortable(), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); -} - - -// PREPARE - -void AnchorClient::prepare_create(inodeno_t ino, vector& trace, - version_t *patid, Context *onfinish) -{ - dout(10) << "prepare_create " << ino << " " << trace << dendl; - - // send message - MAnchor *req = new MAnchor(ANCHOR_OP_CREATE_PREPARE, ino); - req->set_trace(trace); - - pending_create_prepare[ino].trace = trace; - pending_create_prepare[ino].patid = patid; - pending_create_prepare[ino].onfinish = onfinish; - - mds->send_message_mds(req, - mds->mdsmap->get_anchortable(), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); -} - -void AnchorClient::prepare_destroy(inodeno_t ino, - version_t *patid, Context *onfinish) -{ - dout(10) << "prepare_destroy " << ino << dendl; - - // send message - MAnchor *req = new MAnchor(ANCHOR_OP_DESTROY_PREPARE, ino); - 
pending_destroy_prepare[ino].onfinish = onfinish; - pending_destroy_prepare[ino].patid = patid; - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); -} - - -void AnchorClient::prepare_update(inodeno_t ino, vector& trace, - version_t *patid, Context *onfinish) -{ - dout(10) << "prepare_update " << ino << " " << trace << dendl; - - // send message - MAnchor *req = new MAnchor(ANCHOR_OP_UPDATE_PREPARE, ino); - req->set_trace(trace); - - pending_update_prepare[ino].trace = trace; - pending_update_prepare[ino].patid = patid; - pending_update_prepare[ino].onfinish = onfinish; - - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); -} - - -// COMMIT - -void AnchorClient::commit(version_t atid, LogSegment *ls) -{ - dout(10) << "commit " << atid << dendl; - - assert(pending_commit.count(atid) == 0); - pending_commit[atid] = ls; - ls->pending_commit_atids.insert(atid); - - // send message - MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, atid); - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); -} - - - -// RECOVERY - -void AnchorClient::finish_recovery() -{ - dout(7) << "finish_recovery" << dendl; - - resend_commits(); -} - -void AnchorClient::resend_commits() -{ - for (map::iterator p = pending_commit.begin(); - p != pending_commit.end(); - ++p) { - dout(10) << "resending commit on " << p->first << dendl; - MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, p->first); - mds->send_message_mds(req, - mds->mdsmap->get_anchortable(), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); - } -} - -void AnchorClient::resend_prepares(hash_map& prepares, int op) -{ - for (hash_map::iterator p = prepares.begin(); - p != prepares.end(); - p++) { - dout(10) << "resending " << get_anchor_opname(op) << " on " << p->first << dendl; - 
MAnchor *req = new MAnchor(op, p->first); - req->set_trace(p->second.trace); - mds->send_message_mds(req, - mds->mdsmap->get_anchortable(), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); - } -} - - -void AnchorClient::handle_mds_recovery(int who) -{ - dout(7) << "handle_mds_recovery mds" << who << dendl; - - if (who != mds->mdsmap->get_anchortable()) - return; // do nothing. - - // resend any pending lookups. - for (hash_map::iterator p = pending_lookup.begin(); - p != pending_lookup.end(); - p++) { - dout(10) << "resending lookup on " << p->first << dendl; - mds->send_message_mds(new MAnchor(ANCHOR_OP_LOOKUP, p->first), - mds->mdsmap->get_anchortable(), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); - } - - // resend any pending prepares. - resend_prepares(pending_create_prepare, ANCHOR_OP_CREATE_PREPARE); - resend_prepares(pending_update_prepare, ANCHOR_OP_UPDATE_PREPARE); - resend_prepares(pending_destroy_prepare, ANCHOR_OP_DESTROY_PREPARE); - - // resend any pending commits. - resend_commits(); -} diff --git a/branches/sage/crush/mds/AnchorTable.cc b/branches/sage/crush/mds/AnchorTable.cc deleted file mode 100644 index 65c09278c9850..0000000000000 --- a/branches/sage/crush/mds/AnchorTable.cc +++ /dev/null @@ -1,713 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "AnchorTable.h" -#include "MDS.h" - -#include "osdc/Filer.h" - -#include "msg/Messenger.h" -#include "messages/MAnchor.h" - -#include "common/Clock.h" - -#include "MDLog.h" -#include "events/EAnchor.h" - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " " << mds->messenger->get_myname() << ".anchortable " -#define derr(x) if (x <= g_conf.debug_mds) *_derr << dbeginl << g_clock.now() << " " << mds->messenger->get_myname() << ".anchortable " - - -void AnchorTable::dump() -{ - dout(7) << "dump v " << version << dendl; - for (hash_map::iterator it = anchor_map.begin(); - it != anchor_map.end(); - it++) - dout(15) << "dump " << it->second << dendl; -} - - -/* - * basic updates - */ - -bool AnchorTable::add(inodeno_t ino, dirfrag_t dirfrag) -{ - //dout(17) << "add " << ino << " dirfrag " << dirfrag << dendl; - - // parent should be there - assert(dirfrag.ino < MDS_INO_BASE || // system dirino - anchor_map.count(dirfrag.ino)); // have - - if (anchor_map.count(ino) == 0) { - // new item - anchor_map[ino] = Anchor(ino, dirfrag); - dout(7) << "add added " << anchor_map[ino] << dendl; - return true; - } else { - dout(7) << "add had " << anchor_map[ino] << dendl; - return false; - } -} - -void AnchorTable::inc(inodeno_t ino) -{ - dout(7) << "inc " << ino << dendl; - - assert(anchor_map.count(ino)); - - while (1) { - Anchor &anchor = anchor_map[ino]; - anchor.nref++; - - dout(10) << "inc now " << anchor << dendl; - ino = anchor.dirfrag.ino; - - if (ino == 0) break; - if (anchor_map.count(ino) == 0) break; - } -} - -void AnchorTable::dec(inodeno_t ino) -{ - dout(7) << "dec " << ino << dendl; - assert(anchor_map.count(ino)); - - while (true) { - Anchor &anchor = anchor_map[ino]; - anchor.nref--; - - if (anchor.nref == 0) { - dout(10) << "dec removing " << anchor << dendl; - dirfrag_t dirfrag = anchor.dirfrag; - anchor_map.erase(ino); - ino = dirfrag.ino; - } else { - dout(10) << "dec now " << anchor << 
dendl; - ino = anchor.dirfrag.ino; - } - - if (ino == 0) break; - if (anchor_map.count(ino) == 0) break; - } -} - - -/* - * high level - */ - - -// LOOKUP - -void AnchorTable::handle_lookup(MAnchor *req) -{ - inodeno_t curino = req->get_ino(); - dout(7) << "handle_lookup " << curino << dendl; - - vector trace; - while (true) { - assert(anchor_map.count(curino) == 1); - Anchor &anchor = anchor_map[curino]; - - dout(10) << "handle_lookup adding " << anchor << dendl; - trace.insert(trace.begin(), anchor); // lame FIXME - - if (anchor.dirfrag.ino < MDS_INO_BASE) break; - curino = anchor.dirfrag.ino; - } - - // reply - MAnchor *reply = new MAnchor(ANCHOR_OP_LOOKUP_REPLY, req->get_ino()); - reply->set_trace(trace); - mds->messenger->send_message(reply, req->get_source_inst(), req->get_source_port()); - - delete req; -} - - -// MIDLEVEL - -void AnchorTable::create_prepare(inodeno_t ino, vector& trace, int reqmds) -{ - // make sure trace is in table - for (unsigned i=0; i& trace, int reqmds) -{ - version++; - pending_update[version].first = ino; - pending_update[version].second = trace; - pending_reqmds[version] = reqmds; - //dump(); -} - -void AnchorTable::commit(version_t atid) -{ - if (pending_create.count(atid)) { - dout(7) << "commit " << atid << " create " << pending_create[atid] << dendl; - pending_create.erase(atid); - } - - else if (pending_destroy.count(atid)) { - inodeno_t ino = pending_destroy[atid]; - dout(7) << "commit " << atid << " destroy " << ino << dendl; - - dec(ino); // destroy - - pending_destroy.erase(atid); - } - - else if (pending_update.count(atid)) { - inodeno_t ino = pending_update[atid].first; - vector &trace = pending_update[atid].second; - - dout(7) << "commit " << atid << " update " << ino << dendl; - - // remove old - dec(ino); - - // add new - for (unsigned i=0; i_create_prepare_logged(req, atid); - } -}; - -void AnchorTable::handle_create_prepare(MAnchor *req) -{ - inodeno_t ino = req->get_ino(); - vector& trace = req->get_trace(); - - 
dout(7) << "handle_create_prepare " << ino << dendl; - - create_prepare(ino, trace, req->get_source().num()); - - // log it - EAnchor *le = new EAnchor(ANCHOR_OP_CREATE_PREPARE, ino, version, req->get_source().num()); - le->set_trace(trace); - mds->mdlog->submit_entry(le, - new C_AT_CreatePrepare(this, req, version)); -} - -void AnchorTable::_create_prepare_logged(MAnchor *req, version_t atid) -{ - inodeno_t ino = req->get_ino(); - dout(7) << "_create_prepare_logged " << ino << " atid " << atid << dendl; - - // reply - MAnchor *reply = new MAnchor(ANCHOR_OP_CREATE_AGREE, ino, atid); - mds->messenger->send_message(reply, req->get_source_inst(), req->get_source_port()); - - delete req; -} - - - - -// DESTROY - -class C_AT_DestroyPrepare : public Context { - AnchorTable *at; - MAnchor *req; - version_t atid; -public: - C_AT_DestroyPrepare(AnchorTable *a, MAnchor *r, version_t t) : - at(a), req(r), atid(t) { } - void finish(int r) { - at->_destroy_prepare_logged(req, atid); - } -}; - -void AnchorTable::handle_destroy_prepare(MAnchor *req) -{ - inodeno_t ino = req->get_ino(); - dout(7) << "handle_destroy_prepare " << ino << dendl; - - destroy_prepare(ino, req->get_source().num()); - - mds->mdlog->submit_entry(new EAnchor(ANCHOR_OP_DESTROY_PREPARE, ino, version, req->get_source().num()), - new C_AT_DestroyPrepare(this, req, version)); -} - -void AnchorTable::_destroy_prepare_logged(MAnchor *req, version_t atid) -{ - inodeno_t ino = req->get_ino(); - dout(7) << "_destroy_prepare_logged " << ino << " atid " << atid << dendl; - - // reply - MAnchor *reply = new MAnchor(ANCHOR_OP_DESTROY_AGREE, ino, atid); - mds->messenger->send_message(reply, req->get_source_inst(), req->get_source_port()); - delete req; -} - - - -// UPDATE - -class C_AT_UpdatePrepare : public Context { - AnchorTable *at; - MAnchor *req; - version_t atid; -public: - C_AT_UpdatePrepare(AnchorTable *a, MAnchor *r, version_t t) : - at(a), req(r), atid(t) { } - void finish(int r) { - 
at->_update_prepare_logged(req, atid); - } -}; - -void AnchorTable::handle_update_prepare(MAnchor *req) -{ - inodeno_t ino = req->get_ino(); - vector& trace = req->get_trace(); - - dout(7) << "handle_update_prepare " << ino << dendl; - - update_prepare(ino, trace, req->get_source().num()); - - // log it - EAnchor *le = new EAnchor(ANCHOR_OP_UPDATE_PREPARE, ino, version, req->get_source().num()); - le->set_trace(trace); - mds->mdlog->submit_entry(le, - new C_AT_UpdatePrepare(this, req, version)); -} - -void AnchorTable::_update_prepare_logged(MAnchor *req, version_t atid) -{ - inodeno_t ino = req->get_ino(); - dout(7) << "_update_prepare_logged " << ino << " atid " << atid << dendl; - - // reply - MAnchor *reply = new MAnchor(ANCHOR_OP_UPDATE_AGREE, ino, atid); - mds->messenger->send_message(reply, req->get_source_inst(), req->get_source_port()); - delete req; -} - - - -// COMMIT - -class C_AT_Commit : public Context { - AnchorTable *at; - MAnchor *req; -public: - C_AT_Commit(AnchorTable *a, MAnchor *r) : - at(a), req(r) { } - void finish(int r) { - at->_commit_logged(req); - } -}; - -void AnchorTable::handle_commit(MAnchor *req) -{ - version_t atid = req->get_atid(); - dout(7) << "handle_commit " << atid << dendl; - - if (pending_create.count(atid) || - pending_destroy.count(atid) || - pending_update.count(atid)) { - commit(atid); - mds->mdlog->submit_entry(new EAnchor(ANCHOR_OP_COMMIT, atid, version)); - } - else if (atid <= version) { - dout(0) << "got commit for atid " << atid << " <= " << version - << ", already committed, sending ack." - << dendl; - MAnchor *reply = new MAnchor(ANCHOR_OP_ACK, 0, atid); - mds->messenger->send_message(reply, req->get_source_inst(), req->get_source_port()); - delete req; - return; - } - else { - // wtf. 
- dout(0) << "got commit for atid " << atid << " > " << version << dendl; - assert(atid <= version); - } - - // wait for it to journal - mds->mdlog->wait_for_sync(new C_AT_Commit(this, req)); -} - - -void AnchorTable::_commit_logged(MAnchor *req) -{ - dout(7) << "_commit_logged, sending ACK" << dendl; - MAnchor *reply = new MAnchor(ANCHOR_OP_ACK, req->get_ino(), req->get_atid()); - mds->messenger->send_message(reply, req->get_source_inst(), req->get_source_port()); - delete req; -} - - - -// ROLLBACK - -void AnchorTable::handle_rollback(MAnchor *req) -{ - version_t atid = req->get_atid(); - dout(7) << "handle_rollback " << atid << dendl; - rollback(atid); - delete req; -} - - - -/* - * messages - */ - -void AnchorTable::dispatch(Message *m) -{ - switch (m->get_type()) { - case MSG_MDS_ANCHOR: - handle_anchor_request((MAnchor*)m); - break; - - default: - assert(0); - } -} - - -void AnchorTable::handle_anchor_request(class MAnchor *req) -{ - // make sure i'm open! - if (!opened) { - dout(7) << "not open yet" << dendl; - - waiting_for_open.push_back(new C_MDS_RetryMessage(mds, req)); - - if (!opening) { - opening = true; - load(0); - } - return; - } - - dout(10) << "handle_anchor_request " << *req << dendl; - - // go - switch (req->get_op()) { - - case ANCHOR_OP_LOOKUP: - handle_lookup(req); - break; - - case ANCHOR_OP_CREATE_PREPARE: - handle_create_prepare(req); - break; - case ANCHOR_OP_DESTROY_PREPARE: - handle_destroy_prepare(req); - break; - case ANCHOR_OP_UPDATE_PREPARE: - handle_update_prepare(req); - break; - - case ANCHOR_OP_COMMIT: - handle_commit(req); - break; - - case ANCHOR_OP_ROLLBACK: - handle_rollback(req); - break; - - default: - assert(0); - } - -} - - - - -// primitive load/save for now! - -// load/save entire table for now! 
- -class C_AT_Saved : public Context { - AnchorTable *at; - version_t version; -public: - C_AT_Saved(AnchorTable *a, version_t v) : at(a), version(v) {} - void finish(int r) { - at->_saved(version); - } -}; - -void AnchorTable::save(Context *onfinish) -{ - dout(7) << "save v " << version << dendl; - if (!opened) { - assert(!onfinish); - return; - } - - if (onfinish) - waiting_for_save[version].push_back(onfinish); - - if (committing_version == version) { - dout(7) << "save already committing v " << version << dendl; - return; - } - committing_version = version; - - // build up write - bufferlist bl; - - // version - bl.append((char*)&version, sizeof(version)); - - // # anchors - size_t size = anchor_map.size(); - bl.append((char*)&size, sizeof(size)); - - // anchors - for (hash_map::iterator it = anchor_map.begin(); - it != anchor_map.end(); - it++) { - it->second._encode(bl); - dout(15) << "save encoded " << it->second << dendl; - } - - // pending - ::_encode(pending_reqmds, bl); - ::_encode(pending_create, bl); - ::_encode(pending_destroy, bl); - - size_t s = pending_update.size(); - bl.append((char*)&s, sizeof(s)); - for (map > >::iterator p = pending_update.begin(); - p != pending_update.end(); - ++p) { - bl.append((char*)&p->first, sizeof(p->first)); - bl.append((char*)&p->second.first, sizeof(p->second.first)); - ::_encode(p->second.second, bl); - } - - // write! 
- object_t oid = object_t(MDS_INO_ANCHORTABLE+mds->get_nodeid(), 0); - mds->objecter->write(oid, - 0, bl.length(), - mds->objecter->osdmap->file_to_object_layout(oid, g_OSD_MDAnchorTableLayout), - bl, - NULL, new C_AT_Saved(this, version)); -} - -void AnchorTable::_saved(version_t v) -{ - dout(7) << "_saved v " << v << dendl; - - assert(v <= committing_version); - assert(committed_version < v); - committed_version = v; - - finish_contexts(waiting_for_save[v], 0); - waiting_for_save.erase(v); -} - - - -class C_AT_Load : public Context { - AnchorTable *at; -public: - bufferlist bl; - C_AT_Load(AnchorTable *a) : at(a) {} - void finish(int result) { - assert(result > 0); - at->_loaded(bl); - } -}; - -void AnchorTable::load(Context *onfinish) -{ - dout(7) << "load" << dendl; - assert(!opened); - - waiting_for_open.push_back(onfinish); - - C_AT_Load *fin = new C_AT_Load(this); - object_t oid = object_t(MDS_INO_ANCHORTABLE+mds->get_nodeid(), 0); - mds->objecter->read(oid, - 0, 0, - mds->objecter->osdmap->file_to_object_layout(oid, g_OSD_MDAnchorTableLayout), - &fin->bl, fin); -} - -void AnchorTable::_loaded(bufferlist& bl) -{ - dout(10) << "_loaded got " << bl.length() << " bytes" << dendl; - - int off = 0; - bl.copy(off, sizeof(version), (char*)&version); - off += sizeof(version); - - size_t size; - bl.copy(off, sizeof(size), (char*)&size); - off += sizeof(size); - - for (size_t n=0; n::iterator p = pending_reqmds.begin(); - p != pending_reqmds.end(); - p++) - resend_agree(p->first, p->second); -} - - -void AnchorTable::resend_agree(version_t v, int who) -{ - if (pending_create.count(v)) { - MAnchor *reply = new MAnchor(ANCHOR_OP_CREATE_AGREE, pending_create[v], v); - mds->send_message_mds(reply, who, MDS_PORT_ANCHORCLIENT); - } - else if (pending_destroy.count(v)) { - MAnchor *reply = new MAnchor(ANCHOR_OP_DESTROY_AGREE, pending_destroy[v], v); - mds->send_message_mds(reply, who, MDS_PORT_ANCHORCLIENT); - } - else { - assert(pending_update.count(v)); - MAnchor *reply = 
new MAnchor(ANCHOR_OP_UPDATE_AGREE, pending_update[v].first, v); - mds->send_message_mds(reply, who, MDS_PORT_ANCHORCLIENT); - } -} - -void AnchorTable::handle_mds_recovery(int who) -{ - dout(7) << "handle_mds_recovery mds" << who << dendl; - - // resend agrees for recovered mds - for (map::iterator p = pending_reqmds.begin(); - p != pending_reqmds.end(); - p++) { - if (p->second != who) continue; - resend_agree(p->first, p->second); - } -} diff --git a/branches/sage/crush/mds/CDentry.cc b/branches/sage/crush/mds/CDentry.cc deleted file mode 100644 index 2b6bb3470e8a8..0000000000000 --- a/branches/sage/crush/mds/CDentry.cc +++ /dev/null @@ -1,365 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "CDentry.h" -#include "CInode.h" -#include "CDir.h" -#include "Anchor.h" - -#include "MDS.h" -#include "MDCache.h" -#include "LogSegment.h" - -#include "messages/MLock.h" - -#include - -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " mds" << dir->cache->mds->get_nodeid() << ".cache.den(" << dir->dirfrag() << " " << name << ") " - - - -ostream& CDentry::print_db_line_prefix(ostream& out) -{ - return out << g_clock.now() << " mds" << dir->cache->mds->get_nodeid() << ".cache.den(" << dir->ino() << " " << name << ") "; -} - - -// CDentry - -ostream& operator<<(ostream& out, CDentry& dn) -{ - string path; - dn.make_path(path); - - out << "[dentry " << path; - - if (dn.is_auth()) { - out << " auth"; - if (dn.is_replicated()) - out << dn.get_replicas(); - } else { - out << " rep@" << dn.authority(); - out << "." 
<< dn.get_replica_nonce(); - assert(dn.get_replica_nonce() >= 0); - } - - if (dn.is_null()) out << " NULL"; - if (dn.is_remote()) { - out << " REMOTE("; - switch (dn.get_remote_d_type()) { - case inode_t::DT_REG: out << "reg"; break; - case inode_t::DT_DIR: out << "dir"; break; - case inode_t::DT_LNK: out << "lnk"; break; - default: assert(0); - } - out << ")"; - } - - out << " " << dn.lock; - - out << " v=" << dn.get_version(); - out << " pv=" << dn.get_projected_version(); - - out << " inode=" << dn.get_inode(); - - if (dn.is_new()) out << " state=new"; - - if (dn.get_num_ref()) { - out << " |"; - dn.print_pin_set(out); - } - - out << " " << &dn; - out << "]"; - return out; -} - - -bool operator<(const CDentry& l, const CDentry& r) -{ - if (l.get_dir()->ino() < r.get_dir()->ino()) return true; - if (l.get_dir()->ino() == r.get_dir()->ino() && - l.get_name() < r.get_name()) return true; - return false; -} - - -void CDentry::print(ostream& out) -{ - out << *this; -} - - -inodeno_t CDentry::get_ino() -{ - if (inode) - return inode->ino(); - return inodeno_t(); -} - - -pair CDentry::authority() -{ - return dir->authority(); -} - - -void CDentry::add_waiter(int tag, Context *c) -{ - // wait on the directory? 
- if (tag & (WAIT_UNFREEZE|WAIT_SINGLEAUTH)) { - dir->add_waiter(tag, c); - return; - } - MDSCacheObject::add_waiter(tag, c); -} - - -version_t CDentry::pre_dirty(version_t min) -{ - projected_version = dir->pre_dirty(min); - dout(10) << " pre_dirty " << *this << dendl; - return projected_version; -} - - -void CDentry::_mark_dirty(LogSegment *ls) -{ - // state+pin - if (!state_test(STATE_DIRTY)) { - state_set(STATE_DIRTY); - dir->inc_num_dirty(); - get(PIN_DIRTY); - assert(ls); - } - if (ls) - ls->dirty_dentries.push_back(&xlist_dirty); -} - -void CDentry::mark_dirty(version_t pv, LogSegment *ls) -{ - dout(10) << " mark_dirty " << *this << dendl; - - // i now live in this new dir version - assert(pv <= projected_version); - version = pv; - _mark_dirty(ls); - - // mark dir too - dir->mark_dirty(pv, ls); -} - - -void CDentry::mark_clean() -{ - dout(10) << " mark_clean " << *this << dendl; - assert(is_dirty()); - assert(dir->get_version() == 0 || version <= dir->get_version()); // hmm? - - // state+pin - state_clear(STATE_DIRTY); - dir->dec_num_dirty(); - put(PIN_DIRTY); - - xlist_dirty.remove_myself(); - - if (state_test(STATE_NEW)) - state_clear(STATE_NEW); -} - -void CDentry::mark_new() -{ - dout(10) << " mark_new " << *this << dendl; - state_set(STATE_NEW); -} - -void CDentry::make_path(string& s) -{ - if (dir) { - dir->inode->make_path(s); - } else { - s = "???"; - } - s += "/"; - s += name; -} - -void CDentry::make_path(string& s, inodeno_t tobase) -{ - assert(dir); - - if (dir->inode->is_root()) { - s += "/"; // make it an absolute path (no matter what) if we hit the root. - } - else if (dir->inode->get_parent_dn() && - dir->inode->ino() != tobase) { - dir->inode->get_parent_dn()->make_path(s, tobase); - s += "/"; - } - s += name; -} - -/** make_anchor_trace - * construct an anchor trace for this dentry, as if it were linked to *in. 
- */ -void CDentry::make_anchor_trace(vector& trace, CInode *in) -{ - // start with parent dir inode - if (dir) - dir->inode->make_anchor_trace(trace); - - // add this inode (in my dirfrag) to the end - trace.push_back(Anchor(in->ino(), dir->dirfrag())); - dout(10) << "make_anchor_trace added " << trace.back() << dendl; -} - - - -void CDentry::link_remote(CInode *in) -{ - assert(is_remote()); - assert(in->ino() == remote_ino); - - inode = in; - in->add_remote_parent(this); -} - -void CDentry::unlink_remote() -{ - assert(is_remote()); - assert(inode); - - inode->remove_remote_parent(this); - inode = 0; -} - - -CDentryDiscover *CDentry::replicate_to(int who) -{ - int nonce = add_replica(who); - return new CDentryDiscover(this, nonce); -} - - -// ---------------------------- -// auth pins - -bool CDentry::can_auth_pin() -{ - assert(dir); - return dir->can_auth_pin(); -} - -void CDentry::auth_pin() -{ - if (auth_pins == 0) - get(PIN_AUTHPIN); - auth_pins++; - - dout(10) << "auth_pin on " << *this - << " now " << auth_pins << "+" << nested_auth_pins - << dendl; - - dir->adjust_nested_auth_pins(1); -} - -void CDentry::auth_unpin() -{ - auth_pins--; - if (auth_pins == 0) - put(PIN_AUTHPIN); - - dout(10) << "auth_unpin on " << *this - << " now " << auth_pins << "+" << nested_auth_pins - << dendl; - assert(auth_pins >= 0); - - dir->adjust_nested_auth_pins(-1); -} - -void CDentry::adjust_nested_auth_pins(int by) -{ - nested_auth_pins += by; - - dout(15) << "adjust_nested_auth_pins by " << by - << " now " << auth_pins << "+" << nested_auth_pins - << dendl; - assert(nested_auth_pins >= 0); - - dir->adjust_nested_auth_pins(by); -} - - -// ---------------------------- -// locking - -void CDentry::set_object_info(MDSCacheObjectInfo &info) -{ - info.dirfrag = dir->dirfrag(); - info.dname = name; -} - -void CDentry::encode_lock_state(int type, bufferlist& bl) -{ - // null, ino, or remote_ino? 
- int c; - if (is_primary()) { - c = 1; - ::_encode(c, bl); - ::_encode(inode->inode.ino, bl); - } - else if (is_remote()) { - c = 2; - ::_encode(c, bl); - ::_encode(remote_ino, bl); - } - else if (is_null()) { - // encode nothing. - } - else assert(0); -} - -void CDentry::decode_lock_state(int type, bufferlist& bl) -{ - if (bl.length() == 0) { - // null - assert(is_null()); - return; - } - - int off = 0; - char c; - inodeno_t ino; - ::_decode(c, bl, off); - - switch (c) { - case 1: - case 2: - _decode(ino, bl, off); - // newly linked? - if (is_null() && !is_auth()) { - // force trim from cache! - dout(10) << "decode_lock_state replica dentry null -> non-null, must trim" << dendl; - //assert(get_num_ref() == 0); - } else { - // verify? - - } - break; - default: - assert(0); - } -} diff --git a/branches/sage/crush/mds/CDentry.h b/branches/sage/crush/mds/CDentry.h deleted file mode 100644 index 416792beb8778..0000000000000 --- a/branches/sage/crush/mds/CDentry.h +++ /dev/null @@ -1,323 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __CDENTRY_H -#define __CDENTRY_H - -#include -#include -#include -using namespace std; - -#include "include/types.h" -#include "include/buffer.h" -#include "include/lru.h" -#include "include/xlist.h" -#include "mdstypes.h" - -#include "SimpleLock.h" - -class CInode; -class CDir; -class MDRequest; - -class Message; -class CDentryDiscover; -class Anchor; - -class CDentry; -class LogSegment; - - -// define an ordering -bool operator<(const CDentry& l, const CDentry& r); - -// dentry -class CDentry : public MDSCacheObject, public LRUObject { - public: - // -- state -- - static const int STATE_NEW = 1; - static const int STATE_FRAGMENTING = 2; - - // -- pins -- - static const int PIN_INODEPIN = 1; // linked inode is pinned - static const int PIN_FRAGMENTING = -2; // containing dir is refragmenting - const char *pin_name(int p) { - switch (p) { - case PIN_INODEPIN: return "inodepin"; - case PIN_FRAGMENTING: return "fragmenting"; - default: return generic_pin_name(p); - } - }; - - // -- wait -- - static const int WAIT_LOCK_OFFSET = 8; - - void add_waiter(int tag, Context *c); - - static const int EXPORT_NONCE = 1; - - bool is_lt(const MDSCacheObject *r) const { - return *this < *(CDentry*)r; - } - - protected: - string name; - - inodeno_t remote_ino; // if remote dentry - unsigned char remote_d_type; - - CInode *inode; // linked inode (if any) - CDir *dir; // containing dirfrag - - version_t version; // dir version when last touched. - version_t projected_version; // what it will be when i unlock/commit. 
- - xlist::item xlist_dirty; - - off_t dir_offset; - - int auth_pins, nested_auth_pins; - - friend class Migrator; - friend class Locker; - friend class Renamer; - friend class Server; - friend class MDCache; - friend class MDS; - friend class CInode; - friend class C_MDC_XlockRequest; - - -public: - // lock - SimpleLock lock; - - - - public: - // cons - CDentry() : - remote_ino(0), remote_d_type(0), - inode(0), dir(0), - version(0), projected_version(0), - xlist_dirty(this), - dir_offset(0), - auth_pins(0), nested_auth_pins(0), - lock(this, LOCK_OTYPE_DN, WAIT_LOCK_OFFSET) { } - CDentry(const string& n, CInode *in) : - name(n), - remote_ino(0), remote_d_type(0), - inode(in), dir(0), - version(0), projected_version(0), - xlist_dirty(this), - dir_offset(0), - auth_pins(0), nested_auth_pins(0), - lock(this, LOCK_OTYPE_DN, WAIT_LOCK_OFFSET) { } - CDentry(const string& n, inodeno_t ino, unsigned char dt, CInode *in=0) : - name(n), - remote_ino(ino), remote_d_type(dt), - inode(in), dir(0), - version(0), projected_version(0), - xlist_dirty(this), - dir_offset(0), - auth_pins(0), nested_auth_pins(0), - lock(this, LOCK_OTYPE_DN, WAIT_LOCK_OFFSET) { } - - CInode *get_inode() const { return inode; } - CDir *get_dir() const { return dir; } - const string& get_name() const { return name; } - inodeno_t get_ino(); - - off_t get_dir_offset() { return dir_offset; } - void set_dir_offset(off_t o) { dir_offset = o; } - void clear_dir_offset() { dir_offset = 0; } - - inodeno_t get_remote_ino() { return remote_ino; } - unsigned char get_remote_d_type() { return remote_d_type; } - void set_remote(inodeno_t ino, unsigned char d_type) { - remote_ino = ino; - remote_d_type = d_type; - } - - // ref counts: pin ourselves in the LRU when we're pinned. 
- void first_get() { - lru_pin(); - } - void last_put() { - lru_unpin(); - } - - // auth pins - bool can_auth_pin(); - void auth_pin(); - void auth_unpin(); - void adjust_nested_auth_pins(int by); - - - // dentry type is primary || remote || null - // inode ptr is required for primary, optional for remote, undefined for null - bool is_primary() { return remote_ino == 0 && inode != 0; } - bool is_remote() { return remote_ino > 0; } - bool is_null() { return (remote_ino == 0 && inode == 0) ? true:false; } - - // remote links - void link_remote(CInode *in); - void unlink_remote(); - - - // copy cons - CDentry(const CDentry& m); - const CDentry& operator= (const CDentry& right); - - // misc - void make_path(string& p); - void make_path(string& p, inodeno_t tobase); - void make_anchor_trace(vector& trace, CInode *in); - - // -- version -- - version_t get_version() { return version; } - void set_version(version_t v) { projected_version = version = v; } - version_t get_projected_version() { return projected_version; } - void set_projected_version(version_t v) { projected_version = v; } - - pair authority(); - - version_t pre_dirty(version_t min=0); - void _mark_dirty(LogSegment *ls); - void mark_dirty(version_t projected_dirv, LogSegment *ls); - void mark_clean(); - - void mark_new(); - bool is_new() { return state_test(STATE_NEW); } - - // -- replication - CDentryDiscover *replicate_to(int rep); - - - // -- exporting - // note: this assumes the dentry already exists. - // i.e., the name is already extracted... so we just need the other state. 
- void encode_export(bufferlist& bl) { - ::_encode_simple(state, bl); - ::_encode_simple(version, bl); - ::_encode_simple(projected_version, bl); - lock._encode(bl); - ::_encode_simple(replica_map, bl); - get(PIN_TEMPEXPORTING); - } - void finish_export() { - // twiddle - clear_replica_map(); - replica_nonce = EXPORT_NONCE; - state_clear(CDentry::STATE_AUTH); - if (is_dirty()) - mark_clean(); - put(PIN_TEMPEXPORTING); - } - void abort_export() { - put(PIN_TEMPEXPORTING); - } - void decode_import(bufferlist::iterator& blp, LogSegment *ls) { - int nstate; - ::_decode_simple(nstate, blp); - ::_decode_simple(version, blp); - ::_decode_simple(projected_version, blp); - lock._decode(blp); - ::_decode_simple(replica_map, blp); - - // twiddle - state = 0; - state_set(CDentry::STATE_AUTH); - if (nstate & STATE_DIRTY) - _mark_dirty(ls); - if (!replica_map.empty()) - get(PIN_REPLICATED); - } - - // -- locking -- - SimpleLock* get_lock(int type) { - assert(type == LOCK_OTYPE_DN); - return &lock; - } - void set_object_info(MDSCacheObjectInfo &info); - void encode_lock_state(int type, bufferlist& bl); - void decode_lock_state(int type, bufferlist& bl); - - - ostream& print_db_line_prefix(ostream& out); - void print(ostream& out); - - friend class CDir; -}; - -ostream& operator<<(ostream& out, CDentry& dn); - - - -class CDentryDiscover { - string dname; - int replica_nonce; - int lockstate; - off_t dir_offset; - inodeno_t remote_ino; - unsigned char remote_d_type; - -public: - CDentryDiscover() {} - CDentryDiscover(CDentry *dn, int nonce) : - dname(dn->get_name()), replica_nonce(nonce), - lockstate(dn->lock.get_replica_state()), - dir_offset(dn->get_dir_offset()), - remote_ino(dn->get_remote_ino()), remote_d_type(dn->get_remote_d_type()) { } - - string& get_dname() { return dname; } - int get_nonce() { return replica_nonce; } - bool is_remote() { return remote_ino ? 
true:false; } - inodeno_t get_remote_ino() { return remote_ino; } - unsigned char get_remote_d_type() { return remote_d_type; } - - void update_dentry(CDentry *dn) { - dn->set_dir_offset(dir_offset); - dn->set_replica_nonce(replica_nonce); - } - void init_dentry_lock(CDentry *dn) { - dn->lock.set_state( lockstate ); - } - - void _encode(bufferlist& bl) { - ::_encode(dname, bl); - ::_encode(dir_offset, bl); - ::_encode(remote_ino, bl); - ::_encode(remote_d_type, bl); - ::_encode(replica_nonce, bl); - ::_encode(lockstate, bl); - } - - void _decode(bufferlist& bl, int& off) { - ::_decode(dname, bl, off); - ::_decode(dir_offset, bl, off); - ::_decode(remote_ino, bl, off); - ::_decode(remote_d_type, bl, off); - ::_decode(replica_nonce, bl, off); - ::_decode(lockstate, bl, off); - } - -}; - - - -#endif diff --git a/branches/sage/crush/mds/CDir.cc b/branches/sage/crush/mds/CDir.cc deleted file mode 100644 index b4663b269c659..0000000000000 --- a/branches/sage/crush/mds/CDir.cc +++ /dev/null @@ -1,1676 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#include "include/types.h" - -#include "CDir.h" -#include "CDentry.h" -#include "CInode.h" - -#include "MDS.h" -#include "MDCache.h" -#include "MDSMap.h" -#include "LogSegment.h" - -#include "include/Context.h" -#include "common/Clock.h" - -#include "osdc/Objecter.h" - -#include - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache.dir(" << this->dirfrag() << ") " - - - - -// PINS -//int cdir_pins[CDIR_NUM_PINS] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; - - - -ostream& operator<<(ostream& out, CDir& dir) -{ - string path; - dir.get_inode()->make_path(path); - out << "[dir " << dir.dirfrag() << " " << path << "/"; - if (dir.is_auth()) { - out << " auth"; - if (dir.is_replicated()) - out << dir.get_replicas(); - - out << " pv=" << dir.get_projected_version(); - out << " v=" << dir.get_version(); - out << " cv=" << dir.get_committing_version(); - out << "/" << dir.get_committed_version(); - out << "/" << dir.get_committed_version_equivalent(); - } else { - out << " rep@" << dir.authority(); - if (dir.get_replica_nonce() > 1) - out << "." 
<< dir.get_replica_nonce(); - } - - if (dir.is_rep()) out << " REP"; - - if (dir.get_dir_auth() != CDIR_AUTH_DEFAULT) { - if (dir.get_dir_auth().second == CDIR_AUTH_UNKNOWN) - out << " dir_auth=" << dir.get_dir_auth().first; - else - out << " dir_auth=" << dir.get_dir_auth(); - } - - if (dir.get_cum_auth_pins()) - out << " ap=" << dir.get_auth_pins() << "+" << dir.get_nested_auth_pins(); - - out << " state=" << dir.get_state(); - if (dir.state_test(CDir::STATE_COMPLETE)) out << "|complete"; - if (dir.state_test(CDir::STATE_FREEZINGTREE)) out << "|freezingtree"; - if (dir.state_test(CDir::STATE_FROZENTREE)) out << "|frozentree"; - //if (dir.state_test(CDir::STATE_FROZENTREELEAF)) out << "|frozentreeleaf"; - if (dir.state_test(CDir::STATE_FROZENDIR)) out << "|frozendir"; - if (dir.state_test(CDir::STATE_FREEZINGDIR)) out << "|freezingdir"; - if (dir.state_test(CDir::STATE_EXPORTBOUND)) out << "|exportbound"; - if (dir.state_test(CDir::STATE_IMPORTBOUND)) out << "|importbound"; - - out << " sz=" << dir.get_nitems() << "+" << dir.get_nnull(); - if (dir.get_num_dirty()) - out << " dirty=" << dir.get_num_dirty(); - - - if (dir.get_num_ref()) { - out << " |"; - dir.print_pin_set(out); - } - - out << " " << &dir; - return out << "]"; -} - - -void CDir::print(ostream& out) -{ - out << *this; -} - - - - -ostream& CDir::print_db_line_prefix(ostream& out) -{ - return out << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache.dir(" << this->dirfrag() << ") "; -} - - - -// ------------------------------------------------------------------- -// CDir - -CDir::CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth) : - xlist_dirty(this) -{ - inode = in; - frag = fg; - this->cache = mdcache; - - nitems = 0; - nnull = 0; - num_dirty = 0; - - state = STATE_INITIAL; - - projected_version = version = 0; - committing_version = 0; - committed_version_equivalent = committed_version = 0; - - // dir_auth - dir_auth = CDIR_AUTH_DEFAULT; - - // auth - assert(in->is_dir()); - if 
(auth) - state |= STATE_AUTH; - - auth_pins = 0; - nested_auth_pins = 0; - request_pins = 0; - - //hack_num_accessed = -1; - - dir_rep = REP_NONE; - //dir_rep = REP_ALL; // hack: to wring out some bugs! FIXME FIXME -} - - - - -/*** - * linking fun - */ - -CDentry* CDir::add_null_dentry(const string& dname) -{ - // foreign - assert(lookup(dname) == 0); - - // create dentry - CDentry* dn = new CDentry(dname, 0); - if (is_auth()) - dn->state_set(CDentry::STATE_AUTH); - cache->lru.lru_insert_mid(dn); - - dn->dir = this; - dn->version = projected_version; - - // add to dir - assert(items.count(dn->name) == 0); - //assert(null_items.count(dn->name) == 0); - - items[dn->name] = dn; - nnull++; - - dout(12) << "add_null_dentry " << *dn << dendl; - - // pin? - if (nnull + nitems == 1) get(PIN_CHILD); - - assert(nnull + nitems == items.size()); - //assert(nnull == null_items.size()); - return dn; -} - - -CDentry* CDir::add_primary_dentry(const string& dname, CInode *in) -{ - // primary - assert(lookup(dname) == 0); - - // create dentry - CDentry* dn = new CDentry(dname, 0); - if (is_auth()) - dn->state_set(CDentry::STATE_AUTH); - cache->lru.lru_insert_mid(dn); - - dn->dir = this; - dn->version = projected_version; - - // add to dir - assert(items.count(dn->name) == 0); - //assert(null_items.count(dn->name) == 0); - - items[dn->name] = dn; - link_inode_work( dn, in ); - - dout(12) << "add_primary_dentry " << *dn << dendl; - - // pin? 
- if (nnull + nitems == 1) get(PIN_CHILD); - - assert(nnull + nitems == items.size()); - //assert(nnull == null_items.size()); - return dn; -} - -CDentry* CDir::add_remote_dentry(const string& dname, inodeno_t ino, unsigned char d_type) -{ - // foreign - assert(lookup(dname) == 0); - - // create dentry - CDentry* dn = new CDentry(dname, ino, d_type); - if (is_auth()) - dn->state_set(CDentry::STATE_AUTH); - cache->lru.lru_insert_mid(dn); - - dn->dir = this; - dn->version = projected_version; - - // add to dir - assert(items.count(dn->name) == 0); - //assert(null_items.count(dn->name) == 0); - - items[dn->name] = dn; - nitems++; - - dout(12) << "add_remote_dentry " << *dn << dendl; - - // pin? - if (nnull + nitems == 1) get(PIN_CHILD); - - assert(nnull + nitems == items.size()); - //assert(nnull == null_items.size()); - return dn; -} - - - -void CDir::remove_dentry(CDentry *dn) -{ - dout(12) << "remove_dentry " << *dn << dendl; - - if (dn->inode) { - // detach inode and dentry - unlink_inode_work(dn); - } else { - // remove from null list - //assert(null_items.count(dn->name) == 1); - //null_items.erase(dn->name); - nnull--; - } - - // remove from list - assert(items.count(dn->name) == 1); - items.erase(dn->name); - - // adjust dirty counter? - if (dn->state_test(CDentry::STATE_DIRTY)) - num_dirty--; - - cache->lru.lru_remove(dn); - delete dn; - - // unpin? 
- if (nnull + nitems == 0) put(PIN_CHILD); - - assert(nnull + nitems == items.size()); - //assert(nnull == null_items.size()); -} - -void CDir::link_remote_inode(CDentry *dn, inodeno_t ino, unsigned char d_type) -{ - dout(12) << "link_remote_inode " << *dn << " remote " << ino << dendl; - assert(dn->is_null()); - - dn->set_remote(ino, d_type); - nitems++; - dn->clear_dir_offset(); - - //assert(null_items.count(dn->name) == 1); - //null_items.erase(dn->name); - nnull--; - assert(nnull + nitems == items.size()); -} - -void CDir::link_primary_inode(CDentry *dn, CInode *in) -{ - dout(12) << "link_primary_inode " << *dn << " " << *in << dendl; - assert(dn->is_null()); - - link_inode_work(dn,in); - dn->clear_dir_offset(); - - // remove from null list - //assert(null_items.count(dn->name) == 1); - //null_items.erase(dn->name); - nnull--; - - assert(nnull + nitems == items.size()); - //assert(nnull == null_items.size()); -} - -void CDir::link_inode_work( CDentry *dn, CInode *in) -{ - assert(dn->inode == 0); - dn->inode = in; - in->set_primary_parent(dn); - - nitems++; // adjust dir size - - // set inode version - //in->inode.version = dn->get_version(); - - // pin dentry? 
- if (in->get_num_ref()) - dn->get(CDentry::PIN_INODEPIN); - - // adjust auth pin count - if (in->auth_pins + in->nested_auth_pins) - dn->adjust_nested_auth_pins(in->auth_pins + in->nested_auth_pins); -} - -void CDir::unlink_inode( CDentry *dn ) -{ - if (dn->is_remote()) { - dout(12) << "unlink_inode " << *dn << dendl; - } else { - dout(12) << "unlink_inode " << *dn << " " << *dn->inode << dendl; - } - - dn->clear_dir_offset(); - unlink_inode_work(dn); - - // add to null list - //assert(null_items.count(dn->name) == 0); - //null_items[dn->name] = dn; - nnull++; - - assert(nnull + nitems == items.size()); - //assert(nnull == null_items.size()); -} - -void CDir::try_remove_unlinked_dn(CDentry *dn) -{ - assert(dn->dir == this); - assert(dn->is_null()); - assert(dn->is_dirty()); - - /* FIXME: there is a bug in this. i think new dentries are properly - identified.. e.g. maybe a dentry exists, is committed, is removed, is now - dirty+null, then reused and mistakenly considered new.. then it is removed, - we remove it here, the dir is fetched, and the dentry exists again. - - somethign like that... - */ - return; - - - // no pins (besides dirty)? - if (dn->get_num_ref() != 1) - return; - - // was the dn new? or is the dir complete (i.e. we don't need negatives)? - if (dn->is_new() || is_complete()) { - dout(10) << "try_remove_unlinked_dn " << *dn << " in " << *this << dendl; - dn->mark_clean(); - remove_dentry(dn); - - if (version == projected_version && - committing_version == committed_version && - num_dirty == 0) { - dout(10) << "try_remove_unlinked_dn committed_equivalent now " << version - << " vs committed " << committed_version - << dendl; - committed_version_equivalent = committed_version; - } - } -} - - - -void CDir::unlink_inode_work( CDentry *dn ) -{ - CInode *in = dn->inode; - - if (dn->is_remote()) { - // remote - if (in) - dn->unlink_remote(); - - dn->set_remote(0, 0); - } else { - // primary - assert(dn->is_primary()); - - // unpin dentry? 
- if (in->get_num_ref()) - dn->put(CDentry::PIN_INODEPIN); - - // unlink auth_pin count - if (in->auth_pins + in->nested_auth_pins) - dn->adjust_nested_auth_pins(0 - (in->auth_pins + in->nested_auth_pins)); - - // detach inode - in->remove_primary_parent(dn); - dn->inode = 0; - } - - nitems--; // adjust dir size -} - -void CDir::remove_null_dentries() { - dout(12) << "remove_null_dentries " << *this << dendl; - - list dns; - for (CDir::map_t::iterator it = items.begin(); - it != items.end(); - it++) { - if (it->second->is_null()) - dns.push_back(it->second); - } - - for (list::iterator it = dns.begin(); - it != dns.end(); - it++) { - CDentry *dn = *it; - remove_dentry(dn); - } - //assert(null_items.empty()); - assert(nnull == 0); - assert(nnull + nitems == items.size()); -} - - -/** - * steal_dentry -- semi-violently move a dentry from one CDir to another - * (*) violently, in that nitems, most pins, etc. are not correctly maintained - * on the old CDir corpse; must call purge_stolen() when finished. - */ -void CDir::steal_dentry(CDentry *dn) -{ - dout(15) << "steal_dentry " << *dn << dendl; - - items[dn->name] = dn; - - dn->dir->items.erase(dn->name); - if (dn->dir->items.empty()) - dn->dir->put(PIN_CHILD); - - if (nnull + nitems == 0) - get(PIN_CHILD); - if (dn->is_null()) - nnull++; - else - nitems++; - - nested_auth_pins += dn->auth_pins + dn->nested_auth_pins; - if (dn->is_dirty()) - num_dirty++; - - dn->dir = this; -} - -void CDir::purge_stolen(list& waiters) -{ - // take waiters _before_ unfreeze... - take_waiting(WAIT_ANY, waiters); - - if (is_auth()) { - assert(is_frozen_dir()); - unfreeze_dir(); - } - - nnull = nitems = 0; - - if (is_auth()) - clear_replica_map(); - if (is_dirty()) mark_clean(); - if (state_test(STATE_IMPORTBOUND)) put(PIN_IMPORTBOUND); - if (state_test(STATE_EXPORTBOUND)) put(PIN_EXPORTBOUND); - - if (auth_pins > 0) put(PIN_AUTHPIN); - - assert(get_num_ref() == (state_test(STATE_STICKY) ? 
1:0)); -} - -void CDir::init_fragment_pins() -{ - if (!replica_map.empty()) get(PIN_REPLICATED); - if (state_test(STATE_DIRTY)) get(PIN_DIRTY); - if (state_test(STATE_EXPORTBOUND)) get(PIN_EXPORTBOUND); - if (state_test(STATE_IMPORTBOUND)) get(PIN_IMPORTBOUND); -} - -void CDir::split(int bits, list& subs, list& waiters) -{ - dout(10) << "split by " << bits << " bits on " << *this << dendl; - - if (cache->mds->logger) cache->mds->logger->inc("dir_sp"); - - assert(is_complete() || !is_auth()); - - list frags; - frag.split(bits, frags); - - vector subfrags(1 << bits); - - double fac = 1.0 / (double)(1 << bits); // for scaling load vecs - - // create subfrag dirs - int n = 0; - for (list::iterator p = frags.begin(); p != frags.end(); ++p) { - CDir *f = new CDir(inode, *p, cache, is_auth()); - f->state_set(state & MASK_STATE_FRAGMENT_KEPT); - f->replica_map = replica_map; - f->dir_auth = dir_auth; - f->init_fragment_pins(); - f->version = version; - f->projected_version = projected_version; - - f->pop_me = pop_me; - f->pop_me *= fac; - - // FIXME; this is an approximation - f->pop_nested = pop_nested; - f->pop_nested *= fac; - f->pop_auth_subtree = pop_auth_subtree; - f->pop_auth_subtree *= fac; - f->pop_auth_subtree_nested = pop_auth_subtree_nested; - f->pop_auth_subtree_nested *= fac; - - dout(10) << " subfrag " << *p << " " << *f << dendl; - subfrags[n++] = f; - subs.push_back(f); - inode->add_dirfrag(f); - } - - // repartition dentries - while (!items.empty()) { - CDir::map_t::iterator p = items.begin(); - - CDentry *dn = p->second; - frag_t subfrag = inode->pick_dirfrag(p->first); - int n = subfrag.value() >> frag.bits(); - dout(15) << " subfrag " << subfrag << " n=" << n << " for " << p->first << dendl; - CDir *f = subfrags[n]; - f->steal_dentry(dn); - } - - purge_stolen(waiters); - inode->close_dirfrag(frag); // selft deletion, watch out. 
-} - -void CDir::merge(int bits, list& waiters) -{ - dout(10) << "merge by " << bits << " bits" << dendl; - - list frags; - frag.split(bits, frags); - - for (list::iterator p = frags.begin(); p != frags.end(); ++p) { - CDir *dir = inode->get_or_open_dirfrag(cache, *p); - assert(dir->is_complete()); - dout(10) << " subfrag " << *p << " " << *dir << dendl; - - // steal dentries - while (!dir->items.empty()) - steal_dentry(dir->items.begin()->second); - - // merge replica map - for (map::iterator p = dir->replica_map.begin(); - p != dir->replica_map.end(); - ++p) - replica_map[p->first] = MAX(replica_map[p->first], p->second); - - // merge state - state_set(dir->get_state() & MASK_STATE_FRAGMENT_KEPT); - dir_auth = dir->dir_auth; - - dir->purge_stolen(waiters); - inode->close_dirfrag(dir->get_frag()); - } - - init_fragment_pins(); -} - - - - - - - -CDirDiscover *CDir::replicate_to(int mds) -{ - assert(is_auth()); - return new CDirDiscover( this, add_replica(mds) ); -} - - - - - -/**************************************** - * WAITING - */ - -void CDir::add_dentry_waiter(const string& dname, Context *c) -{ - if (waiting_on_dentry.empty()) - get(PIN_DNWAITER); - waiting_on_dentry[dname].push_back(c); - dout(10) << "add_dentry_waiter dentry " << dname << " " << c << " on " << *this << dendl; -} - -void CDir::take_dentry_waiting(const string& dname, list& ls) -{ - if (waiting_on_dentry.empty()) return; - if (waiting_on_dentry.count(dname) == 0) return; - dout(10) << "take_dentry_waiting dentry " << dname - << " x " << waiting_on_dentry[dname].size() - << " on " << *this << dendl; - ls.splice(ls.end(), waiting_on_dentry[dname]); - waiting_on_dentry.erase(dname); - if (waiting_on_dentry.empty()) - put(PIN_DNWAITER); -} - -void CDir::add_ino_waiter(inodeno_t ino, Context *c) -{ - if (waiting_on_ino.empty()) - get(PIN_INOWAITER); - waiting_on_ino[ino].push_back(c); - dout(10) << "add_ino_waiter ino " << ino << " " << c << " on " << *this << dendl; -} - -void 
CDir::take_ino_waiting(inodeno_t ino, list& ls) -{ - if (waiting_on_ino.empty()) return; - if (waiting_on_ino.count(ino) == 0) return; - dout(10) << "take_ino_waiting ino " << ino - << " x " << waiting_on_ino[ino].size() - << " on " << *this << dendl; - ls.splice(ls.end(), waiting_on_ino[ino]); - waiting_on_ino.erase(ino); - if (waiting_on_ino.empty()) - put(PIN_INOWAITER); -} - -void CDir::take_sub_waiting(list& ls) -{ - dout(10) << "take_sub_waiting" << dendl; - for (hash_map >::iterator p = waiting_on_dentry.begin(); - p != waiting_on_dentry.end(); - ++p) - ls.splice(ls.end(), p->second); - waiting_on_dentry.clear(); - for (hash_map >::iterator p = waiting_on_ino.begin(); - p != waiting_on_ino.end(); - ++p) - ls.splice(ls.end(), p->second); - waiting_on_ino.clear(); -} - - - -void CDir::add_waiter(int tag, Context *c) -{ - // hierarchical? - - // at free root? - if (tag & WAIT_ATFREEZEROOT) { - if (!(is_freezing_tree_root() || is_frozen_tree_root() || - is_freezing_dir() || is_frozen_dir())) { - // try parent - dout(10) << "add_waiter " << tag << " " << c << " should be ATFREEZEROOT, " << *this << " is not root, trying parent" << dendl; - inode->parent->dir->add_waiter(tag, c); - return; - } - } - - // at subtree root? 
- if (tag & WAIT_ATSUBTREEROOT) { - if (!is_subtree_root()) { - // try parent - dout(10) << "add_waiter " << tag << " " << c << " should be ATSUBTREEROOT, " << *this << " is not root, trying parent" << dendl; - inode->parent->dir->add_waiter(tag, c); - return; - } - } - - MDSCacheObject::add_waiter(tag, c); -} - - - -/* NOTE: this checks dentry waiters too */ -void CDir::take_waiting(int mask, list& ls) -{ - if (mask & WAIT_DENTRY) { - // take each each dentry waiter - hash_map >::iterator it = - waiting_on_dentry.begin(); - while (it != waiting_on_dentry.end()) { - take_dentry_waiting((it++)->first, ls); // not post-inc - } - } - - // waiting - MDSCacheObject::take_waiting(mask, ls); -} - - -void CDir::finish_waiting(int mask, int result) -{ - dout(11) << "finish_waiting mask " << hex << mask << dec << " result " << result << " on " << *this << dendl; - - list finished; - take_waiting(mask, finished); - if (result < 0) - finish_contexts(finished, result); - else - cache->mds->queue_waiters(finished); -} - - - -// dirty/clean - -version_t CDir::pre_dirty(version_t min) -{ - if (min > projected_version) - projected_version = min; - ++projected_version; - dout(10) << "pre_dirty " << projected_version << dendl; - return projected_version; -} - -void CDir::_mark_dirty(LogSegment *ls) -{ - if (!state_test(STATE_DIRTY)) { - state_set(STATE_DIRTY); - dout(10) << "mark_dirty (was clean) " << *this << " version " << version << dendl; - get(PIN_DIRTY); - assert(ls); - } else { - dout(10) << "mark_dirty (already dirty) " << *this << " version " << version << dendl; - } - if (ls) - ls->dirty_dirfrags.push_back(&xlist_dirty); -} - -void CDir::mark_dirty(version_t pv, LogSegment *ls) -{ - assert(version < pv); - version = pv; - _mark_dirty(ls); -} - -void CDir::mark_clean() -{ - dout(10) << "mark_clean " << *this << " version " << version << dendl; - if (state_test(STATE_DIRTY)) { - state_clear(STATE_DIRTY); - put(PIN_DIRTY); - - xlist_dirty.remove_myself(); - } -} - - - - -void 
CDir::first_get() -{ - inode->get(CInode::PIN_DIRFRAG); -} - -void CDir::last_put() -{ - inode->put(CInode::PIN_DIRFRAG); -} - - - -/****************************************************************************** - * FETCH and COMMIT - */ - -// ----------------------- -// FETCH - -class C_Dir_Fetch : public Context { - protected: - CDir *dir; - public: - bufferlist bl; - - C_Dir_Fetch(CDir *d) : dir(d) { } - void finish(int result) { - dir->_fetched(bl); - } -}; - -void CDir::fetch(Context *c, bool ignore_authpinnability) -{ - dout(10) << "fetch on " << *this << dendl; - - assert(is_auth()); - assert(!is_complete()); - - if (!can_auth_pin() && !ignore_authpinnability) { - dout(7) << "fetch waiting for authpinnable" << dendl; - add_waiter(WAIT_UNFREEZE, c); - return; - } - - if (c) add_waiter(WAIT_COMPLETE, c); - - // already fetching? - if (state_test(CDir::STATE_FETCHING)) { - dout(7) << "already fetching; waiting" << dendl; - return; - } - - auth_pin(); - state_set(CDir::STATE_FETCHING); - - if (cache->mds->logger) cache->mds->logger->inc("dir_f"); - - // start by reading the first hunk of it - C_Dir_Fetch *fin = new C_Dir_Fetch(this); - cache->mds->objecter->read( get_ondisk_object(), - 0, 0, // whole object - cache->mds->objecter->osdmap->file_to_object_layout( get_ondisk_object(), - g_OSD_MDDirLayout ), - &fin->bl, - fin ); -} - -void CDir::_fetched(bufferlist &bl) -{ - dout(10) << "_fetched " << bl.length() - << " bytes for " << *this - << dendl; - - assert(is_auth()); - assert(!is_frozen()); - - // decode. 
- int len = bl.length(); - int off = 0; - version_t got_version; - - ::_decode(got_version, bl, off); - - dout(10) << "_fetched version " << got_version - << ", " << len << " bytes" - << dendl; - - int32_t n; - ::_decode(n, bl, off); - - //int num_new_inodes_loaded = 0; - - for (int i=0; iget_inode() == 0) { - dout(12) << "_fetched had NEG dentry " << *dn << dendl; - } else { - dout(12) << "_fetched had dentry " << *dn << dendl; - } - } else { - // (remote) link - dn = add_remote_dentry(dname, ino, d_type); - - // link to inode? - CInode *in = cache->get_inode(ino); // we may or may not have it. - if (in) { - dn->link_remote(in); - dout(12) << "_fetched got remote link " << ino << " which we have " << *in << dendl; - } else { - dout(12) << "_fetched got remote link " << ino << " (dont' have it)" << dendl; - } - } - } - else if (type == 'I') { - // inode - - // parse out inode - inode_t inode; - ::_decode(inode, bl, off); - - string symlink; - if (inode.is_symlink()) - ::_decode(symlink, bl, off); - - fragtree_t fragtree; - fragtree._decode(bl, off); - - if (dn) { - if (dn->get_inode() == 0) { - dout(12) << "_fetched had NEG dentry " << *dn << dendl; - } else { - dout(12) << "_fetched had dentry " << *dn << dendl; - } - } else { - // add inode - CInode *in = 0; - if (cache->have_inode(inode.ino)) { - in = cache->get_inode(inode.ino); - dout(-12) << "_fetched got (but i already had) " << *in - << " mode " << in->inode.mode - << " mtime " << in->inode.mtime << dendl; - assert(0); // this shouldn't happen!! - } else { - // inode - in = new CInode(cache); - in->inode = inode; - - // symlink? 
- if (in->is_symlink()) - in->symlink = symlink; - - // dirfragtree - in->dirfragtree.swap(fragtree); - - // add - cache->add_inode( in ); - - // link - dn = add_primary_dentry(dname, in); - dout(12) << "_fetched got " << *dn << " " << *in << dendl; - - //in->hack_accessed = false; - //in->hack_load_stamp = g_clock.now(); - //num_new_inodes_loaded++; - } - } - } else { - dout(1) << "corrupt directory, i got tag char '" << type << "' val " << (int)(type) - << " at pos " << off << dendl; - assert(0); - } - - // make note of dentry position in the directory - dn->dir_offset = dn_offset; - - /** clean underwater item? - * Underwater item is something that is dirty in our cache from - * journal replay, but was previously flushed to disk before the - * mds failed. - * - * We only do this is committed_version == 0. that implies either - * - this is a fetch after from a clean/empty CDir is created - * (and has no effect, since the dn won't exist); or - * - this is a fetch after _recovery_, which is what we're worried - * about. Items that are marked dirty from the journal should be - * marked clean if they appear on disk. - */ - if (committed_version == 0 && - dn && - dn->get_version() <= got_version && - dn->is_dirty()) { - dout(10) << "_fetched had underwater dentry " << *dn << ", marking clean" << dendl; - dn->mark_clean(); - - if (dn->get_inode()) { - assert(dn->get_inode()->get_version() <= got_version); - dout(10) << "_fetched had underwater inode " << *dn->get_inode() << ", marking clean" << dendl; - dn->get_inode()->mark_clean(); - } - } - } - //assert(off == len); no, directories may shrink. add this back in when we properly truncate objects on write. - - // take the loaded version? - // only if we are a fresh CDir* with no prior state. 
- if (version == 0) { - assert(projected_version == 0); - assert(!state_test(STATE_COMMITTING)); - projected_version = version = committing_version = committed_version = got_version; - } - - //cache->mds->logger->inc("newin", num_new_inodes_loaded); - //hack_num_accessed = 0; - - // mark complete, !fetching - state_set(STATE_COMPLETE); - state_clear(STATE_FETCHING); - auth_unpin(); - - // kick waiters - finish_waiting(WAIT_COMPLETE, 0); -} - - - -// ----------------------- -// COMMIT - -/** - * commit - * - * @param want - min version i want committed - * @param c - callback for completion - */ -void CDir::commit(version_t want, Context *c) -{ - dout(10) << "commit want " << want << " on " << *this << dendl; - if (want == 0) want = version; - - // preconditions - assert(want <= version || version == 0); // can't commit the future - assert(want > committed_version); // the caller is stupid - assert(is_auth()); - assert(can_auth_pin()); - - // note: queue up a noop if necessary, so that we always - // get an auth_pin. - if (!c) - c = new C_NoopContext; - - // auth_pin on first waiter - if (waiting_for_commit.empty()) - auth_pin(); - waiting_for_commit[want].push_back(c); - - // ok. - _commit(want); -} - - -class C_Dir_RetryCommit : public Context { - CDir *dir; - version_t want; -public: - C_Dir_RetryCommit(CDir *d, version_t v) : - dir(d), want(v) { } - void finish(int r) { - dir->_commit(want); - } -}; - -class C_Dir_Committed : public Context { - CDir *dir; - version_t version; -public: - C_Dir_Committed(CDir *d, version_t v) : dir(d), version(v) { } - void finish(int r) { - dir->_committed(version); - } -}; - -void CDir::_commit(version_t want) -{ - dout(10) << "_commit want " << want << " on " << *this << dendl; - - // we can't commit things in the future. - // (even the projected future.) - assert(want <= version || version == 0); - - // check pre+postconditions. - assert(is_auth()); - - // already committed? 
- if (committed_version >= want) { - dout(10) << "already committed " << committed_version << " >= " << want << dendl; - return; - } - // already committing >= want? - if (committing_version >= want) { - dout(10) << "already committing " << committing_version << " >= " << want << dendl; - assert(state_test(STATE_COMMITTING)); - return; - } - - // complete? - if (!is_complete()) { - dout(7) << "commit not complete, fetching first" << dendl; - if (cache->mds->logger) cache->mds->logger->inc("dir_ffc"); - fetch(new C_Dir_RetryCommit(this, want)); - return; - } - - // commit. - committing_version = version; - - // mark committing (if not already) - if (!state_test(STATE_COMMITTING)) { - dout(10) << "marking committing" << dendl; - state_set(STATE_COMMITTING); - } - - if (cache->mds->logger) cache->mds->logger->inc("dir_c"); - - // encode - bufferlist bl; - - ::_encode(version, bl); - int32_t n = nitems; - ::_encode(n, bl); - - for (map_t::iterator it = items.begin(); - it != items.end(); - it++) { - CDentry *dn = it->second; - - if (dn->is_null()) - continue; // skip negative entries - - n--; - - // primary or remote? - if (dn->is_remote()) { - inodeno_t ino = dn->get_remote_ino(); - unsigned char d_type = dn->get_remote_d_type(); - dout(14) << " pos " << bl.length() << " dn '" << it->first << "' remote ino " << ino << dendl; - - // marker, name, ino - bl.append( "L", 1 ); // remote link - ::_encode(it->first, bl); - ::_encode(ino, bl); - ::_encode(d_type, bl); - } else { - // primary link - CInode *in = dn->get_inode(); - assert(in); - - dout(14) << " pos " << bl.length() << " dn '" << it->first << "' inode " << *in << dendl; - - // marker, name, inode, [symlink string] - bl.append( "I", 1 ); // inode - ::_encode(it->first, bl); - ::_encode(in->inode, bl); - - if (in->is_symlink()) { - // include symlink destination! 
- dout(18) << " inlcuding symlink ptr " << in->symlink << dendl; - ::_encode(in->symlink, bl); - } - - in->dirfragtree._encode(bl); - } - } - assert(n == 0); - - // write it. - cache->mds->objecter->write( get_ondisk_object(), - 0, bl.length(), - cache->mds->objecter->osdmap->file_to_object_layout( get_ondisk_object(), - g_OSD_MDDirLayout ), - bl, - NULL, new C_Dir_Committed(this, version) ); -} - - -/** - * _committed - * - * @param v version i just committed - */ -void CDir::_committed(version_t v) -{ - dout(10) << "_committed v " << v << " on " << *this << dendl; - assert(is_auth()); - - // take note. - assert(v > committed_version); - assert(v <= committing_version); - committed_version = v; - - // _all_ commits done? - if (committing_version == committed_version) - state_clear(CDir::STATE_COMMITTING); - - // dir clean? - if (committed_version == version) - mark_clean(); - - // dentries clean? - for (map_t::iterator it = items.begin(); - it != items.end(); ) { - CDentry *dn = it->second; - it++; - - // dentry - if (committed_version >= dn->get_version()) { - if (dn->is_dirty()) { - dout(15) << " dir " << committed_version << " >= dn " << dn->get_version() << " now clean " << *dn << dendl; - dn->mark_clean(); - } - } else { - dout(15) << " dir " << committed_version << " < dn " << dn->get_version() << " still dirty " << *dn << dendl; - } - - // inode? - if (dn->is_primary()) { - CInode *in = dn->get_inode(); - assert(in); - assert(in->is_auth()); - - if (committed_version >= in->get_version()) { - if (in->is_dirty()) { - dout(15) << " dir " << committed_version << " >= inode " << in->get_version() << " now clean " << *in << dendl; - in->mark_clean(); - } - } else { - dout(15) << " dir " << committed_version << " < inode " << in->get_version() << " still dirty " << *in << dendl; - assert(in->is_dirty()); - } - } - } - - // finishers? 
- bool were_waiters = !waiting_for_commit.empty(); - - map >::iterator p = waiting_for_commit.begin(); - while (p != waiting_for_commit.end()) { - map >::iterator n = p; - n++; - if (p->first > committed_version) break; // haven't committed this far yet. - cache->mds->queue_waiters(p->second); - waiting_for_commit.erase(p); - p = n; - } - - // unpin if we kicked the last waiter. - if (were_waiters && - waiting_for_commit.empty()) - auth_unpin(); -} - - - - - -// IMPORT/EXPORT - -void CDir::encode_export(bufferlist& bl) -{ - ::_encode_simple(version, bl); - ::_encode_simple(committed_version, bl); - ::_encode_simple(committed_version_equivalent, bl); - - ::_encode_simple(state, bl); - ::_encode_simple(dir_rep, bl); - - ::_encode_simple(pop_me, bl); - ::_encode_simple(pop_auth_subtree, bl); - - ::_encode_simple(dir_rep_by, bl); - ::_encode_simple(replica_map, bl); - - get(PIN_TEMPEXPORTING); -} - -void CDir::finish_export(utime_t now) -{ - pop_auth_subtree_nested -= pop_auth_subtree; - pop_me.zero(now); - pop_auth_subtree.zero(now); - put(PIN_TEMPEXPORTING); -} - -void CDir::decode_import(bufferlist::iterator& blp) -{ - ::_decode_simple(version, blp); - ::_decode_simple(committed_version, blp); - ::_decode_simple(committed_version_equivalent, blp); - committing_version = committed_version; - projected_version = version; - - unsigned s; - ::_decode_simple(s, blp); - state &= MASK_STATE_IMPORT_KEPT; - state |= (s & MASK_STATE_EXPORTED); - if (is_dirty()) get(PIN_DIRTY); - - ::_decode_simple(dir_rep, blp); - - ::_decode_simple(pop_me, blp); - ::_decode_simple(pop_auth_subtree, blp); - pop_auth_subtree_nested += pop_auth_subtree; - - ::_decode_simple(dir_rep_by, blp); - ::_decode_simple(replica_map, blp); - if (!replica_map.empty()) get(PIN_REPLICATED); - - replica_nonce = 0; // no longer defined -} - - - - -/******************************** - * AUTHORITY - */ - -/* - * if dir_auth.first == parent, auth is same as inode. 
- * unless .second != unknown, in which case that sticks. - */ -pair CDir::authority() -{ - if (is_subtree_root()) - return dir_auth; - else - return inode->authority(); -} - -/** is_subtree_root() - * true if this is an auth delegation point. - * that is, dir_auth != default (parent,unknown) - * - * some key observations: - * if i am auth: - * - any region bound will be an export, or frozen. - * - * note that this DOES heed dir_auth.pending - */ -/* -bool CDir::is_subtree_root() -{ - if (dir_auth == CDIR_AUTH_DEFAULT) { - //dout(10) << "is_subtree_root false " << dir_auth << " != " << CDIR_AUTH_DEFAULT - //<< " on " << ino() << dendl; - return false; - } else { - //dout(10) << "is_subtree_root true " << dir_auth << " != " << CDIR_AUTH_DEFAULT - //<< " on " << ino() << dendl; - return true; - } -} -*/ - -/** contains(x) - * true if we are x, or an ancestor of x - */ -bool CDir::contains(CDir *x) -{ - while (1) { - if (x == this) return true; - x = x->get_parent_dir(); - if (x == 0) return false; - } -} - - - -/** set_dir_auth - */ -void CDir::set_dir_auth(pair a) -{ - dout(10) << "setting dir_auth=" << a - << " from " << dir_auth - << " on " << *this << dendl; - - bool was_subtree = is_subtree_root(); - bool was_ambiguous = dir_auth.second >= 0; - - // set it. - dir_auth = a; - - // new subtree root? - if (!was_subtree && is_subtree_root()) { - dout(10) << " new subtree root, adjusting auth_pins" << dendl; - - // adjust nested auth pins - inode->adjust_nested_auth_pins(-get_cum_auth_pins()); - - // unpin parent of frozen dir/tree? - if (inode->is_auth() && (is_frozen_tree_root() || is_frozen_dir())) - inode->auth_unpin(); - } - if (was_subtree && !is_subtree_root()) { - dout(10) << " old subtree root, adjusting auth_pins" << dendl; - - // adjust nested auth pins - inode->adjust_nested_auth_pins(get_cum_auth_pins()); - - // pin parent of frozen dir/tree? 
- if (inode->is_auth() && (is_frozen_tree_root() || is_frozen_dir())) - inode->auth_pin(); - } - - // newly single auth? - if (was_ambiguous && dir_auth.second == CDIR_AUTH_UNKNOWN) { - list ls; - take_waiting(WAIT_SINGLEAUTH, ls); - cache->mds->queue_waiters(ls); - } -} - - -/***************************************** - * AUTH PINS and FREEZING - * - * the basic plan is that auth_pins only exist in auth regions, and they - * prevent a freeze (and subsequent auth change). - * - * however, we also need to prevent a parent from freezing if a child is frozen. - * for that reason, the parent inode of a frozen directory is auth_pinned. - * - * the oddity is when the frozen directory is a subtree root. if that's the case, - * the parent inode isn't frozen. which means that when subtree authority is adjusted - * at the bounds, inodes for any frozen bound directories need to get auth_pins at that - * time. - * - */ - -void CDir::auth_pin() -{ - if (auth_pins == 0) - get(PIN_AUTHPIN); - auth_pins++; - - dout(10) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << dendl; - - // nest pins? - if (is_subtree_root()) return; // no. - //assert(!is_import()); - - inode->adjust_nested_auth_pins(1); -} - -void CDir::auth_unpin() -{ - auth_pins--; - if (auth_pins == 0) - put(PIN_AUTHPIN); - - dout(10) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << dendl; - assert(auth_pins >= 0); - - maybe_finish_freeze(); // pending freeze? - - // nest? - if (is_subtree_root()) return; // no. - //assert(!is_import()); - - inode->adjust_nested_auth_pins(-1); -} - -void CDir::adjust_nested_auth_pins(int inc) -{ - nested_auth_pins += inc; - - dout(15) << "adjust_nested_auth_pins " << inc << " on " << *this - << " count now " << auth_pins << " + " << nested_auth_pins << dendl; - assert(nested_auth_pins >= 0); - - maybe_finish_freeze(); // pending freeze? - - // adjust my inode? - if (is_subtree_root()) - return; // no, stop. 
- - // yes. - inode->adjust_nested_auth_pins(inc); -} - - - -/***************************************************************************** - * FREEZING - */ - -// FREEZE TREE - -bool CDir::freeze_tree() -{ - assert(!is_frozen()); - assert(!is_freezing()); - - auth_pin(); - if (is_freezeable(true)) { - _freeze_tree(); - auth_unpin(); - return true; - } else { - state_set(STATE_FREEZINGTREE); - dout(10) << "freeze_tree waiting " << *this << dendl; - return false; - } -} - -void CDir::_freeze_tree() -{ - dout(10) << "_freeze_tree " << *this << dendl; - assert(is_freezeable(true)); - - // twiddle state - state_clear(STATE_FREEZINGTREE); // actually, this may get set again by next context? - state_set(STATE_FROZENTREE); - get(PIN_FROZEN); - - // auth_pin inode for duration of freeze, if we are not a subtree root. - if (is_auth() && !is_subtree_root()) - inode->auth_pin(); -} - -void CDir::unfreeze_tree() -{ - dout(10) << "unfreeze_tree " << *this << dendl; - - if (state_test(STATE_FROZENTREE)) { - // frozen. unfreeze. - state_clear(STATE_FROZENTREE); - put(PIN_FROZEN); - - // unpin (may => FREEZEABLE) FIXME: is this order good? - if (is_auth() && !is_subtree_root()) - inode->auth_unpin(); - - // waiters? - finish_waiting(WAIT_UNFREEZE); - } else { - finish_waiting(WAIT_FROZEN, -1); - - // freezing. stop it. 
- assert(state_test(STATE_FREEZINGTREE)); - state_clear(STATE_FREEZINGTREE); - auth_unpin(); - - finish_waiting(WAIT_UNFREEZE); - } -} - -bool CDir::is_freezing_tree() -{ - CDir *dir = this; - while (1) { - if (dir->is_freezing_tree_root()) return true; - if (dir->is_subtree_root()) return false; - if (dir->inode->parent) - dir = dir->inode->parent->dir; - else - return false; // root on replica - } -} - -bool CDir::is_frozen_tree() -{ - CDir *dir = this; - while (1) { - if (dir->is_frozen_tree_root()) return true; - if (dir->is_subtree_root()) return false; - if (dir->inode->parent) - dir = dir->inode->parent->dir; - else - return false; // root on replica - } -} - -CDir *CDir::get_frozen_tree_root() -{ - assert(is_frozen()); - CDir *dir = this; - while (1) { - if (dir->is_frozen_tree_root()) - return dir; - if (dir->inode->parent) - dir = dir->inode->parent->dir; - else - assert(0); - } -} - - - -// FREEZE DIR - -bool CDir::freeze_dir() -{ - assert(!is_frozen()); - assert(!is_freezing()); - - auth_pin(); - if (is_freezeable_dir(true)) { - _freeze_dir(); - auth_unpin(); - return true; - } else { - state_set(STATE_FREEZINGDIR); - dout(10) << "freeze_dir + wait " << *this << dendl; - return false; - } -} - -void CDir::_freeze_dir() -{ - dout(10) << "_freeze_dir " << *this << dendl; - assert(is_freezeable_dir(true)); - - state_clear(STATE_FREEZINGDIR); - state_set(STATE_FROZENDIR); - get(PIN_FROZEN); - - if (is_auth() && !is_subtree_root()) - inode->auth_pin(); // auth_pin for duration of freeze -} - - -void CDir::unfreeze_dir() -{ - dout(10) << "unfreeze_dir " << *this << dendl; - - if (state_test(STATE_FROZENDIR)) { - state_clear(STATE_FROZENDIR); - put(PIN_FROZEN); - - // unpin (may => FREEZEABLE) FIXME: is this order good? - if (is_auth() && !is_subtree_root()) - inode->auth_unpin(); - - finish_waiting(WAIT_UNFREEZE); - } else { - finish_waiting(WAIT_FROZEN, -1); - - // still freezing. stop. 
- assert(state_test(STATE_FREEZINGDIR)); - state_clear(STATE_FREEZINGDIR); - auth_unpin(); - - finish_waiting(WAIT_UNFREEZE); - } -} - - - - - - - - diff --git a/branches/sage/crush/mds/CInode.cc b/branches/sage/crush/mds/CInode.cc deleted file mode 100644 index 3bdfc89e3f1fa..0000000000000 --- a/branches/sage/crush/mds/CInode.cc +++ /dev/null @@ -1,838 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "CInode.h" -#include "CDir.h" -#include "CDentry.h" - -#include "MDS.h" -#include "MDCache.h" -#include "AnchorTable.h" - -#include "LogSegment.h" - -#include "common/Clock.h" - -#include "messages/MLock.h" - -#include -#include - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " mds" << mdcache->mds->get_nodeid() << ".cache.ino(" << inode.ino << ") " - - -//int cinode_pins[CINODE_NUM_PINS]; // counts -ostream& CInode::print_db_line_prefix(ostream& out) -{ - return out << g_clock.now() << " mds" << mdcache->mds->get_nodeid() << ".cache.ino(" << inode.ino << ") "; -} - - - -ostream& operator<<(ostream& out, CInode& in) -{ - string path; - in.make_path(path); - out << "[inode " << in.inode.ino << " " << path << (in.is_dir() ? "/ ":" "); - if (in.is_auth()) { - out << "auth"; - if (in.is_replicated()) - out << in.get_replicas(); - } else { - out << "rep@" << in.authority(); - out << "." 
<< in.get_replica_nonce(); - assert(in.get_replica_nonce() >= 0); - } - - if (in.is_symlink()) out << " symlink"; - if (in.is_dir() && !in.dirfragtree.empty()) out << " " << in.dirfragtree; - - out << " v" << in.get_version(); - - if (in.state_test(CInode::STATE_AMBIGUOUSAUTH)) out << " AMBIGAUTH"; - if (in.is_freezing_inode()) out << " FREEZING=" << in.auth_pin_freeze_allowance; - if (in.is_frozen_inode()) out << " FROZEN"; - - // locks - out << " " << in.authlock; - out << " " << in.linklock; - out << " " << in.dirfragtreelock; - out << " " << in.filelock; - out << " " << in.dirlock; - - // hack: spit out crap on which clients have caps - if (!in.get_client_caps().empty()) { - out << " caps={"; - for (map::iterator it = in.get_client_caps().begin(); - it != in.get_client_caps().end(); - it++) { - if (it != in.get_client_caps().begin()) out << ","; - out << it->first; - } - out << "}"; - } - - if (in.get_num_ref()) { - out << " |"; - in.print_pin_set(out); - } - - out << " " << ∈ - out << "]"; - return out; -} - - -void CInode::print(ostream& out) -{ - out << *this; -} - - -inode_t *CInode::project_inode() -{ - if (projected_inode.empty()) { - projected_inode.push_back(new inode_t(inode)); - } else { - projected_inode.push_back(new inode_t(*projected_inode.back())); - } - dout(15) << "project_inode " << projected_inode.back() << dendl; - return projected_inode.back(); -} - -void CInode::pop_and_dirty_projected_inode(LogSegment *ls) -{ - assert(!projected_inode.empty()); - dout(15) << "pop_and_dirty_projected_inode " << projected_inode.front() - << " v" << projected_inode.front()->version << dendl; - mark_dirty(projected_inode.front()->version, ls); - inode = *projected_inode.front(); - delete projected_inode.front(); - projected_inode.pop_front(); -} - - -// ====== CInode ======= - -// dirfrags - -frag_t CInode::pick_dirfrag(const string& dn) -{ - if (dirfragtree.empty()) - return frag_t(); // avoid the string hash if we can. 
- - static hash H; - return dirfragtree[H(dn)]; -} - -void CInode::get_dirfrags_under(frag_t fg, list& ls) -{ - list fglist; - dirfragtree.get_leaves_under(fg, fglist); - for (list::iterator p = fglist.begin(); - p != fglist.end(); - ++p) - if (dirfrags.count(*p)) - ls.push_back(dirfrags[*p]); -} - -CDir *CInode::get_approx_dirfrag(frag_t fg) -{ - CDir *dir = get_dirfrag(fg); - if (dir) return dir; - - // find a child? - list ls; - get_dirfrags_under(fg, ls); - if (!ls.empty()) - return ls.front(); - - // try parents? - while (1) { - fg = fg.parent(); - dir = get_dirfrag(fg); - if (dir) return dir; - } -} - -void CInode::get_dirfrags(list& ls) -{ - // all dirfrags - for (map::iterator p = dirfrags.begin(); - p != dirfrags.end(); - ++p) - ls.push_back(p->second); -} -void CInode::get_nested_dirfrags(list& ls) -{ - // dirfrags in same subtree - for (map::iterator p = dirfrags.begin(); - p != dirfrags.end(); - ++p) - if (!p->second->is_subtree_root()) - ls.push_back(p->second); -} -void CInode::get_subtree_dirfrags(list& ls) -{ - // dirfrags that are roots of new subtrees - for (map::iterator p = dirfrags.begin(); - p != dirfrags.end(); - ++p) - if (p->second->is_subtree_root()) - ls.push_back(p->second); -} - - -CDir *CInode::get_or_open_dirfrag(MDCache *mdcache, frag_t fg) -{ - assert(is_dir()); - - // have it? - CDir *dir = get_dirfrag(fg); - if (!dir) { - // create it. 
- assert(is_auth()); - dir = new CDir(this, fg, mdcache, true); - add_dirfrag(dir); - } - return dir; -} - -CDir *CInode::add_dirfrag(CDir *dir) -{ - assert(dirfrags.count(dir->dirfrag().frag) == 0); - dirfrags[dir->dirfrag().frag] = dir; - - if (stickydir_ref > 0) { - dir->state_set(CDir::STATE_STICKY); - dir->get(CDir::PIN_STICKY); - } - - return dir; -} - -void CInode::close_dirfrag(frag_t fg) -{ - dout(14) << "close_dirfrag " << fg << dendl; - assert(dirfrags.count(fg)); - - CDir *dir = dirfrags[fg]; - dir->remove_null_dentries(); - - // clear dirty flag - if (dir->is_dirty()) - dir->mark_clean(); - - if (stickydir_ref > 0) { - dir->state_clear(CDir::STATE_STICKY); - dir->put(CDir::PIN_STICKY); - } - - // dump any remaining dentries, for debugging purposes - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) - dout(14) << "close_dirfrag LEFTOVER dn " << *p->second << dendl; - - assert(dir->get_num_ref() == 0); - delete dir; - dirfrags.erase(fg); -} - -void CInode::close_dirfrags() -{ - while (!dirfrags.empty()) - close_dirfrag(dirfrags.begin()->first); -} - -bool CInode::has_subtree_root_dirfrag() -{ - for (map::iterator p = dirfrags.begin(); - p != dirfrags.end(); - ++p) - if (p->second->is_subtree_root()) - return true; - return false; -} - - -void CInode::get_stickydirs() -{ - if (stickydir_ref == 0) { - get(PIN_STICKYDIRS); - for (map::iterator p = dirfrags.begin(); - p != dirfrags.end(); - ++p) { - p->second->state_set(CDir::STATE_STICKY); - p->second->get(CDir::PIN_STICKY); - } - } - stickydir_ref++; -} - -void CInode::put_stickydirs() -{ - assert(stickydir_ref > 0); - stickydir_ref--; - if (stickydir_ref == 0) { - put(PIN_STICKYDIRS); - for (map::iterator p = dirfrags.begin(); - p != dirfrags.end(); - ++p) { - p->second->state_clear(CDir::STATE_STICKY); - p->second->put(CDir::PIN_STICKY); - } - } -} - - - - - -// pins - -void CInode::first_get() -{ - // pin my dentry? 
- if (parent) - parent->get(CDentry::PIN_INODEPIN); -} - -void CInode::last_put() -{ - // unpin my dentry? - if (parent) - parent->put(CDentry::PIN_INODEPIN); -} - -void CInode::add_remote_parent(CDentry *p) -{ - if (remote_parents.empty()) - get(PIN_REMOTEPARENT); - remote_parents.insert(p); -} -void CInode::remove_remote_parent(CDentry *p) -{ - remote_parents.erase(p); - if (remote_parents.empty()) - put(PIN_REMOTEPARENT); -} - - - - -CDir *CInode::get_parent_dir() -{ - if (parent) - return parent->dir; - return NULL; -} -CInode *CInode::get_parent_inode() -{ - if (parent) - return parent->dir->inode; - return NULL; -} - - - -void CInode::make_path(string& s) -{ - if (parent) { - parent->make_path(s); - } - else if (is_root()) { - s = ""; // root - } - else if (is_stray()) { - s = "~stray"; - char n[10]; - sprintf(n, "%d", (int)(ino()-MDS_INO_STRAY_OFFSET)); - s += n; - } - else { - s = "(dangling)"; // dangling - } -} - -void CInode::make_anchor_trace(vector& trace) -{ - if (parent) { - parent->dir->inode->make_anchor_trace(trace); - trace.push_back(Anchor(ino(), parent->dir->dirfrag())); - dout(10) << "make_anchor_trace added " << trace.back() << dendl; - } - else - assert(is_root() || is_stray()); -} - -void CInode::name_stray_dentry(string& dname) -{ - char s[20]; -#ifdef __LP64__ - sprintf(s, "%lx", inode.ino.val); -#else - sprintf(s, "%llx", inode.ino.val); -#endif - dname = s; -} - - -version_t CInode::pre_dirty() -{ - assert(parent); - return parent->pre_dirty(); -} - -void CInode::_mark_dirty(LogSegment *ls) -{ - if (!state_test(STATE_DIRTY)) { - state_set(STATE_DIRTY); - get(PIN_DIRTY); - assert(ls); - } - - // move myself to this segment's dirty list - if (ls) - ls->dirty_inodes.push_back(&xlist_dirty); -} - -void CInode::mark_dirty(version_t pv, LogSegment *ls) { - - dout(10) << "mark_dirty " << *this << dendl; - - assert(parent); - - /* - NOTE: I may already be dirty, but this fn _still_ needs to be called so that - the directory is (perhaps newly) 
dirtied, and so that parent_dir_version is - updated below. - */ - - // only auth can get dirty. "dirty" async data in replicas is relative to - // filelock state, not the dirty flag. - assert(is_auth()); - - // touch my private version - assert(inode.version < pv); - inode.version = pv; - _mark_dirty(ls); - - // mark dentry too - parent->mark_dirty(pv, ls); -} - - -void CInode::mark_clean() -{ - dout(10) << " mark_clean " << *this << dendl; - if (state_test(STATE_DIRTY)) { - state_clear(STATE_DIRTY); - put(PIN_DIRTY); - - // remove myself from ls dirty list - xlist_dirty.remove_myself(); - } -} - - - -// ------------------ -// locking - -void CInode::set_object_info(MDSCacheObjectInfo &info) -{ - info.ino = ino(); -} - -void CInode::encode_lock_state(int type, bufferlist& bl) -{ - switch (type) { - case LOCK_OTYPE_IAUTH: - _encode(inode.ctime, bl); - _encode(inode.mode, bl); - _encode(inode.uid, bl); - _encode(inode.gid, bl); - break; - - case LOCK_OTYPE_ILINK: - _encode(inode.ctime, bl); - _encode(inode.nlink, bl); - _encode(inode.anchored, bl); - break; - - case LOCK_OTYPE_IDIRFRAGTREE: - { - // encode the raw tree - dirfragtree._encode(bl); - - // also specify which frags are mine - set myfrags; - list dfls; - get_dirfrags(dfls); - for (list::iterator p = dfls.begin(); p != dfls.end(); ++p) - if ((*p)->is_auth()) - myfrags.insert((*p)->get_frag()); - _encode(myfrags, bl); - } - break; - - case LOCK_OTYPE_IFILE: - _encode(inode.size, bl); - _encode(inode.mtime, bl); - _encode(inode.atime, bl); - break; - - case LOCK_OTYPE_IDIR: - _encode(inode.mtime, bl); - if (0) { - map dfsz; - for (map::iterator p = dirfrags.begin(); - p != dirfrags.end(); - ++p) - if (p->second->is_auth()) - dfsz[p->first] = p->second->get_nitems(); - _encode(dfsz, bl); - } - break; - - default: - assert(0); - } -} - -void CInode::decode_lock_state(int type, bufferlist& bl) -{ - int off = 0; - utime_t tm; - - switch (type) { - case LOCK_OTYPE_IAUTH: - _decode(tm, bl, off); - if (inode.ctime 
< tm) inode.ctime = tm; - _decode(inode.mode, bl, off); - _decode(inode.uid, bl, off); - _decode(inode.gid, bl, off); - break; - - case LOCK_OTYPE_ILINK: - _decode(tm, bl, off); - if (inode.ctime < tm) inode.ctime = tm; - _decode(inode.nlink, bl, off); - _decode(inode.anchored, bl, off); - break; - - case LOCK_OTYPE_IDIRFRAGTREE: - { - fragtree_t temp; - temp._decode(bl, off); - set authfrags; - _decode(authfrags, bl, off); - if (is_auth()) { - // auth. believe replica's auth frags only. - for (set::iterator p = authfrags.begin(); p != authfrags.end(); ++p) - dirfragtree.force_to_leaf(*p); - } else { - // replica. just take the tree. - dirfragtree.swap(temp); - } - } - break; - - case LOCK_OTYPE_IFILE: - _decode(inode.size, bl, off); - _decode(inode.mtime, bl, off); - _decode(inode.atime, bl, off); - break; - - case LOCK_OTYPE_IDIR: - //::_decode(inode.size, bl, off); - _decode(tm, bl, off); - if (inode.mtime < tm) { - inode.mtime = tm; - if (is_auth()) { - dout(10) << "decode_lock_state auth got mtime " << tm << " > my " << inode.mtime - << ", setting dirlock updated flag on " << *this - << dendl; - dirlock.set_updated(); - } - } - if (0) { - map dfsz; - ::_decode(dfsz, bl, off); - // hmm which to keep? 
- } - break; - - default: - assert(0); - } -} - -void CInode::clear_dirty_scattered(int type) -{ - dout(10) << "clear_dirty_scattered " << type << " on " << *this << dendl; - switch (type) { - case LOCK_OTYPE_IDIR: - xlist_dirty_inode_mtime.remove_myself(); - break; - default: - assert(0); - } -} - - - -// waiting - -bool CInode::is_frozen() -{ - if (is_frozen_inode()) return true; - if (parent && parent->dir->is_frozen()) return true; - return false; -} - -bool CInode::is_frozen_dir() -{ - if (parent && parent->dir->is_frozen_dir()) return true; - return false; -} - -bool CInode::is_freezing() -{ - if (is_freezing_inode()) return true; - if (parent && parent->dir->is_freezing()) return true; - return false; -} - -void CInode::add_waiter(int tag, Context *c) -{ - dout(10) << "add_waiter tag " << tag - << " !ambig " << !state_test(STATE_AMBIGUOUSAUTH) - << " !frozen " << !is_frozen_inode() - << " !freezing " << !is_freezing_inode() - << dendl; - // wait on the directory? - // make sure its not the inode that is explicitly ambiguous|freezing|frozen - if (((tag & WAIT_SINGLEAUTH) && !state_test(STATE_AMBIGUOUSAUTH)) || - ((tag & WAIT_UNFREEZE) && !is_frozen_inode() && !is_freezing_inode())) { - parent->dir->add_waiter(tag, c); - return; - } - MDSCacheObject::add_waiter(tag, c); -} - -bool CInode::freeze_inode(int auth_pin_allowance) -{ - assert(auth_pin_allowance > 0); // otherwise we need to adjust parent's nested_auth_pins - assert(auth_pins >= auth_pin_allowance); - if (auth_pins > auth_pin_allowance) { - dout(10) << "freeze_inode - waiting for auth_pins to drop to " << auth_pin_allowance << dendl; - auth_pin_freeze_allowance = auth_pin_allowance; - get(PIN_FREEZING); - state_set(STATE_FREEZING); - return false; - } - - dout(10) << "freeze_inode - frozen" << dendl; - assert(auth_pins == auth_pin_allowance); - get(PIN_FROZEN); - state_set(STATE_FROZEN); - return true; -} - -void CInode::unfreeze_inode(list& finished) -{ - dout(10) << "unfreeze_inode" << dendl; - if 
(state_test(STATE_FREEZING)) { - state_clear(STATE_FREEZING); - put(PIN_FREEZING); - } else if (state_test(STATE_FROZEN)) { - state_clear(STATE_FROZEN); - put(PIN_FROZEN); - } else - assert(0); - take_waiting(WAIT_UNFREEZE, finished); -} - - -// auth_pins -bool CInode::can_auth_pin() { - if (is_freezing_inode() || is_frozen_inode()) return false; - if (parent) - return parent->can_auth_pin(); - return true; -} - -void CInode::auth_pin() -{ - if (auth_pins == 0) - get(PIN_AUTHPIN); - auth_pins++; - - dout(10) << "auth_pin on " << *this - << " now " << auth_pins << "+" << nested_auth_pins - << dendl; - - if (parent) - parent->adjust_nested_auth_pins( 1 ); -} - -void CInode::auth_unpin() -{ - auth_pins--; - if (auth_pins == 0) - put(PIN_AUTHPIN); - - dout(10) << "auth_unpin on " << *this - << " now " << auth_pins << "+" << nested_auth_pins - << dendl; - - assert(auth_pins >= 0); - - if (parent) - parent->adjust_nested_auth_pins( -1 ); - - if (is_freezing_inode() && - auth_pins == auth_pin_freeze_allowance) { - dout(10) << "auth_unpin freezing!" << dendl; - get(PIN_FROZEN); - put(PIN_FREEZING); - state_clear(STATE_FREEZING); - state_set(STATE_FROZEN); - finish_waiting(WAIT_FROZEN); - } -} - -void CInode::adjust_nested_auth_pins(int a) -{ - if (!parent) return; - nested_auth_pins += a; - - dout(15) << "adjust_nested_auth_pins by " << a - << " now " << auth_pins << "+" << nested_auth_pins - << dendl; - assert(nested_auth_pins >= 0); - - parent->adjust_nested_auth_pins(a); -} - - - -// authority - -pair CInode::authority() -{ - if (force_auth.first >= 0) - return force_auth; - - if (parent) - return parent->dir->authority(); - - return CDIR_AUTH_UNDEF; -} - - -CInodeDiscover* CInode::replicate_to( int rep ) -{ - assert(is_auth()); - - // relax locks? 
- if (!is_replicated()) - replicate_relax_locks(); - - // return the thinger - int nonce = add_replica( rep ); - return new CInodeDiscover( this, nonce ); -} - - - - -// IMPORT/EXPORT - -void CInode::encode_export(bufferlist& bl) -{ - ::_encode_simple(inode, bl); - ::_encode_simple(symlink, bl); - dirfragtree._encode(bl); - - bool dirty = is_dirty(); - ::_encode_simple(dirty, bl); - - ::_encode_simple(pop, bl); - - ::_encode_simple(replica_map, bl); - - map cap_map; - export_client_caps(cap_map); - ::_encode_simple(cap_map, bl); - - authlock._encode(bl); - linklock._encode(bl); - dirfragtreelock._encode(bl); - filelock._encode(bl); - dirlock._encode(bl); - - get(PIN_TEMPEXPORTING); -} - -void CInode::finish_export(utime_t now) -{ - pop.zero(now); - - // just in case! - dirlock.clear_updated(); - - put(PIN_TEMPEXPORTING); -} - -void CInode::decode_import(bufferlist::iterator& p, - set& new_client_caps, - LogSegment *ls) -{ - utime_t old_mtime = inode.mtime; - ::_decode_simple(inode, p); - if (old_mtime > inode.mtime) { - assert(dirlock.is_updated()); - inode.mtime = old_mtime; // preserve our mtime, if it is larger - } - - ::_decode_simple(symlink, p); - dirfragtree._decode(p); - - bool dirty; - ::_decode_simple(dirty, p); - if (dirty) - _mark_dirty(ls); - - ::_decode_simple(pop, p); - - ::_decode_simple(replica_map, p); - if (!replica_map.empty()) get(PIN_REPLICATED); - - map cap_map; - ::_decode_simple(cap_map, p); - merge_client_caps(cap_map, new_client_caps); - - authlock._decode(p); - linklock._decode(p); - dirfragtreelock._decode(p); - filelock._decode(p); - dirlock._decode(p); -} diff --git a/branches/sage/crush/mds/CInode.h b/branches/sage/crush/mds/CInode.h deleted file mode 100644 index 8f453472a0477..0000000000000 --- a/branches/sage/crush/mds/CInode.h +++ /dev/null @@ -1,612 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 
2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __CINODE_H -#define __CINODE_H - -#include "config.h" -#include "include/types.h" -#include "include/lru.h" - -#include "mdstypes.h" - -#include "CDentry.h" -#include "SimpleLock.h" -#include "FileLock.h" -#include "ScatterLock.h" -#include "LocalLock.h" -#include "Capability.h" - - -#include -#include -#include -#include -#include -#include -using namespace std; - -class Context; -class CDentry; -class CDir; -class Message; -class CInode; -class CInodeDiscover; -class MDCache; -class LogSegment; - -ostream& operator<<(ostream& out, CInode& in); - - -// cached inode wrapper -class CInode : public MDSCacheObject { - public: - // -- pins -- - static const int PIN_DIRFRAG = -1; - static const int PIN_CAPS = 2; // client caps - static const int PIN_IMPORTING = -4; // importing - static const int PIN_ANCHORING = 5; - static const int PIN_UNANCHORING = 6; - static const int PIN_OPENINGDIR = 7; - static const int PIN_REMOTEPARENT = 8; - static const int PIN_BATCHOPENJOURNAL = 9; - static const int PIN_SCATTERED = 10; - static const int PIN_STICKYDIRS = 11; - static const int PIN_PURGING = -12; - static const int PIN_FREEZING = 13; - static const int PIN_FROZEN = 14; - - const char *pin_name(int p) { - switch (p) { - case PIN_DIRFRAG: return "dirfrag"; - case PIN_CAPS: return "caps"; - case PIN_IMPORTING: return "importing"; - case PIN_ANCHORING: return "anchoring"; - case PIN_UNANCHORING: return "unanchoring"; - case PIN_OPENINGDIR: return "openingdir"; - case PIN_REMOTEPARENT: return "remoteparent"; - case PIN_BATCHOPENJOURNAL: return "batchopenjournal"; - case PIN_SCATTERED: return "scattered"; - case PIN_STICKYDIRS: return "stickydirs"; - case PIN_FREEZING: return "freezing"; - case PIN_FROZEN: return "frozen"; 
- default: return generic_pin_name(p); - } - } - - // -- state -- - static const int STATE_EXPORTING = (1<<2); // on nonauth bystander. - static const int STATE_ANCHORING = (1<<3); - static const int STATE_UNANCHORING = (1<<4); - static const int STATE_OPENINGDIR = (1<<5); - static const int STATE_REJOINUNDEF = (1<<6); // inode contents undefined. - static const int STATE_FREEZING = (1<<7); - static const int STATE_FROZEN = (1<<8); - static const int STATE_AMBIGUOUSAUTH = (1<<9); - - // -- waiters -- - //static const int WAIT_SLAVEAGREE = (1<<0); - static const int WAIT_DIR = (1<<1); - static const int WAIT_ANCHORED = (1<<2); - static const int WAIT_UNANCHORED = (1<<3); - static const int WAIT_CAPS = (1<<4); - static const int WAIT_FROZEN = (1<<5); - - static const int WAIT_AUTHLOCK_OFFSET = 5; - static const int WAIT_LINKLOCK_OFFSET = 5 + SimpleLock::WAIT_BITS; - static const int WAIT_DIRFRAGTREELOCK_OFFSET = 5 + 2*SimpleLock::WAIT_BITS; - static const int WAIT_FILELOCK_OFFSET = 5 + 3*SimpleLock::WAIT_BITS; - static const int WAIT_DIRLOCK_OFFSET = 5 + 4*SimpleLock::WAIT_BITS; - static const int WAIT_VERSIONLOCK_OFFSET = 5 + 5*SimpleLock::WAIT_BITS; - - static const int WAIT_ANY = 0xffffffff; - - // misc - static const int EXPORT_NONCE = 1; // nonce given to replicas created by export - - ostream& print_db_line_prefix(ostream& out); - - public: - MDCache *mdcache; - - // inode contents proper - inode_t inode; // the inode itself - string symlink; // symlink dest, if symlink - fragtree_t dirfragtree; // dir frag tree, if any. always consistent with our dirfrag map. 
- //map dirfrag_size; // size of each dirfrag - - off_t last_journaled; // log offset for the last time i was journaled - off_t last_open_journaled; // log offset for the last journaled EOpen - - //bool hack_accessed; - //utime_t hack_load_stamp; - - // projected values (only defined while dirty) - list projected_inode; - list projected_dirfragtree; - - version_t get_projected_version() { - if (projected_inode.empty()) - return inode.version; - else - return projected_inode.back()->version; - } - - inode_t *project_inode(); - void pop_and_dirty_projected_inode(LogSegment *ls); - - // -- cache infrastructure -- -private: - map dirfrags; // cached dir fragments - int stickydir_ref; - -public: - frag_t pick_dirfrag(const string &dn); - bool has_dirfrags() { return !dirfrags.empty(); } - CDir* get_dirfrag(frag_t fg) { - if (dirfrags.count(fg)) { - assert(g_conf.debug_mds < 2 || dirfragtree.is_leaf(fg)); // performance hack FIXME - return dirfrags[fg]; - } else - return 0; - } - void get_dirfrags_under(frag_t fg, list& ls); - CDir* get_approx_dirfrag(frag_t fg); - void get_dirfrags(list& ls); - void get_nested_dirfrags(list& ls); - void get_subtree_dirfrags(list& ls); - CDir *get_or_open_dirfrag(MDCache *mdcache, frag_t fg); - CDir *add_dirfrag(CDir *dir); - void close_dirfrag(frag_t fg); - void close_dirfrags(); - bool has_subtree_root_dirfrag(); - - void get_stickydirs(); - void put_stickydirs(); - - protected: - // parent dentries in cache - CDentry *parent; // primary link - set remote_parents; // if hard linked - - pair force_auth; - - // -- distributed state -- -protected: - // file capabilities - map client_caps; // client -> caps - map mds_caps_wanted; // [auth] mds -> caps wanted - int replica_caps_wanted; // [replica] what i've requested from auth - utime_t replica_caps_wanted_keep_until; - - - // LogSegment xlists i (may) belong to - xlist::item xlist_dirty; -public: - xlist::item xlist_open_file; - xlist::item xlist_dirty_inode_mtime; - xlist::item 
xlist_purging_inode; - -private: - // auth pin - int auth_pins; - int nested_auth_pins; -public: - int auth_pin_freeze_allowance; - - public: - inode_load_vec_t pop; - - // friends - friend class Server; - friend class Locker; - friend class Migrator; - friend class MDCache; - friend class CDir; - friend class CInodeExport; - friend class CInodeDiscover; - - public: - // --------------------------- - CInode(MDCache *c, bool auth=true) : - mdcache(c), - last_journaled(0), last_open_journaled(0), - //hack_accessed(true), - stickydir_ref(0), - parent(0), force_auth(CDIR_AUTH_DEFAULT), - replica_caps_wanted(0), - xlist_dirty(this), xlist_open_file(this), - xlist_dirty_inode_mtime(this), xlist_purging_inode(this), - auth_pins(0), nested_auth_pins(0), - versionlock(this, LOCK_OTYPE_IVERSION, WAIT_VERSIONLOCK_OFFSET), - authlock(this, LOCK_OTYPE_IAUTH, WAIT_AUTHLOCK_OFFSET), - linklock(this, LOCK_OTYPE_ILINK, WAIT_LINKLOCK_OFFSET), - dirfragtreelock(this, LOCK_OTYPE_IDIRFRAGTREE, WAIT_DIRFRAGTREELOCK_OFFSET), - filelock(this, LOCK_OTYPE_IFILE, WAIT_FILELOCK_OFFSET), - dirlock(this, LOCK_OTYPE_IDIR, WAIT_DIRLOCK_OFFSET) - { - state = 0; - if (auth) state_set(STATE_AUTH); - }; - ~CInode() { - close_dirfrags(); - } - - - // -- accessors -- - bool is_file() { return inode.is_file(); } - bool is_symlink() { return inode.is_symlink(); } - bool is_dir() { return inode.is_dir(); } - - bool is_anchored() { return inode.anchored; } - bool is_anchoring() { return state_test(STATE_ANCHORING); } - bool is_unanchoring() { return state_test(STATE_UNANCHORING); } - - bool is_root() { return inode.ino == MDS_INO_ROOT; } - bool is_stray() { return MDS_INO_IS_STRAY(inode.ino); } - bool is_base() { return inode.ino < MDS_INO_BASE; } - - // note: this overloads MDSCacheObject - bool is_ambiguous_auth() { - return state_test(STATE_AMBIGUOUSAUTH) || - MDSCacheObject::is_ambiguous_auth(); - } - - - inodeno_t ino() const { return inode.ino; } - inode_t& get_inode() { return inode; } - CDentry* 
get_parent_dn() { return parent; } - CDir *get_parent_dir(); - CInode *get_parent_inode(); - - bool is_lt(const MDSCacheObject *r) const { - return ino() < ((CInode*)r)->ino(); - } - - // -- misc -- - void make_path(string& s); - void make_anchor_trace(vector& trace); - void name_stray_dentry(string& dname); - - - - // -- dirtyness -- - version_t get_version() { return inode.version; } - - version_t pre_dirty(); - void _mark_dirty(LogSegment *ls); - void mark_dirty(version_t projected_dirv, LogSegment *ls); - void mark_clean(); - - - CInodeDiscover* replicate_to(int rep); - - - // -- waiting -- - void add_waiter(int tag, Context *c); - - - // -- import/export -- - void encode_export(bufferlist& bl); - void finish_export(utime_t now); - void abort_export() { - put(PIN_TEMPEXPORTING); - } - void decode_import(bufferlist::iterator& p, - set& new_client_caps, - LogSegment *ls); - - - // -- locks -- -public: - LocalLock versionlock; - SimpleLock authlock; - SimpleLock linklock; - ScatterLock dirfragtreelock; - FileLock filelock; - ScatterLock dirlock; - - - SimpleLock* get_lock(int type) { - switch (type) { - case LOCK_OTYPE_IFILE: return &filelock; - case LOCK_OTYPE_IAUTH: return &authlock; - case LOCK_OTYPE_ILINK: return &linklock; - case LOCK_OTYPE_IDIRFRAGTREE: return &dirfragtreelock; - case LOCK_OTYPE_IDIR: return &dirlock; - default: assert(0); return 0; - } - } - void set_object_info(MDSCacheObjectInfo &info); - void encode_lock_state(int type, bufferlist& bl); - void decode_lock_state(int type, bufferlist& bl); - - void clear_dirty_scattered(int type); - - // -- caps -- (new) - // client caps - bool is_any_caps() { return !client_caps.empty(); } - map& get_client_caps() { return client_caps; } - void add_client_cap(int client, Capability& cap) { - if (client_caps.empty()) - get(PIN_CAPS); - assert(client_caps.count(client) == 0); - client_caps[client] = cap; - } - void remove_client_cap(int client) { - assert(client_caps.count(client) == 1); - 
client_caps.erase(client); - if (client_caps.empty()) - put(PIN_CAPS); - } - Capability* get_client_cap(int client) { - if (client_caps.count(client)) - return &client_caps[client]; - return 0; - } - void reconnect_cap(int client, inode_caps_reconnect_t& icr) { - Capability *cap = get_client_cap(client); - if (cap) { - cap->merge(icr.wanted, icr.issued); - } else { - Capability newcap(icr.wanted, 0); - newcap.issue(icr.issued); - add_client_cap(client, newcap); - } - inode.size = MAX(inode.size, icr.size); - inode.mtime = MAX(inode.mtime, icr.mtime); - inode.atime = MAX(inode.atime, icr.atime); - } - /* - void set_client_caps(map& cl) { - if (client_caps.empty() && !cl.empty()) - get(PIN_CAPS); - client_caps.clear(); - client_caps = cl; - } - */ - void clear_client_caps() { - if (!client_caps.empty()) - put(PIN_CAPS); - client_caps.clear(); - } - void export_client_caps(map& cl) { - for (map::iterator it = client_caps.begin(); - it != client_caps.end(); - it++) { - cl[it->first] = it->second.make_export(); - } - } - void merge_client_caps(map& cl, set& new_client_caps) { - if (client_caps.empty() && !cl.empty()) - get(PIN_CAPS); - - for (map::iterator it = cl.begin(); - it != cl.end(); - it++) { - new_client_caps.insert(it->first); - if (client_caps.count(it->first)) { - // merge - client_caps[it->first].merge(it->second); - } else { - // new - client_caps[it->first] = Capability(it->second); - } - } - } - - // caps issued, wanted - int get_caps_issued() { - int c = 0; - for (map::iterator it = client_caps.begin(); - it != client_caps.end(); - it++) - c |= it->second.issued(); - return c; - } - int get_caps_wanted() { - int w = 0; - for (map::iterator it = client_caps.begin(); - it != client_caps.end(); - it++) { - w |= it->second.wanted(); - //cout << " get_caps_wanted client " << it->first << " " << cap_string(it->second.wanted()) << endl; - } - if (is_auth()) - for (map::iterator it = mds_caps_wanted.begin(); - it != mds_caps_wanted.end(); - it++) { - w |= 
it->second; - //cout << " get_caps_wanted mds " << it->first << " " << cap_string(it->second) << endl; - } - return w; - } - - - void replicate_relax_locks() { - //dout(10) << " relaxing locks on " << *this << dendl; - assert(is_auth()); - assert(!is_replicated()); - - authlock.replicate_relax(); - linklock.replicate_relax(); - dirfragtreelock.replicate_relax(); - - if (get_caps_issued() & (CAP_FILE_WR|CAP_FILE_WRBUFFER) == 0) - filelock.replicate_relax(); - - dirlock.replicate_relax(); - } - - - // -- authority -- - pair authority(); - - - // -- auth pins -- - int is_auth_pinned() { - return auth_pins; - } - void adjust_nested_auth_pins(int a); - bool can_auth_pin(); - void auth_pin(); - void auth_unpin(); - - - // -- freeze -- - bool is_freezing_inode() { return state_test(STATE_FREEZING); } - bool is_frozen_inode() { return state_test(STATE_FROZEN); } - bool is_frozen(); - bool is_frozen_dir(); - bool is_freezing(); - - bool freeze_inode(int auth_pin_allowance=0); - void unfreeze_inode(list& finished); - - - // -- reference counting -- - void bad_put(int by) { - generic_dout(7) << " bad put " << *this << " by " << by << " " << pin_name(by) << " was " << ref << " (" << ref_set << ")" << dendl; -#ifdef MDS_REF_SET - assert(ref_set.count(by) == 1); -#endif - assert(ref > 0); - } - void bad_get(int by) { - generic_dout(7) << " bad get " << *this << " by " << by << " " << pin_name(by) << " was " << ref << " (" << ref_set << ")" << dendl; -#ifdef MDS_REF_SET - assert(ref_set.count(by) == 0); -#endif - } - void first_get(); - void last_put(); - - - // -- hierarchy stuff -- -public: - void set_primary_parent(CDentry *p) { - assert(parent == 0); - parent = p; - } - void remove_primary_parent(CDentry *dn) { - assert(dn == parent); - parent = 0; - } - void add_remote_parent(CDentry *p); - void remove_remote_parent(CDentry *p); - int num_remote_parents() { - return remote_parents.size(); - } - - - /* - // for giving to clients - void get_dist_spec(set& ls, int auth, 
timepair_t& now) { - if (( is_dir() && popularity[MDS_POP_CURDOM].get(now) > g_conf.mds_bal_replicate_threshold) || - (!is_dir() && popularity[MDS_POP_JUSTME].get(now) > g_conf.mds_bal_replicate_threshold)) { - //if (!cached_by.empty() && inode.ino > 1) dout(1) << "distributed spec for " << *this << dendl; - ls = cached_by; - } - } - */ - - void print(ostream& out); - -}; - - - - -// -- encoded state - -// discover - -class CInodeDiscover { - - inode_t inode; - string symlink; - fragtree_t dirfragtree; - - int replica_nonce; - - int authlock_state; - int linklock_state; - int dirfragtreelock_state; - int filelock_state; - int dirlock_state; - - public: - CInodeDiscover() {} - CInodeDiscover(CInode *in, int nonce) { - inode = in->inode; - symlink = in->symlink; - dirfragtree = in->dirfragtree; - - replica_nonce = nonce; - - authlock_state = in->authlock.get_replica_state(); - linklock_state = in->linklock.get_replica_state(); - dirfragtreelock_state = in->dirfragtreelock.get_replica_state(); - filelock_state = in->filelock.get_replica_state(); - dirlock_state = in->dirlock.get_replica_state(); - } - - inodeno_t get_ino() { return inode.ino; } - int get_replica_nonce() { return replica_nonce; } - - void update_inode(CInode *in) { - in->inode = inode; - in->symlink = symlink; - in->dirfragtree = dirfragtree; - in->replica_nonce = replica_nonce; - } - void init_inode_locks(CInode *in) { - in->authlock.set_state(authlock_state); - in->linklock.set_state(linklock_state); - in->dirfragtreelock.set_state(dirfragtreelock_state); - in->filelock.set_state(filelock_state); - in->dirlock.set_state(dirlock_state); - } - - void _encode(bufferlist& bl) { - ::_encode(inode, bl); - ::_encode(symlink, bl); - dirfragtree._encode(bl); - ::_encode(replica_nonce, bl); - ::_encode(authlock_state, bl); - ::_encode(linklock_state, bl); - ::_encode(dirfragtreelock_state, bl); - ::_encode(filelock_state, bl); - ::_encode(dirlock_state, bl); - } - - void _decode(bufferlist& bl, int& off) { - 
::_decode(inode, bl, off); - ::_decode(symlink, bl, off); - dirfragtree._decode(bl, off); - ::_decode(replica_nonce, bl, off); - ::_decode(authlock_state, bl, off); - ::_decode(linklock_state, bl, off); - ::_decode(dirfragtreelock_state, bl, off); - ::_decode(filelock_state, bl, off); - ::_decode(dirlock_state, bl, off); - } - -}; - - - -#endif diff --git a/branches/sage/crush/mds/ClientMap.h b/branches/sage/crush/mds/ClientMap.h deleted file mode 100644 index c36e66d240a33..0000000000000 --- a/branches/sage/crush/mds/ClientMap.h +++ /dev/null @@ -1,194 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __CLIENTMAP_H -#define __CLIENTMAP_H - -#include "msg/Message.h" - -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "include/Context.h" -#include "mdstypes.h" - -class MDS; - -/* - * this structure is used by the MDS purely so that - * it can remember client addresses (entity_inst_t) - * for clients with an active session. - * - * it is also used to keep track of recently completed - * operations, should the client have to resubmit them - * (after a connection failure, etc.) 
- */ -class ClientMap { -private: - MDS *mds; - - version_t version; - version_t projected; - version_t committing; - version_t committed; - map > commit_waiters; - -public: - version_t get_version() { return version; } - version_t get_projected() { return projected; } - version_t get_committing() { return committing; } - version_t get_committed() { return committed; } - - version_t inc_projected() { return ++projected; } - void reset_projected() { projected = version; } - void set_committing(version_t v) { committing = v; } - void set_committed(version_t v) { committed = v; } - -private: - // affects version - hash_map client_inst; - - // does not affect version - set sessions; - set opening; - set closing; - -public: - bool empty() { - return client_inst.empty(); - } - - const entity_inst_t& get_inst(int client) { - assert(client_inst.count(client)); - return client_inst[client]; - } - const set& get_session_set() { return sessions; } - - bool is_opening(int c) { return opening.count(c); } - void add_opening(int c) { opening.insert(c); } - bool is_closing(int c) { return closing.count(c); } - void add_closing(int c) { closing.insert(c); } - bool have_session(int client) { - return client_inst.count(client); - } - void open_session(const entity_inst_t& inst) { - opening.erase(inst.name.num()); - client_inst[inst.name.num()] = inst; - sessions.insert(inst.name.num()); - version++; - } - void close_session(int client) { - closing.erase(client); - sessions.erase(client); - client_inst.erase(client); - version++; - } - -private: - // -- push sequence -- - hash_map client_push_seq; // seq # for messages pushed to client. 
- -public: - version_t inc_push_seq(int client) { - return ++client_push_seq[client]; - } - version_t get_push_seq(int client) { - return client_push_seq[client]; - } - - -private: - // -- completed requests -- - // client id -> tid -> result code - map > completed_requests; // completed client requests - map > waiting_for_trim; - version_t requestmapv; - -public: - void add_completed_request(metareqid_t ri) { - completed_requests[ri.client].insert(ri.tid); - requestmapv++; - } - void trim_completed_requests(int client, - tid_t mintid) { // zero means trim all! - map >::iterator p = completed_requests.find(client); - if (p == completed_requests.end()) - return; - - // trim - while (!p->second.empty() && (mintid == 0 || *p->second.begin() < mintid)) - p->second.erase(p->second.begin()); - if (p->second.empty()) - completed_requests.erase(p); - - // kick waiters - map >::iterator q = waiting_for_trim.find(client); - if (q != waiting_for_trim.end()) { - list fls; - while (!q->second.empty() && - (mintid == 0 || q->second.begin()->first < mintid)) { - fls.push_back(q->second.begin()->second); - q->second.erase(q->second.begin()); - } - if (q->second.empty()) - waiting_for_trim.erase(q); - finish_contexts(fls); - } - } - void add_trim_waiter(metareqid_t ri, Context *c) { - waiting_for_trim[ri.client][ri.tid] = c; - } - bool have_completed_request(metareqid_t ri) { - return completed_requests.count(ri.client) && - completed_requests[ri.client].count(ri.tid); - } - - - - ClientMap(MDS *m) : mds(m), - version(0), projected(0), committing(0), committed(0), - requestmapv(0) {} - - - // -- encoding -- - void encode(bufferlist& bl) { - bl.append((char*)&version, sizeof(version)); - ::_encode(client_inst, bl); - ::_encode(sessions, bl); - } - void decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(version), (char*)&version); - off += sizeof(version); - ::_decode(client_inst, bl, off); - ::_decode(sessions, bl, off); - - projected = committing = committed = version; - } - 
- - // -- loading, saving -- - inode_t inode; - list waiting_for_load; - - void init_inode(); - void load(Context *onload); - void _load_finish(bufferlist &bl); - void save(Context *onsave, version_t needv=0); - void _save_finish(version_t v); -}; - -#endif diff --git a/branches/sage/crush/mds/Locker.cc b/branches/sage/crush/mds/Locker.cc deleted file mode 100644 index 55f38cd799b5f..0000000000000 --- a/branches/sage/crush/mds/Locker.cc +++ /dev/null @@ -1,2905 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "MDS.h" -#include "MDCache.h" -#include "Locker.h" -#include "CInode.h" -#include "CDir.h" -#include "CDentry.h" - -#include "MDLog.h" -#include "MDSMap.h" - -#include "include/filepath.h" - -#include "events/EString.h" -#include "events/EUpdate.h" - -#include "msg/Messenger.h" - -#include "messages/MGenericMessage.h" -#include "messages/MDiscover.h" -#include "messages/MDiscoverReply.h" - -#include "messages/MDirUpdate.h" - -#include "messages/MInodeFileCaps.h" - -#include "messages/MLock.h" -#include "messages/MDentryUnlink.h" - -#include "messages/MClientRequest.h" -#include "messages/MClientFileCaps.h" - -#include "messages/MMDSSlaveRequest.h" - -#include -#include - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".locker " - - - -void Locker::dispatch(Message *m) -{ - - switch (m->get_type()) { - - // locking - case MSG_MDS_LOCK: - handle_lock((MLock*)m); - break; - - // cache fun - case MSG_MDS_INODEFILECAPS: - handle_inode_file_caps((MInodeFileCaps*)m); - break; - 
- case MSG_CLIENT_FILECAPS: - handle_client_file_caps((MClientFileCaps*)m); - break; - - - - default: - assert(0); - } -} - - -void Locker::send_lock_message(SimpleLock *lock, int msg) -{ - for (map::iterator it = lock->get_parent()->replicas_begin(); - it != lock->get_parent()->replicas_end(); - it++) { - if (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN) - continue; - MLock *m = new MLock(lock, msg, mds->get_nodeid()); - mds->send_message_mds(m, it->first, MDS_PORT_LOCKER); - } -} - -void Locker::send_lock_message(SimpleLock *lock, int msg, const bufferlist &data) -{ - for (map::iterator it = lock->get_parent()->replicas_begin(); - it != lock->get_parent()->replicas_end(); - it++) { - if (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN) - continue; - MLock *m = new MLock(lock, msg, mds->get_nodeid()); - m->set_data(data); - mds->send_message_mds(m, it->first, MDS_PORT_LOCKER); - } -} - - - - - - - - - - - -bool Locker::acquire_locks(MDRequest *mdr, - set &rdlocks, - set &wrlocks, - set &xlocks) -{ - if (mdr->done_locking) { - dout(10) << "acquire_locks " << *mdr << " -- done locking" << dendl; - return true; // at least we had better be! - } - dout(10) << "acquire_locks " << *mdr << dendl; - - set sorted; // sort everything we will lock - set mustpin = xlocks; // items to authpin - - // xlocks - for (set::iterator p = xlocks.begin(); p != xlocks.end(); ++p) { - dout(20) << " must xlock " << **p << " " << *(*p)->get_parent() << dendl; - sorted.insert(*p); - - // augment xlock with a versionlock? - if ((*p)->get_type() > LOCK_OTYPE_IVERSION) { - // inode version lock? - CInode *in = (CInode*)(*p)->get_parent(); - if (mdr->is_master()) { - // master. wrlock versionlock so we can pipeline inode updates to journal. - wrlocks.insert(&in->versionlock); - } else { - // slave. exclusively lock the inode version (i.e. 
block other journal updates) - xlocks.insert(&in->versionlock); - sorted.insert(&in->versionlock); - } - } - } - - // wrlocks - for (set::iterator p = wrlocks.begin(); p != wrlocks.end(); ++p) { - dout(20) << " must wrlock " << **p << " " << *(*p)->get_parent() << dendl; - sorted.insert(*p); - if ((*p)->get_parent()->is_auth()) - mustpin.insert(*p); - else if ((*p)->get_type() == LOCK_OTYPE_IDIR && - !(*p)->get_parent()->is_auth() && !((ScatterLock*)(*p))->can_wrlock()) { // we might have to request a scatter - dout(15) << " will also auth_pin " << *(*p)->get_parent() << " in case we need to request a scatter" << dendl; - mustpin.insert(*p); - } - } - - // rdlocks - for (set::iterator p = rdlocks.begin(); - p != rdlocks.end(); - ++p) { - dout(20) << " must rdlock " << **p << " " << *(*p)->get_parent() << dendl; - sorted.insert(*p); - } - - - // AUTH PINS - map > mustpin_remote; // mds -> (object set) - - // can i auth pin them all now? - for (set::iterator p = mustpin.begin(); - p != mustpin.end(); - ++p) { - MDSCacheObject *object = (*p)->get_parent(); - - dout(10) << " must authpin " << *object << dendl; - - if (mdr->is_auth_pinned(object)) - continue; - - if (!object->is_auth()) { - if (object->is_ambiguous_auth()) { - // wait - dout(10) << " ambiguous auth, waiting to authpin " << *object << dendl; - object->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr)); - mds->locker->drop_locks(mdr); - mdr->drop_local_auth_pins(); - return false; - } - mustpin_remote[object->authority().first].insert(object); - continue; - } - if (!object->can_auth_pin()) { - // wait - dout(10) << " can't auth_pin (freezing?), waiting to authpin " << *object << dendl; - object->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); - mds->locker->drop_locks(mdr); - mdr->drop_local_auth_pins(); - return false; - } - } - - // ok, grab local auth pins - for (set::iterator p = mustpin.begin(); - p != mustpin.end(); - ++p) { - 
MDSCacheObject *object = (*p)->get_parent(); - if (mdr->is_auth_pinned(object)) { - dout(10) << " already auth_pinned " << *object << dendl; - } else if (object->is_auth()) { - dout(10) << " auth_pinning " << *object << dendl; - mdr->auth_pin(object); - } - } - - // request remote auth_pins - if (!mustpin_remote.empty()) { - for (map >::iterator p = mustpin_remote.begin(); - p != mustpin_remote.end(); - ++p) { - dout(10) << "requesting remote auth_pins from mds" << p->first << dendl; - - MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_AUTHPIN); - for (set::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - dout(10) << " req remote auth_pin of " << **q << dendl; - MDSCacheObjectInfo info; - (*q)->set_object_info(info); - req->get_authpins().push_back(info); - } - mds->send_message_mds(req, p->first, MDS_PORT_SERVER); - - // put in waiting list - assert(mdr->more()->waiting_on_slave.count(p->first) == 0); - mdr->more()->waiting_on_slave.insert(p->first); - } - return false; - } - - // acquire locks. - // make sure they match currently acquired locks. - set::iterator existing = mdr->locks.begin(); - for (set::iterator p = sorted.begin(); - p != sorted.end(); - ++p) { - - // already locked? - if (existing != mdr->locks.end() && *existing == *p) { - // right kind? 
- SimpleLock *have = *existing; - existing++; - if (xlocks.count(*p) && mdr->xlocks.count(*p)) { - dout(10) << " already xlocked " << *have << " " << *have->get_parent() << dendl; - } - else if (wrlocks.count(*p) && mdr->wrlocks.count(*p)) { - dout(10) << " already wrlocked " << *have << " " << *have->get_parent() << dendl; - } - else if (rdlocks.count(*p) && mdr->rdlocks.count(*p)) { - dout(10) << " already rdlocked " << *have << " " << *have->get_parent() << dendl; - } - else assert(0); - continue; - } - - // hose any stray locks - while (existing != mdr->locks.end()) { - SimpleLock *stray = *existing; - existing++; - dout(10) << " unlocking out-of-order " << *stray << " " << *stray->get_parent() << dendl; - if (mdr->xlocks.count(stray)) - xlock_finish(stray, mdr); - else if (mdr->wrlocks.count(stray)) - wrlock_finish(stray, mdr); - else - rdlock_finish(stray, mdr); - } - - // lock - if (xlocks.count(*p)) { - if (!xlock_start(*p, mdr)) - return false; - dout(10) << " got xlock on " << **p << " " << *(*p)->get_parent() << dendl; - } else if (wrlocks.count(*p)) { - if (!wrlock_start(*p, mdr)) - return false; - dout(10) << " got wrlock on " << **p << " " << *(*p)->get_parent() << dendl; - } else { - if (!rdlock_start(*p, mdr)) - return false; - dout(10) << " got rdlock on " << **p << " " << *(*p)->get_parent() << dendl; - } - } - - // any extra unneeded locks? 
- while (existing != mdr->locks.end()) { - SimpleLock *stray = *existing; - existing++; - dout(10) << " unlocking extra " << *stray << " " << *stray->get_parent() << dendl; - if (mdr->xlocks.count(stray)) - xlock_finish(stray, mdr); - else if (mdr->wrlocks.count(stray)) - wrlock_finish(stray, mdr); - else - rdlock_finish(stray, mdr); - } - - return true; -} - - -void Locker::drop_locks(MDRequest *mdr) -{ - // leftover locks - while (!mdr->xlocks.empty()) - xlock_finish(*mdr->xlocks.begin(), mdr); - while (!mdr->rdlocks.empty()) - rdlock_finish(*mdr->rdlocks.begin(), mdr); - while (!mdr->wrlocks.empty()) - wrlock_finish(*mdr->wrlocks.begin(), mdr); -} - - -// generics - -bool Locker::rdlock_start(SimpleLock *lock, MDRequest *mdr) -{ - switch (lock->get_type()) { - case LOCK_OTYPE_IFILE: - return file_rdlock_start((FileLock*)lock, mdr); - case LOCK_OTYPE_IDIRFRAGTREE: - case LOCK_OTYPE_IDIR: - return scatter_rdlock_start((ScatterLock*)lock, mdr); - default: - return simple_rdlock_start(lock, mdr); - } -} - -void Locker::rdlock_finish(SimpleLock *lock, MDRequest *mdr) -{ - switch (lock->get_type()) { - case LOCK_OTYPE_IFILE: - return file_rdlock_finish((FileLock*)lock, mdr); - case LOCK_OTYPE_IDIRFRAGTREE: - case LOCK_OTYPE_IDIR: - return scatter_rdlock_finish((ScatterLock*)lock, mdr); - default: - return simple_rdlock_finish(lock, mdr); - } -} - -bool Locker::wrlock_start(SimpleLock *lock, MDRequest *mdr) -{ - switch (lock->get_type()) { - case LOCK_OTYPE_IDIRFRAGTREE: - case LOCK_OTYPE_IDIR: - return scatter_wrlock_start((ScatterLock*)lock, mdr); - case LOCK_OTYPE_IVERSION: - return local_wrlock_start((LocalLock*)lock, mdr); - default: - assert(0); - return false; - } -} - -void Locker::wrlock_finish(SimpleLock *lock, MDRequest *mdr) -{ - switch (lock->get_type()) { - case LOCK_OTYPE_IDIRFRAGTREE: - case LOCK_OTYPE_IDIR: - return scatter_wrlock_finish((ScatterLock*)lock, mdr); - case LOCK_OTYPE_IVERSION: - return local_wrlock_finish((LocalLock*)lock, mdr); - 
default: - assert(0); - } -} - -bool Locker::xlock_start(SimpleLock *lock, MDRequest *mdr) -{ - switch (lock->get_type()) { - case LOCK_OTYPE_IFILE: - return file_xlock_start((FileLock*)lock, mdr); - case LOCK_OTYPE_IVERSION: - return local_xlock_start((LocalLock*)lock, mdr); - case LOCK_OTYPE_IDIRFRAGTREE: - case LOCK_OTYPE_IDIR: - assert(0); - default: - return simple_xlock_start(lock, mdr); - } -} - -void Locker::xlock_finish(SimpleLock *lock, MDRequest *mdr) -{ - switch (lock->get_type()) { - case LOCK_OTYPE_IFILE: - return file_xlock_finish((FileLock*)lock, mdr); - case LOCK_OTYPE_IVERSION: - return local_xlock_finish((LocalLock*)lock, mdr); - case LOCK_OTYPE_IDIRFRAGTREE: - case LOCK_OTYPE_IDIR: - assert(0); - default: - return simple_xlock_finish(lock, mdr); - } -} - - - -/** rejoin_set_state - * @lock the lock - * @s the new state - * @waiters list for anybody waiting on this lock - */ -void Locker::rejoin_set_state(SimpleLock *lock, int s, list& waiters) -{ - if (!lock->is_stable()) { - lock->set_state(s); - lock->get_parent()->auth_unpin(); - } else { - lock->set_state(s); - } - lock->take_waiting(SimpleLock::WAIT_ALL, waiters); -} - - - - -// file i/o ----------------------------------------- - -version_t Locker::issue_file_data_version(CInode *in) -{ - dout(7) << "issue_file_data_version on " << *in << dendl; - return in->inode.file_data_version; -} - - -Capability* Locker::issue_new_caps(CInode *in, - int mode, - MClientRequest *req) -{ - dout(7) << "issue_new_caps for mode " << mode << " on " << *in << dendl; - - // my needs - int my_client = req->get_client(); - int my_want = 0; - if (mode & FILE_MODE_R) my_want |= CAP_FILE_RDCACHE | CAP_FILE_RD; - if (mode & FILE_MODE_W) my_want |= CAP_FILE_WRBUFFER | CAP_FILE_WR; - - // register a capability - Capability *cap = in->get_client_cap(my_client); - if (!cap) { - // new cap - Capability c(my_want); - in->add_client_cap(my_client, c); - cap = in->get_client_cap(my_client); - - // suppress file cap 
messages for new cap (we'll bundle with the open() reply) - cap->set_suppress(true); - } else { - // make sure it has sufficient caps - if (my_want & ~cap->wanted()) { - // augment wanted caps for this client - cap->set_wanted( cap->wanted() | my_want ); - } - } - - int before = cap->pending(); - - if (in->is_auth()) { - // [auth] twiddle mode? - if (in->filelock.is_stable()) - file_eval(&in->filelock); - } else { - // [replica] tell auth about any new caps wanted - request_inode_file_caps(in); - } - - // issue caps (pot. incl new one) - issue_caps(in); // note: _eval above may have done this already... - - // re-issue whatever we can - cap->issue(cap->pending()); - - // ok, stop suppressing. - cap->set_suppress(false); - - int now = cap->pending(); - if (before != now && - (before & CAP_FILE_WR) == 0 && - (now & CAP_FILE_WR)) { - // FIXME FIXME FIXME - } - - // twiddle file_data_version? - if ((before & CAP_FILE_WRBUFFER) == 0 && - (now & CAP_FILE_WRBUFFER)) { - in->inode.file_data_version++; - dout(7) << " incrementing file_data_version, now " << in->inode.file_data_version << " for " << *in << dendl; - } - - return cap; -} - - - -bool Locker::issue_caps(CInode *in) -{ - // allowed caps are determined by the lock mode. - int allowed = in->filelock.caps_allowed(); - dout(7) << "issue_caps filelock allows=" << cap_string(allowed) - << " on " << *in << dendl; - - // count conflicts with - int nissued = 0; - - // client caps - for (map::iterator it = in->client_caps.begin(); - it != in->client_caps.end(); - it++) { - if (it->second.pending() != (it->second.wanted() & allowed)) { - // issue - nissued++; - - int before = it->second.pending(); - long seq = it->second.issue(it->second.wanted() & allowed); - int after = it->second.pending(); - - // twiddle file_data_version? 
- if (!(before & CAP_FILE_WRBUFFER) && - (after & CAP_FILE_WRBUFFER)) { - dout(7) << " incrementing file_data_version for " << *in << dendl; - in->inode.file_data_version++; - } - - if (seq > 0 && - !it->second.is_suppress()) { - dout(7) << " sending MClientFileCaps to client" << it->first << " seq " << it->second.get_last_seq() << " new pending " << cap_string(it->second.pending()) << " was " << cap_string(before) << dendl; - mds->send_message_client_maybe_opening(new MClientFileCaps(MClientFileCaps::OP_GRANT, - in->inode, - it->second.get_last_seq(), - it->second.pending(), - it->second.wanted()), - it->first); - } - } - } - - return (nissued == 0); // true if no re-issued, no callbacks -} - - -class C_MDL_RequestInodeFileCaps : public Context { - Locker *locker; - CInode *in; -public: - C_MDL_RequestInodeFileCaps(Locker *l, CInode *i) : locker(l), in(i) {} - void finish(int r) { - in->put(CInode::PIN_PTRWAITER); - if (!in->is_auth()) - locker->request_inode_file_caps(in); - } -}; - -void Locker::request_inode_file_caps(CInode *in) -{ - int wanted = in->get_caps_wanted(); - if (wanted != in->replica_caps_wanted) { - - if (wanted == 0) { - if (in->replica_caps_wanted_keep_until > g_clock.recent_now()) { - // ok, release them finally! 
- in->replica_caps_wanted_keep_until.sec_ref() = 0; - dout(7) << "request_inode_file_caps " << cap_string(wanted) - << " was " << cap_string(in->replica_caps_wanted) - << " no keeping anymore " - << " on " << *in - << dendl; - } - else if (in->replica_caps_wanted_keep_until.sec() == 0) { - in->replica_caps_wanted_keep_until = g_clock.recent_now(); - in->replica_caps_wanted_keep_until.sec_ref() += 2; - - dout(7) << "request_inode_file_caps " << cap_string(wanted) - << " was " << cap_string(in->replica_caps_wanted) - << " keeping until " << in->replica_caps_wanted_keep_until - << " on " << *in - << dendl; - return; - } else { - // wait longer - return; - } - } else { - in->replica_caps_wanted_keep_until.sec_ref() = 0; - } - assert(!in->is_auth()); - - // wait for single auth - if (in->is_ambiguous_auth()) { - in->get(CInode::PIN_PTRWAITER); - in->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, - new C_MDL_RequestInodeFileCaps(this, in)); - return; - } - - int auth = in->authority().first; - dout(7) << "request_inode_file_caps " << cap_string(wanted) - << " was " << cap_string(in->replica_caps_wanted) - << " on " << *in << " to mds" << auth << dendl; - assert(!in->is_auth()); - - in->replica_caps_wanted = wanted; - - if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) - mds->send_message_mds(new MInodeFileCaps(in->ino(), mds->get_nodeid(), - in->replica_caps_wanted), - auth, MDS_PORT_LOCKER); - } else { - in->replica_caps_wanted_keep_until.sec_ref() = 0; - } -} - -void Locker::handle_inode_file_caps(MInodeFileCaps *m) -{ - // nobody should be talking to us during recovery. 
- assert(mds->is_rejoin() || mds->is_active() || mds->is_stopping()); - - // ok - CInode *in = mdcache->get_inode(m->get_ino()); - assert(in); - assert(in->is_auth()); - - if (mds->is_rejoin() && - in->is_rejoining()) { - dout(7) << "handle_inode_file_caps still rejoining " << *in << ", dropping " << *m << dendl; - delete m; - return; - } - - - dout(7) << "handle_inode_file_caps replica mds" << m->get_from() << " wants caps " << cap_string(m->get_caps()) << " on " << *in << dendl; - - if (m->get_caps()) - in->mds_caps_wanted[m->get_from()] = m->get_caps(); - else - in->mds_caps_wanted.erase(m->get_from()); - - if (in->filelock.is_stable()) - try_file_eval(&in->filelock); // ** may or may not be auth_pinned ** - delete m; -} - - -/* - * note: we only get these from the client if - * - we are calling back previously issued caps (fewer than the client previously had) - * - or if the client releases (any of) its caps on its own - */ -void Locker::handle_client_file_caps(MClientFileCaps *m) -{ - int client = m->get_source().num(); - CInode *in = mdcache->get_inode(m->get_ino()); - Capability *cap = 0; - if (in) - cap = in->get_client_cap(client); - - if (!in || !cap) { - if (!in) { - dout(7) << "handle_client_file_caps on unknown ino " << m->get_ino() << ", dropping" << dendl; - } else { - dout(7) << "handle_client_file_caps no cap for client" << client << " on " << *in << dendl; - } - delete m; - return; - } - - assert(cap); - - // filter wanted based on what we could ever give out (given auth/replica status) - int wanted = m->get_wanted() & in->filelock.caps_allowed_ever(); - - dout(7) << "handle_client_file_caps seq " << m->get_seq() - << " confirms caps " << cap_string(m->get_caps()) - << " wants " << cap_string(wanted) - << " from client" << client - << " on " << *in - << dendl; - - // update wanted - if (cap->wanted() != wanted) { - if (m->get_seq() < cap->get_last_seq()) { - /* this is awkward. - client may be trying to release caps (i.e. inode closed, etc.) 
by setting reducing wanted - set. - but it may also be opening the same filename, not sure that it'll map to the same inode. - so, we don't want wanted reductions to clobber mds's notion of wanted unless we're - sure the client has seen all the latest caps. - */ - dout(10) << "handle_client_file_caps ignoring wanted " << cap_string(m->get_wanted()) - << " bc seq " << m->get_seq() << " < " << cap->get_last_seq() << dendl; - } else { - cap->set_wanted(wanted); - } - } - - // confirm caps - int had = cap->confirm_receipt(m->get_seq(), m->get_caps()); - int has = cap->confirmed(); - if (cap->is_null()) { - dout(7) << " cap for client" << client << " is now null, removing from " << *in << dendl; - in->remove_client_cap(client); - if (!in->is_any_caps()) - in->xlist_open_file.remove_myself(); // unpin logsegment - if (!in->is_auth()) - request_inode_file_caps(in); - - // tell client. - MClientFileCaps *r = new MClientFileCaps(MClientFileCaps::OP_RELEASE, - in->inode, - 0, 0, 0); - mds->send_message_client_maybe_open(r, m->get_source_inst()); - } - - // merge in atime? 
- if (m->get_inode().atime > in->inode.atime) { - dout(7) << " taking atime " << m->get_inode().atime << " > " - << in->inode.atime << " for " << *in << dendl; - in->inode.atime = m->get_inode().atime; - } - - if ((has|had) & CAP_FILE_WR) { - bool dirty = false; - - // mtime - if (m->get_inode().mtime > in->inode.mtime) { - dout(7) << " taking mtime " << m->get_inode().mtime << " > " - << in->inode.mtime << " for " << *in << dendl; - in->inode.mtime = m->get_inode().mtime; - dirty = true; - } - // size - if (m->get_inode().size > in->inode.size) { - dout(7) << " taking size " << m->get_inode().size << " > " - << in->inode.size << " for " << *in << dendl; - in->inode.size = m->get_inode().size; - dirty = true; - } - - if (dirty) - mds->mdlog->submit_entry(new EString("cap inode update dirty fixme")); - } - - // reevaluate, waiters - if (!in->filelock.is_stable()) - file_eval_gather(&in->filelock); - else if (in->is_auth()) - file_eval(&in->filelock); - - //in->finish_waiting(CInode::WAIT_CAPS, 0); // note: any users for this? - - delete m; -} - - - - - - - - - - -// locks ---------------------------------------------------------------- - -SimpleLock *Locker::get_lock(int lock_type, MDSCacheObjectInfo &info) -{ - switch (lock_type) { - case LOCK_OTYPE_DN: - { - // be careful; info.dirfrag may have incorrect frag; recalculate based on dname. 
- CInode *diri = mdcache->get_inode(info.dirfrag.ino); - frag_t fg; - CDir *dir = 0; - CDentry *dn = 0; - if (diri) { - fg = diri->pick_dirfrag(info.dname); - dir = diri->get_dirfrag(fg); - if (dir) - dn = dir->lookup(info.dname); - } - if (!dn) { - dout(7) << "get_lock don't have dn " << info.dirfrag.ino << " " << info.dname << dendl; - return 0; - } - return &dn->lock; - } - - case LOCK_OTYPE_IAUTH: - case LOCK_OTYPE_ILINK: - case LOCK_OTYPE_IDIRFRAGTREE: - case LOCK_OTYPE_IFILE: - case LOCK_OTYPE_IDIR: - { - CInode *in = mdcache->get_inode(info.ino); - if (!in) { - dout(7) << "get_lock don't have ino " << info.ino << dendl; - return 0; - } - switch (lock_type) { - case LOCK_OTYPE_IAUTH: return &in->authlock; - case LOCK_OTYPE_ILINK: return &in->linklock; - case LOCK_OTYPE_IDIRFRAGTREE: return &in->dirfragtreelock; - case LOCK_OTYPE_IFILE: return &in->filelock; - case LOCK_OTYPE_IDIR: return &in->dirlock; - } - } - - default: - dout(7) << "get_lock don't know lock_type " << lock_type << dendl; - assert(0); - break; - } - - return 0; -} - - -void Locker::handle_lock(MLock *m) -{ - // nobody should be talking to us during recovery. 
- assert(mds->is_rejoin() || mds->is_active() || mds->is_stopping()); - - SimpleLock *lock = get_lock(m->get_lock_type(), m->get_object_info()); - if (!lock) { - dout(10) << "don't have object " << m->get_object_info() << ", must have trimmed, dropping" << dendl; - delete m; - return; - } - - switch (lock->get_type()) { - case LOCK_OTYPE_DN: - case LOCK_OTYPE_IAUTH: - case LOCK_OTYPE_ILINK: - handle_simple_lock(lock, m); - break; - - case LOCK_OTYPE_IFILE: - handle_file_lock((FileLock*)lock, m); - break; - - case LOCK_OTYPE_IDIRFRAGTREE: - case LOCK_OTYPE_IDIR: - handle_scatter_lock((ScatterLock*)lock, m); - break; - - default: - dout(7) << "handle_lock got otype " << m->get_lock_type() << dendl; - assert(0); - break; - } -} - - - - - -// ========================================================================== -// simple lock - -void Locker::handle_simple_lock(SimpleLock *lock, MLock *m) -{ - int from = m->get_asker(); - - if (mds->is_rejoin()) { - if (lock->get_parent()->is_rejoining()) { - dout(7) << "handle_simple_lock still rejoining " << *lock->get_parent() - << ", dropping " << *m << dendl; - delete m; - return; - } - } - - switch (m->get_action()) { - // -- replica -- - case LOCK_AC_SYNC: - assert(lock->get_state() == LOCK_LOCK); - lock->decode_locked_state(m->get_data()); - lock->set_state(LOCK_SYNC); - lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); - - // special case: trim replica no-longer-null dentry? - if (lock->get_type() == LOCK_OTYPE_DN) { - CDentry *dn = (CDentry*)lock->get_parent(); - if (dn->is_null() && m->get_data().length() > 0) { - dout(10) << "handle_simple_lock replica dentry null -> non-null, must trim " - << *dn << dendl; - assert(dn->get_num_ref() == 0); - map expiremap; - mdcache->trim_dentry(dn, expiremap); - mdcache->send_expire_messages(expiremap); - } - } - break; - - case LOCK_AC_LOCK: - assert(lock->get_state() == LOCK_SYNC); - //|| lock->get_state() == LOCK_GLOCKR); - - // wait for readers to finish? 
- if (lock->is_rdlocked()) { - dout(7) << "handle_simple_lock has reader, waiting before ack on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->set_state(LOCK_GLOCKR); - } else { - // update lock and reply - lock->set_state(LOCK_LOCK); - mds->send_message_mds(new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid()), - from, MDS_PORT_LOCKER); - } - break; - - - // -- auth -- - case LOCK_AC_LOCKACK: - assert(lock->get_state() == LOCK_GLOCKR); - assert(lock->is_gathering(from)); - lock->remove_gather(from); - - if (lock->is_gathering()) { - dout(7) << "handle_simple_lock " << *lock << " on " << *lock->get_parent() << " from " << from - << ", still gathering " << lock->get_gather_set() << dendl; - } else { - dout(7) << "handle_simple_lock " << *lock << " on " << *lock->get_parent() << " from " << from - << ", last one" << dendl; - simple_eval_gather(lock); - } - break; - - } - - delete m; -} - -/* unused, currently. - -class C_Locker_SimpleEval : public Context { - Locker *locker; - SimpleLock *lock; -public: - C_Locker_SimpleEval(Locker *l, SimpleLock *lk) : locker(l), lock(lk) {} - void finish(int r) { - locker->try_simple_eval(lock); - } -}; - -void Locker::try_simple_eval(SimpleLock *lock) -{ - // unstable and ambiguous auth? 
- if (!lock->is_stable() && - lock->get_parent()->is_ambiguous_auth()) { - dout(7) << "simple_eval not stable and ambiguous auth, waiting on " << *lock->get_parent() << dendl; - //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_SimpleEval(this, lock)); - return; - } - - if (!lock->get_parent()->is_auth()) { - dout(7) << "try_simple_eval not auth for " << *lock->get_parent() << dendl; - return; - } - - if (!lock->get_parent()->can_auth_pin()) { - dout(7) << "try_simple_eval can't auth_pin, waiting on " << *lock->get_parent() << dendl; - //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - lock->get_parent()->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_Locker_SimpleEval(this, lock)); - return; - } - - if (lock->is_stable()) - simple_eval(lock); -} -*/ - -void Locker::simple_eval_gather(SimpleLock *lock) -{ - dout(10) << "simple_eval_gather " << *lock << " on " << *lock->get_parent() << dendl; - - // finished gathering? - if (lock->get_state() == LOCK_GLOCKR && - !lock->is_gathering() && - !lock->is_rdlocked()) { - dout(7) << "simple_eval finished gather on " << *lock << " on " << *lock->get_parent() << dendl; - - // replica: tell auth - if (!lock->get_parent()->is_auth()) { - int auth = lock->get_parent()->authority().first; - if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) - mds->send_message_mds(new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid()), - lock->get_parent()->authority().first, MDS_PORT_LOCKER); - } - - lock->set_state(LOCK_LOCK); - lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR); - - if (lock->get_parent()->is_auth()) { - lock->get_parent()->auth_unpin(); - - // re-eval? 
- simple_eval(lock); - } - } -} - -void Locker::simple_eval(SimpleLock *lock) -{ - dout(10) << "simple_eval " << *lock << " on " << *lock->get_parent() << dendl; - - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - // stable -> sync? - if (!lock->is_xlocked() && - lock->get_state() != LOCK_SYNC && - !lock->is_waiter_for(SimpleLock::WAIT_WR)) { - dout(7) << "simple_eval stable, syncing " << *lock - << " on " << *lock->get_parent() << dendl; - simple_sync(lock); - } - -} - - -// mid - -void Locker::simple_sync(SimpleLock *lock) -{ - dout(7) << "simple_sync on " << *lock << " on " << *lock->get_parent() << dendl; - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - // check state - if (lock->get_state() == LOCK_SYNC) - return; // already sync - assert(lock->get_state() == LOCK_LOCK); - - // sync. - if (lock->get_parent()->is_replicated()) { - // hard data - bufferlist data; - lock->encode_locked_state(data); - - // bcast to replicas - send_lock_message(lock, LOCK_AC_SYNC, data); - } - - // change lock - lock->set_state(LOCK_SYNC); - - // waiters? - lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); -} - -void Locker::simple_lock(SimpleLock *lock) -{ - dout(7) << "simple_lock on " << *lock << " on " << *lock->get_parent() << dendl; - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - // check state - if (lock->get_state() == LOCK_LOCK) return; - assert(lock->get_state() == LOCK_SYNC); - - if (lock->get_parent()->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_LOCK); - - // change lock - lock->set_state(LOCK_GLOCKR); - lock->init_gather(); - lock->get_parent()->auth_pin(); - } else { - lock->set_state(LOCK_LOCK); - } -} - - -// top - -bool Locker::simple_rdlock_try(SimpleLock *lock, Context *con) -{ - dout(7) << "simple_rdlock_try on " << *lock << " on " << *lock->get_parent() << dendl; - - // can read? grab ref. 
- if (lock->can_rdlock(0)) - return true; - - assert(!lock->get_parent()->is_auth()); - - // wait! - dout(7) << "simple_rdlock_try waiting on " << *lock << " on " << *lock->get_parent() << dendl; - if (con) lock->add_waiter(SimpleLock::WAIT_RD, con); - return false; -} - -bool Locker::simple_rdlock_start(SimpleLock *lock, MDRequest *mdr) -{ - dout(7) << "simple_rdlock_start on " << *lock << " on " << *lock->get_parent() << dendl; - - // can read? grab ref. - if (lock->can_rdlock(mdr)) { - lock->get_rdlock(); - mdr->rdlocks.insert(lock); - mdr->locks.insert(lock); - return true; - } - - // wait! - dout(7) << "simple_rdlock_start waiting on " << *lock << " on " << *lock->get_parent() << dendl; - lock->add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); - return false; -} - -void Locker::simple_rdlock_finish(SimpleLock *lock, MDRequest *mdr) -{ - // drop ref - lock->put_rdlock(); - if (mdr) { - mdr->rdlocks.erase(lock); - mdr->locks.erase(lock); - } - - dout(7) << "simple_rdlock_finish on " << *lock << " on " << *lock->get_parent() << dendl; - - // last one? - if (!lock->is_rdlocked()) - simple_eval_gather(lock); -} - -bool Locker::simple_xlock_start(SimpleLock *lock, MDRequest *mdr) -{ - dout(7) << "simple_xlock_start on " << *lock << " on " << *lock->get_parent() << dendl; - - // xlock by me? - if (lock->is_xlocked() && - lock->get_xlocked_by() == mdr) - return true; - - // auth? - if (lock->get_parent()->is_auth()) { - // auth - - // lock. - if (lock->get_state() == LOCK_SYNC) - simple_lock(lock); - - // already locked? - if (lock->get_state() == LOCK_LOCK) { - if (lock->is_xlocked()) { - // by someone else. - lock->add_waiter(SimpleLock::WAIT_WR, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - - // xlock. 
- lock->get_xlock(mdr); - mdr->xlocks.insert(lock); - mdr->locks.insert(lock); - return true; - } else { - // wait for lock - lock->add_waiter(SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - } else { - // replica - // this had better not be a remote xlock attempt! - assert(!mdr->slave_request); - - // wait for single auth - if (lock->get_parent()->is_ambiguous_auth()) { - lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, - new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - - // send lock request - int auth = lock->get_parent()->authority().first; - mdr->more()->slaves.insert(auth); - MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_XLOCK); - r->set_lock_type(lock->get_type()); - lock->get_parent()->set_object_info(r->get_object_info()); - mds->send_message_mds(r, auth, MDS_PORT_SERVER); - - // wait - lock->add_waiter(SimpleLock::WAIT_REMOTEXLOCK, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } -} - - -void Locker::simple_xlock_finish(SimpleLock *lock, MDRequest *mdr) -{ - dout(7) << "simple_xlock_finish on " << *lock << " on " << *lock->get_parent() << dendl; - - // drop ref - assert(lock->can_xlock(mdr)); - lock->put_xlock(); - assert(mdr); - mdr->xlocks.erase(lock); - mdr->locks.erase(lock); - - // remote xlock? - if (!lock->get_parent()->is_auth()) { - // tell auth - dout(7) << "simple_xlock_finish releasing remote xlock on " << *lock->get_parent() << dendl; - int auth = lock->get_parent()->authority().first; - if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) { - MMDSSlaveRequest *slavereq = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_UNXLOCK); - slavereq->set_lock_type(lock->get_type()); - lock->get_parent()->set_object_info(slavereq->get_object_info()); - mds->send_message_mds(slavereq, auth, MDS_PORT_SERVER); - } - } - - // others waiting? - lock->finish_waiters(SimpleLock::WAIT_WR, 0); - - // eval? 
- if (lock->get_parent()->is_auth()) - simple_eval(lock); -} - - - -// dentry specific helpers - -/** dentry_can_rdlock_trace - * see if we can _anonymously_ rdlock an entire trace. - * if not, and req is specified, wait and retry that message. - */ -bool Locker::dentry_can_rdlock_trace(vector& trace) -{ - // verify dentries are rdlockable. - // we do this because - // - we're being less aggressive about locks acquisition, and - // - we're not acquiring the locks in order! - for (vector::iterator it = trace.begin(); - it != trace.end(); - it++) { - CDentry *dn = *it; - if (!dn->lock.can_rdlock(0)) { - dout(10) << "can_rdlock_trace can't rdlock " << *dn << dendl; - return false; - } - } - return true; -} - -void Locker::dentry_anon_rdlock_trace_start(vector& trace) -{ - // grab dentry rdlocks - for (vector::iterator it = trace.begin(); - it != trace.end(); - it++) { - dout(10) << "dentry_anon_rdlock_trace_start rdlocking " << (*it)->lock << " " << **it << dendl; - (*it)->lock.get_rdlock(); - } -} - - -void Locker::dentry_anon_rdlock_trace_finish(vector& trace) -{ - for (vector::iterator it = trace.begin(); - it != trace.end(); - it++) - simple_rdlock_finish(&(*it)->lock, 0); -} - - - -// ========================================================================== -// scatter lock - -bool Locker::scatter_rdlock_start(ScatterLock *lock, MDRequest *mdr) -{ - dout(7) << "scatter_rdlock_start on " << *lock - << " on " << *lock->get_parent() << dendl; - - // read on stable scattered replica? - if (lock->get_state() == LOCK_SCATTER && - !lock->get_parent()->is_auth()) { - dout(7) << "scatter_rdlock_start scatterlock read on a stable scattered replica, fw to auth" << dendl; - mdcache->request_forward(mdr, lock->get_parent()->authority().first); - return false; - } - - // pre-twiddle? - if (lock->get_state() == LOCK_SCATTER && - lock->get_parent()->is_auth() && - !lock->get_parent()->is_replicated() && - !lock->is_wrlocked()) - scatter_sync(lock); - - // can rdlock? 
- if (lock->can_rdlock(mdr)) { - lock->get_rdlock(); - mdr->rdlocks.insert(lock); - mdr->locks.insert(lock); - return true; - } - - // wait for read. - lock->add_waiter(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); - - // initiate sync or tempsync? - if (lock->is_stable() && - lock->get_parent()->is_auth()) { - if (lock->get_parent()->is_replicated()) - scatter_tempsync(lock); - else - scatter_sync(lock); - } - - return false; -} - -void Locker::scatter_rdlock_finish(ScatterLock *lock, MDRequest *mdr) -{ - dout(7) << "scatter_rdlock_finish on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->put_rdlock(); - if (mdr) { - mdr->rdlocks.erase(lock); - mdr->locks.erase(lock); - } - - scatter_eval_gather(lock); -} - - -bool Locker::scatter_wrlock_start(ScatterLock *lock, MDRequest *mdr) -{ - dout(7) << "scatter_wrlock_start on " << *lock - << " on " << *lock->get_parent() << dendl; - - // pre-twiddle? - if (lock->get_parent()->is_auth() && - !lock->get_parent()->is_replicated() && - !lock->is_rdlocked() && - !lock->is_xlocked() && - lock->get_state() == LOCK_SYNC) - lock->set_state(LOCK_SCATTER); - //scatter_scatter(lock); - - // can wrlock? - if (lock->can_wrlock()) { - lock->get_wrlock(); - mdr->wrlocks.insert(lock); - mdr->locks.insert(lock); - return true; - } - - // wait for write. - lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE, - new C_MDS_RetryRequest(mdcache, mdr)); - - // initiate scatter or lock? - if (lock->is_stable()) { - if (lock->get_parent()->is_auth()) { - // auth. scatter or lock? - if (((CInode*)lock->get_parent())->has_subtree_root_dirfrag()) - scatter_scatter(lock); - else - scatter_lock(lock); - } else { - // replica. - // auth should be auth_pinned (see acquire_locks wrlock weird mustpin case). 
- int auth = lock->get_parent()->authority().first; - dout(10) << "requesting scatter from auth on " - << *lock << " on " << *lock->get_parent() << dendl; - mds->send_message_mds(new MLock(lock, LOCK_AC_REQSCATTER, mds->get_nodeid()), - auth, MDS_PORT_LOCKER); - } - } - - return false; -} - -void Locker::scatter_wrlock_finish(ScatterLock *lock, MDRequest *mdr) -{ - dout(7) << "scatter_wrlock_finish on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->put_wrlock(); - if (mdr) { - mdr->wrlocks.erase(lock); - mdr->locks.erase(lock); - } - - scatter_eval_gather(lock); -} - - -class C_Locker_ScatterEval : public Context { - Locker *locker; - ScatterLock *lock; -public: - C_Locker_ScatterEval(Locker *l, ScatterLock *lk) : locker(l), lock(lk) {} - void finish(int r) { - lock->get_parent()->put(CInode::PIN_PTRWAITER); - locker->try_scatter_eval(lock); - } -}; - - -void Locker::try_scatter_eval(ScatterLock *lock) -{ - // unstable and ambiguous auth? - if (!lock->is_stable() && - lock->get_parent()->is_ambiguous_auth()) { - dout(7) << "try_scatter_eval not stable and ambiguous auth, waiting on " << *lock->get_parent() << dendl; - //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - lock->get_parent()->get(CInode::PIN_PTRWAITER); - lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_ScatterEval(this, lock)); - return; - } - - if (!lock->get_parent()->is_auth()) { - dout(7) << "try_scatter_eval not auth for " << *lock->get_parent() << dendl; - return; - } - - if (!lock->get_parent()->can_auth_pin()) { - dout(7) << "try_scatter_eval can't auth_pin, waiting on " << *lock->get_parent() << dendl; - //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - lock->get_parent()->get(CInode::PIN_PTRWAITER); - lock->get_parent()->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_Locker_ScatterEval(this, lock)); - return; - } - - if (lock->is_stable()) - scatter_eval(lock); -} - - -void 
Locker::scatter_eval_gather(ScatterLock *lock) -{ - dout(10) << "scatter_eval_gather " << *lock << " on " << *lock->get_parent() << dendl; - - if (!lock->get_parent()->is_auth()) { - // REPLICA - - if (lock->get_state() == LOCK_GLOCKC && - !lock->is_wrlocked()) { - dout(10) << "scatter_eval no wrlocks, acking lock" << dendl; - int auth = lock->get_parent()->authority().first; - if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) { - bufferlist data; - lock->encode_locked_state(data); - mds->send_message_mds(new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid(), data), - auth, MDS_PORT_LOCKER); - } - lock->set_state(LOCK_LOCK); - } - - } else { - // AUTH - - // glocks|glockt -> lock? - if ((lock->get_state() == LOCK_GLOCKS || - lock->get_state() == LOCK_GLOCKT) && - !lock->is_gathering() && - !lock->is_rdlocked()) { - dout(7) << "scatter_eval finished lock gather/un-rdlock on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->set_state(LOCK_LOCK); - lock->finish_waiters(ScatterLock::WAIT_XLOCK|ScatterLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - } - - // glockc -> lock? - else if (lock->get_state() == LOCK_GLOCKC && - !lock->is_gathering() && - !lock->is_wrlocked()) { - if (lock->is_updated()) { - scatter_writebehind(lock); - } else { - dout(7) << "scatter_eval finished lock gather/un-wrlock on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->set_state(LOCK_LOCK); - lock->finish_waiters(ScatterLock::WAIT_XLOCK|ScatterLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - } - } - - // gSyncL -> sync? 
- else if (lock->get_state() == LOCK_GSYNCL && - !lock->is_wrlocked()) { - dout(7) << "scatter_eval finished sync un-wrlock on " << *lock - << " on " << *lock->get_parent() << dendl; - if (lock->get_parent()->is_replicated()) { - // encode and bcast - bufferlist data; - lock->encode_locked_state(data); - send_lock_message(lock, LOCK_AC_SYNC, data); - } - lock->set_state(LOCK_SYNC); - lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - } - - // gscattert|gscatters -> scatter? - else if ((lock->get_state() == LOCK_GSCATTERT || - lock->get_state() == LOCK_GSCATTERS) && - !lock->is_gathering() && - !lock->is_rdlocked()) { - dout(7) << "scatter_eval finished scatter un-rdlock(/gather) on " << *lock - << " on " << *lock->get_parent() << dendl; - if (lock->get_parent()->is_replicated()) { - // encode and bcast - bufferlist data; - lock->encode_locked_state(data); - send_lock_message(lock, LOCK_AC_SCATTER, data); - } - lock->set_state(LOCK_SCATTER); - lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - } - - // gTempsyncC|gTempsyncL -> tempsync - else if ((lock->get_state() == LOCK_GTEMPSYNCC || - lock->get_state() == LOCK_GTEMPSYNCL) && - !lock->is_gathering() && - !lock->is_wrlocked()) { - if (lock->is_updated()) { - scatter_writebehind(lock); - } else { - dout(7) << "scatter_eval finished tempsync gather/un-wrlock on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->set_state(LOCK_TEMPSYNC); - lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - } - } - - - // re-eval? - if (lock->is_stable()) // && lock->get_parent()->can_auth_pin()) - scatter_eval(lock); - } -} - -void Locker::scatter_writebehind(ScatterLock *lock) -{ - CInode *in = (CInode*)lock->get_parent(); - dout(10) << "scatter_writebehind " << in->inode.mtime << " on " << *lock << " on " << *in << dendl; - - // journal write-behind. 
- inode_t *pi = in->project_inode(); - pi->mtime = in->inode.mtime; // make sure an intermediate version isn't goofing us up - pi->version = in->pre_dirty(); - - EUpdate *le = new EUpdate(mds->mdlog, "scatter writebehind"); - le->metablob.add_dir_context(in->get_parent_dn()->get_dir()); - le->metablob.add_primary_dentry(in->get_parent_dn(), true, 0, pi); - - mds->mdlog->submit_entry(le); - mds->mdlog->wait_for_sync(new C_Locker_ScatterWB(this, lock, mds->mdlog->get_current_segment())); -} - -void Locker::scatter_writebehind_finish(ScatterLock *lock, LogSegment *ls) -{ - CInode *in = (CInode*)lock->get_parent(); - dout(10) << "scatter_writebehind_finish on " << *lock << " on " << *in << dendl; - in->pop_and_dirty_projected_inode(ls); - lock->clear_updated(); - scatter_eval_gather(lock); -} - -void Locker::scatter_eval(ScatterLock *lock) -{ - dout(10) << "scatter_eval " << *lock << " on " << *lock->get_parent() << dendl; - - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - CInode *in = (CInode*)lock->get_parent(); - if (in->has_subtree_root_dirfrag() && !in->is_base()) { - // i _should_ be scattered. - if (!lock->is_rdlocked() && - !lock->is_xlocked() && - lock->get_state() != LOCK_SCATTER) { - dout(10) << "scatter_eval no rdlocks|xlocks, am subtree root inode, scattering" << dendl; - scatter_scatter(lock); - autoscattered.push_back(&lock->xlistitem_autoscattered); - } - } else { - // i _should_ be sync. 
- lock->xlistitem_autoscattered.remove_myself(); - if (!lock->is_wrlocked() && - !lock->is_xlocked() && - lock->get_state() != LOCK_SYNC) { - dout(10) << "scatter_eval no wrlocks|xlocks, not subtree root inode, syncing" << dendl; - scatter_sync(lock); - } - } -} - -void Locker::note_autoscattered(ScatterLock *lock) -{ - dout(10) << "note_autoscattered " << *lock << " on " << *lock->get_parent() << dendl; - autoscattered.push_back(&lock->xlistitem_autoscattered); -} - - -/* - * this is called by LogSegment::try_to_trim() when trying to - * flush dirty scattered data (e.g. inode->dirlock mtime) back - * to the auth node. - */ -void Locker::scatter_try_unscatter(ScatterLock *lock, Context *c) -{ - dout(10) << "scatter_try_unscatter " << *lock << " on " << *lock->get_parent() << dendl; - assert(!lock->get_parent()->is_auth()); - assert(!lock->get_parent()->is_ambiguous_auth()); - - // request unscatter? - int auth = lock->get_parent()->authority().first; - if (lock->get_state() == LOCK_SCATTER && - mds->mdsmap->get_state(auth) >= MDSMap::STATE_ACTIVE) - mds->send_message_mds(new MLock(lock, LOCK_AC_REQUNSCATTER, mds->get_nodeid()), - auth, MDS_PORT_LOCKER); - - // wait... - lock->add_waiter(SimpleLock::WAIT_STABLE, c); -} - - -void Locker::scatter_sync(ScatterLock *lock) -{ - dout(10) << "scatter_sync " << *lock - << " on " << *lock->get_parent() << dendl; - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - switch (lock->get_state()) { - case LOCK_SYNC: - return; // already sync. - - case LOCK_TEMPSYNC: - break; // just do it. - - case LOCK_LOCK: - if (lock->is_wrlocked() || lock->is_xlocked()) { - lock->set_state(LOCK_GSYNCL); - lock->get_parent()->auth_pin(); - return; - } - break; // do it. - - case LOCK_SCATTER: - // lock first. this is the slow way, incidentally. 
- if (lock->get_parent()->is_replicated()) { - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - } else { - if (!lock->is_wrlocked()) { - break; // do it now, we're fine - } - } - lock->set_state(LOCK_GLOCKC); - lock->get_parent()->auth_pin(); - return; - - default: - assert(0); - } - - // do sync - if (lock->get_parent()->is_replicated()) { - // encode and bcast - bufferlist data; - lock->encode_locked_state(data); - send_lock_message(lock, LOCK_AC_SYNC, data); - } - - lock->set_state(LOCK_SYNC); - lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); -} - -void Locker::scatter_scatter(ScatterLock *lock) -{ - dout(10) << "scatter_scatter " << *lock - << " on " << *lock->get_parent() << dendl; - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - lock->set_last_scatter(g_clock.now()); - - switch (lock->get_state()) { - case LOCK_SYNC: - if (!lock->is_rdlocked() && - !lock->get_parent()->is_replicated()) - break; // do it - if (lock->get_parent()->is_replicated()) { - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - } - lock->set_state(LOCK_GSCATTERS); - lock->get_parent()->auth_pin(); - return; - - case LOCK_LOCK: - if (lock->is_xlocked()) - return; // sorry - break; // do it. - - case LOCK_SCATTER: - return; // did it. 
- - case LOCK_TEMPSYNC: - if (lock->is_rdlocked()) { - lock->set_state(LOCK_GSCATTERT); - lock->get_parent()->auth_pin(); - return; - } - break; // do it - - default: - assert(0); - } - - // do scatter - if (lock->get_parent()->is_replicated()) { - // encode and bcast - bufferlist data; - lock->encode_locked_state(data); - send_lock_message(lock, LOCK_AC_SCATTER, data); - } - lock->set_state(LOCK_SCATTER); - lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); -} - -void Locker::scatter_lock(ScatterLock *lock) -{ - dout(10) << "scatter_lock " << *lock - << " on " << *lock->get_parent() << dendl; - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - switch (lock->get_state()) { - case LOCK_SYNC: - if (!lock->is_rdlocked() && - !lock->get_parent()->is_replicated()) - break; // do it. - - if (lock->get_parent()->is_replicated()) { - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - } - lock->set_state(LOCK_GLOCKS); - lock->get_parent()->auth_pin(); - return; - - case LOCK_LOCK: - return; // done. - - case LOCK_SCATTER: - if (!lock->is_wrlocked() && - !lock->get_parent()->is_replicated()) { - break; // do it. - } - - if (lock->get_parent()->is_replicated()) { - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - } - lock->set_state(LOCK_GLOCKC); - lock->get_parent()->auth_pin(); - return; - - case LOCK_TEMPSYNC: - if (lock->is_rdlocked()) { - lock->set_state(LOCK_GLOCKT); - lock->get_parent()->auth_pin(); - return; - } - break; // do it. - } - - // do lock - lock->set_state(LOCK_LOCK); - lock->finish_waiters(ScatterLock::WAIT_XLOCK|ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); -} - -void Locker::scatter_tempsync(ScatterLock *lock) -{ - dout(10) << "scatter_tempsync " << *lock - << " on " << *lock->get_parent() << dendl; - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - switch (lock->get_state()) { - case LOCK_SYNC: - break; // do it. 
- - case LOCK_LOCK: - if (lock->is_wrlocked() || - lock->is_xlocked()) { - lock->set_state(LOCK_GTEMPSYNCL); - lock->get_parent()->auth_pin(); - return; - } - break; // do it. - - case LOCK_SCATTER: - if (!lock->is_wrlocked() && - !lock->get_parent()->is_replicated()) { - break; // do it. - } - - if (lock->get_parent()->is_replicated()) { - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - } - lock->set_state(LOCK_GTEMPSYNCC); - lock->get_parent()->auth_pin(); - return; - - case LOCK_TEMPSYNC: - return; // done - } - - // do tempsync - lock->set_state(LOCK_TEMPSYNC); - lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); -} - - - - -void Locker::handle_scatter_lock(ScatterLock *lock, MLock *m) -{ - int from = m->get_asker(); - dout(10) << "handle_scatter_lock " << *m << " on " << *lock << " on " << *lock->get_parent() << dendl; - - if (mds->is_rejoin()) { - if (lock->get_parent()->is_rejoining()) { - dout(7) << "handle_scatter_lock still rejoining " << *lock->get_parent() - << ", dropping " << *m << dendl; - delete m; - return; - } - } - - switch (m->get_action()) { - // -- replica -- - case LOCK_AC_SYNC: - assert(lock->get_state() == LOCK_LOCK); - lock->set_state(LOCK_SYNC); - lock->decode_locked_state(m->get_data()); - lock->clear_updated(); - lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); - break; - - case LOCK_AC_LOCK: - assert(lock->get_state() == LOCK_SCATTER || - lock->get_state() == LOCK_SYNC); - - // wait for wrlocks to close? 
- if (lock->is_wrlocked()) { - assert(lock->get_state() == LOCK_SCATTER); - dout(7) << "handle_scatter_lock has wrlocks, waiting on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->set_state(LOCK_GLOCKC); - } else if (lock->is_rdlocked()) { - assert(lock->get_state() == LOCK_SYNC); - dout(7) << "handle_scatter_lock has rdlocks, waiting on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->set_state(LOCK_GLOCKS); - } else { - dout(7) << "handle_scatter_lock has no rd|wrlocks, sending lockack for " << *lock - << " on " << *lock->get_parent() << dendl; - - // encode and reply - bufferlist data; - lock->encode_locked_state(data); - mds->send_message_mds(new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid(), data), - from, MDS_PORT_LOCKER); - lock->set_state(LOCK_LOCK); - } - break; - - case LOCK_AC_SCATTER: - assert(lock->get_state() == LOCK_LOCK); - lock->decode_locked_state(m->get_data()); - lock->clear_updated(); - lock->set_state(LOCK_SCATTER); - lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); - break; - - // -- for auth -- - case LOCK_AC_LOCKACK: - assert(lock->get_state() == LOCK_GLOCKS || - lock->get_state() == LOCK_GLOCKC || - lock->get_state() == LOCK_GSCATTERS || - lock->get_state() == LOCK_GTEMPSYNCC); - assert(lock->is_gathering(from)); - lock->remove_gather(from); - lock->decode_locked_state(m->get_data()); - - if (lock->is_gathering()) { - dout(7) << "handle_scatter_lock " << *lock << " on " << *lock->get_parent() - << " from " << from << ", still gathering " << lock->get_gather_set() - << dendl; - } else { - dout(7) << "handle_scatter_lock " << *lock << " on " << *lock->get_parent() - << " from " << from << ", last one" - << dendl; - scatter_eval_gather(lock); - } - break; - - case LOCK_AC_REQSCATTER: - if (lock->is_stable()) { - /* NOTE: we can do this _even_ if !can_auth_pin (i.e. 
freezing) - * because the replica should be holding an auth_pin if they're - * doing this (and thus, we are freezing, not frozen, and indefinite - * starvation isn't an issue). - */ - dout(7) << "handle_scatter_lock got scatter request on " << *lock - << " on " << *lock->get_parent() << dendl; - scatter_scatter(lock); - } else { - dout(7) << "handle_scatter_lock ignoring scatter request on " << *lock - << " on " << *lock->get_parent() << dendl; - } - break; - - case LOCK_AC_REQUNSCATTER: - if (!lock->is_stable()) { - dout(7) << "handle_scatter_lock ignoring now-unnecessary unscatter request on " << *lock - << " on " << *lock->get_parent() << dendl; - } else if (lock->get_parent()->can_auth_pin()) { - dout(7) << "handle_scatter_lock got unscatter request on " << *lock - << " on " << *lock->get_parent() << dendl; - scatter_lock(lock); - } else { - dout(7) << "handle_scatter_lock DROPPING unscatter request on " << *lock - << " on " << *lock->get_parent() << dendl; - /* FIXME: if we can't auth_pin here, this request is effectively lost... */ - } - } - - delete m; -} - - - -void Locker::scatter_unscatter_autoscattered() -{ - /* - * periodically unscatter autoscattered locks - */ - - dout(10) << "scatter_unscatter_autoscattered" << dendl; - - utime_t now = g_clock.now(); - int n = autoscattered.size(); - while (!autoscattered.empty()) { - ScatterLock *lock = autoscattered.front(); - - // stop? - if (lock->get_state() == LOCK_SCATTER && - now - lock->get_last_scatter() < 10.0) - break; - - autoscattered.pop_front(); - - if (lock->get_state() == LOCK_SCATTER && - lock->get_parent()->is_replicated()) { - if (((CInode*)lock->get_parent())->is_frozen() || - ((CInode*)lock->get_parent())->is_freezing()) { - // hrm.. requeue. 
- dout(10) << "last_scatter " << lock->get_last_scatter() - << ", now " << now << ", but frozen|freezing, requeueing" << dendl; - autoscattered.push_back(&lock->xlistitem_autoscattered); - } else { - dout(10) << "last_scatter " << lock->get_last_scatter() - << ", now " << now << ", locking" << dendl; - scatter_lock(lock); - } - } - if (--n == 0) break; - } -} - - - -// ========================================================================== -// local lock - - -bool Locker::local_wrlock_start(LocalLock *lock, MDRequest *mdr) -{ - dout(7) << "local_wrlock_start on " << *lock - << " on " << *lock->get_parent() << dendl; - - if (lock->can_wrlock()) { - lock->get_wrlock(); - mdr->wrlocks.insert(lock); - mdr->locks.insert(lock); - return true; - } else { - lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } -} - -void Locker::local_wrlock_finish(LocalLock *lock, MDRequest *mdr) -{ - dout(7) << "local_wrlock_finish on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->put_wrlock(); - mdr->wrlocks.erase(lock); - mdr->locks.erase(lock); -} - -bool Locker::local_xlock_start(LocalLock *lock, MDRequest *mdr) -{ - dout(7) << "local_xlock_start on " << *lock - << " on " << *lock->get_parent() << dendl; - - if (lock->is_xlocked_by_other(mdr)) { - lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - - lock->get_xlock(mdr); - mdr->xlocks.insert(lock); - mdr->locks.insert(lock); - return true; -} - -void Locker::local_xlock_finish(LocalLock *lock, MDRequest *mdr) -{ - dout(7) << "local_xlock_finish on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->put_xlock(); - mdr->xlocks.erase(lock); - mdr->locks.erase(lock); - - lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR); -} - - - -// ========================================================================== -// file lock - - -bool 
Locker::file_rdlock_start(FileLock *lock, MDRequest *mdr) -{ - dout(7) << "file_rdlock_start " << *lock << " on " << *lock->get_parent() << dendl; - - // can read? grab ref. - if (lock->can_rdlock(mdr)) { - lock->get_rdlock(); - mdr->rdlocks.insert(lock); - mdr->locks.insert(lock); - return true; - } - - // can't read, and replicated. - if (lock->can_rdlock_soon()) { - // wait - dout(7) << "file_rdlock_start can_rdlock_soon " << *lock << " on " << *lock->get_parent() << dendl; - } else { - if (lock->get_parent()->is_auth()) { - // auth - - // FIXME or qsync? - - if (lock->is_stable()) { - file_lock(lock); // lock, bc easiest to back off ... FIXME - - if (lock->can_rdlock(mdr)) { - lock->get_rdlock(); - mdr->rdlocks.insert(lock); - mdr->locks.insert(lock); - - lock->finish_waiters(SimpleLock::WAIT_STABLE); - return true; - } - } else { - dout(7) << "file_rdlock_start waiting until stable on " << *lock << " on " << *lock->get_parent() << dendl; - lock->add_waiter(SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - } else { - // replica - if (lock->is_stable()) { - - // fw to auth - CInode *in = (CInode*)lock->get_parent(); - int auth = in->authority().first; - dout(7) << "file_rdlock_start " << *lock << " on " << *lock->get_parent() << " on replica and async, fw to auth " << auth << dendl; - assert(auth != mds->get_nodeid()); - mdcache->request_forward(mdr, auth); - return false; - - } else { - // wait until stable - dout(7) << "inode_file_rdlock_start waiting until stable on " << *lock << " on " << *lock->get_parent() << dendl; - lock->add_waiter(SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - } - } - - // wait - dout(7) << "file_rdlock_start waiting on " << *lock << " on " << *lock->get_parent() << dendl; - lock->add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); - - return false; -} - - - -void Locker::file_rdlock_finish(FileLock *lock, MDRequest *mdr) -{ - dout(7) << 
"rdlock_finish on " << *lock << " on " << *lock->get_parent() << dendl; - - // drop ref - lock->put_rdlock(); - mdr->rdlocks.erase(lock); - mdr->locks.erase(lock); - - if (!lock->is_rdlocked()) - file_eval_gather(lock); -} - - -bool Locker::file_xlock_start(FileLock *lock, MDRequest *mdr) -{ - dout(7) << "file_xlock_start on " << *lock << " on " << *lock->get_parent() << dendl; - - assert(lock->get_parent()->is_auth()); // remote file xlock not implemented - - // already xlocked by me? - if (lock->get_xlocked_by() == mdr) - return true; - - // can't write? - if (!lock->can_xlock(mdr)) { - - // auth - if (!lock->can_xlock_soon()) { - if (!lock->is_stable()) { - dout(7) << "file_xlock_start on auth, waiting for stable on " << *lock << " on " << *lock->get_parent() << dendl; - lock->add_waiter(SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - - // initiate lock - file_lock(lock); - - // fall-thru to below. - } - } - - // check again - if (lock->can_xlock(mdr)) { - assert(lock->get_parent()->is_auth()); - lock->get_xlock(mdr); - mdr->locks.insert(lock); - mdr->xlocks.insert(lock); - return true; - } else { - dout(7) << "file_xlock_start on auth, waiting for write on " << *lock << " on " << *lock->get_parent() << dendl; - lock->add_waiter(SimpleLock::WAIT_WR, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } -} - - -void Locker::file_xlock_finish(FileLock *lock, MDRequest *mdr) -{ - dout(7) << "file_xlock_finish on " << *lock << " on " << *lock->get_parent() << dendl; - - // drop ref - assert(lock->can_xlock(mdr)); - lock->put_xlock(); - mdr->locks.erase(lock); - mdr->xlocks.erase(lock); - - assert(lock->get_parent()->is_auth()); // or implement remote xlocks - - // others waiting? - lock->finish_waiters(SimpleLock::WAIT_WR, 0); - - if (lock->get_parent()->is_auth()) - file_eval(lock); -} - - -/* - * ... 
- * - * also called after client caps are acked to us - * - checks if we're in unstable sfot state and can now move on to next state - * - checks if soft state should change (eg bc last writer closed) - */ -class C_Locker_FileEval : public Context { - Locker *locker; - FileLock *lock; -public: - C_Locker_FileEval(Locker *l, FileLock *lk) : locker(l), lock(lk) {} - void finish(int r) { - lock->get_parent()->put(CInode::PIN_PTRWAITER); - locker->try_file_eval(lock); - } -}; - -void Locker::try_file_eval(FileLock *lock) -{ - CInode *in = (CInode*)lock->get_parent(); - - // unstable and ambiguous auth? - if (!lock->is_stable() && - in->is_ambiguous_auth()) { - dout(7) << "try_file_eval not stable and ambiguous auth, waiting on " << *in << dendl; - //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - in->get(CInode::PIN_PTRWAITER); - in->add_waiter(CInode::WAIT_SINGLEAUTH, new C_Locker_FileEval(this, lock)); - return; - } - - if (!lock->get_parent()->is_auth()) { - dout(7) << "try_file_eval not auth for " << *lock->get_parent() << dendl; - return; - } - - if (!lock->get_parent()->can_auth_pin()) { - dout(7) << "try_file_eval can't auth_pin, waiting on " << *in << dendl; - //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - in->get(CInode::PIN_PTRWAITER); - in->add_waiter(CInode::WAIT_UNFREEZE, new C_Locker_FileEval(this, lock)); - return; - } - - if (lock->is_stable()) - file_eval(lock); -} - - - -void Locker::file_eval_gather(FileLock *lock) -{ - CInode *in = (CInode*)lock->get_parent(); - int issued = in->get_caps_issued(); - - dout(7) << "file_eval_gather issued " << cap_string(issued) - << " vs " << cap_string(lock->caps_allowed()) - << " on " << *lock << " on " << *lock->get_parent() - << dendl; - - if (lock->is_stable()) - return; // nothing for us to do here! - - // [auth] finished gather? 
- if (in->is_auth() && - !lock->is_gathering() && - ((issued & ~lock->caps_allowed()) == 0)) { - dout(7) << "file_eval_gather finished gather" << dendl; - - switch (lock->get_state()) { - // to lock - case LOCK_GLOCKR: - case LOCK_GLOCKM: - case LOCK_GLOCKL: - lock->set_state(LOCK_LOCK); - - // waiters - lock->get_rdlock(); - lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR|SimpleLock::WAIT_RD); - lock->put_rdlock(); - lock->get_parent()->auth_unpin(); - break; - - // to mixed - case LOCK_GMIXEDR: - lock->set_state(LOCK_MIXED); - lock->finish_waiters(SimpleLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - break; - - case LOCK_GMIXEDL: - lock->set_state(LOCK_MIXED); - - if (in->is_replicated()) { - // data - bufferlist softdata; - lock->encode_locked_state(softdata); - - // bcast to replicas - send_lock_message(lock, LOCK_AC_MIXED, softdata); - } - - lock->finish_waiters(SimpleLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - break; - - // to loner - case LOCK_GLONERR: - lock->set_state(LOCK_LONER); - lock->finish_waiters(SimpleLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - break; - - case LOCK_GLONERM: - lock->set_state(LOCK_LONER); - lock->finish_waiters(SimpleLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - break; - - // to sync - case LOCK_GSYNCL: - case LOCK_GSYNCM: - lock->set_state(LOCK_SYNC); - - { // bcast data to replicas - bufferlist softdata; - lock->encode_locked_state(softdata); - - send_lock_message(lock, LOCK_AC_SYNC, softdata); - } - - // waiters - lock->get_rdlock(); - lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); - lock->put_rdlock(); - lock->get_parent()->auth_unpin(); - break; - - default: - assert(0); - } - - issue_caps(in); - - // stable re-eval? - if (lock->is_stable()) //&& lock->get_parent()->can_auth_pin()) - file_eval(lock); - } - - // [replica] finished caps gather? 
- if (!in->is_auth() && - ((issued & ~lock->caps_allowed()) == 0)) { - switch (lock->get_state()) { - case LOCK_GMIXEDR: - { - lock->set_state(LOCK_MIXED); - - // ack - MLock *reply = new MLock(lock, LOCK_AC_MIXEDACK, mds->get_nodeid()); - mds->send_message_mds(reply, in->authority().first, MDS_PORT_LOCKER); - } - break; - - case LOCK_GLOCKR: - { - lock->set_state(LOCK_LOCK); - - // ack - MLock *reply = new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid()); - mds->send_message_mds(reply, in->authority().first, MDS_PORT_LOCKER); - } - break; - - default: - assert(0); - } - } - - -} - -void Locker::file_eval(FileLock *lock) -{ - CInode *in = (CInode*)lock->get_parent(); - int wanted = in->get_caps_wanted(); - bool loner = (in->client_caps.size() == 1) && in->mds_caps_wanted.empty(); - dout(7) << "file_eval wanted=" << cap_string(wanted) - << " filelock=" << *lock << " on " << *lock->get_parent() - << " loner=" << loner - << dendl; - - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - // not xlocked! - if (lock->is_xlocked()) return; - - // * -> loner? - if (!lock->is_rdlocked() && - !lock->is_waiter_for(SimpleLock::WAIT_WR) && - (wanted & CAP_FILE_WR) && - loner && - lock->get_state() != LOCK_LONER) { - dout(7) << "file_eval stable, bump to loner " << *lock << " on " << *lock->get_parent() << dendl; - file_loner(lock); - } - - // * -> mixed? - else if (!lock->is_rdlocked() && - !lock->is_waiter_for(SimpleLock::WAIT_WR) && - (wanted & CAP_FILE_RD) && - (wanted & CAP_FILE_WR) && - !(loner && lock->get_state() == LOCK_LONER) && - lock->get_state() != LOCK_MIXED) { - dout(7) << "file_eval stable, bump to mixed " << *lock << " on " << *lock->get_parent() << dendl; - file_mixed(lock); - } - - // * -> sync? 
- else if (!in->filelock.is_waiter_for(SimpleLock::WAIT_WR) && - !(wanted & (CAP_FILE_WR|CAP_FILE_WRBUFFER)) && - ((wanted & CAP_FILE_RD) || - in->is_replicated() || - (!loner && lock->get_state() == LOCK_LONER)) && - lock->get_state() != LOCK_SYNC) { - dout(7) << "file_eval stable, bump to sync " << *lock << " on " << *lock->get_parent() << dendl; - file_sync(lock); - } - - // * -> lock? (if not replicated or open) - else if (!in->is_replicated() && - wanted == 0 && - lock->get_state() != LOCK_LOCK) { - file_lock(lock); - } -} - - -// mid - -bool Locker::file_sync(FileLock *lock) -{ - CInode *in = (CInode*)lock->get_parent(); - dout(7) << "file_sync " << *lock << " on " << *lock->get_parent() << dendl; - - assert(in->is_auth()); - assert(lock->is_stable()); - - int issued = in->get_caps_issued(); - - assert((in->get_caps_wanted() & CAP_FILE_WR) == 0); - - if (lock->get_state() == LOCK_LOCK) { - if (in->is_replicated()) { - bufferlist softdata; - lock->encode_locked_state(softdata); - send_lock_message(lock, LOCK_AC_SYNC, softdata); - } - - // change lock - lock->set_state(LOCK_SYNC); - - issue_caps(in); // reissue caps - return true; - } - - else if (lock->get_state() == LOCK_MIXED) { - // writers? - if (issued & CAP_FILE_WR) { - // gather client write caps - lock->set_state(LOCK_GSYNCM); - lock->get_parent()->auth_pin(); - issue_caps(in); - } else { - // no writers, go straight to sync - if (in->is_replicated()) { - bufferlist softdata; - lock->encode_locked_state(softdata); - send_lock_message(lock, LOCK_AC_SYNC, softdata); - } - - // change lock - lock->set_state(LOCK_SYNC); - } - return false; - } - - else if (lock->get_state() == LOCK_LONER) { - // writers? 
- if (issued & CAP_FILE_WR) { - // gather client write caps - lock->set_state(LOCK_GSYNCL); - lock->get_parent()->auth_pin(); - issue_caps(in); - } else { - // no writers, go straight to sync - if (in->is_replicated()) { - bufferlist softdata; - lock->encode_locked_state(softdata); - send_lock_message(lock, LOCK_AC_SYNC, softdata); - } - - // change lock - lock->set_state(LOCK_SYNC); - } - return false; - } - else - assert(0); // wtf. - - return false; -} - - - -void Locker::file_lock(FileLock *lock) -{ - CInode *in = (CInode*)lock->get_parent(); - dout(7) << "inode_file_lock " << *lock << " on " << *lock->get_parent() << dendl; - - assert(in->is_auth()); - assert(lock->is_stable()); - - int issued = in->get_caps_issued(); - - if (lock->get_state() == LOCK_SYNC) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - - // change lock - lock->set_state(LOCK_GLOCKR); - lock->get_parent()->auth_pin(); - - // call back caps - if (issued) - issue_caps(in); - } else { - if (issued) { - // call back caps - lock->set_state(LOCK_GLOCKR); - lock->get_parent()->auth_pin(); - issue_caps(in); - } else { - lock->set_state(LOCK_LOCK); - } - } - } - - else if (lock->get_state() == LOCK_MIXED) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - - // change lock - lock->set_state(LOCK_GLOCKM); - lock->get_parent()->auth_pin(); - - // call back caps - issue_caps(in); - } else { - //assert(issued); // ??? 
-sage 2/19/06 - if (issued) { - // change lock - lock->set_state(LOCK_GLOCKM); - lock->get_parent()->auth_pin(); - - // call back caps - issue_caps(in); - } else { - lock->set_state(LOCK_LOCK); - } - } - - } - else if (lock->get_state() == LOCK_LONER) { - if (issued & CAP_FILE_WR) { - // change lock - lock->set_state(LOCK_GLOCKL); - lock->get_parent()->auth_pin(); - - // call back caps - issue_caps(in); - } else { - lock->set_state(LOCK_LOCK); - } - } - else - assert(0); // wtf. -} - - -void Locker::file_mixed(FileLock *lock) -{ - dout(7) << "file_mixed " << *lock << " on " << *lock->get_parent() << dendl; - - CInode *in = (CInode*)lock->get_parent(); - assert(in->is_auth()); - assert(lock->is_stable()); - - int issued = in->get_caps_issued(); - - if (lock->get_state() == LOCK_SYNC) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_MIXED); - lock->init_gather(); - - lock->set_state(LOCK_GMIXEDR); - lock->get_parent()->auth_pin(); - - issue_caps(in); - } else { - if (issued) { - lock->set_state(LOCK_GMIXEDR); - lock->get_parent()->auth_pin(); - issue_caps(in); - } else { - lock->set_state(LOCK_MIXED); - } - } - } - - else if (lock->get_state() == LOCK_LOCK) { - if (in->is_replicated()) { - // data - bufferlist softdata; - lock->encode_locked_state(softdata); - - // bcast to replicas - send_lock_message(lock, LOCK_AC_MIXED, softdata); - } - - // change lock - lock->set_state(LOCK_MIXED); - issue_caps(in); - } - - else if (lock->get_state() == LOCK_LONER) { - if (issued & CAP_FILE_WRBUFFER) { - // gather up WRBUFFER caps - lock->set_state(LOCK_GMIXEDL); - lock->get_parent()->auth_pin(); - issue_caps(in); - } - else if (in->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_MIXED); - lock->set_state(LOCK_MIXED); - issue_caps(in); - } else { - lock->set_state(LOCK_MIXED); - issue_caps(in); - } - } - - else - assert(0); // wtf. 
-} - - -void Locker::file_loner(FileLock *lock) -{ - CInode *in = (CInode*)lock->get_parent(); - dout(7) << "inode_file_loner " << *lock << " on " << *lock->get_parent() << dendl; - - assert(in->is_auth()); - assert(lock->is_stable()); - - assert((in->client_caps.size() == 1) && in->mds_caps_wanted.empty()); - - if (lock->get_state() == LOCK_SYNC) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - - // change lock - lock->set_state(LOCK_GLONERR); - lock->get_parent()->auth_pin(); - } else { - // only one guy with file open, who gets it all, so - lock->set_state(LOCK_LONER); - issue_caps(in); - } - } - - else if (lock->get_state() == LOCK_LOCK) { - // change lock. ignore replicas; they don't know about LONER. - lock->set_state(LOCK_LONER); - issue_caps(in); - } - - else if (lock->get_state() == LOCK_MIXED) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - - // change lock - lock->set_state(LOCK_GLONERM); - lock->get_parent()->auth_pin(); - } else { - lock->set_state(LOCK_LONER); - issue_caps(in); - } - } - - else - assert(0); -} - - - -// messenger - -void Locker::handle_file_lock(FileLock *lock, MLock *m) -{ - CInode *in = (CInode*)lock->get_parent(); - int from = m->get_asker(); - - if (mds->is_rejoin()) { - if (in->is_rejoining()) { - dout(7) << "handle_file_lock still rejoining " << *in - << ", dropping " << *m << dendl; - delete m; - return; - } - } - - - dout(7) << "handle_file_lock a=" << m->get_action() << " from " << from << " " - << *in << " filelock=" << *lock << dendl; - - int issued = in->get_caps_issued(); - - switch (m->get_action()) { - // -- replica -- - case LOCK_AC_SYNC: - assert(lock->get_state() == LOCK_LOCK || - lock->get_state() == LOCK_MIXED); - - lock->decode_locked_state(m->get_data()); - lock->set_state(LOCK_SYNC); - - // no need to reply. 
- - // waiters - lock->get_rdlock(); - lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); - lock->put_rdlock(); - file_eval_gather(lock); - break; - - case LOCK_AC_LOCK: - assert(lock->get_state() == LOCK_SYNC || - lock->get_state() == LOCK_MIXED); - - lock->set_state(LOCK_GLOCKR); - - // call back caps? - if (issued & CAP_FILE_RD) { - dout(7) << "handle_file_lock client readers, gathering caps on " << *in << dendl; - issue_caps(in); - break; - } - else if (lock->is_rdlocked()) { - dout(7) << "handle_file_lock rdlocked, waiting before ack on " << *in << dendl; - break; - } - - // nothing to wait for, lock and ack. - { - lock->set_state(LOCK_LOCK); - - MLock *reply = new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid()); - mds->send_message_mds(reply, from, MDS_PORT_LOCKER); - } - break; - - case LOCK_AC_MIXED: - assert(lock->get_state() == LOCK_SYNC || - lock->get_state() == LOCK_LOCK); - - if (lock->get_state() == LOCK_SYNC) { - // MIXED - if (issued & CAP_FILE_RD) { - // call back client caps - lock->set_state(LOCK_GMIXEDR); - issue_caps(in); - break; - } else { - // no clients, go straight to mixed - lock->set_state(LOCK_MIXED); - - // ack - MLock *reply = new MLock(lock, LOCK_AC_MIXEDACK, mds->get_nodeid()); - mds->send_message_mds(reply, from, MDS_PORT_LOCKER); - } - } else { - // LOCK - lock->set_state(LOCK_MIXED); - - // no ack needed. 
- } - - issue_caps(in); - - // waiters - lock->finish_waiters(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE); - file_eval_gather(lock); - break; - - - - - // -- auth -- - case LOCK_AC_LOCKACK: - assert(lock->get_state() == LOCK_GLOCKR || - lock->get_state() == LOCK_GLOCKM || - lock->get_state() == LOCK_GLONERM || - lock->get_state() == LOCK_GLONERR); - assert(lock->is_gathering(from)); - lock->remove_gather(from); - - if (lock->is_gathering()) { - dout(7) << "handle_lock_inode_file " << *in << " from " << from - << ", still gathering " << lock->get_gather_set() << dendl; - } else { - dout(7) << "handle_lock_inode_file " << *in << " from " << from - << ", last one" << dendl; - file_eval_gather(lock); - } - break; - - case LOCK_AC_SYNCACK: - assert(lock->get_state() == LOCK_GSYNCM); - assert(lock->is_gathering(from)); - lock->remove_gather(from); - - /* not used currently - { - // merge data (keep largest size, mtime, etc.) - int off = 0; - in->decode_merge_file_state(m->get_data(), off); - } - */ - - if (lock->is_gathering()) { - dout(7) << "handle_lock_inode_file " << *in << " from " << from - << ", still gathering " << lock->get_gather_set() << dendl; - } else { - dout(7) << "handle_lock_inode_file " << *in << " from " << from - << ", last one" << dendl; - file_eval_gather(lock); - } - break; - - case LOCK_AC_MIXEDACK: - assert(lock->get_state() == LOCK_GMIXEDR); - assert(lock->is_gathering(from)); - lock->remove_gather(from); - - if (lock->is_gathering()) { - dout(7) << "handle_lock_inode_file " << *in << " from " << from - << ", still gathering " << lock->get_gather_set() << dendl; - } else { - dout(7) << "handle_lock_inode_file " << *in << " from " << from - << ", last one" << dendl; - file_eval_gather(lock); - } - break; - - - default: - assert(0); - } - - delete m; -} - - - - - - diff --git a/branches/sage/crush/mds/LogEvent.cc b/branches/sage/crush/mds/LogEvent.cc deleted file mode 100644 index 05b4336c52f05..0000000000000 --- 
a/branches/sage/crush/mds/LogEvent.cc +++ /dev/null @@ -1,83 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "LogEvent.h" - -#include "MDS.h" - -// events i know of -#include "events/EString.h" - -#include "events/ESession.h" -#include "events/ESubtreeMap.h" -#include "events/EExport.h" -#include "events/EImportStart.h" -#include "events/EImportFinish.h" -#include "events/EFragment.h" - -#include "events/EUpdate.h" -#include "events/ESlaveUpdate.h" -#include "events/EOpen.h" - -#include "events/EPurgeFinish.h" - -#include "events/EAnchor.h" -#include "events/EAnchorClient.h" - - - -LogEvent *LogEvent::decode(bufferlist& bl) -{ - // parse type, length - int off = 0; - int type; - bl.copy(off, sizeof(type), (char*)&type); - off += sizeof(type); - - int length = bl.length() - off; - generic_dout(15) << "decode_log_event type " << type << ", size " << length << dendl; - - assert(type > 0); - - // create event - LogEvent *le; - switch (type) { - case EVENT_STRING: le = new EString; break; - - case EVENT_SESSION: le = new ESession; break; - case EVENT_SUBTREEMAP: le = new ESubtreeMap; break; - case EVENT_EXPORT: le = new EExport; break; - case EVENT_IMPORTSTART: le = new EImportStart; break; - case EVENT_IMPORTFINISH: le = new EImportFinish; break; - case EVENT_FRAGMENT: le = new EFragment; break; - - case EVENT_UPDATE: le = new EUpdate; break; - case EVENT_SLAVEUPDATE: le = new ESlaveUpdate; break; - case EVENT_OPEN: le = new EOpen; break; - - case EVENT_PURGEFINISH: le = new EPurgeFinish; break; - - case EVENT_ANCHOR: le = new EAnchor; break; - case EVENT_ANCHORCLIENT: le = new 
EAnchorClient; break; - default: - generic_dout(1) << "uh oh, unknown log event type " << type << dendl; - assert(0); - } - - // decode - le->decode_payload(bl, off); - - return le; -} - diff --git a/branches/sage/crush/mds/LogEvent.h b/branches/sage/crush/mds/LogEvent.h deleted file mode 100644 index 8f2f55f342bb3..0000000000000 --- a/branches/sage/crush/mds/LogEvent.h +++ /dev/null @@ -1,95 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __LOGEVENT_H -#define __LOGEVENT_H - -#define EVENT_STRING 1 - -#define EVENT_SESSION 7 -#define EVENT_SUBTREEMAP 2 -#define EVENT_EXPORT 30 -#define EVENT_IMPORTSTART 31 -#define EVENT_IMPORTFINISH 32 -#define EVENT_FRAGMENT 33 - -#define EVENT_UPDATE 3 -#define EVENT_SLAVEUPDATE 4 -#define EVENT_OPEN 5 - -#define EVENT_PURGEFINISH 22 - -#define EVENT_ANCHOR 40 -#define EVENT_ANCHORCLIENT 41 - - - - -#include -using namespace std; - -#include "include/buffer.h" -#include "include/Context.h" - -class MDS; -class LogSegment; - -// generic log event -class LogEvent { - private: - int _type; - off_t _start_off,_end_off; - - friend class MDLog; - - public: - LogSegment *_segment; - - LogEvent(int t) : - _type(t), _start_off(0), _end_off(0), _segment(0) { } - virtual ~LogEvent() { } - - int get_type() { return _type; } - off_t get_start_off() { return _start_off; } - off_t get_end_off() { return _end_off; } - - // encoding - virtual void encode_payload(bufferlist& bl) = 0; - virtual void decode_payload(bufferlist& bl, int& off) = 0; - static LogEvent *decode(bufferlist &bl); - - - virtual void print(ostream& out) { - out << "event(" << _type << ")"; - } 
- - /*** live journal ***/ - /* update_segment() - adjust any state we need to in the LogSegment - */ - virtual void update_segment() { } - - /*** recovery ***/ - /* replay() - replay given event. this is idempotent. - */ - virtual void replay(MDS *m) { assert(0); } - - -}; - -inline ostream& operator<<(ostream& out, LogEvent& le) { - le.print(out); - return out; -} - -#endif diff --git a/branches/sage/crush/mds/LogSegment.h b/branches/sage/crush/mds/LogSegment.h deleted file mode 100644 index e73f5f8b61b9c..0000000000000 --- a/branches/sage/crush/mds/LogSegment.h +++ /dev/null @@ -1,69 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __LOGSEGMENT_H -#define __LOGSEGMENT_H - -#include "include/xlist.h" -#include "include/interval_set.h" -#include "include/Context.h" - -#include -using __gnu_cxx::hash_set; - -class CDir; -class CInode; -class CDentry; -class MDS; -class MDSlaveUpdate; - -class LogSegment { - public: - off_t offset; - int num_events; - - // dirty items - xlist dirty_dirfrags; - xlist dirty_inodes; - xlist dirty_dentries; - - xlist open_files; - xlist dirty_inode_mtimes; - - xlist slave_updates; - - //xlist purging_inodes; - map > purging_inodes; - - // committed anchor transactions - hash_set pending_commit_atids; - - // client request ids - map last_client_tids; - - // table version - version_t allocv; - version_t clientmapv; - version_t anchortablev; - - // try to expire - C_Gather *try_to_expire(MDS *mds); - - // cons - LogSegment(off_t off) : offset(off), num_events(0), - allocv(0), clientmapv(0), anchortablev(0) - { } -}; - -#endif diff --git a/branches/sage/crush/mds/MDBalancer.cc b/branches/sage/crush/mds/MDBalancer.cc deleted file mode 100644 index 8e9d0e2dd46fa..0000000000000 --- a/branches/sage/crush/mds/MDBalancer.cc +++ /dev/null @@ -1,1050 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "mdstypes.h" - -#include "MDBalancer.h" -#include "MDS.h" -#include "MDSMap.h" -#include "CInode.h" -#include "CDir.h" -#include "MDCache.h" -#include "Migrator.h" - -#include "include/Context.h" -#include "msg/Messenger.h" -#include "messages/MHeartbeat.h" - -#include -#include -using std::map; -using std::vector; - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug_mds || l<=g_conf.debug_mds_balancer) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".bal " - -#define MIN_LOAD 50 // ?? -#define MIN_REEXPORT 5 // will automatically reexport -#define MIN_OFFLOAD 10 // point at which i stop trying, close enough - - - -int MDBalancer::proc_message(Message *m) -{ - switch (m->get_type()) { - - case MSG_MDS_HEARTBEAT: - handle_heartbeat((MHeartbeat*)m); - break; - - default: - dout(1) << " balancer unknown message " << m->get_type() << dendl; - assert(0); - break; - } - - return 0; -} - - - - -void MDBalancer::tick() -{ - static int num_bal_times = g_conf.mds_bal_max; - static utime_t first = g_clock.now(); - utime_t now = g_clock.now(); - utime_t elapsed = now; - elapsed -= first; - - // sample? - if ((double)now - (double)last_sample > g_conf.mds_bal_sample_interval) { - dout(15) << "tick last_sample now " << now << dendl; - last_sample = now; - } - - // balance? - if (last_heartbeat == utime_t()) last_heartbeat = now; - if (true && - mds->get_nodeid() == 0 && - g_conf.mds_bal_interval > 0 && - (num_bal_times || - (g_conf.mds_bal_max_until >= 0 && - elapsed.sec() > g_conf.mds_bal_max_until)) && - mds->is_active() && - now.sec() - last_heartbeat.sec() >= g_conf.mds_bal_interval) { - last_heartbeat = now; - send_heartbeat(); - num_bal_times--; - } - - // hash? 
- if (true && - now.sec() - last_fragment.sec() > g_conf.mds_bal_fragment_interval) { - last_fragment = now; - do_fragmenting(); - } -} - - - - -class C_Bal_SendHeartbeat : public Context { -public: - MDS *mds; - C_Bal_SendHeartbeat(MDS *mds) { - this->mds = mds; - } - virtual void finish(int f) { - mds->balancer->send_heartbeat(); - } -}; - - -double mds_load_t::mds_load() -{ - switch(g_conf.mds_bal_mode) { - case 0: - return - .8 * auth.meta_load() + - .2 * all.meta_load() + - req_rate + - 10.0 * queue_len; - - case 1: - return req_rate + 10.0*queue_len; - - case 2: - return cpu_load_avg; - - } - assert(0); - return 0; -} - -mds_load_t MDBalancer::get_load() -{ - mds_load_t load; - - if (mds->mdcache->get_root()) { - list ls; - mds->mdcache->get_root()->get_dirfrags(ls); - for (list::iterator p = ls.begin(); - p != ls.end(); - p++) { - load.auth += (*p)->pop_auth_subtree_nested; - load.all += (*p)->pop_nested; - } - } else { - dout(20) << "get_load no root, no load" << dendl; - } - - load.req_rate = mds->get_req_rate(); - load.queue_len = mds->messenger->get_dispatch_queue_len(); - - ifstream cpu("/proc/loadavg"); - if (cpu.is_open()) - cpu >> load.cpu_load_avg; - - dout(15) << "get_load " << load << dendl; - return load; -} - -void MDBalancer::send_heartbeat() -{ - utime_t now = g_clock.now(); - if (!mds->mdcache->get_root()) { - dout(5) << "no root on send_heartbeat" << dendl; - mds->mdcache->open_root(new C_Bal_SendHeartbeat(mds)); - return; - } - - mds_load.clear(); - if (mds->get_nodeid() == 0) - beat_epoch++; - - // my load - mds_load_t load = get_load(); - mds_load[ mds->get_nodeid() ] = load; - - // import_map -- how much do i import from whom - map import_map; - set authsubs; - mds->mdcache->get_auth_subtrees(authsubs); - for (set::iterator it = authsubs.begin(); - it != authsubs.end(); - it++) { - CDir *im = *it; - int from = im->inode->authority().first; - if (from == mds->get_nodeid()) continue; - if (im->get_inode()->is_stray()) continue; - 
import_map[from] += im->pop_auth_subtree.meta_load(now); - } - mds_import_map[ mds->get_nodeid() ] = import_map; - - - dout(5) << "mds" << mds->get_nodeid() << " epoch " << beat_epoch << " load " << load << dendl; - for (map::iterator it = import_map.begin(); - it != import_map.end(); - it++) { - dout(5) << " import_map from " << it->first << " -> " << it->second << dendl; - } - - - set up; - mds->get_mds_map()->get_in_mds_set(up); - for (set::iterator p = up.begin(); p != up.end(); ++p) { - if (*p == mds->get_nodeid()) continue; - MHeartbeat *hb = new MHeartbeat(load, beat_epoch); - hb->get_import_map() = import_map; - mds->messenger->send_message(hb, - mds->mdsmap->get_inst(*p), - MDS_PORT_BALANCER, MDS_PORT_BALANCER); - } -} - -void MDBalancer::handle_heartbeat(MHeartbeat *m) -{ - dout(25) << "=== got heartbeat " << m->get_beat() << " from " << m->get_source().num() << " " << m->get_load() << dendl; - - if (!mds->is_active()) - return; - - if (!mds->mdcache->get_root()) { - dout(10) << "opening root on handle_heartbeat" << dendl; - mds->mdcache->open_root(new C_MDS_RetryMessage(mds, m)); - return; - } - - int who = m->get_source().num(); - - if (who == 0) { - dout(20) << " from mds0, new epoch" << dendl; - beat_epoch = m->get_beat(); - send_heartbeat(); - - show_imports(); - } - - mds_load[ who ] = m->get_load(); - mds_import_map[ who ] = m->get_import_map(); - - //dout(0) << " load is " << load << " have " << mds_load.size() << dendl; - - unsigned cluster_size = mds->get_mds_map()->get_num_in_mds(); - if (mds_load.size() == cluster_size) { - // let's go! - //export_empties(); // no! 
- do_rebalance(m->get_beat()); - } - - // done - delete m; -} - - -void MDBalancer::export_empties() -{ - dout(5) << "export_empties checking for empty imports" << dendl; - - for (map >::iterator it = mds->mdcache->subtrees.begin(); - it != mds->mdcache->subtrees.end(); - it++) { - CDir *dir = it->first; - if (!dir->is_auth() || - dir->is_ambiguous_auth() || - dir->is_freezing() || - dir->is_frozen()) - continue; - - if (!dir->inode->is_root() && dir->get_size() == 0) - mds->mdcache->migrator->export_empty_import(dir); - } -} - - - -double MDBalancer::try_match(int ex, double& maxex, - int im, double& maxim) -{ - if (maxex <= 0 || maxim <= 0) return 0.0; - - double howmuch = MIN(maxex, maxim); - if (howmuch <= 0) return 0.0; - - dout(5) << " - mds" << ex << " exports " << howmuch << " to mds" << im << dendl; - - if (ex == mds->get_nodeid()) - my_targets[im] += howmuch; - - exported[ex] += howmuch; - imported[im] += howmuch; - - maxex -= howmuch; - maxim -= howmuch; - - return howmuch; -} - - - -void MDBalancer::do_fragmenting() -{ - if (split_queue.empty()) { - dout(20) << "do_fragmenting has nothing to do" << dendl; - return; - } - - dout(0) << "do_fragmenting " << split_queue.size() << " dirs marked for possible splitting" << dendl; - - for (set::iterator i = split_queue.begin(); - i != split_queue.end(); - i++) { - CDir *dir = mds->mdcache->get_dirfrag(*i); - if (!dir) continue; - if (!dir->is_auth()) continue; - - dout(0) << "do_fragmenting splitting " << *dir << dendl; - mds->mdcache->split_dir(dir, 4); - } - split_queue.clear(); -} - - - -void MDBalancer::do_rebalance(int beat) -{ - int cluster_size = mds->get_mds_map()->get_num_mds(); - int whoami = mds->get_nodeid(); - utime_t now = g_clock.now(); - - dump_pop_map(); - - // reset - my_targets.clear(); - imported.clear(); - exported.clear(); - - dout(5) << " do_rebalance: cluster loads are" << dendl; - - mds->mdcache->migrator->clear_export_queue(); - - // rescale! 
turn my mds_load back into meta_load units - double load_fac = 1.0; - if (mds_load[whoami].mds_load() > 0) { - double metald = mds_load[whoami].auth.meta_load(now); - double mdsld = mds_load[whoami].mds_load(); - load_fac = metald / mdsld; - dout(7) << " load_fac is " << load_fac - << " <- " << mds_load[whoami].auth << " " << metald - << " / " << mdsld - << dendl; - } - - double total_load = 0; - multimap load_map; - for (int i=0; i( l, i )); - } - - // target load - target_load = total_load / (double)cluster_size; - dout(5) << "do_rebalance: my load " << my_load - << " target " << target_load - << " total " << total_load - << dendl; - - // under or over? - if (my_load < target_load * (1.0 + g_conf.mds_bal_min_rebalance)) { - dout(5) << " i am underloaded or barely overloaded, doing nothing." << dendl; - last_epoch_under = beat_epoch; - show_imports(); - return; - } - - last_epoch_over = beat_epoch; - - // am i over long enough? - if (last_epoch_under && beat_epoch - last_epoch_under < 2) { - dout(5) << " i am overloaded, but only for " << (beat_epoch - last_epoch_under) << " epochs" << dendl; - return; - } - - dout(5) << " i am sufficiently overloaded" << dendl; - - - // first separate exporters and importers - multimap importers; - multimap exporters; - set importer_set; - set exporter_set; - - for (multimap::iterator it = load_map.begin(); - it != load_map.end(); - it++) { - if (it->first < target_load) { - dout(15) << " mds" << it->second << " is importer" << dendl; - importers.insert(pair(it->first,it->second)); - importer_set.insert(it->second); - } else { - dout(15) << " mds" << it->second << " is exporter" << dendl; - exporters.insert(pair(it->first,it->second)); - exporter_set.insert(it->second); - } - } - - - // determine load transfer mapping - - if (true) { - // analyze import_map; do any matches i can - - dout(15) << " matching exporters to import sources" << dendl; - - // big -> small exporters - for (multimap::reverse_iterator ex = 
exporters.rbegin(); - ex != exporters.rend(); - ex++) { - double maxex = get_maxex(ex->second); - if (maxex <= .001) continue; - - // check importers. for now, just in arbitrary order (no intelligent matching). - for (map::iterator im = mds_import_map[ex->second].begin(); - im != mds_import_map[ex->second].end(); - im++) { - double maxim = get_maxim(im->first); - if (maxim <= .001) continue; - try_match(ex->second, maxex, - im->first, maxim); - if (maxex <= .001) break;; - } - } - } - - - if (1) { - if (beat % 2 == 1) { - // old way - dout(15) << " matching big exporters to big importers" << dendl; - // big exporters to big importers - multimap::reverse_iterator ex = exporters.rbegin(); - multimap::iterator im = importers.begin(); - while (ex != exporters.rend() && - im != importers.end()) { - double maxex = get_maxex(ex->second); - double maxim = get_maxim(im->second); - if (maxex < .001 || maxim < .001) break; - try_match(ex->second, maxex, - im->second, maxim); - if (maxex <= .001) ex++; - if (maxim <= .001) im++; - } - } else { - // new way - dout(15) << " matching small exporters to big importers" << dendl; - // small exporters to big importers - multimap::iterator ex = exporters.begin(); - multimap::iterator im = importers.begin(); - while (ex != exporters.end() && - im != importers.end()) { - double maxex = get_maxex(ex->second); - double maxim = get_maxim(im->second); - if (maxex < .001 || maxim < .001) break; - try_match(ex->second, maxex, - im->second, maxim); - if (maxex <= .001) ex++; - if (maxim <= .001) im++; - } - } - } - - - - // make a sorted list of my imports - map import_pop_map; - multimap import_from_map; - set fullauthsubs; - - mds->mdcache->get_fullauth_subtrees(fullauthsubs); - for (set::iterator it = fullauthsubs.begin(); - it != fullauthsubs.end(); - it++) { - CDir *im = *it; - if (im->get_inode()->is_stray()) continue; - - double pop = im->pop_auth_subtree.meta_load(now); - if (g_conf.mds_bal_idle_threshold > 0 && - pop < 
g_conf.mds_bal_idle_threshold && - im->inode != mds->mdcache->get_root() && - im->inode->authority().first != mds->get_nodeid()) { - dout(-5) << " exporting idle (" << pop << ") import " << *im - << " back to mds" << im->inode->authority().first - << dendl; - mds->mdcache->migrator->export_dir_nicely(im, im->inode->authority().first); - continue; - } - - import_pop_map[ pop ] = im; - int from = im->inode->authority().first; - dout(15) << " map: i imported " << *im << " from " << from << dendl; - import_from_map.insert(pair(from, im)); - } - - - - // do my exports! - set already_exporting; - double total_sent = 0; - double total_goal = 0; - - for (map::iterator it = my_targets.begin(); - it != my_targets.end(); - it++) { - - /* - double fac = 1.0; - if (false && total_goal > 0 && total_sent > 0) { - fac = total_goal / total_sent; - dout(-5) << " total sent is " << total_sent << " / " << total_goal << " -> fac 1/ " << fac << dendl; - if (fac > 1.0) fac = 1.0; - } - fac = .9 - .4 * ((float)g_conf.num_mds / 128.0); // hack magic fixme - */ - - int target = (*it).first; - double amount = (*it).second; - total_goal += amount; - - if (amount < MIN_OFFLOAD) continue; - if (amount / target_load < .2) continue; - - dout(5) << "want to send " << amount << " to mds" << target - //<< " .. " << (*it).second << " * " << load_fac - << " -> " << amount - << dendl;//" .. 
fudge is " << fudge << dendl; - double have = 0; - - - show_imports(); - - // search imports from target - if (import_from_map.count(target)) { - dout(5) << " aha, looking through imports from target mds" << target << dendl; - pair::iterator, multimap::iterator> p = - import_from_map.equal_range(target); - while (p.first != p.second) { - CDir *dir = (*p.first).second; - dout(5) << "considering " << *dir << " from " << (*p.first).first << dendl; - multimap::iterator plast = p.first++; - - if (dir->inode->is_root()) continue; - if (dir->is_freezing() || dir->is_frozen()) continue; // export pbly already in progress - double pop = dir->pop_auth_subtree.meta_load(now); - assert(dir->inode->authority().first == target); // cuz that's how i put it in the map, dummy - - if (pop <= amount-have) { - dout(-5) << "reexporting " << *dir - << " pop " << pop - << " back to mds" << target << dendl; - mds->mdcache->migrator->export_dir_nicely(dir, target); - have += pop; - import_from_map.erase(plast); - import_pop_map.erase(pop); - } else { - dout(5) << "can't reexport " << *dir << ", too big " << pop << dendl; - } - if (amount-have < MIN_OFFLOAD) break; - } - } - if (amount-have < MIN_OFFLOAD) { - total_sent += have; - continue; - } - - // any other imports - if (false) - for (map::iterator import = import_pop_map.begin(); - import != import_pop_map.end(); - import++) { - CDir *imp = (*import).second; - if (imp->inode->is_root()) continue; - - double pop = (*import).first; - if (pop < amount-have || pop < MIN_REEXPORT) { - dout(-5) << "reexporting " << *imp - << " pop " << pop - << " back to mds" << imp->inode->authority() - << dendl; - have += pop; - mds->mdcache->migrator->export_dir_nicely(imp, imp->inode->authority().first); - } - if (amount-have < MIN_OFFLOAD) break; - } - if (amount-have < MIN_OFFLOAD) { - //fudge = amount-have; - total_sent += have; - continue; - } - - // okay, search for fragments of my workload - set candidates; - 
mds->mdcache->get_fullauth_subtrees(candidates); - - list exports; - - for (set::iterator pot = candidates.begin(); - pot != candidates.end(); - pot++) { - if ((*pot)->get_inode()->is_stray()) continue; - find_exports(*pot, amount, exports, have, already_exporting, now); - if (have > amount-MIN_OFFLOAD) - break; - } - //fudge = amount - have; - total_sent += have; - - for (list::iterator it = exports.begin(); it != exports.end(); it++) { - dout(-5) << " - exporting " - << (*it)->pop_auth_subtree - << " " - << (*it)->pop_auth_subtree.meta_load(now) - << " to mds" << target - << " " << **it - << dendl; - mds->mdcache->migrator->export_dir_nicely(*it, target); - } - } - - dout(5) << "rebalance done" << dendl; - show_imports(); - -} - - - -void MDBalancer::find_exports(CDir *dir, - double amount, - list& exports, - double& have, - set& already_exporting, - utime_t now) -{ - double need = amount - have; - if (need < amount * g_conf.mds_bal_min_start) - return; // good enough! - double needmax = need * g_conf.mds_bal_need_max; - double needmin = need * g_conf.mds_bal_need_min; - double midchunk = need * g_conf.mds_bal_midchunk; - double minchunk = need * g_conf.mds_bal_minchunk; - - list bigger_rep, bigger_unrep; - multimap smaller; - - double dir_pop = dir->pop_auth_subtree.meta_load(now); - dout(7) << " find_exports in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << dendl; - - double subdir_sum = 0; - for (CDir::map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CInode *in = it->second->get_inode(); - if (!in) continue; - if (!in->is_dir()) continue; - - list dfls; - in->get_dirfrags(dfls); - for (list::iterator p = dfls.begin(); - p != dfls.end(); - ++p) { - CDir *subdir = *p; - if (!subdir->is_auth()) continue; - if (already_exporting.count(subdir)) continue; - - if (subdir->is_frozen()) continue; // can't export this right now! - - // how popular? 
- double pop = subdir->pop_auth_subtree.meta_load(now); - subdir_sum += pop; - dout(15) << " subdir pop " << pop << " " << *subdir << dendl; - - if (pop < minchunk) continue; - - // lucky find? - if (pop > needmin && pop < needmax) { - exports.push_back(subdir); - already_exporting.insert(subdir); - have += pop; - return; - } - - if (pop > need) { - if (subdir->is_rep()) - bigger_rep.push_back(subdir); - else - bigger_unrep.push_back(subdir); - } else - smaller.insert(pair(pop, subdir)); - } - } - dout(15) << " sum " << subdir_sum << " / " << dir_pop << dendl; - - // grab some sufficiently big small items - multimap::reverse_iterator it; - for (it = smaller.rbegin(); - it != smaller.rend(); - it++) { - - if ((*it).first < midchunk) - break; // try later - - dout(7) << " taking smaller " << *(*it).second << dendl; - - exports.push_back((*it).second); - already_exporting.insert((*it).second); - have += (*it).first; - if (have > needmin) - return; - } - - // apprently not enough; drill deeper into the hierarchy (if non-replicated) - for (list::iterator it = bigger_unrep.begin(); - it != bigger_unrep.end(); - it++) { - dout(15) << " descending into " << **it << dendl; - find_exports(*it, amount, exports, have, already_exporting, now); - if (have > needmin) - return; - } - - // ok fine, use smaller bits - for (; - it != smaller.rend(); - it++) { - dout(7) << " taking (much) smaller " << it->first << " " << *(*it).second << dendl; - - exports.push_back((*it).second); - already_exporting.insert((*it).second); - have += (*it).first; - if (have > needmin) - return; - } - - // ok fine, drill into replicated dirs - for (list::iterator it = bigger_rep.begin(); - it != bigger_rep.end(); - it++) { - dout(7) << " descending into replicated " << **it << dendl; - find_exports(*it, amount, exports, have, already_exporting, now); - if (have > needmin) - return; - } - -} - - - - -void MDBalancer::hit_inode(utime_t now, CInode *in, int type, int who) -{ - // hit inode - 
in->pop.get(type).hit(now); - - if (in->get_parent_dn()) - hit_dir(now, in->get_parent_dn()->get_dir(), type, who); -} -/* - // hit me - in->popularity[MDS_POP_JUSTME].pop[type].hit(now); - in->popularity[MDS_POP_NESTED].pop[type].hit(now); - if (in->is_auth()) { - in->popularity[MDS_POP_CURDOM].pop[type].hit(now); - in->popularity[MDS_POP_ANYDOM].pop[type].hit(now); - - dout(20) << "hit_inode " << type << " pop " - << in->popularity[MDS_POP_JUSTME].pop[type].get(now) << " me, " - << in->popularity[MDS_POP_NESTED].pop[type].get(now) << " nested, " - << in->popularity[MDS_POP_CURDOM].pop[type].get(now) << " curdom, " - << in->popularity[MDS_POP_CURDOM].pop[type].get(now) << " anydom" - << " on " << *in - << dendl; - } else { - dout(20) << "hit_inode " << type << " pop " - << in->popularity[MDS_POP_JUSTME].pop[type].get(now) << " me, " - << in->popularity[MDS_POP_NESTED].pop[type].get(now) << " nested, " - << " on " << *in - << dendl; - } - - // hit auth up to import - CDir *dir = in->get_parent_dir(); - if (dir) hit_dir(now, dir, type); -*/ - - -void MDBalancer::hit_dir(utime_t now, CDir *dir, int type, int who, double amount) -{ - // hit me - double v = dir->pop_me.get(type).hit(now, amount); - - //if (dir->ino() == inodeno_t(0x10000000000)) - //dout(0) << "hit_dir " << type << " pop " << v << " in " << *dir << dendl; - - // hit modify counter, if this was a modify - if (g_conf.num_mds > 2 && // FIXME >2 thing - !dir->inode->is_root() && // not root (for now at least) - dir->is_auth() && - - ((g_conf.mds_bal_split_size > 0 && - dir->get_size() > (unsigned)g_conf.mds_bal_split_size) || - (v > g_conf.mds_bal_split_rd && type == META_POP_IRD) || - (v > g_conf.mds_bal_split_wr && type == META_POP_IWR)) && - split_queue.count(dir->dirfrag()) == 0) { - dout(0) << "hit_dir " << type << " pop is " << v << ", putting in split_queue: " << *dir << dendl; - split_queue.insert(dir->dirfrag()); - } - - // replicate? 
- if (type == META_POP_IRD && who >= 0) { - dir->pop_spread.hit(now, who); - } - - double rd_adj = 0; - if (type == META_POP_IRD && - dir->last_popularity_sample < last_sample) { - float dir_pop = dir->pop_auth_subtree.get(type).get(now); // hmm?? - dir->last_popularity_sample = last_sample; - float pop_sp = dir->pop_spread.get(now); - dir_pop += pop_sp * 10; - - //if (dir->ino() == inodeno_t(0x10000000002)) - if (pop_sp > 0) { - dout(20) << "hit_dir " << type << " pop " << dir_pop << " spread " << pop_sp - << " " << dir->pop_spread.last[0] - << " " << dir->pop_spread.last[1] - << " " << dir->pop_spread.last[2] - << " " << dir->pop_spread.last[3] - << " in " << *dir << dendl; - } - - if (dir->is_auth() && !dir->is_ambiguous_auth()) { - if (!dir->is_rep() && - dir_pop >= g_conf.mds_bal_replicate_threshold) { - // replicate - float rdp = dir->pop_me.get(META_POP_IRD).get(now); - rd_adj = rdp / mds->get_mds_map()->get_num_mds() - rdp; - rd_adj /= 2.0; // temper somewhat - - dout(0) << "replicating dir " << *dir << " pop " << dir_pop << " .. 
rdp " << rdp << " adj " << rd_adj << dendl; - - dir->dir_rep = CDir::REP_ALL; - mds->mdcache->send_dir_updates(dir, true); - - // fixme this should adjust the whole pop hierarchy - dir->pop_me.get(META_POP_IRD).adjust(rd_adj); - dir->pop_auth_subtree.get(META_POP_IRD).adjust(rd_adj); - } - - if (dir->ino() != 1 && - dir->is_rep() && - dir_pop < g_conf.mds_bal_unreplicate_threshold) { - // unreplicate - dout(0) << "unreplicating dir " << *dir << " pop " << dir_pop << dendl; - - dir->dir_rep = CDir::REP_NONE; - mds->mdcache->send_dir_updates(dir); - } - } - } - - // adjust ancestors - bool hit_subtree = dir->is_auth(); // current auth subtree (if any) - bool hit_subtree_nested = dir->is_auth(); // all nested auth subtrees - - while (1) { - dir->pop_nested.get(type).hit(now, amount); - if (rd_adj != 0.0) - dir->pop_nested.get(META_POP_IRD).adjust(now, rd_adj); - - if (hit_subtree) { - dir->pop_auth_subtree.get(type).hit(now, amount); - if (rd_adj != 0.0) - dir->pop_auth_subtree.get(META_POP_IRD).adjust(now, rd_adj); - } - - if (hit_subtree_nested) { - dir->pop_auth_subtree_nested.get(type).hit(now, amount); - if (rd_adj != 0.0) - dir->pop_auth_subtree_nested.get(META_POP_IRD).adjust(now, rd_adj); - } - - if (dir->is_subtree_root()) - hit_subtree = false; // end of auth domain, stop hitting auth counters. - - if (dir->inode->get_parent_dn() == 0) break; - dir = dir->inode->get_parent_dn()->get_dir(); - } -} - - -/* - * subtract off an exported chunk. - * this excludes *dir itself (encode_export_dir should have take care of that) - * we _just_ do the parents' nested counters. - * - * NOTE: call me _after_ forcing *dir into a subtree root, - * but _before_ doing the encode_export_dirs. 
- */ -void MDBalancer::subtract_export(CDir *dir) -{ - dirfrag_load_vec_t subload = dir->pop_auth_subtree; - - while (true) { - dir = dir->inode->get_parent_dir(); - if (!dir) break; - - dir->pop_nested -= subload; - dir->pop_auth_subtree_nested -= subload; - } -} - - -void MDBalancer::add_import(CDir *dir) -{ - dirfrag_load_vec_t subload = dir->pop_auth_subtree; - - while (true) { - dir = dir->inode->get_parent_dir(); - if (!dir) break; - - dir->pop_nested += subload; - dir->pop_auth_subtree_nested += subload; - } -} - - - - - - -void MDBalancer::show_imports(bool external) -{ - mds->mdcache->show_subtrees(); -} - - -void MDBalancer::dump_pop_map() -{ - return; // this is dumb - - - char fn[20]; - sprintf(fn, "popdump.%d.mds%d", beat_epoch, mds->get_nodeid()); - - dout(1) << "dump_pop_map to " << fn << dendl; - - ofstream myfile; - myfile.open(fn); - - list iq; - if (mds->mdcache->root) - iq.push_back(mds->mdcache->root); - - utime_t now = g_clock.now(); - while (!iq.empty()) { - CInode *in = iq.front(); - iq.pop_front(); - - // pop stats - /*for (int a=0; apopularity[a].pop[b].get(now) << "\t"; - */ - - // recurse, depth-first. - if (in->is_dir()) { - - list dirs; - in->get_dirfrags(dirs); - for (list::iterator p = dirs.begin(); - p != dirs.end(); - ++p) { - CDir *dir = *p; - - myfile << (int)dir->pop_me.meta_load(now) << "\t"; - myfile << (int)dir->pop_nested.meta_load(now) << "\t"; - myfile << (int)dir->pop_auth_subtree.meta_load(now) << "\t"; - myfile << (int)dir->pop_auth_subtree_nested.meta_load(now) << "\t"; - - // filename last - string p; - in->make_path(p); - myfile << "." << p; - if (dir->get_frag() != frag_t()) - myfile << "___" << (unsigned)dir->get_frag(); - myfile << std::endl; //"/" << dir->get_frag() << dendl; - - // add contents - for (CDir::map_t::iterator q = dir->items.begin(); - q != dir->items.end(); - q++) - if (q->second->is_primary()) - iq.push_front(q->second->get_inode()); - } - } - - } - - myfile.close(); -} - - - -/* replicate? 
- - float dir_pop = dir->get_popularity(); - - if (dir->is_auth()) { - if (!dir->is_rep() && - dir_pop >= g_conf.mds_bal_replicate_threshold) { - // replicate - dout(5) << "replicating dir " << *in << " pop " << dir_pop << dendl; - - dir->dir_rep = CDIR_REP_ALL; - mds->mdcache->send_dir_updates(dir); - } - - if (dir->is_rep() && - dir_pop < g_conf.mds_bal_unreplicate_threshold) { - // unreplicate - dout(5) << "unreplicating dir " << *in << " pop " << dir_pop << dendl; - - dir->dir_rep = CDIR_REP_NONE; - mds->mdcache->send_dir_updates(dir); - } - } - -*/ diff --git a/branches/sage/crush/mds/MDCache.cc b/branches/sage/crush/mds/MDCache.cc deleted file mode 100644 index 32201986d9f40..0000000000000 --- a/branches/sage/crush/mds/MDCache.cc +++ /dev/null @@ -1,6281 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "MDCache.h" -#include "MDS.h" -#include "Server.h" -#include "Locker.h" -#include "MDLog.h" -#include "MDBalancer.h" -#include "AnchorClient.h" -#include "Migrator.h" - -#include "MDSMap.h" - -#include "CInode.h" -#include "CDir.h" - -#include "include/filepath.h" - -#include "msg/Message.h" -#include "msg/Messenger.h" - -#include "common/Logger.h" - -#include "osdc/Filer.h" - -#include "events/ESubtreeMap.h" -#include "events/EUpdate.h" -#include "events/ESlaveUpdate.h" -#include "events/EString.h" -#include "events/EPurgeFinish.h" -#include "events/EImportFinish.h" -#include "events/EFragment.h" - -#include "messages/MGenericMessage.h" - -#include "messages/MMDSResolve.h" -#include "messages/MMDSResolveAck.h" -#include "messages/MMDSCacheRejoin.h" - -#include "messages/MDiscover.h" -#include "messages/MDiscoverReply.h" - -//#include "messages/MInodeUpdate.h" -#include "messages/MDirUpdate.h" -#include "messages/MCacheExpire.h" - -#include "messages/MInodeFileCaps.h" - -#include "messages/MLock.h" -#include "messages/MDentryUnlink.h" - -#include "messages/MClientRequest.h" -#include "messages/MClientFileCaps.h" - -#include "messages/MMDSSlaveRequest.h" - -#include "messages/MMDSFragmentNotify.h" - - -#include "IdAllocator.h" - -#include "common/Timer.h" - -#include -#include -#include -#include -#include -using namespace std; - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".cache " - - - - -MDCache::MDCache(MDS *m) -{ - mds = m; - migrator = new Migrator(mds, this); - // renamer = new Renamer(mds, this); - root = NULL; - stray = NULL; - lru.lru_set_max(g_conf.mds_cache_size); - lru.lru_set_midpoint(g_conf.mds_cache_mid); - - did_shutdown_log_cap = false; -} - -MDCache::~MDCache() -{ - delete migrator; - //delete renamer; -} - - - -void MDCache::log_stat(Logger *logger) -{ - if (get_root()) { - utime_t now = g_clock.now(); - 
//logger->set("pop", (int)get_root()->pop_nested.meta_load(now)); - //logger->set("popauth", (int)get_root()->pop_auth_subtree_nested.meta_load(now)); - } - logger->set("c", lru.lru_get_size()); - logger->set("cpin", lru.lru_get_num_pinned()); - logger->set("ctop", lru.lru_get_top()); - logger->set("cbot", lru.lru_get_bot()); - logger->set("cptail", lru.lru_get_pintail()); -} - - -// - -bool MDCache::shutdown() -{ - if (lru.lru_get_size() > 0) { - dout(7) << "WARNING: mdcache shutodwn with non-empty cache" << dendl; - //show_cache(); - show_subtrees(); - //dump(); - } - return true; -} - - -// ==================================================================== -// some inode functions - -CInode *MDCache::create_inode() -{ - CInode *in = new CInode(this); - - // zero - memset(&in->inode, 0, sizeof(inode_t)); - - // assign ino - in->inode.ino = mds->idalloc->alloc_id(); - - in->inode.nlink = 1; // FIXME - - in->inode.layout = g_OSD_FileLayout; - - add_inode(in); // add - return in; -} - - -void MDCache::add_inode(CInode *in) -{ - // add to lru, inode map - assert(inode_map.count(in->ino()) == 0); // should be no dup inos! - inode_map[ in->ino() ] = in; - - if (in->ino() < MDS_INO_BASE) { - base_inodes.insert(in); - if (in->ino() == MDS_INO_ROOT) - set_root(in); - if (in->ino() == MDS_INO_STRAY(mds->get_nodeid())) - stray = in; - } -} - -void MDCache::remove_inode(CInode *o) -{ - dout(14) << "remove_inode " << *o << dendl; - - if (o->get_parent_dn()) { - // FIXME: multiple parents? - CDentry *dn = o->get_parent_dn(); - assert(!dn->is_dirty()); - dn->dir->unlink_inode(dn); // leave dentry ... FIXME? 
- } - - // remove from inode map - inode_map.erase(o->ino()); - - if (o->ino() < MDS_INO_BASE) { - assert(base_inodes.count(o)); - base_inodes.erase(o); - - if (o == root) root = 0; - if (o == stray) stray = 0; - } - - // delete it - delete o; -} - - - -CInode *MDCache::create_root_inode() -{ - CInode *root = new CInode(this); - memset(&root->inode, 0, sizeof(inode_t)); - root->inode.ino = MDS_INO_ROOT; - - // make it up (FIXME) - root->inode.mode = 0755 | INODE_MODE_DIR; - root->inode.size = 0; - root->inode.ctime = - root->inode.mtime = g_clock.now(); - - root->inode.nlink = 1; - root->inode.layout = g_OSD_MDDirLayout; - - root->force_auth = pair(0, CDIR_AUTH_UNKNOWN); - - add_inode( root ); - - return root; -} - - -void MDCache::open_root(Context *c) -{ - int whoami = mds->get_nodeid(); - - // open root inode - if (whoami == 0) { - // i am root inode - CInode *root = create_root_inode(); - - // root directory too - CDir *dir = root->get_or_open_dirfrag(this, frag_t()); - adjust_subtree_auth(dir, 0); - dir->dir_rep = CDir::REP_ALL; //NONE; - - show_subtrees(); - - if (c) { - c->finish(0); - delete c; - } - } else { - // request inode from root mds - discover_base_ino(MDS_INO_ROOT, c, 0); - } -} - -CInode *MDCache::create_stray_inode(int whose) -{ - if (whose < 0) whose = mds->get_nodeid(); - - CInode *in = new CInode(this, whose == mds->get_nodeid()); - memset(&in->inode, 0, sizeof(inode_t)); - in->inode.ino = MDS_INO_STRAY(whose); - - // make it up (FIXME) - in->inode.mode = 0755 | INODE_MODE_DIR; - in->inode.size = 0; - in->inode.ctime = - in->inode.mtime = g_clock.now(); - - in->inode.nlink = 1; - in->inode.layout = g_OSD_MDDirLayout; - - add_inode( in ); - - return in; -} - -void MDCache::open_local_stray() -{ - create_stray_inode(); - CDir *dir = stray->get_or_open_dirfrag(this, frag_t()); - adjust_subtree_auth(dir, mds->get_nodeid()); -} - -void MDCache::open_foreign_stray(int who, Context *c) -{ - inodeno_t ino = MDS_INO_STRAY(who); - dout(10) << 
"open_foreign_stray mds" << who << " " << ino << dendl; - assert(!have_inode(ino)); - - discover_base_ino(ino, c, who); -} - - -CDentry *MDCache::get_or_create_stray_dentry(CInode *in) -{ - string straydname; - in->name_stray_dentry(straydname); - - if (!stray) create_stray_inode(mds->get_nodeid()); - - frag_t fg = stray->pick_dirfrag(straydname); - - CDir *straydir = stray->get_or_open_dirfrag(this, fg); - - CDentry *straydn = straydir->lookup(straydname); - if (!straydn) - straydn = straydir->add_null_dentry(straydname); - - return straydn; -} - - - -MDSCacheObject *MDCache::get_object(MDSCacheObjectInfo &info) -{ - // inode? - if (info.ino) - return get_inode(info.ino); - - // dir or dentry. - CDir *dir = get_dirfrag(info.dirfrag); - if (!dir) return 0; - - if (info.dname.length()) - return dir->lookup(info.dname); - else - return dir; -} - - - - -// ==================================================================== -// subtree management - -void MDCache::list_subtrees(list& ls) -{ - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) - ls.push_back(p->first); -} - -/* - * adjust the dir_auth of a subtree. - * merge with parent and/or child subtrees, if is it appropriate. - * merge can ONLY happen if both parent and child have unambiguous auth. - */ -void MDCache::adjust_subtree_auth(CDir *dir, pair auth) -{ - dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth - << " on " << *dir << dendl; - - show_subtrees(); - - CDir *root; - if (dir->ino() < MDS_INO_BASE) { - root = dir; // bootstrap hack. - if (subtrees.count(root) == 0) { - subtrees[root].clear(); - root->get(CDir::PIN_SUBTREE); - } - } else { - root = get_subtree_root(dir); // subtree root - } - assert(root); - assert(subtrees.count(root)); - dout(7) << " current root is " << *root << dendl; - - if (root == dir) { - // i am already a subtree. - dir->set_dir_auth(auth); - } else { - // i am a new subtree. 
- dout(10) << " new subtree at " << *dir << dendl; - assert(subtrees.count(dir) == 0); - subtrees[dir].clear(); // create empty subtree bounds list for me. - dir->get(CDir::PIN_SUBTREE); - - // set dir_auth - dir->set_dir_auth(auth); - - // move items nested beneath me, under me. - set::iterator p = subtrees[root].begin(); - while (p != subtrees[root].end()) { - set::iterator next = p; - next++; - if (get_subtree_root((*p)->get_parent_dir()) == dir) { - // move under me - dout(10) << " claiming child bound " << **p << dendl; - subtrees[dir].insert(*p); - subtrees[root].erase(p); - } - p = next; - } - - // i am a bound of the parent subtree. - subtrees[root].insert(dir); - - // i am now the subtree root. - root = dir; - - // adjust recursive pop counters - if (dir->is_auth()) { - CDir *p = dir->get_parent_dir(); - while (p) { - p->pop_auth_subtree -= dir->pop_auth_subtree; - if (p->is_subtree_root()) break; - p = p->inode->get_parent_dir(); - } - } - - eval_subtree_root(dir); - } - - show_subtrees(); -} - - -void MDCache::try_subtree_merge(CDir *dir) -{ - dout(7) << "try_subtree_merge " << *dir << dendl; - assert(subtrees.count(dir)); - set oldbounds = subtrees[dir]; - - // try merge at my root - try_subtree_merge_at(dir); - - // try merge at my old bounds - for (set::iterator p = oldbounds.begin(); - p != oldbounds.end(); - ++p) - try_subtree_merge_at(*p); -} - -class C_MDC_SubtreeMergeWB : public Context { - MDCache *mdcache; - CInode *in; - LogSegment *ls; -public: - C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i, LogSegment *s) : mdcache(mdc), in(i), ls(s) {} - void finish(int r) { - mdcache->subtree_merge_writebehind_finish(in, ls); - } -}; - -void MDCache::try_subtree_merge_at(CDir *dir) -{ - dout(10) << "try_subtree_merge_at " << *dir << dendl; - assert(subtrees.count(dir)); - - // merge with parent? 
- CDir *parent = dir; - if (dir->ino() >= MDS_INO_BASE) - parent = get_subtree_root(dir->get_parent_dir()); - - if (parent != dir && // we have a parent, - parent->dir_auth == dir->dir_auth && // auth matches, - dir->dir_auth.second == CDIR_AUTH_UNKNOWN && // auth is unambiguous, - !dir->state_test(CDir::STATE_EXPORTBOUND)) { // not an exportbound, - // merge with parent. - dout(10) << " subtree merge at " << *dir << dendl; - dir->set_dir_auth(CDIR_AUTH_DEFAULT); - - // move our bounds under the parent - for (set::iterator p = subtrees[dir].begin(); - p != subtrees[dir].end(); - ++p) - subtrees[parent].insert(*p); - - // we are no longer a subtree or bound - dir->put(CDir::PIN_SUBTREE); - subtrees.erase(dir); - subtrees[parent].erase(dir); - - // adjust popularity? - if (dir->is_auth()) { - CDir *p = dir->get_parent_dir(); - while (p) { - p->pop_auth_subtree += dir->pop_auth_subtree; - if (p->is_subtree_root()) break; - p = p->inode->get_parent_dir(); - } - } - - eval_subtree_root(dir); - - // journal inode? - // (this is a large hammer to ensure that dirfragtree updates will - // hit the disk before the relevant dirfrags ever close) - if (dir->inode->is_auth() && - dir->inode->can_auth_pin() && - (mds->is_active() || mds->is_stopping())) { - CInode *in = dir->inode; - dout(10) << "try_subtree_merge_at journaling merged bound " << *in << dendl; - - in->auth_pin(); - - // journal write-behind. 
- inode_t *pi = in->project_inode(); - pi->version = in->pre_dirty(); - - EUpdate *le = new EUpdate(mds->mdlog, "subtree merge writebehind"); - le->metablob.add_dir_context(in->get_parent_dn()->get_dir()); - le->metablob.add_primary_dentry(in->get_parent_dn(), true, 0, pi); - - mds->mdlog->submit_entry(le); - mds->mdlog->wait_for_sync(new C_MDC_SubtreeMergeWB(this, in, - mds->mdlog->get_current_segment())); - } - } - - show_subtrees(15); -} - -void MDCache::subtree_merge_writebehind_finish(CInode *in, LogSegment *ls) -{ - dout(10) << "subtree_merge_writebehind_finish on " << in << dendl; - in->pop_and_dirty_projected_inode(ls); - in->auth_unpin(); -} - -void MDCache::eval_subtree_root(CDir *dir) -{ - // evaluate subtree inode dirlock? - // (we should scatter the dirlock on subtree bounds) - if (dir->inode->is_auth() && - dir->inode->dirlock.is_stable()) { - // force the issue a bit - if (!dir->inode->is_frozen()) - mds->locker->scatter_eval(&dir->inode->dirlock); - else - mds->locker->try_scatter_eval(&dir->inode->dirlock); // ** may or may not be auth_pinned ** - } - -} - - -void MDCache::adjust_bounded_subtree_auth(CDir *dir, set& bounds, pair auth) -{ - dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth - << " on " << *dir - << " bounds " << bounds - << dendl; - - show_subtrees(); - - CDir *root; - if (dir->ino() < MDS_INO_BASE) { - root = dir; // bootstrap hack. - if (subtrees.count(root) == 0) { - subtrees[root].clear(); - root->get(CDir::PIN_SUBTREE); - } - } else { - root = get_subtree_root(dir); // subtree root - } - assert(root); - assert(subtrees.count(root)); - dout(7) << " current root is " << *root << dendl; - - pair oldauth = dir->authority(); - - if (root == dir) { - // i am already a subtree. - dir->set_dir_auth(auth); - } else { - // i am a new subtree. - dout(10) << " new subtree at " << *dir << dendl; - assert(subtrees.count(dir) == 0); - subtrees[dir].clear(); // create empty subtree bounds list for me. 
- dir->get(CDir::PIN_SUBTREE); - - // set dir_auth - dir->set_dir_auth(auth); - - // move items nested beneath me, under me. - set::iterator p = subtrees[root].begin(); - while (p != subtrees[root].end()) { - set::iterator next = p; - next++; - if (get_subtree_root((*p)->get_parent_dir()) == dir) { - // move under me - dout(10) << " claiming child bound " << **p << dendl; - subtrees[dir].insert(*p); - subtrees[root].erase(p); - } - p = next; - } - - // i am a bound of the parent subtree. - subtrees[root].insert(dir); - - // i am now the subtree root. - root = dir; - } - - // verify/adjust bounds. - // - these may be new, or - // - beneath existing ambiguous bounds (which will be collapsed), - // - but NOT beneath unambiguous bounds. - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *bound = *p; - - // new bound? - if (subtrees[dir].count(bound) == 0) { - if (get_subtree_root(bound) == dir) { - dout(10) << " new bound " << *bound << ", adjusting auth back to old " << oldauth << dendl; - adjust_subtree_auth(bound, oldauth); // otherwise, adjust at bound. - } - else { - dout(10) << " want bound " << *bound << dendl; - // make sure it's nested beneath ambiguous subtree(s) - while (1) { - CDir *t = get_subtree_root(bound->get_parent_dir()); - if (t == dir) break; - while (subtrees[dir].count(t) == 0) - t = get_subtree_root(t->get_parent_dir()); - dout(10) << " swallowing intervening subtree at " << *t << dendl; - adjust_subtree_auth(t, auth); - try_subtree_merge_at(t); - } - } - } - else { - dout(10) << " already have bound " << *bound << dendl; - } - } - // merge stray bounds? - set::iterator p = subtrees[dir].begin(); - while (p != subtrees[dir].end()) { - set::iterator n = p; - n++; - if (bounds.count(*p) == 0) { - CDir *stray = *p; - dout(10) << " swallowing extra subtree at " << *stray << dendl; - adjust_subtree_auth(stray, auth); - try_subtree_merge_at(stray); - } - p = n; - } - - // bound should now match. 
- verify_subtree_bounds(dir, bounds); - - show_subtrees(); -} - - -void MDCache::adjust_bounded_subtree_auth(CDir *dir, list& bound_dfs, pair auth) -{ - dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth - << " on " << *dir - << " bound_dfs " << bound_dfs - << dendl; - - // make bounds list - set bounds; - for (list::iterator p = bound_dfs.begin(); - p != bound_dfs.end(); - ++p) { - CDir *bd = get_dirfrag(*p); - if (bd) - bounds.insert(bd); - } - - adjust_bounded_subtree_auth(dir, bounds, auth); -} - -void MDCache::map_dirfrag_set(list& dfs, set& result) -{ - // group by inode - map ino_fragset; - for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) - ino_fragset[p->ino].insert(p->frag); - - // get frags - for (map::iterator p = ino_fragset.begin(); - p != ino_fragset.end(); - ++p) { - CInode *in = get_inode(p->first); - if (!in) continue; - - list fglist; - for (set::iterator q = p->second.begin(); q != p->second.end(); ++q) - in->dirfragtree.get_leaves_under(*q, fglist); - - dout(15) << "map_dirfrag_set " << p->second << " -> " << fglist - << " on " << *in << dendl; - - for (list::iterator q = fglist.begin(); q != fglist.end(); ++q) { - CDir *dir = in->get_dirfrag(*q); - if (dir) result.insert(dir); - } - } -} - - - -CDir *MDCache::get_subtree_root(CDir *dir) -{ - // find the underlying dir that delegates (or is about to delegate) auth - while (true) { - if (dir->is_subtree_root()) - return dir; - dir = dir->get_parent_dir(); - if (!dir) - return 0; // none - } -} - -void MDCache::remove_subtree(CDir *dir) -{ - dout(10) << "remove_subtree " << *dir << dendl; - assert(subtrees.count(dir)); - assert(subtrees[dir].empty()); - subtrees.erase(dir); - dir->put(CDir::PIN_SUBTREE); - if (dir->get_parent_dir()) { - CDir *p = get_subtree_root(dir->get_parent_dir()); - assert(subtrees[p].count(dir)); - subtrees[p].erase(dir); - } -} - -void MDCache::get_subtree_bounds(CDir *dir, set& bounds) -{ - assert(subtrees.count(dir)); - bounds = 
subtrees[dir]; -} - -void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set& bounds) -{ - if (subtrees.count(dir)) { - // just copy them, dir is a subtree. - get_subtree_bounds(dir, bounds); - } else { - // find them - CDir *root = get_subtree_root(dir); - for (set::iterator p = subtrees[root].begin(); - p != subtrees[root].end(); - ++p) { - CDir *t = *p; - while (t != root) { - t = t->get_parent_dir(); - assert(t); - if (t == dir) { - bounds.insert(*p); - continue; - } - } - } - } -} - -void MDCache::verify_subtree_bounds(CDir *dir, const set& bounds) -{ - // for debugging only. - assert(subtrees.count(dir)); - if (bounds != subtrees[dir]) { - dout(0) << "verify_subtree_bounds failed" << dendl; - set b = bounds; - for (set::iterator p = subtrees[dir].begin(); - p != subtrees[dir].end(); - ++p) { - if (bounds.count(*p)) { - b.erase(*p); - continue; - } - dout(0) << " missing bound " << **p << dendl; - } - for (set::iterator p = b.begin(); - p != b.end(); - ++p) - dout(0) << " extra bound " << **p << dendl; - } - assert(bounds == subtrees[dir]); -} - -void MDCache::verify_subtree_bounds(CDir *dir, const list& bounds) -{ - // for debugging only. - assert(subtrees.count(dir)); - - // make sure that any bounds i do have are properly noted as such. 
- int failed = 0; - for (list::const_iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *bd = get_dirfrag(*p); - if (!bd) continue; - if (subtrees[dir].count(bd) == 0) { - dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl; - failed++; - } - } - assert(failed == 0); -} - -void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir) -{ - dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl; - - //show_subtrees(); - - list dfls; - diri->get_dirfrags(dfls); - for (list::iterator p = dfls.begin(); p != dfls.end(); ++p) { - CDir *dir = *p; - - dout(10) << "dirfrag " << *dir << dendl; - CDir *oldparent = get_subtree_root(olddir); - dout(10) << " old parent " << *oldparent << dendl; - CDir *newparent = get_subtree_root(diri->get_parent_dir()); - dout(10) << " new parent " << *newparent << dendl; - - if (oldparent == newparent) { - dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl; - continue; - } - - if (dir->is_subtree_root()) { - // children are fine. change parent. - dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl; - assert(subtrees[oldparent].count(dir)); - subtrees[oldparent].erase(dir); - assert(subtrees.count(newparent)); - subtrees[newparent].insert(dir); - } else { - // mid-subtree. - - // see if any old bounds move to the new parent. - list tomove; - for (set::iterator p = subtrees[oldparent].begin(); - p != subtrees[oldparent].end(); - ++p) { - CDir *bound = *p; - CDir *broot = get_subtree_root(bound->get_parent_dir()); - if (broot != oldparent) { - assert(broot == newparent); - tomove.push_back(bound); - } - } - for (list::iterator p = tomove.begin(); p != tomove.end(); ++p) { - CDir *bound = *p; - dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << dendl; - subtrees[oldparent].erase(bound); - subtrees[newparent].insert(bound); - } - - // did auth change? 
- if (oldparent->authority() != newparent->authority()) - adjust_subtree_auth(dir, oldparent->authority()); // caller is responsible for *diri. - } - } - - for (list::iterator p = dfls.begin(); p != dfls.end(); ++p) { - CDir *dir = *p; - - // un-force dir to subtree root - if (dir->dir_auth == pair(dir->dir_auth.first, dir->dir_auth.first)) { - adjust_subtree_auth(dir, dir->dir_auth.first); - try_subtree_merge_at(dir); - } - } - - show_subtrees(); -} - - -void MDCache::get_fullauth_subtrees(set& s) -{ - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *root = p->first; - if (root->is_full_dir_auth()) - s.insert(root); - } -} -void MDCache::get_auth_subtrees(set& s) -{ - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *root = p->first; - if (root->is_auth()) - s.insert(root); - } -} - - -// count. - -int MDCache::num_subtrees() -{ - return subtrees.size(); -} - -int MDCache::num_subtrees_fullauth() -{ - int n = 0; - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *root = p->first; - if (root->is_full_dir_auth()) - n++; - } - return n; -} - -int MDCache::num_subtrees_fullnonauth() -{ - int n = 0; - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *root = p->first; - if (root->is_full_dir_nonauth()) - n++; - } - return n; -} - - - - - - - -// ==================================================================== -// import map, recovery - - -ESubtreeMap *MDCache::create_subtree_map() -{ - dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, " - << num_subtrees_fullauth() << " fullauth" - << dendl; - - ESubtreeMap *le = new ESubtreeMap(); - - // include all auth subtrees, and their bounds. - // and a spanning tree to tie it to the root. 
- for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *dir = p->first; - if (!dir->is_auth()) continue; - - dout(15) << " subtree " << *dir << dendl; - le->subtrees[dir->dirfrag()].clear(); - le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT); - le->metablob.add_dir(dir, false); - - // bounds - for (set::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - CDir *bound = *q; - dout(15) << " subtree bound " << *bound << dendl; - le->subtrees[dir->dirfrag()].push_back(bound->dirfrag()); - le->metablob.add_dir_context(bound, EMetaBlob::TO_ROOT); - le->metablob.add_dir(bound, false); - } - } - - //le->metablob.print(cout); - return le; -} - - -void MDCache::send_resolve(int who) -{ - if (migrator->is_importing() || - migrator->is_exporting()) - send_resolve_later(who); - else - send_resolve_now(who); -} - -void MDCache::send_resolve_later(int who) -{ - dout(10) << "send_resolve_later to mds" << who << dendl; - wants_resolve.insert(who); -} - -void MDCache::maybe_send_pending_resolves() -{ - if (wants_resolve.empty()) - return; // nothing to send. - - // only if it's appropriate! - if (migrator->is_exporting() || - migrator->is_importing()) { - dout(7) << "maybe_send_pending_resolves waiting, imports/exports still in progress" << dendl; - migrator->show_importing(); - migrator->show_exporting(); - return; // not now - } - - // ok, send them. 
- for (set::iterator p = wants_resolve.begin(); - p != wants_resolve.end(); - p++) - send_resolve_now(*p); - wants_resolve.clear(); -} - - -class C_MDC_SendResolve : public Context { - MDCache *mdc; - int who; -public: - C_MDC_SendResolve(MDCache *c, int w) : mdc(c), who(w) { } - void finish(int r) { - mdc->send_resolve_now(who); - } -}; - -void MDCache::send_resolve_now(int who) -{ - dout(10) << "send_resolve_now to mds" << who << dendl; - MMDSResolve *m = new MMDSResolve; - - show_subtrees(); - - // known - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - p++) { - CDir *dir = p->first; - - // only our subtrees - if (dir->authority().first != mds->get_nodeid()) - continue; - - if (migrator->is_importing(dir->dirfrag())) { - // ambiguous (mid-import) - // NOTE: because we are first authority, import state is at least IMPORT_LOGGINSTART. - assert(migrator->get_import_state(dir->dirfrag()) >= Migrator::IMPORT_LOGGINGSTART); - set bounds; - get_subtree_bounds(dir, bounds); - list dfls; - for (set::iterator p = bounds.begin(); p != bounds.end(); ++p) - dfls.push_back((*p)->dirfrag()); - m->add_ambiguous_import(dir->dirfrag(), dfls); - } else { - // not ambiguous. 
- m->add_subtree(dir->dirfrag()); - - // bounds too - for (set::iterator q = subtrees[dir].begin(); - q != subtrees[dir].end(); - ++q) { - CDir *bound = *q; - m->add_subtree_bound(dir->dirfrag(), bound->dirfrag()); - } - } - } - - // ambiguous - for (map >::iterator p = my_ambiguous_imports.begin(); - p != my_ambiguous_imports.end(); - ++p) - m->add_ambiguous_import(p->first, p->second); - - - // list prepare requests lacking a commit - // [active survivor] - for (hash_map::iterator p = active_requests.begin(); - p != active_requests.end(); - ++p) { - if (p->second->is_slave() && p->second->slave_to_mds == who) { - dout(10) << " including uncommitted " << *p->second << dendl; - m->add_slave_request(p->first); - } - } - // [resolving] - if (uncommitted_slave_updates.count(who)) { - for (map::iterator p = uncommitted_slave_updates[who].begin(); - p != uncommitted_slave_updates[who].end(); - ++p) { - dout(10) << " including uncommitted " << p->first << dendl; - m->add_slave_request(p->first); - } - need_resolve_ack.insert(who); - } - - - // send - mds->send_message_mds(m, who, MDS_PORT_CACHE); -} - - -void MDCache::handle_mds_failure(int who) -{ - dout(7) << "handle_mds_failure mds" << who << dendl; - - // make note of recovery set - mds->mdsmap->get_recovery_mds_set(recovery_set); - recovery_set.erase(mds->get_nodeid()); - dout(1) << "handle_mds_failure mds" << who << " : recovery peers are " << recovery_set << dendl; - - // adjust my recovery lists - wants_resolve.erase(who); // MDS will ask again - got_resolve.erase(who); // i'll get another. - - rejoin_sent.erase(who); // i need to send another - rejoin_ack_gather.erase(who); // i'll need/get another. 
- - dout(10) << " wants_resolve " << wants_resolve << dendl; - dout(10) << " got_resolve " << got_resolve << dendl; - dout(10) << " rejoin_sent " << rejoin_sent << dendl; - dout(10) << " rejoin_gather " << rejoin_gather << dendl; - dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl; - - - // tell the migrator too. - migrator->handle_mds_failure_or_stop(who); - - // kick any discovers that are waiting - kick_discovers(who); - - // clean up any requests slave to/from this node - list finish; - for (hash_map::iterator p = active_requests.begin(); - p != active_requests.end(); - ++p) { - // slave to the failed node? - if (p->second->slave_to_mds == who) { - if (p->second->slave_did_prepare()) { - dout(10) << " slave request " << *p->second << " uncommitted, will resolve shortly" << dendl; - } else { - dout(10) << " slave request " << *p->second << " has no prepare, finishing up" << dendl; - if (p->second->slave_request) - p->second->aborted = true; - else - finish.push_back(p->second); - } - } - - // failed node is slave? - if (!p->second->committing) { - if (p->second->more()->witnessed.count(who)) { - dout(10) << " master request " << *p->second << " no longer witnessed by slave mds" << who - << dendl; - // discard this peer's prepare (if any) - p->second->more()->witnessed.erase(who); - } - - if (p->second->more()->waiting_on_slave.count(who)) { - dout(10) << " master request " << *p->second << " waiting for slave mds" << who - << " to recover" << dendl; - // retry request when peer recovers - p->second->more()->waiting_on_slave.erase(who); - mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, p->second)); - } - } - } - - while (!finish.empty()) { - dout(10) << "cleaning up slave request " << *finish.front() << dendl; - request_finish(finish.front()); - finish.pop_front(); - } - - show_subtrees(); -} - -/* - * handle_mds_recovery - called on another node's transition - * from resolve -> active. 
- */ -void MDCache::handle_mds_recovery(int who) -{ - dout(7) << "handle_mds_recovery mds" << who << dendl; - - list waiters; - - // wake up any waiters in their subtrees - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *dir = p->first; - - if (dir->authority().first != who) continue; - assert(!dir->is_auth()); - - // wake any waiters - list q; - q.push_back(dir); - - while (!q.empty()) { - CDir *d = q.front(); - q.pop_front(); - d->take_waiting(CDir::WAIT_ANY, waiters); - - // inode waiters too - for (CDir::map_t::iterator p = d->items.begin(); - p != d->items.end(); - ++p) { - CDentry *dn = p->second; - if (dn->is_primary()) { - dn->get_inode()->take_waiting(CInode::WAIT_ANY, waiters); - - // recurse? - list ls; - dn->get_inode()->get_dirfrags(ls); - for (list::iterator p = ls.begin(); - p != ls.end(); - ++p) { - CDir *subdir = *p; - if (!subdir->is_subtree_root()) - q.push_back(subdir); - } - } - } - } - } - - // queue them up. - mds->queue_waiters(waiters); -} - -void MDCache::set_recovery_set(set& s) -{ - dout(7) << "set_recovery_set " << s << dendl; - recovery_set = s; -} - - -/* - * during resolve state, we share resolves to determine who - * is authoritative for which trees. we expect to get an resolve - * from _everyone_ in the recovery_set (the mds cluster at the time of - * the first failure). - */ -void MDCache::handle_resolve(MMDSResolve *m) -{ - dout(7) << "handle_resolve from " << m->get_source() << dendl; - int from = m->get_source().num(); - - // ambiguous slave requests? 
- if (!m->slave_requests.empty()) { - MMDSResolveAck *ack = new MMDSResolveAck; - - for (list::iterator p = m->slave_requests.begin(); - p != m->slave_requests.end(); - ++p) { - if (mds->clientmap.have_completed_request(*p)) { - // COMMIT - dout(10) << " ambiguous slave request " << *p << " will COMMIT" << dendl; - ack->add_commit(*p); - } else { - // ABORT - dout(10) << " ambiguous slave request " << *p << " will ABORT" << dendl; - ack->add_abort(*p); - } - } - - mds->send_message_mds(ack, from, MDS_PORT_CACHE); - } - - // am i a surviving ambiguous importer? - if (mds->is_active() || mds->is_stopping()) { - // check for any import success/failure (from this node) - map >::iterator p = my_ambiguous_imports.begin(); - while (p != my_ambiguous_imports.end()) { - map >::iterator next = p; - next++; - CDir *dir = get_dirfrag(p->first); - assert(dir); - dout(10) << "checking ambiguous import " << *dir << dendl; - if (migrator->is_importing(dir->dirfrag()) && - migrator->get_import_peer(dir->dirfrag()) == from) { - assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING); - - // check if sender claims the subtree - bool claimed_by_sender = false; - for (map >::iterator q = m->subtrees.begin(); - q != m->subtrees.end(); - ++q) { - CDir *base = get_dirfrag(q->first); - if (!base || !base->contains(dir)) - continue; // base not dir or an ancestor of dir, clearly doesn't claim dir. - - bool inside = true; - for (list::iterator r = q->second.begin(); - r != q->second.end(); - ++r) { - CDir *bound = get_dirfrag(*r); - if (bound && bound->contains(dir)) { - inside = false; // nope, bound is dir or parent of dir, not inside. 
- break; - } - } - if (inside) - claimed_by_sender = true; - } - - if (claimed_by_sender) { - dout(7) << "ambiguous import failed on " << *dir << dendl; - migrator->import_reverse(dir); - } else { - dout(7) << "ambiguous import succeeded on " << *dir << dendl; - migrator->import_finish(dir); - } - my_ambiguous_imports.erase(p); // no longer ambiguous. - } - p = next; - } - } - - // update my dir_auth values - for (map >::iterator pi = m->subtrees.begin(); - pi != m->subtrees.end(); - ++pi) { - CInode *diri = get_inode(pi->first.ino); - if (!diri) continue; - bool forced = diri->dirfragtree.force_to_leaf(pi->first.frag); - if (forced) { - dout(10) << " forced frag " << pi->first.frag << " to leaf in " - << diri->dirfragtree - << " on " << pi->first << dendl; - } - - CDir *dir = diri->get_dirfrag(pi->first.frag); - if (!dir) continue; - - adjust_bounded_subtree_auth(dir, pi->second, from); - try_subtree_merge(dir); - } - - show_subtrees(); - - - // note ambiguous imports too - for (map >::iterator pi = m->ambiguous_imports.begin(); - pi != m->ambiguous_imports.end(); - ++pi) { - dout(10) << "noting ambiguous import on " << pi->first << " bounds " << pi->second << dendl; - other_ambiguous_imports[from][pi->first].swap( pi->second ); - } - - // did i get them all? - got_resolve.insert(from); - - maybe_resolve_finish(); - - delete m; -} - -void MDCache::maybe_resolve_finish() -{ - if (got_resolve != recovery_set) { - dout(10) << "maybe_resolve_finish still waiting for more resolves, got (" << got_resolve - << "), need (" << recovery_set << ")" << dendl; - } - else if (!need_resolve_ack.empty()) { - dout(10) << "maybe_resolve_finish still waiting for resolve_ack from (" << need_resolve_ack << ")" << dendl; - } - else { - dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." 
<< dendl; - disambiguate_imports(); - if (mds->is_resolve()) { - recalc_auth_bits(); - trim_non_auth(); - mds->resolve_done(); - } - } -} - -void MDCache::handle_resolve_ack(MMDSResolveAck *ack) -{ - dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl; - int from = ack->get_source().num(); - - for (list::iterator p = ack->commit.begin(); - p != ack->commit.end(); - ++p) { - dout(10) << " commit on slave " << *p << dendl; - - if (mds->is_resolve()) { - // replay - assert(uncommitted_slave_updates[from].count(*p)); - uncommitted_slave_updates[from][*p].commit.replay(mds); - uncommitted_slave_updates[from].erase(*p); - // log commit - mds->mdlog->submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", *p, from, ESlaveUpdate::OP_COMMIT)); - } else { - MDRequest *mdr = request_get(*p); - assert(mdr->slave_request == 0); // shouldn't be doing anything! - request_finish(mdr); - } - } - - for (list::iterator p = ack->abort.begin(); - p != ack->abort.end(); - ++p) { - dout(10) << " abort on slave " << *p << dendl; - - if (mds->is_resolve()) { - assert(uncommitted_slave_updates[from].count(*p)); - uncommitted_slave_updates[from][*p].rollback.replay(mds); - uncommitted_slave_updates[from].erase(*p); - mds->mdlog->submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", *p, from, ESlaveUpdate::OP_ROLLBACK)); - } else { - MDRequest *mdr = request_get(*p); - if (mdr->more()->slave_commit) { - mdr->more()->slave_commit->finish(-1); - delete mdr->more()->slave_commit; - mdr->more()->slave_commit = 0; - } - if (mdr->slave_request) - mdr->aborted = true; - else - request_finish(mdr); - } - } - - need_resolve_ack.erase(from); - - if (mds->is_resolve()) - maybe_resolve_finish(); - - delete ack; -} - - - -void MDCache::disambiguate_imports() -{ - dout(10) << "disambiguate_imports" << dendl; - - // other nodes' ambiguous imports - for (map > >::iterator p = other_ambiguous_imports.begin(); - p != other_ambiguous_imports.end(); - ++p) { - int who = p->first; - 
dout(10) << "ambiguous imports for mds" << who << dendl; - - for (map >::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - dout(10) << " ambiguous import " << q->first << " bounds " << q->second << dendl; - CDir *dir = get_dirfrag(q->first); - if (!dir) continue; - - if (dir->authority().first == CDIR_AUTH_UNKNOWN || // if i am resolving - dir->is_ambiguous_auth()) { // if i am a surviving bystander - dout(10) << " mds" << who << " did import " << *dir << dendl; - adjust_bounded_subtree_auth(dir, q->second, who); - try_subtree_merge(dir); - } else { - dout(10) << " mds" << who << " did not import " << *dir << dendl; - } - } - } - other_ambiguous_imports.clear(); - - // my ambiguous imports - while (!my_ambiguous_imports.empty()) { - map >::iterator q = my_ambiguous_imports.begin(); - - CDir *dir = get_dirfrag(q->first); - if (!dir) continue; - - if (dir->authority().first != CDIR_AUTH_UNKNOWN) { - dout(10) << "ambiguous import auth known, must not be me " << *dir << dendl; - cancel_ambiguous_import(q->first); - mds->mdlog->submit_entry(new EImportFinish(dir, false)); - } else { - dout(10) << "ambiguous import auth unknown, must be me " << *dir << dendl; - finish_ambiguous_import(q->first); - mds->mdlog->submit_entry(new EImportFinish(dir, true)); - } - } - assert(my_ambiguous_imports.empty()); - - if (mds->is_resolve()) { - // verify all my subtrees are unambiguous! 
- for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *dir = p->first; - if (dir->is_ambiguous_dir_auth()) { - dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl; - show_subtrees(); - } - assert(!dir->is_ambiguous_dir_auth()); - } - } - - show_subtrees(); -} - - -void MDCache::add_ambiguous_import(dirfrag_t base, list& bounds) -{ - assert(my_ambiguous_imports.count(base) == 0); - my_ambiguous_imports[base].swap( bounds ); -} - - -void MDCache::add_ambiguous_import(CDir *base, const set& bounds) -{ - // make a list - list binos; - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) - binos.push_back((*p)->dirfrag()); - - // note: this can get called twice if the exporter fails during recovery - if (my_ambiguous_imports.count(base->dirfrag())) - my_ambiguous_imports.erase(base->dirfrag()); - - add_ambiguous_import(base->dirfrag(), binos); -} - -void MDCache::cancel_ambiguous_import(dirfrag_t df) -{ - assert(my_ambiguous_imports.count(df)); - dout(10) << "cancel_ambiguous_import " << df - << " bounds " << my_ambiguous_imports[df] - << dendl; - my_ambiguous_imports.erase(df); -} - -void MDCache::finish_ambiguous_import(dirfrag_t df) -{ - assert(my_ambiguous_imports.count(df)); - list bounds; - bounds.swap(my_ambiguous_imports[df]); - my_ambiguous_imports.erase(df); - - dout(10) << "finish_ambiguous_import " << df - << " bounds " << bounds - << dendl; - CDir *dir = get_dirfrag(df); - assert(dir); - - // adjust dir_auth, import maps - adjust_bounded_subtree_auth(dir, bounds, mds->get_nodeid()); - try_subtree_merge(dir); -} - - -/** recalc_auth_bits() - * once subtree auth is disambiguated, we need to adjust all the - * auth and dirty bits in our cache before moving on. 
- */ -void MDCache::recalc_auth_bits() -{ - dout(7) << "recalc_auth_bits" << dendl; - - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - list dfq; // dirfrag queue - dfq.push_back(p->first); - - bool auth = p->first->authority().first == mds->get_nodeid(); - dout(10) << " subtree auth=" << auth << " for " << *p->first << dendl; - - while (!dfq.empty()) { - CDir *dir = dfq.front(); - dfq.pop_front(); - - // dir - if (auth) - dir->state_set(CDir::STATE_AUTH); - else { - dir->state_set(CDir::STATE_REJOINING); - dir->state_clear(CDir::STATE_AUTH); - if (dir->is_dirty()) - dir->mark_clean(); - } - - // dentries in this dir - for (CDir::map_t::iterator q = dir->items.begin(); - q != dir->items.end(); - ++q) { - // dn - CDentry *dn = q->second; - if (auth) - dn->state_set(CDentry::STATE_AUTH); - else { - dn->state_set(CDentry::STATE_REJOINING); - dn->state_clear(CDentry::STATE_AUTH); - if (dn->is_dirty()) - dn->mark_clean(); - } - - if (dn->is_primary()) { - // inode - if (auth) - dn->inode->state_set(CInode::STATE_AUTH); - else { - dn->inode->state_set(CInode::STATE_REJOINING); - dn->inode->state_clear(CInode::STATE_AUTH); - if (dn->inode->is_dirty()) - dn->inode->mark_clean(); - } - - // recurse? - if (dn->inode->is_dir()) - dn->inode->get_nested_dirfrags(dfq); - } - } - } - } - - show_subtrees(); - show_cache(); -} - - - -// =========================================================================== -// REJOIN - - -/* - * rejoin phase! - * - * this initiates rejoin. it shoudl be called before we get any - * rejoin or rejoin_ack messages (or else mdsmap distribution is broken). - * - * we start out by sending rejoins to everyone in the recovery set. - * - * if we are rejoin, send for all regions in our cache. - * if we are active|stopping, send only to nodes that are are rejoining. 
- */ -void MDCache::rejoin_send_rejoins() -{ - dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl; - - map rejoins; - - // encode cap list once. - bufferlist cap_export_bl; - if (mds->is_rejoin()) { - ::_encode(cap_exports, cap_export_bl); - ::_encode(cap_export_paths, cap_export_bl); - } - - // if i am rejoining, send a rejoin to everyone. - // otherwise, just send to others who are rejoining. - for (set::iterator p = recovery_set.begin(); - p != recovery_set.end(); - ++p) { - if (*p == mds->get_nodeid()) continue; // nothing to myself! - if (rejoin_sent.count(*p)) continue; // already sent a rejoin to this node! - if (mds->is_rejoin()) { - rejoin_gather.insert(*p); - rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_WEAK); - rejoins[*p]->copy_cap_exports(cap_export_bl); - } else if (mds->mdsmap->is_rejoin(*p)) - rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_STRONG); - } - - assert(!migrator->is_importing()); - assert(!migrator->is_exporting()); - - // check all subtrees - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *dir = p->first; - assert(dir->is_subtree_root()); - assert(!dir->is_ambiguous_dir_auth()); - - int auth = dir->get_dir_auth().first; - assert(auth >= 0); - - if (auth == mds->get_nodeid()) continue; // skip my own regions! 
- if (rejoins.count(auth) == 0) continue; // don't care about this node's regions - - rejoin_walk(dir, rejoins[auth]); - } - - // rejoin root inodes, too - for (map::iterator p = rejoins.begin(); - p != rejoins.end(); - ++p) { - if (mds->is_rejoin()) { - // weak - if (p->first == 0 && root) - p->second->add_weak_inode(root->ino()); - if (get_inode(MDS_INO_STRAY(p->first))) - p->second->add_weak_inode(MDS_INO_STRAY(p->first)); - } else { - // strong - if (p->first == 0 && root) { - p->second->add_weak_inode(root->ino()); - p->second->add_strong_inode(root->ino(), root->get_replica_nonce(), - root->get_caps_wanted(), - root->authlock.get_state(), - root->linklock.get_state(), - root->dirfragtreelock.get_state(), - root->filelock.get_state(), - root->dirlock.get_state()); - } - if (CInode *in = get_inode(MDS_INO_STRAY(p->first))) { - p->second->add_weak_inode(in->ino()); - p->second->add_strong_inode(in->ino(), in->get_replica_nonce(), - in->get_caps_wanted(), - in->authlock.get_state(), - in->linklock.get_state(), - in->dirfragtreelock.get_state(), - in->filelock.get_state(), - in->dirlock.get_state()); - } - } - } - - if (!mds->is_rejoin()) { - // i am survivor. send strong rejoin. 
- // note request authpins, xlocks - for (hash_map::iterator p = active_requests.begin(); - p != active_requests.end(); - ++p) { - // auth pins - for (set::iterator q = p->second->auth_pins.begin(); - q != p->second->auth_pins.end(); - ++q) { - if (!(*q)->is_auth()) { - int who = (*q)->authority().first; - if (rejoins.count(who) == 0) continue; - MMDSCacheRejoin *rejoin = rejoins[who]; - - dout(15) << " " << *p->second << " authpin on " << **q << dendl; - MDSCacheObjectInfo i; - (*q)->set_object_info(i); - if (i.ino) - rejoin->add_inode_authpin(i.ino, p->second->reqid); - else - rejoin->add_dentry_authpin(i.dirfrag, i.dname, p->second->reqid); - } - } - // xlocks - for (set::iterator q = p->second->xlocks.begin(); - q != p->second->xlocks.end(); - ++q) { - if (!(*q)->get_parent()->is_auth()) { - int who = (*q)->get_parent()->authority().first; - if (rejoins.count(who) == 0) continue; - MMDSCacheRejoin *rejoin = rejoins[who]; - - dout(15) << " " << *p->second << " xlock on " << **q << " " << *(*q)->get_parent() << dendl; - MDSCacheObjectInfo i; - (*q)->get_parent()->set_object_info(i); - if (i.ino) - rejoin->add_inode_xlock(i.ino, (*q)->get_type(), p->second->reqid); - else - rejoin->add_dentry_xlock(i.dirfrag, i.dname, p->second->reqid); - } - } - } - } - - // send the messages - for (map::iterator p = rejoins.begin(); - p != rejoins.end(); - ++p) { - assert(rejoin_sent.count(p->first) == 0); - assert(rejoin_ack_gather.count(p->first) == 0); - rejoin_sent.insert(p->first); - rejoin_ack_gather.insert(p->first); - mds->send_message_mds(p->second, p->first, MDS_PORT_CACHE); - } - - // nothing? 
- if (mds->is_rejoin() && rejoins.empty()) { - dout(10) << "nothing to rejoin" << dendl; - mds->rejoin_done(); - } -} - - -/** - * rejoin_walk - build rejoin declarations for a subtree - * - * @dir subtree root - * @rejoin rejoin message - * - * from a rejoining node: - * weak dirfrag - * weak dentries (w/ connectivity) - * - * from a surviving node: - * strong dirfrag - * strong dentries (no connectivity!) - * strong inodes - */ -void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin) -{ - dout(10) << "rejoin_walk " << *dir << dendl; - - list nested; // finish this dir, then do nested items - - if (mds->is_rejoin()) { - // WEAK - dout(15) << " add_weak_dirfrag " << *dir << dendl; - rejoin->add_weak_dirfrag(dir->dirfrag()); - - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - dout(15) << " add_weak_primary_dentry " << *dn << dendl; - assert(dn->is_primary()); - assert(dn->inode->is_dir()); - rejoin->add_weak_primary_dentry(dir->dirfrag(), p->first, dn->get_inode()->ino()); - dn->get_inode()->get_nested_dirfrags(nested); - - if (dn->get_inode()->dirlock.is_updated()) { - // include full inode to shed any dirtyscattered state - rejoin->add_full_inode(dn->get_inode()->inode, - dn->get_inode()->symlink, - dn->get_inode()->dirfragtree); - dn->get_inode()->dirlock.clear_updated(); - } - } - } else { - // STRONG - dout(15) << " add_strong_dirfrag " << *dir << dendl; - rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep()); - - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - dout(15) << " add_strong_dentry " << *dn << dendl; - rejoin->add_strong_dentry(dir->dirfrag(), p->first, - dn->is_primary() ? dn->get_inode()->ino():inodeno_t(0), - dn->is_remote() ? dn->get_remote_ino():inodeno_t(0), - dn->is_remote() ? 
dn->get_remote_d_type():0, - dn->get_replica_nonce(), - dn->lock.get_state()); - if (dn->is_primary()) { - CInode *in = dn->get_inode(); - dout(15) << " add_strong_inode " << *in << dendl; - rejoin->add_strong_inode(in->ino(), in->get_replica_nonce(), - in->get_caps_wanted(), - in->authlock.get_state(), - in->linklock.get_state(), - in->dirfragtreelock.get_state(), - in->filelock.get_state(), - in->dirlock.get_state()); - in->get_nested_dirfrags(nested); - } - } - } - - // recurse into nested dirs - for (list::iterator p = nested.begin(); - p != nested.end(); - ++p) - rejoin_walk(*p, rejoin); -} - - -/* - * i got a rejoin. - * - reply with the lockstate - * - * if i am active|stopping, - * - remove source from replica list for everything not referenced here. - */ -void MDCache::handle_cache_rejoin(MMDSCacheRejoin *m) -{ - dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source() - << " (" << m->get_payload().length() << " bytes)" - << dendl; - - switch (m->op) { - case MMDSCacheRejoin::OP_WEAK: - handle_cache_rejoin_weak(m); - break; - case MMDSCacheRejoin::OP_STRONG: - handle_cache_rejoin_strong(m); - break; - - case MMDSCacheRejoin::OP_ACK: - handle_cache_rejoin_ack(m); - break; - case MMDSCacheRejoin::OP_MISSING: - handle_cache_rejoin_missing(m); - break; - - case MMDSCacheRejoin::OP_FULL: - handle_cache_rejoin_full(m); - break; - - default: - assert(0); - } - delete m; -} - - -/* - * handle_cache_rejoin_weak - * - * the sender - * - is recovering from their journal. - * - may have incorrect (out of date) inode contents - * - will include full inodes IFF they contain dirty scatterlock content - * - * if the sender didn't trim_non_auth(), they - * - may have incorrect (out of date) dentry/inode linkage - * - may have deleted/purged inodes - * and i may have to go to disk to get accurate inode contents. yuck. 
- */ -void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) -{ - int from = weak->get_source().num(); - - // possible response(s) - MMDSCacheRejoin *ack = 0; // if survivor - bool survivor = false; // am i a survivor? - - if (mds->is_active() || mds->is_stopping()) { - survivor = true; - dout(10) << "i am a surivivor, and will ack immediately" << dendl; - ack = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK); - - // check cap exports - for (map >::iterator p = weak->cap_exports.begin(); - p != weak->cap_exports.end(); - ++p) { - CInode *in = get_inode(p->first); - if (!in || !in->is_auth()) continue; - for (map::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - dout(10) << " claiming cap import " << p->first << " client" << q->first << " on " << *in << dendl; - rejoin_import_cap(in, q->first, q->second, from); - } - } - } else { - assert(mds->is_rejoin()); - - // check cap exports. - for (map >::iterator p = weak->cap_exports.begin(); - p != weak->cap_exports.end(); - ++p) { - CInode *in = get_inode(p->first); - if (in && !in->is_auth()) continue; - if (!in) { - if (!path_is_mine(weak->cap_export_paths[p->first])) - continue; - cap_import_paths[p->first] = weak->cap_export_paths[p->first]; - dout(10) << " noting cap import " << p->first << " path " << weak->cap_export_paths[p->first] << dendl; - } - - // note - for (map::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - dout(10) << " claiming cap import " << p->first << " client" << q->first << dendl; - cap_imports[p->first][q->first][from] = q->second; - } - } - } - - // full inodes? - // dirty scatterlock content! 
- for (list::iterator p = weak->full_inodes.begin(); - p != weak->full_inodes.end(); - ++p) { - CInode *in = get_inode(p->inode.ino); - if (!in) continue; - if (p->inode.mtime > in->inode.mtime) in->inode.mtime = p->inode.mtime; - dout(10) << " got dirty inode scatterlock content " << *in << dendl; - in->dirlock.set_updated(); - } - - // walk weak map - for (map >::iterator p = weak->weak.begin(); - p != weak->weak.end(); - ++p) { - CDir *dir = get_dirfrag(p->first); - if (!dir) dout(0) << " missing dirfrag " << p->first << dendl; - assert(dir); - - int nonce = dir->add_replica(from); - dout(10) << " have " << *dir << dendl; - if (ack) - ack->add_strong_dirfrag(p->first, nonce, dir->dir_rep); - - // weak dentries - for (map::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - CDentry *dn = dir->lookup(q->first); - assert(dn); - assert(dn->is_primary()); - - if (survivor && dn->is_replica(from)) - dentry_remove_replica(dn, from); // this induces a lock gather completion - int dnonce = dn->add_replica(from); - dout(10) << " have " << *dn << dendl; - if (ack) - ack->add_strong_dentry(p->first, q->first, - dn->get_inode()->ino(), inodeno_t(0), 0, - dnonce, dn->lock.get_replica_state()); - - // inode - CInode *in = dn->get_inode(); - assert(in); - - if (survivor && in->is_replica(from)) - inode_remove_replica(in, from); // this induces a lock gather completion - int inonce = in->add_replica(from); - dout(10) << " have " << *in << dendl; - - // scatter the dirlock, just in case? - if (!survivor && in->is_dir()) - in->dirlock.set_state(LOCK_SCATTER); - - if (ack) { - ack->add_full_inode(in->inode, in->symlink, in->dirfragtree); - ack->add_strong_inode(in->ino(), - inonce, - 0, - in->authlock.get_replica_state(), - in->linklock.get_replica_state(), - in->dirfragtreelock.get_replica_state(), - in->filelock.get_replica_state(), - in->dirlock.get_replica_state()); - } - } - } - - // weak base inodes? (root, stray, etc.) 
- for (set::iterator p = weak->weak_inodes.begin(); - p != weak->weak_inodes.end(); - ++p) { - CInode *in = get_inode(*p); - assert(in); // hmm fixme wrt stray? - if (survivor && in->is_replica(from)) - inode_remove_replica(in, from); // this induces a lock gather completion - int inonce = in->add_replica(from); - dout(10) << " have base " << *in << dendl; - - if (ack) - ack->add_strong_inode(in->ino(), - inonce, - 0, - in->authlock.get_replica_state(), - in->linklock.get_replica_state(), - in->dirfragtreelock.get_replica_state(), - in->filelock.get_replica_state(), - in->dirlock.get_replica_state()); - } - - if (survivor) { - // survivor. do everything now. - rejoin_scour_survivor_replicas(from, ack); - mds->send_message_mds(ack, from, MDS_PORT_CACHE); - } else { - // done? - assert(rejoin_gather.count(from)); - rejoin_gather.erase(from); - if (rejoin_gather.empty()) { - rejoin_gather_finish(); - } else { - dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl; - } - } -} - - -/** - * parallel_fetch -- make a pass at fetching a bunch of paths in parallel - * - * @pathmap - map of inodeno to full pathnames. we remove items from this map - * as we discover we have them. - * - * returns a C_Gather* is there is work to do. caller is responsible for setting - * the C_Gather completer. 
- */ -C_Gather *MDCache::parallel_fetch(map& pathmap) -{ - dout(10) << "parallel_fetch on " << pathmap.size() << " paths" << dendl; - - // scan list - set fetch_queue; - map::iterator p = pathmap.begin(); - while (p != pathmap.end()) { - CInode *in = get_inode(p->first); - if (in) { - dout(15) << " have " << *in << dendl; - pathmap.erase(p++); - continue; - } - - // traverse - dout(17) << " missing " << p->first << " at " << p->second << dendl; - filepath path(p->second); - CDir *dir = path_traverse_to_dir(path); - assert(dir); - fetch_queue.insert(dir); - p++; - } - - if (pathmap.empty()) { - dout(10) << "parallel_fetch done" << dendl; - assert(fetch_queue.empty()); - return false; - } - - // do a parallel fetch - C_Gather *gather = new C_Gather; - for (set::iterator p = fetch_queue.begin(); - p != fetch_queue.end(); - ++p) { - dout(10) << "parallel_fetch fetching " << **p << dendl; - (*p)->fetch(gather->new_sub()); - } - - return gather; -} - - - -/* - * rejoin_scour_survivor_replica - remove source from replica list on unmentioned objects - * - * all validated replicas are acked with a strong nonce, etc. if that isn't in the - * ack, the replica dne, and we can remove it from our replica maps. - */ -void MDCache::rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack) -{ - dout(10) << "rejoin_scour_survivor_replicas from mds" << from << dendl; - - // FIXME: what about root and stray inodes. - - for (hash_map::iterator p = inode_map.begin(); - p != inode_map.end(); - ++p) { - CInode *in = p->second; - - // inode? 
- if (in->is_auth() && - in->is_replica(from) && - ack->strong_inodes.count(p->second->ino()) == 0) { - inode_remove_replica(in, from); - dout(10) << " rem " << *in << dendl; - } - - if (!in->is_dir()) continue; - - list dfs; - in->get_dirfrags(dfs); - for (list::iterator p = dfs.begin(); - p != dfs.end(); - ++p) { - CDir *dir = *p; - - if (dir->is_auth() && - dir->is_replica(from) && - ack->strong_dirfrags.count(dir->dirfrag()) == 0) { - dir->remove_replica(from); - dout(10) << " rem " << *dir << dendl; - } - - // dentries - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - - if (dn->is_replica(from) && - (ack->strong_dentries.count(dir->dirfrag()) == 0 || - ack->strong_dentries[dir->dirfrag()].count(dn->get_name()) == 0)) { - dentry_remove_replica(dn, from); - dout(10) << " rem " << *dn << dendl; - } - } - } - } -} - - -CInode *MDCache::rejoin_invent_inode(inodeno_t ino) -{ - CInode *in = new CInode(this); - memset(&in->inode, 0, sizeof(inode_t)); - in->inode.ino = ino; - in->state_set(CInode::STATE_REJOINUNDEF); - add_inode(in); - rejoin_undef_inodes.insert(in); - dout(10) << " invented " << *in << dendl; - return in; -} - - -void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong) -{ - int from = strong->get_source().num(); - - // only a recovering node will get a strong rejoin. - assert(mds->is_rejoin()); - - MMDSCacheRejoin *missing = 0; // if i'm missing something.. - - // strong dirfrags/dentries. - // also process auth_pins, xlocks. 
- for (map::iterator p = strong->strong_dirfrags.begin(); - p != strong->strong_dirfrags.end(); - ++p) { - CDir *dir = get_dirfrag(p->first); - if (!dir) { - CInode *in = get_inode(p->first.ino); - if (!in) in = rejoin_invent_inode(p->first.ino); - if (!in->is_dir()) { - assert(in->state_test(CInode::STATE_REJOINUNDEF)); - in->inode.mode = INODE_MODE_DIR; - } - dir = in->get_or_open_dirfrag(this, p->first.frag); - } else { - dout(10) << " have " << *dir << dendl; - } - dir->add_replica(from); - dir->dir_rep = p->second.dir_rep; - - for (map::iterator q = strong->strong_dentries[p->first].begin(); - q != strong->strong_dentries[p->first].end(); - ++q) { - CDentry *dn = dir->lookup(q->first); - if (!dn) { - if (q->second.is_remote()) { - dn = dir->add_remote_dentry(q->first, q->second.remote_ino, q->second.remote_d_type); - } else if (q->second.is_null()) { - dn = dir->add_null_dentry(q->first); - } else { - CInode *in = get_inode(q->second.ino); - if (!in) in = rejoin_invent_inode(q->second.ino); - dn = dir->add_primary_dentry(q->first, in); - - dout(10) << " missing " << q->second.ino << dendl; - if (!missing) missing = new MMDSCacheRejoin(MMDSCacheRejoin::OP_MISSING); - missing->add_weak_inode(q->second.ino); // we want it back! - } - dout(10) << " invented " << *dn << dendl; - } - - // dn auth_pin? - if (strong->authpinned_dentries.count(p->first) && - strong->authpinned_dentries[p->first].count(q->first)) { - metareqid_t ri = strong->authpinned_dentries[p->first][q->first]; - dout(10) << " dn authpin by " << ri << " on " << *dn << dendl; - - // get/create slave mdrequest - MDRequest *mdr; - if (have_request(ri)) - mdr = request_get(ri); - else - mdr = request_start_slave(ri, from); - mdr->auth_pin(dn); - } - - // dn xlock? 
- if (strong->xlocked_dentries.count(p->first) && - strong->xlocked_dentries[p->first].count(q->first)) { - metareqid_t ri = strong->xlocked_dentries[p->first][q->first]; - dout(10) << " dn xlock by " << ri << " on " << *dn << dendl; - MDRequest *mdr = request_get(ri); // should have this from auth_pin above. - assert(mdr->is_auth_pinned(dn)); - dn->lock.set_state(LOCK_LOCK); - dn->lock.get_xlock(mdr); - mdr->xlocks.insert(&dn->lock); - mdr->locks.insert(&dn->lock); - } - - dn->add_replica(from); - dout(10) << " have " << *dn << dendl; - - // inode? - if (dn->is_primary()) { - CInode *in = dn->get_inode(); - assert(in); - - if (strong->strong_inodes.count(in->ino())) { - MMDSCacheRejoin::inode_strong &is = strong->strong_inodes[in->ino()]; - - // caps_wanted - if (is.caps_wanted) { - in->mds_caps_wanted[from] = is.caps_wanted; - dout(15) << " inode caps_wanted " << cap_string(is.caps_wanted) - << " on " << *in << dendl; - } - - // scatterlock? - if (is.dirlock == LOCK_SCATTER || - is.dirlock == LOCK_GLOCKC) // replica still has wrlocks - in->dirlock.set_state(LOCK_SCATTER); - - // auth pin? - if (strong->authpinned_inodes.count(in->ino())) { - metareqid_t ri = strong->authpinned_inodes[in->ino()]; - dout(10) << " inode authpin by " << ri << " on " << *in << dendl; - - // get/create slave mdrequest - MDRequest *mdr; - if (have_request(ri)) - mdr = request_get(ri); - else - mdr = request_start_slave(ri, from); - mdr->auth_pin(in); - } - - // xlock(s)? - if (strong->xlocked_inodes.count(in->ino())) { - for (map::iterator r = strong->xlocked_inodes[in->ino()].begin(); - r != strong->xlocked_inodes[in->ino()].end(); - ++r) { - SimpleLock *lock = in->get_lock(r->first); - dout(10) << " inode xlock by " << r->second << " on " << *lock << " on " << *in << dendl; - MDRequest *mdr = request_get(r->second); // should have this from auth_pin above. 
- assert(mdr->is_auth_pinned(in)); - lock->set_state(LOCK_LOCK); - lock->get_xlock(mdr); - mdr->xlocks.insert(lock); - mdr->locks.insert(lock); - } - } - } else { - dout(10) << " sender has dentry but not inode, adding them as a replica" << dendl; - } - - in->add_replica(from); - dout(10) << " have " << *in << dendl; - } - } - } - - // base inodes? (root, stray, etc.) - for (set::iterator p = strong->weak_inodes.begin(); - p != strong->weak_inodes.end(); - ++p) { - CInode *in = get_inode(*p); - dout(10) << " have base " << *in << dendl; - in->add_replica(from); - } - - // send missing? - if (missing) { - // we expect a FULL soon. - mds->send_message_mds(missing, from, MDS_PORT_CACHE); - } else { - // done? - assert(rejoin_gather.count(from)); - rejoin_gather.erase(from); - if (rejoin_gather.empty()) { - rejoin_gather_finish(); - } else { - dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl; - } - } -} - - -void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) -{ - dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl; - int from = ack->get_source().num(); - - list waiters; - - // dirs - for (map::iterator p = ack->strong_dirfrags.begin(); - p != ack->strong_dirfrags.end(); - ++p) { - CDir *dir = get_dirfrag(p->first); - if (!dir) continue; - - dir->set_replica_nonce(p->second.nonce); - dir->state_clear(CDir::STATE_REJOINING); - dout(10) << " got " << *dir << dendl; - - // dentries - for (map::iterator q = ack->strong_dentries[p->first].begin(); - q != ack->strong_dentries[p->first].end(); - ++q) { - CDentry *dn = dir->lookup(q->first); - if (!dn) continue; - - // hmm, did we have the proper linkage here? 
- if (dn->is_null() && - !q->second.is_null()) { - dout(10) << " had bad (missing) linkage for " << *dn << dendl; - if (q->second.is_remote()) { - dn->dir->link_remote_inode(dn, q->second.remote_ino, q->second.remote_d_type); - } else { - CInode *in = get_inode(q->second.ino); - assert(in == 0); // a rename would have been caught be the resolve stage. - // barebones inode; the full inode loop below will clean up. - in = new CInode(this, false); - in->inode.ino = q->second.ino; - add_inode(in); - dn->dir->link_primary_inode(dn, in); - } - } - else if (!dn->is_null() && - q->second.is_null()) { - dout(-10) << " had bad linkage for " << *dn << dendl; - assert(0); // hrmpf. unlink should use slave requests to clean this up during resolve. - } - dn->set_replica_nonce(q->second.nonce); - mds->locker->rejoin_set_state(&dn->lock, q->second.lock, waiters); - dn->state_clear(CDentry::STATE_REJOINING); - dout(10) << " got " << *dn << dendl; - } - } - - // full inodes - for (list::iterator p = ack->full_inodes.begin(); - p != ack->full_inodes.end(); - ++p) { - CInode *in = get_inode(p->inode.ino); - if (!in) continue; - in->inode = p->inode; - in->symlink = p->symlink; - in->dirfragtree = p->dirfragtree; - dout(10) << " got inode content " << *in << dendl; - } - - // inodes - for (map::iterator p = ack->strong_inodes.begin(); - p != ack->strong_inodes.end(); - ++p) { - CInode *in = get_inode(p->first); - if (!in) continue; - in->set_replica_nonce(p->second.nonce); - mds->locker->rejoin_set_state(&in->authlock, p->second.authlock, waiters); - mds->locker->rejoin_set_state(&in->linklock, p->second.linklock, waiters); - mds->locker->rejoin_set_state(&in->dirfragtreelock, p->second.dirfragtreelock, waiters); - mds->locker->rejoin_set_state(&in->filelock, p->second.filelock, waiters); - mds->locker->rejoin_set_state(&in->dirlock, p->second.dirlock, waiters); - in->state_clear(CInode::STATE_REJOINING); - dout(10) << " got " << *in << dendl; - } - - // done? 
- assert(rejoin_ack_gather.count(from)); - rejoin_ack_gather.erase(from); - if (mds->is_rejoin() && - rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too. - rejoin_ack_gather.empty()) { - mds->rejoin_done(); - } else { - dout(7) << "still need rejoin from (" << rejoin_gather << ")" - << ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl; - } -} - - - - -void MDCache::handle_cache_rejoin_missing(MMDSCacheRejoin *missing) -{ - dout(7) << "handle_cache_rejoin_missing from " << missing->get_source() << dendl; - - MMDSCacheRejoin *full = new MMDSCacheRejoin(MMDSCacheRejoin::OP_FULL); - - // inodes - for (set::iterator p = missing->weak_inodes.begin(); - p != missing->weak_inodes.end(); - ++p) { - CInode *in = get_inode(*p); - if (!in) { - dout(10) << " don't have inode " << *p << dendl; - continue; // we must have trimmed it after the originalo rejoin - } - - dout(10) << " sending " << *in << dendl; - full->add_full_inode(in->inode, in->symlink, in->dirfragtree); - } - - mds->send_message_mds(full, missing->get_source().num(), MDS_PORT_CACHE); -} - -void MDCache::handle_cache_rejoin_full(MMDSCacheRejoin *full) -{ - dout(7) << "handle_cache_rejoin_full from " << full->get_source() << dendl; - int from = full->get_source().num(); - - // integrate full inodes - for (list::iterator p = full->full_inodes.begin(); - p != full->full_inodes.end(); - ++p) { - CInode *in = get_inode(p->inode.ino); - assert(in); - - set::iterator q = rejoin_undef_inodes.find(in); - if (q != rejoin_undef_inodes.end()) { - CInode *in = *q; - in->inode = p->inode; - in->symlink = p->symlink; - in->dirfragtree = p->dirfragtree; - in->state_clear(CInode::STATE_REJOINUNDEF); - dout(10) << " got full " << *in << dendl; - rejoin_undef_inodes.erase(q); - } else { - dout(10) << " had full " << *in << dendl; - } - } - - // done? 
- assert(rejoin_gather.count(from)); - rejoin_gather.erase(from); - if (rejoin_gather.empty()) { - rejoin_gather_finish(); - } else { - dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl; - } -} - - - -/** - * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes - * - * FIXME: wait, can this actually happen? a survivor should generate cache trim - * messages that clean these guys up... - */ -void MDCache::rejoin_trim_undef_inodes() -{ - dout(10) << "rejoin_trim_undef_inodes" << dendl; - - while (!rejoin_undef_inodes.empty()) { - set::iterator p = rejoin_undef_inodes.begin(); - CInode *in = *p; - rejoin_undef_inodes.erase(p); - - in->clear_replica_map(); - - // close out dirfrags - if (in->is_dir()) { - list dfls; - in->get_dirfrags(dfls); - for (list::iterator p = dfls.begin(); - p != dfls.end(); - ++p) { - CDir *dir = *p; - dir->clear_replica_map(); - - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - dn->clear_replica_map(); - - dout(10) << " trimming " << *dn << dendl; - dir->remove_dentry(dn); - } - - dout(10) << " trimming " << *dir << dendl; - in->close_dirfrag(dir->dirfrag().frag); - } - } - - CDentry *dn = in->get_parent_dn(); - if (dn) { - dn->clear_replica_map(); - dout(10) << " trimming " << *dn << dendl; - dn->dir->remove_dentry(dn); - } else { - dout(10) << " trimming " << *in << dendl; - remove_inode(in); - } - } - - assert(rejoin_undef_inodes.empty()); -} - -class C_MDC_RejoinGatherFinish : public Context { - MDCache *cache; -public: - C_MDC_RejoinGatherFinish(MDCache *c) : cache(c) {} - void finish(int r) { - cache->rejoin_gather_finish(); - } -}; - - - -void MDCache::rejoin_gather_finish() -{ - dout(10) << "rejoin_gather_finish" << dendl; - assert(mds->is_rejoin()); - - rejoin_trim_undef_inodes(); - - // fetch paths? - // do this before ack, since some inodes we may have already gotten - // from surviving MDSs. 
- if (!cap_import_paths.empty()) { - C_Gather *gather = parallel_fetch(cap_import_paths); - if (gather) { - gather->set_finisher(new C_MDC_RejoinGatherFinish(this)); - return; - } - } - - // process cap imports - // ino -> client -> frommds -> capex - for (map > >::iterator p = cap_imports.begin(); - p != cap_imports.end(); - ++p) { - CInode *in = get_inode(p->first); - assert(in); - mds->server->add_reconnected_cap_inode(in); - for (map >::iterator q = p->second.begin(); - q != p->second.end(); - ++q) - for (map::iterator r = q->second.begin(); - r != q->second.end(); - ++r) - if (r->first >= 0) - rejoin_import_cap(in, q->first, r->second, r->first); - } - - mds->server->process_reconnected_caps(); - - rejoin_send_acks(); - - // did we already get our acks too? - // this happens when the rejoin_gather has to wait on a MISSING/FULL exchange. - if (rejoin_ack_gather.empty()) - mds->rejoin_done(); -} - -void MDCache::rejoin_import_cap(CInode *in, int client, inode_caps_reconnect_t& icr, int frommds) -{ - dout(10) << "rejoin_import_cap for client" << client << " from mds" << frommds - << " on " << *in << dendl; - - // add cap - in->reconnect_cap(client, icr); - - // send REAP - // FIXME client session weirdness. - MClientFileCaps *reap = new MClientFileCaps(MClientFileCaps::OP_REAP, - in->inode, - in->client_caps[client].get_last_seq(), - in->client_caps[client].pending(), - in->client_caps[client].wanted()); - - reap->set_mds( frommds ); // reap from whom? 
- mds->messenger->send_message(reap, - mds->clientmap.get_inst(client), - 0, MDS_PORT_CACHE); -} - -void MDCache::rejoin_send_acks() -{ - dout(7) << "rejoin_send_acks" << dendl; - - // send acks to everyone in the recovery set - map ack; - for (set::iterator p = recovery_set.begin(); - p != recovery_set.end(); - ++p) - ack[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK); - - // walk subtrees - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - p++) { - CDir *dir = p->first; - if (!dir->is_auth()) continue; - dout(10) << "subtree " << *dir << dendl; - - // auth items in this subtree - list dq; - dq.push_back(dir); - - while (!dq.empty()) { - CDir *dir = dq.front(); - dq.pop_front(); - - // dir - for (map::iterator r = dir->replicas_begin(); - r != dir->replicas_end(); - ++r) - ack[r->first]->add_strong_dirfrag(dir->dirfrag(), r->second, dir->dir_rep); - - for (CDir::map_t::iterator q = dir->items.begin(); - q != dir->items.end(); - ++q) { - CDentry *dn = q->second; - - // dentry - for (map::iterator r = dn->replicas_begin(); - r != dn->replicas_end(); - ++r) - ack[r->first]->add_strong_dentry(dir->dirfrag(), dn->name, - dn->is_primary() ? dn->get_inode()->ino():inodeno_t(0), - dn->is_remote() ? dn->get_remote_ino():inodeno_t(0), - dn->is_remote() ? dn->get_remote_d_type():0, - r->second, - dn->lock.get_replica_state()); - - if (!dn->is_primary()) continue; - - // inode - CInode *in = dn->inode; - - for (map::iterator r = in->replicas_begin(); - r != in->replicas_end(); - ++r) { - ack[r->first]->add_full_inode(in->inode, in->symlink, in->dirfragtree); - ack[r->first]->add_strong_inode(in->ino(), r->second, 0, - in->authlock.get_replica_state(), - in->linklock.get_replica_state(), - in->dirfragtreelock.get_replica_state(), - in->filelock.get_replica_state(), - in->dirlock.get_replica_state()); - } - - // subdirs in this subtree? 
- in->get_nested_dirfrags(dq); - } - } - } - - // root inodes too - if (root) - for (map::iterator r = root->replicas_begin(); - r != root->replicas_end(); - ++r) { - ack[r->first]->add_full_inode(root->inode, root->symlink, root->dirfragtree); - ack[r->first]->add_strong_inode(root->ino(), r->second, 0, - root->authlock.get_replica_state(), - root->linklock.get_replica_state(), - root->dirfragtreelock.get_replica_state(), - root->filelock.get_replica_state(), - root->dirlock.get_replica_state()); - } - if (stray) - for (map::iterator r = stray->replicas_begin(); - r != stray->replicas_end(); - ++r) { - ack[r->first]->add_full_inode(stray->inode, stray->symlink, stray->dirfragtree); - ack[r->first]->add_strong_inode(stray->ino(), r->second, 0, - stray->authlock.get_replica_state(), - stray->linklock.get_replica_state(), - stray->dirfragtreelock.get_replica_state(), - stray->filelock.get_replica_state(), - stray->dirlock.get_replica_state()); - } - - // send acks - for (map::iterator p = ack.begin(); - p != ack.end(); - ++p) - mds->send_message_mds(p->second, p->first, MDS_PORT_CACHE); - -} - - - -// =============================================================================== - - -void MDCache::set_root(CInode *in) -{ - assert(root == 0); - root = in; - base_inodes.insert(in); -} - - - - - - -// ************** -// Inode purging -- reliably removing deleted file's objects - -class C_MDC_PurgeFinish : public Context { - MDCache *mdc; - CInode *in; - off_t newsize, oldsize; -public: - C_MDC_PurgeFinish(MDCache *c, CInode *i, off_t ns, off_t os) : - mdc(c), in(i), newsize(ns), oldsize(os) {} - void finish(int r) { - mdc->purge_inode_finish(in, newsize, oldsize); - } -}; -class C_MDC_PurgeFinish2 : public Context { - MDCache *mdc; - CInode *in; - off_t newsize, oldsize; -public: - C_MDC_PurgeFinish2(MDCache *c, CInode *i, off_t ns, off_t os) : - mdc(c), in(i), newsize(ns), oldsize(os) {} - void finish(int r) { - mdc->purge_inode_finish_2(in, newsize, oldsize); - } -}; 
- -/* purge_inode in - * will be called by on unlink or rmdir or truncate - * caller responsible for journaling an appropriate EUpdate - */ -void MDCache::purge_inode(CInode *in, off_t newsize, off_t oldsize, LogSegment *ls) -{ - dout(10) << "purge_inode " << oldsize << " -> " << newsize - << " on " << *in - << dendl; - - assert(oldsize >= newsize); - - purging[in][newsize] = oldsize; - purging_ls[in][newsize] = ls; - ls->purging_inodes[in][newsize] = oldsize; - - _do_purge_inode(in, newsize, oldsize); -} - -void MDCache::_do_purge_inode(CInode *in, off_t newsize, off_t oldsize) -{ - in->get(CInode::PIN_PURGING); - - // remove - if (in->inode.size > 0) { - mds->filer->remove(in->inode, newsize, oldsize, - 0, new C_MDC_PurgeFinish(this, in, newsize, oldsize)); - } else { - // no need, empty file, just log it - purge_inode_finish(in, newsize, oldsize); - } -} - -void MDCache::purge_inode_finish(CInode *in, off_t newsize, off_t oldsize) -{ - dout(10) << "purge_inode_finish " << oldsize << " -> " << newsize - << " on " << *in << dendl; - - // log completion - mds->mdlog->submit_entry(new EPurgeFinish(in->ino(), newsize, oldsize), - new C_MDC_PurgeFinish2(this, in, newsize, oldsize)); -} - -void MDCache::purge_inode_finish_2(CInode *in, off_t newsize, off_t oldsize) -{ - dout(10) << "purge_inode_finish_2 " << oldsize << " -> " << newsize - << " on " << *in << dendl; - - // remove from purging list - LogSegment *ls = purging_ls[in][newsize]; - purging[in].erase(newsize); - purging_ls[in].erase(newsize); - if (purging[in].empty()) { - purging.erase(in); - purging_ls.erase(in); - } - - assert(ls->purging_inodes.count(in)); - assert(ls->purging_inodes[in].count(newsize)); - assert(ls->purging_inodes[in][newsize] == oldsize); - ls->purging_inodes[in].erase(newsize); - if (ls->purging_inodes[in].empty()) - ls->purging_inodes.erase(in); - - in->put(CInode::PIN_PURGING); - - // tell anyone who cares (log flusher?) 
- if (purging.count(in) == 0 || - purging[in].rbegin()->first < newsize) { - list ls; - ls.swap(waiting_for_purge[in][newsize]); - waiting_for_purge[in].erase(newsize); - if (waiting_for_purge[in].empty()) - waiting_for_purge.erase(in); - finish_contexts(ls, 0); - } -} - -void MDCache::add_recovered_purge(CInode *in, off_t newsize, off_t oldsize, LogSegment *ls) -{ - assert(purging[in].count(newsize) == 0); - purging[in][newsize] = oldsize; - purging_ls[in][newsize] = ls; - ls->purging_inodes[in][newsize] = oldsize; -} - -void MDCache::remove_recovered_purge(CInode *in, off_t newsize, off_t oldsize) -{ - purging[in].erase(newsize); -} - -void MDCache::start_recovered_purges() -{ - dout(10) << "start_recovered_purges (" << purging.size() << " purges)" << dendl; - - for (map >::iterator p = purging.begin(); - p != purging.end(); - ++p) { - for (map::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - dout(10) << "start_recovered_purges " - << q->second << " -> " << q->first - << " on " << *p->first - << dendl; - _do_purge_inode(p->first, q->first, q->second); - } - } -} - - - -// ================================================================================ -// cache trimming - - -bool MDCache::trim(int max) -{ - // trim LRU - if (max < 0) { - max = lru.lru_get_max(); - if (!max) return false; - } - dout(7) << "trim max=" << max << " cur=" << lru.lru_get_size() << dendl; - - map expiremap; - - // trim dentries from the LRU - while (lru.lru_get_size() > (unsigned)max) { - CDentry *dn = (CDentry*)lru.lru_expire(); - if (!dn) break; - trim_dentry(dn, expiremap); - } - - // trim base inodes? 
- if (max == 0) { - set::iterator p = base_inodes.begin(); - while (p != base_inodes.end()) { - CInode *in = *p++; - list ls; - in->get_dirfrags(ls); - for (list::iterator p = ls.begin(); p != ls.end(); ++p) { - CDir *dir = *p; - if (dir->get_num_ref() == 1) // subtree pin - trim_dirfrag(dir, 0, expiremap); - } - if (in->get_num_ref() == 0) - trim_inode(0, in, 0, expiremap); - } - } - - // send any expire messages - send_expire_messages(expiremap); - - return true; -} - -void MDCache::send_expire_messages(map& expiremap) -{ - // send expires - for (map::iterator it = expiremap.begin(); - it != expiremap.end(); - it++) { - dout(7) << "sending cache_expire to " << it->first << dendl; - mds->send_message_mds(it->second, it->first, MDS_PORT_CACHE); - } -} - - -void MDCache::trim_dentry(CDentry *dn, map& expiremap) -{ - dout(12) << "trim_dentry " << *dn << dendl; - - CDir *dir = dn->get_dir(); - assert(dir); - - CDir *con = get_subtree_root(dir); - assert(con); - - dout(12) << " in container " << *con << dendl; - - // notify dentry authority? - if (!dn->is_auth()) { - pair auth = dn->authority(); - - for (int p=0; p<2; p++) { - int a = auth.first; - if (p) a = auth.second; - if (a < 0 || (p == 1 && auth.second == auth.first)) break; - if (mds->get_nodeid() == auth.second && - con->is_importing()) break; // don't send any expire while importing. - if (a == mds->get_nodeid()) continue; // on export, ignore myself. - - dout(12) << " sending expire to mds" << a << " on " << *dn << dendl; - assert(a != mds->get_nodeid()); - if (expiremap.count(a) == 0) - expiremap[a] = new MCacheExpire(mds->get_nodeid()); - expiremap[a]->add_dentry(con->dirfrag(), dir->dirfrag(), dn->get_name(), dn->get_replica_nonce()); - } - } - - // adjust the dir state - // NOTE: we can safely remove a clean, null dentry without effecting - // directory completeness. - // (do this _before_ we unlink the inode, below!) 
- if (!(dn->is_null() && dn->is_clean())) - dir->state_clear(CDir::STATE_COMPLETE); - - // unlink the dentry - if (dn->is_remote()) { - // just unlink. - dir->unlink_inode(dn); - } - else if (dn->is_primary()) { - // expire the inode, too. - CInode *in = dn->get_inode(); - assert(in); - trim_inode(dn, in, con, expiremap); - } - else { - assert(dn->is_null()); - } - - // remove dentry - dir->remove_dentry(dn); - - // reexport? - if (dir->get_size() == 0 && dir->is_subtree_root()) - migrator->export_empty_import(dir); - - if (mds->logger) mds->logger->inc("cex"); -} - - -void MDCache::trim_dirfrag(CDir *dir, CDir *con, map& expiremap) -{ - dout(15) << "trim_dirfrag " << *dir << dendl; - - if (dir->is_subtree_root()) { - assert(!dir->is_auth() || - (!dir->is_replicated() && dir->inode->is_base())); - remove_subtree(dir); // remove from subtree map - } - assert(dir->get_num_ref() == 0); - - CInode *in = dir->get_inode(); - - if (!dir->is_auth()) { - pair auth = dir->authority(); - - // was this an auth delegation? (if so, slightly modified container) - dirfrag_t condf; - if (dir->is_subtree_root()) { - dout(12) << " subtree root, container is " << *dir << dendl; - con = dir; - condf = dir->dirfrag(); - } else { - condf = con->dirfrag(); - } - - for (int p=0; p<2; p++) { - int a = auth.first; - if (p) a = auth.second; - if (a < 0 || (p == 1 && auth.second == auth.first)) break; - if (mds->get_nodeid() == auth.second && - con->is_importing()) break; // don't send any expire while importing. - if (a == mds->get_nodeid()) continue; // on export, ignore myself. 
- - dout(12) << " sending expire to mds" << a << " on " << *dir << dendl; - assert(a != mds->get_nodeid()); - if (expiremap.count(a) == 0) - expiremap[a] = new MCacheExpire(mds->get_nodeid()); - expiremap[a]->add_dir(condf, dir->dirfrag(), dir->replica_nonce); - } - } - - in->close_dirfrag(dir->dirfrag().frag); -} - -void MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map& expiremap) -{ - dout(15) << "trim_inode " << *in << dendl; - assert(in->get_num_ref() == 0); - - // DIR - list dfls; - in->get_dirfrags(dfls); - for (list::iterator p = dfls.begin(); - p != dfls.end(); - ++p) - trim_dirfrag(*p, con ? con:*p, expiremap); // if no container (e.g. root dirfrag), use *p - - // INODE - if (!in->is_auth()) { - pair auth = in->authority(); - - dirfrag_t df; - if (con) - df = con->dirfrag(); - else - df = dirfrag_t(0,frag_t()); // must be a root or stray inode. - - for (int p=0; p<2; p++) { - int a = auth.first; - if (p) a = auth.second; - if (a < 0 || (p == 1 && auth.second == auth.first)) break; - if (con && mds->get_nodeid() == auth.second && - con->is_importing()) break; // don't send any expire while importing. - if (a == mds->get_nodeid()) continue; // on export, ignore myself. - - dout(12) << " sending expire to mds" << a << " on " << *in << dendl; - assert(a != mds->get_nodeid()); - if (expiremap.count(a) == 0) - expiremap[a] = new MCacheExpire(mds->get_nodeid()); - expiremap[a]->add_inode(df, in->ino(), in->get_replica_nonce()); - } - } - - /* - if (in->is_auth()) { - if (in->hack_accessed) - mds->logger->inc("outt"); - else { - mds->logger->inc("outut"); - mds->logger->favg("oututl", g_clock.now() - in->hack_load_stamp); - } - } - */ - - // unlink - if (dn) - dn->get_dir()->unlink_inode(dn); - remove_inode(in); -} - - -/** - * trim_non_auth - remove any non-auth items from our cache - * - * this reduces the amount of non-auth metadata in our cache, reducing the - * load incurred by the rejoin phase. 
- * - * the only non-auth items that remain are those that are needed to - * attach our own subtrees to the root. - * - * when we are done, all dentries will be in the top bit of the lru. - * - * why we have to do this: - * we may not have accurate linkage for non-auth items. which means we will - * know which subtree it falls into, and can not be sure to declare it to the - * correct authority. - */ -void MDCache::trim_non_auth() -{ - dout(7) << "trim_non_auth" << dendl; - - // temporarily pin all subtree roots - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - p++) - p->first->get(CDir::PIN_SUBTREETEMP); - - // note first auth item we see. - // when we see it the second time, stop. - CDentry *first_auth = 0; - - // trim non-auth items from the lru - while (lru.lru_get_size() > 0) { - CDentry *dn = (CDentry*)lru.lru_expire(); - if (!dn) break; - - if (dn->is_auth()) { - // add back into lru (at the top) - lru.lru_insert_top(dn); - - if (!first_auth) { - first_auth = dn; - } else { - if (first_auth == dn) - break; - } - } else { - // non-auth. expire. - CDir *dir = dn->get_dir(); - assert(dir); - - // unlink the dentry - dout(15) << "trim_non_auth removing " << *dn << dendl; - if (dn->is_remote()) { - dir->unlink_inode(dn); - } - else if (dn->is_primary()) { - CInode *in = dn->get_inode(); - list ls; - in->get_dirfrags(ls); - for (list::iterator p = ls.begin(); p != ls.end(); ++p) { - CDir *subdir = *p; - if (subdir->is_subtree_root()) - remove_subtree(subdir); - in->close_dirfrag(subdir->dirfrag().frag); - } - dir->unlink_inode(dn); - remove_inode(in); - } - else { - assert(dn->is_null()); - } - dir->remove_dentry(dn); - - // adjust the dir state - dir->state_clear(CDir::STATE_COMPLETE); // dir incomplete! - } - } - - if (lru.lru_get_size() == 0) { - // root, stray, etc.? 
- hash_map::iterator p = inode_map.begin(); - while (p != inode_map.end()) { - hash_map::iterator next = p; - ++next; - CInode *in = p->second; - if (!in->is_auth()) { - list ls; - in->get_dirfrags(ls); - for (list::iterator p = ls.begin(); - p != ls.end(); - ++p) { - assert((*p)->get_num_ref() == 0); - remove_subtree((*p)); - in->close_dirfrag((*p)->dirfrag().frag); - } - assert(in->get_num_ref() == 0); - remove_inode(in); - } - p = next; - } - } - - // move everything in the pintail to the top bit of the lru. - lru.lru_touch_entire_pintail(); - - // unpin all subtrees - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - p++) - p->first->put(CDir::PIN_SUBTREETEMP); - - show_subtrees(); -} - -void MDCache::handle_cache_expire(MCacheExpire *m) -{ - int from = m->get_from(); - - dout(7) << "cache_expire from mds" << from << dendl; - - if (mds->get_state() < MDSMap::STATE_REJOIN) { - delete m; - return; - } - - // loop over realms - for (map::iterator p = m->realms.begin(); - p != m->realms.end(); - ++p) { - // check container? - if (p->first.ino > 0) { - CInode *coni = get_inode(p->first.ino); - assert(coni); // we had better have this. - CDir *con = coni->get_approx_dirfrag(p->first.frag); - assert(con); - - if (!con->is_auth() || - (con->is_auth() && con->is_exporting() && - migrator->get_export_state(con) == Migrator::EXPORT_WARNING && - migrator->export_has_warned(con,from))) { - // not auth. 
- dout(7) << "delaying nonauth|warned expires for " << *con << dendl; - assert(con->is_frozen_tree_root()); - - // make a message container - if (delayed_expire[con].count(from) == 0) - delayed_expire[con][from] = new MCacheExpire(from); - - // merge these expires into it - delayed_expire[con][from]->add_realm(p->first, p->second); - continue; - } - dout(7) << "expires for " << *con << dendl; - } else { - dout(7) << "containerless expires (root, stray inodes)" << dendl; - } - - // INODES - for (map::iterator it = p->second.inodes.begin(); - it != p->second.inodes.end(); - it++) { - CInode *in = get_inode(it->first); - int nonce = it->second; - - if (!in) { - dout(0) << " inode expire on " << it->first << " from " << from << ", don't have it" << dendl; - assert(in); - } - assert(in->is_auth()); - - // check nonce - if (nonce == in->get_replica_nonce(from)) { - // remove from our cached_by - dout(7) << " inode expire on " << *in << " from mds" << from << " cached_by was " << in->get_replicas() << dendl; - inode_remove_replica(in, from); - } - else { - // this is an old nonce, ignore expire. - dout(7) << " inode expire on " << *in << " from mds" << from - << " with old nonce " << nonce << " (current " << in->get_replica_nonce(from) << "), dropping" - << dendl; - assert(in->get_replica_nonce(from) > nonce); - } - } - - // DIRS - for (map::iterator it = p->second.dirs.begin(); - it != p->second.dirs.end(); - it++) { - CDir *dir = get_dirfrag(it->first); - int nonce = it->second; - - if (!dir) { - dout(0) << " dir expire on " << it->first << " from " << from << ", don't have it" << dendl; - assert(dir); - } - assert(dir->is_auth()); - - // check nonce - if (nonce == dir->get_replica_nonce(from)) { - // remove from our cached_by - dout(7) << " dir expire on " << *dir << " from mds" << from - << " replicas was " << dir->replica_map << dendl; - dir->remove_replica(from); - } - else { - // this is an old nonce, ignore expire. 
- dout(7) << " dir expire on " << *dir << " from mds" << from - << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from) - << "), dropping" << dendl; - assert(dir->get_replica_nonce(from) > nonce); - } - } - - // DENTRIES - for (map >::iterator pd = p->second.dentries.begin(); - pd != p->second.dentries.end(); - ++pd) { - dout(10) << " dn expires in dir " << pd->first << dendl; - CInode *diri = get_inode(pd->first.ino); - assert(diri); - CDir *dir = diri->get_dirfrag(pd->first.frag); - - if (!dir) { - dout(0) << " dn expires on " << pd->first << " from " << from << ", must have refragmented" << dendl; - } else { - assert(dir->is_auth()); - } - - for (map::iterator p = pd->second.begin(); - p != pd->second.end(); - ++p) { - int nonce = p->second; - CDentry *dn; - - if (dir) { - dn = dir->lookup(p->first); - } else { - // which dirfrag for this dentry? - CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p->first)); - assert(dir->is_auth()); - dn = dir->lookup(p->first); - } - - if (!dn) - dout(0) << " missing dentry for " << p->first << " in " << *dir << dendl; - assert(dn); - - if (nonce == dn->get_replica_nonce(from)) { - dout(7) << " dentry_expire on " << *dn << " from mds" << from << dendl; - dentry_remove_replica(dn, from); - } - else { - dout(7) << " dentry_expire on " << *dn << " from mds" << from - << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from) - << "), dropping" << dendl; - assert(dn->get_replica_nonce(from) > nonce); - } - } - } - } - - - // done - delete m; -} - -void MDCache::process_delayed_expire(CDir *dir) -{ - dout(7) << "process_delayed_expire on " << *dir << dendl; - for (map::iterator p = delayed_expire[dir].begin(); - p != delayed_expire[dir].end(); - ++p) - handle_cache_expire(p->second); - delayed_expire.erase(dir); -} - -void MDCache::discard_delayed_expire(CDir *dir) -{ - dout(7) << "discard_delayed_expire on " << *dir << dendl; - for (map::iterator p = delayed_expire[dir].begin(); - p != 
delayed_expire[dir].end(); - ++p) - delete p->second; - delayed_expire.erase(dir); -} - -void MDCache::inode_remove_replica(CInode *in, int from) -{ - in->remove_replica(from); - in->mds_caps_wanted.erase(from); - - // note: this code calls _eval more often than it needs to! - // fix lock - if (in->authlock.remove_replica(from)) mds->locker->simple_eval_gather(&in->authlock); - if (in->linklock.remove_replica(from)) mds->locker->simple_eval_gather(&in->linklock); - if (in->dirfragtreelock.remove_replica(from)) mds->locker->simple_eval_gather(&in->dirfragtreelock); - if (in->filelock.remove_replica(from)) mds->locker->file_eval_gather(&in->filelock); - if (in->dirlock.remove_replica(from)) mds->locker->scatter_eval_gather(&in->dirlock); - - // alone now? - /* - if (!in->is_replicated()) { - mds->locker->simple_eval_gather(&in->authlock); - mds->locker->simple_eval_gather(&in->linklock); - mds->locker->simple_eval_gather(&in->dirfragtreelock); - mds->locker->file_eval_gather(&in->filelock); - mds->locker->scatter_eval_gather(&in->dirlock); - } - */ -} - -void MDCache::dentry_remove_replica(CDentry *dn, int from) -{ - dn->remove_replica(from); - - // fix lock - if (dn->lock.remove_replica(from) || - !dn->is_replicated()) - mds->locker->simple_eval_gather(&dn->lock); -} - - - -// ========================================================================================= -// shutdown - -class C_MDC_ShutdownCheck : public Context { - MDCache *mdc; -public: - C_MDC_ShutdownCheck(MDCache *m) : mdc(m) {} - void finish(int) { - mdc->shutdown_check(); - } -}; - -void MDCache::shutdown_check() -{ - dout(0) << "shutdown_check at " << g_clock.now() << dendl; - - // cache - int o = g_conf.debug_mds; - g_conf.debug_mds = 10; - show_cache(); - g_conf.debug_mds = o; - mds->timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(this)); - - // this - dout(0) << "lru size now " << lru.lru_get_size() << dendl; - dout(0) << "log len " << mds->mdlog->get_num_events() << 
dendl; - - - if (mds->filer->is_active()) - dout(0) << "filer still active" << dendl; -} - -void MDCache::shutdown_start() -{ - dout(2) << "shutdown_start" << dendl; - - if (g_conf.mds_shutdown_check) - mds->timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(this)); - - // g_conf.debug_mds = 10; -} - - - -bool MDCache::shutdown_pass() -{ - dout(7) << "shutdown_pass" << dendl; - - if (mds->is_stopped()) { - dout(7) << " already shut down" << dendl; - show_cache(); - show_subtrees(); - return true; - } - - // flush batching eopens, so that we can properly expire them. - mds->server->journal_opens(); // hrm, this is sort of a hack. - - // flush what we can from the log - mds->mdlog->set_max_events(0); - mds->mdlog->trim(); - - if (mds->mdlog->get_num_segments() > 1) { - dout(7) << "still >1 segments, waiting for log to trim" << dendl; - return false; - } - - trim(0); - dout(5) << "lru size now " << lru.lru_get_size() << dendl; - - // SUBTREES - if (!subtrees.empty() && - mds->get_nodeid() != 0 && - !migrator->is_exporting() //&& - //!migrator->is_importing() - ) { - dout(7) << "looking for subtrees to export to mds0" << dendl; - list ls; - for (map >::iterator it = subtrees.begin(); - it != subtrees.end(); - it++) { - CDir *dir = it->first; - if (dir->get_inode()->is_stray()) continue; - if (dir->is_frozen() || dir->is_freezing()) continue; - if (!dir->is_full_dir_auth()) continue; - ls.push_back(dir); - } - int max = 5; // throttle shutdown exports.. hack! - for (list::iterator p = ls.begin(); p != ls.end(); ++p) { - CDir *dir = *p; - int dest = dir->get_inode()->authority().first; - if (dest > 0 && !mds->mdsmap->is_active(dest)) dest = 0; - dout(7) << "sending " << *dir << " back to mds" << dest << dendl; - migrator->export_dir(dir, dest); - if (--max == 0) break; - } - } - - - // subtrees map not empty yet? 
- if (!subtrees.empty()) { - dout(7) << "still have " << num_subtrees() << " subtrees" << dendl; - show_subtrees(); - migrator->show_importing(); - migrator->show_exporting(); - if (!migrator->is_importing() && !migrator->is_exporting()) - show_cache(); - return false; - } - assert(subtrees.empty()); - assert(!migrator->is_exporting()); - assert(!migrator->is_importing()); - - - - // empty out stray contents - // FIXME - dout(7) << "FIXME: i need to empty out stray dir contents..." << dendl; - - // (only do this once!) - if (!mds->mdlog->is_capped()) { - dout(7) << "capping the log" << dendl; - mds->mdlog->cap(); - mds->mdlog->trim(); - } - - if (!mds->mdlog->empty()) { - dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events() - << " in " << mds->mdlog->get_num_segments() << " segments" << dendl; - return false; - } - - if (!did_shutdown_log_cap) { - // flush journal header - dout(7) << "writing header for (now-empty) journal" << dendl; - assert(mds->mdlog->empty()); - mds->mdlog->write_head(0); - // NOTE: filer active checker below will block us until this completes. - did_shutdown_log_cap = true; - return false; - } - - // filer active? - if (mds->filer->is_active()) { - dout(7) << "filer still active" << dendl; - return false; - } - - // trim what we can from the cache - if (lru.lru_get_size() > 0) { - dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << dendl; - show_cache(); - //dump(); - return false; - } - - // done! - dout(2) << "shutdown done." 
<< dendl; - return true; -} - - - - - - - - - -// ========= messaging ============== - - -void MDCache::dispatch(Message *m) -{ - switch (m->get_type()) { - - // RESOLVE - case MSG_MDS_RESOLVE: - handle_resolve((MMDSResolve*)m); - break; - case MSG_MDS_RESOLVEACK: - handle_resolve_ack((MMDSResolveAck*)m); - break; - - // REJOIN - case MSG_MDS_CACHEREJOIN: - handle_cache_rejoin((MMDSCacheRejoin*)m); - break; - - case MSG_MDS_DISCOVER: - handle_discover((MDiscover*)m); - break; - case MSG_MDS_DISCOVERREPLY: - handle_discover_reply((MDiscoverReply*)m); - break; - - /* - case MSG_MDS_INODEUPDATE: - handle_inode_update((MInodeUpdate*)m); - break; - */ - - case MSG_MDS_DIRUPDATE: - handle_dir_update((MDirUpdate*)m); - break; - - case MSG_MDS_CACHEEXPIRE: - handle_cache_expire((MCacheExpire*)m); - break; - - - - case MSG_MDS_DENTRYUNLINK: - handle_dentry_unlink((MDentryUnlink*)m); - break; - - - case MSG_MDS_FRAGMENTNOTIFY: - handle_fragment_notify((MMDSFragmentNotify*)m); - break; - - - - default: - dout(7) << "cache unknown message " << m->get_type() << dendl; - assert(0); - break; - } -} - - -/* path_traverse - * - * return values: - * <0 : traverse error (ENOTDIR, ENOENT, etc.) - * 0 : success - * >0 : delayed or forwarded - * - * onfail values: - * - * MDS_TRAVERSE_FORWARD - forward to auth (or best guess) - * MDS_TRAVERSE_DISCOVER - discover missing items. skip permission checks. - * MDS_TRAVERSE_DISCOVERXLOCK - discover XLOCKED items too (be careful!). 
- * MDS_TRAVERSE_FAIL - return an error - */ - -Context *MDCache::_get_waiter(MDRequest *mdr, Message *req) -{ - if (mdr) { - dout(20) << "_get_waiter retryrequest" << dendl; - return new C_MDS_RetryRequest(this, mdr); - } else { - dout(20) << "_get_waiter retrymessage" << dendl; - return new C_MDS_RetryMessage(mds, req); - } -} - -int MDCache::path_traverse(MDRequest *mdr, Message *req, // who - CInode *base, filepath& origpath, // what - vector& trace, // result - bool follow_trailing_symlink, // how - int onfail) -{ - assert(mdr || req); - bool null_okay = onfail == MDS_TRAVERSE_DISCOVERXLOCK; - bool noperm = false; - if (onfail == MDS_TRAVERSE_DISCOVER || - onfail == MDS_TRAVERSE_DISCOVERXLOCK) - noperm = true; - - // keep a list of symlinks we touch to avoid loops - set< pair > symlinks_resolved; - - // root - CInode *cur = base; - if (!cur) cur = get_root(); - if (cur == NULL) { - dout(7) << "traverse: i don't have root" << dendl; - open_root(_get_waiter(mdr, req)); - return 1; - } - - if (mds->logger) mds->logger->inc("t"); - - // start trace - trace.clear(); - - // make our own copy, since we'll modify when we hit symlinks - filepath path = origpath; - - unsigned depth = 0; - while (depth < path.depth()) { - dout(12) << "traverse: path seg depth " << depth << " = " << path[depth] << dendl; - - // ENOTDIR? - if (!cur->is_dir()) { - dout(7) << "traverse: " << *cur << " not a dir " << dendl; - return -ENOTDIR; - } - - // open dir - frag_t fg = cur->pick_dirfrag(path[depth]); - CDir *curdir = cur->get_dirfrag(fg); - if (!curdir) { - if (cur->is_auth()) { - // parent dir frozen_dir? - if (cur->is_frozen_dir()) { - dout(7) << "traverse: " << *cur->get_parent_dir() << " is frozen_dir, waiting" << dendl; - cur->get_parent_dn()->get_dir()->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req)); - return 1; - } - curdir = cur->get_or_open_dirfrag(this, fg); - } else { - // discover? 
- dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl; - discover_path(cur, path.postfixpath(depth), _get_waiter(mdr, req), - onfail == MDS_TRAVERSE_DISCOVERXLOCK); - if (mds->logger) mds->logger->inc("tdis"); - return 1; - } - } - assert(curdir); - - // frozen? - /* - if (curdir->is_frozen()) { - // doh! - // FIXME: traverse is allowed? - dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl; - curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req)); - if (onfinish) delete onfinish; - return 1; - } - */ - - // must read directory hard data (permissions, x bit) to traverse - if (!noperm && - !mds->locker->simple_rdlock_try(&cur->authlock, 0)) { - dout(7) << "traverse: waiting on authlock rdlock on " << *cur << dendl; - cur->authlock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req)); - return 1; - } - - // check permissions? - // XXX - - // ..? - if (path[depth] == "..") { - trace.pop_back(); - depth++; - cur = cur->get_parent_inode(); - dout(10) << "traverse: following .. back to " << *cur << dendl; - continue; - } - - - // dentry - CDentry *dn = curdir->lookup(path[depth]); - - // null and last_bit and xlocked by me? - if (dn && dn->is_null() && null_okay) { - dout(10) << "traverse: hit null dentry at tail of traverse, succeeding" << dendl; - trace.push_back(dn); - break; // done! - } - - if (dn && !dn->is_null()) { - // dentry exists. xlocked? - if (!noperm && dn->lock.is_xlocked() && dn->lock.get_xlocked_by() != mdr) { - dout(10) << "traverse: xlocked dentry at " << *dn << dendl; - dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req)); - if (mds->logger) mds->logger->inc("tlock"); - return 1; - } - - // do we have inode? - if (!dn->inode) { - assert(dn->is_remote()); - // do i have it? 
- CInode *in = get_inode(dn->get_remote_ino()); - if (in) { - dout(7) << "linking in remote in " << *in << dendl; - dn->link_remote(in); - } else { - dout(7) << "remote link to " << dn->get_remote_ino() << ", which i don't have" << dendl; - assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal! - open_remote_ino(dn->get_remote_ino(), mdr, _get_waiter(mdr, req)); - if (mds->logger) mds->logger->inc("trino"); - return 1; - } - } - - // symlink? - if (dn->inode->is_symlink() && - (follow_trailing_symlink || depth < path.depth()-1)) { - // symlink, resolve! - filepath sym = dn->inode->symlink; - dout(10) << "traverse: hit symlink " << *dn->inode << " to " << sym << dendl; - - // break up path components - // /head/symlink/tail - filepath head = path.prefixpath(depth); - filepath tail = path.postfixpath(depth+1); - dout(10) << "traverse: path head = " << head << dendl; - dout(10) << "traverse: path tail = " << tail << dendl; - - if (symlinks_resolved.count(pair(dn->inode, tail.get_path()))) { - dout(10) << "already hit this symlink, bailing to avoid the loop" << dendl; - return -ELOOP; - } - symlinks_resolved.insert(pair(dn->inode, tail.get_path())); - - // start at root? - if (dn->inode->symlink[0] == '/') { - // absolute - trace.clear(); - depth = 0; - path = dn->inode->symlink; - path.append(tail); - dout(10) << "traverse: absolute symlink, path now " << path << " depth " << depth << dendl; - } else { - // relative - path = head; - path.append(sym); - path.append(tail); - dout(10) << "traverse: relative symlink, path now " << path << " depth " << depth << dendl; - } - continue; - } - - // forwarder wants replicas? 
- if (mdr && mdr->client_request && - mdr->client_request->get_mds_wants_replica_in_dirino()) { - dout(30) << "traverse: REP is here, " - << mdr->client_request->get_mds_wants_replica_in_dirino() - << " vs " << curdir->dirfrag() << dendl; - - if (mdr->client_request->get_mds_wants_replica_in_dirino() == curdir->ino() && - curdir->is_auth() && - curdir->is_rep() && - curdir->is_replica(req->get_source().num()) && - dn->is_auth() - ) { - assert(req->get_source().is_mds()); - int from = req->get_source().num(); - - if (dn->is_replica(from)) { - dout(15) << "traverse: REP would replicate to mds" << from << ", but already cached_by " - << req->get_source() << " dn " << *dn << dendl; - } else { - dout(10) << "traverse: REP replicating to " << req->get_source() << " dn " << *dn << dendl; - MDiscoverReply *reply = new MDiscoverReply(curdir->dirfrag()); - reply->add_dentry( dn->replicate_to( from ) ); - if (dn->is_primary()) - reply->add_inode( dn->inode->replicate_to( from ) ); - mds->send_message_mds(reply, req->get_source().num(), MDS_PORT_CACHE); - } - } - } - - // add to trace, continue. - trace.push_back(dn); - cur = dn->inode; - touch_inode(cur); - depth++; - continue; - } - - // MISS. dentry doesn't exist. - dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl; - - if (curdir->is_auth()) { - // dentry is mine. - if (curdir->is_complete()) { - // file not found - return -ENOENT; - } else { - // directory isn't complete; reload - dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl; - touch_inode(cur); - curdir->fetch(_get_waiter(mdr, req)); - if (mds->logger) mds->logger->inc("tdirf"); - return 1; - } - } else { - // dirfrag/dentry is not mine. 
- pair dauth = curdir->authority(); - - if ((onfail == MDS_TRAVERSE_DISCOVER || - onfail == MDS_TRAVERSE_DISCOVERXLOCK)) { - dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl; - discover_path(curdir, path.postfixpath(depth), _get_waiter(mdr, req), - onfail == MDS_TRAVERSE_DISCOVERXLOCK); - if (mds->logger) mds->logger->inc("tdis"); - return 1; - } - if (onfail == MDS_TRAVERSE_FORWARD) { - // forward - dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl; - - if (curdir->is_ambiguous_auth()) { - // wait - dout(7) << "traverse: waiting for single auth in " << *curdir << dendl; - curdir->add_waiter(CDir::WAIT_SINGLEAUTH, _get_waiter(mdr, req)); - return 1; - } - - dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl; - - // request replication? - if (mdr && mdr->client_request && curdir->is_rep()) { - dout(15) << "traverse: REP fw to mds" << dauth << ", requesting rep under " - << *curdir << " req " << *(MClientRequest*)req << dendl; - mdr->client_request->set_mds_wants_replica_in_dirino(curdir->ino()); - req->clear_payload(); // reencode! - } - - if (mdr) - request_forward(mdr, dauth.first, req->get_dest_port()); - else - mds->forward_message_mds(req, dauth.first, req->get_dest_port()); - - if (mds->logger) mds->logger->inc("tfw"); - return 2; - } - if (onfail == MDS_TRAVERSE_FAIL) - return -ENOENT; // not necessarily exactly true.... - } - - assert(0); // i shouldn't get here - } - - // success. - if (mds->logger) mds->logger->inc("thit"); - return 0; -} - -bool MDCache::path_is_mine(filepath& path) -{ - dout(15) << "path_is_mine " << path << dendl; - - // start at root. FIXME. 
- CInode *cur = root; - assert(cur); - - for (unsigned i=0; ipick_dirfrag(path[i]); - CDir *dir = cur->get_dirfrag(fg); - if (!dir) return cur->is_auth(); - CDentry *dn = dir->lookup(path[i]); - if (!dn) return dir->is_auth(); - assert(dn->is_primary()); - cur = dn->get_inode(); - } - - return cur->is_auth(); -} - -/** - * path_traverse_to_dir -- traverse to deepest dir we have - * - * @path - path to traverse (as far as we can) - * - * assumes we _don't_ have the full path. (if we do, we return NULL.) - */ -CDir *MDCache::path_traverse_to_dir(filepath& path) -{ - CInode *cur = root; - assert(cur); - for (unsigned i=0; ipick_dirfrag(path[i]); - CDir *dir = cur->get_or_open_dirfrag(this, fg); - CDentry *dn = dir->lookup(path[i]); - if (!dn) return dir; - assert(dn->is_primary()); - cur = dn->get_inode(); - } - - return NULL; // oh, we have the full path. -} - - -/** - * open_remote_dir -- open up a remote dirfrag - * - * @diri - base inode - * @approxfg - approximate fragment. - * @fin - completion callback - */ -void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, Context *fin) -{ - dout(10) << "open_remote_dir on " << *diri << dendl; - - assert(diri->is_dir()); - assert(!diri->is_auth()); - assert(diri->get_dirfrag(approxfg) == 0); - - int auth = diri->authority().first; - - if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) { - discover_dir_frag(diri, approxfg, fin); - } else { - // mds is down or recovering. forge a replica! - forge_replica_dir(diri, approxfg, auth); - } -} - - -/** - * get_dentry_inode - get or open inode - * - * @dn the dentry - * @mdr current request - * - * will return inode for primary, or link up/open up remote link's inode as necessary. 
- */ -CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequest *mdr) -{ - assert(!dn->is_null()); - - if (dn->is_primary()) - return dn->inode; - - assert(dn->is_remote()); - CInode *in = get_inode(dn->get_remote_ino()); - if (in) { - dout(7) << "get_dentry_inode linking in remote in " << *in << dendl; - dn->link_remote(in); - return in; - } else { - dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl; - open_remote_ino(dn->get_remote_ino(), mdr, new C_MDS_RetryRequest(this, mdr)); - return 0; - } -} - -class C_MDC_RetryOpenRemoteIno : public Context { - MDCache *mdcache; - inodeno_t ino; - MDRequest *mdr; - Context *onfinish; -public: - C_MDC_RetryOpenRemoteIno(MDCache *mdc, inodeno_t i, MDRequest *r, Context *c) : - mdcache(mdc), ino(i), mdr(r), onfinish(c) {} - void finish(int r) { - mdcache->open_remote_ino(ino, mdr, onfinish); - } -}; - - -class C_MDC_OpenRemoteIno : public Context { - MDCache *mdcache; - inodeno_t ino; - MDRequest *mdr; - Context *onfinish; -public: - vector anchortrace; - - C_MDC_OpenRemoteIno(MDCache *mdc, inodeno_t i, MDRequest *r, Context *c) : - mdcache(mdc), ino(i), mdr(r), onfinish(c) {} - C_MDC_OpenRemoteIno(MDCache *mdc, inodeno_t i, vector& at, - MDRequest *r, Context *c) : - mdcache(mdc), ino(i), mdr(r), onfinish(c), anchortrace(at) {} - - void finish(int r) { - assert(r == 0); - if (r == 0) - mdcache->open_remote_ino_2(ino, mdr, anchortrace, onfinish); - else { - onfinish->finish(r); - delete onfinish; - } - } -}; - -void MDCache::open_remote_ino(inodeno_t ino, - MDRequest *mdr, - Context *onfinish) -{ - dout(7) << "open_remote_ino on " << ino << dendl; - - C_MDC_OpenRemoteIno *c = new C_MDC_OpenRemoteIno(this, ino, mdr, onfinish); - mds->anchorclient->lookup(ino, c->anchortrace, c); -} - -void MDCache::open_remote_ino_2(inodeno_t ino, - MDRequest *mdr, - vector& anchortrace, - Context *onfinish) -{ - dout(7) << "open_remote_ino_2 on " << ino - << ", trace depth is " << anchortrace.size() << dendl; - - 
// find deepest cached inode in prefix - unsigned i = anchortrace.size(); // i := array index + 1 - CInode *in = 0; - while (1) { - // inode? - dout(10) << " " << i << ": " << anchortrace[i-1] << dendl; - in = get_inode(anchortrace[i-1].ino); - if (in) break; - i--; - if (!i) { - in = get_inode(anchortrace[i].dirfrag.ino); - assert(in); // actually, we may need to open the root or a foreign stray inode, here. - break; - } - } - dout(10) << "deepest cached inode at " << i << " is " << *in << dendl; - - if (in->ino() == ino) { - // success - dout(10) << "open_remote_ino_2 have " << *in << dendl; - onfinish->finish(0); - delete onfinish; - return; - } - - // open dirfrag beneath *in - frag_t frag = anchortrace[i].dirfrag.frag; - - if (!in->dirfragtree.contains(frag)) { - dout(10) << "frag " << frag << " not valid, requerying anchortable" << dendl; - open_remote_ino(ino, mdr, onfinish); - return; - } - - CDir *dir = in->get_dirfrag(frag); - - if (!dir && !in->is_auth()) { - dout(10) << "opening remote dirfrag " << frag << " under " << *in << dendl; - /* FIXME: we re-query the anchortable just to avoid a fragtree update race */ - open_remote_dirfrag(in, frag, - new C_MDC_RetryOpenRemoteIno(this, ino, mdr, onfinish)); - return; - } - - if (!dir && in->is_auth()) - dir = in->get_or_open_dirfrag(this, frag); - - assert(dir); - if (dir->is_auth()) { - if (dir->is_complete()) { - // hrm. requery anchor table. - dout(10) << "expected ino " << anchortrace[i].ino - << " in complete dir " << *dir - << ", requerying anchortable" - << dendl; - open_remote_ino(ino, mdr, onfinish); - } else { - dout(10) << "need ino " << anchortrace[i].ino - << ", fetching incomplete dir " << *dir - << dendl; - dir->fetch(new C_MDC_OpenRemoteIno(this, ino, anchortrace, mdr, onfinish)); - } - } else { - // hmm, discover. 
- dout(10) << "have remote dirfrag " << *dir << ", discovering " - << anchortrace[i].ino << dendl; - discover_ino(dir, anchortrace[i].ino, - new C_MDC_OpenRemoteIno(this, ino, anchortrace, mdr, onfinish)); - } -} - - - - -void MDCache::make_trace(vector& trace, CInode *in) -{ - CInode *parent = in->get_parent_inode(); - if (parent) { - make_trace(trace, parent); - - CDentry *dn = in->get_parent_dn(); - dout(15) << "make_trace adding " << *dn << dendl; - trace.push_back(dn); - } -} - - -MDRequest *MDCache::request_start(MClientRequest *req) -{ - // did we win a forward race against a slave? - if (active_requests.count(req->get_reqid())) { - MDRequest *mdr = active_requests[req->get_reqid()]; - if (mdr->is_slave()) { - dout(10) << "request_start already had " << *mdr << ", cleaning up" << dendl; - request_cleanup(mdr); - delete mdr; - } else { - dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl; - delete req; - return 0; - } - } - - // register new client request - MDRequest *mdr = new MDRequest(req->get_reqid(), req); - active_requests[req->get_reqid()] = mdr; - dout(7) << "request_start " << *mdr << dendl; - return mdr; -} - -MDRequest *MDCache::request_start_slave(metareqid_t ri, int by) -{ - MDRequest *mdr = new MDRequest(ri, by); - assert(active_requests.count(mdr->reqid) == 0); - active_requests[mdr->reqid] = mdr; - dout(7) << "request_start_slave " << *mdr << " by mds" << by << dendl; - return mdr; -} - - -MDRequest *MDCache::request_get(metareqid_t rid) -{ - assert(active_requests.count(rid)); - dout(7) << "request_get " << rid << " " << *active_requests[rid] << dendl; - return active_requests[rid]; -} - -void MDCache::request_finish(MDRequest *mdr) -{ - dout(7) << "request_finish " << *mdr << dendl; - - // slave finisher? 
- if (mdr->more()->slave_commit) { - mdr->more()->slave_commit->finish(0); - delete mdr->more()->slave_commit; - mdr->more()->slave_commit = 0; - } - - if (mdr->client_request && mds->logger) { - mds->logger->inc("reply"); - mds->logger->favg("replyl", g_clock.now() - mdr->client_request->get_recv_stamp()); - } - - delete mdr->client_request; - delete mdr->slave_request; - request_cleanup(mdr); -} - - -void MDCache::request_forward(MDRequest *mdr, int who, int port) -{ - if (!port) port = MDS_PORT_SERVER; - dout(7) << "request_forward " << *mdr << " to mds" << who << " req " << *mdr << dendl; - - mds->forward_message_mds(mdr->client_request, who, port); - request_cleanup(mdr); - - if (mds->logger) mds->logger->inc("fw"); -} - - -void MDCache::dispatch_request(MDRequest *mdr) -{ - if (mdr->client_request) { - mds->server->dispatch_client_request(mdr); - } else if (mdr->slave_request) { - mds->server->dispatch_slave_request(mdr); - } else - assert(0); -} - - - -void MDCache::request_forget_foreign_locks(MDRequest *mdr) -{ - // xlocks - set::iterator p = mdr->xlocks.begin(); - while (p != mdr->xlocks.end()) { - if ((*p)->get_parent()->is_auth()) - p++; - else { - dout(10) << "request_forget_foreign_locks " << **p - << " on " << *(*p)->get_parent() << dendl; - (*p)->put_xlock(); - mdr->locks.erase(*p); - mdr->xlocks.erase(p++); - } - } -} - -void MDCache::request_cleanup(MDRequest *mdr) -{ - dout(15) << "request_cleanup " << *mdr << dendl; - metareqid_t ri = mdr->reqid; - - // clear ref, trace - mdr->ref = 0; - mdr->trace.clear(); - - // clean up slaves - // (will implicitly drop remote dn pins) - for (set::iterator p = mdr->more()->slaves.begin(); - p != mdr->more()->slaves.end(); - ++p) { - MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_FINISH); - mds->send_message_mds(r, *p, MDS_PORT_SERVER); - } - // strip foreign xlocks out of lock lists, since the OP_FINISH drops them implicitly. 
- request_forget_foreign_locks(mdr); - - - // drop locks - mds->locker->drop_locks(mdr); - - // drop (local) auth pins - mdr->drop_local_auth_pins(); - - // drop stickydirs - for (set::iterator p = mdr->stickydirs.begin(); - p != mdr->stickydirs.end(); - ++p) - (*p)->put_stickydirs(); - - // drop cache pins - for (set::iterator it = mdr->pins.begin(); - it != mdr->pins.end(); - it++) - (*it)->put(MDSCacheObject::PIN_REQUEST); - mdr->pins.clear(); - - // remove from map - active_requests.erase(mdr->reqid); - delete mdr; - - - - - // log some stats ***** - if (mds->logger) { - mds->logger->set("c", lru.lru_get_size()); - mds->logger->set("cpin", lru.lru_get_num_pinned()); - mds->logger->set("ctop", lru.lru_get_top()); - mds->logger->set("cbot", lru.lru_get_bot()); - mds->logger->set("cptail", lru.lru_get_pintail()); - //mds->logger->set("buf",buffer_total_alloc); - } - - if (g_conf.log_pins) { - // pin - /* -for (int i=0; ilogger2) mds->logger2->set(cinode_pin_names[i], - cinode_pins[i]); - } - */ - /* - for (map::iterator it = cdir_pins.begin(); - it != cdir_pins.end(); - it++) { - //string s = "D"; - //s += cdir_pin_names[it->first]; - if (mds->logger2) mds->logger2->set(//s, - cdir_pin_names[it->first], - it->second); - } - */ - } - -} - - -// -------------------------------------------------------------------- -// ANCHORS - -// CREATE - -class C_MDC_AnchorCreatePrepared : public Context { - MDCache *cache; - CInode *in; -public: - version_t atid; - C_MDC_AnchorCreatePrepared(MDCache *c, CInode *i) : cache(c), in(i) {} - void finish(int r) { - cache->_anchor_create_prepared(in, atid); - } -}; - -void MDCache::anchor_create(MDRequest *mdr, CInode *in, Context *onfinish) -{ - assert(in->is_auth()); - - // auth pin - if (!in->can_auth_pin() && - !mdr->is_auth_pinned(in)) { - dout(7) << "anchor_create not authpinnable, waiting on " << *in << dendl; - in->add_waiter(CInode::WAIT_UNFREEZE, onfinish); - return; - } - - // wait - in->add_waiter(CInode::WAIT_ANCHORED, 
onfinish); - - // already anchoring? - if (in->state_test(CInode::STATE_ANCHORING)) { - dout(7) << "anchor_create already anchoring " << *in << dendl; - return; - } - - dout(7) << "anchor_create " << *in << dendl; - - // auth: do it - in->state_set(CInode::STATE_ANCHORING); - in->get(CInode::PIN_ANCHORING); - in->auth_pin(); - - // make trace - vector trace; - in->make_anchor_trace(trace); - - // do it - C_MDC_AnchorCreatePrepared *fin = new C_MDC_AnchorCreatePrepared(this, in); - mds->anchorclient->prepare_create(in->ino(), trace, &fin->atid, fin); -} - -class C_MDC_AnchorCreateLogged : public Context { - MDCache *cache; - CInode *in; - version_t atid; - LogSegment *ls; -public: - C_MDC_AnchorCreateLogged(MDCache *c, CInode *i, version_t t, LogSegment *s) : - cache(c), in(i), atid(t), ls(s) {} - void finish(int r) { - cache->_anchor_create_logged(in, atid, ls); - } -}; - -void MDCache::_anchor_create_prepared(CInode *in, version_t atid) -{ - dout(10) << "_anchor_create_prepared " << *in << " atid " << atid << dendl; - assert(in->inode.anchored == false); - - // update the logged inode copy - inode_t *pi = in->project_inode(); - pi->anchored = true; - pi->version = in->pre_dirty(); - - // note anchor transaction - EUpdate *le = new EUpdate(mds->mdlog, "anchor_create"); - le->metablob.add_dir_context(in->get_parent_dir()); - le->metablob.add_primary_dentry(in->parent, true, 0, pi); - le->metablob.add_anchor_transaction(atid); - mds->mdlog->submit_entry(le, new C_MDC_AnchorCreateLogged(this, in, atid, - mds->mdlog->get_current_segment())); -} - - -void MDCache::_anchor_create_logged(CInode *in, version_t atid, LogSegment *ls) -{ - dout(10) << "_anchor_create_logged on " << *in << dendl; - - // unpin - assert(in->state_test(CInode::STATE_ANCHORING)); - in->state_clear(CInode::STATE_ANCHORING); - in->put(CInode::PIN_ANCHORING); - in->auth_unpin(); - - // apply update to cache - in->pop_and_dirty_projected_inode(ls); - - // tell the anchortable we've committed - 
mds->anchorclient->commit(atid, ls); - - // trigger waiters - in->finish_waiting(CInode::WAIT_ANCHORED, 0); -} - - -// DESTROY - -class C_MDC_AnchorDestroyPrepared : public Context { - MDCache *cache; - CInode *in; -public: - version_t atid; - C_MDC_AnchorDestroyPrepared(MDCache *c, CInode *i) : cache(c), in(i) {} - void finish(int r) { - cache->_anchor_destroy_prepared(in, atid); - } -}; - -void MDCache::anchor_destroy(CInode *in, Context *onfinish) -{ - assert(in->is_auth()); - - // auth pin - if (!in->can_auth_pin()/* && - !mdr->is_auth_pinned(in)*/) { - dout(7) << "anchor_destroy not authpinnable, waiting on " << *in << dendl; - in->add_waiter(CInode::WAIT_UNFREEZE, onfinish); - return; - } - - // wait - if (onfinish) - in->add_waiter(CInode::WAIT_UNANCHORED, onfinish); - - // already anchoring? - if (in->state_test(CInode::STATE_UNANCHORING)) { - dout(7) << "anchor_destroy already unanchoring " << *in << dendl; - return; - } - - dout(7) << "anchor_destroy " << *in << dendl; - - // auth: do it - in->state_set(CInode::STATE_UNANCHORING); - in->get(CInode::PIN_UNANCHORING); - in->auth_pin(); - - // do it - C_MDC_AnchorDestroyPrepared *fin = new C_MDC_AnchorDestroyPrepared(this, in); - mds->anchorclient->prepare_destroy(in->ino(), &fin->atid, fin); -} - -class C_MDC_AnchorDestroyLogged : public Context { - MDCache *cache; - CInode *in; - version_t atid; - LogSegment *ls; -public: - C_MDC_AnchorDestroyLogged(MDCache *c, CInode *i, version_t t, LogSegment *l) : - cache(c), in(i), atid(t), ls(l) {} - void finish(int r) { - cache->_anchor_destroy_logged(in, atid, ls); - } -}; - -void MDCache::_anchor_destroy_prepared(CInode *in, version_t atid) -{ - dout(10) << "_anchor_destroy_prepared " << *in << " atid " << atid << dendl; - - assert(in->inode.anchored == true); - - // update the logged inode copy - inode_t *pi = in->project_inode(); - pi->anchored = true; - pi->version = in->pre_dirty(); - - // log + wait - EUpdate *le = new EUpdate(mds->mdlog, "anchor_destroy"); - 
le->metablob.add_dir_context(in->get_parent_dir()); - le->metablob.add_primary_dentry(in->parent, true, 0, pi); - le->metablob.add_anchor_transaction(atid); - mds->mdlog->submit_entry(le, new C_MDC_AnchorDestroyLogged(this, in, atid, mds->mdlog->get_current_segment())); -} - - -void MDCache::_anchor_destroy_logged(CInode *in, version_t atid, LogSegment *ls) -{ - dout(10) << "_anchor_destroy_logged on " << *in << dendl; - - // unpin - assert(in->state_test(CInode::STATE_UNANCHORING)); - in->state_clear(CInode::STATE_UNANCHORING); - in->put(CInode::PIN_UNANCHORING); - in->auth_unpin(); - - // apply update to cache - in->pop_and_dirty_projected_inode(ls); - - // tell the anchortable we've committed - mds->anchorclient->commit(atid, ls); - - // trigger waiters - in->finish_waiting(CInode::WAIT_UNANCHORED, 0); -} - - -// ------------------------------------------------------------------------------- -// STRAYS - -void MDCache::eval_stray(CDentry *dn) -{ - dout(10) << "eval_stray " << *dn << dendl; - assert(dn->is_primary()); - CInode *in = dn->inode; - assert(in); - - return; // FIXME or test me rather, there is a bug here somewhere! - - // purge? - if (in->inode.nlink == 0) { - if (dn->is_replicated() || in->is_any_caps()) return; // wait - if (!in->dirfrags.empty()) return; // wait for dirs to close/trim - _purge_stray(dn); - } - else if (in->inode.nlink == 1) { - // trivial reintegrate? - if (!in->remote_parents.empty()) { - CDentry *rlink = *in->remote_parents.begin(); - if (rlink->is_auth() && - rlink->dir->can_auth_pin()) - reintegrate_stray(dn, rlink); - - if (!rlink->is_auth() && - !in->is_ambiguous_auth()) - migrate_stray(dn, rlink->authority().first); - } - } else { - // wait for next use. 
- } -} - - -class C_MDC_PurgeStray : public Context { - MDCache *cache; - CDentry *dn; - version_t pdv; - LogSegment *ls; -public: - C_MDC_PurgeStray(MDCache *c, CDentry *d, version_t v, LogSegment *s) : - cache(c), dn(d), pdv(v), ls(s) { } - void finish(int r) { - cache->_purge_stray_logged(dn, pdv, ls); - } -}; - -void MDCache::_purge_stray(CDentry *dn) -{ - dout(10) << "_purge_stray " << *dn << " " << *dn->inode << dendl; - assert(!dn->is_replicated()); - - // log removal - version_t pdv = dn->pre_dirty(); - - EUpdate *le = new EUpdate(mds->mdlog, "purge_stray"); - le->metablob.add_dir_context(dn->dir); - le->metablob.add_null_dentry(dn, true); - le->metablob.add_inode_truncate(dn->inode->ino(), 0, dn->inode->inode.size); - - mds->mdlog->submit_entry(le, new C_MDC_PurgeStray(this, dn, pdv, mds->mdlog->get_current_segment())); - - -} - -void MDCache::_purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls) -{ - dout(10) << "_purge_stray_logged " << *dn << " " << *dn->inode << dendl; - CInode *in = dn->inode; - - // dirty+unlink dentry - dn->dir->mark_dirty(pdv, ls); - dn->dir->unlink_inode(dn); - dn->dir->remove_dentry(dn); - - // purge+remove inode - purge_inode(in, 0, in->inode.size, ls); - remove_inode(in); -} - - - -void MDCache::reintegrate_stray(CDentry *dn, CDentry *rlink) -{ - dout(10) << "reintegrate_stray " << *dn << " into " << *rlink << dendl; - -} - - -void MDCache::migrate_stray(CDentry *dn, int dest) -{ - dout(10) << "migrate_stray to mds" << dest << " " << *dn << dendl; - -} - - - - -// ======================================================================================== -// DISCOVER -/* - - - for all discovers (except base_inos, e.g. root, stray), waiters are attached - to the parent metadata object in the cache (pinning it). - - - the discover is also registered under the per-mds discover_ hashes, so that - waiters can be kicked in the event of a failure. 
that is, every discover will - be followed by a reply, unless the remote node fails.. - - - each discover_reply must reliably decrement the discover_ counts. - - - base_inos are the exception. those waiters are under waiting_for_base_ino. - -*/ - -void MDCache::discover_base_ino(inodeno_t want_ino, - Context *onfinish, - int from) -{ - dout(7) << "discover_base_ino " << want_ino << " from mds" << from << dendl; - if (waiting_for_base_ino[from].count(want_ino) == 0) { - filepath want_path; - MDiscover *dis = new MDiscover(mds->get_nodeid(), - want_ino, - want_path, - false); - mds->send_message_mds(dis, from, MDS_PORT_CACHE); - } - - waiting_for_base_ino[from][want_ino].push_back(onfinish); -} - - -void MDCache::discover_dir_frag(CInode *base, - frag_t approx_fg, - Context *onfinish, - int from) -{ - if (from < 0) from = base->authority().first; - - dout(7) << "discover_dir_frag " << base->ino() << " " << approx_fg - << " from mds" << from << dendl; - - if (!base->is_waiter_for(CInode::WAIT_DIR) || !onfinish) { // this is overly conservative - filepath want_path; - MDiscover *dis = new MDiscover(mds->get_nodeid(), - base->ino(), - want_path, - true); // need the base dir open - dis->set_base_dir_frag(approx_fg); - mds->send_message_mds(dis, from, MDS_PORT_CACHE); - } - - // register + wait - if (onfinish) - base->add_waiter(CInode::WAIT_DIR, onfinish); - discover_dir[from][base->ino()]++; -} - -void MDCache::discover_path(CInode *base, - filepath want_path, - Context *onfinish, - bool want_xlocked, - int from) -{ - if (from < 0) from = base->authority().first; - - dout(7) << "discover_path " << base->ino() << " " << want_path << " from mds" << from - << (want_xlocked ? 
" want_xlocked":"") - << dendl; - - if (base->is_ambiguous_auth()) { - dout(10) << " waiting for single auth on " << *base << dendl; - base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish); - return; - } - - if (!base->is_waiter_for(CInode::WAIT_DIR) || !onfinish) { // this is overly conservative - MDiscover *dis = new MDiscover(mds->get_nodeid(), - base->ino(), - want_path, - true, // we want the base dir; we are relative to ino. - want_xlocked); - mds->send_message_mds(dis, from, MDS_PORT_CACHE); - } - - // register + wait - if (onfinish) base->add_waiter(CInode::WAIT_DIR, onfinish); - discover_dir[from][base->ino()]++; -} - -void MDCache::discover_path(CDir *base, - filepath want_path, - Context *onfinish, - bool want_xlocked) -{ - int from = base->authority().first; - - dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " from mds" << from - << (want_xlocked ? " want_xlocked":"") - << dendl; - - if (base->is_ambiguous_auth()) { - dout(7) << " waiting for single auth on " << *base << dendl; - base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish); - return; - } - - if (!base->is_waiting_for_dentry(want_path[0]) || !onfinish) { - MDiscover *dis = new MDiscover(mds->get_nodeid(), - base->ino(), - want_path, - false, // no base dir; we are relative to dir - want_xlocked); - mds->send_message_mds(dis, from, MDS_PORT_CACHE); - } - - // register + wait - if (onfinish) base->add_dentry_waiter(want_path[0], onfinish); - discover_dir_sub[from][base->dirfrag()]++; -} - -void MDCache::discover_ino(CDir *base, - inodeno_t want_ino, - Context *onfinish, - bool want_xlocked) -{ - int from = base->authority().first; - - dout(7) << "discover_ino " << base->dirfrag() << " " << want_ino << " from mds" << from - << (want_xlocked ? 
" want_xlocked":"") - << dendl; - - if (!base->is_waiting_for_ino(want_ino)) { - MDiscover *dis = new MDiscover(mds->get_nodeid(), - base->dirfrag(), - want_ino, - want_xlocked); - mds->send_message_mds(dis, from, MDS_PORT_CACHE); - } - - // register + wait - base->add_ino_waiter(want_ino, onfinish); - discover_dir_sub[from][base->dirfrag()]++; -} - - - -void MDCache::kick_discovers(int who) -{ - list waiters; - - for (hash_map >::iterator p = waiting_for_base_ino[who].begin(); - p != waiting_for_base_ino[who].end(); - ++p) { - dout(10) << "kick_discovers on base ino " << p->first << dendl; - mds->queue_waiters(p->second); - } - waiting_for_base_ino.erase(who); - - for (hash_map::iterator p = discover_dir[who].begin(); - p != discover_dir[who].end(); - ++p) { - CInode *in = get_inode(p->first); - if (!in) continue; - dout(10) << "kick_discovers dir waiters on " << *in << dendl; - in->take_waiting(CInode::WAIT_DIR, waiters); - } - discover_dir.erase(who); - - for (hash_map::iterator p = discover_dir_sub[who].begin(); - p != discover_dir_sub[who].end(); - ++p) { - CDir *dir = get_dirfrag(p->first); - if (!dir) continue; - dout(10) << "kick_discovers dentry+ino waiters on " << *dir << dendl; - dir->take_sub_waiting(waiters); - } - discover_dir_sub.erase(who); - - mds->queue_waiters(waiters); -} - - - -void MDCache::handle_discover(MDiscover *dis) -{ - int whoami = mds->get_nodeid(); - - assert(dis->get_asker() != whoami); - - /* - if (mds->get_state() < MDSMap::STATE_ACTIVE) { - dout(-7) << "discover_reply NOT ACTIVE YET" << dendl; - delete dis; - return; - } - */ - - - CInode *cur = 0; - MDiscoverReply *reply = new MDiscoverReply(dis); - - // get started. 
- if (dis->get_base_ino() == MDS_INO_ROOT) { - // wants root - dout(7) << "handle_discover from mds" << dis->get_asker() - << " wants root + " << dis->get_want().get_path() << dendl; - - assert(mds->get_nodeid() == 0); - assert(root->is_auth()); - - // add root - reply->add_inode( root->replicate_to( dis->get_asker() ) ); - dout(10) << "added root " << *root << dendl; - - cur = root; - } - else if (dis->get_base_ino() == MDS_INO_STRAY(whoami)) { - // wants root - dout(7) << "handle_discover from mds" << dis->get_asker() - << " wants stray + " << dis->get_want().get_path() << dendl; - - reply->add_inode( stray->replicate_to( dis->get_asker() ) ); - dout(10) << "added stray " << *stray << dendl; - - cur = stray; - } - else { - // there's a base inode - cur = get_inode(dis->get_base_ino()); - - if (!cur) { - dout(7) << "handle_discover mds" << dis->get_asker() - << " don't have base ino " << dis->get_base_ino() - << dendl; - reply->set_flag_error_dir(); - } - - if (dis->wants_base_dir()) { - dout(7) << "handle_discover mds" << dis->get_asker() - << " wants basedir+" << dis->get_want().get_path() - << " has " << *cur - << dendl; - } else { - dout(7) << "handle_discover mds" << dis->get_asker() - << " wants " << dis->get_want().get_path() - << " has " << *cur - << dendl; - } - } - - assert(reply); - - // add content - // do some fidgeting to include a dir if they asked for the base dir, or just root. - for (unsigned i = 0; - cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0); - i++) { - - // -- figure out the dir - - // is *cur even a dir at all? 
- if (!cur->is_dir()) { - dout(7) << *cur << " not a dir" << dendl; - reply->set_flag_error_dir(); - break; - } - - // pick frag - frag_t fg; - if (dis->get_want().depth()) { - // dentry specifies - fg = cur->pick_dirfrag(dis->get_dentry(i)); - } else { - // requester explicity specified the frag - fg = dis->get_base_dir_frag(); - assert(dis->wants_base_dir() || dis->get_want_ino() || dis->get_base_ino() < MDS_INO_BASE); - } - CDir *curdir = cur->get_dirfrag(fg); - - if ((!curdir && !cur->is_auth()) || - (curdir && !curdir->is_auth())) { - - /* before: - * ONLY set flag if empty!! - * otherwise requester will wake up waiter(s) _and_ continue with discover, - * resulting in duplicate discovers in flight, - * which can wreak havoc when discovering rename srcdn (which may move) - */ - - if (reply->is_empty()) { - // only hint if empty. - // someday this could be better, but right now the waiter logic isn't smart enough. - - // hint - if (curdir) { - dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl; - reply->set_dir_auth_hint(curdir->authority().first); - } else { - dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for " - << *cur << dendl; - reply->set_dir_auth_hint(cur->authority().first); - } - - // note error dentry, if any - // NOTE: important, as it allows requester to issue an equivalent discover - // to whomever we hint at. - if (dis->get_want().depth() > i) - reply->set_error_dentry(dis->get_dentry(i)); - } - - break; - } - - // open dir? - if (!curdir) - curdir = cur->get_or_open_dirfrag(this, fg); - assert(curdir); - assert(curdir->is_auth()); - - // is dir frozen? 
- if (curdir->is_frozen()) { - if (reply->is_empty()) { - dout(7) << *curdir << " is frozen, empty reply, waiting" << dendl; - curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis)); - delete reply; - return; - } else { - dout(7) << *curdir << " is frozen, non-empty reply, stopping" << dendl; - break; - } - } - - // add dir - if (reply->is_empty() && !dis->wants_base_dir()) { - dout(7) << "handle_discover not adding unwanted base dir " << *curdir << dendl; - } else { - assert(!curdir->is_ambiguous_auth()); // would be frozen. - reply->add_dir( curdir->replicate_to(dis->get_asker()) ); - dout(7) << "handle_discover added dir " << *curdir << dendl; - } - - // lookup - CDentry *dn = 0; - if (dis->get_want_ino()) { - // lookup by ino - CInode *in = get_inode(dis->get_want_ino()); - if (in && in->is_auth() && in->get_parent_dn()->get_dir() == curdir) - dn = in->get_parent_dn(); - } else if (dis->get_want().depth() > 0) { - // lookup dentry - dn = curdir->lookup( dis->get_dentry(i) ); - } else - break; // done! - - // incomplete dir? - if (!dn) { - if (!curdir->is_complete()) { - // readdir - dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << dendl; - if (reply->is_empty()) { - // fetch and wait - curdir->fetch(new C_MDS_RetryMessage(mds, dis)); - return; - } else { - // initiate fetch, but send what we have so far - curdir->fetch(0); - break; - } - } - - // don't have wanted ino in this dir? - if (dis->get_want_ino()) { - // set error flag in reply - dout(7) << "ino " << dis->get_want_ino() << " in this dir, flagging error in " - << *curdir << dendl; - reply->set_flag_error_ino(); - break; - } - - // send null dentry - dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in " - << *curdir << dendl; - dn = curdir->add_null_dentry(dis->get_dentry(i)); - } - assert(dn); - - // xlocked dentry? 
- // ...always block on non-tail items (they are unrelated) - // ...allow xlocked tail disocvery _only_ if explicitly requested - if (dn->lock.is_xlocked()) { - // is this the last (tail) item in the discover traversal? - bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1); - if (tailitem && dis->wants_xlocked()) { - dout(7) << "handle_discover allowing discovery of xlocked tail " << *dn << dendl; - } else { - dout(7) << "handle_discover blocking on xlocked " << *dn << dendl; - dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis)); - delete reply; - return; - } - } - - // frozen inode? - if (dn->is_primary() && - dn->inode->is_frozen()) { - if (reply->is_empty()) { - dout(7) << *dn->inode << " is frozen, empty reply, waiting" << dendl; - dn->inode->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis)); - delete reply; - return; - } else { - dout(7) << *dn->inode << " is frozen, non-empty reply, stopping" << dendl; - break; - } - } - - // add dentry - reply->add_dentry( dn->replicate_to( dis->get_asker() ) ); - dout(7) << "handle_discover added dentry " << *dn << dendl; - - if (!dn->is_primary()) break; // stop on null or remote link. - - // add inode - CInode *next = dn->inode; - assert(next->is_auth()); - - reply->add_inode( next->replicate_to( dis->get_asker() ) ); - dout(7) << "handle_discover added inode " << *next << dendl; - - // descend, keep going. - cur = next; - continue; - } - - // how did we do? 
- assert(!reply->is_empty()); - dout(7) << "handle_discover sending result back to asker mds" << dis->get_asker() << dendl; - mds->send_message_mds(reply, dis->get_asker(), MDS_PORT_CACHE); - - delete dis; -} - - -void MDCache::handle_discover_reply(MDiscoverReply *m) -{ - /* - if (mds->get_state() < MDSMap::STATE_ACTIVE) { - dout(-7) << "discover_reply NOT ACTIVE YET" << dendl; - delete m; - return; - } - */ - - list finished, error; - int from = m->get_source().num(); - - // starting point - CInode *cur = get_inode(m->get_base_ino()); - - if (m->has_base_inode()) { - assert(m->get_base_ino() < MDS_INO_BASE); - assert(!m->has_base_dentry()); - assert(!m->has_base_dir()); - - // add base inode - cur = add_replica_inode(m->get_inode(0), NULL, finished); - cur->force_auth = pair(m->get_source().num(), CDIR_AUTH_UNKNOWN); - - dout(7) << "discover_reply got base inode " << *cur << dendl; - - // take waiters - finished.swap(waiting_for_base_ino[from][cur->ino()]); - waiting_for_base_ino[from].erase(cur->ino()); - } - assert(cur); - - dout(7) << "discover_reply " << *cur - << " + " << m->get_num_dentries() << " dn, " - << m->get_num_inodes() << " inodes" - << dendl; - - // fyi - if (m->is_flag_error_dir()) - dout(7) << " flag error, dir" << dendl; - if (m->is_flag_error_dn()) - dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl; - if (m->is_flag_error_ino()) - dout(7) << " flag error, ino = " << m->get_wanted_ino() << dendl; - - dout(10) << "depth = " << m->get_depth() - << ", has base_dir/base_dn/root = " - << m->has_base_dir() << " / " << m->has_base_dentry() << " / " << m->has_base_inode() - << ", num dirs/dentries/inodes = " - << m->get_num_dirs() << " / " << m->get_num_dentries() << " / " << m->get_num_inodes() - << dendl; - - // decrement discover counters - if (m->get_wanted_base_dir()) { - inodeno_t ino = m->get_base_ino(); - assert(discover_dir[from].count(ino)); - if (--discover_dir[from][ino] == 0) - discover_dir[from].erase(ino); - } else 
if (m->get_base_ino() >= MDS_INO_BASE) { - dirfrag_t df(m->get_base_ino(), m->get_base_dir_frag()); - assert(discover_dir_sub[from].count(df)); - if (--discover_dir_sub[from][df] == 0) - discover_dir_sub[from].erase(df); - } - - // loop over discover results. - // indexes follow each ([[dir] dentry] inode) - // can start, end with any type. - for (int i=m->has_base_inode(); iget_depth(); i++) { - dout(10) << "discover_reply i=" << i << " cur " << *cur << dendl; - - // dir - frag_t fg; - CDir *curdir = 0; - if (i > 0 || m->has_base_dir()) { - assert(m->get_dir(i).get_dirfrag().ino == cur->ino()); - fg = m->get_dir(i).get_dirfrag().frag; - curdir = add_replica_dir(cur, fg, m->get_dir(i), - m->get_source().num(), - finished); - } - if (!curdir) { - fg = cur->pick_dirfrag(m->get_dentry(i).get_dname()); - curdir = cur->get_dirfrag(fg); - } - - // dentry error? - if (i == m->get_depth()-1 && (m->is_flag_error_dn() || m->is_flag_error_ino())) { - // error! - assert(cur->is_dir()); - if (curdir) { - if (m->get_error_dentry().length()) { - dout(7) << " flag_error on dentry " << m->get_error_dentry() - << ", triggering dentry" << dendl; - curdir->take_dentry_waiting(m->get_error_dentry(), error); - } else { - dout(7) << " flag_error on ino " << m->get_wanted_ino() - << ", triggering ino" << dendl; - curdir->take_ino_waiting(m->get_wanted_ino(), error); - } - } else { - dout(7) << " flag_error on dentry " << m->get_error_dentry() - << ", triggering dir?" << dendl; - cur->take_waiting(CInode::WAIT_DIR, error); - } - break; - } - - assert(curdir); - - // dentry - CDentry *dn = 0; - if (i >= m->get_last_dentry()) break; - if (i > 0 || m->has_base_dentry()) - dn = add_replica_dentry(curdir, m->get_dentry(i), finished); - - // inode - if (i >= m->get_last_inode()) break; - cur = add_replica_inode(m->get_inode(i), dn, finished); - } - - // dir error? - // or dir_auth hint? - if (m->is_flag_error_dir() && !cur->is_dir()) { - // not a dir. 
- cur->take_waiting(CInode::WAIT_DIR, error); - } else if (m->is_flag_error_dir() || - (m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN && - m->get_dir_auth_hint() != mds->get_nodeid())) { - int who = m->get_dir_auth_hint(); - if (who == mds->get_nodeid()) who = -1; - if (who >= 0) - dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl; - - // try again? - if (m->get_error_dentry().length()) { - // wanted a dentry - frag_t fg = cur->pick_dirfrag(m->get_error_dentry()); - CDir *dir = cur->get_dirfrag(fg); - if (dir) { - // don't actaully need the hint, now - if (dir->lookup(m->get_error_dentry()) == 0 && - dir->is_waiting_for_dentry(m->get_error_dentry())) - discover_path(dir, m->get_error_dentry(), 0, m->get_wanted_xlocked()); - else - dout(7) << " doing nothing, have dir but nobody is waiting on dentry " - << m->get_error_dentry() << dendl; - } else { - if (cur->is_waiter_for(CInode::WAIT_DIR)) - discover_path(cur, m->get_error_dentry(), 0, m->get_wanted_xlocked(), who); - else - dout(7) << " doing nothing, nobody is waiting for dir" << dendl; - } - } else { - // wanted just the dir - frag_t fg = m->get_base_dir_frag(); - if (cur->get_dirfrag(fg) == 0 && cur->is_waiter_for(CInode::WAIT_DIR)) - discover_dir_frag(cur, fg, 0, who); - else - dout(7) << " doing nothing, nobody is waiting for dir" << dendl; - } - } - - // waiters - finish_contexts(error, -ENOENT); // finish errors directly - mds->queue_waiters(finished); - - // done - delete m; -} - - - -// ---------------------------- -// REPLICAS - -CDir *MDCache::add_replica_dir(CInode *diri, - frag_t fg, CDirDiscover &dis, int from, - list& finished) -{ - // add it (_replica_) - CDir *dir = diri->get_dirfrag(fg); - - if (dir) { - // had replica. update w/ new nonce. 
- dis.update_dir(dir); - dout(7) << "add_replica_dir had " << *dir << " nonce " << dir->replica_nonce << dendl; - } else { - // force frag to leaf in the diri tree - if (!diri->dirfragtree.is_leaf(fg)) { - dout(7) << "add_replica_dir forcing frag " << fg << " to leaf in the fragtree " - << diri->dirfragtree << dendl; - diri->dirfragtree.force_to_leaf(fg); - } - - // add replica. - dir = diri->add_dirfrag( new CDir(diri, fg, this, false) ); - dis.update_dir(dir); - - // is this a dir_auth delegation boundary? - if (from != diri->authority().first || - diri->is_ambiguous_auth() || - diri->ino() < MDS_INO_BASE) - adjust_subtree_auth(dir, from); - - dout(7) << "add_replica_dir added " << *dir << " nonce " << dir->replica_nonce << dendl; - - // get waiters - diri->take_waiting(CInode::WAIT_DIR, finished); - } - - return dir; -} - -CDir *MDCache::forge_replica_dir(CInode *diri, frag_t fg, int from) -{ - assert(mds->mdsmap->get_state(from) < MDSMap::STATE_REJOIN); - - // forge a replica. - CDir *dir = diri->add_dirfrag( new CDir(diri, fg, this, false) ); - - // i'm assuming this is a subtree root. - adjust_subtree_auth(dir, from); - - dout(7) << "forge_replica_dir added " << *dir << " while mds" << from << " is down" << dendl; - - return dir; -} - -CDentry *MDCache::add_replica_dentry(CDir *dir, CDentryDiscover &dis, list& finished) -{ - CDentry *dn = dir->lookup( dis.get_dname() ); - - // have it? - if (dn) { - dis.update_dentry(dn); - dout(7) << "add_replica_dentry had " << *dn << dendl; - } else { - dn = dir->add_null_dentry(dis.get_dname()); - dis.update_dentry(dn); - dis.init_dentry_lock(dn); - dout(7) << "add_replica_dentry added " << *dn << dendl; - } - - // remote_ino linkage? - if (dis.get_remote_ino()) { - if (dn->is_null()) - dir->link_remote_inode(dn, dis.get_remote_ino(), dis.get_remote_d_type()); - - // hrm. yeah. 
- assert(dn->is_remote() && dn->get_remote_ino() == dis.get_remote_ino()); - } - - dir->take_dentry_waiting(dis.get_dname(), finished); - - return dn; -} - -CInode *MDCache::add_replica_inode(CInodeDiscover& dis, CDentry *dn, list& finished) -{ - CInode *in = get_inode(dis.get_ino()); - if (!in) { - in = new CInode(this, false); - dis.update_inode(in); - dis.init_inode_locks(in); - add_inode(in); - dout(10) << "add_replica_inode had " << *in << dendl; - if (dn && dn->is_null()) - dn->dir->link_primary_inode(dn, in); - } else { - dis.update_inode(in); - dout(10) << "add_replica_inode added " << *in << dendl; - } - - if (dn) { - assert(dn->is_primary()); - assert(dn->inode == in); - - dn->get_dir()->take_ino_waiting(in->ino(), finished); - } - - return in; -} - - -CDentry *MDCache::add_replica_stray(bufferlist &bl, CInode *in, int from) -{ - list finished; - int off = 0; - - // inode - CInodeDiscover indis; - indis._decode(bl, off); - CInode *strayin = add_replica_inode(indis, NULL, finished); - strayin->force_auth = pair(from, CDIR_AUTH_UNKNOWN); - dout(15) << "strayin " << *strayin << dendl; - - // dir - CDirDiscover dirdis; - dirdis._decode(bl, off); - CDir *straydir = add_replica_dir(strayin, dirdis.get_dirfrag().frag, dirdis, - from, finished); - dout(15) << "straydir " << *straydir << dendl; - - // dentry - CDentryDiscover dndis; - dndis._decode(bl, off); - - string straydname; - in->name_stray_dentry(straydname); - CDentry *straydn = add_replica_dentry(straydir, dndis, finished); - - mds->queue_waiters(finished); - - return straydn; -} - - - -/* -int MDCache::send_inode_updates(CInode *in) -{ - assert(in->is_auth()); - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - dout(7) << "sending inode_update on " << *in << " to " << *it << dendl; - assert(*it != mds->get_nodeid()); - mds->send_message_mds(new MInodeUpdate(in, in->get_cached_by_nonce(*it)), *it, MDS_PORT_CACHE); - } - - return 0; -} - - -void 
MDCache::handle_inode_update(MInodeUpdate *m) -{ - inodeno_t ino = m->get_ino(); - CInode *in = get_inode(m->get_ino()); - if (!in) { - //dout(7) << "inode_update on " << m->get_ino() << ", don't have it, ignoring" << dendl; - dout(7) << "inode_update on " << m->get_ino() << ", don't have it, sending expire" << dendl; - MCacheExpire *expire = new MCacheExpire(mds->get_nodeid()); - expire->add_inode(m->get_ino(), m->get_nonce()); - mds->send_message_mds(expire, m->get_source().num(), MDS_PORT_CACHE); - goto out; - } - - if (in->is_auth()) { - dout(7) << "inode_update on " << *in << ", but i'm the authority!" << dendl; - assert(0); // this should never happen - } - - dout(7) << "inode_update on " << *in << dendl; - - // update! NOTE dir_auth is unaffected by this. - in->decode_basic_state(m->get_payload()); - - out: - // done - delete m; -} -*/ - - - - - - -int MDCache::send_dir_updates(CDir *dir, bool bcast) -{ - // this is an FYI, re: replication - - set who; - if (bcast) { - mds->get_mds_map()->get_active_mds_set(who); - } else { - for (map::iterator p = dir->replicas_begin(); - p != dir->replicas_end(); - ++p) - who.insert(p->first); - } - - dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl; - - string path; - dir->inode->make_path(path); - - int whoami = mds->get_nodeid(); - for (set::iterator it = who.begin(); - it != who.end(); - it++) { - if (*it == whoami) continue; - //if (*it == except) continue; - dout(7) << "sending dir_update on " << *dir << " to " << *it << dendl; - - mds->send_message_mds(new MDirUpdate(dir->dirfrag(), - dir->dir_rep, - dir->dir_rep_by, - path, - bcast), - *it, MDS_PORT_CACHE); - } - - return 0; -} - - -void MDCache::handle_dir_update(MDirUpdate *m) -{ - CDir *dir = get_dirfrag(m->get_dirfrag()); - if (!dir) { - dout(5) << "dir_update on " << m->get_dirfrag() << ", don't have it" << dendl; - - // discover it? - if (m->should_discover()) { - // only try once! 
- // this is key to avoid a fragtree update race, among other things. - m->tried_discover(); - vector trace; - filepath path = m->get_path(); - - dout(5) << "trying discover on dir_update for " << path << dendl; - - int r = path_traverse(0, m, - 0, path, trace, true, - MDS_TRAVERSE_DISCOVER); - if (r > 0) - return; - assert(r == 0); - - CInode *in = get_inode(m->get_dirfrag().ino); - assert(in); - open_remote_dirfrag(in, m->get_dirfrag().frag, - new C_MDS_RetryMessage(mds, m)); - return; - } - - delete m; - return; - } - - // update - dout(5) << "dir_update on " << *dir << dendl; - dir->dir_rep = m->get_dir_rep(); - dir->dir_rep_by = m->get_dir_rep_by(); - - // done - delete m; -} - - - - - - -// UNLINK - -void MDCache::handle_dentry_unlink(MDentryUnlink *m) -{ - CDir *dir = get_dirfrag(m->get_dirfrag()); - - if (!dir) { - dout(7) << "handle_dentry_unlink don't have dirfrag " << m->get_dirfrag() << dendl; - } - else { - CDentry *dn = dir->lookup(m->get_dn()); - if (!dn) { - dout(7) << "handle_dentry_unlink don't have dentry " << *dir << " dn " << m->get_dn() << dendl; - } else { - dout(7) << "handle_dentry_unlink on " << *dn << dendl; - - // move to stray? - CDentry *straydn = 0; - if (m->strayin) { - list finished; - CInode *in = add_replica_inode(*m->strayin, NULL, finished); - CDir *dir = add_replica_dir(in, m->straydir->get_dirfrag().frag, *m->straydir, - m->get_source().num(), finished); - straydn = add_replica_dentry(dir, *m->straydn, finished); - if (!finished.empty()) mds->queue_waiters(finished); - } - - // open inode? 
- if (dn->is_primary()) { - CInode *in = dn->inode; - dn->dir->unlink_inode(dn); - assert(straydn); - straydn->dir->link_primary_inode(straydn, in); - } else { - assert(dn->is_remote()); - dn->dir->unlink_inode(dn); - } - assert(dn->is_null()); - - // move to bottom of lru - lru.lru_bottouch(dn); - } - } - - delete m; - return; -} - - - - - - -// =================================================================== -// FRAGMENT - - -/** - * adjust_dir_fragments -- adjust fragmentation for a directory - * - * @diri - directory inode - * @basefrag - base fragment - * @bits - bit adjustment. positive for split, negative for merge. - */ -void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits, - list& resultfrags, - list& waiters) -{ - dout(10) << "adjust_dir_fragments " << basefrag << " " << bits - << " on " << *diri << dendl; - - // yuck. we may have discovered the inode while it was being fragmented. - if (!diri->dirfragtree.is_leaf(basefrag)) - diri->dirfragtree.force_to_leaf(basefrag); - - CDir *base = diri->get_or_open_dirfrag(this, basefrag); - - // adjust fragtree - diri->dirfragtree.split(basefrag, bits); - dout(10) << " new fragtree is " << diri->dirfragtree << dendl; - - if (bits > 0) { - if (base) { - CDir *baseparent = base->get_parent_dir(); - - base->split(bits, resultfrags, waiters); - - // did i change the subtree map? - if (base->is_subtree_root()) { - // am i a bound? - if (baseparent) { - CDir *parent = get_subtree_root(baseparent); - assert(subtrees[parent].count(base)); - subtrees[parent].erase(base); - for (list::iterator p = resultfrags.begin(); - p != resultfrags.end(); - ++p) { - subtrees[parent].insert(*p); - subtrees[*p].clear(); // new frag is now its own subtree - } - } - - // adjust my bounds. 
- set bounds; - bounds.swap(subtrees[base]); - subtrees.erase(base); - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *frag = get_subtree_root((*p)->get_parent_dir()); - subtrees[frag].insert(*p); - } - - show_subtrees(10); - } - } - } else { - assert(base); - base->merge(bits, waiters); - resultfrags.push_back(base); - assert(0); // FIXME adjust subtree map! and clean up this code, probably. - } -} - -class C_MDC_FragmentGo : public Context { - MDCache *mdcache; - CInode *diri; - list dirs; - frag_t basefrag; - int bits; -public: - C_MDC_FragmentGo(MDCache *m, CInode *di, list& dls, frag_t bf, int b) : - mdcache(m), diri(di), dirs(dls), basefrag(bf), bits(b) { } - virtual void finish(int r) { - mdcache->fragment_go(diri, dirs, basefrag, bits); - } -}; - -void MDCache::split_dir(CDir *dir, int bits) -{ - dout(7) << "split_dir " << *dir << " bits " << bits << dendl; - assert(dir->is_auth()); - - if (mds->mdsmap->is_degraded()) { - dout(7) << "cluster degraded, no fragmenting for now" << dendl; - return; - } - if (dir->inode->is_root()) { - dout(7) << "i won't fragment root" << dendl; - //assert(0); - return; - } - if (dir->state_test(CDir::STATE_FRAGMENTING)) { - dout(7) << "already fragmenting" << dendl; - return; - } - if (!dir->can_auth_pin()) { - dout(7) << "not authpinnable on " << *dir << dendl; - return; - } - - list startfrags; - startfrags.push_back(dir); - - dir->state_set(CDir::STATE_FRAGMENTING); - - fragment_freeze(dir->get_inode(), startfrags, dir->get_frag(), bits); - fragment_mark_and_complete(dir->get_inode(), startfrags, dir->get_frag(), bits); -} - -/* - * initial the freeze, blocking with an auth_pin. - * - * some reason(s) we have to freeze: - * - on merge, version/projected version are unified from all fragments; - * concurrent pipelined updates in the directory will have divergent - * versioning... and that's no good. 
- */ -void MDCache::fragment_freeze(CInode *diri, list& frags, frag_t basefrag, int bits) -{ - C_Gather *gather = new C_Gather(new C_MDC_FragmentGo(this, diri, frags, basefrag, bits)); - - // freeze the dirs - for (list::iterator p = frags.begin(); - p != frags.end(); - ++p) { - CDir *dir = *p; - dir->auth_pin(); // this will block the freeze - dir->freeze_dir(); - assert(dir->is_freezing_dir()); - dir->add_waiter(CDir::WAIT_FROZEN, gather->new_sub()); - } -} - -class C_MDC_FragmentMarking : public Context { - MDCache *mdcache; - CInode *diri; - list dirs; - frag_t basefrag; - int bits; -public: - C_MDC_FragmentMarking(MDCache *m, CInode *di, list& dls, frag_t bf, int b) : - mdcache(m), diri(di), dirs(dls), basefrag(bf), bits(b) { } - virtual void finish(int r) { - mdcache->fragment_mark_and_complete(diri, dirs, basefrag, bits); - } -}; - -void MDCache::fragment_mark_and_complete(CInode *diri, - list& startfrags, - frag_t basefrag, int bits) -{ - dout(10) << "fragment_mark_and_complete " << basefrag << " by " << bits - << " on " << *diri << dendl; - - C_Gather *gather = 0; - - for (list::iterator p = startfrags.begin(); - p != startfrags.end(); - ++p) { - CDir *dir = *p; - - if (!dir->is_complete()) { - dout(15) << " fetching incomplete " << *dir << dendl; - if (!gather) gather = new C_Gather(new C_MDC_FragmentMarking(this, diri, startfrags, basefrag, bits)); - dir->fetch(gather->new_sub(), - true); // ignore authpinnability - } - else if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) { - dout(15) << " marking " << *dir << dendl; - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - p->second->get(CDentry::PIN_FRAGMENTING); - p->second->state_set(CDentry::STATE_FRAGMENTING); - } - dir->state_set(CDir::STATE_DNPINNEDFRAG); - dir->auth_unpin(); // allow our freeze to complete - } - else { - dout(15) << " marked " << *dir << dendl; - } - } -} - - -class C_MDC_FragmentStored : public Context { - MDCache *mdcache; - CInode *diri; - 
frag_t basefrag; - int bits; - list resultfrags; -public: - C_MDC_FragmentStored(MDCache *m, CInode *di, frag_t bf, int b, - list& rf) : - mdcache(m), diri(di), basefrag(bf), bits(b), resultfrags(rf) { } - virtual void finish(int r) { - mdcache->fragment_stored(diri, basefrag, bits, resultfrags); - } -}; - -void MDCache::fragment_go(CInode *diri, list& startfrags, frag_t basefrag, int bits) -{ - dout(10) << "fragment_go " << basefrag << " by " << bits - << " on " << *diri << dendl; - - // refragment - list resultfrags; - list waiters; - adjust_dir_fragments(diri, basefrag, bits, resultfrags, waiters); - mds->queue_waiters(waiters); - - C_Gather *gather = new C_Gather(new C_MDC_FragmentStored(this, diri, basefrag, bits, resultfrags)); - - // freeze, store resulting frags - for (list::iterator p = resultfrags.begin(); - p != resultfrags.end(); - p++) { - CDir *dir = *p; - dout(10) << " result frag " << *dir << dendl; - dir->state_set(CDir::STATE_FRAGMENTING); - dir->commit(0, gather->new_sub()); - dir->_freeze_dir(); - } -} - -class C_MDC_FragmentLogged : public Context { - MDCache *mdcache; - CInode *diri; - frag_t basefrag; - int bits; - list resultfrags; - vector pvs; - LogSegment *ls; -public: - C_MDC_FragmentLogged(MDCache *m, CInode *di, frag_t bf, int b, - list& rf, vector& p, - LogSegment *s) : - mdcache(m), diri(di), basefrag(bf), bits(b), ls(s) { - resultfrags.swap(rf); - pvs.swap(p); - } - virtual void finish(int r) { - mdcache->fragment_logged(diri, basefrag, bits, - resultfrags, pvs, - ls); - } -}; - -void MDCache::fragment_stored(CInode *diri, frag_t basefrag, int bits, - list& resultfrags) -{ - dout(10) << "fragment_stored " << basefrag << " by " << bits - << " on " << *diri << dendl; - - EFragment *le = new EFragment(mds->mdlog, diri->ino(), basefrag, bits); - - set peers; - vector pvs; - for (list::iterator p = resultfrags.begin(); - p != resultfrags.end(); - p++) { - CDir *dir = *p; - dout(10) << " result frag " << *dir << dendl; - - if (p == 
resultfrags.begin()) { - le->metablob.add_dir_context(dir); - // note peers - // only do this once: all frags have identical replica_maps. - if (peers.empty()) - for (map::iterator p = dir->replica_map.begin(); - p != dir->replica_map.end(); - ++p) - peers.insert(p->first); - } - - pvs.push_back(dir->pre_dirty()); - le->metablob.add_dir(dir, true); - } - - mds->mdlog->submit_entry(le, - new C_MDC_FragmentLogged(this, diri, basefrag, bits, - resultfrags, pvs, mds->mdlog->get_current_segment())); - - // announcelist& resultfrags, - for (set::iterator p = peers.begin(); - p != peers.end(); - ++p) { - MMDSFragmentNotify *notify = new MMDSFragmentNotify(diri->ino(), basefrag, bits); - if (bits < 0) { - // freshly replicate basedir to peer on merge - CDir *base = resultfrags.front(); - CDirDiscover *basedis = base->replicate_to(*p); - basedis->_encode(notify->basebl); - delete basedis; - } - mds->send_message_mds(notify, *p, MDS_PORT_CACHE); - } - -} - -void MDCache::fragment_logged(CInode *diri, frag_t basefrag, int bits, - list& resultfrags, - vector& pvs, - LogSegment *ls) -{ - dout(10) << "fragment_logged " << basefrag << " bits " << bits - << " on " << *diri << dendl; - - - // dirty resulting frags - set peers; - vector::iterator pv = pvs.begin(); - for (list::iterator p = resultfrags.begin(); - p != resultfrags.end(); - p++) { - CDir *dir = *p; - dout(10) << " result frag " << *dir << dendl; - - // dirty, unpin, unfreeze - dir->state_clear(CDir::STATE_FRAGMENTING); - dir->mark_dirty(*pv, ls); - pv++; - - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - if (dn->state_test(CDentry::STATE_FRAGMENTING)) - dn->put(CDentry::PIN_FRAGMENTING); - } - - dir->unfreeze_dir(); - } -} - - - -void MDCache::handle_fragment_notify(MMDSFragmentNotify *notify) -{ - dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl; - - CInode *diri = get_inode(notify->get_ino()); - if (diri) { 
- list waiters; - - // add replica dir (for merge)? - // (adjust_dir_fragments expects base to already exist, if non-auth) - if (notify->get_bits() < 0) { - CDirDiscover basedis; - int off = 0; - basedis._decode(notify->basebl, off); - add_replica_dir(diri, notify->get_basefrag(), basedis, - notify->get_source().num(), waiters); - } - - // refragment - list resultfrags; - adjust_dir_fragments(diri, notify->get_basefrag(), notify->get_bits(), - resultfrags, waiters); - mds->queue_waiters(waiters); - } - - delete notify; -} - - - - - -// ============================================================== -// debug crap - -void MDCache::show_subtrees(int dbl) -{ - //dout(10) << "show_subtrees" << dendl; - - if (dbl > g_conf.debug && dbl > g_conf.debug_mds) - return; // i won't print anything. - - if (subtrees.empty()) { - dout(dbl) << "show_subtrees - no subtrees" << dendl; - return; - } - - // root frags - list basefrags; - for (set::iterator p = base_inodes.begin(); - p != base_inodes.end(); - ++p) - (*p)->get_dirfrags(basefrags); - //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl; - dout(15) << "show_subtrees" << dendl; - - // queue stuff - list > q; - string indent; - set seen; - - // calc max depth - for (list::iterator p = basefrags.begin(); p != basefrags.end(); ++p) - q.push_back(pair(*p, 0)); - - set subtrees_seen; - - int depth = 0; - while (!q.empty()) { - CDir *dir = q.front().first; - int d = q.front().second; - q.pop_front(); - - if (subtrees.count(dir) == 0) continue; - - subtrees_seen.insert(dir); - - if (d > depth) depth = d; - - // sanity check - //dout(25) << "saw depth " << d << " " << *dir << dendl; - if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << dendl; - assert(seen.count(dir) == 0); - seen.insert(dir); - - // nested items? 
- if (!subtrees[dir].empty()) { - for (set::iterator p = subtrees[dir].begin(); - p != subtrees[dir].end(); - ++p) { - //dout(25) << " saw sub " << **p << dendl; - q.push_front(pair(*p, d+1)); - } - } - } - - - // print tree - for (list::iterator p = basefrags.begin(); p != basefrags.end(); ++p) - q.push_back(pair(*p, 0)); - - while (!q.empty()) { - CDir *dir = q.front().first; - int d = q.front().second; - q.pop_front(); - - if (subtrees.count(dir) == 0) continue; - - // adjust indenter - while ((unsigned)d < indent.size()) - indent.resize(d); - - // pad - string pad = "______________________________________"; - pad.resize(depth*2+1-indent.size()); - if (!subtrees[dir].empty()) - pad[0] = '.'; // parent - - - string auth; - if (dir->is_auth()) - auth = "auth "; - else - auth = " rep "; - - char s[10]; - if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN) - sprintf(s, "%2d ", dir->get_dir_auth().first); - else - sprintf(s, "%2d,%2d", dir->get_dir_auth().first, dir->get_dir_auth().second); - - // print - dout(dbl) << indent << "|_" << pad << s << " " << auth << *dir << dendl; - - if (dir->ino() == MDS_INO_ROOT) - assert(dir->inode == root); - if (dir->ino() == MDS_INO_STRAY(mds->get_nodeid())) - assert(dir->inode == stray); - - // nested items? - if (!subtrees[dir].empty()) { - // more at my level? 
- if (!q.empty() && q.front().second == d) - indent += "| "; - else - indent += " "; - - for (set::iterator p = subtrees[dir].begin(); - p != subtrees[dir].end(); - ++p) - q.push_front(pair(*p, d+2)); - } - } - - // verify there isn't stray crap in subtree map - int lost = 0; - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - if (subtrees_seen.count(p->first)) continue; - dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl; - lost++; - } - assert(lost == 0); -} - - -void MDCache::show_cache() -{ - dout(7) << "show_cache" << dendl; - - for (hash_map::iterator it = inode_map.begin(); - it != inode_map.end(); - it++) { - // unlinked? - if (!it->second->parent) - dout(7) << " unlinked " << *it->second << dendl; - - // dirfrags? - list dfs; - it->second->get_dirfrags(dfs); - for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) { - CDir *dir = *p; - dout(7) << " dirfrag " << *dir << dendl; - - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - dout(7) << " dentry " << *dn << dendl; - if (dn->is_primary() && dn->inode) - dout(7) << " inode " << *dn->inode << dendl; - } - } - } -} - - -void MDCache::dump_cache() -{ - if (g_conf.debug_mds < 2) return; - - char fn[20]; - sprintf(fn, "cachedump.%d.mds%d", mds->mdsmap->get_epoch(), mds->get_nodeid()); - - dout(1) << "dump_cache to " << fn << dendl; - - ofstream myfile; - myfile.open(fn); - - for (hash_map::iterator it = inode_map.begin(); - it != inode_map.end(); - it++) { - list dfs; - it->second->get_dirfrags(dfs); - for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) { - CDir *dir = *p; - myfile << *dir->inode << std::endl; - myfile << *dir << std::endl; - - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - myfile << *dn << std::endl; - } - } - } - - myfile.close(); -} diff --git a/branches/sage/crush/mds/MDCache.h 
b/branches/sage/crush/mds/MDCache.h deleted file mode 100644 index 86e3b894c6c8d..0000000000000 --- a/branches/sage/crush/mds/MDCache.h +++ /dev/null @@ -1,721 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __MDCACHE_H -#define __MDCACHE_H - -#include -#include -#include -#include -#include - -#include "include/types.h" -#include "include/filepath.h" - -#include "CInode.h" -#include "CDentry.h" -#include "CDir.h" -#include "include/Context.h" -#include "events/EMetaBlob.h" - -class MDS; -class Migrator; -class Renamer; - -class Logger; - -class Message; - -class MMDSResolve; -class MMDSResolveAck; -class MMDSCacheRejoin; -class MMDSCacheRejoinAck; -class MDiscover; -class MDiscoverReply; -class MCacheExpire; -class MDirUpdate; -class MDentryUnlink; -class MLock; - -class Message; -class MClientRequest; -class MMDSSlaveRequest; - -class MMDSFragmentNotify; - -class ESubtreeMap; - - -// MDCache - -//typedef const char* pchar; - - -struct PVList { - map ls; - - version_t add(MDSCacheObject* o, version_t v) { - return ls[o] = v; - } -}; - -/** active_request_t - * state we track for requests we are currently processing. - * mostly information about locks held, so that we can drop them all - * the request is finished or forwarded. see request_*(). - */ -struct MDRequest { - metareqid_t reqid; - - // -- i am a client (master) request - MClientRequest *client_request; // client request (if any) - - vector trace; // original path traversal. - CInode *ref; // reference inode. if there is only one, and its path is pinned. 
- - // -- i am a slave request - MMDSSlaveRequest *slave_request; // slave request (if one is pending; implies slave == true) - int slave_to_mds; // this is a slave request if >= 0. - - // -- misc -- - LogSegment *ls; // the log segment i'm committing to - utime_t now; - - // -- my pins and locks -- - // cache pins (so things don't expire) - set< MDSCacheObject* > pins; - set stickydirs; - - // auth pins - set< MDSCacheObject* > remote_auth_pins; - set< MDSCacheObject* > auth_pins; - - // held locks - set< SimpleLock* > rdlocks; // always local. - set< SimpleLock* > wrlocks; // always local. - set< SimpleLock* > xlocks; // local or remote. - set< SimpleLock*, SimpleLock::ptr_lt > locks; // full ordering - - // if this flag is set, do not attempt to acquire further locks. - // (useful for wrlock, which may be a moving auth target) - bool done_locking; - bool committing; - bool aborted; - - struct More { - set slaves; // mds nodes that have slave requests to me (implies client_request) - set waiting_on_slave; // peers i'm waiting for slavereq replies from. 
- - // for rename/link/unlink - set witnessed; // nodes who have journaled a RenamePrepare - map pvmap; - - // for rename - set extra_witnesses; // replica list from srcdn auth (rename) - version_t src_reanchor_atid; // src->dst - version_t dst_reanchor_atid; // dst->stray - bufferlist inode_import; - version_t inode_import_v; - CInode* destdn_was_remote_inode; - bool was_link_merge; - - // called when slave commits or aborts - Context *slave_commit; - - More() : - src_reanchor_atid(0), dst_reanchor_atid(0), inode_import_v(0), - destdn_was_remote_inode(0), was_link_merge(false), - slave_commit(0) { } - } *_more; - - - // --------------------------------------------------- - MDRequest() : - client_request(0), ref(0), - slave_request(0), slave_to_mds(-1), - ls(0), - done_locking(false), committing(false), aborted(false), - _more(0) {} - MDRequest(metareqid_t ri, MClientRequest *req) : - reqid(ri), client_request(req), ref(0), - slave_request(0), slave_to_mds(-1), - ls(0), - done_locking(false), committing(false), aborted(false), - _more(0) {} - MDRequest(metareqid_t ri, int by) : - reqid(ri), client_request(0), ref(0), - slave_request(0), slave_to_mds(by), - ls(0), - done_locking(false), committing(false), aborted(false), - _more(0) {} - ~MDRequest() { - delete _more; - } - - bool is_master() { return slave_to_mds < 0; } - bool is_slave() { return slave_to_mds >= 0; } - - More* more() { - if (!_more) _more = new More(); - return _more; - } - - bool slave_did_prepare() { return more()->slave_commit; } - - - // pin items in cache - void pin(MDSCacheObject *o) { - if (pins.count(o) == 0) { - o->get(MDSCacheObject::PIN_REQUEST); - pins.insert(o); - } - } - void set_stickydirs(CInode *in) { - if (stickydirs.count(in) == 0) { - in->get_stickydirs(); - stickydirs.insert(in); - } - } - - // auth pins - bool is_auth_pinned(MDSCacheObject *object) { - return auth_pins.count(object) || remote_auth_pins.count(object); - } - void auth_pin(MDSCacheObject *object) { - if 
(!is_auth_pinned(object)) { - object->auth_pin(); - auth_pins.insert(object); - } - } - void auth_unpin(MDSCacheObject *object) { - assert(is_auth_pinned(object)); - object->auth_unpin(); - auth_pins.erase(object); - } - void drop_local_auth_pins() { - for (set::iterator it = auth_pins.begin(); - it != auth_pins.end(); - it++) { - assert((*it)->is_auth()); - (*it)->auth_unpin(); - } - auth_pins.clear(); - } -}; - -inline ostream& operator<<(ostream& out, MDRequest &mdr) -{ - out << "request(" << mdr.reqid; - //if (mdr.request) out << " " << *mdr.request; - if (mdr.is_slave()) out << " slave_to mds" << mdr.slave_to_mds; - if (mdr.client_request) out << " cr=" << mdr.client_request; - if (mdr.slave_request) out << " sr=" << mdr.slave_request; - out << ")"; - return out; -} - -struct MDSlaveUpdate { - EMetaBlob commit; - EMetaBlob rollback; - xlist::item xlistitem; - Context *waiter; - MDSlaveUpdate() : xlistitem(this), waiter(0) {} - MDSlaveUpdate(EMetaBlob c, EMetaBlob r, xlist &list) : - commit(c), rollback(r), - xlistitem(this), - waiter(0) { - list.push_back(&xlistitem); - } - ~MDSlaveUpdate() { - if (waiter) waiter->finish(0); - delete waiter; - } -}; - - -class MDCache { - public: - // my master - MDS *mds; - - // -- my cache -- - LRU lru; // dentry lru for expiring items from cache - protected: - hash_map inode_map; // map of inodes by ino - CInode *root; // root inode - CInode *stray; // my stray dir - - set base_inodes; // inodes < MDS_INO_BASE (root, stray, etc.) - - // -- discover -- - // waiters - map > > waiting_for_base_ino; - - // in process discovers, by mds. - // this is just enough info to kick any waiters in the event of a failure. - // FIXME: use pointers here instead of identifiers? 
- map > discover_dir; - map > discover_dir_sub; - - void discover_base_ino(inodeno_t want_ino, Context *onfinish, int from=-1); - void discover_dir_frag(CInode *base, frag_t approx_fg, Context *onfinish, - int from=-1); - void discover_path(CInode *base, filepath want_path, Context *onfinish, - bool want_xlocked=false, int from=-1); - void discover_path(CDir *base, filepath want_path, Context *onfinish, - bool want_xlocked=false); - void discover_ino(CDir *base, inodeno_t want_ino, Context *onfinish, - bool want_xlocked=false); - - void kick_discovers(int who); // after a failure. - - -public: - int get_num_inodes() { return inode_map.size(); } - int get_num_dentries() { return lru.lru_get_size(); } - - - // -- subtrees -- -protected: - map > subtrees; // nested bounds on subtrees. - - // adjust subtree auth specification - // dir->dir_auth - // imports/exports/nested_exports - // join/split subtrees as appropriate -public: - bool is_subtrees() { return !subtrees.empty(); } - void list_subtrees(list& ls); - void adjust_subtree_auth(CDir *root, pair auth); - void adjust_subtree_auth(CDir *root, int a, int b=CDIR_AUTH_UNKNOWN) { - adjust_subtree_auth(root, pair(a,b)); - } - void adjust_bounded_subtree_auth(CDir *dir, set& bounds, pair auth); - void adjust_bounded_subtree_auth(CDir *dir, set& bounds, int a) { - adjust_bounded_subtree_auth(dir, bounds, pair(a, CDIR_AUTH_UNKNOWN)); - } - void adjust_bounded_subtree_auth(CDir *dir, list& bounds, pair auth); - void adjust_bounded_subtree_auth(CDir *dir, list& bounds, int a) { - adjust_bounded_subtree_auth(dir, bounds, pair(a, CDIR_AUTH_UNKNOWN)); - } - void map_dirfrag_set(list& dfs, set& result); - void try_subtree_merge(CDir *root); - void try_subtree_merge_at(CDir *root); - void subtree_merge_writebehind_finish(CInode *in, LogSegment *ls); - void eval_subtree_root(CDir *dir); - CDir *get_subtree_root(CDir *dir); - void remove_subtree(CDir *dir); - void get_subtree_bounds(CDir *root, set& bounds); - void 
get_wouldbe_subtree_bounds(CDir *root, set& bounds); - void verify_subtree_bounds(CDir *root, const set& bounds); - void verify_subtree_bounds(CDir *root, const list& bounds); - - void adjust_subtree_after_rename(CInode *diri, CDir *olddir); - - void get_auth_subtrees(set& s); - void get_fullauth_subtrees(set& s); - - int num_subtrees(); - int num_subtrees_fullauth(); - int num_subtrees_fullnonauth(); - - -protected: - // delayed cache expire - map > delayed_expire; // subtree root -> expire msg - - - // -- requests -- -protected: - hash_map active_requests; - -public: - MDRequest* request_start(MClientRequest *req); - MDRequest* request_start_slave(metareqid_t rid, int by); - bool have_request(metareqid_t rid) { - return active_requests.count(rid); - } - MDRequest* request_get(metareqid_t rid); - void request_pin_ref(MDRequest *r, CInode *ref, vector& trace); - void request_finish(MDRequest *mdr); - void request_forward(MDRequest *mdr, int mds, int port=0); - void dispatch_request(MDRequest *mdr); - void request_forget_foreign_locks(MDRequest *mdr); - void request_cleanup(MDRequest *r); - - - // inode purging - map > purging; // inode -> newsize -> oldsize - map > purging_ls; - map > > waiting_for_purge; - - // -- recovery -- -protected: - set recovery_set; - -public: - void set_recovery_set(set& s); - void handle_mds_failure(int who); - void handle_mds_recovery(int who); - -protected: - // [resolve] - // from EImportStart w/o EImportFinish during journal replay - map > my_ambiguous_imports; - // from MMDSResolves - map > > other_ambiguous_imports; - - map > uncommitted_slave_updates; // for replay. - map ambiguous_slave_updates; // for log trimming. 
- map waiting_for_slave_update_commit; - friend class ESlaveUpdate; - - set wants_resolve; // nodes i need to send my resolve to - set got_resolve; // nodes i got resolves from - set need_resolve_ack; // nodes i need a resolve_ack from - - void handle_resolve(MMDSResolve *m); - void handle_resolve_ack(MMDSResolveAck *m); - void maybe_resolve_finish(); - void disambiguate_imports(); - void recalc_auth_bits(); -public: - // ambiguous imports - void add_ambiguous_import(dirfrag_t base, list& bounds); - void add_ambiguous_import(CDir *base, const set& bounds); - bool have_ambiguous_import(dirfrag_t base) { - return my_ambiguous_imports.count(base); - } - void cancel_ambiguous_import(dirfrag_t dirino); - void finish_ambiguous_import(dirfrag_t dirino); - void send_resolve(int who); - void send_resolve_now(int who); - void send_resolve_later(int who); - void maybe_send_pending_resolves(); - - ESubtreeMap *create_subtree_map(); - - -protected: - // [rejoin] - set rejoin_gather; // nodes from whom i need a rejoin - set rejoin_sent; // nodes i sent a rejoin to - set rejoin_ack_gather; // nodes from whom i need a rejoin ack - - map > cap_exports; // ino -> client -> capex - map cap_export_paths; - - map > > cap_imports; // ino -> client -> frommds -> capex - map cap_import_paths; - - set rejoin_undef_inodes; - - void rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin); - void handle_cache_rejoin(MMDSCacheRejoin *m); - void handle_cache_rejoin_weak(MMDSCacheRejoin *m); - CInode* rejoin_invent_inode(inodeno_t ino); - void handle_cache_rejoin_strong(MMDSCacheRejoin *m); - void rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack); - void handle_cache_rejoin_ack(MMDSCacheRejoin *m); - void handle_cache_rejoin_purge(MMDSCacheRejoin *m); - void handle_cache_rejoin_missing(MMDSCacheRejoin *m); - void handle_cache_rejoin_full(MMDSCacheRejoin *m); - void rejoin_send_acks(); - void rejoin_trim_undef_inodes(); -public: - void rejoin_gather_finish(); - void rejoin_send_rejoins(); - 
void rejoin_export_caps(inodeno_t ino, string& path, int client, inode_caps_reconnect_t& icr) { - cap_exports[ino][client] = icr; - cap_export_paths[ino] = path; - } - void rejoin_recovered_caps(inodeno_t ino, string& path, int client, inode_caps_reconnect_t& icr, - int frommds=-1) { - cap_imports[ino][client][frommds] = icr; - cap_import_paths[ino] = path; - } - void rejoin_import_cap(CInode *in, int client, inode_caps_reconnect_t& icr, int frommds); - - - friend class Locker; - friend class Migrator; - friend class Renamer; - friend class MDBalancer; - - - public: - - // subsystems - Migrator *migrator; - Renamer *renamer; - - public: - MDCache(MDS *m); - ~MDCache(); - - // debug - void log_stat(Logger *logger); - - // root inode - CInode *get_root() { return root; } - void set_root(CInode *r); - CInode *get_stray() { return stray; } - - // cache - void set_cache_size(size_t max) { lru.lru_set_max(max); } - size_t get_cache_size() { return lru.lru_get_size(); } - - // trimming - bool trim(int max = -1); // trim cache - void trim_dentry(CDentry *dn, map& expiremap); - void trim_dirfrag(CDir *dir, CDir *con, - map& expiremap); - void trim_inode(CDentry *dn, CInode *in, CDir *con, - map& expiremap); - void send_expire_messages(map& expiremap); - void trim_non_auth(); // trim out trimmable non-auth items - - // shutdown - void shutdown_start(); - void shutdown_check(); - bool shutdown_pass(); - bool shutdown(); // clear cache (ie at shutodwn) - - bool did_shutdown_log_cap; - - // inode_map - bool have_inode( inodeno_t ino ) { return inode_map.count(ino) ? 
true:false; } - CInode* get_inode( inodeno_t ino ) { - if (have_inode(ino)) - return inode_map[ino]; - return NULL; - } - CDir* get_dirfrag(dirfrag_t df) { - if (!have_inode(df.ino)) return NULL; - return inode_map[df.ino]->get_dirfrag(df.frag); - } - /* - void get_dirfrags_under(dirfrag_t df, list& ls) { - if (have_inode(df.ino)) - inode_map[df.ino]->get_dirfrags_under(df.frag, ls); - } - */ - - MDSCacheObject *get_object(MDSCacheObjectInfo &info); - - - - public: - CInode *create_inode(); - void add_inode(CInode *in); - - void remove_inode(CInode *in); - protected: - void touch_inode(CInode *in) { - if (in->get_parent_dn()) - touch_dentry(in->get_parent_dn()); - } - void touch_dentry(CDentry *dn) { - // touch ancestors - if (dn->get_dir()->get_inode()->get_parent_dn()) - touch_dentry(dn->get_dir()->get_inode()->get_parent_dn()); - - // touch me - if (dn->is_auth()) - lru.lru_touch(dn); - else - lru.lru_midtouch(dn); - } - - void inode_remove_replica(CInode *in, int rep); - void dentry_remove_replica(CDentry *dn, int rep); - - void rename_file(CDentry *srcdn, CDentry *destdn); - - public: - // inode purging - void purge_inode(CInode *in, off_t newsize, off_t oldsize, LogSegment *ls); - void _do_purge_inode(CInode *in, off_t newsize, off_t oldsize); - void purge_inode_finish(CInode *in, off_t newsize, off_t oldsize); - void purge_inode_finish_2(CInode *in, off_t newsize, off_t oldsize); - bool is_purging(CInode *in, off_t newsize, off_t oldsize) { - return purging.count(in) && purging[in].count(newsize); - } - void wait_for_purge(CInode *in, off_t newsize, Context *c) { - waiting_for_purge[in][newsize].push_back(c); - } - - void add_recovered_purge(CInode *in, off_t newsize, off_t oldsize, LogSegment *ls); - void remove_recovered_purge(CInode *in, off_t newsize, off_t oldsize); - void start_recovered_purges(); - - - public: - CDir *get_auth_container(CDir *in); - CDir *get_export_container(CDir *dir); - void find_nested_exports(CDir *dir, set& s); - void 
find_nested_exports_under(CDir *import, CDir *dir, set& s); - - public: - CInode *create_root_inode(); - void open_root(Context *c); - CInode *create_stray_inode(int whose=-1); - void open_local_stray(); - void open_foreign_stray(int who, Context *c); - CDentry *get_or_create_stray_dentry(CInode *in); - - Context *_get_waiter(MDRequest *mdr, Message *req); - int path_traverse(MDRequest *mdr, Message *req, - CInode *base, filepath& path, - vector& trace, bool follow_trailing_sym, - int onfail); - bool path_is_mine(filepath& path); - bool path_is_mine(string& p) { - filepath path(p); - return path_is_mine(path); - } - CDir *path_traverse_to_dir(filepath& path); - - void open_remote_dirfrag(CInode *diri, frag_t fg, Context *fin); - CInode *get_dentry_inode(CDentry *dn, MDRequest *mdr); - void open_remote_ino(inodeno_t ino, MDRequest *mdr, Context *fin); - void open_remote_ino_2(inodeno_t ino, MDRequest *mdr, - vector& anchortrace, - Context *onfinish); - - C_Gather *parallel_fetch(map& pathmap); - - void make_trace(vector& trace, CInode *in); - - // -- anchors -- -public: - void anchor_create(MDRequest *mdr, CInode *in, Context *onfinish); - void anchor_destroy(CInode *in, Context *onfinish); -protected: - void _anchor_create_prepared(CInode *in, version_t atid); - void _anchor_create_logged(CInode *in, version_t atid, LogSegment *ls); - void _anchor_destroy_prepared(CInode *in, version_t atid); - void _anchor_destroy_logged(CInode *in, version_t atid, LogSegment *ls); - - friend class C_MDC_AnchorCreatePrepared; - friend class C_MDC_AnchorCreateLogged; - friend class C_MDC_AnchorDestroyPrepared; - friend class C_MDC_AnchorDestroyLogged; - - // -- stray -- -public: - void eval_stray(CDentry *dn); -protected: - void _purge_stray(CDentry *dn); - void _purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls); - friend class C_MDC_PurgeStray; - void reintegrate_stray(CDentry *dn, CDentry *rlink); - void migrate_stray(CDentry *dn, int dest); - - - // == messages == - 
public: - void dispatch(Message *m); - - protected: - // -- replicas -- - void handle_discover(MDiscover *dis); - void handle_discover_reply(MDiscoverReply *m); - - CDir* add_replica_dir(CInode *diri, - frag_t fg, CDirDiscover& dis, - int from, - list& finished); - CDir* forge_replica_dir(CInode *diri, frag_t fg, int from); - - CDentry *add_replica_dentry(CDir *dir, CDentryDiscover &dis, list& finished); -public: // for Server::handle_slave_rename_prep - CInode *add_replica_inode(CInodeDiscover& dis, CDentry *dn, list& finished); - -public: - CDentry *add_replica_stray(bufferlist &bl, CInode *strayin, int from); -protected: - - - - // -- namespace -- - void handle_dentry_unlink(MDentryUnlink *m); - - - // -- fragmenting -- -private: - void adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits, - list& frags, list& waiters); - friend class EFragment; - -public: - void split_dir(CDir *dir, int byn); - -private: - void fragment_freeze(CInode *diri, list& startfrags, frag_t basefrag, int bits); - void fragment_mark_and_complete(CInode *diri, list& startfrags, frag_t basefrag, int bits); - void fragment_go(CInode *diri, list& startfrags, frag_t basefrag, int bits); - void fragment_stored(CInode *diri, frag_t basefrag, int bits, list& resultfrags); - void fragment_logged(CInode *diri, frag_t basefrag, int bits, list& resultfrags, vector& pvs, LogSegment *ls); - friend class C_MDC_FragmentGo; - friend class C_MDC_FragmentMarking; - friend class C_MDC_FragmentStored; - friend class C_MDC_FragmentLogged; - - void handle_fragment_notify(MMDSFragmentNotify *m); - - - // -- updates -- - //int send_inode_updates(CInode *in); - //void handle_inode_update(MInodeUpdate *m); - - int send_dir_updates(CDir *in, bool bcast=false); - void handle_dir_update(MDirUpdate *m); - - // -- cache expiration -- - void handle_cache_expire(MCacheExpire *m); - void process_delayed_expire(CDir *dir); - void discard_delayed_expire(CDir *dir); - - - // == crap fns == - public: - void 
show_cache(); - void dump_cache(); - void show_subtrees(int dbl=10); - - CInode *hack_pick_random_inode() { - assert(!inode_map.empty()); - int n = rand() % inode_map.size(); - hash_map::iterator p = inode_map.begin(); - while (n--) p++; - return p->second; - } - -}; - -class C_MDS_RetryRequest : public Context { - MDCache *cache; - MDRequest *mdr; - public: - C_MDS_RetryRequest(MDCache *c, MDRequest *r) : cache(c), mdr(r) {} - virtual void finish(int r) { - cache->dispatch_request(mdr); - } -}; - -#endif diff --git a/branches/sage/crush/mds/MDLog.cc b/branches/sage/crush/mds/MDLog.cc deleted file mode 100644 index eeea99c721751..0000000000000 --- a/branches/sage/crush/mds/MDLog.cc +++ /dev/null @@ -1,505 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "MDLog.h" -#include "MDS.h" -#include "MDCache.h" -#include "LogEvent.h" - -#include "osdc/Journaler.h" - -#include "common/LogType.h" -#include "common/Logger.h" - -#include "events/ESubtreeMap.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".log " -#define derr(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) *_derr << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".log " - -// cons/des - -LogType mdlog_logtype; - - -MDLog::~MDLog() -{ - if (journaler) { delete journaler; journaler = 0; } - if (logger) { delete logger; logger = 0; } -} - - -void MDLog::reopen_logger(utime_t start, bool append) -{ - // logger - char name[80]; - sprintf(name, "mds%d.log", mds->get_nodeid()); - logger = new Logger(name, &mdlog_logtype, append); - logger->set_start(start); - - static bool didit = false; - if (!didit) { - didit = true; - mdlog_logtype.add_inc("evadd"); - mdlog_logtype.add_inc("evex"); - mdlog_logtype.add_inc("evtrm"); - mdlog_logtype.add_set("ev"); - mdlog_logtype.add_set("evexg"); - mdlog_logtype.add_set("evexd"); - - mdlog_logtype.add_inc("segadd"); - mdlog_logtype.add_inc("segex"); - mdlog_logtype.add_inc("segtrm"); - mdlog_logtype.add_set("seg"); - mdlog_logtype.add_set("segexg"); - mdlog_logtype.add_set("segexd"); - - mdlog_logtype.add_set("expos"); - mdlog_logtype.add_set("wrpos"); - - mdlog_logtype.add_avg("jlat"); - } - -} - -void MDLog::init_journaler() -{ - // inode - memset(&log_inode, 0, sizeof(log_inode)); - log_inode.ino = MDS_INO_LOG_OFFSET + mds->get_nodeid(); - log_inode.layout = g_OSD_MDLogLayout; - - if (g_conf.mds_local_osd) - log_inode.layout.fl_pg_preferred = mds->get_nodeid() + g_conf.mds_local_osd_offset; // hack - - // log streamer - if (journaler) delete journaler; - journaler = new Journaler(log_inode, mds->objecter, logger, &mds->mds_lock); -} - -void MDLog::write_head(Context *c) 
-{ - journaler->write_head(c); -} - -off_t MDLog::get_read_pos() -{ - return journaler->get_read_pos(); -} - -off_t MDLog::get_write_pos() -{ - return journaler->get_write_pos(); -} - - - -void MDLog::create(Context *c) -{ - dout(5) << "create empty log" << dendl; - init_journaler(); - journaler->reset(); - write_head(c); - - logger->set("expos", journaler->get_expire_pos()); - logger->set("wrpos", journaler->get_write_pos()); -} - -void MDLog::open(Context *c) -{ - dout(5) << "open discovering log bounds" << dendl; - init_journaler(); - journaler->recover(c); - - // either append() or replay() will follow. -} - -void MDLog::append() -{ - dout(5) << "append positioning at end" << dendl; - journaler->set_read_pos(journaler->get_write_pos()); - journaler->set_expire_pos(journaler->get_write_pos()); - - logger->set("expos", journaler->get_write_pos()); -} - - - -// ------------------------------------------------- - -void MDLog::submit_entry( LogEvent *le, Context *c ) -{ - if (!g_conf.mds_log) { - // hack: log is disabled. - if (c) { - c->finish(0); - delete c; - } - return; - } - - dout(5) << "submit_entry " << journaler->get_write_pos() << " : " << *le << dendl; - - // let the event register itself in the segment - assert(!segments.empty()); - le->_segment = segments.rbegin()->second; - le->_segment->num_events++; - le->update_segment(); - - num_events++; - assert(!capped); - - // encode it, with event type - { - bufferlist bl; - ::_encode(le->_type, bl); - le->encode_payload(bl); - - // journal it. - journaler->append_entry(bl); // bl is destroyed. - } - - delete le; - - if (logger) { - logger->inc("evadd"); - logger->set("ev", num_events); - logger->set("wrpos", journaler->get_write_pos()); - } - - if (c) { - unflushed = 0; - journaler->flush(c); - } - else - unflushed++; - - // start a new segment? - // FIXME: should this go elsewhere? 
- off_t last_seg = get_last_segment_offset(); - if (!segments.empty() && - !writing_subtree_map && - (journaler->get_write_pos() / ceph_file_layout_period(log_inode.layout) != (last_seg / ceph_file_layout_period(log_inode.layout)) && - (journaler->get_write_pos() - last_seg > ceph_file_layout_period(log_inode.layout)/2))) { - dout(10) << "submit_entry also starting new segment: last = " << last_seg - << ", cur pos = " << journaler->get_write_pos() << dendl; - start_new_segment(); - } -} - -void MDLog::wait_for_sync( Context *c ) -{ - if (g_conf.mds_log) { - // wait - journaler->flush(c); - } else { - // hack: bypass. - c->finish(0); - delete c; - } -} - -void MDLog::flush() -{ - if (unflushed) - journaler->flush(); - unflushed = 0; - - // trim - trim(); -} - -void MDLog::cap() -{ - dout(5) << "cap" << dendl; - capped = true; -} - - -// ----------------------------- -// segments - -void MDLog::start_new_segment(Context *onsync) -{ - dout(7) << "start_new_segment at " << journaler->get_write_pos() << dendl; - assert(!writing_subtree_map); - - segments[journaler->get_write_pos()] = new LogSegment(journaler->get_write_pos()); - - writing_subtree_map = true; - - ESubtreeMap *le = mds->mdcache->create_subtree_map(); - submit_entry(le, new C_MDL_WroteSubtreeMap(this, mds->mdlog->get_write_pos())); - if (onsync) - wait_for_sync(onsync); - - logger->inc("segadd"); - logger->set("seg", segments.size()); -} - -void MDLog::_logged_subtree_map(off_t off) -{ - dout(10) << "_logged_subtree_map at " << off << dendl; - writing_subtree_map = false; - - /* - list ls; - take_subtree_map_expire_waiters(ls); - mds->queue_waiters(ls); - */ -} - - - -void MDLog::trim() -{ - // trim! 
- dout(10) << "trim " - << segments.size() << " / " << max_segments << " segments, " - << num_events << " / " << max_events << " events" - << ", " << expiring_segments.size() << " (" << expiring_events << ") expiring" - << ", " << expired_segments.size() << " (" << expired_events << ") expired" - << dendl; - - if (segments.empty()) return; - - // hack: only trim for a few seconds at a time - utime_t stop = g_clock.now(); - stop += 2.0; - - map::iterator p = segments.begin(); - int left = num_events; - while (p != segments.end() && - ((max_events >= 0 && left-expiring_events-expired_events > max_events) || - (max_segments >= 0 && (int)(segments.size()-expiring_segments.size()-expired_segments.size()) > max_segments))) { - - if (stop < g_clock.now()) - break; - - if ((int)expiring_segments.size() >= g_conf.mds_log_max_expiring) - break; - - // look at first segment - LogSegment *ls = p->second; - assert(ls); - - p++; - - left -= ls->num_events; - - if (expiring_segments.count(ls)) { - dout(5) << "trim already expiring segment " << ls->offset << ", " << ls->num_events << " events" << dendl; - } else if (expired_segments.count(ls)) { - dout(5) << "trim already expired segment " << ls->offset << ", " << ls->num_events << " events" << dendl; - } else { - try_expire(ls); - } - } -} - - -void MDLog::try_expire(LogSegment *ls) -{ - C_Gather *exp = ls->try_to_expire(mds); - if (exp) { - assert(expiring_segments.count(ls) == 0); - expiring_segments.insert(ls); - expiring_events += ls->num_events; - dout(5) << "try_expire expiring segment " << ls->offset << dendl; - exp->set_finisher(new C_MaybeExpiredSegment(this, ls)); - } else { - dout(10) << "try_expire expired segment " << ls->offset << dendl; - _expired(ls); - } - - logger->set("segexg", expiring_segments.size()); - logger->set("evexg", expiring_events); -} - -void MDLog::_maybe_expired(LogSegment *ls) -{ - dout(10) << "_maybe_expired segment " << ls->offset << " " << ls->num_events << " events" << dendl; - 
assert(expiring_segments.count(ls)); - expiring_segments.erase(ls); - expiring_events -= ls->num_events; - try_expire(ls); -} - -void MDLog::_expired(LogSegment *ls) -{ - dout(5) << "_expired segment " << ls->offset << " " << ls->num_events << " events" << dendl; - - if (!capped && ls == get_current_segment()) { - dout(5) << "_expired not expiring " << ls->offset << ", last one and !capped" << dendl; - } else { - // expired. - expired_segments.insert(ls); - expired_events += ls->num_events; - - logger->inc("evex", ls->num_events); - logger->inc("segex"); - - // trim expired segments? - while (!segments.empty()) { - ls = segments.begin()->second; - if (!expired_segments.count(ls)) break; - - expired_events -= ls->num_events; - expired_segments.erase(ls); - num_events -= ls->num_events; - - journaler->set_expire_pos(ls->offset); // this was the oldest segment, adjust expire pos - journaler->write_head(0); - - logger->set("expos", ls->offset); - logger->inc("segtrm"); - logger->inc("evtrm", ls->num_events); - - segments.erase(ls->offset); - delete ls; - } - } - - logger->set("ev", num_events); - logger->set("evexd", expired_events); - logger->set("seg", segments.size()); - logger->set("segexd", expired_segments.size()); -} - - - -void MDLog::replay(Context *c) -{ - assert(journaler->is_active()); - - // start reading at the last known expire point. - journaler->set_read_pos( journaler->get_expire_pos() ); - - // empty? - if (journaler->get_read_pos() == journaler->get_write_pos()) { - dout(10) << "replay - journal empty, done." << dendl; - if (c) { - c->finish(0); - delete c; - } - return; - } - - // add waiter - if (c) - waitfor_replay.push_back(c); - - // go! 
- dout(10) << "replay start, from " << journaler->get_read_pos() - << " to " << journaler->get_write_pos() << dendl; - - assert(num_events == 0); - - replay_thread.create(); -} - -class C_MDL_Replay : public Context { - MDLog *mdlog; -public: - C_MDL_Replay(MDLog *l) : mdlog(l) {} - void finish(int r) { - mdlog->replay_cond.Signal(); - } -}; - - - -// i am a separate thread -void MDLog::_replay_thread() -{ - mds->mds_lock.Lock(); - dout(10) << "_replay_thread start" << dendl; - - // loop - off_t new_expire_pos = journaler->get_expire_pos(); - while (1) { - // wait for read? - while (!journaler->is_readable() && - journaler->get_read_pos() < journaler->get_write_pos()) { - journaler->wait_for_readable(new C_MDL_Replay(this)); - replay_cond.Wait(mds->mds_lock); - } - - if (!journaler->is_readable() && - journaler->get_read_pos() == journaler->get_write_pos()) - break; - - assert(journaler->is_readable()); - - // read it - off_t pos = journaler->get_read_pos(); - bufferlist bl; - bool r = journaler->try_read_entry(bl); - assert(r); - - // unpack event - LogEvent *le = LogEvent::decode(bl); - - // new segment? - if (le->get_type() == EVENT_SUBTREEMAP) { - segments[pos] = new LogSegment(pos); - logger->set("seg", segments.size()); - } - - // have we seen an import map yet? - if (segments.empty()) { - dout(10) << "_replay " << pos << " / " << journaler->get_write_pos() - << " -- waiting for subtree_map. (skipping " << *le << ")" << dendl; - } else { - dout(10) << "_replay " << pos << " / " << journaler->get_write_pos() - << " : " << *le << dendl; - le->_segment = get_current_segment(); // replay may need this - le->_segment->num_events++; - num_events++; - - le->replay(mds); - - if (!new_expire_pos) - new_expire_pos = pos; - } - delete le; - - logger->set("rdpos", pos); - - // drop lock for a second, so other events/messages (e.g. beacon timer!) can go off - mds->mds_lock.Unlock(); - mds->mds_lock.Lock(); - } - - // done! 
- assert(journaler->get_read_pos() == journaler->get_write_pos()); - dout(10) << "_replay - complete, " << num_events << " events, new read/expire pos is " << new_expire_pos << dendl; - - // move read pointer _back_ to first subtree map we saw, for eventual trimming - journaler->set_read_pos(new_expire_pos); - journaler->set_expire_pos(new_expire_pos); - logger->set("expos", new_expire_pos); - - // kick waiter(s) - list ls; - ls.swap(waitfor_replay); - finish_contexts(ls,0); - - dout(10) << "_replay_thread finish" << dendl; - mds->mds_lock.Unlock(); -} - - - diff --git a/branches/sage/crush/mds/MDLog.h b/branches/sage/crush/mds/MDLog.h deleted file mode 100644 index f7bdcd21a5303..0000000000000 --- a/branches/sage/crush/mds/MDLog.h +++ /dev/null @@ -1,195 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MDLOG_H -#define __MDLOG_H - -#include "include/types.h" -#include "include/Context.h" - -#include "common/Thread.h" -#include "common/Cond.h" - -#include "LogSegment.h" - -#include - -//#include -//using __gnu_cxx::hash_mapset; - -class Journaler; -class LogEvent; -class MDS; -class LogSegment; -class ESubtreeMap; - -class Logger; - -#include -using std::map; - - -class MDLog { - protected: - MDS *mds; - int num_events; // in events - int max_events; - int max_segments; - - int unflushed; - - bool capped; - - inode_t log_inode; - Journaler *journaler; - - Logger *logger; - - - // -- replay -- - Cond replay_cond; - - class ReplayThread : public Thread { - MDLog *log; - public: - ReplayThread(MDLog *l) : log(l) {} - void* entry() { - log->_replay_thread(); - return 0; - } - } replay_thread; - - friend class ReplayThread; - friend class C_MDL_Replay; - - list waitfor_replay; - - void _replay(); // old way - void _replay_thread(); // new way - - - // -- segments -- - map segments; - set expiring_segments; - set expired_segments; - int expiring_events; - int expired_events; - - class C_MDL_WroteSubtreeMap : public Context { - MDLog *mdlog; - off_t off; - public: - C_MDL_WroteSubtreeMap(MDLog *l, off_t o) : mdlog(l), off(o) { } - void finish(int r) { - mdlog->_logged_subtree_map(off); - } - }; - void _logged_subtree_map(off_t off); - - - // -- subtreemaps -- - bool writing_subtree_map; // one is being written now - - friend class ESubtreeMap; - friend class C_MDS_WroteImportMap; - friend class MDCache; - -public: - off_t get_last_segment_offset() { - assert(!segments.empty()); - return segments.rbegin()->first; - } - - -private: - void init_journaler(); - -public: - void reopen_logger(utime_t start, bool append=false); - - // replay state - map > pending_exports; - - - -public: - MDLog(MDS *m) : mds(m), - num_events(0), - max_events(g_conf.mds_log_max_events), - max_segments(g_conf.mds_log_max_segments), - unflushed(0), - capped(false), - 
journaler(0), - logger(0), - replay_thread(this), - expiring_events(0), expired_events(0), - writing_subtree_map(false) { - } - ~MDLog(); - - - void start_new_segment(Context *onsync=0); - LogSegment *get_current_segment() { - return segments.empty() ? 0:segments.rbegin()->second; - } - - - void flush_logger(); - - size_t get_num_events() { return num_events; } - void set_max_events(int m) { max_events = m; } - size_t get_num_segments() { return segments.size(); } - void set_max_segments(int m) { max_segments = m; } - - off_t get_read_pos(); - off_t get_write_pos(); - bool empty() { return segments.empty(); } - - bool is_capped() { return capped; } - void cap(); - - void submit_entry( LogEvent *e, Context *c = 0 ); - void wait_for_sync( Context *c ); - void flush(); - -private: - class C_MaybeExpiredSegment : public Context { - MDLog *mdlog; - LogSegment *ls; - public: - C_MaybeExpiredSegment(MDLog *mdl, LogSegment *s) : mdlog(mdl), ls(s) {} - void finish(int res) { - mdlog->_maybe_expired(ls); - } - }; - - void try_expire(LogSegment *ls); - void _maybe_expired(LogSegment *ls); - void _expired(LogSegment *ls); - -public: - void trim(); - -private: - void write_head(Context *onfinish); - -public: - void create(Context *onfinish); // fresh, empty log! - void open(Context *onopen); // append() or replay() to follow! - void append(); - void replay(Context *onfinish); -}; - -#endif diff --git a/branches/sage/crush/mds/MDS.cc b/branches/sage/crush/mds/MDS.cc deleted file mode 100644 index 6fc8ef46d9039..0000000000000 --- a/branches/sage/crush/mds/MDS.cc +++ /dev/null @@ -1,1269 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. 
See file COPYING. - * - */ - - - -#include "include/types.h" -#include "common/Clock.h" - -#include "msg/Messenger.h" - -#include "osd/OSDMap.h" -#include "osdc/Objecter.h" -#include "osdc/Filer.h" - -#include "MDSMap.h" - -#include "MDS.h" -#include "Server.h" -#include "Locker.h" -#include "MDCache.h" -#include "MDLog.h" -#include "MDBalancer.h" -#include "IdAllocator.h" -#include "Migrator.h" -//#include "Renamer.h" - -#include "AnchorTable.h" -#include "AnchorClient.h" - -#include "common/Logger.h" -#include "common/LogType.h" - -#include "common/Timer.h" - -#include "events/ESession.h" - -#include "messages/MMDSMap.h" -#include "messages/MMDSBeacon.h" - -#include "messages/MPing.h" -#include "messages/MPingAck.h" -#include "messages/MGenericMessage.h" - -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" - -#include "messages/MClientRequest.h" -#include "messages/MClientRequestForward.h" - - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " mds" << whoami << " " -#define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) *_derr << dbeginl << g_clock.now() << " mds" << whoami << " " - - - - - -// cons/des -MDS::MDS(int whoami, Messenger *m, MonMap *mm) : - timer(mds_lock), - clientmap(this) { - - this->whoami = whoami; - - monmap = mm; - messenger = m; - - mdsmap = new MDSMap; - osdmap = new OSDMap; - - objecter = new Objecter(messenger, monmap, osdmap, mds_lock); - filer = new Filer(objecter); - - mdcache = new MDCache(this); - mdlog = new MDLog(this); - balancer = new MDBalancer(this); - - anchorclient = new AnchorClient(this); - idalloc = new IdAllocator(this); - - anchortable = new AnchorTable(this); - - server = new Server(this); - locker = new Locker(this, mdcache); - - // clients - last_client_mdsmap_bcast = 0; - - // beacon - beacon_last_seq = 0; - beacon_sender = 0; - beacon_killer = 0; - - // tick - tick_event = 0; - - req_rate = 0; - - want_state = state = 
MDSMap::STATE_DNE; - - logger = logger2 = 0; - - // i'm ready! - messenger->set_dispatcher(this); -} - -MDS::~MDS() { - Mutex::Locker lock(mds_lock); - - if (mdcache) { delete mdcache; mdcache = NULL; } - if (mdlog) { delete mdlog; mdlog = NULL; } - if (balancer) { delete balancer; balancer = NULL; } - if (idalloc) { delete idalloc; idalloc = NULL; } - if (anchortable) { delete anchortable; anchortable = NULL; } - if (anchorclient) { delete anchorclient; anchorclient = NULL; } - if (osdmap) { delete osdmap; osdmap = 0; } - if (mdsmap) { delete mdsmap; mdsmap = 0; } - - if (server) { delete server; server = 0; } - if (locker) { delete locker; locker = 0; } - - if (filer) { delete filer; filer = 0; } - if (objecter) { delete objecter; objecter = 0; } - if (messenger) { delete messenger; messenger = NULL; } - - if (logger) { delete logger; logger = 0; } - if (logger2) { delete logger2; logger2 = 0; } - -} - - -void MDS::reopen_logger(utime_t start) -{ - static LogType mds_logtype, mds_cache_logtype; - static bool didit = false; - if (!didit) { - didit = true; - - //mds_logtype.add_inc("req"); - mds_logtype.add_inc("reply"); - mds_logtype.add_inc("fw"); - - mds_logtype.add_inc("dir_f"); - mds_logtype.add_inc("dir_c"); - //mds_logtype.add_inc("mkdir"); - - /* - mds_logtype.add_inc("newin"); // new inodes (pre)loaded - mds_logtype.add_inc("newt"); // inodes first touched/used - mds_logtype.add_inc("outt"); // trimmed touched - mds_logtype.add_inc("outut"); // trimmed untouched (wasted effort) - mds_logtype.add_avg("oututl"); // avg trim latency for untouched - - mds_logtype.add_inc("dirt1"); - mds_logtype.add_inc("dirt2"); - mds_logtype.add_inc("dirt3"); - mds_logtype.add_inc("dirt4"); - mds_logtype.add_inc("dirt5"); - */ - - mds_logtype.add_set("c"); - mds_logtype.add_set("ctop"); - mds_logtype.add_set("cbot"); - mds_logtype.add_set("cptail"); - mds_logtype.add_set("cpin"); - mds_logtype.add_inc("cex"); - mds_logtype.add_inc("dis"); - - mds_logtype.add_inc("t"); - 
mds_logtype.add_inc("thit"); - mds_logtype.add_inc("tfw"); - mds_logtype.add_inc("tdis"); - mds_logtype.add_inc("tdirf"); - mds_logtype.add_inc("trino"); - mds_logtype.add_inc("tlock"); - - mds_logtype.add_set("l"); - mds_logtype.add_set("q"); - mds_logtype.add_set("popanyd"); - mds_logtype.add_set("popnest"); - - mds_logtype.add_set("buf"); - - mds_logtype.add_set("sm"); - mds_logtype.add_inc("ex"); - mds_logtype.add_inc("iex"); - mds_logtype.add_inc("im"); - mds_logtype.add_inc("iim"); - /* - mds_logtype.add_inc("imex"); - mds_logtype.add_set("nex"); - mds_logtype.add_set("nim"); - */ - - mds_logtype.add_avg("replyl"); - - } - - if (whoami < 0) return; - - // flush+close old log - if (logger) delete logger; - if (logger2) delete logger2; - - // log - char name[80]; - sprintf(name, "mds%d", whoami); - - bool append = mdsmap->get_inc(whoami) > 1; - - logger = new Logger(name, (LogType*)&mds_logtype, append); - logger->set_start(start); - - char n[80]; - sprintf(n, "mds%d.cache", whoami); - logger2 = new Logger(n, (LogType*)&mds_cache_logtype, append); - logger2->set_start(start); - - mdlog->reopen_logger(start, append); - server->reopen_logger(start, append); -} - -void MDS::send_message_mds(Message *m, int mds, int port, int fromport) -{ - // send mdsmap first? - if (peer_mdsmap_epoch[mds] < mdsmap->get_epoch()) { - messenger->send_message(new MMDSMap(mdsmap), - mdsmap->get_inst(mds)); - peer_mdsmap_epoch[mds] = mdsmap->get_epoch(); - } - - // send message - if (port && !fromport) - fromport = port; - messenger->send_message(m, mdsmap->get_inst(mds), port, fromport); -} - -void MDS::forward_message_mds(Message *req, int mds, int port) -{ - // client request? 
- if (req->get_type() == MSG_CLIENT_REQUEST) { - MClientRequest *creq = (MClientRequest*)req; - creq->inc_num_fwd(); // inc forward counter - - // tell the client where it should go - messenger->send_message(new MClientRequestForward(creq->get_tid(), mds, creq->get_num_fwd()), - creq->get_client_inst()); - - if (!creq->is_idempotent()) { - delete req; - return; // don't actually forward if non-idempotent! client has to do it. - } - } - - // forward - send_message_mds(req, mds, port); -} - - - -void MDS::send_message_client(Message *m, int client) -{ - version_t seq = clientmap.inc_push_seq(client); - dout(10) << "send_message_client client" << client << " seq " << seq << " " << *m << dendl; - messenger->send_message(m, clientmap.get_inst(client)); -} - -void MDS::send_message_client(Message *m, entity_inst_t clientinst) -{ - version_t seq = clientmap.inc_push_seq(clientinst.name.num()); - dout(10) << "send_message_client client" << clientinst.name.num() << " seq " << seq << " " << *m << dendl; - messenger->send_message(m, clientinst); -} - - -class C_MDS_SendMessageClientSession : public Context { - MDS *mds; - Message *msg; - entity_inst_t clientinst; -public: - C_MDS_SendMessageClientSession(MDS *md, Message *ms, entity_inst_t& ci) : - mds(md), msg(ms), clientinst(ci) {} - void finish(int r) { - mds->clientmap.open_session(clientinst); - mds->send_message_client(msg, clientinst.name.num()); - } -}; - -void MDS::send_message_client_maybe_opening(Message *m, int c) -{ - send_message_client_maybe_open(m, clientmap.get_inst(c)); -} - -void MDS::send_message_client_maybe_open(Message *m, entity_inst_t clientinst) -{ - // FIXME - // _most_ ppl shoudl check for a client session, since migration may call this, - // start opening, and then e.g. locker sends something else (through non-maybe_open - // version) - int client = clientinst.name.num(); - if (!clientmap.have_session(client)) { - // no session! 
- dout(10) << "send_message_client opening session with " << clientinst << dendl; - clientmap.add_opening(client); - mdlog->submit_entry(new ESession(clientinst, true, clientmap.inc_projected()), - new C_MDS_SendMessageClientSession(this, m, clientinst)); - } else { - // we have a session. - send_message_client(m, clientinst); - } -} - - - -int MDS::init(bool standby) -{ - mds_lock.Lock(); - - objecter->init(); - - want_state = MDSMap::STATE_BOOT; - - // starting beacon. this will induce an MDSMap from the monitor - beacon_start(); - - // schedule tick - reset_tick(); - - mds_lock.Unlock(); - return 0; -} - -void MDS::reset_tick() -{ - // cancel old - if (tick_event) timer.cancel_event(tick_event); - - // schedule - tick_event = new C_MDS_Tick(this); - timer.add_event_after(g_conf.mon_tick_interval, tick_event); -} - -void MDS::tick() -{ - tick_event = 0; - - // reschedule - reset_tick(); - - // log - mds_load_t load = balancer->get_load(); - - if (logger) { - req_rate = logger->get("req"); - - logger->fset("l", (int)load.mds_load()); - logger->set("q", messenger->get_dispatch_queue_len()); - logger->set("buf", buffer_total_alloc); - logger->set("sm", mdcache->num_subtrees()); - - mdcache->log_stat(logger); - } - - if (is_active() || is_stopping()) - locker->scatter_unscatter_autoscattered(); - - // booted? 
- if (is_active()) { - - // balancer - balancer->tick(); - - } -} - - - - -// ----------------------- -// beacons - -void MDS::beacon_start() -{ - beacon_send(); // send first beacon - - //reset_beacon_killer(); // schedule killer -} - - - -void MDS::beacon_send() -{ - ++beacon_last_seq; - dout(10) << "beacon_send " << MDSMap::get_state_name(want_state) - << " seq " << beacon_last_seq - << " (currently " << MDSMap::get_state_name(state) << ")" - << dendl; - - beacon_seq_stamp[beacon_last_seq] = g_clock.now(); - - int mon = monmap->pick_mon(); - messenger->send_message(new MMDSBeacon(messenger->get_myinst(), mdsmap->get_epoch(), - want_state, beacon_last_seq), - monmap->get_inst(mon)); - - // schedule next sender - if (beacon_sender) timer.cancel_event(beacon_sender); - beacon_sender = new C_MDS_BeaconSender(this); - timer.add_event_after(g_conf.mds_beacon_interval, beacon_sender); -} - -void MDS::handle_mds_beacon(MMDSBeacon *m) -{ - dout(10) << "handle_mds_beacon " << MDSMap::get_state_name(m->get_state()) - << " seq " << m->get_seq() << dendl; - version_t seq = m->get_seq(); - - // update lab - if (beacon_seq_stamp.count(seq)) { - assert(beacon_seq_stamp[seq] > beacon_last_acked_stamp); - beacon_last_acked_stamp = beacon_seq_stamp[seq]; - - // clean up seq_stamp map - while (!beacon_seq_stamp.empty() && - beacon_seq_stamp.begin()->first <= seq) - beacon_seq_stamp.erase(beacon_seq_stamp.begin()); - - reset_beacon_killer(); - } - - delete m; -} - -void MDS::reset_beacon_killer() -{ - utime_t when = beacon_last_acked_stamp; - when += g_conf.mds_beacon_grace; - - dout(15) << "reset_beacon_killer last_acked_stamp at " << beacon_last_acked_stamp - << ", will die at " << when << dendl; - - if (beacon_killer) timer.cancel_event(beacon_killer); - - beacon_killer = new C_MDS_BeaconKiller(this, beacon_last_acked_stamp); - timer.add_event_at(when, beacon_killer); -} - -void MDS::beacon_kill(utime_t lab) -{ - if (lab == beacon_last_acked_stamp) { - dout(0) << "beacon_kill 
last_acked_stamp " << lab - << ", killing myself." - << dendl; - suicide(); - } else { - dout(20) << "beacon_kill last_acked_stamp " << beacon_last_acked_stamp - << " != my " << lab - << ", doing nothing." - << dendl; - } -} - - - -void MDS::handle_mds_map(MMDSMap *m) -{ - version_t hadepoch = mdsmap->get_epoch(); - version_t epoch = m->get_epoch(); - dout(5) << "handle_mds_map epoch " << epoch << " from " << m->get_source() << dendl; - - // note source's map version - if (m->get_source().is_mds() && - peer_mdsmap_epoch[m->get_source().num()] < epoch) { - dout(15) << " peer " << m->get_source() - << " has mdsmap epoch >= " << epoch - << dendl; - peer_mdsmap_epoch[m->get_source().num()] = epoch; - } - - // is it new? - if (epoch <= mdsmap->get_epoch()) { - dout(5) << " old map epoch " << epoch << " <= " << mdsmap->get_epoch() - << ", discarding" << dendl; - delete m; - return; - } - - // keep old map, for a moment - MDSMap *oldmap = mdsmap; - int oldwhoami = whoami; - int oldstate = state; - - // decode and process - mdsmap = new MDSMap; - mdsmap->decode(m->get_encoded()); - - // see who i am - whoami = mdsmap->get_addr_rank(messenger->get_myaddr()); - if (whoami < 0) { - dout(1) << "handle_mds_map i'm not in the mdsmap, killing myself" << dendl; - suicide(); - return; - } - - // open logger? - // note that fakesyn/newsyn starts knowing who they are - if (whoami >= 0 && - mdsmap->is_up(whoami) && !mdsmap->is_standby(whoami) && - (oldwhoami != whoami || !logger)) - reopen_logger(mdsmap->get_create()); // adopt mds cluster timeline - - if (oldwhoami != whoami) { - // update messenger. - dout(1) << "handle_mds_map i am now mds" << whoami - << " incarnation " << mdsmap->get_inc(whoami) - << dendl; - messenger->reset_myname(entity_name_t::MDS(whoami)); - - // do i need an osdmap? - if (oldwhoami < 0) { - // we need an osdmap too. 
- int mon = monmap->pick_mon(); - messenger->send_message(new MOSDGetMap(0), - monmap->get_inst(mon)); - } - } - - // tell objecter my incarnation - if (objecter->get_client_incarnation() < 0 && - mdsmap->have_inst(whoami)) { - assert(mdsmap->get_inc(whoami) > 0); - objecter->set_client_incarnation(mdsmap->get_inc(whoami)); - } - - // for debug - if (g_conf.mds_dump_cache_on_map) - mdcache->dump_cache(); - - // update my state - state = mdsmap->get_state(whoami); - - // did it change? - if (oldstate != state) { - if (state == want_state) { - dout(1) << "handle_mds_map new state " << mdsmap->get_state_name(state) << dendl; - } else { - dout(1) << "handle_mds_map new state " << mdsmap->get_state_name(state) - // << ", although i wanted " << mdsmap->get_state_name(want_state) - << dendl; - want_state = state; - } - - // now active? - if (is_active()) { - // did i just recover? - if (oldstate == MDSMap::STATE_REJOIN || - oldstate == MDSMap::STATE_RECONNECT) - recovery_done(); - finish_contexts(waiting_for_active); // kick waiters - } else if (is_replay()) { - replay_start(); - } else if (is_resolve()) { - resolve_start(); - } else if (is_reconnect()) { - reconnect_start(); - } else if (is_stopping()) { - assert(oldstate == MDSMap::STATE_ACTIVE); - stopping_start(); - } else if (is_stopped()) { - assert(oldstate == MDSMap::STATE_STOPPING); - suicide(); - return; - } - } - - - // RESOLVE - // is someone else newly resolving? - if (is_resolve() || is_rejoin() || is_active() || is_stopping()) { - set oldresolve, resolve; - oldmap->get_mds_set(oldresolve, MDSMap::STATE_RESOLVE); - mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE); - if (oldresolve != resolve) { - dout(10) << "resolve set is " << resolve << ", was " << oldresolve << dendl; - for (set::iterator p = resolve.begin(); p != resolve.end(); ++p) - if (*p != whoami && - oldresolve.count(*p) == 0) - mdcache->send_resolve(*p); // now or later. - } - } - - // REJOIN - // is everybody finally rejoining? 
- if (is_rejoin() || is_active() || is_stopping()) { - // did we start? - if (!oldmap->is_rejoining() && mdsmap->is_rejoining()) - rejoin_joint_start(); - - // did we finish? - if (g_conf.mds_dump_cache_after_rejoin && - oldmap->is_rejoining() && !mdsmap->is_rejoining()) - mdcache->dump_cache(); // for DEBUG only - } - if (oldmap->is_degraded() && !mdsmap->is_degraded() && state >= MDSMap::STATE_ACTIVE) - dout(1) << "cluster recovered." << dendl; - - // did someone go active? - if (is_active() || is_stopping()) { - set oldactive, active; - oldmap->get_mds_set(oldactive, MDSMap::STATE_ACTIVE); - mdsmap->get_mds_set(active, MDSMap::STATE_ACTIVE); - for (set::iterator p = active.begin(); p != active.end(); ++p) - if (*p != whoami && // not me - oldactive.count(*p) == 0) // newly so? - handle_mds_recovery(*p); - } - - // did someone fail or stop? - if (is_active() || is_stopping()) { - // new failed? - set oldfailed, failed; - oldmap->get_mds_set(oldfailed, MDSMap::STATE_FAILED); - mdsmap->get_mds_set(failed, MDSMap::STATE_FAILED); - for (set::iterator p = failed.begin(); p != failed.end(); ++p) - if (oldfailed.count(*p) == 0) - mdcache->handle_mds_failure(*p); - - // or down then up? - // did their addr/inst change? - set up; - mdsmap->get_up_mds_set(up); - for (set::iterator p = up.begin(); p != up.end(); ++p) - if (oldmap->have_inst(*p) && - oldmap->get_inst(*p) != mdsmap->get_inst(*p)) - mdcache->handle_mds_failure(*p); - - // did anyone stop? - set oldstopped, stopped; - oldmap->get_mds_set(oldstopped, MDSMap::STATE_STOPPED); - mdsmap->get_mds_set(stopped, MDSMap::STATE_STOPPED); - for (set::iterator p = stopped.begin(); p != stopped.end(); ++p) - if (oldstopped.count(*p) == 0) // newly so? - mdcache->migrator->handle_mds_failure_or_stop(*p); - } - - // just got mdsmap+osdmap? - if (hadepoch == 0 && - mdsmap->get_epoch() > 0 && - osdmap->get_epoch() > 0) { - boot(); - } else if (want_state != state) { - // resend beacon. 
- beacon_send(); - } - - delete m; - delete oldmap; -} - -void MDS::bcast_mds_map() -{ - dout(7) << "bcast_mds_map " << mdsmap->get_epoch() << dendl; - - // share the map with mounted clients - for (set::const_iterator p = clientmap.get_session_set().begin(); - p != clientmap.get_session_set().end(); - ++p) { - messenger->send_message(new MMDSMap(mdsmap), - clientmap.get_inst(*p)); - } - last_client_mdsmap_bcast = mdsmap->get_epoch(); -} - - -void MDS::handle_osd_map(MOSDMap *m) -{ - version_t hadepoch = osdmap->get_epoch(); - dout(10) << "handle_osd_map had " << hadepoch << dendl; - - // process - objecter->handle_osd_map(m); - - // just got mdsmap+osdmap? - if (hadepoch == 0 && - osdmap->get_epoch() > 0 && - mdsmap->get_epoch() > 0) - boot(); -} - - -void MDS::set_want_state(int s) -{ - dout(3) << "set_want_state " << MDSMap::get_state_name(s) << dendl; - want_state = s; - beacon_send(); -} - -void MDS::boot() -{ - if (is_creating()) - boot_create(); // new tables, journal - else if (is_starting() || is_replay()) - boot_start(); // start|replay, join - else - assert(is_standby()); -} - - -class C_MDS_CreateFinish : public Context { - MDS *mds; -public: - C_MDS_CreateFinish(MDS *m) : mds(m) {} - void finish(int r) { mds->creating_done(); } -}; - -void MDS::boot_create() -{ - dout(3) << "boot_create" << dendl; - - C_Gather *fin = new C_Gather(new C_MDS_CreateFinish(this)); - - CDir *rootdir = 0; - if (whoami == 0) { - dout(3) << "boot_create since i am also mds0, creating root inode and dir" << dendl; - - // create root inode. 
- mdcache->open_root(0); - CInode *root = mdcache->get_root(); - assert(root); - - // force empty root dir - rootdir = root->get_dirfrag(frag_t()); - rootdir->mark_complete(); - } - - // create my stray dir - CDir *straydir; - { - dout(10) << "boot_create creating local stray dir" << dendl; - mdcache->open_local_stray(); - CInode *stray = mdcache->get_stray(); - straydir = stray->get_dirfrag(frag_t()); - straydir->mark_complete(); - } - - // start with a fresh journal - dout(10) << "boot_create creating fresh journal" << dendl; - mdlog->create(fin->new_sub()); - - // write our first subtreemap - mdlog->start_new_segment(fin->new_sub()); - - // dirty, commit (root and) stray dir(s) - if (whoami == 0) { - rootdir->mark_dirty(rootdir->pre_dirty(), mdlog->get_current_segment()); - rootdir->commit(0, fin->new_sub()); - } - straydir->mark_dirty(straydir->pre_dirty(), mdlog->get_current_segment()); - straydir->commit(0, fin->new_sub()); - - // fixme: fake out idalloc (reset, pretend loaded) - dout(10) << "boot_create creating fresh idalloc table" << dendl; - idalloc->reset(); - idalloc->save(fin->new_sub()); - - // write empty clientmap - clientmap.save(fin->new_sub()); - - // fixme: fake out anchortable - if (mdsmap->get_anchortable() == whoami) { - dout(10) << "boot_create creating fresh anchortable" << dendl; - anchortable->create_fresh(); - anchortable->save(fin->new_sub()); - } -} - -void MDS::creating_done() -{ - dout(1)<< "creating_done" << dendl; - set_want_state(MDSMap::STATE_ACTIVE); -} - - -class C_MDS_BootStart : public Context { - MDS *mds; - int nextstep; -public: - C_MDS_BootStart(MDS *m, int n) : mds(m), nextstep(n) {} - void finish(int r) { mds->boot_start(nextstep); } -}; - -void MDS::boot_start(int step) -{ - switch (step) { - case 0: - step = 1; // fall-thru. 
- - case 1: - { - C_Gather *gather = new C_Gather(new C_MDS_BootStart(this, 2)); - dout(2) << "boot_start " << step << ": opening idalloc" << dendl; - idalloc->load(gather->new_sub()); - - dout(2) << "boot_start " << step << ": opening clientmap" << dendl; - clientmap.load(gather->new_sub()); - - if (mdsmap->get_anchortable() == whoami) { - dout(2) << "boot_start " << step << ": opening anchor table" << dendl; - anchortable->load(gather->new_sub()); - } - - dout(2) << "boot_start " << step << ": opening mds log" << dendl; - mdlog->open(gather->new_sub()); - } - break; - - case 2: - if (is_replay()) { - dout(2) << "boot_start " << step << ": replaying mds log" << dendl; - mdlog->replay(new C_MDS_BootStart(this, 3)); - break; - } else { - dout(2) << "boot_start " << step << ": positioning at end of old mds log" << dendl; - mdlog->append(); - step++; - } - - case 3: - if (is_replay()) { - replay_done(); - break; - } - - // starting only - assert(is_starting()); - if (mdsmap->get_root() == whoami) { - dout(2) << "boot_start " << step << ": opening root directory" << dendl; - mdcache->open_root(new C_MDS_BootStart(this, 4)); - break; - } - step++; - - case 4: - dout(2) << "boot_start " << step << ": opening local stray directory" << dendl; - mdcache->open_local_stray(); - - starting_done(); - break; - } -} - -void MDS::starting_done() -{ - dout(3) << "starting_done" << dendl; - assert(is_starting()); - set_want_state(MDSMap::STATE_ACTIVE); - - // start new segment - mdlog->start_new_segment(0); -} - - -void MDS::replay_start() -{ - dout(1) << "replay_start" << dendl; - - // initialize gather sets - set rs; - mdsmap->get_recovery_mds_set(rs); - rs.erase(whoami); - dout(1) << "now replay. my recovery peers are " << rs << dendl; - mdcache->set_recovery_set(rs); - - // start? 
- if (osdmap->get_epoch() > 0 && - mdsmap->get_epoch() > 0) - boot_start(); -} - -void MDS::replay_done() -{ - dout(1) << "replay_done" << dendl; - - if (mdsmap->get_num_in_mds() == 1 && - mdsmap->get_num_mds(MDSMap::STATE_FAILED) == 0) { // just me! - dout(2) << "i am alone, moving to state reconnect" << dendl; - set_want_state(MDSMap::STATE_RECONNECT); - } else { - dout(2) << "i am not alone, moving to state resolve" << dendl; - set_want_state(MDSMap::STATE_RESOLVE); - } - - // start new segment - mdlog->start_new_segment(0); -} - - -void MDS::resolve_start() -{ - dout(1) << "resolve_start" << dendl; - - set who; - mdsmap->get_mds_set(who, MDSMap::STATE_RESOLVE); - mdsmap->get_mds_set(who, MDSMap::STATE_REJOIN); - mdsmap->get_mds_set(who, MDSMap::STATE_ACTIVE); - mdsmap->get_mds_set(who, MDSMap::STATE_STOPPING); - for (set::iterator p = who.begin(); p != who.end(); ++p) { - if (*p == whoami) continue; - mdcache->send_resolve(*p); // now. - } -} -void MDS::resolve_done() -{ - dout(1) << "resolve_done" << dendl; - set_want_state(MDSMap::STATE_RECONNECT); -} - -void MDS::reconnect_start() -{ - dout(1) << "reconnect_start" << dendl; - server->reconnect_clients(); -} -void MDS::reconnect_done() -{ - dout(1) << "reconnect_done" << dendl; - set_want_state(MDSMap::STATE_REJOIN); // move to rejoin state - - /* - if (mdsmap->get_num_in_mds() == 1 && - mdsmap->get_num_mds(MDSMap::STATE_FAILED) == 0) { // just me! - - // finish processing caps (normally, this happens during rejoin, but we're skipping that...) 
- mdcache->rejoin_gather_finish(); - - set_want_state(MDSMap::STATE_ACTIVE); // go active - } else { - set_want_state(MDSMap::STATE_REJOIN); // move to rejoin state - } - */ -} - -void MDS::rejoin_joint_start() -{ - dout(1) << "rejoin_joint_start" << dendl; - mdcache->rejoin_send_rejoins(); -} -void MDS::rejoin_done() -{ - dout(1) << "rejoin_done" << dendl; - mdcache->show_subtrees(); - mdcache->show_cache(); - set_want_state(MDSMap::STATE_ACTIVE); -} - - -void MDS::recovery_done() -{ - dout(1) << "recovery_done -- successful recovery!" << dendl; - assert(is_active()); - - // kick anchortable (resent AGREEs) - if (mdsmap->get_anchortable() == whoami) - anchortable->finish_recovery(); - - // kick anchorclient (resent COMMITs) - anchorclient->finish_recovery(); - - mdcache->start_recovered_purges(); - - // tell connected clients - bcast_mds_map(); -} - -void MDS::handle_mds_recovery(int who) -{ - dout(5) << "handle_mds_recovery mds" << who << dendl; - - mdcache->handle_mds_recovery(who); - - if (anchortable) - anchortable->handle_mds_recovery(who); - anchorclient->handle_mds_recovery(who); - - queue_waiters(waiting_for_active_peer[who]); - waiting_for_active_peer.erase(who); -} - -void MDS::stopping_start() -{ - dout(2) << "stopping_start" << dendl; - - // start cache shutdown - mdcache->shutdown_start(); - - // terminate client sessions - server->terminate_sessions(); -} - -void MDS::stopping_done() -{ - dout(2) << "stopping_done" << dendl; - - // tell monitor we shut down cleanly. 
- set_want_state(MDSMap::STATE_STOPPED); -} - - - -void MDS::suicide() -{ - dout(1) << "suicide" << dendl; - - // stop timers - if (beacon_killer) { - timer.cancel_event(beacon_killer); - beacon_killer = 0; - } - if (beacon_sender) { - timer.cancel_event(beacon_sender); - beacon_sender = 0; - } - if (tick_event) { - timer.cancel_event(tick_event); - tick_event = 0; - } - timer.cancel_all(); - //timer.join(); // this will deadlock from beacon_kill -> suicide - - // shut down cache - mdcache->shutdown(); - - objecter->shutdown(); - - // shut down messenger - messenger->shutdown(); -} - - - - - -void MDS::dispatch(Message *m) -{ - mds_lock.Lock(); - my_dispatch(m); - mds_lock.Unlock(); -} - - - -void MDS::my_dispatch(Message *m) -{ - // from bad mds? - if (m->get_source().is_mds()) { - int from = m->get_source().num(); - if (!mdsmap->have_inst(from) || - mdsmap->get_inst(from) != m->get_source_inst() || - mdsmap->is_down(from)) { - // bogus mds? - if (m->get_type() == MSG_MDS_MAP) { - dout(5) << "got " << *m << " from old/bad/imposter mds " << m->get_source() - << ", but it's an mdsmap, looking at it" << dendl; - } else if (m->get_type() == MSG_MDS_CACHEEXPIRE && - mdsmap->get_inst(from) == m->get_source_inst()) { - dout(5) << "got " << *m << " from down mds " << m->get_source() - << ", but it's a cache_expire, looking at it" << dendl; - } else { - dout(5) << "got " << *m << " from down/old/bad/imposter mds " << m->get_source() - << ", dropping" << dendl; - delete m; - return; - } - } - } - - - switch (m->get_dest_port()) { - - case MDS_PORT_ANCHORTABLE: - anchortable->dispatch(m); - break; - case MDS_PORT_ANCHORCLIENT: - anchorclient->dispatch(m); - break; - - case MDS_PORT_CACHE: - mdcache->dispatch(m); - break; - case MDS_PORT_LOCKER: - locker->dispatch(m); - break; - - case MDS_PORT_MIGRATOR: - mdcache->migrator->dispatch(m); - break; - case MDS_PORT_RENAMER: - //mdcache->renamer->dispatch(m); - break; - - case MDS_PORT_BALANCER: - balancer->proc_message(m); - 
break; - - case MDS_PORT_MAIN: - proc_message(m); - break; - - case MDS_PORT_SERVER: - server->dispatch(m); - break; - - default: - dout(1) << "MDS dispatch unknown message port" << m->get_dest_port() << dendl; - assert(0); - } - - // finish any triggered contexts - if (finished_queue.size()) { - dout(7) << "mds has " << finished_queue.size() << " queued contexts" << dendl; - dout(10) << finished_queue << dendl; - list ls; - ls.splice(ls.begin(), finished_queue); - assert(finished_queue.empty()); - finish_contexts(ls); - } - - - // HACK FOR NOW - if (is_active()) { - // flush log to disk after every op. for now. - mdlog->flush(); - - // trim cache - mdcache->trim(); - } - - - // hack: thrash exports - static utime_t start; - utime_t now = g_clock.now(); - if (start == utime_t()) - start = now; - double el = now - start; - if (el > 30.0 && - el < 60.0) - for (int i=0; i s; - if (!is_active()) break; - mdsmap->get_mds_set(s, MDSMap::STATE_ACTIVE); - if (s.size() < 2 || mdcache->get_num_inodes() < 10) - break; // need peers for this to work. - - dout(7) << "mds thrashing exports pass " << (i+1) << "/" << g_conf.mds_thrash_exports << dendl; - - // pick a random dir inode - CInode *in = mdcache->hack_pick_random_inode(); - - list ls; - in->get_dirfrags(ls); - if (ls.empty()) continue; // must be an open dir. - CDir *dir = ls.front(); - if (!dir->get_parent_dir()) continue; // must be linked. - if (!dir->is_auth()) continue; // must be auth. - - int dest; - do { - int k = rand() % s.size(); - set::iterator p = s.begin(); - while (k--) p++; - dest = *p; - } while (dest == whoami); - mdcache->migrator->export_dir_nicely(dir,dest); - } - // hack: thrash exports - for (int i=0; ihack_pick_random_inode(); - - list ls; - in->get_dirfrags(ls); - if (ls.empty()) continue; // must be an open dir. - CDir *dir = ls.front(); - if (!dir->get_parent_dir()) continue; // must be linked. - if (!dir->is_auth()) continue; // must be auth. 
- mdcache->split_dir(dir, 1);// + (rand() % 3)); - } - - // hack: force hash root? - /* - if (false && - mdcache->get_root() && - mdcache->get_root()->dir && - !(mdcache->get_root()->dir->is_hashed() || - mdcache->get_root()->dir->is_hashing())) { - dout(0) << "hashing root" << dendl; - mdcache->migrator->hash_dir(mdcache->get_root()->dir); - } - */ - - - - // shut down? - if (is_stopping()) { - if (mdcache->shutdown_pass()) { - dout(7) << "shutdown_pass=true, finished w/ shutdown, moving to down:stopped" << dendl; - stopping_done(); - } - } - -} - - -void MDS::proc_message(Message *m) -{ - switch (m->get_type()) { - - // OSD - case MSG_OSD_OPREPLY: - objecter->handle_osd_op_reply((class MOSDOpReply*)m); - return; - case MSG_OSD_MAP: - handle_osd_map((MOSDMap*)m); - return; - - - // MDS - case MSG_MDS_MAP: - handle_mds_map((MMDSMap*)m); - return; - case MSG_MDS_BEACON: - handle_mds_beacon((MMDSBeacon*)m); - return; - - default: - assert(0); - } - -} - - - -void MDS::ms_handle_failure(Message *m, const entity_inst_t& inst) -{ - mds_lock.Lock(); - dout(10) << "handle_ms_failure to " << inst << " on " << *m << dendl; - - if (m->get_type() == MSG_MDS_MAP && m->get_dest().is_client()) - server->client_reconnect_failure(m->get_dest().num()); - - delete m; - mds_lock.Unlock(); -} - diff --git a/branches/sage/crush/mds/MDS.h b/branches/sage/crush/mds/MDS.h deleted file mode 100644 index 4dcd73662dbe8..0000000000000 --- a/branches/sage/crush/mds/MDS.h +++ /dev/null @@ -1,297 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __MDS_H -#define __MDS_H - -#include -#include -#include -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "mdstypes.h" - -#include "msg/Dispatcher.h" -#include "include/types.h" -#include "include/Context.h" -#include "common/DecayCounter.h" -#include "common/Logger.h" -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/Timer.h" - -#include "mon/MonMap.h" -#include "MDSMap.h" - -#include "ClientMap.h" - - - - -class filepath; - -class OSDMap; -class Objecter; -class Filer; - -class Server; -class Locker; -class AnchorTable; -class AnchorClient; -class MDCache; -class MDLog; -class MDBalancer; -class IdAllocator; - -class CInode; -class CDir; -class CDentry; - -class Messenger; -class Message; - -class MClientRequest; -class MClientReply; -class MHashReaddir; -class MHashReaddirReply; - -class MMDSBeacon; - - -class MDS : public Dispatcher { - public: - Mutex mds_lock; - - SafeTimer timer; - - protected: - int whoami; - - public: - Messenger *messenger; - MDSMap *mdsmap; - MonMap *monmap; - OSDMap *osdmap; - Objecter *objecter; - Filer *filer; // for reading/writing to/from osds - - // sub systems - Server *server; - MDCache *mdcache; - Locker *locker; - MDLog *mdlog; - MDBalancer *balancer; - - IdAllocator *idalloc; - - AnchorTable *anchortable; - AnchorClient *anchorclient; - - Logger *logger, *logger2; - - - protected: - // -- MDS state -- - int state; // my confirmed state - int want_state; // the state i want - - list waiting_for_active; - map > waiting_for_active_peer; - - map peer_mdsmap_epoch; - - public: - void wait_for_active(Context *c) { - waiting_for_active.push_back(c); - } - void wait_for_active_peer(int who, Context *c) { - waiting_for_active_peer[who].push_back(c); - } - - int get_state() { return state; } - bool is_dne() { return state == MDSMap::STATE_DNE; } - bool is_failed() { return state == MDSMap::STATE_FAILED; } - bool is_creating() { return state == 
MDSMap::STATE_CREATING; } - bool is_starting() { return state == MDSMap::STATE_STARTING; } - bool is_standby() { return state == MDSMap::STATE_STANDBY; } - bool is_replay() { return state == MDSMap::STATE_REPLAY; } - bool is_resolve() { return state == MDSMap::STATE_RESOLVE; } - bool is_reconnect() { return state == MDSMap::STATE_RECONNECT; } - bool is_rejoin() { return state == MDSMap::STATE_REJOIN; } - bool is_active() { return state == MDSMap::STATE_ACTIVE; } - bool is_stopping() { return state == MDSMap::STATE_STOPPING; } - bool is_stopped() { return state == MDSMap::STATE_STOPPED; } - - void set_want_state(int s); - - - // -- waiters -- - list finished_queue; - - void queue_waiter(Context *c) { - finished_queue.push_back(c); - } - void queue_waiters(list& ls) { - finished_queue.splice( finished_queue.end(), ls ); - } - - // -- keepalive beacon -- - version_t beacon_last_seq; // last seq sent to monitor - map beacon_seq_stamp; // seq # -> time sent - utime_t beacon_last_acked_stamp; // last time we sent a beacon that got acked - - class C_MDS_BeaconSender : public Context { - MDS *mds; - public: - C_MDS_BeaconSender(MDS *m) : mds(m) {} - void finish(int r) { - mds->beacon_sender = 0; - mds->beacon_send(); - } - } *beacon_sender; - class C_MDS_BeaconKiller : public Context { - MDS *mds; - utime_t lab; - public: - C_MDS_BeaconKiller(MDS *m, utime_t l) : mds(m), lab(l) {} - void finish(int r) { - if (mds->beacon_killer) { - mds->beacon_killer = 0; - mds->beacon_kill(lab); - } - // else mds is pbly already shutting down - } - } *beacon_killer; - - // tick and other timer fun - class C_MDS_Tick : public Context { - MDS *mds; - public: - C_MDS_Tick(MDS *m) : mds(m) {} - void finish(int r) { - mds->tick_event = 0; - mds->tick(); - } - } *tick_event; - void reset_tick(); - - // -- client map -- - ClientMap clientmap; - epoch_t last_client_mdsmap_bcast; - //void log_clientmap(Context *c); - - - // shutdown crap - int req_rate; - - // ino's and fh's - public: - - int 
get_req_rate() { return req_rate; } - - - public: - MDS(int whoami, Messenger *m, MonMap *mm); - ~MDS(); - - // who am i etc - int get_nodeid() { return whoami; } - MDSMap *get_mds_map() { return mdsmap; } - OSDMap *get_osd_map() { return osdmap; } - - void send_message_mds(Message *m, int mds, int port=0, int fromport=0); - void forward_message_mds(Message *req, int mds, int port=0); - - void send_message_client(Message *m, int client); - void send_message_client(Message *m, entity_inst_t clientinst); - void send_message_client_maybe_opening(Message *m, int); - void send_message_client_maybe_open(Message *m, entity_inst_t clientinst); - - - // start up, shutdown - int init(bool standby=false); - void reopen_logger(utime_t start); - - void bcast_mds_map(); // to mounted clients - - void boot(); - void boot_create(); // i am new mds. - void boot_start(int step=0); // starting|replay - - void replay_start(); - void creating_done(); - void starting_done(); - void replay_done(); - - void resolve_start(); - void resolve_done(); - void reconnect_start(); - void reconnect_done(); - void rejoin_joint_start(); - void rejoin_done(); - void recovery_done(); - void handle_mds_recovery(int who); - - void stopping_start(); - void stopping_done(); - void suicide(); - - void tick(); - - void beacon_start(); - void beacon_send(); - void beacon_kill(utime_t lab); - void handle_mds_beacon(MMDSBeacon *m); - void reset_beacon_killer(); - - // messages - void proc_message(Message *m); - virtual void dispatch(Message *m); - void my_dispatch(Message *m); - - void ms_handle_failure(Message *m, const entity_inst_t& inst); - - // special message types - void handle_mds_map(class MMDSMap *m); - - // osds - void handle_osd_map(class MOSDMap *m); -}; - - - -class C_MDS_RetryMessage : public Context { - Message *m; - MDS *mds; -public: - C_MDS_RetryMessage(MDS *mds, Message *m) { - assert(m); - this->m = m; - this->mds = mds; - } - virtual void finish(int r) { - mds->my_dispatch(m); - } -}; - - 
- -#endif diff --git a/branches/sage/crush/mds/MDSMap.h b/branches/sage/crush/mds/MDSMap.h deleted file mode 100644 index f2b31ca0fd1c1..0000000000000 --- a/branches/sage/crush/mds/MDSMap.h +++ /dev/null @@ -1,357 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MDSMAP_H -#define __MDSMAP_H - -#include "common/Clock.h" -#include "msg/Message.h" - -#include "include/types.h" - -#include -#include -#include -using namespace std; - - -/* - - beautiful state diagram: - - STOPPED DNE FAILED - / | \ / | | - / | \________ _______/ | | -| v v v v | -| STARTING <--> STANDBY <--> CREATING | -| \ / | -| \____ ____________/ | - \ v v | - \ ACTIVE <-- REJOIN <-- RECONNECT <-- REPLAY - \ | - \ | - \ v - \-- STOPPING - - - - -*/ - - -class MDSMap { - public: - // mds states - static const int STATE_DNE = 0; // down, never existed. - static const int STATE_STOPPED = -1; // down, once existed, but no subtrees. empty log. - static const int STATE_FAILED = 2; // down, active subtrees; needs to be recovered. - - static const int STATE_BOOT = -3; // up, boot announcement. destiny unknown. - static const int STATE_STANDBY = -4; // up, idle. waiting for assignment by monitor. - static const int STATE_CREATING = -5; // up, creating MDS instance (new journal, idalloc..). - static const int STATE_STARTING = -6; // up, starting prior stopped MDS instance. - - static const int STATE_REPLAY = 7; // up, starting prior failed instance. scanning journal. - static const int STATE_RESOLVE = 8; // up, disambiguating distributed operations (import, rename, etc.) 
- static const int STATE_RECONNECT = 9; // up, reconnect to clients - static const int STATE_REJOIN = 10; // up, replayed journal, rejoining distributed cache - static const int STATE_ACTIVE = 11; // up, active - static const int STATE_STOPPING = 12; // up, exporting metadata (-> standby or out) - - static const char *get_state_name(int s) { - switch (s) { - // down and out - case STATE_DNE: return "down:dne"; - case STATE_STOPPED: return "down:stopped"; - // down and in - case STATE_FAILED: return "down:failed"; - // up and out - case STATE_BOOT: return "up:boot"; - case STATE_CREATING: return "up:creating"; - case STATE_STARTING: return "up:starting"; - case STATE_STANDBY: return "up:standby"; - // up and in - case STATE_REPLAY: return "up:replay"; - case STATE_RESOLVE: return "up:resolve"; - case STATE_RECONNECT: return "up:reconnect"; - case STATE_REJOIN: return "up:rejoin"; - case STATE_ACTIVE: return "up:active"; - case STATE_STOPPING: return "up:stopping"; - default: assert(0); - } - return 0; - } - - protected: - epoch_t epoch; - utime_t created; - epoch_t same_in_set_since; // note: this does not reflect exit-by-failure. - - int target_num; - int anchortable; // which MDS has anchortable (fixme someday) - int root; // which MDS has root directory - - set mds_created; // which mds ids have initialized journals and id tables. 
- map mds_state; // MDS state - map mds_state_seq; - map mds_inst; // up instances - map mds_inc; // incarnation count (monotonically increases) - - friend class MDSMonitor; - - public: - MDSMap() : epoch(0), same_in_set_since(0), anchortable(0), root(0) {} - - epoch_t get_epoch() const { return epoch; } - void inc_epoch() { epoch++; } - - const utime_t& get_create() const { return created; } - epoch_t get_same_in_set_since() const { return same_in_set_since; } - - int get_anchortable() const { return anchortable; } - int get_root() const { return root; } - - // counts - int get_num_mds() { - return get_num_in_mds(); - } - int get_num_mds(int state) { - int n = 0; - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (p->second == state) ++n; - return n; - } - - int get_num_in_mds() { - int n = 0; - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (p->second > 0) ++n; - return n; - } - - // sets - void get_mds_set(set& s) { - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - s.insert(p->first); - } - void get_mds_set(set& s, int state) { - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (p->second == state) - s.insert(p->first); - } - void get_up_mds_set(set& s) { - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (is_up(p->first)) s.insert(p->first); - } - void get_in_mds_set(set& s) { - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (is_in(p->first)) s.insert(p->first); - } - void get_active_mds_set(set& s) { - get_mds_set(s, MDSMap::STATE_ACTIVE); - } - void get_failed_mds_set(set& s) { - get_mds_set(s, MDSMap::STATE_FAILED); - } - void get_recovery_mds_set(set& s) { - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (is_failed(p->first) || - (p->second >= STATE_REPLAY && p->second <= STATE_STOPPING)) - s.insert(p->first); - 
} - - int get_random_in_mds() { - vector v; - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (p->second > 0) v.push_back(p->first); - if (v.empty()) - return -1; - else - return v[rand() % v.size()]; - } - - - // mds states - bool is_down(int m) { return is_dne(m) || is_stopped(m) || is_failed(m); } - bool is_up(int m) { return !is_down(m); } - bool is_in(int m) { return mds_state.count(m) && mds_state[m] > 0; } - bool is_out(int m) { return !mds_state.count(m) || mds_state[m] <= 0; } - - bool is_dne(int m) { return mds_state.count(m) == 0 || mds_state[m] == STATE_DNE; } - bool is_failed(int m) { return mds_state.count(m) && mds_state[m] == STATE_FAILED; } - - bool is_boot(int m) { return mds_state.count(m) && mds_state[m] == STATE_BOOT; } - bool is_standby(int m) { return mds_state.count(m) && mds_state[m] == STATE_STANDBY; } - bool is_creating(int m) { return mds_state.count(m) && mds_state[m] == STATE_CREATING; } - bool is_starting(int m) { return mds_state.count(m) && mds_state[m] == STATE_STARTING; } - bool is_replay(int m) { return mds_state.count(m) && mds_state[m] == STATE_REPLAY; } - bool is_resolve(int m) { return mds_state.count(m) && mds_state[m] == STATE_RESOLVE; } - bool is_reconnect(int m) { return mds_state.count(m) && mds_state[m] == STATE_RECONNECT; } - bool is_rejoin(int m) { return mds_state.count(m) && mds_state[m] == STATE_REJOIN; } - bool is_active(int m) { return mds_state.count(m) && mds_state[m] == STATE_ACTIVE; } - bool is_stopping(int m) { return mds_state.count(m) && mds_state[m] == STATE_STOPPING; } - bool is_active_or_stopping(int m) { return is_active(m) || is_stopping(m); } - bool is_stopped(int m) { return mds_state.count(m) && mds_state[m] == STATE_STOPPED; } - - bool has_created(int m) { return mds_created.count(m); } - - // cluster states - bool is_full() { - return get_num_in_mds() >= target_num; - } - bool is_degraded() { // degraded = some recovery in process. 
fixes active membership and recovery_set. - return - get_num_mds(STATE_REPLAY) + - get_num_mds(STATE_RESOLVE) + - get_num_mds(STATE_RECONNECT) + - get_num_mds(STATE_REJOIN) + - get_num_mds(STATE_FAILED); - } - bool is_rejoining() { - // nodes are rejoining cache state - return - get_num_mds(STATE_REJOIN) > 0 && - get_num_mds(STATE_REPLAY) == 0 && - get_num_mds(STATE_RECONNECT) == 0 && - get_num_mds(STATE_RESOLVE) == 0 && - get_num_mds(STATE_FAILED) == 0; - } - bool is_stopped() { - return - get_num_in_mds() == 0 && - get_num_mds(STATE_CREATING) == 0 && - get_num_mds(STATE_STARTING) == 0 && - get_num_mds(STATE_STANDBY) == 0; - } - - bool would_be_overfull_with(int mds) { - int in = 1; // mds! - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) { - if (p->first == mds) continue; - if (p->second > 0 || - p->second == STATE_STARTING || - p->second == STATE_CREATING) - in++; - } - return (in > target_num); - } - - int get_state(int m) { - if (mds_state.count(m)) - return mds_state[m]; - else - return STATE_DNE; - } - - // inst - bool have_inst(int m) { - return mds_inst.count(m); - } - const entity_inst_t& get_inst(int m) { - assert(mds_inst.count(m)); - return mds_inst[m]; - } - bool get_inst(int m, entity_inst_t& inst) { - if (mds_inst.count(m)) { - inst = mds_inst[m]; - return true; - } - return false; - } - - int get_addr_rank(const entity_addr_t& addr) { - for (map::iterator p = mds_inst.begin(); - p != mds_inst.end(); - ++p) { - if (p->second.addr == addr) return p->first; - } - /*else - for (map::iterator p = mds_inst.begin(); - p != mds_inst.end(); - ++p) { - if (memcmp(&p->second.addr,&inst.addr, sizeof(inst.addr)) == 0) return p->first; - } - */ - - return -1; - } - - int get_inc(int m) { - if (mds_inc.count(m)) - return mds_inc[m]; - return 0; - } - - - void remove_mds(int m) { - mds_inst.erase(m); - mds_state.erase(m); - mds_state_seq.erase(m); - } - - - // serialize, unserialize - void encode(bufferlist& bl) { - ::_encode(epoch, 
bl); - ::_encode(target_num, bl); - ::_encode(created, bl); - ::_encode(same_in_set_since, bl); - ::_encode(anchortable, bl); - ::_encode(root, bl); - ::_encode(mds_state, bl); - ::_encode(mds_state_seq, bl); - ::_encode(mds_inst, bl); - ::_encode(mds_inc, bl); - } - - void decode(bufferlist& bl) { - int off = 0; - ::_decode(epoch, bl, off); - ::_decode(target_num, bl, off); - ::_decode(created, bl, off); - ::_decode(same_in_set_since, bl, off); - ::_decode(anchortable, bl, off); - ::_decode(root, bl, off); - ::_decode(mds_state, bl, off); - ::_decode(mds_state_seq, bl, off); - ::_decode(mds_inst, bl, off); - ::_decode(mds_inc, bl, off); - } - - - /*** mapping functions ***/ - - int hash_dentry( inodeno_t dirino, const string& dn ); -}; - -#endif diff --git a/branches/sage/crush/mds/Migrator.cc b/branches/sage/crush/mds/Migrator.cc deleted file mode 100644 index ac02938ddbe88..0000000000000 --- a/branches/sage/crush/mds/Migrator.cc +++ /dev/null @@ -1,2114 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "MDS.h" -#include "MDCache.h" -#include "CInode.h" -#include "CDir.h" -#include "CDentry.h" -#include "Migrator.h" -#include "Locker.h" -#include "Migrator.h" - -#include "MDBalancer.h" -#include "MDLog.h" -#include "MDSMap.h" - -#include "include/filepath.h" - -#include "events/EString.h" -#include "events/EExport.h" -#include "events/EImportStart.h" -#include "events/EImportFinish.h" - -#include "msg/Messenger.h" - -#include "messages/MClientFileCaps.h" - -#include "messages/MExportDirDiscover.h" -#include "messages/MExportDirDiscoverAck.h" -#include "messages/MExportDirCancel.h" -#include "messages/MExportDirPrep.h" -#include "messages/MExportDirPrepAck.h" -#include "messages/MExportDir.h" -#include "messages/MExportDirAck.h" -#include "messages/MExportDirNotify.h" -#include "messages/MExportDirNotifyAck.h" -#include "messages/MExportDirFinish.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds || l <= g_conf.debug_mds_migrator) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".migrator " - - - -void Migrator::dispatch(Message *m) -{ - switch (m->get_type()) { - // import - case MSG_MDS_EXPORTDIRDISCOVER: - handle_export_discover((MExportDirDiscover*)m); - break; - case MSG_MDS_EXPORTDIRPREP: - handle_export_prep((MExportDirPrep*)m); - break; - case MSG_MDS_EXPORTDIR: - handle_export_dir((MExportDir*)m); - break; - case MSG_MDS_EXPORTDIRFINISH: - handle_export_finish((MExportDirFinish*)m); - break; - case MSG_MDS_EXPORTDIRCANCEL: - handle_export_cancel((MExportDirCancel*)m); - break; - - // export - case MSG_MDS_EXPORTDIRDISCOVERACK: - handle_export_discover_ack((MExportDirDiscoverAck*)m); - break; - case MSG_MDS_EXPORTDIRPREPACK: - handle_export_prep_ack((MExportDirPrepAck*)m); - break; - case MSG_MDS_EXPORTDIRACK: - handle_export_ack((MExportDirAck*)m); - break; - case MSG_MDS_EXPORTDIRNOTIFYACK: - handle_export_notify_ack((MExportDirNotifyAck*)m); - break; - - // export 3rd party 
(dir_auth adjustments) - case MSG_MDS_EXPORTDIRNOTIFY: - handle_export_notify((MExportDirNotify*)m); - break; - - default: - assert(0); - } -} - - -class C_MDC_EmptyImport : public Context { - Migrator *mig; - CDir *dir; -public: - C_MDC_EmptyImport(Migrator *m, CDir *d) : mig(m), dir(d) {} - void finish(int r) { - mig->export_empty_import(dir); - } -}; - - -void Migrator::export_empty_import(CDir *dir) -{ - dout(7) << "export_empty_import " << *dir << dendl; - assert(dir->is_subtree_root()); - - if (dir->inode->is_auth()) { - dout(7) << " inode is auth" << dendl; - return; - } - if (!dir->is_auth()) { - dout(7) << " not auth" << dendl; - return; - } - if (dir->is_freezing() || dir->is_frozen()) { - dout(7) << " freezing or frozen" << dendl; - return; - } - if (dir->get_size() > 0) { - dout(7) << " not actually empty" << dendl; - return; - } - if (dir->inode->is_root()) { - dout(7) << " root" << dendl; - return; - } - - int dest = dir->inode->authority().first; - //if (mds->is_shutting_down()) dest = 0; // this is more efficient. - - dout(7) << " really empty, exporting to " << dest << dendl; - assert (dest != mds->get_nodeid()); - - dout(7) << "exporting to mds" << dest - << " empty import " << *dir << dendl; - export_dir( dir, dest ); -} - - - - -// ========================================================== -// mds failure handling - -void Migrator::handle_mds_failure_or_stop(int who) -{ - dout(5) << "handle_mds_failure_or_stop mds" << who << dendl; - - // check my exports - map::iterator p = export_state.begin(); - while (p != export_state.end()) { - map::iterator next = p; - next++; - CDir *dir = p->first; - - // abort exports: - // - that are going to the failed node - // - that aren't frozen yet (to avoid auth_pin deadlock) - if (export_peer[dir] == who || - p->second == EXPORT_DISCOVERING || p->second == EXPORT_FREEZING) { - // the guy i'm exporting to failed, or we're just freezing. 
- dout(10) << "cleaning up export state " << p->second << " of " << *dir << dendl; - - switch (p->second) { - case EXPORT_DISCOVERING: - dout(10) << "export state=discovering : canceling freeze and removing auth_pin" << dendl; - dir->unfreeze_tree(); // cancel the freeze - dir->auth_unpin(); - export_state.erase(dir); // clean up - dir->state_clear(CDir::STATE_EXPORTING); - if (export_peer[dir] != who) // tell them. - mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir], MDS_PORT_MIGRATOR); - break; - - case EXPORT_FREEZING: - dout(10) << "export state=freezing : canceling freeze" << dendl; - dir->unfreeze_tree(); // cancel the freeze - export_state.erase(dir); // clean up - dir->state_clear(CDir::STATE_EXPORTING); - if (export_peer[dir] != who) // tell them. - mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir], MDS_PORT_MIGRATOR); - break; - - // NOTE: state order reversal, warning comes after loggingstart+prepping - case EXPORT_WARNING: - dout(10) << "export state=warning : unpinning bounds, unfreezing, notifying" << dendl; - // fall-thru - - //case EXPORT_LOGGINGSTART: - case EXPORT_PREPPING: - if (p->second != EXPORT_WARNING) - dout(10) << "export state=loggingstart|prepping : unpinning bounds, unfreezing" << dendl; - { - // unpin bounds - set bounds; - cache->get_subtree_bounds(dir, bounds); - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *bd = *p; - bd->put(CDir::PIN_EXPORTBOUND); - bd->state_clear(CDir::STATE_EXPORTBOUND); - } - } - dir->unfreeze_tree(); - cache->adjust_subtree_auth(dir, mds->get_nodeid()); - cache->try_subtree_merge(dir); - export_state.erase(dir); // clean up - dir->state_clear(CDir::STATE_EXPORTING); - break; - - case EXPORT_EXPORTING: - dout(10) << "export state=exporting : reversing, and unfreezing" << dendl; - export_reverse(dir); - export_state.erase(dir); // clean up - dir->state_clear(CDir::STATE_EXPORTING); - break; - - case EXPORT_LOGGINGFINISH: - case 
EXPORT_NOTIFYING: - dout(10) << "export state=loggingfinish|notifying : ignoring dest failure, we were successful." << dendl; - // leave export_state, don't clean up now. - break; - - default: - assert(0); - } - - // finish clean-up? - if (export_state.count(dir) == 0) { - export_peer.erase(dir); - export_warning_ack_waiting.erase(dir); - export_notify_ack_waiting.erase(dir); - - // unpin the path - vector trace; - cache->make_trace(trace, dir->inode); - mds->locker->dentry_anon_rdlock_trace_finish(trace); - - // wake up any waiters - mds->queue_waiters(export_finish_waiters[dir]); - export_finish_waiters.erase(dir); - - // send pending import_maps? (these need to go out when all exports have finished.) - cache->maybe_send_pending_resolves(); - - cache->show_subtrees(); - - maybe_do_queued_export(); - } - } else { - // bystander failed. - if (export_warning_ack_waiting.count(dir) && - export_warning_ack_waiting[dir].count(who)) { - export_warning_ack_waiting[dir].erase(who); - export_notify_ack_waiting[dir].erase(who); // they won't get a notify either. - if (p->second == EXPORT_WARNING) { - // exporter waiting for warning acks, let's fake theirs. - dout(10) << "faking export_warning_ack from mds" << who - << " on " << *dir << " to mds" << export_peer[dir] - << dendl; - if (export_warning_ack_waiting[dir].empty()) - export_go(dir); - } - } - if (export_notify_ack_waiting.count(dir) && - export_notify_ack_waiting[dir].count(who)) { - export_notify_ack_waiting[dir].erase(who); - if (p->second == EXPORT_NOTIFYING) { - // exporter is waiting for notify acks, fake it - dout(10) << "faking export_notify_ack from mds" << who - << " on " << *dir << " to mds" << export_peer[dir] - << dendl; - if (export_notify_ack_waiting[dir].empty()) - export_finish(dir); - } - } - } - - // next! 
- p = next; - } - - - // check my imports - map::iterator q = import_state.begin(); - while (q != import_state.end()) { - map::iterator next = q; - next++; - dirfrag_t df = q->first; - CInode *diri = mds->mdcache->get_inode(df.ino); - CDir *dir = mds->mdcache->get_dirfrag(df); - - if (import_peer[df] == who) { - switch (q->second) { - case IMPORT_DISCOVERING: - dout(10) << "import state=discovering : clearing state" << dendl; - import_state.erase(df); - import_peer.erase(df); - break; - - case IMPORT_DISCOVERED: - dout(10) << "import state=discovered : unpinning inode " << *diri << dendl; - assert(diri); - // unpin base - diri->put(CInode::PIN_IMPORTING); - import_state.erase(df); - import_peer.erase(df); - break; - - case IMPORT_PREPPING: - if (q->second == IMPORT_PREPPING) { - dout(10) << "import state=prepping : unpinning base+bounds " << *dir << dendl; - } - assert(dir); - { - set bounds; - cache->map_dirfrag_set(import_bound_ls[dir], bounds); - import_remove_pins(dir, bounds); - import_reverse_final(dir); - } - break; - - case IMPORT_PREPPED: - dout(10) << "import state=prepped : unpinning base+bounds, unfreezing " << *dir << dendl; - assert(dir); - { - set bounds; - cache->get_subtree_bounds(dir, bounds); - import_remove_pins(dir, bounds); - - // adjust auth back to me - cache->adjust_subtree_auth(dir, import_peer[df]); - cache->try_subtree_merge(dir); - - // bystanders? - if (import_bystanders[dir].empty()) { - import_reverse_unfreeze(dir); - } else { - // notify them; wait in aborting state - import_notify_abort(dir, bounds); - import_state[df] = IMPORT_ABORTING; - } - } - break; - - case IMPORT_LOGGINGSTART: - dout(10) << "import state=loggingstart : reversing import on " << *dir << dendl; - import_reverse(dir); - break; - - case IMPORT_ACKING: - // hrm. 
make this an ambiguous import, and wait for exporter recovery to disambiguate - dout(10) << "import state=acking : noting ambiguous import " << *dir << dendl; - { - set bounds; - cache->get_subtree_bounds(dir, bounds); - cache->add_ambiguous_import(dir, bounds); - } - break; - - case IMPORT_ABORTING: - dout(10) << "import state=aborting : ignoring repeat failure " << *dir << dendl; - break; - } - } else { - if (q->second == IMPORT_ABORTING && - import_bystanders[dir].count(who)) { - dout(10) << "faking export_notify_ack from mds" << who - << " on aborting import " << *dir << " from mds" << import_peer[df] - << dendl; - import_bystanders[dir].erase(who); - if (import_bystanders[dir].empty()) { - import_bystanders.erase(dir); - import_reverse_unfreeze(dir); - } - } - } - - // next! - q = next; - } -} - - - -void Migrator::show_importing() -{ - dout(10) << "show_importing" << dendl; - for (map::iterator p = import_state.begin(); - p != import_state.end(); - p++) { - CDir *dir = mds->mdcache->get_dirfrag(p->first); - if (dir) { - dout(10) << " importing from " << import_peer[p->first] - << ": (" << p->second << ") " << get_import_statename(p->second) - << " " << p->first - << " " << *dir - << dendl; - } else { - dout(10) << " importing from " << import_peer[p->first] - << ": (" << p->second << ") " << get_import_statename(p->second) - << " " << p->first - << dendl; - } - } -} - -void Migrator::show_exporting() -{ - dout(10) << "show_exporting" << dendl; - for (map::iterator p = export_state.begin(); - p != export_state.end(); - p++) - dout(10) << " exporting to " << export_peer[p->first] - << ": (" << p->second << ") " << get_export_statename(p->second) - << " " << p->first->dirfrag() - << " " << *p->first - << dendl; -} - - - -void Migrator::audit() -{ - if (g_conf.debug_mds < 5) return; // hrm. 
- - // import_state - show_importing(); - for (map::iterator p = import_state.begin(); - p != import_state.end(); - p++) { - if (p->second == IMPORT_DISCOVERING) - continue; - if (p->second == IMPORT_DISCOVERED) { - CInode *in = cache->get_inode(p->first.ino); - assert(in); - continue; - } - CDir *dir = cache->get_dirfrag(p->first); - assert(dir); - if (p->second == IMPORT_PREPPING) - continue; - assert(dir->is_ambiguous_dir_auth()); - assert(dir->authority().first == mds->get_nodeid() || - dir->authority().second == mds->get_nodeid()); - } - - // export_state - show_exporting(); - for (map::iterator p = export_state.begin(); - p != export_state.end(); - p++) { - CDir *dir = p->first; - if (p->second == EXPORT_DISCOVERING || - p->second == EXPORT_FREEZING) continue; - assert(dir->is_ambiguous_dir_auth()); - assert(dir->authority().first == mds->get_nodeid() || - dir->authority().second == mds->get_nodeid()); - } - - // ambiguous+me subtrees should be importing|exporting - - // write me -} - - - - - -// ========================================================== -// EXPORT - -void Migrator::export_dir_nicely(CDir *dir, int dest) -{ - // enqueue - dout(7) << "export_dir_nicely " << *dir << " to " << dest << dendl; - export_queue.push_back(pair(dir->dirfrag(), dest)); - - maybe_do_queued_export(); -} - -void Migrator::maybe_do_queued_export() -{ - while (!export_queue.empty() && - export_state.size() <= 4) { - dirfrag_t df = export_queue.front().first; - int dest = export_queue.front().second; - export_queue.pop_front(); - - CDir *dir = mds->mdcache->get_dirfrag(df); - if (!dir) continue; - if (!dir->is_auth()) continue; - - dout(-7) << "nicely exporting to mds" << dest << " " << *dir << dendl; - - export_dir(dir, dest); - } -} - - - - -class C_MDC_ExportFreeze : public Context { - Migrator *mig; - CDir *ex; // dir i'm exporting - -public: - C_MDC_ExportFreeze(Migrator *m, CDir *e) : - mig(m), ex(e) {} - virtual void finish(int r) { - if (r >= 0) - 
mig->export_frozen(ex); - } -}; - - -/** export_dir(dir, dest) - * public method to initiate an export. - * will fail if the directory is freezing, frozen, unpinnable, or root. - */ -void Migrator::export_dir(CDir *dir, int dest) -{ - dout(7) << "export_dir " << *dir << " to " << dest << dendl; - assert(dir->is_auth()); - assert(dest != mds->get_nodeid()); - - if (mds->mdsmap->is_degraded()) { - dout(7) << "cluster degraded, no exports for now" << dendl; - return; - } - - if (dir->inode->is_root()) { - dout(7) << "i won't export root" << dendl; - //assert(0); - return; - } - - if (dir->is_frozen() || - dir->is_freezing()) { - dout(7) << " can't export, freezing|frozen. wait for other exports to finish first." << dendl; - return; - } - if (dir->state_test(CDir::STATE_EXPORTING)) { - dout(7) << "already exporting" << dendl; - return; - } - - // pin path? - vector trace; - cache->make_trace(trace, dir->inode); - if (!mds->locker->dentry_can_rdlock_trace(trace)) { - dout(7) << "export_dir couldn't pin path, failing." << dendl; - return; - } - - // ok. - mds->locker->dentry_anon_rdlock_trace_start(trace); - assert(export_state.count(dir) == 0); - export_state[dir] = EXPORT_DISCOVERING; - export_peer[dir] = dest; - - dir->state_set(CDir::STATE_EXPORTING); - - // send ExportDirDiscover (ask target) - mds->send_message_mds(new MExportDirDiscover(dir), dest, MDS_PORT_MIGRATOR); - - // start the freeze, but hold it up with an auth_pin. - dir->auth_pin(); - dir->freeze_tree(); - assert(dir->is_freezing_tree()); - dir->add_waiter(CDir::WAIT_FROZEN, new C_MDC_ExportFreeze(this, dir)); -} - - -/* - * called on receipt of MExportDirDiscoverAck - * the importer now has the directory's _inode_ in memory, and pinned. 
- */ -void Migrator::handle_export_discover_ack(MExportDirDiscoverAck *m) -{ - CDir *dir = cache->get_dirfrag(m->get_dirfrag()); - assert(dir); - - dout(7) << "export_discover_ack from " << m->get_source() - << " on " << *dir << dendl; - - if (export_state.count(dir) == 0 || - export_state[dir] != EXPORT_DISCOVERING || - export_peer[dir] != m->get_source().num()) { - dout(7) << "must have aborted" << dendl; - } else { - // freeze the subtree - export_state[dir] = EXPORT_FREEZING; - dir->auth_unpin(); - } - - delete m; // done -} - -void Migrator::export_frozen(CDir *dir) -{ - dout(7) << "export_frozen on " << *dir << dendl; - assert(dir->is_frozen()); - assert(dir->get_cum_auth_pins() == 0); - - // ok! - int dest = export_peer[dir]; - - cache->show_subtrees(); - - // note the bounds. - // force it into a subtree by listing auth as . - cache->adjust_subtree_auth(dir, mds->get_nodeid(), mds->get_nodeid()); - set bounds; - cache->get_subtree_bounds(dir, bounds); - - // generate prep message, log entry. - MExportDirPrep *prep = new MExportDirPrep(dir->dirfrag()); - - // include list of bystanders - for (map::iterator p = dir->replicas_begin(); - p != dir->replicas_end(); - p++) { - if (p->first != dest) { - dout(10) << "bystander mds" << p->first << dendl; - prep->add_bystander(p->first); - } - } - - /* include spanning tree for all nested exports. - * these need to be on the destination _before_ the final export so that - * dir_auth updates on any nested exports are properly absorbed. - * this includes inodes and dirfrags included in the subtree, but - * only the inodes at the bounds. - */ - set inodes_added; - - // include base dirfrag - prep->add_dirfrag( new CDirDiscover(dir, dir->add_replica(dest)) ); - - // check bounds - for (set::iterator it = bounds.begin(); - it != bounds.end(); - it++) { - CDir *bound = *it; - - // pin it. 
- bound->get(CDir::PIN_EXPORTBOUND); - bound->state_set(CDir::STATE_EXPORTBOUND); - - dout(7) << " export bound " << *bound << dendl; - - prep->add_export( bound->dirfrag() ); - - /* first assemble each trace, in trace order, and put in message */ - list inode_trace; - - // trace to dir - CDir *cur = bound; - while (cur != dir) { - // don't repeat ourselves - if (inodes_added.count(cur->ino())) break; // did already! - inodes_added.insert(cur->ino()); - - // inode - assert(cur->inode->is_auth()); - inode_trace.push_front(cur->inode); - dout(7) << " will add " << *cur->inode << dendl; - - // include the dirfrag? only if it's not the bounding subtree root. - if (cur != bound) { - assert(cur->is_auth()); - prep->add_dirfrag( cur->replicate_to(dest) ); // yay! - dout(7) << " added " << *cur << dendl; - } - - cur = cur->get_parent_dir(); - } - - for (list::iterator it = inode_trace.begin(); - it != inode_trace.end(); - it++) { - CInode *in = *it; - dout(7) << " added " << *in->parent << dendl; - dout(7) << " added " << *in << dendl; - prep->add_inode( in->parent->get_dir()->dirfrag(), - in->parent->get_name(), - in->parent->replicate_to(dest), - in->replicate_to(dest) ); - } - - } - - // send. - export_state[dir] = EXPORT_PREPPING; - mds->send_message_mds(prep, dest, MDS_PORT_MIGRATOR); -} - -void Migrator::handle_export_prep_ack(MExportDirPrepAck *m) -{ - CDir *dir = cache->get_dirfrag(m->get_dirfrag()); - assert(dir); - - dout(7) << "export_prep_ack " << *dir << dendl; - - if (export_state.count(dir) == 0 || - export_state[dir] != EXPORT_PREPPING) { - // export must have aborted. 
- dout(7) << "export must have aborted" << dendl; - delete m; - return; - } - - // send warnings - int dest = export_peer[dir]; - set bounds; - cache->get_subtree_bounds(dir, bounds); - - assert(export_peer.count(dir)); - assert(export_warning_ack_waiting.count(dir) == 0); - assert(export_notify_ack_waiting.count(dir) == 0); - - for (map::iterator p = dir->replicas_begin(); - p != dir->replicas_end(); - ++p) { - if (p->first == dest) continue; - if (!mds->mdsmap->is_active_or_stopping(p->first)) - continue; // only if active - export_warning_ack_waiting[dir].insert(p->first); - export_notify_ack_waiting[dir].insert(p->first); // we'll eventually get a notifyack, too! - - MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(), true, - pair(mds->get_nodeid(),CDIR_AUTH_UNKNOWN), - pair(mds->get_nodeid(),export_peer[dir])); - notify->copy_bounds(bounds); - mds->send_message_mds(notify, p->first, MDS_PORT_MIGRATOR); - - } - export_state[dir] = EXPORT_WARNING; - - // nobody to warn? - if (export_warning_ack_waiting.count(dir) == 0) - export_go(dir); // start export. - - // done. - delete m; -} - - -void Migrator::export_go(CDir *dir) -{ - assert(export_peer.count(dir)); - int dest = export_peer[dir]; - dout(7) << "export_go " << *dir << " to " << dest << dendl; - - cache->show_subtrees(); - - export_warning_ack_waiting.erase(dir); - export_state[dir] = EXPORT_EXPORTING; - - assert(dir->get_cum_auth_pins() == 0); - - // set ambiguous auth - cache->adjust_subtree_auth(dir, dest, mds->get_nodeid()); - - // take away the popularity we're sending. - mds->balancer->subtract_export(dir); - - // fill export message with cache data - utime_t now = g_clock.now(); - map exported_client_map; - bufferlist export_data; - int num_exported_inodes = encode_export_dir( export_data, - dir, // recur start point - exported_client_map, - now ); - bufferlist bl; - ::_encode(exported_client_map, bl); - bl.claim_append(export_data); - export_data.claim(bl); - - // send the export data! 
- MExportDir *req = new MExportDir(dir->dirfrag()); - req->take_dirstate(export_data); - - // add bounds to message - set bounds; - cache->get_subtree_bounds(dir, bounds); - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) - req->add_export((*p)->dirfrag()); - - // send - mds->send_message_mds(req, dest, MDS_PORT_MIGRATOR); - - // stats - if (mds->logger) mds->logger->inc("ex"); - if (mds->logger) mds->logger->inc("iex", num_exported_inodes); - - cache->show_subtrees(); -} - - -/** encode_export_inode - * update our local state for this inode to export. - * encode relevant state to be sent over the wire. - * used by: encode_export_dir, file_rename (if foreign) - * - * FIXME: the separation between CInode.encode_export and these methods - * is pretty arbitrary and dumb. - */ -void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, - map& exported_client_map) -{ - dout(7) << "encode_export_inode " << *in << dendl; - assert(!in->is_replica(mds->get_nodeid())); - - ::_encode_simple(in->inode.ino, enc_state); - in->encode_export(enc_state); - - // make note of clients named by exported capabilities - for (map::iterator it = in->client_caps.begin(); - it != in->client_caps.end(); - it++) - exported_client_map[it->first] = mds->clientmap.get_inst(it->first); -} - -void Migrator::finish_export_inode(CInode *in, utime_t now, list& finished) -{ - dout(12) << "finish_export_inode " << *in << dendl; - - in->finish_export(now); - - // tell (all) clients about migrating caps.. 
mark STALE - for (map::iterator it = in->client_caps.begin(); - it != in->client_caps.end(); - it++) { - dout(7) << "finish_export_inode telling client" << it->first - << " stale caps on " << *in << dendl; - MClientFileCaps *m = new MClientFileCaps(MClientFileCaps::OP_STALE, - in->inode, - it->second.get_last_seq(), - it->second.pending(), - it->second.wanted()); - entity_inst_t inst = mds->clientmap.get_inst(it->first); - mds->send_message_client_maybe_open(m, inst); - } - in->clear_client_caps(); - - // relax locks? - if (!in->is_replicated()) - in->replicate_relax_locks(); - - // clean - if (in->is_dirty()) in->mark_clean(); - - // clear/unpin cached_by (we're no longer the authority) - in->clear_replica_map(); - - // twiddle lock states for auth -> replica transition - in->authlock.export_twiddle(); - in->linklock.export_twiddle(); - in->dirfragtreelock.export_twiddle(); - in->filelock.export_twiddle(); - in->dirlock.export_twiddle(); - - // mark auth - assert(in->is_auth()); - in->state_clear(CInode::STATE_AUTH); - in->replica_nonce = CInode::EXPORT_NONCE; - - // waiters - in->take_waiting(CInode::WAIT_ANY, finished); - - // *** other state too? - - // move to end of LRU so we drop out of cache quickly! 
- if (in->get_parent_dn()) - cache->lru.lru_bottouch(in->get_parent_dn()); - -} - -int Migrator::encode_export_dir(bufferlist& exportbl, - CDir *dir, - map& exported_client_map, - utime_t now) -{ - int num_exported = 0; - - dout(7) << "encode_export_dir " << *dir << " " << dir->nitems << " items" << dendl; - - assert(dir->get_projected_version() == dir->get_version()); - - // dir - dirfrag_t df = dir->dirfrag(); - ::_encode_simple(df, exportbl); - dir->encode_export(exportbl); - - long nden = dir->items.size(); - ::_encode_simple(nden, exportbl); - - // dentries - list subdirs; - CDir::map_t::iterator it; - for (it = dir->begin(); it != dir->end(); it++) { - CDentry *dn = it->second; - CInode *in = dn->get_inode(); - - num_exported++; - - // -- dentry - dout(7) << "encode_export_dir exporting " << *dn << dendl; - - // dn name - ::_encode(it->first, exportbl); - - // state - dn->encode_export(exportbl); - - // points to... - - // null dentry? - if (dn->is_null()) { - exportbl.append("N", 1); // null dentry - continue; - } - - if (dn->is_remote()) { - // remote link - exportbl.append("L", 1); // remote link - - inodeno_t ino = dn->get_remote_ino(); - unsigned char d_type = dn->get_remote_d_type(); - ::_encode(ino, exportbl); - ::_encode(d_type, exportbl); - continue; - } - - // primary link - // -- inode - exportbl.append("I", 1); // inode dentry - - encode_export_inode(in, exportbl, exported_client_map); // encode, and (update state for) export - - // directory? 
- list dfs; - in->get_dirfrags(dfs); - for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) { - CDir *dir = *p; - if (!dir->state_test(CDir::STATE_EXPORTBOUND)) { - // include nested dirfrag - assert(dir->get_dir_auth().first == CDIR_AUTH_PARENT); - subdirs.push_back(dir); // it's ours, recurse (later) - } - } - } - - // subdirs - for (list::iterator it = subdirs.begin(); it != subdirs.end(); it++) - num_exported += encode_export_dir(exportbl, *it, exported_client_map, now); - - return num_exported; -} - -void Migrator::finish_export_dir(CDir *dir, list& finished, utime_t now) -{ - dout(10) << "finish_export_dir " << *dir << dendl; - - // release open_by - dir->clear_replica_map(); - - // mark - assert(dir->is_auth()); - dir->state_clear(CDir::STATE_AUTH); - dir->replica_nonce = CDir::NONCE_EXPORT; - - if (dir->is_dirty()) - dir->mark_clean(); - - // discard most dir state - dir->state &= CDir::MASK_STATE_EXPORT_KEPT; // i only retain a few things. - - // suck up all waiters - dir->take_waiting(CDir::WAIT_ANY, finished); // all dir waiters - - // pop - dir->finish_export(now); - - // dentries - list subdirs; - CDir::map_t::iterator it; - for (it = dir->begin(); it != dir->end(); it++) { - CDentry *dn = it->second; - CInode *in = dn->get_inode(); - - // dentry - dn->finish_export(); - - // inode? - if (dn->is_primary()) { - finish_export_inode(in, now, finished); - - // subdirs? - in->get_nested_dirfrags(subdirs); - } - } - - // subdirs - for (list::iterator it = subdirs.begin(); it != subdirs.end(); it++) - finish_export_dir(*it, finished, now); -} - -class C_MDS_ExportFinishLogged : public Context { - Migrator *migrator; - CDir *dir; -public: - C_MDS_ExportFinishLogged(Migrator *m, CDir *d) : migrator(m), dir(d) {} - void finish(int r) { - migrator->export_logged_finish(dir); - } -}; - - -/* - * i should get an export_ack from the export target. 
- */ -void Migrator::handle_export_ack(MExportDirAck *m) -{ - CDir *dir = cache->get_dirfrag(m->get_dirfrag()); - assert(dir); - assert(dir->is_frozen_tree_root()); // i'm exporting! - - // yay! - dout(7) << "handle_export_ack " << *dir << dendl; - - export_warning_ack_waiting.erase(dir); - - export_state[dir] = EXPORT_LOGGINGFINISH; - - set bounds; - cache->get_subtree_bounds(dir, bounds); - - // log completion. - // include export bounds, to ensure they're in the journal. - EExport *le = new EExport(mds->mdlog, dir); - le->metablob.add_dir_context(dir); - le->metablob.add_dir( dir, false ); - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *bound = *p; - le->get_bounds().insert(bound->dirfrag()); - le->metablob.add_dir_context(bound); - le->metablob.add_dir(bound, false); - } - - // log export completion, then finish (unfreeze, trigger finish context, etc.) - mds->mdlog->submit_entry(le, - new C_MDS_ExportFinishLogged(this, dir)); - - delete m; -} - - - - - -/* - * this happens if hte dest failes after i send teh export data but before it is acked - * that is, we don't know they safely received and logged it, so we reverse our changes - * and go on. - */ -void Migrator::export_reverse(CDir *dir) -{ - dout(7) << "export_reverse " << *dir << dendl; - - assert(export_state[dir] == EXPORT_EXPORTING); - - set bounds; - cache->get_subtree_bounds(dir, bounds); - - // adjust auth, with possible subtree merge. 
- cache->adjust_subtree_auth(dir, mds->get_nodeid()); - cache->try_subtree_merge(dir); - - // remove exporting pins - list rq; - rq.push_back(dir); - while (!rq.empty()) { - CDir *dir = rq.front(); - rq.pop_front(); - dir->abort_export(); - for (CDir::map_t::iterator p = dir->items.begin(); p != dir->items.end(); ++p) { - p->second->abort_export(); - if (!p->second->is_primary()) continue; - CInode *in = p->second->get_inode(); - in->abort_export(); - if (in->is_dir()) - in->get_nested_dirfrags(rq); - } - } - - // unpin bounds - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *bd = *p; - bd->put(CDir::PIN_EXPORTBOUND); - bd->state_clear(CDir::STATE_EXPORTBOUND); - } - - // process delayed expires - cache->process_delayed_expire(dir); - - // some clean up - export_warning_ack_waiting.erase(dir); - export_notify_ack_waiting.erase(dir); - - // unfreeze - dir->unfreeze_tree(); - - cache->show_cache(); -} - - -/* - * once i get the ack, and logged the EExportFinish(true), - * send notifies (if any), otherwise go straight to finish. - * - */ -void Migrator::export_logged_finish(CDir *dir) -{ - dout(7) << "export_logged_finish " << *dir << dendl; - - // send notifies - int dest = export_peer[dir]; - - set bounds; - cache->get_subtree_bounds(dir, bounds); - - for (set::iterator p = export_notify_ack_waiting[dir].begin(); - p != export_notify_ack_waiting[dir].end(); - ++p) { - MExportDirNotify *notify; - if (mds->mdsmap->is_active_or_stopping(export_peer[dir])) - // dest is still alive. - notify = new MExportDirNotify(dir->dirfrag(), true, - pair(mds->get_nodeid(), dest), - pair(dest, CDIR_AUTH_UNKNOWN)); - else - // dest is dead. 
bystanders will think i am only auth, as per mdcache->handle_mds_failure() - notify = new MExportDirNotify(dir->dirfrag(), true, - pair(mds->get_nodeid(), CDIR_AUTH_UNKNOWN), - pair(dest, CDIR_AUTH_UNKNOWN)); - - notify->copy_bounds(bounds); - - mds->send_message_mds(notify, *p, MDS_PORT_MIGRATOR); - } - - // wait for notifyacks - export_state[dir] = EXPORT_NOTIFYING; - - // no notifies to wait for? - if (export_notify_ack_waiting[dir].empty()) - export_finish(dir); // skip notify/notify_ack stage. -} - -/* - * warning: - * i'll get an ack from each bystander. - * when i get them all, do the export. - * notify: - * i'll get an ack from each bystander. - * when i get them all, unfreeze and send the finish. - */ -void Migrator::handle_export_notify_ack(MExportDirNotifyAck *m) -{ - CDir *dir = cache->get_dirfrag(m->get_dirfrag()); - assert(dir); - int from = m->get_source().num(); - - if (export_state.count(dir) && export_state[dir] == EXPORT_WARNING) { - // exporting. process warning. - dout(7) << "handle_export_notify_ack from " << m->get_source() - << ": exporting, processing warning on " - << *dir << dendl; - assert(export_warning_ack_waiting.count(dir)); - export_warning_ack_waiting[dir].erase(from); - - if (export_warning_ack_waiting[dir].empty()) - export_go(dir); // start export. - } - else if (export_state.count(dir) && export_state[dir] == EXPORT_NOTIFYING) { - // exporting. process notify. 
- dout(7) << "handle_export_notify_ack from " << m->get_source() - << ": exporting, processing notify on " - << *dir << dendl; - assert(export_notify_ack_waiting.count(dir)); - export_notify_ack_waiting[dir].erase(from); - - if (export_notify_ack_waiting[dir].empty()) - export_finish(dir); - } - else if (import_state.count(dir->dirfrag()) && import_state[dir->dirfrag()] == IMPORT_ABORTING) { - // reversing import - dout(7) << "handle_export_notify_ack from " << m->get_source() - << ": aborting import on " - << *dir << dendl; - assert(import_bystanders[dir].count(from)); - import_bystanders[dir].erase(from); - if (import_bystanders[dir].empty()) { - import_bystanders.erase(dir); - import_reverse_unfreeze(dir); - } - } - - delete m; -} - - -void Migrator::export_finish(CDir *dir) -{ - dout(5) << "export_finish " << *dir << dendl; - - if (export_state.count(dir) == 0) { - dout(7) << "target must have failed, not sending final commit message. export succeeded anyway." << dendl; - return; - } - - // send finish/commit to new auth - if (mds->mdsmap->is_active_or_stopping(export_peer[dir])) { - mds->send_message_mds(new MExportDirFinish(dir->dirfrag()), - export_peer[dir], MDS_PORT_MIGRATOR); - } else { - dout(7) << "not sending MExportDirFinish, dest has failed" << dendl; - } - - // finish export (adjust local cache state) - C_Contexts *fin = new C_Contexts; - finish_export_dir(dir, fin->contexts, g_clock.now()); - dir->add_waiter(CDir::WAIT_UNFREEZE, fin); - - // unfreeze - dout(7) << "export_finish unfreezing" << dendl; - dir->unfreeze_tree(); - - // unpin bounds - set bounds; - cache->get_subtree_bounds(dir, bounds); - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *bd = *p; - bd->put(CDir::PIN_EXPORTBOUND); - bd->state_clear(CDir::STATE_EXPORTBOUND); - } - - // adjust auth, with possible subtree merge. 
- // (we do this _after_ removing EXPORTBOUND pins, to allow merges) - cache->adjust_subtree_auth(dir, export_peer[dir]); - cache->try_subtree_merge(dir); - - // unpin path - dout(7) << "export_finish unpinning path" << dendl; - vector trace; - cache->make_trace(trace, dir->inode); - mds->locker->dentry_anon_rdlock_trace_finish(trace); - - // discard delayed expires - cache->discard_delayed_expire(dir); - - // remove from exporting list, clean up state - dir->state_clear(CDir::STATE_EXPORTING); - export_state.erase(dir); - export_peer.erase(dir); - export_notify_ack_waiting.erase(dir); - - // queue finishers - mds->queue_waiters(export_finish_waiters[dir]); - export_finish_waiters.erase(dir); - - cache->show_subtrees(); - audit(); - - // send pending import_maps? - mds->mdcache->maybe_send_pending_resolves(); - - maybe_do_queued_export(); -} - - - - - - - - -// ========================================================== -// IMPORT - -void Migrator::handle_export_discover(MExportDirDiscover *m) -{ - assert(m->get_source().num() != mds->get_nodeid()); - - dout(7) << "handle_export_discover on " << m->get_path() << dendl; - - // note import state - dirfrag_t df = m->get_dirfrag(); - - // only start discovering on this message once. - if (!m->started) { - m->started = true; - import_state[df] = IMPORT_DISCOVERING; - import_peer[df] = m->get_source().num(); - } - - // am i retrying after ancient path_traverse results? - if (import_state.count(df) == 0 && - import_state[df] != IMPORT_DISCOVERING) { - dout(7) << "hmm import_state is off, i must be obsolete lookup" << dendl; - delete m; - return; - } - - // do we have it? - CInode *in = cache->get_inode(m->get_dirfrag().ino); - if (!in) { - // must discover it! 
- filepath fpath(m->get_path()); - vector trace; - int r = cache->path_traverse(0, m, - 0, fpath, trace, true, - MDS_TRAVERSE_DISCOVER); - if (r > 0) return; // wait - if (r < 0) { - dout(7) << "handle_export_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << dendl; - assert(0); // this shouldn't happen if the auth pins his path properly!!!! - } - - assert(0); // this shouldn't happen; the get_inode above would have succeeded. - } - - // yay - dout(7) << "handle_export_discover have " << df << " inode " << *in << dendl; - - import_state[m->get_dirfrag()] = IMPORT_DISCOVERED; - - // pin inode in the cache (for now) - assert(in->is_dir()); - in->get(CInode::PIN_IMPORTING); - - // reply - dout(7) << " sending export_discover_ack on " << *in << dendl; - mds->send_message_mds(new MExportDirDiscoverAck(df), - import_peer[df], MDS_PORT_MIGRATOR); -} - -void Migrator::handle_export_cancel(MExportDirCancel *m) -{ - dout(7) << "handle_export_cancel on " << m->get_dirfrag() << dendl; - - if (import_state[m->get_dirfrag()] == IMPORT_DISCOVERED) { - CInode *in = cache->get_inode(m->get_dirfrag().ino); - assert(in); - in->put(CInode::PIN_IMPORTING); - } else { - assert(import_state[m->get_dirfrag()] == IMPORT_DISCOVERING); - } - - import_state.erase(m->get_dirfrag()); - import_peer.erase(m->get_dirfrag()); - - delete m; -} - - -void Migrator::handle_export_prep(MExportDirPrep *m) -{ - int oldauth = m->get_source().num(); - assert(oldauth != mds->get_nodeid()); - - // make sure we didn't abort - if (import_state.count(m->get_dirfrag()) == 0 || - (import_state[m->get_dirfrag()] != IMPORT_DISCOVERED && - import_state[m->get_dirfrag()] != IMPORT_PREPPING) || - import_peer[m->get_dirfrag()] != oldauth) { - dout(10) << "handle_export_prep import has aborted, dropping" << dendl; - delete m; - return; - } - - CInode *diri = cache->get_inode(m->get_dirfrag().ino); - assert(diri); - - list finished; - - // assimilate root dir. 
- CDir *dir; - - if (!m->did_assim()) { - dir = cache->add_replica_dir(diri, - m->get_dirfrag().frag, *m->get_dirfrag_discover(m->get_dirfrag()), - oldauth, finished); - dout(7) << "handle_export_prep on " << *dir << " (first pass)" << dendl; - } else { - dir = cache->get_dirfrag(m->get_dirfrag()); - assert(dir); - dout(7) << "handle_export_prep on " << *dir << " (subsequent pass)" << dendl; - } - assert(dir->is_auth() == false); - - cache->show_subtrees(); - - // build import bound map - map import_bound_fragset; - for (list::iterator p = m->get_bounds().begin(); - p != m->get_bounds().end(); - ++p) { - dout(10) << " bound " << *p << dendl; - import_bound_fragset[p->ino].insert(p->frag); - } - - // assimilate contents? - if (!m->did_assim()) { - dout(7) << "doing assim on " << *dir << dendl; - m->mark_assim(); // only do this the first time! - - // move pin to dir - diri->put(CInode::PIN_IMPORTING); - dir->get(CDir::PIN_IMPORTING); - dir->state_set(CDir::STATE_IMPORTING); - - // change import state - import_state[dir->dirfrag()] = IMPORT_PREPPING; - import_bound_ls[dir] = m->get_bounds(); - - // bystander list - import_bystanders[dir] = m->get_bystanders(); - dout(7) << "bystanders are " << import_bystanders[dir] << dendl; - - // assimilate traces to exports - for (list::iterator it = m->get_inodes().begin(); - it != m->get_inodes().end(); - it++) { - // inode - CInode *in = cache->get_inode( (*it)->get_ino() ); - if (in) { - (*it)->update_inode(in); - dout(7) << " updated " << *in << dendl; - } else { - in = new CInode(mds->mdcache, false); - (*it)->update_inode(in); - - // link to the containing dir - CDir *condir = cache->get_dirfrag( m->get_containing_dirfrag(in->ino()) ); - assert(condir); - cache->add_inode( in ); - condir->add_primary_dentry( m->get_dentry(in->ino()), in ); - - dout(7) << " added " << *in << dendl; - } - - assert( in->get_parent_dir()->dirfrag() == m->get_containing_dirfrag(in->ino()) ); - - // dirs - for (list::iterator pf = 
m->get_inode_dirfrags(in->ino()).begin(); - pf != m->get_inode_dirfrags(in->ino()).end(); - ++pf) { - // add/update - cache->add_replica_dir(in, *pf, *m->get_dirfrag_discover(dirfrag_t(in->ino(), *pf)), - oldauth, finished); - } - } - - // make bound sticky - for (map::iterator p = import_bound_fragset.begin(); - p != import_bound_fragset.end(); - ++p) { - CInode *in = cache->get_inode(p->first); - assert(in); - in->get_stickydirs(); - dout(7) << " set stickydirs on bound inode " << *in << dendl; - } - - } else { - dout(7) << " not doing assim on " << *dir << dendl; - } - - if (!finished.empty()) - mds->queue_waiters(finished); - - - // open all bounds - set import_bounds; - for (map::iterator p = import_bound_fragset.begin(); - p != import_bound_fragset.end(); - ++p) { - CInode *in = cache->get_inode(p->first); - assert(in); - - // map fragset into a frag_t list, based on the inode fragtree - list fglist; - for (set::iterator q = p->second.begin(); q != p->second.end(); ++q) - in->dirfragtree.get_leaves_under(*q, fglist); - dout(10) << " bound inode " << p->first << " fragset " << p->second << " maps to " << fglist << dendl; - - for (list::iterator q = fglist.begin(); - q != fglist.end(); - ++q) { - CDir *bound = cache->get_dirfrag(dirfrag_t(p->first, *q)); - if (!bound) { - dout(7) << " opening bounding dirfrag " << *q << " on " << *in << dendl; - cache->open_remote_dirfrag(in, *q, - new C_MDS_RetryMessage(mds, m)); - return; - } - - if (!bound->state_test(CDir::STATE_IMPORTBOUND)) { - dout(7) << " pinning import bound " << *bound << dendl; - bound->get(CDir::PIN_IMPORTBOUND); - bound->state_set(CDir::STATE_IMPORTBOUND); - } else { - dout(7) << " already pinned import bound " << *bound << dendl; - } - import_bounds.insert(bound); - } - } - - dout(7) << " all ready, noting auth and freezing import region" << dendl; - - // note that i am an ambiguous auth for this subtree. - // specify bounds, since the exporter explicitly defines the region. 
- cache->adjust_bounded_subtree_auth(dir, import_bounds, - pair(oldauth, mds->get_nodeid())); - cache->verify_subtree_bounds(dir, import_bounds); - - // freeze. - dir->_freeze_tree(); - - // ok! - dout(7) << " sending export_prep_ack on " << *dir << dendl; - mds->send_message_mds(new MExportDirPrepAck(dir->dirfrag()), - m->get_source().num(), MDS_PORT_MIGRATOR); - - // note new state - import_state[dir->dirfrag()] = IMPORT_PREPPED; - - // done - delete m; - -} - - - - -class C_MDS_ImportDirLoggedStart : public Context { - Migrator *migrator; - CDir *dir; - int from; -public: - C_MDS_ImportDirLoggedStart(Migrator *m, CDir *d, int f) : - migrator(m), dir(d), from(f) { - } - void finish(int r) { - migrator->import_logged_start(dir, from); - } -}; - -void Migrator::handle_export_dir(MExportDir *m) -{ - CDir *dir = cache->get_dirfrag(m->get_dirfrag()); - assert(dir); - - int oldauth = m->get_source().num(); - dout(7) << "handle_export_dir importing " << *dir << " from " << oldauth << dendl; - assert(dir->is_auth() == false); - - cache->show_subtrees(); - - // start the journal entry - EImportStart *le = new EImportStart(dir->dirfrag(), m->get_bounds()); - le->metablob.add_dir_context(dir); - - // adjust auth (list us _first_) - cache->adjust_subtree_auth(dir, mds->get_nodeid(), oldauth); - - // add this crap to my cache - map imported_client_map; - bufferlist::iterator blp = m->get_dirstate().begin(); - ::_decode_simple(imported_client_map, blp); - - int num_imported_inodes = 0; - while (!blp.end()) { - num_imported_inodes += - decode_import_dir(blp, - oldauth, - dir, // import root - le, - imported_client_map, - mds->mdlog->get_current_segment(), - import_updated_scatterlocks[dir]); - } - dout(10) << " " << m->get_bounds().size() << " imported bounds" << dendl; - - // include bounds in EImportStart - set import_bounds; - cache->get_subtree_bounds(dir, import_bounds); - for (set::iterator it = import_bounds.begin(); - it != import_bounds.end(); - it++) - 
le->metablob.add_dir(*it, false); // note that parent metadata is already in the event - - // adjust popularity - mds->balancer->add_import(dir); - - dout(7) << "handle_export_dir did " << *dir << dendl; - - // log it - mds->mdlog->submit_entry(le, - new C_MDS_ImportDirLoggedStart(this, dir, m->get_source().num())); - - // note state - import_state[dir->dirfrag()] = IMPORT_LOGGINGSTART; - - // some stats - if (mds->logger) { - mds->logger->inc("im"); - mds->logger->inc("iim", num_imported_inodes); - } - - delete m; -} - - -/* - * this is an import helper - * called by import_finish, and import_reverse and friends. - */ -void Migrator::import_remove_pins(CDir *dir, set& bounds) -{ - // root - dir->put(CDir::PIN_IMPORTING); - dir->state_clear(CDir::STATE_IMPORTING); - - // bounds - set didinodes; - for (set::iterator it = bounds.begin(); - it != bounds.end(); - it++) { - CDir *bd = *it; - bd->put(CDir::PIN_IMPORTBOUND); - bd->state_clear(CDir::STATE_IMPORTBOUND); - CInode *bdi = bd->get_inode(); - if (didinodes.count(bdi) == 0) { - bdi->put_stickydirs(); - didinodes.insert(bdi); - } - } -} - - -/* - * note: this does teh full work of reversing and import and cleaning up - * state. - * called by both handle_mds_failure and by handle_resolve (if we are - * a survivor coping with an exporter failure+recovery). - */ -void Migrator::import_reverse(CDir *dir) -{ - dout(7) << "import_reverse " << *dir << dendl; - - set bounds; - cache->get_subtree_bounds(dir, bounds); - - // remove pins - import_remove_pins(dir, bounds); - - // update auth, with possible subtree merge. - assert(dir->is_subtree_root()); - cache->adjust_subtree_auth(dir, import_peer[dir->dirfrag()]); - cache->try_subtree_merge(dir); - - // adjust auth bits. 
- list q; - q.push_back(dir); - while (!q.empty()) { - CDir *cur = q.front(); - q.pop_front(); - - // dir - assert(cur->is_auth()); - cur->state_clear(CDir::STATE_AUTH); - cur->clear_replica_map(); - if (cur->is_dirty()) - cur->mark_clean(); - - CDir::map_t::iterator it; - for (it = cur->begin(); it != cur->end(); it++) { - CDentry *dn = it->second; - - // dentry - dn->state_clear(CDentry::STATE_AUTH); - dn->clear_replica_map(); - if (dn->is_dirty()) - dn->mark_clean(); - - // inode? - if (dn->is_primary()) { - CInode *in = dn->get_inode(); - in->state_clear(CDentry::STATE_AUTH); - in->clear_replica_map(); - if (in->is_dirty()) - in->mark_clean(); - in->authlock.clear_gather(); - in->linklock.clear_gather(); - in->dirfragtreelock.clear_gather(); - in->filelock.clear_gather(); - - // non-bounding dir? - list dfs; - in->get_dirfrags(dfs); - for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) - if (bounds.count(*p) == 0) - q.push_back(*p); - } - } - } - - // log our failure - mds->mdlog->submit_entry(new EImportFinish(dir, false)); // log failure - - // bystanders? - if (import_bystanders[dir].empty()) { - dout(7) << "no bystanders, finishing reverse now" << dendl; - import_reverse_unfreeze(dir); - } else { - // notify them; wait in aborting state - dout(7) << "notifying bystanders of abort" << dendl; - import_notify_abort(dir, bounds); - import_state[dir->dirfrag()] = IMPORT_ABORTING; - } -} - -void Migrator::import_notify_abort(CDir *dir, set& bounds) -{ - dout(7) << "import_notify_abort " << *dir << dendl; - - for (set::iterator p = import_bystanders[dir].begin(); - p != import_bystanders[dir].end(); - ++p) { - // NOTE: the bystander will think i am _only_ auth, because they will have seen - // the exporter's failure and updated the subtree auth. see mdcache->handle_mds_failure(). 
- MExportDirNotify *notify = - new MExportDirNotify(dir->dirfrag(), true, - pair(mds->get_nodeid(), CDIR_AUTH_UNKNOWN), - pair(import_peer[dir->dirfrag()], CDIR_AUTH_UNKNOWN)); - notify->copy_bounds(bounds); - mds->send_message_mds(notify, *p, MDS_PORT_MIGRATOR); - } -} - -void Migrator::import_reverse_unfreeze(CDir *dir) -{ - dout(7) << "import_reverse_unfreeze " << *dir << dendl; - dir->unfreeze_tree(); - cache->discard_delayed_expire(dir); - import_reverse_final(dir); -} - -void Migrator::import_reverse_final(CDir *dir) -{ - dout(7) << "import_reverse_final " << *dir << dendl; - - // clean up - import_state.erase(dir->dirfrag()); - import_peer.erase(dir->dirfrag()); - import_bystanders.erase(dir); - import_bound_ls.erase(dir); - import_updated_scatterlocks.erase(dir); - - // send pending import_maps? - mds->mdcache->maybe_send_pending_resolves(); - - cache->show_subtrees(); - //audit(); // this fails, bc we munge up the subtree map during handle_import_map (resolve phase) -} - - -void Migrator::import_logged_start(CDir *dir, int from) -{ - dout(7) << "import_logged " << *dir << dendl; - - // note state - import_state[dir->dirfrag()] = IMPORT_ACKING; - - // send notify's etc. 
- dout(7) << "sending ack for " << *dir << " to old auth mds" << from << dendl; - mds->send_message_mds(new MExportDirAck(dir->dirfrag()), - from, MDS_PORT_MIGRATOR); - - cache->show_subtrees(); -} - - -void Migrator::handle_export_finish(MExportDirFinish *m) -{ - CDir *dir = cache->get_dirfrag(m->get_dirfrag()); - assert(dir); - dout(7) << "handle_export_finish on " << *dir << dendl; - import_finish(dir); - delete m; -} - -void Migrator::import_finish(CDir *dir) -{ - dout(7) << "import_finish on " << *dir << dendl; - - // log finish - mds->mdlog->submit_entry(new EImportFinish(dir, true)); - - // clear updated scatterlocks - for (list::iterator p = import_updated_scatterlocks[dir].begin(); - p != import_updated_scatterlocks[dir].end(); - ++p) - (*p)->clear_updated(); - - // remove pins - set bounds; - cache->get_subtree_bounds(dir, bounds); - import_remove_pins(dir, bounds); - - // adjust auth, with possible subtree merge. - cache->adjust_subtree_auth(dir, mds->get_nodeid()); - cache->try_subtree_merge(dir); - - // clear import state (we're done!) - import_state.erase(dir->dirfrag()); - import_peer.erase(dir->dirfrag()); - import_bystanders.erase(dir); - import_bound_ls.erase(dir); - import_updated_scatterlocks.erase(dir); - - // process delayed expires - cache->process_delayed_expire(dir); - - // ok now unfreeze (and thus kick waiters) - dir->unfreeze_tree(); - - cache->show_subtrees(); - //audit(); // this fails, bc we munge up the subtree map during handle_import_map (resolve phase) - - // send pending import_maps? - mds->mdcache->maybe_send_pending_resolves(); - - // is it empty? - if (dir->get_size() == 0 && - !dir->inode->is_auth()) { - // reexport! 
- export_empty_import(dir); - } -} - - -void Migrator::decode_import_inode(CDentry *dn, bufferlist::iterator& blp, int oldauth, - map& imported_client_map, - LogSegment *ls, - list& updated_scatterlocks) -{ - dout(15) << "decode_import_inode on " << *dn << dendl; - - inodeno_t ino; - ::_decode_simple(ino, blp); - - bool added = false; - CInode *in = cache->get_inode(ino); - if (!in) { - in = new CInode(mds->mdcache); - added = true; - } else { - in->state_set(CInode::STATE_AUTH); - } - - // state after link -- or not! -sage - set merged_client_caps; - in->decode_import(blp, merged_client_caps, ls); - - // link before state -- or not! -sage - if (dn->inode != in) { - assert(!dn->inode); - dn->dir->link_primary_inode(dn, in); - } - - // add inode? - if (added) { - cache->add_inode(in); - dout(10) << "added " << *in << dendl; - } else { - dout(10) << " had " << *in << dendl; - } - - // clear if dirtyscattered, since we're going to journal this - // but not until we _actually_ finish the import... - if (in->dirlock.is_updated()) - updated_scatterlocks.push_back(&in->dirlock); - - // put in autoscatter list? - // this is conservative, but safe. - if (in->dirlock.get_state() == LOCK_SCATTER) - mds->locker->note_autoscattered(&in->dirlock); - - // adjust replica list - //assert(!in->is_replica(oldauth)); // not true on failed export - in->add_replica( oldauth, CInode::EXPORT_NONCE ); - if (in->is_replica(mds->get_nodeid())) - in->remove_replica(mds->get_nodeid()); - - // caps - for (set::iterator it = merged_client_caps.begin(); - it != merged_client_caps.end(); - it++) { - dout(0) << "merged caps for client" << *it << " on " << *in << dendl; - MClientFileCaps *caps = new MClientFileCaps(MClientFileCaps::OP_REAP, - in->inode, - in->client_caps[*it].get_last_seq(), - in->client_caps[*it].pending(), - in->client_caps[*it].wanted()); - caps->set_mds( oldauth ); // reap from whom? 
- mds->send_message_client_maybe_open(caps, imported_client_map[*it]); - } -} - - -int Migrator::decode_import_dir(bufferlist::iterator& blp, - int oldauth, - CDir *import_root, - EImportStart *le, - map& imported_client_map, - LogSegment *ls, - list& updated_scatterlocks) -{ - // set up dir - dirfrag_t df; - ::_decode_simple(df, blp); - - CInode *diri = cache->get_inode(df.ino); - assert(diri); - CDir *dir = diri->get_or_open_dirfrag(mds->mdcache, df.frag); - assert(dir); - - dout(7) << "decode_import_dir " << *dir << dendl; - - // assimilate state - dir->decode_import(blp); - - // mark (may already be marked from get_or_open_dir() above) - if (!dir->is_auth()) - dir->state_set(CDir::STATE_AUTH); - - // adjust replica list - //assert(!dir->is_replica(oldauth)); // not true on failed export - dir->add_replica(oldauth); - if (dir->is_replica(mds->get_nodeid())) - dir->remove_replica(mds->get_nodeid()); - - // add to journal entry - if (le) - le->metablob.add_dir(dir, - true, // Hmm: dirty=false would be okay in some cases - dir->is_complete()); - - int num_imported = 0; - - // take all waiters on this dir - // NOTE: a pass of imported data is guaranteed to get all of my waiters because - // a replica's presense in my cache implies/forces it's presense in authority's. 
- list waiters; - - dir->take_waiting(CDir::WAIT_ANY, waiters); - for (list::iterator it = waiters.begin(); - it != waiters.end(); - it++) - import_root->add_waiter(CDir::WAIT_UNFREEZE, *it); // UNFREEZE will get kicked both on success or failure - - dout(15) << "doing contents" << dendl; - - // contents - long nden; - ::_decode_simple(nden, blp); - - for (; nden>0; nden--) { - num_imported++; - - // dentry - string dname; - ::_decode_simple(dname, blp); - - CDentry *dn = dir->lookup(dname); - if (!dn) - dn = dir->add_null_dentry(dname); - - dn->decode_import(blp, ls); - - dn->add_replica(oldauth, CDentry::EXPORT_NONCE); - if (dn->is_replica(mds->get_nodeid())) - dn->remove_replica(mds->get_nodeid()); - - dout(15) << "decode_import_dir got " << *dn << dendl; - - // points to... - char icode; - ::_decode_simple(icode, blp); - - if (icode == 'N') { - // null dentry - assert(dn->is_null()); - - // fall thru - } - else if (icode == 'L') { - // remote link - inodeno_t ino; - unsigned char d_type; - ::_decode_simple(ino, blp); - ::_decode_simple(d_type, blp); - if (dn->is_remote()) { - assert(dn->get_remote_ino() == ino); - } else { - dir->link_remote_inode(dn, ino, d_type); - } - } - else if (icode == 'I') { - // inode - decode_import_inode(dn, blp, oldauth, imported_client_map, ls, updated_scatterlocks); - } - - // add dentry to journal entry - if (le) - le->metablob.add_dentry(dn, dn->is_dirty()); - } - - dout(7) << "decode_import_dir done " << *dir << dendl; - return num_imported; -} - - - - - -// authority bystander - -void Migrator::handle_export_notify(MExportDirNotify *m) -{ - CDir *dir = cache->get_dirfrag(m->get_dirfrag()); - - int from = m->get_source().num(); - pair old_auth = m->get_old_auth(); - pair new_auth = m->get_new_auth(); - - if (!dir) { - dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth - << " on missing dir " << m->get_dirfrag() << dendl; - } else if (dir->authority() != old_auth) { - dout(7) << "handle_export_notify old_auth 
was " << dir->authority() - << " != " << old_auth << " -> " << new_auth - << " on " << *dir << dendl; - } else { - dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth - << " on " << *dir << dendl; - // adjust auth - set have; - cache->map_dirfrag_set(m->get_bounds(), have); - cache->adjust_bounded_subtree_auth(dir, have, new_auth); - - // induce a merge? - cache->try_subtree_merge(dir); - } - - // send ack - if (m->wants_ack()) { - mds->send_message_mds(new MExportDirNotifyAck(m->get_dirfrag()), - from, MDS_PORT_MIGRATOR); - } else { - // aborted. no ack. - dout(7) << "handle_export_notify no ack requested" << dendl; - } - - delete m; -} - - - - - - - - - - - - - diff --git a/branches/sage/crush/mds/Migrator.h b/branches/sage/crush/mds/Migrator.h deleted file mode 100644 index 07a8731868a92..0000000000000 --- a/branches/sage/crush/mds/Migrator.h +++ /dev/null @@ -1,260 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_MIGRATOR_H -#define __MDS_MIGRATOR_H - -#include "include/types.h" - -#include -#include -#include -using std::map; -using std::list; -using std::set; - - -class MDS; -class CDir; -class CInode; -class CDentry; - -class MExportDirDiscover; -class MExportDirDiscoverAck; -class MExportDirCancel; -class MExportDirPrep; -class MExportDirPrepAck; -class MExportDir; -class MExportDirAck; -class MExportDirNotify; -class MExportDirNotifyAck; -class MExportDirFinish; - -class EImportStart; - - -class Migrator { -private: - MDS *mds; - MDCache *cache; - - // -- exports -- -public: - // export stages. used to clean up intelligently if there's a failure. 
- const static int EXPORT_DISCOVERING = 1; // dest is disovering export dir - const static int EXPORT_FREEZING = 2; // we're freezing the dir tree - const static int EXPORT_PREPPING = 4; // sending dest spanning tree to export bounds - const static int EXPORT_WARNING = 5; // warning bystanders of dir_auth_pending - const static int EXPORT_EXPORTING = 6; // sent actual export, waiting for ack - const static int EXPORT_LOGGINGFINISH = 7; // logging EExportFinish - const static int EXPORT_NOTIFYING = 8; // waiting for notifyacks - const static int EXPORT_ABORTING = 9; // notifying bystanders of abort - static const char *get_export_statename(int s) { - switch (s) { - case EXPORT_DISCOVERING: return "discovering"; - case EXPORT_FREEZING: return "freezing"; - case EXPORT_PREPPING: return "prepping"; - case EXPORT_WARNING: return "warning"; - case EXPORT_EXPORTING: return "exporting"; - case EXPORT_LOGGINGFINISH: return "loggingfinish"; - case EXPORT_NOTIFYING: return "notifying"; - case EXPORT_ABORTING: return "aborting"; - default: assert(0); return 0; - } - } - -protected: - // export fun - map export_state; - map export_peer; - //map > export_data; // only during EXPORTING state - map > export_warning_ack_waiting; - map > export_notify_ack_waiting; - - map > export_finish_waiters; - - list< pair > export_queue; - - // -- imports -- -public: - const static int IMPORT_DISCOVERING = 1; // waiting for prep - const static int IMPORT_DISCOVERED = 2; // waiting for prep - const static int IMPORT_PREPPING = 3; // opening dirs on bounds - const static int IMPORT_PREPPED = 4; // opened bounds, waiting for import - const static int IMPORT_LOGGINGSTART = 5; // got import, logging EImportStart - const static int IMPORT_ACKING = 6; // logged EImportStart, sent ack, waiting for finish - const static int IMPORT_ABORTING = 8; // notifying bystanders of an abort before unfreezing - static const char *get_import_statename(int s) { - switch (s) { - case IMPORT_DISCOVERING: return 
"discovering"; - case IMPORT_DISCOVERED: return "discovered"; - case IMPORT_PREPPING: return "prepping"; - case IMPORT_PREPPED: return "prepped"; - case IMPORT_LOGGINGSTART: return "loggingstart"; - case IMPORT_ACKING: return "acking"; - case IMPORT_ABORTING: return "aborting"; - default: assert(0); return 0; - } - } - -protected: - map import_state; // FIXME make these dirfrags - map import_peer; - map > import_bystanders; - map > import_bound_ls; - map > import_updated_scatterlocks; - - /* - // -- hashing madness -- - multimap unhash_waiting; // nodes i am waiting for UnhashDirAck's from - multimap import_hashed_replicate_waiting; // nodes i am waiting to discover to complete my import of a hashed dir - // maps frozen_dir_ino's to waiting-for-discover ino's. - multimap import_hashed_frozen_waiting; // dirs i froze (for the above) - */ - - -public: - // -- cons -- - Migrator(MDS *m, MDCache *c) : mds(m), cache(c) {} - - void dispatch(Message*); - - void show_importing(); - void show_exporting(); - - // -- status -- - int is_exporting(CDir *dir) { - if (export_state.count(dir)) return export_state[dir]; - return 0; - } - bool is_exporting() { return !export_state.empty(); } - int is_importing(dirfrag_t df) { - if (import_state.count(df)) return import_state[df]; - return 0; - } - bool is_importing() { return !import_state.empty(); } - - int get_import_state(dirfrag_t df) { - assert(import_state.count(df)); - return import_state[df]; - } - int get_import_peer(dirfrag_t df) { - assert(import_peer.count(df)); - return import_peer[df]; - } - - int get_export_state(CDir *dir) { - assert(export_state.count(dir)); - return export_state[dir]; - } - // this returns true if we are export @dir, - // and are not waiting for @who to be - // be warned of ambiguous auth. - // only returns meaningful results during EXPORT_WARNING state. 
- bool export_has_warned(CDir *dir, int who) { - assert(is_exporting(dir)); - assert(export_state[dir] == EXPORT_WARNING); - return (export_warning_ack_waiting[dir].count(who) == 0); - } - - - // -- misc -- - void handle_mds_failure_or_stop(int who); - - void audit(); - - // -- import/export -- - // exporter - public: - void export_dir(CDir *dir, int dest); - void export_empty_import(CDir *dir); - - void export_dir_nicely(CDir *dir, int dest); - void maybe_do_queued_export(); - void clear_export_queue() { - export_queue.clear(); - } - - void encode_export_inode(CInode *in, bufferlist& enc_state, - map& exported_client_map); - void finish_export_inode(CInode *in, utime_t now, list& finished); - int encode_export_dir(bufferlist& exportbl, - CDir *dir, - map& exported_client_map, - utime_t now); - void finish_export_dir(CDir *dir, list& finished, utime_t now); - - void add_export_finish_waiter(CDir *dir, Context *c) { - export_finish_waiters[dir].push_back(c); - } - void clear_export_proxy_pins(CDir *dir); - - protected: - void handle_export_discover_ack(MExportDirDiscoverAck *m); - void export_frozen(CDir *dir); - void handle_export_prep_ack(MExportDirPrepAck *m); - void export_go(CDir *dir); - void export_reverse(CDir *dir); - void handle_export_ack(MExportDirAck *m); - void export_logged_finish(CDir *dir); - void handle_export_notify_ack(MExportDirNotifyAck *m); - void export_finish(CDir *dir); - - friend class C_MDC_ExportFreeze; - friend class C_MDS_ExportFinishLogged; - - - // importer - void handle_export_discover(MExportDirDiscover *m); - void handle_export_cancel(MExportDirCancel *m); - void handle_export_prep(MExportDirPrep *m); - void handle_export_dir(MExportDir *m); - -public: - void decode_import_inode(CDentry *dn, bufferlist::iterator& blp, int oldauth, - map& imported_client_map, - LogSegment *ls, - list& updated_scatterlocks); - int decode_import_dir(bufferlist::iterator& blp, - int oldauth, - CDir *import_root, - EImportStart *le, - map& 
imported_client_map, - LogSegment *ls, - list& updated_scatterlocks); - -public: - void import_reverse(CDir *dir); -protected: - void import_remove_pins(CDir *dir, set& bounds); - void import_reverse_unfreeze(CDir *dir); - void import_reverse_final(CDir *dir); - void import_notify_abort(CDir *dir, set& bounds); - void import_logged_start(CDir *dir, int from); - void handle_export_finish(MExportDirFinish *m); -public: - void import_finish(CDir *dir); -protected: - - friend class C_MDS_ImportDirLoggedStart; - friend class C_MDS_ImportDirLoggedFinish; - - // bystander - void handle_export_notify(MExportDirNotify *m); - - -}; - - -#endif diff --git a/branches/sage/crush/mds/Server.cc b/branches/sage/crush/mds/Server.cc deleted file mode 100644 index 3be92948cf0b3..0000000000000 --- a/branches/sage/crush/mds/Server.cc +++ /dev/null @@ -1,3976 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "MDS.h" -#include "Server.h" -#include "Locker.h" -#include "MDCache.h" -#include "MDLog.h" -#include "Migrator.h" -#include "MDBalancer.h" -#include "AnchorClient.h" -#include "IdAllocator.h" - -#include "msg/Messenger.h" - -#include "messages/MClientSession.h" -#include "messages/MClientRequest.h" -#include "messages/MClientReply.h" -#include "messages/MClientReconnect.h" -#include "messages/MClientFileCaps.h" - -#include "messages/MMDSSlaveRequest.h" - -#include "messages/MLock.h" - -#include "messages/MDentryUnlink.h" - -#include "events/EString.h" -#include "events/EUpdate.h" -#include "events/ESlaveUpdate.h" -#include "events/ESession.h" -#include "events/EOpen.h" - -#include "include/filepath.h" -#include "common/Timer.h" -#include "common/Logger.h" -#include "common/LogType.h" - -#include -#include - -#include -#include -using namespace std; - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".server " -#define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) *_derr << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".server " - - -void Server::reopen_logger(utime_t start, bool append) -{ - static LogType mdserver_logtype; - static bool didit = false; - if (!didit) { - didit = true; - mdserver_logtype.add_inc("hcreq"); // handle client req - mdserver_logtype.add_inc("hsreq"); // slave - mdserver_logtype.add_inc("hcsess"); // client session - mdserver_logtype.add_inc("dcreq"); // dispatch client req - mdserver_logtype.add_inc("dsreq"); // slave - } - - if (logger) - delete logger; - - // logger - char name[80]; - sprintf(name, "mds%d.server", mds->get_nodeid()); - logger = new Logger(name, &mdserver_logtype, append); - logger->set_start(start); -} - - -void Server::dispatch(Message *m) -{ - switch (m->get_type()) { - case MSG_CLIENT_RECONNECT: - handle_client_reconnect((MClientReconnect*)m); - return; - } - - // 
active? - if (!mds->is_active()) { - dout(3) << "not active yet, waiting" << dendl; - mds->wait_for_active(new C_MDS_RetryMessage(mds, m)); - return; - } - - switch (m->get_type()) { - case MSG_CLIENT_SESSION: - handle_client_session((MClientSession*)m); - return; - case MSG_CLIENT_REQUEST: - handle_client_request((MClientRequest*)m); - return; - case MSG_MDS_SLAVE_REQUEST: - handle_slave_request((MMDSSlaveRequest*)m); - return; - } - - dout(1) << "server unknown message " << m->get_type() << dendl; - assert(0); -} - - - -// ---------------------------------------------------------- -// SESSION management - - -class C_MDS_session_finish : public Context { - MDS *mds; - entity_inst_t client_inst; - bool open; - version_t cmapv; -public: - C_MDS_session_finish(MDS *m, entity_inst_t ci, bool s, version_t mv) : - mds(m), client_inst(ci), open(s), cmapv(mv) { } - void finish(int r) { - assert(r == 0); - mds->server->_session_logged(client_inst, open, cmapv); - } -}; - - -void Server::handle_client_session(MClientSession *m) -{ - dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl; - int from = m->get_source().num(); - bool open = m->op == MClientSession::OP_REQUEST_OPEN; - - if (open) { - if (mds->clientmap.have_session(from)) { - dout(10) << "already open, dropping this req" << dendl; - delete m; - return; - } - if (mds->clientmap.is_opening(from)) { - dout(10) << "already opening, dropping this req" << dendl; - delete m; - return; - } - mds->clientmap.add_opening(from); - } else { - if (mds->clientmap.is_closing(from)) { - dout(10) << "already closing, dropping this req" << dendl; - delete m; - return; - } - if (m->seq < mds->clientmap.get_push_seq(from)) { - dout(10) << "old push seq " << m->seq << " < " << mds->clientmap.get_push_seq(from) - << ", dropping" << dendl; - delete m; - return; - } - assert(m->seq == mds->clientmap.get_push_seq(from)); - - mds->clientmap.add_closing(from); - } - - // journal it - version_t cmapv = 
mds->clientmap.inc_projected(); - mdlog->submit_entry(new ESession(m->get_source_inst(), open, cmapv), - new C_MDS_session_finish(mds, m->get_source_inst(), open, cmapv)); - delete m; - - if (logger) logger->inc("hcsess"); -} - -void Server::_session_logged(entity_inst_t client_inst, bool open, version_t cmapv) -{ - dout(10) << "_session_logged " << client_inst << " " << (open ? "open":"close") - << " " << cmapv - << dendl; - - // apply - int from = client_inst.name.num(); - if (open) { - assert(mds->clientmap.is_opening(from)); - mds->clientmap.open_session(client_inst); - } else { - assert(mds->clientmap.is_closing(from)); - mds->clientmap.close_session(from); - - // purge completed requests from clientmap - mds->clientmap.trim_completed_requests(from, 0); - } - - assert(cmapv == mds->clientmap.get_version()); - - // reply - if (open) - mds->messenger->send_message(new MClientSession(MClientSession::OP_OPEN), client_inst); - else - mds->messenger->send_message(new MClientSession(MClientSession::OP_CLOSE), client_inst); -} - - -void Server::terminate_sessions() -{ - dout(2) << "terminate_sessions" << dendl; - - // kill them off. clients will retry etc. - for (set::const_iterator p = mds->clientmap.get_session_set().begin(); - p != mds->clientmap.get_session_set().end(); - ++p) { - if (mds->clientmap.is_closing(*p)) - continue; - mds->clientmap.add_closing(*p); - version_t cmapv = mds->clientmap.inc_projected(); - mdlog->submit_entry(new ESession(mds->clientmap.get_inst(*p), false, cmapv), - new C_MDS_session_finish(mds, mds->clientmap.get_inst(*p), false, cmapv)); - } -} - - -void Server::reconnect_clients() -{ - // reconnect with clients - if (mds->clientmap.get_session_set().empty()) { - dout(7) << "reconnect_clients -- no sessions, doing nothing." 
<< dendl; - reconnect_gather_finish(); - return; - } - - dout(7) << "reconnect_clients -- sending mdsmap to clients with sessions" << dendl; - - mds->bcast_mds_map(); // send mdsmap to all client sessions - - // init gather list - reconnect_start = g_clock.now(); - client_reconnect_gather = mds->clientmap.get_session_set(); - dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl; -} - -void Server::handle_client_reconnect(MClientReconnect *m) -{ - dout(7) << "handle_client_reconnect " << m->get_source() << dendl; - int from = m->get_source().num(); - - if (m->closed) { - dout(7) << " client had no session, removing from clientmap" << dendl; - - mds->clientmap.add_closing(from); - version_t cmapv = mds->clientmap.inc_projected(); - mdlog->submit_entry(new ESession(mds->clientmap.get_inst(from), false, cmapv), - new C_MDS_session_finish(mds, mds->clientmap.get_inst(from), false, cmapv)); - - } else { - - // caps - for (map::iterator p = m->inode_caps.begin(); - p != m->inode_caps.end(); - ++p) { - CInode *in = mdcache->get_inode(p->first); - if (in && in->is_auth()) { - // we recovered it, and it's ours. take note. - dout(15) << "open caps on " << *in << dendl; - in->reconnect_cap(from, p->second); - reconnected_caps.insert(in); - continue; - } - - filepath path = m->inode_path[p->first]; - if ((in && !in->is_auth()) || - !mds->mdcache->path_is_mine(path)) { - // not mine. - dout(0) << "non-auth " << p->first << " " << m->inode_path[p->first] - << ", will pass off to authority" << dendl; - - // mark client caps stale. - inode_t fake_inode; - fake_inode.ino = p->first; - MClientFileCaps *stale = new MClientFileCaps(MClientFileCaps::OP_STALE, - fake_inode, - 0, - 0, // doesn't matter. - p->second.wanted); // doesn't matter. - mds->send_message_client(stale, m->get_source_inst()); - - // add to cap export list. - mdcache->rejoin_export_caps(p->first, m->inode_path[p->first], from, p->second); - } else { - // mine. fetch later. 
- dout(0) << "missing " << p->first << " " << m->inode_path[p->first] - << " (mine), will load later" << dendl; - mdcache->rejoin_recovered_caps(p->first, m->inode_path[p->first], from, p->second, - -1); // "from" me. - } - } - } - - // remove from gather set - client_reconnect_gather.erase(from); - if (client_reconnect_gather.empty()) reconnect_gather_finish(); - - delete m; -} - -/* - * called by mdcache, late in rejoin (right before acks are sent) - */ -void Server::process_reconnected_caps() -{ - dout(10) << "process_reconnected_caps" << dendl; - - // adjust filelock state appropriately - for (set::iterator p = reconnected_caps.begin(); - p != reconnected_caps.end(); - ++p) { - CInode *in = *p; - int issued = in->get_caps_issued(); - if (in->is_auth()) { - // wr? - if (issued & (CAP_FILE_WR|CAP_FILE_WRBUFFER)) { - if (issued & (CAP_FILE_RDCACHE|CAP_FILE_WRBUFFER)) { - in->filelock.set_state(LOCK_LONER); - } else { - in->filelock.set_state(LOCK_MIXED); - } - } - } else { - // note that client should perform stale/reap cleanup during reconnect. - assert(issued & (CAP_FILE_WR|CAP_FILE_WRBUFFER) == 0); // ???? 
- if (in->filelock.is_xlocked()) - in->filelock.set_state(LOCK_LOCK); - else - in->filelock.set_state(LOCK_SYNC); // might have been lock, previously - } - dout(15) << " issued " << cap_string(issued) - << " chose " << in->filelock - << " on " << *in << dendl; - } - reconnected_caps.clear(); // clean up -} - - -void Server::client_reconnect_failure(int from) -{ - dout(5) << "client_reconnect_failure on client" << from << dendl; - if (mds->is_reconnect() && - client_reconnect_gather.count(from)) { - client_reconnect_gather.erase(from); - if (client_reconnect_gather.empty()) - reconnect_gather_finish(); - } -} - -void Server::reconnect_gather_finish() -{ - dout(7) << "reconnect_gather_finish" << dendl; - mds->reconnect_done(); -} - - - -/******* - * some generic stuff for finishing off requests - */ - - -/* - * send generic response (just and error code) - */ -void Server::reply_request(MDRequest *mdr, int r, CInode *tracei) -{ - reply_request(mdr, new MClientReply(mdr->client_request, r), tracei); -} - - -/* - * send given reply - * include a trace to tracei - */ -void Server::reply_request(MDRequest *mdr, MClientReply *reply, CInode *tracei) -{ - MClientRequest *req = mdr->client_request; - - dout(10) << "reply_request " << reply->get_result() - << " (" << strerror(-reply->get_result()) - << ") " << *req << dendl; - - // note result code in clientmap? 
- if (!req->is_idempotent()) - mds->clientmap.add_completed_request(mdr->reqid); - - /* - if (tracei && !tracei->hack_accessed) { - tracei->hack_accessed = true; - mds->logger->inc("newt"); - if (tracei->parent && - tracei->parent->dir->hack_num_accessed >= 0) { - tracei->parent->dir->hack_num_accessed++; - if (tracei->parent->dir->hack_num_accessed == 1) - mds->logger->inc("dirt1"); - if (tracei->parent->dir->hack_num_accessed == 2) - mds->logger->inc("dirt2"); - if (tracei->parent->dir->hack_num_accessed == 3) - mds->logger->inc("dirt3"); - if (tracei->parent->dir->hack_num_accessed == 4) - mds->logger->inc("dirt4"); - if (tracei->parent->dir->hack_num_accessed == 5) - mds->logger->inc("dirt5"); - } - } - */ - - // include trace - if (tracei) { - reply->set_trace_dist( tracei, mds->get_nodeid() ); - } - - // send reply - messenger->send_message(reply, req->get_client_inst()); - - // finish request - mdcache->request_finish(mdr); -} - - - - - -/*** - * process a client request - */ -void Server::handle_client_request(MClientRequest *req) -{ - dout(4) << "handle_client_request " << *req << dendl; - int client = req->get_client(); - - if (logger) logger->inc("hcreq"); - - if (!mds->is_active()) { - dout(5) << " not active, discarding client request." << dendl; - delete req; - return; - } - - if (!mdcache->get_root()) { - dout(5) << "need to open root" << dendl; - mdcache->open_root(new C_MDS_RetryMessage(mds, req)); - return; - } - - // active session? - if (!mds->clientmap.have_session(client)) { - dout(5) << "no session for client" << client << ", dropping" << dendl; - delete req; - return; - } - - - // okay, i want - CInode *ref = 0; - - // retry? 
- if (req->get_retry_attempt()) { - if (mds->clientmap.have_completed_request(req->get_reqid())) { - dout(5) << "already completed " << req->get_reqid() << dendl; - mds->messenger->send_message(new MClientReply(req, 0), - req->get_client_inst()); - delete req; - return; - } - } - // trim completed_request list - if (req->get_oldest_client_tid() > 0) { - dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl; - mds->clientmap.trim_completed_requests(client, - req->get_oldest_client_tid()); - } - - - // ----- - // some ops are on ino's - switch (req->get_op()) { - case MDS_OP_FSTAT: - ref = mdcache->get_inode(req->args.fstat.ino); - assert(ref); - break; - - case MDS_OP_TRUNCATE: - if (!req->args.truncate.ino) - break; // can be called w/ either fh OR path - ref = mdcache->get_inode(req->args.truncate.ino); - assert(ref); - break; - - case MDS_OP_FSYNC: - ref = mdcache->get_inode(req->args.fsync.ino); // fixme someday no ino needed? - assert(ref); - break; - } - - // register + dispatch - MDRequest *mdr = mdcache->request_start(req); - if (!mdr) return; - - if (ref) { - dout(10) << "inode op on ref " << *ref << dendl; - mdr->ref = ref; - mdr->pin(ref); - } - - dispatch_client_request(mdr); - return; -} - - -void Server::dispatch_client_request(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - if (logger) logger->inc("dcreq"); - - if (mdr->ref) { - dout(7) << "dispatch_client_request " << *req << " ref " << *mdr->ref << dendl; - } else { - dout(7) << "dispatch_client_request " << *req << dendl; - } - - // we shouldn't be waiting on anyone. - assert(mdr->more()->waiting_on_slave.empty()); - - switch (req->get_op()) { - - // inodes ops. 
- case MDS_OP_STAT: - case MDS_OP_LSTAT: - handle_client_stat(mdr); - break; - case MDS_OP_UTIME: - handle_client_utime(mdr); - break; - case MDS_OP_CHMOD: - handle_client_chmod(mdr); - break; - case MDS_OP_CHOWN: - handle_client_chown(mdr); - break; - case MDS_OP_TRUNCATE: - handle_client_truncate(mdr); - break; - case MDS_OP_READDIR: - handle_client_readdir(mdr); - break; - case MDS_OP_FSYNC: - //handle_client_fsync(req, ref); - break; - - // funky. - case MDS_OP_OPEN: - if (req->args.open.flags & O_CREAT) - handle_client_openc(mdr); - else - handle_client_open(mdr); - break; - - // namespace. - // no prior locks. - case MDS_OP_MKNOD: - handle_client_mknod(mdr); - break; - case MDS_OP_LINK: - handle_client_link(mdr); - break; - case MDS_OP_UNLINK: - case MDS_OP_RMDIR: - handle_client_unlink(mdr); - break; - case MDS_OP_RENAME: - handle_client_rename(mdr); - break; - case MDS_OP_MKDIR: - handle_client_mkdir(mdr); - break; - case MDS_OP_SYMLINK: - handle_client_symlink(mdr); - break; - - - default: - dout(1) << " unknown client op " << req->get_op() << dendl; - assert(0); - } -} - - -// --------------------------------------- -// SLAVE REQUESTS - -void Server::handle_slave_request(MMDSSlaveRequest *m) -{ - dout(4) << "handle_slave_request " << m->get_reqid() << " from " << m->get_source() << dendl; - int from = m->get_source().num(); - - if (logger) logger->inc("hsreq"); - - // reply? 
- if (m->is_reply()) { - - switch (m->get_op()) { - case MMDSSlaveRequest::OP_XLOCKACK: - { - // identify lock, master request - SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(), - m->get_object_info()); - MDRequest *mdr = mdcache->request_get(m->get_reqid()); - mdr->more()->slaves.insert(from); - dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl; - mdr->xlocks.insert(lock); - mdr->locks.insert(lock); - lock->get_xlock(mdr); - lock->finish_waiters(SimpleLock::WAIT_REMOTEXLOCK); - } - break; - - case MMDSSlaveRequest::OP_AUTHPINACK: - { - MDRequest *mdr = mdcache->request_get(m->get_reqid()); - handle_slave_auth_pin_ack(mdr, m); - } - break; - - case MMDSSlaveRequest::OP_LINKPREPACK: - { - MDRequest *mdr = mdcache->request_get(m->get_reqid()); - handle_slave_link_prep_ack(mdr, m); - } - break; - - case MMDSSlaveRequest::OP_RENAMEPREPACK: - { - MDRequest *mdr = mdcache->request_get(m->get_reqid()); - handle_slave_rename_prep_ack(mdr, m); - } - break; - - default: - assert(0); - } - - // done with reply. - delete m; - return; - - } else { - // am i a new slave? - MDRequest *mdr; - if (mdcache->have_request(m->get_reqid())) { - // existing? - mdr = mdcache->request_get(m->get_reqid()); - if (mdr->slave_to_mds != from) { // may not even be a slave! (e.g. forward race) - dout(10) << "local request " << *mdr << " not slave to mds" << from - << ", ignoring " << *m << dendl; - delete m; - return; - } - } else { - // new? - if (m->get_op() == MMDSSlaveRequest::OP_FINISH) { - dout(10) << "missing slave request for " << m->get_reqid() - << " OP_FINISH, must have lost race with a forward" << dendl; - delete m; - return; - } - mdr = mdcache->request_start_slave(m->get_reqid(), m->get_source().num()); - } - assert(mdr->slave_request == 0); // only one at a time, please! 
- mdr->slave_request = m; - - dispatch_slave_request(mdr); - } -} - -void Server::dispatch_slave_request(MDRequest *mdr) -{ - dout(7) << "dispatch_slave_request " << *mdr << " " << *mdr->slave_request << dendl; - - if (mdr->aborted) { - dout(7) << " abort flag set, finishing" << dendl; - mdcache->request_finish(mdr); - return; - } - - if (logger) logger->inc("dsreq"); - - switch (mdr->slave_request->get_op()) { - case MMDSSlaveRequest::OP_XLOCK: - { - // identify object - SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(), - mdr->slave_request->get_object_info()); - - if (lock && lock->get_parent()->is_auth()) { - // xlock. - // use acquire_locks so that we get auth_pinning. - set rdlocks; - set wrlocks; - set xlocks = mdr->xlocks; - xlocks.insert(lock); - - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - // ack - MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_XLOCKACK); - r->set_lock_type(lock->get_type()); - lock->get_parent()->set_object_info(r->get_object_info()); - mds->send_message_mds(r, mdr->slave_request->get_source().num(), MDS_PORT_SERVER); - } else { - if (lock) { - dout(10) << "not auth for remote xlock attempt, dropping on " - << *lock << " on " << *lock->get_parent() << dendl; - } else { - dout(10) << "don't have object, dropping" << dendl; - assert(0); // can this happen, if we auth pinned properly. - } - } - - // done. - delete mdr->slave_request; - mdr->slave_request = 0; - } - break; - - case MMDSSlaveRequest::OP_UNXLOCK: - { - SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(), - mdr->slave_request->get_object_info()); - assert(lock); - mds->locker->xlock_finish(lock, mdr); - - // done. no ack necessary. 
- delete mdr->slave_request; - mdr->slave_request = 0; - } - break; - - case MMDSSlaveRequest::OP_AUTHPIN: - handle_slave_auth_pin(mdr); - break; - - case MMDSSlaveRequest::OP_LINKPREP: - case MMDSSlaveRequest::OP_UNLINKPREP: - handle_slave_link_prep(mdr); - break; - - case MMDSSlaveRequest::OP_RENAMEPREP: - handle_slave_rename_prep(mdr); - break; - - case MMDSSlaveRequest::OP_FINISH: - // finish off request. - mdcache->request_finish(mdr); - break; - - default: - assert(0); - } -} - - -void Server::handle_slave_auth_pin(MDRequest *mdr) -{ - dout(10) << "handle_slave_auth_pin " << *mdr << dendl; - - // build list of objects - list objects; - bool fail = false; - - for (list::iterator p = mdr->slave_request->get_authpins().begin(); - p != mdr->slave_request->get_authpins().end(); - ++p) { - MDSCacheObject *object = mdcache->get_object(*p); - if (!object) { - dout(10) << " don't have " << *p << dendl; - fail = true; - break; - } - - objects.push_back(object); - } - - // can we auth pin them? - if (!fail) { - for (list::iterator p = objects.begin(); - p != objects.end(); - ++p) { - if (!(*p)->is_auth()) { - dout(10) << " not auth for " << **p << dendl; - fail = true; - break; - } - if (!mdr->is_auth_pinned(*p) && - !(*p)->can_auth_pin()) { - // wait - dout(10) << " waiting for authpinnable on " << **p << dendl; - (*p)->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); - mdr->drop_local_auth_pins(); - return; - } - } - } - - // auth pin! - if (fail) { - mdr->drop_local_auth_pins(); // just in case - } else { - for (list::iterator p = objects.begin(); - p != objects.end(); - ++p) { - dout(10) << "auth_pinning " << **p << dendl; - mdr->auth_pin(*p); - } - } - - // ack! 
- MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_AUTHPINACK); - - // return list of my auth_pins (if any) - for (set::iterator p = mdr->auth_pins.begin(); - p != mdr->auth_pins.end(); - ++p) { - MDSCacheObjectInfo info; - (*p)->set_object_info(info); - reply->get_authpins().push_back(info); - } - - mds->send_message_mds(reply, mdr->slave_to_mds, MDS_PORT_SERVER); - - // clean up this request - delete mdr->slave_request; - mdr->slave_request = 0; - return; -} - -void Server::handle_slave_auth_pin_ack(MDRequest *mdr, MMDSSlaveRequest *ack) -{ - dout(10) << "handle_slave_auth_pin_ack on " << *mdr << " " << *ack << dendl; - int from = ack->get_source().num(); - - // added auth pins? - set pinned; - for (list::iterator p = ack->get_authpins().begin(); - p != ack->get_authpins().end(); - ++p) { - MDSCacheObject *object = mdcache->get_object(*p); - assert(object); // we pinned it - dout(10) << " remote has pinned " << *object << dendl; - if (!mdr->is_auth_pinned(object)) - mdr->remote_auth_pins.insert(object); - pinned.insert(object); - } - - // removed auth pins? - set::iterator p = mdr->remote_auth_pins.begin(); - while (p != mdr->remote_auth_pins.end()) { - if ((*p)->authority().first == from && - pinned.count(*p) == 0) { - dout(10) << " remote has unpinned " << **p << dendl; - set::iterator o = p; - ++p; - mdr->remote_auth_pins.erase(o); - } else { - ++p; - } - } - - // note slave - mdr->more()->slaves.insert(from); - - // clear from waiting list - assert(mdr->more()->waiting_on_slave.count(from)); - mdr->more()->waiting_on_slave.erase(from); - - // go again? - if (mdr->more()->waiting_on_slave.empty()) - dispatch_client_request(mdr); - else - dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl; -} - - -// --------------------------------------- -// HELPERS - - -/** validate_dentry_dir - * - * verify that the dir exists and would own the dname. - * do not check if the dentry exists. 
- */ -CDir *Server::validate_dentry_dir(MDRequest *mdr, CInode *diri, const string& dname) -{ - // make sure parent is a dir? - if (!diri->is_dir()) { - dout(7) << "validate_dentry_dir: not a dir" << dendl; - reply_request(mdr, -ENOTDIR); - return false; - } - - // which dirfrag? - frag_t fg = diri->pick_dirfrag(dname); - CDir *dir = try_open_auth_dirfrag(diri, fg, mdr); - if (!dir) - return 0; - - // frozen? - if (dir->is_frozen()) { - dout(7) << "dir is frozen " << *dir << dendl; - dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - - return dir; -} - - -/** prepare_null_dentry - * prepare a null (or existing) dentry in given dir. - * wait for any dn lock. - */ -CDentry* Server::prepare_null_dentry(MDRequest *mdr, CDir *dir, const string& dname, bool okexist) -{ - dout(10) << "prepare_null_dentry " << dname << " in " << *dir << dendl; - assert(dir->is_auth()); - - // does it already exist? - CDentry *dn = dir->lookup(dname); - if (dn) { - if (dn->lock.is_xlocked_by_other(mdr)) { - dout(10) << "waiting on xlocked dentry " << *dn << dendl; - dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - - if (!dn->is_null()) { - // name already exists - dout(10) << "dentry " << dname << " exists in " << *dir << dendl; - if (!okexist) { - reply_request(mdr, -EEXIST); - return 0; - } - } - - return dn; - } - - // make sure dir is complete - if (!dir->is_complete()) { - dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl; - dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - - // create - dn = dir->add_null_dentry(dname); - dn->mark_new(); - dout(10) << "prepare_null_dentry added " << *dn << dendl; - - return dn; -} - - -/** prepare_new_inode - * - * create a new inode. set c/m/atime. hit dir pop. 
- */ -CInode* Server::prepare_new_inode(MDRequest *mdr, CDir *dir) -{ - CInode *in = mdcache->create_inode(); - in->inode.uid = mdr->client_request->get_caller_uid(); - in->inode.gid = mdr->client_request->get_caller_gid(); - in->inode.ctime = in->inode.mtime = in->inode.atime = mdr->now; // now - dout(10) << "prepare_new_inode " << *in << dendl; - - return in; -} - - - -CDir *Server::traverse_to_auth_dir(MDRequest *mdr, vector &trace, filepath refpath) -{ - // figure parent dir vs dname - if (refpath.depth() == 0) { - dout(7) << "can't do that to root" << dendl; - reply_request(mdr, -EINVAL); - return 0; - } - string dname = refpath.last_dentry(); - refpath.pop_dentry(); - - dout(10) << "traverse_to_auth_dir dirpath " << refpath << " dname " << dname << dendl; - - // traverse to parent dir - int r = mdcache->path_traverse(mdr, mdr->client_request, - 0, refpath, trace, true, - MDS_TRAVERSE_FORWARD); - if (r > 0) return 0; // delayed - if (r < 0) { - reply_request(mdr, r); - return 0; - } - - // open inode - CInode *diri; - if (trace.empty()) - diri = mdcache->get_root(); - else - diri = mdcache->get_dentry_inode(trace[trace.size()-1], mdr); - if (!diri) - return 0; // opening inode. - - // is it an auth dir? - CDir *dir = validate_dentry_dir(mdr, diri, dname); - if (!dir) - return 0; // forwarded or waiting for freeze - - dout(10) << "traverse_to_auth_dir " << *dir << dendl; - return dir; -} - - - -CInode* Server::rdlock_path_pin_ref(MDRequest *mdr, bool want_auth) -{ - // already got ref? 
- if (mdr->ref) - return mdr->ref; - - MClientRequest *req = mdr->client_request; - - // traverse - filepath refpath = req->get_filepath(); - vector trace; - int r = mdcache->path_traverse(mdr, req, - 0, refpath, - trace, req->follow_trailing_symlink(), - MDS_TRAVERSE_FORWARD); - if (r > 0) return false; // delayed - if (r < 0) { // error - reply_request(mdr, r); - return 0; - } - - // open ref inode - CInode *ref = 0; - if (trace.empty()) - ref = mdcache->get_root(); - else { - CDentry *dn = trace[trace.size()-1]; - - // if no inode (null or unattached remote), fw to dentry auth? - if (want_auth && !dn->is_auth() && - (dn->is_null() || - (dn->is_remote() && dn->inode))) { - if (dn->is_ambiguous_auth()) { - dout(10) << "waiting for single auth on " << *dn << dendl; - dn->dir->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr)); - } else { - dout(10) << "fw to auth for " << *dn << dendl; - mdcache->request_forward(mdr, dn->authority().first); - return 0; - } - } - - // open ref inode - ref = mdcache->get_dentry_inode(dn, mdr); - if (!ref) return 0; - } - dout(10) << "ref is " << *ref << dendl; - - // fw to inode auth? - if (want_auth && !ref->is_auth()) { - if (ref->is_ambiguous_auth()) { - dout(10) << "waiting for single auth on " << *ref << dendl; - ref->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr)); - } else { - dout(10) << "fw to auth for " << *ref << dendl; - mdcache->request_forward(mdr, ref->authority().first); - } - return 0; - } - - // auth_pin? 
- if (want_auth) { - if (ref->is_frozen()) { - dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl; - ref->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - mdr->auth_pin(ref); - } - - // lock the path - set rdlocks, empty; - - for (int i=0; i<(int)trace.size(); i++) - rdlocks.insert(&trace[i]->lock); - - if (!mds->locker->acquire_locks(mdr, rdlocks, empty, empty)) - return 0; - - // set and pin ref - mdr->pin(ref); - mdr->ref = ref; - - // save the locked trace. - mdr->trace.swap(trace); - - return ref; -} - - -/** rdlock_path_xlock_dentry - * traverse path to the directory that could/would contain dentry. - * make sure i am auth for that dentry, forward as necessary. - * create null dentry in place (or use existing if okexist). - * get rdlocks on traversed dentries, xlock on new dentry. - */ -CDentry* Server::rdlock_path_xlock_dentry(MDRequest *mdr, bool okexist, bool mustexist) -{ - MClientRequest *req = mdr->client_request; - - vector trace; - CDir *dir = traverse_to_auth_dir(mdr, trace, req->get_filepath()); - if (!dir) return 0; - dout(10) << "rdlock_path_xlock_dentry dir " << *dir << dendl; - - // make sure we can auth_pin (or have already authpinned) dir - if (dir->is_frozen()) { - dout(7) << "waiting for !frozen/authpinnable on " << *dir << dendl; - dir->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - - // make a null dentry? - const string &dname = req->get_filepath().last_dentry(); - CDentry *dn; - if (mustexist) { - dn = dir->lookup(dname); - - // make sure dir is complete - if (!dn && !dir->is_complete()) { - dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl; - dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - - // readable? 
- if (dn && dn->lock.is_xlocked_by_other(mdr)) { - dout(10) << "waiting on xlocked dentry " << *dn << dendl; - dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - - // exists? - if (!dn || dn->is_null()) { - dout(7) << "dentry " << dname << " dne in " << *dir << dendl; - reply_request(mdr, -ENOENT); - return 0; - } - } else { - dn = prepare_null_dentry(mdr, dir, dname, okexist); - if (!dn) - return 0; - } - - // -- lock -- - set rdlocks, wrlocks, xlocks; - - for (int i=0; i<(int)trace.size(); i++) - rdlocks.insert(&trace[i]->lock); - if (dn->is_null()) { - xlocks.insert(&dn->lock); // new dn, xlock - wrlocks.insert(&dn->dir->inode->dirlock); // also, wrlock on dir mtime - } else - rdlocks.insert(&dn->lock); // existing dn, rdlock - - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return 0; - - // save the locked trace. - mdr->trace.swap(trace); - - return dn; -} - - - - - -/** - * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth - * - * @diri base indoe - * @fg the exact frag we want - * @mdr request - */ -CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequest *mdr) -{ - CDir *dir = diri->get_dirfrag(fg); - - // not open and inode not mine? - if (!dir && !diri->is_auth()) { - int inauth = diri->authority().first; - dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds" << inauth << dendl; - mdcache->request_forward(mdr, inauth); - return 0; - } - - // not open and inode frozen? - if (!dir && diri->is_frozen_dir()) { - dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl; - assert(diri->get_parent_dir()); - diri->get_parent_dir()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - - // invent? - if (!dir) - dir = diri->get_or_open_dirfrag(mds->mdcache, fg); - - // am i auth for the dirfrag? 
- if (!dir->is_auth()) { - int auth = dir->authority().first; - dout(7) << "try_open_auth_dirfrag: not auth for " << *dir - << ", fw to mds" << auth << dendl; - mdcache->request_forward(mdr, auth); - return 0; - } - - return dir; -} - - - -/** predirty_dn_diri - * predirty the directory inode for a new dentry, if it is auth (and not root) - * BUG: root inode doesn't get dirtied properly, currently. blech. - */ -version_t Server::predirty_dn_diri(MDRequest *mdr, CDentry *dn, EMetaBlob *blob) -{ - version_t dirpv = 0; - CInode *diri = dn->dir->inode; - - if (diri->is_root()) return 0; - - if (diri->is_auth()) { - assert(mdr->wrlocks.count(&diri->dirlock)); - - dirpv = diri->pre_dirty(); - dout(10) << "predirty_dn_diri ctime/mtime " << mdr->now << " pv " << dirpv << " on " << *diri << dendl; - - // predirty+journal - inode_t *pi = diri->project_inode(); - if (dirpv) pi->version = dirpv; - pi->ctime = pi->mtime = mdr->now; - blob->add_dir_context(diri->get_parent_dn()->get_dir()); - blob->add_primary_dentry(diri->get_parent_dn(), true, 0, pi); - } else { - // journal the mtime change anyway. - inode_t *ji = blob->add_primary_dentry(diri->get_parent_dn(), true); - ji->ctime = ji->mtime = mdr->now; - - dout(10) << "predirty_dn_diri (non-auth) ctime/mtime " << mdr->now << " on " << *diri << dendl; - - blob->add_dirtied_inode_mtime(diri->ino(), mdr->now); - assert(mdr->ls); - mdr->ls->dirty_inode_mtimes.push_back(&diri->xlist_dirty_inode_mtime); - } - - return dirpv; -} - -/** dirty_dn_diri - * follow-up with actual dirty of inode after journal entry commits. - */ -void Server::dirty_dn_diri(MDRequest *mdr, CDentry *dn, version_t dirpv) -{ - CInode *diri = dn->dir->inode; - - if (diri->is_root()) return; - - if (dirpv) { - // we journaled and predirtied. 
- assert(diri->is_auth() && !diri->is_root()); - diri->pop_and_dirty_projected_inode(mdr->ls); - dout(10) << "dirty_dn_diri ctime/mtime " << mdr->now << " v " << diri->inode.version << " on " << *diri << dendl; - } else { - // dirlock scatterlock will propagate the update. - diri->inode.ctime = diri->inode.mtime = mdr->now; - diri->dirlock.set_updated(); - dout(10) << "dirty_dn_diri (non-dirty) ctime/mtime " << mdr->now << " on " << *diri << dendl; - } -} - - - - - - -// =============================================================================== -// STAT - -void Server::handle_client_stat(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - CInode *ref = rdlock_path_pin_ref(mdr, false); - if (!ref) return; - - // which inode locks do I want? - /* note: this works because we include existing locks in our lists, - and because all new locks are on inodes and sort to the right of - the dentry rdlocks previous acquired by rdlock_path_pin_ref(). */ - set rdlocks = mdr->rdlocks; - set wrlocks = mdr->wrlocks; - set xlocks = mdr->xlocks; - - int mask = req->args.stat.mask; - if (mask & STAT_MASK_LINK) rdlocks.insert(&ref->linklock); - if (mask & STAT_MASK_AUTH) rdlocks.insert(&ref->authlock); - if (ref->is_file() && - mask & STAT_MASK_FILE) rdlocks.insert(&ref->filelock); - if (ref->is_dir() && - mask & STAT_MASK_MTIME) rdlocks.insert(&ref->dirlock); - - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - mds->balancer->hit_inode(g_clock.now(), ref, META_POP_IRD, - mdr->client_request->get_client_inst().name.num()); - - // reply - dout(10) << "reply to stat on " << *req << dendl; - MClientReply *reply = new MClientReply(req); - reply_request(mdr, reply, ref); -} - - - - -// =============================================================================== -// INODE UPDATES - - -/* - * finisher for basic inode updates - */ -class C_MDS_inode_update_finish : public Context { - MDS *mds; - MDRequest *mdr; - CInode *in; -public: - 
C_MDS_inode_update_finish(MDS *m, MDRequest *r, CInode *i) : - mds(m), mdr(r), in(i) { } - void finish(int r) { - assert(r == 0); - - // apply - in->pop_and_dirty_projected_inode(mdr->ls); - - mds->balancer->hit_inode(mdr->now, in, META_POP_IWR); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply->set_result(0); - mds->server->reply_request(mdr, reply, in); - } -}; - - -// utime - -void Server::handle_client_utime(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - CInode *cur = rdlock_path_pin_ref(mdr, true); - if (!cur) return; - - if (cur->is_root()) { - reply_request(mdr, -EINVAL); // for now - return; - } - - // xlock inode - set rdlocks = mdr->rdlocks; - set wrlocks = mdr->wrlocks; - set xlocks = mdr->xlocks; - xlocks.insert(&cur->filelock); - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - // project update - inode_t *pi = cur->project_inode(); - pi->mtime = req->args.utime.mtime; - pi->atime = req->args.utime.atime; - pi->version = cur->pre_dirty(); - pi->ctime = g_clock.real_now(); - - // log + wait - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "utime"); - le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_dir_context(cur->get_parent_dir()); - le->metablob.add_primary_dentry(cur->parent, true, 0, pi); - - mdlog->submit_entry(le, new C_MDS_inode_update_finish(mds, mdr, cur)); -} - - -// chmod - -void Server::handle_client_chmod(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - CInode *cur = rdlock_path_pin_ref(mdr, true); - if (!cur) return; - - if (cur->is_root()) { - reply_request(mdr, -EINVAL); // for now - return; - } - - // write - set rdlocks = mdr->rdlocks; - set wrlocks = mdr->wrlocks; - set xlocks = mdr->xlocks; - xlocks.insert(&cur->authlock); - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - // project update - inode_t *pi = cur->project_inode(); - pi->mode = - (pi->mode & 
~04777) | - (req->args.chmod.mode & 04777); - pi->version = cur->pre_dirty(); - pi->ctime = g_clock.real_now(); - - // log + wait - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "chmod"); - le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_dir_context(cur->get_parent_dir()); - le->metablob.add_primary_dentry(cur->parent, true, 0, pi); - - mdlog->submit_entry(le, new C_MDS_inode_update_finish(mds, mdr, cur)); -} - - -// chown - -void Server::handle_client_chown(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - CInode *cur = rdlock_path_pin_ref(mdr, true); - if (!cur) return; - - if (cur->is_root()) { - reply_request(mdr, -EINVAL); // for now - return; - } - - // write - set rdlocks = mdr->rdlocks; - set wrlocks = mdr->wrlocks; - set xlocks = mdr->xlocks; - xlocks.insert(&cur->authlock); - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - // project update - inode_t *pi = cur->project_inode(); - pi->uid = MAX(req->args.chown.uid, 0); - pi->gid = MAX(req->args.chown.gid, 0); - pi->version = cur->pre_dirty(); - pi->ctime = g_clock.real_now(); - - // log + wait - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "chown"); - le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_dir_context(cur->get_parent_dir()); - le->metablob.add_primary_dentry(cur->parent, true, 0, pi); - - mdlog->submit_entry(le); - mdlog->wait_for_sync(new C_MDS_inode_update_finish(mds, mdr, cur)); -} - - - - -// ================================================================= -// DIRECTORY and NAMESPACE OPS - -// READDIR - -void Server::handle_client_readdir(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - CInode *diri = rdlock_path_pin_ref(mdr, false); - if (!diri) return; - - // it's a directory, right? 
- if (!diri->is_dir()) { - // not a dir - dout(10) << "reply to " << *req << " readdir -ENOTDIR" << dendl; - reply_request(mdr, -ENOTDIR, diri); - return; - } - - // which frag? - frag_t fg = req->args.readdir.frag; - - // does the frag exist? - if (diri->dirfragtree[fg] != fg) { - dout(10) << "frag " << fg << " doesn't appear in fragtree " << diri->dirfragtree << dendl; - reply_request(mdr, -EAGAIN, diri); - return; - } - - CDir *dir = try_open_auth_dirfrag(diri, fg, mdr); - if (!dir) return; - - // ok! - assert(dir->is_auth()); - - // check perm - /* - if (!mds->locker->inode_hard_rdlock_start(diri, mdr)) - return; - mds->locker->inode_hard_rdlock_finish(diri, mdr); - */ - - if (!dir->is_complete()) { - // fetch - dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl; - dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - - // build dir contents - bufferlist dirbl; - - DirStat::_encode(dirbl, dir, mds->get_nodeid()); - - int numfiles = 0; - for (CDir::map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - if (dn->is_null()) continue; - - CInode *in = dn->inode; - - // remote link? - // better for the MDS to do the work, if we think the client will stat any of these files. 
- if (dn->is_remote() && !in) { - in = mdcache->get_inode(dn->get_remote_ino()); - if (in) { - dn->link_remote(in); - } else { - mdcache->open_remote_ino(dn->get_remote_ino(), - mdr, - new C_MDS_RetryRequest(mdcache, mdr)); - - // touch everything i _do_ have - for (it = dir->begin(); - it != dir->end(); - it++) - if (!it->second->is_null()) - mdcache->lru.lru_touch(it->second); - return; - } - } - assert(in); - - - assert(in); - - dout(12) << "including inode " << *in << dendl; - - // add this dentry + inodeinfo - ::_encode(it->first, dirbl); - InodeStat::_encode(dirbl, in); - - // touch it - mdcache->lru.lru_touch(dn); - } - - // yay, reply - MClientReply *reply = new MClientReply(req); - reply->take_dir_items(dirbl); - - dout(10) << "reply to " << *req << " readdir " << numfiles << " files" << dendl; - reply->set_result(0); - - // bump popularity. NOTE: this doesn't quite capture it. - mds->balancer->hit_dir(g_clock.now(), dir, META_POP_IRD, -1, numfiles); - - // reply - reply_request(mdr, reply, diri); -} - - - -// ------------------------------------------------ - -// MKNOD - -class C_MDS_mknod_finish : public Context { - MDS *mds; - MDRequest *mdr; - CDentry *dn; - CInode *newi; - version_t dirpv; - version_t newdirpv; -public: - C_MDS_mknod_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ni, version_t dirpv_, version_t newdirpv_=0) : - mds(m), mdr(r), dn(d), newi(ni), - dirpv(dirpv_), newdirpv(newdirpv_) {} - void finish(int r) { - assert(r == 0); - - // link the inode - dn->get_dir()->link_primary_inode(dn, newi); - - // dirty inode, dn, dir - newi->mark_dirty(newi->inode.version + 1, mdr->ls); - - // mkdir? 
- if (newdirpv) { - CDir *dir = newi->get_dirfrag(frag_t()); - assert(dir); - dir->mark_dirty(newdirpv, mdr->ls); - } - - // dir inode's mtime - mds->server->dirty_dn_diri(mdr, dn, dirpv); - - // hit pop - mds->balancer->hit_inode(mdr->now, newi, META_POP_IWR); - //mds->balancer->hit_dir(mdr->now, dn->get_dir(), META_POP_DWR); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply->set_result(0); - mds->server->reply_request(mdr, reply, newi); - } -}; - - -void Server::handle_client_mknod(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - CDentry *dn = rdlock_path_xlock_dentry(mdr, false, false); - if (!dn) return; - - mdr->now = g_clock.real_now(); - CInode *newi = prepare_new_inode(mdr, dn->dir); - assert(newi); - - // it's a file. - newi->inode.rdev = req->args.mknod.rdev; - newi->inode.mode = req->args.mknod.mode; - newi->inode.mode &= ~INODE_TYPE_MASK; - newi->inode.mode |= INODE_MODE_FILE; - newi->inode.version = dn->pre_dirty() - 1; - - // prepare finisher - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "mknod"); - le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version()); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too - le->metablob.add_dir_context(dn->dir); - le->metablob.add_primary_dentry(dn, true, newi, &newi->inode); - - // log + wait - mdlog->submit_entry(le, new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv)); -} - - - -// MKDIR - -void Server::handle_client_mkdir(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - CDentry *dn = rdlock_path_xlock_dentry(mdr, false, false); - if (!dn) return; - - // new inode - mdr->now = g_clock.real_now(); - CInode *newi = prepare_new_inode(mdr, dn->dir); - assert(newi); - - // it's a directory. 
- newi->inode.mode = req->args.mkdir.mode; - newi->inode.mode &= ~INODE_TYPE_MASK; - newi->inode.mode |= INODE_MODE_DIR; - newi->inode.layout = g_OSD_MDDirLayout; - newi->inode.version = dn->pre_dirty() - 1; - - // ...and that new dir is empty. - CDir *newdir = newi->get_or_open_dirfrag(mds->mdcache, frag_t()); - newdir->mark_complete(); - version_t newdirpv = newdir->pre_dirty(); - - //if (mds->logger) mds->logger->inc("mkdir"); - - // prepare finisher - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "mkdir"); - le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version()); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too - le->metablob.add_dir_context(dn->dir); - le->metablob.add_primary_dentry(dn, true, newi, &newi->inode); - le->metablob.add_dir(newdir, true, true); // dirty AND complete - - // log + wait - mdlog->submit_entry(le, new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv, newdirpv)); - - /* old export heuristic. pbly need to reimplement this at some point. 
- if ( - diri->dir->is_auth() && - diri->dir->is_rep() && - newdir->is_auth() && - !newdir->is_hashing()) { - int dest = rand() % mds->mdsmap->get_num_mds(); - if (dest != whoami) { - dout(10) << "exporting new dir " << *newdir << " in replicated parent " << *diri->dir << dendl; - mdcache->migrator->export_dir(newdir, dest); - } - } - */ -} - - -// SYMLINK - -void Server::handle_client_symlink(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - CDentry *dn = rdlock_path_xlock_dentry(mdr, false, false); - if (!dn) return; - - mdr->now = g_clock.real_now(); - - CInode *newi = prepare_new_inode(mdr, dn->dir); - assert(newi); - - // it's a symlink - newi->inode.mode &= ~INODE_TYPE_MASK; - newi->inode.mode |= INODE_MODE_SYMLINK; - newi->symlink = req->get_sarg(); - newi->inode.version = dn->pre_dirty() - 1; - - // prepare finisher - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "symlink"); - le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version()); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too - le->metablob.add_dir_context(dn->dir); - le->metablob.add_primary_dentry(dn, true, newi, &newi->inode); - - // log + wait - mdlog->submit_entry(le, new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv)); -} - - - - - -// LINK - -void Server::handle_client_link(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - dout(7) << "handle_client_link " << req->get_filepath() - << " to " << req->get_sarg() - << dendl; - - // traverse to dest dir, make sure it's ours. 
- const filepath &linkpath = req->get_filepath(); - const string &dname = linkpath.last_dentry(); - vector linktrace; - CDir *dir = traverse_to_auth_dir(mdr, linktrace, linkpath); - if (!dir) return; - dout(7) << "handle_client_link link " << dname << " in " << *dir << dendl; - - // traverse to link target - filepath targetpath = req->get_sarg(); - dout(7) << "handle_client_link discovering target " << targetpath << dendl; - vector targettrace; - int r = mdcache->path_traverse(mdr, req, - 0, targetpath, targettrace, false, - MDS_TRAVERSE_DISCOVER); - if (r > 0) return; // wait - if (targettrace.empty()) r = -EINVAL; - if (r < 0) { - reply_request(mdr, r); - return; - } - - // identify target inode - CInode *targeti = targettrace[targettrace.size()-1]->inode; - assert(targeti); - - // dir? - dout(7) << "target is " << *targeti << dendl; - if (targeti->is_dir()) { - dout(7) << "target is a dir, failing..." << dendl; - reply_request(mdr, -EINVAL); - return; - } - - // get/make null link dentry - CDentry *dn = prepare_null_dentry(mdr, dir, dname, false); - if (!dn) return; - - // create lock lists - set rdlocks, wrlocks, xlocks; - - for (int i=0; i<(int)linktrace.size(); i++) - rdlocks.insert(&linktrace[i]->lock); - xlocks.insert(&dn->lock); - wrlocks.insert(&dn->dir->inode->dirlock); - for (int i=0; i<(int)targettrace.size(); i++) - rdlocks.insert(&targettrace[i]->lock); - xlocks.insert(&targeti->linklock); - - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - mdr->done_locking = true; // avoid wrlock moving target issues. - - // pick mtime - if (mdr->now == utime_t()) - mdr->now = g_clock.real_now(); - - // does the target need an anchor? 
- if (targeti->is_auth()) { - /*if (targeti->get_parent_dir() == dn->dir) { - dout(7) << "target is in the same dirfrag, sweet" << dendl; - } - else - */ - if (targeti->is_anchored() && !targeti->is_unanchoring()) { - dout(7) << "target anchored already (nlink=" << targeti->inode.nlink << "), sweet" << dendl; - } - else { - dout(7) << "target needs anchor, nlink=" << targeti->inode.nlink << ", creating anchor" << dendl; - - mdcache->anchor_create(mdr, targeti, - new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - } - - // go! - - // local or remote? - if (targeti->is_auth()) - _link_local(mdr, dn, targeti); - else - _link_remote(mdr, dn, targeti); -} - - -class C_MDS_link_local_finish : public Context { - MDS *mds; - MDRequest *mdr; - CDentry *dn; - CInode *targeti; - version_t dnpv; - version_t tipv; - version_t dirpv; -public: - C_MDS_link_local_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ti, - version_t dnpv_, version_t tipv_, version_t dirpv_) : - mds(m), mdr(r), dn(d), targeti(ti), - dnpv(dnpv_), tipv(tipv_), dirpv(dirpv_) { } - void finish(int r) { - assert(r == 0); - mds->server->_link_local_finish(mdr, dn, targeti, dnpv, tipv, dirpv); - } -}; - - -void Server::_link_local(MDRequest *mdr, CDentry *dn, CInode *targeti) -{ - dout(10) << "_link_local " << *dn << " to " << *targeti << dendl; - - mdr->ls = mdlog->get_current_segment(); - - // predirty NEW dentry - version_t dnpv = dn->pre_dirty(); - version_t tipv = targeti->pre_dirty(); - - // project inode update - inode_t *pi = targeti->project_inode(); - pi->nlink++; - pi->ctime = mdr->now; - pi->version = tipv; - - // log + wait - EUpdate *le = new EUpdate(mdlog, "link_local"); - le->metablob.add_client_req(mdr->reqid); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir inode's mtime - le->metablob.add_dir_context(dn->get_dir()); - le->metablob.add_remote_dentry(dn, true, targeti->ino()); // new remote - le->metablob.add_dir_context(targeti->get_parent_dir()); - 
le->metablob.add_primary_dentry(targeti->parent, true, targeti, pi); // update old primary - - mdlog->submit_entry(le, new C_MDS_link_local_finish(mds, mdr, dn, targeti, dnpv, tipv, dirpv)); -} - -void Server::_link_local_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, - version_t dnpv, version_t tipv, version_t dirpv) -{ - dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl; - - // link and unlock the NEW dentry - dn->dir->link_remote_inode(dn, targeti->ino(), MODE_TO_DT(targeti->inode.mode)); - dn->mark_dirty(dnpv, mdr->ls); - - // target inode - targeti->pop_and_dirty_projected_inode(mdr->ls); - - // new dentry dir mtime - dirty_dn_diri(mdr, dn, dirpv); - - // bump target popularity - mds->balancer->hit_inode(mdr->now, targeti, META_POP_IWR); - //mds->balancer->hit_dir(mdr->now, dn->get_dir(), META_POP_DWR); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply_request(mdr, reply, dn->get_dir()->get_inode()); // FIXME: imprecise ref -} - - -// remote - -class C_MDS_link_remote_finish : public Context { - MDS *mds; - MDRequest *mdr; - CDentry *dn; - CInode *targeti; - version_t dpv; - version_t dirpv; -public: - C_MDS_link_remote_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ti, version_t dirpv_) : - mds(m), mdr(r), dn(d), targeti(ti), - dpv(d->get_projected_version()), - dirpv(dirpv_) { } - void finish(int r) { - assert(r == 0); - mds->server->_link_remote_finish(mdr, dn, targeti, dpv, dirpv); - } -}; - -void Server::_link_remote(MDRequest *mdr, CDentry *dn, CInode *targeti) -{ - dout(10) << "_link_remote " << *dn << " to " << *targeti << dendl; - - // 1. 
send LinkPrepare to dest (journal nlink++ prepare) - int linkauth = targeti->authority().first; - if (mdr->more()->witnessed.count(linkauth) == 0) { - dout(10) << " targeti auth must prepare nlink++" << dendl; - - MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_LINKPREP); - targeti->set_object_info(req->get_object_info()); - req->now = mdr->now; - mds->send_message_mds(req, linkauth, MDS_PORT_SERVER); - - assert(mdr->more()->waiting_on_slave.count(linkauth) == 0); - mdr->more()->waiting_on_slave.insert(linkauth); - return; - } - dout(10) << " targeti auth has prepared nlink++" << dendl; - - // go. - // predirty dentry - dn->pre_dirty(); - - // add to event - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "link_remote"); - le->metablob.add_client_req(mdr->reqid); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir inode's mtime - le->metablob.add_dir_context(dn->get_dir()); - le->metablob.add_remote_dentry(dn, true, targeti->ino()); // new remote - - // mark committing (needed for proper recovery) - mdr->committing = true; - - // log + wait - mdlog->submit_entry(le, new C_MDS_link_remote_finish(mds, mdr, dn, targeti, dirpv)); -} - -void Server::_link_remote_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, - version_t dpv, version_t dirpv) -{ - dout(10) << "_link_remote_finish " << *dn << " to " << *targeti << dendl; - - // link the new dentry - dn->dir->link_remote_inode(dn, targeti->ino(), MODE_TO_DT(targeti->inode.mode)); - dn->mark_dirty(dpv, mdr->ls); - - // dir inode's mtime - dirty_dn_diri(mdr, dn, dirpv); - - // bump target popularity - mds->balancer->hit_inode(mdr->now, targeti, META_POP_IWR); - //mds->balancer->hit_dir(mdr->now, dn->get_dir(), META_POP_DWR); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply_request(mdr, reply, dn->get_dir()->get_inode()); // FIXME: imprecise ref -} - - -// remote linking/unlinking - -class C_MDS_SlaveLinkPrep 
: public Context { - Server *server; - MDRequest *mdr; - CInode *targeti; - utime_t old_ctime; - bool inc; -public: - C_MDS_SlaveLinkPrep(Server *s, MDRequest *r, CInode *t, utime_t oct, bool in) : - server(s), mdr(r), targeti(t), old_ctime(oct), inc(in) { } - void finish(int r) { - assert(r == 0); - server->_logged_slave_link(mdr, targeti, old_ctime, inc); - } -}; - -void Server::handle_slave_link_prep(MDRequest *mdr) -{ - dout(10) << "handle_slave_link_prep " << *mdr - << " on " << mdr->slave_request->get_object_info() - << dendl; - - CInode *targeti = mdcache->get_inode(mdr->slave_request->get_object_info().ino); - assert(targeti); - dout(10) << "targeti " << *targeti << dendl; - CDentry *dn = targeti->get_parent_dn(); - assert(dn->is_primary()); - - mdr->now = mdr->slave_request->now; - - // anchor? - if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) { - if (targeti->is_anchored() && !targeti->is_unanchoring()) { - dout(7) << "target anchored already (nlink=" << targeti->inode.nlink << "), sweet" << dendl; - } - else { - dout(7) << "target needs anchor, nlink=" << targeti->inode.nlink << ", creating anchor" << dendl; - mdcache->anchor_create(mdr, targeti, - new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - } - - // journal it - mdr->ls = mdlog->get_current_segment(); - ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_prep", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE); - - inode_t *pi = dn->inode->project_inode(); - - // rollback case - le->rollback.add_dir_context(targeti->get_parent_dir()); - le->rollback.add_primary_dentry(dn, true, targeti, pi); // update old primary - - // update journaled target inode - bool inc; - if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) { - inc = true; - pi->nlink++; - } else { - inc = false; - pi->nlink--; - } - utime_t old_ctime = pi->ctime; - pi->ctime = mdr->now; - pi->version = targeti->pre_dirty(); - - dout(10) << " projected inode " << pi << " v " << pi->version << 
dendl; - - // commit case - le->commit.add_dir_context(targeti->get_parent_dir()); - le->commit.add_primary_dentry(dn, true, targeti, pi); // update old primary - - mdlog->submit_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti, old_ctime, inc)); -} - -class C_MDS_SlaveLinkCommit : public Context { - Server *server; - MDRequest *mdr; - CInode *targeti; - utime_t old_ctime; - version_t old_version; - bool inc; -public: - C_MDS_SlaveLinkCommit(Server *s, MDRequest *r, CInode *t, utime_t oct, version_t ov, bool in) : - server(s), mdr(r), targeti(t), old_ctime(oct), old_version(ov), inc(in) { } - void finish(int r) { - server->_commit_slave_link(mdr, r, targeti, - old_ctime, old_version, inc); - } -}; - -void Server::_logged_slave_link(MDRequest *mdr, CInode *targeti, utime_t old_ctime, bool inc) -{ - dout(10) << "_logged_slave_link " << *mdr - << " inc=" << inc - << " " << *targeti << dendl; - - version_t old_version = targeti->inode.version; - - // update the target - targeti->pop_and_dirty_projected_inode(mdr->ls); - - // hit pop - mds->balancer->hit_inode(mdr->now, targeti, META_POP_IWR); - - // ack - MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_LINKPREPACK); - mds->send_message_mds(reply, mdr->slave_to_mds, MDS_PORT_SERVER); - - // set up commit waiter - mdr->more()->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti, old_ctime, old_version, inc); - - // done. 
- delete mdr->slave_request; - mdr->slave_request = 0; -} - - -void Server::_commit_slave_link(MDRequest *mdr, int r, CInode *targeti, - utime_t old_ctime, version_t old_version, bool inc) -{ - dout(10) << "_commit_slave_link " << *mdr - << " r=" << r - << " inc=" << inc - << " " << *targeti << dendl; - - ESlaveUpdate *le; - if (r == 0) { - // write a commit to the journal - le = new ESlaveUpdate(mdlog, "slave_link_commit", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT); - } else { - le = new ESlaveUpdate(mdlog, "slave_link_rollback", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_ROLLBACK); - - // -- rollback in memory -- - assert(targeti->inode.ctime == mdr->now); - assert(targeti->projected_inode.empty()); // we're holding the version lock. - - targeti->inode.ctime = old_ctime; - targeti->inode.version = old_version; - if (inc) - targeti->inode.nlink++; - else - targeti->inode.nlink--; - } - - mdlog->submit_entry(le); -} - - - -void Server::handle_slave_link_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m) -{ - dout(10) << "handle_slave_link_prep_ack " << *mdr - << " " << *m << dendl; - int from = m->get_source().num(); - - // note slave - mdr->more()->slaves.insert(from); - - // witnessed! - assert(mdr->more()->witnessed.count(from) == 0); - mdr->more()->witnessed.insert(from); - - // remove from waiting list - assert(mdr->more()->waiting_on_slave.count(from)); - mdr->more()->waiting_on_slave.erase(from); - - assert(mdr->more()->waiting_on_slave.empty()); - - dispatch_client_request(mdr); // go again! 
-} - - - - - -// UNLINK - -void Server::handle_client_unlink(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - // traverse to path - vector trace; - int r = mdcache->path_traverse(mdr, req, - 0, req->get_filepath(), trace, false, - MDS_TRAVERSE_FORWARD); - if (r > 0) return; - if (trace.empty()) r = -EINVAL; // can't unlink root - if (r < 0) { - reply_request(mdr, r); - return; - } - - CDentry *dn = trace[trace.size()-1]; - assert(dn); - - // is it my dentry? - if (!dn->is_auth()) { - // fw to auth - mdcache->request_forward(mdr, dn->authority().first); - return; - } - - // rmdir or unlink? - bool rmdir = false; - if (req->get_op() == MDS_OP_RMDIR) rmdir = true; - - if (rmdir) { - dout(7) << "handle_client_rmdir on " << *dn << dendl; - } else { - dout(7) << "handle_client_unlink on " << *dn << dendl; - } - - // readable? - if (dn->lock.is_xlocked_by_other(mdr)) { - dout(10) << "waiting on xlocked dentry " << *dn << dendl; - dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - - // dn looks ok. - - // get/open inode. - mdr->trace.swap(trace); - CInode *in = mdcache->get_dentry_inode(dn, mdr); - if (!in) return; - dout(7) << "dn links to " << *in << dendl; - - // rmdir vs is_dir - if (in->is_dir()) { - if (rmdir) { - // do empty directory checks - if (!_verify_rmdir(mdr, in)) - return; - } else { - dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl; - reply_request(mdr, -EISDIR); - return; - } - } else { - if (rmdir) { - // unlink - dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl; - reply_request(mdr, -ENOTDIR); - return; - } - } - - // lock - set rdlocks, wrlocks, xlocks; - - for (int i=0; i<(int)trace.size()-1; i++) - rdlocks.insert(&trace[i]->lock); - xlocks.insert(&dn->lock); - wrlocks.insert(&dn->dir->inode->dirlock); - xlocks.insert(&in->linklock); - - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - // yay! 
- mdr->done_locking = true; // avoid wrlock racing - if (mdr->now == utime_t()) - mdr->now = g_clock.real_now(); - - // get stray dn ready? - CDentry *straydn = 0; - if (dn->is_primary()) { - straydn = mdcache->get_or_create_stray_dentry(dn->inode); - mdr->pin(straydn); // pin it. - dout(10) << " straydn is " << *straydn << dendl; - assert(straydn->is_null()); - - if (!mdr->more()->dst_reanchor_atid && - dn->inode->is_anchored()) { - dout(10) << "reanchoring to stray " << *dn->inode << dendl; - vector trace; - straydn->make_anchor_trace(trace, dn->inode); - mds->anchorclient->prepare_update(dn->inode->ino(), trace, &mdr->more()->dst_reanchor_atid, - new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - } - - // ok! - if (dn->is_remote() && !dn->inode->is_auth()) - _unlink_remote(mdr, dn); - else - _unlink_local(mdr, dn, straydn); -} - - - -class C_MDS_unlink_local_finish : public Context { - MDS *mds; - MDRequest *mdr; - CDentry *dn; - CDentry *straydn; - version_t dnpv; // deleted dentry - version_t dirpv; -public: - C_MDS_unlink_local_finish(MDS *m, MDRequest *r, CDentry *d, CDentry *sd, - version_t dirpv_) : - mds(m), mdr(r), dn(d), straydn(sd), - dnpv(d->get_projected_version()), dirpv(dirpv_) { } - void finish(int r) { - assert(r == 0); - mds->server->_unlink_local_finish(mdr, dn, straydn, dnpv, dirpv); - } -}; - - -void Server::_unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn) -{ - dout(10) << "_unlink_local " << *dn << dendl; - - // ok, let's do it. - mdr->ls = mdlog->get_current_segment(); - - // prepare log entry - EUpdate *le = new EUpdate(mdlog, "unlink_local"); - le->metablob.add_client_req(mdr->reqid); - - version_t ipv = 0; // dirty inode version - inode_t *ji = 0; // journaled projected inode - if (dn->is_primary()) { - // primary link. add stray dentry. 
- assert(straydn); - ipv = straydn->pre_dirty(dn->inode->inode.version); - le->metablob.add_dir_context(straydn->dir); - ji = le->metablob.add_primary_dentry(straydn, true, dn->inode); - } else { - // remote link. update remote inode. - ipv = dn->inode->pre_dirty(); - le->metablob.add_dir_context(dn->inode->get_parent_dir()); - ji = le->metablob.add_primary_dentry(dn->inode->parent, true, dn->inode); - } - - // update journaled target inode - inode_t *pi = dn->inode->project_inode(); - pi->nlink--; - pi->ctime = mdr->now; - pi->version = ipv; - *ji = *pi; // copy into journal - - // the unlinked dentry - dn->pre_dirty(); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); - le->metablob.add_dir_context(dn->get_dir()); - le->metablob.add_null_dentry(dn, true); - - if (mdr->more()->dst_reanchor_atid) - le->metablob.add_anchor_transaction(mdr->more()->dst_reanchor_atid); - - // log + wait - journal_opens(); // journal pending opens, just in case - mdlog->submit_entry(le, new C_MDS_unlink_local_finish(mds, mdr, dn, straydn, - dirpv)); -} - -void Server::_unlink_local_finish(MDRequest *mdr, - CDentry *dn, CDentry *straydn, - version_t dnpv, version_t dirpv) -{ - dout(10) << "_unlink_local_finish " << *dn << dendl; - - // unlink main dentry - CInode *in = dn->inode; - dn->dir->unlink_inode(dn); - - // relink as stray? (i.e. was primary link?) 
- if (straydn) { - dout(20) << " straydn is " << *straydn << dendl; - straydn->dir->link_primary_inode(straydn, in); - } - - // nlink--, dirty old dentry - in->pop_and_dirty_projected_inode(mdr->ls); - dn->mark_dirty(dnpv, mdr->ls); - - // dir inode's mtime - dirty_dn_diri(mdr, dn, dirpv); - - // share unlink news with replicas - for (map::iterator it = dn->replicas_begin(); - it != dn->replicas_end(); - it++) { - dout(7) << "_unlink_local_finish sending MDentryUnlink to mds" << it->first << dendl; - MDentryUnlink *unlink = new MDentryUnlink(dn->dir->dirfrag(), dn->name); - if (straydn) { - unlink->strayin = straydn->dir->inode->replicate_to(it->first); - unlink->straydir = straydn->dir->replicate_to(it->first); - unlink->straydn = straydn->replicate_to(it->first); - } - mds->send_message_mds(unlink, it->first, MDS_PORT_CACHE); - } - - // commit anchor update? - if (mdr->more()->dst_reanchor_atid) - mds->anchorclient->commit(mdr->more()->dst_reanchor_atid, mdr->ls); - - // bump pop - //mds->balancer->hit_dir(mdr->now, dn->dir, META_POP_DWR); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply_request(mdr, reply, dn->dir->get_inode()); // FIXME: imprecise ref - - // clean up? - if (straydn) - mdcache->eval_stray(straydn); - - // removing a new dn? - dn->dir->try_remove_unlinked_dn(dn); -} - - -class C_MDS_unlink_remote_finish : public Context { - MDS *mds; - MDRequest *mdr; - CDentry *dn; - version_t dnpv; // deleted dentry - version_t dirpv; -public: - C_MDS_unlink_remote_finish(MDS *m, MDRequest *r, CDentry *d, - version_t dirpv_) : - mds(m), mdr(r), dn(d), - dnpv(d->get_projected_version()), dirpv(dirpv_) { } - void finish(int r) { - assert(r == 0); - mds->server->_unlink_remote_finish(mdr, dn, dnpv, dirpv); - } -}; - -void Server::_unlink_remote(MDRequest *mdr, CDentry *dn) -{ - dout(10) << "_unlink_remote " << *dn << " " << *dn->inode << dendl; - - // 1. 
send LinkPrepare to dest (journal nlink-- prepare) - int inauth = dn->inode->authority().first; - if (mdr->more()->witnessed.count(inauth) == 0) { - dout(10) << " inode auth must prepare nlink--" << dendl; - - MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_UNLINKPREP); - dn->inode->set_object_info(req->get_object_info()); - req->now = mdr->now; - mds->send_message_mds(req, inauth, MDS_PORT_SERVER); - - assert(mdr->more()->waiting_on_slave.count(inauth) == 0); - mdr->more()->waiting_on_slave.insert(inauth); - return; - } - dout(10) << " inode auth has prepared nlink--" << dendl; - - // ok, let's do it. - // prepare log entry - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "unlink_remote"); - le->metablob.add_client_req(mdr->reqid); - - // the unlinked dentry - dn->pre_dirty(); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); - le->metablob.add_dir_context(dn->get_dir()); - le->metablob.add_null_dentry(dn, true); - - if (mdr->more()->dst_reanchor_atid) - le->metablob.add_anchor_transaction(mdr->more()->dst_reanchor_atid); - - // finisher - C_MDS_unlink_remote_finish *fin = new C_MDS_unlink_remote_finish(mds, mdr, dn, dirpv); - - journal_opens(); // journal pending opens, just in case - - // mark committing (needed for proper recovery) - mdr->committing = true; - - // log + wait - mdlog->submit_entry(le, fin); -} - -void Server::_unlink_remote_finish(MDRequest *mdr, - CDentry *dn, - version_t dnpv, version_t dirpv) -{ - dout(10) << "_unlink_remote_finish " << *dn << dendl; - - // unlink main dentry - dn->dir->unlink_inode(dn); - dn->mark_dirty(dnpv, mdr->ls); // dirty old dentry - - // dir inode's mtime - dirty_dn_diri(mdr, dn, dirpv); - - // share unlink news with replicas - for (map::iterator it = dn->replicas_begin(); - it != dn->replicas_end(); - it++) { - dout(7) << "_unlink_remote_finish sending MDentryUnlink to mds" << it->first << dendl; - MDentryUnlink *unlink = new 
MDentryUnlink(dn->dir->dirfrag(), dn->name); - mds->send_message_mds(unlink, it->first, MDS_PORT_CACHE); - } - - // commit anchor update? - if (mdr->more()->dst_reanchor_atid) - mds->anchorclient->commit(mdr->more()->dst_reanchor_atid, mdr->ls); - - //mds->balancer->hit_dir(mdr->now, dn->dir, META_POP_DWR); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply_request(mdr, reply, dn->dir->get_inode()); // FIXME: imprecise ref - - // removing a new dn? - dn->dir->try_remove_unlinked_dn(dn); -} - - - - -/** _verify_rmdir - * - * verify that a directory is empty (i.e. we can rmdir it), - * and make sure it is part of the same subtree (i.e. local) - * so that rmdir will occur locally. - * - * @param in is the inode being rmdir'd. - */ -bool Server::_verify_rmdir(MDRequest *mdr, CInode *in) -{ - dout(10) << "_verify_rmdir " << *in << dendl; - assert(in->is_auth()); - - list frags; - in->dirfragtree.get_leaves(frags); - - for (list::iterator p = frags.begin(); - p != frags.end(); - ++p) { - CDir *dir = in->get_or_open_dirfrag(mdcache, *p); - assert(dir); - - // dir looks empty but incomplete? - if (dir->is_auth() && - dir->get_size() == 0 && - !dir->is_complete()) { - dout(7) << "_verify_rmdir fetching incomplete dir " << *dir << dendl; - dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - - // does the frag _look_ empty? - if (dir->get_size()) { - dout(10) << "_verify_rmdir still " << dir->get_size() << " items in frag " << *dir << dendl; - reply_request(mdr, -ENOTEMPTY); - return false; - } - - // not dir auth? - if (!dir->is_auth()) { - dout(10) << "_verify_rmdir not auth for " << *dir << ", FIXME BUG" << dendl; - reply_request(mdr, -ENOTEMPTY); - return false; - } - } - - return true; -} -/* - // export sanity check - if (!in->is_auth()) { - // i should be exporting this now/soon, since the dir is empty. - dout(7) << "handle_client_rmdir dir is auth, but not inode." 
<< dendl; - mdcache->migrator->export_empty_import(in->dir); - in->dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mds, req, diri)); - return; - } -*/ - - - - -// ====================================================== - - -class C_MDS_rename_finish : public Context { - MDS *mds; - MDRequest *mdr; - CDentry *srcdn; - CDentry *destdn; - CDentry *straydn; -public: - C_MDS_rename_finish(MDS *m, MDRequest *r, - CDentry *sdn, CDentry *ddn, CDentry *stdn) : - mds(m), mdr(r), - srcdn(sdn), destdn(ddn), straydn(stdn) { } - void finish(int r) { - assert(r == 0); - mds->server->_rename_finish(mdr, srcdn, destdn, straydn); - } -}; - - -/** handle_client_rename - * - */ -void Server::handle_client_rename(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - dout(7) << "handle_client_rename " << *req << dendl; - - // traverse to dest dir (not dest) - // we do this FIRST, because the rename should occur on the - // destdn's auth. - const filepath &destpath = req->get_sarg(); - const string &destname = destpath.last_dentry(); - vector desttrace; - CDir *destdir = traverse_to_auth_dir(mdr, desttrace, destpath); - if (!destdir) return; // fw or error out - dout(10) << "dest will be " << destname << " in " << *destdir << dendl; - assert(destdir->is_auth()); - - // traverse to src - filepath srcpath = req->get_filepath(); - vector srctrace; - int r = mdcache->path_traverse(mdr, req, - 0, srcpath, srctrace, false, - MDS_TRAVERSE_DISCOVER); - if (r > 0) return; - if (srctrace.empty()) r = -EINVAL; // can't rename root - if (r < 0) { - reply_request(mdr, r); - return; - } - CDentry *srcdn = srctrace[srctrace.size()-1]; - dout(10) << " srcdn " << *srcdn << dendl; - CInode *srci = mdcache->get_dentry_inode(srcdn, mdr); - dout(10) << " srci " << *srci << dendl; - - // -- some sanity checks -- - // src == dest? 
- if (srcdn->get_dir() == destdir && srcdn->name == destname) { - dout(7) << "rename src=dest, noop" << dendl; - reply_request(mdr, 0); - return; - } - - // dest a child of src? - // e.g. mv /usr /usr/foo - CDentry *pdn = destdir->inode->parent; - while (pdn) { - if (pdn == srcdn) { - dout(7) << "cannot rename item to be a child of itself" << dendl; - reply_request(mdr, -EINVAL); - return; - } - pdn = pdn->dir->inode->parent; - } - - - // identify/create dest dentry - CDentry *destdn = destdir->lookup(destname); - if (destdn && destdn->lock.is_xlocked_by_other(mdr)) { - destdn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - - CInode *oldin = 0; - if (destdn && !destdn->is_null()) { - //dout(10) << "dest dn exists " << *destdn << dendl; - oldin = mdcache->get_dentry_inode(destdn, mdr); - if (!oldin) return; - dout(10) << " oldin " << *oldin << dendl; - - // mv /some/thing /to/some/existing_other_thing - if (oldin->is_dir() && !srci->is_dir()) { - reply_request(mdr, -EISDIR); - return; - } - if (!oldin->is_dir() && srci->is_dir()) { - reply_request(mdr, -ENOTDIR); - return; - } - - // non-empty dir? - if (oldin->is_dir() && !_verify_rmdir(mdr, oldin)) - return; - } - if (!destdn) { - // mv /some/thing /to/some/non_existent_name - destdn = prepare_null_dentry(mdr, destdir, destname); - if (!destdn) return; - } - - dout(10) << " destdn " << *destdn << dendl; - - - // -- locks -- - set rdlocks, wrlocks, xlocks; - - // rdlock sourcedir path, xlock src dentry - for (int i=0; i<(int)srctrace.size()-1; i++) - rdlocks.insert(&srctrace[i]->lock); - xlocks.insert(&srcdn->lock); - wrlocks.insert(&srcdn->dir->inode->dirlock); - /* - * no, this causes problems if the dftlock is scattered... - * and what was i thinking anyway? - * rdlocks.insert(&srcdn->dir->inode->dirfragtreelock); // rd lock on srci dirfragtree. 
- */ - - // rdlock destdir path, xlock dest dentry - for (int i=0; i<(int)desttrace.size(); i++) - rdlocks.insert(&desttrace[i]->lock); - xlocks.insert(&destdn->lock); - wrlocks.insert(&destdn->dir->inode->dirlock); - - // xlock versionlock on srci if remote? - // this ensures it gets safely remotely auth_pinned, avoiding deadlock; - // strictly speaking, having the slave node freeze the inode is - // otherwise sufficient for avoiding conflicts with inode locks, etc. - if (!srcdn->is_auth() && srcdn->is_primary()) - xlocks.insert(&srcdn->inode->versionlock); - - // xlock oldin (for nlink--) - if (oldin) xlocks.insert(&oldin->linklock); - - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - // set done_locking flag, to avoid problems with wrlock moving auth target - mdr->done_locking = true; - - // -- open all srcdn inode frags, if any -- - // we need these open so that auth can properly delegate from inode to dirfrags - // after the inode is _ours_. - if (srcdn->is_primary() && - !srcdn->is_auth() && - srci->is_dir()) { - dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl; - mdr->set_stickydirs(srci); - - list frags; - srci->dirfragtree.get_leaves(frags); - for (list::iterator p = frags.begin(); - p != frags.end(); - ++p) { - CDir *dir = srci->get_dirfrag(*p); - if (!dir) { - dout(10) << " opening " << *p << " under " << *srci << dendl; - mdcache->open_remote_dirfrag(srci, *p, new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - } - } - - // -- declare now -- - if (mdr->now == utime_t()) - mdr->now = g_clock.real_now(); - - // -- create stray dentry? -- - CDentry *straydn = 0; - if (destdn->is_primary()) { - straydn = mdcache->get_or_create_stray_dentry(destdn->inode); - mdr->pin(straydn); - dout(10) << "straydn is " << *straydn << dendl; - } - - // -- prepare witnesses -- - /* - * NOTE: we use _all_ replicas as witnesses. 
- * this probably isn't totally necessary (esp for file renames), - * but if/when we change that, we have to make sure rejoin is - * sufficiently robust to handle strong rejoins from survivors - * with totally wrong dentry->inode linkage. - * (currently, it can ignore rename effects, because the resolve - * stage will sort them out.) - */ - set witnesses = mdr->more()->extra_witnesses; - if (srcdn->is_auth()) - srcdn->list_replicas(witnesses); - else - witnesses.insert(srcdn->authority().first); - destdn->list_replicas(witnesses); - dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl; - - // do srcdn auth last - int last = -1; - if (!srcdn->is_auth()) - last = srcdn->authority().first; - - for (set::iterator p = witnesses.begin(); - p != witnesses.end(); - ++p) { - if (*p == last) continue; // do it last! - if (mdr->more()->witnessed.count(*p)) { - dout(10) << " already witnessed by mds" << *p << dendl; - } else if (mdr->more()->waiting_on_slave.count(*p)) { - dout(10) << " already waiting on witness mds" << *p << dendl; - } else { - _rename_prepare_witness(mdr, *p, srcdn, destdn, straydn); - } - } - if (!mdr->more()->waiting_on_slave.empty()) - return; // we're waiting for a witness. 
- - if (last >= 0 && - mdr->more()->witnessed.count(last) == 0 && - mdr->more()->waiting_on_slave.count(last) == 0) { - dout(10) << " preparing last witness (srcdn auth)" << dendl; - _rename_prepare_witness(mdr, last, srcdn, destdn, straydn); - return; - } - - // -- prepare anchor updates -- - bool linkmerge = (srcdn->inode == destdn->inode && - (srcdn->is_primary() || destdn->is_primary())); - - if (!linkmerge) { - C_Gather *anchorgather = 0; - - if (srcdn->is_primary() && srcdn->inode->is_anchored() && - srcdn->dir != destdn->dir && - !mdr->more()->src_reanchor_atid) { - dout(10) << "reanchoring src->dst " << *srcdn->inode << dendl; - vector trace; - destdn->make_anchor_trace(trace, srcdn->inode); - - anchorgather = new C_Gather(new C_MDS_RetryRequest(mdcache, mdr)); - mds->anchorclient->prepare_update(srcdn->inode->ino(), trace, &mdr->more()->src_reanchor_atid, - anchorgather->new_sub()); - } - if (destdn->is_primary() && - destdn->inode->is_anchored() && - !mdr->more()->dst_reanchor_atid) { - dout(10) << "reanchoring dst->stray " << *destdn->inode << dendl; - - assert(straydn); - vector trace; - straydn->make_anchor_trace(trace, destdn->inode); - - if (!anchorgather) - anchorgather = new C_Gather(new C_MDS_RetryRequest(mdcache, mdr)); - mds->anchorclient->prepare_update(destdn->inode->ino(), trace, &mdr->more()->dst_reanchor_atid, - anchorgather->new_sub()); - } - - if (anchorgather) - return; // waiting for anchor prepares - } - - // -- prepare journal entry -- - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "rename"); - le->metablob.add_client_req(mdr->reqid); - - _rename_prepare(mdr, &le->metablob, srcdn, destdn, straydn); - - // -- commit locally -- - C_MDS_rename_finish *fin = new C_MDS_rename_finish(mds, mdr, srcdn, destdn, straydn); - - journal_opens(); // journal pending opens, just in case - - // mark committing (needed for proper recovery) - mdr->committing = true; - - // log + wait - mdlog->submit_entry(le, fin); -} - - 
-void Server::_rename_finish(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn) -{ - dout(10) << "_rename_finish " << *mdr << dendl; - - // apply - _rename_apply(mdr, srcdn, destdn, straydn); - - // commit anchor updates? - if (mdr->more()->src_reanchor_atid) - mds->anchorclient->commit(mdr->more()->src_reanchor_atid, mdr->ls); - if (mdr->more()->dst_reanchor_atid) - mds->anchorclient->commit(mdr->more()->dst_reanchor_atid, mdr->ls); - - // bump popularity - //if (srcdn->is_auth()) - //mds->balancer->hit_dir(mdr->now, srcdn->get_dir(), META_POP_DWR); - // mds->balancer->hit_dir(mdr->now, destdn->get_dir(), META_POP_DWR); - if (destdn->is_remote() && - destdn->inode->is_auth()) - mds->balancer->hit_inode(mdr->now, destdn->get_inode(), META_POP_IWR); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply_request(mdr, reply, destdn->get_inode()); // FIXME: imprecise ref - - // clean up? - if (straydn) - mdcache->eval_stray(straydn); -} - - - -// helpers - -void Server::_rename_prepare_witness(MDRequest *mdr, int who, CDentry *srcdn, CDentry *destdn, CDentry *straydn) -{ - dout(10) << "_rename_prepare_witness mds" << who << dendl; - MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREP); - srcdn->make_path(req->srcdnpath); - destdn->make_path(req->destdnpath); - req->now = mdr->now; - - if (straydn) { - CInodeDiscover *indis = straydn->dir->inode->replicate_to(who); - CDirDiscover *dirdis = straydn->dir->replicate_to(who); - CDentryDiscover *dndis = straydn->replicate_to(who); - indis->_encode(req->stray); - dirdis->_encode(req->stray); - dndis->_encode(req->stray); - delete indis; - delete dirdis; - delete dndis; - } - - // srcdn auth will verify our current witness list is sufficient - req->witnesses = mdr->more()->witnessed; - - mds->send_message_mds(req, who, MDS_PORT_SERVER); - - assert(mdr->more()->waiting_on_slave.count(who) == 0); - mdr->more()->waiting_on_slave.insert(who); -} - 
- -void Server::_rename_prepare(MDRequest *mdr, - EMetaBlob *metablob, - CDentry *srcdn, CDentry *destdn, CDentry *straydn) -{ - dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl; - - // primary+remote link merge? - bool linkmerge = (srcdn->inode == destdn->inode && - (srcdn->is_primary() || destdn->is_primary())); - - if (mdr->is_master()) { - mdr->more()->pvmap[destdn->dir->inode] = predirty_dn_diri(mdr, destdn, metablob); - if (destdn->dir != srcdn->dir) - mdr->more()->pvmap[srcdn->dir->inode] = predirty_dn_diri(mdr, srcdn, metablob); - } - - inode_t *ji = 0; // journaled inode getting nlink-- - version_t ipv = 0; // it's version - - if (linkmerge) { - dout(10) << "will merge remote+primary links" << dendl; - - // destdn -> primary - metablob->add_dir_context(destdn->dir); - if (destdn->is_auth()) - ipv = mdr->more()->pvmap[destdn] = destdn->pre_dirty(destdn->inode->inode.version); - ji = metablob->add_primary_dentry(destdn, true, destdn->inode); - - // do src dentry - metablob->add_dir_context(srcdn->dir); - if (srcdn->is_auth()) - mdr->more()->pvmap[srcdn] = srcdn->pre_dirty(); - metablob->add_null_dentry(srcdn, true); - - } else { - // move to stray? - if (destdn->is_primary()) { - // primary. we'll move inode to stray dir. - assert(straydn); - - // link-- inode, move to stray dir. - metablob->add_dir_context(straydn->dir); - if (straydn->is_auth()) - ipv = mdr->more()->pvmap[straydn] = straydn->pre_dirty(destdn->inode->inode.version); - ji = metablob->add_primary_dentry(straydn, true, destdn->inode); - } - else if (destdn->is_remote()) { - // remote. 
- // nlink-- targeti - metablob->add_dir_context(destdn->inode->get_parent_dir()); - if (destdn->inode->is_auth()) - ipv = mdr->more()->pvmap[destdn->inode] = destdn->inode->pre_dirty(); - ji = metablob->add_primary_dentry(destdn->inode->parent, true, destdn->inode); // update primary - dout(10) << "remote targeti (nlink--) is " << *destdn->inode << dendl; - } - else { - assert(destdn->is_null()); - } - - // add dest dentry - metablob->add_dir_context(destdn->dir); - if (srcdn->is_primary()) { - dout(10) << "src is a primary dentry" << dendl; - if (destdn->is_auth()) { - version_t siv; - if (srcdn->is_auth()) - siv = srcdn->inode->get_projected_version(); - else - siv = mdr->more()->inode_import_v; - mdr->more()->pvmap[destdn] = destdn->pre_dirty(siv+1); - } - metablob->add_primary_dentry(destdn, true, srcdn->inode); - - } else { - assert(srcdn->is_remote()); - dout(10) << "src is a remote dentry" << dendl; - if (destdn->is_auth()) - mdr->more()->pvmap[destdn] = destdn->pre_dirty(); - metablob->add_remote_dentry(destdn, true, srcdn->get_remote_ino()); - } - - // remove src dentry - metablob->add_dir_context(srcdn->dir); - if (srcdn->is_auth()) - mdr->more()->pvmap[srcdn] = srcdn->pre_dirty(); - metablob->add_null_dentry(srcdn, true); - - // new subtree? - if (srcdn->is_primary() && - srcdn->inode->is_dir()) { - list ls; - srcdn->inode->get_nested_dirfrags(ls); - int auth = srcdn->authority().first; - for (list::iterator p = ls.begin(); p != ls.end(); ++p) - mdcache->adjust_subtree_auth(*p, auth, auth); - } - } - - if (ipv) { - // update journaled target inode - inode_t *pi = destdn->inode->project_inode(); - pi->nlink--; - pi->ctime = mdr->now; - pi->version = ipv; - *ji = *pi; // copy into journal - } - - // anchor updates? 
- if (mdr->more()->src_reanchor_atid) - metablob->add_anchor_transaction(mdr->more()->src_reanchor_atid); - if (mdr->more()->dst_reanchor_atid) - metablob->add_anchor_transaction(mdr->more()->dst_reanchor_atid); -} - - -void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn) -{ - dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl; - dout(10) << " pvs " << mdr->more()->pvmap << dendl; - - CInode *oldin = destdn->inode; - - // primary+remote link merge? - bool linkmerge = (srcdn->inode == destdn->inode && - (srcdn->is_primary() || destdn->is_primary())); - - // dir mtimes - if (mdr->is_master()) { - dirty_dn_diri(mdr, destdn, mdr->more()->pvmap[destdn->dir->inode]); - if (destdn->dir != srcdn->dir) - dirty_dn_diri(mdr, srcdn, mdr->more()->pvmap[srcdn->dir->inode]); - } - - if (linkmerge) { - if (destdn->is_primary()) { - dout(10) << "merging remote onto primary link" << dendl; - - // nlink-- in place - destdn->inode->inode.nlink--; - destdn->inode->inode.ctime = mdr->now; - if (destdn->inode->is_auth()) - destdn->inode->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls); - - // unlink srcdn - srcdn->dir->unlink_inode(srcdn); - if (srcdn->is_auth()) - srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls); - } else { - dout(10) << "merging primary onto remote link" << dendl; - assert(srcdn->is_primary()); - - // move inode to dest - srcdn->dir->unlink_inode(srcdn); - destdn->dir->unlink_inode(destdn); - destdn->dir->link_primary_inode(destdn, oldin); - - // nlink-- - destdn->inode->inode.nlink--; - destdn->inode->inode.ctime = mdr->now; - if (destdn->inode->is_auth()) - destdn->inode->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls); - - // mark src dirty - if (srcdn->is_auth()) - srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls); - } - } - else { - // unlink destdn? 
- if (!destdn->is_null()) - destdn->dir->unlink_inode(destdn); - - if (straydn) { - dout(10) << "straydn is " << *straydn << dendl; - - // relink oldin to stray dir. destdn was primary. - assert(oldin); - straydn->dir->link_primary_inode(straydn, oldin); - //assert(straypv == ipv); - - // nlink-- in stray dir. - oldin->inode.nlink--; - oldin->inode.ctime = mdr->now; - if (oldin->is_auth()) - oldin->pop_and_dirty_projected_inode(mdr->ls); - } - else if (oldin) { - // nlink-- remote. destdn was remote. - oldin->inode.nlink--; - oldin->inode.ctime = mdr->now; - if (oldin->is_auth()) - oldin->pop_and_dirty_projected_inode(mdr->ls); - } - - CInode *in = srcdn->inode; - assert(in); - if (srcdn->is_remote()) { - // srcdn was remote. - srcdn->dir->unlink_inode(srcdn); - destdn->dir->link_remote_inode(destdn, in->ino(), MODE_TO_DT(in->inode.mode)); - destdn->link_remote(in); - if (destdn->is_auth()) - destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls); - } else { - // srcdn was primary. - srcdn->dir->unlink_inode(srcdn); - destdn->dir->link_primary_inode(destdn, in); - - // srcdn inode import? - if (!srcdn->is_auth() && destdn->is_auth()) { - assert(mdr->more()->inode_import.length() > 0); - bufferlist::iterator blp = mdr->more()->inode_import.begin(); - map imported_client_map; - list updated_scatterlocks; // we clear_updated explicitly below - ::_decode_simple(imported_client_map, blp); - mdcache->migrator->decode_import_inode(destdn, blp, - srcdn->authority().first, - imported_client_map, - mdr->ls, - updated_scatterlocks); - destdn->inode->dirlock.clear_updated(); - } - if (destdn->inode->is_auth()) - destdn->inode->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls); - } - - if (srcdn->is_auth()) - srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls); - } - - // update subtree map? - if (destdn->is_primary() && destdn->inode->is_dir()) - mdcache->adjust_subtree_after_rename(destdn->inode, srcdn->dir); - - // removing a new dn? 
- if (srcdn->is_auth()) - srcdn->dir->try_remove_unlinked_dn(srcdn); -} - - - - - -// ------------ -// SLAVE - -class C_MDS_SlaveRenamePrep : public Context { - Server *server; - MDRequest *mdr; - CDentry *srcdn, *destdn, *straydn; -public: - C_MDS_SlaveRenamePrep(Server *s, MDRequest *m, CDentry *sr, CDentry *de, CDentry *st) : - server(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {} - void finish(int r) { - server->_logged_slave_rename(mdr, srcdn, destdn, straydn); - } -}; - -class C_MDS_SlaveRenameCommit : public Context { - Server *server; - MDRequest *mdr; - CDentry *srcdn, *destdn, *straydn; -public: - C_MDS_SlaveRenameCommit(Server *s, MDRequest *m, CDentry *sr, CDentry *de, CDentry *st) : - server(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {} - void finish(int r) { - server->_commit_slave_rename(mdr, r, srcdn, destdn, straydn); - } -}; - -void Server::handle_slave_rename_prep(MDRequest *mdr) -{ - dout(10) << "handle_slave_rename_prep " << *mdr - << " " << mdr->slave_request->srcdnpath - << " to " << mdr->slave_request->destdnpath - << dendl; - - // discover destdn - filepath destpath(mdr->slave_request->destdnpath); - dout(10) << " dest " << destpath << dendl; - vector trace; - int r = mdcache->path_traverse(mdr, mdr->slave_request, - 0, destpath, trace, false, - MDS_TRAVERSE_DISCOVERXLOCK); - if (r > 0) return; - assert(r == 0); // we shouldn't get an error here! - - CDentry *destdn = trace[trace.size()-1]; - dout(10) << " destdn " << *destdn << dendl; - mdr->pin(destdn); - - - // discover srcdn - filepath srcpath(mdr->slave_request->srcdnpath); - dout(10) << " src " << srcpath << dendl; - r = mdcache->path_traverse(mdr, mdr->slave_request, - 0, srcpath, trace, false, - MDS_TRAVERSE_DISCOVERXLOCK); - if (r > 0) return; - assert(r == 0); // we shouldn't get an error here! - - CDentry *srcdn = trace[trace.size()-1]; - dout(10) << " srcdn " << *srcdn << dendl; - mdr->pin(srcdn); - assert(srcdn->inode); - mdr->pin(srcdn->inode); - - // stray? 
- CDentry *straydn = 0; - if (destdn->is_primary()) { - assert(mdr->slave_request->stray.length() > 0); - straydn = mdcache->add_replica_stray(mdr->slave_request->stray, - destdn->inode, mdr->slave_to_mds); - assert(straydn); - mdr->pin(straydn); - } - - mdr->now = mdr->slave_request->now; - - // set up commit waiter (early, to clean up any freezing etc we do) - if (!mdr->more()->slave_commit) - mdr->more()->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn); - - // am i srcdn auth? - if (srcdn->is_auth()) { - if (srcdn->is_primary() && - !srcdn->inode->is_freezing_inode() && - !srcdn->inode->is_frozen_inode()) { - // srci auth. - // set ambiguous auth. - srcdn->inode->state_set(CInode::STATE_AMBIGUOUSAUTH); - - // freeze? - // we need this to - // - avoid conflicting lock state changes - // - avoid concurrent updates to the inode - // (this could also be accomplished with the versionlock) - int allowance = 1; // for the versionlock and possible linklock xlock (both are tied to mdr) - dout(10) << " freezing srci " << *srcdn->inode << " with allowance " << allowance << dendl; - if (!srcdn->inode->freeze_inode(allowance)) { - srcdn->inode->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - } - - // is witness list sufficient? - set srcdnrep; - srcdn->list_replicas(srcdnrep); - for (set::iterator p = srcdnrep.begin(); - p != srcdnrep.end(); - ++p) { - if (*p == mdr->slave_to_mds || - mdr->slave_request->witnesses.count(*p)) continue; - dout(10) << " witness list insufficient; providing srcdn replica list" << dendl; - MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREPACK); - reply->witnesses.swap(srcdnrep); - mds->send_message_mds(reply, mdr->slave_to_mds, MDS_PORT_SERVER); - delete mdr->slave_request; - mdr->slave_request = 0; - return; - } - dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl; - } - - // journal it? 
- if (srcdn->is_auth() || - (destdn->inode && destdn->inode->is_auth()) || - srcdn->inode->is_any_caps()) { - // journal. - mdr->ls = mdlog->get_current_segment(); - ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE); - - // rollback case - if (destdn->inode && destdn->inode->is_auth()) { - assert(destdn->is_remote()); - le->rollback.add_dir_context(destdn->dir); - le->rollback.add_dentry(destdn, true); - } - if (srcdn->is_auth() || - (srcdn->inode && srcdn->inode->is_auth())) { - le->rollback.add_dir_context(srcdn->dir); - le->rollback.add_dentry(srcdn, true); - } - - // commit case - _rename_prepare(mdr, &le->commit, srcdn, destdn, straydn); - - mdlog->submit_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn)); - } else { - // don't journal. - dout(10) << "not journaling, i'm not auth for anything, and srci isn't open" << dendl; - - // prepare anyway; this may twiddle dir_auth - EMetaBlob blah; - _rename_prepare(mdr, &blah, srcdn, destdn, straydn); - _logged_slave_rename(mdr, srcdn, destdn, straydn); - } -} - -void Server::_logged_slave_rename(MDRequest *mdr, - CDentry *srcdn, CDentry *destdn, CDentry *straydn) -{ - dout(10) << "_logged_slave_rename " << *mdr << dendl; - - // prepare ack - MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREPACK); - - // export srci? - if (srcdn->is_auth() && srcdn->is_primary()) { - list finished; - map exported_client_map; - bufferlist inodebl; - mdcache->migrator->encode_export_inode(srcdn->inode, inodebl, - exported_client_map); - mdcache->migrator->finish_export_inode(srcdn->inode, mdr->now, finished); - mds->queue_waiters(finished); // this includes SINGLEAUTH waiters. 
- ::_encode(exported_client_map, reply->inode_export); - reply->inode_export.claim_append(inodebl); - reply->inode_export_v = srcdn->inode->inode.version; - - // remove mdr auth pin - mdr->auth_unpin(srcdn->inode); - assert(!srcdn->inode->is_auth_pinned()); - - dout(10) << " exported srci " << *srcdn->inode << dendl; - } - - // apply - _rename_apply(mdr, srcdn, destdn, straydn); - - mds->send_message_mds(reply, mdr->slave_to_mds, MDS_PORT_SERVER); - - // bump popularity - //if (srcdn->is_auth()) - //mds->balancer->hit_dir(mdr->now, srcdn->get_dir(), META_POP_DWR); - if (destdn->inode && destdn->inode->is_auth()) - mds->balancer->hit_inode(mdr->now, destdn->inode, META_POP_IWR); - - // done. - delete mdr->slave_request; - mdr->slave_request = 0; -} - -void Server::_commit_slave_rename(MDRequest *mdr, int r, - CDentry *srcdn, CDentry *destdn, CDentry *straydn) -{ - dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl; - - // unfreeze+singleauth inode - // hmm, do i really need to delay this? 
- if (srcdn->is_auth() && destdn->is_primary()) { - dout(10) << " unfreezing exported inode " << *destdn->inode << dendl; - list finished; - - // singleauth - assert(destdn->inode->state_test(CInode::STATE_AMBIGUOUSAUTH)); - destdn->inode->state_clear(CInode::STATE_AMBIGUOUSAUTH); - destdn->inode->take_waiting(CInode::WAIT_SINGLEAUTH, finished); - - // unfreeze - assert(destdn->inode->is_frozen_inode() || - destdn->inode->is_freezing_inode()); - destdn->inode->unfreeze_inode(finished); - - mds->queue_waiters(finished); - } - - - ESlaveUpdate *le; - if (r == 0) { - // write a commit to the journal - le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT); - - } else { - // abort - le = new ESlaveUpdate(mdlog, "slave_rename_abort", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_ROLLBACK); - - // -- rollback in memory -- - - if (mdr->more()->was_link_merge) { - // link merge - CInode *in = destdn->inode; - in->inode.nlink++; - if (mdr->more()->destdn_was_remote_inode) { - destdn->dir->unlink_inode(destdn); - srcdn->dir->link_primary_inode(srcdn, in); - destdn->dir->link_remote_inode(destdn, in->ino(), MODE_TO_DT(in->inode.mode)); - } else { - srcdn->dir->link_remote_inode(srcdn, in->ino(), MODE_TO_DT(in->inode.mode)); - } - } else { - // normal - - // revert srcdn - if (destdn->is_remote()) { - srcdn->dir->link_remote_inode(srcdn, destdn->inode->ino(), MODE_TO_DT(destdn->inode->inode.mode)); - destdn->dir->unlink_inode(destdn); - } else { - // renamed a primary - CInode *in = destdn->inode; - destdn->dir->unlink_inode(destdn); - srcdn->dir->link_primary_inode(srcdn, in); - } - - // revert destdn - if (mdr->more()->destdn_was_remote_inode) { - destdn->dir->link_remote_inode(destdn, - mdr->more()->destdn_was_remote_inode->ino(), - MODE_TO_DT(mdr->more()->destdn_was_remote_inode->inode.mode)); - mdr->more()->destdn_was_remote_inode->inode.nlink++; - } else if (straydn && straydn->inode) { - CInode *in = straydn->inode; 
- straydn->dir->unlink_inode(straydn); - destdn->dir->link_primary_inode(destdn, in); - straydn->dir->remove_dentry(straydn); - } - } - - // FIXME: reverse srci export? - - dout(-10) << " srcdn back to " << *srcdn << dendl; - dout(-10) << " srci back to " << *srcdn->inode << dendl; - dout(-10) << " destdn back to " << *destdn << dendl; - if (destdn->inode) dout(-10) << " desti back to " << *destdn->inode << dendl; - - // *** WRITE ME *** - assert(0); - - } - - - - mdlog->submit_entry(le); -} - -void Server::handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *ack) -{ - dout(10) << "handle_slave_rename_prep_ack " << *mdr - << " witnessed by " << ack->get_source() - << " " << *ack << dendl; - int from = ack->get_source().num(); - - // note slave - mdr->more()->slaves.insert(from); - - // witnessed? or add extra witnesses? - assert(mdr->more()->witnessed.count(from) == 0); - if (ack->witnesses.empty()) { - mdr->more()->witnessed.insert(from); - } else { - dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl; - mdr->more()->extra_witnesses.swap(ack->witnesses); - mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me! - } - - // srci import? - if (ack->inode_export.length()) { - dout(10) << " got srci import" << dendl; - mdr->more()->inode_import.claim(ack->inode_export); - mdr->more()->inode_import_v = ack->inode_export_v; - } - - // remove from waiting list - assert(mdr->more()->waiting_on_slave.count(from)); - mdr->more()->waiting_on_slave.erase(from); - - if (mdr->more()->waiting_on_slave.empty()) - dispatch_client_request(mdr); // go again! 
- else - dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl; -} - - - - - -// =================================== -// TRUNCATE, FSYNC - -class C_MDS_truncate_purged : public Context { - MDS *mds; - MDRequest *mdr; - CInode *in; - version_t pv; - off_t size; - utime_t ctime; -public: - C_MDS_truncate_purged(MDS *m, MDRequest *r, CInode *i, version_t pdv, off_t sz, utime_t ct) : - mds(m), mdr(r), in(i), - pv(pdv), - size(sz), ctime(ct) { } - void finish(int r) { - assert(r == 0); - - // apply to cache - in->inode.size = size; - in->inode.ctime = ctime; - in->inode.mtime = ctime; - in->mark_dirty(pv, mdr->ls); - - // reply - mds->server->reply_request(mdr, 0); - } -}; - -class C_MDS_truncate_logged : public Context { - MDS *mds; - MDRequest *mdr; - CInode *in; - version_t pv; - off_t size; - utime_t ctime; -public: - C_MDS_truncate_logged(MDS *m, MDRequest *r, CInode *i, version_t pdv, off_t sz, utime_t ct) : - mds(m), mdr(r), in(i), - pv(pdv), - size(sz), ctime(ct) { } - void finish(int r) { - assert(r == 0); - - // purge - mds->mdcache->purge_inode(in, size, in->inode.size, mdr->ls); - mds->mdcache->wait_for_purge(in, size, - new C_MDS_truncate_purged(mds, mdr, in, pv, size, ctime)); - } -}; - -void Server::handle_client_truncate(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - CInode *cur = rdlock_path_pin_ref(mdr, true); - if (!cur) return; - - // check permissions? - - // xlock inode - set rdlocks = mdr->rdlocks; - set wrlocks = mdr->wrlocks; - set xlocks = mdr->xlocks; - xlocks.insert(&cur->filelock); - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - // already small enough? 
- if (cur->inode.size <= req->args.truncate.length) { - reply_request(mdr, 0); - return; - } - - // prepare - version_t pdv = cur->pre_dirty(); - utime_t ctime = g_clock.real_now(); - Context *fin = new C_MDS_truncate_logged(mds, mdr, cur, - pdv, req->args.truncate.length, ctime); - - // log + wait - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "truncate"); - le->metablob.add_client_req(mdr->reqid); - le->metablob.add_dir_context(cur->get_parent_dir()); - le->metablob.add_inode_truncate(cur->ino(), req->args.truncate.length, cur->inode.size); - inode_t *pi = le->metablob.add_dentry(cur->parent, true); - pi->mtime = ctime; - pi->ctime = ctime; - pi->version = pdv; - pi->size = req->args.truncate.length; - - - mdlog->submit_entry(le, fin); -} - - -// =========================== -// open, openc, close - -void Server::handle_client_open(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - int flags = req->args.open.flags; - int cmode = req->get_open_file_mode(); - bool need_auth = ((cmode != FILE_MODE_R && cmode != FILE_MODE_LAZY) || - (flags & O_TRUNC)); - dout(10) << "open flags = " << flags - << ", filemode = " << cmode - << ", need_auth = " << need_auth - << dendl; - - CInode *cur = rdlock_path_pin_ref(mdr, need_auth); - if (!cur) return; - - // regular file? - if ((cur->inode.mode & INODE_TYPE_MASK) != INODE_MODE_FILE) { - dout(7) << "not a regular file " << *cur << dendl; - reply_request(mdr, -EINVAL); // FIXME what error do we want? - return; - } - - // hmm, check permissions or something. 
- - - // O_TRUNC - if (flags & O_TRUNC) { - assert(cur->is_auth()); - - // xlock file size - set rdlocks = mdr->rdlocks; - set wrlocks = mdr->wrlocks; - set xlocks = mdr->xlocks; - xlocks.insert(&cur->filelock); - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - if (cur->inode.size > 0) { - handle_client_opent(mdr); - return; - } - } - - // do it - _do_open(mdr, cur); -} - -void Server::_do_open(MDRequest *mdr, CInode *cur) -{ - MClientRequest *req = mdr->client_request; - int cmode = req->get_open_file_mode(); - - // can we issue the caps they want? - version_t fdv = mds->locker->issue_file_data_version(cur); - Capability *cap = mds->locker->issue_new_caps(cur, cmode, req); - if (!cap) return; // can't issue (yet), so wait! - - dout(12) << "_do_open issuing caps " << cap_string(cap->pending()) - << " for " << req->get_source() - << " on " << *cur << dendl; - - // hit pop - mdr->now = g_clock.now(); - if (cmode == FILE_MODE_RW || - cmode == FILE_MODE_W) - mds->balancer->hit_inode(mdr->now, cur, META_POP_IWR); - else - mds->balancer->hit_inode(mdr->now, cur, META_POP_IRD, - mdr->client_request->get_client_inst().name.num()); - - // reply - MClientReply *reply = new MClientReply(req, 0); - reply->set_file_caps(cap->pending()); - reply->set_file_caps_seq(cap->get_last_seq()); - reply->set_file_data_version(fdv); - reply_request(mdr, reply, cur); - - // journal? 
- if (cur->last_open_journaled == 0) { - queue_journal_open(cur); - maybe_journal_opens(); - } - -} - -void Server::queue_journal_open(CInode *in) -{ - dout(10) << "queue_journal_open on " << *in << dendl; - - if (journal_open_queue.count(in) == 0) { - // pin so our pointer stays valid - in->get(CInode::PIN_BATCHOPENJOURNAL); - - // queue it up for a bit - journal_open_queue.insert(in); - } -} - - -void Server::journal_opens() -{ - dout(10) << "journal_opens " << journal_open_queue.size() << " inodes" << dendl; - if (journal_open_queue.empty()) return; - - EOpen *le = 0; - - // check queued inodes - LogSegment *ls = mdlog->get_current_segment(); - for (set::iterator p = journal_open_queue.begin(); - p != journal_open_queue.end(); - ++p) { - CInode *in = *p; - in->put(CInode::PIN_BATCHOPENJOURNAL); - if (in->is_any_caps()) { - if (!le) le = new EOpen(mdlog); - le->add_inode(in); - in->last_open_journaled = mds->mdlog->get_write_pos(); - ls->open_files.push_back(&in->xlist_open_file); - } - } - journal_open_queue.clear(); - - if (le) { - // journal - mdlog->submit_entry(le); - - // add waiters to journal entry - for (list::iterator p = journal_open_waiters.begin(); - p != journal_open_waiters.end(); - ++p) - mds->mdlog->wait_for_sync(*p); - journal_open_waiters.clear(); - } else { - // nothing worth journaling here, just kick the waiters. 
- mds->queue_waiters(journal_open_waiters); - } -} - - - - -class C_MDS_open_truncate_purged : public Context { - MDS *mds; - MDRequest *mdr; - CInode *in; - version_t pv; - utime_t ctime; -public: - C_MDS_open_truncate_purged(MDS *m, MDRequest *r, CInode *i, version_t pdv, utime_t ct) : - mds(m), mdr(r), in(i), - pv(pdv), - ctime(ct) { } - void finish(int r) { - assert(r == 0); - - // apply to cache - in->inode.size = 0; - in->inode.ctime = ctime; - in->inode.mtime = ctime; - in->mark_dirty(pv, mdr->ls); - - // do the open - mds->server->_do_open(mdr, in); - } -}; - -class C_MDS_open_truncate_logged : public Context { - MDS *mds; - MDRequest *mdr; - CInode *in; - version_t pv; - utime_t ctime; -public: - C_MDS_open_truncate_logged(MDS *m, MDRequest *r, CInode *i, version_t pdv, utime_t ct) : - mds(m), mdr(r), in(i), - pv(pdv), - ctime(ct) { } - void finish(int r) { - assert(r == 0); - - // hit pop - mds->balancer->hit_inode(mdr->now, in, META_POP_IWR); - - // purge also... - mds->mdcache->purge_inode(in, 0, in->inode.size, mdr->ls); - mds->mdcache->wait_for_purge(in, 0, - new C_MDS_open_truncate_purged(mds, mdr, in, pv, ctime)); - } -}; - - -void Server::handle_client_opent(MDRequest *mdr) -{ - CInode *cur = mdr->ref; - assert(cur); - - // prepare - version_t pdv = cur->pre_dirty(); - utime_t ctime = g_clock.real_now(); - Context *fin = new C_MDS_open_truncate_logged(mds, mdr, cur, - pdv, ctime); - - // log + wait - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "open_truncate"); - le->metablob.add_client_req(mdr->reqid); - le->metablob.add_dir_context(cur->get_parent_dir()); - le->metablob.add_inode_truncate(cur->ino(), 0, cur->inode.size); - inode_t *pi = le->metablob.add_dentry(cur->parent, true); - pi->mtime = ctime; - pi->ctime = ctime; - pi->version = pdv; - pi->size = 0; - - mdlog->submit_entry(le, fin); -} - - - -class C_MDS_openc_finish : public Context { - MDS *mds; - MDRequest *mdr; - CDentry *dn; - CInode *newi; - version_t 
pv; -public: - C_MDS_openc_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ni) : - mds(m), mdr(r), dn(d), newi(ni), - pv(d->get_projected_version()) {} - void finish(int r) { - assert(r == 0); - - // link the inode - dn->get_dir()->link_primary_inode(dn, newi); - - // dirty inode, dn, dir - newi->mark_dirty(pv, mdr->ls); - - // downgrade xlock to rdlock - //mds->locker->dentry_xlock_downgrade_to_rdlock(dn, mdr); - - // set/pin ref inode for open() - mdr->ref = newi; - mdr->pin(newi); - - // ok, do the open. - mds->server->handle_client_open(mdr); - } -}; - - -void Server::handle_client_openc(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl; - - bool excl = (req->args.open.flags & O_EXCL); - CDentry *dn = rdlock_path_xlock_dentry(mdr, !excl, false); - if (!dn) return; - - if (!dn->is_null()) { - // it existed. - if (req->args.open.flags & O_EXCL) { - dout(10) << "O_EXCL, target exists, failing with -EEXIST" << dendl; - reply_request(mdr, -EEXIST, dn->get_dir()->get_inode()); - return; - } - - // pass to regular open handler. - handle_client_open(mdr); - return; - } - - // created null dn. - - // create inode. - mdr->now = g_clock.real_now(); - CInode *in = prepare_new_inode(mdr, dn->dir); - assert(in); - - // it's a file. - in->inode.mode = req->args.open.mode; - in->inode.mode |= INODE_MODE_FILE; - in->inode.version = dn->pre_dirty() - 1; - - // prepare finisher - C_MDS_openc_finish *fin = new C_MDS_openc_finish(mds, mdr, dn, in); - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "openc"); - le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_allocated_ino(in->ino(), mds->idalloc->get_version()); - le->metablob.add_dir_context(dn->dir); - le->metablob.add_primary_dentry(dn, true, in, &in->inode); - - // log + wait - mdlog->submit_entry(le, fin); - - /* - FIXME. 
this needs to be rewritten when the write capability stuff starts - getting journaled. - */ -} - - - - - - - - - - - - - - diff --git a/branches/sage/crush/mds/Server.h b/branches/sage/crush/mds/Server.h deleted file mode 100644 index 281fd13ca2593..0000000000000 --- a/branches/sage/crush/mds/Server.h +++ /dev/null @@ -1,184 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_SERVER_H -#define __MDS_SERVER_H - -#include "MDS.h" - -class Logger; -class LogEvent; -class C_MDS_rename_finish; -class MDRequest; -class EMetaBlob; -class PVList; -class MMDSSlaveRequest; - -class Server { - MDS *mds; - MDCache *mdcache; - MDLog *mdlog; - Messenger *messenger; - Logger *logger; - -public: - Server(MDS *m) : - mds(m), - mdcache(mds->mdcache), mdlog(mds->mdlog), - messenger(mds->messenger), - logger(0) { - } - ~Server() { - delete logger; - } - - void reopen_logger(utime_t start, bool append); - - // message handler - void dispatch(Message *m); - - - // -- sessions and recovery -- - utime_t reconnect_start; - set client_reconnect_gather; // clients i need a reconnect msg from. 
- set reconnected_caps; - - void handle_client_session(class MClientSession *m); - void _session_logged(entity_inst_t ci, bool open, version_t cmapv); - void terminate_sessions(); - void reconnect_clients(); - void handle_client_reconnect(class MClientReconnect *m); - void process_reconnect_cap(CInode *in, int from, inode_caps_reconnect_t& capinfo); - void add_reconnected_cap_inode(CInode *in) { - reconnected_caps.insert(in); - } - void process_reconnected_caps(); - void client_reconnect_failure(int from); - void reconnect_gather_finish(); - - - // -- requests -- - void handle_client_request(MClientRequest *m); - - void dispatch_client_request(MDRequest *mdr); - void reply_request(MDRequest *mdr, int r = 0, CInode *tracei = 0); - void reply_request(MDRequest *mdr, MClientReply *reply, CInode *tracei); - - void handle_slave_request(MMDSSlaveRequest *m); - void dispatch_slave_request(MDRequest *mdr); - void handle_slave_auth_pin(MDRequest *mdr); - void handle_slave_auth_pin_ack(MDRequest *mdr, MMDSSlaveRequest *ack); - - // some helpers - CDir *validate_dentry_dir(MDRequest *mdr, CInode *diri, const string& dname); - CDir *traverse_to_auth_dir(MDRequest *mdr, vector &trace, filepath refpath); - CDentry *prepare_null_dentry(MDRequest *mdr, CDir *dir, const string& dname, bool okexist=false); - CInode* prepare_new_inode(MDRequest *mdr, CDir *dir); - - CInode* rdlock_path_pin_ref(MDRequest *mdr, bool want_auth); - CDentry* rdlock_path_xlock_dentry(MDRequest *mdr, bool okexist, bool mustexist); - - CDir* try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequest *mdr); - - version_t predirty_dn_diri(MDRequest *mdr, CDentry *dn, class EMetaBlob *blob); - void dirty_dn_diri(MDRequest *mdr, CDentry *dn, version_t dirpv); - - - // requests on existing inodes. 
- void handle_client_stat(MDRequest *mdr); - void handle_client_utime(MDRequest *mdr); - void handle_client_chmod(MDRequest *mdr); - void handle_client_chown(MDRequest *mdr); - void handle_client_readdir(MDRequest *mdr); - void handle_client_truncate(MDRequest *mdr); - void handle_client_fsync(MDRequest *mdr); - - // open - void handle_client_open(MDRequest *mdr); - void handle_client_openc(MDRequest *mdr); // O_CREAT variant. - void handle_client_opent(MDRequest *mdr); // O_TRUNC variant. - void _do_open(MDRequest *mdr, CInode *ref); - - set journal_open_queue; // to be journal - list journal_open_waiters; - void queue_journal_open(CInode *in); - void add_journal_open_waiter(Context *c) { - journal_open_waiters.push_back(c); - } - void maybe_journal_opens() { - if (journal_open_queue.size() >= (unsigned)g_conf.mds_log_eopen_size) - journal_opens(); - } - void journal_opens(); - - // namespace changes - void handle_client_mknod(MDRequest *mdr); - void handle_client_mkdir(MDRequest *mdr); - void handle_client_symlink(MDRequest *mdr); - - // link - void handle_client_link(MDRequest *mdr); - void _link_local(MDRequest *mdr, CDentry *dn, CInode *targeti); - void _link_local_finish(MDRequest *mdr, - CDentry *dn, CInode *targeti, - version_t, version_t, version_t); - - void _link_remote(MDRequest *mdr, CDentry *dn, CInode *targeti); - void _link_remote_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, - version_t, version_t); - - void handle_slave_link_prep(MDRequest *mdr); - void _logged_slave_link(MDRequest *mdr, CInode *targeti, utime_t old_ctime, bool inc); - void _commit_slave_link(MDRequest *mdr, int r, CInode *targeti, - utime_t old_ctime, version_t old_version, bool inc); - void handle_slave_link_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m); - - // unlink - void handle_client_unlink(MDRequest *mdr); - bool _verify_rmdir(MDRequest *mdr, CInode *rmdiri); - void _unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn); - void _unlink_local_finish(MDRequest 
*mdr, - CDentry *dn, CDentry *straydn, - version_t, version_t); - - void _unlink_remote(MDRequest *mdr, CDentry *dn); - void _unlink_remote_finish(MDRequest *mdr, - CDentry *dn, - version_t, version_t); - - // rename - void handle_client_rename(MDRequest *mdr); - void _rename_finish(MDRequest *mdr, - CDentry *srcdn, CDentry *destdn, CDentry *straydn); - - // helpers - void _rename_prepare_witness(MDRequest *mdr, int who, - CDentry *srcdn, CDentry *destdn, CDentry *straydn); - void _rename_prepare(MDRequest *mdr, - EMetaBlob *metablob, - CDentry *srcdn, CDentry *destdn, CDentry *straydn); - void _rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn); - - // slaving - void handle_slave_rename_prep(MDRequest *mdr); - void handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m); - void _logged_slave_rename(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn); - void _commit_slave_rename(MDRequest *mdr, int r, CDentry *srcdn, CDentry *destdn, CDentry *straydn); - -}; - - - - -#endif diff --git a/branches/sage/crush/mds/events/EImportStart.h b/branches/sage/crush/mds/events/EImportStart.h deleted file mode 100644 index aa1902576542d..0000000000000 --- a/branches/sage/crush/mds/events/EImportStart.h +++ /dev/null @@ -1,61 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __EIMPORTSTART_H -#define __EIMPORTSTART_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../MDS.h" - -#include "EMetaBlob.h" - -class EImportStart : public LogEvent { -protected: - dirfrag_t base; - list bounds; - - public: - EMetaBlob metablob; - - EImportStart(dirfrag_t di, - list& b) : LogEvent(EVENT_IMPORTSTART), - base(di), bounds(b) { } - EImportStart() : LogEvent(EVENT_IMPORTSTART) { } - - void print(ostream& out) { - out << "EImportStart " << base << " " << metablob; - } - - virtual void encode_payload(bufferlist& bl) { - bl.append((char*)&base, sizeof(base)); - metablob._encode(bl); - ::_encode(bounds, bl); - } - void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(base), (char*)&base); - off += sizeof(base); - metablob._decode(bl, off); - ::_decode(bounds, bl, off); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/branches/sage/crush/mds/events/ESession.h b/branches/sage/crush/mds/events/ESession.h deleted file mode 100644 index a8f9992486a18..0000000000000 --- a/branches/sage/crush/mds/events/ESession.h +++ /dev/null @@ -1,64 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_ESESSION_H -#define __MDS_ESESSION_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../LogEvent.h" - -class ESession : public LogEvent { - protected: - entity_inst_t client_inst; - bool open; // open or close - version_t cmapv; // client map version - - public: - ESession() : LogEvent(EVENT_SESSION) { } - ESession(entity_inst_t inst, bool o, version_t v) : - LogEvent(EVENT_SESSION), - client_inst(inst), - open(o), - cmapv(v) { - } - - void encode_payload(bufferlist& bl) { - ::_encode(client_inst, bl); - ::_encode(open, bl); - ::_encode(cmapv, bl); - } - void decode_payload(bufferlist& bl, int& off) { - ::_decode(client_inst, bl, off); - ::_decode(open, bl, off); - ::_decode(cmapv, bl, off); - } - - - void print(ostream& out) { - if (open) - out << "ESession " << client_inst << " open cmapv " << cmapv; - else - out << "ESession " << client_inst << " close cmapv " << cmapv; - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void update_segment(); - void replay(MDS *mds); -}; - -#endif diff --git a/branches/sage/crush/mds/events/EUpdate.h b/branches/sage/crush/mds/events/EUpdate.h deleted file mode 100644 index de965429f9bdd..0000000000000 --- a/branches/sage/crush/mds/events/EUpdate.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_EUPDATE_H -#define __MDS_EUPDATE_H - -#include "../LogEvent.h" -#include "EMetaBlob.h" - -class EUpdate : public LogEvent { -public: - EMetaBlob metablob; - string type; - - EUpdate() : LogEvent(EVENT_UPDATE) { } - EUpdate(MDLog *mdlog, const char *s) : - LogEvent(EVENT_UPDATE), metablob(mdlog), - type(s) { } - - void print(ostream& out) { - if (type.length()) - out << type << " "; - out << metablob; - } - - void encode_payload(bufferlist& bl) { - ::_encode(type, bl); - metablob._encode(bl); - } - void decode_payload(bufferlist& bl, int& off) { - ::_decode(type, bl, off); - metablob._decode(bl, off); - } - - void update_segment(); - void replay(MDS *mds); -}; - -#endif diff --git a/branches/sage/crush/mds/journal.cc b/branches/sage/crush/mds/journal.cc deleted file mode 100644 index 1f27cf713a078..0000000000000 --- a/branches/sage/crush/mds/journal.cc +++ /dev/null @@ -1,1084 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "events/EString.h" -#include "events/ESubtreeMap.h" -#include "events/ESession.h" - -#include "events/EMetaBlob.h" - -#include "events/EUpdate.h" -#include "events/ESlaveUpdate.h" -#include "events/EOpen.h" - -#include "events/EPurgeFinish.h" - -#include "events/EExport.h" -#include "events/EImportStart.h" -#include "events/EImportFinish.h" -#include "events/EFragment.h" - -#include "events/EAnchor.h" -#include "events/EAnchorClient.h" - -#include "LogSegment.h" - -#include "MDS.h" -#include "MDLog.h" -#include "MDCache.h" -#include "Server.h" -#include "Migrator.h" -#include "AnchorTable.h" -#include "AnchorClient.h" -#include "IdAllocator.h" -#include "Locker.h" - - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log || l <= g_conf.debug_mds_log_expire) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " -#define derr(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log || l <= g_conf.debug_mds_log_expire) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " - - -// ----------------------- -// LogSegment - -class C_MDL_RetryExpireSegment : public Context { -public: - MDS *mds; - LogSegment *ls; - C_MDL_RetryExpireSegment(MDS *m, LogSegment *l) : mds(m), ls(l) {} - void finish(int r) { - ls->try_to_expire(mds); - } -}; - -C_Gather *LogSegment::try_to_expire(MDS *mds) -{ - C_Gather *gather = 0; - - set commit; - - dout(6) << "LogSegment(" << offset << ").try_to_expire" << dendl; - - // commit dirs - for (xlist::iterator p = dirty_dirfrags.begin(); !p.end(); ++p) - commit.insert(*p); - for (xlist::iterator p = dirty_dentries.begin(); !p.end(); ++p) - commit.insert((*p)->get_dir()); - for (xlist::iterator p = dirty_inodes.begin(); !p.end(); ++p) - commit.insert((*p)->get_parent_dn()->get_dir()); - - if (!commit.empty()) { - if (!gather) gather = new C_Gather; - - for (set::iterator p = commit.begin(); - p != commit.end(); - ++p) { - CDir *dir = 
*p; - if (dir->can_auth_pin()) { - dout(15) << "try_to_expire committing " << *dir << dendl; - dir->commit(0, gather->new_sub()); - } else { - dout(15) << "try_to_expire waiting for unfreeze on " << *dir << dendl; - dir->add_waiter(CDir::WAIT_UNFREEZE, gather->new_sub()); - } - } - } - - // dirty non-auth mtimes - for (xlist::iterator p = dirty_inode_mtimes.begin(); !p.end(); ++p) { - CInode *in = *p; - dout(10) << "try_to_expire waiting for dirlock mtime flush on " << *in << dendl; - if (!gather) gather = new C_Gather; - - if (in->is_ambiguous_auth()) { - dout(10) << " waiting for single auth on " << *in << dendl; - in->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, gather->new_sub()); - } else if (in->is_auth()) { - dout(10) << " i'm auth, unscattering dirlock on " << *in << dendl; - assert(in->is_replicated()); // hrm! - mds->locker->scatter_lock(&in->dirlock); - in->dirlock.add_waiter(SimpleLock::WAIT_STABLE, gather->new_sub()); - } else { - dout(10) << " i'm a replica, requesting dirlock unscatter of " << *in << dendl; - mds->locker->scatter_try_unscatter(&in->dirlock, gather->new_sub()); - } - //(*p)->dirlock.add_waiter(SimpleLock::WAIT_STABLE, gather->new_sub()); - } - - // open files - if (!open_files.empty()) { - assert(!mds->mdlog->is_capped()); // hmm FIXME - for (xlist::iterator p = open_files.begin(); !p.end(); ++p) { - dout(20) << "try_to_expire requeueing open file " << **p << dendl; - mds->server->queue_journal_open(*p); - } - if (!gather) gather = new C_Gather; - mds->server->add_journal_open_waiter(gather->new_sub()); - mds->server->maybe_journal_opens(); - dout(10) << "try_to_expire waiting for open files to rejournal" << dendl; - } - - // slave updates - for (xlist::iterator p = slave_updates.begin(); !p.end(); ++p) { - MDSlaveUpdate *su = *p; - dout(10) << "try_to_expire waiting on slave update " << su << dendl; - assert(su->waiter == 0); - if (!gather) gather = new C_Gather; - su->waiter = gather->new_sub(); - } - - // idalloc - if (allocv > 
mds->idalloc->get_committed_version()) { - dout(10) << "try_to_expire saving idalloc table, need " << allocv - << ", committed is " << mds->idalloc->get_committed_version() - << " (" << mds->idalloc->get_committing_version() << ")" - << dendl; - if (!gather) gather = new C_Gather; - mds->idalloc->save(gather->new_sub(), allocv); - } - - // clientmap - if (clientmapv > mds->clientmap.get_committed()) { - dout(10) << "try_to_expire saving clientmap, need " << clientmapv - << ", committed is " << mds->clientmap.get_committed() - << " (" << mds->clientmap.get_committing() << ")" - << dendl; - if (!gather) gather = new C_Gather; - mds->clientmap.save(gather->new_sub(), clientmapv); - } - - // pending commit atids - for (hash_set::iterator p = pending_commit_atids.begin(); - p != pending_commit_atids.end(); - ++p) { - if (!gather) gather = new C_Gather; - assert(!mds->anchorclient->has_committed(*p)); - dout(10) << "try_to_expire anchor transaction " << *p - << " pending commit (not yet acked), waiting" << dendl; - mds->anchorclient->wait_for_ack(*p, gather->new_sub()); - } - - // anchortable - if (anchortablev > mds->anchortable->get_committed_version()) { - dout(10) << "try_to_expire waiting for anchor table to save, need " << anchortablev << dendl; - if (!gather) gather = new C_Gather; - mds->anchortable->save(gather->new_sub()); - } - - // FIXME client requests...? - // audit handling of anchor transactions? 
- - if (gather) { - dout(6) << "LogSegment(" << offset << ").try_to_expire waiting" << dendl; - } else { - dout(6) << "LogSegment(" << offset << ").try_to_expire success" << dendl; - } - return gather; -} - - - -#undef dout -#undef derr -#define dout(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " -#define derr(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " - - -// ----------------------- -// EString - -bool EString::has_expired(MDS *mds) { - dout(10) << "EString.has_expired " << event << dendl; - return true; -} -void EString::expire(MDS *mds, Context *c) -{ - dout(10) << "EString.expire " << event << dendl; -} -void EString::replay(MDS *mds) -{ - dout(10) << "EString.replay " << event << dendl; -} - - - -// ----------------------- -// EMetaBlob - -EMetaBlob::EMetaBlob(MDLog *mdlog) : - last_subtree_map(mdlog->get_last_segment_offset()), - my_offset(mdlog->get_write_pos()) -{ -} - - -/* - * we need to ensure that a journaled item has either - * - * - been safely committed to its dirslice. - * - * - has been safely exported. i.e., authority().first != us. - * in particular, auth of is not enough, we need to - * wait for . - * - * note that this check is overly conservative, in that we'll - * try to flush the dir again if we reimport the subtree, even though - * later journal entries contain the same dirty data (from the import). 
- * - */ -bool EMetaBlob::has_expired(MDS *mds) -{ -/* - // examine dirv's for my lumps - for (map::iterator lp = lump_map.begin(); - lp != lump_map.end(); - ++lp) { - CDir *dir = mds->mdcache->get_dirfrag(lp->first); - if (!dir) - continue; // we expired it - - // FIXME: check the slice only - - if (dir->authority().first != mds->get_nodeid()) { - dout(10) << "EMetaBlob.has_expired not auth, needed dirv " << lp->second.dirv - << " for " << *dir << dendl; - continue; // not our problem - } - - if (g_conf.mds_hack_log_expire_for_better_stats) { - // FIXME HACK: this makes logger stats more accurage, for journal stats, - // but is not perfectly safe. for benchmarking ONLY! - if (dir->get_committing_version() >= lp->second.dirv || // committING, not committED. - dir->get_committed_version_equivalent() >= lp->second.dirv) { - dout(10) << "EMetaBlob.has_expired have|committING (unsafe hack!) dirv " << lp->second.dirv - << " for " << *dir << dendl; - continue; // yay - } - } else { - // this is the proper (safe) way - if (dir->get_committed_version() >= lp->second.dirv || - dir->get_committed_version_equivalent() >= lp->second.dirv) { - dout(10) << "EMetaBlob.has_expired have dirv " << lp->second.dirv - << " for " << *dir << dendl; - continue; // yay - } - } - - if (dir->is_ambiguous_dir_auth()) { - CDir *ex = mds->mdcache->get_subtree_root(dir); - if (ex->is_exporting()) { - // wait until export is acked (logged on remote) and committed (logged locally) - dout(10) << "EMetaBlob.has_expired ambiguous auth for " << *dir - << ", exporting on " << *ex << dendl; - return false; - } else { - dout(10) << "EMetaBlob.has_expired ambiguous auth for " << *dir - << ", importing on " << *ex << dendl; - return false; - } - } - - if (dir->get_committed_version() < lp->second.dirv) { - dout(10) << "EMetaBlob.has_expired need dirv " << lp->second.dirv - << " for " << *dir << dendl; - return false; // not committed. 
- } - - assert(0); // i goofed the logic - } - - // have my anchortable ops committed? - for (list::iterator p = atids.begin(); - p != atids.end(); - ++p) { - if (!mds->anchorclient->has_committed(*p)) { - dout(10) << "EMetaBlob.has_expired anchor transaction " << *p - << " not yet acked" << dendl; - return false; - } - } - - if (!dirty_inode_mtimes.empty()) - for (map::iterator p = dirty_inode_mtimes.begin(); - p != dirty_inode_mtimes.end(); - ++p) { - CInode *in = mds->mdcache->get_inode(p->first); - if (in) { - if (in->inode.ctime == p->second && - in->dirlock.is_updated()) { - dout(10) << "EMetaBlob.has_expired dirty mtime dirlock hasn't flushed on " << *in << dendl; - return false; - } - } - } - - // allocated_ios - if (!allocated_inos.empty()) { - version_t cv = mds->idalloc->get_committed_version(); - if (cv < alloc_tablev) { - dout(10) << "EMetaBlob.has_expired idalloc tablev " << alloc_tablev << " > " << cv - << ", still dirty" << dendl; - return false; // still dirty - } else { - dout(10) << "EMetaBlob.has_expired idalloc tablev " << alloc_tablev << " <= " << cv - << ", already flushed" << dendl; - } - } - - - // truncated inodes - for (list< pair >::iterator p = truncated_inodes.begin(); - p != truncated_inodes.end(); - ++p) { - if (mds->mdcache->is_purging(p->first.ino, p->second)) { - dout(10) << "EMetaBlob.has_expired still purging inode " << p->first.ino - << " to " << p->second << dendl; - return false; - } - } - - // client requests - for (list::iterator p = client_reqs.begin(); - p != client_reqs.end(); - ++p) { - if (mds->clientmap.have_completed_request(*p)) { - dout(10) << "EMetaBlob.has_expired still have completed request " << *p - << dendl; - return false; - } - } - - - */ - return true; // all dirlumps expired, etc. 
-} - - -void EMetaBlob::expire(MDS *mds, Context *c) -{ -/* - map commit; // dir -> version needed - list waitfor_export; - list waitfor_import; - int ncommit = 0; - - // examine dirv's for my lumps - // make list of dir slices i need to commit - for (map::iterator lp = lump_map.begin(); - lp != lump_map.end(); - ++lp) { - CDir *dir = mds->mdcache->get_dirfrag(lp->first); - if (!dir) - continue; // we expired it - - // FIXME: check the slice only - - if (dir->authority().first != mds->get_nodeid()) { - dout(10) << "EMetaBlob.expire not auth, needed dirv " << lp->second.dirv - << " for " << *dir << dendl; - continue; // not our problem - } - if (dir->get_committed_version() >= lp->second.dirv || - dir->get_committed_version_equivalent() >= lp->second.dirv) { - dout(10) << "EMetaBlob.expire have dirv " << lp->second.dirv - << " on " << *dir << dendl; - continue; // yay - } - - if (dir->is_ambiguous_dir_auth()) { - CDir *ex = mds->mdcache->get_subtree_root(dir); - if (ex->is_exporting()) { - // wait until export is acked (logged on remote) and committed (logged locally) - dout(10) << "EMetaBlob.expire ambiguous auth for " << *dir - << ", waiting for export finish on " << *ex << dendl; - waitfor_export.push_back(ex); - continue; - } else { - dout(10) << "EMetaBlob.expire ambiguous auth for " << *dir - << ", waiting for import finish on " << *ex << dendl; - waitfor_import.push_back(ex); - continue; - } - } - - assert(dir->get_committed_version() < lp->second.dirv); - dout(10) << "EMetaBlob.expire need dirv " << lp->second.dirv - << ", committing " << *dir << dendl; - commit[dir] = MAX(commit[dir], lp->second.dirv); - ncommit++; - } - - // set up gather context - C_Gather *gather = new C_Gather(c); - - // do or wait for exports and commits - for (map::iterator p = commit.begin(); - p != commit.end(); - ++p) { - if (p->first->can_auth_pin()) - p->first->commit(p->second, gather->new_sub()); - else - // pbly about to export|split|merge. 
- // just wait for it to unfreeze, then retry - p->first->add_waiter(CDir::WAIT_UNFREEZE, gather->new_sub()); - } - for (list::iterator p = waitfor_export.begin(); - p != waitfor_export.end(); - ++p) - mds->mdcache->migrator->add_export_finish_waiter(*p, gather->new_sub()); - for (list::iterator p = waitfor_import.begin(); - p != waitfor_import.end(); - ++p) - (*p)->add_waiter(CDir::WAIT_UNFREEZE, gather->new_sub()); - - - // have my anchortable ops committed? - for (list::iterator p = atids.begin(); - p != atids.end(); - ++p) { - if (!mds->anchorclient->has_committed(*p)) { - dout(10) << "EMetaBlob.expire anchor transaction " << *p - << " not yet acked, waiting" << dendl; - mds->anchorclient->wait_for_ack(*p, gather->new_sub()); - } - } - - // dirtied inode mtimes - if (!dirty_inode_mtimes.empty()) - for (map::iterator p = dirty_inode_mtimes.begin(); - p != dirty_inode_mtimes.end(); - ++p) { - CInode *in = mds->mdcache->get_inode(p->first); - if (in) { - if (in->inode.ctime == p->second && - in->dirlock.is_updated()) { - dout(10) << "EMetaBlob.expire dirty mtime dirlock hasn't flushed, waiting on " - << *in << dendl; - in->dirlock.add_waiter(SimpleLock::WAIT_STABLE, gather->new_sub()); - } - } - } - - // allocated_inos - if (!allocated_inos.empty()) { - version_t cv = mds->idalloc->get_committed_version(); - if (cv < alloc_tablev) { - dout(10) << "EMetaBlob.expire saving idalloc table, need " << alloc_tablev << dendl; - mds->idalloc->save(gather->new_sub(), alloc_tablev); - } - } - - // truncated inodes - for (list< pair >::iterator p = truncated_inodes.begin(); - p != truncated_inodes.end(); - ++p) { - if (mds->mdcache->is_purging(p->first.ino, p->second)) { - dout(10) << "EMetaBlob.expire waiting for purge of inode " << p->first.ino - << " to " << p->second << dendl; - mds->mdcache->wait_for_purge(p->first.ino, p->second, gather->new_sub()); - } - } - - // client requests - for (list::iterator p = client_reqs.begin(); - p != client_reqs.end(); - ++p) { - if 
(mds->clientmap.have_completed_request(*p)) { - dout(10) << "EMetaBlob.expire waiting on completed request " << *p - << dendl; - mds->clientmap.add_trim_waiter(*p, gather->new_sub()); - } - } - - dout(10) << "my gather finsher is " << gather << " with " << gather->get_num() << dendl; - -*/ -} - -void EMetaBlob::update_segment(LogSegment *ls) -{ - // atids? - //for (list::iterator p = atids.begin(); p != atids.end(); ++p) - // ls->pending_commit_atids[*p] = ls; - // -> handled directly by AnchorClient - - // dirty inode mtimes - // -> handled directly by Server.cc, replay() - - // alloc table update? - if (!allocated_inos.empty()) - ls->allocv = alloc_tablev; - - // truncated inodes - // -> handled directly by Server.cc - - // client requests - // note the newest request per client - //if (!client_reqs.empty()) - // ls->last_client_tid[client_reqs.rbegin()->client] = client_reqs.rbegin()->tid); -} - -void EMetaBlob::replay(MDS *mds, LogSegment *logseg) -{ - dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps" << dendl; - - if (!logseg) logseg = _segment; - assert(logseg); - - // walk through my dirs (in order!) - for (list::iterator lp = lump_order.begin(); - lp != lump_order.end(); - ++lp) { - dout(10) << "EMetaBlob.replay dir " << *lp << dendl; - dirlump &lump = lump_map[*lp]; - - // the dir - CDir *dir = mds->mdcache->get_dirfrag(*lp); - if (!dir) { - // hmm. do i have the inode? 
- CInode *diri = mds->mdcache->get_inode((*lp).ino); - if (!diri) { - if ((*lp).ino == MDS_INO_ROOT) { - diri = mds->mdcache->create_root_inode(); - dout(10) << "EMetaBlob.replay created root " << *diri << dendl; - } else if (MDS_INO_IS_STRAY((*lp).ino)) { - int whose = (*lp).ino - MDS_INO_STRAY_OFFSET; - diri = mds->mdcache->create_stray_inode(whose); - dout(10) << "EMetaBlob.replay created stray " << *diri << dendl; - } else { - dout(0) << "EMetaBlob.replay missing dir ino " << (*lp).ino << dendl; - assert(0); - } - } - // create the dirfrag - dir = diri->get_or_open_dirfrag(mds->mdcache, (*lp).frag); - - if ((*lp).ino < MDS_INO_BASE) - mds->mdcache->adjust_subtree_auth(dir, CDIR_AUTH_UNKNOWN); - - dout(10) << "EMetaBlob.replay added dir " << *dir << dendl; - } - dir->set_version( lump.dirv ); - if (lump.is_dirty()) - dir->_mark_dirty(logseg); - if (lump.is_complete()) - dir->mark_complete(); - - // decode bits - lump._decode_bits(); - - // full dentry+inode pairs - for (list::iterator p = lump.get_dfull().begin(); - p != lump.get_dfull().end(); - p++) { - CDentry *dn = dir->lookup(p->dn); - if (!dn) { - dn = dir->add_null_dentry(p->dn); - dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(logseg); - dout(10) << "EMetaBlob.replay added " << *dn << dendl; - } else { - dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(logseg); - dout(10) << "EMetaBlob.replay had " << *dn << dendl; - } - - CInode *in = mds->mdcache->get_inode(p->inode.ino); - if (!in) { - in = new CInode(mds->mdcache); - in->inode = p->inode; - in->dirfragtree = p->dirfragtree; - if (in->inode.is_symlink()) in->symlink = p->symlink; - mds->mdcache->add_inode(in); - if (!dn->is_null()) { - if (dn->is_primary()) - dout(-10) << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn - << " " << *dn->get_inode() - << " should be " << p->inode.ino - << dendl; - dir->unlink_inode(dn); - //assert(0); // hrm! fallout from sloppy unlink? or? 
hmmm FIXME investigate further - } - dir->link_primary_inode(dn, in); - if (p->dirty) in->_mark_dirty(logseg); - dout(10) << "EMetaBlob.replay added " << *in << dendl; - } else { - if (dn->get_inode() != in && in->get_parent_dn()) { - dout(10) << "EMetaBlob.replay unlinking " << *in << dendl; - in->get_parent_dn()->get_dir()->unlink_inode(in->get_parent_dn()); - } - in->inode = p->inode; - in->dirfragtree = p->dirfragtree; - if (in->inode.is_symlink()) in->symlink = p->symlink; - if (p->dirty) in->_mark_dirty(logseg); - if (dn->get_inode() != in) { - dir->link_primary_inode(dn, in); - dout(10) << "EMetaBlob.replay linked " << *in << dendl; - } else { - dout(10) << "EMetaBlob.replay had " << *in << dendl; - } - } - } - - // remote dentries - for (list::iterator p = lump.get_dremote().begin(); - p != lump.get_dremote().end(); - p++) { - CDentry *dn = dir->lookup(p->dn); - if (!dn) { - dn = dir->add_remote_dentry(p->dn, p->ino, p->d_type); - dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(logseg); - dout(10) << "EMetaBlob.replay added " << *dn << dendl; - } else { - if (!dn->is_null()) { - dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl; - dir->unlink_inode(dn); - } - dn->set_remote(p->ino, p->d_type); - dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(logseg); - dout(10) << "EMetaBlob.replay had " << *dn << dendl; - } - } - - // null dentries - for (list::iterator p = lump.get_dnull().begin(); - p != lump.get_dnull().end(); - p++) { - CDentry *dn = dir->lookup(p->dn); - if (!dn) { - dn = dir->add_null_dentry(p->dn); - dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(logseg); - dout(10) << "EMetaBlob.replay added " << *dn << dendl; - } else { - if (!dn->is_null()) { - dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl; - dir->unlink_inode(dn); - } - dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(logseg); - dout(10) << "EMetaBlob.replay had " << *dn << dendl; - } - } - } - - // anchor transactions - for 
(list::iterator p = atids.begin(); - p != atids.end(); - ++p) { - dout(10) << "EMetaBlob.replay noting anchor transaction " << *p << dendl; - mds->anchorclient->got_journaled_agree(*p, logseg); - } - - // dirtied inode mtimes - if (!dirty_inode_mtimes.empty()) - for (map::iterator p = dirty_inode_mtimes.begin(); - p != dirty_inode_mtimes.end(); - ++p) { - CInode *in = mds->mdcache->get_inode(p->first); - dout(10) << "EMetaBlob.replay setting dirlock updated flag on " << *in << dendl; - in->dirlock.set_updated(); - logseg->dirty_inode_mtimes.push_back(&in->xlist_dirty_inode_mtime); - } - - // allocated_inos - if (!allocated_inos.empty()) { - if (mds->idalloc->get_version() >= alloc_tablev) { - dout(10) << "EMetaBlob.replay idalloc tablev " << alloc_tablev - << " <= table " << mds->idalloc->get_version() << dendl; - } else { - for (list::iterator p = allocated_inos.begin(); - p != allocated_inos.end(); - ++p) { - dout(10) << " EMetaBlob.replay idalloc " << *p << " tablev " << alloc_tablev - << " - 1 == table " << mds->idalloc->get_version() << dendl; - assert(alloc_tablev-1 == mds->idalloc->get_version()); - - inodeno_t ino = mds->idalloc->alloc_id(); - assert(ino == *p); // this should match. 
- } - assert(alloc_tablev == mds->idalloc->get_version()); - } - } - - // truncated inodes - for (list< triple >::iterator p = truncated_inodes.begin(); - p != truncated_inodes.end(); - ++p) { - CInode *in = mds->mdcache->get_inode(p->first); - assert(in); - dout(10) << "EMetaBlob.replay will purge truncated " - << p->third << " -> " << p->second - << " on " << *in << dendl; - mds->mdcache->add_recovered_purge(in, p->second, p->third, logseg); - } - - // client requests - for (list::iterator p = client_reqs.begin(); - p != client_reqs.end(); - ++p) - mds->clientmap.add_completed_request(*p); - - - // update segment - update_segment(logseg); -} - -// ----------------------- -// ESession - -void ESession::update_segment() -{ - _segment->clientmapv = cmapv; -} - -void ESession::replay(MDS *mds) -{ - if (mds->clientmap.get_version() >= cmapv) { - dout(10) << "ESession.replay clientmap " << mds->clientmap.get_version() - << " >= " << cmapv << ", noop" << dendl; - - // hrm, this isn't very pretty. - if (!open) - mds->clientmap.trim_completed_requests(client_inst.name.num(), 0); - - } else { - dout(10) << "ESession.replay clientmap " << mds->clientmap.get_version() - << " < " << cmapv << dendl; - assert(mds->clientmap.get_version() + 1 == cmapv); - if (open) { - mds->clientmap.open_session(client_inst); - } else { - mds->clientmap.close_session(client_inst.name.num()); - mds->clientmap.trim_completed_requests(client_inst.name.num(), 0); - } - mds->clientmap.reset_projected(); // make it follow version. 
- } -} - - - -// ----------------------- -// EAnchor - -void EAnchor::update_segment() -{ - _segment->anchortablev = version; -} - -void EAnchor::replay(MDS *mds) -{ - if (mds->anchortable->get_version() >= version) { - dout(10) << "EAnchor.replay event " << version - << " <= table " << mds->anchortable->get_version() << dendl; - } else { - dout(10) << " EAnchor.replay event " << version - << " - 1 == table " << mds->anchortable->get_version() << dendl; - assert(version-1 == mds->anchortable->get_version()); - - switch (op) { - // anchortable - case ANCHOR_OP_CREATE_PREPARE: - mds->anchortable->create_prepare(ino, trace, reqmds); - break; - case ANCHOR_OP_DESTROY_PREPARE: - mds->anchortable->destroy_prepare(ino, reqmds); - break; - case ANCHOR_OP_UPDATE_PREPARE: - mds->anchortable->update_prepare(ino, trace, reqmds); - break; - case ANCHOR_OP_COMMIT: - mds->anchortable->commit(atid); - break; - - default: - assert(0); - } - - assert(version == mds->anchortable->get_version()); - } -} - - -// EAnchorClient - -void EAnchorClient::replay(MDS *mds) -{ - dout(10) << " EAnchorClient.replay op " << op << " atid " << atid << dendl; - - switch (op) { - // anchorclient - case ANCHOR_OP_ACK: - mds->anchorclient->got_journaled_ack(atid); - break; - - default: - assert(0); - } -} - - -// ----------------------- -// EUpdate - -void EUpdate::update_segment() -{ - metablob.update_segment(_segment); -} - -void EUpdate::replay(MDS *mds) -{ - metablob.replay(mds, _segment); -} - - -// ------------------------ -// EOpen - -void EOpen::update_segment() -{ - // ?? 
-} - -void EOpen::replay(MDS *mds) -{ - dout(10) << "EOpen.replay " << dendl; - metablob.replay(mds, _segment); -} - - -// ----------------------- -// ESlaveUpdate - -void ESlaveUpdate::replay(MDS *mds) -{ - switch (op) { - case ESlaveUpdate::OP_PREPARE: - // FIXME: horribly inefficient copy; EMetaBlob needs a swap() or something - dout(10) << "ESlaveUpdate.replay prepare " << reqid << " for mds" << master - << ": saving blobs for later commit" << dendl; - assert(mds->mdcache->uncommitted_slave_updates[master].count(reqid) == 0); - commit._segment = _segment; // may need this later - rollback._segment = _segment; // may need this later - mds->mdcache->uncommitted_slave_updates[master][reqid] = - MDSlaveUpdate(commit, rollback, _segment->slave_updates); - break; - - case ESlaveUpdate::OP_COMMIT: - if (mds->mdcache->uncommitted_slave_updates[master].count(reqid)) { - dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds" << master - << ": applying commit blob" << dendl; - mds->mdcache->uncommitted_slave_updates[master][reqid].commit.replay(mds, _segment); - mds->mdcache->uncommitted_slave_updates[master].erase(reqid); - } else { - dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds" << master - << ": ignoring, no previously saved blobs" << dendl; - } - break; - - case ESlaveUpdate::OP_ROLLBACK: - if (mds->mdcache->uncommitted_slave_updates[master].count(reqid)) { - dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds" << master - << ": applying rollback blob" << dendl; - assert(mds->mdcache->uncommitted_slave_updates[master].count(reqid)); - mds->mdcache->uncommitted_slave_updates[master][reqid].rollback.replay(mds, _segment); - mds->mdcache->uncommitted_slave_updates[master].erase(reqid); - } else { - dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds" << master - << ": ignoring, no previously saved blobs" << dendl; - } - break; - - default: - assert(0); - } -} - - -// ----------------------- -// ESubtreeMap - -void 
ESubtreeMap::replay(MDS *mds) -{ - // suck up the subtree map? - if (mds->mdcache->is_subtrees()) { - dout(10) << "ESubtreeMap.replay -- ignoring, already have import map" << dendl; - return; - } - - dout(10) << "ESubtreeMap.replay -- reconstructing (auth) subtree spanning tree" << dendl; - - // first, stick the spanning tree in my cache - //metablob.print(cout); - metablob.replay(mds, _segment); - - // restore import/export maps - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *dir = mds->mdcache->get_dirfrag(p->first); - mds->mdcache->adjust_bounded_subtree_auth(dir, p->second, mds->get_nodeid()); - } - - mds->mdcache->show_subtrees(); -} - - - -// ----------------------- -// EFragment - -void EFragment::replay(MDS *mds) -{ - dout(10) << "EFragment.replay " << ino << " " << basefrag << " by " << bits << dendl; - - CInode *in = mds->mdcache->get_inode(ino); - assert(in); - - list resultfrags; - list waiters; - mds->mdcache->adjust_dir_fragments(in, basefrag, bits, resultfrags, waiters); - - metablob.replay(mds, _segment); -} - - - -// ----------------------- -// EPurgeFinish - - -bool EPurgeFinish::has_expired(MDS *mds) -{ - return true; -} - -void EPurgeFinish::expire(MDS *mds, Context *c) -{ - assert(0); -} - -void EPurgeFinish::update_segment() -{ - // ** update purge lists? 
-} - -void EPurgeFinish::replay(MDS *mds) -{ - dout(10) << "EPurgeFinish.replay " << ino << " " << oldsize << " -> " << newsize << dendl; - CInode *in = mds->mdcache->get_inode(ino); - assert(in); - mds->mdcache->remove_recovered_purge(in, newsize, oldsize); -} - - - - - -// ========================================================================= - -// ----------------------- -// EExport - -bool EExport::has_expired(MDS *mds) -{ - CDir *dir = mds->mdcache->get_dirfrag(base); - if (dir && mds->mdcache->migrator->is_exporting(dir)) { - dout(10) << "EExport.has_expired still exporting " << *dir << dendl; - return false; - } - return true; -} - -void EExport::expire(MDS *mds, Context *c) -{ - CDir *dir = mds->mdcache->get_dirfrag(base); - assert(dir); - assert(mds->mdcache->migrator->is_exporting(dir)); - - dout(10) << "EExport.expire waiting for export of " << *dir << dendl; - mds->mdcache->migrator->add_export_finish_waiter(dir, c); -} - -void EExport::replay(MDS *mds) -{ - dout(10) << "EExport.replay " << base << dendl; - metablob.replay(mds, _segment); - - CDir *dir = mds->mdcache->get_dirfrag(base); - assert(dir); - - set realbounds; - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *bd = mds->mdcache->get_dirfrag(*p); - assert(bd); - realbounds.insert(bd); - } - - // adjust auth away - mds->mdcache->adjust_bounded_subtree_auth(dir, realbounds, pair(CDIR_AUTH_UNKNOWN, CDIR_AUTH_UNKNOWN)); - mds->mdcache->try_subtree_merge(dir); -} - - -// ----------------------- -// EImportStart - -void EImportStart::replay(MDS *mds) -{ - dout(10) << "EImportStart.replay " << base << dendl; - metablob.replay(mds, _segment); - - // put in ambiguous import list - mds->mdcache->add_ambiguous_import(base, bounds); -} - -// ----------------------- -// EImportFinish - -bool EImportFinish::has_expired(MDS *mds) -{ - return true; -} -void EImportFinish::expire(MDS *mds, Context *c) -{ - assert(0); // shouldn't ever happen -} - -void EImportFinish::replay(MDS 
*mds) -{ - if (mds->mdcache->have_ambiguous_import(base)) { - dout(10) << "EImportFinish.replay " << base << " success=" << success << dendl; - if (success) - mds->mdcache->finish_ambiguous_import(base); - else - mds->mdcache->cancel_ambiguous_import(base); - } else { - dout(10) << "EImportFinish.replay " << base << " success=" << success - << ", predates my subtree_map start point, ignoring" - << dendl; - // verify that? - } -} - - - - - diff --git a/branches/sage/crush/mds/mdstypes.h b/branches/sage/crush/mds/mdstypes.h deleted file mode 100644 index a2f779757255e..0000000000000 --- a/branches/sage/crush/mds/mdstypes.h +++ /dev/null @@ -1,689 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -#ifndef __MDSTYPES_H -#define __MDSTYPES_H - - -#include -#include -#include -#include -using namespace std; - -#include "config.h" -#include "common/DecayCounter.h" -#include "include/Context.h" - -#include - -#include "include/frag.h" -#include "include/xlist.h" - -#define MDS_REF_SET // define me for improved debug output, sanity checking - -#define MDS_PORT_MAIN 0 -#define MDS_PORT_SERVER 1 -#define MDS_PORT_CACHE 2 -#define MDS_PORT_LOCKER 3 -#define MDS_PORT_STORE 4 -#define MDS_PORT_BALANCER 5 -#define MDS_PORT_MIGRATOR 6 -#define MDS_PORT_RENAMER 7 -#define MDS_PORT_ANCHORCLIENT 10 -#define MDS_PORT_ANCHORTABLE 11 - -#define MAX_MDS 0x100 - -#define MDS_INO_ROOT 1 -#define MDS_INO_PGTABLE 2 -#define MDS_INO_ANCHORTABLE 3 -#define MDS_INO_PG 4 // *** WARNING: this should match osd/osd_types.h PG_INO *** -#define MDS_INO_LOG_OFFSET (1*MAX_MDS) -#define MDS_INO_IDS_OFFSET (2*MAX_MDS) -#define MDS_INO_CLIENTMAP_OFFSET (3*MAX_MDS) -#define MDS_INO_STRAY_OFFSET (4*MAX_MDS) -#define MDS_INO_BASE (5*MAX_MDS) - -#define MDS_INO_STRAY(x) (MDS_INO_STRAY_OFFSET+((unsigned)x)) -#define MDS_INO_IS_STRAY(i) ((i) >= MDS_INO_STRAY_OFFSET && (i) < MDS_INO_STRAY_OFFSET+MAX_MDS) - -#define MDS_TRAVERSE_FORWARD 1 -#define 
MDS_TRAVERSE_DISCOVER 2 // skips permissions checks etc. -#define MDS_TRAVERSE_DISCOVERXLOCK 3 // succeeds on (foreign?) null, xlocked dentries. -#define MDS_TRAVERSE_FAIL 4 - - -struct metareqid_t { - uint64_t tid; - int32_t client; - int32_t _pad; - metareqid_t() : tid(0), client(-1), _pad(0) {} - metareqid_t(int c, tid_t t) : tid(t), client(c), _pad(0) {} -}; - -inline ostream& operator<<(ostream& out, const metareqid_t& r) { - return out << "client" << r.client << ":" << r.tid; -} - -inline bool operator==(const metareqid_t& l, const metareqid_t& r) { - return (l.client == r.client) && (l.tid == r.tid); -} -inline bool operator!=(const metareqid_t& l, const metareqid_t& r) { - return (l.client != r.client) || (l.tid != r.tid); -} -inline bool operator<(const metareqid_t& l, const metareqid_t& r) { - return (l.client < r.client) || - (l.client == r.client && l.tid < r.tid); -} -inline bool operator<=(const metareqid_t& l, const metareqid_t& r) { - return (l.client < r.client) || - (l.client == r.client && l.tid <= r.tid); -} -inline bool operator>(const metareqid_t& l, const metareqid_t& r) { return !(l <= r); } -inline bool operator>=(const metareqid_t& l, const metareqid_t& r) { return !(l < r); } - -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const metareqid_t &r) const { - hash H; - return H(r.client) ^ H(r.tid); - } - }; -} - - -// inode caps info for client reconnect -struct inode_caps_reconnect_t { - int32_t wanted; - int32_t issued; - off_t size; - utime_t mtime, atime; - - inode_caps_reconnect_t() {} - inode_caps_reconnect_t(int w, int i) : - wanted(w), issued(i), size(0) {} - inode_caps_reconnect_t(int w, int i, off_t sz, utime_t mt, utime_t at) : - wanted(w), issued(i), size(sz), mtime(mt), atime(at) {} -}; - - -// ================================================================ -// dir frag - -struct dirfrag_t { - inodeno_t ino; - frag_t frag; - uint32_t _pad; - - dirfrag_t() : ino(0), _pad(0) { } - dirfrag_t(inodeno_t i, 
frag_t f) : ino(i), frag(f), _pad(0) { } -}; - -inline ostream& operator<<(ostream& out, const dirfrag_t df) { - out << df.ino; - if (!df.frag.is_root()) out << "." << df.frag; - return out; -} -inline bool operator<(dirfrag_t l, dirfrag_t r) { - if (l.ino < r.ino) return true; - if (l.ino == r.ino && l.frag < r.frag) return true; - return false; -} -inline bool operator==(dirfrag_t l, dirfrag_t r) { - return l.ino == r.ino && l.frag == r.frag; -} - -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const dirfrag_t &df) const { - static rjhash H; - static rjhash I; - return H(df.ino) ^ I(df.frag); - } - }; -} - - - -// ================================================================ - -#define META_POP_IRD 0 -#define META_POP_IWR 1 -#define META_POP_READDIR 2 -#define META_POP_FETCH 3 -#define META_POP_STORE 4 -#define META_NPOP 5 - -class inode_load_vec_t { - static const int NUM = 2; - DecayCounter vec[NUM]; -public: - DecayCounter &get(int t) { - assert(t < NUM); - return vec[t]; - } - void zero(utime_t now) { - for (int i=0; i"; -} - -/* -inline mds_load_t& operator+=( mds_load_t& l, mds_load_t& r ) -{ - l.root_pop += r.root_pop; - l.req_rate += r.req_rate; - l.queue_len += r.queue_len; - return l; -} - -inline mds_load_t operator/( mds_load_t& a, double d ) -{ - mds_load_t r; - r.root_pop = a.root_pop / d; - r.req_rate = a.req_rate / d; - r.queue_len = a.queue_len / d; - return r; -} -*/ - - -class load_spread_t { -public: - static const int MAX = 4; - int last[MAX]; - int p, n; - DecayCounter count; - -public: - load_spread_t() : p(0), n(0) { - for (int i=0; i= 0 is the auth mds -#define CDIR_AUTH_PARENT -1 // default -#define CDIR_AUTH_UNKNOWN -2 -#define CDIR_AUTH_DEFAULT pair(-1, -2) -#define CDIR_AUTH_UNDEF pair(-2, -2) -//#define CDIR_AUTH_ROOTINODE pair( 0, -2) - - - -// print hack -struct mdsco_db_line_prefix { - MDSCacheObject *object; - mdsco_db_line_prefix(MDSCacheObject *o) : object(o) {} -}; -ostream& operator<<(ostream& out, 
mdsco_db_line_prefix o); - -// printer -ostream& operator<<(ostream& out, MDSCacheObject &o); - -class MDSCacheObjectInfo { -public: - inodeno_t ino; - dirfrag_t dirfrag; - string dname; - - MDSCacheObjectInfo() : ino(0) {} - - void _encode(bufferlist& bl) const { - ::_encode(ino, bl); - ::_encode(dirfrag, bl); - ::_encode(dname, bl); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(ino, bl, off); - ::_decode(dirfrag, bl, off); - ::_decode(dname, bl, off); - } - void _decode(bufferlist::iterator& p) { - ::_decode_simple(ino, p); - ::_decode_simple(dirfrag, p); - ::_decode_simple(dname, p); - } -}; - - -class MDSCacheObject { - public: - // -- pins -- - const static int PIN_REPLICATED = 1000; - const static int PIN_DIRTY = 1001; - const static int PIN_LOCK = -1002; - const static int PIN_REQUEST = -1003; - const static int PIN_WAITER = 1004; - const static int PIN_DIRTYSCATTERED = 1005; // make this neg if we start using multiple scatterlocks? - static const int PIN_AUTHPIN = 1006; - static const int PIN_PTRWAITER = -1007; - const static int PIN_TEMPEXPORTING = 1008; // temp pin between encode_ and finish_export - - const char *generic_pin_name(int p) { - switch (p) { - case PIN_REPLICATED: return "replicated"; - case PIN_DIRTY: return "dirty"; - case PIN_LOCK: return "lock"; - case PIN_REQUEST: return "request"; - case PIN_WAITER: return "waiter"; - case PIN_DIRTYSCATTERED: return "dirtyscattered"; - case PIN_AUTHPIN: return "authpin"; - case PIN_PTRWAITER: return "ptrwaiter"; - case PIN_TEMPEXPORTING: return "tempexporting"; - default: assert(0); return 0; - } - } - - // -- state -- - const static int STATE_AUTH = (1<<30); - const static int STATE_DIRTY = (1<<29); - const static int STATE_REJOINING = (1<<28); // replica has not joined w/ primary copy - - // -- wait -- - const static int WAIT_SINGLEAUTH = (1<<30); - const static int WAIT_UNFREEZE = (1<<29); // pka AUTHPINNABLE - - - // ============================================ - // cons - public: - 
MDSCacheObject() : - state(0), - ref(0), - replica_nonce(0) {} - virtual ~MDSCacheObject() {} - - // printing - virtual void print(ostream& out) = 0; - virtual ostream& print_db_line_prefix(ostream& out) { - return out << "mdscacheobject(" << this << ") "; - } - - // -------------------------------------------- - // state - protected: - unsigned state; // state bits - - public: - unsigned get_state() const { return state; } - unsigned state_test(unsigned mask) const { return (state & mask); } - void state_clear(unsigned mask) { state &= ~mask; } - void state_set(unsigned mask) { state |= mask; } - void state_reset(unsigned s) { state = s; } - - bool is_auth() const { return state_test(STATE_AUTH); } - bool is_dirty() const { return state_test(STATE_DIRTY); } - bool is_clean() const { return !is_dirty(); } - bool is_rejoining() const { return state_test(STATE_REJOINING); } - - // -------------------------------------------- - // authority - virtual pair authority() = 0; - bool is_ambiguous_auth() { - return authority().second != CDIR_AUTH_UNKNOWN; - } - - // -------------------------------------------- - // pins -protected: - int ref; // reference count -#ifdef MDS_REF_SET - multiset ref_set; -#endif - - public: - int get_num_ref() { return ref; } - virtual const char *pin_name(int by) = 0; - //bool is_pinned_by(int by) { return ref_set.count(by); } - //multiset& get_ref_set() { return ref_set; } - - virtual void last_put() {} - virtual void bad_put(int by) { -#ifdef MDS_REF_SET - assert(ref_set.count(by) > 0); -#endif - assert(ref > 0); - } - void put(int by) { -#ifdef MDS_REF_SET - if (ref == 0 || ref_set.count(by) == 0) { -#else - if (ref == 0) { -#endif - bad_put(by); - } else { - ref--; -#ifdef MDS_REF_SET - ref_set.erase(ref_set.find(by)); - assert(ref == (int)ref_set.size()); -#endif - if (ref == 0) - last_put(); - } - } - - virtual void first_get() {} - virtual void bad_get(int by) { -#ifdef MDS_REF_SET - assert(by < 0 || ref_set.count(by) == 0); -#endif - 
assert(0); - } - void get(int by) { -#ifdef MDS_REF_SET - if (by >= 0 && ref_set.count(by)) { - bad_get(by); - } else { -#endif - if (ref == 0) - first_get(); - ref++; -#ifdef MDS_REF_SET - ref_set.insert(by); - assert(ref == (int)ref_set.size()); - } -#endif - } - - void print_pin_set(ostream& out) { -#ifdef MDS_REF_SET - multiset::iterator it = ref_set.begin(); - while (it != ref_set.end()) { - out << " " << pin_name(*it); - int last = *it; - int c = 1; - do { - it++; - if (it == ref_set.end()) break; - } while (*it == last); - if (c > 1) - out << "*" << c; - } -#endif - } - - - // -------------------------------------------- - // auth pins - virtual bool can_auth_pin() = 0; - virtual void auth_pin() = 0; - virtual void auth_unpin() = 0; - - - // -------------------------------------------- - // replication - protected: - map replica_map; // [auth] mds -> nonce - int replica_nonce; // [replica] defined on replica - - public: - bool is_replicated() { return !replica_map.empty(); } - bool is_replica(int mds) { return replica_map.count(mds); } - int num_replicas() { return replica_map.size(); } - int add_replica(int mds) { - if (replica_map.count(mds)) - return ++replica_map[mds]; // inc nonce - if (replica_map.empty()) - get(PIN_REPLICATED); - return replica_map[mds] = 1; - } - void add_replica(int mds, int nonce) { - if (replica_map.empty()) - get(PIN_REPLICATED); - replica_map[mds] = nonce; - } - int get_replica_nonce(int mds) { - assert(replica_map.count(mds)); - return replica_map[mds]; - } - void remove_replica(int mds) { - assert(replica_map.count(mds)); - replica_map.erase(mds); - if (replica_map.empty()) - put(PIN_REPLICATED); - } - void clear_replica_map() { - if (!replica_map.empty()) - put(PIN_REPLICATED); - replica_map.clear(); - } - map::iterator replicas_begin() { return replica_map.begin(); } - map::iterator replicas_end() { return replica_map.end(); } - const map& get_replicas() { return replica_map; } - void list_replicas(set& ls) { - for 
(map::const_iterator p = replica_map.begin(); - p != replica_map.end(); - ++p) - ls.insert(p->first); - } - - int get_replica_nonce() { return replica_nonce;} - void set_replica_nonce(int n) { replica_nonce = n; } - - - // --------------------------------------------- - // waiting - protected: - multimap waiting; - - public: - bool is_waiter_for(int mask) { - return waiting.count(mask) > 0; // FIXME: not quite right. - } - virtual void add_waiter(int mask, Context *c) { - if (waiting.empty()) - get(PIN_WAITER); - waiting.insert(pair(mask, c)); - pdout(10,g_conf.debug_mds) << (mdsco_db_line_prefix(this)) - << "add_waiter " << hex << mask << dec << " " << c - << " on " << *this - << dendl; - - } - virtual void take_waiting(int mask, list& ls) { - if (waiting.empty()) return; - multimap::iterator it = waiting.begin(); - while (it != waiting.end()) { - if (it->first & mask) { - ls.push_back(it->second); - pdout(10,g_conf.debug_mds) << (mdsco_db_line_prefix(this)) - << "take_waiting mask " << hex << mask << dec << " took " << it->second - << " tag " << it->first - << " on " << *this - << dendl; - waiting.erase(it++); - } else { - pdout(10,g_conf.debug_mds) << "take_waiting mask " << hex << mask << dec << " SKIPPING " << it->second - << " tag " << it->first - << " on " << *this - << dendl; - it++; - } - } - if (waiting.empty()) - put(PIN_WAITER); - } - void finish_waiting(int mask, int result = 0) { - list finished; - take_waiting(mask, finished); - finish_contexts(finished, result); - } - - - // --------------------------------------------- - // locking - // noop unless overloaded. 
- virtual SimpleLock* get_lock(int type) { assert(0); return 0; } - virtual void set_object_info(MDSCacheObjectInfo &info) { assert(0); } - virtual void encode_lock_state(int type, bufferlist& bl) { assert(0); } - virtual void decode_lock_state(int type, bufferlist& bl) { assert(0); } - virtual void finish_lock_waiters(int type, int mask, int r=0) { assert(0); } - virtual void add_lock_waiter(int type, int mask, Context *c) { assert(0); } - virtual bool is_lock_waiting(int type, int mask) { assert(0); return false; } - - virtual void clear_dirty_scattered(int type) { assert(0); } - - // --------------------------------------------- - // ordering - virtual bool is_lt(const MDSCacheObject *r) const = 0; - struct ptr_lt { - bool operator()(const MDSCacheObject* l, const MDSCacheObject* r) const { - return l->is_lt(r); - } - }; - -}; - -inline ostream& operator<<(ostream& out, MDSCacheObject &o) { - o.print(out); - return out; -} - -inline ostream& operator<<(ostream& out, const MDSCacheObjectInfo &info) { - if (info.ino) return out << info.ino; - if (info.dname.length()) return out << info.dirfrag << "/" << info.dname; - return out << info.dirfrag; -} - -inline ostream& operator<<(ostream& out, mdsco_db_line_prefix o) { - o.object->print_db_line_prefix(out); - return out; -} - - -#endif diff --git a/branches/sage/crush/messages/MClientFileCaps.h b/branches/sage/crush/messages/MClientFileCaps.h deleted file mode 100644 index 979be331e5ce8..0000000000000 --- a/branches/sage/crush/messages/MClientFileCaps.h +++ /dev/null @@ -1,109 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MCLIENTFILECAPS_H -#define __MCLIENTFILECAPS_H - -#include "msg/Message.h" -#include "mds/Capability.h" - -class MClientFileCaps : public Message { - public: - static const int OP_GRANT = 0; // mds->client grant. - static const int OP_ACK = 1; // client->mds ack (if prior grant was a recall) - static const int OP_RELEASE = 2; // mds closed the cap - static const int OP_STALE = 3; // mds has exported the cap - static const int OP_REAP = 4; // mds has imported the cap from get_mds() - static const char* get_opname(int op) { - switch (op) { - case OP_GRANT: return "grant"; - case OP_ACK: return "ack"; - case OP_RELEASE: return "release"; - case OP_STALE: return "stale"; - case OP_REAP: return "reap"; - default: assert(0); return 0; - } - } - - private: - int32_t op; - inode_t inode; - capseq_t seq; - int32_t caps; - int32_t wanted; - - int32_t mds; - - public: - inodeno_t get_ino() { return inode.ino; } - inode_t& get_inode() { return inode; } - int get_caps() { return caps; } - int get_wanted() { return wanted; } - capseq_t get_seq() { return seq; } - - // for cap migration - int get_mds() { return mds; } - int get_op() { return op; } - - void set_caps(int c) { caps = c; } - void set_wanted(int w) { wanted = w; } - - void set_mds(int m) { mds = m; } - void set_op(int s) { op = s; } - - MClientFileCaps() {} - MClientFileCaps(int op_, - inode_t& inode_, - long seq_, - int caps_, - int wanted_, - int mds_=0) : - Message(MSG_CLIENT_FILECAPS), - op(op_), - inode(inode_), - seq(seq_), - caps(caps_), - wanted(wanted_), - mds(mds_) { } - - char *get_type_name() { return "Cfcap";} - void print(ostream& out) { - out << "client_file_caps(" << get_opname(op) - << " " << inode.ino - << " seq " << seq - << " caps " << cap_string(caps) - << " wanted" << cap_string(wanted) - << ")"; - } - - void decode_payload() { - int off = 0; - ::_decode(op, payload, off); - ::_decode(seq, payload, off); - ::_decode(inode, payload, off); - ::_decode(caps, payload, off); - 
::_decode(wanted, payload, off); - ::_decode(mds, payload, off); - } - void encode_payload() { - ::_encode(op, payload); - ::_encode(seq, payload); - ::_encode(inode, payload); - ::_encode(caps, payload); - ::_encode(wanted, payload); - ::_encode(mds, payload); - } -}; - -#endif diff --git a/branches/sage/crush/messages/MClientMount.h b/branches/sage/crush/messages/MClientMount.h deleted file mode 100644 index a49b558c7f040..0000000000000 --- a/branches/sage/crush/messages/MClientMount.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MCLIENTMOUNT_H -#define __MCLIENTMOUNT_H - -#include "msg/Message.h" - -class MClientMount : public Message { -public: - entity_addr_t addr; - int32_t instance; // on this node - - MClientMount() : Message(MSG_CLIENT_MOUNT) { } - MClientMount(entity_addr_t a, int i = 0) : - Message(MSG_CLIENT_MOUNT), - addr(a), instance(i) { } - - char *get_type_name() { return "client_mount"; } - - void decode_payload() { - int off = 0; - ::_decode(addr, payload, off); - ::_decode(instance, payload, off); - } - void encode_payload() { - ::_encode(addr, payload); - ::_encode(instance, payload); - } -}; - -#endif diff --git a/branches/sage/crush/messages/MClientReconnect.h b/branches/sage/crush/messages/MClientReconnect.h deleted file mode 100644 index bf1fbacd4b75c..0000000000000 --- a/branches/sage/crush/messages/MClientReconnect.h +++ /dev/null @@ -1,59 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage 
Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MCLIENTRECONNECT_H -#define __MCLIENTRECONNECT_H - -#include "msg/Message.h" -#include "mds/mdstypes.h" - -class MClientReconnect : public Message { -public: - map inode_caps; - map inode_path; - bool closed; // true if this session was closed by the client. - - MClientReconnect() : Message(MSG_CLIENT_RECONNECT), - closed(false) { } - - char *get_type_name() { return "client_reconnect"; } - void print(ostream& out) { - out << "client_reconnect(" << inode_caps.size() << " caps)"; - } - - void add_inode_caps(inodeno_t ino, - int wanted, int issued, - off_t sz, utime_t mt, utime_t at) { - inode_caps[ino] = inode_caps_reconnect_t(wanted, issued, sz, mt, at); - } - void add_inode_path(inodeno_t ino, const string& path) { - inode_path[ino] = path; - } - - void encode_payload() { - ::_encode(closed, payload); - ::_encode(inode_caps, payload); - ::_encode(inode_path, payload); - } - void decode_payload() { - int off = 0; - ::_decode(closed, payload, off); - ::_decode(inode_caps, payload, off); - ::_decode(inode_path, payload, off); - } - -}; - - -#endif diff --git a/branches/sage/crush/messages/MClientReply.h b/branches/sage/crush/messages/MClientReply.h deleted file mode 100644 index 760dcc971ebad..0000000000000 --- a/branches/sage/crush/messages/MClientReply.h +++ /dev/null @@ -1,285 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MCLIENTREPLY_H -#define __MCLIENTREPLY_H - -#include "include/types.h" -#include "include/encodable.h" -#include "MClientRequest.h" - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "mds/CDir.h" -#include "mds/CDentry.h" - -#include -using namespace std; - -class CInode; - -/*** - * - * MClientReply - container message for MDS reply to a client's MClientRequest - * - * key fields: - * long tid - transaction id, so the client can match up with pending request - * int result - error code, or fh if it was open - * - * for most requests: - * trace is a vector of InodeStat's tracing from root to the file/dir/whatever - * the operation referred to, so that the client can update it's info about what - * metadata lives on what MDS. - * - * for readdir replies: - * dir_contents is a vector of InodeStat*'s. - * - * that's mostly it, i think! - * - */ - -struct DirStat { - // mds distribution hints - frag_t frag; - int auth; - set dist; - bool is_rep; - - DirStat() {} - DirStat(bufferlist::iterator& p) { - _decode(p); - } - - void _decode(bufferlist::iterator& p) { - ::_decode_simple(frag, p); - ::_decode_simple(auth, p); - ::_decode_simple(dist, p); - ::_decode_simple(is_rep, p); - } - - static void _encode(bufferlist& bl, CDir *dir, int whoami) { - frag_t frag = dir->get_frag(); - int auth; - set dist; - bool is_rep; - - auth = dir->get_dir_auth().first; - if (dir->is_auth()) - dir->get_dist_spec(dist, whoami); - is_rep = dir->is_rep(); - - ::_encode_simple(frag, bl); - ::_encode_simple(auth, bl); - ::_encode_simple(dist, bl); - ::_encode_simple(is_rep, bl); - } -}; - -struct InodeStat { - inode_t inode; - string symlink; // symlink content (if symlink) - fragtree_t dirfragtree; - uint32_t mask; - - public: - InodeStat() {} - InodeStat(bufferlist::iterator& p) { - _decode(p); - } - - void _decode(bufferlist::iterator &p) { - ::_decode_simple(mask, p); - ::_decode_simple(inode, p); - ::_decode_simple(symlink, p); - dirfragtree._decode(p); 
- } - - static void _encode(bufferlist &bl, CInode *in) { - int mask = STAT_MASK_INO|STAT_MASK_TYPE|STAT_MASK_BASE; - - // mask - if (in->authlock.can_rdlock(0)) mask |= STAT_MASK_AUTH; - if (in->linklock.can_rdlock(0)) mask |= STAT_MASK_LINK; - if (in->filelock.can_rdlock(0)) mask |= STAT_MASK_FILE; - - ::_encode_simple(mask, bl); - ::_encode_simple(in->inode, bl); - ::_encode_simple(in->symlink, bl); - in->dirfragtree._encode(bl); - } - -}; - - -class MClientReply : public Message { - // reply data - struct st_ { - long tid; - int op; - int result; // error code - unsigned char file_caps; // for open - long file_caps_seq; - uint64_t file_data_version; // for client buffercache consistency - } st; - - string path; - - list trace_in; - list trace_dir; - list trace_dn; - bufferlist trace_bl; - - DirStat *dir_dir; - list dir_in; - list dir_dn; - bufferlist dir_bl; - - public: - long get_tid() { return st.tid; } - int get_op() { return st.op; } - - int get_result() { return st.result; } - const string& get_path() { return path; } - - inodeno_t get_ino() { return trace_in.back()->inode.ino; } - const inode_t& get_inode() { return trace_in.back()->inode; } - - unsigned char get_file_caps() { return st.file_caps; } - long get_file_caps_seq() { return st.file_caps_seq; } - uint64_t get_file_data_version() { return st.file_data_version; } - - void set_result(int r) { st.result = r; } - void set_file_caps(unsigned char c) { st.file_caps = c; } - void set_file_caps_seq(long s) { st.file_caps_seq = s; } - void set_file_data_version(uint64_t v) { st.file_data_version = v; } - - MClientReply() : dir_dir(0) {}; - MClientReply(MClientRequest *req, int result = 0) : - Message(MSG_CLIENT_REPLY), dir_dir(0) { - memset(&st, 0, sizeof(st)); - this->st.tid = req->get_tid(); - this->st.op = req->get_op(); - this->path = req->get_path(); - - this->st.result = result; - } - virtual ~MClientReply() { - list::iterator it; - - for (it = trace_in.begin(); it != trace_in.end(); ++it) - delete 
*it; - for (it = dir_in.begin(); it != dir_in.end(); ++it) - delete *it; - } - virtual char *get_type_name() { return "creply"; } - void print(ostream& o) { - o << "creply(" << env.dst.name << "." << st.tid; - o << " = " << st.result; - if (st.result <= 0) - o << " " << strerror(-st.result); - o << ")"; - } - - // serialization - virtual void decode_payload() { - bufferlist::iterator p = payload.begin(); - ::_decode_simple(st, p); - ::_decode_simple(path, p); - ::_decode_simple(trace_bl, p); - ::_decode_simple(dir_bl, p); - assert(p.end()); - } - virtual void encode_payload() { - ::_encode_simple(st, payload); - ::_encode_simple(path, payload); - ::_encode_simple(trace_bl, payload); - ::_encode_simple(dir_bl, payload); - } - - - // dir contents - void take_dir_items(bufferlist& bl) { - dir_bl.claim(bl); - } - void _decode_dir() { - bufferlist::iterator p = dir_bl.begin(); - dir_dir = new DirStat(p); - while (!p.end()) { - string dn; - ::_decode_simple(dn, p); - dir_dn.push_back(dn); - dir_in.push_back(new InodeStat(p)); - } - } - - const list& get_dir_in() { - if (dir_in.empty() && dir_bl.length()) _decode_dir(); - return dir_in; - } - const list& get_dir_dn() { - if (dir_dn.empty() && dir_bl.length()) _decode_dir(); - return dir_dn; - } - const DirStat* get_dir_dir() { - return dir_dir; - } - - - // trace - void set_trace_dist(CInode *in, int whoami) { - // inode, dentry, dir, ..., inode - while (in) { - InodeStat::_encode(trace_bl, in); - CDentry *dn = in->get_parent_dn(); - if (!dn) break; - ::_encode_simple(in->get_parent_dn()->get_name(), trace_bl); - DirStat::_encode(trace_bl, dn->get_dir(), whoami); - in = dn->get_dir()->get_inode(); - } - } - void _decode_trace() { - bufferlist::iterator p = trace_bl.begin(); - while (!p.end()) { - // inode - trace_in.push_front(new InodeStat(p)); - if (!p.end()) { - // dentry - string ref_dn; - ::_decode_simple(ref_dn, p); - trace_dn.push_front(ref_dn); - - // dir - trace_dir.push_front(new DirStat(p)); - } - } - } - - 
const list& get_trace_in() { - if (trace_in.empty() && trace_bl.length()) _decode_trace(); - return trace_in; - } - const list& get_trace_dir() { - if (trace_in.empty() && trace_bl.length()) _decode_trace(); - return trace_dir; - } - const list& get_trace_dn() { - if (trace_in.empty() && trace_bl.length()) _decode_trace(); - return trace_dn; - } - - -}; - -#endif diff --git a/branches/sage/crush/messages/MClientRequest.h b/branches/sage/crush/messages/MClientRequest.h deleted file mode 100644 index 8f03044cf5a4f..0000000000000 --- a/branches/sage/crush/messages/MClientRequest.h +++ /dev/null @@ -1,325 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MCLIENTREQUEST_H -#define __MCLIENTREQUEST_H - -/** - * - * MClientRequest - container for a client METADATA request. created/sent by clients. - * can be forwarded around between MDS's. - * - * int client - the originating client - * long tid - transaction id, unique among requests for that client. probably just a counter! - * -> the MDS passes the Request to the Reply constructor, so this always matches. - * - * int op - the metadata op code. MDS_OP_RENAME, etc. - * int caller_uid, _gid - guess - * - * fixed size arguments are in a union. - * there's also a string argument, for e.g. symlink(). - * - */ - -#include "msg/Message.h" -#include "include/filepath.h" -#include "mds/mdstypes.h" - -#include -#include -#include -#include -#include - - -// metadata ops. -// >=1000 --> an update, non-idempotent (i.e. 
an update) -#define MDS_OP_STATFS 1 - -#define MDS_OP_STAT 100 -#define MDS_OP_LSTAT 101 -#define MDS_OP_FSTAT 102 -#define MDS_OP_UTIME 1102 -#define MDS_OP_CHMOD 1104 -#define MDS_OP_CHOWN 1105 - -#define MDS_OP_READDIR 200 -#define MDS_OP_MKNOD 1201 -#define MDS_OP_LINK 1202 -#define MDS_OP_UNLINK 1203 -#define MDS_OP_RENAME 1204 - -#define MDS_OP_MKDIR 1220 -#define MDS_OP_RMDIR 1221 -#define MDS_OP_SYMLINK 1222 - -#define MDS_OP_OPEN 301 -#define MDS_OP_TRUNCATE 1306 -#define MDS_OP_FSYNC 307 - -#define MDS_OP_RELEASE 308 // used only by SyntheticClient op_dist thinger - - -class MClientRequest : public Message { - struct { - tid_t tid; - tid_t oldest_client_tid; - int num_fwd; - int retry_attempt; - inodeno_t mds_wants_replica_in_dirino; - - entity_inst_t client_inst; - - int op; - int caller_uid, caller_gid; - inodeno_t cwd_ino; - } st; - - // path arguments - filepath path; - string sarg; - - public: - // fixed size arguments. in a union. - // note: nothing with a constructor can go here; use underlying base - // types for _inodeno_t, _frag_t. 
- union { - struct { - int mask; - } stat; - struct { - _inodeno_t ino; - int mask; - } fstat; - struct { - _frag_t frag; - } readdir; - struct { - _utime_t mtime; - _utime_t atime; - } utime; - struct { - mode_t mode; - } chmod; - struct { - uid_t uid; - gid_t gid; - } chown; - struct { - mode_t mode; - dev_t rdev; - } mknod; - struct { - mode_t mode; - } mkdir; - struct { - int flags; - mode_t mode; - } open; - struct { - _inodeno_t ino; // optional - off_t length; - } truncate; - struct { - _inodeno_t ino; - } fsync; - } args; - - // cons - MClientRequest() : Message(MSG_CLIENT_REQUEST) {} - MClientRequest(int op, entity_inst_t ci) : Message(MSG_CLIENT_REQUEST) { - memset(&st, 0, sizeof(st)); - memset(&args, 0, sizeof(args)); - this->st.op = op; - this->st.client_inst = ci; - } - - metareqid_t get_reqid() { - // FIXME: for now, assume clients always have 1 incarnation - return metareqid_t(st.client_inst.name.num(), st.tid); - } - - int get_open_file_mode() { - if (args.open.flags & O_LAZY) - return FILE_MODE_LAZY; - if (args.open.flags & O_WRONLY) - return FILE_MODE_W; - if (args.open.flags & O_RDWR) - return FILE_MODE_RW; - if (args.open.flags & O_APPEND) - return FILE_MODE_W; - return FILE_MODE_R; - } - bool open_file_mode_is_readonly() { - return get_open_file_mode() == FILE_MODE_R; - } - bool is_idempotent() { - if (st.op == MDS_OP_OPEN) - return open_file_mode_is_readonly(); - return (st.op < 1000); - } - bool auth_is_best() { - if (!is_idempotent()) return true; - if (st.op == MDS_OP_READDIR) return true; - return false; - } - bool follow_trailing_symlink() { - switch (st.op) { - case MDS_OP_LSTAT: - case MDS_OP_FSTAT: - case MDS_OP_LINK: - case MDS_OP_UNLINK: - case MDS_OP_RENAME: - return false; - - case MDS_OP_STAT: - case MDS_OP_UTIME: - case MDS_OP_CHMOD: - case MDS_OP_CHOWN: - case MDS_OP_READDIR: - case MDS_OP_OPEN: - case MDS_OP_TRUNCATE: - - case MDS_OP_FSYNC: - case MDS_OP_MKNOD: - case MDS_OP_MKDIR: - case MDS_OP_RMDIR: - case MDS_OP_SYMLINK: - 
return true; - - default: - assert(0); - return false; - } - } - - - - // normal fields - void set_tid(tid_t t) { st.tid = t; } - void set_oldest_client_tid(tid_t t) { st.oldest_client_tid = t; } - void inc_num_fwd() { st.num_fwd++; } - void set_retry_attempt(int a) { st.retry_attempt = a; } - void set_path(string& p) { path.set_path(p); } - void set_path(const char *p) { path.set_path(p); } - void set_path(const filepath& fp) { path = fp; } - void set_caller_uid(int u) { st.caller_uid = u; } - void set_caller_gid(int g) { st.caller_gid = g; } - void set_sarg(string& arg) { this->sarg = arg; } - void set_sarg(const char *arg) { this->sarg = arg; } - void set_mds_wants_replica_in_dirino(inodeno_t dirino) { - st.mds_wants_replica_in_dirino = dirino; } - - void set_client_inst(const entity_inst_t& i) { st.client_inst = i; } - const entity_inst_t& get_client_inst() { return st.client_inst; } - - int get_client() { return st.client_inst.name.num(); } - tid_t get_tid() { return st.tid; } - tid_t get_oldest_client_tid() { return st.oldest_client_tid; } - int get_num_fwd() { return st.num_fwd; } - int get_retry_attempt() { return st.retry_attempt; } - int get_op() { return st.op; } - int get_caller_uid() { return st.caller_uid; } - int get_caller_gid() { return st.caller_gid; } - //inodeno_t get_ino() { return st.ino; } - const string& get_path() { return path.get_path(); } - filepath& get_filepath() { return path; } - string& get_sarg() { return sarg; } - inodeno_t get_mds_wants_replica_in_dirino() { - return st.mds_wants_replica_in_dirino; } - - inodeno_t get_cwd_ino() { return st.cwd_ino ? 
st.cwd_ino:inodeno_t(MDS_INO_ROOT); } - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - payload.copy(off, sizeof(args), (char*)&args); - off += sizeof(args); - path._decode(payload, off); - ::_decode(sarg, payload, off); - } - - void encode_payload() { - payload.append((char*)&st, sizeof(st)); - payload.append((char*)&args, sizeof(args)); - path._encode(payload); - ::_encode(sarg, payload); - } - - char *get_type_name() { return "creq"; } - void print(ostream& out) { - out << "clientreq(client" << get_client() - << "." << get_tid() - << " "; - switch(get_op()) { - case MDS_OP_STATFS: - out << "statfs"; break; - - case MDS_OP_STAT: - out << "stat"; break; - case MDS_OP_LSTAT: - out << "lstat"; break; - case MDS_OP_FSTAT: - out << "fstat"; break; - case MDS_OP_UTIME: - out << "utime"; break; - case MDS_OP_CHMOD: - out << "chmod"; break; - case MDS_OP_CHOWN: - out << "chown"; break; - - case MDS_OP_READDIR: - out << "readdir"; break; - case MDS_OP_MKNOD: - out << "mknod"; break; - case MDS_OP_LINK: - out << "link"; break; - case MDS_OP_UNLINK: - out << "unlink"; break; - case MDS_OP_RENAME: - out << "rename"; break; - - case MDS_OP_MKDIR: - out << "mkdir"; break; - case MDS_OP_RMDIR: - out << "rmdir"; break; - case MDS_OP_SYMLINK: - out << "symlink"; break; - - case MDS_OP_OPEN: - out << "open"; break; - case MDS_OP_TRUNCATE: - out << "truncate"; break; - case MDS_OP_FSYNC: - out << "fsync"; break; - // case MDS_OP_RELEASE: - //out << "release"; break; - default: - out << "unknown=" << get_op(); - assert(0); - } - if (get_path().length()) - out << " " << get_path(); - if (get_sarg().length()) - out << " " << get_sarg(); - if (st.retry_attempt) - out << " RETRY=" << st.retry_attempt; - out << ")"; - } - -}; - -#endif diff --git a/branches/sage/crush/messages/MClientRequestForward.h b/branches/sage/crush/messages/MClientRequestForward.h deleted file mode 100644 index 53fb5270d30a9..0000000000000 --- 
a/branches/sage/crush/messages/MClientRequestForward.h +++ /dev/null @@ -1,59 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MCLIENTREQUESTFORWARD_H -#define __MCLIENTREQUESTFORWARD_H - -class MClientRequestForward : public Message { - tid_t tid; - int32_t dest_mds; - int32_t num_fwd; - - public: - MClientRequestForward() : Message(MSG_CLIENT_REQUEST_FORWARD) {} - MClientRequestForward(tid_t t, int dm, int nf) : - Message(MSG_CLIENT_REQUEST_FORWARD), - tid(t), dest_mds(dm), num_fwd(nf) { } - - tid_t get_tid() { return tid; } - int get_dest_mds() { return dest_mds; } - int get_num_fwd() { return num_fwd; } - - char *get_type_name() { return "cfwd"; } - void print(ostream& o) { - o << "client_request_forward(" << tid - << " to " << dest_mds - << " num_fwd=" << num_fwd - << ")"; - } - - void encode_payload() { - payload.append((char*)&tid, sizeof(tid)); - payload.append((char*)&dest_mds, sizeof(dest_mds)); - payload.append((char*)&num_fwd, sizeof(num_fwd)); - } - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(tid), (char*)&tid); - off += sizeof(tid); - payload.copy(off, sizeof(dest_mds), (char*)&dest_mds); - off += sizeof(dest_mds); - payload.copy(off, sizeof(num_fwd), (char*)&num_fwd); - off += sizeof(num_fwd); - } -}; - -#endif diff --git a/branches/sage/crush/messages/MClientSession.h b/branches/sage/crush/messages/MClientSession.h deleted file mode 100644 index dc4252ac73d8e..0000000000000 --- a/branches/sage/crush/messages/MClientSession.h +++ /dev/null @@ -1,62 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 
sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MCLIENTSESSION_H -#define __MCLIENTSESSION_H - -#include "msg/Message.h" - -class MClientSession : public Message { -public: - const static int OP_REQUEST_OPEN = 1; - const static int OP_OPEN = 2; - const static int OP_REQUEST_CLOSE = 3; - const static int OP_CLOSE = 4; - static const char *get_opname(int o) { - switch (o) { - case OP_REQUEST_OPEN: return "request_open"; - case OP_OPEN: return "open"; - case OP_REQUEST_CLOSE: return "request_close"; - case OP_CLOSE: return "close"; - default: assert(0); return 0; - } - } - - int32_t op; - version_t seq; - - MClientSession() : Message(MSG_CLIENT_SESSION) { } - MClientSession(int o, version_t s=0) : - Message(MSG_CLIENT_SESSION), - op(o), seq(s) { } - - char *get_type_name() { return "client_session"; } - void print(ostream& out) { - out << "client_session(" << get_opname(op); - if (seq) out << " seq " << seq; - out << ")"; - } - - void decode_payload() { - int off = 0; - ::_decode(op, payload, off); - ::_decode(seq, payload, off); - } - void encode_payload() { - ::_encode(op, payload); - ::_encode(seq, payload); - } -}; - -#endif diff --git a/branches/sage/crush/messages/MClientUnmount.h b/branches/sage/crush/messages/MClientUnmount.h deleted file mode 100644 index 42fa07db7ba05..0000000000000 --- a/branches/sage/crush/messages/MClientUnmount.h +++ /dev/null @@ -1,40 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General 
Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MCLIENTUNMOUNT_H -#define __MCLIENTUNMOUNT_H - -#include "msg/Message.h" - -class MClientUnmount : public Message { -public: - entity_inst_t inst; - - MClientUnmount() : Message(MSG_CLIENT_UNMOUNT) { } - MClientUnmount(entity_inst_t i) : - Message(MSG_CLIENT_UNMOUNT), - inst(i) { } - - char *get_type_name() { return "client_unmount"; } - - void decode_payload() { - int off = 0; - ::_decode(inst, payload, off); - } - void encode_payload() { - ::_encode(inst, payload); - } -}; - -#endif diff --git a/branches/sage/crush/messages/MDirUpdate.h b/branches/sage/crush/messages/MDirUpdate.h deleted file mode 100644 index 0db32208efd45..0000000000000 --- a/branches/sage/crush/messages/MDirUpdate.h +++ /dev/null @@ -1,71 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MDIRUPDATE_H -#define __MDIRUPDATE_H - -#include "msg/Message.h" - -class MDirUpdate : public Message { - struct { - dirfrag_t dirfrag; - int dir_rep; - int discover; - } st; - set dir_rep_by; - string path; - - public: - dirfrag_t get_dirfrag() { return st.dirfrag; } - int get_dir_rep() { return st.dir_rep; } - set& get_dir_rep_by() { return dir_rep_by; } - bool should_discover() { return st.discover > 0; } - string& get_path() { return path; } - - void tried_discover() { - if (st.discover) st.discover--; - } - - MDirUpdate() {} - MDirUpdate(dirfrag_t dirfrag, - int dir_rep, - set& dir_rep_by, - string& path, - bool discover = false) : - Message(MSG_MDS_DIRUPDATE) { - this->st.dirfrag = dirfrag; - this->st.dir_rep = dir_rep; - this->dir_rep_by = dir_rep_by; - if (discover) this->st.discover = 5; - this->path = path; - } - virtual char *get_type_name() { return "dir_update"; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - ::_decode(dir_rep_by, payload, off); - ::_decode(path, payload, off); - } - - virtual void encode_payload() { - payload.append((char*)&st, sizeof(st)); - ::_encode(dir_rep_by, payload); - ::_encode(path, payload); - } -}; - -#endif diff --git a/branches/sage/crush/messages/MExportDirDiscover.h b/branches/sage/crush/messages/MExportDirDiscover.h deleted file mode 100644 index c311d1e87e940..0000000000000 --- a/branches/sage/crush/messages/MExportDirDiscover.h +++ /dev/null @@ -1,59 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MEXPORTDIRDISCOVER_H -#define __MEXPORTDIRDISCOVER_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirDiscover : public Message { - dirfrag_t dirfrag; - string path; - - public: - inodeno_t get_ino() { return dirfrag.ino; } - dirfrag_t get_dirfrag() { return dirfrag; } - string& get_path() { return path; } - - bool started; - - MExportDirDiscover() : - Message(MSG_MDS_EXPORTDIRDISCOVER), - started(false) { } - MExportDirDiscover(CDir *dir) : - Message(MSG_MDS_EXPORTDIRDISCOVER), - started(false) { - dir->get_inode()->make_path(path); - dirfrag = dir->dirfrag(); - } - virtual char *get_type_name() { return "ExDis"; } - void print(ostream& o) { - o << "export_discover(" << dirfrag << " " << path << ")"; - } - - virtual void decode_payload() { - bufferlist::iterator p = payload.begin(); - ::_decode_simple(dirfrag, p); - ::_decode_simple(path, p); - } - - virtual void encode_payload() { - ::_encode_simple(dirfrag, payload); - ::_encode_simple(path, payload); - } -}; - -#endif diff --git a/branches/sage/crush/messages/MLock.h b/branches/sage/crush/messages/MLock.h deleted file mode 100644 index 95c3e5f325212..0000000000000 --- a/branches/sage/crush/messages/MLock.h +++ /dev/null @@ -1,126 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MLOCK_H -#define __MLOCK_H - -#include "msg/Message.h" -#include "mds/SimpleLock.h" - -// for replicas -#define LOCK_AC_SYNC -1 -#define LOCK_AC_MIXED -2 -#define LOCK_AC_LOCK -3 - -#define LOCK_AC_SCATTER -6 - -// for auth -#define LOCK_AC_SYNCACK 1 -#define LOCK_AC_MIXEDACK 2 -#define LOCK_AC_LOCKACK 3 - -#define LOCK_AC_REQSCATTER 7 -#define LOCK_AC_REQUNSCATTER 8 - -#define LOCK_AC_FOR_REPLICA(a) ((a) < 0) -#define LOCK_AC_FOR_AUTH(a) ((a) > 0) - - -static const char *get_lock_action_name(int a) { - switch (a) { - case LOCK_AC_SYNC: return "sync"; - case LOCK_AC_MIXED: return "mixed"; - case LOCK_AC_LOCK: return "lock"; - case LOCK_AC_SCATTER: return "scatter"; - case LOCK_AC_SYNCACK: return "syncack"; - case LOCK_AC_MIXEDACK: return "mixedack"; - case LOCK_AC_LOCKACK: return "lockack"; - case LOCK_AC_REQSCATTER: return "reqscatter"; - case LOCK_AC_REQUNSCATTER: return "requnscatter"; - default: assert(0); return 0; - } -} - - -class MLock : public Message { - int32_t action; // action type - int32_t asker; // who is initiating this request - metareqid_t reqid; // for remote lock requests - - char lock_type; // lock object type - MDSCacheObjectInfo object_info; - - bufferlist data; // and possibly some data - - public: - bufferlist& get_data() { return data; } - int get_asker() { return asker; } - int get_action() { return action; } - metareqid_t get_reqid() { return reqid; } - - int get_lock_type() { return lock_type; } - MDSCacheObjectInfo &get_object_info() { return object_info; } - - MLock() {} - MLock(int ac, int as) : - Message(MSG_MDS_LOCK), - action(ac), asker(as), - lock_type(0) { } - MLock(SimpleLock *lock, int ac, int as) : - Message(MSG_MDS_LOCK), - action(ac), asker(as), - lock_type(lock->get_type()) { - lock->get_parent()->set_object_info(object_info); - } - MLock(SimpleLock *lock, int ac, int as, bufferlist& bl) : - Message(MSG_MDS_LOCK), - action(ac), asker(as), lock_type(lock->get_type()) { - 
lock->get_parent()->set_object_info(object_info); - data.claim(bl); - } - virtual char *get_type_name() { return "ILock"; } - void print(ostream& out) { - out << "lock(a=" << get_lock_action_name(action) - << " " << get_lock_type_name(lock_type) - << " " << object_info - << ")"; - } - - void set_reqid(metareqid_t ri) { reqid = ri; } - void set_data(const bufferlist& data) { - this->data = data; - } - - void decode_payload() { - int off = 0; - ::_decode(asker, payload, off); - ::_decode(action, payload, off); - ::_decode(reqid, payload, off); - ::_decode(lock_type, payload, off); - object_info._decode(payload, off); - ::_decode(data, payload, off); - } - virtual void encode_payload() { - ::_encode(asker, payload); - ::_encode(action, payload); - ::_encode(reqid, payload); - ::_encode(lock_type, payload); - object_info._encode(payload); - ::_encode(data, payload); - } - -}; - -#endif diff --git a/branches/sage/crush/messages/MMDSBeacon.h b/branches/sage/crush/messages/MMDSBeacon.h deleted file mode 100644 index c18a05e77f1a8..0000000000000 --- a/branches/sage/crush/messages/MMDSBeacon.h +++ /dev/null @@ -1,63 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMDSBEACON_H -#define __MMDSBEACON_H - -#include "msg/Message.h" - -#include "include/types.h" - -#include "mds/MDSMap.h" - -class MMDSBeacon : public Message { - entity_inst_t inst; - epoch_t last_epoch_seen; // include last mdsmap epoch mds has seen to avoid race with monitor decree - int state; - version_t seq; - - public: - MMDSBeacon() : Message(MSG_MDS_BEACON) {} - MMDSBeacon(entity_inst_t i, epoch_t les, int st, version_t se) : - Message(MSG_MDS_BEACON), - inst(i), last_epoch_seen(les), state(st), seq(se) { } - - entity_inst_t& get_mds_inst() { return inst; } - epoch_t get_last_epoch_seen() { return last_epoch_seen; } - int get_state() { return state; } - version_t get_seq() { return seq; } - char *get_type_name() { return "mdsbeacon"; } - - void print(ostream& out) { - out << "mdsbeacon(" << inst - << " " << MDSMap::get_state_name(state) - << " seq " << seq << ")"; - } - - void encode_payload() { - ::_encode(inst, payload); - ::_encode(last_epoch_seen, payload); - ::_encode(state, payload); - ::_encode(seq, payload); - } - void decode_payload() { - int off = 0; - ::_decode(inst, payload, off); - ::_decode(last_epoch_seen, payload, off); - ::_decode(state, payload, off); - ::_decode(seq, payload, off); - } -}; - -#endif diff --git a/branches/sage/crush/messages/MMDSGetMap.h b/branches/sage/crush/messages/MMDSGetMap.h deleted file mode 100644 index eab9a3506a40b..0000000000000 --- a/branches/sage/crush/messages/MMDSGetMap.h +++ /dev/null @@ -1,39 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMDSGETMAP_H -#define __MMDSGETMAP_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMDSGetMap : public Message { - public: - MMDSGetMap() : Message(MSG_MDS_GETMAP) { - } - - char *get_type_name() { return "mdsgetmap"; } - - void encode_payload() { - //payload.append((char*)&sb, sizeof(sb)); - } - void decode_payload() { - //int off = 0; - //payload.copy(off, sizeof(sb), (char*)&sb); - //off += sizeof(sb); - } -}; - -#endif diff --git a/branches/sage/crush/messages/MMDSMap.h b/branches/sage/crush/messages/MMDSMap.h deleted file mode 100644 index 164e547cc513a..0000000000000 --- a/branches/sage/crush/messages/MMDSMap.h +++ /dev/null @@ -1,79 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMDSMAP_H -#define __MMDSMAP_H - -#include "msg/Message.h" -#include "mds/MDSMap.h" - -class MMDSMap : public Message { - public: - /* - map maps; - map incremental_maps; - - epoch_t get_first() { - epoch_t e = 0; - map::iterator i = maps.begin(); - if (i != maps.end()) e = i->first; - i = incremental_maps.begin(); - if (i != incremental_maps.end() && - (e == 0 || i->first < e)) e = i->first; - return e; - } - epoch_t get_last() { - epoch_t e = 0; - map::reverse_iterator i = maps.rbegin(); - if (i != maps.rend()) e = i->first; - i = incremental_maps.rbegin(); - if (i != incremental_maps.rend() && - (e == 0 || i->first > e)) e = i->first; - return e; - } - */ - - version_t epoch; - bufferlist encoded; - - version_t get_epoch() const { return epoch; } - bufferlist& get_encoded() { return encoded; } - - MMDSMap() : - Message(MSG_MDS_MAP) {} - MMDSMap(MDSMap *mm) : - Message(MSG_MDS_MAP) { - epoch = mm->get_epoch(); - mm->encode(encoded); - } - - char *get_type_name() { return "mdsmap"; } - void print(ostream& out) { - out << "mdsmap(e " << epoch << ")"; - } - - // marshalling - void decode_payload() { - int off = 0; - ::_decode(epoch, payload, off); - ::_decode(encoded, payload, off); - } - void encode_payload() { - ::_encode(epoch, payload); - ::_encode(encoded, payload); - } -}; - -#endif diff --git a/branches/sage/crush/messages/MMDSSlaveRequest.h b/branches/sage/crush/messages/MMDSSlaveRequest.h deleted file mode 100644 index 5ef65223ec1c9..0000000000000 --- a/branches/sage/crush/messages/MMDSSlaveRequest.h +++ /dev/null @@ -1,148 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMDSSLAVEREQUEST_H -#define __MMDSSLAVEREQUEST_H - -#include "msg/Message.h" -#include "mds/mdstypes.h" -#include "include/encodable.h" - -class MMDSSlaveRequest : public Message { - public: - static const int OP_XLOCK = 1; - static const int OP_XLOCKACK = -1; - static const int OP_UNXLOCK = 2; - static const int OP_AUTHPIN = 3; - static const int OP_AUTHPINACK = -3; - - static const int OP_LINKPREP = 4; - static const int OP_UNLINKPREP = 5; - static const int OP_LINKPREPACK = -4; - - static const int OP_RENAMEPREP = 7; - static const int OP_RENAMEPREPACK = -7; - - static const int OP_FINISH = 17; - - static const int OP_ABORT = 20; // used for recovery only - //static const int OP_COMMIT = 21; // used for recovery only - - - const static char *get_opname(int o) { - switch (o) { - case OP_XLOCK: return "xlock"; - case OP_XLOCKACK: return "xlock_ack"; - case OP_UNXLOCK: return "unxlock"; - case OP_AUTHPIN: return "authpin"; - case OP_AUTHPINACK: return "authpin_ack"; - - case OP_LINKPREP: return "link_prep"; - case OP_LINKPREPACK: return "link_prep_ack"; - case OP_UNLINKPREP: return "unlink_prep"; - - case OP_RENAMEPREP: return "rename_prep"; - case OP_RENAMEPREPACK: return "rename_prep_ack"; - - case OP_FINISH: return "finish"; // commit - case OP_ABORT: return "abort"; - //case OP_COMMIT: return "commit"; - - default: assert(0); return 0; - } - } - - private: - metareqid_t reqid; - char op; - - // for locking - char lock_type; // lock object type - MDSCacheObjectInfo object_info; - - // for authpins - list authpins; - - public: - // for rename prep - string srcdnpath; - string destdnpath; - set witnesses; - bufferlist inode_export; - version_t inode_export_v; - bufferlist srci_replica; - utime_t now; - - bufferlist stray; // stray dir + dentry - -public: - metareqid_t get_reqid() { return reqid; } - int get_op() { return op; } - bool is_reply() { return op < 0; } - - int get_lock_type() { return lock_type; } - MDSCacheObjectInfo 
&get_object_info() { return object_info; } - - list& get_authpins() { return authpins; } - - void set_lock_type(int t) { lock_type = t; } - - // ---- - MMDSSlaveRequest() : Message(MSG_MDS_SLAVE_REQUEST) { } - MMDSSlaveRequest(metareqid_t ri, int o) : - Message(MSG_MDS_SLAVE_REQUEST), - reqid(ri), op(o) { } - void encode_payload() { - ::_encode(reqid, payload); - ::_encode(op, payload); - ::_encode(lock_type, payload); - object_info._encode(payload); - ::_encode_complex(authpins, payload); - ::_encode(srcdnpath, payload); - ::_encode(destdnpath, payload); - ::_encode(witnesses, payload); - ::_encode(now, payload); - ::_encode(inode_export, payload); - ::_encode(inode_export_v, payload); - ::_encode(srci_replica, payload); - ::_encode(stray, payload); - } - void decode_payload() { - bufferlist::iterator p = payload.begin(); - ::_decode_simple(reqid, p); - ::_decode_simple(op, p); - ::_decode_simple(lock_type, p); - object_info._decode(p); - ::_decode_complex(authpins, p); - ::_decode_simple(srcdnpath, p); - ::_decode_simple(destdnpath, p); - ::_decode_simple(witnesses, p); - ::_decode_simple(now, p); - ::_decode_simple(inode_export, p); - ::_decode_simple(inode_export_v, p); - ::_decode_simple(srci_replica, p); - ::_decode_simple(stray, p); - } - - char *get_type_name() { return "slave_request"; } - void print(ostream& out) { - out << "slave_request(" << reqid - << " " << get_opname(op) - << ")"; - } - -}; - -#endif diff --git a/branches/sage/crush/messages/MOSDGetMap.h b/branches/sage/crush/messages/MOSDGetMap.h deleted file mode 100644 index 25f94ef3bcc92..0000000000000 --- a/branches/sage/crush/messages/MOSDGetMap.h +++ /dev/null @@ -1,51 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * 
License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MOSDGETMAP_H -#define __MOSDGETMAP_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MOSDGetMap : public Message { - public: - epoch_t start, want; - - MOSDGetMap(epoch_t s=0, epoch_t w=0) : - Message(MSG_OSD_GETMAP), - start(s), want(w) { } - - epoch_t get_start_epoch() { return start; } - epoch_t get_want_epoch() { return want; } - - char *get_type_name() { return "get_osd_map"; } - void print(ostream& out) { - out << "get_osd_map(have " << start; - if (want) out << " want " << want; - out << ")"; - } - - void encode_payload() { - ::_encode(start, payload); - ::_encode(want, payload); - } - void decode_payload() { - int off = 0; - ::_decode(start, payload, off); - ::_decode(want, payload, off); - } -}; - -#endif diff --git a/branches/sage/crush/messages/MOSDMap.h b/branches/sage/crush/messages/MOSDMap.h deleted file mode 100644 index 525ed82ae5c29..0000000000000 --- a/branches/sage/crush/messages/MOSDMap.h +++ /dev/null @@ -1,71 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDGETMAPACK_H -#define __MOSDGETMAPACK_H - -#include "msg/Message.h" -#include "osd/OSDMap.h" - - -class MOSDMap : public Message { - public: - map maps; - map incremental_maps; - - epoch_t get_first() { - epoch_t e = 0; - map::iterator i = maps.begin(); - if (i != maps.end()) e = i->first; - i = incremental_maps.begin(); - if (i != incremental_maps.end() && - (e == 0 || i->first < e)) e = i->first; - return e; - } - epoch_t get_last() { - epoch_t e = 0; - map::reverse_iterator i = maps.rbegin(); - if (i != maps.rend()) e = i->first; - i = incremental_maps.rbegin(); - if (i != incremental_maps.rend() && - (e == 0 || i->first > e)) e = i->first; - return e; - } - - - MOSDMap() : Message(MSG_OSD_MAP) { } - MOSDMap(OSDMap *oc) : Message(MSG_OSD_MAP) { - oc->encode(maps[oc->get_epoch()]); - } - - - // marshalling - virtual void decode_payload() { - int off = 0; - ::_decode(maps, payload, off); - ::_decode(incremental_maps, payload, off); - } - virtual void encode_payload() { - ::_encode(maps, payload); - ::_encode(incremental_maps, payload); - } - - virtual char *get_type_name() { return "omap"; } - void print(ostream& out) { - out << "osd_map(" << get_first() << "," << get_last() << ")"; - } -}; - -#endif diff --git a/branches/sage/crush/messages/MOSDOp.h b/branches/sage/crush/messages/MOSDOp.h deleted file mode 100644 index 7ac401bd75a69..0000000000000 --- a/branches/sage/crush/messages/MOSDOp.h +++ /dev/null @@ -1,280 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDOP_H -#define __MOSDOP_H - -#include "msg/Message.h" -#include "osd/osd_types.h" - -/* - * OSD op - * - * oid - object id - * op - OSD_OP_DELETE, etc. - * - */ - -// osd client ops -#define OSD_OP_READ 1 -#define OSD_OP_STAT 2 - -#define OSD_OP_REPLICATE 3 -#define OSD_OP_UNREPLICATE 4 - -#define OSD_OP_WRNOOP 10 -#define OSD_OP_WRITE 11 -#define OSD_OP_DELETE 12 -#define OSD_OP_TRUNCATE 13 -#define OSD_OP_ZERO 14 - -#define OSD_OP_WRLOCK 20 -#define OSD_OP_WRUNLOCK 21 -#define OSD_OP_RDLOCK 22 -#define OSD_OP_RDUNLOCK 23 -#define OSD_OP_UPLOCK 24 -#define OSD_OP_DNLOCK 25 -#define OSD_OP_MININCLOCK 26 // minimum incarnation lock - -#define OSD_OP_PULL 30 -#define OSD_OP_PUSH 31 - -#define OSD_OP_BALANCEREADS 101 -#define OSD_OP_UNBALANCEREADS 102 - - - -class MOSDOp : public Message { -public: - static const char* get_opname(int op) { - switch (op) { - case OSD_OP_READ: return "read"; - case OSD_OP_STAT: return "stat"; - - case OSD_OP_WRNOOP: return "wrnoop"; - case OSD_OP_WRITE: return "write"; - case OSD_OP_ZERO: return "zero"; - case OSD_OP_DELETE: return "delete"; - case OSD_OP_TRUNCATE: return "truncate"; - case OSD_OP_WRLOCK: return "wrlock"; - case OSD_OP_WRUNLOCK: return "wrunlock"; - case OSD_OP_RDLOCK: return "rdlock"; - case OSD_OP_RDUNLOCK: return "rdunlock"; - case OSD_OP_UPLOCK: return "uplock"; - case OSD_OP_DNLOCK: return "dnlock"; - - case OSD_OP_MININCLOCK: return "mininclock"; - - case OSD_OP_BALANCEREADS: return "balance-reads"; - case OSD_OP_UNBALANCEREADS: return "unbalance-reads"; - - case OSD_OP_PULL: return "pull"; - case OSD_OP_PUSH: return "push"; - default: assert(0); - } - return 0; - } - -private: - struct st_ { - // who's asking? - entity_inst_t client; - osdreqid_t reqid; // minor weirdness: entity_name_t is in reqid_t too. 
- - // for replication - tid_t rep_tid; - - object_t oid; - objectrev_t rev; - ObjectLayout layout; - - epoch_t map_epoch; - - eversion_t pg_trim_to; // primary->replica: trim to here - - int32_t op; - off_t offset, length; - - eversion_t version; - eversion_t old_version; - - bool want_ack; - bool want_commit; - bool retry_attempt; - - int shed_count; - osd_peer_stat_t peer_stat; - } st; - - bufferlist data; - map attrset; - - - friend class MOSDOpReply; - -public: - const osdreqid_t& get_reqid() { return st.reqid; } - const tid_t get_client_tid() { return st.reqid.tid; } - int get_client_inc() { return st.reqid.inc; } - - const entity_name_t& get_client() { return st.client.name; } - const entity_inst_t& get_client_inst() { return st.client; } - void set_client_inst(const entity_inst_t& i) { st.client = i; } - - bool wants_reply() { - if (st.op < 100) return true; - return false; // no reply needed for primary-lock, -unlock. - } - - const tid_t get_rep_tid() { return st.rep_tid; } - void set_rep_tid(tid_t t) { st.rep_tid = t; } - - bool get_retry_attempt() const { return st.retry_attempt; } - void set_retry_attempt(bool a) { st.retry_attempt = a; } - - const object_t get_oid() { return st.oid; } - const pg_t get_pg() { return st.layout.pgid; } - const ObjectLayout& get_layout() { return st.layout; } - const epoch_t get_map_epoch() { return st.map_epoch; } - - //const int get_pg_role() { return st.pg_role; } // who am i asking for? 
- const eversion_t get_version() { return st.version; } - //const eversion_t get_old_version() { return st.old_version; } - - void set_rev(objectrev_t r) { st.rev = r; } - objectrev_t get_rev() { return st.rev; } - - const eversion_t get_pg_trim_to() { return st.pg_trim_to; } - void set_pg_trim_to(eversion_t v) { st.pg_trim_to = v; } - - const int get_op() { return st.op; } - void set_op(int o) { st.op = o; } - bool is_read() { - return st.op < 10; - } - - const off_t get_length() { return st.length; } - const off_t get_offset() { return st.offset; } - - map& get_attrset() { return attrset; } - void set_attrset(map &as) { attrset.swap(as); } - - const bool wants_ack() { return st.want_ack; } - const bool wants_commit() { return st.want_commit; } - - void set_peer_stat(const osd_peer_stat_t& stat) { st.peer_stat = stat; } - const osd_peer_stat_t& get_peer_stat() { return st.peer_stat; } - void inc_shed_count() { st.shed_count++; } - int get_shed_count() { return st.shed_count; } - - void set_data(bufferlist &d) { - data.claim(d); - } - bufferlist& get_data() { - return data; - } - off_t get_data_len() { return data.length(); } - - - MOSDOp(entity_inst_t asker, int inc, long tid, - object_t oid, ObjectLayout ol, epoch_t mapepoch, int op) : - Message(MSG_OSD_OP) { - memset(&st, 0, sizeof(st)); - this->st.client = asker; - this->st.reqid.name = asker.name; - this->st.reqid.inc = inc; - this->st.reqid.tid = tid; - - this->st.oid = oid; - this->st.layout = ol; - this->st.map_epoch = mapepoch; - this->st.op = op; - - this->st.rep_tid = 0; - - this->st.want_ack = true; - this->st.want_commit = true; - } - MOSDOp() {} - - //void set_pg_role(int r) { st.pg_role = r; } - //void set_rg_nrep(int n) { st.rg_nrep = n; } - - void set_layout(const ObjectLayout& l) { st.layout = l; } - - void set_length(off_t l) { st.length = l; } - void set_offset(off_t o) { st.offset = o; } - void set_version(eversion_t v) { st.version = v; } - void set_old_version(eversion_t ov) { st.old_version 
= ov; } - - void set_want_ack(bool b) { st.want_ack = b; } - void set_want_commit(bool b) { st.want_commit = b; } - - // marshalling - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - ::_decode(attrset, payload, off); - ::_decode(data, payload, off); - } - - static void add_payload_chunk_breaks(int from, int off, int len, - list& breaks) { - if (len > 0 && - len & 4095 == 0 && - off & 4095 == 0) { - // page-sized and aligned data? easy. - breaks.push_back(from); - } else if (len > 8192) { - // there is at least 1 full page in there. somewhere. - int p = 0; - - // leading partial page? - if (off & 4095 != 0) - p = 4096 - (off & 4095); - - // full page(s) - breaks.push_back(from + p); - p += (len - p) & (~4095); - - // tail bit? - if (p != len) - breaks.push_back(from + p); - } - } - - virtual void encode_payload() { - ::_encode(st, payload); - ::_encode(attrset, payload); - add_payload_chunk_breaks(payload.length() + 4, - st.offset, data.length(), - chunk_payload_at); - ::_encode(data, payload); - } - - virtual char *get_type_name() { return "osd_op"; } - void print(ostream& out) { - out << "osd_op(" << st.reqid - << " " << get_opname(st.op) - << " " << st.oid; - if (st.length) out << " " << st.offset << "~" << st.length; - if (st.retry_attempt) out << " RETRY"; - out << ")"; - } -}; - - -#endif diff --git a/branches/sage/crush/messages/MOSDOpReply.h b/branches/sage/crush/messages/MOSDOpReply.h deleted file mode 100644 index 3c567397e6a2d..0000000000000 --- a/branches/sage/crush/messages/MOSDOpReply.h +++ /dev/null @@ -1,164 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - 
* Foundation. See file COPYING. - * - */ - - -#ifndef __MOSDOPREPLY_H -#define __MOSDOPREPLY_H - -#include "msg/Message.h" - -#include "MOSDOp.h" -#include "osd/ObjectStore.h" - -/* - * OSD op reply - * - * oid - object id - * op - OSD_OP_DELETE, etc. - * - */ - -class MOSDOpReply : public Message { - struct { - // req - osdreqid_t reqid; - - tid_t rep_tid; - - object_t oid; - ObjectLayout layout; // pgid, etc. - - int32_t op; - - // reply - int32_t result; - bool commit; - off_t length, offset; - off_t object_size; - eversion_t version; - - eversion_t pg_complete_thru; - - epoch_t map_epoch; - - osd_peer_stat_t peer_stat; - } st; - - bufferlist data; - map attrset; - - public: - const osdreqid_t& get_reqid() { return st.reqid; } - long get_tid() { return st.reqid.tid; } - long get_rep_tid() { return st.rep_tid; } - object_t get_oid() { return st.oid; } - pg_t get_pg() { return st.layout.pgid; } - int get_op() { return st.op; } - bool get_commit() { return st.commit; } - - int get_result() { return st.result; } - off_t get_length() { return st.length; } - off_t get_offset() { return st.offset; } - off_t get_object_size() { return st.object_size; } - eversion_t get_version() { return st.version; } - map& get_attrset() { return attrset; } - - eversion_t get_pg_complete_thru() { return st.pg_complete_thru; } - void set_pg_complete_thru(eversion_t v) { st.pg_complete_thru = v; } - - void set_result(int r) { st.result = r; } - void set_length(off_t s) { st.length = s; } - void set_offset(off_t o) { st.offset = o; } - void set_object_size(off_t s) { st.object_size = s; } - void set_version(eversion_t v) { st.version = v; } - void set_attrset(map &as) { attrset = as; } - - void set_op(int op) { st.op = op; } - void set_rep_tid(tid_t t) { st.rep_tid = t; } - - void set_peer_stat(const osd_peer_stat_t& stat) { st.peer_stat = stat; } - const osd_peer_stat_t& get_peer_stat() { return st.peer_stat; } - - // data payload - void set_data(bufferlist &d) { - data.claim(d); - } - 
bufferlist& get_data() { - return data; - } - - // osdmap - epoch_t get_map_epoch() { return st.map_epoch; } - - -public: - MOSDOpReply(MOSDOp *req, int result, epoch_t e, bool commit) : - Message(MSG_OSD_OPREPLY) { - memset(&st, 0, sizeof(st)); - this->st.reqid = req->st.reqid; - this->st.op = req->st.op; - this->st.rep_tid = req->st.rep_tid; - - this->st.oid = req->st.oid; - this->st.layout = req->st.layout; - this->st.result = result; - this->st.commit = commit; - - this->st.length = req->st.length; // speculative... OSD should ensure these are correct - this->st.offset = req->st.offset; - this->st.version = req->st.version; - - this->st.map_epoch = e; - } - MOSDOpReply() {} - - - // marshalling - virtual void decode_payload() { - payload.copy(0, sizeof(st), (char*)&st); - payload.splice(0, sizeof(st)); - int off = 0; - ::_decode(attrset, payload, off); - ::_decode(data, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&st, sizeof(st)); - ::_encode(attrset, payload); - MOSDOp::add_payload_chunk_breaks(payload.length() + 4, - st.offset, data.length(), - chunk_payload_at); - ::_encode(data, payload); - } - - virtual char *get_type_name() { return "osd_op_reply"; } - - void print(ostream& out) { - out << "osd_op_reply(" << st.reqid - << " " << MOSDOp::get_opname(st.op) - << " " << st.oid; - if (st.length) out << " " << st.offset << "~" << st.length; - if (st.op >= 10) { - if (st.commit) - out << " commit"; - else - out << " ack"; - } - out << " = " << st.result; - out << ")"; - } - -}; - - -#endif diff --git a/branches/sage/crush/messages/MOSDPGQuery.h b/branches/sage/crush/messages/MOSDPGQuery.h deleted file mode 100644 index 70dbfdbb96fd7..0000000000000 --- a/branches/sage/crush/messages/MOSDPGQuery.h +++ /dev/null @@ -1,52 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is 
free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MOSDPGQUERY_H -#define __MOSDPGQUERY_H - -#include "msg/Message.h" - -/* - * PGQuery - query another OSD as to the contents of their PGs - */ - -class MOSDPGQuery : public Message { - version_t epoch; - - public: - version_t get_epoch() { return epoch; } - map pg_list; - - MOSDPGQuery() {} - MOSDPGQuery(epoch_t e, map& ls) : - Message(MSG_OSD_PG_QUERY), - epoch(e), pg_list(ls) { - } - - char *get_type_name() { return "PGq"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - ::_encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - ::_decode(pg_list, payload, off); - } -}; - -#endif diff --git a/branches/sage/crush/messages/MPing.h b/branches/sage/crush/messages/MPing.h deleted file mode 100644 index 6b569666ed377..0000000000000 --- a/branches/sage/crush/messages/MPing.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __MPING_H -#define __MPING_H - -#include "msg/Message.h" - - -class MPing : public Message { - public: - int seq; - MPing(int s) : Message(MSG_PING) { - seq = s; - } - MPing() : Message(MSG_PING) {} - - virtual void decode_payload() { - int off = 0; - payload.copy(0, sizeof(seq), (char*)&seq); - off += sizeof(seq); - } - virtual void encode_payload() { - payload.append((char*)&seq, sizeof(seq)); - } - - virtual char *get_type_name() { return "ping"; } -}; - -#endif diff --git a/branches/sage/crush/messages/MPingAck.h b/branches/sage/crush/messages/MPingAck.h deleted file mode 100644 index f8f32aee43ee0..0000000000000 --- a/branches/sage/crush/messages/MPingAck.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MPINGACK_H -#define __MPINGACK_H - -#include "MPing.h" - - -class MPingAck : public Message { - public: - int seq; - MPingAck() {} - MPingAck(MPing *p) : Message(MSG_PING_ACK) { - this->seq = p->seq; - } - - virtual void decode_payload() { - int off = 0; - payload.copy(0, sizeof(seq), (char*)&seq); - off += sizeof(seq); - } - virtual void encode_payload() { - payload.append((char*)&seq, sizeof(seq)); - } - - virtual char *get_type_name() { return "pinga"; } -}; - -#endif diff --git a/branches/sage/crush/messages/MStatfs.h b/branches/sage/crush/messages/MStatfs.h deleted file mode 100644 index 66e5847206a7b..0000000000000 --- a/branches/sage/crush/messages/MStatfs.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MSTATFS_H -#define __MSTATFS_H - -#include /* or */ - -class MStatfs : public Message { -public: - tid_t tid; - - MStatfs() : Message(MSG_STATFS) {} - MStatfs(tid_t t) : Message(MSG_STATFS), tid(t) {} - - char *get_type_name() { return "statfs"; } - void print(ostream& out) { - out << "statfs(" << tid << ")"; - } - - void encode_payload() { - ::_encode(tid, payload); - } - void decode_payload() { - int off = 0; - ::_decode(tid, payload, off); - } -}; - -#endif diff --git a/branches/sage/crush/messages/MStatfsReply.h b/branches/sage/crush/messages/MStatfsReply.h deleted file mode 100644 index f8e21ddcc2b31..0000000000000 --- a/branches/sage/crush/messages/MStatfsReply.h +++ /dev/null @@ -1,45 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MSTATFSREPLY_H -#define __MSTATFSREPLY_H - -#include /* or */ - -class MStatfsReply : public Message { -public: - tid_t tid; - struct statvfs stfs; - - MStatfsReply() : Message(MSG_STATFS_REPLY) {} - MStatfsReply(tid_t t) : Message(MSG_STATFS_REPLY), tid(t) {} - - char *get_type_name() { return "statfs_reply"; } - void print(ostream& out) { - out << "statfs_reply(" << tid << ")"; - } - - void encode_payload() { - ::_encode(tid, payload); - ::_encode(stfs, payload); - } - void decode_payload() { - int off = 0; - ::_decode(tid, payload, off); - ::_decode(stfs, payload, off); - } -}; - -#endif diff --git a/branches/sage/crush/mkmonmap.cc b/branches/sage/crush/mkmonmap.cc deleted file mode 100644 index 0a80e93c40bd2..0000000000000 --- a/branches/sage/crush/mkmonmap.cc +++ /dev/null @@ -1,68 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include -#include - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" - - - - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - MonMap monmap; - - char *outfn = ".ceph_monmap"; - - for (unsigned i=0; i= 0); - - return 0; -} diff --git a/branches/sage/crush/mon/ClientMonitor.cc b/branches/sage/crush/mon/ClientMonitor.cc deleted file mode 100644 index b7ac275b0afca..0000000000000 --- a/branches/sage/crush/mon/ClientMonitor.cc +++ /dev/null @@ -1,256 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "ClientMonitor.h" -#include "Monitor.h" -#include "MDSMonitor.h" -#include "OSDMonitor.h" -#include "MonitorStore.h" - -#include "messages/MClientMount.h" -#include "messages/MClientUnmount.h" - -#include "common/Timer.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".client v" << client_map.version << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? 
(const char*)"(peon)":(const char*)"(?\?)"))) << ".client v" << client_map.version << " " - - - -bool ClientMonitor::update_from_paxos() -{ - assert(paxos->is_active()); - - version_t paxosv = paxos->get_version(); - if (paxosv == client_map.version) return true; - assert(paxosv >= client_map.version); - - dout(10) << "update_from_paxos paxosv " << paxosv - << ", my v " << client_map.version << dendl; - - - if (client_map.version == 0 && paxosv > 1 && - mon->store->exists_bl_ss("clientmap","latest")) { - // starting up: load latest - dout(7) << "update_from_paxos startup: loading latest full clientmap" << dendl; - bufferlist bl; - mon->store->get_bl_ss(bl, "clientmap", "latest"); - int off = 0; - client_map._decode(bl, off); - } - - // walk through incrementals - while (paxosv > client_map.version) { - bufferlist bl; - bool success = paxos->read(client_map.version+1, bl); - if (success) { - dout(7) << "update_from_paxos applying incremental " << client_map.version+1 << dendl; - Incremental inc; - int off = 0; - inc._decode(bl, off); - client_map.apply_incremental(inc); - - dout(1) << client_map.client_addr.size() << " clients (+" - << inc.mount.size() << " -" << inc.unmount.size() << ")" - << dendl; - - } else { - dout(7) << "update_from_paxos couldn't read incremental " << client_map.version+1 << dendl; - return false; - } - } - - // save latest - bufferlist bl; - client_map._encode(bl); - mon->store->put_bl_ss(bl, "clientmap", "latest"); - - return true; -} - -void ClientMonitor::create_pending() -{ - assert(mon->is_leader()); - pending_inc = Incremental(); - pending_inc.version = client_map.version + 1; - pending_inc.next_client = client_map.next_client; - dout(10) << "create_pending v " << pending_inc.version - << ", next is " << pending_inc.next_client - << dendl; -} - -void ClientMonitor::create_initial() -{ - dout(1) << "create_initial -- creating initial map" << dendl; -} - -void ClientMonitor::committed() -{ - -} - - -void 
ClientMonitor::encode_pending(bufferlist &bl) -{ - assert(mon->is_leader()); - dout(10) << "encode_pending v " << pending_inc.version - << ", next is " << pending_inc.next_client - << dendl; - assert(paxos->get_version() + 1 == pending_inc.version); - pending_inc._encode(bl); -} - - -// ------- - - -bool ClientMonitor::preprocess_query(Message *m) -{ - dout(10) << "preprocess_query " << *m << " from " << m->get_source_inst() << dendl; - - switch (m->get_type()) { - case MSG_CLIENT_MOUNT: - { - // already mounted? - MClientMount *mount = (MClientMount*)m; - entity_addr_t addr = m->get_source_addr(); - pair addrinst(addr, mount->instance); - if (client_map.addr_client.count(addrinst)) { - int client = client_map.addr_client[addrinst]; - dout(7) << " client" << client << " already mounted" << dendl; - _mounted(client, (MClientMount*)m); - return true; - } - } - return false; - - case MSG_CLIENT_UNMOUNT: - { - // already unmounted? - int client = m->get_source().num(); - if (client_map.client_addr.count(client) == 0) { - dout(7) << " client" << client << " not mounted" << dendl; - _unmounted((MClientUnmount*)m); - return true; - } - } - return false; - - - default: - assert(0); - delete m; - return true; - } -} - -bool ClientMonitor::prepare_update(Message *m) -{ - dout(10) << "prepare_update " << *m << " from " << m->get_source_inst() << dendl; - - switch (m->get_type()) { - case MSG_CLIENT_MOUNT: - { - MClientMount *mount = (MClientMount*)m; - pair addrinst(mount->addr, mount->instance); - int client = -1; - if (mount->get_source().is_client()) - client = mount->get_source().num(); - - // choose a client id - if (client < 0) { - client = pending_inc.next_client; - dout(10) << "mount: assigned client" << client << " to " << mount->addr << dendl; - } else { - dout(10) << "mount: client" << client << " requested by " - << mount->addr << "i" << mount->instance - << dendl; - if (client_map.client_addr.count(client)) { - assert(client_map.client_addr[client] != addrinst); 
- dout(0) << "mount: WARNING: client" << client << " requested by " - << mount->addr << "." << mount->instance - << ", which used to be " - << client_map.client_addr[client].first << "i" << client_map.client_addr[client].second - << dendl; - } - } - - pending_inc.add_mount(client, mount->addr, mount->instance); - paxos->wait_for_commit(new C_Mounted(this, client, mount)); - } - return true; - - case MSG_CLIENT_UNMOUNT: - { - MClientUnmount *unmount = (MClientUnmount*)m; - assert(unmount->inst.name.is_client()); - int client = unmount->inst.name.num(); - - assert(client_map.client_addr.count(client)); - - pending_inc.add_unmount(client); - paxos->wait_for_commit(new C_Unmounted(this, unmount)); - } - return true; - - default: - assert(0); - delete m; - return false; - } - -} - - -// MOUNT - - -void ClientMonitor::_mounted(int client, MClientMount *m) -{ - entity_inst_t to; - to.addr = m->addr; - to.name = entity_name_t::CLIENT(client); - - dout(10) << "_mounted client" << client << " at " << to << dendl; - - // reply with latest mds, osd maps - mon->mdsmon->send_latest(to); - mon->osdmon->send_latest(to); - - delete m; -} - -void ClientMonitor::_unmounted(MClientUnmount *m) -{ - dout(10) << "_unmounted " << m->inst << dendl; - - // reply with (same) unmount message - mon->messenger->send_message(m, m->inst); - - // auto-shutdown? 
- // (hack for fakesyn/newsyn, mostly) - if (mon->is_leader() && - client_map.version > 1 && - client_map.client_addr.empty() && - g_conf.mon_stop_on_last_unmount && - !mon->is_stopping()) { - dout(1) << "last client unmounted" << dendl; - mon->do_stop(); - } -} - - diff --git a/branches/sage/crush/mon/ClientMonitor.h b/branches/sage/crush/mon/ClientMonitor.h deleted file mode 100644 index f36ee9f7c18bd..0000000000000 --- a/branches/sage/crush/mon/ClientMonitor.h +++ /dev/null @@ -1,178 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __CLIENTMONITOR_H -#define __CLIENTMONITOR_H - -#include -#include -using namespace std; - -#include "include/types.h" -#include "msg/Messenger.h" - -#include "mds/MDSMap.h" - -#include "PaxosService.h" - -class Monitor; -class Paxos; -class MClientMount; -class MClientUnmount; - -class ClientMonitor : public PaxosService { -public: - - struct Incremental { - version_t version; - uint32_t next_client; - map > mount; - set unmount; - - Incremental() : version(0), next_client() {} - - bool is_empty() { return mount.empty() && unmount.empty(); } - void add_mount(uint32_t client, entity_addr_t addr, int instance) { - next_client = MAX(next_client, client+1); - mount[client] = pair(addr, instance); - } - void add_unmount(uint32_t client) { - assert(client < next_client); - if (mount.count(client)) - mount.erase(client); - else - unmount.insert(client); - } - - void _encode(bufferlist &bl) { - ::_encode(version, bl); - ::_encode(next_client, bl); - ::_encode(mount, bl); - ::_encode(unmount, bl); - } - void _decode(bufferlist &bl, int& off) { - 
::_decode(version, bl, off); - ::_decode(next_client, bl, off); - ::_decode(mount, bl, off); - ::_decode(unmount, bl, off); - } - }; - - struct Map { - version_t version; - uint32_t next_client; - map > client_addr; - map,uint32_t> addr_client; - - Map() : version(0), next_client(0) {} - - void reverse() { - addr_client.clear(); - for (map >::iterator p = client_addr.begin(); - p != client_addr.end(); - ++p) { - addr_client[p->second] = p->first; - } - } - void apply_incremental(Incremental &inc) { - assert(inc.version == version+1); - version = inc.version; - next_client = inc.next_client; - for (map >::iterator p = inc.mount.begin(); - p != inc.mount.end(); - ++p) { - client_addr[p->first] = p->second; - addr_client[p->second] = p->first; - } - - for (set::iterator p = inc.unmount.begin(); - p != inc.unmount.end(); - ++p) { - assert(client_addr.count(*p)); - addr_client.erase(client_addr[*p]); - client_addr.erase(*p); - } - } - - void _encode(bufferlist &bl) { - ::_encode(version, bl); - ::_encode(next_client, bl); - ::_encode(client_addr, bl); - } - void _decode(bufferlist &bl, int& off) { - ::_decode(version, bl, off); - ::_decode(next_client, bl, off); - ::_decode(client_addr, bl, off); - reverse(); - } - }; - - class C_Mounted : public Context { - ClientMonitor *cmon; - int client; - MClientMount *m; - public: - C_Mounted(ClientMonitor *cm, int c, MClientMount *m_) : - cmon(cm), client(c), m(m_) {} - void finish(int r) { - if (r >= 0) - cmon->_mounted(client, m); - else - cmon->dispatch((Message*)m); - } - }; - - class C_Unmounted : public Context { - ClientMonitor *cmon; - MClientUnmount *m; - public: - C_Unmounted(ClientMonitor *cm, MClientUnmount *m_) : - cmon(cm), m(m_) {} - void finish(int r) { - if (r >= 0) - cmon->_unmounted(m); - else - cmon->dispatch((Message*)m); - } - }; - - -private: - Map client_map; - - // leader - Incremental pending_inc; - - void create_initial(); - bool update_from_paxos(); - void create_pending(); // prepare a new pending - 
void encode_pending(bufferlist &bl); // propose pending update to peers - - void committed(); - - void _mounted(int c, MClientMount *m); - void _unmounted(MClientUnmount *m); - - bool preprocess_query(Message *m); // true if processed. - bool prepare_update(Message *m); - - - public: - ClientMonitor(Monitor *mn, Paxos *p) : PaxosService(mn, p) { } - - //void tick(); // check state, take actions - -}; - -#endif diff --git a/branches/sage/crush/mon/Elector.cc b/branches/sage/crush/mon/Elector.cc deleted file mode 100644 index 4a09b58ab5073..0000000000000 --- a/branches/sage/crush/mon/Elector.cc +++ /dev/null @@ -1,293 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "Elector.h" -#include "Monitor.h" - -#include "common/Timer.h" -#include "MonitorStore.h" -#include "messages/MMonElection.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".elector(" << epoch << ") " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? 
(const char*)"(peon)":(const char*)"(?\?)"))) << ".elector(" << epoch << ") " - - -void Elector::init() -{ - epoch = mon->store->get_int("mon_epoch"); - if (!epoch) - epoch = 1; - dout(1) << "init, last seen epoch " << epoch << dendl; -} - -void Elector::shutdown() -{ - if (expire_event) - mon->timer.cancel_event(expire_event); -} - -void Elector::bump_epoch(epoch_t e) -{ - dout(10) << "bump_epoch " << epoch << " to " << e << dendl; - assert(epoch < e); - epoch = e; - mon->store->put_int(epoch, "mon_epoch"); - - // clear up some state - electing_me = false; - acked_me.clear(); - leader_acked = -1; -} - - -void Elector::start() -{ - dout(5) << "start -- can i be leader?" << dendl; - - // start by trying to elect me - if (epoch % 2 == 0) - bump_epoch(epoch+1); // odd == election cycle - start_stamp = g_clock.now(); - electing_me = true; - acked_me.insert(whoami); - - // bcast to everyone else - for (int i=0; imonmap->num_mon; ++i) { - if (i == whoami) continue; - mon->messenger->send_message(new MMonElection(MMonElection::OP_PROPOSE, epoch), - mon->monmap->get_inst(i)); - } - - reset_timer(); -} - -void Elector::defer(int who) -{ - dout(5) << "defer to " << who << dendl; - - if (electing_me) { - // drop out - acked_me.clear(); - electing_me = false; - } - - // ack them - leader_acked = who; - ack_stamp = g_clock.now(); - mon->messenger->send_message(new MMonElection(MMonElection::OP_ACK, epoch), - mon->monmap->get_inst(who)); - - // set a timer - reset_timer(1.0); // give the leader some extra time to declare victory -} - - -void Elector::reset_timer(double plus) -{ - // set the timer - cancel_timer(); - expire_event = new C_ElectionExpire(this); - mon->timer.add_event_after(g_conf.mon_lease + plus, - expire_event); -} - - -void Elector::cancel_timer() -{ - if (expire_event) { - mon->timer.cancel_event(expire_event); - expire_event = 0; - } -} - -void Elector::expire() -{ - dout(5) << "election timer expired" << dendl; - - // did i win? 
- if (electing_me && - acked_me.size() > (unsigned)(mon->monmap->num_mon / 2)) { - // i win - victory(); - } else { - // whoever i deferred to didn't declare victory quickly enough. - start(); - } -} - - -void Elector::victory() -{ - leader_acked = -1; - electing_me = false; - set quorum = acked_me; - - cancel_timer(); - - assert(epoch % 2 == 1); // election - bump_epoch(epoch+1); // is over! - - // tell everyone - for (set::iterator p = quorum.begin(); - p != quorum.end(); - ++p) { - if (*p == whoami) continue; - mon->messenger->send_message(new MMonElection(MMonElection::OP_VICTORY, epoch), - mon->monmap->get_inst(*p)); - } - - // tell monitor - mon->win_election(epoch, quorum); -} - - -void Elector::handle_propose(MMonElection *m) -{ - dout(5) << "handle_propose from " << m->get_source() << dendl; - int from = m->get_source().num(); - - assert(m->epoch % 2 == 1); // election - if (m->epoch > epoch) { - bump_epoch(m->epoch); - } - else if (m->epoch < epoch && // got an "old" propose, - epoch % 2 == 0 && // in a non-election cycle - mon->quorum.count(from) == 0) { // from someone outside the quorum - // a mon just started up, call a new election so they can rejoin! - dout(5) << " got propose from old epoch, " << m->get_source() << " must have just started" << dendl; - start(); - } - - if (whoami < from) { - // i would win over them. - if (leader_acked >= 0) { // we already acked someone - assert(leader_acked < from); // and they still win, of course - dout(5) << "no, we already acked " << leader_acked << dendl; - } else { - // wait, i should win! - if (!electing_me) - start(); - } - } else { - // they would win over me - if (leader_acked < 0 || // haven't acked anyone yet, or - leader_acked > from || // they would win over who you did ack, or - leader_acked == from) { // this is the guy we're already deferring to - defer(from); - } else { - // ignore them! 
- dout(5) << "no, we already acked " << leader_acked << dendl; - } - } - - delete m; -} - -void Elector::handle_ack(MMonElection *m) -{ - dout(5) << "handle_ack from " << m->get_source() << dendl; - int from = m->get_source().num(); - - assert(m->epoch % 2 == 1); // election - if (m->epoch > epoch) { - dout(5) << "woah, that's a newer epoch, i must have rebooted. bumping and re-starting!" << dendl; - bump_epoch(m->epoch); - start(); - delete m; - return; - } - assert(m->epoch == epoch); - - if (electing_me) { - // thanks - acked_me.insert(from); - dout(5) << " so far i have " << acked_me << dendl; - - // is that _everyone_? - if (acked_me.size() == (unsigned)mon->monmap->num_mon) { - // if yes, shortcut to election finish - victory(); - } - } else { - // ignore, i'm deferring already. - assert(leader_acked >= 0); - } - - delete m; -} - - -void Elector::handle_victory(MMonElection *m) -{ - dout(5) << "handle_victory from " << m->get_source() << dendl; - int from = m->get_source().num(); - - assert(from < whoami); - assert(m->epoch % 2 == 0); - assert(m->epoch == epoch + 1); // i should have seen this election if i'm getting the victory. 
- bump_epoch(m->epoch); - - // they win - mon->lose_election(epoch, from); - - // cancel my timer - cancel_timer(); -} - - - - -void Elector::dispatch(Message *m) -{ - switch (m->get_type()) { - - case MSG_MON_ELECTION: - { - MMonElection *em = (MMonElection*)m; - - switch (em->op) { - case MMonElection::OP_PROPOSE: - handle_propose(em); - return; - } - - if (em->epoch < epoch) { - dout(5) << "old epoch, dropping" << dendl; - delete em; - break; - } - - switch (em->op) { - case MMonElection::OP_ACK: - handle_ack(em); - return; - case MMonElection::OP_VICTORY: - handle_victory(em); - return; - default: - assert(0); - } - } - break; - - default: - assert(0); - } -} - - - - diff --git a/branches/sage/crush/mon/MDSMonitor.cc b/branches/sage/crush/mon/MDSMonitor.cc deleted file mode 100644 index 645f029f6b203..0000000000000 --- a/branches/sage/crush/mon/MDSMonitor.cc +++ /dev/null @@ -1,625 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "MDSMonitor.h" -#include "Monitor.h" -#include "MonitorStore.h" -#include "OSDMonitor.h" - -#include "messages/MMDSMap.h" -#include "messages/MMDSGetMap.h" -#include "messages/MMDSBeacon.h" - -#include "messages/MMonCommand.h" -#include "messages/MMonCommandAck.h" - -#include "messages/MGenericMessage.h" - - -#include "common/Timer.h" - -#include - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? 
(const char*)"(peon)":(const char*)"(?\?)"))) << ".mds e" << mdsmap.get_epoch() << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".mds e" << mdsmap.get_epoch() << " " - - - -// my methods - -void MDSMonitor::print_map(MDSMap &m) -{ - dout(7) << "print_map epoch " << m.get_epoch() << " target_num " << m.target_num << dendl; - entity_inst_t blank; - set all; - m.get_mds_set(all); - for (set::iterator p = all.begin(); - p != all.end(); - ++p) { - dout(7) << " mds" << *p << "." << m.mds_inc[*p] - << " : " << MDSMap::get_state_name(m.get_state(*p)) - << " : " << (m.have_inst(*p) ? m.get_inst(*p) : blank) - << dendl; - } -} - - - -// service methods - -void MDSMonitor::create_initial() -{ - dout(10) << "create_initial" << dendl; - pending_mdsmap.target_num = g_conf.num_mds; - pending_mdsmap.created = g_clock.now(); - print_map(pending_mdsmap); -} - -bool MDSMonitor::update_from_paxos() -{ - assert(paxos->is_active()); - - version_t paxosv = paxos->get_version(); - if (paxosv == mdsmap.epoch) return true; - assert(paxosv >= mdsmap.epoch); - - dout(10) << "update_from_paxos paxosv " << paxosv - << ", my e " << mdsmap.epoch << dendl; - - // read and decode - mdsmap_bl.clear(); - bool success = paxos->read(paxosv, mdsmap_bl); - assert(success); - dout(10) << "update_from_paxos got " << paxosv << dendl; - mdsmap.decode(mdsmap_bl); - - // new map - dout(7) << "new map:" << dendl; - print_map(mdsmap); - - // bcast map to mds, waiters - if (mon->is_leader()) - bcast_latest_mds(); - send_to_waiting(); - - return true; -} - -void MDSMonitor::create_pending() -{ - pending_mdsmap = mdsmap; - pending_mdsmap.epoch++; - dout(10) << "create_pending e" << pending_mdsmap.epoch << dendl; -} - -void MDSMonitor::encode_pending(bufferlist &bl) -{ - 
dout(10) << "encode_pending e" << pending_mdsmap.epoch << dendl; - - //print_map(pending_mdsmap); - - // apply to paxos - assert(paxos->get_version() + 1 == pending_mdsmap.epoch); - pending_mdsmap.encode(bl); -} - - -bool MDSMonitor::preprocess_query(Message *m) -{ - dout(10) << "preprocess_query " << *m << " from " << m->get_source_inst() << dendl; - - switch (m->get_type()) { - - case MSG_MDS_BEACON: - return preprocess_beacon((MMDSBeacon*)m); - - case MSG_MDS_GETMAP: - send_full(m->get_source_inst()); - return true; - - case MSG_MON_COMMAND: - return false; - - default: - assert(0); - delete m; - return true; - } -} - - -bool MDSMonitor::preprocess_beacon(MMDSBeacon *m) -{ - dout(12) << "preprocess_beacon " << *m - << " from " << m->get_mds_inst() - << dendl; - - // fw to leader? - if (!mon->is_leader()) { - dout(10) << "fw to leader" << dendl; - mon->messenger->send_message(m, mon->monmap->get_inst(mon->get_leader())); - return true; - } - - // let's see. - int from = m->get_mds_inst().name.num(); - int state = m->get_state(); - version_t seq = m->get_seq(); - - // can i handle this query without a map update? - - // boot? - if (state == MDSMap::STATE_BOOT) { - // already booted? - int already = mdsmap.get_addr_rank(m->get_mds_inst().addr); - if (already < 0) - return false; // need to update map - - // already booted. just reply to beacon, as per usual. - from = already; - } - - // reply to beacon - if (mdsmap.mds_state_seq[from] > seq) { - dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << dendl; - delete m; - return true; - } - - // reply to beacon? - if (state != MDSMap::STATE_STOPPED) { - last_beacon[from] = g_clock.now(); // note time - mon->messenger->send_message(new MMDSBeacon(m->get_mds_inst(), mdsmap.get_epoch(), state, seq), - m->get_mds_inst()); - } - - // is there a state change here? 
- if (mdsmap.mds_state.count(from) == 0) { - if (state == MDSMap::STATE_BOOT) - return false; // need to add to map - dout(1) << "mds_beacon " << *m << " announcing non-boot state, ignoring" << dendl; - } else if (mdsmap.mds_state[from] != state) { - if (mdsmap.get_epoch() == m->get_last_epoch_seen()) - return false; // need to update map - dout(10) << "mds_beacon " << *m << " ignoring requested state, because mds hasn't seen latest map" << dendl; - } - - // we're done. - delete m; - return true; -} - - -bool MDSMonitor::prepare_update(Message *m) -{ - dout(7) << "prepare_update " << *m << dendl; - - switch (m->get_type()) { - - case MSG_MDS_BEACON: - return handle_beacon((MMDSBeacon*)m); - - case MSG_MON_COMMAND: - return handle_command((MMonCommand*)m); - - default: - assert(0); - delete m; - } - - return true; -} - - - -bool MDSMonitor::handle_beacon(MMDSBeacon *m) -{ - // -- this is an update -- - dout(12) << "handle_beacon " << *m - << " from " << m->get_mds_inst() - << dendl; - int from = m->get_mds_inst().name.num(); - int state = m->get_state(); - version_t seq = m->get_seq(); - - assert(state != mdsmap.get_state(from)); - - // boot? - if (state == MDSMap::STATE_BOOT) { - // assign a name. - if (from >= 0) { - // wants to be (or already is) a specific MDS. 
- if (!g_conf.mon_allow_mds_bully && - (!mdsmap.have_inst(from) || mdsmap.get_inst(from) != m->get_mds_inst())) { - dout(10) << "mds_beacon boot: mds" << from << " is someone else" << dendl; - from = -1; - } else { - switch (mdsmap.get_state(from)) { - case MDSMap::STATE_STOPPED: - case MDSMap::STATE_STARTING: - case MDSMap::STATE_STANDBY: - state = MDSMap::STATE_STARTING; - break; - case MDSMap::STATE_DNE: - case MDSMap::STATE_CREATING: - state = MDSMap::STATE_CREATING; - break; - case MDSMap::STATE_FAILED: - default: - state = MDSMap::STATE_REPLAY; - break; - } - dout(10) << "mds_beacon boot: mds" << from - << " was " << MDSMap::get_state_name(mdsmap.get_state(from)) - << ", " << MDSMap::get_state_name(state) - << dendl; - } - } - if (from < 0) { - from = pending_mdsmap.get_addr_rank(m->get_mds_inst().addr); - if (from >= 0) { - state = pending_mdsmap.mds_state[from]; - dout(10) << "mds_beacon boot: already pending mds" << from - << " " << MDSMap::get_state_name(state) << dendl; - delete m; - return false; - } - } - if (from < 0) { - // pick a failed mds? - set failed; - pending_mdsmap.get_failed_mds_set(failed); - if (!failed.empty()) { - from = *failed.begin(); - dout(10) << "mds_beacon boot: assigned failed mds" << from << dendl; - state = MDSMap::STATE_REPLAY; - } - } - if (from < 0) { - // ok, just pick any unused mds id. - for (from=0; ; ++from) { - if (pending_mdsmap.is_dne(from)) { - dout(10) << "mds_beacon boot: assigned new mds" << from << dendl; - state = MDSMap::STATE_CREATING; - break; - } else if (pending_mdsmap.is_stopped(from)) { - dout(10) << "mds_beacon boot: assigned stopped mds" << from << dendl; - state = MDSMap::STATE_STARTING; - break; - } - } - } - - assert(state == MDSMap::STATE_CREATING || - state == MDSMap::STATE_STARTING || - state == MDSMap::STATE_REPLAY); - - // put it in the map. 
- pending_mdsmap.mds_inst[from].addr = m->get_mds_inst().addr; - pending_mdsmap.mds_inst[from].name = entity_name_t::MDS(from); - pending_mdsmap.mds_inc[from]++; - - // reset the beacon timer - last_beacon[from] = g_clock.now(); - - // if starting|creating and degraded|full, go to standby - if ((state == MDSMap::STATE_CREATING || state == MDSMap::STATE_STARTING) && - (pending_mdsmap.would_be_overfull_with(from) || - pending_mdsmap.is_degraded())) { - dout(10) << "mds_beacon cluster full, mds" << from << " will be standby" << dendl; - state = MDSMap::STATE_STANDBY; - } - } - - // created? - if (state == MDSMap::STATE_ACTIVE && - mdsmap.is_creating(from)) { - pending_mdsmap.mds_created.insert(from); - dout(10) << "mds_beacon created mds" << from << dendl; - } - - // update the map - dout(10) << "mds_beacon mds" << from << " " << MDSMap::get_state_name(mdsmap.mds_state[from]) - << " -> " << MDSMap::get_state_name(state) - << dendl; - - // has someone join or leave the cluster? - if (state == MDSMap::STATE_REPLAY || - state == MDSMap::STATE_ACTIVE || - state == MDSMap::STATE_STOPPED) { - pending_mdsmap.same_in_set_since = pending_mdsmap.epoch; - } - - // change the state - pending_mdsmap.mds_state[from] = state; - if (pending_mdsmap.is_up(from)) - pending_mdsmap.mds_state_seq[from] = seq; - else - pending_mdsmap.mds_state_seq.erase(from); - - dout(7) << "pending map now:" << dendl; - print_map(pending_mdsmap); - - paxos->wait_for_commit(new C_Updated(this, from, m)); - - return true; -} - -bool MDSMonitor::should_propose(double& delay) -{ - delay = 0.0; - return true; -} - -void MDSMonitor::_updated(int from, MMDSBeacon *m) -{ - if (m->get_state() == MDSMap::STATE_BOOT) { - dout(10) << "_updated (booted) mds" << from << " " << *m << dendl; - mon->osdmon->send_latest(mdsmap.get_inst(from)); - } else { - dout(10) << "_updated mds" << from << " " << *m << dendl; - } - if (m->get_state() == MDSMap::STATE_STOPPED) { - // send the map manually (they're out of the map, so 
they won't get it automatic) - send_latest(m->get_mds_inst()); - } - - delete m; -} - - -void MDSMonitor::committed() -{ - // check for failed - set standby; - set failed; - mdsmap.get_mds_set(standby, MDSMap::STATE_STANDBY); - mdsmap.get_failed_mds_set(failed); - - if (!standby.empty() && !failed.empty()) { - while (!standby.empty() && !failed.empty()) { - int f = *failed.begin(); - int t = *standby.begin(); - failed.erase(failed.begin()); - standby.erase(standby.begin()); - - dout(0) << "mds" << t << " taking over for mds" << f << dendl; - - // send new map to old inst/name - waiting_for_map.push_back(mdsmap.mds_inst[t]); - - pending_mdsmap.mds_inst[f] = mdsmap.mds_inst[t]; - pending_mdsmap.mds_inst[f].name = entity_name_t::MDS(f); - pending_mdsmap.mds_inc[f]++; - pending_mdsmap.mds_state[f] = MDSMap::STATE_REPLAY; - pending_mdsmap.mds_state_seq[f] = mdsmap.mds_state_seq[t]; - - pending_mdsmap.mds_inst.erase(t); - pending_mdsmap.mds_state.erase(t); - pending_mdsmap.mds_state_seq.erase(t); - - last_beacon[f] = last_beacon[t]; - last_beacon.erase(t); - } - - dout(7) << "pending map now:" << dendl; - print_map(pending_mdsmap); - - propose_pending(); - } - - // hackish: did all mds's shut down? 
- if (mon->is_leader() && - g_conf.mon_stop_with_last_mds && - mdsmap.get_epoch() > 1 && - mdsmap.is_stopped()) - mon->messenger->send_message(new MGenericMessage(MSG_SHUTDOWN), - mon->monmap->get_inst(mon->whoami)); -} - - -bool MDSMonitor::handle_command(MMonCommand *m) -{ - int r = -EINVAL; - stringstream ss; - - if (m->cmd.size() > 1) { - if (m->cmd[1] == "stop" && m->cmd.size() > 2) { - int who = atoi(m->cmd[2].c_str()); - if (mdsmap.is_active(who)) { - r = 0; - ss << "telling mds" << who << " to stop"; - pending_mdsmap.mds_state[who] = MDSMap::STATE_STOPPING; - } else { - r = -EEXIST; - ss << "mds" << who << " not active (" << mdsmap.get_state_name(mdsmap.get_state(who)) << ")"; - } - } - else if (m->cmd[1] == "set_target_num" && m->cmd.size() > 2) { - pending_mdsmap.target_num = atoi(m->cmd[2].c_str()); - r = 0; - ss << "target_num = " << pending_mdsmap.target_num; - } - } - if (r == -EINVAL) { - ss << "unrecognized command"; - } - - // reply - string rs; - getline(ss,rs); - mon->messenger->send_message(new MMonCommandAck(r, rs), m->get_source_inst()); - delete m; - return r >= 0; -} - - - -void MDSMonitor::bcast_latest_mds() -{ - dout(10) << "bcast_latest_mds " << mdsmap.get_epoch() << dendl; - - // tell mds - set up; - mdsmap.get_up_mds_set(up); - for (set::iterator p = up.begin(); - p != up.end(); - p++) - send_full(mdsmap.get_inst(*p)); -} - -void MDSMonitor::send_full(entity_inst_t dest) -{ - dout(11) << "send_full to " << dest << dendl; - mon->messenger->send_message(new MMDSMap(&mdsmap), dest); -} - -void MDSMonitor::send_to_waiting() -{ - dout(10) << "send_to_waiting " << mdsmap.get_epoch() << dendl; - for (list::iterator i = waiting_for_map.begin(); - i != waiting_for_map.end(); - i++) - send_full(*i); - waiting_for_map.clear(); -} - -void MDSMonitor::send_latest(entity_inst_t dest) -{ - if (paxos->is_readable()) - send_full(dest); - else - waiting_for_map.push_back(dest); -} - - -void MDSMonitor::tick() -{ - // make sure mds's are still alive - 
utime_t now = g_clock.now(); - - // ...if i am an active leader - if (!mon->is_leader()) return; - if (!paxos->is_active()) return; - - if (now > g_conf.mds_beacon_grace) { - utime_t cutoff = now; - cutoff -= g_conf.mds_beacon_grace; - - bool changed = false; - - set up; - mdsmap.get_up_mds_set(up); - - for (set::iterator p = up.begin(); - p != up.end(); - ++p) { - if (last_beacon.count(*p)) { - if (last_beacon[*p] < cutoff) { - - // failure! - int newstate; - switch (mdsmap.get_state(*p)) { - case MDSMap::STATE_STANDBY: - if (mdsmap.has_created(*p)) - newstate = MDSMap::STATE_STOPPED; - else - newstate = MDSMap::STATE_DNE; - break; - - case MDSMap::STATE_CREATING: - // didn't finish creating - newstate = MDSMap::STATE_DNE; - break; - - case MDSMap::STATE_STARTING: - newstate = MDSMap::STATE_STOPPED; - break; - - case MDSMap::STATE_REPLAY: - case MDSMap::STATE_RESOLVE: - case MDSMap::STATE_RECONNECT: - case MDSMap::STATE_REJOIN: - case MDSMap::STATE_ACTIVE: - case MDSMap::STATE_STOPPING: - newstate = MDSMap::STATE_FAILED; - break; - - default: - assert(0); - } - - dout(10) << "no beacon from mds" << *p << " since " << last_beacon[*p] - << ", marking " << mdsmap.get_state_name(newstate) - << dendl; - - // update map - pending_mdsmap.mds_state[*p] = newstate; - pending_mdsmap.mds_state_seq.erase(*p); - changed = true; - } - } else { - dout(10) << "no beacons from mds" << *p << ", assuming one " << now << dendl; - last_beacon[*p] = now; - } - } - - if (changed) - propose_pending(); - } -} - - -void MDSMonitor::do_stop() -{ - // hrm... 
- if (!mon->is_leader() || - !paxos->is_active()) { - dout(-10) << "do_stop can't stop right now, mdsmap not writeable" << dendl; - return; - } - - dout(7) << "do_stop stopping active mds nodes" << dendl; - print_map(mdsmap); - - for (map::iterator p = mdsmap.mds_state.begin(); - p != mdsmap.mds_state.end(); - ++p) { - switch (p->second) { - case MDSMap::STATE_ACTIVE: - case MDSMap::STATE_STOPPING: - pending_mdsmap.mds_state[p->first] = MDSMap::STATE_STOPPING; - break; - case MDSMap::STATE_CREATING: - case MDSMap::STATE_STANDBY: - pending_mdsmap.mds_state[p->first] = MDSMap::STATE_DNE; - break; - case MDSMap::STATE_STARTING: - pending_mdsmap.mds_state[p->first] = MDSMap::STATE_STOPPED; - break; - case MDSMap::STATE_REPLAY: - case MDSMap::STATE_RESOLVE: - case MDSMap::STATE_RECONNECT: - case MDSMap::STATE_REJOIN: - // BUG: hrm, if this is the case, the STOPPING gusy won't be able to stop, will they? - pending_mdsmap.mds_state[p->first] = MDSMap::STATE_FAILED; - break; - } - } - - propose_pending(); -} diff --git a/branches/sage/crush/mon/MDSMonitor.h b/branches/sage/crush/mon/MDSMonitor.h deleted file mode 100644 index 4c8fc91abcbf7..0000000000000 --- a/branches/sage/crush/mon/MDSMonitor.h +++ /dev/null @@ -1,98 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDSMONITOR_H -#define __MDSMONITOR_H - -#include -#include -using namespace std; - -#include "include/types.h" -#include "msg/Messenger.h" - -#include "mds/MDSMap.h" - -#include "PaxosService.h" - -class MMDSBeacon; - -class MDSMonitor : public PaxosService { - public: - // mds maps - MDSMap mdsmap; // current - bufferlist mdsmap_bl; // encoded - - MDSMap pending_mdsmap; // current + pending updates - - // my helpers - void print_map(MDSMap &m); - - class C_Updated : public Context { - MDSMonitor *mm; - int mds; - MMDSBeacon *m; - public: - C_Updated(MDSMonitor *a, int b, MMDSBeacon *c) : - mm(a), mds(b), m(c) {} - void finish(int r) { - if (r >= 0) - mm->_updated(mds, m); // success - else - mm->dispatch((Message*)m); // try again - } - }; - - - // service methods - void create_initial(); - bool update_from_paxos(); - void create_pending(); - void encode_pending(bufferlist &bl); - - void _updated(int m, MMDSBeacon *m); - - bool preprocess_query(Message *m); // true if processed. 
- bool prepare_update(Message *m); - bool should_propose(double& delay); - - void committed(); - - bool preprocess_beacon(class MMDSBeacon *m); - bool handle_beacon(class MMDSBeacon *m); - bool handle_command(class MMonCommand *m); - - // beacons - map last_beacon; - -public: - MDSMonitor(Monitor *mn, Paxos *p) : PaxosService(mn, p) { } - - // sending the map -private: - list waiting_for_map; - - void bcast_latest_mds(); - void send_full(entity_inst_t dest); - void send_to_waiting(); - -public: - void send_latest(entity_inst_t dest); - - void tick(); // check state, take actions - void do_stop(); - -}; - -#endif diff --git a/branches/sage/crush/mon/MonMap.h b/branches/sage/crush/mon/MonMap.h deleted file mode 100644 index dbe9c9b5ac5e9..0000000000000 --- a/branches/sage/crush/mon/MonMap.h +++ /dev/null @@ -1,101 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MONMAP_H -#define __MONMAP_H - -#include -#include -#include - -#include "msg/Message.h" -#include "include/types.h" - -class MonMap { - public: - epoch_t epoch; // what epoch/version of the monmap - int32_t num_mon; - vector mon_inst; - - int last_mon; // last mon i talked to - - MonMap(int s=0) : epoch(0), num_mon(s), mon_inst(s), last_mon(-1) {} - - void add_mon(entity_inst_t inst) { - mon_inst.push_back(inst); - num_mon++; - } - - // pick a mon. - // choice should be stable, unless we explicitly ask for a new one. 
- int pick_mon(bool newmon=false) { - if (newmon || (last_mon < 0)) { - last_mon = rand() % num_mon; - } - return last_mon; - } - - const entity_inst_t &get_inst(int m) { - assert(m < num_mon); - return mon_inst[m]; - } - - void encode(bufferlist& blist) { - ::_encode(epoch, blist); - ::_encode(num_mon, blist); - ::_encode(mon_inst, blist); - } - - void decode(bufferlist& blist) { - int off = 0; - ::_decode(epoch, blist, off); - ::_decode(num_mon, blist, off); - ::_decode(mon_inst, blist, off); - } - - // read from/write to a file - int write(char *fn) { - // encode - bufferlist bl; - encode(bl); - - // write - int fd = ::open(fn, O_RDWR|O_CREAT); - if (fd < 0) return fd; - ::fchmod(fd, 0644); - ::write(fd, (void*)bl.c_str(), bl.length()); - ::close(fd); - return 0; - } - - int read(char *fn) { - // read - bufferlist bl; - int fd = ::open(fn, O_RDONLY); - if (fd < 0) return fd; - struct stat st; - ::fstat(fd, &st); - bufferptr bp(st.st_size); - bl.append(bp); - ::read(fd, (void*)bl.c_str(), bl.length()); - ::close(fd); - - // decode - decode(bl); - return 0; - } - -}; - -#endif diff --git a/branches/sage/crush/mon/Monitor.cc b/branches/sage/crush/mon/Monitor.cc deleted file mode 100644 index 1db23b0270e57..0000000000000 --- a/branches/sage/crush/mon/Monitor.cc +++ /dev/null @@ -1,405 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -// TODO: missing run() method, which creates the two main timers, refreshTimer and readTimer - -#include "Monitor.h" - -#include "osd/OSDMap.h" - -#include "MonitorStore.h" - -#include "msg/Message.h" -#include "msg/Messenger.h" - -#include "messages/MPing.h" -#include "messages/MPingAck.h" -#include "messages/MGenericMessage.h" -#include "messages/MMonCommand.h" -#include "messages/MMonCommandAck.h" - -#include "messages/MMonPaxos.h" - -#include "common/Timer.h" -#include "common/Clock.h" - -#include "OSDMonitor.h" -#include "MDSMonitor.h" -#include "ClientMonitor.h" -#include "PGMonitor.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " mon" << whoami << (is_starting() ? (const char*)"(starting)":(is_leader() ? (const char*)"(leader)":(is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << whoami << (is_starting() ? (const char*)"(starting)":(is_leader() ? (const char*)"(leader)":(is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << " " - - - -void Monitor::init() -{ - lock.Lock(); - - dout(1) << "init" << dendl; - - // store - char s[80]; - sprintf(s, "mondata/mon%d", whoami); - store = new MonitorStore(s); - - if (g_conf.mkfs) - store->mkfs(); - - store->mount(); - - // create - osdmon = new OSDMonitor(this, &paxos_osdmap); - mdsmon = new MDSMonitor(this, &paxos_mdsmap); - clientmon = new ClientMonitor(this, &paxos_clientmap); - pgmon = new PGMonitor(this, &paxos_pgmap); - - // init paxos - paxos_test.init(); - paxos_osdmap.init(); - paxos_mdsmap.init(); - paxos_clientmap.init(); - paxos_pgmap.init(); - - // i'm ready! - messenger->set_dispatcher(this); - - // start ticker - reset_tick(); - - // call election? - if (monmap->num_mon > 1) { - assert(monmap->num_mon != 2); - call_election(); - } else { - // we're standalone. 
- set q; - q.insert(whoami); - win_election(1, q); - } - - lock.Unlock(); -} - -void Monitor::shutdown() -{ - dout(1) << "shutdown" << dendl; - - elector.shutdown(); - - if (is_leader()) { - // stop osds. - for (set::iterator it = osdmon->osdmap.get_osds().begin(); - it != osdmon->osdmap.get_osds().end(); - it++) { - if (osdmon->osdmap.is_down(*it)) continue; - dout(10) << "sending shutdown to osd" << *it << dendl; - messenger->send_message(new MGenericMessage(MSG_SHUTDOWN), - osdmon->osdmap.get_inst(*it)); - } - osdmon->mark_all_down(); - - // monitors too. - for (int i=0; inum_mon; i++) - if (i != whoami) - messenger->send_message(new MGenericMessage(MSG_SHUTDOWN), - monmap->get_inst(i)); - } - - // cancel all events - cancel_tick(); - timer.cancel_all(); - timer.join(); - - // unmount my local storage - if (store) - delete store; - - // clean up - if (osdmon) delete osdmon; - if (mdsmon) delete mdsmon; - if (clientmon) delete clientmon; - if (pgmon) delete pgmon; - - // die. - messenger->shutdown(); -} - - -void Monitor::call_election() -{ - if (monmap->num_mon == 1) return; - - dout(10) << "call_election" << dendl; - state = STATE_STARTING; - - // tell paxos - paxos_test.election_starting(); - paxos_mdsmap.election_starting(); - paxos_osdmap.election_starting(); - paxos_clientmap.election_starting(); - - // call a new election - elector.call_election(); -} - -void Monitor::win_election(epoch_t epoch, set& active) -{ - state = STATE_LEADER; - leader = whoami; - mon_epoch = epoch; - quorum = active; - dout(10) << "win_election, epoch " << mon_epoch << " quorum is " << quorum << dendl; - - // init paxos - paxos_test.leader_init(); - paxos_mdsmap.leader_init(); - paxos_osdmap.leader_init(); - paxos_clientmap.leader_init(); - paxos_pgmap.leader_init(); - - // init - osdmon->election_finished(); - mdsmon->election_finished(); - clientmon->election_finished(); - pgmon->election_finished(); -} - -void Monitor::lose_election(epoch_t epoch, int l) -{ - state = 
STATE_PEON; - mon_epoch = epoch; - leader = l; - dout(10) << "lose_election, epoch " << mon_epoch << " leader is mon" << leader << dendl; - - // init paxos - paxos_test.peon_init(); - paxos_mdsmap.peon_init(); - paxos_osdmap.peon_init(); - paxos_clientmap.peon_init(); - paxos_pgmap.peon_init(); - - // init - osdmon->election_finished(); - mdsmon->election_finished(); - clientmon->election_finished(); - pgmon->election_finished(); -} - - -void Monitor::handle_command(MMonCommand *m) -{ - dout(0) << "handle_command " << *m << dendl; - - int r = -1; - string rs = "unrecognized command"; - - if (!m->cmd.empty()) { - if (m->cmd[0] == "stop") { - r = 0; - rs = "stopping"; - do_stop(); - } - else if (m->cmd[0] == "mds") { - mdsmon->dispatch(m); - return; - } - else if (m->cmd[0] == "osd") { - - } - } - - // reply - messenger->send_message(new MMonCommandAck(r, rs), m->get_source_inst()); - delete m; -} - - -void Monitor::do_stop() -{ - dout(0) << "do_stop -- shutting down" << dendl; - stopping = true; - mdsmon->do_stop(); -} - - -void Monitor::dispatch(Message *m) -{ - lock.Lock(); - { - switch (m->get_type()) { - - // misc - case MSG_PING_ACK: - handle_ping_ack((MPingAck*)m); - break; - - case MSG_SHUTDOWN: - if (m->get_source().is_osd()) - osdmon->dispatch(m); - else - handle_shutdown(m); - break; - - case MSG_MON_COMMAND: - handle_command((MMonCommand*)m); - break; - - - // OSDs - case MSG_OSD_GETMAP: - case MSG_OSD_FAILURE: - case MSG_OSD_BOOT: - case MSG_OSD_IN: - case MSG_OSD_OUT: - osdmon->dispatch(m); - break; - - - // MDSs - case MSG_MDS_BEACON: - case MSG_MDS_GETMAP: - mdsmon->dispatch(m); - break; - - // clients - case MSG_CLIENT_MOUNT: - case MSG_CLIENT_UNMOUNT: - clientmon->dispatch(m); - break; - - // pg - case MSG_STATFS: - case MSG_PGSTATS: - pgmon->dispatch(m); - break; - - - // paxos - case MSG_MON_PAXOS: - { - MMonPaxos *pm = (MMonPaxos*)m; - - // sanitize - if (pm->epoch > mon_epoch) - call_election(); - if (pm->epoch != mon_epoch) { - delete pm; - 
break; - } - - // send it to the right paxos instance - switch (pm->machine_id) { - case PAXOS_TEST: - paxos_test.dispatch(m); - break; - case PAXOS_OSDMAP: - paxos_osdmap.dispatch(m); - break; - case PAXOS_MDSMAP: - paxos_mdsmap.dispatch(m); - break; - case PAXOS_CLIENTMAP: - paxos_clientmap.dispatch(m); - break; - default: - assert(0); - } - } - break; - - // elector messages - case MSG_MON_ELECTION: - elector.dispatch(m); - break; - - - default: - dout(0) << "unknown message " << m << " " << *m << " from " << m->get_source_inst() << dendl; - assert(0); - } - } - lock.Unlock(); -} - - -void Monitor::handle_shutdown(Message *m) -{ - assert(m->get_source().is_mon()); - if (m->get_source().num() == get_leader()) { - dout(1) << "shutdown from leader " << m->get_source() << dendl; - shutdown(); - } else { - dout(1) << "ignoring shutdown from non-leader " << m->get_source() << dendl; - } - delete m; -} - -void Monitor::handle_ping_ack(MPingAck *m) -{ - // ... - - delete m; -} - - - - -/************ TICK ***************/ - -class C_Mon_Tick : public Context { - Monitor *mon; -public: - C_Mon_Tick(Monitor *m) : mon(m) {} - void finish(int r) { - mon->tick(); - } -}; - -void Monitor::cancel_tick() -{ - if (tick_timer) timer.cancel_event(tick_timer); -} - -void Monitor::reset_tick() -{ - cancel_tick(); - tick_timer = new C_Mon_Tick(this); - timer.add_event_after(g_conf.mon_tick_interval, tick_timer); -} - - -void Monitor::tick() -{ - tick_timer = 0; - - // ok go. - dout(11) << "tick" << dendl; - - osdmon->tick(); - mdsmon->tick(); - - // next tick! 
- reset_tick(); -} - - - - - - - diff --git a/branches/sage/crush/mon/Monitor.h b/branches/sage/crush/mon/Monitor.h deleted file mode 100644 index bd278a2092308..0000000000000 --- a/branches/sage/crush/mon/Monitor.h +++ /dev/null @@ -1,154 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MONITOR_H -#define __MONITOR_H - -#include "include/types.h" -#include "msg/Messenger.h" - -#include "common/Timer.h" - -#include "MonMap.h" -#include "Elector.h" -#include "Paxos.h" - - -class MonitorStore; -class OSDMonitor; -class MDSMonitor; -class ClientMonitor; -class PGMonitor; - -class Monitor : public Dispatcher { -public: - // me - int whoami; - Messenger *messenger; - Mutex lock; - - MonMap *monmap; - - // timer. 
- SafeTimer timer; - Context *tick_timer; - void cancel_tick(); - void reset_tick(); - friend class C_Mon_Tick; - - // -- local storage -- -public: - MonitorStore *store; - - // -- monitor state -- -private: - const static int STATE_STARTING = 0; // electing - const static int STATE_LEADER = 1; - const static int STATE_PEON = 2; - int state; - bool stopping; - -public: - bool is_starting() { return state == STATE_STARTING; } - bool is_leader() { return state == STATE_LEADER; } - bool is_peon() { return state == STATE_PEON; } - bool is_stopping() { return stopping; } - - - // -- elector -- -private: - Elector elector; - friend class Elector; - - epoch_t mon_epoch; // monitor epoch (election instance) - int leader; // current leader (to best of knowledge) - set quorum; // current active set of monitors (if !starting) - utime_t last_called_election; // [starting] last time i called an election - -public: - epoch_t get_epoch() { return mon_epoch; } - int get_leader() { return leader; } - const set& get_quorum() { return quorum; } - - void call_election(); // initiate election - void win_election(epoch_t epoch, set& q); // end election (called by Elector) - void lose_election(epoch_t epoch, int l); // end election (called by Elector) - - - // -- paxos -- - Paxos paxos_test; - Paxos paxos_mdsmap; - Paxos paxos_osdmap; - Paxos paxos_clientmap; - Paxos paxos_pgmap; - friend class Paxos; - - - // -- services -- - OSDMonitor *osdmon; - MDSMonitor *mdsmon; - ClientMonitor *clientmon; - PGMonitor *pgmon; - - friend class OSDMonitor; - friend class MDSMonitor; - friend class ClientMonitor; - friend class PGMonitor; - - - // messages - void handle_shutdown(Message *m); - void handle_ping_ack(class MPingAck *m); - void handle_command(class MMonCommand *m); - - - - public: - Monitor(int w, Messenger *m, MonMap *mm) : - whoami(w), - messenger(m), - monmap(mm), - timer(lock), tick_timer(0), - store(0), - - state(STATE_STARTING), stopping(false), - - elector(this, w), - mon_epoch(0), 
- leader(0), - - paxos_test(this, w, PAXOS_TEST), - paxos_mdsmap(this, w, PAXOS_MDSMAP), - paxos_osdmap(this, w, PAXOS_OSDMAP), - paxos_clientmap(this, w, PAXOS_CLIENTMAP), - paxos_pgmap(this, w, PAXOS_PGMAP), - - osdmon(0), mdsmon(0), clientmon(0) - { - } - ~Monitor() { - delete messenger; - } - - void init(); - void shutdown(); - void dispatch(Message *m); - void tick(); - - void do_stop(); - -}; - -#endif diff --git a/branches/sage/crush/mon/MonitorStore.cc b/branches/sage/crush/mon/MonitorStore.cc deleted file mode 100644 index 86df22bcd6590..0000000000000 --- a/branches/sage/crush/mon/MonitorStore.cc +++ /dev/null @@ -1,222 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "MonitorStore.h" -#include "common/Clock.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " store(" << dir <<") " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " store(" << dir <<") " - -#include -#include -#include -#include -#include - -void MonitorStore::mount() -{ - dout(1) << "mount" << dendl; - // verify dir exists - DIR *d = ::opendir(dir.c_str()); - if (!d) { - derr(1) << "basedir " << dir << " dne" << dendl; - assert(0); - } - ::closedir(d); - - if (g_conf.use_abspaths) { - // combine it with the cwd, in case fuse screws things up (i.e. 
fakefuse) - string old = dir; - char *cwd = get_current_dir_name(); - dir = cwd; - free(cwd); - dir += "/"; - dir += old; - } -} - - -void MonitorStore::mkfs() -{ - dout(1) << "mkfs" << dendl; - - char cmd[200]; - sprintf(cmd, "test -d %s && /bin/rm -r %s ; mkdir -p %s", dir.c_str(), dir.c_str(), dir.c_str()); - dout(1) << cmd << dendl; - system(cmd); -} - - -version_t MonitorStore::get_int(const char *a, const char *b) -{ - char fn[200]; - if (b) - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - else - sprintf(fn, "%s/%s", dir.c_str(), a); - - FILE *f = ::fopen(fn, "r"); - if (!f) - return 0; - - char buf[20]; - ::fgets(buf, 20, f); - ::fclose(f); - - version_t val = atoi(buf); - - if (b) { - dout(15) << "get_int " << a << "/" << b << " = " << val << dendl; - } else { - dout(15) << "get_int " << a << " = " << val << dendl; - } - return val; -} - - -void MonitorStore::put_int(version_t val, const char *a, const char *b) -{ - char fn[200]; - sprintf(fn, "%s/%s", dir.c_str(), a); - if (b) { - ::mkdir(fn, 0755); - dout(15) << "set_int " << a << "/" << b << " = " << val << dendl; - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - } else { - dout(15) << "set_int " << a << " = " << val << dendl; - } - - char vs[30]; -#ifdef __LP64__ - sprintf(vs, "%ld\n", val); -#else - sprintf(vs, "%lld\n", val); -#endif - - char tfn[200]; - sprintf(tfn, "%s.new", fn); - - int fd = ::open(tfn, O_WRONLY|O_CREAT, 0644); - assert(fd > 0); - ::write(fd, vs, strlen(vs)); - ::close(fd); - ::rename(tfn, fn); -} - - -// ---------------------------------------- -// buffers - -bool MonitorStore::exists_bl_ss(const char *a, const char *b) -{ - char fn[200]; - if (b) { - dout(15) << "exists_bl " << a << "/" << b << dendl; - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - } else { - dout(15) << "exists_bl " << a << dendl; - sprintf(fn, "%s/%s", dir.c_str(), a); - } - - struct stat st; - int r = ::stat(fn, &st); - //dout(15) << "exists_bl stat " << fn << " r=" << r << " errno " << errno << " " << 
strerror(errno) << dendl; - return r == 0; -} - - -int MonitorStore::get_bl_ss(bufferlist& bl, const char *a, const char *b) -{ - char fn[200]; - if (b) { - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - } else { - sprintf(fn, "%s/%s", dir.c_str(), a); - } - - int fd = ::open(fn, O_RDONLY); - if (!fd) { - if (b) { - dout(15) << "get_bl " << a << "/" << b << " DNE" << dendl; - } else { - dout(15) << "get_bl " << a << " DNE" << dendl; - } - return 0; - } - - // get size - struct stat st; - int rc = ::fstat(fd, &st); - assert(rc == 0); - __int32_t len = st.st_size; - - // read buffer - bl.clear(); - bufferptr bp(len); - int off = 0; - while (off < len) { - dout(20) << "reading at off " << off << " of " << len << dendl; - int r = ::read(fd, bp.c_str()+off, len-off); - if (r < 0) derr(0) << "errno on read " << strerror(errno) << dendl; - assert(r>0); - off += r; - } - bl.append(bp); - ::close(fd); - - if (b) { - dout(15) << "get_bl " << a << "/" << b << " = " << bl.length() << " bytes" << dendl; - } else { - dout(15) << "get_bl " << a << " = " << bl.length() << " bytes" << dendl; - } - - return len; -} - -int MonitorStore::put_bl_ss(bufferlist& bl, const char *a, const char *b) -{ - char fn[200]; - sprintf(fn, "%s/%s", dir.c_str(), a); - if (b) { - ::mkdir(fn, 0755); - dout(15) << "put_bl " << a << "/" << b << " = " << bl.length() << " bytes" << dendl; - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - } else { - dout(15) << "put_bl " << a << " = " << bl.length() << " bytes" << dendl; - } - - char tfn[200]; - sprintf(tfn, "%s.new", fn); - int fd = ::open(tfn, O_WRONLY|O_CREAT, 0644); - assert(fd); - - // write data - for (list::const_iterator it = bl.buffers().begin(); - it != bl.buffers().end(); - it++) { - int r = ::write(fd, it->c_str(), it->length()); - if (r != (int)it->length()) - derr(0) << "put_bl_ss ::write() returned " << r << " not " << it->length() << dendl; - if (r < 0) - derr(0) << "put_bl_ss ::write() errored out, errno is " << strerror(errno) << dendl; - } - 
- ::fsync(fd); - ::close(fd); - ::rename(tfn, fn); - - return 0; -} diff --git a/branches/sage/crush/mon/OSDMonitor.cc b/branches/sage/crush/mon/OSDMonitor.cc deleted file mode 100644 index 8c8fb91b2b18c..0000000000000 --- a/branches/sage/crush/mon/OSDMonitor.cc +++ /dev/null @@ -1,847 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "OSDMonitor.h" -#include "Monitor.h" -#include "MDSMonitor.h" - -#include "MonitorStore.h" - -#include "crush/CrushWrapper.h" - -#include "messages/MOSDFailure.h" -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" -#include "messages/MOSDBoot.h" -#include "messages/MOSDIn.h" -#include "messages/MOSDOut.h" - -#include "messages/MMonOSDMapInfo.h" -#include "messages/MMonOSDMapLease.h" -#include "messages/MMonOSDMapLeaseAck.h" -#include "messages/MMonOSDMapUpdatePrepare.h" -#include "messages/MMonOSDMapUpdateAck.h" -#include "messages/MMonOSDMapUpdateCommit.h" - -#include "common/Timer.h" - -#include "config.h" - - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".osd e" << osdmap.get_epoch() << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? 
(const char*)"(peon)":(const char*)"(?\?)"))) << ".osd e" << osdmap.get_epoch() << " " - - -// FAKING - -class C_Mon_FakeOSDFailure : public Context { - OSDMonitor *mon; - int osd; - bool down; -public: - C_Mon_FakeOSDFailure(OSDMonitor *m, int o, bool d) : mon(m), osd(o), down(d) {} - void finish(int r) { - mon->fake_osd_failure(osd,down); - } -}; - -void OSDMonitor::fake_osd_failure(int osd, bool down) -{ - if (down) { - dout(1) << "fake_osd_failure DOWN osd" << osd << dendl; - pending_inc.new_down[osd].first = osdmap.osd_inst[osd]; - pending_inc.new_down[osd].second = false; - } else { - dout(1) << "fake_osd_failure OUT osd" << osd << dendl; - pending_inc.new_out.push_back(osd); - } - propose_pending(); - - // fixme - //bcast_latest_osd(); - //bcast_latest_mds(); -} - -void OSDMonitor::fake_osdmap_update() -{ - dout(1) << "fake_osdmap_update" << dendl; - propose_pending(); - - // tell a random osd - int osd = rand() % g_conf.num_osd; - send_latest(osdmap.get_inst(osd)); -} - - -void OSDMonitor::fake_reorg() -{ - int r = rand() % g_conf.num_osd; - - if (osdmap.is_out(r)) { - dout(1) << "fake_reorg marking osd" << r << " in" << dendl; - pending_inc.new_in.push_back(r); - } else { - dout(1) << "fake_reorg marking osd" << r << " out" << dendl; - pending_inc.new_out.push_back(r); - } - - propose_pending(); - send_latest(osdmap.get_inst(r)); // after -} - - - -/************ MAPS ****************/ - -void OSDMonitor::create_initial() -{ - assert(mon->is_leader()); - assert(paxos->get_version() == 0); - - dout(1) << "create_initial -- creating initial osdmap from g_conf" << dendl; - - // - OSDMap newmap; - newmap.mon_epoch = mon->mon_epoch; - newmap.ctime = g_clock.now(); - - newmap.set_pg_num(g_conf.num_osd << g_conf.osd_pg_bits); - - // start at epoch 1 until all osds boot - newmap.inc_epoch(); // = 1 - assert(newmap.get_epoch() == 1); - - map weights; - build_crush_map(newmap.crush, weights); - - for (int i=0; i - - // fake osd failures - for (map::iterator i = 
g_fake_osd_down.begin(); - i != g_fake_osd_down.end(); - i++) { - dout(0) << "will fake osd" << i->first << " DOWN after " << i->second << dendl; - mon->timer.add_event_after(i->second, new C_Mon_FakeOSDFailure(this, i->first, 1)); - } - for (map::iterator i = g_fake_osd_out.begin(); - i != g_fake_osd_out.end(); - i++) { - dout(0) << "will fake osd" << i->first << " OUT after " << i->second << dendl; - mon->timer.add_event_after(i->second, new C_Mon_FakeOSDFailure(this, i->first, 0)); - } - - // encode into pending incremental - newmap.encode(pending_inc.fullmap); -} - - -void OSDMonitor::build_crush_map(CrushWrapper& crush, - map& weights) -{ - // new - crush.create(); - - if (g_conf.num_osd >= 12) { - int ndom = g_conf.osd_max_rep; - int ritems[ndom]; - int rweights[ndom]; - - int nper = ((g_conf.num_osd - 1) / ndom) + 1; - derr(0) << ndom << " failure domains, " << nper << " osds each" << dendl; - - int o = 0; - for (int i=0; i= i) { - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, nroot)); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE_INDEP, i, 1)); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE_INDEP, 1, 0)); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - } else { - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, nroot)); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE_INDEP, i, 0)); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - } - } - */ - - // test - //vector out; - //pg_to_osds(0x40200000110ULL, out); - - } else { - // one bucket - - int items[g_conf.num_osd]; - for (int i=0; imax_devices << dendl; - //vector t; - //crush.do_rule(2, 132, t, 4, -1); -} - - -bool OSDMonitor::update_from_paxos() -{ - assert(paxos->is_active()); - - version_t paxosv = paxos->get_version(); - if (paxosv == osdmap.epoch) return true; - assert(paxosv >= osdmap.epoch); - - dout(15) << "update_from_paxos paxos e " << paxosv - << ", my e " << osdmap.epoch << dendl; - - if (osdmap.epoch == 0 && 
paxosv > 1) { - // startup: just load latest full map - epoch_t lastfull = mon->store->get_int("osdmap_full","last_epoch"); - if (lastfull) { - dout(7) << "update_from_paxos startup: loading latest full map e" << lastfull << dendl; - bufferlist bl; - mon->store->get_bl_sn(bl, "osdmap_full", lastfull); - osdmap.decode(bl); - } - } - - // walk through incrementals - while (paxosv > osdmap.epoch) { - bufferlist bl; - bool success = paxos->read(osdmap.epoch+1, bl); - assert(success); - - dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1 << dendl; - OSDMap::Incremental inc; - int off = 0; - inc.decode(bl, off); - osdmap.apply_incremental(inc); - - // write out the full map, too. - bl.clear(); - osdmap.encode(bl); - mon->store->put_bl_sn(bl, "osdmap_full", osdmap.epoch); - - // share - dout(1) << osdmap.osds.size() << " osds, " - << osdmap.down_osds.size() << " down, " - << osdmap.out_osds.size() << " out" - << dendl; - } - mon->store->put_int(osdmap.epoch, "osdmap_full","last_epoch"); - - // new map! 
- bcast_latest_mds(); - send_to_waiting(); - - return true; -} - - -void OSDMonitor::create_pending() -{ - pending_inc = OSDMap::Incremental(osdmap.epoch+1); - dout(10) << "create_pending e " << pending_inc.epoch - << dendl; -} - -void OSDMonitor::encode_pending(bufferlist &bl) -{ - dout(10) << "encode_pending e " << pending_inc.epoch - << dendl; - - // finish up pending_inc - pending_inc.ctime = g_clock.now(); - pending_inc.mon_epoch = mon->mon_epoch; - - // tell me about it - for (map >::iterator i = pending_inc.new_down.begin(); - i != pending_inc.new_down.end(); - i++) { - dout(2) << " osd" << i->first << " DOWN " << i->second.first << " clean=" << i->second.second << dendl; - derr(0) << " osd" << i->first << " DOWN " << i->second.first << " clean=" << i->second.second << dendl; - mon->messenger->mark_down(i->second.first.addr); - } - for (map::iterator i = pending_inc.new_up.begin(); - i != pending_inc.new_up.end(); - i++) { - dout(2) << " osd" << i->first << " UP " << i->second << dendl; - derr(0) << " osd" << i->first << " UP " << i->second << dendl; - } - for (list::iterator i = pending_inc.new_out.begin(); - i != pending_inc.new_out.end(); - i++) { - dout(2) << " osd" << *i << " OUT" << dendl; - derr(0) << " osd" << *i << " OUT" << dendl; - } - for (list::iterator i = pending_inc.new_in.begin(); - i != pending_inc.new_in.end(); - i++) { - dout(2) << " osd" << *i << " IN" << dendl; - derr(0) << " osd" << *i << " IN" << dendl; - } - - // encode - assert(paxos->get_version() + 1 == pending_inc.epoch); - pending_inc.encode(bl); -} - - -void OSDMonitor::committed() -{ - -} - - -// ------------- - -bool OSDMonitor::preprocess_query(Message *m) -{ - dout(10) << "preprocess_query " << *m << " from " << m->get_source_inst() << dendl; - - switch (m->get_type()) { - // READs - case MSG_OSD_GETMAP: - handle_osd_getmap((MOSDGetMap*)m); - return true; - - // damp updates - case MSG_OSD_FAILURE: - return preprocess_failure((MOSDFailure*)m); - case MSG_OSD_BOOT: - return 
preprocess_boot((MOSDBoot*)m); - /* - case MSG_OSD_IN: - return preprocess_in((MOSDIn*)m); - case MSG_OSD_OUT: - return preprocess_out((MOSDOut*)m); - */ - - default: - assert(0); - delete m; - return true; - } -} - -bool OSDMonitor::prepare_update(Message *m) -{ - dout(7) << "prepare_update " << *m << " from " << m->get_source_inst() << dendl; - - switch (m->get_type()) { - // damp updates - case MSG_OSD_FAILURE: - return prepare_failure((MOSDFailure*)m); - case MSG_OSD_BOOT: - return prepare_boot((MOSDBoot*)m); - - /* - case MSG_OSD_IN: - return prepare_in((MOSDIn*)m); - case MSG_OSD_OUT: - return prepare_out((MOSDOut*)m); - */ - - default: - assert(0); - delete m; - } - - return false; -} - -bool OSDMonitor::should_propose(double& delay) -{ - if (osdmap.epoch == 1) { - if (pending_inc.new_up.size() == osdmap.get_osds().size()) { - delay = 0.0; - if (g_conf.osd_auto_weight) { - CrushWrapper crush; - build_crush_map(crush, osd_weight); - crush._encode(pending_inc.crush); - } - return true; - } else - return false; - } - return PaxosService::should_propose(delay); -} - - - -// --------------------------- -// READs - -void OSDMonitor::handle_osd_getmap(MOSDGetMap *m) -{ - dout(7) << "handle_osd_getmap from " << m->get_source() << " from " << m->get_start_epoch() << dendl; - - if (m->get_start_epoch()) { - if (m->get_want_epoch() <= osdmap.get_epoch()) - send_incremental(m->get_source_inst(), m->get_start_epoch()); - else - waiting_for_map[m->get_source_inst()] = pair(m->get_start_epoch(), - m->get_want_epoch()); - } else - send_full(m->get_source_inst()); - - delete m; -} - - - -// --------------------------- -// UPDATEs - -// failure -- - -bool OSDMonitor::preprocess_failure(MOSDFailure *m) -{ - int badboy = m->get_failed().name.num(); - - // weird? 
- if (!osdmap.have_inst(badboy)) { - dout(5) << "preprocess_failure dne(/dup?): " << m->get_failed() << ", from " << m->get_from() << dendl; - send_incremental(m->get_from(), m->get_epoch()+1); - return true; - } - if (osdmap.get_inst(badboy) != m->get_failed()) { - dout(5) << "preprocess_failure wrong osd: report " << m->get_failed() << " != map's " << osdmap.get_inst(badboy) - << ", from " << m->get_from() << dendl; - send_incremental(m->get_from(), m->get_epoch()+1); - return true; - } - // already reported? - if (osdmap.is_down(badboy)) { - dout(5) << "preprocess_failure dup: " << m->get_failed() << ", from " << m->get_from() << dendl; - send_incremental(m->get_from(), m->get_epoch()+1); - return true; - } - - dout(10) << "preprocess_failure new: " << m->get_failed() << ", from " << m->get_from() << dendl; - return false; -} - -bool OSDMonitor::prepare_failure(MOSDFailure *m) -{ - dout(1) << "prepare_failure " << m->get_failed() << " from " << m->get_from() << dendl; - - // FIXME - // take their word for it - int badboy = m->get_failed().name.num(); - assert(osdmap.is_up(badboy)); - assert(osdmap.osd_inst[badboy] == m->get_failed()); - - pending_inc.new_down[badboy].first = m->get_failed(); - pending_inc.new_down[badboy].second = false; - - if (osdmap.is_in(badboy)) - down_pending_out[badboy] = g_clock.now(); - - paxos->wait_for_commit(new C_Reported(this, m)); - - return true; -} - -void OSDMonitor::_reported_failure(MOSDFailure *m) -{ - dout(7) << "_reported_failure on " << m->get_failed() << ", telling " << m->get_from() << dendl; - send_latest(m->get_from(), m->get_epoch()); -} - - -// boot -- - -bool OSDMonitor::preprocess_boot(MOSDBoot *m) -{ - assert(m->inst.name.is_osd()); - int from = m->inst.name.num(); - - // already booted? - if (osdmap.is_up(from) && - osdmap.get_inst(from) == m->inst) { - // yup. 
- dout(7) << "preprocess_boot dup from " << m->inst << dendl; - _booted(m); - return true; - } - - dout(10) << "preprocess_boot from " << m->inst << dendl; - return false; -} - -bool OSDMonitor::prepare_boot(MOSDBoot *m) -{ - dout(7) << "prepare_boot from " << m->inst << dendl; - assert(m->inst.name.is_osd()); - int from = m->inst.name.num(); - - // does this osd exist? - if (!osdmap.exists(from)) { - dout(1) << "boot from non-existent osd" << from << dendl; - delete m; - return true; - } - - // already up? mark down first? - if (osdmap.is_up(from)) { - dout(7) << "prepare_boot was up, first marking down " << osdmap.get_inst(from) << dendl; - assert(osdmap.get_inst(from) != m->inst); // preproces should have caught it - - // mark previous guy down - pending_inc.new_down[from].first = osdmap.osd_inst[from]; - pending_inc.new_down[from].second = false; - - paxos->wait_for_commit(new C_RetryMessage(this, m)); - } else { - // mark new guy up. - down_pending_out.erase(from); // if any - pending_inc.new_up[from] = m->inst; - - // mark in? 
- if (osdmap.out_osds.count(from)) - pending_inc.new_in.push_back(from); - - osd_weight[from] = m->sb.weight; - - // wait - paxos->wait_for_commit(new C_Booted(this, m)); - } - return true; -} - -void OSDMonitor::_booted(MOSDBoot *m) -{ - dout(7) << "_booted " << m->inst << " w " << m->sb.weight << dendl; - send_latest(m->inst, m->sb.current_epoch); - delete m; -} - - - - - -// --------------- -// map helpers - -void OSDMonitor::send_to_waiting() -{ - dout(10) << "send_to_waiting " << osdmap.get_epoch() << dendl; - - map >::iterator i = waiting_for_map.begin(); - while (i != waiting_for_map.end()) { - if (i->second.first) { - if (i->second.second <= osdmap.get_epoch()) - send_incremental(i->first, i->second.first); - else { - dout(10) << "send_to_waiting skipping " << i->first - << " has " << i->second.first - << " wants " << i->second.second - << dendl; - i++; - continue; - } - } else - send_full(i->first); - - waiting_for_map.erase(i++); - } -} - -void OSDMonitor::send_latest(entity_inst_t who, epoch_t start) -{ - if (paxos->is_readable()) { - dout(5) << "send_latest to " << who << " now" << dendl; - if (start == 0) - send_full(who); - else - send_incremental(who, start); - } else { - dout(5) << "send_latest to " << who << " later" << dendl; - waiting_for_map[who] = pair(start, 0); - } -} - - -void OSDMonitor::send_full(entity_inst_t who) -{ - dout(5) << "send_full to " << who << dendl; - mon->messenger->send_message(new MOSDMap(&osdmap), who); -} - -void OSDMonitor::send_incremental(entity_inst_t dest, epoch_t from) -{ - dout(5) << "send_incremental from " << from << " -> " << osdmap.get_epoch() - << " to " << dest << dendl; - - MOSDMap *m = new MOSDMap; - - for (epoch_t e = osdmap.get_epoch(); - e >= from; - e--) { - bufferlist bl; - if (mon->store->get_bl_sn(bl, "osdmap", e) > 0) { - dout(20) << "send_incremental inc " << e << " " << bl.length() << " bytes" << dendl; - m->incremental_maps[e] = bl; - } - else if (mon->store->get_bl_sn(bl, "osdmap_full", e) > 0) 
{ - dout(20) << "send_incremental full " << e << dendl; - m->maps[e] = bl; - } - else { - assert(0); // we should have all maps. - } - } - - mon->messenger->send_message(m, dest); -} - - -void OSDMonitor::bcast_latest_mds() -{ - epoch_t e = osdmap.get_epoch(); - dout(1) << "bcast_latest_mds epoch " << e << dendl; - - // tell mds - set up; - mon->mdsmon->mdsmap.get_up_mds_set(up); - for (set::iterator i = up.begin(); - i != up.end(); - i++) { - send_incremental(mon->mdsmon->mdsmap.get_inst(*i), osdmap.get_epoch()); - } -} - -void OSDMonitor::bcast_latest_osd() -{ - epoch_t e = osdmap.get_epoch(); - dout(1) << "bcast_latest_osd epoch " << e << dendl; - - // tell osds - set osds; - osdmap.get_all_osds(osds); - for (set::iterator it = osds.begin(); - it != osds.end(); - it++) { - if (osdmap.is_down(*it)) continue; - - send_incremental(osdmap.get_inst(*it), osdmap.get_epoch()); - } -} - -void OSDMonitor::bcast_full_osd() -{ - epoch_t e = osdmap.get_epoch(); - dout(1) << "bcast_full_osd epoch " << e << dendl; - - // tell osds - set osds; - osdmap.get_all_osds(osds); - for (set::iterator it = osds.begin(); - it != osds.end(); - it++) { - if (osdmap.is_down(*it)) continue; - send_full(osdmap.get_inst(*it)); - } -} - - -// TICK - - -void OSDMonitor::tick() -{ - // mark down osds out? 
- utime_t now = g_clock.now(); - list mark_out; - for (map::iterator i = down_pending_out.begin(); - i != down_pending_out.end(); - i++) { - utime_t down = now; - down -= i->second; - - if (down.sec() >= g_conf.mon_osd_down_out_interval) { - dout(10) << "tick marking osd" << i->first << " OUT after " << down << " sec" << dendl; - mark_out.push_back(i->first); - } - } - for (list::iterator i = mark_out.begin(); - i != mark_out.end(); - i++) { - down_pending_out.erase(*i); - pending_inc.new_out.push_back( *i ); - } - if (!mark_out.empty()) { - propose_pending(); - } -} - - - - - -/* -void OSDMonitor::init() -{ - // start with blank map - - // load my last state from the store - bufferlist bl; - if (get_map_bl(0, bl)) { // FIXME - // yay! - osdmap.decode(bl); - dout(1) << "init got epoch " << osdmap.get_epoch() << " from store" << dendl; - - // set up pending_inc - pending_inc.epoch = osdmap.get_epoch()+1; - } -} -*/ - - - - -void OSDMonitor::mark_all_down() -{ - assert(mon->is_leader()); - - dout(7) << "mark_all_down" << dendl; - - for (set::iterator it = osdmap.get_osds().begin(); - it != osdmap.get_osds().end(); - it++) { - if (osdmap.is_down(*it)) continue; - pending_inc.new_down[*it].first = osdmap.get_inst(*it); - pending_inc.new_down[*it].second = true; // FIXME: am i sure it's clean? we need a proper osd shutdown sequence! 
- } - - propose_pending(); -} - - - - - - - - - - - - - - - -/* - - -void OSDMonitor::election_finished() -{ - dout(10) << "election_finished" << dendl; - - if (mon->is_leader()) { - if (g_conf.mkfs) { - create_initial(); - save_map(); - } else { - // - epoch_t epoch = mon->store->get_int("osd_epoch"); - dout(10) << " last epoch was " << epoch << dendl; - bufferlist bl, blinc; - int r = mon->store->get_bl_sn(bl, "osdmap_full", epoch); - assert(r>0); - osdmap.decode(bl); - - // pending_inc - pending_inc.epoch = epoch+1; - } - - } - -} - - - -*/ diff --git a/branches/sage/crush/mon/OSDMonitor.h b/branches/sage/crush/mon/OSDMonitor.h deleted file mode 100644 index c22c007f2d9b6..0000000000000 --- a/branches/sage/crush/mon/OSDMonitor.h +++ /dev/null @@ -1,131 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __OSDMONITOR_H -#define __OSDMONITOR_H - -#include -#include -using namespace std; - -#include "include/types.h" -#include "msg/Messenger.h" - -#include "osd/OSDMap.h" - -#include "PaxosService.h" - -class Monitor; -class MOSDBoot; - -class OSDMonitor : public PaxosService { -public: - OSDMap osdmap; - -private: - map > waiting_for_map; // who -> (has, wants) - - // [leader] - OSDMap::Incremental pending_inc; - map down_pending_out; // osd down -> out - - map osd_weight; - - void build_crush_map(CrushWrapper& crush, - map& weights); - - // svc - void create_initial(); - bool update_from_paxos(); - void create_pending(); // prepare a new pending - void encode_pending(bufferlist &bl); - - void committed(); - - void handle_query(Message *m); - bool preprocess_query(Message *m); // true if processed. - bool prepare_update(Message *m); - bool should_propose(double &delay); - - // ... - bool get_map_bl(epoch_t epoch, bufferlist &bl); - bool get_inc_map_bl(epoch_t epoch, bufferlist &bl); - - void send_to_waiting(); // send current map to waiters. 
- void send_full(entity_inst_t dest); - void send_incremental(entity_inst_t dest, epoch_t since); - void bcast_latest_mds(); - void bcast_latest_osd(); - void bcast_full_osd(); - - void handle_osd_getmap(class MOSDGetMap *m); - - bool preprocess_failure(class MOSDFailure *m); - bool prepare_failure(class MOSDFailure *m); - void _reported_failure(MOSDFailure *m); - - bool preprocess_boot(class MOSDBoot *m); - bool prepare_boot(class MOSDBoot *m); - void _booted(MOSDBoot *m); - - class C_Booted : public Context { - OSDMonitor *cmon; - MOSDBoot *m; - public: - C_Booted(OSDMonitor *cm, MOSDBoot *m_) : - cmon(cm), m(m_) {} - void finish(int r) { - if (r >= 0) - cmon->_booted(m); - else - cmon->dispatch((Message*)m); - } - }; - class C_Reported : public Context { - OSDMonitor *cmon; - MOSDFailure *m; - public: - C_Reported(OSDMonitor *cm, MOSDFailure *m_) : - cmon(cm), m(m_) {} - void finish(int r) { - if (r >= 0) - cmon->_reported_failure(m); - else - cmon->dispatch((Message*)m); - } - }; - - bool preprocess_in(class MOSDIn *m); - bool prepare_in(class MOSDIn *m); - - bool preprocess_out(class MOSDOut *m); - bool prepare_out(class MOSDOut *m); - - public: - OSDMonitor(Monitor *mn, Paxos *p) : - PaxosService(mn, p) { } - - void tick(); // check state, take actions - - void mark_all_down(); - - void send_latest(entity_inst_t i, epoch_t start=0); - - void fake_osd_failure(int osd, bool down); - void fake_osdmap_update(); - void fake_reorg(); -}; - -#endif diff --git a/branches/sage/crush/mon/PGMonitor.cc b/branches/sage/crush/mon/PGMonitor.cc deleted file mode 100644 index 6e571fea7f612..0000000000000 --- a/branches/sage/crush/mon/PGMonitor.cc +++ /dev/null @@ -1,219 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser 
General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "PGMonitor.h" -#include "Monitor.h" -#include "MDSMonitor.h" -#include "OSDMonitor.h" -#include "MonitorStore.h" - -#include "messages/MPGStats.h" - -#include "messages/MStatfs.h" -#include "messages/MStatfsReply.h" - -#include "common/Timer.h" - -#include "osd/osd_types.h" -#include "osd/PG.h" // yuck - -#include "config.h" -#include - - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".pg " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".pg " - - - -void PGMonitor::create_initial() -{ - dout(1) << "create_initial -- creating initial map" << dendl; -} - -bool PGMonitor::update_from_paxos() -{ - version_t paxosv = paxos->get_version(); - if (paxosv == pg_map.version) return true; - assert(paxosv >= pg_map.version); - - if (pg_map.version == 0 && paxosv > 1 && - mon->store->exists_bl_ss("pgmap","latest")) { - // starting up: load latest - dout(7) << "update_from_paxos startup: loading latest full pgmap" << dendl; - bufferlist bl; - mon->store->get_bl_ss(bl, "pgmap", "latest"); - int off = 0; - pg_map._decode(bl, off); - } - - // walk through incrementals - while (paxosv > pg_map.version) { - bufferlist bl; - bool success = paxos->read(pg_map.version+1, bl); - if (success) { - dout(7) << "update_from_paxos applying incremental " << pg_map.version+1 << dendl; - PGMap::Incremental inc; - int off = 0; - inc._decode(bl, off); - pg_map.apply_incremental(inc); - - std::stringstream 
ss; - for (hash_map::iterator p = pg_map.num_pg_by_state.begin(); - p != pg_map.num_pg_by_state.end(); - ++p) { - if (p != pg_map.num_pg_by_state.begin()) - ss << ", "; - ss << p->second << " " << PG::get_state_string(p->first) << "(" << p->first << ")"; - } - string states = ss.str(); - dout(0) << "v" << pg_map.version << " " << states << dendl; - - } else { - dout(7) << "update_from_paxos couldn't read incremental " << pg_map.version+1 << dendl; - return false; - } - } - - // save latest - bufferlist bl; - pg_map._encode(bl); - mon->store->put_bl_ss(bl, "pgmap", "latest"); - - return true; -} - -void PGMonitor::create_pending() -{ - pending_inc = PGMap::Incremental(); - pending_inc.version = pg_map.version + 1; - dout(10) << "create_pending v " << pending_inc.version << dendl; -} - -void PGMonitor::encode_pending(bufferlist &bl) -{ - assert(mon->is_leader()); - dout(10) << "encode_pending v " << pending_inc.version << dendl; - assert(paxos->get_version() + 1 == pending_inc.version); - pending_inc._encode(bl); -} - -bool PGMonitor::preprocess_query(Message *m) -{ - dout(10) << "preprocess_query " << *m << " from " << m->get_source_inst() << dendl; - - switch (m->get_type()) { - case MSG_STATFS: - handle_statfs((MStatfs*)m); - return true; - - case MSG_PGSTATS: - { - MPGStats *stats = (MPGStats*)m; - for (map::iterator p = stats->pg_stat.begin(); - p != stats->pg_stat.end(); - p++) { - if (pg_map.pg_stat.count(p->first) == 0 || - pg_map.pg_stat[p->first].reported < p->second.reported) - return false; - } - dout(10) << " message contains no new pg stats" << dendl; - return true; - } - - default: - assert(0); - delete m; - return true; - } -} - -bool PGMonitor::prepare_update(Message *m) -{ - dout(10) << "prepare_update " << *m << " from " << m->get_source_inst() << dendl; - switch (m->get_type()) { - case MSG_PGSTATS: - return handle_pg_stats((MPGStats*)m); - - default: - assert(0); - delete m; - return false; - } -} - -void PGMonitor::committed() -{ - -} - -void 
PGMonitor::handle_statfs(MStatfs *statfs) -{ - dout(10) << "handle_statfs " << *statfs << " from " << statfs->get_source() << dendl; - - // fill out stfs - MStatfsReply *reply = new MStatfsReply(statfs->tid); - memset(&reply->stfs, 0, sizeof(reply->stfs)); - reply->stfs.f_bsize = 1024; - reply->stfs.f_frsize = 1024; - reply->stfs.f_blocks = 1024 * 1024; //pg_map.total_num_blocks; - reply->stfs.f_bfree = 1024 * 1024; - reply->stfs.f_bavail = 1024 * 1024; - reply->stfs.f_files = 1024 * 1024; - reply->stfs.f_ffree = 1024 * 1024; - reply->stfs.f_favail = 1024 * 1024; - reply->stfs.f_namemax = 1024; - reply->stfs.f_flag = ST_NOATIME|ST_NODIRATIME; // for now. - - // reply - mon->messenger->send_message(reply, statfs->get_source_inst()); - delete statfs; -} - -bool PGMonitor::handle_pg_stats(MPGStats *stats) -{ - dout(10) << "handle_pg_stats " << *stats << " from " << stats->get_source() << dendl; - - for (map::iterator p = stats->pg_stat.begin(); - p != stats->pg_stat.end(); - p++) { - pg_t pgid = p->first; - if ((pg_map.pg_stat.count(pgid) && - pg_map.pg_stat[pgid].reported > p->second.reported)) { - dout(15) << " had " << pgid << " from " << pg_map.pg_stat[pgid].reported << dendl; - continue; - } - if (pending_inc.pg_stat_updates.count(pgid) && - pending_inc.pg_stat_updates[pgid].reported > p->second.reported) { - dout(15) << " had " << pgid << " from " << pending_inc.pg_stat_updates[pgid].reported - << " (pending)" << dendl; - continue; - } - - dout(15) << " got " << pgid << " reported at " << p->second.reported - << " state " << PG::get_state_string(p->second.state) - << dendl; - pending_inc.pg_stat_updates[pgid] = p->second; - - // we don't care about consistency; apply to live map. 
- if (pg_map.pg_stat.count(pgid)) - pg_map.stat_sub(pg_map.pg_stat[pgid]); - pg_map.pg_stat[pgid] = p->second; - pg_map.stat_add(pg_map.pg_stat[pgid]); - } - - delete stats; - return true; -} diff --git a/branches/sage/crush/mon/Paxos.cc b/branches/sage/crush/mon/Paxos.cc deleted file mode 100644 index c1f4472059ff5..0000000000000 --- a/branches/sage/crush/mon/Paxos.cc +++ /dev/null @@ -1,784 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "Paxos.h" -#include "Monitor.h" -#include "MonitorStore.h" - -#include "messages/MMonPaxos.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_paxos) *_dout << dbeginl << g_clock.now() << " mon" << whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".paxos(" << machine_name << " " << get_statename(state) << " lc " << last_committed << ") " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_paxos) *_derr << dbeginl << g_clock.now() << " mon" << whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? 
(const char*)"(peon)":(const char*)"(?\?)"))) << ".paxos(" << machine_name << " " << get_statename(state) << " lc " << last_committed << ") " - - -void Paxos::init() -{ - // load paxos variables from stable storage - last_pn = mon->store->get_int(machine_name, "last_pn"); - accepted_pn = mon->store->get_int(machine_name, "accepted_pn"); - last_committed = mon->store->get_int(machine_name, "last_committed"); - - dout(10) << "init" << dendl; -} - -// --------------------------------- - -// PHASE 1 - -// leader -void Paxos::collect(version_t oldpn) -{ - // we're recoverying, it seems! - state = STATE_RECOVERING; - assert(mon->is_leader()); - - // reset the number of lasts received - uncommitted_v = 0; - uncommitted_pn = 0; - uncommitted_value.clear(); - - // look for uncommitted value - if (mon->store->exists_bl_sn(machine_name, last_committed+1)) { - uncommitted_v = last_committed+1; - uncommitted_pn = accepted_pn; - mon->store->get_bl_sn(uncommitted_value, machine_name, last_committed+1); - dout(10) << "learned uncommitted " << (last_committed+1) - << " (" << uncommitted_value.length() << " bytes) from myself" - << dendl; - } - - // pick new pn - accepted_pn = get_new_proposal_number(MAX(accepted_pn, oldpn)); - accepted_pn_from = last_committed; - num_last = 1; - dout(10) << "collect with pn " << accepted_pn << dendl; - - // send collect - for (set::const_iterator p = mon->get_quorum().begin(); - p != mon->get_quorum().end(); - ++p) { - if (*p == whoami) continue; - - MMonPaxos *collect = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COLLECT, machine_id); - collect->last_committed = last_committed; - collect->pn = accepted_pn; - mon->messenger->send_message(collect, mon->monmap->get_inst(*p)); - } - -} - - -// peon -void Paxos::handle_collect(MMonPaxos *collect) -{ - dout(10) << "handle_collect " << *collect << dendl; - - assert(mon->is_peon()); // mon epoch filter should catch strays - - // we're recoverying, it seems! 
- state = STATE_RECOVERING; - - // reply - MMonPaxos *last = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_LAST, machine_id); - last->last_committed = last_committed; - - // do we have an accepted but uncommitted value? - // (it'll be at last_committed+1) - bufferlist bl; - if (mon->store->exists_bl_sn(machine_name, last_committed+1)) { - mon->store->get_bl_sn(bl, machine_name, last_committed+1); - assert(bl.length() > 0); - dout(10) << " sharing our accepted but uncommitted value for " << last_committed+1 - << " (" << bl.length() << " bytes)" << dendl; - last->values[last_committed+1] = bl; - last->uncommitted_pn = accepted_pn; - } - - // can we accept this pn? - if (collect->pn > accepted_pn) { - // ok, accept it - accepted_pn = collect->pn; - accepted_pn_from = collect->pn_from; - dout(10) << "accepting pn " << accepted_pn << " from " << accepted_pn_from << dendl; - mon->store->put_int(accepted_pn, machine_name, "accepted_pn"); - } else { - // don't accept! - dout(10) << "NOT accepting pn " << collect->pn << " from " << collect->pn_from - << ", we already accepted " << accepted_pn << " from " << accepted_pn_from - << dendl; - } - last->pn = accepted_pn; - last->pn_from = accepted_pn_from; - - // and share whatever data we have - for (version_t v = collect->last_committed+1; - v <= last_committed; - v++) { - if (mon->store->exists_bl_sn(machine_name, v)) { - mon->store->get_bl_sn(last->values[v], machine_name, v); - dout(10) << " sharing " << v << " (" - << last->values[v].length() << " bytes)" << dendl; - } - } - - // send reply - mon->messenger->send_message(last, collect->get_source_inst()); - delete collect; -} - - -// leader -void Paxos::handle_last(MMonPaxos *last) -{ - dout(10) << "handle_last " << *last << dendl; - - if (!mon->is_leader()) { - dout(10) << "not leader, dropping" << dendl; - delete last; - return; - } - - // share committed values? 
- if (last->last_committed < last_committed) { - // share committed values - dout(10) << "sending commit to " << last->get_source() << dendl; - MMonPaxos *commit = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COMMIT, machine_id); - for (version_t v = last->last_committed+1; - v <= last_committed; - v++) { - mon->store->get_bl_sn(commit->values[v], machine_name, v); - dout(10) << " sharing " << v << " (" - << commit->values[v].length() << " bytes)" << dendl; - } - commit->last_committed = last_committed; - mon->messenger->send_message(commit, last->get_source_inst()); - } - - // did we receive a committed value? - if (last->last_committed > last_committed) { - for (version_t v = last_committed+1; - v <= last->last_committed; - v++) { - mon->store->put_bl_sn(last->values[v], machine_name, v); - dout(10) << "committing " << v << " " - << last->values[v].length() << " bytes" << dendl; - } - last_committed = last->last_committed; - mon->store->put_int(last_committed, machine_name, "last_committed"); - dout(10) << "last_committed now " << last_committed << dendl; - } - - // do they accept your pn? - if (last->pn > accepted_pn) { - // no, try again. - dout(10) << " they had a higher pn than us, picking a new one." << dendl; - collect(last->pn); - } else { - // yes, they accepted our pn. great. - num_last++; - dout(10) << " they accepted our pn, we now have " - << num_last << " peons" << dendl; - - // did this person send back an accepted but uncommitted value? - if (last->uncommitted_pn && - last->uncommitted_pn > uncommitted_pn) { - uncommitted_v = last->last_committed+1; - uncommitted_pn = last->uncommitted_pn; - uncommitted_value = last->values[uncommitted_v]; - dout(10) << "we learned an uncommitted value for " << uncommitted_v - << " pn " << uncommitted_pn - << " " << uncommitted_value.length() << " bytes" - << dendl; - } - - // is that everyone? - if (num_last == mon->get_quorum().size()) { - // almost... - state = STATE_ACTIVE; - - // did we learn an old value? 
- if (uncommitted_v == last_committed+1 && - uncommitted_value.length()) { - dout(10) << "that's everyone. begin on old learned value" << dendl; - begin(uncommitted_value); - } else { - // active! - dout(10) << "that's everyone. active!" << dendl; - extend_lease(); - - // wake people up - finish_contexts(waiting_for_active); - finish_contexts(waiting_for_readable); - finish_contexts(waiting_for_writeable); - } - } - } - - delete last; -} - - -// leader -void Paxos::begin(bufferlist& v) -{ - dout(10) << "begin for " << last_committed+1 << " " - << v.length() << " bytes" - << dendl; - - assert(mon->is_leader()); - assert(is_active()); - state = STATE_UPDATING; - - // we must already have a majority for this to work. - assert(mon->get_quorum().size() == 1 || - num_last > (unsigned)mon->monmap->num_mon/2); - - // and no value, yet. - assert(new_value.length() == 0); - - // accept it ourselves - accepted.clear(); - accepted.insert(whoami); - new_value = v; - mon->store->put_bl_sn(new_value, machine_name, last_committed+1); - - if (mon->get_quorum().size() == 1) { - // we're alone, take it easy - commit(); - state = STATE_ACTIVE; - finish_contexts(waiting_for_active); - finish_contexts(waiting_for_commit); - finish_contexts(waiting_for_readable); - finish_contexts(waiting_for_writeable); - return; - } - - // ask others to accept it to! 
- for (set::const_iterator p = mon->get_quorum().begin(); - p != mon->get_quorum().end(); - ++p) { - if (*p == whoami) continue; - - dout(10) << " sending begin to mon" << *p << dendl; - MMonPaxos *begin = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_BEGIN, machine_id); - begin->values[last_committed+1] = new_value; - begin->last_committed = last_committed; - begin->pn = accepted_pn; - - mon->messenger->send_message(begin, mon->monmap->get_inst(*p)); - } - - // set timeout event - accept_timeout_event = new C_AcceptTimeout(this); - mon->timer.add_event_after(g_conf.mon_accept_timeout, accept_timeout_event); -} - -// peon -void Paxos::handle_begin(MMonPaxos *begin) -{ - dout(10) << "handle_begin " << *begin << dendl; - - // can we accept this? - if (begin->pn < accepted_pn) { - dout(10) << " we accepted a higher pn " << accepted_pn << ", ignoring" << dendl; - delete begin; - return; - } - assert(begin->pn == accepted_pn); - assert(begin->last_committed == last_committed); - - // set state. - state = STATE_UPDATING; - lease_expire = utime_t(); // cancel lease - - // yes. 
- version_t v = last_committed+1; - dout(10) << "accepting value for " << v << " pn " << accepted_pn << dendl; - mon->store->put_bl_sn(begin->values[v], machine_name, v); - - // reply - MMonPaxos *accept = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_ACCEPT, machine_id); - accept->pn = accepted_pn; - accept->last_committed = last_committed; - mon->messenger->send_message(accept, begin->get_source_inst()); - - delete begin; -} - -// leader -void Paxos::handle_accept(MMonPaxos *accept) -{ - dout(10) << "handle_accept " << *accept << dendl; - int from = accept->get_source().num(); - - if (accept->pn != accepted_pn) { - // we accepted a higher pn, from some other leader - dout(10) << " we accepted a higher pn " << accepted_pn << ", ignoring" << dendl; - delete accept; - return; - } - if (last_committed > 0 && - accept->last_committed < last_committed-1) { - dout(10) << " this is from an old round, ignoring" << dendl; - delete accept; - return; - } - assert(accept->last_committed == last_committed || // not committed - accept->last_committed == last_committed-1); // committed - - assert(state == STATE_UPDATING); - assert(accepted.count(from) == 0); - accepted.insert(from); - dout(10) << " now " << accepted << " have accepted" << dendl; - - // new majority? - if (accepted.size() == (unsigned)mon->monmap->num_mon/2+1) { - // yay, commit! - // note: this may happen before the lease is reextended (below) - dout(10) << " got majority, committing" << dendl; - commit(); - } - - // done? - if (accepted == mon->get_quorum()) { - dout(10) << " got quorum, done with update" << dendl; - // cancel timeout event - mon->timer.cancel_event(accept_timeout_event); - accept_timeout_event = 0; - - // yay! 
- state = STATE_ACTIVE; - extend_lease(); - - // wake people up - finish_contexts(waiting_for_active); - finish_contexts(waiting_for_commit); - finish_contexts(waiting_for_readable); - finish_contexts(waiting_for_writeable); - } -} - -void Paxos::accept_timeout() -{ - dout(5) << "accept timeout, calling fresh election" << dendl; - accept_timeout_event = 0; - assert(mon->is_leader()); - assert(is_updating()); - cancel_events(); - mon->call_election(); -} - -void Paxos::commit() -{ - dout(10) << "commit " << last_committed+1 << dendl; - - // commit locally - last_committed++; - mon->store->put_int(last_committed, machine_name, "last_committed"); - - // tell everyone - for (set::const_iterator p = mon->get_quorum().begin(); - p != mon->get_quorum().end(); - ++p) { - if (*p == whoami) continue; - - dout(10) << " sending commit to mon" << *p << dendl; - MMonPaxos *commit = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COMMIT, machine_id); - commit->values[last_committed] = new_value; - commit->pn = accepted_pn; - - mon->messenger->send_message(commit, mon->monmap->get_inst(*p)); - } - - // get ready for a new round. - new_value.clear(); -} - - -void Paxos::handle_commit(MMonPaxos *commit) -{ - dout(10) << "handle_commit on " << commit->last_committed << dendl; - - if (!mon->is_peon()) { - dout(10) << "not a peon, dropping" << dendl; - assert(0); - delete commit; - return; - } - - // commit locally. 
- for (map::iterator p = commit->values.begin(); - p != commit->values.end(); - ++p) { - assert(p->first == last_committed+1); - last_committed = p->first; - dout(10) << " storing " << last_committed << " (" << p->second.length() << " bytes)" << dendl; - mon->store->put_bl_sn(p->second, machine_name, last_committed); - } - mon->store->put_int(last_committed, machine_name, "last_committed"); - - delete commit; -} - -void Paxos::extend_lease() -{ - assert(mon->is_leader()); - assert(is_active()); - - lease_expire = g_clock.now(); - lease_expire += g_conf.mon_lease; - acked_lease.clear(); - acked_lease.insert(whoami); - - dout(7) << "extend_lease now+" << g_conf.mon_lease << " (" << lease_expire << ")" << dendl; - - // bcast - for (set::const_iterator p = mon->get_quorum().begin(); - p != mon->get_quorum().end(); - ++p) { - if (*p == whoami) continue; - MMonPaxos *lease = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_LEASE, machine_id); - lease->last_committed = last_committed; - lease->lease_expire = lease_expire; - mon->messenger->send_message(lease, mon->monmap->get_inst(*p)); - } - - // set timeout event. - // if old timeout is still in place, leave it. 
- if (!lease_ack_timeout_event) { - lease_ack_timeout_event = new C_LeaseAckTimeout(this); - mon->timer.add_event_after(g_conf.mon_lease_ack_timeout, lease_ack_timeout_event); - } - - // set renew event - lease_renew_event = new C_LeaseRenew(this); - utime_t at = lease_expire; - at -= g_conf.mon_lease; - at += g_conf.mon_lease_renew_interval; - mon->timer.add_event_at(at, lease_renew_event); -} - - -// peon -void Paxos::handle_lease(MMonPaxos *lease) -{ - // sanity - if (!mon->is_peon() || - last_committed != lease->last_committed) { - dout(10) << "handle_lease i'm not a peon, or they're not the leader, or the last_committed doesn't match, dropping" << dendl; - delete lease; - return; - } - - // extend lease - if (lease_expire < lease->lease_expire) - lease_expire = lease->lease_expire; - - state = STATE_ACTIVE; - - dout(10) << "handle_lease on " << lease->last_committed - << " now " << lease_expire << dendl; - - // ack - MMonPaxos *ack = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_LEASE_ACK, machine_id); - ack->last_committed = last_committed; - ack->lease_expire = lease_expire; - mon->messenger->send_message(ack, lease->get_source_inst()); - - // (re)set timeout event. - if (lease_timeout_event) - mon->timer.cancel_event(lease_timeout_event); - lease_timeout_event = new C_LeaseTimeout(this); - mon->timer.add_event_after(g_conf.mon_lease_ack_timeout, lease_timeout_event); - - // kick waiters - finish_contexts(waiting_for_active); - if (is_readable()) - finish_contexts(waiting_for_readable); - - delete lease; -} - -void Paxos::handle_lease_ack(MMonPaxos *ack) -{ - int from = ack->get_source().num(); - - if (!lease_ack_timeout_event) { - dout(10) << "handle_lease_ack from " << ack->get_source() << " -- stray (probably since revoked)" << dendl; - } - else if (acked_lease.count(from) == 0) { - acked_lease.insert(from); - - if (acked_lease == mon->get_quorum()) { - // yay! 
- dout(10) << "handle_lease_ack from " << ack->get_source() - << " -- got everyone" << dendl; - mon->timer.cancel_event(lease_ack_timeout_event); - lease_ack_timeout_event = 0; - } else { - dout(10) << "handle_lease_ack from " << ack->get_source() - << " -- still need " - << mon->get_quorum().size() - acked_lease.size() - << " more" << dendl; - } - } else { - dout(10) << "handle_lease_ack from " << ack->get_source() - << " dup (lagging!), ignoring" << dendl; - } - - delete ack; -} - -void Paxos::lease_ack_timeout() -{ - dout(5) << "lease_ack_timeout -- calling new election" << dendl; - assert(mon->is_leader()); - assert(is_active()); - - lease_ack_timeout_event = 0; - cancel_events(); - mon->call_election(); -} - -void Paxos::lease_timeout() -{ - dout(5) << "lease_timeout -- calling new election" << dendl; - assert(mon->is_peon()); - - lease_timeout_event = 0; - cancel_events(); - mon->call_election(); -} - -void Paxos::lease_renew_timeout() -{ - lease_renew_event = 0; - extend_lease(); -} - - -/* - * return a globally unique, monotonically increasing proposal number - */ -version_t Paxos::get_new_proposal_number(version_t gt) -{ - if (last_pn < gt) - last_pn = gt; - - // update. make it unique among all monitors. 
- last_pn /= 100; - last_pn++; - last_pn *= 100; - last_pn += (version_t)whoami; - - // write - mon->store->put_int(last_pn, machine_name, "last_pn"); - - dout(10) << "get_new_proposal_number = " << last_pn << dendl; - return last_pn; -} - - -void Paxos::cancel_events() -{ - if (accept_timeout_event) { - mon->timer.cancel_event(accept_timeout_event); - accept_timeout_event = 0; - } - if (lease_renew_event) { - mon->timer.cancel_event(lease_renew_event); - lease_renew_event = 0; - } - if (lease_ack_timeout_event) { - mon->timer.cancel_event(lease_ack_timeout_event); - lease_ack_timeout_event = 0; - } - if (lease_timeout_event) { - mon->timer.cancel_event(lease_timeout_event); - lease_timeout_event = 0; - } -} - -void Paxos::leader_init() -{ - if (mon->get_quorum().size() == 1) { - state = STATE_ACTIVE; - return; - } - cancel_events(); - state = STATE_RECOVERING; - lease_expire = utime_t(); - dout(10) << "leader_init -- starting paxos recovery" << dendl; - collect(0); -} - -void Paxos::peon_init() -{ - cancel_events(); - state = STATE_RECOVERING; - lease_expire = utime_t(); - dout(10) << "peon_init -- i am a peon" << dendl; - - // no chance to write now! - finish_contexts(waiting_for_writeable, -1); - finish_contexts(waiting_for_commit, -1); -} - -void Paxos::election_starting() -{ - dout(10) << "election_starting -- canceling timeouts" << dendl; - cancel_events(); - new_value.clear(); - - finish_contexts(waiting_for_commit, -1); -} - - -void Paxos::dispatch(Message *m) -{ - // election in progress? 
- if (mon->is_starting()) { - dout(5) << "election in progress, dropping " << *m << dendl; - delete m; - return; - } - - // check sanity - assert(mon->is_leader() || - (mon->is_peon() && m->get_source().num() == mon->get_leader())); - - switch (m->get_type()) { - - case MSG_MON_PAXOS: - { - MMonPaxos *pm = (MMonPaxos*)m; - - // NOTE: these ops are defined in messages/MMonPaxos.h - switch (pm->op) { - // learner - case MMonPaxos::OP_COLLECT: - handle_collect(pm); - break; - case MMonPaxos::OP_LAST: - handle_last(pm); - break; - case MMonPaxos::OP_BEGIN: - handle_begin(pm); - break; - case MMonPaxos::OP_ACCEPT: - handle_accept(pm); - break; - case MMonPaxos::OP_COMMIT: - handle_commit(pm); - break; - case MMonPaxos::OP_LEASE: - handle_lease(pm); - break; - case MMonPaxos::OP_LEASE_ACK: - handle_lease_ack(pm); - break; - default: - assert(0); - } - } - break; - - default: - assert(0); - } -} - - - - -// ----------------- -// service interface - -// -- READ -- - -bool Paxos::is_readable() -{ - //dout(15) << "is_readable now=" << g_clock.now() << " lease_expire=" << lease_expire << dendl; - return - (mon->is_peon() || mon->is_leader()) && - is_active() && - last_committed > 0 && // must have a value - (mon->get_quorum().size() == 1 || // alone, or - g_clock.now() < lease_expire); // have lease -} - -bool Paxos::read(version_t v, bufferlist &bl) -{ - if (!is_readable()) - return false; - - if (!mon->store->get_bl_sn(bl, machine_name, v)) - return false; - return true; -} - -version_t Paxos::read_current(bufferlist &bl) -{ - if (!is_readable()) - return 0; - if (read(last_committed, bl)) - return last_committed; - return 0; -} - - - - -// -- WRITE -- - -bool Paxos::is_writeable() -{ - if (mon->get_quorum().size() == 1) return true; - return - mon->is_leader() && - is_active() && - g_clock.now() < lease_expire; -} - -bool Paxos::propose_new_value(bufferlist& bl, Context *oncommit) -{ - /* - // writeable? 
- if (!is_writeable()) { - dout(5) << "propose_new_value " << last_committed+1 << " " << bl.length() << " bytes" - << " -- not writeable" << dendl; - if (oncommit) { - oncommit->finish(-1); - delete oncommit; - } - return false; - } - */ - - assert(mon->is_leader() && is_active()); - - // cancel lease renewal and timeout events. - cancel_events(); - - // ok! - dout(5) << "propose_new_value " << last_committed+1 << " " << bl.length() << " bytes" << dendl; - if (oncommit) - waiting_for_commit.push_back(oncommit); - begin(bl); - - return true; -} - diff --git a/branches/sage/crush/mon/Paxos.h b/branches/sage/crush/mon/Paxos.h deleted file mode 100644 index a6d28dd1cea9a..0000000000000 --- a/branches/sage/crush/mon/Paxos.h +++ /dev/null @@ -1,251 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -/* -time----> - -cccccccccccccccccca???????????????????????????????????????? -cccccccccccccccccca???????????????????????????????????????? -cccccccccccccccccca???????????????????????????????????????? leader -cccccccccccccccccc????????????????????????????????????????? -ccccc?????????????????????????????????????????????????????? - -last_committed - -pn_from -pn - -a 12v -b 12v -c 14v -d -e 12v - - -*/ - - -/* - * NOTE: This libary is based on the Paxos algorithm, but varies in a few key ways: - * 1- Only a single new value is generated at a time, simplifying the recovery logic. - * 2- Nodes track "committed" values, and share them generously (and trustingly) - * 3- A 'leasing' mechism is built-in, allowing nodes to determine when it is safe to - * "read" their copy of the last committed value. 
- * - * This provides a simple replication substrate that services can be built on top of. - */ - -#ifndef __MON_PAXOS_H -#define __MON_PAXOS_H - -#include "include/types.h" -#include "mon_types.h" -#include "include/buffer.h" -#include "msg/Message.h" - -#include "include/Context.h" - -#include "common/Timer.h" - -class Monitor; -class MMonPaxos; - - -// i am one state machine. -class Paxos { - Monitor *mon; - int whoami; - - // my state machine info - int machine_id; - const char *machine_name; - - friend class PaxosService; - - // LEADER+PEON - - // -- generic state -- -public: - const static int STATE_RECOVERING = 1; // leader|peon: recovering paxos state - const static int STATE_ACTIVE = 2; // leader|peon: idle. peon may or may not have valid lease - const static int STATE_UPDATING = 3; // leader|peon: updating to new value - const char *get_statename(int s) { - switch (s) { - case STATE_RECOVERING: return "recovering"; - case STATE_ACTIVE: return "active"; - case STATE_UPDATING: return "updating"; - default: assert(0); return 0; - } - } - -private: - int state; - -public: - bool is_recovering() { return state == STATE_RECOVERING; } - bool is_active() { return state == STATE_ACTIVE; } - bool is_updating() { return state == STATE_UPDATING; } - -private: - // recovery (phase 1) - version_t last_pn; - version_t last_committed; - version_t accepted_pn; - version_t accepted_pn_from; - - // active (phase 2) - utime_t lease_expire; - list waiting_for_active; - list waiting_for_readable; - - - // -- leader -- - // recovery (paxos phase 1) - unsigned num_last; - version_t uncommitted_v; - version_t uncommitted_pn; - bufferlist uncommitted_value; - - // active - set acked_lease; - Context *lease_renew_event; - Context *lease_ack_timeout_event; - Context *lease_timeout_event; - - // updating (paxos phase 2) - bufferlist new_value; - set accepted; - - Context *accept_timeout_event; - - list waiting_for_writeable; - list waiting_for_commit; - - class C_AcceptTimeout : 
public Context { - Paxos *paxos; - public: - C_AcceptTimeout(Paxos *p) : paxos(p) {} - void finish(int r) { - paxos->accept_timeout(); - } - }; - - class C_LeaseAckTimeout : public Context { - Paxos *paxos; - public: - C_LeaseAckTimeout(Paxos *p) : paxos(p) {} - void finish(int r) { - paxos->lease_ack_timeout(); - } - }; - - class C_LeaseTimeout : public Context { - Paxos *paxos; - public: - C_LeaseTimeout(Paxos *p) : paxos(p) {} - void finish(int r) { - paxos->lease_timeout(); - } - }; - - class C_LeaseRenew : public Context { - Paxos *paxos; - public: - C_LeaseRenew(Paxos *p) : paxos(p) {} - void finish(int r) { - std::cout << "HI MOM" << std::endl; - paxos->lease_renew_timeout(); - } - }; - - - void collect(version_t oldpn); - void handle_collect(MMonPaxos*); - void handle_last(MMonPaxos*); - void begin(bufferlist& value); - void handle_begin(MMonPaxos*); - void handle_accept(MMonPaxos*); - void accept_timeout(); - void commit(); - void handle_commit(MMonPaxos*); - void extend_lease(); - void handle_lease(MMonPaxos*); - void handle_lease_ack(MMonPaxos*); - - void lease_ack_timeout(); // on leader, if lease isn't acked by all peons - void lease_renew_timeout(); // on leader, to renew the lease - void lease_timeout(); // on peon, if lease isn't extended - - void cancel_events(); - - version_t get_new_proposal_number(version_t gt=0); - -public: - Paxos(Monitor *m, int w, - int mid) : mon(m), whoami(w), - machine_id(mid), - machine_name(get_paxos_name(mid)), - state(STATE_RECOVERING), - lease_renew_event(0), - lease_ack_timeout_event(0), - lease_timeout_event(0), - accept_timeout_event(0) { } - - void dispatch(Message *m); - - void init(); - - void election_starting(); - void leader_init(); - void peon_init(); - - - // -- service interface -- - void wait_for_active(Context *c) { - assert(!is_active()); - waiting_for_active.push_back(c); - } - - // read - version_t get_version() { return last_committed; } - bool is_readable(); - bool read(version_t v, bufferlist 
&bl); - version_t read_current(bufferlist &bl); - void wait_for_readable(Context *onreadable) { - assert(!is_readable()); - waiting_for_readable.push_back(onreadable); - } - - // write - bool is_leader(); - bool is_writeable(); - void wait_for_writeable(Context *c) { - assert(!is_writeable()); - waiting_for_writeable.push_back(c); - } - - bool propose_new_value(bufferlist& bl, Context *oncommit=0); - void wait_for_commit(Context *oncommit) { - waiting_for_commit.push_back(oncommit); - } - void wait_for_commit_front(Context *oncommit) { - waiting_for_commit.push_front(oncommit); - } - -}; - - - -#endif - diff --git a/branches/sage/crush/mon/PaxosService.h b/branches/sage/crush/mon/PaxosService.h deleted file mode 100644 index a0f39c7862273..0000000000000 --- a/branches/sage/crush/mon/PaxosService.h +++ /dev/null @@ -1,107 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __PAXOSSERVICE_H -#define __PAXOSSERVICE_H - -#include "msg/Dispatcher.h" -#include "include/Context.h" - -class Monitor; -class Paxos; - -class PaxosService : public Dispatcher { -protected: - Monitor *mon; - Paxos *paxos; - - class C_RetryMessage : public Context { - PaxosService *svc; - Message *m; - public: - C_RetryMessage(PaxosService *s, Message *m_) : svc(s), m(m_) {} - void finish(int r) { - svc->dispatch(m); - } - }; - class C_Active : public Context { - PaxosService *svc; - public: - C_Active(PaxosService *s) : svc(s) {} - void finish(int r) { - if (r >= 0) - svc->_active(); - } - }; - class C_Commit : public Context { - PaxosService *svc; - public: - C_Commit(PaxosService *s) : svc(s) {} - void finish(int r) { - if (r >= 0) - svc->_commit(); - } - }; - friend class C_Update; - - class C_Propose : public Context { - PaxosService *ps; - public: - C_Propose(PaxosService *p) : ps(p) { } - void finish(int r) { - ps->proposal_timer = 0; - ps->propose_pending(); - } - }; - friend class C_Propose; - - -private: - Context *proposal_timer; - bool have_pending; - -public: - PaxosService(Monitor *mn, Paxos *p) : mon(mn), paxos(p), - proposal_timer(0), - have_pending(false) { } - - // i implement and you ignore - void dispatch(Message *m); - void election_finished(); - -private: - void _active(); - void _commit(); - -public: - // i implement and you use - void propose_pending(); // propose current pending as new paxos state - - // you implement - virtual bool update_from_paxos() = 0; // assimilate latest paxos state - virtual void create_pending() = 0; // [leader] create new pending structures - virtual void create_initial() = 0; // [leader] populate pending with initial state (1) - virtual void encode_pending(bufferlist& bl) = 0; // [leader] finish and encode pending for next paxos state - virtual void discard_pending() { } // [leader] discard pending - - virtual bool preprocess_query(Message *m) = 0; // true if processed (e.g., read-only) - 
virtual bool prepare_update(Message *m) = 0; - virtual bool should_propose(double &delay); - - virtual void committed() = 0; - -}; - -#endif - diff --git a/branches/sage/crush/msg/FakeMessenger.cc b/branches/sage/crush/msg/FakeMessenger.cc deleted file mode 100644 index ee80df3dc0626..0000000000000 --- a/branches/sage/crush/msg/FakeMessenger.cc +++ /dev/null @@ -1,416 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "Message.h" -#include "FakeMessenger.h" -#include "mds/MDS.h" - -#include "common/Timer.h" - -#include "common/LogType.h" -#include "common/Logger.h" - -#include "config.h" - -#define dout(x) if ((x) <= g_conf.debug_ms) *_dout << dbeginl << g_clock.now() << " " - - - -#include -#include -#include -#include -#include - -using namespace std; - -#include -using namespace __gnu_cxx; - - -#include "common/Cond.h" -#include "common/Mutex.h" -#include - - -// global queue. 
- -int nranks = 0; // this identify each entity_inst_t - -map directory; -hash_map loggers; -LogType fakemsg_logtype; - -set shutdown_set; - -Mutex lock; -Cond cond; - -bool awake = false; -bool fm_shutdown = false; -pthread_t thread_id; - -extern std::map g_fake_kill_after; // in config.cc -utime_t start_time; -map fail_queue; -list sent_to_failed_queue; - -void *fakemessenger_thread(void *ptr) -{ - start_time = g_clock.now(); - - lock.Lock(); - while (1) { - if (fm_shutdown) break; - fakemessenger_do_loop_2(); - - if (directory.empty() && nranks > 0) break; - - dout(20) << "thread waiting" << dendl; - if (fm_shutdown) break; - awake = false; - cond.Wait(lock); - awake = true; - dout(20) << "thread woke up" << dendl; - } - lock.Unlock(); - - dout(1) << "thread finish (i woke up but no messages, bye)" << dendl; - return 0; -} - - -void fakemessenger_startthread() { - pthread_create(&thread_id, NULL, fakemessenger_thread, 0); -} - -void fakemessenger_stopthread() { - dout(0) << "fakemessenger_stopthread setting stop flag" << dendl; - lock.Lock(); - fm_shutdown = true; - lock.Unlock(); - cond.Signal(); - - fakemessenger_wait(); -} - -void fakemessenger_wait() -{ - dout(0) << "fakemessenger_wait waiting" << dendl; - void *ptr; - pthread_join(thread_id, &ptr); -} - - -// fake failure - - - -// lame main looper - -int fakemessenger_do_loop() -{ - lock.Lock(); - fakemessenger_do_loop_2(); - lock.Unlock(); - - g_timer.shutdown(); - return 0; -} - - -int fakemessenger_do_loop_2() -{ - //lock.Lock(); - dout(18) << "do_loop begin." 
<< dendl; - - while (1) { - bool didone = false; - - dout(18) << "do_loop top" << dendl; - - // fail_queue - while (!fail_queue.empty() && - fail_queue.begin()->first < g_clock.now()) { - entity_name_t nm = fail_queue.begin()->second; - fail_queue.erase(fail_queue.begin()); - - dout(0) << "MUST FAKE KILL " << nm << dendl; - - for (map::iterator p = directory.begin(); - p != directory.end(); - ++p) { - if (p->second->get_myname() == nm) { - dout(0) << "FAKING FAILURE of " << nm << " at " << p->first << dendl; - directory.erase(p); - p->second->failed = true; - break; - } - } - } - - list ls; - ls.swap(sent_to_failed_queue); - for (list::iterator p = ls.begin(); - p != ls.end(); - ++p) { - Message *m = *p; - FakeMessenger *mgr = 0; - Dispatcher *dis = 0; - if (directory.count(m->get_source_addr())) { - mgr = directory[m->get_source_addr()]; - if (mgr) - dis = mgr->get_dispatcher(); - } - if (dis) { - dout(1) << "fail on " << *m - << " to " << m->get_dest() << " from " << m->get_source() - << ", passing back to sender." << dendl; - dis->ms_handle_failure(m, m->get_dest_inst()); - } else { - dout(1) << "fail on " << *m - << " to " << m->get_dest() << " from " << m->get_source() - << ", sender gone, dropping." 
<< dendl; - delete m; - } - } - - // messages - map::iterator it = directory.begin(); - while (it != directory.end()) { - FakeMessenger *mgr = it->second; - - dout(18) << "messenger " << mgr << " at " << mgr->get_myname() << " has " << mgr->num_incoming() << " queued" << dendl; - - if (!mgr->is_ready()) { - dout(18) << "messenger " << mgr << " at " << mgr->get_myname() << " has no dispatcher, skipping" << dendl; - it++; - continue; - } - - Message *m = mgr->get_message(); - it++; - - if (m) { - m->set_recv_stamp(g_clock.now()); - - //dout(18) << "got " << m << dendl; - dout(1) << "==== " << m->get_dest() - << " <- " << m->get_source() - << " ==== " << *m - << " ---- " << m - << dendl; - - if (g_conf.fakemessenger_serialize) { - // encode - if (m->empty_payload()) - m->encode_payload(); - ceph_message_header env = m->get_envelope(); - bufferlist bl; - bl.claim( m->get_payload() ); - //bl.c_str(); // condense into 1 buffer - - delete m; - - // decode - m = decode_message(env, bl); - assert(m); - } - - didone = true; - - lock.Unlock(); - mgr->dispatch(m); - lock.Lock(); - } - } - - // deal with shutdowns.. delayed to avoid concurrent directory modification - if (!shutdown_set.empty()) { - for (set::iterator it = shutdown_set.begin(); - it != shutdown_set.end(); - it++) { - dout(7) << "fakemessenger: removing " << *it << " from directory" << dendl; - assert(directory.count(*it)); - directory.erase(*it); - if (directory.empty()) { - dout(1) << "fakemessenger: last shutdown" << dendl; - ::fm_shutdown = true; - } - } - shutdown_set.clear(); - } - - if (!didone) - break; - } - - - dout(18) << "do_loop end (no more messages)." 
<< dendl; - //lock.Unlock(); - return 0; -} - - -FakeMessenger::FakeMessenger(entity_name_t me) : Messenger(me) -{ - failed = false; - - lock.Lock(); - { - // assign rank - _myinst.name = me; - _myinst.addr.v.port = nranks++; - //if (!me.is_mon()) - _myinst.addr.v.nonce = getpid(); - - // add to directory - directory[ _myinst.addr ] = this; - - // put myself in the fail queue? - if (g_fake_kill_after.count(me)) { - utime_t w = start_time; - w += g_fake_kill_after[me]; - dout(0) << "will fake failure of " << me << " at " << w << dendl; - fail_queue[w] = me; - } - } - lock.Unlock(); - - - dout(0) << "fakemessenger " << get_myname() << " messenger is " << this << " at " << _myinst << dendl; - - qlen = 0; - - /* - string name; - name = "m."; - name += MSG_ADDR_TYPE(myaddr); - int w = MSG_ADDR_NUM(myaddr); - if (w >= 1000) name += ('0' + ((w/1000)%10)); - if (w >= 100) name += ('0' + ((w/100)%10)); - if (w >= 10) name += ('0' + ((w/10)%10)); - name += ('0' + ((w/1)%10)); - - loggers[ myaddr ] = new Logger(name, (LogType*)&fakemsg_logtype); - */ -} - -FakeMessenger::~FakeMessenger() -{ - // hose any undelivered messages - for (list::iterator p = incoming.begin(); - p != incoming.end(); - ++p) - delete *p; -} - - -int FakeMessenger::shutdown() -{ - dout(2) << "shutdown on messenger " << this << " has " << num_incoming() << " queued" << dendl; - lock.Lock(); - assert(directory.count(_myinst.addr) == 1); - shutdown_set.insert(_myinst.addr); - - /* - if (loggers[myaddr]) { - delete loggers[myaddr]; - loggers.erase(myaddr); - } - */ - - lock.Unlock(); - return 0; -} - - -void FakeMessenger::reset_myname(entity_name_t m) -{ - dout(1) << "reset_myname from " << get_myname() << " to " << m << dendl; - _set_myname(m); - - directory.erase(_myinst.addr); - _myinst.name = m; - directory[_myinst.addr] = this; - - // put myself in the fail queue? 
- if (g_fake_kill_after.count(m)) { - utime_t w = start_time; - w += g_fake_kill_after[m]; - dout(0) << "will fake failure of " << m << " at " << w << dendl; - fail_queue[w] = m; - } - -} - - -int FakeMessenger::send_message(Message *m, entity_inst_t inst, int port, int fromport) -{ - entity_name_t dest = inst.name; - - m->set_source(get_myname(), fromport); - m->set_source_addr(get_myaddr()); - - m->set_dest_inst(inst); - m->set_dest_port(port); - - lock.Lock(); - -#ifdef LOG_MESSAGES - // stats - loggers[get_myaddr()]->inc("+send",1); - loggers[dest]->inc("-recv",1); - - char s[20]; - sprintf(s,"+%s", m->get_type_name()); - loggers[get_myaddr()]->inc(s); - sprintf(s,"-%s", m->get_type_name()); - loggers[dest]->inc(s); -#endif - - // queue - if (directory.count(inst.addr) && - shutdown_set.count(inst.addr) == 0) { - dout(1) << "--> " << get_myname() << " -> " << inst.name << " --- " << *m << " -- " << m - << dendl; - directory[inst.addr]->queue_incoming(m); - } else { - dout(0) << "--> " << get_myname() << " -> " << inst.name << " " << *m << " -- " << m - << " *** destination " << inst.addr << " DNE ***" - << dendl; - for (map::iterator p = directory.begin(); - p != directory.end(); - ++p) { - dout(20) << "** have " << p->first << " to " << p->second << dendl; - } - - // do the failure callback - sent_to_failed_queue.push_back(m); - } - - // wake up loop? 
- if (!awake) { - dout(10) << "waking up fakemessenger thread" << dendl; - cond.Signal(); - lock.Unlock(); - } else - lock.Unlock(); - - return 0; -} - - diff --git a/branches/sage/crush/msg/FakeMessenger.h b/branches/sage/crush/msg/FakeMessenger.h deleted file mode 100644 index 2284ea110b51f..0000000000000 --- a/branches/sage/crush/msg/FakeMessenger.h +++ /dev/null @@ -1,98 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __FAKEMESSENGER_H -#define __FAKEMESSENGER_H - -#include "Messenger.h" -#include "Dispatcher.h" - -#include -#include - -class Timer; - -class FakeMessenger : public Messenger { - protected: - class Logger *logger; - - int qlen; - list incoming; // incoming queue - - entity_inst_t _myinst; - - public: - bool failed; - - FakeMessenger(entity_name_t me); - ~FakeMessenger(); - - virtual int shutdown(); - - const entity_inst_t& get_myinst() { - return _myinst; - }; - const entity_addr_t& get_myaddr() { - return _myinst.addr; - } - - void reset_myname(entity_name_t m); - - // msg interface - virtual int send_message(Message *m, entity_inst_t dest, int port=0, int fromport=0); - - // events - //virtual void trigger_timer(Timer *t); - - int get_dispatch_queue_len() { return qlen; } - - // -- incoming queue -- - // (that nothing uses) - Message *get_message() { - if (!incoming.empty()) { - Message *m = incoming.front(); - incoming.pop_front(); - qlen--; - return m; - } - return NULL; - } - bool queue_incoming(Message *m) { - incoming.push_back(m); - qlen++; - return true; - } - int num_incoming() { - //return incoming.size(); - return qlen; - } - - void suicide() { - if 
(!failed) { - failed = true; - } - shutdown(); - } - -}; - -int fakemessenger_do_loop(); -int fakemessenger_do_loop_2(); -void fakemessenger_startthread(); -void fakemessenger_stopthread(); -void fakemessenger_wait(); - -#endif diff --git a/branches/sage/crush/msg/Message.cc b/branches/sage/crush/msg/Message.cc deleted file mode 100644 index e3c7ce827ac61..0000000000000 --- a/branches/sage/crush/msg/Message.cc +++ /dev/null @@ -1,372 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include -#include -using namespace std; - -#include "include/types.h" - -#include "Message.h" - -#include "messages/MGenericMessage.h" - -#include "messages/MPGStats.h" - -#include "messages/MStatfs.h" -#include "messages/MStatfsReply.h" - -#include "messages/MMonCommand.h" -#include "messages/MMonCommandAck.h" -#include "messages/MMonPaxos.h" - -#include "messages/MMonElection.h" - -#include "messages/MPing.h" -#include "messages/MPingAck.h" -//#include "messages/MFailure.h" -//#include "messages/MFailureAck.h" - -#include "messages/MOSDBoot.h" -#include "messages/MOSDIn.h" -#include "messages/MOSDOut.h" -#include "messages/MOSDFailure.h" -#include "messages/MOSDPing.h" -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" - -#include "messages/MOSDPGNotify.h" -#include "messages/MOSDPGQuery.h" -#include "messages/MOSDPGLog.h" -#include "messages/MOSDPGRemove.h" -#include "messages/MOSDPGActivateSet.h" - -#include "messages/MClientMount.h" -#include "messages/MClientUnmount.h" -#include "messages/MClientSession.h" -#include "messages/MClientReconnect.h" -#include "messages/MClientRequest.h" -#include "messages/MClientRequestForward.h" -#include "messages/MClientReply.h" -#include "messages/MClientFileCaps.h" - -#include "messages/MMDSSlaveRequest.h" - -#include "messages/MMDSGetMap.h" -#include "messages/MMDSMap.h" -#include "messages/MMDSBeacon.h" 
-#include "messages/MMDSResolve.h" -#include "messages/MMDSResolveAck.h" -#include "messages/MMDSCacheRejoin.h" -//#include "messages/MMDSCacheRejoinAck.h" - -#include "messages/MDirUpdate.h" -#include "messages/MDiscover.h" -#include "messages/MDiscoverReply.h" - -#include "messages/MMDSFragmentNotify.h" - -#include "messages/MExportDirDiscover.h" -#include "messages/MExportDirDiscoverAck.h" -#include "messages/MExportDirCancel.h" -#include "messages/MExportDirPrep.h" -#include "messages/MExportDirPrepAck.h" -#include "messages/MExportDirWarning.h" -#include "messages/MExportDirWarningAck.h" -#include "messages/MExportDir.h" -#include "messages/MExportDirAck.h" -#include "messages/MExportDirNotify.h" -#include "messages/MExportDirNotifyAck.h" -#include "messages/MExportDirFinish.h" - -#include "messages/MDentryUnlink.h" - -#include "messages/MHeartbeat.h" - -#include "messages/MAnchor.h" - -//#include "messages/MInodeUpdate.h" -#include "messages/MCacheExpire.h" -#include "messages/MInodeFileCaps.h" - -#include "messages/MLock.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug) *_dout << dbeginl << g_clock.now() << " MESSENGER: " -#define DEBUGLVL 10 // debug level of output - - - - - - - -Message * -decode_message(ceph_message_header& env, bufferlist& payload) -{ - // make message - Message *m = 0; - switch(env.type) { - - // -- with payload -- - - case MSG_PGSTATS: - m = new MPGStats; - break; - - case MSG_STATFS: - m = new MStatfs; - break; - case MSG_STATFS_REPLY: - m = new MStatfsReply; - break; - - case MSG_MON_COMMAND: - m = new MMonCommand; - break; - case MSG_MON_COMMAND_ACK: - m = new MMonCommandAck; - break; - case MSG_MON_PAXOS: - m = new MMonPaxos; - break; - - case MSG_MON_ELECTION: - m = new MMonElection; - break; - - case MSG_PING: - m = new MPing(); - break; - case MSG_PING_ACK: - m = new MPingAck(); - break; - /* - case MSG_FAILURE: - m = new MFailure(); - break; - case MSG_FAILURE_ACK: - m = new MFailureAck(); - break; - */ - - 
case MSG_OSD_BOOT: - m = new MOSDBoot(); - break; - case MSG_OSD_IN: - m = new MOSDIn(); - break; - case MSG_OSD_OUT: - m = new MOSDOut(); - break; - case MSG_OSD_FAILURE: - m = new MOSDFailure(); - break; - case MSG_OSD_PING: - m = new MOSDPing(); - break; - case MSG_OSD_OP: - m = new MOSDOp(); - break; - case MSG_OSD_OPREPLY: - m = new MOSDOpReply(); - break; - - case MSG_OSD_MAP: - m = new MOSDMap(); - break; - case MSG_OSD_GETMAP: - m = new MOSDGetMap(); - break; - - case MSG_OSD_PG_NOTIFY: - m = new MOSDPGNotify(); - break; - case MSG_OSD_PG_QUERY: - m = new MOSDPGQuery(); - break; - case MSG_OSD_PG_LOG: - m = new MOSDPGLog(); - break; - case MSG_OSD_PG_REMOVE: - m = new MOSDPGRemove(); - break; - case MSG_OSD_PG_ACTIVATE_SET: - m = new MOSDPGActivateSet(); - break; - - // clients - case MSG_CLIENT_MOUNT: - m = new MClientMount; - break; - case MSG_CLIENT_UNMOUNT: - m = new MClientUnmount; - break; - case MSG_CLIENT_SESSION: - m = new MClientSession; - break; - case MSG_CLIENT_RECONNECT: - m = new MClientReconnect; - break; - case MSG_CLIENT_REQUEST: - m = new MClientRequest; - break; - case MSG_CLIENT_REQUEST_FORWARD: - m = new MClientRequestForward; - break; - case MSG_CLIENT_REPLY: - m = new MClientReply; - break; - case MSG_CLIENT_FILECAPS: - m = new MClientFileCaps; - break; - - // mds - case MSG_MDS_SLAVE_REQUEST: - m = new MMDSSlaveRequest; - break; - - case MSG_MDS_GETMAP: - m = new MMDSGetMap(); - break; - case MSG_MDS_MAP: - m = new MMDSMap(); - break; - case MSG_MDS_BEACON: - m = new MMDSBeacon; - break; - case MSG_MDS_RESOLVE: - m = new MMDSResolve; - break; - case MSG_MDS_RESOLVEACK: - m = new MMDSResolveAck; - break; - case MSG_MDS_CACHEREJOIN: - m = new MMDSCacheRejoin; - break; - /* - case MSG_MDS_CACHEREJOINACK: - m = new MMDSCacheRejoinAck; - break; - */ - - case MSG_MDS_DIRUPDATE: - m = new MDirUpdate(); - break; - - case MSG_MDS_DISCOVER: - m = new MDiscover(); - break; - case MSG_MDS_DISCOVERREPLY: - m = new MDiscoverReply(); - break; - - 
case MSG_MDS_FRAGMENTNOTIFY: - m = new MMDSFragmentNotify; - break; - - case MSG_MDS_EXPORTDIRDISCOVER: - m = new MExportDirDiscover(); - break; - case MSG_MDS_EXPORTDIRDISCOVERACK: - m = new MExportDirDiscoverAck(); - break; - case MSG_MDS_EXPORTDIRCANCEL: - m = new MExportDirCancel(); - break; - - case MSG_MDS_EXPORTDIR: - m = new MExportDir; - break; - case MSG_MDS_EXPORTDIRACK: - m = new MExportDirAck; - break; - case MSG_MDS_EXPORTDIRFINISH: - m = new MExportDirFinish; - break; - - case MSG_MDS_EXPORTDIRNOTIFY: - m = new MExportDirNotify(); - break; - - case MSG_MDS_EXPORTDIRNOTIFYACK: - m = new MExportDirNotifyAck(); - break; - - case MSG_MDS_EXPORTDIRPREP: - m = new MExportDirPrep(); - break; - - case MSG_MDS_EXPORTDIRPREPACK: - m = new MExportDirPrepAck(); - break; - - case MSG_MDS_EXPORTDIRWARNING: - m = new MExportDirWarning; - break; - case MSG_MDS_EXPORTDIRWARNINGACK: - m = new MExportDirWarningAck; - break; - - - - case MSG_MDS_DENTRYUNLINK: - m = new MDentryUnlink(); - break; - - case MSG_MDS_HEARTBEAT: - m = new MHeartbeat(); - break; - - case MSG_MDS_CACHEEXPIRE: - m = new MCacheExpire(); - break; - - case MSG_MDS_ANCHOR: - m = new MAnchor(); - break; - - /* case MSG_MDS_INODEUPDATE: - m = new MInodeUpdate(); - break; - */ - - case MSG_MDS_INODEFILECAPS: - m = new MInodeFileCaps(); - break; - - case MSG_MDS_LOCK: - m = new MLock(); - break; - - - // -- simple messages without payload -- - - case MSG_CLOSE: - case MSG_SHUTDOWN: - case MSG_MDS_SHUTDOWNSTART: - case MSG_MDS_SHUTDOWNFINISH: - case MSG_OSD_MKFS_ACK: - m = new MGenericMessage(env.type); - break; - - default: - dout(1) << "can't decode unknown message type " << env.type << dendl; - assert(0); - } - - // env - m->set_envelope(env); - - // decode - m->set_payload(payload); - m->decode_payload(); - - // done! 
- return m; -} - - diff --git a/branches/sage/crush/msg/Message.h b/branches/sage/crush/msg/Message.h deleted file mode 100644 index 9f0175e7a7d1e..0000000000000 --- a/branches/sage/crush/msg/Message.h +++ /dev/null @@ -1,259 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MESSAGE_H -#define __MESSAGE_H - -#define MSG_CLOSE 0 - -#define MSG_STATFS 1 -#define MSG_STATFS_REPLY 2 -#define MSG_PGSTATS 3 - -#define MSG_PING 10 -#define MSG_PING_ACK 11 - -#define MSG_SHUTDOWN 99999 - -#define MSG_MON_COMMAND 13 -#define MSG_MON_COMMAND_ACK 14 - - -#define MSG_MON_ELECTION 15 - -#define MSG_MON_OSDMAP_INFO 20 -#define MSG_MON_OSDMAP_LEASE 21 -#define MSG_MON_OSDMAP_LEASE_ACK 22 -#define MSG_MON_OSDMAP_UPDATE_PREPARE 23 -#define MSG_MON_OSDMAP_UPDATE_ACK 24 -#define MSG_MON_OSDMAP_UPDATE_COMMIT 25 - -#define MSG_MON_PAXOS 30 - -#define MSG_OSD_OP 40 // delete, etc. -#define MSG_OSD_OPREPLY 41 // delete, etc. 
-#define MSG_OSD_PING 42 - -#define MSG_OSD_GETMAP 43 -#define MSG_OSD_MAP 44 - -#define MSG_OSD_BOOT 45 -#define MSG_OSD_MKFS_ACK 46 - -#define MSG_OSD_FAILURE 47 - -#define MSG_OSD_IN 48 -#define MSG_OSD_OUT 49 - - - -#define MSG_OSD_PG_NOTIFY 50 -#define MSG_OSD_PG_QUERY 51 -#define MSG_OSD_PG_SUMMARY 52 -#define MSG_OSD_PG_LOG 53 -#define MSG_OSD_PG_REMOVE 54 -#define MSG_OSD_PG_ACTIVATE_SET 55 - -// -- client -- -// to monitor -#define MSG_CLIENT_MOUNT 60 -#define MSG_CLIENT_UNMOUNT 61 - -// to mds -#define MSG_CLIENT_SESSION 70 // start or stop -#define MSG_CLIENT_RECONNECT 71 - -#define MSG_CLIENT_REQUEST 80 -#define MSG_CLIENT_REQUEST_FORWARD 81 -#define MSG_CLIENT_REPLY 82 -#define MSG_CLIENT_FILECAPS 83 - - - -// *** MDS *** - -#define MSG_MDS_GETMAP 102 -#define MSG_MDS_MAP 103 -#define MSG_MDS_HEARTBEAT 104 // for mds load balancer -#define MSG_MDS_BEACON 105 // to monitor - -#define MSG_MDS_RESOLVE 106 -#define MSG_MDS_RESOLVEACK 107 - -#define MSG_MDS_CACHEREJOIN 108 - -#define MSG_MDS_DISCOVER 110 -#define MSG_MDS_DISCOVERREPLY 111 - -#define MSG_MDS_INODEGETREPLICA 112 -#define MSG_MDS_INODEGETREPLICAACK 113 - -#define MSG_MDS_INODEFILECAPS 115 - -#define MSG_MDS_INODEUPDATE 120 -#define MSG_MDS_DIRUPDATE 121 -#define MSG_MDS_INODEEXPIRE 122 -#define MSG_MDS_DIREXPIRE 123 - -#define MSG_MDS_DIREXPIREREQ 124 - -#define MSG_MDS_CACHEEXPIRE 125 - -#define MSG_MDS_ANCHOR 130 - -#define MSG_MDS_FRAGMENTNOTIFY 140 - -#define MSG_MDS_EXPORTDIRDISCOVER 149 -#define MSG_MDS_EXPORTDIRDISCOVERACK 150 -#define MSG_MDS_EXPORTDIRCANCEL 151 -#define MSG_MDS_EXPORTDIRPREP 152 -#define MSG_MDS_EXPORTDIRPREPACK 153 -#define MSG_MDS_EXPORTDIRWARNING 154 -#define MSG_MDS_EXPORTDIRWARNINGACK 155 -#define MSG_MDS_EXPORTDIR 156 -#define MSG_MDS_EXPORTDIRACK 157 -#define MSG_MDS_EXPORTDIRNOTIFY 158 -#define MSG_MDS_EXPORTDIRNOTIFYACK 159 -#define MSG_MDS_EXPORTDIRFINISH 160 - -#define MSG_MDS_SLAVE_REQUEST 170 - -#define MSG_MDS_DENTRYUNLINK 200 - -#define MSG_MDS_LOCK 500 
- -#define MSG_MDS_SHUTDOWNSTART 900 -#define MSG_MDS_SHUTDOWNFINISH 901 - - -#include -#include - -#include -#include -using std::list; - -#include - - -#include "include/types.h" -#include "include/buffer.h" -#include "msg_types.h" - - - - -// ====================================================== - -// abstract Message class - - -class Message { - private: - - protected: - ceph_message_header env; // envelope - bufferlist payload; // payload - list chunk_payload_at; - - utime_t recv_stamp; - - friend class Messenger; -public: - - public: - Message() { - env.source_port = env.dest_port = 0; - env.nchunks = 0; - }; - Message(int t) { - env.source_port = env.dest_port = 0; - env.nchunks = 0; - env.type = t; - } - virtual ~Message() { - } - - - void clear_payload() { payload.clear(); } - bool empty_payload() { return payload.length() == 0; } - bufferlist& get_payload() { - return payload; - } - void set_payload(bufferlist& bl) { - payload.claim(bl); - } - void copy_payload(const bufferlist& bl) { - payload = bl; - } - const list& get_chunk_payload_at() const { return chunk_payload_at; } - void set_chunk_payload_at(list& o) { chunk_payload_at.swap(o); } - ceph_message_header& get_envelope() { - return env; - } - void set_envelope(ceph_message_header& env) { - this->env = env; - } - - - void set_recv_stamp(utime_t t) { recv_stamp = t; } - utime_t get_recv_stamp() { return recv_stamp; } - - // ENVELOPE ---- - - // type - int get_type() { return env.type; } - void set_type(int t) { env.type = t; } - virtual char *get_type_name() = 0; - - // source/dest - entity_inst_t& get_dest_inst() { return *(entity_inst_t*)&env.dst; } - void set_dest_inst(entity_inst_t& inst) { env.dst = *(ceph_entity_inst*)&inst; } - - entity_inst_t& get_source_inst() { return *(entity_inst_t*)&env.src; } - void set_source_inst(entity_inst_t& inst) { env.src = *(ceph_entity_inst*)&inst; } - - entity_name_t& get_dest() { return *(entity_name_t*)&env.dst.name; } - void set_dest(entity_name_t a, int 
p) { env.dst.name = *(ceph_entity_name*)&a; env.dest_port = p; } - int get_dest_port() { return env.dest_port; } - void set_dest_port(int p) { env.dest_port = p; } - - entity_name_t& get_source() { return *(entity_name_t*)&env.src.name; } - void set_source(entity_name_t a, int p) { env.src.name = *(ceph_entity_name*)&a; env.source_port = p; } - int get_source_port() { return env.source_port; } - - entity_addr_t& get_source_addr() { return *(entity_addr_t*)&env.src.addr; } - void set_source_addr(const entity_addr_t &i) { env.src.addr = *(ceph_entity_addr*)&i; } - - // PAYLOAD ---- - void reset_payload() { - payload.clear(); - } - - virtual void decode_payload() = 0; - virtual void encode_payload() = 0; - - virtual void print(ostream& out) { - out << get_type_name(); - } - -}; - -extern Message *decode_message(ceph_message_header &env, bufferlist& bl); -inline ostream& operator<<(ostream& out, Message& m) { - m.print(out); - return out; -} - -#endif diff --git a/branches/sage/crush/msg/Messenger.h b/branches/sage/crush/msg/Messenger.h deleted file mode 100644 index d29441a744ca0..0000000000000 --- a/branches/sage/crush/msg/Messenger.h +++ /dev/null @@ -1,99 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __MESSENGER_H -#define __MESSENGER_H - -#include -using namespace std; - -#include "Message.h" -#include "Dispatcher.h" -#include "common/Mutex.h" -#include "common/Cond.h" -#include "include/Context.h" - - - -class MDS; -class Timer; - -class Messenger { - private: - Dispatcher *dispatcher; - entity_name_t _myname; - - public: - Messenger(entity_name_t w) : dispatcher(0), _myname(w) { } - virtual ~Messenger() { } - - // accessors - entity_name_t get_myname() { return _myname; } - void _set_myname(entity_name_t m) { _myname = m; } - - virtual void reset_myname(entity_name_t m) = 0; - - virtual const entity_addr_t &get_myaddr() = 0; - - entity_inst_t get_myinst() { return entity_inst_t(_myname, get_myaddr()); } - - // hrmpf. - virtual int get_dispatch_queue_len() { return 0; }; - - // setup - void set_dispatcher(Dispatcher *d) { - if (!dispatcher) { - dispatcher = d; - ready(); - } - } - Dispatcher *get_dispatcher() { return dispatcher; } - virtual void ready() { } - bool is_ready() { return dispatcher != 0; } - - // dispatch incoming messages - virtual void dispatch(Message *m) { - assert(dispatcher); - dispatcher->dispatch(m); - } - - // shutdown - virtual int shutdown() = 0; - virtual void suicide() = 0; - - // send message - virtual void prepare_dest(const entity_addr_t& addr) {} - virtual int send_message(Message *m, entity_inst_t dest, - int port=0, int fromport=0) = 0; - virtual int send_first_message(Dispatcher *d, - Message *m, entity_inst_t dest, - int port=0, int fromport=0) { - set_dispatcher(d); - return send_message(m, dest, port, fromport); - } - - // make a procedure call - //virtual Message* sendrecv(Message *m, msg_name_t dest, int port=0); - - virtual void mark_down(entity_addr_t a) {} - -}; - - - - - -#endif diff --git a/branches/sage/crush/msg/SimpleMessenger.cc b/branches/sage/crush/msg/SimpleMessenger.cc deleted file mode 100644 index 7e29f033d83b5..0000000000000 --- a/branches/sage/crush/msg/SimpleMessenger.cc +++ 
/dev/null @@ -1,1445 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "SimpleMessenger.h" - -#include -#include -#include -#include -#include -#include -#include - -#include "config.h" - -#include "messages/MGenericMessage.h" - -#include - -#include -#include - -#define dout(l) if (l<=g_conf.debug_ms) *_dout << dbeginl << g_clock.now() << " " << pthread_self() << " -- " << rank.my_addr << " " -#define derr(l) if (l<=g_conf.debug_ms) *_derr << dbeginl << g_clock.now() << " " << pthread_self() << " -- " << rank.my_addr << " " - - - -#include "tcp.cc" - - -Rank rank; - - -sighandler_t old_sigint_handler = 0; - - -/******************************************** - * Accepter - */ - -void simplemessenger_sigint(int r) -{ - rank.sigint(); - if (old_sigint_handler) - old_sigint_handler(r); -} - -void Rank::sigint() -{ - lock.Lock(); - derr(0) << "got control-c, exiting" << dendl; - - // force close listener socket - ::close(accepter.listen_sd); - - // force close all pipe sockets, too - for (hash_map::iterator p = rank_pipe.begin(); - p != rank_pipe.end(); - ++p) - p->second->force_close(); - - lock.Unlock(); -} - - - -void noop_signal_handler(int s) -{ - //dout(0) << "blah_handler got " << s << dendl; -} - -int Rank::Accepter::start() -{ - // bind to a socket - dout(10) << "accepter.start" << dendl; - - char hostname[100]; - memset(hostname, 0, 100); - gethostname(hostname, 100); - dout(2) << "accepter.start my hostname is " << hostname << dendl; - - // is there a .ceph_hosts file? 
- { - ifstream fh; - fh.open(".ceph_hosts"); - if (fh.is_open()) { - while (1) { - string line; - getline(fh, line); - if (fh.eof()) break; - if (line[0] == '#' || line[0] == ';') continue; - int ospace = line.find(" "); - if (!ospace) continue; - string host = line.substr(0, ospace); - string addr = line.substr(ospace+1); - dout(15) << ".ceph_hosts: host '" << host << "' -> '" << addr << "'" << dendl; - if (host == hostname) { - parse_ip_port(addr.c_str(), g_my_addr); - dout(1) << ".ceph_hosts: my addr is " << g_my_addr << dendl; - break; - } - } - fh.close(); - } - } - - // use whatever user specified (if anything) - tcpaddr_t listen_addr; - g_my_addr.make_addr(listen_addr); - - /* socket creation */ - listen_sd = socket(AF_INET,SOCK_STREAM,0); - assert(listen_sd > 0); - - /* bind to port */ - int rc = bind(listen_sd, (struct sockaddr *) &listen_addr, sizeof(listen_addr)); - if (rc < 0) - derr(0) << "accepter.start unable to bind to " << listen_addr << dendl; - assert(rc >= 0); - - // what port did we get? - socklen_t llen = sizeof(listen_addr); - getsockname(listen_sd, (sockaddr*)&listen_addr, &llen); - - dout(10) << "accepter.start bound to " << listen_addr << dendl; - - // listen! - rc = ::listen(listen_sd, 1000); - assert(rc >= 0); - - // figure out my_addr - if (g_my_addr != entity_addr_t()) { - // user specified it, easy peasy. - rank.my_addr = g_my_addr; - } else { - // my IP is... HELP! - struct hostent *myhostname = gethostbyname(hostname); - - // look up my hostname. - listen_addr.sin_family = myhostname->h_addrtype; - memcpy((char*)&listen_addr.sin_addr.s_addr, - myhostname->h_addr_list[0], - myhostname->h_length); - rank.my_addr.set_addr(listen_addr); - rank.my_addr.v.port = 0; // see below - } - if (rank.my_addr.v.port == 0) { - entity_addr_t tmp; - tmp.set_addr(listen_addr); - rank.my_addr.v.port = tmp.v.port; - rank.my_addr.v.nonce = getpid(); // FIXME: pid might not be best choice here. 
- } - - dout(1) << "accepter.start my_addr is " << rank.my_addr << dendl; - - // set up signal handler - //old_sigint_handler = signal(SIGINT, simplemessenger_sigint); - - // set a harmless handle for SIGUSR1 (we'll use it to stop the accepter) - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = noop_signal_handler; - sa.sa_flags = 0; - sigemptyset(&sa.sa_mask); - sigaction(SIGUSR1, &sa, NULL); - - // start thread - create(); - - return 0; -} - -void *Rank::Accepter::entry() -{ - dout(10) << "accepter starting" << dendl; - - fd_set fds; - while (!done) { - FD_ZERO(&fds); - FD_SET(listen_sd, &fds); - dout(20) << "accepter calling select" << dendl; - int r = ::select(listen_sd+1, &fds, 0, &fds, 0); - dout(20) << "accepter select got " << r << dendl; - - if (done) break; - - // accept - struct sockaddr_in addr; - socklen_t slen = sizeof(addr); - int sd = ::accept(listen_sd, (sockaddr*)&addr, &slen); - if (sd > 0) { - dout(10) << "accepted incoming on sd " << sd << dendl; - - rank.lock.Lock(); - if (!rank.local.empty()) { - Pipe *p = new Pipe(sd); - rank.pipes.insert(p); - } - rank.lock.Unlock(); - } else { - dout(10) << "no incoming connection?" << dendl; - } - } - - dout(20) << "accepter closing" << dendl; - ::close(listen_sd); - dout(10) << "accepter stopping" << dendl; - return 0; -} - -void Rank::Accepter::stop() -{ - done = true; - this->kill(SIGUSR1); - join(); -} - - -/************************************** - * Pipe - */ - -int Rank::Pipe::accept() -{ - // my creater gave me sd via accept() - - // announce myself. - int rc = tcp_write(sd, (char*)&rank.my_addr, sizeof(rank.my_addr)); - if (rc < 0) { - ::close(sd); - done = true; - return -1; - } - - // identify peer - rc = tcp_read(sd, (char*)&peer_addr, sizeof(peer_addr)); - if (rc < 0) { - dout(10) << "pipe(? " << this << ").accept couldn't read peer inst" << dendl; - ::close(sd); - done = true; - return -1; - } - - // register pipe. 
- rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_addr) == 0) { - // install as outgoing pipe! - dout(10) << "pipe(" << peer_addr << ' ' << this << ").accept peer is " << peer_addr << dendl; - rank.rank_pipe[peer_addr] = this; - - // create writer thread. - writer_running = true; - writer_thread.create(); - } else { - // hrm, this may affect message delivery order.. keep both pipes! - dout(10) << "pipe(" << peer_addr << ' ' << this << ").accept already have a pipe for this peer (" << rank.rank_pipe[peer_addr] << "), will receive on this pipe only" << dendl; - - // FIXME i could stop the receiver on the other pipe.. - - /* - // low ranks' Pipes "win" - if (peer_addr < rank.my_addr) { - dout(10) << "pipe(" << peer_addr << ' ' << this << ").accept peer is " << peer_addr - << ", already had pipe, but switching to this new one" << dendl; - // switch to this new Pipe - rank.rank_pipe[peer_addr]->unregister(); // close old one - rank.rank_pipe[peer_addr]->close(); // close old one - rank.rank_pipe[peer_addr] = this; - } else { - dout(10) << "pipe(" << peer_addr << ' ' << this << ").accept peer is " << peer_addr - << ", already had pipe, sticking with it" << dendl; - } - */ - } - } - rank.lock.Unlock(); - - return 0; // success. -} - -int Rank::Pipe::connect() -{ - dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect" << dendl; - - // create socket? - sd = socket(AF_INET,SOCK_STREAM,0); - assert(sd > 0); - - // bind any port - struct sockaddr_in myAddr; - myAddr.sin_family = AF_INET; - myAddr.sin_addr.s_addr = htonl(INADDR_ANY); - myAddr.sin_port = htons( 0 ); - - int rc = bind(sd, (struct sockaddr *) &myAddr, sizeof(myAddr)); - assert(rc>=0); - - // connect! - tcpaddr_t tcpaddr; - peer_addr.make_addr(tcpaddr); - rc = ::connect(sd, (sockaddr*)&tcpaddr, sizeof(myAddr)); - if (rc < 0) { - dout(10) << "connect error " << peer_addr - << ", " << errno << ": " << strerror(errno) << dendl; - return rc; - } - - // identify peer ..... 
FIXME - entity_addr_t paddr; - rc = tcp_read(sd, (char*)&paddr, sizeof(paddr)); - if (!rc) { // bool - dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect couldn't read peer addr" << dendl; - return -1; - } - if (peer_addr != paddr) { - dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect peer identifies itself as " << paddr << ", wrong guy!" << dendl; - ::close(sd); - sd = 0; - return -1; - } - - // identify myself - rc = tcp_write(sd, (char*)&rank.my_addr, sizeof(rank.my_addr)); - if (rc < 0) - return -1; - - // register pipe - /* - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_addr) == 0) { - dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect registering pipe" << dendl; - rank.rank_pipe[peer_addr] = this; - } else { - // this is normal. - dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect pipe already registered." << dendl; - } - } - rank.lock.Unlock(); - */ - - // start reader - reader_running = true; - reader_thread.create(); - - return 0; -} - - -void Rank::Pipe::unregister() -{ - assert(rank.lock.is_locked()); - if (rank.rank_pipe.count(peer_addr) && - rank.rank_pipe[peer_addr] == this) { - dout(10) << "pipe(" << peer_addr << ' ' << this - << ").unregister" << dendl; - rank.rank_pipe.erase(peer_addr); - } else { - dout(10) << "pipe(" << peer_addr << ' ' << this - << ").unregister - not registerd" << dendl; - } -} - -void Rank::Pipe::close() -{ - dout(10) << "pipe(" << peer_addr << ' ' << this << ").close" << dendl; - - // queue close message? 
- if (!need_to_send_close) { - dout(10) << "pipe(" << peer_addr << ' ' << this - << ").close already closing/closed" << dendl; - return; - } - - if (!writer_running) { - dout(10) << "pipe(" << peer_addr << ' ' << this - << ").close not queueing MSG_CLOSE, no writer running" << dendl; - } else { - dout(10) << "pipe(" << peer_addr << ' ' << this - << ").close queueing MSG_CLOSE" << dendl; - lock.Lock(); - q.push_back(new MGenericMessage(MSG_CLOSE)); - cond.Signal(); - need_to_send_close = false; - lock.Unlock(); - } -} - - -/* read msgs from socket. - * also, server. - * - */ -void Rank::Pipe::reader() -{ - if (server) - accept(); - - // loop. - while (!done) { - Message *m = read_message(); - if (!m || m->get_type() == 0) { - if (m) { - delete m; - dout(10) << "pipe(" << peer_addr << ' ' << this << ").reader read MSG_CLOSE message" << dendl; - need_to_send_close = false; - } else { - derr(10) << "pipe(" << peer_addr << ' ' << this << ").reader read null message" << dendl; - } - - rank.lock.Lock(); - unregister(); - rank.lock.Unlock(); - close(); - - done = true; - cond.Signal(); // wake up writer too. 
- break; - } - - dout(10) << "pipe(" << peer_addr << ' ' << this << ").reader got message " - << m << " " << *m - << " for " << m->get_dest() << dendl; - - // deliver - EntityMessenger *entity = 0; - - rank.lock.Lock(); - { - if (g_conf.ms_single_dispatch) { - // submit to single dispatch queue - rank._submit_single_dispatch(m); - } else { - if (rank.local.count(m->get_dest())) { - // find entity - entity = rank.local[m->get_dest()]; - } else { - entity = rank.find_unnamed(m->get_dest()); - if (entity) { - dout(3) << "pipe(" << peer_addr << ' ' << this << ").reader blessing " << m->get_dest() << dendl; - //entity->reset_myname(m->get_dest()); - rank.local.erase(entity->get_myname()); - rank.local[m->get_dest()] = entity; - entity->_set_myname(m->get_dest()); - - } else { - if (rank.stopped.count(m->get_dest())) { - // ignore it - } else { - derr(0) << "pipe(" << peer_addr << ' ' << this << ").reader got message " << *m << " for " << m->get_dest() << ", which isn't local" << dendl; - //assert(0); // FIXME do this differently - } - } - } - } - } - rank.lock.Unlock(); - - if (entity) - entity->queue_message(m); // queue - } - - - // reap? - bool reap = false; - lock.Lock(); - { - reader_running = false; - if (!writer_running) reap = true; - } - lock.Unlock(); - - if (reap) { - dout(20) << "pipe(" << peer_addr << ' ' << this << ").reader queueing for reap" << dendl; - ::close(sd); - rank.lock.Lock(); - { - rank.pipe_reap_queue.push_back(this); - rank.wait_cond.Signal(); - } - rank.lock.Unlock(); - } -} - - -/* write msgs to socket. - * also, client. - */ -void Rank::Pipe::writer() -{ - if (!server) { - int rc = connect(); - if (rc < 0) { - derr(1) << "pipe(" << peer_addr << ' ' << this << ").writer error connecting, " - << errno << ": " << strerror(errno) - << dendl; - done = true; - list out; - fail(out); - } - } - - // disable Nagle algorithm? 
- if (g_conf.ms_tcp_nodelay) { - int flag = 1; - int r = ::setsockopt(sd, IPPROTO_TCP, TCP_NODELAY, (char*)&flag, sizeof(flag)); - if (r < 0) - dout(0) << "pipe(" << peer_addr << ' ' << this << ").writer couldn't set TCP_NODELAY: " << strerror(errno) << dendl; - } - - // loop. - lock.Lock(); - while (!q.empty() || !done) { - - if (!q.empty()) { - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer grabbing message(s)" << dendl; - - // grab outgoing list - list out; - out.swap(q); - - // drop lock while i send these - lock.Unlock(); - - while (!out.empty()) { - Message *m = out.front(); - out.pop_front(); - - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer sending " << m << " " << *m << dendl; - - // stamp. - m->set_source_addr(rank.my_addr); - - // marshall - if (m->empty_payload()) - m->encode_payload(); - - if (write_message(m) < 0) { - // failed! - derr(1) << "pipe(" << peer_addr << ' ' << this << ").writer error sending " << *m << " to " << m->get_dest() - << ", " << errno << ": " << strerror(errno) - << dendl; - out.push_front(m); - fail(out); - done = true; - break; - } - - // did i just send a close? - if (m->get_type() == MSG_CLOSE) - done = true; - - // clean up - delete m; - } - - lock.Lock(); - continue; - } - - // wait - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer sleeping" << dendl; - cond.Wait(lock); - } - lock.Unlock(); - - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer finishing" << dendl; - - // reap? 
- bool reap = false; - lock.Lock(); - { - writer_running = false; - if (!reader_running) reap = true; - } - lock.Unlock(); - - if (reap) { - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer queueing for reap" << dendl; - ::close(sd); - rank.lock.Lock(); - { - rank.pipe_reap_queue.push_back(this); - rank.wait_cond.Signal(); - } - rank.lock.Unlock(); - } -} - - -Message *Rank::Pipe::read_message() -{ - // envelope - //dout(10) << "receiver.read_message from sd " << sd << dendl; - - ceph_message_header env; - if (!tcp_read( sd, (char*)&env, sizeof(env) )) { - need_to_send_close = false; - return 0; - } - - dout(20) << "pipe(" << peer_addr << ' ' << this << ").reader got envelope type=" << env.type - << " src " << env.src << " dst " << env.dst - << " nchunks=" << env.nchunks - << dendl; - - // payload - bufferlist blist; - int32_t pos = 0; - list chunk_at; - for (unsigned i=0; iset_chunk_payload_at(chunk_at); - - dout(20) << "pipe(" << peer_addr << ' ' << this << ").reader got " << s << " byte message from " - << m->get_source() << dendl; - - return m; -} - - -int Rank::Pipe::do_sendmsg(Message *m, struct msghdr *msg, int len) -{ - while (len > 0) { - if (0) { // sanity - int l = 0; - for (unsigned i=0; imsg_iovlen; i++) - l += msg->msg_iov[i].iov_len; - assert(l == len); - } - - int r = ::sendmsg(sd, msg, 0); - if (r < 0) { - assert(r == -1); - derr(1) << "pipe(" << peer_addr << ' ' << this << ").writer error on sendmsg for " << *m - << " to " << m->get_dest() - << ", " << strerror(errno) - << dendl; - need_to_send_close = false; - return -1; - } - len -= r; - if (len == 0) break; - - // hrmph. trim r bytes off the front of our message. 
- dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer partial sendmsg for " << *m - << " to " << m->get_dest() - << " did " << r << ", still have " << len - << dendl; - while (r > 0) { - if (msg->msg_iov[0].iov_len <= (size_t)r) { - // lose this whole item - //dout(30) << "skipping " << msg->msg_iov[0].iov_len << ", " << (msg->msg_iovlen-1) << " v, " << r << " left" << dendl; - r -= msg->msg_iov[0].iov_len; - msg->msg_iov++; - msg->msg_iovlen--; - } else { - // partial! - //dout(30) << "adjusting " << msg->msg_iov[0].iov_len << ", " << msg->msg_iovlen << " v, " << r << " left" << dendl; - msg->msg_iov[0].iov_base = (void*)((long)msg->msg_iov[0].iov_base + r); - msg->msg_iov[0].iov_len -= r; - break; - } - } - } - return 0; -} - - -int Rank::Pipe::write_message(Message *m) -{ - // get envelope, buffers - ceph_message_header *env = &m->get_envelope(); - bufferlist blist; - blist.claim( m->get_payload() ); - - // chunk out page aligned buffers? - if (blist.length() == 0) - env->nchunks = 0; - else { - env->nchunks = 1 + m->get_chunk_payload_at().size(); // header + explicit chunk points - if (!m->get_chunk_payload_at().empty()) - dout(20) << "chunking at " << m->get_chunk_payload_at() - << " in " << *m << " len " << blist.length() - << dendl; - } - - dout(20) << "pipe(" << peer_addr << ' ' << this << ").write_message " << m << " " << *m - << " to " << m->get_dest() - << " in " << env->nchunks - << dendl; - - // set up msghdr and iovecs - struct msghdr msg; - memset(&msg, 0, sizeof(msg)); - struct iovec msgvec[1 + blist.buffers().size() + env->nchunks*2]; // conservative upper bound - msg.msg_iov = msgvec; - int msglen = 0; - - // send envelope - msgvec[0].iov_base = (char*)env; - msgvec[0].iov_len = sizeof(*env); - msglen += sizeof(*env); - msg.msg_iovlen++; - - // payload - list::const_iterator pb = blist.buffers().begin(); - list::const_iterator pc = m->get_chunk_payload_at().begin(); - int b_off = 0; // carry-over buffer offset, if any - int bl_pos = 0; // 
blist pos - int nchunks = env->nchunks; - int32_t chunksizes[nchunks]; - - for (int curchunk=0; curchunk < nchunks; curchunk++) { - // start a chunk - int32_t size = blist.length() - bl_pos; - if (pc != m->get_chunk_payload_at().end()) { - assert(*pc > bl_pos); - size = *pc - bl_pos; - dout(30) << "pos " << bl_pos << " explicit chunk at " << *pc << " size " << size << " of " << blist.length() << dendl; - pc++; - } - assert(size > 0); - dout(30) << "chunk " << curchunk << " pos " << bl_pos << " size " << size << dendl; - - // chunk size - chunksizes[curchunk] = size; - msgvec[msg.msg_iovlen].iov_base = &chunksizes[curchunk]; - msgvec[msg.msg_iovlen].iov_len = sizeof(int32_t); - msglen += sizeof(int32_t); - msg.msg_iovlen++; - - // chunk contents - int left = size; - while (left > 0) { - int donow = MIN(left, (int)pb->length()-b_off); - assert(donow > 0); - dout(30) << " bl_pos " << bl_pos << " b_off " << b_off - << " leftinchunk " << left - << " buffer len " << pb->length() - << " writing " << donow - << dendl; - - if (msg.msg_iovlen >= IOV_MAX-1) { - if (do_sendmsg(m, &msg, msglen)) - return -1; - - // and restart the iov - msg.msg_iov = msgvec; - msg.msg_iovlen = 0; - msglen = 0; - } - - msgvec[msg.msg_iovlen].iov_base = (void*)(pb->c_str()+b_off); - msgvec[msg.msg_iovlen].iov_len = donow; - msglen += donow; - msg.msg_iovlen++; - - left -= donow; - assert(left >= 0); - b_off += donow; - bl_pos += donow; - if (b_off != (int)pb->length()) - break; - pb++; - b_off = 0; - } - assert(left == 0); - } - assert(pb == blist.buffers().end()); - - // send - if (do_sendmsg(m, &msg, msglen)) - return -1; - - return 0; -} - - -void Rank::Pipe::fail(list& out) -{ - derr(10) << "pipe(" << peer_addr << ' ' << this << ").fail" << dendl; - - // FIXME: possible race before i reclaim lock here? 
- - // deactivate myself - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_addr) && - rank.rank_pipe[peer_addr] == this) - rank.rank_pipe.erase(peer_addr); - } - rank.lock.Unlock(); - - // what do i do about reader()? FIXME - - // sort my messages by (source) dispatcher, dest. - map > > by_dis; - lock.Lock(); - { - // include out at front of queue - q.splice(q.begin(), out); - - // sort - while (!q.empty()) { - if (q.front()->get_type() == MSG_CLOSE) { - delete q.front(); - } - else if (rank.local.count(q.front()->get_source())) { - EntityMessenger *mgr = rank.local[q.front()->get_source()]; - Dispatcher *dis = mgr->get_dispatcher(); - if (mgr->is_stopped()) { - // ignore. - dout(1) << "pipe(" << peer_addr << ' ' << this << ").fail on " << *q.front() << ", dispatcher stopping, ignoring." << dendl; - delete q.front(); - } else { - by_dis[dis][q.front()->get_dest()].push_back(q.front()); - } - } - else { - // oh well. sending entity musta just shut down? - delete q.front(); - } - q.pop_front(); - } - } - lock.Unlock(); - - // report failure(s) to dispatcher(s) - for (map > >::iterator i = by_dis.begin(); - i != by_dis.end(); - ++i) - for (map >::iterator j = i->second.begin(); - j != i->second.end(); - ++j) - for (list::iterator k = j->second.begin(); - k != j->second.end(); - ++k) { - derr(1) << "pipe(" << peer_addr << ' ' << this << ").fail on " << **k << " to " << (*k)->get_dest_inst() << dendl; - if (i->first) - i->first->ms_handle_failure(*k, (*k)->get_dest_inst()); - } -} - - - - - - -/******************************************** - * Rank - */ - -Rank::Rank() : - single_dispatcher(this), - started(false) { -} -Rank::~Rank() -{ -} - -/* -void Rank::set_listen_addr(tcpaddr_t& a) -{ - dout(10) << "set_listen_addr " << a << dendl; - memcpy((char*)&listen_addr.sin_addr.s_addr, (char*)&a.sin_addr.s_addr, 4); - listen_addr.sin_port = a.sin_port; -} -*/ - -void Rank::_submit_single_dispatch(Message *m) -{ - assert(lock.is_locked()); - - if 
(local.count(m->get_dest()) && - local[m->get_dest()]->is_ready()) { - rank.single_dispatch_queue.push_back(m); - rank.single_dispatch_cond.Signal(); - } else { - waiting_for_ready[m->get_dest()].push_back(m); - } -} - - -void Rank::single_dispatcher_entry() -{ - lock.Lock(); - while (!single_dispatch_stop || !single_dispatch_queue.empty()) { - if (!single_dispatch_queue.empty()) { - list ls; - ls.swap(single_dispatch_queue); - - lock.Unlock(); - { - while (!ls.empty()) { - Message *m = ls.front(); - ls.pop_front(); - - dout(1) << m->get_dest() - << " <-- " << m->get_source_inst() - << " ---- " << *m - << " -- " << m - << dendl; - - assert(local.count(m->get_dest())); - local[m->get_dest()]->dispatch(m); - } - } - lock.Lock(); - continue; - } - single_dispatch_cond.Wait(lock); - } - lock.Unlock(); -} - - -/* - * note: assumes lock is held - */ -void Rank::reaper() -{ - dout(10) << "reaper" << dendl; - assert(lock.is_locked()); - - while (!pipe_reap_queue.empty()) { - Pipe *p = pipe_reap_queue.front(); - dout(10) << "reaper reaping pipe " << p->get_peer_addr() << dendl; - pipe_reap_queue.pop_front(); - assert(pipes.count(p)); - pipes.erase(p); - p->join(); - dout(10) << "reaper reaped pipe " << p->get_peer_addr() << dendl; - delete p; - } -} - - -int Rank::start_rank() -{ - lock.Lock(); - if (started) { - dout(10) << "start_rank already started" << dendl; - lock.Unlock(); - return 0; - } - dout(10) << "start_rank" << dendl; - lock.Unlock(); - - // bind to a socket - if (accepter.start() < 0) - return -1; - - // start single thread dispatcher? - if (g_conf.ms_single_dispatch) { - single_dispatch_stop = false; - single_dispatcher.create(); - } - - lock.Lock(); - - dout(1) << "start_rank at " << my_addr << dendl; - started = true; - lock.Unlock(); - return 0; -} - - - -/* connect_rank - * NOTE: assumes rank.lock held. 
- */ -Rank::Pipe *Rank::connect_rank(const entity_addr_t& addr) -{ - assert(rank.lock.is_locked()); - assert(addr != rank.my_addr); - - dout(10) << "connect_rank to " << addr << ", creating pipe and registering" << dendl; - - // create pipe - Pipe *pipe = new Pipe(addr); - rank.rank_pipe[addr] = pipe; - pipes.insert(pipe); - - // register - rank.rank_pipe[addr] = pipe; - - return pipe; -} - - - - - - -Rank::EntityMessenger *Rank::find_unnamed(entity_name_t a) -{ - // find an unnamed (and _ready_) local entity of the right type - for (map::iterator p = local.begin(); - p != local.end(); - ++p) { - if (p->first.type() == a.type() && p->first.is_new() && - p->second->is_ready()) - return p->second; - } - return 0; -} - - - - -/* register_entity - */ -Rank::EntityMessenger *Rank::register_entity(entity_name_t name) -{ - dout(10) << "register_entity " << name << dendl; - lock.Lock(); - - // create messenger - EntityMessenger *msgr = new EntityMessenger(name); - - // add to directory - assert(local.count(name) == 0); - local[name] = msgr; - - lock.Unlock(); - return msgr; -} - - -void Rank::unregister_entity(EntityMessenger *msgr) -{ - lock.Lock(); - dout(10) << "unregister_entity " << msgr->get_myname() << dendl; - - // remove from local directory. - entity_name_t name = msgr->get_myname(); - assert(local.count(name)); - local.erase(name); - - stopped.insert(name); - wait_cond.Signal(); - - lock.Unlock(); -} - - -void Rank::submit_message(Message *m, const entity_addr_t& dest_addr) -{ - const entity_name_t dest = m->get_dest(); - - // lookup - EntityMessenger *entity = 0; - Pipe *pipe = 0; - - lock.Lock(); - { - // local? 
- if (dest_addr == my_addr) { - if (local.count(dest)) { - // local - dout(20) << "submit_message " << *m << " dest " << dest << " local" << dendl; - if (g_conf.ms_single_dispatch) { - _submit_single_dispatch(m); - } else { - entity = local[dest]; - } - } else { - derr(0) << "submit_message " << *m << " dest " << dest << " " << dest_addr << " local but not in local map?" << dendl; - //assert(0); // hmpf, this is probably mds->mon beacon from newsyn. - } - } - else { - // remote. - if (rank_pipe.count( dest_addr )) { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_addr << ", already connected." << dendl; - // connected. - pipe = rank_pipe[ dest_addr ]; - } else { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_addr << ", connecting." << dendl; - // not connected. - pipe = connect_rank( dest_addr ); - } - } - } - - // do it - if (entity) { - // local! - dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << dendl; - entity->queue_message(m); - } - else if (pipe) { - // remote! - dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << dendl; - pipe->send(m); - } - - lock.Unlock(); -} - - - - - -void Rank::wait() -{ - lock.Lock(); - while (1) { - // reap dead pipes - reaper(); - - if (local.empty()) { - dout(10) << "wait: everything stopped" << dendl; - break; // everything stopped. - } else { - dout(10) << "wait: local still has " << local.size() << " items, waiting" << dendl; - } - - wait_cond.Wait(lock); - } - lock.Unlock(); - - // done! clean up. 
- dout(20) << "wait: stopping accepter thread" << dendl; - accepter.stop(); - dout(20) << "wait: stopped accepter thread" << dendl; - - // stop dispatch thread - if (g_conf.ms_single_dispatch) { - dout(10) << "wait: stopping dispatch thread" << dendl; - lock.Lock(); - single_dispatch_stop = true; - single_dispatch_cond.Signal(); - lock.Unlock(); - single_dispatcher.join(); - } - - // close+reap all pipes - lock.Lock(); - { - dout(10) << "wait: closing pipes" << dendl; - list toclose; - for (hash_map::iterator i = rank_pipe.begin(); - i != rank_pipe.end(); - i++) - toclose.push_back(i->second); - for (list::iterator i = toclose.begin(); - i != toclose.end(); - i++) { - (*i)->unregister(); - (*i)->close(); - } - - reaper(); - dout(10) << "wait: waiting for pipes " << pipes << " to close" << dendl; - while (!pipes.empty()) { - wait_cond.Wait(lock); - reaper(); - } - } - lock.Unlock(); - - dout(10) << "wait: done." << dendl; - dout(1) << "shutdown complete." << dendl; -} - - - - - - -/********************************** - * EntityMessenger - */ - -void Rank::EntityMessenger::dispatch_entry() -{ - lock.Lock(); - while (!stop) { - if (!dispatch_queue.empty() || !prio_dispatch_queue.empty()) { - list ls; - if (!prio_dispatch_queue.empty()) { - ls.swap(prio_dispatch_queue); - pqlen = 0; - } else { - if (0) { - ls.swap(dispatch_queue); - qlen = 0; - } else { - // limit how much low-prio stuff we grab, to avoid starving high-prio messages! 
- ls.push_back(dispatch_queue.front()); - dispatch_queue.pop_front(); - qlen--; - } - } - - lock.Unlock(); - { - // deliver - while (!ls.empty()) { - if (stop) { - dout(1) << "dispatch: stop=true, discarding " << ls.size() - << " messages in dispatch queue" << dendl; - break; - } - Message *m = ls.front(); - ls.pop_front(); - dout(1) << m->get_dest() - << " <== " << m->get_source_inst() - << " ==== " << *m - << " ==== " << m - << dendl; - dispatch(m); - dout(20) << "done calling dispatch on " << m << dendl; - } - } - lock.Lock(); - continue; - } - cond.Wait(lock); - } - lock.Unlock(); - - // deregister - rank.unregister_entity(this); -} - -void Rank::EntityMessenger::ready() -{ - dout(10) << "ready " << get_myaddr() << dendl; - assert(!dispatch_thread.is_started()); - - if (g_conf.ms_single_dispatch) { - rank.lock.Lock(); - if (rank.waiting_for_ready.count(get_myname())) { - rank.single_dispatch_queue.splice(rank.single_dispatch_queue.end(), - rank.waiting_for_ready[get_myname()]); - rank.waiting_for_ready.erase(get_myname()); - rank.single_dispatch_cond.Signal(); - } - rank.lock.Unlock(); - } else { - // start my dispatch thread - dispatch_thread.create(); - } -} - - -int Rank::EntityMessenger::shutdown() -{ - dout(10) << "shutdown " << get_myaddr() << dendl; - - // stop my dispatch thread - if (dispatch_thread.am_self()) { - dout(10) << "shutdown i am dispatch, setting stop flag" << dendl; - stop = true; - } else { - dout(10) << "shutdown i am not dispatch, setting stop flag and joining thread." << dendl; - lock.Lock(); - stop = true; - cond.Signal(); - lock.Unlock(); - } - - return 0; -} - -void Rank::EntityMessenger::suicide() -{ - dout(10) << "suicide " << get_myaddr() << dendl; - shutdown(); - // hmm, or exit(0)? 
-} - -void Rank::EntityMessenger::prepare_dest(const entity_addr_t& addr) -{ - rank.lock.Lock(); - { - if (rank.rank_pipe.count(addr) == 0) - rank.connect_rank(addr); - } - rank.lock.Unlock(); -} - -int Rank::EntityMessenger::send_message(Message *m, entity_inst_t dest, - int port, int fromport) -{ - // set envelope - m->set_source(get_myname(), fromport); - m->set_source_addr(rank.my_addr); - m->set_dest_inst(dest); - m->set_dest_port(port); - - dout(1) << m->get_source() - << " --> " << dest.name << " " << dest.addr - << " -- " << *m - << " -- " << m - << dendl; - - rank.submit_message(m, dest.addr); - - return 0; -} - -int Rank::EntityMessenger::send_first_message(Dispatcher *d, - Message *m, entity_inst_t dest, - int port, int fromport) -{ - /* hacky thing for csyn and newsyn: - * set dispatcher (go active) AND set sender for this - * message while holding rank.lock. this prevents any - * races against incoming unnamed messages naming us before - * we fire off our first message. - */ - rank.lock.Lock(); - set_dispatcher(d); - - // set envelope - m->set_source(get_myname(), fromport); - m->set_source_addr(rank.my_addr); - m->set_dest_inst(dest); - m->set_dest_port(port); - rank.lock.Unlock(); - - dout(1) << m->get_source() - << " --> " << dest.name << " " << dest.addr - << " -- " << *m - << " -- " << m - << dendl; - - rank.submit_message(m, dest.addr); - - return 0; -} - - -const entity_addr_t &Rank::EntityMessenger::get_myaddr() -{ - return rank.my_addr; -} - - -void Rank::EntityMessenger::reset_myname(entity_name_t newname) -{ - rank.lock.Lock(); - { - entity_name_t oldname = get_myname(); - dout(10) << "reset_myname " << oldname << " to " << newname << dendl; - - rank.local.erase(oldname); - rank.local[newname] = this; - - _set_myname(newname); - } - rank.lock.Unlock(); -} - - - - -void Rank::EntityMessenger::mark_down(entity_addr_t a) -{ - rank.mark_down(a); -} - -void Rank::mark_down(entity_addr_t addr) -{ - //if (my_rank == 0) return; // ugh.. 
rank0 already handles this stuff in the namer - lock.Lock(); - /* - if (entity_map.count(a) && - entity_map[a] > inst) { - dout(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << dendl; - derr(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << dendl; - // do nothing! - } else { - if (entity_map.count(a) == 0) { - // don't know it - dout(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << dendl; - derr(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << dendl; - } else { - // know it - assert(entity_map[a] <= inst); - dout(10) << "mark_down " << a << " inst " << inst << dendl; - derr(10) << "mark_down " << a << " inst " << inst << dendl; - - entity_map.erase(a); - - if (rank_pipe.count(inst)) { - rank_pipe[inst]->close(); - rank_pipe.erase(inst); - } - } - } - */ - lock.Unlock(); -} - - diff --git a/branches/sage/crush/msg/SimpleMessenger.h b/branches/sage/crush/msg/SimpleMessenger.h deleted file mode 100644 index 6bd417adc8e10..0000000000000 --- a/branches/sage/crush/msg/SimpleMessenger.h +++ /dev/null @@ -1,314 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __SIMPLEMESSENGER_H -#define __SIMPLEMESSENGER_H - - -#include -#include -using namespace std; -#include -#include -using namespace __gnu_cxx; - - -#include "include/types.h" - -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/Thread.h" - -#include "Messenger.h" -#include "Message.h" -#include "tcp.h" - - - -/* Rank - per-process - */ -class Rank { -public: - void sigint(); - -private: - class EntityMessenger; - class Pipe; - - // incoming - class Accepter : public Thread { - public: - bool done; - - int listen_sd; - - Accepter() : done(false) {} - - void *entry(); - void stop(); - int start(); - } accepter; - - void sigint(int r); - - - // pipe - class Pipe { - protected: - int sd; - bool done; - entity_addr_t peer_addr; - bool server; - bool need_to_send_close; - - bool reader_running; - bool writer_running; - - list q; - Mutex lock; - Cond cond; - - int accept(); // server handshake - int connect(); // client handshake - void reader(); - void writer(); - - Message *read_message(); - int write_message(Message *m); - int do_sendmsg(Message *m, struct msghdr *msg, int len); - void fail(list& ls); - - // threads - class Reader : public Thread { - Pipe *pipe; - public: - Reader(Pipe *p) : pipe(p) {} - void *entry() { pipe->reader(); return 0; } - } reader_thread; - friend class Reader; - - class Writer : public Thread { - Pipe *pipe; - public: - Writer(Pipe *p) : pipe(p) {} - void *entry() { pipe->writer(); return 0; } - } writer_thread; - friend class Writer; - - public: - Pipe(int s) : sd(s), - done(false), server(true), - need_to_send_close(true), - reader_running(false), writer_running(false), - reader_thread(this), writer_thread(this) { - // server - reader_running = true; - reader_thread.create(); - } - Pipe(const entity_addr_t &pi) : sd(0), - done(false), peer_addr(pi), server(false), - need_to_send_close(true), - reader_running(false), writer_running(false), - reader_thread(this), writer_thread(this) { - // client - 
writer_running = true; - writer_thread.create(); - } - - // public constructors - static const Pipe& Server(int s); - static const Pipe& Client(const entity_addr_t& pi); - - entity_addr_t& get_peer_addr() { return peer_addr; } - - void unregister(); - void close(); - void join() { - if (writer_thread.is_started()) writer_thread.join(); - if (reader_thread.is_started()) reader_thread.join(); - } - - void send(Message *m) { - lock.Lock(); - q.push_back(m); - cond.Signal(); - lock.Unlock(); - } - void send(list& ls) { - lock.Lock(); - q.splice(q.end(), ls); - cond.Signal(); - lock.Unlock(); - } - - void force_close() { - ::close(sd); - } - }; - - - - // messenger interface - class EntityMessenger : public Messenger { - Mutex lock; - Cond cond; - list dispatch_queue; - list prio_dispatch_queue; - bool stop; - int qlen, pqlen; - - class DispatchThread : public Thread { - EntityMessenger *m; - public: - DispatchThread(EntityMessenger *_m) : m(_m) {} - void *entry() { - m->dispatch_entry(); - return 0; - } - } dispatch_thread; - void dispatch_entry(); - - public: - void queue_message(Message *m) { - // set recv stamp - m->set_recv_stamp(g_clock.now()); - - lock.Lock(); - if (m->get_source().is_mon()) { - prio_dispatch_queue.push_back(m); - pqlen++; - } else { - qlen++; - dispatch_queue.push_back(m); - } - cond.Signal(); - lock.Unlock(); - } - - public: - EntityMessenger(entity_name_t myaddr) : - Messenger(myaddr), - stop(false), - qlen(0), pqlen(0), - dispatch_thread(this) { } - ~EntityMessenger() { - // join dispatch thread - if (dispatch_thread.is_started()) - dispatch_thread.join(); - } - - void ready(); - bool is_stopped() { return stop; } - - void wait() { - dispatch_thread.join(); - } - - const entity_addr_t &get_myaddr(); - - int get_dispatch_queue_len() { return qlen + pqlen; } - - void reset_myname(entity_name_t m); - - int shutdown(); - void suicide(); - void prepare_dest(const entity_addr_t& addr); - int send_message(Message *m, entity_inst_t dest, - int 
port=0, int fromport=0); - int send_first_message(Dispatcher *d, - Message *m, entity_inst_t dest, - int port=0, int fromport=0); - - void mark_down(entity_addr_t a); - void mark_up(entity_name_t a, entity_addr_t& i); - }; - - - class SingleDispatcher : public Thread { - Rank *rank; - public: - SingleDispatcher(Rank *r) : rank(r) {} - void *entry() { - rank->single_dispatcher_entry(); - return 0; - } - } single_dispatcher; - - Cond single_dispatch_cond; - bool single_dispatch_stop; - list single_dispatch_queue; - - map > waiting_for_ready; - - void single_dispatcher_entry(); - void _submit_single_dispatch(Message *m); - - - // Rank stuff - public: - Mutex lock; - Cond wait_cond; // for wait() - bool started; - - // where i listen - entity_addr_t my_addr; - - // local - map local; - set stopped; - //hash_set entity_unstarted; - - // remote - hash_map rank_pipe; - - set pipes; - list pipe_reap_queue; - - Pipe *connect_rank(const entity_addr_t& addr); - - void mark_down(entity_addr_t addr); - //void mark_up(entity_name_t addr, entity_addr_t& i); - - entity_addr_t get_my_addr() { return my_addr; } - - void reaper(); - - EntityMessenger *find_unnamed(entity_name_t a); - -public: - Rank(); - ~Rank(); - - //void set_listen_addr(tcpaddr_t& a); - - int start_rank(); - void wait(); - - EntityMessenger *register_entity(entity_name_t addr); - void rename_entity(EntityMessenger *ms, entity_name_t newaddr); - void unregister_entity(EntityMessenger *ms); - - void submit_message(Message *m, const entity_addr_t& addr); - void prepare_dest(const entity_addr_t& addr); - - // create a new messenger - EntityMessenger *new_entity(entity_name_t addr); - -} ; - - - -extern Rank rank; - -#endif diff --git a/branches/sage/crush/msg/msg_types.h b/branches/sage/crush/msg/msg_types.h deleted file mode 100644 index 52b1e69c8886c..0000000000000 --- a/branches/sage/crush/msg/msg_types.h +++ /dev/null @@ -1,192 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// 
vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MSG_TYPES_H -#define __MSG_TYPES_H - -#include "include/types.h" -#include "include/blobhash.h" -#include "tcp.h" - -class entity_name_t { - struct ceph_entity_name v; - -public: - static const int TYPE_MON = CEPH_ENTITY_TYPE_MON; - static const int TYPE_MDS = CEPH_ENTITY_TYPE_MDS; - static const int TYPE_OSD = CEPH_ENTITY_TYPE_OSD; - static const int TYPE_CLIENT = CEPH_ENTITY_TYPE_CLIENT; - static const int TYPE_ADMIN = CEPH_ENTITY_TYPE_ADMIN; - - static const int NEW = -1; - - // cons - entity_name_t() { v.type = v.num = 0; } - entity_name_t(int t, int n=NEW) { v.type = t; v.num = n; } - - // static cons - static entity_name_t MON(int i=NEW) { return entity_name_t(TYPE_MON, i); } - static entity_name_t MDS(int i=NEW) { return entity_name_t(TYPE_MDS, i); } - static entity_name_t OSD(int i=NEW) { return entity_name_t(TYPE_OSD, i); } - static entity_name_t CLIENT(int i=NEW) { return entity_name_t(TYPE_CLIENT, i); } - static entity_name_t ADMIN(int i=NEW) { return entity_name_t(TYPE_ADMIN, i); } - - int num() const { return v.num; } - int type() const { return v.type; } - const char *type_str() const { - switch (type()) { - case TYPE_MDS: return "mds"; - case TYPE_OSD: return "osd"; - case TYPE_MON: return "mon"; - case TYPE_CLIENT: return "client"; - case TYPE_ADMIN: return "admin"; - default: return "unknown"; - } - } - - bool is_new() const { return num() < 0; } - - bool is_client() const { return type() == TYPE_CLIENT; } - bool is_mds() const { return type() == TYPE_MDS; } - bool is_osd() const { return type() == TYPE_OSD; } - bool is_mon() const { return type() == TYPE_MON; } - bool is_admin() const { return type() 
== TYPE_ADMIN; } -}; - -inline bool operator== (const entity_name_t& l, const entity_name_t& r) { - return (l.type() == r.type()) && (l.num() == r.num()); } -inline bool operator!= (const entity_name_t& l, const entity_name_t& r) { - return (l.type() != r.type()) || (l.num() != r.num()); } -inline bool operator< (const entity_name_t& l, const entity_name_t& r) { - return (l.type() < r.type()) || (l.type() == r.type() && l.num() < r.num()); } - -inline std::ostream& operator<<(std::ostream& out, const entity_name_t& addr) { - //if (addr.is_namer()) return out << "namer"; - if (addr.is_new() || addr.num() < 0) - return out << addr.type_str() << "?"; - else - return out << addr.type_str() << addr.num(); -} -inline std::ostream& operator<<(std::ostream& out, const ceph_entity_name& addr) { - return out << *(const entity_name_t*)&addr; -} - -namespace __gnu_cxx { - template<> struct hash< entity_name_t > - { - size_t operator()( const entity_name_t m ) const - { - static blobhash H; - return H((const char*)&m, sizeof(m)); - } - }; -} - - - -/* - * an entity's network address. - * includes a random value that prevents it from being reused. - * thus identifies a particular process instance. - * ipv4 for now. - */ -struct entity_addr_t { - struct ceph_entity_addr v; - - entity_addr_t() { - memset(&v, 0, sizeof(v)); - } - - void set_addr(tcpaddr_t a) { - memcpy((char*)v.ipq, (char*)&a.sin_addr.s_addr, 4); - v.port = ntohs(a.sin_port); - } - void make_addr(tcpaddr_t& a) const { - memset(&a, 0, sizeof(a)); - a.sin_family = AF_INET; - memcpy((char*)&a.sin_addr.s_addr, (char*)v.ipq, 4); - a.sin_port = htons(v.port); - } -}; - -inline ostream& operator<<(ostream& out, const entity_addr_t &addr) -{ - return out << (int)addr.v.ipq[0] - << '.' << (int)addr.v.ipq[1] - << '.' << (int)addr.v.ipq[2] - << '.' << (int)addr.v.ipq[3] - << ':' << addr.v.port - << '.' 
<< addr.v.nonce; -} - -inline bool operator==(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) == 0; } -inline bool operator!=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) != 0; } -inline bool operator<(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) < 0; } -inline bool operator<=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) <= 0; } -inline bool operator>(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) > 0; } -inline bool operator>=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) >= 0; } - -namespace __gnu_cxx { - template<> struct hash< entity_addr_t > - { - size_t operator()( const entity_addr_t& x ) const - { - static blobhash H; - return H((const char*)&x, sizeof(x)); - } - }; -} - - -/* - * a particular entity instance - */ -struct entity_inst_t { - entity_name_t name; - entity_addr_t addr; - entity_inst_t() {} - entity_inst_t(entity_name_t n, const entity_addr_t& a) : name(n), addr(a) {} -}; - - -inline bool operator==(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) == 0; } -inline bool operator!=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) != 0; } -inline bool operator<(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) < 0; } -inline bool operator<=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) <= 0; } -inline bool operator>(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) > 0; } -inline bool operator>=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) >= 0; } - -namespace __gnu_cxx { - template<> struct hash< entity_inst_t > - { - size_t operator()( const entity_inst_t& x ) const - { - static blobhash H; - return H((const char*)&x, 
sizeof(x)); - } - }; -} - -inline ostream& operator<<(ostream& out, const entity_inst_t &i) -{ - return out << i.name << " " << i.addr; -} -inline ostream& operator<<(ostream& out, const ceph_entity_inst &i) -{ - return out << *(const entity_inst_t*)&i; -} - - - -#endif diff --git a/branches/sage/crush/msg/tcp.cc b/branches/sage/crush/msg/tcp.cc deleted file mode 100644 index a131e3d6dd7dc..0000000000000 --- a/branches/sage/crush/msg/tcp.cc +++ /dev/null @@ -1,93 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "tcp.h" - -/****************** - * tcp crap - */ - -/* -inlined, see tcp.h - - -bool tcp_read(int sd, char *buf, int len) -{ - while (len > 0) { - int got = ::recv( sd, buf, len, 0 ); - if (got == 0) { - generic_dout(18) << "tcp_read socket " << sd << " closed" << dendl; - return false; - } - if (got < 0) { - generic_dout(18) << "tcp_read bailing with " << got << dendl; - return false; - } - assert(got >= 0); - len -= got; - buf += got; - //generic_dout(DBL) << "tcp_read got " << got << ", " << len << " left" << dendl; - } - return true; -} - -int tcp_write(int sd, char *buf, int len) -{ - //generic_dout(DBL) << "tcp_write writing " << len << dendl; - assert(len > 0); - while (len > 0) { - int did = ::send( sd, buf, len, 0 ); - if (did < 0) { - generic_dout(1) << "tcp_write error did = " << did << " errno " << errno << " " << strerror(errno) << dendl; - //derr(0) << "tcp_write error did = " << did << " errno " << errno << " " << strerror(errno) << dendl; - } - //assert(did >= 0); - if (did < 0) return did; - len -= did; - buf += did; - //generic_dout(DBL) << "tcp_write did " << did << ", " << len << " left" << dendl; - } - return 0; -} -*/ - -int tcp_hostlookup(char *str, tcpaddr_t& ta) -{ - char *host = str; - char *port = 0; - - for (int i=0; str[i]; i++) { - if (str[i] == ':') { - port = str+i+1; - str[i] = 0; - break; - } - } - if (!port) { - cerr << "addr '" << str << "' doesn't look 
like 'host:port'" << std::endl; - return -1; - } - //cout << "host '" << host << "' port '" << port << "'" << std::endl; - - int iport = atoi(port); - - struct hostent *myhostname = gethostbyname( host ); - if (!myhostname) { - cerr << "host " << host << " not found" << std::endl; - return -1; - } - - memset(&ta, 0, sizeof(ta)); - - //cout << "addrtype " << myhostname->h_addrtype << " len " << myhostname->h_length << std::endl; - - ta.sin_family = myhostname->h_addrtype; - memcpy((char *)&ta.sin_addr, - myhostname->h_addr, - myhostname->h_length); - ta.sin_port = iport; - - cout << "lookup '" << host << ":" << port << "' -> " << ta << std::endl; - - return 0; -} diff --git a/branches/sage/crush/msg/tcp.h b/branches/sage/crush/msg/tcp.h deleted file mode 100644 index e234da400dfe4..0000000000000 --- a/branches/sage/crush/msg/tcp.h +++ /dev/null @@ -1,69 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -#ifndef __TCP_H -#define __TCP_H - -#include -#include -#include -#include - -typedef struct sockaddr_in tcpaddr_t; - -using std::ostream; - -inline ostream& operator<<(ostream& out, const tcpaddr_t &a) -{ - unsigned char addr[4]; - memcpy((char*)addr, (char*)&a.sin_addr.s_addr, 4); - out << (unsigned)addr[0] << "." - << (unsigned)addr[1] << "." - << (unsigned)addr[2] << "." 
- << (unsigned)addr[3] << ":" - << ntohs(a.sin_port); - return out; -} - -inline bool tcp_read(int sd, char *buf, int len) { - while (len > 0) { - int got = ::recv( sd, buf, len, 0 ); - if (got <= 0) { - //generic_dout(18) << "tcp_read socket " << sd << " closed" << dendl; - return false; - } - len -= got; - buf += got; - //generic_dout(DBL) << "tcp_read got " << got << ", " << len << " left" << dendl; - } - return true; -} - -inline int tcp_write(int sd, const char *buf, int len) { - //generic_dout(DBL) << "tcp_write writing " << len << dendl; - assert(len > 0); - while (len > 0) { - int did = ::send( sd, buf, len, 0 ); - if (did < 0) { - //generic_dout(1) << "tcp_write error did = " << did << " errno " << errno << " " << strerror(errno) << dendl; - //generic_derr(1) << "tcp_write error did = " << did << " errno " << errno << " " << strerror(errno) << dendl; - return did; - } - len -= did; - buf += did; - //generic_dout(DBL) << "tcp_write did " << did << ", " << len << " left" << dendl; - } - return 0; -} - - -extern int tcp_hostlookup(char *str, tcpaddr_t& ta); - -inline bool operator==(const tcpaddr_t& a, const tcpaddr_t& b) { - return strncmp((const char*)&a, (const char*)&b, sizeof(a)) == 0; -} -inline bool operator!=(const tcpaddr_t& a, const tcpaddr_t& b) { - return strncmp((const char*)&a, (const char*)&b, sizeof(a)) != 0; -} - - -#endif diff --git a/branches/sage/crush/newsyn.cc b/branches/sage/crush/newsyn.cc deleted file mode 100644 index e580e49a9b7e9..0000000000000 --- a/branches/sage/crush/newsyn.cc +++ /dev/null @@ -1,438 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#define intabs(x) ((x) >= 0 ? (x):(-(x))) - -#include - -#include -#include -#include -using namespace std; - -#include - -#include "config.h" - -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "mon/Monitor.h" -#include "client/Client.h" -#include "client/SyntheticClient.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - -class C_Test : public Context { -public: - void finish(int r) { - cout << "C_Test->finish(" << r << ")" << std::endl; - } -}; - -extern std::map g_fake_kill_after; - - -/* - * start up NewMessenger via MPI. - */ - -pair mpi_bootstrap_new(int& argc, char**& argv, MonMap *monmap) -{ - MPI_Init(&argc, &argv); - - int mpi_world; - int mpi_rank; - MPI_Comm_size(MPI_COMM_WORLD, &mpi_world); - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - - // first, synchronize clocks. - if (g_conf.clock_tare) { - if (1) { - // use an MPI barrier. probably not terribly precise. - MPI_Barrier(MPI_COMM_WORLD); - g_clock.tare(); - } else { - // use wall clock; assume NTP has all nodes synchronized already. - // FIXME someday: this hangs for some reason. whatever. - utime_t z = g_clock.now(); - MPI_Bcast( &z, sizeof(z), MPI_CHAR, - 0, MPI_COMM_WORLD); - cout << "z is " << z << std::endl; - g_clock.tare(z); - } - } - - // start up all monitors at known addresses. - entity_inst_t moninst[mpi_world]; // only care about first g_conf.num_mon of these. 
- - rank.start_rank(); // bind and listen - - if (mpi_rank < g_conf.num_mon) { - moninst[mpi_rank].addr = rank.my_addr; - moninst[mpi_rank].name = entity_name_t(entity_name_t::TYPE_MON, mpi_rank); - - //cerr << mpi_rank << " at " << rank.get_listen_addr() << std::endl; - } - - MPI_Gather( &moninst[mpi_rank], sizeof(entity_inst_t), MPI_CHAR, - moninst, sizeof(entity_inst_t), MPI_CHAR, - 0, MPI_COMM_WORLD); - - if (mpi_rank == 0) { - for (int i=0; imon_inst[i] = moninst[i]; - } - } - - - // distribute monmap - bufferlist bl; - if (mpi_rank == 0) { - monmap->encode(bl); - monmap->write(".ceph_monmap"); - } else { - int l = g_conf.num_mon * 1000; // nice'n big. - bufferptr bp(l); - bl.append(bp); - } - - MPI_Bcast(bl.c_str(), bl.length(), MPI_CHAR, - 0, MPI_COMM_WORLD); - - if (mpi_rank > 0) { - monmap->decode(bl); - } - - // wait for everyone! - MPI_Barrier(MPI_COMM_WORLD); - - return pair(mpi_rank, mpi_world); -} - -utime_t tick_start; -int tick_count = 0; - -class C_Tick : public Context { -public: - void finish(int) { - utime_t now = g_clock.now() - tick_start; - cout << "tick +" << g_conf.tick << " -> " << now << " (" << tick_count << ")" << std::endl; - tick_count += g_conf.tick; - utime_t next = tick_start; - next.sec_ref() += tick_count; - g_timer.add_event_at(next, new C_Tick); - } -}; - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << std::endl; - _exit(1); - } -}; - -class C_Debug : public Context { - public: - void finish(int) { - int size = (long)&g_conf.debug_after - (long)&g_conf.debug; - memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size); - cout << "debug_after flipping debug settings" << std::endl; - //g_conf.debug_ms = 1; - } -}; - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - map kill_osd_after; - if (1) { - vector nargs; - for (unsigned i=0; i 0 ? g_conf.num_mon:0; - int start_mds = g_conf.num_mds > 0 ? 
g_conf.num_mds:0; - int start_osd = g_conf.num_osd > 0 ? g_conf.num_osd:0; - int start_client = g_conf.num_client > 0 ? g_conf.num_client:0; - - //g_conf.num_mon = intabs(g_conf.num_mon); - g_conf.num_mds = intabs(g_conf.num_mds); - g_conf.num_client = intabs(g_conf.num_client); - g_conf.num_osd = intabs(g_conf.num_osd); - - - if (g_conf.kill_after) - g_timer.add_event_after(g_conf.kill_after, new C_Die); - if (g_conf.debug_after) - g_timer.add_event_after(g_conf.debug_after, new C_Debug); - - if (g_conf.tick) { - tick_start = g_clock.now(); - g_timer.add_event_after(g_conf.tick, new C_Tick); - } - - vector nargs; - for (unsigned i=0; i mpiwho = mpi_bootstrap_new(argc, argv, monmap); - int myrank = mpiwho.first; - int world = mpiwho.second; - - int need = 0; - if (g_conf.ms_skip_rank0) need++; - need += start_mds; - if (g_conf.ms_stripe_osds) - need++; - else - need += start_osd; - if (start_client) { - if (!g_conf.ms_overlay_clients) - need += 1; - } - assert(need <= world); - - if (myrank == 0) - cerr << "nummds " << start_mds << " numosd " << start_osd << " numclient " << start_client << " .. need " << need << ", have " << world << std::endl; - - - char hostname[100]; - gethostname(hostname,100); - int pid = getpid(); - - int started = 0; - - //if (myrank == 0) g_conf.debug = 20; - - // courtesy symlinks - char ffrom[100]; - char fto[100]; - sprintf(fto, "%s.%d", hostname, pid); - - - // create mon - if (myrank < g_conf.num_mon) { - Monitor *mon = new Monitor(myrank, rank.register_entity(entity_name_t(entity_name_t::TYPE_MON, myrank)), monmap); - mon->init(); - if (g_conf.dout_dir) { - sprintf(ffrom, "%s/mon%d", g_conf.dout_dir, myrank); - ::symlink(fto, ffrom); - } - } - - // wait for monitors to start. - MPI_Barrier(MPI_COMM_WORLD); - - // okay, home free! 
- MPI_Finalize(); - - - // create mds - map mds; - map mdsosd; - for (int i=0; iinit(); - started++; - - if (g_conf.mds_local_osd) { - int n = i+g_conf.mds_local_osd_offset; - mdsosd[i] = new OSD(n, rank.register_entity(entity_name_t(entity_name_t::TYPE_OSD, n)), monmap); - mdsosd[i]->init(); - } - - if (g_fake_kill_after.count(entity_name_t::MDS(i))) { - cerr << "mds" << i << " will die after " << g_fake_kill_after[entity_name_t::MDS(i)] << std::endl; - g_timer.add_event_after(g_fake_kill_after[entity_name_t::MDS(i)], new C_Die); - } - } - - // create osd - map osd; - int max_osd_nodes = world - start_mds - g_conf.ms_skip_rank0; // assumes 0 clients, if we stripe. - int osds_per_node = (start_osd-1)/max_osd_nodes + 1; - for (int i=0; iinit() < 0) - return 1; - started++; - } - - if (g_conf.ms_overlay_clients) sleep(5); - - // create client - int skip_osd = start_osd; - if (g_conf.ms_overlay_clients) - skip_osd = 0; // put clients with osds too! - int client_nodes = world - start_mds - skip_osd - g_conf.ms_skip_rank0; - int clients_per_node = 1; - if (start_client && client_nodes > 0) clients_per_node = (start_client-1) / client_nodes + 1; - set clientlist; - map client;//[start_client]; - map syn;//[start_client]; - int nclients = 0; - for (int i=0; i::iterator it = clientlist.begin(); - it != clientlist.end(); - it++) { - int i = *it; - - //cerr << "starting synthetic client" << i << " on rank " << myrank << std::endl; - syn[i]->start_thread(); - } - if (nclients) { - cerr << nclients << " clients at " << rank.my_addr << " " << hostname << "." << pid << std::endl; - } - - for (set::iterator it = clientlist.begin(); - it != clientlist.end(); - it++) { - int i = *it; - // cout << "waiting for synthetic client" << i << " to finish" << std::endl; - syn[i]->join_thread(); - // fix simpelmeessenger race first! 
- //delete syn[i]; - //delete client[i]; - } - - - if (myrank && !started) { - //dout(1) << "IDLE" << dendl; - cerr << "idle at " << rank.my_addr << " rank " << myrank << " " << hostname << "." << pid << std::endl; - } - - // wait for everything to finish - rank.wait(); - - cerr << "newsyn done on " << hostname << "." << pid << std::endl; - - // cd on exit, so that gmon.out (if any) goes into a separate directory for each node. - char s[20]; - sprintf(s, "gmon/%d", myrank); - mkdir(s, 0755); - chdir(s); - - return 0; // whatever, cleanup hangs sometimes (stopping ebofs threads?). - - // cleanup - for (map::iterator i = mds.begin(); i != mds.end(); i++) - delete i->second; - for (map::iterator i = mdsosd.begin(); i != mdsosd.end(); i++) - delete i->second; - for (map::iterator i = osd.begin(); i != osd.end(); i++) - delete i->second; - /* - for (map::iterator i = client.begin(); i != client.end(); i++) - delete i->second; - for (map::iterator i = syn.begin(); i != syn.end(); i++) - delete i->second; - */ - /* - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __FAKE_H -#define __FAKE_H - -#include "include/types.h" - -#include -#include -#include -using namespace std; -using namespace __gnu_cxx; - -class FakeStoreCollections { - private: - Mutex faker_lock; - ObjectStore *store; - hash_map > fakecollections; - - public: - FakeStoreCollections(ObjectStore *s) : store(s) {} - - // faked collections - int list_collections(list& ls) { - faker_lock.Lock(); - int r = 0; - for (hash_map< coll_t, set >::iterator p = fakecollections.begin(); - p != fakecollections.end(); - p++) { - r++; - ls.push_back(p->first); - } - faker_lock.Unlock(); - return r; - } - - int create_collection(coll_t c, - Context *onsafe=0) { - faker_lock.Lock(); - fakecollections[c].size(); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return 0; - } - - int destroy_collection(coll_t c, - Context *onsafe=0) { - int r = 0; - faker_lock.Lock(); - if (fakecollections.count(c)) { - fakecollections.erase(c); - //fakecattr.erase(c); - if (onsafe) store->sync(onsafe); - } else - r = -1; - faker_lock.Unlock(); - return r; - } - - int collection_stat(coll_t c, struct stat *st) { - return collection_exists(c) ? 
0:-1; - } - - bool collection_exists(coll_t c) { - faker_lock.Lock(); - int r = fakecollections.count(c); - faker_lock.Unlock(); - return r; - } - - int collection_add(coll_t c, object_t o, - Context *onsafe=0) { - faker_lock.Lock(); - fakecollections[c].insert(o); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return 0; - } - - int collection_remove(coll_t c, object_t o, - Context *onsafe=0) { - faker_lock.Lock(); - fakecollections[c].erase(o); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return 0; - } - - int collection_list(coll_t c, list& o) { - faker_lock.Lock(); - int r = 0; - for (set::iterator p = fakecollections[c].begin(); - p != fakecollections[c].end(); - p++) { - o.push_back(*p); - r++; - } - faker_lock.Unlock(); - return r; - } - -}; - -class FakeStoreAttrs { - private: - - class FakeAttrSet { - public: - map attrs; - - int getattr(const char *name, void *value, size_t size) { - string n = name; - if (attrs.count(n)) { - size_t l = MIN( attrs[n].length(), size ); - bufferlist bl; - bl.append(attrs[n]); - bl.copy(0, l, (char*)value); - return l; - } - return -1; - } - int getattrs(map& aset) { - aset = attrs; - return 0; - } - int setattrs(map& aset) { - attrs = aset; - return 0; - } - - int setattr(const char *name, const void *value, size_t size) { - string n = name; - bufferptr bp = buffer::copy((char*)value, size); - attrs[n] = bp; - return 0; - } - - int listattr(char *attrs, size_t size) { - assert(0); - return 0; - } - - int rmattr(const char *name) { - string n = name; - attrs.erase(n); - return 0; - } - - bool empty() { return attrs.empty(); } - }; - - Mutex faker_lock; - ObjectStore *store; - hash_map fakeoattrs; - hash_map fakecattrs; - - public: - FakeStoreAttrs(ObjectStore *s) : store(s) {} - - int setattr(object_t oid, const char *name, - const void *value, size_t size, - Context *onsafe=0) { - faker_lock.Lock(); - int r = fakeoattrs[oid].setattr(name, value, size); - if (onsafe) store->sync(onsafe); - 
faker_lock.Unlock(); - return r; - } - int setattrs(object_t oid, map& aset) { - faker_lock.Lock(); - int r = fakeoattrs[oid].setattrs(aset); - faker_lock.Unlock(); - return r; - } - int getattr(object_t oid, const char *name, - void *value, size_t size) { - faker_lock.Lock(); - int r = fakeoattrs[oid].getattr(name, value, size); - faker_lock.Unlock(); - return r; - } - int getattrs(object_t oid, map& aset) { - faker_lock.Lock(); - int r = fakeoattrs[oid].getattrs(aset); - faker_lock.Unlock(); - return r; - } - int rmattr(object_t oid, const char *name, - Context *onsafe=0) { - faker_lock.Lock(); - int r = fakeoattrs[oid].rmattr(name); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return r; - } - - int listattr(object_t oid, char *attrs, size_t size) { - faker_lock.Lock(); - int r = fakeoattrs[oid].listattr(attrs,size); - faker_lock.Unlock(); - return r; - } - - int collection_setattr(coll_t c, const char *name, - void *value, size_t size, - Context *onsafe=0) { - faker_lock.Lock(); - int r = fakecattrs[c].setattr(name, value, size); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return r; - } - int collection_setattrs(coll_t cid, map& aset) { - faker_lock.Lock(); - int r = fakecattrs[cid].setattrs(aset); - faker_lock.Unlock(); - return r; - } - int collection_getattrs(coll_t cid, map& aset) { - faker_lock.Lock(); - int r = fakecattrs[cid].getattrs(aset); - faker_lock.Unlock(); - return r; - } - int collection_rmattr(coll_t c, const char *name, - Context *onsafe=0) { - faker_lock.Lock(); - int r = fakecattrs[c].rmattr(name); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return r; - } - int collection_getattr(coll_t c, const char *name, - void *value, size_t size) { - faker_lock.Lock(); - int r = fakecattrs[c].getattr(name, value, size); - faker_lock.Unlock(); - return r; - } - int collection_listattr(coll_t c, char *attrs, size_t size) { - faker_lock.Lock(); - int r = fakecattrs[c].listattr(attrs,size); - 
faker_lock.Unlock(); - return r; - } - -}; - -#endif diff --git a/branches/sage/crush/osd/FakeStore.cc b/branches/sage/crush/osd/FakeStore.cc deleted file mode 100644 index e7c77f3eab558..0000000000000 --- a/branches/sage/crush/osd/FakeStore.cc +++ /dev/null @@ -1,742 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "FakeStore.h" -#include "include/types.h" - -#include "common/Timer.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifndef __CYGWIN__ -# include -#endif -//#include - -#ifdef DARWIN -#include -#include -#endif // DARWIN - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug) *_dout << dbeginl << g_clock.now() << " fakestore(" << basedir << ") " -#define derr(l) if (l<=g_conf.debug) *_derr << dbeginl << g_clock.now() << " fakestore(" << basedir << ") " - -#include "include/buffer.h" - -#include - - -// crap-a-crap hash -//#define HASH_DIRS 0x80 -//#define HASH_MASK 0x7f -// end crap hash - - - - -int FakeStore::statfs(struct statfs *buf) -{ - return ::statfs(basedir.c_str(), buf); -} - - -/* - * sorry, these are sentitive to the object_t and coll_t typing. 
- */ -void FakeStore::get_oname(object_t oid, char *s) -{ - //static hash H; - assert(sizeof(oid) == 16); -#ifdef __LP64__ - //sprintf(s, "%s/objects/%02lx/%016lx.%016lx", basedir.c_str(), H(oid) & HASH_MASK, - sprintf(s, "%s/objects/%016lx.%016lx", basedir.c_str(), - *((uint64_t*)&oid), - *(((uint64_t*)&oid) + 1)); -#else - //sprintf(s, "%s/objects/%02x/%016llx.%016llx", basedir.c_str(), H(oid) & HASH_MASK, - sprintf(s, "%s/objects/%016llx.%016llx", basedir.c_str(), - *((uint64_t*)&oid), - *(((uint64_t*)&oid) + 1)); -#endif -} - -void FakeStore::get_cdir(coll_t cid, char *s) -{ - assert(sizeof(cid) == 8); -#ifdef __LP64__ - sprintf(s, "%s/collections/%016lx", basedir.c_str(), - cid); -#else - sprintf(s, "%s/collections/%016llx", basedir.c_str(), - cid); -#endif -} - -void FakeStore::get_coname(coll_t cid, object_t oid, char *s) -{ - assert(sizeof(oid) == 16); -#ifdef __LP64__ - sprintf(s, "%s/collections/%016lx/%016lx.%016lx", basedir.c_str(), cid, - *((uint64_t*)&oid), - *(((uint64_t*)&oid) + 1)); -#else - sprintf(s, "%s/collections/%016llx/%016llx.%016llx", basedir.c_str(), cid, - *((uint64_t*)&oid), - *(((uint64_t*)&oid) + 1)); -#endif -} - - - - -int FakeStore::mkfs() -{ - char cmd[200]; - if (g_conf.fakestore_dev) { - dout(0) << "mounting" << dendl; - sprintf(cmd,"mount %s", g_conf.fakestore_dev); - system(cmd); - } - - dout(1) << "mkfs in " << basedir << dendl; - - // wipe - sprintf(cmd, "test -d %s && rm -r %s ; mkdir -p %s/collections && mkdir -p %s/objects", - basedir.c_str(), basedir.c_str(), basedir.c_str(), basedir.c_str()); - - dout(5) << "wipe: " << cmd << dendl; - system(cmd); - - // hashed bits too - /* - for (int i=0; i 0) bl.push_back( bptr ); // put it in the target bufferlist - } - ::flock(fd, LOCK_UN); - ::close(fd); - return got; -} - - -int FakeStore::write(object_t oid, - off_t offset, size_t len, - const bufferlist& bl, - Context *onsafe) -{ - char fn[200]; - get_oname(oid,fn); - - dout(20) << "write " << fn << " len " << len << " off " << 
offset << dendl; - - - ::mknod(fn, 0644, 0); // in case it doesn't exist yet. - - int flags = O_WRONLY;//|O_CREAT; - int fd = ::open(fn, flags); - if (fd < 0) { - derr(0) << "write couldn't open " << fn << " flags " << flags << " errno " << errno << " " << strerror(errno) << dendl; - return fd; - } - ::fchmod(fd, 0664); - ::flock(fd, LOCK_EX); // lock for safety - - // seek - off_t actual = ::lseek(fd, offset, SEEK_SET); - int did = 0; - assert(actual == offset); - - // write buffers - for (list::const_iterator it = bl.buffers().begin(); - it != bl.buffers().end(); - it++) { - int r = ::write(fd, (char*)(*it).c_str(), (*it).length()); - if (r > 0) - did += r; - else { - derr(0) << "couldn't write to " << fn << " len " << len << " off " << offset << " errno " << errno << " " << strerror(errno) << dendl; - } - } - - if (did < 0) { - derr(0) << "couldn't write to " << fn << " len " << len << " off " << offset << " errno " << errno << " " << strerror(errno) << dendl; - } - - ::flock(fd, LOCK_UN); - - // schedule sync - if (onsafe) sync(onsafe); - - ::close(fd); - - return did; -} - - -class C_FakeSync : public Context { - Context *c; - int *n; - Mutex *lock; - Cond *cond; - -public: - C_FakeSync(Context *c_, int *n_, Mutex *lo, Cond *co) : - c(c_), n(n_), - lock(lo), cond(co) { - lock->Lock(); - ++*n; - lock->Unlock(); - } - void finish(int r) { - c->finish(r); - - lock->Lock(); - --(*n); - if (*n == 0) cond->Signal(); - lock->Unlock(); - } -}; - -void FakeStore::sync() -{ - synclock.Lock(); - while (unsync > 0) { - dout(0) << "sync waiting for " << unsync << " items to (fake) sync" << dendl; - synccond.Wait(synclock); - } - synclock.Unlock(); -} - -void FakeStore::sync(Context *onsafe) -{ - if (g_conf.fakestore_fake_sync > 0.0) { - g_timer.add_event_after((float)g_conf.fakestore_fake_sync, - new C_FakeSync(onsafe, &unsync, &synclock, &synccond)); - - } else { - assert(0); // der..no implemented anymore - } -} - - -// ------------------------------- -// attributes - 
-// objects - -int FakeStore::setattr(object_t oid, const char *name, - const void *value, size_t size, - Context *onsafe) -{ - if (fake_attrs) return attrs.setattr(oid, name, value, size, onsafe); - - int r = 0; -#ifndef __CYGWIN__ - char fn[100]; - get_oname(oid, fn); - r = ::setxattr(fn, name, value, size, 0); -#endif - return r; -} - -int FakeStore::setattrs(object_t oid, map& aset) -{ - if (fake_attrs) return attrs.setattrs(oid, aset); - - char fn[100]; - get_oname(oid, fn); - int r = 0; -#ifndef __CYGWIN__ - for (map::iterator p = aset.begin(); - p != aset.end(); - ++p) { - r = ::setxattr(fn, p->first.c_str(), p->second.c_str(), p->second.length(), 0); - if (r < 0) { - cerr << "error setxattr " << strerror(errno) << std::endl; - break; - } - } -#endif - return r; -} - -int FakeStore::getattr(object_t oid, const char *name, - void *value, size_t size) -{ - if (fake_attrs) return attrs.getattr(oid, name, value, size); - int r = 0; -#ifndef __CYGWIN__ - char fn[100]; - get_oname(oid, fn); - r = ::getxattr(fn, name, value, size); -#endif - return r; -} - -int FakeStore::getattrs(object_t oid, map& aset) -{ - if (fake_attrs) return attrs.getattrs(oid, aset); - -#ifndef __CYGWIN__ - char fn[100]; - get_oname(oid, fn); - - char val[1000]; - char names[1000]; - int num = ::listxattr(fn, names, 1000); - - char *name = names; - for (int i=0; i& aset) -{ - if (fake_attrs) return attrs.collection_setattrs(cid, aset); - - char fn[100]; - get_cdir(cid, fn); - int r = 0; -#ifndef __CYGWIN__ - for (map::iterator p = aset.begin(); - p != aset.end(); - ++p) { - r = ::setxattr(fn, p->first.c_str(), p->second.c_str(), p->second.length(), 0); - if (r < 0) break; - } -#endif - return r; -} - -int FakeStore::collection_getattrs(coll_t cid, map& aset) -{ - if (fake_attrs) return attrs.collection_getattrs(cid, aset); - -#ifndef __CYGWIN__ - char fn[100]; - get_cdir(cid, fn); - - char val[1000]; - char names[1000]; - int num = ::listxattr(fn, names, 1000); - - char *name = names; - 
for (int i=0; i& ls) -{ - char fn[200]; - sprintf(fn, "%s/objects", basedir.c_str()); - - DIR *dir = ::opendir(fn); - assert(dir); - - struct dirent *de; - while ((de = ::readdir(dir)) != 0) { - if (de->d_name[0] == '.') continue; - // parse - object_t o; - assert(sizeof(o) == 16); - //cout << " got object " << de->d_name << std::endl; - *(((uint64_t*)&o) + 0) = strtoll(de->d_name, 0, 16); - assert(de->d_name[16] == '.'); - *(((uint64_t*)&o) + 1) = strtoll(de->d_name+17, 0, 16); - //dout(0) << " got " << o << " errno " << errno << " on " << de->d_name << dendl; - if (errno) continue; - ls.push_back(o); - } - - ::closedir(dir); - return 0; -} - - -// -------------------------- -// collections - -int FakeStore::list_collections(list& ls) -{ - if (fake_collections) return collections.list_collections(ls); - - char fn[200]; - sprintf(fn, "%s/collections", basedir.c_str()); - - DIR *dir = ::opendir(fn); - assert(dir); - - struct dirent *de; - while ((de = ::readdir(dir)) != 0) { - // parse - errno = 0; - coll_t c = strtoll(de->d_name, 0, 16); - if (c) ls.push_back(c); - } - - ::closedir(dir); - return 0; -} - -int FakeStore::create_collection(coll_t c, - Context *onsafe) -{ - if (fake_collections) return collections.create_collection(c, onsafe); - - char fn[200]; - get_cdir(c, fn); - - int r = ::mkdir(fn, 0755); - - if (onsafe) sync(onsafe); - return r; -} - -int FakeStore::destroy_collection(coll_t c, - Context *onsafe) -{ - if (fake_collections) return collections.destroy_collection(c, onsafe); - - char fn[200]; - get_cdir(c, fn); - char cmd[200]; - sprintf(cmd, "test -d %s && rm -r %s", fn, fn); - system(cmd); - - if (onsafe) sync(onsafe); - return 0; -} - -int FakeStore::collection_stat(coll_t c, struct stat *st) -{ - if (fake_collections) return collections.collection_stat(c, st); - - char fn[200]; - get_cdir(c, fn); - return ::lstat(fn, st); -} - -bool FakeStore::collection_exists(coll_t c) -{ - if (fake_collections) return collections.collection_exists(c); - - 
struct stat st; - return collection_stat(c, &st) == 0; -} - - -int FakeStore::collection_add(coll_t c, object_t o, - Context *onsafe) -{ - if (fake_collections) return collections.collection_add(c, o, onsafe); - - char cof[200]; - get_coname(c, o, cof); - char of[200]; - get_oname(o, of); - - int r = ::link(of, cof); - if (onsafe) sync(onsafe); - return r; -} - -int FakeStore::collection_remove(coll_t c, object_t o, - Context *onsafe) -{ - if (fake_collections) return collections.collection_remove(c, o, onsafe); - - char cof[200]; - get_coname(c, o, cof); - - int r = ::unlink(cof); - if (onsafe) sync(onsafe); - return r; -} - -int FakeStore::collection_list(coll_t c, list& ls) -{ - if (fake_collections) return collections.collection_list(c, ls); - - char fn[200]; - get_cdir(c, fn); - - DIR *dir = ::opendir(fn); - assert(dir); - - struct dirent *de; - while ((de = ::readdir(dir)) != 0) { - // parse - if (de->d_name[0] == '.') continue; - //cout << " got object " << de->d_name << std::endl; - object_t o; - assert(sizeof(o) == 16); - *(((uint64_t*)&o) + 0) = strtoll(de->d_name, 0, 16); - assert(de->d_name[16] == '.'); - *(((uint64_t*)&o) + 1) = strtoll(de->d_name+17, 0, 16); - dout(0) << " got " << o << " errno " << errno << " on " << de->d_name << dendl; - if (errno) continue; - ls.push_back(o); - } - - ::closedir(dir); - return 0; -} - -// eof. diff --git a/branches/sage/crush/osd/FakeStore.h b/branches/sage/crush/osd/FakeStore.h deleted file mode 100644 index 5828c27c14d96..0000000000000 --- a/branches/sage/crush/osd/FakeStore.h +++ /dev/null @@ -1,114 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __FAKESTORE_H -#define __FAKESTORE_H - -#include "ObjectStore.h" -#include "common/ThreadPool.h" -#include "common/Mutex.h" - -#include "Fake.h" -//#include "FakeStoreBDBCollections.h" - - -#include -using namespace std; - -#include -using namespace __gnu_cxx; - - -// fake attributes in memory, if we need to. - -class FakeStore : public ObjectStore { - string basedir; - - Mutex synclock; - Cond synccond; - int unsync; - - // fake attrs? - FakeStoreAttrs attrs; - bool fake_attrs; - - // fake collections? - FakeStoreCollections collections; - bool fake_collections; - - // helper fns - void get_oname(object_t oid, char *s); - void get_cdir(coll_t cid, char *s); - void get_coname(coll_t cid, object_t oid, char *s); - - public: - FakeStore(char *base) : - basedir(base), - unsync(0), - attrs(this), fake_attrs(false), - collections(this), fake_collections(false) { } - - int mount(); - int umount(); - int mkfs(); - - int statfs(struct statfs *buf); - - // ------------------ - // objects - int pick_object_revision_lt(object_t& oid) { - return 0; - } - bool exists(object_t oid); - int stat(object_t oid, struct stat *st); - int remove(object_t oid, Context *onsafe); - int truncate(object_t oid, off_t size, Context *onsafe); - int read(object_t oid, off_t offset, size_t len, bufferlist& bl); - int write(object_t oid, off_t offset, size_t len, const bufferlist& bl, Context *onsafe); - - void sync(); - void sync(Context *onsafe); - - int list_objects(list& ls); - - // attrs - int setattr(object_t oid, const char *name, const void *value, size_t size, Context *onsafe=0); - int setattrs(object_t oid, map& aset); - int getattr(object_t oid, const char *name, void *value, size_t size); - int getattrs(object_t oid, map& aset); - int rmattr(object_t oid, const char *name, Context *onsafe=0); - //int listattr(object_t oid, char *attrs, size_t size); - int collection_setattr(coll_t c, const char *name, void *value, size_t size, Context *onsafe=0); - int 
collection_rmattr(coll_t c, const char *name, Context *onsafe=0); - int collection_getattr(coll_t c, const char *name, void *value, size_t size); - //int collection_listattr(coll_t c, char *attrs, size_t size); - int collection_getattrs(coll_t cid, map &aset); - int collection_setattrs(coll_t cid, map &aset); - - // collections - int list_collections(list& ls); - int create_collection(coll_t c, Context *onsafe=0); - int destroy_collection(coll_t c, Context *onsafe=0); - int collection_stat(coll_t c, struct stat *st); - bool collection_exists(coll_t c); - int collection_add(coll_t c, object_t o, Context *onsafe=0); - int collection_remove(coll_t c, object_t o, Context *onsafe=0); - int collection_list(coll_t c, list& o); - - - -}; - -#endif diff --git a/branches/sage/crush/osd/OSD.cc b/branches/sage/crush/osd/OSD.cc deleted file mode 100644 index ab57f0c603302..0000000000000 --- a/branches/sage/crush/osd/OSD.cc +++ /dev/null @@ -1,2377 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "include/types.h" - -#include "OSD.h" -#include "OSDMap.h" - -#include "FakeStore.h" - -#include "ebofs/Ebofs.h" - -#ifdef USE_OSBDB -#include "osbdb/OSBDB.h" -#endif // USE_OSBDB - - -#include "ReplicatedPG.h" -//#include "RAID4PG.h" - -#include "Ager.h" - - -#include "msg/Messenger.h" -#include "msg/Message.h" - -#include "messages/MGenericMessage.h" -#include "messages/MPing.h" -#include "messages/MPingAck.h" -#include "messages/MOSDPing.h" -#include "messages/MOSDFailure.h" -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" -#include "messages/MOSDBoot.h" -#include "messages/MOSDIn.h" -#include "messages/MOSDOut.h" - -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" -#include "messages/MOSDPGNotify.h" -#include "messages/MOSDPGQuery.h" -#include "messages/MOSDPGLog.h" -#include "messages/MOSDPGRemove.h" -#include "messages/MOSDPGActivateSet.h" - -#include "messages/MPGStats.h" - -#include "common/Logger.h" -#include "common/LogType.h" -#include "common/Timer.h" -#include "common/ThreadPool.h" - -#include -#include -#include -#include - - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) *_dout << dbeginl << g_clock.now() << " osd" << whoami << " " << (osdmap ? osdmap->get_epoch():0) << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) *_derr << dbeginl << g_clock.now() << " osd" << whoami << " " << (osdmap ? 
osdmap->get_epoch():0) << " " - -char *osd_base_path = "./osddata"; -char *ebofs_base_path = "./dev"; - -static const object_t SUPERBLOCK_OBJECT(0,0); - -// force remount hack for performance testing FakeStore -class C_Remount : public Context { - OSD *osd; -public: - C_Remount(OSD *o) : osd(o) {} - void finish(int) { - osd->force_remount(); - } -}; - -void OSD::force_remount() -{ - dout(0) << "forcing remount" << dendl; - osd_lock.Lock(); - { - store->umount(); - store->mount(); - } - osd_lock.Unlock(); - dout(0) << "finished remount" << dendl; -} -// - - -// cons/des - -LogType osd_logtype; - -OSD::OSD(int id, Messenger *m, MonMap *mm, char *dev) : - timer(osd_lock), - stat_oprate(5.0), - read_latency_calc(g_conf.osd_max_opq<1 ? 1:g_conf.osd_max_opq), - qlen_calc(3), - iat_averager(g_conf.osd_flash_crowd_iat_alpha) -{ - whoami = id; - messenger = m; - monmap = mm; - - osdmap = 0; - boot_epoch = 0; - - last_tid = 0; - num_pulling = 0; - - state = STATE_BOOTING; - - stat_ops = 0; - stat_qlen = 0; - stat_rd_ops = stat_rd_ops_shed_in = stat_rd_ops_shed_out = 0; - stat_rd_ops_in_queue = 0; - - pending_ops = 0; - waiting_for_no_ops = false; - - if (g_conf.osd_remount_at) - timer.add_event_after(g_conf.osd_remount_at, new C_Remount(this)); - - - // init object store - // try in this order: - // dev/osd$num - // dev/osd.$hostname - // dev/osd.all - - if (dev) { - strcpy(dev_path,dev); - } else { - char hostname[100]; - hostname[0] = 0; - gethostname(hostname,100); - - sprintf(dev_path, "%s/osd%d", ebofs_base_path, whoami); - - struct stat sta; - if (::lstat(dev_path, &sta) != 0) - sprintf(dev_path, "%s/osd.%s", ebofs_base_path, hostname); - - if (::lstat(dev_path, &sta) != 0) - sprintf(dev_path, "%s/osd.all", ebofs_base_path); - } - - if (g_conf.ebofs) { - store = new Ebofs(dev_path); - //store->_fake_writes(true); - } -#ifdef USE_OSBDB - else if (g_conf.bdbstore) { - store = new OSBDB(dev_path); - } -#endif // USE_OSBDB - else { - sprintf(dev_path, "osddata/osd%d", 
whoami); - store = new FakeStore(dev_path); - } - -} - -OSD::~OSD() -{ - if (threadpool) { delete threadpool; threadpool = 0; } - if (osdmap) { delete osdmap; osdmap = 0; } - //if (monitor) { delete monitor; monitor = 0; } - if (messenger) { delete messenger; messenger = 0; } - if (logger) { delete logger; logger = 0; } - if (store) { delete store; store = 0; } -} - -int OSD::init() -{ - Mutex::Locker lock(osd_lock); - - // mkfs? - if (g_conf.osd_mkfs) { - dout(2) << "mkfs on local store" << dendl; - if (store->mkfs() < 0) - return -1; - - // make up a superblock - //superblock.fsid = ???; - superblock.whoami = whoami; - } - - // mount. - dout(2) << "mounting " << dev_path << dendl; - int r = store->mount(); - if (r < 0) return -1; - - if (g_conf.osd_mkfs) { - // age? - if (g_conf.osd_age_time != 0) { - dout(2) << "age" << dendl; - Ager ager(store); - if (g_conf.osd_age_time < 0) - ager.load_freelist(); - else - ager.age(g_conf.osd_age_time, - g_conf.osd_age, - g_conf.osd_age - .05, - 50000, - g_conf.osd_age - .05); - } - - if (g_conf.osd_auto_weight) { - // benchmark - bufferlist bl; - bufferptr bp(1048576); - bp.zero(); - bl.push_back(bp); - utime_t start = g_clock.now(); - for (int i=0; i<1000; i++) - store->write(object_t(999,i), 0, bl.length(), bl, 0); - store->sync(); - utime_t end = g_clock.now(); - end -= start; - dout(0) << "measured " << (1000.0 / (double)end) << " mb/sec" << dendl; - for (int i=0; i<1000; i++) - store->remove(object_t(999,i), 0); - - // set osd weight - superblock.weight = (1000.0 / (double)end); - } - } - else { - dout(2) << "boot" << dendl; - - // read superblock - read_superblock(); - - // load up pgs (as they previously existed) - load_pgs(); - - dout(2) << "superblock: i am osd" << superblock.whoami << dendl; - assert(whoami == superblock.whoami); - } - - - - - // log - char name[80]; - sprintf(name, "osd%d", whoami); - logger = new Logger(name, (LogType*)&osd_logtype); - osd_logtype.add_set("opq"); - osd_logtype.add_inc("op"); - 
osd_logtype.add_inc("c_rd"); - osd_logtype.add_inc("c_rdb"); - osd_logtype.add_inc("c_wr"); - osd_logtype.add_inc("c_wrb"); - - osd_logtype.add_inc("r_push"); - osd_logtype.add_inc("r_pushb"); - osd_logtype.add_inc("r_wr"); - osd_logtype.add_inc("r_wrb"); - - osd_logtype.add_set("qlen"); - osd_logtype.add_set("rqlen"); - osd_logtype.add_set("rdlat"); - osd_logtype.add_set("rdlatm"); - osd_logtype.add_set("fshdin"); - osd_logtype.add_set("fshdout"); - osd_logtype.add_inc("shdout"); - osd_logtype.add_inc("shdin"); - - osd_logtype.add_set("loadavg"); - - osd_logtype.add_inc("rlsum"); - osd_logtype.add_inc("rlnum"); - - osd_logtype.add_set("numpg"); - osd_logtype.add_set("pingset"); - - osd_logtype.add_set("buf"); - - osd_logtype.add_inc("map"); - osd_logtype.add_inc("mapi"); - osd_logtype.add_inc("mapidup"); - osd_logtype.add_inc("mapf"); - osd_logtype.add_inc("mapfdup"); - - // request thread pool - { - char name[80]; - sprintf(name,"osd%d.threadpool", whoami); - threadpool = new ThreadPool(name, g_conf.osd_maxthreads, - static_dequeueop, - this); - } - - // i'm ready! - messenger->set_dispatcher(this); - - // announce to monitor i exist and have booted. 
- int mon = monmap->pick_mon(); - messenger->send_message(new MOSDBoot(messenger->get_myinst(), superblock), monmap->get_inst(mon)); - - // start the heart - timer.add_event_after(g_conf.osd_heartbeat_interval, new C_Heartbeat(this)); - - // and stat beacon - timer.add_event_after(g_conf.osd_pg_stats_interval, new C_Stats(this)); - - return 0; -} - -int OSD::shutdown() -{ - dout(1) << "shutdown" << dendl; - - state = STATE_STOPPING; - - // cancel timers - timer.cancel_all(); - timer.join(); - - // finish ops - wait_for_no_ops(); - - // stop threads - delete threadpool; - threadpool = 0; - - // close pgs - for (hash_map::iterator p = pg_map.begin(); - p != pg_map.end(); - p++) { - delete p->second; - } - pg_map.clear(); - - // shut everything else down - //monitor->shutdown(); - messenger->shutdown(); - - osd_lock.Unlock(); - int r = store->umount(); - osd_lock.Lock(); - return r; -} - - - -void OSD::write_superblock(ObjectStore::Transaction& t) -{ - dout(10) << "write_superblock " << superblock << dendl; - - bufferlist bl; - bl.append((char*)&superblock, sizeof(superblock)); - t.write(SUPERBLOCK_OBJECT, 0, sizeof(superblock), bl); -} - -int OSD::read_superblock() -{ - bufferlist bl; - int r = store->read(SUPERBLOCK_OBJECT, 0, sizeof(superblock), bl); - if (bl.length() != sizeof(superblock)) { - dout(10) << "read_superblock failed, r = " << r << ", i got " << bl.length() << " bytes, not " << sizeof(superblock) << dendl; - return -1; - } - - bl.copy(0, sizeof(superblock), (char*)&superblock); - - dout(10) << "read_superblock " << superblock << dendl; - - // load up "current" osdmap - assert(!osdmap); - osdmap = new OSDMap; - bl.clear(); - get_map_bl(superblock.current_epoch, bl); - osdmap->decode(bl); - - assert(whoami == superblock.whoami); // fixme! 
- return 0; -} - - - - - -// ====================================================== -// PG's - -PG *OSD::_new_lock_pg(pg_t pgid) -{ - // create - PG *pg; - if (pgid.is_rep()) - pg = new ReplicatedPG(this, pgid); - //else if (pgid.is_raid4()) - //pg = new RAID4PG(this, pgid); - else - assert(0); - - assert(pg_map.count(pgid) == 0); - pg_map[pgid] = pg; - - pg->lock(); // always lock. - pg->get(); // because it's in pg_map - return pg; -} - - -PG *OSD::_create_lock_pg(pg_t pgid, ObjectStore::Transaction& t) -{ - dout(10) << "_create_lock_pg " << pgid << dendl; - - if (pg_map.count(pgid)) - dout(0) << "_create_lock_pg on " << pgid << ", already have " << *pg_map[pgid] << dendl; - - // open - PG *pg = _new_lock_pg(pgid); - - // create collection - assert(!store->collection_exists(pgid)); - t.create_collection(pgid); - - return pg; -} - -bool OSD::_have_pg(pg_t pgid) -{ - return pg_map.count(pgid); -} - -PG *OSD::_lookup_lock_pg(pg_t pgid) -{ - assert(pg_map.count(pgid)); - PG *pg = pg_map[pgid]; - pg->lock(); - return pg; -} - - -void OSD::_remove_unlock_pg(PG *pg) -{ - pg_t pgid = pg->info.pgid; - - dout(10) << "_remove_unlock_pg " << pgid << dendl; - - // remove from store - list olist; - store->collection_list(pgid, olist); - - ObjectStore::Transaction t; - { - for (list::iterator p = olist.begin(); - p != olist.end(); - p++) - t.remove(*p); - t.remove_collection(pgid); - t.remove(pgid.to_object()); // log too - } - store->apply_transaction(t); - - // mark deleted - pg->mark_deleted(); - - // remove from map - pg_map.erase(pgid); - - // unlock, and probably delete - pg->put_unlock(); // will delete, if last reference -} - - -void OSD::try_create_pg(pg_t pgid, ObjectStore::Transaction& t) -{ - vector acting; - int nrep = osdmap->pg_to_acting_osds(pgid, acting); - int role = osdmap->calc_pg_role(whoami, acting, nrep); - if (role < 0) return; - - PG *pg = _create_lock_pg(pgid, t); - pg->set_role(role); - pg->acting.swap(acting); - pg->last_epoch_started_any = - 
pg->info.last_epoch_started = - pg->info.history.same_since = - pg->info.history.same_primary_since = - pg->info.history.same_acker_since = osdmap->get_epoch(); - pg->write_log(t); - if (g_conf.osd_hack_fast_startup) - pg->activate(t); - - dout(7) << "created " << *pg << dendl; - pg->unlock(); -} - -void OSD::load_pgs() -{ - dout(10) << "load_pgs" << dendl; - assert(pg_map.empty()); - - list ls; - store->list_collections(ls); - - for (list::iterator it = ls.begin(); - it != ls.end(); - it++) { - pg_t pgid = *it; - PG *pg = _new_lock_pg(pgid); - - // read pg info - store->collection_getattr(pgid, "info", &pg->info, sizeof(pg->info)); - - // read pg log - pg->read_log(store); - - // generate state for current mapping - int nrep = osdmap->pg_to_acting_osds(pgid, pg->acting); - int role = osdmap->calc_pg_role(whoami, pg->acting, nrep); - pg->set_role(role); - - dout(10) << "load_pgs loaded " << *pg << " " << pg->log << dendl; - pg->unlock(); - } -} - - - -/** - * check epochs starting from start to verify the pg acting set hasn't changed - * up until now - */ -void OSD::project_pg_history(pg_t pgid, PG::Info::History& h, epoch_t from, - vector& last) -{ - dout(15) << "project_pg_history " << pgid - << " from " << from << " to " << osdmap->get_epoch() - << ", start " << h - << dendl; - - for (epoch_t e = osdmap->get_epoch()-1; - e >= from; - e--) { - // verify during intermediate epoch - OSDMap oldmap; - get_map(e, oldmap); - - vector acting; - oldmap.pg_to_acting_osds(pgid, acting); - - // acting set change? - if (acting != last && - e > h.same_since) { - dout(15) << "project_pg_history " << pgid << " changed in " << e+1 - << " from " << acting << " -> " << last << dendl; - h.same_since = e+1; - } - - // primary change? 
- if (!(!acting.empty() && !last.empty() && acting[0] == last[0]) && - e > h.same_primary_since) { - dout(15) << "project_pg_history " << pgid << " primary changed in " << e+1 << dendl; - h.same_primary_since = e+1; - - if (g_conf.osd_rep == OSD_REP_PRIMARY) - h.same_acker_since = h.same_primary_since; - } - - // acker change? - if (g_conf.osd_rep != OSD_REP_PRIMARY) { - if (!(!acting.empty() && !last.empty() && acting[acting.size()-1] == last[last.size()-1]) && - e > h.same_acker_since) { - dout(15) << "project_pg_history " << pgid << " acker changed in " << e+1 << dendl; - h.same_acker_since = e+1; - } - } - - if (h.same_since > e && - h.same_primary_since > e && - h.same_acker_since > e) break; - } - - dout(15) << "project_pg_history end " << h << dendl; -} - -void OSD::activate_pg(pg_t pgid, epoch_t epoch) -{ - osd_lock.Lock(); - { - if (pg_map.count(pgid)) { - PG *pg = _lookup_lock_pg(pgid); - if (pg->is_crashed() && - pg->is_replay() && - pg->get_role() == 0 && - pg->info.history.same_primary_since <= epoch) { - ObjectStore::Transaction t; - pg->activate(t); - store->apply_transaction(t); - } - pg->unlock(); - } - } - - // finishers? - finished_lock.Lock(); - if (finished.empty()) { - finished_lock.Unlock(); - osd_lock.Unlock(); - } else { - list waiting; - waiting.splice(waiting.begin(), finished); - - finished_lock.Unlock(); - osd_lock.Unlock(); - - for (list::iterator it = waiting.begin(); - it != waiting.end(); - it++) { - dispatch(*it); - } - } -} - - -// ------------------------------------- - -void OSD::_refresh_my_stat(utime_t now) -{ - assert(peer_stat_lock.is_locked()); - - // refresh? - if (now - my_stat.stamp > g_conf.osd_stat_refresh_interval || - pending_ops > 2*my_stat.qlen) { - - my_stat.stamp = now; - my_stat.oprate = stat_oprate.get(now); - - //read_latency_calc.set_size( 20 ); // hrm. 
- - // qlen - my_stat.qlen = 0; - if (stat_ops) my_stat.qlen = (float)stat_qlen / (float)stat_ops; //get_average(); - - // rd ops shed in - float frac_rd_ops_shed_in = 0; - float frac_rd_ops_shed_out = 0; - if (stat_rd_ops) { - frac_rd_ops_shed_in = (float)stat_rd_ops_shed_in / (float)stat_rd_ops; - frac_rd_ops_shed_out = (float)stat_rd_ops_shed_out / (float)stat_rd_ops; - } - my_stat.frac_rd_ops_shed_in = (my_stat.frac_rd_ops_shed_in + frac_rd_ops_shed_in) / 2.0; - my_stat.frac_rd_ops_shed_out = (my_stat.frac_rd_ops_shed_out + frac_rd_ops_shed_out) / 2.0; - - // recent_qlen - qlen_calc.add(my_stat.qlen); - my_stat.recent_qlen = qlen_calc.get_average(); - - // read latency - if (stat_rd_ops) { - my_stat.read_latency = read_latency_calc.get_average(); - if (my_stat.read_latency < 0) my_stat.read_latency = 0; - } else { - my_stat.read_latency = 0; - } - - my_stat.read_latency_mine = my_stat.read_latency * (1.0 - frac_rd_ops_shed_in); - - logger->fset("qlen", my_stat.qlen); - logger->fset("rqlen", my_stat.recent_qlen); - logger->fset("rdlat", my_stat.read_latency); - logger->fset("rdlatm", my_stat.read_latency_mine); - logger->fset("fshdin", my_stat.frac_rd_ops_shed_in); - logger->fset("fshdout", my_stat.frac_rd_ops_shed_out); - dout(12) << "_refresh_my_stat " << my_stat << dendl; - - stat_rd_ops = 0; - stat_rd_ops_shed_in = 0; - stat_rd_ops_shed_out = 0; - stat_ops = 0; - stat_qlen = 0; - } -} - -osd_peer_stat_t OSD::get_my_stat_for(utime_t now, int peer) -{ - Mutex::Locker lock(peer_stat_lock); - _refresh_my_stat(now); - my_stat_on_peer[peer] = my_stat; - return my_stat; -} - -void OSD::take_peer_stat(int peer, const osd_peer_stat_t& stat) -{ - Mutex::Locker lock(peer_stat_lock); - dout(10) << "take_peer_stat peer osd" << peer << " " << stat << dendl; - peer_stat[peer] = stat; -} - -void OSD::heartbeat() -{ - utime_t now = g_clock.now(); - utime_t since = now; - since.sec_ref() -= g_conf.osd_heartbeat_interval; - - // get CPU load avg - ifstream in("/proc/loadavg"); 
- if (in.is_open()) { - float oneminavg; - in >> oneminavg; - logger->fset("loadavg", oneminavg); - in.close(); - } - - // calc my stats - Mutex::Locker lock(peer_stat_lock); - _refresh_my_stat(now); - - dout(5) << "heartbeat: " << my_stat << dendl; - - //load_calc.set_size(stat_ops); - - // send pings - set pingset; - for (hash_map::iterator i = pg_map.begin(); - i != pg_map.end(); - i++) { - PG *pg = i->second; - - // we want to ping the primary. - if (pg->get_role() <= 0) continue; - if (pg->acting.size() < 1) continue; - - if (pg->last_heartbeat < since) { - pg->last_heartbeat = now; - pingset.insert(pg->acting[0]); - } - } - my_stat_on_peer.clear(); - for (set::iterator i = pingset.begin(); - i != pingset.end(); - i++) { - _share_map_outgoing( osdmap->get_inst(*i) ); - my_stat_on_peer[*i] = my_stat; - messenger->send_message(new MOSDPing(osdmap->get_epoch(), my_stat), - osdmap->get_inst(*i)); - } - - if (logger) logger->set("pingset", pingset.size()); - - // hack: fake reorg? - if (osdmap && g_conf.fake_osdmap_updates) { - int mon = monmap->pick_mon(); - if ((rand() % g_conf.fake_osdmap_updates) == 0) { - //if ((rand() % (g_conf.num_osd / g_conf.fake_osdmap_updates)) == whoami / g_conf.fake_osdmap_updates) { - messenger->send_message(new MOSDIn(osdmap->get_epoch()), - monmap->get_inst(mon)); - } - /* - if (osdmap->is_out(whoami)) { - messenger->send_message(new MOSDIn(osdmap->get_epoch()), - MSG_ADDR_MON(mon), monmap->get_inst(mon)); - } - else if ((rand() % g_conf.fake_osdmap_updates) == 0) { - //messenger->send_message(new MOSDOut(osdmap->get_epoch()), - //MSG_ADDR_MON(mon), monmap->get_inst(mon)); - } - } - */ - } - - // schedule next! randomly. 
- float wait = .5 + ((float)(rand() % 10)/10.0) * (float)g_conf.osd_heartbeat_interval; - timer.add_event_after(wait, new C_Heartbeat(this)); -} - - - -void OSD::send_pg_stats() -{ - //dout(-10) << "send_pg_stats" << dendl; - - // grab queue - set q; - pg_stat_queue_lock.Lock(); - q.swap(pg_stat_queue); - pg_stat_queue_lock.Unlock(); - - if (!q.empty()) { - dout(1) << "send_pg_stats - " << q.size() << " pgs updated" << dendl; - - MPGStats *m = new MPGStats; - while (!q.empty()) { - pg_t pgid = *q.begin(); - q.erase(q.begin()); - - if (!pg_map.count(pgid)) continue; - PG *pg = pg_map[pgid]; - pg->pg_stats_lock.Lock(); - m->pg_stat[pgid] = pg->pg_stats; - dout(20) << " sending " << pgid << " " << pg->pg_stats.state << dendl; - pg->pg_stats_lock.Unlock(); - } - - // fill in osd stats too - struct statfs stbuf; - store->statfs(&stbuf); - m->osd_stat.num_blocks = stbuf.f_blocks; - m->osd_stat.num_blocks_avail = stbuf.f_bavail; - m->osd_stat.num_objects = stbuf.f_files; - - int mon = monmap->pick_mon(); - messenger->send_message(m, monmap->get_inst(mon)); - } - - // reschedule - timer.add_event_after(g_conf.osd_pg_stats_interval, new C_Stats(this)); -} - - - - -// -------------------------------------- -// dispatch - -bool OSD::_share_map_incoming(const entity_inst_t& inst, epoch_t epoch) -{ - bool shared = false; - dout(20) << "_share_map_incoming " << inst << " " << epoch << dendl; - assert(osd_lock.is_locked()); - - // does client have old map? - if (inst.name.is_client()) { - if (epoch < osdmap->get_epoch()) { - dout(10) << inst.name << " has old map " << epoch << " < " << osdmap->get_epoch() << dendl; - send_incremental_map(epoch, inst, true); - shared = true; - } - } - - // does peer have old map? - if (inst.name.is_osd()) { - // remember - if (peer_map_epoch[inst.name] < epoch) { - dout(20) << "peer " << inst.name << " has " << epoch << dendl; - peer_map_epoch[inst.name] = epoch; - } - - // older? 
- if (peer_map_epoch[inst.name] < osdmap->get_epoch()) { - dout(10) << inst.name << " has old map " << epoch << " < " << osdmap->get_epoch() << dendl; - send_incremental_map(epoch, inst, true); - peer_map_epoch[inst.name] = osdmap->get_epoch(); // so we don't send it again. - shared = true; - } - } - - return shared; -} - - -void OSD::_share_map_outgoing(const entity_inst_t& inst) -{ - assert(inst.name.is_osd()); - - if (inst.name.is_osd()) { - // send map? - if (peer_map_epoch.count(inst.name)) { - epoch_t pe = peer_map_epoch[inst.name]; - if (pe < osdmap->get_epoch()) { - send_incremental_map(pe, inst, true); - peer_map_epoch[inst.name] = osdmap->get_epoch(); - } - } else { - // no idea about peer's epoch. - // ??? send recent ??? - // do nothing. - } - } -} - - - -void OSD::dispatch(Message *m) -{ - // lock! - osd_lock.Lock(); - dout(20) << "dispatch " << m << dendl; - - switch (m->get_type()) { - - // -- don't need lock -- - case MSG_PING: - dout(10) << "ping from " << m->get_source() << dendl; - delete m; - break; - - // -- don't need OSDMap -- - - // map and replication - case MSG_OSD_MAP: - handle_osd_map((MOSDMap*)m); - break; - - // osd - case MSG_SHUTDOWN: - shutdown(); - delete m; - break; - - - - // -- need OSDMap -- - - default: - { - // no map? starting up? - if (!osdmap) { - dout(7) << "no OSDMap, not booted" << dendl; - waiting_for_osdmap.push_back(m); - break; - } - - // down? - if (osdmap->is_down(whoami)) { - dout(7) << "i am marked down, dropping " << *m << dendl; - delete m; - break; - } - - - - - // need OSDMap - switch (m->get_type()) { - - case MSG_OSD_PING: - // take note. 
- handle_osd_ping((MOSDPing*)m); - break; - - case MSG_OSD_PG_NOTIFY: - handle_pg_notify((MOSDPGNotify*)m); - break; - case MSG_OSD_PG_QUERY: - handle_pg_query((MOSDPGQuery*)m); - break; - case MSG_OSD_PG_LOG: - handle_pg_log((MOSDPGLog*)m); - break; - case MSG_OSD_PG_REMOVE: - handle_pg_remove((MOSDPGRemove*)m); - break; - case MSG_OSD_PG_ACTIVATE_SET: - handle_pg_activate_set((MOSDPGActivateSet*)m); - break; - - case MSG_OSD_OP: - handle_op((MOSDOp*)m); - break; - - // for replication etc. - case MSG_OSD_OPREPLY: - handle_op_reply((MOSDOpReply*)m); - break; - - - default: - dout(1) << " got unknown message " << m->get_type() << dendl; - assert(0); - } - } - } - - // finishers? - finished_lock.Lock(); - if (!finished.empty()) { - list waiting; - waiting.splice(waiting.begin(), finished); - - finished_lock.Unlock(); - osd_lock.Unlock(); - - while (!waiting.empty()) { - dout(20) << "doing finished " << waiting.front() << dendl; - dispatch(waiting.front()); - waiting.pop_front(); - } - return; - } - - finished_lock.Unlock(); - osd_lock.Unlock(); -} - - -void OSD::ms_handle_failure(Message *m, const entity_inst_t& inst) -{ - entity_name_t dest = inst.name; - - if (g_conf.ms_die_on_failure) { - dout(0) << "ms_handle_failure " << inst << " on " << *m << dendl; - exit(0); - } - - if (is_stopping()) { - delete m; - return; - } - - if (dest.is_osd()) { - // failed osd. drop message, report to mon. - int mon = monmap->pick_mon(); - dout(1) << "ms_handle_failure " << inst - << ", dropping and reporting to mon" << mon - << " " << *m - << dendl; - messenger->send_message(new MOSDFailure(messenger->get_myinst(), inst, osdmap->get_epoch()), - monmap->get_inst(mon)); - delete m; - } else if (dest.is_mon()) { - // resend to a different monitor. - int mon = monmap->pick_mon(true); - dout(1) << "ms_handle_failure " << inst - << ", resending to mon" << mon - << " " << *m - << dendl; - messenger->send_message(m, monmap->get_inst(mon)); - } - else { - // client? 
- dout(1) << "ms_handle_failure " << inst - << ", dropping " << *m << dendl; - delete m; - } -} - - - - -void OSD::handle_osd_ping(MOSDPing *m) -{ - dout(20) << "osdping from " << m->get_source() << " got stat " << m->peer_stat << dendl; - - _share_map_incoming(m->get_source_inst(), ((MOSDPing*)m)->map_epoch); - - int from = m->get_source().num(); - take_peer_stat(from, m->peer_stat); - - delete m; -} - - - - -// ===================================================== -// MAP - -void OSD::wait_for_new_map(Message *m) -{ - // ask - if (waiting_for_osdmap.empty()) { - int mon = monmap->pick_mon(); - messenger->send_message(new MOSDGetMap(osdmap->get_epoch()+1), - monmap->get_inst(mon)); - } - - waiting_for_osdmap.push_back(m); -} - - -/** update_map - * assimilate new OSDMap(s). scan pgs, etc. - */ -void OSD::handle_osd_map(MOSDMap *m) -{ - wait_for_no_ops(); - - assert(osd_lock.is_locked()); - - ObjectStore::Transaction t; - - if (osdmap) { - dout(3) << "handle_osd_map epochs [" - << m->get_first() << "," << m->get_last() - << "], i have " << osdmap->get_epoch() - << dendl; - } else { - dout(3) << "handle_osd_map epochs [" - << m->get_first() << "," << m->get_last() - << "], i have none" - << dendl; - osdmap = new OSDMap; - boot_epoch = m->get_last(); // hrm...? - } - - logger->inc("mapmsg"); - - // store them? - for (map::iterator p = m->maps.begin(); - p != m->maps.end(); - p++) { - object_t oid = get_osdmap_object_name(p->first); - if (store->exists(oid)) { - dout(10) << "handle_osd_map already had full map epoch " << p->first << dendl; - logger->inc("mapfdup"); - bufferlist bl; - get_map_bl(p->first, bl); - dout(10) << " .. it is " << bl.length() << " bytes" << dendl; - continue; - } - - dout(10) << "handle_osd_map got full map epoch " << p->first << dendl; - store->write(oid, 0, p->second.length(), p->second, 0); // store _outside_ transaction; activate_map reads it. 
- - if (p->first > superblock.newest_map) - superblock.newest_map = p->first; - if (p->first < superblock.oldest_map || - superblock.oldest_map == 0) - superblock.oldest_map = p->first; - - logger->inc("mapf"); - } - for (map::iterator p = m->incremental_maps.begin(); - p != m->incremental_maps.end(); - p++) { - object_t oid = get_inc_osdmap_object_name(p->first); - if (store->exists(oid)) { - dout(10) << "handle_osd_map already had incremental map epoch " << p->first << dendl; - logger->inc("mapidup"); - bufferlist bl; - get_inc_map_bl(p->first, bl); - dout(10) << " .. it is " << bl.length() << " bytes" << dendl; - continue; - } - - dout(10) << "handle_osd_map got incremental map epoch " << p->first << dendl; - store->write(oid, 0, p->second.length(), p->second, 0); // store _outside_ transaction; activate_map reads it. - - if (p->first > superblock.newest_map) - superblock.newest_map = p->first; - if (p->first < superblock.oldest_map || - superblock.oldest_map == 0) - superblock.oldest_map = p->first; - - logger->inc("mapi"); - } - - // advance if we can - bool advanced = false; - - epoch_t cur = superblock.current_epoch; - while (cur < superblock.newest_map) { - dout(10) << "cur " << cur << " < newest " << superblock.newest_map << dendl; - - if (m->incremental_maps.count(cur+1) || - store->exists(get_inc_osdmap_object_name(cur+1))) { - dout(10) << "handle_osd_map decoding inc map epoch " << cur+1 << dendl; - - bufferlist bl; - if (m->incremental_maps.count(cur+1)) { - dout(10) << " using provided inc map" << dendl; - bl = m->incremental_maps[cur+1]; - } else { - dout(10) << " using my locally stored inc map" << dendl; - get_inc_map_bl(cur+1, bl); - } - - OSDMap::Incremental inc; - int off = 0; - inc.decode(bl, off); - - osdmap->apply_incremental(inc); - - // archive the full map - bl.clear(); - osdmap->encode(bl); - t.write( get_osdmap_object_name(cur+1), 0, bl.length(), bl); - - // notify messenger - for (map >::iterator i = inc.new_down.begin(); - i != 
inc.new_down.end(); - i++) { - int osd = i->first; - if (osd == whoami) continue; - messenger->mark_down(i->second.first.addr); - peer_map_epoch.erase(i->second.first.name); - - // kick any replica ops - for (hash_map::iterator it = pg_map.begin(); - it != pg_map.end(); - it++) { - PG *pg = it->second; - - pg->lock(); - pg->note_failed_osd(osd); - pg->unlock(); - } - } - for (map::iterator i = inc.new_up.begin(); - i != inc.new_up.end(); - i++) { - if (i->first == whoami) continue; - peer_map_epoch.erase(i->second.name); - } - } - else if (m->maps.count(cur+1) || - store->exists(get_osdmap_object_name(cur+1))) { - dout(10) << "handle_osd_map decoding full map epoch " << cur+1 << dendl; - bufferlist bl; - if (m->maps.count(cur+1)) - bl = m->maps[cur+1]; - else - get_map_bl(cur+1, bl); - osdmap->decode(bl); - - // FIXME BUG: need to notify messenger of ups/downs!! - } - else { - dout(10) << "handle_osd_map missing epoch " << cur+1 << dendl; - int mon = monmap->pick_mon(); - messenger->send_message(new MOSDGetMap(cur+1), monmap->get_inst(mon)); - break; - } - - cur++; - superblock.current_epoch = cur; - advance_map(t); - advanced = true; - } - - // all the way? - if (advanced && cur == superblock.newest_map) { - // yay! - activate_map(t); - - // process waiters - take_waiters(waiting_for_osdmap); - } - - // write updated pg state to store - for (hash_map::iterator i = pg_map.begin(); - i != pg_map.end(); - i++) { - pg_t pgid = i->first; - PG *pg = i->second; - t.collection_setattr( pgid, "info", &pg->info, sizeof(pg->info)); - } - - // superblock and commit - write_superblock(t); - store->apply_transaction(t); - - //if (osdmap->get_epoch() == 1) store->sync(); // in case of early death, blah - - delete m; -} - - -/** - * scan placement groups, initiate any replication - * activities. 
- */ -void OSD::advance_map(ObjectStore::Transaction& t) -{ - dout(7) << "advance_map epoch " << osdmap->get_epoch() - << " " << pg_map.size() << " pgs" - << dendl; - - if (osdmap->is_mkfs()) { - ps_t numps = osdmap->get_pg_num(); - ps_t numlps = osdmap->get_localized_pg_num(); - dout(1) << "mkfs on " << numps << " normal, " << numlps << " localized pg sets" << dendl; - int minrep = 1; - int maxrep = MIN(g_conf.num_osd, g_conf.osd_max_rep); - int minraid = g_conf.osd_min_raid_width; - int maxraid = g_conf.osd_max_raid_width; - dout(1) << "mkfs " << minrep << ".." << maxrep << " replicas, " - << minraid << ".." << maxraid << " osd raid groups" << dendl; - - //derr(0) << "osdmap " << osdmap->get_ctime() << " logger start " << logger->get_start() << dendl; - logger->set_start( osdmap->get_ctime() ); - - assert(g_conf.osd_mkfs); // make sure we did a mkfs! - - // create PGs - // replicated - for (int nrep = 1; nrep <= maxrep; nrep++) { - for (ps_t ps = 0; ps < numps; ++ps) - try_create_pg(pg_t(pg_t::TYPE_REP, nrep, ps, -1), t); - for (ps_t ps = 0; ps < numlps; ++ps) - try_create_pg(pg_t(pg_t::TYPE_REP, nrep, ps, whoami), t); - } - - // raided - /* - for (int size = minraid; size <= maxraid; size++) { - for (ps_t ps = 0; ps < numps; ++ps) - try_create_pg(pg_t(pg_t::TYPE_RAID4, size, ps, -1), t); - for (ps_t ps = 0; ps < numlps; ++ps) - try_create_pg(pg_t(pg_t::TYPE_RAID4, size, ps, whoami), t); - } - */ - dout(1) << "mkfs done, created " << pg_map.size() << " pgs" << dendl; - - } else { - // scan existing pg's - for (hash_map::iterator it = pg_map.begin(); - it != pg_map.end(); - it++) { - pg_t pgid = it->first; - PG *pg = it->second; - - // did i finish this epoch? - if (pg->is_active()) { - pg->info.last_epoch_finished = osdmap->get_epoch()-1; - } - - // get new acting set - vector tacting; - int nrep = osdmap->pg_to_acting_osds(pgid, tacting); - int role = osdmap->calc_pg_role(whoami, tacting, nrep); - - // no change? 
- if (tacting == pg->acting) - continue; - - // -- there was a change! -- - pg->lock(); - - int oldrole = pg->get_role(); - int oldprimary = pg->get_primary(); - int oldacker = pg->get_acker(); - vector oldacting = pg->acting; - - // update PG - pg->acting.swap(tacting); - pg->set_role(role); - - // did primary|acker change? - pg->info.history.same_since = osdmap->get_epoch(); - if (oldprimary != pg->get_primary()) { - pg->info.history.same_primary_since = osdmap->get_epoch(); - pg->cancel_recovery(); - } - if (oldacker != pg->get_acker()) { - pg->info.history.same_acker_since = osdmap->get_epoch(); - } - - // deactivate. - pg->state_clear(PG::STATE_ACTIVE); - - // reset primary state? - if (oldrole == 0 || pg->get_role() == 0) - pg->clear_primary_state(); - - // apply any repops in progress. - if (oldacker == whoami) { - pg->on_acker_change(); - } - - if (role != oldrole) { - // old primary? - if (oldrole == 0) { - pg->state_clear(PG::STATE_CLEAN); - - // take replay queue waiters - list ls; - for (map::iterator it = pg->replay_queue.begin(); - it != pg->replay_queue.end(); - it++) - ls.push_back(it->second); - pg->replay_queue.clear(); - take_waiters(ls); - - // take active waiters - take_waiters(pg->waiting_for_active); - - pg->on_role_change(); - } - - // new primary? - if (role == 0) { - // i am new primary - pg->state_clear(PG::STATE_STRAY); - } else { - // i am now replica|stray. we need to send a notify. - pg->state_set(PG::STATE_STRAY); - - if (nrep == 0) { - // did they all shut down cleanly? - bool clean = true; - vector inset; - osdmap->pg_to_osds(pg->info.pgid, inset); - for (unsigned i=0; iis_down_clean(inset[i])) clean = false; - if (clean) { - dout(1) << *pg << " is cleanly inactive" << dendl; - } else { - pg->state_set(PG::STATE_CRASHED); - dout(1) << *pg << " is crashed" << dendl; - } - } - } - - // my role changed. 
- dout(10) << *pg << " " << oldacting << " -> " << pg->acting - << ", role " << oldrole << " -> " << role << dendl; - - } else { - // no role change. - // did primary change? - if (pg->get_primary() != oldprimary) { - // we need to announce - pg->state_set(PG::STATE_STRAY); - - dout(10) << *pg << " " << oldacting << " -> " << pg->acting - << ", acting primary " - << oldprimary << " -> " << pg->get_primary() - << dendl; - } else { - // primary is the same. - if (role == 0) { - // i am (still) primary. but my replica set changed. - pg->state_clear(PG::STATE_CLEAN); - pg->state_clear(PG::STATE_REPLAY); - - dout(10) << *pg << " " << oldacting << " -> " << pg->acting - << ", replicas changed" << dendl; - } - } - } - - pg->unlock(); - } - } -} - -void OSD::activate_map(ObjectStore::Transaction& t) -{ - dout(7) << "activate_map version " << osdmap->get_epoch() << dendl; - - map< int, list > notify_list; // primary -> list - map< int, map > query_map; // peer -> PG -> get_summary_since - map activator_map; // peer -> message - - // scan pg's - for (hash_map::iterator it = pg_map.begin(); - it != pg_map.end(); - it++) { - //pg_t pgid = it->first; - PG *pg = it->second; - pg->lock(); - if (pg->is_active()) { - // update started counter - pg->info.last_epoch_started = osdmap->get_epoch(); - } - else if (pg->get_role() == 0 && !pg->is_active()) { - // i am (inactive) primary - pg->build_prior(); - pg->peer(t, query_map, &activator_map); - } - else if (pg->is_stray() && - pg->get_primary() >= 0) { - // i am residual|replica - notify_list[pg->get_primary()].push_back(pg->info); - } - if (pg->is_primary()) - pg->update_stats(); - pg->unlock(); - } - - if (g_conf.osd_hack_fast_startup && - osdmap->is_mkfs()) // hack: skip the queries/summaries if it's a mkfs - return; - - do_notifies(notify_list); // notify? 
(residual|replica) - do_queries(query_map); - do_activators(activator_map); - - logger->set("numpg", pg_map.size()); -} - - -void OSD::send_incremental_map(epoch_t since, const entity_inst_t& inst, bool full) -{ - dout(10) << "send_incremental_map " << since << " -> " << osdmap->get_epoch() - << " to " << inst << dendl; - - MOSDMap *m = new MOSDMap; - - for (epoch_t e = osdmap->get_epoch(); - e > since; - e--) { - bufferlist bl; - if (get_inc_map_bl(e,bl)) { - m->incremental_maps[e].claim(bl); - } else if (get_map_bl(e,bl)) { - m->maps[e].claim(bl); - if (!full) break; - } - else { - assert(0); // we should have all maps. - } - } - - messenger->send_message(m, inst); -} - -bool OSD::get_map_bl(epoch_t e, bufferlist& bl) -{ - return store->read(get_osdmap_object_name(e), 0, 0, bl) >= 0; -} - -bool OSD::get_inc_map_bl(epoch_t e, bufferlist& bl) -{ - return store->read(get_inc_osdmap_object_name(e), 0, 0, bl) >= 0; -} - -void OSD::get_map(epoch_t epoch, OSDMap &m) -{ - // find a complete map - list incs; - epoch_t e; - for (e = epoch; e > 0; e--) { - bufferlist bl; - if (get_map_bl(e, bl)) { - //dout(10) << "get_map " << epoch << " full " << e << dendl; - m.decode(bl); - break; - } else { - OSDMap::Incremental inc; - bool got = get_inc_map(e, inc); - assert(got); - incs.push_front(inc); - } - } - assert(e >= 0); - - // apply incrementals - for (e++; e <= epoch; e++) { - //dout(10) << "get_map " << epoch << " inc " << e << dendl; - m.apply_incremental( incs.front() ); - incs.pop_front(); - } -} - - -bool OSD::get_inc_map(epoch_t e, OSDMap::Incremental &inc) -{ - bufferlist bl; - if (!get_inc_map_bl(e, bl)) - return false; - int off = 0; - inc.decode(bl, off); - return true; -} - - - - - -bool OSD::require_current_map(Message *m, epoch_t ep) -{ - // older map? - if (ep < osdmap->get_epoch()) { - dout(7) << "require_current_map epoch " << ep << " < " << osdmap->get_epoch() << dendl; - delete m; // discard and ignore. - return false; - } - - // newer map? 
- if (ep > osdmap->get_epoch()) { - dout(7) << "require_current_map epoch " << ep << " > " << osdmap->get_epoch() << dendl; - wait_for_new_map(m); - return false; - } - - assert(ep == osdmap->get_epoch()); - return true; -} - - -/* - * require that we have same (or newer) map, and that - * the source is the pg primary. - */ -bool OSD::require_same_or_newer_map(Message *m, epoch_t epoch) -{ - dout(15) << "require_same_or_newer_map " << epoch << " (i am " << osdmap->get_epoch() << ") " << m << dendl; - - // newer map? - if (epoch > osdmap->get_epoch()) { - dout(7) << "waiting for newer map epoch " << epoch << " > my " << osdmap->get_epoch() << " with " << m << dendl; - wait_for_new_map(m); - return false; - } - - if (epoch < boot_epoch) { - dout(7) << "from pre-boot epoch " << epoch << " < " << boot_epoch << dendl; - delete m; - return false; - } - - return true; -} - - - - - -/** do_notifies - * Send an MOSDPGNotify to a primary, with a list of PGs that I have - * content for, and they are primary for. 
- */ - -void OSD::do_notifies(map< int, list >& notify_list) -{ - for (map< int, list >::iterator it = notify_list.begin(); - it != notify_list.end(); - it++) { - if (it->first == whoami) { - dout(7) << "do_notify osd" << it->first << " is self, skipping" << dendl; - continue; - } - dout(7) << "do_notify osd" << it->first << " on " << it->second.size() << " PGs" << dendl; - MOSDPGNotify *m = new MOSDPGNotify(osdmap->get_epoch(), it->second); - _share_map_outgoing(osdmap->get_inst(it->first)); - messenger->send_message(m, osdmap->get_inst(it->first)); - } -} - - -/** do_queries - * send out pending queries for info | summaries - */ -void OSD::do_queries(map< int, map >& query_map) -{ - for (map< int, map >::iterator pit = query_map.begin(); - pit != query_map.end(); - pit++) { - int who = pit->first; - dout(7) << "do_queries querying osd" << who - << " on " << pit->second.size() << " PGs" << dendl; - - MOSDPGQuery *m = new MOSDPGQuery(osdmap->get_epoch(), - pit->second); - _share_map_outgoing(osdmap->get_inst(who)); - messenger->send_message(m, osdmap->get_inst(who)); - } -} - - -void OSD::do_activators(map& activator_map) -{ - for (map::iterator p = activator_map.begin(); - p != activator_map.end(); - ++p) - messenger->send_message(p->second, osdmap->get_inst(p->first)); - activator_map.clear(); -} - - - - - -/** PGNotify - * from non-primary to primary - * includes PG::Info. - * NOTE: called with opqueue active. - */ -void OSD::handle_pg_notify(MOSDPGNotify *m) -{ - dout(7) << "handle_pg_notify from " << m->get_source() << dendl; - int from = m->get_source().num(); - - if (!require_same_or_newer_map(m, m->get_epoch())) return; - - ObjectStore::Transaction t; - - // look for unknown PGs i'm primary for - map< int, map > query_map; - map activator_map; - - for (list::iterator it = m->get_pg_list().begin(); - it != m->get_pg_list().end(); - it++) { - pg_t pgid = it->pgid; - PG *pg; - - if (!_have_pg(pgid)) { - // same primary? 
- vector acting; - int nrep = osdmap->pg_to_acting_osds(pgid, acting); - int role = osdmap->calc_pg_role(whoami, acting, nrep); - - PG::Info::History history = it->history; - project_pg_history(pgid, history, m->get_epoch(), acting); - - if (m->get_epoch() < history.same_primary_since) { - dout(10) << "handle_pg_notify pg " << pgid << " dne, and primary changed in " - << history.same_primary_since << " (msg from " << m->get_epoch() << ")" << dendl; - continue; - } - - assert(role == 0); // otherwise, probably bug in project_pg_history. - - // ok, create PG! - pg = _create_lock_pg(pgid, t); - pg->acting.swap(acting); - pg->set_role(role); - pg->info.history = history; - pg->last_epoch_started_any = it->last_epoch_started; - pg->clear_primary_state(); // yep, notably, set hml=false - pg->build_prior(); - pg->write_log(t); - - dout(10) << *pg << " is new" << dendl; - - // kick any waiters - if (waiting_for_pg.count(pgid)) { - take_waiters(waiting_for_pg[pgid]); - waiting_for_pg.erase(pgid); - } - } else { - // already had it. am i (still) the primary? - pg = _lookup_lock_pg(pgid); - if (m->get_epoch() < pg->info.history.same_primary_since) { - dout(10) << *pg << " handle_pg_notify primary changed in " - << pg->info.history.same_primary_since - << " (msg from " << m->get_epoch() << ")" << dendl; - pg->unlock(); - continue; - } - } - - // ok! - - // stray? - bool acting = pg->is_acting(from); - if (!acting && (*it).last_epoch_started > 0) { - dout(10) << *pg << " osd" << from << " has stray content: " << *it << dendl; - pg->stray_set.insert(from); - pg->state_clear(PG::STATE_CLEAN); - } - - // save info. 
- bool had = pg->peer_info.count(from); - pg->peer_info[from] = *it; - - if (had) { - if (pg->is_active() && - (*it).is_uptodate() && acting) { - pg->uptodate_set.insert(from); - dout(10) << *pg << " osd" << from << " now uptodate (" << pg->uptodate_set - << "): " << *it << dendl; - if (pg->is_all_uptodate()) - pg->finish_recovery(); - } else { - // hmm, maybe keep an eye out for cases where we see this, but peer should happen. - dout(10) << *pg << " already had notify info from osd" << from << ": " << *it << dendl; - } - } else { - // adjust prior? - if (it->last_epoch_started > pg->last_epoch_started_any) - pg->adjust_prior(); - - // peer - pg->peer(t, query_map, &activator_map); - } - - pg->unlock(); - } - - unsigned tr = store->apply_transaction(t); - assert(tr == 0); - - do_queries(query_map); - do_activators(activator_map); - - delete m; -} - - - -/** PGLog - * from non-primary to primary - * includes log and info - * from primary to non-primary - * includes log for use in recovery - * NOTE: called with opqueue active. 
- */ - - -void OSD::_process_pg_info(epoch_t epoch, int from, - PG::Info &info, - PG::Log &log, - PG::Missing &missing, - map* activator_map) -{ - if (pg_map.count(info.pgid) == 0) { - dout(10) << "_process_pg_info " << info << " don't have pg" << dendl; - assert(epoch < osdmap->get_epoch()); - return; - } - - PG *pg = _lookup_lock_pg(info.pgid); - assert(pg); - - dout(10) << *pg << " got " << info << " " << log << " " << missing << dendl; - - if (epoch < pg->info.history.same_since) { - dout(10) << *pg << " got old info " << info << ", ignoring" << dendl; - pg->unlock(); - return; - } - - //m->log.print(cout); - - ObjectStore::Transaction t; - - if (pg->is_primary()) { - // i am PRIMARY - assert(pg->peer_log_requested.count(from) || - pg->peer_summary_requested.count(from)); - - pg->proc_replica_log(log, missing, from); - - // peer - map< int, map > query_map; - pg->peer(t, query_map, activator_map); - do_queries(query_map); - - } else { - // i am REPLICA - // merge log - pg->merge_log(log, missing, from); - pg->proc_missing(log, missing, from); - assert(pg->missing.num_lost() == 0); - - // ok activate! 
- pg->activate(t, activator_map); - } - - unsigned tr = store->apply_transaction(t); - assert(tr == 0); - - pg->unlock(); -} - - -void OSD::handle_pg_log(MOSDPGLog *m) -{ - dout(7) << "handle_pg_log " << *m << " from " << m->get_source() << dendl; - - int from = m->get_source().num(); - if (!require_same_or_newer_map(m, m->get_epoch())) return; - - _process_pg_info(m->get_epoch(), from, - m->info, m->log, m->missing, 0); - - delete m; -} - -void OSD::handle_pg_activate_set(MOSDPGActivateSet *m) -{ - dout(7) << "handle_pg_activate_set " << *m << " from " << m->get_source() << dendl; - - int from = m->get_source().num(); - if (!require_same_or_newer_map(m, m->get_epoch())) return; - - PG::Log empty_log; - PG::Missing empty_missing; - map activator_map; - - for (list::iterator p = m->pg_info.begin(); - p != m->pg_info.end(); - ++p) - _process_pg_info(m->get_epoch(), from, *p, empty_log, empty_missing, &activator_map); - - do_activators(activator_map); - - delete m; -} - - -/** PGQuery - * from primary to replica | stray - * NOTE: called with opqueue active. - */ -void OSD::handle_pg_query(MOSDPGQuery *m) -{ - dout(7) << "handle_pg_query from " << m->get_source() << " epoch " << m->get_epoch() << dendl; - int from = m->get_source().num(); - - if (!require_same_or_newer_map(m, m->get_epoch())) return; - - map< int, list > notify_list; - - for (map::iterator it = m->pg_list.begin(); - it != m->pg_list.end(); - it++) { - pg_t pgid = it->first; - PG *pg = 0; - - if (pg_map.count(pgid) == 0) { - // get active crush mapping - vector acting; - int nrep = osdmap->pg_to_acting_osds(pgid, acting); - int role = osdmap->calc_pg_role(whoami, acting, nrep); - - // same primary? 
- PG::Info::History history = it->second.history; - project_pg_history(pgid, history, m->get_epoch(), acting); - - if (m->get_epoch() < history.same_since) { - dout(10) << " pg " << pgid << " dne, and pg has changed in " - << history.same_primary_since << " (msg from " << m->get_epoch() << ")" << dendl; - continue; - } - - if (role < 0) { - dout(10) << " pg " << pgid << " dne, and i am not an active replica" << dendl; - PG::Info empty(pgid); - notify_list[from].push_back(empty); - continue; - } - assert(role > 0); - - ObjectStore::Transaction t; - pg = _create_lock_pg(pgid, t); - pg->acting.swap( acting ); - pg->set_role(role); - pg->info.history = history; - pg->write_log(t); - store->apply_transaction(t); - - dout(10) << *pg << " dne (before), but i am role " << role << dendl; - } else { - pg = _lookup_lock_pg(pgid); - - // same primary? - if (m->get_epoch() < pg->info.history.same_since) { - dout(10) << *pg << " handle_pg_query primary changed in " - << pg->info.history.same_since - << " (msg from " << m->get_epoch() << ")" << dendl; - pg->unlock(); - continue; - } - } - - // ok, process query! 
- assert(!pg->acting.empty()); - assert(from == pg->acting[0]); - - if (it->second.type == PG::Query::INFO) { - // info - dout(10) << *pg << " sending info" << dendl; - notify_list[from].push_back(pg->info); - } else { - MOSDPGLog *m = new MOSDPGLog(osdmap->get_epoch(), pg->info); - m->missing = pg->missing; - - if (it->second.type == PG::Query::LOG) { - dout(10) << *pg << " sending info+missing+log since split " << it->second.split - << " from floor " << it->second.floor - << dendl; - if (!m->log.copy_after_unless_divergent(pg->log, it->second.split, it->second.floor)) { - dout(10) << *pg << " divergent, sending backlog" << dendl; - it->second.type = PG::Query::BACKLOG; - } - } - - if (it->second.type == PG::Query::BACKLOG) { - dout(10) << *pg << " sending info+missing+backlog" << dendl; - if (pg->log.backlog) { - m->log = pg->log; - } else { - pg->generate_backlog(); - m->log = pg->log; - pg->drop_backlog(); - } - } - else if (it->second.type == PG::Query::FULLLOG) { - dout(10) << *pg << " sending info+missing+full log" << dendl; - m->log.copy_non_backlog(pg->log); - } - - dout(10) << *pg << " sending " << m->log << " " << m->missing << dendl; - //m->log.print(cout); - - _share_map_outgoing(osdmap->get_inst(from)); - messenger->send_message(m, osdmap->get_inst(from)); - } - - pg->unlock(); - } - - do_notifies(notify_list); - - delete m; -} - - -void OSD::handle_pg_remove(MOSDPGRemove *m) -{ - dout(7) << "handle_pg_remove from " << m->get_source() << dendl; - - if (!require_same_or_newer_map(m, m->get_epoch())) return; - - for (set::iterator it = m->pg_list.begin(); - it != m->pg_list.end(); - it++) { - pg_t pgid = *it; - PG *pg; - - if (pg_map.count(pgid) == 0) { - dout(10) << " don't have pg " << pgid << dendl; - continue; - } - - pg = _lookup_lock_pg(pgid); - - dout(10) << *pg << " removing." 
<< dendl; - assert(pg->get_role() == -1); - - _remove_unlock_pg(pg); - } - - delete m; -} - - - - - - -// ========================================================= -// OPS - -void OSD::handle_op(MOSDOp *op) -{ - // throttle? FIXME PROBABLY! - while (pending_ops > g_conf.osd_max_opq) { - dout(10) << "enqueue_op waiting for pending_ops " << pending_ops << " to drop to " << g_conf.osd_max_opq << dendl; - op_queue_cond.Wait(osd_lock); - } - - // get and lock *pg. - const pg_t pgid = op->get_pg(); - PG *pg = _have_pg(pgid) ? _lookup_lock_pg(pgid):0; - - logger->set("buf", buffer_total_alloc); - - utime_t now = g_clock.now(); - - // update qlen stats - stat_oprate.hit(now); - stat_ops++; - stat_qlen += pending_ops; - if (op->get_op() == OSD_OP_READ) { - stat_rd_ops++; - if (op->get_source().is_osd()) { - //derr(-10) << "shed in " << stat_rd_ops_shed_in << " / " << stat_rd_ops << dendl; - stat_rd_ops_shed_in++; - } - } - - // require same or newer map - if (!require_same_or_newer_map(op, op->get_map_epoch())) { - if (pg) pg->unlock(); - return; - } - - // share our map with sender, if they're old - _share_map_incoming(op->get_source_inst(), op->get_map_epoch()); - - - if (!op->get_source().is_osd()) { - // REGULAR OP (non-replication) - - // note original source - op->set_client_inst( op->get_source_inst() ); - op->clear_payload(); // and hose encoded payload (in case we forward) - - // have pg? - if (!pg) { - dout(7) << "hit non-existent pg " - << pgid - << ", waiting" << dendl; - waiting_for_pg[pgid].push_back(op); - return; - } - - // pg must be same-ish... - if (op->is_read()) { - // read - if (!pg->same_for_read_since(op->get_map_epoch())) { - dout(7) << "handle_rep_op pg changed " << pg->info.history - << " after " << op->get_map_epoch() - << ", dropping" << dendl; - assert(op->get_map_epoch() < osdmap->get_epoch()); - pg->unlock(); - delete op; - return; - } - - /* - if (read && op->get_oid().rev > 0) { - // versioned read. hrm. 
- // are we missing a revision that we might need? - object_t moid = op->get_oid(); - if (pick_missing_object_rev(moid, pg)) { - // is there a local revision we might use instead? - object_t loid = op->get_oid(); - if (store->pick_object_revision_lt(loid) && - moid <= loid) { - // we need moid. pull it. - dout(10) << "handle_op read on " << op->get_oid() - << ", have " << loid - << ", but need missing " << moid - << ", pulling" << dendl; - pull(pg, moid); - pg->waiting_for_missing_object[moid].push_back(op); - return; - } - - dout(10) << "handle_op read on " << op->get_oid() - << ", have " << loid - << ", don't need missing " << moid - << dendl; - } - } else { - // live revision. easy. - if (op->get_op() != OSD_OP_PUSH && - waitfor_missing_object(op, pg)) return; - } - */ - - } else { - // modify - if ((pg->get_primary() != whoami || - !pg->same_for_modify_since(op->get_map_epoch()))) { - dout(7) << "handle_rep_op pg changed " << pg->info.history - << " after " << op->get_map_epoch() - << ", dropping" << dendl; - assert(op->get_map_epoch() < osdmap->get_epoch()); - pg->unlock(); - delete op; - return; - } - } - - // pg must be active. - if (!pg->is_active()) { - // replay? - if (op->get_version().version > 0) { - if (op->get_version() > pg->info.last_update) { - dout(7) << *pg << " queueing replay at " << op->get_version() - << " for " << *op << dendl; - pg->replay_queue[op->get_version()] = op; - pg->unlock(); - return; - } else { - dout(7) << *pg << " replay at " << op->get_version() << " <= " << pg->info.last_update - << " for " << *op - << ", will queue for WRNOOP" << dendl; - } - } - - dout(7) << *pg << " not active (yet)" << dendl; - pg->waiting_for_active.push_back(op); - pg->unlock(); - return; - } - - // missing object? 
- if (pg->is_missing_object(op->get_oid())) { - pg->wait_for_missing_object(op->get_oid(), op); - pg->unlock(); - return; - } - - dout(10) << "handle_op " << *op << " in " << *pg << dendl; - - } else { - // REPLICATION OP (it's from another OSD) - - // have pg? - if (!pg) { - derr(-7) << "handle_rep_op " << *op - << " pgid " << pgid << " dne" << dendl; - delete op; - //assert(0); // wtf, shouldn't happen. - return; - } - - // check osd map: same set, or primary+acker? - if (!pg->same_for_rep_modify_since(op->get_map_epoch())) { - dout(10) << "handle_rep_op pg changed " << pg->info.history - << " after " << op->get_map_epoch() - << ", dropping" << dendl; - pg->unlock(); - delete op; - return; - } - - assert(pg->get_role() >= 0); - dout(7) << "handle_rep_op " << op << " in " << *pg << dendl; - } - - // proprocess op? - if (pg->preprocess_op(op, now)) { - pg->unlock(); - return; - } - - if (op->get_op() == OSD_OP_READ) { - Mutex::Locker lock(peer_stat_lock); - stat_rd_ops_in_queue++; - } - - if (g_conf.osd_maxthreads < 1) { - // do it now. - if (op->get_type() == MSG_OSD_OP) - pg->do_op((MOSDOp*)op); - else if (op->get_type() == MSG_OSD_OPREPLY) - pg->do_op_reply((MOSDOpReply*)op); - else - assert(0); - } else { - // queue for worker threads - enqueue_op(pg, op); - } - - pg->unlock(); -} - - -void OSD::handle_op_reply(MOSDOpReply *op) -{ - if (op->get_map_epoch() < boot_epoch) { - dout(3) << "replica op reply from before boot" << dendl; - delete op; - return; - } - - // must be a rep op. - assert(op->get_source().is_osd()); - - // make sure we have the pg - const pg_t pgid = op->get_pg(); - - // require same or newer map - if (!require_same_or_newer_map(op, op->get_map_epoch())) return; - - // share our map with sender, if they're old - _share_map_incoming(op->get_source_inst(), op->get_map_epoch()); - - if (!_have_pg(pgid)) { - // hmm. 
- delete op; - return; - } - - PG *pg = _lookup_lock_pg(pgid); - if (g_conf.osd_maxthreads < 1) { - pg->do_op_reply(op); // do it now - } else { - enqueue_op(pg, op); // queue for worker threads - } - pg->unlock(); -} - - -/* - * enqueue called with osd_lock held - */ -void OSD::enqueue_op(PG *pg, Message *op) -{ - // add to pg's op_queue - pg->op_queue.push_back(op); - pending_ops++; - logger->set("opq", pending_ops); - - // add pg to threadpool queue - pg->get(); // we're exposing the pointer, here. - threadpool->put_op(pg); -} - -/* - * NOTE: dequeue called in worker thread, without osd_lock - */ -void OSD::dequeue_op(PG *pg) -{ - Message *op = 0; - - osd_lock.Lock(); - { - // lock pg and get pending op - pg->lock(); - - assert(!pg->op_queue.empty()); - op = pg->op_queue.front(); - pg->op_queue.pop_front(); - - dout(10) << "dequeue_op " << *op << " pg " << *pg - << ", " << (pending_ops-1) << " more pending" - << dendl; - - // share map? - // do this preemptively while we hold osd_lock and pg->lock - // to avoid lock ordering issues later. 
- for (unsigned i=1; iacting.size(); i++) - _share_map_outgoing( osdmap->get_inst(pg->acting[i]) ); - } - osd_lock.Unlock(); - - // do it - if (op->get_type() == MSG_OSD_OP) - pg->do_op((MOSDOp*)op); // do it now - else if (op->get_type() == MSG_OSD_OPREPLY) - pg->do_op_reply((MOSDOpReply*)op); - else - assert(0); - - // unlock and put pg - pg->put_unlock(); - - // finish - osd_lock.Lock(); - { - dout(10) << "dequeue_op " << op << " finish" << dendl; - assert(pending_ops > 0); - - if (pending_ops > g_conf.osd_max_opq) - op_queue_cond.Signal(); - - pending_ops--; - logger->set("opq", pending_ops); - if (pending_ops == 0 && waiting_for_no_ops) - no_pending_ops.Signal(); - } - osd_lock.Unlock(); -} - - - - -void OSD::wait_for_no_ops() -{ - if (pending_ops > 0) { - dout(7) << "wait_for_no_ops - waiting for " << pending_ops << dendl; - waiting_for_no_ops = true; - while (pending_ops > 0) - no_pending_ops.Wait(osd_lock); - waiting_for_no_ops = false; - assert(pending_ops == 0); - } - dout(7) << "wait_for_no_ops - none" << dendl; -} - - - - diff --git a/branches/sage/crush/osd/OSD.h b/branches/sage/crush/osd/OSD.h deleted file mode 100644 index be6348eceb126..0000000000000 --- a/branches/sage/crush/osd/OSD.h +++ /dev/null @@ -1,366 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __OSD_H -#define __OSD_H - -#include "msg/Dispatcher.h" - -#include "common/Mutex.h" -#include "common/ThreadPool.h" -#include "common/Timer.h" - -#include "mon/MonMap.h" - -#include "ObjectStore.h" -#include "PG.h" - -#include "common/DecayCounter.h" - - -#include -using namespace std; - -#include -#include -using namespace __gnu_cxx; - - -class Messenger; -class Message; -class Logger; -class ObjectStore; -class OSDMap; - -class OSD : public Dispatcher { -public: - // -- states -- - static const int STATE_BOOTING = 1; - static const int STATE_ACTIVE = 2; - static const int STATE_STOPPING = 3; - - - - /** OSD **/ -protected: - Mutex osd_lock; // global lock - SafeTimer timer; // safe timer - - Messenger *messenger; - Logger *logger; - ObjectStore *store; - MonMap *monmap; - - int whoami; - char dev_path[100]; - -public: - int get_nodeid() { return whoami; } - - static object_t get_osdmap_object_name(epoch_t epoch) { return object_t(0,epoch << 1); } - static object_t get_inc_osdmap_object_name(epoch_t epoch) { return object_t(0, (epoch << 1) + 1); } - - -private: - /** superblock **/ - OSDSuperblock superblock; - epoch_t boot_epoch; - - void write_superblock(); - void write_superblock(ObjectStore::Transaction& t); - int read_superblock(); - - - // -- state -- - int state; - -public: - bool is_booting() { return state == STATE_BOOTING; } - bool is_active() { return state == STATE_ACTIVE; } - bool is_stopping() { return state == STATE_STOPPING; } - -private: - - // heartbeat - void heartbeat(); - - class C_Heartbeat : public Context { - OSD *osd; - public: - C_Heartbeat(OSD *o) : osd(o) {} - void finish(int r) { - osd->heartbeat(); - } - }; - - - // -- stats -- - DecayCounter stat_oprate; - int stat_ops; // ops since last heartbeat - int stat_rd_ops; - int stat_rd_ops_shed_in; - int stat_rd_ops_shed_out; - int stat_qlen; // cumulative queue length since last refresh - int stat_rd_ops_in_queue; // in queue - - Mutex peer_stat_lock; - 
osd_peer_stat_t my_stat; - hash_map > peer_stat; - hash_map > my_stat_on_peer; // what the peer thinks of me - - void _refresh_my_stat(utime_t now); - osd_peer_stat_t get_my_stat_for(utime_t now, int peer); - void take_peer_stat(int peer, const osd_peer_stat_t& stat); - - // load calculation - //current implementation is moving averges. - class MovingAverager { - private: - Mutex lock; - deque m_Data; - unsigned m_Size; - double m_Total; - - public: - MovingAverager(unsigned size) : m_Size(size), m_Total(0) { } - - void set_size(unsigned size) { - m_Size = size; - } - - void add(double value) { - Mutex::Locker locker(lock); - - // add item - m_Data.push_back(value); - m_Total += value; - - // trim - while (m_Data.size() > m_Size) { - m_Total -= m_Data.front(); - m_Data.pop_front(); - } - } - - double get_average() { - Mutex::Locker locker(lock); - if (m_Data.empty()) return -1; - return m_Total / (double)m_Data.size(); - } - } read_latency_calc, qlen_calc; - - class IATAverager { - public: - struct iat_data { - double last_req_stamp; - double average_iat; - iat_data() : last_req_stamp(0), average_iat(0) {} - }; - private: - mutable Mutex lock; - double alpha; - hash_map iat_map; - - public: - IATAverager(double a) : alpha(a) {} - - void add_sample(object_t oid, double now) { - Mutex::Locker locker(lock); - iat_data &r = iat_map[oid]; - double iat = now - r.last_req_stamp; - r.last_req_stamp = now; - r.average_iat = r.average_iat*(1.0-alpha) + iat*alpha; - } - - bool have(object_t oid) const { - Mutex::Locker locker(lock); - return iat_map.count(oid); - } - - double get_average_iat(object_t oid) const { - Mutex::Locker locker(lock); - hash_map::const_iterator p = iat_map.find(oid); - assert(p != iat_map.end()); - return p->second.average_iat; - } - - bool is_flash_crowd_candidate(object_t oid) const { - Mutex::Locker locker(lock); - return get_average_iat(oid) <= g_conf.osd_flash_crowd_iat_threshold; - } - }; - - IATAverager iat_averager; - - - // -- waiters -- - 
list finished; - Mutex finished_lock; - - void take_waiters(list& ls) { - finished_lock.Lock(); - finished.splice(finished.end(), ls); - finished_lock.Unlock(); - } - - // -- op queue -- - class ThreadPool *threadpool; - - int pending_ops; - bool waiting_for_no_ops; - Cond no_pending_ops; - Cond op_queue_cond; - - void wait_for_no_ops(); - - void enqueue_op(PG *pg, Message *op); - void dequeue_op(PG *pg); - static void static_dequeueop(OSD *o, PG *pg) { - o->dequeue_op(pg); - }; - - - friend class PG; - friend class ReplicatedPG; - friend class RAID4PG; - - - protected: - - // -- osd map -- - OSDMap *osdmap; - list waiting_for_osdmap; - - hash_map peer_map_epoch; // FIXME types - bool _share_map_incoming(const entity_inst_t& inst, epoch_t epoch); - void _share_map_outgoing(const entity_inst_t& inst); - - void wait_for_new_map(Message *m); - void handle_osd_map(class MOSDMap *m); - - void advance_map(ObjectStore::Transaction& t); - void activate_map(ObjectStore::Transaction& t); - - void get_map(epoch_t e, OSDMap &m); - bool get_map_bl(epoch_t e, bufferlist& bl); - bool get_inc_map_bl(epoch_t e, bufferlist& bl); - bool get_inc_map(epoch_t e, OSDMap::Incremental &inc); - - void send_incremental_map(epoch_t since, const entity_inst_t& inst, bool full); - - - - // -- placement groups -- - hash_map pg_map; - hash_map > waiting_for_pg; - - bool _have_pg(pg_t pgid); - PG *_lookup_lock_pg(pg_t pgid); - PG *_new_lock_pg(pg_t pg); // create new PG (in memory) - PG *_create_lock_pg(pg_t pg, ObjectStore::Transaction& t); // create new PG - void _remove_unlock_pg(PG *pg); // remove from store and memory - - void try_create_pg(pg_t pgid, ObjectStore::Transaction& t); - - void load_pgs(); - void project_pg_history(pg_t pgid, PG::Info::History& h, epoch_t from, - vector& last); - void activate_pg(pg_t pgid, epoch_t epoch); - - class C_Activate : public Context { - OSD *osd; - pg_t pgid; - epoch_t epoch; - public: - C_Activate(OSD *o, pg_t p, epoch_t e) : osd(o), pgid(p), epoch(e) 
{} - void finish(int r) { - osd->activate_pg(pgid, epoch); - } - }; - - - // -- pg stats -- - Mutex pg_stat_queue_lock; - set pg_stat_queue; - - class C_Stats : public Context { - OSD *osd; - public: - C_Stats(OSD *o) : osd(o) {} - void finish(int r) { - osd->send_pg_stats(); - } - }; - void send_pg_stats(); - - - // -- tids -- - // for ops i issue - tid_t last_tid; - - Mutex tid_lock; - tid_t get_tid() { - tid_t t; - tid_lock.Lock(); - t = ++last_tid; - tid_lock.Unlock(); - return t; - } - - - // -- generic pg recovery -- - int num_pulling; - - void do_notifies(map< int, list >& notify_list); - void do_queries(map< int, map >& query_map); - void do_activators(map& activator_map); - void repeer(PG *pg, map< int, map >& query_map); - - bool require_current_map(Message *m, epoch_t v); - bool require_same_or_newer_map(Message *m, epoch_t e); - - void handle_pg_query(class MOSDPGQuery *m); - void handle_pg_notify(class MOSDPGNotify *m); - void handle_pg_log(class MOSDPGLog *m); - void handle_pg_activate_set(class MOSDPGActivateSet *m); - void handle_pg_remove(class MOSDPGRemove *m); - - // helper for handle_pg_log and handle_pg_activate_set - void _process_pg_info(epoch_t epoch, int from, - PG::Info &info, - PG::Log &log, - PG::Missing &missing, - map* activator_map); - - - public: - OSD(int id, Messenger *m, MonMap *mm, char *dev = 0); - ~OSD(); - - // startup/shutdown - int init(); - int shutdown(); - - // messages - virtual void dispatch(Message *m); - virtual void ms_handle_failure(Message *m, const entity_inst_t& inst); - - void handle_osd_ping(class MOSDPing *m); - void handle_op(class MOSDOp *m); - void handle_op_reply(class MOSDOpReply *m); - - void force_remount(); -}; - -#endif diff --git a/branches/sage/crush/osd/OSDMap.h b/branches/sage/crush/osd/OSDMap.h deleted file mode 100644 index 2b476e0456168..0000000000000 --- a/branches/sage/crush/osd/OSDMap.h +++ /dev/null @@ -1,539 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
-// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __OSDMAP_H -#define __OSDMAP_H - -/* - * describe properties of the OSD cluster. - * disks, disk groups, total # osds, - * - */ -#include "config.h" -#include "include/types.h" -#include "osd_types.h" -#include "msg/Message.h" -#include "common/Mutex.h" -#include "common/Clock.h" - -#include "crush/CrushWrapper.h" - -#include -#include -#include -#include -using namespace std; - - -/* - * some system constants - */ - -// from LSB to MSB, -#define PG_PS_BITS 16 // max bits for placement seed/group portion of PG -#define PG_REP_BITS 6 // up to 64 replicas -#define PG_TYPE_BITS 2 -#define PG_PS_MASK ((1LL<>1)); -} - -inline int calc_bits_of(int t) { - int b = 0; - while (t) { - t = t >> 1; - b++; - } - return b; -} - - - -/** OSDMap - */ -class OSDMap { - -public: - class Incremental { - public: - epoch_t epoch; // new epoch; we are a diff from epoch-1 to epoch - epoch_t mon_epoch; // monitor epoch (election iteration) - utime_t ctime; - - // full (rare) - bufferlist fullmap; // in leiu of below. 
- bufferlist crush; - - // incremental - map new_up; - map > new_down; - list new_in; - list new_out; - map new_overload; // updated overload value - list old_overload; // no longer overload - - void encode(bufferlist& bl) { - ::_encode(epoch, bl); - ::_encode(mon_epoch, bl); - ::_encode(ctime, bl); - ::_encode(new_up, bl); - ::_encode(new_down, bl); - ::_encode(new_in, bl); - ::_encode(new_out, bl); - ::_encode(new_overload, bl); - ::_encode(fullmap, bl); - ::_encode(crush, bl); - } - void decode(bufferlist& bl, int& off) { - ::_decode(epoch, bl, off); - ::_decode(mon_epoch, bl, off); - ::_decode(ctime, bl, off); - ::_decode(new_up, bl, off); - ::_decode(new_down, bl, off); - ::_decode(new_in, bl, off); - ::_decode(new_out, bl, off); - ::_decode(new_overload, bl, off); - ::_decode(fullmap, bl, off); - ::_decode(crush, bl, off); - } - - Incremental(epoch_t e=0) : epoch(e), mon_epoch(0) {} - }; - -private: - epoch_t epoch; // what epoch of the osd cluster descriptor is this - epoch_t mon_epoch; // monitor epoch (election iteration) - utime_t ctime; // epoch start time - int32_t pg_num; // placement group count - int32_t pg_num_mask; // bitmask for above - int32_t localized_pg_num; // localized place group count - int32_t localized_pg_num_mask; // ditto - - set osds; // all osds - map down_osds; // list of down disks, -> clean shutdown (true/false) - set out_osds; // list of unmapped disks - map overload_osds; - map osd_inst; - - public: - CrushWrapper crush; // hierarchical map - - friend class OSDMonitor; - friend class MDS; - - public: - OSDMap() : epoch(0), mon_epoch(0), - pg_num(1<<5), - localized_pg_num(1<<3) { - calc_pg_masks(); - } - - // map info - epoch_t get_epoch() const { return epoch; } - void inc_epoch() { epoch++; } - - void calc_pg_masks() { - pg_num_mask = (1 << calc_bits_of(pg_num-1)) - 1; - localized_pg_num_mask = (1 << calc_bits_of(localized_pg_num-1)) - 1; - } - - int get_pg_num() const { return pg_num; } - void set_pg_num(int m) { pg_num = m; 
calc_pg_masks(); } - int get_localized_pg_num() const { return localized_pg_num; } - - const utime_t& get_ctime() const { return ctime; } - - bool is_mkfs() const { return epoch == 2; } - bool post_mkfs() const { return epoch > 2; } - - /***** cluster state *****/ - int num_osds() { return osds.size(); } - void get_all_osds(set& ls) { ls = osds; } - - const set& get_osds() { return osds; } - const map& get_down_osds() { return down_osds; } - const set& get_out_osds() { return out_osds; } - const map& get_overload_osds() { return overload_osds; } - - bool exists(int osd) { return osds.count(osd); } - bool is_down(int osd) { return down_osds.count(osd); } - bool is_down_clean(int osd) { return down_osds.count(osd) && down_osds[osd]; } - bool is_up(int osd) { return exists(osd) && !is_down(osd); } - bool is_out(int osd) { return out_osds.count(osd); } - bool is_in(int osd) { return exists(osd) && !is_out(osd); } - - bool have_inst(int osd) { - return osd_inst.count(osd); - } - const entity_inst_t& get_inst(int osd) { - assert(osd_inst.count(osd)); - return osd_inst[osd]; - } - bool get_inst(int osd, entity_inst_t& inst) { - if (osd_inst.count(osd)) { - inst = osd_inst[osd]; - return true; - } - return false; - } - - void mark_down(int o, bool clean) { down_osds[o] = clean; } - void mark_up(int o) { down_osds.erase(o); } - void mark_out(int o) { - out_osds.insert(o); - crush.update_offload_map(out_osds, overload_osds); - } - void mark_in(int o) { - out_osds.erase(o); - crush.update_offload_map(out_osds, overload_osds); - } - - void apply_incremental(Incremental &inc) { - assert(inc.epoch == epoch+1); - epoch++; - mon_epoch = inc.mon_epoch; - ctime = inc.ctime; - - // full map? - if (inc.fullmap.length()) { - decode(inc.fullmap); - return; - } - if (inc.crush.length()) { - bufferlist::iterator blp = inc.crush.begin(); - crush._decode(blp); - } - - // nope, incremental. 
- for (map >::iterator i = inc.new_down.begin(); - i != inc.new_down.end(); - i++) { - assert(down_osds.count(i->first) == 0); - down_osds[i->first] = i->second.second; - //assert(osd_inst.count(i->first) == 0 || osd_inst[i->first] == i->second.first); - osd_inst.erase(i->first); - //cout << "epoch " << epoch << " down osd" << i->first << endl; - } - for (list::iterator i = inc.new_out.begin(); - i != inc.new_out.end(); - i++) { - assert(out_osds.count(*i) == 0); - out_osds.insert(*i); - //cout << "epoch " << epoch << " out osd" << *i << endl; - } - for (list::iterator i = inc.old_overload.begin(); - i != inc.old_overload.end(); - i++) { - assert(overload_osds.count(*i)); - overload_osds.erase(*i); - } - - for (map::iterator i = inc.new_up.begin(); - i != inc.new_up.end(); - i++) { - assert(down_osds.count(i->first)); - down_osds.erase(i->first); - assert(osd_inst.count(i->first) == 0); - osd_inst[i->first] = i->second; - //cout << "epoch " << epoch << " up osd" << i->first << endl; - } - for (list::iterator i = inc.new_in.begin(); - i != inc.new_in.end(); - i++) { - assert(out_osds.count(*i)); - out_osds.erase(*i); - //cout << "epoch " << epoch << " in osd" << *i << endl; - } - for (map::iterator i = inc.new_overload.begin(); - i != inc.new_overload.end(); - i++) { - overload_osds[i->first] = i->second; - } - - crush.update_offload_map(out_osds, overload_osds); - } - - // serialize, unserialize - void encode(bufferlist& blist) { - ::_encode(epoch, blist); - ::_encode(mon_epoch, blist); - ::_encode(ctime, blist); - ::_encode(pg_num, blist); - ::_encode(localized_pg_num, blist); - - ::_encode(osds, blist); - ::_encode(down_osds, blist); - ::_encode(out_osds, blist); - ::_encode(overload_osds, blist); - ::_encode(osd_inst, blist); - - bufferlist cbl; - crush._encode(cbl); - ::_encode(cbl, blist); - } - - void decode(bufferlist& blist) { - int off = 0; - ::_decode(epoch, blist, off); - ::_decode(mon_epoch, blist, off); - ::_decode(ctime, blist, off); - 
::_decode(pg_num, blist, off); - ::_decode(localized_pg_num, blist, off); - calc_pg_masks(); - - ::_decode(osds, blist, off); - ::_decode(down_osds, blist, off); - ::_decode(out_osds, blist, off); - ::_decode(overload_osds, blist, off); - ::_decode(osd_inst, blist, off); - - bufferlist cbl; - ::_decode(cbl, blist, off); - bufferlist::iterator cblp = cbl.begin(); - crush._decode(cblp); - - crush.update_offload_map(out_osds, overload_osds); - } - - - - - /**** mapping facilities ****/ - - // oid -> pg - ObjectLayout file_to_object_layout(object_t oid, FileLayout& layout) { - return make_object_layout(oid, layout.fl_pg_type, layout.fl_pg_size, layout.fl_pg_preferred, layout.fl_object_stripe_unit); - } - - ObjectLayout make_object_layout(object_t oid, int pg_type, int pg_size, int preferred=-1, int object_stripe_unit = 0) { - int num = preferred >= 0 ? localized_pg_num:pg_num; - int num_mask = preferred >= 0 ? localized_pg_num_mask:pg_num_mask; - - // calculate ps (placement seed) - ps_t ps; - switch (g_conf.osd_object_layout) { - case CEPH_OBJECT_LAYOUT_LINEAR: - ps = stable_mod(oid.bno + oid.ino, num, num_mask); - break; - - case CEPH_OBJECT_LAYOUT_HASHINO: - //ps = stable_mod(oid.bno + H(oid.bno+oid.ino)^H(oid.ino>>32), num, num_mask); - ps = stable_mod(oid.bno + crush_hash32_2(oid.ino, oid.ino>>32), num, num_mask); - break; - - case CEPH_OBJECT_LAYOUT_HASH: - //ps = stable_mod(H( (oid.bno & oid.ino) ^ ((oid.bno^oid.ino) >> 32) ), num, num_mask); - //ps = stable_mod(H(oid.bno) + H(oid.ino)^H(oid.ino>>32), num, num_mask); - //ps = stable_mod(oid.bno + H(oid.bno+oid.ino)^H(oid.bno+oid.ino>>32), num, num_mask); - ps = stable_mod(oid.bno + crush_hash32_2(oid.ino, oid.ino>>32), num, num_mask); - break; - - default: - assert(0); - } - - //cout << "preferred " << preferred << " num " << num << " mask " << num_mask << " ps " << ps << endl; - - // construct object layout - return ObjectLayout(pg_t(pg_type, pg_size, ps, preferred), - object_stripe_unit); - } - - - // pg -> 
(osd list) - int pg_to_osds(pg_t pg, - vector& osds) { // list of osd addr's - // map to osds[] - switch (g_conf.osd_pg_layout) { - case CEPH_PG_LAYOUT_CRUSH: - { - // what crush rule? - int rule; - if (pg.is_rep()) rule = CRUSH_REP_RULE(pg.size()); - else if (pg.is_raid4()) rule = CRUSH_RAID_RULE(pg.size()); - else assert(0); - - // forcefeed? - int forcefeed = -1; - if (pg.preferred() >= 0 && - out_osds.count(pg.preferred()) == 0) - forcefeed = pg.preferred(); - crush.do_rule(rule, - pg.ps(), - osds, pg.size(), - forcefeed); - } - break; - - case CEPH_PG_LAYOUT_LINEAR: - for (int i=0; i= 0 && - g_conf.osd_pg_layout != CEPH_PG_LAYOUT_CRUSH) { - int osd = pg.preferred(); - - // already in there? - if (osds.empty()) { - osds.push_back(osd); - } else { - assert(pg.size() > 0); - for (int i=1; i (up osd list) - int pg_to_acting_osds(pg_t pg, - vector& osds) { // list of osd addr's - // get rush list - vector raw; - pg_to_osds(pg, raw); - - osds.clear(); - for (unsigned i=0; i primary osd - int get_pg_primary(pg_t pg) { - vector group; - int nrep = pg_to_osds(pg, group); - if (nrep) - return group[0]; - return -1; // we fail! - } - - // pg -> acting primary osd - int get_pg_acting_primary(pg_t pg) { - vector group; - int nrep = pg_to_acting_osds(pg, group); - if (nrep > 0) - return group[0]; - return -1; // we fail! - } - int get_pg_acting_tail(pg_t pg) { - vector group; - int nrep = pg_to_acting_osds(pg, group); - if (nrep > 0) - return group[group.size()-1]; - return -1; // we fail! - } - - - /* what replica # is a given osd? 0 primary, -1 for none. 
*/ - int calc_pg_rank(int osd, vector& acting, int nrep=0) { - if (!nrep) nrep = acting.size(); - for (int i=0; i& acting, int nrep=0) { - if (!nrep) nrep = acting.size(); - int rank = calc_pg_rank(osd, acting, nrep); - - if (rank < 0) return PG_ROLE_STRAY; - else if (rank == 0) return PG_ROLE_HEAD; - else if (rank == 1) return PG_ROLE_ACKER; - else return PG_ROLE_MIDDLE; - } - - int get_pg_role(pg_t pg, int osd) { - vector group; - int nrep = pg_to_osds(pg, group); - return calc_pg_role(osd, group, nrep); - } - - /* rank is -1 (stray), 0 (primary), 1,2,3,... (replica) */ - int get_pg_acting_rank(pg_t pg, int osd) { - vector group; - int nrep = pg_to_acting_osds(pg, group); - return calc_pg_rank(osd, group, nrep); - } - /* role is -1 (stray), 0 (primary), 1 (replica) */ - int get_pg_acting_role(pg_t pg, int osd) { - vector group; - int nrep = pg_to_acting_osds(pg, group); - return calc_pg_role(osd, group, nrep); - } - - - - -}; - - -#endif diff --git a/branches/sage/crush/osd/ObjectStore.h b/branches/sage/crush/osd/ObjectStore.h deleted file mode 100644 index c8df5d8218fed..0000000000000 --- a/branches/sage/crush/osd/ObjectStore.h +++ /dev/null @@ -1,611 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __OBJECTSTORE_H -#define __OBJECTSTORE_H - -#include "include/types.h" -#include "osd_types.h" -#include "include/Context.h" -#include "include/buffer.h" - -#include "include/Distribution.h" - -#include - -#ifdef DARWIN -#include -#else -#include /* or */ -#endif /* DARWIN */ - -#include -using std::list; - -#ifndef MIN -# define MIN(a,b) ((a) < (b) ? 
(a):(b)) -#endif - -/* - * low-level interface to the local OSD file system - */ - - - -class ObjectStore { -public: - - - class FragmentationStat { - public: - int total; - int num_extent; - int avg_extent; - map extent_dist; // powers of two - map extent_dist_sum; // powers of two - - float avg_extent_per_object; - int avg_extent_jump; // avg distance bweteen consecutive extents - - int total_free; - int num_free_extent; - int avg_free_extent; - map free_extent_dist; // powers of two - map free_extent_dist_sum; // powers of two - }; - - - - /********************************* - * transaction - */ - class Transaction { - public: - static const int OP_READ = 1; // oid, offset, len, pbl - static const int OP_STAT = 2; // oid, pstat - static const int OP_GETATTR = 3; // oid, attrname, pattrval - static const int OP_GETATTRS = 4; // oid, pattrset - - static const int OP_WRITE = 10; // oid, offset, len, bl - static const int OP_TRUNCATE = 11; // oid, len - static const int OP_REMOVE = 13; // oid - static const int OP_SETATTR = 14; // oid, attrname, attrval - static const int OP_SETATTRS = 15; // oid, attrset - static const int OP_RMATTR = 16; // oid, attrname - static const int OP_CLONE = 17; // oid, newoid - - static const int OP_TRIMCACHE = 18; // oid, offset, len - - static const int OP_MKCOLL = 20; // cid - static const int OP_RMCOLL = 21; // cid - static const int OP_COLL_ADD = 22; // cid, oid - static const int OP_COLL_REMOVE = 23; // cid, oid - static const int OP_COLL_SETATTR = 24; // cid, attrname, attrval - static const int OP_COLL_RMATTR = 25; // cid, attrname - - private: - list ops; - list bls; - list oids; - list cids; - list lengths; - list attrnames; - list attrnames2; - - // for reads only (not encoded) - list pbls; - list psts; - list< pair > pattrvals; - list< map* > pattrsets; - - public: - bool have_op() { - return !ops.empty(); - } - int get_num_ops() { return ops.size(); } - int get_op() { - int op = ops.front(); - ops.pop_front(); - return op; - 
} - void get_bl(bufferlist& bl) { - bl.claim(bls.front()); - bls.pop_front(); - } - void get_oid(object_t& oid) { - oid = oids.front(); - oids.pop_front(); - } - void get_cid(coll_t& cid) { - cid = cids.front(); - cids.pop_front(); - } - void get_length(off_t& len) { - len = lengths.front(); - lengths.pop_front(); - } - void get_attrname(const char * &p) { - p = attrnames.front(); - attrnames.pop_front(); - } - void get_pbl(bufferlist* &pbl) { - pbl = pbls.front(); - pbls.pop_front(); - } - void get_pstat(struct stat* &pst) { - pst = psts.front(); - psts.pop_front(); - } - void get_pattrval(pair& p) { - p = pattrvals.front(); - pattrvals.pop_front(); - } - void get_pattrset(map* &ps) { - ps = pattrsets.front(); - pattrsets.pop_front(); - } - - - void read(object_t oid, off_t off, size_t len, bufferlist *pbl) { - int op = OP_READ; - ops.push_back(op); - oids.push_back(oid); - lengths.push_back(off); - lengths.push_back(len); - pbls.push_back(pbl); - } - void stat(object_t oid, struct stat *st) { - int op = OP_STAT; - ops.push_back(op); - oids.push_back(oid); - psts.push_back(st); - } - void getattr(object_t oid, const char* name, void* val, int *plen) { - int op = OP_GETATTR; - ops.push_back(op); - oids.push_back(oid); - attrnames.push_back(name); - pattrvals.push_back(pair(val,plen)); - } - void getattrs(object_t oid, map& aset) { - int op = OP_GETATTRS; - ops.push_back(op); - oids.push_back(oid); - pattrsets.push_back(&aset); - } - - void write(object_t oid, off_t off, size_t len, const bufferlist& bl) { - int op = OP_WRITE; - ops.push_back(op); - oids.push_back(oid); - lengths.push_back(off); - lengths.push_back(len); - bls.push_back(bl); - } - void trim_from_cache(object_t oid, off_t off, size_t len) { - int op = OP_TRIMCACHE; - ops.push_back(op); - oids.push_back(oid); - lengths.push_back(off); - lengths.push_back(len); - } - void truncate(object_t oid, off_t off) { - int op = OP_TRUNCATE; - ops.push_back(op); - oids.push_back(oid); - lengths.push_back(off); - 
} - void remove(object_t oid) { - int op = OP_REMOVE; - ops.push_back(op); - oids.push_back(oid); - } - void setattr(object_t oid, const char* name, const void* val, int len) { - int op = OP_SETATTR; - ops.push_back(op); - oids.push_back(oid); - attrnames.push_back(name); - //attrvals.push_back(pair(val,len)); - bufferlist bl; - bl.append((char*)val,len); - bls.push_back(bl); - } - void setattrs(object_t oid, map& attrset) { - int op = OP_SETATTRS; - ops.push_back(op); - oids.push_back(oid); - pattrsets.push_back(&attrset); - } - void rmattr(object_t oid, const char* name) { - int op = OP_RMATTR; - ops.push_back(op); - oids.push_back(oid); - attrnames.push_back(name); - } - void clone(object_t oid, object_t noid) { - int op = OP_CLONE; - ops.push_back(op); - oids.push_back(oid); - oids.push_back(noid); - } - void create_collection(coll_t cid) { - int op = OP_MKCOLL; - ops.push_back(op); - cids.push_back(cid); - } - void remove_collection(coll_t cid) { - int op = OP_RMCOLL; - ops.push_back(op); - cids.push_back(cid); - } - void collection_add(coll_t cid, object_t oid) { - int op = OP_COLL_ADD; - ops.push_back(op); - cids.push_back(cid); - oids.push_back(oid); - } - void collection_remove(coll_t cid, object_t oid) { - int op = OP_COLL_REMOVE; - ops.push_back(op); - cids.push_back(cid); - oids.push_back(oid); - } - void collection_setattr(coll_t cid, const char* name, const void* val, int len) { - int op = OP_COLL_SETATTR; - ops.push_back(op); - cids.push_back(cid); - attrnames.push_back(name); - bufferlist bl; - bl.append((char*)val, len); - bls.push_back(bl); - } - void collection_rmattr(coll_t cid, const char* name) { - int op = OP_COLL_RMATTR; - ops.push_back(op); - cids.push_back(cid); - attrnames.push_back(name); - } - - // etc. 
- - void _encode(bufferlist& bl) { - ::_encode(ops, bl); - ::_encode(bls, bl); - ::_encode(oids, bl); - ::_encode(cids, bl); - ::_encode(lengths, bl); - ::_encode(attrnames, bl); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(ops, bl, off); - ::_decode(bls, bl, off); - ::_decode(oids, bl, off); - ::_decode(cids, bl, off); - ::_decode(lengths, bl, off); - ::_decode(attrnames2, bl, off); - for (list::iterator p = attrnames2.begin(); - p != attrnames2.end(); - ++p) - attrnames.push_back((*p).c_str()); - } - }; - - - - /* this implementation is here only for naive ObjectStores that - * do not do atomic transactions natively. it is not atomic. - */ - virtual unsigned apply_transaction(Transaction& t, Context *onsafe=0) { - // non-atomic implementation - while (t.have_op()) { - int op = t.get_op(); - switch (op) { - case Transaction::OP_READ: - { - object_t oid; - off_t offset, len; - t.get_oid(oid); - t.get_length(offset); - t.get_length(len); - bufferlist *pbl; - t.get_pbl(pbl); - read(oid, offset, len, *pbl); - } - break; - case Transaction::OP_STAT: - { - object_t oid; - t.get_oid(oid); - struct stat *st; - t.get_pstat(st); - stat(oid, st); - } - break; - case Transaction::OP_GETATTR: - { - object_t oid; - t.get_oid(oid); - const char *attrname; - t.get_attrname(attrname); - pair pattrval; - t.get_pattrval(pattrval); - *pattrval.second = getattr(oid, attrname, pattrval.first, *pattrval.second); - } - break; - case Transaction::OP_GETATTRS: - { - object_t oid; - t.get_oid(oid); - map *pset; - t.get_pattrset(pset); - getattrs(oid, *pset); - } - break; - - case Transaction::OP_WRITE: - { - object_t oid; - t.get_oid(oid); - off_t offset, len; - t.get_length(offset); - t.get_length(len); - bufferlist bl; - t.get_bl(bl); - write(oid, offset, len, bl, 0); - } - break; - - case Transaction::OP_TRIMCACHE: - { - object_t oid; - t.get_oid(oid); - off_t offset, len; - t.get_length(offset); - t.get_length(len); - trim_from_cache(oid, offset, len); - } - break; - - 
case Transaction::OP_TRUNCATE: - { - object_t oid; - t.get_oid(oid); - off_t len; - t.get_length(len); - truncate(oid, len, 0); - } - break; - - case Transaction::OP_REMOVE: - { - object_t oid; - t.get_oid(oid); - remove(oid, 0); - } - break; - - case Transaction::OP_SETATTR: - { - object_t oid; - t.get_oid(oid); - const char *attrname; - t.get_attrname(attrname); - bufferlist bl; - t.get_bl(bl); - setattr(oid, attrname, bl.c_str(), bl.length(), 0); - } - break; - case Transaction::OP_SETATTRS: - { - object_t oid; - t.get_oid(oid); - map *pattrset; - t.get_pattrset(pattrset); - setattrs(oid, *pattrset, 0); - } - break; - - case Transaction::OP_RMATTR: - { - object_t oid; - t.get_oid(oid); - const char *attrname; - t.get_attrname(attrname); - rmattr(oid, attrname, 0); - } - break; - - case Transaction::OP_CLONE: - { - object_t oid; - t.get_oid(oid); - object_t noid; - t.get_oid(noid); - clone(oid, noid); - } - break; - - case Transaction::OP_MKCOLL: - { - coll_t cid; - t.get_cid(cid); - create_collection(cid, 0); - } - break; - - case Transaction::OP_RMCOLL: - { - coll_t cid; - t.get_cid(cid); - destroy_collection(cid, 0); - } - break; - - case Transaction::OP_COLL_ADD: - { - coll_t cid; - t.get_cid(cid); - object_t oid; - t.get_oid(oid); - collection_add(cid, oid, 0); - } - break; - - case Transaction::OP_COLL_REMOVE: - { - coll_t cid; - t.get_cid(cid); - object_t oid; - t.get_oid(oid); - collection_remove(cid, oid, 0); - } - break; - - case Transaction::OP_COLL_SETATTR: - { - coll_t cid; - t.get_cid(cid); - const char *attrname; - t.get_attrname(attrname); - bufferlist bl; - t.get_bl(bl); - collection_setattr(cid, attrname, bl.c_str(), bl.length(), 0); - } - break; - - case Transaction::OP_COLL_RMATTR: - { - coll_t cid; - t.get_cid(cid); - const char *attrname; - t.get_attrname(attrname); - collection_rmattr(cid, attrname, 0); - } - break; - - - default: - cerr << "bad op " << op << std::endl; - assert(0); - } - } - - if (onsafe) sync(onsafe); - - return 0; // 
FIXME count errors - } - - /*********************************************/ - - - - public: - ObjectStore() {} - virtual ~ObjectStore() {} - - // mgmt - virtual int mount() = 0; - virtual int umount() = 0; - virtual int mkfs() = 0; // wipe - - virtual int statfs(struct statfs *buf) = 0; - - // objects - virtual int pick_object_revision_lt(object_t& oid) = 0; - - virtual bool exists(object_t oid) = 0; // useful? - virtual int stat(object_t oid, struct stat *st) = 0; // struct stat? - - virtual int remove(object_t oid, - Context *onsafe=0) = 0; - - virtual int truncate(object_t oid, off_t size, - Context *onsafe=0) = 0; - - virtual int read(object_t oid, - off_t offset, size_t len, - bufferlist& bl) = 0; - virtual int write(object_t oid, - off_t offset, size_t len, - const bufferlist& bl, - Context *onsafe) = 0;//{ return -1; } - virtual void trim_from_cache(object_t oid, - off_t offset, size_t len) { } - virtual int is_cached(object_t oid, - off_t offset, - size_t len) { return -1; } - - virtual int setattr(object_t oid, const char *name, - const void *value, size_t size, - Context *onsafe=0) {return 0;} //= 0; - virtual int setattrs(object_t oid, map& aset, - Context *onsafe=0) {return 0;} //= 0; - virtual int getattr(object_t oid, const char *name, - void *value, size_t size) {return 0;} //= 0; - virtual int getattrs(object_t oid, map& aset) {return 0;}; - - virtual int rmattr(object_t oid, const char *name, - Context *onsafe=0) {return 0;} - - virtual int clone(object_t oid, object_t noid) { - return -1; - } - - virtual int list_objects(list& ls) = 0;//{ return -1; } - - virtual int get_object_collections(object_t oid, set& ls) { return -1; } - - //virtual int listattr(object_t oid, char *attrs, size_t size) {return 0;} //= 0; - - // collections - virtual int list_collections(list& ls) {return 0;}//= 0; - virtual int create_collection(coll_t c, - Context *onsafe=0) {return 0;}//= 0; - virtual int destroy_collection(coll_t c, - Context *onsafe=0) {return 0;}//= 0; 
- virtual bool collection_exists(coll_t c) {return 0;} - virtual int collection_stat(coll_t c, struct stat *st) {return 0;}//= 0; - virtual int collection_add(coll_t c, object_t o, - Context *onsafe=0) {return 0;}//= 0; - virtual int collection_remove(coll_t c, object_t o, - Context *onsafe=0) {return 0;}// = 0; - virtual int collection_list(coll_t c, list& o) {return 0;}//= 0; - - virtual int collection_setattr(coll_t cid, const char *name, - const void *value, size_t size, - Context *onsafe=0) {return 0;} //= 0; - virtual int collection_rmattr(coll_t cid, const char *name, - Context *onsafe=0) {return 0;} //= 0; - virtual int collection_getattr(coll_t cid, const char *name, - void *value, size_t size) {return 0;} //= 0; - - virtual int collection_getattrs(coll_t cid, map &aset) = 0;//{ return -1; } - virtual int collection_setattrs(coll_t cid, map &aset) = 0;//{ return -1; } - - - //virtual int collection_listattr(coll_t cid, char *attrs, size_t size) {return 0;} //= 0; - - virtual void sync(Context *onsync) {} - virtual void sync() {} - - - virtual void _fake_writes(bool b) {}; - - virtual void _get_frag_stat(FragmentationStat& st) {}; - -}; - - -#endif diff --git a/branches/sage/crush/osd/PG.cc b/branches/sage/crush/osd/PG.cc deleted file mode 100644 index 5b55c9a88e1de..0000000000000 --- a/branches/sage/crush/osd/PG.cc +++ /dev/null @@ -1,1289 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "PG.h" -#include "config.h" -#include "OSD.h" - -#include "common/Timer.h" - -#include "messages/MOSDOp.h" -#include "messages/MOSDPGNotify.h" -#include "messages/MOSDPGLog.h" -#include "messages/MOSDPGRemove.h" -#include "messages/MOSDPGActivateSet.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) *_dout << dbeginl << g_clock.now() << " osd" << osd->whoami << " " << (osd->osdmap ? osd->osdmap->get_epoch():0) << " " << *this << " " - - -/******* PGLog ********/ - -void PG::Log::copy_after(const Log &other, eversion_t v) -{ - assert(v >= other.bottom); - top = bottom = other.top; - for (list::const_reverse_iterator i = other.log.rbegin(); - i != other.log.rend(); - i++) { - if (i->version == v) break; - assert(i->version > v); - log.push_front(*i); - } - bottom = v; -} - -bool PG::Log::copy_after_unless_divergent(const Log &other, eversion_t split, eversion_t floor) -{ - assert(split >= other.bottom); - assert(floor >= other.bottom); - assert(floor <= split); - top = bottom = other.top; - - /* runs on replica. split is primary's log.top. floor is how much they want. - split tell us if the primary is divergent.. e.g.: - -> i am A, B is primary, split is 2'6, floor is 2'2. -A B C -2'2 2'2 -2'3 2'3 2'3 -2'4 2'4 2'4 -3'5 | 2'5 2'5 -3'6 | 2'6 -3'7 | -3'8 | -3'9 | - -> i return full backlog. - */ - - for (list::const_reverse_iterator i = other.log.rbegin(); - i != other.log.rend(); - i++) { - // is primary divergent? - // e.g. my 3'6 vs their 2'6 split - if (i->version.version == split.version && i->version.epoch > split.epoch) { - clear(); - return false; // divergent! - } - if (i->version == floor) break; - assert(i->version > floor); - - // e.g. 
my 2'23 > '12 - log.push_front(*i); - } - bottom = floor; - return true; -} - -void PG::Log::copy_non_backlog(const Log &other) -{ - if (other.backlog) { - top = other.top; - bottom = other.bottom; - for (list::const_reverse_iterator i = other.log.rbegin(); - i != other.log.rend(); - i++) - if (i->version > bottom) - log.push_front(*i); - else - break; - } else { - *this = other; - } -} - - - -void PG::IndexedLog::trim(ObjectStore::Transaction& t, eversion_t s) -{ - if (backlog && s < bottom) - s = bottom; - - while (!log.empty()) { - Entry &e = *log.begin(); - - if (e.version > s) break; - - assert(complete_to != log.begin()); - assert(requested_to != log.begin()); - - // remove from index, - unindex(e); - - // from log - log.pop_front(); - } - - // raise bottom? - if (backlog) backlog = false; - if (bottom < s) bottom = s; -} - - -void PG::IndexedLog::trim_write_ahead(eversion_t last_update) -{ - while (!log.empty() && - log.rbegin()->version > last_update) { - // remove from index - unindex(*log.rbegin()); - - // remove - log.pop_back(); - } -} - -void PG::trim_write_ahead() -{ - if (info.last_update < log.top) { - dout(10) << "trim_write_ahead (" << info.last_update << "," << log.top << "]" << dendl; - log.trim_write_ahead(info.last_update); - } else { - assert(info.last_update == log.top); - dout(10) << "trim_write_ahead last_update=top=" << info.last_update << dendl; - } - -} - -void PG::proc_replica_log(Log &olog, Missing& omissing, int from) -{ - dout(10) << "proc_replica_log for osd" << from << ": " << olog << " " << omissing << dendl; - assert(!is_active()); - - if (!have_master_log) { - // i'm building master log. - // note peer's missing. - peer_missing[from] = omissing; - - // merge log into our own log - merge_log(olog, omissing, from); - proc_missing(olog, omissing, from); - } else { - // i'm just building missing lists. - peer_missing[from] = omissing; - - // iterate over peer log. in reverse. 
- list::reverse_iterator pp = olog.log.rbegin(); - eversion_t lu = peer_info[from].last_update; - while (pp != olog.log.rend()) { - if (!log.objects.count(pp->oid)) { - dout(10) << " divergent " << *pp << " not in our log, generating backlog" << dendl; - generate_backlog(); - } - - if (!log.objects.count(pp->oid)) { - dout(10) << " divergent " << *pp << " dne, must have been new, ignoring" << dendl; - ++pp; - continue; - } - - if (log.objects[pp->oid]->version == pp->version) { - break; // we're no longer divergent. - //++pp; - //continue; - } - - if (log.objects[pp->oid]->version > pp->version) { - dout(10) << " divergent " << *pp - << " superceded by " << log.objects[pp->oid] - << ", ignoring" << dendl; - } else { - dout(10) << " divergent " << *pp << ", adding to missing" << dendl; - peer_missing[from].add(pp->oid, pp->version); - } - - ++pp; - if (pp != olog.log.rend()) - lu = pp->version; - else - lu = olog.bottom; - } - - if (lu < peer_info[from].last_update) { - dout(10) << " peer osd" << from << " last_update now " << lu << dendl; - peer_info[from].last_update = lu; - if (lu < oldest_update) { - dout(10) << " oldest_update now " << lu << dendl; - oldest_update = lu; - } - } - - proc_missing(olog, peer_missing[from], from); - } -} - -void PG::merge_log(Log &olog, Missing &omissing, int fromosd) -{ - dout(10) << "merge_log " << olog << " from osd" << fromosd - << " into " << log << dendl; - - //dout(0) << "log" << dendl; - //log.print(cout); - //dout(0) << "olog" << dendl; - //olog.print(cout); - - if (log.empty() || - (olog.bottom > log.top && olog.backlog)) { // e.g. log=(0,20] olog=(40,50]+backlog) - - // swap and index - log.log.swap(olog.log); - log.index(); - - // find split point (old log.top) in new log - // add new items to missing along the way. - for (list::reverse_iterator p = log.log.rbegin(); - p != log.log.rend(); - p++) { - if (p->version <= log.top) { - // ok, p is at split point. - - // was our old log divergent? 
- if (log.top > p->version) { - dout(10) << "merge_log i was (possibly) divergent for (" << p->version << "," << log.top << "]" << dendl; - if (p->version < oldest_update) - oldest_update = p->version; - - while (!olog.log.empty() && - olog.log.rbegin()->version > p->version) { - Log::Entry &oe = *olog.log.rbegin(); // old entry (possibly divergent) - if (log.objects.count(oe.oid)) { - if (log.objects[oe.oid]->version < oe.version) { - dout(10) << "merge_log divergent entry " << oe - << " not superceded by " << *log.objects[oe.oid] - << ", adding to missing" << dendl; - missing.add(oe.oid, oe.version); - } else { - dout(10) << "merge_log divergent entry " << oe - << " superceded by " << *log.objects[oe.oid] - << ", ignoring" << dendl; - } - } else { - dout(10) << "merge_log divergent entry " << oe << ", adding to missing" << dendl; - missing.add(oe.oid, oe.version); - } - olog.log.pop_back(); // discard divergent entry - } - } - break; - } - - if (p->is_delete()) { - dout(10) << "merge_log merging " << *p << ", not missing" << dendl; - missing.rm(p->oid, p->version); - } else { - dout(10) << "merge_log merging " << *p << ", now missing" << dendl; - missing.add(p->oid, p->version); - } - } - - info.last_update = log.top = olog.top; - info.log_bottom = log.bottom = olog.bottom; - info.log_backlog = log.backlog = olog.backlog; - } - - else { - // i can merge the two logs! - - // extend on bottom? - // FIXME: what if we have backlog, but they have lower bottom? - if (olog.bottom < log.bottom && olog.top >= log.bottom && !log.backlog) { - dout(10) << "merge_log extending bottom to " << olog.bottom - << (olog.backlog ? " +backlog":"") - << dendl; - - // ok - list::iterator from = olog.log.begin(); - list::iterator to; - for (to = from; - to != olog.log.end(); - to++) { - if (to->version > log.bottom) break; - - // update our index while we're here - log.index(*to); - - dout(15) << *to << dendl; - - // new missing object? 
- if (to->version > info.last_complete) { - if (to->is_update()) - missing.add(to->oid, to->version); - else - missing.rm(to->oid, to->version); - } - } - assert(to != olog.log.end()); - - // splice into our log. - log.log.splice(log.log.begin(), - olog.log, from, to); - - info.log_bottom = log.bottom = olog.bottom; - info.log_backlog = log.backlog = olog.backlog; - } - - // extend on top? - if (olog.top > log.top && - olog.bottom <= log.top) { - dout(10) << "merge_log extending top to " << olog.top << dendl; - - list::iterator to = olog.log.end(); - list::iterator from = olog.log.end(); - while (1) { - if (from == olog.log.begin()) break; - from--; - //dout(0) << "? " << *from << dendl; - if (from->version < log.top) { - from++; - break; - } - - log.index(*from); - dout(10) << "merge_log " << *from << dendl; - - // add to missing - if (from->is_update()) { - missing.add(from->oid, from->version); - } else - missing.rm(from->oid, from->version); - } - - // remove divergent items - while (1) { - Log::Entry *oldtail = &(*log.log.rbegin()); - if (oldtail->version.version+1 == from->version.version) break; - - // divergent! - assert(oldtail->version.version >= from->version.version); - - if (log.objects[oldtail->oid]->version == oldtail->version) { - // and significant. - dout(10) << "merge_log had divergent " << *oldtail << ", adding to missing" << dendl; - //missing.add(oldtail->oid); - assert(0); - } else { - dout(10) << "merge_log had divergent " << *oldtail << ", already missing" << dendl; - assert(missing.is_missing(oldtail->oid)); - } - log.log.pop_back(); - } - - // splice - log.log.splice(log.log.end(), - olog.log, from, to); - - info.last_update = log.top = olog.top; - } - } - - dout(10) << "merge_log result " << log << " " << missing << dendl; - //log.print(cout); - -} - -void PG::proc_missing(Log &olog, Missing &omissing, int fromosd) -{ - // found items? 
- for (map::iterator p = missing.missing.begin(); - p != missing.missing.end(); - p++) { - if (omissing.is_missing(p->first)) { - assert(omissing.is_missing(p->first, p->second)); - if (omissing.loc.count(p->first)) { - dout(10) << "proc_missing missing " << p->first << " " << p->second - << " on osd" << omissing.loc[p->first] << dendl; - missing.loc[p->first] = omissing.loc[p->first]; - } else { - dout(10) << "proc_missing missing " << p->first << " " << p->second - << " also LOST on source, osd" << fromosd << dendl; - } - } - else if (p->second <= olog.top) { - dout(10) << "proc_missing missing " << p->first << " " << p->second - << " on source, osd" << fromosd << dendl; - missing.loc[p->first] = fromosd; - } else { - dout(10) << "proc_missing " << p->first << " " << p->second - << " > olog.top " << olog.top << ", not found...." - << dendl; - } - } - - dout(10) << "proc_missing missing " << missing.missing << dendl; -} - - - -void PG::generate_backlog() -{ - dout(10) << "generate_backlog to " << log << dendl; - assert(!log.backlog); - log.backlog = true; - - list olist; - osd->store->collection_list(info.pgid, olist); - - int local = 0; - map add; - for (list::iterator it = olist.begin(); - it != olist.end(); - it++) { - local++; - - if (log.logged_object(*it)) continue; // already have it logged. - - // add entry - Log::Entry e; - e.op = Log::Entry::MODIFY; // FIXME when we do smarter op codes! 
- e.oid = *it; - osd->store->getattr(*it, - "version", - &e.version, sizeof(e.version)); - add[e.version] = e; - dout(10) << "generate_backlog found " << e << dendl; - } - - for (map::reverse_iterator i = add.rbegin(); - i != add.rend(); - i++) { - log.log.push_front(i->second); - log.index( *log.log.begin() ); // index - } - - dout(10) << local << " local objects, " - << add.size() << " objects added to backlog, " - << log.objects.size() << " in pg" << dendl; - - //log.print(cout); -} - -void PG::drop_backlog() -{ - dout(10) << "drop_backlog for " << log << dendl; - //log.print(cout); - - assert(log.backlog); - log.backlog = false; - - while (!log.log.empty()) { - Log::Entry &e = *log.log.begin(); - if (e.version > log.bottom) break; - - dout(15) << "drop_backlog trimming " << e.version << dendl; - log.unindex(e); - log.log.pop_front(); - } -} - - - - - -ostream& PG::Log::print(ostream& out) const -{ - out << *this << dendl; - for (list::const_iterator p = log.begin(); - p != log.end(); - p++) - out << *p << dendl; - return out; -} - - - - - -/******* PG ***********/ -void PG::build_prior() -{ - // build prior set. 
- prior_set.clear(); - - // current - for (unsigned i=1; iosdmap->get_epoch(); - epoch++) { - OSDMap omap; - osd->get_map(epoch, omap); - - vector acting; - omap.pg_to_acting_osds(get_pgid(), acting); - - for (unsigned i=0; iosdmap->is_up(acting[i]) && // is up now - acting[i] != osd->whoami) // and is not me - prior_set.insert(acting[i]); - } - } - - dout(10) << "build_prior built " << prior_set << dendl; -} - -void PG::adjust_prior() -{ - assert(!prior_set.empty()); - - // raise last_epoch_started_any - epoch_t max = 0; - for (map::iterator it = peer_info.begin(); - it != peer_info.end(); - it++) { - if (it->second.last_epoch_started > max) - max = it->second.last_epoch_started; - } - - dout(10) << "adjust_prior last_epoch_started_any " - << last_epoch_started_any << " -> " << max << dendl; - assert(max > last_epoch_started_any); - last_epoch_started_any = max; - - // rebuild prior set - build_prior(); -} - - -void PG::clear_primary_state() -{ - dout(10) << "clear_primary_state" << dendl; - - // clear peering state - have_master_log = false; - prior_set.clear(); - stray_set.clear(); - uptodate_set.clear(); - peer_info_requested.clear(); - peer_log_requested.clear(); - peer_info.clear(); - peer_missing.clear(); - - stat_object_temp_rd.clear(); - - last_epoch_started_any = info.last_epoch_started; -} - -void PG::peer(ObjectStore::Transaction& t, - map< int, map >& query_map, - map *activator_map) -{ - dout(10) << "peer. acting is " << acting - << ", prior_set is " << prior_set << dendl; - - - /** GET ALL PG::Info *********/ - - // -- query info from everyone in prior_set. 
- bool missing_info = false; - for (set::iterator it = prior_set.begin(); - it != prior_set.end(); - it++) { - if (peer_info.count(*it)) { - dout(10) << " have info from osd" << *it - << ": " << peer_info[*it] - << dendl; - continue; - } - missing_info = true; - - if (peer_info_requested.count(*it)) { - dout(10) << " waiting for osd" << *it << dendl; - continue; - } - - dout(10) << " querying info from osd" << *it << dendl; - query_map[*it][info.pgid] = Query(Query::INFO, info.history); - peer_info_requested.insert(*it); - } - if (missing_info) return; - - - // -- ok, we have all (prior_set) info. (and maybe others.) - - // did we crash? - dout(10) << " last_epoch_started_any " << last_epoch_started_any << dendl; - if (last_epoch_started_any) { - OSDMap omap; - osd->get_map(last_epoch_started_any, omap); - - // start with the last active set of replicas - set last_started; - vector acting; - bool cleanly_down = true; - omap.pg_to_acting_osds(get_pgid(), acting); - for (unsigned i=0; iosdmap->get_epoch(); - e++) { - OSDMap omap; - osd->get_map(e, omap); - - set still_up; - - for (set::iterator i = last_started.begin(); - i != last_started.end(); - i++) { - //dout(10) << " down in epoch " << e << " is " << omap.get_down_osds() << dendl; - if (omap.is_up(*i)) - still_up.insert(*i); - else if (!omap.is_down_clean(*i)) - cleanly_down = false; - } - - last_started.swap(still_up); - //dout(10) << " still active as of epoch " << e << ": " << last_started << dendl; - } - - if (last_started.empty()) { - if (cleanly_down) { - dout(10) << " cleanly stopped since epoch " << last_epoch_started_any << dendl; - } else { - dout(10) << " crashed since epoch " << last_epoch_started_any << dendl; - state_set(STATE_CRASHED); - } - } else { - dout(10) << " still active from last started: " << last_started << dendl; - } - } else if (osd->osdmap->post_mkfs()) { - dout(10) << " crashed since epoch " << last_epoch_started_any << dendl; - state_set(STATE_CRASHED); - } - - dout(10) << " 
peers_complete_thru " << peers_complete_thru << dendl; - - - - - /** CREATE THE MASTER PG::Log *********/ - - // who (of all priors and active) has the latest PG version? - eversion_t newest_update = info.last_update; - int newest_update_osd = osd->whoami; - - oldest_update = info.last_update; // only of acting (current) osd set. - peers_complete_thru = info.last_complete; - - for (map::iterator it = peer_info.begin(); - it != peer_info.end(); - it++) { - if (it->second.last_update > newest_update) { - newest_update = it->second.last_update; - newest_update_osd = it->first; - } - if (is_acting(it->first)) { - if (it->second.last_update < oldest_update) - oldest_update = it->second.last_update; - if (it->second.last_complete < peers_complete_thru) - peers_complete_thru = it->second.last_complete; - } - } - - // gather log(+missing) from that person! - if (newest_update_osd != osd->whoami) { - if (peer_log_requested.count(newest_update_osd) || - peer_summary_requested.count(newest_update_osd)) { - dout(10) << " newest update on osd" << newest_update_osd - << " v " << newest_update - << ", already queried" - << dendl; - } else { - // we'd like it back to oldest_update, but will settle for log_bottom - eversion_t since = MAX(peer_info[newest_update_osd].log_bottom, - oldest_update); - if (peer_info[newest_update_osd].log_bottom < log.top) { - dout(10) << " newest update on osd" << newest_update_osd - << " v " << newest_update - << ", querying since " << since - << dendl; - query_map[newest_update_osd][info.pgid] = Query(Query::LOG, log.top, since, info.history); - peer_log_requested.insert(newest_update_osd); - } else { - dout(10) << " newest update on osd" << newest_update_osd - << " v " << newest_update - << ", querying entire summary/backlog" - << dendl; - assert((peer_info[newest_update_osd].last_complete >= - peer_info[newest_update_osd].log_bottom) || - peer_info[newest_update_osd].log_backlog); // or else we're in trouble. 
- query_map[newest_update_osd][info.pgid] = Query(Query::BACKLOG, info.history); - peer_summary_requested.insert(newest_update_osd); - } - } - return; - } else { - dout(10) << " newest_update " << info.last_update << " (me)" << dendl; - } - - dout(10) << " oldest_update " << oldest_update << dendl; - - have_master_log = true; - - - // -- do i need to generate backlog for any of my peers? - if (oldest_update < log.bottom && !log.backlog) { - dout(10) << "generating backlog for some peers, bottom " - << log.bottom << " > " << oldest_update - << dendl; - generate_backlog(); - } - - - /** COLLECT MISSING+LOG FROM PEERS **********/ - /* - we also detect divergent replicas here by pulling the full log - from everyone. - */ - - // gather missing from peers - for (unsigned i=1; i 0) { - dout(10) << "there are still " << missing.num_lost() << " lost objects" << dendl; - - // ***** - // FIXME: i don't think this actually accomplishes anything! - // ***** - - // ok, let's get more summaries! - bool waiting = false; - for (map::iterator it = peer_info.begin(); - it != peer_info.end(); - it++) { - int peer = it->first; - - if (peer_summary_requested.count(peer)) { - dout(10) << " already requested summary/backlog from osd" << peer << dendl; - waiting = true; - continue; - } - - dout(10) << " requesting summary/backlog from osd" << peer << dendl; - query_map[peer][info.pgid] = Query(Query::BACKLOG, info.history); - peer_summary_requested.insert(peer); - waiting = true; - } - - if (!waiting) { - dout(10) << missing.num_lost() << " objects are still lost, waiting+hoping for a notify from someone else!" << dendl; - } - return; - } - - // sanity check - assert(missing.num_lost() == 0); - assert(info.last_complete >= log.bottom || log.backlog); - - - // -- crash recovery? 
- if (is_crashed()) { - dout(10) << "crashed, allowing op replay for " << g_conf.osd_replay_window << dendl; - state_set(STATE_REPLAY); - osd->timer.add_event_after(g_conf.osd_replay_window, - new OSD::C_Activate(osd, info.pgid, osd->osdmap->get_epoch())); - } - else if (!is_active()) { - // -- ok, activate! - activate(t, activator_map); - } -} - - -void PG::activate(ObjectStore::Transaction& t, - map *activator_map) -{ - assert(!is_active()); - - // twiddle pg state - state_set(STATE_ACTIVE); - state_clear(STATE_STRAY); - if (is_crashed()) { - //assert(is_replay()); // HELP.. not on replica? - state_clear(STATE_CRASHED); - state_clear(STATE_REPLAY); - } - info.last_epoch_started = osd->osdmap->get_epoch(); - - if (role == 0) { // primary state - peers_complete_thru = 0; // we don't know (yet)! - } - - assert(info.last_complete >= log.bottom || log.backlog); - - // write pg info - t.collection_setattr(info.pgid, "info", (char*)&info, sizeof(info)); - - // write log - write_log(t); - - // clean up stray objects - clean_up_local(t); - - // init complete pointer - if (info.last_complete == info.last_update) { - dout(10) << "activate - complete" << dendl; - log.complete_to == log.log.end(); - log.requested_to = log.log.end(); - } - else if (true) { - dout(10) << "activate - not complete, " << missing << dendl; - - // init complete_to - log.complete_to = log.log.begin(); - while (log.complete_to->version < info.last_complete) { - log.complete_to++; - assert(log.complete_to != log.log.end()); - } - - if (is_primary()) { - // start recovery - dout(10) << "activate - starting recovery" << dendl; - log.requested_to = log.complete_to; - do_recovery(); - } - } else { - dout(10) << "activate - not complete, " << missing << dendl; - } - - // if primary.. - if (role == 0 && - (!g_conf.osd_hack_fast_startup || osd->osdmap->post_mkfs())) { - // who is clean? 
- uptodate_set.clear(); - if (info.is_uptodate()) - uptodate_set.insert(osd->whoami); - - // start up replicas - for (unsigned i=1; icount(peer) == 0) - (*activator_map)[peer] = new MOSDPGActivateSet(osd->osdmap->get_epoch()); - (*activator_map)[peer]->pg_info.push_back(info); - } else { - dout(10) << "activate - peer osd" << peer << " is up to date, but sending pg_log anyway" << dendl; - m = new MOSDPGLog(osd->osdmap->get_epoch(), info); - } - } - else { - m = new MOSDPGLog(osd->osdmap->get_epoch(), info); - if (peer_info[peer].last_update < log.bottom) { - // summary/backlog - assert(log.backlog); - m->log = log; - } else { - // incremental log - assert(peer_info[peer].last_update < info.last_update); - m->log.copy_after(log, peer_info[peer].last_update); - } - } - - // update local version of peer's missing list! - if (m) { - eversion_t plu = peer_info[peer].last_update; - Missing& pm = peer_missing[peer]; - for (list::iterator p = m->log.log.begin(); - p != m->log.log.end(); - p++) - if (p->version > plu) - pm.add(p->oid, p->version); - } - - if (m) { - dout(10) << "activate sending " << m->log << " " << m->missing - << " to osd" << peer << dendl; - //m->log.print(cout); - osd->messenger->send_message(m, osd->osdmap->get_inst(peer)); - } - - // update our missing - if (peer_missing[peer].num_missing() == 0) { - dout(10) << "activate peer osd" << peer << " already uptodate, " << peer_info[peer] << dendl; - assert(peer_info[peer].is_uptodate()); - uptodate_set.insert(peer); - } else { - dout(10) << "activate peer osd" << peer << " " << peer_info[peer] - << " missing " << peer_missing[peer] << dendl; - } - - } - - // discard unneeded peering state - //peer_log.clear(); // actually, do this carefully, in case peer() is called again. - - // all clean? 
- if (is_all_uptodate()) - finish_recovery(); - else { - dout(10) << "activate not all replicas are uptodate, starting recovery" << dendl; - do_recovery(); - } - } - - - // replay (queue them _before_ other waiting ops!) - if (!replay_queue.empty()) { - eversion_t c = info.last_update; - list replay; - for (map::iterator p = replay_queue.begin(); - p != replay_queue.end(); - p++) { - if (p->first <= info.last_update) { - dout(10) << "activate will WRNOOP " << p->first << " " << *p->second << dendl; - replay.push_back(p->second); - continue; - } - if (p->first.version != c.version+1) { - dout(10) << "activate replay " << p->first - << " skipping " << c.version+1 - p->first.version - << " ops" - << dendl; - } - dout(10) << "activate replay " << p->first << " " << *p->second << dendl; - replay.push_back(p->second); - c = p->first; - } - replay_queue.clear(); - osd->take_waiters(replay); - } - - if (is_primary()) - update_stats(); // update stats - - // waiters - osd->take_waiters(waiting_for_active); -} - - -void PG::finish_recovery() -{ - dout(10) << "finish_recovery" << dendl; - - state_set(PG::STATE_CLEAN); - purge_strays(); - update_stats(); -} - - - -void PG::update_stats() -{ - dout(15) << "update_stats" << dendl; - assert(is_primary()); - - // update our stat summary - pg_stats_lock.Lock(); - pg_stats.reported = info.last_update; - pg_stats.state = state; - pg_stats.size = stat_size; - pg_stats.num_blocks = stat_num_blocks; - pg_stats_lock.Unlock(); - - // put in osd stat_queue - osd->pg_stat_queue_lock.Lock(); - osd->pg_stat_queue.insert(info.pgid); - osd->pg_stat_queue_lock.Unlock(); -} - - -void PG::write_log(ObjectStore::Transaction& t) -{ - dout(10) << "write_log" << dendl; - - // assemble buffer - bufferlist bl; - - // build buffer - ondisklog.bottom = 0; - ondisklog.block_map.clear(); - for (list::iterator p = log.log.begin(); - p != log.log.end(); - p++) { - if (bl.length() % 4096 == 0) - ondisklog.block_map[bl.length()] = p->version; - 
bl.append((char*)&(*p), sizeof(*p)); - if (g_conf.osd_pad_pg_log) { // pad to 4k, until i fix ebofs reallocation crap. FIXME. - bufferptr bp(4096 - sizeof(*p)); - bl.push_back(bp); - } - } - ondisklog.top = bl.length(); - - // write it - t.remove( info.pgid.to_object() ); - t.write( info.pgid.to_object() , 0, bl.length(), bl); - t.collection_setattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom)); - t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); - - t.collection_setattr(info.pgid, "info", &info, sizeof(info)); -} - -void PG::trim_ondisklog_to(ObjectStore::Transaction& t, eversion_t v) -{ - dout(15) << " trim_ondisk_log_to v " << v << dendl; - - map::iterator p = ondisklog.block_map.begin(); - while (p != ondisklog.block_map.end()) { - dout(15) << " " << p->first << " -> " << p->second << dendl; - p++; - if (p == ondisklog.block_map.end() || - p->second > v) { // too far! - p--; // back up - break; - } - } - dout(15) << " * " << p->first << " -> " << p->second << dendl; - if (p == ondisklog.block_map.begin()) - return; // can't trim anything! - - // we can trim! 
- off_t trim = p->first; - dout(10) << " trimming ondisklog to [" << ondisklog.bottom << "," << ondisklog.top << ")" << dendl; - - assert(trim >= ondisklog.bottom); - ondisklog.bottom = trim; - - // adjust block_map - while (p != ondisklog.block_map.begin()) - ondisklog.block_map.erase(ondisklog.block_map.begin()); - - t.collection_setattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom)); - t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); -} - - -void PG::append_log(ObjectStore::Transaction& t, PG::Log::Entry& logentry, - eversion_t trim_to) -{ - dout(10) << "append_log " << ondisklog.top << " " << logentry << dendl; - - // write entry on disk - bufferlist bl; - bl.append( (char*)&logentry, sizeof(logentry) ); - if (g_conf.osd_pad_pg_log) { // pad to 4k, until i fix ebofs reallocation crap. FIXME. - bufferptr bp(4096 - sizeof(logentry)); - bl.push_back(bp); - } - t.write( info.pgid.to_object(), ondisklog.top, bl.length(), bl ); - - // update block map? - if (ondisklog.top % 4096 == 0) - ondisklog.block_map[ondisklog.top] = logentry.version; - - ondisklog.top += bl.length(); - t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); - - // trim? 
- if (trim_to > log.bottom) { - dout(10) << " trimming " << log << " to " << trim_to << dendl; - log.trim(t, trim_to); - info.log_bottom = log.bottom; - info.log_backlog = log.backlog; - trim_ondisklog_to(t, trim_to); - } - dout(10) << " ondisklog [" << ondisklog.bottom << "," << ondisklog.top << ")" << dendl; -} - -void PG::read_log(ObjectStore *store) -{ - int r; - // load bounds - ondisklog.bottom = ondisklog.top = 0; - r = store->collection_getattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom)); - assert(r == sizeof(ondisklog.bottom)); - r = store->collection_getattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); - assert(r == sizeof(ondisklog.top)); - - dout(10) << "read_log [" << ondisklog.bottom << "," << ondisklog.top << ")" << dendl; - - log.backlog = info.log_backlog; - log.bottom = info.log_bottom; - - if (ondisklog.top > 0) { - // read - bufferlist bl; - store->read(info.pgid.to_object(), ondisklog.bottom, ondisklog.top-ondisklog.bottom, bl); - if (bl.length() < ondisklog.top-ondisklog.bottom) { - dout(0) << "read_log data doesn't match attrs" << dendl; - assert(0); - } - - PG::Log::Entry e; - off_t pos = ondisklog.bottom; - assert(log.log.empty()); - while (pos < ondisklog.top) { - bl.copy(pos-ondisklog.bottom, sizeof(e), (char*)&e); - dout(10) << "read_log " << pos << " " << e << dendl; - - if (e.version > log.bottom || log.backlog) { // ignore items below log.bottom - if (pos % 4096 == 0) - ondisklog.block_map[pos] = e.version; - log.log.push_back(e); - } else { - dout(10) << "read_log ignoring entry at " << pos << dendl; - } - - if (g_conf.osd_pad_pg_log) // pad to 4k, until i fix ebofs reallocation crap. FIXME. 
- pos += 4096; - else - pos += sizeof(e); - } - } - log.top = info.last_update; - log.index(); - - // build missing - set did; - for (list::reverse_iterator i = log.log.rbegin(); - i != log.log.rend(); - i++) { - if (i->version <= info.last_complete) break; - if (did.count(i->oid)) continue; - did.insert(i->oid); - - if (i->is_delete()) continue; - - eversion_t v; - int r = osd->store->getattr(i->oid, "version", &v, sizeof(v)); - if (r < 0 || v < i->version) - missing.add(i->oid, i->version); - } -} - - - - -// ============================== -// Object locking - -// -// If the target object of the operation op is locked for writing by another client, the function puts op to the waiting queue waiting_for_wr_unlock -// returns true if object was locked, otherwise returns false -// -bool PG::block_if_wrlocked(MOSDOp* op) -{ - object_t oid = op->get_oid(); - - entity_name_t source; - int len = osd->store->getattr(oid, "wrlock", &source, sizeof(entity_name_t)); - //dout(0) << "getattr returns " << len << " on " << oid << dendl; - - if (len == sizeof(source) && - source != op->get_client()) { - //the object is locked for writing by someone else -- add the op to the waiting queue - waiting_for_wr_unlock[oid].push_back(op); - return true; - } - - return false; //the object wasn't locked, so the operation can be handled right away -} - - - - -// ======================= -// revisions - - -/* -int OSD::list_missing_revs(object_t oid, set& revs, PG *pg) -{ - int c = 0; - oid.rev = 0; - - map::iterator p = pg->missing.missing.lower_bound(oid); - if (p == pg->missing.missing.end()) - return 0; // clearly not - - while (p->first.ino == oid.ino && - p->first.bno == oid.bno) { - revs.insert(p->first); - c++; - } - return c; -}*/ - -bool PG::pick_missing_object_rev(object_t& oid) -{ - map::iterator p = missing.missing.upper_bound(oid); - if (p == missing.missing.end()) - return false; // clearly no candidate - - if (p->first.ino == oid.ino && p->first.bno == oid.bno) { - oid = 
p->first; // yes! it's an upper bound revision for me. - return true; - } - return false; -} - -bool PG::pick_object_rev(object_t& oid) -{ - object_t t = oid; - - if (!osd->store->pick_object_revision_lt(t)) - return false; // we have no revisions of this object! - - objectrev_t crev; - int r = osd->store->getattr(t, "crev", &crev, sizeof(crev)); - assert(r >= 0); - if (crev <= oid.rev) { - dout(10) << "pick_object_rev choosing " << t << " crev " << crev << " for " << oid << dendl; - oid = t; - return true; - } - - return false; -} - - - - - diff --git a/branches/sage/crush/osd/PG.h b/branches/sage/crush/osd/PG.h deleted file mode 100644 index 0e14ea3a2ed63..0000000000000 --- a/branches/sage/crush/osd/PG.h +++ /dev/null @@ -1,754 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __PG_H -#define __PG_H - - -#include "include/types.h" -#include "osd_types.h" -#include "include/buffer.h" - -#include "OSDMap.h" -#include "ObjectStore.h" -#include "msg/Messenger.h" - -#include "common/DecayCounter.h" - -#include -#include -using namespace std; - -#include -#include -using namespace __gnu_cxx; - - -class OSD; -class MOSDOp; -class MOSDOpReply; -class MOSDPGActivateSet; - -/** PG - Replica Placement Group - * - */ - -class PG { -public: - - /* - * PG::Info - summary of PG statistics. - * - * some notes: - * - last_complete implies we have all objects that existed as of that - * stamp, OR a newer object, OR have already applied a later delete. - * - if last_complete >= log.bottom, then we know pg contents thru log.top. - * otherwise, we have no idea what the pg is supposed to contain. 
- */ - struct Info { - pg_t pgid; - eversion_t last_update; // last object version applied to store. - eversion_t last_complete; // last version pg was complete through. - - eversion_t log_bottom; // oldest log entry. - bool log_backlog; // do we store a complete log? - - epoch_t last_epoch_started; // last epoch started. - epoch_t last_epoch_finished; // last epoch finished. - - struct History { - epoch_t same_since; // same acting set since - epoch_t same_primary_since; // same primary at least back through this epoch. - epoch_t same_acker_since; // same acker at least back through this epoch. - History() : same_since(0), same_primary_since(0), same_acker_since(0) {} - } history; - - Info(pg_t p=0) : pgid(p), - log_backlog(false), - last_epoch_started(0), last_epoch_finished(0) {} - bool is_uptodate() const { return last_update == last_complete; } - bool is_empty() const { return last_update.version == 0; } - }; - - - /** - * Query - used to ask a peer for information about a pg. - * - * note: if version=0, type=LOG, then we just provide our full log. - * only if type=BACKLOG do we generate a backlog and provide that too. - */ - struct Query { - const static int INFO = 0; - const static int LOG = 1; - const static int BACKLOG = 2; - const static int FULLLOG = 3; - - int type; - eversion_t split, floor; - Info::History history; - - Query() : type(-1) {} - Query(int t, Info::History& h) : - type(t), history(h) { assert(t != LOG); } - Query(int t, eversion_t s, eversion_t f, Info::History& h) : - type(t), split(s), floor(f), history(h) { assert(t == LOG); } - }; - - - /* - * Missing - summary of missing objects. - * kept in memory, as a supplement to Log. - * also used to pass missing info in messages. - */ - class Missing { - public: - map missing; // oid -> v - map rmissing; // v -> oid - - map loc; // where i think i can get them. 
- - int num_lost() const { return missing.size() - loc.size(); } - int num_missing() const { return missing.size(); } - - bool is_missing(object_t oid) { - return missing.count(oid); - } - bool is_missing(object_t oid, eversion_t v) { - return missing.count(oid) && missing[oid] <= v; - } - void add(object_t oid) { - eversion_t z; - add(oid,z); - } - void add(object_t oid, eversion_t v) { - if (missing.count(oid)) { - if (missing[oid] > v) return; // already missing newer. - rmissing.erase(missing[oid]); - } - missing[oid] = v; - rmissing[v] = oid; - } - void rm(object_t oid, eversion_t when) { - if (missing.count(oid) && missing[oid] < when) { - rmissing.erase(missing[oid]); - missing.erase(oid); - loc.erase(oid); - } - } - void got(object_t oid, eversion_t v) { - assert(missing.count(oid)); - assert(missing[oid] <= v); - loc.erase(oid); - rmissing.erase(missing[oid]); - missing.erase(oid); - } - void got(object_t oid) { - assert(missing.count(oid)); - loc.erase(oid); - rmissing.erase(missing[oid]); - missing.erase(oid); - } - - void _encode(bufferlist& blist) { - ::_encode(missing, blist); - ::_encode(loc, blist); - } - void _decode(bufferlist& blist, int& off) { - ::_decode(missing, blist, off); - ::_decode(loc, blist, off); - - for (map::iterator it = missing.begin(); - it != missing.end(); - it++) - rmissing[it->second] = it->first; - } - }; - - - /* - * Log - incremental log of recent pg changes. - * also, serves as a recovery queue. - * - * when backlog is true, - * objects with versions <= bottom are in log. - * we do not have any deletion info before that time, however. - * log is a "summary" in that it contains all objects in the PG. - */ - class Log { - public: - /** top, bottom - * top - newest entry (update|delete) - * bottom - entry previous to oldest (update|delete) for which we have - * complete negative information. - * i.e. we can infer pg contents for any store whose last_update >= bottom. 
- */ - eversion_t top; // newest entry (update|delete) - eversion_t bottom; // version prior to oldest (update|delete) - - /** backlog - true if log is a complete summary of pg contents. - * updated will include all items in pg, but deleted will not include - * negative entries for items deleted prior to 'bottom'. - */ - bool backlog; - - /** Entry - * mapped from the eversion_t, so don't include that. - */ - class Entry { - public: - const static int LOST = 0; - const static int MODIFY = 1; - const static int CLONE = 2; - const static int DELETE = 3; - - int op; // write, zero, trunc, remove - object_t oid; - eversion_t version; - - osdreqid_t reqid; // caller+tid to uniquely identify request - - Entry() : op(0) {} - Entry(int _op, object_t _oid, const eversion_t& v, - const osdreqid_t& rid) : - op(_op), oid(_oid), version(v), reqid(rid) {} - - bool is_delete() const { return op == DELETE; } - bool is_clone() const { return op == CLONE; } - bool is_modify() const { return op == MODIFY; } - bool is_update() const { return is_clone() || is_modify(); } - }; - - list log; // the actual log. 
- - Log() : backlog(false) {} - - void clear() { - eversion_t z; - top = bottom = z; - backlog = false; - log.clear(); - } - bool empty() const { - return top.version == 0 && top.epoch == 0; - } - - void _encode(bufferlist& blist) const { - blist.append((char*)&top, sizeof(top)); - blist.append((char*)&bottom, sizeof(bottom)); - blist.append((char*)&backlog, sizeof(backlog)); - ::_encode(log, blist); - } - void _decode(bufferlist& blist, int& off) { - blist.copy(off, sizeof(top), (char*)&top); - off += sizeof(top); - blist.copy(off, sizeof(bottom), (char*)&bottom); - off += sizeof(bottom); - blist.copy(off, sizeof(backlog), (char*)&backlog); - off += sizeof(backlog); - - ::_decode(log, blist, off); - } - - void copy_after(const Log &other, eversion_t v); - bool copy_after_unless_divergent(const Log &other, eversion_t split, eversion_t floor); - void copy_non_backlog(const Log &other); - ostream& print(ostream& out) const; - }; - - /** - * IndexLog - adds in-memory index of the log, by oid. - * plus some methods to manipulate it all. - */ - class IndexedLog : public Log { - public: - hash_map objects; // ptrs into log. be careful! 
- hash_set caller_ops; - - // recovery pointers - list::iterator requested_to; // not inclusive of referenced item - list::iterator complete_to; // not inclusive of referenced item - - /****/ - IndexedLog() {} - - void clear() { - assert(0); - unindex(); - Log::clear(); - } - - bool logged_object(object_t oid) { - return objects.count(oid); - } - bool logged_req(const osdreqid_t &r) { - return caller_ops.count(r); - } - - void index() { - objects.clear(); - caller_ops.clear(); - for (list::iterator i = log.begin(); - i != log.end(); - i++) { - objects[i->oid] = &(*i); - caller_ops.insert(i->reqid); - } - } - - void index(Entry& e) { - if (objects.count(e.oid) == 0 || - objects[e.oid]->version < e.version) - objects[e.oid] = &e; - caller_ops.insert(e.reqid); - } - void unindex() { - objects.clear(); - caller_ops.clear(); - } - void unindex(Entry& e) { - // NOTE: this only works if we remove from the _bottom_ of the log! - assert(objects.count(e.oid)); - if (objects[e.oid]->version == e.version) - objects.erase(e.oid); - caller_ops.erase(e.reqid); - } - - - // accessors - Entry *is_updated(object_t oid) { - if (objects.count(oid) && objects[oid]->is_update()) return objects[oid]; - return 0; - } - Entry *is_deleted(object_t oid) { - if (objects.count(oid) && objects[oid]->is_delete()) return objects[oid]; - return 0; - } - - // actors - void add(Entry& e) { - // add to log - log.push_back(e); - assert(e.version > top); - assert(top.version == 0 || e.version.version > top.version); - top = e.version; - - // to our index - objects[e.oid] = &(log.back()); - caller_ops.insert(e.reqid); - } - - void trim(ObjectStore::Transaction &t, eversion_t s); - void trim_write_ahead(eversion_t last_update); - }; - - - /** - * OndiskLog - some info about how we store the log on disk. - */ - class OndiskLog { - public: - // ok - off_t bottom; // first byte of log. - off_t top; // byte following end of log. 
- map block_map; // block -> first stamp logged there - - OndiskLog() : bottom(0), top(0) {} - - bool trim_to(eversion_t v, ObjectStore::Transaction& t); - }; - - - /*** PG ****/ -public: - // any - static const int STATE_ACTIVE = 1; // i am active. (primary: replicas too) - - // primary - static const int STATE_CLEAN = 2; // peers are complete, clean of stray replicas. - static const int STATE_CRASHED = 4; // all replicas went down. - static const int STATE_REPLAY = 8; // crashed, waiting for replay - - // non-primary - static const int STATE_STRAY = 16; // i must notify the primary i exist. - - static std::string get_state_string(int state) { - std::string st; - if (state & STATE_ACTIVE) st += "active+"; - if (state & STATE_CLEAN) st += "clean+"; - if (state & STATE_CRASHED) st += "crashed+"; - if (state & STATE_REPLAY) st += "replay+"; - if (state & STATE_STRAY) st += "stray+"; - if (!st.length()) - st = "inactive"; - else - st.resize(st.length()-1); - return st; - } - -protected: - OSD *osd; - - /** locking and reference counting. - * I destroy myself when the reference count hits zero. - * lock() should be called before doing anything. - * get() should be called on pointer copy (to another thread, etc.). - * put() should be called on destruction of some previously copied pointer. - * put_unlock() when done with the current pointer (_most common_). - */ - Mutex _lock; - int ref; - bool deleted; - -public: - void lock() { - //cout << this << " " << info.pgid << " lock" << endl; - _lock.Lock(); - } - void unlock() { - //cout << this << " " << info.pgid << " unlock" << endl; - _lock.Unlock(); - } - void get() { - //cout << this << " " << info.pgid << " get " << ref << endl; - assert(_lock.is_locked()); - ++ref; - } - void put() { - //cout << this << " " << info.pgid << " put " << ref << endl; - assert(_lock.is_locked()); - --ref; - assert(ref > 0); // last put must be a put_unlock. 
- } - void put_unlock() { - //cout << this << " " << info.pgid << " put_unlock " << ref << endl; - assert(_lock.is_locked()); - --ref; - _lock.Unlock(); - if (ref == 0) delete this; - } - - - list op_queue; // op queue - - - void mark_deleted() { deleted = true; } - bool is_deleted() { return deleted; } - -public: - // pg state - Info info; - IndexedLog log; - OndiskLog ondisklog; - Missing missing; - utime_t last_heartbeat; // - -protected: - int role; // 0 = primary, 1 = replica, -1=none. - int state; // see bit defns above - - // primary state - public: - vector acting; - epoch_t last_epoch_started_any; - eversion_t last_complete_commit; - - // [primary only] content recovery state - eversion_t peers_complete_thru; - bool have_master_log; - protected: - set prior_set; // current+prior OSDs, as defined by last_epoch_started_any. - set stray_set; // non-acting osds that have PG data. - set uptodate_set; // current OSDs that are uptodate - eversion_t oldest_update; // lowest (valid) last_update in active set - map peer_info; // info from peers (stray or prior) - set peer_info_requested; - map peer_missing; - set peer_log_requested; // logs i've requested (and start stamps) - set peer_summary_requested; - friend class OSD; - - - // pg waiters - list waiting_for_active; - hash_map > waiting_for_missing_object; - map replay_queue; - - hash_map > waiting_for_wr_unlock; - - bool block_if_wrlocked(MOSDOp* op); - - - // recovery - map objects_pulling; // which objects are currently being pulled - - - - // stats - off_t stat_size; - off_t stat_num_blocks; - - hash_map stat_object_temp_rd; - - Mutex pg_stats_lock; - pg_stat_t pg_stats; - - void update_stats(); - -public: - void clear_primary_state(); - - public: - bool is_acting(int osd) const { - for (unsigned i=0; i peers_complete_thru) { - peers_complete_thru = t; - return true; - } - return false; - } - - void proc_replica_log(Log &olog, Missing& omissing, int from); - void merge_log(Log &olog, Missing& omissing, int 
from); - void proc_missing(Log &olog, Missing &omissing, int fromosd); - - void generate_backlog(); - void drop_backlog(); - - void trim_write_ahead(); - - void peer(ObjectStore::Transaction& t, - map< int, map >& query_map, - map *activator_map=0); - void activate(ObjectStore::Transaction& t, - map *activator_map=0); - - virtual void clean_up_local(ObjectStore::Transaction& t) = 0; - - virtual void cancel_recovery() = 0; - virtual bool do_recovery() = 0; - virtual void purge_strays() = 0; - - void finish_recovery(); - - off_t get_log_write_pos() { - return 0; - } - - friend class C_OSD_RepModify_Commit; - - public: - PG(OSD *o, pg_t p) : - osd(o), - ref(0), deleted(false), - info(p), - role(0), - state(0), - last_epoch_started_any(0), - last_complete_commit(0), - peers_complete_thru(0), - have_master_log(true), - stat_size(0), stat_num_blocks(0) - { } - virtual ~PG() { } - - pg_t get_pgid() const { return info.pgid; } - int get_nrep() const { return acting.size(); } - - int get_primary() { return acting.empty() ? -1:acting[0]; } - //int get_tail() { return acting.empty() ? -1:acting[ acting.size()-1 ]; } - //int get_acker() { return g_conf.osd_rep == OSD_REP_PRIMARY ? 
get_primary():get_tail(); } - int get_acker() { - if (g_conf.osd_rep == OSD_REP_PRIMARY || - acting.size() <= 1) - return get_primary(); - return acting[1]; - } - - int get_role() const { return role; } - void set_role(int r) { role = r; } - - bool is_primary() const { return role == PG_ROLE_HEAD; } - bool is_acker() const { - if (g_conf.osd_rep == OSD_REP_PRIMARY) - return is_primary(); - else - return role == PG_ROLE_ACKER; - } - bool is_head() const { return role == PG_ROLE_HEAD; } - bool is_middle() const { return role == PG_ROLE_MIDDLE; } - bool is_residual() const { return role == PG_ROLE_STRAY; } - - //int get_state() const { return state; } - bool state_test(int m) const { return (state & m) != 0; } - void state_set(int m) { state |= m; } - void state_clear(int m) { state &= ~m; } - - bool is_complete() const { return info.last_complete == info.last_update; } - - bool is_active() const { return state_test(STATE_ACTIVE); } - bool is_crashed() const { return state_test(STATE_CRASHED); } - bool is_replay() const { return state_test(STATE_REPLAY); } - //bool is_complete() { return state_test(STATE_COMPLETE); } - bool is_clean() const { return state_test(STATE_CLEAN); } - bool is_stray() const { return state_test(STATE_STRAY); } - - bool is_empty() const { return info.last_update == 0; } - - int num_active_ops() const { - return objects_pulling.size(); - } - - // pg on-disk state - void write_log(ObjectStore::Transaction& t); - void append_log(ObjectStore::Transaction& t, - PG::Log::Entry& logentry, - eversion_t trim_to); - void read_log(ObjectStore *store); - void trim_ondisklog_to(ObjectStore::Transaction& t, eversion_t v); - - - bool is_dup(osdreqid_t rid) { - return log.logged_req(rid); - } - - - bool pick_missing_object_rev(object_t& oid); - bool pick_object_rev(object_t& oid); - - - - // abstract bits - virtual bool preprocess_op(MOSDOp *op, utime_t now) { return false; } - virtual void do_op(MOSDOp *op) = 0; - virtual void do_op_reply(MOSDOpReply *op) = 
0; - - virtual bool same_for_read_since(epoch_t e) = 0; - virtual bool same_for_modify_since(epoch_t e) = 0; - virtual bool same_for_rep_modify_since(epoch_t e) = 0; - - virtual bool is_missing_object(object_t oid) = 0; - virtual void wait_for_missing_object(object_t oid, MOSDOp *op) = 0; - - virtual void note_failed_osd(int osd) = 0; - - virtual void on_acker_change() = 0; - virtual void on_role_change() = 0; -}; - - - -inline ostream& operator<<(ostream& out, const PG::Info::History& h) -{ - return out << h.same_since << "/" << h.same_primary_since << "/" << h.same_acker_since; -} - -inline ostream& operator<<(ostream& out, const PG::Info& pgi) -{ - out << pgi.pgid << "("; - if (pgi.is_empty()) - out << " empty"; - else - out << " v " << pgi.last_update << "/" << pgi.last_complete - << " (" << pgi.log_bottom << "," << pgi.last_update << "]" - << (pgi.log_backlog ? "+backlog":""); - out << " e " << pgi.last_epoch_started << "/" << pgi.last_epoch_finished - << " " << pgi.history - << ")"; - return out; -} - -inline ostream& operator<<(ostream& out, const PG::Log::Entry& e) -{ - return out << " " << e.version - << (e.is_delete() ? " - ": - (e.is_clone() ? " c ": - (e.is_modify() ? " m ": - " ? "))) - << e.oid << " by " << e.reqid; -} - -inline ostream& operator<<(ostream& out, const PG::Log& log) -{ - out << "log(" << log.bottom << "," << log.top << "]"; - if (log.backlog) out << "+backlog"; - return out; -} - -inline ostream& operator<<(ostream& out, const PG::Missing& missing) -{ - out << "missing(" << missing.num_missing(); - if (missing.num_lost()) out << ", " << missing.num_lost() << " lost"; - out << ")"; - return out; -} - -inline ostream& operator<<(ostream& out, const PG& pg) -{ - out << "pg[" << pg.info - << " r=" << pg.get_role(); - - if (pg.log.bottom != pg.info.log_bottom) - out << " (info mismatch, " << pg.log << ")"; - - if (pg.log.log.empty()) { - // shoudl it be? 
- if (pg.log.top.version - pg.log.bottom.version != 0) { - out << " (log bound mismatch, empty)"; - } - } else { - if (((pg.log.log.begin()->version.version - 1 != pg.log.bottom.version) && - !pg.log.backlog) || - (pg.log.log.rbegin()->version.version != pg.log.top.version)) { - out << " (log bound mismatch, actual=[" - << pg.log.log.begin()->version << "," - << pg.log.log.rbegin()->version << "] len=" << pg.log.log.size() << ")"; - } - } - - if (pg.get_role() == 0) out << " pct " << pg.peers_complete_thru; - if (!pg.have_master_log) out << " !hml"; - if (pg.is_active()) out << " active"; - if (pg.is_crashed()) out << " crashed"; - if (pg.is_replay()) out << " replay"; - if (pg.is_clean()) out << " clean"; - if (pg.is_stray()) out << " stray"; - //out << " (" << pg.log.bottom << "," << pg.log.top << "]"; - if (pg.missing.num_missing()) out << " m=" << pg.missing.num_missing(); - if (pg.missing.num_lost()) out << " l=" << pg.missing.num_lost(); - out << "]"; - - - return out; -} - - - -#endif diff --git a/branches/sage/crush/osd/RAID4PG.cc b/branches/sage/crush/osd/RAID4PG.cc deleted file mode 100644 index 20cd6d8ab416b..0000000000000 --- a/branches/sage/crush/osd/RAID4PG.cc +++ /dev/null @@ -1,123 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "RAID4PG.h" -#include "OSD.h" - -#include "common/Logger.h" - -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" - -#include "messages/MOSDPGNotify.h" -#include "messages/MOSDPGRemove.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) *_dout << dbeginl << g_clock.now() << " osd" << osd->get_nodeid() << " " << (osd->osdmap ? osd->osdmap->get_epoch():0) << " " << *this << " " - -#include -#include - - - - -void RAID4PG::do_op(MOSDOp *op) -{ - - -} - - - -void RAID4PG::do_op_reply(MOSDOpReply *reply) -{ - -} - - - -// ----------------- -// pg changes - -bool RAID4PG::same_for_read_since(epoch_t e) -{ - return e >= info.history.same_since; // whole pg set same -} - -bool RAID4PG::same_for_modify_since(epoch_t e) -{ - return e >= info.history.same_since; // whole pg set same -} - -bool RAID4PG::same_for_rep_modify_since(epoch_t e) -{ - return e >= info.history.same_since; // whole pg set same -} - - -// ----------------- -// RECOVERY - -bool RAID4PG::is_missing_object(object_t oid) -{ - return false; -} - -void RAID4PG::wait_for_missing_object(object_t oid, MOSDOp *op) -{ - //assert(0); -} - -void RAID4PG::note_failed_osd(int o) -{ - dout(10) << "note_failed_osd osd" << o << dendl; - //assert(0); -} - -void RAID4PG::on_acker_change() -{ - dout(10) << "on_acker_change" << dendl; - //assert(0); -} - - -void RAID4PG::on_role_change() -{ - dout(10) << "on_role_change" << dendl; - //assert(0); -} - - -void RAID4PG::clean_up_local(ObjectStore::Transaction&) -{ -} - -void RAID4PG::cancel_recovery() -{ - //assert(0); -} - -bool RAID4PG::do_recovery() -{ - //assert(0); - return false; -} - -void RAID4PG::purge_strays() -{ - //assert(0); -} - - - diff --git a/branches/sage/crush/osd/RAID4PG.h b/branches/sage/crush/osd/RAID4PG.h deleted file mode 100644 index 98e4deab56895..0000000000000 --- a/branches/sage/crush/osd/RAID4PG.h +++ /dev/null @@ -1,74 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; 
indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __RAID4PG_H -#define __RAID4PG_H - - -#include "PG.h" - -#include "messages/MOSDOp.h" - - -class RAID4PG : public PG { -public: - -protected: - - void prepare_log_transaction(ObjectStore::Transaction& t, - MOSDOp *op, eversion_t& version, - objectrev_t crev, objectrev_t rev, - eversion_t trim_to); - void prepare_op_transaction(ObjectStore::Transaction& t, - MOSDOp *op, eversion_t& version, - objectrev_t crev, objectrev_t rev); - - void op_stat(MOSDOp *op); - int op_read(MOSDOp *op); - void op_modify(MOSDOp *op); - void op_rep_modify(MOSDOp *op); - void op_push(MOSDOp *op); - void op_pull(MOSDOp *op); - - - -public: - RAID4PG(OSD *o, pg_t p) : PG(o,p) { } - - void do_op(MOSDOp *op); - void do_op_reply(MOSDOpReply *r); - - bool same_for_read_since(epoch_t e); - bool same_for_modify_since(epoch_t e); - bool same_for_rep_modify_since(epoch_t e); - - bool is_missing_object(object_t oid); - void wait_for_missing_object(object_t oid, MOSDOp *op); - - void note_failed_osd(int osd); - - void on_acker_change(); - void on_role_change(); - - void clean_up_local(ObjectStore::Transaction& t); - - void cancel_recovery(); - bool do_recovery(); - - void purge_strays(); - - -}; - - -#endif diff --git a/branches/sage/crush/osd/ReplicatedPG.cc b/branches/sage/crush/osd/ReplicatedPG.cc deleted file mode 100644 index 7b5bdf581d643..0000000000000 --- a/branches/sage/crush/osd/ReplicatedPG.cc +++ /dev/null @@ -1,1972 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or 
- * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "ReplicatedPG.h" -#include "OSD.h" - -#include "common/Logger.h" - -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" - -#include "messages/MOSDPGNotify.h" -#include "messages/MOSDPGRemove.h" - -#include "messages/MOSDPing.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) *_dout << dbeginl << g_clock.now() << " osd" << osd->get_nodeid() << " " << (osd->osdmap ? osd->osdmap->get_epoch():0) << " " << *this << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) *_derr << dbeginl << g_clock.now() << " osd" << osd->get_nodeid() << " " << (osd->osdmap ? osd->osdmap->get_epoch():0) << " " << *this << " " - -#include -#include - -static const int LOAD_LATENCY = 1; -static const int LOAD_QUEUE_SIZE = 2; -static const int LOAD_HYBRID = 3; - - -// ======================= -// pg changes - -bool ReplicatedPG::same_for_read_since(epoch_t e) -{ - return (e >= info.history.same_acker_since); -} - -bool ReplicatedPG::same_for_modify_since(epoch_t e) -{ - return (e >= info.history.same_primary_since); -} - -bool ReplicatedPG::same_for_rep_modify_since(epoch_t e) -{ - // check osd map: same set, or primary+acker? - - if (g_conf.osd_rep == OSD_REP_CHAIN) { - return e >= info.history.same_since; // whole pg set same - } else { - // primary, splay - return (e >= info.history.same_primary_since && - e >= info.history.same_acker_since); - } -} - -// ==================== -// missing objects - -bool ReplicatedPG::is_missing_object(object_t oid) -{ - return missing.missing.count(oid); -} - - -void ReplicatedPG::wait_for_missing_object(object_t oid, MOSDOp *op) -{ - assert(is_missing_object(oid)); - - // we don't have it (yet). 
- eversion_t v = missing.missing[oid]; - if (objects_pulling.count(oid)) { - dout(7) << "missing " - << oid - << " v " << v - << ", already pulling" - << dendl; - } else { - dout(7) << "missing " - << oid - << " v " << v - << ", pulling" - << dendl; - pull(oid); - } - waiting_for_missing_object[oid].push_back(op); -} - - - - -/** preprocess_op - preprocess an op (before it gets queued). - * fasttrack read - */ -bool ReplicatedPG::preprocess_op(MOSDOp *op, utime_t now) -{ - // we only care about reads here on out.. - if (!op->is_read()) - return false; - - object_t oid = op->get_oid(); - - // -- load balance reads -- - if (is_primary() && - g_conf.osd_rep == OSD_REP_PRIMARY) { - // -- read on primary+acker --- - - // test - if (false) { - if (acting.size() > 1) { - int peer = acting[1]; - dout(-10) << "preprocess_op fwd client read op to osd" << peer << " for " << op->get_client() << " " << op->get_client_inst() << dendl; - osd->messenger->send_message(op, osd->osdmap->get_inst(peer)); - return true; - } - } - - // -- balance reads? - if (g_conf.osd_balance_reads && - !op->get_source().is_osd()) { - // flash crowd? - bool is_flash_crowd_candidate = false; - if (g_conf.osd_flash_crowd_iat_threshold > 0) { - osd->iat_averager.add_sample( oid, (double)g_clock.now() ); - is_flash_crowd_candidate = osd->iat_averager.is_flash_crowd_candidate( oid ); - } - - // hot? - double temp = 0; - if (stat_object_temp_rd.count(oid)) - temp = stat_object_temp_rd[oid].get(op->get_recv_stamp()); - bool is_hotly_read = temp > g_conf.osd_balance_reads_temp; - - dout(20) << "balance_reads oid " << oid << " temp " << temp - << (is_hotly_read ? " hotly_read":"") - << (is_flash_crowd_candidate ? " flash_crowd_candidate":"") - << dendl; - - bool should_balance = is_flash_crowd_candidate || is_hotly_read; - bool is_balanced = false; - bool b; - // *** FIXME *** this may block, and we're in the fast path! 
*** - if (osd->store->getattr(oid, "balance-reads", &b, 1) >= 0) - is_balanced = true; - - if (!is_balanced && should_balance && - balancing_reads.count(oid) == 0) { - dout(-10) << "preprocess_op balance-reads on " << oid << dendl; - balancing_reads.insert(oid); - MOSDOp *pop = new MOSDOp(osd->messenger->get_myinst(), 0, osd->get_tid(), - oid, - ObjectLayout(info.pgid), - osd->osdmap->get_epoch(), - OSD_OP_BALANCEREADS); - do_op(pop); - } - if (is_balanced && !should_balance && - !unbalancing_reads.count(oid) == 0) { - dout(-10) << "preprocess_op unbalance-reads on " << oid << dendl; - unbalancing_reads.insert(oid); - MOSDOp *pop = new MOSDOp(osd->messenger->get_myinst(), 0, osd->get_tid(), - oid, - ObjectLayout(info.pgid), - osd->osdmap->get_epoch(), - OSD_OP_UNBALANCEREADS); - do_op(pop); - } - } - - // -- read shedding - if (g_conf.osd_shed_reads && - g_conf.osd_stat_refresh_interval > 0 && - !op->get_source().is_osd()) { // no re-shedding! - Mutex::Locker lock(osd->peer_stat_lock); - - osd->_refresh_my_stat(now); - - // check my load. - // TODO xxx we must also compare with our own load - // if i am x percentage higher than replica , - // redirect the read - - int shedto = -1; - double bestscore = 0.0; // highest positive score wins - - // we calculate score values such that we can interpret them as a probability. - - switch (g_conf.osd_shed_reads) { - case LOAD_LATENCY: - // above some minimum? - if (osd->my_stat.read_latency >= g_conf.osd_shed_reads_min_latency) { - for (unsigned i=1; ipeer_stat.count(peer) == 0) continue; - - // assume a read_latency of 0 (technically, undefined) is OK, since - // we'll be corrected soon enough if we're wrong. 
- - double plat = osd->peer_stat[peer].read_latency_mine; - - double diff = osd->my_stat.read_latency - plat; - if (diff < g_conf.osd_shed_reads_min_latency_diff) continue; - - double c = .002; // add in a constant to smooth it a bit - double latratio = - (c+osd->my_stat.read_latency) / - (c+plat); - double p = (latratio - 1.0) / 2.0 / latratio; - dout(15) << "preprocess_op " << op->get_reqid() - << " my read latency " << osd->my_stat.read_latency - << ", peer osd" << peer << " is " << plat << " (" << osd->peer_stat[peer].read_latency << ")" - << ", latratio " << latratio - << ", p=" << p - << dendl; - if (latratio > g_conf.osd_shed_reads_min_latency_ratio && - p > bestscore && - drand48() < p) { - shedto = peer; - bestscore = p; - } - } - } - break; - - case LOAD_HYBRID: - // dumb mostly - if (osd->my_stat.read_latency >= g_conf.osd_shed_reads_min_latency) { - for (unsigned i=1; ipeer_stat.count(peer) == 0/* || - osd->peer_stat[peer].read_latency <= 0*/) continue; - - if (osd->peer_stat[peer].qlen < osd->my_stat.qlen) { - - if (osd->my_stat.read_latency - osd->peer_stat[peer].read_latency > - g_conf.osd_shed_reads_min_latency_diff) continue; - - double qratio = osd->pending_ops / osd->peer_stat[peer].qlen; - - double c = .002; // add in a constant to smooth it a bit - double latratio = - (c+osd->my_stat.read_latency)/ - (c+osd->peer_stat[peer].read_latency); - double p = (latratio - 1.0) / 2.0 / latratio; - - dout(-15) << "preprocess_op " << op->get_reqid() - << " my qlen / rdlat " - << osd->pending_ops << " " << osd->my_stat.read_latency - << ", peer osd" << peer << " is " - << osd->peer_stat[peer].qlen << " " << osd->peer_stat[peer].read_latency - << ", qratio " << qratio - << ", latratio " << latratio - << ", p=" << p - << dendl; - if (latratio > g_conf.osd_shed_reads_min_latency_ratio && - p > bestscore && - drand48() < p) { - shedto = peer; - bestscore = p; - } - } - } - } - break; - - /* - case LOAD_QUEUE_SIZE: - // am i above my average? 
-- dumb - if (osd->pending_ops > osd->my_stat.qlen) { - // yes. is there a peer who is below my average? - for (unsigned i=1; ipeer_stat.count(peer) == 0) continue; - if (osd->peer_stat[peer].qlen < osd->my_stat.qlen) { - // calculate a probability that we should redirect - float p = (osd->my_stat.qlen - osd->peer_stat[peer].qlen) / osd->my_stat.qlen; // this is dumb. - float v = 1.0 - p; - - dout(10) << "my qlen " << osd->pending_ops << " > my_avg " << osd->my_stat.qlen - << ", peer osd" << peer << " has qlen " << osd->peer_stat[peer].qlen - << ", p=" << p - << ", v= "<< v - << dendl; - - if (v > bestscore) { - shedto = peer; - bestscore = v; - } - } - } - } - break;*/ - - } - - // shed? - if (shedto >= 0) { - dout(10) << "preprocess_op shedding read to peer osd" << shedto - << " " << op->get_reqid() - << dendl; - op->set_peer_stat(osd->my_stat); - osd->messenger->send_message(op, osd->osdmap->get_inst(shedto)); - osd->stat_rd_ops_shed_out++; - osd->logger->inc("shdout"); - return true; - } - } - } // endif balance reads - - - // -- fastpath read? - // if this is a read and the data is in the cache, do an immediate read.. - if ( g_conf.osd_immediate_read_from_cache ) { - if (osd->store->is_cached( oid , - op->get_offset(), - op->get_length() ) == 0) { - if (!is_primary() && !op->get_source().is_osd()) { - // am i allowed? - bool v; - if (osd->store->getattr(oid, "balance-reads", &v, 1) < 0) { - dout(-10) << "preprocess_op in-cache but no balance-reads on " << oid - << ", fwd to primary" << dendl; - osd->messenger->send_message(op, osd->osdmap->get_inst(get_primary())); - return true; - } - } - - // do it now - dout(10) << "preprocess_op data is in cache, reading from cache" << *op << dendl; - do_op(op); - return true; - } - } - - return false; -} - - -/** do_op - do an op - * pg lock will be held (if multithreaded) - * osd_lock NOT held. 
- */ -void ReplicatedPG::do_op(MOSDOp *op) -{ - //dout(15) << "do_op " << *op << dendl; - - osd->logger->inc("op"); - - switch (op->get_op()) { - - // reads - case OSD_OP_READ: - case OSD_OP_STAT: - op_read(op); - break; - - // rep stuff - case OSD_OP_PULL: - op_pull(op); - break; - case OSD_OP_PUSH: - op_push(op); - break; - - // writes - case OSD_OP_WRNOOP: - case OSD_OP_WRITE: - case OSD_OP_ZERO: - case OSD_OP_DELETE: - case OSD_OP_TRUNCATE: - case OSD_OP_WRLOCK: - case OSD_OP_WRUNLOCK: - case OSD_OP_RDLOCK: - case OSD_OP_RDUNLOCK: - case OSD_OP_UPLOCK: - case OSD_OP_DNLOCK: - case OSD_OP_BALANCEREADS: - case OSD_OP_UNBALANCEREADS: - if (op->get_source().is_osd()) { - op_rep_modify(op); - } else { - // go go gadget pg - op_modify(op); - } - break; - - default: - assert(0); - } -} - -void ReplicatedPG::do_op_reply(MOSDOpReply *r) -{ - if (r->get_op() == OSD_OP_PUSH) { - // continue peer recovery - op_push_reply(r); - } else { - // must be replication. - tid_t rep_tid = r->get_rep_tid(); - int fromosd = r->get_source().num(); - - osd->take_peer_stat(fromosd, r->get_peer_stat()); - - if (rep_gather.count(rep_tid)) { - // oh, good. - repop_ack(rep_gather[rep_tid], - r->get_result(), r->get_commit(), - fromosd, - r->get_pg_complete_thru()); - delete r; - } else { - // early ack. - waiting_for_repop[rep_tid].push_back(r); - } - } -} - - - - -// ======================================================================== -// READS - -void ReplicatedPG::op_read(MOSDOp *op) -{ - object_t oid = op->get_oid(); - - dout(10) << "op_read " << MOSDOp::get_opname(op->get_op()) - << " " << oid - << " " << op->get_offset() << "~" << op->get_length() - << dendl; - - // wrlocked? - if (block_if_wrlocked(op)) - return; - - // !primary and unbalanced? 
- // (ignore ops forwarded from the primary) - if (!is_primary()) { - if (op->get_source().is_osd() && - op->get_source().num() == get_primary()) { - // read was shed to me by the primary - int from = op->get_source().num(); - osd->take_peer_stat(from, op->get_peer_stat()); - dout(10) << "read shed IN from " << op->get_source() - << " " << op->get_reqid() - << ", me = " << osd->my_stat.read_latency_mine - << ", them = " << op->get_peer_stat().read_latency - << (osd->my_stat.read_latency_mine > op->get_peer_stat().read_latency ? " WTF":"") - << dendl; - osd->logger->inc("shdin"); - - // does it look like they were wrong to do so? - Mutex::Locker lock(osd->peer_stat_lock); - if (osd->my_stat.read_latency_mine > op->get_peer_stat().read_latency && - osd->my_stat_on_peer[from].read_latency_mine < op->get_peer_stat().read_latency) { - dout(-10) << "read shed IN from " << op->get_source() - << " " << op->get_reqid() - << " and me " << osd->my_stat.read_latency_mine - << " > them " << op->get_peer_stat().read_latency - << ", but they didn't know better, sharing" << dendl; - osd->my_stat_on_peer[from] = osd->my_stat; - osd->messenger->send_message(new MOSDPing(osd->osdmap->get_epoch(), osd->my_stat), - osd->osdmap->get_inst(from)); - } - } else { - // make sure i exist and am balanced, otherwise fw back to acker. - bool b; - if (!osd->store->exists(oid) || - osd->store->getattr(oid, "balance-reads", &b, 1) < 0) { - dout(-10) << "read on replica, object " << oid - << " dne or no balance-reads, fw back to primary" << dendl; - osd->messenger->send_message(op, osd->osdmap->get_inst(get_acker())); - return; - } - } - } - - - // set up reply - MOSDOpReply *reply = new MOSDOpReply(op, 0, osd->osdmap->get_epoch(), true); - long r = 0; - - // do it. - if (oid.rev && !pick_object_rev(oid)) { - // we have no revision for this request. 
- r = -EEXIST; - } else { - switch (op->get_op()) { - case OSD_OP_READ: - { - // read into a buffer - bufferlist bl; - r = osd->store->read(oid, - op->get_offset(), op->get_length(), - bl); - reply->set_data(bl); - reply->set_length(r); - dout(15) << " read got " << r << " / " << op->get_length() << " bytes from obj " << oid << dendl; - } - osd->logger->inc("c_rd"); - osd->logger->inc("c_rdb", op->get_length()); - break; - - case OSD_OP_STAT: - { - struct stat st; - memset(&st, sizeof(st), 0); - r = osd->store->stat(oid, &st); - if (r >= 0) - reply->set_object_size(st.st_size); - } - break; - - default: - assert(0); - } - } - - if (r >= 0) { - reply->set_result(0); - - utime_t now = g_clock.now(); - utime_t diff = now; - diff -= op->get_recv_stamp(); - dout(10) << "op_read " << op->get_reqid() << " total op latency " << diff << dendl; - Mutex::Locker lock(osd->peer_stat_lock); - osd->stat_rd_ops_in_queue--; - osd->read_latency_calc.add(diff); - - if (is_primary() && - g_conf.osd_balance_reads) - stat_object_temp_rd[oid].hit(now); // hit temp. - - } else { - reply->set_result(r); // error - } - - // send it - osd->messenger->send_message(reply, op->get_client_inst()); - - delete op; -} - - - - - - -// ======================================================================== -// MODIFY - -void ReplicatedPG::prepare_log_transaction(ObjectStore::Transaction& t, - MOSDOp *op, eversion_t& version, - objectrev_t crev, objectrev_t rev, - eversion_t trim_to) -{ - const object_t oid = op->get_oid(); - - // clone entry? 
- if (crev && rev && rev > crev) { - eversion_t cv = version; - cv.version--; - Log::Entry cloneentry(PG::Log::Entry::CLONE, oid, cv, op->get_reqid()); - log.add(cloneentry); - - dout(10) << "prepare_log_transaction " << op->get_op() - << " " << cloneentry - << dendl; - } - - // actual op - int opcode = Log::Entry::MODIFY; - if (op->get_op() == OSD_OP_DELETE) opcode = Log::Entry::DELETE; - Log::Entry logentry(opcode, oid, version, op->get_reqid()); - - dout(10) << "prepare_log_transaction " << op->get_op() - << " " << logentry - << dendl; - - // append to log - assert(version > log.top); - log.add(logentry); - assert(log.top == version); - dout(10) << "prepare_log_transaction appended" << dendl; - - // write to pg log on disk - append_log(t, logentry, trim_to); -} - - -/** prepare_op_transaction - * apply an op to the store wrapped in a transaction. - */ -void ReplicatedPG::prepare_op_transaction(ObjectStore::Transaction& t, - MOSDOp *op, eversion_t& version, - objectrev_t crev, objectrev_t rev) -{ - const object_t oid = op->get_oid(); - const pg_t pgid = op->get_pg(); - - bool did_clone = false; - - dout(10) << "prepare_op_transaction " << MOSDOp::get_opname( op->get_op() ) - << " " << oid - << " v " << version - << " crev " << crev - << " rev " << rev - << dendl; - - // WRNOOP does nothing. - if (op->get_op() == OSD_OP_WRNOOP) - return; - - // raise last_complete? - if (info.last_complete == info.last_update) - info.last_complete = version; - - // raise last_update. - assert(version > info.last_update); - info.last_update = version; - - // write pg info - t.collection_setattr(pgid, "info", &info, sizeof(info)); - - // clone? 
- if (crev && rev && rev > crev) { - object_t noid = oid; - noid.rev = rev; - dout(10) << "prepare_op_transaction cloning " << oid << " crev " << crev << " to " << noid << dendl; - t.clone(oid, noid); - did_clone = true; - } - - // apply the op - switch (op->get_op()) { - - // -- locking -- - - case OSD_OP_WRLOCK: - { // lock object - t.setattr(oid, "wrlock", &op->get_client(), sizeof(entity_name_t)); - } - break; - case OSD_OP_WRUNLOCK: - { // unlock objects - t.rmattr(oid, "wrlock"); - } - break; - - case OSD_OP_MININCLOCK: - { - uint32_t mininc = op->get_length(); - t.setattr(oid, "mininclock", &mininc, sizeof(mininc)); - } - break; - - case OSD_OP_BALANCEREADS: - { - bool bal = true; - t.setattr(oid, "balance-reads", &bal, sizeof(bal)); - } - break; - case OSD_OP_UNBALANCEREADS: - { - t.rmattr(oid, "balance-reads"); - } - break; - - - // -- modify -- - - case OSD_OP_WRITE: - { // write - assert(op->get_data().length() == op->get_length()); - bufferlist bl; - bl.claim( op->get_data() ); // give buffers to store; we keep *op in memory for a long time! - - //if (oid < 100000000000000ULL) // hack hack-- don't write client data - t.write( oid, op->get_offset(), op->get_length(), bl ); - } - break; - - case OSD_OP_ZERO: - { - // zero, remove, or truncate? - struct stat st; - int r = osd->store->stat(oid, &st); - if (r >= 0) { - if (op->get_length() == 0 || - op->get_offset() + (off_t)op->get_length() >= (off_t)st.st_size) { - if (op->get_offset()) - t.truncate(oid, op->get_length() + op->get_offset()); - else - t.remove(oid); - } else { - // zero. the dumb way. FIXME. - bufferptr bp(op->get_length()); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - t.write(oid, op->get_offset(), op->get_length(), bl); - } - } else { - // noop? - dout(10) << "apply_transaction zero on " << oid << ", but dne? 
stat returns " << r << dendl; - } - } - break; - - case OSD_OP_TRUNCATE: - { // truncate - t.truncate(oid, op->get_length() ); - } - break; - - case OSD_OP_DELETE: - { // delete - t.remove(oid); - } - break; - - default: - assert(0); - } - - // object collection, version - if (op->get_op() == OSD_OP_DELETE) { - // remove object from c - t.collection_remove(pgid, oid); - } else { - // add object to c - t.collection_add(pgid, oid); - - // object version - t.setattr(oid, "version", &version, sizeof(version)); - - // set object crev - if (crev == 0 || // new object - did_clone) // we cloned - t.setattr(oid, "crev", &rev, sizeof(rev)); - } -} - - - -// ======================================================================== -// rep op gather - -class C_OSD_ModifyCommit : public Context { -public: - ReplicatedPG *pg; - tid_t rep_tid; - eversion_t pg_last_complete; - C_OSD_ModifyCommit(ReplicatedPG *p, tid_t rt, eversion_t lc) : pg(p), rep_tid(rt), pg_last_complete(lc) { - pg->get(); // we're copying the pointer - } - void finish(int r) { - pg->lock(); - if (!pg->is_deleted()) - pg->op_modify_commit(rep_tid, pg_last_complete); - pg->put_unlock(); - } -}; - - -void ReplicatedPG::get_rep_gather(RepGather *repop) -{ - //repop->lock.Lock(); - dout(10) << "get_repop " << *repop << dendl; -} - -void ReplicatedPG::apply_repop(RepGather *repop) -{ - dout(10) << "apply_repop applying update on " << *repop << dendl; - assert(!repop->applied); - - Context *oncommit = new C_OSD_ModifyCommit(this, repop->rep_tid, repop->pg_local_last_complete); - unsigned r = osd->store->apply_transaction(repop->t, oncommit); - if (r) - dout(-10) << "apply_repop apply transaction return " << r << " on " << *repop << dendl; - - // discard my reference to the buffer - repop->op->get_data().clear(); - - repop->applied = true; - - - // any completion stuff to do here? 
- object_t oid = repop->op->get_oid(); - - switch (repop->op->get_op()) { - case OSD_OP_UNBALANCEREADS: - dout(-10) << "apply_repop completed unbalance-reads on " << oid << dendl; - unbalancing_reads.erase(oid); - if (waiting_for_unbalanced_reads.count(oid)) { - osd->take_waiters(waiting_for_unbalanced_reads[oid]); - waiting_for_unbalanced_reads.erase(oid); - } - break; - - case OSD_OP_BALANCEREADS: - dout(-10) << "apply_repop completed balance-reads on " << oid << dendl; - /* - if (waiting_for_balanced_reads.count(oid)) { - osd->take_waiters(waiting_for_balanced_reads[oid]); - waiting_for_balanced_reads.erase(oid); - } - */ - break; - - case OSD_OP_WRUNLOCK: - dout(-10) << "apply_repop completed wrunlock on " << oid << dendl; - if (waiting_for_wr_unlock.count(oid)) { - osd->take_waiters(waiting_for_wr_unlock[oid]); - waiting_for_wr_unlock.erase(oid); - } - break; - } - - -} - -void ReplicatedPG::put_rep_gather(RepGather *repop) -{ - dout(10) << "put_repop " << *repop << dendl; - - // commit? - if (repop->can_send_commit() && - repop->op->wants_commit()) { - // send commit. - if (repop->op->wants_reply()) { - MOSDOpReply *reply = new MOSDOpReply(repop->op, 0, osd->osdmap->get_epoch(), true); - dout(10) << "put_repop sending commit on " << *repop << " " << reply << dendl; - osd->messenger->send_message(reply, repop->op->get_client_inst()); - } - repop->sent_commit = true; - } - - // ack? - else if (repop->can_send_ack() && - repop->op->wants_ack()) { - // apply - apply_repop(repop); - - // send ack - if (repop->op->wants_reply()) { - MOSDOpReply *reply = new MOSDOpReply(repop->op, 0, osd->osdmap->get_epoch(), false); - dout(10) << "put_repop sending ack on " << *repop << " " << reply << dendl; - osd->messenger->send_message(reply, repop->op->get_client_inst()); - } - repop->sent_ack = true; - - utime_t now = g_clock.now(); - now -= repop->start; - osd->logger->finc("rlsum", now); - osd->logger->inc("rlnum", 1); - } - - // done. 
- if (repop->can_delete()) { - // adjust peers_complete_thru - if (!repop->pg_complete_thru.empty()) { - eversion_t min = info.last_complete; // hrm.... - for (unsigned i=0; ipg_complete_thru[acting[i]] < min) // note: if we haven't heard, it'll be zero, which is what we want. - min = repop->pg_complete_thru[acting[i]]; - } - - if (min > peers_complete_thru) { - dout(10) << "put_repop peers_complete_thru " - << peers_complete_thru << " -> " << min - << dendl; - peers_complete_thru = min; - } - } - - dout(10) << "put_repop deleting " << *repop << dendl; - - assert(rep_gather.count(repop->rep_tid)); - rep_gather.erase(repop->rep_tid); - - delete repop->op; - delete repop; - } -} - - -void ReplicatedPG::issue_repop(MOSDOp *op, int dest, utime_t now) -{ - object_t oid = op->get_oid(); - - dout(7) << " issue_repop rep_tid " << op->get_rep_tid() - << " o " << oid - << " to osd" << dest - << dendl; - - // forward the write/update/whatever - MOSDOp *wr = new MOSDOp(op->get_client_inst(), op->get_client_inc(), op->get_reqid().tid, - oid, - ObjectLayout(info.pgid), - osd->osdmap->get_epoch(), - op->get_op()); - wr->get_data() = op->get_data(); // _copy_ bufferlist - wr->set_length(op->get_length()); - wr->set_offset(op->get_offset()); - wr->set_version(op->get_version()); - - wr->set_rep_tid(op->get_rep_tid()); - wr->set_pg_trim_to(peers_complete_thru); - - wr->set_peer_stat(osd->get_my_stat_for(now, dest)); - - osd->messenger->send_message(wr, osd->osdmap->get_inst(dest)); -} - -ReplicatedPG::RepGather *ReplicatedPG::new_rep_gather(MOSDOp *op) -{ - dout(10) << "new_rep_gather rep_tid " << op->get_rep_tid() << " on " << *op << dendl; - int whoami = osd->get_nodeid(); - - RepGather *repop = new RepGather(op, op->get_rep_tid(), - op->get_version(), - info.last_complete); - - // osds. commits all come to me. - for (unsigned i=0; iosds.insert(osd); - repop->waitfor_commit.insert(osd); - } - - // acks vary: - if (g_conf.osd_rep == OSD_REP_CHAIN) { - // chain rep. 
- // there's my local ack... - repop->osds.insert(whoami); - repop->waitfor_ack.insert(whoami); - repop->waitfor_commit.insert(whoami); - - // also, the previous guy will ack to me - int myrank = osd->osdmap->calc_pg_rank(whoami, acting); - if (myrank > 0) { - int osd = acting[ myrank-1 ]; - repop->osds.insert(osd); - repop->waitfor_ack.insert(osd); - repop->waitfor_commit.insert(osd); - } - } else { - // primary, splay. all osds ack to me. - for (unsigned i=0; iwaitfor_ack.insert(osd); - } - } - - repop->start = g_clock.now(); - - rep_gather[ repop->rep_tid ] = repop; - - // anyone waiting? (acks that got here before the op did) - if (waiting_for_repop.count(repop->rep_tid)) { - osd->take_waiters(waiting_for_repop[repop->rep_tid]); - waiting_for_repop.erase(repop->rep_tid); - } - - return repop; -} - - -void ReplicatedPG::repop_ack(RepGather *repop, - int result, bool commit, - int fromosd, eversion_t pg_complete_thru) -{ - MOSDOp *op = repop->op; - - dout(7) << "repop_ack rep_tid " << repop->rep_tid << " op " << *op - << " result " << result << " commit " << commit << " from osd" << fromosd - << dendl; - - get_rep_gather(repop); - { - if (commit) { - // commit - assert(repop->waitfor_commit.count(fromosd)); - repop->waitfor_commit.erase(fromosd); - repop->waitfor_ack.erase(fromosd); - repop->pg_complete_thru[fromosd] = pg_complete_thru; - } else { - // ack - repop->waitfor_ack.erase(fromosd); - } - } - put_rep_gather(repop); -} - - - - - - - - - - - - - - - - - - - - - - - -/** op_modify_commit - * transaction commit on the acker. 
- */ -void ReplicatedPG::op_modify_commit(tid_t rep_tid, eversion_t pg_complete_thru) -{ - if (rep_gather.count(rep_tid)) { - RepGather *repop = rep_gather[rep_tid]; - - dout(10) << "op_modify_commit " << *repop->op << dendl; - get_rep_gather(repop); - { - assert(repop->waitfor_commit.count(osd->get_nodeid())); - repop->waitfor_commit.erase(osd->get_nodeid()); - repop->pg_complete_thru[osd->get_nodeid()] = pg_complete_thru; - } - put_rep_gather(repop); - dout(10) << "op_modify_commit done on " << repop << dendl; - } else { - dout(10) << "op_modify_commit rep_tid " << rep_tid << " dne" << dendl; - } -} - - - -objectrev_t ReplicatedPG::assign_version(MOSDOp *op) -{ - object_t oid = op->get_oid(); - - // check crev - objectrev_t crev = 0; - osd->store->getattr(oid, "crev", (char*)&crev, sizeof(crev)); - - // assign version - eversion_t clone_version; - eversion_t nv = log.top; - if (op->get_op() != OSD_OP_WRNOOP) { - nv.epoch = osd->osdmap->get_epoch(); - nv.version++; - assert(nv > info.last_update); - assert(nv > log.top); - - // will clone? - if (crev && op->get_rev() && op->get_rev() > crev) { - clone_version = nv; - nv.version++; - } - - if (op->get_version().version) { - // replay! - if (nv.version < op->get_version().version) { - nv.version = op->get_version().version; - - // clone? 
- if (crev && op->get_rev() && op->get_rev() > crev) { - // backstep clone - clone_version = nv; - clone_version.version--; - } - } - } - } - - // set version in op, for benefit of client and our eventual reply - op->set_version(nv); - - return crev; -} - - -// commit (to disk) callback -class C_OSD_RepModifyCommit : public Context { -public: - ReplicatedPG *pg; - MOSDOp *op; - int destosd; - - eversion_t pg_last_complete; - - Mutex lock; - Cond cond; - bool acked; - bool waiting; - - C_OSD_RepModifyCommit(ReplicatedPG *p, MOSDOp *oo, int dosd, eversion_t lc) : - pg(p), op(oo), destosd(dosd), pg_last_complete(lc), - acked(false), waiting(false) { - pg->get(); // we're copying the pointer. - } - void finish(int r) { - lock.Lock(); - assert(!waiting); - while (!acked) { - waiting = true; - cond.Wait(lock); - } - assert(acked); - lock.Unlock(); - - pg->lock(); - pg->op_rep_modify_commit(op, destosd, pg_last_complete); - pg->put_unlock(); - } - void ack() { - lock.Lock(); - assert(!acked); - acked = true; - if (waiting) cond.Signal(); - - // discard my reference to buffer - op->get_data().clear(); - - lock.Unlock(); - } -}; - - -void ReplicatedPG::op_modify(MOSDOp *op) -{ - int whoami = osd->get_nodeid(); - object_t oid = op->get_oid(); - const char *opname = MOSDOp::get_opname(op->get_op()); - - // --- locking --- - - // wrlock? - if (op->get_op() != OSD_OP_WRNOOP && // except WRNOOP; we just want to flush - block_if_wrlocked(op)) - return; // op will be handled later, after the object unlocks - - // balance-reads set? 
- char v; - if ((op->get_op() != OSD_OP_BALANCEREADS && op->get_op() != OSD_OP_UNBALANCEREADS) && - (osd->store->getattr(op->get_oid(), "balance-reads", &v, 1) >= 0 || - balancing_reads.count(op->get_oid()))) { - - if (!unbalancing_reads.count(op->get_oid())) { - // unbalance - dout(-10) << "preprocess_op unbalancing-reads on " << op->get_oid() << dendl; - unbalancing_reads.insert(op->get_oid()); - - MOSDOp *pop = new MOSDOp(osd->messenger->get_myinst(), 0, osd->get_tid(), - op->get_oid(), - ObjectLayout(info.pgid), - osd->osdmap->get_epoch(), - OSD_OP_UNBALANCEREADS); - do_op(pop); - } - - // add to wait queue - dout(-10) << "preprocess_op waiting for unbalance-reads on " << op->get_oid() << dendl; - waiting_for_unbalanced_reads[op->get_oid()].push_back(op); - return; - } - - - // dup op? - if (is_dup(op->get_reqid())) { - dout(3) << "op_modify " << opname << " dup op " << op->get_reqid() - << ", doing WRNOOP" << dendl; - op->set_op(OSD_OP_WRNOOP); - opname = MOSDOp::get_opname(op->get_op()); - } - - // assign the op a version - objectrev_t crev = assign_version(op); - eversion_t nv = op->get_version(); - - // are any peers missing this? - for (unsigned i=1; iget_rev() - << " " << op->get_offset() << "~" << op->get_length() - << dendl; - - if (op->get_op() == OSD_OP_WRITE) { - osd->logger->inc("c_wr"); - osd->logger->inc("c_wrb", op->get_length()); - } - - // note my stats - utime_t now = g_clock.now(); - - // issue replica writes - RepGather *repop = 0; - bool alone = (acting.size() == 1); - tid_t rep_tid = osd->get_tid(); - op->set_rep_tid(rep_tid); - - if (g_conf.osd_rep == OSD_REP_CHAIN && !alone) { - // chain rep. send to #2 only. - int next = acting[1]; - if (acting.size() > 2) - next = acting[2]; - issue_repop(op, next, now); - } - else if (g_conf.osd_rep == OSD_REP_SPLAY && !alone) { - // splay rep. send to rest. - for (unsigned i=1; i=1; --i) - issue_repop(op, acting[i], now); - } else { - // primary rep, or alone. 
- repop = new_rep_gather(op); - - // send to rest. - if (!alone) - for (unsigned i=1; iget_op() != OSD_OP_WRNOOP) { - // log and update later. - prepare_log_transaction(repop->t, op, nv, crev, op->get_rev(), peers_complete_thru); - prepare_op_transaction(repop->t, op, nv, crev, op->get_rev()); - } - - // (logical) local ack. - // (if alone, this will apply the update.) - get_rep_gather(repop); - { - assert(repop->waitfor_ack.count(whoami)); - repop->waitfor_ack.erase(whoami); - } - put_rep_gather(repop); - - } else { - // not acker. - // chain or splay. apply. - ObjectStore::Transaction t; - prepare_log_transaction(t, op, nv, crev, op->get_rev(), peers_complete_thru); - prepare_op_transaction(t, op, nv, crev, op->get_rev()); - - C_OSD_RepModifyCommit *oncommit = new C_OSD_RepModifyCommit(this, op, get_acker(), - info.last_complete); - unsigned r = osd->store->apply_transaction(t, oncommit); - if (r != 0 && // no errors - r != 2) { // or error on collection_add - derr(0) << "error applying transaction: r = " << r << dendl; - assert(r == 0); - } - - // lets evict the data from our cache to maintain a total large cache size - if (g_conf.osd_exclusive_caching) - osd->store->trim_from_cache(op->get_oid(), op->get_offset(), op->get_length()); - - oncommit->ack(); - } - -} - - - -// replicated - - - - -void ReplicatedPG::op_rep_modify(MOSDOp *op) -{ - object_t oid = op->get_oid(); - eversion_t nv = op->get_version(); - - const char *opname = MOSDOp::get_opname(op->get_op()); - - // check crev - objectrev_t crev = 0; - osd->store->getattr(oid, "crev", (char*)&crev, sizeof(crev)); - - dout(10) << "op_rep_modify " << opname - << " " << oid - << " v " << nv - << " " << op->get_offset() << "~" << op->get_length() - << dendl; - - // note peer's stat - int fromosd = op->get_source().num(); - osd->take_peer_stat(fromosd, op->get_peer_stat()); - - // we better not be missing this. 
- assert(!missing.is_missing(oid)); - - // prepare our transaction - ObjectStore::Transaction t; - - // am i acker? - RepGather *repop = 0; - int ackerosd = acting[0]; - - if ((g_conf.osd_rep == OSD_REP_CHAIN || g_conf.osd_rep == OSD_REP_SPLAY)) { - ackerosd = get_acker(); - - if (is_acker()) { - // i am tail acker. - if (rep_gather.count(op->get_rep_tid())) { - repop = rep_gather[ op->get_rep_tid() ]; - } else { - repop = new_rep_gather(op); - } - - // infer ack from source - get_rep_gather(repop); - { - //assert(repop->waitfor_ack.count(fromosd)); // no, we may come thru here twice. - repop->waitfor_ack.erase(fromosd); - } - put_rep_gather(repop); - - // prepare dest socket - //messenger->prepare_send_message(op->get_client()); - } - - // chain? forward? - if (g_conf.osd_rep == OSD_REP_CHAIN && !is_acker()) { - // chain rep, not at the tail yet. - int myrank = osd->osdmap->calc_pg_rank(osd->get_nodeid(), acting); - int next = myrank+1; - if (next == (int)acting.size()) - next = 1; - issue_repop(op, acting[next], g_clock.now()); - } - } - - // do op? - C_OSD_RepModifyCommit *oncommit = 0; - - osd->logger->inc("r_wr"); - osd->logger->inc("r_wrb", op->get_length()); - - if (repop) { - // acker. we'll apply later. - if (op->get_op() != OSD_OP_WRNOOP) { - prepare_log_transaction(repop->t, op, nv, crev, op->get_rev(), op->get_pg_trim_to()); - prepare_op_transaction(repop->t, op, nv, crev, op->get_rev()); - } - } else { - // middle|replica. - if (op->get_op() != OSD_OP_WRNOOP) { - prepare_log_transaction(t, op, nv, crev, op->get_rev(), op->get_pg_trim_to()); - prepare_op_transaction(t, op, nv, crev, op->get_rev()); - } - - oncommit = new C_OSD_RepModifyCommit(this, op, ackerosd, info.last_complete); - - // apply log update. and possibly update itself. 
- unsigned tr = osd->store->apply_transaction(t, oncommit); - if (tr != 0 && // no errors - tr != 2) { // or error on collection_add - derr(0) << "error applying transaction: r = " << tr << dendl; - assert(tr == 0); - } - } - - // ack? - if (repop) { - // (logical) local ack. this may induce the actual update. - get_rep_gather(repop); - { - assert(repop->waitfor_ack.count(osd->get_nodeid())); - repop->waitfor_ack.erase(osd->get_nodeid()); - } - put_rep_gather(repop); - } - else { - // send ack to acker? - if (g_conf.osd_rep != OSD_REP_CHAIN) { - MOSDOpReply *ack = new MOSDOpReply(op, 0, osd->osdmap->get_epoch(), false); - ack->set_peer_stat(osd->get_my_stat_for(g_clock.now(), ackerosd)); - osd->messenger->send_message(ack, osd->osdmap->get_inst(ackerosd)); - } - - // ack myself. - assert(oncommit); - oncommit->ack(); - } - -} - - -void ReplicatedPG::op_rep_modify_commit(MOSDOp *op, int ackerosd, eversion_t last_complete) -{ - // send commit. - dout(10) << "rep_modify_commit on op " << *op - << ", sending commit to osd" << ackerosd - << dendl; - if (osd->osdmap->is_up(ackerosd)) { - MOSDOpReply *commit = new MOSDOpReply(op, 0, osd->osdmap->get_epoch(), true); - commit->set_pg_complete_thru(last_complete); - commit->set_peer_stat(osd->get_my_stat_for(g_clock.now(), ackerosd)); - osd->messenger->send_message(commit, osd->osdmap->get_inst(ackerosd)); - delete op; - } -} - - - - - - - - - - -// =========================================================== - -/** pull - request object from a peer - */ -void ReplicatedPG::pull(object_t oid) -{ - assert(missing.loc.count(oid)); - eversion_t v = missing.missing[oid]; - int fromosd = missing.loc[oid]; - - dout(7) << "pull " << oid - << " v " << v - << " from osd" << fromosd - << dendl; - - // send op - tid_t tid = osd->get_tid(); - MOSDOp *op = new MOSDOp(osd->messenger->get_myinst(), 0, tid, - oid, info.pgid, - osd->osdmap->get_epoch(), - OSD_OP_PULL); - op->set_version(v); - osd->messenger->send_message(op, 
osd->osdmap->get_inst(fromosd)); - - // take note - assert(objects_pulling.count(oid) == 0); - num_pulling++; - objects_pulling[oid] = v; -} - - -/** push - send object to a peer - */ -void ReplicatedPG::push(object_t oid, int peer) -{ - // read data+attrs - bufferlist bl; - eversion_t v; - int vlen = sizeof(v); - map attrset; - - ObjectStore::Transaction t; - t.read(oid, 0, 0, &bl); - t.getattr(oid, "version", &v, &vlen); - t.getattrs(oid, attrset); - unsigned tr = osd->store->apply_transaction(t); - - assert(tr == 0); // !!! - - // ok - dout(7) << "push " << oid << " v " << v - << " size " << bl.length() - << " to osd" << peer - << dendl; - - osd->logger->inc("r_push"); - osd->logger->inc("r_pushb", bl.length()); - - // send - MOSDOp *op = new MOSDOp(osd->messenger->get_myinst(), 0, osd->get_tid(), - oid, info.pgid, osd->osdmap->get_epoch(), - OSD_OP_PUSH); - op->set_offset(0); - op->set_length(bl.length()); - op->set_data(bl); // note: claims bl, set length above here! - op->set_version(v); - op->set_attrset(attrset); - - osd->messenger->send_message(op, osd->osdmap->get_inst(peer)); - - if (is_primary()) { - peer_missing[peer].got(oid); - pushing[oid].insert(peer); - } -} - - - -/** op_pull - * process request to pull an entire object. - * NOTE: called from opqueue. - */ -void ReplicatedPG::op_pull(MOSDOp *op) -{ - const object_t oid = op->get_oid(); - const eversion_t v = op->get_version(); - int from = op->get_source().num(); - - dout(7) << "op_pull " << oid << " v " << op->get_version() - << " from " << op->get_source() - << dendl; - - // is a replica asking? are they missing it? - if (is_primary()) { - // primary - assert(peer_missing.count(from)); // we had better know this, from the peering process. - - if (!peer_missing[from].is_missing(oid)) { - dout(7) << "op_pull replica isn't actually missing it, we must have already pushed to them" << dendl; - delete op; - return; - } - - // do we have it yet? 
- if (is_missing_object(oid)) { - wait_for_missing_object(oid, op); - return; - } - } else { - // non-primary - if (missing.is_missing(oid)) { - dout(7) << "op_pull not primary, and missing " << oid << ", ignoring" << dendl; - delete op; - return; - } - } - - // push it back! - push(oid, op->get_source().num()); -} - - -/** op_push - * NOTE: called from opqueue. - */ -void ReplicatedPG::op_push(MOSDOp *op) -{ - object_t oid = op->get_oid(); - eversion_t v = op->get_version(); - - if (!is_missing_object(oid)) { - dout(7) << "op_push not missing " << oid << dendl; - return; - } - - dout(7) << "op_push " - << oid - << " v " << v - << " size " << op->get_length() << " " << op->get_data().length() - << dendl; - - assert(op->get_data().length() == op->get_length()); - - // write object and add it to the PG - ObjectStore::Transaction t; - t.remove(oid); // in case old version exists - t.write(oid, 0, op->get_length(), op->get_data()); - t.setattrs(oid, op->get_attrset()); - t.collection_add(info.pgid, oid); - - // close out pull op? - num_pulling--; - if (objects_pulling.count(oid)) - objects_pulling.erase(oid); - missing.got(oid, v); - - - // raise last_complete? - assert(log.complete_to != log.log.end()); - while (log.complete_to != log.log.end()) { - if (missing.missing.count(log.complete_to->oid)) break; - if (info.last_complete < log.complete_to->version) - info.last_complete = log.complete_to->version; - log.complete_to++; - } - dout(10) << "last_complete now " << info.last_complete << dendl; - - - // apply to disk! - t.collection_setattr(info.pgid, "info", &info, sizeof(info)); - unsigned r = osd->store->apply_transaction(t); - assert(r == 0); - - - - // am i primary? are others missing this too? - if (is_primary()) { - for (unsigned i=1; itake_waiters(waiting_for_missing_object[oid]); - waiting_for_missing_object.erase(oid); - } - - if (is_primary()) { - // continue recovery - do_recovery(); - } else { - // ack if i'm a replica and being pushed to. 
- MOSDOpReply *reply = new MOSDOpReply(op, 0, osd->osdmap->get_epoch(), true); - osd->messenger->send_message(reply, op->get_source_inst()); - } - - delete op; -} - - - - - - -void ReplicatedPG::note_failed_osd(int o) -{ - dout(10) << "note_failed_osd " << o << dendl; - // do async; repop_ack() may modify pg->repop_gather - list ls; - for (hash_map::iterator p = rep_gather.begin(); - p != rep_gather.end(); - p++) { - //dout(-1) << "checking repop tid " << p->first << dendl; - if (p->second->waitfor_ack.count(o) || - p->second->waitfor_commit.count(o)) - ls.push_back(p->second); - } - for (list::iterator p = ls.begin(); - p != ls.end(); - p++) - repop_ack(*p, -1, true, o); -} - - -void ReplicatedPG::on_acker_change() -{ - dout(10) << "on_acker_change" << dendl; - - if (g_conf.osd_rep == OSD_REP_PRIMARY) { - // we're fine. - // note that note_failed_osd() above shoudl ahve implicitly acked/committed - // from the failed guy. - } else { - // for splay or chain replication, any change is significant. - // apply repops - for (hash_map::iterator p = rep_gather.begin(); - p != rep_gather.end(); - p++) { - if (!p->second->applied) - apply_repop(p->second); - delete p->second->op; - delete p->second; - } - rep_gather.clear(); - - // and repop waiters - for (hash_map >::iterator p = waiting_for_repop.begin(); - p != waiting_for_repop.end(); - p++) - for (list::iterator pm = p->second.begin(); - pm != p->second.end(); - pm++) - delete *pm; - waiting_for_repop.clear(); - } -} - - -void ReplicatedPG::on_role_change() -{ - dout(10) << "on_role_change" << dendl; - - // take object waiters - for (hash_map >::iterator it = waiting_for_missing_object.begin(); - it != waiting_for_missing_object.end(); - it++) - osd->take_waiters(it->second); - waiting_for_missing_object.clear(); -} - - - - - - - - - -/** clean_up_local - * remove any objects that we're storing but shouldn't. - * as determined by log. 
- */ -void ReplicatedPG::clean_up_local(ObjectStore::Transaction& t) -{ - dout(10) << "clean_up_local" << dendl; - - assert(info.last_update >= log.bottom); // otherwise we need some help! - - if (log.backlog) { - // be thorough. - list ls; - osd->store->collection_list(info.pgid, ls); - set s; - - for (list::iterator i = ls.begin(); - i != ls.end(); - i++) - s.insert(*i); - - set did; - for (list::reverse_iterator p = log.log.rbegin(); - p != log.log.rend(); - p++) { - if (did.count(p->oid)) continue; - did.insert(p->oid); - - if (p->is_delete()) { - if (s.count(p->oid)) { - dout(10) << " deleting " << p->oid - << " when " << p->version << dendl; - t.remove(p->oid); - } - s.erase(p->oid); - } else { - // just leave old objects.. they're missing or whatever - s.erase(p->oid); - } - } - - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - dout(10) << " deleting stray " << *i << dendl; - t.remove(*i); - } - - } else { - // just scan the log. - set did; - for (list::reverse_iterator p = log.log.rbegin(); - p != log.log.rend(); - p++) { - if (did.count(p->oid)) continue; - did.insert(p->oid); - - if (p->is_delete()) { - dout(10) << " deleting " << p->oid - << " when " << p->version << dendl; - t.remove(p->oid); - } else { - // keep old(+missing) objects, just for kicks. - } - } - } -} - - - -void ReplicatedPG::cancel_recovery() -{ - // forget about where missing items are, or anything we're pulling - missing.loc.clear(); - osd->num_pulling -= objects_pulling.size(); - objects_pulling.clear(); - num_pulling = 0; - pushing.clear(); -} - -/** - * do one recovery op. - * return true if done, false if nothing left to do. - */ -bool ReplicatedPG::do_recovery() -{ - assert(is_primary()); - /*if (!is_primary()) { - dout(10) << "do_recovery not primary, doing nothing" << dendl; - return true; - } - */ - - if (info.is_uptodate()) { // am i up to date? 
- if (!is_all_uptodate()) { - dout(-10) << "do_recovery i'm clean but replicas aren't, starting peer recovery" << dendl; - do_peer_recovery(); - } else { - dout(-10) << "do_recovery all clean, nothing to do" << dendl; - } - return true; - } - - dout(-10) << "do_recovery pulling " << objects_pulling.size() << " in pg, " - << osd->num_pulling << "/" << g_conf.osd_max_pull << " total" - << dendl; - dout(10) << "do_recovery " << missing << dendl; - - // can we slow down on this PG? - if (osd->num_pulling >= g_conf.osd_max_pull && !objects_pulling.empty()) { - dout(-10) << "do_recovery already pulling max, waiting" << dendl; - return true; - } - - // look at log! - Log::Entry *latest = 0; - - while (log.requested_to != log.log.end()) { - assert(log.objects.count(log.requested_to->oid)); - latest = log.objects[log.requested_to->oid]; - assert(latest); - - dout(10) << "do_recovery " - << *log.requested_to - << (objects_pulling.count(latest->oid) ? " (pulling)":"") - << dendl; - - if (latest->is_update() && - !objects_pulling.count(latest->oid) && - missing.is_missing(latest->oid)) { - pull(latest->oid); - return true; - } - - log.requested_to++; - } - - if (!objects_pulling.empty()) { - dout(7) << "do_recovery requested everything, still waiting" << dendl; - return false; - } - - // done? - assert(missing.num_missing() == 0); - assert(info.last_complete == info.last_update); - - if (is_primary()) { - // i am primary - dout(-7) << "do_recovery complete, cleaning strays" << dendl; - uptodate_set.insert(osd->whoami); - if (is_all_uptodate()) - finish_recovery(); - } else { - // tell primary - dout(7) << "do_recovery complete, telling primary" << dendl; - list ls; - ls.push_back(info); - osd->messenger->send_message(new MOSDPGNotify(osd->osdmap->get_epoch(), - ls), - osd->osdmap->get_inst(get_primary())); - } - - return false; -} - -void ReplicatedPG::do_peer_recovery() -{ - dout(-10) << "do_peer_recovery" << dendl; - - // this is FAR from an optimal recovery order. 
pretty lame, really. - for (unsigned i=0; isecond; - eversion_t v = peer_missing[peer].rmissing.begin()->first; - - push(oid, peer); - - // do other peers need it too? - for (i++; iget_source() << " " << *reply << dendl; - - int peer = reply->get_source().num(); - object_t oid = reply->get_oid(); - - if (pushing.count(oid) && - pushing[oid].count(peer)) { - pushing[oid].erase(peer); - - if (peer_missing.count(peer) == 0 || - peer_missing[peer].num_missing() == 0) - uptodate_set.insert(peer); - - if (pushing[oid].empty()) { - dout(10) << "pushed " << oid << " to all replicas" << dendl; - do_peer_recovery(); - } else { - dout(10) << "pushed " << oid << ", still waiting for push ack from " - << pushing[oid] << dendl; - } - } else { - dout(10) << "huh, i wasn't pushing " << oid << dendl; - } - delete reply; -} - -void ReplicatedPG::purge_strays() -{ - dout(10) << "purge_strays " << stray_set << dendl; - - for (set::iterator p = stray_set.begin(); - p != stray_set.end(); - p++) { - dout(10) << "sending PGRemove to osd" << *p << dendl; - set ls; - ls.insert(info.pgid); - MOSDPGRemove *m = new MOSDPGRemove(osd->osdmap->get_epoch(), ls); - osd->messenger->send_message(m, osd->osdmap->get_inst(*p)); - } - - stray_set.clear(); -} - diff --git a/branches/sage/crush/osd/ReplicatedPG.h b/branches/sage/crush/osd/ReplicatedPG.h deleted file mode 100644 index ab44026b43fb2..0000000000000 --- a/branches/sage/crush/osd/ReplicatedPG.h +++ /dev/null @@ -1,170 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __REPLICATEDPG_H -#define __REPLICATEDPG_H - - -#include "PG.h" - -#include "messages/MOSDOp.h" - - -class ReplicatedPG : public PG { -public: - /* - * gather state on the primary/head while replicating an osd op. - */ - class RepGather { - public: - class MOSDOp *op; - tid_t rep_tid; - - ObjectStore::Transaction t; - bool applied; - - set waitfor_ack; - set waitfor_commit; - - utime_t start; - - bool sent_ack, sent_commit; - - set osds; - eversion_t new_version; - - eversion_t pg_local_last_complete; - map pg_complete_thru; - - RepGather(MOSDOp *o, tid_t rt, eversion_t nv, eversion_t lc) : - op(o), rep_tid(rt), - applied(false), - sent_ack(false), sent_commit(false), - new_version(nv), - pg_local_last_complete(lc) { } - - bool can_send_ack() { - return !sent_ack && !sent_commit && - waitfor_ack.empty(); - } - bool can_send_commit() { - return !sent_commit && - waitfor_ack.empty() && waitfor_commit.empty(); - } - bool can_delete() { - return waitfor_ack.empty() && waitfor_commit.empty(); - } - }; - -protected: - // replica ops - // [primary|tail] - hash_map rep_gather; - hash_map > waiting_for_repop; - - // load balancing - set balancing_reads; - set unbalancing_reads; - hash_map > waiting_for_unbalanced_reads; // i.e. 
primary-lock - - void get_rep_gather(RepGather*); - void apply_repop(RepGather *repop); - void put_rep_gather(RepGather*); - void issue_repop(MOSDOp *op, int osd, utime_t now); - RepGather *new_rep_gather(MOSDOp *op); - void repop_ack(RepGather *repop, - int result, bool commit, - int fromosd, eversion_t pg_complete_thru=0); - - // push/pull - int num_pulling; - map > pushing; - - void push(object_t oid, int dest); - void pull(object_t oid); - - // modify - objectrev_t assign_version(MOSDOp *op); - void op_modify_commit(tid_t rep_tid, eversion_t pg_complete_thru); - void op_rep_modify_commit(MOSDOp *op, int ackerosd, eversion_t last_complete); - - void prepare_log_transaction(ObjectStore::Transaction& t, - MOSDOp *op, eversion_t& version, - objectrev_t crev, objectrev_t rev, - eversion_t trim_to); - void prepare_op_transaction(ObjectStore::Transaction& t, - MOSDOp *op, eversion_t& version, - objectrev_t crev, objectrev_t rev); - - friend class C_OSD_ModifyCommit; - friend class C_OSD_RepModifyCommit; - - - // pg on-disk content - void clean_up_local(ObjectStore::Transaction& t); - - void cancel_recovery(); - bool do_recovery(); - void do_peer_recovery(); - - void purge_strays(); - - - void op_read(MOSDOp *op); - void op_modify(MOSDOp *op); - void op_rep_modify(MOSDOp *op); - void op_push(MOSDOp *op); - void op_pull(MOSDOp *op); - - void op_push_reply(MOSDOpReply *reply); - - -public: - ReplicatedPG(OSD *o, pg_t p) : - PG(o,p), - num_pulling(0) - { } - ~ReplicatedPG() {} - - bool preprocess_op(MOSDOp *op, utime_t now); - void do_op(MOSDOp *op); - void do_op_reply(MOSDOpReply *r); - - bool same_for_read_since(epoch_t e); - bool same_for_modify_since(epoch_t e); - bool same_for_rep_modify_since(epoch_t e); - - bool is_missing_object(object_t oid); - void wait_for_missing_object(object_t oid, MOSDOp *op); - - void note_failed_osd(int o); - void on_acker_change(); - void on_role_change(); - -}; - - -inline ostream& operator<<(ostream& out, ReplicatedPG::RepGather& 
repop) -{ - out << "repgather(" << &repop << " rep_tid=" << repop.rep_tid - << " wfack=" << repop.waitfor_ack - << " wfcommit=" << repop.waitfor_commit; - out << " pct=" << repop.pg_complete_thru; - out << " op=" << *(repop.op); - out << " repop=" << &repop; - out << ")"; - return out; -} - - -#endif diff --git a/branches/sage/crush/osd/osd_types.h b/branches/sage/crush/osd/osd_types.h deleted file mode 100644 index 0ae9d0831b0d7..0000000000000 --- a/branches/sage/crush/osd/osd_types.h +++ /dev/null @@ -1,321 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __OSD_TYPES_H -#define __OSD_TYPES_H - -#include "msg/msg_types.h" -#include "include/types.h" - -/* osdreqid_t - caller name + incarnation# + tid to unique identify this request - * use for metadata and osd ops. - */ -class osdreqid_t { -public: - entity_name_t name; // who - int32_t inc; // incarnation - tid_t tid; - osdreqid_t() : inc(0), tid(0) {} - osdreqid_t(const entity_name_t& a, int i, tid_t t) : name(a), inc(i), tid(t) {} -}; - -inline ostream& operator<<(ostream& out, const osdreqid_t& r) { - return out << r.name << "." 
<< r.inc << ":" << r.tid; -} - -inline bool operator==(const osdreqid_t& l, const osdreqid_t& r) { - return (l.name == r.name) && (l.inc == r.inc) && (l.tid == r.tid); -} -inline bool operator!=(const osdreqid_t& l, const osdreqid_t& r) { - return (l.name != r.name) || (l.inc != r.inc) || (l.tid != r.tid); -} -inline bool operator<(const osdreqid_t& l, const osdreqid_t& r) { - return (l.name < r.name) || (l.inc < r.inc) || - (l.name == r.name && l.inc == r.inc && l.tid < r.tid); -} -inline bool operator<=(const osdreqid_t& l, const osdreqid_t& r) { - return (l.name < r.name) || (l.inc < r.inc) || - (l.name == r.name && l.inc == r.inc && l.tid <= r.tid); -} -inline bool operator>(const osdreqid_t& l, const osdreqid_t& r) { return !(l <= r); } -inline bool operator>=(const osdreqid_t& l, const osdreqid_t& r) { return !(l < r); } - -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const osdreqid_t &r) const { - static blobhash H; - return H((const char*)&r, sizeof(r)); - } - }; -} - - - -// osd types -typedef uint64_t coll_t; // collection id - -// pg stuff - -#define PG_INO 4 // this should match mds/mdstypes.h MDS_INO_PG - -typedef uint16_t ps_t; -typedef uint8_t pruleset_t; - - -// crush rule ids -#define CRUSH_REP_RULE(nrep) (nrep) // replication -#define CRUSH_RAID_RULE(num) (10+num) // raid - - - -// placement group id -struct pg_t { -public: - static const int TYPE_REP = CEPH_PG_TYPE_REP; - static const int TYPE_RAID4 = CEPH_PG_TYPE_RAID4; - -private: - union ceph_pg u; - -public: - pg_t() { u.pg64 = 0; } - pg_t(const pg_t& o) { u.pg64 = o.u.pg64; } - pg_t(int type, int size, ps_t seed, int pref) {//, pruleset_t r=0) { - u.pg.type = type; - u.pg.size = size; - u.pg.ps = seed; - u.pg.preferred = pref; // hack: avoid negative. 
- //u.pg.ruleset = r; - assert(sizeof(u.pg) == sizeof(u.pg64)); - } - pg_t(uint64_t v) { u.pg64 = v; } - - int type() { return u.pg.type; } - bool is_rep() { return type() == TYPE_REP; } - bool is_raid4() { return type() == TYPE_RAID4; } - - int size() { return u.pg.size; } - ps_t ps() { return u.pg.ps; } - //pruleset_t ruleset() { return u.pg.ruleset; } - int preferred() { return u.pg.preferred; } // hack: avoid negative. - - /* - pg_t operator=(uint64_t v) { u.val = v; return *this; } - pg_t operator&=(uint64_t v) { u.val &= v; return *this; } - pg_t operator+=(pg_t o) { u.val += o.val; return *this; } - pg_t operator-=(pg_t o) { u.val -= o.val; return *this; } - pg_t operator++() { ++u.val; return *this; } - */ - operator uint64_t() const { return u.pg64; } - - object_t to_object() const { return object_t(PG_INO, u.pg64 >> 32, u.pg64 & 0xffffffff); } -}; - -inline ostream& operator<<(ostream& out, pg_t pg) -{ - if (pg.is_rep()) - out << pg.size() << 'x'; - else if (pg.is_raid4()) - out << pg.size() << 'r'; - else - out << pg.size() << '?'; - - //if (pg.ruleset()) - //out << (int)pg.ruleset() << 's'; - - out << hex << pg.ps() << dec; - - if (pg.preferred() >= 0) - out << 'p' << pg.preferred(); - - //out << "=" << hex << (__uint64_t)pg << dec; - return out; -} - -namespace __gnu_cxx { - template<> struct hash< pg_t > - { - size_t operator()( const pg_t& x ) const - { - static rjhash H; - return H(x); - } - }; -} - - - - - -/** ObjectLayout - * - * describes an object's placement and layout in the storage cluster. - * most importatly, which pg it belongs to. - * if that pg is raided, it also specifies the object's stripe_unit. 
- */ -struct ObjectLayout { - pg_t pgid; // what pg do i belong to - int32_t stripe_unit; // for object raid in raid pgs - - ObjectLayout() : pgid(0), stripe_unit(0) { } - ObjectLayout(pg_t p, int su=0) : pgid(p), stripe_unit(su) { } -}; - -inline ostream& operator<<(ostream& out, const ObjectLayout &ol) -{ - out << "pg" << ol.pgid; - if (ol.stripe_unit) - out << ".su=" << ol.stripe_unit; - return out; -} - - - -// compound rados version type -class eversion_t { -public: - epoch_t epoch; - version_t version; - eversion_t(epoch_t e=0, version_t v=0) : epoch(e), version(v) {} -}; - -inline bool operator==(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) && (l.version == r.version); -} -inline bool operator!=(const eversion_t& l, const eversion_t& r) { - return (l.epoch != r.epoch) || (l.version != r.version); -} -inline bool operator<(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version < r.version):(l.epoch < r.epoch); -} -inline bool operator<=(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version <= r.version):(l.epoch <= r.epoch); -} -inline bool operator>(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version > r.version):(l.epoch > r.epoch); -} -inline bool operator>=(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version >= r.version):(l.epoch >= r.epoch); -} -inline ostream& operator<<(ostream& out, const eversion_t e) { - return out << e.epoch << "'" << e.version; -} - - - -/** osd_stat - * aggregate stats for an osd - */ -struct osd_stat_t { - int64_t num_blocks; - int64_t num_blocks_avail; - int64_t num_objects; - - osd_stat_t() : num_blocks(0), num_blocks_avail(0), num_objects(0) {} -}; - - -/** pg_stat - * aggregate stats for a single PG. 
- */ -struct pg_stat_t { - eversion_t reported; - - int32_t state; - int64_t size; // in bytes - int64_t num_blocks; // in 4k blocks - int64_t num_objects; - - pg_stat_t() : reported(0), state(0), size(0), num_blocks(0), num_objects(0) {} -}; - - - -struct osd_peer_stat_t { - utime_t stamp; - double oprate; - double qlen; - double recent_qlen; - double read_latency; - double read_latency_mine; - double frac_rd_ops_shed_in; - double frac_rd_ops_shed_out; - osd_peer_stat_t() : oprate(0), qlen(0), recent_qlen(0), - read_latency(0), read_latency_mine(0), - frac_rd_ops_shed_in(0), frac_rd_ops_shed_out(0) {} -}; - -inline ostream& operator<<(ostream& out, const osd_peer_stat_t &stat) { - return out << "stat(" << stat.stamp - //<< " oprate=" << stat.oprate - // << " qlen=" << stat.qlen - // << " recent_qlen=" << stat.recent_qlen - << " rdlat=" << stat.read_latency_mine << " / " << stat.read_latency - << " fshedin=" << stat.frac_rd_ops_shed_in - << ")"; -} - -// ----------------------------------------- - -class ObjectExtent { - public: - object_t oid; // object id - off_t start; // in object - size_t length; // in object - - ObjectLayout layout; // object layout (pgid, etc.) - - map buffer_extents; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!) - - ObjectExtent() : start(0), length(0) {} - ObjectExtent(object_t o, off_t s=0, size_t l=0) : oid(o), start(s), length(l) { } -}; - -inline ostream& operator<<(ostream& out, ObjectExtent &ex) -{ - return out << "extent(" - << ex.oid << " in " << ex.layout - << " " << ex.start << "~" << ex.length - << ")"; -} - - - -// --------------------------------------- - -class OSDSuperblock { -public: - const static uint64_t MAGIC = 0xeb0f505dULL; - uint64_t magic; - uint64_t fsid; // unique fs id (random number) - int32_t whoami; // my role in this fs. - epoch_t current_epoch; // most recent epoch - epoch_t oldest_map, newest_map; // oldest/newest maps we have. 
- double weight; - OSDSuperblock(uint64_t f=0, int w=0) : - magic(MAGIC), fsid(f), whoami(w), - current_epoch(0), oldest_map(0), newest_map(0), weight(0) {} -}; - -inline ostream& operator<<(ostream& out, OSDSuperblock& sb) -{ - return out << "sb(fsid " << sb.fsid - << " osd" << sb.whoami - << " e" << sb.current_epoch - << " [" << sb.oldest_map << "," << sb.newest_map - << "])"; -} - - -#endif diff --git a/branches/sage/crush/osdc/Journaler.h b/branches/sage/crush/osdc/Journaler.h deleted file mode 100644 index a90ec5f9e348f..0000000000000 --- a/branches/sage/crush/osdc/Journaler.h +++ /dev/null @@ -1,236 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -/* Journaler - * - * This class stripes a serial log over objects on the store. Four logical pointers: - * - * write_pos - where we're writing new entries - * read_pos - where we're reading old entires - * expire_pos - what is deemed "old" by user - * trimmed_pos - where we're expiring old items - * - * trimmed_pos <= expire_pos <= read_pos <= write_pos. - * - * Often, read_pos <= write_pos (as with MDS log). During recovery, write_pos is undefined - * until the end of the log is discovered. - * - * A "head" struct at the beginning of the log is used to store metadata at - * regular intervals. The basic invariants include: - * - * head.read_pos <= read_pos -- the head may "lag", since it's updated lazily. 
- * head.write_pos <= write_pos - * head.expire_pos <= expire_pos - * head.trimmed_pos <= trimmed_pos - * - * More significantly, - * - * head.expire_pos >= trimmed_pos -- this ensures we can find the "beginning" of the log - * as last recorded, before it is trimmed. trimming will - * block until a sufficiently current expire_pos is committed. - * - * To recover log state, we simply start at the last write_pos in the head, and probe the - * object sequence sizes until we read the end. - * - * Head struct is stored in the first object. Actual journal starts after layout.period() bytes. - * - */ - -#ifndef __JOURNALER_H -#define __JOURNALER_H - -#include "Objecter.h" -#include "Filer.h" - -#include -#include - -class Context; -class Logger; - -class Journaler { - - // this goes at the head of the log "file". - struct Header { - off_t trimmed_pos; - off_t expire_pos; - off_t read_pos; - off_t write_pos; - Header() : trimmed_pos(0), expire_pos(0), read_pos(0), write_pos(0) {} - } last_written, last_committed; - - friend ostream& operator<<(ostream& out, Header &h); - - - // me - inode_t inode; - Objecter *objecter; - Filer filer; - - Logger *logger; - - Mutex *lock; - SafeTimer timer; - - class C_DelayFlush : public Context { - Journaler *journaler; - public: - C_DelayFlush(Journaler *j) : journaler(j) {} - void finish(int r) { - journaler->delay_flush_event = 0; - journaler->_do_flush(); - } - } *delay_flush_event; - - - // my state - static const int STATE_UNDEF = 0; - static const int STATE_READHEAD = 1; - static const int STATE_PROBING = 2; - static const int STATE_ACTIVE = 2; - - int state; - - // header - utime_t last_wrote_head; - void _finish_write_head(Header &wrote, Context *oncommit); - class C_WriteHead; - friend class C_WriteHead; - - list waitfor_recover; - void _finish_read_head(int r, bufferlist& bl); - void _finish_probe_end(int r, off_t end); - class C_ReadHead; - friend class C_ReadHead; - class C_ProbeEnd; - friend class C_ProbeEnd; - - - - // 
writer - off_t write_pos; // logical write position, where next entry will go - off_t flush_pos; // where we will flush. if write_pos>flush_pos, we're buffering writes. - off_t ack_pos; // what has been acked. - bufferlist write_buf; // write buffer. flush_pos + write_buf.length() == write_pos. - - std::map pending_flush; // start offsets and times for pending flushes - std::map > waitfor_flush; // when flushed through given offset - - void _do_flush(); - void _finish_flush(int r, off_t start); - class C_Flush; - friend class C_Flush; - - // reader - off_t read_pos; // logical read position, where next entry starts. - off_t requested_pos; // what we've requested from OSD. - off_t received_pos; // what we've received from OSD. - bufferlist read_buf; // read buffer. read_pos + read_buf.length() == prefetch_pos. - bufferlist reading_buf; // what i'm reading into - - off_t fetch_len; // how much to read at a time - off_t prefetch_from; // how far from end do we read next chunk - - // for read_entry() in-progress read - bufferlist *read_bl; - Context *on_read_finish; - // for wait_for_readable() - Context *on_readable; - - bool _is_reading() { - return requested_pos > received_pos; - } - void _finish_read(int r); // we just read some (read completion callback) - void _issue_read(off_t len); // read some more - void _prefetch(); // maybe read ahead - class C_Read; - friend class C_Read; - class C_RetryRead; - friend class C_RetryRead; - - // trimmer - off_t expire_pos; // what we're allowed to trim to - off_t trimming_pos; // what we've requested to trim through - off_t trimmed_pos; // what has been trimmed - map > waitfor_trim; - - void _trim_finish(int r, off_t to); - class C_Trim; - friend class C_Trim; - -public: - Journaler(inode_t& inode_, Objecter *obj, Logger *l, Mutex *lk, off_t fl=0, off_t pff=0) : - inode(inode_), objecter(obj), filer(objecter), logger(l), - lock(lk), timer(*lk), delay_flush_event(0), - state(STATE_UNDEF), - write_pos(0), flush_pos(0), 
ack_pos(0), - read_pos(0), requested_pos(0), received_pos(0), - fetch_len(fl), prefetch_from(pff), - read_bl(0), on_read_finish(0), on_readable(0), - expire_pos(0), trimming_pos(0), trimmed_pos(0) - { - // prefetch intelligently. - // (watch out, this is big if you use big objects or weird striping) - if (!fetch_len) - fetch_len = inode.layout.fl_object_size*inode.layout.fl_stripe_count * - g_conf.journaler_prefetch_periods; - if (!prefetch_from) - prefetch_from = fetch_len / 2; - } - - // me - //void open(Context *onopen); - //void claim(Context *onclaim, msg_addr_t from); - - /* reset - * NOTE: we assume the caller knows/has ensured that any objects - * in our sequence do not exist.. e.g. after a MKFS. this is _not_ - * an "erase" method. - */ - void reset(); - void recover(Context *onfinish); - void write_head(Context *onsave=0); - - bool is_active() { return state == STATE_ACTIVE; } - - off_t get_write_pos() const { return write_pos; } - off_t get_read_pos() const { return read_pos; } - off_t get_expire_pos() const { return expire_pos; } - off_t get_trimmed_pos() const { return trimmed_pos; } - - // write - off_t append_entry(bufferlist& bl, Context *onsync = 0); - void flush(Context *onsync = 0); - - // read - void set_read_pos(off_t p) { - assert(requested_pos == received_pos); // we can't cope w/ in-progress read right now. - assert(read_bl == 0); // ... 
- read_pos = requested_pos = received_pos = p; - read_buf.clear(); - } - bool is_readable(); - bool try_read_entry(bufferlist& bl); - void wait_for_readable(Context *onfinish); - void read_entry(bufferlist* bl, Context *onfinish); - - // trim - void set_expire_pos(off_t ep) { expire_pos = ep; } - void trim(); - //bool is_trimmable() { return trimming_pos < expire_pos; } - //void trim(off_t trim_to=0, Context *c=0); -}; - - -#endif diff --git a/branches/sage/crush/osdc/Objecter.cc b/branches/sage/crush/osdc/Objecter.cc deleted file mode 100644 index 84563b0af9720..0000000000000 --- a/branches/sage/crush/osdc/Objecter.cc +++ /dev/null @@ -1,913 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "Objecter.h" -#include "osd/OSDMap.h" -#include "mon/MonMap.h" - -#include "msg/Messenger.h" -#include "msg/Message.h" - -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" - -#include "messages/MOSDFailure.h" - -#include - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) *_dout << dbeginl << g_clock.now() << " " << messenger->get_myname() << ".objecter " -#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) *_derr << dbeginl << g_clock.now() << " " << messenger->get_myname() << ".objecter " - - -// messages ------------------------------ - -void Objecter::init() -{ - assert(client_lock.is_locked()); // otherwise event cancellation is unsafe - timer.add_event_after(g_conf.objecter_tick_interval, new C_Tick(this)); -} - -void Objecter::shutdown() -{ - assert(client_lock.is_locked()); // otherwise event cancellation is unsafe - timer.cancel_all(); -} - - -void Objecter::dispatch(Message *m) -{ - switch (m->get_type()) { - case MSG_OSD_OPREPLY: - handle_osd_op_reply((MOSDOpReply*)m); - break; - - case MSG_OSD_MAP: - handle_osd_map((MOSDMap*)m); - break; - - default: - dout(1) << "don't know message type " << m->get_type() << dendl; - assert(0); - } -} - -void Objecter::handle_osd_map(MOSDMap *m) -{ - assert(osdmap); - - if (m->get_last() <= osdmap->get_epoch()) { - dout(3) << "handle_osd_map ignoring epochs [" - << m->get_first() << "," << m->get_last() - << "] <= " << osdmap->get_epoch() << dendl; - } - else { - dout(3) << "handle_osd_map got epochs [" - << m->get_first() << "," << m->get_last() - << "] > " << osdmap->get_epoch() - << dendl; - - set changed_pgs; - - for (epoch_t e = osdmap->get_epoch() + 1; - e <= m->get_last(); - e++) { - if (m->incremental_maps.count(e)) { - dout(3) << "handle_osd_map decoding incremental epoch " << e << dendl; - OSDMap::Incremental inc; - int off = 0; - 
inc.decode(m->incremental_maps[e], off); - osdmap->apply_incremental(inc); - - // notify messenger - for (map >::iterator i = inc.new_down.begin(); - i != inc.new_down.end(); - i++) - messenger->mark_down(i->second.first.addr); - - } - else if (m->maps.count(e)) { - dout(3) << "handle_osd_map decoding full epoch " << e << dendl; - osdmap->decode(m->maps[e]); - } - else { - dout(3) << "handle_osd_map requesting missing epoch " << osdmap->get_epoch()+1 << dendl; - int mon = monmap->pick_mon(); - messenger->send_message(new MOSDGetMap(osdmap->get_epoch()+1), - monmap->get_inst(mon)); - break; - } - - // scan pgs for changes - scan_pgs(changed_pgs); - - assert(e == osdmap->get_epoch()); - } - - // kick requests who might be timing out on the wrong osds - if (!changed_pgs.empty()) - kick_requests(changed_pgs); - } - - delete m; -} - - -void Objecter::maybe_request_map() -{ - utime_t now; - if (!osdmap) goto yes; - if (last_epoch_requested <= osdmap->get_epoch()) goto yes; - now = g_clock.now(); - if (now - last_epoch_requested_stamp > g_conf.objecter_map_request_interval) goto yes; - return; - - yes: - dout(10) << "maybe_request_map requesting next osd map" << dendl; - last_epoch_requested_stamp = now; - last_epoch_requested = osdmap->get_epoch()+1; - messenger->send_message(new MOSDGetMap(osdmap->get_epoch(), last_epoch_requested), - monmap->get_inst(monmap->pick_mon())); -} - - - -void Objecter::scan_pgs(set& changed_pgs) -{ - dout(10) << "scan_pgs" << dendl; - - for (hash_map::iterator i = pg_map.begin(); - i != pg_map.end(); - i++) { - pg_t pgid = i->first; - PG& pg = i->second; - - // calc new. - vector other; - osdmap->pg_to_acting_osds(pgid, other); - - if (other == pg.acting) - continue; // no change. - - other.swap(pg.acting); - - if (g_conf.osd_rep == OSD_REP_PRIMARY) { - // same primary? - if (!other.empty() && - !pg.acting.empty() && - other[0] == pg.acting[0]) - continue; - } - else if (g_conf.osd_rep == OSD_REP_SPLAY) { - // same primary and acker? 
- if (!other.empty() && - !pg.acting.empty() && - other[0] == pg.acting[0] && - other[other.size() > 1 ? 1:0] == pg.acting[pg.acting.size() > 1 ? 1:0]) - continue; - } - else if (g_conf.osd_rep == OSD_REP_CHAIN) { - // any change is significant. - } - - // changed significantly. - dout(10) << "scan_pgs pg " << pgid - << " (" << pg.active_tids << ")" - << " " << other << " -> " << pg.acting - << dendl; - changed_pgs.insert(pgid); - } -} - -void Objecter::kick_requests(set& changed_pgs) -{ - dout(10) << "kick_requests in pgs " << changed_pgs << dendl; - - for (set::iterator i = changed_pgs.begin(); - i != changed_pgs.end(); - i++) { - pg_t pgid = *i; - PG& pg = pg_map[pgid]; - - // resubmit ops! - set tids; - tids.swap( pg.active_tids ); - close_pg( pgid ); // will pbly reopen, unless it's just commits we're missing - - for (set::iterator p = tids.begin(); - p != tids.end(); - p++) { - tid_t tid = *p; - - if (op_modify.count(tid)) { - OSDModify *wr = op_modify[tid]; - op_modify.erase(tid); - - // WRITE - if (wr->tid_version.count(tid)) { - if (wr->op == OSD_OP_WRITE && - !g_conf.objecter_buffer_uncommitted) { - dout(0) << "kick_requests missing commit, cannot replay: objecter_buffer_uncommitted == FALSE" << dendl; - } else { - dout(3) << "kick_requests missing commit, replay write " << tid - << " v " << wr->tid_version[tid] << dendl; - modifyx_submit(wr, wr->waitfor_commit[tid], tid); - } - } - else if (wr->waitfor_ack.count(tid)) { - dout(3) << "kick_requests missing ack, resub write " << tid << dendl; - modifyx_submit(wr, wr->waitfor_ack[tid], tid); - } - } - - else if (op_read.count(tid)) { - // READ - OSDRead *rd = op_read[tid]; - op_read.erase(tid); - dout(3) << "kick_requests resub read " << tid << dendl; - - // resubmit - readx_submit(rd, rd->ops[tid], true); - rd->ops.erase(tid); - } - - else if (op_stat.count(tid)) { - OSDStat *st = op_stat[tid]; - op_stat.erase(tid); - - dout(3) << "kick_requests resub stat " << tid << dendl; - - // resubmit - 
stat_submit(st); - } - - else - assert(0); - } - } -} - - -void Objecter::tick() -{ - dout(10) << "tick" << dendl; - - // look for laggy pgs - utime_t cutoff = g_clock.now(); - cutoff -= g_conf.objecter_timeout; // timeout - for (hash_map::iterator i = pg_map.begin(); - i != pg_map.end(); - i++) { - if (!i->second.active_tids.empty() && - i->second.last < cutoff) { - dout(10) << "tick pg " << i->first << " is laggy" << dendl; - maybe_request_map(); - break; - } - } - - // reschedule - timer.add_event_after(g_conf.objecter_tick_interval, new C_Tick(this)); -} - - - -void Objecter::handle_osd_op_reply(MOSDOpReply *m) -{ - // read or modify? - switch (m->get_op()) { - case OSD_OP_READ: - handle_osd_read_reply(m); - break; - - case OSD_OP_STAT: - handle_osd_stat_reply(m); - break; - - case OSD_OP_WRNOOP: - case OSD_OP_WRITE: - case OSD_OP_ZERO: - case OSD_OP_DELETE: - case OSD_OP_WRUNLOCK: - case OSD_OP_WRLOCK: - case OSD_OP_RDLOCK: - case OSD_OP_RDUNLOCK: - case OSD_OP_UPLOCK: - case OSD_OP_DNLOCK: - handle_osd_modify_reply(m); - break; - - default: - assert(0); - } -} - - - -// stat ----------------------------------- - -tid_t Objecter::stat(object_t oid, off_t *size, ObjectLayout ol, Context *onfinish) -{ - OSDStat *st = new OSDStat(size); - st->extents.push_back(ObjectExtent(oid, 0, 0)); - st->extents.front().layout = ol; - st->onfinish = onfinish; - - return stat_submit(st); -} - -tid_t Objecter::stat_submit(OSDStat *st) -{ - // find OSD - ObjectExtent &ex = st->extents.front(); - PG &pg = get_pg( ex.layout.pgid ); - - // pick tid - last_tid++; - assert(client_inc >= 0); - - // add to gather set - st->tid = last_tid; - op_stat[last_tid] = st; - - pg.active_tids.insert(last_tid); - - // send? 
- - dout(10) << "stat_submit " << st << " tid " << last_tid - << " oid " << ex.oid - << " " << ex.layout - << " osd" << pg.acker() - << dendl; - - if (pg.acker() >= 0) { - MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, last_tid, - ex.oid, ex.layout, osdmap->get_epoch(), - OSD_OP_STAT); - - messenger->send_message(m, osdmap->get_inst(pg.acker())); - } - - return last_tid; -} - -void Objecter::handle_osd_stat_reply(MOSDOpReply *m) -{ - // get pio - tid_t tid = m->get_tid(); - - if (op_stat.count(tid) == 0) { - dout(7) << "handle_osd_stat_reply " << tid << " ... stray" << dendl; - delete m; - return; - } - - dout(7) << "handle_osd_stat_reply " << tid - << " r=" << m->get_result() - << " size=" << m->get_object_size() - << dendl; - OSDStat *st = op_stat[ tid ]; - op_stat.erase( tid ); - - // remove from osd/tid maps - PG& pg = get_pg( m->get_pg() ); - assert(pg.active_tids.count(tid)); - pg.active_tids.erase(tid); - if (pg.active_tids.empty()) close_pg( m->get_pg() ); - - // success? - if (m->get_result() == -EAGAIN) { - dout(7) << " got -EAGAIN, resubmitting" << dendl; - stat_submit(st); - delete m; - return; - } - //assert(m->get_result() >= 0); - - // ok! 
- if (m->get_result() < 0) { - *st->size = -1; - } else { - *st->size = m->get_object_size(); - } - - // finish, clean up - Context *onfinish = st->onfinish; - - // done - delete st; - if (onfinish) { - onfinish->finish(m->get_result()); - delete onfinish; - } - - delete m; -} - - -// read ----------------------------------- - - -tid_t Objecter::read(object_t oid, off_t off, size_t len, ObjectLayout ol, bufferlist *bl, - Context *onfinish) -{ - OSDRead *rd = new OSDRead(bl); - rd->extents.push_back(ObjectExtent(oid, off, len)); - rd->extents.front().layout = ol; - readx(rd, onfinish); - return last_tid; -} - - -tid_t Objecter::readx(OSDRead *rd, Context *onfinish) -{ - rd->onfinish = onfinish; - - // issue reads - for (list::iterator it = rd->extents.begin(); - it != rd->extents.end(); - it++) - readx_submit(rd, *it); - - return last_tid; -} - -tid_t Objecter::readx_submit(OSDRead *rd, ObjectExtent &ex, bool retry) -{ - // find OSD - PG &pg = get_pg( ex.layout.pgid ); - - // pick tid - last_tid++; - assert(client_inc >= 0); - - // add to gather set - rd->ops[last_tid] = ex; - op_read[last_tid] = rd; - - pg.active_tids.insert(last_tid); - pg.last = g_clock.now(); - - // send? 
- dout(10) << "readx_submit " << rd << " tid " << last_tid - << " oid " << ex.oid << " " << ex.start << "~" << ex.length - << " (" << ex.buffer_extents.size() << " buffer fragments)" - << " " << ex.layout - << " osd" << pg.acker() - << dendl; - - if (pg.acker() >= 0) { - MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, last_tid, - ex.oid, ex.layout, osdmap->get_epoch(), - OSD_OP_READ); - m->set_length(ex.length); - m->set_offset(ex.start); - m->set_retry_attempt(retry); - - int who = pg.acker(); - if (rd->balance_reads) { - int replica = messenger->get_myname().num() % pg.acting.size(); - who = pg.acting[replica]; - dout(-10) << "readx_submit reading from random replica " << replica - << " = osd" << who << dendl; - } - messenger->send_message(m, osdmap->get_inst(who)); - } else - maybe_request_map(); - - return last_tid; -} - - -void Objecter::handle_osd_read_reply(MOSDOpReply *m) -{ - // get pio - tid_t tid = m->get_tid(); - - if (op_read.count(tid) == 0) { - dout(7) << "handle_osd_read_reply " << tid << " ... stray" << dendl; - delete m; - return; - } - - dout(7) << "handle_osd_read_reply " << tid << dendl; - OSDRead *rd = op_read[ tid ]; - op_read.erase( tid ); - - // remove from osd/tid maps - PG& pg = get_pg( m->get_pg() ); - assert(pg.active_tids.count(tid)); - pg.active_tids.erase(tid); - if (pg.active_tids.empty()) close_pg( m->get_pg() ); - - // our op finished - rd->ops.erase(tid); - - // success? - if (m->get_result() == -EAGAIN) { - dout(7) << " got -EAGAIN, resubmitting" << dendl; - readx_submit(rd, rd->ops[tid], true); - delete m; - return; - } - //assert(m->get_result() >= 0); - - // what buffer offset are we? 
- dout(7) << " got frag from " << m->get_oid() << " " - << m->get_offset() << "~" << m->get_length() - << ", still have " << rd->ops.size() << " more ops" << dendl; - - if (rd->ops.empty()) { - // all done - size_t bytes_read = 0; - - if (rd->read_data.size()) { - dout(15) << " assembling frags" << dendl; - - /** FIXME This doesn't handle holes efficiently. - * It allocates zero buffers to fill whole buffer, and - * then discards trailing ones at the end. - * - * Actually, this whole thing is pretty messy with temporary bufferlist*'s all over - * the heap. - */ - - // we have other fragments, assemble them all... blech! - rd->read_data[m->get_oid()] = new bufferlist; - rd->read_data[m->get_oid()]->claim( m->get_data() ); - - // map extents back into buffer - map by_off; // buffer offset -> bufferlist - - // for each object extent... - for (list::iterator eit = rd->extents.begin(); - eit != rd->extents.end(); - eit++) { - bufferlist *ox_buf = rd->read_data[eit->oid]; - unsigned ox_len = ox_buf->length(); - unsigned ox_off = 0; - assert(ox_len <= eit->length); - - // for each buffer extent we're mapping into... 
- for (map::iterator bit = eit->buffer_extents.begin(); - bit != eit->buffer_extents.end(); - bit++) { - dout(21) << " object " << eit->oid << " extent " << eit->start << "~" << eit->length << " : ox offset " << ox_off << " -> buffer extent " << bit->first << "~" << bit->second << dendl; - by_off[bit->first] = new bufferlist; - - if (ox_off + bit->second <= ox_len) { - // we got the whole bx - by_off[bit->first]->substr_of(*ox_buf, ox_off, bit->second); - if (bytes_read < bit->first + bit->second) - bytes_read = bit->first + bit->second; - } else if (ox_off + bit->second > ox_len && ox_off < ox_len) { - // we got part of this bx - by_off[bit->first]->substr_of(*ox_buf, ox_off, (ox_len-ox_off)); - if (bytes_read < bit->first + ox_len-ox_off) - bytes_read = bit->first + ox_len-ox_off; - - // zero end of bx - dout(21) << " adding some zeros to the end " << ox_off + bit->second-ox_len << dendl; - bufferptr z(ox_off + bit->second - ox_len); - z.zero(); - by_off[bit->first]->append( z ); - } else { - // we got none of this bx. zero whole thing. - assert(ox_off >= ox_len); - dout(21) << " adding all zeros for this bit " << bit->second << dendl; - bufferptr z(bit->second); - z.zero(); - by_off[bit->first]->append( z ); - } - ox_off += bit->second; - } - assert(ox_off == eit->length); - } - - // sort and string bits together - for (map::iterator it = by_off.begin(); - it != by_off.end(); - it++) { - assert(it->second->length()); - if (it->first < (off_t)bytes_read) { - dout(21) << " concat buffer frag off " << it->first << " len " << it->second->length() << dendl; - rd->bl->claim_append(*(it->second)); - } else { - dout(21) << " NO concat zero buffer frag off " << it->first << " len " << it->second->length() << dendl; - } - delete it->second; - } - - // trim trailing zeros? - if (rd->bl->length() > bytes_read) { - dout(10) << " trimming off trailing zeros . 
bytes_read=" << bytes_read - << " len=" << rd->bl->length() << dendl; - rd->bl->splice(bytes_read, rd->bl->length() - bytes_read); - assert(bytes_read == rd->bl->length()); - } - - // hose p->read_data bufferlist*'s - for (map::iterator it = rd->read_data.begin(); - it != rd->read_data.end(); - it++) { - delete it->second; - } - } else { - dout(15) << " only one frag" << dendl; - - // only one fragment, easy - rd->bl->claim( m->get_data() ); - bytes_read = rd->bl->length(); - } - - // finish, clean up - Context *onfinish = rd->onfinish; - - dout(7) << " " << bytes_read << " bytes " - << rd->bl->length() - << dendl; - - // done - delete rd; - if (onfinish) { - onfinish->finish(bytes_read);// > 0 ? bytes_read:m->get_result()); - delete onfinish; - } - } else { - // store my bufferlist for later assembling - rd->read_data[m->get_oid()] = new bufferlist; - rd->read_data[m->get_oid()]->claim( m->get_data() ); - } - - delete m; -} - - - -// write ------------------------------------ - -tid_t Objecter::write(object_t oid, off_t off, size_t len, ObjectLayout ol, bufferlist &bl, - Context *onack, Context *oncommit) -{ - OSDWrite *wr = new OSDWrite(bl); - wr->extents.push_back(ObjectExtent(oid, off, len)); - wr->extents.front().layout = ol; - wr->extents.front().buffer_extents[0] = len; - modifyx(wr, onack, oncommit); - return last_tid; -} - - -// zero - -tid_t Objecter::zero(object_t oid, off_t off, size_t len, ObjectLayout ol, - Context *onack, Context *oncommit) -{ - OSDModify *z = new OSDModify(OSD_OP_ZERO); - z->extents.push_back(ObjectExtent(oid, off, len)); - z->extents.front().layout = ol; - modifyx(z, onack, oncommit); - return last_tid; -} - - -// lock ops - -tid_t Objecter::lock(int op, object_t oid, ObjectLayout ol, - Context *onack, Context *oncommit) -{ - OSDModify *l = new OSDModify(op); - l->extents.push_back(ObjectExtent(oid, 0, 0)); - l->extents.front().layout = ol; - modifyx(l, onack, oncommit); - return last_tid; -} - - - -// generic modify 
----------------------------------- - -tid_t Objecter::modifyx(OSDModify *wr, Context *onack, Context *oncommit) -{ - wr->onack = onack; - wr->oncommit = oncommit; - - // issue writes/whatevers - for (list::iterator it = wr->extents.begin(); - it != wr->extents.end(); - it++) - modifyx_submit(wr, *it); - - return last_tid; -} - - -tid_t Objecter::modifyx_submit(OSDModify *wr, ObjectExtent &ex, tid_t usetid) -{ - // find - PG &pg = get_pg( ex.layout.pgid ); - - // pick tid - tid_t tid; - if (usetid > 0) - tid = usetid; - else - tid = ++last_tid; - assert(client_inc >= 0); - - // add to gather set - wr->waitfor_ack[tid] = ex; - wr->waitfor_commit[tid] = ex; - op_modify[tid] = wr; - pg.active_tids.insert(tid); - pg.last = g_clock.now(); - - ++num_unacked; - ++num_uncommitted; - - // send? - dout(10) << "modifyx_submit " << MOSDOp::get_opname(wr->op) << " tid " << tid - << " oid " << ex.oid - << " " << ex.start << "~" << ex.length - << " " << ex.layout - << " osd" << pg.primary() - << dendl; - if (pg.primary() >= 0) { - MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, tid, - ex.oid, ex.layout, osdmap->get_epoch(), - wr->op); - m->set_length(ex.length); - m->set_offset(ex.start); - if (usetid > 0) - m->set_retry_attempt(true); - - if (wr->tid_version.count(tid)) - m->set_version(wr->tid_version[tid]); // we're replaying this op! - - // what type of op? 
- switch (wr->op) { - case OSD_OP_WRITE: - { - // map buffer segments into this extent - // (may be fragmented bc of striping) - bufferlist cur; - for (map::iterator bit = ex.buffer_extents.begin(); - bit != ex.buffer_extents.end(); - bit++) - ((OSDWrite*)wr)->bl.copy(bit->first, bit->second, cur); - assert(cur.length() == ex.length); - m->set_data(cur);//.claim(cur); - } - break; - } - - messenger->send_message(m, osdmap->get_inst(pg.primary())); - } else - maybe_request_map(); - - dout(5) << num_unacked << " unacked, " << num_uncommitted << " uncommitted" << dendl; - - return tid; -} - - - -void Objecter::handle_osd_modify_reply(MOSDOpReply *m) -{ - // get pio - tid_t tid = m->get_tid(); - - if (op_modify.count(tid) == 0) { - dout(7) << "handle_osd_modify_reply " << tid - << (m->get_commit() ? " commit":" ack") - << " ... stray" << dendl; - delete m; - return; - } - - dout(7) << "handle_osd_modify_reply " << tid - << (m->get_commit() ? " commit":" ack") - << " v " << m->get_version() - << dendl; - OSDModify *wr = op_modify[ tid ]; - - Context *onack = 0; - Context *oncommit = 0; - - PG &pg = get_pg( m->get_pg() ); - - // ignore? - if (pg.acker() != m->get_source().num()) { - dout(7) << " ignoring ack|commit from non-acker" << dendl; - delete m; - return; - } - - assert(m->get_result() >= 0); - - // ack or commit? - if (m->get_commit()) { - //dout(15) << " handle_osd_write_reply commit on " << tid << dendl; - assert(wr->tid_version.count(tid) == 0 || - m->get_version() == wr->tid_version[tid]); - - // remove from tid/osd maps - assert(pg.active_tids.count(tid)); - pg.active_tids.erase(tid); - if (pg.active_tids.empty()) close_pg( m->get_pg() ); - - // commit. - op_modify.erase( tid ); - wr->waitfor_ack.erase(tid); - wr->waitfor_commit.erase(tid); - - num_uncommitted--; - - if (wr->waitfor_commit.empty()) { - onack = wr->onack; - oncommit = wr->oncommit; - delete wr; - } - } else { - // ack. 
- //dout(15) << " handle_osd_write_reply ack on " << tid << dendl; - assert(wr->waitfor_ack.count(tid)); - wr->waitfor_ack.erase(tid); - - num_unacked--; - - if (wr->tid_version.count(tid) && - wr->tid_version[tid].version != m->get_version().version) { - dout(-10) << "handle_osd_modify_reply WARNING: replay of tid " << tid - << " did not achieve previous ordering" << dendl; - } - wr->tid_version[tid] = m->get_version(); - - if (wr->waitfor_ack.empty()) { - onack = wr->onack; - wr->onack = 0; // only do callback once - - // buffer uncommitted? - if (!g_conf.objecter_buffer_uncommitted && - wr->op == OSD_OP_WRITE) { - // discard buffer! - ((OSDWrite*)wr)->bl.clear(); - } - } - } - - dout(5) << num_unacked << " unacked, " << num_uncommitted << " uncommitted" << dendl; - - // do callbacks - if (onack) { - onack->finish(0); - delete onack; - } - if (oncommit) { - oncommit->finish(0); - delete oncommit; - } - - delete m; -} - - - -void Objecter::ms_handle_failure(Message *m, entity_name_t dest, const entity_inst_t& inst) -{ - if (dest.is_mon()) { - // try a new mon - int mon = monmap->pick_mon(true); - dout(0) << "ms_handle_failure " << dest << " inst " << inst - << ", resending to mon" << mon - << dendl; - messenger->send_message(m, monmap->get_inst(mon)); - } - else if (dest.is_osd()) { - int mon = monmap->pick_mon(); - dout(0) << "ms_handle_failure " << dest << " inst " << inst - << ", dropping and reporting to mon" << mon - << dendl; - messenger->send_message(new MOSDFailure(messenger->get_myinst(), inst, osdmap->get_epoch()), - monmap->get_inst(mon)); - delete m; - } else { - dout(0) << "ms_handle_failure " << dest << " inst " << inst - << ", dropping" << dendl; - delete m; - } -} diff --git a/branches/sage/crush/osdc/Objecter.h b/branches/sage/crush/osdc/Objecter.h deleted file mode 100644 index 82a437aa04f8d..0000000000000 --- a/branches/sage/crush/osdc/Objecter.h +++ /dev/null @@ -1,230 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t 
-*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __OBJECTER_H -#define __OBJECTER_H - -#include "include/types.h" -#include "include/buffer.h" - -#include "osd/OSDMap.h" -#include "messages/MOSDOp.h" - -#include "common/Timer.h" - -#include -#include -#include -using namespace std; -using namespace __gnu_cxx; - -class Context; -class Messenger; -class OSDMap; -class MonMap; -class Message; - -class Objecter { - public: - Messenger *messenger; - MonMap *monmap; - OSDMap *osdmap; - - private: - tid_t last_tid; - int client_inc; - int num_unacked; - int num_uncommitted; - - epoch_t last_epoch_requested; - utime_t last_epoch_requested_stamp; - - void maybe_request_map(); - - Mutex &client_lock; - SafeTimer timer; - - class C_Tick : public Context { - Objecter *ob; - public: - C_Tick(Objecter *o) : ob(o) {} - void finish(int r) { ob->tick(); } - }; - void tick(); - - - /*** track pending operations ***/ - // read - public: - class OSDOp { - public: - list extents; - virtual ~OSDOp() {} - }; - - class OSDRead : public OSDOp { - public: - bufferlist *bl; - Context *onfinish; - map ops; - map read_data; // bits of data as they come back - int balance_reads; // if non-zero, direct reads to a pseudo-random replica - - OSDRead(bufferlist *b) : bl(b), onfinish(0), balance_reads(0) { - bl->clear(); - } - }; - - class OSDStat : public OSDOp { - public: - tid_t tid; - off_t *size; // where the size goes. 
- Context *onfinish; - OSDStat(off_t *s) : tid(0), size(s), onfinish(0) { } - }; - - // generic modify - class OSDModify : public OSDOp { - public: - int op; - list extents; - Context *onack; - Context *oncommit; - map waitfor_ack; - map tid_version; - map waitfor_commit; - - OSDModify(int o) : op(o), onack(0), oncommit(0) {} - }; - - // write (includes the bufferlist) - class OSDWrite : public OSDModify { - public: - bufferlist bl; - OSDWrite(bufferlist &b) : OSDModify(OSD_OP_WRITE), bl(b) {} - }; - - - - private: - // pending ops - hash_map op_stat; - hash_map op_read; - hash_map op_modify; - - /** - * track pending ops by pg - * ...so we can cope with failures, map changes - */ - class PG { - public: - vector acting; - set active_tids; // active ops - utime_t last; - - PG() {} - - // primary - where i write - int primary() { - if (acting.empty()) return -1; - return acting[0]; - } - // acker - where i read, and receive acks from - int acker() { - if (acting.empty()) return -1; - if (g_conf.osd_rep == OSD_REP_PRIMARY) - return acting[0]; - else - return acting[acting.size() > 1 ? 
1:0]; - } - }; - - hash_map pg_map; - - - PG &get_pg(pg_t pgid) { - if (!pg_map.count(pgid)) - osdmap->pg_to_acting_osds(pgid, pg_map[pgid].acting); - return pg_map[pgid]; - } - void close_pg(pg_t pgid) { - assert(pg_map.count(pgid)); - assert(pg_map[pgid].active_tids.empty()); - pg_map.erase(pgid); - } - void scan_pgs(set& chnaged_pgs); - void kick_requests(set& changed_pgs); - - - public: - Objecter(Messenger *m, MonMap *mm, OSDMap *om, Mutex& l) : - messenger(m), monmap(mm), osdmap(om), - last_tid(0), client_inc(-1), - num_unacked(0), num_uncommitted(0), - last_epoch_requested(0), - client_lock(l), timer(l) - { } - ~Objecter() { } - - void init(); - void shutdown(); - - // messages - public: - void dispatch(Message *m); - void handle_osd_op_reply(class MOSDOpReply *m); - void handle_osd_stat_reply(class MOSDOpReply *m); - void handle_osd_read_reply(class MOSDOpReply *m); - void handle_osd_modify_reply(class MOSDOpReply *m); - void handle_osd_lock_reply(class MOSDOpReply *m); - void handle_osd_map(class MOSDMap *m); - - private: - tid_t readx_submit(OSDRead *rd, ObjectExtent& ex, bool retry=false); - tid_t modifyx_submit(OSDModify *wr, ObjectExtent& ex, tid_t tid=0); - tid_t stat_submit(OSDStat *st); - - // public interface - public: - bool is_active() { - return !(op_read.empty() && op_modify.empty()); - } - - int get_client_incarnation() { return client_inc; } - void set_client_incarnation(int inc) { - client_inc = inc; - } - - // med level - tid_t readx(OSDRead *read, Context *onfinish); - tid_t modifyx(OSDModify *wr, Context *onack, Context *oncommit); - //tid_t lockx(OSDLock *l, Context *onack, Context *oncommit); - - // even lazier - tid_t read(object_t oid, off_t off, size_t len, ObjectLayout ol, bufferlist *bl, - Context *onfinish); - tid_t write(object_t oid, off_t off, size_t len, ObjectLayout ol, bufferlist &bl, - Context *onack, Context *oncommit); - tid_t zero(object_t oid, off_t off, size_t len, ObjectLayout ol, - Context *onack, Context *oncommit); 
- tid_t stat(object_t oid, off_t *size, ObjectLayout ol, Context *onfinish); - - tid_t lock(int op, object_t oid, ObjectLayout ol, Context *onack, Context *oncommit); - - - void ms_handle_failure(Message *m, entity_name_t dest, const entity_inst_t& inst); - -}; - -#endif diff --git a/branches/sage/crush/script/adjusttabs.pl b/branches/sage/crush/script/adjusttabs.pl deleted file mode 100755 index 66edff2ac6c02..0000000000000 --- a/branches/sage/crush/script/adjusttabs.pl +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/perl - -my $tablen = shift @ARGV; -my $fn = shift @ARGV; - -my $tab = ' ' x $tablen; -open(I, $fn); -my $f; -my $oldtab = ' ' x 4; -while () { - if (my ($oldlen) = /\-\*\- .*tab-width:(\d)/) { - print "old length was $oldlen\n"; - $oldtab = ' ' x $oldlen; - s/tab-width:\d/tab-width:$tablen/; - } - s/\t/$oldtab/g; - $f .= $_; -} -close I; -open(O, ">$fn.new"); -print O $f; -close O; - -rename "$fn.new", $fn; diff --git a/branches/sage/crush/script/clean_osd_cow.sh b/branches/sage/crush/script/clean_osd_cow.sh deleted file mode 100755 index 1e443c95e7ebc..0000000000000 --- a/branches/sage/crush/script/clean_osd_cow.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh - -rm osddata/*/*\.* diff --git a/branches/sage/crush/script/clean_trace.pl b/branches/sage/crush/script/clean_trace.pl deleted file mode 100755 index cb02ff7abe7c2..0000000000000 --- a/branches/sage/crush/script/clean_trace.pl +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/perl - -my $n = 0; -while (<>) { - next unless /trace: /; - my $l = $'; $'; - print $l; -} diff --git a/branches/sage/crush/script/find_bufferleaks.pl b/branches/sage/crush/script/find_bufferleaks.pl deleted file mode 100755 index 152515d5e788e..0000000000000 --- a/branches/sage/crush/script/find_bufferleaks.pl +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/perl - -use strict; -my %buffers; -my %bufferlists; -my %ref; -my %mal; -my $l = 1; -while (<>) { - #print "$l: $_"; - - # cinode:auth_pin on inode [1000000002625 /gnu/blah_client_created. 
0x89b7700] count now 1 + 0 - - if (/^buffer\.cons /) { - my ($x) = /(0x\S+)/; - $buffers{$x} = 1; - } - if (/^buffer\.des /) { - my ($x) = /(0x\S+)/; - die "des without cons at $l: $_" unless $buffers{$x}; - delete $buffers{$x}; - die "des with ref>0 at $l: $_" unless $ref{$x} == 0; - delete $ref{$x}; - } - - if (/^bufferlist\.cons /) { - my ($x) = /(0x\S+)/; - $bufferlists{$x} = 1; - } - if (/^bufferlist\.des /) { - my ($x) = /(0x\S+)/; - warn "des without cons at $l: $_" unless $bufferlists{$x}; - delete $bufferlists{$x}; - } - - - if (/^buffer\.malloc /) { - my ($x) = /(0x\S+)/; - $mal{$x} = 1; - } - if (/^buffer\.free /) { - my ($x) = /(0x\S+)/; - die "free with malloc at $l: $_" unless $mal{$x}; - delete $mal{$x}; - } - - if (/^buffer\.get /) { - my ($x) = /(0x\S+)/; - $ref{$x}++; - } - if (/^buffer\.get /) { - my ($x) = /(0x\S+)/; - $ref{$x}--; - } - -$l++; -} - -for my $x (keys %bufferlists) { - print "leaked bufferlist $x\n"; -} - -for my $x (keys %buffers) { - print "leaked buffer $x ref $ref{$x}\n"; -} - -for my $x (keys %mal) { - print "leaked buffer dataptr $x ref $ref{$x}\n"; -} diff --git a/branches/sage/crush/script/find_lost_bdev_ops.pl b/branches/sage/crush/script/find_lost_bdev_ops.pl deleted file mode 100755 index ac1793b42dfac..0000000000000 --- a/branches/sage/crush/script/find_lost_bdev_ops.pl +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/perl - -use strict; -my %op; - -my $line = 0; -while (<>) { - #print $line . 
$_ if /0x8d4f6a0/; - chomp; - $line++; - - #bdev(./ebofsdev/0)._submit_io bio(wr 269~1 usemap 0x4de33cc0) - if (my ($bio) = /_submit_io bio\(.*(0x\w+)\)/) { - $op{$bio} = $line; - } - - # cancel - #bdev(./ebofsdev/3)._cancel_io bio(wr 1525~1 bh_write 0x8a437b8) - if (my ($bio) = /_cancel_io bio\(.*(0x\w+)\)/ && - !(/FAILED/)) { - delete $op{$bio}; - } - - # finish - #bdev(./ebofsdev/3).complete_thread finishing bio(wr 1131~1 write_cnode 0x832c1f8) - if (my ($bio) = /complete_thread finishing bio\(.*(0x\w+)\)/) { - delete $op{$bio}; - } - -} - -for my $bio (keys %op) { - print "---- lost bio $bio\n"; -} diff --git a/branches/sage/crush/script/find_lost_commit.pl b/branches/sage/crush/script/find_lost_commit.pl deleted file mode 100755 index 73934248ad5c0..0000000000000 --- a/branches/sage/crush/script/find_lost_commit.pl +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/perl - -use strict; -my %op; - -my $line = 0; -while (<>) { - #print "$line: $_"; - $line++; - - #osd3 do_op MOSDOp(client0.933 oid 100000000000008 0x84b4480) in pg[pginfo(4020000000d v 5662/0 e 2/1) r=0 active (0,5662]] - if (my ($from, $opno, $oid, $op) = /do_op MOSDOp\((\S+) op (\d+) oid (\d+) (\w+)\)/) { -# print "$op\n"; - if ($opno == 2 || $opno == 11 || $opno == 12 || $opno == 14 || $opno == 15) { - $op{$op} = $from; - } - } - - # commits - #osd1 op_modify_commit on op MOSDOp(client1.289 oid 100000100000002 0x51a2f788) - if (my ($op) = /op_modify_commit.* (\w+)\)/) { - delete $op{$op}; - } - #osd4 rep_modify_commit on op MOSDOp(osd3.289 oid 100000000000008 0x84b0980) - if (my ($op) = /rep_modify_commit.* (\w+)\)/) { - delete $op{$op}; - } - - # forwarded? 
- if (my ($op) = /sending (\w+) to osd/) { - delete $op{$op}; - } - -} - -for my $op (keys %op) { - print "---- lost op $op $op{$op}\n"; -} diff --git a/branches/sage/crush/script/find_lost_objecter.pl b/branches/sage/crush/script/find_lost_objecter.pl deleted file mode 100755 index a0c2089140e23..0000000000000 --- a/branches/sage/crush/script/find_lost_objecter.pl +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/perl - -use strict; -my %ack; -my %commit; - -my $line = 0; -while (<>) { - #print "$line: $_"; - $line++; - - #client0.objecter writex_submit tid 21 osd0 oid 100000000000001 851424~100000 - if (my ($who, $tid) = /(\S+)\.objecter writex_submit tid\D+(\d+)\D+osd/) { -# print "$who.$tid\n"; - $ack{"$who.$tid"} = $line; - $commit{"$who.$tid"} = $line; - } - - #client1.objecter handle_osd_write_reply 304 commit 0 - #client1.objecter handle_osd_write_reply 777 commit 1 - if (my ($who, $tid, $commit) = /(\S+)\.objecter handle_osd_write_reply\D+(\d+)\D+commit\D+(\d)/) { -# print "$who.$tid\n"; - delete $ack{"$who.$tid"}; - delete $commit{"$who.$tid"} if $commit; - } - -} - -for my $op (keys %commit) { - print "---- lost commit $op $commit{$op}\n"; -} -for my $op (keys %ack) { - print "---- lost ack $op $commit{$op}\n"; -} diff --git a/branches/sage/crush/script/find_pathpins.pl b/branches/sage/crush/script/find_pathpins.pl deleted file mode 100755 index e4a7d81dfb7b7..0000000000000 --- a/branches/sage/crush/script/find_pathpins.pl +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/perl - -my %pin; -my %hist; -my $l = 1; -my @pins; -while (<>) { - - # cinode:auth_pin on inode [1000000002625 /gnu/blah_client_created. 
0x89b7700] count now 1 + 0 - - if (/path_pinned /) { - my ($dname, $dir) = /\[dentry (\S+) .* in \[dir (\d+) /; - $what = "$dname $dir"; - #print "$l pin $what\n"; - $pin{$what}++; - $hist{$what} .= "$l: $_"; - push( @pins, $what ) unless grep {$_ eq $what} @pins; - } - - # cinode:auth_unpin on inode [1000000002625 (dangling) 0x89b7700] count now 0 + 0 - - if (/path_unpinned/) { - my ($dname, $dir) = /\[dentry (\S+) .* in \[dir (\d+) /; - $what = "$dname $dir"; - #print "$l unpin $what\n"; - $pin{$what}--; - $hist{$what} .= "$l: $_"; - unless ($pin{$what}) { - delete $hist{$what}; - delete $pin{$what}; - @pins = grep {$_ ne $what} @pins; - } - } - $l++; -} - -for my $what (@pins) { - print "---- count $pin{$what} on $what -$hist{$what} -"; -} diff --git a/branches/sage/crush/script/find_requests.pl b/branches/sage/crush/script/find_requests.pl deleted file mode 100755 index 5144896249413..0000000000000 --- a/branches/sage/crush/script/find_requests.pl +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/perl - -my %waiting; # context => what where what is "inode ..." or "dir ..." -my %hist; # context => history since waited -my @waiting; - -my $line = 0; -while (<>) { - - #print $line . 
$_ if /0x8d4f6a0/; - $line++; - if (/request_start/) { - my ($c) = /(0x\w+)/; - my ($what) = $'; #'; - chomp $what; - #print "$line add_waiter $c $what\n" if /0x8d4f6a0/; - $waiting{$c} = $what - if $what && !$waiting{$c}; - $hist{$c} .= "$line: $_"; - unless (grep {$_ eq $c} @waiting) { - push( @waiting, $c ); - } - } - #if (/finish_waiting/) { - # my ($c) = /(0x\w+)/; - # $hist{$c} .= "$line: $_"; - #} - if (/request_finish/ || - /request_forward/) { - my ($c) = /(0x\w+)/; - #print "took\n" if /0x8d4f6a0/; - delete $waiting{$c}; - delete $hist{$c}; - @waiting = grep {$_ ne $c} @waiting; - } -} - -for my $c (@waiting) { - print "---- lost request $c $waiting{$c} -$hist{$c} -"; -} diff --git a/branches/sage/crush/script/find_waiters.pl b/branches/sage/crush/script/find_waiters.pl deleted file mode 100755 index c89d2b1a49db7..0000000000000 --- a/branches/sage/crush/script/find_waiters.pl +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/perl - -my %waiting; # context => what where what is "inode ..." or "dir ..." -my %hist; # context => history since waited -my @waiting; - -my $line = 0; -while (<>) { - #print $line . 
$_ if /0x8d4f6a0/; - $line++; - if (/add_waiter/) { - my ($c) = /(0x\w+)/; - my ($what) = / on (.*\])/; - #print "$line add_waiter $c $what\n" if /0x8d4f6a0/; - $waiting{$c} = $what - if $what && !$waiting{$c}; - $hist{$c} .= "$line: $_"; - unless (grep {$_ eq $c} @waiting) { - push( @waiting, $c ); - } - } - #if (/finish_waiting/) { - # my ($c) = /(0x\w+)/; - # $hist{$c} .= "$line: $_"; - #} - if (/take_waiting/) { - my ($c) = /(0x\w+)/; - if (/SKIPPING/) { - #print "skipping\n" if /0x8d4f6a0/; - $hist{$c} .= "$line: $_"; - } elsif (/took/) { - #print "took\n" if /0x8d4f6a0/; - delete $waiting{$c}; - delete $hist{$c}; - @waiting = grep {$_ ne $c} @waiting; - } else { - die "i don't understand: $_"; - } - } -} - -for my $c (@waiting) { - print "---- lost waiter $c $waiting{$c} -$hist{$c} -"; -} diff --git a/branches/sage/crush/script/grepblock b/branches/sage/crush/script/grepblock deleted file mode 100755 index f5acf95732abb..0000000000000 --- a/branches/sage/crush/script/grepblock +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my $block = shift ARGV; -die unless int $block; - -while (<>) { - my $yes = 0; - for my $x (/(\d+\~\d+)/) { - my ($s,$l) = split(/\~/,$x); - $yes = 1 if ($block >= $s && $block < $s+$l); - } - print if $yes; -} diff --git a/branches/sage/crush/script/merge_trace_rw.pl b/branches/sage/crush/script/merge_trace_rw.pl deleted file mode 100644 index 378d629ef43f6..0000000000000 --- a/branches/sage/crush/script/merge_trace_rw.pl +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my @file = <>; -sub get_op { - my @op = shift @file; - while (@file && - $file[0] !~ /^[a-z]+$/) { - push( @op, shift @file ); - } - #print "op = ( @op )\n"; - return @op; -} - -my $n = 0; -while (@file) { - my ($op, @args) = &get_op; - while ($op eq "read\n" || - $op eq "write\n") { - die unless scalar(@args) == 3; - my ($nop, @nargs) = &get_op; - if ($nop eq $op - && ($args[0] == $nargs[0] ) - && ($args[2] + $args[1] == $nargs[2]) - ) { 
- die unless scalar(@nargs) == 3; - $args[1] += $nargs[1]; - $args[1] .= "\n"; - die unless scalar(@args) == 3; - #print STDOUT "combining $n $op @args\n"; - $n++; - } else { -# print STDERR "not combinging\n"; - unshift( @file, $nop, @nargs ); - die unless scalar(@args) == 3; - last; - } - } - print $op; - print join('', @args); -} diff --git a/branches/sage/crush/script/profonly.pl b/branches/sage/crush/script/profonly.pl deleted file mode 100755 index 6a05dec473ca0..0000000000000 --- a/branches/sage/crush/script/profonly.pl +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/perl - -my $rank = shift @ARGV; -my $args = join(' ',@ARGV); -if ($rank == $ENV{MPD_JRANK}) { - $c = "LD_PRELOAD=$ENV{'HOME'}/csl/obsd/src/pmds/gprof-helper.so ./newsyn $args"; -} else { - $c = "./newsyn.nopg $args"; -} - -#print "$rank: $c\n"; -system $c; diff --git a/branches/sage/crush/script/runset.pl b/branches/sage/crush/script/runset.pl deleted file mode 100755 index 966cf4e5100cb..0000000000000 --- a/branches/sage/crush/script/runset.pl +++ /dev/null @@ -1,380 +0,0 @@ -#!/usr/bin/perl - -use strict; -use Data::Dumper; - -=item sample input file - -# hi there -{ - # startup - 'n' => 30, # mpi nodes - 'sleep' => 10, # seconds between runs - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => 400,#[10, 50, 100, 200, 400], - - # parameters - 'fs' => [ 'ebofs', 'fakestore' ], - 'until' => 150, # --syn until $n ... when to stop clients - 'writefile' => 1, - 'writefile_size' => [ 4096, 65526, 256000, 1024000, 2560000 ], - 'writefile_mb' => 1000, - - 'custom' => '--tcp_skip_rank0 --osd_maxthreads 0'; - - # for final summation (script/sum.pl) - 'start' => 30, - 'end' => 120, - - '_psub' => 'alc.tp' # switch to psub mode! 
-}; - -=cut - -my $usage = "script/runset.pl [--clean] jobs/some/job blah\n"; - -my $clean; -my $use_srun; -my $nobg = '&'; -my $in = shift || die $usage; -if ($in eq '--clean') { - $clean = 1; - $in = shift || die $usage; -} -if ($in eq '--srun') { - $use_srun = 1; - $in = shift || die $usage; -} -if ($in eq '--nobg') { - $nobg = ''; - $in = shift || die $usage; -} -my $tag = shift || die $usage; -my $fake = shift; - - -my ($job) = $in =~ /^jobs\/(.*)/; -my ($jname) = $job =~ /\/(\w+)$/; -$jname ||= $job; -die "not jobs/?" unless defined $job; -my $out = "log/$job.$tag"; -my $relout = "$job.$tag"; - - -my $cwd = `/bin/pwd`; -chomp($cwd); - - - -print "# --- job $job, tag $tag ---\n"; - - -# get input -my $raw = `cat $in`; -my $sim = eval $raw; -unless (ref $sim) { - print "bad input: $in\n"; - system "perl -c $in"; - exit 1; -} - -# prep output -system "mkdir -p $out" unless -d "$out"; - -open(W, ">$out/in"); -print W $raw; -close W; - -my $comb = $sim->{'comb'}; -delete $sim->{'comb'}; -my %filters; -my @fulldirs; - - - -sub reset { - print "reset: restarting mpd in 3 seconds\n"; - system "sleep 3 && (mpiexec -l -n 32 killall newsyn ; restartmpd.sh)"; - print "reset: done\n"; -} - - -if (`hostname` =~ /alc/ && !$use_srun) { - print "# this looks like alc\n"; - $sim->{'_psub'} = 'jobs/alc.tp'; -} - - -sub iterate { - my $sim = shift @_; - my $fix = shift @_ || {}; - my $vary; - my @r; - - my $this; - for my $k (sort keys %$sim) { - next if $k =~ /^_/; - if (defined $fix->{$k}) { - $this->{$k} = $fix->{$k}; - } - elsif (ref $sim->{$k} eq 'HASH') { - # nothing - } - elsif (!(ref $sim->{$k})) { - $this->{$k} = $sim->{$k}; - } - else { - #print ref $sim->{$k}; - if (!(defined $vary)) { - $vary = $k; - } - } - } - - if ($vary) { - #print "vary $vary\n"; - for my $v (@{$sim->{$vary}}) { - $this->{$vary} = $v; - push(@r, &iterate($sim, $this)); - } - } else { - - if ($sim->{'_dep'}) { - my @s = @{$sim->{'_dep'}}; - while (@s) { - my $dv = shift @s; - my $eq = shift @s; - 
- $eq =~ s/\$(\w+)/"\$this->{'$1'}"/eg; - $this->{$dv} = eval $eq; - #print "$dv : $eq -> $this->{$dv}\n"; - } - } - - push(@r, $this); - } - return @r; -} - - - -sub run { - my $h = shift @_; - - my @fn; - my @filt; - my @vals; - for my $k (sort keys %$sim) { - next if $k =~ /^_/; - next unless ref $sim->{$k} eq 'ARRAY'; - push(@fn, "$k=$h->{$k}"); - push(@vals, $h->{$k}); - next if $comb && $k eq $comb->{'x'}; - push(@filt, "$k=$h->{$k}"); - } - my $keys = join(",", @fn); - $keys =~ s/ /_/g; - my $fn = $out . '/' . $keys; - my $name = $jname . '_' . join('_',@vals); #$tag . '_' . $keys; - - push( @fulldirs, "" . $fn ); - - - # filters - $filters{ join(',', @filt) } = 1; - - - #system "sh $fn/sh.post" if -e "$fn/sh.post";# && !(-e "$fn/.post"); - if (-e "$fn/.done") { - print "already done.\n"; - return; - } - system "rm -r $fn" if $clean && -d "$fn"; - system "mkdir $fn" unless -d "$fn"; - - my $e = './newsyn'; - #$e = './tcpsynobfs' if $h->{'fs'} eq 'obfs'; - my $c = "$e"; - $c .= " --mkfs" unless $h->{'no_mkfs'}; - $c .= " --$h->{'fs'}" if $h->{'fs'}; - $c .= " --syn until $h->{'until'}" if $h->{'until'}; - - $c .= " --syn writefile $h->{'writefile_mb'} $h->{'writefile_size'}" if $h->{'writefile'}; - $c .= " --syn rw $h->{'rw_mb'} $h->{'rw_size'}" if $h->{'rw'}; - $c .= " --syn readfile $h->{'readfile_mb'} $h->{'readfile_size'}" if $h->{'readfile'}; - $c .= " --syn makedirs $h->{'makedirs_dirs'} $h->{'makedirs_files'} $h->{'makedirs_depth'}" if $h->{'makedirs'}; - - if ($h->{'ebofs_freelist'}) { - system "cp freelist/ebofs.freelist.$h->{'ebofs_freelist'} ebofs.freelist"; - $c .= " --osd_age_time -1"; - } - - for my $k ('nummds', 'numclient', 'numosd', 'kill_after', - 'osd_maxthreads', 'osd_object_layout', 'osd_pg_layout','osd_pg_bits', - 'mds_bal_rep', 'mds_bal_interval', 'mds_bal_max','mds_decay_halflife', - 'mds_bal_hash_rd','mds_bal_hash_wr','mds_bal_unhash_rd','mds_bal_unhash_wr', - 'mds_cache_size','mds_log_max_len', - 'mds_local_osd', - 
'osd_age_time','osd_age', - 'osd_rep', - 'osd_pad_pg_log','ebofs_realloc', - 'osd_balance_reads', - 'tcp_multi_out', - 'client_cache_stat_ttl','client_cache_readdir_ttl', - 'client_oc', - 'fake_osdmap_updates', - 'bdev_el_bidir', 'ebofs_idle_commit_ms', 'ebofs_commit_ms', - 'ebofs_oc_size','ebofs_cc_size','ebofs_bc_size','ebofs_bc_max_dirty','ebofs_abp_max_alloc', - 'file_layout_ssize','file_layout_scount','file_layout_osize','file_layout_num_rep', - 'meta_dir_layout_ssize','meta_dir_layout_scount','meta_dir_layout_osize','meta_dir_layout_num_rep', - 'meta_log_layout_ssize','meta_log_layout_scount','meta_log_layout_osize','meta_log_layout_num_rep') { - $c .= " --$k $h->{$k}" if defined $h->{$k}; - } - - $c .= ' ' . $h->{'custom'} if $h->{'custom'}; - - $c .= " --log_name $relout/$keys"; - - my $post = "#!/bin/sh -script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/osd\\* > $fn/sum.osd -script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/mds? $fn/mds?? > $fn/sum.mds -script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/mds*.log > $fn/sum.mds.log -script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/clnode* > $fn/sum.cl -touch $fn/.post -"; - open(O,">$fn/sh.post"); - print O $post; - close O; - - my $killmin = 1 + int ($h->{'kill_after'} / 60); - - $c = "bash -c \"ulimit -c 0 ; $c\""; - #$c = "bash -c \"$c\""; - - my $srun = "srun --wait=600 --exclude=jobs/ltest.ignore -l -t $killmin -N $h->{'n'} -p ltest"; - my $mpiexec = "mpiexec -l -n $h->{'n'}"; - my $launch; - if ($use_srun) { - $launch = $srun; - } else { - $launch = $mpiexec; - } - - if ($sim->{'_psub'}) { - # template! 
- my $tp = `cat $sim->{'_psub'}`; - $tp =~ s/\$CWD/$cwd/g; - $tp =~ s/\$NAME/$name/g; - $tp =~ s/\$NUM/$h->{'n'}/g; - $tp =~ s/\$OUT/$fn\/o/g; - $tp =~ s/\$DONE/$fn\/.done/g; - $tp =~ s/\$CMD/$c/g; - open(O,">$out/$name"); - print O $tp; - close O; - print "\npsub $out/$name\n"; - return; - } else { - # run - my $cmd = "\n$launch $c > $fn/o && touch $fn/.done";# - #my $cmd = "\n$launch $c > $fn/o ; touch $fn/.done"; - print "$cmd $nobg\n"; - my $r = undef; - unless ($fake) { - if ($sim->{'_pre'}) { - print "pre: $launch $sim->{'_pre'}\n"; - system "$launch $sim->{'_pre'}"; - } - $r = system $cmd; - if ($sim->{'_post'}) { - print "post: $launch $sim->{'_post'}\n"; - system "$launch $sim->{'_post'}"; - } - if ($r) { - print "r = $r\n"; - #&reset; - } - system "sh $fn/sh.post"; - } - return $r; - } -} - - - -my @r = &iterate($sim); -my $n = scalar(@r); -my $c = 1; -my %r; -my $nfailed = 0; -for my $h (@r) { - my $d = `date`; - chomp($d); - $d =~ s/ P.T .*//; - print "# === $c/$n"; - print " ($nfailed failed)" if $nfailed; - print " $d: "; - my $r = &run($h); - - if (!(defined $r)) { - # already done - } else { - if ($r) { - $nfailed++; - } - print "sleep $h->{'sleep'}\n"; - sleep $h->{'sleep'}; - } - - $c++; -} -print "$nfailed failed\n"; - - -my @comb; -if ($comb) { - my $x = $comb->{'x'}; - my @vars = @{$comb->{'vars'}}; - - print "\n\n# post\n"; - for my $p (@fulldirs) { - print "sh $p/sh.post\n"; - } - - my @filters = sort keys %filters; - my $cmd = "script/comb.pl $x @vars - @fulldirs - @filters > $out/c"; - print "$cmd\n"; - open(O,">$out/comb"); - print O "$cmd\n"; - close O; - system $cmd; - - print "\n\n"; - - my $plot; - $plot .= "set data style linespoints;\n"; - my $s = 2; - for my $v (@vars) { - my $c = $s; - $s++; - my @p; - for my $f (@filters) { - my $t = $f; - if ($comb->{'maptitle'}) { - for my $a (keys %{$comb->{'maptitle'}}) { - my $b = $comb->{'maptitle'}->{$a}; - $t =~ s/$a/$b/; - } - } - push (@p, "\"$out/c\" u 1:$c t \"$t\"" ); - $c += 
scalar(@vars); - } - $plot .= "# $v\nplot " . join(", ", @p) . ";\n\n"; - } - print $plot; - open(O,">$out/plot"); - print O $plot; - close O; -} - diff --git a/branches/sage/crush/script/sum.pl b/branches/sage/crush/script/sum.pl deleted file mode 100755 index 92ef9a9b222a8..0000000000000 --- a/branches/sage/crush/script/sum.pl +++ /dev/null @@ -1,148 +0,0 @@ -#!/usr/bin/perl - -use strict; -my $starttime = 1; -my $endtime = -1; - -my $avgrows = 0; - -while ($ARGV[0] =~ /^-/) { - $_ = shift @ARGV; - if ($_ eq '-avg') { - $avgrows = 1; - } - elsif ($_ eq '-start') { - $starttime = shift @ARGV; - } - elsif ($_ eq '-end') { - $endtime = shift @ARGV; - } - else { - die "i don't understand arg $_"; - } -} -my @files = @ARGV; - -if (scalar(@files) == 1 && $files[0] =~ /\*/) { - my ($dir, $pat) = $files[0] =~ /^(.*)\/([^\/]+)$/; - @files = (); - $pat =~ s/\*//; -# print "dir $dir pat $pat\n"; - opendir(D,"$dir"); - for my $f (readdir(D)) { - # print "$f\n"; - next unless $f =~ /^$pat/; - push(@files, "$dir/$f"); - } - closedir(D); - -# print "files = @files\n"; -} - -my @data; -for my $f (@files) { - open(I,$f); - push( @data, ); - close I; -} - -my %sum; # time -> name -> val -my %col; # colnum -> name .. colnums start at 0 (time doesn't count) -my %min; -my %max; -my %avg; -my %tcount; -my $files; -for (@data) { - chomp; - my @r = split(/\s+/,$_); - my $r = shift @r; - - # column headings? 
- if ($r =~ /^\#/) { - my $num = 0; - while (my $name = shift @r) { - $col{$num} = $name; - $num++; - } - next; - } - - next unless int $r; - next if $r < $starttime; - next if $endtime > 0 && $r > $endtime; - - $tcount{$r}++; - $files = $tcount{$r} if $tcount{$r} > $files; - #print "$r: @r\n"; - my $i = 0; - while (@r) { - my $v = shift @r; - $sum{$r}->{$col{$i}} += $v; # if $v > 0; - - $min{$col{$i}} = $v - if ($min{$col{$i}} > $v || !(defined $min{$col{$i}})); - $max{$col{$i}} = $v - if ($max{$col{$i}} < $v); - - $avg{$col{$i}} += $v; - $i++; - } -} - -## dump -my @c = sort {$a <=> $b} keys %col; -# cols -print join("\t",'#', map { $col{$_} } @c) . "\n"; -my $n = 0; -for my $k (sort {$a <=> $b} keys %sum) { - if ($avgrows) { - print join("\t",$k, #map int, - map { $sum{$k}->{$col{$_}}/$tcount{$k} } @c ) . "\n"; - } else { - print join("\t",$k, map { $sum{$k}->{$col{$_}} } @c ) . "\n"; - } - $n++; -} - -my $rows = $n || 1; -#my $files = $tcount{$starttime}; -my %avgval; - -## devt -#warn "rows $rows, files $files\n"; -my %avgvalvart; # std dev of each col avg, over time -for my $k (keys %avg) { - my $av = $avgval{$k} = $avg{$k} / ($rows*$files); - - my $var = 0.0; - for my $t (sort {$a <=> $b} keys %sum) { - my $a = $sum{$t}->{$k} / $files; - $var += ($a - $av) * ($a - $av); - } - - $avgvalvart{$k} = $var / $rows; -} - - - - -print "\n"; -print join("\t",'#', map { $col{$_} } @c) . "\n"; -print join("\t", '#minval', map { $min{$col{$_}} } @c ) . "\n"; -print join("\t", '#maxval', map { $max{$col{$_}} } @c ) . "\n"; -print join("\t", '#rows', map { $rows } @c) . "\n"; -print join("\t", '#files', map { $files } @c) . "\n"; -print join("\t", '#sum', - map { $avg{$col{$_}} } @c ) . "\n"; -print join("\t", '#avgval', #map int, - map { $avgval{$col{$_}} } @c ) . "\n"; -# map { ($rows*$files) ? ($_ / ($rows*$files)):0 } map { $avg{$col{$_}} } @c ) . "\n"; - -print join("\t", '#avgvalvart', - map { $avgvalvart{$col{$_}} } @c ) . 
"\n"; -print join("\t", '#avgvaldevt', - map { sqrt($_) } map { $avgvalvart{$col{$_}} } @c ) . "\n"; - -print join("\t", '#avgsum', #map int, - map { $_ / $rows } map { $avg{$col{$_}} } @c ) . "\n"; diff --git a/branches/sage/crush/test/fakemds.cc b/branches/sage/crush/test/fakemds.cc deleted file mode 100644 index b75b62d58152c..0000000000000 --- a/branches/sage/crush/test/fakemds.cc +++ /dev/null @@ -1,104 +0,0 @@ - - -#include -#include -#include - -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "fakeclient/FakeClient.h" - -#include "mds/MDCluster.h" -#include "mds/MDCache.h" -#include "mds/MDStore.h" - -#include "msg/FakeMessenger.h" - -#include "messages/MPing.h" - -using namespace std; - -__uint64_t ino = 1; - - - -#include "config.h" -#define NUMMDS g_conf.num_mds -#define NUMOSD g_conf.num_osd -#define NUMCLIENT g_conf.num_fakeclient - -// this parses find output -int play(); - -int main(int oargc, char **oargv) { - cerr << "hi there" << endl; - - int argc; - char **argv; - parse_config_options(oargc, oargv, - argc, argv); - - MDCluster *mdc = new MDCluster(NUMMDS, NUMOSD); - - // local config settings - g_conf.num_client = g_conf.num_fakeclient; // to fool mds, hack gross - - // create osds - OSD *osd[NUMOSD]; - for (int i=0; iinit(); - } - - // create mds - MDS *mds[NUMMDS]; - for (int i=0; iinit(); - } - - - // create clients - FakeClient *client[NUMCLIENT]; - for (int i=0; iinit(); - } - - // mount clients - for (int i=0; imount(); - - // loop - fakemessenger_do_loop(); - - //mds[0]->shutdown_start(); - //fakemessenger_do_loop(); - - // - if (argc > 1 && - strcmp(argv[1], "nocheck") == 0) { - cerr << "---- nocheck" << endl; - } else { - cout << "---- check ----" << endl; - for (int i=0; imdcache->shutdown_pass(); - } - - // cleanup - cout << "cleanup" << endl; - for (int i=0; i - * Daniel Jönsson - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the Do What The Fuck You Want To - * Public 
License as published by Banlu Kemiyatorn. See - * http://sam.zoy.org/projects/COPYING.WTFPL for more details. - * - * Compilation example: - * gcc -shared -fPIC gprof-helper.c -o gprof-helper.so -lpthread -ldl - * - * Usage example: - * LD_PRELOAD=./gprof-helper.so your_program - */ - -#define _GNU_SOURCE -#include -#include -#include -#include -#include - -static void * wrapper_routine(void *); - -/* Original pthread function */ -static int (*pthread_create_orig)(pthread_t *__restrict, - __const pthread_attr_t *__restrict, - void *(*)(void *), - void *__restrict) = NULL; - -/* Library initialization function */ -void wooinit(void) __attribute__((constructor)); - -void wooinit(void) -{ - pthread_create_orig = dlsym(RTLD_NEXT, "pthread_create"); - fprintf(stderr, "pthreads: using profiling hooks for gprof\n"); - if(pthread_create_orig == NULL) - { - char *error = dlerror(); - if(error == NULL) - { - error = "pthread_create is NULL"; - } - fprintf(stderr, "%s\n", error); - exit(EXIT_FAILURE); - } -} - -/* Our data structure passed to the wrapper */ -typedef struct wrapper_s -{ - void * (*start_routine)(void *); - void * arg; - - pthread_mutex_t lock; - pthread_cond_t wait; - - struct itimerval itimer; - -} wrapper_t; - -/* The wrapper function in charge for setting the itimer value */ -static void * wrapper_routine(void * data) -{ - /* Put user data in thread-local variables */ - void * (*start_routine)(void *) = ((wrapper_t*)data)->start_routine; - void * arg = ((wrapper_t*)data)->arg; - - /* Set the profile timer value */ - setitimer(ITIMER_PROF, &((wrapper_t*)data)->itimer, NULL); - - /* Tell the calling thread that we don't need its data anymore */ - pthread_mutex_lock(&((wrapper_t*)data)->lock); - pthread_cond_signal(&((wrapper_t*)data)->wait); - pthread_mutex_unlock(&((wrapper_t*)data)->lock); - - /* Call the real function */ - return start_routine(arg); -} - -/* Our wrapper function for the real pthread_create() */ -int pthread_create(pthread_t *__restrict 
thread, - __const pthread_attr_t *__restrict attr, - void * (*start_routine)(void *), - void *__restrict arg) -{ - wrapper_t wrapper_data; - int i_return; - - /* Initialize the wrapper structure */ - wrapper_data.start_routine = start_routine; - wrapper_data.arg = arg; - getitimer(ITIMER_PROF, &wrapper_data.itimer); - pthread_cond_init(&wrapper_data.wait, NULL); - pthread_mutex_init(&wrapper_data.lock, NULL); - pthread_mutex_lock(&wrapper_data.lock); - - /* The real pthread_create call */ - i_return = pthread_create_orig(thread, - attr, - &wrapper_routine, - &wrapper_data); - - /* If the thread was successfully spawned, wait for the data - * to be released */ - if(i_return == 0) - { - pthread_cond_wait(&wrapper_data.wait, &wrapper_data.lock); - } - - pthread_mutex_unlock(&wrapper_data.lock); - pthread_mutex_destroy(&wrapper_data.lock); - pthread_cond_destroy(&wrapper_data.wait); - - return i_return; -} - diff --git a/branches/sage/crush/test/makedirs.cc b/branches/sage/crush/test/makedirs.cc deleted file mode 100644 index 8fd74d996ef9f..0000000000000 --- a/branches/sage/crush/test/makedirs.cc +++ /dev/null @@ -1,38 +0,0 @@ -#include -#include -using namespace std; - -int make_dirs(const char *basedir, int dirs, int files, int depth) -{ - //if (time_to_stop()) return 0; - - // make sure base dir exists - int r = mkdir(basedir, 0755); - if (r != 0) { - cout << "can't make base dir? 
" << basedir << endl; - return -1; - } - - // children - char d[500]; - cout << "make_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << endl; - for (int i=0; i -#include -#include -using namespace std; - -#include "mds/MDCluster.h" -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "fakeclient/FakeClient.h" - -#include "mds/MDCache.h" -#include "mds/MDStore.h" - -#include "msg/MPIMessenger.h" -//#include "msg/CheesySerializer.h" - -#include "messages/MPing.h" - - -__uint64_t ino = 1; - - - -#include "config.h" -#define NUMMDS g_conf.num_mds -#define NUMOSD g_conf.num_osd -#define NUMCLIENT g_conf.num_client - -// this parses find output -int play(); - -int main(int argc, char **argv) { - cout << "mpitest starting" << endl; - - int myrank = mpimessenger_init(argc, argv); - int world = mpimessenger_world(); - - - - MDCluster *mdc = new MDCluster(NUMMDS, NUMOSD); - - // create osds - OSD *osd[NUMOSD]; - for (int i=0; iinit(); - } - - // create mds - MDS *mds[NUMMDS]; - for (int i=0; iinit(); - } - - // create clients - FakeClient *client[NUMCLIENT]; - for (int i=0; iset_dispatcher(serializer); - - client[i] = new FakeClient(mdc, i, real, g_conf.fakeclient_requests); - client[i]->init(); - } - - // seed initial requests - for (int i=0; iissue_request(); - } - - mpimessenger_start(); // start message loop - mpimessenger_wait(); // wait for thread to finish - mpimessenger_shutdown(); // shutdown MPI - - // - /* - cout << "---- check ----" << endl; - for (int i=0; imdcache->shutdown_pass(); - } - */ - - // cleanup - //cout << "cleanup" << endl; - for (int i=0; i -#include "mpi.h" - -#include "messages/MClientRequest.h" -#include "msg/MTMessenger.h" -#include "include/error.h" - -#define SARG_SIZE 64 -#define SERVER_RANK 0 -#define NTHREADS 11 // number of threads per rank -#define NMESSAGES 31 // number of messages per thread - -static void server_loop(MTMessenger &msgr, int world_size) -{ - // we expect this many messages from 
clients, then we quit - // (world_size-1 since server is one of the processes). - int totmsg = NTHREADS * NMESSAGES * (world_size - 1); - int nmsg = 0; - - char buf[SARG_SIZE]; - - while(nmsg < totmsg) { - MClientRequest *req = (MClientRequest*)msgr.recvreq(); - ASSERT(req->get_type() == MSG_CLIENT_REQUEST); - - //cout << "Server acknowledging " << req->get_sarg() << endl; - - sprintf(buf, "%s reply", req->get_sarg().c_str()); - MClientRequest resp(0, 0); - resp.set_sarg(buf); - msgr.sendresp(req, &resp); - - delete req; - nmsg++; - } - - cout << "Server successful" << endl; -} - -// arguments for client thread start function (see pthread_create) -struct client_arg -{ - MTMessenger *msgr; - int rank; - int thread; -}; - -static void *client_session(void *_carg) -{ - client_arg *carg = (client_arg *)_carg; - - char buf[SARG_SIZE]; - - // repeat some number (arbitrary really) of rounds - for (int i = 0; i < NMESSAGES; i++) { - - // send the message, receive the reply and check reply is as - // expected - - MClientRequest request(0, 0); - sprintf(buf, "r%d:t%d:m%d", carg->rank, carg->thread, i); - request.set_sarg(buf); - - //cout << "Client sending " << request.get_sarg() << endl; - - MClientRequest *resp = - (MClientRequest*)carg->msgr->sendrecv(&request, SERVER_RANK); - - ASSERT(resp->get_type() == MSG_CLIENT_REQUEST); - sprintf(buf, "r%d:t%d:m%d reply", carg->rank, carg->thread, i); - ASSERT(strcmp(buf, resp->get_sarg().c_str()) == 0); - - //cout << "Client verified " << resp->get_sarg() << endl; - - delete resp; - } - - cout << "Client (" << carg->rank << "," << carg->thread - << ") successful" << endl; - - delete carg; - return NULL; -} - -static void launch_clients(MTMessenger &msgr, int rank) -{ - pthread_t tid[NTHREADS]; - - // launch some number (arbitrary really) of threads - for (int i = 0; i < NTHREADS; i++) { - - client_arg *carg = (client_arg*)malloc(sizeof(client_arg)); - ASSERT(carg); - carg->msgr = &msgr; - carg->rank = rank; - carg->thread = i; - - 
if (pthread_create(&tid[i], NULL, client_session, carg) < 0) - SYSERROR(); - } - - // we must wait for all the threads to exit before returning, - // otherwise we shutdown MPI before while the threads are - // chatting. - for (int i = 0; i < NTHREADS; i++) { - void *retval; - - if (pthread_join(tid[i], &retval) < 0) - SYSERROR(); - } -} - -int main(int argc, char **argv) -{ - MTMessenger msgr(argc, argv); - - int rank; - ASSERT(MPI_Comm_rank(MPI_COMM_WORLD, &rank) == MPI_SUCCESS); - int world_size; - ASSERT(MPI_Comm_size(MPI_COMM_WORLD, &world_size) == MPI_SUCCESS); - - if (rank == SERVER_RANK) - server_loop(msgr, world_size); - else - launch_clients(msgr, rank); - - return 0; -} diff --git a/branches/sage/crush/test/rushconfig b/branches/sage/crush/test/rushconfig deleted file mode 100644 index 40d82702ea0a5..0000000000000 --- a/branches/sage/crush/test/rushconfig +++ /dev/null @@ -1,7 +0,0 @@ -6 -8 10.0 -4 20.0 -7 30.0 -9 10.0 -8 15.0 -5 11.0 diff --git a/branches/sage/crush/test/rushtest.cc b/branches/sage/crush/test/rushtest.cc deleted file mode 100644 index ecff83523e0c6..0000000000000 --- a/branches/sage/crush/test/rushtest.cc +++ /dev/null @@ -1,49 +0,0 @@ -// -// $Id$ -// - -#include -#include -#include "../osd/rush.h" - -main (int argc, char *argv[]) -{ - Rush rush; - char buf[200]; - int i, j, k, numClusters; - int numKeys = 5; - int numReplicas = 4; - int curSize; - double curWeight; - int servers[1000]; - - if (argc > 1) { - numKeys = atoi (argv[1]); - } - if (argc > 2) { - numReplicas = atoi (argv[2]); - } - - fgets (buf, sizeof (buf) - 2, stdin); - sscanf (buf, "%d", &numClusters); - for (i = 0; i < numClusters; i++) { - fgets (buf, sizeof (buf) - 2, stdin); - sscanf (buf, "%d %lf", &curSize, &curWeight); - rush.AddCluster (curSize, curWeight); - if (rush.Servers () < numReplicas) { - fprintf (stderr, "ERROR: must have at least %d disks in the system!\n", - rush.Clusters ()); - exit (-1); - } - for (j = 0; j < numKeys; j++) { - rush.GetServersByKey 
(j, numReplicas, servers); -#if 0 - printf ("%-3d %-6d ", i, j); - for (k = 0; k < numReplicas; k++) { - printf ("%-5d ", servers[k]); - } - putchar ('\n'); -#endif - } - } -} diff --git a/branches/sage/crush/test/rushtest.cc~ b/branches/sage/crush/test/rushtest.cc~ deleted file mode 100644 index 0b9512ccd0c3d..0000000000000 --- a/branches/sage/crush/test/rushtest.cc~ +++ /dev/null @@ -1,49 +0,0 @@ -// -// $Id$ -// - -#include -#include -#include "rush.h" - -main (int argc, char *argv[]) -{ - Rush rush; - char buf[200]; - int i, j, k, numClusters; - int numKeys = 5; - int numReplicas = 4; - int curSize; - double curWeight; - int servers[1000]; - - if (argc > 1) { - numKeys = atoi (argv[1]); - } - if (argc > 2) { - numReplicas = atoi (argv[2]); - } - - fgets (buf, sizeof (buf) - 2, stdin); - sscanf (buf, "%d", &numClusters); - for (i = 0; i < numClusters; i++) { - fgets (buf, sizeof (buf) - 2, stdin); - sscanf (buf, "%d %lf", &curSize, &curWeight); - rush.AddCluster (curSize, curWeight); - if (rush.Servers () < numReplicas) { - fprintf (stderr, "ERROR: must have at least %d disks in the system!\n", - rush.Clusters ()); - exit (-1); - } - for (j = 0; j < numKeys; j++) { - rush.GetServersByKey (j, numReplicas, servers); -#if 0 - printf ("%-3d %-6d ", i, j); - for (k = 0; k < numReplicas; k++) { - printf ("%-5d ", servers[k]); - } - putchar ('\n'); -#endif - } - } -} diff --git a/branches/sage/crush/test/test_seek_read.c b/branches/sage/crush/test/test_seek_read.c deleted file mode 100644 index 988ff1dcec88d..0000000000000 --- a/branches/sage/crush/test/test_seek_read.c +++ /dev/null @@ -1,53 +0,0 @@ -#include "include/types.h" -#include "common/Clock.h" - -#include -#include -#include -#include -#include -#include - -int main(int argc, char **argv) -{ - char *fn = argv[1]; - uint64_t numblocks = atoll(argv[2]) / 4096; - int count = 400; - - cout << "fn " << fn << endl; - cout << "numblocks " << numblocks << endl; - - int blocks = 1; - while (blocks <= 1024) { - int fd 
= ::open(fn, O_RDWR|O_DIRECT);//|O_SYNC|O_DIRECT); - if (fd < 0) return 1; - //cout << "fd is " << fd << endl; - - void *buf; - ::posix_memalign(&buf, 4096, 4096*blocks); - - int s = blocks*4096; - - utime_t start = g_clock.now(); - for (int i=0; i -#include -using namespace std; - - -ostream& operator<<(ostream& out, vector& v) -{ - out << "["; - for (int i=0; i disks; - for (int i=0; i<20; i++) - disks.push_back(i); - - - /* - UniformBucket ub(1, 1, 0, 10, disks); - ub.make_primes(h); - cout << "primes are " << ub.primes << endl; - */ - - MixedBucket mb(2, 1); - for (int i=0;i<20;i++) - mb.add_item(i, 10); - - /* - MixedBucket b(3, 1); - b.add_item(1, ub.get_weight()); - b.add_item(2, mb.get_weight()); - */ - MixedBucket b= mb; - - vector ocount(disks.size()); - int numrep = 3; - - vector v(numrep); - for (int x=1; x<1000000; x++) { - //cout << H(x) << "\t" << h(x) << endl; - for (int i=0; i -using namespace std; - -#include "include/bufferlist.h" - - -int main() -{ - - bufferptr p1 = new buffer("123456",6); - bufferptr p2 = p1; - - cout << "it is '" << p1.c_str() << "'" << endl; - - bufferptr p3 = new buffer("abcdef",6); - - cout << "p3 is " << p3 << endl; - - bufferlist bl; - bl.push_back(p2); - bl.push_back(p1); - bl.push_back(p3); - - cout << "bl is " << bl << endl; - - cout << "len is " << bl.length() << endl; - - bufferlist took; - bl.splice(10,4,&took); - - cout << "took out " << took << "leftover is " << bl << endl; - //cout << "len is " << bl.length() << endl; - - bufferlist bl2; - bl2.substr_of(bl, 3, 5); - cout << "bl2 is " << bl2 << endl; - - -} diff --git a/branches/sage/crush/test/testcrush.cc b/branches/sage/crush/test/testcrush.cc deleted file mode 100644 index bd432b23ee95c..0000000000000 --- a/branches/sage/crush/test/testcrush.cc +++ /dev/null @@ -1,266 +0,0 @@ - - -#include "../crush/crush.h" -using namespace crush; - -#include - -#include -#include -using namespace std; - -/* -ostream& operator<<(ostream& out, vector& v) -{ - out << "["; - 
for (int i=0; i& d) -{ - d.clear(); - while (n) { - d.push_back(no); - no++; - n--; - } -} - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks, int& nbuckets) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - MixedBucket *b = new MixedBucket(nbuckets--, h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks, int& nbuckets) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks, nbuckets); - return b->get_id(); -} - - - -int main() -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - vector disks; - int root = -1; - int nbuckets = -1; - int ndisks = 0; - - if (0) { - make_disks(12, ndisks, disks); - UniformBucket ub1(-1, 1, 0, 30, disks); - ub1.make_primes(h); - cout << "ub1 primes are " << ub1.primes << endl; - c.add_bucket(&ub1); - - make_disks(17, ndisks, disks); - UniformBucket ub2(-2, 1, 0, 30, disks); - ub2.make_primes(h); - cout << "ub2 primes are " << ub2.primes << endl; - c.add_bucket(&ub2); - - make_disks(4, ndisks, disks); - UniformBucket ub3(-3, 1, 0, 30, disks); - ub3.make_primes(h); - cout << "ub3 primes are " << ub3.primes << endl; - c.add_bucket(&ub3); - - make_disks(20, ndisks, disks); - MixedBucket umb1(-4, 1); - for (int i=0; i<20; i++) - umb1.add_item(disks[i], 30); - c.add_bucket(&umb1); - - MixedBucket b(-100, 1); - //b.add_item(-2, ub1.get_weight()); - b.add_item(-4, umb1.get_weight()); - //b.add_item(-2, ub2.get_weight()); - //b.add_item(-3, ub3.get_weight()); - } - - if (0) { - int bucket = -1; - MixedBucket *root = new MixedBucket(bucket--, 2); - - for (int i=0; i<5; i++) { - MixedBucket *b = new MixedBucket(bucket--, 1); - - int n = 5; - - if (1) { - // add n buckets of n 
disks - for (int j=0; jadd_item(disks[k], 10); - - //b->add_item(disks[j], 10); - c.add_bucket(d); - b->add_item(d->get_id(), d->get_weight()); - } - - c.add_bucket(b); - root->add_item(b->get_id(), b->get_weight()); - } else { - // add n*n disks - make_disks(n*n, ndisks, disks); - for (int k=0; kadd_item(disks[k], 10); - - c.add_bucket(b); - root->add_item(b->get_id(), b->get_weight()); - } - } - - c.add_bucket(root); - } - - - if (1) { - vector wid; - for (int d=0; d<5; d++) - wid.push_back(10); - root = make_hierarchy(c, wid, ndisks, nbuckets); - } - - - - // rule - int numrep = 1; - - Rule rule; - if (0) { - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, -100)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - } - if (1) { - /* - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, -4)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 2, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - */ - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 1, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - } - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - cout << ndisks << " disks, " << 1-nbuckets << " buckets" << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - - - int place = 1000000; - int times = place / numpg; - if (!times) times = 1; - - cout << "looping " << times << " times" << endl; - - float tvar = 0; - int tvarnum = 0; - - int x = 0; - for (int t=0; t v(numrep); - - for (int z=0; z -using namespace std; - -int print(string s) { - filepath fp = s; - cout << "s = " << s << " filepath = " << fp << endl; - cout << " depth " << fp.depth() << endl; - for (int i=0; i -#include -#include -using namespace std; - -#include "config.h" -#include "messages/MPing.h" -#include "common/Mutex.h" - -#include "msg/MPIMessenger.h" - -class 
Pinger : public Dispatcher { -public: - Messenger *messenger; - Pinger(Messenger *m) : messenger(m) { - m->set_dispatcher(this); - } - void dispatch(Message *m) { - //dout(1) << "got incoming " << m << endl; - delete m; - - } -}; - -int main(int argc, char **argv) { - int num = 1000; - - int myrank = mpimessenger_init(argc, argv); - int world = mpimessenger_world(); - - Pinger *p = new Pinger( new MPIMessenger(myrank) ); - - mpimessenger_start(); - - //while (1) { - for (int i=0; i<10000; i++) { - - // ping random nodes - int d = rand() % world; - if (d != myrank) { - //cout << "sending " << i << " to " << d << endl; - p->messenger->send_message(new MPing(), d); - } - - } - - - //cout << "shutting down" << endl; - //p->messenger->shutdown(); - - mpimessenger_wait(); - mpimessenger_shutdown(); // shutdown MPI -} diff --git a/branches/sage/crush/test/testnewbuffers.cc b/branches/sage/crush/test/testnewbuffers.cc deleted file mode 100644 index 0fea7571a4572..0000000000000 --- a/branches/sage/crush/test/testnewbuffers.cc +++ /dev/null @@ -1,91 +0,0 @@ - -#include -#include -using namespace std; - - -#include "include/newbuffer.h" -//#include "include/bufferlist.h" - -#include "common/Thread.h" - - - class Th : public Thread { - public: - bufferlist bl; - Th(bufferlist& o) : bl(o) { } - - void *entry() { - //cout << "start" << endl; - // thrash it a bit. 
- for (int n=0; n<10000; n++) { - bufferlist bl2; - unsigned off = rand() % (bl.length() -1); - unsigned len = 1 + rand() % (bl.length() - off - 1); - bl2.substr_of(bl, off, len); - bufferlist bl3; - bl3.append(bl); - bl3.append(bl2); - //cout << bl3 << endl; - bl2.clear(); - bl3.clear(); - } - //cout << "end" << endl; - } - }; - -int main() -{ - - bufferptr p1 = buffer::copy("123456",7); - //bufferptr p1 = new buffer("123456",7); - bufferptr p2 = p1; - - cout << "p1 is '" << p1.c_str() << "'" << " " << p1 << endl; - cout << "p2 is '" << p2.c_str() << "'" << " " << p2 << endl; - - bufferptr p3 = buffer::copy("abcdef",7); - //bufferptr p3 = new buffer("abcdef",7); - - cout << "p3 is " << p3.c_str() << " " << p3 << endl; - - bufferlist bl; - bl.push_back(p2); - bl.push_back(p1); - bl.push_back(p3); - - cout << "bl is " << bl << endl; - - bufferlist took; - bl.splice(10,4,&took); - - cout << "took out " << took << ", leftover is " << bl << endl; - //cout << "len is " << bl.length() << endl; - - bufferlist bl2; - bl2.substr_of(bl, 3, 5); - cout << "bl2 is " << bl2 << endl; - - - cout << "bl before " << bl << endl; - - list ls; - for (int t=0; t<40; t++) { - Th *t = new Th(bl); - cout << "create" << endl; - t->create(); - ls.push_back(t); - } - - bl.clear(); - - while (!ls.empty()) { - cout << "join" << endl; - ls.front()->join(); - delete ls.front(); - ls.pop_front(); - } - - cout << "bl after " << bl << endl; - -} diff --git a/branches/sage/crush/test/testos.cc b/branches/sage/crush/test/testos.cc deleted file mode 100644 index 24c81590d899c..0000000000000 --- a/branches/sage/crush/test/testos.cc +++ /dev/null @@ -1,343 +0,0 @@ -/* testos.cc -- simple ObjectStore test harness. - Copyright (C) 2007 Casey Marshall - -Ceph - scalable distributed file system - -This is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License version 2.1, as published by the Free Software -Foundation. See file COPYING. 
*/ - - -#include "osd/ObjectStore.h" -#include "ebofs/Ebofs.h" -#include "osbdb/OSBDB.h" -#include "include/buffer.h" - -#include -#include -#include - -#include -#include - -using namespace std; - -static inline unsigned long long -to_usec (struct timeval &time) -{ - return (((unsigned long long) time.tv_sec * 1000000) - + ((unsigned long long) time.tv_usec)); -} - -static inline unsigned long long -to_msec (struct timeval &time) -{ - return (((unsigned long long) time.tv_sec * 1000) - + ((unsigned long long) time.tv_usec / 1000)); -} - -int main (int argc, char **argv) -{ - vector args; - char *osd_name = "ebofs"; - unsigned object_size = 1024; - unsigned object_count = 1024; - unsigned write_iter = 64; - unsigned random_seed = ::time(NULL); - char *device = "/tmp/testos"; - char *mountcmd = "mount /tmp/testos"; - char *umountcmd = "umount /tmp/testos"; - - bool ebofs_raw_device = false; - bool inhibit_remount = (getenv("TESTOS_INHIBIT_REMOUNT") != NULL); - - if (argc > 1 - && (strcmp (argv[1], "-h") == 0 - || strcmp (argv[1], "-help") == 0 - || strcmp (argv[1], "--help") == 0)) - { - cout << "usage: " << argv[0] << " [store [object-size [object-count [iterations [seed]]]]]" << endl; - cout << endl; - cout << "Where the arguments are:" << endl << endl; - cout << " store -- store type; default \"ebofs\"" << endl; - cout << " object-size -- size of objects; default 1024" << endl; - cout << " object-count -- number of objects to write; default 1024" - << endl; - cout << " iterations -- write the objects that many times; default 5" - << endl; - cout << " seed -- random seed; default current time" << endl; - exit (0); - } - - argv_to_vec (argc, argv, args); - for (vector::iterator it = args.begin(); it != args.end(); - it++) - cout << *it << " "; - cout << endl; - parse_config_options (args); - for (vector::iterator it = args.begin(); it != args.end(); - it++) - cout << *it << " "; - cout << endl; - - argc = args.size(); - if (argc > 0) - osd_name = args[0]; - if 
(argc > 1) - object_size = (unsigned) atol (args[1]); - if (argc > 2) - object_count = (unsigned) atol (args[2]); - if (argc > 3) - write_iter = (unsigned) atol (args[3]); - if (argc > 4) - random_seed = (unsigned) atol (args[4]); - - // algin object size to 'long' - object_size = ((object_size + (sizeof (long) - 1)) / sizeof (long)) * sizeof (long); - - char *osd_file = new char[32]; - strcpy (osd_file, "/tmp/testos/testos.XXXXXX"); - mktemp (osd_file); - - if (strcasecmp (osd_name, "ebofs") == 0) - { - char *dev_env = getenv ("TESTOS_EBOFS_DEV"); - if (dev_env != NULL) - { - // Assume it is a true device. - strncpy (osd_file, dev_env, 32); - inhibit_remount = true; - ebofs_raw_device = true; - } - } - - if (!inhibit_remount) - { - if (system (mountcmd) != 0) - { - cerr << "mount failed" << endl; - exit (1); - } - } - - ObjectStore *os = NULL; - if (strcasecmp (osd_name, "ebofs") == 0) - { - if (!ebofs_raw_device) - { - FILE *f = fopen (osd_file, "w"); - if (f == NULL) - { - cerr << "failed to open " << osd_file << ": " << strerror (errno) - << endl; - exit (1); - } - // 1G file. 
- fseek (f, 1024 * 1024 * 1024, SEEK_SET); - fputc ('\0', f); - fclose (f); - } - os = new Ebofs (osd_file); - } - else if (strcasecmp (osd_name, "osbdb") == 0) - { - os = new OSBDB (osd_file); - } - else if (strcasecmp (osd_name, "osbdb-btree") == 0) - { - g_conf.bdbstore_btree = true; - os = new OSBDB (osd_file); - } - else - { - cerr << "I don't know about object store \"" << osd_name << "\"" - << endl; - exit (1); - } - - cout << "Writing " << object_count << " objects of size " - << object_size << " to " << osd_name << endl; - - char *val = (char *) malloc (object_size); - char *val2 = (char *) malloc (object_size); - auto_ptr valptr (val); - auto_ptr valptr2(val2); - if (getenv ("TESTOS_UNALIGNED") != NULL) - { - val = val + 1; - val2 = val2 + 1; - } - - for (unsigned i = 0; i < object_size; i++) - { - val[i] = (char) i; - val2[i] = (char) i; - } - object_t *oids = new object_t[object_count]; - - utime_t writes[write_iter]; - utime_t total_write; - utime_t reads[write_iter]; - utime_t total_read; - for (unsigned i = 0; i < write_iter; i++) - { - cerr << "Iteration " << i << endl; - - int ret = os->mkfs(); - if (ret != 0) - { - cerr << "mkfs(" << osd_file << "): " << strerror (-ret) << endl; - exit (1); - } - ret = os->mount(); - if (ret != 0) - { - cerr << "mount(): " << strerror (-ret) << endl; - exit (1); - } - - srandom (random_seed + i); - - for (unsigned j = 0; j < object_count; j++) - { - oids[j].ino = (uint64_t) random() << 32 | random(); - oids[j].bno = random(); - } - - utime_t begin = g_clock.now(); - for (unsigned o = 0; o < object_count; o++) - { - bufferptr bp (val, object_size); - bufferlist bl; - bl.push_back (bp); - int ret; - if ((ret = os->write (oids[o], 0L, object_size, bl, NULL)) < 0) - cerr << "write " << oids[o] << " failed: " - << strerror (-ret) << endl; - } - os->sync(); - - utime_t end = g_clock.now() - begin; - - cerr << "Write finished in " << end << endl; - total_write += end; - writes[i] = end; - - os->umount(); - sync(); - - if 
(!inhibit_remount) - { - if (system (umountcmd) != 0) - { - cerr << "umount failed" << endl; - exit (1); - } - - if (system (mountcmd) != 0) - { - cerr << "mount(2) failed" << endl; - exit (1); - } - } - - os->mount(); - - // Shuffle the OIDs. - for (int j = 0; j < object_count; j++) - { - int x = random() % object_count; - if (x < 0) - x = -x; - object_t o = oids[j]; - oids[j] = oids[x]; - oids[x] = o; - } - - begin = g_clock.now(); - for (unsigned o = 0; o < object_count; o++) - { - bufferptr bp (val2, object_size); - bufferlist bl; - bl.push_back (bp); - - if (os->read (oids[o], 0L, object_size, bl) < 0) - { - cerr << "object " << oids[o] << " not found!" << endl; - } - } - end = g_clock.now() - begin; - - cerr << "Read finished in " << end << endl; - total_read += end; - reads[i] = end; - - os->umount(); - sync(); - - if (!inhibit_remount) - { - if (system (umountcmd) != 0) - { - cerr << "umount(2) failed" << endl; - exit (1); - } - - if (system (mountcmd) != 0) - { - cerr << "mount(3) failed" << endl; - exit (1); - } - } - } - - cerr << "Finished in " << (total_write + total_read) << endl; - - double write_mean = ((double) total_write) / ((double) write_iter); - double write_sd = 0.0; - for (unsigned i = 0; i < write_iter; i++) - { - double x = ((double) writes[i]) - write_mean; - write_sd += x * x; - } - write_sd = sqrt (write_sd / ((double) write_iter)); - - double read_mean = ((double) total_read) / ((double) write_iter); - double read_sd = 0.0; - for (unsigned i = 0; i < write_iter; i++) - { - double x = ((double) reads[i]) - read_mean; - write_sd += x * x; - } - read_sd = sqrt (read_sd / ((double) write_iter)); - - cout << "TESTOS: write " << osd_name << ":" << object_size << ":" - << object_count << ":" << write_iter << ":" << random_seed - << " -- " << write_mean << " " << write_sd << endl; - - cout << "TESTOS: write.raw -- "; - for (int i = 0; i < write_iter; i++) - cout << ((double) writes[i]) << " "; - cout << endl; - - cout << "TESTOS: read " << 
osd_name << ":" << object_size << ":" - << object_count << ":" << write_iter << ":" << random_seed - << " -- " << read_mean << " " << read_sd << endl; - - cout << "TESTOS: read.raw -- "; - for (int i = 0; i < write_iter; i++) - cout << ((double) reads[i]) << " "; - cout << endl; - - unlink (osd_file); - if (!inhibit_remount) - { - if (system (umountcmd) != 0) - { - cerr << "umount(3) failed" << endl; - exit (1); - } - } - exit (0); -} diff --git a/branches/sage/crush/test/testosbdb.cc b/branches/sage/crush/test/testosbdb.cc deleted file mode 100644 index 19268e7587531..0000000000000 --- a/branches/sage/crush/test/testosbdb.cc +++ /dev/null @@ -1,347 +0,0 @@ -/* testosbdb.cc -- test OSBDB. - Copyright (C) 2007 Casey Marshall */ - - -#include -#include "osbdb/OSBDB.h" - -using namespace std; - -int -main (int argc, char **argv) -{ - vector args; - argv_to_vec (argc, argv, args); - parse_config_options (args); - - g_conf.debug_bdbstore = 10; - //g_conf.bdbstore_btree = true; - char dbfile[256]; - strncpy (dbfile, "/tmp/testosbdb/db.XXXXXX", 256); - mktemp (dbfile); - OSBDB *os = new OSBDB(dbfile); - auto_ptr osPtr (os); - os->mkfs(); - os->mount(); - - // Put an object. - object_t oid (0xDEADBEEF00000000ULL, 0xFEEDFACE); - - cout << "sizeof oid_t is " << sizeof (oid_t) << endl; - cout << "offsetof oid_t.id " << offsetof (oid_t, id) << endl; - - cout << sizeof (object_t) << endl; - cout << sizeof (oid.ino) << endl; - cout << sizeof (oid.bno) << endl; - cout << sizeof (oid.rev) << endl; - - // Shouldn't be there. - if (os->exists (oid)) - { - cout << "FAIL: oid shouldn't be there " << oid << endl; - } - - // Write an object. - char *x = (char *) malloc (1024); - memset(x, 0xaa, 1024); - bufferptr bp (x, 1024); - bufferlist bl; - bl.push_back (bp); - - if (os->write (oid, 0L, 1024, bl, NULL) != 1024) - { - cout << "FAIL: writing object" << endl; - } - - os->sync(); - - // Should be there. 
- if (!os->exists (oid)) - { - cout << "FAIL: oid should be there: " << oid << endl; - } - - memset(x, 0, 1024); - if (os->read (oid, 0, 1024, bl) != 1024) - { - cout << "FAIL: reading object" << endl; - } - - for (int i = 0; i < 1024; i++) - { - if ((x[i] & 0xFF) != 0xaa) - { - cout << "FAIL: data read out is different" << endl; - break; - } - } - - // Set some attributes - if (os->setattr (oid, "alpha", "value", strlen ("value")) != 0) - { - cout << "FAIL: set attribute" << endl; - } - if (os->setattr (oid, "beta", "value", strlen ("value")) != 0) - { - cout << "FAIL: set attribute" << endl; - } - if (os->setattr (oid, "gamma", "value", strlen ("value")) != 0) - { - cout << "FAIL: set attribute" << endl; - } - if (os->setattr (oid, "fred", "value", strlen ("value")) != 0) - { - cout << "FAIL: set attribute" << endl; - } - - char *attrs = (char *) malloc (1024); - if (os->listattr (oid, attrs, 1024) != 0) - { - cout << "FAIL: listing attributes" << endl; - } - else - { - char *p = attrs; - if (strcmp (p, "alpha") != 0) - { - cout << "FAIL: should be \"alpha:\" \"" << p << "\"" << endl; - } - p = p + strlen (p) + 1; - if (strcmp (p, "beta") != 0) - { - cout << "FAIL: should be \"beta:\" \"" << p << "\"" << endl; - } - p = p + strlen (p) + 1; - if (strcmp (p, "fred") != 0) - { - cout << "FAIL: should be \"fred:\" \"" << p << "\"" << endl; - } - p = p + strlen (p) + 1; - if (strcmp (p, "gamma") != 0) - { - cout << "FAIL: should be \"gamma:\" \"" << p << "\"" << endl; - } - } - - char attrvalue[256]; - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "alpha", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr alpha" << endl; - } - else if (strncmp ("value", attrvalue, strlen("value")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "fred", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr fred" << endl; - } - else if (strncmp ("value", 
attrvalue, strlen("value")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "beta", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr beta" << endl; - } - else if (strncmp ("value", attrvalue, strlen("value")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "gamma", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr gamma" << endl; - } - else if (strncmp ("value", attrvalue, strlen("value")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - - if (os->setattr (oid, "alpha", "different", strlen("different")) != 0) - cout << "FAIL: setattr overwrite" << endl; - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "alpha", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr alpha" << endl; - } - else if (strncmp ("different", attrvalue, strlen("different")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - - if (os->rmattr (oid, "alpha") != 0) - { - cout << "FAIL: rmattr alpha" << endl; - } - if (os->rmattr (oid, "fred") != 0) - { - cout << "FAIL: rmattr fred" << endl; - } - if (os->rmattr (oid, "beta") != 0) - { - cout << "FAIL: rmattr beta" << endl; - } - if (os->rmattr (oid, "gamma") != 0) - { - cout << "FAIL: rmattr gamma" << endl; - } - - coll_t cid = 0xCAFEBABE; - if (os->create_collection (cid) != 0) - { - cout << "FAIL: create_collection" << endl; - } - if (os->create_collection (cid + 10) != 0) - { - cout << "FAIL: create_collection" << endl; - } - if (os->create_collection (cid + 5) != 0) - { - cout << "FAIL: create_collection" << endl; - } - if (os->create_collection (42) != 0) - { - cout << "FAIL: create_collection" << endl; - } - - if (os->collection_add (cid, oid) != 0) - { - cout << "FAIL: collection_add" << endl; - } - - list ls; - if (os->list_collections (ls) < 0) - { - cout << 
"FAIL: list_collections" << endl; - } - cout << "collections: "; - for (list::iterator it = ls.begin(); it != ls.end(); it++) - { - cout << *it << ", "; - } - cout << endl; - - if (os->destroy_collection (0xCAFEBABE + 10) != 0) - { - cout << "FAIL: destroy_collection" << endl; - } - - if (os->destroy_collection (0xCAFEBADE + 10) == 0) - { - cout << "FAIL: destroy_collection" << endl; - } - - object_t oid2 (12345, 12345); - for (int i = 0; i < 8; i++) - { - oid2.rev++; - if (os->collection_add (cid, oid2) != 0) - { - cout << "FAIL: collection_add" << endl; - } - } - for (int i = 0; i < 8; i++) - { - if (os->collection_remove (cid, oid2) != 0) - { - cout << "FAIL: collection_remove" << endl; - } - oid2.rev--; - } - - if (os->collection_setattr (cid, "alpha", "value", 5) != 0) - cout << "FAIL: collection_setattr" << endl; - if (os->collection_setattr (cid, "beta", "value", 5) != 0) - cout << "FAIL: collection_setattr" << endl; - if (os->collection_setattr (cid, "gamma", "value", 5) != 0) - cout << "FAIL: collection_setattr" << endl; - if (os->collection_setattr (cid, "fred", "value", 5) != 0) - cout << "FAIL: collection_setattr" << endl; - - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "alpha", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "beta", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "gamma", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << 
endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "fred", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - - if (os->collection_setattr (cid, "alpha", "eulavvalue", 10) != 0) - cout << "FAIL: collection setattr overwrite" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "alpha", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "eulavvalue", 10) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "beta", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "gamma", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "fred", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - - if (os->collection_rmattr (cid, "alpha") != 0) - cout << "FAIL: collection_rmattr" << endl; - if (os->collection_rmattr (cid, "fred") != 0) - cout << "FAIL: collection_rmattr" << endl; - if (os->collection_rmattr (cid, "beta") != 0) - cout << "FAIL: collection_rmattr" << endl; - if (os->collection_rmattr (cid, "gamma") != 0) - cout << "FAIL: collection_rmattr" << endl; - - if (os->collection_rmattr (cid, "alpha") == 0) - cout << "FAIL: 
collection_rmattr (nonexistent)" << endl; - - // Truncate the object. - if (os->truncate (oid, 512, NULL) != 0) - { - cout << "FAIL: truncate" << endl; - } - - // Expand the object. - if (os->truncate (oid, 1200, NULL) != 0) - { - cout << "FAIL: expand" << endl; - } - - // Delete the object. - if (os->remove (oid) != 0) - { - cout << "FAIL: could not remove object" << endl; - } - - // Shouldn't be there - if (os->exists (oid)) - { - cout << "FAIL: should not be there" << endl; - } - - os->sync(); - exit (0); -} diff --git a/branches/sage/crush/test/testtree.cc b/branches/sage/crush/test/testtree.cc deleted file mode 100644 index 2c21bcbe52e25..0000000000000 --- a/branches/sage/crush/test/testtree.cc +++ /dev/null @@ -1,46 +0,0 @@ - - -#include "../crush/BinaryTree.h" -using namespace crush; - -#include -#include -using namespace std; - -int main() -{ - BinaryTree t; - - vector nodes; - - for (int i=0; i<30; i++) { - cout << "adding " << i << endl; - int n = t.add_node(1); - nodes.push_back(n); - //cout << t << endl; - } - cout << t << endl; - - for (int k=0; k<10000; k++) { - if (rand() % 2) { - cout << "adding" << endl; - nodes.push_back( t.add_node(1) ); - } else { - if (!nodes.empty()) { - //for (int i=0; i -using namespace std; - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -int main(int argc, char**argv) -{ - int a = 1; - int b = 2; - - mknod("test", 0600, 0); - - cout << "setxattr " << setxattr("test", "asdf", &a, sizeof(a), 0) << endl; - cout << "errno " << errno << " " << strerror(errno) << endl; - cout << "getxattr " << getxattr("test", "asdf", &b, sizeof(b)) << endl; - cout << "errno " << errno << " " << strerror(errno) << endl; - cout << "a is " << a << " and b is " << b << endl; - return 0; -} diff --git a/branches/sage/crush/valgrind.supp b/branches/sage/crush/valgrind.supp deleted file mode 100644 index 356df039050c4..0000000000000 --- a/branches/sage/crush/valgrind.supp +++ 
/dev/null @@ -1,62 +0,0 @@ -# some valgrind suppressions -# to load these automagically, -# cat > ~/.valgrindrc -# --suppressions=valgrind.supp -# - - -# this one makes valgrind shut up about what appears to be a bug in libc's writev. -{ - writev uninit bytes thing -sage - Memcheck:Param - writev(vector[...]) - fun:writev - fun:_ZN11BlockDevice6_writeEijjRN6buffer4listE - fun:_ZN11BlockDevice5do_ioEiRSt4listIPNS_6biovecESaIS2_EE - fun:_ZN11BlockDevice15io_thread_entryEv - fun:_ZN11BlockDevice8IOThread5entryEv - fun:_ZN6Thread11_entry_funcEPv - fun:start_thread - fun:clone - obj:* - obj:* - obj:* - obj:* -} - -# gethostbyname -{ - gethostbyname on issdm - Memcheck:Param - socketcall.sendto(msg) - fun:send - fun:get_mapping - fun:__nscd_get_map_ref - fun:nscd_gethst_r - fun:__nscd_gethostbyname_r - fun:gethostbyname_r@@GLIBC_2.2.5 - fun:gethostbyname - fun:_ZN4Rank8Accepter5startEv - fun:_ZN4Rank10start_rankEv - fun:main -} - -# gethostbyname - -{ - gethostbyname on foil - Memcheck:Addr8 - obj:/lib/ld-2.6.1.so - obj:/lib/ld-2.6.1.so - obj:/lib/ld-2.6.1.so - obj:/lib/ld-2.6.1.so - obj:/lib/ld-2.6.1.so - obj:/lib/ld-2.6.1.so - obj:/lib/ld-2.6.1.so - obj:/lib/libc-2.6.1.so - obj:/lib/ld-2.6.1.so - fun:__libc_dlopen_mode - fun:__nss_lookup_function - obj:/lib/libc-2.6.1.so -} - diff --git a/branches/sage/ebofs2/COPYING b/branches/sage/ebofs2/COPYING deleted file mode 100644 index 5ab7695ab8cab..0000000000000 --- a/branches/sage/ebofs2/COPYING +++ /dev/null @@ -1,504 +0,0 @@ - GNU LESSER GENERAL PUBLIC LICENSE - Version 2.1, February 1999 - - Copyright (C) 1991, 1999 Free Software Foundation, Inc. - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - -[This is the first released version of the Lesser GPL. It also counts - as the successor of the GNU Library Public License, version 2, hence - the version number 2.1.] 
- - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -Licenses are intended to guarantee your freedom to share and change -free software--to make sure the software is free for all its users. - - This license, the Lesser General Public License, applies to some -specially designated software packages--typically libraries--of the -Free Software Foundation and other authors who decide to use it. You -can use it too, but we suggest you first think carefully about whether -this license or the ordinary General Public License is the better -strategy to use in any particular case, based on the explanations below. - - When we speak of free software, we are referring to freedom of use, -not price. Our General Public Licenses are designed to make sure that -you have the freedom to distribute copies of free software (and charge -for this service if you wish); that you receive source code or can get -it if you want it; that you can change the software and use pieces of -it in new free programs; and that you are informed that you can do -these things. - - To protect your rights, we need to make restrictions that forbid -distributors to deny you these rights or to ask you to surrender these -rights. These restrictions translate to certain responsibilities for -you if you distribute copies of the library or if you modify it. - - For example, if you distribute copies of the library, whether gratis -or for a fee, you must give the recipients all the rights that we gave -you. You must make sure that they, too, receive or can get the source -code. If you link other code with the library, you must provide -complete object files to the recipients, so that they can relink them -with the library after making changes to the library and recompiling -it. And you must show them these terms so they know their rights. 
- - We protect your rights with a two-step method: (1) we copyright the -library, and (2) we offer you this license, which gives you legal -permission to copy, distribute and/or modify the library. - - To protect each distributor, we want to make it very clear that -there is no warranty for the free library. Also, if the library is -modified by someone else and passed on, the recipients should know -that what they have is not the original version, so that the original -author's reputation will not be affected by problems that might be -introduced by others. - - Finally, software patents pose a constant threat to the existence of -any free program. We wish to make sure that a company cannot -effectively restrict the users of a free program by obtaining a -restrictive license from a patent holder. Therefore, we insist that -any patent license obtained for a version of the library must be -consistent with the full freedom of use specified in this license. - - Most GNU software, including some libraries, is covered by the -ordinary GNU General Public License. This license, the GNU Lesser -General Public License, applies to certain designated libraries, and -is quite different from the ordinary General Public License. We use -this license for certain libraries in order to permit linking those -libraries into non-free programs. - - When a program is linked with a library, whether statically or using -a shared library, the combination of the two is legally speaking a -combined work, a derivative of the original library. The ordinary -General Public License therefore permits such linking only if the -entire combination fits its criteria of freedom. The Lesser General -Public License permits more lax criteria for linking other code with -the library. - - We call this license the "Lesser" General Public License because it -does Less to protect the user's freedom than the ordinary General -Public License. 
It also provides other free software developers Less -of an advantage over competing non-free programs. These disadvantages -are the reason we use the ordinary General Public License for many -libraries. However, the Lesser license provides advantages in certain -special circumstances. - - For example, on rare occasions, there may be a special need to -encourage the widest possible use of a certain library, so that it becomes -a de-facto standard. To achieve this, non-free programs must be -allowed to use the library. A more frequent case is that a free -library does the same job as widely used non-free libraries. In this -case, there is little to gain by limiting the free library to free -software only, so we use the Lesser General Public License. - - In other cases, permission to use a particular library in non-free -programs enables a greater number of people to use a large body of -free software. For example, permission to use the GNU C Library in -non-free programs enables many more people to use the whole GNU -operating system, as well as its variant, the GNU/Linux operating -system. - - Although the Lesser General Public License is Less protective of the -users' freedom, it does ensure that the user of a program that is -linked with the Library has the freedom and the wherewithal to run -that program using a modified version of the Library. - - The precise terms and conditions for copying, distribution and -modification follow. Pay close attention to the difference between a -"work based on the library" and a "work that uses the library". The -former contains code derived from the library, whereas the latter must -be combined with the library in order to run. - - GNU LESSER GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. 
This License Agreement applies to any software library or other -program which contains a notice placed by the copyright holder or -other authorized party saying it may be distributed under the terms of -this Lesser General Public License (also called "this License"). -Each licensee is addressed as "you". - - A "library" means a collection of software functions and/or data -prepared so as to be conveniently linked with application programs -(which use some of those functions and data) to form executables. - - The "Library", below, refers to any such software library or work -which has been distributed under these terms. A "work based on the -Library" means either the Library or any derivative work under -copyright law: that is to say, a work containing the Library or a -portion of it, either verbatim or with modifications and/or translated -straightforwardly into another language. (Hereinafter, translation is -included without limitation in the term "modification".) - - "Source code" for a work means the preferred form of the work for -making modifications to it. For a library, complete source code means -all the source code for all modules it contains, plus any associated -interface definition files, plus the scripts used to control compilation -and installation of the library. - - Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. The act of -running a program using the Library is not restricted, and output from -such a program is covered only if its contents constitute a work based -on the Library (independent of the use of the Library in a tool for -writing it). Whether that is true depends on what the Library does -and what the program that uses the Library does. - - 1. 
You may copy and distribute verbatim copies of the Library's -complete source code as you receive it, in any medium, provided that -you conspicuously and appropriately publish on each copy an -appropriate copyright notice and disclaimer of warranty; keep intact -all the notices that refer to this License and to the absence of any -warranty; and distribute a copy of this License along with the -Library. - - You may charge a fee for the physical act of transferring a copy, -and you may at your option offer warranty protection in exchange for a -fee. - - 2. You may modify your copy or copies of the Library or any portion -of it, thus forming a work based on the Library, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) The modified work must itself be a software library. - - b) You must cause the files modified to carry prominent notices - stating that you changed the files and the date of any change. - - c) You must cause the whole of the work to be licensed at no - charge to all third parties under the terms of this License. - - d) If a facility in the modified Library refers to a function or a - table of data to be supplied by an application program that uses - the facility, other than as an argument passed when the facility - is invoked, then you must make a good faith effort to ensure that, - in the event an application does not supply such function or - table, the facility still operates, and performs whatever part of - its purpose remains meaningful. - - (For example, a function in a library to compute square roots has - a purpose that is entirely well-defined independent of the - application. Therefore, Subsection 2d requires that any - application-supplied function or table used by this function must - be optional: if the application does not supply it, the square - root function must still compute square roots.) 
- -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Library, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Library, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote -it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Library. - -In addition, mere aggregation of another work not based on the Library -with the Library (or with a work based on the Library) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. You may opt to apply the terms of the ordinary GNU General Public -License instead of this License to a given copy of the Library. To do -this, you must alter all the notices that refer to this License, so -that they refer to the ordinary GNU General Public License, version 2, -instead of to this License. (If a newer version than version 2 of the -ordinary GNU General Public License has appeared, then you can specify -that version instead if you wish.) Do not make any other change in -these notices. - - Once this change is made in a given copy, it is irreversible for -that copy, so the ordinary GNU General Public License applies to all -subsequent copies and derivative works made from that copy. - - This option is useful when you wish to copy part of the code of -the Library into a program that is not a library. - - 4. 
You may copy and distribute the Library (or a portion or -derivative of it, under Section 2) in object code or executable form -under the terms of Sections 1 and 2 above provided that you accompany -it with the complete corresponding machine-readable source code, which -must be distributed under the terms of Sections 1 and 2 above on a -medium customarily used for software interchange. - - If distribution of object code is made by offering access to copy -from a designated place, then offering equivalent access to copy the -source code from the same place satisfies the requirement to -distribute the source code, even though third parties are not -compelled to copy the source along with the object code. - - 5. A program that contains no derivative of any portion of the -Library, but is designed to work with the Library by being compiled or -linked with it, is called a "work that uses the Library". Such a -work, in isolation, is not a derivative work of the Library, and -therefore falls outside the scope of this License. - - However, linking a "work that uses the Library" with the Library -creates an executable that is a derivative of the Library (because it -contains portions of the Library), rather than a "work that uses the -library". The executable is therefore covered by this License. -Section 6 states terms for distribution of such executables. - - When a "work that uses the Library" uses material from a header file -that is part of the Library, the object code for the work may be a -derivative work of the Library even though the source code is not. -Whether this is true is especially significant if the work can be -linked without the Library, or if the work is itself a library. The -threshold for this to be true is not precisely defined by law. 
- - If such an object file uses only numerical parameters, data -structure layouts and accessors, and small macros and small inline -functions (ten lines or less in length), then the use of the object -file is unrestricted, regardless of whether it is legally a derivative -work. (Executables containing this object code plus portions of the -Library will still fall under Section 6.) - - Otherwise, if the work is a derivative of the Library, you may -distribute the object code for the work under the terms of Section 6. -Any executables containing that work also fall under Section 6, -whether or not they are linked directly with the Library itself. - - 6. As an exception to the Sections above, you may also combine or -link a "work that uses the Library" with the Library to produce a -work containing portions of the Library, and distribute that work -under terms of your choice, provided that the terms permit -modification of the work for the customer's own use and reverse -engineering for debugging such modifications. - - You must give prominent notice with each copy of the work that the -Library is used in it and that the Library and its use are covered by -this License. You must supply a copy of this License. If the work -during execution displays copyright notices, you must include the -copyright notice for the Library among them, as well as a reference -directing the user to the copy of this License. Also, you must do one -of these things: - - a) Accompany the work with the complete corresponding - machine-readable source code for the Library including whatever - changes were used in the work (which must be distributed under - Sections 1 and 2 above); and, if the work is an executable linked - with the Library, with the complete machine-readable "work that - uses the Library", as object code and/or source code, so that the - user can modify the Library and then relink to produce a modified - executable containing the modified Library. 
(It is understood - that the user who changes the contents of definitions files in the - Library will not necessarily be able to recompile the application - to use the modified definitions.) - - b) Use a suitable shared library mechanism for linking with the - Library. A suitable mechanism is one that (1) uses at run time a - copy of the library already present on the user's computer system, - rather than copying library functions into the executable, and (2) - will operate properly with a modified version of the library, if - the user installs one, as long as the modified version is - interface-compatible with the version that the work was made with. - - c) Accompany the work with a written offer, valid for at - least three years, to give the same user the materials - specified in Subsection 6a, above, for a charge no more - than the cost of performing this distribution. - - d) If distribution of the work is made by offering access to copy - from a designated place, offer equivalent access to copy the above - specified materials from the same place. - - e) Verify that the user has already received a copy of these - materials or that you have already sent this user a copy. - - For an executable, the required form of the "work that uses the -Library" must include any data and utility programs needed for -reproducing the executable from it. However, as a special exception, -the materials to be distributed need not include anything that is -normally distributed (in either source or binary form) with the major -components (compiler, kernel, and so on) of the operating system on -which the executable runs, unless that component itself accompanies -the executable. - - It may happen that this requirement contradicts the license -restrictions of other proprietary libraries that do not normally -accompany the operating system. Such a contradiction means you cannot -use both them and the Library together in an executable that you -distribute. - - 7. 
You may place library facilities that are a work based on the -Library side-by-side in a single library together with other library -facilities not covered by this License, and distribute such a combined -library, provided that the separate distribution of the work based on -the Library and of the other library facilities is otherwise -permitted, and provided that you do these two things: - - a) Accompany the combined library with a copy of the same work - based on the Library, uncombined with any other library - facilities. This must be distributed under the terms of the - Sections above. - - b) Give prominent notice with the combined library of the fact - that part of it is a work based on the Library, and explaining - where to find the accompanying uncombined form of the same work. - - 8. You may not copy, modify, sublicense, link with, or distribute -the Library except as expressly provided under this License. Any -attempt otherwise to copy, modify, sublicense, link with, or -distribute the Library is void, and will automatically terminate your -rights under this License. However, parties who have received copies, -or rights, from you under this License will not have their licenses -terminated so long as such parties remain in full compliance. - - 9. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Library or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Library (or any work based on the -Library), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Library or works based on it. - - 10. 
Each time you redistribute the Library (or any work based on the -Library), the recipient automatically receives a license from the -original licensor to copy, distribute, link with or modify the Library -subject to these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. -You are not responsible for enforcing compliance by third parties with -this License. - - 11. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Library at all. For example, if a patent -license would not permit royalty-free redistribution of the Library by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Library. - -If any portion of this section is held invalid or unenforceable under any -particular circumstance, the balance of the section is intended to apply, -and the section as a whole is intended to apply in other circumstances. - -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system which is -implemented by public license practices. 
Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 12. If the distribution and/or use of the Library is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Library under this License may add -an explicit geographical distribution limitation excluding those countries, -so that distribution is permitted only in or among countries not thus -excluded. In such case, this License incorporates the limitation as if -written in the body of this License. - - 13. The Free Software Foundation may publish revised and/or new -versions of the Lesser General Public License from time to time. -Such new versions will be similar in spirit to the present version, -but may differ in detail to address new problems or concerns. - -Each version is given a distinguishing version number. If the Library -specifies a version number of this License which applies to it and -"any later version", you have the option of following the terms and -conditions either of that version or of any later version published by -the Free Software Foundation. If the Library does not specify a -license version number, you may choose any version ever published by -the Free Software Foundation. - - 14. If you wish to incorporate parts of the Library into other free -programs whose distribution conditions are incompatible with these, -write to the author to ask for permission. For software which is -copyrighted by the Free Software Foundation, write to the Free -Software Foundation; we sometimes make exceptions for this. 
Our -decision will be guided by the two goals of preserving the free status -of all derivatives of our free software and of promoting the sharing -and reuse of software generally. - - NO WARRANTY - - 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO -WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. -EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR -OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY -KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE -LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME -THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN -WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY -AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU -FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR -CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE -LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING -RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A -FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF -SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH -DAMAGES. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Libraries - - If you develop a new library, and you want it to be of the greatest -possible use to the public, we recommend making it free software that -everyone can redistribute and change. You can do so by permitting -redistribution under these terms (or, alternatively, under the terms of the -ordinary General Public License). - - To apply these terms, attach the following notices to the library. 
It is -safest to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least the -"copyright" line and a pointer to where the full notice is found. - - - Copyright (C) - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - -Also add information on how to contact you by electronic and paper mail. - -You should also get your employer (if you work as a programmer) or your -school, if any, to sign a "copyright disclaimer" for the library, if -necessary. Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the - library `Frob' (a library for tweaking knobs) written by James Random Hacker. - - , 1 April 1990 - Ty Coon, President of Vice - -That's all there is to it! 
- - diff --git a/branches/sage/ebofs2/Makefile b/branches/sage/ebofs2/Makefile deleted file mode 100644 index 52fc13494c3c6..0000000000000 --- a/branches/sage/ebofs2/Makefile +++ /dev/null @@ -1,311 +0,0 @@ -# -# until autoconf is set up, here are the options i understand: -# -# darwin=yes -- build on darwin -# fuse=no -- don't build anything requiring FUSE -# mpi=no -- don't build newsyn (require MPI) -# use_ccpp=yes -- use Common C++ for buffer.h reference counting -# want_bdb=yes -- build berkelydb objectstore -# - -# mpicxx must be on your path to build newsyn. -# on googoo, this means that /usr/local/mpich2-1.0.2/bin must be on your path. -# on issdm, it's /usr/local/mpich2/bin. - -# Hook for extra -I options, etc. -EXTRA_CFLAGS = #-I${HOME}/include -L${HOME}/lib -EXTRA_CFLAGS += -g -EXTRA_CFLAGS += -pg -#EXTRA_CFLAGS += -O3 - -# base -CFLAGS = -Wall -I. -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE ${EXTRA_CFLAGS} -LDINC = ld -i -o -CC = g++ -LIBS = -pthread - -# darwin? -ifeq ($(target),darwin) -CFLAGS += -DDARWIN -D__FreeBSD__=10 -LDINC = ar -rc -endif - -# use Common C++ (for buffer.h)? 
-ifeq ($(use_ccpp),yes) -CFLAGS += -D_GNU_SOURCE -DBUFFER_USE_CCPP -LIBS += -lccgnu2 -ldl -endif - - -#for normal mpich2 machines -MPICC = mpicxx -MPICFLAGS = -DMPICH_IGNORE_CXX_SEEK ${CFLAGS} -MPILIBS = ${LIBS} - -#for LLNL boxes without mpicxx -#MPICC = g++ -#MPICFLAGS = ${CFLAGS} -I/usr/lib/mpi/mpi_gnu/include -L/usr/lib/mpi/mpi_gnu/lib -#MPILIBS = ${LIBS} -lelan -lmpi - -EBOFS_OBJS= \ - ebofs/BlockDevice.o\ - ebofs/BufferCache.o\ - ebofs/Ebofs.o\ - ebofs/Allocator.o\ - ebofs/FileJournal.o - -MDS_OBJS= \ - mds/MDS.o\ - mds/journal.o\ - mds/Server.o\ - mds/MDCache.o\ - mds/Locker.o\ - mds/Migrator.o\ - mds/MDBalancer.o\ - mds/CDentry.o\ - mds/CDir.o\ - mds/CInode.o\ - mds/AnchorTable.o\ - mds/AnchorClient.o\ - mds/LogEvent.o\ - mds/IdAllocator.o\ - mds/ClientMap.o\ - mds/MDLog.o - -OSD_OBJS= \ - osd/PG.o\ - osd/ReplicatedPG.o\ - osd/Ager.o\ - osd/FakeStore.o\ - osd/OSD.o -# osd/RAID4PG.o\ - -OSDC_OBJS= \ - osdc/Objecter.o\ - osdc/ObjectCacher.o\ - osdc/Filer.o\ - osdc/Journaler.o - -MON_OBJS= \ - mon/Monitor.o\ - mon/Paxos.o\ - mon/PaxosService.o\ - mon/OSDMonitor.o\ - mon/MDSMonitor.o\ - mon/ClientMonitor.o\ - mon/PGMonitor.o\ - mon/Elector.o\ - mon/MonitorStore.o - -COMMON_OBJS= \ - msg/Message.o\ - common/Logger.o\ - common/Clock.o\ - common/Timer.o\ - config.o - -CLIENT_OBJS= \ - client/FileCache.o\ - client/Client.o\ - client/SyntheticClient.o\ - client/Trace.o - - -# bdbstore? 
-ifeq ($(want_bdb),yes) -CFLAGS += -DUSE_OSBDB -LIBS = -ldb_cxx -OSD_OBJS += osbdb/OSBDB.o -OSBDB_OBJS = \ - osbdb/OSBDB.o -endif - - -# targets -TARGETS = cmon cosd cmds csyn mkmonmap cmonctl fakesyn dupstore -SRCS=*.cc */*.cc *.h */*.h */*/*.h - -ifneq ($(fuse),no) -TARGETS += cfuse fakefuse -endif - -ifneq ($(mpi),no) -TARGETS += newsyn -endif - -all: depend ${TARGETS} - -test: depend ${TEST_TARGETS} - - -# real bits -mkmonmap: mkmonmap.cc common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -extractosdmaps: extractosdmaps.cc common.o osd.o mon.o ebofs.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -cmon: cmon.o mon.o msg/SimpleMessenger.o common.o crush/libcrush.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -cmonctl: cmonctl.cc msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -cosd: cosd.o osd.o ebofs.o msg/SimpleMessenger.o common.o crush/libcrush.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -cmds: cmds.o mds.o osdc.o msg/SimpleMessenger.o common.o crush/libcrush.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -csyn: csyn.o client.o osdc.o msg/SimpleMessenger.o common.o crush/libcrush.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -cfuse: cfuse.o client.o osdc.o client/fuse.o client/fuse_ll.o msg/SimpleMessenger.o common.o crush/libcrush.o - ${CC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@ - - -# code shipping experiments -activemaster: active/activemaster.cc client.o osdc.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -activeslave: active/activeslave.cc client.o osdc.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -echotestclient: active/echotestclient.cc - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -msgtestclient: active/msgtestclient.o client.o osdc.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -libtrivialtask.so: active/trivial_task.cc client.o osdc.o msg/SimpleMessenger.o common.o - ${CC} -fPIC -shared -Wl,-soname,$@.1 ${CFLAGS} ${LIBS} $^ -o $@ - - - -# IPC interface -ipc_server: ceph_ipc/ipc_server.cc client.o osdc.o 
msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -ipc_testclient: ceph_ipc/ipc_testclient.cc ceph_ipc/ipc_client.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - - -# fake* -fakefuse: fakefuse.o mon.o mds.o client.o osd.o osdc.o ebofs.o client/fuse.o client/fuse_ll.o msg/FakeMessenger.o common.o crush/libcrush.o - ${CC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@ - -fakesyn: fakesyn.o mon.o mds.o client.o osd.o ebofs.o osdc.o msg/FakeMessenger.o common.o crush/libcrush.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - - -# mpi startup -newsyn: newsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/SimpleMessenger.o common.o crush/libcrush.o - ${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@ - - -# ebofs -mkfs.ebofs: ebofs/mkfs.ebofs.cc config.cc common/Clock.o ebofs.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -test.ebofs: ebofs/test.ebofs.cc config.cc common/Clock.o ebofs.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -dupstore: dupstore.cc config.cc ebofs.o common/Clock.o common/Timer.o osd/FakeStore.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - - -# hadoop -libhadoopcephfs.so: client/hadoop/CephFSInterface.cc client.o osdc.o msg/SimpleMessenger.o common.o - ${CC} -fPIC -shared -Wl,-soname,$@.1 ${CFLAGS} -I/usr/local/java/include -I/usr/local/java/include/linux ${LIBS} $^ -o $@ - -# libceph -libceph.o: client/ldceph.o client/Client.o msg/SimpleMessenger.o ${COMMON_OBJS} ${SYN_OBJS} ${OSDC_OBJS} - ${LDINC} $^ -o $@ - -# some benchmarking tools -bench/mdtest/mdtest.o: bench/mdtest/mdtest.c - mpicc -c $^ -o $@ - -mdtest: bench/mdtest/mdtest.o - ${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@ - -mdtest.ceph: bench/mdtest/mdtest.o libceph.o - ${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@ - -testos: test/testos.o ebofs.o osbdb.o common.o - ${CC} ${CFLAGS} ${LIBS} ${OSBDB_LIBS} -o $@ $^ - - -# misc -gprof-helper.so: test/gprof-helper.c - gcc -shared -fPIC test/gprof-helper.c -o gprof-helper.so -lpthread -ldl - -test_disk_bw: test/test_disk_bw.cc common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -# crush - 
-crush/libcrush.o: force_look - cd crush ; make - -force_look: - true - -# bits -common.o: ${COMMON_OBJS} - ${LDINC} $@ $^ - -ebofs.o: ${EBOFS_OBJS} - ${LDINC} $@ $^ - -client.o: ${CLIENT_OBJS} - ${LDINC} $@ $^ - -osd.o: ${OSD_OBJS} - ${LDINC} $@ $^ - -osdc.o: ${OSDC_OBJS} - ${LDINC} $@ $^ - -mds.o: ${MDS_OBJS} - ${LDINC} $@ $^ - -mon.o: ${MON_OBJS} - ${LDINC} $@ $^ - -osbdb.o: ${OSBDB_OBJS} - ${LDINC} $@ $^ - -# generic rules -%.so: %.cc - ${CC} -shared -fPIC ${CFLAGS} $< -o $@ - -%.o: %.cc - ${CC} ${CFLAGS} -c $< -o $@ - -%.po: %.cc - ${CC} -fPIC ${CFLAGS} -c $< -o $@ - - -# handy -clean: - rm -f *.o */*.o ${TARGETS} ${TEST_TARGETS} - -count: - cat ${SRCS} | wc -l - cat ${SRCS} | grep -c \; - -TAGS: - etags `find . -name "*.[h|cc]"` - -.depend: - touch .depend - -depend: - $(RM) .depend - makedepend -f- -- $(CFLAGS) -- $(SRCS) > .depend 2>/dev/null -# for f in $(SRCS) ; do cpp -MM $(CFLAGS) $$f 2> /dev/null >> .depend ; done - - -# now add a line to include the dependency list. -include .depend diff --git a/branches/sage/ebofs2/README b/branches/sage/ebofs2/README deleted file mode 100644 index aa016817cebf0..0000000000000 --- a/branches/sage/ebofs2/README +++ /dev/null @@ -1,4 +0,0 @@ -Ceph - a scalable distributed file system ------------------------------------------ - -Please see http://ceph.sourceforge.net/ for current info. diff --git a/branches/sage/ebofs2/TODO b/branches/sage/ebofs2/TODO deleted file mode 100644 index 78c0b4e8918f8..0000000000000 --- a/branches/sage/ebofs2/TODO +++ /dev/null @@ -1,259 +0,0 @@ - -some smallish projects: - -- userspace crush tools - - xml import/export? - - ? - -- generalize monitor client? - - throttle message resend attempts - -- ENOSPC on client, OSD - - -code cleanup -- endian portability -- word size - - clean up all encoded structures - -kernel planning -- soft consistency on (kernel) lookup? -- accurate reconstruction of (syscall) path? - - -mds mustfix -- open file rejournaling vs capped log... 
- - open files vs shutdown in general! need to export any caps on replicated metadata -- export caps to auth on unlinked inodes -- stray purge on shutdown - -- rename slave in-memory rollback on failure - -- fix purge_stray bug -- try_remove_unlinked_dn thing - -- client session open from locker.. doesn't work properly with delays - -> journal the session open _with_ the import(start) - -- proper handling of cache expire messages during rejoin phase? - -- verify once-per-segment jouranl context is working... - -mds -- extend/clean up filepath to allow paths relative to an ino - - fix path_traverse - - fix reconnect/rejoin open file weirdness - -- get rid of replicate objects for replicate_to .. encode to bufferlists directly - -- stray reintegration -- verify stray is empty on shutdown - -- real chdir (directory "open") - - relative metadata ops - -- consistency points/snapshots - - dentry versions vs dirfrags... - -- detect and deal with client failure - - failure during reconnect vs clientmap. although probalby the whole thing needs a larger overhaul... - -- inode.max_size -- inode.allocated_size - -- osd needs a set_floor_and_read op for safe failover/STOGITH-like semantics. - -- could mark dir complete in EMetaBlob by counting how many dentries are dirtied in the current log epoch in CDir... - -- fix rmdir empty exported dirfrag race - - export all frags <= 1 item? then we ensure freezing before empty, avoiding any last unlink + export vs rmdir race. - - how to know full dir size (when trimming)? - - put frag size/mtime in fragmap in inode? we will need that anyway for stat on dirs - - will need to make inode discover/import_decode smart about dirfrag auth - - or, only put frag size/mtime in inode when frag is closed. otherwise, soft (journaled) state, possibly on another mds. - - need to move state from replicas to auth. simplelock doesn't currently support that. - - ScatterLock or something? hrm. - -- FIXME how to journal/store root and stray inode content? 
- - in particular, i care about dirfragtree.. get it on rejoin? - - and dir sizes, if i add that... also on rejoin? - -- efficient stat for single writers -- lstat vs stat? -- add FILE_CAP_EXTEND capability bit -- only share osdmap updates with clients holding capabilities -- delayed replica caps release... we need to set a timer event? (and cancel it when appropriate?) - - - -osdmon -- allow fresh replacement osds. add osd_created in osdmap, probably -- monitor needs to monitor some osds... -- monitor pg states, notify on out? -- watch osd utilization; adjust overload in cluster map - -journaler -- fix up for large events (e.g. imports) -- use set_floor_and_read for safe takeover from possibly-not-quite-dead otherguy. -- should we pad with zeros to avoid splitting individual entries? - - make it a g_conf flag? - - have to fix reader to skip over zeros (either <4 bytes for size, or zeroed sizes) -- need to truncate at detected (valid) write_pos to clear out any other partial trailing writes - - -crush -- xml import/export? -- crush tools - - -rados snapshots -- integrate revisions into ObjectCacher -- clean up oid.rev vs op.rev in osd+osdc - -- attr.crev is rev we were created in. -- oid.rev=0 is "live". defined for attr.crev <= rev. -- otherwise, defined for attr.crev <= rev < oid.rev (i.e. oid.rev is upper bound, non-inclusive.) - -- write|delete is tagged with op.rev - - if attr.crev < op.rev - - we clone to oid.rev=rev (clone keeps old crev) - - change live attr.crev=rev. - - apply update -- read is tagged with op.rev - - if 0, we read from 0 (if it exists). - - otherwise we choose object rev based on op.rev vs oid.rev, and then verifying attr.crev <= op.rev. - -- how to get usage feedback to monitor? - -- clean up mds caps release in exporter -- figure out client failure modes -- add connection retry. - - -objecter -- maybe_request_map should set a timer event to periodically re-request. 
-- transaction prepare/commit -- read+floor_lockout - -osd/rados -- transaction prepare/commit - - rollback - - rollback logging (to fix slow prepare vs rollback race) -- read+floor_lockout for clean STOGITH-like/fencing semantics after failover. - -- consider implications of nvram writeahead logs -- clean shutdown? -- pgmonitor should supplement failure detection - -- flag missing log entries on crash recovery --> WRNOOP? or WRLOST? - -- efficiently replicate clone() objects -- fix heartbeat wrt new replication -- mark residual pgs obsolete ??? -- rdlocks -- optimize remove wrt recovery pushes -- report crashed pgs? - -messenger -- fix messenger shutdown.. we shouldn't delete messenger, since the caller may be referencing it, etc. - -simplemessenger -- close idle connections -- take a look at RDS? http://oss.oracle.com/projects/rds/ - - -objectcacher -- merge clean bh's -- ocacher caps transitions vs locks -- test read locks - -reliability -- heartbeat vs ping? -- osdmonitor, filter - -ebofs -- allow holes - -- verify proper behavior of conflicting/overlapping reads of clones -- combine inodes and/or cnodes into same blocks -- allow btree sets instead of maps -- eliminate nodepools -- nonblocking write on missing onodes? -- fix bug in node rotation on insert (and reenable) -- fix NEAR_LAST_FWD (?) - -- awareness of underlying software/hardware raid in allocator so that we - write full stripes _only_. - - hmm, that's basically just a large block size. - -- rewrite the btree code! - - multithreaded - - eliminate nodepools - - allow btree sets - - allow arbitrary embedded data? - - allow arbitrary btrees - - allow root node(s?) to be embedded in onode, or whereever. - - keys and values can be uniform (fixed-size) or non-uniform. - - fixed size (if any) is a value in the btree struct. - - negative indicates bytes of length value? (1 -> 255bytes, 2 -> 65535 bytes, etc.?) - - non-uniform records preceeded by length. 
- - keys sorted via a comparator defined in btree root. - - lexicographically, by default. - -- goal - - object btree key->value payload, not just a data blob payload. - - better threading behavior. - - with transactional goodness! - -- onode - - object attributes.. as a btree? - - blob stream - - map stream. - - allow blob values. - - - - - - -remaining hard problems -- how to cope with file size changes and read/write sharing - - -crush -- more efficient failure when all/too many osds are down -- allow forcefeed for more complicated rule structures. (e.g. make force_stack a list< set >) - - - - -client -- fstat -- mixed lazy and non-lazy io will clobber each others' caps in the buffer cache.. how to isolate.. -- test client caps migration w/ mds exports -- some heuristic behavior to consolidate caps to inode auth? - - - - - - - -why qsync could be wrong (for very strict POSIX) : varying mds -> client message transit or processing times. -- mds -> 1,2 : qsync -- client1 writes at byte 100 -- client1 -> mds : qsync reply (size=100) -- client1 writes at byte 300 -- client1 -> client2 (outside channel) -- client2 writes at byte 200 -- client2 -> mds : qsync reply (size=200) --> stat results in size 200, even though at no single point in time was the max size 500. --> for correct result, need to _stop_ client writers while gathering metadata. - - -- dump active config in run output somewhere - - - - - - diff --git a/branches/sage/ebofs2/active/README b/branches/sage/ebofs2/active/README deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/branches/sage/ebofs2/active/activemaster.cc b/branches/sage/ebofs2/active/activemaster.cc deleted file mode 100644 index b4dc742c414ab..0000000000000 --- a/branches/sage/ebofs2/active/activemaster.cc +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Startup executable for - * Ceph Active Storage. See README for details. 
- * - */ -#include "activemaster.h" - - -/* - * What main() must do: - * - * - start up a Ceph client - * - find the set of OSDs that the file is striped across - * - start up the Map task on each OSD, using ssh - * - eat lunch? - * - start up the Reduce task locally - */ - -int main(int argc, const char* argv[]) { - - if (argc < 4) { - usage(argv[0]); - exit(-1); - } - - const char* input_filename = argv[1]; - const char* map_command = argv[2]; - //const char* reduce_command = argv[3]; - - // fire up the client - Client* client = startCephClient(); - - // open the file as read_only - int fh = client->open(input_filename, O_RDONLY); - if (fh < 0) { - cout << "The input file " << input_filename << " could not be opened." << endl; - exit(-1); - } - - // How big is the file? - int filesize; - struct stat stbuf; - if (0 > client->lstat(input_filename, &stbuf)) { - cout << "Error: could not retrieve size of input file " << input_filename << endl; - exit(-1); - } - filesize = stbuf.st_size; - if (filesize < 1) { - cout << "Error: input file size is " << filesize << endl; - exit(-1); - } - - // retrieve all the object extents - list extents; - int offset = 0; - client->enumerate_layout(fh, extents, filesize, offset); - - // for each object extent, retrieve the OSD IP address and start up a Map task - list::iterator i; - map::iterator j; - int osd; - int start, length; - tcpaddr_t tcpaddr; - - for (i = extents.begin(); i != extents.end(); i++) - { - // find the primary and get its IP address - osd = client->osdmap->get_pg_primary(i->pgid); - entity_inst_t inst = client->osdmap->get_inst(osd); - entity_addr_t entity_addr = inst.addr; - entity_addr.make_addr(tcpaddr); - - // iterate through each buffer_extent in the ObjectExtent - for (j = i->buffer_extents.begin(); - j != i->buffer_extents.end(); j++) - { - // get the range of the buffer_extent - start = (*j).first; - length = (*j).second; - // fire up the Map task - start_map_task(map_command, input_filename, start, 
length, tcpaddr); - } - } - return 0; -} - -// Fires up the map task. -// For the moment, all it does is echo the command line, not run it. -int start_map_task(const char* command, const char* input_filename, - long start, long length, sockaddr_in ip_address) -{ - string ip_addr_string(inet_ntoa(ip_address.sin_addr)); - - - - - - cout << "ssh " << ip_addr_string << " " << command - << " " << input_filename << " " << start << " " << length << endl; - return 0; -} - - - -void usage(const char* name) { - cout << "usage: " << name << " inputfile map_task reduce_task" << endl; - cout << "inputfile must be a valid path in the running Ceph filesystem." << endl; - cout << "map_task should be given with an absolute path, and be present on "; - cout << "the REGULAR filesystem every node." << endl; - cout << "reduce_task need be present on this node only." << endl; -} - - - - diff --git a/branches/sage/ebofs2/active/activemaster.h b/branches/sage/ebofs2/active/activemaster.h deleted file mode 100644 index 524138e253c7b..0000000000000 --- a/branches/sage/ebofs2/active/activemaster.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * This is the master executable to start up - * a compute task across several nodes. - * - * - */ - - -//#include -#include "utility.h" - -int start_map_task(const char* command, const char* input_filename, - long start, long length, tcpaddr_t ip_address); - -void usage(const char* name); - -//Client* startCephClient(); -//void kill_client(Client* client); diff --git a/branches/sage/ebofs2/active/activeslave.cc b/branches/sage/ebofs2/active/activeslave.cc deleted file mode 100644 index d2953490f9d69..0000000000000 --- a/branches/sage/ebofs2/active/activeslave.cc +++ /dev/null @@ -1,510 +0,0 @@ -/* - * This is a slave for receiving and executing commands for - * compute tasks on an OSD. This supersedes the daemon - * version in activetaskd.h/cc, because it's easier to debug - * if it's not a daemon. 
- * - * Networking code is based off examples from Stevens' UNIX Network Programming. - */ - -#include "activeslave.h" - -int main(int argc, const char* argv[]) { - - /* Set up TCP server */ - int sockfd, newsockfd, childpid; - socklen_t clilen; - struct sockaddr_in cli_addr, serv_addr; - - //const char *pname = argv[0]; // process name - - // Open a TCP socket - if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { - cerr << "slave: can't open TCP socket. Exiting." << endl; - exit(-1); - } - cerr << "slave: opened TCP socket." << endl; - - // set up the port - bzero((char*) &serv_addr, sizeof(serv_addr)); - serv_addr.sin_family = AF_INET; - serv_addr.sin_addr.s_addr = htonl(INADDR_ANY); - serv_addr.sin_port = htons(SERV_TCP_PORT); - - if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0) { - cerr << "slave: can't bind local address. Exiting." << endl; - exit(-1); - } - - if(listen(sockfd, SOMAXCONN) < 0) { - cerr << "slave: listening error. Exiting." << endl; - exit(-1); - } - - - /* The Big Loop */ - while (1) { - - // wait for a message and fork off a child process to handle it - clilen = sizeof(cli_addr); - newsockfd = accept(sockfd, - (struct sockaddr *) &cli_addr, - &clilen); - - if (newsockfd < 0) { - cerr << "slave: accept error. Exiting." << endl; - exit(-1); - } - - if ((childpid = fork()) < 0) { - cerr << "slave: fork error. Exiting." << endl; - exit(-1); - } - - else if (childpid == 0) { // child process - cerr << "Forked child process for incoming socket" << endl; - close(sockfd); - process_request(newsockfd); - cerr << "Finished processing request. Exiting child." << endl; - exit(0); - } - - close (newsockfd); // parent - - //sleep(30); /* wait 30 seconds */ - } - exit(EXIT_SUCCESS); -} - - -/* This will process requests from the master. - * The protocol in a nutshell: - * Master opens a socket to slave, and sends - * one message. - * Slave replies with one message. - * Socket is closed. 
- */ - -void process_request(int newsockfd) { - - // first, read the message type. - int msg_type = readmsgtype(newsockfd); - - - // Second, call some function based on the message type to process - // the rest of the message. The function is responsible for the rest - // of the message; this includes checking the message footer. - - switch(msg_type) { - - case PING: // ping - process_ping(newsockfd); - break; - case STARTTASK: // start_task - process_start_task(newsockfd); - break; - case RETRIEVELOCALFILE: // get_local - process_get_local(newsockfd); - break; - case SHIPCODE: - process_shipcode(newsockfd); - break; - - case PINGREPLY: - case FINISHEDTASK: - case TASKFAILED: - case SENDLOCALFILE: - case LOCALFILENOTFOUND: - case CODESAVED: - case SHIPFAILED: - cerr << "activeslave: BUG received message " << CMD_LIST[msg_type] << - " from master; master should never send this message." << endl; - exit(-1); - break; - - - case -1: - cerr << "activeslave: message had an unidentifiable type. " << - "Closing socket and discarding rest of message." << endl; - default: - cerr << "activeslave: BUG! received unexpected return value of" << msg_type << - "from readmsgtype(). Closing socket and discarding rest of message." << endl; - - exit(-1); - } -} - - -// Just write a ping_reply to the socket. -void process_ping(int fd) { - - // make sure the footer is valid - if (!check_footer(fd)) { - cerr << "process_ping warning: ping message has invalid or missing footer." - << endl; - } - // Even if the footer's invalid, send the reply. - cerr << "Replying to ping..." << endl; - send_msg_header(fd, PINGREPLY); - send_msg_footer(fd); - cerr << "Ping processing completed." << endl; -} - - - -// Process a start_task message. This reads the incoming message and -// starts the corresponding task. - -// Parameter format: taskID(int) command(string) -// cephinputfile(string) offset(long) length(long) localoutputfile - -// WARNING: currently has the trivial task hardwired. 
It -// ignores the command and the output file. -void process_start_task(int fd) { - - char command[MAX_STRING_SIZE + 1]; - char cephinputfile[MAX_STRING_SIZE + 1]; - char localoutputfile[MAX_STRING_SIZE + 1]; - - cout << "in process_start_task: "; - int taskID = read_positive_int(fd); - cout << "read taskID " << taskID; - - read_string(fd, command); - cout << ", command " << command; - - read_string(fd, cephinputfile); - cout << ", cephinputfile " << cephinputfile; - off_t offset = read_off_t(fd); - cout << ", offset " << offset; - off_t length = read_off_t(fd); - cout << ", length " << length; - - read_string(fd, localoutputfile); - cout << ", localoutputfile " << localoutputfile << endl; - - // make sure the footer is valid - if (!check_footer(fd)) { - cerr << "process_start_task warning: message has invalid or missing footer. " - << "Discarding message." << endl; - exit(-1); - } - - - // To do: modify to load the task from a library instead of just - // using the hardwired one. - - void (*task)(const char*, const char*, off_t, off_t) = 0; - task = start_trivial_task; - - - - // start a task; create an output filename that uses the task ID, 'cause we might - // end up with multiple pieces of a file on each OSD. - // WARNING: always does the trivial task; prints answer to stdout but - // does not write it to disk. - cerr << "starting task: " << endl; - //start_trivial_task(cephinputfile, localoutputfile, offset, length); - task(cephinputfile, localoutputfile, offset, length); - cerr << "returned from task! Sending reply:" << endl; - - - - // send the reply - send_msg_header(fd, FINISHEDTASK); - write_positive_int(fd, taskID); - send_msg_footer(fd); - - // done - cout << "Done sending reply for taskID " << taskID << endl; - return; -} - - - -void start_trivial_task (const char* ceph_filename, const char* local_filename, - off_t offset, off_t length) { - // Don't bother to copy the file to disk. Read the file directly from Ceph, - // and add up all the bytes. 
- // Write the total to the local file as a string. - Client * client = startCephClient(); - - bufferptr bp(CHUNK); - - // get the source file's size. Sanity-check the request range. - struct stat st; - int r = client->lstat(ceph_filename, &st); - assert (r == 0); - - off_t src_total_size = st.st_size; - if (src_total_size < offset + length) { - cerr << "Error in copy ExtentToLocalFile: offset + length = " << offset << " + " << length - << " = " + (offset + length) << ", source file size is only " << src_total_size << endl; - exit(-1); - } - off_t remaining = length; - - // open the file and seek to the start position - cerr << "start_trivial_task: opening the source file and seeking " << endl; - - int fh_ceph = client->open(ceph_filename, O_RDONLY); - assert (fh_ceph > -1); - r = client->lseek(fh_ceph, offset, SEEK_SET); - assert (r == offset); - - int counter = 0; - // read through the extent and add up the bytes - cerr << "start_trivial_task: counting up bytes" << endl; - char* bp_c = bp.c_str(); - while (remaining > 0) { - off_t got = client->read(fh_ceph, bp_c, MIN(remaining,CHUNK), -1); - assert(got > 0); - remaining -= got; - for (off_t i = 0; i < got; ++i) { - counter += (unsigned int)(bp_c[i]); - } - } - cerr << "start_trivial_task: Done! Answer is " << counter << endl; - client->close(fh_ceph); - - //assert(0); -} - - -// Starts a sloppy grep count of the hardwired search string over the -// given Ceph file extent. It's sloppy because it copies the given -// extent to a local file and runs "grep" on it, with no effort to take -// care of boundary issues. -void start_sloppy_grepcount (const char* ceph_filename, const char* local_filename, - long offset, long size) { - - Client* client = startCephClient(); - char* search_string = "the"; - // copy the file to a local file. - - copyExtentToLocalFile (client, ceph_filename, offset, size, local_filename); - // we want: grep -c search_string local_filename - // to get the number of occurrences of the string. 
- string command = ""; - command.append("grep -c "); - command.append(search_string); - command.append(local_filename); - - assert(0); - -} - - -// Processes a SHIPCODE message. The message will have a shared -// library attached to it, which must be stored locally. - -void process_shipcode(int fd) { - - - // get the size of the shared library - size_t library_size = read_size_t(fd); - - - // save the library to a file - cerr << "saving library..." << endl; - const char* libfile = "/tmp/libslavetask.so"; - int local_fd = ::open(libfile, O_WRONLY | O_CREAT | O_TRUNC); - if (local_fd < 0) { - cerr << "Error opening " << libfile << " for writing." << endl; - exit(-1); - } - - off_t remaining = library_size; - - bufferptr bp(CHUNK); - char* bp_c = bp.c_str(); - while (remaining > 0) { - off_t got = readn(fd, bp_c, MIN(remaining, CHUNK)); - assert(got > 0); - remaining -= got; - ssize_t written = ::write(local_fd, bp_c, got); - assert (written == got); - } - cerr << "Received shared library and stored as " << libfile << endl; - -} - - -// Processes a get_local message. The message -// specifies the filename of a local file to -// return to the sender. - -// Parameter format: requestID(int) localfilename(string) - -// INCOMPLETE: currently just reads the message. - - -void process_get_local(int fd) { - cout << "in process_get_local: "; - int taskID = read_positive_int(fd); - cout << "read taskID " << taskID; - - char localfilename[MAX_STRING_SIZE+1]; - read_string(fd, localfilename); - cout << ", localfilename " << localfilename << endl; - - - // make sure the footer is valid - if (!check_footer(fd)) { - cerr << "process_get_local warning: message has invalid or missing footer." - << endl; - } - - // not implemented - cerr << "Error: get_local command unimplemented." << endl; - assert(0); -} - - -// Retrieves a formatted message from the socket. -// At the moment, this just reads and prints a fixed- -// length message type. -// DEPRECATED. 
-void str_getmsg(int sockfd) { - - int n; - - // read message types until the connection dies - while(true) { - n = readmsgtype(sockfd); - if (n != 0) { - cerr << "from getmsg: some sort of error" << endl; - exit(-1); - } - } -} - -// Echo a stream socket message back to the sender. -// DEPRECATED. -void str_echo(int sockfd) { - - int n; - char line[MAXLINE]; - - while(true) { - - // read from the stream - cerr << "str_echo: waiting for a line" << endl; - n = readline(sockfd, line, MAXLINE); - cerr << "str_echo: read a line" << endl; - if (0 == n) { - cerr << "str_echo: connection terminated" << endl; - return; // connection is terminated - } - else if (n < 0) { - cerr << "str_echo: readline error" << endl; - exit(-1); - } - - // write back to the stream - if (n != writen(sockfd, line, n)) { - cerr << "str_echo: writen error" << endl; - exit(-1); - } - else - cerr << "Echoed line " << endl; - } -} - - -void str_ack(int sockfd) { - - int n; - char line[MAXLINE]; - //char *ack = "ack"; - - while(true) { - - // read from the stream - n = readline(sockfd, line, MAXLINE); - - if (0 == n) - return; // connection is terminated - else if (n < 0) - //err_dump("str_echo: readline error"); - exit(-1); - - // write back to the stream - if (4 != writen(sockfd, "ack\n", 4)) - //err_dump("str_echo: writen error"); - exit(-1); - } -} - - - -// Read command lines from the socket and execute them - -void str_run(int sockfd) { - - int n; - char line[MAXLINE]; - char* error_msg = "str_run: No command interpreter found\n"; - char* ack_msg = "Running command... 
"; - char* commit_msg = "Command executed!\n"; - - while(true) { - - // read from the stream - n = readline(sockfd, line, MAXLINE); - - if (0 == n) - return; // connection is terminated - else if (n < 0) - //err_dump("str_echo: readline error"); - exit(-1); - - if (system(NULL)) { - writen(sockfd, ack_msg, strlen(ack_msg)); - system(line); - writen(sockfd, commit_msg, strlen(commit_msg)); - } - else if ((int)strlen(error_msg) != writen(sockfd, error_msg, strlen(error_msg))) - //err_dump("str_echo: writen error"); - exit(-1); - } -} - - -// take a filename and copy it from Ceph to a local directory. -// Not completed. - -void str_copytolocal(int sockfd) { - - int n; - char line[MAXLINE]; - char* error_msg = "str_copy: No command interpreter found\n"; - char* ack_msg = "Running command... "; - char* commit_msg = "Command executed!\n"; - //char* temp_dir = "/tmp"; - - - while(true) { - - // read from the stream - n = readline(sockfd, line, MAXLINE); - - if (0 == n) - return; // connection is terminated - else if (n < 0) - //err_dump("str_echo: readline error"); - exit(-1); - - if (system(NULL)) { - writen(sockfd, ack_msg, strlen(ack_msg)); - system(line); - writen(sockfd, commit_msg, strlen(commit_msg)); - } - else if ((int)strlen(error_msg) != writen(sockfd, error_msg, strlen(error_msg))) - //err_dump("str_echo: writen error"); - exit(-1); - } -} - - - diff --git a/branches/sage/ebofs2/active/activeslave.h b/branches/sage/ebofs2/active/activeslave.h deleted file mode 100644 index 574824b0478f6..0000000000000 --- a/branches/sage/ebofs2/active/activeslave.h +++ /dev/null @@ -1,23 +0,0 @@ -#include "inet.h" -#include "common.h" -#include "utility.h" -#include "client/Client.h" - - -// The port number is "osdd" on a telephone keypad. 
-#define SERV_TCP_PORT 6733 - -#define MAXLINE 512 - -void str_echo(int sockfd); -void str_ack(int sockfd); -void str_run(int sockfd); -void str_getmsg(int sockfd); -void process_request(int newsockfd); -void process_ping(int fd); -void process_start_task(int fd); -void process_get_local(int fd); -void process_shipcode(int fd); - -void start_trivial_task(const char* ceph_filename, const char* local_filename, - long offset, long length); diff --git a/branches/sage/ebofs2/active/activetaskd.cc b/branches/sage/ebofs2/active/activetaskd.cc deleted file mode 100644 index ec9f290543093..0000000000000 --- a/branches/sage/ebofs2/active/activetaskd.cc +++ /dev/null @@ -1,241 +0,0 @@ -/* - * This is a daemon for receiving and executing commands for compute tasks on an OSD. - * - * The daemon uses skeleton code from - * http://www.linuxprofilm.com/articles/linux-daemon-howto.html. The - * site is no longer up, but can be seen through the archive.org. - * Networking code is based off examples from Stevens' UNIX Network Programming. - */ - -#include "activetaskd.h" - - -#define SERVER - -#undef SERVER - -int main(int argc, const char* argv[]) { - - /* Our process ID and Session ID */ - pid_t pid, sid; - - /* Fork off the parent process */ - pid = fork(); - if (pid < 0) { - exit(EXIT_FAILURE); - } - /* If we got a good PID, then - we can exit the parent process. 
*/ - if (pid > 0) { - exit(EXIT_SUCCESS); - } - - /* Change the file mode mask */ - umask(0); - - /* Open any logs here */ - - /* Create a new SID for the child process */ - sid = setsid(); - if (sid < 0) { - /* Log the failure */ - exit(EXIT_FAILURE); - } - - - /* Change the current working directory */ - if ((chdir("/")) < 0) { - /* Log the failure */ - exit(EXIT_FAILURE); - } - - /* Close out the standard file descriptors */ - close(STDIN_FILENO); - close(STDOUT_FILENO); - close(STDERR_FILENO); - - /* Daemon-specific initialization goes here */ - - - - /* Set up TCP server */ - int sockfd, newsockfd, childpid; - socklen_t clilen; - struct sockaddr_in cli_addr, serv_addr; - - const char *pname = argv[0]; // process name - - // Open a TCP socket - if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) - exit(-1); - //err_dump("server: can't open stream socket"); - - // set up the port - bzero((char*) &serv_addr, sizeof(serv_addr)); - serv_addr.sin_family = AF_INET; - serv_addr.sin_addr.s_addr = htonl(INADDR_ANY); - serv_addr.sin_port = htons(SERV_TCP_PORT); - - if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0) - exit(-1); - //err_dump("server: can't bind local address"); - - if(listen(sockfd, SOMAXCONN) < 0) - exit(-1); - //err_dump("server: listening error"); - - /* The Big Loop */ - while (1) { - - // wait for a message and fork off a child process to handle it - clilen = sizeof(cli_addr); - newsockfd = accept(sockfd, - (struct sockaddr *) &cli_addr, - &clilen); - - if (newsockfd < 0) - exit(-1); - //err_dump("server: accept error"); - - if ( (childpid = fork()) < 0) - exit(-1); - // err_dump("server: fork error"); - - else if (childpid == 0) { // child process - close(sockfd); - //str_echo(newsockfd); - str_run(newsockfd); - // insert code to process the request - exit(0); - } - - close (newsockfd); // parent - - //sleep(30); /* wait 30 seconds */ - } - exit(EXIT_SUCCESS); -} - - -// Echo a stream socket message back to the sender. 
- -void str_echo(int sockfd) { - - int n; - char line[MAXLINE]; - - while(true) { - - // read from the stream - n = readline(sockfd, line, MAXLINE); - - if (0 == n) - return; // connection is terminated - else if (n < 0) - //err_dump("str_echo: readline error"); - exit(-1); - - // write back to the stream - if (n != writen(sockfd, line, n)) - //err_dump("str_echo: writen error"); - exit(-1); - } -} - - -void str_ack(int sockfd) { - - int n; - char line[MAXLINE]; - char *ack = "ack"; - - while(true) { - - // read from the stream - n = readline(sockfd, line, MAXLINE); - - if (0 == n) - return; // connection is terminated - else if (n < 0) - //err_dump("str_echo: readline error"); - exit(-1); - - // write back to the stream - if (4 != writen(sockfd, "ack\n", 4)) - //err_dump("str_echo: writen error"); - exit(-1); - } -} - - - - -// Read command lines from the socket and execute them - -void str_run(int sockfd) { - - int n; - char line[MAXLINE]; - char* error_msg = "str_run: No command interpreter found\n"; - char* ack_msg = "Running command... "; - char* commit_msg = "Command executed!\n"; - - while(true) { - - // read from the stream - n = readline(sockfd, line, MAXLINE); - - if (0 == n) - return; // connection is terminated - else if (n < 0) - //err_dump("str_echo: readline error"); - exit(-1); - - if (system(NULL)) { - writen(sockfd, ack_msg, strlen(ack_msg)); - system(line); - writen(sockfd, commit_msg, strlen(commit_msg)); - } - else if (strlen(error_msg) != writen(sockfd, error_msg, strlen(error_msg))) - //err_dump("str_echo: writen error"); - exit(-1); - } -} - - -// take a filename and copy it from Ceph to a local directory - -void str_copytolocal(int sockfd) { - - int n; - char line[MAXLINE]; - char* error_msg = "str_copy: No command interpreter found\n"; - char* ack_msg = "Running command... 
"; - char* commit_msg = "Command executed!\n"; - char* temp_dir = "/tmp"; - - - while(true) { - - // read from the stream - n = readline(sockfd, line, MAXLINE); - - if (0 == n) - return; // connection is terminated - else if (n < 0) - //err_dump("str_echo: readline error"); - exit(-1); - - if (system(NULL)) { - writen(sockfd, ack_msg, strlen(ack_msg)); - system(line); - writen(sockfd, commit_msg, strlen(commit_msg)); - } - else if (strlen(error_msg) != writen(sockfd, error_msg, strlen(error_msg))) - //err_dump("str_echo: writen error"); - exit(-1); - } -} - - - diff --git a/branches/sage/ebofs2/active/activetaskd.h b/branches/sage/ebofs2/active/activetaskd.h deleted file mode 100644 index fc5cec923c4bc..0000000000000 --- a/branches/sage/ebofs2/active/activetaskd.h +++ /dev/null @@ -1,14 +0,0 @@ -#include "inet.h" -#include "common.h" -#include "utility.h" -#include "client/Client.h" - - -// The port number is "osdd" on a telephone keypad. -#define SERV_TCP_PORT 6733 - -#define MAXLINE 512 - -void str_echo(int sockfd); -void str_ack(int sockfd); -void str_run(int sockfd); diff --git a/branches/sage/ebofs2/active/client_init.cc b/branches/sage/ebofs2/active/client_init.cc deleted file mode 100644 index 8b137891791fe..0000000000000 --- a/branches/sage/ebofs2/active/client_init.cc +++ /dev/null @@ -1 +0,0 @@ - diff --git a/branches/sage/ebofs2/active/client_init.h b/branches/sage/ebofs2/active/client_init.h deleted file mode 100644 index 139597f9cb07c..0000000000000 --- a/branches/sage/ebofs2/active/client_init.h +++ /dev/null @@ -1,2 +0,0 @@ - - diff --git a/branches/sage/ebofs2/active/common.h b/branches/sage/ebofs2/active/common.h deleted file mode 100644 index bf2c73ca4052a..0000000000000 --- a/branches/sage/ebofs2/active/common.h +++ /dev/null @@ -1,94 +0,0 @@ -#ifndef COMMON_H -#define COMMON_H - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// a bunch of string constants -// for commands - - - -#define 
CMDLENGTH 10 -#define CMDCOUNT 11 - -#define MAX_STRING_SIZE 255 - -/* - * These are the various messages that can be sent between the master - * and slave. The slave sends one reply to each message from the master. - - * PING/PINGREPLY: just what it sounds like. - - * STARTTASK: starts a task. Needs to be reworked to allow code - * shipping. The slave attempts to perform the task, and replies with - * FINISHEDTASK or TASKFAILED. - * - * RETRIEVELOCALFILE: requests a file that the slave has stored - * locally. Slave replies with SENDLOCALFILE and the file, or with - * LOCALFILENOTFOUND. - * - * SHIPCODE: sends a shared library to the slave, containing a - * function that is to be executed later by the STARTTASK - * command. Slave replies with CODESAVED or SHIPFAILED. - * - */ - - -const off_t CHUNK = 1024 * 1024 * 4; - -#define PING 0 -#define STARTTASK 1 -#define RETRIEVELOCALFILE 2 -#define PINGREPLY 3 -#define FINISHEDTASK 4 -#define TASKFAILED 5 -#define SENDLOCALFILE 6 -#define LOCALFILENOTFOUND 7 -#define SHIPCODE 8 -#define CODESAVED 9 -#define SHIPFAILED 10 - - -#define FOOTER_LENGTH 7 - -const char* CMD_LIST[CMDCOUNT] = {"______PING", - "START_TASK", - "_GET_LOCAL", - "PING_REPLY", - "_TASK_DONE", - "TASKFAILED", - "SEND_LOCAL", - "LOCAL_GONE", - "_SHIP_CODE", - "CODE_SAVED", - "SHIPFAILED"}; - -const char FOOTER[FOOTER_LENGTH + 1] = "MSG_END"; - - -// const char* strArray[] = {"string1", "string2", "string3"}; -//const char commands[2][4] = {"foo", "bar"}; - - - -// error codes -#define ARGUMENTSINVALID 1001 -#define CEPHCLIENTSTARTUPFAILED 1002 -#define INPUTFILEREADFAILED 1003 - - -// const char* name = "Njal"; - - - -#endif //COMMON_H diff --git a/branches/sage/ebofs2/active/echotestclient.cc b/branches/sage/ebofs2/active/echotestclient.cc deleted file mode 100644 index 2b2d15e7ca5cb..0000000000000 --- a/branches/sage/ebofs2/active/echotestclient.cc +++ /dev/null @@ -1,74 +0,0 @@ -/* - * This is merely a test of an echo server; it's an early step in - 
* building up the Ceph distributed compute service. This is - * discardable once the next stage is up and running. - * - * Code is based off examples in Stevens' "Unix Network Programming". - */ - -#include "echotestclient.h" - -int main(int argc, char* argv[]) { - - int sockfd; - struct sockaddr_in serv_addr; - - char* pname = argv[0]; - - bzero((char *) &serv_addr, sizeof(serv_addr)); - serv_addr.sin_family = AF_INET; - serv_addr.sin_addr.s_addr = inet_addr(SERV_HOST_ADDR); - serv_addr.sin_port = htons(SERV_TCP_PORT); - - - // open a TCP socket - if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { - printf("client: can't open stream socket"); - exit (-1); - } - - // connect to the server. - if (connect(sockfd, (struct sockaddr *) &serv_addr, - sizeof(serv_addr)) < 0) { - printf("client: can't connect to server"); - exit (-1); - } - - // start the test echoer - str_cli(stdin, sockfd); - - - close (sockfd); - exit(0); -} - - -void str_cli(FILE *fp, int sockfd) { - - int n; - char sendline[MAXLINE], recvline[MAXLINE + 1]; - - // read from the fp and write to the socket; - // then read from the socket and write to stdout - while (fgets(sendline, MAXLINE, fp) != NULL) { - - n = strlen(sendline); - if (writen(sockfd, sendline, n) != n) { - printf("str_cli: writen error on socket"); - exit(-1); - } - n = readline(sockfd, recvline, MAXLINE); - if (n < 0) { - printf("str_cli: readline error"); - exit(-1); - } - recvline[n] = 0; - fputs(recvline, stdout); - } - - if (ferror(fp)) { - printf("str_cli: error reading file"); - exit(-1); - } - -} diff --git a/branches/sage/ebofs2/active/echotestclient.h b/branches/sage/ebofs2/active/echotestclient.h deleted file mode 100644 index 9b26416640bc2..0000000000000 --- a/branches/sage/ebofs2/active/echotestclient.h +++ /dev/null @@ -1,10 +0,0 @@ -#include "inet.h" -#include "common.h" -#include "socket_utility.h" - -#define SERV_HOST_ADDR "128.114.57.143" //issdm-8 -#define SERV_TCP_PORT 6733 -#define MAXLINE 512 - -void 
str_cli(FILE *fp, int sockfd); - diff --git a/branches/sage/ebofs2/active/inet.h b/branches/sage/ebofs2/active/inet.h deleted file mode 100644 index 385fa915f9dc7..0000000000000 --- a/branches/sage/ebofs2/active/inet.h +++ /dev/null @@ -1,9 +0,0 @@ -/* - * Generic TCP/IP definitions - */ - -#include -#include -#include -#include -#include diff --git a/branches/sage/ebofs2/active/msgtestclient.cc b/branches/sage/ebofs2/active/msgtestclient.cc deleted file mode 100644 index 53650e730b387..0000000000000 --- a/branches/sage/ebofs2/active/msgtestclient.cc +++ /dev/null @@ -1,418 +0,0 @@ -/* - * This test client tests the sending of message headers to the slave. - * - * Code is based off examples in Stevens' "Unix Network Programming". - */ -#include "msgtestclient.h" -#define REQUIRED_ARGS 2 - -int main(int argc, char* argv[]) { - - - // make sure we have all the arguments we need - if (argc < REQUIRED_ARGS) { usage(argv[0]); exit(-1); } - - // This file is rewired for running tests from a - // shell script. The first parameter specifies the - // name of the Ceph file that the test will be - // run on; the second parameter specifies which of - // four different tests will be run. - const char* input_filename = argv[1]; - int test_number = atoi(argv[2]); - assert (test_number > 0); - assert (test_number < 4); - - //const char* map_command = argv[2]; - // These two variables aren't really used yet. - const char* map_command = "map_foo"; - const char* output_filename = "out_foo"; - //const char* output_filename = argv[3]; - //const char* reduce_command = argv[4]; // not implemented yet - - // start up a Ceph client - Client* client = startCephClient(); - - // open the input file as read_only - int fh = client->open(input_filename, O_RDONLY); - if (fh < 0) { - cerr << "The input file " << input_filename << " could not be opened." << endl; - exit(-1); - } - - // How big is the file? 
- off_t filesize; - struct stat stbuf; - if (0 > client->lstat(input_filename, &stbuf)) { - cerr << "Error: could not retrieve size of input file " << input_filename << endl; - exit(-1); - } - filesize = stbuf.st_size; - if (filesize < 1) { - cerr << "Error: input file size is " << filesize << endl; - exit(-1); - } - - // retrieve all the object extents and close the file - list extents; - off_t offset = 0; - client->enumerate_layout(fh, extents, filesize, offset); - client->close(fh); - - list::iterator i; - map::iterator j; - int osd; - int taskID = 0; - - // Pull out all the extents, and make a vector of - // (ip_address, start, length). - - vector original_splits; - - for (i = extents.begin(); i != extents.end(); i++) { - - request_split split; - // find the primary and get its IP address - osd = client->osdmap->get_pg_primary(i->layout.pgid); - entity_inst_t inst = client->osdmap->get_inst(osd); - entity_addr_t entity_addr = inst.addr; - entity_addr.make_addr(split.ip_address); - - // iterate through each buffer_extent in the ObjectExtent - for (j = i->buffer_extents.begin(); - j != i->buffer_extents.end(); j++) { - - // get the range of the buffer_extent - split.start = (*j).first; - split.length = (*j).second; - // throw the split onto the vector - original_splits.push_back(split); - } - } - - // close the client - we're done with it - kill_client(client); - - // sanity check: display the splits - cerr << "Listing original splits:" << endl; - for (vector::iterator i = original_splits.begin(); - i != original_splits.end(); i++) { - cerr << "Split: IP " << i->ip_address << ", start " - << i->start << ", length " << i->length << endl; - } - - vector test_splits; - // Now, modify the splits as needed for the test type. - // There are three types of tests. - // Test 1: regular test. - // Test 2: put all the tasks on the "wrong" OSD. - // Test 3: do the entire job off one node. - - if (1 == test_number) { - cerr << "Test type 1: using original splits." 
<< endl; - test_splits = original_splits; - } - else if (2 == test_number) { - cerr << "Test type 2: rotating split IP addresses. " << endl; - int split_count = original_splits.size(); - for (int i = 0; i < split_count; ++i) { - request_split s; - s.start = original_splits.at(i).start; - s.length = original_splits.at(i).length; - s.ip_address = original_splits.at((i+1)%split_count).ip_address; - test_splits.push_back(s); - } - } - else if (3 == test_number) { - cerr << "Test type 3: one giant split." << endl; - request_split s; - s.start = 0; - s.length = filesize; - s.ip_address = original_splits.at(0).ip_address; - test_splits.push_back(s); - } - else { - cerr << "Error: received invalid test type " << test_number << endl; - exit(-1); - } - - cerr << "Listing test splits:" << endl; - for (vector::iterator i = test_splits.begin(); - i != test_splits.end(); i++) { - cerr << "Split: IP " << i->ip_address << ", start " - << i->start << ", length " << i->length << endl; - } - - // start the timer - utime_t start_time = g_clock.now(); - int pending_tasks = 0; - - // start up the tasks - for (vector::iterator i = test_splits.begin(); - i != test_splits.end(); i++) { - start_map_task(i->ip_address, taskID++, map_command, input_filename, - i->start, i->length, output_filename); - ++pending_tasks; - } - - cerr << "Waiting for " << pending_tasks << " tasks to return..." << endl; - - // wait for all the tasks to finish - while (pending_tasks > 0) { - int exit_status; - cerr << "Waiting for " << pending_tasks << " tasks to return..." << endl; - pid_t pid = wait(&exit_status); - if (pid < 0) { - cerr << "ERROR on wait(): result was " << pid << endl; - exit(-1); - } - --pending_tasks; - if (WIFEXITED(exit_status)) { - cerr << "Task with pid " << pid << " returned with exit status " << - WEXITSTATUS(exit_status) << endl; - } - else { cerr << "WARNING: Task with pid " << pid << " exited abnormally" << endl; } - } - - cerr << "All tasks have returned." 
<< endl; - // report the time - double elapsed_time; - elapsed_time = (g_clock.now() - start_time); - cerr << "Elapsed time: " << elapsed_time << endl; - cerr << elapsed_time << " " << endl; - // send the time to stdout for the shell script - cout << elapsed_time << " "; - exit(0); -} - - -// sends a complete ping message -// through the file descriptor -// and waits for a reply. This -// will hang if there's no reply. - -void ping_test(int fd) { - - // send the message header and footer. - // A ping message has no body. - send_msg_header(fd, PING); - send_msg_footer(fd); - - // receive the reply. - int msg_type = readmsgtype(fd); - if (msg_type < 0) { - cerr << "ping_test: Failed reading the ping reply. Exiting." << endl; - exit(-1); - } - if (PINGREPLY != msg_type) { - assert((msg_type <= 0) && (msg_type < CMDCOUNT) && - "readmsgtype return value out of range"); - cerr << "ping_test: slave sent invalid reply: replied to ping with message type" << - msg_type << ": " << CMD_LIST[msg_type] << ". Exiting. " << endl; - exit(-1); - } - else { - cerr << "Received valid ping reply!" << endl; - } - - if(!check_footer(fd)) { - cerr << "ping_test: message footer not found. Exiting." << endl; - exit(-1); - } -} - - - - -// send a test message for starting a task -void start_task_test(int fd) { - - // The test: - // TaskID 42 - // command: "Burninate" - // input file: "countryside" - // offset: 8764 (TROG) - // length: 367 (DOR) - - send_start_task_msg(fd, 42, strlen("Burninate"), "Burninate", - strlen("countryside"), "countryside", - 8764, 367, - strlen("toast"), "toast"); -} - - -// sends a message to the fd telling it to start a task. -// Remember: the message format requires any string to be -// prefixed by its (unterminated) length. 
-void send_start_task_msg(int fd, - int taskID, - int command_size, const char* command, - int inputfilenamesize, const char* inputfilename, - off_t offset, - off_t length, - int outputfilenamesize, const char* outputfilename) { - - // write the header and the message to the file descriptor. - - send_msg_header(fd, STARTTASK); - - write_positive_int(fd, taskID); - write_positive_int(fd, command_size); - write_string(fd, command); - write_positive_int(fd, inputfilenamesize); - write_string(fd, inputfilename); - //write_long(fd, offset); - write_off_t (fd, offset); - //write_long(fd, length); - write_off_t (fd, length); - write_positive_int(fd, outputfilenamesize); - write_string(fd, outputfilename); - - // terminate the message - send_msg_footer(fd); -} - - - - -// creates a new connection to the slave -// at the given IP address and port. -// Overloaded to take an IP address as a -// string or as an in_addr_t. - -int create_new_connection(const char* ip_address, uint16_t port) -{ - in_addr_t ip = inet_addr(ip_address); - if ((in_addr_t)-1 == ip) { - cerr << "Error creating new connection: \"" << ip_address << - "\" is not a valid IP address." << endl; - return -1; - } - else - //cerr << "Opening connection to " << ip_address << ":" << endl; - return create_new_connection(ip, port); -} - - -int create_new_connection(in_addr_t ip_address, uint16_t port) { - - struct sockaddr_in serv_addr; - int sockfd; - - bzero((char *) &serv_addr, sizeof(serv_addr)); - serv_addr.sin_family = AF_INET; - //serv_addr.sin_addr.s_addr = inet_addr(SERV_HOST_ADDR); - serv_addr.sin_addr.s_addr = ip_address; - serv_addr.sin_port = htons(SERV_TCP_PORT); - - // open a TCP socket - if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { - cerr << "msgtestclient: can't open stream socket. Exiting." << endl; - exit (-1); - } - - // connect to the server. - if (connect(sockfd, (struct sockaddr *) &serv_addr, - sizeof(serv_addr)) < 0) { - cerr << "msgtestclient: can't connect to server." 
<< endl; - exit (-1); - } - //cerr << "opened connection!" << endl; - return sockfd; -} - -void msg_type_sender(int sockfd) { - - for (int i = 0; i < CMDCOUNT; ++i) { - send_msg_header(sockfd, i); - } - -} - -// Fires up the map task. -// For the moment, all it does is echo the command line, not run it. -int start_map_task(sockaddr_in ip_address, int taskID, - const char* command, const char* input_filename, - off_t start, off_t length, - const char* output_filename) -{ - int childpid; - // fork off a child process to do the work, and return - if ((childpid = fork()) < 0) { - cerr << "start_map_task: fork error. Exiting." << endl; - exit(-1); - } - - else if (childpid != 0) { // parent - cerr << "start_map_task: forked child process " - << childpid << " to start task. " << endl; - return 0; - } - - - string ip_addr_string(inet_ntoa(ip_address.sin_addr)); - // cerr << "command: " << ip_addr_string << " taskID " - // << taskID << ": " << command - // << " " << input_filename << " " << start << " " << length - // << " " << output_filename << endl; - - // open a socket to the slave, and send the message - //cerr << "Sending message: " << endl; - int sockfd = create_new_connection(ip_addr_string.c_str(), SERV_TCP_PORT); - send_start_task_msg(sockfd, taskID, strlen(command), command, - strlen(input_filename), input_filename, - start, length, - strlen(output_filename), output_filename); - - // wait for a reply - cerr << "Sent message for taskID " << taskID << ". Waiting for reply..." << endl; - - // receive the reply. - int msg_type = readmsgtype(sockfd); - if (msg_type < 0) { - cerr << "start_map_task: Failed reading the reply. Exiting." << endl; - exit(-1); - } - if (FINISHEDTASK != msg_type) { - assert((msg_type <= 0) && (msg_type < CMDCOUNT)); - cerr << "start_map_task: slave sent invalid reply: replied with message type" << - msg_type << ": " << CMD_LIST[msg_type] << ". Exiting. 
" << endl; - exit(-1); - } - // read the taskID of the reply - - int reply_taskID = read_positive_int(sockfd); - - if(!check_footer(sockfd)) { - cerr << "ping_test: message footer not found. Exiting." << endl; - exit(-1); - } - - // done! - close(sockfd); - cerr << "Task " << taskID << "/" << reply_taskID << - " complete! Ending child process." << endl; - exit(0); - //_exit(0); - cerr << "exit(0) returned. Strange things are afoot." << endl; -} - - - - -void usage(const char* name) { - //cout << "usage: " << name << " inputfile map_task outputfile" << endl; - //cout << "inputfile must be a valid path in the running Ceph filesystem." << endl; - //cout << "map_task should be given with an absolute path, and be present on "; - //cout << "the REGULAR filesystem every node." << endl; - //cout << "output_file will be written locally to the node." << endl; - - cout << "usage: " << name << " inputfile test_number" << endl; - cout << "inputfile must be a valid path in the running Ceph filesystem." << endl; - cout << "test_number must be 1, 2, or 3." 
<< endl; - cout << " 1: run the test task normally (one slave per OSD)" << endl; - cout << " 2: run the test task on the \"wrong\" OSDs" << endl; - cout << " 3: run the entire task in a single process" << endl; -} - - - diff --git a/branches/sage/ebofs2/active/msgtestclient.h b/branches/sage/ebofs2/active/msgtestclient.h deleted file mode 100644 index 568c9057be250..0000000000000 --- a/branches/sage/ebofs2/active/msgtestclient.h +++ /dev/null @@ -1,44 +0,0 @@ -#include "inet.h" -#include "common.h" -#include "utility.h" -#include "client/Client.h" - -// wait.h MUST NOT be #included before client/Client.h -#include -#include - - struct request_split { - tcpaddr_t ip_address; - off_t start; - off_t length; - }; - - -//#define SERV_HOST_ADDR "128.114.57.143" //issdm-8 -#define SERV_HOST_ADDR "128.114.57.166" //issdm-31 - -#define SERV_TCP_PORT 6733 -#define MAXLINE 512 - -void msg_type_sender(int sockfd); - - -int create_new_connection(const char* ip_address, uint16_t port); -int create_new_connection(in_addr_t ip_address, uint16_t port); -void usage(const char* name); -void ping_test(int fd); -void start_task_test(int fd); - -int start_map_task(sockaddr_in ip_address, int taskID, - const char* map_command, - const char* input_filename, - off_t start, off_t length, - const char* output_filename); - -void send_start_task_msg(int fd, - int taskID, - int command_size, const char* command, - int inputfilenamesize, const char* inputfilename, - off_t offset, - off_t length, - int outputfilenamesize, const char* outputfilename); diff --git a/branches/sage/ebofs2/active/trivial_task.cc b/branches/sage/ebofs2/active/trivial_task.cc deleted file mode 100644 index 7a72ecb277c4b..0000000000000 --- a/branches/sage/ebofs2/active/trivial_task.cc +++ /dev/null @@ -1,50 +0,0 @@ -#include "trivial_task.h" - -void start_trivial_task (const char* ceph_filename, const char* local_filename, - off_t offset, off_t length) { - // Don't bother to copy the file to disk. 
Read the file directly from Ceph, - // and add up all the bytes. - // Write the total to the local file as a string. - Client * client = startCephClient(); - - bufferptr bp(CHUNK); - - // get the source file's size. Sanity-check the request range. - struct stat st; - int r = client->lstat(ceph_filename, &st); - assert (r == 0); - - off_t src_total_size = st.st_size; - if (src_total_size < offset + length) { - cerr << "Error in copy ExtentToLocalFile: offset + length = " << offset << " + " << length - << " = " + (offset + length) << ", source file size is only " << src_total_size << endl; - exit(-1); - } - off_t remaining = length; - - // open the file and seek to the start position - cerr << "start_trivial_task: opening the source file and seeking " << endl; - - int fh_ceph = client->open(ceph_filename, O_RDONLY); - assert (fh_ceph > -1); - r = client->lseek(fh_ceph, offset, SEEK_SET); - assert (r == offset); - - int counter = 0; - // read through the extent and add up the bytes - cerr << "start_trivial_task: counting up bytes" << endl; - char* bp_c = bp.c_str(); - while (remaining > 0) { - off_t got = client->read(fh_ceph, bp_c, MIN(remaining,CHUNK), -1); - assert(got > 0); - remaining -= got; - for (off_t i = 0; i < got; ++i) { - counter += (unsigned int)(bp_c[i]); - } - } - cerr << "start_trivial_task: Done! 
Answer is " << counter << endl; - client->close(fh_ceph); - - //assert(0); -} - diff --git a/branches/sage/ebofs2/active/trivial_task.h b/branches/sage/ebofs2/active/trivial_task.h deleted file mode 100644 index ce9b47c82ceb6..0000000000000 --- a/branches/sage/ebofs2/active/trivial_task.h +++ /dev/null @@ -1,12 +0,0 @@ -// Shared library for the trivial task of adding up all the bytes in a file - -//#include "inet.h" -#include "common.h" -#include "utility.h" -#include "client/Client.h" - - -extern "C" void start_trivial_task (const char* ceph_filename, - const char* local_filename, - off_t offset, off_t length); - diff --git a/branches/sage/ebofs2/active/utility.h b/branches/sage/ebofs2/active/utility.h deleted file mode 100644 index 789398c0f4527..0000000000000 --- a/branches/sage/ebofs2/active/utility.h +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Miscellaneous Active OSD helper functions. - * - */ - -//#include -#include "client/Client.h" -#include "common.h" -#include "config.h" -#include "common/Timer.h" -#include "msg/SimpleMessenger.h" -#include "socket_utility.h" - -Client* startCephClient(); -void kill_client(Client* client); - - -int send_msg_header(int fd, int header_ID); -int readmsgtype(int fd); -bool check_footer(int fd); -int send_msg_header(int fd, int header_ID); -int send_msg_footer(int fd); - -/* - * Fires up a Ceph client and returns a pointer to it. 
- */ - -Client* startCephClient() -{ - cout << "ActiveMaster: Initializing Ceph client:" << endl; - - // parse args from CEPH_ARGS, not command line - vector args; - env_to_vec(args); - parse_config_options(args); - - if (g_conf.clock_tare) g_clock.tare(); - - // be safe - g_conf.use_abspaths = true; - - // load monmap - MonMap* monmap = new MonMap(); - int r = monmap->read(".ceph_monmap"); - if (r < 0) { - cout << "ActiveMaster: could not find .ceph_monmap" << endl; - return 0; - } - assert(r >= 0); - - // start up network - rank.start_rank(); - - // start client - Client *client; - client = new Client(rank.register_entity(MSG_ADDR_CLIENT_NEW), monmap); - client->init(); - - // mount - client->mount(); - - return client; -} - -void kill_client (Client * client) -{ - client->unmount(); - client->shutdown(); - delete client; - - // wait for messenger to finish - rank.wait(); -} - - - -// read a message type from the socket, and print it. - -int readmsgtype(int fd) { - int rc; - char typebuf[CMDLENGTH + 1]; - - rc = read(fd, &typebuf, CMDLENGTH); - - // read a fixed-length text command - if (rc != CMDLENGTH) { - cerr << "in readmsgtype: read error: result is " << rc << endl; - return -1; - } - - // null-terminate the string - typebuf[CMDLENGTH] = 0; - - // print the command - //cerr << "readmsgtype: text type is " << typebuf << ", " ; - - // figure out which one it is, by number - for (int i = 0; i < CMDCOUNT; ++i) { - if (!strcmp(typebuf, CMD_LIST[i])) { - //cerr << "which is identified as type " << i << endl; - return i; - } - } - - // if we get here the type was invalid - cerr << "readmsgtype: unrecognized message type " << typebuf << endl; - return -1; -} - -// Attempt to read the message footer off -// the given stream. 
-bool check_footer(int fd) { - - // leave space for null termination - char footer_buf[FOOTER_LENGTH+1]; - - // read the footer - int rc = read(fd, &footer_buf, FOOTER_LENGTH); - if (rc != FOOTER_LENGTH) { - cerr << "in check_footer: read error: result is " << rc << endl; - return false; - } - - // null-terminate the string - footer_buf[FOOTER_LENGTH] = 0; - - // Is the footer correct? - if (0 == strcmp(footer_buf, FOOTER)) - return true; - else - return false; -} - - -// send a fixed-length message header -// given the header's ID. -int send_msg_header(int fd, int header_ID) { - if ((header_ID < 0) || (header_ID >= CMDCOUNT)) { - cerr << "In send_msg_header: received out-of-range header ID " << header_ID << - ". Exiting process." << endl; - exit(-1); - } - - //cerr << "attempting to send message " << CMD_LIST[header_ID] << - // " with ID " << header_ID << endl; - - if (CMDLENGTH != writen(fd, CMD_LIST[header_ID], CMDLENGTH)) { - cerr << "In send_msg_header: error writing header ID " << header_ID << - "to file descriptor " << fd << ". Exiting process." << endl; - exit(-1); - } - - return 0; -} - -// send the fixed-length message footer. -int send_msg_footer(int fd) { - //cerr << "attempting to send message footer: " << endl; - if (FOOTER_LENGTH != writen(fd, FOOTER, FOOTER_LENGTH)) { - cerr << "in send_msg_footer: error writing footer to file descriptor " << - fd << ". Exiting process." << endl; - exit(-1); - } else { - //cerr << "Sent message footer!" << endl; - } - return 0; -} - - - -// Copy a given extent of a Ceph file to the local disk. -// Requires a running Ceph client. -void copyExtentToLocalFile (Client* client, const char* ceph_source, - long offset, long length, - const char* local_destination) { - - // get the source file's size. Sanity-check the request range. 
- struct stat st; - int r = client->lstat(ceph_source, &st); - assert (r == 0); - - off_t src_total_size = st.st_size; - if (src_total_size < offset + length) { - cerr << "Error in copy ExtentToLocalFile: offset + size = " << offset << " + " << length - << " = " + (offset + length) << ", source file size is only " << src_total_size << endl; - exit(-1); - } - off_t remaining = length; - - // open the source and destination files. Advance the source - // file to the desired offset. - int fh_ceph = client->open(ceph_source, O_RDONLY); - assert (fh_ceph > -1); - r = client->lseek(fh_ceph, offset, SEEK_SET); - assert (r == offset); - - int fh_local = ::open(local_destination, O_WRONLY|O_CREAT|O_TRUNC, 0644); - assert (fh_local > -1); - - // copy the file 4 MB at a time - const int chunk = 4*1024*1024; - bufferptr bp(chunk); - - while (remaining > 0) { - off_t got = client->read(fh_ceph, bp.c_str(), MIN(remaining,chunk), -1); - assert(got > 0); - remaining -= got; - off_t wrote = ::write(fh_local, bp.c_str(), got); - assert (got == wrote); - } - // close the files - client->close(fh_ceph); - ::close(fh_local); -} diff --git a/branches/sage/ebofs2/cfuse.cc b/branches/sage/ebofs2/cfuse.cc deleted file mode 100644 index 3c157fefadf89..0000000000000 --- a/branches/sage/ebofs2/cfuse.cc +++ /dev/null @@ -1,88 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "client/Client.h" -#include "client/fuse.h" -#include "client/fuse_ll.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - -#ifndef DARWIN -#include -#endif // DARWIN - -#include -#include -#include - -int main(int argc, char **argv, char *envp[]) { - - //cerr << "cfuse starting " << myrank << "/" << world << std::endl; - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - // args for fuse - vec_to_argv(args, argc, argv); - - // FUSE will chdir("/"); be ready. - g_conf.use_abspaths = true; - - if (g_conf.clock_tare) g_clock.tare(); - - // load monmap - MonMap monmap; - int r = monmap.read(".ceph_monmap"); - assert(r >= 0); - - // start up network - rank.start_rank(); - - // start client - Client *client = new Client(rank.register_entity(entity_name_t::CLIENT()), &monmap); - client->init(); - - // start up fuse - // use my argc, argv (make sure you pass a mount point!) 
- cout << "mounting" << std::endl; - client->mount(); - - //cerr << "starting fuse on pid " << getpid() << std::endl; - if (g_conf.fuse_ll) - ceph_fuse_ll_main(client, argc, argv); - else - ceph_fuse_main(client, argc, argv); - //cerr << "fuse finished on pid " << getpid() << std::endl; - - client->unmount(); - cout << "unmounted" << std::endl; - client->shutdown(); - - delete client; - - // wait for messenger to finish - rank.wait(); - - return 0; -} - diff --git a/branches/sage/ebofs2/client/Client.cc b/branches/sage/ebofs2/client/Client.cc deleted file mode 100644 index 5ffbf890f42e9..0000000000000 --- a/branches/sage/ebofs2/client/Client.cc +++ /dev/null @@ -1,3909 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -// unix-ey fs stuff -#include -#include -#include -#include -#include -#include - -#include - - -#include -using namespace std; - - -// ceph stuff -#include "Client.h" - -#include "messages/MStatfs.h" -#include "messages/MStatfsReply.h" - -#include "messages/MClientMount.h" -#include "messages/MClientUnmount.h" -#include "messages/MClientSession.h" -#include "messages/MClientReconnect.h" -#include "messages/MClientRequest.h" -#include "messages/MClientRequestForward.h" -#include "messages/MClientReply.h" -#include "messages/MClientFileCaps.h" - -#include "messages/MGenericMessage.h" - -#include "messages/MMDSGetMap.h" -#include "messages/MMDSMap.h" - -#include "osdc/Filer.h" -#include "osdc/Objecter.h" -#include "osdc/ObjectCacher.h" - -#include "common/Cond.h" -#include "common/Mutex.h" -#include "common/Logger.h" - - - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_client) *_dout << dbeginl << g_clock.now() << " client" << whoami /*<< "." << pthread_self() */ << " " - -#define tout if (g_conf.client_trace) traceout - - -// static logger -Mutex client_logger_lock; -LogType client_logtype; -Logger *client_logger = 0; - - - - -class C_Client_CloseRelease : public Context { - Client *cl; - Inode *in; -public: - C_Client_CloseRelease(Client *c, Inode *i) : cl(c), in(i) {} - void finish(int) { - cl->close_release(in); - } -}; - -class C_Client_CloseSafe : public Context { - Client *cl; - Inode *in; -public: - C_Client_CloseSafe(Client *c, Inode *i) : cl(c), in(i) {} - void finish(int) { - cl->close_safe(in); - } -}; - - - - - - -// cons/des - -Client::Client(Messenger *m, MonMap *mm, int in) : timer(client_lock) -{ - // which client am i? 
- whoami = m->get_myname().num(); - my_instance = in; - monmap = mm; - - mounted = false; - mount_timeout_event = 0; - unmounting = false; - - last_tid = 0; - unsafe_sync_write = 0; - - mdsmap = 0; - - // - root = 0; - - lru.lru_set_max(g_conf.client_cache_size); - - // file handles - free_fd_set.insert(10, 1<<30); - - // set up messengers - messenger = m; - - // osd interfaces - osdmap = new OSDMap(); // initially blank.. see mount() - objecter = new Objecter(messenger, monmap, osdmap, client_lock); - objecter->set_client_incarnation(0); // client always 0, for now. - objectcacher = new ObjectCacher(objecter, client_lock); - filer = new Filer(objecter); -} - - -Client::~Client() -{ - tear_down_cache(); - - if (objectcacher) { - delete objectcacher; - objectcacher = 0; - } - - if (filer) { delete filer; filer = 0; } - if (objecter) { delete objecter; objecter = 0; } - if (osdmap) { delete osdmap; osdmap = 0; } - if (mdsmap) { delete mdsmap; mdsmap = 0; } - - if (messenger) { delete messenger; messenger = 0; } -} - - -void Client::tear_down_cache() -{ - // fd's - for (hash_map::iterator it = fd_map.begin(); - it != fd_map.end(); - it++) { - Fh *fh = it->second; - dout(1) << "tear_down_cache forcing close of fh " << it->first << " ino " << fh->inode->inode.ino << dendl; - put_inode(fh->inode); - delete fh; - } - fd_map.clear(); - - // caps! 
- // *** FIXME *** - - // empty lru - lru.lru_set_max(0); - trim_cache(); - assert(lru.lru_get_size() == 0); - - // close root ino - assert(inode_map.size() <= 1); - if (root && inode_map.size() == 1) { - delete root; - root = 0; - inode_map.clear(); - } - - assert(inode_map.empty()); -} - - - -// debug crapola - -void Client::dump_inode(Inode *in, set& did) -{ - dout(1) << "dump_inode: inode " << in->ino() << " ref " << in->ref << " dir " << in->dir << dendl; - - if (in->dir) { - dout(1) << " dir size " << in->dir->dentries.size() << dendl; - //for (hash_map, eqstr>::iterator it = in->dir->dentries.begin(); - for (hash_map::iterator it = in->dir->dentries.begin(); - it != in->dir->dentries.end(); - it++) { - dout(1) << " dn " << it->first << " ref " << it->second->ref << dendl; - dump_inode(it->second->inode, did); - } - } -} - -void Client::dump_cache() -{ - set did; - - if (root) dump_inode(root, did); - - for (hash_map::iterator it = inode_map.begin(); - it != inode_map.end(); - it++) { - if (did.count(it->second)) continue; - - dout(1) << "dump_cache: inode " << it->first - << " ref " << it->second->ref - << " dir " << it->second->dir << dendl; - if (it->second->dir) { - dout(1) << " dir size " << it->second->dir->dentries.size() << dendl; - } - } - -} - - -void Client::init() -{ - - // logger? 
- client_logger_lock.Lock(); - if (client_logger == 0) { - client_logtype.add_inc("lsum"); - client_logtype.add_inc("lnum"); - client_logtype.add_inc("lwsum"); - client_logtype.add_inc("lwnum"); - client_logtype.add_inc("lrsum"); - client_logtype.add_inc("lrnum"); - client_logtype.add_inc("trsum"); - client_logtype.add_inc("trnum"); - client_logtype.add_inc("wrlsum"); - client_logtype.add_inc("wrlnum"); - client_logtype.add_inc("lstatsum"); - client_logtype.add_inc("lstatnum"); - client_logtype.add_inc("ldirsum"); - client_logtype.add_inc("ldirnum"); - client_logtype.add_inc("readdir"); - client_logtype.add_inc("stat"); - client_logtype.add_avg("owrlat"); - client_logtype.add_avg("ordlat"); - client_logtype.add_inc("owr"); - client_logtype.add_inc("ord"); - - char s[80]; - char hostname[80]; - gethostname(hostname, 79); - sprintf(s,"clients.%s.%d", hostname, getpid()); - client_logger = new Logger(s, &client_logtype); - } - client_logger_lock.Unlock(); - -} - -void Client::shutdown() -{ - dout(1) << "shutdown" << dendl; - messenger->shutdown(); -} - - - - -// =================== -// metadata cache stuff - -void Client::trim_cache() -{ - unsigned last = 0; - while (lru.lru_get_size() != last) { - last = lru.lru_get_size(); - - if (lru.lru_get_size() <= lru.lru_get_max()) break; - - // trim! - Dentry *dn = (Dentry*)lru.lru_expire(); - if (!dn) break; // done - - dout(15) << "trim_cache unlinking dn " << dn->name - << " in dir " << hex << dn->dir->parent_inode->inode.ino - << dendl; - unlink(dn); - } - - // hose root? - if (lru.lru_get_size() == 0 && root && root->ref == 0 && inode_map.size() == 1) { - dout(15) << "trim_cache trimmed root " << root << dendl; - delete root; - root = 0; - inode_map.clear(); - } -} - -/** insert_inode - * - * insert + link a single dentry + inode into the metadata cache. 
- */ -Inode* Client::insert_inode(Dir *dir, InodeStat *st, const string& dname) -{ - Dentry *dn = NULL; - if (dir->dentries.count(dname)) - dn = dir->dentries[dname]; - - dout(12) << "insert_inode " << dname << " ino " << st->inode.ino - << " size " << st->inode.size - << " mtime " << st->inode.mtime - << " mask " << st->mask - << " in dir " << dir->parent_inode->inode.ino - << dendl; - - if (dn) { - if (dn->inode->inode.ino == st->inode.ino) { - touch_dn(dn); - dout(12) << " had dentry " << dname - << " with correct ino " << dn->inode->inode.ino - << dendl; - } else { - dout(12) << " had dentry " << dname - << " with WRONG ino " << dn->inode->inode.ino - << dendl; - unlink(dn); - dn = NULL; - } - } - - if (!dn) { - // have inode linked elsewhere? -> unlink and relink! - if (inode_map.count(st->inode.ino)) { - Inode *in = inode_map[st->inode.ino]; - assert(in); - - if (in->dn) { - dout(12) << " had ino " << in->inode.ino - << " not linked or linked at the right position, relinking" - << dendl; - dn = relink(dir, dname, in); - } else { - // link - dout(12) << " had ino " << in->inode.ino - << " unlinked, linking" << dendl; - dn = link(dir, dname, in); - } - } - } - - if (!dn) { - Inode *in = new Inode(st->inode, objectcacher); - inode_map[st->inode.ino] = in; - dn = link(dir, dname, in); - dout(12) << " new dentry+node with ino " << st->inode.ino << dendl; - } else { - // actually update info - dout(12) << " stat inode mask is " << st->mask << dendl; - if (st->mask & STAT_MASK_BASE) { - dn->inode->inode = st->inode; - dn->inode->dirfragtree = st->dirfragtree; // FIXME look at the mask! - } - - // ...but don't clobber our mtime, size! - /* isn't this handled below? 
- if ((dn->inode->mask & STAT_MASK_SIZE) == 0 && - dn->inode->file_wr_size > dn->inode->inode.size) - dn->inode->inode.size = dn->inode->file_wr_size; - if ((dn->inode->mask & STAT_MASK_MTIME) == 0 && - dn->inode->file_wr_mtime > dn->inode->inode.mtime) - dn->inode->inode.mtime = dn->inode->file_wr_mtime; - */ - } - - // OK, we found it! - assert(dn && dn->inode); - - // save the mask - dn->inode->mask = st->mask; - - // or do we have newer size/mtime from writing? - if (dn->inode->file_caps() & CAP_FILE_WR) { - if (dn->inode->file_wr_size > dn->inode->inode.size) - dn->inode->inode.size = dn->inode->file_wr_size; - if (dn->inode->file_wr_mtime > dn->inode->inode.mtime) - dn->inode->inode.mtime = dn->inode->file_wr_mtime; - } - - // symlink? - if (dn->inode->inode.is_symlink()) { - if (!dn->inode->symlink) - dn->inode->symlink = new string; - *(dn->inode->symlink) = st->symlink; - } - - return dn->inode; -} - -/** update_inode_dist - * - * update MDS location cache for a single inode - */ -void Client::update_dir_dist(Inode *in, DirStat *dst) -{ - // auth - in->dir_auth = -1; - if (dst->frag == frag_t()) { - in->dir_auth = dst->auth; - } else { - dout(20) << "got dirfrag map for " << in->inode.ino << " frag " << dst->frag << " to mds " << dst->auth << dendl; - in->fragmap[dst->frag] = dst->auth; - } - - // replicated - in->dir_replicated = dst->is_rep; // FIXME that's just one frag! - - // dist - /* - if (!st->dirfrag_dist.empty()) { // FIXME - set dist = st->dirfrag_dist.begin()->second; - if (dist.empty() && !in->dir_contacts.empty()) - dout(9) << "lost dist spec for " << in->inode.ino - << " " << dist << dendl; - if (!dist.empty() && in->dir_contacts.empty()) - dout(9) << "got dist spec for " << in->inode.ino - << " " << dist << dendl; - in->dir_contacts = dist; - } - */ -} - - -/** insert_trace - * - * insert a trace from a MDS reply into the cache. 
- */ -Inode* Client::insert_trace(MClientReply *reply) -{ - Inode *cur = root; - utime_t now = g_clock.real_now(); - - dout(10) << "insert_trace got " << reply->get_trace_in().size() << " inodes" << dendl; - - list::const_iterator pdn = reply->get_trace_dn().begin(); - list::const_iterator pdir = reply->get_trace_dir().begin(); - - for (list::const_iterator pin = reply->get_trace_in().begin(); - pin != reply->get_trace_in().end(); - ++pin) { - - if (pin == reply->get_trace_in().begin()) { - // root - dout(10) << "insert_trace root" << dendl; - if (!root) { - // create - cur = root = new Inode((*pin)->inode, objectcacher); - dout(10) << "insert_trace new root is " << root << dendl; - inode_map[root->inode.ino] = root; - root->dir_auth = 0; - } - } else { - // not root. - Dir *dir = cur->open_dir(); - assert(pdn != reply->get_trace_dn().end()); - cur = this->insert_inode(dir, *pin, *pdn); - dout(10) << "insert_trace dn " << *pdn << " ino " << (*pin)->inode.ino << " -> " << cur << dendl; - ++pdn; - - // move to top of lru! 
- if (cur->dn) - lru.lru_touch(cur->dn); - } - - // set cache ttl - if (g_conf.client_cache_stat_ttl) { - cur->valid_until = now; - cur->valid_until += g_conf.client_cache_stat_ttl; - } - - // update dir dist info - if (pdir == reply->get_trace_dir().end()) break; - update_dir_dist(cur, *pdir); - ++pdir; - } - - return cur; -} - - - - -Dentry *Client::lookup(filepath& path) -{ - dout(14) << "lookup " << path << dendl; - - Inode *cur = root; - if (!cur) return NULL; - - Dentry *dn = 0; - for (unsigned i=0; iinode << " valid_until " << dn->inode->valid_until << dendl; - } else { - dout(14) << " dentry " << path[i] << " dne" << dendl; - return NULL; - } - cur = dn->inode; - assert(cur); - } else { - return NULL; // not a dir - } - } - - if (dn) { - dout(11) << "lookup '" << path << "' found " << dn->name << " inode " << dn->inode->inode.ino << " valid_until " << dn->inode->valid_until<< dendl; - } - - return dn; -} - -// ------- - -int Client::choose_target_mds(MClientRequest *req) -{ - int mds = 0; - - // find deepest known prefix - Inode *diri = root; // the deepest known containing dir - Inode *item = 0; // the actual item... if we know it - int missing_dn = -1; // which dn we miss on (if we miss) - - unsigned depth = req->get_filepath().depth(); - unsigned i; - for (i=0; iinode.mode & INODE_MODE_DIR && diri->dir) { - Dir *dir = diri->dir; - - // do we have the next dentry? - if (dir->dentries.count( req->get_filepath()[i] ) == 0) { - missing_dn = i; // no. - break; - } - - dout(7) << " have path seg " << i << " on " << diri->dir_auth << " ino " << diri->inode.ino << " " << req->get_filepath()[i] << dendl; - - if (i == depth-1) { // last one! - item = dir->dentries[ req->get_filepath()[i] ]->inode; - break; - } - - // continue.. 
- diri = dir->dentries[ req->get_filepath()[i] ]->inode; - assert(diri); - } else { - missing_dn = i; - break; - } - } - - // pick mds - if (!diri || g_conf.client_use_random_mds) { - // no root info, pick a random MDS - mds = mdsmap->get_random_in_mds(); - if (mds < 0) mds = 0; - - if (0) { - mds = 0; - dout(0) << "hack: sending all requests to mds" << mds << dendl; - } - } else { - if (req->auth_is_best()) { - // pick the actual auth (as best we can) - if (item) { - mds = item->authority(); - } else { - mds = diri->authority(req->get_filepath()[missing_dn]); - } - } else { - // balance our traffic! - mds = diri->pick_replica(mdsmap); // for the _inode_ - dout(20) << "for " << req->get_filepath() << " diri " << diri->inode.ino << " rep " - << diri->dir_contacts - << " mds" << mds << dendl; - } - } - dout(20) << "mds is " << mds << dendl; - - return mds; -} - - - -MClientReply *Client::make_request(MClientRequest *req, - int use_mds) // this param is purely for debug hacking -{ - // time the call - utime_t start = g_clock.real_now(); - - bool nojournal = false; - int op = req->get_op(); - if (op == MDS_OP_STAT || - op == MDS_OP_LSTAT || - op == MDS_OP_READDIR || - op == MDS_OP_OPEN) - nojournal = true; - - - // -- request -- - // assign a unique tid - tid_t tid = ++last_tid; - req->set_tid(tid); - - if (!mds_requests.empty()) - req->set_oldest_client_tid(mds_requests.begin()->first); - else - req->set_oldest_client_tid(tid); // this one is the oldest. - - // make note - MetaRequest request(req, tid); - mds_requests[tid] = &request; - - // encode payload now, in case we have to resend (in case of mds failure) - req->encode_payload(); - request.request_payload = req->get_payload(); - - // note idempotency - request.idempotent = req->is_idempotent(); - - // hack target mds? - if (use_mds) - request.resend_mds = use_mds; - - // set up wait cond - Cond cond; - request.caller_cond = &cond; - - while (1) { - // choose mds - int mds; - // force use of a particular mds? 
- if (request.resend_mds >= 0) { - mds = request.resend_mds; - request.resend_mds = -1; - dout(10) << "target resend_mds specified as mds" << mds << dendl; - } else { - mds = choose_target_mds(req); - if (mds >= 0) { - dout(10) << "chose target mds" << mds << " based on hierarchy" << dendl; - } else { - mds = mdsmap->get_random_in_mds(); - if (mds < 0) mds = 0; // hrm. - dout(10) << "chose random target mds" << mds << " for lack of anything better" << dendl; - } - } - - // open a session? - if (mds_sessions.count(mds) == 0) { - Cond cond; - - if (!mdsmap->have_inst(mds)) { - dout(10) << "no address for mds" << mds << ", requesting new mdsmap" << dendl; - int mon = monmap->pick_mon(); - messenger->send_message(new MMDSGetMap(), - monmap->get_inst(mon)); - waiting_for_mdsmap.push_back(&cond); - cond.Wait(client_lock); - - if (!mdsmap->have_inst(mds)) { - dout(10) << "hmm, still have no address for mds" << mds << ", trying a random mds" << dendl; - request.resend_mds = mdsmap->get_random_in_mds(); - continue; - } - } - - if (waiting_for_session.count(mds) == 0) { - dout(10) << "opening session to mds" << mds << dendl; - messenger->send_message(new MClientSession(MClientSession::OP_REQUEST_OPEN), - mdsmap->get_inst(mds)); - } - - // wait - waiting_for_session[mds].push_back(&cond); - while (waiting_for_session.count(mds)) { - dout(10) << "waiting for session to mds" << mds << " to open" << dendl; - cond.Wait(client_lock); - } - } - - // send request. - send_request(&request, mds); - - // wait for signal - dout(20) << "awaiting kick on " << &cond << dendl; - cond.Wait(client_lock); - - // did we get a reply? - if (request.reply) - break; - } - - // got it! - MClientReply *reply = request.reply; - - // kick dispatcher (we've got it!) - assert(request.dispatch_cond); - request.dispatch_cond->Signal(); - dout(20) << "sendrecv kickback on tid " << tid << " " << request.dispatch_cond << dendl; - - // clean up. 
- mds_requests.erase(tid); - - - // -- log times -- - if (client_logger) { - utime_t lat = g_clock.real_now(); - lat -= start; - dout(20) << "lat " << lat << dendl; - client_logger->finc("lsum",(double)lat); - client_logger->inc("lnum"); - - if (nojournal) { - client_logger->finc("lrsum",(double)lat); - client_logger->inc("lrnum"); - } else { - client_logger->finc("lwsum",(double)lat); - client_logger->inc("lwnum"); - } - - if (op == MDS_OP_STAT) { - client_logger->finc("lstatsum",(double)lat); - client_logger->inc("lstatnum"); - } - else if (op == MDS_OP_READDIR) { - client_logger->finc("ldirsum",(double)lat); - client_logger->inc("ldirnum"); - } - - } - - return reply; -} - - -void Client::handle_client_session(MClientSession *m) -{ - dout(10) << "handle_client_session " << *m << dendl; - int from = m->get_source().num(); - - switch (m->op) { - case MClientSession::OP_OPEN: - assert(mds_sessions.count(from) == 0); - mds_sessions[from] = 0; - break; - - case MClientSession::OP_CLOSE: - mds_sessions.erase(from); - // FIXME: kick requests (hard) so that they are redirected. or fail. 
- break; - - default: - assert(0); - } - - // kick waiting threads - for (list::iterator p = waiting_for_session[from].begin(); - p != waiting_for_session[from].end(); - ++p) - (*p)->Signal(); - waiting_for_session.erase(from); - - delete m; -} - - -void Client::send_request(MetaRequest *request, int mds) -{ - MClientRequest *r = request->request; - if (!r) { - // make a new one - dout(10) << "send_request rebuilding request " << request->tid - << " for mds" << mds << dendl; - r = new MClientRequest; - r->copy_payload(request->request_payload); - r->decode_payload(); - r->set_retry_attempt(request->retry_attempt); - } - request->request = 0; - - dout(10) << "send_request " << *r << " to mds" << mds << dendl; - messenger->send_message(r, mdsmap->get_inst(mds)); - - request->mds.insert(mds); -} - -void Client::handle_client_request_forward(MClientRequestForward *fwd) -{ - tid_t tid = fwd->get_tid(); - - if (mds_requests.count(tid) == 0) { - dout(10) << "handle_client_request_forward no pending request on tid " << tid << dendl; - delete fwd; - return; - } - - MetaRequest *request = mds_requests[tid]; - assert(request); - - // reset retry counter - request->retry_attempt = 0; - - if (request->idempotent && - mds_sessions.count(fwd->get_dest_mds())) { - // dest mds has a session, and request was forwarded for us. - - // note new mds set. - if (request->num_fwd < fwd->get_num_fwd()) { - // there are now exactly two mds's whose failure should trigger a resend - // of this request. 
- request->mds.clear(); - request->mds.insert(fwd->get_source().num()); - request->mds.insert(fwd->get_dest_mds()); - request->num_fwd = fwd->get_num_fwd(); - dout(10) << "handle_client_request tid " << tid - << " fwd " << fwd->get_num_fwd() - << " to mds" << fwd->get_dest_mds() - << ", mds set now " << request->mds - << dendl; - } else { - dout(10) << "handle_client_request tid " << tid - << " previously forwarded to mds" << fwd->get_dest_mds() - << ", mds still " << request->mds - << dendl; - } - } else { - // request not forwarded, or dest mds has no session. - // resend. - dout(10) << "handle_client_request tid " << tid - << " fwd " << fwd->get_num_fwd() - << " to mds" << fwd->get_dest_mds() - << ", non-idempotent, resending to " << fwd->get_dest_mds() - << dendl; - - request->mds.clear(); - request->num_fwd = fwd->get_num_fwd(); - request->resend_mds = fwd->get_dest_mds(); - request->caller_cond->Signal(); - } - - delete fwd; -} - -void Client::handle_client_reply(MClientReply *reply) -{ - tid_t tid = reply->get_tid(); - - if (mds_requests.count(tid) == 0) { - dout(10) << "handle_client_reply no pending request on tid " << tid << dendl; - delete reply; - return; - } - MetaRequest *request = mds_requests[tid]; - assert(request); - - // store reply - request->reply = reply; - - // wake up waiter - request->caller_cond->Signal(); - - // wake for kick back - Cond cond; - request->dispatch_cond = &cond; - while (mds_requests.count(tid)) { - dout(20) << "handle_client_reply awaiting kickback on tid " << tid << " " << &cond << dendl; - cond.Wait(client_lock); - } -} - - -// ------------------------ -// incoming messages - -void Client::dispatch(Message *m) -{ - client_lock.Lock(); - - switch (m->get_type()) { - // osd - case MSG_OSD_OPREPLY: - objecter->handle_osd_op_reply((MOSDOpReply*)m); - break; - - case MSG_OSD_MAP: - objecter->handle_osd_map((class MOSDMap*)m); - if (!mounted) mount_cond.Signal(); - break; - - // mounting and mds sessions - case MSG_MDS_MAP: - 
handle_mds_map((MMDSMap*)m); - break; - case MSG_CLIENT_UNMOUNT: - handle_unmount(m); - break; - case MSG_CLIENT_SESSION: - handle_client_session((MClientSession*)m); - break; - - // requests - case MSG_CLIENT_REQUEST_FORWARD: - handle_client_request_forward((MClientRequestForward*)m); - break; - case MSG_CLIENT_REPLY: - handle_client_reply((MClientReply*)m); - break; - - case MSG_CLIENT_FILECAPS: - handle_file_caps((MClientFileCaps*)m); - break; - - case MSG_STATFS_REPLY: - handle_statfs_reply((MStatfsReply*)m); - break; - - default: - dout(10) << "dispatch doesn't recognize message type " << m->get_type() << dendl; - assert(0); // fail loudly - break; - } - - // unmounting? - if (unmounting) { - dout(10) << "unmounting: trim pass, size was " << lru.lru_get_size() - << "+" << inode_map.size() << dendl; - trim_cache(); - if (lru.lru_get_size() == 0 && inode_map.empty()) { - dout(10) << "unmounting: trim pass, cache now empty, waking unmount()" << dendl; - mount_cond.Signal(); - } else { - dout(10) << "unmounting: trim pass, size still " << lru.lru_get_size() - << "+" << inode_map.size() << dendl; - dump_cache(); - } - } - - client_lock.Unlock(); -} - - -void Client::handle_mds_map(MMDSMap* m) -{ - int frommds = -1; - if (m->get_source().is_mds()) - frommds = m->get_source().num(); - - if (mdsmap == 0) { - mdsmap = new MDSMap; - - assert(m->get_source().is_mon()); - whoami = m->get_dest().num(); - messenger->reset_myname(entity_name_t::CLIENT(whoami)); - dout(1) << "handle_mds_map i am now " << m->get_dest() << dendl; - - mount_cond.Signal(); // mount might be waiting for this. - } - - if (m->get_epoch() < mdsmap->get_epoch()) { - dout(1) << "handle_mds_map epoch " << m->get_epoch() << " is older than our " - << mdsmap->get_epoch() << dendl; - delete m; - return; - } - - dout(1) << "handle_mds_map epoch " << m->get_epoch() << dendl; - mdsmap->decode(m->get_encoded()); - - // send reconnect? 
- if (frommds >= 0 && - mdsmap->get_state(frommds) == MDSMap::STATE_RECONNECT) { - send_reconnect(frommds); - } - - // kick requests? - if (frommds >= 0 && - mdsmap->get_state(frommds) == MDSMap::STATE_ACTIVE) { - kick_requests(frommds); - //failed_mds.erase(from); - } - - // kick any waiting threads - list ls; - ls.swap(waiting_for_mdsmap); - for (list::iterator p = ls.begin(); p != ls.end(); ++p) - (*p)->Signal(); - - delete m; -} - -void Client::send_reconnect(int mds) -{ - dout(10) << "send_reconnect to mds" << mds << dendl; - - MClientReconnect *m = new MClientReconnect; - - if (mds_sessions.count(mds)) { - // i have an open session. - for (hash_map::iterator p = inode_map.begin(); - p != inode_map.end(); - p++) { - if (p->second->caps.count(mds)) { - dout(10) << " caps on " << p->first - << " " << cap_string(p->second->caps[mds].caps) - << " wants " << cap_string(p->second->file_caps_wanted()) - << dendl; - p->second->caps[mds].seq = 0; // reset seq. - m->add_inode_caps(p->first, // ino - p->second->file_caps_wanted(), // wanted - p->second->caps[mds].caps, // issued - p->second->inode.size, p->second->inode.mtime, p->second->inode.atime); - string path; - p->second->make_path(path); - dout(10) << " path on " << p->first << " is " << path << dendl; - m->add_inode_path(p->first, path); - } - if (p->second->stale_caps.count(mds)) { - dout(10) << " clearing stale caps on " << p->first << dendl; - p->second->stale_caps.erase(mds); // hrm, is this right? 
- } - } - - // reset my cap seq number - mds_sessions[mds] = 0; - } else { - dout(10) << " i had no session with this mds"; - m->closed = true; - } - - messenger->send_message(m, mdsmap->get_inst(mds)); -} - - -void Client::kick_requests(int mds) -{ - dout(10) << "kick_requests for mds" << mds << dendl; - - for (map::iterator p = mds_requests.begin(); - p != mds_requests.end(); - ++p) - if (p->second->mds.count(mds)) { - p->second->retry_attempt++; // inc retry counter - send_request(p->second, mds); - } -} - - -/**** - * caps - */ - - -class C_Client_ImplementedCaps : public Context { - Client *client; - MClientFileCaps *msg; - Inode *in; -public: - C_Client_ImplementedCaps(Client *c, MClientFileCaps *m, Inode *i) : client(c), msg(m), in(i) {} - void finish(int r) { - client->implemented_caps(msg,in); - } -}; - -/** handle_file_caps - * handle caps update from mds. including mds to mds caps transitions. - * do not block. - */ -void Client::handle_file_caps(MClientFileCaps *m) -{ - int mds = m->get_source().num(); - Inode *in = 0; - if (inode_map.count(m->get_ino())) in = inode_map[ m->get_ino() ]; - - m->clear_payload(); // for if/when we send back to MDS - - // note push seq increment - if (mds_sessions.count(mds) == 0) - dout(0) << "got file_caps without session from mds" << mds << " msg " << *m << dendl; - //assert(mds_sessions.count(mds)); // HACK FIXME SOON - mds_sessions[mds]++; - - // reap? - if (m->get_op() == MClientFileCaps::OP_REAP) { - int other = m->get_mds(); - - if (in && in->stale_caps.count(other)) { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " reap on mds" << other << dendl; - - // fresh from new mds? - if (!in->caps.count(mds)) { - if (in->caps.empty()) in->get(); - in->caps[mds].seq = m->get_seq(); - in->caps[mds].caps = m->get_caps(); - } - - assert(in->stale_caps.count(other)); - in->stale_caps.erase(other); - if (in->stale_caps.empty()) put_inode(in); // note: this will never delete *in - - // fall-thru! 
- } else { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " premature (!!) reap on mds" << other << dendl; - // delay! - cap_reap_queue[in->ino()][other] = m; - return; - } - } - - assert(in); - - // stale? - if (m->get_op() == MClientFileCaps::OP_STALE) { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " seq " << m->get_seq() << " from mds" << mds << " now stale" << dendl; - - // move to stale list - assert(in->caps.count(mds)); - if (in->stale_caps.empty()) in->get(); - in->stale_caps[mds] = in->caps[mds]; - - assert(in->caps.count(mds)); - in->caps.erase(mds); - if (in->caps.empty()) in->put(); - - // delayed reap? - if (cap_reap_queue.count(in->ino()) && - cap_reap_queue[in->ino()].count(mds)) { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " delayed reap on mds" << m->get_mds() << dendl; - - // process delayed reap - handle_file_caps( cap_reap_queue[in->ino()][mds] ); - - cap_reap_queue[in->ino()].erase(mds); - if (cap_reap_queue[in->ino()].empty()) - cap_reap_queue.erase(in->ino()); - } - delete m; - return; - } - - // release? - if (m->get_op() == MClientFileCaps::OP_RELEASE) { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " release" << dendl; - assert(in->caps.count(mds)); - in->caps.erase(mds); - for (map::iterator p = in->caps.begin(); - p != in->caps.end(); - p++) - dout(20) << " left cap " << p->first << " " - << cap_string(p->second.caps) << " " - << p->second.seq << dendl; - for (map::iterator p = in->stale_caps.begin(); - p != in->stale_caps.end(); - p++) - dout(20) << " left stale cap " << p->first << " " - << cap_string(p->second.caps) << " " - << p->second.seq << dendl; - - if (in->caps.empty()) { - //dout(0) << "did put_inode" << dendl; - put_inode(in); - } else { - //dout(0) << "didn't put_inode" << dendl; - } - delete m; - return; - } - - - // don't want? 
- if (in->file_caps_wanted() == 0) { - dout(5) << "handle_file_caps on ino " << m->get_ino() - << " seq " << m->get_seq() - << " " << cap_string(m->get_caps()) - << ", which we don't want caps for, releasing." << dendl; - m->set_caps(0); - m->set_wanted(0); - messenger->send_message(m, m->get_source_inst()); - return; - } - - assert(in->caps.count(mds)); - - // update per-mds caps - const int old_caps = in->caps[mds].caps; - const int new_caps = m->get_caps(); - in->caps[mds].caps = new_caps; - in->caps[mds].seq = m->get_seq(); - dout(5) << "handle_file_caps on in " << m->get_ino() - << " mds" << mds << " seq " << m->get_seq() - << " caps now " << cap_string(new_caps) - << " was " << cap_string(old_caps) << dendl; - - // did file size decrease? - if ((old_caps & (CAP_FILE_RD|CAP_FILE_WR)) == 0 && - (new_caps & (CAP_FILE_RD|CAP_FILE_WR)) != 0 && - in->inode.size > m->get_inode().size) { - dout(10) << "*** file size decreased from " << in->inode.size << " to " << m->get_inode().size << dendl; - - // trim filecache? - if (g_conf.client_oc) - in->fc.truncate(in->inode.size, m->get_inode().size); - - in->inode.size = in->file_wr_size = m->get_inode().size; - } - - // update inode - in->inode = m->get_inode(); // might have updated size... FIXME this is overkill! - - // preserve our (possibly newer) file size, mtime - if (in->file_wr_size > in->inode.size) - m->get_inode().size = in->inode.size = in->file_wr_size; - if (in->file_wr_mtime > in->inode.mtime) - m->get_inode().mtime = in->inode.mtime = in->file_wr_mtime; - - - - if (g_conf.client_oc) { - // caching on, use FileCache. - Context *onimplement = 0; - if (old_caps & ~new_caps) { // this mds is revoking caps - if (in->fc.get_caps() & ~(in->file_caps())) // net revocation - onimplement = new C_Client_ImplementedCaps(this, m, in); - else { - implemented_caps(m, in); // ack now. - } - } - in->fc.set_caps(new_caps, onimplement); - } else { - // caching off. - - // wake up waiters? 
- if (new_caps & CAP_FILE_RD) { - for (list::iterator it = in->waitfor_read.begin(); - it != in->waitfor_read.end(); - it++) { - dout(5) << "signaling read waiter " << *it << dendl; - (*it)->Signal(); - } - in->waitfor_read.clear(); - } - if (new_caps & CAP_FILE_WR) { - for (list::iterator it = in->waitfor_write.begin(); - it != in->waitfor_write.end(); - it++) { - dout(5) << "signaling write waiter " << *it << dendl; - (*it)->Signal(); - } - in->waitfor_write.clear(); - } - if (new_caps & CAP_FILE_LAZYIO) { - for (list::iterator it = in->waitfor_lazy.begin(); - it != in->waitfor_lazy.end(); - it++) { - dout(5) << "signaling lazy waiter " << *it << dendl; - (*it)->Signal(); - } - in->waitfor_lazy.clear(); - } - - // ack? - if (old_caps & ~new_caps) { - if (in->sync_writes) { - // wait for sync writes to finish - dout(5) << "sync writes in progress, will ack on finish" << dendl; - in->waitfor_no_write.push_back(new C_Client_ImplementedCaps(this, m, in)); - } else { - // ok now - implemented_caps(m, in); - } - } else { - // discard - delete m; - } - } -} - -void Client::implemented_caps(MClientFileCaps *m, Inode *in) -{ - dout(5) << "implemented_caps " << cap_string(m->get_caps()) - << ", acking to " << m->get_source() << dendl; - - if (in->file_caps() == 0) { - in->file_wr_mtime = utime_t(); - in->file_wr_size = 0; - } - - messenger->send_message(m, m->get_source_inst()); -} - - -void Client::release_caps(Inode *in, - int retain) -{ - dout(5) << "releasing caps on ino " << in->inode.ino << dec - << " had " << cap_string(in->file_caps()) - << " retaining " << cap_string(retain) - << " want " << cap_string(in->file_caps_wanted()) - << dendl; - - for (map::iterator it = in->caps.begin(); - it != in->caps.end(); - it++) { - //if (it->second.caps & ~retain) { - if (1) { - // release (some of?) 
these caps - it->second.caps = retain & it->second.caps; - // note: tell mds _full_ wanted; it'll filter/behave based on what it is allowed to do - MClientFileCaps *m = new MClientFileCaps(MClientFileCaps::OP_ACK, - in->inode, - it->second.seq, - it->second.caps, - in->file_caps_wanted()); - messenger->send_message(m, mdsmap->get_inst(it->first)); - } - } - - if (in->file_caps() == 0) { - in->file_wr_mtime = utime_t(); - in->file_wr_size = 0; - } -} - -void Client::update_caps_wanted(Inode *in) -{ - dout(5) << "updating caps wanted on ino " << in->inode.ino - << " to " << cap_string(in->file_caps_wanted()) - << dendl; - - // FIXME: pick a single mds and let the others off the hook.. - for (map::iterator it = in->caps.begin(); - it != in->caps.end(); - it++) { - MClientFileCaps *m = new MClientFileCaps(MClientFileCaps::OP_ACK, - in->inode, - it->second.seq, - it->second.caps, - in->file_caps_wanted()); - messenger->send_message(m, mdsmap->get_inst(it->first)); - } -} - - - -// ------------------- -// MOUNT - -void Client::_try_mount() -{ - dout(10) << "_try_mount" << dendl; - int mon = monmap->pick_mon(); - dout(2) << "sending client_mount to mon" << mon << " as instance " << my_instance << dendl; - messenger->set_dispatcher(this); - messenger->send_message(new MClientMount(messenger->get_myaddr(), my_instance), - monmap->get_inst(mon)); - - // schedule timeout? - assert(mount_timeout_event == 0); - mount_timeout_event = new C_MountTimeout(this); - timer.add_event_after(g_conf.client_mount_timeout, mount_timeout_event); -} - -void Client::_mount_timeout() -{ - dout(10) << "_mount_timeout" << dendl; - mount_timeout_event = 0; - _try_mount(); -} - -int Client::mount() -{ - client_lock.Lock(); - assert(!mounted); // caller is confused? - - objecter->init(); - - _try_mount(); - //messenger->set_dispatcher(this); // FIXME: there is still a race condition here! 
- - while (!mdsmap || - !osdmap || - osdmap->get_epoch() == 0) - mount_cond.Wait(client_lock); - - timer.cancel_event(mount_timeout_event); - mount_timeout_event = 0; - - mounted = true; - - dout(2) << "mounted: have osdmap " << osdmap->get_epoch() - << " and mdsmap " << mdsmap->get_epoch() - << dendl; - - // hack: get+pin root inode. - // fuse assumes it's always there. - Inode *root; - _do_lstat("/", STAT_MASK_ALL, &root); - _ll_get(root); - - // trace? - if (g_conf.client_trace) { - traceout.open(g_conf.client_trace); - if (traceout.is_open()) { - dout(1) << "opened trace file '" << g_conf.client_trace << "'" << dendl; - } else { - dout(1) << "FAILED to open trace file '" << g_conf.client_trace << "'" << dendl; - } - } - - client_lock.Unlock(); - - /* - dout(3) << "op: // client trace data structs" << dendl; - dout(3) << "op: struct stat st;" << dendl; - dout(3) << "op: struct utimbuf utim;" << dendl; - dout(3) << "op: int readlinkbuf_len = 1000;" << dendl; - dout(3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl; - dout(3) << "op: map dir_contents;" << dendl; - dout(3) << "op: map open_files;" << dendl; - dout(3) << "op: int fd;" << dendl; - */ - return 0; -} - - -// UNMOUNT - - -int Client::unmount() -{ - client_lock.Lock(); - - assert(mounted); // caller is confused? - - dout(2) << "unmounting" << dendl; - unmounting = true; - - // NOTE: i'm assuming all caches are already flushing (because all files are closed). 
- assert(fd_map.empty()); - - dout(10) << "a" << dendl; - - _ll_drop_pins(); - - dout(10) << "b" << dendl; - - // empty lru cache - lru.lru_set_max(0); - trim_cache(); - - if (g_conf.client_oc) { - // release any/all caps - for (hash_map::iterator p = inode_map.begin(); - p != inode_map.end(); - p++) { - Inode *in = p->second; - if (!in->caps.empty()) { - in->fc.release_clean(); - if (in->fc.is_dirty()) { - dout(10) << "unmount residual caps on " << in->ino() << ", flushing" << dendl; - in->fc.empty(new C_Client_CloseRelease(this, in)); - } else { - dout(10) << "unmount residual caps on " << in->ino() << ", releasing" << dendl; - release_caps(in); - } - } - } - } - - //if (0) {// hack - while (lru.lru_get_size() > 0 || - !inode_map.empty()) { - dout(2) << "cache still has " << lru.lru_get_size() - << "+" << inode_map.size() << " items" - << ", waiting (for caps to release?)" - << dendl; - dump_cache(); - mount_cond.Wait(client_lock); - } - assert(lru.lru_get_size() == 0); - assert(inode_map.empty()); - //} - - // unsafe writes - if (!g_conf.client_oc) { - while (unsafe_sync_write > 0) { - dout(0) << unsafe_sync_write << " unsafe_sync_writes, waiting" - << dendl; - mount_cond.Wait(client_lock); - } - } - - // stop tracing - if (g_conf.client_trace) { - dout(1) << "closing trace file '" << g_conf.client_trace << "'" << dendl; - traceout.close(); - } - - - // send session closes! - for (map::iterator p = mds_sessions.begin(); - p != mds_sessions.end(); - ++p) { - dout(2) << "sending client_session close to mds" << p->first << " seq " << p->second << dendl; - messenger->send_message(new MClientSession(MClientSession::OP_REQUEST_CLOSE, - p->second), - mdsmap->get_inst(p->first)); - } - - // send unmount! - int mon = monmap->pick_mon(); - dout(2) << "sending client_unmount to mon" << mon << dendl; - messenger->send_message(new MClientUnmount(messenger->get_myinst()), - monmap->get_inst(mon)); - - while (mounted) - mount_cond.Wait(client_lock); - - dout(2) << "unmounted." 
<< dendl; - - objecter->shutdown(); - - client_lock.Unlock(); - return 0; -} - -void Client::handle_unmount(Message* m) -{ - dout(1) << "handle_unmount got ack" << dendl; - - mounted = false; - - delete mdsmap; - mdsmap = 0; - - mount_cond.Signal(); - - delete m; -} - - -// =============================================================== -// high level (POSIXy) interface - - -// namespace ops - -int Client::link(const char *existing, const char *newname) -{ - Mutex::Locker lock(client_lock); - tout << "link" << std::endl; - tout << existing << std::endl; - tout << newname << std::endl; - return _link(existing, newname); -} - -int Client::_link(const char *existing, const char *newname) -{ - // main path arg is new link name - // sarg is target (existing file) - - MClientRequest *req = new MClientRequest(MDS_OP_LINK, messenger->get_myinst()); - req->set_path(newname); - req->set_sarg(existing); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - - insert_trace(reply); - delete reply; - dout(10) << "link result is " << res << dendl; - - trim_cache(); - dout(3) << "link(\"" << existing << "\", \"" << newname << "\") = " << res << dendl; - return res; -} - - -int Client::unlink(const char *relpath) -{ - Mutex::Locker lock(client_lock); - tout << "unlink" << std::endl; - tout << relpath << std::endl; - - string abspath; - mkabspath(relpath, abspath); - return _unlink(abspath.c_str()); -} - -int Client::_unlink(const char *path) -{ - - MClientRequest *req = new MClientRequest(MDS_OP_UNLINK, messenger->get_myinst()); - req->set_path(path); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? 
- - MClientReply *reply = make_request(req); - int res = reply->get_result(); - if (res == 0) { - // remove from local cache - filepath fp(path); - Dentry *dn = lookup(fp); - if (dn) { - assert(dn->inode); - unlink(dn); - } - } - insert_trace(reply); - delete reply; - dout(10) << "unlink result is " << res << dendl; - - trim_cache(); - dout(3) << "unlink(\"" << path << "\") = " << res << dendl; - return res; -} - -int Client::rename(const char *relfrom, const char *relto) -{ - Mutex::Locker lock(client_lock); - tout << "rename" << std::endl; - tout << relfrom << std::endl; - tout << relto << std::endl; - - string absfrom, absto; - mkabspath(relfrom, absfrom); - mkabspath(relto, absto); - return _rename(absfrom.c_str(), absto.c_str()); -} - -int Client::_rename(const char *from, const char *to) -{ - MClientRequest *req = new MClientRequest(MDS_OP_RENAME, messenger->get_myinst()); - req->set_path(from); - req->set_sarg(to); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? 
- - MClientReply *reply = make_request(req); - int res = reply->get_result(); - if (res == 0) { - // remove from local cache - filepath fp(to); - Dentry *dn = lookup(fp); - if (dn) { - assert(dn->inode); - unlink(dn); - } - } - insert_trace(reply); - delete reply; - dout(10) << "rename result is " << res << dendl; - - // renamed item from our cache - - trim_cache(); - dout(3) << "rename(\"" << from << "\", \"" << to << "\") = " << res << dendl; - return res; -} - -// dirs - -int Client::mkdir(const char *relpath, mode_t mode) -{ - Mutex::Locker lock(client_lock); - tout << "mkdir" << std::endl; - tout << relpath << std::endl; - tout << mode << std::endl; - - string abspath; - mkabspath(relpath, abspath); - return _mkdir(abspath.c_str(), mode); -} - -int Client::_mkdir(const char *path, mode_t mode) -{ - MClientRequest *req = new MClientRequest(MDS_OP_MKDIR, messenger->get_myinst()); - req->set_path(path); - req->args.mkdir.mode = mode; - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - dout(10) << "mkdir result is " << res << dendl; - - trim_cache(); - - dout(3) << "mkdir(\"" << path << "\", 0" << oct << mode << dec << ") = " << res << dendl; - return res; -} - -int Client::rmdir(const char *relpath) -{ - Mutex::Locker lock(client_lock); - tout << "rmdir" << std::endl; - tout << relpath << std::endl; - - string abspath; - mkabspath(relpath, abspath); - return _rmdir(abspath.c_str()); -} - -int Client::_rmdir(const char *path) -{ - MClientRequest *req = new MClientRequest(MDS_OP_RMDIR, messenger->get_myinst()); - req->set_path(path); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? 
- - MClientReply *reply = make_request(req); - int res = reply->get_result(); - if (res == 0) { - // remove from local cache - filepath fp(path); - Dentry *dn = lookup(fp); - if (dn) { - if (dn->inode->dir && dn->inode->dir->is_empty()) - close_dir(dn->inode->dir); // FIXME: maybe i shoudl proactively hose the whole subtree from cache? - unlink(dn); - } - } - insert_trace(reply); - delete reply; - - trim_cache(); - dout(3) << "rmdir(\"" << path << "\") = " << res << dendl; - return res; -} - -// symlinks - -int Client::symlink(const char *target, const char *rellink) -{ - Mutex::Locker lock(client_lock); - tout << "symlink" << std::endl; - tout << target << std::endl; - tout << rellink << std::endl; - - string link; - mkabspath(rellink, link); - return _symlink(target, link.c_str()); -} - -int Client::_symlink(const char *target, const char *link) -{ - MClientRequest *req = new MClientRequest(MDS_OP_SYMLINK, messenger->get_myinst()); - req->set_path(link); - req->set_sarg(target); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? 
- - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); //FIXME assuming trace of link, not of target - delete reply; - - trim_cache(); - dout(3) << "symlink(\"" << target << "\", \"" << link << "\") = " << res << dendl; - return res; -} - -int Client::readlink(const char *path, char *buf, off_t size) -{ - Mutex::Locker lock(client_lock); - tout << "readlink" << std::endl; - tout << path << std::endl; - - string abspath; - mkabspath(path, abspath); - return _readlink(abspath.c_str(), buf, size); -} - -int Client::_readlink(const char *path, char *buf, off_t size) -{ - Inode *in; - int r = _do_lstat(path, STAT_MASK_BASE, &in); - if (r == 0 && !in->inode.is_symlink()) r = -EINVAL; - if (r == 0) { - // copy into buf (at most size bytes) - r = in->symlink->length(); - if (r > size) r = size; - memcpy(buf, in->symlink->c_str(), r); - } else { - buf[0] = 0; - } - trim_cache(); - - dout(3) << "readlink(\"" << path << "\", \"" << buf << "\", " << size << ") = " << r << dendl; - return r; -} - - - -// inode stuff - -int Client::_do_lstat(const char *path, int mask, Inode **in) -{ - MClientRequest *req = 0; - filepath fpath(path); - - // check whether cache content is fresh enough - int res = 0; - - Dentry *dn = lookup(fpath); - inode_t inode; - utime_t now = g_clock.real_now(); - - if (dn && - now <= dn->inode->valid_until) - dout(10) << "_lstat has inode " << path << " with mask " << dn->inode->mask << ", want " << mask << dendl; - - if (dn && dn->inode && - now <= dn->inode->valid_until && - ((mask & ~STAT_MASK_BASE) || now <= dn->inode->valid_until) && - ((dn->inode->mask & mask) == mask)) { - inode = dn->inode->inode; - dout(10) << "lstat cache hit w/ sufficient mask, valid until " << dn->inode->valid_until << dendl; - - if (g_conf.client_cache_stat_ttl == 0) - dn->inode->valid_until = utime_t(); // only one stat allowed after each readdir - - *in = dn->inode; - } else { - // FIXME where does FUSE maintain user information - 
//struct fuse_context *fc = fuse_get_context(); - //req->set_caller_uid(fc->uid); - //req->set_caller_gid(fc->gid); - - req = new MClientRequest(MDS_OP_LSTAT, messenger->get_myinst()); - req->args.stat.mask = mask; - req->set_path(fpath); - - MClientReply *reply = make_request(req); - res = reply->get_result(); - dout(10) << "lstat res is " << res << dendl; - if (res == 0) { - //Transfer information from reply to stbuf - inode = reply->get_inode(); - - //Update metadata cache - *in = insert_trace(reply); - } - - delete reply; - - if (res != 0) - *in = 0; // not a success. - } - - return res; -} - - -int Client::fill_stat(Inode *in, struct stat *st) -{ - dout(10) << "fill_stat on " << in->inode.ino << " mode 0" << oct << in->inode.mode << dec - << " mtime " << in->inode.mtime << " ctime " << in->inode.ctime << dendl; - memset(st, 0, sizeof(struct stat)); - st->st_ino = in->inode.ino; - st->st_mode = in->inode.mode; - st->st_rdev = in->inode.rdev; - st->st_nlink = in->inode.nlink; - st->st_uid = in->inode.uid; - st->st_gid = in->inode.gid; - st->st_ctime = MAX(in->inode.ctime, in->inode.mtime); - st->st_atime = in->inode.atime; - st->st_mtime = in->inode.mtime; - st->st_size = in->inode.size; - st->st_blocks = in->inode.size ? 
((in->inode.size - 1) / 4096 + 1):0; - st->st_blksize = 4096; - return in->mask; -} - - /* - S_REQUIREBLKSIZE(st->st_litemask); - if (inode.mask & INODE_MASK_BASE) S_REQUIRECTIME(st->st_litemask); - if (inode.mask & INODE_MASK_SIZE) { - S_REQUIRESIZE(st->st_litemask); - S_REQUIREBLOCKS(st->st_litemask); - } - if (inode.mask & INODE_MASK_MTIME) S_REQUIREMTIME(st->st_litemask); - if (inode.mask & INODE_MASK_ATIME) S_REQUIREATIME(st->st_litemask); - */ - - -int Client::lstat(const char *relpath, struct stat *stbuf) -{ - Mutex::Locker lock(client_lock); - tout << "lstat" << std::endl; - tout << relpath << std::endl; - - string abspath; - mkabspath(relpath, abspath); - return _lstat(abspath.c_str(), stbuf); -} - -int Client::_lstat(const char *path, struct stat *stbuf) -{ - Inode *in = 0; - int res = _do_lstat(path, STAT_MASK_ALL, &in); - if (res == 0) { - assert(in); - fill_stat(in, stbuf); - dout(10) << "stat sez size = " << in->inode.size << " mode = 0" << oct << stbuf->st_mode << dec << " ino = " << stbuf->st_ino << dendl; - } - - trim_cache(); - dout(3) << "lstat(\"" << path << "\", " << stbuf << ") = " << res << dendl; - return res; -} - - -/* -int Client::lstatlite(const char *relpath, struct statlite *stl) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->lstatlite(\"" << path << "\", &st);" << dendl; - tout << "lstatlite" << std::endl; - tout << path << std::endl; - - // make mask - // FIXME. 
- int mask = INODE_MASK_BASE | INODE_MASK_AUTH; - if (S_ISVALIDSIZE(stl->st_litemask) || - S_ISVALIDBLOCKS(stl->st_litemask)) mask |= INODE_MASK_SIZE; - if (S_ISVALIDMTIME(stl->st_litemask)) mask |= INODE_MASK_FILE; - if (S_ISVALIDATIME(stl->st_litemask)) mask |= INODE_MASK_FILE; - - Inode *in = 0; - int res = _lstat(path, mask, &in); - - if (res == 0) { - fill_statlite(in->inode,stl); - dout(10) << "stat sez size = " << in->inode.size << " ino = " << in->inode.ino << dendl; - } - - trim_cache(); - client_lock.Unlock(); - return res; -} -*/ - - -int Client::chmod(const char *relpath, mode_t mode) -{ - Mutex::Locker lock(client_lock); - tout << "chmod" << std::endl; - tout << relpath << std::endl; - tout << mode << std::endl; - - string abspath; - mkabspath(relpath, abspath); - return _chmod(abspath.c_str(), mode); -} - -int Client::_chmod(const char *path, mode_t mode) -{ - dout(3) << "_chmod(" << path << ", 0" << oct << mode << dec << ")" << dendl; - MClientRequest *req = new MClientRequest(MDS_OP_CHMOD, messenger->get_myinst()); - req->set_path(path); - req->args.chmod.mode = mode; - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - - trim_cache(); - dout(3) << "_chmod(\"" << path << "\", 0" << oct << mode << dec << ") = " << res << dendl; - return res; -} - -int Client::chown(const char *relpath, uid_t uid, gid_t gid) -{ - Mutex::Locker lock(client_lock); - tout << "chown" << std::endl; - tout << relpath << std::endl; - tout << uid << std::endl; - tout << gid << std::endl; - - string abspath; - mkabspath(relpath, abspath); - return _chown(abspath.c_str(), uid, gid); -} - -int Client::_chown(const char *path, uid_t uid, gid_t gid) -{ - dout(3) << "_chown(" << path << ", " << uid << ", " << gid << ")" << dendl; - MClientRequest *req = new MClientRequest(MDS_OP_CHOWN, 
messenger->get_myinst()); - req->set_path(path); - req->args.chown.uid = uid; - req->args.chown.gid = gid; - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - dout(10) << "chown result is " << res << dendl; - - trim_cache(); - dout(3) << "chown(\"" << path << "\", " << uid << ", " << gid << ") = " << res << dendl; - return res; -} - -int Client::utime(const char *relpath, struct utimbuf *buf) -{ - Mutex::Locker lock(client_lock); - tout << "utime" << std::endl; - tout << relpath << std::endl; - tout << buf->modtime << std::endl; - tout << buf->actime << std::endl; - - string abspath; - mkabspath(relpath, abspath); - return _utimes(abspath.c_str(), utime_t(buf->modtime,0), utime_t(buf->actime,0)); -} - -int Client::_utimes(const char *path, utime_t mtime, utime_t atime) -{ - dout(3) << "_utimes(" << path << ", " << mtime << ", " << atime << ")" << dendl; - MClientRequest *req = new MClientRequest(MDS_OP_UTIME, messenger->get_myinst()); - req->set_path(path); - req->args.utime.mtime = mtime.tv_ref(); - req->args.utime.atime = atime.tv_ref(); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - - dout(3) << "utimes(\"" << path << "\", " << mtime << ", " << atime << ") = " << res << dendl; - trim_cache(); - return res; -} - - - -int Client::mknod(const char *relpath, mode_t mode, dev_t rdev) -{ - Mutex::Locker lock(client_lock); - tout << "mknod" << std::endl; - tout << relpath << std::endl; - tout << mode << std::endl; - tout << rdev << std::endl; - - string abspath; - mkabspath(relpath, abspath); - return _mknod(abspath.c_str(), mode, rdev); -} - -int 
Client::_mknod(const char *path, mode_t mode, dev_t rdev) -{ - dout(3) << "_mknod(" << path << ", 0" << oct << mode << dec << ", " << rdev << ")" << dendl; - - MClientRequest *req = new MClientRequest(MDS_OP_MKNOD, messenger->get_myinst()); - req->set_path(path); - req->args.mknod.mode = mode; - req->args.mknod.rdev = rdev; - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); - - delete reply; - - trim_cache(); - - dout(3) << "mknod(\"" << path << "\", 0" << oct << mode << dec << ") = " << res << dendl; - return res; -} - - - - -int Client::getdir(const char *relpath, list& contents) -{ - dout(3) << "getdir(" << relpath << ")" << dendl; - { - Mutex::Locker lock(client_lock); - tout << "getdir" << std::endl; - tout << relpath << std::endl; - } - - DIR *d; - int r = opendir(relpath, &d); - if (r < 0) return r; - - struct dirent de; - int n = 0; - while (readdir_r(d, &de) == 0) { - contents.push_back(de.d_name); - n++; - } - closedir(d); - - return n; -} - -int Client::opendir(const char *name, DIR **dirpp) -{ - Mutex::Locker lock(client_lock); - tout << "opendir" << std::endl; - tout << name << std::endl; - - int r = _opendir(name, (DirResult**)dirpp); - tout << (unsigned long)*dirpp << std::endl; - return r; -} - -int Client::_opendir(const char *name, DirResult **dirpp) -{ - *dirpp = new DirResult(name); - - // do we have the inode in our cache? - // if so, should be we ask for a different dirfrag? 
- filepath path(name); - Dentry *dn = lookup(path); - if (dn && dn->inode) { - (*dirpp)->inode = dn->inode; - (*dirpp)->inode->get(); - dout(10) << "had inode " << dn->inode << " " << dn->inode->inode.ino << " ref now " << dn->inode->ref << dendl; - (*dirpp)->set_frag(dn->inode->dirfragtree[0]); - dout(10) << "_opendir " << name << ", our cache says the first dirfrag is " << (*dirpp)->frag() << dendl; - } - - // get the first frag - int r = _readdir_get_frag(*dirpp); - if (r < 0) { - _closedir(*dirpp); - *dirpp = 0; - } else { - r = 0; - } - dout(3) << "_opendir(" << name << ") = " << r << " (" << *dirpp << ")" << dendl; - - return r; -} - -void Client::_readdir_add_dirent(DirResult *dirp, const string& name, Inode *in) -{ - struct stat st; - int stmask = fill_stat(in, &st); - frag_t fg = dirp->frag(); - dirp->buffer[fg].push_back(DirEntry(name, st, stmask)); - dout(10) << "_readdir_add_dirent " << dirp << " added '" << name << "' -> " << in->inode.ino - << ", size now " << dirp->buffer[fg].size() << dendl; -} - -//struct dirent { -// ino_t d_ino; /* inode number */ -// off_t d_off; /* offset to the next dirent */ -// unsigned short d_reclen; /* length of this record */ -// unsigned char d_type; /* type of file */ -// char d_name[256]; /* filename */ -//}; -void Client::_readdir_fill_dirent(struct dirent *de, DirEntry *entry, off_t off) -{ - de->d_ino = entry->st.st_ino; - de->d_off = off + 1; - de->d_reclen = 1; - de->d_type = MODE_TO_DT(entry->st.st_mode); - strncpy(de->d_name, entry->d_name.c_str(), 256); - dout(10) << "_readdir_fill_dirent '" << de->d_name << "' -> " << de->d_ino - << " type " << (int)de->d_type << " at off " << off << dendl; -} - -void Client::_readdir_next_frag(DirResult *dirp) -{ - frag_t fg = dirp->frag(); - - // hose old data - assert(dirp->buffer.count(fg)); - dirp->buffer.erase(fg); - - // advance - dirp->next_frag(); - if (dirp->at_end()) { - dout(10) << "_readdir_next_frag advance from " << fg << " to END" << dendl; - } else { - 
dout(10) << "_readdir_next_frag advance from " << fg << " to " << dirp->frag() << dendl; - _readdir_rechoose_frag(dirp); - } -} - -void Client::_readdir_rechoose_frag(DirResult *dirp) -{ - assert(dirp->inode); - frag_t cur = dirp->frag(); - frag_t f = dirp->inode->dirfragtree[cur.value()]; - if (f != cur) { - dout(10) << "_readdir_rechoose_frag frag " << cur << " maps to " << f << dendl; - dirp->set_frag(f); - } -} - -int Client::_readdir_get_frag(DirResult *dirp) -{ - // get the current frag. - frag_t fg = dirp->frag(); - assert(dirp->buffer.count(fg) == 0); - - dout(10) << "_readdir_get_frag " << dirp << " on " << dirp->path << " fg " << fg << dendl; - - MClientRequest *req = new MClientRequest(MDS_OP_READDIR, messenger->get_myinst()); - req->set_path(dirp->path); - req->args.readdir.frag = fg; - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); - inodeno_t ino = reply->get_ino(); - - // did i get directory inode? - Inode *diri = 0; - if ((res == -EAGAIN || res == 0) && - inode_map.count(ino)) { - diri = inode_map[ino]; - dout(10) << "_readdir_get_frag got diri " << diri << " " << diri->inode.ino << dendl; - assert(diri); - assert(diri->inode.mode & INODE_MODE_DIR); - } - - if (!dirp->inode && diri) { - dout(10) << "_readdir_get_frag attaching inode" << dendl; - dirp->inode = inode_map[ino]; - diri->get(); - } - - if (res == -EAGAIN) { - dout(10) << "_readdir_get_frag got EAGAIN, retrying" << dendl; - _readdir_rechoose_frag(dirp); - return _readdir_get_frag(dirp); - } - - if (res == 0) { - // stuff dir contents to cache, DirResult - assert(diri); - - // create empty result vector - dirp->buffer[fg].clear(); - - if (fg.is_leftmost()) { - // add . and ..? 
- string dot("."); - _readdir_add_dirent(dirp, dot, diri); - string dotdot(".."); - if (diri->dn) - _readdir_add_dirent(dirp, dotdot, diri->dn->dir->parent_inode); - //else - //_readdir_add_dirent(dirp, dotdot, DT_DIR); - } - - // the rest? - if (!reply->get_dir_dn().empty()) { - // only open dir if we're actually adding stuff to it! - Dir *dir = diri->open_dir(); - assert(dir); - utime_t now = g_clock.real_now(); - - list::const_iterator pin = reply->get_dir_in().begin(); - for (list::const_iterator pdn = reply->get_dir_dn().begin(); - pdn != reply->get_dir_dn().end(); - ++pdn, ++pin) { - // count entries - res++; - - // put in cache - Inode *in = this->insert_inode(dir, *pin, *pdn); - - if (g_conf.client_cache_stat_ttl) { - in->valid_until = now; - in->valid_until += g_conf.client_cache_stat_ttl; - } - else if (g_conf.client_cache_readdir_ttl) { - in->valid_until = now; - in->valid_until += g_conf.client_cache_readdir_ttl; - } else - in->valid_until = utime_t(); - - // contents to caller too! - dout(15) << "_readdir_get_frag got " << *pdn << " to " << in->inode.ino << dendl; - _readdir_add_dirent(dirp, *pdn, in); - } - - if (dir->is_empty()) - close_dir(dir); - } - - // FIXME: remove items in cache that weren't in my readdir? 
- // *** - } else { - dout(10) << "_readdir_get_frag got error " << res << ", setting end flag" << dendl; - dirp->set_end(); - } - - delete reply; - - return res; -} - -int Client::readdir_r(DIR *d, struct dirent *de) -{ - return readdirplus_r(d, de, 0, 0); -} - -int Client::readdirplus_r(DIR *d, struct dirent *de, struct stat *st, int *stmask) -{ - DirResult *dirp = (DirResult*)d; - - while (1) { - if (dirp->at_end()) return -1; - - if (dirp->buffer.count(dirp->frag()) == 0) { - Mutex::Locker lock(client_lock); - _readdir_get_frag(dirp); - if (dirp->at_end()) return -1; - } - - frag_t fg = dirp->frag(); - uint32_t pos = dirp->fragpos(); - assert(dirp->buffer.count(fg)); - vector &ent = dirp->buffer[fg]; - - if (ent.empty()) { - dout(10) << "empty frag " << fg << ", moving on to next" << dendl; - _readdir_next_frag(dirp); - continue; - } - - assert(pos < ent.size()); - _readdir_fill_dirent(de, &ent[pos], dirp->offset); - if (st) *st = ent[pos].st; - if (stmask) *stmask = ent[pos].stmask; - pos++; - dirp->offset++; - - if (pos == ent.size()) - _readdir_next_frag(dirp); - - break; - } - - return 0; -} - - -int Client::closedir(DIR *dir) -{ - Mutex::Locker lock(client_lock); - tout << "closedir" << std::endl; - tout << (unsigned long)dir << std::endl; - - dout(3) << "closedir(" << dir << ") = 0" << dendl; - _closedir((DirResult*)dir); - return 0; -} - -void Client::_closedir(DirResult *dirp) -{ - dout(10) << "_closedir(" << dirp << ")" << dendl; - if (dirp->inode) { - dout(10) << "_closedir detaching inode " << dirp->inode << dendl; - put_inode(dirp->inode); - dirp->inode = 0; - } - delete dirp; -} - -void Client::rewinddir(DIR *dirp) -{ - dout(3) << "rewinddir(" << dirp << ")" << dendl; - DirResult *d = (DirResult*)dirp; - d->offset = 0; - d->buffer.clear(); -} - -off_t Client::telldir(DIR *dirp) -{ - DirResult *d = (DirResult*)dirp; - dout(3) << "telldir(" << dirp << ") = " << d->offset << dendl; - return d->offset; -} - -void Client::seekdir(DIR *dirp, off_t 
offset) -{ - dout(3) << "seekdir(" << dirp << ", " << offset << ")" << dendl; - DirResult *d = (DirResult*)dirp; - d->offset = offset; -} - - - - - - - -/****** file i/o **********/ - -int Client::open(const char *relpath, int flags, mode_t mode) -{ - Mutex::Locker lock(client_lock); - tout << "open" << std::endl; - tout << relpath << std::endl; - tout << flags << std::endl; - - string abspath; - mkabspath(relpath, abspath); - - Fh *fh; - int r = _open(abspath.c_str(), flags, mode, &fh); - if (r >= 0) { - // allocate a integer file descriptor - assert(fh); - r = get_fd(); - assert(fd_map.count(r) == 0); - fd_map[r] = fh; - } - - tout << r << std::endl; - dout(3) << "open(" << relpath << ", " << flags << ") = " << r << dendl; - return r; -} - -int Client::_open(const char *path, int flags, mode_t mode, Fh **fhp) -{ - // go - MClientRequest *req = new MClientRequest(MDS_OP_OPEN, messenger->get_myinst()); - req->set_path(path); - req->args.open.flags = flags; - req->args.open.mode = mode; - - int cmode = req->get_open_file_mode(); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - // do i have the inode? - Dentry *dn = lookup(req->get_filepath()); - Inode *in = 0; - if (dn) { - in = dn->inode; - in->add_open(cmode); // make note of pending open, since it effects _wanted_ caps. - } - - MClientReply *reply = make_request(req); - assert(reply); - - insert_trace(reply); - int result = reply->get_result(); - - // success? - if (result >= 0) { - // yay - Fh *f = new Fh; - if (fhp) *fhp = f; - f->mode = cmode; - - // inode - f->inode = inode_map[reply->get_ino()]; - assert(f->inode); - f->inode->get(); - - if (!in) { - in = f->inode; - in->add_open(f->mode); - } - - // caps included? - int mds = reply->get_source().num(); - - if (in->caps.empty()) {// first caps? 
- dout(7) << " first caps on " << in->inode.ino << dendl; - in->get(); - } - - int new_caps = reply->get_file_caps(); - - assert(reply->get_file_caps_seq() >= in->caps[mds].seq); - if (reply->get_file_caps_seq() > in->caps[mds].seq) { - int old_caps = in->caps[mds].caps; - - dout(7) << "open got caps " << cap_string(new_caps) - << " (had " << cap_string(old_caps) << ")" - << " for " << in->ino() - << " seq " << reply->get_file_caps_seq() - << " from mds" << mds - << dendl; - - in->caps[mds].caps = new_caps; - in->caps[mds].seq = reply->get_file_caps_seq(); - - // we shouldn't ever lose caps at this point. - // actually, we might...? - assert((old_caps & ~in->caps[mds].caps) == 0); - - if (g_conf.client_oc) - in->fc.set_caps(new_caps); - - } else { - dout(7) << "open got SAME caps " << cap_string(new_caps) - << " for " << in->ino() - << " seq " << reply->get_file_caps_seq() - << " from mds" << mds - << dendl; - } - - dout(5) << "open success, fh is " << f << " combined caps " << cap_string(in->file_caps()) << dendl; - } - - delete reply; - - trim_cache(); - - return result; -} - - - - - -void Client::close_release(Inode *in) -{ - dout(10) << "close_release on " << in->ino() << dendl; - dout(10) << " wr " << in->num_open_wr << " rd " << in->num_open_rd - << " dirty " << in->fc.is_dirty() << " cached " << in->fc.is_cached() << dendl; - - if (!in->num_open_rd) - in->fc.release_clean(); - - int retain = 0; - if (in->num_open_wr || in->fc.is_dirty()) retain |= CAP_FILE_WR | CAP_FILE_WRBUFFER | CAP_FILE_WREXTEND; - if (in->num_open_rd || in->fc.is_cached()) retain |= CAP_FILE_RD | CAP_FILE_RDCACHE; - - release_caps(in, retain); // release caps now. 
-} - -void Client::close_safe(Inode *in) -{ - dout(10) << "close_safe on " << in->ino() << dendl; - put_inode(in); - if (unmounting) - mount_cond.Signal(); -} - - -int Client::close(int fd) -{ - Mutex::Locker lock(client_lock); - tout << "close" << std::endl; - tout << fd << std::endl; - - dout(3) << "close(" << fd << ")" << dendl; - assert(fd_map.count(fd)); - Fh *fh = fd_map[fd]; - _release(fh); - fd_map.erase(fd); - return 0; -} - -int Client::_release(Fh *f) -{ - //dout(3) << "op: client->close(open_files[ " << fh << " ]);" << dendl; - //dout(3) << "op: open_files.erase( " << fh << " );" << dendl; - dout(5) << "_release " << f << dendl; - Inode *in = f->inode; - - // update inode rd/wr counts - int before = in->file_caps_wanted(); - in->sub_open(f->mode); - int after = in->file_caps_wanted(); - - // does this change what caps we want? - if (before != after && after) - update_caps_wanted(in); - - // release caps right away? - dout(10) << "num_open_rd " << in->num_open_rd << " num_open_wr " << in->num_open_wr << dendl; - - if (g_conf.client_oc) { - // caching on. - if (in->num_open_rd == 0 && in->num_open_wr == 0) { - dout(20) << "calling empty" << dendl; - in->fc.empty(new C_Client_CloseRelease(this, in)); - } - else if (in->num_open_rd == 0) { - dout(20) << "calling release" << dendl; - in->fc.release_clean(); - close_release(in); - } - else if (in->num_open_wr == 0) { - dout(20) << "calling flush dirty" << dendl; - in->fc.flush_dirty(new C_Client_CloseRelease(this,in)); - } - - // pin until safe? - if (in->num_open_wr == 0 && !in->fc.all_safe()) { - dout(10) << "pinning ino " << in->ino() << " until safe" << dendl; - in->get(); - in->fc.add_safe_waiter(new C_Client_CloseSafe(this, in)); - } - } else { - // caching off. - if (in->num_open_rd == 0 && in->num_open_wr == 0) { - dout(10) << " releasing caps on " << in->ino() << dendl; - release_caps(in); // release caps now. 
- } - } - - put_inode( in ); - return 0; -} - - - -// ------------ -// read, write - - -off_t Client::lseek(int fd, off_t offset, int whence) -{ - Mutex::Locker lock(client_lock); - tout << "lseek" << std::endl; - tout << fd << std::endl; - tout << offset << std::endl; - tout << whence << std::endl; - - assert(fd_map.count(fd)); - Fh *f = fd_map[fd]; - Inode *in = f->inode; - - switch (whence) { - case SEEK_SET: - f->pos = offset; - break; - - case SEEK_CUR: - f->pos += offset; - break; - - case SEEK_END: - f->pos = in->inode.size + offset; - break; - - default: - assert(0); - } - - off_t pos = f->pos; - - dout(3) << "lseek(" << fd << ", " << offset << ", " << whence << ") = " << pos << dendl; - return pos; -} - - - -void Client::lock_fh_pos(Fh *f) -{ - dout(10) << "lock_fh_pos " << f << dendl; - - if (f->pos_locked || !f->pos_waiters.empty()) { - Cond cond; - f->pos_waiters.push_back(&cond); - dout(10) << "lock_fh_pos BLOCKING on " << f << dendl; - while (f->pos_locked || f->pos_waiters.front() != &cond) - cond.Wait(client_lock); - dout(10) << "lock_fh_pos UNBLOCKING on " << f << dendl; - assert(f->pos_waiters.front() == &cond); - f->pos_waiters.pop_front(); - } - - f->pos_locked = true; -} - -void Client::unlock_fh_pos(Fh *f) -{ - dout(10) << "unlock_fh_pos " << f << dendl; - f->pos_locked = false; -} - - - -//char *hackbuf = 0; - - -// blocking osd interface - -int Client::read(int fd, char *buf, off_t size, off_t offset) -{ - Mutex::Locker lock(client_lock); - tout << "read" << std::endl; - tout << fd << std::endl; - tout << size << std::endl; - tout << offset << std::endl; - - assert(fd_map.count(fd)); - Fh *f = fd_map[fd]; - bufferlist bl; - int r = _read(f, offset, size, &bl); - dout(3) << "read(" << fd << ", " << buf << ", " << size << ", " << offset << ") = " << r << dendl; - if (r >= 0) { - bl.copy(0, bl.length(), buf); - r = bl.length(); - } - return r; -} - -int Client::_read(Fh *f, off_t offset, off_t size, bufferlist *bl) -{ - Inode *in = f->inode; - 
- bool movepos = false; - if (offset < 0) { - lock_fh_pos(f); - offset = f->pos; - movepos = true; - } - - bool lazy = f->mode == FILE_MODE_LAZY; - - // determine whether read range overlaps with file - // ...ONLY if we're doing async io - if (!lazy && (in->file_caps() & (CAP_FILE_WRBUFFER|CAP_FILE_RDCACHE))) { - // we're doing buffered i/o. make sure we're inside the file. - // we can trust size info bc we get accurate info when buffering/caching caps are issued. - dout(10) << "file size: " << in->inode.size << dendl; - if (offset > 0 && offset >= in->inode.size) { - if (movepos) unlock_fh_pos(f); - return 0; - } - if (offset + size > (off_t)in->inode.size) - size = (off_t)in->inode.size - offset; - - if (size == 0) { - dout(10) << "read is size=0, returning 0" << dendl; - if (movepos) unlock_fh_pos(f); - return 0; - } - } else { - // unbuffered, synchronous file i/o. - // or lazy. - // defer to OSDs for file bounds. - } - - int r = 0; - int rvalue = 0; - - if (g_conf.client_oc) { - // object cache ON - rvalue = r = in->fc.read(offset, size, *bl, client_lock); // may block. - - /* - if (in->inode.ino == 0x10000000075 && hackbuf) { - int s = MIN(size, bl->length()); - char *v = bl->c_str(); - for (int a=0; afile_caps() & CAP_FILE_RD) == 0) { - dout(7) << " don't have read cap, waiting" << dendl; - Cond cond; - in->waitfor_read.push_back(&cond); - cond.Wait(client_lock); - } - // lazy cap? - while (lazy && (in->file_caps() & CAP_FILE_LAZYIO) == 0) { - dout(7) << " don't have lazy cap, waiting" << dendl; - Cond cond; - in->waitfor_lazy.push_back(&cond); - cond.Wait(client_lock); - } - - // do sync read - Cond cond; - bool done = false; - C_Cond *onfinish = new C_Cond(&cond, &done, &rvalue); - - Objecter::OSDRead *rd = filer->prepare_read(in->inode, offset, size, bl); - if (in->hack_balance_reads || - g_conf.client_hack_balance_reads) - rd->balance_reads = true; - r = objecter->readx(rd, onfinish); - assert(r >= 0); - - // wait! 
- while (!done) - cond.Wait(client_lock); - } - - if (movepos) { - // adjust fd pos - f->pos = offset+bl->length(); - unlock_fh_pos(f); - } - - // done! - return rvalue; -} - - - -/* - * hack -- - * until we properly implement synchronous writes wrt buffer cache, - * make sure we delay shutdown until they're all safe on disk! - */ -class C_Client_HackUnsafe : public Context { - Client *cl; -public: - C_Client_HackUnsafe(Client *c) : cl(c) {} - void finish(int) { - cl->hack_sync_write_safe(); - } -}; - -void Client::hack_sync_write_safe() -{ - client_lock.Lock(); - assert(unsafe_sync_write > 0); - unsafe_sync_write--; - dout(15) << "hack_sync_write_safe unsafe_sync_write = " << unsafe_sync_write << dendl; - if (unsafe_sync_write == 0 && unmounting) { - dout(10) << "hack_sync_write_safe -- no more unsafe writes, unmount can proceed" << dendl; - mount_cond.Signal(); - } - client_lock.Unlock(); -} - -int Client::write(int fd, const char *buf, off_t size, off_t offset) -{ - Mutex::Locker lock(client_lock); - tout << "write" << std::endl; - tout << fd << std::endl; - tout << size << std::endl; - tout << offset << std::endl; - - assert(fd_map.count(fd)); - Fh *fh = fd_map[fd]; - int r = _write(fh, offset, size, buf); - dout(3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl; - return r; -} - - -int Client::_write(Fh *f, off_t offset, off_t size, const char *buf) -{ - //dout(7) << "write fh " << fh << " size " << size << " offset " << offset << dendl; - Inode *in = f->inode; - - // use/adjust fd pos? - if (offset < 0) { - lock_fh_pos(f); - offset = f->pos; - f->pos = offset+size; - unlock_fh_pos(f); - } - - bool lazy = f->mode == FILE_MODE_LAZY; - - dout(10) << "cur file size is " << in->inode.size << " wr size " << in->file_wr_size << dendl; - - // time it. 
- utime_t start = g_clock.real_now(); - - // copy into fresh buffer (since our write may be resub, async) - bufferptr bp; - if (size > 0) bp = buffer::copy(buf, size); - bufferlist blist; - blist.push_back( bp ); - - if (g_conf.client_oc) { // buffer cache ON? - assert(objectcacher); - - // write (this may block!) - in->fc.write(offset, size, blist, client_lock); - - } else { - // legacy, inconsistent synchronous write. - dout(7) << "synchronous write" << dendl; - - // do we have write file cap? - while (!lazy && (in->file_caps() & CAP_FILE_WR) == 0) { - dout(7) << " don't have write cap, waiting" << dendl; - Cond cond; - in->waitfor_write.push_back(&cond); - cond.Wait(client_lock); - } - while (lazy && (in->file_caps() & CAP_FILE_LAZYIO) == 0) { - dout(7) << " don't have lazy cap, waiting" << dendl; - Cond cond; - in->waitfor_lazy.push_back(&cond); - cond.Wait(client_lock); - } - - // prepare write - Cond cond; - bool done = false; - C_Cond *onfinish = new C_Cond(&cond, &done); - C_Client_HackUnsafe *onsafe = new C_Client_HackUnsafe(this); - unsafe_sync_write++; - in->sync_writes++; - - dout(20) << " sync write start " << onfinish << dendl; - - filer->write(in->inode, offset, size, blist, 0, - onfinish, onsafe - //, 1+((int)g_clock.now()) / 10 //f->pos // hack hack test osd revision snapshots - ); - - while (!done) { - cond.Wait(client_lock); - dout(20) << " sync write bump " << onfinish << dendl; - } - - in->sync_writes--; - if (in->sync_writes == 0 && - !in->waitfor_no_write.empty()) { - for (list::iterator i = in->waitfor_no_write.begin(); - i != in->waitfor_no_write.end(); - i++) - (*i)->finish(0); - in->waitfor_no_write.clear(); - } - - dout(20) << " sync write done " << onfinish << dendl; - } - - // time - utime_t lat = g_clock.real_now(); - lat -= start; - if (client_logger) { - client_logger->finc("wrlsum",(double)lat); - client_logger->inc("wrlnum"); - } - - // assume success for now. FIXME. - off_t totalwritten = size; - - // extend file? 
- if (totalwritten + offset > in->inode.size) { - in->inode.size = in->file_wr_size = totalwritten + offset; - dout(7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl; - } else { - dout(7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->inode.size << dendl; - } - - // mtime - in->file_wr_mtime = in->inode.mtime = g_clock.real_now(); - - // ok! - return totalwritten; -} - -int Client::_flush(Fh *f) -{ - // no-op, for now. hrm. - return 0; -} - - -int Client::truncate(const char *relpath, off_t length) -{ - Mutex::Locker lock(client_lock); - tout << "truncate" << std::endl; - tout << relpath << std::endl; - tout << length << std::endl; - - string path; - mkabspath(relpath, path); - return _truncate(path.c_str(), length); -} - -int Client::_truncate(const char *file, off_t length) -{ - MClientRequest *req = new MClientRequest(MDS_OP_TRUNCATE, messenger->get_myinst()); - req->set_path(file); - req->args.truncate.length = length; - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - - dout(3) << "truncate(\"" << file << "\", " << length << ") = " << res << dendl; - return res; -} - -int Client::ftruncate(int fd, off_t length) -{ - Mutex::Locker lock(client_lock); - tout << "ftruncate" << std::endl; - tout << fd << std::endl; - tout << length << std::endl; - - assert(fd_map.count(fd)); - Fh *f = fd_map[fd]; - return _ftruncate(f, length); -} - -int Client::_ftruncate(Fh *fh, off_t length) -{ - MClientRequest *req = new MClientRequest(MDS_OP_TRUNCATE, messenger->get_myinst()); - req->args.truncate.ino = fh->inode->inode.ino; - req->args.truncate.length = length; - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req); - int res = 
reply->get_result(); - insert_trace(reply); - delete reply; - - dout(3) << "ftruncate(\"" << fh << "\", " << length << ") = " << res << dendl; - return res; -} - - -int Client::fsync(int fd, bool syncdataonly) -{ - Mutex::Locker lock(client_lock); - tout << "fsync" << std::endl; - tout << fd << std::endl; - tout << syncdataonly << std::endl; - - assert(fd_map.count(fd)); - Fh *f = fd_map[fd]; - int r = _fsync(f, syncdataonly); - dout(3) << "fsync(" << fd << ", " << syncdataonly << ") = " << r << dendl; - return r; -} - -int Client::_fsync(Fh *f, bool syncdataonly) -{ - int r = 0; - - Inode *in = f->inode; - - // metadata? - if (!syncdataonly) { - dout(0) << "fsync - not syncing metadata yet.. implement me" << dendl; - } - - // data? - Cond cond; - bool done = false; - if (!objectcacher->commit_set(in->ino(), - new C_Cond(&cond, &done))) { - // wait for callback - while (!done) cond.Wait(client_lock); - } - return r; -} - - -// not written yet, but i want to link! - -int Client::chdir(const char *path) -{ - Mutex::Locker lock(client_lock); - tout << "chdir" << std::endl; - tout << path << std::endl; - - // fake it for now! 
- string abs; - mkabspath(path, abs); - dout(3) << "chdir " << path << " -> cwd now " << abs << dendl; - cwd = abs; - return 0; -} - -int Client::statfs(const char *path, struct statvfs *stbuf) -{ - Mutex::Locker lock(client_lock); - tout << "statfs" << std::endl; - return _statfs(stbuf); -} - -int Client::ll_statfs(inodeno_t ino, struct statvfs *stbuf) -{ - Mutex::Locker lock(client_lock); - tout << "ll_statfs" << std::endl; - return _statfs(stbuf); -} - -int Client::_statfs(struct statvfs *stbuf) -{ - dout(3) << "_statfs" << dendl; - - Cond cond; - tid_t tid = ++last_tid; - StatfsRequest *req = new StatfsRequest(tid, &cond); - statfs_requests[tid] = req; - - int mon = monmap->pick_mon(); - messenger->send_message(new MStatfs(req->tid), monmap->get_inst(mon)); - - while (req->reply == 0) - cond.Wait(client_lock); - - // yay - memcpy(stbuf, &req->reply->stfs, sizeof(*stbuf)); - - statfs_requests.erase(req->tid); - delete req->reply; - delete req; - - int r = 0; - dout(3) << "_statfs = " << r << dendl; - return r; -} - -void Client::handle_statfs_reply(MStatfsReply *reply) -{ - if (statfs_requests.count(reply->tid) && - statfs_requests[reply->tid]->reply == 0) { - dout(10) << "handle_statfs_reply " << *reply << ", kicking waiter" << dendl; - statfs_requests[reply->tid]->reply = reply; - statfs_requests[reply->tid]->caller_cond->Signal(); - } else { - dout(10) << "handle_statfs_reply " << *reply << ", dup or old, dropping" << dendl; - delete reply; - } -} - - -int Client::lazyio_propogate(int fd, off_t offset, size_t count) -{ - client_lock.Lock(); - dout(3) << "op: client->lazyio_propogate(" << fd - << ", " << offset << ", " << count << ")" << dendl; - - assert(fd_map.count(fd)); - Fh *f = fd_map[fd]; - Inode *in = f->inode; - - if (f->mode & FILE_MODE_LAZY) { - // wait for lazy cap - while ((in->file_caps() & CAP_FILE_LAZYIO) == 0) { - dout(7) << " don't have lazy cap, waiting" << dendl; - Cond cond; - in->waitfor_lazy.push_back(&cond); - cond.Wait(client_lock); - 
} - - if (g_conf.client_oc) { - Cond cond; - bool done = false; - in->fc.flush_dirty(new C_SafeCond(&client_lock, &cond, &done)); - - while (!done) - cond.Wait(client_lock); - - } else { - // mmm, nothin to do. - } - } - - client_lock.Unlock(); - return 0; -} - -int Client::lazyio_synchronize(int fd, off_t offset, size_t count) -{ - client_lock.Lock(); - dout(3) << "op: client->lazyio_synchronize(" << fd - << ", " << offset << ", " << count << ")" << dendl; - - assert(fd_map.count(fd)); - Fh *f = fd_map[fd]; - Inode *in = f->inode; - - if (f->mode & FILE_MODE_LAZY) { - // wait for lazy cap - while ((in->file_caps() & CAP_FILE_LAZYIO) == 0) { - dout(7) << " don't have lazy cap, waiting" << dendl; - Cond cond; - in->waitfor_lazy.push_back(&cond); - cond.Wait(client_lock); - } - - if (g_conf.client_oc) { - in->fc.flush_dirty(0); // flush to invalidate. - in->fc.release_clean(); - } else { - // mm, nothin to do. - } - } - - client_lock.Unlock(); - return 0; -} - - - - -// ========================================= -// low level - -// ugly hack for ll -#define FUSE_SET_ATTR_MODE (1 << 0) -#define FUSE_SET_ATTR_UID (1 << 1) -#define FUSE_SET_ATTR_GID (1 << 2) -#define FUSE_SET_ATTR_SIZE (1 << 3) -#define FUSE_SET_ATTR_ATIME (1 << 4) -#define FUSE_SET_ATTR_MTIME (1 << 5) - -int Client::ll_lookup(inodeno_t parent, const char *name, struct stat *attr) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_lookup " << parent << " " << name << dendl; - tout << "ll_lookup" << std::endl; - tout << parent.val << std::endl; - tout << name << std::endl; - - string dname = name; - Inode *diri = 0; - Inode *in = 0; - int r = 0; - - if (inode_map.count(parent) == 0) { - dout(1) << "ll_lookup " << parent << " " << name << " -> ENOENT (parent DNE... WTF)" << dendl; - r = -ENOENT; - attr->st_ino = 0; - goto out; - } - diri = inode_map[parent]; - if (!diri->inode.is_dir()) { - dout(1) << "ll_lookup " << parent << " " << name << " -> ENOTDIR (parent not a dir... 
WTF)" << dendl; - r = -ENOTDIR; - attr->st_ino = 0; - goto out; - } - - // get the inode - if (diri->dir && - diri->dir->dentries.count(dname)) { - Dentry *dn = diri->dir->dentries[dname]; - touch_dn(dn); - in = dn->inode; - } else { - string path; - diri->make_path(path); - path += "/"; - path += name; - _do_lstat(path.c_str(), 0, &in); - } - if (in) { - fill_stat(in, attr); - _ll_get(in); - } else { - r = -ENOENT; - attr->st_ino = 0; - } - - out: - dout(3) << "ll_lookup " << parent << " " << name - << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl; - tout << attr->st_ino << std::endl; - return r; -} - -void Client::_ll_get(Inode *in) -{ - if (in->ll_ref == 0) - in->get(); - in->ll_get(); - dout(20) << "_ll_get " << in << " " << in->inode.ino << " -> " << in->ll_ref << dendl; -} - -int Client::_ll_put(Inode *in, int num) -{ - in->ll_put(num); - dout(20) << "_ll_put " << in << " " << in->inode.ino << " " << num << " -> " << in->ll_ref << dendl; - if (in->ll_ref == 0) { - put_inode(in); - return 0; - } else { - return in->ll_ref; - } -} - -void Client::_ll_drop_pins() -{ - dout(10) << "_ll_drop_pins" << dendl; - hash_map::iterator next; - for (hash_map::iterator it = inode_map.begin(); - it != inode_map.end(); - it = next) { - Inode *in = it->second; - next = it; - next++; - if (in->ll_ref) - _ll_put(in, in->ll_ref); - } -} - -bool Client::ll_forget(inodeno_t ino, int num) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_forget " << ino << " " << num << dendl; - tout << "ll_forget" << std::endl; - tout << ino.val << std::endl; - tout << num << std::endl; - - if (ino == 1) return true; // ignore forget on root. 
- - bool last = false; - if (inode_map.count(ino) == 0) { - dout(1) << "WARNING: ll_forget on " << ino << " " << num - << ", which I don't have" << dendl; - } else { - Inode *in = inode_map[ino]; - assert(in); - if (in->ll_ref < num) { - dout(1) << "WARNING: ll_forget on " << ino << " " << num << ", which only has ll_ref=" << in->ll_ref << dendl; - _ll_put(in, in->ll_ref); - last = true; - } else { - if (_ll_put(in, num) == 0) - last = true; - } - } - return last; -} - -Inode *Client::_ll_get_inode(inodeno_t ino) -{ - if (inode_map.count(ino) == 0) { - assert(ino == 1); // must be the root inode. - Inode *in; - int r = _do_lstat("/", 0, &in); - assert(r >= 0); - return in; - } else { - return inode_map[ino]; - } -} - - -int Client::ll_getattr(inodeno_t ino, struct stat *attr) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_getattr " << ino << dendl; - tout << "ll_getattr" << std::endl; - tout << ino.val << std::endl; - - Inode *in = _ll_get_inode(ino); - fill_stat(in, attr); - return 0; -} - -int Client::ll_setattr(inodeno_t ino, struct stat *attr, int mask) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_setattr " << ino << " mask " << hex << mask << dec << dendl; - tout << "ll_setattr" << std::endl; - tout << ino.val << std::endl; - tout << attr->st_mode << std::endl; - tout << attr->st_uid << std::endl; - tout << attr->st_gid << std::endl; - tout << attr->st_size << std::endl; - tout << attr->st_mtime << std::endl; - tout << attr->st_atime << std::endl; - tout << mask << std::endl; - - Inode *in = _ll_get_inode(ino); - - string path; - in->make_path(path); - - int r = 0; - if ((mask & FUSE_SET_ATTR_MODE) && - ((r = _chmod(path.c_str(), attr->st_mode)) < 0)) return r; - - if ((mask & FUSE_SET_ATTR_UID) && (mask & FUSE_SET_ATTR_GID) && - ((r = _chown(path.c_str(), attr->st_uid, attr->st_gid)) < 0)) return r; - //if ((mask & FUSE_SET_ATTR_GID) && - //(r = client->_chgrp(path.c_str(), attr->st_gid) < 0)) return r; - - if ((mask & FUSE_SET_ATTR_SIZE) && 
- ((r = _truncate(path.c_str(), attr->st_size)) < 0)) return r; - - if ((mask & FUSE_SET_ATTR_MTIME) && (mask & FUSE_SET_ATTR_ATIME)) { - if ((r = _utimes(path.c_str(), utime_t(attr->st_mtime,0), utime_t(attr->st_atime,0))) < 0) return r; - } else if (mask & FUSE_SET_ATTR_MTIME) { - if ((r = _utimes(path.c_str(), utime_t(attr->st_mtime,0), utime_t())) < 0) return r; - } else if (mask & FUSE_SET_ATTR_ATIME) { - if ((r = _utimes(path.c_str(), utime_t(), utime_t(attr->st_atime,0))) < 0) return r; - } - - assert(r == 0); - fill_stat(in, attr); - - dout(3) << "ll_setattr " << ino << " = " << r << dendl; - return 0; -} - -int Client::ll_readlink(inodeno_t ino, const char **value) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_readlink " << ino << dendl; - tout << "ll_readlink" << std::endl; - tout << ino.val << std::endl; - - Inode *in = _ll_get_inode(ino); - if (in->dn) touch_dn(in->dn); - - int r = 0; - if (in->inode.is_symlink()) { - *value = in->symlink->c_str(); - } else { - *value = ""; - r = -EINVAL; - } - dout(3) << "ll_readlink " << ino << " = " << r << " (" << *value << ")" << dendl; - return r; -} - -int Client::ll_mknod(inodeno_t parent, const char *name, mode_t mode, dev_t rdev, struct stat *attr) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_mknod " << parent << " " << name << dendl; - tout << "ll_mknod" << std::endl; - tout << parent.val << std::endl; - tout << name << std::endl; - tout << mode << std::endl; - tout << rdev << std::endl; - - Inode *diri = _ll_get_inode(parent); - - string path; - diri->make_path(path); - path += "/"; - path += name; - int r = _mknod(path.c_str(), mode, rdev); - if (r == 0) { - string dname(name); - Inode *in = diri->dir->dentries[dname]->inode; - fill_stat(in, attr); - _ll_get(in); - } - tout << attr->st_ino << std::endl; - dout(3) << "ll_mknod " << parent << " " << name - << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl; - return r; -} - -int Client::ll_mkdir(inodeno_t parent, const char 
*name, mode_t mode, struct stat *attr) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_mkdir " << parent << " " << name << dendl; - tout << "ll_mkdir" << std::endl; - tout << parent.val << std::endl; - tout << name << std::endl; - tout << mode << std::endl; - - Inode *diri = _ll_get_inode(parent); - - string path; - diri->make_path(path); - path += "/"; - path += name; - int r = _mkdir(path.c_str(), mode); - if (r == 0) { - string dname(name); - Inode *in = diri->dir->dentries[dname]->inode; - fill_stat(in, attr); - _ll_get(in); - } - tout << attr->st_ino << std::endl; - dout(3) << "ll_mkdir " << parent << " " << name - << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl; - return r; -} - -int Client::ll_symlink(inodeno_t parent, const char *name, const char *value, struct stat *attr) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_symlink " << parent << " " << name << " -> " << value << dendl; - tout << "ll_symlink" << std::endl; - tout << parent.val << std::endl; - tout << name << std::endl; - tout << value << std::endl; - - Inode *diri = _ll_get_inode(parent); - - string path; - diri->make_path(path); - path += "/"; - path += name; - int r = _symlink(value, path.c_str()); - if (r == 0) { - string dname(name); - Inode *in = diri->dir->dentries[dname]->inode; - fill_stat(in, attr); - _ll_get(in); - } - tout << attr->st_ino << std::endl; - dout(3) << "ll_symlink " << parent << " " << name - << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl; - return r; -} - -int Client::ll_unlink(inodeno_t ino, const char *name) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_unlink " << ino << " " << name << dendl; - tout << "ll_unlink" << std::endl; - tout << ino.val << std::endl; - tout << name << std::endl; - - Inode *diri = _ll_get_inode(ino); - - string path; - diri->make_path(path); - path += "/"; - path += name; - return _unlink(path.c_str()); -} - -int Client::ll_rmdir(inodeno_t ino, const char *name) -{ - 
Mutex::Locker lock(client_lock); - dout(3) << "ll_rmdir " << ino << " " << name << dendl; - tout << "ll_rmdir" << std::endl; - tout << ino.val << std::endl; - tout << name << std::endl; - - Inode *diri = _ll_get_inode(ino); - - string path; - diri->make_path(path); - path += "/"; - path += name; - return _rmdir(path.c_str()); -} - -int Client::ll_rename(inodeno_t parent, const char *name, inodeno_t newparent, const char *newname) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_rename " << parent << " " << name << " to " - << newparent << " " << newname << dendl; - tout << "ll_rename" << std::endl; - tout << parent.val << std::endl; - tout << name << std::endl; - tout << newparent.val << std::endl; - tout << newname << std::endl; - - Inode *diri = _ll_get_inode(parent); - string path; - diri->make_path(path); - path += "/"; - path += name; - - Inode *newdiri = _ll_get_inode(newparent); - string newpath; - newdiri->make_path(newpath); - newpath += "/"; - newpath += newname; - - return _rename(path.c_str(), newpath.c_str()); -} - -int Client::ll_link(inodeno_t ino, inodeno_t newparent, const char *newname, struct stat *attr) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_link " << ino << " to " << newparent << " " << newname << dendl; - tout << "ll_link" << std::endl; - tout << ino.val << std::endl; - tout << newparent << std::endl; - tout << newname << std::endl; - - Inode *old = _ll_get_inode(ino); - Inode *diri = _ll_get_inode(newparent); - - string path; - old->make_path(path); - - string newpath; - diri->make_path(newpath); - newpath += "/"; - newpath += newname; - - int r = _link(path.c_str(), newpath.c_str()); - if (r == 0) { - Inode *in = _ll_get_inode(ino); - fill_stat(in, attr); - _ll_get(in); - } - return r; -} - -int Client::ll_opendir(inodeno_t ino, void **dirpp) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_opendir " << ino << dendl; - tout << "ll_opendir" << std::endl; - tout << ino.val << std::endl; - - Inode *diri = 
inode_map[ino]; - assert(diri); - string path; - diri->make_path(path); - - int r = _opendir(path.c_str(), (DirResult**)dirpp); - - tout << (unsigned long)*dirpp << std::endl; - - dout(3) << "ll_opendir " << ino << " = " << r << " (" << *dirpp << ")" << dendl; - return r; -} - -void Client::ll_releasedir(void *dirp) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_releasedir " << dirp << dendl; - tout << "ll_releasedir" << std::endl; - tout << (unsigned long)dirp << std::endl; - _closedir((DirResult*)dirp); -} - -int Client::ll_open(inodeno_t ino, int flags, Fh **fhp) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_open " << ino << " " << flags << dendl; - tout << "ll_open" << std::endl; - tout << ino.val << std::endl; - tout << flags << std::endl; - - Inode *in = _ll_get_inode(ino); - string path; - in->make_path(path); - - int r = _open(path.c_str(), flags, 0, fhp); - - tout << (unsigned long)*fhp << std::endl; - dout(3) << "ll_open " << ino << " " << flags << " = " << r << " (" << *fhp << ")" << dendl; - return r; -} - -int Client::ll_create(inodeno_t parent, const char *name, mode_t mode, int flags, - struct stat *attr, Fh **fhp) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_create " << parent << " " << name << " 0" << oct << mode << dec << " " << flags << dendl; - tout << "ll_create" << std::endl; - tout << parent.val << std::endl; - tout << name << std::endl; - tout << mode << std::endl; - tout << flags << std::endl; - - Inode *pin = _ll_get_inode(parent); - string path; - pin->make_path(path); - path += "/"; - path += name; - - int r = _open(path.c_str(), flags|O_CREAT, mode, fhp); - if (r >= 0) { - Inode *in = (*fhp)->inode; - fill_stat(in, attr); - _ll_get(in); - } else { - attr->st_ino = 0; - } - tout << (unsigned long)*fhp << std::endl; - tout << attr->st_ino << std::endl; - dout(3) << "ll_create " << parent << " " << name << " 0" << oct << mode << dec << " " << flags - << " = " << r << " (" << *fhp << " " << hex << attr->st_ino << 
dec << ")" << dendl; - return 0; -} - -int Client::ll_read(Fh *fh, off_t off, off_t len, bufferlist *bl) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_read " << fh << " " << off << "~" << len << dendl; - tout << "ll_read" << std::endl; - tout << (unsigned long)fh << std::endl; - tout << off << std::endl; - tout << len << std::endl; - - return _read(fh, off, len, bl); -} - -int Client::ll_write(Fh *fh, off_t off, off_t len, const char *data) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_write " << fh << " " << off << "~" << len << dendl; - tout << "ll_write" << std::endl; - tout << (unsigned long)fh << std::endl; - tout << off << std::endl; - tout << len << std::endl; - - return _write(fh, off, len, data); -} - -int Client::ll_flush(Fh *fh) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_flush " << fh << dendl; - tout << "ll_flush" << std::endl; - tout << (unsigned long)fh << std::endl; - - return _flush(fh); -} - -int Client::ll_fsync(Fh *fh, bool syncdataonly) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_fsync " << fh << dendl; - tout << "ll_fsync" << std::endl; - tout << (unsigned long)fh << std::endl; - - return _fsync(fh, syncdataonly); -} - - -int Client::ll_release(Fh *fh) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_release " << fh << dendl; - tout << "ll_release" << std::endl; - tout << (unsigned long)fh << std::endl; - - _release(fh); - return 0; -} - - - - - - -// ========================================= -// layout - - -int Client::describe_layout(int fd, FileLayout *lp) -{ - Mutex::Locker lock(client_lock); - - assert(fd_map.count(fd)); - Fh *f = fd_map[fd]; - Inode *in = f->inode; - - *lp = in->inode.layout; - - dout(3) << "describe_layout(" << fd << ") = 0" << dendl; - return 0; -} - -int Client::get_stripe_unit(int fd) -{ - FileLayout layout; - describe_layout(fd, &layout); - return layout.fl_stripe_unit; -} - -int Client::get_stripe_width(int fd) -{ - FileLayout layout; - describe_layout(fd, &layout); - 
return ceph_file_layout_stripe_width(layout); -} - -int Client::get_stripe_period(int fd) -{ - FileLayout layout; - describe_layout(fd, &layout); - return ceph_file_layout_period(layout); -} - -int Client::enumerate_layout(int fd, list& result, - off_t length, off_t offset) -{ - Mutex::Locker lock(client_lock); - - assert(fd_map.count(fd)); - Fh *f = fd_map[fd]; - Inode *in = f->inode; - - // map to a list of extents - filer->file_to_extents(in->inode, offset, length, result); - - dout(3) << "enumerate_layout(" << fd << ", " << length << ", " << offset << ") = 0" << dendl; - return 0; -} - - -void Client::ms_handle_failure(Message *m, const entity_inst_t& inst) -{ - entity_name_t dest = inst.name; - - if (dest.is_mon()) { - // resend to a different monitor. - int mon = monmap->pick_mon(true); - dout(0) << "ms_handle_failure " << *m << " to " << inst - << ", resending to mon" << mon - << dendl; - messenger->send_message(m, monmap->get_inst(mon)); - } - else if (dest.is_osd()) { - objecter->ms_handle_failure(m, dest, inst); - } - else if (dest.is_mds()) { - dout(0) << "ms_handle_failure " << *m << " to " << inst << dendl; - //failed_mds.insert(dest.num()); - } - else { - // client? - dout(0) << "ms_handle_failure " << *m << " to " << inst << ", dropping" << dendl; - delete m; - } -} - diff --git a/branches/sage/ebofs2/client/Client.h b/branches/sage/ebofs2/client/Client.h deleted file mode 100644 index 727098906c617..0000000000000 --- a/branches/sage/ebofs2/client/Client.h +++ /dev/null @@ -1,847 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __CLIENT_H -#define __CLIENT_H - - -#include "mds/MDSMap.h" -#include "osd/OSDMap.h" -#include "mon/MonMap.h" - -#include "msg/Message.h" -#include "msg/Dispatcher.h" -#include "msg/Messenger.h" - -#include "messages/MClientReply.h" - -#include "include/types.h" -#include "include/lru.h" -#include "include/filepath.h" -#include "include/interval_set.h" - -#include "common/Mutex.h" -#include "common/Timer.h" - -#include "FileCache.h" - - -// stl -#include -#include -#include -using std::set; -using std::map; -using std::fstream; - -#include -using namespace __gnu_cxx; - - - -class MStatfsReply; -class MClientSession; -class MClientRequest; -class MClientRequestForward; - -class Filer; -class Objecter; -class ObjectCacher; - -extern class LogType client_logtype; -extern class Logger *client_logger; - - - -// ============================================ -// types for my local metadata cache -/* basic structure: - - - Dentries live in an LRU loop. they get expired based on last access. - see include/lru.h. items can be bumped to "mid" or "top" of list, etc. - - Inode has ref count for each Fh, Dir, or Dentry that points to it. - - when Inode ref goes to 0, it's expired. - - when Dir is empty, it's removed (and it's Inode ref--) - -*/ - -class Dir; -class Inode; - -class Dentry : public LRUObject { - public: - string name; // sort of lame - //const char *name; - Dir *dir; - Inode *inode; - int ref; // 1 if there's a dir beneath me. 
- - void get() { assert(ref == 0); ref++; lru_pin(); } - void put() { assert(ref == 1); ref--; lru_unpin(); } - - Dentry() : dir(0), inode(0), ref(0) { } - - /*Dentry() : name(0), dir(0), inode(0), ref(0) { } - Dentry(string& n) : name(0), dir(0), inode(0), ref(0) { - name = new char[n.length()+1]; - strcpy((char*)name, n.c_str()); - } - ~Dentry() { - delete[] name; - }*/ -}; - -class Dir { - public: - Inode *parent_inode; // my inode - //hash_map, eqstr> dentries; - hash_map dentries; - - Dir(Inode* in) { parent_inode = in; } - - bool is_empty() { return dentries.empty(); } -}; - - -class InodeCap { - public: - int caps; - long seq; - InodeCap() : caps(0), seq(0) {} -}; - - -class Inode { - public: - inode_t inode; // the actual inode - utime_t valid_until; - int mask; - - // about the dir (if this is one!) - int dir_auth; - set dir_contacts; - bool dir_hashed, dir_replicated; - - // per-mds caps - map caps; // mds -> InodeCap - map stale_caps; // mds -> cap .. stale - - utime_t file_wr_mtime; // [writers] time of last write - off_t file_wr_size; // [writers] largest offset we've written to - int num_open_rd, num_open_wr, num_open_lazy; // num readers, writers - - int ref; // ref count. 1 for each dentry, fh that links to me. - int ll_ref; // separate ref count for ll client - Dir *dir; // if i'm a dir. - Dentry *dn; // if i'm linked to a dentry. 
- string *symlink; // symlink content, if it's a symlink - fragtree_t dirfragtree; - map fragmap; // known frag -> mds mappings - - // for caching i/o mode - FileCache fc; - - // for sync i/o mode - int sync_reads; // sync reads in progress - int sync_writes; // sync writes in progress - - list waitfor_write; - list waitfor_read; - list waitfor_lazy; - list waitfor_no_read, waitfor_no_write; - - // - bool hack_balance_reads; - // - - void make_path(string& p) { - if (dn) { - if (dn->dir && dn->dir->parent_inode) - dn->dir->parent_inode->make_path(p); - p += "/"; - p += dn->name; - } - } - - void get() { - ref++; - //cout << "inode.get on " << this << " " << hex << inode.ino << dec << " now " << ref << endl; - } - void put(int n=1) { - ref -= n; assert(ref >= 0); - //cout << "inode.put on " << this << " " << hex << inode.ino << dec << " now " << ref << endl; - } - - void ll_get() { - ll_ref++; - } - void ll_put(int n=1) { - assert(ll_ref >= n); - ll_ref -= n; - } - - Inode(inode_t _inode, ObjectCacher *_oc) : - inode(_inode), - valid_until(0, 0), - dir_auth(-1), dir_hashed(false), dir_replicated(false), - file_wr_mtime(0, 0), file_wr_size(0), - num_open_rd(0), num_open_wr(0), num_open_lazy(0), - ref(0), ll_ref(0), - dir(0), dn(0), symlink(0), - fc(_oc, _inode), - sync_reads(0), sync_writes(0), - hack_balance_reads(false) - { } - ~Inode() { - if (symlink) { delete symlink; symlink = 0; } - } - - inodeno_t ino() { return inode.ino; } - - bool is_dir() { - return (inode.mode & INODE_TYPE_MASK) == INODE_MODE_DIR; - } - - int file_caps() { - int c = 0; - for (map::iterator it = caps.begin(); - it != caps.end(); - it++) - c |= it->second.caps; - for (map::iterator it = stale_caps.begin(); - it != stale_caps.end(); - it++) - c |= it->second.caps; - return c; - } - - int file_caps_wanted() { - int w = 0; - if (num_open_rd) w |= CAP_FILE_RD|CAP_FILE_RDCACHE; - if (num_open_wr) w |= CAP_FILE_WR|CAP_FILE_WRBUFFER; - if (num_open_lazy) w |= CAP_FILE_LAZYIO; - if (fc.is_dirty()) 
w |= CAP_FILE_WRBUFFER; - if (fc.is_cached()) w |= CAP_FILE_RDCACHE; - return w; - } - - void add_open(int cmode) { - if (cmode & FILE_MODE_R) num_open_rd++; - if (cmode & FILE_MODE_W) num_open_wr++; - if (cmode & FILE_MODE_LAZY) num_open_lazy++; - } - void sub_open(int cmode) { - if (cmode & FILE_MODE_R) num_open_rd--; - if (cmode & FILE_MODE_W) num_open_wr--; - if (cmode & FILE_MODE_LAZY) num_open_lazy--; - } - - int authority(const string& dname) { - if (!dirfragtree.empty()) { - __gnu_cxx::hash H; - frag_t fg = dirfragtree[H(dname)]; - while (fg != frag_t()) { - if (fragmap.count(fg) && - fragmap[fg] >= 0) { - //cout << "picked frag ino " << inode.ino << " dname " << dname << " fg " << fg << " mds" << fragmap[fg] << std::endl; - return fragmap[fg]; - } - fg = frag_t(fg.value(), fg.bits()-1); // try more general... - } - } - return authority(); - } - - int authority() { - if (dir_auth >= 0) - return dir_auth; - - assert(dn); - return dn->dir->parent_inode->authority(dn->name); - } - - - int pick_replica(MDSMap *mdsmap) { - // replicas? 
- /* fixme - if (//ino() > 1ULL && - dir_contacts.size()) { - set::iterator it = dir_contacts.begin(); - if (dir_contacts.size() == 1) - return *it; - else { - //cout << "dir_contacts on " << inode.ino << " is " << dir_contacts << std::endl; - int r = 1 + (rand() % dir_contacts.size()); - int a = authority(); - while (r--) { - it++; - if (mdsmap->is_down(*it)) it++; - if (it == dir_contacts.end()) it = dir_contacts.begin(); - if (*it == a) it++; // skip the authority - if (it == dir_contacts.end()) it = dir_contacts.begin(); - } - return *it; - } - } - */ - - if (dir_replicated) {// || ino() == 1) { - // pick a random mds that isn't the auth - set s; - mdsmap->get_in_mds_set(s); - set::iterator it = s.begin(); - if (s.empty()) - return 0; - if (s.size() == 1) - return *it; - else { - //cout << "dir_contacts on " << inode.ino << " is " << dir_contacts << std::endl; - int r = 1 + (rand() % s.size()); - int a = authority(); - while (r--) { - it++; - if (mdsmap->is_down(*it)) it++; - if (it == s.end()) it = s.begin(); - if (*it == a) it++; // skip the authority - if (it == s.end()) it = s.begin(); - } - //if (inode.ino == 1) cout << "chose " << *it << " from " << s << std::endl; - return *it; - } - //cout << "num_mds is " << mdcluster->get_num_mds() << endl; - //return mdsmap->get_random_in_mds(); - //return rand() % mdsmap->get_num_mds(); // huh.. pick a random mds! - } - else - return authority(); - } - - - // open Dir for an inode. if it's not open, allocated it (and pin dentry in memory). 
- Dir *open_dir() { - if (!dir) { - if (dn) dn->get(); // pin dentry - get(); // pin inode - dir = new Dir(this); - } - return dir; - } - -}; - - - - -// file handle for any open file state - -struct Fh { - Inode *inode; - off_t pos; - int mds; // have to talk to mds we opened with (for now) - int mode; // the mode i opened the file with - - bool is_lazy() { return mode & O_LAZY; } - - bool pos_locked; // pos is currently in use - list pos_waiters; // waiters for pos - - Fh() : inode(0), pos(0), mds(0), mode(0), pos_locked(false) {} -}; - - - - - -// ======================================================== -// client interface - -class Client : public Dispatcher { - public: - - /* getdir result */ - struct DirEntry { - string d_name; - struct stat st; - int stmask; - DirEntry(const string &s) : d_name(s), stmask(0) {} - DirEntry(const string &n, struct stat& s, int stm) : d_name(n), st(s), stmask(stm) {} - }; - - struct DirResult { - static const int SHIFT = 28; - static const int64_t MASK = (1 << SHIFT) - 1; - static const off_t END = 1ULL << (SHIFT + 32); - - string path; - Inode *inode; - int64_t offset; // high bits: frag_t, low bits: an offset - map > buffer; - - DirResult(const char *p, Inode *in=0) : path(p), inode(in), offset(0) { - if (inode) inode->get(); - } - DirResult(const string &p, Inode *in=0) : path(p), inode(in), offset(0) { - if (inode) inode->get(); - } - - frag_t frag() { return frag_t(offset >> SHIFT); } - unsigned fragpos() { return offset & MASK; } - - void next_frag() { - frag_t fg = offset >> SHIFT; - if (fg.is_rightmost()) - set_end(); - else - set_frag(fg.next()); - } - void set_frag(frag_t f) { - offset = (uint64_t)f << SHIFT; - assert(sizeof(offset) == 8); - } - void set_end() { offset = END; } - bool at_end() { return (offset == END); } - }; - - - // cluster descriptors - MDSMap *mdsmap; - OSDMap *osdmap; - - SafeTimer timer; - - protected: - Messenger *messenger; - int whoami; - MonMap *monmap; - - // mds sessions - map 
mds_sessions; // mds -> push seq - map > waiting_for_session; - list waiting_for_mdsmap; - - void handle_client_session(MClientSession *m); - void send_reconnect(int mds); - - // mds requests - struct MetaRequest { - tid_t tid; - MClientRequest *request; - bufferlist request_payload; // in case i have to retry - - bool idempotent; // is request idempotent? - set mds; // who i am asking - int resend_mds; // someone wants you to (re)send the request here - int num_fwd; // # of times i've been forwarded - int retry_attempt; - - MClientReply *reply; // the reply - - Cond *caller_cond; // who to take up - Cond *dispatch_cond; // who to kick back - - MetaRequest(MClientRequest *req, tid_t t) : - tid(t), request(req), - idempotent(false), resend_mds(-1), num_fwd(0), retry_attempt(0), - reply(0), - caller_cond(0), dispatch_cond(0) { } - }; - tid_t last_tid; - map mds_requests; - set failed_mds; - - struct StatfsRequest { - tid_t tid; - MStatfsReply *reply; - Cond *caller_cond; - StatfsRequest(tid_t t, Cond *cc) : tid(t), reply(0), caller_cond(cc) {} - }; - map statfs_requests; - - MClientReply *make_request(MClientRequest *req, int use_auth=-1); - int choose_target_mds(MClientRequest *req); - void send_request(MetaRequest *request, int mds); - void kick_requests(int mds); - void handle_client_request_forward(MClientRequestForward *reply); - void handle_client_reply(MClientReply *reply); - void handle_statfs_reply(MStatfsReply *reply); - - bool mounted; - bool unmounting; - Cond mount_cond; - int my_instance; - - int unsafe_sync_write; -public: - entity_name_t get_myname() { return messenger->get_myname(); } - void hack_sync_write_safe(); - -protected: - Filer *filer; - ObjectCacher *objectcacher; - Objecter *objecter; // (non-blocking) osd interface - - // cache - hash_map inode_map; - Inode* root; - LRU lru; // lru list of Dentry's in our local metadata cache. - - // cap weirdness - map > cap_reap_queue; // ino -> mds -> msg .. 
set of (would-be) stale caps to reap - - - // file handles, etc. - string cwd; - interval_set free_fd_set; // unused fds - hash_map fd_map; - - int get_fd() { - int fd = free_fd_set.start(); - free_fd_set.erase(fd, 1); - return fd; - } - void put_fd(int fd) { - free_fd_set.insert(fd, 1); - } - - void mkabspath(const char *rel, string& abs) { - if (rel[0] == '/') { - abs = rel; - } else { - abs = cwd; - abs += "/"; - abs += rel; - } - } - - - // global client lock - // - protects Client and buffer cache both! - Mutex client_lock; - - - // -- metadata cache stuff - - // decrease inode ref. delete if dangling. - void put_inode(Inode *in, int n=1) { - //cout << "put_inode on " << in << " " << in->inode.ino << endl; - in->put(n); - if (in->ref == 0) { - //cout << "put_inode deleting " << in->inode.ino << endl; - inode_map.erase(in->inode.ino); - if (in == root) root = 0; - delete in; - } - } - - void close_dir(Dir *dir) { - assert(dir->is_empty()); - - Inode *in = dir->parent_inode; - if (in->dn) in->dn->put(); // unpin dentry - - delete in->dir; - in->dir = 0; - put_inode(in); // unpin inode - } - - //int get_cache_size() { return lru.lru_get_size(); } - //void set_cache_size(int m) { lru.lru_set_max(m); } - - Dentry* link(Dir *dir, const string& name, Inode *in) { - Dentry *dn = new Dentry; - dn->name = name; - - // link to dir - dn->dir = dir; - //cout << "link dir " << dir->parent_inode->inode.ino << " '" << name << "' -> inode " << in->inode.ino << endl; - dir->dentries[dn->name] = dn; - - // link to inode - dn->inode = in; - assert(in->dn == 0); - in->dn = dn; - in->get(); - - if (in->dir) dn->get(); // dir -> dn pin - - lru.lru_insert_mid(dn); // mid or top? 
- return dn; - } - - void unlink(Dentry *dn) { - Inode *in = dn->inode; - assert(in->dn == dn); - - // unlink from inode - if (dn->inode->dir) dn->put(); // dir -> dn pin - dn->inode = 0; - in->dn = 0; - put_inode(in); - - // unlink from dir - dn->dir->dentries.erase(dn->name); - if (dn->dir->is_empty()) - close_dir(dn->dir); - dn->dir = 0; - - // delete den - lru.lru_remove(dn); - delete dn; - } - - Dentry *relink(Dir *dir, const string& name, Inode *in) { - Dentry *olddn = in->dn; - Dir *olddir = olddn->dir; // note: might == dir! - - // newdn, attach to inode. don't touch inode ref. - Dentry *newdn = new Dentry; - newdn->dir = dir; - newdn->name = name; - newdn->inode = in; - in->dn = newdn; - - if (in->dir) { // dir -> dn pin - newdn->get(); - olddn->put(); - } - - // unlink old dn from dir - olddir->dentries.erase(olddn->name); - olddn->inode = 0; - olddn->dir = 0; - lru.lru_remove(olddn); - - // link new dn to dir - dir->dentries[name] = newdn; - lru.lru_insert_mid(newdn); - - // olddir now empty? (remember, olddir might == dir) - if (olddir->is_empty()) - close_dir(olddir); - - return newdn; - } - - // move dentry to top of lru - void touch_dn(Dentry *dn) { lru.lru_touch(dn); } - - // trim cache. 
- void trim_cache(); - void dump_inode(Inode *in, set& did); - void dump_cache(); // debug - - // find dentry based on filepath - Dentry *lookup(filepath& path); - - int fill_stat(Inode *in, struct stat *st); - - - // trace generation - ofstream traceout; - - - // friends - friend class SyntheticClient; - - public: - Client(Messenger *m, MonMap *mm, int i=0); - ~Client(); - void tear_down_cache(); - - int get_nodeid() { return whoami; } - - void init(); - void shutdown(); - - // messaging - void dispatch(Message *m); - - void handle_unmount(Message*); - void handle_mds_map(class MMDSMap *m); - - // file caps - void handle_file_caps(class MClientFileCaps *m); - void implemented_caps(class MClientFileCaps *m, Inode *in); - void release_caps(Inode *in, int retain=0); - void update_caps_wanted(Inode *in); - - void close_release(Inode *in); - void close_safe(Inode *in); - - void lock_fh_pos(Fh *f); - void unlock_fh_pos(Fh *f); - - // metadata cache - Inode* insert_inode(Dir *dir, InodeStat *in_info, const string& dn); - void update_dir_dist(Inode *in, DirStat *st); - Inode* insert_trace(MClientReply *reply); - - // ---------------------- - // fs ops. 
-private: - void _try_mount(); - void _mount_timeout(); - Context *mount_timeout_event; - - class C_MountTimeout : public Context { - Client *client; - public: - C_MountTimeout(Client *c) : client(c) { } - void finish(int r) { - if (r >= 0) client->_mount_timeout(); - } - }; - - // some helpers - int _do_lstat(const char *path, int mask, Inode **in); - int _opendir(const char *name, DirResult **dirpp); - void _readdir_add_dirent(DirResult *dirp, const string& name, Inode *in); - void _readdir_fill_dirent(struct dirent *de, DirEntry *entry, off_t); - bool _readdir_have_frag(DirResult *dirp); - void _readdir_next_frag(DirResult *dirp); - void _readdir_rechoose_frag(DirResult *dirp); - int _readdir_get_frag(DirResult *dirp); - void _closedir(DirResult *dirp); - void _ll_get(Inode *in); - int _ll_put(Inode *in, int num); - void _ll_drop_pins(); - - // internal interface - // call these with client_lock held! - int _link(const char *existing, const char *newname); - int _unlink(const char *path); - int _rename(const char *from, const char *to); - int _mkdir(const char *path, mode_t mode); - int _rmdir(const char *path); - int _readlink(const char *path, char *buf, off_t size); - int _symlink(const char *existing, const char *newname); - int _lstat(const char *path, struct stat *stbuf); - int _chmod(const char *relpath, mode_t mode); - int _chown(const char *relpath, uid_t uid, gid_t gid); - int _utimes(const char *relpath, utime_t mtime, utime_t atime); - int _mknod(const char *path, mode_t mode, dev_t rdev); - int _open(const char *path, int flags, mode_t mode, Fh **fhp); - int _release(Fh *fh); - int _read(Fh *fh, off_t offset, off_t size, bufferlist *bl); - int _write(Fh *fh, off_t offset, off_t size, const char *buf); - int _flush(Fh *fh); - int _truncate(const char *file, off_t length); - int _ftruncate(Fh *fh, off_t length); - int _fsync(Fh *fh, bool syncdataonly); - int _statfs(struct statvfs *stbuf); - - -public: - int mount(); - int unmount(); - - // these 
shoud (more or less) mirror the actual system calls. - int statfs(const char *path, struct statvfs *stbuf); - - // crap - int chdir(const char *s); - const string getcwd() { return cwd; } - - // namespace ops - int getdir(const char *relpath, list& names); // get the whole dir at once. - - int opendir(const char *name, DIR **dirpp); - int closedir(DIR *dirp); - int readdir_r(DIR *dirp, struct dirent *de); - int readdirplus_r(DIR *dirp, struct dirent *de, struct stat *st, int *stmask); - void rewinddir(DIR *dirp); - off_t telldir(DIR *dirp); - void seekdir(DIR *dirp, off_t offset); - - struct dirent_plus *readdirplus(DIR *dirp); - int readdirplus_r(DIR *dirp, struct dirent_plus *entry, struct dirent_plus **result); - struct dirent_lite *readdirlite(DIR *dirp); - int readdirlite_r(DIR *dirp, struct dirent_lite *entry, struct dirent_lite **result); - - int link(const char *existing, const char *newname); - int unlink(const char *path); - int rename(const char *from, const char *to); - - // dirs - int mkdir(const char *path, mode_t mode); - int rmdir(const char *path); - - // symlinks - int readlink(const char *path, char *buf, off_t size); - int symlink(const char *existing, const char *newname); - - // inode stuff - int lstat(const char *path, struct stat *stbuf); - int lstatlite(const char *path, struct statlite *buf); - - int chmod(const char *path, mode_t mode); - int chown(const char *path, uid_t uid, gid_t gid); - int utime(const char *path, struct utimbuf *buf); - - // file ops - int mknod(const char *path, mode_t mode, dev_t rdev=0); - int open(const char *path, int flags, mode_t mode=0); - int close(int fd); - off_t lseek(int fd, off_t offset, int whence); - int read(int fd, char *buf, off_t size, off_t offset=-1); - int write(int fd, const char *buf, off_t size, off_t offset=-1); - int fake_write_size(int fd, off_t size); - int truncate(const char *file, off_t size); - int ftruncate(int fd, off_t size); - int fsync(int fd, bool syncdataonly); - - // hpc 
lazyio - int lazyio_propogate(int fd, off_t offset, size_t count); - int lazyio_synchronize(int fd, off_t offset, size_t count); - - // expose file layout - int describe_layout(int fd, FileLayout* layout); - int get_stripe_unit(int fd); - int get_stripe_width(int fd); - int get_stripe_period(int fd); - int enumerate_layout(int fd, list& result, - off_t length, off_t offset); - - // low-level interface - int ll_lookup(inodeno_t parent, const char *name, struct stat *attr); - bool ll_forget(inodeno_t ino, int count); - Inode *_ll_get_inode(inodeno_t ino); - int ll_getattr(inodeno_t ino, struct stat *st); - int ll_setattr(inodeno_t ino, struct stat *st, int mask); - int ll_opendir(inodeno_t ino, void **dirpp); - void ll_releasedir(void *dirp); - int ll_readlink(inodeno_t ino, const char **value); - int ll_mknod(inodeno_t ino, const char *name, mode_t mode, dev_t rdev, struct stat *attr); - int ll_mkdir(inodeno_t ino, const char *name, mode_t mode, struct stat *attr); - int ll_symlink(inodeno_t ino, const char *name, const char *value, struct stat *attr); - int ll_unlink(inodeno_t ino, const char *name); - int ll_rmdir(inodeno_t ino, const char *name); - int ll_rename(inodeno_t parent, const char *name, inodeno_t newparent, const char *newname); - int ll_link(inodeno_t ino, inodeno_t newparent, const char *newname, struct stat *attr); - int ll_open(inodeno_t ino, int flags, Fh **fh); - int ll_create(inodeno_t parent, const char *name, mode_t mode, int flags, struct stat *attr, Fh **fh); - int ll_read(Fh *fh, off_t off, off_t len, bufferlist *bl); - int ll_write(Fh *fh, off_t off, off_t len, const char *data); - int ll_flush(Fh *fh); - int ll_fsync(Fh *fh, bool syncdataonly); - int ll_release(Fh *fh); - int ll_statfs(inodeno_t, struct statvfs *stbuf); - - - // failure - void ms_handle_failure(Message*, const entity_inst_t& inst); -}; - -#endif diff --git a/branches/sage/ebofs2/client/FileCache.cc b/branches/sage/ebofs2/client/FileCache.cc deleted file mode 100644 index 
1adec4aaabee7..0000000000000 --- a/branches/sage/ebofs2/client/FileCache.cc +++ /dev/null @@ -1,266 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "include/types.h" - -#include "FileCache.h" -#include "osdc/ObjectCacher.h" - -#include "msg/Messenger.h" - -#include "config.h" -#define dout(x) if (x <= g_conf.debug_client) *_dout << dbeginl << g_clock.now() << " " << oc->objecter->messenger->get_myname() << ".filecache " -#define derr(x) if (x <= g_conf.debug_client) *_derr << dbeginl << g_clock.now() << " " << oc->objecter->messenger->get_myname() << ".filecache " - - - -// flush/release/clean - -void FileCache::flush_dirty(Context *onflush) -{ - if (oc->flush_set(inode.ino, onflush)) { - onflush->finish(0); - delete onflush; - } -} - -off_t FileCache::release_clean() -{ - return oc->release_set(inode.ino); -} - -bool FileCache::is_cached() -{ - return oc->set_is_cached(inode.ino); -} - -bool FileCache::is_dirty() -{ - return oc->set_is_dirty_or_committing(inode.ino); -} - -void FileCache::empty(Context *onempty) -{ - off_t unclean = release_clean(); - bool clean = oc->flush_set(inode.ino, onempty); - assert(!unclean == clean); - - if (clean) { - onempty->finish(0); - delete onempty; - } -} - - -void FileCache::tear_down() -{ - off_t unclean = release_clean(); - if (unclean) { - dout(0) << "tear_down " << unclean << " unclean bytes, purging" << dendl; - oc->purge_set(inode.ino); - } -} - -// truncate - -void FileCache::truncate(off_t olds, off_t news) -{ - dout(5) << "truncate " << olds << " -> " << news << dendl; - - // map range to objects - list ls; - 
oc->filer.file_to_extents(inode, news, olds-news, ls); - oc->truncate_set(inode.ino, ls); -} - -// caps - -class C_FC_CheckCaps : public Context { - FileCache *fc; -public: - C_FC_CheckCaps(FileCache *f) : fc(f) {} - void finish(int r) { - fc->check_caps(); - } -}; - -void FileCache::set_caps(int caps, Context *onimplement) -{ - if (onimplement) { - dout(10) << "set_caps setting onimplement context for " << cap_string(caps) << dendl; - assert(latest_caps & ~caps); // we should be losing caps. - caps_callbacks[caps].push_back(onimplement); - } - - latest_caps = caps; - check_caps(); - - // kick waiters? (did we gain caps?) - if (can_read() && !waitfor_read.empty()) - for (set::iterator p = waitfor_read.begin(); - p != waitfor_read.end(); - ++p) - (*p)->Signal(); - if (can_write() && !waitfor_write.empty()) - for (set::iterator p = waitfor_write.begin(); - p != waitfor_write.end(); - ++p) - (*p)->Signal(); - -} - -int FileCache::get_used_caps() -{ - int used = 0; - if (num_reading) used |= CAP_FILE_RD; - if (oc->set_is_cached(inode.ino)) used |= CAP_FILE_RDCACHE; - if (num_writing) used |= CAP_FILE_WR; - if (oc->set_is_dirty_or_committing(inode.ino)) used |= CAP_FILE_WRBUFFER; - return used; -} - -void FileCache::check_caps() -{ - // calc used - int used = get_used_caps(); - dout(10) << "check_caps used was " << cap_string(used) << dendl; - - // try to implement caps? - // BUG? latest_caps, not least caps i've seen? - if ((latest_caps & CAP_FILE_RDCACHE) == 0 && - (used & CAP_FILE_RDCACHE)) - release_clean(); - if ((latest_caps & CAP_FILE_WRBUFFER) == 0 && - (used & CAP_FILE_WRBUFFER)) - flush_dirty(new C_FC_CheckCaps(this)); - - used = get_used_caps(); - dout(10) << "check_caps used now " << cap_string(used) << dendl; - - // check callbacks - map >::iterator p = caps_callbacks.begin(); - while (p != caps_callbacks.end()) { - if (used == 0 || (~(p->first) & used) == 0) { - // implemented. 
- dout(10) << "used is " << cap_string(used) - << ", caps " << cap_string(p->first) << " implemented, doing callback(s)" << dendl; - finish_contexts(p->second); - map >::iterator o = p; - p++; - caps_callbacks.erase(o); - } else { - dout(10) << "used is " << cap_string(used) - << ", caps " << cap_string(p->first) << " not yet implemented" << dendl; - p++; - } - } -} - - - -// read/write - -int FileCache::read(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock) -{ - int r = 0; - - // can i read? - while ((latest_caps & CAP_FILE_RD) == 0) { - dout(10) << "read doesn't have RD cap, blocking" << dendl; - Cond c; - waitfor_read.insert(&c); - c.Wait(client_lock); - waitfor_read.erase(&c); - } - - // inc reading counter - num_reading++; - - if (latest_caps & CAP_FILE_RDCACHE) { - // read (and block) - Cond cond; - bool done = false; - int rvalue = 0; - C_Cond *onfinish = new C_Cond(&cond, &done, &rvalue); - - r = oc->file_read(inode, offset, size, &blist, onfinish); - - if (r == 0) { - // block - while (!done) - cond.Wait(client_lock); - r = rvalue; - } else { - // it was cached. - delete onfinish; - } - } else { - r = oc->file_atomic_sync_read(inode, offset, size, &blist, client_lock); - } - - // dec reading counter - num_reading--; - - if (num_reading == 0 && !caps_callbacks.empty()) - check_caps(); - - return r; -} - -void FileCache::write(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock) -{ - // can i write - while ((latest_caps & CAP_FILE_WR) == 0) { - dout(10) << "write doesn't have WR cap, blocking" << dendl; - Cond c; - waitfor_write.insert(&c); - c.Wait(client_lock); - waitfor_write.erase(&c); - } - - // inc writing counter - num_writing++; - - if (size > 0) { - if (latest_caps & CAP_FILE_WRBUFFER) { // caps buffered write? - // wait? (this may block!) - oc->wait_for_write(size, client_lock); - - // async, caching, non-blocking. - oc->file_write(inode, offset, size, blist); - } else { - // atomic, synchronous, blocking. 
- oc->file_atomic_sync_write(inode, offset, size, blist, client_lock); - } - } - - // dec writing counter - num_writing--; - if (num_writing == 0 && !caps_callbacks.empty()) - check_caps(); -} - -bool FileCache::all_safe() -{ - return !oc->set_is_dirty_or_committing(inode.ino); -} - -void FileCache::add_safe_waiter(Context *c) -{ - bool safe = oc->commit_set(inode.ino, c); - if (safe) { - c->finish(0); - delete c; - } -} diff --git a/branches/sage/ebofs2/client/FileCache.h b/branches/sage/ebofs2/client/FileCache.h deleted file mode 100644 index 8d6e08146b508..0000000000000 --- a/branches/sage/ebofs2/client/FileCache.h +++ /dev/null @@ -1,85 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __FILECACHE_H -#define __FILECACHE_H - -#include -using std::iostream; - -#include "common/Cond.h" -#include "mds/Capability.h" - -class ObjectCacher; - -class FileCache { - ObjectCacher *oc; - inode_t inode; - - // caps - int latest_caps; - map > caps_callbacks; - - int num_reading; - int num_writing; - //int num_unsafe; - - // waiters - set waitfor_read; - set waitfor_write; - - bool waitfor_release; - - public: - FileCache(ObjectCacher *_oc, inode_t _inode) : - oc(_oc), - inode(_inode), - latest_caps(0), - num_reading(0), num_writing(0),// num_unsafe(0), - waitfor_release(false) {} - ~FileCache() { - tear_down(); - } - - // waiters/waiting - bool can_read() { return latest_caps & CAP_FILE_RD; } - bool can_write() { return latest_caps & CAP_FILE_WR; } - bool all_safe();// { return num_unsafe == 0; } - - void add_safe_waiter(Context *c); - - void truncate(off_t olds, off_t news); - - // ... 
- void flush_dirty(Context *onflush=0); - off_t release_clean(); - void empty(Context *onempty=0); - bool is_empty() { return !(is_cached() || is_dirty()); } - bool is_cached(); - bool is_dirty(); - - void tear_down(); - - int get_caps() { return latest_caps; } - int get_used_caps(); - void set_caps(int caps, Context *onimplement=0); - void check_caps(); - - int read(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock); // may block. - void write(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock); // may block. - -}; - - -#endif diff --git a/branches/sage/ebofs2/client/SyntheticClient.cc b/branches/sage/ebofs2/client/SyntheticClient.cc deleted file mode 100644 index 1695631b8b8cb..0000000000000 --- a/branches/sage/ebofs2/client/SyntheticClient.cc +++ /dev/null @@ -1,2882 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include -using namespace std; - - - -#include "SyntheticClient.h" -#include "osdc/Objecter.h" - -#include "include/filepath.h" -#include "mds/mdstypes.h" -#include "common/Logger.h" - -#include -#include -#include -#include -#include -#include -#include - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_client) *_dout << dbeginl << g_clock.now() << " synthetic" << client->get_nodeid() << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_client) *_derr << dbeginl << g_clock.now() << " synthetic" << client->get_nodeid() << " " - -// traces -//void trace_include(SyntheticClient *syn, Client *cl, string& prefix); -//void trace_openssh(SyntheticClient *syn, Client *cl, string& prefix); - - -list syn_modes; -list syn_iargs; -list syn_sargs; - -void parse_syn_options(vector& args) -{ - vector nargs; - - for (unsigned i=0; iclient = client; - thread_id = 0; - - did_readdir = false; - - run_only = -1; - exclude = -1; - - this->modes = syn_modes; - this->iargs = syn_iargs; - this->sargs = syn_sargs; - - run_start = g_clock.now(); -} - - - - -#define DBL 2 - -void *synthetic_client_thread_entry(void *ptr) -{ - SyntheticClient *sc = (SyntheticClient*)ptr; - //int r = - sc->run(); - return 0;//(void*)r; -} - -string SyntheticClient::get_sarg(int seq) -{ - string a; - if (!sargs.empty()) { - a = sargs.front(); - sargs.pop_front(); - } - if (a.length() == 0 || a == "~") { - char s[20]; - sprintf(s,"syn.%d.%d", client->whoami, seq); - a = s; - } - return a; -} - -int SyntheticClient::run() -{ - client->init(); - client->mount(); - - //run_start = g_clock.now(); - run_until = utime_t(0,0); - dout(5) << "run" << dendl; - - int seq = 0; - - for (list::iterator it = modes.begin(); - it != modes.end(); - it++) { - int mode = *it; - dout(3) << "mode " << mode << dendl; - - switch (mode) { - - - // WHO? 
- - case SYNCLIENT_MODE_ONLY: - { - run_only = iargs.front(); - iargs.pop_front(); - if (run_only == client->get_nodeid()) - dout(2) << "only " << run_only << dendl; - } - break; - case SYNCLIENT_MODE_ONLYRANGE: - { - int first = iargs.front(); - iargs.pop_front(); - int last = iargs.front(); - iargs.pop_front(); - if (first <= client->get_nodeid() && - last > client->get_nodeid()) { - run_only = client->get_nodeid(); - dout(2) << "onlyrange [" << first << ", " << last << ") includes me" << dendl; - } else - run_only = client->get_nodeid()+1; // not me - } - break; - case SYNCLIENT_MODE_EXCLUDE: - { - exclude = iargs.front(); - iargs.pop_front(); - if (exclude == client->get_nodeid()) { - run_only = client->get_nodeid() + 1; - dout(2) << "not running " << exclude << dendl; - } else - run_only = -1; - } - break; - - // HOW LONG? - - case SYNCLIENT_MODE_UNTIL: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (run_me()) { - if (iarg1) { - dout(2) << "until " << iarg1 << dendl; - utime_t dur(iarg1,0); - run_until = run_start + dur; - } else { - dout(2) << "until " << iarg1 << " (no limit)" << dendl; - run_until = utime_t(0,0); - } - } - } - break; - - - // ... 
- - case SYNCLIENT_MODE_FOO: - if (run_me()) { - foo(); - } - did_run_me(); - break; - - case SYNCLIENT_MODE_RANDOMSLEEP: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (run_me()) { - srand(time(0) + getpid() + client->whoami); - sleep(rand() % iarg1); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_SLEEP: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (run_me()) { - dout(2) << "sleep " << iarg1 << dendl; - sleep(iarg1); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_SLEEPUNTIL: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (iarg1 && run_me()) { - dout(2) << "sleepuntil " << iarg1 << dendl; - utime_t at = g_clock.now() - run_start; - if (at.sec() < iarg1) - sleep(iarg1 - at.sec()); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_RANDOMWALK: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (run_me()) { - dout(2) << "randomwalk " << iarg1 << dendl; - random_walk(iarg1); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_MAKEDIRMESS: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "makedirmess " << sarg1 << " " << iarg1 << dendl; - make_dir_mess(sarg1.c_str(), iarg1); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_MAKEDIRS: - { - string sarg1 = get_sarg(seq++); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - int iarg3 = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "makedirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << dendl; - make_dirs(sarg1.c_str(), iarg1, iarg2, iarg3); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_STATDIRS: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - int iarg3 = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "statdirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << dendl; - 
stat_dirs(sarg1.c_str(), iarg1, iarg2, iarg3); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_READDIRS: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - int iarg3 = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "readdirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << dendl; - read_dirs(sarg1.c_str(), iarg1, iarg2, iarg3); - } - did_run_me(); - } - break; - - - case SYNCLIENT_MODE_THRASHLINKS: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - int iarg3 = iargs.front(); iargs.pop_front(); - int iarg4 = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "thrashlinks " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << dendl; - thrash_links(sarg1.c_str(), iarg1, iarg2, iarg3, iarg4); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_LINKTEST: - { - if (run_me()) { - link_test(); - } - did_run_me(); - } - break; - - - case SYNCLIENT_MODE_MAKEFILES: - { - int num = iargs.front(); iargs.pop_front(); - int count = iargs.front(); iargs.pop_front(); - int priv = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "makefiles " << num << " " << count << " " << priv << dendl; - make_files(num, count, priv, false); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_MAKEFILES2: - { - int num = iargs.front(); iargs.pop_front(); - int count = iargs.front(); iargs.pop_front(); - int priv = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "makefiles2 " << num << " " << count << " " << priv << dendl; - make_files(num, count, priv, true); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_CREATESHARED: - { - string sarg1 = get_sarg(0); - int num = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "createshared " << num << dendl; - create_shared(num); - } - did_run_me(); - } - break; - case 
SYNCLIENT_MODE_OPENSHARED: - { - string sarg1 = get_sarg(0); - int num = iargs.front(); iargs.pop_front(); - int count = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "openshared " << num << dendl; - open_shared(num, count); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_CREATEOBJECTS: - { - int count = iargs.front(); iargs.pop_front(); - int size = iargs.front(); iargs.pop_front(); - int inflight = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "createobjects " << cout << " of " << size << " bytes" - << ", " << inflight << " in flight" << dendl; - create_objects(count, size, inflight); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_OBJECTRW: - { - int count = iargs.front(); iargs.pop_front(); - int size = iargs.front(); iargs.pop_front(); - int wrpc = iargs.front(); iargs.pop_front(); - int overlap = iargs.front(); iargs.pop_front(); - int rskew = iargs.front(); iargs.pop_front(); - int wskew = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "objectrw " << cout << " " << size << " " << wrpc - << " " << overlap << " " << rskew << " " << wskew << dendl; - object_rw(count, size, wrpc, overlap, rskew, wskew); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_FULLWALK: - { - string sarg1;// = get_sarg(0); - if (run_me()) { - dout(2) << "fullwalk" << sarg1 << dendl; - full_walk(sarg1); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_REPEATWALK: - { - string sarg1 = get_sarg(0); - if (run_me()) { - dout(2) << "repeatwalk " << sarg1 << dendl; - while (full_walk(sarg1) == 0) ; - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_WRITEFILE: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - dout(1) << "WRITING SYN CLIENT" << dendl; - if (run_me()) { - write_file(sarg1, iarg1, iarg2); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_WRSHARED: - { - string sarg1 = "shared"; - int iarg1 = iargs.front(); 
iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - if (run_me()) { - write_file(sarg1, iarg1, iarg2); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_READSHARED: - { - string sarg1 = "shared"; - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - if (run_me()) { - read_file(sarg1, iarg1, iarg2, true); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_WRITEBATCH: - { - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - int iarg3 = iargs.front(); iargs.pop_front(); - - if (run_me()) { - write_batch(iarg1, iarg2, iarg3); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_READFILE: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - - dout(1) << "READING SYN CLIENT" << dendl; - if (run_me()) { - read_file(sarg1, iarg1, iarg2); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_RDWRRANDOM: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - - dout(1) << "RANDOM READ WRITE SYN CLIENT" << dendl; - if (run_me()) { - read_random(sarg1, iarg1, iarg2); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_RDWRRANDOM_EX: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - - dout(1) << "RANDOM READ WRITE SYN CLIENT" << dendl; - if (run_me()) { - read_random_ex(sarg1, iarg1, iarg2); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_TRACE: - { - string tfile = get_sarg(0); - sargs.push_front(string("~")); - int iarg1 = iargs.front(); iargs.pop_front(); - int playdata = iargs.front(); iargs.pop_front(); - string prefix = get_sarg(0); - char realtfile[100]; - sprintf(realtfile, tfile.c_str(), client->get_nodeid()); - - if (run_me()) { - dout(-2) << "trace " << tfile << " prefix=" << prefix << " count=" << 
iarg1 << " data=" << playdata << dendl; - - Trace t(realtfile); - - if (iarg1 == 0) iarg1 = 1; // play trace at least once! - - for (int i=0; i 1) clean_dir(prefix); // clean only if repeat - - utime_t lat = g_clock.now(); - lat -= start; - - dout(0) << " trace " << tfile << " loop " << (i+1) << "/" << iarg1 << " done in " << (double)lat << " seconds" << dendl; - if (client_logger - && i > 0 - && i < iarg1-1 - ) { - client_logger->finc("trsum", (double)lat); - client_logger->inc("trnum"); - } - } - dout(1) << "done " << dendl; - } - did_run_me(); - } - break; - - - case SYNCLIENT_MODE_OPENTEST: - { - int count = iargs.front(); iargs.pop_front(); - if (run_me()) { - for (int i=0; iopen("test", rand()%2 ? (O_WRONLY|O_CREAT):O_RDONLY); - if (fd > 0) client->close(fd); - } - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_OPTEST: - { - int count = iargs.front(); iargs.pop_front(); - if (run_me()) { - client->mknod("test", 0777); - struct stat st; - for (int i=0; ilstat("test", &st); - client->chmod("test", 0777); - } - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_TRUNCATE: - { - string file = get_sarg(0); - sargs.push_front(file); - int iarg1 = iargs.front(); iargs.pop_front(); - if (run_me()) { - client->truncate(file.c_str(), iarg1); - } - did_run_me(); - } - break; - - - case SYNCLIENT_MODE_IMPORTFIND: - { - string base = get_sarg(0); - string find = get_sarg(0); - int data = get_iarg(); - if (run_me()) { - import_find(base.c_str(), find.c_str(), data); - } - did_run_me(); - } - break; - - default: - assert(0); - } - } - dout(1) << "syn done, unmounting " << dendl; - - client->unmount(); - client->shutdown(); - return 0; -} - - -int SyntheticClient::start_thread() -{ - assert(!thread_id); - - pthread_create(&thread_id, NULL, synthetic_client_thread_entry, this); - assert(thread_id); - return 0; -} - -int SyntheticClient::join_thread() -{ - assert(thread_id); - void *rv; - pthread_join(thread_id, &rv); - return 0; -} - - -bool roll_die(float p) -{ - 
float r = (float)(rand() % 100000) / 100000.0; - if (r < p) - return true; - else - return false; -} - -void SyntheticClient::init_op_dist() -{ - op_dist.clear(); - op_dist.add( MDS_OP_STAT, g_conf.fakeclient_op_stat ); - op_dist.add( MDS_OP_UTIME, g_conf.fakeclient_op_utime ); - op_dist.add( MDS_OP_CHMOD, g_conf.fakeclient_op_chmod ); - op_dist.add( MDS_OP_CHOWN, g_conf.fakeclient_op_chown ); - - op_dist.add( MDS_OP_READDIR, g_conf.fakeclient_op_readdir ); - op_dist.add( MDS_OP_MKNOD, g_conf.fakeclient_op_mknod ); - op_dist.add( MDS_OP_LINK, g_conf.fakeclient_op_link ); - op_dist.add( MDS_OP_UNLINK, g_conf.fakeclient_op_unlink ); - op_dist.add( MDS_OP_RENAME, g_conf.fakeclient_op_rename ); - - op_dist.add( MDS_OP_MKDIR, g_conf.fakeclient_op_mkdir ); - op_dist.add( MDS_OP_RMDIR, g_conf.fakeclient_op_rmdir ); - op_dist.add( MDS_OP_SYMLINK, g_conf.fakeclient_op_symlink ); - - op_dist.add( MDS_OP_OPEN, g_conf.fakeclient_op_openrd ); - //op_dist.add( MDS_OP_READ, g_conf.fakeclient_op_read ); - //op_dist.add( MDS_OP_WRITE, g_conf.fakeclient_op_write ); - op_dist.add( MDS_OP_TRUNCATE, g_conf.fakeclient_op_truncate ); - op_dist.add( MDS_OP_FSYNC, g_conf.fakeclient_op_fsync ); - op_dist.add( MDS_OP_RELEASE, g_conf.fakeclient_op_close ); // actually, close() - op_dist.normalize(); -} - -void SyntheticClient::up() -{ - cwd = cwd.prefixpath(cwd.depth()-1); - dout(DBL) << "cd .. -> " << cwd << dendl; - clear_dir(); -} - - -int SyntheticClient::play_trace(Trace& t, string& prefix, bool metadata_only) -{ - dout(4) << "play trace prefix '" << prefix << "'" << dendl; - t.start(); - - char buf[1024]; - char buf2[1024]; - - utime_t start = g_clock.now(); - - hash_map open_files; - hash_map open_dirs; - - hash_map ll_files; - hash_map ll_dirs; - hash_map ll_inos; - - ll_inos[1] = 1; // root inode is known. - - // prefix? 
- const char *p = prefix.c_str(); - if (prefix.length()) { - client->mkdir(prefix.c_str(), 0755); - struct stat attr; - if (client->ll_lookup(1, prefix.c_str(), &attr) == 0) { - ll_inos[1] = attr.st_ino; - dout(5) << "'root' ino is " << inodeno_t(attr.st_ino) << dendl; - } else { - dout(0) << "warning: play_trace coudln't lookup up my per-client directory" << dendl; - } - } - - - utime_t last_status = start; - - int n = 0; - - // for object traces - Mutex &lock = client->client_lock; - Cond cond; - bool ack; - bool safe; - C_Gather *safeg = new C_Gather(new C_SafeCond(&lock, &cond, &safe)); - Context *safegref = safeg->new_sub(); // take a ref - - while (!t.end()) { - - if (++n == 100) { - n = 00; - utime_t now = last_status; - if (now - last_status > 1.0) { - last_status = now; - dout(1) << "play_trace at line " << t.get_line() << dendl; - } - } - - if (time_to_stop()) break; - - // op - const char *op = t.get_string(buf, 0); - dout(4) << (t.get_line()-1) << ": trace op " << op << dendl; - - if (op[0] == '@') { - // timestamp... ignore it! 
- t.get_int(); // sec - t.get_int(); // usec - op = t.get_string(buf, 0); - } - - // high level ops --------------------- - if (strcmp(op, "link") == 0) { - const char *a = t.get_string(buf, p); - const char *b = t.get_string(buf2, p); - client->link(a,b); - } else if (strcmp(op, "unlink") == 0) { - const char *a = t.get_string(buf, p); - client->unlink(a); - } else if (strcmp(op, "rename") == 0) { - const char *a = t.get_string(buf, p); - const char *b = t.get_string(buf2, p); - client->rename(a,b); - } else if (strcmp(op, "mkdir") == 0) { - const char *a = t.get_string(buf, p); - int64_t b = t.get_int(); - client->mkdir(a, b); - } else if (strcmp(op, "rmdir") == 0) { - const char *a = t.get_string(buf, p); - client->rmdir(a); - } else if (strcmp(op, "symlink") == 0) { - const char *a = t.get_string(buf, p); - const char *b = t.get_string(buf2, p); - client->symlink(a,b); - } else if (strcmp(op, "readlink") == 0) { - const char *a = t.get_string(buf, p); - char buf[100]; - client->readlink(a, buf, 100); - } else if (strcmp(op, "lstat") == 0) { - struct stat st; - const char *a = t.get_string(buf, p); - if (strcmp(a, p) != 0 && - strcmp(a, "/") != 0 && - strcmp(a, "/lib") != 0 && // or /lib.. that would be a lookup. hack. 
- a[0] != 0) // stop stating the root directory already - client->lstat(a, &st); - } else if (strcmp(op, "chmod") == 0) { - const char *a = t.get_string(buf, p); - int64_t b = t.get_int(); - client->chmod(a, b); - } else if (strcmp(op, "chown") == 0) { - const char *a = t.get_string(buf, p); - int64_t b = t.get_int(); - int64_t c = t.get_int(); - client->chown(a, b, c); - } else if (strcmp(op, "utime") == 0) { - const char *a = t.get_string(buf, p); - int64_t b = t.get_int(); - int64_t c = t.get_int(); - struct utimbuf u; - u.actime = b; - u.modtime = c; - client->utime(a, &u); - } else if (strcmp(op, "mknod") == 0) { - const char *a = t.get_string(buf, p); - int64_t b = t.get_int(); - int64_t c = t.get_int(); - client->mknod(a, b, c); - } else if (strcmp(op, "oldmknod") == 0) { - const char *a = t.get_string(buf, p); - int64_t b = t.get_int(); - client->mknod(a, b, 0); - } else if (strcmp(op, "getdir") == 0) { - const char *a = t.get_string(buf, p); - list contents; - client->getdir(a, contents); - } else if (strcmp(op, "getdir") == 0) { - const char *a = t.get_string(buf, p); - list contents; - client->getdir(a, contents); - } else if (strcmp(op, "opendir") == 0) { - const char *a = t.get_string(buf, p); - int64_t b = t.get_int(); - DIR *dirp; - client->opendir(a, &dirp); - if (dirp) open_dirs[b] = dirp; - } else if (strcmp(op, "closedir") == 0) { - int64_t a = t.get_int(); - client->closedir(open_dirs[a]); - open_dirs.erase(a); - } else if (strcmp(op, "open") == 0) { - const char *a = t.get_string(buf, p); - int64_t b = t.get_int(); - int64_t c = t.get_int(); - int64_t d = t.get_int(); - int64_t fd = client->open(a, b, c); - if (fd > 0) open_files[d] = fd; - } else if (strcmp(op, "oldopen") == 0) { - const char *a = t.get_string(buf, p); - int64_t b = t.get_int(); - int64_t d = t.get_int(); - int64_t fd = client->open(a, b, 0755); - if (fd > 0) open_files[d] = fd; - } else if (strcmp(op, "close") == 0) { - int64_t id = t.get_int(); - int64_t fh = open_files[id]; 
- if (fh > 0) client->close(fh); - open_files.erase(id); - } else if (strcmp(op, "lseek") == 0) { - int64_t f = t.get_int(); - int fd = open_files[f]; - int64_t off = t.get_int(); - int64_t whence = t.get_int(); - client->lseek(fd, off, whence); - } else if (strcmp(op, "read") == 0) { - int64_t f = t.get_int(); - int64_t size = t.get_int(); - int64_t off = t.get_int(); - int64_t fd = open_files[f]; - if (!metadata_only) { - char *b = new char[size]; - client->read(fd, b, size, off); - delete[] b; - } - } else if (strcmp(op, "write") == 0) { - int64_t f = t.get_int(); - int64_t fd = open_files[f]; - int64_t size = t.get_int(); - int64_t off = t.get_int(); - if (!metadata_only) { - char *b = new char[size]; - memset(b, 1, size); // let's write 1's! - client->write(fd, b, size, off); - delete[] b; - } else { - client->write(fd, NULL, 0, size+off); - } - } else if (strcmp(op, "truncate") == 0) { - const char *a = t.get_string(buf, p); - int64_t l = t.get_int(); - client->truncate(a, l); - } else if (strcmp(op, "ftruncate") == 0) { - int64_t f = t.get_int(); - int fd = open_files[f]; - int64_t l = t.get_int(); - client->ftruncate(fd, l); - } else if (strcmp(op, "fsync") == 0) { - int64_t f = t.get_int(); - int64_t b = t.get_int(); - int fd = open_files[f]; - client->fsync(fd, b); - } else if (strcmp(op, "chdir") == 0) { - const char *a = t.get_string(buf, p); - client->chdir(a); - } else if (strcmp(op, "statfs") == 0) { - struct statvfs stbuf; - client->statfs("/", &stbuf); - } - - // low level ops --------------------- - else if (strcmp(op, "ll_lookup") == 0) { - int64_t i = t.get_int(); - const char *name = t.get_string(buf, p); - int64_t r = t.get_int(); - struct stat attr; - if (ll_inos.count(i) && - client->ll_lookup(ll_inos[i], name, &attr) == 0) - ll_inos[r] = attr.st_ino; - } else if (strcmp(op, "ll_forget") == 0) { - int64_t i = t.get_int(); - int64_t n = t.get_int(); - if (ll_inos.count(i) && - client->ll_forget(ll_inos[i], n)) - ll_inos.erase(i); - } else if 
(strcmp(op, "ll_getattr") == 0) { - int64_t i = t.get_int(); - struct stat attr; - if (ll_inos.count(i)) - client->ll_getattr(ll_inos[i], &attr); - } else if (strcmp(op, "ll_setattr") == 0) { - int64_t i = t.get_int(); - struct stat attr; - memset(&attr, 0, sizeof(attr)); - attr.st_mode = t.get_int(); - attr.st_uid = t.get_int(); - attr.st_gid = t.get_int(); - attr.st_size = t.get_int(); - attr.st_mtime = t.get_int(); - attr.st_atime = t.get_int(); - int mask = t.get_int(); - if (ll_inos.count(i)) - client->ll_setattr(ll_inos[i], &attr, mask); - } else if (strcmp(op, "ll_readlink") == 0) { - int64_t i = t.get_int(); - const char *value; - if (ll_inos.count(i)) - client->ll_readlink(ll_inos[i], &value); - } else if (strcmp(op, "ll_mknod") == 0) { - int64_t i = t.get_int(); - const char *n = t.get_string(buf, p); - int m = t.get_int(); - int r = t.get_int(); - int64_t ri = t.get_int(); - struct stat attr; - if (ll_inos.count(i) && - client->ll_mknod(ll_inos[i], n, m, r, &attr) == 0) - ll_inos[ri] = attr.st_ino; - } else if (strcmp(op, "ll_mkdir") == 0) { - int64_t i = t.get_int(); - const char *n = t.get_string(buf, p); - int m = t.get_int(); - int64_t ri = t.get_int(); - struct stat attr; - if (ll_inos.count(i) && - client->ll_mkdir(ll_inos[i], n, m, &attr) == 0) - ll_inos[ri] = attr.st_ino; - } else if (strcmp(op, "ll_symlink") == 0) { - int64_t i = t.get_int(); - const char *n = t.get_string(buf, p); - const char *v = t.get_string(buf2, p); - int64_t ri = t.get_int(); - struct stat attr; - if (ll_inos.count(i) && - client->ll_symlink(ll_inos[i], n, v, &attr) == 0) - ll_inos[ri] = attr.st_ino; - } else if (strcmp(op, "ll_unlink") == 0) { - int64_t i = t.get_int(); - const char *n = t.get_string(buf, p); - if (ll_inos.count(i)) - client->ll_unlink(ll_inos[i], n); - } else if (strcmp(op, "ll_rmdir") == 0) { - int64_t i = t.get_int(); - const char *n = t.get_string(buf, p); - if (ll_inos.count(i)) - client->ll_rmdir(ll_inos[i], n); - } else if (strcmp(op, "ll_rename") 
== 0) { - int64_t i = t.get_int(); - const char *n = t.get_string(buf, p); - int64_t ni = t.get_int(); - const char *nn = t.get_string(buf2, p); - if (ll_inos.count(i) && - ll_inos.count(ni)) - client->ll_rename(ll_inos[i], n, ll_inos[ni], nn); - } else if (strcmp(op, "ll_link") == 0) { - int64_t i = t.get_int(); - int64_t ni = t.get_int(); - const char *nn = t.get_string(buf, p); - struct stat attr; - if (ll_inos.count(i) && - ll_inos.count(ni)) - client->ll_link(ll_inos[i], ll_inos[ni], nn, &attr); - } else if (strcmp(op, "ll_opendir") == 0) { - int64_t i = t.get_int(); - int64_t r = t.get_int(); - void *dirp; - if (ll_inos.count(i) && - client->ll_opendir(ll_inos[i], &dirp) == 0) - ll_dirs[r] = dirp; - } else if (strcmp(op, "ll_releasedir") == 0) { - int64_t f = t.get_int(); - if (ll_dirs.count(f)) { - client->ll_releasedir(ll_dirs[f]); - ll_dirs.erase(f); - } - } else if (strcmp(op, "ll_open") == 0) { - int64_t i = t.get_int(); - int64_t f = t.get_int(); - int64_t r = t.get_int(); - Fh *fhp; - if (ll_inos.count(i) && - client->ll_open(ll_inos[i], f, &fhp) == 0) - ll_files[r] = fhp; - } else if (strcmp(op, "ll_create") == 0) { - int64_t i = t.get_int(); - const char *n = t.get_string(buf, p); - int64_t m = t.get_int(); - int64_t f = t.get_int(); - int64_t r = t.get_int(); - int64_t ri = t.get_int(); - Fh *fhp; - struct stat attr; - if (ll_inos.count(i) && - client->ll_create(ll_inos[i], n, m, f, &attr, &fhp) == 0) { - ll_inos[ri] = attr.st_ino; - ll_files[r] = fhp; - } - } else if (strcmp(op, "ll_read") == 0) { - int64_t f = t.get_int(); - int64_t off = t.get_int(); - int64_t size = t.get_int(); - if (ll_files.count(f) && - !metadata_only) { - bufferlist bl; - client->ll_read(ll_files[f], off, size, &bl); - } - } else if (strcmp(op, "ll_write") == 0) { - int64_t f = t.get_int(); - int64_t off = t.get_int(); - int64_t size = t.get_int(); - if (ll_files.count(f)) { - if (!metadata_only) { - bufferlist bl; - bufferptr bp(size); - bl.push_back(bp); - bp.zero(); - 
client->ll_write(ll_files[f], off, size, bl.c_str()); - } else { - client->ll_write(ll_files[f], off+size, 0, NULL); - } - } - } else if (strcmp(op, "ll_flush") == 0) { - int64_t f = t.get_int(); - if (!metadata_only && - ll_files.count(f)) - client->ll_flush(ll_files[f]); - } else if (strcmp(op, "ll_fsync") == 0) { - int64_t f = t.get_int(); - if (!metadata_only && - ll_files.count(f)) - client->ll_fsync(ll_files[f], false); // FIXME dataonly param - } else if (strcmp(op, "ll_release") == 0) { - int64_t f = t.get_int(); - if (ll_files.count(f)) { - client->ll_release(ll_files[f]); - ll_files.erase(f); - } - } else if (strcmp(op, "ll_statfs") == 0) { - int64_t i = t.get_int(); - if (ll_inos.count(i)) - {} //client->ll_statfs(ll_inos[i]); - } - - - // object-level traces - - else if (strcmp(op, "o_stat") == 0) { - int64_t oh = t.get_int(); - int64_t ol = t.get_int(); - object_t oid(oh, ol); - lock.Lock(); - ObjectLayout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, 2); - off_t size; - client->objecter->stat(oid, &size, layout, new C_SafeCond(&lock, &cond, &ack)); - while (!ack) cond.Wait(lock); - lock.Unlock(); - } - else if (strcmp(op, "o_read") == 0) { - int64_t oh = t.get_int(); - int64_t ol = t.get_int(); - int64_t off = t.get_int(); - int64_t len = t.get_int(); - object_t oid(oh, ol); - lock.Lock(); - ObjectLayout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, 2); - bufferlist bl; - client->objecter->read(oid, off, len, layout, &bl, new C_SafeCond(&lock, &cond, &ack)); - while (!ack) cond.Wait(lock); - lock.Unlock(); - } - else if (strcmp(op, "o_write") == 0) { - int64_t oh = t.get_int(); - int64_t ol = t.get_int(); - int64_t off = t.get_int(); - int64_t len = t.get_int(); - object_t oid(oh, ol); - lock.Lock(); - ObjectLayout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, 2); - bufferptr bp(len); - bufferlist bl; - bl.push_back(bp); - client->objecter->write(oid, off, len, layout, bl, - new 
C_SafeCond(&lock, &cond, &ack), - safeg->new_sub()); - while (!ack) cond.Wait(lock); - lock.Unlock(); - } - else if (strcmp(op, "o_zero") == 0) { - int64_t oh = t.get_int(); - int64_t ol = t.get_int(); - int64_t off = t.get_int(); - int64_t len = t.get_int(); - object_t oid(oh, ol); - lock.Lock(); - ObjectLayout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, 2); - client->objecter->zero(oid, off, len, layout, - new C_SafeCond(&lock, &cond, &ack), - safeg->new_sub()); - while (!ack) cond.Wait(lock); - lock.Unlock(); - } - - - else { - dout(0) << (t.get_line()-1) << ": *** trace hit unrecognized symbol '" << op << "' " << dendl; - assert(0); - } - } - - dout(10) << "trace finished on line " << t.get_line() << dendl; - - // wait for safe after an object trace - safegref->finish(0); - delete safegref; - lock.Lock(); - while (!safe) { - dout(10) << "waiting for safe" << dendl; - cond.Wait(lock); - } - lock.Unlock(); - - // close open files - for (hash_map::iterator fi = open_files.begin(); - fi != open_files.end(); - fi++) { - dout(1) << "leftover close " << fi->second << dendl; - if (fi->second > 0) client->close(fi->second); - } - for (hash_map::iterator fi = open_dirs.begin(); - fi != open_dirs.end(); - fi++) { - dout(1) << "leftover closedir " << fi->second << dendl; - if (fi->second != 0) client->closedir(fi->second); - } - for (hash_map::iterator fi = ll_files.begin(); - fi != ll_files.end(); - fi++) { - dout(1) << "leftover ll_release " << fi->second << dendl; - if (fi->second > 0) client->ll_release(fi->second); - } - for (hash_map::iterator fi = ll_dirs.begin(); - fi != ll_dirs.end(); - fi++) { - dout(1) << "leftover ll_releasedir " << fi->second << dendl; - if (fi->second > 0) client->ll_releasedir(fi->second); - } - - return 0; -} - - - -int SyntheticClient::clean_dir(string& basedir) -{ - // read dir - list contents; - int r = client->getdir(basedir.c_str(), contents); - if (r < 0) { - dout(1) << "readdir on " << basedir << " returns " << r 
<< dendl; - return r; - } - - for (list::iterator it = contents.begin(); - it != contents.end(); - it++) { - if (*it == ".") continue; - if (*it == "..") continue; - string file = basedir + "/" + *it; - - if (time_to_stop()) break; - - struct stat st; - int r = client->lstat(file.c_str(), &st); - if (r < 0) { - dout(1) << "stat error on " << file << " r=" << r << dendl; - continue; - } - - if ((st.st_mode & INODE_TYPE_MASK) == INODE_MODE_DIR) { - clean_dir(file); - client->rmdir(file.c_str()); - } else { - client->unlink(file.c_str()); - } - } - - return 0; - -} - - -int SyntheticClient::full_walk(string& basedir) -{ - if (time_to_stop()) return -1; - - list dirq; - dirq.push_back(basedir); - - while (!dirq.empty()) { - string dir = dirq.front(); - dirq.pop_front(); - - // read dir - list contents; - int r = client->getdir(dir.c_str(), contents); - if (r < 0) { - dout(1) << "readdir on " << dir << " returns " << r << dendl; - continue; - } - - for (list::iterator it = contents.begin(); - it != contents.end(); - it++) { - if (*it == "." || - *it == "..") - continue; - string file = dir + "/" + *it; - - struct stat st; - int r = client->lstat(file.c_str(), &st); - if (r < 0) { - dout(1) << "stat error on " << file << " r=" << r << dendl; - continue; - } - - // print - char *tm = ctime(&st.st_mtime); - tm[strlen(tm)-1] = 0; - printf("%llx %c%c%c%c%c%c%c%c%c%c %2d %5d %5d %8d %12s %s\n", - (long long)st.st_ino, - S_ISDIR(st.st_mode) ? 'd':'-', - (st.st_mode & 0400) ? 'r':'-', - (st.st_mode & 0200) ? 'w':'-', - (st.st_mode & 0100) ? 'x':'-', - (st.st_mode & 040) ? 'r':'-', - (st.st_mode & 020) ? 'w':'-', - (st.st_mode & 010) ? 'x':'-', - (st.st_mode & 04) ? 'r':'-', - (st.st_mode & 02) ? 'w':'-', - (st.st_mode & 01) ? 
'x':'-', - (int)st.st_nlink, - st.st_uid, st.st_gid, - (int)st.st_size, - tm, - file.c_str()); - - - if ((st.st_mode & INODE_TYPE_MASK) == INODE_MODE_DIR) { - dirq.push_back(file); - } - } - } - - return 0; -} - -int SyntheticClient::make_dirs(const char *basedir, int dirs, int files, int depth) -{ - if (time_to_stop()) return 0; - - // make sure base dir exists - int r = client->mkdir(basedir, 0755); - if (r != 0) { - dout(1) << "can't make base dir? " << basedir << dendl; - return -1; - } - - // children - char d[500]; - dout(3) << "make_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << dendl; - for (int i=0; imknod(d, 0644); - } - - if (depth == 0) return 0; - - for (int i=0; ilstat(basedir, &st); - if (r != 0) { - dout(1) << "can't make base dir? " << basedir << dendl; - return -1; - } - - // children - char d[500]; - dout(3) << "stat_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << dendl; - for (int i=0; ilstat(d, &st); - } - - if (depth == 0) return 0; - - for (int i=0; i contents; - utime_t s = g_clock.now(); - int r = client->getdir(basedir, contents); - utime_t e = g_clock.now(); - e -= s; - if (client_logger) client_logger->finc("readdir", e); - if (r < 0) { - dout(0) << "read_dirs couldn't readdir " << basedir << ", stopping" << dendl; - return -1; - } - - for (int i=0; ilstat(d, &st) < 0) { - dout(2) << "read_dirs failed stat on " << d << ", stopping" << dendl; - return -1; - } - utime_t e = g_clock.now(); - e -= s; - if (client_logger) client_logger->finc("stat", e); - } - - if (depth > 0) - for (int i=0; iget_nodeid(); - char d[255]; - - if (priv) { - for (int c=0; cmkdir(d, 0755); - } - } else { - // shared - if (true || whoami == 0) { - for (int c=0; cmkdir(d, 0755); - } - } else { - sleep(2); - } - } - - // files - struct stat st; - utime_t start = g_clock.now(); - for (int c=0; cmknod(d, 0644); - - if (more) { - client->lstat(d, &st); - int fd = client->open(d, O_RDONLY); - 
client->unlink(d); - client->close(fd); - } - - if (time_to_stop()) return 0; - } - } - utime_t end = g_clock.now(); - end -= start; - dout(0) << "makefiles time is " << end << " or " << ((double)end / (double)num) <<" per file" << dendl; - - return 0; -} - -int SyntheticClient::link_test() -{ - char d[255]; - char e[255]; - - // create files - int num = 200; - - client->mkdir("orig", 0755); - client->mkdir("copy", 0755); - - utime_t start = g_clock.now(); - for (int i=0; imknod(d, 0755); - } - utime_t end = g_clock.now(); - end -= start; - - dout(0) << "orig " << end << dendl; - - // link - start = g_clock.now(); - for (int i=0; ilink(d, e); - } - end = g_clock.now(); - end -= start; - dout(0) << "copy " << end << dendl; - - return 0; -} - - -int SyntheticClient::create_shared(int num) -{ - // files - char d[255]; - for (int n=0; nmknod(d, 0644); - } - - return 0; -} - -int SyntheticClient::open_shared(int num, int count) -{ - // files - char d[255]; - for (int c=0; c fds; - for (int n=0; nopen(d,O_RDONLY); - fds.push_back(fd); - } - - while (!fds.empty()) { - int fd = fds.front(); - fds.pop_front(); - client->close(fd); - } - } - - return 0; -} - - -int SyntheticClient::write_file(string& fn, int size, int wrsize) // size is in MB, wrsize in bytes -{ - //uint64_t wrsize = 1024*256; - char *buf = new char[wrsize+100]; // 1 MB - memset(buf, 7, wrsize); - uint64_t chunks = (uint64_t)size * (uint64_t)(1024*1024) / (uint64_t)wrsize; - - int fd = client->open(fn.c_str(), O_RDWR|O_CREAT); - dout(5) << "writing to " << fn << " fd " << fd << dendl; - if (fd < 0) return fd; - - for (unsigned i=0; iget_nodeid(); - p++; - } - - client->write(fd, buf, wrsize, i*wrsize); - } - - client->close(fd); - delete[] buf; - - return 0; -} - -int SyntheticClient::write_batch(int nfile, int size, int wrsize) -{ - for (int i=0; iopen(fn.c_str(), O_RDONLY); - dout(5) << "reading from " << fn << " fd " << fd << dendl; - if (fd < 0) return fd; - - for (unsigned i=0; iread(fd, buf, rdsize, 
i*rdsize); - if (r < rdsize) { - dout(1) << "read_file got r = " << r << ", probably end of file" << dendl; - break; - } - - // verify fingerprint - int bad = 0; - uint64_t *p = (uint64_t*)buf; - uint64_t readoff; - int64_t readclient; - while ((char*)p + 32 < buf + rdsize) { - readoff = *p; - uint64_t wantoff = (uint64_t)i*(uint64_t)rdsize + (uint64_t)((char*)p - buf); - p++; - readclient = *p; - p++; - if (readoff != wantoff || - readclient != client->get_nodeid()) { - if (!bad && !ignoreprint) - dout(0) << "WARNING: wrong data from OSD, block says fileoffset=" << readoff << " client=" << readclient - << ", should be offset " << wantoff << " clietn " << client->get_nodeid() - << dendl; - bad++; - } - } - if (bad && !ignoreprint) - dout(0) << " + " << (bad-1) << " other bad 16-byte bits in this block" << dendl; - } - - client->close(fd); - delete[] buf; - - return 0; -} - - - - -class C_Ref : public Context { - Mutex& lock; - Cond& cond; - int *ref; -public: - C_Ref(Mutex &l, Cond &c, int *r) : lock(l), cond(c), ref(r) { - lock.Lock(); - (*ref)++; - lock.Unlock(); - } - void finish(int) { - lock.Lock(); - (*ref)--; - cond.Signal(); - lock.Unlock(); - } -}; - -int SyntheticClient::create_objects(int nobj, int osize, int inflight) -{ - // divy up - int numc = g_conf.num_client ? g_conf.num_client : 1; - - int start, inc, end; - - if (1) { - // strided - start = client->get_nodeid(); //nobjs % numc; - inc = numc; - end = start + nobj; - } else { - // segments - start = nobj * client->get_nodeid() / numc; - inc = 1; - end = nobj * (client->get_nodeid()+1) / numc; - } - - dout(5) << "create_objects " << nobj << " size=" << osize - << " .. 
doing [" << start << "," << end << ") inc " << inc - << dendl; - - bufferptr bp(osize); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - - Mutex lock; - Cond cond; - - int unack = 0; - int unsafe = 0; - - list starts; - - for (int i=start; iosdmap->make_object_layout(oid, pg_t::TYPE_REP, g_OSD_FileLayout.fl_pg_size); - - if (i % inflight == 0) { - dout(6) << "create_objects " << i << "/" << (nobj+1) << dendl; - } - dout(10) << "writing " << oid << dendl; - - starts.push_back(g_clock.now()); - client->client_lock.Lock(); - client->objecter->write(oid, 0, osize, layout, bl, - new C_Ref(lock, cond, &unack), - new C_Ref(lock, cond, &unsafe)); - client->client_lock.Unlock(); - - lock.Lock(); - while (unack > inflight) { - dout(20) << "waiting for " << unack << " unack" << dendl; - cond.Wait(lock); - } - lock.Unlock(); - - utime_t lat = g_clock.now(); - lat -= starts.front(); - starts.pop_front(); - if (client_logger) - client_logger->favg("owrlat", lat); - } - - lock.Lock(); - while (unack > 0) { - dout(20) << "waiting for " << unack << " unack" << dendl; - cond.Wait(lock); - } - while (unsafe > 0) { - dout(10) << "waiting for " << unsafe << " unsafe" << dendl; - cond.Wait(lock); - } - lock.Unlock(); - - dout(5) << "create_objects done" << dendl; - derr(0) << "create_objects done" << dendl; - return 0; -} - -int SyntheticClient::object_rw(int nobj, int osize, int wrpc, - int overlappc, - double rskew, double wskew) -{ - dout(5) << "object_rw " << nobj << " size=" << osize << " with " - << wrpc << "% writes" - << ", " << overlappc << "% overlap" - << ", rskew = " << rskew - << ", wskew = " << wskew - << dendl; - - bufferptr bp(osize); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - - // start with odd number > nobj - rjhash h; - unsigned prime = nobj + 1; // this is the minimum! 
- prime += h(nobj) % (3*nobj); // bump it up some - prime |= 1; // make it odd - - while (true) { - unsigned j; - for (j=2; j*j<=prime; j++) - if (prime % j == 0) break; - if (j*j > prime) { - break; - //cout << "prime " << prime << endl; - } - prime += 2; - } - - Mutex lock; - Cond cond; - - int unack = 0; - int unsafe = 0; - - while (1) { - if (time_to_stop()) break; - - // read or write? - bool write = (rand() % 100) < wrpc; - - // choose object - double r = drand48(); // [0..1) - long o; - if (write) { - o = (long)trunc(pow(r, wskew) * (double)nobj); // exponentially skew towards 0 - int pnoremap = (long)(r * 100.0); - if (pnoremap >= overlappc) - o = (o*prime) % nobj; // remap - } else { - o = (long)trunc(pow(r, rskew) * (double)nobj); // exponentially skew towards 0 - } - object_t oid(0x1000, o); - - ObjectLayout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, g_OSD_FileLayout.fl_pg_size); - - client->client_lock.Lock(); - utime_t start = g_clock.now(); - if (write) { - dout(10) << "write to " << oid << dendl; - client->objecter->write(oid, 0, osize, layout, bl, - new C_Ref(lock, cond, &unack), - new C_Ref(lock, cond, &unsafe)); - } else { - dout(10) << "read from " << oid << dendl; - bufferlist inbl; - client->objecter->read(oid, 0, osize, layout, &inbl, - new C_Ref(lock, cond, &unack)); - } - client->client_lock.Unlock(); - - lock.Lock(); - while (unack > 0) { - dout(20) << "waiting for " << unack << " unack" << dendl; - cond.Wait(lock); - } - lock.Unlock(); - - utime_t lat = g_clock.now(); - lat -= start; - if (client_logger) { - if (write) - client_logger->favg("owrlat", lat); - else - client_logger->favg("ordlat", lat); - } - } - - - lock.Lock(); - while (unsafe > 0) { - dout(10) << "waiting for " << unsafe << " unsafe" << dendl; - cond.Wait(lock); - } - lock.Unlock(); - return 0; -} - - - - - -int SyntheticClient::read_random(string& fn, int size, int rdsize) // size is in MB, wrsize in bytes -{ - __uint64_t chunks = (__uint64_t)size * 
(__uint64_t)(1024*1024) / (__uint64_t)rdsize; - - int fd = client->open(fn.c_str(), O_RDWR); - dout(5) << "reading from " << fn << " fd " << fd << dendl; - - // dout(0) << "READING FROM " << fn << " fd " << fd << dendl; - - // dout(0) << "filename " << fn << " size:" << size << " read size|" << rdsize << "|" << "\ chunks: |" << chunks <<"|" << dendl; - - if (fd < 0) return fd; - int offset = 0; - char * buf = NULL; - - for (unsigned i=0; i<2000; i++) { - if (time_to_stop()) break; - - bool read=false; - - time_t seconds; - time( &seconds); - srand(seconds); - - // use rand instead ?? - double x = drand48(); - - //dout(0) << "RANDOM NUMBER RETURN |" << x << "|" << dendl; - - if ( x < 0.5) - { - //dout(0) << "DECIDED TO READ " << x << dendl; - buf = new char[rdsize]; - memset(buf, 1, rdsize); - read=true; - } - else - { - // dout(0) << "DECIDED TO WRITE " << x << dendl; - buf = new char[rdsize+100]; // 1 MB - memset(buf, 7, rdsize); - } - - //double y = drand48() ; - - //dout(0) << "OFFSET is |" << offset << "| chunks |" << chunks<< dendl; - - if ( read) - { - offset=(rand())%(chunks+1); - dout(2) << "reading block " << offset << "/" << chunks << dendl; - - int r = client->read(fd, buf, rdsize, - offset*rdsize); - if (r < rdsize) { - dout(1) << "read_file got r = " << r << ", probably end of file" << dendl; - } - } - else - { - dout(2) << "writing block " << offset << "/" << chunks << dendl; - - // fill buf with a 16 byte fingerprint - // 64 bits : file offset - // 64 bits : client id - // = 128 bits (16 bytes) - - //if (true ) - //{ - //int count = rand()%10; - - //for ( int j=0;jget_nodeid(); - p++; - } - - client->write(fd, buf, rdsize, - offset*rdsize); - //} - //} - } - - // verify fingerprint - if ( read ) - { - int bad = 0; - __int64_t *p = (__int64_t*)buf; - __int64_t readoff, readclient; - while ((char*)p + 32 < buf + rdsize) { - readoff = *p; - __int64_t wantoff = offset*rdsize + (__int64_t)((char*)p - buf); - p++; - readclient = *p; - p++; - if (readoff != 
wantoff || - readclient != client->get_nodeid()) { - if (!bad) - dout(0) << "WARNING: wrong data from OSD, block says fileoffset=" << readoff << " client=" << readclient - << ", should be offset " << wantoff << " clietn " << client->get_nodeid() - << dendl; - bad++; - } - } - if (bad) - dout(0) << " + " << (bad-1) << " other bad 16-byte bits in this block" << dendl; - } - } - - client->close(fd); - delete[] buf; - - return 0; -} - - -//#include -//#include - -int normdist(int min, int max, int stdev) /* specifies input values */; -//main() -//{ - // for ( int i=0; i < 10; i++ ) - // normdist ( 0 , 10, 1 ); - -//} - - -int normdist(int min, int max, int stdev) /* specifies input values */ -{ - /* min: Minimum value; max: Maximum value; stdev: degree of deviation */ - - //int min, max, stdev; { - time_t seconds; - time( &seconds); - srand(seconds); - - int range, iterate, result; - /* declare range, iterate and result as integers, to avoid the need for - floating point math*/ - - result = 0; - /* ensure result is initialized to 0 */ - - range = max -min; - /* calculate range of possible values between the max and min values */ - - iterate = range / stdev; - /* this number of iterations ensures the proper shape of the resulting - curve */ - - stdev += 1; /* compensation for integer vs. 
floating point math */ - for (int c = iterate; c != 0; c--) /* loop through iterations */ - { - // result += (uniform (1, 100) * stdev) / 100; /* calculate and - result += ( (rand()%100 + 1) * stdev) / 100; - // printf("result=%d\n", result ); - } - printf("\n final result=%d\n", result ); - return result + min; /* send final result back */ -} - -int SyntheticClient::read_random_ex(string& fn, int size, int rdsize) // size is in MB, wrsize in bytes -{ - __uint64_t chunks = (__uint64_t)size * (__uint64_t)(1024*1024) / (__uint64_t)rdsize; - - int fd = client->open(fn.c_str(), O_RDWR); - dout(5) << "reading from " << fn << " fd " << fd << dendl; - - // dout(0) << "READING FROM " << fn << " fd " << fd << dendl; - - // dout(0) << "filename " << fn << " size:" << size << " read size|" << rdsize << "|" << "\ chunks: |" << chunks <<"|" << dendl; - - if (fd < 0) return fd; - int offset = 0; - char * buf = NULL; - - for (unsigned i=0; i<2000; i++) { - if (time_to_stop()) break; - - bool read=false; - - time_t seconds; - time( &seconds); - srand(seconds); - - // use rand instead ?? 
- double x = drand48(); - - //dout(0) << "RANDOM NUMBER RETURN |" << x << "|" << dendl; - - if ( x < 0.5) - { - //dout(0) << "DECIDED TO READ " << x << dendl; - buf = new char[rdsize]; - memset(buf, 1, rdsize); - read=true; - } - else - { - // dout(0) << "DECIDED TO WRITE " << x << dendl; - buf = new char[rdsize+100]; // 1 MB - memset(buf, 7, rdsize); - } - - //double y = drand48() ; - - //dout(0) << "OFFSET is |" << offset << "| chunks |" << chunks<< dendl; - - if ( read) - { - //offset=(rand())%(chunks+1); - - /* if ( chunks > 10000 ) - offset= normdist( 0 , chunks/1000 , 5 )*1000; - else if ( chunks > 1000 ) - offset= normdist( 0 , chunks/100 , 5 )*100; - else if ( chunks > 100 ) - offset= normdist( 0 , chunks/20 , 5 )*20;*/ - - - dout(2) << "reading block " << offset << "/" << chunks << dendl; - - int r = client->read(fd, buf, rdsize, - offset*rdsize); - if (r < rdsize) { - dout(1) << "read_file got r = " << r << ", probably end of file" << dendl; - } - } - else - { - dout(2) << "writing block " << offset << "/" << chunks << dendl; - - // fill buf with a 16 byte fingerprint - // 64 bits : file offset - // 64 bits : client id - // = 128 bits (16 bytes) - - //if (true ) - //{ - int count = rand()%10; - - for ( int j=0;jget_nodeid(); - p++; - } - - client->write(fd, buf, rdsize, - offset*rdsize); - } - //} - } - - // verify fingerprint - if ( read ) - { - int bad = 0; - __int64_t *p = (__int64_t*)buf; - __int64_t readoff, readclient; - while ((char*)p + 32 < buf + rdsize) { - readoff = *p; - __int64_t wantoff = offset*rdsize + (__int64_t)((char*)p - buf); - p++; - readclient = *p; - p++; - if (readoff != wantoff || - readclient != client->get_nodeid()) { - if (!bad) - dout(0) << "WARNING: wrong data from OSD, block says fileoffset=" << readoff << " client=" << readclient - << ", should be offset " << wantoff << " clietn " << client->get_nodeid() - << dendl; - bad++; - } - } - if (bad) - dout(0) << " + " << (bad-1) << " other bad 16-byte bits in this block" << 
dendl; - } - } - - client->close(fd); - delete[] buf; - - return 0; -} - - -int SyntheticClient::random_walk(int num_req) -{ - int left = num_req; - - //dout(1) << "random_walk() will do " << left << " ops" << dendl; - - init_op_dist(); // set up metadata op distribution - - while (left > 0) { - left--; - - if (time_to_stop()) break; - - // ascend? - if (cwd.depth() && !roll_die(::pow((double).9, (double)cwd.depth()))) { - dout(DBL) << "die says up" << dendl; - up(); - continue; - } - - // descend? - if (.9*roll_die(::pow((double).9,(double)cwd.depth())) && subdirs.size()) { - string s = get_random_subdir(); - cwd.push_dentry( s ); - dout(DBL) << "cd " << s << " -> " << cwd << dendl; - clear_dir(); - continue; - } - - int op = 0; - filepath path; - - if (contents.empty() && roll_die(.3)) { - if (did_readdir) { - dout(DBL) << "empty dir, up" << dendl; - up(); - } else - op = MDS_OP_READDIR; - } else { - op = op_dist.sample(); - } - //dout(DBL) << "op is " << op << dendl; - - int r = 0; - - // do op - if (op == MDS_OP_UNLINK) { - if (contents.empty()) - op = MDS_OP_READDIR; - else - r = client->unlink( get_random_sub() ); // will fail on dirs - } - - if (op == MDS_OP_RENAME) { - if (contents.empty()) - op = MDS_OP_READDIR; - else { - r = client->rename( get_random_sub(), make_sub("ren") ); - } - } - - if (op == MDS_OP_MKDIR) { - r = client->mkdir( make_sub("mkdir"), 0755); - } - - if (op == MDS_OP_RMDIR) { - if (!subdirs.empty()) - r = client->rmdir( get_random_subdir() ); - else - r = client->rmdir( cwd.c_str() ); // will pbly fail - } - - if (op == MDS_OP_SYMLINK) { - } - - if (op == MDS_OP_CHMOD) { - if (contents.empty()) - op = MDS_OP_READDIR; - else - r = client->chmod( get_random_sub(), rand() & 0755 ); - } - - if (op == MDS_OP_CHOWN) { - if (contents.empty()) r = client->chown( cwd.c_str(), rand(), rand() ); - else - r = client->chown( get_random_sub(), rand(), rand() ); - } - - if (op == MDS_OP_LINK) { - } - - if (op == MDS_OP_UTIME) { - struct utimbuf b; - 
memset(&b, 1, sizeof(b)); - if (contents.empty()) - r = client->utime( cwd.c_str(), &b ); - else - r = client->utime( get_random_sub(), &b ); - } - - if (op == MDS_OP_MKNOD) { - r = client->mknod( make_sub("mknod"), 0644); - } - - if (op == MDS_OP_OPEN) { - if (contents.empty()) - op = MDS_OP_READDIR; - else { - r = client->open( get_random_sub(), O_RDONLY ); - if (r > 0) { - assert(open_files.count(r) == 0); - open_files.insert(r); - } - } - } - - if (op == MDS_OP_RELEASE) { // actually, close - if (open_files.empty()) - op = MDS_OP_STAT; - else { - int fh = get_random_fh(); - r = client->close( fh ); - if (r == 0) open_files.erase(fh); - } - } - - if (op == MDS_OP_STAT) { - struct stat st; - if (contents.empty()) { - if (did_readdir) { - if (roll_die(.1)) { - dout(DBL) << "stat in empty dir, up" << dendl; - up(); - } else { - op = MDS_OP_MKNOD; - } - } else - op = MDS_OP_READDIR; - } else - r = client->lstat(get_random_sub(), &st); - } - - if (op == MDS_OP_READDIR) { - clear_dir(); - - list c; - r = client->getdir( cwd.c_str(), c ); - - for (list::iterator it = c.begin(); - it != c.end(); - it++) { - //dout(DBL) << " got " << *it << dendl; - assert(0); - /*contents[*it] = it->second; - if (it->second && - S_ISDIR(it->second->st_mode)) - subdirs.insert(*it); - */ - } - - did_readdir = true; - } - - // errors? - if (r < 0) { - // reevaluate cwd. - //while (cwd.depth()) { - //if (client->lookup(cwd)) break; // it's in the cache - - //dout(DBL) << "r = " << r << ", client doesn't have " << cwd << ", cd .." << dendl; - dout(DBL) << "r = " << r << ", client may not have " << cwd << ", cd .." 
<< dendl; - up(); - //} - } - } - - // close files - dout(DBL) << "closing files" << dendl; - while (!open_files.empty()) { - int fh = get_random_fh(); - int r = client->close( fh ); - if (r == 0) open_files.erase(fh); - } - - dout(DBL) << "done" << dendl; - return 0; -} - - - - -void SyntheticClient::make_dir_mess(const char *basedir, int n) -{ - vector dirs; - - dirs.push_back(basedir); - dirs.push_back(basedir); - - client->mkdir(basedir, 0755); - - // motivation: - // P(dir) ~ subdirs_of(dir) + 2 - // from 5-year metadata workload paper in fast'07 - - // create dirs - for (int i=0; imkdir(dir.c_str(), 0755); - } - - -} - - - -void SyntheticClient::foo() -{ - if (1) { - // open some files - srand(0); - for (int i=0; i<20; i++) { - int s = 5; - int a = rand() % s; - int b = rand() % s; - int c = rand() % s; - char src[80]; - sprintf(src, "syn.0.0/dir.%d/dir.%d/file.%d", a, b, c); - //int fd = - client->open(src, O_RDONLY); - } - - return; - } - - if (0) { - // rename fun - for (int i=0; i<100; i++) { - int s = 5; - int a = rand() % s; - int b = rand() % s; - int c = rand() % s; - int d = rand() % s; - int e = rand() % s; - int f = rand() % s; - char src[80]; - char dst[80]; - sprintf(src, "syn.0.0/dir.%d/dir.%d/file.%d", a, b, c); - sprintf(dst, "syn.0.0/dir.%d/dir.%d/file.%d", d, e, f); - client->rename(src, dst); - } - return; - } - - if (1) { - // link fun - srand(0); - for (int i=0; i<100; i++) { - int s = 5; - int a = rand() % s; - int b = rand() % s; - int c = rand() % s; - int d = rand() % s; - int e = rand() % s; - int f = rand() % s; - char src[80]; - char dst[80]; - sprintf(src, "syn.0.0/dir.%d/dir.%d/file.%d", a, b, c); - sprintf(dst, "syn.0.0/dir.%d/dir.%d/newlink.%d", d, e, f); - client->link(src, dst); - } - srand(0); - for (int i=0; i<100; i++) { - int s = 5; - int a = rand() % s; - int b = rand() % s; - int c = rand() % s; - int d = rand() % s; - int e = rand() % s; - int f = rand() % s; - char src[80]; - char dst[80]; - sprintf(src, 
"syn.0.0/dir.%d/dir.%d/file.%d", a, b, c); - sprintf(dst, "syn.0.0/dir.%d/dir.%d/newlink.%d", d, e, f); - client->unlink(dst); - } - - - return; - } - - // link fun - client->mknod("one", 0755); - client->mknod("two", 0755); - client->link("one", "three"); - client->mkdir("dir", 0755); - client->link("two", "/dir/twolink"); - client->link("dir/twolink", "four"); - - // unlink fun - client->mknod("a", 0644); - client->unlink("a"); - client->mknod("b", 0644); - client->link("b", "c"); - client->unlink("c"); - client->mkdir("d", 0755); - client->unlink("d"); - client->rmdir("d"); - - // rename fun - client->mknod("p1", 0644); - client->mknod("p2", 0644); - client->rename("p1","p2"); - client->mknod("p3", 0644); - client->rename("p3","p4"); - - // check dest dir ambiguity thing - client->mkdir("dir1", 0755); - client->mkdir("dir2", 0755); - client->rename("p2","dir1/p2"); - client->rename("dir1/p2","dir2/p2"); - client->rename("dir2/p2","/p2"); - - // check primary+remote link merging - client->link("p2","p2.l"); - client->link("p4","p4.l"); - client->rename("p2.l","p2"); - client->rename("p4","p4.l"); - - // check anchor updates - client->mknod("dir1/a", 0644); - client->link("dir1/a", "da1"); - client->link("dir1/a", "da2"); - client->link("da2","da3"); - client->rename("dir1/a","dir2/a"); - client->rename("dir2/a","da2"); - client->rename("da1","da2"); - client->rename("da2","da3"); - - // check directory renames - client->mkdir("dir3", 0755); - client->mknod("dir3/asdf", 0644); - client->mkdir("dir4", 0755); - client->mkdir("dir5", 0755); - client->mknod("dir5/asdf", 0644); - client->rename("dir3","dir4"); // ok - client->rename("dir4","dir5"); // fail -} - -int SyntheticClient::thrash_links(const char *basedir, int dirs, int files, int depth, int n) -{ - dout(1) << "thrash_links " << basedir << " " << dirs << " " << files << " " << depth - << " links " << n - << dendl; - - if (time_to_stop()) return 0; - - for (int k=0; krename(dst.c_str(), "/tmp") == 0) { - 
client->rename(src.c_str(), dst.c_str()); - client->rename("/tmp", src.c_str()); - } - continue; - } - - // pick a dest dir - string src = basedir; - { - char t[80]; - for (int d=0; dmknod(src.c_str(), 0755); - client->rename(src.c_str(), dst.c_str()); - break; - case 1: - client->mknod(src.c_str(), 0755); - client->unlink(dst.c_str()); - client->link(src.c_str(), dst.c_str()); - break; - case 2: client->unlink(src.c_str()); break; - case 3: client->unlink(dst.c_str()); break; - //case 4: client->mknod(src.c_str(), 0755); break; - //case 5: client->mknod(dst.c_str(), 0755); break; - } - } - return 0; - - // now link shit up - for (int i=0; ilink(file.c_str(), ln.c_str()); - } - - return 0; -} - - - - -void SyntheticClient::import_find(const char *base, const char *find, bool data) -{ - dout(1) << "import_find " << base << " from " << find << " data=" << data << dendl; - - /* use this to gather the static trace: - * - * find . -exec ls -dilsn --time-style=+%s \{\} \; - * or if it's wafl, - * find . -path ./.snapshot -prune -o -exec ls -dilsn --time-style=+%s \{\} \; - * - */ - - if (base[0] != '-') - client->mkdir(base, 0755); - - ifstream f(find); - assert(f.is_open()); - - int dirnum = 0; - - while (!f.eof()) { - uint64_t ino; - int dunno, nlink; - string modestring; - int uid, gid; - off_t size; - time_t mtime; - string filename; - f >> ino; - if (f.eof()) break; - f >> dunno; - f >> modestring; - f >> nlink; - f >> uid; - f >> gid; - f >> size; - f >> mtime; - f.seekg(1, ios::cur); - getline(f, filename); - - // ignore "." - if (filename == ".") continue; - - // remove leading ./ - assert(filename[0] == '.' && filename[1] == '/'); - filename = filename.substr(2); - - // new leading dir? 
- int sp = filename.find("/"); - if (sp < 0) dirnum++; - - //dout(0) << "leading dir " << filename << " " << dirnum << dendl; - if (dirnum % g_conf.num_client != client->get_nodeid()) { - dout(20) << "skipping leading dir " << dirnum << " " << filename << dendl; - continue; - } - - // parse the mode - assert(modestring.length() == 10); - mode_t mode = 0; - switch (modestring[0]) { - case 'd': mode |= INODE_MODE_DIR; break; - case 'l': mode |= INODE_MODE_SYMLINK; break; - default: - case '-': mode |= INODE_MODE_FILE; break; - } - if (modestring[1] == 'r') mode |= 0400; - if (modestring[2] == 'w') mode |= 0200; - if (modestring[3] == 'x') mode |= 0100; - if (modestring[4] == 'r') mode |= 040; - if (modestring[5] == 'w') mode |= 020; - if (modestring[6] == 'x') mode |= 010; - if (modestring[7] == 'r') mode |= 04; - if (modestring[8] == 'w') mode |= 02; - if (modestring[9] == 'x') mode |= 01; - - dout(20) << " mode " << modestring << " to " << oct << mode << dec << dendl; - - if (S_ISLNK(mode)) { - // target vs destination - int pos = filename.find(" -> "); - assert(pos > 0); - string link; - if (base[0] != '-') { - link = base; - link += "/"; - } - link += filename.substr(0, pos); - string target; - if (filename[pos+4] == '/') { - if (base[0] != '-') - target = base; - target += filename.substr(pos + 4); - } else { - target = filename.substr(pos + 4); - } - dout(10) << "symlink from '" << link << "' -> '" << target << "'" << dendl; - client->symlink(target.c_str(), link.c_str()); - } else { - string f; - if (base[0] != '-') { - f = base; - f += "/"; - } - f += filename; - if (S_ISDIR(mode)) { - client->mkdir(f.c_str(), mode); - } else { - int fd = client->open(f.c_str(), O_WRONLY|O_CREAT, mode & 0777); - assert(fd > 0); - client->write(fd, "", 0, size); - client->close(fd); - - //client->chmod(f.c_str(), mode & 0777); - client->chown(f.c_str(), uid, gid); - - struct utimbuf ut; - ut.modtime = mtime; - ut.actime = mtime; - client->utime(f.c_str(), &ut); - } - } - } - - 
-} - diff --git a/branches/sage/ebofs2/client/SyntheticClient.h b/branches/sage/ebofs2/client/SyntheticClient.h deleted file mode 100644 index ce09b18addfb2..0000000000000 --- a/branches/sage/ebofs2/client/SyntheticClient.h +++ /dev/null @@ -1,241 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __SYNTHETICCLIENT_H -#define __SYNTHETICCLIENT_H - -#include - -#include "Client.h" -#include "include/Distribution.h" - -#include "Trace.h" - -#define SYNCLIENT_MODE_RANDOMWALK 1 -#define SYNCLIENT_MODE_FULLWALK 2 -#define SYNCLIENT_MODE_REPEATWALK 3 - -#define SYNCLIENT_MODE_MAKEDIRMESS 7 -#define SYNCLIENT_MODE_MAKEDIRS 8 // dirs files depth -#define SYNCLIENT_MODE_STATDIRS 9 // dirs files depth -#define SYNCLIENT_MODE_READDIRS 10 // dirs files depth - -#define SYNCLIENT_MODE_MAKEFILES 11 // num count private -#define SYNCLIENT_MODE_MAKEFILES2 12 // num count private -#define SYNCLIENT_MODE_CREATESHARED 13 // num -#define SYNCLIENT_MODE_OPENSHARED 14 // num count - -#define SYNCLIENT_MODE_WRITEFILE 20 -#define SYNCLIENT_MODE_READFILE 21 -#define SYNCLIENT_MODE_WRITEBATCH 22 -#define SYNCLIENT_MODE_WRSHARED 23 -#define SYNCLIENT_MODE_READSHARED 24 -#define SYNCLIENT_MODE_RDWRRANDOM 25 -#define SYNCLIENT_MODE_RDWRRANDOM_EX 26 - -#define SYNCLIENT_MODE_LINKTEST 27 - -#define SYNCLIENT_MODE_TRACE 30 - -#define SYNCLIENT_MODE_CREATEOBJECTS 35 -#define SYNCLIENT_MODE_OBJECTRW 36 - -#define SYNCLIENT_MODE_OPENTEST 40 -#define SYNCLIENT_MODE_OPTEST 41 - -#define SYNCLIENT_MODE_ONLY 50 -#define SYNCLIENT_MODE_ONLYRANGE 51 -#define SYNCLIENT_MODE_EXCLUDE 52 -#define SYNCLIENT_MODE_EXCLUDERANGE 53 - 
-#define SYNCLIENT_MODE_UNTIL 55 -#define SYNCLIENT_MODE_SLEEPUNTIL 56 - -#define SYNCLIENT_MODE_RANDOMSLEEP 61 -#define SYNCLIENT_MODE_SLEEP 62 - -#define SYNCLIENT_MODE_TRUNCATE 200 - -#define SYNCLIENT_MODE_FOO 100 -#define SYNCLIENT_MODE_THRASHLINKS 101 - -#define SYNCLIENT_MODE_IMPORTFIND 300 - - - -void parse_syn_options(vector& args); - -class SyntheticClient { - Client *client; - - pthread_t thread_id; - - Distribution op_dist; - - void init_op_dist(); - int get_op(); - - - filepath cwd; - map contents; - set subdirs; - bool did_readdir; - set open_files; - - void up(); - - void clear_dir() { - contents.clear(); - subdirs.clear(); - did_readdir = false; - } - - int get_random_fh() { - int r = rand() % open_files.size(); - set::iterator it = open_files.begin(); - while (r--) it++; - return *it; - } - - - filepath n1; - const char *get_random_subdir() { - assert(!subdirs.empty()); - int r = ((rand() % subdirs.size()) + (rand() % subdirs.size())) / 2; // non-uniform distn - set::iterator it = subdirs.begin(); - while (r--) it++; - - n1 = cwd; - n1.push_dentry( *it ); - return n1.get_path().c_str(); - } - filepath n2; - const char *get_random_sub() { - assert(!contents.empty()); - int r = ((rand() % contents.size()) + (rand() % contents.size())) / 2; // non-uniform distn - if (cwd.depth() && cwd.last_dentry().length()) - r += cwd.last_dentry().c_str()[0]; // slightly permuted - r %= contents.size(); - - map::iterator it = contents.begin(); - while (r--) it++; - - n2 = cwd; - n2.push_dentry( it->first ); - return n2.get_path().c_str(); - } - - filepath sub; - char sub_s[50]; - const char *make_sub(char *base) { - sprintf(sub_s, "%s.%d", base, rand() % 100); - string f = sub_s; - sub = cwd; - sub.push_dentry(f); - return sub.c_str(); - } - - public: - SyntheticClient(Client *client); - - int start_thread(); - int join_thread(); - - int run(); - - bool run_me() { - if (run_only >= 0) { - if (run_only == client->get_nodeid()) - return true; - else - return false; - 
} - return true; - } - void did_run_me() { - run_only = -1; - run_until = utime_t(); - } - - // run() will do one of these things: - list modes; - list sargs; - list iargs; - utime_t run_start; - utime_t run_until; - - int run_only; - int exclude; - - string get_sarg(int seq); - int get_iarg() { - int i = iargs.front(); - iargs.pop_front(); - return i; - } - - bool time_to_stop() { - utime_t now = g_clock.now(); - if (0) cout << "time_to_stop .. now " << now - << " until " << run_until - << " start " << run_start - << std::endl; - if (run_until.sec() && now > run_until) - return true; - else - return false; - } - - string compose_path(string& prefix, char *rest) { - return prefix + rest; - } - - int full_walk(string& fromdir); - int random_walk(int n); - - int make_dirs(const char *basedir, int dirs, int files, int depth); - int stat_dirs(const char *basedir, int dirs, int files, int depth); - int read_dirs(const char *basedir, int dirs, int files, int depth); - int make_files(int num, int count, int priv, bool more); - int link_test(); - - int create_shared(int num); - int open_shared(int num, int count); - - int write_file(string& fn, int mb, int chunk); - int write_batch(int nfile, int mb, int chunk); - int read_file(string& fn, int mb, int chunk, bool ignoreprint=false); - - int create_objects(int nobj, int osize, int inflight); - int object_rw(int nobj, int osize, int wrpc, int overlap, - double rskew, double wskew); - - int read_random(string& fn, int mb, int chunk); - int read_random_ex(string& fn, int mb, int chunk); - - int clean_dir(string& basedir); - - int play_trace(Trace& t, string& prefix, bool metadata_only=false); - - void make_dir_mess(const char *basedir, int n); - void foo(); - - int thrash_links(const char *basedir, int dirs, int files, int depth, int n); - - void import_find(const char *basedir, const char *find, bool writedata); - -}; - -#endif diff --git a/branches/sage/ebofs2/client/Trace.cc b/branches/sage/ebofs2/client/Trace.cc deleted 
file mode 100644 index 31bb1c4cf5c4a..0000000000000 --- a/branches/sage/ebofs2/client/Trace.cc +++ /dev/null @@ -1,83 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "Trace.h" - -#include -#include -#include -#include -using namespace __gnu_cxx; - -#include "common/Mutex.h" - -#include "config.h" - -#include -#include -#include - - - - - -void Trace::start() -{ - //cout << "start" << std::endl; - delete fs; - - fs = new ifstream(); - fs->open(filename); - if (!fs->is_open()) { - generic_dout(0) << "** unable to open trace file " << filename << dendl; - assert(0); - } - generic_dout(2) << "opened traced file '" << filename << "'" << dendl; - - // read first line - getline(*fs, line); - //cout << "first line is " << line << std::endl; - - _line = 1; -} - -const char *Trace::peek_string(char *buf, const char *prefix) -{ - //if (prefix) cout << "prefix '" << prefix << "' line '" << line << "'" << std::endl; - if (prefix && - strstr(line.c_str(), "/prefix") == line.c_str()) { - strcpy(buf, prefix); - strcpy(buf + strlen(prefix), - line.c_str() + strlen("/prefix")); - } else { - strcpy(buf, line.c_str()); - } - return buf; -} - - -const char *Trace::get_string(char *buf, const char *prefix) -{ - peek_string(buf, prefix); - - //cout << "buf is " << buf << std::endl; - // read next line (and detect eof early) - _line++; - getline(*fs, line); - //cout << "next line is " << line << std::endl; - - return buf; -} diff --git a/branches/sage/ebofs2/client/Trace.h b/branches/sage/ebofs2/client/Trace.h deleted file mode 100644 index 97821f4e95e56..0000000000000 --- 
a/branches/sage/ebofs2/client/Trace.h +++ /dev/null @@ -1,63 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __CLIENT_TRACE_H -#define __CLIENT_TRACE_H - -#include -#include -#include -#include -using std::list; -using std::string; -using std::ifstream; - -/* - - this class is more like an iterator over a constant tokenlist (which - is protected by a mutex, see Trace.cc) - - */ - -class Trace { - int _line; - const char *filename; - ifstream *fs; - string line; - - public: - Trace(const char* f) : filename(f), fs(0) {} - ~Trace() { - delete fs; - } - - int get_line() { return _line; } - - void start(); - - const char *peek_string(char *buf, const char *prefix); - const char *get_string(char *buf, const char *prefix); - - __int64_t get_int() { - char buf[20]; - return atoll(get_string(buf, 0)); - } - bool end() { - return !fs || fs->eof(); - //return _cur == _end; - } -}; - -#endif diff --git a/branches/sage/ebofs2/client/fuse.cc b/branches/sage/ebofs2/client/fuse.cc deleted file mode 100644 index 64198dc41df51..0000000000000 --- a/branches/sage/ebofs2/client/fuse.cc +++ /dev/null @@ -1,306 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -/* - FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi - - This program can be distributed under the terms of the GNU GPL. - See the file COPYING. -*/ - - -// fuse crap -#ifdef linux -/* For pread()/pwrite() */ -#define _XOPEN_SOURCE 500 -#endif - -#define FUSE_USE_VERSION 26 - -#include -#include -#include -#include -#include -#include -#include -#include - - -// ceph stuff -#include "include/types.h" - -#include "Client.h" - -#include "config.h" - -// globals -static Client *client; // the ceph client - - - -// ------ -// fuse hooks - -static int ceph_getattr(const char *path, struct stat *stbuf) -{ - return client->lstat(path, stbuf); -} - -static int ceph_readlink(const char *path, char *buf, size_t size) -{ - int res; - - res = client->readlink(path, buf, size - 1); - if (res < 0) return res; - - buf[res] = '\0'; - return 0; -} - -static int ceph_mknod(const char *path, mode_t mode, dev_t rdev) -{ - return client->mknod(path, mode); -} - -static int ceph_mkdir(const char *path, mode_t mode) -{ - return client->mkdir(path, mode); -} - -static int ceph_unlink(const char *path) -{ - return client->unlink(path); -} - -static int ceph_rmdir(const char *path) -{ - return client->rmdir(path); -} - -static int ceph_symlink(const char *from, const char *to) -{ - return client->symlink(from, to); -} - -static int ceph_rename(const char *from, const char *to) -{ - return client->rename(from, to); -} - -static int ceph_link(const char *from, const char *to) -{ - return client->link(from, to); -} - -static int ceph_chmod(const char *path, mode_t mode) -{ - return client->chmod(path, mode); -} - -static int ceph_chown(const char *path, uid_t uid, gid_t gid) -{ - return client->chown(path, uid, gid); -} - -static int ceph_truncate(const char *path, off_t size) -{ - return client->truncate(path, size); -} - -static int ceph_utime(const char *path, struct utimbuf *buf) -{ - return client->utime(path, buf); -} - - -// ------------------ -// 
file i/o - -static int ceph_open(const char *path, struct fuse_file_info *fi) -{ - int res; - - res = client->open(path, fi->flags, 0); - if (res < 0) return res; - fi->fh = res; - return 0; // fuse wants 0 onsucess -} - -static int ceph_read(const char *path, char *buf, size_t size, off_t offset, - struct fuse_file_info *fi) -{ - int fd = fi->fh; - return client->read(fd, buf, size, offset); -} - -static int ceph_write(const char *path, const char *buf, size_t size, - off_t offset, struct fuse_file_info *fi) -{ - int fd = fi->fh; - return client->write(fd, buf, size, offset); -} - -static int ceph_flush(const char *path, struct fuse_file_info *fi) -{ - //int fh = fi->fh; - //return client->flush(fh); - return 0; -} - -static int ceph_statfs(const char *path, struct statvfs *stbuf) -{ - return client->statfs(path, stbuf); -} - -static int ceph_release(const char *path, struct fuse_file_info *fi) -{ - int fd = fi->fh; - int r = client->close(fd); // close the file - return r; -} - -static int ceph_fsync(const char *path, int isdatasync, - struct fuse_file_info *fi) -{ - int fd = fi->fh; - return client->fsync(fd, isdatasync ? true:false); -} - - -// --------------------- -// directory i/o - -static int ceph_opendir(const char *path, struct fuse_file_info *fi) -{ - DIR *dirp; - int r = client->opendir(path, &dirp); - if (r < 0) return r; - fi->fh = (uint64_t)(void*)dirp; - return 0; -} - -static int ceph_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t off, fuse_file_info *fi) -{ - DIR *dirp = (DIR*)fi->fh; - - client->seekdir(dirp, off); - - int res = 0; - struct dirent de; - struct stat st; - int stmask = 0; - while (res == 0) { - int r = client->readdirplus_r(dirp, &de, &st, &stmask); - if (r != 0) break; - int stneed = STAT_MASK_INO | STAT_MASK_TYPE; - res = filler(buf, - de.d_name, - ((stmask & stneed) == stneed) ? 
&st:0, - client->telldir(dirp)); - } - return 0; -} - -static int ceph_releasedir(const char *path, struct fuse_file_info *fi) -{ - DIR *dirp = (DIR*)fi->fh; - int r = client->closedir(dirp); // close the file - return r; -} - - - - - -static struct fuse_operations ceph_oper = { - getattr: ceph_getattr, - readlink: ceph_readlink, - getdir: 0, - mknod: ceph_mknod, - mkdir: ceph_mkdir, - unlink: ceph_unlink, - rmdir: ceph_rmdir, - symlink: ceph_symlink, - rename: ceph_rename, - link: ceph_link, - chmod: ceph_chmod, - chown: ceph_chown, - truncate: ceph_truncate, - utime: ceph_utime, - open: ceph_open, - read: ceph_read, - write: ceph_write, - statfs: ceph_statfs, - flush: ceph_flush, - release: ceph_release, - fsync: ceph_fsync, - setxattr: 0, - getxattr: 0, - listxattr: 0, - removexattr: 0, - opendir: ceph_opendir, - readdir: ceph_readdir, - releasedir: ceph_releasedir -}; - - -int ceph_fuse_main(Client *c, int argc, char *argv[]) -{ - // init client - client = c; - - // set up fuse argc/argv - int newargc = 0; - char **newargv = (char **) malloc((argc + 10) * sizeof(char *)); - newargv[newargc++] = argv[0]; - - // allow other (all!) users to see my file system - // NOTE: echo user_allow_other >> /etc/fuse.conf - // NB: seems broken on Darwin -#ifndef DARWIN - newargv[newargc++] = "-o"; - newargv[newargc++] = "allow_other"; -#endif // DARWIN - - // use inos - newargv[newargc++] = "-o"; - newargv[newargc++] = "use_ino"; - - // large reads, direct_io (no kernel cachine) - //newargv[newargc++] = "-o"; - //newargv[newargc++] = "large_read"; - if (g_conf.fuse_direct_io) { - newargv[newargc++] = "-o"; - newargv[newargc++] = "direct_io"; - } - - // disable stupid fuse unlink hiding thing - newargv[newargc++] = "-o"; - newargv[newargc++] = "hard_remove"; - - // force into foreground - // -> we can watch stdout this way!! - newargv[newargc++] = "-f"; - - // copy rest of cmdline (hopefully, the mount point!) 
- for (int argctr = 1; argctr < argc; argctr++) newargv[newargc++] = argv[argctr]; - - // go fuse go - cout << "ok, calling fuse_main" << std::endl; - int r = fuse_main(newargc, newargv, &ceph_oper, 0); - return r; -} diff --git a/branches/sage/ebofs2/client/fuse.h b/branches/sage/ebofs2/client/fuse.h deleted file mode 100644 index dfacbaa4fdd85..0000000000000 --- a/branches/sage/ebofs2/client/fuse.h +++ /dev/null @@ -1,24 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -/* ceph_fuse_main - * - start up fuse glue, attached to Client* cl. - * - argc, argv should include a mount point, and - * any weird fuse options you want. by default, - * we will put fuse in the foreground so that it - * won't fork and we can see stdout. - */ -int ceph_fuse_main(Client *cl, int argc, char *argv[]); diff --git a/branches/sage/ebofs2/client/fuse_ll.cc b/branches/sage/ebofs2/client/fuse_ll.cc deleted file mode 100644 index f1f92b0cd01b3..0000000000000 --- a/branches/sage/ebofs2/client/fuse_ll.cc +++ /dev/null @@ -1,397 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#define FUSE_USE_VERSION 26 - -#include -#include -#include -#include -#include -#include -#include -#include - -// ceph -#include "include/types.h" -#include "Client.h" -#include "config.h" - -static Client *client; - - -static void ceph_ll_lookup(fuse_req_t req, fuse_ino_t parent, const char *name) -{ - struct fuse_entry_param fe; - int stmask; - - memset(&fe, 0, sizeof(fe)); - stmask = client->ll_lookup(parent, name, &fe.attr); - if (stmask >= 0) { - fe.ino = fe.attr.st_ino; - fuse_reply_entry(req, &fe); - } else { - fuse_reply_err(req, ENOENT); - } -} - -static void ceph_ll_forget(fuse_req_t req, fuse_ino_t ino, long unsigned nlookup) -{ - client->ll_forget(ino, nlookup); - fuse_reply_none(req); -} - -static void ceph_ll_getattr(fuse_req_t req, fuse_ino_t ino, - struct fuse_file_info *fi) -{ - struct stat stbuf; - - (void) fi; - - if (client->ll_getattr(ino, &stbuf) == 0) - fuse_reply_attr(req, &stbuf, 0); - else - fuse_reply_err(req, ENOENT); -} - -static void ceph_ll_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, - int to_set, struct fuse_file_info *fi) -{ - int r = client->ll_setattr(ino, attr, to_set); - if (r == 0) - fuse_reply_attr(req, attr, 0); - else - fuse_reply_err(req, -r); -} - -static void ceph_ll_opendir(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) -{ - void *dirp; - int r = client->ll_opendir(ino, &dirp); - if (r >= 0) { - fi->fh = (long)dirp; - fuse_reply_open(req, fi); - } else { - fuse_reply_err(req, -r); - } -} - -static void ceph_ll_readlink(fuse_req_t req, fuse_ino_t ino) -{ - const char *value; - int r = client->ll_readlink(ino, &value); - if (r == 0) - fuse_reply_readlink(req, value); - else - fuse_reply_err(req, -r); -} - -static void ceph_ll_mknod(fuse_req_t req, fuse_ino_t parent, const char *name, - mode_t mode, dev_t rdev) -{ - struct fuse_entry_param fe; - memset(&fe, 0, sizeof(fe)); - - int r = client->ll_mknod(parent, name, mode, rdev, &fe.attr); - if (r == 0) { - fe.ino = fe.attr.st_ino; - 
fuse_reply_entry(req, &fe); - } else { - fuse_reply_err(req, -r); - } -} - -static void ceph_ll_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name, - mode_t mode) -{ - struct fuse_entry_param fe; - memset(&fe, 0, sizeof(fe)); - - int r = client->ll_mkdir(parent, name, mode, &fe.attr); - if (r == 0) { - fe.ino = fe.attr.st_ino; - fuse_reply_entry(req, &fe); - } else { - fuse_reply_err(req, -r); - } -} - -static void ceph_ll_unlink(fuse_req_t req, fuse_ino_t parent, const char *name) -{ - int r = client->ll_unlink(parent, name); - fuse_reply_err(req, -r); -} - -static void ceph_ll_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name) -{ - int r = client->ll_rmdir(parent, name); - fuse_reply_err(req, -r); -} - -static void ceph_ll_symlink(fuse_req_t req, const char *existing, fuse_ino_t parent, const char *name) -{ - struct fuse_entry_param fe; - memset(&fe, 0, sizeof(fe)); - - int r = client->ll_symlink(parent, name, existing, &fe.attr); - if (r == 0) { - fe.ino = fe.attr.st_ino; - fuse_reply_entry(req, &fe); - } else { - fuse_reply_err(req, -r); - } -} - -static void ceph_ll_rename(fuse_req_t req, fuse_ino_t parent, const char *name, - fuse_ino_t newparent, const char *newname) -{ - int r = client->ll_rename(parent, name, newparent, newname); - fuse_reply_err(req, -r); -} - -static void ceph_ll_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t newparent, - const char *newname) -{ - struct fuse_entry_param fe; - memset(&fe, 0, sizeof(fe)); - - int r = client->ll_link(ino, newparent, newname, &fe.attr); - if (r == 0) { - fe.ino = fe.attr.st_ino; - fuse_reply_entry(req, &fe); - } else { - fuse_reply_err(req, -r); - } -} - -static void ceph_ll_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) -{ - Fh *fh; - int r = client->ll_open(ino, fi->flags, &fh); - if (r == 0) { - fi->fh = (long)fh; - fuse_reply_open(req, fi); - } else { - fuse_reply_err(req, -r); - } -} - -static void ceph_ll_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, - 
struct fuse_file_info *fi) -{ - Fh *fh = (Fh*)fi->fh; - bufferlist bl; - int r = client->ll_read(fh, off, size, &bl); - if (r >= 0) - fuse_reply_buf(req, bl.c_str(), bl.length()); - else - fuse_reply_err(req, -r); -} - -static void ceph_ll_write(fuse_req_t req, fuse_ino_t ino, const char *buf, - size_t size, off_t off, struct fuse_file_info *fi) -{ - Fh *fh = (Fh*)fi->fh; - int r = client->ll_write(fh, off, size, buf); - if (r >= 0) - fuse_reply_write(req, r); - else - fuse_reply_err(req, -r); -} - -static void ceph_ll_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) -{ - // NOOP - fuse_reply_err(req, 0); -} - -static void ceph_ll_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) -{ - Fh *fh = (Fh*)fi->fh; - int r = client->ll_release(fh); - fuse_reply_err(req, -r); -} - -static void ceph_ll_fsync(fuse_req_t req, fuse_ino_t ino, int datasync, - struct fuse_file_info *fi) -{ - -} - -static void ceph_ll_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, - off_t off, struct fuse_file_info *fi) -{ - (void) fi; - - // buffer - char *buf; - size_t pos = 0; - - buf = new char[size]; - if (!buf) { - fuse_reply_err(req, ENOMEM); - return; - } - - DIR *dirp = (DIR*)fi->fh; - client->seekdir(dirp, off); - - struct dirent de; - struct stat st; - memset(&st, 0, sizeof(st)); - - while (1) { - int r = client->readdir_r(dirp, &de); - if (r < 0) break; - st.st_ino = de.d_ino; - st.st_mode = DT_TO_MODE(de.d_type); - - off_t off = client->telldir(dirp); - size_t entrysize = fuse_add_direntry(req, buf + pos, size - pos, - de.d_name, &st, off); - - /* - cout << "ceph_ll_readdir added " << de.d_name << " at " << pos << " len " << entrysize - << " (buffer size is " << size << ")" - << " .. off = " << off - << std::endl; - */ - - if (entrysize > size - pos) - break; // didn't fit, done for now. 
- pos += entrysize; - } - - fuse_reply_buf(req, buf, pos); - delete[] buf; -} - -static void ceph_ll_releasedir(fuse_req_t req, fuse_ino_t ino, - struct fuse_file_info *fi) -{ - DIR *dirp = (DIR*)fi->fh; - client->ll_releasedir(dirp); - fuse_reply_err(req, 0); -} - -static void ceph_ll_create(fuse_req_t req, fuse_ino_t parent, const char *name, - mode_t mode, struct fuse_file_info *fi) -{ - struct fuse_entry_param fe; - memset(&fe, 0, sizeof(fe)); - Fh *fh; - int r = client->ll_create(parent, name, mode, fi->flags, &fe.attr, &fh); - if (r == 0) { - fi->fh = (long)fh; - fe.ino = fe.attr.st_ino; - fuse_reply_create(req, &fe, fi); - } else { - fuse_reply_err(req, -r); - } -} - -static void ceph_ll_statfs(fuse_req_t req, fuse_ino_t ino) -{ - struct statvfs stbuf; - int r = client->ll_statfs(ino, &stbuf); - if (r == 0) - fuse_reply_statfs(req, &stbuf); - else - fuse_reply_err(req, -r); -} - -static struct fuse_lowlevel_ops ceph_ll_oper = { - init: 0, - destroy: 0, - lookup: ceph_ll_lookup, - forget: ceph_ll_forget, - getattr: ceph_ll_getattr, - setattr: ceph_ll_setattr, - readlink: ceph_ll_readlink, - mknod: ceph_ll_mknod, - mkdir: ceph_ll_mkdir, - unlink: ceph_ll_unlink, - rmdir: ceph_ll_rmdir, - symlink: ceph_ll_symlink, - rename: ceph_ll_rename, - link: ceph_ll_link, - open: ceph_ll_open, - read: ceph_ll_read, - write: ceph_ll_write, - flush: ceph_ll_flush, - release: ceph_ll_release, - fsync: ceph_ll_fsync, - opendir: ceph_ll_opendir, - readdir: ceph_ll_readdir, - releasedir: ceph_ll_releasedir, - fsyncdir: 0, - statfs: ceph_ll_statfs, - setxattr: 0, - getxattr: 0, - listxattr: 0, - removexattr: 0, - access: 0, - create: ceph_ll_create, - getlk: 0, - setlk: 0, - bmap: 0 -}; - -int ceph_fuse_ll_main(Client *c, int argc, char *argv[]) -{ - cout << "ceph_fuse_ll_main starting fuse on pid " << getpid() << std::endl; - - client = c; - - // set up fuse argc/argv - int newargc = 0; - char **newargv = (char **) malloc((argc + 10) * sizeof(char *)); - newargv[newargc++] = 
argv[0]; - newargv[newargc++] = "-f"; // stay in foreground - - newargv[newargc++] = "-o"; - newargv[newargc++] = "allow_other"; - - for (int argctr = 1; argctr < argc; argctr++) newargv[newargc++] = argv[argctr]; - - // go go gadget fuse - struct fuse_args args = FUSE_ARGS_INIT(newargc, newargv); - struct fuse_chan *ch; - char *mountpoint; - int err = -1; - - if (fuse_parse_cmdline(&args, &mountpoint, NULL, NULL) != -1 && - (ch = fuse_mount(mountpoint, &args)) != NULL) { - struct fuse_session *se; - - // init fuse - se = fuse_lowlevel_new(&args, &ceph_ll_oper, sizeof(ceph_ll_oper), - NULL); - if (se != NULL) { - if (fuse_set_signal_handlers(se) != -1) { - fuse_session_add_chan(se, ch); - err = fuse_session_loop(se); - fuse_remove_signal_handlers(se); - fuse_session_remove_chan(ch); - } - fuse_session_destroy(se); - } - fuse_unmount(mountpoint, ch); - } - fuse_opt_free_args(&args); - - cout << "ceph_fuse_ll_main done, err=" << err << std::endl; - return err ? 1 : 0; -} - diff --git a/branches/sage/ebofs2/client/fuse_ll.h b/branches/sage/ebofs2/client/fuse_ll.h deleted file mode 100644 index 068969c4f7487..0000000000000 --- a/branches/sage/ebofs2/client/fuse_ll.h +++ /dev/null @@ -1,15 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -int ceph_fuse_ll_main(Client *c, int argc, char *argv[]); diff --git a/branches/sage/ebofs2/client/hadoop/CephFSInterface.cc b/branches/sage/ebofs2/client/hadoop/CephFSInterface.cc deleted file mode 100644 index 7aa8c133d370b..0000000000000 --- a/branches/sage/ebofs2/client/hadoop/CephFSInterface.cc +++ /dev/null @@ -1,789 +0,0 @@ -#include "CephFSInterface.h" - -using namespace std; - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_initializeClient - * Signature: ()J - * Initializes a ceph client. - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1initializeClient - (JNIEnv *, jobject) -{ - - dout(3) << "CephFSInterface: Initializing Ceph client:" << endl; - - // parse args from CEPH_ARGS - vector args; - env_to_vec(args); - parse_config_options(args); - - if (g_conf.clock_tare) g_clock.tare(); - - // be safe - g_conf.use_abspaths = true; - - // load monmap - MonMap monmap; - // int r = monmap.read(".ceph_monmap"); - int r = monmap.read("/cse/grads/eestolan/ceph/trunk/ceph/.ceph_monmap"); - if (r < 0) { - dout(0) << "CephFSInterface: could not find .ceph_monmap" << endl; - assert(0 && "could not find .ceph_monmap"); - // return 0; - } - assert(r >= 0); - - // start up network - rank.start_rank(); - - // start client - Client *client; - client = new Client(rank.register_entity(MSG_ADDR_CLIENT_NEW), &monmap); - client->init(); - - // mount - client->mount(); - - jlong clientp = *(jlong*)&client; - return clientp; -} - - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_copyFromLocalFile - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1copyFromLocalFile -(JNIEnv * env, jobject obj, jlong clientp, jstring j_local_path, jstring j_ceph_path) { - - dout(10) << "CephFSInterface: In copyFromLocalFile" << endl; - Client* client; - //client = (Client*) clientp; - client = *(Client**)&clientp; 
- - const char* c_local_path = env->GetStringUTFChars(j_local_path, 0); - const char* c_ceph_path = env->GetStringUTFChars(j_ceph_path, 0); - - dout(10) << "CephFSInterface: Local source file is "<< c_local_path << " and Ceph destination file is " << c_ceph_path << endl; - struct stat st; - int r = ::stat(c_local_path, &st); - assert (r == 0); - - // open the files - int fh_local = ::open(c_local_path, O_RDONLY); - int fh_ceph = client->open(c_ceph_path, O_WRONLY|O_CREAT|O_TRUNC); - assert (fh_local > -1); - assert (fh_ceph > -1); - dout(10) << "CephFSInterface: local fd is " << fh_local << " and Ceph fd is " << fh_ceph << endl; - - // get the source file size - off_t remaining = st.st_size; - - // copy the file a MB at a time - const int chunk = 1048576; - bufferptr bp(chunk); - - while (remaining > 0) { - off_t got = ::read(fh_local, bp.c_str(), MIN(remaining,chunk)); - assert(got > 0); - remaining -= got; - off_t wrote = client->write(fh_ceph, bp.c_str(), got, -1); - assert (got == wrote); - } - client->close(fh_ceph); - ::close(fh_local); - - env->ReleaseStringUTFChars(j_local_path, c_local_path); - env->ReleaseStringUTFChars(j_ceph_path, c_ceph_path); - - return JNI_TRUE; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_copyToLocalFile - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1copyToLocalFile -(JNIEnv *env, jobject obj, jlong clientp, jstring j_ceph_path, jstring j_local_path) -{ - Client* client; - client = *(Client**)&clientp; - const char* c_ceph_path = env->GetStringUTFChars(j_ceph_path, 0); - const char* c_local_path = env->GetStringUTFChars(j_local_path, 0); - - dout(3) << "CephFSInterface: dout(3): In copyToLocalFile, copying from Ceph file " << c_ceph_path << - " to local file " << c_local_path << endl; - - cout << "CephFSInterface: cout: In copyToLocalFile, copying from Ceph file " << c_ceph_path << - " to local file " << 
c_local_path << endl; - - - // get source file size - struct stat st; - //dout(10) << "Attempting lstat with file " << c_ceph_path << ":" << endl; - int r = client->lstat(c_ceph_path, &st); - assert (r == 0); - - dout(10) << "CephFSInterface: Opening Ceph source file for read: " << endl; - int fh_ceph = client->open(c_ceph_path, O_RDONLY); - assert (fh_ceph > -1); - - dout(10) << "CephFSInterface: Opened Ceph file! Opening local destination file: " << endl; - int fh_local = ::open(c_local_path, O_WRONLY|O_CREAT|O_TRUNC, 0644); - assert (fh_local > -1); - - // copy the file a chunk at a time - const int chunk = 1048576; - bufferptr bp(chunk); - - off_t remaining = st.st_size; - while (remaining > 0) { - off_t got = client->read(fh_ceph, bp.c_str(), MIN(remaining,chunk), -1); - assert(got > 0); - remaining -= got; - off_t wrote = ::write(fh_local, bp.c_str(), got); - assert (got == wrote); - } - client->close(fh_ceph); - ::close(fh_local); - - env->ReleaseStringUTFChars(j_local_path, c_local_path); - env->ReleaseStringUTFChars(j_ceph_path, c_ceph_path); - - return JNI_TRUE; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getcwd - * Signature: (J)Ljava/lang/String; - * Returns the current working directory. - */ -JNIEXPORT jstring JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getcwd - (JNIEnv *env, jobject obj, jlong clientp) -{ - dout(10) << "CephFSInterface: In getcwd" << endl; - - Client* client; - client = *(Client**)&clientp; - - return (env->NewStringUTF(client->getcwd().c_str())); -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_setcwd - * Signature: (JLjava/lang/String;)Z - * - * Changes the working directory. 
- */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1setcwd -(JNIEnv *env, jobject obj, jlong clientp, jstring j_path) -{ - dout(10) << "CephFSInterface: In setcwd" << endl; - - Client* client; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - return (0 <= client->chdir(c_path)) ? JNI_TRUE : JNI_FALSE; - env->ReleaseStringUTFChars(j_path, c_path); -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_rmdir - * Signature: (JLjava/lang/String;)Z - * Removes an empty directory. - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1rmdir - (JNIEnv *env, jobject, jlong clientp, jstring j_path) -{ - dout(10) << "CephFSInterface: In rmdir" << endl; - - Client* client; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - return (0 == client->rmdir(c_path)) ? JNI_TRUE : JNI_FALSE; - env->ReleaseStringUTFChars(j_path, c_path); -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_mkdir - * Signature: (JLjava/lang/String;)Z - * Creates a directory with full permissions. - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1mkdir - (JNIEnv * env, jobject, jlong clientp, jstring j_path) -{ - dout(10) << "CephFSInterface: In mkdir" << endl; - - Client* client; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - return (0 == client->mkdir(c_path, 0xFF)) ? JNI_TRUE : JNI_FALSE; - env->ReleaseStringUTFChars(j_path, c_path); -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_unlink - * Signature: (JLjava/lang/String;)Z - * Unlinks a path. 
- */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1unlink - (JNIEnv * env, jobject, jlong clientp, jstring j_path) -{ - Client* client; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - dout(10) << "CephFSInterface: In unlink for path " << c_path << ":" << endl; - - // is it a file or a directory? - struct stat stbuf; - int stat_result = client->lstat(c_path, &stbuf); - if (stat_result < 0) {// then the path doesn't even exist - dout(0) << "ceph_unlink: path " << c_path << " does not exist" << endl; - return false; - } - int result; - if (0 != S_ISDIR(stbuf.st_mode)) { // it's a directory - dout(10) << "ceph_unlink: path " << c_path << " is a directory. Calling client->rmdir()" << endl; - result = client->rmdir(c_path); - } - else if (0 != S_ISREG(stbuf.st_mode)) { // it's a file - dout(10) << "ceph_unlink: path " << c_path << " is a file. Calling client->unlink()" << endl; - result = client->unlink(c_path); - } - else { - dout(0) << "ceph_unlink: path " << c_path << " is not a file or a directory. Failing:" << endl; - result = -1; - } - - dout(10) << "In ceph_unlink for path " << c_path << - ": got result " - << result << ". Returning..."<< endl; - - env->ReleaseStringUTFChars(j_path, c_path); - return (0 == result) ? JNI_TRUE : JNI_FALSE; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_rename - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - * Renames a file. - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1rename - (JNIEnv *env, jobject, jlong clientp, jstring j_from, jstring j_to) -{ - dout(10) << "CephFSInterface: In rename" << endl; - - Client* client; - client = *(Client**)&clientp; - - const char* c_from = env->GetStringUTFChars(j_from, 0); - const char* c_to = env->GetStringUTFChars(j_to, 0); - - return (0 <= client->rename(c_from, c_to)) ? 
JNI_TRUE : JNI_FALSE; - env->ReleaseStringUTFChars(j_from, c_from); - env->ReleaseStringUTFChars(j_to, c_to); -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_exists - * Signature: (JLjava/lang/String;)Z - * Returns true if the path exists. - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1exists -(JNIEnv *env, jobject, jlong clientp, jstring j_path) -{ - - dout(10) << "CephFSInterface: In exists" << endl; - - Client* client; - struct stat stbuf; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - dout(10) << "Attempting lstat with file " << c_path << ":" ; - int result = client->lstat(c_path, &stbuf); - dout(10) << "result is " << result << endl; - env->ReleaseStringUTFChars(j_path, c_path); - if (result < 0) { - dout(10) << "Returning false (file does not exist)" << endl; - return JNI_FALSE; - } - else { - dout(10) << "Returning true (file exists)" << endl; - return JNI_TRUE; - } -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getblocksize - * Signature: (JLjava/lang/String;)J - * Returns the block size. Size is -1 if the file - * does not exist. 
- * TODO: see if Hadoop wants something more like stripe size - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getblocksize - (JNIEnv *env, jobject obj, jlong clientp, jstring j_path) -{ - dout(10) << "In getblocksize" << endl; - - Client* client; - //struct stat stbuf; - client = *(Client**)&clientp; - - jint result; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - - /* - if (0 > client->lstat(c_path, &stbuf)) - result = -1; - else - result = stbuf.st_blksize; - */ - - // we need to open the file to retrieve the stripe size - dout(10) << "CephFSInterface: getblocksize: opening file" << endl; - int fh = client->open(c_path, O_RDONLY); - if (fh < 0) - return -1; - - result = client->get_stripe_unit(fh); - - int close_result = client->close(fh); - assert (close_result > -1); - - - env->ReleaseStringUTFChars(j_path, c_path); - return result; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getfilesize - * Signature: (JLjava/lang/String;)J - * Returns the file size, or -1 on failure. 
- */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getfilesize - (JNIEnv *env, jobject, jlong clientp, jstring j_path) -{ - dout(10) << "In getfilesize" << endl; - - Client* client; - struct stat stbuf; - client = *(Client**)&clientp; - - jlong result; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - if (0 > client->lstat(c_path, &stbuf)) result = -1; - else result = stbuf.st_size; - env->ReleaseStringUTFChars(j_path, c_path); - - return result; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_isfile - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1isfile - (JNIEnv *env, jobject obj, jlong clientp, jstring j_path) -{ - dout(10) << "In isfile" << endl; - - Client* client; - struct stat stbuf; - client = *(Client**)&clientp; - - - const char* c_path = env->GetStringUTFChars(j_path, 0); - int result = client->lstat(c_path, &stbuf); - - env->ReleaseStringUTFChars(j_path, c_path); - - // if the stat call failed, it's definitely not a file... - if (0 > result) return JNI_FALSE; - - // check the stat result - return (0 == S_ISREG(stbuf.st_mode)) ? JNI_FALSE : JNI_TRUE; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_isdirectory - * Signature: (JLjava/lang/String;)Z - * Returns true if the path is a directory. - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1isdirectory - (JNIEnv *env, jobject, jlong clientp, jstring j_path) -{ - dout(10) << "In isdirectory" << endl; - - Client* client; - struct stat stbuf; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - int result = client->lstat(c_path, &stbuf); - env->ReleaseStringUTFChars(j_path, c_path); - - // if the stat call failed, it's definitely not a directory... - if (0 > result) return JNI_FALSE; - - // check the stat result - return (0 == S_ISDIR(stbuf.st_mode)) ? 
JNI_FALSE : JNI_TRUE; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getdir - * Signature: (JLjava/lang/String;)[Ljava/lang/String; - * Returns a Java array of Strings with the directory contents - */ -JNIEXPORT jobjectArray JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getdir -(JNIEnv *env, jobject obj, jlong clientp, jstring j_path) { - - dout(10) << "In getdir" << endl; - - Client* client; - client = *(Client**)&clientp; - - // get the directory listing - map contents; - const char* c_path = env->GetStringUTFChars(j_path, 0); - int result = client->getdir(c_path, contents); - env->ReleaseStringUTFChars(j_path, c_path); - - if (result < 0) return NULL; - - dout(10) << "checking for empty dir" << endl; - jint dir_size = contents.size(); - - // Hadoop freaks out if the listing contains "." or "..". Shrink - // the listing size by two, or by one if the directory is the root. - if(('/' == c_path[0]) && (0 == c_path[1])) - dir_size -= 1; - else - dir_size -= 2; - assert (dir_size >= 0); - - // Create a Java String array of the size of the directory listing - // jstring blankString = env->NewStringUTF(""); - jclass stringClass = env->FindClass("java/lang/String"); - if (NULL == stringClass) { - dout(0) << "ERROR: java String class not found; dying a horrible, painful death" << endl; - assert(0); - } - jobjectArray dirListingStringArray = (jobjectArray) env->NewObjectArray(dir_size, stringClass, NULL); - - // populate the array with the elements of the directory list, - // omitting . and .. - int i = 0; - string dot("."); - string dotdot (".."); - for (map::iterator it = contents.begin(); - it != contents.end(); - it++) { - // is it "."? - if (it->first == dot) continue; - if (it->first == dotdot) continue; - - if (0 == dir_size) - dout(0) << "CephFSInterface: WARNING: adding stuff to an empty array." 
<< endl; - assert (i < dir_size); - env->SetObjectArrayElement(dirListingStringArray, i, - env->NewStringUTF(it->first.c_str())); - ++i; - } - - return dirListingStringArray; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_open_for_read - * Signature: (JLjava/lang/String;)I - * Open a file for reading. - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1open_1for_1read - (JNIEnv *env, jobject obj, jlong clientp, jstring j_path) - -{ - dout(10) << "In open_for_read" << endl; - - Client* client; - client = *(Client**)&clientp; - - jint result; - - // open as read-only: flag = O_RDONLY - - const char* c_path = env->GetStringUTFChars(j_path, 0); - result = client->open(c_path, O_RDONLY); - env->ReleaseStringUTFChars(j_path, c_path); - - // returns file handle, or -1 on failure - return result; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_open_for_overwrite - * Signature: (JLjava/lang/String;)I - * Opens a file for overwriting; creates it if necessary. - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1open_1for_1overwrite - (JNIEnv *env, jobject obj, jlong clientp, jstring j_path) -{ - dout(10) << "In open_for_overwrite" << endl; - - Client* client; - client = *(Client**)&clientp; - - jint result; - - - const char* c_path = env->GetStringUTFChars(j_path, 0); - result = client->open(c_path, O_WRONLY|O_CREAT|O_TRUNC); - env->ReleaseStringUTFChars(j_path, c_path); - - // returns file handle, or -1 on failure - return result; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_kill_client - * Signature: (J)Z - * - * Closes the Ceph client. 
- */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1kill_1client - (JNIEnv *env, jobject obj, jlong clientp) -{ - Client* client; - client = *(Client**)&clientp; - - client->unmount(); - client->shutdown(); - delete client; - - // wait for messenger to finish - rank.wait(); - - return true; -} - - - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_read - * Signature: (JI[BII)I - * Reads into the given byte array from the current position. - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1read - (JNIEnv *env, jobject obj, jlong clientp, jint fh, jbyteArray j_buffer, jint buffer_offset, jint length) -{ - dout(10) << "In read" << endl; - - - // IMPORTANT NOTE: Hadoop read arguments are a bit different from POSIX so we - // have to convert. The read is *always* from the current position in the file, - // and buffer_offset is the location in the *buffer* where we start writing. - - - Client* client; - client = *(Client**)&clientp; - jint result; - - // Step 1: get a pointer to the buffer. - jbyte* j_buffer_ptr = env->GetByteArrayElements(j_buffer, NULL); - char* c_buffer = (char*) j_buffer_ptr; - - // Step 2: pointer arithmetic to start in the right buffer position - c_buffer += (int)buffer_offset; - - // Step 3: do the read - result = client->read((int)fh, c_buffer, length, -1); - - // Step 4: release the pointer to the buffer - env->ReleaseByteArrayElements(j_buffer, j_buffer_ptr, 0); - - return result; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_seek_from_start - * Signature: (JIJ)J - * Seeks to the given position. 
- */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1seek_1from_1start - (JNIEnv *env, jobject obj, jlong clientp, jint fh, jlong pos) -{ - dout(10) << "In CephInputStream::seek_from_start" << endl; - - Client* client; - client = *(Client**)&clientp; - jint result; - - result = client->lseek(fh, pos, SEEK_SET); - - return result; -} - -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1getpos - (JNIEnv *env, jobject obj, jlong clientp, jint fh) -{ - dout(10) << "In CephInputStream::ceph_getpos" << endl; - - Client* client; - client = *(Client**)&clientp; - jint result; - - // seek a distance of 0 to get current offset - result = client->lseek(fh, 0, SEEK_CUR); - - return result; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_close - * Signature: (JI)I - * Closes the file. - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1close - (JNIEnv *env, jobject obj, jlong clientp, jint fh) -{ - dout(10) << "In CephInputStream::ceph_close" << endl; - - Client* client; - client = *(Client**)&clientp; - jint result; - - result = client->close(fh); - - return result; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_seek_from_start - * Signature: (JIJ)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1seek_1from_1start - (JNIEnv *env, jobject obj, jlong clientp, jint fh, jlong pos) -{ - dout(10) << "In CephOutputStream::ceph_seek_from_start" << endl; - - Client* client; - client = *(Client**)&clientp; - jint result; - - result = client->lseek(fh, pos, SEEK_SET); - - return result; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_getpos - * Signature: (JI)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1getpos - (JNIEnv *env, jobject obj, jlong clientp, jint fh) -{ - dout(10) << "In CephOutputStream::ceph_getpos" << endl; - - 
Client* client; - client = *(Client**)&clientp; - jint result; - - // seek a distance of 0 to get current offset - result = client->lseek(fh, 0, SEEK_CUR); - - return result; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_close - * Signature: (JI)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1close - (JNIEnv *env, jobject obj, jlong clientp, jint fh) -{ - dout(10) << "In CephOutputStream::ceph_close" << endl; - - Client* client; - client = *(Client**)&clientp; - jint result; - - result = client->close(fh); - - return result; -} - - - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_write - * Signature: (JI[BII)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1write - (JNIEnv *env, jobject obj, jlong clientp, jint fh, jbyteArray j_buffer, jint buffer_offset, jint length) -{ - dout(10) << "In write" << endl; - - // IMPORTANT NOTE: Hadoop write arguments are a bit different from POSIX so we - // have to convert. The write is *always* from the current position in the file, - // and buffer_offset is the location in the *buffer* where we start writing. - - Client* client; - client = *(Client**)&clientp; - jint result; - - // Step 1: get a pointer to the buffer. 
- jbyte* j_buffer_ptr = env->GetByteArrayElements(j_buffer, NULL); - char* c_buffer = (char*) j_buffer_ptr; - - // Step 2: pointer arithmetic to start in the right buffer position - c_buffer += (int)buffer_offset; - - // Step 3: do the write - result = client->write((int)fh, c_buffer, length, -1); - - // Step 4: release the pointer to the buffer - env->ReleaseByteArrayElements(j_buffer, j_buffer_ptr, 0); - - return result; -} - diff --git a/branches/sage/ebofs2/client/hadoop/CephFSInterface.h b/branches/sage/ebofs2/client/hadoop/CephFSInterface.h deleted file mode 100644 index 549925aba6e64..0000000000000 --- a/branches/sage/ebofs2/client/hadoop/CephFSInterface.h +++ /dev/null @@ -1,239 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* DO NOT EDIT THIS FILE - it is machine generated */ -#include -/* Header for class org_apache_hadoop_fs_ceph_CephFileSystem */ - -#include -#include "client/Client.h" -#include "config.h" -#include "client/fuse.h" -#include "msg/SimpleMessenger.h" -#include "common/Timer.h" - -#ifndef _Included_org_apache_hadoop_fs_ceph_CephFileSystem -#define _Included_org_apache_hadoop_fs_ceph_CephFileSystem -#ifdef __cplusplus -extern "C" { -#endif - -#undef org_apache_hadoop_fs_ceph_CephFileSystem_DEFAULT_BLOCK_SIZE -#define org_apache_hadoop_fs_ceph_CephFileSystem_DEFAULT_BLOCK_SIZE 1048576LL -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_initializeClient - * Signature: ()J - * Initializes a ceph client. 
- */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1initializeClient -(JNIEnv *, jobject); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_copyFromLocalFile - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1copyFromLocalFile - (JNIEnv *, jobject, jlong, jstring, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_copyToLocalFile - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1copyToLocalFile - (JNIEnv *, jobject, jlong, jstring, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getcwd - * Signature: (J)Ljava/lang/String; - */ -JNIEXPORT jstring JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getcwd - (JNIEnv *, jobject, jlong); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_setcwd - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1setcwd - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_rmdir - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1rmdir - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_mkdir - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1mkdir - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_unlink - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1unlink - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: 
org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_rename - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1rename - (JNIEnv *, jobject, jlong, jstring, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_exists - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1exists - (JNIEnv *, jobject, jlong, jstring); - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getblocksize - * Signature: (JLjava/lang/String;)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getblocksize - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getfilesize - * Signature: (JLjava/lang/String;)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getfilesize - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_isdirectory - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1isdirectory - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_isfile - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1isfile - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getdir - * Signature: (JLjava/lang/String;)[Ljava/lang/String; - */ -JNIEXPORT jobjectArray JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getdir - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_open_for_read - * Signature: (JLjava/lang/String;)I - */ -JNIEXPORT jint JNICALL 
Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1open_1for_1read - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_open_for_overwrite - * Signature: (JLjava/lang/String;)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1open_1for_1overwrite - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_kill_client - * Signature: (J)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1kill_1client - (JNIEnv *, jobject, jlong); - -#undef org_apache_hadoop_fs_ceph_CephInputStream_SKIP_BUFFER_SIZE -#define org_apache_hadoop_fs_ceph_CephInputStream_SKIP_BUFFER_SIZE 2048L -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_read - * Signature: (JI[BII)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1read - (JNIEnv *, jobject, jlong, jint, jbyteArray, jint, jint); - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_seek_from_start - * Signature: (JIJ)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1seek_1from_1start - (JNIEnv *, jobject, jlong, jint, jlong); - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_getpos - * Signature: (JI)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1getpos - (JNIEnv *, jobject, jlong, jint); - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_close - * Signature: (JI)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1close - (JNIEnv *, jobject, jlong, jint); - -/* Header for class org_apache_hadoop_fs_ceph_CephOutputStream */ - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_seek_from_start - * Signature: (JIJ)J - */ -JNIEXPORT jlong JNICALL 
Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1seek_1from_1start - (JNIEnv *, jobject, jlong, jint, jlong); - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_getpos - * Signature: (JI)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1getpos - (JNIEnv *, jobject, jlong, jint); - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_close - * Signature: (JI)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1close - (JNIEnv *, jobject, jlong, jint); - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_write - * Signature: (JI[BII)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1write - (JNIEnv *, jobject, jlong, jint, jbyteArray, jint, jint); - -#ifdef __cplusplus -} -#endif -#endif diff --git a/branches/sage/ebofs2/client/ldceph.cc b/branches/sage/ebofs2/client/ldceph.cc deleted file mode 100644 index b17133ee1e6f2..0000000000000 --- a/branches/sage/ebofs2/client/ldceph.cc +++ /dev/null @@ -1,298 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include -using namespace std; - -// ceph stuff -#include "config.h" -#include "client/Client.h" -#include "msg/SimpleMessenger.h" - -// syscall fun -#include -#include -#include -//#include - -#define _FCNTL_H -#include - -#define CEPH_FD_OFF 50000 - - -/****** startup etc *******/ - -class LdCeph { -public: - // globals - bool started; - char *mount_point; - char *mount_point_parent; - int mount_point_len; - - Client *client; - - filepath fp_mount_point; - filepath cwd; - bool cwd_above_mp, cwd_in_mp; - - const char *get_ceph_path(const char *orig, char *buf) { - if (!started) return 0; - - // relative path? BUG: this won't catch "blah/../../asdf" - if (orig[0] && - orig[0] != '/' && - !(orig[0] == '.' && orig[1] == '.')) { - - if (cwd_in_mp) return orig; // inside mount point, definitely ceph - if (!cwd_above_mp) return 0; // not above mount point, definitely not ceph - - // relative, above mp. - filepath o = orig; - filepath p = cwd; - for (unsigned b = 0; b < o.depth(); b++) { - if (o[b] == "..") - p.pop_dentry(); - else - p.add_dentry(o[b]); - } - - // FIXME rewrite - if (strncmp(p.c_str(), mount_point, mount_point_len) == 0) { - if (p.c_str()[mount_point_len] == 0) - return "/"; - if (p.c_str()[mount_point_len] == '/') { - strcpy(buf, p.c_str() + mount_point_len); - return buf; - } - } - return 0; - } else { - // absolute - if (strncmp(orig, mount_point, mount_point_len) == 0) { - if (orig[mount_point_len] == 0) - return "/"; - if (orig[mount_point_len] == '/') - return orig + mount_point_len; - } - return 0; - } - } - - void refresh_cwd() { - char buf[255]; - syscall(SYS_getcwd, buf, 255); - cwd = buf; - - if (strncmp(buf, mount_point, mount_point_len) == 0 && - (buf[mount_point_len] == 0 || - buf[mount_point_len] == '/')) - cwd_in_mp = true; - else { - if (cwd.depth() > fp_mount_point.depth()) - cwd_above_mp = false; - else { - cwd_above_mp = true; - for (unsigned i=0; iget_myaddr() << endl; - - refresh_cwd(); - } - } - ~LdCeph() { - cout 
<< "ldceph fini" << endl; - if (false && client) { - client->unmount(); - client->shutdown(); - delete client; - client = 0; - tcpmessenger_wait(); - tcpmessenger_shutdown(); - } - } - -} ldceph; - - - -/****** original functions ****/ - - - -/****** captured functions ****/ - - -#define MYFD(f) ((fd) > CEPH_FD_OFF && ldceph.started) -#define TO_FD(fd) (fd > 0 ? fd+CEPH_FD_OFF:fd) -#define FROM_FD(fd) (fd - CEPH_FD_OFF) - -extern "C" { - - // open/close - //int open(const char *pathname, int flags) { - int open(const char *pathname, int flags, mode_t mode) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return TO_FD(ldceph.client->open(c, flags)); - else - return syscall(SYS_open, pathname, flags, mode); - } - - int creat(const char *pathname, mode_t mode) { - return open(pathname, O_CREAT|O_WRONLY|O_TRUNC, mode); - } - int close(int fd) { - if (MYFD(fd)) - return ldceph.client->close(FROM_FD(fd)); - else - return syscall(SYS_close, fd); - } - - - // read/write - ssize_t write(int fd, const void *buf, size_t count) { - if (MYFD(fd)) - return ldceph.client->write(FROM_FD(fd), (char*)buf, count); - else - return syscall(SYS_write, fd, buf, count); - } - - ssize_t read(int fd, void *buf, size_t count) { - if (MYFD(fd)) - return ldceph.client->read(FROM_FD(fd), (char*)buf, count); - else - return syscall(SYS_read, fd, buf, count); - } - - //int fsync(int fd); - //int fdatasync(int fd); - - - // namespace - int rmdir(const char *pathname) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return ldceph.client->rmdir(c); - else - return syscall(SYS_rmdir, pathname); - } - int mkdir(const char *pathname, mode_t mode) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return ldceph.client->mkdir(c, mode); - else - return syscall(SYS_mkdir, pathname, mode); - } - int unlink(const char *pathname) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return 
ldceph.client->unlink(c); - else - return syscall(SYS_unlink, pathname); - } - - int stat(const char *pathname, struct stat *st) { - //int __xstat64(int __ver, const char *pathname, struct stat64 *st64) { // stoopid GLIBC - //struct stat *st = (struct stat*)st64; - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return ldceph.client->lstat(c, st); // FIXME - else - return syscall(SYS_stat, pathname, st); - } - //int fstat(int filedes, struct stat *buf); - //int lstat(const char *file_name, struct stat *buf); - - int chdir(const char *pathname) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) { - int r = ldceph.client->chdir(c); - if (r == 0) { - if (!ldceph.cwd_in_mp) - syscall(SYS_chdir, ldceph.mount_point_parent); - ldceph.cwd_in_mp = true; - ldceph.cwd_above_mp = false; - ldceph.cwd = ldceph.mount_point; - filepath fpc = c; - ldceph.cwd.append(fpc); - } - return r; - } else { - int r = syscall(SYS_chdir, pathname); - if (r) { - ldceph.refresh_cwd(); - } - return r; - } - } - char *getcwd(char *buf, size_t size) { - strncpy(buf, ldceph.cwd.c_str(), size); - return buf; - } - //int fchdir(int fd); - - - - -} diff --git a/branches/sage/ebofs2/cmds.cc b/branches/sage/ebofs2/cmds.cc deleted file mode 100644 index 6e475ad4b588d..0000000000000 --- a/branches/sage/ebofs2/cmds.cc +++ /dev/null @@ -1,108 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include -#include - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" -#include "mds/MDS.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << std::endl; - exit(1); - } -}; - -class C_Debug : public Context { - public: - void finish(int) { - int size = &g_conf.debug_after - &g_conf.debug; - memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size); - generic_dout(0) << "debug_after flipping debug settings" << dendl; - } -}; - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - parse_config_options(args); - - if (g_conf.kill_after) - g_timer.add_event_after(g_conf.kill_after, new C_Die); - if (g_conf.debug_after) - g_timer.add_event_after(g_conf.debug_after, new C_Debug); - - // mds specific args - int whoami = -1; - bool standby = false; // by default, i'll start active. - for (unsigned i=0; i= 0); - - // start up network - rank.start_rank(); - - // start mds - Messenger *m = rank.register_entity(entity_name_t::MDS(whoami)); - assert(m); - - MDS *mds = new MDS(whoami, m, &monmap); - mds->init(standby); - - // wait - rank.wait(); - - // yuck: grab the mds lock, so we can be sure that whoever in *mds - // called shutdown finishes what they were doing. 
- mds->mds_lock.Lock(); - mds->mds_lock.Unlock(); - - // done - //delete mds; - - return 0; -} - diff --git a/branches/sage/ebofs2/cmonctl.cc b/branches/sage/ebofs2/cmonctl.cc deleted file mode 100644 index 85f4e1dc49392..0000000000000 --- a/branches/sage/ebofs2/cmonctl.cc +++ /dev/null @@ -1,92 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" -#include "msg/SimpleMessenger.h" -#include "messages/MMonCommand.h" -#include "messages/MMonCommandAck.h" - -#include "common/Timer.h" - -#ifndef DARWIN -#include -#endif // DARWIN - -#include -#include -#include - - -Messenger *messenger = 0; - -class Admin : public Dispatcher { - void dispatch(Message *m) { - switch (m->get_type()) { - case MSG_MON_COMMAND_ACK: - generic_dout(0) << m->get_source() << " -> '" - << ((MMonCommandAck*)m)->rs << "' (" << ((MMonCommandAck*)m)->r << ")" - << dendl; - messenger->shutdown(); - break; - } - } -} dispatcher; - -int main(int argc, char **argv, char *envp[]) { - - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - // args for fuse - vec_to_argv(args, argc, argv); - - // load monmap - MonMap monmap; - int r = monmap.read(".ceph_monmap"); - assert(r >= 0); - - // start up network - rank.start_rank(); - messenger = rank.register_entity(entity_name_t::ADMIN()); - messenger->set_dispatcher(&dispatcher); - - // build command - MMonCommand *m = new MMonCommand(messenger->get_myinst()); - string cmd; - for (unsigned i=0; icmd.push_back(string(args[i])); - } - int mon = monmap.pick_mon(); - - 
generic_dout(0) << "mon" << mon << " <- '" << cmd << "'" << dendl; - - // send it - messenger->send_message(m, monmap.get_inst(mon)); - - // wait for messenger to finish - rank.wait(); - - return 0; -} - diff --git a/branches/sage/ebofs2/common/Clock.cc b/branches/sage/ebofs2/common/Clock.cc deleted file mode 100644 index 8b07f6d9eb15f..0000000000000 --- a/branches/sage/ebofs2/common/Clock.cc +++ /dev/null @@ -1,20 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "Clock.h" - -// public -Clock g_clock; - diff --git a/branches/sage/ebofs2/common/Clock.h b/branches/sage/ebofs2/common/Clock.h deleted file mode 100644 index 1ea7227adebd4..0000000000000 --- a/branches/sage/ebofs2/common/Clock.h +++ /dev/null @@ -1,104 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __CLOCK_H -#define __CLOCK_H - -#include -#include - -#include -#include - -#include "Mutex.h" - -#include "include/utime.h" - - - -// -- clock -- -class Clock { - protected: - //utime_t start_offset; - //utime_t abs_last; - utime_t last; - utime_t zero; - - Mutex lock; - - public: - Clock() { - // set offset - //tare(); - } - - // real time. 
- utime_t real_now() { - utime_t realnow = now(); - realnow += zero; - //gettimeofday(&realnow.timeval(), NULL); - return realnow; - } - - // relative time (from startup) - void tare() { - gettimeofday(&zero.timeval(), NULL); - } - void tare(utime_t z) { - zero = z; - } - utime_t now() { - //lock.Lock(); - utime_t n; - gettimeofday(&n.timeval(), NULL); - n -= zero; - if (n < last) { - //std::cerr << "WARNING: clock jumped backwards from " << last << " to " << n << std::endl; - n = last; // clock jumped backwards! - } else - last = n; - //lock.Unlock(); - return n; - } - utime_t recent_now() { - return last; - } - - void realify(utime_t& t) { - t += zero; - } - - void make_timespec(utime_t& t, struct timespec *ts) { - utime_t real = t; - realify(real); - - memset(ts, 0, sizeof(*ts)); - ts->tv_sec = real.sec(); - ts->tv_nsec = real.nsec(); - } - - - - // absolute time - time_t gettime() { - return real_now().sec(); - } - -}; - -extern Clock g_clock; - -#endif diff --git a/branches/sage/ebofs2/common/Cond.h b/branches/sage/ebofs2/common/Cond.h deleted file mode 100644 index 4cb3d721b423f..0000000000000 --- a/branches/sage/ebofs2/common/Cond.h +++ /dev/null @@ -1,119 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __COND_H -#define __COND_H - -#include - -#include "Mutex.h" -#include "Clock.h" - -#include "include/Context.h" - -#include -#include - -class Cond { - // my bits - pthread_cond_t _c; - - // don't allow copying. 
- void operator=(Cond &C) {} - Cond( const Cond &C ) {} - - public: - Cond() { - int r = pthread_cond_init(&_c,NULL); - assert(r == 0); - } - virtual ~Cond() { - pthread_cond_destroy(&_c); - } - - int Wait(Mutex &mutex) { - int r = pthread_cond_wait(&_c, &mutex._m); - return r; - } - - int Wait(Mutex &mutex, char* s) { - //cout << "Wait: " << s << endl; - int r = pthread_cond_wait(&_c, &mutex._m); - return r; - } - - int WaitUntil(Mutex &mutex, utime_t when) { - struct timespec ts; - g_clock.make_timespec(when, &ts); - //cout << "timedwait for " << ts.tv_sec << " sec " << ts.tv_nsec << " nsec" << endl; - int r = pthread_cond_timedwait(&_c, &mutex._m, &ts); - return r; - } - int WaitInterval(Mutex &mutex, utime_t interval) { - utime_t when = g_clock.now(); - when += interval; - return WaitUntil(mutex, when); - } - - int Signal() { - //int r = pthread_cond_signal(&_c); - int r = pthread_cond_broadcast(&_c); - return r; - } - int SignalOne() { - int r = pthread_cond_signal(&_c); - return r; - } - int SignalAll() { - //int r = pthread_cond_signal(&_c); - int r = pthread_cond_broadcast(&_c); - return r; - } -}; - -class C_Cond : public Context { - Cond *cond; - bool *done; - int *rval; -public: - C_Cond(Cond *c, bool *d, int *r=0) : cond(c), done(d), rval(r) { - *done = false; - } - void finish(int r) { - if (rval) *rval = r; - *done = true; - cond->Signal(); - } -}; - -class C_SafeCond : public Context { - Mutex *lock; - Cond *cond; - bool *done; - int *rval; -public: - C_SafeCond(Mutex *l, Cond *c, bool *d, int *r=0) : lock(l), cond(c), done(d), rval(r) { - *done = false; - } - void finish(int r) { - lock->Lock(); - if (rval) *rval = r; - *done = true; - cond->Signal(); - lock->Unlock(); - } -}; - -#endif diff --git a/branches/sage/ebofs2/common/DecayCounter.h b/branches/sage/ebofs2/common/DecayCounter.h deleted file mode 100644 index f431fb2073cd7..0000000000000 --- a/branches/sage/ebofs2/common/DecayCounter.h +++ /dev/null @@ -1,138 +0,0 @@ -// -*- mode:C++; 
tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __DECAYCOUNTER_H -#define __DECAYCOUNTER_H - -#include -#include "Clock.h" - -#include "config.h" - -/** - * - * TODO: normalize value based on some fucntion of half_life, - * so that it can be interpreted as an approximation of a - * moving average of N seconds. currently, changing half-life - * skews the scale of the value, even at steady state. - * - */ - -class DecayCounter { - protected: -public: - double half_life; - double k; // k = ln(.5)/half_life - double val; // value - double delta; // delta since last decay - double vel; // recent velocity - utime_t last_decay; // time of last decay - - public: - DecayCounter() : val(0), delta(0), vel(0) { - set_halflife( g_conf.mds_decay_halflife ); - reset(); - } - DecayCounter(double hl) : val(0), delta(0), vel(0) { - set_halflife( hl ); - reset(); - } - - /** - * reading - */ - - double get() { - return get(g_clock.now()); - } - - double get(utime_t now) { - decay(now); - return val; - } - - double get_last() { - return val; - } - - double get_last_vel() { - return vel; - } - - utime_t get_last_decay() { - return last_decay; - } - - /** - * adjusting - */ - - double hit(utime_t now, double v = 1.0) { - decay(now); - delta += v; - return val+delta; - } - - void adjust(double a) { - val += a; - } - void adjust(utime_t now, double a) { - decay(now); - val += a; - } - void scale(double f) { - val *= f; - delta *= f; - vel *= f; - } - - /** - * decay etc. 
- */ - - void set_halflife(double hl) { - half_life = hl; - k = log(.5) / hl; - } - - void reset() { - reset(g_clock.now()); - } - void reset(utime_t now) { - last_decay = g_clock.now(); - val = delta = 0; - } - - void decay(utime_t now) { - utime_t el = now; - el -= last_decay; - - if (el.sec() >= 1) { - // calculate new value - double newval = (val+delta) * exp((double)el * k); - if (newval < .01) newval = 0.0; - - // calculate velocity approx - vel += (newval - val) * (double)el; - vel *= exp((double)el * k); - - val = newval; - delta = 0; - last_decay = now; - } - } -}; - - -#endif diff --git a/branches/sage/ebofs2/common/LogType.h b/branches/sage/ebofs2/common/LogType.h deleted file mode 100644 index a0889545acb6a..0000000000000 --- a/branches/sage/ebofs2/common/LogType.h +++ /dev/null @@ -1,122 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __LOGTYPE_H -#define __LOGTYPE_H - -#include "include/types.h" - -#include -#include -using std::string; -using std::ofstream; - -#include -#include -using __gnu_cxx::hash_map; -using __gnu_cxx::hash_set; - -#include "Mutex.h" - - -class LogType { - protected: - hash_map keymap; - vector keys; - set inc_keys; - vector avg; - - int version; - - // HACK to avoid the hash table as often as possible... 
- // cache recent key name lookups in a small ring buffer - const static int cache_keys = 10; - intptr_t kc_ptr[cache_keys]; - int kc_val[cache_keys]; - int kc_pos; - - friend class Logger; - - public: - LogType() { - version = 1; - - for (int i=0;i= 0) return i; - - i = keys.size(); - keys.push_back(key); - avg.push_back(false); - - intptr_t p = (intptr_t)key; - keymap[p] = i; - if (is_inc) inc_keys.insert(i); - - version++; - return i; - } - int add_inc(const char* key) { - return add_key(key, true); - } - int add_set(const char *key) { - return add_key(key, false); - } - int add_avg(const char *key) { - int i = add_key(key, true); - avg[i] = true; - return i; - } - - bool have_key(const char* key) { - return lookup_key(key) < 0; - } - - int lookup_key(const char* key) { - intptr_t p = (intptr_t)key; - - if (keymap.count(p)) - return keymap[p]; - - // try kc ringbuffer - int pos = kc_pos-1; - for (int j=0; j - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include - -#include "LogType.h" -#include "Logger.h" - -#include -#include "Clock.h" - -#include "config.h" - -#include -#include - -#include "common/Timer.h" - -// per-process lock. lame, but this way I protect LogType too! 
-Mutex logger_lock; -SafeTimer logger_timer(logger_lock); -Context *logger_event = 0; -list logger_list; -utime_t start; -int last_flush; // in seconds since start - -static void flush_all_loggers(); - -class C_FlushLoggers : public Context { -public: - void finish(int r) { - if (logger_event == this) { - logger_event = 0; - flush_all_loggers(); - } - } -}; - -void Logger::set_start(utime_t s) -{ - logger_lock.Lock(); - - start = s; - - utime_t fromstart = g_clock.now(); - if (fromstart < start) { - cerr << "set_start: logger time jumped backwards from " << start << " to " << fromstart << std::endl; - fromstart = start; - } - fromstart -= start; - last_flush = fromstart.sec(); - - logger_lock.Unlock(); -} - -static void flush_all_loggers() -{ - generic_dout(20) << "flush_all_loggers" << dendl; - - utime_t now = g_clock.now(); - utime_t fromstart = now; - if (fromstart < start) { - cerr << "logger time jumped backwards from " << start << " to " << fromstart << std::endl; - //assert(0); - start = fromstart; - } - fromstart -= start; - int now_sec = fromstart.sec(); - - // do any catching up we need to - while (now_sec - last_flush >= g_conf.log_interval) { - generic_dout(20) << "fromstart " << fromstart << " last_flush " << last_flush << " flushign" << dendl; - for (list::iterator p = logger_list.begin(); - p != logger_list.end(); - ++p) - (*p)->_flush(); - last_flush += g_conf.log_interval; - } - - // schedule next flush event - utime_t next; - next.sec_ref() = start.sec() + last_flush + g_conf.log_interval; - next.usec_ref() = start.usec(); - generic_dout(20) << "logger now=" << now - << " start=" << start - << " next=" << next - << dendl; - logger_event = new C_FlushLoggers; - logger_timer.add_event_at(next, logger_event); -} - - - -// --------- - -Logger::Logger(string fn, LogType *type, bool append) -{ - logger_lock.Lock(); - { - filename = ""; - if (g_conf.use_abspaths) { - char *cwd = get_current_dir_name(); - filename = cwd; - free(cwd); - filename += "/"; - 
} - - filename = "log/"; - if (g_conf.log_name) { - filename += g_conf.log_name; - ::mkdir( filename.c_str(), 0755 ); // make sure dir exists - filename += "/"; - } - filename += fn; - - if (append) - out.open(filename.c_str(), ofstream::out|ofstream::app); - else - out.open(filename.c_str(), ofstream::out); - - this->type = type; - wrote_header = -1; - wrote_header_last = 0; - - version = 0; - - if (logger_list.empty()) { - // init logger - if (!g_conf.clock_tare) - start = g_clock.now(); // time 0! otherwise g_clock does it for us. - - last_flush = 0; - - // call manually the first time; then it'll schedule itself. - flush_all_loggers(); - } - logger_list.push_back(this); - } - logger_lock.Unlock(); -} - -Logger::~Logger() -{ - logger_lock.Lock(); - { - _flush(); - out.close(); - logger_list.remove(this); // slow, but rare. - if (logger_list.empty()) - logger_event = 0; // stop the timer events. - } - logger_lock.Unlock(); -} - - -/* -void Logger::flush() -{ - logger_lock.Lock(); - _flush(); - logger_lock.Unlock(); -} -*/ - -void Logger::_flush() -{ - // header? 
- wrote_header_last++; - if (wrote_header != type->version || - wrote_header_last > 10) { - out << "#" << type->keymap.size(); - for (unsigned i=0; ikeys.size(); i++) { - out << "\t" << type->keys[i]; - if (type->avg[i]) - out << "\t" << type->keys[i] << "*\t" << type->keys[i] << "~"; - } - out << std::endl; //out << "\t (" << type->keymap.size() << ")" << endl; - wrote_header = type->version; - wrote_header_last = 0; - } - - maybe_resize(type->keys.size()); - - // write line to log - out << last_flush; - for (unsigned i=0; ikeys.size(); i++) { - if (type->avg[i]) { - if (vals[i] > 0) { - double avg = (fvals[i] / (double)vals[i]); - double var = 0.0; - if (g_conf.logger_calc_variance) { - int n = vals[i]; - for (vector::iterator p = vals_to_avg[i].begin(); n--; ++p) - var += (avg - *p) * (avg - *p); - } - out << "\t" << avg << "\t" << vals[i] << "\t" << var; - } else - out << "\t0\t0\t0"; - } else { - if (fvals[i] > 0 && vals[i] == 0) - out << "\t" << fvals[i]; - else { - //cout << this << " p " << i << " and size is " << vals.size() << std::endl; - out << "\t" << vals[i]; - } - } - } - out << std::endl; - - // reset the counters - for (unsigned i=0; ikeys.size(); i++) { - if (type->inc_keys.count(i)) { - this->vals[i] = 0; - this->fvals[i] = 0; - } - } -} - - - -long Logger::inc(const char *key, long v) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - if (i < 0) i = type->add_inc(key); - maybe_resize(i+1); - - vals[i] += v; - long r = vals[i]; - logger_lock.Unlock(); - return r; -} - -double Logger::finc(const char *key, double v) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - if (i < 0) i = type->add_inc(key); - maybe_resize(i+1); - - fvals[i] += v; - double r = fvals[i]; - logger_lock.Unlock(); - return r; -} - -long Logger::set(const char *key, long v) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - if (i < 0) i = type->add_set(key); - 
maybe_resize(i+1); - - //cout << this << " set " << i << " to " << v << std::endl; - long r = vals[i] = v; - logger_lock.Unlock(); - return r; -} - - -double Logger::fset(const char *key, double v) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - if (i < 0) i = type->add_set(key); - maybe_resize(i+1); - - //cout << this << " fset " << i << " to " << v << std::endl; - double r = fvals[i] = v; - logger_lock.Unlock(); - return r; -} - -double Logger::favg(const char *key, double v) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - if (i < 0) i = type->add_avg(key); - maybe_resize(i+1); - - vals[i]++; - double r = fvals[i] = v; - if (g_conf.logger_calc_variance) - vals_to_avg[i].push_back(v); - logger_lock.Unlock(); - return r; -} - -long Logger::get(const char* key) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - maybe_resize(i+1); - - long r = 0; - if (i >= 0 && i < (int)vals.size()) - r = vals[i]; - logger_lock.Unlock(); - return r; -} - diff --git a/branches/sage/ebofs2/common/Logger.h b/branches/sage/ebofs2/common/Logger.h deleted file mode 100644 index 70fc1fa978024..0000000000000 --- a/branches/sage/ebofs2/common/Logger.h +++ /dev/null @@ -1,77 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __LOGGER_H -#define __LOGGER_H - -#include "include/types.h" -#include "Clock.h" - -#include -#include -#include -using std::vector; -using std::string; -using std::ofstream; - -#include "LogType.h" - - -class Logger { - protected: - // values for this instance - vector vals; - vector fvals; - vector< vector > vals_to_avg; - - void maybe_resize(unsigned s) { - while (s >= vals.size()) { - vals.push_back(0); - fvals.push_back(0.0); - vals_to_avg.push_back(vector()); - } - } - - // my type - LogType *type; - int version; - - string filename; - ofstream out; - - // what i've written - //int last_logged; - int wrote_header; - int wrote_header_last; - - public: - Logger(string fn, LogType *type, bool append=false); - ~Logger(); - - long inc(const char *s, long v = 1); - long set(const char *s, long v); - long get(const char *s); - - double fset(const char *s, double v); - double finc(const char *s, double v); - double favg(const char *s, double v); - - //void flush(); - void _flush(); - - void set_start(utime_t s); -}; - -#endif diff --git a/branches/sage/ebofs2/common/Mutex.h b/branches/sage/ebofs2/common/Mutex.h deleted file mode 100755 index 724c4dbed2a76..0000000000000 --- a/branches/sage/ebofs2/common/Mutex.h +++ /dev/null @@ -1,83 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MUTEX_H -#define __MUTEX_H - -#include -#include - -class Mutex { -private: - pthread_mutex_t _m; - int nlock; - bool recursive; - - // don't allow copying. 
- void operator=(Mutex &M) {} - Mutex( const Mutex &M ) {} - -public: - Mutex(bool r = true) : nlock(0), recursive(r) { - if (recursive) { - pthread_mutexattr_t attr; - pthread_mutexattr_init(&attr); - pthread_mutexattr_settype(&attr,PTHREAD_MUTEX_RECURSIVE); - pthread_mutex_init(&_m,&attr); - pthread_mutexattr_destroy(&attr); - } else { - pthread_mutex_init(&_m,NULL); - } - } - virtual ~Mutex() { - assert(nlock == 0); - pthread_mutex_destroy(&_m); - } - - bool is_locked() { - return (nlock > 0); - } - - void Lock() { - int r = pthread_mutex_lock(&_m); - assert(r == 0); - nlock++; - assert(nlock == 1 || recursive); - } - - void Unlock() { - assert(nlock > 0); - --nlock; - int r = pthread_mutex_unlock(&_m); - assert(r == 0); - } - - friend class Cond; - - -public: - class Locker { - Mutex &mutex; - - public: - Locker(Mutex& m) : mutex(m) { - mutex.Lock(); - } - ~Locker() { - mutex.Unlock(); - } - }; -}; - -#endif diff --git a/branches/sage/ebofs2/common/RWLock.h b/branches/sage/ebofs2/common/RWLock.h deleted file mode 100644 index 14e158a64ab97..0000000000000 --- a/branches/sage/ebofs2/common/RWLock.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef _RWLock_Posix_ -#define _RWLock_Posix_ - -#include - -class RWLock -{ - mutable pthread_rwlock_t L; - - public: - - RWLock() { - pthread_rwlock_init(&L, NULL); - } - - virtual ~RWLock() { - pthread_rwlock_unlock(&L); - pthread_rwlock_destroy(&L); - } - - void unlock() { - pthread_rwlock_unlock(&L); - } - void get_read() { - pthread_rwlock_rdlock(&L); - } - void put_read() { unlock(); } - void get_write() { - pthread_rwlock_wrlock(&L); - } - void put_write() { unlock(); } -}; - -#endif // !_Mutex_Posix_ diff --git a/branches/sage/ebofs2/common/Semaphore.h b/branches/sage/ebofs2/common/Semaphore.h deleted file mode 100644 index bc0a9e60d7ffa..0000000000000 --- a/branches/sage/ebofs2/common/Semaphore.h +++ /dev/null @@ -1,53 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef _Sem_Posix_ -#define _Sem_Posix_ - -#include - -class Semaphore -{ - Mutex m; - Cond c; - int count; - - public: - - Semaphore() - { - count = 0; - } - - void Put() - { - m.Lock(); - count++; - c.Signal(); - m.Unlock(); - } - - void Get() - { - m.Lock(); - while(count <= 0) { - c.Wait(m); - } - count--; - m.Unlock(); - } -}; - -#endif // !_Mutex_Posix_ diff --git a/branches/sage/ebofs2/common/Thread.h b/branches/sage/ebofs2/common/Thread.h deleted file mode 100644 index 06e20047da57f..0000000000000 --- a/branches/sage/ebofs2/common/Thread.h +++ /dev/null @@ -1,81 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __THREAD_H -#define __THREAD_H - -#include -#include -#include - -class Thread { - private: - pthread_t thread_id; - - public: - Thread() : thread_id(0) {} - virtual ~Thread() {} - - protected: - virtual void *entry() = 0; - - private: - static void *_entry_func(void *arg) { - return ((Thread*)arg)->entry(); - } - - public: - pthread_t &get_thread_id() { return thread_id; } - bool is_started() { return thread_id != 0; } - bool am_self() { return (pthread_self() == thread_id); } - - int kill(int signal) { - return pthread_kill(thread_id, signal); - } - int create() { - return pthread_create( &thread_id, NULL, _entry_func, (void*)this ); - } - int join(void **prval = 0) { - if (thread_id == 0) { - generic_derr(0) << "WARNING: join on thread that was never started" << dendl; - //assert(0); - return -EINVAL; // never started. 
- } - - int status = pthread_join(thread_id, prval); - if (status != 0) { - switch (status) { - case -EINVAL: - generic_derr(0) << "thread " << thread_id << " join status = EINVAL" << dendl; - break; - case -ESRCH: - generic_derr(0) << "thread " << thread_id << " join status = ESRCH" << dendl; - assert(0); - break; - case -EDEADLK: - generic_derr(0) << "thread " << thread_id << " join status = EDEADLK" << dendl; - break; - default: - generic_derr(0) << "thread " << thread_id << " join status = " << status << dendl; - } - assert(0); // none of these should happen. - } - thread_id = 0; - return status; - } - -}; - -#endif diff --git a/branches/sage/ebofs2/common/ThreadPool.h b/branches/sage/ebofs2/common/ThreadPool.h deleted file mode 100644 index 62855a240cd0c..0000000000000 --- a/branches/sage/ebofs2/common/ThreadPool.h +++ /dev/null @@ -1,139 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef THREADPOOL -#define THREADPOOL - -#include -using std::list; - - -#include -#include -#include -#include - - -// debug output -#include "config.h" -#define tpdout(x) if (x <= g_conf.debug) *_dout << myname -#define DBLVL 15 - - -using namespace std; - -#define MAX_THREADS 1000 - -template -class ThreadPool { - - private: - list q; - Mutex q_lock; - Semaphore q_sem; - - int num_ops; - int num_threads; - vector thread; - - U u; - void (*func)(U,T); - void (*prefunc)(U,T); - string myname; - - static void *foo(void *arg) - { - ThreadPool *t = (ThreadPool *)arg; - t->do_ops(arg); - return 0; - } - - void *do_ops(void *nothing) - { - tpdout(DBLVL) << ".do_ops thread " << pthread_self() << " starting" << std::endl; - while (1) { - q_sem.Get(); - if (q.empty()) break; - - T op = get_op(); - tpdout(DBLVL) << ".func thread "<< pthread_self() << " on " << op << std::endl; - func(u, op); - } - tpdout(DBLVL) << ".do_ops thread " << pthread_self() << " exiting" << std::endl; - return 0; - } - - - T get_op() - { - T op; - q_lock.Lock(); - { - op = q.front(); - q.pop_front(); - num_ops--; - - if (prefunc && op) { - tpdout(DBLVL) << ".prefunc thread "<< pthread_self() << " on " << op << std::endl; - prefunc(u, op); - } - } - q_lock.Unlock(); - - return op; - } - - public: - - ThreadPool(char *myname, int howmany, void (*f)(U,T), U obj, void (*pf)(U,T) = 0) : - num_ops(0), num_threads(howmany), - thread(num_threads), - u(obj), - func(f), prefunc(pf), - myname(myname) { - tpdout(DBLVL) << ".cons num_threads " << num_threads << std::endl; - - // start threads - int status; - for(int i = 0; i < howmany; i++) { - status = pthread_create(&thread[i], NULL, (void*(*)(void *))&ThreadPool::foo, this); - assert(status == 0); - } - } - - ~ThreadPool() { - // bump sem to make threads exit cleanly - for(int i = 0; i < num_threads; i++) - q_sem.Put(); - - // wait for them to die - for(int i = 0; i < num_threads; i++) { - tpdout(DBLVL) << ".des joining thread " << thread[i] 
<< std::endl; - void *rval = 0; // we don't actually care - pthread_join(thread[i], &rval); - } - } - - void put_op(T op) { - tpdout(DBLVL) << ".put_op " << op << std::endl; - q_lock.Lock(); - q.push_back(op); - num_ops++; - q_sem.Put(); - q_lock.Unlock(); - } - -}; -#endif diff --git a/branches/sage/ebofs2/common/Timer.cc b/branches/sage/ebofs2/common/Timer.cc deleted file mode 100644 index 1705bc759ac9f..0000000000000 --- a/branches/sage/ebofs2/common/Timer.cc +++ /dev/null @@ -1,335 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - - -#include "Timer.h" -#include "Cond.h" - -#include "config.h" -#include "include/Context.h" - -#define dout(x) if (x <= g_conf.debug_timer) *_dout << dbeginl << g_clock.now() << " TIMER " -#define derr(x) if (x <= g_conf.debug_timer) *_derr << dbeginl << g_clock.now() << " TIMER " - -#define DBL 10 - -#include -#include -#include - -// single global instance -Timer g_timer; - - - -/**** thread solution *****/ - -bool Timer::get_next_due(utime_t& when) -{ - if (scheduled.empty()) { - dout(10) << "get_next_due - nothing scheduled" << dendl; - return false; - } else { - map< utime_t, set >::iterator it = scheduled.begin(); - when = it->first; - dout(10) << "get_next_due - " << when << dendl; - return true; - } -} - - -void Timer::timer_entry() -{ - lock.Lock(); - - while (!thread_stop) { - - // now - utime_t now = g_clock.now(); - - // any events due? 
- utime_t next; - bool next_due = get_next_due(next); - - if (next_due && now >= next) { - // move to pending list - list pending; - - map< utime_t, set >::iterator it = scheduled.begin(); - while (it != scheduled.end()) { - if (it->first > now) break; - - utime_t t = it->first; - dout(DBL) << "queueing event(s) scheduled at " << t << dendl; - - for (set::iterator cit = it->second.begin(); - cit != it->second.end(); - cit++) { - pending.push_back(*cit); - event_times.erase(*cit); - num_event--; - } - - map< utime_t, set >::iterator previt = it; - it++; - scheduled.erase(previt); - } - - if (!pending.empty()) { - sleeping = false; - lock.Unlock(); - { - // make sure we're not holding any locks while we do callbacks - // make the callbacks myself. - for (list::iterator cit = pending.begin(); - cit != pending.end(); - cit++) { - dout(DBL) << "start callback " << *cit << dendl; - (*cit)->finish(0); - dout(DBL) << "finish callback " << *cit << dendl; - delete *cit; - } - pending.clear(); - assert(pending.empty()); - } - lock.Lock(); - } - - } - else { - // sleep - if (next_due) { - dout(DBL) << "sleeping until " << next << dendl; - timed_sleep = true; - sleeping = true; - timeout_cond.WaitUntil(lock, next); // wait for waker or time - utime_t now = g_clock.now(); - dout(DBL) << "kicked or timed out at " << now << dendl; - } else { - dout(DBL) << "sleeping" << dendl; - timed_sleep = false; - sleeping = true; - sleep_cond.Wait(lock); // wait for waker - utime_t now = g_clock.now(); - dout(DBL) << "kicked at " << now << dendl; - } - } - } - - lock.Unlock(); -} - - - -/** - * Timer bits - */ - -void Timer::register_timer() -{ - if (timer_thread.is_started()) { - if (sleeping) { - dout(DBL) << "register_timer kicking thread" << dendl; - if (timed_sleep) - timeout_cond.SignalAll(); - else - sleep_cond.SignalAll(); - } else { - dout(DBL) << "register_timer doing nothing; thread is awake" << dendl; - // it's probably doing callbacks. 
- } - } else { - dout(DBL) << "register_timer starting thread" << dendl; - timer_thread.create(); - } -} - -void Timer::cancel_timer() -{ - // clear my callback pointers - if (timer_thread.is_started()) { - dout(10) << "setting thread_stop flag" << dendl; - lock.Lock(); - thread_stop = true; - if (timed_sleep) - timeout_cond.SignalAll(); - else - sleep_cond.SignalAll(); - lock.Unlock(); - - dout(10) << "waiting for thread to finish" << dendl; - void *ptr; - timer_thread.join(&ptr); - - dout(10) << "thread finished, exit code " << ptr << dendl; - } -} - - -/* - * schedule - */ - - -void Timer::add_event_after(double seconds, - Context *callback) -{ - utime_t when = g_clock.now(); - when += seconds; - add_event_at(when, callback); -} - -void Timer::add_event_at(utime_t when, - Context *callback) -{ - lock.Lock(); - - dout(DBL) << "add_event " << callback << " at " << when << dendl; - - // insert - scheduled[when].insert(callback); - assert(event_times.count(callback) == 0); - event_times[callback] = when; - - num_event++; - - // make sure i wake up on time - register_timer(); - - lock.Unlock(); -} - -bool Timer::cancel_event(Context *callback) -{ - lock.Lock(); - - dout(DBL) << "cancel_event " << callback << dendl; - - if (!event_times.count(callback)) { - dout(DBL) << "cancel_event " << callback << " isn't scheduled (probably executing)" << dendl; - lock.Unlock(); - return false; // wasn't scheduled. - } - - utime_t tp = event_times[callback]; - event_times.erase(callback); - - assert(scheduled.count(tp)); - assert(scheduled[tp].count(callback)); - scheduled[tp].erase(callback); - if (scheduled[tp].empty()) - scheduled.erase(tp); - - lock.Unlock(); - - // delete the canceled event. 
- delete callback; - - return true; -} - - -// ------------------------------- - -void SafeTimer::add_event_after(double seconds, Context *c) -{ - assert(lock.is_locked()); - Context *w = new EventWrapper(this, c); - dout(DBL) << "SafeTimer.add_event_after wrapping " << c << " with " << w << dendl; - scheduled[c] = w; - g_timer.add_event_after(seconds, w); -} - -void SafeTimer::add_event_at(utime_t when, Context *c) -{ - assert(lock.is_locked()); - Context *w = new EventWrapper(this, c); - dout(DBL) << "SafeTimer.add_event_at wrapping " << c << " with " << w << dendl; - scheduled[c] = w; - g_timer.add_event_at(when, w); -} - -void SafeTimer::EventWrapper::finish(int r) -{ - timer->lock.Lock(); - if (timer->scheduled.count(actual)) { - // still scheduled. execute. - actual->finish(r); - timer->scheduled.erase(actual); - } else { - // i was canceled. - assert(timer->canceled.count(actual)); - } - - // did i get canceled? - // (this can happen even if i just executed above. e.g., i may have canceled myself.) - if (timer->canceled.count(actual)) { - timer->canceled.erase(actual); - timer->cond.Signal(); - } - - // delete the original event - delete actual; - - timer->lock.Unlock(); -} - -void SafeTimer::cancel_event(Context *c) -{ - assert(lock.is_locked()); - assert(scheduled.count(c)); - - if (g_timer.cancel_event(scheduled[c])) { - // hosed wrapper. hose original event too. - delete c; - } else { - // clean up later. 
- canceled[c] = scheduled[c]; - } - scheduled.erase(c); -} - -void SafeTimer::cancel_all() -{ - assert(lock.is_locked()); - - while (!scheduled.empty()) - cancel_event(scheduled.begin()->first); -} - -void SafeTimer::join() -{ - assert(lock.is_locked()); - assert(scheduled.empty()); - - if (!canceled.empty()) { - while (!canceled.empty()) { - // wait - dout(2) << "SafeTimer.join waiting for " << canceled.size() << " to join: " << canceled << dendl; - cond.Wait(lock); - } - dout(2) << "SafeTimer.join done" << dendl; - } -} - -SafeTimer::~SafeTimer() -{ - if (!scheduled.empty() && !canceled.empty()) { - derr(0) << "SafeTimer.~SafeTimer " << scheduled.size() << " events scheduled, " - << canceled.size() << " canceled but unflushed" - << dendl; - } -} diff --git a/branches/sage/ebofs2/common/Timer.h b/branches/sage/ebofs2/common/Timer.h deleted file mode 100644 index 3574833c342c3..0000000000000 --- a/branches/sage/ebofs2/common/Timer.h +++ /dev/null @@ -1,175 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __TIMER_H -#define __TIMER_H - -#include "include/types.h" -#include "include/Context.h" -#include "Clock.h" - -#include "Mutex.h" -#include "Cond.h" -#include "Thread.h" - -#include -#include -using std::map; -using std::set; - -#include -using namespace __gnu_cxx; - - -/*** Timer - * schedule callbacks - */ - -//class Messenger; - - -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const Context *p) const { - static hash H; - return H((unsigned long)p); - } - }; -} - - -class Timer { - private: - map< utime_t, set > scheduled; // time -> (context ...) 
- hash_map< Context*, utime_t > event_times; // event -> time - - bool get_next_due(utime_t &when); - - void register_timer(); // make sure i get a callback - void cancel_timer(); // make sure i get a callback - - bool thread_stop; - Mutex lock; - bool timed_sleep; - bool sleeping; - Cond sleep_cond; - Cond timeout_cond; - - public: - void timer_entry(); // waiter thread (that wakes us up) - - class TimerThread : public Thread { - Timer *t; - public: - void *entry() { - t->timer_entry(); - return 0; - } - TimerThread(Timer *_t) : t(_t) {} - } timer_thread; - - - int num_event; - - - public: - Timer() : - thread_stop(false), - timed_sleep(false), - sleeping(false), - timer_thread(this), - num_event(0) - { - } - ~Timer() { - // stop. - cancel_timer(); - - // scheduled - for (map< utime_t, set >::iterator it = scheduled.begin(); - it != scheduled.end(); - it++) { - for (set::iterator sit = it->second.begin(); - sit != it->second.end(); - sit++) - delete *sit; - } - scheduled.clear(); - } - - void init() { - register_timer(); - } - void shutdown() { - cancel_timer(); - } - - // schedule events - void add_event_after(double seconds, - Context *callback); - void add_event_at(utime_t when, - Context *callback); - bool cancel_event(Context *callback); - - // execute pending events - void execute_pending(); - -}; - - -/* - * SafeTimer is a wrapper around the raw Timer (or rather, g_timer, it's global - * instantiation) that protects event execution with an existing mutex. It - * provides for, among other things, reliable event cancellation on class - * destruction. The caller just needs to cancel each event (or cancel_all()), - * and then call join() to ensure any concurrently exectuting events (in other - * threads) get flushed. 
- */ -class SafeTimer { - Mutex& lock; - Cond cond; - map scheduled; // actual -> wrapper - map canceled; - - class EventWrapper : public Context { - SafeTimer *timer; - Context *actual; - public: - EventWrapper(SafeTimer *st, Context *c) : timer(st), - actual(c) {} - void finish(int r); - }; - -public: - SafeTimer(Mutex& l) : lock(l) { } - ~SafeTimer(); - - void add_event_after(double seconds, Context *c); - void add_event_at(utime_t when, Context *c); - void cancel_event(Context *c); - void cancel_all(); - void join(); - - int get_num_scheduled() { return scheduled.size(); } - int get_num_canceled() { return canceled.size(); } -}; - - -// single global instance -extern Timer g_timer; - - - -#endif diff --git a/branches/sage/ebofs2/config.cc b/branches/sage/ebofs2/config.cc deleted file mode 100644 index 390178655a8e7..0000000000000 --- a/branches/sage/ebofs2/config.cc +++ /dev/null @@ -1,1029 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#include "config.h" -#include "include/types.h" -#include - -//#define MDS_CACHE_SIZE 4*10000 -> <20mb -//#define MDS_CACHE_SIZE 80000 62mb - -#define AVG_PER_INODE_SIZE 450 -#define MDS_CACHE_MB_TO_INODES(x) ((x)*1000000/AVG_PER_INODE_SIZE) - -//#define MDS_CACHE_SIZE MDS_CACHE_MB_TO_INODES( 50 ) -//#define MDS_CACHE_SIZE 1500000 -#define MDS_CACHE_SIZE 150000 - - -// hack hack hack ugly FIXME -#include "common/Mutex.h" -long buffer_total_alloc = 0; -Mutex bufferlock; - -#include "osd/osd_types.h" - -// debug output -Mutex _dout_lock; -ostream *_dout = &std::cout; -ostream *_derr = &std::cerr; - -// file layouts -struct ceph_file_layout g_OSD_FileLayout = { - fl_stripe_unit: 1<<22, - fl_stripe_count: 1, - fl_object_size: 1<<22, - fl_object_stripe_unit: 0, - fl_pg_preferred: -1, - fl_pg_type: CEPH_PG_TYPE_REP, - fl_pg_size: 2 -}; - -struct ceph_file_layout g_OSD_MDDirLayout = { - fl_stripe_unit: 1<<22, - fl_stripe_count: 1, - fl_object_size: 1<<22, - fl_object_stripe_unit: 0, - fl_pg_preferred: -1, - fl_pg_type: CEPH_PG_TYPE_REP, - fl_pg_size: 2 -}; - -struct ceph_file_layout g_OSD_MDLogLayout = { - fl_stripe_unit: 1<<20, - fl_stripe_count: 1, - fl_object_size: 1<<20, - fl_object_stripe_unit: 0, - fl_pg_preferred: -1, - fl_pg_type: CEPH_PG_TYPE_REP, - fl_pg_size: 2 -}; - -struct ceph_file_layout g_OSD_MDAnchorTableLayout = { - fl_stripe_unit: 1<<20, - fl_stripe_count: 1, - fl_object_size: 1<<20, - fl_object_stripe_unit: 0, - fl_pg_preferred: -1, - fl_pg_type: CEPH_PG_TYPE_REP, - fl_pg_size: 2 -}; - -#include - -// fake osd failures: osd -> time -std::map g_fake_kill_after; -std::map g_fake_osd_down; -std::map g_fake_osd_out; - -entity_addr_t g_my_addr; - -md_config_t g_debug_after_conf; - -md_config_t g_conf = { - num_mon: 1, - num_mds: 1, - num_osd: 4, - num_client: 1, - - mkfs: false, - - // profiling and debugging - log: true, - log_interval: 1, - log_name: (char*)0, - - log_messages: true, - log_pins: true, - - logger_calc_variance: true, - - 
dout_dir: 0, - - fake_clock: false, - fakemessenger_serialize: true, - - fake_osdmap_expand: 0, - fake_osdmap_updates: 0, - fake_osd_mttf: 0, - fake_osd_mttr: 0, - - osd_remount_at: 0, - - kill_after: 0, - - tick: 0, - - debug: 0, - debug_mds: 1, - debug_mds_balancer: 1, - debug_mds_log: 1, - debug_mds_log_expire: 1, - debug_mds_migrator: 1, - debug_buffer: 0, - debug_timer: 0, - debug_filer: 0, - debug_objecter: 0, - debug_journaler: 0, - debug_objectcacher: 0, - debug_client: 0, - debug_osd: 0, - debug_ebofs: 1, - debug_bdev: 1, // block device - debug_ns: 0, - debug_ms: 0, - debug_mon: 1, - debug_paxos: 0, - - debug_after: 0, - - // -- misc -- - use_abspaths: false, // make monitorstore et al use absolute path (to workaround FUSE chdir("/")) - - // --- clock --- - clock_lock: false, - clock_tare: false, - - // --- messenger --- - ms_tcp_nodelay: true, - ms_retry_interval: 2.0, // how often to attempt reconnect - ms_fail_interval: 15.0, // fail after this long - ms_die_on_failure: false, - - ms_stripe_osds: false, - ms_skip_rank0: false, - ms_overlay_clients: false, - - - // --- mon --- - mon_tick_interval: 5, - mon_osd_down_out_interval: 5, // seconds - mon_lease: 5, // seconds // lease interval - mon_lease_renew_interval: 3, // on leader, to renew the lease - mon_lease_ack_timeout: 10.0, // on leader, if lease isn't acked by all peons - mon_lease_timeout: 10.0, // on peon, if lease isn't extended - mon_accept_timeout: 10.0, // on leader, if paxos update isn't accepted - mon_stop_on_last_unmount: false, - mon_stop_with_last_mds: false, - mon_allow_mds_bully: false, // allow a booting mds to (forcibly) claim an mds # .. 
FIXME - - paxos_propose_interval: 1.0, // gather updates for this long before proposing a map update - - // --- client --- - client_cache_size: 1000, - client_cache_mid: .5, - client_cache_stat_ttl: 0, // seconds until cached stat results become invalid - client_cache_readdir_ttl: 1, // 1 second only - client_use_random_mds: false, - - client_sync_writes: 0, - - client_mount_timeout: 10.0, // retry every N seconds - - client_hack_balance_reads: false, - - client_trace: 0, - fuse_direct_io: 0, - fuse_ll: true, - - // --- objectcacher --- - client_oc: true, - client_oc_size: 1024*1024* 10, // MB * n - client_oc_max_dirty: 1024*1024* 10, // MB * n (dirty OR tx) - client_oc_max_sync_write: 128*1024, // synx writes >= this use wrlock - - // --- objecter --- - objecter_buffer_uncommitted: true, // this must be true for proper failure handling - objecter_map_request_interval: 15.0, // request a new map every N seconds, if we have pending io - objecter_tick_interval: 5.0, - objecter_timeout: 10.0, // before we ask for a map - - // --- journaler --- - journaler_allow_split_entries: true, - journaler_safe: false, // wait for COMMIT on journal writes - journaler_write_head_interval: 15, - journaler_cache: false, // cache writes for later readback - journaler_prefetch_periods: 50, // * journal object size (1~MB? see above) - journaler_batch_interval: .001, // seconds.. 
max add'l latency we artificially incur - journaler_batch_max: 16384, // max bytes we'll delay flushing - - // --- mds --- - mds_cache_size: 300000, //MDS_CACHE_SIZE, - mds_cache_mid: .7, - - mds_decay_halflife: 5, - - mds_beacon_interval: 4, //30.0, - mds_beacon_grace: 15, //60*60.0, - - mds_log: true, - mds_log_max_events: -1, //MDS_CACHE_SIZE / 3, - mds_log_max_segments: 100, - mds_log_max_expiring: 20, - mds_log_pad_entry: 128,//256,//64, - mds_log_eopen_size: 100, // # open inodes per log entry - - mds_bal_sample_interval: 3.0, // every 5 seconds - mds_bal_replicate_threshold: 8000, - mds_bal_unreplicate_threshold: 0,//500, - mds_bal_split_size: 10000, - mds_bal_split_rd: 25000, - mds_bal_split_wr: 10000, - mds_bal_merge_size: 50, - mds_bal_merge_rd: 1000, - mds_bal_merge_wr: 1000, - mds_bal_interval: 10, // seconds - mds_bal_fragment_interval: 2, // seconds - mds_bal_idle_threshold: 0, //.1, - mds_bal_max: -1, - mds_bal_max_until: -1, - - mds_bal_mode: 0, - mds_bal_min_rebalance: .1, // must be this much above average before we export anything - mds_bal_min_start: .2, // if we need less than this, we don't do anything - mds_bal_need_min: .8, // take within this range of what we need - mds_bal_need_max: 1.2, - mds_bal_midchunk: .3, // any sub bigger than this taken in full - mds_bal_minchunk: .001, // never take anything smaller than this - - mds_trim_on_rejoin: true, - mds_shutdown_check: 0, //30, - - mds_verify_export_dirauth: true, - - mds_local_osd: false, - mds_local_osd_offset: 1000, - - mds_thrash_exports: 0, - mds_thrash_fragments: 0, - mds_dump_cache_on_map: false, - mds_dump_cache_after_rejoin: true, - - mds_hack_log_expire_for_better_stats: false, - - // --- osd --- - osd_rep: OSD_REP_PRIMARY, - - osd_balance_reads: false, // send from client to replica - osd_flash_crowd_iat_threshold: 0,//100, - osd_flash_crowd_iat_alpha: 0.125, - osd_balance_reads_temp: 100, - - osd_shed_reads: false, // forward from primary to replica - 
osd_shed_reads_min_latency: .01, // min local latency - osd_shed_reads_min_latency_diff: .01, // min latency difference - osd_shed_reads_min_latency_ratio: 1.5, // 1.2 == 20% higher than peer - - osd_immediate_read_from_cache: false,//true, // osds to read from the cache immediately? - osd_exclusive_caching: true, // replicas evict replicated writes - - osd_stat_refresh_interval: .5, - - osd_pg_bits: 4, // bits per osd - osd_object_layout: CEPH_OBJECT_LAYOUT_HASHINO,//LINEAR,//HASHINO, - osd_pg_layout: CEPH_PG_LAYOUT_CRUSH,//LINEAR,//CRUSH, - osd_max_rep: 4, - osd_min_raid_width: 4, - osd_max_raid_width: 3, //6, - - osd_maxthreads: 2, // 0 == no threading - osd_max_opq: 10, - osd_mkfs: false, - osd_age: .8, - osd_age_time: 0, - osd_heartbeat_interval: 1, - osd_pg_stats_interval: 5, - osd_replay_window: 5, - osd_max_pull: 2, - osd_pad_pg_log: false, - - osd_auto_weight: false, - - osd_hack_fast_startup: false, // this breaks localized pgs. - - - // --- fakestore --- - fakestore_fake_sync: .2, // seconds - fakestore_fsync: false,//true, - fakestore_writesync: false, - fakestore_syncthreads: 4, - fakestore_fake_attrs: false, - fakestore_fake_collections: false, - fakestore_dev: 0, - - // --- ebofs --- - ebofs: 1, - ebofs_cloneable: false, - ebofs_verify: false, - ebofs_commit_ms: 1000, // 0 = no forced commit timeout (for debugging/tracing) - ebofs_idle_commit_ms: 0, // 0 = no idle detection. UGLY HACK. use bdev_idle_kick_after_ms instead. - ebofs_oc_size: 10000, // onode cache - ebofs_cc_size: 10000, // cnode cache - ebofs_bc_size: (50 *256), // 4k blocks, *256 for MB - ebofs_bc_max_dirty: (30 *256), // before write() will block - ebofs_max_prefetch: 1000, // 4k blocks - ebofs_realloc: false, // hrm, this can cause bad fragmentation, don't use! 
- - ebofs_abp_zero: false, // zero newly allocated buffers (may shut up valgrind) - ebofs_abp_max_alloc: 4096*16, // max size of new buffers (larger -> more memory fragmentation) - - // --- block device --- - bdev_lock: true, - bdev_iothreads: 1, // number of ios to queue with kernel - bdev_idle_kick_after_ms: 100, // ms - bdev_el_fw_max_ms: 10000, // restart elevator at least once every 1000 ms - bdev_el_bw_max_ms: 3000, // restart elevator at least once every 300 ms - bdev_el_bidir: false, // bidirectional elevator? - bdev_iov_max: 512, // max # iov's to collect into a single readv()/writev() call - bdev_debug_check_io_overlap: true, // [DEBUG] check for any pending io overlaps - bdev_fake_mb: 0, - bdev_fake_max_mb: 0, - - // --- fakeclient (mds regression testing) (ancient history) --- - num_fakeclient: 100, - fakeclient_requests: 100, - fakeclient_deterministic: false, - - fakeclient_op_statfs: false, - - // loosely based on Roselli workload paper numbers - fakeclient_op_stat: 610, - fakeclient_op_lstat: false, - fakeclient_op_utime: 0, - fakeclient_op_chmod: 1, - fakeclient_op_chown: 1, - - fakeclient_op_readdir: 2, - fakeclient_op_mknod: 30, - fakeclient_op_link: false, - fakeclient_op_unlink: 20, - fakeclient_op_rename: 0,//40, - - fakeclient_op_mkdir: 10, - fakeclient_op_rmdir: 20, - fakeclient_op_symlink: 20, - - fakeclient_op_openrd: 200, - fakeclient_op_openwr: 0, - fakeclient_op_openwrc: 0, - fakeclient_op_read: false, // osd! - fakeclient_op_write: false, // osd! 
- fakeclient_op_truncate: false, - fakeclient_op_fsync: false, - fakeclient_op_close: 200 - -#ifdef USE_OSBDB - , - bdbstore: false, - debug_bdbstore: 1, - bdbstore_btree: false, - bdbstore_ffactor: 0, - bdbstore_nelem: 0, - bdbstore_pagesize: 0, - bdbstore_cachesize: 0, - bdbstore_transactional: false -#endif // USE_OSBDB -}; - - -#include -#include - - -void env_to_vec(std::vector& args) -{ - const char *p = getenv("CEPH_ARGS"); - if (!p) return; - - static char buf[1000]; - int len = strlen(p); - memcpy(buf, p, len); - buf[len] = 0; - //cout << "CEPH_ARGS " << buf << endl; - - int l = 0; - for (int i=0; i& args) -{ - for (int i=1; i& args, - int& argc, char **&argv) -{ - argv = (char**)malloc(sizeof(char*) * argc); - argc = 1; - argv[0] = "asdf"; - - for (unsigned i=0; i= '0' && *s <= '9') { - int digit = *s - '0'; - //cout << "digit " << digit << endl; - val *= 10; - val += digit; - numdigits++; - s++; off++; - } - //cout << "val " << val << endl; - - if (numdigits == 0) { - cerr << "no digits at off " << off << std::endl; - return false; // no digits - } - if (count < 3 && *s != '.') { - cerr << "should period at " << off << std::endl; - return false; // should have 3 periods - } - s++; off++; - - unsigned char *ipq = (unsigned char*)&a.v.ipaddr.sin_addr.s_addr; - if (count <= 3) - ipq[count] = val; - else - a.set_port(val); - - count++; - if (count == 4 && *s != ':') break; - if (count == 5) break; - } - - return true; -} - - - -void parse_config_options(std::vector& args) -{ - std::vector nargs; - - for (unsigned i=0; iis_open()) { - std::cerr << "error opening output file " << fn << std::endl; - delete out; - } else { - _dout = out; - } - } - - args = nargs; -} diff --git a/branches/sage/ebofs2/config.h b/branches/sage/ebofs2/config.h deleted file mode 100644 index ef286a9c86052..0000000000000 --- a/branches/sage/ebofs2/config.h +++ /dev/null @@ -1,418 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 
smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __CONFIG_H -#define __CONFIG_H - -extern struct ceph_file_layout g_OSD_FileLayout; -extern struct ceph_file_layout g_OSD_MDDirLayout; -extern struct ceph_file_layout g_OSD_MDLogLayout; -extern struct ceph_file_layout g_OSD_MDAnchorTableLayout; - -#include -#include - -#include "common/Mutex.h" - -extern std::map g_fake_osd_down; -extern std::map g_fake_osd_out; - -#define OSD_REP_PRIMARY 0 -#define OSD_REP_SPLAY 1 -#define OSD_REP_CHAIN 2 - - -#include "msg/msg_types.h" - -extern entity_addr_t g_my_addr; - -struct md_config_t { - int num_mon; - int num_mds; - int num_osd; - int num_client; - - bool mkfs; - - // profiling - bool log; - int log_interval; - char *log_name; - - bool log_messages; - bool log_pins; - - bool logger_calc_variance; - - char *dout_dir; - - bool fake_clock; - bool fakemessenger_serialize; - - int fake_osdmap_expand; - int fake_osdmap_updates; - int fake_osd_mttf; - int fake_osd_mttr; - - int osd_remount_at; - - int kill_after; - - int tick; - - int debug; - int debug_mds; - int debug_mds_balancer; - int debug_mds_log; - int debug_mds_log_expire; - int debug_mds_migrator; - int debug_buffer; - int debug_timer; - int debug_filer; - int debug_objecter; - int debug_journaler; - int debug_objectcacher; - int debug_client; - int debug_osd; - int debug_ebofs; - int debug_bdev; - int debug_ns; - int debug_ms; - int debug_mon; - int debug_paxos; - - int debug_after; - - // misc - bool use_abspaths; - - // clock - bool clock_lock; - bool clock_tare; - - // messenger - - /*bool tcp_skip_rank0; - bool tcp_overlay_clients; - bool tcp_log; - bool tcp_serial_marshall; - bool tcp_serial_out; - bool tcp_multi_out; - bool 
tcp_multi_dispatch; - */ - - bool ms_tcp_nodelay; - double ms_retry_interval; - double ms_fail_interval; - bool ms_die_on_failure; - - bool ms_stripe_osds; - bool ms_skip_rank0; - bool ms_overlay_clients; - - // mon - int mon_tick_interval; - int mon_osd_down_out_interval; - float mon_lease; - float mon_lease_renew_interval; - float mon_lease_ack_timeout; - float mon_lease_timeout; - float mon_accept_timeout; - bool mon_stop_on_last_unmount; - bool mon_stop_with_last_mds; - bool mon_allow_mds_bully; - - double paxos_propose_interval; - - // client - int client_cache_size; - float client_cache_mid; - int client_cache_stat_ttl; - int client_cache_readdir_ttl; - bool client_use_random_mds; // debug flag - - bool client_sync_writes; - - double client_mount_timeout; - - // hack - bool client_hack_balance_reads; - - - /* - bool client_bcache; - int client_bcache_alloc_minsize; - int client_bcache_alloc_maxsize; - int client_bcache_ttl; - off_t client_bcache_size; - int client_bcache_lowater; - int client_bcache_hiwater; - size_t client_bcache_align; - */ - - char *client_trace; - int fuse_direct_io; - bool fuse_ll; - - // objectcacher - bool client_oc; - int client_oc_size; - int client_oc_max_dirty; - size_t client_oc_max_sync_write; - - // objecter - bool objecter_buffer_uncommitted; - double objecter_map_request_interval; - double objecter_tick_interval; - double objecter_timeout; - - // journaler - bool journaler_allow_split_entries; - bool journaler_safe; - int journaler_write_head_interval; - bool journaler_cache; - int journaler_prefetch_periods; - double journaler_batch_interval; - size_t journaler_batch_max; - - // mds - int mds_cache_size; - float mds_cache_mid; - - float mds_decay_halflife; - - float mds_beacon_interval; - float mds_beacon_grace; - - bool mds_log; - int mds_log_max_events; - int mds_log_max_segments; - int mds_log_max_expiring; - int mds_log_pad_entry; - int mds_log_eopen_size; - - float mds_bal_sample_interval; - float 
mds_bal_replicate_threshold; - float mds_bal_unreplicate_threshold; - int mds_bal_split_size; - float mds_bal_split_rd; - float mds_bal_split_wr; - int mds_bal_merge_size; - float mds_bal_merge_rd; - float mds_bal_merge_wr; - int mds_bal_interval; - int mds_bal_fragment_interval; - float mds_bal_idle_threshold; - int mds_bal_max; - int mds_bal_max_until; - - int mds_bal_mode; - float mds_bal_min_rebalance; - float mds_bal_min_start; - float mds_bal_need_min; - float mds_bal_need_max; - float mds_bal_midchunk; - float mds_bal_minchunk; - - bool mds_trim_on_rejoin; - int mds_shutdown_check; - - bool mds_verify_export_dirauth; // debug flag - - bool mds_local_osd; - int mds_local_osd_offset; - - int mds_thrash_exports; - int mds_thrash_fragments; - bool mds_dump_cache_on_map; - bool mds_dump_cache_after_rejoin; - - bool mds_hack_log_expire_for_better_stats; - - // osd - int osd_rep; - - bool osd_balance_reads; - int osd_flash_crowd_iat_threshold; // flash crowd interarrival time threshold in ms - double osd_flash_crowd_iat_alpha; - double osd_balance_reads_temp; - - int osd_shed_reads; - double osd_shed_reads_min_latency; - double osd_shed_reads_min_latency_diff; - double osd_shed_reads_min_latency_ratio; - - bool osd_immediate_read_from_cache; - bool osd_exclusive_caching; - double osd_stat_refresh_interval; - - int osd_pg_bits; - int osd_object_layout; - int osd_pg_layout; - int osd_max_rep; - int osd_min_raid_width; - int osd_max_raid_width; - int osd_maxthreads; - int osd_max_opq; - bool osd_mkfs; - float osd_age; - int osd_age_time; - int osd_heartbeat_interval; - int osd_pg_stats_interval; - int osd_replay_window; - int osd_max_pull; - bool osd_pad_pg_log; - - bool osd_auto_weight; - - bool osd_hack_fast_startup; - - double fakestore_fake_sync; - bool fakestore_fsync; - bool fakestore_writesync; - int fakestore_syncthreads; // such crap - bool fakestore_fake_attrs; - bool fakestore_fake_collections; - char *fakestore_dev; - - // ebofs - int ebofs; - bool 
ebofs_cloneable; - bool ebofs_verify; - int ebofs_commit_ms; - int ebofs_idle_commit_ms; - int ebofs_oc_size; - int ebofs_cc_size; - off_t ebofs_bc_size; - off_t ebofs_bc_max_dirty; - unsigned ebofs_max_prefetch; - bool ebofs_realloc; - - bool ebofs_abp_zero; - size_t ebofs_abp_max_alloc; - - // block device - bool bdev_lock; - int bdev_iothreads; - int bdev_idle_kick_after_ms; - int bdev_el_fw_max_ms; - int bdev_el_bw_max_ms; - bool bdev_el_bidir; - int bdev_iov_max; - bool bdev_debug_check_io_overlap; - int bdev_fake_mb; - int bdev_fake_max_mb; - - // fake client - int num_fakeclient; - unsigned fakeclient_requests; - bool fakeclient_deterministic; // debug flag - - int fakeclient_op_statfs; - - int fakeclient_op_stat; - int fakeclient_op_lstat; - int fakeclient_op_utime; - int fakeclient_op_chmod; - int fakeclient_op_chown; - - int fakeclient_op_readdir; - int fakeclient_op_mknod; - int fakeclient_op_link; - int fakeclient_op_unlink; - int fakeclient_op_rename; - - int fakeclient_op_mkdir; - int fakeclient_op_rmdir; - int fakeclient_op_symlink; - - int fakeclient_op_openrd; - int fakeclient_op_openwr; - int fakeclient_op_openwrc; - int fakeclient_op_read; - int fakeclient_op_write; - int fakeclient_op_truncate; - int fakeclient_op_fsync; - int fakeclient_op_close; - -#ifdef USE_OSBDB - bool bdbstore; - int debug_bdbstore; - bool bdbstore_btree; - int bdbstore_ffactor; - int bdbstore_nelem; - int bdbstore_pagesize; - int bdbstore_cachesize; - bool bdbstore_transactional; -#endif // USE_OSBDB -}; - -extern md_config_t g_conf; -extern md_config_t g_debug_after_conf; - - -/** - * command line / environment argument parsing - */ -void env_to_vec(std::vector& args); -void argv_to_vec(int argc, char **argv, - std::vector& args); -void vec_to_argv(std::vector& args, - int& argc, char **&argv); - -void parse_config_options(std::vector& args); - -extern bool parse_ip_port(const char *s, entity_addr_t& addr); - - -/** - * for cleaner output, bracket each line with - * 
dbeginl (in the dout macro) and dendl (in place of endl). - */ -extern Mutex _dout_lock; -struct _dbeginl_t { _dbeginl_t(int) {} }; -struct _dendl_t { _dendl_t(int) {} }; -static const _dbeginl_t dbeginl = 0; -static const _dendl_t dendl = 0; - -// intentionally conflict with endl -class _bad_endl_use_dendl_t { public: _bad_endl_use_dendl_t(int) {} }; -static const _bad_endl_use_dendl_t endl = 0; - -inline ostream& operator<<(ostream& out, _dbeginl_t) { - _dout_lock.Lock(); - return out; -} -inline ostream& operator<<(ostream& out, _dendl_t) { - out << std::endl; - _dout_lock.Unlock(); - return out; -} -inline ostream& operator<<(ostream& out, _bad_endl_use_dendl_t) { - assert(0 && "you are using the wrong endl.. use std::endl or dendl"); - return out; -} - -// the streams -extern ostream *_dout; -extern ostream *_derr; - -// generic macros -#define generic_dout(x) if ((x) <= g_conf.debug) *_dout << dbeginl -#define generic_derr(x) if ((x) <= g_conf.debug) *_derr << dbeginl - -#define pdout(x,p) if ((x) <= (p)) *_dout << dbeginl - - -#endif diff --git a/branches/sage/ebofs2/cosd.cc b/branches/sage/ebofs2/cosd.cc deleted file mode 100644 index e575c72836e69..0000000000000 --- a/branches/sage/ebofs2/cosd.cc +++ /dev/null @@ -1,135 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include -#include - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" - -#include "osd/OSD.h" -#include "ebofs/Ebofs.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << std::endl; - exit(1); - } -}; - -class C_Debug : public Context { - public: - void finish(int) { - int size = &g_conf.debug_after - &g_conf.debug; - memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size); - cout << "debug_after flipping debug settings" << std::endl; - } -}; - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - parse_config_options(args); - - if (g_conf.kill_after) - g_timer.add_event_after(g_conf.kill_after, new C_Die); - if (g_conf.debug_after) - g_timer.add_event_after(g_conf.debug_after, new C_Debug); - - if (g_conf.clock_tare) g_clock.tare(); - - // osd specific args - char *dev = 0; - char dev_default[20]; - int whoami = -1; - for (unsigned i=0; imount(); - int r = store->read(object_t(0,0), 0, sizeof(sb), bl); - if (r < 0) { - cerr << "couldn't read superblock object on " << dev << std::endl; - exit(0); - } - bl.copy(0, sizeof(sb), (char*)&sb); - store->umount(); - delete store; - whoami = sb.whoami; - - cout << "osd fs says i am osd" << whoami << std::endl; - } else { - cout << "command line arg says i am osd" << whoami << std::endl; - } - - // load monmap - MonMap monmap; - int r = monmap.read(".ceph_monmap"); - assert(r >= 0); - - // start up network - rank.start_rank(); - - // start osd - Messenger *m = rank.register_entity(entity_name_t::OSD(whoami)); - assert(m); - OSD *osd = new OSD(whoami, m, &monmap, dev); - osd->init(); - - // wait - rank.wait(); - - // done - delete osd; - - return 0; -} - diff --git a/branches/sage/ebofs2/crush.old/BinaryTree.h b/branches/sage/ebofs2/crush.old/BinaryTree.h deleted file mode 100644 index 
7573fc02ed6dc..0000000000000 --- a/branches/sage/ebofs2/crush.old/BinaryTree.h +++ /dev/null @@ -1,285 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __crush_BINARYTREE_H -#define __crush_BINARYTREE_H - -#include -#include -#include -#include -using std::map; -using std::vector; - -#include "include/buffer.h" - -namespace crush { - - class BinaryTree { - private: - // tree def - int root_node; // 0 for empty tree. - int alloc; - vector node_nested; // all existing nodes in this map - vector node_weight; // and this one - vector node_complete; // only nodes with all possible children - - public: - BinaryTree() : root_node(0), alloc(0) {} - - void _encode(bufferlist& bl) { - bl.append((char*)&root_node, sizeof(root_node)); - bl.append((char*)&alloc, sizeof(alloc)); - ::_encode(node_nested, bl); - ::_encode(node_weight, bl); - ::_encode(node_complete, bl); - } - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(root_node), (char*)&root_node); - off += sizeof(root_node); - bl.copy(off, sizeof(alloc), (char*)&alloc); - off += sizeof(alloc); - ::_decode(node_nested, bl, off); - ::_decode(node_weight, bl, off); - ::_decode(node_complete, bl, off); - } - - // accessors - bool empty() const { return root_node == 0; } - bool exists(int n) const { return n < alloc && node_nested[n]; } - int nested(int n) const { return exists(n) ? node_nested[n]:0; } - float weight(int n) const { return exists(n) ? node_weight[n]:0; } - bool complete(int n) const { return exists(n) ? 
node_complete[n]:false; } - - int root() const { return root_node; } - - void realloc(int n) { - /* - while (alloc <= n) { - node_nested.push_back(0); - node_weight.push_back(0); - node_complete.push_back(0); - alloc++; - } - */ - if (alloc <= n) { - int add = n - alloc + 1; - node_nested.insert(node_nested.end(), add, 0); - node_weight.insert(node_weight.end(), add, 0); - node_complete.insert(node_complete.end(), add, 0); - alloc = n+1; - } - } - - // tree navigation - bool terminal(int n) const { return n & 1; } // odd nodes are leaves. - int height(int n) const { - assert(n); - int h = 0; - while ((n & 1) == 0) { - assert(n > 0); - h++; n = n >> 1; - } - return h; - } - int left(int n) const { - int h = height(n); - //cout << "left of " << n << " is " << (n - (1 << h)) << std::endl; - return n - (1 << (h-1)); - } - int right(int n) const { - int h = height(n); - //cout << "right of " << n << " is " << (n + (1 << h)) << std::endl; - return n + (1 << (h-1)); - } - bool on_right(int n, int h = -1) const { - if (h < 0) h = height(n); - return n & (1 << (h+1)); - } - bool on_left(int n) const { return !on_right(n); } - int parent(int n) const { - int h = height(n); - if (on_right(n, h)) - return n - (1<0; t--) out << " "; - if (tree.root() == n) - out << "root "; - else { - if (tree.on_left(n)) - out << "left "; - else - out << "right "; - } - out << n << " : nested " << tree.nested(n) << " weight " << tree.weight(n); - if (tree.complete(n)) out << " complete"; - out << std::endl; - if (!tree.terminal(n)) { - if (tree.exists(tree.left(n))) - print_binary_tree_node(out, tree, tree.left(n), i+2); - if (tree.exists(tree.right(n))) - print_binary_tree_node(out, tree, tree.right(n), i+2); - } - } - - inline ostream& operator<<(ostream& out, const BinaryTree& tree) { - if (tree.empty()) - return out << "tree is empty"; - print_binary_tree_node(out, tree, tree.root(), 0); - return out; - } - -} - -#endif diff --git a/branches/sage/ebofs2/crush.old/Bucket.h 
b/branches/sage/ebofs2/crush.old/Bucket.h deleted file mode 100644 index 81a2576697bd7..0000000000000 --- a/branches/sage/ebofs2/crush.old/Bucket.h +++ /dev/null @@ -1,632 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __crush_BUCKET_H -#define __crush_BUCKET_H - -#include "BinaryTree.h" -#include "Hash.h" - -#include -#include -#include -#include -using namespace std; - -#include - -#include "include/buffer.h" - -namespace crush { - - - const int CRUSH_BUCKET_UNIFORM = 1; - const int CRUSH_BUCKET_TREE = 2; - const int CRUSH_BUCKET_LIST = 3; - const int CRUSH_BUCKET_STRAW = 4; - - /** abstract bucket **/ - class Bucket { - protected: - int id; - int parent; - int type; - float weight; - - public: - Bucket(int _type, - float _weight) : - id(0), parent(0), - type(_type), - weight(_weight) { } - - Bucket(bufferlist& bl, int& off) { - bl.copy(off, sizeof(id), (char*)&id); - off += sizeof(id); - bl.copy(off, sizeof(parent), (char*)&parent); - off += sizeof(parent); - bl.copy(off, sizeof(type), (char*)&type); - off += sizeof(type); - bl.copy(off, sizeof(weight), (char*)&weight); - off += sizeof(weight); - } - - virtual ~Bucket() { } - - virtual const char *get_bucket_type() const = 0; - virtual bool is_uniform() const = 0; - - int get_id() const { return id; } - int get_type() const { return type; } - float get_weight() const { return weight; } - int get_parent() const { return parent; } - virtual int get_size() const = 0; - - void set_id(int i) { id = i; } - void set_parent(int p) { parent = p; } - void set_weight(float w) { weight = w; } - - virtual void get_items(vector& i) const = 0; - 
virtual float get_item_weight(int item) const = 0; - virtual void add_item(int item, float w, bool back=false) = 0; - virtual void adjust_item_weight(int item, float w) = 0; - virtual void set_item_weight(int item, float w) { - adjust_item_weight(item, w - get_item_weight(item)); - } - - virtual int choose_r(int x, int r, Hash& h) const = 0; - - virtual void _encode(bufferlist& bl) = 0; - }; - - - - - /** uniform bucket **/ - class UniformBucket : public Bucket { - protected: - public: - vector items; - int item_type; - float item_weight; - - // primes - vector primes; - - int get_prime(int j) const { - return primes[ j % primes.size() ]; - } - void make_primes() { - if (items.empty()) return; - - //cout << "make_primes " << get_id() << " " << items.size() << endl; - Hash h(123+get_id()); - primes.clear(); - - // start with odd number > num_items - unsigned x = items.size() + 1; // this is the minimum! - x += h(items.size()) % (3*items.size()); // bump it up some - x |= 1; // make it odd - - while (primes.size() < items.size()) { - unsigned j; - for (j=2; j*j<=x; j++) - if (x % j == 0) break; - if (j*j > x) { - primes.push_back(x); - //cout << "prime " << x << endl; - } - x += 2; - } - } - - public: - UniformBucket(int _type, int _item_type) : - Bucket(_type, 0), - item_type(_item_type) { } - UniformBucket(int _type, int _item_type, - float _item_weight, vector& _items) : - Bucket(_type, _item_weight*_items.size()), - item_type(_item_type), - item_weight(_item_weight) { - items = _items; - make_primes(); - } - - UniformBucket(bufferlist& bl, int& off) : Bucket(bl, off) { - bl.copy(off, sizeof(item_type), (char*)&item_type); - off += sizeof(item_type); - bl.copy(off, sizeof(item_weight), (char*)&item_weight); - off += sizeof(item_weight); - ::_decode(items, bl, off); - make_primes(); - } - - void _encode(bufferlist& bl) { - char t = CRUSH_BUCKET_UNIFORM; - bl.append((char*)&t, sizeof(t)); - bl.append((char*)&id, sizeof(id)); - bl.append((char*)&parent, 
sizeof(parent)); - bl.append((char*)&type, sizeof(type)); - bl.append((char*)&weight, sizeof(weight)); - - bl.append((char*)&item_type, sizeof(item_type)); - bl.append((char*)&item_weight, sizeof(item_weight)); - - ::_encode(items, bl); - } - - const char *get_bucket_type() const { return "uniform"; } - bool is_uniform() const { return true; } - - int get_size() const { return items.size(); } - - // items - void get_items(vector& i) const { - i = items; - } - int get_item_type() const { return item_type; } - float get_item_weight(int item) const { return item_weight; } - - void add_item(int item, float w, bool back=false) { - if (items.empty()) - item_weight = w; - items.push_back(item); - weight += item_weight; - make_primes(); - } - - void adjust_item_weight(int item, float w) { - assert(0); - } - - int choose_r(int x, int r, Hash& hash) const { - //cout << "uniformbucket.choose_r(" << x << ", " << r << ")" << endl; - //if (r >= get_size()) cout << "warning: r " << r << " >= " << get_size() << " uniformbucket.size" << endl; - - unsigned v = hash(x, get_id());// % get_size(); - unsigned p = get_prime( hash(get_id(), x) ); // choose a prime based on hash(x, get_id(), 2) - unsigned s = (x + v + (r+1)*p) % get_size(); - return items[s]; - } - - }; - - - - - - // list bucket.. 
RUSH_P sorta - - class ListBucket : public Bucket { - protected: - list items; - list item_weight; - list sum_weight; - - public: - ListBucket(int _type) : Bucket(_type, 0) { } - - ListBucket(bufferlist& bl, int& off) : Bucket(bl, off) { - ::_decode(items, bl, off); - ::_decode(item_weight, bl, off); - ::_decode(sum_weight, bl, off); - } - - void _encode(bufferlist& bl) { - char t = CRUSH_BUCKET_LIST; - bl.append((char*)&t, sizeof(t)); - bl.append((char*)&id, sizeof(id)); - bl.append((char*)&parent, sizeof(parent)); - bl.append((char*)&type, sizeof(type)); - bl.append((char*)&weight, sizeof(weight)); - - ::_encode(items, bl); - ::_encode(item_weight, bl); - ::_encode(sum_weight, bl); - } - - const char *get_bucket_type() const { return "list"; } - bool is_uniform() const { return false; } - - int get_size() const { return items.size(); } - - void get_items(vector& i) const { - for (list::const_iterator it = items.begin(); - it != items.end(); - it++) - i.push_back(*it); - } - float get_item_weight(int item) const { - list::const_iterator i = items.begin(); - list::const_iterator w = item_weight.begin(); - while (i != items.end()) { - if (*i == item) return *w; - i++; w++; - } - assert(0); - return 0; - } - - void add_item(int item, float w, bool back=false) { - if (back) { - items.push_back(item); - item_weight.push_back(w); - sum_weight.clear(); - float s = 0.0; - for (list::reverse_iterator i = item_weight.rbegin(); - i != item_weight.rend(); - i++) { - s += *i; - sum_weight.push_front(s); - } - weight += w; - assert(weight == s); - } else { - items.push_front(item); - item_weight.push_front(w); - weight += w; - sum_weight.push_front(weight); - } - } - - void adjust_item_weight(int item, float dw) { - // find it - list::iterator p = items.begin(); - list::iterator pw = item_weight.begin(); - list::iterator ps = sum_weight.begin(); - - while (*p != item) { - *ps += dw; - p++; pw++; ps++; // next! 
- assert(p != items.end()); - } - - assert(*p == item); - *pw += dw; - *ps += dw; - } - - - int choose_r(int x, int r, Hash& h) const { - //cout << "linearbucket.choose_r(" << x << ", " << r << ")" << endl; - - list::const_iterator p = items.begin(); - list::const_iterator pw = item_weight.begin(); - list::const_iterator ps = sum_weight.begin(); - - while (p != items.end()) { - const int item = *p; - const float iw = *pw; - const float tw = *ps; - const float f = (float)(h(x, item, r, get_id()) % 10000) * tw / 10000.0; - //cout << "item " << item << " iw = " << iw << " tw = " << tw << " f = " << f << endl; - if (f < iw) { - //cout << "linearbucket.choose_r(" << x << ", " << r << ") = " << item << endl; - return item; - } - p++; pw++; ps++; // next! - } - assert(0); - return 0; - } - - - }; - - - - - // mixed bucket, based on RUSH_T type binary tree - - class TreeBucket : public Bucket { - protected: - //vector item_weight; - - // public: - BinaryTree tree; - map node_item; // node id -> item - vector node_item_vec; // fast version of above - map item_node; // item -> node id - map item_weight; - - public: - TreeBucket(int _type) : Bucket(_type, 0) { } - - TreeBucket(bufferlist& bl, int& off) : Bucket(bl, off) { - tree._decode(bl, off); - - ::_decode(node_item, bl, off); - ::_decode(node_item_vec, bl, off); - ::_decode(item_node, bl, off); - ::_decode(item_weight, bl, off); - } - - void _encode(bufferlist& bl) { - char t = CRUSH_BUCKET_TREE; - bl.append((char*)&t, sizeof(t)); - bl.append((char*)&id, sizeof(id)); - bl.append((char*)&parent, sizeof(parent)); - bl.append((char*)&type, sizeof(type)); - bl.append((char*)&weight, sizeof(weight)); - - tree._encode(bl); - - ::_encode(node_item, bl); - ::_encode(node_item_vec, bl); - ::_encode(item_node, bl); - ::_encode(item_weight, bl); - } - - const char *get_bucket_type() const { return "tree"; } - bool is_uniform() const { return false; } - - int get_size() const { return node_item.size(); } - - // items - void 
get_items(vector& i) const { - for (map::const_iterator it = node_item.begin(); - it != node_item.end(); - it++) - i.push_back(it->second); - } - float get_item_weight(int i) const { - assert(item_weight.count(i)); - return ((map)item_weight)[i]; - } - - - void add_item(int item, float w, bool back=false) { - item_weight[item] = w; - weight += w; - - unsigned n = tree.add_node(w); - node_item[n] = item; - item_node[item] = n; - - while (node_item_vec.size() <= n) - node_item_vec.push_back(0); - node_item_vec[n] = item; - } - - void adjust_item_weight(int item, float dw) { - // adjust my weight - weight += dw; - item_weight[item] += dw; - - // adjust tree weights - tree.adjust_node_weight(item_node[item], dw); - } - - int choose_r(int x, int r, Hash& h) const { - //cout << "mixedbucket.choose_r(" << x << ", " << r << ")" << endl; - int n = tree.root(); - while (!tree.terminal(n)) { - // pick a point in [0,w) - float w = tree.weight(n); - float f = (float)(h(x, n, r, get_id()) % 10000) * w / 10000.0; - - // left or right? - int l = tree.left(n); - if (tree.exists(l) && - f < tree.weight(l)) - n = l; - else - n = tree.right(n); - } - //assert(node_item.count(n)); - //return ((map)node_item)[n]; - return node_item_vec[n]; - } - }; - - - - - - // straw bucket.. new thing! 
- - class StrawBucket : public Bucket { - protected: - map item_weight; - map item_straw; - - list _items; - list _straws; - - public: - StrawBucket(int _type) : Bucket(_type, 0) { } - - StrawBucket(bufferlist& bl, int& off) : Bucket(bl, off) { - ::_decode(item_weight, bl, off); - calc_straws(); - } - - void _encode(bufferlist& bl) { - char t = CRUSH_BUCKET_TREE; - bl.append((char*)&t, sizeof(t)); - bl.append((char*)&id, sizeof(id)); - bl.append((char*)&parent, sizeof(parent)); - bl.append((char*)&type, sizeof(type)); - bl.append((char*)&weight, sizeof(weight)); - - ::_encode(item_weight, bl); - } - - const char *get_bucket_type() const { return "straw"; } - bool is_uniform() const { return false; } - - int get_size() const { return item_weight.size(); } - - - // items - void get_items(vector& i) const { - for (map::const_iterator it = item_weight.begin(); - it != item_weight.end(); - it++) - i.push_back(it->first); - } - float get_item_weight(int item) const { - assert(item_weight.count(item)); - return ((map)item_weight)[item]; - } - - void add_item(int item, float w, bool back=false) { - item_weight[item] = w; - weight += w; - calc_straws(); - } - - void adjust_item_weight(int item, float dw) { - //cout << "adjust " << item << " " << dw << endl; - weight += dw; - item_weight[item] += dw; - calc_straws(); - } - - - /* calculate straw lengths. - this is kind of ugly. not sure if there's a closed form way to calculate this or not! 
- */ - void calc_straws() { - //cout << get_id() << ": calc_straws ============" << endl; - - item_straw.clear(); - _items.clear(); - _straws.clear(); - - // reverse sort by weight; skip zero weight items - map > reverse; - for (map::iterator p = item_weight.begin(); - p != item_weight.end(); - p++) { - //cout << get_id() << ":" << p->first << " " << p->second << endl; - if (p->second > 0) { - //p->second /= minw; - reverse[p->second].insert(p->first); - } - } - - /* 1:2:7 - item_straw[0] = 1.0; - item_straw[1] = item_straw[0]*sqrt(1.0/.6); - item_straw[2] = item_straw[1]*2.0; - */ - - // work from low to high weights - float straw = 1.0; - float numleft = item_weight.size(); - float wbelow = 0.0; - float lastw = 0.0; - - map >::iterator next = reverse.begin(); - //while (next != reverse.end()) { - while (1) { - //cout << "hi " << next->first << endl; - map >::iterator cur = next; - - // set straw length for this set - for (set::iterator s = cur->second.begin(); - s != cur->second.end(); - s++) { - item_straw[*s] = straw; - //cout << "straw " << *s << " w " << item_weight[*s] << " -> " << straw << endl; - _items.push_back(*s); - _straws.push_back(straw); - } - - next++; - if (next == reverse.end()) break; - - wbelow += (cur->first-lastw) * numleft; - //cout << "wbelow " << wbelow << endl; - - numleft -= 1.0 * (float)cur->second.size(); - //cout << "numleft now " << numleft << endl; - - float wnext = numleft * (next->first - cur->first); - //cout << "wnext " << wnext << endl; - - float pbelow = wbelow / (wbelow+wnext); - //cout << "pbelow " << pbelow << endl; - - straw *= pow((double)(1.0/pbelow), (double)1.0/numleft); - - lastw = cur->first; - } - //cout << "============" << endl; - } - - int choose_r(int x, int r, Hash& h) const { - //cout << "strawbucket.choose_r(" << x << ", " << r << ")" << endl; - - float high_draw = -1; - int high = 0; - - list::const_iterator pi = _items.begin(); - list::const_iterator ps = _straws.begin(); - while (pi != _items.end()) { - 
const int item = *pi; - const float rnd = (float)(h(x, item, r) % 1000000) / 1000000.0; - const float straw = *ps * rnd; - - if (high_draw < 0 || - straw > high_draw) { - high = *pi; - high_draw = straw; - } - - pi++; - ps++; - } - return high; - } - }; - - - - - - inline Bucket* decode_bucket(bufferlist& bl, int& off) { - char t; - bl.copy(off, sizeof(t), (char*)&t); - off += sizeof(t); - - switch (t) { - case CRUSH_BUCKET_UNIFORM: - return new UniformBucket(bl, off); - case CRUSH_BUCKET_LIST: - return new ListBucket(bl, off); - case CRUSH_BUCKET_TREE: - return new TreeBucket(bl, off); - case CRUSH_BUCKET_STRAW: - return new StrawBucket(bl, off); - default: - assert(0); - } - return 0; - } - - - -} - - - - - - - - -#endif diff --git a/branches/sage/ebofs2/crush.old/Hash.h b/branches/sage/ebofs2/crush.old/Hash.h deleted file mode 100644 index 2f0d9e4db918b..0000000000000 --- a/branches/sage/ebofs2/crush.old/Hash.h +++ /dev/null @@ -1,301 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -// Robert Jenkins' function for mixing 32-bit values -// http://burtleburtle.net/bob/hash/evahash.html -// a, b = random bits, c = input and output -#define hashmix(a,b,c) \ - a=a-b; a=a-c; a=a^(c>>13); \ - b=b-c; b=b-a; b=b^(a<<8); \ - c=c-a; c=c-b; c=c^(b>>13); \ - a=a-b; a=a-c; a=a^(c>>12); \ - b=b-c; b=b-a; b=b^(a<<16); \ - c=c-a; c=c-b; c=c^(b>>5); \ - a=a-b; a=a-c; a=a^(c>>3); \ - b=b-c; b=b-a; b=b^(a<<10); \ - c=c-a; c=c-b; c=c^(b>>15); - -namespace crush { - - class Hash { - int seed; - - public: - int get_seed() { return seed; } - void set_seed(int s) { seed = s; } - - Hash(int s) { - unsigned int hash = 1315423911; - int x = 231232; - int y = 1232; - hashmix(s, x, hash); - hashmix(y, s, hash); - seed = s; - } - - inline int operator()(int a) { - unsigned int hash = seed ^ a; - int b = a; - int x = 231232; - int y = 1232; - hashmix(b, x, hash); - hashmix(y, a, hash); - return (hash & 0x7FFFFFFF); - } - - inline int operator()(int a, int b) { - unsigned int hash = seed ^ a ^ b; - int x = 231232; - int y = 1232; - hashmix(a, b, hash); - hashmix(x, a, hash); - hashmix(b, y, hash); - return (hash & 0x7FFFFFFF); - } - - inline int operator()(int a, int b, int c) { - unsigned int hash = seed ^ a ^ b ^ c; - int x = 231232; - int y = 1232; - hashmix(a, b, hash); - hashmix(c, x, hash); - hashmix(y, a, hash); - hashmix(b, x, hash); - hashmix(y, c, hash); - return (hash & 0x7FFFFFFF); - } - - inline int operator()(int a, int b, int c, int d) { - unsigned int hash = seed ^a ^ b ^ c ^ d; - int x = 231232; - int y = 1232; - hashmix(a, b, hash); - hashmix(c, d, hash); - hashmix(a, x, hash); - hashmix(y, b, hash); - hashmix(c, x, hash); - hashmix(y, d, hash); - return (hash & 0x7FFFFFFF); - } - - inline int operator()(int a, int b, int c, int d, int e) { - unsigned int hash = seed ^ a ^ b ^ c ^ d ^ e; - int x = 231232; - int y = 1232; - hashmix(a, b, hash); - hashmix(c, d, hash); - hashmix(e, x, hash); - hashmix(y, a, hash); - hashmix(b, x, hash); - 
hashmix(y, c, hash); - hashmix(d, x, hash); - hashmix(y, e, hash); - return (hash & 0x7FFFFFFF); - } - }; - -} - - - -#if 0 - - - //return myhash(a) ^ seed; - return myhash(a, seed); - } - int operator()(int a, int b) { - //return myhash( myhash(a) ^ myhash(b) ^ seed ); - return myhash(a, b, seed); - } - int operator()(int a, int b, int c) { - //return myhash( myhash(a ^ seed) ^ myhash(b ^ seed) ^ myhash(c ^ seed) ^ seed ); - return myhash(a, b, c, seed); - } - int operator()(int a, int b, int c, int d) { - //return myhash( myhash(a ^ seed) ^ myhash(b ^ seed) ^ myhash(c ^ seed) ^ myhash(d ^ seed) ^ seed ); - return myhash(a, b, c, d, seed); - } - - // ethan's rush hash? - if (0) - return (n ^ 0xdead1234) * (884811920 * 3 + 1); - - if (1) { - - // before - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - - //return hash; - return (hash & 0x7FFFFFFF); - } - - // JS - // a little better than RS - // + jenkin's mixing thing (which sucks on its own but helps tons here) - // best so far - if (1) { - unsigned int hash = 1315423911; - int a = 231232; - int b = 1232; - - for(unsigned int i = 0; i < 4; i++) - { - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - - // Robert jenkins' 96 bit mix - // sucks - if (0) { - int c = n; - int a = 12378912; - int b = 2982827; - a=a-b; a=a-c; a=a^(c>>13); - b=b-c; b=b-a; b=b^(a<<8); - c=c-a; c=c-b; c=c^(b>>13); - a=a-b; a=a-c; a=a^(c>>12); - b=b-c; b=b-a; b=b^(a<<16); - c=c-a; c=c-b; c=c^(b>>5); - a=a-b; a=a-c; a=a^(c>>3); - b=b-c; b=b-a; b=b^(a<<10); - c=c-a; c=c-b; c=c^(b>>15); - return c; - } - // robert jenkins 32-bit - // sucks - if (0) { - n += (n << 12); - n ^= 
(n >> 22); - n += (n << 4); - n ^= (n >> 9); - n += (n << 10); - n ^= (n >> 2); - n += (n << 7); - n ^= (n >> 12); - return n; - } - - // djb2 - if (0) { - unsigned int hash = 5381; - for (int i=0; i<4; i++) { - hash = ((hash << 5) + hash) + ((n&255) ^ 123); - n = n >> 8; - } - return hash; - } - - - // SDBM - if (1) { - unsigned int hash = 0; - - for(unsigned int i = 0; i < 4; i++) - { - hash = (n&255) + (hash << 6) + (hash << 16) - hash; - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - // PJW - // horrid - if (0) { - unsigned int BitsInUnsignedInt = (unsigned int)(sizeof(unsigned int) * 8); - unsigned int ThreeQuarters = (unsigned int)((BitsInUnsignedInt * 3) / 4); - unsigned int OneEighth = (unsigned int)(BitsInUnsignedInt / 8); - unsigned int HighBits = (unsigned int)(0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth); - unsigned int hash = 0; - unsigned int test = 0; - - for(unsigned int i = 0; i < 4; i++) - { - hash = (hash << OneEighth) + (n&255); - - if((test = hash & HighBits) != 0) - { - hash = (( hash ^ (test >> ThreeQuarters)) & (~HighBits)); - } - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - // RS Hash function, from Robert Sedgwicks Algorithms in C book, w/ some changes. - if (0) { - unsigned int b = 378551; - unsigned int a = 63689; - unsigned int hash = 0; - - for(unsigned int i=0; i<4; i++) - { - hash = hash * a + (n&0xff); - a = a * b; - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - // DJB - // worse than rs - if (0) { - unsigned int hash = 5381; - - for(unsigned int i = 0; i < 4; i++) - { - hash = ((hash << 5) + hash) + (n&255); - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - // AP - // even worse - if (1) { - unsigned int hash = 0; - - for(unsigned int i = 0; i < 4; i++) - { - hash ^= ((i & 1) == 0) ? 
( (hash << 7) ^ (n&255) ^ (hash >> 3)) : - (~((hash << 11) ^ (n&255) ^ (hash >> 5))); - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - -#endif diff --git a/branches/sage/ebofs2/crush.old/crush.h b/branches/sage/ebofs2/crush.old/crush.h deleted file mode 100644 index 376e7d9b3fc86..0000000000000 --- a/branches/sage/ebofs2/crush.old/crush.h +++ /dev/null @@ -1,543 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __crush_CRUSH_H -#define __crush_CRUSH_H - -#include -#include -#include -#include -#include -using std::set; -using std::map; -using std::vector; -using std::list; -#include -#include -using namespace __gnu_cxx; - - -#include "Bucket.h" - -#include "include/buffer.h" - - -namespace crush { - - - // *** RULES *** - - class RuleStep { - public: - int cmd; - vector args; - - RuleStep(int c) : cmd(c) {} - RuleStep(int c, int a) : cmd(c) { - args.push_back(a); - } - RuleStep(int c, int a, int b) : cmd(c) { - args.push_back(a); - args.push_back(b); - } - RuleStep(int o, int a, int b, int c) : cmd(o) { - args.push_back(a); - args.push_back(b); - args.push_back(c); - } - - void _encode(bufferlist& bl) { - bl.append((char*)&cmd, sizeof(cmd)); - ::_encode(args, bl); - } - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(cmd), (char*)&cmd); - off += sizeof(cmd); - ::_decode(args, bl, off); - } - }; - - - // Rule operations - const int CRUSH_RULE_TAKE = 0; - const int CRUSH_RULE_CHOOSE = 1; // first n by default - const int CRUSH_RULE_CHOOSE_FIRSTN = 1; - const int CRUSH_RULE_CHOOSE_INDEP = 2; - const int CRUSH_RULE_EMIT = 3; - - class Rule { - public: - vector< 
RuleStep > steps; - - void _encode(bufferlist& bl) { - int n = steps.size(); - bl.append((char*)&n, sizeof(n)); - for (int i=0; i buckets; - int bucketno; - Hash h; - - hash_map parent_map; // what bucket each leaf/bucket lives in - - public: - map rules; - - //map collisions; - //map bumps; - - void _encode(bufferlist& bl) { - // buckets - int n = buckets.size(); - bl.append((char*)&n, sizeof(n)); - for (map::const_iterator it = buckets.begin(); - it != buckets.end(); - it++) { - bl.append((char*)&it->first, sizeof(it->first)); - it->second->_encode(bl); - } - bl.append((char*)&bucketno, sizeof(bucketno)); - - // hash - int s = h.get_seed(); - bl.append((char*)&s, sizeof(s)); - - //::_encode(out, bl); - //::_encode(overload, bl); - - // rules - n = rules.size(); - bl.append((char*)&n, sizeof(n)); - for(map::iterator it = rules.begin(); - it != rules.end(); - it++) { - bl.append((char*)&it->first, sizeof(it->first)); - it->second._encode(bl); - } - - } - - void _decode(bufferlist& bl, int& off) { - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i::iterator bp = buckets.begin(); - bp != buckets.end(); - ++bp) { - // index bucket items - vector items; - bp->second->get_items(items); - for (vector::iterator ip = items.begin(); - ip != items.end(); - ++ip) - parent_map[*ip] = bp->first; - } - } - - - - public: - Crush(int seed=123) : bucketno(-1), h(seed) {} - ~Crush() { - // hose buckets - for (map::iterator it = buckets.begin(); - it != buckets.end(); - it++) { - delete it->second; - } - } - - int print(ostream& out, int root, int indent=0) { - for (int i=0; iget_weight() << "\t" << b->get_id() << "\t"; - for (int i=0; iget_bucket_type() << ": "; - - vector items; - b->get_items(items); - - if (buckets.count(items[0])) { - out << std::endl; - for (unsigned i=0; iset_id(n); - buckets[n] = b; - return n; - } - - void add_item(int parent, int item, float w, bool back=false) { - // add item - assert(!buckets[parent]->is_uniform()); - 
Bucket *p = buckets[parent]; - - p->add_item(item, w, back); - - // set item's parent - Bucket *n = buckets[item]; - if (n) - n->set_parent(parent); - - // update weights - while (buckets.count(p->get_parent())) { - int child = p->get_id(); - p = buckets[p->get_parent()]; - p->adjust_item_weight(child, w); - } - } - - - /* - this is a hack, fix me! weights should be consistent throughout hierarchy! - - */ - void set_bucket_weight(int item, float w) { - Bucket *b = buckets[item]; - float adj = w - b->get_weight(); - - while (buckets.count(b->get_parent())) { - Bucket *p = buckets[b->get_parent()]; - p->adjust_item_weight(b->get_id(), adj); - b = p; - } - } - - - /* - * choose numrep distinct items of type type - */ - void choose(int x, - int numrep, - int type, - Bucket *inbucket, - vector& outvec, - bool firstn, - set& outset, map& overloadmap, - bool forcefeed=false, - int forcefeedval=-1) { - int off = outvec.size(); - - // for each replica - for (int rep=0; repis_uniform()) { - // uniform bucket; be careful! - if (firstn || numrep >= in->get_size()) { - // uniform bucket is too small; just walk thru elements - r += ftotal; // r' = r + f_total (first n) - } else { - // make sure numrep is not a multple of bucket size - int add = numrep*flocal; // r' = r + n*f_local - if (in->get_size() % numrep == 0) { - add += add/in->get_size(); // shift seq once per pass through the bucket - } - r += add; - } - } else { - // mixed bucket; just make a distinct-ish r sequence - if (firstn) - r += ftotal; // r' = r + f_total - else - r += numrep * flocal; // r' = r + n*f_local - } - - // choose - outv = in->choose_r(x, r, h); - - // did we get the type we want? 
- int itemtype = 0; // 0 is terminal type - Bucket *newin = 0; // remember bucket we hit - if (in->is_uniform()) { - itemtype = ((UniformBucket*)in)->get_item_type(); - } else { - if (buckets.count(outv)) { // another bucket - newin = buckets[outv]; - itemtype = newin->get_type(); - } - } - if (itemtype == type) { // this is what we want! - // collision? - bool collide = false; - for (int prep=0; prep overloadmap[outv]) - bad = true; - } - - if (collide || bad) { - ftotal++; - flocal++; - - if (collide && flocal < 3) - continue; // try locally a few times! - - if (ftotal >= 10) { - // ok fine, just ignore dup. FIXME. - skip_rep = true; - break; - } - - retry_rep = true; - } - - break; // ok then! - } - - // next - in = newin; - } - - if (retry_rep) continue; // try again - - break; - } - - // skip this rep? (e.g. too many collisions, we give up) - if (skip_rep) continue; - - // output this value - outvec.push_back(outv); - } // for rep - - // double check! - if (0) { - for (unsigned i=1; i& result, - set& outset, map& overloadmap, - int forcefeed=-1) { - //int numresult = 0; - result.clear(); - - // determine hierarchical context for forcefeed (if any) - list force_stack; - if (forcefeed >= 0 && parent_map.count(forcefeed)) { - int t = forcefeed; - while (1) { - force_stack.push_front(t); - //cout << "push " << t << " onto force_stack" << std::endl; - if (parent_map.count(t) == 0) break; // reached root, presumably. - //cout << " " << t << " parent is " << parent_map[t] << std::endl; - t = parent_map[t]; - } - } - - // working vector - vector w; // working variable - - // go through each statement - for (vector::iterator pc = rule.steps.begin(); - pc != rule.steps.end(); - pc++) { - // move input? 
- - // do it - switch (pc->cmd) { - case CRUSH_RULE_TAKE: - { - const int arg = pc->args[0]; - //cout << "take " << arg << std::endl; - - if (!force_stack.empty()) { - assert(force_stack.front() == arg); - force_stack.pop_front(); - } - - w.clear(); - w.push_back(arg); - } - break; - - case CRUSH_RULE_CHOOSE_FIRSTN: - case CRUSH_RULE_CHOOSE_INDEP: - { - const bool firstn = pc->cmd == CRUSH_RULE_CHOOSE_FIRSTN; - const int numrep = pc->args[0]; - const int type = pc->args[1]; - - //cout << "choose " << numrep << " of type " << type << std::endl; - - assert(!w.empty()); - - // reset output - vector out; - - // forcefeeding? - bool forcing = false; - int forceval = -1; - if (!force_stack.empty()) { - forceval = force_stack.front(); - force_stack.pop_front(); - //cout << "priming out with " << forceval << std::endl; - forcing = true; - } else if (forcefeed >= 0 && type == 0) { - //cout << "forcing context-less " << forcefeed << std::endl; - forceval = forcefeed; - forcefeed = -1; - forcing = true; - } - - // do each row independently - for (vector::iterator i = w.begin(); - i != w.end(); - i++) { - assert(buckets.count(*i)); - Bucket *b = buckets[*i]; - choose(x, numrep, type, b, out, firstn, - outset, overloadmap, - forcing, - forceval); - forcing = false; // only once - } // for inrow - - // put back into w - w.swap(out); - out.clear(); - } - break; - - case CRUSH_RULE_EMIT: - { - for (unsigned i=0; i - -#include -#include -using namespace std; - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int n, float f, int buckettype) -{ - Hash 
h(73232313); - - // crush - Crush c; - - int ndisks = 0; - - // bucket - Bucket *b; - if (buckettype == 0) - b = new TreeBucket(1); - else if (buckettype == 1 || buckettype == 2) - b = new ListBucket(1); - else if (buckettype == 3) - b = new StrawBucket(1); - else if (buckettype == 4) - b = new UniformBucket(0,0); - - for (int i=0; iadd_item(ndisks++,1); - - c.add_bucket(b); - int root = b->get_id(); - - //c.print(cout,root); - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 1000; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - - // ORIGINAL - place(c, rule, numpg, numrep, placement1); - - int olddisks = ndisks; - - // add item - if (buckettype == 2) { - // start over! 
- ndisks = 0; - b = new ListBucket(1); - for (int i=0; i<=n; i++) - b->add_item(ndisks++,1); - c.add_bucket(b); - root = b->get_id(); - - rule.steps.clear(); - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - } - else - b->add_item(ndisks++, 1); - - - // ADDED - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; x<=numpg; x++) - if (placement1[x] != placement2[x]) - for (int j=0; j - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - //Bucket *b = new MixedBucket(h+1); - Bucket *b = new StrawBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -float go(int dep) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - if (0) { - for (int d=0; dadd_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 10000, disks); - Hash h(123); - b->make_primes(h); - root = c.add_bucket(b); - } - - - - // rule - int numrep = 1; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = 
pg_per*ndisks/numrep; - - vector ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 100000; - int times = place / numpg; - if (!times) times = 1; - - cout << "#looping " << times << " times" << endl; - - float tvar = 0; - int tvarnum = 0; - - int x = 0; - for (int t=0; t v(numrep); - - for (int z=0; z - -#include -#include -using namespace std; - -int buckettype = 0; - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - //Bucket *b = new TreeBucket(h+1); - //Bucket *b = new ListBucket(h+1); - //Bucket *b = new StrawBucket(h+1); - Bucket *b; - if (buckettype == 0) - b = new TreeBucket(h+1); - else if (buckettype == 1 || buckettype == 2) - b = new ListBucket(h+1); - else if (buckettype == 3) - b = new StrawBucket(h+1); - - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != 
ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks, int add, int modifydepth) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); - for (int d=1; d > buckets; - - root = make_hierarchy(c, wid, buckets, ndisks); - - //c.print(cout,root); - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - - // ORIGINAL - place(c, rule, numpg, numrep, placement1); - - int olddisks = ndisks; - - // add disks - //cout << " adding " << add << " disks" << endl; - vector disks; - for (int i=0; imake_primes(h); - - //Bucket *o = buckets[2].back(); - Bucket *o; - if (buckettype == 2) - o = buckets[modifydepth].front(); - else - o = buckets[modifydepth].back(); - - c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - c.add_item(o->get_id(), b->get_id(), b->get_weight()); - //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); - //newbucket = b; - - - // ADDED - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; x<=numpg; x++) - if (placement1[x] != placement2[x]) - for (int j=0; j - -#include -#include -using namespace std; - - -int buckettype = 2; // 0 = mixed, 1 = linear, 2 = straw - -int big_one_skip = 255; -int big_one_size; -Bucket *big_one = 0; - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, 
int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - - int s = wid[h]; - if (big_one_skip > 0) - big_one_skip--; - if (!big_one_skip && !big_one) - s = big_one_size; - - - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks " << disks.size()<< endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - Bucket *b; - if (buckettype == 0) - b = new TreeBucket(h+1); - else if (buckettype == 1) - b = new ListBucket(h+1); - else if (buckettype == 2) - b = new StrawBucket(h+1); - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks, int add) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); - for (int d=1; d > buckets; - - big_one_size = add; - big_one = 0; - - //cout << "making tree" << endl; - root = make_hierarchy(c, wid, buckets, ndisks); - - //c.print(cout, root); - - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - 
rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - int olddisks = ndisks; - - - place(c, rule, numpg, numrep, placement1); - - if (1) { - // remove disks - assert(big_one); - c.adjust_item(big_one->get_id(), 0); - } - - int newdisks = ndisks - add; - - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; x<=numpg; x++) - if (placement1[x] != placement2[x]) - for (int j=0; j >::iterator i = r.begin(); - i != r.end(); - i++) { - cout << i->first; - for (map::iterator j = i->second.begin(); - j != i->second.end(); - j++) - cout << "\t" << j->first << "\t" << j->second; - cout << endl; - } - */ -} - diff --git a/branches/sage/ebofs2/crush.old/test/cluster_movement_rush.cc b/branches/sage/ebofs2/crush.old/test/cluster_movement_rush.cc deleted file mode 100644 index 90cc197c24f65..0000000000000 --- a/branches/sage/ebofs2/crush.old/test/cluster_movement_rush.cc +++ /dev/null @@ -1,218 +0,0 @@ - - -#include "../crush.h" -using namespace crush; - -#include - -#include -#include -using namespace std; - -int buckettype = 0; - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - //Bucket *b = new TreeBucket(h+1); - //Bucket *b = new ListBucket(h+1); - //Bucket *b = new StrawBucket(h+1); - Bucket *b; - if (buckettype == 0) - b = 
new TreeBucket(h+1); - else if (buckettype == 1 || buckettype == 2) - b = new ListBucket(h+1); - else if (buckettype == 3) - b = new StrawBucket(h+1); - - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks, int add, int modifydepth) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); - for (int d=1; d > buckets; - - root = make_hierarchy(c, wid, buckets, ndisks); - - //c.print(cout,root); - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - - // ORIGINAL - place(c, rule, numpg, numrep, 
placement1); - - int olddisks = ndisks; - - // add disks - //cout << " adding " << add << " disks" << endl; - vector disks; - for (int i=0; imake_primes(h); - - //Bucket *o = buckets[2].back(); - Bucket *o; - if (buckettype == 2) - o = buckets[modifydepth].front(); - else - o = buckets[modifydepth].back(); - - c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - c.add_item(o->get_id(), b->get_id(), b->get_weight(), buckettype == 2); - //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); - //newbucket = b; - - - // ADDED - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; x<=numpg; x++) - if (placement1[x] != placement2[x]) - for (int j=0; j - -#include -#include -using namespace std; - - -Clock g_clock; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks weight " << w << endl; - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - - -float go(int dep, int failpc) -{ - Hash h(73232313); - - //int overloadcutoff = (int)((float)10000.0 / (float)utilization); - - //cout << "util " << utilization << " cutoff " << overloadcutoff << endl; - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - for (int d=0; d ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 1000000; - int times = place / 
numpg; - if (!times) times = 1; - - - //cout << "looping " << times << " times" << endl; - - float tavg[10]; - float tvar[10]; - for (int j=0;j<10;j++) { - tvar[j] = 0; - tavg[j] = 0; - } - int tvarnum = 0; - float trvar = 0.0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - float aslowdown = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.out.clear(); - - for (int z=0; z= ndisks) cout << "v[i] " << i << " is " << v[i] << " .. x = " << x << endl; - //assert(v[i] < ndisks); - ocount[v[i]]++; - } - } - utime_t t1b = g_clock.now(); - - // add in numf failed disks - for (int f = 0; f < numf; f++) { - int d = rand() % ndisks; - while (c.out.count(d)) d = rand() % ndisks; - c.out.insert(d); - } - - utime_t t3a = g_clock.now(); - for (int x=xs; x - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - MixedBucket *b = new MixedBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -Bucket *make_random(Crush& c, int wid, int height, int& ndisks) -{ - int w = rand() % (wid-1) + 2; - - if (height == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - int h = rand() % height + 1; - MixedBucket *b = new MixedBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " 
mixedbucket with " << wid[h] << endl; - return b; - } - -} - - -float go(int dep, int overloadcutoff) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - for (int d=0; dget_id(); - //c.print(cout, root); - } - if (0) { - MixedBucket *b = new MixedBucket(1); - for (int i=0; i<10000; i++) - b->add_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 10000, disks); - Hash h(123); - b->make_primes(h); - root = c.add_bucket(b); - } - //cout << ndisks << " disks" << endl; - - - - // rule - int numrep = 1; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 1000000; - int times = place / numpg; - if (!times) times = 1; - - - //cout << "looping " << times << " times" << endl; - - float tvar = 0; - int tvarnum = 0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.overload.clear(); - - for (int z=0; z overloadcutoff) - overloaded++; - - if (ocount[i] > 100+(overloadcutoff-100)/2) { - adjusted++; - c.overload[i] = 100.0 / (float)ocount[i]; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - //cout << overloaded << " overloaded" << endl; - overloadsum += (float)overloaded / (float)ndisks; - adjustsum += (float)adjusted / (float)ndisks; - - - for (int x=xs; x overloadcutoff) { - still++; - //c.overload[ocount[i]] = 
100.0 / (float)ocount[i]; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - } - //if (still) cout << "overload was " << overloaded << " now " << still << endl; - afteroverloadsum += (float)still / (float)ndisks; - - //cout << "collisions: " << c.collisions << endl; - //cout << "r bumps: " << c.bumps << endl; - - float avg = 0.0; - for (int i=0; i100; d -= 5) { - float var = go(3,d); - //cout << "## depth = " << d << endl; - //cout << d << "\t" << var << endl; - } -} diff --git a/branches/sage/ebofs2/crush.old/test/depth_variance.cc b/branches/sage/ebofs2/crush.old/test/depth_variance.cc deleted file mode 100644 index 7d60ebaae9501..0000000000000 --- a/branches/sage/ebofs2/crush.old/test/depth_variance.cc +++ /dev/null @@ -1,185 +0,0 @@ - - -#include "../crush.h" -using namespace crush; - -#include - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -float go(int dep) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - if (1) { - for (int d=0; d ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 100000; - int times = place / numpg; - if (!times) times = 1; - - cout << "#looping " << times << " times" << endl; - - 
float tvar = 0; - int tvarnum = 0; - float tavg = 0; - - int x = 0; - for (int t=0; t v(numrep); - - for (int z=0; z - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks weight " << w << endl; - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - //Bucket *b = new StrawBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - - -float go(int dep, int overloadcutoff) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - for (int d=0; d ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 100000; - int times = place / numpg; - if (!times) times = 1; - - - //cout << "looping " << times << " times" << endl; - - float tavg[10]; - float tvar[10]; - for (int j=0;j<10;j++) { - tvar[j] = 0; - tavg[j] = 0; - } - int tvarnum = 0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.overload.clear(); - - for (int z=0; z cutoff) - overloaded++; - - if (ocount[i] > adjoff) { - adjusted++; - c.overload[i] = (float)target / (float)ocount[i]; - //cout << "setting overload " << i << " to " << c.overload[i] << endl; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - //cout << overloaded << " overloaded" << endl; - overloadsum 
+= (float)overloaded / (float)ndisks; - adjustsum += (float)adjusted / (float)ndisks; - - - - if (1) { - // second pass - for (int x=xs; x= adjoff) { - adjusted++; - if (c.overload.count(i) == 0) { - c.overload[i] = 1.0; - adjusted++; - } - //else cout << "(re)adjusting " << i << endl; - c.overload[i] *= (float)target / (float)ocount[i]; - //cout << "setting overload " << i << " to " << c.overload[i] << endl; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - } - - for (int x=xs; x cutoff) { - still++; - //c.overload[ocount[i]] = 100.0 / (float)ocount[i]; - if (c.overload.count(i)) cout << "[adjusted] "; - cout << "disk " << i << " has " << ocount[i] << endl; - } - } - //if (still) cout << "overload was " << overloaded << " now " << still << endl; - afteroverloadsum += (float)still / (float)ndisks; - - //cout << "collisions: " << c.collisions << endl; - //cout << "r bumps: " << c.bumps << endl; - - int n = ndisks/10; - float avg[10]; - float var[10]; - for (int i=0;i<10;i++) { - int s = n*i; - avg[i] = 0.0; - for (int j=0; j100; d -= 5) { - float var = go(3,d); - //cout << "## depth = " << d << endl; - //cout << d << "\t" << var << endl; - } -} diff --git a/branches/sage/ebofs2/crush.old/test/movement.cc b/branches/sage/ebofs2/crush.old/test/movement.cc deleted file mode 100644 index 2621f09457fe6..0000000000000 --- a/branches/sage/ebofs2/crush.old/test/movement.cc +++ /dev/null @@ -1,223 +0,0 @@ - - -#include "../crush.h" -using namespace crush; - -#include - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - MixedBucket *b = new MixedBucket(h+1); - c.add_bucket(b); - for (int i=0; 
iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); - for (int d=1; d > buckets; - - if (1) { - root = make_hierarchy(c, wid, buckets, ndisks); - } - if (0) { - MixedBucket *b = new MixedBucket(1); - for (int i=0; i<10000; i++) - b->add_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 1, disks); - Hash h(123); - b->make_primes(h); - root = c.add_bucket(b); - } - - - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; 
- */ - map > placement1, placement2; - - //c.print(cout, root); - - place(c, rule, numpg, numrep, placement1); - - if (1) { - // failed - - //for (int i=500; i<1000; i++) - //c.failed.insert(i); - c.failed.insert(0); - } - - int olddisks = ndisks; - - if (1) { - int n = udisks; - //cout << " adding " << n << " disks" << endl; - vector disks; - for (int i=0; imake_primes(h); - Bucket *o = buckets[1].back(); - c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - c.add_item(o->get_id(), b->get_id(), b->get_weight()); - //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); - } - - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; x<=numpg; x++) { - if (placement1[x] != placement2[x]) { - for (int j=0; j v; - cout << depth; - for (int branching = 3; branching < 16; branching += 1) { - float fac = testmovement(depth, branching, udisks); - v.push_back(fac); - int n = udisks * pow((float)branching, (float)depth-1); - cout << "\t" << n; - cout << "\t" << fac; - } - //for (int i=0; i - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - MixedBucket *b = new MixedBucket(h+1); - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ 
- vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); - for (int d=1; d > buckets; - - if (1) { - root = make_hierarchy(c, wid, buckets, ndisks); - } - if (0) { - MixedBucket *b = new MixedBucket(1); - for (int i=0; i<10000; i++) - b->add_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 1, disks); - Hash h(123); - b->make_primes(h); - root = c.add_bucket(b); - } - - - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - place(c, rule, numpg, numrep, placement1); - - float over = .5; - - if (1) { - // failed - - //for (int i=500; i<1000; i++) - //c.failed.insert(i); - //c.failed.insert(0); - c.overload[0] = over; - } - - int olddisks = ndisks; - - - - if (0) { - int n = udisks; - //cout << " adding " << n << " disks" << endl; - vector disks; - for (int i=0; imake_primes(h); - Bucket *o = buckets[1].back(); - 
c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - c.add_item(o->get_id(), b->get_id(), b->get_weight()); - //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); - } - - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - vector moved(ndisks); - - //int moved = 0; - for (int d=0; d::iterator it = placement1[d].begin(); - it != placement1[d].end(); - it++) { - placement2[d].erase(*it); - } - } - - float avg = 0; - for (int d=0; d v; - cout << depth; - for (int branching = 3; branching < 16; branching += 1) { - float fac = testmovement(depth, branching, udisks); - v.push_back(fac); - int n = udisks * pow((float)branching, (float)depth-1); - //cout << "\t" << n; - //cout << "\t" << fac; - } - //for (int i=0; i - -#include -#include -using namespace std; - - -Clock g_clock; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks weight " << w << endl; - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - - -float go(int dep, int utilization ) -{ - Hash h(73232313); - - int overloadcutoff = (int)((float)10000.0 / (float)utilization); - - //cout << "util " << utilization << " cutoff " << overloadcutoff << endl; - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - for (int d=0; d ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int 
place = 100000; - int times = place / numpg; - if (!times) times = 1; - - - //cout << "looping " << times << " times" << endl; - - float tavg[10]; - float tvar[10]; - for (int j=0;j<10;j++) { - tvar[j] = 0; - tavg[j] = 0; - } - int tvarnum = 0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - float aslowdown = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.overload.clear(); - - for (int z=0; z cutoff) - overloaded++; - - if (ocount[i] > adjoff) { - adjusted++; - c.overload[i] = (float)target / (float)ocount[i]; - //cout << "setting overload " << i << " to " << c.overload[i] << endl; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - //cout << overloaded << " overloaded" << endl; - overloadsum += (float)overloaded / (float)ndisks; - adjustsum += (float)adjusted / (float)ndisks; - - - - // keep adjusting! - for (int bla=0; bla<5; bla++) { - utime_t t2a = g_clock.now(); - - // second pass - for (int x=xs; x= adjoff) { - numover++; - if (c.overload.count(i) == 0) { - c.overload[i] = 1.0; - adjusted++; - } - //else cout << "(re)adjusting " << i << endl; - c.overload[i] *= (float)target / (float)ocount[i]; - //cout << "setting overload " << i << " to " << c.overload[i] << endl; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - if (!numover) break; - cout << "readjusting" << endl; - } - - utime_t t3a = g_clock.now(); - - for (int x=xs; x cutoff) { - still++; - //c.overload[ocount[i]] = 100.0 / (float)ocount[i]; - if (c.overload.count(i)) cout << "[adjusted] "; - cout << "disk " << i << " has " << ocount[i] << endl; - } - } - //if (still) cout << "overload was " << overloaded << " now " << still << endl; - afteroverloadsum += (float)still / (float)ndisks; - - //cout << "collisions: " << c.collisions << endl; - //cout << "r bumps: " << c.bumps << endl; - - int n = ndisks/10; - float avg[10]; - float var[10]; - for (int i=0;i<10;i++) { - int s 
= n*i; - avg[i] = 0.0; - for (int j=0; j - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - MixedBucket *b = new MixedBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -Bucket *make_random(Crush& c, int wid, int height, int& ndisks) -{ - int w = rand() % (wid-1) + 2; - - if (height == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - int h = rand() % height + 1; - MixedBucket *b = new MixedBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } - -} - - -float go(int dep, int overloadcutoff) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - for (int d=0; dget_id(); - //c.print(cout, root); - } - if (0) { - MixedBucket *b = new MixedBucket(1); - for (int i=0; i<10000; i++) - b->add_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 10000, disks); - Hash h(123); - b->make_primes(h); - root = c.add_bucket(b); - } - //cout << ndisks << " disks" << endl; - - - - // rule - int numrep = 1; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - 
rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 1000000; - int times = place / numpg; - if (!times) times = 1; - - - //cout << "looping " << times << " times" << endl; - - float tvar = 0; - int tvarnum = 0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.overload.clear(); - - for (int z=0; z overloadcutoff) - overloaded++; - - if (ocount[i] > 100+(overloadcutoff-100)/2) { - adjusted++; - c.overload[i] = 100.0 / (float)ocount[i]; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - //cout << overloaded << " overloaded" << endl; - overloadsum += (float)overloaded / (float)ndisks; - adjustsum += (float)adjusted / (float)ndisks; - - - for (int x=xs; x overloadcutoff) { - still++; - //c.overload[ocount[i]] = 100.0 / (float)ocount[i]; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - } - //if (still) cout << "overload was " << overloaded << " now " << still << endl; - afteroverloadsum += (float)still / (float)ndisks; - - //cout << "collisions: " << c.collisions << endl; - //cout << "r bumps: " << c.bumps << endl; - - float avg = 0.0; - for (int i=0; i100; d -= 5) { - float var = go(3,d); - //cout << "## depth = " << d << endl; - //cout << d << "\t" << var << endl; - } -} diff --git a/branches/sage/ebofs2/crush.old/test/sizes.cc b/branches/sage/ebofs2/crush.old/test/sizes.cc deleted file mode 100644 index cc5780218210a..0000000000000 --- a/branches/sage/ebofs2/crush.old/test/sizes.cc +++ /dev/null @@ -1,131 +0,0 @@ - -#include "include/types.h" -#include 
"include/Distribution.h" -#include "osd/OSDMap.h" - - -Distribution file_size_distn; //kb - - -list object_queue; -int max_object_size = 1024*1024*100; //kb - -off_t no; - -int get_object() //kb -{ - if (object_queue.empty()) { - int max = file_size_distn.sample(); - no++; - int filesize = max/2 + (rand() % 100) * max/200 + 1; - //cout << "file " << filesize << endl; - while (filesize > max_object_size) { - object_queue.push_back(max_object_size); - filesize -= max_object_size; - } - object_queue.push_back(filesize); - } - int s = object_queue.front(); - object_queue.pop_front(); - //cout << "object " << s << endl; - return s; -} - -void getdist(vector& v, float& avg, float& var) -{ - avg = 0.0; - for (int i=0; i pgs(n); - off_t did = 0; - - no = 0; - while (did < dist) { - off_t s = get_object(); - pgs[rand()%n] += s; - did += s; - } - while (!object_queue.empty()) - pgs[rand()%n] += get_object(); - - numo = no; - //cout << did/n << endl; - - //for (int i=0; i - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, vector& ocount) -{ - vector v(numrep); - //map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" 
<< h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i wid; - wid.push_back(10); - wid.push_back(2); - - map< int, list > buckets; - root = make_hierarchy(c, wid, buckets, ndisks); - - // add small bucket - vector disks; - for (int i=0; i<3; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 1, disks); - b->make_primes(h); - Bucket *o = buckets[1].back(); - c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - c.add_item(o->get_id(), b->get_id(), b->get_weight()); - - - // rule - int numrep = 6; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - int pg_per = 10000; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - c.print(cout, root); - - place(c, rule, numpg, numrep, ocount); - - for (int i=0; i - -#include -#include -using namespace std; - - -int numrep = 1; - - -double go(int n, int bucket) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - Bucket *b; - vector items; - if (bucket == 0) b = new UniformBucket(1,0,10,items); - if (bucket == 1) b = new TreeBucket(1); - if (bucket == 2) b = new ListBucket(1); - if (bucket == 3) b = new StrawBucket(1); - - for (int d=0; dadd_item(ndisks++, 1); - - //if (!bucket) ((UniformBucket*)b)->make_primes(h); - - root = c.add_bucket(b); - - // rule - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - - int place = 1000000; - - - vector v(numrep); - set out; - map overload; - - utime_t start = g_clock.now(); - - for (int x=1; x <= place; x++) - c.do_rule(rule, x, v, out, overload); - - utime_t end = 
g_clock.now(); - - end -= start; - double el = (double)end; - - //cout << "\t" << ndisks; - - return el; -} - - -int main() -{ - - for (int n=4; n<=50; n += 4) { - cout << n; - for (int b=0; b<4; b++) { - double el = go(n,b); - cout << "\t" << el; - } - cout << endl; - } -} diff --git a/branches/sage/ebofs2/crush.old/test/speed_depth.cc b/branches/sage/ebofs2/crush.old/test/speed_depth.cc deleted file mode 100644 index 32275d16d2b31..0000000000000 --- a/branches/sage/ebofs2/crush.old/test/speed_depth.cc +++ /dev/null @@ -1,174 +0,0 @@ - -#include "../../common/Clock.h" -#include "../crush.h" -using namespace crush; - - -Clock g_clock; - -#include - -#include -#include -using namespace std; - - -int uniform = 10; -int branching = 10; -int buckettype = 0; -int numrep = 1; - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - Bucket *b; - if (buckettype == 0) - b = new TreeBucket(h+1); - else if (buckettype == 1 || buckettype == 2) - b = new ListBucket(h+1); - else if (buckettype == 3) - b = new StrawBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -double go(int dep, int per) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - if (1) { - wid.push_back(uniform); - for (int d=1; d v(numrep); - - utime_t start = g_clock.now(); - - set out; - map overload; - - for (int x=1; x <= place; x++) - c.do_rule(rule, x, v, out, overload); - - utime_t end = g_clock.now(); - - end -= start; - double el = (double)end; - - 
//cout << "\t" << ndisks; - - return el; -} - - -int main() -{ - uniform = branching = 8; - - cout << "// dep\tuniform\tbranch\tndisks" << endl; - - for (int d=2; d<=5; d++) { - cout << d;// << "\t" << branching; - cout << "\t" << uniform; - cout << "\t" << branching; - - int n = 1; - for (int i=0; i - -#include -#include -using namespace std; - - -int branching = 10; -bool linear = false; -int numrep = 1; - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - Bucket *b; - if (linear) - b = new ListBucket(h+1); - else - b = new TreeBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -double go(int s) -{ - int dep = 2; - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - if (1) { - //for (int d=0; d v(numrep); - - utime_t start = g_clock.now(); - - for (int x=1; x <= place; x++) - c.do_rule(rule, x, v); - - utime_t end = g_clock.now(); - - end -= start; - double el = (double)end; - - cout << "\t" << ndisks; - - return el; -} - - -int main() -{ - branching = 8; - - int d = 2; - numrep = 2; - - for (int s = 64; s <= 32768; s *= 8) { - cout << "t"; - linear = false; - double el = go(s, d); - cout << "\t" << el; - - cout << "\tp"; - linear = true; - el = go(s, d); - cout << "\t" << el; - - cout << endl; - } -} diff --git a/branches/sage/ebofs2/crush.old/test/t.cc b/branches/sage/ebofs2/crush.old/test/t.cc deleted file mode 100644 index 0785ef47d6c04..0000000000000 --- a/branches/sage/ebofs2/crush.old/test/t.cc +++ 
/dev/null @@ -1,25 +0,0 @@ - -#include "../../common/Clock.h" -#include "../crush.h" -using namespace crush; - - -Clock g_clock; - -#include - -#include -#include -using namespace std; - - -int branching = 10; -bool linear = false; -int numrep = 1; - -int main() { - - Bucket *b = new UniformBucket(1, 0); - //b = new TreeBucket(1); -} - diff --git a/branches/sage/ebofs2/crush.old/test/testbucket.cc b/branches/sage/ebofs2/crush.old/test/testbucket.cc deleted file mode 100644 index 065721c2c1967..0000000000000 --- a/branches/sage/ebofs2/crush.old/test/testbucket.cc +++ /dev/null @@ -1,61 +0,0 @@ - - -#include "../Bucket.h" -using namespace crush; - -#include -#include -using namespace std; - - -ostream& operator<<(ostream& out, vector& v) -{ - out << "["; - for (int i=0; i ocount(ndisks); - - vector v(numrep); - int nplace = 0; - for (int x=1; x<1000000; x++) { - //cout << H(x) << "\t" << h(x) << endl; - for (int i=0; i -#include -using namespace std; - - -void getdist(vector& v, float& avg, float& var) -{ - avg = 0.0; - for (int i=0; i a(n); - vector b(n); - - for (int i=0; i c(n); - for (int i=0; i -#include - -class CrushWrapper { -public: - struct crush_map *map; - - CrushWrapper() : map(0) {} - ~CrushWrapper() { - if (map) crush_destroy(map); - } - - void create() { - if (map) crush_destroy(map); - map = crush_create(); - } - void finalize() { - assert(map); - crush_finalize(map); - } - - void update_offload_map(std::set& out_osds, - std::map& overload_osds) { - for (int i=0; imax_devices; i++) { - if (out_osds.count(i)) - map->device_offload[i] = 0x10000; - else if (overload_osds.count(i)) - map->device_offload[i] = (int)(0x10000 * overload_osds[i]); // FIXME: reverse? - else - map->device_offload[i] = 0; // normal. 
- } - } - - void do_rule(int rule, int x, vector& out, int maxout, int forcefeed) { - int rawout[maxout]; - - int numrep = crush_do_rule(map, rule, x, rawout, maxout, forcefeed); - - out.resize(numrep); - for (int i=0; imax_buckets, bl); - ::_encode_simple(map->max_rules, bl); - ::_encode_simple(map->max_devices, bl); - - // simple arrays - bl.append((char*)map->device_offload, sizeof(map->device_offload[0]) * map->max_devices); - - // buckets - for (unsigned i=0; imax_buckets; i++) { - __u32 type = 0; - if (map->buckets[i]) type = map->buckets[i]->bucket_type; - ::_encode_simple(type, bl); - if (!type) continue; - - ::_encode_simple(map->buckets[i]->id, bl); - ::_encode_simple(map->buckets[i]->type, bl); - ::_encode_simple(map->buckets[i]->bucket_type, bl); - ::_encode_simple(map->buckets[i]->weight, bl); - ::_encode_simple(map->buckets[i]->size, bl); - for (unsigned j=0; jbuckets[i]->size; j++) - ::_encode_simple(map->buckets[i]->items[j], bl); - - switch (map->buckets[i]->type) { - case CRUSH_BUCKET_UNIFORM: - for (unsigned j=0; jbuckets[i]->size; j++) - ::_encode_simple(((crush_bucket_uniform*)map->buckets[i])->primes[j], bl); - ::_encode_simple(((crush_bucket_uniform*)map->buckets[i])->item_weight, bl); - break; - - case CRUSH_BUCKET_LIST: - for (unsigned j=0; jbuckets[i]->size; j++) { - ::_encode_simple(((crush_bucket_list*)map->buckets[i])->item_weights[j], bl); - ::_encode_simple(((crush_bucket_list*)map->buckets[i])->sum_weights[j], bl); - } - break; - - case CRUSH_BUCKET_TREE: - for (unsigned j=0; jbuckets[i]->size; j++) - ::_encode_simple(((crush_bucket_tree*)map->buckets[i])->node_weights[j], bl); - break; - - case CRUSH_BUCKET_STRAW: - for (unsigned j=0; jbuckets[i]->size; j++) - ::_encode_simple(((crush_bucket_straw*)map->buckets[i])->straws[j], bl); - break; - } - } - - // rules - for (unsigned i=0; imax_rules; i++) { - __u32 yes = map->rules[i] ? 
1:0; - ::_encode_simple(yes, bl); - if (!yes) continue; - - ::_encode_simple(map->rules[i]->len, bl); - for (unsigned j=0; jrules[i]->len; j++) - ::_encode_simple(map->rules[i]->steps[j], bl); - } - } - - void _decode(bufferlist::iterator &blp) { - create(); - ::_decode_simple(map->max_buckets, blp); - ::_decode_simple(map->max_rules, blp); - ::_decode_simple(map->max_devices, blp); - - map->device_offload = (__u32*)malloc(sizeof(map->device_offload[0])*map->max_devices); - blp.copy(sizeof(map->device_offload[0])*map->max_devices, (char*)map->device_offload); - - // buckets - map->buckets = (crush_bucket**)malloc(sizeof(crush_bucket*)*map->max_buckets); - for (unsigned i=0; imax_buckets; i++) { - __u32 type; - ::_decode_simple(type, blp); - if (!type) { - map->buckets[i] = 0; - continue; - } - - int size = 0; - switch (type) { - case CRUSH_BUCKET_UNIFORM: - size = sizeof(crush_bucket_uniform); - break; - case CRUSH_BUCKET_LIST: - size = sizeof(crush_bucket_list); - break; - case CRUSH_BUCKET_TREE: - size = sizeof(crush_bucket_tree); - break; - case CRUSH_BUCKET_STRAW: - size = sizeof(crush_bucket_straw); - break; - default: - assert(0); - } - map->buckets[i] = (crush_bucket*)malloc(size); - memset(map->buckets[i], 0, size); - - ::_decode_simple(map->buckets[i]->id, blp); - ::_decode_simple(map->buckets[i]->type, blp); - ::_decode_simple(map->buckets[i]->bucket_type, blp); - ::_decode_simple(map->buckets[i]->weight, blp); - ::_decode_simple(map->buckets[i]->size, blp); - - map->buckets[i]->items = (__s32*)malloc(sizeof(__s32)*map->buckets[i]->size); - for (unsigned j=0; jbuckets[i]->size; j++) - ::_decode_simple(map->buckets[i]->items[j], blp); - - switch (map->buckets[i]->type) { - case CRUSH_BUCKET_UNIFORM: - ((crush_bucket_uniform*)map->buckets[i])->primes = - (__u32*)malloc(map->buckets[i]->size * sizeof(__u32)); - for (unsigned j=0; jbuckets[i]->size; j++) - ::_decode_simple(((crush_bucket_uniform*)map->buckets[i])->primes[j], blp); - 
::_decode_simple(((crush_bucket_uniform*)map->buckets[i])->item_weight, blp); - break; - - case CRUSH_BUCKET_LIST: - ((crush_bucket_list*)map->buckets[i])->item_weights = - (__u32*)malloc(map->buckets[i]->size * sizeof(__u32)); - ((crush_bucket_list*)map->buckets[i])->sum_weights = - (__u32*)malloc(map->buckets[i]->size * sizeof(__u32)); - - for (unsigned j=0; jbuckets[i]->size; j++) { - ::_decode_simple(((crush_bucket_list*)map->buckets[i])->item_weights[j], blp); - ::_decode_simple(((crush_bucket_list*)map->buckets[i])->sum_weights[j], blp); - } - break; - - case CRUSH_BUCKET_TREE: - ((crush_bucket_tree*)map->buckets[i])->node_weights = - (__u32*)malloc(map->buckets[i]->size * sizeof(__u32)); - for (unsigned j=0; jbuckets[i]->size; j++) - ::_decode_simple(((crush_bucket_tree*)map->buckets[i])->node_weights[j], blp); - break; - - case CRUSH_BUCKET_STRAW: - ((crush_bucket_straw*)map->buckets[i])->straws = - (__u32*)malloc(map->buckets[i]->size * sizeof(__u32)); - for (unsigned j=0; jbuckets[i]->size; j++) - ::_decode_simple(((crush_bucket_straw*)map->buckets[i])->straws[j], blp); - break; - } - } - - // rules - map->rules = (crush_rule**)malloc(sizeof(crush_rule*)*map->max_rules); - for (unsigned i=0; imax_rules; i++) { - __u32 yes; - ::_decode_simple(yes, blp); - if (!yes) { - map->rules[i] = 0; - continue; - } - - map->rules[i] = (crush_rule*)malloc(sizeof(crush_rule)); - memset(map->rules[i], 0, sizeof(crush_rule)); - - ::_decode_simple(map->rules[i]->len, blp); - map->rules[i]->steps = (crush_rule_step*)malloc(sizeof(crush_rule_step) * map->rules[i]->len); - for (unsigned j=0; jrules[i]->len; j++) - ::_decode_simple(map->rules[i]->steps[j], blp); - } - - finalize(); - } -}; - -#endif diff --git a/branches/sage/ebofs2/crush/Makefile b/branches/sage/ebofs2/crush/Makefile deleted file mode 100644 index 72d1b676bdb32..0000000000000 --- a/branches/sage/ebofs2/crush/Makefile +++ /dev/null @@ -1,30 +0,0 @@ - -CC = gcc -CFLAGS = -Wall -CFLAGS += -g -CFLAGS += -O3 -LD = 
ld -RM = rm - -all: depend libcrush.o test - -clean: - rm -f *.o libcrush.o - -%.o: %.c - ${CC} ${CFLAGS} -c $< -o $@ - -libcrush.o: builder.o crush.o mapper.o - $(LD) -i -o $@ $^ - -test: test.c libcrush.o - $(CC) ${CFLAGS} -lm $^ -o $@ - -.depend: - touch .depend - -depend: - $(RM) .depend - makedepend -f- -- $(CFLAGS) -- *.c > .depend 2>/dev/null - -include .depend diff --git a/branches/sage/ebofs2/crush/buckets.c b/branches/sage/ebofs2/crush/buckets.c deleted file mode 100644 index 72441d19792bb..0000000000000 --- a/branches/sage/ebofs2/crush/buckets.c +++ /dev/null @@ -1,6 +0,0 @@ - -#include "crush.h" -#include "hash.h" - -int - diff --git a/branches/sage/ebofs2/crush/builder.c b/branches/sage/ebofs2/crush/builder.c deleted file mode 100644 index a430dbd5c6284..0000000000000 --- a/branches/sage/ebofs2/crush/builder.c +++ /dev/null @@ -1,375 +0,0 @@ - -#include -#include -#include -#include - -#include "builder.h" -#include "hash.h" - -struct crush_map *crush_create() -{ - struct crush_map *m; - m = malloc(sizeof(*m)); - memset(m, 0, sizeof(*m)); - return m; -} - -/* - * finalize should be called _after_ all buckets are added to the map. 
- */ -void crush_finalize(struct crush_map *map) -{ - int b, i, c; - - /* calc max_devices */ - for (b=0; bmax_buckets; b++) { - if (map->buckets[b] == 0) continue; - for (i=0; ibuckets[b]->size; i++) - if (map->buckets[b]->items[i] >= map->max_devices) - map->max_devices = map->buckets[b]->items[i] + 1; - } - - /* allocate arrays */ - map->device_parents = malloc(sizeof(map->device_parents[0]) * map->max_devices); - memset(map->device_parents, 0, sizeof(map->device_parents[0]) * map->max_devices); - map->bucket_parents = malloc(sizeof(map->bucket_parents[0]) * map->max_buckets); - memset(map->bucket_parents, 0, sizeof(map->bucket_parents[0]) * map->max_buckets); - - /* build parent maps */ - for (b=0; bmax_buckets; b++) { - if (map->buckets[b] == 0) continue; - for (i=0; ibuckets[b]->size; i++) { - c = map->buckets[b]->items[i]; - BUG_ON(c >= map->max_devices); - if (c >= 0) - map->device_parents[c] = map->buckets[b]->id; - else - map->bucket_parents[-1-c] = map->buckets[b]->id; - } - } - - /* new device offload map? 
*/ - if (!map->device_offload) { - map->device_offload = malloc(sizeof(map->device_offload[0]) * map->max_devices); - memset(map->device_offload, 0, sizeof(map->device_offload[0]) * map->max_devices); - } -} - - - - - -/** rules **/ - -int crush_add_rule(struct crush_map *map, - int ruleno, - struct crush_rule *rule) -{ - int oldsize; - - if (ruleno < 0) { - for (ruleno=0; ruleno < map->max_rules; ruleno++) - if (map->rules[ruleno] == 0) break; - } - if (ruleno >= map->max_rules) { - /* expand array */ - oldsize = map->max_rules; - map->max_rules = ruleno+1; - map->rules = realloc(map->rules, map->max_rules * sizeof(map->rules[0])); - memset(map->rules + oldsize, 0, (map->max_rules-oldsize) * sizeof(map->rules[0])); - } - - /* add it */ - map->rules[ruleno] = rule; - return ruleno; -} - -struct crush_rule *crush_make_rule() -{ - struct crush_rule *rule; - - rule = malloc(sizeof(struct crush_rule)); - memset(rule, 0, sizeof(*rule)); - return rule; -} - -void crush_rule_add_step(struct crush_rule *rule, int op, int arg1, int arg2) -{ - rule->len++; - if (rule->steps) - rule->steps = realloc(rule->steps, sizeof(rule->steps[0])*rule->len); - else - rule->steps = malloc(sizeof(rule->steps[0])*rule->len); - rule->steps[rule->len-1].op = op; - rule->steps[rule->len-1].arg1 = arg1; - rule->steps[rule->len-1].arg2 = arg2; -} - - -/** buckets **/ - -int crush_add_bucket(struct crush_map *map, - struct crush_bucket *bucket) -{ - int id; - int oldsize; - - /* find a bucket id */ - for (id=0; id < map->max_buckets; id++) - if (map->buckets[id] == 0) break; - if (id == map->max_buckets) { - /* expand array */ - oldsize = map->max_buckets; - if (map->max_buckets) - map->max_buckets *= 2; - else - map->max_buckets = 8; - map->buckets = realloc(map->buckets, map->max_buckets * sizeof(map->buckets[0])); - memset(map->buckets + oldsize, 0, (map->max_buckets-oldsize) * sizeof(map->buckets[0])); - } - - /* add it */ - bucket->id = -1 - id; - map->buckets[id] = bucket; - return -1 - id; 
-} - - -/* uniform bucket */ - -struct crush_bucket_uniform * -crush_make_uniform_bucket(int type, int size, - int *items, - int item_weight) -{ - int i, j, x; - struct crush_bucket_uniform *bucket; - - bucket = malloc(sizeof(*bucket)); - memset(bucket, 0, sizeof(*bucket)); - bucket->h.bucket_type = CRUSH_BUCKET_UNIFORM; - bucket->h.type = type; - bucket->h.size = size; - bucket->h.weight = size * item_weight; - - bucket->item_weight = item_weight; - - bucket->h.items = malloc(sizeof(__u32)*size); - for (i=0; ih.items[i] = items[i]; - - /* generate some primes */ - bucket->primes = malloc(sizeof(__u32)*size); - - x = size + 1; - x += crush_hash32(size) % (3*size); /* make it big */ - x |= 1; /* and odd */ - - i=0; - while (i < size) { - for (j=2; j*j <= x; j++) - if (x % j == 0) break; - if (j*j > x) - bucket->primes[i++] = x; - x += 2; - } - - return bucket; -} - - -/* list bucket */ - -struct crush_bucket_list* -crush_make_list_bucket(int type, int size, - int *items, - int *weights) -{ - int i; - int w; - struct crush_bucket_list *bucket; - - bucket = malloc(sizeof(*bucket)); - memset(bucket, 0, sizeof(*bucket)); - bucket->h.bucket_type = CRUSH_BUCKET_LIST; - bucket->h.type = type; - bucket->h.size = size; - - bucket->h.items = malloc(sizeof(__u32)*size); - bucket->item_weights = malloc(sizeof(__u32)*size); - bucket->sum_weights = malloc(sizeof(__u32)*size); - w = 0; - for (i=size-1; i>=0; i--) { - bucket->h.items[i] = items[i]; - bucket->item_weights[i] = weights[i]; - w += weights[i]; - bucket->sum_weights[i] = w; - /*printf("%d item %d weight %d sum %d\n", - i, items[i], weights[i], bucket->sum_weights[i]);*/ - } - - bucket->h.weight = w; - - return bucket; -} - - -/* tree bucket */ - -static int height(int n) { - int h = 0; - while ((n & 1) == 0) { - h++; - n = n >> 1; - } - return h; -} -static int on_right(int n, int h) { - return n & (1 << (h+1)); -} -static int parent(int n) -{ - int h = height(n); - if (on_right(n, h)) - return n - (1<h.bucket_type = 
CRUSH_BUCKET_TREE; - bucket->h.type = type; - bucket->h.size = size; - - /* calc tree depth */ - depth = 1; - t = size - 1; - while (t) { - t = t >> 1; - depth++; - } - bucket->h.size = 1 << depth; - - bucket->h.items = malloc(sizeof(__u32)*bucket->h.size); - bucket->node_weights = malloc(sizeof(__u32)*bucket->h.size); - - memset(bucket->h.items, 0, sizeof(__u32)*bucket->h.size); - memset(bucket->node_weights, 0, sizeof(__u32)*bucket->h.size); - - for (i=0; ih.items[node] = items[i]; - bucket->node_weights[node] = weights[i]; - bucket->h.weight += weights[i]; - for (j=1; jnode_weights[node] += weights[i]; - } - } - BUG_ON(bucket->node_weights[bucket->h.size/2] != bucket->h.weight); - - return bucket; -} - - -/* straw bucket */ - -struct crush_bucket_straw * -crush_make_straw_bucket(int type, - int size, - int *items, - int *weights) -{ - struct crush_bucket_straw *bucket; - int *reverse; - int i, j, k; - - double straw, wbelow, lastw, wnext, pbelow; - int numleft; - - bucket = malloc(sizeof(*bucket)); - memset(bucket, 0, sizeof(*bucket)); - bucket->h.bucket_type = CRUSH_BUCKET_STRAW; - bucket->h.type = type; - bucket->h.size = size; - - bucket->h.items = malloc(sizeof(__u32)*size); - bucket->straws = malloc(sizeof(__u32)*size); - - bucket->h.weight = 0; - for (i=0; ih.items[i] = items[i]; - bucket->h.weight += weights[i]; - } - - /* reverse sort by weight (simple insertion sort) */ - reverse = malloc(sizeof(int) * size); - reverse[0] = 0; - for (i=1; ij; k--) - reverse[k] = reverse[k-1]; - reverse[j] = i; - break; - } - } - if (j == i) - reverse[i] = i; - } - - numleft = size; - straw = 1.0; - wbelow = 0; - lastw = 0; - - i=0; - while (i < size) { - /* set this item's straw */ - bucket->straws[reverse[i]] = straw * 0x10000; - /*printf("item %d at %d weight %d straw %d (%lf)\n", - items[reverse[i]], - reverse[i], weights[reverse[i]], bucket->straws[reverse[i]], straw);*/ - i++; - if (i == size) break; - - /* same weight as previous? 
*/ - if (weights[reverse[i]] == weights[reverse[i-1]]) { - /*printf("same as previous\n");*/ - continue; - } - - /* adjust straw for next guy */ - wbelow += ((double)weights[reverse[i-1]] - lastw) * numleft; - for (j=i; j -#endif - -#include "crush.h" - -void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) -{ - free(b->primes); - free(b->h.items); - free(b); -} - -void crush_destroy_bucket_list(struct crush_bucket_list *b) -{ - free(b->item_weights); - free(b->sum_weights); - free(b->h.items); - free(b); -} - -void crush_destroy_bucket_tree(struct crush_bucket_tree *b) -{ - free(b->node_weights); - free(b); -} - -void crush_destroy_bucket_straw(struct crush_bucket_straw *b) -{ - free(b->straws); - free(b->h.items); - free(b); -} - - -/* - * deallocate - */ -void crush_destroy(struct crush_map *map) -{ - int b; - - /* buckets */ - for (b=0; bmax_buckets; b++) { - if (map->buckets[b] == 0) continue; - switch (map->buckets[b]->type) { - case CRUSH_BUCKET_UNIFORM: - crush_destroy_bucket_uniform((struct crush_bucket_uniform*)map->buckets[b]); - break; - case CRUSH_BUCKET_LIST: - crush_destroy_bucket_list((struct crush_bucket_list*)map->buckets[b]); - break; - case CRUSH_BUCKET_TREE: - crush_destroy_bucket_tree((struct crush_bucket_tree*)map->buckets[b]); - break; - case CRUSH_BUCKET_STRAW: - crush_destroy_bucket_straw((struct crush_bucket_straw*)map->buckets[b]); - break; - } - } - free(map->buckets); - - /* rules */ - for (b=0; bmax_rules; b++) { - if (map->rules[b] == 0) continue; - if (map->rules[b]->steps) - free(map->rules[b]->steps); - free(map->rules[b]); - } - free(map->rules); - - free(map->bucket_parents); - free(map->device_parents); - free(map->device_offload); - free(map); -} - - diff --git a/branches/sage/ebofs2/crush/crush.h b/branches/sage/ebofs2/crush/crush.h deleted file mode 100644 index 5cf6cff498f13..0000000000000 --- a/branches/sage/ebofs2/crush/crush.h +++ /dev/null @@ -1,117 +0,0 @@ -#ifndef _CRUSH_CRUSH_H -#define _CRUSH_CRUSH_H - 
-#ifdef __cplusplus -extern "C" { -#endif - -#include /* just for int types */ - -#ifndef BUG_ON -# include -# define BUG_ON(x) assert(!(x)) -#endif - - -/*** RULES ***/ -enum { - CRUSH_RULE_TAKE, - CRUSH_RULE_CHOOSE_FIRSTN, - CRUSH_RULE_CHOOSE_INDEP, - CRUSH_RULE_EMIT -}; - -#define CRUSH_MAX_DEPTH 10 -#define CRUSH_MAX_SET 10 - -struct crush_rule_step { - __u32 op; - __s32 arg1; - __s32 arg2; -}; - -struct crush_rule { - __u32 len; - struct crush_rule_step *steps; -}; - - - -/*** BUCKETS ***/ - -enum { - CRUSH_BUCKET_UNIFORM = 1, - CRUSH_BUCKET_LIST = 2, - CRUSH_BUCKET_TREE = 3, - CRUSH_BUCKET_STRAW = 4 -}; - -struct crush_bucket { - __s32 id; /* this'll be negative */ - __u16 type; - __u16 bucket_type; - __u32 weight; /* 16-bit fixed point */ - __u32 size; /* num items */ - __s32 *items; -}; - -struct crush_bucket_uniform { - struct crush_bucket h; - __u32 *primes; - __u32 item_weight; /* 16-bit fixed point */ -}; - -struct crush_bucket_list { - struct crush_bucket h; - __u32 *item_weights; /* 16-bit fixed point */ - __u32 *sum_weights; /* 16-bit fixed point. element i is sum of weights 0..i, inclusive */ -}; - -struct crush_bucket_tree { - struct crush_bucket h; /* note: h.size is tree size, not number of actual items */ - __u32 *node_weights; -}; - -struct crush_bucket_straw { - struct crush_bucket h; - __u32 *straws; /* 16-bit fixed point */ -}; - - - -/*** CRUSH ***/ - -struct crush_map { - struct crush_bucket **buckets; - struct crush_rule **rules; - - /* parent pointers */ - __u32 *bucket_parents; - __u32 *device_parents; - - /* offload - * size max_devices, values 0...0xffff - * 0 == normal - * 0x10000 == 100% offload (i.e. 
failed) - */ - __u32 *device_offload; - - __u32 max_buckets; - __u32 max_rules; - __s32 max_devices; -}; - - -/* common destructors */ -extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *); -extern void crush_destroy_bucket_list(struct crush_bucket_list *); -extern void crush_destroy_bucket_tree(struct crush_bucket_tree *); -extern void crush_destroy_bucket_straw(struct crush_bucket_straw *); -extern void crush_destroy(struct crush_map *map); - - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/branches/sage/ebofs2/crush/hash.h b/branches/sage/ebofs2/crush/hash.h deleted file mode 100644 index 1ff4cca7f7b89..0000000000000 --- a/branches/sage/ebofs2/crush/hash.h +++ /dev/null @@ -1,80 +0,0 @@ -#ifndef _CRUSH_HASH_H -#define _CRUSH_HASH_H - -// Robert Jenkins' function for mixing 32-bit values -// http://burtleburtle.net/bob/hash/evahash.html -// a, b = random bits, c = input and output -#define hashmix(a,b,c) \ - a=a-b; a=a-c; a=a^(c>>13); \ - b=b-c; b=b-a; b=b^(a<<8); \ - c=c-a; c=c-b; c=c^(b>>13); \ - a=a-b; a=a-c; a=a^(c>>12); \ - b=b-c; b=b-a; b=b^(a<<16); \ - c=c-a; c=c-b; c=c^(b>>5); \ - a=a-b; a=a-c; a=a^(c>>3); \ - b=b-c; b=b-a; b=b^(a<<10); \ - c=c-a; c=c-b; c=c^(b>>15); - -#define crush_hash_seed 1315423911 - -static __inline__ unsigned crush_hash32(unsigned a) { - unsigned hash = crush_hash_seed ^ a; - unsigned b = a; - unsigned x = 231232; - unsigned y = 1232; - hashmix(b, x, hash); - hashmix(y, a, hash); - return (hash & 0xFFFFFFFF); -} - -static __inline__ unsigned crush_hash32_2(unsigned a, unsigned b) { - unsigned hash = crush_hash_seed ^ a ^ b; - unsigned x = 231232; - unsigned y = 1232; - hashmix(a, b, hash); - hashmix(x, a, hash); - hashmix(b, y, hash); - return (hash & 0xFFFFFFFF); -} - -static __inline__ unsigned crush_hash32_3(unsigned a, unsigned b, unsigned c) { - unsigned int hash = crush_hash_seed ^ a ^ b ^ c; - unsigned x = 231232; - unsigned y = 1232; - hashmix(a, b, hash); - hashmix(c, x, hash); - hashmix(y, a, 
hash); - hashmix(b, x, hash); - hashmix(y, c, hash); - return (hash & 0xFFFFFFFF); -} - -static __inline__ unsigned crush_hash32_4(unsigned a, unsigned b, unsigned c, unsigned d) { - unsigned int hash = crush_hash_seed ^a ^ b ^ c ^ d; - unsigned x = 231232; - unsigned y = 1232; - hashmix(a, b, hash); - hashmix(c, d, hash); - hashmix(a, x, hash); - hashmix(y, b, hash); - hashmix(c, x, hash); - hashmix(y, d, hash); - return (hash & 0xFFFFFFFF); -} - -static __inline__ unsigned crush_hash32_5(unsigned a, unsigned b, unsigned c, unsigned d, unsigned e) { - unsigned int hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e; - unsigned x = 231232; - unsigned y = 1232; - hashmix(a, b, hash); - hashmix(c, d, hash); - hashmix(e, x, hash); - hashmix(y, a, hash); - hashmix(b, x, hash); - hashmix(y, c, hash); - hashmix(d, x, hash); - hashmix(y, e, hash); - return (hash & 0xFFFFFFFF); -} - -#endif diff --git a/branches/sage/ebofs2/crush/mapper.c b/branches/sage/ebofs2/crush/mapper.c deleted file mode 100644 index e0a71f85631ff..0000000000000 --- a/branches/sage/ebofs2/crush/mapper.c +++ /dev/null @@ -1,351 +0,0 @@ - -#include "crush.h" -#include "hash.h" - -#include -#include - -/** bucket choose methods **/ - -/* uniform */ - -static int -crush_bucket_uniform_choose(struct crush_bucket_uniform *bucket, int x, int r) -{ - unsigned o, p, s; - o = crush_hash32_2(x, bucket->h.id) & 0xffff; - p = bucket->primes[crush_hash32_2(bucket->h.id, x) % bucket->h.size]; - s = (x + o + (r+1)*p) % bucket->h.size; - /*printf("%d %d %d %d\n", x, o, r, p);*/ - return bucket->h.items[s]; -} - - -/* list */ - -static int -crush_bucket_list_choose(struct crush_bucket_list *bucket, int x, int r) -{ - int i; - __u64 w; - - for (i=0; ih.size; i++) { - w = crush_hash32_4(x, bucket->h.items[i], r, bucket->h.id); - w &= 0xffff; - /*printf("%d item %d weight %d sum_weight %d r %lld", - i, bucket->h.items[i], bucket->item_weights[i], bucket->sum_weights[i], w);*/ - w *= bucket->sum_weights[i]; - w = w >> 16; - 
/*printf(" scaled %lld\n", w);*/ - if (w < bucket->item_weights[i]) - return bucket->h.items[i]; - } - - BUG_ON(1); - return 0; -} - - -/* tree */ - -static int height(int n) { - int h = 0; - while ((n & 1) == 0) { - h++; - n = n >> 1; - } - return h; -} -static int left(int x) { - int h = height(x); - return x - (1 << (h-1)); -} -static int right(int x) { - int h = height(x); - return x + (1 << (h-1)); -} -static int terminal(int x) { - return x & 1; -} - -static int -crush_bucket_tree_choose(struct crush_bucket_tree *bucket, int x, int r) -{ - int n, l; - __u32 w; - __u64 t; - - /* start at root */ - n = bucket->h.size >> 1; - - while (!terminal(n)) { - /* pick point in [0, w) */ - w = bucket->node_weights[n]; - t = (__u64)crush_hash32_4(x, n, r, bucket->h.id) * (__u64)w; - t = t >> 32; - - /* left or right? */ - l = left(n); - if (t < bucket->node_weights[l]) - n = l; - else - n = right(n); - } - - return bucket->h.items[n]; -} - - -/* straw */ - -static int -crush_bucket_straw_choose(struct crush_bucket_straw *bucket, int x, int r) -{ - int i; - int high = 0; - __u64 high_draw = 0; - __u64 draw; - - for (i=0; ih.size; i++) { - draw = crush_hash32_3(x, bucket->h.items[i], r); - draw &= 0xffff; - draw *= bucket->straws[i]; - if (i == 0 || draw > high_draw) { - high = i; - high_draw = draw; - } - } - - return bucket->h.items[high]; -} - - - - -/** crush proper **/ - - -/* - * choose numrep distinct items of given type - */ -static int crush_choose(struct crush_map *map, - struct crush_bucket *bucket, - int x, int numrep, int type, - int *out, int firstn) -{ - int rep; - int ftotal, flocal; - int retry_descent, retry_bucket, skip_rep; - struct crush_bucket *in = bucket; - int r; - int i; - int item; - int itemtype; - int outpos; - int collide, bad; - - outpos = 0; - - for (rep = 0; rep < numrep; rep++) { - /* keep trying until we get a non-out, non-colliding item */ - ftotal = 0; - skip_rep = 0; - do { - retry_descent = 0; - in = bucket; /* initial bucket */ - - /* 
choose through intervening buckets */ - flocal = 0; - do { - retry_bucket = 0; - r = rep; - if (in->bucket_type == CRUSH_BUCKET_UNIFORM) { - /* be careful */ - if (firstn || numrep >= in->size) - r += ftotal; /* r' = r + f_total */ - else if (in->size % numrep == 0) - r += (numrep+1) * flocal; /* r'=r+(n+1)*f_local */ - else - r += numrep * flocal; /* r' = r + n*f_local */ - } else { - if (firstn) - r += ftotal; /* r' = r + f_total */ - else - r += numrep * flocal; /* r' = r + n*f_local */ - } - - /* bucket choose */ - switch (in->bucket_type) { - case CRUSH_BUCKET_UNIFORM: - item = crush_bucket_uniform_choose((struct crush_bucket_uniform*)in, x, r); - break; - case CRUSH_BUCKET_LIST: - item = crush_bucket_list_choose((struct crush_bucket_list*)in, x, r); - break; - case CRUSH_BUCKET_TREE: - item = crush_bucket_tree_choose((struct crush_bucket_tree*)in, x, r); - break; - case CRUSH_BUCKET_STRAW: - item = crush_bucket_straw_choose((struct crush_bucket_straw*)in, x, r); - break; - default: - BUG_ON(1); - } - - /* desired type? */ - if (item < 0) - itemtype = map->buckets[-1-item]->type; - else - itemtype = 0; - - /* keep going? */ - if (itemtype != type) { - BUG_ON(item >= 0 || (-1-item) >= map->max_buckets); - in = map->buckets[-1-item]; - continue; - } - - /* collision? 
*/ - collide = 0; - for (i=0; idevice_offload[item]) { - if (map->device_offload[item] >= 0x10000) - bad = 1; - else if ((crush_hash32_2(x, item) & 0xffff) < map->device_offload[item]) - bad = 1; - } - - if (bad || collide) { - ftotal++; - flocal++; - - if (collide && flocal < 3) - retry_bucket = 1; /* retry locally a few times */ - else if (ftotal < 10) - retry_descent = 1; /* then retry descent */ - else - skip_rep = 1; /* else give up */ - } - } while (retry_bucket); - } while (retry_descent); - - if (skip_rep) continue; - - out[outpos] = item; - outpos++; - } - - return outpos; -} - - -int crush_do_rule(struct crush_map *map, - int ruleno, - int x, int *result, int result_max, - int forcefeed) /* -1 for none */ -{ - int result_len; - int force_stack[CRUSH_MAX_DEPTH]; - int force_pos = -1; - int a[CRUSH_MAX_SET]; - int b[CRUSH_MAX_SET]; - int *w; - int wsize = 0; - int *o; - int osize; - int *tmp; - struct crush_rule *rule; - int step; - int i; - int numrep; - - rule = map->rules[ruleno]; - result_len = 0; - w = a; - o = b; - - /* determine hierarchical context of forcefeed, if any */ - if (forcefeed >= 0) { - if (map->device_parents[forcefeed] == 0) { - /*printf("CRUSH: forcefed device dne\n");*/ - return -1; /* force fed device dne */ - } - while (1) { - force_stack[++force_pos] = forcefeed; - /*printf("force_stack[%d] = %d\n", force_pos, forcefeed);*/ - if (forcefeed >= 0) - forcefeed = map->device_parents[forcefeed]; - else - forcefeed = map->bucket_parents[-1-forcefeed]; - if (forcefeed == 0) break; - } - } - - for (step = 0; step < rule->len; step++) { - switch (rule->steps[step].op) { - case CRUSH_RULE_TAKE: - if (force_pos >= 0) { - w[0] = force_stack[force_pos]; - force_pos--; - BUG_ON(w[0] != rule->steps[step].arg1); - } else { - w[0] = rule->steps[step].arg1; - } - wsize = 1; - break; - - case CRUSH_RULE_CHOOSE_FIRSTN: - case CRUSH_RULE_CHOOSE_INDEP: - BUG_ON(wsize == 0); - - /* reset output */ - osize = 0; - - for (i = 0; i < wsize; i++) { - numrep = 
rule->steps[step].arg1; - if (force_pos >= 0) { - o[osize++] = force_stack[force_pos]; - force_pos--; - numrep--; - } - if (!numrep) continue; - osize += crush_choose(map, - map->buckets[-1-w[i]], - x, numrep, rule->steps[step].arg2, - o+osize, rule->steps[step].op == CRUSH_RULE_CHOOSE_FIRSTN); - } - - /* swap t and w arrays */ - tmp = o; - o = w; - w = tmp; - wsize = osize; - break; - - - case CRUSH_RULE_EMIT: - for (i=0; i -#include -#include -#include - -#include "crush.h" -#include "mapper.h" -#include "builder.h" - - -int main() -{ - int sub[10]; - int subw[10]; - int i, j; - int d; - int o[100]; - int root; - int ruleno; - int r[10]; - - int uw[10] = { 1000, 1000, 500, 1000, 2000, 1000, 1000, 3000, 1000, 500 }; - - struct crush_bucket *b; - struct crush_rule *rule; - - struct crush_map *map = crush_create(); - - d = 0; - for (i=0; i<10; i++) { - for (j=0; j<10; j++) - o[j] = d++; - b = (struct crush_bucket*)crush_make_uniform_bucket(1, 10, o, uw[i]); - sub[i] = crush_add_bucket(map, b); - subw[i] = b->weight; - printf("make bucket %d weight %d\n", sub[i], subw[i]); - } - - root = crush_add_bucket(map, (struct crush_bucket*)crush_make_tree_bucket(2, 10, sub, subw)); - - rule = crush_make_rule(); - crush_rule_add_step(rule, CRUSH_RULE_TAKE, root, 0); - crush_rule_add_step(rule, CRUSH_RULE_CHOOSE_FIRSTN, 3, 1); - crush_rule_add_step(rule, CRUSH_RULE_CHOOSE_FIRSTN, 1, 0); - crush_rule_add_step(rule, CRUSH_RULE_EMIT, 0, 0); - ruleno = crush_add_rule(map, -1, rule); - - crush_finalize(map); - printf("built\n"); - - /* test */ - memset(o, 0, 100*sizeof(o[0])); - for (i=0; i<1000000; i++) { - crush_do_rule(map, ruleno, i, r, 3, -1); - /*printf("%d %d %d\n", r[0], r[1], r[2]);*/ - for (j=0; j<3; j++) - o[r[j]]++; - } - - for (i=0; i<100; i += 10) - printf("%2d : %d\n", i, o[i]); - - return 0; -} diff --git a/branches/sage/ebofs2/crush/types.h b/branches/sage/ebofs2/crush/types.h deleted file mode 100644 index ffb208b2fec01..0000000000000 --- 
a/branches/sage/ebofs2/crush/types.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef _CRUSH_TYPES_H -#define _CRUSH_TYPES_H - -#ifdef KERNEL -# define free(x) kfree(x) -#else -# include -#endif - - -#include /* just for int types */ - -#ifndef BUG_ON -# include -# define BUG_ON(x) assert(!(x)) -#endif - -#endif diff --git a/branches/sage/ebofs2/csyn.cc b/branches/sage/ebofs2/csyn.cc deleted file mode 100644 index 562f00e3f861b..0000000000000 --- a/branches/sage/ebofs2/csyn.cc +++ /dev/null @@ -1,87 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "client/SyntheticClient.h" -#include "client/Client.h" -#include "client/fuse.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - -#ifndef DARWIN -#include -#endif // DARWIN - -#include -#include -#include - -int main(int argc, char **argv, char *envp[]) { - - //cerr << "cfuse starting " << myrank << "/" << world << std::endl; - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - parse_syn_options(args); // for SyntheticClient - - // args for fuse - vec_to_argv(args, argc, argv); - - if (g_conf.clock_tare) g_clock.tare(); - - // load monmap - MonMap monmap; - int r = monmap.read(".ceph_monmap"); - assert(r >= 0); - - // start up network - rank.start_rank(); - - list clients; - list synclients; - - cout << "mounting and starting " << g_conf.num_client << " syn client(s)" << std::endl; - for (int i=0; istart_thread(); - clients.push_back(client); - synclients.push_back(syn); - } - - cout << "waiting for client(s) to finish" << 
std::endl; - while (!clients.empty()) { - Client *client = clients.front(); - SyntheticClient *syn = synclients.front(); - clients.pop_front(); - synclients.pop_front(); - syn->join_thread(); - delete syn; - delete client; - } - - // wait for messenger to finish - rank.wait(); - - return 0; -} - diff --git a/branches/sage/ebofs2/doc/Commitdir.txt b/branches/sage/ebofs2/doc/Commitdir.txt deleted file mode 100644 index 05c727be60ae6..0000000000000 --- a/branches/sage/ebofs2/doc/Commitdir.txt +++ /dev/null @@ -1,24 +0,0 @@ -OLD - - -How Directory Committing Works: - -Each CDir has: - version - current version of directory - committing_version - which version was sent to stable storage - last_committed_version - last version to be safely stored - -Each Inode has: - parent_dir_version - what dir version i was in when i was dirtied. (*) - - (*) note that if you change an inode, mark_dirty() again, even if it's already dirty! - - -How committing works: - -A call to commit_dir(dir, context) will ensure tha the _current_ version is stored safely on disk before the context is finished. - -When a commit completes, inodes in the directory are checked. If they are dirty and belonged to the _committed_ (or earlier) version, then they are marked clean. If they belong to a newer version, then they are _still dirty_. - - - diff --git a/branches/sage/ebofs2/doc/anchortable.txt b/branches/sage/ebofs2/doc/anchortable.txt deleted file mode 100644 index d9c0fefc31e08..0000000000000 --- a/branches/sage/ebofs2/doc/anchortable.txt +++ /dev/null @@ -1,54 +0,0 @@ - -ANCHOR TABLE PROTOCOL - -MDS sends an update PREPARE to the anchortable MDS. The prepare is -identified by the ino and operation type; only one for each type -(create, update, destroy) can be pending at any time. Both parties -may actually be the same local node, but for simplicity we treat that -situation the same. (That is, we act as if they may fail -independently, even if they can't.) 
- -The anchortable journals the proposed update, and responds with an -AGREE and a version number. This uniquely identifies the request. - -The MDS can then update the filesystem metadata however it sees fit. -When it is finished (and the results journaled), it sends a COMMIT to -the anchortable. The table journals the commit, frees any state from -the transaction, and sends an ACK. The initiating MDS should then -journal the ACK to complete the transaction. - - -ANCHOR TABLE FAILURE - -If the AT fails before journaling the PREPARE and sending the AGREE, -the initiating MDS will simply retry the request. - -If the AT fails after journaling PREPARE but before journaling COMMIT, -it will resend AGREE to the initiating MDS. - -If the AT fails after the COMMIT, the transaction has been closed, and it -takes no action. If it receives a COMMIT for which it has no open -transaction, it will reply with ACK. - - -INITIATING MDS FAILURE - -If the MDS fails before the metadata update has been journaled, no -action is taken, since nothing is known about the previously proposed -transaction. If an AGREE message is received and there is no -corresponding PREPARE or pending-commit state, a ROLLBACK is sent to -the anchor table. - -If the MDS fails after journaling the metadata update but before -journaling the ACK, it resends COMMIT to the anchor table. If it -receives an AGREE after resending the COMMIT, it simply ignores the -AGREE. The anchortable will respond with an ACK, allowing the -initiating MDS to journal the final ACK and close out the transaction -locally. - -On journal replay, each metadata update (EMetaBlob) encountered that -includes an anchor transaction is noted in the AnchorClient by adding -it to the pending_commit list, and each journaled ACK is removed from -that list. Journal replay may encounter ACKs with no prior metadata -update; these are ignored. When recovery finishes, a COMMIT is sent -for all outstanding transactions.
diff --git a/branches/sage/ebofs2/doc/bdb.txt b/branches/sage/ebofs2/doc/bdb.txt deleted file mode 100644 index 63e647f5bb3cc..0000000000000 --- a/branches/sage/ebofs2/doc/bdb.txt +++ /dev/null @@ -1,48 +0,0 @@ -OBJECT STORE ON BERKELEY DB ---------------------------- - -OSBDB is an implementation of an object store that uses Berkeley DB as -the underlying storage. It is meant to be an alternative to EBOFS. - -BUILDING --------- - -You will need to have Berkeley DB installed, including the development -packages. We've tested this with Berkeley DB 4.4.20 on Ubuntu 6.10. - -To compile OSBDB support, you need to pass the argument "want_bdb=yes" -to "make." If you don't specify this, OSBDB and all its associated -support is not included in the executables. - -RUNNING -------- - -To use OSBDB in Ceph, simply pass the --bdbstore flag to programs. You -don't need to create a "device" for OSBDB ahead of time; Berkeley DB -will take care of creating the files. You also *cannot* use a raw -device as your store -- it must be a regular file. - -OSBDB additionally accepts the following flags: - - --bdbstore-btree Configures OSBDB to use the "Btree" - database type for Berkeley DB. The default - database type is "Hash". - - --bdbstore-hash-ffactor Sets the "fill factor" for the hash - database type. Takes an integer argument. - - --bdbstore-hash-nelem Sets the "nelem" parameter for the hash - database type. Takes an integer argument. - - --bdbstore-hash-pagesize Sets the page size for the hash database - type. Takes an integer argument. - - --bdbstore-cachesize Sets the cache size. Takes an integer - argument, which must be a power of two, and - no less than 20 KiB. - - --bdbstore-transactional Enable (in-memory-only) transactions for - all operations in the OSBDB store. - - --debug-bdbstore Set the debug level. Takes an integer - argument.
diff --git a/branches/sage/ebofs2/doc/caching.txt b/branches/sage/ebofs2/doc/caching.txt deleted file mode 100644 index 161eaf7428a53..0000000000000 --- a/branches/sage/ebofs2/doc/caching.txt +++ /dev/null @@ -1,303 +0,0 @@ - -SPANNING TREE PROPERTY - -All metadata that exists in the cache is attached directly or -indirectly to the root inode. That is, if the /usr/bin/vi inode is in -the cache, then /usr/bin, /usr, and / are too, including the inodes, -directory objects, and dentries. - - -AUTHORITY - -The authority maintains a list of what nodes cache each inode. -Additionally, each replica is assigned a nonce (initial 0) to -disambiguate multiple replicas of the same item (see below). - - map replicas; // maps replicating mds# to nonce - -The cached_by set _always_ includes all nodes that cache the -particular object, but may additionally include nodes that used to -cache it but no longer do. In those cases, an expire message should -be in transit. That is, we have two invariants: - - 1) the authority's replica set will always include all actual - replicas, and - - 2) cache expiration notices will be reliably delivered to the - authority. - -The second invariant is particularly important because the presence of -replicas will pin the metadata object in memory on the authority, -preventing it from being trimmed from the cache. Notification of -expiration of the replicas is required to allow previously replicated -objects from eventually being trimmed from the cache as well. - -Each metadata object has an authority bit that indicates whether it is -authoritative or a replica. - - -REPLICA NONCE - -Each replicated object maintains a "nonce" value, issued by the -authority at the time the replica was created. If the authority has -already created a replica for the given MDS, the new replica will be -issued a new (incremented) nonce.
This nonce is attached -to cache expirations, and allows the authority to disambiguate -expirations when multiple replicas of the same object are created and -cache expiration is coincident with replication. That is, when an -old replica is expired from the replicating MDS at the same time that -a new replica is issued by the authority and the resulting messages -cross paths, the authority can tell that it was the old replica that -was expired and effectively ignore the expiration message. The -replica is removed from the replicas map only if the nonce matches. - - -SUBTREE PARTITION - -Authority of the file system namespace is partitioned using a -subtree-based partitioning strategy. This strategy effectively -separates directory inodes from directory contents, such that the -directory contents are the unit of redelegation. That is, if / is -assigned to mds0 and /usr to mds1, the inode for /usr will be managed -by mds0 (it is part of the / directory), while the contents of /usr -(and everything nested beneath it) will be managed by mds1. - -The description for this partition exists solely in the collective -memory of the MDS cluster and in the individual MDS journals. It is -not described in the regular on-disk metadata structures. This is -related to the fact that authority delegation is a property of the -{\it directory} and not the directory's {\it inode}. - -Subsequently, if an MDS is authoritative for a directory inode and does -not yet have any state associated with the directory in its cache, -then it can assume that it is also authoritative for the directory. - -Directory state consists of a data object that describes any cached -dentries contained in the directory, information about the -relationship between the cached contents and what appears on disk, and -any delegation of authority. That is, each CDir object has a dir_auth -element. 
Normally dir_auth has a value of AUTH_PARENT, meaning that -the authority for the directory is the same as the directory's inode. -When dir_auth specifies another metadata server, that directory is -point of authority delegation and becomes a {\it subtree root}. A -CDir is a subtree root iff its dir_auth specifies an MDS id (and is not -AUTH_PARENT). - - - A dir is a subtree root iff dir_auth != AUTH_PARENT. - - - If dir_auth = AUTH_PARENT then the inode auth == dir auth, but the - converse may not be true. - -The authority for any metadata object in the cache can be determined -by following the parent pointers toward the root until a subtree root -CDir object is reached, at which point the authority is specified by -its dir_auth. - -Each MDS cache maintains a subtree data structure that describes the -subtree partition for all objects currently in the cache: - - map< CDir*, set > subtrees; - - - A dir will appear in the subtree map (as a key) IFF it is a subtree - root. - -Each subtree root will have an entry in the map. The map value is a -set of all other subtree roots nested beneath that point. Nested -subtree roots effectively bound or prune a subtree. For example, if -we had the following partition: - - mds0 / - mds1 /usr - mds0 /usr/local - mds0 /home - -The subtree map on mds0 would be - - / -> (/usr, /home) - /usr/local -> () - /home -> () - -and on mds1: - - /usr -> (/usr/local) - - -AMBIGUOUS DIR_AUTH - -While metadata for a subtree is being migrated between two MDS nodes, -the dir_auth for the subtree root is allowed to be ambiguous. That -is, it will specify both the old and new MDS ids, indicating that a -migration is in progress. - -If a replicated metadata object is expired from the cache from a -subtree whose authority is ambiguous, the cache expiration is sent to -both potential authorities. This ensures that the message will be -reliably delivered, even if either of those nodes fails. A number of -alternative strategies were considered. 
Sending the expiration to the -old or new authority and having it forwarded if authority has been -delegated can result in message loss if the forwarding node fails. -Pinning ambiguous metadata in cache is computationally expensive for -implementation reasons, while delaying the transmission of expiration -messages is difficult to implement because the replicating MDS must send -the final expiration messages when the subtree authority is -disambiguated, forcing it to keep certain elements of its cache in -memory. Although duplicating expirations incurs a small communications -overhead, the implementation is much simpler. - - -AUTH PINS - -Most operations that modify metadata must allow some amount of time to -pass in order for the operation to be journaled or for communication -to take place between the object's authority and any replicas. For -this reason it must not only be pinned in the authority's metadata -cache, but also be locked such that the object's authority is not -allowed to change until the operation completes. This is accomplished -using {\it auth pins}, which increment a reference counter on the -object in question, as well as all parent metadata objects up to the -root of the subtree. As long as the pin is in place, it is impossible -for that subtree (or any fragment of it that contains one or more -pins) to be migrated to a different MDS node. Pins can be placed on -both inodes and directories. - -Auth pins can only exist for authoritative metadata, because they are -only created if the object is authoritative, and their presence -prevents the migration of authority. - - -FREEZING - -More specifically, auth pins prevent a subtree from being frozen. -When a subtree is frozen, all updates to metadata are forbidden. This -includes updates to the replicas map that describes which replicas -(and nonces) exist for each object. - -In order for metadata to be migrated between MDS nodes, it must first -be frozen.
The root of the subtree is initially marked as {\it -freezing}. This prevents the creation of any new auth pins within the -subtree. After all existing auth pins are removed, the subtree is -then marked as {\it frozen}, at which point all updates are -forbidden. This allows metadata state to be packaged up in a message -and transmitted to the new authority, without worrying about -intervening updates. - -If the directory at the base of a freezing or frozen subtree is not -also a subtree root (that is, it has dir_auth == AUTH_PARENT), the -directory's parent inode is auth pinned. - - - a frozen tree root dir will auth_pin its inode IFF it is auth AND - not a subtree root. - -This prevents a parent directory from being concurrently frozen, and a -range of resulting implementation complications relating to metadata -migration. - - -CACHE EXPIRATION FOR FROZEN SUBTREES - -Cache expiration messages that are received for a subtree that is -frozen are temporarily set aside instead of being processed. Only -when the subtree is unfrozen are the expirations either processed (if -the MDS is authoritative) or discarded (if it is not). Because either -the exporting or importing metadata can fail during the migration -process, the MDS cannot tell whether it will be authoritative or not -until the process completes. - -During a migration, the subtree will first be frozen on both the -exporter and importer, and then all other replicas will be informed of -a subtree's ambiguous authority. This ensures that all expirations -during migration will go to both parties, and nothing will be lost in -the event of a failure. - - - - -NORMAL MIGRATION - -The exporter begins by doing some checks in export_dir() to verify -that it is permissible to export the subtree at this time. In -particular, the cluster must not be degraded, the subtree root may not -be freezing or frozen, and the path must be pinned (\ie not conflicted -with a rename).
If these conditions are met, the subtree root -directory is temporarily auth pinned, the subtree freeze is initiated, -and the exporter is committed to the subtree migration, barring an -intervening failure of the importer or itself. - -The MExportDiscover serves simply to ensure that the inode for the -base directory being exported is open on the destination node. It is -pinned by the importer to prevent it from being trimmed. This occurs -before the exporter completes the freeze of the subtree to ensure that -the importer is able to replicate the necessary metadata. When the -exporter receives the MDiscoverAck, it allows the freeze to proceed by -removing its temporary auth pin. - -The MExportPrep message then follows to populate the importer with a -spanning tree that includes all dirs, inodes, and dentries necessary -to reach any nested subtrees within the exported region. This -replicates metadata as well, but it is pushed out by the exporter, -avoiding deadlock with the regular discover and replication process. -The importer is responsible for opening the bounding directories from -any third parties authoritative for those subtrees before -acknowledging. This ensures that the importer has correct dir_auth -information about where authority is redelegated for all points nested -beneath the subtree being migrated. While processing the MExportPrep, -the importer freezes the entire subtree region to prevent any new -replication or cache expiration. - -A warning stage occurs only if the base subtree directory is open by -nodes other than the importer and exporter. If it is not, then this -implies that no metadata within or nested beneath the subtree is -replicated by any node other than the importer and exporter. If it is, -then a MExportWarning message informs any bystanders that the -authority for the region is temporarily ambiguous, and lists both the -exporter and importer as authoritative MDS nodes.
In particular, -bystanders who are trimming items from their cache must send -MCacheExpire messages to both the old and new authorities. This is -necessary to ensure that the surviving authority reliably receives all -expirations even if the importer or exporter fails. While the subtree -is frozen (on both the importer and exporter), expirations will not be -immediately processed; instead, they will be queued until the region -is unfrozen and it can be determined that the node is or is not -authoritative. - -The exporter walks the subtree hierarchy and packages up an MExport -message containing all metadata and important state (\eg, information -about metadata replicas). At the same time, the exporter's metadata -objects are flagged as non-authoritative. The MExport message sends -the actual subtree metadata to the importer. Upon receipt, the -importer inserts the data into its cache, marks all objects as -authoritative, and logs a copy of all metadata in an EImportStart -journal message. Once that has safely flushed, it replies with an -MExportAck. The exporter can now log an EExport journal entry, which -ultimately specifies that the export was a success. In the presence -of failures, it is the existence of the EExport entry only that -disambiguates authority during recovery. - -Once logged, the exporter will send an MExportNotify to any -bystanders, informing them that the authority is no longer ambiguous -and cache expirations should be sent only to the new authority (the -importer). Once these are acknowledged back to the exporter, -implicitly flushing the bystander to exporter message streams of any -stray expiration notices, the exporter unfreezes the subtree, cleans -up its migration-related state, and sends a final MExportFinish to the -importer. Upon receipt, the importer logs an EImportFinish(true) -(noting locally that the export was indeed a success), unfreezes its -subtree, processes any queued cache expirations, and cleans up its -state.
- - -PARTIAL FAILURE RECOVERY - - - - -RECOVERY FROM JOURNAL - - - - - - - - - diff --git a/branches/sage/ebofs2/doc/dentries.txt b/branches/sage/ebofs2/doc/dentries.txt deleted file mode 100644 index ab14765998b2f..0000000000000 --- a/branches/sage/ebofs2/doc/dentries.txt +++ /dev/null @@ -1,4 +0,0 @@ - -null dentires only exist - - on auth - - on replica, if they are xlock \ No newline at end of file diff --git a/branches/sage/ebofs2/doc/exports.txt b/branches/sage/ebofs2/doc/exports.txt deleted file mode 100644 index 8e0e146bea2fe..0000000000000 --- a/branches/sage/ebofs2/doc/exports.txt +++ /dev/null @@ -1,72 +0,0 @@ - -NORMAL MIGRATION - -The exporter begins by doing some checks in export_dir() to verify -that it is permissible to export the subtree at this time. In -particular, the cluster must not be degraded, the subtree root may not -be freezing or frozen (\ie already exporting, or nested beneath -something that is exporting), and the path must be pinned (\ie not -conflicted with a rename). If these conditions are met, the subtree -freeze is initiated, and the exporter is committed to the subtree -migration, barring an intervening failure of the importer or itself. - -The MExportDiscover serves simply to ensure that the base directory -being exported is open on the destination node. It is pinned by the -importer to prevent it from being trimmed. This occurs before the -exporter completes the freeze of the subtree to ensure that the -importer is able to replicate the necessary metadata. When the -exporter receives the MDiscoverAck, it allows the freeze to proceed. - -The MExportPrep message then follows to populate a spanning tree that -includes all dirs, inodes, and dentries necessary to reach any nested -exports within the exported region. This replicates metadata as well, -but it is pushed out by the exporter, avoiding deadlock with the -regular discover and replication process. 
The importer is responsible -for opening the bounding directories from any third parties before -acknowledging. This ensures that the importer has correct dir_auth -information about where authority is delegated for all points nested -within the subtree being migrated. While processing the MExportPrep, -the importer freezes the entire subtree region to prevent any new -replication or cache expiration. - -The warning stage occurs only if the base subtree directory is open by -nodes other than the importer and exporter. If so, then a -MExportWarning message informs any bystanders that the authority for -the region is temporarily ambiguous. In particular, bystanders who -are trimming items from their cache must send MCacheExpire messages to -both the old and new authorities. This is necessary to ensure that -the surviving authority reliably receives all expirations even if the -importer or exporter fails. While the subtree is frozen (on both the -importer and exporter), expirations will not be immediately processed; -instead, they will be queued until the region is unfrozen and it can -be determined that the node is or is not authoritative for the region. - -The MExport message sends the actual subtree metadata to the importer. -Upon receipt, the importer inserts the data into its cache, logs a -copy in the EImportStart, and replies with an ExportAck. The exporter -can now log an EExportFinish(true), which ultimately specifies that -the export was a success. In the presence of failures, it is the -existence (and value) of the EExportFinish that disambiguates -authority during recovery. - -Once logged, the exporter will send an MExportNotify to any -bystanders, informing them that the authority is no longer ambiguous -and cache expirations should be sent only to the new authority (the -importer). 
Once these are acknowledged, implicitly flushing the -bystander to exporter message streams of any stray expiration notices, -the exporter unfreezes the subtree, cleans up its state, and sends a -final MExportFinish to the importer. Upon receipt, the importer logs -an EImportFinish(true), unfreezes its subtree, and cleans up its -state. - - -PARTIAL FAILURE RECOVERY - - - -RECOVERY FROM JOURNAL - - - - - diff --git a/branches/sage/ebofs2/doc/file_modes.txt b/branches/sage/ebofs2/doc/file_modes.txt deleted file mode 100644 index d4ceba4034e5f..0000000000000 --- a/branches/sage/ebofs2/doc/file_modes.txt +++ /dev/null @@ -1,66 +0,0 @@ - -underlying client capabilities: - -- read + cache -- read sync -- write sync -- write + buffer - (...potentially eventually augmented by byte ranges) - -whatever system of modes, tokens, etc. has to satisfy the basic -constraint that no conflicting capabilities are ever in the -hands of clients. - - -questions: -- is there any use to clients writing to a replica? - - reading, yes.. 100,000 open same file.. - - ------- - -simplest approach: -- all readers, writers go through authority -- all open, close traffic at replicas forwarded to auth - -- fh state migrates with exports. - - - --------- - -less simple: -- all writers go through authority - - open, close traffic fw -- readers from any replica - - need token from auth -- weird auth <-> replica <-> client interactions ensue! - - --------- - -even more complex (and totally FLAWED, ignore this!) - -- clients can open a file with any replica (for read or write). -- replica gets a read or write token from the primary - - primary thus knows if it's all read, all write, mixed, or none. -- once replica has a token it can service as many clients (of given type(s)) as it wants. -- on export, tokens are moved too. - - primary give _itself_ a token too! much simpler. 
- -- clients maintain a mode for each open file: rdonly, wronly, rdwr, lock -- globally, the mode is controlled by the primary, based on the mixture of - read and write tokens issued - - - -- [optional] if a client has a file open rdwr and the mode is rdonly or wronly, it can - request to read or write from the mds (which might twiddle the mode for performance - reasons.. e.g. lots of ppl rdwr but no actual reading) - - - - --------- - - diff --git a/branches/sage/ebofs2/doc/header.txt b/branches/sage/ebofs2/doc/header.txt deleted file mode 100644 index bccdb81533b6f..0000000000000 --- a/branches/sage/ebofs2/doc/header.txt +++ /dev/null @@ -1,13 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ diff --git a/branches/sage/ebofs2/doc/inos.txt b/branches/sage/ebofs2/doc/inos.txt deleted file mode 100644 index b5ab1db25ca60..0000000000000 --- a/branches/sage/ebofs2/doc/inos.txt +++ /dev/null @@ -1,11 +0,0 @@ - -inodeno_t namespace - - relevant both for ino's, and for the (ino) input for Filer and object storage namespace... - -1 - root inode - -100+mds - mds log/journal -200+mds - mds ino, fh allocation tables -300+mds - mds inode files (for non-embedded inodes) - -1000+ - regular files and directories \ No newline at end of file diff --git a/branches/sage/ebofs2/doc/journal.txt b/branches/sage/ebofs2/doc/journal.txt deleted file mode 100644 index 22cb4fc9e21b2..0000000000000 --- a/branches/sage/ebofs2/doc/journal.txt +++ /dev/null @@ -1,124 +0,0 @@ - - -- LogEvent.replay() is idempotent. we won't know whether the update is old or not. 
- - - - - - - - - - - - - - - -journal is distributed among different nodes. because authority changes over time, it's not immediately clear to a recovering node replaying the journal whether the data is "real" or not (it might be exported later in the journal). - - -possibilities: - - -ONE.. bloat the journal! - -- journal entry includes full trace of dirty data (dentries, inodes) up until import point - - local renames implicit.. cache is reattached on replay - - exports are a list of exported dirs.. which are then dumped - ... - -recovery phase 1 -- each entry includes full trace (inodes + dentries) up until the import point -- cache during recovery is fragmented/dangling beneath import points -- when export is encountered items are discarded (marked clean) - -recovery phase 2 -- import roots ping store to determine attachment points (if not already known) - - if it was imported during period, attachment point is already known. - - renames affecting imports are logged too -- import roots discovered from other nodes, attached to hierarchy - -then -- maybe resume normal operations -- if recovery is a background process on a takeover mds, "export" everything to that node. - - --> journal contains lots of clean data.. maybe 5+ times bigger as a result! - -possible fixes: - - collect dir traces into journal chunks so they aren't repeated as often - - each chunk summarizes traces in previous chunk - - hopefully next chunk will include many of the same traces - - if not, then the entry will include it - - - - -=== log entry types === -- all inode, dentry, dir items include a dirty flag.
-- dirs are implicitly _never_ complete; even if they are, a fetch before commit is necessary to confirm - -ImportPath - log change in import path -Import - log import addition (w/ path, dirino) - -InoAlloc - allocate ino -InoRelease - release ino - -Inode - inode info, along with dentry+inode trace up to import point -Unlink - (null) dentry + trace, + flag (whether inode/dir is destroyed) -Link - (new) dentry + inode + trace - - ------------------------------ - -TWO.. -- directories in store contain path at time of commit (relative to import, and root) -- replay without attaching anything to heirarchy -- after replay, directories pinged in store to attach to hierarchy - --> phase 2 too slow! --> and nested dirs may reattach... that won't be apparent from journal. - - put just parent dir+dentry in dir store.. even worse on phase 2! - - -THREE -- - - - - - - - -metadata journal/log - - -event types: - -chown, chmod, utime - InodeUpdate - -mknod, mkdir, symlink - Mknod .. new inode + link - -unlink, rmdir - Unlink - -rename - Link + Unlink (foreign) -or Rename (local) - -link - Link .. link existing inode - - - - -InodeUpdate -DentryLink -DentryUnlink -InodeCreate -InodeDestroy -Mkdir? 
diff --git a/branches/sage/ebofs2/doc/lazy_posix.txt b/branches/sage/ebofs2/doc/lazy_posix.txt deleted file mode 100644 index 1d226cd03d8e4..0000000000000 --- a/branches/sage/ebofs2/doc/lazy_posix.txt +++ /dev/null @@ -1,53 +0,0 @@ - -http://www.usenix.org/events/fast05/wips/slides/welch.pdf - - - --- STATLITE - statlite(const char *filename, struct statlite *buf); - fstatlite(int fd, struct statlite *buf); - lstatlite(const char *filename, struct statlite *buf); - - * file size, mtime are optionally not guaranteed to be correct - * mask field to specify which fields you need to be correct - - --- READDIR+ - - struct dirent_plus *readdirplus(DIR *dirp); - int readdirplus_r(DIR *dirp, struct dirent_plus *entry, struct dirent_plus **result); - struct dirent_lite *readdirlite(DIR *dirp); - int readdirlite_r(DIR *dirp, struct dirent_lite *entry, struct dirent_lite **result); - - * plus returns lstat - * lite returns lstatlite - - --- lazy i/o integrity - - O_LAZY to open(2) - - * relax data coherency - * writes may not be visible until lazyio_propagate, fsync, close - - lazyio_propagate(int fd, off_t offset, size_t count); - * my writes are safe - - lazyio_synchronize(int fd, off_t offset, size_t count); - * i will see everyone else's propagated writes - --- read/write non-serial vectors - - ssize_t readx(int fd, const struct iovec *iov, size_t iov_count, struct xtvec *xtv, size_t xtv_count); - ssize_t writex(int fd, const struct iovec *iov, size_t iov_count, struct xtvec *xtv, size_t xtv_count); - - * like readv/writev, but serial - * - - -int lockg(int fd, int cmd, lgid_t *lgid) - group locks - -int openg(char *path, int mode, fh_t *handle); - portable file handle -int sutoc(fh_t *fh); \ No newline at end of file diff --git a/branches/sage/ebofs2/doc/mds_locks.txt b/branches/sage/ebofs2/doc/mds_locks.txt deleted file mode 100644 index f41a89a9b31e5..0000000000000 --- a/branches/sage/ebofs2/doc/mds_locks.txt +++ /dev/null @@ -1,66 +0,0 @@ - -new names - dentry_read 
(not path_pins) - dentry_xlock - - inode_read - inode_xlock (not inode_write) - -locks are always tied to active_requests. - -read locks can be placed on any node. -xlocks must be applied at the authority. - -for multi-lock operations (link, unlink, rename), we must acquire xlocks on a remote node. lock requests are associated with a reqid. the authoritative node keeps track of which remote xlocks it holds. when forwarded/restarted, it can drop remote locks. - -when restarting, drop all locks. -on remote, drop locks and state, and notify main req node. -recover dist request state on rejoin: - - surviving op initiator will assert read or xlock - - recovering op initiator will restart requests. (from initiator's perspective, ops have either happened or they haven't, depending on whether the event is journaled.) - - recovering or surviving op cohort will determine lock state during rejoin, or get a commit or rollback... - - - - ---- path_pin = read lock on /some/random/path - - blocks a dentry xlock - ---- dnxlock = exclusive lock on /some/random/path - - locking: prevents subsequent path pins. - - locked: prevents dn read - - on auth - --> grab _all_ path pins at onces; hold none while waiting. --> grab xlocks in order. - ---- auth_pin = pin to authority, on *dir, *in - - prevents freezing -> frozen. - - freezing blocks new auth pins, thus blocking other local auth_pins. (hangs up local export.) - - does not block remote auth_pins, because remote side is not auth (or frozen!) until after local subtree is frozen. - --> blocking on auth_pins is dangerous. _never_ block if we are holding other auth_pins on the same node (subtree?). --> grab _all_ auth pins at once; hold none while waiting. - ---- hard/file_wrlock = exlusive lock on inode content - - prevents inode read - - on auth - --> grab locks in order. 
- - -ORDERING -- namespace(dentries) < inodes -- order dentries on (dirino, dname) -- order inodes on (ino); -- need to order both read and write locks, esp with dentries. so, if we need to lock /usr/bin/foo with read on usr and bin and xwrite on foo, we need to acquire all of those locks using the same ordering. - - on same host, we can be 'nice' and check lockability of all items, then lock all, and drop everything while waiting. (actually, is there any use to this?) - - on mutiple hosts, we need to use full ordering (at least as things separate across host boundaries). and if needed lock set changes (such that the order of already acquired locks changes), we need to drop those locks and start over. - -- how do auth pins fit into all this? - - auth pin on xlocks only. no need on read locks. - - pre-grab all auth pins on a node the first time it is visiting during lock acquisition. - - what if things move? if we find we are missing a needed auth pin when we revisit a host at any point, and the item is not still authpinnable, we back off and restart. (we cannot block.) - - - - if we find we are not authpinnable, drop all locks and wait. 
- - diff --git a/branches/sage/ebofs2/doc/modeline.txt b/branches/sage/ebofs2/doc/modeline.txt deleted file mode 100644 index 1b3956f4d486b..0000000000000 --- a/branches/sage/ebofs2/doc/modeline.txt +++ /dev/null @@ -1,2 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab diff --git a/branches/sage/ebofs2/doc/osd_outline.txt b/branches/sage/ebofs2/doc/osd_outline.txt deleted file mode 100644 index 2c6f3287aac5f..0000000000000 --- a/branches/sage/ebofs2/doc/osd_outline.txt +++ /dev/null @@ -1,37 +0,0 @@ - -intro - -osd cluster map - requirements - desireable properties - (c)rush - -failure detection - distributed ping or heartbeat - central filter, notifier - -design - placement seed, class/superset, groups - -normal operation - reads - writes - -recovery - triggers: failed disk, or total cluster reorganization - - notify - peering - pull - push - clean - -writes during recovery - -graceful data loss + recovery? - - - - - - diff --git a/branches/sage/ebofs2/doc/osd_replication.txt b/branches/sage/ebofs2/doc/osd_replication.txt deleted file mode 100644 index 907d00e2050a2..0000000000000 --- a/branches/sage/ebofs2/doc/osd_replication.txt +++ /dev/null @@ -1,226 +0,0 @@ - - -SOME GENERAL REQUIREMENTS - -- cluster expansion: - - any or all of the replicas may move to new OSDs. - -- cluster map may change frequently - - map change should translate into pending replication/migration - state quickly (or better yet, instantly), so that we could push - through a series of (say, botched) maps quickly and be fine, so long - as the final map is correct. - -- ideally, unordered osd<->osd, client<->osd communication - (mds<->mds, client<->mds communication is ordered, but file i/o - would be too slow that way?) - - - - -PRIMARY ONLY PICTURE - -let's completely ignore replication for a while, and see how -complicated the picture needs to be to reliably support cluster expansion. 
- -typedef __uint64_t version_t; - - -per-Object metadata: -- version #. incremented when an object is modified. - e.g. version_t version; -- on primary, keep list of stray replicas - e.g. map stray_replicas; // osds w/ stray replicas - includes old primary osd(s), until deletion is confirmed. used while rg - is importing. - - -per-RG metadata -- object list. well, a method to fetch it by querying a collection or whatever. -- negative list - e.g. map deleted_objects; - - used to enumerate deleted objects, when in "importing" state. -- a RG "state" (enum/int) - - - - - - -Normal RG state: -- role=primary - clean - i am primary, all is well. no stray copies. i can - discard my negative object list, since my local - object store tells me everything. - - -After a map change: -- new primary - undef - initially; i don't know RG exists. -- old primary - homeless - i was primary, still have unmolested data. new primary is not yet migrating - (presumably it's state=undef.) i need to contact new primary and tell them - this RG exists. - -- new primary - importing - i am migrating data from old primary. keep negative dir entries for deletions. - write locally. proxy reads (force immediately migration). do whole objects - initially (on write, block until i migrate the object). later we can do - sub-object state (where "live" object data is spread across new/old primaries.. -- old primary - exporting - primary is migrating my data. - undef - when it finishes. (i will forget this RG existed.) - - -After a second map change (scenario 1): - as above, if we were clean again. - -After a second map change (scenario 2): - we weren't clean yet. -- new primary - undef - initially (until i learn RG exists) -- old primary - importing - i'm still migrating from old old primary -- old old primary - exporting - ... -- old primary -?? importing+exporting - proxy reads as before. continue migrating from old old primary. 
- - -After a second map change (scenario 3): - we weren't clean yet, and old old primary is also new primary -- new primary (and old old primary) - exporting - change state to importing. be sure to compare object versions, and neg dir - entries (as we always should do, really!). -- old primary - importing - note that the old import source matches new primary, and change - state to exporting, and stop importing. (unlike scenario 2) - --> this approach could mean that a series of fast map changes could - force data to migrate down a "chain" of old primaries to reach the - new one. maybe old primary should go from importing -> exporting, - and pass along old old primary id to new primary such that the - import is a many-to-one thing, instead of one-to-one. version - numbers and neg entries will make it easy to pick out correct versions. - - - -For the importing process on a given RG: - -- metadata for each source - - each source has a state: - 'starting' - don't know anything about source yet. query source! - this probaby induces the source to change from - 'homeless' or something similar to 'exporting'. - 'importing' - i've fetched the source's object list (and neg - object list). i'm busy reading them! these lists - will shrink as the process continues. after i fetch - an object, i will erase it from the source. - (object metadata will include stray copy info - until i confirm that its removed.) - 'finishing' - i've read all my data, and i'm telling the old person - to discard any remaining RG metadata (RG contents - should already be gone) - - unmigrated object list - - migrated but not deleted object list - - stray osd is also listed in per-object MD during this stage - - negative object list - - i can remove these items if i see a newer object version (say, - from a different import source or something). 
- - i can remove any local objects or ignore imported ones if it is - older than deleted version - -- the lists should be sets or otherwise queryable so that while i'm - importing and a real op comes through I can quickly determine if a - given object_id is pending migration etc or if my local store is to - be trusted. - - - - - -SOME CODE BITS - - -typedef __uint64_t version_t; -class Object { - version_t version; - map stray_replicas; -}; - - -class ReplicaGroup { - int enumerate_objects(list& ls); - - int state; - - // for unstable states, - map deleted_objects; // locally - map exporters; // importing from these guys. -}; - -// primary -#define RG_STATE_CLEAN 1 -#define RG_STATE_IMPORTING 2 // pulling data - -// non-primary -#define RG_STATE_HOMELESS 5 // old primary; new primary not yet - // notified; not yet exporting. -#define RG_STATE_EXPORTING 6 // a newer primary is extracting my - // data. - - -struct RGExporter_t { - int import_state; - - set remaining_objects; // remote object list - set stray_objects; // imported but not deleted. - -}; - - - - - ----- -all crap from here on down - - - - -REPLICAS -- - - - - -OSD STATES -- primary, up to date. -- replica, up to date. - -- primary, proxy to old primary (primaries?) - -- replica, not up to date. - - -REPLICATION STUFF - -Per-RG metadata -- primary - - per-replica state: clean, catching up? -- replica - -Per-object metadata -- primary and replica - - version number/mtime - - rg (reverse indexed) -- primary - - replication level and state. - - commited to memory and/or disk, on which replicas (#1, #2, etc.) 
-- replica - - - - - --> \ No newline at end of file diff --git a/branches/sage/ebofs2/doc/shared_write_states_nogo.txt b/branches/sage/ebofs2/doc/shared_write_states_nogo.txt deleted file mode 100644 index f409617d82681..0000000000000 --- a/branches/sage/ebofs2/doc/shared_write_states_nogo.txt +++ /dev/null @@ -1,39 +0,0 @@ - -// stable states // ------auth----- -----replica----- -#define LOCK_SYNC 0 // R . / . . . WB same ... for stat() -#define LOCK_LOCK 1 // R W / RC . . . . . / RC . . . ... for truncate(), fsync() -#define LOCK_RDONLY 2 // R . / RC R . . same -#define LOCK_MIXED 3 // . . / . R W . same -#define LOCK_WRONLY 4 // . . / . . W WB same - -// transition states -#define LOCK_GSYNCR 8 // R . / RC . . . same -#define LOCK_GSYNCMW 9 // . . / RC . . WB same -#define LOCK_GSYNCMW2 9 // . . / RC . . WB same - -#define LOCK_GLOCKSR 5 // R . / RC . . . . . / RC . . . -#define LOCK_GLOCKMW 7 // . . / RC . . . same - -#define LOCK_GRDONLYM 10 // . . / . R . . same -#define LOCK_GRDONLYM2 10 // --- . . / . R . . -#define LOCK_GRDONLYW 11 // . . / . . . . same -#define LOCK_GRDONLYW2 11 // --- . . / . . . . -#define LOCK_GRDONLYS 12 // R . / RC . . . same -#define LOCK_GRDONLYL 13 // R . / RC . . . --- - -#define LOCK_GMIXEDR 14 // R . / . R . . . . / . R . . -#define LOCK_GMIXEDR2 15 // --- . . / . R . . -#define LOCK_GMIXEDW 16 // . . / . . W . same -#define LOCK_GMIXEDW2 16 // --- . . / . . W . -#define LOCK_GMIXEDS 16 // R . / . . . . . . / . . . . -#define LOCK_GMIXEDS2 16 // --- . . / . . . . -#define LOCK_GMIXEDL 17 // R . / . . . . --- - -#define LOCK_GWRONLYR 18 // R . / . . . . same -#define LOCK_GWRONLYR2 18 // --- . . / . . . . -#define LOCK_GWRONLYM 19 // . . / . . . . same -#define LOCK_GWRONLYM2 19 // --- . . / . . . . -#define LOCK_GWRONLYS 20 // R . / . . . WB same -#define LOCK_GWRONLYS2 20 // --- . . / . . . . 
-#define LOCK_GWRONLYL 21 - diff --git a/branches/sage/ebofs2/doc/shutdown.txt b/branches/sage/ebofs2/doc/shutdown.txt deleted file mode 100644 index e5ccde3171004..0000000000000 --- a/branches/sage/ebofs2/doc/shutdown.txt +++ /dev/null @@ -1,13 +0,0 @@ - -- mds0 triggers shutdown by sending a shutdown_start to all nodes. - -- from here on out, all client requests are discarded (unless they are a file close?) - -- each mds checks for outstanding inter-mds transations. e.g imports, discoveries, etc. once they're all done, send a shutdown_ready to mds0 - -- each mds successively disassembles its cache, flushing data to long-term storage, and sending inodeexpires, exporting imported dirs to parent (after they're clean + empty) - -- when the cache is empty, send shutdown_done to mds0 and exit. - -- mds0 exits when all mdss have finished. - diff --git a/branches/sage/ebofs2/dupstore.cc b/branches/sage/ebofs2/dupstore.cc deleted file mode 100644 index d43f935cb50cc..0000000000000 --- a/branches/sage/ebofs2/dupstore.cc +++ /dev/null @@ -1,102 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include "ebofs/Ebofs.h" -#include "osd/FakeStore.h" - - -int dupstore(ObjectStore* src, ObjectStore* dst) -{ - if (src->mount() < 0) return 1; - if (dst->mkfs() < 0) return 1; - if (dst->mount() < 0) return 1; - - // objects - list objects; - src->list_objects(objects); - int num = objects.size(); - cout << num << " objects" << std::endl; - int i = 1; - for (list::iterator p = objects.begin(); p != objects.end(); ++p) { - bufferlist bl; - src->read(*p, 0, 0, bl); - cout << "object " << i++ << "/" << num << " " << *p << " = " << bl.length() << " bytes" << std::endl; - dst->write(*p, 0, bl.length(), bl, 0); - map attrs; - src->getattrs(*p, attrs); - dst->setattrs(*p, attrs); - } - - // collections - list collections; - src->list_collections(collections); - num = collections.size(); - cout << num << " collections" << std::endl; - i = 1; - for (list::iterator p = collections.begin(); - p != collections.end(); - ++p) { - dst->create_collection(*p, 0); - map attrs; - src->collection_getattrs(*p, attrs); - dst->collection_setattrs(*p, attrs); - list o; - src->collection_list(*p, o); - int numo = 0; - for (list::iterator q = o.begin(); q != o.end(); q++) { - dst->collection_add(*p, *q, 0); - numo++; - } - cout << "collection " << i++ << "/" << num << " " << hex << *p << dec << " = " << numo << " objects" << std::endl; - } - - - src->umount(); - dst->umount(); - return 0; -} - -void usage() -{ - cerr << "usage: dup.ebofs (ebofs|fakestore) src (ebofs|fakestore) dst" << std::endl; - exit(0); -} - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - // args - if (args.size() != 4) - usage(); - - ObjectStore *src, *dst; - - if (strcmp(args[0], "ebofs") == 0) - src = new Ebofs(args[1]); - else if (strcmp(args[0], "fakestore") == 0) - src = new FakeStore(args[1]); - else usage(); - - if (strcmp(args[2], "ebofs") == 0) - dst = new Ebofs(args[3]); - else if (strcmp(args[2], "fakestore") == 0) - 
dst = new FakeStore(args[3]); - else usage(); - - return dupstore(src, dst); -} diff --git a/branches/sage/ebofs2/ebofs/Allocator.cc b/branches/sage/ebofs2/ebofs/Allocator.cc deleted file mode 100644 index 35b0db16b84c2..0000000000000 --- a/branches/sage/ebofs2/ebofs/Allocator.cc +++ /dev/null @@ -1,693 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "Allocator.h" -#include "Ebofs.h" - - -#undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) *_dout << dbeginl << g_clock.now() << " ebofs(" << fs->dev.get_device_name() << ").allocator." - - -void Allocator::dump_freelist() -{ - if (1) { - interval_set free; // validate too - - block_t n = 0; - for (int b=0; b<=EBOFS_NUM_FREE_BUCKETS; b++) { - Table *tab; - if (b < EBOFS_NUM_FREE_BUCKETS) { - tab = fs->free_tab[b]; - dout(0) << "dump bucket " << b << " " << tab->get_num_keys() << dendl; - } else { - tab = fs->limbo_tab; - dout(0) << "dump limbo " << tab->get_num_keys() << dendl;; - } - - if (tab->get_num_keys() > 0) { - Table::Cursor cursor(tab); - assert(tab->find(0, cursor) >= 0); - while (1) { - dout(0) << "dump ex " << cursor.current().key << "~" << cursor.current().value << dendl; - assert(cursor.current().value > 0); - - if (b < EBOFS_NUM_FREE_BUCKETS) - n += cursor.current().value; - - if (free.contains( cursor.current().key, cursor.current().value )) - dout(0) << "dump bad " << cursor.current().key << "~" << cursor.current().value << dendl; - assert(!free.contains( cursor.current().key, cursor.current().value )); - free.insert( cursor.current().key, cursor.current().value ); - if (cursor.move_right() <= 0) break; - } - } 
else { - //dout(0) << " empty" << dendl; - } - } - - assert(n == fs->free_blocks); - dout(0) << "dump combined freelist is " << free << dendl; - - - // alloc_tab - if (fs->alloc_tab->get_num_keys() > 0) { - Table >::Cursor cursor(fs->alloc_tab); - assert(fs->alloc_tab->find(0, cursor) >= 0); - while (1) { - dout(0) << "alloc ex " << cursor.current().key << "~" << cursor.current().value.first << " ref " - << cursor.current().value.second - << dendl; - assert(cursor.current().value.first > 0); - - if (cursor.move_right() <= 0) break; - } - } - } -} - - -int Allocator::find(Extent& ex, int bucket, block_t num, block_t near, int dir) -{ - Table::Cursor cursor(fs->free_tab[bucket]); - bool found = false; - - if ((dir == DIR_ANY || dir == DIR_FWD) && - fs->free_tab[bucket]->find( near, cursor ) >= 0) { - // look to the right - do { - if (cursor.current().value >= num) - found = true; - } while (!found && cursor.move_right() > 0); - } - - if ((dir == DIR_ANY || dir == DIR_BACK) && - !found) { - // look to the left - fs->free_tab[bucket]->find( near, cursor ); - - while (!found && cursor.move_left() >= 0) - if (cursor.current().value >= num) - found = true; - } - - if (found) { - ex.start = cursor.current().key; - ex.length = cursor.current().value; - return 0; - } - - return -1; -} - -int Allocator::allocate(Extent& ex, block_t num, block_t near) -{ - //dump_freelist(); - - int dir = DIR_ANY; // no dir - if (near == NEAR_LAST_FWD) { - near = last_pos; - dir = DIR_FWD; // fwd - } - else if (near == NEAR_LAST) - near = last_pos; - - int bucket; - - while (1) { // try twice, if fwd = true - - // look for contiguous extent - for (bucket = pick_bucket(num); bucket < EBOFS_NUM_FREE_BUCKETS; bucket++) { - if (find(ex, bucket, num, near, dir) >= 0) { - // yay! 
- - // remove original - fs->free_tab[bucket]->remove( ex.start ); - fs->free_blocks -= ex.length; - - if (ex.length > num) { - if (ex.start < near) { - // to the left - if (ex.start + ex.length - num <= near) { - // by a lot. take right-most portion. - Extent left; - left.start = ex.start; - left.length = ex.length - num; - ex.start += left.length; - ex.length -= left.length; - assert(ex.length == num); - _release_loner(left); - } else { - // take middle part. - Extent left,right; - left.start = ex.start; - left.length = near - ex.start; - ex.start = near; - right.start = ex.start + num; - right.length = ex.length - left.length - num; - ex.length = num; - _release_loner(left); - _release_loner(right); - } - } - else { - // to the right. take left-most part. - Extent right; - right.start = ex.start + num; - right.length = ex.length - num; - ex.length = num; - _release_loner(right); - } - } - - dout(20) << "allocate " << ex << " near " << near << dendl; - last_pos = ex.end(); - //dump_freelist(); - if (g_conf.ebofs_cloneable) - alloc_inc(ex); - return num; - } - } - - if (dir == DIR_BACK || dir == DIR_ANY) break; - dir = DIR_BACK; - } - - // ok, find partial extent instead. - for (block_t trysize = num/2; trysize >= 1; trysize /= 2) { - int bucket = pick_bucket(trysize); - if (find(ex, bucket, trysize, near) >= 0) { - // yay! - assert(ex.length < num); - - fs->free_tab[bucket]->remove(ex.start); - fs->free_blocks -= ex.length; - last_pos = ex.end(); - dout(20) << "allocate partial " << ex << " (wanted " << num << ") near " << near << dendl; - //dump_freelist(); - if (g_conf.ebofs_cloneable) - alloc_inc(ex); - return ex.length; - } - } - - dout(1) << "allocate failed, fs completely full! 
" << fs->free_blocks << dendl; - assert(0); - //dump_freelist(); - return -1; -} - -int Allocator::_release_into_limbo(Extent& ex) -{ - dout(10) << "_release_into_limbo " << ex << dendl; - dout(10) << "limbo is " << limbo << dendl; - assert(ex.length > 0); - limbo.insert(ex.start, ex.length); - fs->limbo_blocks += ex.length; - return 0; -} - -int Allocator::release(Extent& ex) -{ - if (g_conf.ebofs_cloneable) - return alloc_dec(ex); - - _release_into_limbo(ex); - return 0; -} - -int Allocator::commit_limbo() -{ - dout(20) << "commit_limbo" << dendl; - for (map::iterator i = limbo.m.begin(); - i != limbo.m.end(); - i++) { - fs->limbo_tab->insert(i->first, i->second); - //fs->free_blocks += i->second; - } - limbo.clear(); - //fs->limbo_blocks = 0; - //dump_freelist(); - return 0; -} - -int Allocator::release_limbo() -{ - //dump_freelist(); - if (fs->limbo_tab->get_num_keys() > 0) { - Table::Cursor cursor(fs->limbo_tab); - fs->limbo_tab->find(0, cursor); - while (1) { - Extent ex(cursor.current().key, cursor.current().value); - dout(20) << "release_limbo ex " << ex << dendl; - - fs->limbo_blocks -= ex.length; - _release_merge(ex); - - if (cursor.move_right() <= 0) break; - } - } - fs->limbo_tab->clear(); - //dump_freelist(); - return 0; -} - - - -/* -int Allocator::_alloc_loner_inc(Extent& ex) -{ - Table >::Cursor cursor(fs->alloc_tab); - - if (fs->alloc_tab->find( ex.start, cursor ) - == Table >::Cursor::MATCH) { - assert(cursor.current().value.first == ex.length); - pair& v = cursor.dirty_current_value(); - v.second++; - dout(10) << "_alloc_loner_inc " << ex << " " - << (v.second-1) << " -> " << v.second - << dendl; - } else { - // insert it, @1 - fs->alloc_tab->insert(ex.start, pair(ex.length,1)); - dout(10) << "_alloc_loner_inc " << ex << " 0 -> 1" << dendl; - } - return 0; -} - -int Allocator::_alloc_loner_dec(Extent& ex) -{ - Table >::Cursor cursor(fs->alloc_tab); - - if (fs->alloc_tab->find( ex.start, cursor ) - == Table >::Cursor::MATCH) { - 
assert(cursor.current().value.first == ex.length); - if (cursor.current().value.second == 1) { - dout(10) << "_alloc_loner_dec " << ex << " 1 -> 0" << dendl; - fs->alloc_tab->remove( cursor.current().key ); - } else { - pair& v = cursor.dirty_current_value(); - --v.second; - dout(10) << "_alloc_loner_dec " << ex << " " - << (v.second+1) << " -> " << v.second - << dendl; - } - } else { - assert(0); - } - return 0; -} -*/ - - -int Allocator::alloc_inc(Extent ex) -{ - dout(10) << "alloc_inc " << ex << dendl; - - // empty table? - if (fs->alloc_tab->get_num_keys() == 0) { - // easy. - fs->alloc_tab->insert(ex.start, pair(ex.length,1)); - dout(10) << "alloc_inc + " << ex << " 0 -> 1 (first entry)" << dendl; - return 0; - } - - Table >::Cursor cursor(fs->alloc_tab); - - // try to move to left (to check for overlap) - int r = fs->alloc_tab->find( ex.start, cursor ); - if (r == Table >::Cursor::OOB || - cursor.current().key > ex.start) { - r = cursor.move_left(); - dout(10) << "alloc_inc move_left r = " << r << dendl; - } - - while (1) { - dout(10) << "alloc_inc loop at " << cursor.current().key - << "~" << cursor.current().value.first - << " ref " << cursor.current().value.second - << dendl; - - // too far left? - if (cursor.current().key < ex.start && - cursor.current().key + cursor.current().value.first <= ex.start) { - // adjacent? - bool adjacent = false; - if (cursor.current().key + cursor.current().value.first == ex.start && - cursor.current().value.second == 1) - adjacent = true; - - // no overlap. - r = cursor.move_right(); - dout(10) << "alloc_inc move_right r = " << r << dendl; - - // at end? - if (r <= 0) { - // hmm! - if (adjacent) { - // adjust previous entry - cursor.move_left(); - pair &v = cursor.dirty_current_value(); - v.first += ex.length; // yay! - dout(10) << "alloc_inc + " << ex << " 0 -> 1 (adjust at end)" << dendl; - } else { - // insert at end, finish. 
- int r = fs->alloc_tab->insert(ex.start, pair(ex.length,1)); - dout(10) << "alloc_inc + " << ex << " 0 -> 1 (at end) .. r = " << r << dendl; - //dump_freelist(); - } - return 0; - } - } - - if (cursor.current().key > ex.start) { - // gap. - // oooooo - // nnnnn..... - block_t l = MIN(ex.length, cursor.current().key - ex.start); - - fs->alloc_tab->insert(ex.start, pair(l,1)); - dout(10) << "alloc_inc + " << ex.start << "~" << l << " 0 -> 1" << dendl; - ex.start += l; - ex.length -= l; - if (ex.length == 0) break; - fs->alloc_tab->find( ex.start, cursor ); - } - else if (cursor.current().key < ex.start) { - block_t end = cursor.current().value.first + cursor.current().key; - - if (end <= ex.end()) { - // single split - // oooooo - // nnnnn - pair& v = cursor.dirty_current_value(); - v.first = ex.start - cursor.current().key; - int ref = v.second; - - block_t l = end - ex.start; - fs->alloc_tab->insert(ex.start, pair(l, 1+ref)); - - dout(10) << "alloc_inc " << ex.start << "~" << l - << " " << ref << " -> " << ref+1 - << " (right split)" << dendl; - - ex.start += l; - ex.length -= l; - if (ex.length == 0) break; - fs->alloc_tab->find( ex.start, cursor ); - - } else { - // double split, finish. - // ------------- - // ------ - pair& v = cursor.dirty_current_value(); - v.first = ex.start - cursor.current().key; - int ref = v.second; - - fs->alloc_tab->insert(ex.start, pair(ex.length, 1+ref)); - - int rl = end - ex.end(); - fs->alloc_tab->insert(ex.end(), pair(rl, ref)); - - dout(10) << "alloc_inc " << ex - << " " << ref << " -> " << ref+1 - << " (double split finish)" - << dendl; - - break; - } - } - else { - assert(cursor.current().key == ex.start); - - if (cursor.current().value.first <= ex.length) { - // inc. 
- // oooooo - // nnnnnnnn - pair& v = cursor.dirty_current_value(); - v.second++; - dout(10) << "alloc_inc " << ex.start << "~" << cursor.current().value.first - << " " << cursor.current().value.second-1 << " -> " - << cursor.current().value.second - << " (left split)" << dendl; - ex.start += v.first; - ex.length -= v.first; - if (ex.length == 0) break; - cursor.move_right(); - } else { - // single split, finish. - // oooooo - // nnn - block_t l = cursor.current().value.first - ex.length; - int ref = cursor.current().value.second; - - pair& v = cursor.dirty_current_value(); - v.first = ex.length; - v.second++; - - fs->alloc_tab->insert(ex.end(), pair(l, ref)); - - dout(10) << "alloc_inc " << ex - << " " << ref << " -> " << ref+1 - << " (left split finish)" - << dendl; - - break; - } - } - } - - return 0; -} - - -int Allocator::alloc_dec(Extent ex) -{ - dout(10) << "alloc_dec " << ex << dendl; - - assert(fs->alloc_tab->get_num_keys() >= 0); - - Table >::Cursor cursor(fs->alloc_tab); - - // try to move to left (to check for overlap) - int r = fs->alloc_tab->find( ex.start, cursor ); - dout(10) << "alloc_dec find r = " << r << dendl; - - if (r == Table >::Cursor::OOB || - cursor.current().key > ex.start) { - r = cursor.move_left(); - dout(10) << "alloc_dec move_left r = " << r << dendl; - - // too far left? - if (cursor.current().key < ex.start && - cursor.current().key + cursor.current().value.first <= ex.start) { - // no overlap. - dump_freelist(); - assert(0); - } - } - - while (1) { - dout(10) << "alloc_dec ? " << cursor.current().key - << "~" << cursor.current().value.first - << " " << cursor.current().value.second - << ", ex is " << ex - << dendl; - - assert(cursor.current().key <= ex.start); // no gap allowed. 
- - if (cursor.current().key < ex.start) { - block_t end = cursor.current().value.first + cursor.current().key; - - if (end <= ex.end()) { - // single split - // oooooo - // ----- - pair& v = cursor.dirty_current_value(); - v.first = ex.start - cursor.current().key; - int ref = v.second; - dout(10) << "alloc_dec s " << cursor.current().key << "~" << cursor.current().value.first - << " " << ref - << " shortened left bit of single" << dendl; - - block_t l = end - ex.start; - if (ref > 1) { - fs->alloc_tab->insert(ex.start, pair(l, ref-1)); - dout(10) << "alloc_dec . " << ex.start << "~" << l - << " " << ref << " -> " << ref-1 - << dendl; - } else { - Extent r(ex.start, l); - _release_into_limbo(r); - } - - ex.start += l; - ex.length -= l; - if (ex.length == 0) break; - fs->alloc_tab->find( ex.start, cursor ); - - } else { - // double split, finish. - // ooooooooooooo - // ------ - pair& v = cursor.dirty_current_value(); - v.first = ex.start - cursor.current().key; - int ref = v.second; - dout(10) << "alloc_dec s " << cursor.current().key << "~" << cursor.current().value.first - << " " << ref - << " shorted left bit of double split" << dendl; - - if (ref > 1) { - fs->alloc_tab->insert(ex.start, pair(ex.length, ref-1)); - dout(10) << "alloc_inc s " << ex - << " " << ref << " -> " << ref-1 - << " reinserted middle bit of double split" - << dendl; - } else { - _release_into_limbo(ex); - } - - int rl = end - ex.end(); - fs->alloc_tab->insert(ex.end(), pair(rl, ref)); - dout(10) << "alloc_dec s " << ex.end() << "~" << rl - << " " << ref - << " reinserted right bit of double split" << dendl; - break; - } - } - else { - assert(cursor.current().key == ex.start); - - if (cursor.current().value.first <= ex.length) { - // inc. 
- // oooooo - // nnnnnnnn - if (cursor.current().value.second > 1) { - pair& v = cursor.dirty_current_value(); - v.second--; - dout(10) << "alloc_dec s " << ex.start << "~" << cursor.current().value.first - << " " << cursor.current().value.second+1 << " -> " << cursor.current().value.second - << dendl; - ex.start += v.first; - ex.length -= v.first; - if (ex.length == 0) break; - cursor.move_right(); - } else { - Extent r(cursor.current().key, cursor.current().value.first); - _release_into_limbo(r); - - ex.start += cursor.current().value.first; - ex.length -= cursor.current().value.first; - cursor.remove(); - - if (ex.length == 0) break; - fs->alloc_tab->find( ex.start, cursor ); - } - } else { - // single split, finish. - // oooooo - // nnn - block_t l = cursor.current().value.first - ex.length; - int ref = cursor.current().value.second; - - if (ref > 1) { - pair& v = cursor.dirty_current_value(); - v.first = ex.length; - v.second--; - dout(10) << "alloc_inc . " << ex - << " " << ref << " -> " << ref-1 - << dendl; - } else { - _release_into_limbo(ex); - cursor.remove(); - } - - dout(10) << "alloc_dec s " << ex.end() << "~" << l - << " " << ref - << " reinserted right bit of single split" << dendl; - fs->alloc_tab->insert(ex.end(), pair(l, ref)); - break; - } - } - - - } - - return 0; -} - - -/* - * release extent into freelist - * WARNING: *ONLY* use this if you _know_ there are no adjacent free extents - */ -int Allocator::_release_loner(Extent& ex) -{ - assert(ex.length > 0); - int b = pick_bucket(ex.length); - fs->free_tab[b]->insert(ex.start, ex.length); - fs->free_blocks += ex.length; - return 0; -} - -/* - * release extent into freelist - * look for any adjacent extents and merge with them! - */ -int Allocator::_release_merge(Extent& orig) -{ - dout(15) << "_release_merge " << orig << dendl; - assert(orig.length > 0); - - Extent newex = orig; - - // one after us? 
- for (int b=0; b::Cursor cursor(fs->free_tab[b]); - - if (fs->free_tab[b]->find( newex.start+newex.length, cursor ) - == Table::Cursor::MATCH) { - // add following extent to ours - newex.length += cursor.current().value; - - // remove it - fs->free_blocks -= cursor.current().value; - fs->free_tab[b]->remove( cursor.current().key ); - break; - } - } - - // one before us? - for (int b=0; b::Cursor cursor(fs->free_tab[b]); - fs->free_tab[b]->find( newex.start+newex.length, cursor ); - if (cursor.move_left() >= 0 && - (cursor.current().key + cursor.current().value == newex.start)) { - // merge - newex.start = cursor.current().key; - newex.length += cursor.current().value; - - // remove it - fs->free_blocks -= cursor.current().value; - fs->free_tab[b]->remove( cursor.current().key ); - break; - } - } - - // ok, insert newex - _release_loner(newex); - return 0; -} diff --git a/branches/sage/ebofs2/ebofs/Allocator.h b/branches/sage/ebofs2/ebofs/Allocator.h deleted file mode 100644 index c1898784d50a7..0000000000000 --- a/branches/sage/ebofs2/ebofs/Allocator.h +++ /dev/null @@ -1,85 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __EBOFS_ALLOCATOR_H -#define __EBOFS_ALLOCATOR_H - -#include "types.h" - -#include "include/interval_set.h" - -class Ebofs; - -class Allocator { -public: - const static block_t NEAR_LAST = 0; - const static block_t NEAR_LAST_FWD = 1; - - const static int DIR_ANY = 0; - const static int DIR_FWD = 2; - const static int DIR_BACK = 1; - -protected: - Ebofs *fs; - block_t last_pos; - - - interval_set limbo; - - static int pick_bucket(block_t num) { - int b = 0; - while (num > 1) { - b++; - num = num >> EBOFS_FREE_BUCKET_BITS; - } - if (b >= EBOFS_NUM_FREE_BUCKETS) - b = EBOFS_NUM_FREE_BUCKETS-1; - return b; - } - - int find(Extent& ex, int bucket, block_t num, block_t near, int dir = DIR_ANY); - - void dump_freelist(); - - public: - int _release_into_limbo(Extent& ex); - - int _release_loner(Extent& ex); // release loner extent - int _release_merge(Extent& ex); // release any extent (searches for adjacent) - - //int _alloc_loner_inc(Extent& ex); - //int _alloc_loner_dec(Extent& ex); - - - public: - Allocator(Ebofs *f) : fs(f), last_pos(0) {} - - int allocate(Extent& ex, block_t num, block_t near=NEAR_LAST); - int release(Extent& ex); // alias for alloc_dec - - int alloc_inc(Extent ex); - int alloc_dec(Extent ex); - - - int unallocate(Extent& ex) { // skip limbo - return _release_merge(ex); - } - - int commit_limbo(); // limbo -> fs->limbo_tab - int release_limbo(); // fs->limbo_tab -> free_tabs - -}; - -#endif diff --git a/branches/sage/ebofs2/ebofs/BlockDevice.cc b/branches/sage/ebofs2/ebofs/BlockDevice.cc deleted file mode 100644 index 94c108db2612c..0000000000000 --- a/branches/sage/ebofs2/ebofs/BlockDevice.cc +++ /dev/null @@ -1,846 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General 
Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "BlockDevice.h" - -#include "config.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -#ifndef __CYGWIN__ -#ifndef DARWIN -#include -#endif -#endif - - - -/******************************************* - * biovec - */ - -inline ostream& operator<<(ostream& out, BlockDevice::biovec &bio) -{ - out << "bio("; - if (bio.type == BlockDevice::biovec::IO_READ) out << "rd "; - if (bio.type == BlockDevice::biovec::IO_WRITE) out << "wr "; - out << bio.start << "~" << bio.length; - if (bio.note) out << " " << bio.note; - out << " " << &bio; - out << ")"; - return out; -} - - - -/******************************************* - * ElevatorQueue - */ - -#define dout(x) if (x <= g_conf.debug_bdev) *_dout << dbeginl << g_clock.now() << " bdev(" << dev << ").elevatorq." -#define derr(x) if (x <= g_conf.debug_bdev) *_derr << dbeginl << g_clock.now() << " bdev(" << dev << ").elevatorq." - - -int BlockDevice::ElevatorQueue::dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& block_lock) -{ - // queue empty? - assert(!io_map.empty()); - - dout(20) << "dequeue_io el_pos " << el_pos << " dir " << el_dir_forward << dendl; - - // find our position: i >= pos - map::iterator i; - - int tries = 2; - while (tries > 0) { - if (el_dir_forward) { - i = io_map.lower_bound(el_pos); - if (i != io_map.end()) { - break; // not at end. good. - } - } else { - i = io_map.upper_bound(el_pos); - if (i != io_map.begin()) { - i--; // and back down one (to get i <= pos). good. - break; - } - } - - // reverse (or initial startup)? 
- if (g_conf.bdev_el_bidir || !el_dir_forward) { - // dout(20) << "restart reversing" << dendl; - el_dir_forward = !el_dir_forward; - } - - if (el_dir_forward) { - // forward - el_pos = 0; - - if (g_conf.bdev_el_fw_max_ms) { - el_stop = g_clock.now(); - utime_t max(0, 1000*g_conf.bdev_el_fw_max_ms); // (s,us), convert ms -> us! - el_stop += max; - // dout(20) << "restart forward sweep for " << max << dendl; - } else { - // dout(20) << "restart fowrard sweep" << dendl; - } - } else { - // reverse - el_pos = bdev->get_num_blocks(); - - if (g_conf.bdev_el_bw_max_ms) { - el_stop = g_clock.now(); - utime_t max(0, 1000*g_conf.bdev_el_bw_max_ms); // (s,us), convert ms -> us! - el_stop += max; - // dout(20) << "restart reverse sweep for " << max << dendl; - } else { - // dout(20) << "restart reverse sweep" << dendl; - } - } - - tries--; - } - - assert(tries > 0); // this shouldn't happen if the queue is non-empty. - - // get some biovecs - int num_bio = 0; - - dout(20) << "dequeue_io starting with " << i->first << " " << *i->second << dendl; - - // merge contiguous ops - char type = i->second->type; // read or write - int num_iovs = 0; // count eventual iov's for readv/writev - - start = i->first; - length = 0; - - if (el_dir_forward) - el_pos = start; - else - el_pos = i->first + i->second->length; - - // while (contiguous) - while ((( el_dir_forward && el_pos == i->first) || - (!el_dir_forward && el_pos == i->first + i->second->length)) && - type == i->second->type) { - biovec *bio = i->second; - - // allowed? (not already submitted to kernel?) - if (block_lock.intersects(bio->start, bio->length)) { - dout(20) << "dequeue_io " << bio->start << "~" << bio->length - << " intersects block_lock " << block_lock << dendl; - break; // stop, or go with what we've got so far - } - - // add to biols - int nv = bio->bl.buffers().size(); // how many iov's in this bio's bufferlist? 
- if (num_bio && - num_iovs + nv >= IOV_MAX) break; // to many //g_conf.bdev_iov_max) break; // too many! - num_iovs += nv; - - start = MIN(start, bio->start); - length += bio->length; - - if (el_dir_forward) { - dout(20) << "dequeue_io fw dequeue io at " << el_pos << " " << *i->second << dendl; - biols.push_back(bio); // add at back - } else { - dout(20) << "dequeue_io bw dequeue io at " << el_pos << " " << *i->second << dendl; - biols.push_front(bio); // add at front - } - num_bio++; - - // move elevator pointer - bool at_end = false; - map::iterator prev = i; - if (el_dir_forward) { - el_pos += bio->length; // cont. next would start right after us - i++; - if (i == io_map.end()) { - at_end = true; - } - } else { - el_pos -= bio->length; - if (i == io_map.begin()) { - at_end = true; - } else { - i--; - } - } - - // dequeue - io_map.erase(prev); - bio->in_queue = 0; - - if (at_end) break; - } - - return num_bio; -} - - - -/******************************************* - * BarrierQueue - */ -#undef dout -#define dout(x) if (x <= g_conf.debug_bdev) *_dout << dbeginl << g_clock.now() << " bdev(" << dev << ").barrierq." - -void BlockDevice::BarrierQueue::barrier() -{ - if (!qls.empty() && qls.front()->empty()) { - assert(qls.size() == 1); - dout(10) << "barrier not adding new queue, front is empty" << dendl; - } else { - qls.push_back(new ElevatorQueue(bdev, dev)); - dout(10) << "barrier adding new elevator queue (now " << qls.size() << "), front queue has " - << qls.front()->size() << " ios left" << dendl; - } -} - -bool BlockDevice::BarrierQueue::bump() -{ - assert(!qls.empty()); - - // is the front queue empty? 
- if (qls.front()->empty() && - qls.front() != qls.back()) { - delete qls.front(); - qls.pop_front(); - dout(10) << "dequeue_io front empty, moving to next queue (" << qls.front()->size() << ")" << dendl; - return true; - } - - return false; -} - -int BlockDevice::BarrierQueue::dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& locked) -{ - assert(!qls.empty()); - int n = qls.front()->dequeue_io(biols, start, length, locked); - bump(); // in case we emptied the front queue - return n; -} - - - - -/******************************************* - * BlockDevice - */ - -#undef dout -#define dout(x) if (x <= g_conf.debug_bdev) *_dout << dbeginl << g_clock.now() << " bdev(" << dev << ")." - - - -block_t BlockDevice::get_num_blocks() -{ - if (!num_blocks) { - assert(fd > 0); - - int r; -#ifdef BLKGETSIZE64 - // ioctl block device - uint64_t bytes = 0; - r = ioctl(fd, BLKGETSIZE64, &bytes); - num_blocks = bytes / (uint64_t)EBOFS_BLOCK_SIZE; - if (r == 0) { - dout(10) << "get_num_blocks ioctl BLKGETSIZE64 reports " - << num_blocks << " 4k blocks, " - << bytes << " bytes" - << dendl; -#else - // hrm, try the 32 bit ioctl? - unsigned long sectors = 0; - r = ioctl(fd, BLKGETSIZE, §ors); - num_blocks = sectors/8ULL; - bytes = sectors*512ULL; - if (r == 0) { - dout(10) << "get_num_blocks ioctl BLKGETSIZE reports " << sectors << " sectors, " - << num_blocks << " 4k blocks, " << bytes << " bytes" << dendl; -#endif - } else { - // hmm, try stat! 
- dout(10) << "get_num_blocks ioctl(2) failed with " << errno << " " << strerror(errno) << ", using stat(2)" << dendl; - struct stat st; - fstat(fd, &st); - uint64_t bytes = st.st_size; - num_blocks = bytes / EBOFS_BLOCK_SIZE; - dout(10) << "get_num_blocks stat reports " << num_blocks << " 4k blocks, " << bytes << " bytes" << dendl; - } - - if (g_conf.bdev_fake_mb) { - num_blocks = g_conf.bdev_fake_mb * 256; - dout(0) << "faking dev size " << g_conf.bdev_fake_mb << " mb" << dendl; - } - if (g_conf.bdev_fake_max_mb && - num_blocks > (block_t)g_conf.bdev_fake_max_mb * 256ULL) { - dout(0) << "faking dev size " << g_conf.bdev_fake_max_mb << " mb" << dendl; - num_blocks = g_conf.bdev_fake_max_mb * 256; - } - - } - return num_blocks; -} - - - -/** io thread - * each worker thread dequeues ios from the root_queue and submits them to the kernel. - */ -void* BlockDevice::io_thread_entry() -{ - lock.Lock(); - - int whoami = io_threads_started++; - io_threads_running++; - assert(io_threads_running <= g_conf.bdev_iothreads); - dout(10) << "io_thread" << whoami << " start, " << io_threads_running << " now running" << dendl; - - // get my own fd (and file position pointer) - int fd = open_fd(); - assert(fd > 0); - - while (!io_stop) { - if (!root_queue.empty()) { - dout(20) << "io_thread" << whoami << "/" << io_threads_running << " going" << dendl; - - block_t start, length; - list biols; - int n = root_queue.dequeue_io(biols, start, length, io_block_lock); - - if (n == 0) { - // failed to dequeue a do-able op, sleep for now - dout(20) << "io_thread" << whoami << "/" << io_threads_running << " couldn't dequeue doable op, sleeping" << dendl; - assert(io_threads_running > 1); // there must be someone else, if we couldn't dequeue something doable. 
- } - else { - // lock blocks - assert(start == biols.front()->start); - io_block_lock.insert(start, length); - - // drop lock to do the io - lock.Unlock(); - do_io(fd, biols); - lock.Lock(); - - // unlock blocks - io_block_lock.erase(start, length); - - // someone might have blocked on our block_lock? - if (io_threads_running < g_conf.bdev_iothreads && - (int)root_queue.size() > io_threads_running) - io_wakeup.SignalAll(); - - // loop again (don't sleep) - continue; - } - } - - // sleep - io_threads_running--; - dout(20) << "io_thread" << whoami << " sleeping, " - << io_threads_running << " threads now running," - << " queue has " << root_queue.size() - << dendl; - - // first wait for signal | timeout? - if (g_conf.bdev_idle_kick_after_ms > 0 && - idle_kicker && - io_threads_running == 0 && !is_idle_waiting) { // only the last thread asleep needs to kick. - // sleep, but just briefly. - dout(20) << "io_thread" << whoami << " doing short wait, to see if i stay idle" << dendl; - is_idle_waiting = true; - int r = io_wakeup.WaitInterval(lock, utime_t(0, g_conf.bdev_idle_kick_after_ms*1000)); - is_idle_waiting = false; - - if (r == ETIMEDOUT) { - dout(20) << "io_thread" << whoami << " timeout expired, kicking ebofs" << dendl; - kicker_cond.Signal(); // signal kicker thread - } else { - dout(20) << "io_thread" << whoami << " signaled during short sleep, waking up" << dendl; - goto wake_up; - } - } - - // sleeeep - io_wakeup.Wait(lock); // and wait (if condition still holds) - - wake_up: - io_threads_running++; - assert(io_threads_running <= g_conf.bdev_iothreads); - dout(20) << "io_thread" << whoami << "/" << io_threads_running << " woke up, " << io_threads_running << " threads now running" << dendl; - } - - // clean up - ::close(fd); - io_threads_running--; - - lock.Unlock(); - - dout(10) << "io_thread" << whoami << " finish" << dendl; - return 0; -} - - - -/** do_io - * do a single io operation - * (lock is NOT held, but we own the *biovec) - */ -void 
BlockDevice::do_io(int fd, list& biols) -{ - int r; - assert(!biols.empty()); - - // get full range, type, bl - bufferlist bl; - bl.claim(biols.front()->bl); - block_t start = biols.front()->start; - block_t length = biols.front()->length; - char type = biols.front()->type; - - list::iterator p = biols.begin(); - int numbio = 1; - for (p++; p != biols.end(); p++) { - length += (*p)->length; - bl.claim_append((*p)->bl); - numbio++; - } - - // do it - dout(20) << "do_io start " << (type==biovec::IO_WRITE?"write":"read") - << " " << start << "~" << length - << " " << numbio << " bits" << dendl; - if (type == biovec::IO_WRITE) { - r = _write(fd, start, length, bl); - } else if (type == biovec::IO_READ) { - r = _read(fd, start, length, bl); - } else assert(0); - dout(20) << "do_io finish " << (type==biovec::IO_WRITE?"write":"read") - << " " << start << "~" << length << dendl; - - // set rval - for (p = biols.begin(); p != biols.end(); p++) - (*p)->rval = r; - - if (1) { - // put in completion queue - complete_lock.Lock(); - complete_queue.splice( complete_queue.end(), biols ); - complete_queue_len += numbio; - complete_wakeup.Signal(); - complete_lock.Unlock(); - dout(20) << "do_io kicked completer on " << (type==biovec::IO_WRITE?"write":"read") - << " " << start << "~" << length << dendl; - - } else { - // be slow and finish synchronously - for (p = biols.begin(); p != biols.end(); p++) - finish_io(*p); - } -} - - -/** finish_io - * - * finish an io by signaling the cond or performing a callback. - * called by completion thread, unless that's disabled above. - */ -void BlockDevice::finish_io(biovec *bio) -{ - bio->done = true; - if (bio->cond) { - lock.Lock(); // hmm? 
- bio->cond->Signal(); - lock.Unlock(); - } - else if (bio->cb) { - bio->cb->finish((ioh_t)bio, bio->rval); - delete bio->cb; - delete bio; - } -} - -/*** completion_thread - * handle Cond signals or callbacks for completed ios - */ -void* BlockDevice::complete_thread_entry() -{ - complete_lock.Lock(); - dout(10) << "complete_thread start" << dendl; - - while (!io_stop) { - - while (!complete_queue.empty()) { - list ls; - ls.swap(complete_queue); - dout(10) << "complete_thread grabbed " << complete_queue_len << " biovecs" << dendl; - complete_queue_len = 0; - - complete_lock.Unlock(); - - // finish - for (list::iterator p = ls.begin(); - p != ls.end(); - p++) { - biovec *bio = *p; - dout(20) << "complete_thread finishing " << *bio << dendl; - finish_io(bio); - } - - complete_lock.Lock(); - } - if (io_stop) break; - - dout(25) << "complete_thread sleeping" << dendl; - complete_wakeup.Wait(complete_lock); - } - - dout(10) << "complete_thread finish" << dendl; - complete_lock.Unlock(); - return 0; -} - - -/*** idle kicker thread - * kick ebofs when we're idle. we're a separate thread (yuck) - * because ebofs may be holding it's lock _and_ waiting for us - * to do useful work. that rules out io_thread and complete_thread! - */ -void* BlockDevice::kicker_thread_entry() -{ - lock.Lock(); - dout(10) << "kicker_thread start" << dendl; - - while (!io_stop) { - - if (io_threads_running == 0 && idle_kicker) { - dout(25) << "kicker_thread kicking ebofs" << dendl; - lock.Unlock(); - idle_kicker->kick(); - lock.Lock(); - dout(25) << "kicker_thread done kicking ebofs" << dendl; - } - if (io_stop) break; - - dout(25) << "kicker_thread sleeping" << dendl; - kicker_cond.Wait(lock); - } - - dout(10) << "kicker_thread finish" << dendl; - lock.Unlock(); - return 0; -} - - - - -// io queue - -void BlockDevice::_submit_io(biovec *b) -{ - // NOTE: lock must be held - dout(15) << "_submit_io " << *b << dendl; - - // wake up io_thread(s)? 
- if ((int)root_queue.size() == io_threads_running) - io_wakeup.SignalOne(); - else if ((int)root_queue.size() > io_threads_running) - io_wakeup.SignalAll(); - - // queue - root_queue.submit_io(b); - - /* - // [DEBUG] check for overlapping ios - // BUG: this doesn't detect all overlaps w/ the next queue thing. - if (g_conf.bdev_debug_check_io_overlap) { - // BUG: this doesn't catch everything! eg 1~10000000 will be missed.... - multimap::iterator p = io_queue.lower_bound(b->start); - if ((p != io_queue.end() && - p->first < b->start+b->length) || - (p != io_queue.begin() && - (p--, p->second->start + p->second->length > b->start))) { - dout(1) << "_submit_io new io " << *b - << " overlaps with existing " << *p->second << dendl; - cerr << "_submit_io new io " << *b - << " overlaps with existing " << *p->second << dendl; - } - } - */ - -} - -int BlockDevice::_cancel_io(biovec *bio) -{ - // NOTE: lock must be held - - if (bio->in_queue == 0) { - dout(15) << "_cancel_io " << *bio << " FAILED" << dendl; - return -1; - } else { - dout(15) << "_cancel_io " << *bio << dendl; - bio->in_queue->cancel_io(bio); - if (root_queue.bump()) - io_wakeup.SignalAll(); // something happened! 
- return 0; - } -} - - - -// low level io - -int BlockDevice::_read(int fd, block_t bno, unsigned num, bufferlist& bl) -{ - dout(10) << "_read " << bno << "~" << num << dendl; - - assert(fd > 0); - - off_t offset = bno * EBOFS_BLOCK_SIZE; - off_t actual = lseek(fd, offset, SEEK_SET); - assert(actual == offset); - - size_t len = num*EBOFS_BLOCK_SIZE; - assert(bl.length() >= len); - - struct iovec iov[ bl.buffers().size() ]; - int n = 0; - size_t left = len; - for (list::const_iterator i = bl.buffers().begin(); - i != bl.buffers().end(); - i++) { - assert(i->length() % EBOFS_BLOCK_SIZE == 0); - - iov[n].iov_base = (void*)i->c_str(); - iov[n].iov_len = MIN(left, i->length()); - - left -= iov[n].iov_len; - n++; - if (left == 0) break; - } - - int got = ::readv(fd, iov, n); - assert(got <= (int)len); - - return 0; -} - -int BlockDevice::_write(int fd, unsigned bno, unsigned num, bufferlist& bl) -{ - dout(10) << "_write " << bno << "~" << num << dendl; - - assert(fd > 0); - - while (1) { - off_t offset = (off_t)bno << EBOFS_BLOCK_BITS; - assert((off_t)bno * (off_t)EBOFS_BLOCK_SIZE == offset); - off_t actual = lseek(fd, offset, SEEK_SET); - assert(actual == offset); - - // write buffers - size_t len = num*EBOFS_BLOCK_SIZE; - - struct iovec iov[ bl.buffers().size() ]; - - int n = 0; - size_t left = len; - for (list::const_iterator i = bl.buffers().begin(); - i != bl.buffers().end(); - i++) { - assert(i->length() % EBOFS_BLOCK_SIZE == 0); - - iov[n].iov_base = (void*)i->c_str(); - iov[n].iov_len = MIN(left, i->length()); - - assert((((intptr_t)iov[n].iov_base) & ((intptr_t)4095ULL)) == 0); - assert((iov[n].iov_len & 4095) == 0); - - left -= iov[n].iov_len; - n++; - if (left == 0 || - n == IOV_MAX) break; - } - - int r = ::writev(fd, iov, n); - - if (r < 0) { - dout(1) << "couldn't write bno " << bno << " num " << num - << " (" << len << " bytes) in " << n << " iovs, r=" << r - << " errno " << errno << " " << strerror(errno) << dendl; - dout(1) << "bl is " << bl << dendl; - 
assert(0); - } else if (r < (int)len) { - // hrm, we didn't write _all_ of our data. WTF kind of FS is this? - dout(-1) << "bloody hell, writev only wrote " << r << " of " << len << " bytes, looping" << dendl; - assert(r % 4096 == 0); - int wrote = r / 4096; - bno += wrote; - num -= wrote; - bufferlist tail; - tail.substr_of(bl, r, len-r); - bl.claim(tail); - continue; - } else { - // yay - assert(r == (int)len); - break; - } - } - return 0; -} - - - -// open/close - -int BlockDevice::open_fd() -{ -#ifdef DARWIN - int fd = ::open(dev.c_str(), O_RDWR|O_SYNC, 0); - ::fcntl(fd, F_NOCACHE); - return fd; -#else - return ::open(dev.c_str(), O_RDWR|O_SYNC|O_DIRECT, 0); -#endif -} - -int BlockDevice::open(kicker *idle) -{ - assert(fd == 0); - - // open? - fd = open_fd(); - if (fd < 0) { - dout(1) << "open failed, r = " << fd << " " << strerror(errno) << dendl; - fd = 0; - return -1; - } - - // lock - if (g_conf.bdev_lock) { - int r = ::flock(fd, LOCK_EX|LOCK_NB); - if (r < 0) { - derr(1) << "open " << dev << " failed to get LOCK_EX" << dendl; - return -1; - } - } - - // figure size - block_t b = get_num_blocks(); - if (!b) { - dout(0) << "open can't determine size of device" << dendl; - assert(0); - } - dout(2) << "open " << b << " blocks, " << b*4096 << " bytes" << dendl; - - // start thread - io_threads_started = 0; - io_threads.clear(); - for (int i=0; icreate(); - } - complete_thread.create(); - kicker_thread.create(); - - // idle kicker? 
- idle_kicker = idle; - - return fd; -} - - -/* - * warning: ebofs shoudl drop it's lock before calling close(), - * or else deadlock against the idle kicker - */ -int BlockDevice::close() -{ - assert(fd>0); - - idle_kicker = 0; - - // shut down io thread - dout(10) << "close stopping io+complete threads" << dendl; - lock.Lock(); - complete_lock.Lock(); - io_stop = true; - io_wakeup.SignalAll(); - complete_wakeup.SignalAll(); - kicker_cond.Signal(); - complete_lock.Unlock(); - lock.Unlock(); - - for (int i=0; ijoin(); - delete io_threads[i]; - } - io_threads.clear(); - - complete_thread.join(); - kicker_thread.join(); - - io_stop = false; // in case we start again - - dout(2) << "close " << dendl; - - if (g_conf.bdev_lock) - ::flock(fd, LOCK_UN); - - ::close(fd); - fd = 0; - - return 0; -} - -int BlockDevice::cancel_io(ioh_t ioh) -{ - biovec *pbio = (biovec*)ioh; - - lock.Lock(); - int r = _cancel_io(pbio); - lock.Unlock(); - - // FIXME? - if (r == 0 && pbio->cb) { - //pbio->cb->finish(ioh, 0); - delete pbio->cb; - delete pbio; - } - - return r; -} - diff --git a/branches/sage/ebofs2/ebofs/BlockDevice.h b/branches/sage/ebofs2/ebofs/BlockDevice.h deleted file mode 100644 index 295ea6b55b75f..0000000000000 --- a/branches/sage/ebofs2/ebofs/BlockDevice.h +++ /dev/null @@ -1,351 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __EBOFS_BLOCKDEVICE_H -#define __EBOFS_BLOCKDEVICE_H - -#include "include/buffer.h" -#include "include/interval_set.h" -#include "include/Context.h" -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/Thread.h" - -#include "types.h" - - -typedef void *ioh_t; // opaque handle to an io request. (in actuality, a biovec*) - - -class BlockDevice { - public: - // callback type for io completion notification - class callback { - public: - virtual ~callback() {} - virtual void finish(ioh_t ioh, int rval) = 0; - }; - - // kicker for idle notification - class kicker { - public: - virtual ~kicker() {} - virtual void kick() = 0; - }; - - - /********************************************************/ - - class Queue; - - // io item - // two variants: one with Cond*, one with callback*. - class biovec { - public: - static const char IO_WRITE = 1; - static const char IO_READ = 2; - - char type; - block_t start, length; - bufferlist bl; - callback *cb; - Cond *cond; - int rval; - char *note; - bool done; - - Queue *in_queue; - - biovec(char t, block_t s, block_t l, bufferlist& b, callback *c, char *n=0) : - type(t), start(s), length(l), bl(b), cb(c), cond(0), rval(0), note(n), done(false), in_queue(0) {} - biovec(char t, block_t s, block_t l, bufferlist& b, Cond *c, char *n=0) : - type(t), start(s), length(l), bl(b), cb(0), cond(c), rval(0), note(n), done(false), in_queue(0) {} - }; - friend ostream& operator<<(ostream& out, biovec &bio); - - - /********************************************************/ - - /* - * Queue -- abstract IO queue interface - */ - class Queue { - public: - virtual ~Queue() {} - virtual void submit_io(biovec *b) = 0; - virtual void cancel_io(biovec *b) = 0; - virtual int dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& locked) = 0; - virtual int size() = 0; - virtual bool empty() { return size() == 0; } - }; - - /* - * ElevatorQueue - simple elevator scheduler queue - */ - class ElevatorQueue 
: public Queue { - BlockDevice *bdev; - const char *dev; - map io_map; - bool el_dir_forward; - block_t el_pos; - utime_t el_stop; - - public: - ElevatorQueue(BlockDevice *bd, const char *d) : - bdev(bd), dev(d), - el_dir_forward(false), - el_pos(0) {} - void submit_io(biovec *b) { - b->in_queue = this; - assert(io_map.count(b->start) == 0); - io_map[b->start] = b; - } - void cancel_io(biovec *b) { - assert(b->in_queue == this); - assert(io_map.count(b->start) && - io_map[b->start] == b); - io_map.erase(b->start); - b->in_queue = 0; - } - int dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& locked); - int size() { - return io_map.size(); - } - }; - - /* - * BarrierQueue - lets you specify io "barriers" - * barrier() - force completion of all prior IOs before - * future ios are started. - * bump() - must be called after cancel_io to properly - * detect empty subqueue. - */ - class BarrierQueue : public Queue { - BlockDevice *bdev; - const char *dev; - list qls; - public: - BarrierQueue(BlockDevice *bd, const char *d) : bdev(bd), dev(d) { - barrier(); - } - ~BarrierQueue() { - for (list::iterator p = qls.begin(); - p != qls.end(); - ++p) - delete *p; - qls.clear(); - } - int size() { - // this isn't perfectly accurate. - if (!qls.empty()) - return qls.front()->size(); - return 0; - } - void submit_io(biovec *b) { - assert(!qls.empty()); - qls.back()->submit_io(b); - } - void cancel_io(biovec *b) { - assert(0); // shouldn't happen. - } - int dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& locked); - void barrier(); - bool bump(); - }; - - - private: - string dev; // my device file - int fd; - block_t num_blocks; - - Mutex lock; - - /** the root io queue. - * i current assumeit's a barrier queue,but this can be changed - * with some minor rearchitecting. 
- */ - BarrierQueue root_queue; - - /* io_block_lock - block ranges current dispatched to kernel - * once a bio is dispatched, it cannot be canceled, so an overlapping - * io and be submitted. the overlapping io cannot be dispatched - * to the kernel, however, until the original io finishes, or else - * there will be a race condition. - */ - interval_set io_block_lock; // blocks currently dispatched to kernel - - // io threads - Cond io_wakeup; - bool io_stop; - int io_threads_started, io_threads_running; - bool is_idle_waiting; - - void *io_thread_entry(); - - class IOThread : public Thread { - BlockDevice *dev; - public: - IOThread(BlockDevice *d) : dev(d) {} - void *entry() { return (void*)dev->io_thread_entry(); } - } ; - - vector io_threads; - - // private io interface - int open_fd(); // get an fd (for a thread) - - void _submit_io(biovec *b); - int _cancel_io(biovec *bio); - void do_io(int fd, list& biols); // called by an io thread - - // low level io - int _read(int fd, block_t bno, unsigned num, bufferlist& bl); - int _write(int fd, unsigned bno, unsigned num, bufferlist& bl); - - - // completion callback queue - Mutex complete_lock; - Cond complete_wakeup; - list complete_queue; - int complete_queue_len; - - void finish_io(biovec *bio); - - // complete thread - void *complete_thread_entry(); - class CompleteThread : public Thread { - BlockDevice *dev; - public: - CompleteThread(BlockDevice *d) : dev(d) {} - void *entry() { return (void*)dev->complete_thread_entry(); } - } complete_thread; - - // kicker - kicker *idle_kicker; // not used.. 
- Mutex kicker_lock; - Cond kicker_cond; - void *kicker_thread_entry(); - class KickerThread : public Thread { - BlockDevice *dev; - public: - KickerThread(BlockDevice *d) : dev(d) {} - void *entry() { return (void*)dev->complete_thread_entry(); } - } kicker_thread; - - - - public: - BlockDevice(const char *d) : - dev(d), fd(0), num_blocks(0), - root_queue(this, dev.c_str()), - io_stop(false), io_threads_started(0), io_threads_running(0), is_idle_waiting(false), - complete_queue_len(0), - complete_thread(this), - idle_kicker(0), kicker_thread(this) { } - ~BlockDevice() { - if (fd > 0) close(); - } - - // get size in blocks - block_t get_num_blocks(); - const char *get_device_name() const { return dev.c_str(); } - - // open/close - int open(kicker *idle = 0); - int close(); - - // state stuff - bool is_idle() { - lock.Lock(); - bool idle = (io_threads_running == 0) && root_queue.empty(); - lock.Unlock(); - return idle; - } - void barrier() { - lock.Lock(); - root_queue.barrier(); - lock.Unlock(); - } - - // ** blocking interface ** - - // read - int read(block_t bno, unsigned num, bufferptr& bptr, char *n=0) { - bufferlist bl; - bl.push_back(bptr); - return read(bno, num, bl, n); - } - int read(block_t bno, unsigned num, bufferlist& bl, char *n=0) { - Cond c; - biovec bio(biovec::IO_READ, bno, num, bl, &c, n); - - lock.Lock(); - _submit_io(&bio); - barrier(); // need this, to prevent starvation! - while (!bio.done) - c.Wait(lock); - lock.Unlock(); - return bio.rval; - } - - // write - int write(unsigned bno, unsigned num, bufferptr& bptr, char *n=0) { - bufferlist bl; - bl.push_back(bptr); - return write(bno, num, bl, n); - } - int write(unsigned bno, unsigned num, bufferlist& bl, char *n=0) { - Cond c; - biovec bio(biovec::IO_WRITE, bno, num, bl, &c, n); - - lock.Lock(); - _submit_io(&bio); - barrier(); // need this, to prevent starvation! 
- while (!bio.done) - c.Wait(lock); - lock.Unlock(); - return bio.rval; - } - - // ** non-blocking interface ** - ioh_t read(block_t bno, unsigned num, bufferlist& bl, callback *fin, char *n=0) { - biovec *pbio = new biovec(biovec::IO_READ, bno, num, bl, fin, n); - lock.Lock(); - _submit_io(pbio); - lock.Unlock(); - return (ioh_t)pbio; - } - ioh_t write(block_t bno, unsigned num, bufferlist& bl, callback *fin, char *n=0) { - biovec *pbio = new biovec(biovec::IO_WRITE, bno, num, bl, fin, n); - lock.Lock(); - _submit_io(pbio); - lock.Unlock(); - return (ioh_t)pbio; - } - int cancel_io(ioh_t ioh); - -}; - - - - -#endif diff --git a/branches/sage/ebofs2/ebofs/BufferCache.cc b/branches/sage/ebofs2/ebofs/BufferCache.cc deleted file mode 100644 index b1c98455f8278..0000000000000 --- a/branches/sage/ebofs2/ebofs/BufferCache.cc +++ /dev/null @@ -1,1228 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "BufferCache.h" -#include "Onode.h" - - -/*********** BufferHead **************/ - - -#undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) *_dout << dbeginl << g_clock.now() << " ebofs.bh." - - - - - - -/************ ObjectCache **************/ - - -#undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) *_dout << dbeginl << g_clock.now() << " ebofs.oc." - - - -void ObjectCache::rx_finish(ioh_t ioh, block_t start, block_t length, bufferlist& bl) -{ - list waiters; - - dout(10) << "rx_finish " << start << "~" << length << dendl; - for (map::iterator p = data.lower_bound(start); - p != data.end(); - p++) { - BufferHead *bh = p->second; - dout(10) << "rx_finish ?" 
<< *bh << dendl; - assert(p->first == bh->start()); - - // past? - if (p->first >= start+length) break; - if (bh->end() > start+length) break; // past - - assert(p->first >= start); - assert(bh->end() <= start+length); - - dout(10) << "rx_finish !" << *bh << dendl; - - if (bh->rx_ioh == ioh) - bh->rx_ioh = 0; - - if (bh->is_rx()) { - assert(bh->get_version() == 0); - assert(bh->end() <= start+length); - assert(bh->start() >= start); - dout(10) << "rx_finish rx -> clean on " << *bh << dendl; - bh->data.substr_of(bl, (bh->start()-start)*EBOFS_BLOCK_SIZE, bh->length()*EBOFS_BLOCK_SIZE); - bc->mark_clean(bh); - } - else if (bh->is_partial()) { - dout(10) << "rx_finish partial -> tx on " << *bh << dendl; - - if (1) { - // double-check what block i am - vector exv; - on->map_extents(bh->start(), 1, exv); - assert(exv.size() == 1); - block_t cur_block = exv[0].start; - assert(cur_block == bh->partial_tx_to); - } - - // ok, cancel my low-level partial (since we're still here, and can bh_write ourselves) - bc->cancel_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial_tx_epoch ); - - // apply partial to myself - assert(bh->data.length() == 0); - bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - bh->data.push_back( bp ); - bh->data.copy_in(0, EBOFS_BLOCK_SIZE, bl); - bh->apply_partial(); - - // write "normally" - bc->mark_dirty(bh); - bc->bh_write(on, bh, bh->partial_tx_to);//cur_block); - - // clean up a bit - bh->partial_tx_to = 0; - bh->partial_tx_epoch = 0; - bh->partial.clear(); - } - else { - dout(10) << "rx_finish ignoring status on (dirty|tx|clean) " << *bh << dendl; - assert(bh->is_dirty() || // was overwritten - bh->is_tx() || // was overwritten and queued - bh->is_clean()); // was overwritten, queued, _and_ flushed to disk - } - - // trigger waiters - for (map >::iterator p = bh->waitfor_read.begin(); - p != bh->waitfor_read.end(); - p++) { - assert(p->first >= bh->start() && p->first < bh->end()); - waiters.splice(waiters.begin(), p->second); 
- } - bh->waitfor_read.clear(); - } - - finish_contexts(waiters); -} - - -void ObjectCache::tx_finish(ioh_t ioh, block_t start, block_t length, - version_t version, version_t epoch) -{ - dout(10) << "tx_finish " << start << "~" << length << " v" << version << dendl; - for (map::iterator p = data.lower_bound(start); - p != data.end(); - p++) { - BufferHead *bh = p->second; - dout(30) << "tx_finish ?bh " << *bh << dendl; - assert(p->first == bh->start()); - - // past? - if (p->first >= start+length) { - bh->oc->try_merge_bh_right(p); - break; - } - - if (bh->tx_ioh == ioh) - bh->tx_ioh = 0; - - if (!bh->is_tx()) { - dout(10) << "tx_finish bh not marked tx, skipping" << dendl; - continue; - } - assert(bh->is_tx()); - - if (version == bh->version) { - dout(10) << "tx_finish tx -> clean on " << *bh << dendl; - assert(bh->end() <= start+length); - bh->set_last_flushed(version); - bc->mark_clean(bh); - bh->oc->try_merge_bh_left(p); - } else { - dout(10) << "tx_finish leaving tx, " << bh->version << " > " << version - << " on " << *bh << dendl; - assert(bh->version > version); - } - } -} - - - -/* - * return any bh's that are (partially) in this range that are TX. - */ -int ObjectCache::find_tx(block_t start, block_t len, - list& tx) -{ - map::iterator p = data.lower_bound(start); - - block_t cur = start; - block_t left = len; - - /* don't care about overlap, we want things _fully_ in start~len. - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - */ - - while (left > 0) { - assert(cur+left == start+len); - - // at end? - if (p == data.end()) - break; - - if (p->first <= cur) { - // have it (or part of it) - BufferHead *e = p->second; - - if (e->end() <= start+len && - e->is_tx()) - tx.push_back(e); - - block_t lenfromcur = MIN(e->end() - cur, left); - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; // more? 
- } else if (p->first > cur) { - // gap.. miss - block_t next = p->first; - left -= (next-cur); - cur = next; - continue; - } - else - assert(0); - } - - return 0; -} - - -int ObjectCache::try_map_read(block_t start, block_t len) -{ - map::iterator p = data.lower_bound(start); - - block_t cur = start; - block_t left = len; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - - int num_missing = 0; - - while (left > 0) { - // at end? - if (p == data.end()) { - // rest is a miss. - vector exv; - on->map_extents(cur, - left, // no prefetch here! - exv); - - num_missing += exv.size(); - left = 0; - cur = start+len; - break; - } - - if (p->first <= cur) { - // have it (or part of it) - BufferHead *e = p->second; - - if (e->is_clean() || - e->is_dirty() || - e->is_tx()) { - dout(20) << "try_map_read hit " << *e << dendl; - } - else if (e->is_rx()) { - dout(20) << "try_map_read rx " << *e << dendl; - num_missing++; - } - else if (e->is_partial()) { - dout(-20) << "try_map_read partial " << *e << dendl; - num_missing++; - } - else { - dout(0) << "try_map_read got unexpected " << *e << dendl; - assert(0); - } - - block_t lenfromcur = MIN(e->end() - cur, left); - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; // more? - } else if (p->first > cur) { - // gap.. miss - block_t next = p->first; - vector exv; - on->map_extents(cur, - MIN(next-cur, left), // no prefetch - exv); - - dout(20) << "try_map_read gap of " << p->first-cur << " blocks, " - << exv.size() << " extents" << dendl; - num_missing += exv.size(); - left -= (p->first - cur); - cur = p->first; - continue; // more? - } - else - assert(0); - } - - assert(left == 0); - assert(cur == start+len); - return num_missing; -} - - - - - -/* - * map a range of blocks into buffer_heads. - * - create missing buffer_heads as necessary. 
- * - fragment along disk extent boundaries - */ -int ObjectCache::map_read(block_t start, block_t len, - map& hits, - map& missing, - map& rx, - map& partial) { - - map::iterator p = data.lower_bound(start); - - block_t cur = start; - block_t left = len; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - - while (left > 0) { - // at end? - if (p == data.end()) { - // rest is a miss. - vector exv; - //on->map_extents(cur, left, exv); // we might consider some prefetch here. - on->map_extents(cur, - //MIN(left + g_conf.ebofs_max_prefetch, // prefetch - //on->object_blocks-cur), - left, // no prefetch - exv); - for (unsigned i=0; i 0; i++) { - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( exv[i].length ); - bc->add_bh(n); - missing[cur] = n; - dout(20) << "map_read miss " << left << " left, " << *n << dendl; - cur += MIN(left,exv[i].length); - left -= MIN(left,exv[i].length); - } - assert(left == 0); - assert(cur == start+len); - break; - } - - if (p->first <= cur) { - // have it (or part of it) - BufferHead *e = p->second; - - if (e->is_clean() || - e->is_dirty() || - e->is_tx()) { - hits[cur] = e; // readable! - dout(20) << "map_read hit " << *e << dendl; - bc->touch(e); - } - else if (e->is_rx()) { - rx[cur] = e; // missing, not readable. - dout(20) << "map_read rx " << *e << dendl; - } - else if (e->is_partial()) { - partial[cur] = e; - dout(20) << "map_read partial " << *e << dendl; - } - else { - dout(0) << "map_read ??? got unexpected " << *e << dendl; - assert(0); - } - - block_t lenfromcur = MIN(e->end() - cur, left); - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; // more? - } else if (p->first > cur) { - // gap.. 
miss - block_t next = p->first; - vector exv; - on->map_extents(cur, - //MIN(next-cur, MIN(left + g_conf.ebofs_max_prefetch, // prefetch - // on->object_blocks-cur)), - MIN(next-cur, left), // no prefetch - exv); - - for (unsigned i=0; i0; i++) { - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( exv[i].length ); - bc->add_bh(n); - missing[cur] = n; - cur += MIN(left, n->length()); - left -= MIN(left, n->length()); - dout(20) << "map_read gap " << *n << dendl; - } - continue; // more? - } - else - assert(0); - } - - assert(left == 0); - assert(cur == start+len); - return 0; -} - - -/* - * map a range of pages on an object's buffer cache. - * - * - break up bufferheads that don't fall completely within the range - * - cancel rx ops we obsolete. - * - resubmit rx ops if we split bufferheads - * - * - leave potentially obsoleted tx ops alone (for now) - * - don't worry about disk extent boundaries (yet) - */ -int ObjectCache::map_write(block_t start, block_t len, - map& hits, - version_t super_epoch) -{ - map::iterator p; - - // hack speed up common cases - if (start == 0) { - p = data.begin(); - } else if (start + len == on->object_blocks && len == 1 && !data.empty()) { - // append hack. - p = data.end(); - p--; - if (p->first < start) p++; - } else { - p = data.lower_bound(start); - } - - dout(10) << "map_write " << *on << " " << start << "~" << len << dendl; - // p->first >= start - - block_t cur = start; - block_t left = len; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - - //dump(); - - while (left > 0) { - // max for this bh (bc of (re)alloc on disk) - block_t max = left; - - // based on disk extent boundary ... - vector exv; - on->map_extents(cur, max, exv); - if (exv.size() > 1) - max = exv[0].length; - - dout(10) << "map_write " << cur << "~" << max << dendl; - - // at end? 
- if (p == data.end()) { - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( max ); - bc->add_bh(n); - hits[cur] = n; - left -= max; - cur += max; - continue; - } - - dout(10) << "p is " << *p->second << dendl; - - - if (p->first <= cur) { - BufferHead *bh = p->second; - dout(10) << "map_write bh " << *bh << " intersected" << dendl; - - if (p->first < cur) { - if (cur+max >= p->first+p->second->length()) { - // we want right bit (one splice) - if (bh->is_rx() && bc->bh_cancel_read(bh)) { - BufferHead *right = bc->split(bh, cur); - bc->bh_read(on, bh); // reread left bit - bh = right; - } else if (bh->is_tx() && bh->epoch_modified == super_epoch && bc->bh_cancel_write(bh, super_epoch)) { - BufferHead *right = bc->split(bh, cur); - bc->bh_write(on, bh); // rewrite left bit - bh = right; - } else { - bh = bc->split(bh, cur); // just split it - } - p++; - assert(p->second == bh); - } else { - // we want middle bit (two splices) - if (bh->is_rx() && bc->bh_cancel_read(bh)) { - BufferHead *middle = bc->split(bh, cur); - bc->bh_read(on, bh); // reread left - p++; - assert(p->second == middle); - BufferHead *right = bc->split(middle, cur+max); - bc->bh_read(on, right); // reread right - bh = middle; - } else if (bh->is_tx() && bh->epoch_modified == super_epoch && bc->bh_cancel_write(bh, super_epoch)) { - BufferHead *middle = bc->split(bh, cur); - bc->bh_write(on, bh); // redo left - p++; - assert(p->second == middle); - BufferHead *right = bc->split(middle, cur+max); - bc->bh_write(on, right); // redo right - bh = middle; - } else { - BufferHead *middle = bc->split(bh, cur); - p++; - assert(p->second == middle); - bc->split(middle, cur+max); - bh = middle; - } - } - } else if (p->first == cur) { - if (p->second->length() <= max) { - // whole bufferhead, piece of cake. 
- } else { - // we want left bit (one splice) - if (bh->is_rx() && bc->bh_cancel_read(bh)) { - BufferHead *right = bc->split(bh, cur+max); - bc->bh_read(on, right); // re-rx the right bit - } else if (bh->is_tx() && bh->epoch_modified == super_epoch && bc->bh_cancel_write(bh, super_epoch)) { - BufferHead *right = bc->split(bh, cur+max); - bc->bh_write(on, right); // re-tx the right bit - } else { - bc->split(bh, cur+max); // just split - } - } - } - - // try to cancel tx? - if (bh->is_tx() && bh->epoch_modified == super_epoch) bc->bh_cancel_write(bh, super_epoch); - - // put in our map - hits[cur] = bh; - - // keep going. - block_t lenfromcur = bh->end() - cur; - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; - } else { - // gap! - block_t next = p->first; - block_t glen = MIN(next-cur, max); - dout(10) << "map_write gap " << cur << "~" << glen << dendl; - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( glen ); - bc->add_bh(n); - hits[cur] = n; - - cur += glen; - left -= glen; - continue; // more? - } - } - - assert(left == 0); - assert(cur == start+len); - return 0; -} - -/* don't need this. -int ObjectCache::scan_versions(block_t start, block_t len, - version_t& low, version_t& high) -{ - map::iterator p = data.lower_bound(start); - // p->first >= start - - if (p != data.begin() && p->first > start) { - p--; // might overlap? - if (p->first + p->second->length() <= start) - p++; // doesn't overlap. - } - if (p->first >= start+len) - return -1; // to the right. no hits. - - // start - low = high = p->second->get_version(); - - for (p++; p != data.end(); p++) { - // past? 
- if (p->first >= start+len) break; - - const version_t v = p->second->get_version(); - if (low > v) low = v; - if (high < v) high = v; - } - - return 0; -} -*/ - -void ObjectCache::touch_bottom(block_t bstart, block_t blast) -{ - for (map::iterator p = data.lower_bound(bstart); - p != data.end(); - ++p) { - BufferHead *bh = p->second; - - // don't trim unless it's entirely in our range - if (bh->start() < bstart) continue; - if (bh->end() > blast) break; - - dout(12) << "moving " << *bh << " to bottom of lru" << dendl; - bc->touch_bottom(bh); // move to bottom of lru list - } -} - - -void ObjectCache::truncate(block_t blocks, version_t super_epoch) -{ - dout(7) << "truncate " << object_id - << " " << blocks << " blocks" - << dendl; - - while (!data.empty()) { - block_t bhoff = data.rbegin()->first; - BufferHead *bh = data.rbegin()->second; - - if (bh->end() <= blocks) break; - - bool uncom = on->uncommitted.contains(bh->start(), bh->length()); - dout(10) << "truncate " << *bh << " uncom " << uncom - << " of " << on->uncommitted - << dendl; - - if (bhoff < blocks) { - // we want right bit (one splice) - if (bh->is_rx() && bc->bh_cancel_read(bh)) { - BufferHead *right = bc->split(bh, blocks); - bc->bh_read(on, bh); // reread left bit - bh = right; - } else if (bh->is_tx() && uncom && bh->epoch_modified == super_epoch && bc->bh_cancel_write(bh, super_epoch)) { - BufferHead *right = bc->split(bh, blocks); - bc->bh_write(on, bh); // rewrite left bit - bh = right; - } else { - bh = bc->split(bh, blocks); // just split it - } - // no worries about partials up here, they're always 1 block (and thus never split) - } else { - // whole thing - // cancel any pending/queued io, if possible. 
- if (bh->is_rx()) - bc->bh_cancel_read(bh); - if (bh->is_tx() && uncom && bh->epoch_modified == super_epoch) - bc->bh_cancel_write(bh, super_epoch); - if (bh->shadow_of) { - dout(10) << "truncate " << *bh << " unshadowing " << *bh->shadow_of << dendl; - // shadow - bh->shadow_of->remove_shadow(bh); - if (bh->is_partial()) - bc->cancel_shadow_partial(bh->rx_from.start, bh); - } else { - // normal - if (bh->is_partial() && uncom) - bc->bh_cancel_partial_write(bh); - } - } - - for (map >::iterator p = bh->waitfor_read.begin(); - p != bh->waitfor_read.end(); - p++) { - finish_contexts(p->second, -1); - } - - bc->remove_bh(bh); - delete bh; - } -} - - -void ObjectCache::clone_to(Onode *other) -{ - ObjectCache *ton = 0; - - for (map::iterator p = data.begin(); - p != data.end(); - p++) { - BufferHead *bh = p->second; - dout(10) << "clone_to ? " << *bh << dendl; - if (bh->is_dirty() || bh->is_tx() || bh->is_partial()) { - // dup dirty or tx bh's - if (!ton) - ton = other->get_oc(bc); - BufferHead *nbh = new BufferHead(ton); - nbh->set_start( bh->start() ); - nbh->set_length( bh->length() ); - nbh->data = bh->data; // just copy refs to underlying buffers. - bc->add_bh(nbh); - - if (bh->is_partial()) { - dout(0) << "clone_to PARTIAL FIXME NOT FULLY IMPLEMENTED ******" << dendl; - nbh->partial = bh->partial; - bc->mark_partial(nbh); - // register as shadow_partial - bc->add_shadow_partial(bh->rx_from.start, nbh); - } else { - // clean buffer will shadow - bh->add_shadow(nbh); - bc->mark_clean(nbh); - } - - dout(10) << "clone_to dup " << *bh << " -> " << *nbh << dendl; - } - } -} - - - -BufferHead *ObjectCache::merge_bh_left(BufferHead *left, BufferHead *right) -{ - dout(10) << "merge_bh_left " << *left << " " << *right << dendl; - assert(left->end() == right->start()); - assert(left->is_clean()); - assert(right->is_clean()); - assert(right->get_num_ref() == 0); - - // hrm, is this right? 
- if (right->version > left->version) left->version = right->version; - if (right->last_flushed > left->last_flushed) left->last_flushed = right->last_flushed; - - left->set_length(left->length() + right->length()); - left->data.claim_append(right->data); - - // remove right - remove_bh(right); - bc->lru_rest.lru_remove(right); - delete right; - dout(10) << "merge_bh_left result " << *left << dendl; - return left; -} - -/* wait until this has a user -void ObjectCache::try_merge_bh(BufferHead *bh) -{ - dout(-10) << "try_merge_bh " << *bh << dendl; - - map::iterator p = data.lower_bound(bh->start()); - assert(p->second == bh); - - try_merge_bh_left(p); - try_merge_bh_right(p); -} -*/ - - -void ObjectCache::try_merge_bh_left(map::iterator& p) -{ - BufferHead *bh = p->second; - dout(10) << "try_merge_bh_left " << *bh << dendl; - - // left? - if (p != data.begin()) { - p--; - if (p->second->end() == bh->start() && - p->second->is_clean() && - bh->is_clean() && - bh->get_num_ref() == 0 && - bh->data.buffers().size() < 8 && - p->second->data.buffers().size() < 8) - bh = merge_bh_left(p->second, bh); // yay! - else - p++; // nope. - } -} - -void ObjectCache::try_merge_bh_right(map::iterator& p) -{ - BufferHead *bh = p->second; - dout(10) << "try_merge_bh_right " << *bh << dendl; - - // right? - map::iterator o = p; - p++; - if (p != data.end() && - bh->end() == p->second->start() && - p->second->is_clean() && - bh->is_clean() && - p->second->get_num_ref() == 0 && - bh->data.buffers().size() < 8 && - p->second->data.buffers().size() < 8) { - BufferHead *right = p->second; - p--; - merge_bh_left(bh, right); - } else - p = o; -} - - - -/************** BufferCache ***************/ - -#undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) *_dout << dbeginl << g_clock.now() << " ebofs.bc." 
- - - -BufferHead *BufferCache::split(BufferHead *orig, block_t after) -{ - dout(20) << "split " << *orig << " at " << after << dendl; - - // split off right - BufferHead *right = new BufferHead(orig->get_oc()); - right->set_version(orig->get_version()); - right->epoch_modified = orig->epoch_modified; - right->last_flushed = orig->last_flushed; - right->set_state(orig->get_state()); - - block_t newleftlen = after - orig->start(); - right->set_start( after ); - right->set_length( orig->length() - newleftlen ); - - // shorten left - stat_sub(orig); - orig->set_length( newleftlen ); - stat_add(orig); - - // add right - add_bh(right); - - // adjust rx_from - if (orig->is_rx()) { - right->rx_from = orig->rx_from; - orig->rx_from.length = newleftlen; - right->rx_from.length -= newleftlen; - right->rx_from.start += newleftlen; - } - - // dup shadows - for (set::iterator p = orig->shadows.begin(); - p != orig->shadows.end(); - ++p) - right->add_shadow(*p); - - // split buffers too - bufferlist bl; - bl.claim(orig->data); - if (bl.length()) { - assert(bl.length() == (orig->length()+right->length())*EBOFS_BLOCK_SIZE); - right->data.substr_of(bl, orig->length()*EBOFS_BLOCK_SIZE, right->length()*EBOFS_BLOCK_SIZE); - orig->data.substr_of(bl, 0, orig->length()*EBOFS_BLOCK_SIZE); - } - - // move read waiters - if (!orig->waitfor_read.empty()) { - map >::iterator o, p = orig->waitfor_read.end(); - p--; - while (p != orig->waitfor_read.begin()) { - if (p->first < right->start()) break; - dout(0) << "split moving waiters at block " << p->first << " to right bh" << dendl; - right->waitfor_read[p->first].swap( p->second ); - o = p; - p--; - orig->waitfor_read.erase(o); - } - } - - dout(20) << "split left is " << *orig << dendl; - dout(20) << "split right is " << *right << dendl; - return right; -} - - - - -void BufferCache::bh_read(Onode *on, BufferHead *bh, block_t from) -{ - dout(10) << "bh_read " << *on << " on " << *bh << dendl; - - if (bh->is_missing()) { - mark_rx(bh); - } else 
{ - assert(bh->is_partial()); - } - - // get extent. there should be only one! - vector exv; - on->map_extents(bh->start(), bh->length(), exv); - assert(exv.size() == 1); - Extent ex = exv[0]; - - if (from) { // force behavior, used for reading partials - dout(10) << "bh_read forcing read from block " << from << " (for a partial)" << dendl; - ex.start = from; - ex.length = 1; - } - - // this should be empty!! - assert(bh->rx_ioh == 0); - - dout(20) << "bh_read " << *on << " " << *bh << " from " << ex << dendl; - - C_OC_RxFinish *fin = new C_OC_RxFinish(ebofs_lock, on->oc, - bh->start(), bh->length(), - ex.start); - - //bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), fin->bl); // new buffers! - fin->bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - - bh->rx_ioh = dev.read(ex.start, ex.length, fin->bl, - fin); - bh->rx_from = ex; - on->oc->get(); - -} - -bool BufferCache::bh_cancel_read(BufferHead *bh) -{ - if (bh->rx_ioh && dev.cancel_io(bh->rx_ioh) >= 0) { - dout(10) << "bh_cancel_read on " << *bh << dendl; - bh->rx_ioh = 0; - mark_missing(bh); - int l = bh->oc->put(); - assert(l); - return true; - } - return false; -} - -void BufferCache::bh_write(Onode *on, BufferHead *bh, block_t shouldbe) -{ - dout(10) << "bh_write " << *on << " on " << *bh << " in epoch " << bh->epoch_modified << dendl; - assert(bh->get_version() > 0); - - assert(bh->is_dirty()); - mark_tx(bh); - - // get extents - vector exv; - on->map_extents(bh->start(), bh->length(), exv); - assert(exv.size() == 1); - Extent ex = exv[0]; - - if (shouldbe) - assert(ex.length == 1 && ex.start == shouldbe); - - dout(20) << "bh_write " << *on << " " << *bh << " to " << ex << dendl; - - //assert(bh->tx_ioh == 0); - - assert(bh->get_last_flushed() < bh->get_version()); - - bh->tx_block = ex.start; - bh->tx_ioh = dev.write(ex.start, ex.length, bh->data, - new C_OC_TxFinish(ebofs_lock, on->oc, - bh->start(), bh->length(), - bh->get_version(), - bh->epoch_modified), - "bh_write"); - - 
on->oc->get(); - inc_unflushed( EBOFS_BC_FLUSH_BHWRITE, bh->epoch_modified ); - - /* - // assert: no partials on the same block - // hose any partial on the same block - if (bh->partial_write.count(ex.start)) { - dout(10) << "bh_write hosing parital write on same block " << ex.start << " " << *bh << dendl; - dec_unflushed( bh->partial_write[ex.start].epoch ); - bh->partial_write.erase(ex.start); - } - */ -} - - -bool BufferCache::bh_cancel_write(BufferHead *bh, version_t cur_epoch) -{ - assert(bh->is_tx()); - assert(bh->epoch_modified == cur_epoch); - assert(bh->epoch_modified > 0); - if (bh->tx_ioh && dev.cancel_io(bh->tx_ioh) >= 0) { - dout(10) << "bh_cancel_write on " << *bh << dendl; - bh->tx_ioh = 0; - mark_dirty(bh); - - dec_unflushed( EBOFS_BC_FLUSH_BHWRITE, bh->epoch_modified ); // assert.. this should be the same epoch! - - int l = bh->oc->put(); - assert(l); - return true; - } - return false; -} - -void BufferCache::tx_finish(ObjectCache *oc, - ioh_t ioh, block_t start, block_t length, - version_t version, version_t epoch) -{ - ebofs_lock.Lock(); - - // finish oc - if (oc->put() == 0) { - delete oc; - } else - oc->tx_finish(ioh, start, length, version, epoch); - - // update unflushed counter - assert(get_unflushed(EBOFS_BC_FLUSH_BHWRITE, epoch) > 0); - dec_unflushed(EBOFS_BC_FLUSH_BHWRITE, epoch); - - ebofs_lock.Unlock(); -} - -void BufferCache::rx_finish(ObjectCache *oc, - ioh_t ioh, block_t start, block_t length, - block_t diskstart, - bufferlist& bl) -{ - ebofs_lock.Lock(); - dout(10) << "rx_finish ioh " << ioh << " on " << start << "~" << length - << ", at device block " << diskstart << dendl; - - // oc - if (oc->put() == 0) - delete oc; - else - oc->rx_finish(ioh, start, length, bl); - - // finish any partials? 
- // note: these are partials that were re-written after a commit, - // or for whom the OC was destroyed (eg truncated after a commit) - map >::iterator sp = partial_write.lower_bound(diskstart); - while (sp != partial_write.end()) { - if (sp->first >= diskstart+length) break; - assert(sp->first >= diskstart); - - block_t pblock = sp->first; - map writes; - writes.swap( sp->second ); - - map >::iterator t = sp; - sp++; - partial_write.erase(t); - - for (map::iterator p = writes.begin(); - p != writes.end(); - p++) { - dout(10) << "rx_finish partial from " << pblock << " -> " << p->first - << " for epoch " << p->second.epoch - //<< " (bh.epoch_modified is now " << bh->epoch_modified << ")" - << dendl; - // this had better be a past epoch - //assert(p->epoch == epoch_modified - 1); // ?? - - // make the combined block - bufferlist combined; - bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - combined.push_back( bp ); - combined.copy_in((pblock-diskstart)*EBOFS_BLOCK_SIZE, (pblock-diskstart+1)*EBOFS_BLOCK_SIZE, bl); - BufferHead::apply_partial( combined, p->second.partial ); - - // write it! - dev.write( pblock, 1, combined, - new C_OC_PartialTxFinish( this, p->second.epoch ), - "finish_partials"); - } - } - - // shadow partials? 
- { - list waiters; - map >::iterator sp = shadow_partials.lower_bound(diskstart); - while (sp != shadow_partials.end()) { - if (sp->first >= diskstart+length) break; - assert(sp->first >= diskstart); - - block_t pblock = sp->first; - set ls; - ls.swap( sp->second ); - - map >::iterator t = sp; - sp++; - shadow_partials.erase(t); - - for (set::iterator p = ls.begin(); - p != ls.end(); - ++p) { - BufferHead *bh = *p; - dout(10) << "rx_finish applying shadow_partial for " << pblock - << " to " << *bh << dendl; - bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - bh->data.clear(); - bh->data.push_back( bp ); - bh->data.copy_in((pblock-diskstart)*EBOFS_BLOCK_SIZE, - (pblock-diskstart+1)*EBOFS_BLOCK_SIZE, - bl); - bh->apply_partial(); - bh->set_state(BufferHead::STATE_CLEAN); - - // trigger waiters - for (map >::iterator p = bh->waitfor_read.begin(); - p != bh->waitfor_read.end(); - p++) { - assert(p->first >= bh->start() && p->first < bh->end()); - waiters.splice(waiters.begin(), p->second); - } - bh->waitfor_read.clear(); - } - } - - // kick waiters - finish_contexts(waiters); - } - - // done. 
- ebofs_lock.Unlock(); -} - -void BufferCache::partial_tx_finish(version_t epoch) -{ - ebofs_lock.Lock(); - - dout(10) << "partial_tx_finish in epoch " << epoch << dendl; - - // update unflushed counter - assert(get_unflushed(EBOFS_BC_FLUSH_PARTIAL, epoch) > 0); - dec_unflushed(EBOFS_BC_FLUSH_PARTIAL, epoch); - - ebofs_lock.Unlock(); -} - - - - -void BufferCache::bh_queue_partial_write(Onode *on, BufferHead *bh) -{ - assert(bh->get_version() > 0); - - assert(bh->is_partial()); - assert(bh->length() == 1); - - // get the block no - vector exv; - on->map_extents(bh->start(), bh->length(), exv); - assert(exv.size() == 1); - block_t b = exv[0].start; - assert(exv[0].length == 1); - bh->partial_tx_to = exv[0].start; - bh->partial_tx_epoch = bh->epoch_modified; - - dout(10) << "bh_queue_partial_write " << *on << " on " << *bh << " block " << b << " epoch " << bh->epoch_modified << dendl; - - - // copy map state, queue for this block - assert(bh->rx_from.length == 1); - queue_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial, bh->partial_tx_epoch ); -} - -void BufferCache::bh_cancel_partial_write(BufferHead *bh) -{ - assert(bh->is_partial()); - assert(bh->length() == 1); - - cancel_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial_tx_epoch ); -} - - -void BufferCache::queue_partial(block_t from, block_t to, - map& partial, version_t epoch) -{ - dout(10) << "queue_partial " << from << " -> " << to - << " in epoch " << epoch - << dendl; - - if (partial_write[from].count(to)) { - // this should be in the same epoch. - assert( partial_write[from][to].epoch == epoch); - assert(0); // actually.. no! 
- } else { - inc_unflushed( EBOFS_BC_FLUSH_PARTIAL, epoch ); - } - - partial_write[from][to].partial = partial; - partial_write[from][to].epoch = epoch; -} - -void BufferCache::cancel_partial(block_t from, block_t to, version_t epoch) -{ - assert(partial_write.count(from)); - assert(partial_write[from].count(to)); - assert(partial_write[from][to].epoch == epoch); - - dout(10) << "cancel_partial " << from << " -> " << to - << " (was epoch " << partial_write[from][to].epoch << ")" - << dendl; - - partial_write[from].erase(to); - if (partial_write[from].empty()) - partial_write.erase(from); - - dec_unflushed( EBOFS_BC_FLUSH_PARTIAL, epoch ); -} - - -void BufferCache::add_shadow_partial(block_t from, BufferHead *bh) -{ - dout(10) << "add_shadow_partial from " << from << " " << *bh << dendl; - shadow_partials[from].insert(bh); -} - -void BufferCache::cancel_shadow_partial(block_t from, BufferHead *bh) -{ - dout(10) << "cancel_shadow_partial from " << from << " " << *bh << dendl; - shadow_partials[from].erase(bh); -} diff --git a/branches/sage/ebofs2/ebofs/BufferCache.h b/branches/sage/ebofs2/ebofs/BufferCache.h deleted file mode 100644 index 346a5cc785618..0000000000000 --- a/branches/sage/ebofs2/ebofs/BufferCache.h +++ /dev/null @@ -1,723 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __EBOFS_BUFFERCACHE_H -#define __EBOFS_BUFFERCACHE_H - -#include "include/lru.h" -#include "include/Context.h" - -#include "common/Clock.h" - -#include "types.h" -#include "BlockDevice.h" - -#include "include/interval_set.h" -#include "include/xlist.h" - -class ObjectCache; -class BufferCache; -class Onode; - -class BufferHead : public LRUObject { - public: - /* - * - buffer_heads should always break across disk extent boundaries - * - partial buffer_heads are always 1 block. - */ - const static int STATE_MISSING = 0; // missing; data is on disk, but not loaded. - const static int STATE_CLEAN = 1; // Rw clean - const static int STATE_DIRTY = 2; // RW dirty - const static int STATE_TX = 3; // Rw flushing to disk - const static int STATE_RX = 4; // w reading from disk - const static int STATE_PARTIAL = 5; // reading from disk, + partial content map. always 1 block. - - public: - ObjectCache *oc; - - bufferlist data; - - ioh_t rx_ioh; // - Extent rx_from; - ioh_t tx_ioh; // - block_t tx_block; - block_t partial_tx_to; - version_t partial_tx_epoch; - - map partial; // partial dirty content overlayed onto incoming data - - map< block_t, list > waitfor_read; - - set shadows; // shadow bh's that clone()ed me. 
- BufferHead* shadow_of; - - - private: - int ref; - int state; - - public: - version_t epoch_modified; - - version_t version; // current version in cache - version_t last_flushed; // last version flushed to disk - - Extent object_loc; // block position _in_object_ - - utime_t dirty_stamp; - //xlist::item xlist_dirty; - - bool want_to_expire; // wants to be at bottom of lru - - public: - BufferHead(ObjectCache *o) : - oc(o), //cancellable_ioh(0), tx_epoch(0), - rx_ioh(0), tx_ioh(0), tx_block(0), partial_tx_to(0), partial_tx_epoch(0), - shadow_of(0), - ref(0), state(STATE_MISSING), epoch_modified(0), version(0), last_flushed(0), - //xlist_dirty(this), - want_to_expire(false) - {} - ~BufferHead() { - unpin_shadows(); - } - - ObjectCache *get_oc() { return oc; } - - int get() { - assert(ref >= 0); - if (ref == 0) lru_pin(); - return ++ref; - } - int put() { - assert(ref > 0); - if (ref == 1) lru_unpin(); - --ref; - return ref; - } - int get_num_ref() { return ref; } - - block_t start() { return object_loc.start; } - void set_start(block_t s) { object_loc.start = s; } - block_t length() { return object_loc.length; } - void set_length(block_t l) { object_loc.length = l; } - block_t end() { return start() + length(); } - block_t last() { return end()-1; } - - version_t get_version() { return version; } - void set_version(version_t v) { version = v; } - version_t get_last_flushed() { return last_flushed; } - void set_last_flushed(version_t v) { - if (v <= last_flushed) cout << "last_flushed begin set to " << v << ", was " << last_flushed << std::endl; - assert(v > last_flushed); - last_flushed = v; - } - - utime_t get_dirty_stamp() { return dirty_stamp; } - void set_dirty_stamp(utime_t t) { dirty_stamp = t; } - - void set_state(int s) { - if (s == STATE_PARTIAL || s == STATE_RX || s == STATE_TX) get(); - if (state == STATE_PARTIAL || state == STATE_RX || state == STATE_TX) put(); - - if ((state == STATE_TX && s != STATE_TX) || - (state == STATE_PARTIAL && s != 
STATE_PARTIAL)) - unpin_shadows(); - - state = s; - } - int get_state() { return state; } - - bool is_missing() { return state == STATE_MISSING; } - bool is_dirty() { return state == STATE_DIRTY; } - bool is_clean() { return state == STATE_CLEAN; } - bool is_tx() { return state == STATE_TX; } - bool is_rx() { return state == STATE_RX; } - bool is_partial() { return state == STATE_PARTIAL; } - - //bool is_partial_writes() { return !partial_write.empty(); } - //void finish_partials(); - //void cancel_partials(); - //void queue_partial_write(block_t b); - - void add_shadow(BufferHead *dup) { - shadows.insert(dup); - dup->shadow_of = this; - dup->get(); - } - void remove_shadow(BufferHead *dup) { - shadows.erase(dup); - dup->shadow_of = 0; - dup->put(); - } - void unpin_shadows() { - for (set::iterator p = shadows.begin(); - p != shadows.end(); - ++p) { - //cout << "unpin shadow " << *p << std::endl; - (*p)->shadow_of = 0; - (*p)->put(); - } - shadows.clear(); - } - - void copy_partial_substr(off_t start, off_t end, bufferlist& bl) { - map::iterator i = partial.begin(); - - // skip first bits (fully to left) - while ((i->first + i->second.length() < start) && - i != partial.end()) - i++; - assert(i != partial.end()); - assert(i->first <= start); - - // first - unsigned bhoff = MAX(start, i->first) - i->first; - unsigned bhlen = MIN(end-start, i->second.length()); - bl.substr_of( i->second, bhoff, bhlen ); - - off_t pos = i->first + i->second.length(); - - // have continuous to end? - for (i++; i != partial.end(); i++) { - if (pos >= end) break; - assert(pos == i->first); - - pos = i->first + i->second.length(); - - if (pos <= end) { // this whole frag - bl.append( i->second ); - } else { // partial end - unsigned bhlen = end-start-bl.length(); - bufferlist frag; - frag.substr_of( i->second, 0, bhlen ); - bl.claim_append(frag); - break; // done. 
- } - } - - assert(pos >= end); - assert(bl.length() == (unsigned)(end-start)); - } - - bool have_partial_range(off_t start, off_t end) { - map::iterator i = partial.begin(); - - // skip first bits (fully to left) - while ((i->first + i->second.length() < start) && - i != partial.end()) - i++; - if (i == partial.end()) return false; - - // have start? - if (i->first > start) return false; - off_t pos = i->first + i->second.length(); - - // have continuous to end? - for (i++; i != partial.end(); i++) { - assert(pos <= i->first); - if (pos < i->first) return false; - assert(pos == i->first); - pos = i->first + i->second.length(); - if (pos >= end) break; // gone far enough - } - - if (pos >= end) return true; - return false; - } - - bool partial_is_complete(off_t size) { - return have_partial_range( 0, MIN(size, EBOFS_BLOCK_SIZE) ); - //(off_t)(start()*EBOFS_BLOCK_SIZE), - //MIN( size, (off_t)(end()*EBOFS_BLOCK_SIZE) ) ); - } - void apply_partial() { - apply_partial(data, partial); - partial.clear(); - } - static void apply_partial(bufferlist& bl, map& pm) { - assert(bl.length() == (unsigned)EBOFS_BLOCK_SIZE); - //assert(partial_is_complete()); - //cout << "apply_partial" << std::endl; - for (map::iterator i = pm.begin(); - i != pm.end(); - i++) { - int pos = i->first; - //cout << " frag at opos " << i->first << " bhpos " << pos << " len " << i->second.length() << std::endl; - bl.copy_in(pos, i->second.length(), i->second); - } - pm.clear(); - } - void add_partial(off_t off, bufferlist& p) { - unsigned len = p.length(); - assert(len <= (unsigned)EBOFS_BLOCK_SIZE); - //assert(off >= (off_t)(start()*EBOFS_BLOCK_SIZE)); - //assert(off + len <= (off_t)(end()*EBOFS_BLOCK_SIZE)); - assert(off >= 0); - assert(off + len <= EBOFS_BLOCK_SIZE); - - // trim any existing that overlaps - map::iterator i = partial.begin(); - while (i != partial.end()) { - // is [off,off+len)... - // past i? - if (off >= i->first + i->second.length()) { - i++; - continue; - } - // before i? 
- if (i->first >= off+len) break; - - // does [off,off+len)... - // overlap all of i? - if (off <= i->first && off+len >= i->first + i->second.length()) { - // erase it and move on. - partial.erase(i++); - continue; - } - // overlap tail of i? - if (off > i->first && off+len >= i->first + i->second.length()) { - // shorten i. - bufferlist o; - o.claim( i->second ); - unsigned taillen = off - i->first; - i->second.substr_of(o, 0, taillen); - i++; - continue; - } - // overlap head of i? - if (off <= i->first && off+len < i->first + i->second.length()) { - // move i (make new tail). - off_t tailoff = off+len; - unsigned trim = tailoff - i->first; - partial[tailoff].substr_of(i->second, trim, i->second.length()-trim); - partial.erase(i++); // should now be at tailoff - i++; - continue; - } - // split i? - if (off > i->first && off+len < i->first + i->second.length()) { - bufferlist o; - o.claim( i->second ); - // shorten head - unsigned headlen = off - i->first; - i->second.substr_of(o, 0, headlen); - // new tail - unsigned tailoff = off+len - i->first; - unsigned taillen = o.length() - len - headlen; - partial[off+len].substr_of(o, tailoff, taillen); - break; - } - assert(0); - } - - // insert - partial[off] = p; - } - -}; - -inline ostream& operator<<(ostream& out, BufferHead& bh) -{ - out << "bufferhead(" << bh.start() << "~" << bh.length(); - out << " v" << bh.get_version() << "/" << bh.get_last_flushed(); - if (bh.is_missing()) out << " missing"; - if (bh.is_dirty()) out << " dirty"; - if (bh.is_clean()) out << " clean"; - if (bh.is_rx()) out << " rx"; - if (bh.is_tx()) out << " tx"; - if (bh.is_partial()) out << " partial"; - - // include epoch modified? 
- if (bh.is_dirty() || bh.is_tx() || bh.is_partial()) - out << "(e" << bh.epoch_modified << ")"; - - //out << " " << bh.data.length(); - out << " " << &bh; - out << ")"; - return out; -} - - -class ObjectCache { - public: - object_t object_id; - Onode *on; - BufferCache *bc; - - private: - map data; - int ref; - - public: - version_t write_count; - - - public: - ObjectCache(object_t o, Onode *_on, BufferCache *b) : - object_id(o), on(_on), bc(b), ref(0), - write_count(0) { } - ~ObjectCache() { - assert(data.empty()); - assert(ref == 0); - } - - int get() { - ++ref; - //cout << "oc.get " << object_id << " " << ref << std::endl; - return ref; - } - int put() { - assert(ref > 0); - --ref; - //cout << "oc.put " << object_id << " " << ref << std::endl; - return ref; - } - - object_t get_object_id() { return object_id; } - - void add_bh(BufferHead *bh) { - // add to my map - assert(data.count(bh->start()) == 0); - - if (0) { // sanity check FIXME DEBUG - //cout << "add_bh " << bh->start() << "~" << bh->length() << std::endl; - map::iterator p = data.lower_bound(bh->start()); - if (p != data.end()) { - //cout << " after " << *p->second << std::endl; - //cout << " after starts at " << p->first << std::endl; - assert(p->first >= bh->end()); - } - if (p != data.begin()) { - p--; - //cout << " before starts at " << p->second->start() - //<< " and ends at " << p->second->end() << std::endl; - //cout << " before " << *p->second << std::endl; - assert(p->second->end() <= bh->start()); - } - } - - data[bh->start()] = bh; - } - void remove_bh(BufferHead *bh) { - assert(data.count(bh->start())); - data.erase(bh->start()); - } - bool is_empty() { return data.empty(); } - - void try_merge_bh(BufferHead *bh); - void try_merge_bh_left(map::iterator& p); - void try_merge_bh_right(map::iterator& p); - BufferHead* merge_bh_left(BufferHead *left, BufferHead *right); - - int find_tx(block_t start, block_t len, - list& tx); - - int map_read(block_t start, block_t len, - map& hits, // hits - 
map& missing, // read these from disk - map& rx, // wait for these to finish reading from disk - map& partial); // (maybe) wait for these to read from disk - int try_map_read(block_t start, block_t len); // just tell us how many extents we're missing. - - - int map_write(block_t start, block_t len, - map& hits, - version_t super_epoch); // can write to these. - void touch_bottom(block_t bstart, block_t blast); - - BufferHead *split(BufferHead *bh, block_t off); - - /*int scan_versions(block_t start, block_t len, - version_t& low, version_t& high); - */ - - void rx_finish(ioh_t ioh, block_t start, block_t length, bufferlist& bl); - void tx_finish(ioh_t ioh, block_t start, block_t length, version_t v, version_t epoch); - - void truncate(block_t blocks, version_t super_epoch); - // void tear_down(); - - void clone_to(Onode *other); - - void dump() { - for (map::iterator i = data.begin(); - i != data.end(); - i++) - cout << "dump: " << i->first << ": " << *i->second << std::endl; - } - -}; - - - -class BufferCache { - public: - Mutex &ebofs_lock; // hack: this is a ref to global ebofs_lock - BlockDevice &dev; - - //xlist dirty_bh; - - LRU lru_dirty, lru_rest; - - private: - Cond stat_cond; - Cond flush_cond; - int stat_waiter; - - off_t stat_clean; - off_t stat_dirty; - off_t stat_rx; - off_t stat_tx; - off_t stat_partial; - off_t stat_missing; - -#define EBOFS_BC_FLUSH_BHWRITE 0 -#define EBOFS_BC_FLUSH_PARTIAL 1 - - map epoch_unflushed[2]; - - /* partial writes - incomplete blocks that can't be written until - * their prior content is read and overlayed with the new data. - * - * we put partial block management here because objects may be deleted - * before the read completes, but the write may have been committed in a - * prior epoch. - * - * we map: src block -> dest block -> PartialWrite - * - * really, at most there will only ever be two of these, for current+previous epochs. 
- */ - class PartialWrite { - public: - map partial; // partial dirty content overlayed onto incoming data - version_t epoch; - }; - - map > partial_write; // queued writes w/ partial content - map > shadow_partials; - - public: - BufferCache(BlockDevice& d, Mutex& el) : - ebofs_lock(el), dev(d), - stat_waiter(0), - stat_clean(0), stat_dirty(0), stat_rx(0), stat_tx(0), stat_partial(0), stat_missing(0) - {} - - - off_t get_size() { - return stat_clean+stat_dirty+stat_rx+stat_tx+stat_partial; - } - off_t get_trimmable() { - return stat_clean; - } - - - // bh's in cache - void add_bh(BufferHead *bh) { - bh->get_oc()->add_bh(bh); - if (bh->is_dirty()) { - lru_dirty.lru_insert_mid(bh); - //dirty_bh.push_back(&bh->xlist_dirty); - } else - lru_rest.lru_insert_mid(bh); - stat_add(bh); - } - void touch(BufferHead *bh) { - if (bh->is_dirty()) { - lru_dirty.lru_touch(bh); - } else - lru_rest.lru_touch(bh); - } - void touch_bottom(BufferHead *bh) { - if (bh->is_dirty()) { - bh->want_to_expire = true; - lru_dirty.lru_bottouch(bh); - } else - lru_rest.lru_bottouch(bh); - } - void remove_bh(BufferHead *bh) { - bh->get_oc()->remove_bh(bh); - stat_sub(bh); - if (bh->is_dirty()) { - lru_dirty.lru_remove(bh); - //dirty_bh.push_back(&bh->xlist_dirty); - } else - lru_rest.lru_remove(bh); - } - - // stats - void stat_add(BufferHead *bh) { - switch (bh->get_state()) { - case BufferHead::STATE_MISSING: stat_missing += bh->length(); break; - case BufferHead::STATE_CLEAN: stat_clean += bh->length(); break; - case BufferHead::STATE_DIRTY: stat_dirty += bh->length(); break; - case BufferHead::STATE_TX: stat_tx += bh->length(); break; - case BufferHead::STATE_RX: stat_rx += bh->length(); break; - case BufferHead::STATE_PARTIAL: stat_partial += bh->length(); break; - } - if (stat_waiter) stat_cond.Signal(); - } - void stat_sub(BufferHead *bh) { - switch (bh->get_state()) { - case BufferHead::STATE_MISSING: stat_missing -= bh->length(); break; - case BufferHead::STATE_CLEAN: stat_clean -= 
bh->length(); break; - case BufferHead::STATE_DIRTY: stat_dirty -= bh->length(); break; - case BufferHead::STATE_TX: stat_tx -= bh->length(); break; - case BufferHead::STATE_RX: stat_rx -= bh->length(); break; - case BufferHead::STATE_PARTIAL: stat_partial -= bh->length(); break; - } - } - off_t get_stat_tx() { return stat_tx; } - off_t get_stat_rx() { return stat_rx; } - off_t get_stat_dirty() { return stat_dirty; } - off_t get_stat_clean() { return stat_clean; } - off_t get_stat_partial() { return stat_partial; } - - - map &get_unflushed(int what) { - return epoch_unflushed[what]; - } - - int get_unflushed(int what, version_t epoch) { - return epoch_unflushed[what][epoch]; - } - void inc_unflushed(int what, version_t epoch) { - epoch_unflushed[what][epoch]++; - //cout << "inc_unflushed " << epoch << " now " << epoch_unflushed[epoch] << std::endl; - } - void dec_unflushed(int what, version_t epoch) { - epoch_unflushed[what][epoch]--; - //cout << "dec_unflushed " << epoch << " now " << epoch_unflushed[epoch] << std::endl; - if (epoch_unflushed[what][epoch] == 0) - flush_cond.Signal(); - } - - void waitfor_stat() { - stat_waiter++; - stat_cond.Wait(ebofs_lock); - stat_waiter--; - } - void waitfor_flush() { - flush_cond.Wait(ebofs_lock); - } - - - // bh state - void set_state(BufferHead *bh, int s) { - // move between lru lists? 
- if (s == BufferHead::STATE_DIRTY && bh->get_state() != BufferHead::STATE_DIRTY) { - lru_rest.lru_remove(bh); - lru_dirty.lru_insert_top(bh); - //dirty_bh.push_back(&bh->xlist_dirty); - } - if (s != BufferHead::STATE_DIRTY && bh->get_state() == BufferHead::STATE_DIRTY) { - lru_dirty.lru_remove(bh); - if (bh->want_to_expire) - lru_rest.lru_insert_bot(bh); - else - lru_rest.lru_insert_mid(bh); - //dirty_bh.remove(&bh->xlist_dirty); - } - - // set state - stat_sub(bh); - bh->set_state(s); - stat_add(bh); - } - - void copy_state(BufferHead *bh1, BufferHead *bh2) { - set_state(bh2, bh1->get_state()); - } - - void mark_missing(BufferHead *bh) { set_state(bh, BufferHead::STATE_MISSING); }; - void mark_clean(BufferHead *bh) { set_state(bh, BufferHead::STATE_CLEAN); }; - void mark_rx(BufferHead *bh) { set_state(bh, BufferHead::STATE_RX); }; - void mark_partial(BufferHead *bh) { set_state(bh, BufferHead::STATE_PARTIAL); }; - void mark_tx(BufferHead *bh) { set_state(bh, BufferHead::STATE_TX); }; - void mark_dirty(BufferHead *bh) { - set_state(bh, BufferHead::STATE_DIRTY); - bh->set_dirty_stamp(g_clock.now()); - }; - - - // io - void bh_read(Onode *on, BufferHead *bh, block_t from=0); - void bh_write(Onode *on, BufferHead *bh, block_t shouldbe=0); - - bool bh_cancel_read(BufferHead *bh); - bool bh_cancel_write(BufferHead *bh, version_t cur_epoch); - - void bh_queue_partial_write(Onode *on, BufferHead *bh); - void bh_cancel_partial_write(BufferHead *bh); - - void queue_partial(block_t from, block_t to, map& partial, version_t epoch); - void cancel_partial(block_t from, block_t to, version_t epoch); - - void add_shadow_partial(block_t from, BufferHead *bh); - void cancel_shadow_partial(block_t from, BufferHead *bh); - - void rx_finish(ObjectCache *oc, ioh_t ioh, block_t start, block_t len, block_t diskstart, bufferlist& bl); - void tx_finish(ObjectCache *oc, ioh_t ioh, block_t start, block_t len, version_t v, version_t e); - void partial_tx_finish(version_t epoch); - - friend 
class C_E_FlushPartial; - - // bh fun - BufferHead *split(BufferHead *orig, block_t after); -}; - - -class C_OC_RxFinish : public BlockDevice::callback { - Mutex &lock; - ObjectCache *oc; - block_t start, length; - block_t diskstart; -public: - bufferlist bl; - C_OC_RxFinish(Mutex &m, ObjectCache *o, block_t s, block_t l, block_t ds) : - lock(m), oc(o), start(s), length(l), diskstart(ds) {} - void finish(ioh_t ioh, int r) { - oc->bc->rx_finish(oc, ioh, start, length, diskstart, bl); - } -}; - -class C_OC_TxFinish : public BlockDevice::callback { - Mutex &lock; - ObjectCache *oc; - block_t start, length; - version_t version; - version_t epoch; - public: - C_OC_TxFinish(Mutex &m, ObjectCache *o, block_t s, block_t l, version_t v, version_t e) : - lock(m), oc(o), start(s), length(l), version(v), epoch(e) {} - void finish(ioh_t ioh, int r) { - oc->bc->tx_finish(oc, ioh, start, length, version, epoch); - } -}; - -class C_OC_PartialTxFinish : public BlockDevice::callback { - BufferCache *bc; - version_t epoch; -public: - C_OC_PartialTxFinish(BufferCache *b, version_t e) : - bc(b), epoch(e) {} - void finish(ioh_t ioh, int r) { - bc->partial_tx_finish(epoch); - } -}; - - -#endif diff --git a/branches/sage/ebofs2/ebofs/Cnode.h b/branches/sage/ebofs2/ebofs/Cnode.h deleted file mode 100644 index 8415978893fb5..0000000000000 --- a/branches/sage/ebofs2/ebofs/Cnode.h +++ /dev/null @@ -1,101 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_CNODE_H -#define __EBOFS_CNODE_H - -#include "Onode.h" - -/* - * collection node - * - * holds attribute metadata for collections. 
- * colletion membership is stored in b+tree tables, independent of tte cnode. - */ - -class Cnode : public LRUObject -{ - private: - int ref; - bool dirty; - - public: - coll_t coll_id; - Extent cnode_loc; - - map attr; - - public: - Cnode(coll_t cid) : ref(0), dirty(false), coll_id(cid) { - cnode_loc.length = 0; - } - ~Cnode() { - } - - block_t get_cnode_id() { return cnode_loc.start; } - int get_cnode_len() { return cnode_loc.length; } - - void get() { - if (ref == 0) lru_pin(); - ref++; - } - void put() { - ref--; - if (ref == 0) lru_unpin(); - } - int get_ref_count() { return ref; } - - void mark_dirty() { - if (!dirty) { - dirty = true; - get(); - } - } - void mark_clean() { - if (dirty) { - dirty = false; - put(); - } - } - bool is_dirty() { return dirty; } - - - int get_attr_bytes() { - int s = 0; - for (map::iterator i = attr.begin(); - i != attr.end(); - i++) { - s += i->first.length() + 1; - s += i->second.length() + sizeof(int); - } - return s; - } - - // - //???void clear(); - - -}; - -inline ostream& operator<<(ostream& out, Cnode& cn) -{ - out << "cnode(" << hex << cn.coll_id << dec; - if (cn.is_dirty()) out << " dirty"; - //out << " " << &cn; - out << ")"; - return out; -} - -#endif diff --git a/branches/sage/ebofs2/ebofs/Ebofs.cc b/branches/sage/ebofs2/ebofs/Ebofs.cc deleted file mode 100644 index b1f6ab7539467..0000000000000 --- a/branches/sage/ebofs2/ebofs/Ebofs.cc +++ /dev/null @@ -1,3628 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "Ebofs.h" - -#include "FileJournal.h" - -#include - -#ifndef DARWIN -#include -#else -#include -#include -#endif // DARWIN - -// ******************* - -#define dout(x) if (x <= g_conf.debug_ebofs) *_dout << dbeginl << g_clock.now() << " ebofs(" << dev.get_device_name() << ")." -#define derr(x) if (x <= g_conf.debug_ebofs) *_derr << dbeginl << g_clock.now() << " ebofs(" << dev.get_device_name() << ")." - - -char *nice_blocks(block_t b) -{ - static char s[20]; - float sz = b*4.0; - if (sz > (10 << 20)) - sprintf(s,"%.1f GB", sz / (1024.0*1024.0)); - else if (sz > (10 << 10)) - sprintf(s,"%.1f MB", sz / (1024.0)); - else - sprintf(s,"%llu KB", b*4ULL); - return s; -} - -int Ebofs::mount() -{ - ebofs_lock.Lock(); - assert(!mounted); - - // open dev - int r = dev.open(&idle_kicker); - if (r < 0) { - ebofs_lock.Unlock(); - return r; - } - - dout(2) << "mounting " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << dendl; - - // read super - bufferptr bp1 = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - bufferptr bp2 = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - dev.read(0, 1, bp1); - dev.read(1, 1, bp2); - - struct ebofs_super *sb1 = (struct ebofs_super*)bp1.c_str(); - struct ebofs_super *sb2 = (struct ebofs_super*)bp2.c_str(); - - // valid superblocks? 
- if (sb1->s_magic != EBOFS_MAGIC || - sb2->s_magic != EBOFS_MAGIC) { - derr(0) << "mount bad magic, not a valid EBOFS file system" << dendl; - return -EINVAL; - } - if (sb1->num_blocks > dev.get_num_blocks() || - sb2->num_blocks > dev.get_num_blocks()) { - derr(0) << "mount superblock size exceeds actual device size" << dendl; - return -EINVAL; - } - - dout(3) << "mount super @0 epoch " << sb1->epoch << dendl; - dout(3) << "mount super @1 epoch " << sb2->epoch << dendl; - - // pick newest super - struct ebofs_super *sb = 0; - if (sb1->epoch > sb2->epoch) - sb = sb1; - else - sb = sb2; - super_epoch = sb->epoch; - dout(3) << "mount epoch " << super_epoch << dendl; - assert(super_epoch == sb->epoch); - - super_fsid = sb->fsid; - - free_blocks = sb->free_blocks; - limbo_blocks = sb->limbo_blocks; - - // init node pools - dout(3) << "mount nodepool" << dendl; - nodepool.init( &sb->nodepool ); - nodepool.read_usemap_and_clean_nodes( dev, super_epoch ); - - // open tables - dout(3) << "mount opening tables" << dendl; - object_tab = new Table( nodepool, sb->object_tab ); - for (int i=0; i( nodepool, sb->free_tab[i] ); - limbo_tab = new Table( nodepool, sb->limbo_tab ); - alloc_tab = new Table >( nodepool, sb->alloc_tab ); - - collection_tab = new Table( nodepool, sb->collection_tab ); - co_tab = new Table( nodepool, sb->co_tab ); - - verify_tables(); - - allocator.release_limbo(); - - - // open journal? - if (journalfn) { - journal = new FileJournal(this, journalfn); - if (journal->open() < 0) { - dout(3) << "mount journal " << journalfn << " open failed" << dendl; - delete journal; - journal = 0; - } else { - dout(3) << "mount journal " << journalfn << " opened, replaying" << dendl; - - while (1) { - bufferlist bl; - epoch_t e; - if (!journal->read_entry(bl, e)) { - dout(3) << "mount replay: end of journal, done." 
<< dendl; - break; - } - - if (e < super_epoch) { - dout(3) << "mount replay: skipping old entry in epoch " << e << " < " << super_epoch << dendl; - continue; - } - if (e == super_epoch+1) { - super_epoch++; - dout(3) << "mount replay: jumped to next epoch " << super_epoch << dendl; - } - assert(e == super_epoch); - - dout(3) << "mount replay: applying transaction in epoch " << e << dendl; - Transaction t; - int off = 0; - t._decode(bl, off); - _apply_transaction(t); - } - - // done reading, make writeable. - journal->make_writeable(); - } - } - - dout(3) << "mount starting commit+finisher threads" << dendl; - commit_thread.create(); - finisher_thread.create(); - - dout(1) << "mounted " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) - << (journal ? ", with journal":", no journal") - << dendl; - mounted = true; - - - ebofs_lock.Unlock(); - return 0; -} - - -int Ebofs::mkfs() -{ - ebofs_lock.Lock(); - assert(!mounted); - - int r = dev.open(); - if (r < 0) { - ebofs_lock.Unlock(); - return r; - } - - block_t num_blocks = dev.get_num_blocks(); - - // make a super-random fsid - srand48(time(0) ^ getpid()); - super_fsid = ((uint64_t)lrand48() << 32) ^ mrand48(); - srand(time(0) ^ getpid()); - super_fsid ^= rand(); - super_fsid ^= (uint64_t)rand() << 32; - - free_blocks = 0; - limbo_blocks = 0; - - // create first noderegion - Extent nr; - nr.start = 2; - nr.length = 20+ (num_blocks / 1000); - if (nr.length < 10) nr.length = 10; - nodepool.add_region(nr); - dout(10) << "mkfs: first node region at " << nr << dendl; - - // allocate two usemaps - block_t usemap_len = nodepool.get_usemap_len(); - nodepool.usemap_even.start = nr.end(); - nodepool.usemap_even.length = usemap_len; - nodepool.usemap_odd.start = nodepool.usemap_even.end(); - nodepool.usemap_odd.length = usemap_len; - dout(10) << "mkfs: even usemap at " << nodepool.usemap_even << dendl; - dout(10) << "mkfs: odd usemap at " << nodepool.usemap_odd << dendl; 
- nodepool.init_usemap(); - - // init tables - struct ebofs_table empty; - empty.num_keys = 0; - empty.root = -1; - empty.depth = 0; - - object_tab = new Table( nodepool, empty ); - collection_tab = new Table( nodepool, empty ); - - for (int i=0; i( nodepool, empty ); - limbo_tab = new Table( nodepool, empty ); - alloc_tab = new Table >( nodepool, empty ); - - co_tab = new Table( nodepool, empty ); - - // add free space - Extent left; - left.start = nodepool.usemap_odd.end(); - left.length = num_blocks - left.start; - dout(10) << "mkfs: free data blocks at " << left << dendl; - allocator._release_into_limbo( left ); - if (g_conf.ebofs_cloneable) { - allocator.alloc_inc(nr); - allocator.alloc_inc(nodepool.usemap_even); - allocator.alloc_inc(nodepool.usemap_odd); - } - allocator.commit_limbo(); // -> limbo_tab - allocator.release_limbo(); // -> free_tab - - // write nodes, super, 2x - dout(10) << "mkfs: flushing nodepool and superblocks (2x)" << dendl; - - for (epoch_t e=0; e<2; e++) { - nodepool.commit_start(dev, e); - nodepool.commit_wait(); - bufferptr superbp; - prepare_super(e, superbp); - write_super(e, superbp); - } - - // free memory - dout(10) << "mkfs: cleaning up" << dendl; - close_tables(); - - dev.close(); - - - // create journal? 
- if (journalfn) { - Journal *journal = new FileJournal(this, journalfn); - if (journal->create() < 0) { - dout(3) << "mount journal " << journalfn << " created failed" << dendl; - } else { - dout(3) << "mount journal " << journalfn << " created" << dendl; - } - delete journal; - } - - dout(2) << "mkfs: " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << dendl; - ebofs_lock.Unlock(); - return 0; -} - -void Ebofs::close_tables() -{ - // close tables - delete object_tab; - for (int i=0; iverify("onmount"); - limbo_tab->verify("onmount"); - alloc_tab->verify("onmount"); - collection_tab->verify("onmount"); - co_tab->verify("onmount"); - for (int i=0; iverify("onmount"); - - g_conf.ebofs_verify = o; -} - -int Ebofs::umount() -{ - ebofs_lock.Lock(); - - // mark unmounting - dout(2) << "umount start" << dendl; - readonly = true; - unmounting = true; - - // kick commit thread - dout(5) << "umount stopping commit thread" << dendl; - commit_cond.Signal(); - ebofs_lock.Unlock(); - commit_thread.join(); - ebofs_lock.Lock(); - - // kick finisher thread - dout(5) << "umount stopping finisher thread" << dendl; - finisher_lock.Lock(); - finisher_stop = true; - finisher_cond.Signal(); - finisher_lock.Unlock(); - - finisher_thread.join(); - - trim_bc(0); - trim_inodes(0); - - for (hash_map::iterator i = onode_map.begin(); - i != onode_map.end(); - i++) { - dout(0) << "umount *** leftover: " << i->first << " " << *(i->second) << dendl; - } - - // free memory - dout(5) << "umount cleaning up" << dendl; - close_tables(); - dev.close(); - readonly = unmounting = mounted = false; - - dout(2) << "umount done on " << dev.get_device_name() << dendl; - ebofs_lock.Unlock(); - return 0; -} - - - -void Ebofs::prepare_super(version_t epoch, bufferptr& bp) -{ - struct ebofs_super sb; - - dout(10) << "prepare_super v" << epoch << dendl; - - // fill in super - memset(&sb, 0, sizeof(sb)); - sb.s_magic = EBOFS_MAGIC; - sb.fsid = 
super_fsid; - sb.epoch = epoch; - sb.num_blocks = dev.get_num_blocks(); - - sb.free_blocks = free_blocks; - sb.limbo_blocks = limbo_blocks; - - - // tables - sb.object_tab.num_keys = object_tab->get_num_keys(); - sb.object_tab.root = object_tab->get_root(); - sb.object_tab.depth = object_tab->get_depth(); - - for (int i=0; iget_num_keys(); - sb.free_tab[i].root = free_tab[i]->get_root(); - sb.free_tab[i].depth = free_tab[i]->get_depth(); - } - sb.limbo_tab.num_keys = limbo_tab->get_num_keys(); - sb.limbo_tab.root = limbo_tab->get_root(); - sb.limbo_tab.depth = limbo_tab->get_depth(); - - sb.alloc_tab.num_keys = alloc_tab->get_num_keys(); - sb.alloc_tab.root = alloc_tab->get_root(); - sb.alloc_tab.depth = alloc_tab->get_depth(); - - sb.collection_tab.num_keys = collection_tab->get_num_keys(); - sb.collection_tab.root = collection_tab->get_root(); - sb.collection_tab.depth = collection_tab->get_depth(); - - sb.co_tab.num_keys = co_tab->get_num_keys(); - sb.co_tab.root = co_tab->get_root(); - sb.co_tab.depth = co_tab->get_depth(); - - // pools - sb.nodepool.num_regions = nodepool.region_loc.size(); - for (unsigned i=0; i 0) { - // *** this is an ugly ugly hack **** - // do not use - // periodically check for idle block device - utime_t idle_wait(0, g_conf.ebofs_idle_commit_ms*1000); - dout(20) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms, " - << idle_wait << " ms if idle" << dendl; - utime_t now = g_clock.now(); - utime_t stop = now; - stop += (double)g_conf.ebofs_commit_ms / 1000.0; - do { - utime_t wait = MIN(stop - now, idle_wait); - if (commit_cond.WaitInterval(ebofs_lock, wait) != ETIMEDOUT) { - dout(20) << "commit_thread i got kicked" << dendl; - break; // we got kicked - } - if (dev.is_idle()) { - dout(20) << "commit_thread bdev is idle, early commit" << dendl; - break; // dev is idle - } - now = g_clock.now(); - dout(20) << "commit_thread now=" << now << ", stop at " << stop << dendl; - - // hack hack - //if (!left) g_conf.debug_ebofs 
= 10; - // /hack hack - } while (now < stop); - dout(20) << "commit_thread done with idle loop" << dendl; - - } else { - // normal wait+timeout - dout(20) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms" << dendl; - commit_cond.WaitInterval(ebofs_lock, utime_t(0, g_conf.ebofs_commit_ms*1000)); - } - - } else { - // DEBUG.. wait until kicked - dout(10) << "commit_thread no commit_ms, waiting until kicked" << dendl; - commit_cond.Wait(ebofs_lock); - } - - if (unmounting) { - dout(10) << "commit_thread unmounting: final commit pass" << dendl; - assert(readonly); - unmounting = false; - mounted = false; - dirty = true; - } - - if (!dirty && !limbo_blocks) { - dout(10) << "commit_thread not dirty" << dendl; - } - else { - super_epoch++; - dirty = false; - - derr(10) << "commit_thread commit start, new epoch " << super_epoch << dendl; - dout(10) << "commit_thread commit start, new epoch " << super_epoch << dendl; - dout(2) << "commit_thread data: " - << 100*(dev.get_num_blocks()-get_free_blocks())/dev.get_num_blocks() << "% used, " - << get_free_blocks() << " (" << 100*get_free_blocks()/dev.get_num_blocks() - << "%) free in " << get_free_extents() - << ", " << get_limbo_blocks() << " (" << 100*get_limbo_blocks()/dev.get_num_blocks() - << "%) limbo in " << get_limbo_extents() - << dendl; - dout(2) << "commit_thread nodes: " - << 100*nodepool.get_num_used()/nodepool.get_num_total() << "% used, " - << nodepool.get_num_free() << " (" << 100*nodepool.get_num_free()/nodepool.get_num_total() << "%) free, " - << nodepool.get_num_limbo() << " (" << 100*nodepool.get_num_limbo()/nodepool.get_num_total() << "%) limbo, " - << nodepool.get_num_total() << " total." 
<< dendl; - dout(2) << "commit_thread bc: " - << "size " << bc.get_size() - << ", trimmable " << bc.get_trimmable() - << ", max " << g_conf.ebofs_bc_size - << "; dirty " << bc.get_stat_dirty() - << ", tx " << bc.get_stat_tx() - << ", max dirty " << g_conf.ebofs_bc_max_dirty - << dendl; - - if (journal) journal->commit_epoch_start(); - - // (async) write onodes+condes (do this first; it currently involves inode reallocation) - commit_inodes_start(); - - allocator.commit_limbo(); // limbo -> limbo_tab - - // (async) write btree nodes - nodepool.commit_start( dev, super_epoch ); - - // blockdev barrier (prioritize our writes!) - dout(30) << "commit_thread barrier. flushing inodes " << inodes_flushing << dendl; - dev.barrier(); - - // prepare super (before any changes get made!) - bufferptr superbp; - prepare_super(super_epoch, superbp); - - // wait for it all to flush (drops global lock) - commit_bc_wait(super_epoch-1); - dout(30) << "commit_thread bc flushed" << dendl; - commit_inodes_wait(); - dout(30) << "commit_thread inodes flushed" << dendl; - nodepool.commit_wait(); - dout(30) << "commit_thread btree nodes flushed" << dendl; - - // ok, now (synchronously) write the prior super! - dout(10) << "commit_thread commit flushed, writing super for prior epoch" << dendl; - ebofs_lock.Unlock(); - write_super(super_epoch, superbp); - ebofs_lock.Lock(); - - dout(10) << "commit_thread wrote super" << dendl; - - // free limbo space now - // (since we're done allocating things, - // AND we've flushed all previous epoch data) - allocator.release_limbo(); // limbo_tab -> free_tabs - - // do we need more node space? - if (nodepool.get_num_free() < nodepool.get_num_total() / 3) { - dout(2) << "commit_thread running low on node space, allocating more." 
<< dendl; - alloc_more_node_space(); - } - - // signal journal - if (journal) journal->commit_epoch_finish(); - - // kick waiters - dout(10) << "commit_thread queueing commit + kicking sync waiters" << dendl; - - queue_finishers(commit_waiters[super_epoch-1]); - commit_waiters.erase(super_epoch-1); - - sync_cond.Signal(); - - dout(10) << "commit_thread commit finish" << dendl; - } - - // trim bc? - trim_bc(); - trim_inodes(); - - } - - dout(10) << "commit_thread finish" << dendl; - commit_thread_started = false; - ebofs_lock.Unlock(); - return 0; -} - - -void Ebofs::alloc_more_node_space() -{ - dout(1) << "alloc_more_node_space free " << nodepool.get_num_free() << "/" << nodepool.get_num_total() << dendl; - - if (nodepool.num_regions() < EBOFS_MAX_NODE_REGIONS) { - int want = nodepool.get_num_total(); - - Extent ex; - allocator.allocate(ex, want, 2); - dout(1) << "alloc_more_node_space wants " << want << " more, got " << ex << dendl; - - Extent even, odd; - unsigned ulen = nodepool.get_usemap_len(nodepool.get_num_total() + ex.length); - allocator.allocate(even, ulen, 2); - allocator.allocate(odd, ulen, 2); - dout(1) << "alloc_more_node_space maps need " << ulen << " x2, got " << even << " " << odd << dendl; - - if (even.length == ulen && odd.length == ulen) { - dout(1) << "alloc_more_node_space got " << ex << ", new usemaps at even " << even << " odd " << odd << dendl; - allocator.release(nodepool.usemap_even); - allocator.release(nodepool.usemap_odd); - nodepool.add_region(ex); - - // expand usemap? - nodepool.usemap_even = even; - nodepool.usemap_odd = odd; - nodepool.expand_usemap(); - } else { - dout (1) << "alloc_more_node_space failed to get space for new usemaps" << dendl; - allocator.release(ex); - allocator.release(even); - allocator.release(odd); - //assert(0); - } - } else { - dout(1) << "alloc_more_node_space already have max node regions!" 
<< dendl; - assert(0); - } -} - - -void *Ebofs::finisher_thread_entry() -{ - finisher_lock.Lock(); - dout(10) << "finisher_thread start" << dendl; - - while (!finisher_stop) { - while (!finisher_queue.empty()) { - list ls; - ls.swap(finisher_queue); - - finisher_lock.Unlock(); - - //ebofs_lock.Lock(); // um.. why lock this? -sage - finish_contexts(ls, 0); - //ebofs_lock.Unlock(); - - finisher_lock.Lock(); - } - if (finisher_stop) break; - - dout(30) << "finisher_thread sleeping" << dendl; - finisher_cond.Wait(finisher_lock); - } - - dout(10) << "finisher_thread start" << dendl; - finisher_lock.Unlock(); - return 0; -} - - -// *** onodes *** - -Onode* Ebofs::new_onode(object_t oid) -{ - Onode* on = new Onode(oid); - - assert(onode_map.count(oid) == 0); - onode_map[oid] = on; - onode_lru.lru_insert_top(on); - - assert(object_tab->lookup(oid) < 0); - object_tab->insert( oid, on->onode_loc ); // even tho i'm not placed yet - - on->get(); - on->onode_loc.start = 0; - on->onode_loc.length = 0; - - dirty_onode(on); - - dout(7) << "new_onode " << *on << dendl; - return on; -} - - -Onode* Ebofs::get_onode(object_t oid) -{ - while (1) { - // in cache? - if (have_onode(oid)) { - // yay - Onode *on = onode_map[oid]; - on->get(); - //dout(0) << "get_onode " << *on << dendl; - return on; - } - - // on disk? - Extent onode_loc; - if (object_tab->lookup(oid, onode_loc) < 0) { - dout(10) << "onode lookup failed on " << oid << dendl; - // object dne. - return 0; - } - - // already loading? - if (waitfor_onode.count(oid)) { - // yep, just wait. - Cond c; - waitfor_onode[oid].push_back(&c); - dout(10) << "get_onode " << oid << " already loading, waiting" << dendl; - c.Wait(ebofs_lock); - continue; - } - - dout(10) << "get_onode reading " << oid << " from " << onode_loc << dendl; - - assert(waitfor_onode.count(oid) == 0); - waitfor_onode[oid].clear(); // this should be empty initially. - - // read it! 
- bufferlist bl; - bl.push_back( buffer::create_page_aligned( EBOFS_BLOCK_SIZE*onode_loc.length ) ); - - ebofs_lock.Unlock(); - dev.read( onode_loc.start, onode_loc.length, bl ); - ebofs_lock.Lock(); - - // add onode - Onode *on = new Onode(oid); - onode_map[oid] = on; - onode_lru.lru_insert_top(on); - - // parse data block - struct ebofs_onode *eo = (struct ebofs_onode*)bl.c_str(); - if (eo->object_id != oid) { - dout(0) << " wrong oid in onode block: " << eo->object_id << " != " << oid << dendl; - dout(0) << " onode_loc is " << eo->onode_loc << dendl; - dout(0) << " object_size " << eo->object_size << dendl; - dout(0) << " object_blocks " << eo->object_blocks << dendl; - dout(0) << " " << eo->num_collections << " coll + " - << eo->num_attr << " attr + " - << eo->num_extents << " extents" << dendl; - assert(eo->object_id == oid); - } - on->readonly = eo->readonly; - on->onode_loc = eo->onode_loc; - on->object_size = eo->object_size; - on->object_blocks = eo->object_blocks; - - // parse - char *p = bl.c_str() + sizeof(*eo); - - // parse collection list - for (int i=0; inum_collections; i++) { - coll_t c = *((coll_t*)p); - p += sizeof(c); - on->collections.insert(c); - } - - // parse attributes - for (int i=0; inum_attr; i++) { - string key = p; - p += key.length() + 1; - int len = *(int*)(p); - p += sizeof(len); - on->attr[key] = buffer::copy(p, len); - p += len; - dout(15) << "get_onode " << *on << " attr " << key << " len " << len << dendl; - } - - // parse extents - on->extent_map.clear(); - block_t n = 0; - for (int i=0; inum_extents; i++) { - Extent ex = *((Extent*)p); - on->extent_map[n] = ex; - dout(15) << "get_onode " << *on << " ex " << i << ": " << ex << dendl; - n += ex.length; - p += sizeof(Extent); - } - assert(n == on->object_blocks); - - // wake up other waiters - for (list::iterator i = waitfor_onode[oid].begin(); - i != waitfor_onode[oid].end(); - i++) - (*i)->Signal(); - waitfor_onode.erase(oid); // remove Cond list - - on->get(); - //dout(0) << 
"get_onode " << *on << " (loaded)" << dendl; - return on; - } -} - - -class C_E_InodeFlush : public BlockDevice::callback { - Ebofs *ebofs; -public: - C_E_InodeFlush(Ebofs *e) : ebofs(e) {} - void finish(ioh_t ioh, int r) { - ebofs->flush_inode_finish(); - } -}; - - -void Ebofs::encode_onode(Onode *on, bufferlist& bl, unsigned& off) -{ - // onode - struct ebofs_onode eo; - eo.readonly = on->readonly; - eo.onode_loc = on->onode_loc; - eo.object_id = on->object_id; - eo.object_size = on->object_size; - eo.object_blocks = on->object_blocks; - eo.num_collections = on->collections.size(); - eo.num_attr = on->attr.size(); - eo.num_extents = on->extent_map.size(); - bl.copy_in(off, sizeof(eo), (char*)&eo); - off += sizeof(eo); - - // collections - for (set::iterator i = on->collections.begin(); - i != on->collections.end(); - i++) { - bl.copy_in(off, sizeof(*i), (char*)&(*i)); - off += sizeof(*i); - } - - // attr - for (map::iterator i = on->attr.begin(); - i != on->attr.end(); - i++) { - bl.copy_in(off, i->first.length()+1, i->first.c_str()); - off += i->first.length()+1; - int l = i->second.length(); - bl.copy_in(off, sizeof(int), (char*)&l); - off += sizeof(int); - bl.copy_in(off, l, i->second.c_str()); - off += l; - dout(15) << "write_onode " << *on << " attr " << i->first << " len " << l << dendl; - } - - // extents - for (map::iterator i = on->extent_map.begin(); - i != on->extent_map.end(); - i++) { - bl.copy_in(off, sizeof(Extent), (char*)&(i->second)); - off += sizeof(Extent); - dout(15) << "write_onode " << *on << " ex " << i->first << ": " << i->second << dendl; - } -} - -void Ebofs::write_onode(Onode *on) -{ - // buffer - unsigned bytes = sizeof(ebofs_onode) + on->get_collection_bytes() + on->get_attr_bytes() + on->get_extent_bytes(); - unsigned blocks = (bytes-1)/EBOFS_BLOCK_SIZE + 1; - - bufferlist bl; - bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*blocks) ); - - // (always) relocate onode - if (1) { - if (on->onode_loc.length) - 
allocator.release(on->onode_loc); - - block_t first = 0; - if (on->extent_map.size()) - first = on->extent_map.begin()->second.start; - - allocator.allocate(on->onode_loc, blocks, first); - object_tab->remove( on->object_id ); - object_tab->insert( on->object_id, on->onode_loc ); - //object_tab->verify(); - } - - dout(10) << "write_onode " << *on << " to " << on->onode_loc << dendl; - - unsigned off = 0; - encode_onode(on, bl, off); - assert(off == bytes); - - // write - dev.write( on->onode_loc.start, on->onode_loc.length, bl, - new C_E_InodeFlush(this), "write_onode" ); -} - -void Ebofs::remove_onode(Onode *on) -{ - dout(8) << "remove_onode " << *on << dendl; - - assert(on->get_ref_count() >= 1); // caller - - // tear down buffer cache - if (on->oc) { - on->oc->truncate(0, super_epoch); // this will kick readers along the way. - on->close_oc(); - } - - // remove from onode map, mark dangling/deleted - onode_map.erase(on->object_id); - onode_lru.lru_remove(on); - on->deleted = true; - on->dangling = true; - - // remove from object table - //dout(0) << "remove_onode on " << *on << dendl; - object_tab->remove(on->object_id); - - // free onode space - if (on->onode_loc.length) - allocator.release(on->onode_loc); - - // free data space - for (map::iterator i = on->extent_map.begin(); - i != on->extent_map.end(); - i++) - allocator.release(i->second); - on->extent_map.clear(); - - // remove from collections - for (set::iterator i = on->collections.begin(); - i != on->collections.end(); - i++) { - co_tab->remove(coll_object_t(*i,on->object_id)); - } - on->collections.clear(); - - // dirty -> clean? 
- if (on->is_dirty()) { - on->mark_clean(); // this unpins *on - dirty_onodes.erase(on); - } - - if (on->get_ref_count() > 1) dout(10) << "remove_onode **** will survive " << *on << dendl; - put_onode(on); - - dirty = true; -} - -void Ebofs::put_onode(Onode *on) -{ - on->put(); - //dout(0) << "put_onode " << *on << dendl; - - if (on->get_ref_count() == 0 && on->dangling) { - //dot(0) << " *** hosing on " << *on << dendl; - delete on; - } -} - -void Ebofs::dirty_onode(Onode *on) -{ - if (!on->is_dirty()) { - on->mark_dirty(); - dirty_onodes.insert(on); - } - dirty = true; -} - -void Ebofs::trim_inodes(int max) -{ - unsigned omax = onode_lru.lru_get_max(); - unsigned cmax = cnode_lru.lru_get_max(); - if (max >= 0) omax = cmax = max; - dout(10) << "trim_inodes start " << onode_lru.lru_get_size() << " / " << omax << " onodes, " - << cnode_lru.lru_get_size() << " / " << cmax << " cnodes" << dendl; - - // onodes - while (onode_lru.lru_get_size() > omax) { - // expire an item - Onode *on = (Onode*)onode_lru.lru_expire(); - if (on == 0) break; // nothing to expire - - // expire - dout(20) << "trim_inodes removing onode " << *on << dendl; - onode_map.erase(on->object_id); - on->dangling = true; - - if (on->get_ref_count() == 0) { - assert(on->oc == 0); // an open oc pins the onode! - delete on; - } else { - dout(-20) << "trim_inodes still active: " << *on << dendl; - assert(0); // huh? 
- } - } - - - // cnodes - while (cnode_lru.lru_get_size() > cmax) { - // expire an item - Cnode *cn = (Cnode*)cnode_lru.lru_expire(); - if (cn == 0) break; // nothing to expire - - // expire - dout(20) << "trim_inodes removing cnode " << *cn << dendl; - cnode_map.erase(cn->coll_id); - - delete cn; - } - - dout(10) << "trim_inodes finish " - << onode_lru.lru_get_size() << " / " << omax << " onodes, " - << cnode_lru.lru_get_size() << " / " << cmax << " cnodes" << dendl; -} - - - -// *** cnodes **** - -Cnode* Ebofs::new_cnode(coll_t cid) -{ - Cnode* cn = new Cnode(cid); - - assert(cnode_map.count(cid) == 0); - cnode_map[cid] = cn; - cnode_lru.lru_insert_top(cn); - - assert(collection_tab->lookup(cid) < 0); - collection_tab->insert( cid, cn->cnode_loc ); // even tho i'm not placed yet - - cn->get(); - cn->cnode_loc.start = 0; - cn->cnode_loc.length = 0; - - dirty_cnode(cn); - - return cn; -} - -Cnode* Ebofs::get_cnode(coll_t cid) -{ - while (1) { - // in cache? - if (cnode_map.count(cid)) { - // yay - Cnode *cn = cnode_map[cid]; - cn->get(); - return cn; - } - - // on disk? - Extent cnode_loc; - if (collection_tab->lookup(cid, cnode_loc) < 0) { - // object dne. - return 0; - } - - // already loading? - if (waitfor_cnode.count(cid)) { - // yep, just wait. - Cond c; - waitfor_cnode[cid].push_back(&c); - dout(10) << "get_cnode " << cid << " already loading, waiting" << dendl; - c.Wait(ebofs_lock); - continue; - } - - dout(10) << "get_cnode reading " << cid << " from " << cnode_loc << dendl; - - assert(waitfor_cnode.count(cid) == 0); - waitfor_cnode[cid].clear(); // this should be empty initially. - - // read it! 
- bufferlist bl; - //bufferpool.alloc( EBOFS_BLOCK_SIZE*cnode_loc.length, bl ); - bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*cnode_loc.length) ); - - ebofs_lock.Unlock(); - dev.read( cnode_loc.start, cnode_loc.length, bl ); - ebofs_lock.Lock(); - - // parse data block - Cnode *cn = new Cnode(cid); - - cnode_map[cid] = cn; - cnode_lru.lru_insert_top(cn); - - struct ebofs_cnode *ec = (struct ebofs_cnode*)bl.c_str(); - cn->cnode_loc = ec->cnode_loc; - - // parse attributes - char *p = bl.c_str() + sizeof(*ec); - for (int i=0; inum_attr; i++) { - string key = p; - p += key.length() + 1; - int len = *(int*)(p); - p += sizeof(len); - cn->attr[key] = buffer::copy(p, len); - p += len; - dout(15) << "get_cnode " << *cn << " attr " << key << " len " << len << dendl; - } - - // wake up other waiters - for (list::iterator i = waitfor_cnode[cid].begin(); - i != waitfor_cnode[cid].end(); - i++) - (*i)->Signal(); - waitfor_cnode.erase(cid); // remove Cond list - - cn->get(); - return cn; - } -} - -void Ebofs::encode_cnode(Cnode *cn, bufferlist& bl, unsigned& off) -{ - // cnode - struct ebofs_cnode ec; - ec.cnode_loc = cn->cnode_loc; - ec.coll_id = cn->coll_id; - ec.num_attr = cn->attr.size(); - bl.copy_in(off, sizeof(ec), (char*)&ec); - off += sizeof(ec); - - // attr - for (map::iterator i = cn->attr.begin(); - i != cn->attr.end(); - i++) { - bl.copy_in(off, i->first.length()+1, i->first.c_str()); - off += i->first.length()+1; - int len = i->second.length(); - bl.copy_in(off, sizeof(int), (char*)&len); - off += sizeof(int); - bl.copy_in(off, len, i->second.c_str()); - off += len; - - dout(15) << "write_cnode " << *cn << " attr " << i->first << " len " << len << dendl; - } -} - -void Ebofs::write_cnode(Cnode *cn) -{ - // allocate buffer - unsigned bytes = sizeof(ebofs_cnode) + cn->get_attr_bytes(); - unsigned blocks = (bytes-1)/EBOFS_BLOCK_SIZE + 1; - - bufferlist bl; - //bufferpool.alloc( EBOFS_BLOCK_SIZE*blocks, bl ); - bl.push_back( 
buffer::create_page_aligned(EBOFS_BLOCK_SIZE*blocks) ); - - // (always) relocate cnode! - if (1) { - if (cn->cnode_loc.length) - allocator.release(cn->cnode_loc); - - allocator.allocate(cn->cnode_loc, blocks, Allocator::NEAR_LAST_FWD); - collection_tab->remove( cn->coll_id ); - collection_tab->insert( cn->coll_id, cn->cnode_loc ); - } - - dout(10) << "write_cnode " << *cn << " to " << cn->cnode_loc << dendl; - - unsigned off = 0; - encode_cnode(cn, bl, off); - assert(off == bytes); - - // write - dev.write( cn->cnode_loc.start, cn->cnode_loc.length, bl, - new C_E_InodeFlush(this), "write_cnode" ); -} - -void Ebofs::remove_cnode(Cnode *cn) -{ - dout(10) << "remove_cnode " << *cn << dendl; - - // remove from table - collection_tab->remove(cn->coll_id); - - // free cnode space - if (cn->cnode_loc.length) - allocator.release(cn->cnode_loc); - - // remove from dirty list? - if (cn->is_dirty()) - dirty_cnodes.erase(cn); - - // remove from map and lru - cnode_map.erase(cn->coll_id); - cnode_lru.lru_remove(cn); - - // count down refs - cn->mark_clean(); - cn->put(); - assert(cn->get_ref_count() == 0); - - // hose. 
- delete cn; - - dirty = true; -} - -void Ebofs::put_cnode(Cnode *cn) -{ - cn->put(); -} - -void Ebofs::dirty_cnode(Cnode *cn) -{ - if (!cn->is_dirty()) { - cn->mark_dirty(); - dirty_cnodes.insert(cn); - } - dirty = true; -} - - - - - -void Ebofs::flush_inode_finish() -{ - ebofs_lock.Lock(); - { - inodes_flushing--; - if (inodes_flushing < 1000) - dout(20) << "flush_inode_finish, " << inodes_flushing << " left" << dendl; - if (inodes_flushing == 0) - inode_commit_cond.Signal(); - } - ebofs_lock.Unlock(); -} - -void Ebofs::commit_inodes_start() -{ - dout(10) << "commit_inodes_start" << dendl; - - assert(inodes_flushing == 0); - - // onodes - for (set::iterator i = dirty_onodes.begin(); - i != dirty_onodes.end(); - i++) { - Onode *on = *i; - inodes_flushing++; - write_onode(on); - on->mark_clean(); - on->uncommitted.clear(); // commit allocated blocks - on->commit_waiters.clear(); // these guys are gonna get taken care of, bc we committed. - } - dirty_onodes.clear(); - - // cnodes - for (set::iterator i = dirty_cnodes.begin(); - i != dirty_cnodes.end(); - i++) { - Cnode *cn = *i; - inodes_flushing++; - write_cnode(cn); - cn->mark_clean(); - } - dirty_cnodes.clear(); - - dout(10) << "commit_inodes_start writing " << inodes_flushing << " onodes+cnodes" << dendl; -} - -void Ebofs::commit_inodes_wait() -{ - // caller must hold ebofs_lock - while (inodes_flushing > 0) { - dout(10) << "commit_inodes_wait waiting for " << inodes_flushing << " onodes+cnodes to flush" << dendl; - inode_commit_cond.Wait(ebofs_lock); - } - dout(10) << "commit_inodes_wait all flushed" << dendl; -} - - - - - - - -// *** buffer cache *** - -void Ebofs::trim_buffer_cache() -{ - ebofs_lock.Lock(); - trim_bc(0); - ebofs_lock.Unlock(); -} - -void Ebofs::trim_bc(off_t max) -{ - if (max < 0) - max = g_conf.ebofs_bc_size; - dout(10) << "trim_bc start: size " << bc.get_size() << ", trimmable " << bc.get_trimmable() << ", max " << max << dendl; - - while (bc.get_size() > max && - bc.get_trimmable()) { - 
BufferHead *bh = (BufferHead*) bc.lru_rest.lru_expire(); - if (!bh) break; - - dout(25) << "trim_bc trimming " << *bh << dendl; - assert(bh->is_clean()); - - ObjectCache *oc = bh->oc; - bc.remove_bh(bh); - delete bh; - - if (oc->is_empty()) { - Onode *on = oc->on; - dout(10) << "trim_bc closing oc on " << *on << dendl; - on->close_oc(); - } - } - - dout(10) << "trim_bc finish: size " << bc.get_size() << ", trimmable " << bc.get_trimmable() << ", max " << max << dendl; -} - - -void Ebofs::kick_idle() -{ - dout(10) << "kick_idle" << dendl; - //commit_cond.Signal(); - - ebofs_lock.Lock(); - if (mounted && !unmounting && dirty) { - dout(10) << "kick_idle dirty, doing commit" << dendl; - commit_cond.Signal(); - } else { - dout(10) << "kick_idle !dirty or !mounted or unmounting, doing nothing" << dendl; - } - ebofs_lock.Unlock(); -} - -void Ebofs::sync(Context *onsafe) -{ - ebofs_lock.Lock(); - if (onsafe) { - dirty = true; - - while (1) { - if (journal) { - // journal empty transaction - Transaction t; - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - commit_waiters[super_epoch].push_back(onsafe); - break; - } - } - ebofs_lock.Unlock(); -} - -void Ebofs::sync() -{ - ebofs_lock.Lock(); - if (!dirty) { - dout(7) << "sync in " << super_epoch << ", not dirty" << dendl; - } else { - epoch_t start = super_epoch; - dout(7) << "sync start in " << start << dendl; - while (super_epoch == start) { - dout(7) << "sync kicking commit in " << super_epoch << dendl; - dirty = true; - commit_cond.Signal(); - sync_cond.Wait(ebofs_lock); - } - dout(10) << "sync finish in " << super_epoch << dendl; - } - ebofs_lock.Unlock(); -} - - - -void Ebofs::commit_bc_wait(version_t epoch) -{ - dout(10) << "commit_bc_wait on epoch " << epoch << dendl; - - while (bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE,epoch) > 0 || - bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL,epoch) > 0) { - //dout(10) << "commit_bc_wait " << bc.get_unflushed(epoch) << " unflushed in epoch " << epoch 
<< dendl; - dout(10) << "commit_bc_wait epoch " << epoch - << ", unflushed bhwrite " << bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE) - << ", unflushed partial " << bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL) - << dendl; - bc.waitfor_flush(); - } - - bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE).erase(epoch); - bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL).erase(epoch); - - dout(10) << "commit_bc_wait all flushed for epoch " << epoch - << "; " << bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE) - << " " << bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL) - << dendl; -} - - - -int Ebofs::statfs(struct statfs *buf) -{ - dout(7) << "statfs" << dendl; - - buf->f_type = EBOFS_MAGIC; /* type of filesystem */ - buf->f_bsize = 4096; /* optimal transfer block size */ - buf->f_blocks = dev.get_num_blocks(); /* total data blocks in file system */ - buf->f_bfree = get_free_blocks() - + get_limbo_blocks(); /* free blocks in fs */ - buf->f_bavail = get_free_blocks(); /* free blocks avail to non-superuser -- actually, for writing. */ - buf->f_files = nodepool.get_num_total(); /* total file nodes in file system */ - buf->f_ffree = nodepool.get_num_free(); /* free file nodes in fs */ - //buf->f_fsid = 0; /* file system id */ -#ifndef DARWIN - buf->f_namelen = 8; /* maximum length of filenames */ -#endif // DARWIN - - return 0; -} - - - - -/* - * allocate a write to blocks on disk. - * - take care to not overwrite any "safe" data blocks. - * - allocate/map new extents on disk as necessary - */ -void Ebofs::alloc_write(Onode *on, - block_t start, block_t len, - interval_set& alloc, - block_t& old_bfirst, block_t& old_blast) -{ - // first decide what pages to (re)allocate - alloc.insert(start, len); // start with whole range - - // figure out what bits are already uncommitted - interval_set already_uncom; - already_uncom.intersection_of(alloc, on->uncommitted); - - // subtract those off, so we're left with the committed bits (that must be reallocated). 
- alloc.subtract(already_uncom); - - dout(10) << "alloc_write must (re)alloc " << alloc << " on " << *on << dendl; - - // release it (into limbo) - for (map::iterator i = alloc.m.begin(); - i != alloc.m.end(); - i++) { - // get old region - vector old; - on->map_extents(i->first, i->second, old); - for (unsigned o=0; ofirst == start) { - old_bfirst = old[0].start; - dout(20) << "alloc_write old_bfirst " << old_bfirst << " of " << old[0] << dendl; - } - if (i->first+i->second == start+len) { - old_blast = old[old.size()-1].last(); - dout(20) << "alloc_write old_blast " << old_blast << " of " << old[old.size()-1] << dendl; - } - } - } - - // reallocate uncommitted too? - // ( --> yes. we can always make better allocation decisions later, with more information. ) - if (g_conf.ebofs_realloc) { - list tx; - - ObjectCache *oc = on->get_oc(&bc); - oc->find_tx(start, len, tx); - - for (list::reverse_iterator p = tx.rbegin(); - p != tx.rend(); - p++) { - BufferHead *bh = *p; - - // cancelable/moveable? - if (alloc.contains(bh->start(), bh->length())) { - dout(10) << "alloc_write " << *bh << " already in " << alloc << dendl; - continue; - } - - vector old; - on->map_extents(bh->start(), bh->length(), old); - assert(old.size() == 1); - - if (bh->start() >= start && bh->end() <= start+len) { - assert(bh->epoch_modified == super_epoch); - if (bc.bh_cancel_write(bh, super_epoch)) { - if (bh->length() == 1) - dout(10) << "alloc_write unallocated tx " << old[0] << ", canceled " << *bh << dendl; - // no, this isn't compatible with clone() and extent reference counting. - //allocator.unallocate(old[0]); // release (into free) - allocator.release(old[0]); // **FIXME** no cloning yet, my friend! 
- alloc.insert(bh->start(), bh->length()); - } else { - if (bh->length() == 1) - dout(10) << "alloc_write released tx " << old[0] << ", couldn't cancel " << *bh << dendl; - allocator.release(old[0]); // release (into limbo) - alloc.insert(bh->start(), bh->length()); - } - } else { - if (bh->length() == 1) - dout(10) << "alloc_write skipped tx " << old[0] << ", not entirely within " - << start << "~" << len - << " bh " << *bh << dendl; - } - } - - dout(10) << "alloc_write will (re)alloc " << alloc << " on " << *on << dendl; - } - - if (alloc.empty()) return; // no need to dirty the onode below! - - - // merge alloc into onode uncommitted map - //dout(10) << " union of " << on->uncommitted << " and " << alloc << dendl; - interval_set old = on->uncommitted; - on->uncommitted.union_of(alloc); - - dout(10) << "alloc_write onode.uncommitted is now " << on->uncommitted << dendl; - - if (0) { - // verify - interval_set ta; - ta.intersection_of(on->uncommitted, alloc); - dout(0) << " ta " << ta << dendl; - assert(alloc == ta); - - interval_set tb; - tb.intersection_of(on->uncommitted, old); - dout(0) << " tb " << tb << dendl; - assert(old == tb); - } - - dirty_onode(on); - - // allocate the space - for (map::iterator i = alloc.m.begin(); - i != alloc.m.end(); - i++) { - dout(15) << "alloc_write alloc " << i->first << "~" << i->second << " (of " << start << "~" << len << ")" << dendl; - - // allocate new space - block_t left = i->second; - block_t cur = i->first; - while (left > 0) { - Extent ex; - allocator.allocate(ex, left, Allocator::NEAR_LAST_FWD); - dout(10) << "alloc_write got " << ex << " for object offset " << cur << dendl; - on->set_extent(cur, ex); // map object to new region - left -= ex.length; - cur += ex.length; - } - } -} - - - - -void Ebofs::apply_write(Onode *on, off_t off, size_t len, const bufferlist& bl) -{ - ObjectCache *oc = on->get_oc(&bc); - - // map into blocks - off_t opos = off; // byte pos in object - size_t zleft = 0; // zeros left to write - 
size_t left = len; // bytes left - - block_t bstart = off / EBOFS_BLOCK_SIZE; - - if (off > on->object_size) { - zleft = off - on->object_size; - opos = on->object_size; - bstart = on->object_size / EBOFS_BLOCK_SIZE; - } - if (off+(off_t)len > on->object_size) { - dout(10) << "apply_write extending size on " << *on << ": " << on->object_size - << " -> " << off+len << dendl; - on->object_size = off+len; - dirty_onode(on); - } - if (bl.length() == 0) { - zleft += len; - left = 0; - } else { - assert(bl.length() == len); - } - if (zleft) - dout(10) << "apply_write zeroing " << zleft << " bytes before " << off << "~" << len - << " in " << *on << dendl; - - block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; - block_t blen = blast-bstart+1; - - // allocate write on disk. - interval_set alloc; - block_t old_bfirst = 0; // zero means not defined here (since we ultimately pass to bh_read) - block_t old_blast = 0; - alloc_write(on, bstart, blen, alloc, old_bfirst, old_blast); - dout(20) << "apply_write old_bfirst " << old_bfirst << ", old_blast " << old_blast << dendl; - - if (fake_writes) { - on->uncommitted.clear(); // worst case! - return; - } - - // map b range onto buffer_heads - map hits; - oc->map_write(bstart, blen, hits, super_epoch); - - // get current versions - //version_t lowv, highv; - //oc->scan_versions(bstart, blen, lowv, highv); - //highv++; - version_t highv = ++oc->write_count; - - // copy from bl into buffer cache - unsigned blpos = 0; // byte pos in input buffer - - // write data into buffers - for (map::iterator i = hits.begin(); - i != hits.end(); - i++) { - BufferHead *bh = i->second; - bh->set_version(highv); - bh->epoch_modified = super_epoch; - - // old write in progress? 
- if (bh->is_tx()) { // copy the buffer to avoid munging up in-flight write - dout(10) << "apply_write tx pending, copying buffer on " << *bh << dendl; - bufferlist temp; - temp.claim(bh->data); - //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); - bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - bh->data.copy_in(0, bh->length()*EBOFS_BLOCK_SIZE, temp); - } - - // need to split off partial? (partials can only be ONE block) - if ((bh->is_missing() || bh->is_rx()) && bh->length() > 1) { - if ((bh->start() == bstart && opos % EBOFS_BLOCK_SIZE != 0)) { - BufferHead *right = bc.split(bh, bh->start()+1); - hits[right->start()] = right; - dout(10) << "apply_write split off left block for partial write; rest is " << *right << dendl; - } - if ((bh->last() == blast && (len+off) % EBOFS_BLOCK_SIZE != 0) && - ((off_t)len+off < on->object_size)) { - BufferHead *right = bc.split(bh, bh->last()); - hits[right->start()] = right; - dout(10) << "apply_write split off right block for upcoming partial write; rest is " << *right << dendl; - } - } - - // partial at head or tail? - if ((bh->start() == bstart && opos % EBOFS_BLOCK_SIZE != 0) || // opos, not off, in case we're zeroing... 
- (bh->last() == blast && ((off_t)len+off) % EBOFS_BLOCK_SIZE != 0 && ((off_t)len+off) < on->object_size)) { - // locate ourselves in bh - unsigned off_in_bh = opos - bh->start()*EBOFS_BLOCK_SIZE; - assert(off_in_bh >= 0); - unsigned len_in_bh = MIN( (off_t)(zleft+left), - (off_t)(bh->end()*EBOFS_BLOCK_SIZE)-opos ); - - if (bh->is_partial() || bh->is_rx() || bh->is_missing()) { - assert(bh->is_partial() || bh->is_rx() || bh->is_missing()); - assert(bh->length() == 1); - - // add frag to partial - dout(10) << "apply_write writing into partial " << *bh << ":" - << " off_in_bh " << off_in_bh - << " len_in_bh " << len_in_bh - << dendl; - unsigned z = MIN( zleft, len_in_bh ); - if (z) { - bufferptr zp(z); - zp.zero(); - bufferlist zb; - zb.push_back(zp); - bh->add_partial(off_in_bh, zb); - zleft -= z; - opos += z; - } - - bufferlist sb; - sb.substr_of(bl, blpos, len_in_bh-z); // substr in existing buffer - bufferlist cp; - cp.append(sb.c_str(), len_in_bh-z); // copy the partial bit! - bh->add_partial(off_in_bh, cp); - left -= len_in_bh-z; - blpos += len_in_bh-z; - opos += len_in_bh-z; - - if (bh->partial_is_complete(on->object_size - bh->start()*EBOFS_BLOCK_SIZE)) { - dout(10) << "apply_write completed partial " << *bh << dendl; - //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); // new buffers! - bh->data.clear(); - bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - bh->data.zero(); - bh->apply_partial(); - bc.mark_dirty(bh); - bc.bh_write(on, bh); - } - else if (bh->is_rx()) { - dout(10) << "apply_write rx -> partial " << *bh << dendl; - assert(bh->length() == 1); - bc.mark_partial(bh); - bc.bh_queue_partial_write(on, bh); // queue the eventual write - } - else if (bh->is_missing()) { - dout(10) << "apply_write missing -> partial " << *bh << dendl; - assert(bh->length() == 1); - bc.mark_partial(bh); - - // take care to read from _old_ disk block locations! 
- if (bh->start() == bstart) - bc.bh_read(on, bh, old_bfirst); - else if (bh->start() == blast) - bc.bh_read(on, bh, old_blast); - else assert(0); - - bc.bh_queue_partial_write(on, bh); // queue the eventual write - } - else if (bh->is_partial()) { - dout(10) << "apply_write already partial, no need to submit rx on " << *bh << dendl; - if (bh->partial_tx_epoch == super_epoch) - bc.bh_cancel_partial_write(bh); - bc.bh_queue_partial_write(on, bh); // queue the eventual write - } - - - } else { - assert(bh->is_clean() || bh->is_dirty() || bh->is_tx()); - - // just write into the bh! - dout(10) << "apply_write writing leading/tailing partial into " << *bh << ":" - << " off_in_bh " << off_in_bh - << " len_in_bh " << len_in_bh - << dendl; - - // copy data into new buffers first (copy on write!) - // FIXME: only do the modified pages? this might be a big bh! - bufferlist temp; - temp.claim(bh->data); - //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); - bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - bh->data.copy_in(0, bh->length()*EBOFS_BLOCK_SIZE, temp); - - unsigned z = MIN( zleft, len_in_bh ); - if (z) { - bufferptr zp(z); - zp.zero(); - bufferlist zb; - zb.push_back(zp); - bh->data.copy_in(off_in_bh, z, zb); - zleft -= z; - opos += z; - } - - bufferlist sub; - sub.substr_of(bl, blpos, len_in_bh-z); - bh->data.copy_in(off_in_bh+z, len_in_bh-z, sub); - - blpos += len_in_bh-z; - left -= len_in_bh-z; - opos += len_in_bh-z; - - if (!bh->is_dirty()) - bc.mark_dirty(bh); - - bc.bh_write(on, bh); - } - continue; - } - - // ok, we're talking full block(s) now (modulo last block of the object) - assert(opos % EBOFS_BLOCK_SIZE == 0); - assert((off_t)(zleft+left) >= (off_t)(EBOFS_BLOCK_SIZE*bh->length()) || - opos+(off_t)(zleft+left) == on->object_size); - - unsigned len_in_bh = MIN(bh->length()*EBOFS_BLOCK_SIZE, zleft+left); - assert(len_in_bh <= zleft+left); - - dout(10) << "apply_write writing into " << *bh << ":" - << " 
len_in_bh " << len_in_bh - << dendl; - - // i will write: - unsigned z = MIN(len_in_bh, zleft); - bufferlist sub; - sub.substr_of(bl, blpos, len_in_bh-z); - - if (!z && - sub.is_page_aligned() && - sub.is_n_page_sized()) { - // assume caller isn't going to modify written buffers. - // just refrence them! - dout(10) << "apply_write yippee, written buffer already page aligned" << dendl; - bh->data.claim(sub); - } else { - // alloc new buffers. - bh->data.clear(); - bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - - if (z) { - bufferptr zp(z); - zp.zero(); - bufferlist zb; - zb.push_back(zp); - bh->data.copy_in(0, z, zb); - zleft -= z; - } - - bufferlist sub; - sub.substr_of(bl, blpos, len_in_bh-z); - bh->data.copy_in(z, len_in_bh-z, sub); - } - - blpos += len_in_bh-z; - left -= len_in_bh-z; - opos += len_in_bh; - - // old partial? - if (bh->is_partial() && - bh->partial_tx_epoch == super_epoch) - bc.bh_cancel_partial_write(bh); - - // mark dirty - if (!bh->is_dirty()) - bc.mark_dirty(bh); - - bc.bh_write(on, bh); - } - - assert(zleft == 0); - assert(left == 0); - assert(opos == off+(off_t)len); - //assert(blpos == bl.length()); -} - - - - -// *** file i/o *** - -bool Ebofs::attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, - Cond *will_wait_on, bool *will_wait_on_bool) -{ - dout(10) << "attempt_read " << *on << " " << off << "~" << len << dendl; - ObjectCache *oc = on->get_oc(&bc); - - // map - block_t bstart = off / EBOFS_BLOCK_SIZE; - block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; - block_t blen = blast-bstart+1; - - map hits; - map missing; // read these - map rx; // wait for these - map partials; // ?? - oc->map_read(bstart, blen, hits, missing, rx, partials); - - // missing buffers? 
- if (!missing.empty()) { - for (map::iterator i = missing.begin(); - i != missing.end(); - i++) { - dout(10) << "attempt_read missing buffer " << *(i->second) << dendl; - bc.bh_read(on, i->second); - } - BufferHead *wait_on = missing.begin()->second; - block_t b = MAX(wait_on->start(), bstart); - wait_on->waitfor_read[b].push_back(new C_Cond(will_wait_on, will_wait_on_bool)); - return false; - } - - // are partials sufficient? - bool partials_ok = true; - for (map::iterator i = partials.begin(); - i != partials.end(); - i++) { - BufferHead *bh = i->second; - off_t bhstart = (off_t)(bh->start()*EBOFS_BLOCK_SIZE); - off_t bhend = (off_t)(bh->end()*EBOFS_BLOCK_SIZE); - off_t start = MAX( off, bhstart ); - off_t end = MIN( off+(off_t)len, bhend ); - - if (!i->second->have_partial_range(start-bhstart, end-bhend)) { - if (partials_ok) { - // wait on this one - Context *c = new C_Cond(will_wait_on, will_wait_on_bool); - dout(10) << "attempt_read insufficient partial buffer " << *(i->second) << " c " << c << dendl; - i->second->waitfor_read[i->second->start()].push_back(c); - } - partials_ok = false; - } - } - if (!partials_ok) return false; - - // wait on rx? - if (!rx.empty()) { - BufferHead *wait_on = rx.begin()->second; - Context *c = new C_Cond(will_wait_on, will_wait_on_bool); - dout(20) << "attempt_read waiting for read to finish on " << *wait_on << " c " << c << dendl; - block_t b = MAX(wait_on->start(), bstart); - wait_on->waitfor_read[b].push_back(c); - return false; - } - - // yay, we have it all! - // concurrently walk thru hits, partials. 
- map::iterator h = hits.begin(); - map::iterator p = partials.begin(); - - bl.clear(); - off_t pos = off; - block_t curblock = bstart; - while (curblock <= blast) { - BufferHead *bh = 0; - if (h->first == curblock) { - bh = h->second; - h++; - } else if (p->first == curblock) { - bh = p->second; - p++; - } else assert(0); - - off_t bhstart = (off_t)(bh->start()*EBOFS_BLOCK_SIZE); - off_t bhend = (off_t)(bh->end()*EBOFS_BLOCK_SIZE); - off_t start = MAX( pos, bhstart ); - off_t end = MIN( off+(off_t)len, bhend ); - - if (bh->is_partial()) { - // copy from a partial block. yuck! - bufferlist frag; - bh->copy_partial_substr( start-bhstart, end-bhstart, frag ); - bl.claim_append( frag ); - pos += frag.length(); - } else { - // copy from a full block. - if (bhstart == start && bhend == end) { - bl.append( bh->data ); - pos += bh->data.length(); - } else { - bufferlist frag; - dout(10) << "substr " << (start-bhstart) << "~" << (end-start) << " of " << bh->data.length() << " in " << *bh << dendl; - frag.substr_of(bh->data, start-bhstart, end-start); - pos += frag.length(); - bl.claim_append( frag ); - } - } - - curblock = bh->end(); - /* this assert is more trouble than it's worth - assert((off_t)(curblock*EBOFS_BLOCK_SIZE) == pos || // should be aligned with next block - end != bhend || // or we ended midway through bh - (bh->last() == blast && end == bhend)); // ended last block ** FIXME WRONG??? - */ - } - - assert(bl.length() == len); - return true; -} - - -/* - * is_cached -- query whether a object extent is in our cache - * return value of -1 if onode isn't loaded. otherwise, the number - * of extents that need to be read (i.e. # of seeks) - */ -int Ebofs::is_cached(object_t oid, off_t off, size_t len) -{ - ebofs_lock.Lock(); - int r = _is_cached(oid, off, len); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_is_cached(object_t oid, off_t off, size_t len) -{ - if (!have_onode(oid)) { - dout(7) << "_is_cached " << oid << " " << off << "~" << len << " ... 
onode " << dendl; - return -1; // object dne? - } - Onode *on = get_onode(oid); - - if (!on->have_oc()) { - // nothing is cached. return # of extents in file. - dout(10) << "_is_cached have onode but no object cache, returning extent count" << dendl; - return on->extent_map.size(); - } - - // map - block_t bstart = off / EBOFS_BLOCK_SIZE; - block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; - block_t blen = blast-bstart+1; - - map hits; - map missing; // read these - map rx; // wait for these - map partials; // ?? - - int num_missing = on->get_oc(&bc)->try_map_read(bstart, blen); - dout(7) << "_is_cached try_map_read reports " << num_missing << " missing extents" << dendl; - return num_missing; - - // FIXME: actually, we should calculate if these extents are contiguous. - // and not using map_read, probably... - /* hrmpf - block_t dpos = 0; - block_t opos = bstart; - while (opos < blen) { - if (hits.begin()->first == opos) { - } else { - block_t d; - if (missing.begin()->first == opos) d = missing.begin()->second. - - } - */ -} - -void Ebofs::trim_from_cache(object_t oid, off_t off, size_t len) -{ - ebofs_lock.Lock(); - _trim_from_cache(oid, off, len); - ebofs_lock.Unlock(); -} - -void Ebofs::_trim_from_cache(object_t oid, off_t off, size_t len) -{ - // be careful not to load it if we don't have it - if (!have_onode(oid)) { - dout(7) << "_trim_from_cache " << oid << " " << off << "~" << len << " ... onode not in cache " << dendl; - return; - } - - // ok, we have it, get a pointer. - Onode *on = get_onode(oid); - - if (!on->have_oc()) - return; // nothing is cached. 
- - // map to blocks - block_t bstart = off / EBOFS_BLOCK_SIZE; - block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; - - ObjectCache *oc = on->get_oc(&bc); - oc->touch_bottom(bstart, blast); - - return; -} - - -int Ebofs::read(object_t oid, - off_t off, size_t len, - bufferlist& bl) -{ - ebofs_lock.Lock(); - int r = _read(oid, off, len, bl); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_read(object_t oid, off_t off, size_t len, bufferlist& bl) -{ - dout(7) << "_read " << oid << " " << off << "~" << len << dendl; - - Onode *on = get_onode(oid); - if (!on) { - dout(7) << "_read " << oid << " " << off << "~" << len << " ... dne " << dendl; - return -ENOENT; // object dne? - } - - // read data into bl. block as necessary. - Cond cond; - - int r = 0; - while (1) { - // check size bound - if (off >= on->object_size) { - dout(7) << "_read " << oid << " " << off << "~" << len << " ... off past eof " << on->object_size << dendl; - r = -ESPIPE; // FIXME better errno? - break; - } - - size_t try_len = len ? len:on->object_size; - size_t will_read = MIN(off+(off_t)try_len, on->object_size) - off; - - bool done; - if (attempt_read(on, off, will_read, bl, &cond, &done)) - break; // yay - - // wait - while (!done) - cond.Wait(ebofs_lock); - - if (on->deleted) { - dout(7) << "_read " << oid << " " << off << "~" << len << " ... object deleted" << dendl; - r = -ENOENT; - break; - } - } - - put_onode(on); - - trim_bc(); - - if (r < 0) return r; // return error, - dout(7) << "_read " << oid << " " << off << "~" << len << " ... got " << bl.length() << dendl; - return bl.length(); // or bytes read. 
-} - - -bool Ebofs::_write_will_block() -{ - return (bc.get_stat_dirty()+bc.get_stat_tx() > g_conf.ebofs_bc_max_dirty); -} - -bool Ebofs::write_will_block() -{ - ebofs_lock.Lock(); - bool b = _write_will_block(); - ebofs_lock.Unlock(); - return b; -} - - -unsigned Ebofs::apply_transaction(Transaction& t, Context *onsafe) -{ - ebofs_lock.Lock(); - dout(7) << "apply_transaction start (" << t.get_num_ops() << " ops)" << dendl; - - unsigned r = _apply_transaction(t); - - // journal, wait for commit - if (r != 0 && onsafe) { - delete onsafe; // kill callback, but still journal below (in case transaction had side effects) - onsafe = 0; - } - while (1) { - if (journal) { - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - - ebofs_lock.Unlock(); - return r; -} - -unsigned Ebofs::_apply_transaction(Transaction& t) -{ - // do ops - unsigned r = 0; // bit fields indicate which ops failed. - int bit = 1; - while (t.have_op()) { - int op = t.get_op(); - switch (op) { - case Transaction::OP_READ: - { - object_t oid; - t.get_oid(oid); - off_t offset, len; - t.get_length(offset); - t.get_length(len); - bufferlist *pbl; - t.get_pbl(pbl); - if (_read(oid, offset, len, *pbl) < 0) { - dout(7) << "apply_transaction fail on _read" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_STAT: - { - object_t oid; - t.get_oid(oid); - struct stat *st; - t.get_pstat(st); - if (_stat(oid, st) < 0) { - dout(7) << "apply_transaction fail on _stat" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_GETATTR: - { - object_t oid; - t.get_oid(oid); - const char *attrname; - t.get_attrname(attrname); - pair pattrval; - t.get_pattrval(pattrval); - if ((*(pattrval.second) = _getattr(oid, attrname, pattrval.first, *(pattrval.second))) < 0) { - dout(7) << "apply_transaction fail on _getattr" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_GETATTRS: - { - 
object_t oid; - t.get_oid(oid); - map *pset; - t.get_pattrset(pset); - if (_getattrs(oid, *pset) < 0) { - dout(7) << "apply_transaction fail on _getattrs" << dendl; - r &= bit; - } - } - break; - - - case Transaction::OP_WRITE: - { - object_t oid; - t.get_oid(oid); - off_t offset, len; - t.get_length(offset); - t.get_length(len); - bufferlist bl; - t.get_bl(bl); - if (_write(oid, offset, len, bl) < 0) { - dout(7) << "apply_transaction fail on _write" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_TRIMCACHE: - { - object_t oid; - t.get_oid(oid); - off_t offset, len; - t.get_length(offset); - t.get_length(len); - _trim_from_cache(oid, offset, len); - } - break; - - case Transaction::OP_TRUNCATE: - { - object_t oid; - t.get_oid(oid); - off_t len; - t.get_length(len); - if (_truncate(oid, len) < 0) { - dout(7) << "apply_transaction fail on _truncate" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_REMOVE: - { - object_t oid; - t.get_oid(oid); - if (_remove(oid) < 0) { - dout(7) << "apply_transaction fail on _remove" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_SETATTR: - { - object_t oid; - t.get_oid(oid); - const char *attrname; - t.get_attrname(attrname); - bufferlist bl; - t.get_bl(bl); - if (_setattr(oid, attrname, bl.c_str(), bl.length()) < 0) { - dout(7) << "apply_transaction fail on _setattr" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_SETATTRS: - { - object_t oid; - t.get_oid(oid); - map *pattrset; - t.get_pattrset(pattrset); - if (_setattrs(oid, *pattrset) < 0) { - dout(7) << "apply_transaction fail on _setattrs" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_RMATTR: - { - object_t oid; - t.get_oid(oid); - const char *attrname; - t.get_attrname(attrname); - if (_rmattr(oid, attrname) < 0) { - dout(7) << "apply_transaction fail on _rmattr" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_CLONE: - { - object_t oid; - t.get_oid(oid); - object_t noid; - 
t.get_oid(noid); - if (_clone(oid, noid) < 0) { - dout(7) << "apply_transaction fail on _clone" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_MKCOLL: - { - coll_t cid; - t.get_cid(cid); - if (_create_collection(cid) < 0) { - dout(7) << "apply_transaction fail on _create_collection" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_RMCOLL: - { - coll_t cid; - t.get_cid(cid); - if (_destroy_collection(cid) < 0) { - dout(7) << "apply_transaction fail on _destroy_collection" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_COLL_ADD: - { - coll_t cid; - t.get_cid(cid); - object_t oid; - t.get_oid(oid); - if (_collection_add(cid, oid) < 0) { - //dout(7) << "apply_transaction fail on _collection_add" << dendl; - //r &= bit; - } - } - break; - - case Transaction::OP_COLL_REMOVE: - { - coll_t cid; - t.get_cid(cid); - object_t oid; - t.get_oid(oid); - if (_collection_remove(cid, oid) < 0) { - dout(7) << "apply_transaction fail on _collection_remove" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_COLL_SETATTR: - { - coll_t cid; - t.get_cid(cid); - const char *attrname; - t.get_attrname(attrname); - bufferlist bl; - t.get_bl(bl); - if (_collection_setattr(cid, attrname, bl.c_str(), bl.length()) < 0) { - //if (_collection_setattr(cid, attrname, attrval.first, attrval.second) < 0) { - dout(7) << "apply_transaction fail on _collection_setattr" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_COLL_RMATTR: - { - coll_t cid; - t.get_cid(cid); - const char *attrname; - t.get_attrname(attrname); - if (_collection_rmattr(cid, attrname) < 0) { - dout(7) << "apply_transaction fail on _collection_rmattr" << dendl; - r &= bit; - } - } - break; - - default: - dout(0) << "bad op " << op << dendl; - assert(0); - } - - bit = bit << 1; - } - - dout(7) << "_apply_transaction finish (r = " << r << ")" << dendl; - return r; -} - - - -int Ebofs::_write(object_t oid, off_t offset, size_t length, const bufferlist& bl) -{ - 
dout(7) << "_write " << oid << " " << offset << "~" << length << dendl; - assert(bl.length() == length); - - // too much unflushed dirty data? (if so, block!) - if (_write_will_block()) { - dout(10) << "_write blocking " - << oid << " " << offset << "~" << length - << " bc: " - << "size " << bc.get_size() - << ", trimmable " << bc.get_trimmable() - << ", max " << g_conf.ebofs_bc_size - << "; dirty " << bc.get_stat_dirty() - << ", tx " << bc.get_stat_tx() - << ", max dirty " << g_conf.ebofs_bc_max_dirty - << dendl; - - while (_write_will_block()) - bc.waitfor_stat(); // waits on ebofs_lock - - dout(10) << "_write unblocked " - << oid << " " << offset << "~" << length - << " bc: " - << "size " << bc.get_size() - << ", trimmable " << bc.get_trimmable() - << ", max " << g_conf.ebofs_bc_size - << "; dirty " << bc.get_stat_dirty() - << ", tx " << bc.get_stat_tx() - << ", max dirty " << g_conf.ebofs_bc_max_dirty - << dendl; - } - - // out of space? - unsigned max = (length+offset) / EBOFS_BLOCK_SIZE + 10; // very conservative; assumes we have to rewrite - max += dirty_onodes.size() + dirty_cnodes.size(); - if (max >= free_blocks) { - dout(1) << "write failing, only " << free_blocks << " blocks free, may need up to " << max << dendl; - return -ENOSPC; - } - - // get|create inode - Onode *on = get_onode(oid); - if (!on) on = new_onode(oid); // new inode! - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - dirty_onode(on); // dirty onode! - - // apply write to buffer cache - if (length > 0) - apply_write(on, offset, length, bl); - - // done. 
- put_onode(on); - trim_bc(); - - return length; -} - - -int Ebofs::write(object_t oid, - off_t off, size_t len, - const bufferlist& bl, Context *onsafe) -{ - ebofs_lock.Lock(); - - // go - int r = _write(oid, off, len, bl); - - // commit waiter - if (r > 0) { - assert((size_t)r == len); - while (1) { - if (journal) { - Transaction t; - t.write(oid, off, len, bl); - bufferlist tbl; - t._encode(tbl); - if (journal->submit_entry(tbl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - - -int Ebofs::_remove(object_t oid) -{ - dout(7) << "_remove " << oid << dendl; - - // get inode - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - // ok remove it! - remove_onode(on); - - return 0; -} - - -int Ebofs::remove(object_t oid, Context *onsafe) -{ - ebofs_lock.Lock(); - - // do it - int r = _remove(oid); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.remove(oid); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_truncate(object_t oid, off_t size) -{ - dout(7) << "_truncate " << oid << " size " << size << dendl; - - Onode *on = get_onode(oid); - if (!on) - return -ENOENT; - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - int r = 0; - if (size > on->object_size) { - r = -EINVAL; // whatever - } - else if (size < on->object_size) { - // change size - on->object_size = size; - dirty_onode(on); - - // free blocks - block_t nblocks = 0; - if (size) nblocks = 1 + (size-1) / EBOFS_BLOCK_SIZE; - if (on->object_blocks > nblocks) { - vector extra; - on->truncate_extents(nblocks, extra); - for (unsigned i=0; ioc) { - on->oc->truncate(on->object_blocks, super_epoch); - if 
(on->oc->is_empty()) - on->close_oc(); - } - - // update uncommitted - interval_set uncom; - if (nblocks > 0) { - interval_set left; - left.insert(0, nblocks); - uncom.intersection_of(left, on->uncommitted); - } - dout(10) << "uncommitted was " << on->uncommitted << " now " << uncom << dendl; - on->uncommitted = uncom; - - } - else { - assert(size == on->object_size); - } - - put_onode(on); - return r; -} - - -int Ebofs::truncate(object_t oid, off_t size, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _truncate(oid, size); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.truncate(oid, size); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - - - -int Ebofs::clone(object_t from, object_t to, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _clone(from, to); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.clone(from, to); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_clone(object_t from, object_t to) -{ - dout(7) << "_clone " << from << " -> " << to << dendl; - - if (!g_conf.ebofs_cloneable) - return -1; // no! 
- - Onode *fon = get_onode(from); - if (!fon) return -ENOENT; - Onode *ton = get_onode(to); - if (ton) { - put_onode(fon); - put_onode(ton); - return -EEXIST; - } - ton = new_onode(to); - assert(ton); - - // copy easy bits - ton->readonly = true; - ton->object_size = fon->object_size; - ton->object_blocks = fon->object_blocks; - ton->attr = fon->attr; - - // collections - for (set::iterator p = fon->collections.begin(); - p != fon->collections.end(); - p++) - _collection_add(*p, to); - - // extents - ton->extent_map = fon->extent_map; - for (map::iterator p = ton->extent_map.begin(); - p != ton->extent_map.end(); - ++p) { - allocator.alloc_inc(p->second); - } - - // clear uncommitted - fon->uncommitted.clear(); - - // muck with ObjectCache - if (fon->oc) - fon->oc->clone_to( ton ); - - // ok! - put_onode(ton); - put_onode(fon); - return 0; -} - - - - -/* - * pick object revision with rev < specified rev. - * (oid.rev is a noninclusive upper bound.) - * - */ -int Ebofs::pick_object_revision_lt(object_t& oid) -{ - assert(oid.rev > 0); // this is only useful for non-zero oid.rev - - int r = -EEXIST; // return code - ebofs_lock.Lock(); - { - object_t orig = oid; - object_t live = oid; - live.rev = 0; - - if (object_tab->get_num_keys() > 0) { - Table::Cursor cursor(object_tab); - - object_tab->find(oid, cursor); // this will be just _past_ highest eligible rev - if (cursor.move_left() > 0) { - bool firstpass = true; - while (1) { - object_t t = cursor.current().key; - if (t.ino != oid.ino || - t.bno != oid.bno) // passed to previous object - break; - if (oid.rev < t.rev) { // rev < desired. possible match. - r = 0; - oid = t; - break; - } - if (firstpass && oid.rev >= t.rev) { // there is no old rev < desired. try live. 
- r = 0; - oid = live; - break; - } - if (cursor.move_left() <= 0) break; - firstpass = false; - } - } - } - - dout(8) << "find_object_revision " << orig << " -> " << oid - << " r=" << r << dendl; - } - ebofs_lock.Unlock(); - return r; -} - - - - -bool Ebofs::exists(object_t oid) -{ - ebofs_lock.Lock(); - dout(8) << "exists " << oid << dendl; - bool e = (object_tab->lookup(oid) == 0); - ebofs_lock.Unlock(); - return e; -} - -int Ebofs::stat(object_t oid, struct stat *st) -{ - ebofs_lock.Lock(); - int r = _stat(oid,st); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_stat(object_t oid, struct stat *st) -{ - dout(7) << "_stat " << oid << dendl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - // ?? - st->st_size = on->object_size; - - put_onode(on); - return 0; -} - - -int Ebofs::_setattr(object_t oid, const char *name, const void *value, size_t size) -{ - dout(8) << "setattr " << oid << " '" << name << "' len " << size << dendl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - string n(name); - on->attr[n] = buffer::copy((char*)value, size); - dirty_onode(on); - put_onode(on); - - dout(8) << "setattr " << oid << " '" << name << "' len " << size << " success" << dendl; - - return 0; -} - -int Ebofs::setattr(object_t oid, const char *name, const void *value, size_t size, Context *onsafe) -{ - ebofs_lock.Lock(); - int r = _setattr(oid, name, value, size); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.setattr(oid, name, value, size); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_setattrs(object_t oid, map& attrset) -{ - dout(8) << "setattrs " << oid << dendl; - - Onode *on = get_onode(oid); - if (!on) return 
-ENOENT; - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - on->attr = attrset; - dirty_onode(on); - put_onode(on); - return 0; -} - -int Ebofs::setattrs(object_t oid, map& attrset, Context *onsafe) -{ - ebofs_lock.Lock(); - int r = _setattrs(oid, attrset); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.setattrs(oid, attrset); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - - -int Ebofs::get_object_collections(object_t oid, set& ls) -{ - ebofs_lock.Lock(); - int r = _get_object_collections(oid, ls); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_get_object_collections(object_t oid, set& ls) -{ - dout(8) << "_get_object_collections " << oid << dendl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - ls = on->collections; - put_onode(on); - return 0; -} - -int Ebofs::getattr(object_t oid, const char *name, void *value, size_t size) -{ - ebofs_lock.Lock(); - int r = _getattr(oid, name, value, size); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_getattr(object_t oid, const char *name, void *value, size_t size) -{ - dout(8) << "_getattr " << oid << " '" << name << "' maxlen " << size << dendl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - string n(name); - int r = 0; - if (on->attr.count(n) == 0) { - dout(10) << "_getattr " << oid << " '" << name << "' dne" << dendl; - r = -1; - } else { - r = MIN( on->attr[n].length(), size ); - dout(10) << "_getattr " << oid << " '" << name << "' got len " << r << dendl; - memcpy(value, on->attr[n].c_str(), r ); - } - put_onode(on); - return r; -} - -int Ebofs::getattrs(object_t oid, map &aset) -{ - ebofs_lock.Lock(); - int r = _getattrs(oid, aset); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_getattrs(object_t oid, map &aset) 
-{ - dout(8) << "_getattrs " << oid << dendl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - aset = on->attr; - put_onode(on); - return 0; -} - - - -int Ebofs::_rmattr(object_t oid, const char *name) -{ - dout(8) << "_rmattr " << oid << " '" << name << "'" << dendl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - string n(name); - on->attr.erase(n); - dirty_onode(on); - put_onode(on); - return 0; -} - -int Ebofs::rmattr(object_t oid, const char *name, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _rmattr(oid, name); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.rmattr(oid, name); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::listattr(object_t oid, vector& attrs) -{ - ebofs_lock.Lock(); - dout(8) << "listattr " << oid << dendl; - - Onode *on = get_onode(oid); - if (!on) { - ebofs_lock.Unlock(); - return -ENOENT; - } - - attrs.clear(); - for (map::iterator i = on->attr.begin(); - i != on->attr.end(); - i++) { - attrs.push_back(i->first); - } - - put_onode(on); - ebofs_lock.Unlock(); - return 0; -} - -int Ebofs::list_objects(list& ls) -{ - ebofs_lock.Lock(); - dout(9) << "list_objects " << dendl; - - Table::Cursor cursor(object_tab); - - int num = 0; - if (object_tab->find(object_t(), cursor) >= 0) { - while (1) { - ls.push_back(cursor.current().key); - num++; - if (cursor.move_right() <= 0) break; - } - } - - ebofs_lock.Unlock(); - return num; -} - - -/***************** collections ******************/ - -int Ebofs::list_collections(list& ls) -{ - ebofs_lock.Lock(); - dout(9) << "list_collections " << dendl; - - Table::Cursor cursor(collection_tab); - - int num = 0; - if (collection_tab->find(0, cursor) >= 0) { 
- while (1) { - ls.push_back(cursor.current().key); - num++; - if (cursor.move_right() <= 0) break; - } - } - - ebofs_lock.Unlock(); - return num; -} - -int Ebofs::_create_collection(coll_t cid) -{ - dout(9) << "_create_collection " << hex << cid << dec << dendl; - - if (_collection_exists(cid)) - return -EEXIST; - - Cnode *cn = new_cnode(cid); - put_cnode(cn); - - return 0; -} - -int Ebofs::create_collection(coll_t cid, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _create_collection(cid); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.create_collection(cid); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_destroy_collection(coll_t cid) -{ - dout(9) << "_destroy_collection " << hex << cid << dec << dendl; - - if (!_collection_exists(cid)) - return -ENOENT; - - Cnode *cn = get_cnode(cid); - assert(cn); - - // hose mappings - list objects; - collection_list(cid, objects); - for (list::iterator i = objects.begin(); - i != objects.end(); - i++) { - co_tab->remove(coll_object_t(cid,*i)); - - Onode *on = get_onode(*i); - if (on) { - on->collections.erase(cid); - dirty_onode(on); - put_onode(on); - } - } - - remove_cnode(cn); - return 0; -} - -int Ebofs::destroy_collection(coll_t cid, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _destroy_collection(cid); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.remove_collection(cid); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -bool Ebofs::collection_exists(coll_t cid) -{ - ebofs_lock.Lock(); - dout(10) << 
"collection_exists " << hex << cid << dec << dendl; - bool r = _collection_exists(cid); - ebofs_lock.Unlock(); - return r; -} -bool Ebofs::_collection_exists(coll_t cid) -{ - return (collection_tab->lookup(cid) == 0); -} - -int Ebofs::_collection_add(coll_t cid, object_t oid) -{ - dout(9) << "_collection_add " << hex << cid << " object " << oid << dec << dendl; - - if (!_collection_exists(cid)) - return -ENOENT; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - int r = 0; - - if (on->collections.count(cid) == 0) { - on->collections.insert(cid); - dirty_onode(on); - co_tab->insert(coll_object_t(cid,oid), true); - } else { - r = -ENOENT; // FIXME? already in collection. - } - - put_onode(on); - return r; -} - -int Ebofs::collection_add(coll_t cid, object_t oid, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _collection_add(cid, oid); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.collection_add(cid, oid); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return 0; -} - -int Ebofs::_collection_remove(coll_t cid, object_t oid) -{ - dout(9) << "_collection_remove " << hex << cid << " object " << oid << dec << dendl; - - if (!_collection_exists(cid)) - return -ENOENT; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - int r = 0; - - if (on->collections.count(cid)) { - on->collections.erase(cid); - dirty_onode(on); - co_tab->remove(coll_object_t(cid,oid)); - } else { - r = -ENOENT; // FIXME? 
- } - - put_onode(on); - return r; -} - -int Ebofs::collection_remove(coll_t cid, object_t oid, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _collection_remove(cid, oid); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.collection_remove(cid, oid); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return 0; -} - -int Ebofs::collection_list(coll_t cid, list& ls) -{ - ebofs_lock.Lock(); - dout(9) << "collection_list " << hex << cid << dec << dendl; - - if (!_collection_exists(cid)) { - ebofs_lock.Unlock(); - return -ENOENT; - } - - Table::Cursor cursor(co_tab); - - int num = 0; - if (co_tab->find(coll_object_t(cid,object_t()), cursor) >= 0) { - while (1) { - const coll_t c = cursor.current().key.first; - const object_t o = cursor.current().key.second; - if (c != cid) break; // end! 
- dout(10) << "collection_list " << hex << cid << " includes " << o << dec << dendl; - ls.push_back(o); - num++; - if (cursor.move_right() < 0) break; - } - } - - ebofs_lock.Unlock(); - return num; -} - - -int Ebofs::_collection_setattr(coll_t cid, const char *name, const void *value, size_t size) -{ - dout(10) << "_collection_setattr " << hex << cid << dec << " '" << name << "' len " << size << dendl; - - Cnode *cn = get_cnode(cid); - if (!cn) return -ENOENT; - - string n(name); - cn->attr[n] = buffer::copy((char*)value, size); - dirty_cnode(cn); - put_cnode(cn); - - return 0; -} - -int Ebofs::collection_setattr(coll_t cid, const char *name, const void *value, size_t size, Context *onsafe) -{ - ebofs_lock.Lock(); - dout(10) << "collection_setattr " << hex << cid << dec << " '" << name << "' len " << size << dendl; - - int r = _collection_setattr(cid, name, value, size); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.collection_setattr(cid, name, value, size); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return 0; -} - -int Ebofs::collection_getattr(coll_t cid, const char *name, void *value, size_t size) -{ - ebofs_lock.Lock(); - dout(10) << "collection_setattr " << hex << cid << dec << " '" << name << "' maxlen " << size << dendl; - - Cnode *cn = get_cnode(cid); - if (!cn) { - ebofs_lock.Unlock(); - return -ENOENT; - } - - string n(name); - int r; - if (cn->attr.count(n) == 0) { - r = -1; - } else { - r = MIN( cn->attr[n].length(), size ); - memcpy(value, cn->attr[n].c_str(), r); - } - - put_cnode(cn); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::collection_getattrs(coll_t cid, map &aset) -{ - ebofs_lock.Lock(); - int r = _collection_getattrs(cid, aset); - ebofs_lock.Unlock(); - return r; -} - -int 
Ebofs::_collection_getattrs(coll_t cid, map &aset) -{ - dout(8) << "_collection_getattrs " << cid << dendl; - - Cnode *cn = get_cnode(cid); - if (!cn) return -ENOENT; - aset = cn->attr; - put_cnode(cn); - return 0; -} - -int Ebofs::collection_setattrs(coll_t cid, map &aset) -{ - ebofs_lock.Lock(); - int r = _collection_setattrs(cid, aset); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_collection_setattrs(coll_t cid, map &aset) -{ - dout(8) << "_collection_setattrs " << cid << dendl; - - Cnode *cn = get_cnode(cid); - if (!cn) return -ENOENT; - cn->attr = aset; - dirty_cnode(cn); - put_cnode(cn); - return 0; -} - - -int Ebofs::_collection_rmattr(coll_t cid, const char *name) -{ - dout(10) << "_collection_rmattr " << hex << cid << dec << " '" << name << "'" << dendl; - - Cnode *cn = get_cnode(cid); - if (!cn) return -ENOENT; - - string n(name); - cn->attr.erase(n); - - dirty_cnode(cn); - put_cnode(cn); - - return 0; -} - -int Ebofs::collection_rmattr(coll_t cid, const char *name, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _collection_rmattr(cid, name); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.collection_rmattr(cid, name); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return 0; -} - -int Ebofs::collection_listattr(coll_t cid, vector& attrs) -{ - ebofs_lock.Lock(); - dout(10) << "collection_listattr " << hex << cid << dec << dendl; - - Cnode *cn = get_cnode(cid); - if (!cn) { - ebofs_lock.Unlock(); - return -ENOENT; - } - - attrs.clear(); - for (map::iterator i = cn->attr.begin(); - i != cn->attr.end(); - i++) { - attrs.push_back(i->first); - } - - put_cnode(cn); - ebofs_lock.Unlock(); - return 0; -} - - - -void Ebofs::_export_freelist(bufferlist& bl) -{ - for (int b=0; b<=EBOFS_NUM_FREE_BUCKETS; b++) { - Table 
*tab; - if (b < EBOFS_NUM_FREE_BUCKETS) { - tab = free_tab[b]; - } else { - tab = limbo_tab; - } - - if (tab->get_num_keys() > 0) { - Table::Cursor cursor(tab); - assert(tab->find(0, cursor) >= 0); - while (1) { - assert(cursor.current().value > 0); - - Extent ex(cursor.current().key, cursor.current().value); - dout(10) << "_export_freelist " << ex << dendl; - bl.append((char*)&ex, sizeof(ex)); - if (cursor.move_right() <= 0) break; - } - } - } -} - -void Ebofs::_import_freelist(bufferlist& bl) -{ - // clear - for (int b=0; bclear(); - limbo_tab->clear(); - - // import! - int num = bl.length() / sizeof(Extent); - Extent *p = (Extent*)bl.c_str(); - for (int i=0; i *tab; - if (b < EBOFS_NUM_FREE_BUCKETS) { - tab = free_tab[b]; - dout(30) << "dump bucket " << b << " " << tab->get_num_keys() << dendl; - } else { - tab = limbo_tab; - dout(30) << "dump limbo " << tab->get_num_keys() << dendl;; - } - - if (tab->get_num_keys() > 0) { - Table::Cursor cursor(tab); - assert(tab->find(0, cursor) >= 0); - while (1) { - assert(cursor.current().value > 0); - - block_t l = cursor.current().value; - tfree += l; - int b = 0; - do { - l = l >> 1; - b++; - } while (l); - st.free_extent_dist[b]++; - st.free_extent_dist_sum[b] += cursor.current().value; - st.num_free_extent++; - - if (cursor.move_right() <= 0) break; - } - } - } - st.avg_free_extent = tfree / st.num_free_extent; -*/ - - // used extents is harder. 
:( - st.num_extent = 0; - st.avg_extent = 0; - st.extent_dist.clear(); - st.extent_dist_sum.clear(); - st.avg_extent_per_object = 0; - st.avg_extent_jump = 0; - - Table::Cursor cursor(object_tab); - object_tab->find(object_t(), cursor); - int nobj = 0; - int njump = 0; - while (object_tab->get_num_keys() > 0) { - Onode *on = get_onode(cursor.current().key); - assert(on); - - nobj++; - st.avg_extent_per_object += on->extent_map.size(); - - for (map::iterator p = on->extent_map.begin(); - p != on->extent_map.end(); - p++) { - block_t l = p->second.length; - - st.num_extent++; - st.avg_extent += l; - if (p->first > 0) { - njump++; - st.avg_extent_jump += l; - } - - int b = 0; - do { - l = l >> 1; - b++; - } while (l); - st.extent_dist[b]++; - st.extent_dist_sum[b] += p->second.length; - } - put_onode(on); - if (cursor.move_right() <= 0) break; - } - if (njump) st.avg_extent_jump /= njump; - if (nobj) st.avg_extent_per_object /= (float)nobj; - if (st.num_extent) st.avg_extent /= st.num_extent; - - ebofs_lock.Unlock(); -} diff --git a/branches/sage/ebofs2/ebofs/Ebofs.h b/branches/sage/ebofs2/ebofs/Ebofs.h deleted file mode 100644 index 13eebabd93aad..0000000000000 --- a/branches/sage/ebofs2/ebofs/Ebofs.h +++ /dev/null @@ -1,370 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "include/Context.h" -#include "include/buffer.h" -#include "include/hash.h" - -#include "types.h" -#include "Onode.h" -#include "Cnode.h" -#include "BlockDevice.h" -#include "nodes.h" -#include "Allocator.h" -#include "Table.h" -#include "Journal.h" - -#include "common/Mutex.h" -#include "common/Cond.h" - -#include "osd/ObjectStore.h" - -//typedef pair object_coll_t; -typedef pair coll_object_t; - - -class Ebofs : public ObjectStore { -protected: - Mutex ebofs_lock; // a beautiful global lock - - // ** debuggy ** - bool fake_writes; - - // ** super ** -public: - BlockDevice dev; -protected: - bool mounted, unmounting, dirty; - bool readonly; - version_t super_epoch; - bool commit_thread_started, mid_commit; - Cond commit_cond; // to wake up the commit thread - Cond sync_cond; - uint64_t super_fsid; - - map > commit_waiters; - - void prepare_super(version_t epoch, bufferptr& bp); - void write_super(version_t epoch, bufferptr& bp); - int commit_thread_entry(); - - class CommitThread : public Thread { - Ebofs *ebofs; - public: - CommitThread(Ebofs *e) : ebofs(e) {} - void *entry() { - ebofs->commit_thread_entry(); - return 0; - } - } commit_thread; - -public: - uint64_t get_fsid() { return super_fsid; } - epoch_t get_super_epoch() { return super_epoch; } -protected: - - - // ** journal ** - char *journalfn; - Journal *journal; - - // ** allocator ** - block_t free_blocks, limbo_blocks; - Allocator allocator; - friend class Allocator; - - block_t get_free_blocks() { return free_blocks; } - block_t get_limbo_blocks() { return limbo_blocks; } - block_t get_free_extents() { - int n = 0; - for (int i=0; iget_num_keys(); - return n; - } - block_t get_limbo_extents() { return limbo_tab->get_num_keys(); } - - - // ** tables and sets ** - // nodes - NodePool nodepool; // for all tables... 
- - // tables - Table *object_tab; - Table *free_tab[EBOFS_NUM_FREE_BUCKETS]; - Table *limbo_tab; - Table > *alloc_tab; - - // collections - Table *collection_tab; - Table *co_tab; - - void close_tables(); - void verify_tables(); - - - // ** onodes ** - hash_map onode_map; // onode cache - LRU onode_lru; - set dirty_onodes; - map > waitfor_onode; - - Onode* new_onode(object_t oid); // make new onode. ref++. - bool have_onode(object_t oid) { - return onode_map.count(oid); - } - Onode* get_onode(object_t oid); // get cached onode, or read from disk. ref++. - void remove_onode(Onode *on); - void put_onode(Onode* o); // put it back down. ref--. - void dirty_onode(Onode* o); - void encode_onode(Onode *on, bufferlist& bl, unsigned& off); - void write_onode(Onode *on); - - // ** cnodes ** - hash_map > cnode_map; - LRU cnode_lru; - set dirty_cnodes; - map > waitfor_cnode; - - Cnode* new_cnode(coll_t cid); - Cnode* get_cnode(coll_t cid); - void remove_cnode(Cnode *cn); - void put_cnode(Cnode *cn); - void dirty_cnode(Cnode *cn); - void encode_cnode(Cnode *cn, bufferlist& bl, unsigned& off); - void write_cnode(Cnode *cn); - - // ** onodes+cnodes = inodes ** - int inodes_flushing; - Cond inode_commit_cond; - - void flush_inode_finish(); - void commit_inodes_start(); - void commit_inodes_wait(); - friend class C_E_InodeFlush; - - void trim_inodes(int max = -1); - - // ** buffer cache ** - BufferCache bc; - pthread_t flushd_thread_id; - - version_t trigger_commit(); - void commit_bc_wait(version_t epoch); - void trim_bc(off_t max = -1); - - public: - void kick_idle(); - void sync(); - void sync(Context *onsafe); - void trim_buffer_cache(); - - class IdleKicker : public BlockDevice::kicker { - Ebofs *ebo; - public: - IdleKicker(Ebofs *t) : ebo(t) {} - void kick() { ebo->kick_idle(); } - } idle_kicker; - - - protected: - //void zero(Onode *on, size_t len, off_t off, off_t write_thru); - void alloc_write(Onode *on, - block_t start, block_t len, - interval_set& alloc, - block_t& 
old_bfirst, block_t& old_blast); - void apply_write(Onode *on, off_t off, size_t len, const bufferlist& bl); - bool attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, - Cond *will_wait_on, bool *will_wait_on_bool); - - // ** finisher ** - // async write notification to users - Mutex finisher_lock; - Cond finisher_cond; - bool finisher_stop; - list finisher_queue; - -public: - void queue_finisher(Context *c) { - finisher_lock.Lock(); - finisher_queue.push_back(c); - finisher_cond.Signal(); - finisher_lock.Unlock(); - } - void queue_finishers(list& ls) { - finisher_lock.Lock(); - finisher_queue.splice(finisher_queue.end(), ls); - finisher_cond.Signal(); - finisher_lock.Unlock(); - } -protected: - - void *finisher_thread_entry(); - class FinisherThread : public Thread { - Ebofs *ebofs; - public: - FinisherThread(Ebofs *e) : ebofs(e) {} - void* entry() { return (void*)ebofs->finisher_thread_entry(); } - } finisher_thread; - - - void alloc_more_node_space(); - - void do_csetattrs(map > > &cmods); - void do_setattrs(Onode *on, map > &setattrs); - - - public: - Ebofs(char *devfn, char *jfn=0) : - fake_writes(false), - dev(devfn), - mounted(false), unmounting(false), dirty(false), readonly(false), - super_epoch(0), commit_thread_started(false), mid_commit(false), - commit_thread(this), - journalfn(jfn), journal(0), - free_blocks(0), limbo_blocks(0), - allocator(this), - nodepool(ebofs_lock), - object_tab(0), limbo_tab(0), collection_tab(0), co_tab(0), - onode_lru(g_conf.ebofs_oc_size), - cnode_lru(g_conf.ebofs_cc_size), - inodes_flushing(0), - bc(dev, ebofs_lock), - idle_kicker(this), - finisher_stop(false), finisher_thread(this) { - for (int i=0; i& ls); - - // object attr - int setattr(object_t oid, const char *name, const void *value, size_t size, Context *onsafe=0); - int setattrs(object_t oid, map& attrset, Context *onsafe=0); - int getattr(object_t oid, const char *name, void *value, size_t size); - int getattrs(object_t oid, map &aset); - int 
rmattr(object_t oid, const char *name, Context *onsafe=0); - int listattr(object_t oid, vector& attrs); - - int get_object_collections(object_t oid, set& ls); - - // collections - int list_collections(list& ls); - bool collection_exists(coll_t c); - - int create_collection(coll_t c, Context *onsafe); - int destroy_collection(coll_t c, Context *onsafe); - int collection_add(coll_t c, object_t o, Context *onsafe); - int collection_remove(coll_t c, object_t o, Context *onsafe); - - int collection_list(coll_t c, list& o); - - int collection_setattr(coll_t cid, const char *name, const void *value, size_t size, Context *onsafe); - int collection_setattrs(coll_t cid, map &aset); - int collection_getattr(coll_t cid, const char *name, void *value, size_t size); - int collection_getattrs(coll_t cid, map &aset); - int collection_rmattr(coll_t cid, const char *name, Context *onsafe); - int collection_listattr(coll_t oid, vector& attrs); - - // maps - int map_lookup(object_t o, bufferlist& key, bufferlist& val); - int map_insert(object_t o, bufferlist& key, bufferlist& val); - int map_remove(object_t o, bufferlist& key); - int map_list(object_t o, list& keys); - int map_list(object_t o, map& vals); - int map_list(object_t o, - bufferlist& start, bufferlist& end, - map& vals); - - // crap - void _fake_writes(bool b) { fake_writes = b; } - void _get_frag_stat(FragmentationStat& st); - - void _import_freelist(bufferlist& bl); - void _export_freelist(bufferlist& bl); - - -private: - // private interface -- use if caller already holds lock - unsigned _apply_transaction(Transaction& t); - - int _read(object_t oid, off_t off, size_t len, bufferlist& bl); - int _is_cached(object_t oid, off_t off, size_t len); - int _stat(object_t oid, struct stat *st); - int _getattr(object_t oid, const char *name, void *value, size_t size); - int _getattrs(object_t oid, map &aset); - int _get_object_collections(object_t oid, set& ls); - - bool _write_will_block(); - int _write(object_t oid, off_t off, 
size_t len, const bufferlist& bl); - void _trim_from_cache(object_t oid, off_t off, size_t len); - int _truncate(object_t oid, off_t size); - int _truncate_front(object_t oid, off_t size); - int _remove(object_t oid); - int _clone(object_t from, object_t to); - int _setattr(object_t oid, const char *name, const void *value, size_t size); - int _setattrs(object_t oid, map& attrset); - int _rmattr(object_t oid, const char *name); - bool _collection_exists(coll_t c); - int _create_collection(coll_t c); - int _destroy_collection(coll_t c); - int _collection_add(coll_t c, object_t o); - int _collection_remove(coll_t c, object_t o); - int _collection_getattrs(coll_t oid, map &aset); - int _collection_setattr(coll_t oid, const char *name, const void *value, size_t size); - int _collection_setattrs(coll_t oid, map &aset); - int _collection_rmattr(coll_t cid, const char *name); - - -}; diff --git a/branches/sage/ebofs2/ebofs/FileJournal.cc b/branches/sage/ebofs2/ebofs/FileJournal.cc deleted file mode 100644 index 35a1e6f4127b6..0000000000000 --- a/branches/sage/ebofs2/ebofs/FileJournal.cc +++ /dev/null @@ -1,456 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "FileJournal.h" -#include "Ebofs.h" - -#include -#include -#include -#include - - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug_ebofs) *_dout << dbeginl << g_clock.now() << " ebofs(" << ebofs->dev.get_device_name() << ").journal " -#define derr(x) if (x <= g_conf.debug_ebofs) *_derr << dbeginl << g_clock.now() << " ebofs(" << ebofs->dev.get_device_name() << ").journal " - - -int FileJournal::create() -{ - dout(2) << "create " << fn << dendl; - - // open/create - fd = ::open(fn.c_str(), O_RDWR|O_SYNC); - if (fd < 0) { - dout(2) << "create failed " << errno << " " << strerror(errno) << dendl; - return -errno; - } - assert(fd > 0); - - //::ftruncate(fd, 0); - //::fchmod(fd, 0644); - - // get size - struct stat st; - ::fstat(fd, &st); - dout(2) << "create " << fn << " " << st.st_size << " bytes" << dendl; - - // write empty header - memset(&header, 0, sizeof(header)); - header.clear(); - header.fsid = ebofs->get_fsid(); - header.max_size = st.st_size; - write_header(); - - // writeable. - read_pos = 0; - write_pos = queue_pos = sizeof(header); - - ::close(fd); - - return 0; -} - -int FileJournal::open() -{ - //dout(1) << "open " << fn << dendl; - - // open and file - assert(fd == 0); - fd = ::open(fn.c_str(), O_RDWR|O_SYNC); - if (fd < 0) { - dout(2) << "open failed " << errno << " " << strerror(errno) << dendl; - return -errno; - } - assert(fd > 0); - - // assume writeable, unless... - read_pos = 0; - write_pos = queue_pos = sizeof(header); - - // read header? - read_header(); - if (header.fsid != ebofs->get_fsid()) { - dout(2) << "open journal fsid doesn't match, invalid (someone else's?) 
journal" << dendl; - } - else if (header.num > 0) { - // valid header, pick an offset - for (int i=0; iget_super_epoch()) { - dout(2) << "using read_pos header pointer " - << header.epoch[i] << " at " << header.offset[i] - << dendl; - read_pos = header.offset[i]; - write_pos = queue_pos = 0; - break; - } - else if (header.epoch[i] < ebofs->get_super_epoch()) { - dout(2) << "super_epoch is " << ebofs->get_super_epoch() - << ", skipping old " << header.epoch[i] << " at " << header.offset[i] - << dendl; - } - else if (header.epoch[i] > ebofs->get_super_epoch()) { - dout(2) << "super_epoch is " << ebofs->get_super_epoch() - << ", but wtf, journal is later " << header.epoch[i] << " at " << header.offset[i] - << dendl; - break; - } - } - } - - start_writer(); - - return 0; -} - -void FileJournal::close() -{ - dout(1) << "close " << fn << dendl; - - // stop writer thread - stop_writer(); - - // close - assert(writeq.empty()); - assert(commitq.empty()); - assert(fd > 0); - ::close(fd); - fd = 0; -} - -void FileJournal::start_writer() -{ - write_stop = false; - write_thread.create(); -} - -void FileJournal::stop_writer() -{ - write_lock.Lock(); - { - write_stop = true; - write_cond.Signal(); - } - write_lock.Unlock(); - write_thread.join(); -} - - -void FileJournal::print_header() -{ - for (int i=0; i::const_iterator it = bl.buffers().begin(); - it != bl.buffers().end(); - it++) { - if ((*it).length() == 0) continue; // blank buffer. 
- ::write(fd, (char*)(*it).c_str(), (*it).length() ); - } - - ::write(fd, &h, sizeof(h)); - - // move position pointer - write_pos += 2*sizeof(entry_header_t) + bl.length(); - - if (oncommit) { - if (1) { - // queue callback - ebofs->queue_finisher(oncommit); - } else { - // callback now - oncommit->finish(0); - delete oncommit; - } - } - } - } - - write_lock.Unlock(); - dout(10) << "write_thread_entry finish" << dendl; -} - -bool FileJournal::submit_entry(bufferlist& e, Context *oncommit) -{ - assert(queue_pos != 0); // bad create(), or journal didn't replay to completion. - - // ** lock ** - Mutex::Locker locker(write_lock); - - // wrap? full? - off_t size = 2*sizeof(entry_header_t) + e.length(); - - if (full) return false; // already marked full. - - if (header.wrap) { - // we're wrapped. don't overwrite ourselves. - if (queue_pos + size >= header.offset[0]) { - dout(10) << "submit_entry JOURNAL FULL (and wrapped), " << queue_pos << "+" << size - << " >= " << header.offset[0] - << dendl; - full = true; - print_header(); - return false; - } - } else { - // we haven't wrapped. - if (queue_pos + size >= header.max_size) { - // is there room if we wrap? - if ((off_t)sizeof(header_t) + size < header.offset[0]) { - // yes! - dout(10) << "submit_entry wrapped from " << queue_pos << " to " << sizeof(header_t) << dendl; - header.wrap = queue_pos; - queue_pos = sizeof(header_t); - header.push(ebofs->get_super_epoch(), queue_pos); - } else { - // no room. 
- dout(10) << "submit_entry JOURNAL FULL (and can't wrap), " << queue_pos << "+" << size - << " >= " << header.max_size - << dendl; - full = true; - return false; - } - } - } - - dout(10) << "submit_entry " << queue_pos << " : " << e.length() - << " epoch " << ebofs->get_super_epoch() - << " " << oncommit << dendl; - - // dump on queue - writeq.push_back(pair(ebofs->get_super_epoch(), e)); - commitq.push_back(oncommit); - - queue_pos += size; - - // kick writer thread - write_cond.Signal(); - - return true; -} - - -void FileJournal::commit_epoch_start() -{ - dout(10) << "commit_epoch_start on " << ebofs->get_super_epoch()-1 - << " -- new epoch " << ebofs->get_super_epoch() - << dendl; - - Mutex::Locker locker(write_lock); - - // was full -> empty -> now usable? - if (full) { - if (header.num != 0) { - dout(1) << " journal FULL, ignoring this epoch" << dendl; - return; - } - - dout(1) << " clearing FULL flag, journal now usable" << dendl; - full = false; - } - - // note epoch boundary - header.push(ebofs->get_super_epoch(), queue_pos); // note: these entries may not yet be written. - //write_header(); // no need to write it now, though... -} - -void FileJournal::commit_epoch_finish() -{ - dout(10) << "commit_epoch_finish committed " << ebofs->get_super_epoch()-1 << dendl; - - write_lock.Lock(); - { - if (full) { - // full journal damage control. - dout(15) << " journal was FULL, contents now committed, clearing header. journal still not usable until next epoch." 
<< dendl; - header.clear(); - write_pos = queue_pos = sizeof(header_t); - } else { - // update header -- trim/discard old (committed) epochs - while (header.epoch[0] < ebofs->get_super_epoch()) - header.pop(); - } - write_header(); - - // discard any unwritten items in previous epoch, and do callbacks - epoch_t epoch = ebofs->get_super_epoch(); - list callbacks; - while (!writeq.empty() && writeq.front().first < epoch) { - dout(15) << " dropping unwritten and committed " - << write_pos << " : " << writeq.front().second.length() - << " epoch " << writeq.front().first - << dendl; - // finisher? - Context *oncommit = commitq.front(); - if (oncommit) callbacks.push_back(oncommit); - - write_pos += 2*sizeof(entry_header_t) + writeq.front().second.length(); - - // discard. - writeq.pop_front(); - commitq.pop_front(); - } - - // queue the finishers - ebofs->queue_finishers(callbacks); - } - write_lock.Unlock(); - -} - - -void FileJournal::make_writeable() -{ - if (read_pos) - write_pos = queue_pos = read_pos; - else - write_pos = queue_pos = sizeof(header_t); - read_pos = 0; -} - - -bool FileJournal::read_entry(bufferlist& bl, epoch_t& epoch) -{ - if (!read_pos) { - dout(2) << "read_entry -- not readable" << dendl; - return false; - } - - if (read_pos == header.wrap) { - // find wrap point - for (int i=1; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_FILEJOURNAL_H -#define __EBOFS_FILEJOURNAL_H - - -#include "Journal.h" -#include "common/Cond.h" -#include "common/Mutex.h" -#include "common/Thread.h" - -class FileJournal : public Journal { -public: - /** log header - * we allow 3 pointers: - * top/initial, - * one for an epoch boundary, - * and one for a wrap in the ring buffer/journal file. 
- * the epoch boundary one is useful only for speedier recovery in certain cases - * (i.e. when ebofs committed, but the journal didn't rollover ... very small window!) - */ - struct header_t { - uint64_t fsid; - int num; - off_t wrap; - off_t max_size; - epoch_t epoch[3]; - off_t offset[3]; - - header_t() : fsid(0), num(0), wrap(0), max_size(0) {} - - void clear() { - num = 0; - wrap = 0; - } - void pop() { - if (num >= 2 && offset[0] > offset[1]) - wrap = 0; // we're eliminating a wrap - num--; - for (int i=0; i > writeq; // currently journaling - list commitq; // currently journaling - - // write thread - Mutex write_lock; - Cond write_cond; - bool write_stop; - - void print_header(); - void read_header(); - void write_header(); - void start_writer(); - void stop_writer(); - void write_thread_entry(); - - class Writer : public Thread { - FileJournal *journal; - public: - Writer(FileJournal *fj) : journal(fj) {} - void *entry() { - journal->write_thread_entry(); - return 0; - } - } write_thread; - - public: - FileJournal(Ebofs *e, char *f) : - Journal(e), fn(f), - full(false), - write_pos(0), queue_pos(0), read_pos(0), - fd(0), - write_stop(false), write_thread(this) { } - ~FileJournal() {} - - int create(); - int open(); - void close(); - - void make_writeable(); - - // writes - bool submit_entry(bufferlist& e, Context *oncommit); // submit an item - void commit_epoch_start(); // mark epoch boundary - void commit_epoch_finish(); // mark prior epoch as committed (we can expire) - - bool read_entry(bufferlist& bl, epoch_t& e); - - // reads -}; - -#endif diff --git a/branches/sage/ebofs2/ebofs/Journal.h b/branches/sage/ebofs2/ebofs/Journal.h deleted file mode 100644 index 9bab0b7f3c109..0000000000000 --- a/branches/sage/ebofs2/ebofs/Journal.h +++ /dev/null @@ -1,47 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - 
* This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_JOURNAL_H -#define __EBOFS_JOURNAL_H - -class Ebofs; - -#include "include/buffer.h" -#include "include/Context.h" - -class Journal { -protected: - Ebofs *ebofs; - -public: - Journal(Ebofs *e) : ebofs(e) { } - virtual ~Journal() { } - - virtual int create() = 0; - virtual int open() = 0; - virtual void close() = 0; - - // writes - virtual void make_writeable() = 0; - virtual bool submit_entry(bufferlist& e, Context *oncommit) = 0;// submit an item - virtual void commit_epoch_start() = 0; // mark epoch boundary - virtual void commit_epoch_finish() = 0; // mark prior epoch as committed (we can expire) - virtual bool read_entry(bufferlist& bl, epoch_t &e) = 0; - - // reads/recovery - -}; - -#endif diff --git a/branches/sage/ebofs2/ebofs/Onode.h b/branches/sage/ebofs2/ebofs/Onode.h deleted file mode 100644 index 1d79d317dd96a..0000000000000 --- a/branches/sage/ebofs2/ebofs/Onode.h +++ /dev/null @@ -1,408 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __EBOFS_ONODE_H -#define __EBOFS_ONODE_H - -#include "include/lru.h" - -#include "types.h" -#include "BufferCache.h" - -#include "include/interval_set.h" - - -/* - * object node (like an inode) - * - * holds object metadata, including - * size - * allocation (extent list) - * attributes - * - */ - -class Onode : public LRUObject { -private: - int ref; - -public: - object_t object_id; - version_t version; // incremented on each modify. - - // data - bool readonly; - Extent onode_loc; - off_t object_size; - unsigned object_blocks; - - // onode - set collections; - map attr; - //vector extents; - map extent_map; - - interval_set uncommitted; - - ObjectCache *oc; - - bool dirty; - bool dangling; // not in onode_map - bool deleted; // deleted - - list commit_waiters; - - public: - Onode(object_t oid) : ref(0), object_id(oid), version(0), - readonly(false), - object_size(0), object_blocks(0), - oc(0), - dirty(false), dangling(false), deleted(false) { - onode_loc.length = 0; - } - ~Onode() { - if (oc) delete oc; - } - - block_t get_onode_id() { return onode_loc.start; } - int get_onode_len() { return onode_loc.length; } - - int get_ref_count() { return ref; } - void get() { - if (ref == 0) lru_pin(); - ref++; - //cout << "ebofs.onode.get " << hex << object_id << dec << " " << ref << std::endl; - } - void put() { - ref--; - if (ref == 0) lru_unpin(); - //cout << "ebofs.onode.put " << hex << object_id << dec << " " << ref << std::endl; - } - - void mark_dirty() { - if (!dirty) { - dirty = true; - get(); - } - } - void mark_clean() { - if (dirty) { - dirty = false; - put(); - } - } - bool is_dirty() { return dirty; } - bool is_deleted() { return deleted; } - bool is_dangling() { return dangling; } - - - bool have_oc() { - return oc != 0; - } - ObjectCache *get_oc(BufferCache *bc) { - if (!oc) { - oc = new ObjectCache(object_id, this, bc); - oc->get(); - get(); - } - return oc; - } - void close_oc() { - if (oc) { - //cout << "close_oc on " << object_id 
<< std::endl; - assert(oc->is_empty()); - if (oc->put() == 0){ - //cout << "************************* hosing oc" << std::endl; - delete oc; - } - oc = 0; - put(); - } - } - - - // allocation - void verify_extents() { - if (0) { // do crazy stupid sanity checking - block_t count = 0; - interval_set is; - - set s; - cout << "verifying" << std::endl; - - for (map::iterator p = extent_map.begin(); - p != extent_map.end(); - p++) { - cout << " " << p->first << ": " << p->second << std::endl; - assert(count == p->first); - count += p->second.length; - for (unsigned j=0;jsecond.length;j++) { - assert(s.count(p->second.start+j) == 0); - s.insert(p->second.start+j); - } - } - - assert(s.size() == count); - assert(count == object_blocks); - } - } - void set_extent(block_t offset, Extent ex) { - //cout << "set_extent " << offset << " -> " << ex << " ... " << object_blocks << std::endl; - assert(offset <= object_blocks); - verify_extents(); - - // at the end? - if (offset == object_blocks) { - //cout << " appending " << ex << std::endl; - if (!extent_map.empty() && extent_map.rbegin()->second.end() == ex.start) { - //cout << "appending " << ex << " to " << extent_map.rbegin()->second << std::endl; - extent_map.rbegin()->second.length += ex.length; - } else - extent_map[object_blocks] = ex; - object_blocks += ex.length; - return; - } - - // removing any extent bits we overwrite - if (!extent_map.empty()) { - // preceeding extent? 
- map::iterator p = extent_map.lower_bound(offset); - if (p != extent_map.begin()) { - p--; - if (p->first + p->second.length > offset) { - //cout << " preceeding was " << p->second << std::endl; - if (p->first + p->second.length > offset+ex.length) { - // cutting chunk out of middle, add last bit - Extent &n = extent_map[offset+ex.length] = p->second; - n.start += offset+ex.length - p->first; - n.length -= offset+ex.length - p->first; - //cout << " tail frag is " << n << std::endl; - } - p->second.length = offset - p->first; // cut tail off preceeding extent - //cout << " preceeding now " << p->second << std::endl; - } - p++; - } - - // overlapping extents - while (p != extent_map.end() && - p->first < offset + ex.length) { - map::iterator next = p; - next++; - - // completely subsumed? - if (p->first + p->second.length <= offset+ex.length) { - //cout << " erasing " << p->second << std::endl; - extent_map.erase(p); - p = next; - continue; - } - - // spans new extent, cut off head - Extent &n = extent_map[ offset+ex.length ] = p->second; - //cout << " cut head off " << p->second; - n.start += offset+ex.length - p->first; - n.length -= offset+ex.length - p->first; - extent_map.erase(p); - //cout << ", now " << n << std::endl; - break; - } - } - - extent_map[ offset ] = ex; - - // extend object? - if (offset + ex.length > object_blocks) - object_blocks = offset+ex.length; - - verify_extents(); - } - - - /* map_extents(start, len, ls) - * map teh given page range into extents on disk. - */ - int map_extents(block_t start, block_t len, vector& ls) { - //cout << "map_extents " << start << " " << len << std::endl; - verify_extents(); - - //assert(start+len <= object_blocks); - - map::iterator p; - - // hack hack speed up common cases! - if (start == 0) { - p = extent_map.begin(); - } else if (start+len == object_blocks && len == 1 && !extent_map.empty()) { - // append hack. 
- p = extent_map.end(); - p--; - if (p->first < start) p++; - //while (p->first >= start) p--; - //p++; - } else { - // normal - p = extent_map.lower_bound(start); - } - - if (p != extent_map.begin() && - (p == extent_map.end() || p->first > start && p->first)) { - p--; - if (p->second.length > start - p->first) { - Extent ex; - ex.start = p->second.start + (start - p->first); - ex.length = MIN(len, p->second.length - (start - p->first)); - ls.push_back(ex); - - //cout << " got (tail of?) " << p->second << " : " << ex << std::endl; - - start += ex.length; - len -= ex.length; - } - p++; - } - - while (len > 0 && - p != extent_map.end()) { - assert(p->first == start); - Extent ex = p->second; - ex.length = MIN(len, ex.length); - ls.push_back(ex); - //cout << " got (head of?) " << p->second << " : " << ex << std::endl; - start += ex.length; - len -= ex.length; - p++; - } - - return 0; - } - - int truncate_extents(block_t len, vector& extra) { - verify_extents(); - - map::iterator p = extent_map.lower_bound(len); - if (p != extent_map.begin() && - (p == extent_map.end() || p->first > len && p->first)) { - p--; - if (p->second.length > len - p->first) { - Extent ex; - ex.start = p->second.start + (len - p->first); - ex.length = p->second.length - (len - p->first); - extra.push_back(ex); - - p->second.length = len - p->first; - assert(p->second.length > 0); - - //cout << " got (tail of?) " << p->second << " : " << ex << std::endl; - } - p++; - } - - while (p != extent_map.end()) { - assert(p->first >= len); - extra.push_back(p->second); - map::iterator n = p; - n++; - extent_map.erase(p); - p = n; - } - - object_blocks = len; - verify_extents(); - return 0; - } - - int truncate_front_extents(block_t len, vector& extra) { - verify_extents(); - - while (len > 0) { - Extent& ex = extent_map.begin()->second; // look, this is a reference! 
- if (ex.length > len) { - // partial first extent - Extent frontbit( ex.start, len ); - extra.push_back(frontbit); - ex.length -= len; - ex.start += len; - break; - } - - // pull off entire first extent. - assert(ex.length <= len); - len -= ex.length; - extra.push_back(ex); - extent_map.erase(extent_map.begin()); - } - - object_blocks -= len; - verify_extents(); - return 0; - } - - - - /* map_alloc_regions(start, len, map) - * map range into regions that need to be (re)allocated on disk - * because they overlap "safe" (or unallocated) parts of the object - */ - /* - void map_alloc_regions(block_t start, block_t len, - interval_set& alloc) { - interval_set already_uncom; - - alloc.insert(start, len); // start with whole range - already_uncom.intersection_of(alloc, uncommitted); - alloc.subtract(already_uncom); // take out the bits that aren't yet committed - } - */ - - - - // pack/unpack - int get_collection_bytes() { - return sizeof(coll_t) * collections.size(); - } - int get_attr_bytes() { - int s = 0; - for (map::iterator i = attr.begin(); - i != attr.end(); - i++) { - s += i->first.length() + 1; - s += i->second.length() + sizeof(int); - } - return s; - } - int get_extent_bytes() { - return sizeof(Extent) * extent_map.size(); - } - -}; - - -inline ostream& operator<<(ostream& out, Onode& on) -{ - out << "onode(" << hex << on.object_id << dec << " len=" << on.object_size; - out << " ref=" << on.get_ref_count(); - if (on.is_dirty()) out << " dirty"; - if (on.is_dangling()) out << " dangling"; - if (on.is_deleted()) out << " deleted"; - out << " uncom=" << on.uncommitted; - // out << " " << &on; - out << ")"; - return out; -} - - - -#endif diff --git a/branches/sage/ebofs2/ebofs/Table.h b/branches/sage/ebofs2/ebofs/Table.h deleted file mode 100644 index 041a55afa0c68..0000000000000 --- a/branches/sage/ebofs2/ebofs/Table.h +++ /dev/null @@ -1,928 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph 
- scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_TABLE_H -#define __EBOFS_TABLE_H - -#include "types.h" -#include "nodes.h" - -/** table **/ - -#define dbtout if (25 <= g_conf.debug_ebofs) cout << "ebofs.table(" << this << ")." - - -template -class Table { - private: - NodePool &pool; - - nodeid_t root; - int nkeys; - int depth; - - public: - Table(NodePool &p, - struct ebofs_table& bts) : - pool(p), - root(bts.root), nkeys(bts.num_keys), depth(bts.depth) { - dbtout << "cons" << std::endl; - } - - nodeid_t get_root() { return root; } - int get_num_keys() { return nkeys; } - int get_depth() { return depth; } - - - /* - */ - class _IndexItem { // i just need a struct size for below - K k; - nodeid_t n; - }; - class IndexItem { - public: - K key; - nodeid_t node; - static const int MAX = Node::ITEM_LEN / (sizeof(_IndexItem)); - static const int MIN = MAX/2; - }; - class _LeafItem { // i just need a struct size for below - K k; - V v; - }; - class LeafItem { - public: - K key; - V value; - static const int MAX = Node::ITEM_LEN / (sizeof(_LeafItem)); - static const int MIN = MAX/2; - }; - - class Nodeptr { - public: - Node *node; - - Nodeptr() : node(0) {} - Nodeptr(Node *n) : node(n) {} - Nodeptr(NodePool& pool, nodeid_t nid) { - open(pool, nid); - } - Nodeptr& operator=(Node *n) { - node = n; - return *this; - } - - void open(NodePool& pool, nodeid_t nid) { - node = pool.get_node(nid); - if (is_index() && node->children.empty()) init_index(pool); - } - - LeafItem& leaf_item(int i) { return (( LeafItem*)(node->item_ptr()))[i]; } - IndexItem& index_item(int i) { return ((IndexItem*)(node->item_ptr()))[i]; } - K key(int i) { - if (node->is_index()) - return index_item(i).key; - else - return 
leaf_item(i).key; - } - - bool is_leaf() { return node->is_leaf(); } - bool is_index() { return node->is_index(); } - void set_type(int t) { node->set_type(t); } - - int max_items() const { - if (node->is_leaf()) - return LeafItem::MAX; - else - return IndexItem::MAX; - } - int min_items() const { return max_items() / 2; } - - nodeid_t get_id() { return node->get_id(); } - - int size() { return node->size(); } - void set_size(int s) { node->set_size(s); } - - void init_index(NodePool& nodepool) { - /* - node->children = vector(max_items()); - for (int i=0; ichildren[i] = nodepool.get_node(index_item(i).node); - else - node->children[i] = 0; - */ - } - - - void remove_at_pos(int p) { - if (node->is_index()) { - for (int i=p; ichildren[i] = node->children[i+1]; - } - } else { - for (int i=p; iis_index() ? "index":"leaf") << std::endl; - } - void insert_at_leaf_pos(int p, K key, V value) { - assert(is_leaf()); - for (int i=size(); i>p; i--) - leaf_item(i) = leaf_item(i-1); - leaf_item(p).key = key; - leaf_item(p).value = value; - set_size(size() + 1); - } - void insert_at_index_pos(int p, K key, nodeid_t nid) { - assert(is_index()); - for (int i=size(); i>p; i--) { - index_item(i) = index_item(i-1); - //node->children[i] = node->children[i-1]; - } - index_item(p).key = key; - index_item(p).node = nid; - set_size(size() + 1); - } - - void append_item(LeafItem& i) { - leaf_item(size()) = i; - set_size(size() + 1); - } - void append_item(IndexItem& i) { - index_item(size()) = i; - set_size(size() + 1); - } - - void split(Nodeptr& right) { - if (node->is_index()) { - for (int i=min_items(); iis_index()) - for (int i=0; i open; // open nodes - vector pos; // position within the node - //Nodeptr open[20]; - //int pos[20]; - int level; - - Cursor(Table *t) : table(t), open(t->depth), pos(t->depth), level(0) {} - - public: - - const LeafItem& current() { - assert(open[level].is_leaf()); - return open[level].leaf_item(pos[level]); - } - V& dirty_current_value() { - 
assert(open[level].is_leaf()); - dirty(); - return open[level].leaf_item(pos[level]).value; - } - - // ** read-only bits ** - int move_left() { - if (table->depth == 0) return OOB; - - // work up around branch - int l; - for (l = level; l >= 0; l--) - if (pos[l] > 0) break; - if (l < 0) - return OOB; // we are the first item in the btree - - // move left one - pos[l]--; - - // work back down right side - for (; lpool, open[l].index_item(pos[l]).node); - pos[l+1] = open[l+1].size() - 1; - } - return 1; - } - int move_right() { - if (table->depth == 0) return OOB; - - // work up branch - int l; - for (l=level; l>=0; l--) - if (pos[l] < open[l].size() - 1) break; - if (l < 0) { - /* we are at last item in btree. */ - if (pos[level] < open[level].size()) { - pos[level]++; /* move into add position! */ - return 0; - } - return -1; - } - - /* move right one */ - assert( pos[l] < open[l].size() ); - pos[l]++; - - /* work back down */ - for (; lpool, open[l].index_item(pos[l]).node ); - pos[l+1] = 0; // furthest left - } - return 1; - } - - // ** modifications ** - void dirty() { - for (int l=level; l>=0; l--) { - if (open[l].node->is_dirty()) break; // already dirty! (and thus parents are too) - - table->pool.dirty_node(open[l].node); - if (l > 0) - open[l-1].index_item( pos[l-1] ).node = open[l].get_id(); - else - table->root = open[0].get_id(); - } - } - private: - void repair_parents() { - // did i make a change at the start of a node? - if (pos[level] == 0) { - K key = open[level].key(0); // new key parents should have - for (int j=level-1; j>=0; j--) { - if (open[j].index_item(pos[j]).key == key) - break; /* it's the same key, we can stop fixing */ - open[j].index_item(pos[j]).key = key; - if (pos[j] > 0) break; /* last in position 0.. */ - } - } - } - - public: - void remove() { - dirty(); - - // remove from node - open[level].remove_at_pos( pos[level] ); - repair_parents(); - - // was it a key? 
- if (level == table->depth-1) - table->nkeys--; - } - - void insert(K key, V value) { - dirty(); - - // insert - open[level].insert_at_leaf_pos(pos[level], key, value); - repair_parents(); - - // was it a key? - if (level == table->depth-1) - table->nkeys++; - } - - int rotate_left() { - if (level == 0) return -1; // i am root - if (pos[level-1] == 0) return -1; // nothing to left - - Nodeptr here = open[level]; - Nodeptr parent = open[level-1]; - Nodeptr left(table->pool, parent.index_item(pos[level-1] - 1).node ); - if (left.size() == left.max_items()) return -1; // it's full - - // make both dirty - dirty(); - if (!left.node->is_dirty()) { - table->pool.dirty_node(left.node); - parent.index_item(pos[level-1]-1).node = left.get_id(); - } - - dbtout << "rotating item " << here.key(0) << " left from " << here.get_id() << " to " << left.get_id() << std::endl; - - /* add */ - if (here.node->is_leaf()) - left.append_item(here.leaf_item(0)); - else - left.append_item(here.index_item(0)); - - /* remove */ - here.remove_at_pos(0); - - /* fix parent index for me */ - parent.index_item( pos[level-1] ).key = here.key(0); - // we never have to update past immediate parent, since we're not at pos 0 - - /* adjust cursor */ - if (pos[level] > 0) - pos[level]--; - //else - //assert(1); /* if we were positioned here, we're equal */ - /* if it was 0, then the shifted item == our key, and we can stay here safely. 
*/ - return 0; - } - int rotate_right() { - if (level == 0) return -1; // i am root - if (pos[level-1] + 1 >= open[level-1].size()) return -1; // nothing to right - - Nodeptr here = open[level]; - Nodeptr parent = open[level-1]; - Nodeptr right(table->pool, parent.index_item( pos[level-1] + 1 ).node ); - if (right.size() == right.max_items()) return -1; // it's full - - // make both dirty - dirty(); - if (!right.node->is_dirty()) { - table->pool.dirty_node(right.node); - parent.index_item( pos[level-1]+1 ).node = right.get_id(); - } - - if (pos[level] == here.size()) { - /* let's just move the cursor over! */ - //if (sizeof(K) == 8) - dbtout << "shifting cursor right from " << here.get_id() << " to less-full node " << right.get_id() << std::endl; - open[level] = right; - pos[level] = 0; - pos[level-1]++; - return 0; - } - - //if (sizeof(K) == 8) - dbtout << "rotating item " << hex << here.key(here.size()-1) << dec << " right from " - << here.get_id() << " to " << right.get_id() << std::endl; - - /* add */ - if (here.is_index()) - right.insert_at_index_pos(0, - here.index_item( here.size()-1 ).key, - here.index_item( here.size()-1 ).node); - else - right.insert_at_leaf_pos(0, - here.leaf_item( here.size()-1 ).key, - here.leaf_item( here.size()-1 ).value); - - /* remove */ - here.set_size(here.size() - 1); - - /* fix parent index for right */ - parent.index_item( pos[level-1] + 1 ).key = right.key(0); - - return 0; - } - }; - - - public: - bool almost_full() { - if (2*(depth+1) > pool.get_num_free()) // worst case, plus some. - return true; - return false; - } - - int find(K key, Cursor& cursor) { - dbtout << "find " << key << std::endl; - verify("find"); - - if (depth == 0) - return Cursor::OOB; - - // init - cursor.level = 0; - - // start at root - Nodeptr curnode(pool, root); - cursor.open[0] = curnode; - - if (curnode.size() == 0) return -1; // empty! 
- - // find leaf - for (cursor.level = 0; cursor.level < depth-1; cursor.level++) { - /* if key=5, we want 2 3 [4] 6 7, or 3 4 [5] 5 6 (err to the left) */ - int left = 0; /* i >= left */ - int right = curnode.size()-1; /* i < right */ - while (left < right) { - int i = left + (right - left) / 2; - if (curnode.index_item(i).key < key) { - left = i + 1; - } else if (i && curnode.index_item(i-1).key >= key) { - right = i; - } else { - left = right = i; - break; - } - } - int i = left; - if (i && curnode.index_item(i).key > key) i--; - -#ifdef EBOFS_DEBUG_BTREE - int j; - for (j=0; j key) break; - } - if (i != j) { - dbtout << "btree binary search failed" << std::endl; - i = j; - } -#endif - - cursor.pos[cursor.level] = i; - - /* get child node */ - curnode.open(pool, cursor.open[cursor.level].index_item(i).node ); - cursor.open[cursor.level+1] = curnode; - } - - /* search leaf */ - /* if key=5, we want 2 3 4 [6] 7, or 3 4 [5] 5 6 (err to the right) */ - int left = 0; /* i >= left */ - int right = curnode.size(); /* i < right */ - while (left < right) { - int i = left + (right - left) / 2; - if (curnode.leaf_item(i).key < key) { - left = i + 1; - } else if (i && curnode.leaf_item(i-1).key >= key) { - right = i; - } else { - left = right = i; - break; - } - } - int i = left; - -#ifdef EBOFS_DEBUG_BTREE - int j; - for (j=0; j= key) break; - } - if (i != j) { - dbtout << "btree binary search failed" << std::endl; - i = j; - } -#endif - - cursor.pos[cursor.level] = i; /* first key in this node, or key insertion point */ - - if (curnode.size() >= i+1) { - if (curnode.leaf_item(i).key == key) { - return Cursor::MATCH; /* it's the actual key */ - } else { - return Cursor::INSERT; /* it's an insertion point */ - } - } - return Cursor::OOB; /* it's the end of the btree (also a valid insertion point) */ - } - - int lookup(K key) { - dbtout << "lookup" << std::endl; - Cursor cursor(this); - if (find(key, cursor) == Cursor::MATCH) - return 0; - return -1; - } - - int lookup(K 
key, V& value) { - dbtout << "lookup" << std::endl; - Cursor cursor(this); - if (find(key, cursor) == Cursor::MATCH) { - value = cursor.current().value; - return 0; - } - return -1; - } - - int insert(K key, V value) { - verify("pre-insert"); - dbtout << "insert " << key << " -> " << value << std::endl; - if (almost_full()) return -1; - - // empty? - if (nkeys == 0) { - if (root == -1) { - // create a root node (leaf!) - assert(depth == 0); - Nodeptr newroot( pool.new_node(Node::TYPE_LEAF) ); - root = newroot.get_id(); - depth++; - } - assert(depth == 1); - assert(root >= 0); - } - - // start at/near key - Cursor cursor(this); - find(key, cursor); - - // insert loop - nodeid_t nodevalue = 0; - while (1) { - - /* room in this node? */ - if (cursor.open[cursor.level].size() < cursor.open[cursor.level].max_items()) { - if (cursor.open[cursor.level].is_leaf()) - cursor.insert( key, value ); // will dirty, etc. - else { - // indices are already dirty - cursor.open[cursor.level].insert_at_index_pos(cursor.pos[cursor.level], key, nodevalue); - } - verify("insert 1"); - return 0; - } - - /* this node is full. */ - assert( cursor.open[cursor.level].size() == cursor.open[cursor.level].max_items() ); - - /* can we rotate? */ - if (false) // NO! there's a bug in here somewhere, don't to it. - if (cursor.level > 0) { - if ((cursor.pos[cursor.level-1] > 0 - && cursor.rotate_left() >= 0) || - (cursor.pos[cursor.level-1] + 1 < cursor.open[cursor.level-1].size() - && cursor.rotate_right() >= 0)) { - - if (cursor.open[cursor.level].is_leaf()) - cursor.insert( key, value ); // will dirty, etc. 
- else { - // indices are already dirty - cursor.open[cursor.level].insert_at_index_pos(cursor.pos[cursor.level], key, nodevalue); - } - verify("insert 2"); - return 0; - } - } - - /** split node **/ - - if (cursor.level == depth-1) { - dbtout << "splitting leaf " << cursor.open[cursor.level].get_id() << std::endl; - } else { - dbtout << "splitting index " << cursor.open[cursor.level].get_id() << std::endl; - } - - cursor.dirty(); - - // split - Nodeptr leftnode = cursor.open[cursor.level]; - Nodeptr newnode( pool.new_node(leftnode.node->get_type()) ); - leftnode.split( newnode ); - - /* insert our item */ - if (cursor.pos[cursor.level] > leftnode.size()) { - // not with cursor, since this node isn't added yet! - if (newnode.is_leaf()) { - newnode.insert_at_leaf_pos( cursor.pos[cursor.level] - leftnode.size(), - key, value ); - nkeys++; - } else { - newnode.insert_at_index_pos( cursor.pos[cursor.level] - leftnode.size(), - key, nodevalue ); - } - } else { - // with cursor (if leaf) - if (leftnode.is_leaf()) - cursor.insert( key, value ); - else - leftnode.insert_at_index_pos( cursor.pos[cursor.level], - key, nodevalue ); - } - - /* are we at the root? */ - if (cursor.level == 0) { - /* split root. */ - dbtout << "that split was the root " << root << std::endl; - Nodeptr newroot( pool.new_node(Node::TYPE_INDEX) ); - - /* new root node */ - newroot.set_size(2); - newroot.index_item(0).key = leftnode.key(0); - newroot.index_item(0).node = root; - newroot.index_item(1).key = newnode.key(0); - newroot.index_item(1).node = newnode.get_id(); - - /* heighten tree */ - depth++; - root = newroot.get_id(); - verify("insert 3"); - return 0; - } - - /* now insert newindex in level-1 */ - nodevalue = newnode.get_id(); - key = newnode.key(0); - cursor.level--; - cursor.pos[cursor.level]++; // ...to the right of leftnode! 
- } - } - - - int remove(K key) { - verify("pre-remove"); - dbtout << "remove " << key << std::endl; - - if (almost_full()) { - cout << "table almost full, failing" << std::endl; - assert(0); - return -1; - } - - Cursor cursor(this); - if (find(key, cursor) <= 0) { - cerr << "remove " << key << " 0x" << hex << key << dec << " .. dne" << std::endl; - g_conf.debug_ebofs = 33; - g_conf.ebofs_verify = true; - verify("remove dne"); - assert(0); - return -1; // key dne - } - - - while (1) { - cursor.remove(); - verify("post-remove"); - - // balance + adjust - - if (cursor.level == 0) { - // useless root index? - if (cursor.open[0].size() == 1 && - depth > 1) { - depth--; - root = cursor.open[0].index_item(0).node; - pool.release( cursor.open[0].node ); - } - - // note: root can be small, but not empty - else if (nkeys == 0) { - assert(cursor.open[cursor.level].size() == 0); - assert(depth == 1); - root = -1; - depth = 0; - if (cursor.open[0].node) - pool.release(cursor.open[0].node); - } - verify("remove 1"); - return 0; - } - - if (cursor.open[cursor.level].size() > cursor.open[cursor.level].min_items()) { - verify("remove 2"); - return 0; - } - - // borrow from siblings? - Nodeptr left; - Nodeptr right; - - // left? 
- if (cursor.pos[cursor.level-1] > 0) { - int left_loc = cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1] - 1).node; - left.open(pool, left_loc); - - if (left.size() > left.min_items()) { - /* move cursor left, shift right */ - cursor.pos[cursor.level] = 0; - cursor.open[cursor.level] = left; - cursor.pos[cursor.level-1]--; - cursor.rotate_right(); - verify("remove 3"); - return 0; - } - - /* combine to left */ - right = cursor.open[cursor.level]; - } - else { - assert(cursor.pos[cursor.level-1] < cursor.open[cursor.level-1].size() - 1); - int right_loc = cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1] + 1 ).node; - right.open(pool, right_loc ); - - if (right.size() > right.min_items()) { - /* move cursor right, shift an item left */ - cursor.pos[cursor.level] = 1; - cursor.open[cursor.level] = right; - cursor.pos[cursor.level-1]++; - cursor.rotate_left(); - verify("remove 4"); - return 0; - } - - /* combine to left */ - left = cursor.open[cursor.level]; - cursor.pos[cursor.level-1]++; /* move cursor to (soon-to-be-empty) right side item */ - } - - // note: cursor now points to _right_ node. - - /* combine (towards left) - * (this makes it so our next delete will be in the index - * interior, which is less scary.) - */ - dbtout << "combining nodes " << left.get_id() << " and " << right.get_id() << std::endl; - - left.merge(right); - - // dirty left + right - cursor.dirty(); // right - if (!left.node->is_dirty()) { - pool.dirty_node(left.node); - cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1]-1 ).node = left.get_id(); - } - - pool.release(right.node); - - cursor.level--; // now point to the link to the obsolete (right-side) sib */ - } - - } - - void clear(Cursor& cursor, int node_loc, int level) { - dbtout << "clear" << std::endl; - - Nodeptr node(pool, node_loc); - cursor.open[level] = node; - - // hose children? 
- if (level < depth-1) { - for (int i=0; i max) - max = node.key(i); - - if (level < depth-1) { - // index - cursor.pos[level] = i; - err += verify_sub( cursor, cursor.open[level].index_item(i).node, level+1, count, last, on ); - } else { - // leaf - count++; - last = node.key(i); - } - } - - if (level) { - // verify that parent's keys are appropriate - if (min != cursor.open[level-1].index_item(cursor.pos[level-1]).key) { - dbtout << ":: key in index node " << cursor.open[level-1].get_id() - << " != min in child " << node_loc - << "(key is " << hex << cursor.open[level-1].index_item(cursor.pos[level-1]).key - << ", min is " << min << ")" << dec << std::endl; - err++; - } - if (cursor.pos[level-1] < cursor.open[level-1].size()-1) { - if (max > cursor.open[level-1].index_item(1+cursor.pos[level-1]).key) { - dbtout << ":: next key in index node " << cursor.open[level-1].get_id() - << " < max in child " << node_loc - << "(key is " << hex << cursor.open[level-1].index_item(1+cursor.pos[level-1]).key - << ", max is " << max << ")" << dec << std::endl; - err++; - } - } - } - - if (err == 0) return err; - - // print it - char s[1000]; - strcpy(s," "); - s[level+1] = 0; - if (1) { - if (root == node_loc) { - dbtout << s << "root " << node_loc << ": " - << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << std::endl; - } else if (level == depth-1) { - dbtout << s << "leaf " << node_loc << ": " - << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << std::endl; - } else { - dbtout << s << "indx " << node_loc << ": " - << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << std::endl; - } - - if (1) { - for (int i=0; i " << node.leaf_item(i).value << dec << std::endl; - } - } - } - } - - return err; - } - - void verify(const char *on) { - if (!g_conf.ebofs_verify) - return; - - if (root == -1 && depth == 0) { - return; // empty! 
- } - - int count = 0; - Cursor cursor(this); - K last; - - int before = g_conf.debug_ebofs; - g_conf.debug_ebofs = 0; - - int err = verify_sub(cursor, root, 0, count, last, on); - if (count != nkeys) { - cerr << "** count " << count << " != nkeys " << nkeys << std::endl; - err++; - } - - g_conf.debug_ebofs = before; - - // ok? - if (err) { - cerr << "verify failure, called by '" << on << "'" << std::endl; - g_conf.debug_ebofs = 30; - // do it again, so we definitely get the dump. - int count = 0; - Cursor cursor(this); - K last; - verify_sub(cursor, root, 0, count, last, on); - assert(err == 0); - } - } - -}; - - -#endif diff --git a/branches/sage/ebofs2/ebofs/mkfs.ebofs.cc b/branches/sage/ebofs2/ebofs/mkfs.ebofs.cc deleted file mode 100644 index d1d5975e7fd65..0000000000000 --- a/branches/sage/ebofs2/ebofs/mkfs.ebofs.cc +++ /dev/null @@ -1,349 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include -#include "ebofs/Ebofs.h" - - -int main(int argc, char **argv) -{ - // args - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - if (args.size() < 1) { - cerr << "usage: mkfs.ebofs [options] " << std::endl; - return -1; - } - char *filename = args[0]; - - // mkfs - Ebofs mfs(filename); - int r = mfs.mkfs(); - if (r < 0) exit(r); - - if (args.size() > 1) { // pass an extra arg of some sort to trigger the test crapola - // test-o-rama! 
- Ebofs fs(filename); - fs.mount(); - - // zillion objects - if (1) { - char crap[1024*1024]; - memset(crap, 0, 1024*1024); - bufferlist bl; - int sz = 10000; - bl.append(crap, sz); - - int n = 100000; - utime_t start = g_clock.now(); - for (int i=0; i nsec - - while (1) { - cout << g_clock.now() << " writing " << pos << "~" << sz << std::endl; - fs.write(oid, pos, sz, bl, (Context*)0); - pos += sz; - nanosleep(&ts, 0); - } - - } - - /* - if (1) { - // partial write tests - char crap[1024*1024]; - memset(crap, 0, 1024*1024); - - bufferlist small; - small.append(crap, 10); - bufferlist med; - med.append(crap, 1000); - bufferlist big; - big.append(crap, 1024*1024); - - cout << "0" << std::endl; - fs.write(10, 0, 1024*1024, big, (Context*)0); - fs.sync(); - fs.trim_buffer_cache(); - - cout << "1" << std::endl; - fs.write(10, 10, 10, small, 0); - fs.write(10, 1, 1000, med, 0); - fs.sync(); - fs.trim_buffer_cache(); - - cout << "2" << std::endl; - fs.write(10, 10, 10, small, 0); - //fs.sync(); - fs.write(10, 1, 1000, med, 0); - fs.sync(); - fs.trim_buffer_cache(); - - cout << "3" << std::endl; - fs.write(10, 1, 1000, med, 0); - fs.write(10, 10000, 10, small, 0); - fs.truncate(10, 100, 0); - fs.sync(); - fs.trim_buffer_cache(); - - cout << "4" << std::endl; - fs.remove(10); - fs.sync(); - fs.write(10, 10, 10, small, 0); - fs.sync(); - fs.write(10, 1, 1000, med, 0); - fs.sync(); - fs.truncate(10, 100, 0); - fs.write(10, 10, 10, small, 0); - fs.trim_buffer_cache(); - - - - } - - if (0) { // onode write+read test - bufferlist bl; - char crap[1024*1024]; - memset(crap, 0, 1024*1024); - bl.append(crap, 10); - - fs.write(10, 10, 0, bl, (Context*)0); - fs.umount(); - - Ebofs fs2(filename); - fs2.mount(); - fs2.read(10, 10, 0, bl); - fs2.umount(); - - return 0; - } - - - if (0) { // small write + read test - bufferlist bl; - char crap[1024*1024]; - memset(crap, 0, 1024*1024); - - object_t oid = 10; - int n = 10000; - int l = 128; - bl.append(crap, l); - - - char *p = bl.c_str(); 
- off_t o = 0; - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_NODES_H -#define __EBOFS_NODES_H - -/** nodes, node regions **/ - -#include "types.h" -#include "BlockDevice.h" -#include "include/xlist.h" -#include "include/bitmapper.h" - -/* - - disk wire memory - - free free -> free can alloc - free used -> dirty can modify - - free used used -> clean - free used free -> limbo - - used used -> clean - used free -> limbo - - - // meaningless - used free free -> free can alloc - used free used __DNE__ - - -*/ - -#undef debofs -#define debofs(x) if (x < g_conf.debug_ebofs) cout << "ebofs.nodepool." - - -class Node { - public: - // bit fields - static const int STATE_CLEAN = 1; - static const int STATE_DIRTY = 2; - - static const int ITEM_LEN = EBOFS_NODE_BYTES - sizeof(int) - sizeof(int) - sizeof(int); - - static const int TYPE_INDEX = 1; - static const int TYPE_LEAF = 2; - - protected: - nodeid_t id; - int pos_in_bitmap; // position in bitmap - int state; // use bit fields above! 
- - bufferptr bptr; - - // in disk buffer - int *type; - int *nrecs; - - public: - xlist::item xlist; // dirty - - vector children; - - Node(nodeid_t i, int pib, bufferptr& b, int s) : - id(i), pos_in_bitmap(pib), - state(s), bptr(b), xlist(this) { - setup_pointers(); - } - - void setup_pointers() { - nrecs = (int*)(bptr.c_str()); - type = (int*)(bptr.c_str() + sizeof(*nrecs)); - } - - bool do_cow() { - if (bptr.do_cow()) { - setup_pointers(); - return true; - } - return false; - } - - - // id - nodeid_t get_id() const { return id; } - void set_id(nodeid_t n) { id = n; } - int get_pos_in_bitmap() const { return pos_in_bitmap; } - void set_pos_in_bitmap(int i) { pos_in_bitmap = i; } - - // buffer - bufferptr& get_buffer() { return bptr; } - - char *item_ptr() { return bptr.c_str() + sizeof(*nrecs) + sizeof(*type); } - - // size - int size() { return *nrecs; } - void set_size(int s) { *nrecs = s; } - - // type - int& get_type() { return *type; } - void set_type(int t) { *type = t; } - bool is_index() { return *type == TYPE_INDEX; } - bool is_leaf() { return *type == TYPE_LEAF; } - - - // state - bool is_dirty() { return state == STATE_DIRTY; } - bool is_clean() { return state == STATE_CLEAN; } - - void set_state(int s) { state = s; } - -}; - - - - - -class NodePool { - protected: - hash_map > node_map; // open node map - - public: - vector region_loc; // region locations - Extent usemap_even; - Extent usemap_odd; - - buffer::ptr usemap_data; - bitmapper usemap_bits; - - protected: - // on-disk block states - int num_nodes; - int num_dirty; - int num_clean; - int num_free; - int num_limbo; - - xlist dirty_ls; - interval_set free; - interval_set limbo; - - Mutex &ebofs_lock; - Cond commit_cond; - int flushing; - - nodeid_t make_nodeid(int region, int offset) { - return region_loc[region].start + (block_t)offset; - } - int nodeid_pos_in_bitmap(nodeid_t nid) { - unsigned region; - int num = 0; - for (region = 0; - (block_t)nid < region_loc[region].start || (block_t)nid > 
region_loc[region].end(); - region++) { - //generic_dout(-20) << "node " << nid << " not in " << region << " " << region_loc[region] << dendl; - num += region_loc[region].length; - } - num += nid - region_loc[region].start; - //generic_dout(-20) << "node " << nid << " is in " << region << ", overall bitmap pos is " << num << dendl; - return num; - } - - - public: - NodePool(Mutex &el) : - num_nodes(0), - num_dirty(0), num_clean(0), num_free(0), num_limbo(0), - ebofs_lock(el), - flushing(0) {} - ~NodePool() { - // nodes - release_all(); - } - - int get_num_free() { return num_free; } - int get_num_dirty() { return num_dirty; } - int get_num_limbo() { return num_limbo; } - int get_num_clean() { return num_clean; } - int get_num_total() { return num_nodes; } - int get_num_used() { return num_clean + num_dirty; } - - int get_usemap_len(int n=0) { - if (n == 0) n = num_nodes; - return ((n-1) / 8 / EBOFS_BLOCK_SIZE) + 1; - } - - unsigned num_regions() { return region_loc.size(); } - - // the caller had better adjust usemap locations... 
- void add_region(Extent ex) { - assert(region_loc.size() < EBOFS_MAX_NODE_REGIONS); - region_loc.push_back(ex); - free.insert(ex.start, ex.length); - num_free += ex.length; - num_nodes += ex.length; - } - - void init_usemap() { - usemap_data = buffer::create_page_aligned(EBOFS_BLOCK_SIZE*usemap_even.length); - usemap_data.zero(); - usemap_bits.set_data(usemap_data.c_str(), usemap_data.length()); - } - - void expand_usemap() { - block_t have = usemap_data.length() / EBOFS_BLOCK_SIZE; - if (have < usemap_even.length) { - // use bufferlist to copy/merge two chunks - bufferlist bl; - bl.push_back(usemap_data); - bufferptr newbit = buffer::create_page_aligned(EBOFS_BLOCK_SIZE*(usemap_even.length - have)); - newbit.zero(); - bl.push_back(newbit); - assert(bl.buffers().size() == 1); - usemap_data = bl.buffers().front(); - usemap_bits.set_data(usemap_data.c_str(), usemap_data.length()); - } - } - - - - int init(struct ebofs_nodepool *np) { - // regions - assert(region_loc.empty()); - num_nodes = 0; - for (int i=0; inum_regions; i++) { - debofs(3) << "init region " << i << " at " << np->region_loc[i] << std::endl; - region_loc.push_back( np->region_loc[i] ); - num_nodes += np->region_loc[i].length; - } - - // usemap - usemap_even = np->node_usemap_even; - usemap_odd = np->node_usemap_odd; - debofs(3) << "init even map at " << usemap_even << std::endl; - debofs(3) << "init odd map at " << usemap_odd << std::endl; - - init_usemap(); - return 0; - } - - void close() { - release_all(); - - region_loc.clear(); - - num_free = 0; - num_dirty = 0; - num_clean = 0; - num_limbo = 0; - dirty_ls.clear(); - - free.clear(); - limbo.clear(); - - flushing = 0; - node_map.clear(); - } - - - // *** blocking i/o routines *** - - int read_usemap_and_clean_nodes(BlockDevice& dev, version_t epoch) { - // read map - Extent loc; - if (epoch & 1) - loc = usemap_odd; - else - loc = usemap_even; - - // usemap - dev.read(loc.start, loc.length, usemap_data); - - // nodes - unsigned region = 0; - 
unsigned region_pos = 0; - for (int i=0; iflushed_usemap(); - } - }; - - void flushed_usemap() { - ebofs_lock.Lock(); - flushing--; - if (flushing == 0) - commit_cond.Signal(); - ebofs_lock.Unlock(); - } - - public: - int write_usemap(BlockDevice& dev, version_t version) { - // alloc - Extent loc; - if (version & 1) - loc = usemap_odd; - else - loc = usemap_even; - - // write - bufferlist bl; - bufferptr bp = usemap_data.clone(); - bl.append(bp); - dev.write(loc.start, loc.length, bl, - new C_NP_FlushUsemap(this), "usemap"); - return 0; - } - - - - // *** node commit *** - private: - - class C_NP_FlushNode : public BlockDevice::callback { - NodePool *pool; - nodeid_t nid; - public: - C_NP_FlushNode(NodePool *p, nodeid_t n) : - pool(p), nid(n) {} - void finish(ioh_t ioh, int r) { - pool->flushed_node(nid); - } - }; - - void flushed_node(nodeid_t nid) { - ebofs_lock.Lock(); - flushing--; - if (flushing == 0) - commit_cond.Signal(); - ebofs_lock.Unlock(); - } - - public: - void commit_start(BlockDevice& dev, version_t version) { - debofs(20) << "ebofs.nodepool.commit_start start dirty=" << dirty_ls.size() << std::endl; - - assert(flushing == 0); - /*if (0) - for (unsigned i=0; i clean (write to disk) - while (!dirty_ls.empty()) { - Node *n = dirty_ls.front(); - assert(n); - assert(n->is_dirty()); - n->set_state(Node::STATE_CLEAN); - dirty_ls.remove(&n->xlist); - num_dirty--; - num_clean++; - - debofs(20) << "ebofs.nodepool.commit_start writing node " << n->get_id() << std::endl; - - bufferlist bl; - if (0) { - bufferptr bp = n->get_buffer().clone(); // dup it now - bl.append(bp); - } else { - bl.append(n->get_buffer()); - } - dev.write(n->get_id(), EBOFS_NODE_BLOCKS, - bl, - new C_NP_FlushNode(this, n->get_id()), "node"); - flushing++; - } - - // limbo -> free - for (map::iterator i = limbo.m.begin(); - i != limbo.m.end(); - i++) { - num_free += i->second; - num_limbo -= i->second; - free.insert(i->first, i->second); - } - limbo.clear(); - - debofs(20) << 
"ebofs.nodepool.commit_start finish" << std::endl; - } - - void commit_wait() { - while (flushing > 0) - commit_cond.Wait(ebofs_lock); - debofs(20) << "ebofs.nodepool.commit_wait finish" << std::endl; - } - - - - - - - - - - // *** nodes *** - // opened node - Node* get_node(nodeid_t nid) { - //dbtout << "pool.get " << nid << std::endl; - assert(node_map.count(nid)); - return node_map[nid]; - } - - // allocate id/block on disk. always free -> dirty. - nodeid_t alloc_id() { - // pick node id - assert(!free.empty()); - nodeid_t nid = free.start(); - free.erase(nid); - num_free--; - return nid; - } - - // new node - Node* new_node(int type) { - nodeid_t nid = alloc_id(); - debofs(15) << "ebofs.nodepool.new_node " << nid << std::endl; - - // alloc node - bufferptr bp = buffer::create_page_aligned(EBOFS_NODE_BYTES); - bp.zero(); - Node *n = new Node(nid, nodeid_pos_in_bitmap(nid), bp, Node::STATE_DIRTY); - n->set_type(type); - n->set_size(0); - - usemap_bits.set(n->get_pos_in_bitmap()); - - n->set_state(Node::STATE_DIRTY); - dirty_ls.push_back(&n->xlist); - num_dirty++; - - assert(node_map.count(nid) == 0); - node_map[nid] = n; - - return n; - } - - void release(Node *n) { - const nodeid_t nid = n->get_id(); - debofs(15) << "ebofs.nodepool.release on " << nid << std::endl; - node_map.erase(nid); - - if (n->is_dirty()) { - dirty_ls.remove(&n->xlist); - num_dirty--; - free.insert(nid); - num_free++; - usemap_bits.clear(n->get_pos_in_bitmap()); - } else if (n->is_clean()) { - limbo.insert(nid); - num_limbo++; - num_clean--; - usemap_bits.clear(n->get_pos_in_bitmap()); - } - - delete n; - assert(num_clean + num_dirty + num_limbo + num_free == num_nodes); - } - - void release_all() { - while (!node_map.empty()) { - hash_map >::iterator i = node_map.begin(); - debofs(2) << "ebofs.nodepool.release_all leftover " << i->first << " " << i->second << std::endl; - release( i->second ); - } - assert(node_map.empty()); - } - - void dirty_node(Node *n) { - // get new node id? 
- nodeid_t oldid = n->get_id(); - nodeid_t newid = alloc_id(); - debofs(15) << "ebofs.nodepool.dirty_node on " << oldid << " now " << newid << std::endl; - - // dup data? - // this only does a memcpy if there are multiple references.. - // i.e. if we are still writing the old data - if (n->do_cow()) { - //assert(0); //i'm duping on write - debofs(15) << "ebofs.nodepool.dirty_node did cow on " << oldid << " now " << newid << std::endl; - //cerr << "ebofs.nodepool.dirty_node did cow on " << oldid << " now " << newid << std::endl; - } - - // release old block - assert(n->is_clean()); - num_clean--; - limbo.insert(oldid); - num_limbo++; - usemap_bits.clear(n->get_pos_in_bitmap()); - - // rename node - node_map.erase(oldid); - n->set_id(newid); - n->set_pos_in_bitmap(nodeid_pos_in_bitmap(newid)); - node_map[newid] = n; - - // new block - n->set_state(Node::STATE_DIRTY); - dirty_ls.push_back(&n->xlist); - debofs(15) << "ebofs.nodepool.dirty_node added to dirty list, len now " << dirty_ls.size() << std::endl; - num_dirty++; - usemap_bits.set(n->get_pos_in_bitmap()); - - assert(num_clean + num_dirty + num_limbo + num_free == num_nodes); - } - - -}; - -#endif diff --git a/branches/sage/ebofs2/ebofs/test.ebofs.cc b/branches/sage/ebofs2/ebofs/test.ebofs.cc deleted file mode 100644 index 9a8913a52d80d..0000000000000 --- a/branches/sage/ebofs2/ebofs/test.ebofs.cc +++ /dev/null @@ -1,226 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include -#include "ebofs/Ebofs.h" - -bool stop = false; - - -int nt = 0; -class Tester : public Thread { - Ebofs &fs; - int t; - - char b[1024*1024]; - -public: - Tester(Ebofs &e) : fs(e), t(nt) { nt++; } - void *entry() { - - while (!stop) { - object_t oid; - oid.ino = (rand() % 10) + 0x10000000; - coll_t cid = rand() % 50; - off_t off = rand() % 10000;//0;//rand() % 1000000; - off_t len = 1+rand() % 100000; - char *a = "one"; - if (rand() % 2) a = "two"; - int l = 3;//rand() % 10; - - switch (rand() % 10) { - case 0: - { - oid.rev = rand() % 10; - cout << t << " read " << hex << oid << dec << " at " << off << " len " << len << std::endl; - bufferlist bl; - fs.read(oid, off, len, bl); - int l = MIN(len,bl.length()); - if (l) { - cout << t << " got " << l << std::endl; - bl.copy(0, l, b); - char *p = b; - while (l--) { - assert(*p == 0 || - *p == (char)(off ^ oid.ino)); - off++; - p++; - } - } - } - break; - - case 1: - { - cout << t << " write " << hex << oid << dec << " at " << off << " len " << len << std::endl; - for (int j=0;j args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - // args - if (args.size() != 3) return -1; - char *filename = args[0]; - int seconds = atoi(args[1]); - int threads = atoi(args[2]); - if (!threads) threads = 1; - - cout << "dev " << filename << " .. " << threads << " threads .. " << seconds << " seconds" << std::endl; - - Ebofs fs(filename); - if (fs.mount() < 0) return -1; - - - // explicit tests - if (0) { - // verify that clone() plays nice with partial writes - object_t oid(1,1); - bufferptr bp(10000); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - fs.write(oid, 0, 10000, bl, 0); - - fs.sync(); - fs.trim_buffer_cache(); - - // induce a partial write - bufferlist bl2; - bl2.substr_of(bl, 0, 100); - fs.write(oid, 100, 100, bl2, 0); - - // clone it - object_t oid2; - oid2 = oid; - oid2.rev = 1; - fs.clone(oid, oid2, 0); - - // ... 
- if (0) { - // make sure partial still behaves after orig is removed... - fs.remove(oid, 0); - - // or i read for oid2... - bufferlist rbl; - fs.read(oid2, 0, 200, rbl); - } - if (1) { - // make sure things behave if we remove the clone - fs.remove(oid2,0); - } - } - // /explicit tests - - list ls; - for (int i=0; icreate(); - ls.push_back(t); - } - - utime_t now = g_clock.now(); - utime_t dur(seconds,0); - utime_t end = now + dur; - cout << "stop at " << end << std::endl; - while (now < end) { - sleep(1); - now = g_clock.now(); - cout << now << std::endl; - } - - cout << "stopping" << std::endl; - stop = true; - - while (!ls.empty()) { - Tester *t = ls.front(); - ls.pop_front(); - t->join(); - delete t; - } - - fs.umount(); - return 0; -} - diff --git a/branches/sage/ebofs2/ebofs/types.h b/branches/sage/ebofs2/ebofs/types.h deleted file mode 100644 index 749ebddb3ccec..0000000000000 --- a/branches/sage/ebofs2/ebofs/types.h +++ /dev/null @@ -1,171 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_TYPES_H -#define __EBOFS_TYPES_H - -#include -#include "include/buffer.h" -#include "include/Context.h" -#include "common/Cond.h" - -#include -#include -#include -#include -using namespace std; -using namespace __gnu_cxx; - - -#include "include/object.h" - - -#ifndef MIN -# define MIN(a,b) ((a)<=(b) ? (a):(b)) -#endif -#ifndef MAX -# define MAX(a,b) ((a)>=(b) ? 
(a):(b)) -#endif - - -/* -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(unsigned long long __x) const { - static hash H; - return H((__x >> 32) ^ (__x & 0xffffffff)); - } - }; - - template<> struct hash< std::string > - { - size_t operator()( const std::string& x ) const - { - static hash H; - return H(x.c_str()); - } - }; -} -*/ - - -// disk -typedef uint64_t block_t; // disk location/sector/block - -static const int EBOFS_BLOCK_SIZE = 4096; -static const int EBOFS_BLOCK_BITS = 12; // 1<<12 == 4096 - -class Extent { - public: - block_t start, length; - - Extent() : start(0), length(0) {} - Extent(block_t s, block_t l) : start(s), length(l) {} - - block_t last() const { return start + length - 1; } - block_t end() const { return start + length; } -}; - -inline ostream& operator<<(ostream& out, Extent& ex) -{ - return out << ex.start << "~" << ex.length; -} - - -// tree/set nodes -//typedef int nodeid_t; -typedef int64_t nodeid_t; // actually, a block number. FIXME. - -static const unsigned EBOFS_NODE_BLOCKS = 1; -static const unsigned EBOFS_NODE_BYTES = EBOFS_NODE_BLOCKS * EBOFS_BLOCK_SIZE; -static const unsigned EBOFS_MAX_NODE_REGIONS = 10; // pick a better value! - -struct ebofs_nodepool { - Extent node_usemap_even; // for even sb versions - Extent node_usemap_odd; // for odd sb versions - - int num_regions; - Extent region_loc[EBOFS_MAX_NODE_REGIONS]; -}; - - -// objects - -typedef uint64_t coll_t; - -struct ebofs_onode { - Extent onode_loc; /* this is actually the block we live in */ - - object_t object_id; /* for kicks */ - off_t object_size; /* file size in bytes. should this be 64-bit? */ - unsigned object_blocks; - bool readonly; - - int num_collections; - int num_attr; // num attr in onode - int num_extents; /* number of extents used. 
if 0, data is in the onode */ -}; - -struct ebofs_cnode { - Extent cnode_loc; /* this is actually the block we live in */ - coll_t coll_id; - int num_attr; // num attr in cnode -}; - - -// table -struct ebofs_table { - nodeid_t root; /* root node of btree */ - int num_keys; - int depth; -}; - - -// super -typedef uint64_t version_t; - -static const unsigned EBOFS_MAGIC = 0x000EB0F5; - -static const int EBOFS_NUM_FREE_BUCKETS = 5; /* see alloc.h for bucket constraints */ -static const int EBOFS_FREE_BUCKET_BITS = 2; - - -struct ebofs_super { - uint64_t s_magic; - uint64_t fsid; - - epoch_t epoch; // version of this superblock. - - uint64_t num_blocks; /* # blocks in filesystem */ - - // some basic stats, for kicks - uint64_t free_blocks; /* unused blocks */ - uint64_t limbo_blocks; /* limbo blocks */ - //unsigned num_objects; - //unsigned num_fragmented; - - struct ebofs_nodepool nodepool; - - // tables - struct ebofs_table free_tab[EBOFS_NUM_FREE_BUCKETS]; - struct ebofs_table limbo_tab; - struct ebofs_table alloc_tab; - struct ebofs_table object_tab; // object directory - struct ebofs_table collection_tab; // collection directory - struct ebofs_table co_tab; -}; - - -#endif diff --git a/branches/sage/ebofs2/extractosdmaps.cc b/branches/sage/ebofs2/extractosdmaps.cc deleted file mode 100644 index bc8ec91984d1e..0000000000000 --- a/branches/sage/ebofs2/extractosdmaps.cc +++ /dev/null @@ -1,64 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include -#include - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" - -#include "ebofs/Ebofs.h" - -#include "osd/OSD.h" -#include "mon/MonitorStore.h" - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - Ebofs eb("dev/osd0"); - eb.mount(); - MonitorStore ms("mondata/mon0"); - ms.mount(); - - epoch_t e = 1; - while (1) { - bufferlist bl; - object_t oid = OSD::get_osdmap_object_name(e); - eb.read(oid, 0, 0, bl); - if (bl.length() == 0) break; - cout << "saving epoch " << e << std::endl; - - bufferlist ibl; - oid = OSD::get_inc_osdmap_object_name(e); - eb.read(oid, 0, 0, ibl); - - ms.put_bl_sn(ibl, "osdmap", e); - ms.put_bl_sn(bl, "osdmap_full", e); - e++; - } - - eb.umount(); - //ms.umount(); - - return 0; -} diff --git a/branches/sage/ebofs2/fakefuse.cc b/branches/sage/ebofs2/fakefuse.cc deleted file mode 100644 index 3a778e1b64e39..0000000000000 --- a/branches/sage/ebofs2/fakefuse.cc +++ /dev/null @@ -1,168 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/Monitor.h" - -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "client/Client.h" -#include "client/fuse.h" -#include "client/fuse_ll.h" - -#include "common/Timer.h" - -#include "msg/FakeMessenger.h" - - - - -#define NUMMDS g_conf.num_mds -#define NUMOSD g_conf.num_osd -#define NUMCLIENT g_conf.num_client - - -class C_Test : public Context { -public: - void finish(int r) { - cout << "C_Test->finish(" << r << ")" << std::endl; - } -}; -class C_Test2 : public Context { -public: - void finish(int r) { - cout << "C_Test2->finish(" << r << ")" << std::endl; - g_timer.add_event_after(2, new C_Test); - } -}; - - - -int main(int argc, char **argv) { - cerr << "fakefuse starting" << std::endl; - - // stop on our own (by default) - g_conf.mon_stop_on_last_unmount = true; - g_conf.mon_stop_with_last_mds = true; - - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - // start messenger thread - fakemessenger_startthread(); - - //g_timer.add_event_after(5.0, new C_Test2); - //g_timer.add_event_after(10.0, new C_Test); - - vector nargs; - for (unsigned i=0; imon_inst[i] = entity_inst_t(entity_name_t::MON(i), a); // hack ; see FakeMessenger.cc - } - - Monitor *mon[g_conf.num_mon]; - for (int i=0; iinit(); - for (int i=0; iinit(); - for (int i=0; iinit(); - - - // create client - Client *client[NUMCLIENT]; - for (int i=0; iinit(); - - - // start up fuse - // use my argc, argv (make sure you pass a mount point!) 
- client[i]->mount(); - - char *oldcwd = get_current_dir_name(); // note previous wd - cout << "starting fuse on pid " << getpid() << std::endl; - if (g_conf.fuse_ll) - ceph_fuse_ll_main(client[i], argc, argv); - else - ceph_fuse_main(client[i], argc, argv); - cout << "fuse finished on pid " << getpid() << std::endl; - ::chdir(oldcwd); // return to previous wd - free(oldcwd); - - client[i]->unmount(); - client[i]->shutdown(); - } - - - - // wait for it to finish - cout << "DONE -----" << std::endl; - fakemessenger_wait(); // blocks until messenger stops - - - // cleanup - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "mon/Monitor.h" -#include "client/Client.h" - -#include "client/SyntheticClient.h" - -#include "msg/FakeMessenger.h" - -#include "common/Timer.h" - - -class C_Test : public Context { -public: - void finish(int r) { - cout << "C_Test->finish(" << r << ")" << std::endl; - } -}; - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << std::endl; - exit(1); - } -}; - - -int main(int argc, char **argv) -{ - cerr << "fakesyn start" << std::endl; - - //cerr << "inode_t " << sizeof(inode_t) << std::endl; - - vector args; - argv_to_vec(argc, argv, args); - - // stop on our own (by default) - g_conf.mon_stop_on_last_unmount = true; - g_conf.mon_stop_with_last_mds = true; - - parse_config_options(args); - - int start = 0; - - parse_syn_options(args); - - vector nargs; - - for (unsigned i=0; imon_inst[i] = entity_inst_t(entity_name_t::MON(i), a); // hack ; see FakeMessenger.cc - } - - char hostname[100]; - gethostname(hostname,100); - //int pid = getpid(); - - // create mon - Monitor *mon[g_conf.num_mon]; - for 
(int i=0; iinit(); - } - for (int i=0; iinit(); - } - for (int i=0; iinit(); - if (g_conf.mds_local_osd) - mdsosd[i]->init(); - } - - - // create client(s) - Client *client[g_conf.num_client]; - SyntheticClient *syn[g_conf.num_client]; - for (int i=0; istart_thread(); - start++; - } - - - for (int i=0; ijoin_thread(); - delete syn[i]; - } - - - // wait for it to finish - fakemessenger_wait(); - - // cleanup - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __CONTEXT_H -#define __CONTEXT_H - -#include "config.h" - -#include -#include -#include - -#include - - -/* - * Context - abstract callback class - */ -class Context { - public: - virtual ~Context() {} // we want a virtual destructor!!! - virtual void finish(int r) = 0; -}; - - -/* - * finish and destroy a list of Contexts - */ -inline void finish_contexts(std::list& finished, - int result = 0) -{ - using std::cout; - using std::endl; - - list ls; - if (finished.empty()) return; - - ls.swap(finished); // swap out of place to avoid weird loops - - generic_dout(10) << ls.size() << " contexts to finish with " << result << dendl; - for (std::list::iterator it = ls.begin(); - it != ls.end(); - it++) { - Context *c = *it; - generic_dout(10) << "---- " << c << dendl; - c->finish(result); - delete c; - } -} - -class C_NoopContext : public Context { -public: - void finish(int r) { } -}; - - -/* - * C_Contexts - set of Contexts - */ -class C_Contexts : public Context { -public: - std::list contexts; - - void add(Context* c) { - contexts.push_back(c); - } - void take(std::list& ls) { - contexts.splice(contexts.end(), ls); - } - void finish(int r) { - finish_contexts(contexts, r); - } -}; - - -/* - * C_Gather - * - * BUG: does not report errors. 
- */ -class C_Gather : public Context { -public: - bool sub_finish(int r) { - //cout << "C_Gather sub_finish " << this << " got " << r << " of " << waitfor << endl; - assert(waitfor.count(r)); - waitfor.erase(r); - if (!waitfor.empty()) - return false; // more subs left - - // last one - onfinish->finish(0); - delete onfinish; - onfinish = 0; - return true; - } - - class C_GatherSub : public Context { - C_Gather *gather; - int num; - public: - C_GatherSub(C_Gather *g, int n) : gather(g), num(n) {} - void finish(int r) { - if (gather->sub_finish(num)) - delete gather; // last one! - } - }; - -private: - Context *onfinish; - std::set waitfor; - int num; - -public: - C_Gather(Context *f=0) : onfinish(f), num(0) { - //cout << "C_Gather new " << this << endl; - } - ~C_Gather() { - //cout << "C_Gather delete " << this << endl; - assert(!onfinish); - } - - void set_finisher(Context *c) { - assert(!onfinish); - onfinish = c; - } - Context *new_sub() { - num++; - waitfor.insert(num); - return new C_GatherSub(this, num); - } - - bool empty() { return num == 0; } - int get_num() { return num; } - - void finish(int r) { - assert(0); // nobody should ever call me. - } - -}; - -#endif diff --git a/branches/sage/ebofs2/include/Distribution.h b/branches/sage/ebofs2/include/Distribution.h deleted file mode 100644 index efc0795a72fcb..0000000000000 --- a/branches/sage/ebofs2/include/Distribution.h +++ /dev/null @@ -1,75 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __DISTRIBUTION_H -#define __DISTRIBUTION_H - -#include -#include -using namespace std; - -class Distribution { - vector p; - vector v; - - public: - //Distribution() { - //} - - unsigned get_width() { - return p.size(); - } - - void clear() { - p.clear(); - v.clear(); - } - void add(int val, float pr) { - p.push_back(pr); - v.push_back(val); - } - - void random() { - float sum = 0.0; - for (unsigned i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __CEPH_ATOMIC_H -#define __CEPH_ATOMIC_H - -#ifdef BUFFER_USE_CCPP -# include "cc++/thread.h" - -class atomic_t { - mutable ost::AtomicCounter nref; // mutable for const-ness of operator<< -public: - atomic_t(int i=0) : nref(i) {} - void get() { ++nref; } - int put() { return --nref; } - int test() const { return nref; } -}; - -#else -# include "common/Mutex.h" - -class atomic_t { - Mutex lock; - int nref; -public: - atomic_t(int i=0) : lock(false), nref(i) {} - void get() { - lock.Lock(); - ++nref; - lock.Unlock(); - } - int put() { - lock.Lock(); - int r = --nref; - lock.Unlock(); - return r; - } - int test() const { - return nref; - } -}; - -#endif - -#endif diff --git a/branches/sage/ebofs2/include/bitmapper.h b/branches/sage/ebofs2/include/bitmapper.h deleted file mode 100644 index 4bed9e0a81d99..0000000000000 --- a/branches/sage/ebofs2/include/bitmapper.h +++ /dev/null @@ -1,48 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __BITMAPPER_H -#define __BITMAPPER_H - -class bitmapper { - char *_data; - int _len; - - public: - bitmapper() : _data(0), _len(0) { } - bitmapper(char *data, int len) : _data(data), _len(len) { } - - void set_data(char *data, int len) { _data = data; _len = len; } - - int bytes() const { return _len; } - int bits() const { return _len * 8; } - - bool operator[](int b) const { - return get(b); - } - bool get(int b) const { - return _data[b >> 3] & (1 << (b&7)); - } - void set(int b) { - _data[b >> 3] |= 1 << (b&7); - } - void clear(int b) { - _data[b >> 3] &= ~(1 << (b&7)); - } - void toggle(int b) { - _data[b >> 3] ^= 1 << (b&7); - } -}; - -#endif diff --git a/branches/sage/ebofs2/include/blobhash.h b/branches/sage/ebofs2/include/blobhash.h deleted file mode 100644 index a3703e46d67f5..0000000000000 --- a/branches/sage/ebofs2/include/blobhash.h +++ /dev/null @@ -1,47 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __BLOBHASH_H -#define __BLOBHASH_H - -#include "hash.h" - -/* -- this is to make some of the STL types work with 64 bit values, string hash keys, etc. -- added when i was using an old STL.. maybe try taking these out and see if things - compile now? 
-*/ - -class blobhash { -public: - size_t operator()(const char *p, unsigned len) { - static rjhash H; - size_t acc = 0; - while (len >= sizeof(size_t)) { - acc ^= *(size_t*)p; - p += sizeof(size_t); - len -= sizeof(size_t); - } - int sh = 0; - while (len) { - acc ^= (size_t)*p << sh; - sh += 8; - len--; - p++; - } - return H(acc); - } -}; - - -#endif diff --git a/branches/sage/ebofs2/include/buffer.h b/branches/sage/ebofs2/include/buffer.h deleted file mode 100644 index 12be95a7a51c8..0000000000000 --- a/branches/sage/ebofs2/include/buffer.h +++ /dev/null @@ -1,1161 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __BUFFER_H -#define __BUFFER_H - -#include - -#include "common/Mutex.h" - -#include "atomic.h" - -#include -#include - -using std::cout; - -#ifndef __CYGWIN__ -# include -#endif - -#define BUFFER_PAGE_SIZE 4096 // FIXME - -// -// these are in config.o -extern Mutex bufferlock; -extern long buffer_total_alloc; -// - - - - -class buffer { -private: - - /* hack for memory utilization debugging. */ - static void inc_total_alloc(unsigned len) { - bufferlock.Lock(); - buffer_total_alloc += len; - bufferlock.Unlock(); - } - static void dec_total_alloc(unsigned len) { - bufferlock.Lock(); - buffer_total_alloc -= len; - bufferlock.Unlock(); - } - - /* - * an abstract raw buffer. with a reference count. - */ - class raw { - public: - char *data; - unsigned len; - atomic_t nref; - - raw(unsigned l) : len(l), nref(0) - { } - raw(char *c, unsigned l) : data(c), len(l), nref(0) - { } - virtual ~raw() {}; - - // no copying. 
- raw(const raw &other); - const raw& operator=(const raw &other); - - virtual raw* clone_empty() = 0; - raw *clone() { - raw *c = clone_empty(); - memcpy(c->data, data, len); - return c; - } - - bool is_page_aligned() { - return (long)data % BUFFER_PAGE_SIZE == 0; - } - }; - - friend std::ostream& operator<<(std::ostream& out, const raw &r); - - /* - * primitive buffer types - */ - class raw_char : public raw { - public: - raw_char(unsigned l) : raw(l) { - data = new char[len]; - inc_total_alloc(len); - } - ~raw_char() { - delete[] data; - dec_total_alloc(len); - } - raw* clone_empty() { - return new raw_char(len); - } - }; - - class raw_static : public raw { - public: - raw_static(const char *d, unsigned l) : raw((char*)d, l) { } - ~raw_static() {} - raw* clone_empty() { - return new raw_char(len); - } - }; - -#ifndef __CYGWIN__ - class raw_mmap_pages : public raw { - public: - raw_mmap_pages(unsigned l) : raw(l) { - data = (char*)::mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0); - inc_total_alloc(len); - } - ~raw_mmap_pages() { - ::munmap(data, len); - dec_total_alloc(len); - } - raw* clone_empty() { - return new raw_mmap_pages(len); - } - }; - - class raw_posix_aligned : public raw { - public: - raw_posix_aligned(unsigned l) : raw(l) { -#ifdef DARWIN - data = (char *) valloc (len); -#else - ::posix_memalign((void**)(void*)&data, BUFFER_PAGE_SIZE, len); -#endif /* DARWIN */ - inc_total_alloc(len); - } - ~raw_posix_aligned() { - ::free((void*)data); - dec_total_alloc(len); - } - raw* clone_empty() { - return new raw_posix_aligned(len); - } - }; -#endif - -#ifdef __CYGWIN__ - class raw_hack_aligned : public raw { - char *realdata; - public: - raw_hack_aligned(unsigned l) : raw(l) { - realdata = new char[len+BUFFER_PAGE_SIZE-1]; - unsigned off = ((unsigned)realdata) % BUFFER_PAGE_SIZE; - if (off) - data = realdata + BUFFER_PAGE_SIZE - off; - else - data = realdata; - inc_total_alloc(len+BUFFER_PAGE_SIZE-1); - //cout << "hack aligned " << 
(unsigned)data - //<< " in raw " << (unsigned)realdata - //<< " off " << off << std::endl; - assert(((unsigned)data & (BUFFER_PAGE_SIZE-1)) == 0); - } - ~raw_hack_aligned() { - delete[] realdata; - dec_total_alloc(len+BUFFER_PAGE_SIZE-1); - } - raw* clone_empty() { - return new raw_hack_aligned(len); - } - }; -#endif - -public: - - /* - * named constructors - */ - - static raw* copy(const char *c, unsigned len) { - raw* r = new raw_char(len); - memcpy(r->data, c, len); - return r; - } - static raw* create(unsigned len) { - return new raw_char(len); - } - - static raw* create_page_aligned(unsigned len) { -#ifndef __CYGWIN__ - //return new raw_mmap_pages(len); - return new raw_posix_aligned(len); -#else - return new raw_hack_aligned(len); -#endif - } - - - /* - * a buffer pointer. references (a subsequence of) a raw buffer. - */ - class ptr { - raw *_raw; - unsigned _off, _len; - - public: - ptr() : _raw(0), _off(0), _len(0) {} - ptr(raw *r) : _raw(r), _off(0), _len(r->len) { // no lock needed; this is an unref raw. - r->nref.get(); - } - ptr(unsigned l) : _off(0), _len(l) { - _raw = create(l); - _raw->nref.get(); - } - ptr(char *d, unsigned l) : _off(0), _len(l) { // ditto. - _raw = copy(d, l); - _raw->nref.get(); - } - ptr(const ptr& p) : _raw(p._raw), _off(p._off), _len(p._len) { - if (_raw) { - _raw->nref.get(); - } - } - ptr(const ptr& p, unsigned o, unsigned l) : _raw(p._raw), _off(p._off + o), _len(l) { - assert(o+l <= p._len); - assert(_raw); - _raw->nref.get(); - } - ptr& operator= (const ptr& p) { - // be careful -- we need to properly handle self-assignment. 
- if (p._raw) { - p._raw->nref.get(); // inc new - } - release(); // dec (+ dealloc) old (if any) - _raw = p._raw; // change my ref - _off = p._off; - _len = p._len; - return *this; - } - ~ptr() { - release(); - } - - raw *clone() { - return _raw->clone(); - } - - bool do_cow() { - if (_raw->nref.test() > 1) { - //std::cout << "doing cow on " << _raw << " len " << _len << std::endl; - raw *newraw = _raw->clone(); - release(); - newraw->nref.get(); - _raw = newraw; - return true; - } else - return false; - } - - void swap(ptr& other) { - raw *r = _raw; - unsigned o = _off; - unsigned l = _len; - _raw = other._raw; - _off = other._off; - _len = other._len; - other._raw = r; - other._off = o; - other._len = l; - } - - void release() { - if (_raw) { - if (_raw->nref.put() == 0) { - //cout << "hosing raw " << (void*)_raw << " len " << _raw->len << std::endl; - delete _raw; // dealloc old (if any) - } - _raw = 0; - } - } - - // misc - bool at_buffer_head() const { return _off == 0; } - bool at_buffer_tail() const { return _off + _len == _raw->len; } - - bool is_page_aligned() const { return (long)c_str() % BUFFER_PAGE_SIZE == 0; } - - // accessors - raw *get_raw() const { return _raw; } - const char *c_str() const { assert(_raw); return _raw->data + _off; } - char *c_str() { assert(_raw); return _raw->data + _off; } - unsigned length() const { return _len; } - unsigned offset() const { return _off; } - unsigned start() const { return _off; } - unsigned end() const { return _off + _len; } - unsigned unused_tail_length() const { - if (_raw) - return _raw->len - (_off+_len); - else - return 0; - } - const char& operator[](unsigned n) const { - assert(_raw); - assert(n < _len); - return _raw->data[_off + n]; - } - char& operator[](unsigned n) { - assert(_raw); - assert(n < _len); - return _raw->data[_off + n]; - } - - const char *raw_c_str() const { assert(_raw); return _raw->data; } - unsigned raw_length() const { assert(_raw); return _raw->len; } - int raw_nref() const { 
assert(_raw); return _raw->nref.test(); } - - void copy_out(unsigned o, unsigned l, char *dest) const { - assert(_raw); - assert(o >= 0 && o <= _len); - assert(l >= 0 && o+l <= _len); - memcpy(dest, c_str()+o, l); - } - - unsigned wasted() { - assert(_raw); - return _raw->len - _len; - } - - // modifiers - void set_offset(unsigned o) { _off = o; } - void set_length(unsigned l) { _len = l; } - - void append(const char *p, unsigned l) { - assert(_raw); - assert(l <= unused_tail_length()); - memcpy(c_str() + _len, p, l); - _len += l; - } - - void copy_in(unsigned o, unsigned l, const char *src) { - assert(_raw); - assert(o >= 0 && o <= _len); - assert(l >= 0 && o+l <= _len); - memcpy(c_str()+o, src, l); - } - - void zero() { - memset(c_str(), 0, _len); - } - - void clean() { - //raw *newraw = _raw->makesib(_len); - } - }; - - friend std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp); - - /* - * list - the useful bit! - */ - - class list { - // my private bits - list *bl; - std::list _buffers; - unsigned _len; - - ptr append_buffer; // where i put small appends. - - public: - class iterator { - list *bl; - std::list &ls; - unsigned off; // in bl - std::list::iterator p; - unsigned p_off; // in *p - public: - // constructor. position. 
- iterator(list *l, unsigned o=0) : - bl(l), ls(bl->_buffers), off(0), p(ls.begin()), p_off(0) { - advance(o); - } - iterator(list *l, unsigned o, std::list::iterator ip, unsigned po) : - bl(l), ls(bl->_buffers), off(0), p(ip), p_off(po) { } - - iterator operator=(const iterator& other) { - return iterator(bl, off, p, p_off); - } - - unsigned get_off() { return off; } - - bool end() { - return p == ls.end(); - } - - void advance(unsigned o) { - //cout << this << " advance " << o << " from " << off << " (p_off " << p_off << " in " << p->length() << ")" << std::endl; - p_off += o; - while (p_off > 0) { - assert(p != ls.end()); - if (p_off >= p->length()) { - // skip this buffer - p_off -= p->length(); - p++; - } else { - // somewhere in this buffer! - break; - } - } - off += o; - } - - void seek(unsigned o) { - //cout << this << " seek " << o << std::endl; - p = ls.begin(); - off = p_off = 0; - advance(o); - } - - char operator*() { - assert(p != ls.end()); - return (*p)[p_off]; - } - iterator& operator++() { - assert(p != ls.end()); - advance(1); - return *this; - } - - // copy data out. - // note that these all _append_ to dest! 
- - void copy(unsigned len, char *dest) { - if (p == ls.end()) seek(off); - while (len > 0) { - assert(p != ls.end()); - - unsigned howmuch = p->length() - p_off; - if (len < howmuch) howmuch = len; - p->copy_out(p_off, howmuch, dest); - dest += howmuch; - - len -= howmuch; - advance(howmuch); - } - } - - void copy(unsigned len, list &dest) { - if (p == ls.end()) seek(off); - while (len > 0) { - assert(p != ls.end()); - - unsigned howmuch = p->length() - p_off; - if (len < howmuch) howmuch = len; - dest.append(*p, p_off, howmuch); - - len -= howmuch; - advance(howmuch); - } - } - - void copy(unsigned len, std::string &dest) { - if (p == ls.end()) seek(off); - while (len > 0) { - assert(p != ls.end()); - - unsigned howmuch = p->length() - p_off; - if (len < howmuch) howmuch = len; - dest.append(p->c_str() + p_off, howmuch); - - len -= howmuch; - advance(howmuch); - } - } - - // copy data in - - void copy_in(unsigned len, const char *src) { - // copy - if (p == ls.end()) seek(off); - while (len > 0) { - assert(p != ls.end()); - - unsigned howmuch = p->length() - p_off; - if (len < howmuch) howmuch = len; - p->copy_in(p_off, howmuch, src); - - src += howmuch; - len -= howmuch; - advance(howmuch); - } - } - - void copy_in(unsigned len, const list& otherl) { - if (p == ls.end()) seek(off); - unsigned left = len; - for (std::list::const_iterator i = otherl._buffers.begin(); - i != otherl._buffers.end(); - i++) { - unsigned l = (*i).length(); - if (left < l) l = left; - copy_in(l, i->c_str()); - left -= l; - if (left == 0) break; - } - } - - }; - - private: - mutable iterator last_p; - - public: - // cons/des - list() : _len(0), last_p(this) {} - list(const list& other) : _buffers(other._buffers), _len(other._len), last_p(this) { } - list(unsigned l) : _len(0), last_p(this) { - ptr bp(l); - push_back(bp); - } - ~list() {} - - list& operator= (const list& other) { - _buffers = other._buffers; - _len = other._len; - return *this; - } - - const std::list& buffers() const { 
return _buffers; } - - void swap(list& other) { - unsigned t = _len; - _len = other._len; - other._len = t; - _buffers.swap(other._buffers); - append_buffer.swap(other.append_buffer); - } - - unsigned length() const { -#if 0 - // DEBUG: verify _len - unsigned len = 0; - for (std::list::const_iterator it = _buffers.begin(); - it != _buffers.end(); - it++) { - len += (*it).length(); - } - assert(len == _len); -#endif - return _len; - } - - bool is_page_aligned() const { - for (std::list::const_iterator it = _buffers.begin(); - it != _buffers.end(); - it++) - if (!it->is_page_aligned()) return false; - return true; - } - bool is_n_page_sized() const { - return length() % BUFFER_PAGE_SIZE == 0; - } - - // modifiers - void clear() { - _buffers.clear(); - _len = 0; - } - void push_front(ptr& bp) { - _buffers.push_front(bp); - _len += bp.length(); - } - void push_front(raw *r) { - ptr bp(r); - _buffers.push_front(bp); - _len += bp.length(); - } - void push_back(const ptr& bp) { - _buffers.push_back(bp); - _len += bp.length(); - } - void push_back(raw *r) { - ptr bp(r); - _buffers.push_back(bp); - _len += bp.length(); - } - void zero() { - for (std::list::iterator it = _buffers.begin(); - it != _buffers.end(); - it++) - it->zero(); - } - - // sort-of-like-assignment-op - void claim(list& bl) { - // free my buffers - clear(); - claim_append(bl); - } - void claim_append(list& bl) { - // steal the other guy's buffers - _len += bl._len; - _buffers.splice( _buffers.end(), bl._buffers ); - bl._len = 0; - } - - - - - iterator begin() { - return iterator(this, 0); - } - iterator end() { - return iterator(this, _len, _buffers.end(), 0); - } - - - // crope lookalikes. - // **** WARNING: this are horribly inefficient for large bufferlists. 
**** - - // data OUT - - void copy(unsigned off, unsigned len, char *dest) const { - assert(off >= 0); - assert(off + len <= length()); - if (last_p.get_off() != off) - last_p.seek(off); - last_p.copy(len, dest); - } - - void copy(unsigned off, unsigned len, list &dest) const { - assert(off >= 0); - assert(off + len <= length()); - if (last_p.get_off() != off) - last_p.seek(off); - last_p.copy(len, dest); - } - - void copy(unsigned off, unsigned len, std::string& dest) const { - if (last_p.get_off() != off) - last_p.seek(off); - return last_p.copy(len, dest); - } - - void copy_in(unsigned off, unsigned len, const char *src) { - assert(off >= 0); - assert(off + len <= length()); - - if (last_p.get_off() != off) - last_p.seek(off); - last_p.copy_in(len, src); - } - - void copy_in(unsigned off, unsigned len, const list& src) { - if (last_p.get_off() != off) - last_p.seek(off); - last_p.copy_in(len, src); - } - - - void append(const char *data, unsigned len) { - while (len > 0) { - // put what we can into the existing append_buffer. - unsigned gap = append_buffer.unused_tail_length(); - if (gap > 0) { - if (gap > len) gap = len; - //cout << "append first char is " << data[0] << ", last char is " << data[len-1] << std::endl; - append_buffer.append(data, gap); - append(append_buffer, append_buffer.end() - gap, gap); // add segment to the list - len -= gap; - data += gap; - } - if (len == 0) break; // done! - - // make a new append_buffer! - unsigned alen = BUFFER_PAGE_SIZE * (((len-1) / BUFFER_PAGE_SIZE) + 1); - append_buffer = create_page_aligned(alen); - append_buffer.set_length(0); // unused, so far. - } - } - void append(const ptr& bp) { - push_back(bp); - } - void append(const ptr& bp, unsigned off, unsigned len) { - assert(len+off <= bp.length()); - if (!_buffers.empty() && - _buffers.back().get_raw() == bp.get_raw() && - _buffers.back().end() == bp.start() + off) { - // yay contiguous with tail bp! 
- _buffers.back().set_length(_buffers.back().length()+len); - _len += len; - } else { - // add new item to list - ptr tempbp(bp, off, len); - push_back(tempbp); - } - } - void append(const list& bl) { - _len += bl._len; - for (std::list::const_iterator p = bl._buffers.begin(); - p != bl._buffers.end(); - ++p) - _buffers.push_back(*p); - } - - - /* - * get a char - */ - const char& operator[](unsigned n) { - assert(n < _len); - for (std::list::iterator p = _buffers.begin(); - p != _buffers.end(); - p++) { - if (n >= p->length()) { - n -= p->length(); - continue; - } - return (*p)[n]; - } - assert(0); - } - - /* - * return a contiguous ptr to whole bufferlist contents. - */ - char *c_str() { - if (_buffers.size() == 1) { - return _buffers.front().c_str(); // good, we're already contiguous. - } - else if (_buffers.size() == 0) { - return 0; // no buffers - } - else { - ptr newbuf = create(length()); // make one new contiguous buffer. - copy(0, length(), newbuf.c_str()); // copy myself into it. - clear(); - push_back(newbuf); - return newbuf.c_str(); // now it'll work. - } - } - - void substr_of(const list& other, unsigned off, unsigned len) { - assert(off + len <= other.length()); - clear(); - - // skip off - std::list::const_iterator curbuf = other._buffers.begin(); - while (off > 0 && - off >= curbuf->length()) { - // skip this buffer - //cout << "skipping over " << *curbuf << std::endl; - off -= (*curbuf).length(); - curbuf++; - } - assert(len == 0 || curbuf != other._buffers.end()); - - while (len > 0) { - // partial? - if (off + len < curbuf->length()) { - //cout << "copying partial of " << *curbuf << std::endl; - _buffers.push_back( ptr( *curbuf, off, len ) ); - _len += len; - break; - } - - // through end - //cout << "copying end (all?) 
of " << *curbuf << std::endl; - unsigned howmuch = curbuf->length() - off; - _buffers.push_back( ptr( *curbuf, off, howmuch ) ); - _len += howmuch; - len -= howmuch; - off = 0; - curbuf++; - } - } - - - - // funky modifer - void splice(unsigned off, unsigned len, list *claim_by=0 /*, bufferlist& replace_with */) { // fixme? - assert(off < length()); - assert(len > 0); - //cout << "splice off " << off << " len " << len << " ... mylen = " << length() << std::endl; - - // skip off - std::list::iterator curbuf = _buffers.begin(); - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - //cout << "off = " << off << " skipping over " << *curbuf << std::endl; - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! - //cout << "off = " << off << " somewhere in " << *curbuf << std::endl; - break; - } - } - assert(off >= 0); - - if (off) { - // add a reference to the front bit - // insert it before curbuf (which we'll hose) - //cout << "keeping front " << off << " of " << *curbuf << std::endl; - _buffers.insert( curbuf, ptr( *curbuf, 0, off ) ); - _len += off; - } - - while (len > 0) { - // partial? - if (off + len < (*curbuf).length()) { - //cout << "keeping end of " << *curbuf << ", losing first " << off+len << std::endl; - if (claim_by) - claim_by->append( *curbuf, off, len ); - (*curbuf).set_offset( off+len + (*curbuf).offset() ); // ignore beginning big - (*curbuf).set_length( (*curbuf).length() - (len+off) ); - _len -= off+len; - //cout << " now " << *curbuf << std::endl; - break; - } - - // hose though the end - unsigned howmuch = (*curbuf).length() - off; - //cout << "discarding " << howmuch << " of " << *curbuf << std::endl; - if (claim_by) - claim_by->append( *curbuf, off, howmuch ); - _len -= (*curbuf).length(); - _buffers.erase( curbuf++ ); - len -= howmuch; - off = 0; - } - - // splice in *replace (implement me later?) 
- } - - }; - -}; - -typedef buffer::ptr bufferptr; -typedef buffer::list bufferlist; - - -inline bool operator>(bufferlist& l, bufferlist& r) { - for (unsigned p = 0; ; p++) { - if (l.length() > p && r.length() == p) return true; - if (l.length() == p) return false; - if (l[p] > r[p]) return true; - if (l[p] < r[p]) return false; - p++; - } -} -inline bool operator>=(bufferlist& l, bufferlist& r) { - for (unsigned p = 0; ; p++) { - if (l.length() > p && r.length() == p) return true; - if (r.length() == p && l.length() == p) return true; - if (l[p] > r[p]) return true; - if (l[p] < r[p]) return false; - p++; - } -} -inline bool operator<(bufferlist& l, bufferlist& r) { - return r > l; -} -inline bool operator<=(bufferlist& l, bufferlist& r) { - return r >= l; -} - - -inline std::ostream& operator<<(std::ostream& out, const buffer::raw &r) { - return out << "buffer::raw(" << (void*)r.data << " len " << r.len << " nref " << r.nref.test() << ")"; -} - -inline std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp) { - out << "buffer::ptr(" << bp.offset() << "~" << bp.length() - << " " << (void*)bp.c_str() - << " in raw " << (void*)bp.raw_c_str() - << " len " << bp.raw_length() - << " nref " << bp.raw_nref() << ")"; - return out; -} - -inline std::ostream& operator<<(std::ostream& out, const buffer::list& bl) { - out << "buffer::list(len=" << bl.length() << "," << std::endl; - - std::list::const_iterator it = bl.buffers().begin(); - while (it != bl.buffers().end()) { - out << "\t" << *it; - if (++it == bl.buffers().end()) break; - out << "," << std::endl; - } - out << std::endl << ")"; - return out; -} - - - - -// ---------------------------------------------------------- -// encoders - -// DEPRECATED, please use _(en|de)code_(simple|complex) - -// raw -template -inline void _encoderaw(const T& t, bufferlist& bl) -{ - bl.append((char*)&t, sizeof(t)); -} -template -inline void _decoderaw(T& t, bufferlist& bl, int& off) -{ - bl.copy(off, sizeof(t), (char*)&t); 
- off += sizeof(t); -} - -#include -#include -#include -#include -#include -#include - -// list -template -inline void _encode(const std::list& ls, bufferlist& bl) -{ - // should i pre- or post- count? - if (!ls.empty()) { - unsigned pos = bl.length(); - uint32_t n = 0; - _encoderaw(n, bl); - for (typename std::list::const_iterator p = ls.begin(); p != ls.end(); ++p) { - n++; - _encode(*p, bl); - } - bl.copy_in(pos, sizeof(n), (char*)&n); - } else { - uint32_t n = ls.size(); // FIXME: this is slow on a list. - _encoderaw(n, bl); - for (typename std::list::const_iterator p = ls.begin(); p != ls.end(); ++p) - _encode(*p, bl); - } -} -template -inline void _decode(std::list& ls, bufferlist& bl, int& off) -{ - uint32_t n; - _decoderaw(n, bl, off); - ls.clear(); - while (n--) { - T v; - _decode(v, bl, off); - ls.push_back(v); - } -} - -// deque -template -inline void _encode(const std::deque& ls, bufferlist& bl) -{ - uint32_t n = ls.size(); - _encoderaw(n, bl); - for (typename std::deque::const_iterator p = ls.begin(); p != ls.end(); ++p) - _encode(*p, bl); -} -template -inline void _decode(std::deque& ls, bufferlist& bl, int& off) -{ - uint32_t n; - _decoderaw(n, bl, off); - ls.clear(); - while (n--) { - T v; - _decode(v, bl, off); - ls.push_back(v); - } -} - -// set -template -inline void _encode(const std::set& s, bufferlist& bl) -{ - uint32_t n = s.size(); - _encoderaw(n, bl); - for (typename std::set::const_iterator p = s.begin(); p != s.end(); ++p) - _encode(*p, bl); -} -template -inline void _decode(std::set& s, bufferlist& bl, int& off) -{ - uint32_t n; - _decoderaw(n, bl, off); - s.clear(); - while (n--) { - T v; - _decode(v, bl, off); - s.insert(v); - } -} - -// vector -template -inline void _encode(const std::vector& v, bufferlist& bl) -{ - uint32_t n = v.size(); - _encoderaw(n, bl); - for (typename std::vector::const_iterator p = v.begin(); p != v.end(); ++p) - _encode(*p, bl); -} -template -inline void _decode(std::vector& v, bufferlist& bl, int& off) -{ - 
uint32_t n; - _decoderaw(n, bl, off); - v.resize(n); - for (uint32_t i=0; i -inline void _encode(const std::map& m, bufferlist& bl) -{ - uint32_t n = m.size(); - _encoderaw(n, bl); - for (typename std::map::const_iterator p = m.begin(); p != m.end(); ++p) { - _encode(p->first, bl); - _encode(p->second, bl); - } -} -template -inline void _decode(std::map& m, bufferlist& bl, int& off) -{ - uint32_t n; - _decoderaw(n, bl, off); - m.clear(); - while (n--) { - T k; - _decode(k, bl, off); - _decode(m[k], bl, off); - } -} - -// hash_map -template -inline void _encode(const __gnu_cxx::hash_map& m, bufferlist& bl) -{ - uint32_t n = m.size(); - _encoderaw(n, bl); - for (typename __gnu_cxx::hash_map::const_iterator p = m.begin(); p != m.end(); ++p) { - _encode(p->first, bl); - _encode(p->second, bl); - } -} -template -inline void _decode(__gnu_cxx::hash_map& m, bufferlist& bl, int& off) -{ - uint32_t n; - _decoderaw(n, bl, off); - m.clear(); - while (n--) { - T k; - _decode(k, bl, off); - _decode(m[k], bl, off); - } -} - -// string -inline void _encode(const std::string& s, bufferlist& bl) -{ - uint32_t len = s.length(); - _encoderaw(len, bl); - bl.append(s.data(), len); -} -inline void _decode(std::string& s, bufferlist& bl, int& off) -{ - uint32_t len; - _decoderaw(len, bl, off); - s.clear(); - bl.copy(off, len, s); - off += len; -} - -// const char* (encode only, string compatible) -inline void _encode(const char *s, bufferlist& bl) -{ - uint32_t len = strlen(s); - _encoderaw(len, bl); - bl.append(s, len); -} - -// bufferptr (encapsulated) -inline void _encode(const buffer::ptr& bp, bufferlist& bl) -{ - uint32_t len = bp.length(); - _encoderaw(len, bl); - bl.append(bp); -} -inline void _decode(buffer::ptr& bp, bufferlist& bl, int& off) -{ - uint32_t len; - _decoderaw(len, bl, off); - - bufferlist s; - bl.copy(off, len, s); - off += len; - - if (s.buffers().size() == 1) - bp = s.buffers().front(); - else - bp = buffer::copy(s.c_str(), s.length()); -} - -// bufferlist 
(encapsulated) -inline void _encode(const bufferlist& s, bufferlist& bl) -{ - uint32_t len = s.length(); - _encoderaw(len, bl); - bl.append(s); -} -inline void _encode_destructively(bufferlist& s, bufferlist& bl) -{ - uint32_t len = s.length(); - _encoderaw(len, bl); - bl.claim_append(s); -} -inline void _decode(bufferlist& s, bufferlist& bl, int& off) -{ - uint32_t len; - _decoderaw(len, bl, off); - s.clear(); - bl.copy(off, len, s); - off += len; -} - -// base -template -inline void _encode(const T& t, bufferlist& bl) -{ - _encoderaw(t, bl); -} -template -inline void _decode(T& t, bufferlist& bl, int& off) -{ - _decoderaw(t, bl, off); -} - - - -#endif diff --git a/branches/sage/ebofs2/include/ceph_fs.h b/branches/sage/ebofs2/include/ceph_fs.h deleted file mode 100644 index 7be2afdd8e3a2..0000000000000 --- a/branches/sage/ebofs2/include/ceph_fs.h +++ /dev/null @@ -1,179 +0,0 @@ -/* ceph_fs.h - * - * C data types to share between kernel and userspace - */ - -#ifndef _FS_CEPH_CEPH_FS_H -#define _FS_CEPH_CEPH_FS_H - -#include -#include - - -typedef __u64 ceph_ino_t; - - -/** - * object id - */ -struct ceph_object { - ceph_ino_t ino; /* inode "file" identifier */ - __u32 bno; /* "block" (object) in that "file" */ - __u32 rev; /* revision. normally ctime (as epoch). */ -}; -typedef struct ceph_object ceph_object_t; - - -struct ceph_timeval { - __u32 tv_sec; - __u32 tv_usec; -}; - - -/** object layout - * how objects are mapped into PGs - */ -#define CEPH_OBJECT_LAYOUT_HASH 1 -#define CEPH_OBJECT_LAYOUT_LINEAR 2 -#define CEPH_OBJECT_LAYOUT_HASHINO 3 - -/** - * pg layout -- how PGs are mapped into (sets of) OSDs - */ -#define CEPH_PG_LAYOUT_CRUSH 0 -#define CEPH_PG_LAYOUT_HASH 1 -#define CEPH_PG_LAYOUT_LINEAR 2 -#define CEPH_PG_LAYOUT_HYBRID 3 - - -/** - * ceph_file_layout - describe data layout for a file/inode - */ -struct ceph_file_layout { - /* file -> object mapping */ - __u32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple of page size. 
*/ - __u32 fl_stripe_count; /* over this many objects */ - __u32 fl_object_size; /* until objects are this big, then move to new objects */ - - /* pg -> disk layout */ - __u32 fl_object_stripe_unit; /* for per-object raid */ - - /* object -> pg layout */ - __s32 fl_pg_preferred; /* preferred primary for pg */ - __u8 fl_pg_type; /* pg type; see PG_TYPE_* */ - __u8 fl_pg_size; /* pg size (num replicas, raid stripe width, etc. */ -}; - -#define ceph_file_layout_stripe_width(l) (l.fl_stripe_unit * l.fl_stripe_count) - -/* period = bytes before i start on a new set of objects */ -#define ceph_file_layout_period(l) (l.fl_object_size * l.fl_stripe_count) - - - -/** - * placement group id - */ -#define CEPH_PG_TYPE_REP 1 -#define CEPH_PG_TYPE_RAID4 2 - -union ceph_pg { - __u64 pg64; - struct { - __s32 preferred; /* preferred primary osd */ - __u16 ps; /* placement seed */ - __u8 type; - __u8 size; - } pg; -}; -typedef union ceph_pg ceph_pg_t; - -#define ceph_pg_is_rep(pg) (pg.pg.type == CEPH_PG_TYPE_REP) -#define ceph_pg_is_raid4(pg) (pg.pg.type == CEPH_PG_TYPE_RAID4) - -/** - * object layout - * - * describe how a given object should be stored. - */ -struct ceph_object_layout { - ceph_pg_t ol_pgid; - __u32 ol_stripe_unit; -}; - - - -/** - * object extent - */ -struct ceph_object_extent { - ceph_object_t oe_oid; - __u64 oe_start; - __u64 oe_length; - struct ceph_object_layout oe_object_layout; - - /* buffer extent reverse mapping? 
*/ -}; - - - - - -/********************************************* - * message types - */ - -/* - * entity_name - */ -struct ceph_entity_name { - __u32 type; - __u32 num; -}; - -#define CEPH_ENTITY_TYPE_MON 1 -#define CEPH_ENTITY_TYPE_MDS 2 -#define CEPH_ENTITY_TYPE_OSD 3 -#define CEPH_ENTITY_TYPE_CLIENT 4 -#define CEPH_ENTITY_TYPE_ADMIN 5 - -#define CEPH_MSGR_TAG_READY 0 // server -> client + oseq: ready for messages -#define CEPH_MSGR_TAG_REJECT 1 // server -> client + oseq: decline socket -#define CEPH_MSGR_TAG_MSG 2 // message -#define CEPH_MSGR_TAG_ACK 3 // message ack -#define CEPH_MSGR_TAG_CLOSE 4 // closing pipe - - -/* - * entity_addr - * ipv4 only for now - * 16 bytes. - */ -struct ceph_entity_addr { - __u32 erank; /* entity's rank in process */ - __u32 nonce; /* unique id for process (e.g. pid) */ - struct sockaddr_in ipaddr; -}; - -#define ceph_entity_addr_is_local(a,b) \ - ((a).nonce == (b).nonce && \ - (a).ipaddr == (b).ipaddr) - - -struct ceph_entity_inst { - struct ceph_entity_name name; - struct ceph_entity_addr addr; -}; - - -/* - * message header - */ -struct ceph_message_header { - __u32 seq; - __u32 type; - struct ceph_entity_inst src, dst; - __u32 nchunks; -}; - -#endif diff --git a/branches/sage/ebofs2/include/encodable.h b/branches/sage/ebofs2/include/encodable.h deleted file mode 100644 index 321361866ec9b..0000000000000 --- a/branches/sage/ebofs2/include/encodable.h +++ /dev/null @@ -1,424 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __ENCODABLE_H -#define __ENCODABLE_H - -#include "buffer.h" - -#include -#include -#include -#include -#include -#include - - -// ================================================================== -// simple - - -// raw -template -inline void _encode_raw(const T& t, bufferlist& bl) -{ - bl.append((char*)&t, sizeof(t)); -} -template -inline void _decode_raw(T& t, bufferlist::iterator &p) -{ - p.copy(sizeof(t), (char*)&t); -} - -#include -#include -#include -#include -#include -#include - -// list -template -inline void _encode_simple(const std::list& ls, bufferlist& bl) -{ - // should i pre- or post- count? - if (!ls.empty()) { - unsigned pos = bl.length(); - uint32_t n = 0; - _encode_raw(n, bl); - for (typename std::list::const_iterator p = ls.begin(); p != ls.end(); ++p) { - n++; - _encode_simple(*p, bl); - } - bl.copy_in(pos, sizeof(n), (char*)&n); - } else { - uint32_t n = ls.size(); // FIXME: this is slow on a list. - _encode_raw(n, bl); - for (typename std::list::const_iterator p = ls.begin(); p != ls.end(); ++p) - _encode_simple(*p, bl); - } -} -template -inline void _decode_simple(std::list& ls, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - ls.clear(); - while (n--) { - T v; - _decode_simple(v, p); - ls.push_back(v); - } -} - -// deque -template -inline void _encode_simple(const std::deque& ls, bufferlist& bl) -{ - uint32_t n = ls.size(); - _encode_raw(n, bl); - for (typename std::deque::const_iterator p = ls.begin(); p != ls.end(); ++p) - _encode_simple(*p, bl); -} -template -inline void _decode_simple(std::deque& ls, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - ls.clear(); - while (n--) { - T v; - _decode_simple(v, p); - ls.push_back(v); - } -} - -// set -template -inline void _encode_simple(const std::set& s, bufferlist& bl) -{ - uint32_t n = s.size(); - _encode_raw(n, bl); - for (typename std::set::const_iterator p = s.begin(); p != s.end(); ++p) - _encode_simple(*p, bl); -} -template -inline 
void _decode_simple(std::set& s, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - s.clear(); - while (n--) { - T v; - _decode_simple(v, p); - s.insert(v); - } -} - -// vector -template -inline void _encode_simple(const std::vector& v, bufferlist& bl) -{ - uint32_t n = v.size(); - _encode_raw(n, bl); - for (typename std::vector::const_iterator p = v.begin(); p != v.end(); ++p) - _encode_simple(*p, bl); -} -template -inline void _decode_simple(std::vector& v, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - v.resize(n); - for (uint32_t i=0; i -inline void _encode_simple(const std::map& m, bufferlist& bl) -{ - uint32_t n = m.size(); - _encode_raw(n, bl); - for (typename std::map::const_iterator p = m.begin(); p != m.end(); ++p) { - _encode_simple(p->first, bl); - _encode_simple(p->second, bl); - } -} -template -inline void _decode_simple(std::map& m, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - m.clear(); - while (n--) { - T k; - _decode_simple(k, p); - _decode_simple(m[k], p); - } -} - -// hash_map -template -inline void _encode_simple(const __gnu_cxx::hash_map& m, bufferlist& bl) -{ - uint32_t n = m.size(); - _encode_raw(n, bl); - for (typename __gnu_cxx::hash_map::const_iterator p = m.begin(); p != m.end(); ++p) { - _encode_simple(p->first, bl); - _encode_simple(p->second, bl); - } -} -template -inline void _decode_simple(__gnu_cxx::hash_map& m, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - m.clear(); - while (n--) { - T k; - _decode_simple(k, p); - _decode_simple(m[k], p); - } -} - -// string -inline void _encode_simple(const std::string& s, bufferlist& bl) -{ - uint32_t len = s.length(); - _encode_raw(len, bl); - bl.append(s.data(), len); -} -inline void _decode_simple(std::string& s, bufferlist::iterator& p) -{ - uint32_t len; - _decode_raw(len, p); - s.clear(); - p.copy(len, s); -} - -// const char* (encode only, string compatible) -inline void _encode_simple(const char *s, bufferlist& bl) 
-{ - uint32_t len = strlen(s); - _encode_raw(len, bl); - bl.append(s, len); -} - -// bufferptr (encapsulated) -inline void _encode_simple(const buffer::ptr& bp, bufferlist& bl) -{ - uint32_t len = bp.length(); - _encode_raw(len, bl); - bl.append(bp); -} -inline void _decode_simple(buffer::ptr& bp, bufferlist::iterator& p) -{ - uint32_t len; - _decode_raw(len, p); - - bufferlist s; - p.copy(len, s); - - if (s.buffers().size() == 1) - bp = s.buffers().front(); - else - bp = buffer::copy(s.c_str(), s.length()); -} - -// bufferlist (encapsulated) -inline void _encode_simple(const bufferlist& s, bufferlist& bl) -{ - uint32_t len = s.length(); - _encode_raw(len, bl); - bl.append(s); -} -inline void _encode_simple_destructively(bufferlist& s, bufferlist& bl) -{ - uint32_t len = s.length(); - _encode_raw(len, bl); - bl.claim_append(s); -} -inline void _decode_simple(bufferlist& s, bufferlist::iterator& p) -{ - uint32_t len; - _decode_raw(len, p); - s.clear(); - p.copy(len, s); -} - -// base -template -inline void _encode_simple(const T& t, bufferlist& bl) -{ - _encode_raw(t, bl); -} -template -inline void _decode_simple(T& t, bufferlist::iterator& p) -{ - _decode_raw(t, p); -} - - - - -// ================================================================== -// complex - -// list -template -inline void _encode_complex(const std::list& ls, bufferlist& bl) -{ - uint32_t n = ls.size(); - _encode_raw(n, bl); - for (typename std::list::const_iterator p = ls.begin(); p != ls.end(); ++p) - _encode_complex(*p, bl); -} -template -inline void _decode_complex(std::list& ls, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - ls.clear(); - while (n--) { - T v; - _decode_complex(v, p); - ls.push_back(v); - } -} - -// deque -template -inline void _encode_complex(const std::deque& ls, bufferlist& bl) -{ - uint32_t n = ls.size(); - _encode_raw(n, bl); - for (typename std::deque::const_iterator p = ls.begin(); p != ls.end(); ++p) - _encode_complex(*p, bl); -} -template -inline 
void _decode_complex(std::deque& ls, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - ls.clear(); - while (n--) { - T v; - _decode_complex(v, p); - ls.push_back(v); - } -} - -// set -template -inline void _encode_complex(const std::set& s, bufferlist& bl) -{ - uint32_t n = s.size(); - _encode_raw(n, bl); - for (typename std::set::const_iterator p = s.begin(); p != s.end(); ++p) - _encode_complex(*p, bl); -} -template -inline void _decode_complex(std::set& s, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - s.clear(); - while (n--) { - T v; - _decode_complex(v, p); - s.insert(v); - } -} - -// vector -template -inline void _encode_complex(const std::vector& v, bufferlist& bl) -{ - uint32_t n = v.size(); - _encode_raw(n, bl); - for (typename std::vector::const_iterator p = v.begin(); p != v.end(); ++p) - _encode_complex(*p, bl); -} -template -inline void _decode_complex(std::vector& v, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - v.resize(n); - for (uint32_t i=0; i -inline void _encode_complex(const std::map& m, bufferlist& bl) -{ - uint32_t n = m.size(); - _encode_raw(n, bl); - for (typename std::map::const_iterator p = m.begin(); p != m.end(); ++p) { - _encode_simple(p->first, bl); - _encode_complex(p->second, bl); - } -} -template -inline void _decode_complex(std::map& m, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - m.clear(); - while (n--) { - T k; - _decode_simple(k, p); - _decode_complex(m[k], p); - } -} - -// hash_map -template -inline void _encode_complex(const __gnu_cxx::hash_map& m, bufferlist& bl) -{ - uint32_t n = m.size(); - _encode_raw(n, bl); - for (typename __gnu_cxx::hash_map::const_iterator p = m.begin(); p != m.end(); ++p) { - _encode_simple(p->first, bl); - _encode_complex(p->second, bl); - } -} -template -inline void _decode_complex(__gnu_cxx::hash_map& m, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - m.clear(); - while (n--) { - T k; - _decode_simple(k, p); 
- _decode_complex(m[k], p); - } -} - -// base case -template -inline void _encode_complex(const T& t, bufferlist& bl) -{ - t._encode(bl); -} -template -inline void _decode_complex(T& t, bufferlist::iterator& p) -{ - t._decode(p); -} - -#endif diff --git a/branches/sage/ebofs2/include/error.h b/branches/sage/ebofs2/include/error.h deleted file mode 100644 index a548d9756b9b8..0000000000000 --- a/branches/sage/ebofs2/include/error.h +++ /dev/null @@ -1,41 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define SYSERROR() syserror("At %s:%d", __FILE__, __LINE__) - -#define ASSERT(c) \ - ((c) || (exiterror("Assertion failed at %s:%d", __FILE__, __LINE__), 1)) - -/* print usage error message and exit */ -extern void userror(const char *use, const char *fmt, ...); - -/* print system error message and exit */ -extern void syserror(const char *fmt, ...); - -/* print error message and exit */ -extern void exiterror(const char *fmt, ...); - -/* print error message */ -extern void error(const char *fmt, ...); - -#ifdef __cplusplus -} // extern "C" -#endif diff --git a/branches/sage/ebofs2/include/filepath.h b/branches/sage/ebofs2/include/filepath.h deleted file mode 100644 index 4425e1d7c5b3a..0000000000000 --- a/branches/sage/ebofs2/include/filepath.h +++ /dev/null @@ -1,184 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it 
under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __FILEPATH_H -#define __FILEPATH_H - - -/* - * BUG: /a/b/c is equivalent to a/b/c in dentry-breakdown, but not string. - * -> should it be different? how? should this[0] be "", with depth 4? - * - */ - - -#include -#include -#include -using namespace std; - -#include "buffer.h" - - -class filepath { - /** path - * can be relative "a/b/c" or absolute "/a/b/c". - */ - string path; - - /** bits - path segemtns - * this is ['a', 'b', 'c'] for both the aboslute and relative case. - * - * NOTE: this value is LAZILY maintained... i.e. it's a cache - */ - mutable vector bits; - - void rebuild_path() { - if (absolute()) - path = "/"; - else - path.clear(); - for (unsigned i=0; i 0) parse_bits(); - return bits.size(); - } - bool empty() const { - return path.length() == 0; - } - - // FIXME: const-edness - bool absolute() { return path.length() && path[0] == '/'; } - bool relative() { return !absolute(); } - - const string& operator[](int i) const { - if (bits.empty() && path.length() > 0) parse_bits(); - return bits[i]; - } - - const string& last_dentry() const { - if (bits.empty() && path.length() > 0) parse_bits(); - return bits[ bits.size()-1 ]; - } - - filepath prefixpath(int s) const { - filepath t; - for (int i=0; i 0) parse_bits(); - bits.pop_back(); - rebuild_path(); - } - void push_dentry(const string& s) { - if (bits.empty() && path.length() > 0) parse_bits(); - bits.push_back(s); - if (path.length() && path[path.length()-1] != '/') - path += "/"; - path += s; - } - void append(const filepath& a) { - for (unsigned i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __FRAG_H -#define __FRAG_H - -#include -#include -#include -#include -#include "buffer.h" -#include "encodable.h" - -/* - * - * the goal here is to use a binary split strategy to partition a namespace. - * frag_t represents a particular fragment. bits() tells you the size of the - * fragment, and value() it's name. this is roughly analogous to an ip address - * and netmask. - * - * fragtree_t represents an entire namespace and it's partition. it essentially - * tells you where fragments are split into other fragments, and by how much - * (i.e. by how many bits, resulting in a power of 2 number of child fragments). - * - * this vaguely resembles a btree, in that when a fragment becomes large or small - * we can split or merge, except that there is no guarantee of being balanced. - * - * presumably we are partitioning the output of a (perhaps specialized) hash - * function. - */ - -/** - * frag_t - * - * description of an individual fragment. that is, a particular piece - * of the overall namespace. - * - * this is conceptually analogous to an ip address and netmask. - * - * a value v falls "within" fragment f iff (v & f.mask()) == f.value(). - * - * we write it as v/b, where v is a value and b is the number of bits. - * 0/0 (bits==0) corresponds to the entire namespace. if we bisect that, - * we get 0/1 and 1/1. quartering gives us 0/2, 1/2, 2/2, 3/2. and so on. - * - * this makes the right most bit of v the "most significant", which is the - * opposite of what we usually see. - */ - -/* - * TODO: - * - get_first_child(), next_sibling(int parent_bits) to make (possibly partial) - * iteration efficient (see, e.g., try_assimilate_children() - * - rework frag_t so that we mask the left-most (most significant) bits instead of - * the right-most (least significant) bits. just because it's more intutive, and - * matches the network/netmask concept. - */ - -typedef uint32_t _frag_t; - -class frag_t { - /* encoded value. 
- * 8 upper bits = "bits" - * 24 lower bits = "value" - */ - _frag_t _enc; - - public: - frag_t() : _enc(0) { } - frag_t(unsigned v, unsigned b) : _enc((b << 24) + - (v & (0xffffffffULL >> (32-b)))) { } - frag_t(_frag_t e) : _enc(e) { } - - // constructors - void from_unsigned(unsigned e) { _enc = e; } - - // accessors - unsigned value() const { return _enc & 0xffffff; } - unsigned bits() const { return _enc >> 24; } - unsigned mask() const { return 0xffffffffULL >> (32-bits()); } - - operator _frag_t() const { return _enc; } - - // tests - bool contains(unsigned v) const { - return (v & mask()) == value(); - } - bool contains(frag_t sub) const { - return (sub.bits() >= bits() && // they at least as specific as us, - (sub.value() & mask()) == value()); // and they are contained by us. - } - bool is_root() const { - return bits() == 0; - } - frag_t parent() const { - assert(bits() > 0); - return frag_t(value() & (mask() >> 1), bits()-1); - } - - // splitting - void split(int nb, std::list& fragments) const { - assert(nb > 0); - unsigned nway = 1 << nb; - for (unsigned i=0; i 0 && - (value() & (1 << (bits()-1)) == 0); - } - bool is_right() const { - return - bits() > 0 && - (value() & (1 << (bits()-1)) == 1); - } - frag_t left_child() const { - return frag_t(value(), bits()+1); - } - frag_t right_child() const { - return frag_t(value() | (1<: - // frag_t f is split by b bits. - // if child frag_t does not appear, it is not split. 
- std::map _splits; - - public: - // ------------- - // basics - void swap(fragtree_t& other) { - _splits.swap(other._splits); - } - - // ------------- - // accessors - bool empty() { - return _splits.empty(); - } - int get_split(const frag_t hb) const { - std::map::const_iterator p = _splits.find(hb); - if (p == _splits.end()) - return 0; - else - return p->second; - } - - - bool is_leaf(frag_t x) const { - std::list ls; - get_leaves_under(x, ls); - //cout << "is_leaf(" << x << ") -> " << ls << std::endl; - if (!ls.empty() && - ls.front() == x && - ls.size() == 1) - return true; - return false; - } - - /** - * get_leaves -- list all leaves - */ - void get_leaves(std::list& ls) const { - return get_leaves_under_split(frag_t(), ls); - } - - /** - * get_leaves_under_split -- list all leaves under a known split point (or root) - */ - void get_leaves_under_split(frag_t under, std::list& ls) const { - std::list q; - q.push_back(under); - while (!q.empty()) { - frag_t t = q.front(); - q.pop_front(); - int nb = get_split(t); - if (nb) - t.split(nb, q); // queue up children - else - ls.push_back(t); // not spit, it's a leaf. - } - } - - /** - * get_branch -- get branch point at OR above frag @x - * - may be @x itself, if @x is a split - * - may be root (frag_t()) - */ - frag_t get_branch(frag_t x) const { - while (1) { - if (x == frag_t()) return x; // root - if (get_split(x)) return x; // found it! - x = x.parent(); - } - } - - /** - * get_branch_above -- get a branch point above frag @x - * - may be root (frag_t()) - * - may NOT be @x, even if @x is a split. - */ - frag_t get_branch_above(frag_t x) const { - while (1) { - if (x == frag_t()) return x; // root - x = x.parent(); - if (get_split(x)) return x; // found it! 
- } - } - - - /** - * get_branch_or_leaf -- get branch or leaf point parent for frag @x - * - may be @x itself, if @x is a split or leaf - * - may be root (frag_t()) - */ - frag_t get_branch_or_leaf(frag_t x) const { - frag_t branch = get_branch(x); - int nb = get_split(branch); - if (nb > 0 && // if branch is a split, and - branch.bits() + nb <= x.bits()) // one of the children is or contains x - return frag_t(x.value(), branch.bits()+nb); // then return that child (it's a leaf) - else - return branch; - } - - /** - * get_leaves_under(x, ls) -- search for any leaves fully contained by x - */ - void get_leaves_under(frag_t x, std::list& ls) const { - std::list q; - q.push_back(get_branch(x)); - while (!q.empty()) { - frag_t t = q.front(); - q.pop_front(); - if (t.bits() >= x.bits() && // if t is more specific than x, and - !x.contains(t)) // x does not contain t, - continue; // then skip - int nb = get_split(t); - if (nb) - t.split(nb, q); // queue up children - else - ls.push_back(t); // not spit, it's a leaf. - } - } - - /** - * contains(fg) -- does fragtree contain the specific frag @x - */ - bool contains(frag_t x) const { - std::list q; - q.push_back(get_branch(x)); - while (!q.empty()) { - frag_t t = q.front(); - q.pop_front(); - if (t.bits() >= x.bits() && // if t is more specific than x, and - !x.contains(t)) // x does not contain t, - continue; // then skip - int nb = get_split(t); - if (nb) { - if (t == x) return false; // it's split. - t.split(nb, q); // queue up children - } else { - if (t == x) return true; // it's there. - } - } - return false; - } - - /** - * operator[] -- map a (hash?) value to a frag - */ - frag_t operator[](unsigned v) const { - frag_t t; - while (1) { - assert(t.contains(v)); - int nb = get_split(t); - - // is this a leaf? - if (nb == 0) return t; // done. - - // pick appropriate child fragment. 
- unsigned nway = 1 << nb; - unsigned i; - for (i=0; i children; - x.split(nb, children); - int childbits = 0; - for (std::list::iterator p = children.begin(); - p != children.end(); - ++p) { - int cb = get_split(*p); - if (!cb) return; // nope. - if (childbits && cb != childbits) return; // not the same - childbits = cb; - } - // all children are split with childbits! - for (std::list::iterator p = children.begin(); - p != children.end(); - ++p) - _splits.erase(*p); - _splits[x] += childbits; - } - - bool force_to_leaf(frag_t x) { - if (is_leaf(x)) - return false; - - cout << "force_to_leaf " << x << " on " << _splits << std::endl; - - frag_t parent = get_branch_or_leaf(x); - assert(parent.bits() <= x.bits()); - cout << "parent is " << parent << std::endl; - - // do we need to split from parent to x? - if (parent.bits() < x.bits()) { - int spread = x.bits() - parent.bits(); - int nb = get_split(parent); - cout << "spread " << spread << ", parent splits by " << nb << std::endl; - if (nb == 0) { - // easy: split parent (a leaf) by the difference - cout << "splitting parent " << parent << " by spread " << spread << std::endl; - split(parent, spread); - assert(is_leaf(x)); - return true; - } - assert(nb > spread); - - // add an intermediary split - merge(parent, nb); - split(parent, spread); - - std::list subs; - parent.split(spread, subs); - for (std::list::iterator p = subs.begin(); - p != subs.end(); - ++p) { - cout << "splitting intermediate " << *p << " by " << (nb-spread) << std::endl; - split(*p, nb - spread); - } - } - - // x is now a leaf or split. - // hoover up any children. 
- std::list q; - q.push_back(x); - while (!q.empty()) { - frag_t t = q.front(); - q.pop_front(); - int nb = get_split(t); - if (nb) { - cout << "merging child " << t << " by " << nb << std::endl; - merge(t, nb); // merge this point, and - t.split(nb, q); // queue up children - } - } - - cout << "force_to_leaf done" << std::endl; - assert(is_leaf(x)); - return true; - } - - // verify that we describe a legal partition of the namespace. - void verify() const { - std::map copy; - std::list q; - q.push_back(frag_t()); - - while (1) { - frag_t cur = q.front(); - q.pop_front(); - int b = get_split(cur); - if (!b) continue; - copy[cur] = b; - cur.split(b, q); - } - - assert(copy == _splits); - } - - // encoding - void _encode(bufferlist& bl) const { - ::_encode(_splits, bl); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(_splits, bl, off); - } - void _decode(bufferlist::iterator& p) { - ::_decode_simple(_splits, p); - } - - void print(std::ostream& out) { - out << "fragtree_t("; - std::list q; - q.push_back(frag_t()); - while (!q.empty()) { - frag_t t = q.front(); - q.pop_front(); - // newline + indent? 
- if (t.bits()) { - out << std::endl; - for (unsigned i=0; i q; - q.push_back(frag_t()); - while (!q.empty()) { - frag_t t = q.front(); - q.pop_front(); - int nb = ft.get_split(t); - if (nb) { - if (t.bits()) out << ' '; - out << t << '%' << nb; - t.split(nb, q); // queue up children - } - } - } - if (1) { - std::list leaves; - ft.get_leaves(leaves); - out << leaves; - } - return out << ")"; -} - - -/** - * fragset_t -- a set of fragments - */ -class fragset_t { - std::set _set; - -public: - std::set &get() { return _set; } - std::set::iterator begin() { return _set.begin(); } - std::set::iterator end() { return _set.end(); } - - bool empty() const { return _set.empty(); } - - bool contains(frag_t f) const { - while (1) { - if (_set.count(f)) return true; - if (f.bits() == 0) return false; - f = f.parent(); - } - } - - void insert(frag_t f) { - _set.insert(f); - simplify(); - } - - void simplify() { - while (1) { - bool clean = true; - std::set::iterator p = _set.begin(); - while (p != _set.end()) { - if (!p->is_root() && - _set.count(p->get_sibling())) { - _set.erase(p->get_sibling()); - _set.insert(p->parent()); - _set.erase(p++); - clean = false; - } else { - p++; - } - } - if (clean) - break; - } - } -}; - -inline std::ostream& operator<<(std::ostream& out, fragset_t& fs) -{ - return out << "fragset_t(" << fs.get() << ")"; -} - -#endif diff --git a/branches/sage/ebofs2/include/hash.h b/branches/sage/ebofs2/include/hash.h deleted file mode 100644 index 0c27d3535174f..0000000000000 --- a/branches/sage/ebofs2/include/hash.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef __CEPHHASH_H -#define __CEPHHASH_H - -// Robert Jenkins' function for mixing 32-bit values -// http://burtleburtle.net/bob/hash/evahash.html -// a, b = random bits, c = input and output - -#define hashmix(a,b,c) \ - a=a-b; a=a-c; a=a^(c>>13); \ - b=b-c; b=b-a; b=b^(a<<8); \ - c=c-a; c=c-b; c=c^(b>>13); \ - a=a-b; a=a-c; a=a^(c>>12); \ - b=b-c; b=b-a; b=b^(a<<16); \ - c=c-a; c=c-b; c=c^(b>>5); \ - a=a-b; 
a=a-c; a=a^(c>>3); \ - b=b-c; b=b-a; b=b^(a<<10); \ - c=c-a; c=c-b; c=c^(b>>15); - - -//namespace ceph { - -template struct rjhash { }; - -inline uint64_t rjhash64(uint64_t key) { - key = (~key) + (key << 21); // key = (key << 21) - key - 1; - key = key ^ (key >> 24); - key = (key + (key << 3)) + (key << 8); // key * 265 - key = key ^ (key >> 14); - key = (key + (key << 2)) + (key << 4); // key * 21 - key = key ^ (key >> 28); - key = key + (key << 31); - return key; -} - -inline uint32_t rjhash32(uint32_t a) { - a = (a+0x7ed55d16) + (a<<12); - a = (a^0xc761c23c) ^ (a>>19); - a = (a+0x165667b1) + (a<<5); - a = (a+0xd3a2646c) ^ (a<<9); - a = (a+0xfd7046c5) + (a<<3); - a = (a^0xb55a4f09) ^ (a>>16); - return a; -} - - -template<> struct rjhash { - inline size_t operator()(const uint32_t x) const { -#ifdef __LP64__ - return rjhash64(x); -#else - return rjhash32(x); -#endif - } -}; - -template<> struct rjhash { - inline size_t operator()(const uint64_t x) const { -#ifdef __LP64__ - return rjhash64(x); -#else - return rjhash32(x) ^ rjhash32(x >> 32); -#endif - } -}; - -//} - - - -#endif diff --git a/branches/sage/ebofs2/include/interval_set.h b/branches/sage/ebofs2/include/interval_set.h deleted file mode 100644 index bc5edbc29441d..0000000000000 --- a/branches/sage/ebofs2/include/interval_set.h +++ /dev/null @@ -1,315 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __INTERVAL_SET_H -#define __INTERVAL_SET_H - -#include -#include -#include -using namespace std; - -#ifndef MIN -# define MIN(a,b) ((a)<=(b) ? (a):(b)) -#endif -#ifndef MAX -# define MAX(a,b) ((a)>=(b) ? 
(a):(b)) -#endif - - -template -class interval_set { - public: - map m; // map start -> len - int _size; - - // helpers - private: - typename map::const_iterator find_inc(T start) const { - typename map::const_iterator p = m.lower_bound(start); // p->first >= start - if (p != m.begin() && - (p == m.end() || p->first > start)) { - p--; // might overlap? - if (p->first + p->second <= start) - p++; // it doesn't. - } - return p; - } - - typename map::iterator find_inc_m(T start) { - typename map::iterator p = m.lower_bound(start); - if (p != m.begin() && - (p == m.end() || p->first > start)) { - p--; // might overlap? - if (p->first + p->second <= start) - p++; // it doesn't. - } - return p; - } - - typename map::const_iterator find_adj(T start) const { - typename map::const_iterator p = m.lower_bound(start); - if (p != m.begin() && - (p == m.end() || p->first > start)) { - p--; // might touch? - if (p->first + p->second < start) - p++; // it doesn't. - } - return p; - } - - typename map::iterator find_adj_m(T start) { - typename map::iterator p = m.lower_bound(start); - if (p != m.begin() && - (p == m.end() || p->first > start)) { - p--; // might touch? - if (p->first + p->second < start) - p++; // it doesn't. 
- } - return p; - } - - public: - bool operator==(const interval_set& other) const { - return m == other.m; - } - - int size() { - return _size; - } - - void clear() { - m.clear(); - _size = 0; - } - - bool contains(T i) const { - typename map::const_iterator p = find_inc(i); - if (p == m.end()) return false; - if (p->first > i) return false; - if (p->first+p->second <= i) return false; - assert(p->first <= i && p->first+p->second > i); - return true; - } - bool contains(T start, T len) const { - typename map::const_iterator p = find_inc(start); - if (p == m.end()) return false; - if (p->first > start) return false; - if (p->first+p->second <= start) return false; - assert(p->first <= start && p->first+p->second > start); - if (p->first+p->second < start+len) return false; - return true; - } - bool intersects(T start, T len) const { - interval_set a; - a.insert(start, len); - interval_set i; - i.intersection_of( *this, a ); - if (i.empty()) return false; - return true; - } - - // outer range of set - bool empty() const { - return m.empty(); - } - T start() const { - assert(!empty()); - typename map::const_iterator p = m.begin(); - return p->first; - } - T end() const { - assert(!empty()); - typename map::const_iterator p = m.end(); - p--; - return p->first+p->second; - } - - // interval start after p (where p not in set) - bool starts_after(T i) const { - assert(!contains(i)); - typename map::const_iterator p = find_inc(i); - if (p == m.end()) return false; - return true; - } - T start_after(T i) const { - assert(!contains(i)); - typename map::const_iterator p = find_inc(i); - return p->first; - } - - // interval end that contains start - T end_after(T start) const { - assert(contains(start)); - typename map::const_iterator p = find_inc(start); - return p->first+p->second; - } - - void insert(T val) { - insert(val, 1); - } - - void insert(T start, T len) { - //cout << "insert " << start << "~" << len << endl; - assert(len > 0); - _size += len; - typename 
map::iterator p = find_adj_m(start); - if (p == m.end()) { - m[start] = len; // new interval - } else { - if (p->first < start) { - - if (p->first + p->second != start) { - //cout << "p is " << p->first << "~" << p->second << ", start is " << start << ", len is " << len << endl; - assert(0); - } - - assert(p->first + p->second == start); - p->second += len; // append to end - - typename map::iterator n = p; - n++; - if (n != m.end() && - start+len == n->first) { // combine with next, too! - p->second += n->second; - m.erase(n); - } - } else { - if (start+len == p->first) { - m[start] = len + p->second; // append to front - m.erase(p); - } else { - assert(p->first > start+len); - m[start] = len; // new interval - } - } - } - } - - void erase(T val) { - erase(val, 1); - } - - void erase(T start, T len) { - typename map::iterator p = find_inc_m(start); - - _size -= len; - - assert(p != m.end()); - assert(p->first <= start); - - T before = start - p->first; - assert(p->second >= before+len); - T after = p->second - before - len; - - if (before) - p->second = before; // shorten bit before - else - m.erase(p); - if (after) - m[start+len] = after; - } - - - void subtract(const interval_set &a) { - for (typename map::const_iterator p = a.m.begin(); - p != a.m.end(); - p++) - erase(p->first, p->second); - } - - void insert(const interval_set &a) { - for (typename map::const_iterator p = a.m.begin(); - p != a.m.end(); - p++) - insert(p->first, p->second); - } - - - void intersection_of(const interval_set &a, const interval_set &b) { - assert(&a != this); - assert(&b != this); - clear(); - - typename map::const_iterator pa = a.m.begin(); - typename map::const_iterator pb = b.m.begin(); - - while (pa != a.m.end() && pb != b.m.end()) { - // passing? 
- if (pa->first + pa->second <= pb->first) - { pa++; continue; } - if (pb->first + pb->second <= pa->first) - { pb++; continue; } - T start = MAX(pa->first, pb->first); - T end = MIN(pa->first+pa->second, pb->first+pb->second); - assert(end > start); - insert(start, end-start); - if (pa->first+pa->second > pb->first+pb->second) - pb++; - else - pa++; - } - } - - void union_of(const interval_set &a, const interval_set &b) { - assert(&a != this); - assert(&b != this); - clear(); - - //cout << "union_of" << endl; - - // a - m = a.m; - - // - (a*b) - interval_set ab; - ab.intersection_of(a, b); - subtract(ab); - - // + b - insert(b); - return; - } - void union_of(const interval_set &b) { - interval_set a; - a.m.swap(m); - union_of(a, b); - } - - bool subset_of(const interval_set &big) const { - for (typename map::const_iterator i = m.begin(); - i != m.end(); - i++) - if (!big.contains(i->first, i->second)) return false; - return true; - } - -}; - -template -inline ostream& operator<<(ostream& out, const interval_set &s) { - out << "["; - for (typename map::const_iterator i = s.m.begin(); - i != s.m.end(); - i++) { - if (i != s.m.begin()) out << ","; - out << i->first << "~" << i->second; - } - out << "]"; - return out; -} - - -#endif diff --git a/branches/sage/ebofs2/include/lru.h b/branches/sage/ebofs2/include/lru.h deleted file mode 100644 index 40dce1aa191ab..0000000000000 --- a/branches/sage/ebofs2/include/lru.h +++ /dev/null @@ -1,341 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __LRU_H -#define __LRU_H - -#include -#include - -#include "config.h" - - - -class LRUObject { - private: - LRUObject *lru_next, *lru_prev; - bool lru_pinned; - class LRU *lru; - class LRUList *lru_list; - - public: - LRUObject() { - lru_next = lru_prev = NULL; - lru_list = 0; - lru_pinned = false; - lru = 0; - } - - // pin/unpin item in cache - void lru_pin(); - void lru_unpin(); - bool lru_is_expireable() { return !lru_pinned; } - - friend class LRU; - friend class LRUList; -}; - - -class LRUList { - private: - LRUObject *head, *tail; - uint32_t len; - - public: - LRUList() { - head = tail = 0; - len = 0; - } - - uint32_t get_length() { return len; } - - LRUObject *get_head() { - return head; - } - LRUObject *get_tail() { - return tail; - } - - void clear() { - while (len > 0) { - remove(get_head()); - } - } - - void insert_head(LRUObject *o) { - o->lru_next = head; - o->lru_prev = NULL; - if (head) { - head->lru_prev = o; - } else { - tail = o; - } - head = o; - o->lru_list = this; - len++; - } - void insert_tail(LRUObject *o) { - o->lru_next = NULL; - o->lru_prev = tail; - if (tail) { - tail->lru_next = o; - } else { - head = o; - } - tail = o; - o->lru_list = this; - len++; - } - - void remove(LRUObject *o) { - assert(o->lru_list == this); - if (o->lru_next) - o->lru_next->lru_prev = o->lru_prev; - else - tail = o->lru_prev; - if (o->lru_prev) - o->lru_prev->lru_next = o->lru_next; - else - head = o->lru_next; - o->lru_next = o->lru_prev = NULL; - o->lru_list = 0; - assert(len>0); - len--; - } - -}; - - -class LRU { - protected: - LRUList lru_top, lru_bot, lru_pintail; - uint32_t lru_num, lru_num_pinned; - uint32_t lru_max; // max items - double lru_midpoint; - - friend class LRUObject; - //friend class MDCache; // hack - - public: - LRU(int max = 0) { - lru_num = 0; - lru_num_pinned = 0; - lru_midpoint = .6; - lru_max = max; - } - - uint32_t lru_get_size() { return lru_num; } - uint32_t lru_get_top() { return lru_top.get_length(); } - 
uint32_t lru_get_bot() { return lru_bot.get_length(); } - uint32_t lru_get_pintail() { return lru_pintail.get_length(); } - uint32_t lru_get_max() { return lru_max; } - uint32_t lru_get_num_pinned() { return lru_num_pinned; } - - void lru_set_max(uint32_t m) { lru_max = m; } - void lru_set_midpoint(float f) { lru_midpoint = f; } - - void lru_clear() { - lru_top.clear(); - lru_bot.clear(); - lru_pintail.clear(); - } - - // insert at top of lru - void lru_insert_top(LRUObject *o) { - //assert(!o->lru_in_lru); - //o->lru_in_lru = true; - assert(!o->lru); - o->lru = this; - lru_top.insert_head( o ); - lru_num++; - if (o->lru_pinned) lru_num_pinned++; - lru_adjust(); - } - - // insert at mid point in lru - void lru_insert_mid(LRUObject *o) { - //assert(!o->lru_in_lru); - //o->lru_in_lru = true; - assert(!o->lru); - o->lru = this; - lru_bot.insert_head(o); - lru_num++; - if (o->lru_pinned) lru_num_pinned++; - } - - // insert at bottom of lru - void lru_insert_bot(LRUObject *o) { - assert(!o->lru); - o->lru = this; - lru_bot.insert_tail(o); - lru_num++; - if (o->lru_pinned) lru_num_pinned++; - } - - /* - // insert at bottom of lru - void lru_insert_pintail(LRUObject *o) { - assert(!o->lru); - o->lru = this; - - assert(o->lru_pinned); - - lru_pintail.insert_head(o); - lru_num++; - lru_num_pinned += o->lru_pinned; - } - */ - - - - - // adjust top/bot balance, as necessary - void lru_adjust() { - if (!lru_max) return; - - unsigned toplen = lru_top.get_length(); - unsigned topwant = (unsigned)(lru_midpoint * ((double)lru_max - lru_num_pinned)); - while (toplen > 0 && - toplen > topwant) { - // remove from tail of top, stick at head of bot - // FIXME: this could be way more efficient by moving a whole chain of items. 
- - LRUObject *o = lru_top.get_tail(); - lru_top.remove(o); - lru_bot.insert_head(o); - toplen--; - } - } - - - // remove an item - LRUObject *lru_remove(LRUObject *o) { - // not in list - //assert(o->lru_in_lru); - //if (!o->lru_in_lru) return o; // might have expired and been removed that way. - if (!o->lru) return o; - - - if (o->lru_list == &lru_top) - lru_top.remove(o); - else if (o->lru_list == &lru_bot) - lru_bot.remove(o); - else if (o->lru_list == &lru_pintail) - lru_pintail.remove(o); - else - assert(0); - - lru_num--; - if (o->lru_pinned) lru_num_pinned--; - o->lru = 0; - return o; - } - - // touch item -- move to head of lru - bool lru_touch(LRUObject *o) { - lru_remove(o); - lru_insert_top(o); - return true; - } - - // touch item -- move to midpoint (unless already higher) - bool lru_midtouch(LRUObject *o) { - if (o->lru_list == &lru_top) return false; - - lru_remove(o); - lru_insert_mid(o); - return true; - } - - // touch item -- move to bottom - bool lru_bottouch(LRUObject *o) { - lru_remove(o); - lru_insert_bot(o); - return true; - } - - void lru_touch_entire_pintail() { - // promote entire pintail to the top lru - while (lru_pintail.get_length() > 0) { - LRUObject *o = lru_pintail.get_head(); - lru_pintail.remove(o); - lru_top.insert_tail(o); - } - } - - - // expire -- expire a single item - LRUObject *lru_get_next_expire() { - LRUObject *p; - - // look through tail of bot - while (lru_bot.get_length()) { - p = lru_bot.get_tail(); - if (!p->lru_pinned) return p; - - // move to pintail - lru_bot.remove(p); - lru_pintail.insert_head(p); - } - - // ok, try head then - while (lru_top.get_length()) { - p = lru_top.get_tail(); - if (!p->lru_pinned) return p; - - // move to pintail - lru_top.remove(p); - lru_pintail.insert_head(p); - } - - // no luck! 
- return NULL; - } - - LRUObject *lru_expire() { - LRUObject *p = lru_get_next_expire(); - if (p) - return lru_remove(p); - return NULL; - } - - - void lru_status() { - generic_dout(10) << "lru: " << lru_num << " items, " << lru_top.get_length() << " top, " << lru_bot.get_length() << " bot, " << lru_pintail.get_length() << " pintail" << dendl; - } - -}; - - -inline void LRUObject::lru_pin() -{ - lru_pinned = true; - if (lru) lru->lru_num_pinned++; -} -inline void LRUObject::lru_unpin() { - lru_pinned = false; - if (lru) { - lru->lru_num_pinned--; - - // move from pintail -> bot - if (lru_list == &lru->lru_pintail) { - lru->lru_pintail.remove(this); - lru->lru_bot.insert_tail(this); - } - } -} - -#endif diff --git a/branches/sage/ebofs2/include/object.h b/branches/sage/ebofs2/include/object.h deleted file mode 100644 index 3b8ac05a86b38..0000000000000 --- a/branches/sage/ebofs2/include/object.h +++ /dev/null @@ -1,99 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __OBJECT_H -#define __OBJECT_H - -#include - -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "hash.h" - -typedef uint32_t objectrev_t; - -struct object_t { - static const uint32_t MAXREV = 0xffffffffU; - - uint64_t ino; // "file" identifier - uint32_t bno; // "block" in that "file" - objectrev_t rev; // revision. normally ctime (as epoch). 
- - object_t() : ino(0), bno(0), rev(0) {} - object_t(uint64_t i, uint32_t b) : ino(i), bno(b), rev(0) {} - object_t(uint64_t i, uint32_t b, uint32_t r) : ino(i), bno(b), rev(r) {} -}; - - -inline bool operator==(const object_t l, const object_t r) { - return (l.ino == r.ino) && (l.bno == r.bno) && (l.rev == r.rev); -} -inline bool operator!=(const object_t l, const object_t r) { - return (l.ino != r.ino) || (l.bno != r.bno) || (l.rev != r.rev); -} -inline bool operator>(const object_t l, const object_t r) { - if (l.ino > r.ino) return true; - if (l.ino < r.ino) return false; - if (l.bno > r.bno) return true; - if (l.bno < r.bno) return false; - if (l.rev > r.rev) return true; - return false; -} -inline bool operator<(const object_t l, const object_t r) { - if (l.ino < r.ino) return true; - if (l.ino > r.ino) return false; - if (l.bno < r.bno) return true; - if (l.bno > r.bno) return false; - if (l.rev < r.rev) return true; - return false; -} -inline bool operator>=(const object_t l, const object_t r) { - return !(l < r); -} -inline bool operator<=(const object_t l, const object_t r) { - return !(l > r); -} -inline ostream& operator<<(ostream& out, const object_t o) { - out << hex << o.ino << '.'; - out.setf(ios::right); - out.fill('0'); - out << setw(8) << o.bno << dec; - out.unsetf(ios::right); - if (o.rev) - out << '.' 
<< o.rev; - return out; -} - - - - - -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const object_t &r) const { - static rjhash H; - static rjhash I; - //static hash H; - //static hash I; - return H(r.ino) ^ I(r.bno) ^ I(r.rev); - } - }; - -} -#endif diff --git a/branches/sage/ebofs2/include/rangeset.h b/branches/sage/ebofs2/include/rangeset.h deleted file mode 100644 index 547ea3ab72274..0000000000000 --- a/branches/sage/ebofs2/include/rangeset.h +++ /dev/null @@ -1,253 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __RANGESET_H -#define __RANGESET_H - -/* - * - * my first container with iterator! it's pretty ugly. - * - */ - -#include -#include -#include -using namespace std; - -//typedef int T; - -template -struct _rangeset_base { - map ranges; // pair(first,last) (inclusive, e.g. [first,last]) - - typedef typename map::iterator mapit; - - // get iterator for range including val. or ranges.end(). 
- mapit get_range_for(T val) { - mapit it = ranges.lower_bound(val); - if (it == ranges.end()) { - // search backwards - typename map::reverse_iterator it = ranges.rbegin(); - if (it == ranges.rend()) return ranges.end(); - if (it->first <= val && it->second >= val) - return ranges.find(it->first); - return ranges.end(); - } else { - if (it->first == val) return - it--; - if (it->first <= val && it->second >= val) - return it; - return ranges.end(); - } - } - -}; - - -template -class rangeset_iterator : - public std::iterator -{ - //typedef typename map::iterator mapit; - - map ranges; - typename map::iterator it; - T current; - -public: - // cons - rangeset_iterator() {} - - rangeset_iterator(typename map::iterator& it, map& ranges) { - this->ranges = ranges; - this->it = it; - if (this->it != ranges.end()) - current = it->first; - } - - bool operator==(rangeset_iterator rit) { - return (it == rit.it && rit.current == current); - } - bool operator!=(rangeset_iterator rit) { - return (it != rit.it) || (rit.current != current); - } - - T& operator*() { - return current; - } - - rangeset_iterator operator++(int) { - if (current < it->second) - current++; - else { - it++; - if (it != ranges.end()) - current = it->first; - } - - return *this; - } -}; - - -template -class rangeset -{ - typedef typename map::iterator map_iterator; - - _rangeset_base theset; - inodeno_t _size; - -public: - rangeset() { _size = 0; } - typedef rangeset_iterator iterator; - - iterator begin() { - map_iterator it = theset.ranges.begin(); - return iterator(it, theset.ranges); - } - - iterator end() { - map_iterator it = theset.ranges.end(); - return iterator(it, theset.ranges); - } - - map_iterator map_begin() { - return theset.ranges.begin(); - } - map_iterator map_end() { - return theset.ranges.end(); - } - int map_size() { - return theset.ranges.size(); - } - - void map_insert(T v1, T v2) { - theset.ranges.insert(pair(v1,v2)); - _size += v2 - v1+1; - } - - - // ... 
- bool contains(T val) { - if (theset.get_range_for(val) == theset.ranges.end()) return false; - assert(!empty()); - return true; - } - - void insert(T val) { - assert(!contains(val)); - - map_iterator left = theset.get_range_for(val-1); - map_iterator right = theset.get_range_for(val+1); - - if (left != theset.ranges.end() && - right != theset.ranges.end()) { - // join! - left->second = right->second; - theset.ranges.erase(right); - _size++; - return; - } - - if (left != theset.ranges.end()) { - // add to left range - left->second = val; - _size++; - return; - } - - if (right != theset.ranges.end()) { - // add to right range - theset.ranges.insert(pair(val, right->second)); - theset.ranges.erase(val+1); - _size++; - return; - } - - // new range - theset.ranges.insert(pair(val,val)); - _size++; - return; - } - - unsigned size() { - return size(); - } - - bool empty() { - if (theset.ranges.empty()) { - assert(_size == 0); - return true; - } - assert(_size>0); - return false; - } - - - T first() { - assert(!empty()); - map_iterator it = theset.ranges.begin(); - return it->first; - } - - void erase(T val) { - assert(contains(val)); - map_iterator it = theset.get_range_for(val); - assert(it != theset.ranges.end()); - - // entire range - if (val == it->first && val == it->second) { - theset.ranges.erase(it); - _size--; - return; - } - - // beginning - if (val == it->first) { - theset.ranges.insert(pair(val+1, it->second)); - theset.ranges.erase(it); - _size--; - return; - } - - // end - if (val == it->second) { - it->second = val-1; - _size--; - return; - } - - // middle split - theset.ranges.insert(pair(it->first, val-1)); - theset.ranges.insert(pair(val+1, it->second)); - theset.ranges.erase(it); - _size--; - return; - } - - void dump() { - for (typename map::iterator it = theset.ranges.begin(); - it != theset.ranges.end(); - it++) { - cout << " " << it->first << "-" << it->second << endl; - } - } - -}; - - -#endif diff --git a/branches/sage/ebofs2/include/statlite.h 
b/branches/sage/ebofs2/include/statlite.h deleted file mode 100644 index a9c0433e4a4e8..0000000000000 --- a/branches/sage/ebofs2/include/statlite.h +++ /dev/null @@ -1,72 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -#ifndef _STATLITE_H -#define _STATLITE_H - -extern "C" { - -#include -#include -#include -#include -#include - -struct statlite { - dev_t st_dev; /* device */ - ino_t st_ino; /* inode */ - mode_t st_mode; /* protection */ - nlink_t st_nlink; /* number of hard links */ - uid_t st_uid; /* user ID of owner */ - gid_t st_gid; /* group ID of owner */ - dev_t st_rdev; /* device type (if inode device)*/ - unsigned long st_litemask; /* bit mask for optional fields */ - /***************************************************************/ - /**** Remaining fields are optional according to st_litemask ***/ - off_t st_size; /* total size, in bytes */ - blksize_t st_blksize; /* blocksize for filesystem I/O */ - blkcnt_t st_blocks; /* number of blocks allocated */ - struct timespec st_atim; /* Time of last access. */ - struct timespec st_mtim; /* Time of last modification. */ - struct timespec st_ctim; /* Time of last status change. 
*/ - //time_t st_atime; /* time of last access */ - //time_t st_mtime; /* time of last modification */ - //time_t st_ctime; /* time of last change */ -}; - -#define S_STATLITE_SIZE 1 -#define S_STATLITE_BLKSIZE 2 -#define S_STATLITE_BLOCKS 4 -#define S_STATLITE_ATIME 8 -#define S_STATLITE_MTIME 16 -#define S_STATLITE_CTIME 32 - -#define S_REQUIRESIZE(m) (m | S_STATLITE_SIZE) -#define S_REQUIREBLKSIZE(m) (m | S_STATLITE_BLKSIZE) -#define S_REQUIREBLOCKS(m) (m | S_STATLITE_BLOCKS) -#define S_REQUIREATIME(m) (m | S_STATLITE_ATIME) -#define S_REQUIREMTIME(m) (m | S_STATLITE_MTIME) -#define S_REQUIRECTIME(m) (m | S_STATLITE_CTIME) - -#define S_ISVALIDSIZE(m) (m & S_STATLITE_SIZE) -#define S_ISVALIDBLKSIZE(m) (m & S_STATLITE_BLKSIZE) -#define S_ISVALIDBLOCKS(m) (m & S_STATLITE_BLOCKS) -#define S_ISVALIDATIME(m) (m & S_STATLITE_ATIME) -#define S_ISVALIDMTIME(m) (m & S_STATLITE_MTIME) -#define S_ISVALIDCTIME(m) (m & S_STATLITE_CTIME) - - -// readdirplus etc. - -struct dirent_plus { - struct dirent d_dirent; /* dirent struct for this entry */ - struct stat d_stat; /* attributes for this entry */ - int d_stat_err;/* errno for d_stat, or 0 */ -}; -struct dirent_lite { - struct dirent d_dirent; /* dirent struct for this entry */ - struct statlite d_stat; /* attributes for this entry */ - int d_stat_err;/* errno for d_stat, or 0 */ -}; - -} -#endif diff --git a/branches/sage/ebofs2/include/triple.h b/branches/sage/ebofs2/include/triple.h deleted file mode 100644 index e9f43b9315d21..0000000000000 --- a/branches/sage/ebofs2/include/triple.h +++ /dev/null @@ -1,28 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __TRIPLE_H -#define __TRIPLE_H - -template -class triple { - public: - A first; - B second; - C third; - triple() {} - triple(A f, B s, C t) : first(f), second(s), third(t) {} -}; - -#endif diff --git a/branches/sage/ebofs2/include/types.h b/branches/sage/ebofs2/include/types.h deleted file mode 100644 index cf8374d329a77..0000000000000 --- a/branches/sage/ebofs2/include/types.h +++ /dev/null @@ -1,303 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_TYPES_H -#define __MDS_TYPES_H - -extern "C" { -#include -#include -#include -#include -#include "statlite.h" -} - -#include -#include -#include -#include -#include -#include -#include - -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "ceph_fs.h" - - -#include "object.h" -#include "utime.h" - - -#ifndef MIN -# define MIN(a,b) ((a) < (b) ? (a):(b)) -#endif -#ifndef MAX -# define MAX(a,b) ((a) > (b) ? 
(a):(b)) -#endif - - -// -- stl crap -- - -namespace __gnu_cxx { - template<> struct hash< std::string > - { - size_t operator()( const std::string& x ) const - { - static hash H; - return H(x.c_str()); - } - }; - -#ifndef __LP64__ - template<> struct hash { - size_t operator()(int64_t __x) const { - static hash H; - return H((__x >> 32) ^ (__x & 0xffffffff)); - } - }; - template<> struct hash { - size_t operator()(uint64_t __x) const { - static hash H; - return H((__x >> 32) ^ (__x & 0xffffffff)); - } - }; -#endif - -} - - -/* - * comparators for stl containers - */ -// for hash_map: -// hash_map, eqstr> vals; -struct eqstr -{ - bool operator()(const char* s1, const char* s2) const - { - return strcmp(s1, s2) == 0; - } -}; - -// for set, map -struct ltstr -{ - bool operator()(const char* s1, const char* s2) const - { - return strcmp(s1, s2) < 0; - } -}; - - - -// ---------------------- -// some basic types - -typedef uint64_t tid_t; // transaction id -typedef uint64_t version_t; -typedef uint32_t epoch_t; // map epoch (32bits -> 13 epochs/second for 10 years) - - - -#define O_LAZY 01000000 - - - -typedef ceph_file_layout FileLayout; - - -// -------------------------------------- -// inode - -typedef __uint64_t _inodeno_t; - -struct inodeno_t { - _inodeno_t val; - inodeno_t() : val(0) {} - inodeno_t(_inodeno_t v) : val(v) {} - inodeno_t operator+=(inodeno_t o) { val += o.val; return *this; } - operator _inodeno_t() const { return val; } -}; - -inline ostream& operator<<(ostream& out, inodeno_t ino) { - return out << hex << ino.val << dec; -} - -namespace __gnu_cxx { - template<> struct hash< inodeno_t > - { - size_t operator()( const inodeno_t& x ) const - { - static rjhash H; - return H(x.val); - } - }; -} - - -#define INODE_MODE_FILE 0100000 // S_IFREG -#define INODE_MODE_SYMLINK 0120000 // S_IFLNK -#define INODE_MODE_DIR 0040000 // S_IFDIR -#define INODE_TYPE_MASK 0170000 - -#define FILE_MODE_R 1 -#define FILE_MODE_W 2 -#define FILE_MODE_RW (1|2) -#define 
FILE_MODE_LAZY 4 - -/** stat masks - */ -#define STAT_MASK_INO 1 // inode nmber -#define STAT_MASK_TYPE 2 // file type bits of the mode -#define STAT_MASK_BASE 4 // layout, symlink value -#define STAT_MASK_AUTH 8 // uid, gid, mode -#define STAT_MASK_LINK 16 // nlink, anchored -#define STAT_MASK_FILE 32 // mtime, size. - -#define STAT_MASK_ALL 63 - -#define STAT_MASK_SIZE STAT_MASK_FILE // size, blksize, blocks -#define STAT_MASK_MTIME STAT_MASK_FILE // mtime -#define STAT_MASK_ATIME STAT_MASK_FILE // atime -#define STAT_MASK_CTIME (STAT_MASK_FILE|STAT_MASK_AUTH|STAT_MASK_LINK) // ctime - -inline int DT_TO_MODE(int dt) { - return dt << 12; -} - -struct inode_t { - // base (immutable) - inodeno_t ino; - FileLayout layout; // ?immutable? - uint32_t rdev; // if special file - - // affected by any inode change... - utime_t ctime; // inode change time - - // perm (namespace permissions) - uint32_t mode; - uid_t uid; - gid_t gid; - - // nlink - int32_t nlink; - bool anchored; // auth only? - - // file (data access) - int64_t size, max_size, allocated_size; - utime_t mtime; // file data modify time. - utime_t atime; // file data access time. 
- - // special stuff - version_t version; // auth only - version_t file_data_version; // auth only - - // file type - bool is_symlink() { return (mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK; } - bool is_dir() { return (mode & INODE_TYPE_MASK) == INODE_MODE_DIR; } - bool is_file() { return (mode & INODE_TYPE_MASK) == INODE_MODE_FILE; } - - // corresponding d_types - static const unsigned char DT_REG = 8; - static const unsigned char DT_DIR = 4; - static const unsigned char DT_LNK = 10; -}; - -inline unsigned char MODE_TO_DT(int mode) { - return mode >> 12; -} - - - - - - -// dentries -#define MAX_DENTRY_LEN 255 - - - - -// -- io helpers -- - -template -inline ostream& operator<<(ostream& out, pair v) { - return out << v.first << "," << v.second; -} - -template -inline ostream& operator<<(ostream& out, vector& v) { - out << "["; - for (unsigned i=0; i -inline ostream& operator<<(ostream& out, const list& ilist) { - for (typename list::const_iterator it = ilist.begin(); - it != ilist.end(); - it++) { - if (it != ilist.begin()) out << ","; - out << *it; - } - return out; -} - -template -inline ostream& operator<<(ostream& out, const set& iset) { - for (typename set::const_iterator it = iset.begin(); - it != iset.end(); - it++) { - if (it != iset.begin()) out << ","; - out << *it; - } - return out; -} - -template -inline ostream& operator<<(ostream& out, const multiset& iset) { - for (typename multiset::const_iterator it = iset.begin(); - it != iset.end(); - it++) { - if (it != iset.begin()) out << ","; - out << *it; - } - return out; -} - -template -inline ostream& operator<<(ostream& out, const map& m) -{ - out << "{"; - for (typename map::const_iterator it = m.begin(); - it != m.end(); - it++) { - if (it != m.begin()) out << ","; - out << it->first << "=" << it->second; - } - out << "}"; - return out; -} - - - -#endif diff --git a/branches/sage/ebofs2/include/uofs.h b/branches/sage/ebofs2/include/uofs.h deleted file mode 100644 index a4673aaa616ea..0000000000000 
--- a/branches/sage/ebofs2/include/uofs.h +++ /dev/null @@ -1,51 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -/* - * uofs.h - * - * user-level object-based file system - */ - - #ifndef _UOFS_H_ - #define _UOFS_H_ - - #include - #include - #include - - - int device_open(char *path, int xflags); - void device_findsizes(int fd, long long *sz, int *bsz); - - int uofs_format(int bdev_id, int donode_size, int bd_ratio, int reg_size, int sb_size, int lb_size, - int nr_hash_table_buckets, int delay_allocation, int flush_interval); - - int uofs_mount(int bdev_id); - void uofs_shutdown(void); - - int uofs_read(long long oid, void *buf, off_t offset, size_t count); - int uofs_write(long long oid, void *buf, off_t offset, size_t count); - int uofs_del(long long oid); - int uofs_sync(long long oid); - int uofs_exist(long long oid); - - int uofs_get_size(long long oid); - - void uofs_superblock_printout(void); - int get_large_object_pages(void); - - int uofs_buffer_size(void); - #endif diff --git a/branches/sage/ebofs2/include/utime.h b/branches/sage/ebofs2/include/utime.h deleted file mode 100644 index 7fef5a7f930d2..0000000000000 --- a/branches/sage/ebofs2/include/utime.h +++ /dev/null @@ -1,149 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. 
See file COPYING. - * - */ - -#ifndef __UTIME_H -#define __UTIME_H - -#include -#include -#include - -// -------- -// utime_t - -typedef struct timeval _utime_t; - -class utime_t { - private: - struct timeval tv; - - struct timeval& timeval() { return tv; } - friend class Clock; - - - public: - void normalize() { - if (tv.tv_usec > 1000*1000) { - tv.tv_sec += tv.tv_usec / (1000*1000); - tv.tv_usec %= 1000*1000; - } - } - - // cons - utime_t() { tv.tv_sec = 0; tv.tv_usec = 0; normalize(); } - //utime_t(time_t s) { tv.tv_sec = s; tv.tv_usec = 0; } - utime_t(time_t s, int u) { tv.tv_sec = s; tv.tv_usec = u; normalize(); } - utime_t(const _utime_t &v) : tv(v) {} - /* - utime_t(double d) { - tv.tv_sec = (time_t)trunc(d); - tv.tv_usec = (__suseconds_t)((d - tv.tv_sec) / (double)1000000.0); - } - */ - - // accessors - time_t sec() const { return tv.tv_sec; } - long usec() const { return tv.tv_usec; } - int nsec() const { return tv.tv_usec*1000; } - - // ref accessors/modifiers - time_t& sec_ref() { return tv.tv_sec; } - // FIXME: tv.tv_usec is a __darwin_suseconds_t on Darwin. - // is just casting it to long& OK? 
- long& usec_ref() { return (long&) tv.tv_usec; } - - struct timeval& tv_ref() { return tv; } - - // cast to double - operator double() { - return (double)sec() + ((double)usec() / 1000000.0L); - } -}; - -// arithmetic operators -inline utime_t operator+(const utime_t& l, const utime_t& r) { - return utime_t( l.sec() + r.sec() + (l.usec()+r.usec())/1000000L, - (l.usec()+r.usec())%1000000L ); -} -inline utime_t& operator+=(utime_t& l, const utime_t& r) { - l.sec_ref() += r.sec() + (l.usec()+r.usec())/1000000L; - l.usec_ref() += r.usec(); - l.usec_ref() %= 1000000L; - return l; -} -inline utime_t& operator+=(utime_t& l, double f) { - double fs = trunc(f); - double us = (f - fs) * (double)1000000.0; - l.sec_ref() += (long)fs; - l.usec_ref() += (long)us; - l.normalize(); - return l; -} - -inline utime_t operator-(const utime_t& l, const utime_t& r) { - return utime_t( l.sec() - r.sec() - (l.usec()= r.usec()) - l.usec_ref() -= r.usec(); - else { - l.usec_ref() += 1000000L - r.usec(); - l.sec_ref()--; - } - return l; -} -inline utime_t& operator-=(utime_t& l, double f) { - l += -f; - return l; -} - -inline bool operator>(const utime_t& a, const utime_t& b) -{ - return (a.sec() > b.sec()) || (a.sec() == b.sec() && a.usec() > b.usec()); -} -inline bool operator<(const utime_t& a, const utime_t& b) -{ - return (a.sec() < b.sec()) || (a.sec() == b.sec() && a.usec() < b.usec()); -} - -// ostream -inline std::ostream& operator<<(std::ostream& out, const utime_t& t) -{ - out.setf(std::ios::right); - out.fill('0'); - if (t.sec() < ((time_t)(60*60*24*365*10))) { - // raw seconds. this looks like a relative time. - out << (long)t.sec(); - } else { - // localtime. this looks like an absolute time. - struct tm bdt; - time_t tt = t.sec(); - localtime_r(&tt, &bdt); - out << std::setw(2) << (bdt.tm_year-100) // 2007 -> '07' - << std::setw(2) << (bdt.tm_mon+1) - << std::setw(2) << bdt.tm_mday - << "." 
- << std::setw(2) << bdt.tm_hour - << std::setw(2) << bdt.tm_min - << std::setw(2) << bdt.tm_sec; - } - out << "."; - out << std::setw(6) << t.usec(); - out.unsetf(std::ios::right); - return out; -} - -#endif diff --git a/branches/sage/ebofs2/include/xlist.h b/branches/sage/ebofs2/include/xlist.h deleted file mode 100644 index 2ea2cbec6c815..0000000000000 --- a/branches/sage/ebofs2/include/xlist.h +++ /dev/null @@ -1,123 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __XLIST_H -#define __XLIST_H - -template -class xlist { -public: - struct item { - T _item; - item *_prev, *_next; - xlist *_head; - - item(T i) : _item(i), _prev(0), _next(0), _head(0) {} - ~item() { - remove_myself(); - } - - xlist* get_xlist() { return _head; } - void remove_myself() { - if (_head) - _head->remove(this); - assert(_head == 0); - } - }; - -private: - item *_front, *_back; - int _size; - -public: - xlist() : _front(0), _back(0), _size(0) {} - ~xlist() { - assert(_size == 0); - assert(_front == 0); - assert(_back == 0); - } - - int size() { return _size; } - bool empty() { - assert((bool)_front == (bool)_size); - return _front == 0; - } - - void clear() { - while (_front) remove(_front); - } - - void push_back(item *item) { - if (item->_head) - item->_head->remove(item); - - item->_head = this; - item->_next = 0; - item->_prev = _back; - if (_back) - _back->_next = item; - else - _front = item; - _back = item; - _size++; - } - void remove(item *item) { - assert(item->_head == this); - - if (item->_prev) - item->_prev->_next = item->_next; - else - _front = item->_next; - if (item->_next) - 
item->_next->_prev = item->_prev; - else - _back = item->_prev; - _size--; - - item->_head = 0; - item->_next = item->_prev = 0; - } - - T front() { return (T)_front->_item; } - T back() { return (T)_back->_item; } - - void pop_front() { - assert(!empty()); - remove(_front); - } - void pop_back() { - assert(!empty()); - remove(_back); - } - - class iterator { - private: - item *cur; - public: - iterator(item *i = 0) : cur(i) {} - T operator*() { return (T)cur->_item; } - iterator& operator++() { - assert(cur); - cur = cur->_next; - return *this; - } - bool end() { return cur == 0; } - }; - - iterator begin() { return iterator(_front); } - iterator end() { return iterator(NULL); } -}; - - -#endif diff --git a/branches/sage/ebofs2/jobs/alc.tp b/branches/sage/ebofs2/jobs/alc.tp deleted file mode 100644 index c600850c54be0..0000000000000 --- a/branches/sage/ebofs2/jobs/alc.tp +++ /dev/null @@ -1,38 +0,0 @@ -#PSUB -s /bin/bash # Sets your shell in batch -#PSUB -c alc # Where to run the job - -#PSUB -eo # Send std error & std out to the same file - -#PSUB -ln $NUM # Number of nodes to use -#PSUB -g $NUM # Total Number of tasks to use -#PSUB -cpn 1 # cpus per node - -####PSUB -c 1024Mb # memory limit -#PSUB -lc 1500 # Core file size per process -#PSUB -nr # Do not automatically resubmit job -#PSUB -tM 20m # Select time limit. The default time limit - # is only 30 minutes! Time can be HH:MM:SS or HH:MM - -#PSUB -o $CWD/$OUT # filename for output - -# Put your commands here. Remember to 'cd' to the appropriate -# directory, because the job will initially be in your home directory. -# To run a parallel job, you need to use the srun. 
- - - -echo job $PSUB_JOBID nodes $NUM name $NAME - -# environment -cd $CWD -export LD_LIBRARY_PATH=/usr/lib/mpi/mpi_gnu/lib - -# create fakestore dirs -srun -l -N $NUM -ppbatch bash -c "test -d tmp/osddata || mkdir tmp/osddata || echo cant make osddata ; uptime" - -# go -srun -l -N $NUM -ppbatch $CMD && touch $DONE - -# clean up fakestore -srun -l -N $NUM -ppbatch bash -c 'uptime ; rm -r tmp/osddata/*' - diff --git a/branches/sage/ebofs2/jobs/alcdat/makedirs b/branches/sage/ebofs2/jobs/alcdat/makedirs deleted file mode 100644 index af5a098a254c9..0000000000000 --- a/branches/sage/ebofs2/jobs/alcdat/makedirs +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192], - 'nummds' => [1, 2, 8, 16, 32, 48, 64, 80, 96, 112, 128],#144, 160, 192, 208], - - 'cper' => [15,20], - '_dep' => [ 'cnode' => '$nummds',# / 4 + 1', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds > 1 ? $nummds:2', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'mds_bal_rep' => 10000, # none of that! - 'mds_decay_halflife' => 30, - - 'mds_bal_interval' => 45, - 'mds_bal_max' => [2], - - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 100, - 'end' => 300, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 4, - - # --meta_log_layout_scount 32 --meta_log_layout_ssize 256 - # --osd_pg_layout linear - 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/ebofs2/jobs/alcdat/makedirs.big b/branches/sage/ebofs2/jobs/alcdat/makedirs.big deleted file mode 100644 index c67b2b93dd742..0000000000000 --- a/branches/sage/ebofs2/jobs/alcdat/makedirs.big +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192], - 'nummds' => [160, 200],#[1, 2, 8, 16, 32, 48, 64, 80, 96, 112, 128],#144, 160, 192, 208], - - 'cper' => [15,20], - '_dep' => [ 'cnode' => '40',#$nummds',# / 4 + 1', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds * .8', - 'n' => '415'],#1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'mds_bal_rep' => 10000, # none of that! - 'mds_decay_halflife' => 30, - - 'mds_bal_interval' => 45, - 'mds_bal_max' => 2, - - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 100, - 'end' => 300, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 4, - - # --meta_log_layout_scount 32 --meta_log_layout_ssize 256 - # --osd_pg_layout linear - 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/ebofs2/jobs/alcdat/makedirs.tput b/branches/sage/ebofs2/jobs/alcdat/makedirs.tput deleted file mode 100644 index 8dd5ae4c47d8c..0000000000000 --- a/branches/sage/ebofs2/jobs/alcdat/makedirs.tput +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192], - 'nummds' => [4, 16, 64],#[1, 16, 64, 128],#144, 160, 192, 208], - - #'cper' => [2, 5, 7, 10, 13, 16, 20, 30, 40, 50, 100, 150], - 'cper' => [13, 30, 40], # just for final run... - '_dep' => [ 'cnode' => '$nummds',# / 4 + 1', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'mds_bal_rep' => 10000, # none of that! - 'mds_decay_halflife' => 30, - - 'mds_bal_interval' => 45, - 'mds_bal_max' => 2, - - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 100, - 'end' => 300, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 4, - - # --meta_log_layout_scount 32 --meta_log_layout_ssize 256 - # --osd_pg_layout linear - 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - 'comb' => { - 'x' => 'cper',#nummds', - 'vars' => [ 'mds.req', 'cl.lat' ] - } -}; diff --git a/branches/sage/ebofs2/jobs/alcdat/makefiles.shared b/branches/sage/ebofs2/jobs/alcdat/makefiles.shared deleted file mode 100644 index ab96702c73289..0000000000000 --- a/branches/sage/ebofs2/jobs/alcdat/makefiles.shared +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => [1, 2, 4, 8, 16, 32, 64, 96, 128], #2, 4, 8, 16, 32, 48, 64, 80, 96], - - 'cper' => [25, 50, 100, 150],# 100, 150, 200], - - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - - 'mds_bal_hash_wr' => 1000, - - 'until' => 180, # --syn until $n ... when to stop clients - 'kill_after' => 250, - 'start' => 30, - 'end' => 180, - - 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60 --syn makefiles 100000 1000 0', - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req', 'cl.lat' ] - } -}; diff --git a/branches/sage/ebofs2/jobs/alcdat/openshared b/branches/sage/ebofs2/jobs/alcdat/openshared deleted file mode 100644 index 5ed7ba95894b3..0000000000000 --- a/branches/sage/ebofs2/jobs/alcdat/openshared +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => [1, 4, 16, 64, 128, 192 ], - - 'cper' => [10, 50, 100, 150], - '_dep' => [ 'cnode' => '$nummds',# > 30 ? 
30:$nummds', - 'numclient' => '$nummds*$cper', - 'numosd' => '$nummds > 30 ? 30:$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - - 'mds_bal_interval' => 10000, - 'mds_bal_hash_wr' => 1000, - - 'until' => 120, # --syn until $n ... when to stop clients - 'kill_after' => 180, - 'start' => 10, - 'end' => 120, - - 'custom' => '--tcp_skip_rank0 --debug_mds_balancer 10 --mds_shutdown_check 60 --syn only 0 --syn createshared 10 --syn sleep 5 --syn openshared 10 10000', - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req', 'cl.lat' ] - } -}; diff --git a/branches/sage/ebofs2/jobs/alcdat/ossh.include b/branches/sage/ebofs2/jobs/alcdat/ossh.include deleted file mode 100644 index c9a368ba5c60f..0000000000000 --- a/branches/sage/ebofs2/jobs/alcdat/ossh.include +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 10, - - #'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - #'nummds' => [1, 2, 4, 6, 7], # googoo - 'nummds' => [ 2, 4, 8, 16, 32, 48, 64, 80, 96, 128 ], - - #'trace' => ['make.lib', 'make.include'], - - 'mds_bal_interval' => 45, - 'mds_bal_max' => 2,#6, #[ 2,4,6 ], - 'mds_decay_halflife' => 30, - 'mds_bal_rep' => 1500, - 'mds_bal_hash_rd' => 100000, - - 'cper' => [15, 20],#25, 50, 100], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ], - #'cper' => 125, #[30, 50, 75, 100, 125, 150], #50, #[10,50,100],# [ 50, 75, 100 ], - - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - 'custom' => '--tcp_skip_rank0 --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.include 1 --syn sleep 30 --syn trace traces/openssh/make.include 1000', - - # parameters - 'fs' => 'ebofs', - - #'until' => 500, - #'kill_after' => 600, - #'start' => 200, - #'end' => 500, - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 200, - 'end' => 300, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/ebofs2/jobs/alcdat/ossh.include.big b/branches/sage/ebofs2/jobs/alcdat/ossh.include.big deleted file mode 100644 index b92895a53a763..0000000000000 --- a/branches/sage/ebofs2/jobs/alcdat/ossh.include.big +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 10, - - #'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - #'nummds' => [1, 2, 4, 6, 7], # googoo - #'nummds' => [ 2, 4, 8, 16, 32, 48, 64, 80, 96, 128 ], - 'nummds' => [160,200], - - #'trace' => ['make.lib', 'make.include'], - - 'mds_bal_interval' => 45, - 'mds_bal_max' => 2,#6, #[ 2,4,6 ], - 'mds_decay_halflife' => 30, - 'mds_bal_rep' => 1500, - 'mds_bal_hash_rd' => 100000, - - 'cper' => [25, 50], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ], - #'cper' => 125, #[30, 50, 75, 100, 125, 150], #50, #[10,50,100],# [ 50, 75, 100 ], - - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds * .6', - 'n' => '415'],#1 + $cnode + $nummds + $numosd' ], - - 'custom' => '--tcp_skip_rank0 --tcp_overlay_clients --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.include 1 --syn sleep 30 --syn trace traces/openssh/make.include 1000', - - # parameters - 'fs' => 'ebofs', - - #'until' => 500, - #'kill_after' => 600, - #'start' => 200, - #'end' => 500, - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 200, - 'end' => 300, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/ebofs2/jobs/alcdat/ossh.lib b/branches/sage/ebofs2/jobs/alcdat/ossh.lib deleted file mode 100644 index 73372866f051f..0000000000000 --- a/branches/sage/ebofs2/jobs/alcdat/ossh.lib +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 10, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - 'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - - #'nummds' => [1, 2, 4, 6, 7], # googoo - #'trace' => ['make.lib', 'make.include'], - - 'mds_bal_interval' => 90, #[30, 60, 90], #$[60,90], #[60,90],#[30, 60, 90], - #'mds_bal_max' => [4, 10],#6,#[2,4,6,8], - - 'mds_decay_halflife' => 30, - 'mds_bal_rep' => 1500, - 'cper' => [10, 16], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ], - - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - - 'custom' => '--tcp_skip_rank0 --debug_mds_balancer 1 --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.lib 1 --syn sleep 10 --syn trace traces/openssh/make.lib 1000', - - # parameters - #'fs' => ['fakestore'], - 'fs' => 'ebofs', - - #'until' => 500, - #'kill_after' => 600, - #'start' => 200, - #'end' => 500, - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 150, - 'end' => 300, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/ebofs2/jobs/alcdat/ossh.lib.big b/branches/sage/ebofs2/jobs/alcdat/ossh.lib.big deleted file mode 100644 index b9e0dd1ff68cd..0000000000000 --- a/branches/sage/ebofs2/jobs/alcdat/ossh.lib.big +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 10, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - #'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - 'nummds' => [160,200], - - #'nummds' => [1, 2, 4, 6, 7], # googoo - #'trace' => ['make.lib', 'make.include'], - - 'mds_bal_interval' => 90, #[30, 60, 90], #$[60,90], #[60,90],#[30, 60, 90], - #'mds_bal_max' => [4, 10],#6,#[2,4,6,8], - - 'mds_decay_halflife' => 30, - 'mds_bal_rep' => 1500, - 'cper' => [25, 50, 100], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ], - - '_dep' => [ 'cnode' => 0,#'30', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds * .6', - 'n' => '415'],#'1 + $cnode + $nummds + $numosd' ], - - - 'custom' => '--tcp_skip_rank0 --debug_mds_balancer 1 --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.lib 1 --syn sleep 10 --syn trace traces/openssh/make.lib 1000', - - # parameters - #'fs' => ['fakestore'], - 'fs' => 'ebofs', - - #'until' => 500, - #'kill_after' => 600, - #'start' => 200, - #'end' => 500, - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 150, - 'end' => 300, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/ebofs2/jobs/alcdat/striping b/branches/sage/ebofs2/jobs/alcdat/striping deleted file mode 100644 index de71828d12bde..0000000000000 --- a/branches/sage/ebofs2/jobs/alcdat/striping +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => 10, - - 'cnode' => 10, - 'cper' => [ 10, 25, 50, 100 ], - - '_dep' => [ 'numclient' => '$cper * $cnode', - 'n' => '1 + $cnode + $nummds + $numosd', - 'file_layout_osize' => '$writefile_size' ], - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'until' => 160, # --syn until $n ... when to stop clients - 'kill_after' => 200, - 'start' => 100, - 'end' => 160, - - 'writefile' => 1, - 'writefile_size' => [ -# 4*1024*1024, - 1024*1024 ], -# 256*1024, -# 64*1024 - 'writefile_mb' => 100000, - - 'osd_pg_bits' => 10,#16, - #'osd_pg_bits' => [ 16, 20 ], - - #'osd_object_layout' => [ 'hash', 'hashino', 'linear' ], - 'osd_pg_layout' => [ 'crush', -# 'hash', - 'linear' ], - - 'custom' => '--tcp_skip_rank0 --file_layout_num_rep 1 --mds_shutdown_check 60', - - 'comb' => { - 'x' => 'cper',#writefile_size', - 'vars' => [ 'osd.c_wrb', 'osd.c_wr' ], - } -}; diff --git a/branches/sage/ebofs2/jobs/example b/branches/sage/ebofs2/jobs/example deleted file mode 100644 index 802a8b66e6332..0000000000000 --- a/branches/sage/ebofs2/jobs/example +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/perl -# hi there -{ - # startup - 'n' => 30, # number of mpi nodes - 'sleep' => 3, # seconds to sleep between runs (so you have time to control-c out) - 'nummds' => 1, - 'numosd' => 6, - 'numclient' => 100, - - 'until' => 100, # --syn until $n ... synthetic client will stop itself after this many seconds. 
- 'kill_after' => 300, # seconds before everything commits suicide (in case something hangs) - - # stuff i want to vary - # here's a simple example: - - # do --syn writefile command - 'writefile' => 1, - # and very the write size - 'writefile_size' => [ # vary -# 2048*1024, - 1024*1024, - 512*1024, - 256*1024, - 128*1024, - 64*1024, - 48*1024, - 32*1024, - 28*1024, - 24*1024, - 16*1024, - 12*1024, - 8*1024, - 4096, -# 256, -# 16, -# 1 - ], - 'writefile_mb' => 1000, # each client shoudl write 1GB (or more likely, keep going until time runs out) - - 'file_layout_num_rep'=> [1,2], # also vary the replication level - - # pass some other random things to newsyn - 'custom' => '--', - - # for final summation (script/sum.pl) - # specify time period to look at the results - 'start' => 30, # skip first 30 seconds, so that caches are full etc. - 'end' => 90, # go for 60 seconds - - # what should i parse/plot? - 'comb' => { - 'x' => 'writefile_size', - 'vars' => [ 'osd.c_wrb', 'osd.r_wrb' ], - } -}; diff --git a/branches/sage/ebofs2/jobs/mds/log_striping b/branches/sage/ebofs2/jobs/mds/log_striping deleted file mode 100644 index 46242cdda4f00..0000000000000 --- a/branches/sage/ebofs2/jobs/mds/log_striping +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - 'kill_after' => 300, - - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => 100, - 'n' => 16, - - # parameters - 'fs' => ['ebofs','fakestore'], - 'meta_log_ssize' => [ 128, 256, 1024, 1 << 15, 1 << 20 ], - 'meta_log_scount' => 4,#[ 1, 2, 4, 8 ], - - 'until' => 200, # --syn until $n ... 
when to stop clients - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 4, - - 'custom' => '--tcp_skip_rank0', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - 'start' => 100, - 'end' => 550, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/ebofs2/jobs/mds/makedir_lat b/branches/sage/ebofs2/jobs/mds/makedir_lat deleted file mode 100644 index 63374f52a36c0..0000000000000 --- a/branches/sage/ebofs2/jobs/mds/makedir_lat +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => [1],#, 40, 80, 160 ], - 'n' => 20, - - 'fs' => 'ebofs', - - 'start' => 20, - 'end' => 40, - 'until' => 40, - 'kill_after' => 60, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 5, - - 'mds_local_osd' => [ 0, 1 ], - 'meta_log_layout_num_rep' => [ 0, 1, 2, 3, 4], - - 'custom' => '--tcp_skip_rank0', - - 'comb' => { - 'x' => 'meta_log_layout_num_rep', - 'vars' => [ 'mds.log.lat', 'cl.lat', 'osd.rlat' ] - } -}; diff --git a/branches/sage/ebofs2/jobs/mds/makedirs b/branches/sage/ebofs2/jobs/mds/makedirs deleted file mode 100644 index 4ca42d72fa37e..0000000000000 --- a/branches/sage/ebofs2/jobs/mds/makedirs +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - '_psub' => 'jobs/alc.tp', - - 'sleep' => 3, - - 'nummds' => [1, 2, 4, 6, 8, 12, 16, 24, 32, 40, 48, 64], - - 'cper' => 50, - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$cnode * $cper', - 'numosd' => '$nummds * 2', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - #'fs' => 'ebofs', - 'fs' => 'fakestore', - - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 3, - - 'custom' => '--tcp_skip_rank0', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - 'start' => 100, - 'end' => 550, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/ebofs2/jobs/mds/opensshlib b/branches/sage/ebofs2/jobs/mds/opensshlib deleted file mode 100644 index d8b61ae52c655..0000000000000 --- a/branches/sage/ebofs2/jobs/mds/opensshlib +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => [1, 2, 4, 7], # googoo - #'nummds' => [1, 2, 4, 6, 8, 12, 16, 24, 32, 40, 48, 64], # alc - - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'until' => 300, # --syn until $n ... when to stop clients - 'kill_after' => 400, - 'start' => 150, - 'end' => 300, - - 'mds_bal_interval' => 90,#[60, 90], - #'mds_bal_max' => [3,4,5], - 'mds_bal_max' => 4, - 'mds_decay_halflife' => 30,#[15, 25, 30, 45, 60], - 'mds_bal_rep' => 1500,#[1000, 1500, 2000], - - 'decay_hl' => 100,#[ 25, 50, 100, 150 ], - - 'cper' => 100, #[50, 75, 100, 125, 150, 200], - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds * 2', - 'n' => '1 + $cnode + $nummds + $numosd', - 'mds_bal_rep' => '$mds_decay_halflife * $decay_hl'], - - 'custom' => '--tcp_skip_rank0 --syn only 0 --syn trace traces/openssh/untar.lib 1 --syn sleep 10 --syn randomsleep 30 --syn trace traces/openssh/make.lib 100 --debug_mds_balancer 1 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - - 'comb' => { - 'x' => 'nummds',#decay_hl',#'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/ebofs2/jobs/meta1 b/branches/sage/ebofs2/jobs/meta1 deleted file mode 100644 index 
743212f1c3009..0000000000000 --- a/branches/sage/ebofs2/jobs/meta1 +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/sh - -# makedirs for 300 seconds -# first bit in memory -# second bit is commiting from journal too -# then walk fs for 300 seconds -# this should all be in memory. - -JOB="meta1" -ARGS="--numosd 10 --fullmkfs --syn until 180 --syn makedirs 10 10 4 --syn until 360 --syn repeatwalk --mds_bal_max 1 --osd_fsync 0 --mds_log_max_len 200000 --mds_cache_size 500000" - -#rm core* ; make tcpsyn && mpiexec -l -n 17 ./tcpsyn $ARGS --nummds 1 --log_name $JOB/1 --numclient 25 > log/$JOB/o.1 -#rm core* ; make tcpsyn && mpiexec -l -n 18 ./tcpsyn $ARGS --nummds 2 --log_name $JOB/2 --numclient 50 > log/$JOB/o.2 -#rm core* ; make tcpsyn && mpiexec -l -n 20 ./tcpsyn $ARGS --nummds 4 --log_name $JOB/4 --numclient 100 > log/$JOB/o.4 -#rm core* ; make tcpsyn && mpiexec -l -n 24 ./tcpsyn $ARGS --nummds 8 --log_name $JOB/8 --numclient 200 > log/$JOB/o.8 -#rm core* ; make tcpsyn && mpiexec -l -n 28 ./tcpsyn $ARGS --nummds 12 --log_name $JOB/12 --numclient 300 > log/$JOB/o.12 -rm core* ; make tcpsyn && mpiexec -l -n 32 ./tcpsyn $ARGS --nummds 16 --log_name $JOB/16 --numclient 300 > log/$JOB/o.16 - - diff --git a/branches/sage/ebofs2/jobs/meta1.proc.sh b/branches/sage/ebofs2/jobs/meta1.proc.sh deleted file mode 100755 index 616acbefff619..0000000000000 --- a/branches/sage/ebofs2/jobs/meta1.proc.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/sh - -for d in 1 2 4 8 12 -do - echo $d - cd $d - ../../../script/sum.pl mds? mds?? > mds.sum - ../../../script/sum.pl -avg mds? mds?? > mds.avg - - ../../../script/sum.pl -start 90 -end 180 mds? mds?? > mds.sum.makedirs - ../../../script/sum.pl -start 200 -end 300 mds? mds?? > mds.sum.walk - - cd .. 
-done diff --git a/branches/sage/ebofs2/jobs/osd/ebofs b/branches/sage/ebofs2/jobs/osd/ebofs deleted file mode 100644 index 5d11523f6f832..0000000000000 --- a/branches/sage/ebofs2/jobs/osd/ebofs +++ /dev/null @@ -1,51 +0,0 @@ -# hi there -{ - # startup - 'n' => 30, # mpi nodes - 'sleep' => 3, # seconds between runs - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => 100,#[10, 50, 100, 200, 400], - -'kill_after' => 200, - - # parameters - 'fs' => 'ebofs',#[ -# 'obfs', -# 'fakestore', -# 'ebofs' -# ], - 'until' => 100, # --syn until $n ... when to stop clients - 'writefile' => 1, - 'writefile_size' => [ -# 2560000, - 1024000, - 262144, -# 131072, -# 98304, - 65536, -# 16384, -# 4096, - 256, -# 16, -# 1 - ], - 'writefile_mb' => 1000, - - 'ebofs_idle_commit_ms' => [ 100, 500 ], - 'ebofs_abp_max_alloc' => [ 4096*16, 4096*64 ], - -# 'custom' => '--tcp_skip_rank0',# --osd_maxthreads 0', - 'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - 'start' => 30, - 'end' => 90, - -'comb' => { - 'x' => 'writefile_size', - 'vars' => [ 'osd.c_wrb' ], -# 'maptitle' => { 'osd_object_layout=' => '', -# ',osd_pg_layout=' => ' + '} - } -}; diff --git a/branches/sage/ebofs2/jobs/osd/mds_log b/branches/sage/ebofs2/jobs/osd/mds_log deleted file mode 100644 index 0f99f6998dcfc..0000000000000 --- a/branches/sage/ebofs2/jobs/osd/mds_log +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - #'_psub' => 'jobs/alc.tp', - 'sleep' => 3, - - 'nummds' => 1, - 'numclient' => [5, 10, 15, 25, 50, 100, 200, 300, 400], - #'numclient' => [ 50, 100, 200 ], - 'numosd' => [2,4],#[ 4, 8, 12, 16, 20, 24 ], - 'n' => 12, - - # parameters - 'fs' => 'fakestore',#['ebofs', 'fakestore','obfs'], - #'fs' => 'ebofs', - #'ebofs_commit_ms' => [ 1000, 5000 ], - #'osd_maxthreads' => [ 0, 1, 2, 4, 8 ], - - 'until' => 100, # --syn until $n ... 
when to stop clients - 'kill_after' => 300, - 'start' => 20, - 'end' => 90, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 3, - - - #'meta_log_layout_ssize' => [256, 512, 1024, 4096, 16384, 65536, 262400], - #'meta_log_layout_scount' => [2, 4, 8], - #'meta_log_layout_num_rep' => [1, 2], - #'meta_log_layout_num_rep' => 1, - - 'custom' => '--tcp_skip_rank0 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - 'comb' => { - 'x' => 'numclient',#'meta_log_layout_ssize', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/ebofs2/jobs/osd/osd_threads b/branches/sage/ebofs2/jobs/osd/osd_threads deleted file mode 100644 index ef271f9e88710..0000000000000 --- a/branches/sage/ebofs2/jobs/osd/osd_threads +++ /dev/null @@ -1,33 +0,0 @@ -# hi there -{ - # startup - 'n' => 30, # mpi nodes - 'sleep' => 10, # seconds between runs - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => 50, - - # parameters - 'fs' => [ -# 'obfs', - 'fakestore', - 'ebofs' - ], - 'until' => 100, # --syn until $n ... 
when to stop clients - 'writefile' => 1, - 'writefile_size' => [ - 1024000, - 131072, - 65536, - 16 - ], - 'writefile_mb' => 1000, - - 'osd_maxthreads' => [0, 1, 2, 4, 8], - - 'custom' => '--tcp_skip_rank0', - - # for final summation (script/sum.pl) - 'start' => 30, - 'end' => 90 -}; diff --git a/branches/sage/ebofs2/jobs/osd/striping b/branches/sage/ebofs2/jobs/osd/striping deleted file mode 100644 index ea8cabe643274..0000000000000 --- a/branches/sage/ebofs2/jobs/osd/striping +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/perl -# hi there -{ - # startup - #'n' => 28, # mpi nodes - - 'sleep' => 3, # seconds between runs - 'nummds' => 1, - - 'numosd' => [2,3,4,5,6,7,8,10,12], #[6, 8, 10, 12, 16], - 'numosd' => [14], - #'cper' => [4, 5, 6, 7, 8, 9, 10, 11, 12], #[1, 4, 6, 8, 16, 32, 64], - #'cper' => [4, 6, 8, 10, 12, 16, 24, 32 ], #[1, 4, 6, 8, 16, 32, 64], - 'cper' => [30], - - '_dep' => [ 'cnode' => '$numosd', - 'numclient' => '$cnode * $cper', - 'n' => 38],#'$nummds + $numosd + $cnode'], - #'numclient' => [5, 10, 20, 50, 75, 100, 150 ], - - 'start' => 30, - 'end' => 90, - 'until' => 100, # --syn until $n ... 
when to stop clients - 'kill_after' => 260, - - # parameters - 'fs' => 'ebofs', - 'writefile' => 1, - - 'writefile_size' => [# 4096, - # 16*1024, - # 64*1024, - # 256*1024, - 1024*1024 ], -# 'writefile_size' => [ -# 2048*1024, -# 1048576, -# 512*1024, -# 262144, -# 65536, -# 16384 -# ], - 'writefile_mb' => 1000, - - 'file_layout_num_rep'=> [1,2,3], - - 'osd_pg_bits' => 12,#[6, 8, 10, 12, 14], - - 'osd_object_layout' => [ 'hashino' ],#'hash', 'hashino', 'linear' ], - 'osd_pg_layout' => [ 'crush', 'linear' ],#, 'linear'],#, 'hash' ],#, 'linear' ],#, 'hash' ], - - #'custom' => '--tcp_skip_rank0', # --osd_maxthreads 0', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - - 'comb' => { - 'x' => 'numosd',#'writefile_size', - 'vars' => [ 'osd.c_wrb', 'cl.wrlat' ], -# 'maptitle' => { 'osd_object_layout=' => '', -# ',osd_pg_layout=' => ' + '} - } -}; - - -=item some googoo notes - -for 1mb 1x writes, - - with numosd=6, min cper=6 to saturate (cper_saturate) - googoo saturates at numosd=8. (osd_saturate) - - -> so, numosd=6 or 7 is a safe size! 
- - - - -=cut diff --git a/branches/sage/ebofs2/jobs/osd/wr_lat2 b/branches/sage/ebofs2/jobs/osd/wr_lat2 deleted file mode 100644 index 47053dd61f3ab..0000000000000 --- a/branches/sage/ebofs2/jobs/osd/wr_lat2 +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => [12], - 'numclient' => [1],#, 40, 80, 160 ], - 'n' => 16, - - 'fs' => 'ebofs', - - 'start' => 10, - 'end' => 40, - 'until' => 40, - 'kill_after' => 90, - - 'writefile' => 1, - 'writefile_size' => [4096, - 8*1024, - 16*1024, - 32*1024, - 64*1024, - 128*1024, - 256*1024, - 512*1024, - 1024*1024], - 'writefile_mb' => 10000, - - #'tcp_multi_out' => [0,1], - -# 'mds_local_osd' => [ 0, 1 ], - 'file_layout_num_rep' => [1,2,3],#, 2, 3, 4], - - 'client_oc' => [0,1], - - 'custom' => '--tcp_skip_rank0', - - 'comb' => { - 'x' => 'writefile_size',#'file_layout_num_rep', - 'vars' => [ 'osd.c_wrb','cl.wrlat' ] - } -}; diff --git a/branches/sage/ebofs2/jobs/osd/write_sizes b/branches/sage/ebofs2/jobs/osd/write_sizes deleted file mode 100644 index 57369f3a97c50..0000000000000 --- a/branches/sage/ebofs2/jobs/osd/write_sizes +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/perl -# hi there -{ - # startup - 'n' => 30, # mpi nodes - 'sleep' => 3, # seconds between runs - 'nummds' => 1, - 'numosd' => 6, - 'numclient' => 100,#[25,50,100,300],#100,#[10, 50, 100, 200, 400], - - 'until' => 100, # --syn until $n ... 
when to stop clients - 'kill_after' => 300, - - # parameters - 'fs' => [ -# 'obfs', - 'fakestore', -# 'ebofs' - ], - 'writefile' => 1, - 'writefile_size' => [ -# 2048*1024, - 1024*1024, - 512*1024, - 256*1024, - 128*1024, - 64*1024, - 48*1024, - 32*1024, - 28*1024, - 24*1024, - 16*1024, - 12*1024, - 8*1024, - 4096, -# 256, -# 16, -# 1 - ], - 'writefile_mb' => 1000, - - 'file_layout_num_rep'=> 1,#[1,2], - - -# 'ebofs_idle_commit_ms' => [ 100, 500 ], -# 'ebofs_abp_max_alloc' => [ 4096*16, 4096*64 ], - - 'custom' => '--debug_after 110 --debug_mds 15 --debug 5 --mds_shutdown_check 60', - - # for final summation (script/sum.pl) - 'start' => 30, - 'end' => 90, - - 'comb' => { - 'x' => 'writefile_size', - 'vars' => [ 'osd.c_wrb' ], -# 'maptitle' => { 'osd_object_layout=' => '', -# ',osd_pg_layout=' => ' + '} - } -}; diff --git a/branches/sage/ebofs2/jobs/rados/map_dist b/branches/sage/ebofs2/jobs/rados/map_dist deleted file mode 100644 index 39f16daa1cdc2..0000000000000 --- a/branches/sage/ebofs2/jobs/rados/map_dist +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'osdbits' => [6,7,8],#,9],10,11], - 'pgperbits' => [3],#,4,5],#[4,6,8], - - 'nummds' => 1, - - '_dep' => [ 'numosd' => '1 << $osdbits', - 'osd_pg_bits' => '$pgperbits + $osdbits', - 'n' => '3 + $numosd / 32'], - 'numclient' => 0, - - 'fake_osdmap_updates' => [30], - - 'fs' => 'ebofs', - - 'start' => 30, - 'end' => 300, - 'kill_after' => 300, - - 'custom' => '--bdev_lock 0 --ms_stripe_osds --osd_maxthreads 0', - #'custom' => '--tcp_skip_rank0', - - 'comb' => { - 'x' => 'osdbits', - 'vars' => [ 'osd.sum=mapi', 'osd.sum=mapidup', 'osd.numpg', 'osd.pingset' ] - } -}; diff --git a/branches/sage/ebofs2/jobs/rados/rep_lat b/branches/sage/ebofs2/jobs/rados/rep_lat deleted file mode 100644 index 3f5ab0c8a7d87..0000000000000 --- a/branches/sage/ebofs2/jobs/rados/rep_lat +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => 8, 
#[6], - 'numclient' => 1,#, 40, 80, 160 ], - 'n' => 10, - - 'fs' => 'ebofs', - - 'start' => 10, - 'end' => 40, - 'until' => 40, - 'kill_after' => 45, - - 'writefile' => 1, - 'writefile_size' => [4096, -# 8*1024, -# 16*1024, -# 32*1024, - 64*1024, -# 128*1024, -# 256*1024, -# 512*1024, -# 1024*1024 -], - 'writefile_mb' => 10000, - - 'osd_rep' => [0,1,2], - - 'file_layout_num_rep' => [1,2,3,4,5,6],#, 2, 3, 4], - - 'osd_pg_bits' => 4, - 'custom' => '--osd_max_rep 8', - - 'comb' => { - 'x' => 'file_layout_num_rep', - 'vars' => [ 'cl.wrlat' ] - } -}; diff --git a/branches/sage/ebofs2/jobs/rados/wr_sizes b/branches/sage/ebofs2/jobs/rados/wr_sizes deleted file mode 100644 index 9b73477ea6142..0000000000000 --- a/branches/sage/ebofs2/jobs/rados/wr_sizes +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => [8],#10,14,16], - 'numclient' => [10*16], - 'n' => 15, - - 'fs' => 'ebofs', - - 'start' => 60, - 'end' => 90, - 'until' => 90, - 'kill_after' => 190, - - 'writefile' => 1, - 'writefile_size' => [4096, - 8*1024, - 16*1024, - 32*1024, - 64*1024, - 128*1024, - 256*1024, - # 512*1024, -# 4*1024*1024, -# 2*1024*1024, -# 1024*1024 -], - 'writefile_mb' => 10000, - - 'file_layout_num_rep' => 1, - 'file_layout_ssize' => 4*1024*1024, - 'file_layout_osize' => 4*1024*1024, - - 'osd_pg_bits' => 12, - -# 'ebofs_freelist' => [0, 1080, 65400], - - 'custom' => '--objecter_buffer_uncommitted 0', - - #'custom' => '--tcp_skip_rank0', - - 'comb' => { - 'x' => 'writefile_size', - 'vars' => [ 'osd.c_wrb', 'cl.wrlat' ] - } -}; diff --git a/branches/sage/ebofs2/jobs/runjobsample b/branches/sage/ebofs2/jobs/runjobsample deleted file mode 100644 index 590be207771b2..0000000000000 --- a/branches/sage/ebofs2/jobs/runjobsample +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - '_sleep' => 3, - - 'nummds' => 1, - 'numosd' => 16, #[8],#10,14,16], - 'numclient' => 32,#,4,10,20,40], #[10*16], - '_n' => 32, - - '_start' => 15, - 
'_end' => 45, - '_kill_after' => 190, - - 'osd_pg_bits' => [4, 6], - 'osd_auto_weight' => [0,1], - 'file_layout_pg_size' => [1,2], - - '_custom' => '--syn createobjects 1000000 1048576 2', - - '_comb' => { - 'x' => 'osd_pg_bits', - 'vars' => [ 'osd.c_wrb' ] - } -}; diff --git a/branches/sage/ebofs2/kernel/Makefile b/branches/sage/ebofs2/kernel/Makefile deleted file mode 100644 index e55c79563af49..0000000000000 --- a/branches/sage/ebofs2/kernel/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -# -# Makefile for CEPH filesystem. -# - -obj-$(CONFIG_CEPH_FS) += ceph.o - -ceph-objs := inode.o bufferlist.o ktcp.o diff --git a/branches/sage/ebofs2/kernel/accepter.h b/branches/sage/ebofs2/kernel/accepter.h deleted file mode 100644 index da692e7ddcef1..0000000000000 --- a/branches/sage/ebofs2/kernel/accepter.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef __FS_CEPH_ACCEPTER_H -#define __FS_CEPH_ACCEPTOR_H - -#include -#include - -/* - * Information about client thread - */ -struct ceph_accepter { - struct task_struct accepter_thread; /* thread */ - struct socket sock; /* Socket */ -}; - -/* - * Prototypes definitions - */ -int ceph_accepter_start(void); -void ceph_accepter_shutdown(struct ceph_accepter *accepter); - -#endif diff --git a/branches/sage/ebofs2/kernel/bufferlist.c b/branches/sage/ebofs2/kernel/bufferlist.c deleted file mode 100644 index 5b8dda45a689b..0000000000000 --- a/branches/sage/ebofs2/kernel/bufferlist.c +++ /dev/null @@ -1,147 +0,0 @@ -#include -#include -#include -#include "bufferlist.h" - -void ceph_bl_init(struct ceph_bufferlist *bl) -{ - memset(bl, 0, sizeof(*bl)); - bl->b_kvmax = CEPH_BUFFERLIST_START_KVLEN; - bl->b_kv = bl->b_kv_array; -} - -void ceph_bl_clear(struct ceph_bufferlist *bl) -{ - int i; - /* for (i=0; ib_kvlen; i++) - kfree(bl->b_kv[i]->iov_base); */ - for (i=0; ib_kvlen; i++, bl->b_kv++) - kfree(bl->b_kv->iov_base); - bl->b_kvlen = 0; - if (bl->b_kv != bl->b_kv_array) { - kfree(bl->b_kv); - bl->b_kv = bl->b_kv_array; - bl->b_kvmax = 
CEPH_BUFFERLIST_START_KVLEN; - } - if (bl->b_append.iov_base) { - kfree(bl->b_append.iov_base); - bl->b_append.iov_base = 0; - } -} - - -/* - * add referenced memory to the bufferlist. - * expand b_kv array if necessary. - * extend tail kvec if the added region is contiguous. - * - * bl bufferlist we want to append to - * dp pointer to data we want to append - * len length of data we want to append - */ -void ceph_bl_append_ref(struct ceph_bufferlist *bl, void *dp, int len) -{ - /* check for optimal case of dp being at end of our last kvec */ - if (bl->b_kvlen) { - struct kvec lastvec = bl->b_kv[bl->b_kvlen-1]; - if (dp == lastvec.iov_base + lastvec.iov_len) { - lastvec.iov_len += len; - return; - } - } - - /* check if we need to allocate more entries in our kvec array */ - if (bl->b_kvlen == bl->b_kvmax) { - struct kvec *tmpvec; - - bl->b_kvmax *= 2; - - /* TBD: check result of kmalloc */ - tmpvec = kmalloc(bl->b_kvmax, GFP_KERNEL); - memcpy(tmpvec, bl->b_kv, sizeof(struct kvec)*bl->b_kvlen); - - /* if the old array wasn't our original array (kmalloc'ed) */ - if (bl->b_kvlen > CEPH_BUFFERLIST_START_KVLEN) - kfree(bl->b_kv); - - bl->b_kv = tmpvec; - memset(tmpvec + bl->b_kvlen, 0, sizeof(struct kvec)*(bl->b_kvmax - bl->b_kvlen)); - } - - bl->b_kv[bl->b_kvlen].iov_base = dp; - bl->b_kv[bl->b_kvlen].iov_len = len; - bl->b_kvlen++; -} - -void ceph_bl_append_copy(struct ceph_bufferlist *bl, void *p, size_t len) -{ - int s; - while (len > 0) { - /* allocate more space? */ - if ( ! 
bl->b_append.iov_len) { - bl->b_append.iov_len = (len + PAGE_SIZE - 1) & ~(PAGE_SIZE-1); - /* TBD: check result of kmalloc */ - bl->b_append.iov_base = kmalloc(bl->b_append.iov_len, GFP_KERNEL); - } - - /* copy what we can */ - s = min(bl->b_append.iov_len, len); - memcpy(bl->b_append.iov_base, p, s); - ceph_bl_append_ref(bl, bl->b_append.iov_base, bl->b_append.iov_len); - - p += s; - len -= s; - - bl->b_append.iov_base += s; - bl->b_append.iov_len -= s; - } -} - - -void ceph_bl_iterator_init(struct ceph_bufferlist_iterator *bli) -{ - memset(bli, 0, sizeof(*bli)); -} - -void ceph_bl_iterator_advance(struct ceph_bufferlist *bl, - struct ceph_bufferlist_iterator *bli, - int off) -{ - -} - -/* TBD: comment until builds... -__u64 ceph_bl_decode_u64(struct ceph_bufferlist *bl, struct ceph_bufferlist_iterator *bli) -{ - __u64 r; - r = le64_to_cpu((__u64*)(bl->b_kv[bli->i_kv] + bli->i_off)); - ceph_bl_iterator_advance(bl, bli, sizeof(__u64)); -} -__s64 ceph_bl_decode_s64(struct ceph_bufferlist *bl, ceph_bufferlist_iterator *bli) -{ - __s64 r; - r = le64_to_cpu((__s64*)(bl->b_kv[bli->i_kv] + bli->i_off)); - ceph_bl_iterator_advance(bl, bli, sizeof(__s64)); -} - -__u32 ceph_bl_decode_u32(struct ceph_bufferlist *bl, ceph_bufferlist_iterator *bli) -{ - __u32 r; - r = le32_to_cpu((__u32*)(bl->b_kv[bli->i_kv] + bli->i_off)); - ceph_bl_iterator_advance(bl, bli, sizeof(__u32)); -} -__s32 ceph_bl_decode_s32(struct ceph_bufferlist *bl, ceph_bufferlist_iterator *bli) -{ - __s32 r; - r = le32_to_cpu((__s32*)(bl->b_kv[bli->i_kv] + bli->i_off)); - ceph_bl_iterator_advance(bl, bli, sizeof(__s32)); -} - -__u8 ceph_bl_decode_u8(struct ceph_bufferlist *bl, ceph_bufferlist_iterator *bli) -{ - __u8 r; - r = (__u8*)(bl->b_kv[bli->i_kv] + bli->i_off); - ceph_bl_iterator_advance(bl, bli, sizeof(__u8)); -} - -*/ diff --git a/branches/sage/ebofs2/kernel/bufferlist.h b/branches/sage/ebofs2/kernel/bufferlist.h deleted file mode 100644 index dca3012a5251e..0000000000000 --- 
a/branches/sage/ebofs2/kernel/bufferlist.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef _FS_CEPH_BUFFERLIST_H -#define _FS_CEPH_BUFFERLIST_H - -#include - - -/* - * quick and dirty bufferlist struct. - * - * preallocates memory in large chunks, allowing you to append small bits at a - * time in a reasonably efficient fashion... - */ - -#define CEPH_BUFFERLIST_START_KVLEN 8 /* embed some statically, for fast normal case */ - -struct ceph_bufferlist { - struct kvec *b_kv; /* data payload */ - struct kvec b_kv_array[CEPH_BUFFERLIST_START_KVLEN]; - size_t b_kvlen; /* used/defined elements in b_kv */ - size_t b_kvmax; /* allocated size of b_kv array */ - struct kvec b_append; /* preallocated memory for appending data to this bufferlist */ -}; - -struct ceph_bufferlist_iterator { - int i_kv; /* which kv */ - int i_off; /* offset in that kv */ -}; - -#endif diff --git a/branches/sage/ebofs2/kernel/inode.c b/branches/sage/ebofs2/kernel/inode.c deleted file mode 100644 index f21fa58386935..0000000000000 --- a/branches/sage/ebofs2/kernel/inode.c +++ /dev/null @@ -1,136 +0,0 @@ -#include -#include -#include -#include -#include "ceph_fs.h" - -MODULE_AUTHOR("Patience Warnick "); -MODULE_DESCRIPTION("Ceph filesystem for Linux"); -MODULE_LICENSE("GPL"); - - -static void ceph_read_inode(struct inode * inode) -{ - return; -} - -static int ceph_write_inode(struct inode * inode, int unused) -{ - lock_kernel(); - unlock_kernel(); - return 0; -} - -static void ceph_delete_inode(struct inode * inode) -{ - return; -} - -static void ceph_put_super(struct super_block *s) -{ - return; -} - -static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - return 0; -} - -static void ceph_write_super(struct super_block *s) -{ - lock_kernel(); - unlock_kernel(); - return; -} - -static struct kmem_cache *ceph_inode_cachep; - -static struct inode *ceph_alloc_inode(struct super_block *sb) -{ - struct ceph_inode_info *ci; - ci = kmem_cache_alloc(ceph_inode_cachep, GFP_KERNEL); - if (!ci) - 
return NULL; - return &ci->vfs_inode; -} - -static void ceph_destroy_inode(struct inode *inode) -{ - kmem_cache_free(ceph_inode_cachep, CEPH_I(inode)); -} - -static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags) -{ - struct ceph_inode_info *ci = (struct ceph_inode_info *) foo; - - if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == - SLAB_CTOR_CONSTRUCTOR) - inode_init_once(&ci->vfs_inode); -} - -static int init_inodecache(void) -{ - ceph_inode_cachep = kmem_cache_create("ceph_inode_cache", - sizeof(struct ceph_inode_info), - 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), - init_once, NULL); - if (ceph_inode_cachep == NULL) - return -ENOMEM; - return 0; -} - -static void destroy_inodecache(void) -{ - kmem_cache_destroy(ceph_inode_cachep); -} - -static const struct super_operations ceph_sops = { - .alloc_inode = ceph_alloc_inode, - .destroy_inode = ceph_destroy_inode, - .read_inode = ceph_read_inode, - .write_inode = ceph_write_inode, - .delete_inode = ceph_delete_inode, - .put_super = ceph_put_super, - .write_super = ceph_write_super, - .statfs = ceph_statfs, -}; - -static int ceph_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) -{ - printk(KERN_INFO "entered ceph_get_sb\n"); - return 0; -} - -static struct file_system_type ceph_fs_type = { - .owner = THIS_MODULE, - .name = "ceph", - .get_sb = ceph_get_sb, - .kill_sb = kill_block_super, -/* .fs_flags = */ -}; - -static int __init init_ceph(void) -{ - int ret = 0; - - printk(KERN_INFO "ceph init\n"); - if (!(ret = init_inodecache())) { - if ((ret = register_filesystem(&ceph_fs_type))) { - destroy_inodecache(); - } - } - return ret; -} - -static void __exit exit_ceph(void) -{ - printk(KERN_INFO "ceph exit\n"); - - unregister_filesystem(&ceph_fs_type); -} - - -module_init(init_ceph); -module_exit(exit_ceph); diff --git a/branches/sage/ebofs2/kernel/kmsg.h b/branches/sage/ebofs2/kernel/kmsg.h deleted file mode 100644 index 
f3650e3a59f5c..0000000000000 --- a/branches/sage/ebofs2/kernel/kmsg.h +++ /dev/null @@ -1,68 +0,0 @@ -#ifndef __FS_CEPH_KMSG_H -#define __FS_CEPH_KMSG_H - -#include -#include -#include -#include -#include "accepter.h" -#include "bufferlist.h" - -/* dispatch function type */ -typedef void (*ceph_kmsg_work_dispatch_t)(struct work_struct *); - -extern struct workqueue_struct *rwq; /* receive work queue (worker threads) */ -extern struct workqueue_struct *swq; /* send work queue (worker threads) */ - -struct ceph_kmsgr { - void *m_parent; - struct radix_tree_root mpipes; /* other nodes talk to */ - struct ceph_accepter accepter; /* listener or select thread info */ -}; - -struct ceph_message { - atomic_t nref; - int mflags; - struct ceph_message_header *msghdr; /* header */ - struct ceph_bufferlist *payload; - struct list_head m_list_head; -}; - -struct ceph_connection { - struct socket sock; /* connection socket */ - __u64 out_seq; /* last message sent */ - __u64 in_seq; /* last message received */ - - /* out queue */ - struct list_head out_queue; - spinlock_t out_queue_lock; - struct ceph_message *out_partial; /* partially sent message */ - struct ceph_bufferlist_iterator out_pos; - struct list_head out_sent; /* sent but unacked; may need resend if connection drops */ - - /* partially read message contents */ - struct ceph_message *in_partial; - struct work_struct *rwork; /* received work */ - struct work_struct *swork; /* send work */ -/* note: work->func = dispatch func */ - int retries; -}; - -/* - * function prototypes - */ -extern struct ceph_message *ceph_read_message(void); -extern int ceph_send_message(struct ceph_message *message); - -static __inline__ void ceph_put_msg(struct ceph_message *msg) { - if (atomic_dec_and_test(&msg->nref)) { - ceph_bl_clear(msg->payload); - kfree(msg); - } -} - -static __inline__ void ceph_get_msg(struct ceph_message *msg) { - atomic_inc(&msg->nref); -} - -#endif diff --git a/branches/sage/ebofs2/kernel/ktcp.c 
b/branches/sage/ebofs2/kernel/ktcp.c deleted file mode 100644 index e210769bb38f2..0000000000000 --- a/branches/sage/ebofs2/kernel/ktcp.c +++ /dev/null @@ -1,138 +0,0 @@ -#include -#include -#include -#include -#include "kmsg.h" -#include "ktcp.h" - - -struct socket * _kconnect(struct sockaddr *saddr) -{ - int ret; - struct socket *sd = NULL; - - ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sd); - if (ret < 0) { - printk(KERN_INFO "sock_create_kern error: %d\n", ret); - } else { - /* or could call kernel_connect(), opted to reduce call overhead */ - ret = sd->ops->connect(sd, (struct sockaddr *) saddr, - sizeof (struct sockaddr_in),0); - if (ret < 0) { - printk(KERN_INFO "kernel_connect error: %d\n", ret); - sock_release(sd); - } - } - return(sd); -} - -struct socket * _klisten(struct sockaddr *saddr) -{ - int ret; - struct socket *sd = NULL; - struct sockaddr_in *in_addr = (struct sockaddr_in *)saddr; - - - ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sd); - if (ret < 0) { - printk(KERN_INFO "sock_create_kern error: %d\n", ret); - return(NULL); - } - - /* no user specified address given so create, will allow arg to mount */ - if (!in_addr->sin_addr.s_addr) { - in_addr->sin_family = AF_INET; - in_addr->sin_addr.s_addr = htonl(INADDR_ANY); - in_addr->sin_port = htons(CEPH_PORT); /* known port for now */ - } - -/* TBD: set sock options... */ - /* ret = kernel_setsockopt(sd, SOL_SOCKET, SO_REUSEADDR, - (char *)optval, optlen); - if (ret < 0) { - printk("Failed to set SO_REUSEADDR: %d\n", ret); - } */ - ret = sd->ops->bind(sd, saddr, sizeof(saddr)); -/* TBD: probaby want to tune the backlog queue .. */ - ret = sd->ops->listen(sd, NUM_BACKUP); - if (ret < 0) { - printk(KERN_INFO "kernel_listen error: %d\n", ret); - sock_release(sd); - sd = NULL; - } - return(sd); -} - -/* - * Note: Maybe don't need this, or make inline... keep for now for debugging.. 
- * we may need to add more functionality - */ -struct socket *_kaccept(struct socket *sd) -{ - struct socket *new_sd = NULL; - int ret; - - -/* TBD: somewhere check for a connection already established to this node? */ - ret = kernel_accept(sd, &new_sd, sd->file->f_flags); - if (ret < 0) { - printk(KERN_INFO "kernel_accept error: %d\n", ret); - return(new_sd); - } -/* TBD: shall we check name for validity? */ - return(new_sd); -} - -/* - * receive a message this may return after partial send - */ -int _krecvmsg(struct socket *sd, void *buf, size_t len, unsigned msgflags) -{ - struct kvec iov = {buf, len}; - struct msghdr msg = {.msg_flags = msgflags}; - int rlen = 0; /* length read */ - - printk(KERN_INFO "entered krevmsg\n"); - msg.msg_flags |= MSG_DONTWAIT | MSG_NOSIGNAL; - - /* receive one kvec for now... */ - rlen = kernel_recvmsg(sd, &msg, &iov, 1, len, msg.msg_flags); - if (rlen < 0) { - printk(KERN_INFO "kernel_recvmsg error: %d\n", rlen); - } - return(rlen); - -} - -/* - * Send a message this may return after partial send - */ -int _ksendmsg(struct socket *sd, struct kvec *iov, - size_t len, size_t kvlen, unsigned msgflags) -{ - struct msghdr msg = {.msg_flags = msgflags}; - int rlen = 0; - - printk(KERN_INFO "entered ksendmsg\n"); - msg.msg_flags |= MSG_DONTWAIT | MSG_NOSIGNAL; - - rlen = kernel_sendmsg(sd, &msg, iov, kvlen, len); - if (rlen < 0) { - printk(KERN_INFO "kernel_sendmsg error: %d\n", rlen); - } - return(rlen); -} - -struct sockaddr *_kgetname(struct socket *sd) -{ - struct sockaddr *saddr = NULL; - int len; - int ret; - - if ((ret = sd->ops->getname(sd, (struct sockaddr *)saddr, - &len, 2) < 0)) { - printk(KERN_INFO "kernel getname error: %d\n", ret); - } - return(saddr); - -} diff --git a/branches/sage/ebofs2/kernel/ktcp.h b/branches/sage/ebofs2/kernel/ktcp.h deleted file mode 100644 index 6608144e90139..0000000000000 --- a/branches/sage/ebofs2/kernel/ktcp.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _FS_CEPH_TCP_H -#define _FS_CEPH_TCP_H - -/* 
Well known port for ceph client listener.. */ -#define CEPH_PORT 2002 -/* Max number of outstanding connections in listener queueu */ -#define NUM_BACKUP 10 -#endif diff --git a/branches/sage/ebofs2/kernel/mds_client.c b/branches/sage/ebofs2/kernel/mds_client.c deleted file mode 100644 index e78f5f14fcb66..0000000000000 --- a/branches/sage/ebofs2/kernel/mds_client.c +++ /dev/null @@ -1,287 +0,0 @@ - -#include "mds_client.h" -#include "mon_client.h" -#include "kmsg.h" - - -/* - * reference count request - */ -static void get_request(struct ceph_mds_request *req) -{ - atomic_inc(&req->r_ref); -} - -static void put_request(struct ceph_mds_request *req) -{ - if (atomic_dec_and_test(&req->r_ref)) { - ceph_put_msg(req->r_request); - kfree(req); - } -} - -static void get_session(struct ceph_mds_session *s) -{ - atomic_inc(&s->s_ref); -} - -static void put_session(struct ceph_mds_session *s) -{ - if (atomic_dec_and_test(&s->s_ref)) - kfree(s); -} - -/* - * register an in-flight request - */ -static struct ceph_mds_request * -register_request(struct ceph_mds_client *mdsc, struct ceph_message *msg, int mds) -{ - struct ceph_mds_request *req; - - req = kmalloc(sizeof(*req), GFP_KERNEL); - - req->r_request = msg; - ceph_get_msg(msg); /* grab reference */ - req->r_reply = 0; - req->r_num_mds = 0; - req->r_attempts = 0; - req->r_num_fwd = 0; - req->r_resend_mds = mds; - req->r_ref = ATOMIC_INIT(2); /* one for request_tree, one for caller */ - - req->r_tid = ++mdsc->last_tid; - radix_tree_insert(&mdsc->request_tree, req->r_tid, (void*)req); - - return req; -} - -void -unregister_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) -{ - radix_tree_remove(&mdsc->request_tree, req->r_tid); - put_request(req); -} - - -/* - * choose mds to send request to next - */ -static int choose_mds(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) -{ - /* is there a specific mds we should try? 
*/ - if (req->r_resend_mds >= 0 && - ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0) - return req->r_resend_mds; - - /* pick one at random */ - return ceph_mdsmap_get_random_mds(mdsc->mdsmap); -} - -static void register_session(struct ceph_mds_client *mdsc, int mds) -{ - /* register */ - if (mds >= mdsc->max_sessions) { - /* realloc */ - } - mdsc->session[mds] = kmalloc(sizeof(struct ceph_mds_session)); - mdsc->session[mds]->s_open = 0; - init_completion(&mdsc->session[mds]->s_completion); - mdsc->session[mds]->s_ref = ATOMIC_INIT(1); -} - -static void open_session(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, int mds) -{ - struct ceph_message *msg; - - /* connect */ - if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { - ceph_monc_request_mdsmap(mdsc->mon_client, mdsc->mdsmap->m_epoch); /* race fixme */ - return; - } - - /* prepareconnect message */ - - /* send */ - ceph_kmsg_send(mdsc->kmessenger, msg, ceph_mdsmap_get_addr(mdsc->mdsmap, mds); -} - -static void wait_for_new_map(struct ceph_mds_client *mdsc) -{ - if (mdsc->last_requested_map < mdsc->mdsmap->m_epoch) - ceph_monc_request_mdsmap(mdsc->client->monc, mdsc->mdsmap->m_epoch); - - wait_for_completion(&mdsc->map_waiters); -} - -/* exported functions */ - -void ceph_mdsc_init(struct ceph_mds_client *mdsc, - struct ceph_kmessenger *kmessenger) -{ - mdsc->kmessenger = kmessenger; - mdsc->mdsmap = 0; /* none yet */ - mdsc->sessions = 0; - mdsc->max_sessions = 0; - mdsc->last_tid = 0; - INIT_RADIX_TREE(&mdsc->request_tree); - init_completion(&mdsc->map_waiters); -} - - -struct ceph_message * -ceph_mdsc_make_request(struct ceph_mds_client *mdsc, struct ceph_message *msg, int mds) -{ - struct ceph_mds_request *req; - struct ceph_mds_session *session; - struct ceph_message *reply = 0; - int mds; - - spin_lock(&mdsc->lock); - req = register_request(mdsc, msg, mds); - -retry: - mds = choose_mds(mdsc, req); - if (mds < 0) { - /* wait for new mdsmap */ - 
spin_unlock(&mdsc->lock); - wait_for_new_map(mdsc); - spin_lock(&mdsc->lock); - goto retry; - } - - /* get session */ - if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == 0) - register_session(mdsc, mds); - session = mdsc->sessions[mds]; - get_session(session); - - /* open? */ - if (mdsc->sessions[mds]->s_state == CEPH_MDS_SESSION_IDLE) - open_session(session); - if (mdsc->sessions[mds]->s_state != CEPH_MDS_OPEN) { - /* wait for session to open (or fail, or close) */ - spin_unlock(&mdsc->lock); - wait_for_completion(&session->s_completion); - put_session(session); - spin_lock(&mdsc->lock); - goto retry; - } - put_session(session); - - /* make request? */ - if (req->r_num_mds < 4) { - req->r_mds[req->r_num_mds++] = mds; - req->r_resend_mds = -1; /* forget any specific mds hint */ - req->r_attempts++; - ceph_kmsg_send(mdsc->kmessenger, req->r_request, ceph_mdsmap_get_addr(mds)); - } - - /* wait */ - spin_unlock(&mdsc->lock); - wait_for_completion(&req->r_completion); - - if (!req->r_reply) - goto retry_locked; - reply = req->r_reply; - - spin_lock(&mdsc->lock); - unregister_request(req); - spin_unlock(&mdsc->lock); - - put_request(req); - - return reply; -} - - -void ceph_mdsc_handle_reply(struct ceph_mds_client *mdsc, struct ceph_message *msg) -{ - struct ceph_mds_request *req; - __u64 tid; - - /* parse reply */ - - spin_lock(&mdsc->lock); - req = radix_tree_lookup(&mdsc->request_tree, tid); - if (!req) { - spin_unlock(&mdsc->lock); - return; - } - - get_request(req); - BUG_ON(req->m_reply); - req->r_reply = msg; - spin_unlock(&mdsc->lock); - - complete(&req->r_complete); - put_request(req); -} - -void ceph_mdsc_handle_forward(struct ceph_mds_client *mdsc, struct ceph_message *msg) -{ - int next_mds; - int fwd_seq; - __u64 tid; - - /* parse reply */ - - spin_lock(&mdsc->lock); - req = radix_tree_lookup(&mdsc->request_tree, tid); - if (req) get_request(req); - if (!req) { - spin_unlock(&mdsc->lock); - return; /* dup reply? 
*/ - } - - /* do we have a session with the dest mds? */ - if (next_mds < mdsc->max_sessions && - mdsc->sessions[next_mds] && - mdsc->sessions[next_mds]->open) { - /* yes. adjust mds set */ - if (fwd_seq > req->r_num_fwd) { - req->r_num_fwd = fwd_seq; - req->r_resend_mds = next_mds; - req->r_num_mds = 1; - req->r_mds[0] = msg->header.src.num; - } - spin_unlock(&mdsc->lock); - } else { - /* no, resend. */ - BUG_ON(fwd_seq <= req->r_num_fwd); /* forward race not possible; mds would drop */ - - req->r_num_mds = 0; - req->resend_mds = next_mds; - spin_unlock(&mdsc->r_lock); - complete(&req->r_complete); - } - - put_request(req); - ceph_put_msg(msg); -} - - -void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, - struct ceph_message *msg) -{ - struct ceph_bufferlist_iterator bli; - __u64 epoch; - __u32 left; - - ceph_bl_iterator_init(&bli); - epoch = ceph_bl_decode_u64(&msg->payload, &bli); - left = ceph_bl_decode_u32(&msg->payload, &bli); - - printk("ceph_mdsc_handle_map epoch %ld\n", epoch); - - spin_lock(&mdsc->lock); - if (epoch > mdsc->mdsmap->m_epoch) { - ceph_mdsmap_decode(mdsc->mdsmap, &msg->payload, bli); - spin_unlock(&mdsc->lock); - complete(&mdsc->waiting_for_map); - } else { - spin_unlock(&mdsc->lock); - } - - ceph_put_msg(msg); -} diff --git a/branches/sage/ebofs2/kernel/mds_client.h b/branches/sage/ebofs2/kernel/mds_client.h deleted file mode 100644 index 355ff4ae689e4..0000000000000 --- a/branches/sage/ebofs2/kernel/mds_client.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef _FS_CEPH_MDS_CLIENT_H -#define _FS_CEPH_MDS_CLIENT_H - -#include -#include -#include -#include - -#include "kmsg.h" - -/* - * state associated with an individual MDS<->client session - */ -enum { - CEPH_MDS_SESSION_IDLE, - CEPH_MDS_SESSION_OPENING, - CEPH_MDS_SESSION_OPEN, - CEPH_MDS_SESSION_CLOSING -}; -struct ceph_mds_session { - int s_state; - __u64 s_cap_seq; /* cap message count from mds */ - atomic_t s_ref; - struct completion s_completion; -}; - -struct ceph_mds_request { - __u64 
r_tid; - struct ceph_message *r_request; - struct ceph_message *r_reply; - - __u32 r_mds[4]; /* set of mds's with whom request may be outstanding */ - int r_num_mds; /* items in r_mds */ - - int r_attempts; - int r_num_fwd; /* number of forward attempts */ - int r_resend_mds; /* mds to resend to next, if any*/ - - atomic_t r_ref; - struct completion r_completion; -}; - - -struct ceph_mds_client { - spinlock_t lock; - - struct ceph_kmessenger *kmessenger; - struct ceph_mdsmap *mdsmap; /* mds map */ - - /* mds sessions */ - struct ceph_mds_session **sessions; /* NULL if no session */ - int max_sessions; /* size of s_mds_sessions array */ - - __u64 last_tid; /* id of last mds request */ - struct radix_tree_root request_tree; /* pending mds requests */ - - __u64 last_requested_map; - struct completion map_waiters; -}; - -extern void ceph_mdsc_init(struct ceph_mds_client *mdsc, - struct ceph_kmessenger *kmessenger); -extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct ceph_message *msg, int mds); -extern void ceph_mdsc_got_reply(struct ceph_mds_client *mdsc, __u64 tid); -extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_message *msg); - -#endif diff --git a/branches/sage/ebofs2/kernel/mdsmap.c b/branches/sage/ebofs2/kernel/mdsmap.c deleted file mode 100644 index 93e26f57fb679..0000000000000 --- a/branches/sage/ebofs2/kernel/mdsmap.c +++ /dev/null @@ -1,96 +0,0 @@ - -#include "mdsmap.h" -#include - -int ceph_mdsmap_get_state(ceph_mdsmap *m, int w) -{ - BUG_ON(w < 0); - if (w >= m->m_max_mds) - return CEPH_MDS_STATE_DNE; - return = m->m_state[w]; -} - -int ceph_mdsmap_get_random_mds(ceph_mdsmap *m) -{ - int n = 0; - int i; - - /* count */ - for (i=0; im_max_mds; i++) - if (m->m_state > 0) n++; - if (n == 0) - return -1; - - /* pick */ - n = get_random_int() % n; - i = 0; - for (i=0; n>0; i++, n--) - while (m->state[i] <= 0) i++; - - return i; -} - - -struct ceph_entity_addr *ceph_mdsmap_get_addr(ceph_mdsmap *m, int w) -{ - if 
(w >= m->m_max_mds) - return NULL; - return m->m_addr[w]; -} - -int ceph_mdsmap_decode(struct ceph_mdsmap *m, - struct ceph_bufferlist *bl, - struct ceph_bufferlist_iterator *bli) -{ - int i, n; - __u32 mds; - struct ceph_entity_inst *inst; - - m->m_epoch = ceph_bl_decode_u64(bl, bli); - ceph_bl_decode_u32(bl, bli); /* target_num */ - m->m_created.tv_sec = ceph_bl_decode_u32(bl, bli); - m->m_created.tv_usec = ceph_bl_decode_u32(bl, bli); - ceph_bl_decode_u64(bl, bli); /* same_in_set_since */ - m->m_anchortable = ceph_bl_decode_s32(bl, bli); - m->m_root = ceph_bl_decode_s32(bl, bli); - m->m_max_mds = ceph_bl_decode_u32(bl, bli); - - m->m_addr = kmalloc(sizeof(struct ceph_entity_addr)*m->m_max_mds, GFP_KERNEL); - m->m_state = kmalloc(sizeof(__u8)*m->m_max_mds, GFP_KERNEL); - memset(m->m_state, 0, sizeof(__u8)*m->m_max_mds); - - /* created */ - n = ceph_bl_decode_u32(bl, bli); - ceph_bl_iterator_advance(bli, n*sizeof(__u32)); - - /* state */ - n = ceph_bl_decode_u32(bl, bli); - for (i=0; im_state[mds] = ceph_bl_decode_s32(bl, bli); - } - - /* state_seq */ - n = ceph_bl_decode_u32(bl, bli); - ceph_bl_iterator_advance(bli, n*2*sizeof(__u32)); - - /* mds_inst */ - n = ceph_bl_decode_u32(bl, bli); - for (i=0; im_addr[mds].nonce = ceph_bl_decode_u64(bl, bli); - m->m_addr[mds].port = ceph_bl_decode_u32(bl, bli); - m->m_addr[mds].ipq[0] = ceph_bl_decode_u8(bl, bli); - m->m_addr[mds].ipq[1] = ceph_bl_decode_u8(bl, bli); - m->m_addr[mds].ipq[2] = ceph_bl_decode_u8(bl, bli); - m->m_addr[mds].ipq[3] = ceph_bl_decode_u8(bl, bli); - } - - /* mds_inc */ - - return 0; -} - - diff --git a/branches/sage/ebofs2/kernel/mdsmap.h b/branches/sage/ebofs2/kernel/mdsmap.h deleted file mode 100644 index da620b99c89ca..0000000000000 --- a/branches/sage/ebofs2/kernel/mdsmap.h +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef _FS_CEPH_MDSMAP_H -#define _FS_CEPH_MDSMAP_H - -#include - -/* see mds/MDSMap.h */ -#define CEPH_MDS_STATE_DNE 0 /* down, never existed. 
*/ -#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees. empty log. */ -#define CEPH_MDS_STATE_FAILED 2 /* down, active subtrees needs to be recovered. */ - -#define CEPH_MDS_STATE_BOOT -3 /* up, boot announcement. destiny unknown. */ -#define CEPH_MDS_STATE_STANDBY -4 /* up, idle. waiting for assignment by monitor. */ -#define CEPH_MDS_STATE_CREATING -5 /* up, creating MDS instance (new journal, idalloc..). */ -#define CEPH_MDS_STATE_STARTING -6 /* up, starting prior stopped MDS instance. */ - -#define CEPH_MDS_STATE_REPLAY 7 /* up, starting prior failed instance. scanning journal. */ -#define CEPH_MDS_STATE_RESOLVE 8 /* up, disambiguating distributed operations (import, rename, etc.) */ -#define CEPH_MDS_STATE_RECONNECT 9 /* up, reconnect to clients */ -#define CEPH_MDS_STATE_REJOIN 10 /* up, replayed journal, rejoining distributed cache */ -#define CEPH_MDS_STATE_ACTIVE 11 /* up, active */ -#define CEPH_MDS_STATE_STOPPING 12 /* up, exporting metadata (-> standby or out) */ - -/* - * mds map - * - * fields limited to those the client cares about - */ -struct ceph_mdsmap { - __u64 m_epoch; - struct ceph_timeval m_created; - __u32 m_anchortable; - __u32 m_root; - __u32 m_max_mds; /* size of m_addr, m_state arrays */ - struct ceph_entity_addr *m_addr; /* array of addresses */ - __u8 *m_state; /* array of states */ -}; - -extern int ceph_mdsmap_get_random_mds(ceph_mdsmap *m); -extern int ceph_mdsmap_get_state(ceph_mdsmap *m, int w); -extern struct ceph_entity_addr *ceph_mdsmap_get_addr(ceph_mdsmap *m, int w); - -extern int ceph_mdsmap_decode(struct ceph_mdsmap *m, - struct ceph_bufferlist *bl, - struct ceph_bufferlist_iterator *bli); - -#endif diff --git a/branches/sage/ebofs2/kernel/messenger.c b/branches/sage/ebofs2/kernel/messenger.c deleted file mode 100644 index b980f10ff0032..0000000000000 --- a/branches/sage/ebofs2/kernel/messenger.c +++ /dev/null @@ -1,60 +0,0 @@ -#include -#include -#include -#include -#include "kmsg.h" - -/* note: early 
stages, doesn't build... */ -extern struct ceph_message *ceph_read_message() -{ - int ret; - int received = 0; - kvec *iov = message->payload->b_kv; - - while (received < len) { - _krecvmsg(socket, iov->iov_base, iov->iov_len); - } -} - -extern int ceph_send_message(struct ceph_message *message) -{ - int ret; - int sent = 0; - int len = message->bufferlist->b_kvlen; - kvec *iov = message->payload->b_kv; - - /* while (num left to send > 0) { */ - while (sent < len) { - ret = _ksendmsg(socket, iov->iov_base + sent, iov->iov_len, len - sent); - sent += ret; - } - return sent; -} - -struct ceph_accepter ceph_accepter_init() -{ - struct socket *sd; - struct sockaddr saddr; - - memset(&saddr, 0, sizeof(saddr)); - /* if .ceph.hosts file get host info from file */ - /* make my address from user specified address, fill in saddr */ - sd = _klisten(&saddr); -} - -void ceph_dispatch(ceph_message *msg) -{ -} - -void make_addr(struct sockaddr *saddr, struct ceph_entity_addr *v) -{ - struct sockaddr_in *in_addr = (struct sockaddr_in *)saddr; - - memset(in_addr,0,sizof(in_addr)); - in_addr.sin_family = AF_INET; - memcpy((char*)in_addr.sin_addr.s_addr, (char*)v.ipq, 4); - in_addr.sin_port = htons(v.port); -} -void set_addr() -{ -} diff --git a/branches/sage/ebofs2/kernel/mon_client.h b/branches/sage/ebofs2/kernel/mon_client.h deleted file mode 100644 index ae1a4394265ba..0000000000000 --- a/branches/sage/ebofs2/kernel/mon_client.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef _FS_CEPH_MON_CLIENT_H -#define _FS_CEPH_MON_CLIENT_H - - -struct ceph_mon_client { - - -}; - - -extern ceph_monc_request_mdsmap(struct ceph_mon_client *monc, epoch_t have); -extern ceph_monc_request_osdmap(struct ceph_mon_client *monc, epoch_t have); -extern ceph_monc_request_mount(struct ceph_mon_client *monc); -extern ceph_monc_request_umount(struct ceph_mon_client *monc); -extern ceph_monc_report_failure(struct ceph_mon_client *monc, struct entity_inst_t who); - -#endif diff --git 
a/branches/sage/ebofs2/kernel/monmap.h b/branches/sage/ebofs2/kernel/monmap.h deleted file mode 100644 index 2f60c8a0c3436..0000000000000 --- a/branches/sage/ebofs2/kernel/monmap.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef _FS_CEPH_MONMAP_H -#define _FS_CEPH_MONMAP_H - -#include - -/* - * monitor map - */ -struct ceph_monmap { - __u64 m_epoch; - __u32 m_num_mon; - __u32 m_last_mon; - struct ceph_entity_inst m_mon_inst; -}; - -extern int ceph_monmap_pick_mon(struct ceph_monmap *m); -extern int ceph_monmap_decode(struct ceph_monmap *m, struct kvec *v); - -#endif diff --git a/branches/sage/ebofs2/kernel/osd_client.h b/branches/sage/ebofs2/kernel/osd_client.h deleted file mode 100644 index 123c07374536a..0000000000000 --- a/branches/sage/ebofs2/kernel/osd_client.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef _FS_CEPH_OSD_CLIENT_H -#define _FS_CEPH_OSD_CLIENT_H - -/* this will be equivalent to osdc/Objecter.h */ - -/* do these later -#include "osdmap.h" -*/ -struct ceph_osdmap; - - -struct ceph_osd_client { - struct ceph_osdmap *s_osdmap; /* osd map */ - -}; - -#endif diff --git a/branches/sage/ebofs2/kernel/super.h b/branches/sage/ebofs2/kernel/super.h deleted file mode 100644 index 94418511ffa53..0000000000000 --- a/branches/sage/ebofs2/kernel/super.h +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef _FS_CEPH_CEPH_H -#define _FS_CEPH_CEPH_H - -/* #include */ - -#include "kmsg.h" -#include "monmap.h" -#include "mds_client.h" -#include "osd_client.h" - - - -/* - * CEPH per-filesystem client state - * - * possibly shared by multiple mount points, if they are - * mounting the same ceph filesystem/cluster. - */ -struct ceph_fs_client { - __u64 s_fsid; /* hmm this should be part of the monmap? 
*/ - - __u32 s_whoami; /* my client number */ - struct ceph_kmsg *s_kmsg; /* messenger instance */ - - struct ceph_monmap *s_monmap; /* monitor map */ - - struct ceph_mds_client *s_mds_client; - struct ceph_osd_client *s_osd_client; - - int s_ref; /* reference count (for each sb_info that points to me) */ -}; - -/* - * directory of filesystems mounted by this host - * - * key: fsid? ipquad of monitor? hmm! - * value: struct ceph_fs_client* - */ -extern struct radix_tree ceph_fs_clients; - - -/* - * CEPH per-mount superblock info - */ -struct ceph_sb_info { - struct ceph_fs_client *sb_client; - - /* FIXME: add my relative offset into the filesystem, - so we can appropriately mangle/adjust path names in requests, etc. */ -}; - -/* - * CEPH file system in-core inode info - */ -struct ceph_inode_info { - struct ceph_file_layout i_layout; - struct inode vfs_inode; -}; - -static inline struct ceph_inode_info *CEPH_I(struct inode *inode) -{ - return list_entry(inode, struct ceph_inode_info, vfs_inode); -} - - -/* file.c */ -extern const struct inode_operations ceph_file_inops; -extern const struct file_operations ceph_file_operations; -extern const struct address_space_operations ceph_aops; - -/* dir.c */ -extern const struct inode_operations ceph_dir_inops; -extern const struct file_operations ceph_dir_operations; - -#endif /* _FS_CEPH_CEPH_H */ diff --git a/branches/sage/ebofs2/mds/AnchorClient.h b/branches/sage/ebofs2/mds/AnchorClient.h deleted file mode 100644 index fd790f39c399d..0000000000000 --- a/branches/sage/ebofs2/mds/AnchorClient.h +++ /dev/null @@ -1,107 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. 
See file COPYING. - * - */ - -#ifndef __ANCHORCLIENT_H -#define __ANCHORCLIENT_H - -#include -using std::vector; -#include -using __gnu_cxx::hash_map; - -#include "include/types.h" -#include "msg/Dispatcher.h" - -#include "Anchor.h" - -class Context; -class MDS; -class LogSegment; - -class AnchorClient : public Dispatcher { - MDS *mds; - - // lookups - struct _pending_lookup { - vector *trace; - Context *onfinish; - }; - hash_map pending_lookup; - - // prepares - struct _pending_prepare { - vector trace; - Context *onfinish; - version_t *patid; // ptr to atid - }; - hash_map pending_create_prepare; - hash_map pending_destroy_prepare; - hash_map pending_update_prepare; - - // pending commits - map pending_commit; - map > ack_waiters; - - void handle_anchor_reply(class MAnchor *m); - - class C_LoggedAck : public Context { - AnchorClient *ac; - version_t atid; - public: - C_LoggedAck(AnchorClient *a, version_t t) : ac(a), atid(t) {} - void finish(int r) { - ac->_logged_ack(atid); - } - }; - void _logged_ack(version_t atid); - -public: - AnchorClient(MDS *m) : mds(m) {} - - void dispatch(Message *m); - - // async user interface - void lookup(inodeno_t ino, vector& trace, Context *onfinish); - - void prepare_create(inodeno_t ino, vector& trace, version_t *atid, Context *onfinish); - void prepare_destroy(inodeno_t ino, version_t *atid, Context *onfinish); - void prepare_update(inodeno_t ino, vector& trace, version_t *atid, Context *onfinish); - - void commit(version_t atid, LogSegment *ls); - - // for recovery (by other nodes) - void handle_mds_recovery(int mds); // called when someone else recovers - - void resend_commits(); - void resend_prepares(hash_map& prepares, int op); - - // for recovery (by me) - void got_journaled_agree(version_t atid, LogSegment *ls) { - pending_commit[atid] = ls; - } - void got_journaled_ack(version_t atid) { - pending_commit.erase(atid); - } - bool has_committed(version_t atid) { - return pending_commit.count(atid) == 0; - } - void 
wait_for_ack(version_t atid, Context *c) { - ack_waiters[atid].push_back(c); - } - void finish_recovery(); // called when i recover and go active - - -}; - -#endif diff --git a/branches/sage/ebofs2/mds/AnchorTable.h b/branches/sage/ebofs2/mds/AnchorTable.h deleted file mode 100644 index 64a2002ba7c85..0000000000000 --- a/branches/sage/ebofs2/mds/AnchorTable.h +++ /dev/null @@ -1,127 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __ANCHORTABLE_H -#define __ANCHORTABLE_H - -#include "Anchor.h" -#include "include/Context.h" - -#include -using namespace __gnu_cxx; - -class MDS; -class MAnchor; - -class AnchorTable { - MDS *mds; - - // keep the entire table in memory. - hash_map anchor_map; - - // uncommitted operations - map pending_reqmds; - map pending_create; - map pending_destroy; - map > > pending_update; - - version_t version; // this includes anchor_map AND pending_* state. - version_t committing_version; - version_t committed_version; - - // load/save state - bool opening, opened; - - // waiters - list waiting_for_open; - map > waiting_for_save; - -protected: - - // basic updates - bool add(inodeno_t ino, dirfrag_t dirfrag); - void inc(inodeno_t ino); - void dec(inodeno_t ino); - - // mid-level - void create_prepare(inodeno_t ino, vector& trace, int reqmds); - void destroy_prepare(inodeno_t ino, int reqmds); - void update_prepare(inodeno_t ino, vector& trace, int reqmds); - void commit(version_t atid); - void rollback(version_t atid); - friend class EAnchor; // used for journal replay. 
- - // high level interface - void handle_lookup(MAnchor *req); - - void handle_create_prepare(MAnchor *req); - void _create_prepare_logged(MAnchor *req, version_t atid); - friend class C_AT_CreatePrepare; - - void handle_destroy_prepare(MAnchor *req); - void _destroy_prepare_logged(MAnchor *req, version_t atid); - friend class C_AT_DestroyPrepare; - - void handle_update_prepare(MAnchor *req); - void _update_prepare_logged(MAnchor *req, version_t atid); - friend class C_AT_UpdatePrepare; - - void handle_commit(MAnchor *req); - void _commit_logged(MAnchor *req); - friend class C_AT_Commit; - - void handle_rollback(MAnchor *req); - - // messages - void handle_anchor_request(MAnchor *m); - - void dump(); - -public: - AnchorTable(MDS *m) : - mds(m), - version(0), committing_version(0), committed_version(0), - opening(false), opened(false) { } - - void dispatch(class Message *m); - - version_t get_version() { return version; } - version_t get_committed_version() { return committed_version; } - - void create_fresh() { - // reset (i.e. on mkfs) to empty, but unsaved table. - version = 1; - opened = true; - opening = false; - anchor_map.clear(); - pending_create.clear(); - pending_destroy.clear(); - pending_update.clear(); - } - - // load/save entire table for now! 
- void save(Context *onfinish); - void _saved(version_t v); - void load(Context *onfinish); - void _loaded(bufferlist& bl); - - // recovery - void handle_mds_recovery(int who); - void finish_recovery(); - void resend_agree(version_t v, int who); - -}; - -#endif diff --git a/branches/sage/ebofs2/mds/CDentry.cc b/branches/sage/ebofs2/mds/CDentry.cc deleted file mode 100644 index 2b6bb3470e8a8..0000000000000 --- a/branches/sage/ebofs2/mds/CDentry.cc +++ /dev/null @@ -1,365 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "CDentry.h" -#include "CInode.h" -#include "CDir.h" -#include "Anchor.h" - -#include "MDS.h" -#include "MDCache.h" -#include "LogSegment.h" - -#include "messages/MLock.h" - -#include - -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " mds" << dir->cache->mds->get_nodeid() << ".cache.den(" << dir->dirfrag() << " " << name << ") " - - - -ostream& CDentry::print_db_line_prefix(ostream& out) -{ - return out << g_clock.now() << " mds" << dir->cache->mds->get_nodeid() << ".cache.den(" << dir->ino() << " " << name << ") "; -} - - -// CDentry - -ostream& operator<<(ostream& out, CDentry& dn) -{ - string path; - dn.make_path(path); - - out << "[dentry " << path; - - if (dn.is_auth()) { - out << " auth"; - if (dn.is_replicated()) - out << dn.get_replicas(); - } else { - out << " rep@" << dn.authority(); - out << "." 
<< dn.get_replica_nonce(); - assert(dn.get_replica_nonce() >= 0); - } - - if (dn.is_null()) out << " NULL"; - if (dn.is_remote()) { - out << " REMOTE("; - switch (dn.get_remote_d_type()) { - case inode_t::DT_REG: out << "reg"; break; - case inode_t::DT_DIR: out << "dir"; break; - case inode_t::DT_LNK: out << "lnk"; break; - default: assert(0); - } - out << ")"; - } - - out << " " << dn.lock; - - out << " v=" << dn.get_version(); - out << " pv=" << dn.get_projected_version(); - - out << " inode=" << dn.get_inode(); - - if (dn.is_new()) out << " state=new"; - - if (dn.get_num_ref()) { - out << " |"; - dn.print_pin_set(out); - } - - out << " " << &dn; - out << "]"; - return out; -} - - -bool operator<(const CDentry& l, const CDentry& r) -{ - if (l.get_dir()->ino() < r.get_dir()->ino()) return true; - if (l.get_dir()->ino() == r.get_dir()->ino() && - l.get_name() < r.get_name()) return true; - return false; -} - - -void CDentry::print(ostream& out) -{ - out << *this; -} - - -inodeno_t CDentry::get_ino() -{ - if (inode) - return inode->ino(); - return inodeno_t(); -} - - -pair CDentry::authority() -{ - return dir->authority(); -} - - -void CDentry::add_waiter(int tag, Context *c) -{ - // wait on the directory? 
- if (tag & (WAIT_UNFREEZE|WAIT_SINGLEAUTH)) { - dir->add_waiter(tag, c); - return; - } - MDSCacheObject::add_waiter(tag, c); -} - - -version_t CDentry::pre_dirty(version_t min) -{ - projected_version = dir->pre_dirty(min); - dout(10) << " pre_dirty " << *this << dendl; - return projected_version; -} - - -void CDentry::_mark_dirty(LogSegment *ls) -{ - // state+pin - if (!state_test(STATE_DIRTY)) { - state_set(STATE_DIRTY); - dir->inc_num_dirty(); - get(PIN_DIRTY); - assert(ls); - } - if (ls) - ls->dirty_dentries.push_back(&xlist_dirty); -} - -void CDentry::mark_dirty(version_t pv, LogSegment *ls) -{ - dout(10) << " mark_dirty " << *this << dendl; - - // i now live in this new dir version - assert(pv <= projected_version); - version = pv; - _mark_dirty(ls); - - // mark dir too - dir->mark_dirty(pv, ls); -} - - -void CDentry::mark_clean() -{ - dout(10) << " mark_clean " << *this << dendl; - assert(is_dirty()); - assert(dir->get_version() == 0 || version <= dir->get_version()); // hmm? - - // state+pin - state_clear(STATE_DIRTY); - dir->dec_num_dirty(); - put(PIN_DIRTY); - - xlist_dirty.remove_myself(); - - if (state_test(STATE_NEW)) - state_clear(STATE_NEW); -} - -void CDentry::mark_new() -{ - dout(10) << " mark_new " << *this << dendl; - state_set(STATE_NEW); -} - -void CDentry::make_path(string& s) -{ - if (dir) { - dir->inode->make_path(s); - } else { - s = "???"; - } - s += "/"; - s += name; -} - -void CDentry::make_path(string& s, inodeno_t tobase) -{ - assert(dir); - - if (dir->inode->is_root()) { - s += "/"; // make it an absolute path (no matter what) if we hit the root. - } - else if (dir->inode->get_parent_dn() && - dir->inode->ino() != tobase) { - dir->inode->get_parent_dn()->make_path(s, tobase); - s += "/"; - } - s += name; -} - -/** make_anchor_trace - * construct an anchor trace for this dentry, as if it were linked to *in. 
- */ -void CDentry::make_anchor_trace(vector& trace, CInode *in) -{ - // start with parent dir inode - if (dir) - dir->inode->make_anchor_trace(trace); - - // add this inode (in my dirfrag) to the end - trace.push_back(Anchor(in->ino(), dir->dirfrag())); - dout(10) << "make_anchor_trace added " << trace.back() << dendl; -} - - - -void CDentry::link_remote(CInode *in) -{ - assert(is_remote()); - assert(in->ino() == remote_ino); - - inode = in; - in->add_remote_parent(this); -} - -void CDentry::unlink_remote() -{ - assert(is_remote()); - assert(inode); - - inode->remove_remote_parent(this); - inode = 0; -} - - -CDentryDiscover *CDentry::replicate_to(int who) -{ - int nonce = add_replica(who); - return new CDentryDiscover(this, nonce); -} - - -// ---------------------------- -// auth pins - -bool CDentry::can_auth_pin() -{ - assert(dir); - return dir->can_auth_pin(); -} - -void CDentry::auth_pin() -{ - if (auth_pins == 0) - get(PIN_AUTHPIN); - auth_pins++; - - dout(10) << "auth_pin on " << *this - << " now " << auth_pins << "+" << nested_auth_pins - << dendl; - - dir->adjust_nested_auth_pins(1); -} - -void CDentry::auth_unpin() -{ - auth_pins--; - if (auth_pins == 0) - put(PIN_AUTHPIN); - - dout(10) << "auth_unpin on " << *this - << " now " << auth_pins << "+" << nested_auth_pins - << dendl; - assert(auth_pins >= 0); - - dir->adjust_nested_auth_pins(-1); -} - -void CDentry::adjust_nested_auth_pins(int by) -{ - nested_auth_pins += by; - - dout(15) << "adjust_nested_auth_pins by " << by - << " now " << auth_pins << "+" << nested_auth_pins - << dendl; - assert(nested_auth_pins >= 0); - - dir->adjust_nested_auth_pins(by); -} - - -// ---------------------------- -// locking - -void CDentry::set_object_info(MDSCacheObjectInfo &info) -{ - info.dirfrag = dir->dirfrag(); - info.dname = name; -} - -void CDentry::encode_lock_state(int type, bufferlist& bl) -{ - // null, ino, or remote_ino? 
- int c; - if (is_primary()) { - c = 1; - ::_encode(c, bl); - ::_encode(inode->inode.ino, bl); - } - else if (is_remote()) { - c = 2; - ::_encode(c, bl); - ::_encode(remote_ino, bl); - } - else if (is_null()) { - // encode nothing. - } - else assert(0); -} - -void CDentry::decode_lock_state(int type, bufferlist& bl) -{ - if (bl.length() == 0) { - // null - assert(is_null()); - return; - } - - int off = 0; - char c; - inodeno_t ino; - ::_decode(c, bl, off); - - switch (c) { - case 1: - case 2: - _decode(ino, bl, off); - // newly linked? - if (is_null() && !is_auth()) { - // force trim from cache! - dout(10) << "decode_lock_state replica dentry null -> non-null, must trim" << dendl; - //assert(get_num_ref() == 0); - } else { - // verify? - - } - break; - default: - assert(0); - } -} diff --git a/branches/sage/ebofs2/mds/CDentry.h b/branches/sage/ebofs2/mds/CDentry.h deleted file mode 100644 index 416792beb8778..0000000000000 --- a/branches/sage/ebofs2/mds/CDentry.h +++ /dev/null @@ -1,323 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __CDENTRY_H -#define __CDENTRY_H - -#include -#include -#include -using namespace std; - -#include "include/types.h" -#include "include/buffer.h" -#include "include/lru.h" -#include "include/xlist.h" -#include "mdstypes.h" - -#include "SimpleLock.h" - -class CInode; -class CDir; -class MDRequest; - -class Message; -class CDentryDiscover; -class Anchor; - -class CDentry; -class LogSegment; - - -// define an ordering -bool operator<(const CDentry& l, const CDentry& r); - -// dentry -class CDentry : public MDSCacheObject, public LRUObject { - public: - // -- state -- - static const int STATE_NEW = 1; - static const int STATE_FRAGMENTING = 2; - - // -- pins -- - static const int PIN_INODEPIN = 1; // linked inode is pinned - static const int PIN_FRAGMENTING = -2; // containing dir is refragmenting - const char *pin_name(int p) { - switch (p) { - case PIN_INODEPIN: return "inodepin"; - case PIN_FRAGMENTING: return "fragmenting"; - default: return generic_pin_name(p); - } - }; - - // -- wait -- - static const int WAIT_LOCK_OFFSET = 8; - - void add_waiter(int tag, Context *c); - - static const int EXPORT_NONCE = 1; - - bool is_lt(const MDSCacheObject *r) const { - return *this < *(CDentry*)r; - } - - protected: - string name; - - inodeno_t remote_ino; // if remote dentry - unsigned char remote_d_type; - - CInode *inode; // linked inode (if any) - CDir *dir; // containing dirfrag - - version_t version; // dir version when last touched. - version_t projected_version; // what it will be when i unlock/commit. 
- - xlist::item xlist_dirty; - - off_t dir_offset; - - int auth_pins, nested_auth_pins; - - friend class Migrator; - friend class Locker; - friend class Renamer; - friend class Server; - friend class MDCache; - friend class MDS; - friend class CInode; - friend class C_MDC_XlockRequest; - - -public: - // lock - SimpleLock lock; - - - - public: - // cons - CDentry() : - remote_ino(0), remote_d_type(0), - inode(0), dir(0), - version(0), projected_version(0), - xlist_dirty(this), - dir_offset(0), - auth_pins(0), nested_auth_pins(0), - lock(this, LOCK_OTYPE_DN, WAIT_LOCK_OFFSET) { } - CDentry(const string& n, CInode *in) : - name(n), - remote_ino(0), remote_d_type(0), - inode(in), dir(0), - version(0), projected_version(0), - xlist_dirty(this), - dir_offset(0), - auth_pins(0), nested_auth_pins(0), - lock(this, LOCK_OTYPE_DN, WAIT_LOCK_OFFSET) { } - CDentry(const string& n, inodeno_t ino, unsigned char dt, CInode *in=0) : - name(n), - remote_ino(ino), remote_d_type(dt), - inode(in), dir(0), - version(0), projected_version(0), - xlist_dirty(this), - dir_offset(0), - auth_pins(0), nested_auth_pins(0), - lock(this, LOCK_OTYPE_DN, WAIT_LOCK_OFFSET) { } - - CInode *get_inode() const { return inode; } - CDir *get_dir() const { return dir; } - const string& get_name() const { return name; } - inodeno_t get_ino(); - - off_t get_dir_offset() { return dir_offset; } - void set_dir_offset(off_t o) { dir_offset = o; } - void clear_dir_offset() { dir_offset = 0; } - - inodeno_t get_remote_ino() { return remote_ino; } - unsigned char get_remote_d_type() { return remote_d_type; } - void set_remote(inodeno_t ino, unsigned char d_type) { - remote_ino = ino; - remote_d_type = d_type; - } - - // ref counts: pin ourselves in the LRU when we're pinned. 
- void first_get() { - lru_pin(); - } - void last_put() { - lru_unpin(); - } - - // auth pins - bool can_auth_pin(); - void auth_pin(); - void auth_unpin(); - void adjust_nested_auth_pins(int by); - - - // dentry type is primary || remote || null - // inode ptr is required for primary, optional for remote, undefined for null - bool is_primary() { return remote_ino == 0 && inode != 0; } - bool is_remote() { return remote_ino > 0; } - bool is_null() { return (remote_ino == 0 && inode == 0) ? true:false; } - - // remote links - void link_remote(CInode *in); - void unlink_remote(); - - - // copy cons - CDentry(const CDentry& m); - const CDentry& operator= (const CDentry& right); - - // misc - void make_path(string& p); - void make_path(string& p, inodeno_t tobase); - void make_anchor_trace(vector& trace, CInode *in); - - // -- version -- - version_t get_version() { return version; } - void set_version(version_t v) { projected_version = version = v; } - version_t get_projected_version() { return projected_version; } - void set_projected_version(version_t v) { projected_version = v; } - - pair authority(); - - version_t pre_dirty(version_t min=0); - void _mark_dirty(LogSegment *ls); - void mark_dirty(version_t projected_dirv, LogSegment *ls); - void mark_clean(); - - void mark_new(); - bool is_new() { return state_test(STATE_NEW); } - - // -- replication - CDentryDiscover *replicate_to(int rep); - - - // -- exporting - // note: this assumes the dentry already exists. - // i.e., the name is already extracted... so we just need the other state. 
- void encode_export(bufferlist& bl) { - ::_encode_simple(state, bl); - ::_encode_simple(version, bl); - ::_encode_simple(projected_version, bl); - lock._encode(bl); - ::_encode_simple(replica_map, bl); - get(PIN_TEMPEXPORTING); - } - void finish_export() { - // twiddle - clear_replica_map(); - replica_nonce = EXPORT_NONCE; - state_clear(CDentry::STATE_AUTH); - if (is_dirty()) - mark_clean(); - put(PIN_TEMPEXPORTING); - } - void abort_export() { - put(PIN_TEMPEXPORTING); - } - void decode_import(bufferlist::iterator& blp, LogSegment *ls) { - int nstate; - ::_decode_simple(nstate, blp); - ::_decode_simple(version, blp); - ::_decode_simple(projected_version, blp); - lock._decode(blp); - ::_decode_simple(replica_map, blp); - - // twiddle - state = 0; - state_set(CDentry::STATE_AUTH); - if (nstate & STATE_DIRTY) - _mark_dirty(ls); - if (!replica_map.empty()) - get(PIN_REPLICATED); - } - - // -- locking -- - SimpleLock* get_lock(int type) { - assert(type == LOCK_OTYPE_DN); - return &lock; - } - void set_object_info(MDSCacheObjectInfo &info); - void encode_lock_state(int type, bufferlist& bl); - void decode_lock_state(int type, bufferlist& bl); - - - ostream& print_db_line_prefix(ostream& out); - void print(ostream& out); - - friend class CDir; -}; - -ostream& operator<<(ostream& out, CDentry& dn); - - - -class CDentryDiscover { - string dname; - int replica_nonce; - int lockstate; - off_t dir_offset; - inodeno_t remote_ino; - unsigned char remote_d_type; - -public: - CDentryDiscover() {} - CDentryDiscover(CDentry *dn, int nonce) : - dname(dn->get_name()), replica_nonce(nonce), - lockstate(dn->lock.get_replica_state()), - dir_offset(dn->get_dir_offset()), - remote_ino(dn->get_remote_ino()), remote_d_type(dn->get_remote_d_type()) { } - - string& get_dname() { return dname; } - int get_nonce() { return replica_nonce; } - bool is_remote() { return remote_ino ? 
true:false; } - inodeno_t get_remote_ino() { return remote_ino; } - unsigned char get_remote_d_type() { return remote_d_type; } - - void update_dentry(CDentry *dn) { - dn->set_dir_offset(dir_offset); - dn->set_replica_nonce(replica_nonce); - } - void init_dentry_lock(CDentry *dn) { - dn->lock.set_state( lockstate ); - } - - void _encode(bufferlist& bl) { - ::_encode(dname, bl); - ::_encode(dir_offset, bl); - ::_encode(remote_ino, bl); - ::_encode(remote_d_type, bl); - ::_encode(replica_nonce, bl); - ::_encode(lockstate, bl); - } - - void _decode(bufferlist& bl, int& off) { - ::_decode(dname, bl, off); - ::_decode(dir_offset, bl, off); - ::_decode(remote_ino, bl, off); - ::_decode(remote_d_type, bl, off); - ::_decode(replica_nonce, bl, off); - ::_decode(lockstate, bl, off); - } - -}; - - - -#endif diff --git a/branches/sage/ebofs2/mds/CDir.cc b/branches/sage/ebofs2/mds/CDir.cc deleted file mode 100644 index b4663b269c659..0000000000000 --- a/branches/sage/ebofs2/mds/CDir.cc +++ /dev/null @@ -1,1676 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#include "include/types.h" - -#include "CDir.h" -#include "CDentry.h" -#include "CInode.h" - -#include "MDS.h" -#include "MDCache.h" -#include "MDSMap.h" -#include "LogSegment.h" - -#include "include/Context.h" -#include "common/Clock.h" - -#include "osdc/Objecter.h" - -#include - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache.dir(" << this->dirfrag() << ") " - - - - -// PINS -//int cdir_pins[CDIR_NUM_PINS] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; - - - -ostream& operator<<(ostream& out, CDir& dir) -{ - string path; - dir.get_inode()->make_path(path); - out << "[dir " << dir.dirfrag() << " " << path << "/"; - if (dir.is_auth()) { - out << " auth"; - if (dir.is_replicated()) - out << dir.get_replicas(); - - out << " pv=" << dir.get_projected_version(); - out << " v=" << dir.get_version(); - out << " cv=" << dir.get_committing_version(); - out << "/" << dir.get_committed_version(); - out << "/" << dir.get_committed_version_equivalent(); - } else { - out << " rep@" << dir.authority(); - if (dir.get_replica_nonce() > 1) - out << "." 
<< dir.get_replica_nonce(); - } - - if (dir.is_rep()) out << " REP"; - - if (dir.get_dir_auth() != CDIR_AUTH_DEFAULT) { - if (dir.get_dir_auth().second == CDIR_AUTH_UNKNOWN) - out << " dir_auth=" << dir.get_dir_auth().first; - else - out << " dir_auth=" << dir.get_dir_auth(); - } - - if (dir.get_cum_auth_pins()) - out << " ap=" << dir.get_auth_pins() << "+" << dir.get_nested_auth_pins(); - - out << " state=" << dir.get_state(); - if (dir.state_test(CDir::STATE_COMPLETE)) out << "|complete"; - if (dir.state_test(CDir::STATE_FREEZINGTREE)) out << "|freezingtree"; - if (dir.state_test(CDir::STATE_FROZENTREE)) out << "|frozentree"; - //if (dir.state_test(CDir::STATE_FROZENTREELEAF)) out << "|frozentreeleaf"; - if (dir.state_test(CDir::STATE_FROZENDIR)) out << "|frozendir"; - if (dir.state_test(CDir::STATE_FREEZINGDIR)) out << "|freezingdir"; - if (dir.state_test(CDir::STATE_EXPORTBOUND)) out << "|exportbound"; - if (dir.state_test(CDir::STATE_IMPORTBOUND)) out << "|importbound"; - - out << " sz=" << dir.get_nitems() << "+" << dir.get_nnull(); - if (dir.get_num_dirty()) - out << " dirty=" << dir.get_num_dirty(); - - - if (dir.get_num_ref()) { - out << " |"; - dir.print_pin_set(out); - } - - out << " " << &dir; - return out << "]"; -} - - -void CDir::print(ostream& out) -{ - out << *this; -} - - - - -ostream& CDir::print_db_line_prefix(ostream& out) -{ - return out << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache.dir(" << this->dirfrag() << ") "; -} - - - -// ------------------------------------------------------------------- -// CDir - -CDir::CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth) : - xlist_dirty(this) -{ - inode = in; - frag = fg; - this->cache = mdcache; - - nitems = 0; - nnull = 0; - num_dirty = 0; - - state = STATE_INITIAL; - - projected_version = version = 0; - committing_version = 0; - committed_version_equivalent = committed_version = 0; - - // dir_auth - dir_auth = CDIR_AUTH_DEFAULT; - - // auth - assert(in->is_dir()); - if 
(auth) - state |= STATE_AUTH; - - auth_pins = 0; - nested_auth_pins = 0; - request_pins = 0; - - //hack_num_accessed = -1; - - dir_rep = REP_NONE; - //dir_rep = REP_ALL; // hack: to wring out some bugs! FIXME FIXME -} - - - - -/*** - * linking fun - */ - -CDentry* CDir::add_null_dentry(const string& dname) -{ - // foreign - assert(lookup(dname) == 0); - - // create dentry - CDentry* dn = new CDentry(dname, 0); - if (is_auth()) - dn->state_set(CDentry::STATE_AUTH); - cache->lru.lru_insert_mid(dn); - - dn->dir = this; - dn->version = projected_version; - - // add to dir - assert(items.count(dn->name) == 0); - //assert(null_items.count(dn->name) == 0); - - items[dn->name] = dn; - nnull++; - - dout(12) << "add_null_dentry " << *dn << dendl; - - // pin? - if (nnull + nitems == 1) get(PIN_CHILD); - - assert(nnull + nitems == items.size()); - //assert(nnull == null_items.size()); - return dn; -} - - -CDentry* CDir::add_primary_dentry(const string& dname, CInode *in) -{ - // primary - assert(lookup(dname) == 0); - - // create dentry - CDentry* dn = new CDentry(dname, 0); - if (is_auth()) - dn->state_set(CDentry::STATE_AUTH); - cache->lru.lru_insert_mid(dn); - - dn->dir = this; - dn->version = projected_version; - - // add to dir - assert(items.count(dn->name) == 0); - //assert(null_items.count(dn->name) == 0); - - items[dn->name] = dn; - link_inode_work( dn, in ); - - dout(12) << "add_primary_dentry " << *dn << dendl; - - // pin? 
- if (nnull + nitems == 1) get(PIN_CHILD); - - assert(nnull + nitems == items.size()); - //assert(nnull == null_items.size()); - return dn; -} - -CDentry* CDir::add_remote_dentry(const string& dname, inodeno_t ino, unsigned char d_type) -{ - // foreign - assert(lookup(dname) == 0); - - // create dentry - CDentry* dn = new CDentry(dname, ino, d_type); - if (is_auth()) - dn->state_set(CDentry::STATE_AUTH); - cache->lru.lru_insert_mid(dn); - - dn->dir = this; - dn->version = projected_version; - - // add to dir - assert(items.count(dn->name) == 0); - //assert(null_items.count(dn->name) == 0); - - items[dn->name] = dn; - nitems++; - - dout(12) << "add_remote_dentry " << *dn << dendl; - - // pin? - if (nnull + nitems == 1) get(PIN_CHILD); - - assert(nnull + nitems == items.size()); - //assert(nnull == null_items.size()); - return dn; -} - - - -void CDir::remove_dentry(CDentry *dn) -{ - dout(12) << "remove_dentry " << *dn << dendl; - - if (dn->inode) { - // detach inode and dentry - unlink_inode_work(dn); - } else { - // remove from null list - //assert(null_items.count(dn->name) == 1); - //null_items.erase(dn->name); - nnull--; - } - - // remove from list - assert(items.count(dn->name) == 1); - items.erase(dn->name); - - // adjust dirty counter? - if (dn->state_test(CDentry::STATE_DIRTY)) - num_dirty--; - - cache->lru.lru_remove(dn); - delete dn; - - // unpin? 
- if (nnull + nitems == 0) put(PIN_CHILD); - - assert(nnull + nitems == items.size()); - //assert(nnull == null_items.size()); -} - -void CDir::link_remote_inode(CDentry *dn, inodeno_t ino, unsigned char d_type) -{ - dout(12) << "link_remote_inode " << *dn << " remote " << ino << dendl; - assert(dn->is_null()); - - dn->set_remote(ino, d_type); - nitems++; - dn->clear_dir_offset(); - - //assert(null_items.count(dn->name) == 1); - //null_items.erase(dn->name); - nnull--; - assert(nnull + nitems == items.size()); -} - -void CDir::link_primary_inode(CDentry *dn, CInode *in) -{ - dout(12) << "link_primary_inode " << *dn << " " << *in << dendl; - assert(dn->is_null()); - - link_inode_work(dn,in); - dn->clear_dir_offset(); - - // remove from null list - //assert(null_items.count(dn->name) == 1); - //null_items.erase(dn->name); - nnull--; - - assert(nnull + nitems == items.size()); - //assert(nnull == null_items.size()); -} - -void CDir::link_inode_work( CDentry *dn, CInode *in) -{ - assert(dn->inode == 0); - dn->inode = in; - in->set_primary_parent(dn); - - nitems++; // adjust dir size - - // set inode version - //in->inode.version = dn->get_version(); - - // pin dentry? 
- if (in->get_num_ref()) - dn->get(CDentry::PIN_INODEPIN); - - // adjust auth pin count - if (in->auth_pins + in->nested_auth_pins) - dn->adjust_nested_auth_pins(in->auth_pins + in->nested_auth_pins); -} - -void CDir::unlink_inode( CDentry *dn ) -{ - if (dn->is_remote()) { - dout(12) << "unlink_inode " << *dn << dendl; - } else { - dout(12) << "unlink_inode " << *dn << " " << *dn->inode << dendl; - } - - dn->clear_dir_offset(); - unlink_inode_work(dn); - - // add to null list - //assert(null_items.count(dn->name) == 0); - //null_items[dn->name] = dn; - nnull++; - - assert(nnull + nitems == items.size()); - //assert(nnull == null_items.size()); -} - -void CDir::try_remove_unlinked_dn(CDentry *dn) -{ - assert(dn->dir == this); - assert(dn->is_null()); - assert(dn->is_dirty()); - - /* FIXME: there is a bug in this. i think new dentries are properly - identified.. e.g. maybe a dentry exists, is committed, is removed, is now - dirty+null, then reused and mistakenly considered new.. then it is removed, - we remove it here, the dir is fetched, and the dentry exists again. - - somethign like that... - */ - return; - - - // no pins (besides dirty)? - if (dn->get_num_ref() != 1) - return; - - // was the dn new? or is the dir complete (i.e. we don't need negatives)? - if (dn->is_new() || is_complete()) { - dout(10) << "try_remove_unlinked_dn " << *dn << " in " << *this << dendl; - dn->mark_clean(); - remove_dentry(dn); - - if (version == projected_version && - committing_version == committed_version && - num_dirty == 0) { - dout(10) << "try_remove_unlinked_dn committed_equivalent now " << version - << " vs committed " << committed_version - << dendl; - committed_version_equivalent = committed_version; - } - } -} - - - -void CDir::unlink_inode_work( CDentry *dn ) -{ - CInode *in = dn->inode; - - if (dn->is_remote()) { - // remote - if (in) - dn->unlink_remote(); - - dn->set_remote(0, 0); - } else { - // primary - assert(dn->is_primary()); - - // unpin dentry? 
- if (in->get_num_ref()) - dn->put(CDentry::PIN_INODEPIN); - - // unlink auth_pin count - if (in->auth_pins + in->nested_auth_pins) - dn->adjust_nested_auth_pins(0 - (in->auth_pins + in->nested_auth_pins)); - - // detach inode - in->remove_primary_parent(dn); - dn->inode = 0; - } - - nitems--; // adjust dir size -} - -void CDir::remove_null_dentries() { - dout(12) << "remove_null_dentries " << *this << dendl; - - list dns; - for (CDir::map_t::iterator it = items.begin(); - it != items.end(); - it++) { - if (it->second->is_null()) - dns.push_back(it->second); - } - - for (list::iterator it = dns.begin(); - it != dns.end(); - it++) { - CDentry *dn = *it; - remove_dentry(dn); - } - //assert(null_items.empty()); - assert(nnull == 0); - assert(nnull + nitems == items.size()); -} - - -/** - * steal_dentry -- semi-violently move a dentry from one CDir to another - * (*) violently, in that nitems, most pins, etc. are not correctly maintained - * on the old CDir corpse; must call purge_stolen() when finished. - */ -void CDir::steal_dentry(CDentry *dn) -{ - dout(15) << "steal_dentry " << *dn << dendl; - - items[dn->name] = dn; - - dn->dir->items.erase(dn->name); - if (dn->dir->items.empty()) - dn->dir->put(PIN_CHILD); - - if (nnull + nitems == 0) - get(PIN_CHILD); - if (dn->is_null()) - nnull++; - else - nitems++; - - nested_auth_pins += dn->auth_pins + dn->nested_auth_pins; - if (dn->is_dirty()) - num_dirty++; - - dn->dir = this; -} - -void CDir::purge_stolen(list& waiters) -{ - // take waiters _before_ unfreeze... - take_waiting(WAIT_ANY, waiters); - - if (is_auth()) { - assert(is_frozen_dir()); - unfreeze_dir(); - } - - nnull = nitems = 0; - - if (is_auth()) - clear_replica_map(); - if (is_dirty()) mark_clean(); - if (state_test(STATE_IMPORTBOUND)) put(PIN_IMPORTBOUND); - if (state_test(STATE_EXPORTBOUND)) put(PIN_EXPORTBOUND); - - if (auth_pins > 0) put(PIN_AUTHPIN); - - assert(get_num_ref() == (state_test(STATE_STICKY) ? 
1:0)); -} - -void CDir::init_fragment_pins() -{ - if (!replica_map.empty()) get(PIN_REPLICATED); - if (state_test(STATE_DIRTY)) get(PIN_DIRTY); - if (state_test(STATE_EXPORTBOUND)) get(PIN_EXPORTBOUND); - if (state_test(STATE_IMPORTBOUND)) get(PIN_IMPORTBOUND); -} - -void CDir::split(int bits, list& subs, list& waiters) -{ - dout(10) << "split by " << bits << " bits on " << *this << dendl; - - if (cache->mds->logger) cache->mds->logger->inc("dir_sp"); - - assert(is_complete() || !is_auth()); - - list frags; - frag.split(bits, frags); - - vector subfrags(1 << bits); - - double fac = 1.0 / (double)(1 << bits); // for scaling load vecs - - // create subfrag dirs - int n = 0; - for (list::iterator p = frags.begin(); p != frags.end(); ++p) { - CDir *f = new CDir(inode, *p, cache, is_auth()); - f->state_set(state & MASK_STATE_FRAGMENT_KEPT); - f->replica_map = replica_map; - f->dir_auth = dir_auth; - f->init_fragment_pins(); - f->version = version; - f->projected_version = projected_version; - - f->pop_me = pop_me; - f->pop_me *= fac; - - // FIXME; this is an approximation - f->pop_nested = pop_nested; - f->pop_nested *= fac; - f->pop_auth_subtree = pop_auth_subtree; - f->pop_auth_subtree *= fac; - f->pop_auth_subtree_nested = pop_auth_subtree_nested; - f->pop_auth_subtree_nested *= fac; - - dout(10) << " subfrag " << *p << " " << *f << dendl; - subfrags[n++] = f; - subs.push_back(f); - inode->add_dirfrag(f); - } - - // repartition dentries - while (!items.empty()) { - CDir::map_t::iterator p = items.begin(); - - CDentry *dn = p->second; - frag_t subfrag = inode->pick_dirfrag(p->first); - int n = subfrag.value() >> frag.bits(); - dout(15) << " subfrag " << subfrag << " n=" << n << " for " << p->first << dendl; - CDir *f = subfrags[n]; - f->steal_dentry(dn); - } - - purge_stolen(waiters); - inode->close_dirfrag(frag); // selft deletion, watch out. 
-} - -void CDir::merge(int bits, list& waiters) -{ - dout(10) << "merge by " << bits << " bits" << dendl; - - list frags; - frag.split(bits, frags); - - for (list::iterator p = frags.begin(); p != frags.end(); ++p) { - CDir *dir = inode->get_or_open_dirfrag(cache, *p); - assert(dir->is_complete()); - dout(10) << " subfrag " << *p << " " << *dir << dendl; - - // steal dentries - while (!dir->items.empty()) - steal_dentry(dir->items.begin()->second); - - // merge replica map - for (map::iterator p = dir->replica_map.begin(); - p != dir->replica_map.end(); - ++p) - replica_map[p->first] = MAX(replica_map[p->first], p->second); - - // merge state - state_set(dir->get_state() & MASK_STATE_FRAGMENT_KEPT); - dir_auth = dir->dir_auth; - - dir->purge_stolen(waiters); - inode->close_dirfrag(dir->get_frag()); - } - - init_fragment_pins(); -} - - - - - - - -CDirDiscover *CDir::replicate_to(int mds) -{ - assert(is_auth()); - return new CDirDiscover( this, add_replica(mds) ); -} - - - - - -/**************************************** - * WAITING - */ - -void CDir::add_dentry_waiter(const string& dname, Context *c) -{ - if (waiting_on_dentry.empty()) - get(PIN_DNWAITER); - waiting_on_dentry[dname].push_back(c); - dout(10) << "add_dentry_waiter dentry " << dname << " " << c << " on " << *this << dendl; -} - -void CDir::take_dentry_waiting(const string& dname, list& ls) -{ - if (waiting_on_dentry.empty()) return; - if (waiting_on_dentry.count(dname) == 0) return; - dout(10) << "take_dentry_waiting dentry " << dname - << " x " << waiting_on_dentry[dname].size() - << " on " << *this << dendl; - ls.splice(ls.end(), waiting_on_dentry[dname]); - waiting_on_dentry.erase(dname); - if (waiting_on_dentry.empty()) - put(PIN_DNWAITER); -} - -void CDir::add_ino_waiter(inodeno_t ino, Context *c) -{ - if (waiting_on_ino.empty()) - get(PIN_INOWAITER); - waiting_on_ino[ino].push_back(c); - dout(10) << "add_ino_waiter ino " << ino << " " << c << " on " << *this << dendl; -} - -void 
CDir::take_ino_waiting(inodeno_t ino, list& ls) -{ - if (waiting_on_ino.empty()) return; - if (waiting_on_ino.count(ino) == 0) return; - dout(10) << "take_ino_waiting ino " << ino - << " x " << waiting_on_ino[ino].size() - << " on " << *this << dendl; - ls.splice(ls.end(), waiting_on_ino[ino]); - waiting_on_ino.erase(ino); - if (waiting_on_ino.empty()) - put(PIN_INOWAITER); -} - -void CDir::take_sub_waiting(list& ls) -{ - dout(10) << "take_sub_waiting" << dendl; - for (hash_map >::iterator p = waiting_on_dentry.begin(); - p != waiting_on_dentry.end(); - ++p) - ls.splice(ls.end(), p->second); - waiting_on_dentry.clear(); - for (hash_map >::iterator p = waiting_on_ino.begin(); - p != waiting_on_ino.end(); - ++p) - ls.splice(ls.end(), p->second); - waiting_on_ino.clear(); -} - - - -void CDir::add_waiter(int tag, Context *c) -{ - // hierarchical? - - // at free root? - if (tag & WAIT_ATFREEZEROOT) { - if (!(is_freezing_tree_root() || is_frozen_tree_root() || - is_freezing_dir() || is_frozen_dir())) { - // try parent - dout(10) << "add_waiter " << tag << " " << c << " should be ATFREEZEROOT, " << *this << " is not root, trying parent" << dendl; - inode->parent->dir->add_waiter(tag, c); - return; - } - } - - // at subtree root? 
- if (tag & WAIT_ATSUBTREEROOT) { - if (!is_subtree_root()) { - // try parent - dout(10) << "add_waiter " << tag << " " << c << " should be ATSUBTREEROOT, " << *this << " is not root, trying parent" << dendl; - inode->parent->dir->add_waiter(tag, c); - return; - } - } - - MDSCacheObject::add_waiter(tag, c); -} - - - -/* NOTE: this checks dentry waiters too */ -void CDir::take_waiting(int mask, list& ls) -{ - if (mask & WAIT_DENTRY) { - // take each each dentry waiter - hash_map >::iterator it = - waiting_on_dentry.begin(); - while (it != waiting_on_dentry.end()) { - take_dentry_waiting((it++)->first, ls); // not post-inc - } - } - - // waiting - MDSCacheObject::take_waiting(mask, ls); -} - - -void CDir::finish_waiting(int mask, int result) -{ - dout(11) << "finish_waiting mask " << hex << mask << dec << " result " << result << " on " << *this << dendl; - - list finished; - take_waiting(mask, finished); - if (result < 0) - finish_contexts(finished, result); - else - cache->mds->queue_waiters(finished); -} - - - -// dirty/clean - -version_t CDir::pre_dirty(version_t min) -{ - if (min > projected_version) - projected_version = min; - ++projected_version; - dout(10) << "pre_dirty " << projected_version << dendl; - return projected_version; -} - -void CDir::_mark_dirty(LogSegment *ls) -{ - if (!state_test(STATE_DIRTY)) { - state_set(STATE_DIRTY); - dout(10) << "mark_dirty (was clean) " << *this << " version " << version << dendl; - get(PIN_DIRTY); - assert(ls); - } else { - dout(10) << "mark_dirty (already dirty) " << *this << " version " << version << dendl; - } - if (ls) - ls->dirty_dirfrags.push_back(&xlist_dirty); -} - -void CDir::mark_dirty(version_t pv, LogSegment *ls) -{ - assert(version < pv); - version = pv; - _mark_dirty(ls); -} - -void CDir::mark_clean() -{ - dout(10) << "mark_clean " << *this << " version " << version << dendl; - if (state_test(STATE_DIRTY)) { - state_clear(STATE_DIRTY); - put(PIN_DIRTY); - - xlist_dirty.remove_myself(); - } -} - - - - -void 
CDir::first_get() -{ - inode->get(CInode::PIN_DIRFRAG); -} - -void CDir::last_put() -{ - inode->put(CInode::PIN_DIRFRAG); -} - - - -/****************************************************************************** - * FETCH and COMMIT - */ - -// ----------------------- -// FETCH - -class C_Dir_Fetch : public Context { - protected: - CDir *dir; - public: - bufferlist bl; - - C_Dir_Fetch(CDir *d) : dir(d) { } - void finish(int result) { - dir->_fetched(bl); - } -}; - -void CDir::fetch(Context *c, bool ignore_authpinnability) -{ - dout(10) << "fetch on " << *this << dendl; - - assert(is_auth()); - assert(!is_complete()); - - if (!can_auth_pin() && !ignore_authpinnability) { - dout(7) << "fetch waiting for authpinnable" << dendl; - add_waiter(WAIT_UNFREEZE, c); - return; - } - - if (c) add_waiter(WAIT_COMPLETE, c); - - // already fetching? - if (state_test(CDir::STATE_FETCHING)) { - dout(7) << "already fetching; waiting" << dendl; - return; - } - - auth_pin(); - state_set(CDir::STATE_FETCHING); - - if (cache->mds->logger) cache->mds->logger->inc("dir_f"); - - // start by reading the first hunk of it - C_Dir_Fetch *fin = new C_Dir_Fetch(this); - cache->mds->objecter->read( get_ondisk_object(), - 0, 0, // whole object - cache->mds->objecter->osdmap->file_to_object_layout( get_ondisk_object(), - g_OSD_MDDirLayout ), - &fin->bl, - fin ); -} - -void CDir::_fetched(bufferlist &bl) -{ - dout(10) << "_fetched " << bl.length() - << " bytes for " << *this - << dendl; - - assert(is_auth()); - assert(!is_frozen()); - - // decode. 
- int len = bl.length(); - int off = 0; - version_t got_version; - - ::_decode(got_version, bl, off); - - dout(10) << "_fetched version " << got_version - << ", " << len << " bytes" - << dendl; - - int32_t n; - ::_decode(n, bl, off); - - //int num_new_inodes_loaded = 0; - - for (int i=0; iget_inode() == 0) { - dout(12) << "_fetched had NEG dentry " << *dn << dendl; - } else { - dout(12) << "_fetched had dentry " << *dn << dendl; - } - } else { - // (remote) link - dn = add_remote_dentry(dname, ino, d_type); - - // link to inode? - CInode *in = cache->get_inode(ino); // we may or may not have it. - if (in) { - dn->link_remote(in); - dout(12) << "_fetched got remote link " << ino << " which we have " << *in << dendl; - } else { - dout(12) << "_fetched got remote link " << ino << " (dont' have it)" << dendl; - } - } - } - else if (type == 'I') { - // inode - - // parse out inode - inode_t inode; - ::_decode(inode, bl, off); - - string symlink; - if (inode.is_symlink()) - ::_decode(symlink, bl, off); - - fragtree_t fragtree; - fragtree._decode(bl, off); - - if (dn) { - if (dn->get_inode() == 0) { - dout(12) << "_fetched had NEG dentry " << *dn << dendl; - } else { - dout(12) << "_fetched had dentry " << *dn << dendl; - } - } else { - // add inode - CInode *in = 0; - if (cache->have_inode(inode.ino)) { - in = cache->get_inode(inode.ino); - dout(-12) << "_fetched got (but i already had) " << *in - << " mode " << in->inode.mode - << " mtime " << in->inode.mtime << dendl; - assert(0); // this shouldn't happen!! - } else { - // inode - in = new CInode(cache); - in->inode = inode; - - // symlink? 
- if (in->is_symlink()) - in->symlink = symlink; - - // dirfragtree - in->dirfragtree.swap(fragtree); - - // add - cache->add_inode( in ); - - // link - dn = add_primary_dentry(dname, in); - dout(12) << "_fetched got " << *dn << " " << *in << dendl; - - //in->hack_accessed = false; - //in->hack_load_stamp = g_clock.now(); - //num_new_inodes_loaded++; - } - } - } else { - dout(1) << "corrupt directory, i got tag char '" << type << "' val " << (int)(type) - << " at pos " << off << dendl; - assert(0); - } - - // make note of dentry position in the directory - dn->dir_offset = dn_offset; - - /** clean underwater item? - * Underwater item is something that is dirty in our cache from - * journal replay, but was previously flushed to disk before the - * mds failed. - * - * We only do this is committed_version == 0. that implies either - * - this is a fetch after from a clean/empty CDir is created - * (and has no effect, since the dn won't exist); or - * - this is a fetch after _recovery_, which is what we're worried - * about. Items that are marked dirty from the journal should be - * marked clean if they appear on disk. - */ - if (committed_version == 0 && - dn && - dn->get_version() <= got_version && - dn->is_dirty()) { - dout(10) << "_fetched had underwater dentry " << *dn << ", marking clean" << dendl; - dn->mark_clean(); - - if (dn->get_inode()) { - assert(dn->get_inode()->get_version() <= got_version); - dout(10) << "_fetched had underwater inode " << *dn->get_inode() << ", marking clean" << dendl; - dn->get_inode()->mark_clean(); - } - } - } - //assert(off == len); no, directories may shrink. add this back in when we properly truncate objects on write. - - // take the loaded version? - // only if we are a fresh CDir* with no prior state. 
- if (version == 0) { - assert(projected_version == 0); - assert(!state_test(STATE_COMMITTING)); - projected_version = version = committing_version = committed_version = got_version; - } - - //cache->mds->logger->inc("newin", num_new_inodes_loaded); - //hack_num_accessed = 0; - - // mark complete, !fetching - state_set(STATE_COMPLETE); - state_clear(STATE_FETCHING); - auth_unpin(); - - // kick waiters - finish_waiting(WAIT_COMPLETE, 0); -} - - - -// ----------------------- -// COMMIT - -/** - * commit - * - * @param want - min version i want committed - * @param c - callback for completion - */ -void CDir::commit(version_t want, Context *c) -{ - dout(10) << "commit want " << want << " on " << *this << dendl; - if (want == 0) want = version; - - // preconditions - assert(want <= version || version == 0); // can't commit the future - assert(want > committed_version); // the caller is stupid - assert(is_auth()); - assert(can_auth_pin()); - - // note: queue up a noop if necessary, so that we always - // get an auth_pin. - if (!c) - c = new C_NoopContext; - - // auth_pin on first waiter - if (waiting_for_commit.empty()) - auth_pin(); - waiting_for_commit[want].push_back(c); - - // ok. - _commit(want); -} - - -class C_Dir_RetryCommit : public Context { - CDir *dir; - version_t want; -public: - C_Dir_RetryCommit(CDir *d, version_t v) : - dir(d), want(v) { } - void finish(int r) { - dir->_commit(want); - } -}; - -class C_Dir_Committed : public Context { - CDir *dir; - version_t version; -public: - C_Dir_Committed(CDir *d, version_t v) : dir(d), version(v) { } - void finish(int r) { - dir->_committed(version); - } -}; - -void CDir::_commit(version_t want) -{ - dout(10) << "_commit want " << want << " on " << *this << dendl; - - // we can't commit things in the future. - // (even the projected future.) - assert(want <= version || version == 0); - - // check pre+postconditions. - assert(is_auth()); - - // already committed? 
- if (committed_version >= want) { - dout(10) << "already committed " << committed_version << " >= " << want << dendl; - return; - } - // already committing >= want? - if (committing_version >= want) { - dout(10) << "already committing " << committing_version << " >= " << want << dendl; - assert(state_test(STATE_COMMITTING)); - return; - } - - // complete? - if (!is_complete()) { - dout(7) << "commit not complete, fetching first" << dendl; - if (cache->mds->logger) cache->mds->logger->inc("dir_ffc"); - fetch(new C_Dir_RetryCommit(this, want)); - return; - } - - // commit. - committing_version = version; - - // mark committing (if not already) - if (!state_test(STATE_COMMITTING)) { - dout(10) << "marking committing" << dendl; - state_set(STATE_COMMITTING); - } - - if (cache->mds->logger) cache->mds->logger->inc("dir_c"); - - // encode - bufferlist bl; - - ::_encode(version, bl); - int32_t n = nitems; - ::_encode(n, bl); - - for (map_t::iterator it = items.begin(); - it != items.end(); - it++) { - CDentry *dn = it->second; - - if (dn->is_null()) - continue; // skip negative entries - - n--; - - // primary or remote? - if (dn->is_remote()) { - inodeno_t ino = dn->get_remote_ino(); - unsigned char d_type = dn->get_remote_d_type(); - dout(14) << " pos " << bl.length() << " dn '" << it->first << "' remote ino " << ino << dendl; - - // marker, name, ino - bl.append( "L", 1 ); // remote link - ::_encode(it->first, bl); - ::_encode(ino, bl); - ::_encode(d_type, bl); - } else { - // primary link - CInode *in = dn->get_inode(); - assert(in); - - dout(14) << " pos " << bl.length() << " dn '" << it->first << "' inode " << *in << dendl; - - // marker, name, inode, [symlink string] - bl.append( "I", 1 ); // inode - ::_encode(it->first, bl); - ::_encode(in->inode, bl); - - if (in->is_symlink()) { - // include symlink destination! 
- dout(18) << " inlcuding symlink ptr " << in->symlink << dendl; - ::_encode(in->symlink, bl); - } - - in->dirfragtree._encode(bl); - } - } - assert(n == 0); - - // write it. - cache->mds->objecter->write( get_ondisk_object(), - 0, bl.length(), - cache->mds->objecter->osdmap->file_to_object_layout( get_ondisk_object(), - g_OSD_MDDirLayout ), - bl, - NULL, new C_Dir_Committed(this, version) ); -} - - -/** - * _committed - * - * @param v version i just committed - */ -void CDir::_committed(version_t v) -{ - dout(10) << "_committed v " << v << " on " << *this << dendl; - assert(is_auth()); - - // take note. - assert(v > committed_version); - assert(v <= committing_version); - committed_version = v; - - // _all_ commits done? - if (committing_version == committed_version) - state_clear(CDir::STATE_COMMITTING); - - // dir clean? - if (committed_version == version) - mark_clean(); - - // dentries clean? - for (map_t::iterator it = items.begin(); - it != items.end(); ) { - CDentry *dn = it->second; - it++; - - // dentry - if (committed_version >= dn->get_version()) { - if (dn->is_dirty()) { - dout(15) << " dir " << committed_version << " >= dn " << dn->get_version() << " now clean " << *dn << dendl; - dn->mark_clean(); - } - } else { - dout(15) << " dir " << committed_version << " < dn " << dn->get_version() << " still dirty " << *dn << dendl; - } - - // inode? - if (dn->is_primary()) { - CInode *in = dn->get_inode(); - assert(in); - assert(in->is_auth()); - - if (committed_version >= in->get_version()) { - if (in->is_dirty()) { - dout(15) << " dir " << committed_version << " >= inode " << in->get_version() << " now clean " << *in << dendl; - in->mark_clean(); - } - } else { - dout(15) << " dir " << committed_version << " < inode " << in->get_version() << " still dirty " << *in << dendl; - assert(in->is_dirty()); - } - } - } - - // finishers? 
- bool were_waiters = !waiting_for_commit.empty(); - - map >::iterator p = waiting_for_commit.begin(); - while (p != waiting_for_commit.end()) { - map >::iterator n = p; - n++; - if (p->first > committed_version) break; // haven't committed this far yet. - cache->mds->queue_waiters(p->second); - waiting_for_commit.erase(p); - p = n; - } - - // unpin if we kicked the last waiter. - if (were_waiters && - waiting_for_commit.empty()) - auth_unpin(); -} - - - - - -// IMPORT/EXPORT - -void CDir::encode_export(bufferlist& bl) -{ - ::_encode_simple(version, bl); - ::_encode_simple(committed_version, bl); - ::_encode_simple(committed_version_equivalent, bl); - - ::_encode_simple(state, bl); - ::_encode_simple(dir_rep, bl); - - ::_encode_simple(pop_me, bl); - ::_encode_simple(pop_auth_subtree, bl); - - ::_encode_simple(dir_rep_by, bl); - ::_encode_simple(replica_map, bl); - - get(PIN_TEMPEXPORTING); -} - -void CDir::finish_export(utime_t now) -{ - pop_auth_subtree_nested -= pop_auth_subtree; - pop_me.zero(now); - pop_auth_subtree.zero(now); - put(PIN_TEMPEXPORTING); -} - -void CDir::decode_import(bufferlist::iterator& blp) -{ - ::_decode_simple(version, blp); - ::_decode_simple(committed_version, blp); - ::_decode_simple(committed_version_equivalent, blp); - committing_version = committed_version; - projected_version = version; - - unsigned s; - ::_decode_simple(s, blp); - state &= MASK_STATE_IMPORT_KEPT; - state |= (s & MASK_STATE_EXPORTED); - if (is_dirty()) get(PIN_DIRTY); - - ::_decode_simple(dir_rep, blp); - - ::_decode_simple(pop_me, blp); - ::_decode_simple(pop_auth_subtree, blp); - pop_auth_subtree_nested += pop_auth_subtree; - - ::_decode_simple(dir_rep_by, blp); - ::_decode_simple(replica_map, blp); - if (!replica_map.empty()) get(PIN_REPLICATED); - - replica_nonce = 0; // no longer defined -} - - - - -/******************************** - * AUTHORITY - */ - -/* - * if dir_auth.first == parent, auth is same as inode. 
- * unless .second != unknown, in which case that sticks. - */ -pair CDir::authority() -{ - if (is_subtree_root()) - return dir_auth; - else - return inode->authority(); -} - -/** is_subtree_root() - * true if this is an auth delegation point. - * that is, dir_auth != default (parent,unknown) - * - * some key observations: - * if i am auth: - * - any region bound will be an export, or frozen. - * - * note that this DOES heed dir_auth.pending - */ -/* -bool CDir::is_subtree_root() -{ - if (dir_auth == CDIR_AUTH_DEFAULT) { - //dout(10) << "is_subtree_root false " << dir_auth << " != " << CDIR_AUTH_DEFAULT - //<< " on " << ino() << dendl; - return false; - } else { - //dout(10) << "is_subtree_root true " << dir_auth << " != " << CDIR_AUTH_DEFAULT - //<< " on " << ino() << dendl; - return true; - } -} -*/ - -/** contains(x) - * true if we are x, or an ancestor of x - */ -bool CDir::contains(CDir *x) -{ - while (1) { - if (x == this) return true; - x = x->get_parent_dir(); - if (x == 0) return false; - } -} - - - -/** set_dir_auth - */ -void CDir::set_dir_auth(pair a) -{ - dout(10) << "setting dir_auth=" << a - << " from " << dir_auth - << " on " << *this << dendl; - - bool was_subtree = is_subtree_root(); - bool was_ambiguous = dir_auth.second >= 0; - - // set it. - dir_auth = a; - - // new subtree root? - if (!was_subtree && is_subtree_root()) { - dout(10) << " new subtree root, adjusting auth_pins" << dendl; - - // adjust nested auth pins - inode->adjust_nested_auth_pins(-get_cum_auth_pins()); - - // unpin parent of frozen dir/tree? - if (inode->is_auth() && (is_frozen_tree_root() || is_frozen_dir())) - inode->auth_unpin(); - } - if (was_subtree && !is_subtree_root()) { - dout(10) << " old subtree root, adjusting auth_pins" << dendl; - - // adjust nested auth pins - inode->adjust_nested_auth_pins(get_cum_auth_pins()); - - // pin parent of frozen dir/tree? 
- if (inode->is_auth() && (is_frozen_tree_root() || is_frozen_dir())) - inode->auth_pin(); - } - - // newly single auth? - if (was_ambiguous && dir_auth.second == CDIR_AUTH_UNKNOWN) { - list ls; - take_waiting(WAIT_SINGLEAUTH, ls); - cache->mds->queue_waiters(ls); - } -} - - -/***************************************** - * AUTH PINS and FREEZING - * - * the basic plan is that auth_pins only exist in auth regions, and they - * prevent a freeze (and subsequent auth change). - * - * however, we also need to prevent a parent from freezing if a child is frozen. - * for that reason, the parent inode of a frozen directory is auth_pinned. - * - * the oddity is when the frozen directory is a subtree root. if that's the case, - * the parent inode isn't frozen. which means that when subtree authority is adjusted - * at the bounds, inodes for any frozen bound directories need to get auth_pins at that - * time. - * - */ - -void CDir::auth_pin() -{ - if (auth_pins == 0) - get(PIN_AUTHPIN); - auth_pins++; - - dout(10) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << dendl; - - // nest pins? - if (is_subtree_root()) return; // no. - //assert(!is_import()); - - inode->adjust_nested_auth_pins(1); -} - -void CDir::auth_unpin() -{ - auth_pins--; - if (auth_pins == 0) - put(PIN_AUTHPIN); - - dout(10) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << dendl; - assert(auth_pins >= 0); - - maybe_finish_freeze(); // pending freeze? - - // nest? - if (is_subtree_root()) return; // no. - //assert(!is_import()); - - inode->adjust_nested_auth_pins(-1); -} - -void CDir::adjust_nested_auth_pins(int inc) -{ - nested_auth_pins += inc; - - dout(15) << "adjust_nested_auth_pins " << inc << " on " << *this - << " count now " << auth_pins << " + " << nested_auth_pins << dendl; - assert(nested_auth_pins >= 0); - - maybe_finish_freeze(); // pending freeze? - - // adjust my inode? - if (is_subtree_root()) - return; // no, stop. 
- - // yes. - inode->adjust_nested_auth_pins(inc); -} - - - -/***************************************************************************** - * FREEZING - */ - -// FREEZE TREE - -bool CDir::freeze_tree() -{ - assert(!is_frozen()); - assert(!is_freezing()); - - auth_pin(); - if (is_freezeable(true)) { - _freeze_tree(); - auth_unpin(); - return true; - } else { - state_set(STATE_FREEZINGTREE); - dout(10) << "freeze_tree waiting " << *this << dendl; - return false; - } -} - -void CDir::_freeze_tree() -{ - dout(10) << "_freeze_tree " << *this << dendl; - assert(is_freezeable(true)); - - // twiddle state - state_clear(STATE_FREEZINGTREE); // actually, this may get set again by next context? - state_set(STATE_FROZENTREE); - get(PIN_FROZEN); - - // auth_pin inode for duration of freeze, if we are not a subtree root. - if (is_auth() && !is_subtree_root()) - inode->auth_pin(); -} - -void CDir::unfreeze_tree() -{ - dout(10) << "unfreeze_tree " << *this << dendl; - - if (state_test(STATE_FROZENTREE)) { - // frozen. unfreeze. - state_clear(STATE_FROZENTREE); - put(PIN_FROZEN); - - // unpin (may => FREEZEABLE) FIXME: is this order good? - if (is_auth() && !is_subtree_root()) - inode->auth_unpin(); - - // waiters? - finish_waiting(WAIT_UNFREEZE); - } else { - finish_waiting(WAIT_FROZEN, -1); - - // freezing. stop it. 
- assert(state_test(STATE_FREEZINGTREE)); - state_clear(STATE_FREEZINGTREE); - auth_unpin(); - - finish_waiting(WAIT_UNFREEZE); - } -} - -bool CDir::is_freezing_tree() -{ - CDir *dir = this; - while (1) { - if (dir->is_freezing_tree_root()) return true; - if (dir->is_subtree_root()) return false; - if (dir->inode->parent) - dir = dir->inode->parent->dir; - else - return false; // root on replica - } -} - -bool CDir::is_frozen_tree() -{ - CDir *dir = this; - while (1) { - if (dir->is_frozen_tree_root()) return true; - if (dir->is_subtree_root()) return false; - if (dir->inode->parent) - dir = dir->inode->parent->dir; - else - return false; // root on replica - } -} - -CDir *CDir::get_frozen_tree_root() -{ - assert(is_frozen()); - CDir *dir = this; - while (1) { - if (dir->is_frozen_tree_root()) - return dir; - if (dir->inode->parent) - dir = dir->inode->parent->dir; - else - assert(0); - } -} - - - -// FREEZE DIR - -bool CDir::freeze_dir() -{ - assert(!is_frozen()); - assert(!is_freezing()); - - auth_pin(); - if (is_freezeable_dir(true)) { - _freeze_dir(); - auth_unpin(); - return true; - } else { - state_set(STATE_FREEZINGDIR); - dout(10) << "freeze_dir + wait " << *this << dendl; - return false; - } -} - -void CDir::_freeze_dir() -{ - dout(10) << "_freeze_dir " << *this << dendl; - assert(is_freezeable_dir(true)); - - state_clear(STATE_FREEZINGDIR); - state_set(STATE_FROZENDIR); - get(PIN_FROZEN); - - if (is_auth() && !is_subtree_root()) - inode->auth_pin(); // auth_pin for duration of freeze -} - - -void CDir::unfreeze_dir() -{ - dout(10) << "unfreeze_dir " << *this << dendl; - - if (state_test(STATE_FROZENDIR)) { - state_clear(STATE_FROZENDIR); - put(PIN_FROZEN); - - // unpin (may => FREEZEABLE) FIXME: is this order good? - if (is_auth() && !is_subtree_root()) - inode->auth_unpin(); - - finish_waiting(WAIT_UNFREEZE); - } else { - finish_waiting(WAIT_FROZEN, -1); - - // still freezing. stop. 
- assert(state_test(STATE_FREEZINGDIR)); - state_clear(STATE_FREEZINGDIR); - auth_unpin(); - - finish_waiting(WAIT_UNFREEZE); - } -} - - - - - - - - diff --git a/branches/sage/ebofs2/mds/CDir.h b/branches/sage/ebofs2/mds/CDir.h deleted file mode 100644 index 99bad3801e130..0000000000000 --- a/branches/sage/ebofs2/mds/CDir.h +++ /dev/null @@ -1,540 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __CDIR_H -#define __CDIR_H - -#include "include/types.h" -#include "include/buffer.h" -#include "mdstypes.h" -#include "config.h" -#include "common/DecayCounter.h" - -#include -#include - -#include -#include -#include -#include -using namespace std; - -#include -using __gnu_cxx::hash_map; - - -#include "CInode.h" - -class CDentry; -class MDCache; -class MDCluster; -class Context; -class CDirDiscover; - - -ostream& operator<<(ostream& out, class CDir& dir); - - -class CDir : public MDSCacheObject { - public: - // -- pins -- - static const int PIN_DNWAITER = 1; - static const int PIN_INOWAITER = 2; - static const int PIN_CHILD = 3; - static const int PIN_FROZEN = 4; - static const int PIN_SUBTREE = 5; - static const int PIN_IMPORTING = 7; - static const int PIN_IMPORTBOUND = 9; - static const int PIN_EXPORTBOUND = 10; - static const int PIN_STICKY = 11; - static const int PIN_SUBTREETEMP = 12; // used by MDCache::trim_non_auth() - const char *pin_name(int p) { - switch (p) { - case PIN_DNWAITER: return "dnwaiter"; - case PIN_INOWAITER: return "inowaiter"; - case PIN_CHILD: return "child"; - case PIN_FROZEN: return "frozen"; - case PIN_SUBTREE: return "subtree"; - case PIN_IMPORTING: return 
"importing"; - case PIN_IMPORTBOUND: return "importbound"; - case PIN_EXPORTBOUND: return "exportbound"; - case PIN_STICKY: return "sticky"; - case PIN_SUBTREETEMP: return "subtreetemp"; - default: return generic_pin_name(p); - } - } - - // -- state -- - static const unsigned STATE_COMPLETE = (1<< 1); // the complete contents are in cache - static const unsigned STATE_FROZENTREE = (1<< 2); // root of tree (bounded by exports) - static const unsigned STATE_FREEZINGTREE = (1<< 3); // in process of freezing - static const unsigned STATE_FROZENDIR = (1<< 4); - static const unsigned STATE_FREEZINGDIR = (1<< 5); - static const unsigned STATE_COMMITTING = (1<< 6); // mid-commit - static const unsigned STATE_FETCHING = (1<< 7); // currenting fetching - static const unsigned STATE_IMPORTBOUND = (1<<10); - static const unsigned STATE_EXPORTBOUND = (1<<11); - static const unsigned STATE_EXPORTING = (1<<12); - static const unsigned STATE_IMPORTING = (1<<13); - static const unsigned STATE_FRAGMENTING = (1<<14); - static const unsigned STATE_STICKY = (1<<15); // sticky pin due to inode stickydirs - static const unsigned STATE_DNPINNEDFRAG = (1<<16); // dir is refragmenting - - // common states - static const unsigned STATE_CLEAN = 0; - static const unsigned STATE_INITIAL = 0; - - // these state bits are preserved by an import/export - // ...except if the directory is hashed, in which case none of them are! 
- static const unsigned MASK_STATE_EXPORTED = - (STATE_COMPLETE|STATE_DIRTY); - static const unsigned MASK_STATE_IMPORT_KEPT = - ( - STATE_IMPORTING - |STATE_IMPORTBOUND|STATE_EXPORTBOUND - |STATE_FROZENTREE - |STATE_STICKY); - static const unsigned MASK_STATE_EXPORT_KEPT = - (STATE_EXPORTING - |STATE_IMPORTBOUND|STATE_EXPORTBOUND - |STATE_FROZENTREE - |STATE_FROZENDIR - |STATE_STICKY); - static const unsigned MASK_STATE_FRAGMENT_KEPT = - (STATE_DIRTY | - STATE_COMPLETE | - STATE_EXPORTBOUND | - STATE_IMPORTBOUND); - - // -- rep spec -- - static const int REP_NONE = 0; - static const int REP_ALL = 1; - static const int REP_LIST = 2; - - - static const int NONCE_EXPORT = 1; - - - // -- wait masks -- - static const int WAIT_DENTRY = (1<<0); // wait for item to be in cache - static const int WAIT_COMPLETE = (1<<1); // wait for complete dir contents - static const int WAIT_FROZEN = (1<<2); // auth pins removed - - static const int WAIT_DNLOCK_OFFSET = 4; - - static const int WAIT_ANY = (0xffffffff); - static const int WAIT_ATFREEZEROOT = (WAIT_UNFREEZE); - static const int WAIT_ATSUBTREEROOT = (WAIT_SINGLEAUTH); - - - - - public: - // context - MDCache *cache; - - CInode *inode; // my inode - frag_t frag; // my frag - - bool is_lt(const MDSCacheObject *r) const { - return dirfrag() < ((const CDir*)r)->dirfrag(); - } - - //int hack_num_accessed; - -public: - //typedef hash_map map_t; // there is a bug somewhere, valgrind me. 
- typedef map map_t; -protected: - // contents - map_t items; // non-null AND null - unsigned nitems; // # non-null - unsigned nnull; // # null - - int num_dirty; - - - - // state - version_t version; - version_t committing_version; - version_t committed_version; - version_t committed_version_equivalent; // in case of, e.g., temporary file - version_t projected_version; - - xlist::item xlist_dirty; - - // lock nesting, freeze - int auth_pins; - int nested_auth_pins; - int request_pins; - - // cache control (defined for authority; hints for replicas) - int dir_rep; - set dir_rep_by; // if dir_rep == REP_LIST - - // popularity - dirfrag_load_vec_t pop_me; - dirfrag_load_vec_t pop_nested; - dirfrag_load_vec_t pop_auth_subtree; - dirfrag_load_vec_t pop_auth_subtree_nested; - - utime_t last_popularity_sample; - - load_spread_t pop_spread; - - // and to provide density - int num_dentries_nested; - int num_dentries_auth_subtree; - int num_dentries_auth_subtree_nested; - - - // friends - friend class Migrator; - friend class CInode; - friend class MDCache; - friend class MDiscover; - friend class MDBalancer; - - friend class CDirDiscover; - friend class CDirExport; - - public: - CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth); - - - - // -- accessors -- - inodeno_t ino() const { return inode->ino(); } // deprecate me? 
- frag_t get_frag() const { return frag; } - dirfrag_t dirfrag() const { return dirfrag_t(inode->ino(), frag); } - - CInode *get_inode() { return inode; } - CDir *get_parent_dir() { return inode->get_parent_dir(); } - - map_t::iterator begin() { return items.begin(); } - map_t::iterator end() { return items.end(); } - unsigned get_size() { - return nitems; - } - unsigned get_nitems() { return nitems; } - unsigned get_nnull() { return nnull; } - - void inc_num_dirty() { num_dirty++; } - void dec_num_dirty() { - assert(num_dirty > 0); - num_dirty--; - } - int get_num_dirty() { - return num_dirty; - } - - - // -- dentries and inodes -- - public: - CDentry* lookup(const string& n) { - map_t::iterator iter = items.find(n); - if (iter == items.end()) - return 0; - else - return iter->second; - } - - CDentry* add_null_dentry(const string& dname); - CDentry* add_primary_dentry(const string& dname, CInode *in); - CDentry* add_remote_dentry(const string& dname, inodeno_t ino, unsigned char d_type); - void remove_dentry( CDentry *dn ); // delete dentry - void link_remote_inode( CDentry *dn, inodeno_t ino, unsigned char d_type); - void link_primary_inode( CDentry *dn, CInode *in ); - void unlink_inode( CDentry *dn ); - void try_remove_unlinked_dn(CDentry *dn); -private: - void link_inode_work( CDentry *dn, CInode *in ); - void unlink_inode_work( CDentry *dn ); - void remove_null_dentries(); - -public: - void split(int bits, list& subs, list& waiters); - void merge(int bits, list& waiters); -private: - void steal_dentry(CDentry *dn); // from another dir. used by merge/split. 
- void purge_stolen(list& waiters); - void init_fragment_pins(); - - - // -- authority -- - /* - * normal: !subtree_root - * delegation: subtree_root - * ambiguous: subtree_root - * subtree_root - */ - pair dir_auth; - - public: - pair authority(); - pair get_dir_auth() { return dir_auth; } - void set_dir_auth(pair a); - void set_dir_auth(int a) { set_dir_auth(pair(a, CDIR_AUTH_UNKNOWN)); } - bool is_ambiguous_dir_auth() { - return dir_auth.second != CDIR_AUTH_UNKNOWN; - } - bool is_full_dir_auth() { - return is_auth() && !is_ambiguous_dir_auth(); - } - bool is_full_dir_nonauth() { - return !is_auth() && !is_ambiguous_dir_auth(); - } - - bool is_subtree_root() { - return dir_auth != CDIR_AUTH_DEFAULT; - } - - bool contains(CDir *x); // true if we are x or an ancestor of x - - - // for giving to clients - void get_dist_spec(set& ls, int auth) { - if (is_rep()) { - for (map::iterator p = replicas_begin(); - p != replicas_end(); - ++p) - ls.insert(p->first); - if (!ls.empty()) - ls.insert(auth); - } - } - - CDirDiscover *replicate_to(int mds); - - - // -- state -- - bool is_complete() { return state & STATE_COMPLETE; } - bool is_exporting() { return state & STATE_EXPORTING; } - bool is_importing() { return state & STATE_IMPORTING; } - - int get_dir_rep() { return dir_rep; } - bool is_rep() { - if (dir_rep == REP_NONE) return false; - return true; - } - - // -- fetch -- - object_t get_ondisk_object() { return object_t(ino(), frag); } - void fetch(Context *c, bool ignore_authpinnability=false); - void _fetched(bufferlist &bl); - - // -- commit -- - map > waiting_for_commit; - - void commit_to(version_t want); - void commit(version_t want, Context *c); - void _commit(version_t want); - void _committed(version_t v); - void wait_for_commit(Context *c, version_t v=0); - - // -- dirtyness -- - version_t get_version() { return version; } - void set_version(version_t v) { projected_version = version = v; } - version_t get_projected_version() { return projected_version; } - 
version_t get_committing_version() { return committing_version; } - version_t get_committed_version() { return committed_version; } - version_t get_committed_version_equivalent() { return committed_version_equivalent; } - void set_committed_version(version_t v) { committed_version = v; } - - version_t pre_dirty(version_t min=0); - void _mark_dirty(LogSegment *ls); - void mark_dirty(version_t pv, LogSegment *ls); - void mark_clean(); - void mark_complete() { state_set(STATE_COMPLETE); } - - - // -- reference counting -- - void first_get(); - void last_put(); - - void request_pin_get() { - if (request_pins == 0) get(PIN_REQUEST); - request_pins++; - } - void request_pin_put() { - request_pins--; - if (request_pins == 0) put(PIN_REQUEST); - } - - - // -- waiters -- -protected: - hash_map< string, list > waiting_on_dentry; - hash_map< inodeno_t, list > waiting_on_ino; - -public: - bool is_waiting_for_dentry(const string& dn) { - return waiting_on_dentry.count(dn); - } - void add_dentry_waiter(const string& dentry, Context *c); - void take_dentry_waiting(const string& dentry, list& ls); - - bool is_waiting_for_ino(inodeno_t ino) { - return waiting_on_ino.count(ino); - } - void add_ino_waiter(inodeno_t ino, Context *c); - void take_ino_waiting(inodeno_t ino, list& ls); - - void take_sub_waiting(list& ls); // dentry or ino - - void add_waiter(int mask, Context *c); - void take_waiting(int mask, list& ls); // may include dentry waiters - void finish_waiting(int mask, int result = 0); // ditto - - - // -- import/export -- - void encode_export(bufferlist& bl); - void finish_export(utime_t now); - void abort_export() { - put(PIN_TEMPEXPORTING); - } - void decode_import(bufferlist::iterator& blp); - - - // -- auth pins -- - bool can_auth_pin() { return is_auth() && !(is_frozen() || is_freezing()); } - int is_auth_pinned() { return auth_pins; } - int get_cum_auth_pins() { return auth_pins + nested_auth_pins; } - int get_auth_pins() { return auth_pins; } - int 
get_nested_auth_pins() { return nested_auth_pins; } - void auth_pin(); - void auth_unpin(); - void adjust_nested_auth_pins(int inc); - - // -- freezing -- - bool freeze_tree(); - void _freeze_tree(); - void unfreeze_tree(); - - bool freeze_dir(); - void _freeze_dir(); - void unfreeze_dir(); - - void maybe_finish_freeze() { - if (auth_pins != 1 || nested_auth_pins != 0) - return; - if (state_test(STATE_FREEZINGTREE)) { - _freeze_tree(); - auth_unpin(); - finish_waiting(WAIT_FROZEN); - } - if (state_test(STATE_FREEZINGDIR)) { - _freeze_dir(); - auth_unpin(); - finish_waiting(WAIT_FROZEN); - } - } - - bool is_freezing() { return is_freezing_tree() || is_freezing_dir(); } - bool is_freezing_tree(); - bool is_freezing_tree_root() { return state & STATE_FREEZINGTREE; } - bool is_freezing_dir() { return state & STATE_FREEZINGDIR; } - - bool is_frozen() { return is_frozen_dir() || is_frozen_tree(); } - bool is_frozen_tree(); - bool is_frozen_tree_root() { return state & STATE_FROZENTREE; } - bool is_frozen_dir() { return state & STATE_FROZENDIR; } - - bool is_freezeable(bool freezing=false) { - // no nested auth pins. - if ((auth_pins-freezing) > 0 || nested_auth_pins > 0) - return false; - - // inode must not be frozen. - if (!is_subtree_root() && inode->is_frozen()) - return false; - - return true; - } - bool is_freezeable_dir(bool freezing=false) { - if ((auth_pins-freezing) > 0) - return false; - - // if not subtree root, inode must not be frozen (tree--frozen_dir is okay). 
- if (!is_subtree_root() && inode->is_frozen() && !inode->is_frozen_dir()) - return false; - - return true; - } - - CDir *get_frozen_tree_root(); - - - - ostream& print_db_line_prefix(ostream& out); - void print(ostream& out); -}; - - - -// -- encoded state -- - -// discover - -class CDirDiscover { - dirfrag_t dirfrag; - int nonce; - int dir_rep; - set rep_by; - - public: - CDirDiscover() {} - CDirDiscover(CDir *dir, int nonce) { - dirfrag = dir->dirfrag(); - this->nonce = nonce; - dir_rep = dir->dir_rep; - rep_by = dir->dir_rep_by; - } - - void update_dir(CDir *dir) { - assert(dir->dirfrag() == dirfrag); - assert(!dir->is_auth()); - - dir->replica_nonce = nonce; - dir->dir_rep = dir_rep; - dir->dir_rep_by = rep_by; - } - - dirfrag_t get_dirfrag() { return dirfrag; } - - void _encode(bufferlist& bl) { - bl.append((char*)&dirfrag, sizeof(dirfrag)); - bl.append((char*)&nonce, sizeof(nonce)); - bl.append((char*)&dir_rep, sizeof(dir_rep)); - ::_encode(rep_by, bl); - } - - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - bl.copy(off, sizeof(nonce), (char*)&nonce); - off += sizeof(nonce); - bl.copy(off, sizeof(dir_rep), (char*)&dir_rep); - off += sizeof(dir_rep); - ::_decode(rep_by, bl, off); - } - -}; - - - -#endif diff --git a/branches/sage/ebofs2/mds/CInode.cc b/branches/sage/ebofs2/mds/CInode.cc deleted file mode 100644 index 3bdfc89e3f1fa..0000000000000 --- a/branches/sage/ebofs2/mds/CInode.cc +++ /dev/null @@ -1,838 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "CInode.h" -#include "CDir.h" -#include "CDentry.h" - -#include "MDS.h" -#include "MDCache.h" -#include "AnchorTable.h" - -#include "LogSegment.h" - -#include "common/Clock.h" - -#include "messages/MLock.h" - -#include -#include - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " mds" << mdcache->mds->get_nodeid() << ".cache.ino(" << inode.ino << ") " - - -//int cinode_pins[CINODE_NUM_PINS]; // counts -ostream& CInode::print_db_line_prefix(ostream& out) -{ - return out << g_clock.now() << " mds" << mdcache->mds->get_nodeid() << ".cache.ino(" << inode.ino << ") "; -} - - - -ostream& operator<<(ostream& out, CInode& in) -{ - string path; - in.make_path(path); - out << "[inode " << in.inode.ino << " " << path << (in.is_dir() ? "/ ":" "); - if (in.is_auth()) { - out << "auth"; - if (in.is_replicated()) - out << in.get_replicas(); - } else { - out << "rep@" << in.authority(); - out << "." << in.get_replica_nonce(); - assert(in.get_replica_nonce() >= 0); - } - - if (in.is_symlink()) out << " symlink"; - if (in.is_dir() && !in.dirfragtree.empty()) out << " " << in.dirfragtree; - - out << " v" << in.get_version(); - - if (in.state_test(CInode::STATE_AMBIGUOUSAUTH)) out << " AMBIGAUTH"; - if (in.is_freezing_inode()) out << " FREEZING=" << in.auth_pin_freeze_allowance; - if (in.is_frozen_inode()) out << " FROZEN"; - - // locks - out << " " << in.authlock; - out << " " << in.linklock; - out << " " << in.dirfragtreelock; - out << " " << in.filelock; - out << " " << in.dirlock; - - // hack: spit out crap on which clients have caps - if (!in.get_client_caps().empty()) { - out << " caps={"; - for (map::iterator it = in.get_client_caps().begin(); - it != in.get_client_caps().end(); - it++) { - if (it != in.get_client_caps().begin()) out << ","; - out << it->first; - } - out << "}"; - } - - if (in.get_num_ref()) { - out << " |"; - in.print_pin_set(out); - } - - out << " " << ∈ - 
out << "]"; - return out; -} - - -void CInode::print(ostream& out) -{ - out << *this; -} - - -inode_t *CInode::project_inode() -{ - if (projected_inode.empty()) { - projected_inode.push_back(new inode_t(inode)); - } else { - projected_inode.push_back(new inode_t(*projected_inode.back())); - } - dout(15) << "project_inode " << projected_inode.back() << dendl; - return projected_inode.back(); -} - -void CInode::pop_and_dirty_projected_inode(LogSegment *ls) -{ - assert(!projected_inode.empty()); - dout(15) << "pop_and_dirty_projected_inode " << projected_inode.front() - << " v" << projected_inode.front()->version << dendl; - mark_dirty(projected_inode.front()->version, ls); - inode = *projected_inode.front(); - delete projected_inode.front(); - projected_inode.pop_front(); -} - - -// ====== CInode ======= - -// dirfrags - -frag_t CInode::pick_dirfrag(const string& dn) -{ - if (dirfragtree.empty()) - return frag_t(); // avoid the string hash if we can. - - static hash H; - return dirfragtree[H(dn)]; -} - -void CInode::get_dirfrags_under(frag_t fg, list& ls) -{ - list fglist; - dirfragtree.get_leaves_under(fg, fglist); - for (list::iterator p = fglist.begin(); - p != fglist.end(); - ++p) - if (dirfrags.count(*p)) - ls.push_back(dirfrags[*p]); -} - -CDir *CInode::get_approx_dirfrag(frag_t fg) -{ - CDir *dir = get_dirfrag(fg); - if (dir) return dir; - - // find a child? - list ls; - get_dirfrags_under(fg, ls); - if (!ls.empty()) - return ls.front(); - - // try parents? 
- while (1) { - fg = fg.parent(); - dir = get_dirfrag(fg); - if (dir) return dir; - } -} - -void CInode::get_dirfrags(list& ls) -{ - // all dirfrags - for (map::iterator p = dirfrags.begin(); - p != dirfrags.end(); - ++p) - ls.push_back(p->second); -} -void CInode::get_nested_dirfrags(list& ls) -{ - // dirfrags in same subtree - for (map::iterator p = dirfrags.begin(); - p != dirfrags.end(); - ++p) - if (!p->second->is_subtree_root()) - ls.push_back(p->second); -} -void CInode::get_subtree_dirfrags(list& ls) -{ - // dirfrags that are roots of new subtrees - for (map::iterator p = dirfrags.begin(); - p != dirfrags.end(); - ++p) - if (p->second->is_subtree_root()) - ls.push_back(p->second); -} - - -CDir *CInode::get_or_open_dirfrag(MDCache *mdcache, frag_t fg) -{ - assert(is_dir()); - - // have it? - CDir *dir = get_dirfrag(fg); - if (!dir) { - // create it. - assert(is_auth()); - dir = new CDir(this, fg, mdcache, true); - add_dirfrag(dir); - } - return dir; -} - -CDir *CInode::add_dirfrag(CDir *dir) -{ - assert(dirfrags.count(dir->dirfrag().frag) == 0); - dirfrags[dir->dirfrag().frag] = dir; - - if (stickydir_ref > 0) { - dir->state_set(CDir::STATE_STICKY); - dir->get(CDir::PIN_STICKY); - } - - return dir; -} - -void CInode::close_dirfrag(frag_t fg) -{ - dout(14) << "close_dirfrag " << fg << dendl; - assert(dirfrags.count(fg)); - - CDir *dir = dirfrags[fg]; - dir->remove_null_dentries(); - - // clear dirty flag - if (dir->is_dirty()) - dir->mark_clean(); - - if (stickydir_ref > 0) { - dir->state_clear(CDir::STATE_STICKY); - dir->put(CDir::PIN_STICKY); - } - - // dump any remaining dentries, for debugging purposes - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) - dout(14) << "close_dirfrag LEFTOVER dn " << *p->second << dendl; - - assert(dir->get_num_ref() == 0); - delete dir; - dirfrags.erase(fg); -} - -void CInode::close_dirfrags() -{ - while (!dirfrags.empty()) - close_dirfrag(dirfrags.begin()->first); -} - -bool 
CInode::has_subtree_root_dirfrag() -{ - for (map::iterator p = dirfrags.begin(); - p != dirfrags.end(); - ++p) - if (p->second->is_subtree_root()) - return true; - return false; -} - - -void CInode::get_stickydirs() -{ - if (stickydir_ref == 0) { - get(PIN_STICKYDIRS); - for (map::iterator p = dirfrags.begin(); - p != dirfrags.end(); - ++p) { - p->second->state_set(CDir::STATE_STICKY); - p->second->get(CDir::PIN_STICKY); - } - } - stickydir_ref++; -} - -void CInode::put_stickydirs() -{ - assert(stickydir_ref > 0); - stickydir_ref--; - if (stickydir_ref == 0) { - put(PIN_STICKYDIRS); - for (map::iterator p = dirfrags.begin(); - p != dirfrags.end(); - ++p) { - p->second->state_clear(CDir::STATE_STICKY); - p->second->put(CDir::PIN_STICKY); - } - } -} - - - - - -// pins - -void CInode::first_get() -{ - // pin my dentry? - if (parent) - parent->get(CDentry::PIN_INODEPIN); -} - -void CInode::last_put() -{ - // unpin my dentry? - if (parent) - parent->put(CDentry::PIN_INODEPIN); -} - -void CInode::add_remote_parent(CDentry *p) -{ - if (remote_parents.empty()) - get(PIN_REMOTEPARENT); - remote_parents.insert(p); -} -void CInode::remove_remote_parent(CDentry *p) -{ - remote_parents.erase(p); - if (remote_parents.empty()) - put(PIN_REMOTEPARENT); -} - - - - -CDir *CInode::get_parent_dir() -{ - if (parent) - return parent->dir; - return NULL; -} -CInode *CInode::get_parent_inode() -{ - if (parent) - return parent->dir->inode; - return NULL; -} - - - -void CInode::make_path(string& s) -{ - if (parent) { - parent->make_path(s); - } - else if (is_root()) { - s = ""; // root - } - else if (is_stray()) { - s = "~stray"; - char n[10]; - sprintf(n, "%d", (int)(ino()-MDS_INO_STRAY_OFFSET)); - s += n; - } - else { - s = "(dangling)"; // dangling - } -} - -void CInode::make_anchor_trace(vector& trace) -{ - if (parent) { - parent->dir->inode->make_anchor_trace(trace); - trace.push_back(Anchor(ino(), parent->dir->dirfrag())); - dout(10) << "make_anchor_trace added " << trace.back() << 
dendl; - } - else - assert(is_root() || is_stray()); -} - -void CInode::name_stray_dentry(string& dname) -{ - char s[20]; -#ifdef __LP64__ - sprintf(s, "%lx", inode.ino.val); -#else - sprintf(s, "%llx", inode.ino.val); -#endif - dname = s; -} - - -version_t CInode::pre_dirty() -{ - assert(parent); - return parent->pre_dirty(); -} - -void CInode::_mark_dirty(LogSegment *ls) -{ - if (!state_test(STATE_DIRTY)) { - state_set(STATE_DIRTY); - get(PIN_DIRTY); - assert(ls); - } - - // move myself to this segment's dirty list - if (ls) - ls->dirty_inodes.push_back(&xlist_dirty); -} - -void CInode::mark_dirty(version_t pv, LogSegment *ls) { - - dout(10) << "mark_dirty " << *this << dendl; - - assert(parent); - - /* - NOTE: I may already be dirty, but this fn _still_ needs to be called so that - the directory is (perhaps newly) dirtied, and so that parent_dir_version is - updated below. - */ - - // only auth can get dirty. "dirty" async data in replicas is relative to - // filelock state, not the dirty flag. 
- assert(is_auth()); - - // touch my private version - assert(inode.version < pv); - inode.version = pv; - _mark_dirty(ls); - - // mark dentry too - parent->mark_dirty(pv, ls); -} - - -void CInode::mark_clean() -{ - dout(10) << " mark_clean " << *this << dendl; - if (state_test(STATE_DIRTY)) { - state_clear(STATE_DIRTY); - put(PIN_DIRTY); - - // remove myself from ls dirty list - xlist_dirty.remove_myself(); - } -} - - - -// ------------------ -// locking - -void CInode::set_object_info(MDSCacheObjectInfo &info) -{ - info.ino = ino(); -} - -void CInode::encode_lock_state(int type, bufferlist& bl) -{ - switch (type) { - case LOCK_OTYPE_IAUTH: - _encode(inode.ctime, bl); - _encode(inode.mode, bl); - _encode(inode.uid, bl); - _encode(inode.gid, bl); - break; - - case LOCK_OTYPE_ILINK: - _encode(inode.ctime, bl); - _encode(inode.nlink, bl); - _encode(inode.anchored, bl); - break; - - case LOCK_OTYPE_IDIRFRAGTREE: - { - // encode the raw tree - dirfragtree._encode(bl); - - // also specify which frags are mine - set myfrags; - list dfls; - get_dirfrags(dfls); - for (list::iterator p = dfls.begin(); p != dfls.end(); ++p) - if ((*p)->is_auth()) - myfrags.insert((*p)->get_frag()); - _encode(myfrags, bl); - } - break; - - case LOCK_OTYPE_IFILE: - _encode(inode.size, bl); - _encode(inode.mtime, bl); - _encode(inode.atime, bl); - break; - - case LOCK_OTYPE_IDIR: - _encode(inode.mtime, bl); - if (0) { - map dfsz; - for (map::iterator p = dirfrags.begin(); - p != dirfrags.end(); - ++p) - if (p->second->is_auth()) - dfsz[p->first] = p->second->get_nitems(); - _encode(dfsz, bl); - } - break; - - default: - assert(0); - } -} - -void CInode::decode_lock_state(int type, bufferlist& bl) -{ - int off = 0; - utime_t tm; - - switch (type) { - case LOCK_OTYPE_IAUTH: - _decode(tm, bl, off); - if (inode.ctime < tm) inode.ctime = tm; - _decode(inode.mode, bl, off); - _decode(inode.uid, bl, off); - _decode(inode.gid, bl, off); - break; - - case LOCK_OTYPE_ILINK: - _decode(tm, bl, off); - if 
(inode.ctime < tm) inode.ctime = tm; - _decode(inode.nlink, bl, off); - _decode(inode.anchored, bl, off); - break; - - case LOCK_OTYPE_IDIRFRAGTREE: - { - fragtree_t temp; - temp._decode(bl, off); - set authfrags; - _decode(authfrags, bl, off); - if (is_auth()) { - // auth. believe replica's auth frags only. - for (set::iterator p = authfrags.begin(); p != authfrags.end(); ++p) - dirfragtree.force_to_leaf(*p); - } else { - // replica. just take the tree. - dirfragtree.swap(temp); - } - } - break; - - case LOCK_OTYPE_IFILE: - _decode(inode.size, bl, off); - _decode(inode.mtime, bl, off); - _decode(inode.atime, bl, off); - break; - - case LOCK_OTYPE_IDIR: - //::_decode(inode.size, bl, off); - _decode(tm, bl, off); - if (inode.mtime < tm) { - inode.mtime = tm; - if (is_auth()) { - dout(10) << "decode_lock_state auth got mtime " << tm << " > my " << inode.mtime - << ", setting dirlock updated flag on " << *this - << dendl; - dirlock.set_updated(); - } - } - if (0) { - map dfsz; - ::_decode(dfsz, bl, off); - // hmm which to keep? 
- } - break; - - default: - assert(0); - } -} - -void CInode::clear_dirty_scattered(int type) -{ - dout(10) << "clear_dirty_scattered " << type << " on " << *this << dendl; - switch (type) { - case LOCK_OTYPE_IDIR: - xlist_dirty_inode_mtime.remove_myself(); - break; - default: - assert(0); - } -} - - - -// waiting - -bool CInode::is_frozen() -{ - if (is_frozen_inode()) return true; - if (parent && parent->dir->is_frozen()) return true; - return false; -} - -bool CInode::is_frozen_dir() -{ - if (parent && parent->dir->is_frozen_dir()) return true; - return false; -} - -bool CInode::is_freezing() -{ - if (is_freezing_inode()) return true; - if (parent && parent->dir->is_freezing()) return true; - return false; -} - -void CInode::add_waiter(int tag, Context *c) -{ - dout(10) << "add_waiter tag " << tag - << " !ambig " << !state_test(STATE_AMBIGUOUSAUTH) - << " !frozen " << !is_frozen_inode() - << " !freezing " << !is_freezing_inode() - << dendl; - // wait on the directory? - // make sure its not the inode that is explicitly ambiguous|freezing|frozen - if (((tag & WAIT_SINGLEAUTH) && !state_test(STATE_AMBIGUOUSAUTH)) || - ((tag & WAIT_UNFREEZE) && !is_frozen_inode() && !is_freezing_inode())) { - parent->dir->add_waiter(tag, c); - return; - } - MDSCacheObject::add_waiter(tag, c); -} - -bool CInode::freeze_inode(int auth_pin_allowance) -{ - assert(auth_pin_allowance > 0); // otherwise we need to adjust parent's nested_auth_pins - assert(auth_pins >= auth_pin_allowance); - if (auth_pins > auth_pin_allowance) { - dout(10) << "freeze_inode - waiting for auth_pins to drop to " << auth_pin_allowance << dendl; - auth_pin_freeze_allowance = auth_pin_allowance; - get(PIN_FREEZING); - state_set(STATE_FREEZING); - return false; - } - - dout(10) << "freeze_inode - frozen" << dendl; - assert(auth_pins == auth_pin_allowance); - get(PIN_FROZEN); - state_set(STATE_FROZEN); - return true; -} - -void CInode::unfreeze_inode(list& finished) -{ - dout(10) << "unfreeze_inode" << dendl; - if 
(state_test(STATE_FREEZING)) { - state_clear(STATE_FREEZING); - put(PIN_FREEZING); - } else if (state_test(STATE_FROZEN)) { - state_clear(STATE_FROZEN); - put(PIN_FROZEN); - } else - assert(0); - take_waiting(WAIT_UNFREEZE, finished); -} - - -// auth_pins -bool CInode::can_auth_pin() { - if (is_freezing_inode() || is_frozen_inode()) return false; - if (parent) - return parent->can_auth_pin(); - return true; -} - -void CInode::auth_pin() -{ - if (auth_pins == 0) - get(PIN_AUTHPIN); - auth_pins++; - - dout(10) << "auth_pin on " << *this - << " now " << auth_pins << "+" << nested_auth_pins - << dendl; - - if (parent) - parent->adjust_nested_auth_pins( 1 ); -} - -void CInode::auth_unpin() -{ - auth_pins--; - if (auth_pins == 0) - put(PIN_AUTHPIN); - - dout(10) << "auth_unpin on " << *this - << " now " << auth_pins << "+" << nested_auth_pins - << dendl; - - assert(auth_pins >= 0); - - if (parent) - parent->adjust_nested_auth_pins( -1 ); - - if (is_freezing_inode() && - auth_pins == auth_pin_freeze_allowance) { - dout(10) << "auth_unpin freezing!" << dendl; - get(PIN_FROZEN); - put(PIN_FREEZING); - state_clear(STATE_FREEZING); - state_set(STATE_FROZEN); - finish_waiting(WAIT_FROZEN); - } -} - -void CInode::adjust_nested_auth_pins(int a) -{ - if (!parent) return; - nested_auth_pins += a; - - dout(15) << "adjust_nested_auth_pins by " << a - << " now " << auth_pins << "+" << nested_auth_pins - << dendl; - assert(nested_auth_pins >= 0); - - parent->adjust_nested_auth_pins(a); -} - - - -// authority - -pair CInode::authority() -{ - if (force_auth.first >= 0) - return force_auth; - - if (parent) - return parent->dir->authority(); - - return CDIR_AUTH_UNDEF; -} - - -CInodeDiscover* CInode::replicate_to( int rep ) -{ - assert(is_auth()); - - // relax locks? 
- if (!is_replicated()) - replicate_relax_locks(); - - // return the thinger - int nonce = add_replica( rep ); - return new CInodeDiscover( this, nonce ); -} - - - - -// IMPORT/EXPORT - -void CInode::encode_export(bufferlist& bl) -{ - ::_encode_simple(inode, bl); - ::_encode_simple(symlink, bl); - dirfragtree._encode(bl); - - bool dirty = is_dirty(); - ::_encode_simple(dirty, bl); - - ::_encode_simple(pop, bl); - - ::_encode_simple(replica_map, bl); - - map cap_map; - export_client_caps(cap_map); - ::_encode_simple(cap_map, bl); - - authlock._encode(bl); - linklock._encode(bl); - dirfragtreelock._encode(bl); - filelock._encode(bl); - dirlock._encode(bl); - - get(PIN_TEMPEXPORTING); -} - -void CInode::finish_export(utime_t now) -{ - pop.zero(now); - - // just in case! - dirlock.clear_updated(); - - put(PIN_TEMPEXPORTING); -} - -void CInode::decode_import(bufferlist::iterator& p, - set& new_client_caps, - LogSegment *ls) -{ - utime_t old_mtime = inode.mtime; - ::_decode_simple(inode, p); - if (old_mtime > inode.mtime) { - assert(dirlock.is_updated()); - inode.mtime = old_mtime; // preserve our mtime, if it is larger - } - - ::_decode_simple(symlink, p); - dirfragtree._decode(p); - - bool dirty; - ::_decode_simple(dirty, p); - if (dirty) - _mark_dirty(ls); - - ::_decode_simple(pop, p); - - ::_decode_simple(replica_map, p); - if (!replica_map.empty()) get(PIN_REPLICATED); - - map cap_map; - ::_decode_simple(cap_map, p); - merge_client_caps(cap_map, new_client_caps); - - authlock._decode(p); - linklock._decode(p); - dirfragtreelock._decode(p); - filelock._decode(p); - dirlock._decode(p); -} diff --git a/branches/sage/ebofs2/mds/CInode.h b/branches/sage/ebofs2/mds/CInode.h deleted file mode 100644 index 8f453472a0477..0000000000000 --- a/branches/sage/ebofs2/mds/CInode.h +++ /dev/null @@ -1,612 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 
2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __CINODE_H -#define __CINODE_H - -#include "config.h" -#include "include/types.h" -#include "include/lru.h" - -#include "mdstypes.h" - -#include "CDentry.h" -#include "SimpleLock.h" -#include "FileLock.h" -#include "ScatterLock.h" -#include "LocalLock.h" -#include "Capability.h" - - -#include -#include -#include -#include -#include -#include -using namespace std; - -class Context; -class CDentry; -class CDir; -class Message; -class CInode; -class CInodeDiscover; -class MDCache; -class LogSegment; - -ostream& operator<<(ostream& out, CInode& in); - - -// cached inode wrapper -class CInode : public MDSCacheObject { - public: - // -- pins -- - static const int PIN_DIRFRAG = -1; - static const int PIN_CAPS = 2; // client caps - static const int PIN_IMPORTING = -4; // importing - static const int PIN_ANCHORING = 5; - static const int PIN_UNANCHORING = 6; - static const int PIN_OPENINGDIR = 7; - static const int PIN_REMOTEPARENT = 8; - static const int PIN_BATCHOPENJOURNAL = 9; - static const int PIN_SCATTERED = 10; - static const int PIN_STICKYDIRS = 11; - static const int PIN_PURGING = -12; - static const int PIN_FREEZING = 13; - static const int PIN_FROZEN = 14; - - const char *pin_name(int p) { - switch (p) { - case PIN_DIRFRAG: return "dirfrag"; - case PIN_CAPS: return "caps"; - case PIN_IMPORTING: return "importing"; - case PIN_ANCHORING: return "anchoring"; - case PIN_UNANCHORING: return "unanchoring"; - case PIN_OPENINGDIR: return "openingdir"; - case PIN_REMOTEPARENT: return "remoteparent"; - case PIN_BATCHOPENJOURNAL: return "batchopenjournal"; - case PIN_SCATTERED: return "scattered"; - case PIN_STICKYDIRS: return "stickydirs"; - case PIN_FREEZING: return "freezing"; - case PIN_FROZEN: return "frozen"; 
- default: return generic_pin_name(p); - } - } - - // -- state -- - static const int STATE_EXPORTING = (1<<2); // on nonauth bystander. - static const int STATE_ANCHORING = (1<<3); - static const int STATE_UNANCHORING = (1<<4); - static const int STATE_OPENINGDIR = (1<<5); - static const int STATE_REJOINUNDEF = (1<<6); // inode contents undefined. - static const int STATE_FREEZING = (1<<7); - static const int STATE_FROZEN = (1<<8); - static const int STATE_AMBIGUOUSAUTH = (1<<9); - - // -- waiters -- - //static const int WAIT_SLAVEAGREE = (1<<0); - static const int WAIT_DIR = (1<<1); - static const int WAIT_ANCHORED = (1<<2); - static const int WAIT_UNANCHORED = (1<<3); - static const int WAIT_CAPS = (1<<4); - static const int WAIT_FROZEN = (1<<5); - - static const int WAIT_AUTHLOCK_OFFSET = 5; - static const int WAIT_LINKLOCK_OFFSET = 5 + SimpleLock::WAIT_BITS; - static const int WAIT_DIRFRAGTREELOCK_OFFSET = 5 + 2*SimpleLock::WAIT_BITS; - static const int WAIT_FILELOCK_OFFSET = 5 + 3*SimpleLock::WAIT_BITS; - static const int WAIT_DIRLOCK_OFFSET = 5 + 4*SimpleLock::WAIT_BITS; - static const int WAIT_VERSIONLOCK_OFFSET = 5 + 5*SimpleLock::WAIT_BITS; - - static const int WAIT_ANY = 0xffffffff; - - // misc - static const int EXPORT_NONCE = 1; // nonce given to replicas created by export - - ostream& print_db_line_prefix(ostream& out); - - public: - MDCache *mdcache; - - // inode contents proper - inode_t inode; // the inode itself - string symlink; // symlink dest, if symlink - fragtree_t dirfragtree; // dir frag tree, if any. always consistent with our dirfrag map. 
- //map dirfrag_size; // size of each dirfrag - - off_t last_journaled; // log offset for the last time i was journaled - off_t last_open_journaled; // log offset for the last journaled EOpen - - //bool hack_accessed; - //utime_t hack_load_stamp; - - // projected values (only defined while dirty) - list projected_inode; - list projected_dirfragtree; - - version_t get_projected_version() { - if (projected_inode.empty()) - return inode.version; - else - return projected_inode.back()->version; - } - - inode_t *project_inode(); - void pop_and_dirty_projected_inode(LogSegment *ls); - - // -- cache infrastructure -- -private: - map dirfrags; // cached dir fragments - int stickydir_ref; - -public: - frag_t pick_dirfrag(const string &dn); - bool has_dirfrags() { return !dirfrags.empty(); } - CDir* get_dirfrag(frag_t fg) { - if (dirfrags.count(fg)) { - assert(g_conf.debug_mds < 2 || dirfragtree.is_leaf(fg)); // performance hack FIXME - return dirfrags[fg]; - } else - return 0; - } - void get_dirfrags_under(frag_t fg, list& ls); - CDir* get_approx_dirfrag(frag_t fg); - void get_dirfrags(list& ls); - void get_nested_dirfrags(list& ls); - void get_subtree_dirfrags(list& ls); - CDir *get_or_open_dirfrag(MDCache *mdcache, frag_t fg); - CDir *add_dirfrag(CDir *dir); - void close_dirfrag(frag_t fg); - void close_dirfrags(); - bool has_subtree_root_dirfrag(); - - void get_stickydirs(); - void put_stickydirs(); - - protected: - // parent dentries in cache - CDentry *parent; // primary link - set remote_parents; // if hard linked - - pair force_auth; - - // -- distributed state -- -protected: - // file capabilities - map client_caps; // client -> caps - map mds_caps_wanted; // [auth] mds -> caps wanted - int replica_caps_wanted; // [replica] what i've requested from auth - utime_t replica_caps_wanted_keep_until; - - - // LogSegment xlists i (may) belong to - xlist::item xlist_dirty; -public: - xlist::item xlist_open_file; - xlist::item xlist_dirty_inode_mtime; - xlist::item 
xlist_purging_inode; - -private: - // auth pin - int auth_pins; - int nested_auth_pins; -public: - int auth_pin_freeze_allowance; - - public: - inode_load_vec_t pop; - - // friends - friend class Server; - friend class Locker; - friend class Migrator; - friend class MDCache; - friend class CDir; - friend class CInodeExport; - friend class CInodeDiscover; - - public: - // --------------------------- - CInode(MDCache *c, bool auth=true) : - mdcache(c), - last_journaled(0), last_open_journaled(0), - //hack_accessed(true), - stickydir_ref(0), - parent(0), force_auth(CDIR_AUTH_DEFAULT), - replica_caps_wanted(0), - xlist_dirty(this), xlist_open_file(this), - xlist_dirty_inode_mtime(this), xlist_purging_inode(this), - auth_pins(0), nested_auth_pins(0), - versionlock(this, LOCK_OTYPE_IVERSION, WAIT_VERSIONLOCK_OFFSET), - authlock(this, LOCK_OTYPE_IAUTH, WAIT_AUTHLOCK_OFFSET), - linklock(this, LOCK_OTYPE_ILINK, WAIT_LINKLOCK_OFFSET), - dirfragtreelock(this, LOCK_OTYPE_IDIRFRAGTREE, WAIT_DIRFRAGTREELOCK_OFFSET), - filelock(this, LOCK_OTYPE_IFILE, WAIT_FILELOCK_OFFSET), - dirlock(this, LOCK_OTYPE_IDIR, WAIT_DIRLOCK_OFFSET) - { - state = 0; - if (auth) state_set(STATE_AUTH); - }; - ~CInode() { - close_dirfrags(); - } - - - // -- accessors -- - bool is_file() { return inode.is_file(); } - bool is_symlink() { return inode.is_symlink(); } - bool is_dir() { return inode.is_dir(); } - - bool is_anchored() { return inode.anchored; } - bool is_anchoring() { return state_test(STATE_ANCHORING); } - bool is_unanchoring() { return state_test(STATE_UNANCHORING); } - - bool is_root() { return inode.ino == MDS_INO_ROOT; } - bool is_stray() { return MDS_INO_IS_STRAY(inode.ino); } - bool is_base() { return inode.ino < MDS_INO_BASE; } - - // note: this overloads MDSCacheObject - bool is_ambiguous_auth() { - return state_test(STATE_AMBIGUOUSAUTH) || - MDSCacheObject::is_ambiguous_auth(); - } - - - inodeno_t ino() const { return inode.ino; } - inode_t& get_inode() { return inode; } - CDentry* 
get_parent_dn() { return parent; } - CDir *get_parent_dir(); - CInode *get_parent_inode(); - - bool is_lt(const MDSCacheObject *r) const { - return ino() < ((CInode*)r)->ino(); - } - - // -- misc -- - void make_path(string& s); - void make_anchor_trace(vector& trace); - void name_stray_dentry(string& dname); - - - - // -- dirtyness -- - version_t get_version() { return inode.version; } - - version_t pre_dirty(); - void _mark_dirty(LogSegment *ls); - void mark_dirty(version_t projected_dirv, LogSegment *ls); - void mark_clean(); - - - CInodeDiscover* replicate_to(int rep); - - - // -- waiting -- - void add_waiter(int tag, Context *c); - - - // -- import/export -- - void encode_export(bufferlist& bl); - void finish_export(utime_t now); - void abort_export() { - put(PIN_TEMPEXPORTING); - } - void decode_import(bufferlist::iterator& p, - set& new_client_caps, - LogSegment *ls); - - - // -- locks -- -public: - LocalLock versionlock; - SimpleLock authlock; - SimpleLock linklock; - ScatterLock dirfragtreelock; - FileLock filelock; - ScatterLock dirlock; - - - SimpleLock* get_lock(int type) { - switch (type) { - case LOCK_OTYPE_IFILE: return &filelock; - case LOCK_OTYPE_IAUTH: return &authlock; - case LOCK_OTYPE_ILINK: return &linklock; - case LOCK_OTYPE_IDIRFRAGTREE: return &dirfragtreelock; - case LOCK_OTYPE_IDIR: return &dirlock; - default: assert(0); return 0; - } - } - void set_object_info(MDSCacheObjectInfo &info); - void encode_lock_state(int type, bufferlist& bl); - void decode_lock_state(int type, bufferlist& bl); - - void clear_dirty_scattered(int type); - - // -- caps -- (new) - // client caps - bool is_any_caps() { return !client_caps.empty(); } - map& get_client_caps() { return client_caps; } - void add_client_cap(int client, Capability& cap) { - if (client_caps.empty()) - get(PIN_CAPS); - assert(client_caps.count(client) == 0); - client_caps[client] = cap; - } - void remove_client_cap(int client) { - assert(client_caps.count(client) == 1); - 
client_caps.erase(client); - if (client_caps.empty()) - put(PIN_CAPS); - } - Capability* get_client_cap(int client) { - if (client_caps.count(client)) - return &client_caps[client]; - return 0; - } - void reconnect_cap(int client, inode_caps_reconnect_t& icr) { - Capability *cap = get_client_cap(client); - if (cap) { - cap->merge(icr.wanted, icr.issued); - } else { - Capability newcap(icr.wanted, 0); - newcap.issue(icr.issued); - add_client_cap(client, newcap); - } - inode.size = MAX(inode.size, icr.size); - inode.mtime = MAX(inode.mtime, icr.mtime); - inode.atime = MAX(inode.atime, icr.atime); - } - /* - void set_client_caps(map& cl) { - if (client_caps.empty() && !cl.empty()) - get(PIN_CAPS); - client_caps.clear(); - client_caps = cl; - } - */ - void clear_client_caps() { - if (!client_caps.empty()) - put(PIN_CAPS); - client_caps.clear(); - } - void export_client_caps(map& cl) { - for (map::iterator it = client_caps.begin(); - it != client_caps.end(); - it++) { - cl[it->first] = it->second.make_export(); - } - } - void merge_client_caps(map& cl, set& new_client_caps) { - if (client_caps.empty() && !cl.empty()) - get(PIN_CAPS); - - for (map::iterator it = cl.begin(); - it != cl.end(); - it++) { - new_client_caps.insert(it->first); - if (client_caps.count(it->first)) { - // merge - client_caps[it->first].merge(it->second); - } else { - // new - client_caps[it->first] = Capability(it->second); - } - } - } - - // caps issued, wanted - int get_caps_issued() { - int c = 0; - for (map::iterator it = client_caps.begin(); - it != client_caps.end(); - it++) - c |= it->second.issued(); - return c; - } - int get_caps_wanted() { - int w = 0; - for (map::iterator it = client_caps.begin(); - it != client_caps.end(); - it++) { - w |= it->second.wanted(); - //cout << " get_caps_wanted client " << it->first << " " << cap_string(it->second.wanted()) << endl; - } - if (is_auth()) - for (map::iterator it = mds_caps_wanted.begin(); - it != mds_caps_wanted.end(); - it++) { - w |= 
it->second; - //cout << " get_caps_wanted mds " << it->first << " " << cap_string(it->second) << endl; - } - return w; - } - - - void replicate_relax_locks() { - //dout(10) << " relaxing locks on " << *this << dendl; - assert(is_auth()); - assert(!is_replicated()); - - authlock.replicate_relax(); - linklock.replicate_relax(); - dirfragtreelock.replicate_relax(); - - if (get_caps_issued() & (CAP_FILE_WR|CAP_FILE_WRBUFFER) == 0) - filelock.replicate_relax(); - - dirlock.replicate_relax(); - } - - - // -- authority -- - pair authority(); - - - // -- auth pins -- - int is_auth_pinned() { - return auth_pins; - } - void adjust_nested_auth_pins(int a); - bool can_auth_pin(); - void auth_pin(); - void auth_unpin(); - - - // -- freeze -- - bool is_freezing_inode() { return state_test(STATE_FREEZING); } - bool is_frozen_inode() { return state_test(STATE_FROZEN); } - bool is_frozen(); - bool is_frozen_dir(); - bool is_freezing(); - - bool freeze_inode(int auth_pin_allowance=0); - void unfreeze_inode(list& finished); - - - // -- reference counting -- - void bad_put(int by) { - generic_dout(7) << " bad put " << *this << " by " << by << " " << pin_name(by) << " was " << ref << " (" << ref_set << ")" << dendl; -#ifdef MDS_REF_SET - assert(ref_set.count(by) == 1); -#endif - assert(ref > 0); - } - void bad_get(int by) { - generic_dout(7) << " bad get " << *this << " by " << by << " " << pin_name(by) << " was " << ref << " (" << ref_set << ")" << dendl; -#ifdef MDS_REF_SET - assert(ref_set.count(by) == 0); -#endif - } - void first_get(); - void last_put(); - - - // -- hierarchy stuff -- -public: - void set_primary_parent(CDentry *p) { - assert(parent == 0); - parent = p; - } - void remove_primary_parent(CDentry *dn) { - assert(dn == parent); - parent = 0; - } - void add_remote_parent(CDentry *p); - void remove_remote_parent(CDentry *p); - int num_remote_parents() { - return remote_parents.size(); - } - - - /* - // for giving to clients - void get_dist_spec(set& ls, int auth, 
timepair_t& now) { - if (( is_dir() && popularity[MDS_POP_CURDOM].get(now) > g_conf.mds_bal_replicate_threshold) || - (!is_dir() && popularity[MDS_POP_JUSTME].get(now) > g_conf.mds_bal_replicate_threshold)) { - //if (!cached_by.empty() && inode.ino > 1) dout(1) << "distributed spec for " << *this << dendl; - ls = cached_by; - } - } - */ - - void print(ostream& out); - -}; - - - - -// -- encoded state - -// discover - -class CInodeDiscover { - - inode_t inode; - string symlink; - fragtree_t dirfragtree; - - int replica_nonce; - - int authlock_state; - int linklock_state; - int dirfragtreelock_state; - int filelock_state; - int dirlock_state; - - public: - CInodeDiscover() {} - CInodeDiscover(CInode *in, int nonce) { - inode = in->inode; - symlink = in->symlink; - dirfragtree = in->dirfragtree; - - replica_nonce = nonce; - - authlock_state = in->authlock.get_replica_state(); - linklock_state = in->linklock.get_replica_state(); - dirfragtreelock_state = in->dirfragtreelock.get_replica_state(); - filelock_state = in->filelock.get_replica_state(); - dirlock_state = in->dirlock.get_replica_state(); - } - - inodeno_t get_ino() { return inode.ino; } - int get_replica_nonce() { return replica_nonce; } - - void update_inode(CInode *in) { - in->inode = inode; - in->symlink = symlink; - in->dirfragtree = dirfragtree; - in->replica_nonce = replica_nonce; - } - void init_inode_locks(CInode *in) { - in->authlock.set_state(authlock_state); - in->linklock.set_state(linklock_state); - in->dirfragtreelock.set_state(dirfragtreelock_state); - in->filelock.set_state(filelock_state); - in->dirlock.set_state(dirlock_state); - } - - void _encode(bufferlist& bl) { - ::_encode(inode, bl); - ::_encode(symlink, bl); - dirfragtree._encode(bl); - ::_encode(replica_nonce, bl); - ::_encode(authlock_state, bl); - ::_encode(linklock_state, bl); - ::_encode(dirfragtreelock_state, bl); - ::_encode(filelock_state, bl); - ::_encode(dirlock_state, bl); - } - - void _decode(bufferlist& bl, int& off) { - 
::_decode(inode, bl, off); - ::_decode(symlink, bl, off); - dirfragtree._decode(bl, off); - ::_decode(replica_nonce, bl, off); - ::_decode(authlock_state, bl, off); - ::_decode(linklock_state, bl, off); - ::_decode(dirfragtreelock_state, bl, off); - ::_decode(filelock_state, bl, off); - ::_decode(dirlock_state, bl, off); - } - -}; - - - -#endif diff --git a/branches/sage/ebofs2/mds/Capability.h b/branches/sage/ebofs2/mds/Capability.h deleted file mode 100644 index d7619d13ca156..0000000000000 --- a/branches/sage/ebofs2/mds/Capability.h +++ /dev/null @@ -1,245 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __CAPABILITY_H -#define __CAPABILITY_H - -#include "include/buffer.h" - -#include -using namespace std; - -#include "config.h" - - -// definite caps -#define CAP_FILE_RDCACHE 1 // client can safely cache reads -#define CAP_FILE_RD 2 // client can read -#define CAP_FILE_WR 4 // client can write -#define CAP_FILE_WREXTEND 8 // client can extend file -#define CAP_FILE_WRBUFFER 16 // client can safely buffer writes -#define CAP_FILE_LAZYIO 32 // client can perform lazy io - - -// heuristics -//#define CAP_FILE_DELAYFLUSH 32 - -inline string cap_string(int cap) -{ - string s; - s = "["; - if (cap & CAP_FILE_RDCACHE) s += " rdcache"; - if (cap & CAP_FILE_RD) s += " rd"; - if (cap & CAP_FILE_WR) s += " wr"; - if (cap & CAP_FILE_WRBUFFER) s += " wrbuffer"; - if (cap & CAP_FILE_WRBUFFER) s += " wrextend"; - if (cap & CAP_FILE_LAZYIO) s += " lazyio"; - s += " ]"; - return s; -} - -typedef uint32_t capseq_t; - -class Capability { -public: - struct Export { - int wanted; - int issued; - int 
pending; - Export() {} - Export(int w, int i, int p) : wanted(w), issued(i), pending(p) {} - }; - -private: - int wanted_caps; // what the client wants (ideally) - - map cap_history; // seq -> cap - capseq_t last_sent, last_recv; - - bool suppress; - -public: - Capability(int want=0, capseq_t s=0) : - wanted_caps(want), - last_sent(s), - last_recv(s), - suppress(false) { - } - Capability(Export& other) : - wanted_caps(other.wanted), - last_sent(0), last_recv(0) { - // issued vs pending - if (other.issued & ~other.pending) - issue(other.issued); - issue(other.pending); - } - - bool is_suppress() { return suppress; } - void set_suppress(bool b) { suppress = b; } - - bool is_null() { return cap_history.empty() && wanted_caps == 0; } - - // most recently issued caps. - int pending() { - if (cap_history.count(last_sent)) - return cap_history[ last_sent ]; - return 0; - } - - // caps client has confirmed receipt of - int confirmed() { - if (cap_history.count(last_recv)) - return cap_history[ last_recv ]; - return 0; - } - - // caps potentially issued - int issued() { - int c = 0; - for (capseq_t seq = last_recv; seq <= last_sent; seq++) { - if (cap_history.count(seq)) { - c |= cap_history[seq]; - generic_dout(10) << " cap issued: seq " << seq << " " << cap_string(cap_history[seq]) << " -> " << cap_string(c) << dendl; - } - } - return c; - } - - // caps this client wants to hold - int wanted() { return wanted_caps; } - void set_wanted(int w) { - wanted_caps = w; - } - - // needed - static int needed(int from) { - // strip out wrbuffer, rdcache - return from & (CAP_FILE_WR|CAP_FILE_RD); - } - int needed() { return needed(wanted_caps); } - - // conflicts - static int conflicts(int from) { - int c = 0; - if (from & CAP_FILE_WRBUFFER) c |= CAP_FILE_RDCACHE|CAP_FILE_RD; - if (from & CAP_FILE_WR) c |= CAP_FILE_RDCACHE; - if (from & CAP_FILE_RD) c |= CAP_FILE_WRBUFFER; - if (from & CAP_FILE_RDCACHE) c |= CAP_FILE_WRBUFFER|CAP_FILE_WR; - return c; - } - int wanted_conflicts() { 
return conflicts(wanted()); } - int needed_conflicts() { return conflicts(needed()); } - int issued_conflicts() { return conflicts(issued()); } - - // issue caps; return seq number. - capseq_t issue(int c) { - //int was = pending(); - //no! if (c == was && last_sent) return -1; // repeat of previous? - - ++last_sent; - cap_history[last_sent] = c; - - /* no! - // not recalling, just adding? - if (c & ~was && - cap_history.count(last_sent-1)) { - cap_history.erase(last_sent-1); - } - */ - return last_sent; - } - capseq_t get_last_seq() { return last_sent; } - - Export make_export() { - return Export(wanted_caps, issued(), pending()); - } - void merge(Export& other) { - // issued + pending - int newpending = other.pending | pending(); - if (other.issued & ~newpending) - issue(other.issued | newpending); - issue(newpending); - - // wanted - wanted_caps = wanted_caps | other.wanted; - } - void merge(int otherwanted, int otherissued) { - // issued + pending - int newpending = pending(); - if (otherissued & ~newpending) - issue(otherissued | newpending); - issue(newpending); - - // wanted - wanted_caps = wanted_caps | otherwanted; - } - - // confirm receipt of a previous sent/issued seq. - int confirm_receipt(capseq_t seq, int caps) { - int r = 0; - - // old seqs - while (last_recv < seq) { - generic_dout(10) << " cap.confirm_receipt forgetting seq " << last_recv << " " << cap_string(cap_history[last_recv]) << dendl; - r |= cap_history[last_recv]; - cap_history.erase(last_recv); - ++last_recv; - } - - // release current? - if (cap_history.count(seq) && - cap_history[seq] != caps) { - generic_dout(10) << " cap.confirm_receipt revising seq " << seq << " " << cap_string(cap_history[seq]) << " -> " << cap_string(caps) << dendl; - // note what we're releasing.. - assert(cap_history[seq] & ~caps); - r |= cap_history[seq] & ~caps; - - cap_history[seq] = caps; // confirmed() now less than before.. - } - - // null? 
- if (caps == 0 && - cap_history.size() == 1 && - cap_history.count(seq)) { - cap_history.clear(); // viola, null! - } - - return r; - } - - // serializers - void _encode(bufferlist& bl) { - bl.append((char*)&wanted_caps, sizeof(wanted_caps)); - bl.append((char*)&last_sent, sizeof(last_sent)); - bl.append((char*)&last_recv, sizeof(last_recv)); - ::_encode(cap_history, bl); - } - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(wanted_caps), (char*)&wanted_caps); - off += sizeof(wanted_caps); - bl.copy(off, sizeof(last_sent), (char*)&last_sent); - off += sizeof(last_sent); - bl.copy(off, sizeof(last_recv), (char*)&last_recv); - off += sizeof(last_recv); - ::_decode(cap_history, bl, off); - } - -}; - - - - - -#endif diff --git a/branches/sage/ebofs2/mds/ClientMap.cc b/branches/sage/ebofs2/mds/ClientMap.cc deleted file mode 100644 index 1d781b9ba48c3..0000000000000 --- a/branches/sage/ebofs2/mds/ClientMap.cc +++ /dev/null @@ -1,126 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#define DBLEVEL 20 - -#include "include/types.h" - -#include "MDS.h" -#include "ClientMap.h" - -#include "osdc/Filer.h" - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".clientmap " - - - -void ClientMap::init_inode() -{ - memset(&inode, 0, sizeof(inode)); - inode.ino = MDS_INO_CLIENTMAP_OFFSET + mds->get_nodeid(); - inode.layout = g_OSD_FileLayout; -} - - -// ---------------- -// LOAD - -class C_CM_Load : public Context { - ClientMap *clientmap; -public: - bufferlist bl; - C_CM_Load(ClientMap *cm) : clientmap(cm) {} - void finish(int r) { - clientmap->_load_finish(bl); - } -}; - -void ClientMap::load(Context *onload) -{ - dout(10) << "load" << dendl; - - init_inode(); - - if (onload) - waiting_for_load.push_back(onload); - - C_CM_Load *c = new C_CM_Load(this); - mds->filer->read(inode, - 0, inode.layout.fl_stripe_unit, - &c->bl, - c); - -} - -void ClientMap::_load_finish(bufferlist &bl) -{ - int off = 0; - decode(bl, off); - dout(10) << "_load_finish v " << version - << ", " << client_inst.size() << " clients, " - << bl.length() << " bytes" - << dendl; - projected = committing = committed = version; - finish_contexts(waiting_for_load); -} - - -// ---------------- -// SAVE - -class C_CM_Save : public Context { - ClientMap *clientmap; - version_t version; -public: - C_CM_Save(ClientMap *cm, version_t v) : clientmap(cm), version(v) {} - void finish(int r) { - clientmap->_save_finish(version); - } -}; - -void ClientMap::save(Context *onsave, version_t needv) -{ - dout(10) << "save needv " << needv << ", v " << version << dendl; - - if (needv && committing >= needv) { - assert(committing > committed); - commit_waiters[committing].push_back(onsave); - return; - } - - commit_waiters[version].push_back(onsave); - - bufferlist bl; - - init_inode(); - encode(bl); - committing = version; - mds->filer->write(inode, - 0, bl.length(), bl, - 0, - 0, new C_CM_Save(this, 
version)); -} - -void ClientMap::_save_finish(version_t v) -{ - dout(10) << "_save_finish v" << v << dendl; - committed = v; - - finish_contexts(commit_waiters[v]); - commit_waiters.erase(v); -} diff --git a/branches/sage/ebofs2/mds/ClientMap.h b/branches/sage/ebofs2/mds/ClientMap.h deleted file mode 100644 index c36e66d240a33..0000000000000 --- a/branches/sage/ebofs2/mds/ClientMap.h +++ /dev/null @@ -1,194 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __CLIENTMAP_H -#define __CLIENTMAP_H - -#include "msg/Message.h" - -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "include/Context.h" -#include "mdstypes.h" - -class MDS; - -/* - * this structure is used by the MDS purely so that - * it can remember client addresses (entity_inst_t) - * for clients with an active session. - * - * it is also used to keep track of recently completed - * operations, should the client have to resubmit them - * (after a connection failure, etc.) 
- */ -class ClientMap { -private: - MDS *mds; - - version_t version; - version_t projected; - version_t committing; - version_t committed; - map > commit_waiters; - -public: - version_t get_version() { return version; } - version_t get_projected() { return projected; } - version_t get_committing() { return committing; } - version_t get_committed() { return committed; } - - version_t inc_projected() { return ++projected; } - void reset_projected() { projected = version; } - void set_committing(version_t v) { committing = v; } - void set_committed(version_t v) { committed = v; } - -private: - // affects version - hash_map client_inst; - - // does not affect version - set sessions; - set opening; - set closing; - -public: - bool empty() { - return client_inst.empty(); - } - - const entity_inst_t& get_inst(int client) { - assert(client_inst.count(client)); - return client_inst[client]; - } - const set& get_session_set() { return sessions; } - - bool is_opening(int c) { return opening.count(c); } - void add_opening(int c) { opening.insert(c); } - bool is_closing(int c) { return closing.count(c); } - void add_closing(int c) { closing.insert(c); } - bool have_session(int client) { - return client_inst.count(client); - } - void open_session(const entity_inst_t& inst) { - opening.erase(inst.name.num()); - client_inst[inst.name.num()] = inst; - sessions.insert(inst.name.num()); - version++; - } - void close_session(int client) { - closing.erase(client); - sessions.erase(client); - client_inst.erase(client); - version++; - } - -private: - // -- push sequence -- - hash_map client_push_seq; // seq # for messages pushed to client. 
- -public: - version_t inc_push_seq(int client) { - return ++client_push_seq[client]; - } - version_t get_push_seq(int client) { - return client_push_seq[client]; - } - - -private: - // -- completed requests -- - // client id -> tid -> result code - map > completed_requests; // completed client requests - map > waiting_for_trim; - version_t requestmapv; - -public: - void add_completed_request(metareqid_t ri) { - completed_requests[ri.client].insert(ri.tid); - requestmapv++; - } - void trim_completed_requests(int client, - tid_t mintid) { // zero means trim all! - map >::iterator p = completed_requests.find(client); - if (p == completed_requests.end()) - return; - - // trim - while (!p->second.empty() && (mintid == 0 || *p->second.begin() < mintid)) - p->second.erase(p->second.begin()); - if (p->second.empty()) - completed_requests.erase(p); - - // kick waiters - map >::iterator q = waiting_for_trim.find(client); - if (q != waiting_for_trim.end()) { - list fls; - while (!q->second.empty() && - (mintid == 0 || q->second.begin()->first < mintid)) { - fls.push_back(q->second.begin()->second); - q->second.erase(q->second.begin()); - } - if (q->second.empty()) - waiting_for_trim.erase(q); - finish_contexts(fls); - } - } - void add_trim_waiter(metareqid_t ri, Context *c) { - waiting_for_trim[ri.client][ri.tid] = c; - } - bool have_completed_request(metareqid_t ri) { - return completed_requests.count(ri.client) && - completed_requests[ri.client].count(ri.tid); - } - - - - ClientMap(MDS *m) : mds(m), - version(0), projected(0), committing(0), committed(0), - requestmapv(0) {} - - - // -- encoding -- - void encode(bufferlist& bl) { - bl.append((char*)&version, sizeof(version)); - ::_encode(client_inst, bl); - ::_encode(sessions, bl); - } - void decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(version), (char*)&version); - off += sizeof(version); - ::_decode(client_inst, bl, off); - ::_decode(sessions, bl, off); - - projected = committing = committed = version; - } - 
- - // -- loading, saving -- - inode_t inode; - list waiting_for_load; - - void init_inode(); - void load(Context *onload); - void _load_finish(bufferlist &bl); - void save(Context *onsave, version_t needv=0); - void _save_finish(version_t v); -}; - -#endif diff --git a/branches/sage/ebofs2/mds/FileLock.h b/branches/sage/ebofs2/mds/FileLock.h deleted file mode 100644 index 09868f7563fb6..0000000000000 --- a/branches/sage/ebofs2/mds/FileLock.h +++ /dev/null @@ -1,227 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __FILELOCK_H -#define __FILELOCK_H - -#include -#include -using namespace std; - -#include "include/buffer.h" - -#include "SimpleLock.h" -#include "Capability.h" - -// states and such. -// C = cache reads, R = read, W = write, A = append, B = buffer writes, L = lazyio - -// -----auth-------- ---replica------- -#define LOCK_SYNC_ 1 // AR R . / C R . . . L R . / C R . . . L stat() -#define LOCK_GSYNCL -12 // A . . / C ? . . . L loner -> sync (*) -#define LOCK_GSYNCM -13 // A . . / . R . . . L - -#define LOCK_LOCK_ 2 // AR R W / C . . . . . . . / C . . . . . truncate() -#define LOCK_GLOCKR_ -3 // AR R . / C . . . . . . . / C . . . . . -#define LOCK_GLOCKL -4 // A . . / C . . . . . loner -> lock -#define LOCK_GLOCKM -5 // A . . / . . . . . . - -#define LOCK_MIXED 6 // AR . . / . R W A . L . . / . R . . . L -#define LOCK_GMIXEDR -7 // AR R . / . R . . . L . . / . R . . . L -#define LOCK_GMIXEDL -8 // A . . / . . . . . L loner -> mixed - -#define LOCK_LONER 9 // A . . / C R W A B L (lock) -#define LOCK_GLONERR -10 // A . . / . R . . . L -#define LOCK_GLONERM -11 // A . . / . 
R W A . L - -// (*) FIXME: how to let old loner keep R, somehow, during GSYNCL - -// 4 stable -// +9 transition -// 13 total - -inline const char *get_filelock_state_name(int n) { - switch (n) { - case LOCK_SYNC: return "sync"; - case LOCK_GSYNCL: return "gsyncl"; - case LOCK_GSYNCM: return "gsyncm"; - case LOCK_LOCK: return "lock"; - case LOCK_GLOCKR: return "glockr"; - case LOCK_GLOCKL: return "glockl"; - case LOCK_GLOCKM: return "glockm"; - case LOCK_MIXED: return "mixed"; - case LOCK_GMIXEDR: return "gmixedr"; - case LOCK_GMIXEDL: return "gmixedl"; - case LOCK_LONER: return "loner"; - case LOCK_GLONERR: return "glonerr"; - case LOCK_GLONERM: return "glonerm"; - default: assert(0); return 0; - } -} - - -/* no append scenarios: - -loner + truncate(): - - loner needs to lose A (?unless it's the loner doing the truncate?) -loner + statlite(size): - - loner needs to lose A - -any + statlite(size) - - all lose A - -any + statlite(mtime) - - all lose W - --> we need to add lonerfixed and mixedfixed states (and associated transitions) - in order to efficiently support statlite(size) and truncate(). until then, - we have to LOCK. - - */ - -// -- lock... hard or file - -class MDRequest; - -class FileLock : public SimpleLock { - public: - FileLock(MDSCacheObject *o, int t, int wo) : SimpleLock(o, t, wo) { } - - int get_replica_state() { - switch (state) { - case LOCK_LOCK: - case LOCK_GLOCKM: - case LOCK_GLOCKL: - case LOCK_GLOCKR: - case LOCK_LONER: - case LOCK_GLONERR: - case LOCK_GLONERM: - return LOCK_LOCK; - case LOCK_MIXED: - case LOCK_GMIXEDR: - return LOCK_MIXED; - case LOCK_SYNC: - return LOCK_SYNC; - - // after gather auth will bc LOCK_AC_MIXED or whatever - case LOCK_GSYNCM: - return LOCK_MIXED; - case LOCK_GSYNCL: - case LOCK_GMIXEDL: // ** LOCK isn't exact right state, but works. 
- return LOCK_LOCK; - - default: - assert(0); - } - return 0; - } - void export_twiddle() { - clear_gather(); - state = get_replica_state(); - } - - // read/write access - bool can_rdlock(MDRequest *mdr) { - if (!parent->is_auth()) return (state == LOCK_SYNC); - //if (state == LOCK_LOCK && mdr && xlock_by == mdr) return true; - if (state == LOCK_LOCK && !xlock_by) return true; - return - (state == LOCK_SYNC) || - (state == LOCK_GMIXEDR) || - (state == LOCK_GLOCKR); - } - bool can_rdlock_soon() { - if (parent->is_auth()) - return (state == LOCK_GLOCKL); - else - return false; - } - bool can_xlock_soon() { - if (parent->is_auth()) - return (state == LOCK_GLOCKR) || (state == LOCK_GLOCKL) - || (state == LOCK_GLOCKM); - else - return false; - } - - // client caps allowed - int caps_allowed_ever() { - if (parent->is_auth()) - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_WRBUFFER | CAP_FILE_LAZYIO; - else - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO; - } - int caps_allowed() { - if (parent->is_auth()) - switch (state) { - case LOCK_SYNC: - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO; - case LOCK_LOCK: - case LOCK_GLOCKR: - case LOCK_GLOCKL: - return CAP_FILE_RDCACHE; - - case LOCK_GLOCKM: - return 0; - - case LOCK_MIXED: - return CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_LAZYIO; - case LOCK_GMIXEDR: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - case LOCK_GMIXEDL: - return 0; - - case LOCK_LONER: // single client writer, of course. 
- return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_WRBUFFER | CAP_FILE_LAZYIO; - case LOCK_GLONERR: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - case LOCK_GLONERM: - return CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_LAZYIO; - - case LOCK_GSYNCL: - return CAP_FILE_RDCACHE | CAP_FILE_LAZYIO; - case LOCK_GSYNCM: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - } - else - switch (state) { - case LOCK_SYNC: - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO; - case LOCK_LOCK: - case LOCK_GLOCKR: - return CAP_FILE_RDCACHE; - case LOCK_GMIXEDR: - case LOCK_MIXED: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - } - assert(0); - return 0; - } - - void print(ostream& out) { - out << "("; - out << get_lock_type_name(get_type()) << " "; - out << get_filelock_state_name(get_state()); - if (!get_gather_set().empty()) out << " g=" << get_gather_set(); - if (is_rdlocked()) - out << " r=" << get_num_rdlocks(); - if (is_xlocked()) - out << " x=" << get_xlocked_by(); - out << ")"; - } -}; - - -#endif diff --git a/branches/sage/ebofs2/mds/IdAllocator.cc b/branches/sage/ebofs2/mds/IdAllocator.cc deleted file mode 100644 index 36a36ea9eb037..0000000000000 --- a/branches/sage/ebofs2/mds/IdAllocator.cc +++ /dev/null @@ -1,205 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#define DBLEVEL 20 - -#include "IdAllocator.h" -#include "MDS.h" -#include "MDLog.h" - -#include "osdc/Filer.h" - -#include "include/types.h" - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".idalloc: " - - -void IdAllocator::init_inode() -{ - memset(&inode, 0, sizeof(inode)); - inode.ino = MDS_INO_IDS_OFFSET + mds->get_nodeid(); - inode.layout = g_OSD_FileLayout; -} - - -inodeno_t IdAllocator::alloc_id() -{ - assert(is_active()); - - // pick one - inodeno_t id = free.start(); - free.erase(id); - dout(10) << "idalloc " << this << ": alloc id " << id << dendl; - - version++; - - // log it - /* - if (!replay) - mds->mdlog->submit_entry(new EAlloc(IDTYPE_INO, id, EALLOC_EV_ALLOC, version)); - */ - - return id; -} - -void IdAllocator::reclaim_id(inodeno_t id) -{ - assert(is_active()); - - dout(10) << "idalloc " << this << ": reclaim id " << id << dendl; - free.insert(id); - - version++; - - /* - if (!replay) - mds->mdlog->submit_entry(new EAlloc(IDTYPE_INO, id, EALLOC_EV_FREE, version)); - */ -} - - - -class C_ID_Save : public Context { - IdAllocator *ida; - version_t version; -public: - C_ID_Save(IdAllocator *i, version_t v) : ida(i), version(v) {} - void finish(int r) { - ida->save_2(version); - } -}; - -void IdAllocator::save(Context *onfinish, version_t v) -{ - if (v > 0 && v <= committing_version) { - dout(10) << "save v " << version << " - already saving " - << committing_version << " >= needed " << v << dendl; - waitfor_save[v].push_back(onfinish); - return; - } - - dout(10) << "save v " << version << dendl; - assert(is_active()); - - bufferlist bl; - - bl.append((char*)&version, sizeof(version)); - ::_encode(free.m, bl); - - committing_version = version; - - if (onfinish) - waitfor_save[version].push_back(onfinish); - - // write (async) - mds->filer->write(inode, - 0, bl.length(), bl, - 0, - 0, new C_ID_Save(this, version)); -} - -void 
IdAllocator::save_2(version_t v) -{ - dout(10) << "save_2 v " << v << dendl; - - committed_version = v; - - list ls; - while (!waitfor_save.empty()) { - if (waitfor_save.begin()->first > v) break; - ls.splice(ls.end(), waitfor_save.begin()->second); - waitfor_save.erase(waitfor_save.begin()); - } - finish_contexts(ls,0); -} - - -void IdAllocator::reset() -{ - init_inode(); - - // use generic range. FIXME THIS IS CRAP - free.clear(); -#ifdef __LP64__ - uint64_t start = (uint64_t)(mds->get_nodeid()+1) << 40; - uint64_t end = ((uint64_t)(mds->get_nodeid()+2) << 40) - 1; -#else -# warning this looks like a 32-bit system, using small inode numbers. - uint64_t start = (uint64_t)(mds->get_nodeid()+1) << 25; - uint64_t end = ((uint64_t)(mds->get_nodeid()+2) << 25) - 1; -#endif - free.insert(start, end); - - state = STATE_ACTIVE; -} - - - -// ----------------------- - -class C_ID_Load : public Context { -public: - IdAllocator *ida; - Context *onfinish; - bufferlist bl; - C_ID_Load(IdAllocator *i, Context *o) : ida(i), onfinish(o) {} - void finish(int r) { - ida->load_2(r, bl, onfinish); - } -}; - -void IdAllocator::load(Context *onfinish) -{ - dout(10) << "load" << dendl; - - init_inode(); - - assert(is_undef()); - state = STATE_OPENING; - - C_ID_Load *c = new C_ID_Load(this, onfinish); - mds->filer->read(inode, - 0, inode.layout.fl_stripe_unit, - &c->bl, - c); -} - -void IdAllocator::load_2(int r, bufferlist& bl, Context *onfinish) -{ - assert(is_opening()); - state = STATE_ACTIVE; - - if (r > 0) { - dout(10) << "load_2 got " << bl.length() << " bytes" << dendl; - int off = 0; - bl.copy(off, sizeof(version), (char*)&version); - off += sizeof(version); - ::_decode(free.m, bl, off); - committed_version = version; - } - else { - dout(10) << "load_2 found no alloc file" << dendl; - assert(0); // this shouldn't happen if mkfs finished. 
- reset(); - } - - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } -} diff --git a/branches/sage/ebofs2/mds/IdAllocator.h b/branches/sage/ebofs2/mds/IdAllocator.h deleted file mode 100644 index 51001f2236627..0000000000000 --- a/branches/sage/ebofs2/mds/IdAllocator.h +++ /dev/null @@ -1,78 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __IDALLOCATOR_H -#define __IDALLOCATOR_H - -#include "include/types.h" -#include "include/interval_set.h" -#include "include/buffer.h" -#include "include/Context.h" - -class MDS; - -class IdAllocator { - MDS *mds; - inode_t inode; - - static const int STATE_UNDEF = 0; - static const int STATE_OPENING = 1; - static const int STATE_ACTIVE = 2; - //static const int STATE_COMMITTING = 3; - int state; - - version_t version, committing_version, committed_version; - - interval_set free; // unused ids - - map > waitfor_save; - - public: - IdAllocator(MDS *m) : - mds(m), - state(STATE_UNDEF), - version(0), committing_version(0), committed_version(0) - { - } - - void init_inode(); - - // alloc or reclaim ids - inodeno_t alloc_id(); - void reclaim_id(inodeno_t ino); - - version_t get_version() { return version; } - version_t get_committed_version() { return committed_version; } - version_t get_committing_version() { return committing_version; } - - // load/save from disk (hack) - bool is_undef() { return state == STATE_UNDEF; } - bool is_active() { return state == STATE_ACTIVE; } - bool is_opening() { return state == STATE_OPENING; } - - void reset(); - void save(Context *onfinish=0, version_t need=0); - void save_2(version_t v); - - void 
shutdown() { - if (is_active()) save(0); - } - - void load(Context *onfinish); - void load_2(int, bufferlist&, Context *onfinish); - -}; - -#endif diff --git a/branches/sage/ebofs2/mds/LocalLock.h b/branches/sage/ebofs2/mds/LocalLock.h deleted file mode 100644 index 752fdcb4d3fd1..0000000000000 --- a/branches/sage/ebofs2/mds/LocalLock.h +++ /dev/null @@ -1,61 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __LOCALLOCK_H -#define __LOCALLOCK_H - -#include "SimpleLock.h" - -class LocalLock : public SimpleLock { -protected: - int num_wrlock; - -public: - LocalLock(MDSCacheObject *o, int t, int wo) : - SimpleLock(o, t, wo), - num_wrlock(0) { - set_state(LOCK_LOCK); // always. 
- } - - bool can_wrlock() { - return !is_xlocked(); - } - void get_wrlock() { - assert(can_wrlock()); - if (num_wrlock == 0) parent->get(MDSCacheObject::PIN_LOCK); - ++num_wrlock; - } - void put_wrlock() { - --num_wrlock; - if (num_wrlock == 0) parent->put(MDSCacheObject::PIN_LOCK); - } - bool is_wrlocked() { return num_wrlock > 0; } - int get_num_wrlocks() { return num_wrlock; } - - - void print(ostream& out) { - out << "("; - out << get_lock_type_name(get_type()); - if (is_xlocked()) - out << " x=" << get_xlocked_by(); - if (is_wrlocked()) - out << " wr=" << get_num_wrlocks(); - out << ")"; - } - -}; - - -#endif diff --git a/branches/sage/ebofs2/mds/Locker.cc b/branches/sage/ebofs2/mds/Locker.cc deleted file mode 100644 index 10b7adc0d0eaf..0000000000000 --- a/branches/sage/ebofs2/mds/Locker.cc +++ /dev/null @@ -1,2900 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#include "MDS.h" -#include "MDCache.h" -#include "Locker.h" -#include "CInode.h" -#include "CDir.h" -#include "CDentry.h" - -#include "MDLog.h" -#include "MDSMap.h" - -#include "include/filepath.h" - -#include "events/EString.h" -#include "events/EUpdate.h" - -#include "msg/Messenger.h" - -#include "messages/MGenericMessage.h" -#include "messages/MDiscover.h" -#include "messages/MDiscoverReply.h" - -#include "messages/MDirUpdate.h" - -#include "messages/MInodeFileCaps.h" - -#include "messages/MLock.h" -#include "messages/MDentryUnlink.h" - -#include "messages/MClientRequest.h" -#include "messages/MClientFileCaps.h" - -#include "messages/MMDSSlaveRequest.h" - -#include -#include - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".locker " - - - -void Locker::dispatch(Message *m) -{ - - switch (m->get_type()) { - - // locking - case MSG_MDS_LOCK: - handle_lock((MLock*)m); - break; - - // cache fun - case MSG_MDS_INODEFILECAPS: - handle_inode_file_caps((MInodeFileCaps*)m); - break; - - case MSG_CLIENT_FILECAPS: - handle_client_file_caps((MClientFileCaps*)m); - break; - - - - default: - assert(0); - } -} - - -void Locker::send_lock_message(SimpleLock *lock, int msg) -{ - for (map::iterator it = lock->get_parent()->replicas_begin(); - it != lock->get_parent()->replicas_end(); - it++) { - if (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN) - continue; - MLock *m = new MLock(lock, msg, mds->get_nodeid()); - mds->send_message_mds(m, it->first); - } -} - -void Locker::send_lock_message(SimpleLock *lock, int msg, const bufferlist &data) -{ - for (map::iterator it = lock->get_parent()->replicas_begin(); - it != lock->get_parent()->replicas_end(); - it++) { - if (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN) - continue; - MLock *m = new MLock(lock, msg, mds->get_nodeid()); - m->set_data(data); - mds->send_message_mds(m, it->first); - } 
-} - - - - - - - - - - - -bool Locker::acquire_locks(MDRequest *mdr, - set &rdlocks, - set &wrlocks, - set &xlocks) -{ - if (mdr->done_locking) { - dout(10) << "acquire_locks " << *mdr << " -- done locking" << dendl; - return true; // at least we had better be! - } - dout(10) << "acquire_locks " << *mdr << dendl; - - set sorted; // sort everything we will lock - set mustpin = xlocks; // items to authpin - - // xlocks - for (set::iterator p = xlocks.begin(); p != xlocks.end(); ++p) { - dout(20) << " must xlock " << **p << " " << *(*p)->get_parent() << dendl; - sorted.insert(*p); - - // augment xlock with a versionlock? - if ((*p)->get_type() > LOCK_OTYPE_IVERSION) { - // inode version lock? - CInode *in = (CInode*)(*p)->get_parent(); - if (mdr->is_master()) { - // master. wrlock versionlock so we can pipeline inode updates to journal. - wrlocks.insert(&in->versionlock); - } else { - // slave. exclusively lock the inode version (i.e. block other journal updates) - xlocks.insert(&in->versionlock); - sorted.insert(&in->versionlock); - } - } - } - - // wrlocks - for (set::iterator p = wrlocks.begin(); p != wrlocks.end(); ++p) { - dout(20) << " must wrlock " << **p << " " << *(*p)->get_parent() << dendl; - sorted.insert(*p); - if ((*p)->get_parent()->is_auth()) - mustpin.insert(*p); - else if ((*p)->get_type() == LOCK_OTYPE_IDIR && - !(*p)->get_parent()->is_auth() && !((ScatterLock*)(*p))->can_wrlock()) { // we might have to request a scatter - dout(15) << " will also auth_pin " << *(*p)->get_parent() << " in case we need to request a scatter" << dendl; - mustpin.insert(*p); - } - } - - // rdlocks - for (set::iterator p = rdlocks.begin(); - p != rdlocks.end(); - ++p) { - dout(20) << " must rdlock " << **p << " " << *(*p)->get_parent() << dendl; - sorted.insert(*p); - } - - - // AUTH PINS - map > mustpin_remote; // mds -> (object set) - - // can i auth pin them all now? 
- for (set::iterator p = mustpin.begin(); - p != mustpin.end(); - ++p) { - MDSCacheObject *object = (*p)->get_parent(); - - dout(10) << " must authpin " << *object << dendl; - - if (mdr->is_auth_pinned(object)) - continue; - - if (!object->is_auth()) { - if (object->is_ambiguous_auth()) { - // wait - dout(10) << " ambiguous auth, waiting to authpin " << *object << dendl; - object->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr)); - mds->locker->drop_locks(mdr); - mdr->drop_local_auth_pins(); - return false; - } - mustpin_remote[object->authority().first].insert(object); - continue; - } - if (!object->can_auth_pin()) { - // wait - dout(10) << " can't auth_pin (freezing?), waiting to authpin " << *object << dendl; - object->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); - mds->locker->drop_locks(mdr); - mdr->drop_local_auth_pins(); - return false; - } - } - - // ok, grab local auth pins - for (set::iterator p = mustpin.begin(); - p != mustpin.end(); - ++p) { - MDSCacheObject *object = (*p)->get_parent(); - if (mdr->is_auth_pinned(object)) { - dout(10) << " already auth_pinned " << *object << dendl; - } else if (object->is_auth()) { - dout(10) << " auth_pinning " << *object << dendl; - mdr->auth_pin(object); - } - } - - // request remote auth_pins - if (!mustpin_remote.empty()) { - for (map >::iterator p = mustpin_remote.begin(); - p != mustpin_remote.end(); - ++p) { - dout(10) << "requesting remote auth_pins from mds" << p->first << dendl; - - MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_AUTHPIN); - for (set::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - dout(10) << " req remote auth_pin of " << **q << dendl; - MDSCacheObjectInfo info; - (*q)->set_object_info(info); - req->get_authpins().push_back(info); - } - mds->send_message_mds(req, p->first); - - // put in waiting list - assert(mdr->more()->waiting_on_slave.count(p->first) == 0); - 
mdr->more()->waiting_on_slave.insert(p->first); - } - return false; - } - - // acquire locks. - // make sure they match currently acquired locks. - set::iterator existing = mdr->locks.begin(); - for (set::iterator p = sorted.begin(); - p != sorted.end(); - ++p) { - - // already locked? - if (existing != mdr->locks.end() && *existing == *p) { - // right kind? - SimpleLock *have = *existing; - existing++; - if (xlocks.count(*p) && mdr->xlocks.count(*p)) { - dout(10) << " already xlocked " << *have << " " << *have->get_parent() << dendl; - } - else if (wrlocks.count(*p) && mdr->wrlocks.count(*p)) { - dout(10) << " already wrlocked " << *have << " " << *have->get_parent() << dendl; - } - else if (rdlocks.count(*p) && mdr->rdlocks.count(*p)) { - dout(10) << " already rdlocked " << *have << " " << *have->get_parent() << dendl; - } - else assert(0); - continue; - } - - // hose any stray locks - while (existing != mdr->locks.end()) { - SimpleLock *stray = *existing; - existing++; - dout(10) << " unlocking out-of-order " << *stray << " " << *stray->get_parent() << dendl; - if (mdr->xlocks.count(stray)) - xlock_finish(stray, mdr); - else if (mdr->wrlocks.count(stray)) - wrlock_finish(stray, mdr); - else - rdlock_finish(stray, mdr); - } - - // lock - if (xlocks.count(*p)) { - if (!xlock_start(*p, mdr)) - return false; - dout(10) << " got xlock on " << **p << " " << *(*p)->get_parent() << dendl; - } else if (wrlocks.count(*p)) { - if (!wrlock_start(*p, mdr)) - return false; - dout(10) << " got wrlock on " << **p << " " << *(*p)->get_parent() << dendl; - } else { - if (!rdlock_start(*p, mdr)) - return false; - dout(10) << " got rdlock on " << **p << " " << *(*p)->get_parent() << dendl; - } - } - - // any extra unneeded locks? 
- while (existing != mdr->locks.end()) { - SimpleLock *stray = *existing; - existing++; - dout(10) << " unlocking extra " << *stray << " " << *stray->get_parent() << dendl; - if (mdr->xlocks.count(stray)) - xlock_finish(stray, mdr); - else if (mdr->wrlocks.count(stray)) - wrlock_finish(stray, mdr); - else - rdlock_finish(stray, mdr); - } - - return true; -} - - -void Locker::drop_locks(MDRequest *mdr) -{ - // leftover locks - while (!mdr->xlocks.empty()) - xlock_finish(*mdr->xlocks.begin(), mdr); - while (!mdr->rdlocks.empty()) - rdlock_finish(*mdr->rdlocks.begin(), mdr); - while (!mdr->wrlocks.empty()) - wrlock_finish(*mdr->wrlocks.begin(), mdr); -} - - -// generics - -bool Locker::rdlock_start(SimpleLock *lock, MDRequest *mdr) -{ - switch (lock->get_type()) { - case LOCK_OTYPE_IFILE: - return file_rdlock_start((FileLock*)lock, mdr); - case LOCK_OTYPE_IDIRFRAGTREE: - case LOCK_OTYPE_IDIR: - return scatter_rdlock_start((ScatterLock*)lock, mdr); - default: - return simple_rdlock_start(lock, mdr); - } -} - -void Locker::rdlock_finish(SimpleLock *lock, MDRequest *mdr) -{ - switch (lock->get_type()) { - case LOCK_OTYPE_IFILE: - return file_rdlock_finish((FileLock*)lock, mdr); - case LOCK_OTYPE_IDIRFRAGTREE: - case LOCK_OTYPE_IDIR: - return scatter_rdlock_finish((ScatterLock*)lock, mdr); - default: - return simple_rdlock_finish(lock, mdr); - } -} - -bool Locker::wrlock_start(SimpleLock *lock, MDRequest *mdr) -{ - switch (lock->get_type()) { - case LOCK_OTYPE_IDIRFRAGTREE: - case LOCK_OTYPE_IDIR: - return scatter_wrlock_start((ScatterLock*)lock, mdr); - case LOCK_OTYPE_IVERSION: - return local_wrlock_start((LocalLock*)lock, mdr); - default: - assert(0); - return false; - } -} - -void Locker::wrlock_finish(SimpleLock *lock, MDRequest *mdr) -{ - switch (lock->get_type()) { - case LOCK_OTYPE_IDIRFRAGTREE: - case LOCK_OTYPE_IDIR: - return scatter_wrlock_finish((ScatterLock*)lock, mdr); - case LOCK_OTYPE_IVERSION: - return local_wrlock_finish((LocalLock*)lock, mdr); - 
default: - assert(0); - } -} - -bool Locker::xlock_start(SimpleLock *lock, MDRequest *mdr) -{ - switch (lock->get_type()) { - case LOCK_OTYPE_IFILE: - return file_xlock_start((FileLock*)lock, mdr); - case LOCK_OTYPE_IVERSION: - return local_xlock_start((LocalLock*)lock, mdr); - case LOCK_OTYPE_IDIRFRAGTREE: - case LOCK_OTYPE_IDIR: - assert(0); - default: - return simple_xlock_start(lock, mdr); - } -} - -void Locker::xlock_finish(SimpleLock *lock, MDRequest *mdr) -{ - switch (lock->get_type()) { - case LOCK_OTYPE_IFILE: - return file_xlock_finish((FileLock*)lock, mdr); - case LOCK_OTYPE_IVERSION: - return local_xlock_finish((LocalLock*)lock, mdr); - case LOCK_OTYPE_IDIRFRAGTREE: - case LOCK_OTYPE_IDIR: - assert(0); - default: - return simple_xlock_finish(lock, mdr); - } -} - - - -/** rejoin_set_state - * @lock the lock - * @s the new state - * @waiters list for anybody waiting on this lock - */ -void Locker::rejoin_set_state(SimpleLock *lock, int s, list& waiters) -{ - if (!lock->is_stable()) { - lock->set_state(s); - lock->get_parent()->auth_unpin(); - } else { - lock->set_state(s); - } - lock->take_waiting(SimpleLock::WAIT_ALL, waiters); -} - - - - -// file i/o ----------------------------------------- - -version_t Locker::issue_file_data_version(CInode *in) -{ - dout(7) << "issue_file_data_version on " << *in << dendl; - return in->inode.file_data_version; -} - - -Capability* Locker::issue_new_caps(CInode *in, - int mode, - MClientRequest *req) -{ - dout(7) << "issue_new_caps for mode " << mode << " on " << *in << dendl; - - // my needs - int my_client = req->get_client(); - int my_want = 0; - if (mode & FILE_MODE_R) my_want |= CAP_FILE_RDCACHE | CAP_FILE_RD; - if (mode & FILE_MODE_W) my_want |= CAP_FILE_WRBUFFER | CAP_FILE_WR; - - // register a capability - Capability *cap = in->get_client_cap(my_client); - if (!cap) { - // new cap - Capability c(my_want); - in->add_client_cap(my_client, c); - cap = in->get_client_cap(my_client); - - // suppress file cap 
messages for new cap (we'll bundle with the open() reply) - cap->set_suppress(true); - } else { - // make sure it has sufficient caps - if (my_want & ~cap->wanted()) { - // augment wanted caps for this client - cap->set_wanted( cap->wanted() | my_want ); - } - } - - int before = cap->pending(); - - if (in->is_auth()) { - // [auth] twiddle mode? - if (in->filelock.is_stable()) - file_eval(&in->filelock); - } else { - // [replica] tell auth about any new caps wanted - request_inode_file_caps(in); - } - - // issue caps (pot. incl new one) - issue_caps(in); // note: _eval above may have done this already... - - // re-issue whatever we can - cap->issue(cap->pending()); - - // ok, stop suppressing. - cap->set_suppress(false); - - int now = cap->pending(); - if (before != now && - (before & CAP_FILE_WR) == 0 && - (now & CAP_FILE_WR)) { - // FIXME FIXME FIXME - } - - // twiddle file_data_version? - if ((before & CAP_FILE_WRBUFFER) == 0 && - (now & CAP_FILE_WRBUFFER)) { - in->inode.file_data_version++; - dout(7) << " incrementing file_data_version, now " << in->inode.file_data_version << " for " << *in << dendl; - } - - return cap; -} - - - -bool Locker::issue_caps(CInode *in) -{ - // allowed caps are determined by the lock mode. - int allowed = in->filelock.caps_allowed(); - dout(7) << "issue_caps filelock allows=" << cap_string(allowed) - << " on " << *in << dendl; - - // count conflicts with - int nissued = 0; - - // client caps - for (map::iterator it = in->client_caps.begin(); - it != in->client_caps.end(); - it++) { - if (it->second.pending() != (it->second.wanted() & allowed)) { - // issue - nissued++; - - int before = it->second.pending(); - long seq = it->second.issue(it->second.wanted() & allowed); - int after = it->second.pending(); - - // twiddle file_data_version? 
- if (!(before & CAP_FILE_WRBUFFER) && - (after & CAP_FILE_WRBUFFER)) { - dout(7) << " incrementing file_data_version for " << *in << dendl; - in->inode.file_data_version++; - } - - if (seq > 0 && - !it->second.is_suppress()) { - dout(7) << " sending MClientFileCaps to client" << it->first << " seq " << it->second.get_last_seq() << " new pending " << cap_string(it->second.pending()) << " was " << cap_string(before) << dendl; - mds->send_message_client_maybe_opening(new MClientFileCaps(MClientFileCaps::OP_GRANT, - in->inode, - it->second.get_last_seq(), - it->second.pending(), - it->second.wanted()), - it->first); - } - } - } - - return (nissued == 0); // true if no re-issued, no callbacks -} - - -class C_MDL_RequestInodeFileCaps : public Context { - Locker *locker; - CInode *in; -public: - C_MDL_RequestInodeFileCaps(Locker *l, CInode *i) : locker(l), in(i) {} - void finish(int r) { - in->put(CInode::PIN_PTRWAITER); - if (!in->is_auth()) - locker->request_inode_file_caps(in); - } -}; - -void Locker::request_inode_file_caps(CInode *in) -{ - int wanted = in->get_caps_wanted(); - if (wanted != in->replica_caps_wanted) { - - if (wanted == 0) { - if (in->replica_caps_wanted_keep_until > g_clock.recent_now()) { - // ok, release them finally! 
- in->replica_caps_wanted_keep_until.sec_ref() = 0; - dout(7) << "request_inode_file_caps " << cap_string(wanted) - << " was " << cap_string(in->replica_caps_wanted) - << " no keeping anymore " - << " on " << *in - << dendl; - } - else if (in->replica_caps_wanted_keep_until.sec() == 0) { - in->replica_caps_wanted_keep_until = g_clock.recent_now(); - in->replica_caps_wanted_keep_until.sec_ref() += 2; - - dout(7) << "request_inode_file_caps " << cap_string(wanted) - << " was " << cap_string(in->replica_caps_wanted) - << " keeping until " << in->replica_caps_wanted_keep_until - << " on " << *in - << dendl; - return; - } else { - // wait longer - return; - } - } else { - in->replica_caps_wanted_keep_until.sec_ref() = 0; - } - assert(!in->is_auth()); - - // wait for single auth - if (in->is_ambiguous_auth()) { - in->get(CInode::PIN_PTRWAITER); - in->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, - new C_MDL_RequestInodeFileCaps(this, in)); - return; - } - - int auth = in->authority().first; - dout(7) << "request_inode_file_caps " << cap_string(wanted) - << " was " << cap_string(in->replica_caps_wanted) - << " on " << *in << " to mds" << auth << dendl; - assert(!in->is_auth()); - - in->replica_caps_wanted = wanted; - - if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) - mds->send_message_mds(new MInodeFileCaps(in->ino(), mds->get_nodeid(), - in->replica_caps_wanted), - auth); - } else { - in->replica_caps_wanted_keep_until.sec_ref() = 0; - } -} - -void Locker::handle_inode_file_caps(MInodeFileCaps *m) -{ - // nobody should be talking to us during recovery. 
- assert(mds->is_rejoin() || mds->is_active() || mds->is_stopping()); - - // ok - CInode *in = mdcache->get_inode(m->get_ino()); - assert(in); - assert(in->is_auth()); - - if (mds->is_rejoin() && - in->is_rejoining()) { - dout(7) << "handle_inode_file_caps still rejoining " << *in << ", dropping " << *m << dendl; - delete m; - return; - } - - - dout(7) << "handle_inode_file_caps replica mds" << m->get_from() << " wants caps " << cap_string(m->get_caps()) << " on " << *in << dendl; - - if (m->get_caps()) - in->mds_caps_wanted[m->get_from()] = m->get_caps(); - else - in->mds_caps_wanted.erase(m->get_from()); - - if (in->filelock.is_stable()) - try_file_eval(&in->filelock); // ** may or may not be auth_pinned ** - delete m; -} - - -/* - * note: we only get these from the client if - * - we are calling back previously issued caps (fewer than the client previously had) - * - or if the client releases (any of) its caps on its own - */ -void Locker::handle_client_file_caps(MClientFileCaps *m) -{ - int client = m->get_source().num(); - CInode *in = mdcache->get_inode(m->get_ino()); - Capability *cap = 0; - if (in) - cap = in->get_client_cap(client); - - if (!in || !cap) { - if (!in) { - dout(7) << "handle_client_file_caps on unknown ino " << m->get_ino() << ", dropping" << dendl; - } else { - dout(7) << "handle_client_file_caps no cap for client" << client << " on " << *in << dendl; - } - delete m; - return; - } - - assert(cap); - - // filter wanted based on what we could ever give out (given auth/replica status) - int wanted = m->get_wanted() & in->filelock.caps_allowed_ever(); - - dout(7) << "handle_client_file_caps seq " << m->get_seq() - << " confirms caps " << cap_string(m->get_caps()) - << " wants " << cap_string(wanted) - << " from client" << client - << " on " << *in - << dendl; - - // update wanted - if (cap->wanted() != wanted) { - if (m->get_seq() < cap->get_last_seq()) { - /* this is awkward. - client may be trying to release caps (i.e. inode closed, etc.) 
by setting reducing wanted - set. - but it may also be opening the same filename, not sure that it'll map to the same inode. - so, we don't want wanted reductions to clobber mds's notion of wanted unless we're - sure the client has seen all the latest caps. - */ - dout(10) << "handle_client_file_caps ignoring wanted " << cap_string(m->get_wanted()) - << " bc seq " << m->get_seq() << " < " << cap->get_last_seq() << dendl; - } else { - cap->set_wanted(wanted); - } - } - - // confirm caps - int had = cap->confirm_receipt(m->get_seq(), m->get_caps()); - int has = cap->confirmed(); - if (cap->is_null()) { - dout(7) << " cap for client" << client << " is now null, removing from " << *in << dendl; - in->remove_client_cap(client); - if (!in->is_any_caps()) - in->xlist_open_file.remove_myself(); // unpin logsegment - if (!in->is_auth()) - request_inode_file_caps(in); - - // tell client. - MClientFileCaps *r = new MClientFileCaps(MClientFileCaps::OP_RELEASE, - in->inode, - 0, 0, 0); - mds->send_message_client_maybe_open(r, m->get_source_inst()); - } - - // merge in atime? 
- if (m->get_inode().atime > in->inode.atime) { - dout(7) << " taking atime " << m->get_inode().atime << " > " - << in->inode.atime << " for " << *in << dendl; - in->inode.atime = m->get_inode().atime; - } - - if ((has|had) & CAP_FILE_WR) { - bool dirty = false; - - // mtime - if (m->get_inode().mtime > in->inode.mtime) { - dout(7) << " taking mtime " << m->get_inode().mtime << " > " - << in->inode.mtime << " for " << *in << dendl; - in->inode.mtime = m->get_inode().mtime; - dirty = true; - } - // size - if (m->get_inode().size > in->inode.size) { - dout(7) << " taking size " << m->get_inode().size << " > " - << in->inode.size << " for " << *in << dendl; - in->inode.size = m->get_inode().size; - dirty = true; - } - - if (dirty) - mds->mdlog->submit_entry(new EString("cap inode update dirty fixme")); - } - - // reevaluate, waiters - if (!in->filelock.is_stable()) - file_eval_gather(&in->filelock); - else if (in->is_auth()) - file_eval(&in->filelock); - - //in->finish_waiting(CInode::WAIT_CAPS, 0); // note: any users for this? - - delete m; -} - - - - - - - - - - -// locks ---------------------------------------------------------------- - -SimpleLock *Locker::get_lock(int lock_type, MDSCacheObjectInfo &info) -{ - switch (lock_type) { - case LOCK_OTYPE_DN: - { - // be careful; info.dirfrag may have incorrect frag; recalculate based on dname. 
- CInode *diri = mdcache->get_inode(info.dirfrag.ino); - frag_t fg; - CDir *dir = 0; - CDentry *dn = 0; - if (diri) { - fg = diri->pick_dirfrag(info.dname); - dir = diri->get_dirfrag(fg); - if (dir) - dn = dir->lookup(info.dname); - } - if (!dn) { - dout(7) << "get_lock don't have dn " << info.dirfrag.ino << " " << info.dname << dendl; - return 0; - } - return &dn->lock; - } - - case LOCK_OTYPE_IAUTH: - case LOCK_OTYPE_ILINK: - case LOCK_OTYPE_IDIRFRAGTREE: - case LOCK_OTYPE_IFILE: - case LOCK_OTYPE_IDIR: - { - CInode *in = mdcache->get_inode(info.ino); - if (!in) { - dout(7) << "get_lock don't have ino " << info.ino << dendl; - return 0; - } - switch (lock_type) { - case LOCK_OTYPE_IAUTH: return &in->authlock; - case LOCK_OTYPE_ILINK: return &in->linklock; - case LOCK_OTYPE_IDIRFRAGTREE: return &in->dirfragtreelock; - case LOCK_OTYPE_IFILE: return &in->filelock; - case LOCK_OTYPE_IDIR: return &in->dirlock; - } - } - - default: - dout(7) << "get_lock don't know lock_type " << lock_type << dendl; - assert(0); - break; - } - - return 0; -} - - -void Locker::handle_lock(MLock *m) -{ - // nobody should be talking to us during recovery. 
- assert(mds->is_rejoin() || mds->is_active() || mds->is_stopping()); - - SimpleLock *lock = get_lock(m->get_lock_type(), m->get_object_info()); - if (!lock) { - dout(10) << "don't have object " << m->get_object_info() << ", must have trimmed, dropping" << dendl; - delete m; - return; - } - - switch (lock->get_type()) { - case LOCK_OTYPE_DN: - case LOCK_OTYPE_IAUTH: - case LOCK_OTYPE_ILINK: - handle_simple_lock(lock, m); - break; - - case LOCK_OTYPE_IFILE: - handle_file_lock((FileLock*)lock, m); - break; - - case LOCK_OTYPE_IDIRFRAGTREE: - case LOCK_OTYPE_IDIR: - handle_scatter_lock((ScatterLock*)lock, m); - break; - - default: - dout(7) << "handle_lock got otype " << m->get_lock_type() << dendl; - assert(0); - break; - } -} - - - - - -// ========================================================================== -// simple lock - -void Locker::handle_simple_lock(SimpleLock *lock, MLock *m) -{ - int from = m->get_asker(); - - if (mds->is_rejoin()) { - if (lock->get_parent()->is_rejoining()) { - dout(7) << "handle_simple_lock still rejoining " << *lock->get_parent() - << ", dropping " << *m << dendl; - delete m; - return; - } - } - - switch (m->get_action()) { - // -- replica -- - case LOCK_AC_SYNC: - assert(lock->get_state() == LOCK_LOCK); - lock->decode_locked_state(m->get_data()); - lock->set_state(LOCK_SYNC); - lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); - - // special case: trim replica no-longer-null dentry? - if (lock->get_type() == LOCK_OTYPE_DN) { - CDentry *dn = (CDentry*)lock->get_parent(); - if (dn->is_null() && m->get_data().length() > 0) { - dout(10) << "handle_simple_lock replica dentry null -> non-null, must trim " - << *dn << dendl; - assert(dn->get_num_ref() == 0); - map expiremap; - mdcache->trim_dentry(dn, expiremap); - mdcache->send_expire_messages(expiremap); - } - } - break; - - case LOCK_AC_LOCK: - assert(lock->get_state() == LOCK_SYNC); - //|| lock->get_state() == LOCK_GLOCKR); - - // wait for readers to finish? 
- if (lock->is_rdlocked()) { - dout(7) << "handle_simple_lock has reader, waiting before ack on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->set_state(LOCK_GLOCKR); - } else { - // update lock and reply - lock->set_state(LOCK_LOCK); - mds->send_message_mds(new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid()), from); - } - break; - - - // -- auth -- - case LOCK_AC_LOCKACK: - assert(lock->get_state() == LOCK_GLOCKR); - assert(lock->is_gathering(from)); - lock->remove_gather(from); - - if (lock->is_gathering()) { - dout(7) << "handle_simple_lock " << *lock << " on " << *lock->get_parent() << " from " << from - << ", still gathering " << lock->get_gather_set() << dendl; - } else { - dout(7) << "handle_simple_lock " << *lock << " on " << *lock->get_parent() << " from " << from - << ", last one" << dendl; - simple_eval_gather(lock); - } - break; - - } - - delete m; -} - -/* unused, currently. - -class C_Locker_SimpleEval : public Context { - Locker *locker; - SimpleLock *lock; -public: - C_Locker_SimpleEval(Locker *l, SimpleLock *lk) : locker(l), lock(lk) {} - void finish(int r) { - locker->try_simple_eval(lock); - } -}; - -void Locker::try_simple_eval(SimpleLock *lock) -{ - // unstable and ambiguous auth? 
- if (!lock->is_stable() && - lock->get_parent()->is_ambiguous_auth()) { - dout(7) << "simple_eval not stable and ambiguous auth, waiting on " << *lock->get_parent() << dendl; - //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_SimpleEval(this, lock)); - return; - } - - if (!lock->get_parent()->is_auth()) { - dout(7) << "try_simple_eval not auth for " << *lock->get_parent() << dendl; - return; - } - - if (!lock->get_parent()->can_auth_pin()) { - dout(7) << "try_simple_eval can't auth_pin, waiting on " << *lock->get_parent() << dendl; - //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - lock->get_parent()->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_Locker_SimpleEval(this, lock)); - return; - } - - if (lock->is_stable()) - simple_eval(lock); -} -*/ - -void Locker::simple_eval_gather(SimpleLock *lock) -{ - dout(10) << "simple_eval_gather " << *lock << " on " << *lock->get_parent() << dendl; - - // finished gathering? - if (lock->get_state() == LOCK_GLOCKR && - !lock->is_gathering() && - !lock->is_rdlocked()) { - dout(7) << "simple_eval finished gather on " << *lock << " on " << *lock->get_parent() << dendl; - - // replica: tell auth - if (!lock->get_parent()->is_auth()) { - int auth = lock->get_parent()->authority().first; - if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) - mds->send_message_mds(new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid()), - lock->get_parent()->authority().first); - } - - lock->set_state(LOCK_LOCK); - lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR); - - if (lock->get_parent()->is_auth()) { - lock->get_parent()->auth_unpin(); - - // re-eval? 
- simple_eval(lock); - } - } -} - -void Locker::simple_eval(SimpleLock *lock) -{ - dout(10) << "simple_eval " << *lock << " on " << *lock->get_parent() << dendl; - - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - // stable -> sync? - if (!lock->is_xlocked() && - lock->get_state() != LOCK_SYNC && - !lock->is_waiter_for(SimpleLock::WAIT_WR)) { - dout(7) << "simple_eval stable, syncing " << *lock - << " on " << *lock->get_parent() << dendl; - simple_sync(lock); - } - -} - - -// mid - -void Locker::simple_sync(SimpleLock *lock) -{ - dout(7) << "simple_sync on " << *lock << " on " << *lock->get_parent() << dendl; - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - // check state - if (lock->get_state() == LOCK_SYNC) - return; // already sync - assert(lock->get_state() == LOCK_LOCK); - - // sync. - if (lock->get_parent()->is_replicated()) { - // hard data - bufferlist data; - lock->encode_locked_state(data); - - // bcast to replicas - send_lock_message(lock, LOCK_AC_SYNC, data); - } - - // change lock - lock->set_state(LOCK_SYNC); - - // waiters? - lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); -} - -void Locker::simple_lock(SimpleLock *lock) -{ - dout(7) << "simple_lock on " << *lock << " on " << *lock->get_parent() << dendl; - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - // check state - if (lock->get_state() == LOCK_LOCK) return; - assert(lock->get_state() == LOCK_SYNC); - - if (lock->get_parent()->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_LOCK); - - // change lock - lock->set_state(LOCK_GLOCKR); - lock->init_gather(); - lock->get_parent()->auth_pin(); - } else { - lock->set_state(LOCK_LOCK); - } -} - - -// top - -bool Locker::simple_rdlock_try(SimpleLock *lock, Context *con) -{ - dout(7) << "simple_rdlock_try on " << *lock << " on " << *lock->get_parent() << dendl; - - // can read? grab ref. 
- if (lock->can_rdlock(0)) - return true; - - assert(!lock->get_parent()->is_auth()); - - // wait! - dout(7) << "simple_rdlock_try waiting on " << *lock << " on " << *lock->get_parent() << dendl; - if (con) lock->add_waiter(SimpleLock::WAIT_RD, con); - return false; -} - -bool Locker::simple_rdlock_start(SimpleLock *lock, MDRequest *mdr) -{ - dout(7) << "simple_rdlock_start on " << *lock << " on " << *lock->get_parent() << dendl; - - // can read? grab ref. - if (lock->can_rdlock(mdr)) { - lock->get_rdlock(); - mdr->rdlocks.insert(lock); - mdr->locks.insert(lock); - return true; - } - - // wait! - dout(7) << "simple_rdlock_start waiting on " << *lock << " on " << *lock->get_parent() << dendl; - lock->add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); - return false; -} - -void Locker::simple_rdlock_finish(SimpleLock *lock, MDRequest *mdr) -{ - // drop ref - lock->put_rdlock(); - if (mdr) { - mdr->rdlocks.erase(lock); - mdr->locks.erase(lock); - } - - dout(7) << "simple_rdlock_finish on " << *lock << " on " << *lock->get_parent() << dendl; - - // last one? - if (!lock->is_rdlocked()) - simple_eval_gather(lock); -} - -bool Locker::simple_xlock_start(SimpleLock *lock, MDRequest *mdr) -{ - dout(7) << "simple_xlock_start on " << *lock << " on " << *lock->get_parent() << dendl; - - // xlock by me? - if (lock->is_xlocked() && - lock->get_xlocked_by() == mdr) - return true; - - // auth? - if (lock->get_parent()->is_auth()) { - // auth - - // lock. - if (lock->get_state() == LOCK_SYNC) - simple_lock(lock); - - // already locked? - if (lock->get_state() == LOCK_LOCK) { - if (lock->is_xlocked()) { - // by someone else. - lock->add_waiter(SimpleLock::WAIT_WR, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - - // xlock. 
- lock->get_xlock(mdr); - mdr->xlocks.insert(lock); - mdr->locks.insert(lock); - return true; - } else { - // wait for lock - lock->add_waiter(SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - } else { - // replica - // this had better not be a remote xlock attempt! - assert(!mdr->slave_request); - - // wait for single auth - if (lock->get_parent()->is_ambiguous_auth()) { - lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, - new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - - // send lock request - int auth = lock->get_parent()->authority().first; - mdr->more()->slaves.insert(auth); - MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_XLOCK); - r->set_lock_type(lock->get_type()); - lock->get_parent()->set_object_info(r->get_object_info()); - mds->send_message_mds(r, auth); - - // wait - lock->add_waiter(SimpleLock::WAIT_REMOTEXLOCK, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } -} - - -void Locker::simple_xlock_finish(SimpleLock *lock, MDRequest *mdr) -{ - dout(7) << "simple_xlock_finish on " << *lock << " on " << *lock->get_parent() << dendl; - - // drop ref - assert(lock->can_xlock(mdr)); - lock->put_xlock(); - assert(mdr); - mdr->xlocks.erase(lock); - mdr->locks.erase(lock); - - // remote xlock? - if (!lock->get_parent()->is_auth()) { - // tell auth - dout(7) << "simple_xlock_finish releasing remote xlock on " << *lock->get_parent() << dendl; - int auth = lock->get_parent()->authority().first; - if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) { - MMDSSlaveRequest *slavereq = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_UNXLOCK); - slavereq->set_lock_type(lock->get_type()); - lock->get_parent()->set_object_info(slavereq->get_object_info()); - mds->send_message_mds(slavereq, auth); - } - } - - // others waiting? - lock->finish_waiters(SimpleLock::WAIT_WR, 0); - - // eval? 
- if (lock->get_parent()->is_auth()) - simple_eval(lock); -} - - - -// dentry specific helpers - -/** dentry_can_rdlock_trace - * see if we can _anonymously_ rdlock an entire trace. - * if not, and req is specified, wait and retry that message. - */ -bool Locker::dentry_can_rdlock_trace(vector& trace) -{ - // verify dentries are rdlockable. - // we do this because - // - we're being less aggressive about locks acquisition, and - // - we're not acquiring the locks in order! - for (vector::iterator it = trace.begin(); - it != trace.end(); - it++) { - CDentry *dn = *it; - if (!dn->lock.can_rdlock(0)) { - dout(10) << "can_rdlock_trace can't rdlock " << *dn << dendl; - return false; - } - } - return true; -} - -void Locker::dentry_anon_rdlock_trace_start(vector& trace) -{ - // grab dentry rdlocks - for (vector::iterator it = trace.begin(); - it != trace.end(); - it++) { - dout(10) << "dentry_anon_rdlock_trace_start rdlocking " << (*it)->lock << " " << **it << dendl; - (*it)->lock.get_rdlock(); - } -} - - -void Locker::dentry_anon_rdlock_trace_finish(vector& trace) -{ - for (vector::iterator it = trace.begin(); - it != trace.end(); - it++) - simple_rdlock_finish(&(*it)->lock, 0); -} - - - -// ========================================================================== -// scatter lock - -bool Locker::scatter_rdlock_start(ScatterLock *lock, MDRequest *mdr) -{ - dout(7) << "scatter_rdlock_start on " << *lock - << " on " << *lock->get_parent() << dendl; - - // read on stable scattered replica? - if (lock->get_state() == LOCK_SCATTER && - !lock->get_parent()->is_auth()) { - dout(7) << "scatter_rdlock_start scatterlock read on a stable scattered replica, fw to auth" << dendl; - mdcache->request_forward(mdr, lock->get_parent()->authority().first); - return false; - } - - // pre-twiddle? - if (lock->get_state() == LOCK_SCATTER && - lock->get_parent()->is_auth() && - !lock->get_parent()->is_replicated() && - !lock->is_wrlocked()) - scatter_sync(lock); - - // can rdlock? 
- if (lock->can_rdlock(mdr)) { - lock->get_rdlock(); - mdr->rdlocks.insert(lock); - mdr->locks.insert(lock); - return true; - } - - // wait for read. - lock->add_waiter(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); - - // initiate sync or tempsync? - if (lock->is_stable() && - lock->get_parent()->is_auth()) { - if (lock->get_parent()->is_replicated()) - scatter_tempsync(lock); - else - scatter_sync(lock); - } - - return false; -} - -void Locker::scatter_rdlock_finish(ScatterLock *lock, MDRequest *mdr) -{ - dout(7) << "scatter_rdlock_finish on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->put_rdlock(); - if (mdr) { - mdr->rdlocks.erase(lock); - mdr->locks.erase(lock); - } - - scatter_eval_gather(lock); -} - - -bool Locker::scatter_wrlock_start(ScatterLock *lock, MDRequest *mdr) -{ - dout(7) << "scatter_wrlock_start on " << *lock - << " on " << *lock->get_parent() << dendl; - - // pre-twiddle? - if (lock->get_parent()->is_auth() && - !lock->get_parent()->is_replicated() && - !lock->is_rdlocked() && - !lock->is_xlocked() && - lock->get_state() == LOCK_SYNC) - lock->set_state(LOCK_SCATTER); - //scatter_scatter(lock); - - // can wrlock? - if (lock->can_wrlock()) { - lock->get_wrlock(); - mdr->wrlocks.insert(lock); - mdr->locks.insert(lock); - return true; - } - - // wait for write. - lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE, - new C_MDS_RetryRequest(mdcache, mdr)); - - // initiate scatter or lock? - if (lock->is_stable()) { - if (lock->get_parent()->is_auth()) { - // auth. scatter or lock? - if (((CInode*)lock->get_parent())->has_subtree_root_dirfrag()) - scatter_scatter(lock); - else - scatter_lock(lock); - } else { - // replica. - // auth should be auth_pinned (see acquire_locks wrlock weird mustpin case). 
- int auth = lock->get_parent()->authority().first; - dout(10) << "requesting scatter from auth on " - << *lock << " on " << *lock->get_parent() << dendl; - mds->send_message_mds(new MLock(lock, LOCK_AC_REQSCATTER, mds->get_nodeid()), auth); - } - } - - return false; -} - -void Locker::scatter_wrlock_finish(ScatterLock *lock, MDRequest *mdr) -{ - dout(7) << "scatter_wrlock_finish on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->put_wrlock(); - if (mdr) { - mdr->wrlocks.erase(lock); - mdr->locks.erase(lock); - } - - scatter_eval_gather(lock); -} - - -class C_Locker_ScatterEval : public Context { - Locker *locker; - ScatterLock *lock; -public: - C_Locker_ScatterEval(Locker *l, ScatterLock *lk) : locker(l), lock(lk) {} - void finish(int r) { - lock->get_parent()->put(CInode::PIN_PTRWAITER); - locker->try_scatter_eval(lock); - } -}; - - -void Locker::try_scatter_eval(ScatterLock *lock) -{ - // unstable and ambiguous auth? - if (!lock->is_stable() && - lock->get_parent()->is_ambiguous_auth()) { - dout(7) << "try_scatter_eval not stable and ambiguous auth, waiting on " << *lock->get_parent() << dendl; - //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - lock->get_parent()->get(CInode::PIN_PTRWAITER); - lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_ScatterEval(this, lock)); - return; - } - - if (!lock->get_parent()->is_auth()) { - dout(7) << "try_scatter_eval not auth for " << *lock->get_parent() << dendl; - return; - } - - if (!lock->get_parent()->can_auth_pin()) { - dout(7) << "try_scatter_eval can't auth_pin, waiting on " << *lock->get_parent() << dendl; - //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - lock->get_parent()->get(CInode::PIN_PTRWAITER); - lock->get_parent()->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_Locker_ScatterEval(this, lock)); - return; - } - - if (lock->is_stable()) - scatter_eval(lock); -} - - -void Locker::scatter_eval_gather(ScatterLock *lock) -{ 
- dout(10) << "scatter_eval_gather " << *lock << " on " << *lock->get_parent() << dendl; - - if (!lock->get_parent()->is_auth()) { - // REPLICA - - if (lock->get_state() == LOCK_GLOCKC && - !lock->is_wrlocked()) { - dout(10) << "scatter_eval no wrlocks, acking lock" << dendl; - int auth = lock->get_parent()->authority().first; - if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) { - bufferlist data; - lock->encode_locked_state(data); - mds->send_message_mds(new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid(), data), auth); - } - lock->set_state(LOCK_LOCK); - } - - } else { - // AUTH - - // glocks|glockt -> lock? - if ((lock->get_state() == LOCK_GLOCKS || - lock->get_state() == LOCK_GLOCKT) && - !lock->is_gathering() && - !lock->is_rdlocked()) { - dout(7) << "scatter_eval finished lock gather/un-rdlock on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->set_state(LOCK_LOCK); - lock->finish_waiters(ScatterLock::WAIT_XLOCK|ScatterLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - } - - // glockc -> lock? - else if (lock->get_state() == LOCK_GLOCKC && - !lock->is_gathering() && - !lock->is_wrlocked()) { - if (lock->is_updated()) { - scatter_writebehind(lock); - } else { - dout(7) << "scatter_eval finished lock gather/un-wrlock on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->set_state(LOCK_LOCK); - lock->finish_waiters(ScatterLock::WAIT_XLOCK|ScatterLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - } - } - - // gSyncL -> sync? 
- else if (lock->get_state() == LOCK_GSYNCL && - !lock->is_wrlocked()) { - dout(7) << "scatter_eval finished sync un-wrlock on " << *lock - << " on " << *lock->get_parent() << dendl; - if (lock->get_parent()->is_replicated()) { - // encode and bcast - bufferlist data; - lock->encode_locked_state(data); - send_lock_message(lock, LOCK_AC_SYNC, data); - } - lock->set_state(LOCK_SYNC); - lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - } - - // gscattert|gscatters -> scatter? - else if ((lock->get_state() == LOCK_GSCATTERT || - lock->get_state() == LOCK_GSCATTERS) && - !lock->is_gathering() && - !lock->is_rdlocked()) { - dout(7) << "scatter_eval finished scatter un-rdlock(/gather) on " << *lock - << " on " << *lock->get_parent() << dendl; - if (lock->get_parent()->is_replicated()) { - // encode and bcast - bufferlist data; - lock->encode_locked_state(data); - send_lock_message(lock, LOCK_AC_SCATTER, data); - } - lock->set_state(LOCK_SCATTER); - lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - } - - // gTempsyncC|gTempsyncL -> tempsync - else if ((lock->get_state() == LOCK_GTEMPSYNCC || - lock->get_state() == LOCK_GTEMPSYNCL) && - !lock->is_gathering() && - !lock->is_wrlocked()) { - if (lock->is_updated()) { - scatter_writebehind(lock); - } else { - dout(7) << "scatter_eval finished tempsync gather/un-wrlock on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->set_state(LOCK_TEMPSYNC); - lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - } - } - - - // re-eval? - if (lock->is_stable()) // && lock->get_parent()->can_auth_pin()) - scatter_eval(lock); - } -} - -void Locker::scatter_writebehind(ScatterLock *lock) -{ - CInode *in = (CInode*)lock->get_parent(); - dout(10) << "scatter_writebehind " << in->inode.mtime << " on " << *lock << " on " << *in << dendl; - - // journal write-behind. 
- inode_t *pi = in->project_inode(); - pi->mtime = in->inode.mtime; // make sure an intermediate version isn't goofing us up - pi->version = in->pre_dirty(); - - EUpdate *le = new EUpdate(mds->mdlog, "scatter writebehind"); - le->metablob.add_dir_context(in->get_parent_dn()->get_dir()); - le->metablob.add_primary_dentry(in->get_parent_dn(), true, 0, pi); - - mds->mdlog->submit_entry(le); - mds->mdlog->wait_for_sync(new C_Locker_ScatterWB(this, lock, mds->mdlog->get_current_segment())); -} - -void Locker::scatter_writebehind_finish(ScatterLock *lock, LogSegment *ls) -{ - CInode *in = (CInode*)lock->get_parent(); - dout(10) << "scatter_writebehind_finish on " << *lock << " on " << *in << dendl; - in->pop_and_dirty_projected_inode(ls); - lock->clear_updated(); - scatter_eval_gather(lock); -} - -void Locker::scatter_eval(ScatterLock *lock) -{ - dout(10) << "scatter_eval " << *lock << " on " << *lock->get_parent() << dendl; - - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - CInode *in = (CInode*)lock->get_parent(); - if (in->has_subtree_root_dirfrag() && !in->is_base()) { - // i _should_ be scattered. - if (!lock->is_rdlocked() && - !lock->is_xlocked() && - lock->get_state() != LOCK_SCATTER) { - dout(10) << "scatter_eval no rdlocks|xlocks, am subtree root inode, scattering" << dendl; - scatter_scatter(lock); - autoscattered.push_back(&lock->xlistitem_autoscattered); - } - } else { - // i _should_ be sync. 
- lock->xlistitem_autoscattered.remove_myself(); - if (!lock->is_wrlocked() && - !lock->is_xlocked() && - lock->get_state() != LOCK_SYNC) { - dout(10) << "scatter_eval no wrlocks|xlocks, not subtree root inode, syncing" << dendl; - scatter_sync(lock); - } - } -} - -void Locker::note_autoscattered(ScatterLock *lock) -{ - dout(10) << "note_autoscattered " << *lock << " on " << *lock->get_parent() << dendl; - autoscattered.push_back(&lock->xlistitem_autoscattered); -} - - -/* - * this is called by LogSegment::try_to_trim() when trying to - * flush dirty scattered data (e.g. inode->dirlock mtime) back - * to the auth node. - */ -void Locker::scatter_try_unscatter(ScatterLock *lock, Context *c) -{ - dout(10) << "scatter_try_unscatter " << *lock << " on " << *lock->get_parent() << dendl; - assert(!lock->get_parent()->is_auth()); - assert(!lock->get_parent()->is_ambiguous_auth()); - - // request unscatter? - int auth = lock->get_parent()->authority().first; - if (lock->get_state() == LOCK_SCATTER && - mds->mdsmap->get_state(auth) >= MDSMap::STATE_ACTIVE) - mds->send_message_mds(new MLock(lock, LOCK_AC_REQUNSCATTER, mds->get_nodeid()), auth); - - // wait... - lock->add_waiter(SimpleLock::WAIT_STABLE, c); -} - - -void Locker::scatter_sync(ScatterLock *lock) -{ - dout(10) << "scatter_sync " << *lock - << " on " << *lock->get_parent() << dendl; - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - switch (lock->get_state()) { - case LOCK_SYNC: - return; // already sync. - - case LOCK_TEMPSYNC: - break; // just do it. - - case LOCK_LOCK: - if (lock->is_wrlocked() || lock->is_xlocked()) { - lock->set_state(LOCK_GSYNCL); - lock->get_parent()->auth_pin(); - return; - } - break; // do it. - - case LOCK_SCATTER: - // lock first. this is the slow way, incidentally. 
- if (lock->get_parent()->is_replicated()) { - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - } else { - if (!lock->is_wrlocked()) { - break; // do it now, we're fine - } - } - lock->set_state(LOCK_GLOCKC); - lock->get_parent()->auth_pin(); - return; - - default: - assert(0); - } - - // do sync - if (lock->get_parent()->is_replicated()) { - // encode and bcast - bufferlist data; - lock->encode_locked_state(data); - send_lock_message(lock, LOCK_AC_SYNC, data); - } - - lock->set_state(LOCK_SYNC); - lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); -} - -void Locker::scatter_scatter(ScatterLock *lock) -{ - dout(10) << "scatter_scatter " << *lock - << " on " << *lock->get_parent() << dendl; - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - lock->set_last_scatter(g_clock.now()); - - switch (lock->get_state()) { - case LOCK_SYNC: - if (!lock->is_rdlocked() && - !lock->get_parent()->is_replicated()) - break; // do it - if (lock->get_parent()->is_replicated()) { - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - } - lock->set_state(LOCK_GSCATTERS); - lock->get_parent()->auth_pin(); - return; - - case LOCK_LOCK: - if (lock->is_xlocked()) - return; // sorry - break; // do it. - - case LOCK_SCATTER: - return; // did it. 
- - case LOCK_TEMPSYNC: - if (lock->is_rdlocked()) { - lock->set_state(LOCK_GSCATTERT); - lock->get_parent()->auth_pin(); - return; - } - break; // do it - - default: - assert(0); - } - - // do scatter - if (lock->get_parent()->is_replicated()) { - // encode and bcast - bufferlist data; - lock->encode_locked_state(data); - send_lock_message(lock, LOCK_AC_SCATTER, data); - } - lock->set_state(LOCK_SCATTER); - lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); -} - -void Locker::scatter_lock(ScatterLock *lock) -{ - dout(10) << "scatter_lock " << *lock - << " on " << *lock->get_parent() << dendl; - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - switch (lock->get_state()) { - case LOCK_SYNC: - if (!lock->is_rdlocked() && - !lock->get_parent()->is_replicated()) - break; // do it. - - if (lock->get_parent()->is_replicated()) { - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - } - lock->set_state(LOCK_GLOCKS); - lock->get_parent()->auth_pin(); - return; - - case LOCK_LOCK: - return; // done. - - case LOCK_SCATTER: - if (!lock->is_wrlocked() && - !lock->get_parent()->is_replicated()) { - break; // do it. - } - - if (lock->get_parent()->is_replicated()) { - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - } - lock->set_state(LOCK_GLOCKC); - lock->get_parent()->auth_pin(); - return; - - case LOCK_TEMPSYNC: - if (lock->is_rdlocked()) { - lock->set_state(LOCK_GLOCKT); - lock->get_parent()->auth_pin(); - return; - } - break; // do it. - } - - // do lock - lock->set_state(LOCK_LOCK); - lock->finish_waiters(ScatterLock::WAIT_XLOCK|ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); -} - -void Locker::scatter_tempsync(ScatterLock *lock) -{ - dout(10) << "scatter_tempsync " << *lock - << " on " << *lock->get_parent() << dendl; - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - switch (lock->get_state()) { - case LOCK_SYNC: - break; // do it. 
- - case LOCK_LOCK: - if (lock->is_wrlocked() || - lock->is_xlocked()) { - lock->set_state(LOCK_GTEMPSYNCL); - lock->get_parent()->auth_pin(); - return; - } - break; // do it. - - case LOCK_SCATTER: - if (!lock->is_wrlocked() && - !lock->get_parent()->is_replicated()) { - break; // do it. - } - - if (lock->get_parent()->is_replicated()) { - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - } - lock->set_state(LOCK_GTEMPSYNCC); - lock->get_parent()->auth_pin(); - return; - - case LOCK_TEMPSYNC: - return; // done - } - - // do tempsync - lock->set_state(LOCK_TEMPSYNC); - lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); -} - - - - -void Locker::handle_scatter_lock(ScatterLock *lock, MLock *m) -{ - int from = m->get_asker(); - dout(10) << "handle_scatter_lock " << *m << " on " << *lock << " on " << *lock->get_parent() << dendl; - - if (mds->is_rejoin()) { - if (lock->get_parent()->is_rejoining()) { - dout(7) << "handle_scatter_lock still rejoining " << *lock->get_parent() - << ", dropping " << *m << dendl; - delete m; - return; - } - } - - switch (m->get_action()) { - // -- replica -- - case LOCK_AC_SYNC: - assert(lock->get_state() == LOCK_LOCK); - lock->set_state(LOCK_SYNC); - lock->decode_locked_state(m->get_data()); - lock->clear_updated(); - lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); - break; - - case LOCK_AC_LOCK: - assert(lock->get_state() == LOCK_SCATTER || - lock->get_state() == LOCK_SYNC); - - // wait for wrlocks to close? 
- if (lock->is_wrlocked()) { - assert(lock->get_state() == LOCK_SCATTER); - dout(7) << "handle_scatter_lock has wrlocks, waiting on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->set_state(LOCK_GLOCKC); - } else if (lock->is_rdlocked()) { - assert(lock->get_state() == LOCK_SYNC); - dout(7) << "handle_scatter_lock has rdlocks, waiting on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->set_state(LOCK_GLOCKS); - } else { - dout(7) << "handle_scatter_lock has no rd|wrlocks, sending lockack for " << *lock - << " on " << *lock->get_parent() << dendl; - - // encode and reply - bufferlist data; - lock->encode_locked_state(data); - mds->send_message_mds(new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid(), data), from); - lock->set_state(LOCK_LOCK); - } - break; - - case LOCK_AC_SCATTER: - assert(lock->get_state() == LOCK_LOCK); - lock->decode_locked_state(m->get_data()); - lock->clear_updated(); - lock->set_state(LOCK_SCATTER); - lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); - break; - - // -- for auth -- - case LOCK_AC_LOCKACK: - assert(lock->get_state() == LOCK_GLOCKS || - lock->get_state() == LOCK_GLOCKC || - lock->get_state() == LOCK_GSCATTERS || - lock->get_state() == LOCK_GTEMPSYNCC); - assert(lock->is_gathering(from)); - lock->remove_gather(from); - lock->decode_locked_state(m->get_data()); - - if (lock->is_gathering()) { - dout(7) << "handle_scatter_lock " << *lock << " on " << *lock->get_parent() - << " from " << from << ", still gathering " << lock->get_gather_set() - << dendl; - } else { - dout(7) << "handle_scatter_lock " << *lock << " on " << *lock->get_parent() - << " from " << from << ", last one" - << dendl; - scatter_eval_gather(lock); - } - break; - - case LOCK_AC_REQSCATTER: - if (lock->is_stable()) { - /* NOTE: we can do this _even_ if !can_auth_pin (i.e. 
freezing) - * because the replica should be holding an auth_pin if they're - * doing this (and thus, we are freezing, not frozen, and indefinite - * starvation isn't an issue). - */ - dout(7) << "handle_scatter_lock got scatter request on " << *lock - << " on " << *lock->get_parent() << dendl; - scatter_scatter(lock); - } else { - dout(7) << "handle_scatter_lock ignoring scatter request on " << *lock - << " on " << *lock->get_parent() << dendl; - } - break; - - case LOCK_AC_REQUNSCATTER: - if (!lock->is_stable()) { - dout(7) << "handle_scatter_lock ignoring now-unnecessary unscatter request on " << *lock - << " on " << *lock->get_parent() << dendl; - } else if (lock->get_parent()->can_auth_pin()) { - dout(7) << "handle_scatter_lock got unscatter request on " << *lock - << " on " << *lock->get_parent() << dendl; - scatter_lock(lock); - } else { - dout(7) << "handle_scatter_lock DROPPING unscatter request on " << *lock - << " on " << *lock->get_parent() << dendl; - /* FIXME: if we can't auth_pin here, this request is effectively lost... */ - } - } - - delete m; -} - - - -void Locker::scatter_unscatter_autoscattered() -{ - /* - * periodically unscatter autoscattered locks - */ - - dout(10) << "scatter_unscatter_autoscattered" << dendl; - - utime_t now = g_clock.now(); - int n = autoscattered.size(); - while (!autoscattered.empty()) { - ScatterLock *lock = autoscattered.front(); - - // stop? - if (lock->get_state() == LOCK_SCATTER && - now - lock->get_last_scatter() < 10.0) - break; - - autoscattered.pop_front(); - - if (lock->get_state() == LOCK_SCATTER && - lock->get_parent()->is_replicated()) { - if (((CInode*)lock->get_parent())->is_frozen() || - ((CInode*)lock->get_parent())->is_freezing()) { - // hrm.. requeue. 
- dout(10) << "last_scatter " << lock->get_last_scatter() - << ", now " << now << ", but frozen|freezing, requeueing" << dendl; - autoscattered.push_back(&lock->xlistitem_autoscattered); - } else { - dout(10) << "last_scatter " << lock->get_last_scatter() - << ", now " << now << ", locking" << dendl; - scatter_lock(lock); - } - } - if (--n == 0) break; - } -} - - - -// ========================================================================== -// local lock - - -bool Locker::local_wrlock_start(LocalLock *lock, MDRequest *mdr) -{ - dout(7) << "local_wrlock_start on " << *lock - << " on " << *lock->get_parent() << dendl; - - if (lock->can_wrlock()) { - lock->get_wrlock(); - mdr->wrlocks.insert(lock); - mdr->locks.insert(lock); - return true; - } else { - lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } -} - -void Locker::local_wrlock_finish(LocalLock *lock, MDRequest *mdr) -{ - dout(7) << "local_wrlock_finish on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->put_wrlock(); - mdr->wrlocks.erase(lock); - mdr->locks.erase(lock); -} - -bool Locker::local_xlock_start(LocalLock *lock, MDRequest *mdr) -{ - dout(7) << "local_xlock_start on " << *lock - << " on " << *lock->get_parent() << dendl; - - if (lock->is_xlocked_by_other(mdr)) { - lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - - lock->get_xlock(mdr); - mdr->xlocks.insert(lock); - mdr->locks.insert(lock); - return true; -} - -void Locker::local_xlock_finish(LocalLock *lock, MDRequest *mdr) -{ - dout(7) << "local_xlock_finish on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->put_xlock(); - mdr->xlocks.erase(lock); - mdr->locks.erase(lock); - - lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR); -} - - - -// ========================================================================== -// file lock - - -bool 
Locker::file_rdlock_start(FileLock *lock, MDRequest *mdr) -{ - dout(7) << "file_rdlock_start " << *lock << " on " << *lock->get_parent() << dendl; - - // can read? grab ref. - if (lock->can_rdlock(mdr)) { - lock->get_rdlock(); - mdr->rdlocks.insert(lock); - mdr->locks.insert(lock); - return true; - } - - // can't read, and replicated. - if (lock->can_rdlock_soon()) { - // wait - dout(7) << "file_rdlock_start can_rdlock_soon " << *lock << " on " << *lock->get_parent() << dendl; - } else { - if (lock->get_parent()->is_auth()) { - // auth - - // FIXME or qsync? - - if (lock->is_stable()) { - file_lock(lock); // lock, bc easiest to back off ... FIXME - - if (lock->can_rdlock(mdr)) { - lock->get_rdlock(); - mdr->rdlocks.insert(lock); - mdr->locks.insert(lock); - - lock->finish_waiters(SimpleLock::WAIT_STABLE); - return true; - } - } else { - dout(7) << "file_rdlock_start waiting until stable on " << *lock << " on " << *lock->get_parent() << dendl; - lock->add_waiter(SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - } else { - // replica - if (lock->is_stable()) { - - // fw to auth - CInode *in = (CInode*)lock->get_parent(); - int auth = in->authority().first; - dout(7) << "file_rdlock_start " << *lock << " on " << *lock->get_parent() << " on replica and async, fw to auth " << auth << dendl; - assert(auth != mds->get_nodeid()); - mdcache->request_forward(mdr, auth); - return false; - - } else { - // wait until stable - dout(7) << "inode_file_rdlock_start waiting until stable on " << *lock << " on " << *lock->get_parent() << dendl; - lock->add_waiter(SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - } - } - - // wait - dout(7) << "file_rdlock_start waiting on " << *lock << " on " << *lock->get_parent() << dendl; - lock->add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); - - return false; -} - - - -void Locker::file_rdlock_finish(FileLock *lock, MDRequest *mdr) -{ - dout(7) << 
"rdlock_finish on " << *lock << " on " << *lock->get_parent() << dendl; - - // drop ref - lock->put_rdlock(); - mdr->rdlocks.erase(lock); - mdr->locks.erase(lock); - - if (!lock->is_rdlocked()) - file_eval_gather(lock); -} - - -bool Locker::file_xlock_start(FileLock *lock, MDRequest *mdr) -{ - dout(7) << "file_xlock_start on " << *lock << " on " << *lock->get_parent() << dendl; - - assert(lock->get_parent()->is_auth()); // remote file xlock not implemented - - // already xlocked by me? - if (lock->get_xlocked_by() == mdr) - return true; - - // can't write? - if (!lock->can_xlock(mdr)) { - - // auth - if (!lock->can_xlock_soon()) { - if (!lock->is_stable()) { - dout(7) << "file_xlock_start on auth, waiting for stable on " << *lock << " on " << *lock->get_parent() << dendl; - lock->add_waiter(SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - - // initiate lock - file_lock(lock); - - // fall-thru to below. - } - } - - // check again - if (lock->can_xlock(mdr)) { - assert(lock->get_parent()->is_auth()); - lock->get_xlock(mdr); - mdr->locks.insert(lock); - mdr->xlocks.insert(lock); - return true; - } else { - dout(7) << "file_xlock_start on auth, waiting for write on " << *lock << " on " << *lock->get_parent() << dendl; - lock->add_waiter(SimpleLock::WAIT_WR, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } -} - - -void Locker::file_xlock_finish(FileLock *lock, MDRequest *mdr) -{ - dout(7) << "file_xlock_finish on " << *lock << " on " << *lock->get_parent() << dendl; - - // drop ref - assert(lock->can_xlock(mdr)); - lock->put_xlock(); - mdr->locks.erase(lock); - mdr->xlocks.erase(lock); - - assert(lock->get_parent()->is_auth()); // or implement remote xlocks - - // others waiting? - lock->finish_waiters(SimpleLock::WAIT_WR, 0); - - if (lock->get_parent()->is_auth()) - file_eval(lock); -} - - -/* - * ... 
- * - * also called after client caps are acked to us - * - checks if we're in unstable sfot state and can now move on to next state - * - checks if soft state should change (eg bc last writer closed) - */ -class C_Locker_FileEval : public Context { - Locker *locker; - FileLock *lock; -public: - C_Locker_FileEval(Locker *l, FileLock *lk) : locker(l), lock(lk) {} - void finish(int r) { - lock->get_parent()->put(CInode::PIN_PTRWAITER); - locker->try_file_eval(lock); - } -}; - -void Locker::try_file_eval(FileLock *lock) -{ - CInode *in = (CInode*)lock->get_parent(); - - // unstable and ambiguous auth? - if (!lock->is_stable() && - in->is_ambiguous_auth()) { - dout(7) << "try_file_eval not stable and ambiguous auth, waiting on " << *in << dendl; - //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - in->get(CInode::PIN_PTRWAITER); - in->add_waiter(CInode::WAIT_SINGLEAUTH, new C_Locker_FileEval(this, lock)); - return; - } - - if (!lock->get_parent()->is_auth()) { - dout(7) << "try_file_eval not auth for " << *lock->get_parent() << dendl; - return; - } - - if (!lock->get_parent()->can_auth_pin()) { - dout(7) << "try_file_eval can't auth_pin, waiting on " << *in << dendl; - //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - in->get(CInode::PIN_PTRWAITER); - in->add_waiter(CInode::WAIT_UNFREEZE, new C_Locker_FileEval(this, lock)); - return; - } - - if (lock->is_stable()) - file_eval(lock); -} - - - -void Locker::file_eval_gather(FileLock *lock) -{ - CInode *in = (CInode*)lock->get_parent(); - int issued = in->get_caps_issued(); - - dout(7) << "file_eval_gather issued " << cap_string(issued) - << " vs " << cap_string(lock->caps_allowed()) - << " on " << *lock << " on " << *lock->get_parent() - << dendl; - - if (lock->is_stable()) - return; // nothing for us to do here! - - // [auth] finished gather? 
- if (in->is_auth() && - !lock->is_gathering() && - ((issued & ~lock->caps_allowed()) == 0)) { - dout(7) << "file_eval_gather finished gather" << dendl; - - switch (lock->get_state()) { - // to lock - case LOCK_GLOCKR: - case LOCK_GLOCKM: - case LOCK_GLOCKL: - lock->set_state(LOCK_LOCK); - - // waiters - lock->get_rdlock(); - lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR|SimpleLock::WAIT_RD); - lock->put_rdlock(); - lock->get_parent()->auth_unpin(); - break; - - // to mixed - case LOCK_GMIXEDR: - lock->set_state(LOCK_MIXED); - lock->finish_waiters(SimpleLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - break; - - case LOCK_GMIXEDL: - lock->set_state(LOCK_MIXED); - - if (in->is_replicated()) { - // data - bufferlist softdata; - lock->encode_locked_state(softdata); - - // bcast to replicas - send_lock_message(lock, LOCK_AC_MIXED, softdata); - } - - lock->finish_waiters(SimpleLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - break; - - // to loner - case LOCK_GLONERR: - lock->set_state(LOCK_LONER); - lock->finish_waiters(SimpleLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - break; - - case LOCK_GLONERM: - lock->set_state(LOCK_LONER); - lock->finish_waiters(SimpleLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - break; - - // to sync - case LOCK_GSYNCL: - case LOCK_GSYNCM: - lock->set_state(LOCK_SYNC); - - { // bcast data to replicas - bufferlist softdata; - lock->encode_locked_state(softdata); - - send_lock_message(lock, LOCK_AC_SYNC, softdata); - } - - // waiters - lock->get_rdlock(); - lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); - lock->put_rdlock(); - lock->get_parent()->auth_unpin(); - break; - - default: - assert(0); - } - - issue_caps(in); - - // stable re-eval? - if (lock->is_stable()) //&& lock->get_parent()->can_auth_pin()) - file_eval(lock); - } - - // [replica] finished caps gather? 
- if (!in->is_auth() && - ((issued & ~lock->caps_allowed()) == 0)) { - switch (lock->get_state()) { - case LOCK_GMIXEDR: - { - lock->set_state(LOCK_MIXED); - - // ack - MLock *reply = new MLock(lock, LOCK_AC_MIXEDACK, mds->get_nodeid()); - mds->send_message_mds(reply, in->authority().first); - } - break; - - case LOCK_GLOCKR: - { - lock->set_state(LOCK_LOCK); - - // ack - MLock *reply = new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid()); - mds->send_message_mds(reply, in->authority().first); - } - break; - - default: - assert(0); - } - } - - -} - -void Locker::file_eval(FileLock *lock) -{ - CInode *in = (CInode*)lock->get_parent(); - int wanted = in->get_caps_wanted(); - bool loner = (in->client_caps.size() == 1) && in->mds_caps_wanted.empty(); - dout(7) << "file_eval wanted=" << cap_string(wanted) - << " filelock=" << *lock << " on " << *lock->get_parent() - << " loner=" << loner - << dendl; - - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - // not xlocked! - if (lock->is_xlocked()) return; - - // * -> loner? - if (!lock->is_rdlocked() && - !lock->is_waiter_for(SimpleLock::WAIT_WR) && - (wanted & CAP_FILE_WR) && - loner && - lock->get_state() != LOCK_LONER) { - dout(7) << "file_eval stable, bump to loner " << *lock << " on " << *lock->get_parent() << dendl; - file_loner(lock); - } - - // * -> mixed? - else if (!lock->is_rdlocked() && - !lock->is_waiter_for(SimpleLock::WAIT_WR) && - (wanted & CAP_FILE_RD) && - (wanted & CAP_FILE_WR) && - !(loner && lock->get_state() == LOCK_LONER) && - lock->get_state() != LOCK_MIXED) { - dout(7) << "file_eval stable, bump to mixed " << *lock << " on " << *lock->get_parent() << dendl; - file_mixed(lock); - } - - // * -> sync? 
- else if (!in->filelock.is_waiter_for(SimpleLock::WAIT_WR) && - !(wanted & (CAP_FILE_WR|CAP_FILE_WRBUFFER)) && - ((wanted & CAP_FILE_RD) || - in->is_replicated() || - (!loner && lock->get_state() == LOCK_LONER)) && - lock->get_state() != LOCK_SYNC) { - dout(7) << "file_eval stable, bump to sync " << *lock << " on " << *lock->get_parent() << dendl; - file_sync(lock); - } - - // * -> lock? (if not replicated or open) - else if (!in->is_replicated() && - wanted == 0 && - lock->get_state() != LOCK_LOCK) { - file_lock(lock); - } -} - - -// mid - -bool Locker::file_sync(FileLock *lock) -{ - CInode *in = (CInode*)lock->get_parent(); - dout(7) << "file_sync " << *lock << " on " << *lock->get_parent() << dendl; - - assert(in->is_auth()); - assert(lock->is_stable()); - - int issued = in->get_caps_issued(); - - assert((in->get_caps_wanted() & CAP_FILE_WR) == 0); - - if (lock->get_state() == LOCK_LOCK) { - if (in->is_replicated()) { - bufferlist softdata; - lock->encode_locked_state(softdata); - send_lock_message(lock, LOCK_AC_SYNC, softdata); - } - - // change lock - lock->set_state(LOCK_SYNC); - - issue_caps(in); // reissue caps - return true; - } - - else if (lock->get_state() == LOCK_MIXED) { - // writers? - if (issued & CAP_FILE_WR) { - // gather client write caps - lock->set_state(LOCK_GSYNCM); - lock->get_parent()->auth_pin(); - issue_caps(in); - } else { - // no writers, go straight to sync - if (in->is_replicated()) { - bufferlist softdata; - lock->encode_locked_state(softdata); - send_lock_message(lock, LOCK_AC_SYNC, softdata); - } - - // change lock - lock->set_state(LOCK_SYNC); - } - return false; - } - - else if (lock->get_state() == LOCK_LONER) { - // writers? 
- if (issued & CAP_FILE_WR) { - // gather client write caps - lock->set_state(LOCK_GSYNCL); - lock->get_parent()->auth_pin(); - issue_caps(in); - } else { - // no writers, go straight to sync - if (in->is_replicated()) { - bufferlist softdata; - lock->encode_locked_state(softdata); - send_lock_message(lock, LOCK_AC_SYNC, softdata); - } - - // change lock - lock->set_state(LOCK_SYNC); - } - return false; - } - else - assert(0); // wtf. - - return false; -} - - - -void Locker::file_lock(FileLock *lock) -{ - CInode *in = (CInode*)lock->get_parent(); - dout(7) << "inode_file_lock " << *lock << " on " << *lock->get_parent() << dendl; - - assert(in->is_auth()); - assert(lock->is_stable()); - - int issued = in->get_caps_issued(); - - if (lock->get_state() == LOCK_SYNC) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - - // change lock - lock->set_state(LOCK_GLOCKR); - lock->get_parent()->auth_pin(); - - // call back caps - if (issued) - issue_caps(in); - } else { - if (issued) { - // call back caps - lock->set_state(LOCK_GLOCKR); - lock->get_parent()->auth_pin(); - issue_caps(in); - } else { - lock->set_state(LOCK_LOCK); - } - } - } - - else if (lock->get_state() == LOCK_MIXED) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - - // change lock - lock->set_state(LOCK_GLOCKM); - lock->get_parent()->auth_pin(); - - // call back caps - issue_caps(in); - } else { - //assert(issued); // ??? 
-sage 2/19/06 - if (issued) { - // change lock - lock->set_state(LOCK_GLOCKM); - lock->get_parent()->auth_pin(); - - // call back caps - issue_caps(in); - } else { - lock->set_state(LOCK_LOCK); - } - } - - } - else if (lock->get_state() == LOCK_LONER) { - if (issued & CAP_FILE_WR) { - // change lock - lock->set_state(LOCK_GLOCKL); - lock->get_parent()->auth_pin(); - - // call back caps - issue_caps(in); - } else { - lock->set_state(LOCK_LOCK); - } - } - else - assert(0); // wtf. -} - - -void Locker::file_mixed(FileLock *lock) -{ - dout(7) << "file_mixed " << *lock << " on " << *lock->get_parent() << dendl; - - CInode *in = (CInode*)lock->get_parent(); - assert(in->is_auth()); - assert(lock->is_stable()); - - int issued = in->get_caps_issued(); - - if (lock->get_state() == LOCK_SYNC) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_MIXED); - lock->init_gather(); - - lock->set_state(LOCK_GMIXEDR); - lock->get_parent()->auth_pin(); - - issue_caps(in); - } else { - if (issued) { - lock->set_state(LOCK_GMIXEDR); - lock->get_parent()->auth_pin(); - issue_caps(in); - } else { - lock->set_state(LOCK_MIXED); - } - } - } - - else if (lock->get_state() == LOCK_LOCK) { - if (in->is_replicated()) { - // data - bufferlist softdata; - lock->encode_locked_state(softdata); - - // bcast to replicas - send_lock_message(lock, LOCK_AC_MIXED, softdata); - } - - // change lock - lock->set_state(LOCK_MIXED); - issue_caps(in); - } - - else if (lock->get_state() == LOCK_LONER) { - if (issued & CAP_FILE_WRBUFFER) { - // gather up WRBUFFER caps - lock->set_state(LOCK_GMIXEDL); - lock->get_parent()->auth_pin(); - issue_caps(in); - } - else if (in->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_MIXED); - lock->set_state(LOCK_MIXED); - issue_caps(in); - } else { - lock->set_state(LOCK_MIXED); - issue_caps(in); - } - } - - else - assert(0); // wtf. 
-} - - -void Locker::file_loner(FileLock *lock) -{ - CInode *in = (CInode*)lock->get_parent(); - dout(7) << "inode_file_loner " << *lock << " on " << *lock->get_parent() << dendl; - - assert(in->is_auth()); - assert(lock->is_stable()); - - assert((in->client_caps.size() == 1) && in->mds_caps_wanted.empty()); - - if (lock->get_state() == LOCK_SYNC) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - - // change lock - lock->set_state(LOCK_GLONERR); - lock->get_parent()->auth_pin(); - } else { - // only one guy with file open, who gets it all, so - lock->set_state(LOCK_LONER); - issue_caps(in); - } - } - - else if (lock->get_state() == LOCK_LOCK) { - // change lock. ignore replicas; they don't know about LONER. - lock->set_state(LOCK_LONER); - issue_caps(in); - } - - else if (lock->get_state() == LOCK_MIXED) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - - // change lock - lock->set_state(LOCK_GLONERM); - lock->get_parent()->auth_pin(); - } else { - lock->set_state(LOCK_LONER); - issue_caps(in); - } - } - - else - assert(0); -} - - - -// messenger - -void Locker::handle_file_lock(FileLock *lock, MLock *m) -{ - CInode *in = (CInode*)lock->get_parent(); - int from = m->get_asker(); - - if (mds->is_rejoin()) { - if (in->is_rejoining()) { - dout(7) << "handle_file_lock still rejoining " << *in - << ", dropping " << *m << dendl; - delete m; - return; - } - } - - - dout(7) << "handle_file_lock a=" << m->get_action() << " from " << from << " " - << *in << " filelock=" << *lock << dendl; - - int issued = in->get_caps_issued(); - - switch (m->get_action()) { - // -- replica -- - case LOCK_AC_SYNC: - assert(lock->get_state() == LOCK_LOCK || - lock->get_state() == LOCK_MIXED); - - lock->decode_locked_state(m->get_data()); - lock->set_state(LOCK_SYNC); - - // no need to reply. 
- - // waiters - lock->get_rdlock(); - lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); - lock->put_rdlock(); - file_eval_gather(lock); - break; - - case LOCK_AC_LOCK: - assert(lock->get_state() == LOCK_SYNC || - lock->get_state() == LOCK_MIXED); - - lock->set_state(LOCK_GLOCKR); - - // call back caps? - if (issued & CAP_FILE_RD) { - dout(7) << "handle_file_lock client readers, gathering caps on " << *in << dendl; - issue_caps(in); - break; - } - else if (lock->is_rdlocked()) { - dout(7) << "handle_file_lock rdlocked, waiting before ack on " << *in << dendl; - break; - } - - // nothing to wait for, lock and ack. - { - lock->set_state(LOCK_LOCK); - - MLock *reply = new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid()); - mds->send_message_mds(reply, from); - } - break; - - case LOCK_AC_MIXED: - assert(lock->get_state() == LOCK_SYNC || - lock->get_state() == LOCK_LOCK); - - if (lock->get_state() == LOCK_SYNC) { - // MIXED - if (issued & CAP_FILE_RD) { - // call back client caps - lock->set_state(LOCK_GMIXEDR); - issue_caps(in); - break; - } else { - // no clients, go straight to mixed - lock->set_state(LOCK_MIXED); - - // ack - MLock *reply = new MLock(lock, LOCK_AC_MIXEDACK, mds->get_nodeid()); - mds->send_message_mds(reply, from); - } - } else { - // LOCK - lock->set_state(LOCK_MIXED); - - // no ack needed. 
- } - - issue_caps(in); - - // waiters - lock->finish_waiters(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE); - file_eval_gather(lock); - break; - - - - - // -- auth -- - case LOCK_AC_LOCKACK: - assert(lock->get_state() == LOCK_GLOCKR || - lock->get_state() == LOCK_GLOCKM || - lock->get_state() == LOCK_GLONERM || - lock->get_state() == LOCK_GLONERR); - assert(lock->is_gathering(from)); - lock->remove_gather(from); - - if (lock->is_gathering()) { - dout(7) << "handle_lock_inode_file " << *in << " from " << from - << ", still gathering " << lock->get_gather_set() << dendl; - } else { - dout(7) << "handle_lock_inode_file " << *in << " from " << from - << ", last one" << dendl; - file_eval_gather(lock); - } - break; - - case LOCK_AC_SYNCACK: - assert(lock->get_state() == LOCK_GSYNCM); - assert(lock->is_gathering(from)); - lock->remove_gather(from); - - /* not used currently - { - // merge data (keep largest size, mtime, etc.) - int off = 0; - in->decode_merge_file_state(m->get_data(), off); - } - */ - - if (lock->is_gathering()) { - dout(7) << "handle_lock_inode_file " << *in << " from " << from - << ", still gathering " << lock->get_gather_set() << dendl; - } else { - dout(7) << "handle_lock_inode_file " << *in << " from " << from - << ", last one" << dendl; - file_eval_gather(lock); - } - break; - - case LOCK_AC_MIXEDACK: - assert(lock->get_state() == LOCK_GMIXEDR); - assert(lock->is_gathering(from)); - lock->remove_gather(from); - - if (lock->is_gathering()) { - dout(7) << "handle_lock_inode_file " << *in << " from " << from - << ", still gathering " << lock->get_gather_set() << dendl; - } else { - dout(7) << "handle_lock_inode_file " << *in << " from " << from - << ", last one" << dendl; - file_eval_gather(lock); - } - break; - - - default: - assert(0); - } - - delete m; -} - - - - - - diff --git a/branches/sage/ebofs2/mds/Locker.h b/branches/sage/ebofs2/mds/Locker.h deleted file mode 100644 index a69055f49449e..0000000000000 --- a/branches/sage/ebofs2/mds/Locker.h 
+++ /dev/null @@ -1,195 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_LOCKER_H -#define __MDS_LOCKER_H - -#include "include/types.h" - -#include -#include -#include -using std::map; -using std::list; -using std::set; - -class MDS; -class CDir; -class CInode; -class CDentry; - -class Message; - -class MDiscover; -class MDiscoverReply; -class MCacheExpire; -class MDirUpdate; -class MDentryUnlink; -class MLock; - -class MClientRequest; - -class Anchor; -class Capability; -class LogSegment; - -class SimpleLock; -class FileLock; -class ScatterLock; -class LocalLock; - -class Locker { -private: - MDS *mds; - MDCache *mdcache; - - public: - Locker(MDS *m, MDCache *c) : mds(m), mdcache(c) {} - - SimpleLock *get_lock(int lock_type, MDSCacheObjectInfo &info); - - void dispatch(Message *m); - void handle_lock(MLock *m); - - -protected: - void send_lock_message(SimpleLock *lock, int msg); - void send_lock_message(SimpleLock *lock, int msg, const bufferlist &data); - - // -- locks -- -public: - bool acquire_locks(MDRequest *mdr, - set &rdlocks, - set &wrlocks, - set &xlocks); - - void drop_locks(MDRequest *mdr); - -protected: - bool rdlock_start(SimpleLock *lock, MDRequest *mdr); - void rdlock_finish(SimpleLock *lock, MDRequest *mdr); - bool xlock_start(SimpleLock *lock, MDRequest *mdr); -public: - void xlock_finish(SimpleLock *lock, MDRequest *mdr); // public for Server's slave UNXLOCK -protected: - bool wrlock_start(SimpleLock *lock, MDRequest *mdr); - void wrlock_finish(SimpleLock *lock, MDRequest *mdr); - -public: - void rejoin_set_state(SimpleLock *lock, int s, list& waiters); - - // 
simple -public: - void try_simple_eval(SimpleLock *lock); - void simple_eval_gather(SimpleLock *lock); - bool simple_rdlock_try(SimpleLock *lock, Context *con); -protected: - void simple_eval(SimpleLock *lock); - void handle_simple_lock(SimpleLock *lock, MLock *m); - void simple_sync(SimpleLock *lock); - void simple_lock(SimpleLock *lock); - bool simple_rdlock_start(SimpleLock *lock, MDRequest *mdr); - void simple_rdlock_finish(SimpleLock *lock, MDRequest *mdr); - bool simple_xlock_start(SimpleLock *lock, MDRequest *mdr); - void simple_xlock_finish(SimpleLock *lock, MDRequest *mdr); - -public: - bool dentry_can_rdlock_trace(vector& trace); - void dentry_anon_rdlock_trace_start(vector& trace); - void dentry_anon_rdlock_trace_finish(vector& trace); - - // scatter -protected: - xlist autoscattered; - -public: - void try_scatter_eval(ScatterLock *lock); - void scatter_eval(ScatterLock *lock); // public for MDCache::adjust_subtree_auth() - void scatter_eval_gather(ScatterLock *lock); - - void scatter_unscatter_autoscattered(); - void scatter_try_unscatter(ScatterLock *lock, Context *c); - void note_autoscattered(ScatterLock *lock); - - void scatter_lock(ScatterLock *lock); // called by LogSegment::try_to_expire - -protected: - void handle_scatter_lock(ScatterLock *lock, MLock *m); - void _scatter_replica_lock(ScatterLock *lock, int auth); - void scatter_sync(ScatterLock *lock); - void scatter_scatter(ScatterLock *lock); - void scatter_tempsync(ScatterLock *lock); - bool scatter_rdlock_start(ScatterLock *lock, MDRequest *mdr); - void scatter_rdlock_finish(ScatterLock *lock, MDRequest *mdr); - bool scatter_wrlock_start(ScatterLock *lock, MDRequest *mdr); - void scatter_wrlock_finish(ScatterLock *lock, MDRequest *mdr); - - void scatter_writebehind(ScatterLock *lock); - class C_Locker_ScatterWB : public Context { - Locker *locker; - ScatterLock *lock; - LogSegment *ls; - public: - C_Locker_ScatterWB(Locker *l, ScatterLock *sl, LogSegment *s) : locker(l), lock(sl), ls(s) {} 
- void finish(int r) { - locker->scatter_writebehind_finish(lock, ls); - } - }; - void scatter_writebehind_finish(ScatterLock *lock, LogSegment *ls); - - // local -protected: - bool local_wrlock_start(LocalLock *lock, MDRequest *mdr); - void local_wrlock_finish(LocalLock *lock, MDRequest *mdr); - bool local_xlock_start(LocalLock *lock, MDRequest *mdr); - void local_xlock_finish(LocalLock *lock, MDRequest *mdr); - - - // file -public: - void file_eval_gather(FileLock *lock); - void try_file_eval(FileLock *lock); -protected: - void file_eval(FileLock *lock); - void handle_file_lock(FileLock *lock, MLock *m); - bool file_sync(FileLock *lock); - void file_lock(FileLock *lock); - void file_mixed(FileLock *lock); - void file_loner(FileLock *lock); - bool file_rdlock_try(FileLock *lock, Context *con); - bool file_rdlock_start(FileLock *lock, MDRequest *mdr); - void file_rdlock_finish(FileLock *lock, MDRequest *mdr); - bool file_xlock_start(FileLock *lock, MDRequest *mdr); - void file_xlock_finish(FileLock *lock, MDRequest *mdr); - - - - // -- file i/o -- - public: - version_t issue_file_data_version(CInode *in); - Capability* issue_new_caps(CInode *in, int mode, MClientRequest *req); - bool issue_caps(CInode *in); - - protected: - void handle_client_file_caps(class MClientFileCaps *m); - - void request_inode_file_caps(CInode *in); - void handle_inode_file_caps(class MInodeFileCaps *m); - - friend class C_MDL_RequestInodeFileCaps; - -}; - - -#endif diff --git a/branches/sage/ebofs2/mds/LogEvent.cc b/branches/sage/ebofs2/mds/LogEvent.cc deleted file mode 100644 index 05b4336c52f05..0000000000000 --- a/branches/sage/ebofs2/mds/LogEvent.cc +++ /dev/null @@ -1,83 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General 
Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "LogEvent.h" - -#include "MDS.h" - -// events i know of -#include "events/EString.h" - -#include "events/ESession.h" -#include "events/ESubtreeMap.h" -#include "events/EExport.h" -#include "events/EImportStart.h" -#include "events/EImportFinish.h" -#include "events/EFragment.h" - -#include "events/EUpdate.h" -#include "events/ESlaveUpdate.h" -#include "events/EOpen.h" - -#include "events/EPurgeFinish.h" - -#include "events/EAnchor.h" -#include "events/EAnchorClient.h" - - - -LogEvent *LogEvent::decode(bufferlist& bl) -{ - // parse type, length - int off = 0; - int type; - bl.copy(off, sizeof(type), (char*)&type); - off += sizeof(type); - - int length = bl.length() - off; - generic_dout(15) << "decode_log_event type " << type << ", size " << length << dendl; - - assert(type > 0); - - // create event - LogEvent *le; - switch (type) { - case EVENT_STRING: le = new EString; break; - - case EVENT_SESSION: le = new ESession; break; - case EVENT_SUBTREEMAP: le = new ESubtreeMap; break; - case EVENT_EXPORT: le = new EExport; break; - case EVENT_IMPORTSTART: le = new EImportStart; break; - case EVENT_IMPORTFINISH: le = new EImportFinish; break; - case EVENT_FRAGMENT: le = new EFragment; break; - - case EVENT_UPDATE: le = new EUpdate; break; - case EVENT_SLAVEUPDATE: le = new ESlaveUpdate; break; - case EVENT_OPEN: le = new EOpen; break; - - case EVENT_PURGEFINISH: le = new EPurgeFinish; break; - - case EVENT_ANCHOR: le = new EAnchor; break; - case EVENT_ANCHORCLIENT: le = new EAnchorClient; break; - default: - generic_dout(1) << "uh oh, unknown log event type " << type << dendl; - assert(0); - } - - // decode - le->decode_payload(bl, off); - - return le; -} - diff --git a/branches/sage/ebofs2/mds/LogEvent.h b/branches/sage/ebofs2/mds/LogEvent.h deleted file mode 100644 index 8f2f55f342bb3..0000000000000 --- a/branches/sage/ebofs2/mds/LogEvent.h +++ 
/dev/null @@ -1,95 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __LOGEVENT_H -#define __LOGEVENT_H - -#define EVENT_STRING 1 - -#define EVENT_SESSION 7 -#define EVENT_SUBTREEMAP 2 -#define EVENT_EXPORT 30 -#define EVENT_IMPORTSTART 31 -#define EVENT_IMPORTFINISH 32 -#define EVENT_FRAGMENT 33 - -#define EVENT_UPDATE 3 -#define EVENT_SLAVEUPDATE 4 -#define EVENT_OPEN 5 - -#define EVENT_PURGEFINISH 22 - -#define EVENT_ANCHOR 40 -#define EVENT_ANCHORCLIENT 41 - - - - -#include -using namespace std; - -#include "include/buffer.h" -#include "include/Context.h" - -class MDS; -class LogSegment; - -// generic log event -class LogEvent { - private: - int _type; - off_t _start_off,_end_off; - - friend class MDLog; - - public: - LogSegment *_segment; - - LogEvent(int t) : - _type(t), _start_off(0), _end_off(0), _segment(0) { } - virtual ~LogEvent() { } - - int get_type() { return _type; } - off_t get_start_off() { return _start_off; } - off_t get_end_off() { return _end_off; } - - // encoding - virtual void encode_payload(bufferlist& bl) = 0; - virtual void decode_payload(bufferlist& bl, int& off) = 0; - static LogEvent *decode(bufferlist &bl); - - - virtual void print(ostream& out) { - out << "event(" << _type << ")"; - } - - /*** live journal ***/ - /* update_segment() - adjust any state we need to in the LogSegment - */ - virtual void update_segment() { } - - /*** recovery ***/ - /* replay() - replay given event. this is idempotent. 
- */ - virtual void replay(MDS *m) { assert(0); } - - -}; - -inline ostream& operator<<(ostream& out, LogEvent& le) { - le.print(out); - return out; -} - -#endif diff --git a/branches/sage/ebofs2/mds/LogSegment.h b/branches/sage/ebofs2/mds/LogSegment.h deleted file mode 100644 index e73f5f8b61b9c..0000000000000 --- a/branches/sage/ebofs2/mds/LogSegment.h +++ /dev/null @@ -1,69 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __LOGSEGMENT_H -#define __LOGSEGMENT_H - -#include "include/xlist.h" -#include "include/interval_set.h" -#include "include/Context.h" - -#include -using __gnu_cxx::hash_set; - -class CDir; -class CInode; -class CDentry; -class MDS; -class MDSlaveUpdate; - -class LogSegment { - public: - off_t offset; - int num_events; - - // dirty items - xlist dirty_dirfrags; - xlist dirty_inodes; - xlist dirty_dentries; - - xlist open_files; - xlist dirty_inode_mtimes; - - xlist slave_updates; - - //xlist purging_inodes; - map > purging_inodes; - - // committed anchor transactions - hash_set pending_commit_atids; - - // client request ids - map last_client_tids; - - // table version - version_t allocv; - version_t clientmapv; - version_t anchortablev; - - // try to expire - C_Gather *try_to_expire(MDS *mds); - - // cons - LogSegment(off_t off) : offset(off), num_events(0), - allocv(0), clientmapv(0), anchortablev(0) - { } -}; - -#endif diff --git a/branches/sage/ebofs2/mds/MDBalancer.cc b/branches/sage/ebofs2/mds/MDBalancer.cc deleted file mode 100644 index 7bf6ea4f7eb80..0000000000000 --- a/branches/sage/ebofs2/mds/MDBalancer.cc +++ /dev/null @@ -1,1049 +0,0 @@ -// -*- mode:C++; 
tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "mdstypes.h" - -#include "MDBalancer.h" -#include "MDS.h" -#include "MDSMap.h" -#include "CInode.h" -#include "CDir.h" -#include "MDCache.h" -#include "Migrator.h" - -#include "include/Context.h" -#include "msg/Messenger.h" -#include "messages/MHeartbeat.h" - -#include -#include -using std::map; -using std::vector; - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug_mds || l<=g_conf.debug_mds_balancer) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".bal " - -#define MIN_LOAD 50 // ?? -#define MIN_REEXPORT 5 // will automatically reexport -#define MIN_OFFLOAD 10 // point at which i stop trying, close enough - - - -int MDBalancer::proc_message(Message *m) -{ - switch (m->get_type()) { - - case MSG_MDS_HEARTBEAT: - handle_heartbeat((MHeartbeat*)m); - break; - - default: - dout(1) << " balancer unknown message " << m->get_type() << dendl; - assert(0); - break; - } - - return 0; -} - - - - -void MDBalancer::tick() -{ - static int num_bal_times = g_conf.mds_bal_max; - static utime_t first = g_clock.now(); - utime_t now = g_clock.now(); - utime_t elapsed = now; - elapsed -= first; - - // sample? - if ((double)now - (double)last_sample > g_conf.mds_bal_sample_interval) { - dout(15) << "tick last_sample now " << now << dendl; - last_sample = now; - } - - // balance? 
- if (last_heartbeat == utime_t()) last_heartbeat = now; - if (true && - mds->get_nodeid() == 0 && - g_conf.mds_bal_interval > 0 && - (num_bal_times || - (g_conf.mds_bal_max_until >= 0 && - elapsed.sec() > g_conf.mds_bal_max_until)) && - mds->is_active() && - now.sec() - last_heartbeat.sec() >= g_conf.mds_bal_interval) { - last_heartbeat = now; - send_heartbeat(); - num_bal_times--; - } - - // hash? - if (true && - now.sec() - last_fragment.sec() > g_conf.mds_bal_fragment_interval) { - last_fragment = now; - do_fragmenting(); - } -} - - - - -class C_Bal_SendHeartbeat : public Context { -public: - MDS *mds; - C_Bal_SendHeartbeat(MDS *mds) { - this->mds = mds; - } - virtual void finish(int f) { - mds->balancer->send_heartbeat(); - } -}; - - -double mds_load_t::mds_load() -{ - switch(g_conf.mds_bal_mode) { - case 0: - return - .8 * auth.meta_load() + - .2 * all.meta_load() + - req_rate + - 10.0 * queue_len; - - case 1: - return req_rate + 10.0*queue_len; - - case 2: - return cpu_load_avg; - - } - assert(0); - return 0; -} - -mds_load_t MDBalancer::get_load() -{ - mds_load_t load; - - if (mds->mdcache->get_root()) { - list ls; - mds->mdcache->get_root()->get_dirfrags(ls); - for (list::iterator p = ls.begin(); - p != ls.end(); - p++) { - load.auth += (*p)->pop_auth_subtree_nested; - load.all += (*p)->pop_nested; - } - } else { - dout(20) << "get_load no root, no load" << dendl; - } - - load.req_rate = mds->get_req_rate(); - load.queue_len = mds->messenger->get_dispatch_queue_len(); - - ifstream cpu("/proc/loadavg"); - if (cpu.is_open()) - cpu >> load.cpu_load_avg; - - dout(15) << "get_load " << load << dendl; - return load; -} - -void MDBalancer::send_heartbeat() -{ - utime_t now = g_clock.now(); - if (!mds->mdcache->get_root()) { - dout(5) << "no root on send_heartbeat" << dendl; - mds->mdcache->open_root(new C_Bal_SendHeartbeat(mds)); - return; - } - - mds_load.clear(); - if (mds->get_nodeid() == 0) - beat_epoch++; - - // my load - mds_load_t load = get_load(); - 
mds_load[ mds->get_nodeid() ] = load; - - // import_map -- how much do i import from whom - map import_map; - set authsubs; - mds->mdcache->get_auth_subtrees(authsubs); - for (set::iterator it = authsubs.begin(); - it != authsubs.end(); - it++) { - CDir *im = *it; - int from = im->inode->authority().first; - if (from == mds->get_nodeid()) continue; - if (im->get_inode()->is_stray()) continue; - import_map[from] += im->pop_auth_subtree.meta_load(now); - } - mds_import_map[ mds->get_nodeid() ] = import_map; - - - dout(5) << "mds" << mds->get_nodeid() << " epoch " << beat_epoch << " load " << load << dendl; - for (map::iterator it = import_map.begin(); - it != import_map.end(); - it++) { - dout(5) << " import_map from " << it->first << " -> " << it->second << dendl; - } - - - set up; - mds->get_mds_map()->get_in_mds_set(up); - for (set::iterator p = up.begin(); p != up.end(); ++p) { - if (*p == mds->get_nodeid()) continue; - MHeartbeat *hb = new MHeartbeat(load, beat_epoch); - hb->get_import_map() = import_map; - mds->messenger->send_message(hb, - mds->mdsmap->get_inst(*p)); - } -} - -void MDBalancer::handle_heartbeat(MHeartbeat *m) -{ - dout(25) << "=== got heartbeat " << m->get_beat() << " from " << m->get_source().num() << " " << m->get_load() << dendl; - - if (!mds->is_active()) - return; - - if (!mds->mdcache->get_root()) { - dout(10) << "opening root on handle_heartbeat" << dendl; - mds->mdcache->open_root(new C_MDS_RetryMessage(mds, m)); - return; - } - - int who = m->get_source().num(); - - if (who == 0) { - dout(20) << " from mds0, new epoch" << dendl; - beat_epoch = m->get_beat(); - send_heartbeat(); - - show_imports(); - } - - mds_load[ who ] = m->get_load(); - mds_import_map[ who ] = m->get_import_map(); - - //dout(0) << " load is " << load << " have " << mds_load.size() << dendl; - - unsigned cluster_size = mds->get_mds_map()->get_num_in_mds(); - if (mds_load.size() == cluster_size) { - // let's go! - //export_empties(); // no! 
- do_rebalance(m->get_beat()); - } - - // done - delete m; -} - - -void MDBalancer::export_empties() -{ - dout(5) << "export_empties checking for empty imports" << dendl; - - for (map >::iterator it = mds->mdcache->subtrees.begin(); - it != mds->mdcache->subtrees.end(); - it++) { - CDir *dir = it->first; - if (!dir->is_auth() || - dir->is_ambiguous_auth() || - dir->is_freezing() || - dir->is_frozen()) - continue; - - if (!dir->inode->is_root() && dir->get_size() == 0) - mds->mdcache->migrator->export_empty_import(dir); - } -} - - - -double MDBalancer::try_match(int ex, double& maxex, - int im, double& maxim) -{ - if (maxex <= 0 || maxim <= 0) return 0.0; - - double howmuch = MIN(maxex, maxim); - if (howmuch <= 0) return 0.0; - - dout(5) << " - mds" << ex << " exports " << howmuch << " to mds" << im << dendl; - - if (ex == mds->get_nodeid()) - my_targets[im] += howmuch; - - exported[ex] += howmuch; - imported[im] += howmuch; - - maxex -= howmuch; - maxim -= howmuch; - - return howmuch; -} - - - -void MDBalancer::do_fragmenting() -{ - if (split_queue.empty()) { - dout(20) << "do_fragmenting has nothing to do" << dendl; - return; - } - - dout(0) << "do_fragmenting " << split_queue.size() << " dirs marked for possible splitting" << dendl; - - for (set::iterator i = split_queue.begin(); - i != split_queue.end(); - i++) { - CDir *dir = mds->mdcache->get_dirfrag(*i); - if (!dir) continue; - if (!dir->is_auth()) continue; - - dout(0) << "do_fragmenting splitting " << *dir << dendl; - mds->mdcache->split_dir(dir, 4); - } - split_queue.clear(); -} - - - -void MDBalancer::do_rebalance(int beat) -{ - int cluster_size = mds->get_mds_map()->get_num_mds(); - int whoami = mds->get_nodeid(); - utime_t now = g_clock.now(); - - dump_pop_map(); - - // reset - my_targets.clear(); - imported.clear(); - exported.clear(); - - dout(5) << " do_rebalance: cluster loads are" << dendl; - - mds->mdcache->migrator->clear_export_queue(); - - // rescale! 
turn my mds_load back into meta_load units - double load_fac = 1.0; - if (mds_load[whoami].mds_load() > 0) { - double metald = mds_load[whoami].auth.meta_load(now); - double mdsld = mds_load[whoami].mds_load(); - load_fac = metald / mdsld; - dout(7) << " load_fac is " << load_fac - << " <- " << mds_load[whoami].auth << " " << metald - << " / " << mdsld - << dendl; - } - - double total_load = 0; - multimap load_map; - for (int i=0; i( l, i )); - } - - // target load - target_load = total_load / (double)cluster_size; - dout(5) << "do_rebalance: my load " << my_load - << " target " << target_load - << " total " << total_load - << dendl; - - // under or over? - if (my_load < target_load * (1.0 + g_conf.mds_bal_min_rebalance)) { - dout(5) << " i am underloaded or barely overloaded, doing nothing." << dendl; - last_epoch_under = beat_epoch; - show_imports(); - return; - } - - last_epoch_over = beat_epoch; - - // am i over long enough? - if (last_epoch_under && beat_epoch - last_epoch_under < 2) { - dout(5) << " i am overloaded, but only for " << (beat_epoch - last_epoch_under) << " epochs" << dendl; - return; - } - - dout(5) << " i am sufficiently overloaded" << dendl; - - - // first separate exporters and importers - multimap importers; - multimap exporters; - set importer_set; - set exporter_set; - - for (multimap::iterator it = load_map.begin(); - it != load_map.end(); - it++) { - if (it->first < target_load) { - dout(15) << " mds" << it->second << " is importer" << dendl; - importers.insert(pair(it->first,it->second)); - importer_set.insert(it->second); - } else { - dout(15) << " mds" << it->second << " is exporter" << dendl; - exporters.insert(pair(it->first,it->second)); - exporter_set.insert(it->second); - } - } - - - // determine load transfer mapping - - if (true) { - // analyze import_map; do any matches i can - - dout(15) << " matching exporters to import sources" << dendl; - - // big -> small exporters - for (multimap::reverse_iterator ex = 
exporters.rbegin(); - ex != exporters.rend(); - ex++) { - double maxex = get_maxex(ex->second); - if (maxex <= .001) continue; - - // check importers. for now, just in arbitrary order (no intelligent matching). - for (map::iterator im = mds_import_map[ex->second].begin(); - im != mds_import_map[ex->second].end(); - im++) { - double maxim = get_maxim(im->first); - if (maxim <= .001) continue; - try_match(ex->second, maxex, - im->first, maxim); - if (maxex <= .001) break;; - } - } - } - - - if (1) { - if (beat % 2 == 1) { - // old way - dout(15) << " matching big exporters to big importers" << dendl; - // big exporters to big importers - multimap::reverse_iterator ex = exporters.rbegin(); - multimap::iterator im = importers.begin(); - while (ex != exporters.rend() && - im != importers.end()) { - double maxex = get_maxex(ex->second); - double maxim = get_maxim(im->second); - if (maxex < .001 || maxim < .001) break; - try_match(ex->second, maxex, - im->second, maxim); - if (maxex <= .001) ex++; - if (maxim <= .001) im++; - } - } else { - // new way - dout(15) << " matching small exporters to big importers" << dendl; - // small exporters to big importers - multimap::iterator ex = exporters.begin(); - multimap::iterator im = importers.begin(); - while (ex != exporters.end() && - im != importers.end()) { - double maxex = get_maxex(ex->second); - double maxim = get_maxim(im->second); - if (maxex < .001 || maxim < .001) break; - try_match(ex->second, maxex, - im->second, maxim); - if (maxex <= .001) ex++; - if (maxim <= .001) im++; - } - } - } - - - - // make a sorted list of my imports - map import_pop_map; - multimap import_from_map; - set fullauthsubs; - - mds->mdcache->get_fullauth_subtrees(fullauthsubs); - for (set::iterator it = fullauthsubs.begin(); - it != fullauthsubs.end(); - it++) { - CDir *im = *it; - if (im->get_inode()->is_stray()) continue; - - double pop = im->pop_auth_subtree.meta_load(now); - if (g_conf.mds_bal_idle_threshold > 0 && - pop < 
g_conf.mds_bal_idle_threshold && - im->inode != mds->mdcache->get_root() && - im->inode->authority().first != mds->get_nodeid()) { - dout(-5) << " exporting idle (" << pop << ") import " << *im - << " back to mds" << im->inode->authority().first - << dendl; - mds->mdcache->migrator->export_dir_nicely(im, im->inode->authority().first); - continue; - } - - import_pop_map[ pop ] = im; - int from = im->inode->authority().first; - dout(15) << " map: i imported " << *im << " from " << from << dendl; - import_from_map.insert(pair(from, im)); - } - - - - // do my exports! - set already_exporting; - double total_sent = 0; - double total_goal = 0; - - for (map::iterator it = my_targets.begin(); - it != my_targets.end(); - it++) { - - /* - double fac = 1.0; - if (false && total_goal > 0 && total_sent > 0) { - fac = total_goal / total_sent; - dout(-5) << " total sent is " << total_sent << " / " << total_goal << " -> fac 1/ " << fac << dendl; - if (fac > 1.0) fac = 1.0; - } - fac = .9 - .4 * ((float)g_conf.num_mds / 128.0); // hack magic fixme - */ - - int target = (*it).first; - double amount = (*it).second; - total_goal += amount; - - if (amount < MIN_OFFLOAD) continue; - if (amount / target_load < .2) continue; - - dout(5) << "want to send " << amount << " to mds" << target - //<< " .. " << (*it).second << " * " << load_fac - << " -> " << amount - << dendl;//" .. 
fudge is " << fudge << dendl; - double have = 0; - - - show_imports(); - - // search imports from target - if (import_from_map.count(target)) { - dout(5) << " aha, looking through imports from target mds" << target << dendl; - pair::iterator, multimap::iterator> p = - import_from_map.equal_range(target); - while (p.first != p.second) { - CDir *dir = (*p.first).second; - dout(5) << "considering " << *dir << " from " << (*p.first).first << dendl; - multimap::iterator plast = p.first++; - - if (dir->inode->is_root()) continue; - if (dir->is_freezing() || dir->is_frozen()) continue; // export pbly already in progress - double pop = dir->pop_auth_subtree.meta_load(now); - assert(dir->inode->authority().first == target); // cuz that's how i put it in the map, dummy - - if (pop <= amount-have) { - dout(-5) << "reexporting " << *dir - << " pop " << pop - << " back to mds" << target << dendl; - mds->mdcache->migrator->export_dir_nicely(dir, target); - have += pop; - import_from_map.erase(plast); - import_pop_map.erase(pop); - } else { - dout(5) << "can't reexport " << *dir << ", too big " << pop << dendl; - } - if (amount-have < MIN_OFFLOAD) break; - } - } - if (amount-have < MIN_OFFLOAD) { - total_sent += have; - continue; - } - - // any other imports - if (false) - for (map::iterator import = import_pop_map.begin(); - import != import_pop_map.end(); - import++) { - CDir *imp = (*import).second; - if (imp->inode->is_root()) continue; - - double pop = (*import).first; - if (pop < amount-have || pop < MIN_REEXPORT) { - dout(-5) << "reexporting " << *imp - << " pop " << pop - << " back to mds" << imp->inode->authority() - << dendl; - have += pop; - mds->mdcache->migrator->export_dir_nicely(imp, imp->inode->authority().first); - } - if (amount-have < MIN_OFFLOAD) break; - } - if (amount-have < MIN_OFFLOAD) { - //fudge = amount-have; - total_sent += have; - continue; - } - - // okay, search for fragments of my workload - set candidates; - 
mds->mdcache->get_fullauth_subtrees(candidates); - - list exports; - - for (set::iterator pot = candidates.begin(); - pot != candidates.end(); - pot++) { - if ((*pot)->get_inode()->is_stray()) continue; - find_exports(*pot, amount, exports, have, already_exporting, now); - if (have > amount-MIN_OFFLOAD) - break; - } - //fudge = amount - have; - total_sent += have; - - for (list::iterator it = exports.begin(); it != exports.end(); it++) { - dout(-5) << " - exporting " - << (*it)->pop_auth_subtree - << " " - << (*it)->pop_auth_subtree.meta_load(now) - << " to mds" << target - << " " << **it - << dendl; - mds->mdcache->migrator->export_dir_nicely(*it, target); - } - } - - dout(5) << "rebalance done" << dendl; - show_imports(); - -} - - - -void MDBalancer::find_exports(CDir *dir, - double amount, - list& exports, - double& have, - set& already_exporting, - utime_t now) -{ - double need = amount - have; - if (need < amount * g_conf.mds_bal_min_start) - return; // good enough! - double needmax = need * g_conf.mds_bal_need_max; - double needmin = need * g_conf.mds_bal_need_min; - double midchunk = need * g_conf.mds_bal_midchunk; - double minchunk = need * g_conf.mds_bal_minchunk; - - list bigger_rep, bigger_unrep; - multimap smaller; - - double dir_pop = dir->pop_auth_subtree.meta_load(now); - dout(7) << " find_exports in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << dendl; - - double subdir_sum = 0; - for (CDir::map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CInode *in = it->second->get_inode(); - if (!in) continue; - if (!in->is_dir()) continue; - - list dfls; - in->get_dirfrags(dfls); - for (list::iterator p = dfls.begin(); - p != dfls.end(); - ++p) { - CDir *subdir = *p; - if (!subdir->is_auth()) continue; - if (already_exporting.count(subdir)) continue; - - if (subdir->is_frozen()) continue; // can't export this right now! - - // how popular? 
- double pop = subdir->pop_auth_subtree.meta_load(now); - subdir_sum += pop; - dout(15) << " subdir pop " << pop << " " << *subdir << dendl; - - if (pop < minchunk) continue; - - // lucky find? - if (pop > needmin && pop < needmax) { - exports.push_back(subdir); - already_exporting.insert(subdir); - have += pop; - return; - } - - if (pop > need) { - if (subdir->is_rep()) - bigger_rep.push_back(subdir); - else - bigger_unrep.push_back(subdir); - } else - smaller.insert(pair(pop, subdir)); - } - } - dout(15) << " sum " << subdir_sum << " / " << dir_pop << dendl; - - // grab some sufficiently big small items - multimap::reverse_iterator it; - for (it = smaller.rbegin(); - it != smaller.rend(); - it++) { - - if ((*it).first < midchunk) - break; // try later - - dout(7) << " taking smaller " << *(*it).second << dendl; - - exports.push_back((*it).second); - already_exporting.insert((*it).second); - have += (*it).first; - if (have > needmin) - return; - } - - // apprently not enough; drill deeper into the hierarchy (if non-replicated) - for (list::iterator it = bigger_unrep.begin(); - it != bigger_unrep.end(); - it++) { - dout(15) << " descending into " << **it << dendl; - find_exports(*it, amount, exports, have, already_exporting, now); - if (have > needmin) - return; - } - - // ok fine, use smaller bits - for (; - it != smaller.rend(); - it++) { - dout(7) << " taking (much) smaller " << it->first << " " << *(*it).second << dendl; - - exports.push_back((*it).second); - already_exporting.insert((*it).second); - have += (*it).first; - if (have > needmin) - return; - } - - // ok fine, drill into replicated dirs - for (list::iterator it = bigger_rep.begin(); - it != bigger_rep.end(); - it++) { - dout(7) << " descending into replicated " << **it << dendl; - find_exports(*it, amount, exports, have, already_exporting, now); - if (have > needmin) - return; - } - -} - - - - -void MDBalancer::hit_inode(utime_t now, CInode *in, int type, int who) -{ - // hit inode - 
in->pop.get(type).hit(now); - - if (in->get_parent_dn()) - hit_dir(now, in->get_parent_dn()->get_dir(), type, who); -} -/* - // hit me - in->popularity[MDS_POP_JUSTME].pop[type].hit(now); - in->popularity[MDS_POP_NESTED].pop[type].hit(now); - if (in->is_auth()) { - in->popularity[MDS_POP_CURDOM].pop[type].hit(now); - in->popularity[MDS_POP_ANYDOM].pop[type].hit(now); - - dout(20) << "hit_inode " << type << " pop " - << in->popularity[MDS_POP_JUSTME].pop[type].get(now) << " me, " - << in->popularity[MDS_POP_NESTED].pop[type].get(now) << " nested, " - << in->popularity[MDS_POP_CURDOM].pop[type].get(now) << " curdom, " - << in->popularity[MDS_POP_CURDOM].pop[type].get(now) << " anydom" - << " on " << *in - << dendl; - } else { - dout(20) << "hit_inode " << type << " pop " - << in->popularity[MDS_POP_JUSTME].pop[type].get(now) << " me, " - << in->popularity[MDS_POP_NESTED].pop[type].get(now) << " nested, " - << " on " << *in - << dendl; - } - - // hit auth up to import - CDir *dir = in->get_parent_dir(); - if (dir) hit_dir(now, dir, type); -*/ - - -void MDBalancer::hit_dir(utime_t now, CDir *dir, int type, int who, double amount) -{ - // hit me - double v = dir->pop_me.get(type).hit(now, amount); - - //if (dir->ino() == inodeno_t(0x10000000000)) - //dout(0) << "hit_dir " << type << " pop " << v << " in " << *dir << dendl; - - // hit modify counter, if this was a modify - if (g_conf.num_mds > 2 && // FIXME >2 thing - !dir->inode->is_root() && // not root (for now at least) - dir->is_auth() && - - ((g_conf.mds_bal_split_size > 0 && - dir->get_size() > (unsigned)g_conf.mds_bal_split_size) || - (v > g_conf.mds_bal_split_rd && type == META_POP_IRD) || - (v > g_conf.mds_bal_split_wr && type == META_POP_IWR)) && - split_queue.count(dir->dirfrag()) == 0) { - dout(0) << "hit_dir " << type << " pop is " << v << ", putting in split_queue: " << *dir << dendl; - split_queue.insert(dir->dirfrag()); - } - - // replicate? 
- if (type == META_POP_IRD && who >= 0) { - dir->pop_spread.hit(now, who); - } - - double rd_adj = 0; - if (type == META_POP_IRD && - dir->last_popularity_sample < last_sample) { - float dir_pop = dir->pop_auth_subtree.get(type).get(now); // hmm?? - dir->last_popularity_sample = last_sample; - float pop_sp = dir->pop_spread.get(now); - dir_pop += pop_sp * 10; - - //if (dir->ino() == inodeno_t(0x10000000002)) - if (pop_sp > 0) { - dout(20) << "hit_dir " << type << " pop " << dir_pop << " spread " << pop_sp - << " " << dir->pop_spread.last[0] - << " " << dir->pop_spread.last[1] - << " " << dir->pop_spread.last[2] - << " " << dir->pop_spread.last[3] - << " in " << *dir << dendl; - } - - if (dir->is_auth() && !dir->is_ambiguous_auth()) { - if (!dir->is_rep() && - dir_pop >= g_conf.mds_bal_replicate_threshold) { - // replicate - float rdp = dir->pop_me.get(META_POP_IRD).get(now); - rd_adj = rdp / mds->get_mds_map()->get_num_mds() - rdp; - rd_adj /= 2.0; // temper somewhat - - dout(0) << "replicating dir " << *dir << " pop " << dir_pop << " .. 
rdp " << rdp << " adj " << rd_adj << dendl; - - dir->dir_rep = CDir::REP_ALL; - mds->mdcache->send_dir_updates(dir, true); - - // fixme this should adjust the whole pop hierarchy - dir->pop_me.get(META_POP_IRD).adjust(rd_adj); - dir->pop_auth_subtree.get(META_POP_IRD).adjust(rd_adj); - } - - if (dir->ino() != 1 && - dir->is_rep() && - dir_pop < g_conf.mds_bal_unreplicate_threshold) { - // unreplicate - dout(0) << "unreplicating dir " << *dir << " pop " << dir_pop << dendl; - - dir->dir_rep = CDir::REP_NONE; - mds->mdcache->send_dir_updates(dir); - } - } - } - - // adjust ancestors - bool hit_subtree = dir->is_auth(); // current auth subtree (if any) - bool hit_subtree_nested = dir->is_auth(); // all nested auth subtrees - - while (1) { - dir->pop_nested.get(type).hit(now, amount); - if (rd_adj != 0.0) - dir->pop_nested.get(META_POP_IRD).adjust(now, rd_adj); - - if (hit_subtree) { - dir->pop_auth_subtree.get(type).hit(now, amount); - if (rd_adj != 0.0) - dir->pop_auth_subtree.get(META_POP_IRD).adjust(now, rd_adj); - } - - if (hit_subtree_nested) { - dir->pop_auth_subtree_nested.get(type).hit(now, amount); - if (rd_adj != 0.0) - dir->pop_auth_subtree_nested.get(META_POP_IRD).adjust(now, rd_adj); - } - - if (dir->is_subtree_root()) - hit_subtree = false; // end of auth domain, stop hitting auth counters. - - if (dir->inode->get_parent_dn() == 0) break; - dir = dir->inode->get_parent_dn()->get_dir(); - } -} - - -/* - * subtract off an exported chunk. - * this excludes *dir itself (encode_export_dir should have take care of that) - * we _just_ do the parents' nested counters. - * - * NOTE: call me _after_ forcing *dir into a subtree root, - * but _before_ doing the encode_export_dirs. 
- */ -void MDBalancer::subtract_export(CDir *dir) -{ - dirfrag_load_vec_t subload = dir->pop_auth_subtree; - - while (true) { - dir = dir->inode->get_parent_dir(); - if (!dir) break; - - dir->pop_nested -= subload; - dir->pop_auth_subtree_nested -= subload; - } -} - - -void MDBalancer::add_import(CDir *dir) -{ - dirfrag_load_vec_t subload = dir->pop_auth_subtree; - - while (true) { - dir = dir->inode->get_parent_dir(); - if (!dir) break; - - dir->pop_nested += subload; - dir->pop_auth_subtree_nested += subload; - } -} - - - - - - -void MDBalancer::show_imports(bool external) -{ - mds->mdcache->show_subtrees(); -} - - -void MDBalancer::dump_pop_map() -{ - return; // this is dumb - - - char fn[20]; - sprintf(fn, "popdump.%d.mds%d", beat_epoch, mds->get_nodeid()); - - dout(1) << "dump_pop_map to " << fn << dendl; - - ofstream myfile; - myfile.open(fn); - - list iq; - if (mds->mdcache->root) - iq.push_back(mds->mdcache->root); - - utime_t now = g_clock.now(); - while (!iq.empty()) { - CInode *in = iq.front(); - iq.pop_front(); - - // pop stats - /*for (int a=0; apopularity[a].pop[b].get(now) << "\t"; - */ - - // recurse, depth-first. - if (in->is_dir()) { - - list dirs; - in->get_dirfrags(dirs); - for (list::iterator p = dirs.begin(); - p != dirs.end(); - ++p) { - CDir *dir = *p; - - myfile << (int)dir->pop_me.meta_load(now) << "\t"; - myfile << (int)dir->pop_nested.meta_load(now) << "\t"; - myfile << (int)dir->pop_auth_subtree.meta_load(now) << "\t"; - myfile << (int)dir->pop_auth_subtree_nested.meta_load(now) << "\t"; - - // filename last - string p; - in->make_path(p); - myfile << "." << p; - if (dir->get_frag() != frag_t()) - myfile << "___" << (unsigned)dir->get_frag(); - myfile << std::endl; //"/" << dir->get_frag() << dendl; - - // add contents - for (CDir::map_t::iterator q = dir->items.begin(); - q != dir->items.end(); - q++) - if (q->second->is_primary()) - iq.push_front(q->second->get_inode()); - } - } - - } - - myfile.close(); -} - - - -/* replicate? 
- - float dir_pop = dir->get_popularity(); - - if (dir->is_auth()) { - if (!dir->is_rep() && - dir_pop >= g_conf.mds_bal_replicate_threshold) { - // replicate - dout(5) << "replicating dir " << *in << " pop " << dir_pop << dendl; - - dir->dir_rep = CDIR_REP_ALL; - mds->mdcache->send_dir_updates(dir); - } - - if (dir->is_rep() && - dir_pop < g_conf.mds_bal_unreplicate_threshold) { - // unreplicate - dout(5) << "unreplicating dir " << *in << " pop " << dir_pop << dendl; - - dir->dir_rep = CDIR_REP_NONE; - mds->mdcache->send_dir_updates(dir); - } - } - -*/ diff --git a/branches/sage/ebofs2/mds/MDBalancer.h b/branches/sage/ebofs2/mds/MDBalancer.h deleted file mode 100644 index 819c69f0616c0..0000000000000 --- a/branches/sage/ebofs2/mds/MDBalancer.h +++ /dev/null @@ -1,118 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __MDBALANCER_H -#define __MDBALANCER_H - -#include -#include -using std::list; -using std::map; - -#include -using namespace __gnu_cxx; - -#include "include/types.h" -#include "common/Clock.h" -#include "CInode.h" - - -class MDS; -class Message; -class MHeartbeat; -class CInode; -class Context; -class CDir; - -class MDBalancer { - protected: - MDS *mds; - int beat_epoch; - - int last_epoch_under; - int last_epoch_over; - - utime_t last_heartbeat; - utime_t last_fragment; - utime_t last_sample; - - - // todo - set split_queue; - - // per-epoch scatter/gathered info - hash_map mds_load; - hash_map mds_meta_load; - map > mds_import_map; - - // per-epoch state - double my_load, target_load; - map my_targets; - map imported; - map exported; - - double try_match(int ex, double& maxex, - int im, double& maxim); - double get_maxim(int im) { - return target_load - mds_meta_load[im] - imported[im]; - } - double get_maxex(int ex) { - return mds_meta_load[ex] - target_load - exported[ex]; - } - - public: - MDBalancer(MDS *m) : - mds(m), - beat_epoch(0), - last_epoch_under(0), last_epoch_over(0) { } - - mds_load_t get_load(); - - int proc_message(Message *m); - - void send_heartbeat(); - void handle_heartbeat(MHeartbeat *m); - - void tick(); - - void do_fragmenting(); - - void export_empties(); - void do_rebalance(int beat); - void find_exports(CDir *dir, - double amount, - list& exports, - double& have, - set& already_exporting, - utime_t now); - - - void subtract_export(class CDir *ex); - void add_import(class CDir *im); - - void hit_inode(utime_t now, class CInode *in, int type, int who=-1); - void hit_dir(utime_t now, class CDir *dir, int type, int who, double amount=1.0); - void hit_recursive(utime_t now, class CDir *dir, int type, double amount, double rd_adj); - - - void show_imports(bool external=false); - void dump_pop_map(); - -}; - - - -#endif diff --git a/branches/sage/ebofs2/mds/MDCache.cc b/branches/sage/ebofs2/mds/MDCache.cc deleted file 
mode 100644 index 1fc19c2f57874..0000000000000 --- a/branches/sage/ebofs2/mds/MDCache.cc +++ /dev/null @@ -1,6278 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "MDCache.h" -#include "MDS.h" -#include "Server.h" -#include "Locker.h" -#include "MDLog.h" -#include "MDBalancer.h" -#include "AnchorClient.h" -#include "Migrator.h" - -#include "MDSMap.h" - -#include "CInode.h" -#include "CDir.h" - -#include "include/filepath.h" - -#include "msg/Message.h" -#include "msg/Messenger.h" - -#include "common/Logger.h" - -#include "osdc/Filer.h" - -#include "events/ESubtreeMap.h" -#include "events/EUpdate.h" -#include "events/ESlaveUpdate.h" -#include "events/EString.h" -#include "events/EPurgeFinish.h" -#include "events/EImportFinish.h" -#include "events/EFragment.h" - -#include "messages/MGenericMessage.h" - -#include "messages/MMDSResolve.h" -#include "messages/MMDSResolveAck.h" -#include "messages/MMDSCacheRejoin.h" - -#include "messages/MDiscover.h" -#include "messages/MDiscoverReply.h" - -//#include "messages/MInodeUpdate.h" -#include "messages/MDirUpdate.h" -#include "messages/MCacheExpire.h" - -#include "messages/MInodeFileCaps.h" - -#include "messages/MLock.h" -#include "messages/MDentryUnlink.h" - -#include "messages/MClientRequest.h" -#include "messages/MClientFileCaps.h" - -#include "messages/MMDSSlaveRequest.h" - -#include "messages/MMDSFragmentNotify.h" - - -#include "IdAllocator.h" - -#include "common/Timer.h" - -#include -#include -#include -#include -#include -using namespace std; - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) *_dout 
<< dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".cache " - - - - -MDCache::MDCache(MDS *m) -{ - mds = m; - migrator = new Migrator(mds, this); - // renamer = new Renamer(mds, this); - root = NULL; - stray = NULL; - lru.lru_set_max(g_conf.mds_cache_size); - lru.lru_set_midpoint(g_conf.mds_cache_mid); - - did_shutdown_log_cap = false; -} - -MDCache::~MDCache() -{ - delete migrator; - //delete renamer; -} - - - -void MDCache::log_stat(Logger *logger) -{ - if (get_root()) { - utime_t now = g_clock.now(); - //logger->set("pop", (int)get_root()->pop_nested.meta_load(now)); - //logger->set("popauth", (int)get_root()->pop_auth_subtree_nested.meta_load(now)); - } - logger->set("c", lru.lru_get_size()); - logger->set("cpin", lru.lru_get_num_pinned()); - logger->set("ctop", lru.lru_get_top()); - logger->set("cbot", lru.lru_get_bot()); - logger->set("cptail", lru.lru_get_pintail()); -} - - -// - -bool MDCache::shutdown() -{ - if (lru.lru_get_size() > 0) { - dout(7) << "WARNING: mdcache shutodwn with non-empty cache" << dendl; - //show_cache(); - show_subtrees(); - //dump(); - } - return true; -} - - -// ==================================================================== -// some inode functions - -CInode *MDCache::create_inode() -{ - CInode *in = new CInode(this); - - // zero - memset(&in->inode, 0, sizeof(inode_t)); - - // assign ino - in->inode.ino = mds->idalloc->alloc_id(); - - in->inode.nlink = 1; // FIXME - - in->inode.layout = g_OSD_FileLayout; - - add_inode(in); // add - return in; -} - - -void MDCache::add_inode(CInode *in) -{ - // add to lru, inode map - assert(inode_map.count(in->ino()) == 0); // should be no dup inos! 
- inode_map[ in->ino() ] = in; - - if (in->ino() < MDS_INO_BASE) { - base_inodes.insert(in); - if (in->ino() == MDS_INO_ROOT) - set_root(in); - if (in->ino() == MDS_INO_STRAY(mds->get_nodeid())) - stray = in; - } -} - -void MDCache::remove_inode(CInode *o) -{ - dout(14) << "remove_inode " << *o << dendl; - - if (o->get_parent_dn()) { - // FIXME: multiple parents? - CDentry *dn = o->get_parent_dn(); - assert(!dn->is_dirty()); - dn->dir->unlink_inode(dn); // leave dentry ... FIXME? - } - - // remove from inode map - inode_map.erase(o->ino()); - - if (o->ino() < MDS_INO_BASE) { - assert(base_inodes.count(o)); - base_inodes.erase(o); - - if (o == root) root = 0; - if (o == stray) stray = 0; - } - - // delete it - delete o; -} - - - -CInode *MDCache::create_root_inode() -{ - CInode *root = new CInode(this); - memset(&root->inode, 0, sizeof(inode_t)); - root->inode.ino = MDS_INO_ROOT; - - // make it up (FIXME) - root->inode.mode = 0755 | INODE_MODE_DIR; - root->inode.size = 0; - root->inode.ctime = - root->inode.mtime = g_clock.now(); - - root->inode.nlink = 1; - root->inode.layout = g_OSD_MDDirLayout; - - root->force_auth = pair(0, CDIR_AUTH_UNKNOWN); - - add_inode( root ); - - return root; -} - - -void MDCache::open_root(Context *c) -{ - int whoami = mds->get_nodeid(); - - // open root inode - if (whoami == 0) { - // i am root inode - CInode *root = create_root_inode(); - - // root directory too - CDir *dir = root->get_or_open_dirfrag(this, frag_t()); - adjust_subtree_auth(dir, 0); - dir->dir_rep = CDir::REP_ALL; //NONE; - - show_subtrees(); - - if (c) { - c->finish(0); - delete c; - } - } else { - // request inode from root mds - discover_base_ino(MDS_INO_ROOT, c, 0); - } -} - -CInode *MDCache::create_stray_inode(int whose) -{ - if (whose < 0) whose = mds->get_nodeid(); - - CInode *in = new CInode(this, whose == mds->get_nodeid()); - memset(&in->inode, 0, sizeof(inode_t)); - in->inode.ino = MDS_INO_STRAY(whose); - - // make it up (FIXME) - in->inode.mode = 0755 | 
INODE_MODE_DIR; - in->inode.size = 0; - in->inode.ctime = - in->inode.mtime = g_clock.now(); - - in->inode.nlink = 1; - in->inode.layout = g_OSD_MDDirLayout; - - add_inode( in ); - - return in; -} - -void MDCache::open_local_stray() -{ - create_stray_inode(); - CDir *dir = stray->get_or_open_dirfrag(this, frag_t()); - adjust_subtree_auth(dir, mds->get_nodeid()); -} - -void MDCache::open_foreign_stray(int who, Context *c) -{ - inodeno_t ino = MDS_INO_STRAY(who); - dout(10) << "open_foreign_stray mds" << who << " " << ino << dendl; - assert(!have_inode(ino)); - - discover_base_ino(ino, c, who); -} - - -CDentry *MDCache::get_or_create_stray_dentry(CInode *in) -{ - string straydname; - in->name_stray_dentry(straydname); - - if (!stray) create_stray_inode(mds->get_nodeid()); - - frag_t fg = stray->pick_dirfrag(straydname); - - CDir *straydir = stray->get_or_open_dirfrag(this, fg); - - CDentry *straydn = straydir->lookup(straydname); - if (!straydn) - straydn = straydir->add_null_dentry(straydname); - - return straydn; -} - - - -MDSCacheObject *MDCache::get_object(MDSCacheObjectInfo &info) -{ - // inode? - if (info.ino) - return get_inode(info.ino); - - // dir or dentry. - CDir *dir = get_dirfrag(info.dirfrag); - if (!dir) return 0; - - if (info.dname.length()) - return dir->lookup(info.dname); - else - return dir; -} - - - - -// ==================================================================== -// subtree management - -void MDCache::list_subtrees(list& ls) -{ - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) - ls.push_back(p->first); -} - -/* - * adjust the dir_auth of a subtree. - * merge with parent and/or child subtrees, if is it appropriate. - * merge can ONLY happen if both parent and child have unambiguous auth. 
- */ -void MDCache::adjust_subtree_auth(CDir *dir, pair auth) -{ - dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth - << " on " << *dir << dendl; - - show_subtrees(); - - CDir *root; - if (dir->ino() < MDS_INO_BASE) { - root = dir; // bootstrap hack. - if (subtrees.count(root) == 0) { - subtrees[root].clear(); - root->get(CDir::PIN_SUBTREE); - } - } else { - root = get_subtree_root(dir); // subtree root - } - assert(root); - assert(subtrees.count(root)); - dout(7) << " current root is " << *root << dendl; - - if (root == dir) { - // i am already a subtree. - dir->set_dir_auth(auth); - } else { - // i am a new subtree. - dout(10) << " new subtree at " << *dir << dendl; - assert(subtrees.count(dir) == 0); - subtrees[dir].clear(); // create empty subtree bounds list for me. - dir->get(CDir::PIN_SUBTREE); - - // set dir_auth - dir->set_dir_auth(auth); - - // move items nested beneath me, under me. - set::iterator p = subtrees[root].begin(); - while (p != subtrees[root].end()) { - set::iterator next = p; - next++; - if (get_subtree_root((*p)->get_parent_dir()) == dir) { - // move under me - dout(10) << " claiming child bound " << **p << dendl; - subtrees[dir].insert(*p); - subtrees[root].erase(p); - } - p = next; - } - - // i am a bound of the parent subtree. - subtrees[root].insert(dir); - - // i am now the subtree root. 
- root = dir; - - // adjust recursive pop counters - if (dir->is_auth()) { - CDir *p = dir->get_parent_dir(); - while (p) { - p->pop_auth_subtree -= dir->pop_auth_subtree; - if (p->is_subtree_root()) break; - p = p->inode->get_parent_dir(); - } - } - - eval_subtree_root(dir); - } - - show_subtrees(); -} - - -void MDCache::try_subtree_merge(CDir *dir) -{ - dout(7) << "try_subtree_merge " << *dir << dendl; - assert(subtrees.count(dir)); - set oldbounds = subtrees[dir]; - - // try merge at my root - try_subtree_merge_at(dir); - - // try merge at my old bounds - for (set::iterator p = oldbounds.begin(); - p != oldbounds.end(); - ++p) - try_subtree_merge_at(*p); -} - -class C_MDC_SubtreeMergeWB : public Context { - MDCache *mdcache; - CInode *in; - LogSegment *ls; -public: - C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i, LogSegment *s) : mdcache(mdc), in(i), ls(s) {} - void finish(int r) { - mdcache->subtree_merge_writebehind_finish(in, ls); - } -}; - -void MDCache::try_subtree_merge_at(CDir *dir) -{ - dout(10) << "try_subtree_merge_at " << *dir << dendl; - assert(subtrees.count(dir)); - - // merge with parent? - CDir *parent = dir; - if (dir->ino() >= MDS_INO_BASE) - parent = get_subtree_root(dir->get_parent_dir()); - - if (parent != dir && // we have a parent, - parent->dir_auth == dir->dir_auth && // auth matches, - dir->dir_auth.second == CDIR_AUTH_UNKNOWN && // auth is unambiguous, - !dir->state_test(CDir::STATE_EXPORTBOUND)) { // not an exportbound, - // merge with parent. - dout(10) << " subtree merge at " << *dir << dendl; - dir->set_dir_auth(CDIR_AUTH_DEFAULT); - - // move our bounds under the parent - for (set::iterator p = subtrees[dir].begin(); - p != subtrees[dir].end(); - ++p) - subtrees[parent].insert(*p); - - // we are no longer a subtree or bound - dir->put(CDir::PIN_SUBTREE); - subtrees.erase(dir); - subtrees[parent].erase(dir); - - // adjust popularity? 
- if (dir->is_auth()) { - CDir *p = dir->get_parent_dir(); - while (p) { - p->pop_auth_subtree += dir->pop_auth_subtree; - if (p->is_subtree_root()) break; - p = p->inode->get_parent_dir(); - } - } - - eval_subtree_root(dir); - - // journal inode? - // (this is a large hammer to ensure that dirfragtree updates will - // hit the disk before the relevant dirfrags ever close) - if (dir->inode->is_auth() && - dir->inode->can_auth_pin() && - (mds->is_active() || mds->is_stopping())) { - CInode *in = dir->inode; - dout(10) << "try_subtree_merge_at journaling merged bound " << *in << dendl; - - in->auth_pin(); - - // journal write-behind. - inode_t *pi = in->project_inode(); - pi->version = in->pre_dirty(); - - EUpdate *le = new EUpdate(mds->mdlog, "subtree merge writebehind"); - le->metablob.add_dir_context(in->get_parent_dn()->get_dir()); - le->metablob.add_primary_dentry(in->get_parent_dn(), true, 0, pi); - - mds->mdlog->submit_entry(le); - mds->mdlog->wait_for_sync(new C_MDC_SubtreeMergeWB(this, in, - mds->mdlog->get_current_segment())); - } - } - - show_subtrees(15); -} - -void MDCache::subtree_merge_writebehind_finish(CInode *in, LogSegment *ls) -{ - dout(10) << "subtree_merge_writebehind_finish on " << in << dendl; - in->pop_and_dirty_projected_inode(ls); - in->auth_unpin(); -} - -void MDCache::eval_subtree_root(CDir *dir) -{ - // evaluate subtree inode dirlock? 
- // (we should scatter the dirlock on subtree bounds) - if (dir->inode->is_auth() && - dir->inode->dirlock.is_stable()) { - // force the issue a bit - if (!dir->inode->is_frozen()) - mds->locker->scatter_eval(&dir->inode->dirlock); - else - mds->locker->try_scatter_eval(&dir->inode->dirlock); // ** may or may not be auth_pinned ** - } - -} - - -void MDCache::adjust_bounded_subtree_auth(CDir *dir, set& bounds, pair auth) -{ - dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth - << " on " << *dir - << " bounds " << bounds - << dendl; - - show_subtrees(); - - CDir *root; - if (dir->ino() < MDS_INO_BASE) { - root = dir; // bootstrap hack. - if (subtrees.count(root) == 0) { - subtrees[root].clear(); - root->get(CDir::PIN_SUBTREE); - } - } else { - root = get_subtree_root(dir); // subtree root - } - assert(root); - assert(subtrees.count(root)); - dout(7) << " current root is " << *root << dendl; - - pair oldauth = dir->authority(); - - if (root == dir) { - // i am already a subtree. - dir->set_dir_auth(auth); - } else { - // i am a new subtree. - dout(10) << " new subtree at " << *dir << dendl; - assert(subtrees.count(dir) == 0); - subtrees[dir].clear(); // create empty subtree bounds list for me. - dir->get(CDir::PIN_SUBTREE); - - // set dir_auth - dir->set_dir_auth(auth); - - // move items nested beneath me, under me. - set::iterator p = subtrees[root].begin(); - while (p != subtrees[root].end()) { - set::iterator next = p; - next++; - if (get_subtree_root((*p)->get_parent_dir()) == dir) { - // move under me - dout(10) << " claiming child bound " << **p << dendl; - subtrees[dir].insert(*p); - subtrees[root].erase(p); - } - p = next; - } - - // i am a bound of the parent subtree. - subtrees[root].insert(dir); - - // i am now the subtree root. - root = dir; - } - - // verify/adjust bounds. - // - these may be new, or - // - beneath existing ambiguous bounds (which will be collapsed), - // - but NOT beneath unambiguous bounds. 
- for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *bound = *p; - - // new bound? - if (subtrees[dir].count(bound) == 0) { - if (get_subtree_root(bound) == dir) { - dout(10) << " new bound " << *bound << ", adjusting auth back to old " << oldauth << dendl; - adjust_subtree_auth(bound, oldauth); // otherwise, adjust at bound. - } - else { - dout(10) << " want bound " << *bound << dendl; - // make sure it's nested beneath ambiguous subtree(s) - while (1) { - CDir *t = get_subtree_root(bound->get_parent_dir()); - if (t == dir) break; - while (subtrees[dir].count(t) == 0) - t = get_subtree_root(t->get_parent_dir()); - dout(10) << " swallowing intervening subtree at " << *t << dendl; - adjust_subtree_auth(t, auth); - try_subtree_merge_at(t); - } - } - } - else { - dout(10) << " already have bound " << *bound << dendl; - } - } - // merge stray bounds? - set::iterator p = subtrees[dir].begin(); - while (p != subtrees[dir].end()) { - set::iterator n = p; - n++; - if (bounds.count(*p) == 0) { - CDir *stray = *p; - dout(10) << " swallowing extra subtree at " << *stray << dendl; - adjust_subtree_auth(stray, auth); - try_subtree_merge_at(stray); - } - p = n; - } - - // bound should now match. 
- verify_subtree_bounds(dir, bounds); - - show_subtrees(); -} - - -void MDCache::adjust_bounded_subtree_auth(CDir *dir, list& bound_dfs, pair auth) -{ - dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth - << " on " << *dir - << " bound_dfs " << bound_dfs - << dendl; - - // make bounds list - set bounds; - for (list::iterator p = bound_dfs.begin(); - p != bound_dfs.end(); - ++p) { - CDir *bd = get_dirfrag(*p); - if (bd) - bounds.insert(bd); - } - - adjust_bounded_subtree_auth(dir, bounds, auth); -} - -void MDCache::map_dirfrag_set(list& dfs, set& result) -{ - // group by inode - map ino_fragset; - for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) - ino_fragset[p->ino].insert(p->frag); - - // get frags - for (map::iterator p = ino_fragset.begin(); - p != ino_fragset.end(); - ++p) { - CInode *in = get_inode(p->first); - if (!in) continue; - - list fglist; - for (set::iterator q = p->second.begin(); q != p->second.end(); ++q) - in->dirfragtree.get_leaves_under(*q, fglist); - - dout(15) << "map_dirfrag_set " << p->second << " -> " << fglist - << " on " << *in << dendl; - - for (list::iterator q = fglist.begin(); q != fglist.end(); ++q) { - CDir *dir = in->get_dirfrag(*q); - if (dir) result.insert(dir); - } - } -} - - - -CDir *MDCache::get_subtree_root(CDir *dir) -{ - // find the underlying dir that delegates (or is about to delegate) auth - while (true) { - if (dir->is_subtree_root()) - return dir; - dir = dir->get_parent_dir(); - if (!dir) - return 0; // none - } -} - -void MDCache::remove_subtree(CDir *dir) -{ - dout(10) << "remove_subtree " << *dir << dendl; - assert(subtrees.count(dir)); - assert(subtrees[dir].empty()); - subtrees.erase(dir); - dir->put(CDir::PIN_SUBTREE); - if (dir->get_parent_dir()) { - CDir *p = get_subtree_root(dir->get_parent_dir()); - assert(subtrees[p].count(dir)); - subtrees[p].erase(dir); - } -} - -void MDCache::get_subtree_bounds(CDir *dir, set& bounds) -{ - assert(subtrees.count(dir)); - bounds = 
subtrees[dir]; -} - -void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set& bounds) -{ - if (subtrees.count(dir)) { - // just copy them, dir is a subtree. - get_subtree_bounds(dir, bounds); - } else { - // find them - CDir *root = get_subtree_root(dir); - for (set::iterator p = subtrees[root].begin(); - p != subtrees[root].end(); - ++p) { - CDir *t = *p; - while (t != root) { - t = t->get_parent_dir(); - assert(t); - if (t == dir) { - bounds.insert(*p); - continue; - } - } - } - } -} - -void MDCache::verify_subtree_bounds(CDir *dir, const set& bounds) -{ - // for debugging only. - assert(subtrees.count(dir)); - if (bounds != subtrees[dir]) { - dout(0) << "verify_subtree_bounds failed" << dendl; - set b = bounds; - for (set::iterator p = subtrees[dir].begin(); - p != subtrees[dir].end(); - ++p) { - if (bounds.count(*p)) { - b.erase(*p); - continue; - } - dout(0) << " missing bound " << **p << dendl; - } - for (set::iterator p = b.begin(); - p != b.end(); - ++p) - dout(0) << " extra bound " << **p << dendl; - } - assert(bounds == subtrees[dir]); -} - -void MDCache::verify_subtree_bounds(CDir *dir, const list& bounds) -{ - // for debugging only. - assert(subtrees.count(dir)); - - // make sure that any bounds i do have are properly noted as such. 
- int failed = 0; - for (list::const_iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *bd = get_dirfrag(*p); - if (!bd) continue; - if (subtrees[dir].count(bd) == 0) { - dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl; - failed++; - } - } - assert(failed == 0); -} - -void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir) -{ - dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl; - - //show_subtrees(); - - list dfls; - diri->get_dirfrags(dfls); - for (list::iterator p = dfls.begin(); p != dfls.end(); ++p) { - CDir *dir = *p; - - dout(10) << "dirfrag " << *dir << dendl; - CDir *oldparent = get_subtree_root(olddir); - dout(10) << " old parent " << *oldparent << dendl; - CDir *newparent = get_subtree_root(diri->get_parent_dir()); - dout(10) << " new parent " << *newparent << dendl; - - if (oldparent == newparent) { - dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl; - continue; - } - - if (dir->is_subtree_root()) { - // children are fine. change parent. - dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl; - assert(subtrees[oldparent].count(dir)); - subtrees[oldparent].erase(dir); - assert(subtrees.count(newparent)); - subtrees[newparent].insert(dir); - } else { - // mid-subtree. - - // see if any old bounds move to the new parent. - list tomove; - for (set::iterator p = subtrees[oldparent].begin(); - p != subtrees[oldparent].end(); - ++p) { - CDir *bound = *p; - CDir *broot = get_subtree_root(bound->get_parent_dir()); - if (broot != oldparent) { - assert(broot == newparent); - tomove.push_back(bound); - } - } - for (list::iterator p = tomove.begin(); p != tomove.end(); ++p) { - CDir *bound = *p; - dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << dendl; - subtrees[oldparent].erase(bound); - subtrees[newparent].insert(bound); - } - - // did auth change? 
- if (oldparent->authority() != newparent->authority()) - adjust_subtree_auth(dir, oldparent->authority()); // caller is responsible for *diri. - } - } - - for (list::iterator p = dfls.begin(); p != dfls.end(); ++p) { - CDir *dir = *p; - - // un-force dir to subtree root - if (dir->dir_auth == pair(dir->dir_auth.first, dir->dir_auth.first)) { - adjust_subtree_auth(dir, dir->dir_auth.first); - try_subtree_merge_at(dir); - } - } - - show_subtrees(); -} - - -void MDCache::get_fullauth_subtrees(set& s) -{ - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *root = p->first; - if (root->is_full_dir_auth()) - s.insert(root); - } -} -void MDCache::get_auth_subtrees(set& s) -{ - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *root = p->first; - if (root->is_auth()) - s.insert(root); - } -} - - -// count. - -int MDCache::num_subtrees() -{ - return subtrees.size(); -} - -int MDCache::num_subtrees_fullauth() -{ - int n = 0; - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *root = p->first; - if (root->is_full_dir_auth()) - n++; - } - return n; -} - -int MDCache::num_subtrees_fullnonauth() -{ - int n = 0; - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *root = p->first; - if (root->is_full_dir_nonauth()) - n++; - } - return n; -} - - - - - - - -// ==================================================================== -// import map, recovery - - -ESubtreeMap *MDCache::create_subtree_map() -{ - dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, " - << num_subtrees_fullauth() << " fullauth" - << dendl; - - ESubtreeMap *le = new ESubtreeMap(); - - // include all auth subtrees, and their bounds. - // and a spanning tree to tie it to the root. 
- for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *dir = p->first; - if (!dir->is_auth()) continue; - - dout(15) << " subtree " << *dir << dendl; - le->subtrees[dir->dirfrag()].clear(); - le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT); - le->metablob.add_dir(dir, false); - - // bounds - for (set::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - CDir *bound = *q; - dout(15) << " subtree bound " << *bound << dendl; - le->subtrees[dir->dirfrag()].push_back(bound->dirfrag()); - le->metablob.add_dir_context(bound, EMetaBlob::TO_ROOT); - le->metablob.add_dir(bound, false); - } - } - - //le->metablob.print(cout); - return le; -} - - -void MDCache::send_resolve(int who) -{ - if (migrator->is_importing() || - migrator->is_exporting()) - send_resolve_later(who); - else - send_resolve_now(who); -} - -void MDCache::send_resolve_later(int who) -{ - dout(10) << "send_resolve_later to mds" << who << dendl; - wants_resolve.insert(who); -} - -void MDCache::maybe_send_pending_resolves() -{ - if (wants_resolve.empty()) - return; // nothing to send. - - // only if it's appropriate! - if (migrator->is_exporting() || - migrator->is_importing()) { - dout(7) << "maybe_send_pending_resolves waiting, imports/exports still in progress" << dendl; - migrator->show_importing(); - migrator->show_exporting(); - return; // not now - } - - // ok, send them. 
- for (set::iterator p = wants_resolve.begin(); - p != wants_resolve.end(); - p++) - send_resolve_now(*p); - wants_resolve.clear(); -} - - -class C_MDC_SendResolve : public Context { - MDCache *mdc; - int who; -public: - C_MDC_SendResolve(MDCache *c, int w) : mdc(c), who(w) { } - void finish(int r) { - mdc->send_resolve_now(who); - } -}; - -void MDCache::send_resolve_now(int who) -{ - dout(10) << "send_resolve_now to mds" << who << dendl; - MMDSResolve *m = new MMDSResolve; - - show_subtrees(); - - // known - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - p++) { - CDir *dir = p->first; - - // only our subtrees - if (dir->authority().first != mds->get_nodeid()) - continue; - - if (migrator->is_importing(dir->dirfrag())) { - // ambiguous (mid-import) - // NOTE: because we are first authority, import state is at least IMPORT_LOGGINSTART. - assert(migrator->get_import_state(dir->dirfrag()) >= Migrator::IMPORT_LOGGINGSTART); - set bounds; - get_subtree_bounds(dir, bounds); - list dfls; - for (set::iterator p = bounds.begin(); p != bounds.end(); ++p) - dfls.push_back((*p)->dirfrag()); - m->add_ambiguous_import(dir->dirfrag(), dfls); - } else { - // not ambiguous. 
- m->add_subtree(dir->dirfrag()); - - // bounds too - for (set::iterator q = subtrees[dir].begin(); - q != subtrees[dir].end(); - ++q) { - CDir *bound = *q; - m->add_subtree_bound(dir->dirfrag(), bound->dirfrag()); - } - } - } - - // ambiguous - for (map >::iterator p = my_ambiguous_imports.begin(); - p != my_ambiguous_imports.end(); - ++p) - m->add_ambiguous_import(p->first, p->second); - - - // list prepare requests lacking a commit - // [active survivor] - for (hash_map::iterator p = active_requests.begin(); - p != active_requests.end(); - ++p) { - if (p->second->is_slave() && p->second->slave_to_mds == who) { - dout(10) << " including uncommitted " << *p->second << dendl; - m->add_slave_request(p->first); - } - } - // [resolving] - if (uncommitted_slave_updates.count(who)) { - for (map::iterator p = uncommitted_slave_updates[who].begin(); - p != uncommitted_slave_updates[who].end(); - ++p) { - dout(10) << " including uncommitted " << p->first << dendl; - m->add_slave_request(p->first); - } - need_resolve_ack.insert(who); - } - - - // send - mds->send_message_mds(m, who); -} - - -void MDCache::handle_mds_failure(int who) -{ - dout(7) << "handle_mds_failure mds" << who << dendl; - - // make note of recovery set - mds->mdsmap->get_recovery_mds_set(recovery_set); - recovery_set.erase(mds->get_nodeid()); - dout(1) << "handle_mds_failure mds" << who << " : recovery peers are " << recovery_set << dendl; - - // adjust my recovery lists - wants_resolve.erase(who); // MDS will ask again - got_resolve.erase(who); // i'll get another. - - rejoin_sent.erase(who); // i need to send another - rejoin_ack_gather.erase(who); // i'll need/get another. - - dout(10) << " wants_resolve " << wants_resolve << dendl; - dout(10) << " got_resolve " << got_resolve << dendl; - dout(10) << " rejoin_sent " << rejoin_sent << dendl; - dout(10) << " rejoin_gather " << rejoin_gather << dendl; - dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl; - - - // tell the migrator too. 
- migrator->handle_mds_failure_or_stop(who); - - // kick any discovers that are waiting - kick_discovers(who); - - // clean up any requests slave to/from this node - list finish; - for (hash_map::iterator p = active_requests.begin(); - p != active_requests.end(); - ++p) { - // slave to the failed node? - if (p->second->slave_to_mds == who) { - if (p->second->slave_did_prepare()) { - dout(10) << " slave request " << *p->second << " uncommitted, will resolve shortly" << dendl; - } else { - dout(10) << " slave request " << *p->second << " has no prepare, finishing up" << dendl; - if (p->second->slave_request) - p->second->aborted = true; - else - finish.push_back(p->second); - } - } - - // failed node is slave? - if (!p->second->committing) { - if (p->second->more()->witnessed.count(who)) { - dout(10) << " master request " << *p->second << " no longer witnessed by slave mds" << who - << dendl; - // discard this peer's prepare (if any) - p->second->more()->witnessed.erase(who); - } - - if (p->second->more()->waiting_on_slave.count(who)) { - dout(10) << " master request " << *p->second << " waiting for slave mds" << who - << " to recover" << dendl; - // retry request when peer recovers - p->second->more()->waiting_on_slave.erase(who); - mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, p->second)); - } - } - } - - while (!finish.empty()) { - dout(10) << "cleaning up slave request " << *finish.front() << dendl; - request_finish(finish.front()); - finish.pop_front(); - } - - show_subtrees(); -} - -/* - * handle_mds_recovery - called on another node's transition - * from resolve -> active. 
- */ -void MDCache::handle_mds_recovery(int who) -{ - dout(7) << "handle_mds_recovery mds" << who << dendl; - - list waiters; - - // wake up any waiters in their subtrees - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *dir = p->first; - - if (dir->authority().first != who) continue; - assert(!dir->is_auth()); - - // wake any waiters - list q; - q.push_back(dir); - - while (!q.empty()) { - CDir *d = q.front(); - q.pop_front(); - d->take_waiting(CDir::WAIT_ANY, waiters); - - // inode waiters too - for (CDir::map_t::iterator p = d->items.begin(); - p != d->items.end(); - ++p) { - CDentry *dn = p->second; - if (dn->is_primary()) { - dn->get_inode()->take_waiting(CInode::WAIT_ANY, waiters); - - // recurse? - list ls; - dn->get_inode()->get_dirfrags(ls); - for (list::iterator p = ls.begin(); - p != ls.end(); - ++p) { - CDir *subdir = *p; - if (!subdir->is_subtree_root()) - q.push_back(subdir); - } - } - } - } - } - - // queue them up. - mds->queue_waiters(waiters); -} - -void MDCache::set_recovery_set(set& s) -{ - dout(7) << "set_recovery_set " << s << dendl; - recovery_set = s; -} - - -/* - * during resolve state, we share resolves to determine who - * is authoritative for which trees. we expect to get an resolve - * from _everyone_ in the recovery_set (the mds cluster at the time of - * the first failure). - */ -void MDCache::handle_resolve(MMDSResolve *m) -{ - dout(7) << "handle_resolve from " << m->get_source() << dendl; - int from = m->get_source().num(); - - // ambiguous slave requests? 
- if (!m->slave_requests.empty()) { - MMDSResolveAck *ack = new MMDSResolveAck; - - for (list::iterator p = m->slave_requests.begin(); - p != m->slave_requests.end(); - ++p) { - if (mds->clientmap.have_completed_request(*p)) { - // COMMIT - dout(10) << " ambiguous slave request " << *p << " will COMMIT" << dendl; - ack->add_commit(*p); - } else { - // ABORT - dout(10) << " ambiguous slave request " << *p << " will ABORT" << dendl; - ack->add_abort(*p); - } - } - - mds->send_message_mds(ack, from); - } - - // am i a surviving ambiguous importer? - if (mds->is_active() || mds->is_stopping()) { - // check for any import success/failure (from this node) - map >::iterator p = my_ambiguous_imports.begin(); - while (p != my_ambiguous_imports.end()) { - map >::iterator next = p; - next++; - CDir *dir = get_dirfrag(p->first); - assert(dir); - dout(10) << "checking ambiguous import " << *dir << dendl; - if (migrator->is_importing(dir->dirfrag()) && - migrator->get_import_peer(dir->dirfrag()) == from) { - assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING); - - // check if sender claims the subtree - bool claimed_by_sender = false; - for (map >::iterator q = m->subtrees.begin(); - q != m->subtrees.end(); - ++q) { - CDir *base = get_dirfrag(q->first); - if (!base || !base->contains(dir)) - continue; // base not dir or an ancestor of dir, clearly doesn't claim dir. - - bool inside = true; - for (list::iterator r = q->second.begin(); - r != q->second.end(); - ++r) { - CDir *bound = get_dirfrag(*r); - if (bound && bound->contains(dir)) { - inside = false; // nope, bound is dir or parent of dir, not inside. - break; - } - } - if (inside) - claimed_by_sender = true; - } - - if (claimed_by_sender) { - dout(7) << "ambiguous import failed on " << *dir << dendl; - migrator->import_reverse(dir); - } else { - dout(7) << "ambiguous import succeeded on " << *dir << dendl; - migrator->import_finish(dir); - } - my_ambiguous_imports.erase(p); // no longer ambiguous. 
- } - p = next; - } - } - - // update my dir_auth values - for (map >::iterator pi = m->subtrees.begin(); - pi != m->subtrees.end(); - ++pi) { - CInode *diri = get_inode(pi->first.ino); - if (!diri) continue; - bool forced = diri->dirfragtree.force_to_leaf(pi->first.frag); - if (forced) { - dout(10) << " forced frag " << pi->first.frag << " to leaf in " - << diri->dirfragtree - << " on " << pi->first << dendl; - } - - CDir *dir = diri->get_dirfrag(pi->first.frag); - if (!dir) continue; - - adjust_bounded_subtree_auth(dir, pi->second, from); - try_subtree_merge(dir); - } - - show_subtrees(); - - - // note ambiguous imports too - for (map >::iterator pi = m->ambiguous_imports.begin(); - pi != m->ambiguous_imports.end(); - ++pi) { - dout(10) << "noting ambiguous import on " << pi->first << " bounds " << pi->second << dendl; - other_ambiguous_imports[from][pi->first].swap( pi->second ); - } - - // did i get them all? - got_resolve.insert(from); - - maybe_resolve_finish(); - - delete m; -} - -void MDCache::maybe_resolve_finish() -{ - if (got_resolve != recovery_set) { - dout(10) << "maybe_resolve_finish still waiting for more resolves, got (" << got_resolve - << "), need (" << recovery_set << ")" << dendl; - } - else if (!need_resolve_ack.empty()) { - dout(10) << "maybe_resolve_finish still waiting for resolve_ack from (" << need_resolve_ack << ")" << dendl; - } - else { - dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." 
<< dendl; - disambiguate_imports(); - if (mds->is_resolve()) { - recalc_auth_bits(); - trim_non_auth(); - mds->resolve_done(); - } - } -} - -void MDCache::handle_resolve_ack(MMDSResolveAck *ack) -{ - dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl; - int from = ack->get_source().num(); - - for (list::iterator p = ack->commit.begin(); - p != ack->commit.end(); - ++p) { - dout(10) << " commit on slave " << *p << dendl; - - if (mds->is_resolve()) { - // replay - assert(uncommitted_slave_updates[from].count(*p)); - uncommitted_slave_updates[from][*p].commit.replay(mds); - uncommitted_slave_updates[from].erase(*p); - // log commit - mds->mdlog->submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", *p, from, ESlaveUpdate::OP_COMMIT)); - } else { - MDRequest *mdr = request_get(*p); - assert(mdr->slave_request == 0); // shouldn't be doing anything! - request_finish(mdr); - } - } - - for (list::iterator p = ack->abort.begin(); - p != ack->abort.end(); - ++p) { - dout(10) << " abort on slave " << *p << dendl; - - if (mds->is_resolve()) { - assert(uncommitted_slave_updates[from].count(*p)); - uncommitted_slave_updates[from][*p].rollback.replay(mds); - uncommitted_slave_updates[from].erase(*p); - mds->mdlog->submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", *p, from, ESlaveUpdate::OP_ROLLBACK)); - } else { - MDRequest *mdr = request_get(*p); - if (mdr->more()->slave_commit) { - mdr->more()->slave_commit->finish(-1); - delete mdr->more()->slave_commit; - mdr->more()->slave_commit = 0; - } - if (mdr->slave_request) - mdr->aborted = true; - else - request_finish(mdr); - } - } - - need_resolve_ack.erase(from); - - if (mds->is_resolve()) - maybe_resolve_finish(); - - delete ack; -} - - - -void MDCache::disambiguate_imports() -{ - dout(10) << "disambiguate_imports" << dendl; - - // other nodes' ambiguous imports - for (map > >::iterator p = other_ambiguous_imports.begin(); - p != other_ambiguous_imports.end(); - ++p) { - int who = p->first; - 
dout(10) << "ambiguous imports for mds" << who << dendl; - - for (map >::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - dout(10) << " ambiguous import " << q->first << " bounds " << q->second << dendl; - CDir *dir = get_dirfrag(q->first); - if (!dir) continue; - - if (dir->authority().first == CDIR_AUTH_UNKNOWN || // if i am resolving - dir->is_ambiguous_auth()) { // if i am a surviving bystander - dout(10) << " mds" << who << " did import " << *dir << dendl; - adjust_bounded_subtree_auth(dir, q->second, who); - try_subtree_merge(dir); - } else { - dout(10) << " mds" << who << " did not import " << *dir << dendl; - } - } - } - other_ambiguous_imports.clear(); - - // my ambiguous imports - while (!my_ambiguous_imports.empty()) { - map >::iterator q = my_ambiguous_imports.begin(); - - CDir *dir = get_dirfrag(q->first); - if (!dir) continue; - - if (dir->authority().first != CDIR_AUTH_UNKNOWN) { - dout(10) << "ambiguous import auth known, must not be me " << *dir << dendl; - cancel_ambiguous_import(q->first); - mds->mdlog->submit_entry(new EImportFinish(dir, false)); - } else { - dout(10) << "ambiguous import auth unknown, must be me " << *dir << dendl; - finish_ambiguous_import(q->first); - mds->mdlog->submit_entry(new EImportFinish(dir, true)); - } - } - assert(my_ambiguous_imports.empty()); - - if (mds->is_resolve()) { - // verify all my subtrees are unambiguous! 
- for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *dir = p->first; - if (dir->is_ambiguous_dir_auth()) { - dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl; - show_subtrees(); - } - assert(!dir->is_ambiguous_dir_auth()); - } - } - - show_subtrees(); -} - - -void MDCache::add_ambiguous_import(dirfrag_t base, list& bounds) -{ - assert(my_ambiguous_imports.count(base) == 0); - my_ambiguous_imports[base].swap( bounds ); -} - - -void MDCache::add_ambiguous_import(CDir *base, const set& bounds) -{ - // make a list - list binos; - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) - binos.push_back((*p)->dirfrag()); - - // note: this can get called twice if the exporter fails during recovery - if (my_ambiguous_imports.count(base->dirfrag())) - my_ambiguous_imports.erase(base->dirfrag()); - - add_ambiguous_import(base->dirfrag(), binos); -} - -void MDCache::cancel_ambiguous_import(dirfrag_t df) -{ - assert(my_ambiguous_imports.count(df)); - dout(10) << "cancel_ambiguous_import " << df - << " bounds " << my_ambiguous_imports[df] - << dendl; - my_ambiguous_imports.erase(df); -} - -void MDCache::finish_ambiguous_import(dirfrag_t df) -{ - assert(my_ambiguous_imports.count(df)); - list bounds; - bounds.swap(my_ambiguous_imports[df]); - my_ambiguous_imports.erase(df); - - dout(10) << "finish_ambiguous_import " << df - << " bounds " << bounds - << dendl; - CDir *dir = get_dirfrag(df); - assert(dir); - - // adjust dir_auth, import maps - adjust_bounded_subtree_auth(dir, bounds, mds->get_nodeid()); - try_subtree_merge(dir); -} - - -/** recalc_auth_bits() - * once subtree auth is disambiguated, we need to adjust all the - * auth and dirty bits in our cache before moving on. 
- */ -void MDCache::recalc_auth_bits() -{ - dout(7) << "recalc_auth_bits" << dendl; - - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - list dfq; // dirfrag queue - dfq.push_back(p->first); - - bool auth = p->first->authority().first == mds->get_nodeid(); - dout(10) << " subtree auth=" << auth << " for " << *p->first << dendl; - - while (!dfq.empty()) { - CDir *dir = dfq.front(); - dfq.pop_front(); - - // dir - if (auth) - dir->state_set(CDir::STATE_AUTH); - else { - dir->state_set(CDir::STATE_REJOINING); - dir->state_clear(CDir::STATE_AUTH); - if (dir->is_dirty()) - dir->mark_clean(); - } - - // dentries in this dir - for (CDir::map_t::iterator q = dir->items.begin(); - q != dir->items.end(); - ++q) { - // dn - CDentry *dn = q->second; - if (auth) - dn->state_set(CDentry::STATE_AUTH); - else { - dn->state_set(CDentry::STATE_REJOINING); - dn->state_clear(CDentry::STATE_AUTH); - if (dn->is_dirty()) - dn->mark_clean(); - } - - if (dn->is_primary()) { - // inode - if (auth) - dn->inode->state_set(CInode::STATE_AUTH); - else { - dn->inode->state_set(CInode::STATE_REJOINING); - dn->inode->state_clear(CInode::STATE_AUTH); - if (dn->inode->is_dirty()) - dn->inode->mark_clean(); - } - - // recurse? - if (dn->inode->is_dir()) - dn->inode->get_nested_dirfrags(dfq); - } - } - } - } - - show_subtrees(); - show_cache(); -} - - - -// =========================================================================== -// REJOIN - - -/* - * rejoin phase! - * - * this initiates rejoin. it shoudl be called before we get any - * rejoin or rejoin_ack messages (or else mdsmap distribution is broken). - * - * we start out by sending rejoins to everyone in the recovery set. - * - * if we are rejoin, send for all regions in our cache. - * if we are active|stopping, send only to nodes that are are rejoining. 
- */ -void MDCache::rejoin_send_rejoins() -{ - dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl; - - map rejoins; - - // encode cap list once. - bufferlist cap_export_bl; - if (mds->is_rejoin()) { - ::_encode(cap_exports, cap_export_bl); - ::_encode(cap_export_paths, cap_export_bl); - } - - // if i am rejoining, send a rejoin to everyone. - // otherwise, just send to others who are rejoining. - for (set::iterator p = recovery_set.begin(); - p != recovery_set.end(); - ++p) { - if (*p == mds->get_nodeid()) continue; // nothing to myself! - if (rejoin_sent.count(*p)) continue; // already sent a rejoin to this node! - if (mds->is_rejoin()) { - rejoin_gather.insert(*p); - rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_WEAK); - rejoins[*p]->copy_cap_exports(cap_export_bl); - } else if (mds->mdsmap->is_rejoin(*p)) - rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_STRONG); - } - - assert(!migrator->is_importing()); - assert(!migrator->is_exporting()); - - // check all subtrees - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *dir = p->first; - assert(dir->is_subtree_root()); - assert(!dir->is_ambiguous_dir_auth()); - - int auth = dir->get_dir_auth().first; - assert(auth >= 0); - - if (auth == mds->get_nodeid()) continue; // skip my own regions! 
- if (rejoins.count(auth) == 0) continue; // don't care about this node's regions - - rejoin_walk(dir, rejoins[auth]); - } - - // rejoin root inodes, too - for (map::iterator p = rejoins.begin(); - p != rejoins.end(); - ++p) { - if (mds->is_rejoin()) { - // weak - if (p->first == 0 && root) - p->second->add_weak_inode(root->ino()); - if (get_inode(MDS_INO_STRAY(p->first))) - p->second->add_weak_inode(MDS_INO_STRAY(p->first)); - } else { - // strong - if (p->first == 0 && root) { - p->second->add_weak_inode(root->ino()); - p->second->add_strong_inode(root->ino(), root->get_replica_nonce(), - root->get_caps_wanted(), - root->authlock.get_state(), - root->linklock.get_state(), - root->dirfragtreelock.get_state(), - root->filelock.get_state(), - root->dirlock.get_state()); - } - if (CInode *in = get_inode(MDS_INO_STRAY(p->first))) { - p->second->add_weak_inode(in->ino()); - p->second->add_strong_inode(in->ino(), in->get_replica_nonce(), - in->get_caps_wanted(), - in->authlock.get_state(), - in->linklock.get_state(), - in->dirfragtreelock.get_state(), - in->filelock.get_state(), - in->dirlock.get_state()); - } - } - } - - if (!mds->is_rejoin()) { - // i am survivor. send strong rejoin. 
- // note request authpins, xlocks - for (hash_map::iterator p = active_requests.begin(); - p != active_requests.end(); - ++p) { - // auth pins - for (set::iterator q = p->second->auth_pins.begin(); - q != p->second->auth_pins.end(); - ++q) { - if (!(*q)->is_auth()) { - int who = (*q)->authority().first; - if (rejoins.count(who) == 0) continue; - MMDSCacheRejoin *rejoin = rejoins[who]; - - dout(15) << " " << *p->second << " authpin on " << **q << dendl; - MDSCacheObjectInfo i; - (*q)->set_object_info(i); - if (i.ino) - rejoin->add_inode_authpin(i.ino, p->second->reqid); - else - rejoin->add_dentry_authpin(i.dirfrag, i.dname, p->second->reqid); - } - } - // xlocks - for (set::iterator q = p->second->xlocks.begin(); - q != p->second->xlocks.end(); - ++q) { - if (!(*q)->get_parent()->is_auth()) { - int who = (*q)->get_parent()->authority().first; - if (rejoins.count(who) == 0) continue; - MMDSCacheRejoin *rejoin = rejoins[who]; - - dout(15) << " " << *p->second << " xlock on " << **q << " " << *(*q)->get_parent() << dendl; - MDSCacheObjectInfo i; - (*q)->get_parent()->set_object_info(i); - if (i.ino) - rejoin->add_inode_xlock(i.ino, (*q)->get_type(), p->second->reqid); - else - rejoin->add_dentry_xlock(i.dirfrag, i.dname, p->second->reqid); - } - } - } - } - - // send the messages - for (map::iterator p = rejoins.begin(); - p != rejoins.end(); - ++p) { - assert(rejoin_sent.count(p->first) == 0); - assert(rejoin_ack_gather.count(p->first) == 0); - rejoin_sent.insert(p->first); - rejoin_ack_gather.insert(p->first); - mds->send_message_mds(p->second, p->first); - } - - // nothing? 
- if (mds->is_rejoin() && rejoins.empty()) { - dout(10) << "nothing to rejoin" << dendl; - mds->rejoin_done(); - } -} - - -/** - * rejoin_walk - build rejoin declarations for a subtree - * - * @dir subtree root - * @rejoin rejoin message - * - * from a rejoining node: - * weak dirfrag - * weak dentries (w/ connectivity) - * - * from a surviving node: - * strong dirfrag - * strong dentries (no connectivity!) - * strong inodes - */ -void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin) -{ - dout(10) << "rejoin_walk " << *dir << dendl; - - list nested; // finish this dir, then do nested items - - if (mds->is_rejoin()) { - // WEAK - dout(15) << " add_weak_dirfrag " << *dir << dendl; - rejoin->add_weak_dirfrag(dir->dirfrag()); - - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - dout(15) << " add_weak_primary_dentry " << *dn << dendl; - assert(dn->is_primary()); - assert(dn->inode->is_dir()); - rejoin->add_weak_primary_dentry(dir->dirfrag(), p->first, dn->get_inode()->ino()); - dn->get_inode()->get_nested_dirfrags(nested); - - if (dn->get_inode()->dirlock.is_updated()) { - // include full inode to shed any dirtyscattered state - rejoin->add_full_inode(dn->get_inode()->inode, - dn->get_inode()->symlink, - dn->get_inode()->dirfragtree); - dn->get_inode()->dirlock.clear_updated(); - } - } - } else { - // STRONG - dout(15) << " add_strong_dirfrag " << *dir << dendl; - rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep()); - - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - dout(15) << " add_strong_dentry " << *dn << dendl; - rejoin->add_strong_dentry(dir->dirfrag(), p->first, - dn->is_primary() ? dn->get_inode()->ino():inodeno_t(0), - dn->is_remote() ? dn->get_remote_ino():inodeno_t(0), - dn->is_remote() ? 
dn->get_remote_d_type():0, - dn->get_replica_nonce(), - dn->lock.get_state()); - if (dn->is_primary()) { - CInode *in = dn->get_inode(); - dout(15) << " add_strong_inode " << *in << dendl; - rejoin->add_strong_inode(in->ino(), in->get_replica_nonce(), - in->get_caps_wanted(), - in->authlock.get_state(), - in->linklock.get_state(), - in->dirfragtreelock.get_state(), - in->filelock.get_state(), - in->dirlock.get_state()); - in->get_nested_dirfrags(nested); - } - } - } - - // recurse into nested dirs - for (list::iterator p = nested.begin(); - p != nested.end(); - ++p) - rejoin_walk(*p, rejoin); -} - - -/* - * i got a rejoin. - * - reply with the lockstate - * - * if i am active|stopping, - * - remove source from replica list for everything not referenced here. - */ -void MDCache::handle_cache_rejoin(MMDSCacheRejoin *m) -{ - dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source() - << " (" << m->get_payload().length() << " bytes)" - << dendl; - - switch (m->op) { - case MMDSCacheRejoin::OP_WEAK: - handle_cache_rejoin_weak(m); - break; - case MMDSCacheRejoin::OP_STRONG: - handle_cache_rejoin_strong(m); - break; - - case MMDSCacheRejoin::OP_ACK: - handle_cache_rejoin_ack(m); - break; - case MMDSCacheRejoin::OP_MISSING: - handle_cache_rejoin_missing(m); - break; - - case MMDSCacheRejoin::OP_FULL: - handle_cache_rejoin_full(m); - break; - - default: - assert(0); - } - delete m; -} - - -/* - * handle_cache_rejoin_weak - * - * the sender - * - is recovering from their journal. - * - may have incorrect (out of date) inode contents - * - will include full inodes IFF they contain dirty scatterlock content - * - * if the sender didn't trim_non_auth(), they - * - may have incorrect (out of date) dentry/inode linkage - * - may have deleted/purged inodes - * and i may have to go to disk to get accurate inode contents. yuck. 
- */ -void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) -{ - int from = weak->get_source().num(); - - // possible response(s) - MMDSCacheRejoin *ack = 0; // if survivor - bool survivor = false; // am i a survivor? - - if (mds->is_active() || mds->is_stopping()) { - survivor = true; - dout(10) << "i am a surivivor, and will ack immediately" << dendl; - ack = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK); - - // check cap exports - for (map >::iterator p = weak->cap_exports.begin(); - p != weak->cap_exports.end(); - ++p) { - CInode *in = get_inode(p->first); - if (!in || !in->is_auth()) continue; - for (map::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - dout(10) << " claiming cap import " << p->first << " client" << q->first << " on " << *in << dendl; - rejoin_import_cap(in, q->first, q->second, from); - } - } - } else { - assert(mds->is_rejoin()); - - // check cap exports. - for (map >::iterator p = weak->cap_exports.begin(); - p != weak->cap_exports.end(); - ++p) { - CInode *in = get_inode(p->first); - if (in && !in->is_auth()) continue; - if (!in) { - if (!path_is_mine(weak->cap_export_paths[p->first])) - continue; - cap_import_paths[p->first] = weak->cap_export_paths[p->first]; - dout(10) << " noting cap import " << p->first << " path " << weak->cap_export_paths[p->first] << dendl; - } - - // note - for (map::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - dout(10) << " claiming cap import " << p->first << " client" << q->first << dendl; - cap_imports[p->first][q->first][from] = q->second; - } - } - } - - // full inodes? - // dirty scatterlock content! 
- for (list::iterator p = weak->full_inodes.begin(); - p != weak->full_inodes.end(); - ++p) { - CInode *in = get_inode(p->inode.ino); - if (!in) continue; - if (p->inode.mtime > in->inode.mtime) in->inode.mtime = p->inode.mtime; - dout(10) << " got dirty inode scatterlock content " << *in << dendl; - in->dirlock.set_updated(); - } - - // walk weak map - for (map >::iterator p = weak->weak.begin(); - p != weak->weak.end(); - ++p) { - CDir *dir = get_dirfrag(p->first); - if (!dir) dout(0) << " missing dirfrag " << p->first << dendl; - assert(dir); - - int nonce = dir->add_replica(from); - dout(10) << " have " << *dir << dendl; - if (ack) - ack->add_strong_dirfrag(p->first, nonce, dir->dir_rep); - - // weak dentries - for (map::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - CDentry *dn = dir->lookup(q->first); - assert(dn); - assert(dn->is_primary()); - - if (survivor && dn->is_replica(from)) - dentry_remove_replica(dn, from); // this induces a lock gather completion - int dnonce = dn->add_replica(from); - dout(10) << " have " << *dn << dendl; - if (ack) - ack->add_strong_dentry(p->first, q->first, - dn->get_inode()->ino(), inodeno_t(0), 0, - dnonce, dn->lock.get_replica_state()); - - // inode - CInode *in = dn->get_inode(); - assert(in); - - if (survivor && in->is_replica(from)) - inode_remove_replica(in, from); // this induces a lock gather completion - int inonce = in->add_replica(from); - dout(10) << " have " << *in << dendl; - - // scatter the dirlock, just in case? - if (!survivor && in->is_dir()) - in->dirlock.set_state(LOCK_SCATTER); - - if (ack) { - ack->add_full_inode(in->inode, in->symlink, in->dirfragtree); - ack->add_strong_inode(in->ino(), - inonce, - 0, - in->authlock.get_replica_state(), - in->linklock.get_replica_state(), - in->dirfragtreelock.get_replica_state(), - in->filelock.get_replica_state(), - in->dirlock.get_replica_state()); - } - } - } - - // weak base inodes? (root, stray, etc.) 
- for (set::iterator p = weak->weak_inodes.begin(); - p != weak->weak_inodes.end(); - ++p) { - CInode *in = get_inode(*p); - assert(in); // hmm fixme wrt stray? - if (survivor && in->is_replica(from)) - inode_remove_replica(in, from); // this induces a lock gather completion - int inonce = in->add_replica(from); - dout(10) << " have base " << *in << dendl; - - if (ack) - ack->add_strong_inode(in->ino(), - inonce, - 0, - in->authlock.get_replica_state(), - in->linklock.get_replica_state(), - in->dirfragtreelock.get_replica_state(), - in->filelock.get_replica_state(), - in->dirlock.get_replica_state()); - } - - if (survivor) { - // survivor. do everything now. - rejoin_scour_survivor_replicas(from, ack); - mds->send_message_mds(ack, from); - } else { - // done? - assert(rejoin_gather.count(from)); - rejoin_gather.erase(from); - if (rejoin_gather.empty()) { - rejoin_gather_finish(); - } else { - dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl; - } - } -} - - -/** - * parallel_fetch -- make a pass at fetching a bunch of paths in parallel - * - * @pathmap - map of inodeno to full pathnames. we remove items from this map - * as we discover we have them. - * - * returns a C_Gather* is there is work to do. caller is responsible for setting - * the C_Gather completer. 
- */ -C_Gather *MDCache::parallel_fetch(map& pathmap) -{ - dout(10) << "parallel_fetch on " << pathmap.size() << " paths" << dendl; - - // scan list - set fetch_queue; - map::iterator p = pathmap.begin(); - while (p != pathmap.end()) { - CInode *in = get_inode(p->first); - if (in) { - dout(15) << " have " << *in << dendl; - pathmap.erase(p++); - continue; - } - - // traverse - dout(17) << " missing " << p->first << " at " << p->second << dendl; - filepath path(p->second); - CDir *dir = path_traverse_to_dir(path); - assert(dir); - fetch_queue.insert(dir); - p++; - } - - if (pathmap.empty()) { - dout(10) << "parallel_fetch done" << dendl; - assert(fetch_queue.empty()); - return false; - } - - // do a parallel fetch - C_Gather *gather = new C_Gather; - for (set::iterator p = fetch_queue.begin(); - p != fetch_queue.end(); - ++p) { - dout(10) << "parallel_fetch fetching " << **p << dendl; - (*p)->fetch(gather->new_sub()); - } - - return gather; -} - - - -/* - * rejoin_scour_survivor_replica - remove source from replica list on unmentioned objects - * - * all validated replicas are acked with a strong nonce, etc. if that isn't in the - * ack, the replica dne, and we can remove it from our replica maps. - */ -void MDCache::rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack) -{ - dout(10) << "rejoin_scour_survivor_replicas from mds" << from << dendl; - - // FIXME: what about root and stray inodes. - - for (hash_map::iterator p = inode_map.begin(); - p != inode_map.end(); - ++p) { - CInode *in = p->second; - - // inode? 
- if (in->is_auth() && - in->is_replica(from) && - ack->strong_inodes.count(p->second->ino()) == 0) { - inode_remove_replica(in, from); - dout(10) << " rem " << *in << dendl; - } - - if (!in->is_dir()) continue; - - list dfs; - in->get_dirfrags(dfs); - for (list::iterator p = dfs.begin(); - p != dfs.end(); - ++p) { - CDir *dir = *p; - - if (dir->is_auth() && - dir->is_replica(from) && - ack->strong_dirfrags.count(dir->dirfrag()) == 0) { - dir->remove_replica(from); - dout(10) << " rem " << *dir << dendl; - } - - // dentries - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - - if (dn->is_replica(from) && - (ack->strong_dentries.count(dir->dirfrag()) == 0 || - ack->strong_dentries[dir->dirfrag()].count(dn->get_name()) == 0)) { - dentry_remove_replica(dn, from); - dout(10) << " rem " << *dn << dendl; - } - } - } - } -} - - -CInode *MDCache::rejoin_invent_inode(inodeno_t ino) -{ - CInode *in = new CInode(this); - memset(&in->inode, 0, sizeof(inode_t)); - in->inode.ino = ino; - in->state_set(CInode::STATE_REJOINUNDEF); - add_inode(in); - rejoin_undef_inodes.insert(in); - dout(10) << " invented " << *in << dendl; - return in; -} - - -void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong) -{ - int from = strong->get_source().num(); - - // only a recovering node will get a strong rejoin. - assert(mds->is_rejoin()); - - MMDSCacheRejoin *missing = 0; // if i'm missing something.. - - // strong dirfrags/dentries. - // also process auth_pins, xlocks. 
- for (map::iterator p = strong->strong_dirfrags.begin(); - p != strong->strong_dirfrags.end(); - ++p) { - CDir *dir = get_dirfrag(p->first); - if (!dir) { - CInode *in = get_inode(p->first.ino); - if (!in) in = rejoin_invent_inode(p->first.ino); - if (!in->is_dir()) { - assert(in->state_test(CInode::STATE_REJOINUNDEF)); - in->inode.mode = INODE_MODE_DIR; - } - dir = in->get_or_open_dirfrag(this, p->first.frag); - } else { - dout(10) << " have " << *dir << dendl; - } - dir->add_replica(from); - dir->dir_rep = p->second.dir_rep; - - for (map::iterator q = strong->strong_dentries[p->first].begin(); - q != strong->strong_dentries[p->first].end(); - ++q) { - CDentry *dn = dir->lookup(q->first); - if (!dn) { - if (q->second.is_remote()) { - dn = dir->add_remote_dentry(q->first, q->second.remote_ino, q->second.remote_d_type); - } else if (q->second.is_null()) { - dn = dir->add_null_dentry(q->first); - } else { - CInode *in = get_inode(q->second.ino); - if (!in) in = rejoin_invent_inode(q->second.ino); - dn = dir->add_primary_dentry(q->first, in); - - dout(10) << " missing " << q->second.ino << dendl; - if (!missing) missing = new MMDSCacheRejoin(MMDSCacheRejoin::OP_MISSING); - missing->add_weak_inode(q->second.ino); // we want it back! - } - dout(10) << " invented " << *dn << dendl; - } - - // dn auth_pin? - if (strong->authpinned_dentries.count(p->first) && - strong->authpinned_dentries[p->first].count(q->first)) { - metareqid_t ri = strong->authpinned_dentries[p->first][q->first]; - dout(10) << " dn authpin by " << ri << " on " << *dn << dendl; - - // get/create slave mdrequest - MDRequest *mdr; - if (have_request(ri)) - mdr = request_get(ri); - else - mdr = request_start_slave(ri, from); - mdr->auth_pin(dn); - } - - // dn xlock? 
- if (strong->xlocked_dentries.count(p->first) && - strong->xlocked_dentries[p->first].count(q->first)) { - metareqid_t ri = strong->xlocked_dentries[p->first][q->first]; - dout(10) << " dn xlock by " << ri << " on " << *dn << dendl; - MDRequest *mdr = request_get(ri); // should have this from auth_pin above. - assert(mdr->is_auth_pinned(dn)); - dn->lock.set_state(LOCK_LOCK); - dn->lock.get_xlock(mdr); - mdr->xlocks.insert(&dn->lock); - mdr->locks.insert(&dn->lock); - } - - dn->add_replica(from); - dout(10) << " have " << *dn << dendl; - - // inode? - if (dn->is_primary()) { - CInode *in = dn->get_inode(); - assert(in); - - if (strong->strong_inodes.count(in->ino())) { - MMDSCacheRejoin::inode_strong &is = strong->strong_inodes[in->ino()]; - - // caps_wanted - if (is.caps_wanted) { - in->mds_caps_wanted[from] = is.caps_wanted; - dout(15) << " inode caps_wanted " << cap_string(is.caps_wanted) - << " on " << *in << dendl; - } - - // scatterlock? - if (is.dirlock == LOCK_SCATTER || - is.dirlock == LOCK_GLOCKC) // replica still has wrlocks - in->dirlock.set_state(LOCK_SCATTER); - - // auth pin? - if (strong->authpinned_inodes.count(in->ino())) { - metareqid_t ri = strong->authpinned_inodes[in->ino()]; - dout(10) << " inode authpin by " << ri << " on " << *in << dendl; - - // get/create slave mdrequest - MDRequest *mdr; - if (have_request(ri)) - mdr = request_get(ri); - else - mdr = request_start_slave(ri, from); - mdr->auth_pin(in); - } - - // xlock(s)? - if (strong->xlocked_inodes.count(in->ino())) { - for (map::iterator r = strong->xlocked_inodes[in->ino()].begin(); - r != strong->xlocked_inodes[in->ino()].end(); - ++r) { - SimpleLock *lock = in->get_lock(r->first); - dout(10) << " inode xlock by " << r->second << " on " << *lock << " on " << *in << dendl; - MDRequest *mdr = request_get(r->second); // should have this from auth_pin above. 
- assert(mdr->is_auth_pinned(in)); - lock->set_state(LOCK_LOCK); - lock->get_xlock(mdr); - mdr->xlocks.insert(lock); - mdr->locks.insert(lock); - } - } - } else { - dout(10) << " sender has dentry but not inode, adding them as a replica" << dendl; - } - - in->add_replica(from); - dout(10) << " have " << *in << dendl; - } - } - } - - // base inodes? (root, stray, etc.) - for (set::iterator p = strong->weak_inodes.begin(); - p != strong->weak_inodes.end(); - ++p) { - CInode *in = get_inode(*p); - dout(10) << " have base " << *in << dendl; - in->add_replica(from); - } - - // send missing? - if (missing) { - // we expect a FULL soon. - mds->send_message_mds(missing, from); - } else { - // done? - assert(rejoin_gather.count(from)); - rejoin_gather.erase(from); - if (rejoin_gather.empty()) { - rejoin_gather_finish(); - } else { - dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl; - } - } -} - - -void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) -{ - dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl; - int from = ack->get_source().num(); - - list waiters; - - // dirs - for (map::iterator p = ack->strong_dirfrags.begin(); - p != ack->strong_dirfrags.end(); - ++p) { - CDir *dir = get_dirfrag(p->first); - if (!dir) continue; - - dir->set_replica_nonce(p->second.nonce); - dir->state_clear(CDir::STATE_REJOINING); - dout(10) << " got " << *dir << dendl; - - // dentries - for (map::iterator q = ack->strong_dentries[p->first].begin(); - q != ack->strong_dentries[p->first].end(); - ++q) { - CDentry *dn = dir->lookup(q->first); - if (!dn) continue; - - // hmm, did we have the proper linkage here? 
- if (dn->is_null() && - !q->second.is_null()) { - dout(10) << " had bad (missing) linkage for " << *dn << dendl; - if (q->second.is_remote()) { - dn->dir->link_remote_inode(dn, q->second.remote_ino, q->second.remote_d_type); - } else { - CInode *in = get_inode(q->second.ino); - assert(in == 0); // a rename would have been caught be the resolve stage. - // barebones inode; the full inode loop below will clean up. - in = new CInode(this, false); - in->inode.ino = q->second.ino; - add_inode(in); - dn->dir->link_primary_inode(dn, in); - } - } - else if (!dn->is_null() && - q->second.is_null()) { - dout(-10) << " had bad linkage for " << *dn << dendl; - assert(0); // hrmpf. unlink should use slave requests to clean this up during resolve. - } - dn->set_replica_nonce(q->second.nonce); - mds->locker->rejoin_set_state(&dn->lock, q->second.lock, waiters); - dn->state_clear(CDentry::STATE_REJOINING); - dout(10) << " got " << *dn << dendl; - } - } - - // full inodes - for (list::iterator p = ack->full_inodes.begin(); - p != ack->full_inodes.end(); - ++p) { - CInode *in = get_inode(p->inode.ino); - if (!in) continue; - in->inode = p->inode; - in->symlink = p->symlink; - in->dirfragtree = p->dirfragtree; - dout(10) << " got inode content " << *in << dendl; - } - - // inodes - for (map::iterator p = ack->strong_inodes.begin(); - p != ack->strong_inodes.end(); - ++p) { - CInode *in = get_inode(p->first); - if (!in) continue; - in->set_replica_nonce(p->second.nonce); - mds->locker->rejoin_set_state(&in->authlock, p->second.authlock, waiters); - mds->locker->rejoin_set_state(&in->linklock, p->second.linklock, waiters); - mds->locker->rejoin_set_state(&in->dirfragtreelock, p->second.dirfragtreelock, waiters); - mds->locker->rejoin_set_state(&in->filelock, p->second.filelock, waiters); - mds->locker->rejoin_set_state(&in->dirlock, p->second.dirlock, waiters); - in->state_clear(CInode::STATE_REJOINING); - dout(10) << " got " << *in << dendl; - } - - // done? 
- assert(rejoin_ack_gather.count(from)); - rejoin_ack_gather.erase(from); - if (mds->is_rejoin() && - rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too. - rejoin_ack_gather.empty()) { - mds->rejoin_done(); - } else { - dout(7) << "still need rejoin from (" << rejoin_gather << ")" - << ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl; - } -} - - - - -void MDCache::handle_cache_rejoin_missing(MMDSCacheRejoin *missing) -{ - dout(7) << "handle_cache_rejoin_missing from " << missing->get_source() << dendl; - - MMDSCacheRejoin *full = new MMDSCacheRejoin(MMDSCacheRejoin::OP_FULL); - - // inodes - for (set::iterator p = missing->weak_inodes.begin(); - p != missing->weak_inodes.end(); - ++p) { - CInode *in = get_inode(*p); - if (!in) { - dout(10) << " don't have inode " << *p << dendl; - continue; // we must have trimmed it after the originalo rejoin - } - - dout(10) << " sending " << *in << dendl; - full->add_full_inode(in->inode, in->symlink, in->dirfragtree); - } - - mds->send_message_mds(full, missing->get_source().num()); -} - -void MDCache::handle_cache_rejoin_full(MMDSCacheRejoin *full) -{ - dout(7) << "handle_cache_rejoin_full from " << full->get_source() << dendl; - int from = full->get_source().num(); - - // integrate full inodes - for (list::iterator p = full->full_inodes.begin(); - p != full->full_inodes.end(); - ++p) { - CInode *in = get_inode(p->inode.ino); - assert(in); - - set::iterator q = rejoin_undef_inodes.find(in); - if (q != rejoin_undef_inodes.end()) { - CInode *in = *q; - in->inode = p->inode; - in->symlink = p->symlink; - in->dirfragtree = p->dirfragtree; - in->state_clear(CInode::STATE_REJOINUNDEF); - dout(10) << " got full " << *in << dendl; - rejoin_undef_inodes.erase(q); - } else { - dout(10) << " had full " << *in << dendl; - } - } - - // done? 
- assert(rejoin_gather.count(from)); - rejoin_gather.erase(from); - if (rejoin_gather.empty()) { - rejoin_gather_finish(); - } else { - dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl; - } -} - - - -/** - * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes - * - * FIXME: wait, can this actually happen? a survivor should generate cache trim - * messages that clean these guys up... - */ -void MDCache::rejoin_trim_undef_inodes() -{ - dout(10) << "rejoin_trim_undef_inodes" << dendl; - - while (!rejoin_undef_inodes.empty()) { - set::iterator p = rejoin_undef_inodes.begin(); - CInode *in = *p; - rejoin_undef_inodes.erase(p); - - in->clear_replica_map(); - - // close out dirfrags - if (in->is_dir()) { - list dfls; - in->get_dirfrags(dfls); - for (list::iterator p = dfls.begin(); - p != dfls.end(); - ++p) { - CDir *dir = *p; - dir->clear_replica_map(); - - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - dn->clear_replica_map(); - - dout(10) << " trimming " << *dn << dendl; - dir->remove_dentry(dn); - } - - dout(10) << " trimming " << *dir << dendl; - in->close_dirfrag(dir->dirfrag().frag); - } - } - - CDentry *dn = in->get_parent_dn(); - if (dn) { - dn->clear_replica_map(); - dout(10) << " trimming " << *dn << dendl; - dn->dir->remove_dentry(dn); - } else { - dout(10) << " trimming " << *in << dendl; - remove_inode(in); - } - } - - assert(rejoin_undef_inodes.empty()); -} - -class C_MDC_RejoinGatherFinish : public Context { - MDCache *cache; -public: - C_MDC_RejoinGatherFinish(MDCache *c) : cache(c) {} - void finish(int r) { - cache->rejoin_gather_finish(); - } -}; - - - -void MDCache::rejoin_gather_finish() -{ - dout(10) << "rejoin_gather_finish" << dendl; - assert(mds->is_rejoin()); - - rejoin_trim_undef_inodes(); - - // fetch paths? - // do this before ack, since some inodes we may have already gotten - // from surviving MDSs. 
- if (!cap_import_paths.empty()) { - C_Gather *gather = parallel_fetch(cap_import_paths); - if (gather) { - gather->set_finisher(new C_MDC_RejoinGatherFinish(this)); - return; - } - } - - // process cap imports - // ino -> client -> frommds -> capex - for (map > >::iterator p = cap_imports.begin(); - p != cap_imports.end(); - ++p) { - CInode *in = get_inode(p->first); - assert(in); - mds->server->add_reconnected_cap_inode(in); - for (map >::iterator q = p->second.begin(); - q != p->second.end(); - ++q) - for (map::iterator r = q->second.begin(); - r != q->second.end(); - ++r) - if (r->first >= 0) - rejoin_import_cap(in, q->first, r->second, r->first); - } - - mds->server->process_reconnected_caps(); - - rejoin_send_acks(); - - // did we already get our acks too? - // this happens when the rejoin_gather has to wait on a MISSING/FULL exchange. - if (rejoin_ack_gather.empty()) - mds->rejoin_done(); -} - -void MDCache::rejoin_import_cap(CInode *in, int client, inode_caps_reconnect_t& icr, int frommds) -{ - dout(10) << "rejoin_import_cap for client" << client << " from mds" << frommds - << " on " << *in << dendl; - - // add cap - in->reconnect_cap(client, icr); - - // send REAP - // FIXME client session weirdness. - MClientFileCaps *reap = new MClientFileCaps(MClientFileCaps::OP_REAP, - in->inode, - in->client_caps[client].get_last_seq(), - in->client_caps[client].pending(), - in->client_caps[client].wanted()); - - reap->set_mds( frommds ); // reap from whom? 
- mds->messenger->send_message(reap, mds->clientmap.get_inst(client)); -} - -void MDCache::rejoin_send_acks() -{ - dout(7) << "rejoin_send_acks" << dendl; - - // send acks to everyone in the recovery set - map ack; - for (set::iterator p = recovery_set.begin(); - p != recovery_set.end(); - ++p) - ack[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK); - - // walk subtrees - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - p++) { - CDir *dir = p->first; - if (!dir->is_auth()) continue; - dout(10) << "subtree " << *dir << dendl; - - // auth items in this subtree - list dq; - dq.push_back(dir); - - while (!dq.empty()) { - CDir *dir = dq.front(); - dq.pop_front(); - - // dir - for (map::iterator r = dir->replicas_begin(); - r != dir->replicas_end(); - ++r) - ack[r->first]->add_strong_dirfrag(dir->dirfrag(), r->second, dir->dir_rep); - - for (CDir::map_t::iterator q = dir->items.begin(); - q != dir->items.end(); - ++q) { - CDentry *dn = q->second; - - // dentry - for (map::iterator r = dn->replicas_begin(); - r != dn->replicas_end(); - ++r) - ack[r->first]->add_strong_dentry(dir->dirfrag(), dn->name, - dn->is_primary() ? dn->get_inode()->ino():inodeno_t(0), - dn->is_remote() ? dn->get_remote_ino():inodeno_t(0), - dn->is_remote() ? dn->get_remote_d_type():0, - r->second, - dn->lock.get_replica_state()); - - if (!dn->is_primary()) continue; - - // inode - CInode *in = dn->inode; - - for (map::iterator r = in->replicas_begin(); - r != in->replicas_end(); - ++r) { - ack[r->first]->add_full_inode(in->inode, in->symlink, in->dirfragtree); - ack[r->first]->add_strong_inode(in->ino(), r->second, 0, - in->authlock.get_replica_state(), - in->linklock.get_replica_state(), - in->dirfragtreelock.get_replica_state(), - in->filelock.get_replica_state(), - in->dirlock.get_replica_state()); - } - - // subdirs in this subtree? 
- in->get_nested_dirfrags(dq); - } - } - } - - // root inodes too - if (root) - for (map::iterator r = root->replicas_begin(); - r != root->replicas_end(); - ++r) { - ack[r->first]->add_full_inode(root->inode, root->symlink, root->dirfragtree); - ack[r->first]->add_strong_inode(root->ino(), r->second, 0, - root->authlock.get_replica_state(), - root->linklock.get_replica_state(), - root->dirfragtreelock.get_replica_state(), - root->filelock.get_replica_state(), - root->dirlock.get_replica_state()); - } - if (stray) - for (map::iterator r = stray->replicas_begin(); - r != stray->replicas_end(); - ++r) { - ack[r->first]->add_full_inode(stray->inode, stray->symlink, stray->dirfragtree); - ack[r->first]->add_strong_inode(stray->ino(), r->second, 0, - stray->authlock.get_replica_state(), - stray->linklock.get_replica_state(), - stray->dirfragtreelock.get_replica_state(), - stray->filelock.get_replica_state(), - stray->dirlock.get_replica_state()); - } - - // send acks - for (map::iterator p = ack.begin(); - p != ack.end(); - ++p) - mds->send_message_mds(p->second, p->first); - -} - - - -// =============================================================================== - - -void MDCache::set_root(CInode *in) -{ - assert(root == 0); - root = in; - base_inodes.insert(in); -} - - - - - - -// ************** -// Inode purging -- reliably removing deleted file's objects - -class C_MDC_PurgeFinish : public Context { - MDCache *mdc; - CInode *in; - off_t newsize, oldsize; -public: - C_MDC_PurgeFinish(MDCache *c, CInode *i, off_t ns, off_t os) : - mdc(c), in(i), newsize(ns), oldsize(os) {} - void finish(int r) { - mdc->purge_inode_finish(in, newsize, oldsize); - } -}; -class C_MDC_PurgeFinish2 : public Context { - MDCache *mdc; - CInode *in; - off_t newsize, oldsize; -public: - C_MDC_PurgeFinish2(MDCache *c, CInode *i, off_t ns, off_t os) : - mdc(c), in(i), newsize(ns), oldsize(os) {} - void finish(int r) { - mdc->purge_inode_finish_2(in, newsize, oldsize); - } -}; - -/* 
purge_inode in - * will be called by on unlink or rmdir or truncate - * caller responsible for journaling an appropriate EUpdate - */ -void MDCache::purge_inode(CInode *in, off_t newsize, off_t oldsize, LogSegment *ls) -{ - dout(10) << "purge_inode " << oldsize << " -> " << newsize - << " on " << *in - << dendl; - - assert(oldsize >= newsize); - - purging[in][newsize] = oldsize; - purging_ls[in][newsize] = ls; - ls->purging_inodes[in][newsize] = oldsize; - - _do_purge_inode(in, newsize, oldsize); -} - -void MDCache::_do_purge_inode(CInode *in, off_t newsize, off_t oldsize) -{ - in->get(CInode::PIN_PURGING); - - // remove - if (in->inode.size > 0) { - mds->filer->remove(in->inode, newsize, oldsize, - 0, new C_MDC_PurgeFinish(this, in, newsize, oldsize)); - } else { - // no need, empty file, just log it - purge_inode_finish(in, newsize, oldsize); - } -} - -void MDCache::purge_inode_finish(CInode *in, off_t newsize, off_t oldsize) -{ - dout(10) << "purge_inode_finish " << oldsize << " -> " << newsize - << " on " << *in << dendl; - - // log completion - mds->mdlog->submit_entry(new EPurgeFinish(in->ino(), newsize, oldsize), - new C_MDC_PurgeFinish2(this, in, newsize, oldsize)); -} - -void MDCache::purge_inode_finish_2(CInode *in, off_t newsize, off_t oldsize) -{ - dout(10) << "purge_inode_finish_2 " << oldsize << " -> " << newsize - << " on " << *in << dendl; - - // remove from purging list - LogSegment *ls = purging_ls[in][newsize]; - purging[in].erase(newsize); - purging_ls[in].erase(newsize); - if (purging[in].empty()) { - purging.erase(in); - purging_ls.erase(in); - } - - assert(ls->purging_inodes.count(in)); - assert(ls->purging_inodes[in].count(newsize)); - assert(ls->purging_inodes[in][newsize] == oldsize); - ls->purging_inodes[in].erase(newsize); - if (ls->purging_inodes[in].empty()) - ls->purging_inodes.erase(in); - - in->put(CInode::PIN_PURGING); - - // tell anyone who cares (log flusher?) 
- if (purging.count(in) == 0 || - purging[in].rbegin()->first < newsize) { - list ls; - ls.swap(waiting_for_purge[in][newsize]); - waiting_for_purge[in].erase(newsize); - if (waiting_for_purge[in].empty()) - waiting_for_purge.erase(in); - finish_contexts(ls, 0); - } -} - -void MDCache::add_recovered_purge(CInode *in, off_t newsize, off_t oldsize, LogSegment *ls) -{ - assert(purging[in].count(newsize) == 0); - purging[in][newsize] = oldsize; - purging_ls[in][newsize] = ls; - ls->purging_inodes[in][newsize] = oldsize; -} - -void MDCache::remove_recovered_purge(CInode *in, off_t newsize, off_t oldsize) -{ - purging[in].erase(newsize); -} - -void MDCache::start_recovered_purges() -{ - dout(10) << "start_recovered_purges (" << purging.size() << " purges)" << dendl; - - for (map >::iterator p = purging.begin(); - p != purging.end(); - ++p) { - for (map::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - dout(10) << "start_recovered_purges " - << q->second << " -> " << q->first - << " on " << *p->first - << dendl; - _do_purge_inode(p->first, q->first, q->second); - } - } -} - - - -// ================================================================================ -// cache trimming - - -bool MDCache::trim(int max) -{ - // trim LRU - if (max < 0) { - max = lru.lru_get_max(); - if (!max) return false; - } - dout(7) << "trim max=" << max << " cur=" << lru.lru_get_size() << dendl; - - map expiremap; - - // trim dentries from the LRU - while (lru.lru_get_size() > (unsigned)max) { - CDentry *dn = (CDentry*)lru.lru_expire(); - if (!dn) break; - trim_dentry(dn, expiremap); - } - - // trim base inodes? 
- if (max == 0) { - set::iterator p = base_inodes.begin(); - while (p != base_inodes.end()) { - CInode *in = *p++; - list ls; - in->get_dirfrags(ls); - for (list::iterator p = ls.begin(); p != ls.end(); ++p) { - CDir *dir = *p; - if (dir->get_num_ref() == 1) // subtree pin - trim_dirfrag(dir, 0, expiremap); - } - if (in->get_num_ref() == 0) - trim_inode(0, in, 0, expiremap); - } - } - - // send any expire messages - send_expire_messages(expiremap); - - return true; -} - -void MDCache::send_expire_messages(map& expiremap) -{ - // send expires - for (map::iterator it = expiremap.begin(); - it != expiremap.end(); - it++) { - dout(7) << "sending cache_expire to " << it->first << dendl; - mds->send_message_mds(it->second, it->first); - } -} - - -void MDCache::trim_dentry(CDentry *dn, map& expiremap) -{ - dout(12) << "trim_dentry " << *dn << dendl; - - CDir *dir = dn->get_dir(); - assert(dir); - - CDir *con = get_subtree_root(dir); - assert(con); - - dout(12) << " in container " << *con << dendl; - - // notify dentry authority? - if (!dn->is_auth()) { - pair auth = dn->authority(); - - for (int p=0; p<2; p++) { - int a = auth.first; - if (p) a = auth.second; - if (a < 0 || (p == 1 && auth.second == auth.first)) break; - if (mds->get_nodeid() == auth.second && - con->is_importing()) break; // don't send any expire while importing. - if (a == mds->get_nodeid()) continue; // on export, ignore myself. - - dout(12) << " sending expire to mds" << a << " on " << *dn << dendl; - assert(a != mds->get_nodeid()); - if (expiremap.count(a) == 0) - expiremap[a] = new MCacheExpire(mds->get_nodeid()); - expiremap[a]->add_dentry(con->dirfrag(), dir->dirfrag(), dn->get_name(), dn->get_replica_nonce()); - } - } - - // adjust the dir state - // NOTE: we can safely remove a clean, null dentry without effecting - // directory completeness. - // (do this _before_ we unlink the inode, below!) 
- if (!(dn->is_null() && dn->is_clean())) - dir->state_clear(CDir::STATE_COMPLETE); - - // unlink the dentry - if (dn->is_remote()) { - // just unlink. - dir->unlink_inode(dn); - } - else if (dn->is_primary()) { - // expire the inode, too. - CInode *in = dn->get_inode(); - assert(in); - trim_inode(dn, in, con, expiremap); - } - else { - assert(dn->is_null()); - } - - // remove dentry - dir->remove_dentry(dn); - - // reexport? - if (dir->get_size() == 0 && dir->is_subtree_root()) - migrator->export_empty_import(dir); - - if (mds->logger) mds->logger->inc("cex"); -} - - -void MDCache::trim_dirfrag(CDir *dir, CDir *con, map& expiremap) -{ - dout(15) << "trim_dirfrag " << *dir << dendl; - - if (dir->is_subtree_root()) { - assert(!dir->is_auth() || - (!dir->is_replicated() && dir->inode->is_base())); - remove_subtree(dir); // remove from subtree map - } - assert(dir->get_num_ref() == 0); - - CInode *in = dir->get_inode(); - - if (!dir->is_auth()) { - pair auth = dir->authority(); - - // was this an auth delegation? (if so, slightly modified container) - dirfrag_t condf; - if (dir->is_subtree_root()) { - dout(12) << " subtree root, container is " << *dir << dendl; - con = dir; - condf = dir->dirfrag(); - } else { - condf = con->dirfrag(); - } - - for (int p=0; p<2; p++) { - int a = auth.first; - if (p) a = auth.second; - if (a < 0 || (p == 1 && auth.second == auth.first)) break; - if (mds->get_nodeid() == auth.second && - con->is_importing()) break; // don't send any expire while importing. - if (a == mds->get_nodeid()) continue; // on export, ignore myself. 
- - dout(12) << " sending expire to mds" << a << " on " << *dir << dendl; - assert(a != mds->get_nodeid()); - if (expiremap.count(a) == 0) - expiremap[a] = new MCacheExpire(mds->get_nodeid()); - expiremap[a]->add_dir(condf, dir->dirfrag(), dir->replica_nonce); - } - } - - in->close_dirfrag(dir->dirfrag().frag); -} - -void MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map& expiremap) -{ - dout(15) << "trim_inode " << *in << dendl; - assert(in->get_num_ref() == 0); - - // DIR - list dfls; - in->get_dirfrags(dfls); - for (list::iterator p = dfls.begin(); - p != dfls.end(); - ++p) - trim_dirfrag(*p, con ? con:*p, expiremap); // if no container (e.g. root dirfrag), use *p - - // INODE - if (!in->is_auth()) { - pair auth = in->authority(); - - dirfrag_t df; - if (con) - df = con->dirfrag(); - else - df = dirfrag_t(0,frag_t()); // must be a root or stray inode. - - for (int p=0; p<2; p++) { - int a = auth.first; - if (p) a = auth.second; - if (a < 0 || (p == 1 && auth.second == auth.first)) break; - if (con && mds->get_nodeid() == auth.second && - con->is_importing()) break; // don't send any expire while importing. - if (a == mds->get_nodeid()) continue; // on export, ignore myself. - - dout(12) << " sending expire to mds" << a << " on " << *in << dendl; - assert(a != mds->get_nodeid()); - if (expiremap.count(a) == 0) - expiremap[a] = new MCacheExpire(mds->get_nodeid()); - expiremap[a]->add_inode(df, in->ino(), in->get_replica_nonce()); - } - } - - /* - if (in->is_auth()) { - if (in->hack_accessed) - mds->logger->inc("outt"); - else { - mds->logger->inc("outut"); - mds->logger->favg("oututl", g_clock.now() - in->hack_load_stamp); - } - } - */ - - // unlink - if (dn) - dn->get_dir()->unlink_inode(dn); - remove_inode(in); -} - - -/** - * trim_non_auth - remove any non-auth items from our cache - * - * this reduces the amount of non-auth metadata in our cache, reducing the - * load incurred by the rejoin phase. 
- * - * the only non-auth items that remain are those that are needed to - * attach our own subtrees to the root. - * - * when we are done, all dentries will be in the top bit of the lru. - * - * why we have to do this: - * we may not have accurate linkage for non-auth items. which means we will - * know which subtree it falls into, and can not be sure to declare it to the - * correct authority. - */ -void MDCache::trim_non_auth() -{ - dout(7) << "trim_non_auth" << dendl; - - // temporarily pin all subtree roots - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - p++) - p->first->get(CDir::PIN_SUBTREETEMP); - - // note first auth item we see. - // when we see it the second time, stop. - CDentry *first_auth = 0; - - // trim non-auth items from the lru - while (lru.lru_get_size() > 0) { - CDentry *dn = (CDentry*)lru.lru_expire(); - if (!dn) break; - - if (dn->is_auth()) { - // add back into lru (at the top) - lru.lru_insert_top(dn); - - if (!first_auth) { - first_auth = dn; - } else { - if (first_auth == dn) - break; - } - } else { - // non-auth. expire. - CDir *dir = dn->get_dir(); - assert(dir); - - // unlink the dentry - dout(15) << "trim_non_auth removing " << *dn << dendl; - if (dn->is_remote()) { - dir->unlink_inode(dn); - } - else if (dn->is_primary()) { - CInode *in = dn->get_inode(); - list ls; - in->get_dirfrags(ls); - for (list::iterator p = ls.begin(); p != ls.end(); ++p) { - CDir *subdir = *p; - if (subdir->is_subtree_root()) - remove_subtree(subdir); - in->close_dirfrag(subdir->dirfrag().frag); - } - dir->unlink_inode(dn); - remove_inode(in); - } - else { - assert(dn->is_null()); - } - dir->remove_dentry(dn); - - // adjust the dir state - dir->state_clear(CDir::STATE_COMPLETE); // dir incomplete! - } - } - - if (lru.lru_get_size() == 0) { - // root, stray, etc.? 
- hash_map::iterator p = inode_map.begin(); - while (p != inode_map.end()) { - hash_map::iterator next = p; - ++next; - CInode *in = p->second; - if (!in->is_auth()) { - list ls; - in->get_dirfrags(ls); - for (list::iterator p = ls.begin(); - p != ls.end(); - ++p) { - assert((*p)->get_num_ref() == 0); - remove_subtree((*p)); - in->close_dirfrag((*p)->dirfrag().frag); - } - assert(in->get_num_ref() == 0); - remove_inode(in); - } - p = next; - } - } - - // move everything in the pintail to the top bit of the lru. - lru.lru_touch_entire_pintail(); - - // unpin all subtrees - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - p++) - p->first->put(CDir::PIN_SUBTREETEMP); - - show_subtrees(); -} - -void MDCache::handle_cache_expire(MCacheExpire *m) -{ - int from = m->get_from(); - - dout(7) << "cache_expire from mds" << from << dendl; - - if (mds->get_state() < MDSMap::STATE_REJOIN) { - delete m; - return; - } - - // loop over realms - for (map::iterator p = m->realms.begin(); - p != m->realms.end(); - ++p) { - // check container? - if (p->first.ino > 0) { - CInode *coni = get_inode(p->first.ino); - assert(coni); // we had better have this. - CDir *con = coni->get_approx_dirfrag(p->first.frag); - assert(con); - - if (!con->is_auth() || - (con->is_auth() && con->is_exporting() && - migrator->get_export_state(con) == Migrator::EXPORT_WARNING && - migrator->export_has_warned(con,from))) { - // not auth. 
- dout(7) << "delaying nonauth|warned expires for " << *con << dendl; - assert(con->is_frozen_tree_root()); - - // make a message container - if (delayed_expire[con].count(from) == 0) - delayed_expire[con][from] = new MCacheExpire(from); - - // merge these expires into it - delayed_expire[con][from]->add_realm(p->first, p->second); - continue; - } - dout(7) << "expires for " << *con << dendl; - } else { - dout(7) << "containerless expires (root, stray inodes)" << dendl; - } - - // INODES - for (map::iterator it = p->second.inodes.begin(); - it != p->second.inodes.end(); - it++) { - CInode *in = get_inode(it->first); - int nonce = it->second; - - if (!in) { - dout(0) << " inode expire on " << it->first << " from " << from << ", don't have it" << dendl; - assert(in); - } - assert(in->is_auth()); - - // check nonce - if (nonce == in->get_replica_nonce(from)) { - // remove from our cached_by - dout(7) << " inode expire on " << *in << " from mds" << from << " cached_by was " << in->get_replicas() << dendl; - inode_remove_replica(in, from); - } - else { - // this is an old nonce, ignore expire. - dout(7) << " inode expire on " << *in << " from mds" << from - << " with old nonce " << nonce << " (current " << in->get_replica_nonce(from) << "), dropping" - << dendl; - assert(in->get_replica_nonce(from) > nonce); - } - } - - // DIRS - for (map::iterator it = p->second.dirs.begin(); - it != p->second.dirs.end(); - it++) { - CDir *dir = get_dirfrag(it->first); - int nonce = it->second; - - if (!dir) { - dout(0) << " dir expire on " << it->first << " from " << from << ", don't have it" << dendl; - assert(dir); - } - assert(dir->is_auth()); - - // check nonce - if (nonce == dir->get_replica_nonce(from)) { - // remove from our cached_by - dout(7) << " dir expire on " << *dir << " from mds" << from - << " replicas was " << dir->replica_map << dendl; - dir->remove_replica(from); - } - else { - // this is an old nonce, ignore expire. 
- dout(7) << " dir expire on " << *dir << " from mds" << from - << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from) - << "), dropping" << dendl; - assert(dir->get_replica_nonce(from) > nonce); - } - } - - // DENTRIES - for (map >::iterator pd = p->second.dentries.begin(); - pd != p->second.dentries.end(); - ++pd) { - dout(10) << " dn expires in dir " << pd->first << dendl; - CInode *diri = get_inode(pd->first.ino); - assert(diri); - CDir *dir = diri->get_dirfrag(pd->first.frag); - - if (!dir) { - dout(0) << " dn expires on " << pd->first << " from " << from << ", must have refragmented" << dendl; - } else { - assert(dir->is_auth()); - } - - for (map::iterator p = pd->second.begin(); - p != pd->second.end(); - ++p) { - int nonce = p->second; - CDentry *dn; - - if (dir) { - dn = dir->lookup(p->first); - } else { - // which dirfrag for this dentry? - CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p->first)); - assert(dir->is_auth()); - dn = dir->lookup(p->first); - } - - if (!dn) - dout(0) << " missing dentry for " << p->first << " in " << *dir << dendl; - assert(dn); - - if (nonce == dn->get_replica_nonce(from)) { - dout(7) << " dentry_expire on " << *dn << " from mds" << from << dendl; - dentry_remove_replica(dn, from); - } - else { - dout(7) << " dentry_expire on " << *dn << " from mds" << from - << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from) - << "), dropping" << dendl; - assert(dn->get_replica_nonce(from) > nonce); - } - } - } - } - - - // done - delete m; -} - -void MDCache::process_delayed_expire(CDir *dir) -{ - dout(7) << "process_delayed_expire on " << *dir << dendl; - for (map::iterator p = delayed_expire[dir].begin(); - p != delayed_expire[dir].end(); - ++p) - handle_cache_expire(p->second); - delayed_expire.erase(dir); -} - -void MDCache::discard_delayed_expire(CDir *dir) -{ - dout(7) << "discard_delayed_expire on " << *dir << dendl; - for (map::iterator p = delayed_expire[dir].begin(); - p != 
delayed_expire[dir].end(); - ++p) - delete p->second; - delayed_expire.erase(dir); -} - -void MDCache::inode_remove_replica(CInode *in, int from) -{ - in->remove_replica(from); - in->mds_caps_wanted.erase(from); - - // note: this code calls _eval more often than it needs to! - // fix lock - if (in->authlock.remove_replica(from)) mds->locker->simple_eval_gather(&in->authlock); - if (in->linklock.remove_replica(from)) mds->locker->simple_eval_gather(&in->linklock); - if (in->dirfragtreelock.remove_replica(from)) mds->locker->simple_eval_gather(&in->dirfragtreelock); - if (in->filelock.remove_replica(from)) mds->locker->file_eval_gather(&in->filelock); - if (in->dirlock.remove_replica(from)) mds->locker->scatter_eval_gather(&in->dirlock); - - // alone now? - /* - if (!in->is_replicated()) { - mds->locker->simple_eval_gather(&in->authlock); - mds->locker->simple_eval_gather(&in->linklock); - mds->locker->simple_eval_gather(&in->dirfragtreelock); - mds->locker->file_eval_gather(&in->filelock); - mds->locker->scatter_eval_gather(&in->dirlock); - } - */ -} - -void MDCache::dentry_remove_replica(CDentry *dn, int from) -{ - dn->remove_replica(from); - - // fix lock - if (dn->lock.remove_replica(from) || - !dn->is_replicated()) - mds->locker->simple_eval_gather(&dn->lock); -} - - - -// ========================================================================================= -// shutdown - -class C_MDC_ShutdownCheck : public Context { - MDCache *mdc; -public: - C_MDC_ShutdownCheck(MDCache *m) : mdc(m) {} - void finish(int) { - mdc->shutdown_check(); - } -}; - -void MDCache::shutdown_check() -{ - dout(0) << "shutdown_check at " << g_clock.now() << dendl; - - // cache - int o = g_conf.debug_mds; - g_conf.debug_mds = 10; - show_cache(); - g_conf.debug_mds = o; - mds->timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(this)); - - // this - dout(0) << "lru size now " << lru.lru_get_size() << dendl; - dout(0) << "log len " << mds->mdlog->get_num_events() << 
dendl; - - - if (mds->filer->is_active()) - dout(0) << "filer still active" << dendl; -} - -void MDCache::shutdown_start() -{ - dout(2) << "shutdown_start" << dendl; - - if (g_conf.mds_shutdown_check) - mds->timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(this)); - - // g_conf.debug_mds = 10; -} - - - -bool MDCache::shutdown_pass() -{ - dout(7) << "shutdown_pass" << dendl; - - if (mds->is_stopped()) { - dout(7) << " already shut down" << dendl; - show_cache(); - show_subtrees(); - return true; - } - - // flush batching eopens, so that we can properly expire them. - mds->server->journal_opens(); // hrm, this is sort of a hack. - - // flush what we can from the log - mds->mdlog->set_max_events(0); - mds->mdlog->trim(); - - if (mds->mdlog->get_num_segments() > 1) { - dout(7) << "still >1 segments, waiting for log to trim" << dendl; - return false; - } - - trim(0); - dout(5) << "lru size now " << lru.lru_get_size() << dendl; - - // SUBTREES - if (!subtrees.empty() && - mds->get_nodeid() != 0 && - !migrator->is_exporting() //&& - //!migrator->is_importing() - ) { - dout(7) << "looking for subtrees to export to mds0" << dendl; - list ls; - for (map >::iterator it = subtrees.begin(); - it != subtrees.end(); - it++) { - CDir *dir = it->first; - if (dir->get_inode()->is_stray()) continue; - if (dir->is_frozen() || dir->is_freezing()) continue; - if (!dir->is_full_dir_auth()) continue; - ls.push_back(dir); - } - int max = 5; // throttle shutdown exports.. hack! - for (list::iterator p = ls.begin(); p != ls.end(); ++p) { - CDir *dir = *p; - int dest = dir->get_inode()->authority().first; - if (dest > 0 && !mds->mdsmap->is_active(dest)) dest = 0; - dout(7) << "sending " << *dir << " back to mds" << dest << dendl; - migrator->export_dir(dir, dest); - if (--max == 0) break; - } - } - - - // subtrees map not empty yet? 
- if (!subtrees.empty()) { - dout(7) << "still have " << num_subtrees() << " subtrees" << dendl; - show_subtrees(); - migrator->show_importing(); - migrator->show_exporting(); - if (!migrator->is_importing() && !migrator->is_exporting()) - show_cache(); - return false; - } - assert(subtrees.empty()); - assert(!migrator->is_exporting()); - assert(!migrator->is_importing()); - - - - // empty out stray contents - // FIXME - dout(7) << "FIXME: i need to empty out stray dir contents..." << dendl; - - // (only do this once!) - if (!mds->mdlog->is_capped()) { - dout(7) << "capping the log" << dendl; - mds->mdlog->cap(); - mds->mdlog->trim(); - } - - if (!mds->mdlog->empty()) { - dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events() - << " in " << mds->mdlog->get_num_segments() << " segments" << dendl; - return false; - } - - if (!did_shutdown_log_cap) { - // flush journal header - dout(7) << "writing header for (now-empty) journal" << dendl; - assert(mds->mdlog->empty()); - mds->mdlog->write_head(0); - // NOTE: filer active checker below will block us until this completes. - did_shutdown_log_cap = true; - return false; - } - - // filer active? - if (mds->filer->is_active()) { - dout(7) << "filer still active" << dendl; - return false; - } - - // trim what we can from the cache - if (lru.lru_get_size() > 0) { - dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << dendl; - show_cache(); - //dump(); - return false; - } - - // done! - dout(2) << "shutdown done." 
<< dendl; - return true; -} - - - - - - - - - -// ========= messaging ============== - - -void MDCache::dispatch(Message *m) -{ - switch (m->get_type()) { - - // RESOLVE - case MSG_MDS_RESOLVE: - handle_resolve((MMDSResolve*)m); - break; - case MSG_MDS_RESOLVEACK: - handle_resolve_ack((MMDSResolveAck*)m); - break; - - // REJOIN - case MSG_MDS_CACHEREJOIN: - handle_cache_rejoin((MMDSCacheRejoin*)m); - break; - - case MSG_MDS_DISCOVER: - handle_discover((MDiscover*)m); - break; - case MSG_MDS_DISCOVERREPLY: - handle_discover_reply((MDiscoverReply*)m); - break; - - /* - case MSG_MDS_INODEUPDATE: - handle_inode_update((MInodeUpdate*)m); - break; - */ - - case MSG_MDS_DIRUPDATE: - handle_dir_update((MDirUpdate*)m); - break; - - case MSG_MDS_CACHEEXPIRE: - handle_cache_expire((MCacheExpire*)m); - break; - - - - case MSG_MDS_DENTRYUNLINK: - handle_dentry_unlink((MDentryUnlink*)m); - break; - - - case MSG_MDS_FRAGMENTNOTIFY: - handle_fragment_notify((MMDSFragmentNotify*)m); - break; - - - - default: - dout(7) << "cache unknown message " << m->get_type() << dendl; - assert(0); - break; - } -} - - -/* path_traverse - * - * return values: - * <0 : traverse error (ENOTDIR, ENOENT, etc.) - * 0 : success - * >0 : delayed or forwarded - * - * onfail values: - * - * MDS_TRAVERSE_FORWARD - forward to auth (or best guess) - * MDS_TRAVERSE_DISCOVER - discover missing items. skip permission checks. - * MDS_TRAVERSE_DISCOVERXLOCK - discover XLOCKED items too (be careful!). 
- * MDS_TRAVERSE_FAIL - return an error - */ - -Context *MDCache::_get_waiter(MDRequest *mdr, Message *req) -{ - if (mdr) { - dout(20) << "_get_waiter retryrequest" << dendl; - return new C_MDS_RetryRequest(this, mdr); - } else { - dout(20) << "_get_waiter retrymessage" << dendl; - return new C_MDS_RetryMessage(mds, req); - } -} - -int MDCache::path_traverse(MDRequest *mdr, Message *req, // who - CInode *base, filepath& origpath, // what - vector& trace, // result - bool follow_trailing_symlink, // how - int onfail) -{ - assert(mdr || req); - bool null_okay = onfail == MDS_TRAVERSE_DISCOVERXLOCK; - bool noperm = false; - if (onfail == MDS_TRAVERSE_DISCOVER || - onfail == MDS_TRAVERSE_DISCOVERXLOCK) - noperm = true; - - // keep a list of symlinks we touch to avoid loops - set< pair > symlinks_resolved; - - // root - CInode *cur = base; - if (!cur) cur = get_root(); - if (cur == NULL) { - dout(7) << "traverse: i don't have root" << dendl; - open_root(_get_waiter(mdr, req)); - return 1; - } - - if (mds->logger) mds->logger->inc("t"); - - // start trace - trace.clear(); - - // make our own copy, since we'll modify when we hit symlinks - filepath path = origpath; - - unsigned depth = 0; - while (depth < path.depth()) { - dout(12) << "traverse: path seg depth " << depth << " = " << path[depth] << dendl; - - // ENOTDIR? - if (!cur->is_dir()) { - dout(7) << "traverse: " << *cur << " not a dir " << dendl; - return -ENOTDIR; - } - - // open dir - frag_t fg = cur->pick_dirfrag(path[depth]); - CDir *curdir = cur->get_dirfrag(fg); - if (!curdir) { - if (cur->is_auth()) { - // parent dir frozen_dir? - if (cur->is_frozen_dir()) { - dout(7) << "traverse: " << *cur->get_parent_dir() << " is frozen_dir, waiting" << dendl; - cur->get_parent_dn()->get_dir()->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req)); - return 1; - } - curdir = cur->get_or_open_dirfrag(this, fg); - } else { - // discover? 
- dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl; - discover_path(cur, path.postfixpath(depth), _get_waiter(mdr, req), - onfail == MDS_TRAVERSE_DISCOVERXLOCK); - if (mds->logger) mds->logger->inc("tdis"); - return 1; - } - } - assert(curdir); - - // frozen? - /* - if (curdir->is_frozen()) { - // doh! - // FIXME: traverse is allowed? - dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl; - curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req)); - if (onfinish) delete onfinish; - return 1; - } - */ - - // must read directory hard data (permissions, x bit) to traverse - if (!noperm && - !mds->locker->simple_rdlock_try(&cur->authlock, 0)) { - dout(7) << "traverse: waiting on authlock rdlock on " << *cur << dendl; - cur->authlock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req)); - return 1; - } - - // check permissions? - // XXX - - // ..? - if (path[depth] == "..") { - trace.pop_back(); - depth++; - cur = cur->get_parent_inode(); - dout(10) << "traverse: following .. back to " << *cur << dendl; - continue; - } - - - // dentry - CDentry *dn = curdir->lookup(path[depth]); - - // null and last_bit and xlocked by me? - if (dn && dn->is_null() && null_okay) { - dout(10) << "traverse: hit null dentry at tail of traverse, succeeding" << dendl; - trace.push_back(dn); - break; // done! - } - - if (dn && !dn->is_null()) { - // dentry exists. xlocked? - if (!noperm && dn->lock.is_xlocked() && dn->lock.get_xlocked_by() != mdr) { - dout(10) << "traverse: xlocked dentry at " << *dn << dendl; - dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req)); - if (mds->logger) mds->logger->inc("tlock"); - return 1; - } - - // do we have inode? - if (!dn->inode) { - assert(dn->is_remote()); - // do i have it? 
- CInode *in = get_inode(dn->get_remote_ino()); - if (in) { - dout(7) << "linking in remote in " << *in << dendl; - dn->link_remote(in); - } else { - dout(7) << "remote link to " << dn->get_remote_ino() << ", which i don't have" << dendl; - assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal! - open_remote_ino(dn->get_remote_ino(), mdr, _get_waiter(mdr, req)); - if (mds->logger) mds->logger->inc("trino"); - return 1; - } - } - - // symlink? - if (dn->inode->is_symlink() && - (follow_trailing_symlink || depth < path.depth()-1)) { - // symlink, resolve! - filepath sym = dn->inode->symlink; - dout(10) << "traverse: hit symlink " << *dn->inode << " to " << sym << dendl; - - // break up path components - // /head/symlink/tail - filepath head = path.prefixpath(depth); - filepath tail = path.postfixpath(depth+1); - dout(10) << "traverse: path head = " << head << dendl; - dout(10) << "traverse: path tail = " << tail << dendl; - - if (symlinks_resolved.count(pair(dn->inode, tail.get_path()))) { - dout(10) << "already hit this symlink, bailing to avoid the loop" << dendl; - return -ELOOP; - } - symlinks_resolved.insert(pair(dn->inode, tail.get_path())); - - // start at root? - if (dn->inode->symlink[0] == '/') { - // absolute - trace.clear(); - depth = 0; - path = dn->inode->symlink; - path.append(tail); - dout(10) << "traverse: absolute symlink, path now " << path << " depth " << depth << dendl; - } else { - // relative - path = head; - path.append(sym); - path.append(tail); - dout(10) << "traverse: relative symlink, path now " << path << " depth " << depth << dendl; - } - continue; - } - - // forwarder wants replicas? 
- if (mdr && mdr->client_request && - mdr->client_request->get_mds_wants_replica_in_dirino()) { - dout(30) << "traverse: REP is here, " - << mdr->client_request->get_mds_wants_replica_in_dirino() - << " vs " << curdir->dirfrag() << dendl; - - if (mdr->client_request->get_mds_wants_replica_in_dirino() == curdir->ino() && - curdir->is_auth() && - curdir->is_rep() && - curdir->is_replica(req->get_source().num()) && - dn->is_auth() - ) { - assert(req->get_source().is_mds()); - int from = req->get_source().num(); - - if (dn->is_replica(from)) { - dout(15) << "traverse: REP would replicate to mds" << from << ", but already cached_by " - << req->get_source() << " dn " << *dn << dendl; - } else { - dout(10) << "traverse: REP replicating to " << req->get_source() << " dn " << *dn << dendl; - MDiscoverReply *reply = new MDiscoverReply(curdir->dirfrag()); - reply->add_dentry( dn->replicate_to( from ) ); - if (dn->is_primary()) - reply->add_inode( dn->inode->replicate_to( from ) ); - mds->send_message_mds(reply, req->get_source().num()); - } - } - } - - // add to trace, continue. - trace.push_back(dn); - cur = dn->inode; - touch_inode(cur); - depth++; - continue; - } - - // MISS. dentry doesn't exist. - dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl; - - if (curdir->is_auth()) { - // dentry is mine. - if (curdir->is_complete()) { - // file not found - return -ENOENT; - } else { - // directory isn't complete; reload - dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl; - touch_inode(cur); - curdir->fetch(_get_waiter(mdr, req)); - if (mds->logger) mds->logger->inc("tdirf"); - return 1; - } - } else { - // dirfrag/dentry is not mine. 
- pair dauth = curdir->authority(); - - if ((onfail == MDS_TRAVERSE_DISCOVER || - onfail == MDS_TRAVERSE_DISCOVERXLOCK)) { - dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl; - discover_path(curdir, path.postfixpath(depth), _get_waiter(mdr, req), - onfail == MDS_TRAVERSE_DISCOVERXLOCK); - if (mds->logger) mds->logger->inc("tdis"); - return 1; - } - if (onfail == MDS_TRAVERSE_FORWARD) { - // forward - dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl; - - if (curdir->is_ambiguous_auth()) { - // wait - dout(7) << "traverse: waiting for single auth in " << *curdir << dendl; - curdir->add_waiter(CDir::WAIT_SINGLEAUTH, _get_waiter(mdr, req)); - return 1; - } - - dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl; - - // request replication? - if (mdr && mdr->client_request && curdir->is_rep()) { - dout(15) << "traverse: REP fw to mds" << dauth << ", requesting rep under " - << *curdir << " req " << *(MClientRequest*)req << dendl; - mdr->client_request->set_mds_wants_replica_in_dirino(curdir->ino()); - req->clear_payload(); // reencode! - } - - if (mdr) - request_forward(mdr, dauth.first); - else - mds->forward_message_mds(req, dauth.first); - - if (mds->logger) mds->logger->inc("tfw"); - return 2; - } - if (onfail == MDS_TRAVERSE_FAIL) - return -ENOENT; // not necessarily exactly true.... - } - - assert(0); // i shouldn't get here - } - - // success. - if (mds->logger) mds->logger->inc("thit"); - return 0; -} - -bool MDCache::path_is_mine(filepath& path) -{ - dout(15) << "path_is_mine " << path << dendl; - - // start at root. FIXME. 
- CInode *cur = root; - assert(cur); - - for (unsigned i=0; ipick_dirfrag(path[i]); - CDir *dir = cur->get_dirfrag(fg); - if (!dir) return cur->is_auth(); - CDentry *dn = dir->lookup(path[i]); - if (!dn) return dir->is_auth(); - assert(dn->is_primary()); - cur = dn->get_inode(); - } - - return cur->is_auth(); -} - -/** - * path_traverse_to_dir -- traverse to deepest dir we have - * - * @path - path to traverse (as far as we can) - * - * assumes we _don't_ have the full path. (if we do, we return NULL.) - */ -CDir *MDCache::path_traverse_to_dir(filepath& path) -{ - CInode *cur = root; - assert(cur); - for (unsigned i=0; ipick_dirfrag(path[i]); - CDir *dir = cur->get_or_open_dirfrag(this, fg); - CDentry *dn = dir->lookup(path[i]); - if (!dn) return dir; - assert(dn->is_primary()); - cur = dn->get_inode(); - } - - return NULL; // oh, we have the full path. -} - - -/** - * open_remote_dir -- open up a remote dirfrag - * - * @diri - base inode - * @approxfg - approximate fragment. - * @fin - completion callback - */ -void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, Context *fin) -{ - dout(10) << "open_remote_dir on " << *diri << dendl; - - assert(diri->is_dir()); - assert(!diri->is_auth()); - assert(diri->get_dirfrag(approxfg) == 0); - - int auth = diri->authority().first; - - if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) { - discover_dir_frag(diri, approxfg, fin); - } else { - // mds is down or recovering. forge a replica! - forge_replica_dir(diri, approxfg, auth); - } -} - - -/** - * get_dentry_inode - get or open inode - * - * @dn the dentry - * @mdr current request - * - * will return inode for primary, or link up/open up remote link's inode as necessary. 
- */ -CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequest *mdr) -{ - assert(!dn->is_null()); - - if (dn->is_primary()) - return dn->inode; - - assert(dn->is_remote()); - CInode *in = get_inode(dn->get_remote_ino()); - if (in) { - dout(7) << "get_dentry_inode linking in remote in " << *in << dendl; - dn->link_remote(in); - return in; - } else { - dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl; - open_remote_ino(dn->get_remote_ino(), mdr, new C_MDS_RetryRequest(this, mdr)); - return 0; - } -} - -class C_MDC_RetryOpenRemoteIno : public Context { - MDCache *mdcache; - inodeno_t ino; - MDRequest *mdr; - Context *onfinish; -public: - C_MDC_RetryOpenRemoteIno(MDCache *mdc, inodeno_t i, MDRequest *r, Context *c) : - mdcache(mdc), ino(i), mdr(r), onfinish(c) {} - void finish(int r) { - mdcache->open_remote_ino(ino, mdr, onfinish); - } -}; - - -class C_MDC_OpenRemoteIno : public Context { - MDCache *mdcache; - inodeno_t ino; - MDRequest *mdr; - Context *onfinish; -public: - vector anchortrace; - - C_MDC_OpenRemoteIno(MDCache *mdc, inodeno_t i, MDRequest *r, Context *c) : - mdcache(mdc), ino(i), mdr(r), onfinish(c) {} - C_MDC_OpenRemoteIno(MDCache *mdc, inodeno_t i, vector& at, - MDRequest *r, Context *c) : - mdcache(mdc), ino(i), mdr(r), onfinish(c), anchortrace(at) {} - - void finish(int r) { - assert(r == 0); - if (r == 0) - mdcache->open_remote_ino_2(ino, mdr, anchortrace, onfinish); - else { - onfinish->finish(r); - delete onfinish; - } - } -}; - -void MDCache::open_remote_ino(inodeno_t ino, - MDRequest *mdr, - Context *onfinish) -{ - dout(7) << "open_remote_ino on " << ino << dendl; - - C_MDC_OpenRemoteIno *c = new C_MDC_OpenRemoteIno(this, ino, mdr, onfinish); - mds->anchorclient->lookup(ino, c->anchortrace, c); -} - -void MDCache::open_remote_ino_2(inodeno_t ino, - MDRequest *mdr, - vector& anchortrace, - Context *onfinish) -{ - dout(7) << "open_remote_ino_2 on " << ino - << ", trace depth is " << anchortrace.size() << dendl; - - 
// find deepest cached inode in prefix - unsigned i = anchortrace.size(); // i := array index + 1 - CInode *in = 0; - while (1) { - // inode? - dout(10) << " " << i << ": " << anchortrace[i-1] << dendl; - in = get_inode(anchortrace[i-1].ino); - if (in) break; - i--; - if (!i) { - in = get_inode(anchortrace[i].dirfrag.ino); - assert(in); // actually, we may need to open the root or a foreign stray inode, here. - break; - } - } - dout(10) << "deepest cached inode at " << i << " is " << *in << dendl; - - if (in->ino() == ino) { - // success - dout(10) << "open_remote_ino_2 have " << *in << dendl; - onfinish->finish(0); - delete onfinish; - return; - } - - // open dirfrag beneath *in - frag_t frag = anchortrace[i].dirfrag.frag; - - if (!in->dirfragtree.contains(frag)) { - dout(10) << "frag " << frag << " not valid, requerying anchortable" << dendl; - open_remote_ino(ino, mdr, onfinish); - return; - } - - CDir *dir = in->get_dirfrag(frag); - - if (!dir && !in->is_auth()) { - dout(10) << "opening remote dirfrag " << frag << " under " << *in << dendl; - /* FIXME: we re-query the anchortable just to avoid a fragtree update race */ - open_remote_dirfrag(in, frag, - new C_MDC_RetryOpenRemoteIno(this, ino, mdr, onfinish)); - return; - } - - if (!dir && in->is_auth()) - dir = in->get_or_open_dirfrag(this, frag); - - assert(dir); - if (dir->is_auth()) { - if (dir->is_complete()) { - // hrm. requery anchor table. - dout(10) << "expected ino " << anchortrace[i].ino - << " in complete dir " << *dir - << ", requerying anchortable" - << dendl; - open_remote_ino(ino, mdr, onfinish); - } else { - dout(10) << "need ino " << anchortrace[i].ino - << ", fetching incomplete dir " << *dir - << dendl; - dir->fetch(new C_MDC_OpenRemoteIno(this, ino, anchortrace, mdr, onfinish)); - } - } else { - // hmm, discover. 
- dout(10) << "have remote dirfrag " << *dir << ", discovering " - << anchortrace[i].ino << dendl; - discover_ino(dir, anchortrace[i].ino, - new C_MDC_OpenRemoteIno(this, ino, anchortrace, mdr, onfinish)); - } -} - - - - -void MDCache::make_trace(vector& trace, CInode *in) -{ - CInode *parent = in->get_parent_inode(); - if (parent) { - make_trace(trace, parent); - - CDentry *dn = in->get_parent_dn(); - dout(15) << "make_trace adding " << *dn << dendl; - trace.push_back(dn); - } -} - - -MDRequest *MDCache::request_start(MClientRequest *req) -{ - // did we win a forward race against a slave? - if (active_requests.count(req->get_reqid())) { - MDRequest *mdr = active_requests[req->get_reqid()]; - if (mdr->is_slave()) { - dout(10) << "request_start already had " << *mdr << ", cleaning up" << dendl; - request_cleanup(mdr); - delete mdr; - } else { - dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl; - delete req; - return 0; - } - } - - // register new client request - MDRequest *mdr = new MDRequest(req->get_reqid(), req); - active_requests[req->get_reqid()] = mdr; - dout(7) << "request_start " << *mdr << dendl; - return mdr; -} - -MDRequest *MDCache::request_start_slave(metareqid_t ri, int by) -{ - MDRequest *mdr = new MDRequest(ri, by); - assert(active_requests.count(mdr->reqid) == 0); - active_requests[mdr->reqid] = mdr; - dout(7) << "request_start_slave " << *mdr << " by mds" << by << dendl; - return mdr; -} - - -MDRequest *MDCache::request_get(metareqid_t rid) -{ - assert(active_requests.count(rid)); - dout(7) << "request_get " << rid << " " << *active_requests[rid] << dendl; - return active_requests[rid]; -} - -void MDCache::request_finish(MDRequest *mdr) -{ - dout(7) << "request_finish " << *mdr << dendl; - - // slave finisher? 
- if (mdr->more()->slave_commit) { - mdr->more()->slave_commit->finish(0); - delete mdr->more()->slave_commit; - mdr->more()->slave_commit = 0; - } - - if (mdr->client_request && mds->logger) { - mds->logger->inc("reply"); - mds->logger->favg("replyl", g_clock.now() - mdr->client_request->get_recv_stamp()); - } - - delete mdr->client_request; - delete mdr->slave_request; - request_cleanup(mdr); -} - - -void MDCache::request_forward(MDRequest *mdr, int who, int port) -{ - dout(7) << "request_forward " << *mdr << " to mds" << who << " req " << *mdr << dendl; - - mds->forward_message_mds(mdr->client_request, who); - request_cleanup(mdr); - - if (mds->logger) mds->logger->inc("fw"); -} - - -void MDCache::dispatch_request(MDRequest *mdr) -{ - if (mdr->client_request) { - mds->server->dispatch_client_request(mdr); - } else if (mdr->slave_request) { - mds->server->dispatch_slave_request(mdr); - } else - assert(0); -} - - - -void MDCache::request_forget_foreign_locks(MDRequest *mdr) -{ - // xlocks - set::iterator p = mdr->xlocks.begin(); - while (p != mdr->xlocks.end()) { - if ((*p)->get_parent()->is_auth()) - p++; - else { - dout(10) << "request_forget_foreign_locks " << **p - << " on " << *(*p)->get_parent() << dendl; - (*p)->put_xlock(); - mdr->locks.erase(*p); - mdr->xlocks.erase(p++); - } - } -} - -void MDCache::request_cleanup(MDRequest *mdr) -{ - dout(15) << "request_cleanup " << *mdr << dendl; - metareqid_t ri = mdr->reqid; - - // clear ref, trace - mdr->ref = 0; - mdr->trace.clear(); - - // clean up slaves - // (will implicitly drop remote dn pins) - for (set::iterator p = mdr->more()->slaves.begin(); - p != mdr->more()->slaves.end(); - ++p) { - MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_FINISH); - mds->send_message_mds(r, *p); - } - // strip foreign xlocks out of lock lists, since the OP_FINISH drops them implicitly. 
- request_forget_foreign_locks(mdr); - - - // drop locks - mds->locker->drop_locks(mdr); - - // drop (local) auth pins - mdr->drop_local_auth_pins(); - - // drop stickydirs - for (set::iterator p = mdr->stickydirs.begin(); - p != mdr->stickydirs.end(); - ++p) - (*p)->put_stickydirs(); - - // drop cache pins - for (set::iterator it = mdr->pins.begin(); - it != mdr->pins.end(); - it++) - (*it)->put(MDSCacheObject::PIN_REQUEST); - mdr->pins.clear(); - - // remove from map - active_requests.erase(mdr->reqid); - delete mdr; - - - - - // log some stats ***** - if (mds->logger) { - mds->logger->set("c", lru.lru_get_size()); - mds->logger->set("cpin", lru.lru_get_num_pinned()); - mds->logger->set("ctop", lru.lru_get_top()); - mds->logger->set("cbot", lru.lru_get_bot()); - mds->logger->set("cptail", lru.lru_get_pintail()); - //mds->logger->set("buf",buffer_total_alloc); - } - - if (g_conf.log_pins) { - // pin - /* -for (int i=0; ilogger2) mds->logger2->set(cinode_pin_names[i], - cinode_pins[i]); - } - */ - /* - for (map::iterator it = cdir_pins.begin(); - it != cdir_pins.end(); - it++) { - //string s = "D"; - //s += cdir_pin_names[it->first]; - if (mds->logger2) mds->logger2->set(//s, - cdir_pin_names[it->first], - it->second); - } - */ - } - -} - - -// -------------------------------------------------------------------- -// ANCHORS - -// CREATE - -class C_MDC_AnchorCreatePrepared : public Context { - MDCache *cache; - CInode *in; -public: - version_t atid; - C_MDC_AnchorCreatePrepared(MDCache *c, CInode *i) : cache(c), in(i) {} - void finish(int r) { - cache->_anchor_create_prepared(in, atid); - } -}; - -void MDCache::anchor_create(MDRequest *mdr, CInode *in, Context *onfinish) -{ - assert(in->is_auth()); - - // auth pin - if (!in->can_auth_pin() && - !mdr->is_auth_pinned(in)) { - dout(7) << "anchor_create not authpinnable, waiting on " << *in << dendl; - in->add_waiter(CInode::WAIT_UNFREEZE, onfinish); - return; - } - - // wait - in->add_waiter(CInode::WAIT_ANCHORED, 
onfinish); - - // already anchoring? - if (in->state_test(CInode::STATE_ANCHORING)) { - dout(7) << "anchor_create already anchoring " << *in << dendl; - return; - } - - dout(7) << "anchor_create " << *in << dendl; - - // auth: do it - in->state_set(CInode::STATE_ANCHORING); - in->get(CInode::PIN_ANCHORING); - in->auth_pin(); - - // make trace - vector trace; - in->make_anchor_trace(trace); - - // do it - C_MDC_AnchorCreatePrepared *fin = new C_MDC_AnchorCreatePrepared(this, in); - mds->anchorclient->prepare_create(in->ino(), trace, &fin->atid, fin); -} - -class C_MDC_AnchorCreateLogged : public Context { - MDCache *cache; - CInode *in; - version_t atid; - LogSegment *ls; -public: - C_MDC_AnchorCreateLogged(MDCache *c, CInode *i, version_t t, LogSegment *s) : - cache(c), in(i), atid(t), ls(s) {} - void finish(int r) { - cache->_anchor_create_logged(in, atid, ls); - } -}; - -void MDCache::_anchor_create_prepared(CInode *in, version_t atid) -{ - dout(10) << "_anchor_create_prepared " << *in << " atid " << atid << dendl; - assert(in->inode.anchored == false); - - // update the logged inode copy - inode_t *pi = in->project_inode(); - pi->anchored = true; - pi->version = in->pre_dirty(); - - // note anchor transaction - EUpdate *le = new EUpdate(mds->mdlog, "anchor_create"); - le->metablob.add_dir_context(in->get_parent_dir()); - le->metablob.add_primary_dentry(in->parent, true, 0, pi); - le->metablob.add_anchor_transaction(atid); - mds->mdlog->submit_entry(le, new C_MDC_AnchorCreateLogged(this, in, atid, - mds->mdlog->get_current_segment())); -} - - -void MDCache::_anchor_create_logged(CInode *in, version_t atid, LogSegment *ls) -{ - dout(10) << "_anchor_create_logged on " << *in << dendl; - - // unpin - assert(in->state_test(CInode::STATE_ANCHORING)); - in->state_clear(CInode::STATE_ANCHORING); - in->put(CInode::PIN_ANCHORING); - in->auth_unpin(); - - // apply update to cache - in->pop_and_dirty_projected_inode(ls); - - // tell the anchortable we've committed - 
mds->anchorclient->commit(atid, ls); - - // trigger waiters - in->finish_waiting(CInode::WAIT_ANCHORED, 0); -} - - -// DESTROY - -class C_MDC_AnchorDestroyPrepared : public Context { - MDCache *cache; - CInode *in; -public: - version_t atid; - C_MDC_AnchorDestroyPrepared(MDCache *c, CInode *i) : cache(c), in(i) {} - void finish(int r) { - cache->_anchor_destroy_prepared(in, atid); - } -}; - -void MDCache::anchor_destroy(CInode *in, Context *onfinish) -{ - assert(in->is_auth()); - - // auth pin - if (!in->can_auth_pin()/* && - !mdr->is_auth_pinned(in)*/) { - dout(7) << "anchor_destroy not authpinnable, waiting on " << *in << dendl; - in->add_waiter(CInode::WAIT_UNFREEZE, onfinish); - return; - } - - // wait - if (onfinish) - in->add_waiter(CInode::WAIT_UNANCHORED, onfinish); - - // already anchoring? - if (in->state_test(CInode::STATE_UNANCHORING)) { - dout(7) << "anchor_destroy already unanchoring " << *in << dendl; - return; - } - - dout(7) << "anchor_destroy " << *in << dendl; - - // auth: do it - in->state_set(CInode::STATE_UNANCHORING); - in->get(CInode::PIN_UNANCHORING); - in->auth_pin(); - - // do it - C_MDC_AnchorDestroyPrepared *fin = new C_MDC_AnchorDestroyPrepared(this, in); - mds->anchorclient->prepare_destroy(in->ino(), &fin->atid, fin); -} - -class C_MDC_AnchorDestroyLogged : public Context { - MDCache *cache; - CInode *in; - version_t atid; - LogSegment *ls; -public: - C_MDC_AnchorDestroyLogged(MDCache *c, CInode *i, version_t t, LogSegment *l) : - cache(c), in(i), atid(t), ls(l) {} - void finish(int r) { - cache->_anchor_destroy_logged(in, atid, ls); - } -}; - -void MDCache::_anchor_destroy_prepared(CInode *in, version_t atid) -{ - dout(10) << "_anchor_destroy_prepared " << *in << " atid " << atid << dendl; - - assert(in->inode.anchored == true); - - // update the logged inode copy - inode_t *pi = in->project_inode(); - pi->anchored = true; - pi->version = in->pre_dirty(); - - // log + wait - EUpdate *le = new EUpdate(mds->mdlog, "anchor_destroy"); - 
le->metablob.add_dir_context(in->get_parent_dir()); - le->metablob.add_primary_dentry(in->parent, true, 0, pi); - le->metablob.add_anchor_transaction(atid); - mds->mdlog->submit_entry(le, new C_MDC_AnchorDestroyLogged(this, in, atid, mds->mdlog->get_current_segment())); -} - - -void MDCache::_anchor_destroy_logged(CInode *in, version_t atid, LogSegment *ls) -{ - dout(10) << "_anchor_destroy_logged on " << *in << dendl; - - // unpin - assert(in->state_test(CInode::STATE_UNANCHORING)); - in->state_clear(CInode::STATE_UNANCHORING); - in->put(CInode::PIN_UNANCHORING); - in->auth_unpin(); - - // apply update to cache - in->pop_and_dirty_projected_inode(ls); - - // tell the anchortable we've committed - mds->anchorclient->commit(atid, ls); - - // trigger waiters - in->finish_waiting(CInode::WAIT_UNANCHORED, 0); -} - - -// ------------------------------------------------------------------------------- -// STRAYS - -void MDCache::eval_stray(CDentry *dn) -{ - dout(10) << "eval_stray " << *dn << dendl; - assert(dn->is_primary()); - CInode *in = dn->inode; - assert(in); - - return; // FIXME or test me rather, there is a bug here somewhere! - - // purge? - if (in->inode.nlink == 0) { - if (dn->is_replicated() || in->is_any_caps()) return; // wait - if (!in->dirfrags.empty()) return; // wait for dirs to close/trim - _purge_stray(dn); - } - else if (in->inode.nlink == 1) { - // trivial reintegrate? - if (!in->remote_parents.empty()) { - CDentry *rlink = *in->remote_parents.begin(); - if (rlink->is_auth() && - rlink->dir->can_auth_pin()) - reintegrate_stray(dn, rlink); - - if (!rlink->is_auth() && - !in->is_ambiguous_auth()) - migrate_stray(dn, rlink->authority().first); - } - } else { - // wait for next use. 
- } -} - - -class C_MDC_PurgeStray : public Context { - MDCache *cache; - CDentry *dn; - version_t pdv; - LogSegment *ls; -public: - C_MDC_PurgeStray(MDCache *c, CDentry *d, version_t v, LogSegment *s) : - cache(c), dn(d), pdv(v), ls(s) { } - void finish(int r) { - cache->_purge_stray_logged(dn, pdv, ls); - } -}; - -void MDCache::_purge_stray(CDentry *dn) -{ - dout(10) << "_purge_stray " << *dn << " " << *dn->inode << dendl; - assert(!dn->is_replicated()); - - // log removal - version_t pdv = dn->pre_dirty(); - - EUpdate *le = new EUpdate(mds->mdlog, "purge_stray"); - le->metablob.add_dir_context(dn->dir); - le->metablob.add_null_dentry(dn, true); - le->metablob.add_inode_truncate(dn->inode->ino(), 0, dn->inode->inode.size); - - mds->mdlog->submit_entry(le, new C_MDC_PurgeStray(this, dn, pdv, mds->mdlog->get_current_segment())); - - -} - -void MDCache::_purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls) -{ - dout(10) << "_purge_stray_logged " << *dn << " " << *dn->inode << dendl; - CInode *in = dn->inode; - - // dirty+unlink dentry - dn->dir->mark_dirty(pdv, ls); - dn->dir->unlink_inode(dn); - dn->dir->remove_dentry(dn); - - // purge+remove inode - purge_inode(in, 0, in->inode.size, ls); - remove_inode(in); -} - - - -void MDCache::reintegrate_stray(CDentry *dn, CDentry *rlink) -{ - dout(10) << "reintegrate_stray " << *dn << " into " << *rlink << dendl; - -} - - -void MDCache::migrate_stray(CDentry *dn, int dest) -{ - dout(10) << "migrate_stray to mds" << dest << " " << *dn << dendl; - -} - - - - -// ======================================================================================== -// DISCOVER -/* - - - for all discovers (except base_inos, e.g. root, stray), waiters are attached - to the parent metadata object in the cache (pinning it). - - - the discover is also registered under the per-mds discover_ hashes, so that - waiters can be kicked in the event of a failure. 
that is, every discover will - be followed by a reply, unless the remote node fails.. - - - each discover_reply must reliably decrement the discover_ counts. - - - base_inos are the exception. those waiters are under waiting_for_base_ino. - -*/ - -void MDCache::discover_base_ino(inodeno_t want_ino, - Context *onfinish, - int from) -{ - dout(7) << "discover_base_ino " << want_ino << " from mds" << from << dendl; - if (waiting_for_base_ino[from].count(want_ino) == 0) { - filepath want_path; - MDiscover *dis = new MDiscover(mds->get_nodeid(), - want_ino, - want_path, - false); - mds->send_message_mds(dis, from); - } - - waiting_for_base_ino[from][want_ino].push_back(onfinish); -} - - -void MDCache::discover_dir_frag(CInode *base, - frag_t approx_fg, - Context *onfinish, - int from) -{ - if (from < 0) from = base->authority().first; - - dout(7) << "discover_dir_frag " << base->ino() << " " << approx_fg - << " from mds" << from << dendl; - - if (!base->is_waiter_for(CInode::WAIT_DIR) || !onfinish) { // this is overly conservative - filepath want_path; - MDiscover *dis = new MDiscover(mds->get_nodeid(), - base->ino(), - want_path, - true); // need the base dir open - dis->set_base_dir_frag(approx_fg); - mds->send_message_mds(dis, from); - } - - // register + wait - if (onfinish) - base->add_waiter(CInode::WAIT_DIR, onfinish); - discover_dir[from][base->ino()]++; -} - -void MDCache::discover_path(CInode *base, - filepath want_path, - Context *onfinish, - bool want_xlocked, - int from) -{ - if (from < 0) from = base->authority().first; - - dout(7) << "discover_path " << base->ino() << " " << want_path << " from mds" << from - << (want_xlocked ? 
" want_xlocked":"") - << dendl; - - if (base->is_ambiguous_auth()) { - dout(10) << " waiting for single auth on " << *base << dendl; - base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish); - return; - } - - if (!base->is_waiter_for(CInode::WAIT_DIR) || !onfinish) { // this is overly conservative - MDiscover *dis = new MDiscover(mds->get_nodeid(), - base->ino(), - want_path, - true, // we want the base dir; we are relative to ino. - want_xlocked); - mds->send_message_mds(dis, from); - } - - // register + wait - if (onfinish) base->add_waiter(CInode::WAIT_DIR, onfinish); - discover_dir[from][base->ino()]++; -} - -void MDCache::discover_path(CDir *base, - filepath want_path, - Context *onfinish, - bool want_xlocked) -{ - int from = base->authority().first; - - dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " from mds" << from - << (want_xlocked ? " want_xlocked":"") - << dendl; - - if (base->is_ambiguous_auth()) { - dout(7) << " waiting for single auth on " << *base << dendl; - base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish); - return; - } - - if (!base->is_waiting_for_dentry(want_path[0]) || !onfinish) { - MDiscover *dis = new MDiscover(mds->get_nodeid(), - base->ino(), - want_path, - false, // no base dir; we are relative to dir - want_xlocked); - mds->send_message_mds(dis, from); - } - - // register + wait - if (onfinish) base->add_dentry_waiter(want_path[0], onfinish); - discover_dir_sub[from][base->dirfrag()]++; -} - -void MDCache::discover_ino(CDir *base, - inodeno_t want_ino, - Context *onfinish, - bool want_xlocked) -{ - int from = base->authority().first; - - dout(7) << "discover_ino " << base->dirfrag() << " " << want_ino << " from mds" << from - << (want_xlocked ? 
" want_xlocked":"") - << dendl; - - if (!base->is_waiting_for_ino(want_ino)) { - MDiscover *dis = new MDiscover(mds->get_nodeid(), - base->dirfrag(), - want_ino, - want_xlocked); - mds->send_message_mds(dis, from); - } - - // register + wait - base->add_ino_waiter(want_ino, onfinish); - discover_dir_sub[from][base->dirfrag()]++; -} - - - -void MDCache::kick_discovers(int who) -{ - list waiters; - - for (hash_map >::iterator p = waiting_for_base_ino[who].begin(); - p != waiting_for_base_ino[who].end(); - ++p) { - dout(10) << "kick_discovers on base ino " << p->first << dendl; - mds->queue_waiters(p->second); - } - waiting_for_base_ino.erase(who); - - for (hash_map::iterator p = discover_dir[who].begin(); - p != discover_dir[who].end(); - ++p) { - CInode *in = get_inode(p->first); - if (!in) continue; - dout(10) << "kick_discovers dir waiters on " << *in << dendl; - in->take_waiting(CInode::WAIT_DIR, waiters); - } - discover_dir.erase(who); - - for (hash_map::iterator p = discover_dir_sub[who].begin(); - p != discover_dir_sub[who].end(); - ++p) { - CDir *dir = get_dirfrag(p->first); - if (!dir) continue; - dout(10) << "kick_discovers dentry+ino waiters on " << *dir << dendl; - dir->take_sub_waiting(waiters); - } - discover_dir_sub.erase(who); - - mds->queue_waiters(waiters); -} - - - -void MDCache::handle_discover(MDiscover *dis) -{ - int whoami = mds->get_nodeid(); - - assert(dis->get_asker() != whoami); - - /* - if (mds->get_state() < MDSMap::STATE_ACTIVE) { - dout(-7) << "discover_reply NOT ACTIVE YET" << dendl; - delete dis; - return; - } - */ - - - CInode *cur = 0; - MDiscoverReply *reply = new MDiscoverReply(dis); - - // get started. 
- if (dis->get_base_ino() == MDS_INO_ROOT) { - // wants root - dout(7) << "handle_discover from mds" << dis->get_asker() - << " wants root + " << dis->get_want().get_path() << dendl; - - assert(mds->get_nodeid() == 0); - assert(root->is_auth()); - - // add root - reply->add_inode( root->replicate_to( dis->get_asker() ) ); - dout(10) << "added root " << *root << dendl; - - cur = root; - } - else if (dis->get_base_ino() == MDS_INO_STRAY(whoami)) { - // wants root - dout(7) << "handle_discover from mds" << dis->get_asker() - << " wants stray + " << dis->get_want().get_path() << dendl; - - reply->add_inode( stray->replicate_to( dis->get_asker() ) ); - dout(10) << "added stray " << *stray << dendl; - - cur = stray; - } - else { - // there's a base inode - cur = get_inode(dis->get_base_ino()); - - if (!cur) { - dout(7) << "handle_discover mds" << dis->get_asker() - << " don't have base ino " << dis->get_base_ino() - << dendl; - reply->set_flag_error_dir(); - } - - if (dis->wants_base_dir()) { - dout(7) << "handle_discover mds" << dis->get_asker() - << " wants basedir+" << dis->get_want().get_path() - << " has " << *cur - << dendl; - } else { - dout(7) << "handle_discover mds" << dis->get_asker() - << " wants " << dis->get_want().get_path() - << " has " << *cur - << dendl; - } - } - - assert(reply); - - // add content - // do some fidgeting to include a dir if they asked for the base dir, or just root. - for (unsigned i = 0; - cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0); - i++) { - - // -- figure out the dir - - // is *cur even a dir at all? 
- if (!cur->is_dir()) { - dout(7) << *cur << " not a dir" << dendl; - reply->set_flag_error_dir(); - break; - } - - // pick frag - frag_t fg; - if (dis->get_want().depth()) { - // dentry specifies - fg = cur->pick_dirfrag(dis->get_dentry(i)); - } else { - // requester explicity specified the frag - fg = dis->get_base_dir_frag(); - assert(dis->wants_base_dir() || dis->get_want_ino() || dis->get_base_ino() < MDS_INO_BASE); - } - CDir *curdir = cur->get_dirfrag(fg); - - if ((!curdir && !cur->is_auth()) || - (curdir && !curdir->is_auth())) { - - /* before: - * ONLY set flag if empty!! - * otherwise requester will wake up waiter(s) _and_ continue with discover, - * resulting in duplicate discovers in flight, - * which can wreak havoc when discovering rename srcdn (which may move) - */ - - if (reply->is_empty()) { - // only hint if empty. - // someday this could be better, but right now the waiter logic isn't smart enough. - - // hint - if (curdir) { - dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl; - reply->set_dir_auth_hint(curdir->authority().first); - } else { - dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for " - << *cur << dendl; - reply->set_dir_auth_hint(cur->authority().first); - } - - // note error dentry, if any - // NOTE: important, as it allows requester to issue an equivalent discover - // to whomever we hint at. - if (dis->get_want().depth() > i) - reply->set_error_dentry(dis->get_dentry(i)); - } - - break; - } - - // open dir? - if (!curdir) - curdir = cur->get_or_open_dirfrag(this, fg); - assert(curdir); - assert(curdir->is_auth()); - - // is dir frozen? 
- if (curdir->is_frozen()) { - if (reply->is_empty()) { - dout(7) << *curdir << " is frozen, empty reply, waiting" << dendl; - curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis)); - delete reply; - return; - } else { - dout(7) << *curdir << " is frozen, non-empty reply, stopping" << dendl; - break; - } - } - - // add dir - if (reply->is_empty() && !dis->wants_base_dir()) { - dout(7) << "handle_discover not adding unwanted base dir " << *curdir << dendl; - } else { - assert(!curdir->is_ambiguous_auth()); // would be frozen. - reply->add_dir( curdir->replicate_to(dis->get_asker()) ); - dout(7) << "handle_discover added dir " << *curdir << dendl; - } - - // lookup - CDentry *dn = 0; - if (dis->get_want_ino()) { - // lookup by ino - CInode *in = get_inode(dis->get_want_ino()); - if (in && in->is_auth() && in->get_parent_dn()->get_dir() == curdir) - dn = in->get_parent_dn(); - } else if (dis->get_want().depth() > 0) { - // lookup dentry - dn = curdir->lookup( dis->get_dentry(i) ); - } else - break; // done! - - // incomplete dir? - if (!dn) { - if (!curdir->is_complete()) { - // readdir - dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << dendl; - if (reply->is_empty()) { - // fetch and wait - curdir->fetch(new C_MDS_RetryMessage(mds, dis)); - return; - } else { - // initiate fetch, but send what we have so far - curdir->fetch(0); - break; - } - } - - // don't have wanted ino in this dir? - if (dis->get_want_ino()) { - // set error flag in reply - dout(7) << "ino " << dis->get_want_ino() << " in this dir, flagging error in " - << *curdir << dendl; - reply->set_flag_error_ino(); - break; - } - - // send null dentry - dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in " - << *curdir << dendl; - dn = curdir->add_null_dentry(dis->get_dentry(i)); - } - assert(dn); - - // xlocked dentry? 
- // ...always block on non-tail items (they are unrelated) - // ...allow xlocked tail disocvery _only_ if explicitly requested - if (dn->lock.is_xlocked()) { - // is this the last (tail) item in the discover traversal? - bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1); - if (tailitem && dis->wants_xlocked()) { - dout(7) << "handle_discover allowing discovery of xlocked tail " << *dn << dendl; - } else { - dout(7) << "handle_discover blocking on xlocked " << *dn << dendl; - dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis)); - delete reply; - return; - } - } - - // frozen inode? - if (dn->is_primary() && - dn->inode->is_frozen()) { - if (reply->is_empty()) { - dout(7) << *dn->inode << " is frozen, empty reply, waiting" << dendl; - dn->inode->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis)); - delete reply; - return; - } else { - dout(7) << *dn->inode << " is frozen, non-empty reply, stopping" << dendl; - break; - } - } - - // add dentry - reply->add_dentry( dn->replicate_to( dis->get_asker() ) ); - dout(7) << "handle_discover added dentry " << *dn << dendl; - - if (!dn->is_primary()) break; // stop on null or remote link. - - // add inode - CInode *next = dn->inode; - assert(next->is_auth()); - - reply->add_inode( next->replicate_to( dis->get_asker() ) ); - dout(7) << "handle_discover added inode " << *next << dendl; - - // descend, keep going. - cur = next; - continue; - } - - // how did we do? 
- assert(!reply->is_empty()); - dout(7) << "handle_discover sending result back to asker mds" << dis->get_asker() << dendl; - mds->send_message_mds(reply, dis->get_asker()); - - delete dis; -} - - -void MDCache::handle_discover_reply(MDiscoverReply *m) -{ - /* - if (mds->get_state() < MDSMap::STATE_ACTIVE) { - dout(-7) << "discover_reply NOT ACTIVE YET" << dendl; - delete m; - return; - } - */ - - list finished, error; - int from = m->get_source().num(); - - // starting point - CInode *cur = get_inode(m->get_base_ino()); - - if (m->has_base_inode()) { - assert(m->get_base_ino() < MDS_INO_BASE); - assert(!m->has_base_dentry()); - assert(!m->has_base_dir()); - - // add base inode - cur = add_replica_inode(m->get_inode(0), NULL, finished); - cur->force_auth = pair(m->get_source().num(), CDIR_AUTH_UNKNOWN); - - dout(7) << "discover_reply got base inode " << *cur << dendl; - - // take waiters - finished.swap(waiting_for_base_ino[from][cur->ino()]); - waiting_for_base_ino[from].erase(cur->ino()); - } - assert(cur); - - dout(7) << "discover_reply " << *cur - << " + " << m->get_num_dentries() << " dn, " - << m->get_num_inodes() << " inodes" - << dendl; - - // fyi - if (m->is_flag_error_dir()) - dout(7) << " flag error, dir" << dendl; - if (m->is_flag_error_dn()) - dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl; - if (m->is_flag_error_ino()) - dout(7) << " flag error, ino = " << m->get_wanted_ino() << dendl; - - dout(10) << "depth = " << m->get_depth() - << ", has base_dir/base_dn/root = " - << m->has_base_dir() << " / " << m->has_base_dentry() << " / " << m->has_base_inode() - << ", num dirs/dentries/inodes = " - << m->get_num_dirs() << " / " << m->get_num_dentries() << " / " << m->get_num_inodes() - << dendl; - - // decrement discover counters - if (m->get_wanted_base_dir()) { - inodeno_t ino = m->get_base_ino(); - assert(discover_dir[from].count(ino)); - if (--discover_dir[from][ino] == 0) - discover_dir[from].erase(ino); - } else if 
(m->get_base_ino() >= MDS_INO_BASE) { - dirfrag_t df(m->get_base_ino(), m->get_base_dir_frag()); - assert(discover_dir_sub[from].count(df)); - if (--discover_dir_sub[from][df] == 0) - discover_dir_sub[from].erase(df); - } - - // loop over discover results. - // indexes follow each ([[dir] dentry] inode) - // can start, end with any type. - for (int i=m->has_base_inode(); iget_depth(); i++) { - dout(10) << "discover_reply i=" << i << " cur " << *cur << dendl; - - // dir - frag_t fg; - CDir *curdir = 0; - if (i > 0 || m->has_base_dir()) { - assert(m->get_dir(i).get_dirfrag().ino == cur->ino()); - fg = m->get_dir(i).get_dirfrag().frag; - curdir = add_replica_dir(cur, fg, m->get_dir(i), - m->get_source().num(), - finished); - } - if (!curdir) { - fg = cur->pick_dirfrag(m->get_dentry(i).get_dname()); - curdir = cur->get_dirfrag(fg); - } - - // dentry error? - if (i == m->get_depth()-1 && (m->is_flag_error_dn() || m->is_flag_error_ino())) { - // error! - assert(cur->is_dir()); - if (curdir) { - if (m->get_error_dentry().length()) { - dout(7) << " flag_error on dentry " << m->get_error_dentry() - << ", triggering dentry" << dendl; - curdir->take_dentry_waiting(m->get_error_dentry(), error); - } else { - dout(7) << " flag_error on ino " << m->get_wanted_ino() - << ", triggering ino" << dendl; - curdir->take_ino_waiting(m->get_wanted_ino(), error); - } - } else { - dout(7) << " flag_error on dentry " << m->get_error_dentry() - << ", triggering dir?" << dendl; - cur->take_waiting(CInode::WAIT_DIR, error); - } - break; - } - - assert(curdir); - - // dentry - CDentry *dn = 0; - if (i >= m->get_last_dentry()) break; - if (i > 0 || m->has_base_dentry()) - dn = add_replica_dentry(curdir, m->get_dentry(i), finished); - - // inode - if (i >= m->get_last_inode()) break; - cur = add_replica_inode(m->get_inode(i), dn, finished); - } - - // dir error? - // or dir_auth hint? - if (m->is_flag_error_dir() && !cur->is_dir()) { - // not a dir. 
- cur->take_waiting(CInode::WAIT_DIR, error); - } else if (m->is_flag_error_dir() || - (m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN && - m->get_dir_auth_hint() != mds->get_nodeid())) { - int who = m->get_dir_auth_hint(); - if (who == mds->get_nodeid()) who = -1; - if (who >= 0) - dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl; - - // try again? - if (m->get_error_dentry().length()) { - // wanted a dentry - frag_t fg = cur->pick_dirfrag(m->get_error_dentry()); - CDir *dir = cur->get_dirfrag(fg); - if (dir) { - // don't actaully need the hint, now - if (dir->lookup(m->get_error_dentry()) == 0 && - dir->is_waiting_for_dentry(m->get_error_dentry())) - discover_path(dir, m->get_error_dentry(), 0, m->get_wanted_xlocked()); - else - dout(7) << " doing nothing, have dir but nobody is waiting on dentry " - << m->get_error_dentry() << dendl; - } else { - if (cur->is_waiter_for(CInode::WAIT_DIR)) - discover_path(cur, m->get_error_dentry(), 0, m->get_wanted_xlocked(), who); - else - dout(7) << " doing nothing, nobody is waiting for dir" << dendl; - } - } else { - // wanted just the dir - frag_t fg = m->get_base_dir_frag(); - if (cur->get_dirfrag(fg) == 0 && cur->is_waiter_for(CInode::WAIT_DIR)) - discover_dir_frag(cur, fg, 0, who); - else - dout(7) << " doing nothing, nobody is waiting for dir" << dendl; - } - } - - // waiters - finish_contexts(error, -ENOENT); // finish errors directly - mds->queue_waiters(finished); - - // done - delete m; -} - - - -// ---------------------------- -// REPLICAS - -CDir *MDCache::add_replica_dir(CInode *diri, - frag_t fg, CDirDiscover &dis, int from, - list& finished) -{ - // add it (_replica_) - CDir *dir = diri->get_dirfrag(fg); - - if (dir) { - // had replica. update w/ new nonce. 
- dis.update_dir(dir); - dout(7) << "add_replica_dir had " << *dir << " nonce " << dir->replica_nonce << dendl; - } else { - // force frag to leaf in the diri tree - if (!diri->dirfragtree.is_leaf(fg)) { - dout(7) << "add_replica_dir forcing frag " << fg << " to leaf in the fragtree " - << diri->dirfragtree << dendl; - diri->dirfragtree.force_to_leaf(fg); - } - - // add replica. - dir = diri->add_dirfrag( new CDir(diri, fg, this, false) ); - dis.update_dir(dir); - - // is this a dir_auth delegation boundary? - if (from != diri->authority().first || - diri->is_ambiguous_auth() || - diri->ino() < MDS_INO_BASE) - adjust_subtree_auth(dir, from); - - dout(7) << "add_replica_dir added " << *dir << " nonce " << dir->replica_nonce << dendl; - - // get waiters - diri->take_waiting(CInode::WAIT_DIR, finished); - } - - return dir; -} - -CDir *MDCache::forge_replica_dir(CInode *diri, frag_t fg, int from) -{ - assert(mds->mdsmap->get_state(from) < MDSMap::STATE_REJOIN); - - // forge a replica. - CDir *dir = diri->add_dirfrag( new CDir(diri, fg, this, false) ); - - // i'm assuming this is a subtree root. - adjust_subtree_auth(dir, from); - - dout(7) << "forge_replica_dir added " << *dir << " while mds" << from << " is down" << dendl; - - return dir; -} - -CDentry *MDCache::add_replica_dentry(CDir *dir, CDentryDiscover &dis, list& finished) -{ - CDentry *dn = dir->lookup( dis.get_dname() ); - - // have it? - if (dn) { - dis.update_dentry(dn); - dout(7) << "add_replica_dentry had " << *dn << dendl; - } else { - dn = dir->add_null_dentry(dis.get_dname()); - dis.update_dentry(dn); - dis.init_dentry_lock(dn); - dout(7) << "add_replica_dentry added " << *dn << dendl; - } - - // remote_ino linkage? - if (dis.get_remote_ino()) { - if (dn->is_null()) - dir->link_remote_inode(dn, dis.get_remote_ino(), dis.get_remote_d_type()); - - // hrm. yeah. 
- assert(dn->is_remote() && dn->get_remote_ino() == dis.get_remote_ino()); - } - - dir->take_dentry_waiting(dis.get_dname(), finished); - - return dn; -} - -CInode *MDCache::add_replica_inode(CInodeDiscover& dis, CDentry *dn, list& finished) -{ - CInode *in = get_inode(dis.get_ino()); - if (!in) { - in = new CInode(this, false); - dis.update_inode(in); - dis.init_inode_locks(in); - add_inode(in); - dout(10) << "add_replica_inode had " << *in << dendl; - if (dn && dn->is_null()) - dn->dir->link_primary_inode(dn, in); - } else { - dis.update_inode(in); - dout(10) << "add_replica_inode added " << *in << dendl; - } - - if (dn) { - assert(dn->is_primary()); - assert(dn->inode == in); - - dn->get_dir()->take_ino_waiting(in->ino(), finished); - } - - return in; -} - - -CDentry *MDCache::add_replica_stray(bufferlist &bl, CInode *in, int from) -{ - list finished; - int off = 0; - - // inode - CInodeDiscover indis; - indis._decode(bl, off); - CInode *strayin = add_replica_inode(indis, NULL, finished); - strayin->force_auth = pair(from, CDIR_AUTH_UNKNOWN); - dout(15) << "strayin " << *strayin << dendl; - - // dir - CDirDiscover dirdis; - dirdis._decode(bl, off); - CDir *straydir = add_replica_dir(strayin, dirdis.get_dirfrag().frag, dirdis, - from, finished); - dout(15) << "straydir " << *straydir << dendl; - - // dentry - CDentryDiscover dndis; - dndis._decode(bl, off); - - string straydname; - in->name_stray_dentry(straydname); - CDentry *straydn = add_replica_dentry(straydir, dndis, finished); - - mds->queue_waiters(finished); - - return straydn; -} - - - -/* -int MDCache::send_inode_updates(CInode *in) -{ - assert(in->is_auth()); - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - dout(7) << "sending inode_update on " << *in << " to " << *it << dendl; - assert(*it != mds->get_nodeid()); - mds->send_message_mds(new MInodeUpdate(in, in->get_cached_by_nonce(*it)), *it); - } - - return 0; -} - - -void 
MDCache::handle_inode_update(MInodeUpdate *m) -{ - inodeno_t ino = m->get_ino(); - CInode *in = get_inode(m->get_ino()); - if (!in) { - //dout(7) << "inode_update on " << m->get_ino() << ", don't have it, ignoring" << dendl; - dout(7) << "inode_update on " << m->get_ino() << ", don't have it, sending expire" << dendl; - MCacheExpire *expire = new MCacheExpire(mds->get_nodeid()); - expire->add_inode(m->get_ino(), m->get_nonce()); - mds->send_message_mds(expire, m->get_source().num()); - goto out; - } - - if (in->is_auth()) { - dout(7) << "inode_update on " << *in << ", but i'm the authority!" << dendl; - assert(0); // this should never happen - } - - dout(7) << "inode_update on " << *in << dendl; - - // update! NOTE dir_auth is unaffected by this. - in->decode_basic_state(m->get_payload()); - - out: - // done - delete m; -} -*/ - - - - - - -int MDCache::send_dir_updates(CDir *dir, bool bcast) -{ - // this is an FYI, re: replication - - set who; - if (bcast) { - mds->get_mds_map()->get_active_mds_set(who); - } else { - for (map::iterator p = dir->replicas_begin(); - p != dir->replicas_end(); - ++p) - who.insert(p->first); - } - - dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl; - - string path; - dir->inode->make_path(path); - - int whoami = mds->get_nodeid(); - for (set::iterator it = who.begin(); - it != who.end(); - it++) { - if (*it == whoami) continue; - //if (*it == except) continue; - dout(7) << "sending dir_update on " << *dir << " to " << *it << dendl; - - mds->send_message_mds(new MDirUpdate(dir->dirfrag(), - dir->dir_rep, - dir->dir_rep_by, - path, - bcast), - *it); - } - - return 0; -} - - -void MDCache::handle_dir_update(MDirUpdate *m) -{ - CDir *dir = get_dirfrag(m->get_dirfrag()); - if (!dir) { - dout(5) << "dir_update on " << m->get_dirfrag() << ", don't have it" << dendl; - - // discover it? - if (m->should_discover()) { - // only try once! 
- // this is key to avoid a fragtree update race, among other things. - m->tried_discover(); - vector trace; - filepath path = m->get_path(); - - dout(5) << "trying discover on dir_update for " << path << dendl; - - int r = path_traverse(0, m, - 0, path, trace, true, - MDS_TRAVERSE_DISCOVER); - if (r > 0) - return; - assert(r == 0); - - CInode *in = get_inode(m->get_dirfrag().ino); - assert(in); - open_remote_dirfrag(in, m->get_dirfrag().frag, - new C_MDS_RetryMessage(mds, m)); - return; - } - - delete m; - return; - } - - // update - dout(5) << "dir_update on " << *dir << dendl; - dir->dir_rep = m->get_dir_rep(); - dir->dir_rep_by = m->get_dir_rep_by(); - - // done - delete m; -} - - - - - - -// UNLINK - -void MDCache::handle_dentry_unlink(MDentryUnlink *m) -{ - CDir *dir = get_dirfrag(m->get_dirfrag()); - - if (!dir) { - dout(7) << "handle_dentry_unlink don't have dirfrag " << m->get_dirfrag() << dendl; - } - else { - CDentry *dn = dir->lookup(m->get_dn()); - if (!dn) { - dout(7) << "handle_dentry_unlink don't have dentry " << *dir << " dn " << m->get_dn() << dendl; - } else { - dout(7) << "handle_dentry_unlink on " << *dn << dendl; - - // move to stray? - CDentry *straydn = 0; - if (m->strayin) { - list finished; - CInode *in = add_replica_inode(*m->strayin, NULL, finished); - CDir *dir = add_replica_dir(in, m->straydir->get_dirfrag().frag, *m->straydir, - m->get_source().num(), finished); - straydn = add_replica_dentry(dir, *m->straydn, finished); - if (!finished.empty()) mds->queue_waiters(finished); - } - - // open inode? 
- if (dn->is_primary()) { - CInode *in = dn->inode; - dn->dir->unlink_inode(dn); - assert(straydn); - straydn->dir->link_primary_inode(straydn, in); - } else { - assert(dn->is_remote()); - dn->dir->unlink_inode(dn); - } - assert(dn->is_null()); - - // move to bottom of lru - lru.lru_bottouch(dn); - } - } - - delete m; - return; -} - - - - - - -// =================================================================== -// FRAGMENT - - -/** - * adjust_dir_fragments -- adjust fragmentation for a directory - * - * @diri - directory inode - * @basefrag - base fragment - * @bits - bit adjustment. positive for split, negative for merge. - */ -void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits, - list& resultfrags, - list& waiters) -{ - dout(10) << "adjust_dir_fragments " << basefrag << " " << bits - << " on " << *diri << dendl; - - // yuck. we may have discovered the inode while it was being fragmented. - if (!diri->dirfragtree.is_leaf(basefrag)) - diri->dirfragtree.force_to_leaf(basefrag); - - CDir *base = diri->get_or_open_dirfrag(this, basefrag); - - // adjust fragtree - diri->dirfragtree.split(basefrag, bits); - dout(10) << " new fragtree is " << diri->dirfragtree << dendl; - - if (bits > 0) { - if (base) { - CDir *baseparent = base->get_parent_dir(); - - base->split(bits, resultfrags, waiters); - - // did i change the subtree map? - if (base->is_subtree_root()) { - // am i a bound? - if (baseparent) { - CDir *parent = get_subtree_root(baseparent); - assert(subtrees[parent].count(base)); - subtrees[parent].erase(base); - for (list::iterator p = resultfrags.begin(); - p != resultfrags.end(); - ++p) { - subtrees[parent].insert(*p); - subtrees[*p].clear(); // new frag is now its own subtree - } - } - - // adjust my bounds. 
- set bounds; - bounds.swap(subtrees[base]); - subtrees.erase(base); - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *frag = get_subtree_root((*p)->get_parent_dir()); - subtrees[frag].insert(*p); - } - - show_subtrees(10); - } - } - } else { - assert(base); - base->merge(bits, waiters); - resultfrags.push_back(base); - assert(0); // FIXME adjust subtree map! and clean up this code, probably. - } -} - -class C_MDC_FragmentGo : public Context { - MDCache *mdcache; - CInode *diri; - list dirs; - frag_t basefrag; - int bits; -public: - C_MDC_FragmentGo(MDCache *m, CInode *di, list& dls, frag_t bf, int b) : - mdcache(m), diri(di), dirs(dls), basefrag(bf), bits(b) { } - virtual void finish(int r) { - mdcache->fragment_go(diri, dirs, basefrag, bits); - } -}; - -void MDCache::split_dir(CDir *dir, int bits) -{ - dout(7) << "split_dir " << *dir << " bits " << bits << dendl; - assert(dir->is_auth()); - - if (mds->mdsmap->is_degraded()) { - dout(7) << "cluster degraded, no fragmenting for now" << dendl; - return; - } - if (dir->inode->is_root()) { - dout(7) << "i won't fragment root" << dendl; - //assert(0); - return; - } - if (dir->state_test(CDir::STATE_FRAGMENTING)) { - dout(7) << "already fragmenting" << dendl; - return; - } - if (!dir->can_auth_pin()) { - dout(7) << "not authpinnable on " << *dir << dendl; - return; - } - - list startfrags; - startfrags.push_back(dir); - - dir->state_set(CDir::STATE_FRAGMENTING); - - fragment_freeze(dir->get_inode(), startfrags, dir->get_frag(), bits); - fragment_mark_and_complete(dir->get_inode(), startfrags, dir->get_frag(), bits); -} - -/* - * initial the freeze, blocking with an auth_pin. - * - * some reason(s) we have to freeze: - * - on merge, version/projected version are unified from all fragments; - * concurrent pipelined updates in the directory will have divergent - * versioning... and that's no good. 
- */ -void MDCache::fragment_freeze(CInode *diri, list& frags, frag_t basefrag, int bits) -{ - C_Gather *gather = new C_Gather(new C_MDC_FragmentGo(this, diri, frags, basefrag, bits)); - - // freeze the dirs - for (list::iterator p = frags.begin(); - p != frags.end(); - ++p) { - CDir *dir = *p; - dir->auth_pin(); // this will block the freeze - dir->freeze_dir(); - assert(dir->is_freezing_dir()); - dir->add_waiter(CDir::WAIT_FROZEN, gather->new_sub()); - } -} - -class C_MDC_FragmentMarking : public Context { - MDCache *mdcache; - CInode *diri; - list dirs; - frag_t basefrag; - int bits; -public: - C_MDC_FragmentMarking(MDCache *m, CInode *di, list& dls, frag_t bf, int b) : - mdcache(m), diri(di), dirs(dls), basefrag(bf), bits(b) { } - virtual void finish(int r) { - mdcache->fragment_mark_and_complete(diri, dirs, basefrag, bits); - } -}; - -void MDCache::fragment_mark_and_complete(CInode *diri, - list& startfrags, - frag_t basefrag, int bits) -{ - dout(10) << "fragment_mark_and_complete " << basefrag << " by " << bits - << " on " << *diri << dendl; - - C_Gather *gather = 0; - - for (list::iterator p = startfrags.begin(); - p != startfrags.end(); - ++p) { - CDir *dir = *p; - - if (!dir->is_complete()) { - dout(15) << " fetching incomplete " << *dir << dendl; - if (!gather) gather = new C_Gather(new C_MDC_FragmentMarking(this, diri, startfrags, basefrag, bits)); - dir->fetch(gather->new_sub(), - true); // ignore authpinnability - } - else if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) { - dout(15) << " marking " << *dir << dendl; - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - p->second->get(CDentry::PIN_FRAGMENTING); - p->second->state_set(CDentry::STATE_FRAGMENTING); - } - dir->state_set(CDir::STATE_DNPINNEDFRAG); - dir->auth_unpin(); // allow our freeze to complete - } - else { - dout(15) << " marked " << *dir << dendl; - } - } -} - - -class C_MDC_FragmentStored : public Context { - MDCache *mdcache; - CInode *diri; - 
frag_t basefrag; - int bits; - list resultfrags; -public: - C_MDC_FragmentStored(MDCache *m, CInode *di, frag_t bf, int b, - list& rf) : - mdcache(m), diri(di), basefrag(bf), bits(b), resultfrags(rf) { } - virtual void finish(int r) { - mdcache->fragment_stored(diri, basefrag, bits, resultfrags); - } -}; - -void MDCache::fragment_go(CInode *diri, list& startfrags, frag_t basefrag, int bits) -{ - dout(10) << "fragment_go " << basefrag << " by " << bits - << " on " << *diri << dendl; - - // refragment - list resultfrags; - list waiters; - adjust_dir_fragments(diri, basefrag, bits, resultfrags, waiters); - mds->queue_waiters(waiters); - - C_Gather *gather = new C_Gather(new C_MDC_FragmentStored(this, diri, basefrag, bits, resultfrags)); - - // freeze, store resulting frags - for (list::iterator p = resultfrags.begin(); - p != resultfrags.end(); - p++) { - CDir *dir = *p; - dout(10) << " result frag " << *dir << dendl; - dir->state_set(CDir::STATE_FRAGMENTING); - dir->commit(0, gather->new_sub()); - dir->_freeze_dir(); - } -} - -class C_MDC_FragmentLogged : public Context { - MDCache *mdcache; - CInode *diri; - frag_t basefrag; - int bits; - list resultfrags; - vector pvs; - LogSegment *ls; -public: - C_MDC_FragmentLogged(MDCache *m, CInode *di, frag_t bf, int b, - list& rf, vector& p, - LogSegment *s) : - mdcache(m), diri(di), basefrag(bf), bits(b), ls(s) { - resultfrags.swap(rf); - pvs.swap(p); - } - virtual void finish(int r) { - mdcache->fragment_logged(diri, basefrag, bits, - resultfrags, pvs, - ls); - } -}; - -void MDCache::fragment_stored(CInode *diri, frag_t basefrag, int bits, - list& resultfrags) -{ - dout(10) << "fragment_stored " << basefrag << " by " << bits - << " on " << *diri << dendl; - - EFragment *le = new EFragment(mds->mdlog, diri->ino(), basefrag, bits); - - set peers; - vector pvs; - for (list::iterator p = resultfrags.begin(); - p != resultfrags.end(); - p++) { - CDir *dir = *p; - dout(10) << " result frag " << *dir << dendl; - - if (p == 
resultfrags.begin()) { - le->metablob.add_dir_context(dir); - // note peers - // only do this once: all frags have identical replica_maps. - if (peers.empty()) - for (map::iterator p = dir->replica_map.begin(); - p != dir->replica_map.end(); - ++p) - peers.insert(p->first); - } - - pvs.push_back(dir->pre_dirty()); - le->metablob.add_dir(dir, true); - } - - mds->mdlog->submit_entry(le, - new C_MDC_FragmentLogged(this, diri, basefrag, bits, - resultfrags, pvs, mds->mdlog->get_current_segment())); - - // announcelist& resultfrags, - for (set::iterator p = peers.begin(); - p != peers.end(); - ++p) { - MMDSFragmentNotify *notify = new MMDSFragmentNotify(diri->ino(), basefrag, bits); - if (bits < 0) { - // freshly replicate basedir to peer on merge - CDir *base = resultfrags.front(); - CDirDiscover *basedis = base->replicate_to(*p); - basedis->_encode(notify->basebl); - delete basedis; - } - mds->send_message_mds(notify, *p); - } - -} - -void MDCache::fragment_logged(CInode *diri, frag_t basefrag, int bits, - list& resultfrags, - vector& pvs, - LogSegment *ls) -{ - dout(10) << "fragment_logged " << basefrag << " bits " << bits - << " on " << *diri << dendl; - - - // dirty resulting frags - set peers; - vector::iterator pv = pvs.begin(); - for (list::iterator p = resultfrags.begin(); - p != resultfrags.end(); - p++) { - CDir *dir = *p; - dout(10) << " result frag " << *dir << dendl; - - // dirty, unpin, unfreeze - dir->state_clear(CDir::STATE_FRAGMENTING); - dir->mark_dirty(*pv, ls); - pv++; - - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - if (dn->state_test(CDentry::STATE_FRAGMENTING)) - dn->put(CDentry::PIN_FRAGMENTING); - } - - dir->unfreeze_dir(); - } -} - - - -void MDCache::handle_fragment_notify(MMDSFragmentNotify *notify) -{ - dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl; - - CInode *diri = get_inode(notify->get_ino()); - if (diri) { - list waiters; 
- - // add replica dir (for merge)? - // (adjust_dir_fragments expects base to already exist, if non-auth) - if (notify->get_bits() < 0) { - CDirDiscover basedis; - int off = 0; - basedis._decode(notify->basebl, off); - add_replica_dir(diri, notify->get_basefrag(), basedis, - notify->get_source().num(), waiters); - } - - // refragment - list resultfrags; - adjust_dir_fragments(diri, notify->get_basefrag(), notify->get_bits(), - resultfrags, waiters); - mds->queue_waiters(waiters); - } - - delete notify; -} - - - - - -// ============================================================== -// debug crap - -void MDCache::show_subtrees(int dbl) -{ - //dout(10) << "show_subtrees" << dendl; - - if (dbl > g_conf.debug && dbl > g_conf.debug_mds) - return; // i won't print anything. - - if (subtrees.empty()) { - dout(dbl) << "show_subtrees - no subtrees" << dendl; - return; - } - - // root frags - list basefrags; - for (set::iterator p = base_inodes.begin(); - p != base_inodes.end(); - ++p) - (*p)->get_dirfrags(basefrags); - //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl; - dout(15) << "show_subtrees" << dendl; - - // queue stuff - list > q; - string indent; - set seen; - - // calc max depth - for (list::iterator p = basefrags.begin(); p != basefrags.end(); ++p) - q.push_back(pair(*p, 0)); - - set subtrees_seen; - - int depth = 0; - while (!q.empty()) { - CDir *dir = q.front().first; - int d = q.front().second; - q.pop_front(); - - if (subtrees.count(dir) == 0) continue; - - subtrees_seen.insert(dir); - - if (d > depth) depth = d; - - // sanity check - //dout(25) << "saw depth " << d << " " << *dir << dendl; - if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << dendl; - assert(seen.count(dir) == 0); - seen.insert(dir); - - // nested items? 
- if (!subtrees[dir].empty()) { - for (set::iterator p = subtrees[dir].begin(); - p != subtrees[dir].end(); - ++p) { - //dout(25) << " saw sub " << **p << dendl; - q.push_front(pair(*p, d+1)); - } - } - } - - - // print tree - for (list::iterator p = basefrags.begin(); p != basefrags.end(); ++p) - q.push_back(pair(*p, 0)); - - while (!q.empty()) { - CDir *dir = q.front().first; - int d = q.front().second; - q.pop_front(); - - if (subtrees.count(dir) == 0) continue; - - // adjust indenter - while ((unsigned)d < indent.size()) - indent.resize(d); - - // pad - string pad = "______________________________________"; - pad.resize(depth*2+1-indent.size()); - if (!subtrees[dir].empty()) - pad[0] = '.'; // parent - - - string auth; - if (dir->is_auth()) - auth = "auth "; - else - auth = " rep "; - - char s[10]; - if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN) - sprintf(s, "%2d ", dir->get_dir_auth().first); - else - sprintf(s, "%2d,%2d", dir->get_dir_auth().first, dir->get_dir_auth().second); - - // print - dout(dbl) << indent << "|_" << pad << s << " " << auth << *dir << dendl; - - if (dir->ino() == MDS_INO_ROOT) - assert(dir->inode == root); - if (dir->ino() == MDS_INO_STRAY(mds->get_nodeid())) - assert(dir->inode == stray); - - // nested items? - if (!subtrees[dir].empty()) { - // more at my level? 
- if (!q.empty() && q.front().second == d) - indent += "| "; - else - indent += " "; - - for (set::iterator p = subtrees[dir].begin(); - p != subtrees[dir].end(); - ++p) - q.push_front(pair(*p, d+2)); - } - } - - // verify there isn't stray crap in subtree map - int lost = 0; - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - if (subtrees_seen.count(p->first)) continue; - dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl; - lost++; - } - assert(lost == 0); -} - - -void MDCache::show_cache() -{ - dout(7) << "show_cache" << dendl; - - for (hash_map::iterator it = inode_map.begin(); - it != inode_map.end(); - it++) { - // unlinked? - if (!it->second->parent) - dout(7) << " unlinked " << *it->second << dendl; - - // dirfrags? - list dfs; - it->second->get_dirfrags(dfs); - for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) { - CDir *dir = *p; - dout(7) << " dirfrag " << *dir << dendl; - - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - dout(7) << " dentry " << *dn << dendl; - if (dn->is_primary() && dn->inode) - dout(7) << " inode " << *dn->inode << dendl; - } - } - } -} - - -void MDCache::dump_cache() -{ - if (g_conf.debug_mds < 2) return; - - char fn[20]; - sprintf(fn, "cachedump.%d.mds%d", mds->mdsmap->get_epoch(), mds->get_nodeid()); - - dout(1) << "dump_cache to " << fn << dendl; - - ofstream myfile; - myfile.open(fn); - - for (hash_map::iterator it = inode_map.begin(); - it != inode_map.end(); - it++) { - list dfs; - it->second->get_dirfrags(dfs); - for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) { - CDir *dir = *p; - myfile << *dir->inode << std::endl; - myfile << *dir << std::endl; - - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - myfile << *dn << std::endl; - } - } - } - - myfile.close(); -} diff --git a/branches/sage/ebofs2/mds/MDCache.h 
b/branches/sage/ebofs2/mds/MDCache.h deleted file mode 100644 index 86e3b894c6c8d..0000000000000 --- a/branches/sage/ebofs2/mds/MDCache.h +++ /dev/null @@ -1,721 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __MDCACHE_H -#define __MDCACHE_H - -#include -#include -#include -#include -#include - -#include "include/types.h" -#include "include/filepath.h" - -#include "CInode.h" -#include "CDentry.h" -#include "CDir.h" -#include "include/Context.h" -#include "events/EMetaBlob.h" - -class MDS; -class Migrator; -class Renamer; - -class Logger; - -class Message; - -class MMDSResolve; -class MMDSResolveAck; -class MMDSCacheRejoin; -class MMDSCacheRejoinAck; -class MDiscover; -class MDiscoverReply; -class MCacheExpire; -class MDirUpdate; -class MDentryUnlink; -class MLock; - -class Message; -class MClientRequest; -class MMDSSlaveRequest; - -class MMDSFragmentNotify; - -class ESubtreeMap; - - -// MDCache - -//typedef const char* pchar; - - -struct PVList { - map ls; - - version_t add(MDSCacheObject* o, version_t v) { - return ls[o] = v; - } -}; - -/** active_request_t - * state we track for requests we are currently processing. - * mostly information about locks held, so that we can drop them all - * the request is finished or forwarded. see request_*(). - */ -struct MDRequest { - metareqid_t reqid; - - // -- i am a client (master) request - MClientRequest *client_request; // client request (if any) - - vector trace; // original path traversal. - CInode *ref; // reference inode. if there is only one, and its path is pinned. 
- - // -- i am a slave request - MMDSSlaveRequest *slave_request; // slave request (if one is pending; implies slave == true) - int slave_to_mds; // this is a slave request if >= 0. - - // -- misc -- - LogSegment *ls; // the log segment i'm committing to - utime_t now; - - // -- my pins and locks -- - // cache pins (so things don't expire) - set< MDSCacheObject* > pins; - set stickydirs; - - // auth pins - set< MDSCacheObject* > remote_auth_pins; - set< MDSCacheObject* > auth_pins; - - // held locks - set< SimpleLock* > rdlocks; // always local. - set< SimpleLock* > wrlocks; // always local. - set< SimpleLock* > xlocks; // local or remote. - set< SimpleLock*, SimpleLock::ptr_lt > locks; // full ordering - - // if this flag is set, do not attempt to acquire further locks. - // (useful for wrlock, which may be a moving auth target) - bool done_locking; - bool committing; - bool aborted; - - struct More { - set slaves; // mds nodes that have slave requests to me (implies client_request) - set waiting_on_slave; // peers i'm waiting for slavereq replies from. 
- - // for rename/link/unlink - set witnessed; // nodes who have journaled a RenamePrepare - map pvmap; - - // for rename - set extra_witnesses; // replica list from srcdn auth (rename) - version_t src_reanchor_atid; // src->dst - version_t dst_reanchor_atid; // dst->stray - bufferlist inode_import; - version_t inode_import_v; - CInode* destdn_was_remote_inode; - bool was_link_merge; - - // called when slave commits or aborts - Context *slave_commit; - - More() : - src_reanchor_atid(0), dst_reanchor_atid(0), inode_import_v(0), - destdn_was_remote_inode(0), was_link_merge(false), - slave_commit(0) { } - } *_more; - - - // --------------------------------------------------- - MDRequest() : - client_request(0), ref(0), - slave_request(0), slave_to_mds(-1), - ls(0), - done_locking(false), committing(false), aborted(false), - _more(0) {} - MDRequest(metareqid_t ri, MClientRequest *req) : - reqid(ri), client_request(req), ref(0), - slave_request(0), slave_to_mds(-1), - ls(0), - done_locking(false), committing(false), aborted(false), - _more(0) {} - MDRequest(metareqid_t ri, int by) : - reqid(ri), client_request(0), ref(0), - slave_request(0), slave_to_mds(by), - ls(0), - done_locking(false), committing(false), aborted(false), - _more(0) {} - ~MDRequest() { - delete _more; - } - - bool is_master() { return slave_to_mds < 0; } - bool is_slave() { return slave_to_mds >= 0; } - - More* more() { - if (!_more) _more = new More(); - return _more; - } - - bool slave_did_prepare() { return more()->slave_commit; } - - - // pin items in cache - void pin(MDSCacheObject *o) { - if (pins.count(o) == 0) { - o->get(MDSCacheObject::PIN_REQUEST); - pins.insert(o); - } - } - void set_stickydirs(CInode *in) { - if (stickydirs.count(in) == 0) { - in->get_stickydirs(); - stickydirs.insert(in); - } - } - - // auth pins - bool is_auth_pinned(MDSCacheObject *object) { - return auth_pins.count(object) || remote_auth_pins.count(object); - } - void auth_pin(MDSCacheObject *object) { - if 
(!is_auth_pinned(object)) { - object->auth_pin(); - auth_pins.insert(object); - } - } - void auth_unpin(MDSCacheObject *object) { - assert(is_auth_pinned(object)); - object->auth_unpin(); - auth_pins.erase(object); - } - void drop_local_auth_pins() { - for (set::iterator it = auth_pins.begin(); - it != auth_pins.end(); - it++) { - assert((*it)->is_auth()); - (*it)->auth_unpin(); - } - auth_pins.clear(); - } -}; - -inline ostream& operator<<(ostream& out, MDRequest &mdr) -{ - out << "request(" << mdr.reqid; - //if (mdr.request) out << " " << *mdr.request; - if (mdr.is_slave()) out << " slave_to mds" << mdr.slave_to_mds; - if (mdr.client_request) out << " cr=" << mdr.client_request; - if (mdr.slave_request) out << " sr=" << mdr.slave_request; - out << ")"; - return out; -} - -struct MDSlaveUpdate { - EMetaBlob commit; - EMetaBlob rollback; - xlist::item xlistitem; - Context *waiter; - MDSlaveUpdate() : xlistitem(this), waiter(0) {} - MDSlaveUpdate(EMetaBlob c, EMetaBlob r, xlist &list) : - commit(c), rollback(r), - xlistitem(this), - waiter(0) { - list.push_back(&xlistitem); - } - ~MDSlaveUpdate() { - if (waiter) waiter->finish(0); - delete waiter; - } -}; - - -class MDCache { - public: - // my master - MDS *mds; - - // -- my cache -- - LRU lru; // dentry lru for expiring items from cache - protected: - hash_map inode_map; // map of inodes by ino - CInode *root; // root inode - CInode *stray; // my stray dir - - set base_inodes; // inodes < MDS_INO_BASE (root, stray, etc.) - - // -- discover -- - // waiters - map > > waiting_for_base_ino; - - // in process discovers, by mds. - // this is just enough info to kick any waiters in the event of a failure. - // FIXME: use pointers here instead of identifiers? 
- map > discover_dir; - map > discover_dir_sub; - - void discover_base_ino(inodeno_t want_ino, Context *onfinish, int from=-1); - void discover_dir_frag(CInode *base, frag_t approx_fg, Context *onfinish, - int from=-1); - void discover_path(CInode *base, filepath want_path, Context *onfinish, - bool want_xlocked=false, int from=-1); - void discover_path(CDir *base, filepath want_path, Context *onfinish, - bool want_xlocked=false); - void discover_ino(CDir *base, inodeno_t want_ino, Context *onfinish, - bool want_xlocked=false); - - void kick_discovers(int who); // after a failure. - - -public: - int get_num_inodes() { return inode_map.size(); } - int get_num_dentries() { return lru.lru_get_size(); } - - - // -- subtrees -- -protected: - map > subtrees; // nested bounds on subtrees. - - // adjust subtree auth specification - // dir->dir_auth - // imports/exports/nested_exports - // join/split subtrees as appropriate -public: - bool is_subtrees() { return !subtrees.empty(); } - void list_subtrees(list& ls); - void adjust_subtree_auth(CDir *root, pair auth); - void adjust_subtree_auth(CDir *root, int a, int b=CDIR_AUTH_UNKNOWN) { - adjust_subtree_auth(root, pair(a,b)); - } - void adjust_bounded_subtree_auth(CDir *dir, set& bounds, pair auth); - void adjust_bounded_subtree_auth(CDir *dir, set& bounds, int a) { - adjust_bounded_subtree_auth(dir, bounds, pair(a, CDIR_AUTH_UNKNOWN)); - } - void adjust_bounded_subtree_auth(CDir *dir, list& bounds, pair auth); - void adjust_bounded_subtree_auth(CDir *dir, list& bounds, int a) { - adjust_bounded_subtree_auth(dir, bounds, pair(a, CDIR_AUTH_UNKNOWN)); - } - void map_dirfrag_set(list& dfs, set& result); - void try_subtree_merge(CDir *root); - void try_subtree_merge_at(CDir *root); - void subtree_merge_writebehind_finish(CInode *in, LogSegment *ls); - void eval_subtree_root(CDir *dir); - CDir *get_subtree_root(CDir *dir); - void remove_subtree(CDir *dir); - void get_subtree_bounds(CDir *root, set& bounds); - void 
get_wouldbe_subtree_bounds(CDir *root, set& bounds); - void verify_subtree_bounds(CDir *root, const set& bounds); - void verify_subtree_bounds(CDir *root, const list& bounds); - - void adjust_subtree_after_rename(CInode *diri, CDir *olddir); - - void get_auth_subtrees(set& s); - void get_fullauth_subtrees(set& s); - - int num_subtrees(); - int num_subtrees_fullauth(); - int num_subtrees_fullnonauth(); - - -protected: - // delayed cache expire - map > delayed_expire; // subtree root -> expire msg - - - // -- requests -- -protected: - hash_map active_requests; - -public: - MDRequest* request_start(MClientRequest *req); - MDRequest* request_start_slave(metareqid_t rid, int by); - bool have_request(metareqid_t rid) { - return active_requests.count(rid); - } - MDRequest* request_get(metareqid_t rid); - void request_pin_ref(MDRequest *r, CInode *ref, vector& trace); - void request_finish(MDRequest *mdr); - void request_forward(MDRequest *mdr, int mds, int port=0); - void dispatch_request(MDRequest *mdr); - void request_forget_foreign_locks(MDRequest *mdr); - void request_cleanup(MDRequest *r); - - - // inode purging - map > purging; // inode -> newsize -> oldsize - map > purging_ls; - map > > waiting_for_purge; - - // -- recovery -- -protected: - set recovery_set; - -public: - void set_recovery_set(set& s); - void handle_mds_failure(int who); - void handle_mds_recovery(int who); - -protected: - // [resolve] - // from EImportStart w/o EImportFinish during journal replay - map > my_ambiguous_imports; - // from MMDSResolves - map > > other_ambiguous_imports; - - map > uncommitted_slave_updates; // for replay. - map ambiguous_slave_updates; // for log trimming. 
- map waiting_for_slave_update_commit; - friend class ESlaveUpdate; - - set wants_resolve; // nodes i need to send my resolve to - set got_resolve; // nodes i got resolves from - set need_resolve_ack; // nodes i need a resolve_ack from - - void handle_resolve(MMDSResolve *m); - void handle_resolve_ack(MMDSResolveAck *m); - void maybe_resolve_finish(); - void disambiguate_imports(); - void recalc_auth_bits(); -public: - // ambiguous imports - void add_ambiguous_import(dirfrag_t base, list& bounds); - void add_ambiguous_import(CDir *base, const set& bounds); - bool have_ambiguous_import(dirfrag_t base) { - return my_ambiguous_imports.count(base); - } - void cancel_ambiguous_import(dirfrag_t dirino); - void finish_ambiguous_import(dirfrag_t dirino); - void send_resolve(int who); - void send_resolve_now(int who); - void send_resolve_later(int who); - void maybe_send_pending_resolves(); - - ESubtreeMap *create_subtree_map(); - - -protected: - // [rejoin] - set rejoin_gather; // nodes from whom i need a rejoin - set rejoin_sent; // nodes i sent a rejoin to - set rejoin_ack_gather; // nodes from whom i need a rejoin ack - - map > cap_exports; // ino -> client -> capex - map cap_export_paths; - - map > > cap_imports; // ino -> client -> frommds -> capex - map cap_import_paths; - - set rejoin_undef_inodes; - - void rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin); - void handle_cache_rejoin(MMDSCacheRejoin *m); - void handle_cache_rejoin_weak(MMDSCacheRejoin *m); - CInode* rejoin_invent_inode(inodeno_t ino); - void handle_cache_rejoin_strong(MMDSCacheRejoin *m); - void rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack); - void handle_cache_rejoin_ack(MMDSCacheRejoin *m); - void handle_cache_rejoin_purge(MMDSCacheRejoin *m); - void handle_cache_rejoin_missing(MMDSCacheRejoin *m); - void handle_cache_rejoin_full(MMDSCacheRejoin *m); - void rejoin_send_acks(); - void rejoin_trim_undef_inodes(); -public: - void rejoin_gather_finish(); - void rejoin_send_rejoins(); - 
void rejoin_export_caps(inodeno_t ino, string& path, int client, inode_caps_reconnect_t& icr) { - cap_exports[ino][client] = icr; - cap_export_paths[ino] = path; - } - void rejoin_recovered_caps(inodeno_t ino, string& path, int client, inode_caps_reconnect_t& icr, - int frommds=-1) { - cap_imports[ino][client][frommds] = icr; - cap_import_paths[ino] = path; - } - void rejoin_import_cap(CInode *in, int client, inode_caps_reconnect_t& icr, int frommds); - - - friend class Locker; - friend class Migrator; - friend class Renamer; - friend class MDBalancer; - - - public: - - // subsystems - Migrator *migrator; - Renamer *renamer; - - public: - MDCache(MDS *m); - ~MDCache(); - - // debug - void log_stat(Logger *logger); - - // root inode - CInode *get_root() { return root; } - void set_root(CInode *r); - CInode *get_stray() { return stray; } - - // cache - void set_cache_size(size_t max) { lru.lru_set_max(max); } - size_t get_cache_size() { return lru.lru_get_size(); } - - // trimming - bool trim(int max = -1); // trim cache - void trim_dentry(CDentry *dn, map& expiremap); - void trim_dirfrag(CDir *dir, CDir *con, - map& expiremap); - void trim_inode(CDentry *dn, CInode *in, CDir *con, - map& expiremap); - void send_expire_messages(map& expiremap); - void trim_non_auth(); // trim out trimmable non-auth items - - // shutdown - void shutdown_start(); - void shutdown_check(); - bool shutdown_pass(); - bool shutdown(); // clear cache (ie at shutodwn) - - bool did_shutdown_log_cap; - - // inode_map - bool have_inode( inodeno_t ino ) { return inode_map.count(ino) ? 
true:false; } - CInode* get_inode( inodeno_t ino ) { - if (have_inode(ino)) - return inode_map[ino]; - return NULL; - } - CDir* get_dirfrag(dirfrag_t df) { - if (!have_inode(df.ino)) return NULL; - return inode_map[df.ino]->get_dirfrag(df.frag); - } - /* - void get_dirfrags_under(dirfrag_t df, list& ls) { - if (have_inode(df.ino)) - inode_map[df.ino]->get_dirfrags_under(df.frag, ls); - } - */ - - MDSCacheObject *get_object(MDSCacheObjectInfo &info); - - - - public: - CInode *create_inode(); - void add_inode(CInode *in); - - void remove_inode(CInode *in); - protected: - void touch_inode(CInode *in) { - if (in->get_parent_dn()) - touch_dentry(in->get_parent_dn()); - } - void touch_dentry(CDentry *dn) { - // touch ancestors - if (dn->get_dir()->get_inode()->get_parent_dn()) - touch_dentry(dn->get_dir()->get_inode()->get_parent_dn()); - - // touch me - if (dn->is_auth()) - lru.lru_touch(dn); - else - lru.lru_midtouch(dn); - } - - void inode_remove_replica(CInode *in, int rep); - void dentry_remove_replica(CDentry *dn, int rep); - - void rename_file(CDentry *srcdn, CDentry *destdn); - - public: - // inode purging - void purge_inode(CInode *in, off_t newsize, off_t oldsize, LogSegment *ls); - void _do_purge_inode(CInode *in, off_t newsize, off_t oldsize); - void purge_inode_finish(CInode *in, off_t newsize, off_t oldsize); - void purge_inode_finish_2(CInode *in, off_t newsize, off_t oldsize); - bool is_purging(CInode *in, off_t newsize, off_t oldsize) { - return purging.count(in) && purging[in].count(newsize); - } - void wait_for_purge(CInode *in, off_t newsize, Context *c) { - waiting_for_purge[in][newsize].push_back(c); - } - - void add_recovered_purge(CInode *in, off_t newsize, off_t oldsize, LogSegment *ls); - void remove_recovered_purge(CInode *in, off_t newsize, off_t oldsize); - void start_recovered_purges(); - - - public: - CDir *get_auth_container(CDir *in); - CDir *get_export_container(CDir *dir); - void find_nested_exports(CDir *dir, set& s); - void 
find_nested_exports_under(CDir *import, CDir *dir, set& s); - - public: - CInode *create_root_inode(); - void open_root(Context *c); - CInode *create_stray_inode(int whose=-1); - void open_local_stray(); - void open_foreign_stray(int who, Context *c); - CDentry *get_or_create_stray_dentry(CInode *in); - - Context *_get_waiter(MDRequest *mdr, Message *req); - int path_traverse(MDRequest *mdr, Message *req, - CInode *base, filepath& path, - vector& trace, bool follow_trailing_sym, - int onfail); - bool path_is_mine(filepath& path); - bool path_is_mine(string& p) { - filepath path(p); - return path_is_mine(path); - } - CDir *path_traverse_to_dir(filepath& path); - - void open_remote_dirfrag(CInode *diri, frag_t fg, Context *fin); - CInode *get_dentry_inode(CDentry *dn, MDRequest *mdr); - void open_remote_ino(inodeno_t ino, MDRequest *mdr, Context *fin); - void open_remote_ino_2(inodeno_t ino, MDRequest *mdr, - vector& anchortrace, - Context *onfinish); - - C_Gather *parallel_fetch(map& pathmap); - - void make_trace(vector& trace, CInode *in); - - // -- anchors -- -public: - void anchor_create(MDRequest *mdr, CInode *in, Context *onfinish); - void anchor_destroy(CInode *in, Context *onfinish); -protected: - void _anchor_create_prepared(CInode *in, version_t atid); - void _anchor_create_logged(CInode *in, version_t atid, LogSegment *ls); - void _anchor_destroy_prepared(CInode *in, version_t atid); - void _anchor_destroy_logged(CInode *in, version_t atid, LogSegment *ls); - - friend class C_MDC_AnchorCreatePrepared; - friend class C_MDC_AnchorCreateLogged; - friend class C_MDC_AnchorDestroyPrepared; - friend class C_MDC_AnchorDestroyLogged; - - // -- stray -- -public: - void eval_stray(CDentry *dn); -protected: - void _purge_stray(CDentry *dn); - void _purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls); - friend class C_MDC_PurgeStray; - void reintegrate_stray(CDentry *dn, CDentry *rlink); - void migrate_stray(CDentry *dn, int dest); - - - // == messages == - 
public: - void dispatch(Message *m); - - protected: - // -- replicas -- - void handle_discover(MDiscover *dis); - void handle_discover_reply(MDiscoverReply *m); - - CDir* add_replica_dir(CInode *diri, - frag_t fg, CDirDiscover& dis, - int from, - list& finished); - CDir* forge_replica_dir(CInode *diri, frag_t fg, int from); - - CDentry *add_replica_dentry(CDir *dir, CDentryDiscover &dis, list& finished); -public: // for Server::handle_slave_rename_prep - CInode *add_replica_inode(CInodeDiscover& dis, CDentry *dn, list& finished); - -public: - CDentry *add_replica_stray(bufferlist &bl, CInode *strayin, int from); -protected: - - - - // -- namespace -- - void handle_dentry_unlink(MDentryUnlink *m); - - - // -- fragmenting -- -private: - void adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits, - list& frags, list& waiters); - friend class EFragment; - -public: - void split_dir(CDir *dir, int byn); - -private: - void fragment_freeze(CInode *diri, list& startfrags, frag_t basefrag, int bits); - void fragment_mark_and_complete(CInode *diri, list& startfrags, frag_t basefrag, int bits); - void fragment_go(CInode *diri, list& startfrags, frag_t basefrag, int bits); - void fragment_stored(CInode *diri, frag_t basefrag, int bits, list& resultfrags); - void fragment_logged(CInode *diri, frag_t basefrag, int bits, list& resultfrags, vector& pvs, LogSegment *ls); - friend class C_MDC_FragmentGo; - friend class C_MDC_FragmentMarking; - friend class C_MDC_FragmentStored; - friend class C_MDC_FragmentLogged; - - void handle_fragment_notify(MMDSFragmentNotify *m); - - - // -- updates -- - //int send_inode_updates(CInode *in); - //void handle_inode_update(MInodeUpdate *m); - - int send_dir_updates(CDir *in, bool bcast=false); - void handle_dir_update(MDirUpdate *m); - - // -- cache expiration -- - void handle_cache_expire(MCacheExpire *m); - void process_delayed_expire(CDir *dir); - void discard_delayed_expire(CDir *dir); - - - // == crap fns == - public: - void 
show_cache(); - void dump_cache(); - void show_subtrees(int dbl=10); - - CInode *hack_pick_random_inode() { - assert(!inode_map.empty()); - int n = rand() % inode_map.size(); - hash_map::iterator p = inode_map.begin(); - while (n--) p++; - return p->second; - } - -}; - -class C_MDS_RetryRequest : public Context { - MDCache *cache; - MDRequest *mdr; - public: - C_MDS_RetryRequest(MDCache *c, MDRequest *r) : cache(c), mdr(r) {} - virtual void finish(int r) { - cache->dispatch_request(mdr); - } -}; - -#endif diff --git a/branches/sage/ebofs2/mds/MDLog.cc b/branches/sage/ebofs2/mds/MDLog.cc deleted file mode 100644 index eeea99c721751..0000000000000 --- a/branches/sage/ebofs2/mds/MDLog.cc +++ /dev/null @@ -1,505 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "MDLog.h" -#include "MDS.h" -#include "MDCache.h" -#include "LogEvent.h" - -#include "osdc/Journaler.h" - -#include "common/LogType.h" -#include "common/Logger.h" - -#include "events/ESubtreeMap.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".log " -#define derr(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) *_derr << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".log " - -// cons/des - -LogType mdlog_logtype; - - -MDLog::~MDLog() -{ - if (journaler) { delete journaler; journaler = 0; } - if (logger) { delete logger; logger = 0; } -} - - -void MDLog::reopen_logger(utime_t start, bool append) -{ - // logger - char name[80]; - sprintf(name, "mds%d.log", mds->get_nodeid()); - logger = new Logger(name, &mdlog_logtype, append); - logger->set_start(start); - - static bool didit = false; - if (!didit) { - didit = true; - mdlog_logtype.add_inc("evadd"); - mdlog_logtype.add_inc("evex"); - mdlog_logtype.add_inc("evtrm"); - mdlog_logtype.add_set("ev"); - mdlog_logtype.add_set("evexg"); - mdlog_logtype.add_set("evexd"); - - mdlog_logtype.add_inc("segadd"); - mdlog_logtype.add_inc("segex"); - mdlog_logtype.add_inc("segtrm"); - mdlog_logtype.add_set("seg"); - mdlog_logtype.add_set("segexg"); - mdlog_logtype.add_set("segexd"); - - mdlog_logtype.add_set("expos"); - mdlog_logtype.add_set("wrpos"); - - mdlog_logtype.add_avg("jlat"); - } - -} - -void MDLog::init_journaler() -{ - // inode - memset(&log_inode, 0, sizeof(log_inode)); - log_inode.ino = MDS_INO_LOG_OFFSET + mds->get_nodeid(); - log_inode.layout = g_OSD_MDLogLayout; - - if (g_conf.mds_local_osd) - log_inode.layout.fl_pg_preferred = mds->get_nodeid() + g_conf.mds_local_osd_offset; // hack - - // log streamer - if (journaler) delete journaler; - journaler = new Journaler(log_inode, mds->objecter, logger, &mds->mds_lock); -} - -void MDLog::write_head(Context *c) 
-{ - journaler->write_head(c); -} - -off_t MDLog::get_read_pos() -{ - return journaler->get_read_pos(); -} - -off_t MDLog::get_write_pos() -{ - return journaler->get_write_pos(); -} - - - -void MDLog::create(Context *c) -{ - dout(5) << "create empty log" << dendl; - init_journaler(); - journaler->reset(); - write_head(c); - - logger->set("expos", journaler->get_expire_pos()); - logger->set("wrpos", journaler->get_write_pos()); -} - -void MDLog::open(Context *c) -{ - dout(5) << "open discovering log bounds" << dendl; - init_journaler(); - journaler->recover(c); - - // either append() or replay() will follow. -} - -void MDLog::append() -{ - dout(5) << "append positioning at end" << dendl; - journaler->set_read_pos(journaler->get_write_pos()); - journaler->set_expire_pos(journaler->get_write_pos()); - - logger->set("expos", journaler->get_write_pos()); -} - - - -// ------------------------------------------------- - -void MDLog::submit_entry( LogEvent *le, Context *c ) -{ - if (!g_conf.mds_log) { - // hack: log is disabled. - if (c) { - c->finish(0); - delete c; - } - return; - } - - dout(5) << "submit_entry " << journaler->get_write_pos() << " : " << *le << dendl; - - // let the event register itself in the segment - assert(!segments.empty()); - le->_segment = segments.rbegin()->second; - le->_segment->num_events++; - le->update_segment(); - - num_events++; - assert(!capped); - - // encode it, with event type - { - bufferlist bl; - ::_encode(le->_type, bl); - le->encode_payload(bl); - - // journal it. - journaler->append_entry(bl); // bl is destroyed. - } - - delete le; - - if (logger) { - logger->inc("evadd"); - logger->set("ev", num_events); - logger->set("wrpos", journaler->get_write_pos()); - } - - if (c) { - unflushed = 0; - journaler->flush(c); - } - else - unflushed++; - - // start a new segment? - // FIXME: should this go elsewhere? 
- off_t last_seg = get_last_segment_offset(); - if (!segments.empty() && - !writing_subtree_map && - (journaler->get_write_pos() / ceph_file_layout_period(log_inode.layout) != (last_seg / ceph_file_layout_period(log_inode.layout)) && - (journaler->get_write_pos() - last_seg > ceph_file_layout_period(log_inode.layout)/2))) { - dout(10) << "submit_entry also starting new segment: last = " << last_seg - << ", cur pos = " << journaler->get_write_pos() << dendl; - start_new_segment(); - } -} - -void MDLog::wait_for_sync( Context *c ) -{ - if (g_conf.mds_log) { - // wait - journaler->flush(c); - } else { - // hack: bypass. - c->finish(0); - delete c; - } -} - -void MDLog::flush() -{ - if (unflushed) - journaler->flush(); - unflushed = 0; - - // trim - trim(); -} - -void MDLog::cap() -{ - dout(5) << "cap" << dendl; - capped = true; -} - - -// ----------------------------- -// segments - -void MDLog::start_new_segment(Context *onsync) -{ - dout(7) << "start_new_segment at " << journaler->get_write_pos() << dendl; - assert(!writing_subtree_map); - - segments[journaler->get_write_pos()] = new LogSegment(journaler->get_write_pos()); - - writing_subtree_map = true; - - ESubtreeMap *le = mds->mdcache->create_subtree_map(); - submit_entry(le, new C_MDL_WroteSubtreeMap(this, mds->mdlog->get_write_pos())); - if (onsync) - wait_for_sync(onsync); - - logger->inc("segadd"); - logger->set("seg", segments.size()); -} - -void MDLog::_logged_subtree_map(off_t off) -{ - dout(10) << "_logged_subtree_map at " << off << dendl; - writing_subtree_map = false; - - /* - list ls; - take_subtree_map_expire_waiters(ls); - mds->queue_waiters(ls); - */ -} - - - -void MDLog::trim() -{ - // trim! 
- dout(10) << "trim " - << segments.size() << " / " << max_segments << " segments, " - << num_events << " / " << max_events << " events" - << ", " << expiring_segments.size() << " (" << expiring_events << ") expiring" - << ", " << expired_segments.size() << " (" << expired_events << ") expired" - << dendl; - - if (segments.empty()) return; - - // hack: only trim for a few seconds at a time - utime_t stop = g_clock.now(); - stop += 2.0; - - map::iterator p = segments.begin(); - int left = num_events; - while (p != segments.end() && - ((max_events >= 0 && left-expiring_events-expired_events > max_events) || - (max_segments >= 0 && (int)(segments.size()-expiring_segments.size()-expired_segments.size()) > max_segments))) { - - if (stop < g_clock.now()) - break; - - if ((int)expiring_segments.size() >= g_conf.mds_log_max_expiring) - break; - - // look at first segment - LogSegment *ls = p->second; - assert(ls); - - p++; - - left -= ls->num_events; - - if (expiring_segments.count(ls)) { - dout(5) << "trim already expiring segment " << ls->offset << ", " << ls->num_events << " events" << dendl; - } else if (expired_segments.count(ls)) { - dout(5) << "trim already expired segment " << ls->offset << ", " << ls->num_events << " events" << dendl; - } else { - try_expire(ls); - } - } -} - - -void MDLog::try_expire(LogSegment *ls) -{ - C_Gather *exp = ls->try_to_expire(mds); - if (exp) { - assert(expiring_segments.count(ls) == 0); - expiring_segments.insert(ls); - expiring_events += ls->num_events; - dout(5) << "try_expire expiring segment " << ls->offset << dendl; - exp->set_finisher(new C_MaybeExpiredSegment(this, ls)); - } else { - dout(10) << "try_expire expired segment " << ls->offset << dendl; - _expired(ls); - } - - logger->set("segexg", expiring_segments.size()); - logger->set("evexg", expiring_events); -} - -void MDLog::_maybe_expired(LogSegment *ls) -{ - dout(10) << "_maybe_expired segment " << ls->offset << " " << ls->num_events << " events" << dendl; - 
assert(expiring_segments.count(ls)); - expiring_segments.erase(ls); - expiring_events -= ls->num_events; - try_expire(ls); -} - -void MDLog::_expired(LogSegment *ls) -{ - dout(5) << "_expired segment " << ls->offset << " " << ls->num_events << " events" << dendl; - - if (!capped && ls == get_current_segment()) { - dout(5) << "_expired not expiring " << ls->offset << ", last one and !capped" << dendl; - } else { - // expired. - expired_segments.insert(ls); - expired_events += ls->num_events; - - logger->inc("evex", ls->num_events); - logger->inc("segex"); - - // trim expired segments? - while (!segments.empty()) { - ls = segments.begin()->second; - if (!expired_segments.count(ls)) break; - - expired_events -= ls->num_events; - expired_segments.erase(ls); - num_events -= ls->num_events; - - journaler->set_expire_pos(ls->offset); // this was the oldest segment, adjust expire pos - journaler->write_head(0); - - logger->set("expos", ls->offset); - logger->inc("segtrm"); - logger->inc("evtrm", ls->num_events); - - segments.erase(ls->offset); - delete ls; - } - } - - logger->set("ev", num_events); - logger->set("evexd", expired_events); - logger->set("seg", segments.size()); - logger->set("segexd", expired_segments.size()); -} - - - -void MDLog::replay(Context *c) -{ - assert(journaler->is_active()); - - // start reading at the last known expire point. - journaler->set_read_pos( journaler->get_expire_pos() ); - - // empty? - if (journaler->get_read_pos() == journaler->get_write_pos()) { - dout(10) << "replay - journal empty, done." << dendl; - if (c) { - c->finish(0); - delete c; - } - return; - } - - // add waiter - if (c) - waitfor_replay.push_back(c); - - // go! 
- dout(10) << "replay start, from " << journaler->get_read_pos() - << " to " << journaler->get_write_pos() << dendl; - - assert(num_events == 0); - - replay_thread.create(); -} - -class C_MDL_Replay : public Context { - MDLog *mdlog; -public: - C_MDL_Replay(MDLog *l) : mdlog(l) {} - void finish(int r) { - mdlog->replay_cond.Signal(); - } -}; - - - -// i am a separate thread -void MDLog::_replay_thread() -{ - mds->mds_lock.Lock(); - dout(10) << "_replay_thread start" << dendl; - - // loop - off_t new_expire_pos = journaler->get_expire_pos(); - while (1) { - // wait for read? - while (!journaler->is_readable() && - journaler->get_read_pos() < journaler->get_write_pos()) { - journaler->wait_for_readable(new C_MDL_Replay(this)); - replay_cond.Wait(mds->mds_lock); - } - - if (!journaler->is_readable() && - journaler->get_read_pos() == journaler->get_write_pos()) - break; - - assert(journaler->is_readable()); - - // read it - off_t pos = journaler->get_read_pos(); - bufferlist bl; - bool r = journaler->try_read_entry(bl); - assert(r); - - // unpack event - LogEvent *le = LogEvent::decode(bl); - - // new segment? - if (le->get_type() == EVENT_SUBTREEMAP) { - segments[pos] = new LogSegment(pos); - logger->set("seg", segments.size()); - } - - // have we seen an import map yet? - if (segments.empty()) { - dout(10) << "_replay " << pos << " / " << journaler->get_write_pos() - << " -- waiting for subtree_map. (skipping " << *le << ")" << dendl; - } else { - dout(10) << "_replay " << pos << " / " << journaler->get_write_pos() - << " : " << *le << dendl; - le->_segment = get_current_segment(); // replay may need this - le->_segment->num_events++; - num_events++; - - le->replay(mds); - - if (!new_expire_pos) - new_expire_pos = pos; - } - delete le; - - logger->set("rdpos", pos); - - // drop lock for a second, so other events/messages (e.g. beacon timer!) can go off - mds->mds_lock.Unlock(); - mds->mds_lock.Lock(); - } - - // done! 
- assert(journaler->get_read_pos() == journaler->get_write_pos()); - dout(10) << "_replay - complete, " << num_events << " events, new read/expire pos is " << new_expire_pos << dendl; - - // move read pointer _back_ to first subtree map we saw, for eventual trimming - journaler->set_read_pos(new_expire_pos); - journaler->set_expire_pos(new_expire_pos); - logger->set("expos", new_expire_pos); - - // kick waiter(s) - list ls; - ls.swap(waitfor_replay); - finish_contexts(ls,0); - - dout(10) << "_replay_thread finish" << dendl; - mds->mds_lock.Unlock(); -} - - - diff --git a/branches/sage/ebofs2/mds/MDLog.h b/branches/sage/ebofs2/mds/MDLog.h deleted file mode 100644 index f7bdcd21a5303..0000000000000 --- a/branches/sage/ebofs2/mds/MDLog.h +++ /dev/null @@ -1,195 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MDLOG_H -#define __MDLOG_H - -#include "include/types.h" -#include "include/Context.h" - -#include "common/Thread.h" -#include "common/Cond.h" - -#include "LogSegment.h" - -#include - -//#include -//using __gnu_cxx::hash_mapset; - -class Journaler; -class LogEvent; -class MDS; -class LogSegment; -class ESubtreeMap; - -class Logger; - -#include -using std::map; - - -class MDLog { - protected: - MDS *mds; - int num_events; // in events - int max_events; - int max_segments; - - int unflushed; - - bool capped; - - inode_t log_inode; - Journaler *journaler; - - Logger *logger; - - - // -- replay -- - Cond replay_cond; - - class ReplayThread : public Thread { - MDLog *log; - public: - ReplayThread(MDLog *l) : log(l) {} - void* entry() { - log->_replay_thread(); - return 0; - } - } replay_thread; - - friend class ReplayThread; - friend class C_MDL_Replay; - - list waitfor_replay; - - void _replay(); // old way - void _replay_thread(); // new way - - - // -- segments -- - map segments; - set expiring_segments; - set expired_segments; - int expiring_events; - int expired_events; - - class C_MDL_WroteSubtreeMap : public Context { - MDLog *mdlog; - off_t off; - public: - C_MDL_WroteSubtreeMap(MDLog *l, off_t o) : mdlog(l), off(o) { } - void finish(int r) { - mdlog->_logged_subtree_map(off); - } - }; - void _logged_subtree_map(off_t off); - - - // -- subtreemaps -- - bool writing_subtree_map; // one is being written now - - friend class ESubtreeMap; - friend class C_MDS_WroteImportMap; - friend class MDCache; - -public: - off_t get_last_segment_offset() { - assert(!segments.empty()); - return segments.rbegin()->first; - } - - -private: - void init_journaler(); - -public: - void reopen_logger(utime_t start, bool append=false); - - // replay state - map > pending_exports; - - - -public: - MDLog(MDS *m) : mds(m), - num_events(0), - max_events(g_conf.mds_log_max_events), - max_segments(g_conf.mds_log_max_segments), - unflushed(0), - capped(false), - 
journaler(0), - logger(0), - replay_thread(this), - expiring_events(0), expired_events(0), - writing_subtree_map(false) { - } - ~MDLog(); - - - void start_new_segment(Context *onsync=0); - LogSegment *get_current_segment() { - return segments.empty() ? 0:segments.rbegin()->second; - } - - - void flush_logger(); - - size_t get_num_events() { return num_events; } - void set_max_events(int m) { max_events = m; } - size_t get_num_segments() { return segments.size(); } - void set_max_segments(int m) { max_segments = m; } - - off_t get_read_pos(); - off_t get_write_pos(); - bool empty() { return segments.empty(); } - - bool is_capped() { return capped; } - void cap(); - - void submit_entry( LogEvent *e, Context *c = 0 ); - void wait_for_sync( Context *c ); - void flush(); - -private: - class C_MaybeExpiredSegment : public Context { - MDLog *mdlog; - LogSegment *ls; - public: - C_MaybeExpiredSegment(MDLog *mdl, LogSegment *s) : mdlog(mdl), ls(s) {} - void finish(int res) { - mdlog->_maybe_expired(ls); - } - }; - - void try_expire(LogSegment *ls); - void _maybe_expired(LogSegment *ls); - void _expired(LogSegment *ls); - -public: - void trim(); - -private: - void write_head(Context *onfinish); - -public: - void create(Context *onfinish); // fresh, empty log! - void open(Context *onopen); // append() or replay() to follow! - void append(); - void replay(Context *onfinish); -}; - -#endif diff --git a/branches/sage/ebofs2/mds/MDS.cc b/branches/sage/ebofs2/mds/MDS.cc deleted file mode 100644 index 69cc54a6bc61f..0000000000000 --- a/branches/sage/ebofs2/mds/MDS.cc +++ /dev/null @@ -1,1295 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. 
See file COPYING. - * - */ - - - -#include "include/types.h" -#include "common/Clock.h" - -#include "msg/Messenger.h" - -#include "osd/OSDMap.h" -#include "osdc/Objecter.h" -#include "osdc/Filer.h" - -#include "MDSMap.h" - -#include "MDS.h" -#include "Server.h" -#include "Locker.h" -#include "MDCache.h" -#include "MDLog.h" -#include "MDBalancer.h" -#include "IdAllocator.h" -#include "Migrator.h" -//#include "Renamer.h" - -#include "AnchorTable.h" -#include "AnchorClient.h" - -#include "common/Logger.h" -#include "common/LogType.h" - -#include "common/Timer.h" - -#include "events/ESession.h" - -#include "messages/MMDSMap.h" -#include "messages/MMDSBeacon.h" - -#include "messages/MPing.h" -#include "messages/MPingAck.h" -#include "messages/MGenericMessage.h" - -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" - -#include "messages/MClientRequest.h" -#include "messages/MClientRequestForward.h" - -#include "messages/MAnchor.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " mds" << whoami << " " -#define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) *_derr << dbeginl << g_clock.now() << " mds" << whoami << " " - - - - - -// cons/des -MDS::MDS(int whoami, Messenger *m, MonMap *mm) : - timer(mds_lock), - clientmap(this) { - - this->whoami = whoami; - - monmap = mm; - messenger = m; - - mdsmap = new MDSMap; - osdmap = new OSDMap; - - objecter = new Objecter(messenger, monmap, osdmap, mds_lock); - filer = new Filer(objecter); - - mdcache = new MDCache(this); - mdlog = new MDLog(this); - balancer = new MDBalancer(this); - - anchorclient = new AnchorClient(this); - idalloc = new IdAllocator(this); - - anchortable = new AnchorTable(this); - - server = new Server(this); - locker = new Locker(this, mdcache); - - // clients - last_client_mdsmap_bcast = 0; - - // beacon - beacon_last_seq = 0; - beacon_sender = 0; - beacon_killer = 0; - - // tick - tick_event = 0; - - req_rate = 
0; - - want_state = state = MDSMap::STATE_DNE; - - logger = logger2 = 0; - - // i'm ready! - messenger->set_dispatcher(this); -} - -MDS::~MDS() { - Mutex::Locker lock(mds_lock); - - if (mdcache) { delete mdcache; mdcache = NULL; } - if (mdlog) { delete mdlog; mdlog = NULL; } - if (balancer) { delete balancer; balancer = NULL; } - if (idalloc) { delete idalloc; idalloc = NULL; } - if (anchortable) { delete anchortable; anchortable = NULL; } - if (anchorclient) { delete anchorclient; anchorclient = NULL; } - if (osdmap) { delete osdmap; osdmap = 0; } - if (mdsmap) { delete mdsmap; mdsmap = 0; } - - if (server) { delete server; server = 0; } - if (locker) { delete locker; locker = 0; } - - if (filer) { delete filer; filer = 0; } - if (objecter) { delete objecter; objecter = 0; } - if (messenger) { delete messenger; messenger = NULL; } - - if (logger) { delete logger; logger = 0; } - if (logger2) { delete logger2; logger2 = 0; } - -} - - -void MDS::reopen_logger(utime_t start) -{ - static LogType mds_logtype, mds_cache_logtype; - static bool didit = false; - if (!didit) { - didit = true; - - //mds_logtype.add_inc("req"); - mds_logtype.add_inc("reply"); - mds_logtype.add_inc("fw"); - - mds_logtype.add_inc("dir_f"); - mds_logtype.add_inc("dir_c"); - //mds_logtype.add_inc("mkdir"); - - /* - mds_logtype.add_inc("newin"); // new inodes (pre)loaded - mds_logtype.add_inc("newt"); // inodes first touched/used - mds_logtype.add_inc("outt"); // trimmed touched - mds_logtype.add_inc("outut"); // trimmed untouched (wasted effort) - mds_logtype.add_avg("oututl"); // avg trim latency for untouched - - mds_logtype.add_inc("dirt1"); - mds_logtype.add_inc("dirt2"); - mds_logtype.add_inc("dirt3"); - mds_logtype.add_inc("dirt4"); - mds_logtype.add_inc("dirt5"); - */ - - mds_logtype.add_set("c"); - mds_logtype.add_set("ctop"); - mds_logtype.add_set("cbot"); - mds_logtype.add_set("cptail"); - mds_logtype.add_set("cpin"); - mds_logtype.add_inc("cex"); - mds_logtype.add_inc("dis"); - - 
mds_logtype.add_inc("t"); - mds_logtype.add_inc("thit"); - mds_logtype.add_inc("tfw"); - mds_logtype.add_inc("tdis"); - mds_logtype.add_inc("tdirf"); - mds_logtype.add_inc("trino"); - mds_logtype.add_inc("tlock"); - - mds_logtype.add_set("l"); - mds_logtype.add_set("q"); - mds_logtype.add_set("popanyd"); - mds_logtype.add_set("popnest"); - - mds_logtype.add_set("buf"); - - mds_logtype.add_set("sm"); - mds_logtype.add_inc("ex"); - mds_logtype.add_inc("iex"); - mds_logtype.add_inc("im"); - mds_logtype.add_inc("iim"); - /* - mds_logtype.add_inc("imex"); - mds_logtype.add_set("nex"); - mds_logtype.add_set("nim"); - */ - - mds_logtype.add_avg("replyl"); - - } - - if (whoami < 0) return; - - // flush+close old log - if (logger) delete logger; - if (logger2) delete logger2; - - // log - char name[80]; - sprintf(name, "mds%d", whoami); - - bool append = mdsmap->get_inc(whoami) > 1; - - logger = new Logger(name, (LogType*)&mds_logtype, append); - logger->set_start(start); - - char n[80]; - sprintf(n, "mds%d.cache", whoami); - logger2 = new Logger(n, (LogType*)&mds_cache_logtype, append); - logger2->set_start(start); - - mdlog->reopen_logger(start, append); - server->reopen_logger(start, append); -} - -void MDS::send_message_mds(Message *m, int mds) -{ - // send mdsmap first? - if (peer_mdsmap_epoch[mds] < mdsmap->get_epoch()) { - messenger->send_message(new MMDSMap(mdsmap), - mdsmap->get_inst(mds)); - peer_mdsmap_epoch[mds] = mdsmap->get_epoch(); - } - - // send message - messenger->send_message(m, mdsmap->get_inst(mds)); -} - -void MDS::forward_message_mds(Message *req, int mds) -{ - // client request? 
- if (req->get_type() == MSG_CLIENT_REQUEST) { - MClientRequest *creq = (MClientRequest*)req; - creq->inc_num_fwd(); // inc forward counter - - // tell the client where it should go - messenger->send_message(new MClientRequestForward(creq->get_tid(), mds, creq->get_num_fwd()), - creq->get_client_inst()); - - if (!creq->is_idempotent()) { - delete req; - return; // don't actually forward if non-idempotent! client has to do it. - } - } - - // forward - send_message_mds(req, mds); -} - - - -void MDS::send_message_client(Message *m, int client) -{ - version_t seq = clientmap.inc_push_seq(client); - dout(10) << "send_message_client client" << client << " seq " << seq << " " << *m << dendl; - messenger->send_message(m, clientmap.get_inst(client)); -} - -void MDS::send_message_client(Message *m, entity_inst_t clientinst) -{ - version_t seq = clientmap.inc_push_seq(clientinst.name.num()); - dout(10) << "send_message_client client" << clientinst.name.num() << " seq " << seq << " " << *m << dendl; - messenger->send_message(m, clientinst); -} - - -class C_MDS_SendMessageClientSession : public Context { - MDS *mds; - Message *msg; - entity_inst_t clientinst; -public: - C_MDS_SendMessageClientSession(MDS *md, Message *ms, entity_inst_t& ci) : - mds(md), msg(ms), clientinst(ci) {} - void finish(int r) { - mds->clientmap.open_session(clientinst); - mds->send_message_client(msg, clientinst.name.num()); - } -}; - -void MDS::send_message_client_maybe_opening(Message *m, int c) -{ - send_message_client_maybe_open(m, clientmap.get_inst(c)); -} - -void MDS::send_message_client_maybe_open(Message *m, entity_inst_t clientinst) -{ - // FIXME - // _most_ ppl shoudl check for a client session, since migration may call this, - // start opening, and then e.g. locker sends something else (through non-maybe_open - // version) - int client = clientinst.name.num(); - if (!clientmap.have_session(client)) { - // no session! 
- dout(10) << "send_message_client opening session with " << clientinst << dendl; - clientmap.add_opening(client); - mdlog->submit_entry(new ESession(clientinst, true, clientmap.inc_projected()), - new C_MDS_SendMessageClientSession(this, m, clientinst)); - } else { - // we have a session. - send_message_client(m, clientinst); - } -} - - - -int MDS::init(bool standby) -{ - mds_lock.Lock(); - - // starting beacon. this will induce an MDSMap from the monitor - want_state = MDSMap::STATE_BOOT; - want_rank = whoami; - beacon_start(); - whoami = -1; - messenger->reset_myname(entity_name_t::MDS(whoami)); - - objecter->init(); - - // schedule tick - reset_tick(); - - mds_lock.Unlock(); - return 0; -} - -void MDS::reset_tick() -{ - // cancel old - if (tick_event) timer.cancel_event(tick_event); - - // schedule - tick_event = new C_MDS_Tick(this); - timer.add_event_after(g_conf.mon_tick_interval, tick_event); -} - -void MDS::tick() -{ - tick_event = 0; - - // reschedule - reset_tick(); - - // log - mds_load_t load = balancer->get_load(); - - if (logger) { - req_rate = logger->get("req"); - - logger->fset("l", (int)load.mds_load()); - logger->set("q", messenger->get_dispatch_queue_len()); - logger->set("buf", buffer_total_alloc); - logger->set("sm", mdcache->num_subtrees()); - - mdcache->log_stat(logger); - } - - if (is_active() || is_stopping()) - locker->scatter_unscatter_autoscattered(); - - // booted? 
- if (is_active()) { - - // balancer - balancer->tick(); - - } -} - - - - -// ----------------------- -// beacons - -void MDS::beacon_start() -{ - beacon_send(); // send first beacon - - //reset_beacon_killer(); // schedule killer -} - - - -void MDS::beacon_send() -{ - ++beacon_last_seq; - dout(10) << "beacon_send " << MDSMap::get_state_name(want_state) - << " seq " << beacon_last_seq - << " (currently " << MDSMap::get_state_name(state) << ")" - << dendl; - - beacon_seq_stamp[beacon_last_seq] = g_clock.now(); - - int mon = monmap->pick_mon(); - messenger->send_message(new MMDSBeacon(messenger->get_myinst(), mdsmap->get_epoch(), - want_state, beacon_last_seq, want_rank), - monmap->get_inst(mon)); - - // schedule next sender - if (beacon_sender) timer.cancel_event(beacon_sender); - beacon_sender = new C_MDS_BeaconSender(this); - timer.add_event_after(g_conf.mds_beacon_interval, beacon_sender); -} - -void MDS::handle_mds_beacon(MMDSBeacon *m) -{ - dout(10) << "handle_mds_beacon " << MDSMap::get_state_name(m->get_state()) - << " seq " << m->get_seq() << dendl; - version_t seq = m->get_seq(); - - // update lab - if (beacon_seq_stamp.count(seq)) { - assert(beacon_seq_stamp[seq] > beacon_last_acked_stamp); - beacon_last_acked_stamp = beacon_seq_stamp[seq]; - - // clean up seq_stamp map - while (!beacon_seq_stamp.empty() && - beacon_seq_stamp.begin()->first <= seq) - beacon_seq_stamp.erase(beacon_seq_stamp.begin()); - - reset_beacon_killer(); - } - - delete m; -} - -void MDS::reset_beacon_killer() -{ - utime_t when = beacon_last_acked_stamp; - when += g_conf.mds_beacon_grace; - - dout(15) << "reset_beacon_killer last_acked_stamp at " << beacon_last_acked_stamp - << ", will die at " << when << dendl; - - if (beacon_killer) timer.cancel_event(beacon_killer); - - beacon_killer = new C_MDS_BeaconKiller(this, beacon_last_acked_stamp); - timer.add_event_at(when, beacon_killer); -} - -void MDS::beacon_kill(utime_t lab) -{ - if (lab == beacon_last_acked_stamp) { - dout(0) << 
"beacon_kill last_acked_stamp " << lab - << ", killing myself." - << dendl; - suicide(); - } else { - dout(20) << "beacon_kill last_acked_stamp " << beacon_last_acked_stamp - << " != my " << lab - << ", doing nothing." - << dendl; - } -} - - - -void MDS::handle_mds_map(MMDSMap *m) -{ - version_t hadepoch = mdsmap->get_epoch(); - version_t epoch = m->get_epoch(); - dout(5) << "handle_mds_map epoch " << epoch << " from " << m->get_source() << dendl; - - // note source's map version - if (m->get_source().is_mds() && - peer_mdsmap_epoch[m->get_source().num()] < epoch) { - dout(15) << " peer " << m->get_source() - << " has mdsmap epoch >= " << epoch - << dendl; - peer_mdsmap_epoch[m->get_source().num()] = epoch; - } - - // is it new? - if (epoch <= mdsmap->get_epoch()) { - dout(5) << " old map epoch " << epoch << " <= " << mdsmap->get_epoch() - << ", discarding" << dendl; - delete m; - return; - } - - // keep old map, for a moment - MDSMap *oldmap = mdsmap; - int oldwhoami = whoami; - int oldstate = state; - - // decode and process - mdsmap = new MDSMap; - mdsmap->decode(m->get_encoded()); - - // see who i am - whoami = mdsmap->get_addr_rank(messenger->get_myaddr()); - if (whoami < 0) { - if (mdsmap->is_standby(messenger->get_myaddr())) { - if (state != MDSMap::STATE_STANDBY) { - want_state = state = MDSMap::STATE_STANDBY; - dout(1) << "handle_mds_map standby" << dendl; - } - goto out; - } - dout(1) << "handle_mds_map i'm not in the mdsmap, killing myself" << dendl; - suicide(); - goto out; - } - - // open logger? - // note that fakesyn/newsyn starts knowing who they are - if (whoami >= 0 && - mdsmap->is_up(whoami) && - (oldwhoami != whoami || !logger)) - reopen_logger(mdsmap->get_create()); // adopt mds cluster timeline - - if (oldwhoami != whoami) { - // update messenger. - dout(1) << "handle_mds_map i am now mds" << whoami - << " incarnation " << mdsmap->get_inc(whoami) - << dendl; - messenger->reset_myname(entity_name_t::MDS(whoami)); - - // do i need an osdmap? 
- if (oldwhoami < 0) { - // we need an osdmap too. - int mon = monmap->pick_mon(); - messenger->send_message(new MOSDGetMap(0), - monmap->get_inst(mon)); - } - } - - // tell objecter my incarnation - if (objecter->get_client_incarnation() < 0 && - mdsmap->have_inst(whoami)) { - assert(mdsmap->get_inc(whoami) > 0); - objecter->set_client_incarnation(mdsmap->get_inc(whoami)); - } - - // for debug - if (g_conf.mds_dump_cache_on_map) - mdcache->dump_cache(); - - // update my state - state = mdsmap->get_state(whoami); - - // did it change? - if (oldstate != state) { - if (state == want_state) { - dout(1) << "handle_mds_map new state " << mdsmap->get_state_name(state) << dendl; - } else { - dout(1) << "handle_mds_map new state " << mdsmap->get_state_name(state) - // << ", although i wanted " << mdsmap->get_state_name(want_state) - << dendl; - want_state = state; - } - - // now active? - if (is_active()) { - // did i just recover? - if (oldstate == MDSMap::STATE_REJOIN || - oldstate == MDSMap::STATE_RECONNECT) - recovery_done(); - finish_contexts(waiting_for_active); // kick waiters - } else if (is_replay()) { - replay_start(); - } else if (is_resolve()) { - resolve_start(); - } else if (is_reconnect()) { - reconnect_start(); - } else if (is_stopping()) { - assert(oldstate == MDSMap::STATE_ACTIVE); - stopping_start(); - } else if (is_stopped()) { - assert(oldstate == MDSMap::STATE_STOPPING); - suicide(); - return; - } - } - - - // RESOLVE - // is someone else newly resolving? - if (is_resolve() || is_rejoin() || is_active() || is_stopping()) { - set oldresolve, resolve; - oldmap->get_mds_set(oldresolve, MDSMap::STATE_RESOLVE); - mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE); - if (oldresolve != resolve) { - dout(10) << "resolve set is " << resolve << ", was " << oldresolve << dendl; - for (set::iterator p = resolve.begin(); p != resolve.end(); ++p) - if (*p != whoami && - oldresolve.count(*p) == 0) - mdcache->send_resolve(*p); // now or later. 
- } - } - - // REJOIN - // is everybody finally rejoining? - if (is_rejoin() || is_active() || is_stopping()) { - // did we start? - if (!oldmap->is_rejoining() && mdsmap->is_rejoining()) - rejoin_joint_start(); - - // did we finish? - if (g_conf.mds_dump_cache_after_rejoin && - oldmap->is_rejoining() && !mdsmap->is_rejoining()) - mdcache->dump_cache(); // for DEBUG only - } - if (oldmap->is_degraded() && !mdsmap->is_degraded() && state >= MDSMap::STATE_ACTIVE) - dout(1) << "cluster recovered." << dendl; - - // did someone go active? - if (is_active() || is_stopping()) { - set oldactive, active; - oldmap->get_mds_set(oldactive, MDSMap::STATE_ACTIVE); - mdsmap->get_mds_set(active, MDSMap::STATE_ACTIVE); - for (set::iterator p = active.begin(); p != active.end(); ++p) - if (*p != whoami && // not me - oldactive.count(*p) == 0) // newly so? - handle_mds_recovery(*p); - } - - // did someone fail or stop? - if (is_active() || is_stopping()) { - // new failed? - set oldfailed, failed; - oldmap->get_mds_set(oldfailed, MDSMap::STATE_FAILED); - mdsmap->get_mds_set(failed, MDSMap::STATE_FAILED); - for (set::iterator p = failed.begin(); p != failed.end(); ++p) - if (oldfailed.count(*p) == 0) - mdcache->handle_mds_failure(*p); - - // or down then up? - // did their addr/inst change? - set up; - mdsmap->get_up_mds_set(up); - for (set::iterator p = up.begin(); p != up.end(); ++p) - if (oldmap->have_inst(*p) && - oldmap->get_inst(*p) != mdsmap->get_inst(*p)) - mdcache->handle_mds_failure(*p); - - // did anyone stop? - set oldstopped, stopped; - oldmap->get_mds_set(oldstopped, MDSMap::STATE_STOPPED); - mdsmap->get_mds_set(stopped, MDSMap::STATE_STOPPED); - for (set::iterator p = stopped.begin(); p != stopped.end(); ++p) - if (oldstopped.count(*p) == 0) // newly so? - mdcache->migrator->handle_mds_failure_or_stop(*p); - } - - // just got mdsmap+osdmap? 
- if (hadepoch == 0 && - mdsmap->get_epoch() > 0 && - osdmap->get_epoch() > 0) { - boot(); - } else if (want_state != state) { - // resend beacon. - beacon_send(); - } - - out: - delete m; - delete oldmap; -} - -void MDS::bcast_mds_map() -{ - dout(7) << "bcast_mds_map " << mdsmap->get_epoch() << dendl; - - // share the map with mounted clients - for (set::const_iterator p = clientmap.get_session_set().begin(); - p != clientmap.get_session_set().end(); - ++p) { - messenger->send_message(new MMDSMap(mdsmap), - clientmap.get_inst(*p)); - } - last_client_mdsmap_bcast = mdsmap->get_epoch(); -} - - -void MDS::handle_osd_map(MOSDMap *m) -{ - version_t hadepoch = osdmap->get_epoch(); - dout(10) << "handle_osd_map had " << hadepoch << dendl; - - // process - objecter->handle_osd_map(m); - - // just got mdsmap+osdmap? - if (hadepoch == 0 && - osdmap->get_epoch() > 0 && - mdsmap->get_epoch() > 0) - boot(); -} - - -void MDS::set_want_state(int s) -{ - dout(3) << "set_want_state " << MDSMap::get_state_name(s) << dendl; - want_state = s; - beacon_send(); -} - -void MDS::boot() -{ - if (is_creating()) - boot_create(); // new tables, journal - else if (is_starting() || is_replay()) - boot_start(); // start|replay, join - else - assert(is_standby()); -} - - -class C_MDS_CreateFinish : public Context { - MDS *mds; -public: - C_MDS_CreateFinish(MDS *m) : mds(m) {} - void finish(int r) { mds->creating_done(); } -}; - -void MDS::boot_create() -{ - dout(3) << "boot_create" << dendl; - - C_Gather *fin = new C_Gather(new C_MDS_CreateFinish(this)); - - CDir *rootdir = 0; - if (whoami == 0) { - dout(3) << "boot_create since i am also mds0, creating root inode and dir" << dendl; - - // create root inode. 
- mdcache->open_root(0); - CInode *root = mdcache->get_root(); - assert(root); - - // force empty root dir - rootdir = root->get_dirfrag(frag_t()); - rootdir->mark_complete(); - } - - // create my stray dir - CDir *straydir; - { - dout(10) << "boot_create creating local stray dir" << dendl; - mdcache->open_local_stray(); - CInode *stray = mdcache->get_stray(); - straydir = stray->get_dirfrag(frag_t()); - straydir->mark_complete(); - } - - // start with a fresh journal - dout(10) << "boot_create creating fresh journal" << dendl; - mdlog->create(fin->new_sub()); - - // write our first subtreemap - mdlog->start_new_segment(fin->new_sub()); - - // dirty, commit (root and) stray dir(s) - if (whoami == 0) { - rootdir->mark_dirty(rootdir->pre_dirty(), mdlog->get_current_segment()); - rootdir->commit(0, fin->new_sub()); - } - straydir->mark_dirty(straydir->pre_dirty(), mdlog->get_current_segment()); - straydir->commit(0, fin->new_sub()); - - // fixme: fake out idalloc (reset, pretend loaded) - dout(10) << "boot_create creating fresh idalloc table" << dendl; - idalloc->reset(); - idalloc->save(fin->new_sub()); - - // write empty clientmap - clientmap.save(fin->new_sub()); - - // fixme: fake out anchortable - if (mdsmap->get_anchortable() == whoami) { - dout(10) << "boot_create creating fresh anchortable" << dendl; - anchortable->create_fresh(); - anchortable->save(fin->new_sub()); - } -} - -void MDS::creating_done() -{ - dout(1)<< "creating_done" << dendl; - set_want_state(MDSMap::STATE_ACTIVE); -} - - -class C_MDS_BootStart : public Context { - MDS *mds; - int nextstep; -public: - C_MDS_BootStart(MDS *m, int n) : mds(m), nextstep(n) {} - void finish(int r) { mds->boot_start(nextstep); } -}; - -void MDS::boot_start(int step) -{ - switch (step) { - case 0: - step = 1; // fall-thru. 
- - case 1: - { - C_Gather *gather = new C_Gather(new C_MDS_BootStart(this, 2)); - dout(2) << "boot_start " << step << ": opening idalloc" << dendl; - idalloc->load(gather->new_sub()); - - dout(2) << "boot_start " << step << ": opening clientmap" << dendl; - clientmap.load(gather->new_sub()); - - if (mdsmap->get_anchortable() == whoami) { - dout(2) << "boot_start " << step << ": opening anchor table" << dendl; - anchortable->load(gather->new_sub()); - } - - dout(2) << "boot_start " << step << ": opening mds log" << dendl; - mdlog->open(gather->new_sub()); - } - break; - - case 2: - if (is_replay()) { - dout(2) << "boot_start " << step << ": replaying mds log" << dendl; - mdlog->replay(new C_MDS_BootStart(this, 3)); - break; - } else { - dout(2) << "boot_start " << step << ": positioning at end of old mds log" << dendl; - mdlog->append(); - step++; - } - - case 3: - if (is_replay()) { - replay_done(); - break; - } - - // starting only - assert(is_starting()); - if (mdsmap->get_root() == whoami) { - dout(2) << "boot_start " << step << ": opening root directory" << dendl; - mdcache->open_root(new C_MDS_BootStart(this, 4)); - break; - } - step++; - - case 4: - dout(2) << "boot_start " << step << ": opening local stray directory" << dendl; - mdcache->open_local_stray(); - - starting_done(); - break; - } -} - -void MDS::starting_done() -{ - dout(3) << "starting_done" << dendl; - assert(is_starting()); - set_want_state(MDSMap::STATE_ACTIVE); - - // start new segment - mdlog->start_new_segment(0); -} - - -void MDS::replay_start() -{ - dout(1) << "replay_start" << dendl; - - // initialize gather sets - set rs; - mdsmap->get_recovery_mds_set(rs); - rs.erase(whoami); - dout(1) << "now replay. my recovery peers are " << rs << dendl; - mdcache->set_recovery_set(rs); - - // start? 
- if (osdmap->get_epoch() > 0 && - mdsmap->get_epoch() > 0) - boot_start(); -} - -void MDS::replay_done() -{ - dout(1) << "replay_done" << dendl; - - if (mdsmap->get_num_in_mds() == 1 && - mdsmap->get_num_mds(MDSMap::STATE_FAILED) == 0) { // just me! - dout(2) << "i am alone, moving to state reconnect" << dendl; - set_want_state(MDSMap::STATE_RECONNECT); - } else { - dout(2) << "i am not alone, moving to state resolve" << dendl; - set_want_state(MDSMap::STATE_RESOLVE); - } - - // start new segment - mdlog->start_new_segment(0); -} - - -void MDS::resolve_start() -{ - dout(1) << "resolve_start" << dendl; - - set who; - mdsmap->get_mds_set(who, MDSMap::STATE_RESOLVE); - mdsmap->get_mds_set(who, MDSMap::STATE_REJOIN); - mdsmap->get_mds_set(who, MDSMap::STATE_ACTIVE); - mdsmap->get_mds_set(who, MDSMap::STATE_STOPPING); - for (set::iterator p = who.begin(); p != who.end(); ++p) { - if (*p == whoami) continue; - mdcache->send_resolve(*p); // now. - } -} -void MDS::resolve_done() -{ - dout(1) << "resolve_done" << dendl; - set_want_state(MDSMap::STATE_RECONNECT); -} - -void MDS::reconnect_start() -{ - dout(1) << "reconnect_start" << dendl; - server->reconnect_clients(); -} -void MDS::reconnect_done() -{ - dout(1) << "reconnect_done" << dendl; - set_want_state(MDSMap::STATE_REJOIN); // move to rejoin state - - /* - if (mdsmap->get_num_in_mds() == 1 && - mdsmap->get_num_mds(MDSMap::STATE_FAILED) == 0) { // just me! - - // finish processing caps (normally, this happens during rejoin, but we're skipping that...) 
- mdcache->rejoin_gather_finish(); - - set_want_state(MDSMap::STATE_ACTIVE); // go active - } else { - set_want_state(MDSMap::STATE_REJOIN); // move to rejoin state - } - */ -} - -void MDS::rejoin_joint_start() -{ - dout(1) << "rejoin_joint_start" << dendl; - mdcache->rejoin_send_rejoins(); -} -void MDS::rejoin_done() -{ - dout(1) << "rejoin_done" << dendl; - mdcache->show_subtrees(); - mdcache->show_cache(); - set_want_state(MDSMap::STATE_ACTIVE); -} - - -void MDS::recovery_done() -{ - dout(1) << "recovery_done -- successful recovery!" << dendl; - assert(is_active()); - - // kick anchortable (resent AGREEs) - if (mdsmap->get_anchortable() == whoami) - anchortable->finish_recovery(); - - // kick anchorclient (resent COMMITs) - anchorclient->finish_recovery(); - - mdcache->start_recovered_purges(); - - // tell connected clients - bcast_mds_map(); -} - -void MDS::handle_mds_recovery(int who) -{ - dout(5) << "handle_mds_recovery mds" << who << dendl; - - mdcache->handle_mds_recovery(who); - - if (anchortable) - anchortable->handle_mds_recovery(who); - anchorclient->handle_mds_recovery(who); - - queue_waiters(waiting_for_active_peer[who]); - waiting_for_active_peer.erase(who); -} - -void MDS::stopping_start() -{ - dout(2) << "stopping_start" << dendl; - - // start cache shutdown - mdcache->shutdown_start(); - - // terminate client sessions - server->terminate_sessions(); -} - -void MDS::stopping_done() -{ - dout(2) << "stopping_done" << dendl; - - // tell monitor we shut down cleanly. 
- set_want_state(MDSMap::STATE_STOPPED); -} - - - -void MDS::suicide() -{ - dout(1) << "suicide" << dendl; - - // stop timers - if (beacon_killer) { - timer.cancel_event(beacon_killer); - beacon_killer = 0; - } - if (beacon_sender) { - timer.cancel_event(beacon_sender); - beacon_sender = 0; - } - if (tick_event) { - timer.cancel_event(tick_event); - tick_event = 0; - } - timer.cancel_all(); - //timer.join(); // this will deadlock from beacon_kill -> suicide - - // shut down cache - mdcache->shutdown(); - - objecter->shutdown(); - - // shut down messenger - messenger->shutdown(); -} - - - - - -void MDS::dispatch(Message *m) -{ - mds_lock.Lock(); - my_dispatch(m); - mds_lock.Unlock(); -} - - - -void MDS::my_dispatch(Message *m) -{ - // from bad mds? - if (m->get_source().is_mds()) { - int from = m->get_source().num(); - if (!mdsmap->have_inst(from) || - mdsmap->get_inst(from) != m->get_source_inst() || - mdsmap->is_down(from)) { - // bogus mds? - if (m->get_type() == MSG_MDS_MAP) { - dout(5) << "got " << *m << " from old/bad/imposter mds " << m->get_source() - << ", but it's an mdsmap, looking at it" << dendl; - } else if (m->get_type() == MSG_MDS_CACHEEXPIRE && - mdsmap->get_inst(from) == m->get_source_inst()) { - dout(5) << "got " << *m << " from down mds " << m->get_source() - << ", but it's a cache_expire, looking at it" << dendl; - } else { - dout(5) << "got " << *m << " from down/old/bad/imposter mds " << m->get_source() - << ", dropping" << dendl; - delete m; - return; - } - } - } - - - int port = m->get_type() & 0xff00; - switch (port) { - case MDS_PORT_CACHE: - mdcache->dispatch(m); - break; - - case MDS_PORT_LOCKER: - locker->dispatch(m); - break; - - case MDS_PORT_MIGRATOR: - mdcache->migrator->dispatch(m); - break; - - default: - switch (m->get_type()) { - // SERVER - case MSG_CLIENT_SESSION: - case MSG_CLIENT_REQUEST: - case MSG_MDS_SLAVE_REQUEST: - server->dispatch(m); - break; - - case MSG_MDS_HEARTBEAT: - balancer->proc_message(m); - break; - - // 
anchor - case MSG_MDS_ANCHOR: - if (((MAnchor*)m)->get_op() < 0) - anchorclient->dispatch(m); - else - anchortable->dispatch(m); - break; - - // OSD - case MSG_OSD_OPREPLY: - objecter->handle_osd_op_reply((class MOSDOpReply*)m); - break; - case MSG_OSD_MAP: - handle_osd_map((MOSDMap*)m); - break; - - // MDS - case MSG_MDS_MAP: - handle_mds_map((MMDSMap*)m); - break; - case MSG_MDS_BEACON: - handle_mds_beacon((MMDSBeacon*)m); - break; - - default: - dout(1) << "MDS unknown messge " << m->get_type() << dendl; - assert(0); - } - } - - // finish any triggered contexts - if (finished_queue.size()) { - dout(7) << "mds has " << finished_queue.size() << " queued contexts" << dendl; - dout(10) << finished_queue << dendl; - list ls; - ls.splice(ls.begin(), finished_queue); - assert(finished_queue.empty()); - finish_contexts(ls); - } - - - // HACK FOR NOW - if (is_active()) { - // flush log to disk after every op. for now. - mdlog->flush(); - - // trim cache - mdcache->trim(); - } - - - // hack: thrash exports - static utime_t start; - utime_t now = g_clock.now(); - if (start == utime_t()) - start = now; - double el = now - start; - if (el > 30.0 && - el < 60.0) - for (int i=0; i s; - if (!is_active()) break; - mdsmap->get_mds_set(s, MDSMap::STATE_ACTIVE); - if (s.size() < 2 || mdcache->get_num_inodes() < 10) - break; // need peers for this to work. - - dout(7) << "mds thrashing exports pass " << (i+1) << "/" << g_conf.mds_thrash_exports << dendl; - - // pick a random dir inode - CInode *in = mdcache->hack_pick_random_inode(); - - list ls; - in->get_dirfrags(ls); - if (ls.empty()) continue; // must be an open dir. - CDir *dir = ls.front(); - if (!dir->get_parent_dir()) continue; // must be linked. - if (!dir->is_auth()) continue; // must be auth. 
- - int dest; - do { - int k = rand() % s.size(); - set::iterator p = s.begin(); - while (k--) p++; - dest = *p; - } while (dest == whoami); - mdcache->migrator->export_dir_nicely(dir,dest); - } - // hack: thrash exports - for (int i=0; ihack_pick_random_inode(); - - list ls; - in->get_dirfrags(ls); - if (ls.empty()) continue; // must be an open dir. - CDir *dir = ls.front(); - if (!dir->get_parent_dir()) continue; // must be linked. - if (!dir->is_auth()) continue; // must be auth. - mdcache->split_dir(dir, 1);// + (rand() % 3)); - } - - // hack: force hash root? - /* - if (false && - mdcache->get_root() && - mdcache->get_root()->dir && - !(mdcache->get_root()->dir->is_hashed() || - mdcache->get_root()->dir->is_hashing())) { - dout(0) << "hashing root" << dendl; - mdcache->migrator->hash_dir(mdcache->get_root()->dir); - } - */ - - - - // shut down? - if (is_stopping()) { - if (mdcache->shutdown_pass()) { - dout(7) << "shutdown_pass=true, finished w/ shutdown, moving to down:stopped" << dendl; - stopping_done(); - } - } - -} - - -void MDS::proc_message(Message *m) -{ - switch (m->get_type()) { - - // OSD - case MSG_OSD_OPREPLY: - objecter->handle_osd_op_reply((class MOSDOpReply*)m); - return; - case MSG_OSD_MAP: - handle_osd_map((MOSDMap*)m); - return; - - - // MDS - case MSG_MDS_MAP: - handle_mds_map((MMDSMap*)m); - return; - case MSG_MDS_BEACON: - handle_mds_beacon((MMDSBeacon*)m); - return; - - default: - assert(0); - } - -} - - - -void MDS::ms_handle_failure(Message *m, const entity_inst_t& inst) -{ - mds_lock.Lock(); - dout(10) << "handle_ms_failure to " << inst << " on " << *m << dendl; - - if (m->get_type() == MSG_MDS_MAP && m->get_dest().is_client()) - server->client_reconnect_failure(m->get_dest().num()); - - delete m; - mds_lock.Unlock(); -} - diff --git a/branches/sage/ebofs2/mds/MDS.h b/branches/sage/ebofs2/mds/MDS.h deleted file mode 100644 index 7dcd921d05f4e..0000000000000 --- a/branches/sage/ebofs2/mds/MDS.h +++ /dev/null @@ -1,298 +0,0 @@ -// -*- 
mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __MDS_H -#define __MDS_H - -#include -#include -#include -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "mdstypes.h" - -#include "msg/Dispatcher.h" -#include "include/types.h" -#include "include/Context.h" -#include "common/DecayCounter.h" -#include "common/Logger.h" -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/Timer.h" - -#include "mon/MonMap.h" -#include "MDSMap.h" - -#include "ClientMap.h" - - - - -class filepath; - -class OSDMap; -class Objecter; -class Filer; - -class Server; -class Locker; -class AnchorTable; -class AnchorClient; -class MDCache; -class MDLog; -class MDBalancer; -class IdAllocator; - -class CInode; -class CDir; -class CDentry; - -class Messenger; -class Message; - -class MClientRequest; -class MClientReply; -class MHashReaddir; -class MHashReaddirReply; - -class MMDSBeacon; - - -class MDS : public Dispatcher { - public: - Mutex mds_lock; - - SafeTimer timer; - - protected: - int whoami; - - public: - Messenger *messenger; - MDSMap *mdsmap; - MonMap *monmap; - OSDMap *osdmap; - Objecter *objecter; - Filer *filer; // for reading/writing to/from osds - - // sub systems - Server *server; - MDCache *mdcache; - Locker *locker; - MDLog *mdlog; - MDBalancer *balancer; - - IdAllocator *idalloc; - - AnchorTable *anchortable; - AnchorClient *anchorclient; - - Logger *logger, *logger2; - - - protected: - // -- MDS state -- - int state; // my confirmed state - int want_state; // the state i want - int want_rank; // the mds rank i want - - list waiting_for_active; - map 
> waiting_for_active_peer; - - map peer_mdsmap_epoch; - - public: - void wait_for_active(Context *c) { - waiting_for_active.push_back(c); - } - void wait_for_active_peer(int who, Context *c) { - waiting_for_active_peer[who].push_back(c); - } - - int get_state() { return state; } - bool is_dne() { return state == MDSMap::STATE_DNE; } - bool is_failed() { return state == MDSMap::STATE_FAILED; } - bool is_creating() { return state == MDSMap::STATE_CREATING; } - bool is_starting() { return state == MDSMap::STATE_STARTING; } - bool is_standby() { return state == MDSMap::STATE_STANDBY; } - bool is_replay() { return state == MDSMap::STATE_REPLAY; } - bool is_resolve() { return state == MDSMap::STATE_RESOLVE; } - bool is_reconnect() { return state == MDSMap::STATE_RECONNECT; } - bool is_rejoin() { return state == MDSMap::STATE_REJOIN; } - bool is_active() { return state == MDSMap::STATE_ACTIVE; } - bool is_stopping() { return state == MDSMap::STATE_STOPPING; } - bool is_stopped() { return state == MDSMap::STATE_STOPPED; } - - void set_want_state(int s); - - - // -- waiters -- - list finished_queue; - - void queue_waiter(Context *c) { - finished_queue.push_back(c); - } - void queue_waiters(list& ls) { - finished_queue.splice( finished_queue.end(), ls ); - } - - // -- keepalive beacon -- - version_t beacon_last_seq; // last seq sent to monitor - map beacon_seq_stamp; // seq # -> time sent - utime_t beacon_last_acked_stamp; // last time we sent a beacon that got acked - - class C_MDS_BeaconSender : public Context { - MDS *mds; - public: - C_MDS_BeaconSender(MDS *m) : mds(m) {} - void finish(int r) { - mds->beacon_sender = 0; - mds->beacon_send(); - } - } *beacon_sender; - class C_MDS_BeaconKiller : public Context { - MDS *mds; - utime_t lab; - public: - C_MDS_BeaconKiller(MDS *m, utime_t l) : mds(m), lab(l) {} - void finish(int r) { - if (mds->beacon_killer) { - mds->beacon_killer = 0; - mds->beacon_kill(lab); - } - // else mds is pbly already shutting down - } - } 
*beacon_killer; - - // tick and other timer fun - class C_MDS_Tick : public Context { - MDS *mds; - public: - C_MDS_Tick(MDS *m) : mds(m) {} - void finish(int r) { - mds->tick_event = 0; - mds->tick(); - } - } *tick_event; - void reset_tick(); - - // -- client map -- - ClientMap clientmap; - epoch_t last_client_mdsmap_bcast; - //void log_clientmap(Context *c); - - - // shutdown crap - int req_rate; - - // ino's and fh's - public: - - int get_req_rate() { return req_rate; } - - - public: - MDS(int whoami, Messenger *m, MonMap *mm); - ~MDS(); - - // who am i etc - int get_nodeid() { return whoami; } - MDSMap *get_mds_map() { return mdsmap; } - OSDMap *get_osd_map() { return osdmap; } - - void send_message_mds(Message *m, int mds); - void forward_message_mds(Message *req, int mds); - - void send_message_client(Message *m, int client); - void send_message_client(Message *m, entity_inst_t clientinst); - void send_message_client_maybe_opening(Message *m, int); - void send_message_client_maybe_open(Message *m, entity_inst_t clientinst); - - - // start up, shutdown - int init(bool standby=false); - void reopen_logger(utime_t start); - - void bcast_mds_map(); // to mounted clients - - void boot(); - void boot_create(); // i am new mds. 
- void boot_start(int step=0); // starting|replay - - void replay_start(); - void creating_done(); - void starting_done(); - void replay_done(); - - void resolve_start(); - void resolve_done(); - void reconnect_start(); - void reconnect_done(); - void rejoin_joint_start(); - void rejoin_done(); - void recovery_done(); - void handle_mds_recovery(int who); - - void stopping_start(); - void stopping_done(); - void suicide(); - - void tick(); - - void beacon_start(); - void beacon_send(); - void beacon_kill(utime_t lab); - void handle_mds_beacon(MMDSBeacon *m); - void reset_beacon_killer(); - - // messages - void proc_message(Message *m); - virtual void dispatch(Message *m); - void my_dispatch(Message *m); - - void ms_handle_failure(Message *m, const entity_inst_t& inst); - - // special message types - void handle_mds_map(class MMDSMap *m); - - // osds - void handle_osd_map(class MOSDMap *m); -}; - - - -class C_MDS_RetryMessage : public Context { - Message *m; - MDS *mds; -public: - C_MDS_RetryMessage(MDS *mds, Message *m) { - assert(m); - this->m = m; - this->mds = mds; - } - virtual void finish(int r) { - mds->my_dispatch(m); - } -}; - - - -#endif diff --git a/branches/sage/ebofs2/mds/MDSMap.h b/branches/sage/ebofs2/mds/MDSMap.h deleted file mode 100644 index 9a4371609c7eb..0000000000000 --- a/branches/sage/ebofs2/mds/MDSMap.h +++ /dev/null @@ -1,380 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MDSMAP_H -#define __MDSMAP_H - -#include "common/Clock.h" -#include "msg/Message.h" - -#include "include/types.h" - -#include -#include -#include -using namespace std; - - -/* - - beautiful state diagram: - - STOPPED DNE FAILED - / | \ / | | - / | \________ _______/ | | -| v v v v | -| STARTING <--> STANDBY <--> CREATING | -| \ / | -| \____ ____________/ | - \ v v | - \ ACTIVE <-- REJOIN <-- RECONNECT <-- REPLAY - \ | - \ | - \ v - \-- STOPPING - - - new states: - - boot --> standby, creating, or starting. - - - dne ----> creating -----> active* - ^ ^___________/ / ^ ^ - | / / | - destroying / / | - ^ / / | - | / / | - stopped <---- stopping* <-/ / | - \ / | - ----- starting* ----/ | - | - failed | - \ | - \--> replay* --> reconnect* --> rejoin* - - * = can fail - -*/ - - -class MDSMap { - public: - // mds states - static const int STATE_DNE = 0; // down, never existed. - static const int STATE_DESTROYING = -1; // down, existing, semi-destroyed. - static const int STATE_STOPPED = -2; // down, once existed, but no subtrees. empty log. - static const int STATE_FAILED = 3; // down, active subtrees; needs to be recovered. - - static const int STATE_BOOT = -4; // up, boot announcement. destiny unknown. - static const int STATE_STANDBY = -5; // up, idle. waiting for assignment by monitor. - - static const int STATE_CREATING = -6; // up, creating MDS instance (new journal, idalloc..). - static const int STATE_STARTING = -7; // up, starting prior stopped MDS instance. - - static const int STATE_REPLAY = 8; // up, starting prior failed instance. scanning journal. - static const int STATE_RESOLVE = 9; // up, disambiguating distributed operations (import, rename, etc.) 
- static const int STATE_RECONNECT = 10; // up, reconnect to clients - static const int STATE_REJOIN = 11; // up, replayed journal, rejoining distributed cache - static const int STATE_ACTIVE = 12; // up, active - static const int STATE_STOPPING = 13; // up, exporting metadata (-> standby or out) - - static const char *get_state_name(int s) { - switch (s) { - // down and out - case STATE_DNE: return "down:dne"; - case STATE_DESTROYING: return "down:destroying"; - case STATE_STOPPED: return "down:stopped"; - // down and in - case STATE_FAILED: return "down:failed"; - // up and out - case STATE_BOOT: return "up:boot"; - case STATE_STANDBY: return "up:standby"; - case STATE_CREATING: return "up:creating"; - case STATE_STARTING: return "up:starting"; - // up and in - case STATE_REPLAY: return "up:replay"; - case STATE_RESOLVE: return "up:resolve"; - case STATE_RECONNECT: return "up:reconnect"; - case STATE_REJOIN: return "up:rejoin"; - case STATE_ACTIVE: return "up:active"; - case STATE_STOPPING: return "up:stopping"; - default: assert(0); - } - return 0; - } - - protected: - epoch_t epoch; - epoch_t client_epoch; // incremented only when change is significant to client. 
- utime_t created; - - int32_t max_mds; - int32_t anchortable; // which MDS has anchortable (fixme someday) - int32_t root; // which MDS has root directory - - map mds_state; // MDS state - map mds_state_seq; - map mds_inst; // up instances - map mds_inc; // incarnation count (monotonically increases) - - map standby; // -1 == any - map > standby_for; - set standby_any; - - friend class MDSMonitor; - - public: - MDSMap() : epoch(0), client_epoch(0), anchortable(0), root(0) {} - - epoch_t get_epoch() const { return epoch; } - void inc_epoch() { epoch++; } - - const utime_t& get_create() const { return created; } - - int get_anchortable() const { return anchortable; } - int get_root() const { return root; } - - // counts - int get_num_mds() { - return get_num_in_mds(); - } - int get_num_mds(int state) { - int n = 0; - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (p->second == state) ++n; - return n; - } - - int get_num_in_mds() { - int n = 0; - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (p->second > 0) ++n; - return n; - } - - // sets - void get_mds_set(set& s) { - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - s.insert(p->first); - } - void get_mds_set(set& s, int state) { - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (p->second == state) - s.insert(p->first); - } - void get_up_mds_set(set& s) { - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (is_up(p->first)) s.insert(p->first); - } - void get_in_mds_set(set& s) { - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (is_in(p->first)) s.insert(p->first); - } - void get_active_mds_set(set& s) { - get_mds_set(s, MDSMap::STATE_ACTIVE); - } - void get_failed_mds_set(set& s) { - get_mds_set(s, MDSMap::STATE_FAILED); - } - void get_recovery_mds_set(set& s) { - for (map::const_iterator p = 
mds_state.begin(); - p != mds_state.end(); - p++) - if (is_failed(p->first) || - (p->second >= STATE_REPLAY && p->second <= STATE_STOPPING)) - s.insert(p->first); - } - - int get_random_in_mds() { - vector v; - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (p->second > 0) v.push_back(p->first); - if (v.empty()) - return -1; - else - return v[rand() % v.size()]; - } - - - // mds states - bool is_down(int m) { return is_dne(m) || is_stopped(m) || is_failed(m); } - bool is_up(int m) { return !is_down(m); } - bool is_in(int m) { return mds_state.count(m) && mds_state[m] > 0; } - bool is_out(int m) { return !mds_state.count(m) || mds_state[m] <= 0; } - - bool is_dne(int m) { return mds_state.count(m) == 0 || mds_state[m] == STATE_DNE; } - bool is_failed(int m) { return mds_state.count(m) && mds_state[m] == STATE_FAILED; } - - bool is_boot(int m) { return mds_state.count(m) && mds_state[m] == STATE_BOOT; } - bool is_creating(int m) { return mds_state.count(m) && mds_state[m] == STATE_CREATING; } - bool is_starting(int m) { return mds_state.count(m) && mds_state[m] == STATE_STARTING; } - bool is_replay(int m) { return mds_state.count(m) && mds_state[m] == STATE_REPLAY; } - bool is_resolve(int m) { return mds_state.count(m) && mds_state[m] == STATE_RESOLVE; } - bool is_reconnect(int m) { return mds_state.count(m) && mds_state[m] == STATE_RECONNECT; } - bool is_rejoin(int m) { return mds_state.count(m) && mds_state[m] == STATE_REJOIN; } - bool is_active(int m) { return mds_state.count(m) && mds_state[m] == STATE_ACTIVE; } - bool is_stopping(int m) { return mds_state.count(m) && mds_state[m] == STATE_STOPPING; } - bool is_active_or_stopping(int m) { return is_active(m) || is_stopping(m); } - bool is_stopped(int m) { return mds_state.count(m) && mds_state[m] == STATE_STOPPED; } - - bool is_standby(entity_addr_t a) { return standby.count(a); } - - // cluster states - bool is_full() { - return get_num_in_mds() >= max_mds; - } - bool 
is_degraded() { // degraded = some recovery in process. fixes active membership and recovery_set. - return - get_num_mds(STATE_REPLAY) + - get_num_mds(STATE_RESOLVE) + - get_num_mds(STATE_RECONNECT) + - get_num_mds(STATE_REJOIN) + - get_num_mds(STATE_FAILED); - } - bool is_rejoining() { - // nodes are rejoining cache state - return - get_num_mds(STATE_REJOIN) > 0 && - get_num_mds(STATE_REPLAY) == 0 && - get_num_mds(STATE_RECONNECT) == 0 && - get_num_mds(STATE_RESOLVE) == 0 && - get_num_mds(STATE_FAILED) == 0; - } - bool is_stopped() { - return - get_num_in_mds() == 0 && - get_num_mds(STATE_CREATING) == 0 && - get_num_mds(STATE_STARTING) == 0 && - get_num_mds(STATE_STANDBY) == 0; - } - - bool would_be_overfull_with(int mds) { - int in = 1; // mds! - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) { - if (p->first == mds) continue; - if (p->second > 0 || - p->second == STATE_STARTING || - p->second == STATE_CREATING) - in++; - } - return (in > max_mds); - } - - int get_state(int m) { - if (mds_state.count(m)) - return mds_state[m]; - else - return STATE_DNE; - } - - // inst - bool have_inst(int m) { - return mds_inst.count(m); - } - const entity_inst_t& get_inst(int m) { - assert(mds_inst.count(m)); - return mds_inst[m]; - } - bool get_inst(int m, entity_inst_t& inst) { - if (mds_inst.count(m)) { - inst = mds_inst[m]; - return true; - } - return false; - } - - int get_addr_rank(const entity_addr_t& addr) { - for (map::iterator p = mds_inst.begin(); - p != mds_inst.end(); - ++p) { - if (p->second.addr == addr) return p->first; - } - if (standby.count(addr)) - return -2; - return -1; - } - - int get_inc(int m) { - if (mds_inc.count(m)) - return mds_inc[m]; - return 0; - } - - - void remove_mds(int m) { - mds_inst.erase(m); - mds_state.erase(m); - mds_state_seq.erase(m); - } - - - // serialize, unserialize - void encode(bufferlist& bl) { - ::_encode(epoch, bl); - ::_encode(client_epoch, bl); - ::_encode(created, bl); - 
::_encode(anchortable, bl); - ::_encode(root, bl); - ::_encode(max_mds, bl); - ::_encode(mds_state, bl); - ::_encode(mds_state_seq, bl); - ::_encode(mds_inst, bl); - ::_encode(mds_inc, bl); - ::_encode(standby, bl); - ::_encode(standby_for, bl); - ::_encode(standby_any, bl); - } - - void decode(bufferlist& bl) { - int off = 0; - ::_decode(epoch, bl, off); - ::_decode(client_epoch, bl, off); - ::_decode(created, bl, off); - ::_decode(anchortable, bl, off); - ::_decode(root, bl, off); - ::_decode(max_mds, bl, off); - ::_decode(mds_state, bl, off); - ::_decode(mds_state_seq, bl, off); - ::_decode(mds_inst, bl, off); - ::_decode(mds_inc, bl, off); - ::_decode(standby, bl, off); - ::_decode(standby_for, bl, off); - ::_decode(standby_any, bl, off); - } - - - /*** mapping functions ***/ - - int hash_dentry( inodeno_t dirino, const string& dn ); -}; - -#endif diff --git a/branches/sage/ebofs2/mds/Migrator.cc b/branches/sage/ebofs2/mds/Migrator.cc deleted file mode 100644 index 1c443c7bf6f79..0000000000000 --- a/branches/sage/ebofs2/mds/Migrator.cc +++ /dev/null @@ -1,2109 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "MDS.h" -#include "MDCache.h" -#include "CInode.h" -#include "CDir.h" -#include "CDentry.h" -#include "Migrator.h" -#include "Locker.h" -#include "Migrator.h" - -#include "MDBalancer.h" -#include "MDLog.h" -#include "MDSMap.h" - -#include "include/filepath.h" - -#include "events/EString.h" -#include "events/EExport.h" -#include "events/EImportStart.h" -#include "events/EImportFinish.h" - -#include "msg/Messenger.h" - -#include "messages/MClientFileCaps.h" - -#include "messages/MExportDirDiscover.h" -#include "messages/MExportDirDiscoverAck.h" -#include "messages/MExportDirCancel.h" -#include "messages/MExportDirPrep.h" -#include "messages/MExportDirPrepAck.h" -#include "messages/MExportDir.h" -#include "messages/MExportDirAck.h" -#include "messages/MExportDirNotify.h" -#include "messages/MExportDirNotifyAck.h" -#include "messages/MExportDirFinish.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds || l <= g_conf.debug_mds_migrator) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".migrator " - - - -void Migrator::dispatch(Message *m) -{ - switch (m->get_type()) { - // import - case MSG_MDS_EXPORTDIRDISCOVER: - handle_export_discover((MExportDirDiscover*)m); - break; - case MSG_MDS_EXPORTDIRPREP: - handle_export_prep((MExportDirPrep*)m); - break; - case MSG_MDS_EXPORTDIR: - handle_export_dir((MExportDir*)m); - break; - case MSG_MDS_EXPORTDIRFINISH: - handle_export_finish((MExportDirFinish*)m); - break; - case MSG_MDS_EXPORTDIRCANCEL: - handle_export_cancel((MExportDirCancel*)m); - break; - - // export - case MSG_MDS_EXPORTDIRDISCOVERACK: - handle_export_discover_ack((MExportDirDiscoverAck*)m); - break; - case MSG_MDS_EXPORTDIRPREPACK: - handle_export_prep_ack((MExportDirPrepAck*)m); - break; - case MSG_MDS_EXPORTDIRACK: - handle_export_ack((MExportDirAck*)m); - break; - case MSG_MDS_EXPORTDIRNOTIFYACK: - handle_export_notify_ack((MExportDirNotifyAck*)m); - break; - - // export 3rd party 
(dir_auth adjustments) - case MSG_MDS_EXPORTDIRNOTIFY: - handle_export_notify((MExportDirNotify*)m); - break; - - default: - assert(0); - } -} - - -class C_MDC_EmptyImport : public Context { - Migrator *mig; - CDir *dir; -public: - C_MDC_EmptyImport(Migrator *m, CDir *d) : mig(m), dir(d) {} - void finish(int r) { - mig->export_empty_import(dir); - } -}; - - -void Migrator::export_empty_import(CDir *dir) -{ - dout(7) << "export_empty_import " << *dir << dendl; - assert(dir->is_subtree_root()); - - if (dir->inode->is_auth()) { - dout(7) << " inode is auth" << dendl; - return; - } - if (!dir->is_auth()) { - dout(7) << " not auth" << dendl; - return; - } - if (dir->is_freezing() || dir->is_frozen()) { - dout(7) << " freezing or frozen" << dendl; - return; - } - if (dir->get_size() > 0) { - dout(7) << " not actually empty" << dendl; - return; - } - if (dir->inode->is_root()) { - dout(7) << " root" << dendl; - return; - } - - int dest = dir->inode->authority().first; - //if (mds->is_shutting_down()) dest = 0; // this is more efficient. - - dout(7) << " really empty, exporting to " << dest << dendl; - assert (dest != mds->get_nodeid()); - - dout(7) << "exporting to mds" << dest - << " empty import " << *dir << dendl; - export_dir( dir, dest ); -} - - - - -// ========================================================== -// mds failure handling - -void Migrator::handle_mds_failure_or_stop(int who) -{ - dout(5) << "handle_mds_failure_or_stop mds" << who << dendl; - - // check my exports - map::iterator p = export_state.begin(); - while (p != export_state.end()) { - map::iterator next = p; - next++; - CDir *dir = p->first; - - // abort exports: - // - that are going to the failed node - // - that aren't frozen yet (to avoid auth_pin deadlock) - if (export_peer[dir] == who || - p->second == EXPORT_DISCOVERING || p->second == EXPORT_FREEZING) { - // the guy i'm exporting to failed, or we're just freezing. 
- dout(10) << "cleaning up export state " << p->second << " of " << *dir << dendl; - - switch (p->second) { - case EXPORT_DISCOVERING: - dout(10) << "export state=discovering : canceling freeze and removing auth_pin" << dendl; - dir->unfreeze_tree(); // cancel the freeze - dir->auth_unpin(); - export_state.erase(dir); // clean up - dir->state_clear(CDir::STATE_EXPORTING); - if (export_peer[dir] != who) // tell them. - mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir]); - break; - - case EXPORT_FREEZING: - dout(10) << "export state=freezing : canceling freeze" << dendl; - dir->unfreeze_tree(); // cancel the freeze - export_state.erase(dir); // clean up - dir->state_clear(CDir::STATE_EXPORTING); - if (export_peer[dir] != who) // tell them. - mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir]); - break; - - // NOTE: state order reversal, warning comes after loggingstart+prepping - case EXPORT_WARNING: - dout(10) << "export state=warning : unpinning bounds, unfreezing, notifying" << dendl; - // fall-thru - - //case EXPORT_LOGGINGSTART: - case EXPORT_PREPPING: - if (p->second != EXPORT_WARNING) - dout(10) << "export state=loggingstart|prepping : unpinning bounds, unfreezing" << dendl; - { - // unpin bounds - set bounds; - cache->get_subtree_bounds(dir, bounds); - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *bd = *p; - bd->put(CDir::PIN_EXPORTBOUND); - bd->state_clear(CDir::STATE_EXPORTBOUND); - } - } - dir->unfreeze_tree(); - cache->adjust_subtree_auth(dir, mds->get_nodeid()); - cache->try_subtree_merge(dir); - export_state.erase(dir); // clean up - dir->state_clear(CDir::STATE_EXPORTING); - break; - - case EXPORT_EXPORTING: - dout(10) << "export state=exporting : reversing, and unfreezing" << dendl; - export_reverse(dir); - export_state.erase(dir); // clean up - dir->state_clear(CDir::STATE_EXPORTING); - break; - - case EXPORT_LOGGINGFINISH: - case EXPORT_NOTIFYING: - dout(10) << 
"export state=loggingfinish|notifying : ignoring dest failure, we were successful." << dendl; - // leave export_state, don't clean up now. - break; - - default: - assert(0); - } - - // finish clean-up? - if (export_state.count(dir) == 0) { - export_peer.erase(dir); - export_warning_ack_waiting.erase(dir); - export_notify_ack_waiting.erase(dir); - - // unpin the path - vector trace; - cache->make_trace(trace, dir->inode); - mds->locker->dentry_anon_rdlock_trace_finish(trace); - - // wake up any waiters - mds->queue_waiters(export_finish_waiters[dir]); - export_finish_waiters.erase(dir); - - // send pending import_maps? (these need to go out when all exports have finished.) - cache->maybe_send_pending_resolves(); - - cache->show_subtrees(); - - maybe_do_queued_export(); - } - } else { - // bystander failed. - if (export_warning_ack_waiting.count(dir) && - export_warning_ack_waiting[dir].count(who)) { - export_warning_ack_waiting[dir].erase(who); - export_notify_ack_waiting[dir].erase(who); // they won't get a notify either. - if (p->second == EXPORT_WARNING) { - // exporter waiting for warning acks, let's fake theirs. - dout(10) << "faking export_warning_ack from mds" << who - << " on " << *dir << " to mds" << export_peer[dir] - << dendl; - if (export_warning_ack_waiting[dir].empty()) - export_go(dir); - } - } - if (export_notify_ack_waiting.count(dir) && - export_notify_ack_waiting[dir].count(who)) { - export_notify_ack_waiting[dir].erase(who); - if (p->second == EXPORT_NOTIFYING) { - // exporter is waiting for notify acks, fake it - dout(10) << "faking export_notify_ack from mds" << who - << " on " << *dir << " to mds" << export_peer[dir] - << dendl; - if (export_notify_ack_waiting[dir].empty()) - export_finish(dir); - } - } - } - - // next! 
- p = next; - } - - - // check my imports - map::iterator q = import_state.begin(); - while (q != import_state.end()) { - map::iterator next = q; - next++; - dirfrag_t df = q->first; - CInode *diri = mds->mdcache->get_inode(df.ino); - CDir *dir = mds->mdcache->get_dirfrag(df); - - if (import_peer[df] == who) { - switch (q->second) { - case IMPORT_DISCOVERING: - dout(10) << "import state=discovering : clearing state" << dendl; - import_state.erase(df); - import_peer.erase(df); - break; - - case IMPORT_DISCOVERED: - dout(10) << "import state=discovered : unpinning inode " << *diri << dendl; - assert(diri); - // unpin base - diri->put(CInode::PIN_IMPORTING); - import_state.erase(df); - import_peer.erase(df); - break; - - case IMPORT_PREPPING: - if (q->second == IMPORT_PREPPING) { - dout(10) << "import state=prepping : unpinning base+bounds " << *dir << dendl; - } - assert(dir); - { - set bounds; - cache->map_dirfrag_set(import_bound_ls[dir], bounds); - import_remove_pins(dir, bounds); - import_reverse_final(dir); - } - break; - - case IMPORT_PREPPED: - dout(10) << "import state=prepped : unpinning base+bounds, unfreezing " << *dir << dendl; - assert(dir); - { - set bounds; - cache->get_subtree_bounds(dir, bounds); - import_remove_pins(dir, bounds); - - // adjust auth back to me - cache->adjust_subtree_auth(dir, import_peer[df]); - cache->try_subtree_merge(dir); - - // bystanders? - if (import_bystanders[dir].empty()) { - import_reverse_unfreeze(dir); - } else { - // notify them; wait in aborting state - import_notify_abort(dir, bounds); - import_state[df] = IMPORT_ABORTING; - } - } - break; - - case IMPORT_LOGGINGSTART: - dout(10) << "import state=loggingstart : reversing import on " << *dir << dendl; - import_reverse(dir); - break; - - case IMPORT_ACKING: - // hrm. 
make this an ambiguous import, and wait for exporter recovery to disambiguate - dout(10) << "import state=acking : noting ambiguous import " << *dir << dendl; - { - set bounds; - cache->get_subtree_bounds(dir, bounds); - cache->add_ambiguous_import(dir, bounds); - } - break; - - case IMPORT_ABORTING: - dout(10) << "import state=aborting : ignoring repeat failure " << *dir << dendl; - break; - } - } else { - if (q->second == IMPORT_ABORTING && - import_bystanders[dir].count(who)) { - dout(10) << "faking export_notify_ack from mds" << who - << " on aborting import " << *dir << " from mds" << import_peer[df] - << dendl; - import_bystanders[dir].erase(who); - if (import_bystanders[dir].empty()) { - import_bystanders.erase(dir); - import_reverse_unfreeze(dir); - } - } - } - - // next! - q = next; - } -} - - - -void Migrator::show_importing() -{ - dout(10) << "show_importing" << dendl; - for (map::iterator p = import_state.begin(); - p != import_state.end(); - p++) { - CDir *dir = mds->mdcache->get_dirfrag(p->first); - if (dir) { - dout(10) << " importing from " << import_peer[p->first] - << ": (" << p->second << ") " << get_import_statename(p->second) - << " " << p->first - << " " << *dir - << dendl; - } else { - dout(10) << " importing from " << import_peer[p->first] - << ": (" << p->second << ") " << get_import_statename(p->second) - << " " << p->first - << dendl; - } - } -} - -void Migrator::show_exporting() -{ - dout(10) << "show_exporting" << dendl; - for (map::iterator p = export_state.begin(); - p != export_state.end(); - p++) - dout(10) << " exporting to " << export_peer[p->first] - << ": (" << p->second << ") " << get_export_statename(p->second) - << " " << p->first->dirfrag() - << " " << *p->first - << dendl; -} - - - -void Migrator::audit() -{ - if (g_conf.debug_mds < 5) return; // hrm. 
- - // import_state - show_importing(); - for (map::iterator p = import_state.begin(); - p != import_state.end(); - p++) { - if (p->second == IMPORT_DISCOVERING) - continue; - if (p->second == IMPORT_DISCOVERED) { - CInode *in = cache->get_inode(p->first.ino); - assert(in); - continue; - } - CDir *dir = cache->get_dirfrag(p->first); - assert(dir); - if (p->second == IMPORT_PREPPING) - continue; - assert(dir->is_ambiguous_dir_auth()); - assert(dir->authority().first == mds->get_nodeid() || - dir->authority().second == mds->get_nodeid()); - } - - // export_state - show_exporting(); - for (map::iterator p = export_state.begin(); - p != export_state.end(); - p++) { - CDir *dir = p->first; - if (p->second == EXPORT_DISCOVERING || - p->second == EXPORT_FREEZING) continue; - assert(dir->is_ambiguous_dir_auth()); - assert(dir->authority().first == mds->get_nodeid() || - dir->authority().second == mds->get_nodeid()); - } - - // ambiguous+me subtrees should be importing|exporting - - // write me -} - - - - - -// ========================================================== -// EXPORT - -void Migrator::export_dir_nicely(CDir *dir, int dest) -{ - // enqueue - dout(7) << "export_dir_nicely " << *dir << " to " << dest << dendl; - export_queue.push_back(pair(dir->dirfrag(), dest)); - - maybe_do_queued_export(); -} - -void Migrator::maybe_do_queued_export() -{ - while (!export_queue.empty() && - export_state.size() <= 4) { - dirfrag_t df = export_queue.front().first; - int dest = export_queue.front().second; - export_queue.pop_front(); - - CDir *dir = mds->mdcache->get_dirfrag(df); - if (!dir) continue; - if (!dir->is_auth()) continue; - - dout(-7) << "nicely exporting to mds" << dest << " " << *dir << dendl; - - export_dir(dir, dest); - } -} - - - - -class C_MDC_ExportFreeze : public Context { - Migrator *mig; - CDir *ex; // dir i'm exporting - -public: - C_MDC_ExportFreeze(Migrator *m, CDir *e) : - mig(m), ex(e) {} - virtual void finish(int r) { - if (r >= 0) - 
mig->export_frozen(ex); - } -}; - - -/** export_dir(dir, dest) - * public method to initiate an export. - * will fail if the directory is freezing, frozen, unpinnable, or root. - */ -void Migrator::export_dir(CDir *dir, int dest) -{ - dout(7) << "export_dir " << *dir << " to " << dest << dendl; - assert(dir->is_auth()); - assert(dest != mds->get_nodeid()); - - if (mds->mdsmap->is_degraded()) { - dout(7) << "cluster degraded, no exports for now" << dendl; - return; - } - - if (dir->inode->is_root()) { - dout(7) << "i won't export root" << dendl; - //assert(0); - return; - } - - if (dir->is_frozen() || - dir->is_freezing()) { - dout(7) << " can't export, freezing|frozen. wait for other exports to finish first." << dendl; - return; - } - if (dir->state_test(CDir::STATE_EXPORTING)) { - dout(7) << "already exporting" << dendl; - return; - } - - // pin path? - vector trace; - cache->make_trace(trace, dir->inode); - if (!mds->locker->dentry_can_rdlock_trace(trace)) { - dout(7) << "export_dir couldn't pin path, failing." << dendl; - return; - } - - // ok. - mds->locker->dentry_anon_rdlock_trace_start(trace); - assert(export_state.count(dir) == 0); - export_state[dir] = EXPORT_DISCOVERING; - export_peer[dir] = dest; - - dir->state_set(CDir::STATE_EXPORTING); - - // send ExportDirDiscover (ask target) - mds->send_message_mds(new MExportDirDiscover(dir), dest); - - // start the freeze, but hold it up with an auth_pin. - dir->auth_pin(); - dir->freeze_tree(); - assert(dir->is_freezing_tree()); - dir->add_waiter(CDir::WAIT_FROZEN, new C_MDC_ExportFreeze(this, dir)); -} - - -/* - * called on receipt of MExportDirDiscoverAck - * the importer now has the directory's _inode_ in memory, and pinned. 
- */ -void Migrator::handle_export_discover_ack(MExportDirDiscoverAck *m) -{ - CDir *dir = cache->get_dirfrag(m->get_dirfrag()); - assert(dir); - - dout(7) << "export_discover_ack from " << m->get_source() - << " on " << *dir << dendl; - - if (export_state.count(dir) == 0 || - export_state[dir] != EXPORT_DISCOVERING || - export_peer[dir] != m->get_source().num()) { - dout(7) << "must have aborted" << dendl; - } else { - // freeze the subtree - export_state[dir] = EXPORT_FREEZING; - dir->auth_unpin(); - } - - delete m; // done -} - -void Migrator::export_frozen(CDir *dir) -{ - dout(7) << "export_frozen on " << *dir << dendl; - assert(dir->is_frozen()); - assert(dir->get_cum_auth_pins() == 0); - - // ok! - int dest = export_peer[dir]; - - cache->show_subtrees(); - - // note the bounds. - // force it into a subtree by listing auth as . - cache->adjust_subtree_auth(dir, mds->get_nodeid(), mds->get_nodeid()); - set bounds; - cache->get_subtree_bounds(dir, bounds); - - // generate prep message, log entry. - MExportDirPrep *prep = new MExportDirPrep(dir->dirfrag()); - - // include list of bystanders - for (map::iterator p = dir->replicas_begin(); - p != dir->replicas_end(); - p++) { - if (p->first != dest) { - dout(10) << "bystander mds" << p->first << dendl; - prep->add_bystander(p->first); - } - } - - /* include spanning tree for all nested exports. - * these need to be on the destination _before_ the final export so that - * dir_auth updates on any nested exports are properly absorbed. - * this includes inodes and dirfrags included in the subtree, but - * only the inodes at the bounds. - */ - set inodes_added; - - // include base dirfrag - prep->add_dirfrag( new CDirDiscover(dir, dir->add_replica(dest)) ); - - // check bounds - for (set::iterator it = bounds.begin(); - it != bounds.end(); - it++) { - CDir *bound = *it; - - // pin it. 
- bound->get(CDir::PIN_EXPORTBOUND); - bound->state_set(CDir::STATE_EXPORTBOUND); - - dout(7) << " export bound " << *bound << dendl; - - prep->add_export( bound->dirfrag() ); - - /* first assemble each trace, in trace order, and put in message */ - list inode_trace; - - // trace to dir - CDir *cur = bound; - while (cur != dir) { - // don't repeat ourselves - if (inodes_added.count(cur->ino())) break; // did already! - inodes_added.insert(cur->ino()); - - // inode - assert(cur->inode->is_auth()); - inode_trace.push_front(cur->inode); - dout(7) << " will add " << *cur->inode << dendl; - - // include the dirfrag? only if it's not the bounding subtree root. - if (cur != bound) { - assert(cur->is_auth()); - prep->add_dirfrag( cur->replicate_to(dest) ); // yay! - dout(7) << " added " << *cur << dendl; - } - - cur = cur->get_parent_dir(); - } - - for (list::iterator it = inode_trace.begin(); - it != inode_trace.end(); - it++) { - CInode *in = *it; - dout(7) << " added " << *in->parent << dendl; - dout(7) << " added " << *in << dendl; - prep->add_inode( in->parent->get_dir()->dirfrag(), - in->parent->get_name(), - in->parent->replicate_to(dest), - in->replicate_to(dest) ); - } - - } - - // send. - export_state[dir] = EXPORT_PREPPING; - mds->send_message_mds(prep, dest); -} - -void Migrator::handle_export_prep_ack(MExportDirPrepAck *m) -{ - CDir *dir = cache->get_dirfrag(m->get_dirfrag()); - assert(dir); - - dout(7) << "export_prep_ack " << *dir << dendl; - - if (export_state.count(dir) == 0 || - export_state[dir] != EXPORT_PREPPING) { - // export must have aborted. 
- dout(7) << "export must have aborted" << dendl; - delete m; - return; - } - - // send warnings - int dest = export_peer[dir]; - set bounds; - cache->get_subtree_bounds(dir, bounds); - - assert(export_peer.count(dir)); - assert(export_warning_ack_waiting.count(dir) == 0); - assert(export_notify_ack_waiting.count(dir) == 0); - - for (map::iterator p = dir->replicas_begin(); - p != dir->replicas_end(); - ++p) { - if (p->first == dest) continue; - if (!mds->mdsmap->is_active_or_stopping(p->first)) - continue; // only if active - export_warning_ack_waiting[dir].insert(p->first); - export_notify_ack_waiting[dir].insert(p->first); // we'll eventually get a notifyack, too! - - MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(), true, - pair(mds->get_nodeid(),CDIR_AUTH_UNKNOWN), - pair(mds->get_nodeid(),export_peer[dir])); - notify->copy_bounds(bounds); - mds->send_message_mds(notify, p->first); - - } - export_state[dir] = EXPORT_WARNING; - - // nobody to warn? - if (export_warning_ack_waiting.count(dir) == 0) - export_go(dir); // start export. - - // done. - delete m; -} - - -void Migrator::export_go(CDir *dir) -{ - assert(export_peer.count(dir)); - int dest = export_peer[dir]; - dout(7) << "export_go " << *dir << " to " << dest << dendl; - - cache->show_subtrees(); - - export_warning_ack_waiting.erase(dir); - export_state[dir] = EXPORT_EXPORTING; - - assert(dir->get_cum_auth_pins() == 0); - - // set ambiguous auth - cache->adjust_subtree_auth(dir, dest, mds->get_nodeid()); - - // take away the popularity we're sending. - mds->balancer->subtract_export(dir); - - // fill export message with cache data - utime_t now = g_clock.now(); - map exported_client_map; - bufferlist export_data; - int num_exported_inodes = encode_export_dir( export_data, - dir, // recur start point - exported_client_map, - now ); - bufferlist bl; - ::_encode(exported_client_map, bl); - bl.claim_append(export_data); - export_data.claim(bl); - - // send the export data! 
- MExportDir *req = new MExportDir(dir->dirfrag()); - req->take_dirstate(export_data); - - // add bounds to message - set bounds; - cache->get_subtree_bounds(dir, bounds); - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) - req->add_export((*p)->dirfrag()); - - // send - mds->send_message_mds(req, dest); - - // stats - if (mds->logger) mds->logger->inc("ex"); - if (mds->logger) mds->logger->inc("iex", num_exported_inodes); - - cache->show_subtrees(); -} - - -/** encode_export_inode - * update our local state for this inode to export. - * encode relevant state to be sent over the wire. - * used by: encode_export_dir, file_rename (if foreign) - * - * FIXME: the separation between CInode.encode_export and these methods - * is pretty arbitrary and dumb. - */ -void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, - map& exported_client_map) -{ - dout(7) << "encode_export_inode " << *in << dendl; - assert(!in->is_replica(mds->get_nodeid())); - - ::_encode_simple(in->inode.ino, enc_state); - in->encode_export(enc_state); - - // make note of clients named by exported capabilities - for (map::iterator it = in->client_caps.begin(); - it != in->client_caps.end(); - it++) - exported_client_map[it->first] = mds->clientmap.get_inst(it->first); -} - -void Migrator::finish_export_inode(CInode *in, utime_t now, list& finished) -{ - dout(12) << "finish_export_inode " << *in << dendl; - - in->finish_export(now); - - // tell (all) clients about migrating caps.. 
mark STALE - for (map::iterator it = in->client_caps.begin(); - it != in->client_caps.end(); - it++) { - dout(7) << "finish_export_inode telling client" << it->first - << " stale caps on " << *in << dendl; - MClientFileCaps *m = new MClientFileCaps(MClientFileCaps::OP_STALE, - in->inode, - it->second.get_last_seq(), - it->second.pending(), - it->second.wanted()); - entity_inst_t inst = mds->clientmap.get_inst(it->first); - mds->send_message_client_maybe_open(m, inst); - } - in->clear_client_caps(); - - // relax locks? - if (!in->is_replicated()) - in->replicate_relax_locks(); - - // clean - if (in->is_dirty()) in->mark_clean(); - - // clear/unpin cached_by (we're no longer the authority) - in->clear_replica_map(); - - // twiddle lock states for auth -> replica transition - in->authlock.export_twiddle(); - in->linklock.export_twiddle(); - in->dirfragtreelock.export_twiddle(); - in->filelock.export_twiddle(); - in->dirlock.export_twiddle(); - - // mark auth - assert(in->is_auth()); - in->state_clear(CInode::STATE_AUTH); - in->replica_nonce = CInode::EXPORT_NONCE; - - // waiters - in->take_waiting(CInode::WAIT_ANY, finished); - - // *** other state too? - - // move to end of LRU so we drop out of cache quickly! 
- if (in->get_parent_dn()) - cache->lru.lru_bottouch(in->get_parent_dn()); - -} - -int Migrator::encode_export_dir(bufferlist& exportbl, - CDir *dir, - map& exported_client_map, - utime_t now) -{ - int num_exported = 0; - - dout(7) << "encode_export_dir " << *dir << " " << dir->nitems << " items" << dendl; - - assert(dir->get_projected_version() == dir->get_version()); - - // dir - dirfrag_t df = dir->dirfrag(); - ::_encode_simple(df, exportbl); - dir->encode_export(exportbl); - - long nden = dir->items.size(); - ::_encode_simple(nden, exportbl); - - // dentries - list subdirs; - CDir::map_t::iterator it; - for (it = dir->begin(); it != dir->end(); it++) { - CDentry *dn = it->second; - CInode *in = dn->get_inode(); - - num_exported++; - - // -- dentry - dout(7) << "encode_export_dir exporting " << *dn << dendl; - - // dn name - ::_encode(it->first, exportbl); - - // state - dn->encode_export(exportbl); - - // points to... - - // null dentry? - if (dn->is_null()) { - exportbl.append("N", 1); // null dentry - continue; - } - - if (dn->is_remote()) { - // remote link - exportbl.append("L", 1); // remote link - - inodeno_t ino = dn->get_remote_ino(); - unsigned char d_type = dn->get_remote_d_type(); - ::_encode(ino, exportbl); - ::_encode(d_type, exportbl); - continue; - } - - // primary link - // -- inode - exportbl.append("I", 1); // inode dentry - - encode_export_inode(in, exportbl, exported_client_map); // encode, and (update state for) export - - // directory? 
- list dfs; - in->get_dirfrags(dfs); - for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) { - CDir *dir = *p; - if (!dir->state_test(CDir::STATE_EXPORTBOUND)) { - // include nested dirfrag - assert(dir->get_dir_auth().first == CDIR_AUTH_PARENT); - subdirs.push_back(dir); // it's ours, recurse (later) - } - } - } - - // subdirs - for (list::iterator it = subdirs.begin(); it != subdirs.end(); it++) - num_exported += encode_export_dir(exportbl, *it, exported_client_map, now); - - return num_exported; -} - -void Migrator::finish_export_dir(CDir *dir, list& finished, utime_t now) -{ - dout(10) << "finish_export_dir " << *dir << dendl; - - // release open_by - dir->clear_replica_map(); - - // mark - assert(dir->is_auth()); - dir->state_clear(CDir::STATE_AUTH); - dir->replica_nonce = CDir::NONCE_EXPORT; - - if (dir->is_dirty()) - dir->mark_clean(); - - // discard most dir state - dir->state &= CDir::MASK_STATE_EXPORT_KEPT; // i only retain a few things. - - // suck up all waiters - dir->take_waiting(CDir::WAIT_ANY, finished); // all dir waiters - - // pop - dir->finish_export(now); - - // dentries - list subdirs; - CDir::map_t::iterator it; - for (it = dir->begin(); it != dir->end(); it++) { - CDentry *dn = it->second; - CInode *in = dn->get_inode(); - - // dentry - dn->finish_export(); - - // inode? - if (dn->is_primary()) { - finish_export_inode(in, now, finished); - - // subdirs? - in->get_nested_dirfrags(subdirs); - } - } - - // subdirs - for (list::iterator it = subdirs.begin(); it != subdirs.end(); it++) - finish_export_dir(*it, finished, now); -} - -class C_MDS_ExportFinishLogged : public Context { - Migrator *migrator; - CDir *dir; -public: - C_MDS_ExportFinishLogged(Migrator *m, CDir *d) : migrator(m), dir(d) {} - void finish(int r) { - migrator->export_logged_finish(dir); - } -}; - - -/* - * i should get an export_ack from the export target. 
- */ -void Migrator::handle_export_ack(MExportDirAck *m) -{ - CDir *dir = cache->get_dirfrag(m->get_dirfrag()); - assert(dir); - assert(dir->is_frozen_tree_root()); // i'm exporting! - - // yay! - dout(7) << "handle_export_ack " << *dir << dendl; - - export_warning_ack_waiting.erase(dir); - - export_state[dir] = EXPORT_LOGGINGFINISH; - - set bounds; - cache->get_subtree_bounds(dir, bounds); - - // log completion. - // include export bounds, to ensure they're in the journal. - EExport *le = new EExport(mds->mdlog, dir); - le->metablob.add_dir_context(dir); - le->metablob.add_dir( dir, false ); - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *bound = *p; - le->get_bounds().insert(bound->dirfrag()); - le->metablob.add_dir_context(bound); - le->metablob.add_dir(bound, false); - } - - // log export completion, then finish (unfreeze, trigger finish context, etc.) - mds->mdlog->submit_entry(le, - new C_MDS_ExportFinishLogged(this, dir)); - - delete m; -} - - - - - -/* - * this happens if hte dest failes after i send teh export data but before it is acked - * that is, we don't know they safely received and logged it, so we reverse our changes - * and go on. - */ -void Migrator::export_reverse(CDir *dir) -{ - dout(7) << "export_reverse " << *dir << dendl; - - assert(export_state[dir] == EXPORT_EXPORTING); - - set bounds; - cache->get_subtree_bounds(dir, bounds); - - // adjust auth, with possible subtree merge. 
- cache->adjust_subtree_auth(dir, mds->get_nodeid()); - cache->try_subtree_merge(dir); - - // remove exporting pins - list rq; - rq.push_back(dir); - while (!rq.empty()) { - CDir *dir = rq.front(); - rq.pop_front(); - dir->abort_export(); - for (CDir::map_t::iterator p = dir->items.begin(); p != dir->items.end(); ++p) { - p->second->abort_export(); - if (!p->second->is_primary()) continue; - CInode *in = p->second->get_inode(); - in->abort_export(); - if (in->is_dir()) - in->get_nested_dirfrags(rq); - } - } - - // unpin bounds - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *bd = *p; - bd->put(CDir::PIN_EXPORTBOUND); - bd->state_clear(CDir::STATE_EXPORTBOUND); - } - - // process delayed expires - cache->process_delayed_expire(dir); - - // some clean up - export_warning_ack_waiting.erase(dir); - export_notify_ack_waiting.erase(dir); - - // unfreeze - dir->unfreeze_tree(); - - cache->show_cache(); -} - - -/* - * once i get the ack, and logged the EExportFinish(true), - * send notifies (if any), otherwise go straight to finish. - * - */ -void Migrator::export_logged_finish(CDir *dir) -{ - dout(7) << "export_logged_finish " << *dir << dendl; - - // send notifies - int dest = export_peer[dir]; - - set bounds; - cache->get_subtree_bounds(dir, bounds); - - for (set::iterator p = export_notify_ack_waiting[dir].begin(); - p != export_notify_ack_waiting[dir].end(); - ++p) { - MExportDirNotify *notify; - if (mds->mdsmap->is_active_or_stopping(export_peer[dir])) - // dest is still alive. - notify = new MExportDirNotify(dir->dirfrag(), true, - pair(mds->get_nodeid(), dest), - pair(dest, CDIR_AUTH_UNKNOWN)); - else - // dest is dead. 
bystanders will think i am only auth, as per mdcache->handle_mds_failure() - notify = new MExportDirNotify(dir->dirfrag(), true, - pair(mds->get_nodeid(), CDIR_AUTH_UNKNOWN), - pair(dest, CDIR_AUTH_UNKNOWN)); - - notify->copy_bounds(bounds); - - mds->send_message_mds(notify, *p); - } - - // wait for notifyacks - export_state[dir] = EXPORT_NOTIFYING; - - // no notifies to wait for? - if (export_notify_ack_waiting[dir].empty()) - export_finish(dir); // skip notify/notify_ack stage. -} - -/* - * warning: - * i'll get an ack from each bystander. - * when i get them all, do the export. - * notify: - * i'll get an ack from each bystander. - * when i get them all, unfreeze and send the finish. - */ -void Migrator::handle_export_notify_ack(MExportDirNotifyAck *m) -{ - CDir *dir = cache->get_dirfrag(m->get_dirfrag()); - assert(dir); - int from = m->get_source().num(); - - if (export_state.count(dir) && export_state[dir] == EXPORT_WARNING) { - // exporting. process warning. - dout(7) << "handle_export_notify_ack from " << m->get_source() - << ": exporting, processing warning on " - << *dir << dendl; - assert(export_warning_ack_waiting.count(dir)); - export_warning_ack_waiting[dir].erase(from); - - if (export_warning_ack_waiting[dir].empty()) - export_go(dir); // start export. - } - else if (export_state.count(dir) && export_state[dir] == EXPORT_NOTIFYING) { - // exporting. process notify. 
- dout(7) << "handle_export_notify_ack from " << m->get_source() - << ": exporting, processing notify on " - << *dir << dendl; - assert(export_notify_ack_waiting.count(dir)); - export_notify_ack_waiting[dir].erase(from); - - if (export_notify_ack_waiting[dir].empty()) - export_finish(dir); - } - else if (import_state.count(dir->dirfrag()) && import_state[dir->dirfrag()] == IMPORT_ABORTING) { - // reversing import - dout(7) << "handle_export_notify_ack from " << m->get_source() - << ": aborting import on " - << *dir << dendl; - assert(import_bystanders[dir].count(from)); - import_bystanders[dir].erase(from); - if (import_bystanders[dir].empty()) { - import_bystanders.erase(dir); - import_reverse_unfreeze(dir); - } - } - - delete m; -} - - -void Migrator::export_finish(CDir *dir) -{ - dout(5) << "export_finish " << *dir << dendl; - - if (export_state.count(dir) == 0) { - dout(7) << "target must have failed, not sending final commit message. export succeeded anyway." << dendl; - return; - } - - // send finish/commit to new auth - if (mds->mdsmap->is_active_or_stopping(export_peer[dir])) { - mds->send_message_mds(new MExportDirFinish(dir->dirfrag()), export_peer[dir]); - } else { - dout(7) << "not sending MExportDirFinish, dest has failed" << dendl; - } - - // finish export (adjust local cache state) - C_Contexts *fin = new C_Contexts; - finish_export_dir(dir, fin->contexts, g_clock.now()); - dir->add_waiter(CDir::WAIT_UNFREEZE, fin); - - // unfreeze - dout(7) << "export_finish unfreezing" << dendl; - dir->unfreeze_tree(); - - // unpin bounds - set bounds; - cache->get_subtree_bounds(dir, bounds); - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *bd = *p; - bd->put(CDir::PIN_EXPORTBOUND); - bd->state_clear(CDir::STATE_EXPORTBOUND); - } - - // adjust auth, with possible subtree merge. 
- // (we do this _after_ removing EXPORTBOUND pins, to allow merges) - cache->adjust_subtree_auth(dir, export_peer[dir]); - cache->try_subtree_merge(dir); - - // unpin path - dout(7) << "export_finish unpinning path" << dendl; - vector trace; - cache->make_trace(trace, dir->inode); - mds->locker->dentry_anon_rdlock_trace_finish(trace); - - // discard delayed expires - cache->discard_delayed_expire(dir); - - // remove from exporting list, clean up state - dir->state_clear(CDir::STATE_EXPORTING); - export_state.erase(dir); - export_peer.erase(dir); - export_notify_ack_waiting.erase(dir); - - // queue finishers - mds->queue_waiters(export_finish_waiters[dir]); - export_finish_waiters.erase(dir); - - cache->show_subtrees(); - audit(); - - // send pending import_maps? - mds->mdcache->maybe_send_pending_resolves(); - - maybe_do_queued_export(); -} - - - - - - - - -// ========================================================== -// IMPORT - -void Migrator::handle_export_discover(MExportDirDiscover *m) -{ - assert(m->get_source().num() != mds->get_nodeid()); - - dout(7) << "handle_export_discover on " << m->get_path() << dendl; - - // note import state - dirfrag_t df = m->get_dirfrag(); - - // only start discovering on this message once. - if (!m->started) { - m->started = true; - import_state[df] = IMPORT_DISCOVERING; - import_peer[df] = m->get_source().num(); - } - - // am i retrying after ancient path_traverse results? - if (import_state.count(df) == 0 && - import_state[df] != IMPORT_DISCOVERING) { - dout(7) << "hmm import_state is off, i must be obsolete lookup" << dendl; - delete m; - return; - } - - // do we have it? - CInode *in = cache->get_inode(m->get_dirfrag().ino); - if (!in) { - // must discover it! 
- filepath fpath(m->get_path()); - vector trace; - int r = cache->path_traverse(0, m, - 0, fpath, trace, true, - MDS_TRAVERSE_DISCOVER); - if (r > 0) return; // wait - if (r < 0) { - dout(7) << "handle_export_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << dendl; - assert(0); // this shouldn't happen if the auth pins his path properly!!!! - } - - assert(0); // this shouldn't happen; the get_inode above would have succeeded. - } - - // yay - dout(7) << "handle_export_discover have " << df << " inode " << *in << dendl; - - import_state[m->get_dirfrag()] = IMPORT_DISCOVERED; - - // pin inode in the cache (for now) - assert(in->is_dir()); - in->get(CInode::PIN_IMPORTING); - - // reply - dout(7) << " sending export_discover_ack on " << *in << dendl; - mds->send_message_mds(new MExportDirDiscoverAck(df), import_peer[df]); -} - -void Migrator::handle_export_cancel(MExportDirCancel *m) -{ - dout(7) << "handle_export_cancel on " << m->get_dirfrag() << dendl; - - if (import_state[m->get_dirfrag()] == IMPORT_DISCOVERED) { - CInode *in = cache->get_inode(m->get_dirfrag().ino); - assert(in); - in->put(CInode::PIN_IMPORTING); - } else { - assert(import_state[m->get_dirfrag()] == IMPORT_DISCOVERING); - } - - import_state.erase(m->get_dirfrag()); - import_peer.erase(m->get_dirfrag()); - - delete m; -} - - -void Migrator::handle_export_prep(MExportDirPrep *m) -{ - int oldauth = m->get_source().num(); - assert(oldauth != mds->get_nodeid()); - - // make sure we didn't abort - if (import_state.count(m->get_dirfrag()) == 0 || - (import_state[m->get_dirfrag()] != IMPORT_DISCOVERED && - import_state[m->get_dirfrag()] != IMPORT_PREPPING) || - import_peer[m->get_dirfrag()] != oldauth) { - dout(10) << "handle_export_prep import has aborted, dropping" << dendl; - delete m; - return; - } - - CInode *diri = cache->get_inode(m->get_dirfrag().ino); - assert(diri); - - list finished; - - // assimilate root dir. 
- CDir *dir; - - if (!m->did_assim()) { - dir = cache->add_replica_dir(diri, - m->get_dirfrag().frag, *m->get_dirfrag_discover(m->get_dirfrag()), - oldauth, finished); - dout(7) << "handle_export_prep on " << *dir << " (first pass)" << dendl; - } else { - dir = cache->get_dirfrag(m->get_dirfrag()); - assert(dir); - dout(7) << "handle_export_prep on " << *dir << " (subsequent pass)" << dendl; - } - assert(dir->is_auth() == false); - - cache->show_subtrees(); - - // build import bound map - map import_bound_fragset; - for (list::iterator p = m->get_bounds().begin(); - p != m->get_bounds().end(); - ++p) { - dout(10) << " bound " << *p << dendl; - import_bound_fragset[p->ino].insert(p->frag); - } - - // assimilate contents? - if (!m->did_assim()) { - dout(7) << "doing assim on " << *dir << dendl; - m->mark_assim(); // only do this the first time! - - // move pin to dir - diri->put(CInode::PIN_IMPORTING); - dir->get(CDir::PIN_IMPORTING); - dir->state_set(CDir::STATE_IMPORTING); - - // change import state - import_state[dir->dirfrag()] = IMPORT_PREPPING; - import_bound_ls[dir] = m->get_bounds(); - - // bystander list - import_bystanders[dir] = m->get_bystanders(); - dout(7) << "bystanders are " << import_bystanders[dir] << dendl; - - // assimilate traces to exports - for (list::iterator it = m->get_inodes().begin(); - it != m->get_inodes().end(); - it++) { - // inode - CInode *in = cache->get_inode( (*it)->get_ino() ); - if (in) { - (*it)->update_inode(in); - dout(7) << " updated " << *in << dendl; - } else { - in = new CInode(mds->mdcache, false); - (*it)->update_inode(in); - - // link to the containing dir - CDir *condir = cache->get_dirfrag( m->get_containing_dirfrag(in->ino()) ); - assert(condir); - cache->add_inode( in ); - condir->add_primary_dentry( m->get_dentry(in->ino()), in ); - - dout(7) << " added " << *in << dendl; - } - - assert( in->get_parent_dir()->dirfrag() == m->get_containing_dirfrag(in->ino()) ); - - // dirs - for (list::iterator pf = 
m->get_inode_dirfrags(in->ino()).begin(); - pf != m->get_inode_dirfrags(in->ino()).end(); - ++pf) { - // add/update - cache->add_replica_dir(in, *pf, *m->get_dirfrag_discover(dirfrag_t(in->ino(), *pf)), - oldauth, finished); - } - } - - // make bound sticky - for (map::iterator p = import_bound_fragset.begin(); - p != import_bound_fragset.end(); - ++p) { - CInode *in = cache->get_inode(p->first); - assert(in); - in->get_stickydirs(); - dout(7) << " set stickydirs on bound inode " << *in << dendl; - } - - } else { - dout(7) << " not doing assim on " << *dir << dendl; - } - - if (!finished.empty()) - mds->queue_waiters(finished); - - - // open all bounds - set import_bounds; - for (map::iterator p = import_bound_fragset.begin(); - p != import_bound_fragset.end(); - ++p) { - CInode *in = cache->get_inode(p->first); - assert(in); - - // map fragset into a frag_t list, based on the inode fragtree - list fglist; - for (set::iterator q = p->second.begin(); q != p->second.end(); ++q) - in->dirfragtree.get_leaves_under(*q, fglist); - dout(10) << " bound inode " << p->first << " fragset " << p->second << " maps to " << fglist << dendl; - - for (list::iterator q = fglist.begin(); - q != fglist.end(); - ++q) { - CDir *bound = cache->get_dirfrag(dirfrag_t(p->first, *q)); - if (!bound) { - dout(7) << " opening bounding dirfrag " << *q << " on " << *in << dendl; - cache->open_remote_dirfrag(in, *q, - new C_MDS_RetryMessage(mds, m)); - return; - } - - if (!bound->state_test(CDir::STATE_IMPORTBOUND)) { - dout(7) << " pinning import bound " << *bound << dendl; - bound->get(CDir::PIN_IMPORTBOUND); - bound->state_set(CDir::STATE_IMPORTBOUND); - } else { - dout(7) << " already pinned import bound " << *bound << dendl; - } - import_bounds.insert(bound); - } - } - - dout(7) << " all ready, noting auth and freezing import region" << dendl; - - // note that i am an ambiguous auth for this subtree. - // specify bounds, since the exporter explicitly defines the region. 
- cache->adjust_bounded_subtree_auth(dir, import_bounds, - pair(oldauth, mds->get_nodeid())); - cache->verify_subtree_bounds(dir, import_bounds); - - // freeze. - dir->_freeze_tree(); - - // ok! - dout(7) << " sending export_prep_ack on " << *dir << dendl; - mds->send_message_mds(new MExportDirPrepAck(dir->dirfrag()), m->get_source().num()); - - // note new state - import_state[dir->dirfrag()] = IMPORT_PREPPED; - - // done - delete m; - -} - - - - -class C_MDS_ImportDirLoggedStart : public Context { - Migrator *migrator; - CDir *dir; - int from; -public: - C_MDS_ImportDirLoggedStart(Migrator *m, CDir *d, int f) : - migrator(m), dir(d), from(f) { - } - void finish(int r) { - migrator->import_logged_start(dir, from); - } -}; - -void Migrator::handle_export_dir(MExportDir *m) -{ - CDir *dir = cache->get_dirfrag(m->get_dirfrag()); - assert(dir); - - int oldauth = m->get_source().num(); - dout(7) << "handle_export_dir importing " << *dir << " from " << oldauth << dendl; - assert(dir->is_auth() == false); - - cache->show_subtrees(); - - // start the journal entry - EImportStart *le = new EImportStart(dir->dirfrag(), m->get_bounds()); - le->metablob.add_dir_context(dir); - - // adjust auth (list us _first_) - cache->adjust_subtree_auth(dir, mds->get_nodeid(), oldauth); - - // add this crap to my cache - map imported_client_map; - bufferlist::iterator blp = m->get_dirstate().begin(); - ::_decode_simple(imported_client_map, blp); - - int num_imported_inodes = 0; - while (!blp.end()) { - num_imported_inodes += - decode_import_dir(blp, - oldauth, - dir, // import root - le, - imported_client_map, - mds->mdlog->get_current_segment(), - import_updated_scatterlocks[dir]); - } - dout(10) << " " << m->get_bounds().size() << " imported bounds" << dendl; - - // include bounds in EImportStart - set import_bounds; - cache->get_subtree_bounds(dir, import_bounds); - for (set::iterator it = import_bounds.begin(); - it != import_bounds.end(); - it++) - le->metablob.add_dir(*it, false); // 
note that parent metadata is already in the event - - // adjust popularity - mds->balancer->add_import(dir); - - dout(7) << "handle_export_dir did " << *dir << dendl; - - // log it - mds->mdlog->submit_entry(le, - new C_MDS_ImportDirLoggedStart(this, dir, m->get_source().num())); - - // note state - import_state[dir->dirfrag()] = IMPORT_LOGGINGSTART; - - // some stats - if (mds->logger) { - mds->logger->inc("im"); - mds->logger->inc("iim", num_imported_inodes); - } - - delete m; -} - - -/* - * this is an import helper - * called by import_finish, and import_reverse and friends. - */ -void Migrator::import_remove_pins(CDir *dir, set& bounds) -{ - // root - dir->put(CDir::PIN_IMPORTING); - dir->state_clear(CDir::STATE_IMPORTING); - - // bounds - set didinodes; - for (set::iterator it = bounds.begin(); - it != bounds.end(); - it++) { - CDir *bd = *it; - bd->put(CDir::PIN_IMPORTBOUND); - bd->state_clear(CDir::STATE_IMPORTBOUND); - CInode *bdi = bd->get_inode(); - if (didinodes.count(bdi) == 0) { - bdi->put_stickydirs(); - didinodes.insert(bdi); - } - } -} - - -/* - * note: this does teh full work of reversing and import and cleaning up - * state. - * called by both handle_mds_failure and by handle_resolve (if we are - * a survivor coping with an exporter failure+recovery). - */ -void Migrator::import_reverse(CDir *dir) -{ - dout(7) << "import_reverse " << *dir << dendl; - - set bounds; - cache->get_subtree_bounds(dir, bounds); - - // remove pins - import_remove_pins(dir, bounds); - - // update auth, with possible subtree merge. - assert(dir->is_subtree_root()); - cache->adjust_subtree_auth(dir, import_peer[dir->dirfrag()]); - cache->try_subtree_merge(dir); - - // adjust auth bits. 
- list q; - q.push_back(dir); - while (!q.empty()) { - CDir *cur = q.front(); - q.pop_front(); - - // dir - assert(cur->is_auth()); - cur->state_clear(CDir::STATE_AUTH); - cur->clear_replica_map(); - if (cur->is_dirty()) - cur->mark_clean(); - - CDir::map_t::iterator it; - for (it = cur->begin(); it != cur->end(); it++) { - CDentry *dn = it->second; - - // dentry - dn->state_clear(CDentry::STATE_AUTH); - dn->clear_replica_map(); - if (dn->is_dirty()) - dn->mark_clean(); - - // inode? - if (dn->is_primary()) { - CInode *in = dn->get_inode(); - in->state_clear(CDentry::STATE_AUTH); - in->clear_replica_map(); - if (in->is_dirty()) - in->mark_clean(); - in->authlock.clear_gather(); - in->linklock.clear_gather(); - in->dirfragtreelock.clear_gather(); - in->filelock.clear_gather(); - - // non-bounding dir? - list dfs; - in->get_dirfrags(dfs); - for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) - if (bounds.count(*p) == 0) - q.push_back(*p); - } - } - } - - // log our failure - mds->mdlog->submit_entry(new EImportFinish(dir, false)); // log failure - - // bystanders? - if (import_bystanders[dir].empty()) { - dout(7) << "no bystanders, finishing reverse now" << dendl; - import_reverse_unfreeze(dir); - } else { - // notify them; wait in aborting state - dout(7) << "notifying bystanders of abort" << dendl; - import_notify_abort(dir, bounds); - import_state[dir->dirfrag()] = IMPORT_ABORTING; - } -} - -void Migrator::import_notify_abort(CDir *dir, set& bounds) -{ - dout(7) << "import_notify_abort " << *dir << dendl; - - for (set::iterator p = import_bystanders[dir].begin(); - p != import_bystanders[dir].end(); - ++p) { - // NOTE: the bystander will think i am _only_ auth, because they will have seen - // the exporter's failure and updated the subtree auth. see mdcache->handle_mds_failure(). 
- MExportDirNotify *notify = - new MExportDirNotify(dir->dirfrag(), true, - pair(mds->get_nodeid(), CDIR_AUTH_UNKNOWN), - pair(import_peer[dir->dirfrag()], CDIR_AUTH_UNKNOWN)); - notify->copy_bounds(bounds); - mds->send_message_mds(notify, *p); - } -} - -void Migrator::import_reverse_unfreeze(CDir *dir) -{ - dout(7) << "import_reverse_unfreeze " << *dir << dendl; - dir->unfreeze_tree(); - cache->discard_delayed_expire(dir); - import_reverse_final(dir); -} - -void Migrator::import_reverse_final(CDir *dir) -{ - dout(7) << "import_reverse_final " << *dir << dendl; - - // clean up - import_state.erase(dir->dirfrag()); - import_peer.erase(dir->dirfrag()); - import_bystanders.erase(dir); - import_bound_ls.erase(dir); - import_updated_scatterlocks.erase(dir); - - // send pending import_maps? - mds->mdcache->maybe_send_pending_resolves(); - - cache->show_subtrees(); - //audit(); // this fails, bc we munge up the subtree map during handle_import_map (resolve phase) -} - - -void Migrator::import_logged_start(CDir *dir, int from) -{ - dout(7) << "import_logged " << *dir << dendl; - - // note state - import_state[dir->dirfrag()] = IMPORT_ACKING; - - // send notify's etc. 
- dout(7) << "sending ack for " << *dir << " to old auth mds" << from << dendl; - mds->send_message_mds(new MExportDirAck(dir->dirfrag()), from); - - cache->show_subtrees(); -} - - -void Migrator::handle_export_finish(MExportDirFinish *m) -{ - CDir *dir = cache->get_dirfrag(m->get_dirfrag()); - assert(dir); - dout(7) << "handle_export_finish on " << *dir << dendl; - import_finish(dir); - delete m; -} - -void Migrator::import_finish(CDir *dir) -{ - dout(7) << "import_finish on " << *dir << dendl; - - // log finish - mds->mdlog->submit_entry(new EImportFinish(dir, true)); - - // clear updated scatterlocks - for (list::iterator p = import_updated_scatterlocks[dir].begin(); - p != import_updated_scatterlocks[dir].end(); - ++p) - (*p)->clear_updated(); - - // remove pins - set bounds; - cache->get_subtree_bounds(dir, bounds); - import_remove_pins(dir, bounds); - - // adjust auth, with possible subtree merge. - cache->adjust_subtree_auth(dir, mds->get_nodeid()); - cache->try_subtree_merge(dir); - - // clear import state (we're done!) - import_state.erase(dir->dirfrag()); - import_peer.erase(dir->dirfrag()); - import_bystanders.erase(dir); - import_bound_ls.erase(dir); - import_updated_scatterlocks.erase(dir); - - // process delayed expires - cache->process_delayed_expire(dir); - - // ok now unfreeze (and thus kick waiters) - dir->unfreeze_tree(); - - cache->show_subtrees(); - //audit(); // this fails, bc we munge up the subtree map during handle_import_map (resolve phase) - - // send pending import_maps? - mds->mdcache->maybe_send_pending_resolves(); - - // is it empty? - if (dir->get_size() == 0 && - !dir->inode->is_auth()) { - // reexport! 
- export_empty_import(dir); - } -} - - -void Migrator::decode_import_inode(CDentry *dn, bufferlist::iterator& blp, int oldauth, - map& imported_client_map, - LogSegment *ls, - list& updated_scatterlocks) -{ - dout(15) << "decode_import_inode on " << *dn << dendl; - - inodeno_t ino; - ::_decode_simple(ino, blp); - - bool added = false; - CInode *in = cache->get_inode(ino); - if (!in) { - in = new CInode(mds->mdcache); - added = true; - } else { - in->state_set(CInode::STATE_AUTH); - } - - // state after link -- or not! -sage - set merged_client_caps; - in->decode_import(blp, merged_client_caps, ls); - - // link before state -- or not! -sage - if (dn->inode != in) { - assert(!dn->inode); - dn->dir->link_primary_inode(dn, in); - } - - // add inode? - if (added) { - cache->add_inode(in); - dout(10) << "added " << *in << dendl; - } else { - dout(10) << " had " << *in << dendl; - } - - // clear if dirtyscattered, since we're going to journal this - // but not until we _actually_ finish the import... - if (in->dirlock.is_updated()) - updated_scatterlocks.push_back(&in->dirlock); - - // put in autoscatter list? - // this is conservative, but safe. - if (in->dirlock.get_state() == LOCK_SCATTER) - mds->locker->note_autoscattered(&in->dirlock); - - // adjust replica list - //assert(!in->is_replica(oldauth)); // not true on failed export - in->add_replica( oldauth, CInode::EXPORT_NONCE ); - if (in->is_replica(mds->get_nodeid())) - in->remove_replica(mds->get_nodeid()); - - // caps - for (set::iterator it = merged_client_caps.begin(); - it != merged_client_caps.end(); - it++) { - dout(0) << "merged caps for client" << *it << " on " << *in << dendl; - MClientFileCaps *caps = new MClientFileCaps(MClientFileCaps::OP_REAP, - in->inode, - in->client_caps[*it].get_last_seq(), - in->client_caps[*it].pending(), - in->client_caps[*it].wanted()); - caps->set_mds( oldauth ); // reap from whom? 
- mds->send_message_client_maybe_open(caps, imported_client_map[*it]); - } -} - - -int Migrator::decode_import_dir(bufferlist::iterator& blp, - int oldauth, - CDir *import_root, - EImportStart *le, - map& imported_client_map, - LogSegment *ls, - list& updated_scatterlocks) -{ - // set up dir - dirfrag_t df; - ::_decode_simple(df, blp); - - CInode *diri = cache->get_inode(df.ino); - assert(diri); - CDir *dir = diri->get_or_open_dirfrag(mds->mdcache, df.frag); - assert(dir); - - dout(7) << "decode_import_dir " << *dir << dendl; - - // assimilate state - dir->decode_import(blp); - - // mark (may already be marked from get_or_open_dir() above) - if (!dir->is_auth()) - dir->state_set(CDir::STATE_AUTH); - - // adjust replica list - //assert(!dir->is_replica(oldauth)); // not true on failed export - dir->add_replica(oldauth); - if (dir->is_replica(mds->get_nodeid())) - dir->remove_replica(mds->get_nodeid()); - - // add to journal entry - if (le) - le->metablob.add_dir(dir, - true, // Hmm: dirty=false would be okay in some cases - dir->is_complete()); - - int num_imported = 0; - - // take all waiters on this dir - // NOTE: a pass of imported data is guaranteed to get all of my waiters because - // a replica's presense in my cache implies/forces it's presense in authority's. 
- list waiters; - - dir->take_waiting(CDir::WAIT_ANY, waiters); - for (list::iterator it = waiters.begin(); - it != waiters.end(); - it++) - import_root->add_waiter(CDir::WAIT_UNFREEZE, *it); // UNFREEZE will get kicked both on success or failure - - dout(15) << "doing contents" << dendl; - - // contents - long nden; - ::_decode_simple(nden, blp); - - for (; nden>0; nden--) { - num_imported++; - - // dentry - string dname; - ::_decode_simple(dname, blp); - - CDentry *dn = dir->lookup(dname); - if (!dn) - dn = dir->add_null_dentry(dname); - - dn->decode_import(blp, ls); - - dn->add_replica(oldauth, CDentry::EXPORT_NONCE); - if (dn->is_replica(mds->get_nodeid())) - dn->remove_replica(mds->get_nodeid()); - - dout(15) << "decode_import_dir got " << *dn << dendl; - - // points to... - char icode; - ::_decode_simple(icode, blp); - - if (icode == 'N') { - // null dentry - assert(dn->is_null()); - - // fall thru - } - else if (icode == 'L') { - // remote link - inodeno_t ino; - unsigned char d_type; - ::_decode_simple(ino, blp); - ::_decode_simple(d_type, blp); - if (dn->is_remote()) { - assert(dn->get_remote_ino() == ino); - } else { - dir->link_remote_inode(dn, ino, d_type); - } - } - else if (icode == 'I') { - // inode - decode_import_inode(dn, blp, oldauth, imported_client_map, ls, updated_scatterlocks); - } - - // add dentry to journal entry - if (le) - le->metablob.add_dentry(dn, dn->is_dirty()); - } - - dout(7) << "decode_import_dir done " << *dir << dendl; - return num_imported; -} - - - - - -// authority bystander - -void Migrator::handle_export_notify(MExportDirNotify *m) -{ - CDir *dir = cache->get_dirfrag(m->get_dirfrag()); - - int from = m->get_source().num(); - pair old_auth = m->get_old_auth(); - pair new_auth = m->get_new_auth(); - - if (!dir) { - dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth - << " on missing dir " << m->get_dirfrag() << dendl; - } else if (dir->authority() != old_auth) { - dout(7) << "handle_export_notify old_auth 
was " << dir->authority() - << " != " << old_auth << " -> " << new_auth - << " on " << *dir << dendl; - } else { - dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth - << " on " << *dir << dendl; - // adjust auth - set have; - cache->map_dirfrag_set(m->get_bounds(), have); - cache->adjust_bounded_subtree_auth(dir, have, new_auth); - - // induce a merge? - cache->try_subtree_merge(dir); - } - - // send ack - if (m->wants_ack()) { - mds->send_message_mds(new MExportDirNotifyAck(m->get_dirfrag()), from); - } else { - // aborted. no ack. - dout(7) << "handle_export_notify no ack requested" << dendl; - } - - delete m; -} - - - - - - - - - - - - - diff --git a/branches/sage/ebofs2/mds/Migrator.h b/branches/sage/ebofs2/mds/Migrator.h deleted file mode 100644 index 07a8731868a92..0000000000000 --- a/branches/sage/ebofs2/mds/Migrator.h +++ /dev/null @@ -1,260 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_MIGRATOR_H -#define __MDS_MIGRATOR_H - -#include "include/types.h" - -#include -#include -#include -using std::map; -using std::list; -using std::set; - - -class MDS; -class CDir; -class CInode; -class CDentry; - -class MExportDirDiscover; -class MExportDirDiscoverAck; -class MExportDirCancel; -class MExportDirPrep; -class MExportDirPrepAck; -class MExportDir; -class MExportDirAck; -class MExportDirNotify; -class MExportDirNotifyAck; -class MExportDirFinish; - -class EImportStart; - - -class Migrator { -private: - MDS *mds; - MDCache *cache; - - // -- exports -- -public: - // export stages. used to clean up intelligently if there's a failure. 
- const static int EXPORT_DISCOVERING = 1; // dest is disovering export dir - const static int EXPORT_FREEZING = 2; // we're freezing the dir tree - const static int EXPORT_PREPPING = 4; // sending dest spanning tree to export bounds - const static int EXPORT_WARNING = 5; // warning bystanders of dir_auth_pending - const static int EXPORT_EXPORTING = 6; // sent actual export, waiting for ack - const static int EXPORT_LOGGINGFINISH = 7; // logging EExportFinish - const static int EXPORT_NOTIFYING = 8; // waiting for notifyacks - const static int EXPORT_ABORTING = 9; // notifying bystanders of abort - static const char *get_export_statename(int s) { - switch (s) { - case EXPORT_DISCOVERING: return "discovering"; - case EXPORT_FREEZING: return "freezing"; - case EXPORT_PREPPING: return "prepping"; - case EXPORT_WARNING: return "warning"; - case EXPORT_EXPORTING: return "exporting"; - case EXPORT_LOGGINGFINISH: return "loggingfinish"; - case EXPORT_NOTIFYING: return "notifying"; - case EXPORT_ABORTING: return "aborting"; - default: assert(0); return 0; - } - } - -protected: - // export fun - map export_state; - map export_peer; - //map > export_data; // only during EXPORTING state - map > export_warning_ack_waiting; - map > export_notify_ack_waiting; - - map > export_finish_waiters; - - list< pair > export_queue; - - // -- imports -- -public: - const static int IMPORT_DISCOVERING = 1; // waiting for prep - const static int IMPORT_DISCOVERED = 2; // waiting for prep - const static int IMPORT_PREPPING = 3; // opening dirs on bounds - const static int IMPORT_PREPPED = 4; // opened bounds, waiting for import - const static int IMPORT_LOGGINGSTART = 5; // got import, logging EImportStart - const static int IMPORT_ACKING = 6; // logged EImportStart, sent ack, waiting for finish - const static int IMPORT_ABORTING = 8; // notifying bystanders of an abort before unfreezing - static const char *get_import_statename(int s) { - switch (s) { - case IMPORT_DISCOVERING: return 
"discovering"; - case IMPORT_DISCOVERED: return "discovered"; - case IMPORT_PREPPING: return "prepping"; - case IMPORT_PREPPED: return "prepped"; - case IMPORT_LOGGINGSTART: return "loggingstart"; - case IMPORT_ACKING: return "acking"; - case IMPORT_ABORTING: return "aborting"; - default: assert(0); return 0; - } - } - -protected: - map import_state; // FIXME make these dirfrags - map import_peer; - map > import_bystanders; - map > import_bound_ls; - map > import_updated_scatterlocks; - - /* - // -- hashing madness -- - multimap unhash_waiting; // nodes i am waiting for UnhashDirAck's from - multimap import_hashed_replicate_waiting; // nodes i am waiting to discover to complete my import of a hashed dir - // maps frozen_dir_ino's to waiting-for-discover ino's. - multimap import_hashed_frozen_waiting; // dirs i froze (for the above) - */ - - -public: - // -- cons -- - Migrator(MDS *m, MDCache *c) : mds(m), cache(c) {} - - void dispatch(Message*); - - void show_importing(); - void show_exporting(); - - // -- status -- - int is_exporting(CDir *dir) { - if (export_state.count(dir)) return export_state[dir]; - return 0; - } - bool is_exporting() { return !export_state.empty(); } - int is_importing(dirfrag_t df) { - if (import_state.count(df)) return import_state[df]; - return 0; - } - bool is_importing() { return !import_state.empty(); } - - int get_import_state(dirfrag_t df) { - assert(import_state.count(df)); - return import_state[df]; - } - int get_import_peer(dirfrag_t df) { - assert(import_peer.count(df)); - return import_peer[df]; - } - - int get_export_state(CDir *dir) { - assert(export_state.count(dir)); - return export_state[dir]; - } - // this returns true if we are export @dir, - // and are not waiting for @who to be - // be warned of ambiguous auth. - // only returns meaningful results during EXPORT_WARNING state. 
- bool export_has_warned(CDir *dir, int who) { - assert(is_exporting(dir)); - assert(export_state[dir] == EXPORT_WARNING); - return (export_warning_ack_waiting[dir].count(who) == 0); - } - - - // -- misc -- - void handle_mds_failure_or_stop(int who); - - void audit(); - - // -- import/export -- - // exporter - public: - void export_dir(CDir *dir, int dest); - void export_empty_import(CDir *dir); - - void export_dir_nicely(CDir *dir, int dest); - void maybe_do_queued_export(); - void clear_export_queue() { - export_queue.clear(); - } - - void encode_export_inode(CInode *in, bufferlist& enc_state, - map& exported_client_map); - void finish_export_inode(CInode *in, utime_t now, list& finished); - int encode_export_dir(bufferlist& exportbl, - CDir *dir, - map& exported_client_map, - utime_t now); - void finish_export_dir(CDir *dir, list& finished, utime_t now); - - void add_export_finish_waiter(CDir *dir, Context *c) { - export_finish_waiters[dir].push_back(c); - } - void clear_export_proxy_pins(CDir *dir); - - protected: - void handle_export_discover_ack(MExportDirDiscoverAck *m); - void export_frozen(CDir *dir); - void handle_export_prep_ack(MExportDirPrepAck *m); - void export_go(CDir *dir); - void export_reverse(CDir *dir); - void handle_export_ack(MExportDirAck *m); - void export_logged_finish(CDir *dir); - void handle_export_notify_ack(MExportDirNotifyAck *m); - void export_finish(CDir *dir); - - friend class C_MDC_ExportFreeze; - friend class C_MDS_ExportFinishLogged; - - - // importer - void handle_export_discover(MExportDirDiscover *m); - void handle_export_cancel(MExportDirCancel *m); - void handle_export_prep(MExportDirPrep *m); - void handle_export_dir(MExportDir *m); - -public: - void decode_import_inode(CDentry *dn, bufferlist::iterator& blp, int oldauth, - map& imported_client_map, - LogSegment *ls, - list& updated_scatterlocks); - int decode_import_dir(bufferlist::iterator& blp, - int oldauth, - CDir *import_root, - EImportStart *le, - map& 
imported_client_map, - LogSegment *ls, - list& updated_scatterlocks); - -public: - void import_reverse(CDir *dir); -protected: - void import_remove_pins(CDir *dir, set& bounds); - void import_reverse_unfreeze(CDir *dir); - void import_reverse_final(CDir *dir); - void import_notify_abort(CDir *dir, set& bounds); - void import_logged_start(CDir *dir, int from); - void handle_export_finish(MExportDirFinish *m); -public: - void import_finish(CDir *dir); -protected: - - friend class C_MDS_ImportDirLoggedStart; - friend class C_MDS_ImportDirLoggedFinish; - - // bystander - void handle_export_notify(MExportDirNotify *m); - - -}; - - -#endif diff --git a/branches/sage/ebofs2/mds/ScatterLock.h b/branches/sage/ebofs2/mds/ScatterLock.h deleted file mode 100644 index 24a1361f82d68..0000000000000 --- a/branches/sage/ebofs2/mds/ScatterLock.h +++ /dev/null @@ -1,183 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __SCATTERLOCK_H -#define __SCATTERLOCK_H - -#include "SimpleLock.h" - - -// lock state machine states: -// Sync -- Lock -- sCatter -// Tempsync _/ -// auth repl -#define LOCK_SYNC__ // R . R . rdlocks allowed on auth and replicas -#define LOCK_GLOCKS -20 // r . r . waiting for replicas+rdlocks (auth), or rdlocks to release (replica) -#define LOCK_GSCATTERS -28 // r . r . - -#define LOCK_GSYNCL__ // . w LOCK on replica. -#define LOCK_LOCK__ // . W . . -#define LOCK_GTEMPSYNCL -21 // . w LOCK on replica. - -#define LOCK_GLOCKC -22 // . wp . wp waiting for replicas+wrlocks (auth), or wrlocks to release (replica) -#define LOCK_SCATTER 23 // . Wp . 
WP mtime updates on replicas allowed, no reads. stable here. -#define LOCK_GTEMPSYNCC -24 // . wp . wp GLOCKC|LOCK on replica - -#define LOCK_GSCATTERT -25 // r . LOCK on replica. -#define LOCK_GLOCKT -26 // r . LOCK on replica. -#define LOCK_TEMPSYNC 27 // R . LOCK on replica. - - -inline const char *get_scatterlock_state_name(int s) { - switch(s) { - case LOCK_SYNC: return "Sync"; - case LOCK_GLOCKS: return "gLockS"; - case LOCK_GSCATTERS: return "gScatterS"; - - case LOCK_GSYNCL: return "gSyncL"; - case LOCK_LOCK: return "Lock"; - case LOCK_GTEMPSYNCL: return "gTempsyncL"; - - case LOCK_GLOCKC: return "gLockC"; - case LOCK_SCATTER: return "sCatter"; - case LOCK_GTEMPSYNCC: return "gTempsyncC"; - - case LOCK_GSCATTERT: return "gsCatterT"; - case LOCK_GLOCKT: return "gLockT"; - case LOCK_TEMPSYNC: return "Tempsync"; - - default: assert(0); return 0; - } -} - -class ScatterLock : public SimpleLock { - int num_wrlock; - bool updated; - utime_t last_scatter; - -public: - xlist::item xlistitem_autoscattered; - - ScatterLock(MDSCacheObject *o, int t, int wo) : - SimpleLock(o, t, wo), - num_wrlock(0), - updated(false), - xlistitem_autoscattered(this) {} - - int get_replica_state() { - switch (state) { - case LOCK_SYNC: - return LOCK_SYNC; - - case LOCK_GSCATTERS: // hrm. 
- case LOCK_GLOCKS: - case LOCK_GSYNCL: - case LOCK_LOCK: - case LOCK_GTEMPSYNCL: - case LOCK_GLOCKC: - return LOCK_LOCK; - - case LOCK_SCATTER: - return LOCK_SCATTER; - - case LOCK_GTEMPSYNCC: - case LOCK_GSCATTERT: - case LOCK_GLOCKT: - case LOCK_TEMPSYNC: - return LOCK_LOCK; - default: - assert(0); - return 0; - } - } - - void set_updated() { - if (!updated) { - parent->get(MDSCacheObject::PIN_DIRTYSCATTERED); - updated = true; - } - } - void clear_updated() { - if (updated) { - parent->put(MDSCacheObject::PIN_DIRTYSCATTERED); - updated = false; - parent->clear_dirty_scattered(type); - } - } - bool is_updated() { return updated; } - - void set_last_scatter(utime_t t) { last_scatter = t; } - utime_t get_last_scatter() { return last_scatter; } - - void replicate_relax() { - } - - void export_twiddle() { - clear_gather(); - state = get_replica_state(); - } - - // rdlock - bool can_rdlock(MDRequest *mdr) { - return state == LOCK_SYNC || state == LOCK_TEMPSYNC; - } - bool can_rdlock_soon() { - return state == LOCK_GTEMPSYNCC; - } - - // xlock - bool can_xlock_soon() { - if (parent->is_auth()) - return (state == LOCK_GLOCKC || - state == LOCK_GLOCKS); - else - return false; - } - - // wrlock - bool can_wrlock() { - return state == LOCK_SCATTER || state == LOCK_LOCK; - } - void get_wrlock() { - assert(can_wrlock()); - if (num_wrlock == 0) parent->get(MDSCacheObject::PIN_LOCK); - ++num_wrlock; - } - void put_wrlock() { - --num_wrlock; - if (num_wrlock == 0) parent->put(MDSCacheObject::PIN_LOCK); - } - bool is_wrlocked() { return num_wrlock > 0; } - int get_num_wrlocks() { return num_wrlock; } - - void print(ostream& out) { - out << "("; - out << get_lock_type_name(get_type()) << " "; - out << get_scatterlock_state_name(get_state()); - if (!get_gather_set().empty()) out << " g=" << get_gather_set(); - if (is_rdlocked()) - out << " r=" << get_num_rdlocks(); - if (is_xlocked()) - out << " x=" << get_xlocked_by(); - if (is_wrlocked()) - out << " wr=" << get_num_wrlocks(); - 
if (updated) - out << " updated"; - out << ")"; - } - -}; - -#endif diff --git a/branches/sage/ebofs2/mds/Server.cc b/branches/sage/ebofs2/mds/Server.cc deleted file mode 100644 index 0c2559af324b7..0000000000000 --- a/branches/sage/ebofs2/mds/Server.cc +++ /dev/null @@ -1,3975 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "MDS.h" -#include "Server.h" -#include "Locker.h" -#include "MDCache.h" -#include "MDLog.h" -#include "Migrator.h" -#include "MDBalancer.h" -#include "AnchorClient.h" -#include "IdAllocator.h" - -#include "msg/Messenger.h" - -#include "messages/MClientSession.h" -#include "messages/MClientRequest.h" -#include "messages/MClientReply.h" -#include "messages/MClientReconnect.h" -#include "messages/MClientFileCaps.h" - -#include "messages/MMDSSlaveRequest.h" - -#include "messages/MLock.h" - -#include "messages/MDentryUnlink.h" - -#include "events/EString.h" -#include "events/EUpdate.h" -#include "events/ESlaveUpdate.h" -#include "events/ESession.h" -#include "events/EOpen.h" - -#include "include/filepath.h" -#include "common/Timer.h" -#include "common/Logger.h" -#include "common/LogType.h" - -#include -#include - -#include -#include -using namespace std; - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".server " -#define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) *_derr << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".server " - - -void Server::reopen_logger(utime_t start, bool append) -{ - static LogType mdserver_logtype; - static 
bool didit = false; - if (!didit) { - didit = true; - mdserver_logtype.add_inc("hcreq"); // handle client req - mdserver_logtype.add_inc("hsreq"); // slave - mdserver_logtype.add_inc("hcsess"); // client session - mdserver_logtype.add_inc("dcreq"); // dispatch client req - mdserver_logtype.add_inc("dsreq"); // slave - } - - if (logger) - delete logger; - - // logger - char name[80]; - sprintf(name, "mds%d.server", mds->get_nodeid()); - logger = new Logger(name, &mdserver_logtype, append); - logger->set_start(start); -} - - -void Server::dispatch(Message *m) -{ - switch (m->get_type()) { - case MSG_CLIENT_RECONNECT: - handle_client_reconnect((MClientReconnect*)m); - return; - } - - // active? - if (!mds->is_active()) { - dout(3) << "not active yet, waiting" << dendl; - mds->wait_for_active(new C_MDS_RetryMessage(mds, m)); - return; - } - - switch (m->get_type()) { - case MSG_CLIENT_SESSION: - handle_client_session((MClientSession*)m); - return; - case MSG_CLIENT_REQUEST: - handle_client_request((MClientRequest*)m); - return; - case MSG_MDS_SLAVE_REQUEST: - handle_slave_request((MMDSSlaveRequest*)m); - return; - } - - dout(1) << "server unknown message " << m->get_type() << dendl; - assert(0); -} - - - -// ---------------------------------------------------------- -// SESSION management - - -class C_MDS_session_finish : public Context { - MDS *mds; - entity_inst_t client_inst; - bool open; - version_t cmapv; -public: - C_MDS_session_finish(MDS *m, entity_inst_t ci, bool s, version_t mv) : - mds(m), client_inst(ci), open(s), cmapv(mv) { } - void finish(int r) { - assert(r == 0); - mds->server->_session_logged(client_inst, open, cmapv); - } -}; - - -void Server::handle_client_session(MClientSession *m) -{ - dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl; - int from = m->get_source().num(); - bool open = m->op == MClientSession::OP_REQUEST_OPEN; - - if (open) { - if (mds->clientmap.have_session(from)) { - dout(10) << "already open, 
dropping this req" << dendl; - delete m; - return; - } - if (mds->clientmap.is_opening(from)) { - dout(10) << "already opening, dropping this req" << dendl; - delete m; - return; - } - mds->clientmap.add_opening(from); - } else { - if (mds->clientmap.is_closing(from)) { - dout(10) << "already closing, dropping this req" << dendl; - delete m; - return; - } - if (m->seq < mds->clientmap.get_push_seq(from)) { - dout(10) << "old push seq " << m->seq << " < " << mds->clientmap.get_push_seq(from) - << ", dropping" << dendl; - delete m; - return; - } - assert(m->seq == mds->clientmap.get_push_seq(from)); - - mds->clientmap.add_closing(from); - } - - // journal it - version_t cmapv = mds->clientmap.inc_projected(); - mdlog->submit_entry(new ESession(m->get_source_inst(), open, cmapv), - new C_MDS_session_finish(mds, m->get_source_inst(), open, cmapv)); - delete m; - - if (logger) logger->inc("hcsess"); -} - -void Server::_session_logged(entity_inst_t client_inst, bool open, version_t cmapv) -{ - dout(10) << "_session_logged " << client_inst << " " << (open ? "open":"close") - << " " << cmapv - << dendl; - - // apply - int from = client_inst.name.num(); - if (open) { - assert(mds->clientmap.is_opening(from)); - mds->clientmap.open_session(client_inst); - } else { - assert(mds->clientmap.is_closing(from)); - mds->clientmap.close_session(from); - - // purge completed requests from clientmap - mds->clientmap.trim_completed_requests(from, 0); - } - - assert(cmapv == mds->clientmap.get_version()); - - // reply - if (open) - mds->messenger->send_message(new MClientSession(MClientSession::OP_OPEN), client_inst); - else - mds->messenger->send_message(new MClientSession(MClientSession::OP_CLOSE), client_inst); -} - - -void Server::terminate_sessions() -{ - dout(2) << "terminate_sessions" << dendl; - - // kill them off. clients will retry etc. 
- for (set::const_iterator p = mds->clientmap.get_session_set().begin(); - p != mds->clientmap.get_session_set().end(); - ++p) { - if (mds->clientmap.is_closing(*p)) - continue; - mds->clientmap.add_closing(*p); - version_t cmapv = mds->clientmap.inc_projected(); - mdlog->submit_entry(new ESession(mds->clientmap.get_inst(*p), false, cmapv), - new C_MDS_session_finish(mds, mds->clientmap.get_inst(*p), false, cmapv)); - } -} - - -void Server::reconnect_clients() -{ - // reconnect with clients - if (mds->clientmap.get_session_set().empty()) { - dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl; - reconnect_gather_finish(); - return; - } - - dout(7) << "reconnect_clients -- sending mdsmap to clients with sessions" << dendl; - - mds->bcast_mds_map(); // send mdsmap to all client sessions - - // init gather list - reconnect_start = g_clock.now(); - client_reconnect_gather = mds->clientmap.get_session_set(); - dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl; -} - -void Server::handle_client_reconnect(MClientReconnect *m) -{ - dout(7) << "handle_client_reconnect " << m->get_source() << dendl; - int from = m->get_source().num(); - - if (m->closed) { - dout(7) << " client had no session, removing from clientmap" << dendl; - - mds->clientmap.add_closing(from); - version_t cmapv = mds->clientmap.inc_projected(); - mdlog->submit_entry(new ESession(mds->clientmap.get_inst(from), false, cmapv), - new C_MDS_session_finish(mds, mds->clientmap.get_inst(from), false, cmapv)); - - } else { - - // caps - for (map::iterator p = m->inode_caps.begin(); - p != m->inode_caps.end(); - ++p) { - CInode *in = mdcache->get_inode(p->first); - if (in && in->is_auth()) { - // we recovered it, and it's ours. take note. 
- dout(15) << "open caps on " << *in << dendl; - in->reconnect_cap(from, p->second); - reconnected_caps.insert(in); - continue; - } - - filepath path = m->inode_path[p->first]; - if ((in && !in->is_auth()) || - !mds->mdcache->path_is_mine(path)) { - // not mine. - dout(0) << "non-auth " << p->first << " " << m->inode_path[p->first] - << ", will pass off to authority" << dendl; - - // mark client caps stale. - inode_t fake_inode; - fake_inode.ino = p->first; - MClientFileCaps *stale = new MClientFileCaps(MClientFileCaps::OP_STALE, - fake_inode, - 0, - 0, // doesn't matter. - p->second.wanted); // doesn't matter. - mds->send_message_client(stale, m->get_source_inst()); - - // add to cap export list. - mdcache->rejoin_export_caps(p->first, m->inode_path[p->first], from, p->second); - } else { - // mine. fetch later. - dout(0) << "missing " << p->first << " " << m->inode_path[p->first] - << " (mine), will load later" << dendl; - mdcache->rejoin_recovered_caps(p->first, m->inode_path[p->first], from, p->second, - -1); // "from" me. - } - } - } - - // remove from gather set - client_reconnect_gather.erase(from); - if (client_reconnect_gather.empty()) reconnect_gather_finish(); - - delete m; -} - -/* - * called by mdcache, late in rejoin (right before acks are sent) - */ -void Server::process_reconnected_caps() -{ - dout(10) << "process_reconnected_caps" << dendl; - - // adjust filelock state appropriately - for (set::iterator p = reconnected_caps.begin(); - p != reconnected_caps.end(); - ++p) { - CInode *in = *p; - int issued = in->get_caps_issued(); - if (in->is_auth()) { - // wr? - if (issued & (CAP_FILE_WR|CAP_FILE_WRBUFFER)) { - if (issued & (CAP_FILE_RDCACHE|CAP_FILE_WRBUFFER)) { - in->filelock.set_state(LOCK_LONER); - } else { - in->filelock.set_state(LOCK_MIXED); - } - } - } else { - // note that client should perform stale/reap cleanup during reconnect. - assert(issued & (CAP_FILE_WR|CAP_FILE_WRBUFFER) == 0); // ???? 
- if (in->filelock.is_xlocked()) - in->filelock.set_state(LOCK_LOCK); - else - in->filelock.set_state(LOCK_SYNC); // might have been lock, previously - } - dout(15) << " issued " << cap_string(issued) - << " chose " << in->filelock - << " on " << *in << dendl; - } - reconnected_caps.clear(); // clean up -} - - -void Server::client_reconnect_failure(int from) -{ - dout(5) << "client_reconnect_failure on client" << from << dendl; - if (mds->is_reconnect() && - client_reconnect_gather.count(from)) { - client_reconnect_gather.erase(from); - if (client_reconnect_gather.empty()) - reconnect_gather_finish(); - } -} - -void Server::reconnect_gather_finish() -{ - dout(7) << "reconnect_gather_finish" << dendl; - mds->reconnect_done(); -} - - - -/******* - * some generic stuff for finishing off requests - */ - - -/* - * send generic response (just and error code) - */ -void Server::reply_request(MDRequest *mdr, int r, CInode *tracei) -{ - reply_request(mdr, new MClientReply(mdr->client_request, r), tracei); -} - - -/* - * send given reply - * include a trace to tracei - */ -void Server::reply_request(MDRequest *mdr, MClientReply *reply, CInode *tracei) -{ - MClientRequest *req = mdr->client_request; - - dout(10) << "reply_request " << reply->get_result() - << " (" << strerror(-reply->get_result()) - << ") " << *req << dendl; - - // note result code in clientmap? 
- if (!req->is_idempotent()) - mds->clientmap.add_completed_request(mdr->reqid); - - /* - if (tracei && !tracei->hack_accessed) { - tracei->hack_accessed = true; - mds->logger->inc("newt"); - if (tracei->parent && - tracei->parent->dir->hack_num_accessed >= 0) { - tracei->parent->dir->hack_num_accessed++; - if (tracei->parent->dir->hack_num_accessed == 1) - mds->logger->inc("dirt1"); - if (tracei->parent->dir->hack_num_accessed == 2) - mds->logger->inc("dirt2"); - if (tracei->parent->dir->hack_num_accessed == 3) - mds->logger->inc("dirt3"); - if (tracei->parent->dir->hack_num_accessed == 4) - mds->logger->inc("dirt4"); - if (tracei->parent->dir->hack_num_accessed == 5) - mds->logger->inc("dirt5"); - } - } - */ - - // include trace - if (tracei) { - reply->set_trace_dist( tracei, mds->get_nodeid() ); - } - - // send reply - messenger->send_message(reply, req->get_client_inst()); - - // finish request - mdcache->request_finish(mdr); -} - - - - - -/*** - * process a client request - */ -void Server::handle_client_request(MClientRequest *req) -{ - dout(4) << "handle_client_request " << *req << dendl; - int client = req->get_client(); - - if (logger) logger->inc("hcreq"); - - if (!mds->is_active()) { - dout(5) << " not active, discarding client request." << dendl; - delete req; - return; - } - - if (!mdcache->get_root()) { - dout(5) << "need to open root" << dendl; - mdcache->open_root(new C_MDS_RetryMessage(mds, req)); - return; - } - - // active session? - if (!mds->clientmap.have_session(client)) { - dout(5) << "no session for client" << client << ", dropping" << dendl; - delete req; - return; - } - - - // okay, i want - CInode *ref = 0; - - // retry? 
- if (req->get_retry_attempt()) { - if (mds->clientmap.have_completed_request(req->get_reqid())) { - dout(5) << "already completed " << req->get_reqid() << dendl; - mds->messenger->send_message(new MClientReply(req, 0), req->get_client_inst()); - delete req; - return; - } - } - // trim completed_request list - if (req->get_oldest_client_tid() > 0) { - dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl; - mds->clientmap.trim_completed_requests(client, - req->get_oldest_client_tid()); - } - - - // ----- - // some ops are on ino's - switch (req->get_op()) { - case MDS_OP_FSTAT: - ref = mdcache->get_inode(req->args.fstat.ino); - assert(ref); - break; - - case MDS_OP_TRUNCATE: - if (!req->args.truncate.ino) - break; // can be called w/ either fh OR path - ref = mdcache->get_inode(req->args.truncate.ino); - assert(ref); - break; - - case MDS_OP_FSYNC: - ref = mdcache->get_inode(req->args.fsync.ino); // fixme someday no ino needed? - assert(ref); - break; - } - - // register + dispatch - MDRequest *mdr = mdcache->request_start(req); - if (!mdr) return; - - if (ref) { - dout(10) << "inode op on ref " << *ref << dendl; - mdr->ref = ref; - mdr->pin(ref); - } - - dispatch_client_request(mdr); - return; -} - - -void Server::dispatch_client_request(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - if (logger) logger->inc("dcreq"); - - if (mdr->ref) { - dout(7) << "dispatch_client_request " << *req << " ref " << *mdr->ref << dendl; - } else { - dout(7) << "dispatch_client_request " << *req << dendl; - } - - // we shouldn't be waiting on anyone. - assert(mdr->more()->waiting_on_slave.empty()); - - switch (req->get_op()) { - - // inodes ops. 
- case MDS_OP_STAT: - case MDS_OP_LSTAT: - handle_client_stat(mdr); - break; - case MDS_OP_UTIME: - handle_client_utime(mdr); - break; - case MDS_OP_CHMOD: - handle_client_chmod(mdr); - break; - case MDS_OP_CHOWN: - handle_client_chown(mdr); - break; - case MDS_OP_TRUNCATE: - handle_client_truncate(mdr); - break; - case MDS_OP_READDIR: - handle_client_readdir(mdr); - break; - case MDS_OP_FSYNC: - //handle_client_fsync(req, ref); - break; - - // funky. - case MDS_OP_OPEN: - if (req->args.open.flags & O_CREAT) - handle_client_openc(mdr); - else - handle_client_open(mdr); - break; - - // namespace. - // no prior locks. - case MDS_OP_MKNOD: - handle_client_mknod(mdr); - break; - case MDS_OP_LINK: - handle_client_link(mdr); - break; - case MDS_OP_UNLINK: - case MDS_OP_RMDIR: - handle_client_unlink(mdr); - break; - case MDS_OP_RENAME: - handle_client_rename(mdr); - break; - case MDS_OP_MKDIR: - handle_client_mkdir(mdr); - break; - case MDS_OP_SYMLINK: - handle_client_symlink(mdr); - break; - - - default: - dout(1) << " unknown client op " << req->get_op() << dendl; - assert(0); - } -} - - -// --------------------------------------- -// SLAVE REQUESTS - -void Server::handle_slave_request(MMDSSlaveRequest *m) -{ - dout(4) << "handle_slave_request " << m->get_reqid() << " from " << m->get_source() << dendl; - int from = m->get_source().num(); - - if (logger) logger->inc("hsreq"); - - // reply? 
- if (m->is_reply()) { - - switch (m->get_op()) { - case MMDSSlaveRequest::OP_XLOCKACK: - { - // identify lock, master request - SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(), - m->get_object_info()); - MDRequest *mdr = mdcache->request_get(m->get_reqid()); - mdr->more()->slaves.insert(from); - dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl; - mdr->xlocks.insert(lock); - mdr->locks.insert(lock); - lock->get_xlock(mdr); - lock->finish_waiters(SimpleLock::WAIT_REMOTEXLOCK); - } - break; - - case MMDSSlaveRequest::OP_AUTHPINACK: - { - MDRequest *mdr = mdcache->request_get(m->get_reqid()); - handle_slave_auth_pin_ack(mdr, m); - } - break; - - case MMDSSlaveRequest::OP_LINKPREPACK: - { - MDRequest *mdr = mdcache->request_get(m->get_reqid()); - handle_slave_link_prep_ack(mdr, m); - } - break; - - case MMDSSlaveRequest::OP_RENAMEPREPACK: - { - MDRequest *mdr = mdcache->request_get(m->get_reqid()); - handle_slave_rename_prep_ack(mdr, m); - } - break; - - default: - assert(0); - } - - // done with reply. - delete m; - return; - - } else { - // am i a new slave? - MDRequest *mdr; - if (mdcache->have_request(m->get_reqid())) { - // existing? - mdr = mdcache->request_get(m->get_reqid()); - if (mdr->slave_to_mds != from) { // may not even be a slave! (e.g. forward race) - dout(10) << "local request " << *mdr << " not slave to mds" << from - << ", ignoring " << *m << dendl; - delete m; - return; - } - } else { - // new? - if (m->get_op() == MMDSSlaveRequest::OP_FINISH) { - dout(10) << "missing slave request for " << m->get_reqid() - << " OP_FINISH, must have lost race with a forward" << dendl; - delete m; - return; - } - mdr = mdcache->request_start_slave(m->get_reqid(), m->get_source().num()); - } - assert(mdr->slave_request == 0); // only one at a time, please! 
- mdr->slave_request = m; - - dispatch_slave_request(mdr); - } -} - -void Server::dispatch_slave_request(MDRequest *mdr) -{ - dout(7) << "dispatch_slave_request " << *mdr << " " << *mdr->slave_request << dendl; - - if (mdr->aborted) { - dout(7) << " abort flag set, finishing" << dendl; - mdcache->request_finish(mdr); - return; - } - - if (logger) logger->inc("dsreq"); - - switch (mdr->slave_request->get_op()) { - case MMDSSlaveRequest::OP_XLOCK: - { - // identify object - SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(), - mdr->slave_request->get_object_info()); - - if (lock && lock->get_parent()->is_auth()) { - // xlock. - // use acquire_locks so that we get auth_pinning. - set rdlocks; - set wrlocks; - set xlocks = mdr->xlocks; - xlocks.insert(lock); - - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - // ack - MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_XLOCKACK); - r->set_lock_type(lock->get_type()); - lock->get_parent()->set_object_info(r->get_object_info()); - mds->send_message_mds(r, mdr->slave_request->get_source().num()); - } else { - if (lock) { - dout(10) << "not auth for remote xlock attempt, dropping on " - << *lock << " on " << *lock->get_parent() << dendl; - } else { - dout(10) << "don't have object, dropping" << dendl; - assert(0); // can this happen, if we auth pinned properly. - } - } - - // done. - delete mdr->slave_request; - mdr->slave_request = 0; - } - break; - - case MMDSSlaveRequest::OP_UNXLOCK: - { - SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(), - mdr->slave_request->get_object_info()); - assert(lock); - mds->locker->xlock_finish(lock, mdr); - - // done. no ack necessary. 
- delete mdr->slave_request; - mdr->slave_request = 0; - } - break; - - case MMDSSlaveRequest::OP_AUTHPIN: - handle_slave_auth_pin(mdr); - break; - - case MMDSSlaveRequest::OP_LINKPREP: - case MMDSSlaveRequest::OP_UNLINKPREP: - handle_slave_link_prep(mdr); - break; - - case MMDSSlaveRequest::OP_RENAMEPREP: - handle_slave_rename_prep(mdr); - break; - - case MMDSSlaveRequest::OP_FINISH: - // finish off request. - mdcache->request_finish(mdr); - break; - - default: - assert(0); - } -} - - -void Server::handle_slave_auth_pin(MDRequest *mdr) -{ - dout(10) << "handle_slave_auth_pin " << *mdr << dendl; - - // build list of objects - list objects; - bool fail = false; - - for (list::iterator p = mdr->slave_request->get_authpins().begin(); - p != mdr->slave_request->get_authpins().end(); - ++p) { - MDSCacheObject *object = mdcache->get_object(*p); - if (!object) { - dout(10) << " don't have " << *p << dendl; - fail = true; - break; - } - - objects.push_back(object); - } - - // can we auth pin them? - if (!fail) { - for (list::iterator p = objects.begin(); - p != objects.end(); - ++p) { - if (!(*p)->is_auth()) { - dout(10) << " not auth for " << **p << dendl; - fail = true; - break; - } - if (!mdr->is_auth_pinned(*p) && - !(*p)->can_auth_pin()) { - // wait - dout(10) << " waiting for authpinnable on " << **p << dendl; - (*p)->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); - mdr->drop_local_auth_pins(); - return; - } - } - } - - // auth pin! - if (fail) { - mdr->drop_local_auth_pins(); // just in case - } else { - for (list::iterator p = objects.begin(); - p != objects.end(); - ++p) { - dout(10) << "auth_pinning " << **p << dendl; - mdr->auth_pin(*p); - } - } - - // ack! 
- MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_AUTHPINACK); - - // return list of my auth_pins (if any) - for (set::iterator p = mdr->auth_pins.begin(); - p != mdr->auth_pins.end(); - ++p) { - MDSCacheObjectInfo info; - (*p)->set_object_info(info); - reply->get_authpins().push_back(info); - } - - mds->send_message_mds(reply, mdr->slave_to_mds); - - // clean up this request - delete mdr->slave_request; - mdr->slave_request = 0; - return; -} - -void Server::handle_slave_auth_pin_ack(MDRequest *mdr, MMDSSlaveRequest *ack) -{ - dout(10) << "handle_slave_auth_pin_ack on " << *mdr << " " << *ack << dendl; - int from = ack->get_source().num(); - - // added auth pins? - set pinned; - for (list::iterator p = ack->get_authpins().begin(); - p != ack->get_authpins().end(); - ++p) { - MDSCacheObject *object = mdcache->get_object(*p); - assert(object); // we pinned it - dout(10) << " remote has pinned " << *object << dendl; - if (!mdr->is_auth_pinned(object)) - mdr->remote_auth_pins.insert(object); - pinned.insert(object); - } - - // removed auth pins? - set::iterator p = mdr->remote_auth_pins.begin(); - while (p != mdr->remote_auth_pins.end()) { - if ((*p)->authority().first == from && - pinned.count(*p) == 0) { - dout(10) << " remote has unpinned " << **p << dendl; - set::iterator o = p; - ++p; - mdr->remote_auth_pins.erase(o); - } else { - ++p; - } - } - - // note slave - mdr->more()->slaves.insert(from); - - // clear from waiting list - assert(mdr->more()->waiting_on_slave.count(from)); - mdr->more()->waiting_on_slave.erase(from); - - // go again? - if (mdr->more()->waiting_on_slave.empty()) - dispatch_client_request(mdr); - else - dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl; -} - - -// --------------------------------------- -// HELPERS - - -/** validate_dentry_dir - * - * verify that the dir exists and would own the dname. - * do not check if the dentry exists. 
- */ -CDir *Server::validate_dentry_dir(MDRequest *mdr, CInode *diri, const string& dname) -{ - // make sure parent is a dir? - if (!diri->is_dir()) { - dout(7) << "validate_dentry_dir: not a dir" << dendl; - reply_request(mdr, -ENOTDIR); - return false; - } - - // which dirfrag? - frag_t fg = diri->pick_dirfrag(dname); - CDir *dir = try_open_auth_dirfrag(diri, fg, mdr); - if (!dir) - return 0; - - // frozen? - if (dir->is_frozen()) { - dout(7) << "dir is frozen " << *dir << dendl; - dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - - return dir; -} - - -/** prepare_null_dentry - * prepare a null (or existing) dentry in given dir. - * wait for any dn lock. - */ -CDentry* Server::prepare_null_dentry(MDRequest *mdr, CDir *dir, const string& dname, bool okexist) -{ - dout(10) << "prepare_null_dentry " << dname << " in " << *dir << dendl; - assert(dir->is_auth()); - - // does it already exist? - CDentry *dn = dir->lookup(dname); - if (dn) { - if (dn->lock.is_xlocked_by_other(mdr)) { - dout(10) << "waiting on xlocked dentry " << *dn << dendl; - dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - - if (!dn->is_null()) { - // name already exists - dout(10) << "dentry " << dname << " exists in " << *dir << dendl; - if (!okexist) { - reply_request(mdr, -EEXIST); - return 0; - } - } - - return dn; - } - - // make sure dir is complete - if (!dir->is_complete()) { - dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl; - dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - - // create - dn = dir->add_null_dentry(dname); - dn->mark_new(); - dout(10) << "prepare_null_dentry added " << *dn << dendl; - - return dn; -} - - -/** prepare_new_inode - * - * create a new inode. set c/m/atime. hit dir pop. 
- */ -CInode* Server::prepare_new_inode(MDRequest *mdr, CDir *dir) -{ - CInode *in = mdcache->create_inode(); - in->inode.uid = mdr->client_request->get_caller_uid(); - in->inode.gid = mdr->client_request->get_caller_gid(); - in->inode.ctime = in->inode.mtime = in->inode.atime = mdr->now; // now - dout(10) << "prepare_new_inode " << *in << dendl; - - return in; -} - - - -CDir *Server::traverse_to_auth_dir(MDRequest *mdr, vector &trace, filepath refpath) -{ - // figure parent dir vs dname - if (refpath.depth() == 0) { - dout(7) << "can't do that to root" << dendl; - reply_request(mdr, -EINVAL); - return 0; - } - string dname = refpath.last_dentry(); - refpath.pop_dentry(); - - dout(10) << "traverse_to_auth_dir dirpath " << refpath << " dname " << dname << dendl; - - // traverse to parent dir - int r = mdcache->path_traverse(mdr, mdr->client_request, - 0, refpath, trace, true, - MDS_TRAVERSE_FORWARD); - if (r > 0) return 0; // delayed - if (r < 0) { - reply_request(mdr, r); - return 0; - } - - // open inode - CInode *diri; - if (trace.empty()) - diri = mdcache->get_root(); - else - diri = mdcache->get_dentry_inode(trace[trace.size()-1], mdr); - if (!diri) - return 0; // opening inode. - - // is it an auth dir? - CDir *dir = validate_dentry_dir(mdr, diri, dname); - if (!dir) - return 0; // forwarded or waiting for freeze - - dout(10) << "traverse_to_auth_dir " << *dir << dendl; - return dir; -} - - - -CInode* Server::rdlock_path_pin_ref(MDRequest *mdr, bool want_auth) -{ - // already got ref? 
- if (mdr->ref) - return mdr->ref; - - MClientRequest *req = mdr->client_request; - - // traverse - filepath refpath = req->get_filepath(); - vector trace; - int r = mdcache->path_traverse(mdr, req, - 0, refpath, - trace, req->follow_trailing_symlink(), - MDS_TRAVERSE_FORWARD); - if (r > 0) return false; // delayed - if (r < 0) { // error - reply_request(mdr, r); - return 0; - } - - // open ref inode - CInode *ref = 0; - if (trace.empty()) - ref = mdcache->get_root(); - else { - CDentry *dn = trace[trace.size()-1]; - - // if no inode (null or unattached remote), fw to dentry auth? - if (want_auth && !dn->is_auth() && - (dn->is_null() || - (dn->is_remote() && dn->inode))) { - if (dn->is_ambiguous_auth()) { - dout(10) << "waiting for single auth on " << *dn << dendl; - dn->dir->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr)); - } else { - dout(10) << "fw to auth for " << *dn << dendl; - mdcache->request_forward(mdr, dn->authority().first); - return 0; - } - } - - // open ref inode - ref = mdcache->get_dentry_inode(dn, mdr); - if (!ref) return 0; - } - dout(10) << "ref is " << *ref << dendl; - - // fw to inode auth? - if (want_auth && !ref->is_auth()) { - if (ref->is_ambiguous_auth()) { - dout(10) << "waiting for single auth on " << *ref << dendl; - ref->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr)); - } else { - dout(10) << "fw to auth for " << *ref << dendl; - mdcache->request_forward(mdr, ref->authority().first); - } - return 0; - } - - // auth_pin? 
- if (want_auth) { - if (ref->is_frozen()) { - dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl; - ref->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - mdr->auth_pin(ref); - } - - // lock the path - set rdlocks, empty; - - for (int i=0; i<(int)trace.size(); i++) - rdlocks.insert(&trace[i]->lock); - - if (!mds->locker->acquire_locks(mdr, rdlocks, empty, empty)) - return 0; - - // set and pin ref - mdr->pin(ref); - mdr->ref = ref; - - // save the locked trace. - mdr->trace.swap(trace); - - return ref; -} - - -/** rdlock_path_xlock_dentry - * traverse path to the directory that could/would contain dentry. - * make sure i am auth for that dentry, forward as necessary. - * create null dentry in place (or use existing if okexist). - * get rdlocks on traversed dentries, xlock on new dentry. - */ -CDentry* Server::rdlock_path_xlock_dentry(MDRequest *mdr, bool okexist, bool mustexist) -{ - MClientRequest *req = mdr->client_request; - - vector trace; - CDir *dir = traverse_to_auth_dir(mdr, trace, req->get_filepath()); - if (!dir) return 0; - dout(10) << "rdlock_path_xlock_dentry dir " << *dir << dendl; - - // make sure we can auth_pin (or have already authpinned) dir - if (dir->is_frozen()) { - dout(7) << "waiting for !frozen/authpinnable on " << *dir << dendl; - dir->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - - // make a null dentry? - const string &dname = req->get_filepath().last_dentry(); - CDentry *dn; - if (mustexist) { - dn = dir->lookup(dname); - - // make sure dir is complete - if (!dn && !dir->is_complete()) { - dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl; - dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - - // readable? 
- if (dn && dn->lock.is_xlocked_by_other(mdr)) { - dout(10) << "waiting on xlocked dentry " << *dn << dendl; - dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - - // exists? - if (!dn || dn->is_null()) { - dout(7) << "dentry " << dname << " dne in " << *dir << dendl; - reply_request(mdr, -ENOENT); - return 0; - } - } else { - dn = prepare_null_dentry(mdr, dir, dname, okexist); - if (!dn) - return 0; - } - - // -- lock -- - set rdlocks, wrlocks, xlocks; - - for (int i=0; i<(int)trace.size(); i++) - rdlocks.insert(&trace[i]->lock); - if (dn->is_null()) { - xlocks.insert(&dn->lock); // new dn, xlock - wrlocks.insert(&dn->dir->inode->dirlock); // also, wrlock on dir mtime - } else - rdlocks.insert(&dn->lock); // existing dn, rdlock - - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return 0; - - // save the locked trace. - mdr->trace.swap(trace); - - return dn; -} - - - - - -/** - * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth - * - * @diri base indoe - * @fg the exact frag we want - * @mdr request - */ -CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequest *mdr) -{ - CDir *dir = diri->get_dirfrag(fg); - - // not open and inode not mine? - if (!dir && !diri->is_auth()) { - int inauth = diri->authority().first; - dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds" << inauth << dendl; - mdcache->request_forward(mdr, inauth); - return 0; - } - - // not open and inode frozen? - if (!dir && diri->is_frozen_dir()) { - dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl; - assert(diri->get_parent_dir()); - diri->get_parent_dir()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - - // invent? - if (!dir) - dir = diri->get_or_open_dirfrag(mds->mdcache, fg); - - // am i auth for the dirfrag? 
- if (!dir->is_auth()) { - int auth = dir->authority().first; - dout(7) << "try_open_auth_dirfrag: not auth for " << *dir - << ", fw to mds" << auth << dendl; - mdcache->request_forward(mdr, auth); - return 0; - } - - return dir; -} - - - -/** predirty_dn_diri - * predirty the directory inode for a new dentry, if it is auth (and not root) - * BUG: root inode doesn't get dirtied properly, currently. blech. - */ -version_t Server::predirty_dn_diri(MDRequest *mdr, CDentry *dn, EMetaBlob *blob) -{ - version_t dirpv = 0; - CInode *diri = dn->dir->inode; - - if (diri->is_root()) return 0; - - if (diri->is_auth()) { - assert(mdr->wrlocks.count(&diri->dirlock)); - - dirpv = diri->pre_dirty(); - dout(10) << "predirty_dn_diri ctime/mtime " << mdr->now << " pv " << dirpv << " on " << *diri << dendl; - - // predirty+journal - inode_t *pi = diri->project_inode(); - if (dirpv) pi->version = dirpv; - pi->ctime = pi->mtime = mdr->now; - blob->add_dir_context(diri->get_parent_dn()->get_dir()); - blob->add_primary_dentry(diri->get_parent_dn(), true, 0, pi); - } else { - // journal the mtime change anyway. - inode_t *ji = blob->add_primary_dentry(diri->get_parent_dn(), true); - ji->ctime = ji->mtime = mdr->now; - - dout(10) << "predirty_dn_diri (non-auth) ctime/mtime " << mdr->now << " on " << *diri << dendl; - - blob->add_dirtied_inode_mtime(diri->ino(), mdr->now); - assert(mdr->ls); - mdr->ls->dirty_inode_mtimes.push_back(&diri->xlist_dirty_inode_mtime); - } - - return dirpv; -} - -/** dirty_dn_diri - * follow-up with actual dirty of inode after journal entry commits. - */ -void Server::dirty_dn_diri(MDRequest *mdr, CDentry *dn, version_t dirpv) -{ - CInode *diri = dn->dir->inode; - - if (diri->is_root()) return; - - if (dirpv) { - // we journaled and predirtied. 
- assert(diri->is_auth() && !diri->is_root()); - diri->pop_and_dirty_projected_inode(mdr->ls); - dout(10) << "dirty_dn_diri ctime/mtime " << mdr->now << " v " << diri->inode.version << " on " << *diri << dendl; - } else { - // dirlock scatterlock will propagate the update. - diri->inode.ctime = diri->inode.mtime = mdr->now; - diri->dirlock.set_updated(); - dout(10) << "dirty_dn_diri (non-dirty) ctime/mtime " << mdr->now << " on " << *diri << dendl; - } -} - - - - - - -// =============================================================================== -// STAT - -void Server::handle_client_stat(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - CInode *ref = rdlock_path_pin_ref(mdr, false); - if (!ref) return; - - // which inode locks do I want? - /* note: this works because we include existing locks in our lists, - and because all new locks are on inodes and sort to the right of - the dentry rdlocks previous acquired by rdlock_path_pin_ref(). */ - set rdlocks = mdr->rdlocks; - set wrlocks = mdr->wrlocks; - set xlocks = mdr->xlocks; - - int mask = req->args.stat.mask; - if (mask & STAT_MASK_LINK) rdlocks.insert(&ref->linklock); - if (mask & STAT_MASK_AUTH) rdlocks.insert(&ref->authlock); - if (ref->is_file() && - mask & STAT_MASK_FILE) rdlocks.insert(&ref->filelock); - if (ref->is_dir() && - mask & STAT_MASK_MTIME) rdlocks.insert(&ref->dirlock); - - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - mds->balancer->hit_inode(g_clock.now(), ref, META_POP_IRD, - mdr->client_request->get_client_inst().name.num()); - - // reply - dout(10) << "reply to stat on " << *req << dendl; - MClientReply *reply = new MClientReply(req); - reply_request(mdr, reply, ref); -} - - - - -// =============================================================================== -// INODE UPDATES - - -/* - * finisher for basic inode updates - */ -class C_MDS_inode_update_finish : public Context { - MDS *mds; - MDRequest *mdr; - CInode *in; -public: - 
C_MDS_inode_update_finish(MDS *m, MDRequest *r, CInode *i) : - mds(m), mdr(r), in(i) { } - void finish(int r) { - assert(r == 0); - - // apply - in->pop_and_dirty_projected_inode(mdr->ls); - - mds->balancer->hit_inode(mdr->now, in, META_POP_IWR); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply->set_result(0); - mds->server->reply_request(mdr, reply, in); - } -}; - - -// utime - -void Server::handle_client_utime(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - CInode *cur = rdlock_path_pin_ref(mdr, true); - if (!cur) return; - - if (cur->is_root()) { - reply_request(mdr, -EINVAL); // for now - return; - } - - // xlock inode - set rdlocks = mdr->rdlocks; - set wrlocks = mdr->wrlocks; - set xlocks = mdr->xlocks; - xlocks.insert(&cur->filelock); - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - // project update - inode_t *pi = cur->project_inode(); - pi->mtime = req->args.utime.mtime; - pi->atime = req->args.utime.atime; - pi->version = cur->pre_dirty(); - pi->ctime = g_clock.real_now(); - - // log + wait - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "utime"); - le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_dir_context(cur->get_parent_dir()); - le->metablob.add_primary_dentry(cur->parent, true, 0, pi); - - mdlog->submit_entry(le, new C_MDS_inode_update_finish(mds, mdr, cur)); -} - - -// chmod - -void Server::handle_client_chmod(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - CInode *cur = rdlock_path_pin_ref(mdr, true); - if (!cur) return; - - if (cur->is_root()) { - reply_request(mdr, -EINVAL); // for now - return; - } - - // write - set rdlocks = mdr->rdlocks; - set wrlocks = mdr->wrlocks; - set xlocks = mdr->xlocks; - xlocks.insert(&cur->authlock); - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - // project update - inode_t *pi = cur->project_inode(); - pi->mode = - (pi->mode & 
~04777) | - (req->args.chmod.mode & 04777); - pi->version = cur->pre_dirty(); - pi->ctime = g_clock.real_now(); - - // log + wait - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "chmod"); - le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_dir_context(cur->get_parent_dir()); - le->metablob.add_primary_dentry(cur->parent, true, 0, pi); - - mdlog->submit_entry(le, new C_MDS_inode_update_finish(mds, mdr, cur)); -} - - -// chown - -void Server::handle_client_chown(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - CInode *cur = rdlock_path_pin_ref(mdr, true); - if (!cur) return; - - if (cur->is_root()) { - reply_request(mdr, -EINVAL); // for now - return; - } - - // write - set rdlocks = mdr->rdlocks; - set wrlocks = mdr->wrlocks; - set xlocks = mdr->xlocks; - xlocks.insert(&cur->authlock); - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - // project update - inode_t *pi = cur->project_inode(); - pi->uid = MAX(req->args.chown.uid, 0); - pi->gid = MAX(req->args.chown.gid, 0); - pi->version = cur->pre_dirty(); - pi->ctime = g_clock.real_now(); - - // log + wait - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "chown"); - le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_dir_context(cur->get_parent_dir()); - le->metablob.add_primary_dentry(cur->parent, true, 0, pi); - - mdlog->submit_entry(le); - mdlog->wait_for_sync(new C_MDS_inode_update_finish(mds, mdr, cur)); -} - - - - -// ================================================================= -// DIRECTORY and NAMESPACE OPS - -// READDIR - -void Server::handle_client_readdir(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - CInode *diri = rdlock_path_pin_ref(mdr, false); - if (!diri) return; - - // it's a directory, right? 
- if (!diri->is_dir()) { - // not a dir - dout(10) << "reply to " << *req << " readdir -ENOTDIR" << dendl; - reply_request(mdr, -ENOTDIR, diri); - return; - } - - // which frag? - frag_t fg = req->args.readdir.frag; - - // does the frag exist? - if (diri->dirfragtree[fg] != fg) { - dout(10) << "frag " << fg << " doesn't appear in fragtree " << diri->dirfragtree << dendl; - reply_request(mdr, -EAGAIN, diri); - return; - } - - CDir *dir = try_open_auth_dirfrag(diri, fg, mdr); - if (!dir) return; - - // ok! - assert(dir->is_auth()); - - // check perm - /* - if (!mds->locker->inode_hard_rdlock_start(diri, mdr)) - return; - mds->locker->inode_hard_rdlock_finish(diri, mdr); - */ - - if (!dir->is_complete()) { - // fetch - dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl; - dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - - // build dir contents - bufferlist dirbl; - - DirStat::_encode(dirbl, dir, mds->get_nodeid()); - - int numfiles = 0; - for (CDir::map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - if (dn->is_null()) continue; - - CInode *in = dn->inode; - - // remote link? - // better for the MDS to do the work, if we think the client will stat any of these files. 
- if (dn->is_remote() && !in) { - in = mdcache->get_inode(dn->get_remote_ino()); - if (in) { - dn->link_remote(in); - } else { - mdcache->open_remote_ino(dn->get_remote_ino(), - mdr, - new C_MDS_RetryRequest(mdcache, mdr)); - - // touch everything i _do_ have - for (it = dir->begin(); - it != dir->end(); - it++) - if (!it->second->is_null()) - mdcache->lru.lru_touch(it->second); - return; - } - } - assert(in); - - - assert(in); - - dout(12) << "including inode " << *in << dendl; - - // add this dentry + inodeinfo - ::_encode(it->first, dirbl); - InodeStat::_encode(dirbl, in); - - // touch it - mdcache->lru.lru_touch(dn); - } - - // yay, reply - MClientReply *reply = new MClientReply(req); - reply->take_dir_items(dirbl); - - dout(10) << "reply to " << *req << " readdir " << numfiles << " files" << dendl; - reply->set_result(0); - - // bump popularity. NOTE: this doesn't quite capture it. - mds->balancer->hit_dir(g_clock.now(), dir, META_POP_IRD, -1, numfiles); - - // reply - reply_request(mdr, reply, diri); -} - - - -// ------------------------------------------------ - -// MKNOD - -class C_MDS_mknod_finish : public Context { - MDS *mds; - MDRequest *mdr; - CDentry *dn; - CInode *newi; - version_t dirpv; - version_t newdirpv; -public: - C_MDS_mknod_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ni, version_t dirpv_, version_t newdirpv_=0) : - mds(m), mdr(r), dn(d), newi(ni), - dirpv(dirpv_), newdirpv(newdirpv_) {} - void finish(int r) { - assert(r == 0); - - // link the inode - dn->get_dir()->link_primary_inode(dn, newi); - - // dirty inode, dn, dir - newi->mark_dirty(newi->inode.version + 1, mdr->ls); - - // mkdir? 
- if (newdirpv) { - CDir *dir = newi->get_dirfrag(frag_t()); - assert(dir); - dir->mark_dirty(newdirpv, mdr->ls); - } - - // dir inode's mtime - mds->server->dirty_dn_diri(mdr, dn, dirpv); - - // hit pop - mds->balancer->hit_inode(mdr->now, newi, META_POP_IWR); - //mds->balancer->hit_dir(mdr->now, dn->get_dir(), META_POP_DWR); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply->set_result(0); - mds->server->reply_request(mdr, reply, newi); - } -}; - - -void Server::handle_client_mknod(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - CDentry *dn = rdlock_path_xlock_dentry(mdr, false, false); - if (!dn) return; - - mdr->now = g_clock.real_now(); - CInode *newi = prepare_new_inode(mdr, dn->dir); - assert(newi); - - // it's a file. - newi->inode.rdev = req->args.mknod.rdev; - newi->inode.mode = req->args.mknod.mode; - newi->inode.mode &= ~INODE_TYPE_MASK; - newi->inode.mode |= INODE_MODE_FILE; - newi->inode.version = dn->pre_dirty() - 1; - - // prepare finisher - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "mknod"); - le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version()); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too - le->metablob.add_dir_context(dn->dir); - le->metablob.add_primary_dentry(dn, true, newi, &newi->inode); - - // log + wait - mdlog->submit_entry(le, new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv)); -} - - - -// MKDIR - -void Server::handle_client_mkdir(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - CDentry *dn = rdlock_path_xlock_dentry(mdr, false, false); - if (!dn) return; - - // new inode - mdr->now = g_clock.real_now(); - CInode *newi = prepare_new_inode(mdr, dn->dir); - assert(newi); - - // it's a directory. 
- newi->inode.mode = req->args.mkdir.mode; - newi->inode.mode &= ~INODE_TYPE_MASK; - newi->inode.mode |= INODE_MODE_DIR; - newi->inode.layout = g_OSD_MDDirLayout; - newi->inode.version = dn->pre_dirty() - 1; - - // ...and that new dir is empty. - CDir *newdir = newi->get_or_open_dirfrag(mds->mdcache, frag_t()); - newdir->mark_complete(); - version_t newdirpv = newdir->pre_dirty(); - - //if (mds->logger) mds->logger->inc("mkdir"); - - // prepare finisher - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "mkdir"); - le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version()); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too - le->metablob.add_dir_context(dn->dir); - le->metablob.add_primary_dentry(dn, true, newi, &newi->inode); - le->metablob.add_dir(newdir, true, true); // dirty AND complete - - // log + wait - mdlog->submit_entry(le, new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv, newdirpv)); - - /* old export heuristic. pbly need to reimplement this at some point. 
- if ( - diri->dir->is_auth() && - diri->dir->is_rep() && - newdir->is_auth() && - !newdir->is_hashing()) { - int dest = rand() % mds->mdsmap->get_num_mds(); - if (dest != whoami) { - dout(10) << "exporting new dir " << *newdir << " in replicated parent " << *diri->dir << dendl; - mdcache->migrator->export_dir(newdir, dest); - } - } - */ -} - - -// SYMLINK - -void Server::handle_client_symlink(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - CDentry *dn = rdlock_path_xlock_dentry(mdr, false, false); - if (!dn) return; - - mdr->now = g_clock.real_now(); - - CInode *newi = prepare_new_inode(mdr, dn->dir); - assert(newi); - - // it's a symlink - newi->inode.mode &= ~INODE_TYPE_MASK; - newi->inode.mode |= INODE_MODE_SYMLINK; - newi->symlink = req->get_sarg(); - newi->inode.version = dn->pre_dirty() - 1; - - // prepare finisher - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "symlink"); - le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version()); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too - le->metablob.add_dir_context(dn->dir); - le->metablob.add_primary_dentry(dn, true, newi, &newi->inode); - - // log + wait - mdlog->submit_entry(le, new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv)); -} - - - - - -// LINK - -void Server::handle_client_link(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - dout(7) << "handle_client_link " << req->get_filepath() - << " to " << req->get_sarg() - << dendl; - - // traverse to dest dir, make sure it's ours. 
- const filepath &linkpath = req->get_filepath(); - const string &dname = linkpath.last_dentry(); - vector linktrace; - CDir *dir = traverse_to_auth_dir(mdr, linktrace, linkpath); - if (!dir) return; - dout(7) << "handle_client_link link " << dname << " in " << *dir << dendl; - - // traverse to link target - filepath targetpath = req->get_sarg(); - dout(7) << "handle_client_link discovering target " << targetpath << dendl; - vector targettrace; - int r = mdcache->path_traverse(mdr, req, - 0, targetpath, targettrace, false, - MDS_TRAVERSE_DISCOVER); - if (r > 0) return; // wait - if (targettrace.empty()) r = -EINVAL; - if (r < 0) { - reply_request(mdr, r); - return; - } - - // identify target inode - CInode *targeti = targettrace[targettrace.size()-1]->inode; - assert(targeti); - - // dir? - dout(7) << "target is " << *targeti << dendl; - if (targeti->is_dir()) { - dout(7) << "target is a dir, failing..." << dendl; - reply_request(mdr, -EINVAL); - return; - } - - // get/make null link dentry - CDentry *dn = prepare_null_dentry(mdr, dir, dname, false); - if (!dn) return; - - // create lock lists - set rdlocks, wrlocks, xlocks; - - for (int i=0; i<(int)linktrace.size(); i++) - rdlocks.insert(&linktrace[i]->lock); - xlocks.insert(&dn->lock); - wrlocks.insert(&dn->dir->inode->dirlock); - for (int i=0; i<(int)targettrace.size(); i++) - rdlocks.insert(&targettrace[i]->lock); - xlocks.insert(&targeti->linklock); - - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - mdr->done_locking = true; // avoid wrlock moving target issues. - - // pick mtime - if (mdr->now == utime_t()) - mdr->now = g_clock.real_now(); - - // does the target need an anchor? 
- if (targeti->is_auth()) { - /*if (targeti->get_parent_dir() == dn->dir) { - dout(7) << "target is in the same dirfrag, sweet" << dendl; - } - else - */ - if (targeti->is_anchored() && !targeti->is_unanchoring()) { - dout(7) << "target anchored already (nlink=" << targeti->inode.nlink << "), sweet" << dendl; - } - else { - dout(7) << "target needs anchor, nlink=" << targeti->inode.nlink << ", creating anchor" << dendl; - - mdcache->anchor_create(mdr, targeti, - new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - } - - // go! - - // local or remote? - if (targeti->is_auth()) - _link_local(mdr, dn, targeti); - else - _link_remote(mdr, dn, targeti); -} - - -class C_MDS_link_local_finish : public Context { - MDS *mds; - MDRequest *mdr; - CDentry *dn; - CInode *targeti; - version_t dnpv; - version_t tipv; - version_t dirpv; -public: - C_MDS_link_local_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ti, - version_t dnpv_, version_t tipv_, version_t dirpv_) : - mds(m), mdr(r), dn(d), targeti(ti), - dnpv(dnpv_), tipv(tipv_), dirpv(dirpv_) { } - void finish(int r) { - assert(r == 0); - mds->server->_link_local_finish(mdr, dn, targeti, dnpv, tipv, dirpv); - } -}; - - -void Server::_link_local(MDRequest *mdr, CDentry *dn, CInode *targeti) -{ - dout(10) << "_link_local " << *dn << " to " << *targeti << dendl; - - mdr->ls = mdlog->get_current_segment(); - - // predirty NEW dentry - version_t dnpv = dn->pre_dirty(); - version_t tipv = targeti->pre_dirty(); - - // project inode update - inode_t *pi = targeti->project_inode(); - pi->nlink++; - pi->ctime = mdr->now; - pi->version = tipv; - - // log + wait - EUpdate *le = new EUpdate(mdlog, "link_local"); - le->metablob.add_client_req(mdr->reqid); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir inode's mtime - le->metablob.add_dir_context(dn->get_dir()); - le->metablob.add_remote_dentry(dn, true, targeti->ino()); // new remote - le->metablob.add_dir_context(targeti->get_parent_dir()); - 
le->metablob.add_primary_dentry(targeti->parent, true, targeti, pi); // update old primary - - mdlog->submit_entry(le, new C_MDS_link_local_finish(mds, mdr, dn, targeti, dnpv, tipv, dirpv)); -} - -void Server::_link_local_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, - version_t dnpv, version_t tipv, version_t dirpv) -{ - dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl; - - // link and unlock the NEW dentry - dn->dir->link_remote_inode(dn, targeti->ino(), MODE_TO_DT(targeti->inode.mode)); - dn->mark_dirty(dnpv, mdr->ls); - - // target inode - targeti->pop_and_dirty_projected_inode(mdr->ls); - - // new dentry dir mtime - dirty_dn_diri(mdr, dn, dirpv); - - // bump target popularity - mds->balancer->hit_inode(mdr->now, targeti, META_POP_IWR); - //mds->balancer->hit_dir(mdr->now, dn->get_dir(), META_POP_DWR); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply_request(mdr, reply, dn->get_dir()->get_inode()); // FIXME: imprecise ref -} - - -// remote - -class C_MDS_link_remote_finish : public Context { - MDS *mds; - MDRequest *mdr; - CDentry *dn; - CInode *targeti; - version_t dpv; - version_t dirpv; -public: - C_MDS_link_remote_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ti, version_t dirpv_) : - mds(m), mdr(r), dn(d), targeti(ti), - dpv(d->get_projected_version()), - dirpv(dirpv_) { } - void finish(int r) { - assert(r == 0); - mds->server->_link_remote_finish(mdr, dn, targeti, dpv, dirpv); - } -}; - -void Server::_link_remote(MDRequest *mdr, CDentry *dn, CInode *targeti) -{ - dout(10) << "_link_remote " << *dn << " to " << *targeti << dendl; - - // 1. 
send LinkPrepare to dest (journal nlink++ prepare) - int linkauth = targeti->authority().first; - if (mdr->more()->witnessed.count(linkauth) == 0) { - dout(10) << " targeti auth must prepare nlink++" << dendl; - - MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_LINKPREP); - targeti->set_object_info(req->get_object_info()); - req->now = mdr->now; - mds->send_message_mds(req, linkauth); - - assert(mdr->more()->waiting_on_slave.count(linkauth) == 0); - mdr->more()->waiting_on_slave.insert(linkauth); - return; - } - dout(10) << " targeti auth has prepared nlink++" << dendl; - - // go. - // predirty dentry - dn->pre_dirty(); - - // add to event - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "link_remote"); - le->metablob.add_client_req(mdr->reqid); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir inode's mtime - le->metablob.add_dir_context(dn->get_dir()); - le->metablob.add_remote_dentry(dn, true, targeti->ino()); // new remote - - // mark committing (needed for proper recovery) - mdr->committing = true; - - // log + wait - mdlog->submit_entry(le, new C_MDS_link_remote_finish(mds, mdr, dn, targeti, dirpv)); -} - -void Server::_link_remote_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, - version_t dpv, version_t dirpv) -{ - dout(10) << "_link_remote_finish " << *dn << " to " << *targeti << dendl; - - // link the new dentry - dn->dir->link_remote_inode(dn, targeti->ino(), MODE_TO_DT(targeti->inode.mode)); - dn->mark_dirty(dpv, mdr->ls); - - // dir inode's mtime - dirty_dn_diri(mdr, dn, dirpv); - - // bump target popularity - mds->balancer->hit_inode(mdr->now, targeti, META_POP_IWR); - //mds->balancer->hit_dir(mdr->now, dn->get_dir(), META_POP_DWR); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply_request(mdr, reply, dn->get_dir()->get_inode()); // FIXME: imprecise ref -} - - -// remote linking/unlinking - -class C_MDS_SlaveLinkPrep : public Context 
{ - Server *server; - MDRequest *mdr; - CInode *targeti; - utime_t old_ctime; - bool inc; -public: - C_MDS_SlaveLinkPrep(Server *s, MDRequest *r, CInode *t, utime_t oct, bool in) : - server(s), mdr(r), targeti(t), old_ctime(oct), inc(in) { } - void finish(int r) { - assert(r == 0); - server->_logged_slave_link(mdr, targeti, old_ctime, inc); - } -}; - -void Server::handle_slave_link_prep(MDRequest *mdr) -{ - dout(10) << "handle_slave_link_prep " << *mdr - << " on " << mdr->slave_request->get_object_info() - << dendl; - - CInode *targeti = mdcache->get_inode(mdr->slave_request->get_object_info().ino); - assert(targeti); - dout(10) << "targeti " << *targeti << dendl; - CDentry *dn = targeti->get_parent_dn(); - assert(dn->is_primary()); - - mdr->now = mdr->slave_request->now; - - // anchor? - if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) { - if (targeti->is_anchored() && !targeti->is_unanchoring()) { - dout(7) << "target anchored already (nlink=" << targeti->inode.nlink << "), sweet" << dendl; - } - else { - dout(7) << "target needs anchor, nlink=" << targeti->inode.nlink << ", creating anchor" << dendl; - mdcache->anchor_create(mdr, targeti, - new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - } - - // journal it - mdr->ls = mdlog->get_current_segment(); - ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_prep", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE); - - inode_t *pi = dn->inode->project_inode(); - - // rollback case - le->rollback.add_dir_context(targeti->get_parent_dir()); - le->rollback.add_primary_dentry(dn, true, targeti, pi); // update old primary - - // update journaled target inode - bool inc; - if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) { - inc = true; - pi->nlink++; - } else { - inc = false; - pi->nlink--; - } - utime_t old_ctime = pi->ctime; - pi->ctime = mdr->now; - pi->version = targeti->pre_dirty(); - - dout(10) << " projected inode " << pi << " v " << pi->version << dendl; - - // 
commit case - le->commit.add_dir_context(targeti->get_parent_dir()); - le->commit.add_primary_dentry(dn, true, targeti, pi); // update old primary - - mdlog->submit_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti, old_ctime, inc)); -} - -class C_MDS_SlaveLinkCommit : public Context { - Server *server; - MDRequest *mdr; - CInode *targeti; - utime_t old_ctime; - version_t old_version; - bool inc; -public: - C_MDS_SlaveLinkCommit(Server *s, MDRequest *r, CInode *t, utime_t oct, version_t ov, bool in) : - server(s), mdr(r), targeti(t), old_ctime(oct), old_version(ov), inc(in) { } - void finish(int r) { - server->_commit_slave_link(mdr, r, targeti, - old_ctime, old_version, inc); - } -}; - -void Server::_logged_slave_link(MDRequest *mdr, CInode *targeti, utime_t old_ctime, bool inc) -{ - dout(10) << "_logged_slave_link " << *mdr - << " inc=" << inc - << " " << *targeti << dendl; - - version_t old_version = targeti->inode.version; - - // update the target - targeti->pop_and_dirty_projected_inode(mdr->ls); - - // hit pop - mds->balancer->hit_inode(mdr->now, targeti, META_POP_IWR); - - // ack - MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_LINKPREPACK); - mds->send_message_mds(reply, mdr->slave_to_mds); - - // set up commit waiter - mdr->more()->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti, old_ctime, old_version, inc); - - // done. 
- delete mdr->slave_request; - mdr->slave_request = 0; -} - - -void Server::_commit_slave_link(MDRequest *mdr, int r, CInode *targeti, - utime_t old_ctime, version_t old_version, bool inc) -{ - dout(10) << "_commit_slave_link " << *mdr - << " r=" << r - << " inc=" << inc - << " " << *targeti << dendl; - - ESlaveUpdate *le; - if (r == 0) { - // write a commit to the journal - le = new ESlaveUpdate(mdlog, "slave_link_commit", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT); - } else { - le = new ESlaveUpdate(mdlog, "slave_link_rollback", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_ROLLBACK); - - // -- rollback in memory -- - assert(targeti->inode.ctime == mdr->now); - assert(targeti->projected_inode.empty()); // we're holding the version lock. - - targeti->inode.ctime = old_ctime; - targeti->inode.version = old_version; - if (inc) - targeti->inode.nlink++; - else - targeti->inode.nlink--; - } - - mdlog->submit_entry(le); -} - - - -void Server::handle_slave_link_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m) -{ - dout(10) << "handle_slave_link_prep_ack " << *mdr - << " " << *m << dendl; - int from = m->get_source().num(); - - // note slave - mdr->more()->slaves.insert(from); - - // witnessed! - assert(mdr->more()->witnessed.count(from) == 0); - mdr->more()->witnessed.insert(from); - - // remove from waiting list - assert(mdr->more()->waiting_on_slave.count(from)); - mdr->more()->waiting_on_slave.erase(from); - - assert(mdr->more()->waiting_on_slave.empty()); - - dispatch_client_request(mdr); // go again! 
-} - - - - - -// UNLINK - -void Server::handle_client_unlink(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - // traverse to path - vector trace; - int r = mdcache->path_traverse(mdr, req, - 0, req->get_filepath(), trace, false, - MDS_TRAVERSE_FORWARD); - if (r > 0) return; - if (trace.empty()) r = -EINVAL; // can't unlink root - if (r < 0) { - reply_request(mdr, r); - return; - } - - CDentry *dn = trace[trace.size()-1]; - assert(dn); - - // is it my dentry? - if (!dn->is_auth()) { - // fw to auth - mdcache->request_forward(mdr, dn->authority().first); - return; - } - - // rmdir or unlink? - bool rmdir = false; - if (req->get_op() == MDS_OP_RMDIR) rmdir = true; - - if (rmdir) { - dout(7) << "handle_client_rmdir on " << *dn << dendl; - } else { - dout(7) << "handle_client_unlink on " << *dn << dendl; - } - - // readable? - if (dn->lock.is_xlocked_by_other(mdr)) { - dout(10) << "waiting on xlocked dentry " << *dn << dendl; - dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - - // dn looks ok. - - // get/open inode. - mdr->trace.swap(trace); - CInode *in = mdcache->get_dentry_inode(dn, mdr); - if (!in) return; - dout(7) << "dn links to " << *in << dendl; - - // rmdir vs is_dir - if (in->is_dir()) { - if (rmdir) { - // do empty directory checks - if (!_verify_rmdir(mdr, in)) - return; - } else { - dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl; - reply_request(mdr, -EISDIR); - return; - } - } else { - if (rmdir) { - // unlink - dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl; - reply_request(mdr, -ENOTDIR); - return; - } - } - - // lock - set rdlocks, wrlocks, xlocks; - - for (int i=0; i<(int)trace.size()-1; i++) - rdlocks.insert(&trace[i]->lock); - xlocks.insert(&dn->lock); - wrlocks.insert(&dn->dir->inode->dirlock); - xlocks.insert(&in->linklock); - - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - // yay! 
- mdr->done_locking = true; // avoid wrlock racing - if (mdr->now == utime_t()) - mdr->now = g_clock.real_now(); - - // get stray dn ready? - CDentry *straydn = 0; - if (dn->is_primary()) { - straydn = mdcache->get_or_create_stray_dentry(dn->inode); - mdr->pin(straydn); // pin it. - dout(10) << " straydn is " << *straydn << dendl; - assert(straydn->is_null()); - - if (!mdr->more()->dst_reanchor_atid && - dn->inode->is_anchored()) { - dout(10) << "reanchoring to stray " << *dn->inode << dendl; - vector trace; - straydn->make_anchor_trace(trace, dn->inode); - mds->anchorclient->prepare_update(dn->inode->ino(), trace, &mdr->more()->dst_reanchor_atid, - new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - } - - // ok! - if (dn->is_remote() && !dn->inode->is_auth()) - _unlink_remote(mdr, dn); - else - _unlink_local(mdr, dn, straydn); -} - - - -class C_MDS_unlink_local_finish : public Context { - MDS *mds; - MDRequest *mdr; - CDentry *dn; - CDentry *straydn; - version_t dnpv; // deleted dentry - version_t dirpv; -public: - C_MDS_unlink_local_finish(MDS *m, MDRequest *r, CDentry *d, CDentry *sd, - version_t dirpv_) : - mds(m), mdr(r), dn(d), straydn(sd), - dnpv(d->get_projected_version()), dirpv(dirpv_) { } - void finish(int r) { - assert(r == 0); - mds->server->_unlink_local_finish(mdr, dn, straydn, dnpv, dirpv); - } -}; - - -void Server::_unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn) -{ - dout(10) << "_unlink_local " << *dn << dendl; - - // ok, let's do it. - mdr->ls = mdlog->get_current_segment(); - - // prepare log entry - EUpdate *le = new EUpdate(mdlog, "unlink_local"); - le->metablob.add_client_req(mdr->reqid); - - version_t ipv = 0; // dirty inode version - inode_t *ji = 0; // journaled projected inode - if (dn->is_primary()) { - // primary link. add stray dentry. 
- assert(straydn); - ipv = straydn->pre_dirty(dn->inode->inode.version); - le->metablob.add_dir_context(straydn->dir); - ji = le->metablob.add_primary_dentry(straydn, true, dn->inode); - } else { - // remote link. update remote inode. - ipv = dn->inode->pre_dirty(); - le->metablob.add_dir_context(dn->inode->get_parent_dir()); - ji = le->metablob.add_primary_dentry(dn->inode->parent, true, dn->inode); - } - - // update journaled target inode - inode_t *pi = dn->inode->project_inode(); - pi->nlink--; - pi->ctime = mdr->now; - pi->version = ipv; - *ji = *pi; // copy into journal - - // the unlinked dentry - dn->pre_dirty(); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); - le->metablob.add_dir_context(dn->get_dir()); - le->metablob.add_null_dentry(dn, true); - - if (mdr->more()->dst_reanchor_atid) - le->metablob.add_anchor_transaction(mdr->more()->dst_reanchor_atid); - - // log + wait - journal_opens(); // journal pending opens, just in case - mdlog->submit_entry(le, new C_MDS_unlink_local_finish(mds, mdr, dn, straydn, - dirpv)); -} - -void Server::_unlink_local_finish(MDRequest *mdr, - CDentry *dn, CDentry *straydn, - version_t dnpv, version_t dirpv) -{ - dout(10) << "_unlink_local_finish " << *dn << dendl; - - // unlink main dentry - CInode *in = dn->inode; - dn->dir->unlink_inode(dn); - - // relink as stray? (i.e. was primary link?) 
- if (straydn) { - dout(20) << " straydn is " << *straydn << dendl; - straydn->dir->link_primary_inode(straydn, in); - } - - // nlink--, dirty old dentry - in->pop_and_dirty_projected_inode(mdr->ls); - dn->mark_dirty(dnpv, mdr->ls); - - // dir inode's mtime - dirty_dn_diri(mdr, dn, dirpv); - - // share unlink news with replicas - for (map::iterator it = dn->replicas_begin(); - it != dn->replicas_end(); - it++) { - dout(7) << "_unlink_local_finish sending MDentryUnlink to mds" << it->first << dendl; - MDentryUnlink *unlink = new MDentryUnlink(dn->dir->dirfrag(), dn->name); - if (straydn) { - unlink->strayin = straydn->dir->inode->replicate_to(it->first); - unlink->straydir = straydn->dir->replicate_to(it->first); - unlink->straydn = straydn->replicate_to(it->first); - } - mds->send_message_mds(unlink, it->first); - } - - // commit anchor update? - if (mdr->more()->dst_reanchor_atid) - mds->anchorclient->commit(mdr->more()->dst_reanchor_atid, mdr->ls); - - // bump pop - //mds->balancer->hit_dir(mdr->now, dn->dir, META_POP_DWR); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply_request(mdr, reply, dn->dir->get_inode()); // FIXME: imprecise ref - - // clean up? - if (straydn) - mdcache->eval_stray(straydn); - - // removing a new dn? - dn->dir->try_remove_unlinked_dn(dn); -} - - -class C_MDS_unlink_remote_finish : public Context { - MDS *mds; - MDRequest *mdr; - CDentry *dn; - version_t dnpv; // deleted dentry - version_t dirpv; -public: - C_MDS_unlink_remote_finish(MDS *m, MDRequest *r, CDentry *d, - version_t dirpv_) : - mds(m), mdr(r), dn(d), - dnpv(d->get_projected_version()), dirpv(dirpv_) { } - void finish(int r) { - assert(r == 0); - mds->server->_unlink_remote_finish(mdr, dn, dnpv, dirpv); - } -}; - -void Server::_unlink_remote(MDRequest *mdr, CDentry *dn) -{ - dout(10) << "_unlink_remote " << *dn << " " << *dn->inode << dendl; - - // 1. 
send LinkPrepare to dest (journal nlink-- prepare) - int inauth = dn->inode->authority().first; - if (mdr->more()->witnessed.count(inauth) == 0) { - dout(10) << " inode auth must prepare nlink--" << dendl; - - MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_UNLINKPREP); - dn->inode->set_object_info(req->get_object_info()); - req->now = mdr->now; - mds->send_message_mds(req, inauth); - - assert(mdr->more()->waiting_on_slave.count(inauth) == 0); - mdr->more()->waiting_on_slave.insert(inauth); - return; - } - dout(10) << " inode auth has prepared nlink--" << dendl; - - // ok, let's do it. - // prepare log entry - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "unlink_remote"); - le->metablob.add_client_req(mdr->reqid); - - // the unlinked dentry - dn->pre_dirty(); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); - le->metablob.add_dir_context(dn->get_dir()); - le->metablob.add_null_dentry(dn, true); - - if (mdr->more()->dst_reanchor_atid) - le->metablob.add_anchor_transaction(mdr->more()->dst_reanchor_atid); - - // finisher - C_MDS_unlink_remote_finish *fin = new C_MDS_unlink_remote_finish(mds, mdr, dn, dirpv); - - journal_opens(); // journal pending opens, just in case - - // mark committing (needed for proper recovery) - mdr->committing = true; - - // log + wait - mdlog->submit_entry(le, fin); -} - -void Server::_unlink_remote_finish(MDRequest *mdr, - CDentry *dn, - version_t dnpv, version_t dirpv) -{ - dout(10) << "_unlink_remote_finish " << *dn << dendl; - - // unlink main dentry - dn->dir->unlink_inode(dn); - dn->mark_dirty(dnpv, mdr->ls); // dirty old dentry - - // dir inode's mtime - dirty_dn_diri(mdr, dn, dirpv); - - // share unlink news with replicas - for (map::iterator it = dn->replicas_begin(); - it != dn->replicas_end(); - it++) { - dout(7) << "_unlink_remote_finish sending MDentryUnlink to mds" << it->first << dendl; - MDentryUnlink *unlink = new MDentryUnlink(dn->dir->dirfrag(), 
dn->name); - mds->send_message_mds(unlink, it->first); - } - - // commit anchor update? - if (mdr->more()->dst_reanchor_atid) - mds->anchorclient->commit(mdr->more()->dst_reanchor_atid, mdr->ls); - - //mds->balancer->hit_dir(mdr->now, dn->dir, META_POP_DWR); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply_request(mdr, reply, dn->dir->get_inode()); // FIXME: imprecise ref - - // removing a new dn? - dn->dir->try_remove_unlinked_dn(dn); -} - - - - -/** _verify_rmdir - * - * verify that a directory is empty (i.e. we can rmdir it), - * and make sure it is part of the same subtree (i.e. local) - * so that rmdir will occur locally. - * - * @param in is the inode being rmdir'd. - */ -bool Server::_verify_rmdir(MDRequest *mdr, CInode *in) -{ - dout(10) << "_verify_rmdir " << *in << dendl; - assert(in->is_auth()); - - list frags; - in->dirfragtree.get_leaves(frags); - - for (list::iterator p = frags.begin(); - p != frags.end(); - ++p) { - CDir *dir = in->get_or_open_dirfrag(mdcache, *p); - assert(dir); - - // dir looks empty but incomplete? - if (dir->is_auth() && - dir->get_size() == 0 && - !dir->is_complete()) { - dout(7) << "_verify_rmdir fetching incomplete dir " << *dir << dendl; - dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - - // does the frag _look_ empty? - if (dir->get_size()) { - dout(10) << "_verify_rmdir still " << dir->get_size() << " items in frag " << *dir << dendl; - reply_request(mdr, -ENOTEMPTY); - return false; - } - - // not dir auth? - if (!dir->is_auth()) { - dout(10) << "_verify_rmdir not auth for " << *dir << ", FIXME BUG" << dendl; - reply_request(mdr, -ENOTEMPTY); - return false; - } - } - - return true; -} -/* - // export sanity check - if (!in->is_auth()) { - // i should be exporting this now/soon, since the dir is empty. - dout(7) << "handle_client_rmdir dir is auth, but not inode." 
<< dendl; - mdcache->migrator->export_empty_import(in->dir); - in->dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mds, req, diri)); - return; - } -*/ - - - - -// ====================================================== - - -class C_MDS_rename_finish : public Context { - MDS *mds; - MDRequest *mdr; - CDentry *srcdn; - CDentry *destdn; - CDentry *straydn; -public: - C_MDS_rename_finish(MDS *m, MDRequest *r, - CDentry *sdn, CDentry *ddn, CDentry *stdn) : - mds(m), mdr(r), - srcdn(sdn), destdn(ddn), straydn(stdn) { } - void finish(int r) { - assert(r == 0); - mds->server->_rename_finish(mdr, srcdn, destdn, straydn); - } -}; - - -/** handle_client_rename - * - */ -void Server::handle_client_rename(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - dout(7) << "handle_client_rename " << *req << dendl; - - // traverse to dest dir (not dest) - // we do this FIRST, because the rename should occur on the - // destdn's auth. - const filepath &destpath = req->get_sarg(); - const string &destname = destpath.last_dentry(); - vector desttrace; - CDir *destdir = traverse_to_auth_dir(mdr, desttrace, destpath); - if (!destdir) return; // fw or error out - dout(10) << "dest will be " << destname << " in " << *destdir << dendl; - assert(destdir->is_auth()); - - // traverse to src - filepath srcpath = req->get_filepath(); - vector srctrace; - int r = mdcache->path_traverse(mdr, req, - 0, srcpath, srctrace, false, - MDS_TRAVERSE_DISCOVER); - if (r > 0) return; - if (srctrace.empty()) r = -EINVAL; // can't rename root - if (r < 0) { - reply_request(mdr, r); - return; - } - CDentry *srcdn = srctrace[srctrace.size()-1]; - dout(10) << " srcdn " << *srcdn << dendl; - CInode *srci = mdcache->get_dentry_inode(srcdn, mdr); - dout(10) << " srci " << *srci << dendl; - - // -- some sanity checks -- - // src == dest? 
- if (srcdn->get_dir() == destdir && srcdn->name == destname) { - dout(7) << "rename src=dest, noop" << dendl; - reply_request(mdr, 0); - return; - } - - // dest a child of src? - // e.g. mv /usr /usr/foo - CDentry *pdn = destdir->inode->parent; - while (pdn) { - if (pdn == srcdn) { - dout(7) << "cannot rename item to be a child of itself" << dendl; - reply_request(mdr, -EINVAL); - return; - } - pdn = pdn->dir->inode->parent; - } - - - // identify/create dest dentry - CDentry *destdn = destdir->lookup(destname); - if (destdn && destdn->lock.is_xlocked_by_other(mdr)) { - destdn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - - CInode *oldin = 0; - if (destdn && !destdn->is_null()) { - //dout(10) << "dest dn exists " << *destdn << dendl; - oldin = mdcache->get_dentry_inode(destdn, mdr); - if (!oldin) return; - dout(10) << " oldin " << *oldin << dendl; - - // mv /some/thing /to/some/existing_other_thing - if (oldin->is_dir() && !srci->is_dir()) { - reply_request(mdr, -EISDIR); - return; - } - if (!oldin->is_dir() && srci->is_dir()) { - reply_request(mdr, -ENOTDIR); - return; - } - - // non-empty dir? - if (oldin->is_dir() && !_verify_rmdir(mdr, oldin)) - return; - } - if (!destdn) { - // mv /some/thing /to/some/non_existent_name - destdn = prepare_null_dentry(mdr, destdir, destname); - if (!destdn) return; - } - - dout(10) << " destdn " << *destdn << dendl; - - - // -- locks -- - set rdlocks, wrlocks, xlocks; - - // rdlock sourcedir path, xlock src dentry - for (int i=0; i<(int)srctrace.size()-1; i++) - rdlocks.insert(&srctrace[i]->lock); - xlocks.insert(&srcdn->lock); - wrlocks.insert(&srcdn->dir->inode->dirlock); - /* - * no, this causes problems if the dftlock is scattered... - * and what was i thinking anyway? - * rdlocks.insert(&srcdn->dir->inode->dirfragtreelock); // rd lock on srci dirfragtree. 
- */ - - // rdlock destdir path, xlock dest dentry - for (int i=0; i<(int)desttrace.size(); i++) - rdlocks.insert(&desttrace[i]->lock); - xlocks.insert(&destdn->lock); - wrlocks.insert(&destdn->dir->inode->dirlock); - - // xlock versionlock on srci if remote? - // this ensures it gets safely remotely auth_pinned, avoiding deadlock; - // strictly speaking, having the slave node freeze the inode is - // otherwise sufficient for avoiding conflicts with inode locks, etc. - if (!srcdn->is_auth() && srcdn->is_primary()) - xlocks.insert(&srcdn->inode->versionlock); - - // xlock oldin (for nlink--) - if (oldin) xlocks.insert(&oldin->linklock); - - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - // set done_locking flag, to avoid problems with wrlock moving auth target - mdr->done_locking = true; - - // -- open all srcdn inode frags, if any -- - // we need these open so that auth can properly delegate from inode to dirfrags - // after the inode is _ours_. - if (srcdn->is_primary() && - !srcdn->is_auth() && - srci->is_dir()) { - dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl; - mdr->set_stickydirs(srci); - - list frags; - srci->dirfragtree.get_leaves(frags); - for (list::iterator p = frags.begin(); - p != frags.end(); - ++p) { - CDir *dir = srci->get_dirfrag(*p); - if (!dir) { - dout(10) << " opening " << *p << " under " << *srci << dendl; - mdcache->open_remote_dirfrag(srci, *p, new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - } - } - - // -- declare now -- - if (mdr->now == utime_t()) - mdr->now = g_clock.real_now(); - - // -- create stray dentry? -- - CDentry *straydn = 0; - if (destdn->is_primary()) { - straydn = mdcache->get_or_create_stray_dentry(destdn->inode); - mdr->pin(straydn); - dout(10) << "straydn is " << *straydn << dendl; - } - - // -- prepare witnesses -- - /* - * NOTE: we use _all_ replicas as witnesses. 
- * this probably isn't totally necessary (esp for file renames), - * but if/when we change that, we have to make sure rejoin is - * sufficiently robust to handle strong rejoins from survivors - * with totally wrong dentry->inode linkage. - * (currently, it can ignore rename effects, because the resolve - * stage will sort them out.) - */ - set witnesses = mdr->more()->extra_witnesses; - if (srcdn->is_auth()) - srcdn->list_replicas(witnesses); - else - witnesses.insert(srcdn->authority().first); - destdn->list_replicas(witnesses); - dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl; - - // do srcdn auth last - int last = -1; - if (!srcdn->is_auth()) - last = srcdn->authority().first; - - for (set::iterator p = witnesses.begin(); - p != witnesses.end(); - ++p) { - if (*p == last) continue; // do it last! - if (mdr->more()->witnessed.count(*p)) { - dout(10) << " already witnessed by mds" << *p << dendl; - } else if (mdr->more()->waiting_on_slave.count(*p)) { - dout(10) << " already waiting on witness mds" << *p << dendl; - } else { - _rename_prepare_witness(mdr, *p, srcdn, destdn, straydn); - } - } - if (!mdr->more()->waiting_on_slave.empty()) - return; // we're waiting for a witness. 
- - if (last >= 0 && - mdr->more()->witnessed.count(last) == 0 && - mdr->more()->waiting_on_slave.count(last) == 0) { - dout(10) << " preparing last witness (srcdn auth)" << dendl; - _rename_prepare_witness(mdr, last, srcdn, destdn, straydn); - return; - } - - // -- prepare anchor updates -- - bool linkmerge = (srcdn->inode == destdn->inode && - (srcdn->is_primary() || destdn->is_primary())); - - if (!linkmerge) { - C_Gather *anchorgather = 0; - - if (srcdn->is_primary() && srcdn->inode->is_anchored() && - srcdn->dir != destdn->dir && - !mdr->more()->src_reanchor_atid) { - dout(10) << "reanchoring src->dst " << *srcdn->inode << dendl; - vector trace; - destdn->make_anchor_trace(trace, srcdn->inode); - - anchorgather = new C_Gather(new C_MDS_RetryRequest(mdcache, mdr)); - mds->anchorclient->prepare_update(srcdn->inode->ino(), trace, &mdr->more()->src_reanchor_atid, - anchorgather->new_sub()); - } - if (destdn->is_primary() && - destdn->inode->is_anchored() && - !mdr->more()->dst_reanchor_atid) { - dout(10) << "reanchoring dst->stray " << *destdn->inode << dendl; - - assert(straydn); - vector trace; - straydn->make_anchor_trace(trace, destdn->inode); - - if (!anchorgather) - anchorgather = new C_Gather(new C_MDS_RetryRequest(mdcache, mdr)); - mds->anchorclient->prepare_update(destdn->inode->ino(), trace, &mdr->more()->dst_reanchor_atid, - anchorgather->new_sub()); - } - - if (anchorgather) - return; // waiting for anchor prepares - } - - // -- prepare journal entry -- - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "rename"); - le->metablob.add_client_req(mdr->reqid); - - _rename_prepare(mdr, &le->metablob, srcdn, destdn, straydn); - - // -- commit locally -- - C_MDS_rename_finish *fin = new C_MDS_rename_finish(mds, mdr, srcdn, destdn, straydn); - - journal_opens(); // journal pending opens, just in case - - // mark committing (needed for proper recovery) - mdr->committing = true; - - // log + wait - mdlog->submit_entry(le, fin); -} - - 
-void Server::_rename_finish(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn) -{ - dout(10) << "_rename_finish " << *mdr << dendl; - - // apply - _rename_apply(mdr, srcdn, destdn, straydn); - - // commit anchor updates? - if (mdr->more()->src_reanchor_atid) - mds->anchorclient->commit(mdr->more()->src_reanchor_atid, mdr->ls); - if (mdr->more()->dst_reanchor_atid) - mds->anchorclient->commit(mdr->more()->dst_reanchor_atid, mdr->ls); - - // bump popularity - //if (srcdn->is_auth()) - //mds->balancer->hit_dir(mdr->now, srcdn->get_dir(), META_POP_DWR); - // mds->balancer->hit_dir(mdr->now, destdn->get_dir(), META_POP_DWR); - if (destdn->is_remote() && - destdn->inode->is_auth()) - mds->balancer->hit_inode(mdr->now, destdn->get_inode(), META_POP_IWR); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply_request(mdr, reply, destdn->get_inode()); // FIXME: imprecise ref - - // clean up? - if (straydn) - mdcache->eval_stray(straydn); -} - - - -// helpers - -void Server::_rename_prepare_witness(MDRequest *mdr, int who, CDentry *srcdn, CDentry *destdn, CDentry *straydn) -{ - dout(10) << "_rename_prepare_witness mds" << who << dendl; - MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREP); - srcdn->make_path(req->srcdnpath); - destdn->make_path(req->destdnpath); - req->now = mdr->now; - - if (straydn) { - CInodeDiscover *indis = straydn->dir->inode->replicate_to(who); - CDirDiscover *dirdis = straydn->dir->replicate_to(who); - CDentryDiscover *dndis = straydn->replicate_to(who); - indis->_encode(req->stray); - dirdis->_encode(req->stray); - dndis->_encode(req->stray); - delete indis; - delete dirdis; - delete dndis; - } - - // srcdn auth will verify our current witness list is sufficient - req->witnesses = mdr->more()->witnessed; - - mds->send_message_mds(req, who); - - assert(mdr->more()->waiting_on_slave.count(who) == 0); - mdr->more()->waiting_on_slave.insert(who); -} - - -void 
Server::_rename_prepare(MDRequest *mdr, - EMetaBlob *metablob, - CDentry *srcdn, CDentry *destdn, CDentry *straydn) -{ - dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl; - - // primary+remote link merge? - bool linkmerge = (srcdn->inode == destdn->inode && - (srcdn->is_primary() || destdn->is_primary())); - - if (mdr->is_master()) { - mdr->more()->pvmap[destdn->dir->inode] = predirty_dn_diri(mdr, destdn, metablob); - if (destdn->dir != srcdn->dir) - mdr->more()->pvmap[srcdn->dir->inode] = predirty_dn_diri(mdr, srcdn, metablob); - } - - inode_t *ji = 0; // journaled inode getting nlink-- - version_t ipv = 0; // it's version - - if (linkmerge) { - dout(10) << "will merge remote+primary links" << dendl; - - // destdn -> primary - metablob->add_dir_context(destdn->dir); - if (destdn->is_auth()) - ipv = mdr->more()->pvmap[destdn] = destdn->pre_dirty(destdn->inode->inode.version); - ji = metablob->add_primary_dentry(destdn, true, destdn->inode); - - // do src dentry - metablob->add_dir_context(srcdn->dir); - if (srcdn->is_auth()) - mdr->more()->pvmap[srcdn] = srcdn->pre_dirty(); - metablob->add_null_dentry(srcdn, true); - - } else { - // move to stray? - if (destdn->is_primary()) { - // primary. we'll move inode to stray dir. - assert(straydn); - - // link-- inode, move to stray dir. - metablob->add_dir_context(straydn->dir); - if (straydn->is_auth()) - ipv = mdr->more()->pvmap[straydn] = straydn->pre_dirty(destdn->inode->inode.version); - ji = metablob->add_primary_dentry(straydn, true, destdn->inode); - } - else if (destdn->is_remote()) { - // remote. 
- // nlink-- targeti - metablob->add_dir_context(destdn->inode->get_parent_dir()); - if (destdn->inode->is_auth()) - ipv = mdr->more()->pvmap[destdn->inode] = destdn->inode->pre_dirty(); - ji = metablob->add_primary_dentry(destdn->inode->parent, true, destdn->inode); // update primary - dout(10) << "remote targeti (nlink--) is " << *destdn->inode << dendl; - } - else { - assert(destdn->is_null()); - } - - // add dest dentry - metablob->add_dir_context(destdn->dir); - if (srcdn->is_primary()) { - dout(10) << "src is a primary dentry" << dendl; - if (destdn->is_auth()) { - version_t siv; - if (srcdn->is_auth()) - siv = srcdn->inode->get_projected_version(); - else - siv = mdr->more()->inode_import_v; - mdr->more()->pvmap[destdn] = destdn->pre_dirty(siv+1); - } - metablob->add_primary_dentry(destdn, true, srcdn->inode); - - } else { - assert(srcdn->is_remote()); - dout(10) << "src is a remote dentry" << dendl; - if (destdn->is_auth()) - mdr->more()->pvmap[destdn] = destdn->pre_dirty(); - metablob->add_remote_dentry(destdn, true, srcdn->get_remote_ino()); - } - - // remove src dentry - metablob->add_dir_context(srcdn->dir); - if (srcdn->is_auth()) - mdr->more()->pvmap[srcdn] = srcdn->pre_dirty(); - metablob->add_null_dentry(srcdn, true); - - // new subtree? - if (srcdn->is_primary() && - srcdn->inode->is_dir()) { - list ls; - srcdn->inode->get_nested_dirfrags(ls); - int auth = srcdn->authority().first; - for (list::iterator p = ls.begin(); p != ls.end(); ++p) - mdcache->adjust_subtree_auth(*p, auth, auth); - } - } - - if (ipv) { - // update journaled target inode - inode_t *pi = destdn->inode->project_inode(); - pi->nlink--; - pi->ctime = mdr->now; - pi->version = ipv; - *ji = *pi; // copy into journal - } - - // anchor updates? 
- if (mdr->more()->src_reanchor_atid) - metablob->add_anchor_transaction(mdr->more()->src_reanchor_atid); - if (mdr->more()->dst_reanchor_atid) - metablob->add_anchor_transaction(mdr->more()->dst_reanchor_atid); -} - - -void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn) -{ - dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl; - dout(10) << " pvs " << mdr->more()->pvmap << dendl; - - CInode *oldin = destdn->inode; - - // primary+remote link merge? - bool linkmerge = (srcdn->inode == destdn->inode && - (srcdn->is_primary() || destdn->is_primary())); - - // dir mtimes - if (mdr->is_master()) { - dirty_dn_diri(mdr, destdn, mdr->more()->pvmap[destdn->dir->inode]); - if (destdn->dir != srcdn->dir) - dirty_dn_diri(mdr, srcdn, mdr->more()->pvmap[srcdn->dir->inode]); - } - - if (linkmerge) { - if (destdn->is_primary()) { - dout(10) << "merging remote onto primary link" << dendl; - - // nlink-- in place - destdn->inode->inode.nlink--; - destdn->inode->inode.ctime = mdr->now; - if (destdn->inode->is_auth()) - destdn->inode->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls); - - // unlink srcdn - srcdn->dir->unlink_inode(srcdn); - if (srcdn->is_auth()) - srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls); - } else { - dout(10) << "merging primary onto remote link" << dendl; - assert(srcdn->is_primary()); - - // move inode to dest - srcdn->dir->unlink_inode(srcdn); - destdn->dir->unlink_inode(destdn); - destdn->dir->link_primary_inode(destdn, oldin); - - // nlink-- - destdn->inode->inode.nlink--; - destdn->inode->inode.ctime = mdr->now; - if (destdn->inode->is_auth()) - destdn->inode->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls); - - // mark src dirty - if (srcdn->is_auth()) - srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls); - } - } - else { - // unlink destdn? 
- if (!destdn->is_null()) - destdn->dir->unlink_inode(destdn); - - if (straydn) { - dout(10) << "straydn is " << *straydn << dendl; - - // relink oldin to stray dir. destdn was primary. - assert(oldin); - straydn->dir->link_primary_inode(straydn, oldin); - //assert(straypv == ipv); - - // nlink-- in stray dir. - oldin->inode.nlink--; - oldin->inode.ctime = mdr->now; - if (oldin->is_auth()) - oldin->pop_and_dirty_projected_inode(mdr->ls); - } - else if (oldin) { - // nlink-- remote. destdn was remote. - oldin->inode.nlink--; - oldin->inode.ctime = mdr->now; - if (oldin->is_auth()) - oldin->pop_and_dirty_projected_inode(mdr->ls); - } - - CInode *in = srcdn->inode; - assert(in); - if (srcdn->is_remote()) { - // srcdn was remote. - srcdn->dir->unlink_inode(srcdn); - destdn->dir->link_remote_inode(destdn, in->ino(), MODE_TO_DT(in->inode.mode)); - destdn->link_remote(in); - if (destdn->is_auth()) - destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls); - } else { - // srcdn was primary. - srcdn->dir->unlink_inode(srcdn); - destdn->dir->link_primary_inode(destdn, in); - - // srcdn inode import? - if (!srcdn->is_auth() && destdn->is_auth()) { - assert(mdr->more()->inode_import.length() > 0); - bufferlist::iterator blp = mdr->more()->inode_import.begin(); - map imported_client_map; - list updated_scatterlocks; // we clear_updated explicitly below - ::_decode_simple(imported_client_map, blp); - mdcache->migrator->decode_import_inode(destdn, blp, - srcdn->authority().first, - imported_client_map, - mdr->ls, - updated_scatterlocks); - destdn->inode->dirlock.clear_updated(); - } - if (destdn->inode->is_auth()) - destdn->inode->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls); - } - - if (srcdn->is_auth()) - srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls); - } - - // update subtree map? - if (destdn->is_primary() && destdn->inode->is_dir()) - mdcache->adjust_subtree_after_rename(destdn->inode, srcdn->dir); - - // removing a new dn? 
- if (srcdn->is_auth()) - srcdn->dir->try_remove_unlinked_dn(srcdn); -} - - - - - -// ------------ -// SLAVE - -class C_MDS_SlaveRenamePrep : public Context { - Server *server; - MDRequest *mdr; - CDentry *srcdn, *destdn, *straydn; -public: - C_MDS_SlaveRenamePrep(Server *s, MDRequest *m, CDentry *sr, CDentry *de, CDentry *st) : - server(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {} - void finish(int r) { - server->_logged_slave_rename(mdr, srcdn, destdn, straydn); - } -}; - -class C_MDS_SlaveRenameCommit : public Context { - Server *server; - MDRequest *mdr; - CDentry *srcdn, *destdn, *straydn; -public: - C_MDS_SlaveRenameCommit(Server *s, MDRequest *m, CDentry *sr, CDentry *de, CDentry *st) : - server(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {} - void finish(int r) { - server->_commit_slave_rename(mdr, r, srcdn, destdn, straydn); - } -}; - -void Server::handle_slave_rename_prep(MDRequest *mdr) -{ - dout(10) << "handle_slave_rename_prep " << *mdr - << " " << mdr->slave_request->srcdnpath - << " to " << mdr->slave_request->destdnpath - << dendl; - - // discover destdn - filepath destpath(mdr->slave_request->destdnpath); - dout(10) << " dest " << destpath << dendl; - vector trace; - int r = mdcache->path_traverse(mdr, mdr->slave_request, - 0, destpath, trace, false, - MDS_TRAVERSE_DISCOVERXLOCK); - if (r > 0) return; - assert(r == 0); // we shouldn't get an error here! - - CDentry *destdn = trace[trace.size()-1]; - dout(10) << " destdn " << *destdn << dendl; - mdr->pin(destdn); - - - // discover srcdn - filepath srcpath(mdr->slave_request->srcdnpath); - dout(10) << " src " << srcpath << dendl; - r = mdcache->path_traverse(mdr, mdr->slave_request, - 0, srcpath, trace, false, - MDS_TRAVERSE_DISCOVERXLOCK); - if (r > 0) return; - assert(r == 0); // we shouldn't get an error here! - - CDentry *srcdn = trace[trace.size()-1]; - dout(10) << " srcdn " << *srcdn << dendl; - mdr->pin(srcdn); - assert(srcdn->inode); - mdr->pin(srcdn->inode); - - // stray? 
- CDentry *straydn = 0; - if (destdn->is_primary()) { - assert(mdr->slave_request->stray.length() > 0); - straydn = mdcache->add_replica_stray(mdr->slave_request->stray, - destdn->inode, mdr->slave_to_mds); - assert(straydn); - mdr->pin(straydn); - } - - mdr->now = mdr->slave_request->now; - - // set up commit waiter (early, to clean up any freezing etc we do) - if (!mdr->more()->slave_commit) - mdr->more()->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn); - - // am i srcdn auth? - if (srcdn->is_auth()) { - if (srcdn->is_primary() && - !srcdn->inode->is_freezing_inode() && - !srcdn->inode->is_frozen_inode()) { - // srci auth. - // set ambiguous auth. - srcdn->inode->state_set(CInode::STATE_AMBIGUOUSAUTH); - - // freeze? - // we need this to - // - avoid conflicting lock state changes - // - avoid concurrent updates to the inode - // (this could also be accomplished with the versionlock) - int allowance = 1; // for the versionlock and possible linklock xlock (both are tied to mdr) - dout(10) << " freezing srci " << *srcdn->inode << " with allowance " << allowance << dendl; - if (!srcdn->inode->freeze_inode(allowance)) { - srcdn->inode->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - } - - // is witness list sufficient? - set srcdnrep; - srcdn->list_replicas(srcdnrep); - for (set::iterator p = srcdnrep.begin(); - p != srcdnrep.end(); - ++p) { - if (*p == mdr->slave_to_mds || - mdr->slave_request->witnesses.count(*p)) continue; - dout(10) << " witness list insufficient; providing srcdn replica list" << dendl; - MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREPACK); - reply->witnesses.swap(srcdnrep); - mds->send_message_mds(reply, mdr->slave_to_mds); - delete mdr->slave_request; - mdr->slave_request = 0; - return; - } - dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl; - } - - // journal it? 
- if (srcdn->is_auth() || - (destdn->inode && destdn->inode->is_auth()) || - srcdn->inode->is_any_caps()) { - // journal. - mdr->ls = mdlog->get_current_segment(); - ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE); - - // rollback case - if (destdn->inode && destdn->inode->is_auth()) { - assert(destdn->is_remote()); - le->rollback.add_dir_context(destdn->dir); - le->rollback.add_dentry(destdn, true); - } - if (srcdn->is_auth() || - (srcdn->inode && srcdn->inode->is_auth())) { - le->rollback.add_dir_context(srcdn->dir); - le->rollback.add_dentry(srcdn, true); - } - - // commit case - _rename_prepare(mdr, &le->commit, srcdn, destdn, straydn); - - mdlog->submit_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn)); - } else { - // don't journal. - dout(10) << "not journaling, i'm not auth for anything, and srci isn't open" << dendl; - - // prepare anyway; this may twiddle dir_auth - EMetaBlob blah; - _rename_prepare(mdr, &blah, srcdn, destdn, straydn); - _logged_slave_rename(mdr, srcdn, destdn, straydn); - } -} - -void Server::_logged_slave_rename(MDRequest *mdr, - CDentry *srcdn, CDentry *destdn, CDentry *straydn) -{ - dout(10) << "_logged_slave_rename " << *mdr << dendl; - - // prepare ack - MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREPACK); - - // export srci? - if (srcdn->is_auth() && srcdn->is_primary()) { - list finished; - map exported_client_map; - bufferlist inodebl; - mdcache->migrator->encode_export_inode(srcdn->inode, inodebl, - exported_client_map); - mdcache->migrator->finish_export_inode(srcdn->inode, mdr->now, finished); - mds->queue_waiters(finished); // this includes SINGLEAUTH waiters. 
- ::_encode(exported_client_map, reply->inode_export); - reply->inode_export.claim_append(inodebl); - reply->inode_export_v = srcdn->inode->inode.version; - - // remove mdr auth pin - mdr->auth_unpin(srcdn->inode); - assert(!srcdn->inode->is_auth_pinned()); - - dout(10) << " exported srci " << *srcdn->inode << dendl; - } - - // apply - _rename_apply(mdr, srcdn, destdn, straydn); - - mds->send_message_mds(reply, mdr->slave_to_mds); - - // bump popularity - //if (srcdn->is_auth()) - //mds->balancer->hit_dir(mdr->now, srcdn->get_dir(), META_POP_DWR); - if (destdn->inode && destdn->inode->is_auth()) - mds->balancer->hit_inode(mdr->now, destdn->inode, META_POP_IWR); - - // done. - delete mdr->slave_request; - mdr->slave_request = 0; -} - -void Server::_commit_slave_rename(MDRequest *mdr, int r, - CDentry *srcdn, CDentry *destdn, CDentry *straydn) -{ - dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl; - - // unfreeze+singleauth inode - // hmm, do i really need to delay this? 
- if (srcdn->is_auth() && destdn->is_primary()) { - dout(10) << " unfreezing exported inode " << *destdn->inode << dendl; - list finished; - - // singleauth - assert(destdn->inode->state_test(CInode::STATE_AMBIGUOUSAUTH)); - destdn->inode->state_clear(CInode::STATE_AMBIGUOUSAUTH); - destdn->inode->take_waiting(CInode::WAIT_SINGLEAUTH, finished); - - // unfreeze - assert(destdn->inode->is_frozen_inode() || - destdn->inode->is_freezing_inode()); - destdn->inode->unfreeze_inode(finished); - - mds->queue_waiters(finished); - } - - - ESlaveUpdate *le; - if (r == 0) { - // write a commit to the journal - le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT); - - } else { - // abort - le = new ESlaveUpdate(mdlog, "slave_rename_abort", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_ROLLBACK); - - // -- rollback in memory -- - - if (mdr->more()->was_link_merge) { - // link merge - CInode *in = destdn->inode; - in->inode.nlink++; - if (mdr->more()->destdn_was_remote_inode) { - destdn->dir->unlink_inode(destdn); - srcdn->dir->link_primary_inode(srcdn, in); - destdn->dir->link_remote_inode(destdn, in->ino(), MODE_TO_DT(in->inode.mode)); - } else { - srcdn->dir->link_remote_inode(srcdn, in->ino(), MODE_TO_DT(in->inode.mode)); - } - } else { - // normal - - // revert srcdn - if (destdn->is_remote()) { - srcdn->dir->link_remote_inode(srcdn, destdn->inode->ino(), MODE_TO_DT(destdn->inode->inode.mode)); - destdn->dir->unlink_inode(destdn); - } else { - // renamed a primary - CInode *in = destdn->inode; - destdn->dir->unlink_inode(destdn); - srcdn->dir->link_primary_inode(srcdn, in); - } - - // revert destdn - if (mdr->more()->destdn_was_remote_inode) { - destdn->dir->link_remote_inode(destdn, - mdr->more()->destdn_was_remote_inode->ino(), - MODE_TO_DT(mdr->more()->destdn_was_remote_inode->inode.mode)); - mdr->more()->destdn_was_remote_inode->inode.nlink++; - } else if (straydn && straydn->inode) { - CInode *in = straydn->inode; 
- straydn->dir->unlink_inode(straydn); - destdn->dir->link_primary_inode(destdn, in); - straydn->dir->remove_dentry(straydn); - } - } - - // FIXME: reverse srci export? - - dout(-10) << " srcdn back to " << *srcdn << dendl; - dout(-10) << " srci back to " << *srcdn->inode << dendl; - dout(-10) << " destdn back to " << *destdn << dendl; - if (destdn->inode) dout(-10) << " desti back to " << *destdn->inode << dendl; - - // *** WRITE ME *** - assert(0); - - } - - - - mdlog->submit_entry(le); -} - -void Server::handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *ack) -{ - dout(10) << "handle_slave_rename_prep_ack " << *mdr - << " witnessed by " << ack->get_source() - << " " << *ack << dendl; - int from = ack->get_source().num(); - - // note slave - mdr->more()->slaves.insert(from); - - // witnessed? or add extra witnesses? - assert(mdr->more()->witnessed.count(from) == 0); - if (ack->witnesses.empty()) { - mdr->more()->witnessed.insert(from); - } else { - dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl; - mdr->more()->extra_witnesses.swap(ack->witnesses); - mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me! - } - - // srci import? - if (ack->inode_export.length()) { - dout(10) << " got srci import" << dendl; - mdr->more()->inode_import.claim(ack->inode_export); - mdr->more()->inode_import_v = ack->inode_export_v; - } - - // remove from waiting list - assert(mdr->more()->waiting_on_slave.count(from)); - mdr->more()->waiting_on_slave.erase(from); - - if (mdr->more()->waiting_on_slave.empty()) - dispatch_client_request(mdr); // go again! 
- else - dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl; -} - - - - - -// =================================== -// TRUNCATE, FSYNC - -class C_MDS_truncate_purged : public Context { - MDS *mds; - MDRequest *mdr; - CInode *in; - version_t pv; - off_t size; - utime_t ctime; -public: - C_MDS_truncate_purged(MDS *m, MDRequest *r, CInode *i, version_t pdv, off_t sz, utime_t ct) : - mds(m), mdr(r), in(i), - pv(pdv), - size(sz), ctime(ct) { } - void finish(int r) { - assert(r == 0); - - // apply to cache - in->inode.size = size; - in->inode.ctime = ctime; - in->inode.mtime = ctime; - in->mark_dirty(pv, mdr->ls); - - // reply - mds->server->reply_request(mdr, 0); - } -}; - -class C_MDS_truncate_logged : public Context { - MDS *mds; - MDRequest *mdr; - CInode *in; - version_t pv; - off_t size; - utime_t ctime; -public: - C_MDS_truncate_logged(MDS *m, MDRequest *r, CInode *i, version_t pdv, off_t sz, utime_t ct) : - mds(m), mdr(r), in(i), - pv(pdv), - size(sz), ctime(ct) { } - void finish(int r) { - assert(r == 0); - - // purge - mds->mdcache->purge_inode(in, size, in->inode.size, mdr->ls); - mds->mdcache->wait_for_purge(in, size, - new C_MDS_truncate_purged(mds, mdr, in, pv, size, ctime)); - } -}; - -void Server::handle_client_truncate(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - CInode *cur = rdlock_path_pin_ref(mdr, true); - if (!cur) return; - - // check permissions? - - // xlock inode - set rdlocks = mdr->rdlocks; - set wrlocks = mdr->wrlocks; - set xlocks = mdr->xlocks; - xlocks.insert(&cur->filelock); - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - // already small enough? 
- if (cur->inode.size <= req->args.truncate.length) { - reply_request(mdr, 0); - return; - } - - // prepare - version_t pdv = cur->pre_dirty(); - utime_t ctime = g_clock.real_now(); - Context *fin = new C_MDS_truncate_logged(mds, mdr, cur, - pdv, req->args.truncate.length, ctime); - - // log + wait - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "truncate"); - le->metablob.add_client_req(mdr->reqid); - le->metablob.add_dir_context(cur->get_parent_dir()); - le->metablob.add_inode_truncate(cur->ino(), req->args.truncate.length, cur->inode.size); - inode_t *pi = le->metablob.add_dentry(cur->parent, true); - pi->mtime = ctime; - pi->ctime = ctime; - pi->version = pdv; - pi->size = req->args.truncate.length; - - - mdlog->submit_entry(le, fin); -} - - -// =========================== -// open, openc, close - -void Server::handle_client_open(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - int flags = req->args.open.flags; - int cmode = req->get_open_file_mode(); - bool need_auth = ((cmode != FILE_MODE_R && cmode != FILE_MODE_LAZY) || - (flags & O_TRUNC)); - dout(10) << "open flags = " << flags - << ", filemode = " << cmode - << ", need_auth = " << need_auth - << dendl; - - CInode *cur = rdlock_path_pin_ref(mdr, need_auth); - if (!cur) return; - - // regular file? - if ((cur->inode.mode & INODE_TYPE_MASK) != INODE_MODE_FILE) { - dout(7) << "not a regular file " << *cur << dendl; - reply_request(mdr, -EINVAL); // FIXME what error do we want? - return; - } - - // hmm, check permissions or something. 
- - - // O_TRUNC - if (flags & O_TRUNC) { - assert(cur->is_auth()); - - // xlock file size - set rdlocks = mdr->rdlocks; - set wrlocks = mdr->wrlocks; - set xlocks = mdr->xlocks; - xlocks.insert(&cur->filelock); - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - if (cur->inode.size > 0) { - handle_client_opent(mdr); - return; - } - } - - // do it - _do_open(mdr, cur); -} - -void Server::_do_open(MDRequest *mdr, CInode *cur) -{ - MClientRequest *req = mdr->client_request; - int cmode = req->get_open_file_mode(); - - // can we issue the caps they want? - version_t fdv = mds->locker->issue_file_data_version(cur); - Capability *cap = mds->locker->issue_new_caps(cur, cmode, req); - if (!cap) return; // can't issue (yet), so wait! - - dout(12) << "_do_open issuing caps " << cap_string(cap->pending()) - << " for " << req->get_source() - << " on " << *cur << dendl; - - // hit pop - mdr->now = g_clock.now(); - if (cmode == FILE_MODE_RW || - cmode == FILE_MODE_W) - mds->balancer->hit_inode(mdr->now, cur, META_POP_IWR); - else - mds->balancer->hit_inode(mdr->now, cur, META_POP_IRD, - mdr->client_request->get_client_inst().name.num()); - - // reply - MClientReply *reply = new MClientReply(req, 0); - reply->set_file_caps(cap->pending()); - reply->set_file_caps_seq(cap->get_last_seq()); - reply->set_file_data_version(fdv); - reply_request(mdr, reply, cur); - - // journal? 
- if (cur->last_open_journaled == 0) { - queue_journal_open(cur); - maybe_journal_opens(); - } - -} - -void Server::queue_journal_open(CInode *in) -{ - dout(10) << "queue_journal_open on " << *in << dendl; - - if (journal_open_queue.count(in) == 0) { - // pin so our pointer stays valid - in->get(CInode::PIN_BATCHOPENJOURNAL); - - // queue it up for a bit - journal_open_queue.insert(in); - } -} - - -void Server::journal_opens() -{ - dout(10) << "journal_opens " << journal_open_queue.size() << " inodes" << dendl; - if (journal_open_queue.empty()) return; - - EOpen *le = 0; - - // check queued inodes - LogSegment *ls = mdlog->get_current_segment(); - for (set::iterator p = journal_open_queue.begin(); - p != journal_open_queue.end(); - ++p) { - CInode *in = *p; - in->put(CInode::PIN_BATCHOPENJOURNAL); - if (in->is_any_caps()) { - if (!le) le = new EOpen(mdlog); - le->add_inode(in); - in->last_open_journaled = mds->mdlog->get_write_pos(); - ls->open_files.push_back(&in->xlist_open_file); - } - } - journal_open_queue.clear(); - - if (le) { - // journal - mdlog->submit_entry(le); - - // add waiters to journal entry - for (list::iterator p = journal_open_waiters.begin(); - p != journal_open_waiters.end(); - ++p) - mds->mdlog->wait_for_sync(*p); - journal_open_waiters.clear(); - } else { - // nothing worth journaling here, just kick the waiters. 
- mds->queue_waiters(journal_open_waiters); - } -} - - - - -class C_MDS_open_truncate_purged : public Context { - MDS *mds; - MDRequest *mdr; - CInode *in; - version_t pv; - utime_t ctime; -public: - C_MDS_open_truncate_purged(MDS *m, MDRequest *r, CInode *i, version_t pdv, utime_t ct) : - mds(m), mdr(r), in(i), - pv(pdv), - ctime(ct) { } - void finish(int r) { - assert(r == 0); - - // apply to cache - in->inode.size = 0; - in->inode.ctime = ctime; - in->inode.mtime = ctime; - in->mark_dirty(pv, mdr->ls); - - // do the open - mds->server->_do_open(mdr, in); - } -}; - -class C_MDS_open_truncate_logged : public Context { - MDS *mds; - MDRequest *mdr; - CInode *in; - version_t pv; - utime_t ctime; -public: - C_MDS_open_truncate_logged(MDS *m, MDRequest *r, CInode *i, version_t pdv, utime_t ct) : - mds(m), mdr(r), in(i), - pv(pdv), - ctime(ct) { } - void finish(int r) { - assert(r == 0); - - // hit pop - mds->balancer->hit_inode(mdr->now, in, META_POP_IWR); - - // purge also... - mds->mdcache->purge_inode(in, 0, in->inode.size, mdr->ls); - mds->mdcache->wait_for_purge(in, 0, - new C_MDS_open_truncate_purged(mds, mdr, in, pv, ctime)); - } -}; - - -void Server::handle_client_opent(MDRequest *mdr) -{ - CInode *cur = mdr->ref; - assert(cur); - - // prepare - version_t pdv = cur->pre_dirty(); - utime_t ctime = g_clock.real_now(); - Context *fin = new C_MDS_open_truncate_logged(mds, mdr, cur, - pdv, ctime); - - // log + wait - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "open_truncate"); - le->metablob.add_client_req(mdr->reqid); - le->metablob.add_dir_context(cur->get_parent_dir()); - le->metablob.add_inode_truncate(cur->ino(), 0, cur->inode.size); - inode_t *pi = le->metablob.add_dentry(cur->parent, true); - pi->mtime = ctime; - pi->ctime = ctime; - pi->version = pdv; - pi->size = 0; - - mdlog->submit_entry(le, fin); -} - - - -class C_MDS_openc_finish : public Context { - MDS *mds; - MDRequest *mdr; - CDentry *dn; - CInode *newi; - version_t 
pv; -public: - C_MDS_openc_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ni) : - mds(m), mdr(r), dn(d), newi(ni), - pv(d->get_projected_version()) {} - void finish(int r) { - assert(r == 0); - - // link the inode - dn->get_dir()->link_primary_inode(dn, newi); - - // dirty inode, dn, dir - newi->mark_dirty(pv, mdr->ls); - - // downgrade xlock to rdlock - //mds->locker->dentry_xlock_downgrade_to_rdlock(dn, mdr); - - // set/pin ref inode for open() - mdr->ref = newi; - mdr->pin(newi); - - // ok, do the open. - mds->server->handle_client_open(mdr); - } -}; - - -void Server::handle_client_openc(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl; - - bool excl = (req->args.open.flags & O_EXCL); - CDentry *dn = rdlock_path_xlock_dentry(mdr, !excl, false); - if (!dn) return; - - if (!dn->is_null()) { - // it existed. - if (req->args.open.flags & O_EXCL) { - dout(10) << "O_EXCL, target exists, failing with -EEXIST" << dendl; - reply_request(mdr, -EEXIST, dn->get_dir()->get_inode()); - return; - } - - // pass to regular open handler. - handle_client_open(mdr); - return; - } - - // created null dn. - - // create inode. - mdr->now = g_clock.real_now(); - CInode *in = prepare_new_inode(mdr, dn->dir); - assert(in); - - // it's a file. - in->inode.mode = req->args.open.mode; - in->inode.mode |= INODE_MODE_FILE; - in->inode.version = dn->pre_dirty() - 1; - - // prepare finisher - C_MDS_openc_finish *fin = new C_MDS_openc_finish(mds, mdr, dn, in); - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "openc"); - le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_allocated_ino(in->ino(), mds->idalloc->get_version()); - le->metablob.add_dir_context(dn->dir); - le->metablob.add_primary_dentry(dn, true, in, &in->inode); - - // log + wait - mdlog->submit_entry(le, fin); - - /* - FIXME. 
this needs to be rewritten when the write capability stuff starts - getting journaled. - */ -} - - - - - - - - - - - - - - diff --git a/branches/sage/ebofs2/mds/Server.h b/branches/sage/ebofs2/mds/Server.h deleted file mode 100644 index 281fd13ca2593..0000000000000 --- a/branches/sage/ebofs2/mds/Server.h +++ /dev/null @@ -1,184 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_SERVER_H -#define __MDS_SERVER_H - -#include "MDS.h" - -class Logger; -class LogEvent; -class C_MDS_rename_finish; -class MDRequest; -class EMetaBlob; -class PVList; -class MMDSSlaveRequest; - -class Server { - MDS *mds; - MDCache *mdcache; - MDLog *mdlog; - Messenger *messenger; - Logger *logger; - -public: - Server(MDS *m) : - mds(m), - mdcache(mds->mdcache), mdlog(mds->mdlog), - messenger(mds->messenger), - logger(0) { - } - ~Server() { - delete logger; - } - - void reopen_logger(utime_t start, bool append); - - // message handler - void dispatch(Message *m); - - - // -- sessions and recovery -- - utime_t reconnect_start; - set client_reconnect_gather; // clients i need a reconnect msg from. 
- set reconnected_caps; - - void handle_client_session(class MClientSession *m); - void _session_logged(entity_inst_t ci, bool open, version_t cmapv); - void terminate_sessions(); - void reconnect_clients(); - void handle_client_reconnect(class MClientReconnect *m); - void process_reconnect_cap(CInode *in, int from, inode_caps_reconnect_t& capinfo); - void add_reconnected_cap_inode(CInode *in) { - reconnected_caps.insert(in); - } - void process_reconnected_caps(); - void client_reconnect_failure(int from); - void reconnect_gather_finish(); - - - // -- requests -- - void handle_client_request(MClientRequest *m); - - void dispatch_client_request(MDRequest *mdr); - void reply_request(MDRequest *mdr, int r = 0, CInode *tracei = 0); - void reply_request(MDRequest *mdr, MClientReply *reply, CInode *tracei); - - void handle_slave_request(MMDSSlaveRequest *m); - void dispatch_slave_request(MDRequest *mdr); - void handle_slave_auth_pin(MDRequest *mdr); - void handle_slave_auth_pin_ack(MDRequest *mdr, MMDSSlaveRequest *ack); - - // some helpers - CDir *validate_dentry_dir(MDRequest *mdr, CInode *diri, const string& dname); - CDir *traverse_to_auth_dir(MDRequest *mdr, vector &trace, filepath refpath); - CDentry *prepare_null_dentry(MDRequest *mdr, CDir *dir, const string& dname, bool okexist=false); - CInode* prepare_new_inode(MDRequest *mdr, CDir *dir); - - CInode* rdlock_path_pin_ref(MDRequest *mdr, bool want_auth); - CDentry* rdlock_path_xlock_dentry(MDRequest *mdr, bool okexist, bool mustexist); - - CDir* try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequest *mdr); - - version_t predirty_dn_diri(MDRequest *mdr, CDentry *dn, class EMetaBlob *blob); - void dirty_dn_diri(MDRequest *mdr, CDentry *dn, version_t dirpv); - - - // requests on existing inodes. 
- void handle_client_stat(MDRequest *mdr); - void handle_client_utime(MDRequest *mdr); - void handle_client_chmod(MDRequest *mdr); - void handle_client_chown(MDRequest *mdr); - void handle_client_readdir(MDRequest *mdr); - void handle_client_truncate(MDRequest *mdr); - void handle_client_fsync(MDRequest *mdr); - - // open - void handle_client_open(MDRequest *mdr); - void handle_client_openc(MDRequest *mdr); // O_CREAT variant. - void handle_client_opent(MDRequest *mdr); // O_TRUNC variant. - void _do_open(MDRequest *mdr, CInode *ref); - - set journal_open_queue; // to be journal - list journal_open_waiters; - void queue_journal_open(CInode *in); - void add_journal_open_waiter(Context *c) { - journal_open_waiters.push_back(c); - } - void maybe_journal_opens() { - if (journal_open_queue.size() >= (unsigned)g_conf.mds_log_eopen_size) - journal_opens(); - } - void journal_opens(); - - // namespace changes - void handle_client_mknod(MDRequest *mdr); - void handle_client_mkdir(MDRequest *mdr); - void handle_client_symlink(MDRequest *mdr); - - // link - void handle_client_link(MDRequest *mdr); - void _link_local(MDRequest *mdr, CDentry *dn, CInode *targeti); - void _link_local_finish(MDRequest *mdr, - CDentry *dn, CInode *targeti, - version_t, version_t, version_t); - - void _link_remote(MDRequest *mdr, CDentry *dn, CInode *targeti); - void _link_remote_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, - version_t, version_t); - - void handle_slave_link_prep(MDRequest *mdr); - void _logged_slave_link(MDRequest *mdr, CInode *targeti, utime_t old_ctime, bool inc); - void _commit_slave_link(MDRequest *mdr, int r, CInode *targeti, - utime_t old_ctime, version_t old_version, bool inc); - void handle_slave_link_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m); - - // unlink - void handle_client_unlink(MDRequest *mdr); - bool _verify_rmdir(MDRequest *mdr, CInode *rmdiri); - void _unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn); - void _unlink_local_finish(MDRequest 
*mdr, - CDentry *dn, CDentry *straydn, - version_t, version_t); - - void _unlink_remote(MDRequest *mdr, CDentry *dn); - void _unlink_remote_finish(MDRequest *mdr, - CDentry *dn, - version_t, version_t); - - // rename - void handle_client_rename(MDRequest *mdr); - void _rename_finish(MDRequest *mdr, - CDentry *srcdn, CDentry *destdn, CDentry *straydn); - - // helpers - void _rename_prepare_witness(MDRequest *mdr, int who, - CDentry *srcdn, CDentry *destdn, CDentry *straydn); - void _rename_prepare(MDRequest *mdr, - EMetaBlob *metablob, - CDentry *srcdn, CDentry *destdn, CDentry *straydn); - void _rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn); - - // slaving - void handle_slave_rename_prep(MDRequest *mdr); - void handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m); - void _logged_slave_rename(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn); - void _commit_slave_rename(MDRequest *mdr, int r, CDentry *srcdn, CDentry *destdn, CDentry *straydn); - -}; - - - - -#endif diff --git a/branches/sage/ebofs2/mds/SimpleLock.h b/branches/sage/ebofs2/mds/SimpleLock.h deleted file mode 100644 index e785e2c36d50c..0000000000000 --- a/branches/sage/ebofs2/mds/SimpleLock.h +++ /dev/null @@ -1,301 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __SIMPLELOCK_H -#define __SIMPLELOCK_H - -// -- lock types -- -// NOTE: this also defines the lock ordering! 
-#define LOCK_OTYPE_DN 1 - -#define LOCK_OTYPE_IVERSION 2 -#define LOCK_OTYPE_IFILE 3 -#define LOCK_OTYPE_IAUTH 4 -#define LOCK_OTYPE_ILINK 5 -#define LOCK_OTYPE_IDIRFRAGTREE 6 -#define LOCK_OTYPE_IDIR 7 - -//#define LOCK_OTYPE_DIR 7 // not used - -inline const char *get_lock_type_name(int t) { - switch (t) { - case LOCK_OTYPE_DN: return "dn"; - case LOCK_OTYPE_IVERSION: return "iversion"; - case LOCK_OTYPE_IFILE: return "ifile"; - case LOCK_OTYPE_IAUTH: return "iauth"; - case LOCK_OTYPE_ILINK: return "ilink"; - case LOCK_OTYPE_IDIRFRAGTREE: return "idft"; - case LOCK_OTYPE_IDIR: return "idir"; - default: assert(0); return 0; - } -} - -// -- lock states -- -// sync <-> lock -#define LOCK_UNDEF 0 -// auth rep -#define LOCK_SYNC 1 // AR R . R . -#define LOCK_LOCK 2 // AR R W . . -#define LOCK_GLOCKR -3 // AR R . . . -#define LOCK_REMOTEXLOCK -50 // on NON-auth - -inline const char *get_simplelock_state_name(int n) { - switch (n) { - case LOCK_UNDEF: return "UNDEF"; - case LOCK_SYNC: return "sync"; - case LOCK_LOCK: return "lock"; - case LOCK_GLOCKR: return "glockr"; - case LOCK_REMOTEXLOCK: return "remote_xlock"; - default: assert(0); return 0; - } -} - -class MDRequest; - -class SimpleLock { -public: - static const int WAIT_RD = (1<<0); // to read - static const int WAIT_WR = (1<<1); // to write - static const int WAIT_XLOCK = (1<<2); // to xlock (** dup) - static const int WAIT_STABLE = (1<<2); // for a stable state - static const int WAIT_REMOTEXLOCK = (1<<3); // for a remote xlock - static const int WAIT_BITS = 4; - static const int WAIT_ALL = ((1< gather_set; // auth - - // local state - int num_rdlock; - MDRequest *xlock_by; - -public: - SimpleLock(MDSCacheObject *o, int t, int wo) : - parent(o), type(t), wait_offset(wo), - state(LOCK_SYNC), - num_rdlock(0), xlock_by(0) { } - virtual ~SimpleLock() {} - - // parent - MDSCacheObject *get_parent() { return parent; } - int get_type() { return type; } - - struct ptr_lt { - bool operator()(const SimpleLock* l, const 
SimpleLock* r) const { - // first sort by object type (dn < inode) - if ((l->type>LOCK_OTYPE_DN) < (r->type>LOCK_OTYPE_DN)) return true; - if ((l->type>LOCK_OTYPE_DN) == (r->type>LOCK_OTYPE_DN)) { - // then sort by object - if (l->parent->is_lt(r->parent)) return true; - if (l->parent == r->parent) { - // then sort by (inode) lock type - if (l->type < r->type) return true; - } - } - return false; - } - }; - - void decode_locked_state(bufferlist& bl) { - parent->decode_lock_state(type, bl); - } - void encode_locked_state(bufferlist& bl) { - parent->encode_lock_state(type, bl); - } - void finish_waiters(int mask, int r=0) { - parent->finish_waiting(mask << wait_offset, r); - } - void take_waiting(int mask, list& ls) { - parent->take_waiting(mask << wait_offset, ls); - } - void add_waiter(int mask, Context *c) { - parent->add_waiter(mask << wait_offset, c); - } - bool is_waiter_for(int mask) { - return parent->is_waiter_for(mask << wait_offset); - } - - - - // state - int get_state() { return state; } - int set_state(int s) { - state = s; - assert(!is_stable() || gather_set.size() == 0); // gather should be empty in stable states. 
- return s; - }; - bool is_stable() { - return state >= 0; - } - - - // gather set - const set& get_gather_set() { return gather_set; } - void init_gather() { - for (map::const_iterator p = parent->replicas_begin(); - p != parent->replicas_end(); - ++p) - gather_set.insert(p->first); - } - bool is_gathering() { return !gather_set.empty(); } - bool is_gathering(int i) { - return gather_set.count(i); - } - void clear_gather() { - gather_set.clear(); - } - void remove_gather(int i) { - gather_set.erase(i); - } - - // ref counting - bool is_rdlocked() { return num_rdlock > 0; } - int get_rdlock() { - if (!num_rdlock) parent->get(MDSCacheObject::PIN_LOCK); - return ++num_rdlock; - } - int put_rdlock() { - assert(num_rdlock>0); - --num_rdlock; - if (num_rdlock == 0) parent->put(MDSCacheObject::PIN_LOCK); - return num_rdlock; - } - int get_num_rdlocks() { return num_rdlock; } - - void get_xlock(MDRequest *who) { - assert(xlock_by == 0); - parent->get(MDSCacheObject::PIN_LOCK); - xlock_by = who; - } - void put_xlock() { - assert(xlock_by); - parent->put(MDSCacheObject::PIN_LOCK); - xlock_by = 0; - } - bool is_xlocked() { return xlock_by ? true:false; } - bool is_xlocked_by_other(MDRequest *mdr) { - return is_xlocked() && xlock_by != mdr; - } - MDRequest *get_xlocked_by() { return xlock_by; } - bool is_used() { - return is_xlocked() || is_rdlocked(); - } - - // encode/decode - void _encode(bufferlist& bl) { - ::_encode_simple(state, bl); - ::_encode_simple(gather_set, bl); - } - void _decode(bufferlist::iterator& p) { - ::_decode_simple(state, p); - ::_decode_simple(gather_set, p); - } - - - // simplelock specifics - int get_replica_state() { - switch (state) { - case LOCK_LOCK: - case LOCK_GLOCKR: - return LOCK_LOCK; - case LOCK_SYNC: - return LOCK_SYNC; - default: - assert(0); - } - return 0; - } - void export_twiddle() { - clear_gather(); - state = get_replica_state(); - } - - /** replicate_relax - * called on first replica creation. 
- */ - void replicate_relax() { - assert(parent->is_auth()); - assert(!parent->is_replicated()); - if (state == LOCK_LOCK && !is_used()) - state = LOCK_SYNC; - } - bool remove_replica(int from) { - if (is_gathering(from)) { - remove_gather(from); - if (!is_gathering()) - return true; - } - return false; - } - bool do_import(int from, int to) { - if (!is_stable()) { - remove_gather(from); - remove_gather(to); - if (!is_gathering()) - return true; - } - if (!is_stable() && !is_gathering()) - return true; - return false; - } - - bool can_rdlock(MDRequest *mdr) { - //if (state == LOCK_LOCK && mdr && xlock_by == mdr) return true; // xlocked by me. (actually, is this right?) - //if (state == LOCK_LOCK && !xlock_by && parent->is_auth()) return true; - return (state == LOCK_SYNC); - } - bool can_xlock(MDRequest *mdr) { - if (mdr && xlock_by == mdr) { - assert(state == LOCK_LOCK); - return true; // auth or replica! xlocked by me. - } - if (state == LOCK_LOCK && parent->is_auth() && !xlock_by) return true; - return false; - } - bool can_xlock_soon() { - if (parent->is_auth()) - return (state == LOCK_GLOCKR); - else - return false; - } - - virtual void print(ostream& out) { - out << "("; - out << get_lock_type_name(get_type()) << " "; - out << get_simplelock_state_name(get_state()); - if (!get_gather_set().empty()) out << " g=" << get_gather_set(); - if (is_rdlocked()) - out << " r=" << get_num_rdlocks(); - if (is_xlocked()) - out << " x=" << get_xlocked_by(); - out << ")"; - } -}; - -inline ostream& operator<<(ostream& out, SimpleLock& l) -{ - l.print(out); - return out; -} - - -#endif diff --git a/branches/sage/ebofs2/mds/events/EAnchor.h b/branches/sage/ebofs2/mds/events/EAnchor.h deleted file mode 100644 index 97a21a36734be..0000000000000 --- a/branches/sage/ebofs2/mds/events/EAnchor.h +++ /dev/null @@ -1,80 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * 
Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_EANCHOR_H -#define __MDS_EANCHOR_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../LogEvent.h" -#include "../Anchor.h" - -class EAnchor : public LogEvent { -protected: - int op; - inodeno_t ino; - version_t atid; - vector trace; - version_t version; // anchor table version - int reqmds; - - public: - EAnchor() : LogEvent(EVENT_ANCHOR) { } - EAnchor(int o, inodeno_t i, version_t v, int rm) : - LogEvent(EVENT_ANCHOR), - op(o), ino(i), atid(0), version(v), reqmds(rm) { } - EAnchor(int o, version_t a, version_t v) : - LogEvent(EVENT_ANCHOR), - op(o), atid(a), version(v), reqmds(-1) { } - - void set_trace(vector& t) { trace = t; } - vector& get_trace() { return trace; } - - void encode_payload(bufferlist& bl) { - bl.append((char*)&op, sizeof(op)); - bl.append((char*)&ino, sizeof(ino)); - bl.append((char*)&atid, sizeof(atid)); - ::_encode(trace, bl); - bl.append((char*)&version, sizeof(version)); - bl.append((char*)&reqmds, sizeof(reqmds)); - } - void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(op), (char*)&op); - off += sizeof(op); - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - bl.copy(off, sizeof(atid), (char*)&atid); - off += sizeof(atid); - ::_decode(trace, bl, off); - bl.copy(off, sizeof(version), (char*)&version); - off += sizeof(version); - bl.copy(off, sizeof(reqmds), (char*)&reqmds); - off += sizeof(reqmds); - } - - void print(ostream& out) { - out << "EAnchor " << get_anchor_opname(op); - if (ino) out << " " << ino; - if (atid) out << " atid " << atid; - if (version) out << " v " << version; - if (reqmds >= 0) out << " by mds" << reqmds; - } - - void update_segment(); - void replay(MDS *mds); -}; - -#endif diff --git 
a/branches/sage/ebofs2/mds/events/EAnchorClient.h b/branches/sage/ebofs2/mds/events/EAnchorClient.h deleted file mode 100644 index 21f78369cae72..0000000000000 --- a/branches/sage/ebofs2/mds/events/EAnchorClient.h +++ /dev/null @@ -1,56 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_EANCHORCLIENT_H -#define __MDS_EANCHORCLIENT_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../LogEvent.h" -#include "../Anchor.h" - -class EAnchorClient : public LogEvent { -protected: - int op; - version_t atid; - - public: - EAnchorClient() : LogEvent(EVENT_ANCHORCLIENT) { } - EAnchorClient(int o, version_t at) : - LogEvent(EVENT_ANCHORCLIENT), - op(o), atid(at) { } - - void encode_payload(bufferlist& bl) { - bl.append((char*)&op, sizeof(op)); - bl.append((char*)&atid, sizeof(atid)); - } - void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(op), (char*)&op); - off += sizeof(op); - bl.copy(off, sizeof(atid), (char*)&atid); - off += sizeof(atid); - } - - void print(ostream& out) { - out << "EAnchorClient " << get_anchor_opname(op); - if (atid) out << " atid " << atid; - } - - void replay(MDS *mds); - -}; - -#endif diff --git a/branches/sage/ebofs2/mds/events/EExport.h b/branches/sage/ebofs2/mds/events/EExport.h deleted file mode 100644 index 89534f12b51bf..0000000000000 --- a/branches/sage/ebofs2/mds/events/EExport.h +++ /dev/null @@ -1,63 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is 
free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __EEXPORT_H -#define __EEXPORT_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../MDS.h" - -#include "EMetaBlob.h" - -class EExport : public LogEvent { -public: - EMetaBlob metablob; // exported dir -protected: - dirfrag_t base; - set bounds; - -public: - EExport() : LogEvent(EVENT_EXPORT) { } - EExport(MDLog *mdlog, CDir *dir) : - LogEvent(EVENT_EXPORT), metablob(mdlog), - base(dir->dirfrag()) { } - - set &get_bounds() { return bounds; } - - void print(ostream& out) { - out << "EExport " << base << " " << metablob; - } - - virtual void encode_payload(bufferlist& bl) { - metablob._encode(bl); - bl.append((char*)&base, sizeof(base)); - ::_encode(bounds, bl); - } - void decode_payload(bufferlist& bl, int& off) { - metablob._decode(bl, off); - bl.copy(off, sizeof(base), (char*)&base); - off += sizeof(base); - ::_decode(bounds, bl, off); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/branches/sage/ebofs2/mds/events/EFragment.h b/branches/sage/ebofs2/mds/events/EFragment.h deleted file mode 100644 index 64969111193c0..0000000000000 --- a/branches/sage/ebofs2/mds/events/EFragment.h +++ /dev/null @@ -1,54 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_EFRAGMENT_H -#define __MDS_EFRAGMENT_H - -#include "../LogEvent.h" -#include "EMetaBlob.h" - -class EFragment : public LogEvent { -public: - EMetaBlob metablob; - inodeno_t ino; - frag_t basefrag; - int bits; // positive for split (from basefrag), negative for merge (to basefrag) - - EFragment() : LogEvent(EVENT_FRAGMENT) { } - EFragment(MDLog *mdlog, inodeno_t i, frag_t bf, int b) : - LogEvent(EVENT_FRAGMENT), metablob(mdlog), - ino(i), basefrag(bf), bits(b) { } - void print(ostream& out) { - out << "EFragment " << ino << " " << basefrag << " by " << bits << " " << metablob; - } - - void encode_payload(bufferlist& bl) { - ::_encode(ino, bl); - ::_encode(basefrag, bl); - ::_encode(bits, bl); - metablob._encode(bl); - } - void decode_payload(bufferlist& bl, int& off) { - ::_decode(ino, bl, off); - ::_decode(basefrag, bl, off); - ::_decode(bits, bl, off); - metablob._decode(bl, off); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); -}; - -#endif diff --git a/branches/sage/ebofs2/mds/events/EImportFinish.h b/branches/sage/ebofs2/mds/events/EImportFinish.h deleted file mode 100644 index 0ee6d71ffdc13..0000000000000 --- a/branches/sage/ebofs2/mds/events/EImportFinish.h +++ /dev/null @@ -1,60 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __EIMPORTFINISH_H -#define __EIMPORTFINISH_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../MDS.h" - -class EImportFinish : public LogEvent { - protected: - dirfrag_t base; // imported dir - bool success; - - public: - EImportFinish(CDir *dir, bool s) : LogEvent(EVENT_IMPORTFINISH), - base(dir->dirfrag()), - success(s) { } - EImportFinish() : LogEvent(EVENT_IMPORTFINISH) { } - - void print(ostream& out) { - out << "EImportFinish " << base; - if (success) - out << " success"; - else - out << " failed"; - } - - virtual void encode_payload(bufferlist& bl) { - bl.append((char*)&base, sizeof(base)); - bl.append((char*)&success, sizeof(success)); - } - void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(base), (char*)&base); - off += sizeof(base); - bl.copy(off, sizeof(success), (char*)&success); - off += sizeof(success); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/branches/sage/ebofs2/mds/events/EImportStart.h b/branches/sage/ebofs2/mds/events/EImportStart.h deleted file mode 100644 index aa1902576542d..0000000000000 --- a/branches/sage/ebofs2/mds/events/EImportStart.h +++ /dev/null @@ -1,61 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __EIMPORTSTART_H -#define __EIMPORTSTART_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../MDS.h" - -#include "EMetaBlob.h" - -class EImportStart : public LogEvent { -protected: - dirfrag_t base; - list bounds; - - public: - EMetaBlob metablob; - - EImportStart(dirfrag_t di, - list& b) : LogEvent(EVENT_IMPORTSTART), - base(di), bounds(b) { } - EImportStart() : LogEvent(EVENT_IMPORTSTART) { } - - void print(ostream& out) { - out << "EImportStart " << base << " " << metablob; - } - - virtual void encode_payload(bufferlist& bl) { - bl.append((char*)&base, sizeof(base)); - metablob._encode(bl); - ::_encode(bounds, bl); - } - void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(base), (char*)&base); - off += sizeof(base); - metablob._decode(bl, off); - ::_decode(bounds, bl, off); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/branches/sage/ebofs2/mds/events/EMetaBlob.h b/branches/sage/ebofs2/mds/events/EMetaBlob.h deleted file mode 100644 index 767521523f9fe..0000000000000 --- a/branches/sage/ebofs2/mds/events/EMetaBlob.h +++ /dev/null @@ -1,501 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_EMETABLOB_H -#define __MDS_EMETABLOB_H - -#include -#include -using std::string; - -#include "../CInode.h" -#include "../CDir.h" -#include "../CDentry.h" - -#include "include/triple.h" - -class MDS; -class MDLog; -class LogSegment; - -/* - * a bunch of metadata in the journal - */ - -/* notes: - * - * - make sure you adjust the inode.version for any modified inode you - * journal. CDir and CDentry maintain a projected_version, but CInode - * doesn't, since the journaled inode usually has to be modifed - * manually anyway (to delay the change in the MDS's cache until after - * it is journaled). - * - */ - - -class EMetaBlob { - - /* fullbit - a regular dentry + inode - */ - struct fullbit { - string dn; // dentry - version_t dnv; - inode_t inode; // if it's not - fragtree_t dirfragtree; - string symlink; - bool dirty; - - fullbit(const string& d, version_t v, inode_t& i, fragtree_t dft, bool dr) : - dn(d), dnv(v), inode(i), dirfragtree(dft), dirty(dr) { } - fullbit(const string& d, version_t v, inode_t& i, fragtree_t dft, string& sym, bool dr) : - dn(d), dnv(v), inode(i), dirfragtree(dft), symlink(sym), dirty(dr) { } - fullbit(bufferlist& bl, int& off) { _decode(bl, off); } - void _encode(bufferlist& bl) { - ::_encode(dn, bl); - ::_encode(dnv, bl); - ::_encode(inode, bl); - dirfragtree._encode(bl); - if (inode.is_symlink()) - ::_encode(symlink, bl); - ::_encode(dirty, bl); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(dn, bl, off); - ::_decode(dnv, bl, off); - ::_decode(inode, bl, off); - dirfragtree._decode(bl, off); - if (inode.is_symlink()) - ::_decode(symlink, bl, off); - ::_decode(dirty, bl, off); - } - void print(ostream& out) { - out << " fullbit dn " << dn << " dnv " << dnv - << " inode " << inode.ino - << " dirty=" << dirty << std::endl; - } - }; - - /* remotebit - a dentry + remote inode link (i.e. 
just an ino) - */ - struct remotebit { - string dn; - version_t dnv; - inodeno_t ino; - unsigned char d_type; - bool dirty; - - remotebit(const string& d, version_t v, inodeno_t i, unsigned char dt, bool dr) : - dn(d), dnv(v), ino(i), d_type(dt), dirty(dr) { } - remotebit(bufferlist& bl, int& off) { _decode(bl, off); } - void _encode(bufferlist& bl) { - ::_encode(dn, bl); - ::_encode(dnv, bl); - ::_encode(ino, bl); - ::_encode(d_type, bl); - ::_encode(dirty, bl); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(dn, bl, off); - ::_decode(dnv, bl, off); - ::_decode(ino, bl, off); - ::_decode(d_type, bl, off); - ::_decode(dirty, bl, off); - } - void print(ostream& out) { - out << " remotebit dn " << dn << " dnv " << dnv - << " ino " << ino - << " dirty=" << dirty << std::endl; - } - }; - - /* - * nullbit - a null dentry - */ - struct nullbit { - string dn; - version_t dnv; - bool dirty; - nullbit(const string& d, version_t v, bool dr) : dn(d), dnv(v), dirty(dr) { } - nullbit(bufferlist& bl, int& off) { _decode(bl, off); } - void _encode(bufferlist& bl) { - ::_encode(dn, bl); - ::_encode(dnv, bl); - ::_encode(dirty, bl); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(dn, bl, off); - ::_decode(dnv, bl, off); - ::_decode(dirty, bl, off); - } - void print(ostream& out) { - out << " nullbit dn " << dn << " dnv " << dnv - << " dirty=" << dirty << std::endl; - } - }; - - - /* dirlump - contains metadata for any dir we have contents for. - */ -public: - struct dirlump { - static const int STATE_COMPLETE = (1<<1); - static const int STATE_DIRTY = (1<<2); // dirty due to THIS journal item, that is! 
- - version_t dirv; - int state; - int nfull, nremote, nnull; - - private: - bufferlist dnbl; - bool dn_decoded; - list dfull; - list dremote; - list dnull; - - public: - dirlump() : dirv(0), state(0), nfull(0), nremote(0), nnull(0), dn_decoded(true) { } - - bool is_complete() { return state & STATE_COMPLETE; } - void mark_complete() { state |= STATE_COMPLETE; } - bool is_dirty() { return state & STATE_DIRTY; } - void mark_dirty() { state |= STATE_DIRTY; } - - list &get_dfull() { return dfull; } - list &get_dremote() { return dremote; } - list &get_dnull() { return dnull; } - - void print(dirfrag_t dirfrag, ostream& out) { - out << "dirlump " << dirfrag << " dirv " << dirv - << " state " << state - << " num " << nfull << "/" << nremote << "/" << nnull - << std::endl; - _decode_bits(); - for (list::iterator p = dfull.begin(); p != dfull.end(); ++p) - p->print(out); - for (list::iterator p = dremote.begin(); p != dremote.end(); ++p) - p->print(out); - for (list::iterator p = dnull.begin(); p != dnull.end(); ++p) - p->print(out); - } - - void _encode_bits() { - for (list::iterator p = dfull.begin(); p != dfull.end(); ++p) - p->_encode(dnbl); - for (list::iterator p = dremote.begin(); p != dremote.end(); ++p) - p->_encode(dnbl); - for (list::iterator p = dnull.begin(); p != dnull.end(); ++p) - p->_encode(dnbl); - } - void _decode_bits() { - if (dn_decoded) return; - int off = 0; - for (int i=0; i lump_order; - map lump_map; - - // anchor transactions included in this update. - list atids; - - // inode dirlocks (scatterlocks) i've touched. - map dirty_inode_mtimes; - - // ino's i've allocated - list allocated_inos; - version_t alloc_tablev; - - // inodes i've destroyed. 
- list< triple > truncated_inodes; - - // idempotent op(s) - list client_reqs; - - public: - // soft state - off_t last_subtree_map; - off_t my_offset; - - // for replay, in certain cases - LogSegment *_segment; - - EMetaBlob() : last_subtree_map(0), my_offset(0), _segment(0) { } - EMetaBlob(MDLog *mdl); // defined in journal.cc - - void print(ostream& out) { - for (list::iterator p = lump_order.begin(); - p != lump_order.end(); - ++p) { - lump_map[*p].print(*p, out); - } - } - - void add_client_req(metareqid_t r) { - client_reqs.push_back(r); - } - - void add_anchor_transaction(version_t atid) { - atids.push_back(atid); - } - - void add_dirtied_inode_mtime(inodeno_t ino, utime_t ctime) { - dirty_inode_mtimes[ino] = ctime; - } - - void add_allocated_ino(inodeno_t ino, version_t tablev) { - allocated_inos.push_back(ino); - alloc_tablev = tablev; - } - - void add_inode_truncate(inodeno_t ino, off_t newsize, off_t oldsize) { - truncated_inodes.push_back(triple(ino, newsize, oldsize)); - } - - void add_null_dentry(CDentry *dn, bool dirty) { - add_null_dentry(add_dir(dn->get_dir(), false), dn, dirty); - } - void add_null_dentry(dirlump& lump, CDentry *dn, bool dirty) { - // add the dir - lump.nnull++; - if (dirty) - lump.get_dnull().push_front(nullbit(dn->get_name(), - dn->get_projected_version(), - dirty)); - else - lump.get_dnull().push_back(nullbit(dn->get_name(), - dn->get_projected_version(), - dirty)); - } - - void add_remote_dentry(CDentry *dn, bool dirty, inodeno_t rino=0) { - add_remote_dentry(add_dir(dn->get_dir(), false), - dn, dirty, rino); - } - void add_remote_dentry(dirlump& lump, CDentry *dn, bool dirty, - inodeno_t rino=0, unsigned char rdt=0) { - if (!rino) { - rino = dn->get_remote_ino(); - rdt = dn->get_remote_d_type(); - } - lump.nremote++; - if (dirty) - lump.get_dremote().push_front(remotebit(dn->get_name(), - dn->get_projected_version(), - rino, rdt, - dirty)); - else - lump.get_dremote().push_back(remotebit(dn->get_name(), - 
dn->get_projected_version(), - rino, rdt, - dirty)); - } - - // return remote pointer to to-be-journaled inode - inode_t *add_primary_dentry(CDentry *dn, bool dirty, - CInode *in=0, inode_t *pi=0, fragtree_t *pdft=0) { - return add_primary_dentry(add_dir(dn->get_dir(), false), - dn, dirty, in, pi, pdft); - } - inode_t *add_primary_dentry(dirlump& lump, CDentry *dn, bool dirty, - CInode *in=0, inode_t *pi=0, fragtree_t *pdft=0) { - if (!in) - in = dn->get_inode(); - - // make note of where this inode was last journaled - in->last_journaled = my_offset; - //cout << "journaling " << in->inode.ino << " at " << my_offset << std::endl; - - lump.nfull++; - if (dirty) { - lump.get_dfull().push_front(fullbit(dn->get_name(), - dn->get_projected_version(), - in->inode, in->dirfragtree, in->symlink, - dirty)); - if (pi) lump.get_dfull().front().inode = *pi; - return &lump.get_dfull().front().inode; - } else { - lump.get_dfull().push_back(fullbit(dn->get_name(), - dn->get_projected_version(), - in->inode, in->dirfragtree, in->symlink, - dirty)); - if (pi) lump.get_dfull().back().inode = *pi; - return &lump.get_dfull().back().inode; - } - } - - // convenience: primary or remote? figure it out. 
- inode_t *add_dentry(CDentry *dn, bool dirty) { - dirlump& lump = add_dir(dn->get_dir(), false); - return add_dentry(lump, dn, dirty); - } - inode_t *add_dentry(dirlump& lump, CDentry *dn, bool dirty) { - // primary or remote - if (dn->is_remote()) { - add_remote_dentry(dn, dirty); - return 0; - } else if (dn->is_null()) { - add_null_dentry(dn, dirty); - return 0; - } - assert(dn->is_primary()); - return add_primary_dentry(dn, dirty); - } - - - dirlump& add_dir(CDir *dir, bool dirty, bool complete=false) { - return add_dir(dir->dirfrag(), dir->get_projected_version(), dirty, complete); - } - dirlump& add_dir(dirfrag_t df, version_t pv, bool dirty, bool complete=false) { - if (lump_map.count(df) == 0) { - lump_order.push_back(df); - lump_map[df].dirv = pv; - } - dirlump& l = lump_map[df]; - if (complete) l.mark_complete(); - if (dirty) l.mark_dirty(); - return l; - } - - static const int TO_AUTH_SUBTREE_ROOT = 0; // default. - static const int TO_ROOT = 1; - - void add_dir_context(CDir *dir, int mode = TO_AUTH_SUBTREE_ROOT) { - // already have this dir? (we must always add in order) - if (lump_map.count(dir->dirfrag())) - return; - - if (mode == TO_AUTH_SUBTREE_ROOT) { - //return; // hack: for comparison purposes.. what if NO context? - - // subtree root? - if (dir->is_subtree_root() && dir->is_auth()) - return; - - // was the inode journaled since the last subtree_map? - if (//false && // for benchmarking - last_subtree_map && - dir->inode->last_journaled >= last_subtree_map) { - /* - cout << " inode " << dir->inode->inode.ino - << " last journaled at " << dir->inode->last_journaled - << " and last_subtree_map is " << last_subtree_map - << std::endl; - */ - return; - } - } - - // stop at root/stray - CInode *diri = dir->get_inode(); - if (!diri->get_parent_dn()) - return; - - // journaled? 
- - // add parent dn - CDentry *parent = diri->get_parent_dn(); - add_dir_context(parent->get_dir(), mode); - add_dentry(parent, false); - } - - - // encoding - - void _encode(bufferlist& bl) { - int32_t n = lump_map.size(); - ::_encode(n, bl); - for (list::iterator i = lump_order.begin(); - i != lump_order.end(); - ++i) { - dirfrag_t dirfrag = *i; - ::_encode(dirfrag, bl); - lump_map[*i]._encode(bl); - } - ::_encode(atids, bl); - ::_encode(dirty_inode_mtimes, bl); - ::_encode(allocated_inos, bl); - if (!allocated_inos.empty()) - ::_encode(alloc_tablev, bl); - ::_encode(truncated_inodes, bl); - ::_encode(client_reqs, bl); - } - void _decode(bufferlist& bl, int& off) { - int32_t n; - ::_decode(n, bl, off); - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_EOPEN_H -#define __MDS_EOPEN_H - -#include "../LogEvent.h" -#include "EMetaBlob.h" - -class EOpen : public LogEvent { -public: - EMetaBlob metablob; - list inos; - - EOpen() : LogEvent(EVENT_OPEN) { } - EOpen(MDLog *mdlog) : - LogEvent(EVENT_OPEN), metablob(mdlog) { } - - void print(ostream& out) { - out << "EOpen " << metablob; - } - - void add_inode(CInode *in) { - inos.push_back(in->ino()); - metablob.add_dir_context(in->get_parent_dn()->get_dir()); - metablob.add_primary_dentry(in->get_parent_dn(), false); - } - - void encode_payload(bufferlist& bl) { - ::_encode(inos, bl); - metablob._encode(bl); - } - void decode_payload(bufferlist& bl, int& off) { - ::_decode(inos, bl, off); - metablob._decode(bl, off); - } - - void update_segment(); - void replay(MDS *mds); -}; - -#endif diff --git a/branches/sage/ebofs2/mds/events/EPurgeFinish.h b/branches/sage/ebofs2/mds/events/EPurgeFinish.h deleted file mode 100644 index dff0101b7699a..0000000000000 --- a/branches/sage/ebofs2/mds/events/EPurgeFinish.h +++ 
/dev/null @@ -1,54 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __EPURGE_H -#define __EPURGE_H - -#include -#include "config.h" -#include "include/types.h" - -class EPurgeFinish : public LogEvent { - protected: - inodeno_t ino; - off_t newsize, oldsize; - - public: - EPurgeFinish(inodeno_t i, off_t ns, off_t os) : - LogEvent(EVENT_PURGEFINISH), - ino(i), newsize(ns), oldsize(os) { } - EPurgeFinish() : LogEvent(EVENT_PURGEFINISH) { } - - void print(ostream& out) { - out << "purgefinish " << ino << " " << oldsize << " ->" << newsize; - } - - virtual void encode_payload(bufferlist& bl) { - bl.append((char*)&ino, sizeof(ino)); - bl.append((char*)&newsize, sizeof(newsize)); - bl.append((char*)&oldsize, sizeof(oldsize)); - } - void decode_payload(bufferlist& bl, int& off) { - ::_decode(ino, bl, off); - ::_decode(newsize, bl, off); - ::_decode(oldsize, bl, off); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void update_segment(); - void replay(MDS *mds); -}; - -#endif diff --git a/branches/sage/ebofs2/mds/events/ESession.h b/branches/sage/ebofs2/mds/events/ESession.h deleted file mode 100644 index a8f9992486a18..0000000000000 --- a/branches/sage/ebofs2/mds/events/ESession.h +++ /dev/null @@ -1,64 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the 
Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_ESESSION_H -#define __MDS_ESESSION_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../LogEvent.h" - -class ESession : public LogEvent { - protected: - entity_inst_t client_inst; - bool open; // open or close - version_t cmapv; // client map version - - public: - ESession() : LogEvent(EVENT_SESSION) { } - ESession(entity_inst_t inst, bool o, version_t v) : - LogEvent(EVENT_SESSION), - client_inst(inst), - open(o), - cmapv(v) { - } - - void encode_payload(bufferlist& bl) { - ::_encode(client_inst, bl); - ::_encode(open, bl); - ::_encode(cmapv, bl); - } - void decode_payload(bufferlist& bl, int& off) { - ::_decode(client_inst, bl, off); - ::_decode(open, bl, off); - ::_decode(cmapv, bl, off); - } - - - void print(ostream& out) { - if (open) - out << "ESession " << client_inst << " open cmapv " << cmapv; - else - out << "ESession " << client_inst << " close cmapv " << cmapv; - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void update_segment(); - void replay(MDS *mds); -}; - -#endif diff --git a/branches/sage/ebofs2/mds/events/ESlaveUpdate.h b/branches/sage/ebofs2/mds/events/ESlaveUpdate.h deleted file mode 100644 index 54eaef9c6a296..0000000000000 --- a/branches/sage/ebofs2/mds/events/ESlaveUpdate.h +++ /dev/null @@ -1,79 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_ESLAVEUPDATE_H -#define __MDS_ESLAVEUPDATE_H - -#include "../LogEvent.h" -#include "EMetaBlob.h" - -class ESlaveUpdate : public LogEvent { -public: - const static int OP_PREPARE = 1; - const static int OP_COMMIT = 2; - const static int OP_ROLLBACK = 3; - - /* - * we journal a rollback metablob that contains the unmodified metadata - * too, because we may be updating previously dirty metadata, which - * will allow old log segments to be trimmed. if we end of rolling back, - * those updates could be lost.. so we re-journal the unmodified metadata, - * and replay will apply _either_ commit or rollback. - */ - EMetaBlob commit, rollback; - string type; - metareqid_t reqid; - int master; - int op; // prepare, commit, abort - - ESlaveUpdate() : LogEvent(EVENT_SLAVEUPDATE) { } - ESlaveUpdate(MDLog *mdlog, const char *s, metareqid_t ri, int mastermds, int o) : - LogEvent(EVENT_SLAVEUPDATE), commit(mdlog), rollback(mdlog), - type(s), - reqid(ri), - master(mastermds), - op(o) { } - - void print(ostream& out) { - if (type.length()) - out << type << " "; - out << " " << op; - out << " " << reqid; - out << " for mds" << master; - out << commit << " " << rollback; - } - - void encode_payload(bufferlist& bl) { - ::_encode(type, bl); - ::_encode(reqid, bl); - ::_encode(master, bl); - ::_encode(op, bl); - commit._encode(bl); - rollback._encode(bl); - } - void decode_payload(bufferlist& bl, int& off) { - ::_decode(type, bl, off); - ::_decode(reqid, bl, off); - ::_decode(master, bl, off); - ::_decode(op, bl, off); - commit._decode(bl, off); - rollback._decode(bl, off); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); -}; - -#endif diff --git a/branches/sage/ebofs2/mds/events/EString.h b/branches/sage/ebofs2/mds/events/EString.h deleted file mode 100644 index b292f9927d76f..0000000000000 --- a/branches/sage/ebofs2/mds/events/EString.h +++ /dev/null @@ -1,56 +0,0 @@ -// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __ESTRING_H -#define __ESTRING_H - -#include -#include -using namespace std; - -#include "../LogEvent.h" - -// generic log event -class EString : public LogEvent { - protected: - string event; - - public: - EString(string e) : - LogEvent(EVENT_STRING) { - event = e; - } - EString() : - LogEvent(EVENT_STRING) { - } - - void decode_payload(bufferlist& bl, int& off) { - ::_decode(event, bl, off); - } - void encode_payload(bufferlist& bl) { - ::_encode(event, bl); - } - - void print(ostream& out) { - out << '"' << event << '"'; - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/branches/sage/ebofs2/mds/events/ESubtreeMap.h b/branches/sage/ebofs2/mds/events/ESubtreeMap.h deleted file mode 100644 index cb6feb1d92ec6..0000000000000 --- a/branches/sage/ebofs2/mds/events/ESubtreeMap.h +++ /dev/null @@ -1,47 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_ESUBTREEMAP_H -#define __MDS_ESUBTREEMAP_H - -#include "../LogEvent.h" -#include "EMetaBlob.h" - -class ESubtreeMap : public LogEvent { -public: - EMetaBlob metablob; - map > subtrees; - - ESubtreeMap() : LogEvent(EVENT_SUBTREEMAP) { } - - void print(ostream& out) { - out << "subtree_map " << subtrees.size() << " subtrees " - << metablob; - } - - void encode_payload(bufferlist& bl) { - metablob._encode(bl); - ::_encode(subtrees, bl); - } - void decode_payload(bufferlist& bl, int& off) { - metablob._decode(bl, off); - ::_decode(subtrees, bl, off); - } - - //bool has_expired(MDS *mds); - //void expire(MDS *mds, Context *c); - void replay(MDS *mds); -}; - -#endif diff --git a/branches/sage/ebofs2/mds/events/EUpdate.h b/branches/sage/ebofs2/mds/events/EUpdate.h deleted file mode 100644 index de965429f9bdd..0000000000000 --- a/branches/sage/ebofs2/mds/events/EUpdate.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_EUPDATE_H -#define __MDS_EUPDATE_H - -#include "../LogEvent.h" -#include "EMetaBlob.h" - -class EUpdate : public LogEvent { -public: - EMetaBlob metablob; - string type; - - EUpdate() : LogEvent(EVENT_UPDATE) { } - EUpdate(MDLog *mdlog, const char *s) : - LogEvent(EVENT_UPDATE), metablob(mdlog), - type(s) { } - - void print(ostream& out) { - if (type.length()) - out << type << " "; - out << metablob; - } - - void encode_payload(bufferlist& bl) { - ::_encode(type, bl); - metablob._encode(bl); - } - void decode_payload(bufferlist& bl, int& off) { - ::_decode(type, bl, off); - metablob._decode(bl, off); - } - - void update_segment(); - void replay(MDS *mds); -}; - -#endif diff --git a/branches/sage/ebofs2/mds/journal.cc b/branches/sage/ebofs2/mds/journal.cc deleted file mode 100644 index 1f27cf713a078..0000000000000 --- a/branches/sage/ebofs2/mds/journal.cc +++ /dev/null @@ -1,1084 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "events/EString.h" -#include "events/ESubtreeMap.h" -#include "events/ESession.h" - -#include "events/EMetaBlob.h" - -#include "events/EUpdate.h" -#include "events/ESlaveUpdate.h" -#include "events/EOpen.h" - -#include "events/EPurgeFinish.h" - -#include "events/EExport.h" -#include "events/EImportStart.h" -#include "events/EImportFinish.h" -#include "events/EFragment.h" - -#include "events/EAnchor.h" -#include "events/EAnchorClient.h" - -#include "LogSegment.h" - -#include "MDS.h" -#include "MDLog.h" -#include "MDCache.h" -#include "Server.h" -#include "Migrator.h" -#include "AnchorTable.h" -#include "AnchorClient.h" -#include "IdAllocator.h" -#include "Locker.h" - - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log || l <= g_conf.debug_mds_log_expire) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " -#define derr(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log || l <= g_conf.debug_mds_log_expire) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " - - -// ----------------------- -// LogSegment - -class C_MDL_RetryExpireSegment : public Context { -public: - MDS *mds; - LogSegment *ls; - C_MDL_RetryExpireSegment(MDS *m, LogSegment *l) : mds(m), ls(l) {} - void finish(int r) { - ls->try_to_expire(mds); - } -}; - -C_Gather *LogSegment::try_to_expire(MDS *mds) -{ - C_Gather *gather = 0; - - set commit; - - dout(6) << "LogSegment(" << offset << ").try_to_expire" << dendl; - - // commit dirs - for (xlist::iterator p = dirty_dirfrags.begin(); !p.end(); ++p) - commit.insert(*p); - for (xlist::iterator p = dirty_dentries.begin(); !p.end(); ++p) - commit.insert((*p)->get_dir()); - for (xlist::iterator p = dirty_inodes.begin(); !p.end(); ++p) - commit.insert((*p)->get_parent_dn()->get_dir()); - - if (!commit.empty()) { - if (!gather) gather = new C_Gather; - - for (set::iterator p = commit.begin(); - p != commit.end(); - ++p) { - CDir *dir = 
*p; - if (dir->can_auth_pin()) { - dout(15) << "try_to_expire committing " << *dir << dendl; - dir->commit(0, gather->new_sub()); - } else { - dout(15) << "try_to_expire waiting for unfreeze on " << *dir << dendl; - dir->add_waiter(CDir::WAIT_UNFREEZE, gather->new_sub()); - } - } - } - - // dirty non-auth mtimes - for (xlist::iterator p = dirty_inode_mtimes.begin(); !p.end(); ++p) { - CInode *in = *p; - dout(10) << "try_to_expire waiting for dirlock mtime flush on " << *in << dendl; - if (!gather) gather = new C_Gather; - - if (in->is_ambiguous_auth()) { - dout(10) << " waiting for single auth on " << *in << dendl; - in->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, gather->new_sub()); - } else if (in->is_auth()) { - dout(10) << " i'm auth, unscattering dirlock on " << *in << dendl; - assert(in->is_replicated()); // hrm! - mds->locker->scatter_lock(&in->dirlock); - in->dirlock.add_waiter(SimpleLock::WAIT_STABLE, gather->new_sub()); - } else { - dout(10) << " i'm a replica, requesting dirlock unscatter of " << *in << dendl; - mds->locker->scatter_try_unscatter(&in->dirlock, gather->new_sub()); - } - //(*p)->dirlock.add_waiter(SimpleLock::WAIT_STABLE, gather->new_sub()); - } - - // open files - if (!open_files.empty()) { - assert(!mds->mdlog->is_capped()); // hmm FIXME - for (xlist::iterator p = open_files.begin(); !p.end(); ++p) { - dout(20) << "try_to_expire requeueing open file " << **p << dendl; - mds->server->queue_journal_open(*p); - } - if (!gather) gather = new C_Gather; - mds->server->add_journal_open_waiter(gather->new_sub()); - mds->server->maybe_journal_opens(); - dout(10) << "try_to_expire waiting for open files to rejournal" << dendl; - } - - // slave updates - for (xlist::iterator p = slave_updates.begin(); !p.end(); ++p) { - MDSlaveUpdate *su = *p; - dout(10) << "try_to_expire waiting on slave update " << su << dendl; - assert(su->waiter == 0); - if (!gather) gather = new C_Gather; - su->waiter = gather->new_sub(); - } - - // idalloc - if (allocv > 
mds->idalloc->get_committed_version()) { - dout(10) << "try_to_expire saving idalloc table, need " << allocv - << ", committed is " << mds->idalloc->get_committed_version() - << " (" << mds->idalloc->get_committing_version() << ")" - << dendl; - if (!gather) gather = new C_Gather; - mds->idalloc->save(gather->new_sub(), allocv); - } - - // clientmap - if (clientmapv > mds->clientmap.get_committed()) { - dout(10) << "try_to_expire saving clientmap, need " << clientmapv - << ", committed is " << mds->clientmap.get_committed() - << " (" << mds->clientmap.get_committing() << ")" - << dendl; - if (!gather) gather = new C_Gather; - mds->clientmap.save(gather->new_sub(), clientmapv); - } - - // pending commit atids - for (hash_set::iterator p = pending_commit_atids.begin(); - p != pending_commit_atids.end(); - ++p) { - if (!gather) gather = new C_Gather; - assert(!mds->anchorclient->has_committed(*p)); - dout(10) << "try_to_expire anchor transaction " << *p - << " pending commit (not yet acked), waiting" << dendl; - mds->anchorclient->wait_for_ack(*p, gather->new_sub()); - } - - // anchortable - if (anchortablev > mds->anchortable->get_committed_version()) { - dout(10) << "try_to_expire waiting for anchor table to save, need " << anchortablev << dendl; - if (!gather) gather = new C_Gather; - mds->anchortable->save(gather->new_sub()); - } - - // FIXME client requests...? - // audit handling of anchor transactions? 
- - if (gather) { - dout(6) << "LogSegment(" << offset << ").try_to_expire waiting" << dendl; - } else { - dout(6) << "LogSegment(" << offset << ").try_to_expire success" << dendl; - } - return gather; -} - - - -#undef dout -#undef derr -#define dout(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " -#define derr(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " - - -// ----------------------- -// EString - -bool EString::has_expired(MDS *mds) { - dout(10) << "EString.has_expired " << event << dendl; - return true; -} -void EString::expire(MDS *mds, Context *c) -{ - dout(10) << "EString.expire " << event << dendl; -} -void EString::replay(MDS *mds) -{ - dout(10) << "EString.replay " << event << dendl; -} - - - -// ----------------------- -// EMetaBlob - -EMetaBlob::EMetaBlob(MDLog *mdlog) : - last_subtree_map(mdlog->get_last_segment_offset()), - my_offset(mdlog->get_write_pos()) -{ -} - - -/* - * we need to ensure that a journaled item has either - * - * - been safely committed to its dirslice. - * - * - has been safely exported. i.e., authority().first != us. - * in particular, auth of is not enough, we need to - * wait for . - * - * note that this check is overly conservative, in that we'll - * try to flush the dir again if we reimport the subtree, even though - * later journal entries contain the same dirty data (from the import). 
- * - */ -bool EMetaBlob::has_expired(MDS *mds) -{ -/* - // examine dirv's for my lumps - for (map::iterator lp = lump_map.begin(); - lp != lump_map.end(); - ++lp) { - CDir *dir = mds->mdcache->get_dirfrag(lp->first); - if (!dir) - continue; // we expired it - - // FIXME: check the slice only - - if (dir->authority().first != mds->get_nodeid()) { - dout(10) << "EMetaBlob.has_expired not auth, needed dirv " << lp->second.dirv - << " for " << *dir << dendl; - continue; // not our problem - } - - if (g_conf.mds_hack_log_expire_for_better_stats) { - // FIXME HACK: this makes logger stats more accurage, for journal stats, - // but is not perfectly safe. for benchmarking ONLY! - if (dir->get_committing_version() >= lp->second.dirv || // committING, not committED. - dir->get_committed_version_equivalent() >= lp->second.dirv) { - dout(10) << "EMetaBlob.has_expired have|committING (unsafe hack!) dirv " << lp->second.dirv - << " for " << *dir << dendl; - continue; // yay - } - } else { - // this is the proper (safe) way - if (dir->get_committed_version() >= lp->second.dirv || - dir->get_committed_version_equivalent() >= lp->second.dirv) { - dout(10) << "EMetaBlob.has_expired have dirv " << lp->second.dirv - << " for " << *dir << dendl; - continue; // yay - } - } - - if (dir->is_ambiguous_dir_auth()) { - CDir *ex = mds->mdcache->get_subtree_root(dir); - if (ex->is_exporting()) { - // wait until export is acked (logged on remote) and committed (logged locally) - dout(10) << "EMetaBlob.has_expired ambiguous auth for " << *dir - << ", exporting on " << *ex << dendl; - return false; - } else { - dout(10) << "EMetaBlob.has_expired ambiguous auth for " << *dir - << ", importing on " << *ex << dendl; - return false; - } - } - - if (dir->get_committed_version() < lp->second.dirv) { - dout(10) << "EMetaBlob.has_expired need dirv " << lp->second.dirv - << " for " << *dir << dendl; - return false; // not committed. 
- } - - assert(0); // i goofed the logic - } - - // have my anchortable ops committed? - for (list::iterator p = atids.begin(); - p != atids.end(); - ++p) { - if (!mds->anchorclient->has_committed(*p)) { - dout(10) << "EMetaBlob.has_expired anchor transaction " << *p - << " not yet acked" << dendl; - return false; - } - } - - if (!dirty_inode_mtimes.empty()) - for (map::iterator p = dirty_inode_mtimes.begin(); - p != dirty_inode_mtimes.end(); - ++p) { - CInode *in = mds->mdcache->get_inode(p->first); - if (in) { - if (in->inode.ctime == p->second && - in->dirlock.is_updated()) { - dout(10) << "EMetaBlob.has_expired dirty mtime dirlock hasn't flushed on " << *in << dendl; - return false; - } - } - } - - // allocated_ios - if (!allocated_inos.empty()) { - version_t cv = mds->idalloc->get_committed_version(); - if (cv < alloc_tablev) { - dout(10) << "EMetaBlob.has_expired idalloc tablev " << alloc_tablev << " > " << cv - << ", still dirty" << dendl; - return false; // still dirty - } else { - dout(10) << "EMetaBlob.has_expired idalloc tablev " << alloc_tablev << " <= " << cv - << ", already flushed" << dendl; - } - } - - - // truncated inodes - for (list< pair >::iterator p = truncated_inodes.begin(); - p != truncated_inodes.end(); - ++p) { - if (mds->mdcache->is_purging(p->first.ino, p->second)) { - dout(10) << "EMetaBlob.has_expired still purging inode " << p->first.ino - << " to " << p->second << dendl; - return false; - } - } - - // client requests - for (list::iterator p = client_reqs.begin(); - p != client_reqs.end(); - ++p) { - if (mds->clientmap.have_completed_request(*p)) { - dout(10) << "EMetaBlob.has_expired still have completed request " << *p - << dendl; - return false; - } - } - - - */ - return true; // all dirlumps expired, etc. 
-} - - -void EMetaBlob::expire(MDS *mds, Context *c) -{ -/* - map commit; // dir -> version needed - list waitfor_export; - list waitfor_import; - int ncommit = 0; - - // examine dirv's for my lumps - // make list of dir slices i need to commit - for (map::iterator lp = lump_map.begin(); - lp != lump_map.end(); - ++lp) { - CDir *dir = mds->mdcache->get_dirfrag(lp->first); - if (!dir) - continue; // we expired it - - // FIXME: check the slice only - - if (dir->authority().first != mds->get_nodeid()) { - dout(10) << "EMetaBlob.expire not auth, needed dirv " << lp->second.dirv - << " for " << *dir << dendl; - continue; // not our problem - } - if (dir->get_committed_version() >= lp->second.dirv || - dir->get_committed_version_equivalent() >= lp->second.dirv) { - dout(10) << "EMetaBlob.expire have dirv " << lp->second.dirv - << " on " << *dir << dendl; - continue; // yay - } - - if (dir->is_ambiguous_dir_auth()) { - CDir *ex = mds->mdcache->get_subtree_root(dir); - if (ex->is_exporting()) { - // wait until export is acked (logged on remote) and committed (logged locally) - dout(10) << "EMetaBlob.expire ambiguous auth for " << *dir - << ", waiting for export finish on " << *ex << dendl; - waitfor_export.push_back(ex); - continue; - } else { - dout(10) << "EMetaBlob.expire ambiguous auth for " << *dir - << ", waiting for import finish on " << *ex << dendl; - waitfor_import.push_back(ex); - continue; - } - } - - assert(dir->get_committed_version() < lp->second.dirv); - dout(10) << "EMetaBlob.expire need dirv " << lp->second.dirv - << ", committing " << *dir << dendl; - commit[dir] = MAX(commit[dir], lp->second.dirv); - ncommit++; - } - - // set up gather context - C_Gather *gather = new C_Gather(c); - - // do or wait for exports and commits - for (map::iterator p = commit.begin(); - p != commit.end(); - ++p) { - if (p->first->can_auth_pin()) - p->first->commit(p->second, gather->new_sub()); - else - // pbly about to export|split|merge. 
- // just wait for it to unfreeze, then retry - p->first->add_waiter(CDir::WAIT_UNFREEZE, gather->new_sub()); - } - for (list::iterator p = waitfor_export.begin(); - p != waitfor_export.end(); - ++p) - mds->mdcache->migrator->add_export_finish_waiter(*p, gather->new_sub()); - for (list::iterator p = waitfor_import.begin(); - p != waitfor_import.end(); - ++p) - (*p)->add_waiter(CDir::WAIT_UNFREEZE, gather->new_sub()); - - - // have my anchortable ops committed? - for (list::iterator p = atids.begin(); - p != atids.end(); - ++p) { - if (!mds->anchorclient->has_committed(*p)) { - dout(10) << "EMetaBlob.expire anchor transaction " << *p - << " not yet acked, waiting" << dendl; - mds->anchorclient->wait_for_ack(*p, gather->new_sub()); - } - } - - // dirtied inode mtimes - if (!dirty_inode_mtimes.empty()) - for (map::iterator p = dirty_inode_mtimes.begin(); - p != dirty_inode_mtimes.end(); - ++p) { - CInode *in = mds->mdcache->get_inode(p->first); - if (in) { - if (in->inode.ctime == p->second && - in->dirlock.is_updated()) { - dout(10) << "EMetaBlob.expire dirty mtime dirlock hasn't flushed, waiting on " - << *in << dendl; - in->dirlock.add_waiter(SimpleLock::WAIT_STABLE, gather->new_sub()); - } - } - } - - // allocated_inos - if (!allocated_inos.empty()) { - version_t cv = mds->idalloc->get_committed_version(); - if (cv < alloc_tablev) { - dout(10) << "EMetaBlob.expire saving idalloc table, need " << alloc_tablev << dendl; - mds->idalloc->save(gather->new_sub(), alloc_tablev); - } - } - - // truncated inodes - for (list< pair >::iterator p = truncated_inodes.begin(); - p != truncated_inodes.end(); - ++p) { - if (mds->mdcache->is_purging(p->first.ino, p->second)) { - dout(10) << "EMetaBlob.expire waiting for purge of inode " << p->first.ino - << " to " << p->second << dendl; - mds->mdcache->wait_for_purge(p->first.ino, p->second, gather->new_sub()); - } - } - - // client requests - for (list::iterator p = client_reqs.begin(); - p != client_reqs.end(); - ++p) { - if 
(mds->clientmap.have_completed_request(*p)) { - dout(10) << "EMetaBlob.expire waiting on completed request " << *p - << dendl; - mds->clientmap.add_trim_waiter(*p, gather->new_sub()); - } - } - - dout(10) << "my gather finsher is " << gather << " with " << gather->get_num() << dendl; - -*/ -} - -void EMetaBlob::update_segment(LogSegment *ls) -{ - // atids? - //for (list::iterator p = atids.begin(); p != atids.end(); ++p) - // ls->pending_commit_atids[*p] = ls; - // -> handled directly by AnchorClient - - // dirty inode mtimes - // -> handled directly by Server.cc, replay() - - // alloc table update? - if (!allocated_inos.empty()) - ls->allocv = alloc_tablev; - - // truncated inodes - // -> handled directly by Server.cc - - // client requests - // note the newest request per client - //if (!client_reqs.empty()) - // ls->last_client_tid[client_reqs.rbegin()->client] = client_reqs.rbegin()->tid); -} - -void EMetaBlob::replay(MDS *mds, LogSegment *logseg) -{ - dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps" << dendl; - - if (!logseg) logseg = _segment; - assert(logseg); - - // walk through my dirs (in order!) - for (list::iterator lp = lump_order.begin(); - lp != lump_order.end(); - ++lp) { - dout(10) << "EMetaBlob.replay dir " << *lp << dendl; - dirlump &lump = lump_map[*lp]; - - // the dir - CDir *dir = mds->mdcache->get_dirfrag(*lp); - if (!dir) { - // hmm. do i have the inode? 
- CInode *diri = mds->mdcache->get_inode((*lp).ino); - if (!diri) { - if ((*lp).ino == MDS_INO_ROOT) { - diri = mds->mdcache->create_root_inode(); - dout(10) << "EMetaBlob.replay created root " << *diri << dendl; - } else if (MDS_INO_IS_STRAY((*lp).ino)) { - int whose = (*lp).ino - MDS_INO_STRAY_OFFSET; - diri = mds->mdcache->create_stray_inode(whose); - dout(10) << "EMetaBlob.replay created stray " << *diri << dendl; - } else { - dout(0) << "EMetaBlob.replay missing dir ino " << (*lp).ino << dendl; - assert(0); - } - } - // create the dirfrag - dir = diri->get_or_open_dirfrag(mds->mdcache, (*lp).frag); - - if ((*lp).ino < MDS_INO_BASE) - mds->mdcache->adjust_subtree_auth(dir, CDIR_AUTH_UNKNOWN); - - dout(10) << "EMetaBlob.replay added dir " << *dir << dendl; - } - dir->set_version( lump.dirv ); - if (lump.is_dirty()) - dir->_mark_dirty(logseg); - if (lump.is_complete()) - dir->mark_complete(); - - // decode bits - lump._decode_bits(); - - // full dentry+inode pairs - for (list::iterator p = lump.get_dfull().begin(); - p != lump.get_dfull().end(); - p++) { - CDentry *dn = dir->lookup(p->dn); - if (!dn) { - dn = dir->add_null_dentry(p->dn); - dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(logseg); - dout(10) << "EMetaBlob.replay added " << *dn << dendl; - } else { - dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(logseg); - dout(10) << "EMetaBlob.replay had " << *dn << dendl; - } - - CInode *in = mds->mdcache->get_inode(p->inode.ino); - if (!in) { - in = new CInode(mds->mdcache); - in->inode = p->inode; - in->dirfragtree = p->dirfragtree; - if (in->inode.is_symlink()) in->symlink = p->symlink; - mds->mdcache->add_inode(in); - if (!dn->is_null()) { - if (dn->is_primary()) - dout(-10) << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn - << " " << *dn->get_inode() - << " should be " << p->inode.ino - << dendl; - dir->unlink_inode(dn); - //assert(0); // hrm! fallout from sloppy unlink? or? 
hmmm FIXME investigate further - } - dir->link_primary_inode(dn, in); - if (p->dirty) in->_mark_dirty(logseg); - dout(10) << "EMetaBlob.replay added " << *in << dendl; - } else { - if (dn->get_inode() != in && in->get_parent_dn()) { - dout(10) << "EMetaBlob.replay unlinking " << *in << dendl; - in->get_parent_dn()->get_dir()->unlink_inode(in->get_parent_dn()); - } - in->inode = p->inode; - in->dirfragtree = p->dirfragtree; - if (in->inode.is_symlink()) in->symlink = p->symlink; - if (p->dirty) in->_mark_dirty(logseg); - if (dn->get_inode() != in) { - dir->link_primary_inode(dn, in); - dout(10) << "EMetaBlob.replay linked " << *in << dendl; - } else { - dout(10) << "EMetaBlob.replay had " << *in << dendl; - } - } - } - - // remote dentries - for (list::iterator p = lump.get_dremote().begin(); - p != lump.get_dremote().end(); - p++) { - CDentry *dn = dir->lookup(p->dn); - if (!dn) { - dn = dir->add_remote_dentry(p->dn, p->ino, p->d_type); - dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(logseg); - dout(10) << "EMetaBlob.replay added " << *dn << dendl; - } else { - if (!dn->is_null()) { - dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl; - dir->unlink_inode(dn); - } - dn->set_remote(p->ino, p->d_type); - dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(logseg); - dout(10) << "EMetaBlob.replay had " << *dn << dendl; - } - } - - // null dentries - for (list::iterator p = lump.get_dnull().begin(); - p != lump.get_dnull().end(); - p++) { - CDentry *dn = dir->lookup(p->dn); - if (!dn) { - dn = dir->add_null_dentry(p->dn); - dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(logseg); - dout(10) << "EMetaBlob.replay added " << *dn << dendl; - } else { - if (!dn->is_null()) { - dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl; - dir->unlink_inode(dn); - } - dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(logseg); - dout(10) << "EMetaBlob.replay had " << *dn << dendl; - } - } - } - - // anchor transactions - for 
(list::iterator p = atids.begin(); - p != atids.end(); - ++p) { - dout(10) << "EMetaBlob.replay noting anchor transaction " << *p << dendl; - mds->anchorclient->got_journaled_agree(*p, logseg); - } - - // dirtied inode mtimes - if (!dirty_inode_mtimes.empty()) - for (map::iterator p = dirty_inode_mtimes.begin(); - p != dirty_inode_mtimes.end(); - ++p) { - CInode *in = mds->mdcache->get_inode(p->first); - dout(10) << "EMetaBlob.replay setting dirlock updated flag on " << *in << dendl; - in->dirlock.set_updated(); - logseg->dirty_inode_mtimes.push_back(&in->xlist_dirty_inode_mtime); - } - - // allocated_inos - if (!allocated_inos.empty()) { - if (mds->idalloc->get_version() >= alloc_tablev) { - dout(10) << "EMetaBlob.replay idalloc tablev " << alloc_tablev - << " <= table " << mds->idalloc->get_version() << dendl; - } else { - for (list::iterator p = allocated_inos.begin(); - p != allocated_inos.end(); - ++p) { - dout(10) << " EMetaBlob.replay idalloc " << *p << " tablev " << alloc_tablev - << " - 1 == table " << mds->idalloc->get_version() << dendl; - assert(alloc_tablev-1 == mds->idalloc->get_version()); - - inodeno_t ino = mds->idalloc->alloc_id(); - assert(ino == *p); // this should match. 
- } - assert(alloc_tablev == mds->idalloc->get_version()); - } - } - - // truncated inodes - for (list< triple >::iterator p = truncated_inodes.begin(); - p != truncated_inodes.end(); - ++p) { - CInode *in = mds->mdcache->get_inode(p->first); - assert(in); - dout(10) << "EMetaBlob.replay will purge truncated " - << p->third << " -> " << p->second - << " on " << *in << dendl; - mds->mdcache->add_recovered_purge(in, p->second, p->third, logseg); - } - - // client requests - for (list::iterator p = client_reqs.begin(); - p != client_reqs.end(); - ++p) - mds->clientmap.add_completed_request(*p); - - - // update segment - update_segment(logseg); -} - -// ----------------------- -// ESession - -void ESession::update_segment() -{ - _segment->clientmapv = cmapv; -} - -void ESession::replay(MDS *mds) -{ - if (mds->clientmap.get_version() >= cmapv) { - dout(10) << "ESession.replay clientmap " << mds->clientmap.get_version() - << " >= " << cmapv << ", noop" << dendl; - - // hrm, this isn't very pretty. - if (!open) - mds->clientmap.trim_completed_requests(client_inst.name.num(), 0); - - } else { - dout(10) << "ESession.replay clientmap " << mds->clientmap.get_version() - << " < " << cmapv << dendl; - assert(mds->clientmap.get_version() + 1 == cmapv); - if (open) { - mds->clientmap.open_session(client_inst); - } else { - mds->clientmap.close_session(client_inst.name.num()); - mds->clientmap.trim_completed_requests(client_inst.name.num(), 0); - } - mds->clientmap.reset_projected(); // make it follow version. 
- } -} - - - -// ----------------------- -// EAnchor - -void EAnchor::update_segment() -{ - _segment->anchortablev = version; -} - -void EAnchor::replay(MDS *mds) -{ - if (mds->anchortable->get_version() >= version) { - dout(10) << "EAnchor.replay event " << version - << " <= table " << mds->anchortable->get_version() << dendl; - } else { - dout(10) << " EAnchor.replay event " << version - << " - 1 == table " << mds->anchortable->get_version() << dendl; - assert(version-1 == mds->anchortable->get_version()); - - switch (op) { - // anchortable - case ANCHOR_OP_CREATE_PREPARE: - mds->anchortable->create_prepare(ino, trace, reqmds); - break; - case ANCHOR_OP_DESTROY_PREPARE: - mds->anchortable->destroy_prepare(ino, reqmds); - break; - case ANCHOR_OP_UPDATE_PREPARE: - mds->anchortable->update_prepare(ino, trace, reqmds); - break; - case ANCHOR_OP_COMMIT: - mds->anchortable->commit(atid); - break; - - default: - assert(0); - } - - assert(version == mds->anchortable->get_version()); - } -} - - -// EAnchorClient - -void EAnchorClient::replay(MDS *mds) -{ - dout(10) << " EAnchorClient.replay op " << op << " atid " << atid << dendl; - - switch (op) { - // anchorclient - case ANCHOR_OP_ACK: - mds->anchorclient->got_journaled_ack(atid); - break; - - default: - assert(0); - } -} - - -// ----------------------- -// EUpdate - -void EUpdate::update_segment() -{ - metablob.update_segment(_segment); -} - -void EUpdate::replay(MDS *mds) -{ - metablob.replay(mds, _segment); -} - - -// ------------------------ -// EOpen - -void EOpen::update_segment() -{ - // ?? 
-} - -void EOpen::replay(MDS *mds) -{ - dout(10) << "EOpen.replay " << dendl; - metablob.replay(mds, _segment); -} - - -// ----------------------- -// ESlaveUpdate - -void ESlaveUpdate::replay(MDS *mds) -{ - switch (op) { - case ESlaveUpdate::OP_PREPARE: - // FIXME: horribly inefficient copy; EMetaBlob needs a swap() or something - dout(10) << "ESlaveUpdate.replay prepare " << reqid << " for mds" << master - << ": saving blobs for later commit" << dendl; - assert(mds->mdcache->uncommitted_slave_updates[master].count(reqid) == 0); - commit._segment = _segment; // may need this later - rollback._segment = _segment; // may need this later - mds->mdcache->uncommitted_slave_updates[master][reqid] = - MDSlaveUpdate(commit, rollback, _segment->slave_updates); - break; - - case ESlaveUpdate::OP_COMMIT: - if (mds->mdcache->uncommitted_slave_updates[master].count(reqid)) { - dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds" << master - << ": applying commit blob" << dendl; - mds->mdcache->uncommitted_slave_updates[master][reqid].commit.replay(mds, _segment); - mds->mdcache->uncommitted_slave_updates[master].erase(reqid); - } else { - dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds" << master - << ": ignoring, no previously saved blobs" << dendl; - } - break; - - case ESlaveUpdate::OP_ROLLBACK: - if (mds->mdcache->uncommitted_slave_updates[master].count(reqid)) { - dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds" << master - << ": applying rollback blob" << dendl; - assert(mds->mdcache->uncommitted_slave_updates[master].count(reqid)); - mds->mdcache->uncommitted_slave_updates[master][reqid].rollback.replay(mds, _segment); - mds->mdcache->uncommitted_slave_updates[master].erase(reqid); - } else { - dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds" << master - << ": ignoring, no previously saved blobs" << dendl; - } - break; - - default: - assert(0); - } -} - - -// ----------------------- -// ESubtreeMap - -void 
ESubtreeMap::replay(MDS *mds) -{ - // suck up the subtree map? - if (mds->mdcache->is_subtrees()) { - dout(10) << "ESubtreeMap.replay -- ignoring, already have import map" << dendl; - return; - } - - dout(10) << "ESubtreeMap.replay -- reconstructing (auth) subtree spanning tree" << dendl; - - // first, stick the spanning tree in my cache - //metablob.print(cout); - metablob.replay(mds, _segment); - - // restore import/export maps - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *dir = mds->mdcache->get_dirfrag(p->first); - mds->mdcache->adjust_bounded_subtree_auth(dir, p->second, mds->get_nodeid()); - } - - mds->mdcache->show_subtrees(); -} - - - -// ----------------------- -// EFragment - -void EFragment::replay(MDS *mds) -{ - dout(10) << "EFragment.replay " << ino << " " << basefrag << " by " << bits << dendl; - - CInode *in = mds->mdcache->get_inode(ino); - assert(in); - - list resultfrags; - list waiters; - mds->mdcache->adjust_dir_fragments(in, basefrag, bits, resultfrags, waiters); - - metablob.replay(mds, _segment); -} - - - -// ----------------------- -// EPurgeFinish - - -bool EPurgeFinish::has_expired(MDS *mds) -{ - return true; -} - -void EPurgeFinish::expire(MDS *mds, Context *c) -{ - assert(0); -} - -void EPurgeFinish::update_segment() -{ - // ** update purge lists? 
-} - -void EPurgeFinish::replay(MDS *mds) -{ - dout(10) << "EPurgeFinish.replay " << ino << " " << oldsize << " -> " << newsize << dendl; - CInode *in = mds->mdcache->get_inode(ino); - assert(in); - mds->mdcache->remove_recovered_purge(in, newsize, oldsize); -} - - - - - -// ========================================================================= - -// ----------------------- -// EExport - -bool EExport::has_expired(MDS *mds) -{ - CDir *dir = mds->mdcache->get_dirfrag(base); - if (dir && mds->mdcache->migrator->is_exporting(dir)) { - dout(10) << "EExport.has_expired still exporting " << *dir << dendl; - return false; - } - return true; -} - -void EExport::expire(MDS *mds, Context *c) -{ - CDir *dir = mds->mdcache->get_dirfrag(base); - assert(dir); - assert(mds->mdcache->migrator->is_exporting(dir)); - - dout(10) << "EExport.expire waiting for export of " << *dir << dendl; - mds->mdcache->migrator->add_export_finish_waiter(dir, c); -} - -void EExport::replay(MDS *mds) -{ - dout(10) << "EExport.replay " << base << dendl; - metablob.replay(mds, _segment); - - CDir *dir = mds->mdcache->get_dirfrag(base); - assert(dir); - - set realbounds; - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *bd = mds->mdcache->get_dirfrag(*p); - assert(bd); - realbounds.insert(bd); - } - - // adjust auth away - mds->mdcache->adjust_bounded_subtree_auth(dir, realbounds, pair(CDIR_AUTH_UNKNOWN, CDIR_AUTH_UNKNOWN)); - mds->mdcache->try_subtree_merge(dir); -} - - -// ----------------------- -// EImportStart - -void EImportStart::replay(MDS *mds) -{ - dout(10) << "EImportStart.replay " << base << dendl; - metablob.replay(mds, _segment); - - // put in ambiguous import list - mds->mdcache->add_ambiguous_import(base, bounds); -} - -// ----------------------- -// EImportFinish - -bool EImportFinish::has_expired(MDS *mds) -{ - return true; -} -void EImportFinish::expire(MDS *mds, Context *c) -{ - assert(0); // shouldn't ever happen -} - -void EImportFinish::replay(MDS 
*mds) -{ - if (mds->mdcache->have_ambiguous_import(base)) { - dout(10) << "EImportFinish.replay " << base << " success=" << success << dendl; - if (success) - mds->mdcache->finish_ambiguous_import(base); - else - mds->mdcache->cancel_ambiguous_import(base); - } else { - dout(10) << "EImportFinish.replay " << base << " success=" << success - << ", predates my subtree_map start point, ignoring" - << dendl; - // verify that? - } -} - - - - - diff --git a/branches/sage/ebofs2/mds/mdstypes.h b/branches/sage/ebofs2/mds/mdstypes.h deleted file mode 100644 index ee14474761ada..0000000000000 --- a/branches/sage/ebofs2/mds/mdstypes.h +++ /dev/null @@ -1,684 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -#ifndef __MDSTYPES_H -#define __MDSTYPES_H - - -#include -#include -#include -#include -using namespace std; - -#include "config.h" -#include "common/DecayCounter.h" -#include "include/Context.h" - -#include - -#include "include/frag.h" -#include "include/xlist.h" - -#define MDS_REF_SET // define me for improved debug output, sanity checking - - -#define MDS_PORT_CACHE 0x200 -#define MDS_PORT_LOCKER 0x300 -#define MDS_PORT_MIGRATOR 0x400 - - -#define MAX_MDS 0x100 - -#define MDS_INO_ROOT 1 -#define MDS_INO_PGTABLE 2 -#define MDS_INO_ANCHORTABLE 3 -#define MDS_INO_PG 4 // *** WARNING: this should match osd/osd_types.h PG_INO *** -#define MDS_INO_LOG_OFFSET (1*MAX_MDS) -#define MDS_INO_IDS_OFFSET (2*MAX_MDS) -#define MDS_INO_CLIENTMAP_OFFSET (3*MAX_MDS) -#define MDS_INO_STRAY_OFFSET (4*MAX_MDS) -#define MDS_INO_BASE (5*MAX_MDS) - -#define MDS_INO_STRAY(x) (MDS_INO_STRAY_OFFSET+((unsigned)x)) -#define MDS_INO_IS_STRAY(i) ((i) >= MDS_INO_STRAY_OFFSET && (i) < MDS_INO_STRAY_OFFSET+MAX_MDS) - -#define MDS_TRAVERSE_FORWARD 1 -#define MDS_TRAVERSE_DISCOVER 2 // skips permissions checks etc. -#define MDS_TRAVERSE_DISCOVERXLOCK 3 // succeeds on (foreign?) null, xlocked dentries. 
-#define MDS_TRAVERSE_FAIL 4 - - -struct metareqid_t { - uint64_t tid; - int32_t client; - int32_t _pad; - metareqid_t() : tid(0), client(-1), _pad(0) {} - metareqid_t(int c, tid_t t) : tid(t), client(c), _pad(0) {} -}; - -inline ostream& operator<<(ostream& out, const metareqid_t& r) { - return out << "client" << r.client << ":" << r.tid; -} - -inline bool operator==(const metareqid_t& l, const metareqid_t& r) { - return (l.client == r.client) && (l.tid == r.tid); -} -inline bool operator!=(const metareqid_t& l, const metareqid_t& r) { - return (l.client != r.client) || (l.tid != r.tid); -} -inline bool operator<(const metareqid_t& l, const metareqid_t& r) { - return (l.client < r.client) || - (l.client == r.client && l.tid < r.tid); -} -inline bool operator<=(const metareqid_t& l, const metareqid_t& r) { - return (l.client < r.client) || - (l.client == r.client && l.tid <= r.tid); -} -inline bool operator>(const metareqid_t& l, const metareqid_t& r) { return !(l <= r); } -inline bool operator>=(const metareqid_t& l, const metareqid_t& r) { return !(l < r); } - -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const metareqid_t &r) const { - hash H; - return H(r.client) ^ H(r.tid); - } - }; -} - - -// inode caps info for client reconnect -struct inode_caps_reconnect_t { - int32_t wanted; - int32_t issued; - off_t size; - utime_t mtime, atime; - - inode_caps_reconnect_t() {} - inode_caps_reconnect_t(int w, int i) : - wanted(w), issued(i), size(0) {} - inode_caps_reconnect_t(int w, int i, off_t sz, utime_t mt, utime_t at) : - wanted(w), issued(i), size(sz), mtime(mt), atime(at) {} -}; - - -// ================================================================ -// dir frag - -struct dirfrag_t { - inodeno_t ino; - frag_t frag; - uint32_t _pad; - - dirfrag_t() : ino(0), _pad(0) { } - dirfrag_t(inodeno_t i, frag_t f) : ino(i), frag(f), _pad(0) { } -}; - -inline ostream& operator<<(ostream& out, const dirfrag_t df) { - out << df.ino; - if 
(!df.frag.is_root()) out << "." << df.frag; - return out; -} -inline bool operator<(dirfrag_t l, dirfrag_t r) { - if (l.ino < r.ino) return true; - if (l.ino == r.ino && l.frag < r.frag) return true; - return false; -} -inline bool operator==(dirfrag_t l, dirfrag_t r) { - return l.ino == r.ino && l.frag == r.frag; -} - -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const dirfrag_t &df) const { - static rjhash H; - static rjhash I; - return H(df.ino) ^ I(df.frag); - } - }; -} - - - -// ================================================================ - -#define META_POP_IRD 0 -#define META_POP_IWR 1 -#define META_POP_READDIR 2 -#define META_POP_FETCH 3 -#define META_POP_STORE 4 -#define META_NPOP 5 - -class inode_load_vec_t { - static const int NUM = 2; - DecayCounter vec[NUM]; -public: - DecayCounter &get(int t) { - assert(t < NUM); - return vec[t]; - } - void zero(utime_t now) { - for (int i=0; i"; -} - -/* -inline mds_load_t& operator+=( mds_load_t& l, mds_load_t& r ) -{ - l.root_pop += r.root_pop; - l.req_rate += r.req_rate; - l.queue_len += r.queue_len; - return l; -} - -inline mds_load_t operator/( mds_load_t& a, double d ) -{ - mds_load_t r; - r.root_pop = a.root_pop / d; - r.req_rate = a.req_rate / d; - r.queue_len = a.queue_len / d; - return r; -} -*/ - - -class load_spread_t { -public: - static const int MAX = 4; - int last[MAX]; - int p, n; - DecayCounter count; - -public: - load_spread_t() : p(0), n(0) { - for (int i=0; i= 0 is the auth mds -#define CDIR_AUTH_PARENT -1 // default -#define CDIR_AUTH_UNKNOWN -2 -#define CDIR_AUTH_DEFAULT pair(-1, -2) -#define CDIR_AUTH_UNDEF pair(-2, -2) -//#define CDIR_AUTH_ROOTINODE pair( 0, -2) - - - -// print hack -struct mdsco_db_line_prefix { - MDSCacheObject *object; - mdsco_db_line_prefix(MDSCacheObject *o) : object(o) {} -}; -ostream& operator<<(ostream& out, mdsco_db_line_prefix o); - -// printer -ostream& operator<<(ostream& out, MDSCacheObject &o); - -class MDSCacheObjectInfo { -public: - 
inodeno_t ino; - dirfrag_t dirfrag; - string dname; - - MDSCacheObjectInfo() : ino(0) {} - - void _encode(bufferlist& bl) const { - ::_encode(ino, bl); - ::_encode(dirfrag, bl); - ::_encode(dname, bl); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(ino, bl, off); - ::_decode(dirfrag, bl, off); - ::_decode(dname, bl, off); - } - void _decode(bufferlist::iterator& p) { - ::_decode_simple(ino, p); - ::_decode_simple(dirfrag, p); - ::_decode_simple(dname, p); - } -}; - - -class MDSCacheObject { - public: - // -- pins -- - const static int PIN_REPLICATED = 1000; - const static int PIN_DIRTY = 1001; - const static int PIN_LOCK = -1002; - const static int PIN_REQUEST = -1003; - const static int PIN_WAITER = 1004; - const static int PIN_DIRTYSCATTERED = 1005; // make this neg if we start using multiple scatterlocks? - static const int PIN_AUTHPIN = 1006; - static const int PIN_PTRWAITER = -1007; - const static int PIN_TEMPEXPORTING = 1008; // temp pin between encode_ and finish_export - - const char *generic_pin_name(int p) { - switch (p) { - case PIN_REPLICATED: return "replicated"; - case PIN_DIRTY: return "dirty"; - case PIN_LOCK: return "lock"; - case PIN_REQUEST: return "request"; - case PIN_WAITER: return "waiter"; - case PIN_DIRTYSCATTERED: return "dirtyscattered"; - case PIN_AUTHPIN: return "authpin"; - case PIN_PTRWAITER: return "ptrwaiter"; - case PIN_TEMPEXPORTING: return "tempexporting"; - default: assert(0); return 0; - } - } - - // -- state -- - const static int STATE_AUTH = (1<<30); - const static int STATE_DIRTY = (1<<29); - const static int STATE_REJOINING = (1<<28); // replica has not joined w/ primary copy - - // -- wait -- - const static int WAIT_SINGLEAUTH = (1<<30); - const static int WAIT_UNFREEZE = (1<<29); // pka AUTHPINNABLE - - - // ============================================ - // cons - public: - MDSCacheObject() : - state(0), - ref(0), - replica_nonce(0) {} - virtual ~MDSCacheObject() {} - - // printing - virtual void 
print(ostream& out) = 0; - virtual ostream& print_db_line_prefix(ostream& out) { - return out << "mdscacheobject(" << this << ") "; - } - - // -------------------------------------------- - // state - protected: - unsigned state; // state bits - - public: - unsigned get_state() const { return state; } - unsigned state_test(unsigned mask) const { return (state & mask); } - void state_clear(unsigned mask) { state &= ~mask; } - void state_set(unsigned mask) { state |= mask; } - void state_reset(unsigned s) { state = s; } - - bool is_auth() const { return state_test(STATE_AUTH); } - bool is_dirty() const { return state_test(STATE_DIRTY); } - bool is_clean() const { return !is_dirty(); } - bool is_rejoining() const { return state_test(STATE_REJOINING); } - - // -------------------------------------------- - // authority - virtual pair authority() = 0; - bool is_ambiguous_auth() { - return authority().second != CDIR_AUTH_UNKNOWN; - } - - // -------------------------------------------- - // pins -protected: - int ref; // reference count -#ifdef MDS_REF_SET - multiset ref_set; -#endif - - public: - int get_num_ref() { return ref; } - virtual const char *pin_name(int by) = 0; - //bool is_pinned_by(int by) { return ref_set.count(by); } - //multiset& get_ref_set() { return ref_set; } - - virtual void last_put() {} - virtual void bad_put(int by) { -#ifdef MDS_REF_SET - assert(ref_set.count(by) > 0); -#endif - assert(ref > 0); - } - void put(int by) { -#ifdef MDS_REF_SET - if (ref == 0 || ref_set.count(by) == 0) { -#else - if (ref == 0) { -#endif - bad_put(by); - } else { - ref--; -#ifdef MDS_REF_SET - ref_set.erase(ref_set.find(by)); - assert(ref == (int)ref_set.size()); -#endif - if (ref == 0) - last_put(); - } - } - - virtual void first_get() {} - virtual void bad_get(int by) { -#ifdef MDS_REF_SET - assert(by < 0 || ref_set.count(by) == 0); -#endif - assert(0); - } - void get(int by) { -#ifdef MDS_REF_SET - if (by >= 0 && ref_set.count(by)) { - bad_get(by); - } else { 
-#endif - if (ref == 0) - first_get(); - ref++; -#ifdef MDS_REF_SET - ref_set.insert(by); - assert(ref == (int)ref_set.size()); - } -#endif - } - - void print_pin_set(ostream& out) { -#ifdef MDS_REF_SET - multiset::iterator it = ref_set.begin(); - while (it != ref_set.end()) { - out << " " << pin_name(*it); - int last = *it; - int c = 1; - do { - it++; - if (it == ref_set.end()) break; - } while (*it == last); - if (c > 1) - out << "*" << c; - } -#endif - } - - - // -------------------------------------------- - // auth pins - virtual bool can_auth_pin() = 0; - virtual void auth_pin() = 0; - virtual void auth_unpin() = 0; - - - // -------------------------------------------- - // replication - protected: - map replica_map; // [auth] mds -> nonce - int replica_nonce; // [replica] defined on replica - - public: - bool is_replicated() { return !replica_map.empty(); } - bool is_replica(int mds) { return replica_map.count(mds); } - int num_replicas() { return replica_map.size(); } - int add_replica(int mds) { - if (replica_map.count(mds)) - return ++replica_map[mds]; // inc nonce - if (replica_map.empty()) - get(PIN_REPLICATED); - return replica_map[mds] = 1; - } - void add_replica(int mds, int nonce) { - if (replica_map.empty()) - get(PIN_REPLICATED); - replica_map[mds] = nonce; - } - int get_replica_nonce(int mds) { - assert(replica_map.count(mds)); - return replica_map[mds]; - } - void remove_replica(int mds) { - assert(replica_map.count(mds)); - replica_map.erase(mds); - if (replica_map.empty()) - put(PIN_REPLICATED); - } - void clear_replica_map() { - if (!replica_map.empty()) - put(PIN_REPLICATED); - replica_map.clear(); - } - map::iterator replicas_begin() { return replica_map.begin(); } - map::iterator replicas_end() { return replica_map.end(); } - const map& get_replicas() { return replica_map; } - void list_replicas(set& ls) { - for (map::const_iterator p = replica_map.begin(); - p != replica_map.end(); - ++p) - ls.insert(p->first); - } - - int 
get_replica_nonce() { return replica_nonce;} - void set_replica_nonce(int n) { replica_nonce = n; } - - - // --------------------------------------------- - // waiting - protected: - multimap waiting; - - public: - bool is_waiter_for(int mask) { - return waiting.count(mask) > 0; // FIXME: not quite right. - } - virtual void add_waiter(int mask, Context *c) { - if (waiting.empty()) - get(PIN_WAITER); - waiting.insert(pair(mask, c)); - pdout(10,g_conf.debug_mds) << (mdsco_db_line_prefix(this)) - << "add_waiter " << hex << mask << dec << " " << c - << " on " << *this - << dendl; - - } - virtual void take_waiting(int mask, list& ls) { - if (waiting.empty()) return; - multimap::iterator it = waiting.begin(); - while (it != waiting.end()) { - if (it->first & mask) { - ls.push_back(it->second); - pdout(10,g_conf.debug_mds) << (mdsco_db_line_prefix(this)) - << "take_waiting mask " << hex << mask << dec << " took " << it->second - << " tag " << it->first - << " on " << *this - << dendl; - waiting.erase(it++); - } else { - pdout(10,g_conf.debug_mds) << "take_waiting mask " << hex << mask << dec << " SKIPPING " << it->second - << " tag " << it->first - << " on " << *this - << dendl; - it++; - } - } - if (waiting.empty()) - put(PIN_WAITER); - } - void finish_waiting(int mask, int result = 0) { - list finished; - take_waiting(mask, finished); - finish_contexts(finished, result); - } - - - // --------------------------------------------- - // locking - // noop unless overloaded. 
- virtual SimpleLock* get_lock(int type) { assert(0); return 0; } - virtual void set_object_info(MDSCacheObjectInfo &info) { assert(0); } - virtual void encode_lock_state(int type, bufferlist& bl) { assert(0); } - virtual void decode_lock_state(int type, bufferlist& bl) { assert(0); } - virtual void finish_lock_waiters(int type, int mask, int r=0) { assert(0); } - virtual void add_lock_waiter(int type, int mask, Context *c) { assert(0); } - virtual bool is_lock_waiting(int type, int mask) { assert(0); return false; } - - virtual void clear_dirty_scattered(int type) { assert(0); } - - // --------------------------------------------- - // ordering - virtual bool is_lt(const MDSCacheObject *r) const = 0; - struct ptr_lt { - bool operator()(const MDSCacheObject* l, const MDSCacheObject* r) const { - return l->is_lt(r); - } - }; - -}; - -inline ostream& operator<<(ostream& out, MDSCacheObject &o) { - o.print(out); - return out; -} - -inline ostream& operator<<(ostream& out, const MDSCacheObjectInfo &info) { - if (info.ino) return out << info.ino; - if (info.dname.length()) return out << info.dirfrag << "/" << info.dname; - return out << info.dirfrag; -} - -inline ostream& operator<<(ostream& out, mdsco_db_line_prefix o) { - o.object->print_db_line_prefix(out); - return out; -} - - -#endif diff --git a/branches/sage/ebofs2/messages/MAnchor.h b/branches/sage/ebofs2/messages/MAnchor.h deleted file mode 100644 index 6ceb8981244fa..0000000000000 --- a/branches/sage/ebofs2/messages/MAnchor.h +++ /dev/null @@ -1,74 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MANCHORREQUEST_H -#define __MANCHORREQUEST_H - -#include - -#include "msg/Message.h" -#include "mds/Anchor.h" - - -class MAnchor : public Message { - int op; - inodeno_t ino; - vector trace; - version_t atid; // anchor table version. - - public: - MAnchor() {} - MAnchor(int o, inodeno_t i, version_t v=0) : - Message(MSG_MDS_ANCHOR), - op(o), ino(i), atid(v) { } - - virtual char *get_type_name() { return "anchor"; } - void print(ostream& o) { - o << "anchor(" << get_anchor_opname(op); - if (ino) o << " " << ino; - if (atid) o << " atid " << atid; - if (!trace.empty()) o << ' ' << trace; - o << ")"; - } - - void set_trace(vector& trace) { - this->trace = trace; - } - - int get_op() { return op; } - inodeno_t get_ino() { return ino; } - vector& get_trace() { return trace; } - version_t get_atid() { return atid; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(op), (char*)&op); - off += sizeof(op); - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - payload.copy(off, sizeof(atid), (char*)&atid); - off += sizeof(atid); - ::_decode(trace, payload, off); - } - - virtual void encode_payload() { - payload.append((char*)&op, sizeof(op)); - payload.append((char*)&ino, sizeof(ino)); - payload.append((char*)&atid, sizeof(atid)); - ::_encode(trace, payload); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MCacheExpire.h b/branches/sage/ebofs2/messages/MCacheExpire.h deleted file mode 100644 index 015aa562038a7..0000000000000 --- a/branches/sage/ebofs2/messages/MCacheExpire.h +++ /dev/null @@ -1,127 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. 
See file COPYING. - * - */ - -#ifndef __MCACHEEXPIRE_H -#define __MCACHEEXPIRE_H - -class MCacheExpire : public Message { - int from; - -public: - /* - group things by realm (auth delgation root), since that's how auth is determined. - that makes it less work to process when exports are in progress. - */ - struct realm { - map inodes; - map dirs; - map > dentries; - }; - map realms; - - int get_from() { return from; } - - MCacheExpire() {} - MCacheExpire(int f) : - Message(MSG_MDS_CACHEEXPIRE), - from(f) { } - - virtual char *get_type_name() { return "CEx";} - - void add_inode(dirfrag_t r, inodeno_t ino, int nonce) { - realms[r].inodes[ino] = nonce; - } - void add_dir(dirfrag_t r, dirfrag_t df, int nonce) { - realms[r].dirs[df] = nonce; - } - void add_dentry(dirfrag_t r, dirfrag_t df, const string& dn, int nonce) { - realms[r].dentries[df][dn] = nonce; - } - - void add_realm(dirfrag_t df, realm& r) { - realm& myr = realms[df]; - for (map::iterator p = r.inodes.begin(); - p != r.inodes.end(); - ++p) - myr.inodes[p->first] = p->second; - for (map::iterator p = r.dirs.begin(); - p != r.dirs.end(); - ++p) - myr.dirs[p->first] = p->second; - for (map >::iterator p = r.dentries.begin(); - p != r.dentries.end(); - ++p) - for (map::iterator q = p->second.begin(); - q != p->second.end(); - ++q) - myr.dentries[p->first][q->first] = q->second; - } - - void decode_payload() { - int off = 0; - - payload.copy(off, sizeof(from), (char*)&from); - off += sizeof(from); - - int nr; - payload.copy(off, sizeof(nr), (char*)&nr); - off += sizeof(nr); - - while (nr--) { - dirfrag_t r; - payload.copy(off, sizeof(r), (char*)&r); - off += sizeof(r); - - ::_decode(realms[r].inodes, payload, off); - ::_decode(realms[r].dirs, payload, off); - - int n; - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i::iterator q = realms.begin(); - q != realms.end(); - ++q) { - payload.append((char*)&q->first, sizeof(q->first)); - - ::_encode(q->second.inodes, payload); - 
::_encode(q->second.dirs, payload); - - int n = q->second.dentries.size(); - payload.append((char*)&n, sizeof(n)); - for (map >::iterator p = q->second.dentries.begin(); - p != q->second.dentries.end(); - ++p) { - payload.append((char*)&p->first, sizeof(p->first)); - ::_encode(p->second, payload); - } - } - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MClientFileCaps.h b/branches/sage/ebofs2/messages/MClientFileCaps.h deleted file mode 100644 index 979be331e5ce8..0000000000000 --- a/branches/sage/ebofs2/messages/MClientFileCaps.h +++ /dev/null @@ -1,109 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MCLIENTFILECAPS_H -#define __MCLIENTFILECAPS_H - -#include "msg/Message.h" -#include "mds/Capability.h" - -class MClientFileCaps : public Message { - public: - static const int OP_GRANT = 0; // mds->client grant. 
- static const int OP_ACK = 1; // client->mds ack (if prior grant was a recall) - static const int OP_RELEASE = 2; // mds closed the cap - static const int OP_STALE = 3; // mds has exported the cap - static const int OP_REAP = 4; // mds has imported the cap from get_mds() - static const char* get_opname(int op) { - switch (op) { - case OP_GRANT: return "grant"; - case OP_ACK: return "ack"; - case OP_RELEASE: return "release"; - case OP_STALE: return "stale"; - case OP_REAP: return "reap"; - default: assert(0); return 0; - } - } - - private: - int32_t op; - inode_t inode; - capseq_t seq; - int32_t caps; - int32_t wanted; - - int32_t mds; - - public: - inodeno_t get_ino() { return inode.ino; } - inode_t& get_inode() { return inode; } - int get_caps() { return caps; } - int get_wanted() { return wanted; } - capseq_t get_seq() { return seq; } - - // for cap migration - int get_mds() { return mds; } - int get_op() { return op; } - - void set_caps(int c) { caps = c; } - void set_wanted(int w) { wanted = w; } - - void set_mds(int m) { mds = m; } - void set_op(int s) { op = s; } - - MClientFileCaps() {} - MClientFileCaps(int op_, - inode_t& inode_, - long seq_, - int caps_, - int wanted_, - int mds_=0) : - Message(MSG_CLIENT_FILECAPS), - op(op_), - inode(inode_), - seq(seq_), - caps(caps_), - wanted(wanted_), - mds(mds_) { } - - char *get_type_name() { return "Cfcap";} - void print(ostream& out) { - out << "client_file_caps(" << get_opname(op) - << " " << inode.ino - << " seq " << seq - << " caps " << cap_string(caps) - << " wanted" << cap_string(wanted) - << ")"; - } - - void decode_payload() { - int off = 0; - ::_decode(op, payload, off); - ::_decode(seq, payload, off); - ::_decode(inode, payload, off); - ::_decode(caps, payload, off); - ::_decode(wanted, payload, off); - ::_decode(mds, payload, off); - } - void encode_payload() { - ::_encode(op, payload); - ::_encode(seq, payload); - ::_encode(inode, payload); - ::_encode(caps, payload); - ::_encode(wanted, payload); - 
::_encode(mds, payload); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MClientMount.h b/branches/sage/ebofs2/messages/MClientMount.h deleted file mode 100644 index a49b558c7f040..0000000000000 --- a/branches/sage/ebofs2/messages/MClientMount.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MCLIENTMOUNT_H -#define __MCLIENTMOUNT_H - -#include "msg/Message.h" - -class MClientMount : public Message { -public: - entity_addr_t addr; - int32_t instance; // on this node - - MClientMount() : Message(MSG_CLIENT_MOUNT) { } - MClientMount(entity_addr_t a, int i = 0) : - Message(MSG_CLIENT_MOUNT), - addr(a), instance(i) { } - - char *get_type_name() { return "client_mount"; } - - void decode_payload() { - int off = 0; - ::_decode(addr, payload, off); - ::_decode(instance, payload, off); - } - void encode_payload() { - ::_encode(addr, payload); - ::_encode(instance, payload); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MClientReconnect.h b/branches/sage/ebofs2/messages/MClientReconnect.h deleted file mode 100644 index bf1fbacd4b75c..0000000000000 --- a/branches/sage/ebofs2/messages/MClientReconnect.h +++ /dev/null @@ -1,59 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MCLIENTRECONNECT_H -#define __MCLIENTRECONNECT_H - -#include "msg/Message.h" -#include "mds/mdstypes.h" - -class MClientReconnect : public Message { -public: - map inode_caps; - map inode_path; - bool closed; // true if this session was closed by the client. - - MClientReconnect() : Message(MSG_CLIENT_RECONNECT), - closed(false) { } - - char *get_type_name() { return "client_reconnect"; } - void print(ostream& out) { - out << "client_reconnect(" << inode_caps.size() << " caps)"; - } - - void add_inode_caps(inodeno_t ino, - int wanted, int issued, - off_t sz, utime_t mt, utime_t at) { - inode_caps[ino] = inode_caps_reconnect_t(wanted, issued, sz, mt, at); - } - void add_inode_path(inodeno_t ino, const string& path) { - inode_path[ino] = path; - } - - void encode_payload() { - ::_encode(closed, payload); - ::_encode(inode_caps, payload); - ::_encode(inode_path, payload); - } - void decode_payload() { - int off = 0; - ::_decode(closed, payload, off); - ::_decode(inode_caps, payload, off); - ::_decode(inode_path, payload, off); - } - -}; - - -#endif diff --git a/branches/sage/ebofs2/messages/MClientReply.h b/branches/sage/ebofs2/messages/MClientReply.h deleted file mode 100644 index 760dcc971ebad..0000000000000 --- a/branches/sage/ebofs2/messages/MClientReply.h +++ /dev/null @@ -1,285 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MCLIENTREPLY_H -#define __MCLIENTREPLY_H - -#include "include/types.h" -#include "include/encodable.h" -#include "MClientRequest.h" - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "mds/CDir.h" -#include "mds/CDentry.h" - -#include -using namespace std; - -class CInode; - -/*** - * - * MClientReply - container message for MDS reply to a client's MClientRequest - * - * key fields: - * long tid - transaction id, so the client can match up with pending request - * int result - error code, or fh if it was open - * - * for most requests: - * trace is a vector of InodeStat's tracing from root to the file/dir/whatever - * the operation referred to, so that the client can update it's info about what - * metadata lives on what MDS. - * - * for readdir replies: - * dir_contents is a vector of InodeStat*'s. - * - * that's mostly it, i think! - * - */ - -struct DirStat { - // mds distribution hints - frag_t frag; - int auth; - set dist; - bool is_rep; - - DirStat() {} - DirStat(bufferlist::iterator& p) { - _decode(p); - } - - void _decode(bufferlist::iterator& p) { - ::_decode_simple(frag, p); - ::_decode_simple(auth, p); - ::_decode_simple(dist, p); - ::_decode_simple(is_rep, p); - } - - static void _encode(bufferlist& bl, CDir *dir, int whoami) { - frag_t frag = dir->get_frag(); - int auth; - set dist; - bool is_rep; - - auth = dir->get_dir_auth().first; - if (dir->is_auth()) - dir->get_dist_spec(dist, whoami); - is_rep = dir->is_rep(); - - ::_encode_simple(frag, bl); - ::_encode_simple(auth, bl); - ::_encode_simple(dist, bl); - ::_encode_simple(is_rep, bl); - } -}; - -struct InodeStat { - inode_t inode; - string symlink; // symlink content (if symlink) - fragtree_t dirfragtree; - uint32_t mask; - - public: - InodeStat() {} - InodeStat(bufferlist::iterator& p) { - _decode(p); - } - - void _decode(bufferlist::iterator &p) { - ::_decode_simple(mask, p); - ::_decode_simple(inode, p); - ::_decode_simple(symlink, p); - dirfragtree._decode(p); 
- } - - static void _encode(bufferlist &bl, CInode *in) { - int mask = STAT_MASK_INO|STAT_MASK_TYPE|STAT_MASK_BASE; - - // mask - if (in->authlock.can_rdlock(0)) mask |= STAT_MASK_AUTH; - if (in->linklock.can_rdlock(0)) mask |= STAT_MASK_LINK; - if (in->filelock.can_rdlock(0)) mask |= STAT_MASK_FILE; - - ::_encode_simple(mask, bl); - ::_encode_simple(in->inode, bl); - ::_encode_simple(in->symlink, bl); - in->dirfragtree._encode(bl); - } - -}; - - -class MClientReply : public Message { - // reply data - struct st_ { - long tid; - int op; - int result; // error code - unsigned char file_caps; // for open - long file_caps_seq; - uint64_t file_data_version; // for client buffercache consistency - } st; - - string path; - - list trace_in; - list trace_dir; - list trace_dn; - bufferlist trace_bl; - - DirStat *dir_dir; - list dir_in; - list dir_dn; - bufferlist dir_bl; - - public: - long get_tid() { return st.tid; } - int get_op() { return st.op; } - - int get_result() { return st.result; } - const string& get_path() { return path; } - - inodeno_t get_ino() { return trace_in.back()->inode.ino; } - const inode_t& get_inode() { return trace_in.back()->inode; } - - unsigned char get_file_caps() { return st.file_caps; } - long get_file_caps_seq() { return st.file_caps_seq; } - uint64_t get_file_data_version() { return st.file_data_version; } - - void set_result(int r) { st.result = r; } - void set_file_caps(unsigned char c) { st.file_caps = c; } - void set_file_caps_seq(long s) { st.file_caps_seq = s; } - void set_file_data_version(uint64_t v) { st.file_data_version = v; } - - MClientReply() : dir_dir(0) {}; - MClientReply(MClientRequest *req, int result = 0) : - Message(MSG_CLIENT_REPLY), dir_dir(0) { - memset(&st, 0, sizeof(st)); - this->st.tid = req->get_tid(); - this->st.op = req->get_op(); - this->path = req->get_path(); - - this->st.result = result; - } - virtual ~MClientReply() { - list::iterator it; - - for (it = trace_in.begin(); it != trace_in.end(); ++it) - delete 
*it; - for (it = dir_in.begin(); it != dir_in.end(); ++it) - delete *it; - } - virtual char *get_type_name() { return "creply"; } - void print(ostream& o) { - o << "creply(" << env.dst.name << "." << st.tid; - o << " = " << st.result; - if (st.result <= 0) - o << " " << strerror(-st.result); - o << ")"; - } - - // serialization - virtual void decode_payload() { - bufferlist::iterator p = payload.begin(); - ::_decode_simple(st, p); - ::_decode_simple(path, p); - ::_decode_simple(trace_bl, p); - ::_decode_simple(dir_bl, p); - assert(p.end()); - } - virtual void encode_payload() { - ::_encode_simple(st, payload); - ::_encode_simple(path, payload); - ::_encode_simple(trace_bl, payload); - ::_encode_simple(dir_bl, payload); - } - - - // dir contents - void take_dir_items(bufferlist& bl) { - dir_bl.claim(bl); - } - void _decode_dir() { - bufferlist::iterator p = dir_bl.begin(); - dir_dir = new DirStat(p); - while (!p.end()) { - string dn; - ::_decode_simple(dn, p); - dir_dn.push_back(dn); - dir_in.push_back(new InodeStat(p)); - } - } - - const list& get_dir_in() { - if (dir_in.empty() && dir_bl.length()) _decode_dir(); - return dir_in; - } - const list& get_dir_dn() { - if (dir_dn.empty() && dir_bl.length()) _decode_dir(); - return dir_dn; - } - const DirStat* get_dir_dir() { - return dir_dir; - } - - - // trace - void set_trace_dist(CInode *in, int whoami) { - // inode, dentry, dir, ..., inode - while (in) { - InodeStat::_encode(trace_bl, in); - CDentry *dn = in->get_parent_dn(); - if (!dn) break; - ::_encode_simple(in->get_parent_dn()->get_name(), trace_bl); - DirStat::_encode(trace_bl, dn->get_dir(), whoami); - in = dn->get_dir()->get_inode(); - } - } - void _decode_trace() { - bufferlist::iterator p = trace_bl.begin(); - while (!p.end()) { - // inode - trace_in.push_front(new InodeStat(p)); - if (!p.end()) { - // dentry - string ref_dn; - ::_decode_simple(ref_dn, p); - trace_dn.push_front(ref_dn); - - // dir - trace_dir.push_front(new DirStat(p)); - } - } - } - - 
const list& get_trace_in() { - if (trace_in.empty() && trace_bl.length()) _decode_trace(); - return trace_in; - } - const list& get_trace_dir() { - if (trace_in.empty() && trace_bl.length()) _decode_trace(); - return trace_dir; - } - const list& get_trace_dn() { - if (trace_in.empty() && trace_bl.length()) _decode_trace(); - return trace_dn; - } - - -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MClientRequest.h b/branches/sage/ebofs2/messages/MClientRequest.h deleted file mode 100644 index 8f03044cf5a4f..0000000000000 --- a/branches/sage/ebofs2/messages/MClientRequest.h +++ /dev/null @@ -1,325 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MCLIENTREQUEST_H -#define __MCLIENTREQUEST_H - -/** - * - * MClientRequest - container for a client METADATA request. created/sent by clients. - * can be forwarded around between MDS's. - * - * int client - the originating client - * long tid - transaction id, unique among requests for that client. probably just a counter! - * -> the MDS passes the Request to the Reply constructor, so this always matches. - * - * int op - the metadata op code. MDS_OP_RENAME, etc. - * int caller_uid, _gid - guess - * - * fixed size arguments are in a union. - * there's also a string argument, for e.g. symlink(). - * - */ - -#include "msg/Message.h" -#include "include/filepath.h" -#include "mds/mdstypes.h" - -#include -#include -#include -#include -#include - - -// metadata ops. -// >=1000 --> an update, non-idempotent (i.e. 
an update) -#define MDS_OP_STATFS 1 - -#define MDS_OP_STAT 100 -#define MDS_OP_LSTAT 101 -#define MDS_OP_FSTAT 102 -#define MDS_OP_UTIME 1102 -#define MDS_OP_CHMOD 1104 -#define MDS_OP_CHOWN 1105 - -#define MDS_OP_READDIR 200 -#define MDS_OP_MKNOD 1201 -#define MDS_OP_LINK 1202 -#define MDS_OP_UNLINK 1203 -#define MDS_OP_RENAME 1204 - -#define MDS_OP_MKDIR 1220 -#define MDS_OP_RMDIR 1221 -#define MDS_OP_SYMLINK 1222 - -#define MDS_OP_OPEN 301 -#define MDS_OP_TRUNCATE 1306 -#define MDS_OP_FSYNC 307 - -#define MDS_OP_RELEASE 308 // used only by SyntheticClient op_dist thinger - - -class MClientRequest : public Message { - struct { - tid_t tid; - tid_t oldest_client_tid; - int num_fwd; - int retry_attempt; - inodeno_t mds_wants_replica_in_dirino; - - entity_inst_t client_inst; - - int op; - int caller_uid, caller_gid; - inodeno_t cwd_ino; - } st; - - // path arguments - filepath path; - string sarg; - - public: - // fixed size arguments. in a union. - // note: nothing with a constructor can go here; use underlying base - // types for _inodeno_t, _frag_t. 
- union { - struct { - int mask; - } stat; - struct { - _inodeno_t ino; - int mask; - } fstat; - struct { - _frag_t frag; - } readdir; - struct { - _utime_t mtime; - _utime_t atime; - } utime; - struct { - mode_t mode; - } chmod; - struct { - uid_t uid; - gid_t gid; - } chown; - struct { - mode_t mode; - dev_t rdev; - } mknod; - struct { - mode_t mode; - } mkdir; - struct { - int flags; - mode_t mode; - } open; - struct { - _inodeno_t ino; // optional - off_t length; - } truncate; - struct { - _inodeno_t ino; - } fsync; - } args; - - // cons - MClientRequest() : Message(MSG_CLIENT_REQUEST) {} - MClientRequest(int op, entity_inst_t ci) : Message(MSG_CLIENT_REQUEST) { - memset(&st, 0, sizeof(st)); - memset(&args, 0, sizeof(args)); - this->st.op = op; - this->st.client_inst = ci; - } - - metareqid_t get_reqid() { - // FIXME: for now, assume clients always have 1 incarnation - return metareqid_t(st.client_inst.name.num(), st.tid); - } - - int get_open_file_mode() { - if (args.open.flags & O_LAZY) - return FILE_MODE_LAZY; - if (args.open.flags & O_WRONLY) - return FILE_MODE_W; - if (args.open.flags & O_RDWR) - return FILE_MODE_RW; - if (args.open.flags & O_APPEND) - return FILE_MODE_W; - return FILE_MODE_R; - } - bool open_file_mode_is_readonly() { - return get_open_file_mode() == FILE_MODE_R; - } - bool is_idempotent() { - if (st.op == MDS_OP_OPEN) - return open_file_mode_is_readonly(); - return (st.op < 1000); - } - bool auth_is_best() { - if (!is_idempotent()) return true; - if (st.op == MDS_OP_READDIR) return true; - return false; - } - bool follow_trailing_symlink() { - switch (st.op) { - case MDS_OP_LSTAT: - case MDS_OP_FSTAT: - case MDS_OP_LINK: - case MDS_OP_UNLINK: - case MDS_OP_RENAME: - return false; - - case MDS_OP_STAT: - case MDS_OP_UTIME: - case MDS_OP_CHMOD: - case MDS_OP_CHOWN: - case MDS_OP_READDIR: - case MDS_OP_OPEN: - case MDS_OP_TRUNCATE: - - case MDS_OP_FSYNC: - case MDS_OP_MKNOD: - case MDS_OP_MKDIR: - case MDS_OP_RMDIR: - case MDS_OP_SYMLINK: - 
return true; - - default: - assert(0); - return false; - } - } - - - - // normal fields - void set_tid(tid_t t) { st.tid = t; } - void set_oldest_client_tid(tid_t t) { st.oldest_client_tid = t; } - void inc_num_fwd() { st.num_fwd++; } - void set_retry_attempt(int a) { st.retry_attempt = a; } - void set_path(string& p) { path.set_path(p); } - void set_path(const char *p) { path.set_path(p); } - void set_path(const filepath& fp) { path = fp; } - void set_caller_uid(int u) { st.caller_uid = u; } - void set_caller_gid(int g) { st.caller_gid = g; } - void set_sarg(string& arg) { this->sarg = arg; } - void set_sarg(const char *arg) { this->sarg = arg; } - void set_mds_wants_replica_in_dirino(inodeno_t dirino) { - st.mds_wants_replica_in_dirino = dirino; } - - void set_client_inst(const entity_inst_t& i) { st.client_inst = i; } - const entity_inst_t& get_client_inst() { return st.client_inst; } - - int get_client() { return st.client_inst.name.num(); } - tid_t get_tid() { return st.tid; } - tid_t get_oldest_client_tid() { return st.oldest_client_tid; } - int get_num_fwd() { return st.num_fwd; } - int get_retry_attempt() { return st.retry_attempt; } - int get_op() { return st.op; } - int get_caller_uid() { return st.caller_uid; } - int get_caller_gid() { return st.caller_gid; } - //inodeno_t get_ino() { return st.ino; } - const string& get_path() { return path.get_path(); } - filepath& get_filepath() { return path; } - string& get_sarg() { return sarg; } - inodeno_t get_mds_wants_replica_in_dirino() { - return st.mds_wants_replica_in_dirino; } - - inodeno_t get_cwd_ino() { return st.cwd_ino ? 
st.cwd_ino:inodeno_t(MDS_INO_ROOT); } - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - payload.copy(off, sizeof(args), (char*)&args); - off += sizeof(args); - path._decode(payload, off); - ::_decode(sarg, payload, off); - } - - void encode_payload() { - payload.append((char*)&st, sizeof(st)); - payload.append((char*)&args, sizeof(args)); - path._encode(payload); - ::_encode(sarg, payload); - } - - char *get_type_name() { return "creq"; } - void print(ostream& out) { - out << "clientreq(client" << get_client() - << "." << get_tid() - << " "; - switch(get_op()) { - case MDS_OP_STATFS: - out << "statfs"; break; - - case MDS_OP_STAT: - out << "stat"; break; - case MDS_OP_LSTAT: - out << "lstat"; break; - case MDS_OP_FSTAT: - out << "fstat"; break; - case MDS_OP_UTIME: - out << "utime"; break; - case MDS_OP_CHMOD: - out << "chmod"; break; - case MDS_OP_CHOWN: - out << "chown"; break; - - case MDS_OP_READDIR: - out << "readdir"; break; - case MDS_OP_MKNOD: - out << "mknod"; break; - case MDS_OP_LINK: - out << "link"; break; - case MDS_OP_UNLINK: - out << "unlink"; break; - case MDS_OP_RENAME: - out << "rename"; break; - - case MDS_OP_MKDIR: - out << "mkdir"; break; - case MDS_OP_RMDIR: - out << "rmdir"; break; - case MDS_OP_SYMLINK: - out << "symlink"; break; - - case MDS_OP_OPEN: - out << "open"; break; - case MDS_OP_TRUNCATE: - out << "truncate"; break; - case MDS_OP_FSYNC: - out << "fsync"; break; - // case MDS_OP_RELEASE: - //out << "release"; break; - default: - out << "unknown=" << get_op(); - assert(0); - } - if (get_path().length()) - out << " " << get_path(); - if (get_sarg().length()) - out << " " << get_sarg(); - if (st.retry_attempt) - out << " RETRY=" << st.retry_attempt; - out << ")"; - } - -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MClientRequestForward.h b/branches/sage/ebofs2/messages/MClientRequestForward.h deleted file mode 100644 index 53fb5270d30a9..0000000000000 --- 
a/branches/sage/ebofs2/messages/MClientRequestForward.h +++ /dev/null @@ -1,59 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MCLIENTREQUESTFORWARD_H -#define __MCLIENTREQUESTFORWARD_H - -class MClientRequestForward : public Message { - tid_t tid; - int32_t dest_mds; - int32_t num_fwd; - - public: - MClientRequestForward() : Message(MSG_CLIENT_REQUEST_FORWARD) {} - MClientRequestForward(tid_t t, int dm, int nf) : - Message(MSG_CLIENT_REQUEST_FORWARD), - tid(t), dest_mds(dm), num_fwd(nf) { } - - tid_t get_tid() { return tid; } - int get_dest_mds() { return dest_mds; } - int get_num_fwd() { return num_fwd; } - - char *get_type_name() { return "cfwd"; } - void print(ostream& o) { - o << "client_request_forward(" << tid - << " to " << dest_mds - << " num_fwd=" << num_fwd - << ")"; - } - - void encode_payload() { - payload.append((char*)&tid, sizeof(tid)); - payload.append((char*)&dest_mds, sizeof(dest_mds)); - payload.append((char*)&num_fwd, sizeof(num_fwd)); - } - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(tid), (char*)&tid); - off += sizeof(tid); - payload.copy(off, sizeof(dest_mds), (char*)&dest_mds); - off += sizeof(dest_mds); - payload.copy(off, sizeof(num_fwd), (char*)&num_fwd); - off += sizeof(num_fwd); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MClientSession.h b/branches/sage/ebofs2/messages/MClientSession.h deleted file mode 100644 index dc4252ac73d8e..0000000000000 --- a/branches/sage/ebofs2/messages/MClientSession.h +++ /dev/null @@ -1,62 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: 
ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MCLIENTSESSION_H -#define __MCLIENTSESSION_H - -#include "msg/Message.h" - -class MClientSession : public Message { -public: - const static int OP_REQUEST_OPEN = 1; - const static int OP_OPEN = 2; - const static int OP_REQUEST_CLOSE = 3; - const static int OP_CLOSE = 4; - static const char *get_opname(int o) { - switch (o) { - case OP_REQUEST_OPEN: return "request_open"; - case OP_OPEN: return "open"; - case OP_REQUEST_CLOSE: return "request_close"; - case OP_CLOSE: return "close"; - default: assert(0); return 0; - } - } - - int32_t op; - version_t seq; - - MClientSession() : Message(MSG_CLIENT_SESSION) { } - MClientSession(int o, version_t s=0) : - Message(MSG_CLIENT_SESSION), - op(o), seq(s) { } - - char *get_type_name() { return "client_session"; } - void print(ostream& out) { - out << "client_session(" << get_opname(op); - if (seq) out << " seq " << seq; - out << ")"; - } - - void decode_payload() { - int off = 0; - ::_decode(op, payload, off); - ::_decode(seq, payload, off); - } - void encode_payload() { - ::_encode(op, payload); - ::_encode(seq, payload); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MClientUnmount.h b/branches/sage/ebofs2/messages/MClientUnmount.h deleted file mode 100644 index 42fa07db7ba05..0000000000000 --- a/branches/sage/ebofs2/messages/MClientUnmount.h +++ /dev/null @@ -1,40 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser 
General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MCLIENTUNMOUNT_H -#define __MCLIENTUNMOUNT_H - -#include "msg/Message.h" - -class MClientUnmount : public Message { -public: - entity_inst_t inst; - - MClientUnmount() : Message(MSG_CLIENT_UNMOUNT) { } - MClientUnmount(entity_inst_t i) : - Message(MSG_CLIENT_UNMOUNT), - inst(i) { } - - char *get_type_name() { return "client_unmount"; } - - void decode_payload() { - int off = 0; - ::_decode(inst, payload, off); - } - void encode_payload() { - ::_encode(inst, payload); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MDentryUnlink.h b/branches/sage/ebofs2/messages/MDentryUnlink.h deleted file mode 100644 index 6e24d6f45410f..0000000000000 --- a/branches/sage/ebofs2/messages/MDentryUnlink.h +++ /dev/null @@ -1,82 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MDENTRYUNLINK_H -#define __MDENTRYUNLINK_H - -class MDentryUnlink : public Message { - dirfrag_t dirfrag; - string dn; - - public: - dirfrag_t get_dirfrag() { return dirfrag; } - string& get_dn() { return dn; } - - CInodeDiscover *strayin; - CDirDiscover *straydir; - CDentryDiscover *straydn; - - MDentryUnlink() : - Message(MSG_MDS_DENTRYUNLINK), - strayin(0), straydir(0), straydn(0) { } - MDentryUnlink(dirfrag_t df, string& n) : - Message(MSG_MDS_DENTRYUNLINK), - dirfrag(df), - dn(n), - strayin(0), straydir(0), straydn(0) { } - ~MDentryUnlink() { - delete strayin; - delete straydir; - delete straydn; - } - - char *get_type_name() { return "dentry_unlink";} - void print(ostream& o) { - o << "dentry_unlink(" << dirfrag << " " << dn << ")"; - } - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - ::_decode(dn, payload, off); - - bool isstray; - payload.copy(off, sizeof(isstray), (char*)&isstray); - off += sizeof(isstray); - if (isstray) { - strayin = new CInodeDiscover; - strayin->_decode(payload, off); - straydir = new CDirDiscover; - straydir->_decode(payload, off); - straydn = new CDentryDiscover; - straydn->_decode(payload, off); - } - } - void encode_payload() { - payload.append((char*)&dirfrag,sizeof(dirfrag)); - ::_encode(dn, payload); - - bool isstray = strayin ? 
true:false; - payload.append((char*)&isstray, sizeof(isstray)); - if (isstray) { - strayin->_encode(payload); - straydir->_encode(payload); - straydn->_encode(payload); - } - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MDirUpdate.h b/branches/sage/ebofs2/messages/MDirUpdate.h deleted file mode 100644 index 0db32208efd45..0000000000000 --- a/branches/sage/ebofs2/messages/MDirUpdate.h +++ /dev/null @@ -1,71 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MDIRUPDATE_H -#define __MDIRUPDATE_H - -#include "msg/Message.h" - -class MDirUpdate : public Message { - struct { - dirfrag_t dirfrag; - int dir_rep; - int discover; - } st; - set dir_rep_by; - string path; - - public: - dirfrag_t get_dirfrag() { return st.dirfrag; } - int get_dir_rep() { return st.dir_rep; } - set& get_dir_rep_by() { return dir_rep_by; } - bool should_discover() { return st.discover > 0; } - string& get_path() { return path; } - - void tried_discover() { - if (st.discover) st.discover--; - } - - MDirUpdate() {} - MDirUpdate(dirfrag_t dirfrag, - int dir_rep, - set& dir_rep_by, - string& path, - bool discover = false) : - Message(MSG_MDS_DIRUPDATE) { - this->st.dirfrag = dirfrag; - this->st.dir_rep = dir_rep; - this->dir_rep_by = dir_rep_by; - if (discover) this->st.discover = 5; - this->path = path; - } - virtual char *get_type_name() { return "dir_update"; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - ::_decode(dir_rep_by, payload, off); - ::_decode(path, payload, off); - } - - virtual void encode_payload() { - 
payload.append((char*)&st, sizeof(st)); - ::_encode(dir_rep_by, payload); - ::_encode(path, payload); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MDiscover.h b/branches/sage/ebofs2/messages/MDiscover.h deleted file mode 100644 index 7294bad22d796..0000000000000 --- a/branches/sage/ebofs2/messages/MDiscover.h +++ /dev/null @@ -1,108 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MDISCOVER_H -#define __MDISCOVER_H - -#include "msg/Message.h" -#include "mds/CDir.h" -#include "include/filepath.h" - -#include -#include -using namespace std; - - -class MDiscover : public Message { - int asker; - inodeno_t base_ino; // 1 -> root - frag_t base_dir_frag; - - filepath want; // ... 
[/]need/this/stuff - inodeno_t want_ino; - - bool want_base_dir; - bool want_xlocked; - - public: - int get_asker() { return asker; } - inodeno_t get_base_ino() { return base_ino; } - frag_t get_base_dir_frag() { return base_dir_frag; } - - filepath& get_want() { return want; } - inodeno_t get_want_ino() { return want_ino; } - const string& get_dentry(int n) { return want[n]; } - - bool wants_base_dir() { return want_base_dir; } - bool wants_xlocked() { return want_xlocked; } - - void set_base_dir_frag(frag_t f) { base_dir_frag = f; } - - MDiscover() { } - MDiscover(int asker_, - inodeno_t base_ino_, - filepath& want_, - bool want_base_dir_ = true, - bool discover_xlocks_ = false) : - Message(MSG_MDS_DISCOVER), - asker(asker_), - base_ino(base_ino_), - want(want_), - want_ino(0), - want_base_dir(want_base_dir_), - want_xlocked(discover_xlocks_) { } - MDiscover(int asker_, - dirfrag_t base_dirfrag, - inodeno_t want_ino_, - bool want_base_dir_ = true) : - Message(MSG_MDS_DISCOVER), - asker(asker_), - base_ino(base_dirfrag.ino), - base_dir_frag(base_dirfrag.frag), - want_ino(want_ino_), - want_base_dir(want_base_dir_), - want_xlocked(false) { } - - char *get_type_name() { return "Dis"; } - void print(ostream &out) { - out << "discover(" << base_ino << "." 
<< base_dir_frag - << " " << want; - if (want_ino) out << want_ino; - out << ")"; - } - - void decode_payload() { - int off = 0; - ::_decode(asker, payload, off); - ::_decode(base_ino, payload, off); - ::_decode(base_dir_frag, payload, off); - want._decode(payload, off); - ::_decode(want_ino, payload, off); - ::_decode(want_base_dir, payload, off); - ::_decode(want_xlocked, payload, off); - } - void encode_payload() { - ::_encode(asker, payload); - ::_encode(base_ino, payload); - ::_encode(base_dir_frag, payload); - want._encode(payload); - ::_encode(want_ino, payload); - ::_encode(want_base_dir, payload); - ::_encode(want_xlocked, payload); - } - -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MDiscoverReply.h b/branches/sage/ebofs2/messages/MDiscoverReply.h deleted file mode 100644 index 67491049c0b8f..0000000000000 --- a/branches/sage/ebofs2/messages/MDiscoverReply.h +++ /dev/null @@ -1,300 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MDISCOVERREPLY_H -#define __MDISCOVERREPLY_H - -#include "msg/Message.h" -#include "mds/CDir.h" -#include "mds/CInode.h" -#include "include/filepath.h" - -#include -#include -using namespace std; - -#define max(a,b) ((a)>(b) ? (a):(b)) - - -/** - * MDiscoverReply - return new replicas (of inodes, dirs, dentries) - * - * we group returned items by (dir, dentry, inode). each - * item in each set shares an index (it's "depth"). - * - * we can start and end with any type. 
- * no_base_dir = true if the first group has an inode but no dir - * no_base_dentry = true if the first group has an inode but no dentry - * they are false if there is no returned data, ie the first group is empty. - * - * we also return errors: - * error_flag_dn(string) - the specified dentry dne - * error_flag_dir - the last item wasn't a dir, so we couldn't continue. - * - * and sometimes, - * dir_auth_hint - where we think the dir auth is - * - * depth() gives us the number of depth units/indices for which we have - * information. this INCLUDES those for which we have errors but no data. - * - * see MDCache::handle_discover, handle_discover_reply. - * - * - * so basically, we get - * - * dir den ino i - * x 0 - * x x x 1 - * or - * x x 0 - * x x x 1 - * or - * x x x 0 - * x x x 1 - * ...and trail off however we want. - * - * - */ - -class MDiscoverReply : public Message { - // info about original request - inodeno_t base_ino; - frag_t base_dir_frag; - bool wanted_base_dir; - bool wanted_xlocked; - inodeno_t wanted_ino; - - // and the response - bool flag_error_dn; - bool flag_error_ino; - bool flag_error_dir; - bool no_base_dir; // no base dir (but IS dentry+inode) - bool no_base_dentry; // no base dentry (but IS inode) - string error_dentry; // dentry that was not found (to trigger waiters on asker) - - int dir_auth_hint; - - vector dirs; // not inode-aligned if no_base_dir = true. 
- vector dentries; // not inode-aligned if no_base_dentry = true - vector inodes; - - - public: - // accessors - inodeno_t get_base_ino() { return base_ino; } - frag_t get_base_dir_frag() { return base_dir_frag; } - bool get_wanted_base_dir() { return wanted_base_dir; } - bool get_wanted_xlocked() { return wanted_xlocked; } - inodeno_t get_wanted_ino() { return wanted_ino; } - - int get_num_inodes() { return inodes.size(); } - int get_num_dentries() { return dentries.size(); } - int get_num_dirs() { return dirs.size(); } - - int get_last_inode() { return inodes.size(); } - int get_last_dentry() { return dentries.size() + no_base_dentry; } - int get_last_dir() { return dirs.size() + no_base_dir; } - - int get_depth() { // return depth of deepest object (in dir/dentry/inode units) - return max( inodes.size(), // at least this many - max( no_base_dentry + dentries.size() + flag_error_dn, // inode start + path + possible error - dirs.size() + no_base_dir )); // dn/inode + dirs - } - - bool has_base_dir() { return !no_base_dir && dirs.size(); } - bool has_base_dentry() { return !no_base_dentry && dentries.size(); } - bool has_base_inode() { return no_base_dir && no_base_dentry; } - - bool is_flag_error_dn() { return flag_error_dn; } - bool is_flag_error_ino() { return flag_error_ino; } - bool is_flag_error_dir() { return flag_error_dir; } - string& get_error_dentry() { return error_dentry; } - - int get_dir_auth_hint() { return dir_auth_hint; } - - - // these index _arguments_ are aligned to each ([[dir, ] dentry, ] inode) set. 
- CInodeDiscover& get_inode(int n) { return *(inodes[n]); } - CDentryDiscover& get_dentry(int n) { return *(dentries[n - no_base_dentry]); } - CDirDiscover& get_dir(int n) { return *(dirs[n - no_base_dir]); } - inodeno_t get_ino(int n) { return inodes[n]->get_ino(); } - - // cons - MDiscoverReply() {} - MDiscoverReply(MDiscover *dis) : - Message(MSG_MDS_DISCOVERREPLY), - base_ino(dis->get_base_ino()), - base_dir_frag(dis->get_base_dir_frag()), - wanted_base_dir(dis->wants_base_dir()), - wanted_xlocked(dis->wants_xlocked()), - wanted_ino(dis->get_want_ino()), - flag_error_dn(false), - flag_error_ino(false), - flag_error_dir(false), - no_base_dir(false), no_base_dentry(false), - dir_auth_hint(CDIR_AUTH_UNKNOWN) { - } - MDiscoverReply(dirfrag_t df) : - Message(MSG_MDS_DISCOVERREPLY), - base_ino(df.ino), - base_dir_frag(df.frag), - wanted_base_dir(false), - wanted_xlocked(false), - wanted_ino(inodeno_t()), - flag_error_dn(false), - flag_error_ino(false), - flag_error_dir(false), - no_base_dir(false), no_base_dentry(false), - dir_auth_hint(CDIR_AUTH_UNKNOWN) { - } - ~MDiscoverReply() { - for (vector::iterator it = dirs.begin(); - it != dirs.end(); - it++) - delete *it; - for (vector::iterator it = dentries.begin(); - it != dentries.end(); - it++) - delete *it; - for (vector::iterator it = inodes.begin(); - it != inodes.end(); - it++) - delete *it; - } - virtual char *get_type_name() { return "DisR"; } - - // builders - bool is_empty() { - return dirs.empty() && dentries.empty() && inodes.empty() && - !flag_error_dn && - !flag_error_ino && - !flag_error_dir && - dir_auth_hint == CDIR_AUTH_UNKNOWN; - } - void add_dentry(CDentryDiscover* ddis) { - if (dentries.empty() && dirs.empty()) no_base_dir = true; - dentries.push_back(ddis); - } - - void add_inode(CInodeDiscover* din) { - if (inodes.empty() && dentries.empty()) no_base_dir = no_base_dentry = true; - inodes.push_back( din ); - } - - void add_dir(CDirDiscover* dir) { - dirs.push_back( dir ); - } - - - // void 
set_flag_forward() { flag_forward = true; } - void set_flag_error_dn(const string& dn) { - flag_error_dn = true; - error_dentry = dn; - } - void set_flag_error_ino() { - flag_error_ino = true; - } - void set_flag_error_dir() { - flag_error_dir = true; - } - void set_dir_auth_hint(int a) { - dir_auth_hint = a; - } - void set_error_dentry(const string& dn) { - error_dentry = dn; - } - - - // ... - virtual void decode_payload() { - int off = 0; - ::_decode(base_ino, payload, off); - ::_decode(base_dir_frag, payload, off); - ::_decode(wanted_base_dir, payload, off); - ::_decode(wanted_xlocked, payload, off); - ::_decode(flag_error_dn, payload, off); - ::_decode(flag_error_ino, payload, off); - ::_decode(flag_error_dir, payload, off); - ::_decode(no_base_dir, payload, off); - ::_decode(no_base_dentry, payload, off); - ::_decode(error_dentry, payload, off); - ::_decode(dir_auth_hint, payload, off); - - // dirs - int n; - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - } - - // inodes - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - } - - // dentries - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - } - } - void encode_payload() { - ::_encode(base_ino, payload); - ::_encode(base_dir_frag, payload); - ::_encode(wanted_base_dir, payload); - ::_encode(wanted_xlocked, payload); - ::_encode(flag_error_dn, payload); - ::_encode(flag_error_ino, payload); - ::_encode(flag_error_dir, payload); - ::_encode(no_base_dir, payload); - ::_encode(no_base_dentry, payload); - ::_encode(error_dentry, payload); - ::_encode(dir_auth_hint, payload); - - // dirs - int n = dirs.size(); - payload.append((char*)&n, sizeof(int)); - for (vector::iterator it = dirs.begin(); - it != dirs.end(); - it++) - (*it)->_encode( payload ); - - // inodes - n = inodes.size(); - payload.append((char*)&n, sizeof(int)); - for 
(vector::iterator it = inodes.begin(); - it != inodes.end(); - it++) - (*it)->_encode( payload ); - - // dentries - n = dentries.size(); - payload.append((char*)&n, sizeof(int)); - for (vector::iterator it = dentries.begin(); - it != dentries.end(); - it++) - (*it)->_encode( payload ); - } - -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MExportDir.h b/branches/sage/ebofs2/messages/MExportDir.h deleted file mode 100644 index 9964a7059c1d2..0000000000000 --- a/branches/sage/ebofs2/messages/MExportDir.h +++ /dev/null @@ -1,65 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MEXPORTDIR_H -#define __MEXPORTDIR_H - -#include "msg/Message.h" - - -class MExportDir : public Message { - dirfrag_t dirfrag; - - bufferlist dirstate; - list bounds; - - public: - MExportDir() {} - MExportDir(dirfrag_t df) : - Message(MSG_MDS_EXPORTDIR), - dirfrag(df) { - } - virtual char *get_type_name() { return "Ex"; } - void print(ostream& o) { - o << "export(" << dirfrag << ")"; - } - - dirfrag_t get_dirfrag() { return dirfrag; } - bufferlist& get_dirstate() { return dirstate; } - list& get_bounds() { return bounds; } - - void take_dirstate(bufferlist& bl) { - dirstate.claim(bl); - } - void add_export(dirfrag_t df) { - bounds.push_back(df); - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - ::_decode(bounds, payload, off); - ::_decode(dirstate, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - ::_encode(bounds, payload); - ::_encode(dirstate, payload); - } 
- -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MExportDirAck.h b/branches/sage/ebofs2/messages/MExportDirAck.h deleted file mode 100644 index 1b9d683b4e36f..0000000000000 --- a/branches/sage/ebofs2/messages/MExportDirAck.h +++ /dev/null @@ -1,46 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MEXPORTDIRACK_H -#define __MEXPORTDIRACK_H - -#include "MExportDir.h" - -class MExportDirAck : public Message { - dirfrag_t dirfrag; - - public: - dirfrag_t get_dirfrag() { return dirfrag; } - - MExportDirAck() {} - MExportDirAck(dirfrag_t i) : - Message(MSG_MDS_EXPORTDIRACK), dirfrag(i) { } - - virtual char *get_type_name() { return "ExAck"; } - void print(ostream& o) { - o << "export_ack(" << dirfrag << ")"; - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - } - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - } - -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MExportDirCancel.h b/branches/sage/ebofs2/messages/MExportDirCancel.h deleted file mode 100644 index f13ee1a44fa21..0000000000000 --- a/branches/sage/ebofs2/messages/MExportDirCancel.h +++ /dev/null @@ -1,49 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * 
Foundation. See file COPYING. - * - */ - -#ifndef __MEXPORTDIRCANCEL_H -#define __MEXPORTDIRCANCEL_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirCancel : public Message { - dirfrag_t dirfrag; - - public: - dirfrag_t get_dirfrag() { return dirfrag; } - - MExportDirCancel() {} - MExportDirCancel(dirfrag_t df) : - Message(MSG_MDS_EXPORTDIRCANCEL), - dirfrag(df) { } - - virtual char *get_type_name() { return "ExCancel"; } - void print(ostream& o) { - o << "export_cancel(" << dirfrag << ")"; - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - } - - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MExportDirDiscover.h b/branches/sage/ebofs2/messages/MExportDirDiscover.h deleted file mode 100644 index c311d1e87e940..0000000000000 --- a/branches/sage/ebofs2/messages/MExportDirDiscover.h +++ /dev/null @@ -1,59 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MEXPORTDIRDISCOVER_H -#define __MEXPORTDIRDISCOVER_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirDiscover : public Message { - dirfrag_t dirfrag; - string path; - - public: - inodeno_t get_ino() { return dirfrag.ino; } - dirfrag_t get_dirfrag() { return dirfrag; } - string& get_path() { return path; } - - bool started; - - MExportDirDiscover() : - Message(MSG_MDS_EXPORTDIRDISCOVER), - started(false) { } - MExportDirDiscover(CDir *dir) : - Message(MSG_MDS_EXPORTDIRDISCOVER), - started(false) { - dir->get_inode()->make_path(path); - dirfrag = dir->dirfrag(); - } - virtual char *get_type_name() { return "ExDis"; } - void print(ostream& o) { - o << "export_discover(" << dirfrag << " " << path << ")"; - } - - virtual void decode_payload() { - bufferlist::iterator p = payload.begin(); - ::_decode_simple(dirfrag, p); - ::_decode_simple(path, p); - } - - virtual void encode_payload() { - ::_encode_simple(dirfrag, payload); - ::_encode_simple(path, payload); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MExportDirDiscoverAck.h b/branches/sage/ebofs2/messages/MExportDirDiscoverAck.h deleted file mode 100644 index 5e1924bc57e38..0000000000000 --- a/branches/sage/ebofs2/messages/MExportDirDiscoverAck.h +++ /dev/null @@ -1,60 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MEXPORTDIRDISCOVERACK_H -#define __MEXPORTDIRDISCOVERACK_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirDiscoverAck : public Message { - dirfrag_t dirfrag; - bool success; - - public: - inodeno_t get_ino() { return dirfrag.ino; } - dirfrag_t get_dirfrag() { return dirfrag; } - bool is_success() { return success; } - - MExportDirDiscoverAck() {} - MExportDirDiscoverAck(dirfrag_t df, bool s=true) : - Message(MSG_MDS_EXPORTDIRDISCOVERACK), - dirfrag(df), - success(s) { } - - virtual char *get_type_name() { return "ExDisA"; } - void print(ostream& o) { - o << "export_discover_ack(" << dirfrag; - if (success) - o << " success)"; - else - o << " failure)"; - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - payload.copy(off, sizeof(success), (char*)&success); - off += sizeof(success); - } - - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - payload.append((char*)&success, sizeof(success)); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MExportDirFinish.h b/branches/sage/ebofs2/messages/MExportDirFinish.h deleted file mode 100644 index 03f5e1fcc9ef3..0000000000000 --- a/branches/sage/ebofs2/messages/MExportDirFinish.h +++ /dev/null @@ -1,46 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MEXPORTDIRFINISH_H -#define __MEXPORTDIRFINISH_H - -#include "msg/Message.h" - -class MExportDirFinish : public Message { - dirfrag_t dirfrag; - - public: - dirfrag_t get_dirfrag() { return dirfrag; } - - MExportDirFinish() {} - MExportDirFinish(dirfrag_t dirfrag) : - Message(MSG_MDS_EXPORTDIRFINISH) { - this->dirfrag = dirfrag; - } - virtual char *get_type_name() { return "ExFin"; } - void print(ostream& o) { - o << "export_finish(" << dirfrag << ")"; - } - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - } - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - } - -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MExportDirNotify.h b/branches/sage/ebofs2/messages/MExportDirNotify.h deleted file mode 100644 index c7a79a64f9317..0000000000000 --- a/branches/sage/ebofs2/messages/MExportDirNotify.h +++ /dev/null @@ -1,85 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MEXPORTDIRNOTIFY_H -#define __MEXPORTDIRNOTIFY_H - -#include "msg/Message.h" -#include -using namespace std; - -class MExportDirNotify : public Message { - dirfrag_t base; - bool ack; - pair old_auth, new_auth; - list bounds; // bounds; these dirs are _not_ included (tho the dirfragdes are) - - public: - dirfrag_t get_dirfrag() { return base; } - pair get_old_auth() { return old_auth; } - pair get_new_auth() { return new_auth; } - bool wants_ack() { return ack; } - list& get_bounds() { return bounds; } - - MExportDirNotify() {} - MExportDirNotify(dirfrag_t i, bool a, pair oa, pair na) : - Message(MSG_MDS_EXPORTDIRNOTIFY), - base(i), ack(a), old_auth(oa), new_auth(na) { } - - virtual char *get_type_name() { return "ExNot"; } - void print(ostream& o) { - o << "export_notify(" << base; - o << " " << old_auth << " -> " << new_auth; - if (ack) - o << " ack)"; - else - o << " no ack)"; - } - - void copy_bounds(list& ex) { - this->bounds = ex; - } - void copy_bounds(set& ex) { - for (set::iterator i = ex.begin(); - i != ex.end(); ++i) - bounds.push_back(*i); - } - void copy_bounds(set& ex) { - for (set::iterator i = ex.begin(); - i != ex.end(); ++i) - bounds.push_back((*i)->dirfrag()); - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(base), (char*)&base); - off += sizeof(base); - payload.copy(off, sizeof(ack), (char*)&ack); - off += sizeof(ack); - payload.copy(off, sizeof(old_auth), (char*)&old_auth); - off += sizeof(old_auth); - payload.copy(off, sizeof(new_auth), (char*)&new_auth); - off += sizeof(new_auth); - ::_decode(bounds, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&base, sizeof(base)); - payload.append((char*)&ack, sizeof(ack)); - payload.append((char*)&old_auth, sizeof(old_auth)); - payload.append((char*)&new_auth, sizeof(new_auth)); - ::_encode(bounds, payload); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MExportDirNotifyAck.h 
b/branches/sage/ebofs2/messages/MExportDirNotifyAck.h deleted file mode 100644 index 6a41aee83b5f3..0000000000000 --- a/branches/sage/ebofs2/messages/MExportDirNotifyAck.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MEXPORTDIRNOTIFYACK_H -#define __MEXPORTDIRNOTIFYACK_H - -#include "msg/Message.h" -#include -using namespace std; - -class MExportDirNotifyAck : public Message { - dirfrag_t dirfrag; - - public: - dirfrag_t get_dirfrag() { return dirfrag; } - - MExportDirNotifyAck() {} - MExportDirNotifyAck(dirfrag_t dirfrag) : - Message(MSG_MDS_EXPORTDIRNOTIFYACK) { - this->dirfrag = dirfrag; - } - virtual char *get_type_name() { return "ExNotA"; } - void print(ostream& o) { - o << "export_notify_ack(" << dirfrag << ")"; - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - } - - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - } - -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MExportDirPrep.h b/branches/sage/ebofs2/messages/MExportDirPrep.h deleted file mode 100644 index 5789e301e8b11..0000000000000 --- a/branches/sage/ebofs2/messages/MExportDirPrep.h +++ /dev/null @@ -1,205 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published 
by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MEXPORTDIRPREP_H -#define __MEXPORTDIRPREP_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirPrep : public Message { - dirfrag_t dirfrag; - - /* nested export discover payload. - not all inodes will have dirs; they may require a separate discover. - dentries are the links to each inode. - dirs map includes base dir (ino) - */ - list bounds; - - list inodes; - list dentries; - map inode_dirfrag; - map inode_dentry; - - map > frags_by_ino; - map dirfrags; - - set bystanders; - - bool b_did_assim; - - public: - dirfrag_t get_dirfrag() { return dirfrag; } - list& get_bounds() { return bounds; } - list& get_inodes() { return inodes; } - list& get_dentries() { return dentries; } - list& get_inode_dirfrags(inodeno_t ino) { - return frags_by_ino[ino]; - } - dirfrag_t get_containing_dirfrag(inodeno_t ino) { - return inode_dirfrag[ino]; - } - string& get_dentry(inodeno_t ino) { - return inode_dentry[ino]; - } - bool have_dirfrag(dirfrag_t df) { - return dirfrags.count(df); - } - CDirDiscover* get_dirfrag_discover(dirfrag_t df) { - return dirfrags[df]; - } - set &get_bystanders() { return bystanders; } - - bool did_assim() { return b_did_assim; } - void mark_assim() { b_did_assim = true; } - - MExportDirPrep() { - b_did_assim = false; - } - MExportDirPrep(dirfrag_t df) : - Message(MSG_MDS_EXPORTDIRPREP), - dirfrag(df), - b_did_assim(false) { } - ~MExportDirPrep() { - for (list::iterator iit = inodes.begin(); - iit != inodes.end(); - iit++) - delete *iit; - for (list::iterator p = dentries.begin(); - p != dentries.end(); - p++) - delete *p; - for (map::iterator dit = dirfrags.begin(); - dit != dirfrags.end(); - dit++) - delete dit->second; - } - - - virtual char *get_type_name() { return "ExP"; } - void print(ostream& o) { - o << "export_prep(" << dirfrag << ")"; - } - - void add_export(dirfrag_t df) { - bounds.push_back( df ); - } - void 
add_inode(dirfrag_t df, const string& name, CDentryDiscover *dn, CInodeDiscover *in) { - inodes.push_back(in); - dentries.push_back(dn); - inode_dirfrag[in->get_ino()] = df; - inode_dentry[in->get_ino()] = name; - } - void add_dirfrag(CDirDiscover *dir) { - dirfrags[dir->get_dirfrag()] = dir; - frags_by_ino[dir->get_dirfrag().ino].push_back(dir->get_dirfrag().frag); - } - void add_bystander(int who) { - bystanders.insert(who); - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - - ::_decode(bounds, payload, off); - - // inodes - int ni; - payload.copy(off, sizeof(int), (char*)&ni); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - inodes.push_back(in); - - // dentry - CDentryDiscover *dn = new CDentryDiscover; - dn->_decode(payload, off); - dentries.push_back(dn); - - // dentry - string d; - _decode(d, payload, off); - inode_dentry[in->get_ino()] = d; - - // dir ino - dirfrag_t df; - payload.copy(off, sizeof(df), (char*)&df); - off += sizeof(df); - inode_dirfrag[in->get_ino()] = df; - - // child frags - ::_decode(frags_by_ino[in->get_ino()], payload, off); - } - - // dirs - int nd; - payload.copy(off, sizeof(int), (char*)&nd); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - dirfrags[dir->get_dirfrag()] = dir; - } - - ::_decode(bystanders, payload, off); - } - - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - - ::_encode(bounds, payload); - - // inodes - int ni = inodes.size(); - payload.append((char*)&ni, sizeof(int)); - list::iterator dit = dentries.begin(); - list::iterator iit = inodes.begin(); - while (iit != inodes.end()) { - (*iit)->_encode(payload); - (*dit)->_encode(payload); - - // dentry name - _encode(inode_dentry[(*iit)->get_ino()], payload); - - // dir ino - dirfrag_t df = inode_dirfrag[(*iit)->get_ino()]; - payload.append((char*)&df, sizeof(df)); - - // child frags - 
::_encode(frags_by_ino[(*iit)->get_ino()], payload); - - iit++; - dit++; - } - - // dirs - int nd = dirfrags.size(); - payload.append((char*)&nd, sizeof(int)); - for (map::iterator dit = dirfrags.begin(); - dit != dirfrags.end(); - dit++) - dit->second->_encode(payload); - - ::_encode(bystanders, payload); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MExportDirPrepAck.h b/branches/sage/ebofs2/messages/MExportDirPrepAck.h deleted file mode 100644 index 355541e9f1b5c..0000000000000 --- a/branches/sage/ebofs2/messages/MExportDirPrepAck.h +++ /dev/null @@ -1,47 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MEXPORTDIRPREPACK_H -#define __MEXPORTDIRPREPACK_H - -#include "msg/Message.h" -#include "include/types.h" - -class MExportDirPrepAck : public Message { - dirfrag_t dirfrag; - - public: - dirfrag_t get_dirfrag() { return dirfrag; } - - MExportDirPrepAck() {} - MExportDirPrepAck(dirfrag_t df) : - Message(MSG_MDS_EXPORTDIRPREPACK), - dirfrag(df) { } - - virtual char *get_type_name() { return "ExPAck"; } - void print(ostream& o) { - o << "export_prep_ack(" << dirfrag << ")"; - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - } - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MExportDirWarning.h b/branches/sage/ebofs2/messages/MExportDirWarning.h deleted file mode 100644 index b59e2eb12251c..0000000000000 --- a/branches/sage/ebofs2/messages/MExportDirWarning.h +++ /dev/null @@ -1,50 +0,0 
@@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MEXPORTDIRWARNING_H -#define __MEXPORTDIRWARNING_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirWarning : public Message { - inodeno_t ino; - int new_dir_auth; - - public: - inodeno_t get_ino() { return ino; } - int get_new_dir_auth() { return new_dir_auth; } - - MExportDirWarning() {} - MExportDirWarning(inodeno_t i, int nda) : - Message(MSG_MDS_EXPORTDIRWARNING), - ino(i), new_dir_auth(nda) {} - - virtual char *get_type_name() { return "ExW"; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - payload.copy(off, sizeof(new_dir_auth), (char*)&new_dir_auth); - off += sizeof(new_dir_auth); - } - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - payload.append((char*)&new_dir_auth, sizeof(new_dir_auth)); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MExportDirWarningAck.h b/branches/sage/ebofs2/messages/MExportDirWarningAck.h deleted file mode 100644 index 7ee3078e61973..0000000000000 --- a/branches/sage/ebofs2/messages/MExportDirWarningAck.h +++ /dev/null @@ -1,45 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MEXPORTDIRWARNINGACK_H -#define __MEXPORTDIRWARNINGACK_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirWarningAck : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MExportDirWarningAck() {} - MExportDirWarningAck(inodeno_t i) : - Message(MSG_MDS_EXPORTDIRWARNINGACK), - ino(i) {} - - virtual char *get_type_name() { return "ExWAck"; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - } - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MGenericMessage.h b/branches/sage/ebofs2/messages/MGenericMessage.h deleted file mode 100644 index fee4e014edaf8..0000000000000 --- a/branches/sage/ebofs2/messages/MGenericMessage.h +++ /dev/null @@ -1,45 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MGENERICMESSAGE_H -#define __MGENERICMESSAGE_H - -#include "msg/Message.h" - -class MGenericMessage : public Message { - char tname[20]; - //long pcid; - - public: - MGenericMessage(int t) : Message(t) { - sprintf(tname, "generic%d", get_type()); - } - - //void set_pcid(long pcid) { this->pcid = pcid; } - //long get_pcid() { return pcid; } - - char *get_type_name() { return tname; } - - virtual void decode_payload() { - //int off = 0; - //payload.copy(off, sizeof(pcid), (char*)&pcid); - //off += sizeof(pcid); - } - virtual void encode_payload() { - //payload.append((char*)&pcid, sizeof(pcid)); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MHeartbeat.h b/branches/sage/ebofs2/messages/MHeartbeat.h deleted file mode 100644 index 964f2a3bd49f2..0000000000000 --- a/branches/sage/ebofs2/messages/MHeartbeat.h +++ /dev/null @@ -1,60 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MHEARTBEAT_H -#define __MHEARTBEAT_H - -#include "include/types.h" -#include "msg/Message.h" - -class MHeartbeat : public Message { - mds_load_t load; - int beat; - map import_map; - - public: - mds_load_t& get_load() { return load; } - int get_beat() { return beat; } - - map& get_import_map() { - return import_map; - } - - MHeartbeat() {} - MHeartbeat(mds_load_t& load, int beat) : - Message(MSG_MDS_HEARTBEAT) { - this->load = load; - this->beat = beat; - } - - virtual char *get_type_name() { return "HB"; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off,sizeof(load), (char*)&load); - off += sizeof(load); - payload.copy(off, sizeof(beat), (char*)&beat); - off += sizeof(beat); - ::_decode(import_map, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&load, sizeof(load)); - payload.append((char*)&beat, sizeof(beat)); - ::_encode(import_map, payload); - } - -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MInodeFileCaps.h b/branches/sage/ebofs2/messages/MInodeFileCaps.h deleted file mode 100644 index 05ade1094c9c8..0000000000000 --- a/branches/sage/ebofs2/messages/MInodeFileCaps.h +++ /dev/null @@ -1,57 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MINODEFILECAPS_H -#define __MINODEFILECAPS_H - -class MInodeFileCaps : public Message { - inodeno_t ino; - int from; - int caps; - - public: - inodeno_t get_ino() { return ino; } - int get_from() { return from; } - int get_caps() { return caps; } - - MInodeFileCaps() {} - // from auth - MInodeFileCaps(inodeno_t ino, int from, int caps) : - Message(MSG_MDS_INODEFILECAPS) { - - this->ino = ino; - this->from = from; - this->caps = caps; - } - - virtual char *get_type_name() { return "Icap";} - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(from), (char*)&from); - off += sizeof(from); - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - payload.copy(off, sizeof(caps), (char*)&caps); - off += sizeof(caps); - } - virtual void encode_payload() { - payload.append((char*)&from, sizeof(from)); - payload.append((char*)&ino, sizeof(ino)); - payload.append((char*)&caps, sizeof(caps)); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MLock.h b/branches/sage/ebofs2/messages/MLock.h deleted file mode 100644 index 95c3e5f325212..0000000000000 --- a/branches/sage/ebofs2/messages/MLock.h +++ /dev/null @@ -1,126 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MLOCK_H -#define __MLOCK_H - -#include "msg/Message.h" -#include "mds/SimpleLock.h" - -// for replicas -#define LOCK_AC_SYNC -1 -#define LOCK_AC_MIXED -2 -#define LOCK_AC_LOCK -3 - -#define LOCK_AC_SCATTER -6 - -// for auth -#define LOCK_AC_SYNCACK 1 -#define LOCK_AC_MIXEDACK 2 -#define LOCK_AC_LOCKACK 3 - -#define LOCK_AC_REQSCATTER 7 -#define LOCK_AC_REQUNSCATTER 8 - -#define LOCK_AC_FOR_REPLICA(a) ((a) < 0) -#define LOCK_AC_FOR_AUTH(a) ((a) > 0) - - -static const char *get_lock_action_name(int a) { - switch (a) { - case LOCK_AC_SYNC: return "sync"; - case LOCK_AC_MIXED: return "mixed"; - case LOCK_AC_LOCK: return "lock"; - case LOCK_AC_SCATTER: return "scatter"; - case LOCK_AC_SYNCACK: return "syncack"; - case LOCK_AC_MIXEDACK: return "mixedack"; - case LOCK_AC_LOCKACK: return "lockack"; - case LOCK_AC_REQSCATTER: return "reqscatter"; - case LOCK_AC_REQUNSCATTER: return "requnscatter"; - default: assert(0); return 0; - } -} - - -class MLock : public Message { - int32_t action; // action type - int32_t asker; // who is initiating this request - metareqid_t reqid; // for remote lock requests - - char lock_type; // lock object type - MDSCacheObjectInfo object_info; - - bufferlist data; // and possibly some data - - public: - bufferlist& get_data() { return data; } - int get_asker() { return asker; } - int get_action() { return action; } - metareqid_t get_reqid() { return reqid; } - - int get_lock_type() { return lock_type; } - MDSCacheObjectInfo &get_object_info() { return object_info; } - - MLock() {} - MLock(int ac, int as) : - Message(MSG_MDS_LOCK), - action(ac), asker(as), - lock_type(0) { } - MLock(SimpleLock *lock, int ac, int as) : - Message(MSG_MDS_LOCK), - action(ac), asker(as), - lock_type(lock->get_type()) { - lock->get_parent()->set_object_info(object_info); - } - MLock(SimpleLock *lock, int ac, int as, bufferlist& bl) : - Message(MSG_MDS_LOCK), - action(ac), asker(as), lock_type(lock->get_type()) { - 
lock->get_parent()->set_object_info(object_info); - data.claim(bl); - } - virtual char *get_type_name() { return "ILock"; } - void print(ostream& out) { - out << "lock(a=" << get_lock_action_name(action) - << " " << get_lock_type_name(lock_type) - << " " << object_info - << ")"; - } - - void set_reqid(metareqid_t ri) { reqid = ri; } - void set_data(const bufferlist& data) { - this->data = data; - } - - void decode_payload() { - int off = 0; - ::_decode(asker, payload, off); - ::_decode(action, payload, off); - ::_decode(reqid, payload, off); - ::_decode(lock_type, payload, off); - object_info._decode(payload, off); - ::_decode(data, payload, off); - } - virtual void encode_payload() { - ::_encode(asker, payload); - ::_encode(action, payload); - ::_encode(reqid, payload); - ::_encode(lock_type, payload); - object_info._encode(payload); - ::_encode(data, payload); - } - -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MMDSBoot.h b/branches/sage/ebofs2/messages/MMDSBoot.h deleted file mode 100644 index 8529578e29d56..0000000000000 --- a/branches/sage/ebofs2/messages/MMDSBoot.h +++ /dev/null @@ -1,39 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMDSBOOT_H -#define __MMDSBOOT_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMDSBoot : public Message { - public: - MMDSBoot() : Message(MSG_MDS_BOOT) { - } - - char *get_type_name() { return "mdsboot"; } - - void encode_payload() { - //payload.append((char*)&sb, sizeof(sb)); - } - void decode_payload() { - //int off = 0; - //payload.copy(off, sizeof(sb), (char*)&sb); - //off += sizeof(sb); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MMDSCacheRejoin.h b/branches/sage/ebofs2/messages/MMDSCacheRejoin.h deleted file mode 100644 index 844ece02000ae..0000000000000 --- a/branches/sage/ebofs2/messages/MMDSCacheRejoin.h +++ /dev/null @@ -1,230 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MMDSCACHEREJOIN_H -#define __MMDSCACHEREJOIN_H - -#include "msg/Message.h" - -#include "include/types.h" -#include "include/encodable.h" - -// sent from replica to auth - -class MMDSCacheRejoin : public Message { - public: - static const int OP_WEAK = 1; // replica -> auth, i exist, + maybe open files. - static const int OP_STRONG = 2; // replica -> auth, i exist, + open files and lock state. - static const int OP_ACK = 3; // auth -> replica, here is your lock state. - //static const int OP_PURGE = 4; // auth -> replica, remove these items, they are old/obsolete. - static const int OP_MISSING = 5; // auth -> replica, i am missing these items - static const int OP_FULL = 6; // replica -> auth, here is the full object. 
- static const char *get_opname(int op) { - switch (op) { - case OP_WEAK: return "weak"; - case OP_STRONG: return "strong"; - case OP_ACK: return "ack"; - case OP_MISSING: return "missing"; - case OP_FULL: return "full"; - default: assert(0); return 0; - } - } - - // -- types -- - struct inode_strong { - int32_t caps_wanted; - int32_t nonce; - int32_t authlock; - int32_t linklock; - int32_t dirfragtreelock; - int32_t filelock; - __int32_t dirlock; - inode_strong() {} - inode_strong(int n, int cw=0, int a=0, int l=0, int dft=0, int f=0, int dl=0) : - caps_wanted(cw), - nonce(n), - authlock(a), linklock(l), dirfragtreelock(dft), filelock(f), dirlock(dl) { } - }; - struct inode_full { - inode_t inode; - string symlink; - fragtree_t dirfragtree; - inode_full() {} - inode_full(const inode_t& i, const string& s, const fragtree_t& f) : - inode(i), symlink(s), dirfragtree(f) {} - - void _decode(bufferlist::iterator& p) { - ::_decode_simple(inode, p); - ::_decode_simple(symlink, p); - dirfragtree._decode(p); - } - void _encode(bufferlist& bl) const { - ::_encode(inode, bl); - ::_encode(symlink, bl); - dirfragtree._encode(bl); - } - }; - - struct dirfrag_strong { - int32_t nonce; - int8_t dir_rep; - dirfrag_strong() {} - dirfrag_strong(int n, int dr) : nonce(n), dir_rep(dr) {} - }; - struct dn_strong { - inodeno_t ino; - inodeno_t remote_ino; - unsigned char remote_d_type; - int32_t nonce; - int32_t lock; - dn_strong() : - ino(0), remote_ino(0), remote_d_type(0), nonce(0), lock(0) {} - dn_strong(inodeno_t pi, inodeno_t ri, unsigned char rdt, int n, int l) : - ino(pi), remote_ino(ri), remote_d_type(rdt), nonce(n), lock(l) {} - bool is_primary() { return ino > 0; } - bool is_remote() { return remote_ino > 0; } - bool is_null() { return ino == 0 && remote_ino == 0; } - }; - - struct dn_weak { - inodeno_t ino; - dn_weak() : ino(0) {} - dn_weak(inodeno_t pi) : ino(pi) {} - }; - - // -- data -- - int32_t op; - - // weak - map > weak; - set weak_inodes; - - // strong - map 
strong_dirfrags; - map > strong_dentries; - map strong_inodes; - - // open - bufferlist cap_export_bl; - map > cap_exports; - map cap_export_paths; - - // full - list full_inodes; - - // authpins, xlocks - map authpinned_inodes; - map > xlocked_inodes; - map > authpinned_dentries; - map > xlocked_dentries; - - MMDSCacheRejoin() : Message(MSG_MDS_CACHEREJOIN) {} - MMDSCacheRejoin(int o) : - Message(MSG_MDS_CACHEREJOIN), - op(o) {} - - char *get_type_name() { return "cache_rejoin"; } - void print(ostream& out) { - out << "cache_rejoin " << get_opname(op); - } - - // -- builders -- - // inodes - void add_weak_inode(inodeno_t i) { - weak_inodes.insert(i); - } - void add_strong_inode(inodeno_t i, int n, int cw, int a, int l, int dft, int f, int dl) { - strong_inodes[i] = inode_strong(n, cw, a, l, dft, f, dl); - } - void add_full_inode(inode_t &i, const string& s, const fragtree_t &f) { - full_inodes.push_back(inode_full(i, s, f)); - } - void add_inode_authpin(inodeno_t ino, const metareqid_t& ri) { - authpinned_inodes[ino] = ri; - } - void add_inode_xlock(inodeno_t ino, int lt, const metareqid_t& ri) { - xlocked_inodes[ino][lt] = ri; - } - - void copy_cap_exports(bufferlist &bl) { - cap_export_bl = bl; - } - - // dirfrags - void add_weak_dirfrag(dirfrag_t df) { - weak[df]; - } - void add_weak_dirfrag(dirfrag_t df, map& dnmap) { - weak[df] = dnmap; - } - void add_strong_dirfrag(dirfrag_t df, int n, int dr) { - strong_dirfrags[df] = dirfrag_strong(n, dr); - } - - // dentries - void add_weak_dentry(dirfrag_t df, const string& dname, dn_weak& dnw) { - weak[df][dname] = dnw; - } - void add_weak_primary_dentry(dirfrag_t df, const string& dname, inodeno_t ino) { - weak[df][dname] = dn_weak(ino); - } - void add_strong_dentry(dirfrag_t df, const string& dname, inodeno_t pi, inodeno_t ri, unsigned char rdt, int n, int ls) { - strong_dentries[df][dname] = dn_strong(pi, ri, rdt, n, ls); - } - void add_dentry_authpin(dirfrag_t df, const string& dname, const metareqid_t& ri) { - 
authpinned_dentries[df][dname] = ri; - } - void add_dentry_xlock(dirfrag_t df, const string& dname, const metareqid_t& ri) { - xlocked_dentries[df][dname] = ri; - } - - // -- encoding -- - void encode_payload() { - ::_encode(op, payload); - ::_encode(strong_inodes, payload); - ::_encode_complex(full_inodes, payload); - ::_encode(authpinned_inodes, payload); - ::_encode(xlocked_inodes, payload); - ::_encode(cap_export_bl, payload); - ::_encode(strong_dirfrags, payload); - ::_encode(weak, payload); - ::_encode(weak_inodes, payload); - ::_encode(strong_dentries, payload); - ::_encode(authpinned_dentries, payload); - ::_encode(xlocked_dentries, payload); - } - void decode_payload() { - bufferlist::iterator p = payload.begin(); - ::_decode_simple(op, p); - ::_decode_simple(strong_inodes, p); - ::_decode_complex(full_inodes, p); - ::_decode_simple(authpinned_inodes, p); - ::_decode_simple(xlocked_inodes, p); - ::_decode_simple(cap_export_bl, p); - if (cap_export_bl.length()) { - bufferlist::iterator q = cap_export_bl.begin(); - ::_decode_simple(cap_exports, q); - ::_decode_simple(cap_export_paths, q); - } - ::_decode_simple(strong_dirfrags, p); - ::_decode_simple(weak, p); - ::_decode_simple(weak_inodes, p); - ::_decode_simple(strong_dentries, p); - ::_decode_simple(authpinned_dentries, p); - ::_decode_simple(xlocked_dentries, p); - } - -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MMDSFragmentNotify.h b/branches/sage/ebofs2/messages/MMDSFragmentNotify.h deleted file mode 100644 index 232cce92427bb..0000000000000 --- a/branches/sage/ebofs2/messages/MMDSFragmentNotify.h +++ /dev/null @@ -1,60 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free 
Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MMDSFRAGMENTNOTIFY_H -#define __MMDSFRAGMENTNOTIFY_H - -#include "msg/Message.h" -#include -using namespace std; - -class MMDSFragmentNotify : public Message { - inodeno_t ino; - frag_t basefrag; - int8_t bits; - - public: - inodeno_t get_ino() { return ino; } - frag_t get_basefrag() { return basefrag; } - int get_bits() { return bits; } - - bufferlist basebl; - - MMDSFragmentNotify() {} - MMDSFragmentNotify(inodeno_t i, frag_t bf, int b) : - Message(MSG_MDS_FRAGMENTNOTIFY), - ino(i), basefrag(bf), bits(b) { } - - virtual char *get_type_name() { return "fragment_notify"; } - void print(ostream& o) { - o << "fragment_notify(" << ino << "#" << basefrag - << " " << (int)bits << ")"; - } - - virtual void decode_payload() { - int off = 0; - ::_decode(ino, payload, off); - ::_decode(basefrag, payload, off); - ::_decode(bits, payload, off); - ::_decode(basebl, payload, off); - } - virtual void encode_payload() { - ::_encode(ino, payload); - ::_encode(basefrag, payload); - ::_encode(bits, payload); - ::_encode(basebl, payload); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MMDSGetMap.h b/branches/sage/ebofs2/messages/MMDSGetMap.h deleted file mode 100644 index eab9a3506a40b..0000000000000 --- a/branches/sage/ebofs2/messages/MMDSGetMap.h +++ /dev/null @@ -1,39 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMDSGETMAP_H -#define __MMDSGETMAP_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMDSGetMap : public Message { - public: - MMDSGetMap() : Message(MSG_MDS_GETMAP) { - } - - char *get_type_name() { return "mdsgetmap"; } - - void encode_payload() { - //payload.append((char*)&sb, sizeof(sb)); - } - void decode_payload() { - //int off = 0; - //payload.copy(off, sizeof(sb), (char*)&sb); - //off += sizeof(sb); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MMDSMap.h b/branches/sage/ebofs2/messages/MMDSMap.h deleted file mode 100644 index 164e547cc513a..0000000000000 --- a/branches/sage/ebofs2/messages/MMDSMap.h +++ /dev/null @@ -1,79 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMDSMAP_H -#define __MMDSMAP_H - -#include "msg/Message.h" -#include "mds/MDSMap.h" - -class MMDSMap : public Message { - public: - /* - map maps; - map incremental_maps; - - epoch_t get_first() { - epoch_t e = 0; - map::iterator i = maps.begin(); - if (i != maps.end()) e = i->first; - i = incremental_maps.begin(); - if (i != incremental_maps.end() && - (e == 0 || i->first < e)) e = i->first; - return e; - } - epoch_t get_last() { - epoch_t e = 0; - map::reverse_iterator i = maps.rbegin(); - if (i != maps.rend()) e = i->first; - i = incremental_maps.rbegin(); - if (i != incremental_maps.rend() && - (e == 0 || i->first > e)) e = i->first; - return e; - } - */ - - version_t epoch; - bufferlist encoded; - - version_t get_epoch() const { return epoch; } - bufferlist& get_encoded() { return encoded; } - - MMDSMap() : - Message(MSG_MDS_MAP) {} - MMDSMap(MDSMap *mm) : - Message(MSG_MDS_MAP) { - epoch = mm->get_epoch(); - mm->encode(encoded); - } - - char *get_type_name() { return "mdsmap"; } - void print(ostream& out) { - out << "mdsmap(e " << epoch << ")"; - } - - // marshalling - void decode_payload() { - int off = 0; - ::_decode(epoch, payload, off); - ::_decode(encoded, payload, off); - } - void encode_payload() { - ::_encode(epoch, payload); - ::_encode(encoded, payload); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MMDSResolve.h b/branches/sage/ebofs2/messages/MMDSResolve.h deleted file mode 100644 index 2103a0115081d..0000000000000 --- a/branches/sage/ebofs2/messages/MMDSResolve.h +++ /dev/null @@ -1,66 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMDSRESOLVE_H -#define __MMDSRESOLVE_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMDSResolve : public Message { - public: - map > subtrees; - map > ambiguous_imports; - list slave_requests; - - MMDSResolve() : Message(MSG_MDS_RESOLVE) {} - - char *get_type_name() { return "mds_resolve"; } - - void print(ostream& out) { - out << "mds_resolve(" << subtrees.size() - << "+" << ambiguous_imports.size() - << " subtrees +" << slave_requests.size() << " slave requests)"; - } - - void add_subtree(dirfrag_t im) { - subtrees[im].clear(); - } - void add_subtree_bound(dirfrag_t im, dirfrag_t ex) { - subtrees[im].push_back(ex); - } - - void add_ambiguous_import(dirfrag_t im, const list& m) { - ambiguous_imports[im] = m; - } - - void add_slave_request(metareqid_t reqid) { - slave_requests.push_back(reqid); - } - - void encode_payload() { - ::_encode(subtrees, payload); - ::_encode(ambiguous_imports, payload); - ::_encode(slave_requests, payload); - } - void decode_payload() { - int off = 0; - ::_decode(subtrees, payload, off); - ::_decode(ambiguous_imports, payload, off); - ::_decode(slave_requests, payload, off); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MMDSResolveAck.h b/branches/sage/ebofs2/messages/MMDSResolveAck.h deleted file mode 100644 index 1870e226b4161..0000000000000 --- a/branches/sage/ebofs2/messages/MMDSResolveAck.h +++ /dev/null @@ -1,56 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMDSRESOLVEACK_H -#define __MMDSRESOLVEACK_H - -#include "msg/Message.h" - -#include "include/types.h" - - -class MMDSResolveAck : public Message { - public: - list commit; - list abort; - - MMDSResolveAck() : Message(MSG_MDS_RESOLVEACK) {} - - char *get_type_name() { return "resolve_ack"; } - /*void print(ostream& out) { - out << "resolve_ack.size() - << "+" << ambiguous_imap.size() - << " imports +" << slave_requests.size() << " slave requests)"; - } - */ - - void add_commit(metareqid_t r) { - commit.push_back(r); - } - void add_abort(metareqid_t r) { - abort.push_back(r); - } - - void encode_payload() { - ::_encode(commit, payload); - ::_encode(abort, payload); - } - void decode_payload() { - int off = 0; - ::_decode(commit, payload, off); - ::_decode(abort, payload, off); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MMDSSlaveRequest.h b/branches/sage/ebofs2/messages/MMDSSlaveRequest.h deleted file mode 100644 index 5ef65223ec1c9..0000000000000 --- a/branches/sage/ebofs2/messages/MMDSSlaveRequest.h +++ /dev/null @@ -1,148 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMDSSLAVEREQUEST_H -#define __MMDSSLAVEREQUEST_H - -#include "msg/Message.h" -#include "mds/mdstypes.h" -#include "include/encodable.h" - -class MMDSSlaveRequest : public Message { - public: - static const int OP_XLOCK = 1; - static const int OP_XLOCKACK = -1; - static const int OP_UNXLOCK = 2; - static const int OP_AUTHPIN = 3; - static const int OP_AUTHPINACK = -3; - - static const int OP_LINKPREP = 4; - static const int OP_UNLINKPREP = 5; - static const int OP_LINKPREPACK = -4; - - static const int OP_RENAMEPREP = 7; - static const int OP_RENAMEPREPACK = -7; - - static const int OP_FINISH = 17; - - static const int OP_ABORT = 20; // used for recovery only - //static const int OP_COMMIT = 21; // used for recovery only - - - const static char *get_opname(int o) { - switch (o) { - case OP_XLOCK: return "xlock"; - case OP_XLOCKACK: return "xlock_ack"; - case OP_UNXLOCK: return "unxlock"; - case OP_AUTHPIN: return "authpin"; - case OP_AUTHPINACK: return "authpin_ack"; - - case OP_LINKPREP: return "link_prep"; - case OP_LINKPREPACK: return "link_prep_ack"; - case OP_UNLINKPREP: return "unlink_prep"; - - case OP_RENAMEPREP: return "rename_prep"; - case OP_RENAMEPREPACK: return "rename_prep_ack"; - - case OP_FINISH: return "finish"; // commit - case OP_ABORT: return "abort"; - //case OP_COMMIT: return "commit"; - - default: assert(0); return 0; - } - } - - private: - metareqid_t reqid; - char op; - - // for locking - char lock_type; // lock object type - MDSCacheObjectInfo object_info; - - // for authpins - list authpins; - - public: - // for rename prep - string srcdnpath; - string destdnpath; - set witnesses; - bufferlist inode_export; - version_t inode_export_v; - bufferlist srci_replica; - utime_t now; - - bufferlist stray; // stray dir + dentry - -public: - metareqid_t get_reqid() { return reqid; } - int get_op() { return op; } - bool is_reply() { return op < 0; } - - int get_lock_type() { return lock_type; } - MDSCacheObjectInfo 
&get_object_info() { return object_info; } - - list& get_authpins() { return authpins; } - - void set_lock_type(int t) { lock_type = t; } - - // ---- - MMDSSlaveRequest() : Message(MSG_MDS_SLAVE_REQUEST) { } - MMDSSlaveRequest(metareqid_t ri, int o) : - Message(MSG_MDS_SLAVE_REQUEST), - reqid(ri), op(o) { } - void encode_payload() { - ::_encode(reqid, payload); - ::_encode(op, payload); - ::_encode(lock_type, payload); - object_info._encode(payload); - ::_encode_complex(authpins, payload); - ::_encode(srcdnpath, payload); - ::_encode(destdnpath, payload); - ::_encode(witnesses, payload); - ::_encode(now, payload); - ::_encode(inode_export, payload); - ::_encode(inode_export_v, payload); - ::_encode(srci_replica, payload); - ::_encode(stray, payload); - } - void decode_payload() { - bufferlist::iterator p = payload.begin(); - ::_decode_simple(reqid, p); - ::_decode_simple(op, p); - ::_decode_simple(lock_type, p); - object_info._decode(p); - ::_decode_complex(authpins, p); - ::_decode_simple(srcdnpath, p); - ::_decode_simple(destdnpath, p); - ::_decode_simple(witnesses, p); - ::_decode_simple(now, p); - ::_decode_simple(inode_export, p); - ::_decode_simple(inode_export_v, p); - ::_decode_simple(srci_replica, p); - ::_decode_simple(stray, p); - } - - char *get_type_name() { return "slave_request"; } - void print(ostream& out) { - out << "slave_request(" << reqid - << " " << get_opname(op) - << ")"; - } - -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MMonCommand.h b/branches/sage/ebofs2/messages/MMonCommand.h deleted file mode 100644 index 19d25dd7a4d77..0000000000000 --- a/branches/sage/ebofs2/messages/MMonCommand.h +++ /dev/null @@ -1,54 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - 
* License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MMONCOMMAND_H -#define __MMONCOMMAND_H - -#include "msg/Message.h" - -#include -using std::vector; - -class MMonCommand : public Message { - public: - entity_inst_t inst; - vector cmd; - - MMonCommand() : Message(MSG_MON_COMMAND) {} - MMonCommand(entity_inst_t i) : - Message(MSG_MON_COMMAND), - inst(i) { } - - virtual char *get_type_name() { return "mon_command"; } - void print(ostream& o) { - o << "mon_command("; - for (unsigned i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MMONCOMMANDACK_H -#define __MMONCOMMANDACK_H - -#include "msg/Message.h" - -class MMonCommandAck : public Message { - public: - int r; - string rs; - - MMonCommandAck() : Message(MSG_MON_COMMAND_ACK) {} - MMonCommandAck(int _r, string s) : Message(MSG_MON_COMMAND_ACK), - r(_r), rs(s) { } - - virtual char *get_type_name() { return "mon_command"; } - void print(ostream& o) { - o << "mon_command_ack(" << r << " " << rs << ")"; - } - - void encode_payload() { - payload.append((char*)&r, sizeof(r)); - ::_encode(rs, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(r), (char*)&r); - off += sizeof(r); - ::_decode(rs, payload, off); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MMonElection.h b/branches/sage/ebofs2/messages/MMonElection.h deleted file mode 100644 index 14a29af9140f9..0000000000000 --- a/branches/sage/ebofs2/messages/MMonElection.h +++ /dev/null @@ -1,63 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the 
terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MMONELECTION_H -#define __MMONELECTION_H - -#include "msg/Message.h" - - -class MMonElection : public Message { -public: - static const int OP_PROPOSE = 1; - static const int OP_ACK = 2; - static const int OP_NAK = 3; - static const int OP_VICTORY = 4; - static const char *get_opname(int o) { - switch (o) { - case OP_PROPOSE: return "propose"; - case OP_ACK: return "ack"; - case OP_NAK: return "nak"; - case OP_VICTORY: return "victory"; - default: assert(0); return 0; - } - } - - int32_t op; - epoch_t epoch; - - MMonElection() : Message(MSG_MON_ELECTION) {} - MMonElection(int o, epoch_t e) : - Message(MSG_MON_ELECTION), - op(o), epoch(e) {} - - char *get_type_name() { return "election"; } - void print(ostream& out) { - out << "election(" << get_opname(op) << " " << epoch << ")"; - } - - void encode_payload() { - ::_encode(op, payload); - ::_encode(epoch, payload); - } - void decode_payload() { - int off = 0; - ::_decode(op, payload, off); - ::_decode(epoch, payload, off); - } - -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MMonElectionCollect.h b/branches/sage/ebofs2/messages/MMonElectionCollect.h deleted file mode 100644 index f9f0c12d1ac2e..0000000000000 --- a/branches/sage/ebofs2/messages/MMonElectionCollect.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMONELECTIONCOLLECT_H -#define __MMONELECTIONCOLLECT_H - -#include "msg/Message.h" - - -class MMonElectionCollect : public Message { - public: - int read_num; - - MMonElectionCollect() {} - MMonElectionCollect(int n) : - Message(MSG_MON_ELECTION_COLLECT), - read_num(n) {} - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(read_num), (char*)&read_num); - off += sizeof(read_num); - } - void encode_payload() { - payload.append((char*)&read_num, sizeof(read_num)); - } - - virtual char *get_type_name() { return "MonElCollect"; } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MMonElectionRefresh.h b/branches/sage/ebofs2/messages/MMonElectionRefresh.h deleted file mode 100644 index bc0337b8720dc..0000000000000 --- a/branches/sage/ebofs2/messages/MMonElectionRefresh.h +++ /dev/null @@ -1,52 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMONELECTIONREFRESH_H -#define __MMONELECTIONREFRESH_H - -#include "msg/Message.h" - -#include "mon/Elector.h" - -class MMonElectionRefresh : public Message { - public: - int p; - Elector::State state; - int refresh_num; - - MMonElectionRefresh() {} - MMonElectionRefresh(int _p, Elector::State& s, int r) : - Message(MSG_MON_ELECTION_REFRESH), - p(_p), state(s), refresh_num(r) {} - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(p), (char*)&p); - off += sizeof(p); - payload.copy(off, sizeof(state), (char*)&state); - off += sizeof(state); - payload.copy(off, sizeof(refresh_num), (char*)&refresh_num); - off += sizeof(refresh_num); - } - void encode_payload() { - payload.append((char*)&p, sizeof(p)); - payload.append((char*)&state, sizeof(state)); - payload.append((char*)&refresh_num, sizeof(refresh_num)); - } - - virtual char *get_type_name() { return "MonElRefresh"; } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MMonElectionStatus.h b/branches/sage/ebofs2/messages/MMonElectionStatus.h deleted file mode 100644 index f91e42d64b184..0000000000000 --- a/branches/sage/ebofs2/messages/MMonElectionStatus.h +++ /dev/null @@ -1,51 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMONELECTIONSTATUS_H -#define __MMONELECTIONSTATUS_H - -#include "msg/Message.h" - -#include "mon/Elector.h" - -class MMonElectionStatus : public Message { - public: - int q; - int read_num; - map registry; - - MMonElectionStatus() {} - MMonElectionStatus(int _q, int r, map reg) : - Message(MSG_MON_ELECTION_STATUS), - q(_q), read_num(r), registry(reg) {} - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(q), (char*)&q); - off += sizeof(q); - payload.copy(off, sizeof(read_num), (char*)&read_num); - off += sizeof(read_num); - ::_decode(registry, payload, off); - } - void encode_payload() { - payload.append((char*)&q, sizeof(q)); - payload.append((char*)&read_num, sizeof(read_num)); - ::_encode(registry, payload); - } - - virtual char *get_type_name() { return "MonElStatus"; } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MMonOSDMapInfo.h b/branches/sage/ebofs2/messages/MMonOSDMapInfo.h deleted file mode 100644 index 329c05e657d46..0000000000000 --- a/branches/sage/ebofs2/messages/MMonOSDMapInfo.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPINFO_H -#define __MMONOSDMAPINFO_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapInfo : public Message { - public: - epoch_t epoch; - epoch_t mon_epoch; - - epoch_t get_epoch() { return epoch; } - epoch_t get_mon_epoch() { return mon_epoch; } - - MMonOSDMapInfo(epoch_t e, epoch_t me) : - Message(MSG_MON_OSDMAP_UPDATE_PREPARE), - epoch(e), mon_epoch(me) { - } - - char *get_type_name() { return "omap_info"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - payload.append((char*)&mon_epoch, sizeof(mon_epoch)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - payload.copy(off, sizeof(mon_epoch), (char*)&mon_epoch); - off += sizeof(mon_epoch); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MMonOSDMapLease.h b/branches/sage/ebofs2/messages/MMonOSDMapLease.h deleted file mode 100644 index 3f4ed8ea4db85..0000000000000 --- a/branches/sage/ebofs2/messages/MMonOSDMapLease.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPLEASE_H -#define __MMONOSDMAPLEASE_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapLease : public Message { - epoch_t epoch; - utime_t lease_expire; - - public: - epoch_t get_epoch() { return epoch; } - const utime_t& get_lease_expire() { return lease_expire; } - - MMonOSDMapLease(epoch_t e, utime_t le) : - Message(MSG_MON_OSDMAP_LEASE), - epoch(e), lease_expire(le) { - } - - char *get_type_name() { return "omap_lease"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - payload.append((char*)&lease_expire, sizeof(lease_expire)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - payload.copy(off, sizeof(lease_expire), (char*)&lease_expire); - off += sizeof(lease_expire); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MMonOSDMapLeaseAck.h b/branches/sage/ebofs2/messages/MMonOSDMapLeaseAck.h deleted file mode 100644 index 449a0ac61a84f..0000000000000 --- a/branches/sage/ebofs2/messages/MMonOSDMapLeaseAck.h +++ /dev/null @@ -1,45 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPLEASEACK_H -#define __MMONOSDMAPLEASEACK_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapLeaseAck : public Message { - epoch_t epoch; - -public: - epoch_t get_epoch() { return epoch; } - - MMonOSDMapLeaseAck(epoch_t e) : - Message(MSG_MON_OSDMAP_LEASE_ACK), - epoch(e) { - } - - char *get_type_name() { return "omap_lease_ack"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MMonOSDMapUpdateAck.h b/branches/sage/ebofs2/messages/MMonOSDMapUpdateAck.h deleted file mode 100644 index 9655548dfcb00..0000000000000 --- a/branches/sage/ebofs2/messages/MMonOSDMapUpdateAck.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPUPDATEACK_H -#define __MMONOSDMAPUPDATEACK_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapUpdateAck : public Message { -public: - epoch_t epoch; - - MMonOSDMapUpdateAck(epoch_t e) : - Message(MSG_MON_OSDMAP_UPDATE_ACK), - epoch(e) { - } - - char *get_type_name() { return "omap_update_ack"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MMonOSDMapUpdateCommit.h b/branches/sage/ebofs2/messages/MMonOSDMapUpdateCommit.h deleted file mode 100644 index 8aa6929c2ed9a..0000000000000 --- a/branches/sage/ebofs2/messages/MMonOSDMapUpdateCommit.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPUPDATECOMMIT_H -#define __MMONOSDMAPUPDATECOMMIT_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapUpdateCommit : public Message { - public: - epoch_t epoch; - - MMonOSDMapUpdateCommit(epoch_t e) : - Message(MSG_MON_OSDMAP_UPDATE_COMMIT), - epoch(e) { - } - - char *get_type_name() { return "omap_update_commit"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MMonOSDMapUpdatePrepare.h b/branches/sage/ebofs2/messages/MMonOSDMapUpdatePrepare.h deleted file mode 100644 index 8e908e2ed0664..0000000000000 --- a/branches/sage/ebofs2/messages/MMonOSDMapUpdatePrepare.h +++ /dev/null @@ -1,53 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPUPDATEPREPARE_H -#define __MMONOSDMAPUPDATEPREPARE_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapUpdatePrepare : public Message { - public: - epoch_t epoch; - bufferlist map_bl; - bufferlist inc_map_bl; - - epoch_t get_epoch() { return epoch; } - - MMonOSDMapUpdatePrepare(epoch_t e, - bufferlist& mbl, bufferlist& incmbl) : - Message(MSG_MON_OSDMAP_UPDATE_PREPARE), - epoch(e), - map_bl(mbl), inc_map_bl(incmbl) { - } - - char *get_type_name() { return "omap_update_prepare"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - ::_encode(map_bl, payload); - ::_encode(inc_map_bl, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - ::_decode(map_bl, payload, off); - ::_decode(inc_map_bl, payload, off); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MMonPaxos.h b/branches/sage/ebofs2/messages/MMonPaxos.h deleted file mode 100644 index 7210b179c9a42..0000000000000 --- a/branches/sage/ebofs2/messages/MMonPaxos.h +++ /dev/null @@ -1,98 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMONPAXOS_H -#define __MMONPAXOS_H - -#include "msg/Message.h" -#include "mon/mon_types.h" - -class MMonPaxos : public Message { - public: - // op types - const static int OP_COLLECT = 1; // proposer: propose round - const static int OP_LAST = 2; // voter: accept proposed round - const static int OP_BEGIN = 3; // proposer: value proposed for this round - const static int OP_ACCEPT = 4; // voter: accept propsed value - const static int OP_COMMIT = 5; // proposer: notify learners of agreed value - const static int OP_LEASE = 6; // leader: extend peon lease - const static int OP_LEASE_ACK = 7; // peon: lease ack - const static char *get_opname(int op) { - switch (op) { - case OP_COLLECT: return "collect"; - case OP_LAST: return "last"; - case OP_BEGIN: return "begin"; - case OP_ACCEPT: return "accept"; - case OP_COMMIT: return "commit"; - case OP_LEASE: return "lease"; - case OP_LEASE_ACK: return "lease_ack"; - default: assert(0); return 0; - } - } - - epoch_t epoch; // monitor epoch - int op; // paxos op - int machine_id; // which state machine? 
- - version_t last_committed; // i've committed to - version_t pn_from; // i promise to accept after - version_t pn; // with with proposal - version_t uncommitted_pn; // previous pn, if we are a LAST with an uncommitted value - utime_t lease_expire; - - map values; - - MMonPaxos() : Message(MSG_MON_PAXOS) {} - MMonPaxos(epoch_t e, int o, int mid) : - Message(MSG_MON_PAXOS), - epoch(e), - op(o), machine_id(mid), - last_committed(0), pn_from(0), pn(0), uncommitted_pn(0) { } - - virtual char *get_type_name() { return "paxos"; } - - void print(ostream& out) { - out << "paxos(" << get_paxos_name(machine_id) - << " " << get_opname(op) << " lc " << last_committed - << " pn " << pn << " opn " << uncommitted_pn - << ")"; - } - - void encode_payload() { - ::_encode(epoch, payload); - ::_encode(op, payload); - ::_encode(machine_id, payload); - ::_encode(last_committed, payload); - ::_encode(pn_from, payload); - ::_encode(pn, payload); - ::_encode(uncommitted_pn, payload); - ::_encode(lease_expire, payload); - ::_encode(values, payload); - } - void decode_payload() { - int off = 0; - ::_decode(epoch, payload, off); - ::_decode(op, payload, off); - ::_decode(machine_id, payload, off); - ::_decode(last_committed, payload, off); - ::_decode(pn_from, payload, off); - ::_decode(pn, payload, off); - ::_decode(uncommitted_pn, payload, off); - ::_decode(lease_expire, payload, off); - ::_decode(values, payload, off); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MOSDBoot.h b/branches/sage/ebofs2/messages/MOSDBoot.h deleted file mode 100644 index 00c94ad1a2a80..0000000000000 --- a/branches/sage/ebofs2/messages/MOSDBoot.h +++ /dev/null @@ -1,51 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License 
version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MOSDBOOT_H -#define __MOSDBOOT_H - -#include "msg/Message.h" - -#include "include/types.h" -#include "osd/osd_types.h" - -class MOSDBoot : public Message { - public: - entity_inst_t inst; - OSDSuperblock sb; - - MOSDBoot() {} - MOSDBoot(entity_inst_t i, OSDSuperblock& s) : - Message(MSG_OSD_BOOT), - inst(i), - sb(s) { - } - - char *get_type_name() { return "osd_boot"; } - void print(ostream& out) { - out << "osd_boot(" << inst << ")"; - } - - void encode_payload() { - ::_encode(inst, payload); - ::_encode(sb, payload); - } - void decode_payload() { - int off = 0; - ::_decode(inst, payload, off); - ::_decode(sb, payload, off); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MOSDFailure.h b/branches/sage/ebofs2/messages/MOSDFailure.h deleted file mode 100644 index adc4e700a4f85..0000000000000 --- a/branches/sage/ebofs2/messages/MOSDFailure.h +++ /dev/null @@ -1,55 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDFAILURE_H -#define __MOSDFAILURE_H - -#include "msg/Message.h" - - -class MOSDFailure : public Message { - public: - entity_inst_t from; - entity_inst_t failed; - epoch_t epoch; - - MOSDFailure() {} - MOSDFailure(entity_inst_t fr, entity_inst_t f, epoch_t e) : - Message(MSG_OSD_FAILURE), - from(fr), failed(f), epoch(e) {} - - entity_inst_t get_from() { return from; } - entity_inst_t get_failed() { return failed; } - epoch_t get_epoch() { return epoch; } - - void decode_payload() { - int off = 0; - ::_decode(from, payload, off); - ::_decode(failed, payload, off); - ::_decode(epoch, payload, off); - } - void encode_payload() { - ::_encode(from, payload); - ::_encode(failed, payload); - ::_encode(epoch, payload); - } - - virtual char *get_type_name() { return "osd_failure"; } - /*void print(ostream& out) { - out << "osd_failure(" << failed << " e" << epoch << ")"; - }*/ -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MOSDGetMap.h b/branches/sage/ebofs2/messages/MOSDGetMap.h deleted file mode 100644 index 25f94ef3bcc92..0000000000000 --- a/branches/sage/ebofs2/messages/MOSDGetMap.h +++ /dev/null @@ -1,51 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MOSDGETMAP_H -#define __MOSDGETMAP_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MOSDGetMap : public Message { - public: - epoch_t start, want; - - MOSDGetMap(epoch_t s=0, epoch_t w=0) : - Message(MSG_OSD_GETMAP), - start(s), want(w) { } - - epoch_t get_start_epoch() { return start; } - epoch_t get_want_epoch() { return want; } - - char *get_type_name() { return "get_osd_map"; } - void print(ostream& out) { - out << "get_osd_map(have " << start; - if (want) out << " want " << want; - out << ")"; - } - - void encode_payload() { - ::_encode(start, payload); - ::_encode(want, payload); - } - void decode_payload() { - int off = 0; - ::_decode(start, payload, off); - ::_decode(want, payload, off); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MOSDIn.h b/branches/sage/ebofs2/messages/MOSDIn.h deleted file mode 100644 index 8f8cb4b7877ae..0000000000000 --- a/branches/sage/ebofs2/messages/MOSDIn.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __MOSDIN_H -#define __MOSDIN_H - -#include "msg/Message.h" - - -class MOSDIn : public Message { - public: - epoch_t map_epoch; - - MOSDIn(epoch_t e) : Message(MSG_OSD_IN), map_epoch(e) { - } - MOSDIn() {} - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_epoch), (char*)&map_epoch); - off += sizeof(map_epoch); - } - virtual void encode_payload() { - payload.append((char*)&map_epoch, sizeof(map_epoch)); - } - - virtual char *get_type_name() { return "oin"; } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MOSDMap.h b/branches/sage/ebofs2/messages/MOSDMap.h deleted file mode 100644 index 525ed82ae5c29..0000000000000 --- a/branches/sage/ebofs2/messages/MOSDMap.h +++ /dev/null @@ -1,71 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDGETMAPACK_H -#define __MOSDGETMAPACK_H - -#include "msg/Message.h" -#include "osd/OSDMap.h" - - -class MOSDMap : public Message { - public: - map maps; - map incremental_maps; - - epoch_t get_first() { - epoch_t e = 0; - map::iterator i = maps.begin(); - if (i != maps.end()) e = i->first; - i = incremental_maps.begin(); - if (i != incremental_maps.end() && - (e == 0 || i->first < e)) e = i->first; - return e; - } - epoch_t get_last() { - epoch_t e = 0; - map::reverse_iterator i = maps.rbegin(); - if (i != maps.rend()) e = i->first; - i = incremental_maps.rbegin(); - if (i != incremental_maps.rend() && - (e == 0 || i->first > e)) e = i->first; - return e; - } - - - MOSDMap() : Message(MSG_OSD_MAP) { } - MOSDMap(OSDMap *oc) : Message(MSG_OSD_MAP) { - oc->encode(maps[oc->get_epoch()]); - } - - - // marshalling - virtual void decode_payload() { - int off = 0; - ::_decode(maps, payload, off); - ::_decode(incremental_maps, payload, off); - } - virtual void encode_payload() { - ::_encode(maps, payload); - ::_encode(incremental_maps, payload); - } - - virtual char *get_type_name() { return "omap"; } - void print(ostream& out) { - out << "osd_map(" << get_first() << "," << get_last() << ")"; - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MOSDOp.h b/branches/sage/ebofs2/messages/MOSDOp.h deleted file mode 100644 index 7ac401bd75a69..0000000000000 --- a/branches/sage/ebofs2/messages/MOSDOp.h +++ /dev/null @@ -1,280 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDOP_H -#define __MOSDOP_H - -#include "msg/Message.h" -#include "osd/osd_types.h" - -/* - * OSD op - * - * oid - object id - * op - OSD_OP_DELETE, etc. - * - */ - -// osd client ops -#define OSD_OP_READ 1 -#define OSD_OP_STAT 2 - -#define OSD_OP_REPLICATE 3 -#define OSD_OP_UNREPLICATE 4 - -#define OSD_OP_WRNOOP 10 -#define OSD_OP_WRITE 11 -#define OSD_OP_DELETE 12 -#define OSD_OP_TRUNCATE 13 -#define OSD_OP_ZERO 14 - -#define OSD_OP_WRLOCK 20 -#define OSD_OP_WRUNLOCK 21 -#define OSD_OP_RDLOCK 22 -#define OSD_OP_RDUNLOCK 23 -#define OSD_OP_UPLOCK 24 -#define OSD_OP_DNLOCK 25 -#define OSD_OP_MININCLOCK 26 // minimum incarnation lock - -#define OSD_OP_PULL 30 -#define OSD_OP_PUSH 31 - -#define OSD_OP_BALANCEREADS 101 -#define OSD_OP_UNBALANCEREADS 102 - - - -class MOSDOp : public Message { -public: - static const char* get_opname(int op) { - switch (op) { - case OSD_OP_READ: return "read"; - case OSD_OP_STAT: return "stat"; - - case OSD_OP_WRNOOP: return "wrnoop"; - case OSD_OP_WRITE: return "write"; - case OSD_OP_ZERO: return "zero"; - case OSD_OP_DELETE: return "delete"; - case OSD_OP_TRUNCATE: return "truncate"; - case OSD_OP_WRLOCK: return "wrlock"; - case OSD_OP_WRUNLOCK: return "wrunlock"; - case OSD_OP_RDLOCK: return "rdlock"; - case OSD_OP_RDUNLOCK: return "rdunlock"; - case OSD_OP_UPLOCK: return "uplock"; - case OSD_OP_DNLOCK: return "dnlock"; - - case OSD_OP_MININCLOCK: return "mininclock"; - - case OSD_OP_BALANCEREADS: return "balance-reads"; - case OSD_OP_UNBALANCEREADS: return "unbalance-reads"; - - case OSD_OP_PULL: return "pull"; - case OSD_OP_PUSH: return "push"; - default: assert(0); - } - return 0; - } - -private: - struct st_ { - // who's asking? - entity_inst_t client; - osdreqid_t reqid; // minor weirdness: entity_name_t is in reqid_t too. 
- - // for replication - tid_t rep_tid; - - object_t oid; - objectrev_t rev; - ObjectLayout layout; - - epoch_t map_epoch; - - eversion_t pg_trim_to; // primary->replica: trim to here - - int32_t op; - off_t offset, length; - - eversion_t version; - eversion_t old_version; - - bool want_ack; - bool want_commit; - bool retry_attempt; - - int shed_count; - osd_peer_stat_t peer_stat; - } st; - - bufferlist data; - map attrset; - - - friend class MOSDOpReply; - -public: - const osdreqid_t& get_reqid() { return st.reqid; } - const tid_t get_client_tid() { return st.reqid.tid; } - int get_client_inc() { return st.reqid.inc; } - - const entity_name_t& get_client() { return st.client.name; } - const entity_inst_t& get_client_inst() { return st.client; } - void set_client_inst(const entity_inst_t& i) { st.client = i; } - - bool wants_reply() { - if (st.op < 100) return true; - return false; // no reply needed for primary-lock, -unlock. - } - - const tid_t get_rep_tid() { return st.rep_tid; } - void set_rep_tid(tid_t t) { st.rep_tid = t; } - - bool get_retry_attempt() const { return st.retry_attempt; } - void set_retry_attempt(bool a) { st.retry_attempt = a; } - - const object_t get_oid() { return st.oid; } - const pg_t get_pg() { return st.layout.pgid; } - const ObjectLayout& get_layout() { return st.layout; } - const epoch_t get_map_epoch() { return st.map_epoch; } - - //const int get_pg_role() { return st.pg_role; } // who am i asking for? 
- const eversion_t get_version() { return st.version; } - //const eversion_t get_old_version() { return st.old_version; } - - void set_rev(objectrev_t r) { st.rev = r; } - objectrev_t get_rev() { return st.rev; } - - const eversion_t get_pg_trim_to() { return st.pg_trim_to; } - void set_pg_trim_to(eversion_t v) { st.pg_trim_to = v; } - - const int get_op() { return st.op; } - void set_op(int o) { st.op = o; } - bool is_read() { - return st.op < 10; - } - - const off_t get_length() { return st.length; } - const off_t get_offset() { return st.offset; } - - map& get_attrset() { return attrset; } - void set_attrset(map &as) { attrset.swap(as); } - - const bool wants_ack() { return st.want_ack; } - const bool wants_commit() { return st.want_commit; } - - void set_peer_stat(const osd_peer_stat_t& stat) { st.peer_stat = stat; } - const osd_peer_stat_t& get_peer_stat() { return st.peer_stat; } - void inc_shed_count() { st.shed_count++; } - int get_shed_count() { return st.shed_count; } - - void set_data(bufferlist &d) { - data.claim(d); - } - bufferlist& get_data() { - return data; - } - off_t get_data_len() { return data.length(); } - - - MOSDOp(entity_inst_t asker, int inc, long tid, - object_t oid, ObjectLayout ol, epoch_t mapepoch, int op) : - Message(MSG_OSD_OP) { - memset(&st, 0, sizeof(st)); - this->st.client = asker; - this->st.reqid.name = asker.name; - this->st.reqid.inc = inc; - this->st.reqid.tid = tid; - - this->st.oid = oid; - this->st.layout = ol; - this->st.map_epoch = mapepoch; - this->st.op = op; - - this->st.rep_tid = 0; - - this->st.want_ack = true; - this->st.want_commit = true; - } - MOSDOp() {} - - //void set_pg_role(int r) { st.pg_role = r; } - //void set_rg_nrep(int n) { st.rg_nrep = n; } - - void set_layout(const ObjectLayout& l) { st.layout = l; } - - void set_length(off_t l) { st.length = l; } - void set_offset(off_t o) { st.offset = o; } - void set_version(eversion_t v) { st.version = v; } - void set_old_version(eversion_t ov) { st.old_version 
= ov; } - - void set_want_ack(bool b) { st.want_ack = b; } - void set_want_commit(bool b) { st.want_commit = b; } - - // marshalling - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - ::_decode(attrset, payload, off); - ::_decode(data, payload, off); - } - - static void add_payload_chunk_breaks(int from, int off, int len, - list& breaks) { - if (len > 0 && - len & 4095 == 0 && - off & 4095 == 0) { - // page-sized and aligned data? easy. - breaks.push_back(from); - } else if (len > 8192) { - // there is at least 1 full page in there. somewhere. - int p = 0; - - // leading partial page? - if (off & 4095 != 0) - p = 4096 - (off & 4095); - - // full page(s) - breaks.push_back(from + p); - p += (len - p) & (~4095); - - // tail bit? - if (p != len) - breaks.push_back(from + p); - } - } - - virtual void encode_payload() { - ::_encode(st, payload); - ::_encode(attrset, payload); - add_payload_chunk_breaks(payload.length() + 4, - st.offset, data.length(), - chunk_payload_at); - ::_encode(data, payload); - } - - virtual char *get_type_name() { return "osd_op"; } - void print(ostream& out) { - out << "osd_op(" << st.reqid - << " " << get_opname(st.op) - << " " << st.oid; - if (st.length) out << " " << st.offset << "~" << st.length; - if (st.retry_attempt) out << " RETRY"; - out << ")"; - } -}; - - -#endif diff --git a/branches/sage/ebofs2/messages/MOSDOpReply.h b/branches/sage/ebofs2/messages/MOSDOpReply.h deleted file mode 100644 index 3c567397e6a2d..0000000000000 --- a/branches/sage/ebofs2/messages/MOSDOpReply.h +++ /dev/null @@ -1,164 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free 
Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MOSDOPREPLY_H -#define __MOSDOPREPLY_H - -#include "msg/Message.h" - -#include "MOSDOp.h" -#include "osd/ObjectStore.h" - -/* - * OSD op reply - * - * oid - object id - * op - OSD_OP_DELETE, etc. - * - */ - -class MOSDOpReply : public Message { - struct { - // req - osdreqid_t reqid; - - tid_t rep_tid; - - object_t oid; - ObjectLayout layout; // pgid, etc. - - int32_t op; - - // reply - int32_t result; - bool commit; - off_t length, offset; - off_t object_size; - eversion_t version; - - eversion_t pg_complete_thru; - - epoch_t map_epoch; - - osd_peer_stat_t peer_stat; - } st; - - bufferlist data; - map attrset; - - public: - const osdreqid_t& get_reqid() { return st.reqid; } - long get_tid() { return st.reqid.tid; } - long get_rep_tid() { return st.rep_tid; } - object_t get_oid() { return st.oid; } - pg_t get_pg() { return st.layout.pgid; } - int get_op() { return st.op; } - bool get_commit() { return st.commit; } - - int get_result() { return st.result; } - off_t get_length() { return st.length; } - off_t get_offset() { return st.offset; } - off_t get_object_size() { return st.object_size; } - eversion_t get_version() { return st.version; } - map& get_attrset() { return attrset; } - - eversion_t get_pg_complete_thru() { return st.pg_complete_thru; } - void set_pg_complete_thru(eversion_t v) { st.pg_complete_thru = v; } - - void set_result(int r) { st.result = r; } - void set_length(off_t s) { st.length = s; } - void set_offset(off_t o) { st.offset = o; } - void set_object_size(off_t s) { st.object_size = s; } - void set_version(eversion_t v) { st.version = v; } - void set_attrset(map &as) { attrset = as; } - - void set_op(int op) { st.op = op; } - void set_rep_tid(tid_t t) { st.rep_tid = t; } - - void set_peer_stat(const osd_peer_stat_t& stat) { st.peer_stat = stat; } - const osd_peer_stat_t& get_peer_stat() { return st.peer_stat; } - - // data payload - void set_data(bufferlist &d) { - 
data.claim(d); - } - bufferlist& get_data() { - return data; - } - - // osdmap - epoch_t get_map_epoch() { return st.map_epoch; } - - -public: - MOSDOpReply(MOSDOp *req, int result, epoch_t e, bool commit) : - Message(MSG_OSD_OPREPLY) { - memset(&st, 0, sizeof(st)); - this->st.reqid = req->st.reqid; - this->st.op = req->st.op; - this->st.rep_tid = req->st.rep_tid; - - this->st.oid = req->st.oid; - this->st.layout = req->st.layout; - this->st.result = result; - this->st.commit = commit; - - this->st.length = req->st.length; // speculative... OSD should ensure these are correct - this->st.offset = req->st.offset; - this->st.version = req->st.version; - - this->st.map_epoch = e; - } - MOSDOpReply() {} - - - // marshalling - virtual void decode_payload() { - payload.copy(0, sizeof(st), (char*)&st); - payload.splice(0, sizeof(st)); - int off = 0; - ::_decode(attrset, payload, off); - ::_decode(data, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&st, sizeof(st)); - ::_encode(attrset, payload); - MOSDOp::add_payload_chunk_breaks(payload.length() + 4, - st.offset, data.length(), - chunk_payload_at); - ::_encode(data, payload); - } - - virtual char *get_type_name() { return "osd_op_reply"; } - - void print(ostream& out) { - out << "osd_op_reply(" << st.reqid - << " " << MOSDOp::get_opname(st.op) - << " " << st.oid; - if (st.length) out << " " << st.offset << "~" << st.length; - if (st.op >= 10) { - if (st.commit) - out << " commit"; - else - out << " ack"; - } - out << " = " << st.result; - out << ")"; - } - -}; - - -#endif diff --git a/branches/sage/ebofs2/messages/MOSDOut.h b/branches/sage/ebofs2/messages/MOSDOut.h deleted file mode 100644 index 798356f663f9e..0000000000000 --- a/branches/sage/ebofs2/messages/MOSDOut.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - 
* This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __MOSDOUT_H -#define __MOSDOUT_H - -#include "msg/Message.h" - - -class MOSDOut : public Message { - public: - epoch_t map_epoch; - - MOSDOut(epoch_t e) : Message(MSG_OSD_OUT), map_epoch(e) { - } - MOSDOut() {} - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_epoch), (char*)&map_epoch); - off += sizeof(map_epoch); - } - virtual void encode_payload() { - payload.append((char*)&map_epoch, sizeof(map_epoch)); - } - - virtual char *get_type_name() { return "oout"; } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MOSDPGActivateSet.h b/branches/sage/ebofs2/messages/MOSDPGActivateSet.h deleted file mode 100644 index cdee7996e9647..0000000000000 --- a/branches/sage/ebofs2/messages/MOSDPGActivateSet.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGACTIVATESET_H -#define __MOSDPGACTIVATESET_H - -#include "msg/Message.h" - -class MOSDPGActivateSet : public Message { - epoch_t epoch; - -public: - list pg_info; - - epoch_t get_epoch() { return epoch; } - - MOSDPGActivateSet() {} - MOSDPGActivateSet(version_t mv) : - Message(MSG_OSD_PG_ACTIVATE_SET), - epoch(mv) { } - - char *get_type_name() { return "pg_activate_set"; } - void print(ostream& out) { - out << "pg_activate_set(" << pg_info.size() << " pgs e" << epoch << ")"; - } - - void encode_payload() { - ::_encode(epoch, payload); - ::_encode(pg_info, payload); - } - void decode_payload() { - int off = 0; - ::_decode(epoch, payload, off); - ::_decode(pg_info, payload, off); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MOSDPGLog.h b/branches/sage/ebofs2/messages/MOSDPGLog.h deleted file mode 100644 index 653bb9f10570c..0000000000000 --- a/branches/sage/ebofs2/messages/MOSDPGLog.h +++ /dev/null @@ -1,59 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGLOG_H -#define __MOSDPGLOG_H - -#include "msg/Message.h" - -class MOSDPGLog : public Message { - epoch_t epoch; - -public: - PG::Info info; - PG::Log log; - PG::Missing missing; - - epoch_t get_epoch() { return epoch; } - pg_t get_pgid() { return info.pgid; } - - MOSDPGLog() {} - MOSDPGLog(version_t mv, PG::Info& i) : - Message(MSG_OSD_PG_LOG), - epoch(mv), info(i) { } - - char *get_type_name() { return "PGlog"; } - void print(ostream& out) { - out << "pg_log(" << info.pgid << " e" << epoch << ")"; - } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - payload.append((char*)&info, sizeof(info)); - log._encode(payload); - missing._encode(payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - payload.copy(off, sizeof(info), (char*)&info); - off += sizeof(info); - log._decode(payload, off); - missing._decode(payload, off); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MOSDPGNotify.h b/branches/sage/ebofs2/messages/MOSDPGNotify.h deleted file mode 100644 index 76a984276b66b..0000000000000 --- a/branches/sage/ebofs2/messages/MOSDPGNotify.h +++ /dev/null @@ -1,55 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MOSDPGPEERNOTIFY_H -#define __MOSDPGPEERNOTIFY_H - -#include "msg/Message.h" - -#include "osd/PG.h" - -/* - * PGNotify - notify primary of my PGs and versions. 
- */ - -class MOSDPGNotify : public Message { - epoch_t epoch; - list pg_list; // pgid -> version - - public: - version_t get_epoch() { return epoch; } - list& get_pg_list() { return pg_list; } - - MOSDPGNotify() {} - MOSDPGNotify(epoch_t e, list& l) : - Message(MSG_OSD_PG_NOTIFY) { - this->epoch = e; - pg_list.splice(pg_list.begin(),l); - } - - char *get_type_name() { return "PGnot"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - _encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - _decode(pg_list, payload, off); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MOSDPGPeer.h b/branches/sage/ebofs2/messages/MOSDPGPeer.h deleted file mode 100644 index dd3164cdc1124..0000000000000 --- a/branches/sage/ebofs2/messages/MOSDPGPeer.h +++ /dev/null @@ -1,58 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGPEER_H -#define __MOSDPGPEER_H - -#include "msg/Message.h" - - -class MOSDPGPeer : public Message { - uint64_t map_version; - list pg_list; - - bool complete; - - public: - uint64_t get_version() { return map_version; } - list& get_pg_list() { return pg_list; } - bool get_complete() { return complete; } - - MOSDPGPeer() {} - MOSDPGPeer(uint64_t v, list& l, bool c=false) : - Message(MSG_OSD_PG_PEER) { - this->map_version = v; - this->complete = c; - pg_list.splice(pg_list.begin(), l); - } - - char *get_type_name() { return "PGPeer"; } - - void encode_payload() { - payload.append((char*)&map_version, sizeof(map_version)); - payload.append((char*)&complete, sizeof(complete)); - _encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_version), (char*)&map_version); - off += sizeof(map_version); - payload.copy(off, sizeof(complete), (char*)&complete); - off += sizeof(complete); - _decode(pg_list, payload, off); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MOSDPGPeerAck.h b/branches/sage/ebofs2/messages/MOSDPGPeerAck.h deleted file mode 100644 index dc4fac1a9436b..0000000000000 --- a/branches/sage/ebofs2/messages/MOSDPGPeerAck.h +++ /dev/null @@ -1,70 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MOSDPGPEERACK_H -#define __MOSDPGPEERACK_H - -#include "msg/Message.h" -#include "osd/OSD.h" - -class MOSDPGPeerAck : public Message { - version_t map_version; - - public: - list pg_dne; // pg dne - map pg_state; // state, lists, etc. 
- - version_t get_version() { return map_version; } - - MOSDPGPeerAck() {} - MOSDPGPeerAck(version_t v) : - Message(MSG_OSD_PG_PEERACK) { - this->map_version = v; - } - - char *get_type_name() { return "PGPeer"; } - - void encode_payload() { - payload.append((char*)&map_version, sizeof(map_version)); - _encode(pg_dne, payload); - - int n = pg_state.size(); - payload.append((char*)&n, sizeof(n)); - for (map::iterator it = pg_state.begin(); - it != pg_state.end(); - it++) { - payload.append((char*)&it->first, sizeof(it->first)); - it->second._encode(payload); - } - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_version), (char*)&map_version); - off += sizeof(map_version); - _decode(pg_dne, payload, off); - - int n; - payload.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPEERREQUEST_H -#define __MOSDPEERREQUEST_H - -#include "msg/Message.h" - - -class MOSDPGPeerRequest : public Message { - version_t map_version; - list pg_list; - - public: - version_t get_version() { return map_version; } - list& get_pg_list() { return pg_list; } - - MOSDPGPeerRequest() {} - MOSDPGPeerRequest(version_t v, list& l) : - Message(MSG_OSD_PG_PEERREQUEST) { - this->map_version = v; - pg_list.splice(pg_list.begin(), l); - } - - char *get_type_name() { return "PGPR"; } - - void encode_payload() { - payload.append((char*)&map_version, sizeof(map_version)); - _encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_version), (char*)&map_version); - off += sizeof(map_version); - _decode(pg_list, payload, off); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MOSDPGQuery.h b/branches/sage/ebofs2/messages/MOSDPGQuery.h deleted file mode 100644 index 70dbfdbb96fd7..0000000000000 --- a/branches/sage/ebofs2/messages/MOSDPGQuery.h +++ /dev/null @@ -1,52 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGQUERY_H -#define __MOSDPGQUERY_H - -#include "msg/Message.h" - -/* - * PGQuery - query another OSD as to the contents of their PGs - */ - -class MOSDPGQuery : public Message { - version_t epoch; - - public: - version_t get_epoch() { return epoch; } - map pg_list; - - MOSDPGQuery() {} - MOSDPGQuery(epoch_t e, map& ls) : - Message(MSG_OSD_PG_QUERY), - epoch(e), pg_list(ls) { - } - - char *get_type_name() { return "PGq"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - ::_encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - ::_decode(pg_list, payload, off); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MOSDPGRemove.h b/branches/sage/ebofs2/messages/MOSDPGRemove.h deleted file mode 100644 index 17cb28a3c95a1..0000000000000 --- a/branches/sage/ebofs2/messages/MOSDPGRemove.h +++ /dev/null @@ -1,52 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGREMOVE_H -#define __MOSDPGREMOVE_H - -#include "msg/Message.h" - - -class MOSDPGRemove : public Message { - epoch_t epoch; - - public: - set pg_list; - - epoch_t get_epoch() { return epoch; } - - MOSDPGRemove() {} - MOSDPGRemove(epoch_t e, set& l) : - Message(MSG_OSD_PG_REMOVE) { - this->epoch = e; - pg_list = l; - } - - char *get_type_name() { return "PGrm"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - _encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - _decode(pg_list, payload, off); - } - -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MOSDPGSummary.h b/branches/sage/ebofs2/messages/MOSDPGSummary.h deleted file mode 100644 index 0dcebffaf74da..0000000000000 --- a/branches/sage/ebofs2/messages/MOSDPGSummary.h +++ /dev/null @@ -1,69 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGQUERYREPLY_H -#define __MOSDPGQUERYREPLY_H - -#include "msg/Message.h" - -class MOSDPGSummary : public Message { - epoch_t epoch; - pg_t pgid; - -public: - PG::PGInfo info; - bufferlist sumbl; - - epoch_t get_epoch() { return epoch; } - - MOSDPGSummary() {} - MOSDPGSummary(version_t mv, pg_t pgid, PG::PGSummary &summary) : - Message(MSG_OSD_PG_SUMMARY) { - this->epoch = mv; - this->pgid = pgid; - summary._encode(sumbl); - } - - pg_t get_pgid() { return pgid; } - bufferlist& get_summary_bl() { - return sumbl; - } - - char *get_type_name() { return "PGsum"; } - void print(ostream& out) { - out << "pg_summary(" << pgid << " e" << epoch << ")"; - } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - payload.append((char*)&pgid, sizeof(pgid)); - payload.append((char*)&info, sizeof(info)); - payload.claim_append(sumbl); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - payload.copy(off, sizeof(pgid), (char*)&pgid); - off += sizeof(pgid); - payload.copy(off, sizeof(info), (char*)&info); - off += sizeof(info); - - payload.splice(0, off); - sumbl.claim(payload); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MOSDPGUpdate.h b/branches/sage/ebofs2/messages/MOSDPGUpdate.h deleted file mode 100644 index 869c02e18c156..0000000000000 --- a/branches/sage/ebofs2/messages/MOSDPGUpdate.h +++ /dev/null @@ -1,71 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGUPDATE_H -#define __MOSDPGUPDATE_H - -#include "msg/Message.h" - -class MOSDPGUpdate : public Message { - version_t map_version; - pg_t pgid; - //pginfo_t info; - bool complete; - version_t last_any_complete; - - public: - version_t get_version() { return map_version; } - pg_t get_pgid() { return pgid; } - //pginfo_t& get_pginfo() { return info; } - bool is_complete() { return complete; } - version_t get_last_any_complete() { return last_any_complete; } - - MOSDPGUpdate() {} - MOSDPGUpdate(version_t mv, pg_t pgid, bool complete, version_t last_any_complete) : - Message(MSG_OSD_PG_UPDATE) { - this->map_version = mv; - this->pgid = pgid; - this->complete = complete; - this->last_any_complete = last_any_complete; - } - - char *get_type_name() { return "PGUp"; } - void print(ostream& out) { - out << "pg_update(" << pgid << " e" << map_version; - if (complete) out << " complete"; - out << " lac=" << last_any_complete; - out << ")"; - } - - void encode_payload() { - payload.append((char*)&map_version, sizeof(map_version)); - payload.append((char*)&pgid, sizeof(pgid)); - payload.append((char*)&complete, sizeof(complete)); - payload.append((char*)&last_any_complete, sizeof(last_any_complete)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_version), (char*)&map_version); - off += sizeof(map_version); - payload.copy(off, sizeof(pgid), (char*)&pgid); - off += sizeof(pgid); - payload.copy(off, sizeof(complete), (char*)&complete); - off += sizeof(complete); - payload.copy(off, sizeof(last_any_complete), (char*)&last_any_complete); - off += sizeof(last_any_complete); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MOSDPing.h b/branches/sage/ebofs2/messages/MOSDPing.h deleted file mode 100644 index 37be289c0a923..0000000000000 --- a/branches/sage/ebofs2/messages/MOSDPing.h +++ /dev/null @@ -1,49 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - 
* Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MOSDPING_H -#define __MOSDPING_H - -#include "common/Clock.h" - -#include "msg/Message.h" -#include "osd/osd_types.h" - - -class MOSDPing : public Message { - public: - epoch_t map_epoch; - bool ack; - osd_peer_stat_t peer_stat; - - MOSDPing(epoch_t e, osd_peer_stat_t& ps, bool a=false) : - Message(MSG_OSD_PING), map_epoch(e), ack(a), peer_stat(ps) { } - MOSDPing() {} - - virtual void decode_payload() { - int off = 0; - ::_decode(map_epoch, payload, off); - ::_decode(ack, payload, off); - ::_decode(peer_stat, payload, off); - } - virtual void encode_payload() { - ::_encode(map_epoch, payload); - ::_encode(ack, payload); - ::_encode(peer_stat, payload); - } - - virtual char *get_type_name() { return "osd_ping"; } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MPGStats.h b/branches/sage/ebofs2/messages/MPGStats.h deleted file mode 100644 index a851eb103f07f..0000000000000 --- a/branches/sage/ebofs2/messages/MPGStats.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MPGSTATS_H -#define __MPGSTATS_H - -#include "osd/osd_types.h" - -class MPGStats : public Message { -public: - map pg_stat; - osd_stat_t osd_stat; - - MPGStats() : Message(MSG_PGSTATS) {} - - char *get_type_name() { return "pg_stats"; } - void print(ostream& out) { - out << "pg_stats"; - } - - void encode_payload() { - ::_encode(osd_stat, payload); - ::_encode(pg_stat, payload); - } - void decode_payload() { - int off = 0; - ::_decode(osd_stat, payload, off); - ::_decode(pg_stat, payload, off); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MPing.h b/branches/sage/ebofs2/messages/MPing.h deleted file mode 100644 index 6b569666ed377..0000000000000 --- a/branches/sage/ebofs2/messages/MPing.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __MPING_H -#define __MPING_H - -#include "msg/Message.h" - - -class MPing : public Message { - public: - int seq; - MPing(int s) : Message(MSG_PING) { - seq = s; - } - MPing() : Message(MSG_PING) {} - - virtual void decode_payload() { - int off = 0; - payload.copy(0, sizeof(seq), (char*)&seq); - off += sizeof(seq); - } - virtual void encode_payload() { - payload.append((char*)&seq, sizeof(seq)); - } - - virtual char *get_type_name() { return "ping"; } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MPingAck.h b/branches/sage/ebofs2/messages/MPingAck.h deleted file mode 100644 index f8f32aee43ee0..0000000000000 --- a/branches/sage/ebofs2/messages/MPingAck.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MPINGACK_H -#define __MPINGACK_H - -#include "MPing.h" - - -class MPingAck : public Message { - public: - int seq; - MPingAck() {} - MPingAck(MPing *p) : Message(MSG_PING_ACK) { - this->seq = p->seq; - } - - virtual void decode_payload() { - int off = 0; - payload.copy(0, sizeof(seq), (char*)&seq); - off += sizeof(seq); - } - virtual void encode_payload() { - payload.append((char*)&seq, sizeof(seq)); - } - - virtual char *get_type_name() { return "pinga"; } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MStatfs.h b/branches/sage/ebofs2/messages/MStatfs.h deleted file mode 100644 index 66e5847206a7b..0000000000000 --- a/branches/sage/ebofs2/messages/MStatfs.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MSTATFS_H -#define __MSTATFS_H - -#include /* or */ - -class MStatfs : public Message { -public: - tid_t tid; - - MStatfs() : Message(MSG_STATFS) {} - MStatfs(tid_t t) : Message(MSG_STATFS), tid(t) {} - - char *get_type_name() { return "statfs"; } - void print(ostream& out) { - out << "statfs(" << tid << ")"; - } - - void encode_payload() { - ::_encode(tid, payload); - } - void decode_payload() { - int off = 0; - ::_decode(tid, payload, off); - } -}; - -#endif diff --git a/branches/sage/ebofs2/messages/MStatfsReply.h b/branches/sage/ebofs2/messages/MStatfsReply.h deleted file mode 100644 index f8e21ddcc2b31..0000000000000 --- a/branches/sage/ebofs2/messages/MStatfsReply.h +++ /dev/null @@ -1,45 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MSTATFSREPLY_H -#define __MSTATFSREPLY_H - -#include /* or */ - -class MStatfsReply : public Message { -public: - tid_t tid; - struct statvfs stfs; - - MStatfsReply() : Message(MSG_STATFS_REPLY) {} - MStatfsReply(tid_t t) : Message(MSG_STATFS_REPLY), tid(t) {} - - char *get_type_name() { return "statfs_reply"; } - void print(ostream& out) { - out << "statfs_reply(" << tid << ")"; - } - - void encode_payload() { - ::_encode(tid, payload); - ::_encode(stfs, payload); - } - void decode_payload() { - int off = 0; - ::_decode(tid, payload, off); - ::_decode(stfs, payload, off); - } -}; - -#endif diff --git a/branches/sage/ebofs2/mkmonmap.cc b/branches/sage/ebofs2/mkmonmap.cc deleted file mode 100644 index 0a80e93c40bd2..0000000000000 --- a/branches/sage/ebofs2/mkmonmap.cc +++ /dev/null @@ -1,68 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include -#include - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" - - - - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - MonMap monmap; - - char *outfn = ".ceph_monmap"; - - for (unsigned i=0; i= 0); - - return 0; -} diff --git a/branches/sage/ebofs2/mon/ClientMonitor.cc b/branches/sage/ebofs2/mon/ClientMonitor.cc deleted file mode 100644 index b7ac275b0afca..0000000000000 --- a/branches/sage/ebofs2/mon/ClientMonitor.cc +++ /dev/null @@ -1,256 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "ClientMonitor.h" -#include "Monitor.h" -#include "MDSMonitor.h" -#include "OSDMonitor.h" -#include "MonitorStore.h" - -#include "messages/MClientMount.h" -#include "messages/MClientUnmount.h" - -#include "common/Timer.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".client v" << client_map.version << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? 
(const char*)"(peon)":(const char*)"(?\?)"))) << ".client v" << client_map.version << " " - - - -bool ClientMonitor::update_from_paxos() -{ - assert(paxos->is_active()); - - version_t paxosv = paxos->get_version(); - if (paxosv == client_map.version) return true; - assert(paxosv >= client_map.version); - - dout(10) << "update_from_paxos paxosv " << paxosv - << ", my v " << client_map.version << dendl; - - - if (client_map.version == 0 && paxosv > 1 && - mon->store->exists_bl_ss("clientmap","latest")) { - // starting up: load latest - dout(7) << "update_from_paxos startup: loading latest full clientmap" << dendl; - bufferlist bl; - mon->store->get_bl_ss(bl, "clientmap", "latest"); - int off = 0; - client_map._decode(bl, off); - } - - // walk through incrementals - while (paxosv > client_map.version) { - bufferlist bl; - bool success = paxos->read(client_map.version+1, bl); - if (success) { - dout(7) << "update_from_paxos applying incremental " << client_map.version+1 << dendl; - Incremental inc; - int off = 0; - inc._decode(bl, off); - client_map.apply_incremental(inc); - - dout(1) << client_map.client_addr.size() << " clients (+" - << inc.mount.size() << " -" << inc.unmount.size() << ")" - << dendl; - - } else { - dout(7) << "update_from_paxos couldn't read incremental " << client_map.version+1 << dendl; - return false; - } - } - - // save latest - bufferlist bl; - client_map._encode(bl); - mon->store->put_bl_ss(bl, "clientmap", "latest"); - - return true; -} - -void ClientMonitor::create_pending() -{ - assert(mon->is_leader()); - pending_inc = Incremental(); - pending_inc.version = client_map.version + 1; - pending_inc.next_client = client_map.next_client; - dout(10) << "create_pending v " << pending_inc.version - << ", next is " << pending_inc.next_client - << dendl; -} - -void ClientMonitor::create_initial() -{ - dout(1) << "create_initial -- creating initial map" << dendl; -} - -void ClientMonitor::committed() -{ - -} - - -void 
ClientMonitor::encode_pending(bufferlist &bl) -{ - assert(mon->is_leader()); - dout(10) << "encode_pending v " << pending_inc.version - << ", next is " << pending_inc.next_client - << dendl; - assert(paxos->get_version() + 1 == pending_inc.version); - pending_inc._encode(bl); -} - - -// ------- - - -bool ClientMonitor::preprocess_query(Message *m) -{ - dout(10) << "preprocess_query " << *m << " from " << m->get_source_inst() << dendl; - - switch (m->get_type()) { - case MSG_CLIENT_MOUNT: - { - // already mounted? - MClientMount *mount = (MClientMount*)m; - entity_addr_t addr = m->get_source_addr(); - pair addrinst(addr, mount->instance); - if (client_map.addr_client.count(addrinst)) { - int client = client_map.addr_client[addrinst]; - dout(7) << " client" << client << " already mounted" << dendl; - _mounted(client, (MClientMount*)m); - return true; - } - } - return false; - - case MSG_CLIENT_UNMOUNT: - { - // already unmounted? - int client = m->get_source().num(); - if (client_map.client_addr.count(client) == 0) { - dout(7) << " client" << client << " not mounted" << dendl; - _unmounted((MClientUnmount*)m); - return true; - } - } - return false; - - - default: - assert(0); - delete m; - return true; - } -} - -bool ClientMonitor::prepare_update(Message *m) -{ - dout(10) << "prepare_update " << *m << " from " << m->get_source_inst() << dendl; - - switch (m->get_type()) { - case MSG_CLIENT_MOUNT: - { - MClientMount *mount = (MClientMount*)m; - pair addrinst(mount->addr, mount->instance); - int client = -1; - if (mount->get_source().is_client()) - client = mount->get_source().num(); - - // choose a client id - if (client < 0) { - client = pending_inc.next_client; - dout(10) << "mount: assigned client" << client << " to " << mount->addr << dendl; - } else { - dout(10) << "mount: client" << client << " requested by " - << mount->addr << "i" << mount->instance - << dendl; - if (client_map.client_addr.count(client)) { - assert(client_map.client_addr[client] != addrinst); 
- dout(0) << "mount: WARNING: client" << client << " requested by " - << mount->addr << "." << mount->instance - << ", which used to be " - << client_map.client_addr[client].first << "i" << client_map.client_addr[client].second - << dendl; - } - } - - pending_inc.add_mount(client, mount->addr, mount->instance); - paxos->wait_for_commit(new C_Mounted(this, client, mount)); - } - return true; - - case MSG_CLIENT_UNMOUNT: - { - MClientUnmount *unmount = (MClientUnmount*)m; - assert(unmount->inst.name.is_client()); - int client = unmount->inst.name.num(); - - assert(client_map.client_addr.count(client)); - - pending_inc.add_unmount(client); - paxos->wait_for_commit(new C_Unmounted(this, unmount)); - } - return true; - - default: - assert(0); - delete m; - return false; - } - -} - - -// MOUNT - - -void ClientMonitor::_mounted(int client, MClientMount *m) -{ - entity_inst_t to; - to.addr = m->addr; - to.name = entity_name_t::CLIENT(client); - - dout(10) << "_mounted client" << client << " at " << to << dendl; - - // reply with latest mds, osd maps - mon->mdsmon->send_latest(to); - mon->osdmon->send_latest(to); - - delete m; -} - -void ClientMonitor::_unmounted(MClientUnmount *m) -{ - dout(10) << "_unmounted " << m->inst << dendl; - - // reply with (same) unmount message - mon->messenger->send_message(m, m->inst); - - // auto-shutdown? 
- // (hack for fakesyn/newsyn, mostly) - if (mon->is_leader() && - client_map.version > 1 && - client_map.client_addr.empty() && - g_conf.mon_stop_on_last_unmount && - !mon->is_stopping()) { - dout(1) << "last client unmounted" << dendl; - mon->do_stop(); - } -} - - diff --git a/branches/sage/ebofs2/mon/ClientMonitor.h b/branches/sage/ebofs2/mon/ClientMonitor.h deleted file mode 100644 index f36ee9f7c18bd..0000000000000 --- a/branches/sage/ebofs2/mon/ClientMonitor.h +++ /dev/null @@ -1,178 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __CLIENTMONITOR_H -#define __CLIENTMONITOR_H - -#include -#include -using namespace std; - -#include "include/types.h" -#include "msg/Messenger.h" - -#include "mds/MDSMap.h" - -#include "PaxosService.h" - -class Monitor; -class Paxos; -class MClientMount; -class MClientUnmount; - -class ClientMonitor : public PaxosService { -public: - - struct Incremental { - version_t version; - uint32_t next_client; - map > mount; - set unmount; - - Incremental() : version(0), next_client() {} - - bool is_empty() { return mount.empty() && unmount.empty(); } - void add_mount(uint32_t client, entity_addr_t addr, int instance) { - next_client = MAX(next_client, client+1); - mount[client] = pair(addr, instance); - } - void add_unmount(uint32_t client) { - assert(client < next_client); - if (mount.count(client)) - mount.erase(client); - else - unmount.insert(client); - } - - void _encode(bufferlist &bl) { - ::_encode(version, bl); - ::_encode(next_client, bl); - ::_encode(mount, bl); - ::_encode(unmount, bl); - } - void _decode(bufferlist &bl, int& off) { - 
::_decode(version, bl, off); - ::_decode(next_client, bl, off); - ::_decode(mount, bl, off); - ::_decode(unmount, bl, off); - } - }; - - struct Map { - version_t version; - uint32_t next_client; - map > client_addr; - map,uint32_t> addr_client; - - Map() : version(0), next_client(0) {} - - void reverse() { - addr_client.clear(); - for (map >::iterator p = client_addr.begin(); - p != client_addr.end(); - ++p) { - addr_client[p->second] = p->first; - } - } - void apply_incremental(Incremental &inc) { - assert(inc.version == version+1); - version = inc.version; - next_client = inc.next_client; - for (map >::iterator p = inc.mount.begin(); - p != inc.mount.end(); - ++p) { - client_addr[p->first] = p->second; - addr_client[p->second] = p->first; - } - - for (set::iterator p = inc.unmount.begin(); - p != inc.unmount.end(); - ++p) { - assert(client_addr.count(*p)); - addr_client.erase(client_addr[*p]); - client_addr.erase(*p); - } - } - - void _encode(bufferlist &bl) { - ::_encode(version, bl); - ::_encode(next_client, bl); - ::_encode(client_addr, bl); - } - void _decode(bufferlist &bl, int& off) { - ::_decode(version, bl, off); - ::_decode(next_client, bl, off); - ::_decode(client_addr, bl, off); - reverse(); - } - }; - - class C_Mounted : public Context { - ClientMonitor *cmon; - int client; - MClientMount *m; - public: - C_Mounted(ClientMonitor *cm, int c, MClientMount *m_) : - cmon(cm), client(c), m(m_) {} - void finish(int r) { - if (r >= 0) - cmon->_mounted(client, m); - else - cmon->dispatch((Message*)m); - } - }; - - class C_Unmounted : public Context { - ClientMonitor *cmon; - MClientUnmount *m; - public: - C_Unmounted(ClientMonitor *cm, MClientUnmount *m_) : - cmon(cm), m(m_) {} - void finish(int r) { - if (r >= 0) - cmon->_unmounted(m); - else - cmon->dispatch((Message*)m); - } - }; - - -private: - Map client_map; - - // leader - Incremental pending_inc; - - void create_initial(); - bool update_from_paxos(); - void create_pending(); // prepare a new pending - 
void encode_pending(bufferlist &bl); // propose pending update to peers - - void committed(); - - void _mounted(int c, MClientMount *m); - void _unmounted(MClientUnmount *m); - - bool preprocess_query(Message *m); // true if processed. - bool prepare_update(Message *m); - - - public: - ClientMonitor(Monitor *mn, Paxos *p) : PaxosService(mn, p) { } - - //void tick(); // check state, take actions - -}; - -#endif diff --git a/branches/sage/ebofs2/mon/Elector.cc b/branches/sage/ebofs2/mon/Elector.cc deleted file mode 100644 index 4a09b58ab5073..0000000000000 --- a/branches/sage/ebofs2/mon/Elector.cc +++ /dev/null @@ -1,293 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "Elector.h" -#include "Monitor.h" - -#include "common/Timer.h" -#include "MonitorStore.h" -#include "messages/MMonElection.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".elector(" << epoch << ") " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? 
(const char*)"(peon)":(const char*)"(?\?)"))) << ".elector(" << epoch << ") " - - -void Elector::init() -{ - epoch = mon->store->get_int("mon_epoch"); - if (!epoch) - epoch = 1; - dout(1) << "init, last seen epoch " << epoch << dendl; -} - -void Elector::shutdown() -{ - if (expire_event) - mon->timer.cancel_event(expire_event); -} - -void Elector::bump_epoch(epoch_t e) -{ - dout(10) << "bump_epoch " << epoch << " to " << e << dendl; - assert(epoch < e); - epoch = e; - mon->store->put_int(epoch, "mon_epoch"); - - // clear up some state - electing_me = false; - acked_me.clear(); - leader_acked = -1; -} - - -void Elector::start() -{ - dout(5) << "start -- can i be leader?" << dendl; - - // start by trying to elect me - if (epoch % 2 == 0) - bump_epoch(epoch+1); // odd == election cycle - start_stamp = g_clock.now(); - electing_me = true; - acked_me.insert(whoami); - - // bcast to everyone else - for (int i=0; imonmap->num_mon; ++i) { - if (i == whoami) continue; - mon->messenger->send_message(new MMonElection(MMonElection::OP_PROPOSE, epoch), - mon->monmap->get_inst(i)); - } - - reset_timer(); -} - -void Elector::defer(int who) -{ - dout(5) << "defer to " << who << dendl; - - if (electing_me) { - // drop out - acked_me.clear(); - electing_me = false; - } - - // ack them - leader_acked = who; - ack_stamp = g_clock.now(); - mon->messenger->send_message(new MMonElection(MMonElection::OP_ACK, epoch), - mon->monmap->get_inst(who)); - - // set a timer - reset_timer(1.0); // give the leader some extra time to declare victory -} - - -void Elector::reset_timer(double plus) -{ - // set the timer - cancel_timer(); - expire_event = new C_ElectionExpire(this); - mon->timer.add_event_after(g_conf.mon_lease + plus, - expire_event); -} - - -void Elector::cancel_timer() -{ - if (expire_event) { - mon->timer.cancel_event(expire_event); - expire_event = 0; - } -} - -void Elector::expire() -{ - dout(5) << "election timer expired" << dendl; - - // did i win? 
- if (electing_me && - acked_me.size() > (unsigned)(mon->monmap->num_mon / 2)) { - // i win - victory(); - } else { - // whoever i deferred to didn't declare victory quickly enough. - start(); - } -} - - -void Elector::victory() -{ - leader_acked = -1; - electing_me = false; - set quorum = acked_me; - - cancel_timer(); - - assert(epoch % 2 == 1); // election - bump_epoch(epoch+1); // is over! - - // tell everyone - for (set::iterator p = quorum.begin(); - p != quorum.end(); - ++p) { - if (*p == whoami) continue; - mon->messenger->send_message(new MMonElection(MMonElection::OP_VICTORY, epoch), - mon->monmap->get_inst(*p)); - } - - // tell monitor - mon->win_election(epoch, quorum); -} - - -void Elector::handle_propose(MMonElection *m) -{ - dout(5) << "handle_propose from " << m->get_source() << dendl; - int from = m->get_source().num(); - - assert(m->epoch % 2 == 1); // election - if (m->epoch > epoch) { - bump_epoch(m->epoch); - } - else if (m->epoch < epoch && // got an "old" propose, - epoch % 2 == 0 && // in a non-election cycle - mon->quorum.count(from) == 0) { // from someone outside the quorum - // a mon just started up, call a new election so they can rejoin! - dout(5) << " got propose from old epoch, " << m->get_source() << " must have just started" << dendl; - start(); - } - - if (whoami < from) { - // i would win over them. - if (leader_acked >= 0) { // we already acked someone - assert(leader_acked < from); // and they still win, of course - dout(5) << "no, we already acked " << leader_acked << dendl; - } else { - // wait, i should win! - if (!electing_me) - start(); - } - } else { - // they would win over me - if (leader_acked < 0 || // haven't acked anyone yet, or - leader_acked > from || // they would win over who you did ack, or - leader_acked == from) { // this is the guy we're already deferring to - defer(from); - } else { - // ignore them! 
- dout(5) << "no, we already acked " << leader_acked << dendl; - } - } - - delete m; -} - -void Elector::handle_ack(MMonElection *m) -{ - dout(5) << "handle_ack from " << m->get_source() << dendl; - int from = m->get_source().num(); - - assert(m->epoch % 2 == 1); // election - if (m->epoch > epoch) { - dout(5) << "woah, that's a newer epoch, i must have rebooted. bumping and re-starting!" << dendl; - bump_epoch(m->epoch); - start(); - delete m; - return; - } - assert(m->epoch == epoch); - - if (electing_me) { - // thanks - acked_me.insert(from); - dout(5) << " so far i have " << acked_me << dendl; - - // is that _everyone_? - if (acked_me.size() == (unsigned)mon->monmap->num_mon) { - // if yes, shortcut to election finish - victory(); - } - } else { - // ignore, i'm deferring already. - assert(leader_acked >= 0); - } - - delete m; -} - - -void Elector::handle_victory(MMonElection *m) -{ - dout(5) << "handle_victory from " << m->get_source() << dendl; - int from = m->get_source().num(); - - assert(from < whoami); - assert(m->epoch % 2 == 0); - assert(m->epoch == epoch + 1); // i should have seen this election if i'm getting the victory. 
- bump_epoch(m->epoch); - - // they win - mon->lose_election(epoch, from); - - // cancel my timer - cancel_timer(); -} - - - - -void Elector::dispatch(Message *m) -{ - switch (m->get_type()) { - - case MSG_MON_ELECTION: - { - MMonElection *em = (MMonElection*)m; - - switch (em->op) { - case MMonElection::OP_PROPOSE: - handle_propose(em); - return; - } - - if (em->epoch < epoch) { - dout(5) << "old epoch, dropping" << dendl; - delete em; - break; - } - - switch (em->op) { - case MMonElection::OP_ACK: - handle_ack(em); - return; - case MMonElection::OP_VICTORY: - handle_victory(em); - return; - default: - assert(0); - } - } - break; - - default: - assert(0); - } -} - - - - diff --git a/branches/sage/ebofs2/mon/Elector.h b/branches/sage/ebofs2/mon/Elector.h deleted file mode 100644 index 9bfd7cb644fc7..0000000000000 --- a/branches/sage/ebofs2/mon/Elector.h +++ /dev/null @@ -1,92 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MON_ELECTOR_H -#define __MON_ELECTOR_H - -#include -using namespace std; - -#include "include/types.h" -#include "msg/Message.h" - -#include "include/Context.h" - -#include "common/Timer.h" - -class Monitor; - - -class Elector { - private: - Monitor *mon; - int whoami; - - Context *expire_event; - - void reset_timer(double plus=0.0); - void cancel_timer(); - - epoch_t epoch; // latest epoch we've seen. 
odd == election, even == stable, - - // electing me - bool electing_me; - utime_t start_stamp; - set acked_me; - - // electing them - int leader_acked; // who i've acked - utime_t ack_stamp; // and when - - void bump_epoch(epoch_t e=0); // i just saw a larger epoch - - class C_ElectionExpire : public Context { - Elector *elector; - public: - C_ElectionExpire(Elector *e) : elector(e) { } - void finish(int r) { - elector->expire(); - } - }; - - void start(); // start an electing me - void defer(int who); - void expire(); // timer goes off - void victory(); - - void handle_propose(class MMonElection *m); - void handle_ack(class MMonElection *m); - void handle_victory(class MMonElection *m); - - public: - Elector(Monitor *m, int w) : mon(m), whoami(w), - expire_event(0), - epoch(0), - electing_me(false), - leader_acked(-1) { } - - void init(); - void shutdown(); - - void dispatch(Message *m); - - void call_election() { - start(); - } - -}; - - -#endif diff --git a/branches/sage/ebofs2/mon/MDSMonitor.cc b/branches/sage/ebofs2/mon/MDSMonitor.cc deleted file mode 100644 index 3b415232b8f14..0000000000000 --- a/branches/sage/ebofs2/mon/MDSMonitor.cc +++ /dev/null @@ -1,667 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#include "MDSMonitor.h" -#include "Monitor.h" -#include "MonitorStore.h" -#include "OSDMonitor.h" - -#include "messages/MMDSMap.h" -#include "messages/MMDSGetMap.h" -#include "messages/MMDSBeacon.h" - -#include "messages/MMonCommand.h" -#include "messages/MMonCommandAck.h" - -#include "messages/MGenericMessage.h" - - -#include "common/Timer.h" - -#include - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".mds e" << mdsmap.get_epoch() << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".mds e" << mdsmap.get_epoch() << " " - - - -// my methods - -void MDSMonitor::print_map(MDSMap &m) -{ - dout(7) << "print_map epoch " << m.get_epoch() << " max " << m.max_mds << dendl; - entity_inst_t blank; - set all; - m.get_mds_set(all); - for (set::iterator p = all.begin(); - p != all.end(); - ++p) { - if (m.standby_for.count(*p) && !m.standby_for[*p].empty()) { - dout(7) << " mds" << *p << "." << m.mds_inc[*p] - << " : " << MDSMap::get_state_name(m.get_state(*p)) - << " : " << (m.have_inst(*p) ? m.get_inst(*p) : blank) - << " : +" << m.standby_for[*p].size() - << " standby " << m.standby_for[*p] - << dendl; - } else { - dout(7) << " mds" << *p << "." << m.mds_inc[*p] - << " : " << MDSMap::get_state_name(m.get_state(*p)) - << " : " << (m.have_inst(*p) ? 
m.get_inst(*p) : blank) - << dendl; - } - } - if (!m.standby_any.empty()) { - dout(7) << " +" << m.standby_any.size() << " shared standby " << m.standby_any << dendl; - } -} - - - -// service methods - -void MDSMonitor::create_initial() -{ - dout(10) << "create_initial" << dendl; - pending_mdsmap.max_mds = g_conf.num_mds; - pending_mdsmap.created = g_clock.now(); - print_map(pending_mdsmap); -} - -bool MDSMonitor::update_from_paxos() -{ - assert(paxos->is_active()); - - version_t paxosv = paxos->get_version(); - if (paxosv == mdsmap.epoch) return true; - assert(paxosv >= mdsmap.epoch); - - dout(10) << "update_from_paxos paxosv " << paxosv - << ", my e " << mdsmap.epoch << dendl; - - // read and decode - mdsmap_bl.clear(); - bool success = paxos->read(paxosv, mdsmap_bl); - assert(success); - dout(10) << "update_from_paxos got " << paxosv << dendl; - mdsmap.decode(mdsmap_bl); - - // new map - dout(7) << "new map:" << dendl; - print_map(mdsmap); - - // bcast map to mds, waiters - if (mon->is_leader()) - bcast_latest_mds(); - send_to_waiting(); - - // make sure last_beacon is populated - for (map::iterator p = mdsmap.mds_inst.begin(); - p != mdsmap.mds_inst.end(); - ++p) - if (last_beacon.count(p->second.addr) == 0 && - mdsmap.get_state(p->first) != MDSMap::STATE_DNE) - last_beacon[p->second.addr] = g_clock.now(); - for (map::iterator p = mdsmap.standby.begin(); - p != mdsmap.standby.end(); - ++p ) - if (last_beacon.count(p->first) == 0) - last_beacon[p->first] = g_clock.now(); - - return true; -} - -void MDSMonitor::create_pending() -{ - pending_mdsmap = mdsmap; - pending_mdsmap.epoch++; - dout(10) << "create_pending e" << pending_mdsmap.epoch << dendl; -} - -void MDSMonitor::encode_pending(bufferlist &bl) -{ - dout(10) << "encode_pending e" << pending_mdsmap.epoch << dendl; - - //print_map(pending_mdsmap); - - // apply to paxos - assert(paxos->get_version() + 1 == pending_mdsmap.epoch); - pending_mdsmap.encode(bl); -} - - -bool MDSMonitor::preprocess_query(Message 
*m) -{ - dout(10) << "preprocess_query " << *m << " from " << m->get_source_inst() << dendl; - - switch (m->get_type()) { - - case MSG_MDS_BEACON: - return preprocess_beacon((MMDSBeacon*)m); - - case MSG_MDS_GETMAP: - send_full(m->get_source_inst()); - return true; - - case MSG_MON_COMMAND: - return false; - - default: - assert(0); - delete m; - return true; - } -} - - -bool MDSMonitor::preprocess_beacon(MMDSBeacon *m) -{ - dout(12) << "preprocess_beacon " << *m - << " from " << m->get_mds_inst() - << dendl; - - // fw to leader? - if (!mon->is_leader()) { - dout(10) << "fw to leader" << dendl; - mon->messenger->send_message(m, mon->monmap->get_inst(mon->get_leader())); - return true; - } - - // let's see. - int from = m->get_mds_inst().name.num(); - entity_addr_t addr = m->get_mds_inst().addr; - int state = m->get_state(); - version_t seq = m->get_seq(); - - // can i handle this query without a map update? - - // boot? - if (state == MDSMap::STATE_BOOT) { - // already booted? - if (pending_mdsmap.get_addr_rank(addr) == -1) - return false; // not booted|booting|standby yet - - // ignore. - goto out; - } - else if (state == MDSMap::STATE_STANDBY) { - // standby? - if (!pending_mdsmap.is_standby(addr) && - !mdsmap.is_standby(addr)) { - dout(7) << "mds_beacon " << *m << " claiming standby, but not, ignoring" << dendl; - goto out; - } - // reply. - } - else { - // old seq? - if (mdsmap.mds_state_seq[from] > seq) { - dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << dendl; - goto out; - } - - // is there a state change here? 
- if (mdsmap.mds_state.count(from) == 0) { - dout(1) << "mds_beacon " << *m << " announcing non-boot|standby state, ignoring" << dendl; - goto out; - } - - if (mdsmap.mds_state[from] != state) { - if (mdsmap.get_epoch() == m->get_last_epoch_seen()) - return false; // need to update map - dout(10) << "mds_beacon " << *m << " ignoring requested state, because mds hasn't seen latest map" << dendl; - } - } - - // note time and reply - dout(15) << "mds_beacon " << *m << " noting time and replying" << dendl; - last_beacon[addr] = g_clock.now(); - mon->messenger->send_message(new MMDSBeacon(m->get_mds_inst(), mdsmap.get_epoch(), state, seq, 0), - m->get_mds_inst()); - - // done - out: - delete m; - return true; -} - - -bool MDSMonitor::prepare_update(Message *m) -{ - dout(7) << "prepare_update " << *m << dendl; - - switch (m->get_type()) { - - case MSG_MDS_BEACON: - return handle_beacon((MMDSBeacon*)m); - - case MSG_MON_COMMAND: - return handle_command((MMonCommand*)m); - - default: - assert(0); - delete m; - } - - return true; -} - - - -bool MDSMonitor::handle_beacon(MMDSBeacon *m) -{ - // -- this is an update -- - dout(12) << "handle_beacon " << *m - << " from " << m->get_mds_inst() - << dendl; - int from = m->get_mds_inst().name.num(); - entity_addr_t addr = m->get_mds_inst().addr; - int state = m->get_state(); - version_t seq = m->get_seq(); - - // boot? - int standby_for = -1; - if (state == MDSMap::STATE_BOOT) { - from = -1; - - // standby for a given rank? 
- standby_for = m->get_want_rank(); - if (standby_for >= pending_mdsmap.max_mds) { - dout(10) << "mds_beacon boot: wanted standby for mds" << from - << " >= max_mds " << pending_mdsmap.max_mds - << ", will be shared standby" << dendl; - standby_for = -1; - } - if (standby_for >= 0 && pending_mdsmap.is_down(standby_for)) { - // wants to be a specific MDS, who is down - from = standby_for; - switch (pending_mdsmap.get_state(standby_for)) { - case MDSMap::STATE_STOPPED: - state = MDSMap::STATE_STARTING; - break; - case MDSMap::STATE_DNE: - state = MDSMap::STATE_CREATING; - break; - case MDSMap::STATE_FAILED: - state = MDSMap::STATE_REPLAY; - break; - default: - assert(0); - } - dout(10) << "mds_beacon boot: mds" << from - << " was " << MDSMap::get_state_name(pending_mdsmap.get_state(from)) - << ", " << MDSMap::get_state_name(state) - << dendl; - } - else if (standby_for < 0) { - // pick another failed mds? - set failed; - pending_mdsmap.get_failed_mds_set(failed); - if (!failed.empty()) { - from = *failed.begin(); - dout(10) << "mds_beacon boot: assigned failed mds" << from << dendl; - state = MDSMap::STATE_REPLAY; - } - } - if (from < 0 && standby_for < 0 && - !pending_mdsmap.is_degraded()) { - // ok, just pick any unused mds rank - // that doesn't make us overfull - for (int i=0; i " << MDSMap::get_state_name(state) - << dendl; - - // change the state - pending_mdsmap.mds_state[from] = state; - if (pending_mdsmap.is_up(from)) - pending_mdsmap.mds_state_seq[from] = seq; - else - pending_mdsmap.mds_state_seq.erase(from); - } - - dout(7) << "pending map now:" << dendl; - print_map(pending_mdsmap); - - paxos->wait_for_commit(new C_Updated(this, from, m)); - - return true; -} - -bool MDSMonitor::should_propose(double& delay) -{ - delay = 0.0; - return true; -} - -void MDSMonitor::_updated(int from, MMDSBeacon *m) -{ - if (from < 0) { - dout(10) << "_updated (booted) mds" << from << " " << *m << dendl; - mon->osdmon->send_latest(m->get_source_inst()); - } else { - 
dout(10) << "_updated mds" << from << " " << *m << dendl; - } - if (m->get_state() == MDSMap::STATE_STOPPED) { - // send the map manually (they're out of the map, so they won't get it automatic) - send_latest(m->get_mds_inst()); - } - - delete m; -} - - -void MDSMonitor::committed() -{ - // check for failed - set failed; - mdsmap.get_failed_mds_set(failed); - - if (!mdsmap.standby.empty() && !failed.empty()) { - bool didtakeover = false; - set::iterator p = failed.begin(); - while (p != failed.end()) { - int f = *p++; - - // someone standby for me? - if (mdsmap.standby_for.count(f) && - !mdsmap.standby_for[f].empty()) { - dout(0) << "mds" << f << " standby " << *mdsmap.standby_for[f].begin() << " taking over" << dendl; - take_over(*mdsmap.standby_for[f].begin(), f); - didtakeover = true; - } - else if (!mdsmap.standby_any.empty()) { - dout(0) << "standby " << mdsmap.standby.begin()->first << " taking over for mds" << f << dendl; - take_over(mdsmap.standby.begin()->first, f); - didtakeover = true; - } - } - if (didtakeover) { - dout(7) << "pending map now:" << dendl; - print_map(pending_mdsmap); - propose_pending(); - } - } - - // hackish: did all mds's shut down? 
- if (mon->is_leader() && - g_conf.mon_stop_with_last_mds && - mdsmap.get_epoch() > 1 && - mdsmap.is_stopped()) - mon->messenger->send_message(new MGenericMessage(MSG_SHUTDOWN), - mon->monmap->get_inst(mon->whoami)); -} - -void MDSMonitor::take_over(entity_addr_t addr, int mds) -{ - pending_mdsmap.mds_inst[mds].addr = addr; - pending_mdsmap.mds_inst[mds].name = entity_name_t::MDS(mds); - pending_mdsmap.mds_inc[mds]++; - pending_mdsmap.mds_state[mds] = MDSMap::STATE_REPLAY; - pending_mdsmap.mds_state_seq[mds] = 0; - - // remove from standby list(s) - pending_mdsmap.standby.erase(addr); - pending_mdsmap.standby_for[mds].erase(addr); - pending_mdsmap.standby_any.erase(addr); -} - - - -bool MDSMonitor::handle_command(MMonCommand *m) -{ - int r = -EINVAL; - stringstream ss; - - if (m->cmd.size() > 1) { - if (m->cmd[1] == "stop" && m->cmd.size() > 2) { - int who = atoi(m->cmd[2].c_str()); - if (mdsmap.is_active(who)) { - r = 0; - ss << "telling mds" << who << " to stop"; - pending_mdsmap.mds_state[who] = MDSMap::STATE_STOPPING; - } else { - r = -EEXIST; - ss << "mds" << who << " not active (" << mdsmap.get_state_name(mdsmap.get_state(who)) << ")"; - } - } - else if (m->cmd[1] == "set_max_mds" && m->cmd.size() > 2) { - pending_mdsmap.max_mds = atoi(m->cmd[2].c_str()); - r = 0; - ss << "max_mds = " << pending_mdsmap.max_mds; - } - } - if (r == -EINVAL) { - ss << "unrecognized command"; - } - - // reply - string rs; - getline(ss,rs); - mon->messenger->send_message(new MMonCommandAck(r, rs), m->get_source_inst()); - delete m; - return r >= 0; -} - - - -void MDSMonitor::bcast_latest_mds() -{ - dout(10) << "bcast_latest_mds " << mdsmap.get_epoch() << dendl; - - // tell mds - set up; - mdsmap.get_up_mds_set(up); - for (set::iterator p = up.begin(); - p != up.end(); - p++) - send_full(mdsmap.get_inst(*p)); - - // standby too - entity_inst_t inst; - inst.name = entity_name_t::MDS(-1); - for (map::iterator p = mdsmap.standby.begin(); - p != mdsmap.standby.end(); - p++) { - 
inst.addr = p->first; - send_full(inst); - } -} - -void MDSMonitor::send_full(entity_inst_t dest) -{ - dout(11) << "send_full to " << dest << dendl; - mon->messenger->send_message(new MMDSMap(&mdsmap), dest); -} - -void MDSMonitor::send_to_waiting() -{ - dout(10) << "send_to_waiting " << mdsmap.get_epoch() << dendl; - for (list::iterator i = waiting_for_map.begin(); - i != waiting_for_map.end(); - i++) - send_full(*i); - waiting_for_map.clear(); -} - -void MDSMonitor::send_latest(entity_inst_t dest) -{ - if (paxos->is_readable()) - send_full(dest); - else - waiting_for_map.push_back(dest); -} - - -void MDSMonitor::tick() -{ - // make sure mds's are still alive - // ...if i am an active leader - if (!mon->is_leader()) return; - if (!paxos->is_active()) return; - - utime_t cutoff = g_clock.now(); - cutoff -= g_conf.mds_beacon_grace; - - map::iterator p = last_beacon.begin(); - while (p != last_beacon.end()) { - entity_addr_t addr = p->first; - p++; - - if (last_beacon[addr] >= cutoff) continue; - - int mds = pending_mdsmap.get_addr_rank(addr); - if (mds >= 0) { - // failure! 
- int newstate; - switch (pending_mdsmap.get_state(mds)) { - case MDSMap::STATE_CREATING: - newstate = MDSMap::STATE_DNE; // didn't finish creating - last_beacon.erase(addr); - break; - - case MDSMap::STATE_STARTING: - newstate = MDSMap::STATE_STOPPED; - break; - - case MDSMap::STATE_REPLAY: - case MDSMap::STATE_RESOLVE: - case MDSMap::STATE_RECONNECT: - case MDSMap::STATE_REJOIN: - case MDSMap::STATE_ACTIVE: - case MDSMap::STATE_STOPPING: - newstate = MDSMap::STATE_FAILED; - break; - - default: - assert(0); - } - - dout(10) << "no beacon from mds" << *p << " since " << last_beacon[addr] - << ", marking " << pending_mdsmap.get_state_name(newstate) - << dendl; - - // update map - pending_mdsmap.mds_state[mds] = newstate; - pending_mdsmap.mds_state_seq.erase(mds); - } - else if (pending_mdsmap.is_standby(addr)) { - dout(10) << "no beacon from standby " << addr << " since " << last_beacon[addr] - << ", removing from standby list" - << dendl; - if (pending_mdsmap.standby[addr] >= 0) - pending_mdsmap.standby_for[pending_mdsmap.standby[addr]].erase(addr); - else - pending_mdsmap.standby_any.erase(addr); - pending_mdsmap.standby.erase(addr); - } - else { - dout(0) << "BUG: removing stray " << addr << " from last_beacon map" << dendl; - } - - last_beacon.erase(addr); - propose_pending(); - } -} - - -void MDSMonitor::do_stop() -{ - // hrm... 
- if (!mon->is_leader() || - !paxos->is_active()) { - dout(-10) << "do_stop can't stop right now, mdsmap not writeable" << dendl; - return; - } - - dout(7) << "do_stop stopping active mds nodes" << dendl; - print_map(mdsmap); - - for (map::iterator p = mdsmap.mds_state.begin(); - p != mdsmap.mds_state.end(); - ++p) { - switch (p->second) { - case MDSMap::STATE_ACTIVE: - case MDSMap::STATE_STOPPING: - pending_mdsmap.mds_state[p->first] = MDSMap::STATE_STOPPING; - break; - case MDSMap::STATE_CREATING: - pending_mdsmap.mds_state[p->first] = MDSMap::STATE_DNE; - last_beacon.erase(pending_mdsmap.mds_inst[p->first].addr); - break; - case MDSMap::STATE_STARTING: - pending_mdsmap.mds_state[p->first] = MDSMap::STATE_STOPPED; - break; - case MDSMap::STATE_REPLAY: - case MDSMap::STATE_RESOLVE: - case MDSMap::STATE_RECONNECT: - case MDSMap::STATE_REJOIN: - // BUG: hrm, if this is the case, the STOPPING gusy won't be able to stop, will they? - pending_mdsmap.mds_state[p->first] = MDSMap::STATE_FAILED; - break; - } - } - // hose standby list - pending_mdsmap.standby.clear(); - pending_mdsmap.standby_for.clear(); - pending_mdsmap.standby_any.clear(); - - propose_pending(); -} diff --git a/branches/sage/ebofs2/mon/MDSMonitor.h b/branches/sage/ebofs2/mon/MDSMonitor.h deleted file mode 100644 index c4dc095236501..0000000000000 --- a/branches/sage/ebofs2/mon/MDSMonitor.h +++ /dev/null @@ -1,100 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDSMONITOR_H -#define __MDSMONITOR_H - -#include -#include -using namespace std; - -#include "include/types.h" -#include "msg/Messenger.h" - -#include "mds/MDSMap.h" - -#include "PaxosService.h" - -class MMDSBeacon; - -class MDSMonitor : public PaxosService { - public: - // mds maps - MDSMap mdsmap; // current - bufferlist mdsmap_bl; // encoded - - MDSMap pending_mdsmap; // current + pending updates - - // my helpers - void print_map(MDSMap &m); - - class C_Updated : public Context { - MDSMonitor *mm; - int mds; - MMDSBeacon *m; - public: - C_Updated(MDSMonitor *a, int b, MMDSBeacon *c) : - mm(a), mds(b), m(c) {} - void finish(int r) { - if (r >= 0) - mm->_updated(mds, m); // success - else - mm->dispatch((Message*)m); // try again - } - }; - - - // service methods - void create_initial(); - bool update_from_paxos(); - void create_pending(); - void encode_pending(bufferlist &bl); - - void _updated(int m, MMDSBeacon *m); - - bool preprocess_query(Message *m); // true if processed. 
- bool prepare_update(Message *m); - bool should_propose(double& delay); - - void committed(); - - bool preprocess_beacon(class MMDSBeacon *m); - bool handle_beacon(class MMDSBeacon *m); - bool handle_command(class MMonCommand *m); - - void take_over(entity_addr_t addr, int mds); - - // beacons - map last_beacon; - -public: - MDSMonitor(Monitor *mn, Paxos *p) : PaxosService(mn, p) { } - - // sending the map -private: - list waiting_for_map; - - void bcast_latest_mds(); - void send_full(entity_inst_t dest); - void send_to_waiting(); - -public: - void send_latest(entity_inst_t dest); - - void tick(); // check state, take actions - void do_stop(); - -}; - -#endif diff --git a/branches/sage/ebofs2/mon/MonMap.h b/branches/sage/ebofs2/mon/MonMap.h deleted file mode 100644 index dbe9c9b5ac5e9..0000000000000 --- a/branches/sage/ebofs2/mon/MonMap.h +++ /dev/null @@ -1,101 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MONMAP_H -#define __MONMAP_H - -#include -#include -#include - -#include "msg/Message.h" -#include "include/types.h" - -class MonMap { - public: - epoch_t epoch; // what epoch/version of the monmap - int32_t num_mon; - vector mon_inst; - - int last_mon; // last mon i talked to - - MonMap(int s=0) : epoch(0), num_mon(s), mon_inst(s), last_mon(-1) {} - - void add_mon(entity_inst_t inst) { - mon_inst.push_back(inst); - num_mon++; - } - - // pick a mon. - // choice should be stable, unless we explicitly ask for a new one. 
- int pick_mon(bool newmon=false) { - if (newmon || (last_mon < 0)) { - last_mon = rand() % num_mon; - } - return last_mon; - } - - const entity_inst_t &get_inst(int m) { - assert(m < num_mon); - return mon_inst[m]; - } - - void encode(bufferlist& blist) { - ::_encode(epoch, blist); - ::_encode(num_mon, blist); - ::_encode(mon_inst, blist); - } - - void decode(bufferlist& blist) { - int off = 0; - ::_decode(epoch, blist, off); - ::_decode(num_mon, blist, off); - ::_decode(mon_inst, blist, off); - } - - // read from/write to a file - int write(char *fn) { - // encode - bufferlist bl; - encode(bl); - - // write - int fd = ::open(fn, O_RDWR|O_CREAT); - if (fd < 0) return fd; - ::fchmod(fd, 0644); - ::write(fd, (void*)bl.c_str(), bl.length()); - ::close(fd); - return 0; - } - - int read(char *fn) { - // read - bufferlist bl; - int fd = ::open(fn, O_RDONLY); - if (fd < 0) return fd; - struct stat st; - ::fstat(fd, &st); - bufferptr bp(st.st_size); - bl.append(bp); - ::read(fd, (void*)bl.c_str(), bl.length()); - ::close(fd); - - // decode - decode(bl); - return 0; - } - -}; - -#endif diff --git a/branches/sage/ebofs2/mon/Monitor.cc b/branches/sage/ebofs2/mon/Monitor.cc deleted file mode 100644 index 1db23b0270e57..0000000000000 --- a/branches/sage/ebofs2/mon/Monitor.cc +++ /dev/null @@ -1,405 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -// TODO: missing run() method, which creates the two main timers, refreshTimer and readTimer - -#include "Monitor.h" - -#include "osd/OSDMap.h" - -#include "MonitorStore.h" - -#include "msg/Message.h" -#include "msg/Messenger.h" - -#include "messages/MPing.h" -#include "messages/MPingAck.h" -#include "messages/MGenericMessage.h" -#include "messages/MMonCommand.h" -#include "messages/MMonCommandAck.h" - -#include "messages/MMonPaxos.h" - -#include "common/Timer.h" -#include "common/Clock.h" - -#include "OSDMonitor.h" -#include "MDSMonitor.h" -#include "ClientMonitor.h" -#include "PGMonitor.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " mon" << whoami << (is_starting() ? (const char*)"(starting)":(is_leader() ? (const char*)"(leader)":(is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << whoami << (is_starting() ? (const char*)"(starting)":(is_leader() ? (const char*)"(leader)":(is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << " " - - - -void Monitor::init() -{ - lock.Lock(); - - dout(1) << "init" << dendl; - - // store - char s[80]; - sprintf(s, "mondata/mon%d", whoami); - store = new MonitorStore(s); - - if (g_conf.mkfs) - store->mkfs(); - - store->mount(); - - // create - osdmon = new OSDMonitor(this, &paxos_osdmap); - mdsmon = new MDSMonitor(this, &paxos_mdsmap); - clientmon = new ClientMonitor(this, &paxos_clientmap); - pgmon = new PGMonitor(this, &paxos_pgmap); - - // init paxos - paxos_test.init(); - paxos_osdmap.init(); - paxos_mdsmap.init(); - paxos_clientmap.init(); - paxos_pgmap.init(); - - // i'm ready! - messenger->set_dispatcher(this); - - // start ticker - reset_tick(); - - // call election? - if (monmap->num_mon > 1) { - assert(monmap->num_mon != 2); - call_election(); - } else { - // we're standalone. 
- set q; - q.insert(whoami); - win_election(1, q); - } - - lock.Unlock(); -} - -void Monitor::shutdown() -{ - dout(1) << "shutdown" << dendl; - - elector.shutdown(); - - if (is_leader()) { - // stop osds. - for (set::iterator it = osdmon->osdmap.get_osds().begin(); - it != osdmon->osdmap.get_osds().end(); - it++) { - if (osdmon->osdmap.is_down(*it)) continue; - dout(10) << "sending shutdown to osd" << *it << dendl; - messenger->send_message(new MGenericMessage(MSG_SHUTDOWN), - osdmon->osdmap.get_inst(*it)); - } - osdmon->mark_all_down(); - - // monitors too. - for (int i=0; inum_mon; i++) - if (i != whoami) - messenger->send_message(new MGenericMessage(MSG_SHUTDOWN), - monmap->get_inst(i)); - } - - // cancel all events - cancel_tick(); - timer.cancel_all(); - timer.join(); - - // unmount my local storage - if (store) - delete store; - - // clean up - if (osdmon) delete osdmon; - if (mdsmon) delete mdsmon; - if (clientmon) delete clientmon; - if (pgmon) delete pgmon; - - // die. - messenger->shutdown(); -} - - -void Monitor::call_election() -{ - if (monmap->num_mon == 1) return; - - dout(10) << "call_election" << dendl; - state = STATE_STARTING; - - // tell paxos - paxos_test.election_starting(); - paxos_mdsmap.election_starting(); - paxos_osdmap.election_starting(); - paxos_clientmap.election_starting(); - - // call a new election - elector.call_election(); -} - -void Monitor::win_election(epoch_t epoch, set& active) -{ - state = STATE_LEADER; - leader = whoami; - mon_epoch = epoch; - quorum = active; - dout(10) << "win_election, epoch " << mon_epoch << " quorum is " << quorum << dendl; - - // init paxos - paxos_test.leader_init(); - paxos_mdsmap.leader_init(); - paxos_osdmap.leader_init(); - paxos_clientmap.leader_init(); - paxos_pgmap.leader_init(); - - // init - osdmon->election_finished(); - mdsmon->election_finished(); - clientmon->election_finished(); - pgmon->election_finished(); -} - -void Monitor::lose_election(epoch_t epoch, int l) -{ - state = 
STATE_PEON; - mon_epoch = epoch; - leader = l; - dout(10) << "lose_election, epoch " << mon_epoch << " leader is mon" << leader << dendl; - - // init paxos - paxos_test.peon_init(); - paxos_mdsmap.peon_init(); - paxos_osdmap.peon_init(); - paxos_clientmap.peon_init(); - paxos_pgmap.peon_init(); - - // init - osdmon->election_finished(); - mdsmon->election_finished(); - clientmon->election_finished(); - pgmon->election_finished(); -} - - -void Monitor::handle_command(MMonCommand *m) -{ - dout(0) << "handle_command " << *m << dendl; - - int r = -1; - string rs = "unrecognized command"; - - if (!m->cmd.empty()) { - if (m->cmd[0] == "stop") { - r = 0; - rs = "stopping"; - do_stop(); - } - else if (m->cmd[0] == "mds") { - mdsmon->dispatch(m); - return; - } - else if (m->cmd[0] == "osd") { - - } - } - - // reply - messenger->send_message(new MMonCommandAck(r, rs), m->get_source_inst()); - delete m; -} - - -void Monitor::do_stop() -{ - dout(0) << "do_stop -- shutting down" << dendl; - stopping = true; - mdsmon->do_stop(); -} - - -void Monitor::dispatch(Message *m) -{ - lock.Lock(); - { - switch (m->get_type()) { - - // misc - case MSG_PING_ACK: - handle_ping_ack((MPingAck*)m); - break; - - case MSG_SHUTDOWN: - if (m->get_source().is_osd()) - osdmon->dispatch(m); - else - handle_shutdown(m); - break; - - case MSG_MON_COMMAND: - handle_command((MMonCommand*)m); - break; - - - // OSDs - case MSG_OSD_GETMAP: - case MSG_OSD_FAILURE: - case MSG_OSD_BOOT: - case MSG_OSD_IN: - case MSG_OSD_OUT: - osdmon->dispatch(m); - break; - - - // MDSs - case MSG_MDS_BEACON: - case MSG_MDS_GETMAP: - mdsmon->dispatch(m); - break; - - // clients - case MSG_CLIENT_MOUNT: - case MSG_CLIENT_UNMOUNT: - clientmon->dispatch(m); - break; - - // pg - case MSG_STATFS: - case MSG_PGSTATS: - pgmon->dispatch(m); - break; - - - // paxos - case MSG_MON_PAXOS: - { - MMonPaxos *pm = (MMonPaxos*)m; - - // sanitize - if (pm->epoch > mon_epoch) - call_election(); - if (pm->epoch != mon_epoch) { - delete pm; - 
break; - } - - // send it to the right paxos instance - switch (pm->machine_id) { - case PAXOS_TEST: - paxos_test.dispatch(m); - break; - case PAXOS_OSDMAP: - paxos_osdmap.dispatch(m); - break; - case PAXOS_MDSMAP: - paxos_mdsmap.dispatch(m); - break; - case PAXOS_CLIENTMAP: - paxos_clientmap.dispatch(m); - break; - default: - assert(0); - } - } - break; - - // elector messages - case MSG_MON_ELECTION: - elector.dispatch(m); - break; - - - default: - dout(0) << "unknown message " << m << " " << *m << " from " << m->get_source_inst() << dendl; - assert(0); - } - } - lock.Unlock(); -} - - -void Monitor::handle_shutdown(Message *m) -{ - assert(m->get_source().is_mon()); - if (m->get_source().num() == get_leader()) { - dout(1) << "shutdown from leader " << m->get_source() << dendl; - shutdown(); - } else { - dout(1) << "ignoring shutdown from non-leader " << m->get_source() << dendl; - } - delete m; -} - -void Monitor::handle_ping_ack(MPingAck *m) -{ - // ... - - delete m; -} - - - - -/************ TICK ***************/ - -class C_Mon_Tick : public Context { - Monitor *mon; -public: - C_Mon_Tick(Monitor *m) : mon(m) {} - void finish(int r) { - mon->tick(); - } -}; - -void Monitor::cancel_tick() -{ - if (tick_timer) timer.cancel_event(tick_timer); -} - -void Monitor::reset_tick() -{ - cancel_tick(); - tick_timer = new C_Mon_Tick(this); - timer.add_event_after(g_conf.mon_tick_interval, tick_timer); -} - - -void Monitor::tick() -{ - tick_timer = 0; - - // ok go. - dout(11) << "tick" << dendl; - - osdmon->tick(); - mdsmon->tick(); - - // next tick! 
- reset_tick(); -} - - - - - - - diff --git a/branches/sage/ebofs2/mon/Monitor.h b/branches/sage/ebofs2/mon/Monitor.h deleted file mode 100644 index bd278a2092308..0000000000000 --- a/branches/sage/ebofs2/mon/Monitor.h +++ /dev/null @@ -1,154 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MONITOR_H -#define __MONITOR_H - -#include "include/types.h" -#include "msg/Messenger.h" - -#include "common/Timer.h" - -#include "MonMap.h" -#include "Elector.h" -#include "Paxos.h" - - -class MonitorStore; -class OSDMonitor; -class MDSMonitor; -class ClientMonitor; -class PGMonitor; - -class Monitor : public Dispatcher { -public: - // me - int whoami; - Messenger *messenger; - Mutex lock; - - MonMap *monmap; - - // timer. 
- SafeTimer timer; - Context *tick_timer; - void cancel_tick(); - void reset_tick(); - friend class C_Mon_Tick; - - // -- local storage -- -public: - MonitorStore *store; - - // -- monitor state -- -private: - const static int STATE_STARTING = 0; // electing - const static int STATE_LEADER = 1; - const static int STATE_PEON = 2; - int state; - bool stopping; - -public: - bool is_starting() { return state == STATE_STARTING; } - bool is_leader() { return state == STATE_LEADER; } - bool is_peon() { return state == STATE_PEON; } - bool is_stopping() { return stopping; } - - - // -- elector -- -private: - Elector elector; - friend class Elector; - - epoch_t mon_epoch; // monitor epoch (election instance) - int leader; // current leader (to best of knowledge) - set quorum; // current active set of monitors (if !starting) - utime_t last_called_election; // [starting] last time i called an election - -public: - epoch_t get_epoch() { return mon_epoch; } - int get_leader() { return leader; } - const set& get_quorum() { return quorum; } - - void call_election(); // initiate election - void win_election(epoch_t epoch, set& q); // end election (called by Elector) - void lose_election(epoch_t epoch, int l); // end election (called by Elector) - - - // -- paxos -- - Paxos paxos_test; - Paxos paxos_mdsmap; - Paxos paxos_osdmap; - Paxos paxos_clientmap; - Paxos paxos_pgmap; - friend class Paxos; - - - // -- services -- - OSDMonitor *osdmon; - MDSMonitor *mdsmon; - ClientMonitor *clientmon; - PGMonitor *pgmon; - - friend class OSDMonitor; - friend class MDSMonitor; - friend class ClientMonitor; - friend class PGMonitor; - - - // messages - void handle_shutdown(Message *m); - void handle_ping_ack(class MPingAck *m); - void handle_command(class MMonCommand *m); - - - - public: - Monitor(int w, Messenger *m, MonMap *mm) : - whoami(w), - messenger(m), - monmap(mm), - timer(lock), tick_timer(0), - store(0), - - state(STATE_STARTING), stopping(false), - - elector(this, w), - mon_epoch(0), 
- leader(0), - - paxos_test(this, w, PAXOS_TEST), - paxos_mdsmap(this, w, PAXOS_MDSMAP), - paxos_osdmap(this, w, PAXOS_OSDMAP), - paxos_clientmap(this, w, PAXOS_CLIENTMAP), - paxos_pgmap(this, w, PAXOS_PGMAP), - - osdmon(0), mdsmon(0), clientmon(0) - { - } - ~Monitor() { - delete messenger; - } - - void init(); - void shutdown(); - void dispatch(Message *m); - void tick(); - - void do_stop(); - -}; - -#endif diff --git a/branches/sage/ebofs2/mon/MonitorStore.cc b/branches/sage/ebofs2/mon/MonitorStore.cc deleted file mode 100644 index 86df22bcd6590..0000000000000 --- a/branches/sage/ebofs2/mon/MonitorStore.cc +++ /dev/null @@ -1,222 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "MonitorStore.h" -#include "common/Clock.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " store(" << dir <<") " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " store(" << dir <<") " - -#include -#include -#include -#include -#include - -void MonitorStore::mount() -{ - dout(1) << "mount" << dendl; - // verify dir exists - DIR *d = ::opendir(dir.c_str()); - if (!d) { - derr(1) << "basedir " << dir << " dne" << dendl; - assert(0); - } - ::closedir(d); - - if (g_conf.use_abspaths) { - // combine it with the cwd, in case fuse screws things up (i.e. 
fakefuse) - string old = dir; - char *cwd = get_current_dir_name(); - dir = cwd; - free(cwd); - dir += "/"; - dir += old; - } -} - - -void MonitorStore::mkfs() -{ - dout(1) << "mkfs" << dendl; - - char cmd[200]; - sprintf(cmd, "test -d %s && /bin/rm -r %s ; mkdir -p %s", dir.c_str(), dir.c_str(), dir.c_str()); - dout(1) << cmd << dendl; - system(cmd); -} - - -version_t MonitorStore::get_int(const char *a, const char *b) -{ - char fn[200]; - if (b) - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - else - sprintf(fn, "%s/%s", dir.c_str(), a); - - FILE *f = ::fopen(fn, "r"); - if (!f) - return 0; - - char buf[20]; - ::fgets(buf, 20, f); - ::fclose(f); - - version_t val = atoi(buf); - - if (b) { - dout(15) << "get_int " << a << "/" << b << " = " << val << dendl; - } else { - dout(15) << "get_int " << a << " = " << val << dendl; - } - return val; -} - - -void MonitorStore::put_int(version_t val, const char *a, const char *b) -{ - char fn[200]; - sprintf(fn, "%s/%s", dir.c_str(), a); - if (b) { - ::mkdir(fn, 0755); - dout(15) << "set_int " << a << "/" << b << " = " << val << dendl; - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - } else { - dout(15) << "set_int " << a << " = " << val << dendl; - } - - char vs[30]; -#ifdef __LP64__ - sprintf(vs, "%ld\n", val); -#else - sprintf(vs, "%lld\n", val); -#endif - - char tfn[200]; - sprintf(tfn, "%s.new", fn); - - int fd = ::open(tfn, O_WRONLY|O_CREAT, 0644); - assert(fd > 0); - ::write(fd, vs, strlen(vs)); - ::close(fd); - ::rename(tfn, fn); -} - - -// ---------------------------------------- -// buffers - -bool MonitorStore::exists_bl_ss(const char *a, const char *b) -{ - char fn[200]; - if (b) { - dout(15) << "exists_bl " << a << "/" << b << dendl; - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - } else { - dout(15) << "exists_bl " << a << dendl; - sprintf(fn, "%s/%s", dir.c_str(), a); - } - - struct stat st; - int r = ::stat(fn, &st); - //dout(15) << "exists_bl stat " << fn << " r=" << r << " errno " << errno << " " << 
strerror(errno) << dendl; - return r == 0; -} - - -int MonitorStore::get_bl_ss(bufferlist& bl, const char *a, const char *b) -{ - char fn[200]; - if (b) { - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - } else { - sprintf(fn, "%s/%s", dir.c_str(), a); - } - - int fd = ::open(fn, O_RDONLY); - if (!fd) { - if (b) { - dout(15) << "get_bl " << a << "/" << b << " DNE" << dendl; - } else { - dout(15) << "get_bl " << a << " DNE" << dendl; - } - return 0; - } - - // get size - struct stat st; - int rc = ::fstat(fd, &st); - assert(rc == 0); - __int32_t len = st.st_size; - - // read buffer - bl.clear(); - bufferptr bp(len); - int off = 0; - while (off < len) { - dout(20) << "reading at off " << off << " of " << len << dendl; - int r = ::read(fd, bp.c_str()+off, len-off); - if (r < 0) derr(0) << "errno on read " << strerror(errno) << dendl; - assert(r>0); - off += r; - } - bl.append(bp); - ::close(fd); - - if (b) { - dout(15) << "get_bl " << a << "/" << b << " = " << bl.length() << " bytes" << dendl; - } else { - dout(15) << "get_bl " << a << " = " << bl.length() << " bytes" << dendl; - } - - return len; -} - -int MonitorStore::put_bl_ss(bufferlist& bl, const char *a, const char *b) -{ - char fn[200]; - sprintf(fn, "%s/%s", dir.c_str(), a); - if (b) { - ::mkdir(fn, 0755); - dout(15) << "put_bl " << a << "/" << b << " = " << bl.length() << " bytes" << dendl; - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - } else { - dout(15) << "put_bl " << a << " = " << bl.length() << " bytes" << dendl; - } - - char tfn[200]; - sprintf(tfn, "%s.new", fn); - int fd = ::open(tfn, O_WRONLY|O_CREAT, 0644); - assert(fd); - - // write data - for (list::const_iterator it = bl.buffers().begin(); - it != bl.buffers().end(); - it++) { - int r = ::write(fd, it->c_str(), it->length()); - if (r != (int)it->length()) - derr(0) << "put_bl_ss ::write() returned " << r << " not " << it->length() << dendl; - if (r < 0) - derr(0) << "put_bl_ss ::write() errored out, errno is " << strerror(errno) << dendl; - } - 
- ::fsync(fd); - ::close(fd); - ::rename(tfn, fn); - - return 0; -} diff --git a/branches/sage/ebofs2/mon/MonitorStore.h b/branches/sage/ebofs2/mon/MonitorStore.h deleted file mode 100644 index 485bf972551c4..0000000000000 --- a/branches/sage/ebofs2/mon/MonitorStore.h +++ /dev/null @@ -1,82 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MON_MONITORSTORE_H -#define __MON_MONITORSTORE_H - -#include "include/types.h" -#include "include/buffer.h" - -#include - -class MonitorStore { - string dir; - -public: - MonitorStore(char *d) : dir(d) { - } - ~MonitorStore() { - } - - void mkfs(); // wipe - void mount(); - - // ints (stored as ascii) - version_t get_int(const char *a, const char *b=0); - void put_int(version_t v, const char *a, const char *b=0); - - // buffers - // ss and sn varieties. 
- bool exists_bl_ss(const char *a, const char *b=0); - int get_bl_ss(bufferlist& bl, const char *a, const char *b); - int put_bl_ss(bufferlist& bl, const char *a, const char *b); - bool exists_bl_sn(const char *a, version_t b) { - char bs[20]; -#ifdef __LP64__ - sprintf(bs, "%lu", b); -#else - sprintf(bs, "%llu", b); -#endif - return exists_bl_ss(a, bs); - } - int get_bl_sn(bufferlist& bl, const char *a, version_t b) { - char bs[20]; -#ifdef __LP64__ - sprintf(bs, "%lu", b); -#else - sprintf(bs, "%llu", b); -#endif - return get_bl_ss(bl, a, bs); - } - int put_bl_sn(bufferlist& bl, const char *a, version_t b) { - char bs[20]; -#ifdef __LP64__ - sprintf(bs, "%lu", b); -#else - sprintf(bs, "%llu", b); -#endif - return put_bl_ss(bl, a, bs); - } - - /* - version_t get_incarnation() { return get_int("incarnation"); } - void set_incarnation(version_t i) { set_int(i, "incarnation"); } - - version_t get_last_proposal() { return get_int("last_proposal"); } - void set_last_proposal(version_t i) { set_int(i, "last_proposal"); } - */ -}; - - -#endif diff --git a/branches/sage/ebofs2/mon/OSDMonitor.cc b/branches/sage/ebofs2/mon/OSDMonitor.cc deleted file mode 100644 index 8c8fb91b2b18c..0000000000000 --- a/branches/sage/ebofs2/mon/OSDMonitor.cc +++ /dev/null @@ -1,847 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "OSDMonitor.h" -#include "Monitor.h" -#include "MDSMonitor.h" - -#include "MonitorStore.h" - -#include "crush/CrushWrapper.h" - -#include "messages/MOSDFailure.h" -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" -#include "messages/MOSDBoot.h" -#include "messages/MOSDIn.h" -#include "messages/MOSDOut.h" - -#include "messages/MMonOSDMapInfo.h" -#include "messages/MMonOSDMapLease.h" -#include "messages/MMonOSDMapLeaseAck.h" -#include "messages/MMonOSDMapUpdatePrepare.h" -#include "messages/MMonOSDMapUpdateAck.h" -#include "messages/MMonOSDMapUpdateCommit.h" - -#include "common/Timer.h" - -#include "config.h" - - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".osd e" << osdmap.get_epoch() << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? 
(const char*)"(peon)":(const char*)"(?\?)"))) << ".osd e" << osdmap.get_epoch() << " " - - -// FAKING - -class C_Mon_FakeOSDFailure : public Context { - OSDMonitor *mon; - int osd; - bool down; -public: - C_Mon_FakeOSDFailure(OSDMonitor *m, int o, bool d) : mon(m), osd(o), down(d) {} - void finish(int r) { - mon->fake_osd_failure(osd,down); - } -}; - -void OSDMonitor::fake_osd_failure(int osd, bool down) -{ - if (down) { - dout(1) << "fake_osd_failure DOWN osd" << osd << dendl; - pending_inc.new_down[osd].first = osdmap.osd_inst[osd]; - pending_inc.new_down[osd].second = false; - } else { - dout(1) << "fake_osd_failure OUT osd" << osd << dendl; - pending_inc.new_out.push_back(osd); - } - propose_pending(); - - // fixme - //bcast_latest_osd(); - //bcast_latest_mds(); -} - -void OSDMonitor::fake_osdmap_update() -{ - dout(1) << "fake_osdmap_update" << dendl; - propose_pending(); - - // tell a random osd - int osd = rand() % g_conf.num_osd; - send_latest(osdmap.get_inst(osd)); -} - - -void OSDMonitor::fake_reorg() -{ - int r = rand() % g_conf.num_osd; - - if (osdmap.is_out(r)) { - dout(1) << "fake_reorg marking osd" << r << " in" << dendl; - pending_inc.new_in.push_back(r); - } else { - dout(1) << "fake_reorg marking osd" << r << " out" << dendl; - pending_inc.new_out.push_back(r); - } - - propose_pending(); - send_latest(osdmap.get_inst(r)); // after -} - - - -/************ MAPS ****************/ - -void OSDMonitor::create_initial() -{ - assert(mon->is_leader()); - assert(paxos->get_version() == 0); - - dout(1) << "create_initial -- creating initial osdmap from g_conf" << dendl; - - // - OSDMap newmap; - newmap.mon_epoch = mon->mon_epoch; - newmap.ctime = g_clock.now(); - - newmap.set_pg_num(g_conf.num_osd << g_conf.osd_pg_bits); - - // start at epoch 1 until all osds boot - newmap.inc_epoch(); // = 1 - assert(newmap.get_epoch() == 1); - - map weights; - build_crush_map(newmap.crush, weights); - - for (int i=0; i - - // fake osd failures - for (map::iterator i = 
g_fake_osd_down.begin(); - i != g_fake_osd_down.end(); - i++) { - dout(0) << "will fake osd" << i->first << " DOWN after " << i->second << dendl; - mon->timer.add_event_after(i->second, new C_Mon_FakeOSDFailure(this, i->first, 1)); - } - for (map::iterator i = g_fake_osd_out.begin(); - i != g_fake_osd_out.end(); - i++) { - dout(0) << "will fake osd" << i->first << " OUT after " << i->second << dendl; - mon->timer.add_event_after(i->second, new C_Mon_FakeOSDFailure(this, i->first, 0)); - } - - // encode into pending incremental - newmap.encode(pending_inc.fullmap); -} - - -void OSDMonitor::build_crush_map(CrushWrapper& crush, - map& weights) -{ - // new - crush.create(); - - if (g_conf.num_osd >= 12) { - int ndom = g_conf.osd_max_rep; - int ritems[ndom]; - int rweights[ndom]; - - int nper = ((g_conf.num_osd - 1) / ndom) + 1; - derr(0) << ndom << " failure domains, " << nper << " osds each" << dendl; - - int o = 0; - for (int i=0; i= i) { - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, nroot)); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE_INDEP, i, 1)); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE_INDEP, 1, 0)); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - } else { - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, nroot)); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE_INDEP, i, 0)); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - } - } - */ - - // test - //vector out; - //pg_to_osds(0x40200000110ULL, out); - - } else { - // one bucket - - int items[g_conf.num_osd]; - for (int i=0; imax_devices << dendl; - //vector t; - //crush.do_rule(2, 132, t, 4, -1); -} - - -bool OSDMonitor::update_from_paxos() -{ - assert(paxos->is_active()); - - version_t paxosv = paxos->get_version(); - if (paxosv == osdmap.epoch) return true; - assert(paxosv >= osdmap.epoch); - - dout(15) << "update_from_paxos paxos e " << paxosv - << ", my e " << osdmap.epoch << dendl; - - if (osdmap.epoch == 0 && 
paxosv > 1) { - // startup: just load latest full map - epoch_t lastfull = mon->store->get_int("osdmap_full","last_epoch"); - if (lastfull) { - dout(7) << "update_from_paxos startup: loading latest full map e" << lastfull << dendl; - bufferlist bl; - mon->store->get_bl_sn(bl, "osdmap_full", lastfull); - osdmap.decode(bl); - } - } - - // walk through incrementals - while (paxosv > osdmap.epoch) { - bufferlist bl; - bool success = paxos->read(osdmap.epoch+1, bl); - assert(success); - - dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1 << dendl; - OSDMap::Incremental inc; - int off = 0; - inc.decode(bl, off); - osdmap.apply_incremental(inc); - - // write out the full map, too. - bl.clear(); - osdmap.encode(bl); - mon->store->put_bl_sn(bl, "osdmap_full", osdmap.epoch); - - // share - dout(1) << osdmap.osds.size() << " osds, " - << osdmap.down_osds.size() << " down, " - << osdmap.out_osds.size() << " out" - << dendl; - } - mon->store->put_int(osdmap.epoch, "osdmap_full","last_epoch"); - - // new map! 
- bcast_latest_mds(); - send_to_waiting(); - - return true; -} - - -void OSDMonitor::create_pending() -{ - pending_inc = OSDMap::Incremental(osdmap.epoch+1); - dout(10) << "create_pending e " << pending_inc.epoch - << dendl; -} - -void OSDMonitor::encode_pending(bufferlist &bl) -{ - dout(10) << "encode_pending e " << pending_inc.epoch - << dendl; - - // finish up pending_inc - pending_inc.ctime = g_clock.now(); - pending_inc.mon_epoch = mon->mon_epoch; - - // tell me about it - for (map >::iterator i = pending_inc.new_down.begin(); - i != pending_inc.new_down.end(); - i++) { - dout(2) << " osd" << i->first << " DOWN " << i->second.first << " clean=" << i->second.second << dendl; - derr(0) << " osd" << i->first << " DOWN " << i->second.first << " clean=" << i->second.second << dendl; - mon->messenger->mark_down(i->second.first.addr); - } - for (map::iterator i = pending_inc.new_up.begin(); - i != pending_inc.new_up.end(); - i++) { - dout(2) << " osd" << i->first << " UP " << i->second << dendl; - derr(0) << " osd" << i->first << " UP " << i->second << dendl; - } - for (list::iterator i = pending_inc.new_out.begin(); - i != pending_inc.new_out.end(); - i++) { - dout(2) << " osd" << *i << " OUT" << dendl; - derr(0) << " osd" << *i << " OUT" << dendl; - } - for (list::iterator i = pending_inc.new_in.begin(); - i != pending_inc.new_in.end(); - i++) { - dout(2) << " osd" << *i << " IN" << dendl; - derr(0) << " osd" << *i << " IN" << dendl; - } - - // encode - assert(paxos->get_version() + 1 == pending_inc.epoch); - pending_inc.encode(bl); -} - - -void OSDMonitor::committed() -{ - -} - - -// ------------- - -bool OSDMonitor::preprocess_query(Message *m) -{ - dout(10) << "preprocess_query " << *m << " from " << m->get_source_inst() << dendl; - - switch (m->get_type()) { - // READs - case MSG_OSD_GETMAP: - handle_osd_getmap((MOSDGetMap*)m); - return true; - - // damp updates - case MSG_OSD_FAILURE: - return preprocess_failure((MOSDFailure*)m); - case MSG_OSD_BOOT: - return 
preprocess_boot((MOSDBoot*)m); - /* - case MSG_OSD_IN: - return preprocess_in((MOSDIn*)m); - case MSG_OSD_OUT: - return preprocess_out((MOSDOut*)m); - */ - - default: - assert(0); - delete m; - return true; - } -} - -bool OSDMonitor::prepare_update(Message *m) -{ - dout(7) << "prepare_update " << *m << " from " << m->get_source_inst() << dendl; - - switch (m->get_type()) { - // damp updates - case MSG_OSD_FAILURE: - return prepare_failure((MOSDFailure*)m); - case MSG_OSD_BOOT: - return prepare_boot((MOSDBoot*)m); - - /* - case MSG_OSD_IN: - return prepare_in((MOSDIn*)m); - case MSG_OSD_OUT: - return prepare_out((MOSDOut*)m); - */ - - default: - assert(0); - delete m; - } - - return false; -} - -bool OSDMonitor::should_propose(double& delay) -{ - if (osdmap.epoch == 1) { - if (pending_inc.new_up.size() == osdmap.get_osds().size()) { - delay = 0.0; - if (g_conf.osd_auto_weight) { - CrushWrapper crush; - build_crush_map(crush, osd_weight); - crush._encode(pending_inc.crush); - } - return true; - } else - return false; - } - return PaxosService::should_propose(delay); -} - - - -// --------------------------- -// READs - -void OSDMonitor::handle_osd_getmap(MOSDGetMap *m) -{ - dout(7) << "handle_osd_getmap from " << m->get_source() << " from " << m->get_start_epoch() << dendl; - - if (m->get_start_epoch()) { - if (m->get_want_epoch() <= osdmap.get_epoch()) - send_incremental(m->get_source_inst(), m->get_start_epoch()); - else - waiting_for_map[m->get_source_inst()] = pair(m->get_start_epoch(), - m->get_want_epoch()); - } else - send_full(m->get_source_inst()); - - delete m; -} - - - -// --------------------------- -// UPDATEs - -// failure -- - -bool OSDMonitor::preprocess_failure(MOSDFailure *m) -{ - int badboy = m->get_failed().name.num(); - - // weird? 
- if (!osdmap.have_inst(badboy)) { - dout(5) << "preprocess_failure dne(/dup?): " << m->get_failed() << ", from " << m->get_from() << dendl; - send_incremental(m->get_from(), m->get_epoch()+1); - return true; - } - if (osdmap.get_inst(badboy) != m->get_failed()) { - dout(5) << "preprocess_failure wrong osd: report " << m->get_failed() << " != map's " << osdmap.get_inst(badboy) - << ", from " << m->get_from() << dendl; - send_incremental(m->get_from(), m->get_epoch()+1); - return true; - } - // already reported? - if (osdmap.is_down(badboy)) { - dout(5) << "preprocess_failure dup: " << m->get_failed() << ", from " << m->get_from() << dendl; - send_incremental(m->get_from(), m->get_epoch()+1); - return true; - } - - dout(10) << "preprocess_failure new: " << m->get_failed() << ", from " << m->get_from() << dendl; - return false; -} - -bool OSDMonitor::prepare_failure(MOSDFailure *m) -{ - dout(1) << "prepare_failure " << m->get_failed() << " from " << m->get_from() << dendl; - - // FIXME - // take their word for it - int badboy = m->get_failed().name.num(); - assert(osdmap.is_up(badboy)); - assert(osdmap.osd_inst[badboy] == m->get_failed()); - - pending_inc.new_down[badboy].first = m->get_failed(); - pending_inc.new_down[badboy].second = false; - - if (osdmap.is_in(badboy)) - down_pending_out[badboy] = g_clock.now(); - - paxos->wait_for_commit(new C_Reported(this, m)); - - return true; -} - -void OSDMonitor::_reported_failure(MOSDFailure *m) -{ - dout(7) << "_reported_failure on " << m->get_failed() << ", telling " << m->get_from() << dendl; - send_latest(m->get_from(), m->get_epoch()); -} - - -// boot -- - -bool OSDMonitor::preprocess_boot(MOSDBoot *m) -{ - assert(m->inst.name.is_osd()); - int from = m->inst.name.num(); - - // already booted? - if (osdmap.is_up(from) && - osdmap.get_inst(from) == m->inst) { - // yup. 
- dout(7) << "preprocess_boot dup from " << m->inst << dendl; - _booted(m); - return true; - } - - dout(10) << "preprocess_boot from " << m->inst << dendl; - return false; -} - -bool OSDMonitor::prepare_boot(MOSDBoot *m) -{ - dout(7) << "prepare_boot from " << m->inst << dendl; - assert(m->inst.name.is_osd()); - int from = m->inst.name.num(); - - // does this osd exist? - if (!osdmap.exists(from)) { - dout(1) << "boot from non-existent osd" << from << dendl; - delete m; - return true; - } - - // already up? mark down first? - if (osdmap.is_up(from)) { - dout(7) << "prepare_boot was up, first marking down " << osdmap.get_inst(from) << dendl; - assert(osdmap.get_inst(from) != m->inst); // preproces should have caught it - - // mark previous guy down - pending_inc.new_down[from].first = osdmap.osd_inst[from]; - pending_inc.new_down[from].second = false; - - paxos->wait_for_commit(new C_RetryMessage(this, m)); - } else { - // mark new guy up. - down_pending_out.erase(from); // if any - pending_inc.new_up[from] = m->inst; - - // mark in? 
- if (osdmap.out_osds.count(from)) - pending_inc.new_in.push_back(from); - - osd_weight[from] = m->sb.weight; - - // wait - paxos->wait_for_commit(new C_Booted(this, m)); - } - return true; -} - -void OSDMonitor::_booted(MOSDBoot *m) -{ - dout(7) << "_booted " << m->inst << " w " << m->sb.weight << dendl; - send_latest(m->inst, m->sb.current_epoch); - delete m; -} - - - - - -// --------------- -// map helpers - -void OSDMonitor::send_to_waiting() -{ - dout(10) << "send_to_waiting " << osdmap.get_epoch() << dendl; - - map >::iterator i = waiting_for_map.begin(); - while (i != waiting_for_map.end()) { - if (i->second.first) { - if (i->second.second <= osdmap.get_epoch()) - send_incremental(i->first, i->second.first); - else { - dout(10) << "send_to_waiting skipping " << i->first - << " has " << i->second.first - << " wants " << i->second.second - << dendl; - i++; - continue; - } - } else - send_full(i->first); - - waiting_for_map.erase(i++); - } -} - -void OSDMonitor::send_latest(entity_inst_t who, epoch_t start) -{ - if (paxos->is_readable()) { - dout(5) << "send_latest to " << who << " now" << dendl; - if (start == 0) - send_full(who); - else - send_incremental(who, start); - } else { - dout(5) << "send_latest to " << who << " later" << dendl; - waiting_for_map[who] = pair(start, 0); - } -} - - -void OSDMonitor::send_full(entity_inst_t who) -{ - dout(5) << "send_full to " << who << dendl; - mon->messenger->send_message(new MOSDMap(&osdmap), who); -} - -void OSDMonitor::send_incremental(entity_inst_t dest, epoch_t from) -{ - dout(5) << "send_incremental from " << from << " -> " << osdmap.get_epoch() - << " to " << dest << dendl; - - MOSDMap *m = new MOSDMap; - - for (epoch_t e = osdmap.get_epoch(); - e >= from; - e--) { - bufferlist bl; - if (mon->store->get_bl_sn(bl, "osdmap", e) > 0) { - dout(20) << "send_incremental inc " << e << " " << bl.length() << " bytes" << dendl; - m->incremental_maps[e] = bl; - } - else if (mon->store->get_bl_sn(bl, "osdmap_full", e) > 0) 
{ - dout(20) << "send_incremental full " << e << dendl; - m->maps[e] = bl; - } - else { - assert(0); // we should have all maps. - } - } - - mon->messenger->send_message(m, dest); -} - - -void OSDMonitor::bcast_latest_mds() -{ - epoch_t e = osdmap.get_epoch(); - dout(1) << "bcast_latest_mds epoch " << e << dendl; - - // tell mds - set up; - mon->mdsmon->mdsmap.get_up_mds_set(up); - for (set::iterator i = up.begin(); - i != up.end(); - i++) { - send_incremental(mon->mdsmon->mdsmap.get_inst(*i), osdmap.get_epoch()); - } -} - -void OSDMonitor::bcast_latest_osd() -{ - epoch_t e = osdmap.get_epoch(); - dout(1) << "bcast_latest_osd epoch " << e << dendl; - - // tell osds - set osds; - osdmap.get_all_osds(osds); - for (set::iterator it = osds.begin(); - it != osds.end(); - it++) { - if (osdmap.is_down(*it)) continue; - - send_incremental(osdmap.get_inst(*it), osdmap.get_epoch()); - } -} - -void OSDMonitor::bcast_full_osd() -{ - epoch_t e = osdmap.get_epoch(); - dout(1) << "bcast_full_osd epoch " << e << dendl; - - // tell osds - set osds; - osdmap.get_all_osds(osds); - for (set::iterator it = osds.begin(); - it != osds.end(); - it++) { - if (osdmap.is_down(*it)) continue; - send_full(osdmap.get_inst(*it)); - } -} - - -// TICK - - -void OSDMonitor::tick() -{ - // mark down osds out? 
- utime_t now = g_clock.now(); - list mark_out; - for (map::iterator i = down_pending_out.begin(); - i != down_pending_out.end(); - i++) { - utime_t down = now; - down -= i->second; - - if (down.sec() >= g_conf.mon_osd_down_out_interval) { - dout(10) << "tick marking osd" << i->first << " OUT after " << down << " sec" << dendl; - mark_out.push_back(i->first); - } - } - for (list::iterator i = mark_out.begin(); - i != mark_out.end(); - i++) { - down_pending_out.erase(*i); - pending_inc.new_out.push_back( *i ); - } - if (!mark_out.empty()) { - propose_pending(); - } -} - - - - - -/* -void OSDMonitor::init() -{ - // start with blank map - - // load my last state from the store - bufferlist bl; - if (get_map_bl(0, bl)) { // FIXME - // yay! - osdmap.decode(bl); - dout(1) << "init got epoch " << osdmap.get_epoch() << " from store" << dendl; - - // set up pending_inc - pending_inc.epoch = osdmap.get_epoch()+1; - } -} -*/ - - - - -void OSDMonitor::mark_all_down() -{ - assert(mon->is_leader()); - - dout(7) << "mark_all_down" << dendl; - - for (set::iterator it = osdmap.get_osds().begin(); - it != osdmap.get_osds().end(); - it++) { - if (osdmap.is_down(*it)) continue; - pending_inc.new_down[*it].first = osdmap.get_inst(*it); - pending_inc.new_down[*it].second = true; // FIXME: am i sure it's clean? we need a proper osd shutdown sequence! 
- } - - propose_pending(); -} - - - - - - - - - - - - - - - -/* - - -void OSDMonitor::election_finished() -{ - dout(10) << "election_finished" << dendl; - - if (mon->is_leader()) { - if (g_conf.mkfs) { - create_initial(); - save_map(); - } else { - // - epoch_t epoch = mon->store->get_int("osd_epoch"); - dout(10) << " last epoch was " << epoch << dendl; - bufferlist bl, blinc; - int r = mon->store->get_bl_sn(bl, "osdmap_full", epoch); - assert(r>0); - osdmap.decode(bl); - - // pending_inc - pending_inc.epoch = epoch+1; - } - - } - -} - - - -*/ diff --git a/branches/sage/ebofs2/mon/OSDMonitor.h b/branches/sage/ebofs2/mon/OSDMonitor.h deleted file mode 100644 index c22c007f2d9b6..0000000000000 --- a/branches/sage/ebofs2/mon/OSDMonitor.h +++ /dev/null @@ -1,131 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __OSDMONITOR_H -#define __OSDMONITOR_H - -#include -#include -using namespace std; - -#include "include/types.h" -#include "msg/Messenger.h" - -#include "osd/OSDMap.h" - -#include "PaxosService.h" - -class Monitor; -class MOSDBoot; - -class OSDMonitor : public PaxosService { -public: - OSDMap osdmap; - -private: - map > waiting_for_map; // who -> (has, wants) - - // [leader] - OSDMap::Incremental pending_inc; - map down_pending_out; // osd down -> out - - map osd_weight; - - void build_crush_map(CrushWrapper& crush, - map& weights); - - // svc - void create_initial(); - bool update_from_paxos(); - void create_pending(); // prepare a new pending - void encode_pending(bufferlist &bl); - - void committed(); - - void handle_query(Message *m); - bool preprocess_query(Message *m); // true if processed. - bool prepare_update(Message *m); - bool should_propose(double &delay); - - // ... - bool get_map_bl(epoch_t epoch, bufferlist &bl); - bool get_inc_map_bl(epoch_t epoch, bufferlist &bl); - - void send_to_waiting(); // send current map to waiters. 
- void send_full(entity_inst_t dest); - void send_incremental(entity_inst_t dest, epoch_t since); - void bcast_latest_mds(); - void bcast_latest_osd(); - void bcast_full_osd(); - - void handle_osd_getmap(class MOSDGetMap *m); - - bool preprocess_failure(class MOSDFailure *m); - bool prepare_failure(class MOSDFailure *m); - void _reported_failure(MOSDFailure *m); - - bool preprocess_boot(class MOSDBoot *m); - bool prepare_boot(class MOSDBoot *m); - void _booted(MOSDBoot *m); - - class C_Booted : public Context { - OSDMonitor *cmon; - MOSDBoot *m; - public: - C_Booted(OSDMonitor *cm, MOSDBoot *m_) : - cmon(cm), m(m_) {} - void finish(int r) { - if (r >= 0) - cmon->_booted(m); - else - cmon->dispatch((Message*)m); - } - }; - class C_Reported : public Context { - OSDMonitor *cmon; - MOSDFailure *m; - public: - C_Reported(OSDMonitor *cm, MOSDFailure *m_) : - cmon(cm), m(m_) {} - void finish(int r) { - if (r >= 0) - cmon->_reported_failure(m); - else - cmon->dispatch((Message*)m); - } - }; - - bool preprocess_in(class MOSDIn *m); - bool prepare_in(class MOSDIn *m); - - bool preprocess_out(class MOSDOut *m); - bool prepare_out(class MOSDOut *m); - - public: - OSDMonitor(Monitor *mn, Paxos *p) : - PaxosService(mn, p) { } - - void tick(); // check state, take actions - - void mark_all_down(); - - void send_latest(entity_inst_t i, epoch_t start=0); - - void fake_osd_failure(int osd, bool down); - void fake_osdmap_update(); - void fake_reorg(); -}; - -#endif diff --git a/branches/sage/ebofs2/mon/PGMap.h b/branches/sage/ebofs2/mon/PGMap.h deleted file mode 100644 index b915c28cbd755..0000000000000 --- a/branches/sage/ebofs2/mon/PGMap.h +++ /dev/null @@ -1,103 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public 
- * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __PGMAP_H -#define __PGMAP_H - -#include "osd/osd_types.h" - -class PGMap { -public: - // the map - version_t version; - hash_map pg_stat; - hash_map osd_stat; - - class Incremental { - public: - version_t version; - map pg_stat_updates; - map osd_stat_updates; - - void _encode(bufferlist &bl) { - ::_encode(version, bl); - ::_encode(pg_stat_updates, bl); - ::_encode(osd_stat_updates, bl); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(version, bl, off); - ::_decode(pg_stat_updates, bl, off); - ::_decode(osd_stat_updates, bl, off); - } - - Incremental() : version(0) {} - }; - - void apply_incremental(Incremental& inc) { - assert(inc.version == version+1); - version++; - for (map::iterator p = inc.pg_stat_updates.begin(); - p != inc.pg_stat_updates.end(); - ++p) { - if (pg_stat.count(p->first)) - stat_sub(pg_stat[p->first]); - pg_stat[p->first] = p->second; - stat_add(p->second); - } - } - - // aggregate stats (soft state) - hash_map num_pg_by_state; - int64_t num_pg; - int64_t total_size; - int64_t total_num_blocks; - - void stat_zero() { - num_pg = 0; - num_pg_by_state.clear(); - total_size = 0; - total_num_blocks = 0; - } - void stat_add(pg_stat_t &s) { - num_pg++; - num_pg_by_state[s.state]++; - total_size += s.size; - total_num_blocks += s.num_blocks; - } - void stat_sub(pg_stat_t &s) { - num_pg--; - num_pg_by_state[s.state]--; - total_size -= s.size; - total_num_blocks -= s.num_blocks; - } - - PGMap() : version(0), - num_pg(0), total_size(0), total_num_blocks(0) {} - - void _encode(bufferlist &bl) { - ::_encode(version, bl); - ::_encode(pg_stat, bl); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(version, bl, off); - ::_decode(pg_stat, bl, off); - stat_zero(); - for (hash_map::iterator p = pg_stat.begin(); - p != pg_stat.end(); - ++p) - stat_add(p->second); - } -}; - -#endif diff --git a/branches/sage/ebofs2/mon/PGMonitor.cc 
b/branches/sage/ebofs2/mon/PGMonitor.cc deleted file mode 100644 index 6e571fea7f612..0000000000000 --- a/branches/sage/ebofs2/mon/PGMonitor.cc +++ /dev/null @@ -1,219 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "PGMonitor.h" -#include "Monitor.h" -#include "MDSMonitor.h" -#include "OSDMonitor.h" -#include "MonitorStore.h" - -#include "messages/MPGStats.h" - -#include "messages/MStatfs.h" -#include "messages/MStatfsReply.h" - -#include "common/Timer.h" - -#include "osd/osd_types.h" -#include "osd/PG.h" // yuck - -#include "config.h" -#include - - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".pg " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? 
(const char*)"(peon)":(const char*)"(?\?)"))) << ".pg " - - - -void PGMonitor::create_initial() -{ - dout(1) << "create_initial -- creating initial map" << dendl; -} - -bool PGMonitor::update_from_paxos() -{ - version_t paxosv = paxos->get_version(); - if (paxosv == pg_map.version) return true; - assert(paxosv >= pg_map.version); - - if (pg_map.version == 0 && paxosv > 1 && - mon->store->exists_bl_ss("pgmap","latest")) { - // starting up: load latest - dout(7) << "update_from_paxos startup: loading latest full pgmap" << dendl; - bufferlist bl; - mon->store->get_bl_ss(bl, "pgmap", "latest"); - int off = 0; - pg_map._decode(bl, off); - } - - // walk through incrementals - while (paxosv > pg_map.version) { - bufferlist bl; - bool success = paxos->read(pg_map.version+1, bl); - if (success) { - dout(7) << "update_from_paxos applying incremental " << pg_map.version+1 << dendl; - PGMap::Incremental inc; - int off = 0; - inc._decode(bl, off); - pg_map.apply_incremental(inc); - - std::stringstream ss; - for (hash_map::iterator p = pg_map.num_pg_by_state.begin(); - p != pg_map.num_pg_by_state.end(); - ++p) { - if (p != pg_map.num_pg_by_state.begin()) - ss << ", "; - ss << p->second << " " << PG::get_state_string(p->first) << "(" << p->first << ")"; - } - string states = ss.str(); - dout(0) << "v" << pg_map.version << " " << states << dendl; - - } else { - dout(7) << "update_from_paxos couldn't read incremental " << pg_map.version+1 << dendl; - return false; - } - } - - // save latest - bufferlist bl; - pg_map._encode(bl); - mon->store->put_bl_ss(bl, "pgmap", "latest"); - - return true; -} - -void PGMonitor::create_pending() -{ - pending_inc = PGMap::Incremental(); - pending_inc.version = pg_map.version + 1; - dout(10) << "create_pending v " << pending_inc.version << dendl; -} - -void PGMonitor::encode_pending(bufferlist &bl) -{ - assert(mon->is_leader()); - dout(10) << "encode_pending v " << pending_inc.version << dendl; - assert(paxos->get_version() + 1 == 
pending_inc.version); - pending_inc._encode(bl); -} - -bool PGMonitor::preprocess_query(Message *m) -{ - dout(10) << "preprocess_query " << *m << " from " << m->get_source_inst() << dendl; - - switch (m->get_type()) { - case MSG_STATFS: - handle_statfs((MStatfs*)m); - return true; - - case MSG_PGSTATS: - { - MPGStats *stats = (MPGStats*)m; - for (map::iterator p = stats->pg_stat.begin(); - p != stats->pg_stat.end(); - p++) { - if (pg_map.pg_stat.count(p->first) == 0 || - pg_map.pg_stat[p->first].reported < p->second.reported) - return false; - } - dout(10) << " message contains no new pg stats" << dendl; - return true; - } - - default: - assert(0); - delete m; - return true; - } -} - -bool PGMonitor::prepare_update(Message *m) -{ - dout(10) << "prepare_update " << *m << " from " << m->get_source_inst() << dendl; - switch (m->get_type()) { - case MSG_PGSTATS: - return handle_pg_stats((MPGStats*)m); - - default: - assert(0); - delete m; - return false; - } -} - -void PGMonitor::committed() -{ - -} - -void PGMonitor::handle_statfs(MStatfs *statfs) -{ - dout(10) << "handle_statfs " << *statfs << " from " << statfs->get_source() << dendl; - - // fill out stfs - MStatfsReply *reply = new MStatfsReply(statfs->tid); - memset(&reply->stfs, 0, sizeof(reply->stfs)); - reply->stfs.f_bsize = 1024; - reply->stfs.f_frsize = 1024; - reply->stfs.f_blocks = 1024 * 1024; //pg_map.total_num_blocks; - reply->stfs.f_bfree = 1024 * 1024; - reply->stfs.f_bavail = 1024 * 1024; - reply->stfs.f_files = 1024 * 1024; - reply->stfs.f_ffree = 1024 * 1024; - reply->stfs.f_favail = 1024 * 1024; - reply->stfs.f_namemax = 1024; - reply->stfs.f_flag = ST_NOATIME|ST_NODIRATIME; // for now. 
- - // reply - mon->messenger->send_message(reply, statfs->get_source_inst()); - delete statfs; -} - -bool PGMonitor::handle_pg_stats(MPGStats *stats) -{ - dout(10) << "handle_pg_stats " << *stats << " from " << stats->get_source() << dendl; - - for (map::iterator p = stats->pg_stat.begin(); - p != stats->pg_stat.end(); - p++) { - pg_t pgid = p->first; - if ((pg_map.pg_stat.count(pgid) && - pg_map.pg_stat[pgid].reported > p->second.reported)) { - dout(15) << " had " << pgid << " from " << pg_map.pg_stat[pgid].reported << dendl; - continue; - } - if (pending_inc.pg_stat_updates.count(pgid) && - pending_inc.pg_stat_updates[pgid].reported > p->second.reported) { - dout(15) << " had " << pgid << " from " << pending_inc.pg_stat_updates[pgid].reported - << " (pending)" << dendl; - continue; - } - - dout(15) << " got " << pgid << " reported at " << p->second.reported - << " state " << PG::get_state_string(p->second.state) - << dendl; - pending_inc.pg_stat_updates[pgid] = p->second; - - // we don't care about consistency; apply to live map. - if (pg_map.pg_stat.count(pgid)) - pg_map.stat_sub(pg_map.pg_stat[pgid]); - pg_map.pg_stat[pgid] = p->second; - pg_map.stat_add(pg_map.pg_stat[pgid]); - } - - delete stats; - return true; -} diff --git a/branches/sage/ebofs2/mon/PGMonitor.h b/branches/sage/ebofs2/mon/PGMonitor.h deleted file mode 100644 index 7b6d44f814fd2..0000000000000 --- a/branches/sage/ebofs2/mon/PGMonitor.h +++ /dev/null @@ -1,58 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __PGMONITOR_H -#define __PGMONITOR_H - -#include -#include -using namespace std; - -#include "include/types.h" -#include "msg/Messenger.h" -#include "PaxosService.h" - -#include "PGMap.h" - -class MPGStats; -class MStatfs; - -class PGMonitor : public PaxosService { -public: - -private: - PGMap pg_map; - PGMap::Incremental pending_inc; - - void create_initial(); - bool update_from_paxos(); - void create_pending(); // prepare a new pending - void encode_pending(bufferlist &bl); // propose pending update to peers - - void committed(); - - bool preprocess_query(Message *m); // true if processed. - bool prepare_update(Message *m); - - void handle_statfs(MStatfs *statfs); - bool handle_pg_stats(MPGStats *stats); - - public: - PGMonitor(Monitor *mn, Paxos *p) : PaxosService(mn, p) { } - - //void tick(); // check state, take actions - -}; - -#endif diff --git a/branches/sage/ebofs2/mon/Paxos.cc b/branches/sage/ebofs2/mon/Paxos.cc deleted file mode 100644 index c1f4472059ff5..0000000000000 --- a/branches/sage/ebofs2/mon/Paxos.cc +++ /dev/null @@ -1,784 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "Paxos.h" -#include "Monitor.h" -#include "MonitorStore.h" - -#include "messages/MMonPaxos.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_paxos) *_dout << dbeginl << g_clock.now() << " mon" << whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? 
(const char*)"(peon)":(const char*)"(?\?)"))) << ".paxos(" << machine_name << " " << get_statename(state) << " lc " << last_committed << ") " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_paxos) *_derr << dbeginl << g_clock.now() << " mon" << whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".paxos(" << machine_name << " " << get_statename(state) << " lc " << last_committed << ") " - - -void Paxos::init() -{ - // load paxos variables from stable storage - last_pn = mon->store->get_int(machine_name, "last_pn"); - accepted_pn = mon->store->get_int(machine_name, "accepted_pn"); - last_committed = mon->store->get_int(machine_name, "last_committed"); - - dout(10) << "init" << dendl; -} - -// --------------------------------- - -// PHASE 1 - -// leader -void Paxos::collect(version_t oldpn) -{ - // we're recoverying, it seems! - state = STATE_RECOVERING; - assert(mon->is_leader()); - - // reset the number of lasts received - uncommitted_v = 0; - uncommitted_pn = 0; - uncommitted_value.clear(); - - // look for uncommitted value - if (mon->store->exists_bl_sn(machine_name, last_committed+1)) { - uncommitted_v = last_committed+1; - uncommitted_pn = accepted_pn; - mon->store->get_bl_sn(uncommitted_value, machine_name, last_committed+1); - dout(10) << "learned uncommitted " << (last_committed+1) - << " (" << uncommitted_value.length() << " bytes) from myself" - << dendl; - } - - // pick new pn - accepted_pn = get_new_proposal_number(MAX(accepted_pn, oldpn)); - accepted_pn_from = last_committed; - num_last = 1; - dout(10) << "collect with pn " << accepted_pn << dendl; - - // send collect - for (set::const_iterator p = mon->get_quorum().begin(); - p != mon->get_quorum().end(); - ++p) { - if (*p == whoami) continue; - - MMonPaxos *collect = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COLLECT, machine_id); - collect->last_committed = last_committed; - 
collect->pn = accepted_pn; - mon->messenger->send_message(collect, mon->monmap->get_inst(*p)); - } - -} - - -// peon -void Paxos::handle_collect(MMonPaxos *collect) -{ - dout(10) << "handle_collect " << *collect << dendl; - - assert(mon->is_peon()); // mon epoch filter should catch strays - - // we're recoverying, it seems! - state = STATE_RECOVERING; - - // reply - MMonPaxos *last = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_LAST, machine_id); - last->last_committed = last_committed; - - // do we have an accepted but uncommitted value? - // (it'll be at last_committed+1) - bufferlist bl; - if (mon->store->exists_bl_sn(machine_name, last_committed+1)) { - mon->store->get_bl_sn(bl, machine_name, last_committed+1); - assert(bl.length() > 0); - dout(10) << " sharing our accepted but uncommitted value for " << last_committed+1 - << " (" << bl.length() << " bytes)" << dendl; - last->values[last_committed+1] = bl; - last->uncommitted_pn = accepted_pn; - } - - // can we accept this pn? - if (collect->pn > accepted_pn) { - // ok, accept it - accepted_pn = collect->pn; - accepted_pn_from = collect->pn_from; - dout(10) << "accepting pn " << accepted_pn << " from " << accepted_pn_from << dendl; - mon->store->put_int(accepted_pn, machine_name, "accepted_pn"); - } else { - // don't accept! 
- dout(10) << "NOT accepting pn " << collect->pn << " from " << collect->pn_from - << ", we already accepted " << accepted_pn << " from " << accepted_pn_from - << dendl; - } - last->pn = accepted_pn; - last->pn_from = accepted_pn_from; - - // and share whatever data we have - for (version_t v = collect->last_committed+1; - v <= last_committed; - v++) { - if (mon->store->exists_bl_sn(machine_name, v)) { - mon->store->get_bl_sn(last->values[v], machine_name, v); - dout(10) << " sharing " << v << " (" - << last->values[v].length() << " bytes)" << dendl; - } - } - - // send reply - mon->messenger->send_message(last, collect->get_source_inst()); - delete collect; -} - - -// leader -void Paxos::handle_last(MMonPaxos *last) -{ - dout(10) << "handle_last " << *last << dendl; - - if (!mon->is_leader()) { - dout(10) << "not leader, dropping" << dendl; - delete last; - return; - } - - // share committed values? - if (last->last_committed < last_committed) { - // share committed values - dout(10) << "sending commit to " << last->get_source() << dendl; - MMonPaxos *commit = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COMMIT, machine_id); - for (version_t v = last->last_committed+1; - v <= last_committed; - v++) { - mon->store->get_bl_sn(commit->values[v], machine_name, v); - dout(10) << " sharing " << v << " (" - << commit->values[v].length() << " bytes)" << dendl; - } - commit->last_committed = last_committed; - mon->messenger->send_message(commit, last->get_source_inst()); - } - - // did we receive a committed value? 
- if (last->last_committed > last_committed) { - for (version_t v = last_committed+1; - v <= last->last_committed; - v++) { - mon->store->put_bl_sn(last->values[v], machine_name, v); - dout(10) << "committing " << v << " " - << last->values[v].length() << " bytes" << dendl; - } - last_committed = last->last_committed; - mon->store->put_int(last_committed, machine_name, "last_committed"); - dout(10) << "last_committed now " << last_committed << dendl; - } - - // do they accept your pn? - if (last->pn > accepted_pn) { - // no, try again. - dout(10) << " they had a higher pn than us, picking a new one." << dendl; - collect(last->pn); - } else { - // yes, they accepted our pn. great. - num_last++; - dout(10) << " they accepted our pn, we now have " - << num_last << " peons" << dendl; - - // did this person send back an accepted but uncommitted value? - if (last->uncommitted_pn && - last->uncommitted_pn > uncommitted_pn) { - uncommitted_v = last->last_committed+1; - uncommitted_pn = last->uncommitted_pn; - uncommitted_value = last->values[uncommitted_v]; - dout(10) << "we learned an uncommitted value for " << uncommitted_v - << " pn " << uncommitted_pn - << " " << uncommitted_value.length() << " bytes" - << dendl; - } - - // is that everyone? - if (num_last == mon->get_quorum().size()) { - // almost... - state = STATE_ACTIVE; - - // did we learn an old value? - if (uncommitted_v == last_committed+1 && - uncommitted_value.length()) { - dout(10) << "that's everyone. begin on old learned value" << dendl; - begin(uncommitted_value); - } else { - // active! - dout(10) << "that's everyone. active!" 
<< dendl; - extend_lease(); - - // wake people up - finish_contexts(waiting_for_active); - finish_contexts(waiting_for_readable); - finish_contexts(waiting_for_writeable); - } - } - } - - delete last; -} - - -// leader -void Paxos::begin(bufferlist& v) -{ - dout(10) << "begin for " << last_committed+1 << " " - << v.length() << " bytes" - << dendl; - - assert(mon->is_leader()); - assert(is_active()); - state = STATE_UPDATING; - - // we must already have a majority for this to work. - assert(mon->get_quorum().size() == 1 || - num_last > (unsigned)mon->monmap->num_mon/2); - - // and no value, yet. - assert(new_value.length() == 0); - - // accept it ourselves - accepted.clear(); - accepted.insert(whoami); - new_value = v; - mon->store->put_bl_sn(new_value, machine_name, last_committed+1); - - if (mon->get_quorum().size() == 1) { - // we're alone, take it easy - commit(); - state = STATE_ACTIVE; - finish_contexts(waiting_for_active); - finish_contexts(waiting_for_commit); - finish_contexts(waiting_for_readable); - finish_contexts(waiting_for_writeable); - return; - } - - // ask others to accept it to! - for (set::const_iterator p = mon->get_quorum().begin(); - p != mon->get_quorum().end(); - ++p) { - if (*p == whoami) continue; - - dout(10) << " sending begin to mon" << *p << dendl; - MMonPaxos *begin = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_BEGIN, machine_id); - begin->values[last_committed+1] = new_value; - begin->last_committed = last_committed; - begin->pn = accepted_pn; - - mon->messenger->send_message(begin, mon->monmap->get_inst(*p)); - } - - // set timeout event - accept_timeout_event = new C_AcceptTimeout(this); - mon->timer.add_event_after(g_conf.mon_accept_timeout, accept_timeout_event); -} - -// peon -void Paxos::handle_begin(MMonPaxos *begin) -{ - dout(10) << "handle_begin " << *begin << dendl; - - // can we accept this? 
- if (begin->pn < accepted_pn) { - dout(10) << " we accepted a higher pn " << accepted_pn << ", ignoring" << dendl; - delete begin; - return; - } - assert(begin->pn == accepted_pn); - assert(begin->last_committed == last_committed); - - // set state. - state = STATE_UPDATING; - lease_expire = utime_t(); // cancel lease - - // yes. - version_t v = last_committed+1; - dout(10) << "accepting value for " << v << " pn " << accepted_pn << dendl; - mon->store->put_bl_sn(begin->values[v], machine_name, v); - - // reply - MMonPaxos *accept = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_ACCEPT, machine_id); - accept->pn = accepted_pn; - accept->last_committed = last_committed; - mon->messenger->send_message(accept, begin->get_source_inst()); - - delete begin; -} - -// leader -void Paxos::handle_accept(MMonPaxos *accept) -{ - dout(10) << "handle_accept " << *accept << dendl; - int from = accept->get_source().num(); - - if (accept->pn != accepted_pn) { - // we accepted a higher pn, from some other leader - dout(10) << " we accepted a higher pn " << accepted_pn << ", ignoring" << dendl; - delete accept; - return; - } - if (last_committed > 0 && - accept->last_committed < last_committed-1) { - dout(10) << " this is from an old round, ignoring" << dendl; - delete accept; - return; - } - assert(accept->last_committed == last_committed || // not committed - accept->last_committed == last_committed-1); // committed - - assert(state == STATE_UPDATING); - assert(accepted.count(from) == 0); - accepted.insert(from); - dout(10) << " now " << accepted << " have accepted" << dendl; - - // new majority? - if (accepted.size() == (unsigned)mon->monmap->num_mon/2+1) { - // yay, commit! - // note: this may happen before the lease is reextended (below) - dout(10) << " got majority, committing" << dendl; - commit(); - } - - // done? 
- if (accepted == mon->get_quorum()) { - dout(10) << " got quorum, done with update" << dendl; - // cancel timeout event - mon->timer.cancel_event(accept_timeout_event); - accept_timeout_event = 0; - - // yay! - state = STATE_ACTIVE; - extend_lease(); - - // wake people up - finish_contexts(waiting_for_active); - finish_contexts(waiting_for_commit); - finish_contexts(waiting_for_readable); - finish_contexts(waiting_for_writeable); - } -} - -void Paxos::accept_timeout() -{ - dout(5) << "accept timeout, calling fresh election" << dendl; - accept_timeout_event = 0; - assert(mon->is_leader()); - assert(is_updating()); - cancel_events(); - mon->call_election(); -} - -void Paxos::commit() -{ - dout(10) << "commit " << last_committed+1 << dendl; - - // commit locally - last_committed++; - mon->store->put_int(last_committed, machine_name, "last_committed"); - - // tell everyone - for (set::const_iterator p = mon->get_quorum().begin(); - p != mon->get_quorum().end(); - ++p) { - if (*p == whoami) continue; - - dout(10) << " sending commit to mon" << *p << dendl; - MMonPaxos *commit = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COMMIT, machine_id); - commit->values[last_committed] = new_value; - commit->pn = accepted_pn; - - mon->messenger->send_message(commit, mon->monmap->get_inst(*p)); - } - - // get ready for a new round. - new_value.clear(); -} - - -void Paxos::handle_commit(MMonPaxos *commit) -{ - dout(10) << "handle_commit on " << commit->last_committed << dendl; - - if (!mon->is_peon()) { - dout(10) << "not a peon, dropping" << dendl; - assert(0); - delete commit; - return; - } - - // commit locally. 
- for (map::iterator p = commit->values.begin(); - p != commit->values.end(); - ++p) { - assert(p->first == last_committed+1); - last_committed = p->first; - dout(10) << " storing " << last_committed << " (" << p->second.length() << " bytes)" << dendl; - mon->store->put_bl_sn(p->second, machine_name, last_committed); - } - mon->store->put_int(last_committed, machine_name, "last_committed"); - - delete commit; -} - -void Paxos::extend_lease() -{ - assert(mon->is_leader()); - assert(is_active()); - - lease_expire = g_clock.now(); - lease_expire += g_conf.mon_lease; - acked_lease.clear(); - acked_lease.insert(whoami); - - dout(7) << "extend_lease now+" << g_conf.mon_lease << " (" << lease_expire << ")" << dendl; - - // bcast - for (set::const_iterator p = mon->get_quorum().begin(); - p != mon->get_quorum().end(); - ++p) { - if (*p == whoami) continue; - MMonPaxos *lease = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_LEASE, machine_id); - lease->last_committed = last_committed; - lease->lease_expire = lease_expire; - mon->messenger->send_message(lease, mon->monmap->get_inst(*p)); - } - - // set timeout event. - // if old timeout is still in place, leave it. 
- if (!lease_ack_timeout_event) { - lease_ack_timeout_event = new C_LeaseAckTimeout(this); - mon->timer.add_event_after(g_conf.mon_lease_ack_timeout, lease_ack_timeout_event); - } - - // set renew event - lease_renew_event = new C_LeaseRenew(this); - utime_t at = lease_expire; - at -= g_conf.mon_lease; - at += g_conf.mon_lease_renew_interval; - mon->timer.add_event_at(at, lease_renew_event); -} - - -// peon -void Paxos::handle_lease(MMonPaxos *lease) -{ - // sanity - if (!mon->is_peon() || - last_committed != lease->last_committed) { - dout(10) << "handle_lease i'm not a peon, or they're not the leader, or the last_committed doesn't match, dropping" << dendl; - delete lease; - return; - } - - // extend lease - if (lease_expire < lease->lease_expire) - lease_expire = lease->lease_expire; - - state = STATE_ACTIVE; - - dout(10) << "handle_lease on " << lease->last_committed - << " now " << lease_expire << dendl; - - // ack - MMonPaxos *ack = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_LEASE_ACK, machine_id); - ack->last_committed = last_committed; - ack->lease_expire = lease_expire; - mon->messenger->send_message(ack, lease->get_source_inst()); - - // (re)set timeout event. - if (lease_timeout_event) - mon->timer.cancel_event(lease_timeout_event); - lease_timeout_event = new C_LeaseTimeout(this); - mon->timer.add_event_after(g_conf.mon_lease_ack_timeout, lease_timeout_event); - - // kick waiters - finish_contexts(waiting_for_active); - if (is_readable()) - finish_contexts(waiting_for_readable); - - delete lease; -} - -void Paxos::handle_lease_ack(MMonPaxos *ack) -{ - int from = ack->get_source().num(); - - if (!lease_ack_timeout_event) { - dout(10) << "handle_lease_ack from " << ack->get_source() << " -- stray (probably since revoked)" << dendl; - } - else if (acked_lease.count(from) == 0) { - acked_lease.insert(from); - - if (acked_lease == mon->get_quorum()) { - // yay! 
- dout(10) << "handle_lease_ack from " << ack->get_source() - << " -- got everyone" << dendl; - mon->timer.cancel_event(lease_ack_timeout_event); - lease_ack_timeout_event = 0; - } else { - dout(10) << "handle_lease_ack from " << ack->get_source() - << " -- still need " - << mon->get_quorum().size() - acked_lease.size() - << " more" << dendl; - } - } else { - dout(10) << "handle_lease_ack from " << ack->get_source() - << " dup (lagging!), ignoring" << dendl; - } - - delete ack; -} - -void Paxos::lease_ack_timeout() -{ - dout(5) << "lease_ack_timeout -- calling new election" << dendl; - assert(mon->is_leader()); - assert(is_active()); - - lease_ack_timeout_event = 0; - cancel_events(); - mon->call_election(); -} - -void Paxos::lease_timeout() -{ - dout(5) << "lease_timeout -- calling new election" << dendl; - assert(mon->is_peon()); - - lease_timeout_event = 0; - cancel_events(); - mon->call_election(); -} - -void Paxos::lease_renew_timeout() -{ - lease_renew_event = 0; - extend_lease(); -} - - -/* - * return a globally unique, monotonically increasing proposal number - */ -version_t Paxos::get_new_proposal_number(version_t gt) -{ - if (last_pn < gt) - last_pn = gt; - - // update. make it unique among all monitors. 
- last_pn /= 100; - last_pn++; - last_pn *= 100; - last_pn += (version_t)whoami; - - // write - mon->store->put_int(last_pn, machine_name, "last_pn"); - - dout(10) << "get_new_proposal_number = " << last_pn << dendl; - return last_pn; -} - - -void Paxos::cancel_events() -{ - if (accept_timeout_event) { - mon->timer.cancel_event(accept_timeout_event); - accept_timeout_event = 0; - } - if (lease_renew_event) { - mon->timer.cancel_event(lease_renew_event); - lease_renew_event = 0; - } - if (lease_ack_timeout_event) { - mon->timer.cancel_event(lease_ack_timeout_event); - lease_ack_timeout_event = 0; - } - if (lease_timeout_event) { - mon->timer.cancel_event(lease_timeout_event); - lease_timeout_event = 0; - } -} - -void Paxos::leader_init() -{ - if (mon->get_quorum().size() == 1) { - state = STATE_ACTIVE; - return; - } - cancel_events(); - state = STATE_RECOVERING; - lease_expire = utime_t(); - dout(10) << "leader_init -- starting paxos recovery" << dendl; - collect(0); -} - -void Paxos::peon_init() -{ - cancel_events(); - state = STATE_RECOVERING; - lease_expire = utime_t(); - dout(10) << "peon_init -- i am a peon" << dendl; - - // no chance to write now! - finish_contexts(waiting_for_writeable, -1); - finish_contexts(waiting_for_commit, -1); -} - -void Paxos::election_starting() -{ - dout(10) << "election_starting -- canceling timeouts" << dendl; - cancel_events(); - new_value.clear(); - - finish_contexts(waiting_for_commit, -1); -} - - -void Paxos::dispatch(Message *m) -{ - // election in progress? 
- if (mon->is_starting()) { - dout(5) << "election in progress, dropping " << *m << dendl; - delete m; - return; - } - - // check sanity - assert(mon->is_leader() || - (mon->is_peon() && m->get_source().num() == mon->get_leader())); - - switch (m->get_type()) { - - case MSG_MON_PAXOS: - { - MMonPaxos *pm = (MMonPaxos*)m; - - // NOTE: these ops are defined in messages/MMonPaxos.h - switch (pm->op) { - // learner - case MMonPaxos::OP_COLLECT: - handle_collect(pm); - break; - case MMonPaxos::OP_LAST: - handle_last(pm); - break; - case MMonPaxos::OP_BEGIN: - handle_begin(pm); - break; - case MMonPaxos::OP_ACCEPT: - handle_accept(pm); - break; - case MMonPaxos::OP_COMMIT: - handle_commit(pm); - break; - case MMonPaxos::OP_LEASE: - handle_lease(pm); - break; - case MMonPaxos::OP_LEASE_ACK: - handle_lease_ack(pm); - break; - default: - assert(0); - } - } - break; - - default: - assert(0); - } -} - - - - -// ----------------- -// service interface - -// -- READ -- - -bool Paxos::is_readable() -{ - //dout(15) << "is_readable now=" << g_clock.now() << " lease_expire=" << lease_expire << dendl; - return - (mon->is_peon() || mon->is_leader()) && - is_active() && - last_committed > 0 && // must have a value - (mon->get_quorum().size() == 1 || // alone, or - g_clock.now() < lease_expire); // have lease -} - -bool Paxos::read(version_t v, bufferlist &bl) -{ - if (!is_readable()) - return false; - - if (!mon->store->get_bl_sn(bl, machine_name, v)) - return false; - return true; -} - -version_t Paxos::read_current(bufferlist &bl) -{ - if (!is_readable()) - return 0; - if (read(last_committed, bl)) - return last_committed; - return 0; -} - - - - -// -- WRITE -- - -bool Paxos::is_writeable() -{ - if (mon->get_quorum().size() == 1) return true; - return - mon->is_leader() && - is_active() && - g_clock.now() < lease_expire; -} - -bool Paxos::propose_new_value(bufferlist& bl, Context *oncommit) -{ - /* - // writeable? 
- if (!is_writeable()) { - dout(5) << "propose_new_value " << last_committed+1 << " " << bl.length() << " bytes" - << " -- not writeable" << dendl; - if (oncommit) { - oncommit->finish(-1); - delete oncommit; - } - return false; - } - */ - - assert(mon->is_leader() && is_active()); - - // cancel lease renewal and timeout events. - cancel_events(); - - // ok! - dout(5) << "propose_new_value " << last_committed+1 << " " << bl.length() << " bytes" << dendl; - if (oncommit) - waiting_for_commit.push_back(oncommit); - begin(bl); - - return true; -} - diff --git a/branches/sage/ebofs2/mon/Paxos.h b/branches/sage/ebofs2/mon/Paxos.h deleted file mode 100644 index a6d28dd1cea9a..0000000000000 --- a/branches/sage/ebofs2/mon/Paxos.h +++ /dev/null @@ -1,251 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -/* -time----> - -cccccccccccccccccca???????????????????????????????????????? -cccccccccccccccccca???????????????????????????????????????? -cccccccccccccccccca???????????????????????????????????????? leader -cccccccccccccccccc????????????????????????????????????????? -ccccc?????????????????????????????????????????????????????? - -last_committed - -pn_from -pn - -a 12v -b 12v -c 14v -d -e 12v - - -*/ - - -/* - * NOTE: This libary is based on the Paxos algorithm, but varies in a few key ways: - * 1- Only a single new value is generated at a time, simplifying the recovery logic. - * 2- Nodes track "committed" values, and share them generously (and trustingly) - * 3- A 'leasing' mechism is built-in, allowing nodes to determine when it is safe to - * "read" their copy of the last committed value. 
- * - * This provides a simple replication substrate that services can be built on top of. - */ - -#ifndef __MON_PAXOS_H -#define __MON_PAXOS_H - -#include "include/types.h" -#include "mon_types.h" -#include "include/buffer.h" -#include "msg/Message.h" - -#include "include/Context.h" - -#include "common/Timer.h" - -class Monitor; -class MMonPaxos; - - -// i am one state machine. -class Paxos { - Monitor *mon; - int whoami; - - // my state machine info - int machine_id; - const char *machine_name; - - friend class PaxosService; - - // LEADER+PEON - - // -- generic state -- -public: - const static int STATE_RECOVERING = 1; // leader|peon: recovering paxos state - const static int STATE_ACTIVE = 2; // leader|peon: idle. peon may or may not have valid lease - const static int STATE_UPDATING = 3; // leader|peon: updating to new value - const char *get_statename(int s) { - switch (s) { - case STATE_RECOVERING: return "recovering"; - case STATE_ACTIVE: return "active"; - case STATE_UPDATING: return "updating"; - default: assert(0); return 0; - } - } - -private: - int state; - -public: - bool is_recovering() { return state == STATE_RECOVERING; } - bool is_active() { return state == STATE_ACTIVE; } - bool is_updating() { return state == STATE_UPDATING; } - -private: - // recovery (phase 1) - version_t last_pn; - version_t last_committed; - version_t accepted_pn; - version_t accepted_pn_from; - - // active (phase 2) - utime_t lease_expire; - list waiting_for_active; - list waiting_for_readable; - - - // -- leader -- - // recovery (paxos phase 1) - unsigned num_last; - version_t uncommitted_v; - version_t uncommitted_pn; - bufferlist uncommitted_value; - - // active - set acked_lease; - Context *lease_renew_event; - Context *lease_ack_timeout_event; - Context *lease_timeout_event; - - // updating (paxos phase 2) - bufferlist new_value; - set accepted; - - Context *accept_timeout_event; - - list waiting_for_writeable; - list waiting_for_commit; - - class C_AcceptTimeout : 
public Context { - Paxos *paxos; - public: - C_AcceptTimeout(Paxos *p) : paxos(p) {} - void finish(int r) { - paxos->accept_timeout(); - } - }; - - class C_LeaseAckTimeout : public Context { - Paxos *paxos; - public: - C_LeaseAckTimeout(Paxos *p) : paxos(p) {} - void finish(int r) { - paxos->lease_ack_timeout(); - } - }; - - class C_LeaseTimeout : public Context { - Paxos *paxos; - public: - C_LeaseTimeout(Paxos *p) : paxos(p) {} - void finish(int r) { - paxos->lease_timeout(); - } - }; - - class C_LeaseRenew : public Context { - Paxos *paxos; - public: - C_LeaseRenew(Paxos *p) : paxos(p) {} - void finish(int r) { - std::cout << "HI MOM" << std::endl; - paxos->lease_renew_timeout(); - } - }; - - - void collect(version_t oldpn); - void handle_collect(MMonPaxos*); - void handle_last(MMonPaxos*); - void begin(bufferlist& value); - void handle_begin(MMonPaxos*); - void handle_accept(MMonPaxos*); - void accept_timeout(); - void commit(); - void handle_commit(MMonPaxos*); - void extend_lease(); - void handle_lease(MMonPaxos*); - void handle_lease_ack(MMonPaxos*); - - void lease_ack_timeout(); // on leader, if lease isn't acked by all peons - void lease_renew_timeout(); // on leader, to renew the lease - void lease_timeout(); // on peon, if lease isn't extended - - void cancel_events(); - - version_t get_new_proposal_number(version_t gt=0); - -public: - Paxos(Monitor *m, int w, - int mid) : mon(m), whoami(w), - machine_id(mid), - machine_name(get_paxos_name(mid)), - state(STATE_RECOVERING), - lease_renew_event(0), - lease_ack_timeout_event(0), - lease_timeout_event(0), - accept_timeout_event(0) { } - - void dispatch(Message *m); - - void init(); - - void election_starting(); - void leader_init(); - void peon_init(); - - - // -- service interface -- - void wait_for_active(Context *c) { - assert(!is_active()); - waiting_for_active.push_back(c); - } - - // read - version_t get_version() { return last_committed; } - bool is_readable(); - bool read(version_t v, bufferlist 
&bl); - version_t read_current(bufferlist &bl); - void wait_for_readable(Context *onreadable) { - assert(!is_readable()); - waiting_for_readable.push_back(onreadable); - } - - // write - bool is_leader(); - bool is_writeable(); - void wait_for_writeable(Context *c) { - assert(!is_writeable()); - waiting_for_writeable.push_back(c); - } - - bool propose_new_value(bufferlist& bl, Context *oncommit=0); - void wait_for_commit(Context *oncommit) { - waiting_for_commit.push_back(oncommit); - } - void wait_for_commit_front(Context *oncommit) { - waiting_for_commit.push_front(oncommit); - } - -}; - - - -#endif - diff --git a/branches/sage/ebofs2/mon/PaxosService.cc b/branches/sage/ebofs2/mon/PaxosService.cc deleted file mode 100644 index 7b0eed20972a0..0000000000000 --- a/branches/sage/ebofs2/mon/PaxosService.cc +++ /dev/null @@ -1,172 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "PaxosService.h" -#include "common/Clock.h" -#include "Monitor.h" - - - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_paxos) *_dout << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? 
(const char*)"(peon)":(const char*)"(?\?)"))) << ".paxosservice(" << get_paxos_name(paxos->machine_id) << ") " - - - - -void PaxosService::dispatch(Message *m) -{ - dout(10) << "dispatch " << *m << " from " << m->get_source_inst() << dendl; - - // make sure our map is readable and up to date - if (!paxos->is_readable()) { - dout(10) << " waiting for paxos -> readable" << dendl; - paxos->wait_for_readable(new C_RetryMessage(this, m)); - return; - } - - // make sure service has latest from paxos. - update_from_paxos(); - - // preprocess - if (preprocess_query(m)) - return; // easy! - - // leader? - if (!mon->is_leader()) { - // fw to leader - dout(10) << " fw to leader mon" << mon->get_leader() << dendl; - mon->messenger->send_message(m, mon->monmap->get_inst(mon->get_leader())); - return; - } - - // writeable? - if (!paxos->is_writeable()) { - dout(10) << " waiting for paxos -> writeable" << dendl; - paxos->wait_for_writeable(new C_RetryMessage(this, m)); - return; - } - - // update - if (prepare_update(m)) { - double delay; - if (should_propose(delay)) { - if (delay == 0.0) { - propose_pending(); - } else { - // delay a bit - if (!proposal_timer) { - dout(10) << " setting propose timer with dealy of " << delay << dendl; - proposal_timer = new C_Propose(this); - mon->timer.add_event_after(delay, proposal_timer); - } else { - dout(10) << " propose timer already set" << dendl; - } - } - } else { - dout(10) << " not proposing" << dendl; - } - } -} - -bool PaxosService::should_propose(double& delay) -{ - // simple default policy: quick startup, then some damping. 
- if (paxos->last_committed <= 1) - delay = 0.0; - else - delay = g_conf.paxos_propose_interval; - return true; -} - -void PaxosService::_commit() -{ - dout(7) << "_commit" << dendl; - update_from_paxos(); // notify service of new paxos state - - if (mon->is_leader()) { - dout(7) << "_commit creating new pending" << dendl; - assert(have_pending == false); - create_pending(); - have_pending = true; - - committed(); - } -} - - -void PaxosService::propose_pending() -{ - dout(10) << "propose_pending" << dendl; - assert(have_pending); - - if (proposal_timer) { - mon->timer.cancel_event(proposal_timer); - proposal_timer = 0; - } - - // finish and encode - bufferlist bl; - encode_pending(bl); - have_pending = false; - - // apply to paxos - paxos->wait_for_commit_front(new C_Commit(this)); - paxos->propose_new_value(bl); -} - - - - -void PaxosService::election_finished() -{ - dout(10) << "election_finished" << dendl; - - if (have_pending && - !mon->is_leader()) { - discard_pending(); - have_pending = false; - } - - // make sure we update our state - if (paxos->is_active()) - _active(); - else - paxos->wait_for_active(new C_Active(this)); -} - -void PaxosService::_active() -{ - dout(10) << "_active" << dendl; - assert(paxos->is_active()); - - // pull latest from paxos - update_from_paxos(); - - // create pending state? 
- if (mon->is_leader()) { - if (!have_pending) { - create_pending(); - have_pending = true; - } - - if (g_conf.mkfs && - paxos->get_version() == 0) { - create_initial(); - propose_pending(); - } - } -} - - diff --git a/branches/sage/ebofs2/mon/PaxosService.h b/branches/sage/ebofs2/mon/PaxosService.h deleted file mode 100644 index a0f39c7862273..0000000000000 --- a/branches/sage/ebofs2/mon/PaxosService.h +++ /dev/null @@ -1,107 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __PAXOSSERVICE_H -#define __PAXOSSERVICE_H - -#include "msg/Dispatcher.h" -#include "include/Context.h" - -class Monitor; -class Paxos; - -class PaxosService : public Dispatcher { -protected: - Monitor *mon; - Paxos *paxos; - - class C_RetryMessage : public Context { - PaxosService *svc; - Message *m; - public: - C_RetryMessage(PaxosService *s, Message *m_) : svc(s), m(m_) {} - void finish(int r) { - svc->dispatch(m); - } - }; - class C_Active : public Context { - PaxosService *svc; - public: - C_Active(PaxosService *s) : svc(s) {} - void finish(int r) { - if (r >= 0) - svc->_active(); - } - }; - class C_Commit : public Context { - PaxosService *svc; - public: - C_Commit(PaxosService *s) : svc(s) {} - void finish(int r) { - if (r >= 0) - svc->_commit(); - } - }; - friend class C_Update; - - class C_Propose : public Context { - PaxosService *ps; - public: - C_Propose(PaxosService *p) : ps(p) { } - void finish(int r) { - ps->proposal_timer = 0; - ps->propose_pending(); - } - }; - friend class C_Propose; - - -private: - Context *proposal_timer; - bool have_pending; - -public: - PaxosService(Monitor *mn, Paxos *p) : 
mon(mn), paxos(p), - proposal_timer(0), - have_pending(false) { } - - // i implement and you ignore - void dispatch(Message *m); - void election_finished(); - -private: - void _active(); - void _commit(); - -public: - // i implement and you use - void propose_pending(); // propose current pending as new paxos state - - // you implement - virtual bool update_from_paxos() = 0; // assimilate latest paxos state - virtual void create_pending() = 0; // [leader] create new pending structures - virtual void create_initial() = 0; // [leader] populate pending with initial state (1) - virtual void encode_pending(bufferlist& bl) = 0; // [leader] finish and encode pending for next paxos state - virtual void discard_pending() { } // [leader] discard pending - - virtual bool preprocess_query(Message *m) = 0; // true if processed (e.g., read-only) - virtual bool prepare_update(Message *m) = 0; - virtual bool should_propose(double &delay); - - virtual void committed() = 0; - -}; - -#endif - diff --git a/branches/sage/ebofs2/mon/mon_types.h b/branches/sage/ebofs2/mon/mon_types.h deleted file mode 100644 index 8d1ac92822356..0000000000000 --- a/branches/sage/ebofs2/mon/mon_types.h +++ /dev/null @@ -1,35 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MON_TYPES_H -#define __MON_TYPES_H - -#define PAXOS_TEST 0 -#define PAXOS_MDSMAP 1 -#define PAXOS_OSDMAP 2 -#define PAXOS_CLIENTMAP 3 -#define PAXOS_PGMAP 4 - -inline const char *get_paxos_name(int p) { - switch (p) { - case PAXOS_TEST: return "test"; - case PAXOS_MDSMAP: return "mdsmap"; - case PAXOS_OSDMAP: return "osdmap"; - case PAXOS_CLIENTMAP: return "clientmap"; - case PAXOS_PGMAP: return "pgmap"; - default: assert(0); return 0; - } -} - -#endif diff --git a/branches/sage/ebofs2/msg/Dispatcher.cc b/branches/sage/ebofs2/msg/Dispatcher.cc deleted file mode 100644 index 4fa04d7d4c92a..0000000000000 --- a/branches/sage/ebofs2/msg/Dispatcher.cc +++ /dev/null @@ -1,28 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "Dispatcher.h" -#include "Messenger.h" - -#include "mds/MDS.h" - -/* -int Dispatcher::send_message(Message *m, msg_addr_t dest, int dest_port) -{ - assert(0); - //return dis_messenger->send_message(m, dest, dest_port, MDS_PORT_SERVER); // on my port! 
-} -*/ diff --git a/branches/sage/ebofs2/msg/Dispatcher.h b/branches/sage/ebofs2/msg/Dispatcher.h deleted file mode 100644 index 0a77de3d20369..0000000000000 --- a/branches/sage/ebofs2/msg/Dispatcher.h +++ /dev/null @@ -1,34 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __DISPATCHER_H -#define __DISPATCHER_H - -#include "Message.h" - -class Messenger; - -class Dispatcher { - public: - virtual ~Dispatcher() { } - - // how i receive messages - virtual void dispatch(Message *m) = 0; - - // how i deal with transmission failures. - virtual void ms_handle_failure(Message *m, const entity_inst_t& inst) { delete m; } -}; - -#endif diff --git a/branches/sage/ebofs2/msg/FakeMessenger.cc b/branches/sage/ebofs2/msg/FakeMessenger.cc deleted file mode 100644 index 590b3214eb351..0000000000000 --- a/branches/sage/ebofs2/msg/FakeMessenger.cc +++ /dev/null @@ -1,414 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "Message.h" -#include "FakeMessenger.h" -#include "mds/MDS.h" - -#include "common/Timer.h" - -#include "common/LogType.h" -#include "common/Logger.h" - -#include "config.h" - -#define dout(x) if ((x) <= g_conf.debug_ms) *_dout << dbeginl << g_clock.now() << " " - - - -#include -#include -#include -#include -#include - -using namespace std; - -#include -using namespace __gnu_cxx; - - -#include "common/Cond.h" -#include "common/Mutex.h" -#include - - -// global queue. - -int num_entity; -vector directory; - -hash_map loggers; -LogType fakemsg_logtype; - -set shutdown_set; - -Mutex lock; -Cond cond; - -bool awake = false; -bool fm_shutdown = false; -pthread_t thread_id; - -extern std::map g_fake_kill_after; // in config.cc -utime_t start_time; -map fail_queue; -list sent_to_failed_queue; - -void *fakemessenger_thread(void *ptr) -{ - start_time = g_clock.now(); - - lock.Lock(); - while (1) { - if (fm_shutdown) break; - fakemessenger_do_loop_2(); - - if (num_entity == 0 && directory.size() > 0) break; - - dout(20) << "thread waiting" << dendl; - if (fm_shutdown) break; - awake = false; - cond.Wait(lock); - awake = true; - dout(20) << "thread woke up" << dendl; - } - lock.Unlock(); - - dout(1) << "thread finish (i woke up but no messages, bye)" << dendl; - return 0; -} - - -void fakemessenger_startthread() { - pthread_create(&thread_id, NULL, fakemessenger_thread, 0); -} - -void fakemessenger_stopthread() { - dout(0) << "fakemessenger_stopthread setting stop flag" << dendl; - lock.Lock(); - fm_shutdown = true; - lock.Unlock(); - cond.Signal(); - - fakemessenger_wait(); -} - -void fakemessenger_wait() -{ - dout(0) << "fakemessenger_wait waiting" << dendl; - void *ptr; - pthread_join(thread_id, &ptr); -} - - -// fake failure - - - -// lame main looper - -int fakemessenger_do_loop() -{ - lock.Lock(); - fakemessenger_do_loop_2(); - lock.Unlock(); - - g_timer.shutdown(); - return 0; -} - - -int fakemessenger_do_loop_2() -{ - //lock.Lock(); - dout(18) 
<< "do_loop begin." << dendl; - - while (1) { - bool didone = false; - - dout(18) << "do_loop top" << dendl; - - // fail_queue - while (!fail_queue.empty() && - fail_queue.begin()->first < g_clock.now()) { - entity_name_t nm = fail_queue.begin()->second; - fail_queue.erase(fail_queue.begin()); - - dout(0) << "MUST FAKE KILL " << nm << dendl; - - for (unsigned i=0; iget_myname() == nm) { - dout(0) << "FAKING FAILURE of " << nm << " at " << directory[i]->get_myaddr() << dendl; - directory[i]->failed = true; - directory[i] = 0; - num_entity--; - break; - } - } - } - - list ls; - ls.swap(sent_to_failed_queue); - for (list::iterator p = ls.begin(); - p != ls.end(); - ++p) { - Message *m = *p; - FakeMessenger *mgr = 0; - Dispatcher *dis = 0; - - unsigned drank = m->get_source_addr().v.erank; - if (drank < directory.size() && directory[drank]) { - mgr = directory[drank]; - if (mgr) - dis = mgr->get_dispatcher(); - } - if (dis) { - dout(1) << "fail on " << *m - << " to " << m->get_dest() << " from " << m->get_source() - << ", passing back to sender." << dendl; - dis->ms_handle_failure(m, m->get_dest_inst()); - } else { - dout(1) << "fail on " << *m - << " to " << m->get_dest() << " from " << m->get_source() - << ", sender gone, dropping." 
<< dendl; - delete m; - } - } - - // messages - for (unsigned i=0; iget_myname() << " has " << mgr->num_incoming() << " queued" << dendl; - - if (!mgr->is_ready()) { - dout(18) << "messenger " << mgr << " at " << mgr->get_myname() << " has no dispatcher, skipping" << dendl; - continue; - } - - Message *m = mgr->get_message(); - - if (m) { - m->set_recv_stamp(g_clock.now()); - - //dout(18) << "got " << m << dendl; - dout(1) << "==== " << m->get_dest() - << " <- " << m->get_source() - << " ==== " << *m - << " ---- " << m - << dendl; - - if (g_conf.fakemessenger_serialize) { - // encode - if (m->empty_payload()) - m->encode_payload(); - ceph_message_header env = m->get_envelope(); - bufferlist bl; - bl.claim( m->get_payload() ); - //bl.c_str(); // condense into 1 buffer - - delete m; - - // decode - m = decode_message(env, bl); - assert(m); - } - - didone = true; - - lock.Unlock(); - mgr->dispatch(m); - lock.Lock(); - } - } - - // deal with shutdowns.. delayed to avoid concurrent directory modification - if (!shutdown_set.empty()) { - for (set::iterator it = shutdown_set.begin(); - it != shutdown_set.end(); - it++) { - dout(7) << "fakemessenger: removing " << *it << " from directory" << dendl; - int r = it->v.erank; - assert(directory[r]); - directory[r] = 0; - num_entity--; - if (num_entity == 0) { - dout(1) << "fakemessenger: last shutdown" << dendl; - ::fm_shutdown = true; - } - } - shutdown_set.clear(); - } - - if (!didone) - break; - } - - - dout(18) << "do_loop end (no more messages)." << dendl; - //lock.Unlock(); - return 0; -} - - -FakeMessenger::FakeMessenger(entity_name_t me) : Messenger(me) -{ - failed = false; - - lock.Lock(); - { - // assign rank - unsigned r = directory.size(); - _myinst.name = me; - _myinst.addr.set_port(0); - _myinst.addr.v.erank = r; - _myinst.addr.v.nonce = getpid(); - - // add to directory - directory.push_back(this); - assert(directory.size() == r+1); - - num_entity++; - - // put myself in the fail queue? 
- if (g_fake_kill_after.count(me)) { - utime_t w = start_time; - w += g_fake_kill_after[me]; - dout(0) << "will fake failure of " << me << " at " << w << dendl; - fail_queue[w] = me; - } - } - lock.Unlock(); - - - dout(0) << "fakemessenger " << get_myname() << " messenger is " << this << " at " << _myinst << dendl; - - qlen = 0; - - /* - string name; - name = "m."; - name += MSG_ADDR_TYPE(myaddr); - int w = MSG_ADDR_NUM(myaddr); - if (w >= 1000) name += ('0' + ((w/1000)%10)); - if (w >= 100) name += ('0' + ((w/100)%10)); - if (w >= 10) name += ('0' + ((w/10)%10)); - name += ('0' + ((w/1)%10)); - - loggers[ myaddr ] = new Logger(name, (LogType*)&fakemsg_logtype); - */ -} - -FakeMessenger::~FakeMessenger() -{ - // hose any undelivered messages - for (list::iterator p = incoming.begin(); - p != incoming.end(); - ++p) - delete *p; -} - - -int FakeMessenger::shutdown() -{ - dout(2) << "shutdown on messenger " << this << " has " << num_incoming() << " queued" << dendl; - lock.Lock(); - assert(directory[_myinst.addr.v.erank] == this); - shutdown_set.insert(_myinst.addr); - - /* - if (loggers[myaddr]) { - delete loggers[myaddr]; - loggers.erase(myaddr); - } - */ - - lock.Unlock(); - return 0; -} - - -void FakeMessenger::reset_myname(entity_name_t m) -{ - dout(1) << "reset_myname from " << get_myname() << " to " << m << dendl; - _set_myname(m); - - _myinst.name = m; - - // put myself in the fail queue? 
- if (g_fake_kill_after.count(m)) { - utime_t w = start_time; - w += g_fake_kill_after[m]; - dout(0) << "will fake failure of " << m << " at " << w << dendl; - fail_queue[w] = m; - } - -} - - -int FakeMessenger::send_message(Message *m, entity_inst_t inst) -{ - entity_name_t dest = inst.name; - - m->set_source(get_myname()); - m->set_source_addr(get_myaddr()); - - m->set_dest_inst(inst); - - lock.Lock(); - -#ifdef LOG_MESSAGES - // stats - loggers[get_myaddr()]->inc("+send",1); - loggers[dest]->inc("-recv",1); - - char s[20]; - sprintf(s,"+%s", m->get_type_name()); - loggers[get_myaddr()]->inc(s); - sprintf(s,"-%s", m->get_type_name()); - loggers[dest]->inc(s); -#endif - - // queue - unsigned drank = inst.addr.v.erank; - if (drank < directory.size() && directory[drank] && - shutdown_set.count(inst.addr) == 0) { - dout(1) << "--> " << get_myname() << " -> " << inst.name << " --- " << *m << " -- " << m - << dendl; - directory[drank]->queue_incoming(m); - } else { - dout(0) << "--> " << get_myname() << " -> " << inst.name << " " << *m << " -- " << m - << " *** destination " << inst.addr << " DNE ***" - << dendl; - - // do the failure callback - sent_to_failed_queue.push_back(m); - } - - // wake up loop? - if (!awake) { - dout(10) << "waking up fakemessenger thread" << dendl; - cond.Signal(); - lock.Unlock(); - } else - lock.Unlock(); - - return 0; -} - - diff --git a/branches/sage/ebofs2/msg/FakeMessenger.h b/branches/sage/ebofs2/msg/FakeMessenger.h deleted file mode 100644 index 0b08b8c9d4c55..0000000000000 --- a/branches/sage/ebofs2/msg/FakeMessenger.h +++ /dev/null @@ -1,95 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. 
See file COPYING. - * - */ - - - -#ifndef __FAKEMESSENGER_H -#define __FAKEMESSENGER_H - -#include "Messenger.h" -#include "Dispatcher.h" - -#include -#include - -class Timer; - -class FakeMessenger : public Messenger { - protected: - class Logger *logger; - - int qlen; - list incoming; // incoming queue - - entity_inst_t _myinst; - - public: - bool failed; - - FakeMessenger(entity_name_t me); - ~FakeMessenger(); - - virtual int shutdown(); - - const entity_inst_t& get_myinst() { - return _myinst; - }; - const entity_addr_t& get_myaddr() { - return _myinst.addr; - } - - void reset_myname(entity_name_t m); - - // msg interface - int send_message(Message *m, entity_inst_t dest); - - int get_dispatch_queue_len() { return qlen; } - - // -- incoming queue -- - // (that nothing uses) - Message *get_message() { - if (!incoming.empty()) { - Message *m = incoming.front(); - incoming.pop_front(); - qlen--; - return m; - } - return NULL; - } - bool queue_incoming(Message *m) { - incoming.push_back(m); - qlen++; - return true; - } - int num_incoming() { - //return incoming.size(); - return qlen; - } - - void suicide() { - if (!failed) { - failed = true; - } - shutdown(); - } - -}; - -int fakemessenger_do_loop(); -int fakemessenger_do_loop_2(); -void fakemessenger_startthread(); -void fakemessenger_stopthread(); -void fakemessenger_wait(); - -#endif diff --git a/branches/sage/ebofs2/msg/Message.cc b/branches/sage/ebofs2/msg/Message.cc deleted file mode 100644 index 9a8dbb26f2c18..0000000000000 --- a/branches/sage/ebofs2/msg/Message.cc +++ /dev/null @@ -1,369 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include -#include -using namespace std; - -#include "include/types.h" - -#include "Message.h" - -#include "messages/MGenericMessage.h" - -#include "messages/MPGStats.h" - -#include "messages/MStatfs.h" -#include "messages/MStatfsReply.h" - -#include "messages/MMonCommand.h" -#include "messages/MMonCommandAck.h" 
-#include "messages/MMonPaxos.h" - -#include "messages/MMonElection.h" - -#include "messages/MPing.h" -#include "messages/MPingAck.h" -//#include "messages/MFailure.h" -//#include "messages/MFailureAck.h" - -#include "messages/MOSDBoot.h" -#include "messages/MOSDIn.h" -#include "messages/MOSDOut.h" -#include "messages/MOSDFailure.h" -#include "messages/MOSDPing.h" -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" - -#include "messages/MOSDPGNotify.h" -#include "messages/MOSDPGQuery.h" -#include "messages/MOSDPGLog.h" -#include "messages/MOSDPGRemove.h" -#include "messages/MOSDPGActivateSet.h" - -#include "messages/MClientMount.h" -#include "messages/MClientUnmount.h" -#include "messages/MClientSession.h" -#include "messages/MClientReconnect.h" -#include "messages/MClientRequest.h" -#include "messages/MClientRequestForward.h" -#include "messages/MClientReply.h" -#include "messages/MClientFileCaps.h" - -#include "messages/MMDSSlaveRequest.h" - -#include "messages/MMDSGetMap.h" -#include "messages/MMDSMap.h" -#include "messages/MMDSBeacon.h" -#include "messages/MMDSResolve.h" -#include "messages/MMDSResolveAck.h" -#include "messages/MMDSCacheRejoin.h" -//#include "messages/MMDSCacheRejoinAck.h" - -#include "messages/MDirUpdate.h" -#include "messages/MDiscover.h" -#include "messages/MDiscoverReply.h" - -#include "messages/MMDSFragmentNotify.h" - -#include "messages/MExportDirDiscover.h" -#include "messages/MExportDirDiscoverAck.h" -#include "messages/MExportDirCancel.h" -#include "messages/MExportDirPrep.h" -#include "messages/MExportDirPrepAck.h" -#include "messages/MExportDirWarning.h" -#include "messages/MExportDirWarningAck.h" -#include "messages/MExportDir.h" -#include "messages/MExportDirAck.h" -#include "messages/MExportDirNotify.h" -#include "messages/MExportDirNotifyAck.h" -#include "messages/MExportDirFinish.h" - -#include "messages/MDentryUnlink.h" - -#include 
"messages/MHeartbeat.h" - -#include "messages/MAnchor.h" - -//#include "messages/MInodeUpdate.h" -#include "messages/MCacheExpire.h" -#include "messages/MInodeFileCaps.h" - -#include "messages/MLock.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug) *_dout << dbeginl << g_clock.now() << " MESSENGER: " -#define DEBUGLVL 10 // debug level of output - - - - - - - -Message * -decode_message(ceph_message_header& env, bufferlist& payload) -{ - // make message - Message *m = 0; - switch(env.type) { - - // -- with payload -- - - case MSG_PGSTATS: - m = new MPGStats; - break; - - case MSG_STATFS: - m = new MStatfs; - break; - case MSG_STATFS_REPLY: - m = new MStatfsReply; - break; - - case MSG_MON_COMMAND: - m = new MMonCommand; - break; - case MSG_MON_COMMAND_ACK: - m = new MMonCommandAck; - break; - case MSG_MON_PAXOS: - m = new MMonPaxos; - break; - - case MSG_MON_ELECTION: - m = new MMonElection; - break; - - case MSG_PING: - m = new MPing(); - break; - case MSG_PING_ACK: - m = new MPingAck(); - break; - /* - case MSG_FAILURE: - m = new MFailure(); - break; - case MSG_FAILURE_ACK: - m = new MFailureAck(); - break; - */ - - case MSG_OSD_BOOT: - m = new MOSDBoot(); - break; - case MSG_OSD_IN: - m = new MOSDIn(); - break; - case MSG_OSD_OUT: - m = new MOSDOut(); - break; - case MSG_OSD_FAILURE: - m = new MOSDFailure(); - break; - case MSG_OSD_PING: - m = new MOSDPing(); - break; - case MSG_OSD_OP: - m = new MOSDOp(); - break; - case MSG_OSD_OPREPLY: - m = new MOSDOpReply(); - break; - - case MSG_OSD_MAP: - m = new MOSDMap(); - break; - case MSG_OSD_GETMAP: - m = new MOSDGetMap(); - break; - - case MSG_OSD_PG_NOTIFY: - m = new MOSDPGNotify(); - break; - case MSG_OSD_PG_QUERY: - m = new MOSDPGQuery(); - break; - case MSG_OSD_PG_LOG: - m = new MOSDPGLog(); - break; - case MSG_OSD_PG_REMOVE: - m = new MOSDPGRemove(); - break; - case MSG_OSD_PG_ACTIVATE_SET: - m = new MOSDPGActivateSet(); - break; - - // clients - case MSG_CLIENT_MOUNT: - m = new MClientMount; - 
break; - case MSG_CLIENT_UNMOUNT: - m = new MClientUnmount; - break; - case MSG_CLIENT_SESSION: - m = new MClientSession; - break; - case MSG_CLIENT_RECONNECT: - m = new MClientReconnect; - break; - case MSG_CLIENT_REQUEST: - m = new MClientRequest; - break; - case MSG_CLIENT_REQUEST_FORWARD: - m = new MClientRequestForward; - break; - case MSG_CLIENT_REPLY: - m = new MClientReply; - break; - case MSG_CLIENT_FILECAPS: - m = new MClientFileCaps; - break; - - // mds - case MSG_MDS_SLAVE_REQUEST: - m = new MMDSSlaveRequest; - break; - - case MSG_MDS_GETMAP: - m = new MMDSGetMap(); - break; - case MSG_MDS_MAP: - m = new MMDSMap(); - break; - case MSG_MDS_BEACON: - m = new MMDSBeacon; - break; - case MSG_MDS_RESOLVE: - m = new MMDSResolve; - break; - case MSG_MDS_RESOLVEACK: - m = new MMDSResolveAck; - break; - case MSG_MDS_CACHEREJOIN: - m = new MMDSCacheRejoin; - break; - /* - case MSG_MDS_CACHEREJOINACK: - m = new MMDSCacheRejoinAck; - break; - */ - - case MSG_MDS_DIRUPDATE: - m = new MDirUpdate(); - break; - - case MSG_MDS_DISCOVER: - m = new MDiscover(); - break; - case MSG_MDS_DISCOVERREPLY: - m = new MDiscoverReply(); - break; - - case MSG_MDS_FRAGMENTNOTIFY: - m = new MMDSFragmentNotify; - break; - - case MSG_MDS_EXPORTDIRDISCOVER: - m = new MExportDirDiscover(); - break; - case MSG_MDS_EXPORTDIRDISCOVERACK: - m = new MExportDirDiscoverAck(); - break; - case MSG_MDS_EXPORTDIRCANCEL: - m = new MExportDirCancel(); - break; - - case MSG_MDS_EXPORTDIR: - m = new MExportDir; - break; - case MSG_MDS_EXPORTDIRACK: - m = new MExportDirAck; - break; - case MSG_MDS_EXPORTDIRFINISH: - m = new MExportDirFinish; - break; - - case MSG_MDS_EXPORTDIRNOTIFY: - m = new MExportDirNotify(); - break; - - case MSG_MDS_EXPORTDIRNOTIFYACK: - m = new MExportDirNotifyAck(); - break; - - case MSG_MDS_EXPORTDIRPREP: - m = new MExportDirPrep(); - break; - - case MSG_MDS_EXPORTDIRPREPACK: - m = new MExportDirPrepAck(); - break; - - case MSG_MDS_EXPORTDIRWARNING: - m = new MExportDirWarning; 
- break; - case MSG_MDS_EXPORTDIRWARNINGACK: - m = new MExportDirWarningAck; - break; - - - - case MSG_MDS_DENTRYUNLINK: - m = new MDentryUnlink(); - break; - - case MSG_MDS_HEARTBEAT: - m = new MHeartbeat(); - break; - - case MSG_MDS_CACHEEXPIRE: - m = new MCacheExpire(); - break; - - case MSG_MDS_ANCHOR: - m = new MAnchor(); - break; - - /* case MSG_MDS_INODEUPDATE: - m = new MInodeUpdate(); - break; - */ - - case MSG_MDS_INODEFILECAPS: - m = new MInodeFileCaps(); - break; - - case MSG_MDS_LOCK: - m = new MLock(); - break; - - - // -- simple messages without payload -- - - case MSG_CLOSE: - case MSG_SHUTDOWN: - m = new MGenericMessage(env.type); - break; - - default: - dout(0) << "can't decode unknown message type " << env.type << dendl; - assert(0); - } - - // env - m->set_envelope(env); - - // decode - m->set_payload(payload); - m->decode_payload(); - - // done! - return m; -} - - diff --git a/branches/sage/ebofs2/msg/Message.h b/branches/sage/ebofs2/msg/Message.h deleted file mode 100644 index b4bf53db8b4fd..0000000000000 --- a/branches/sage/ebofs2/msg/Message.h +++ /dev/null @@ -1,250 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MESSAGE_H -#define __MESSAGE_H - -#define MSG_CLOSE 0 - -#define MSG_STATFS 1 -#define MSG_STATFS_REPLY 2 -#define MSG_PGSTATS 3 - -#define MSG_PING 10 -#define MSG_PING_ACK 11 - -#define MSG_SHUTDOWN 99999 - -#define MSG_MON_COMMAND 13 -#define MSG_MON_COMMAND_ACK 14 - - -#define MSG_MON_ELECTION 15 - -#define MSG_MON_OSDMAP_INFO 20 -#define MSG_MON_OSDMAP_LEASE 21 -#define MSG_MON_OSDMAP_LEASE_ACK 22 -#define MSG_MON_OSDMAP_UPDATE_PREPARE 23 -#define MSG_MON_OSDMAP_UPDATE_ACK 24 -#define MSG_MON_OSDMAP_UPDATE_COMMIT 25 - -#define MSG_MON_PAXOS 30 - -#define MSG_OSD_OP 40 // delete, etc. -#define MSG_OSD_OPREPLY 41 // delete, etc. -#define MSG_OSD_PING 42 - -#define MSG_OSD_GETMAP 43 -#define MSG_OSD_MAP 44 - -#define MSG_OSD_BOOT 45 - -#define MSG_OSD_FAILURE 47 - -#define MSG_OSD_IN 48 -#define MSG_OSD_OUT 49 - - - -#define MSG_OSD_PG_NOTIFY 50 -#define MSG_OSD_PG_QUERY 51 -#define MSG_OSD_PG_SUMMARY 52 -#define MSG_OSD_PG_LOG 53 -#define MSG_OSD_PG_REMOVE 54 -#define MSG_OSD_PG_ACTIVATE_SET 55 - -// -- client -- -// to monitor -#define MSG_CLIENT_MOUNT 60 -#define MSG_CLIENT_UNMOUNT 61 - -// to mds -#define MSG_CLIENT_SESSION 70 // start or stop -#define MSG_CLIENT_RECONNECT 71 - -#define MSG_CLIENT_REQUEST 80 -#define MSG_CLIENT_REQUEST_FORWARD 81 -#define MSG_CLIENT_REPLY 82 -#define MSG_CLIENT_FILECAPS 0x310 // - - - -// *** MDS *** - - -#define MSG_MDS_RESOLVE 0x200 -#define MSG_MDS_RESOLVEACK 0x201 -#define MSG_MDS_CACHEREJOIN 0x202 -#define MSG_MDS_DISCOVER 0x203 -#define MSG_MDS_DISCOVERREPLY 0x204 -#define MSG_MDS_INODEUPDATE 0x205 -#define MSG_MDS_DIRUPDATE 0x206 -#define MSG_MDS_CACHEEXPIRE 0x207 -#define MSG_MDS_DENTRYUNLINK 0x208 -#define MSG_MDS_FRAGMENTNOTIFY 0x209 - -#define MSG_MDS_LOCK 0x300 -#define MSG_MDS_INODEFILECAPS 0x301 - -#define MSG_MDS_EXPORTDIRDISCOVER 0x449 -#define MSG_MDS_EXPORTDIRDISCOVERACK 0x450 -#define MSG_MDS_EXPORTDIRCANCEL 0x451 -#define MSG_MDS_EXPORTDIRPREP 0x452 -#define MSG_MDS_EXPORTDIRPREPACK 
0x453 -#define MSG_MDS_EXPORTDIRWARNING 0x454 -#define MSG_MDS_EXPORTDIRWARNINGACK 0x455 -#define MSG_MDS_EXPORTDIR 0x456 -#define MSG_MDS_EXPORTDIRACK 0x457 -#define MSG_MDS_EXPORTDIRNOTIFY 0x458 -#define MSG_MDS_EXPORTDIRNOTIFYACK 0x459 -#define MSG_MDS_EXPORTDIRFINISH 0x460 - - -#define MSG_MDS_GETMAP 102 -#define MSG_MDS_MAP 103 -#define MSG_MDS_BEACON 105 // to monitor - -#define MSG_MDS_ANCHOR 0x100 -#define MSG_MDS_HEARTBEAT 0x500 // for mds load balancer - -#define MSG_MDS_SLAVE_REQUEST 170 - -/* -#define MSG_MDS_INODEGETREPLICA 112 -#define MSG_MDS_INODEGETREPLICAACK 113 - -#define MSG_MDS_DIREXPIREREQ 124 -*/ - - - - -#include -#include - -#include -#include -using std::list; - -#include - - -#include "include/types.h" -#include "include/buffer.h" -#include "msg_types.h" - - - - -// ====================================================== - -// abstract Message class - - -class Message { - private: - - protected: - ceph_message_header env; // envelope - bufferlist payload; // payload - list chunk_payload_at; - - utime_t recv_stamp; - - friend class Messenger; -public: - - public: - Message() { - env.nchunks = 0; - }; - Message(int t) { - env.nchunks = 0; - env.type = t; - } - virtual ~Message() { - } - - - void clear_payload() { payload.clear(); } - bool empty_payload() { return payload.length() == 0; } - bufferlist& get_payload() { - return payload; - } - void set_payload(bufferlist& bl) { - payload.claim(bl); - } - void copy_payload(const bufferlist& bl) { - payload = bl; - } - const list& get_chunk_payload_at() const { return chunk_payload_at; } - void set_chunk_payload_at(list& o) { chunk_payload_at.swap(o); } - ceph_message_header& get_envelope() { - return env; - } - void set_envelope(ceph_message_header& env) { - this->env = env; - } - - - void set_recv_stamp(utime_t t) { recv_stamp = t; } - utime_t get_recv_stamp() { return recv_stamp; } - - unsigned get_seq() { return env.seq; } - void set_seq(unsigned s) { env.seq = s; } - - // ENVELOPE ---- - - 
// type - int get_type() { return env.type; } - void set_type(int t) { env.type = t; } - virtual char *get_type_name() = 0; - - // source/dest - entity_inst_t& get_dest_inst() { return *(entity_inst_t*)&env.dst; } - void set_dest_inst(entity_inst_t& inst) { env.dst = *(ceph_entity_inst*)&inst; } - - entity_inst_t& get_source_inst() { return *(entity_inst_t*)&env.src; } - void set_source_inst(entity_inst_t& inst) { env.src = *(ceph_entity_inst*)&inst; } - - entity_name_t& get_dest() { return *(entity_name_t*)&env.dst.name; } - void set_dest(entity_name_t a) { env.dst.name = *(ceph_entity_name*)&a; } - - entity_name_t& get_source() { return *(entity_name_t*)&env.src.name; } - void set_source(entity_name_t a) { env.src.name = *(ceph_entity_name*)&a; } - - entity_addr_t& get_source_addr() { return *(entity_addr_t*)&env.src.addr; } - void set_source_addr(const entity_addr_t &i) { env.src.addr = *(ceph_entity_addr*)&i; } - - // PAYLOAD ---- - void reset_payload() { - payload.clear(); - } - - virtual void decode_payload() = 0; - virtual void encode_payload() = 0; - - virtual void print(ostream& out) { - out << get_type_name(); - } - -}; - -extern Message *decode_message(ceph_message_header &env, bufferlist& bl); -inline ostream& operator<<(ostream& out, Message& m) { - m.print(out); - return out; -} - -#endif diff --git a/branches/sage/ebofs2/msg/Messenger.cc b/branches/sage/ebofs2/msg/Messenger.cc deleted file mode 100644 index 5af83462b2995..0000000000000 --- a/branches/sage/ebofs2/msg/Messenger.cc +++ /dev/null @@ -1,39 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include -#include "include/types.h" - -#include "Message.h" -#include "Messenger.h" -#include "messages/MGenericMessage.h" - -#include -#include -using namespace std; - - -// --------- -// incoming messages - -void Messenger::dispatch(Message *m) -{ - assert(dispatcher); - dispatcher->dispatch(m); -} - - - diff --git a/branches/sage/ebofs2/msg/Messenger.h b/branches/sage/ebofs2/msg/Messenger.h deleted file mode 100644 index 1bb9c8acb28ed..0000000000000 --- a/branches/sage/ebofs2/msg/Messenger.h +++ /dev/null @@ -1,89 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __MESSENGER_H -#define __MESSENGER_H - -#include -using namespace std; - -#include "Message.h" -#include "Dispatcher.h" -#include "common/Mutex.h" -#include "common/Cond.h" -#include "include/Context.h" - - - -class MDS; -class Timer; - -class Messenger { - private: - Dispatcher *dispatcher; - entity_name_t _myname; - - public: - Messenger(entity_name_t w) : dispatcher(0), _myname(w) { } - virtual ~Messenger() { } - - // accessors - entity_name_t get_myname() { return _myname; } - void _set_myname(entity_name_t m) { _myname = m; } - - virtual void reset_myname(entity_name_t m) = 0; - - virtual const entity_addr_t &get_myaddr() = 0; - - entity_inst_t get_myinst() { return entity_inst_t(_myname, get_myaddr()); } - - // hrmpf. 
- virtual int get_dispatch_queue_len() { return 0; }; - - // setup - void set_dispatcher(Dispatcher *d) { - if (!dispatcher) { - dispatcher = d; - ready(); - } - } - Dispatcher *get_dispatcher() { return dispatcher; } - virtual void ready() { } - bool is_ready() { return dispatcher != 0; } - - // dispatch incoming messages - virtual void dispatch(Message *m) { - assert(dispatcher); - dispatcher->dispatch(m); - } - - // shutdown - virtual int shutdown() = 0; - virtual void suicide() = 0; - - // send message - virtual void prepare_dest(const entity_addr_t& addr) {} - virtual int send_message(Message *m, entity_inst_t dest) = 0; - - virtual void mark_down(entity_addr_t a) {} - -}; - - - - - -#endif diff --git a/branches/sage/ebofs2/msg/SimpleMessenger.cc b/branches/sage/ebofs2/msg/SimpleMessenger.cc deleted file mode 100644 index 48576755b1eb5..0000000000000 --- a/branches/sage/ebofs2/msg/SimpleMessenger.cc +++ /dev/null @@ -1,1495 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "SimpleMessenger.h" - -#include -#include -#include -#include -#include -#include -#include - -#include "config.h" - -#include "messages/MGenericMessage.h" - -#include - -#include -#include - -#define dout(l) if (l<=g_conf.debug_ms) *_dout << dbeginl << g_clock.now() << " " << pthread_self() << " -- " << rank.rank_addr << " " -#define derr(l) if (l<=g_conf.debug_ms) *_derr << dbeginl << g_clock.now() << " " << pthread_self() << " -- " << rank.rank_addr << " " - - - -#include "tcp.cc" - - -Rank rank; - - -sighandler_t old_sigint_handler = 0; - - -/******************************************** - * Accepter - */ - -void simplemessenger_sigint(int r) -{ - rank.sigint(); - if (old_sigint_handler) - old_sigint_handler(r); -} - -void Rank::sigint() -{ - lock.Lock(); - derr(0) << "got control-c, exiting" << dendl; - - // force close listener socket - if (accepter.listen_sd > 0) - ::close(accepter.listen_sd); - - // force close all pipe sockets, too - for (hash_map::iterator p = rank_pipe.begin(); - p != rank_pipe.end(); - ++p) - p->second->force_close(); - - lock.Unlock(); -} - - - -void noop_signal_handler(int s) -{ - //dout(0) << "blah_handler got " << s << dendl; -} - -int Rank::Accepter::start() -{ - // bind to a socket - dout(10) << "accepter.start" << dendl; - - char hostname[100]; - memset(hostname, 0, 100); - gethostname(hostname, 100); - dout(2) << "accepter.start my hostname is " << hostname << dendl; - - // is there a .ceph_hosts file? 
- { - ifstream fh; - fh.open(".ceph_hosts"); - if (fh.is_open()) { - while (1) { - string line; - getline(fh, line); - if (fh.eof()) break; - if (line[0] == '#' || line[0] == ';') continue; - int ospace = line.find(" "); - if (!ospace) continue; - string host = line.substr(0, ospace); - string addr = line.substr(ospace+1); - dout(15) << ".ceph_hosts: host '" << host << "' -> '" << addr << "'" << dendl; - if (host == hostname) { - parse_ip_port(addr.c_str(), g_my_addr); - dout(1) << ".ceph_hosts: my addr is " << g_my_addr << dendl; - break; - } - } - fh.close(); - } - } - - // use whatever user specified (if anything) - sockaddr_in listen_addr; - g_my_addr.make_addr(listen_addr); - - /* socket creation */ - listen_sd = socket(AF_INET,SOCK_STREAM,0); - assert(listen_sd > 0); - - /* bind to port */ - int rc = bind(listen_sd, (struct sockaddr *) &listen_addr, sizeof(listen_addr)); - if (rc < 0) - derr(0) << "accepter.start unable to bind to " << listen_addr << dendl; - assert(rc >= 0); - - // what port did we get? - socklen_t llen = sizeof(listen_addr); - getsockname(listen_sd, (sockaddr*)&listen_addr, &llen); - - dout(10) << "accepter.start bound to " << listen_addr << dendl; - - // listen! - rc = ::listen(listen_sd, 128); - assert(rc >= 0); - - // figure out my_addr - if (g_my_addr != entity_addr_t()) { - // user specified it, easy peasy. - rank.rank_addr = g_my_addr; - } else { - // my IP is... HELP! - struct hostent *myhostname = gethostbyname(hostname); - - // look up my hostname. - listen_addr.sin_family = myhostname->h_addrtype; - memcpy((char*)&listen_addr.sin_addr.s_addr, - myhostname->h_addr_list[0], - myhostname->h_length); - rank.rank_addr.set_addr(listen_addr); - rank.rank_addr.set_port(0); - } - if (rank.rank_addr.get_port() == 0) { - entity_addr_t tmp; - tmp.set_addr(listen_addr); - rank.rank_addr.set_port(tmp.get_port()); - rank.rank_addr.v.nonce = getpid(); // FIXME: pid might not be best choice here. 
- } - rank.rank_addr.v.erank = 0; - - dout(1) << "accepter.start rank_addr is " << rank.rank_addr << dendl; - - // set up signal handler - //old_sigint_handler = signal(SIGINT, simplemessenger_sigint); - - // set a harmless handle for SIGUSR1 (we'll use it to stop the accepter) - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = noop_signal_handler; - sa.sa_flags = 0; - sigemptyset(&sa.sa_mask); - sigaction(SIGUSR1, &sa, NULL); - - // start thread - create(); - - return 0; -} - -void *Rank::Accepter::entry() -{ - dout(10) << "accepter starting" << dendl; - - fd_set fds; - while (!done) { - FD_ZERO(&fds); - FD_SET(listen_sd, &fds); - dout(20) << "accepter calling select" << dendl; - int r = ::select(listen_sd+1, &fds, 0, &fds, 0); - dout(20) << "accepter select got " << r << dendl; - - if (done) break; - - // accept - struct sockaddr_in addr; - socklen_t slen = sizeof(addr); - int sd = ::accept(listen_sd, (sockaddr*)&addr, &slen); - if (sd > 0) { - dout(10) << "accepted incoming on sd " << sd << dendl; - - // disable Nagle algorithm? - if (g_conf.ms_tcp_nodelay) { - int flag = 1; - int r = ::setsockopt(sd, IPPROTO_TCP, TCP_NODELAY, (char*)&flag, sizeof(flag)); - if (r < 0) - dout(0) << "accepter could't set TCP_NODELAY: " << strerror(errno) << dendl; - } - - rank.lock.Lock(); - if (rank.num_local > 0) { - Pipe *p = new Pipe(Pipe::STATE_ACCEPTING); - p->sd = sd; - p->start_reader(); - rank.pipes.insert(p); - } - rank.lock.Unlock(); - } else { - dout(10) << "no incoming connection?" 
<< dendl; - } - } - - dout(20) << "accepter closing" << dendl; - if (listen_sd > 0) ::close(listen_sd); - dout(10) << "accepter stopping" << dendl; - return 0; -} - -void Rank::Accepter::stop() -{ - done = true; - this->kill(SIGUSR1); - join(); -} - - - - -/******************************************** - * Rank - */ - - -/* - * note: assumes lock is held - */ -void Rank::reaper() -{ - dout(10) << "reaper" << dendl; - assert(lock.is_locked()); - - while (!pipe_reap_queue.empty()) { - Pipe *p = pipe_reap_queue.front(); - dout(10) << "reaper reaping pipe " << p << " " << p->get_peer_addr() << dendl; - p->unregister_pipe(); - pipe_reap_queue.pop_front(); - assert(pipes.count(p)); - pipes.erase(p); - p->join(); - dout(10) << "reaper reaped pipe " << p << " " << p->get_peer_addr() << dendl; - delete p; - } -} - - -int Rank::start_rank() -{ - lock.Lock(); - if (started) { - dout(10) << "start_rank already started" << dendl; - lock.Unlock(); - return 0; - } - dout(10) << "start_rank" << dendl; - lock.Unlock(); - - // bind to a socket - if (accepter.start() < 0) - return -1; - - lock.Lock(); - - dout(1) << "start_rank at " << rank_addr << dendl; - started = true; - lock.Unlock(); - return 0; -} - - - -/* connect_rank - * NOTE: assumes rank.lock held. 
- */ -Rank::Pipe *Rank::connect_rank(const entity_addr_t& addr) -{ - assert(rank.lock.is_locked()); - assert(addr != rank.rank_addr); - - dout(10) << "connect_rank to " << addr << ", creating pipe and registering" << dendl; - - // create pipe - Pipe *pipe = new Pipe(Pipe::STATE_CONNECTING); - pipe->peer_addr = addr; - pipe->start_writer(); - pipe->register_pipe(); - pipes.insert(pipe); - - return pipe; -} - - - - - - - - -/* register_entity - */ -Rank::EntityMessenger *Rank::register_entity(entity_name_t name) -{ - dout(10) << "register_entity " << name << dendl; - lock.Lock(); - - // create messenger - int erank = max_local; - EntityMessenger *msgr = new EntityMessenger(name, erank); - - // add to directory - max_local++; - local.resize(max_local); - stopped.resize(max_local); - - local[erank] = msgr; - stopped[erank] = false; - msgr->my_addr = rank_addr; - msgr->my_addr.v.erank = erank; - - dout(10) << "register_entity " << name << " at " << msgr->my_addr << dendl; - - num_local++; - - lock.Unlock(); - return msgr; -} - - -void Rank::unregister_entity(EntityMessenger *msgr) -{ - lock.Lock(); - dout(10) << "unregister_entity " << msgr->get_myname() << dendl; - - // remove from local directory. - local[msgr->my_rank] = 0; - stopped[msgr->my_rank] = true; - num_local--; - - wait_cond.Signal(); - - lock.Unlock(); -} - - -void Rank::submit_message(Message *m, const entity_addr_t& dest_addr) -{ - const entity_name_t dest = m->get_dest(); - - // lookup - entity_addr_t dest_proc_addr = dest_addr; - dest_proc_addr.v.erank = 0; - - lock.Lock(); - { - // local? - if (ceph_entity_addr_is_local(dest_addr.v, rank_addr.v)) { - if (dest_addr.v.erank < max_local && local[dest_addr.v.erank]) { - // local - dout(20) << "submit_message " << *m << " dest " << dest << " local" << dendl; - local[dest_addr.v.erank]->queue_message(m); - } else { - derr(0) << "submit_message " << *m << " dest " << dest << " " << dest_addr << " local but not in local map? dropping." 
<< dendl; - //assert(0); // hmpf, this is probably mds->mon beacon from newsyn. - delete m; - } - } - else { - // remote. - Pipe *pipe = 0; - if (rank_pipe.count( dest_proc_addr )) { - // connected? - pipe = rank_pipe[ dest_proc_addr ]; - pipe->lock.Lock(); - if (pipe->state == Pipe::STATE_CLOSED) { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_addr << ", ignoring old closed pipe." << dendl; - pipe->unregister_pipe(); - pipe->lock.Unlock(); - pipe = 0; - } else { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_addr << ", have pipe." << dendl; - pipe->_send(m); - pipe->lock.Unlock(); - } - } - if (!pipe) { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_addr << ", new pipe." << dendl; - // not connected. - pipe = connect_rank( dest_proc_addr ); - pipe->send(m); - } - } - } - - lock.Unlock(); -} - - - - - -void Rank::wait() -{ - lock.Lock(); - while (1) { - // reap dead pipes - reaper(); - - if (num_local == 0) { - dout(10) << "wait: everything stopped" << dendl; - break; // everything stopped. - } else { - dout(10) << "wait: local still has " << local.size() << " items, waiting" << dendl; - } - - wait_cond.Wait(lock); - } - lock.Unlock(); - - // done! clean up. - dout(20) << "wait: stopping accepter thread" << dendl; - accepter.stop(); - dout(20) << "wait: stopped accepter thread" << dendl; - - // close+reap all pipes - lock.Lock(); - { - dout(10) << "wait: closing pipes" << dendl; - list toclose; - for (hash_map::iterator i = rank_pipe.begin(); - i != rank_pipe.end(); - i++) - toclose.push_back(i->second); - for (list::iterator i = toclose.begin(); - i != toclose.end(); - i++) { - (*i)->unregister_pipe(); - (*i)->dirty_close(); - } - - reaper(); - dout(10) << "wait: waiting for pipes " << pipes << " to close" << dendl; - while (!pipes.empty()) { - wait_cond.Wait(lock); - reaper(); - } - } - lock.Unlock(); - - dout(10) << "wait: done." 
<< dendl; - dout(1) << "shutdown complete." << dendl; -} - - - - - - -/********************************** - * EntityMessenger - */ - -void Rank::EntityMessenger::dispatch_entry() -{ - lock.Lock(); - while (!stop) { - if (!dispatch_queue.empty() || !prio_dispatch_queue.empty()) { - list ls; - if (!prio_dispatch_queue.empty()) { - ls.swap(prio_dispatch_queue); - pqlen = 0; - } else { - if (0) { - ls.swap(dispatch_queue); - qlen = 0; - } else { - // limit how much low-prio stuff we grab, to avoid starving high-prio messages! - ls.push_back(dispatch_queue.front()); - dispatch_queue.pop_front(); - qlen--; - } - } - - lock.Unlock(); - { - // deliver - while (!ls.empty()) { - if (stop) { - dout(1) << "dispatch: stop=true, discarding " << ls.size() - << " messages in dispatch queue" << dendl; - break; - } - Message *m = ls.front(); - ls.pop_front(); - dout(1) << m->get_dest() - << " <== " << m->get_source_inst() - << " ==== " << *m - << " ==== " << m - << dendl; - dispatch(m); - dout(20) << "done calling dispatch on " << m << dendl; - } - } - lock.Lock(); - continue; - } - cond.Wait(lock); - } - lock.Unlock(); - - // deregister - rank.unregister_entity(this); -} - -void Rank::EntityMessenger::ready() -{ - dout(10) << "ready " << get_myaddr() << dendl; - assert(!dispatch_thread.is_started()); - - // start my dispatch thread - dispatch_thread.create(); -} - - -int Rank::EntityMessenger::shutdown() -{ - dout(10) << "shutdown " << get_myaddr() << dendl; - - // stop my dispatch thread - if (dispatch_thread.am_self()) { - dout(10) << "shutdown i am dispatch, setting stop flag" << dendl; - stop = true; - } else { - dout(10) << "shutdown i am not dispatch, setting stop flag and joining thread." << dendl; - lock.Lock(); - stop = true; - cond.Signal(); - lock.Unlock(); - } - - return 0; -} - -void Rank::EntityMessenger::suicide() -{ - dout(10) << "suicide " << get_myaddr() << dendl; - shutdown(); - // hmm, or exit(0)? 
-} - -void Rank::EntityMessenger::prepare_dest(const entity_addr_t& addr) -{ - rank.lock.Lock(); - { - if (rank.rank_pipe.count(addr) == 0) - rank.connect_rank(addr); - } - rank.lock.Unlock(); -} - -int Rank::EntityMessenger::send_message(Message *m, entity_inst_t dest) -{ - // set envelope - m->set_source(get_myname()); - m->set_source_addr(my_addr); - m->set_dest_inst(dest); - - dout(1) << m->get_source() - << " --> " << dest.name << " " << dest.addr - << " -- " << *m - << " -- " << m - << dendl; - - rank.submit_message(m, dest.addr); - - return 0; -} - - - -void Rank::EntityMessenger::reset_myname(entity_name_t newname) -{ - entity_name_t oldname = get_myname(); - dout(10) << "reset_myname " << oldname << " to " << newname << dendl; - _set_myname(newname); -} - - - - -void Rank::EntityMessenger::mark_down(entity_addr_t a) -{ - rank.mark_down(a); -} - -void Rank::mark_down(entity_addr_t addr) -{ - //if (my_rank == 0) return; // ugh.. rank0 already handles this stuff in the namer - lock.Lock(); - /* - if (entity_map.count(a) && - entity_map[a] > inst) { - dout(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << dendl; - derr(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << dendl; - // do nothing! - } else { - if (entity_map.count(a) == 0) { - // don't know it - dout(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << dendl; - derr(10) << "mark_down " << a << " inst " << inst << " ... 
unknown by me" << dendl; - } else { - // know it - assert(entity_map[a] <= inst); - dout(10) << "mark_down " << a << " inst " << inst << dendl; - derr(10) << "mark_down " << a << " inst " << inst << dendl; - - entity_map.erase(a); - - if (rank_pipe.count(inst)) { - rank_pipe[inst]->close(); - rank_pipe.erase(inst); - } - } - } - */ - lock.Unlock(); -} - - - - - -/************************************** - * Pipe - */ - -#undef dout -#undef derr -#define dout(l) if (l<=g_conf.debug_ms) *_dout << dbeginl << g_clock.now() << " " << pthread_self() << " -- " << rank.rank_addr << " >> " << peer_addr << " pipe(" << this << ")." -#define derr(l) if (l<=g_conf.debug_ms) *_derr << dbeginl << g_clock.now() << " " << pthread_self() << " -- " << rank.rank_addr << " >> " << peer_addr << " pipe(" << this << ")." - -/* - * we have to be careful about connection races: - * A initiates connection - * B initiates connection - * B accepts A's connection - * A rejects B's connection (or vice-versa) - * - * this is controlled by whether accept uses the new incoming socket - * as the new pipe. two cases: - * old new - * connecting connecting -> use socket initiated by lower address - * open connecting - * -> use new socket _only_ if connect_seq matches. that is, the - * peer reconnected subsequent to the current open socket. if - * connect_seq _doesn't_ match, it means that it is an old attempt. - */ - -int Rank::Pipe::accept() -{ - dout(10) << "accept" << dendl; - - // my creater gave me sd via accept() - assert(state == STATE_ACCEPTING); - - // announce myself. 
- int rc = tcp_write(sd, (char*)&rank.rank_addr, sizeof(rank.rank_addr)); - if (rc < 0) { - dout(10) << "accept couldn't write my addr" << dendl; - state = STATE_CLOSED; - return -1; - } - - // identify peer - rc = tcp_read(sd, (char*)&peer_addr, sizeof(peer_addr)); - if (rc < 0) { - dout(10) << "accept couldn't read peer addr" << dendl; - state = STATE_CLOSED; - return -1; - } - - __u32 cseq; - rc = tcp_read(sd, (char*)&cseq, sizeof(cseq)); - if (rc < 0) { - dout(10) << "accept couldn't read connect seq" << dendl; - state = STATE_CLOSED; - return -1; - } - - dout(20) << "accept got connect_seq " << cseq << dendl; - - // register pipe. - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_addr) == 0) { - dout(10) << "accept new peer " << peer_addr << dendl; - register_pipe(); - } else { - // hmm! - Pipe *other = rank.rank_pipe[peer_addr]; - other->lock.Lock(); - - dout(10) << "accept got connect_seq " << cseq - << ", existing pipe connect_seq " << other->connect_seq - << " state " << other->state - << dendl; - - // if open race, low addr's pipe "wins". 
- // otherwise, look at oseq vs out_seq - if ((other->state == STATE_CONNECTING && peer_addr < rank.rank_addr) || - (other->state == STATE_OPEN && cseq == other->connect_seq)) { - dout(10) << "accept already had pipe " << other - << ", but switching to this new one" << dendl; - // switch to this new Pipe - other->state = STATE_CLOSED; - assert(q.empty()); - other->cond.Signal(); - other->unregister_pipe(); - register_pipe(); - - // steal queue and out_seq - other->take_queue(q); - out_seq = other->out_seq; - //for (list::iterator p = q.begin(); p != q.end(); p++) - //(*p)->set_seq(++out_seq); - } - else { - dout(10) << "accept already had pipe " << other - << ", closing other" << dendl; - state = STATE_CLOSED; - } - other->lock.Unlock(); - } - } - rank.lock.Unlock(); - - char tag; - if (state == STATE_CLOSED) { - dout(10) << "accept closed, sending REJECT tag" << dendl; - tag = CEPH_MSGR_TAG_REJECT; - } else { - dout(10) << "accept sending READY tag" << dendl; - tag = CEPH_MSGR_TAG_READY; - state = STATE_OPEN; - } - - if (tcp_write(sd, &tag, 1) < 0 || - tcp_write(sd, (char*)&connect_seq, sizeof(connect_seq)) < 0) { - // hrmpf - dout(10) << "accept couldn't send initial tag+seq: " - << strerror(errno) << dendl; - fault(); - } - - if (state != STATE_CLOSED) { - dout(10) << "accept starting writer, " - << "state=" << state << dendl; - start_writer(); - } - - return 0; // success. -} - -int Rank::Pipe::connect() -{ - dout(10) << "connect " << connect_seq << dendl; - assert(lock.is_locked()); - - if (sd > 0) { - ::close(sd); - sd = 0; - } - __u32 cseq = connect_seq; - - lock.Unlock(); - - int newsd; - char tag; - int rc; - struct sockaddr_in myAddr; - sockaddr_in tcpaddr; - entity_addr_t paddr; - struct msghdr msg; - struct iovec msgvec[2]; - int msglen; - - // create socket? 
- newsd = ::socket(AF_INET,SOCK_STREAM,0); - if (newsd < 0) { - dout(-1) << "connect couldn't created socket " << strerror(errno) << dendl; - assert(0); - goto fail; - } - - // bind any port - myAddr.sin_family = AF_INET; - myAddr.sin_addr.s_addr = htonl(INADDR_ANY); - myAddr.sin_port = htons( 0 ); - - rc = ::bind(newsd, (struct sockaddr *) &myAddr, sizeof(myAddr)); - assert(rc>=0); - - // connect! - peer_addr.make_addr(tcpaddr); - rc = ::connect(newsd, (sockaddr*)&tcpaddr, sizeof(myAddr)); - if (rc < 0) { - dout(10) << "connect error " << peer_addr - << ", " << errno << ": " << strerror(errno) << dendl; - goto fail; - } - - // disable Nagle algorithm? - if (g_conf.ms_tcp_nodelay) { - int flag = 1; - int r = ::setsockopt(newsd, IPPROTO_TCP, TCP_NODELAY, (char*)&flag, sizeof(flag)); - if (r < 0) - dout(0) << "connect couldn't set TCP_NODELAY: " << strerror(errno) << dendl; - } - - // identify peer - rc = tcp_read(newsd, (char*)&paddr, sizeof(paddr)); - if (!rc) { // bool - dout(0) << "connect couldn't read peer addr" << dendl; - goto fail; - } - dout(20) << "connect read peer addr " << paddr << dendl; - if (!ceph_entity_addr_is_local(peer_addr.v, paddr.v)) { - dout(0) << "connect peer identifies itself as " - << paddr << "... wrong node!" 
<< dendl; - goto fail; - } - - // identify myself, and send open seq - memset(&msg, 0, sizeof(msg)); - msgvec[0].iov_base = (char*)&rank.rank_addr; - msgvec[0].iov_len = sizeof(rank.rank_addr); - msgvec[1].iov_base = (char*)&cseq; - msgvec[1].iov_len = sizeof(cseq); - msg.msg_iov = msgvec; - msg.msg_iovlen = 2; - msglen = msgvec[0].iov_len + msgvec[1].iov_len; - - if (do_sendmsg(newsd, &msg, msglen)) { - dout(20) << "connect couldn't write self, seq" << dendl; - goto fail; - } - - dout(20) << "connect wrote self, seq, waiting for tag" << dendl; - - // wait for tag - tag = -1; - if (tcp_read(newsd, &tag, 1) <= 0 || - tcp_read(newsd, (char*)&cseq, sizeof(cseq)) <= 0) - goto fail; - - dout(20) << "connect got initial tag " << (int)tag << " + seq " << cseq << dendl; - - lock.Lock(); - - // FINISH - if (state != STATE_CONNECTING) { - dout(20) << "connect hmm, not connecting anymore, failing" << dendl; - goto fail2; // hmm! - } - if (tag != CEPH_MSGR_TAG_READY) { - dout(20) << "connect didn't get READY tag, my connect_seq=" << connect_seq - << ", got " << cseq << dendl; - if (connect_seq != cseq) { - dout(0) << "connect got REJECT tag, old connect_seq was " << connect_seq - << ", taking new " << cseq << dendl; - connect_seq = cseq; - } - goto fail2; - } - state = STATE_OPEN; - this->sd = newsd; - connect_seq++; - first_fault = last_attempt = utime_t(); - dout(20) << "connect success " << connect_seq << dendl; - - if (!reader_running) { - dout(20) << "connect starting reader" << dendl; - start_reader(); - } - return 0; - - fail: - lock.Lock(); - fail2: - if (newsd > 0) ::close(newsd); - fault(); - return -1; -} - -void Rank::Pipe::register_pipe() -{ - dout(10) << "register" << dendl; - assert(rank.lock.is_locked()); - assert(rank.rank_pipe.count(peer_addr) == 0); - rank.rank_pipe[peer_addr] = this; -} - -void Rank::Pipe::unregister_pipe() -{ - assert(rank.lock.is_locked()); - if (rank.rank_pipe.count(peer_addr) && - rank.rank_pipe[peer_addr] == this) { - dout(10) << 
"unregister" << dendl; - rank.rank_pipe.erase(peer_addr); - } else { - dout(10) << "unregister - not registered" << dendl; - } -} - -void Rank::Pipe::fault() -{ - assert(lock.is_locked()); - - if (q.empty()) { - dout(0) << "fault nothing to send, closing" << dendl; - state = STATE_CLOSED; - } else { - utime_t now = g_clock.now(); - if (state != STATE_CONNECTING) { - dout(0) << "fault initiating reconnect" << dendl; - connect_seq++; - state = STATE_CONNECTING; - first_fault = now; - } else if (first_fault.sec() == 0) { - dout(0) << "fault during connect" << dendl; - first_fault = now; - } else { - utime_t failinterval = now - first_fault; - utime_t retryinterval = now - last_attempt; - dout(10) << "fault failure was " << failinterval - << " ago, last attempt was at " << last_attempt - << ", " << retryinterval << " ago" << dendl; - if (failinterval > g_conf.ms_fail_interval) { - // give up - dout(0) << "fault giving up" << dendl; - state = STATE_CLOSED; - fail(); - } else if (retryinterval < g_conf.ms_retry_interval) { - // wait - now += (g_conf.ms_retry_interval - retryinterval); - dout(10) << "fault waiting until " << now << dendl; - cond.WaitUntil(lock, now); - dout(10) << "fault done waiting or woke up" << dendl; - } - } - last_attempt = now; - } - cond.Signal(); -} - -void Rank::Pipe::fail() -{ - derr(10) << "fail" << dendl; - assert(lock.is_locked()); - - cond.Signal(); - - // deactivate myself - lock.Unlock(); - rank.lock.Lock(); - unregister_pipe(); - rank.lock.Unlock(); - lock.Lock(); - - // report failures - q.splice(q.begin(), sent); - while (!q.empty()) { - Message *m = q.front(); - q.pop_front(); - unsigned srcrank = m->get_source_inst().addr.v.erank; - if (srcrank >= rank.max_local || rank.local[srcrank] == 0) { - dout(1) << "fail on " << *m << ", srcrank " << srcrank << " dne, dropping" << dendl; - delete m; - continue; - } - if (rank.local[srcrank]->is_stopped()) { - dout(1) << "fail on " << *m << ", dispatcher stopping, ignoring." 
<< dendl; - delete m; - continue; - } - dout(10) << "fail on " << *m << dendl; - rank.local[srcrank]->get_dispatcher()->ms_handle_failure(m, m->get_dest_inst()); - } -} - - - -void Rank::Pipe::dirty_close() -{ - dout(10) << "dirty_close" << dendl; - lock.Lock(); - state = STATE_CLOSING; - cond.Signal(); - lock.Unlock(); -} - - -/* read msgs from socket. - * also, server. - */ -void Rank::Pipe::reader() -{ - lock.Lock(); - - if (state == STATE_ACCEPTING) - accept(); - - // loop. - while (state != STATE_CLOSED) { - assert(lock.is_locked()); - - // sleep if (re)connecting - if (state == STATE_CONNECTING) { - dout(20) << "reader sleeping during reconnect" << dendl; - cond.Wait(lock); - continue; - } - - lock.Unlock(); - - char tag = -1; - dout(20) << "reader reading tag..." << dendl; - int rc = tcp_read(sd, (char*)&tag, 1); - if (rc <= 0) { - lock.Lock(); - dout(20) << "reader couldn't read tag" << dendl; - fault(); - continue; - } - - // open ... - if (tag == CEPH_MSGR_TAG_ACK) { - dout(20) << "reader got ACK" << dendl; - __u32 seq; - int rc = tcp_read( sd, (char*)&seq, sizeof(seq)); - lock.Lock(); - if (rc < 0) { - dout(20) << "reader couldn't read ack seq" << dendl; - fault(); - } else { - dout(15) << "reader got ack seq " << seq << dendl; - // trim sent list - while (!sent.empty() && - sent.front()->get_seq() <= seq) { - Message *m = sent.front(); - sent.pop_front(); - dout(10) << "reader got ack seq " - << seq << " >= " << m->get_seq() << " on " << m << " " << *m << dendl; - delete m; - } - } - continue; - } - - else if (tag == CEPH_MSGR_TAG_MSG) { - dout(20) << "reader got MSG" << dendl; - Message *m = read_message(); - if (!m) { - derr(10) << "reader read null message" << dendl; - lock.Lock(); - fault(); - continue; - } - - // note received seq# - lock.Lock(); - if (m->get_seq() <= in_seq) { - dout(-10) << "reader got old message " - << m->get_seq() << " <= " << in_seq << " " << m << " " << *m - << " for " << m->get_dest() - << ", discarding" << dendl; - delete 
m; - continue; - } - in_seq++; - assert(in_seq == m->get_seq()); - cond.Signal(); // wake up writer, to ack this - lock.Unlock(); - - dout(10) << "reader got message " - << m->get_seq() << " " << m << " " << *m - << " for " << m->get_dest() << dendl; - - // deliver - EntityMessenger *entity = 0; - - rank.lock.Lock(); - { - unsigned erank = m->get_dest_inst().addr.v.erank; - if (erank < rank.max_local && rank.local[erank]) { - // find entity - entity = rank.local[erank]; - } else { - derr(0) << "reader got message " << *m << " for " << m->get_dest() << ", which isn't local" << dendl; - } - } - rank.lock.Unlock(); - - if (entity) - entity->queue_message(m); // queue - - lock.Lock(); - } - - else if (tag == CEPH_MSGR_TAG_CLOSE) { - dout(20) << "reader got CLOSE" << dendl; - lock.Lock(); - fault(); // treat as a fault; i.e. reconnect|close - continue; - } - else { - dout(0) << "reader bad tag " << (int)tag << dendl; - lock.Lock(); - fault(); - } - } - - - // reap? - bool reap = false; - reader_running = false; - if (!writer_running) reap = true; - - lock.Unlock(); - - if (reap) { - dout(20) << "reader queueing for reap" << dendl; - if (sd > 0) ::close(sd); - rank.lock.Lock(); - { - rank.pipe_reap_queue.push_back(this); - rank.wait_cond.Signal(); - } - rank.lock.Unlock(); - } -} - - - -/* write msgs to socket. - * also, client. - */ -void Rank::Pipe::writer() -{ - lock.Lock(); - - while (state != STATE_CLOSED) { - // connect? - if (state == STATE_CONNECTING) { - connect(); - continue; - } - - if (state == STATE_CLOSING) { - // write close tag - dout(20) << "writer writing CLOSE tag" << dendl; - char c = CEPH_MSGR_TAG_CLOSE; - lock.Unlock(); - ::write(sd, &c, 1); - lock.Lock(); - state = STATE_CLOSED; - continue; - } - - if (state != STATE_CONNECTING && - (!q.empty() || in_seq > in_seq_acked)) { - - // send ack? 
- if (in_seq > in_seq_acked) { - int send_seq = in_seq; - lock.Unlock(); - int rc = write_ack(send_seq); - lock.Lock(); - if (rc < 0) { - dout(20) << "writer couldn't write ack" << dendl; - fault(); - continue; - } - in_seq_acked = send_seq; - } - - // grab outgoing message - if (!q.empty()) { - Message *m = q.front(); - q.pop_front(); - sent.push_back(m); // move to sent list - lock.Unlock(); - dout(20) << "writer sending " << m->get_seq() << " " << m << " " << *m << dendl; - if (m->empty_payload()) m->encode_payload(); - int rc = write_message(m); - lock.Lock(); - - if (rc < 0) { - derr(1) << "writer error sending " << *m << " to " << m->get_dest() << ", " - << errno << ": " << strerror(errno) << dendl; - fault(); - } - } - continue; - } - - // wait - dout(20) << "writer sleeping" << dendl; - cond.Wait(lock); - } - - dout(20) << "writer finishing" << dendl; - - // reap? - bool reap = false; - writer_running = false; - if (!reader_running) reap = true; - - lock.Unlock(); - - if (reap) { - dout(20) << "writer queueing for reap" << dendl; - if (sd > 0) ::close(sd); - rank.lock.Lock(); - { - rank.pipe_reap_queue.push_back(this); - rank.wait_cond.Signal(); - } - rank.lock.Unlock(); - } -} - - -Message *Rank::Pipe::read_message() -{ - // envelope - //dout(10) << "receiver.read_message from sd " << sd << dendl; - - ceph_message_header env; - if (!tcp_read( sd, (char*)&env, sizeof(env) )) - return 0; - - dout(20) << "reader got envelope type=" << env.type - << " src " << env.src << " dst " << env.dst - << " nchunks=" << env.nchunks - << dendl; - - // payload - bufferlist blist; - int32_t pos = 0; - list chunk_at; - for (unsigned i=0; iset_chunk_payload_at(chunk_at); - - dout(20) << "reader got " << s << " byte message from " - << m->get_source() << dendl; - - return m; -} - - -int Rank::Pipe::do_sendmsg(int sd, struct msghdr *msg, int len) -{ - while (len > 0) { - if (0) { // sanity - int l = 0; - for (unsigned i=0; imsg_iovlen; i++) - l += msg->msg_iov[i].iov_len; - 
assert(l == len); - } - - int r = ::sendmsg(sd, msg, 0); - if (r < 0) { - assert(r == -1); - dout(1) << "error on sendmsg " << strerror(errno) << dendl; - return -1; - } - len -= r; - if (len == 0) break; - - // hrmph. trim r bytes off the front of our message. - dout(20) << "partial sendmsg, did " << r << ", still have " << len << dendl; - while (r > 0) { - if (msg->msg_iov[0].iov_len <= (size_t)r) { - // lose this whole item - //dout(30) << "skipping " << msg->msg_iov[0].iov_len << ", " << (msg->msg_iovlen-1) << " v, " << r << " left" << dendl; - r -= msg->msg_iov[0].iov_len; - msg->msg_iov++; - msg->msg_iovlen--; - } else { - // partial! - //dout(30) << "adjusting " << msg->msg_iov[0].iov_len << ", " << msg->msg_iovlen << " v, " << r << " left" << dendl; - msg->msg_iov[0].iov_base = (void*)((long)msg->msg_iov[0].iov_base + r); - msg->msg_iov[0].iov_len -= r; - break; - } - } - } - return 0; -} - - -int Rank::Pipe::write_ack(unsigned seq) -{ - dout(10) << "write_ack " << seq << dendl; - - char c = CEPH_MSGR_TAG_ACK; - __u32 s = seq;/*cpu_to_le32(seq);*/ - - struct msghdr msg; - memset(&msg, 0, sizeof(msg)); - struct iovec msgvec[2]; - msgvec[0].iov_base = &c; - msgvec[0].iov_len = 1; - msgvec[1].iov_base = &s; - msgvec[1].iov_len = sizeof(s); - msg.msg_iov = msgvec; - msg.msg_iovlen = 2; - - if (do_sendmsg(sd, &msg, 5) < 0) - return -1; - return 0; -} - - -int Rank::Pipe::write_message(Message *m) -{ - // get envelope, buffers - ceph_message_header *env = &m->get_envelope(); - bufferlist blist; - blist.claim( m->get_payload() ); - - // chunk out page aligned buffers? 
- if (blist.length() == 0) - env->nchunks = 0; - else { - env->nchunks = 1 + m->get_chunk_payload_at().size(); // header + explicit chunk points - if (!m->get_chunk_payload_at().empty()) - dout(20) << "chunking at " << m->get_chunk_payload_at() - << " in " << *m << " len " << blist.length() - << dendl; - } - - dout(20) << "write_message " << m << " " << *m - << " to " << m->get_dest() - << " in " << env->nchunks - << dendl; - - // set up msghdr and iovecs - struct msghdr msg; - memset(&msg, 0, sizeof(msg)); - struct iovec msgvec[2 + blist.buffers().size() + env->nchunks*2]; // conservative upper bound - msg.msg_iov = msgvec; - int msglen = 0; - - // send tag - char tag = CEPH_MSGR_TAG_MSG; - msgvec[msg.msg_iovlen].iov_base = &tag; - msgvec[msg.msg_iovlen].iov_len = 1; - msglen++; - msg.msg_iovlen++; - - // send envelope - msgvec[msg.msg_iovlen].iov_base = (char*)env; - msgvec[msg.msg_iovlen].iov_len = sizeof(*env); - msglen += sizeof(*env); - msg.msg_iovlen++; - - // payload - list::const_iterator pb = blist.buffers().begin(); - list::const_iterator pc = m->get_chunk_payload_at().begin(); - int b_off = 0; // carry-over buffer offset, if any - int bl_pos = 0; // blist pos - int nchunks = env->nchunks; - int32_t chunksizes[nchunks]; - - for (int curchunk=0; curchunk < nchunks; curchunk++) { - // start a chunk - int32_t size = blist.length() - bl_pos; - if (pc != m->get_chunk_payload_at().end()) { - assert(*pc > bl_pos); - size = *pc - bl_pos; - dout(30) << "pos " << bl_pos << " explicit chunk at " << *pc << " size " << size << " of " << blist.length() << dendl; - pc++; - } - assert(size > 0); - dout(30) << "chunk " << curchunk << " pos " << bl_pos << " size " << size << dendl; - - // chunk size - chunksizes[curchunk] = size; - msgvec[msg.msg_iovlen].iov_base = &chunksizes[curchunk]; - msgvec[msg.msg_iovlen].iov_len = sizeof(int32_t); - msglen += sizeof(int32_t); - msg.msg_iovlen++; - - // chunk contents - int left = size; - while (left > 0) { - int donow = MIN(left, 
(int)pb->length()-b_off); - assert(donow > 0); - dout(30) << " bl_pos " << bl_pos << " b_off " << b_off - << " leftinchunk " << left - << " buffer len " << pb->length() - << " writing " << donow - << dendl; - - if (msg.msg_iovlen >= IOV_MAX-1) { - if (do_sendmsg(sd, &msg, msglen)) - return -1; - - // and restart the iov - msg.msg_iov = msgvec; - msg.msg_iovlen = 0; - msglen = 0; - } - - msgvec[msg.msg_iovlen].iov_base = (void*)(pb->c_str()+b_off); - msgvec[msg.msg_iovlen].iov_len = donow; - msglen += donow; - msg.msg_iovlen++; - - left -= donow; - assert(left >= 0); - b_off += donow; - bl_pos += donow; - if (b_off != (int)pb->length()) - break; - pb++; - b_off = 0; - } - assert(left == 0); - } - assert(pb == blist.buffers().end()); - - // send - if (do_sendmsg(sd, &msg, msglen)) - return -1; - - return 0; -} - - diff --git a/branches/sage/ebofs2/msg/SimpleMessenger.h b/branches/sage/ebofs2/msg/SimpleMessenger.h deleted file mode 100644 index 4ef9144e343ca..0000000000000 --- a/branches/sage/ebofs2/msg/SimpleMessenger.h +++ /dev/null @@ -1,320 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __SIMPLEMESSENGER_H -#define __SIMPLEMESSENGER_H - - -#include -#include -using namespace std; -#include -#include -using namespace __gnu_cxx; - - -#include "include/types.h" - -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/Thread.h" - -#include "Messenger.h" -#include "Message.h" -#include "tcp.h" - - - -/* Rank - per-process - */ -class Rank { -public: - void sigint(); - -private: - class EntityMessenger; - class Pipe; - - // incoming - class Accepter : public Thread { - public: - bool done; - - int listen_sd; - - Accepter() : done(false) {} - - void *entry(); - void stop(); - int start(); - } accepter; - - void sigint(int r); - - - // pipe - class Pipe { - public: - enum { - STATE_ACCEPTING, - STATE_CONNECTING, - STATE_OPEN, - STATE_CLOSED, - STATE_CLOSING - //STATE_GOTCLOSE, // got (but haven't sent) a close - //STATE_SENTCLOSE // sent (but haven't got) a close - }; - - int sd; - int new_sd; - entity_addr_t peer_addr; - - Mutex lock; - int state; - - protected: - - utime_t first_fault; // time of original failure - utime_t last_attempt; // time of last reconnect attempt - - bool reader_running; - bool writer_running; - - list q; - list sent; - Cond cond; - - __u32 connect_seq; - __u32 out_seq; - __u32 in_seq, in_seq_acked; - - int accept(); // server handshake - int connect(); // client handshake - void reader(); - void writer(); - - Message *read_message(); - int write_message(Message *m); - int do_sendmsg(int sd, struct msghdr *msg, int len); - int write_ack(unsigned s); - - void fault(); - void fail(); - - void take_queue(list& ls) { - ls.splice(ls.begin(), q); - ls.splice(ls.begin(), sent); - } - - // threads - class Reader : public Thread { - Pipe *pipe; - public: - Reader(Pipe *p) : pipe(p) {} - void *entry() { pipe->reader(); return 0; } - } reader_thread; - friend class Reader; - - class Writer : public Thread { - Pipe *pipe; - public: - Writer(Pipe *p) : pipe(p) {} - void *entry() { pipe->writer(); return 0; 
} - } writer_thread; - friend class Writer; - - public: - Pipe(int st) : - sd(0), - state(st), - reader_running(false), writer_running(false), - connect_seq(0), - out_seq(0), in_seq(0), in_seq_acked(0), - reader_thread(this), writer_thread(this) { } - - void start_reader() { - reader_running = true; - reader_thread.create(); - } - void start_writer() { - writer_running = true; - writer_thread.create(); - } - - // public constructors - static const Pipe& Server(int s); - static const Pipe& Client(const entity_addr_t& pi); - - entity_addr_t& get_peer_addr() { return peer_addr; } - - void register_pipe(); - void unregister_pipe(); - void dirty_close(); - void join() { - if (writer_thread.is_started()) writer_thread.join(); - if (reader_thread.is_started()) { - reader_thread.kill(SIGUSR1); - reader_thread.join(); - } - } - - void send(Message *m) { - lock.Lock(); - _send(m); - lock.Unlock(); - } - void _send(Message *m) { - q.push_back(m); - m->set_seq(++out_seq); - cond.Signal(); - } - - void force_close() { - ::close(sd); - } - }; - - - // messenger interface - class EntityMessenger : public Messenger { - Mutex lock; - Cond cond; - list dispatch_queue; - list prio_dispatch_queue; - bool stop; - int qlen, pqlen; - int my_rank; - entity_addr_t my_addr; - - class DispatchThread : public Thread { - EntityMessenger *m; - public: - DispatchThread(EntityMessenger *_m) : m(_m) {} - void *entry() { - m->dispatch_entry(); - return 0; - } - } dispatch_thread; - void dispatch_entry(); - - friend class Rank; - - public: - void queue_message(Message *m) { - // set recv stamp - m->set_recv_stamp(g_clock.now()); - - lock.Lock(); - if (m->get_source().is_mon()) { - prio_dispatch_queue.push_back(m); - pqlen++; - } else { - qlen++; - dispatch_queue.push_back(m); - } - cond.Signal(); - lock.Unlock(); - } - - public: - EntityMessenger(entity_name_t myaddr, int r) : - Messenger(myaddr), - stop(false), - qlen(0), pqlen(0), - my_rank(r), - dispatch_thread(this) { } - ~EntityMessenger() { - 
// join dispatch thread - if (dispatch_thread.is_started()) - dispatch_thread.join(); - } - - void ready(); - bool is_stopped() { return stop; } - - void wait() { - dispatch_thread.join(); - } - - const entity_addr_t &get_myaddr() { return my_addr; } - - int get_dispatch_queue_len() { return qlen + pqlen; } - - void reset_myname(entity_name_t m); - - int shutdown(); - void suicide(); - void prepare_dest(const entity_addr_t& addr); - int send_message(Message *m, entity_inst_t dest); - - void mark_down(entity_addr_t a); - void mark_up(entity_name_t a, entity_addr_t& i); - }; - - - // Rank stuff - public: - Mutex lock; - Cond wait_cond; // for wait() - bool started; - - // where i listen - entity_addr_t rank_addr; - - // local - unsigned max_local, num_local; - vector local; - vector stopped; - - // remote - hash_map rank_pipe; - - set pipes; - list pipe_reap_queue; - - Pipe *connect_rank(const entity_addr_t& addr); - - const entity_addr_t &get_rank_addr() { return rank_addr; } - - void mark_down(entity_addr_t addr); - - void reaper(); - -public: - Rank() : started(false), - max_local(0), num_local(0) { } - ~Rank() { } - - //void set_listen_addr(tcpaddr_t& a); - - int start_rank(); - void wait(); - - EntityMessenger *register_entity(entity_name_t addr); - void rename_entity(EntityMessenger *ms, entity_name_t newaddr); - void unregister_entity(EntityMessenger *ms); - - void submit_message(Message *m, const entity_addr_t& addr); - void prepare_dest(const entity_addr_t& addr); - - // create a new messenger - EntityMessenger *new_entity(entity_name_t addr); - -} ; - - - -extern Rank rank; - -#endif diff --git a/branches/sage/ebofs2/msg/msg_types.h b/branches/sage/ebofs2/msg/msg_types.h deleted file mode 100644 index a9e3ec4f970f8..0000000000000 --- a/branches/sage/ebofs2/msg/msg_types.h +++ /dev/null @@ -1,191 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * 
Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MSG_TYPES_H -#define __MSG_TYPES_H - -#include "include/types.h" -#include "include/blobhash.h" -#include "tcp.h" - -class entity_name_t { - struct ceph_entity_name v; - -public: - static const int TYPE_MON = CEPH_ENTITY_TYPE_MON; - static const int TYPE_MDS = CEPH_ENTITY_TYPE_MDS; - static const int TYPE_OSD = CEPH_ENTITY_TYPE_OSD; - static const int TYPE_CLIENT = CEPH_ENTITY_TYPE_CLIENT; - static const int TYPE_ADMIN = CEPH_ENTITY_TYPE_ADMIN; - - static const int NEW = -1; - - // cons - entity_name_t() { v.type = v.num = 0; } - entity_name_t(int t, int n=NEW) { v.type = t; v.num = n; } - - // static cons - static entity_name_t MON(int i=NEW) { return entity_name_t(TYPE_MON, i); } - static entity_name_t MDS(int i=NEW) { return entity_name_t(TYPE_MDS, i); } - static entity_name_t OSD(int i=NEW) { return entity_name_t(TYPE_OSD, i); } - static entity_name_t CLIENT(int i=NEW) { return entity_name_t(TYPE_CLIENT, i); } - static entity_name_t ADMIN(int i=NEW) { return entity_name_t(TYPE_ADMIN, i); } - - int num() const { return v.num; } - int type() const { return v.type; } - const char *type_str() const { - switch (type()) { - case TYPE_MDS: return "mds"; - case TYPE_OSD: return "osd"; - case TYPE_MON: return "mon"; - case TYPE_CLIENT: return "client"; - case TYPE_ADMIN: return "admin"; - default: return "unknown"; - } - } - - bool is_new() const { return num() < 0; } - - bool is_client() const { return type() == TYPE_CLIENT; } - bool is_mds() const { return type() == TYPE_MDS; } - bool is_osd() const { return type() == TYPE_OSD; } - bool is_mon() const { return type() == TYPE_MON; } - bool is_admin() const { return type() == TYPE_ADMIN; } -}; - -inline bool operator== (const entity_name_t& l, const 
entity_name_t& r) { - return (l.type() == r.type()) && (l.num() == r.num()); } -inline bool operator!= (const entity_name_t& l, const entity_name_t& r) { - return (l.type() != r.type()) || (l.num() != r.num()); } -inline bool operator< (const entity_name_t& l, const entity_name_t& r) { - return (l.type() < r.type()) || (l.type() == r.type() && l.num() < r.num()); } - -inline std::ostream& operator<<(std::ostream& out, const entity_name_t& addr) { - //if (addr.is_namer()) return out << "namer"; - if (addr.is_new() || addr.num() < 0) - return out << addr.type_str() << "?"; - else - return out << addr.type_str() << addr.num(); -} -inline std::ostream& operator<<(std::ostream& out, const ceph_entity_name& addr) { - return out << *(const entity_name_t*)&addr; -} - -namespace __gnu_cxx { - template<> struct hash< entity_name_t > - { - size_t operator()( const entity_name_t m ) const - { - static blobhash H; - return H((const char*)&m, sizeof(m)); - } - }; -} - - - -/* - * an entity's network address. - * includes a random value that prevents it from being reused. - * thus identifies a particular process instance. - * ipv4 for now. 
- */ -struct entity_addr_t { - struct ceph_entity_addr v; - - entity_addr_t() { - memset(&v, 0, sizeof(v)); - } - - void set_addr(sockaddr_in& a) { - memcpy((char*)&v.ipaddr, (char*)&a, sizeof(a)); - } - void make_addr(sockaddr_in& a) const { - memcpy((char*)&a, (char*)&v.ipaddr, sizeof(a)); - } - void set_port(int port) { - v.ipaddr.sin_port = htons(port); - } - int get_port() { - return ntohs(v.ipaddr.sin_port); - } -}; - -inline ostream& operator<<(ostream& out, const entity_addr_t &addr) -{ - return out << addr.v.ipaddr - << '#' << addr.v.nonce - << '@' << addr.v.erank; -} - -inline bool operator==(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) == 0; } -inline bool operator!=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) != 0; } -inline bool operator<(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) < 0; } -inline bool operator<=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) <= 0; } -inline bool operator>(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) > 0; } -inline bool operator>=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) >= 0; } - -namespace __gnu_cxx { - template<> struct hash< entity_addr_t > - { - size_t operator()( const entity_addr_t& x ) const - { - static blobhash H; - return H((const char*)&x, sizeof(x)); - } - }; -} - - -/* - * a particular entity instance - */ -struct entity_inst_t { - entity_name_t name; - entity_addr_t addr; - entity_inst_t() {} - entity_inst_t(entity_name_t n, const entity_addr_t& a) : name(n), addr(a) {} -}; - - -inline bool operator==(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) == 0; } -inline bool operator!=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) != 0; } -inline bool operator<(const entity_inst_t& a, const entity_inst_t& b) { 
return memcmp(&a, &b, sizeof(a)) < 0; } -inline bool operator<=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) <= 0; } -inline bool operator>(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) > 0; } -inline bool operator>=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) >= 0; } - -namespace __gnu_cxx { - template<> struct hash< entity_inst_t > - { - size_t operator()( const entity_inst_t& x ) const - { - static blobhash H; - return H((const char*)&x, sizeof(x)); - } - }; -} - -inline ostream& operator<<(ostream& out, const entity_inst_t &i) -{ - return out << i.name << " " << i.addr; -} -inline ostream& operator<<(ostream& out, const ceph_entity_inst &i) -{ - return out << *(const entity_inst_t*)&i; -} - - - -#endif diff --git a/branches/sage/ebofs2/msg/tcp.h b/branches/sage/ebofs2/msg/tcp.h deleted file mode 100644 index a59cf2a8ac47f..0000000000000 --- a/branches/sage/ebofs2/msg/tcp.h +++ /dev/null @@ -1,67 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -#ifndef __TCP_H -#define __TCP_H - -#include -#include -#include -#include - -using std::ostream; - -inline ostream& operator<<(ostream& out, const sockaddr_in &a) -{ - unsigned char addr[4]; - memcpy((char*)addr, (char*)&a.sin_addr.s_addr, 4); - out << (unsigned)addr[0] << "." - << (unsigned)addr[1] << "." - << (unsigned)addr[2] << "." 
- << (unsigned)addr[3] << ":" - << ntohs(a.sin_port); - return out; -} - -inline bool tcp_read(int sd, char *buf, int len) { - while (len > 0) { - int got = ::recv( sd, buf, len, 0 ); - if (got <= 0) { - //generic_dout(18) << "tcp_read socket " << sd << " closed" << dendl; - return false; - } - len -= got; - buf += got; - //generic_dout(DBL) << "tcp_read got " << got << ", " << len << " left" << dendl; - } - return true; -} - -inline int tcp_write(int sd, const char *buf, int len) { - //generic_dout(DBL) << "tcp_write writing " << len << dendl; - assert(len > 0); - while (len > 0) { - int did = ::send( sd, buf, len, 0 ); - if (did < 0) { - //generic_dout(1) << "tcp_write error did = " << did << " errno " << errno << " " << strerror(errno) << dendl; - //generic_derr(1) << "tcp_write error did = " << did << " errno " << errno << " " << strerror(errno) << dendl; - return did; - } - len -= did; - buf += did; - //generic_dout(DBL) << "tcp_write did " << did << ", " << len << " left" << dendl; - } - return 0; -} - - -extern int tcp_hostlookup(char *str, sockaddr_in& ta); - -inline bool operator==(const sockaddr_in& a, const sockaddr_in& b) { - return strncmp((const char*)&a, (const char*)&b, sizeof(a)) == 0; -} -inline bool operator!=(const sockaddr_in& a, const sockaddr_in& b) { - return strncmp((const char*)&a, (const char*)&b, sizeof(a)) != 0; -} - - -#endif diff --git a/branches/sage/ebofs2/newsyn.cc b/branches/sage/ebofs2/newsyn.cc deleted file mode 100644 index 4b37591cdca69..0000000000000 --- a/branches/sage/ebofs2/newsyn.cc +++ /dev/null @@ -1,438 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#define intabs(x) ((x) >= 0 ? (x):(-(x))) - -#include - -#include -#include -#include -using namespace std; - -#include - -#include "config.h" - -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "mon/Monitor.h" -#include "client/Client.h" -#include "client/SyntheticClient.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - -class C_Test : public Context { -public: - void finish(int r) { - cout << "C_Test->finish(" << r << ")" << std::endl; - } -}; - -extern std::map g_fake_kill_after; - - -/* - * start up NewMessenger via MPI. - */ - -pair mpi_bootstrap_new(int& argc, char**& argv, MonMap *monmap) -{ - MPI_Init(&argc, &argv); - - int mpi_world; - int mpi_rank; - MPI_Comm_size(MPI_COMM_WORLD, &mpi_world); - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - - // first, synchronize clocks. - if (g_conf.clock_tare) { - if (1) { - // use an MPI barrier. probably not terribly precise. - MPI_Barrier(MPI_COMM_WORLD); - g_clock.tare(); - } else { - // use wall clock; assume NTP has all nodes synchronized already. - // FIXME someday: this hangs for some reason. whatever. - utime_t z = g_clock.now(); - MPI_Bcast( &z, sizeof(z), MPI_CHAR, - 0, MPI_COMM_WORLD); - cout << "z is " << z << std::endl; - g_clock.tare(z); - } - } - - // start up all monitors at known addresses. - entity_inst_t moninst[mpi_world]; // only care about first g_conf.num_mon of these. 
- - rank.start_rank(); // bind and listen - - if (mpi_rank < g_conf.num_mon) { - moninst[mpi_rank].addr = rank.rank_addr; - moninst[mpi_rank].name = entity_name_t(entity_name_t::TYPE_MON, mpi_rank); - - //cerr << mpi_rank << " at " << rank.get_listen_addr() << std::endl; - } - - MPI_Gather( &moninst[mpi_rank], sizeof(entity_inst_t), MPI_CHAR, - moninst, sizeof(entity_inst_t), MPI_CHAR, - 0, MPI_COMM_WORLD); - - if (mpi_rank == 0) { - for (int i=0; imon_inst[i] = moninst[i]; - } - } - - - // distribute monmap - bufferlist bl; - if (mpi_rank == 0) { - monmap->encode(bl); - monmap->write(".ceph_monmap"); - } else { - int l = g_conf.num_mon * 1000; // nice'n big. - bufferptr bp(l); - bl.append(bp); - } - - MPI_Bcast(bl.c_str(), bl.length(), MPI_CHAR, - 0, MPI_COMM_WORLD); - - if (mpi_rank > 0) { - monmap->decode(bl); - } - - // wait for everyone! - MPI_Barrier(MPI_COMM_WORLD); - - return pair(mpi_rank, mpi_world); -} - -utime_t tick_start; -int tick_count = 0; - -class C_Tick : public Context { -public: - void finish(int) { - utime_t now = g_clock.now() - tick_start; - cout << "tick +" << g_conf.tick << " -> " << now << " (" << tick_count << ")" << std::endl; - tick_count += g_conf.tick; - utime_t next = tick_start; - next.sec_ref() += tick_count; - g_timer.add_event_at(next, new C_Tick); - } -}; - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << std::endl; - _exit(1); - } -}; - -class C_Debug : public Context { - public: - void finish(int) { - int size = (long)&g_conf.debug_after - (long)&g_conf.debug; - memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size); - cout << "debug_after flipping debug settings" << std::endl; - //g_conf.debug_ms = 1; - } -}; - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - map kill_osd_after; - if (1) { - vector nargs; - for (unsigned i=0; i 0 ? g_conf.num_mon:0; - int start_mds = g_conf.num_mds > 0 ? 
g_conf.num_mds:0; - int start_osd = g_conf.num_osd > 0 ? g_conf.num_osd:0; - int start_client = g_conf.num_client > 0 ? g_conf.num_client:0; - - //g_conf.num_mon = intabs(g_conf.num_mon); - g_conf.num_mds = intabs(g_conf.num_mds); - g_conf.num_client = intabs(g_conf.num_client); - g_conf.num_osd = intabs(g_conf.num_osd); - - - if (g_conf.kill_after) - g_timer.add_event_after(g_conf.kill_after, new C_Die); - if (g_conf.debug_after) - g_timer.add_event_after(g_conf.debug_after, new C_Debug); - - if (g_conf.tick) { - tick_start = g_clock.now(); - g_timer.add_event_after(g_conf.tick, new C_Tick); - } - - vector nargs; - for (unsigned i=0; i mpiwho = mpi_bootstrap_new(argc, argv, monmap); - int mpirank = mpiwho.first; - int world = mpiwho.second; - - int need = 0; - if (g_conf.ms_skip_rank0) need++; - need += start_mds; - if (g_conf.ms_stripe_osds) - need++; - else - need += start_osd; - if (start_client) { - if (!g_conf.ms_overlay_clients) - need += 1; - } - assert(need <= world); - - if (mpirank == 0) - cerr << "nummds " << start_mds << " numosd " << start_osd << " numclient " << start_client << " .. need " << need << ", have " << world << std::endl; - - - char hostname[100]; - gethostname(hostname,100); - int pid = getpid(); - - int started = 0; - - //if (mpirank == 0) g_conf.debug = 20; - - // courtesy symlinks - char ffrom[100]; - char fto[100]; - sprintf(fto, "%s.%d", hostname, pid); - - - // create mon - if (mpirank < g_conf.num_mon) { - Monitor *mon = new Monitor(mpirank, rank.register_entity(entity_name_t(entity_name_t::TYPE_MON, mpirank)), monmap); - mon->init(); - if (g_conf.dout_dir) { - sprintf(ffrom, "%s/mon%d", g_conf.dout_dir, mpirank); - ::symlink(fto, ffrom); - } - } - - // wait for monitors to start. - MPI_Barrier(MPI_COMM_WORLD); - - // okay, home free! - MPI_Finalize(); - - - // create mds - map mds; - map mdsosd; - for (int i=0; iget_myaddr() << " " << hostname << "." 
<< pid << std::endl; - if (g_conf.dout_dir) { - sprintf(ffrom, "%s/mds%d", g_conf.dout_dir, i); - ::symlink(fto, ffrom); - } - mds[i] = new MDS(i, m, monmap); - mds[i]->init(); - started++; - - if (g_conf.mds_local_osd) { - int n = i+g_conf.mds_local_osd_offset; - mdsosd[i] = new OSD(n, rank.register_entity(entity_name_t(entity_name_t::TYPE_OSD, n)), monmap); - mdsosd[i]->init(); - } - - if (g_fake_kill_after.count(entity_name_t::MDS(i))) { - cerr << "mds" << i << " will die after " << g_fake_kill_after[entity_name_t::MDS(i)] << std::endl; - g_timer.add_event_after(g_fake_kill_after[entity_name_t::MDS(i)], new C_Die); - } - } - - // create osd - map osd; - int max_osd_nodes = world - start_mds - g_conf.ms_skip_rank0; // assumes 0 clients, if we stripe. - int osds_per_node = (start_osd-1)/max_osd_nodes + 1; - for (int i=0; iget_myaddr() << " " << hostname << "." << pid << std::endl; - if (g_conf.dout_dir) { - sprintf(ffrom, "%s/osd%d", g_conf.dout_dir, i); - ::symlink(fto, ffrom); - } - - osd[i] = new OSD(i, m, monmap); - if (osd[i]->init() < 0) - return 1; - started++; - } - - if (g_conf.ms_overlay_clients) sleep(5); - - // create client - int skip_osd = start_osd; - if (g_conf.ms_overlay_clients) - skip_osd = 0; // put clients with osds too! - int client_nodes = world - start_mds - skip_osd - g_conf.ms_skip_rank0; - int clients_per_node = 1; - if (start_client && client_nodes > 0) clients_per_node = (start_client-1) / client_nodes + 1; - set clientlist; - map client;//[start_client]; - map syn;//[start_client]; - int nclients = 0; - for (int i=0; i::iterator it = clientlist.begin(); - it != clientlist.end(); - it++) { - int i = *it; - - //cerr << "starting synthetic client" << i << " on rank " << mpirank << std::endl; - syn[i]->start_thread(); - } - if (nclients) { - cerr << nclients << " clients at " << rank.rank_addr << " " << hostname << "." 
<< pid << std::endl; - } - - for (set::iterator it = clientlist.begin(); - it != clientlist.end(); - it++) { - int i = *it; - // cout << "waiting for synthetic client" << i << " to finish" << std::endl; - syn[i]->join_thread(); - // fix simpelmeessenger race first! - //delete syn[i]; - //delete client[i]; - } - - - if (mpirank && !started) { - //dout(1) << "IDLE" << dendl; - cerr << "idle at " << rank.rank_addr << " mpirank " << mpirank << " " << hostname << "." << pid << std::endl; - } - - // wait for everything to finish - rank.wait(); - - cerr << "newsyn done on " << hostname << "." << pid << std::endl; - - // cd on exit, so that gmon.out (if any) goes into a separate directory for each node. - char s[20]; - sprintf(s, "gmon/%d", mpirank); - mkdir(s, 0755); - chdir(s); - - return 0; // whatever, cleanup hangs sometimes (stopping ebofs threads?). - - // cleanup - for (map::iterator i = mds.begin(); i != mds.end(); i++) - delete i->second; - for (map::iterator i = mdsosd.begin(); i != mdsosd.end(); i++) - delete i->second; - for (map::iterator i = osd.begin(); i != osd.end(); i++) - delete i->second; - /* - for (map::iterator i = client.begin(); i != client.end(); i++) - delete i->second; - for (map::iterator i = syn.begin(); i != syn.end(); i++) - delete i->second; - */ - /* - for (int i=0; i - -Ceph - scalable distributed file system - -This is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License version 2.1, as published by the Free Software -Foundation. See file COPYING. */ - - -#include -#include -#include -#include "OSBDB.h" -#include "common/Timer.h" - -using namespace std; - -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_bdbstore) *_dout << dbeginl << "bdbstore(" << device << ")@" << __LINE__ << "." -#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_bdbstore) *_derr << dbeginl << "bdbstore(" << device << ")@" << __LINE__ << "." 
- -#define CLEANUP(onsafe) do { \ - dout(6) << "DELETE " << hex << onsafe << dec << dendl; \ - delete onsafe; \ - } while (0) -#define COMMIT(onsafe) do { \ - dout(6) << "COMMIT " << hex << onsafe << dec << dendl; \ - sync(onsafe); \ - } while (0) - - // Have a lock, already. - -class scoped_lock -{ -private: - Mutex *m; -public: - scoped_lock(Mutex *m) : m(m) { m->Lock(); } - ~scoped_lock() { m->Unlock(); } -}; - - // Utilities. - -// Starting off with my own bsearch; mail reader to follow... - -// Perform a binary search on a sorted array, returning the insertion -// point for key, or key if it is exactly found. In other words, this -// will return a pointer to the element that will come after key if -// key were to be inserted into the sorted array. -// -// Requires that T have < and > operators defined. -template -uint32_t binary_search (T *array, size_t size, T key) -{ - int low = 0; - int high = size; - int p = (low + high) / 2; - - while (low < high - 1) - { - if (array[p] > key) - { - high = p; - } - else if (array[p] < key) - { - low = p; - } - else - return p; - - p = (low + high) / 2; - } - - if (array[p] < key) - p++; - else if (array[p] > key && p > 0) - p--; - return p; -} - - // Management. 
- -DbEnv *OSBDB::getenv () -{ - DbEnv *envp = new DbEnv (DB_CXX_NO_EXCEPTIONS); - if (g_conf.debug > 1 || g_conf.debug_bdbstore > 1) - envp->set_error_stream (&std::cerr); - if (g_conf.debug > 2 || g_conf.debug_bdbstore > 2) - envp->set_message_stream (&std::cout); - envp->set_flags (DB_LOG_INMEMORY, 1); - //env->set_flags (DB_DIRECT_DB, 1); - int env_flags = (DB_CREATE - | DB_THREAD - //| DB_INIT_LOCK - | DB_INIT_MPOOL - //| DB_INIT_TXN - //| DB_INIT_LOG - | DB_PRIVATE); - if (envp->open (NULL, env_flags, 0) != 0) - { - std::cerr << "failed to open environment " << std::endl; - assert(0); - } - return envp; -} - -int OSBDB::opendb(DBTYPE type, int flags, bool new_env) -{ - env = getenv(); - db = new Db(env, 0); - db->set_error_stream (&std::cerr); - db->set_message_stream (&std::cout); - db->set_flags (0); - if (!g_conf.bdbstore_btree) - { - if (g_conf.bdbstore_pagesize > 0) - db->set_pagesize (g_conf.bdbstore_pagesize); - if (g_conf.bdbstore_ffactor > 0 && g_conf.bdbstore_nelem > 0) - { - db->set_h_ffactor (g_conf.bdbstore_ffactor); - db->set_h_nelem (g_conf.bdbstore_nelem); - } - } - if (g_conf.bdbstore_cachesize > 0) - { - db->set_cachesize (0, g_conf.bdbstore_cachesize, 0); - } - - flags = flags | DB_THREAD; - if (transactional) - flags = flags | DB_AUTO_COMMIT; - - int ret; - if ((ret = db->open (NULL, device.c_str(), NULL, type, flags, 0)) != 0) - { - derr(1) << "failed to open database: " << device << ": " - << db_strerror(ret) << std::dendl; - return -EINVAL; - } - opened = true; - return 0; -} - -int OSBDB::mount() -{ - dout(2) << "mount " << device << dendl; - - if (mounted) - { - dout(4) << "..already mounted" << dendl; - return 0; - } - - if (!opened) - { - int ret; - if ((ret = opendb ()) != 0) - { - dout(4) << "..returns " << ret << dendl; - return ret; - } - } - - // XXX Do we want anything else in the superblock? 
- - Dbt key (OSBDB_SUPERBLOCK_KEY, 1); - stored_superblock super; - Dbt value (&super, sizeof (super)); - value.set_dlen (sizeof (super)); - value.set_ulen (sizeof (super)); - value.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - - if (db->get (NULL, &key, &value, 0) != 0) - { - dout(4) << "..get superblock fails" << dendl; - return -EINVAL; // XXX how to say "badly formed fs?" - } - - dout(3) << ".mount " << super << dendl; - - if (super.version != OSBDB_THIS_VERSION) - { - dout(4) << "version mismatch (" << super.version << ")" << dendl; - return -EINVAL; - } - - DBTYPE t; - db->get_type (&t); - - if (t == DB_BTREE) - { - u_int32_t minkey; - u_int32_t flags; - db->get_bt_minkey (&minkey); - db->get_flags (&flags); - dout(1) << "mounted version " << OSBDB_THIS_VERSION << "; Btree; " - << "min keys per page: " << minkey << "; flags: " - << hex << flags << dec << dendl; - cout << dec; - } - else - { - u_int32_t ffactor; - u_int32_t nelem; - u_int32_t flags; - db->get_h_ffactor (&ffactor); - db->get_h_nelem (&nelem); - db->get_flags (&flags); - dout(1) << "mounted version " << OSBDB_THIS_VERSION << "; Hash; " - << "fill factor: " << ffactor - << " table size: " << nelem << "; flags: " - << hex << flags << dec << dendl; - cout << dec; - } - - mounted = true; - dout(4) << "..mounted" << dendl; - return 0; -} - -int OSBDB::umount() -{ - if (!mounted) - return -EINVAL; - - dout(2) << "umount" << dendl; - - int ret; - if (opened) - { - if (transactional) - { - env->log_flush (NULL); - if ((ret = env->lsn_reset (device.c_str(), 0)) != 0) - { - derr(1) << "lsn_reset: " << db_strerror (ret) << dendl; - } - } - - db->sync (0); - - if ((ret = db->close (0)) != 0) - { - derr(1) << "close: " << db_strerror(ret) << dendl; - return -EINVAL; - } - delete db; - db = NULL; - - if (env) - { - env->close (0); - delete env; - env = NULL; - } - } - mounted = false; - opened = false; - dout(4) << "..unmounted" << dendl; - return 0; -} - -int OSBDB::mkfs() -{ - if (mounted) - return 
-EINVAL; - - dout(2) << "mkfs" << dendl; - - string d = env_dir; - d += device; - unlink (d.c_str()); - - int ret; - if ((ret = opendb((g_conf.bdbstore_btree ? DB_BTREE : DB_HASH), - DB_CREATE, true)) != 0) - { - derr(1) << "failed to open database: " << device << ": " - << db_strerror(ret) << std::dendl; - return -EINVAL; - } - opened = true; - dout(3) << "..opened " << device << dendl; - - uint32_t c; - ret = db->truncate (NULL, &c, 0); - if (ret != 0) - { - derr(1) << "db truncate failed: " << db_strerror (ret) << dendl; - return -EIO; // ??? - } - - Dbt key (OSBDB_SUPERBLOCK_KEY, 1); - struct stored_superblock sb; - sb.version = OSBDB_THIS_VERSION; - Dbt value (&sb, sizeof (sb)); - - dout(3) << "..writing superblock" << dendl; - if ((ret = db->put (NULL, &key, &value, 0)) != 0) - { - derr(1) << "failed to write superblock: " << db_strerror (ret) - << dendl; - return -EIO; - } - dout(3) << "..wrote superblock" << dendl; - dout(4) << "..mkfs done" << dendl; - return 0; -} - - // Objects. - -int OSBDB::pick_object_revision_lt(object_t& oid) -{ - // Not really needed. - dout(0) << "pick_object_revision_lt " << oid << dendl; - return -ENOSYS; -} - -bool OSBDB::exists(object_t oid) -{ - dout(2) << "exists " << oid << dendl; - struct stat st; - bool ret = (stat (oid, &st) == 0); - dout(4) << "..returns " << ret << dendl; - return ret; -} - -int OSBDB::statfs (struct statfs *st) -{ - // Hacky? - if (::statfs (device.c_str(), st) != 0) - { - int ret = -errno; - derr(1) << "statfs returns " << ret << dendl; - return ret; - } - st->f_type = OSBDB_MAGIC; - dout(4) << "..statfs OK" << dendl; - return 0; -} - -int OSBDB::stat(object_t oid, struct stat *st) -{ - if (!mounted) - { - dout(4) << "not mounted!" 
<< dendl; - return -EINVAL; - } - - dout(2) << "stat " << oid << dendl; - - object_inode_key ikey = new_object_inode_key(oid); - stored_object obj; - Dbt key (&ikey, sizeof_object_inode_key()); - Dbt value (&obj, sizeof (obj)); - value.set_flags (DB_DBT_USERMEM); - value.set_ulen (sizeof (obj)); - - dout(3) << " lookup " << ikey << dendl; - int ret; - if ((ret = db->get (NULL, &key, &value, 0)) != 0) - { - derr(1) << " get returned " << ret << dendl; - return -ENOENT; - } - - st->st_size = obj.length; - dout(3) << "stat length:" << obj.length << dendl; - dout(4) << "..stat OK" << dendl; - return 0; -} - -int OSBDB::remove(object_t oid, Context *onsafe) -{ - if (!mounted) - { - derr(1) << "not mounted!" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - dout(6) << "Context " << hex << onsafe << dec << dendl; - scoped_lock __lock(&lock); - dout(2) << "remove " << oid << dendl; - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - oid_t id; - mkoid(id, oid); - Dbt key (&id, sizeof (oid_t)); - int ret; - if ((ret = db->del (txn, &key, 0)) != 0) - { - derr(1) << ".del returned error: " << db_strerror (ret) << dendl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - object_inode_key _ikey = new_object_inode_key (oid); - Dbt ikey (&_ikey, sizeof_object_inode_key()); - if ((ret = db->del (txn, &ikey, 0)) != 0) - { - derr(1) << ".del returned error: " << db_strerror (ret) << dendl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - attrs_id aids = new_attrs_id (oid); - Dbt askey (&aids, sizeof_attrs_id()); - Dbt asval; - asval.set_flags (DB_DBT_MALLOC); - if (db->get (txn, &askey, &asval, 0) == 0) - { - // We have attributes; remove them. 
- stored_attrs *sap = (stored_attrs *) asval.get_data(); - auto_ptr sa (sap); - for (unsigned i = 0; i < sap->count; i++) - { - attr_id aid = new_attr_id (oid, sap->names[i].name); - Dbt akey (&aid, sizeof (aid)); - if ((ret = db->del (txn, &akey, 0)) != 0) - { - derr(1) << ".del returns error: " << db_strerror (ret) << dendl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - } - if ((ret = db->del (txn, &askey, 0)) != 0) - { - derr(1) << ".del returns error: " << db_strerror (ret) << dendl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - } - - // XXX check del return value - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - dout(4) << "..remove OK" << dendl; - return 0; -} - -int OSBDB::truncate(object_t oid, off_t size, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - - if (!mounted) - { - derr(1) << "not mounted!" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "truncate " << size << dendl; - - if (size > 0xFFFFFFFF) - { - derr(1) << "object size too big!" 
<< dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -ENOSPC; - } - - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - object_inode_key ikey = new_object_inode_key (oid); - stored_object obj; - Dbt key (&ikey, sizeof_object_inode_key()); - Dbt value (&obj, sizeof (obj)); - value.set_dlen (sizeof (obj)); - value.set_ulen (sizeof (obj)); - value.set_flags (DB_DBT_USERMEM); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - dout(4) << "..returns -ENOENT" << dendl; - return -ENOENT; - } - - if (obj.length < size) - { - oid_t id; - mkoid (id, oid); - Dbt okey (&id, sizeof (oid_t)); - char b[] = { '\0' }; - Dbt newVal (b, 1); - newVal.set_doff ((size_t) size); - newVal.set_dlen (1); - newVal.set_ulen (1); - newVal.set_flags (DB_DBT_PARTIAL); - if (db->put (txn, &okey, &newVal, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".updating object failed" << dendl; - return -EIO; - } - - obj.length = size; - value.set_ulen (sizeof (obj)); - if (db->put (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".updating object info failed" << dendl; - return -EIO; - } - } - else if (obj.length > size) - { - obj.length = size; - Dbt tval (&obj, sizeof (obj)); - tval.set_ulen (sizeof (obj)); - tval.set_flags (DB_DBT_USERMEM); - if (db->put (txn, &key, &tval, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".updating object info failed" << dendl; - return -EIO; - } - if (size == 0) - { - char x[1]; - oid_t id; - mkoid (id, oid); - Dbt okey (&id, sizeof (oid_t)); - Dbt oval (&x, 0); - if (db->put (txn, &okey, &oval, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".updating object failed" << dendl; - return -EIO; - } - } - else - { - oid_t id; - mkoid (id, oid); - Dbt okey (&id, sizeof 
(oid_t)); - Dbt oval; - oval.set_flags (DB_DBT_MALLOC); - if (db->get (txn, &okey, &oval, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".getting old object failed" << dendl; - return -EIO; - } - auto_ptr ovalPtr ((char *) oval.get_data()); - oval.set_size ((size_t) size); - oval.set_ulen ((size_t) size); - if (db->put (txn, &okey, &oval, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".putting new object failed" << dendl; - return -EIO; - } - } - } - - if (txn) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..truncate OK" << dendl; - return 0; -} - -int OSBDB::read(object_t oid, off_t offset, size_t len, bufferlist& bl) -{ - if (!mounted) - { - derr(1) << "not mounted!" << dendl; - return -EINVAL; - } - - dout(2) << "read " << oid << " " << offset << " " - << len << dendl; - - if (bl.length() < len) - { - int remain = len - bl.length(); - bufferptr ptr (remain); - bl.push_back(ptr); - } - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - object_inode_key _ikey = new_object_inode_key (oid); - stored_object obj; - Dbt ikey (&_ikey, sizeof_object_inode_key()); - Dbt ival (&obj, sizeof (obj)); - ival.set_flags (DB_DBT_USERMEM); - ival.set_ulen (sizeof(obj)); - - dout(3) << "..get " << _ikey << dendl; - int ret; - if ((ret = db->get (txn, &ikey, &ival, 0)) != 0) - { - if (txn) - txn->abort(); - derr(1) << "get returned " << db_strerror (ret) << dendl; - return -ENOENT; - } - - dout(3) << "..object has size " << obj.length << dendl; - - if (offset == 0 && len >= obj.length) - { - len = obj.length; - dout(3) << "..doing full read of " << len << dendl; - oid_t id; - mkoid (id, oid); - Dbt key (&id, sizeof (oid_t)); - Dbt value (bl.c_str(), len); - value.set_ulen (len); - value.set_flags (DB_DBT_USERMEM); - dout(3) << "..getting " << oid << dendl; - if ((ret = db->get (txn, &key, &value, 0)) != 0) - { - derr(1) << ".get 
returned " << db_strerror (ret) << dendl; - if (txn) - txn->abort(); - return -EIO; - } - } - else - { - if (offset > obj.length) - { - dout(2) << "..offset out of range" << dendl; - return 0; - } - if (offset + len > obj.length) - len = obj.length - (size_t) offset; - dout(3) << "..doing partial read of " << len << dendl; - oid_t id; - mkoid (id, oid); - Dbt key (&id, sizeof (oid)); - Dbt value; - char *data = bl.c_str(); - dout(3) << ".bufferlist c_str returned " << ((void*) data) << dendl; - value.set_data (data); - value.set_doff ((size_t) offset); - value.set_dlen (len); - value.set_ulen (len); - value.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - dout(3) << "..getting " << oid << dendl; - if ((ret = db->get (txn, &key, &value, 0)) != 0) - { - derr(1) << ".get returned " << db_strerror (ret) << dendl; - if (txn) - txn->abort(); - return -EIO; - } - } - - if (txn) - txn->commit (0); - dout(4) << "..read OK, returning " << len << dendl; - return len; -} - -int OSBDB::write(object_t oid, off_t offset, size_t len, - bufferlist& bl, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - derr(1) << "not mounted!" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "write " << oid << " " << offset << " " - << len << dendl; - - if (offset > 0xFFFFFFFFL || offset + len > 0xFFFFFFFFL) - { - derr(1) << "object too big" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -ENOSPC; - } - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (txn, &txn, 0); - - object_inode_key _ikey = new_object_inode_key (oid); - stored_object obj; - Dbt ikey (&_ikey, sizeof_object_inode_key()); - Dbt ival (&obj, sizeof (obj)); - ival.set_ulen (sizeof (obj)); - ival.set_flags (DB_DBT_USERMEM); - - int ret; - dout(3) << "..getting " << _ikey << dendl; - if (db->get (txn, &ikey, &ival, 0) != 0) - { - dout(3) << "..writing new object" << dendl; - - // New object. 
- obj.length = (size_t) offset + len; - dout(3) << "..mapping " << _ikey << " => " - << obj << dendl; - if ((ret = db->put (txn, &ikey, &ival, 0)) != 0) - { - derr(1) << "..put returned " << db_strerror (ret) << dendl; - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - oid_t id; - mkoid (id, oid); - Dbt key (&id, sizeof (oid_t)); - Dbt value (bl.c_str(), len); - if (offset == 0) // whole object - { - value.set_flags (DB_DBT_USERMEM); - value.set_ulen (len); - } - else - { - value.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - value.set_ulen (len); - value.set_doff ((size_t) offset); - value.set_dlen (len); - } - dout(3) << "..mapping " << oid << " => (" - << obj.length << " bytes)" << dendl; - if ((ret = db->put (txn, &key, &value, 0)) != 0) - { - derr(1) << "..put returned " << db_strerror (ret) << dendl; - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..write OK, returning " << len << dendl; - return len; - } - - if (offset == 0 && len >= obj.length) - { - if (len != obj.length) - { - obj.length = len; - if ((ret = db->put (txn, &ikey, &ival, 0)) != 0) - { - derr(1) << " put returned " << db_strerror (ret) << dendl; - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - } - oid_t id; - mkoid(id, oid); - Dbt key (&id, sizeof (oid_t)); - Dbt value (bl.c_str(), len); - if (db->put (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "..writing object failed!" << dendl; - return -EIO; - } - } - else - { - if (offset + len > obj.length) - { - obj.length = (size_t) offset + len; - if (db->put (txn, &ikey, &ival, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "..writing object info failed!" 
<< dendl; - return -EIO; - } - } - oid_t id; - mkoid(id, oid); - Dbt key (&id, sizeof (oid_t)); - Dbt value (bl.c_str(), len); - value.set_doff ((size_t) offset); - value.set_dlen (len); - value.set_ulen (len); - value.set_flags (DB_DBT_PARTIAL); - if (db->put (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "..writing object failed!" << dendl; - return -EIO; - } - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..write OK, returning " << len << dendl; - return len; -} - -int OSBDB::clone(object_t oid, object_t noid) -{ - if (!mounted) - { - derr(1) << "not mounted!" << dendl; - return -EINVAL; - } - - dout(2) << "clone " << oid << ", " << noid << dendl; - - if (exists (noid)) - { - dout(4) << "..target exists; returning -EEXIST" << dendl; - return -EEXIST; - } - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - object_inode_key _ikey = new_object_inode_key (oid); - object_inode_key _nikey = new_object_inode_key (noid); - stored_object obj; - Dbt ikey (&_ikey, sizeof_object_inode_key()); - Dbt ival (&obj, sizeof (obj)); - Dbt nikey (&_nikey, sizeof_object_inode_key()); - ival.set_ulen (sizeof (obj)); - ival.set_flags (DB_DBT_USERMEM); - - oid_t id, nid; - mkoid(id, oid); - mkoid(nid, noid); - Dbt key (&id, sizeof (oid_t)); - Dbt nkey (&oid, sizeof (oid_t)); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - if (db->get (txn, &ikey, &ival, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << "..getting object info failed!" 
<< dendl; - return -ENOENT; - } - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << "..getting original object failed" << dendl; - return -ENOENT; - } - auto_ptr valueptr ((char *) value.get_data()); - - if (db->put (txn, &nikey, &ival, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << "..putting object info failed" << dendl; - return -EIO; - } - if (db->put (txn, &nkey, &value, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << "..putting new object failed" << dendl; - return -EIO; - } - - if (txn) - txn->commit (0); - - dout(4) << "..clone OK" << dendl; - return 0; -} - - // Collections - -int OSBDB::list_collections(list& ls) -{ - if (!mounted) - { - derr(1) << "not mounted!" << dendl; - return -EINVAL; - } - - dout(2) << "list_collections" << dendl; - - Dbt key (COLLECTIONS_KEY, 1); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - if (db->get (NULL, &key, &value, 0) != 0) - { - dout(4) << "..no collections" << dendl; - return 0; // no collections. 
- } - - auto_ptr sc ((stored_colls *) value.get_data()); - stored_colls *scp = sc.get(); - for (uint32_t i = 0; i < sc->count; i++) - ls.push_back (scp->colls[i]); - - dout(4) << "..list_collections returns " << scp->count << dendl; - return scp->count; -} - -int OSBDB::create_collection(coll_t c, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - derr(1) << "not mounted" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "create_collection " << hex << c << dec << dendl; - - Dbt key (COLLECTIONS_KEY, 1); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - stored_colls *scp = NULL; - size_t sz = 0; - bool created = false; - if (db->get (txn, &key, &value, 0) != 0) - { - sz = sizeof (stored_colls) + sizeof (coll_t); - scp = (stored_colls *) malloc (sz); - scp->count = 0; - created = true; - } - else - { - scp = (stored_colls *) value.get_data(); - sz = value.get_size(); - } - - auto_ptr sc (scp); - int ins = 0; - if (scp->count > 0) - ins = binary_search (scp->colls, scp->count, c); - if (ins < scp->count && scp->colls[ins] == c) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".collection " << c << " already exists " << dendl; - return -EEXIST; - } - - dout(3) << "..insertion point: " << ins << dendl; - - // Make room for a new collection ID. 
- if (!created) - { - sz += sizeof (coll_t); - dout(3) << "..increase size to " << sz << dendl; - stored_colls *scp2 = (stored_colls *) realloc (scp, sz); - sc.release (); - sc.reset (scp2); - scp = scp2; - } - - int n = (scp->count - ins) * sizeof (coll_t); - if (n > 0) - { - dout(3) << "..moving " << n << " bytes up" << dendl; - memmove (&scp->colls[ins + 1], &scp->colls[ins], n); - } - scp->count++; - scp->colls[ins] = c; - - dout(3) << "..collections: " << scp << dendl; - - // Put the modified collection list back. - { - Dbt value2 (scp, sz); - if (db->put (txn, &key, &value2, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".writing new collections list failed" << dendl; - return -EIO; - } - } - - // Create the new collection. - { - stored_coll new_coll; - new_coll.count = 0; - Dbt coll_key (&c, sizeof (coll_t)); - Dbt coll_value (&new_coll, sizeof (stored_coll)); - if (db->put (txn, &coll_key, &coll_value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".writing new collection failed" << dendl; - return -EIO; - } - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..create_collection OK" << dendl; - return 0; -} - -int OSBDB::destroy_collection(coll_t c, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - derr(1) << "not mounted" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "destroy_collection " << hex << c << dec << dendl; - - Dbt key (COLLECTIONS_KEY, 1); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".collection list doesn't exist" << dendl; - return -ENOENT; // XXX - } 
- - stored_colls *scp = (stored_colls *) value.get_data(); - auto_ptr valueBuf (scp); - if (scp->count == 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".collection " << c << " not listed" << dendl; - return -ENOENT; - } - uint32_t ins = binary_search (scp->colls, scp->count, c); - dout(4) << "..insertion point is " << ins << dendl; - if (ins >= scp->count || scp->colls[ins] != c) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".collection " << c << " not listed" << dendl; - return -ENOENT; - } - - dout(4) << "..collections list is " << scp << dendl; - - // Move the rest of the list down in memory, if needed. - if (ins < scp->count) - { - size_t n = scp->count - ins - 1; - dout(4) << "..shift list down " << n << dendl; - memmove (&scp->colls[ins], &scp->colls[ins + 1], n); - } - - dout(4) << "..collections list is " << scp << dendl; - - // Modify the record size to be one less. - Dbt nvalue (scp, value.get_size() - sizeof (coll_t)); - nvalue.set_flags (DB_DBT_USERMEM); - if (db->put (txn, &key, &nvalue, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".putting modified collection list failed" << dendl; - return -EIO; - } - - // Delete the collection. 
- Dbt collKey (&c, sizeof (coll_t)); - if (db->del (txn, &collKey, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".deleting collection failed" << dendl; - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - dout(4) << "..destroy_collection OK" << dendl; - return 0; -} - -bool OSBDB::collection_exists(coll_t c) -{ - if (!mounted) - { - derr(1) << "not mounted" << dendl; - return -EINVAL; - } - - dout(2) << "collection_exists " << hex << c << dec << dendl; - - /*Dbt key (COLLECTIONS_KEY, 1); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - if (db->get (NULL, &key, &value, 0) != 0) - { - dout(4) << "..no collection list; return false" << dendl; - return false; - } - - stored_colls *scp = (stored_colls *) value.get_data(); - auto_ptr sc (scp); - dout(5) << "..collection list is " << scp << dendl; - if (scp->count == 0) - { - dout(4) << "..empty collection list; return false" << dendl; - return false; - } - uint32_t ins = binary_search (scp->colls, scp->count, c); - dout(4) << "..insertion point is " << ins << dendl; - - int ret = (scp->colls[ins] == c); - dout(4) << "..returns " << ret << dendl; - return ret;*/ - - Dbt key (&c, sizeof (coll_t)); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - if (db->get (NULL, &key, &value, 0) != 0) - { - dout(4) << "..no collection, return false" << dendl; - return false; - } - void *val = value.get_data(); - free (val); - dout(4) << "..collection exists; return true" << dendl; - return true; -} - -int OSBDB::collection_stat(coll_t c, struct stat *st) -{ - if (!mounted) - { - derr(1) << "not mounted" << dendl; - return -EINVAL; - } - - dout(2) << "collection_stat " << c << dendl; - // XXX is this needed? 
- return -ENOSYS; -} - -int OSBDB::collection_add(coll_t c, object_t o, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - dout(2) << "not mounted" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "collection_add " << hex << c << dec << " " << o << dendl; - - Dbt key (&c, sizeof (coll_t)); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "failed to find collection" << dendl; - return -ENOENT; - } - - size_t sz = value.get_size(); - stored_coll *scp = (stored_coll *) value.get_data(); - auto_ptr sc (scp); - - // Find the insertion point for the new object ID. - uint32_t ins = 0; - if (scp->count > 0) - { - ins = binary_search (scp->objects, scp->count, o); - // Already there? - if (ins < scp->count && scp->objects[ins] == o) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "collection already has object" << dendl; - return -EEXIST; - } - } - - // Make room for the new value, and add it. 
- sz += sizeof (object_t); - scp = (stored_coll *) realloc (scp, sz); - sc.release(); - sc.reset (scp); - dout(3) << "..current collection: " << scp << dendl; - if (ins < scp->count - 1) - { - size_t n = (scp->count - ins) * sizeof (object_t); - dout(3) << "..move up " << n << " bytes" << dendl; - memmove (&scp->objects[ins + 1], &scp->objects[ins], n); - } - scp->count++; - scp->objects[ins] = o; - - dout(3) << "..collection: " << scp << dendl; - - Dbt nvalue (scp, sz); - if (db->put (txn, &key, &nvalue, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "..putting modified collection failed" << dendl; - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - dout(4) << "..collection add OK" << dendl; - return 0; -} - -int OSBDB::collection_remove(coll_t c, object_t o, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - derr(1) << "not mounted" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "collection_remove " << hex << c << dec << " " << o << dendl; - - Dbt key (&c, sizeof (coll_t)); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - dout(1) << "..collection doesn't exist" << dendl; - return -ENOENT; - } - - stored_coll *scp = (stored_coll *) value.get_data(); - auto_ptr sc (scp); - - dout(5) << "..collection is " << scp << dendl; - if (scp->count == 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - dout(1) << "..collection is empty" << dendl; - return -ENOENT; - } - uint32_t ins = binary_search (scp->objects, scp->count, o); - dout(4) << "..insertion point is " << ins << dendl; - if (ins >= scp->count || 
scp->objects[ins] != o) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - dout(1) << "..object not in collection" << dendl; - return -ENOENT; - } - - if (ins < scp->count - 1) - { - size_t n = (scp->count - ins - 1) * sizeof (object_t); - dout(5) << "..moving " << n << " bytes down" << dendl; - memmove (&scp->objects[ins], &scp->objects[ins + 1], n); - } - scp->count--; - - dout(3) << "..collection " << scp << dendl; - - Dbt nval (scp, value.get_size() - sizeof (object_t)); - if (db->put (txn, &key, &nval, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "..putting modified collection failed" << dendl; - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - dout(4) << "..collection remove OK" << dendl; - return 0; -} - -int OSBDB::collection_list(coll_t c, list& o) -{ - if (!mounted) - { - derr(1) << "not mounted" << dendl; - return -EINVAL; - } - - Dbt key (&c, sizeof (coll_t)); - Dbt value; - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - return -ENOENT; - } - - stored_coll *scp = (stored_coll *) value.get_data(); - auto_ptr sc (scp); - for (uint32_t i = 0; i < scp->count; i++) - o.push_back (scp->objects[i]); - - if (txn != NULL) - txn->commit (0); - return 0; -} - - // Attributes - -int OSBDB::_setattr(object_t oid, const char *name, - const void *value, size_t size, Context *onsafe, - DbTxn *txn) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - if (strlen (name) >= OSBDB_MAX_ATTR_LEN) - { - derr(1) << "name too long: " << name << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -ENAMETOOLONG; - } - - scoped_lock __lock(&lock); - - // Add name to attribute list, if needed. 
- attrs_id aids = new_attrs_id (oid); - Dbt attrs_key (&aids, sizeof_attrs_id()); - Dbt attrs_val; - attrs_val.set_flags (DB_DBT_MALLOC); - stored_attrs *sap = NULL; - size_t sz = 0; - - dout(3) << " getting " << aids << dendl; - if (db->get (txn, &attrs_key, &attrs_val, 0) != 0) - { - dout(2) << " first attribute" << dendl; - sz = sizeof (stored_attrs); - sap = (stored_attrs *) malloc(sz); - sap->count = 0; - } - else - { - sz = attrs_val.get_size(); - sap = (stored_attrs *) attrs_val.get_data(); - dout(2) << "..add to list of " << sap->count << " attrs" << dendl; - } - auto_ptr sa (sap); - - attr_name _name; - strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN); - - int ins = 0; - if (sap->count > 0) - ins = binary_search (sap->names, sap->count, _name); - dout(3) << "..insertion point is " << ins << dendl; - if (sap->count == 0 || - (ins >= sap->count || strcmp (sap->names[ins].name, name) != 0)) - { - sz += sizeof (attr_name); - dout(3) << "..realloc " << ((void *) sap) << " to " - << dec << sz << dendl; - sap = (stored_attrs *) realloc (sap, sz); - dout(3) << "..returns " << ((void *) sap) << dendl; - sa.release (); - sa.reset (sap); - int n = (sap->count - ins) * sizeof (attr_name); - if (n > 0) - { - dout(3) << "..move " << n << " bytes from 0x" - << hex << (&sap->names[ins]) << " to 0x" - << hex << (&sap->names[ins+1]) << dec << dendl; - memmove (&sap->names[ins+1], &sap->names[ins], n); - } - memset (&sap->names[ins], 0, sizeof (attr_name)); - strncpy (sap->names[ins].name, name, OSBDB_MAX_ATTR_LEN); - sap->count++; - - Dbt newAttrs_val (sap, sz); - newAttrs_val.set_ulen (sz); - newAttrs_val.set_flags (DB_DBT_USERMEM); - dout(3) << "..putting " << aids << dendl; - if (db->put (txn, &attrs_key, &newAttrs_val, 0) != 0) - { - derr(1) << ".writing attributes list failed" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - } - else - { - dout(3) << "..attribute " << name << " already exists" << dendl; - } - - dout(5) << "..attributes list: " << 
sap << dendl; - - // Add the attribute. - attr_id aid = new_attr_id (oid, name); - Dbt attr_key (&aid, sizeof (aid)); - Dbt attr_val ((void *) value, size); - dout(3) << "..writing attribute key " << aid << dendl; - if (db->put (txn, &attr_key, &attr_val, 0) != 0) - { - derr(1) << ".writing attribute key failed" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - dout(4) << "..setattr OK" << dendl; - if (onsafe != NULL) - COMMIT(onsafe); - return 0; -} - -int OSBDB::setattr(object_t oid, const char *name, - const void *value, size_t size, - Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - dout(2) << "setattr " << oid << ":" << name << " => (" - << size << " bytes)" << dendl; - int ret = _setattr (oid, name, value, size, onsafe, txn); - if (ret == 0) - { - if (txn != NULL) - txn->commit (0); - } - else - { - if (txn != NULL) - txn->abort(); - } - return ret; -} - -int OSBDB::setattrs(object_t oid, map& aset, - Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - map::iterator it; - for (it = aset.begin(); it != aset.end(); it++) - { - string name = it->first; - bufferptr value = it->second; - int ret = _setattr (oid, name.c_str(), value.c_str(), - value.length(), onsafe, txn); - if (ret != 0) - { - if (txn != NULL) - txn->abort(); - return ret; - } - } - - if (txn != NULL) - txn->commit (0); - return 0; -} - -int OSBDB::_getattr (object_t oid, const char *name, void *value, size_t size) -{ - if (!mounted) - return -EINVAL; - - dout(2) << "_getattr " << oid << " " << name << " " << size << dendl; - - attr_id aid = new_attr_id (oid, name); - Dbt key (&aid, sizeof 
(aid)); - Dbt val (value, size); - val.set_ulen (size); - val.set_doff (0); - val.set_dlen (size); - val.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - - int ret; - if ((ret = db->get (NULL, &key, &val, 0)) != 0) - { - derr(1) << ".getting value failed: " << db_strerror (ret) << dendl; - return -ENOENT; - } - - dout(4) << ".._getattr OK; returns " << val.get_size() << dendl; - return val.get_size(); -} - -int OSBDB::getattr(object_t oid, const char *name, void *value, size_t size) -{ - if (!mounted) - return -EINVAL; - - return _getattr (oid, name, value, size); -} - -int OSBDB::getattrs(object_t oid, map& aset) -{ - if (!mounted) - return -EINVAL; - - for (map::iterator it = aset.begin(); - it != aset.end(); it++) - { - int ret = _getattr (oid, (*it).first.c_str(), - (*it).second.c_str(), - (*it).second.length()); - if (ret < 0) - return ret; - } - return 0; -} - -int OSBDB::rmattr(object_t oid, const char *name, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "rmattr " << oid << " " << name << dendl; - - attrs_id aids = new_attrs_id (oid); - Dbt askey (&aids, sizeof_attrs_id()); - Dbt asvalue; - asvalue.set_flags (DB_DBT_MALLOC); - - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &askey, &asvalue, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -ENOENT; - } - - stored_attrs *sap = (stored_attrs *) asvalue.get_data(); - auto_ptr sa (sap); - - dout(5) << "..attributes list " << sap << dendl; - - if (sap->count == 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".empty attribute list" << dendl; - return -ENOENT; - } - - attr_name _name; - memset(&_name, 0, sizeof (_name)); - strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN); - int ins = binary_search (sap->names, 
sap->count, _name); - dout(4) << "..insertion point is " << ins << dendl; - if (ins >= sap->count || strcmp (sap->names[ins].name, name) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".attribute not found in list" << dendl; - return -ENOENT; - } - - // Shift the later elements down by one, if needed. - int n = (sap->count - ins) * sizeof (attr_name); - if (n > 0) - { - dout(4) << "..shift down by " << n << dendl; - memmove (&(sap->names[ins]), &(sap->names[ins + 1]), n); - } - sap->count--; - dout(5) << "..attributes list now " << sap << dendl; - - asvalue.set_size(asvalue.get_size() - sizeof (attr_name)); - int ret; - if ((ret = db->put (txn, &askey, &asvalue, 0)) != 0) - { - derr(1) << "put stored_attrs " << db_strerror (ret) << dendl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - // Remove the attribute. - attr_id aid = new_attr_id (oid, name); - Dbt key (&aid, sizeof (aid)); - if ((ret = db->del (txn, &key, 0)) != 0) - { - derr(1) << "deleting " << aid << ": " << db_strerror(ret) << dendl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - dout(4) << "..rmattr OK" << dendl; - return 0; -} - -int OSBDB::listattr(object_t oid, char *attrs, size_t size) -{ - if (!mounted) - return -EINVAL; - - dout(2) << "listattr " << oid << dendl; - - attrs_id aids = new_attrs_id (oid); - Dbt key (&aids, sizeof_attrs_id()); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - // XXX Transactions for read atomicity??? 
- - int ret; - if ((ret = db->get (NULL, &key, &value, 0)) != 0) - { - derr(1) << "fetching " << aids << ": " << db_strerror (ret) - << dendl; - return -ENOENT; - } - - stored_attrs *attrsp = (stored_attrs *) value.get_data(); - auto_ptr _attrs (attrsp); - size_t s = 0; - char *p = attrs; - for (unsigned i = 0; i < attrsp->count && s < size; i++) - { - int n = MIN (OSBDB_MAX_ATTR_LEN, - MIN (strlen (attrsp->names[i].name), size - s - 1)); - strncpy (p, attrsp->names[i].name, n); - p[n] = '\0'; - p = p + n + 1; - } - - dout(4) << "listattr OK" << dendl; - return 0; -} - - // Collection attributes. - -int OSBDB::collection_setattr(coll_t cid, const char *name, - const void *value, size_t size, - Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "collection_setattr " << hex << cid << dec << " " << name - << " (" << size << " bytes)" << dendl; - if (strlen (name) >= OSBDB_MAX_ATTR_LEN) - { - derr(1) << "name too long" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -ENAMETOOLONG; - } - - // Add name to attribute list, if needed. 
- coll_attrs_id aids = new_coll_attrs_id (cid); - Dbt attrs_key (&aids, sizeof_coll_attrs_id()); - Dbt attrs_val; - attrs_val.set_flags (DB_DBT_MALLOC); - stored_attrs *sap = NULL; - size_t sz = 0; - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - dout(3) << " getting " << aids << dendl; - if (db->get (txn, &attrs_key, &attrs_val, 0) != 0) - { - dout(2) << " first attribute" << dendl; - sz = sizeof (stored_attrs); - sap = (stored_attrs *) malloc(sz); - sap->count = 0; - } - else - { - sz = attrs_val.get_size(); - sap = (stored_attrs *) attrs_val.get_data(); - dout(2) << " add to list of " << sap->count << " attrs" << dendl; - } - auto_ptr sa (sap); - - attr_name _name; - strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN); - - int ins = 0; - if (sap->count > 0) - ins = binary_search (sap->names, sap->count, _name); - dout(3) << " insertion point is " << ins << dendl; - if (ins >= sap->count || strcmp (sap->names[ins].name, name) != 0) - { - sz += sizeof (attr_name); - dout(3) << " realloc " << hex << ((void *) sap) << " to " - << dec << sz << dendl; - sap = (stored_attrs *) realloc (sap, sz); - dout(3) << " returns " << hex << ((void *) sap) << dec << dendl; - sa.release (); - sa.reset (sap); - int n = (sap->count - ins) * sizeof (attr_name); - if (n > 0) - { - dout(3) << " move " << n << " bytes from 0x" - << hex << (&sap->names[ins]) << " to 0x" - << hex << (&sap->names[ins+1]) << dec << dendl; - memmove (&sap->names[ins+1], &sap->names[ins], n); - } - memset (&sap->names[ins], 0, sizeof (attr_name)); - strncpy (sap->names[ins].name, name, OSBDB_MAX_ATTR_LEN); - sap->count++; - - Dbt newAttrs_val (sap, sz); - newAttrs_val.set_ulen (sz); - newAttrs_val.set_flags (DB_DBT_USERMEM); - dout(3) << " putting " << aids << dendl; - if (db->put (txn, &attrs_key, &newAttrs_val, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".putting new attributes failed" << dendl; - return -EIO; - } - } - 
else - { - dout(3) << "..attribute " << name << " already exists" << dendl; - } - - dout(3) << "..attributes list: " << sap << dendl; - - // Add the attribute. - coll_attr_id aid = new_coll_attr_id (cid, name); - Dbt attr_key (&aid, sizeof (aid)); - Dbt attr_val ((void *) value, size); - dout(3) << " writing attribute key " << aid << dendl; - if (db->put (txn, &attr_key, &attr_val, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".putting attribute failed" << dendl; - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..collection setattr OK" << dendl; - return 0; -} - -int OSBDB::collection_rmattr(coll_t cid, const char *name, - Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "collection_rmattr " << hex << cid << dec - << " " << name << dendl; - - coll_attrs_id aids = new_coll_attrs_id (cid); - Dbt askey (&aids, sizeof_coll_attrs_id()); - Dbt asvalue; - asvalue.set_flags (DB_DBT_MALLOC); - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &askey, &asvalue, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".no attributes list" << dendl; - return -ENOENT; - } - - stored_attrs *sap = (stored_attrs *) asvalue.get_data(); - auto_ptr sa (sap); - - dout(5) << "..attributes list " << sap << dendl; - if (sap->count == 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".empty attributes list" << dendl; - return -ENOENT; - } - - attr_name _name; - memset(&_name, 0, sizeof (_name)); - strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN); - int ins = binary_search (sap->names, sap->count, _name); - if (ins >= sap->count || strcmp (sap->names[ins].name, name) != 0) - { - 
if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".attribute not listed" << dendl; - return -ENOENT; - } - - // Shift the later elements down by one, if needed. - int n = (sap->count - ins) * sizeof (attr_name); - if (n > 0) - { - dout(4) << "..shift down by " << n << dendl; - memmove (&(sap->names[ins]), &(sap->names[ins + 1]), n); - } - sap->count--; - dout(5) << "..attributes list now " << sap << dendl; - - asvalue.set_size(asvalue.get_size() - sizeof (attr_name)); - int ret; - if ((ret = db->put (txn, &askey, &asvalue, 0)) != 0) - { - derr(1) << "put stored_attrs " << db_strerror (ret) << dendl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - // Remove the attribute. - coll_attr_id aid = new_coll_attr_id (cid, name); - Dbt key (&aid, sizeof (aid)); - if ((ret = db->del (txn, &key, 0)) != 0) - { - derr(1) << "deleting " << aid << ": " << db_strerror(ret) << dendl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..collection rmattr OK" << dendl; - return 0; -} - -int OSBDB::collection_getattr(coll_t cid, const char *name, - void *value, size_t size) -{ - if (!mounted) - return -EINVAL; - - dout(2) << "collection_getattr " << hex << cid << dec - << " " << name << dendl; - - // XXX transactions/read isolation? 
- - coll_attr_id caid = new_coll_attr_id (cid, name); - Dbt key (&caid, sizeof (caid)); - Dbt val (value, size); - val.set_ulen (size); - val.set_dlen (size); - val.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - - if (db->get (NULL, &key, &val, 0) != 0) - { - derr(1) << ".no attribute entry" << dendl; - return -ENOENT; - } - - dout(4) << "..collection getattr OK; returns " << val.get_size() << dendl; - return val.get_size(); -} - -int OSBDB::collection_listattr(coll_t cid, char *attrs, size_t size) -{ - if (!mounted) - return -EINVAL; - - dout(2) << "collection_listattr " << hex << cid << dec << dendl; - - // XXX transactions/read isolation? - - coll_attrs_id caids = new_coll_attrs_id (cid); - Dbt key (&caids, sizeof_coll_attrs_id()); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - int ret; - if ((ret = db->get (NULL, &key, &value, 0)) != 0) - { - derr(1) << "fetching " << caids << ": " << db_strerror (ret) - << dendl; - return -ENOENT; - } - - stored_attrs *attrsp = (stored_attrs *) value.get_data(); - auto_ptr _attrs (attrsp); - size_t s = 0; - char *p = attrs; - for (unsigned i = 0; i < attrsp->count && s < size; i++) - { - int n = MIN (OSBDB_MAX_ATTR_LEN, - MIN (strlen (attrsp->names[i].name), size - s - 1)); - strncpy (p, attrsp->names[i].name, n); - p[n] = '\0'; - p = p + n + 1; - } - return 0; -} - - // Sync. 
- -void OSBDB::sync (Context *onsync) -{ - if (!mounted) - return; - - sync(); - - if (onsync != NULL) - { - g_timer.add_event_after(0.1, onsync); - } -} - -void OSBDB::sync() -{ - if (!mounted) - return; - - if (transactional) - { - env->log_flush (NULL); - env->lsn_reset (device.c_str(), 0); - } - db->sync(0); -} diff --git a/branches/sage/ebofs2/osbdb/OSBDB.h b/branches/sage/ebofs2/osbdb/OSBDB.h deleted file mode 100644 index 8eb2004d3903f..0000000000000 --- a/branches/sage/ebofs2/osbdb/OSBDB.h +++ /dev/null @@ -1,482 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* OSBDB.h -- ObjectStore on Berkeley DB. -*- c++ -*- - Copyright (C) 2007 Casey Marshall - -Ceph - scalable distributed file system - -This is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License version 2.1, as published by the Free Software -Foundation. See file COPYING. */ - - -#include -#include "osd/ObjectStore.h" - -#define OSBDB_MAGIC 0x05BDB - -/* - * Maximum length of an attribute name. - */ -#define OSBDB_MAX_ATTR_LEN 256 - -#define OSBDB_THIS_VERSION 1 - -#define OSBDB_SUPERBLOCK_KEY ((void *) "s") - -/* - * The "superblock" of the BDB object store. We store one of these in - * the DB, to store version and other information. We don't record - * anything special here, just the version number the database was - * written with. - * - * In principle, this structure is variable-length, depending on the - * software version writing the superblock. - */ -struct stored_superblock -{ - uint32_t version; -}; - -inline ostream& operator<<(ostream& out, const stored_superblock sb) -{ - out << "osbdb.super(" << sb.version << ")" << endl; - return out; -} - -/** - * An object identifier; we define this so we can have a POD object to - * work with. - */ -struct oid_t // POD -{ - char id[16]; -}; - -inline void mkoid (oid_t& id, object_t& oid) -{ - // XXX byte order? 
- memcpy (id.id, &oid, sizeof (oid_t)); -} - -inline ostream& operator<<(ostream& out, const oid_t id) -{ - for (int i = 0; i < 16; i++) - { - out.fill('0'); - out << setw(2) << hex << (id.id[i] & 0xFF); - if ((i & 3) == 3) - out << ':'; - } - out.unsetf(ios::right); - out << dec; - return out; -} - -/** - * An "inode" key. We map a 'stored_object' struct to this key for - * every object. - */ -struct object_inode_key // POD -{ - oid_t oid; - char tag; -}; - -/** - * "Constructor" for an object_inode_key. - */ -inline object_inode_key new_object_inode_key (object_t& oid) -{ - object_inode_key key; - memset(&key, 0, sizeof (object_inode_key)); - mkoid (key.oid, oid); - key.tag = 'i'; - return key; -} - -/* - * We use this, instead of sizeof(), to try and guarantee that we - * don't include the structure padding, if any. - * - * This *should* return 17: sizeof (oid_t) == 16; sizeof (char) == 1. - */ -inline size_t sizeof_object_inode_key() -{ - return offsetof(object_inode_key, tag) + sizeof (char); -} - - // Frank Poole: Unfortunately, that sounds a little - // like famous last words. - // -- 2001: A Space Odyssey - -inline ostream& operator<<(ostream& out, const object_inode_key o) -{ - out << o.tag << "/" << o.oid; - return out; -} - -/** - * A stored object. This is essentially the "inode" of the object, - * containing things like the object's length. The object itself is - * stored as-is, mapped by the 128-bit object ID. - */ -struct stored_object -{ - uint32_t length; -}; - -inline ostream& operator<<(ostream& out, const stored_object s) -{ - out << "inode(l:" << s.length << ")"; - return out; -} - -/* - * Key referencing the list of attribute names for an object. This is - * simply the object's ID, with an additional character 'a' appended. - */ -struct attrs_id // POD -{ - oid_t oid; - char tag; -}; - -/* - * "Construtor" for attrs_id. 
- */ -inline struct attrs_id new_attrs_id (object_t& oid) -{ - attrs_id aid; - memset (&aid, 0, sizeof (attrs_id)); - mkoid(aid.oid, oid); - aid.tag = 'a'; - return aid; -} - -/* - * See explanation for sizeof_object_inode_id. - */ -inline size_t sizeof_attrs_id() -{ - return offsetof(struct attrs_id, tag) + sizeof (char); -} - -inline ostream& operator<<(ostream& out, const attrs_id id) -{ - out << id.tag << "/" << id.oid; - return out; -} - -/* - * Encapsulation of a single attribute name. - */ -struct attr_name // POD -{ - char name[OSBDB_MAX_ATTR_LEN]; -}; - -inline ostream& operator<<(ostream& out, const attr_name n) -{ - out << n.name; - return out; -} - -inline bool operator<(const attr_name n1, const attr_name n2) -{ - return (strncmp (n1.name, n2.name, OSBDB_MAX_ATTR_LEN) < 0); -} - -inline bool operator>(const attr_name n1, const attr_name n2) -{ - return (strncmp (n1.name, n2.name, OSBDB_MAX_ATTR_LEN) > 0); -} - -inline bool operator==(const attr_name n1, const attr_name n2) -{ - std::cerr << n1.name << " == " << n2.name << "?" << endl; - return (strncmp (n1.name, n2.name, OSBDB_MAX_ATTR_LEN) == 0); -} - -inline bool operator!=(const attr_name n1, const attr_name n2) -{ - return !(n1 == n2); -} - -inline bool operator>=(const attr_name n1, const attr_name n2) -{ - return !(n1 < n2); -} - -inline bool operator<=(const attr_name n1, const attr_name n2) -{ - return !(n1 > n2); -} - -/* - * A list of an object or collection's attribute names. - */ -struct stored_attrs -{ - uint32_t count; - attr_name names[0]; // actually variable-length -}; - -inline ostream& operator<<(ostream& out, const stored_attrs *sa) -{ - out << sa->count << " [ "; - for (unsigned i = 0; i < sa->count; i++) - out << sa->names[i] << (i == sa->count - 1 ? " " : ", "); - out << "]"; - return out; -} - -/* - * An object attribute key. An object attribute is mapped simply by - * the object ID appended with the attribute name. 
Attribute names - * may not be empty, and must be less than 256 characters, in this - * implementation. - */ -struct attr_id // POD -{ - oid_t oid; - attr_name name; -}; - -inline attr_id new_attr_id (object_t& oid, const char *name) -{ - attr_id aid; - memset(&aid, 0, sizeof (attr_id)); - mkoid (aid.oid, oid); - strncpy (aid.name.name, name, OSBDB_MAX_ATTR_LEN); - return aid; -} - -inline ostream& operator<<(ostream &out, const attr_id id) -{ - out << id.oid << ":" << id.name; - return out; -} - -/* - * A key for a collection attributes list. - */ -struct coll_attrs_id // POD -{ - coll_t cid; - char tag; -}; - -inline coll_attrs_id new_coll_attrs_id (coll_t cid) -{ - coll_attrs_id catts; - memset(&catts, 0, sizeof (coll_attrs_id)); - catts.cid = cid; - catts.tag = 'C'; - return catts; -} - -inline size_t sizeof_coll_attrs_id() -{ - return offsetof(coll_attrs_id, tag) + sizeof (char); -} - -inline ostream& operator<<(ostream& out, coll_attrs_id id) -{ - out << id.tag << "/" << id.cid; - return out; -} - -/* - * A collection attribute key. Similar to - */ -struct coll_attr_id // POD -{ - coll_t cid; - attr_name name; -}; - -inline coll_attr_id new_coll_attr_id (coll_t cid, const char *name) -{ - coll_attr_id catt; - memset(&catt, 0, sizeof (coll_attr_id)); - catt.cid = cid; - strncpy (catt.name.name, name, OSBDB_MAX_ATTR_LEN); - return catt; -} - -inline ostream& operator<<(ostream& out, coll_attr_id id) -{ - out << id.cid << ":" << id.name; - return out; -} - -/* - * This is the key we store the master collections list under. - */ -#define COLLECTIONS_KEY ((void *) "c") - -/* - * The master list of collections. There should be one of these per - * OSD. The sole reason for this structure is to have the ability - * to enumerate all collections stored on this OSD. - */ -struct stored_colls -{ - // The number of collections. - uint32_t count; - - // The collection identifiers. This is a sorted list of coll_t - // values. 
- coll_t colls[0]; // actually variable-length -}; - -inline ostream& operator<<(ostream& out, stored_colls *c) -{ - out << c->count << " [ "; - for (unsigned i = 0; i < c->count; i++) - { - out << hex << c->colls[i]; - if (i < c->count - 1) - out << ", "; - } - out << " ]" << dec; - return out; -} - -/* - * A stored collection (a bag of object IDs). These are referenced by - * the bare collection identifier type, a coll_t (thus, a 32-bit - * integer). Internally this is stored as a sorted list of object IDs. - * - * Note, this structure places all collection items in a single - * record; this may be a memory burden for large collections. - */ -struct stored_coll -{ - // The size of this collection. - uint32_t count; - - // The object IDs in this collection. This is a sorted list of all - // object ID's in this collection. - object_t objects[0]; // actually variable-length -}; - -inline ostream& operator<<(ostream& out, stored_coll *c) -{ - out << c->count << " [ "; - for (unsigned i = 0; i < c->count; i++) - { - out << c->objects[i]; - if (i < c->count - 1) - out << ", "; - } - out << " ]"; - return out; -} - -class OSBDBException : public std::exception -{ - const char *msg; - -public: - OSBDBException(const char *msg) : msg(msg) { } - const char *what() { return msg; } -}; - -/* - * The object store interface for Berkeley DB. 
- */ -class OSBDB : public ObjectStore -{ - private: - Mutex lock; - DbEnv *env; - Db *db; - string device; - string env_dir; - bool mounted; - bool opened; - bool transactional; - - public: - - OSBDB(const char *dev) throw(OSBDBException) - : lock(true), env(0), db (0), device (dev), mounted(false), opened(false), - transactional(g_conf.bdbstore_transactional) - { - } - - ~OSBDB() - { - if (mounted) - { - umount(); - } - } - - int mount(); - int umount(); - int mkfs(); - - int statfs(struct statfs *buf); - - int pick_object_revision_lt(object_t& oid); - - bool exists(object_t oid); - int stat(object_t oid, struct stat *st); - - int remove(object_t oid, Context *onsafe=0); - - int truncate(object_t oid, off_t size, Context *onsafe=0); - - int read(object_t oid, off_t offset, size_t len, - bufferlist& bl); - int write(object_t oid, off_t offset, size_t len, - bufferlist& bl, Context *onsafe); - - int setattr(object_t oid, const char *name, - const void *value, size_t size, Context *onsafe=0); - int setattrs(object_t oid, map& aset, - Context *onsafe=0); - int getattr(object_t oid, const char *name, - void *value, size_t size); - int getattrs(object_t oid, map& aset); - int rmattr(object_t oid, const char *name, - Context *onsafe=0); - int listattr(object_t oid, char *attrs, size_t size); - - int clone(object_t oid, object_t noid); - - // Collections. 
- - int list_collections(list& ls); - int create_collection(coll_t c, Context *onsafe=0); - int destroy_collection(coll_t c, Context *onsafe=0); - bool collection_exists(coll_t c); - int collection_stat(coll_t c, struct stat *st); - int collection_add(coll_t c, object_t o, Context *onsafe=0); - int collection_remove(coll_t c, object_t o, Context *onsafe=0); - int collection_list(coll_t c, list& o); - - int collection_setattr(coll_t cid, const char *name, - const void *value, size_t size, - Context *onsafe=0); - int collection_rmattr(coll_t cid, const char *name, - Context *onsafe=0); - int collection_getattr(coll_t cid, const char *name, - void *value, size_t size); - int collection_listattr(coll_t cid, char *attrs, size_t size); - - void sync(Context *onsync); - void sync(); - -private: - int opendb (DBTYPE type=DB_UNKNOWN, int flags=0, bool new_env=false); - - int _setattr(object_t oid, const char *name, const void *value, - size_t size, Context *onsync, DbTxn *txn); - int _getattr(object_t oid, const char *name, void *value, size_t size); - DbEnv *getenv(); -}; diff --git a/branches/sage/ebofs2/osd/Ager.cc b/branches/sage/ebofs2/osd/Ager.cc deleted file mode 100644 index fb777238da8fb..0000000000000 --- a/branches/sage/ebofs2/osd/Ager.cc +++ /dev/null @@ -1,333 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "include/types.h" - -#include "Ager.h" -#include "ObjectStore.h" - -#include "config.h" -#include "common/Clock.h" - -// ick -#include "ebofs/Ebofs.h" -#include -#include -#include - -#ifdef DARWIN -#include -#include -#endif // DARWIN - - -int myrand() -{ - if (0) - return rand(); - else { - static int n = 0; - srand(n++); - return rand(); - } -} - - -object_t Ager::age_get_oid() { - if (!age_free_oids.empty()) { - object_t o = age_free_oids.front(); - age_free_oids.pop_front(); - return o; - } - object_t last = age_cur_oid; - ++age_cur_oid.bno; - return last; -} - -ssize_t 
Ager::age_pick_size() { - ssize_t max = file_size_distn.sample() * 1024; - return max/2 + (myrand() % 100) * max/200 + 1; -} - -bool start_debug = false; - -uint64_t Ager::age_fill(float pc, utime_t until) { - int max = 1024*1024; - bufferptr bp(max); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - uint64_t wrote = 0; - while (1) { - if (g_clock.now() > until) break; - - struct statfs st; - store->statfs(&st); - float free = 1.0 - ((float)(st.f_bfree) / (float)st.f_blocks); - float avail = 1.0 - ((float)(st.f_bavail) / (float)st.f_blocks); // to write to - //float a = (float)(st.f_bfree) / (float)st.f_blocks; - //dout(10) << "age_fill at " << a << " / " << pc << " .. " << st.f_blocks << " " << st.f_bavail << dendl; - if (free >= pc) { - generic_dout(2) << "age_fill at " << free << " / " << avail << " / " << " / " << pc << " stopping" << dendl; - break; - } - - // make sure we can write to it.. - if (avail > .98 || - avail - free > .02) - store->sync(); - - object_t oid = age_get_oid(); - - int b = myrand() % 10; - age_objects[b].push_back(oid); - - ssize_t s = age_pick_size(); - wrote += (s + 4095) / 4096; - - - - - generic_dout(2) << "age_fill at " << free << " / " << avail << " / " << pc << " creating " << hex << oid << dec << " sz " << s << dendl; - - - if (false && !g_conf.ebofs_verify && start_debug && wrote > 1000000ULL) { - /* - - - 1005700 -? -1005000 -1005700 - 1005710 - 1005725ULL - 1005750ULL - 1005800 - 1006000 - -// 99 1000500 ? 
1000750 1006000 -*/ - g_conf.debug_ebofs = 30; - g_conf.ebofs_verify = true; - } - - off_t off = 0; - while (s) { - ssize_t t = MIN(s, max); - bufferlist sbl; - sbl.substr_of(bl, 0, t); - store->write(oid, off, t, sbl, false); - off += t; - s -= t; - } - oid.bno++; - } - - return wrote*4; // KB -} - -void Ager::age_empty(float pc) { - int nper = 20; - int n = nper; - - //g_conf.ebofs_verify = true; - - while (1) { - struct statfs st; - store->statfs(&st); - float free = 1.0 - ((float)(st.f_bfree) / (float)st.f_blocks); - float avail = 1.0 - ((float)(st.f_bavail) / (float)st.f_blocks); // to write to - generic_dout(2) << "age_empty at " << free << " / " << avail << " / " << pc << dendl;//" stopping" << dendl; - if (free <= pc) { - generic_dout(2) << "age_empty at " << free << " / " << avail << " / " << pc << " stopping" << dendl; - break; - } - - int b = myrand() % 10; - n--; - if (n == 0 || age_objects[b].empty()) { - generic_dout(2) << "age_empty sync" << dendl; - //sync(); - //sync(); - n = nper; - continue; - } - object_t oid = age_objects[b].front(); - age_objects[b].pop_front(); - - generic_dout(2) << "age_empty at " << free << " / " << avail << " / " << pc << " removing " << hex << oid << dec << dendl; - - store->remove(oid); - age_free_oids.push_back(oid); - } - - g_conf.ebofs_verify = false; -} - -void pfrag(uint64_t written, ObjectStore::FragmentationStat &st) -{ - cout << "#gb wr\ttotal\tn x\tavg x\tavg per\tavg j\tfree\tn fr\tavg fr\tnum<2\tsum<2\tnum<4\tsum<4\t..." - << std::endl; - cout << written - << "\t" << st.total - << "\t" << st.num_extent - << "\t" << st.avg_extent - << "\t" << st.avg_extent_per_object - << "\t" << st.avg_extent_jump - << "\t" << st.total_free - << "\t" << st.num_free_extent - << "\t" << st.avg_free_extent; - - int n = st.num_extent; - for (uint64_t i=1; i <= 30; i += 1) { - cout << "\t" << st.extent_dist[i]; - cout << "\t" << st.extent_dist_sum[i]; - //cout << "\ta " << (st.extent_dist[i] ? 
(st.extent_dist_sum[i] / st.extent_dist[i]):0); - n -= st.extent_dist[i]; - if (n == 0) break; - } - cout << std::endl; -} - - -void Ager::age(int time, - float high_water, // fill to this % - float low_water, // then empty to this % - int count, // this many times - float final_water, // and end here ( <= low_water) - int fake_size_mb) { - - store->_fake_writes(true); - srand(0); - - utime_t start = g_clock.now(); - utime_t until = start; - until.sec_ref() += time; - - int elapsed = 0; - int freelist_inc = 60; - utime_t nextfl = start; - nextfl.sec_ref() += freelist_inc; - - while (age_objects.size() < 10) age_objects.push_back( list() ); - - if (fake_size_mb) { - int fake_bl = fake_size_mb * 256; - struct statfs st; - store->statfs(&st); - float f = (float)fake_bl / (float)st.f_blocks; - high_water = (float)high_water * f; - low_water = (float)low_water * f; - final_water = (float)final_water * f; - generic_dout(2) << "fake " << fake_bl << " / " << st.f_blocks << " is " << f << ", high " << high_water << " low " << low_water << " final " << final_water << dendl; - } - - // init size distn (once) - if (!did_distn) { - did_distn = true; - age_cur_oid = object_t(0,1); - file_size_distn.add(1, 19.0758125+0.65434375); - file_size_distn.add(512, 35.6566); - file_size_distn.add(1024, 27.7271875); - file_size_distn.add(2*1024, 16.63503125); - //file_size_distn.add(4*1024, 106.82384375); - //file_size_distn.add(8*1024, 81.493375); - //file_size_distn.add(16*1024, 14.13553125); - //file_size_distn.add(32*1024, 2.176); - //file_size_distn.add(256*1024, 0.655938); - //file_size_distn.add(512*1024, 0.1480625); - //file_size_distn.add(1*1024*1024, 0.020125); // actually 2, but 32bit - file_size_distn.normalize(); - } - - // clear - for (int i=0; i<10; i++) - age_objects[i].clear(); - - ObjectStore::FragmentationStat st; - - uint64_t wrote = 0; - - for (int c=1; c<=count; c++) { - if (g_clock.now() > until) break; - - //if (c == 7) start_debug = true; - - generic_dout(1) << 
"#age " << c << "/" << count << " filling to " << high_water << dendl; - uint64_t w = age_fill(high_water, until); - //dout(1) << "age wrote " << w << dendl; - wrote += w; - //store->sync(); - //store->_get_frag_stat(st); - //pfrag(st); - - - if (c == count) { - generic_dout(1) << "#age final empty to " << final_water << dendl; - age_empty(final_water); - } else { - generic_dout(1) << "#age " << c << "/" << count << " emptying to " << low_water << dendl; - age_empty(low_water); - } - //store->sync(); - //store->sync(); - - // show frag state - store->_get_frag_stat(st); - pfrag(wrote / (1024ULL*1024ULL) , // GB - st); - - // dump freelist? - if (g_clock.now() > nextfl) { - elapsed += freelist_inc; - save_freelist(elapsed); - nextfl.sec_ref() += freelist_inc; - } - } - - // dump the freelist - save_freelist(0); - exit(0); // hack - - // ok! - store->_fake_writes(false); - store->sync(); - store->sync(); - generic_dout(1) << "age finished" << dendl; -} - - -void Ager::load_freelist() -{ - generic_dout(1) << "load_freelist" << dendl; - - struct stat st; - - int r = ::stat("ebofs.freelist", &st); - assert(r == 0); - - bufferptr bp(st.st_size); - bufferlist bl; - bl.push_back(bp); - int fd = ::open("ebofs.freelist", O_RDONLY); - ::read(fd, bl.c_str(), st.st_size); - ::close(fd); - - ((Ebofs*)store)->_import_freelist(bl); - store->sync(); - store->sync(); -} - -void Ager::save_freelist(int el) -{ - generic_dout(1) << "save_freelist " << el << dendl; - char s[100]; - sprintf(s, "ebofs.freelist.%d", el); - bufferlist bl; - ((Ebofs*)store)->_export_freelist(bl); - ::unlink(s); - int fd = ::open(s, O_CREAT|O_WRONLY); - ::fchmod(fd, 0644); - ::write(fd, bl.c_str(), bl.length()); - ::close(fd); -} diff --git a/branches/sage/ebofs2/osd/Ager.h b/branches/sage/ebofs2/osd/Ager.h deleted file mode 100644 index ad160c0e9f9ff..0000000000000 --- a/branches/sage/ebofs2/osd/Ager.h +++ /dev/null @@ -1,44 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// 
vim: ts=8 sw=2 smarttab -#ifndef __AGER_H -#define __AGER_H - -#include "include/types.h" -#include "include/Distribution.h" -#include "ObjectStore.h" -#include "common/Clock.h" - -#include -#include -using namespace std; - -class Ager { - ObjectStore *store; - - private: - list age_free_oids; - object_t age_cur_oid; - vector< list > age_objects; - Distribution file_size_distn; //kb - bool did_distn; - - void age_empty(float pc); - uint64_t age_fill(float pc, utime_t until); - ssize_t age_pick_size(); - object_t age_get_oid(); - - public: - Ager(ObjectStore *s) : store(s), did_distn(false) {} - - void age(int time, - float high_water, // fill to this % - float low_water, // then empty to this % - int count, // this many times - float final_water, // and end here ( <= low_water) - int fake_size_mb=0); - - void save_freelist(int); - void load_freelist(); -}; - -#endif diff --git a/branches/sage/ebofs2/osd/BDBMap.h b/branches/sage/ebofs2/osd/BDBMap.h deleted file mode 100644 index a8e96a8a192f7..0000000000000 --- a/branches/sage/ebofs2/osd/BDBMap.h +++ /dev/null @@ -1,137 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __BERKELEYDB_H -#define __BERKELEYDB_H - -#include -#include - -#include -using namespace std; - - -template -class BDBMap { - private: - DB *dbp; - - public: - BDBMap() : dbp(0) {} - ~BDBMap() { - close(); - } - - bool is_open() { return dbp ? 
true:false; } - - // open/close - int open(const char *fn) { - //cout << "open " << fn << endl; - - int r; - if ((r = db_create(&dbp, NULL, 0)) != 0) { - cerr << "db_create: " << db_strerror(r) << endl; - assert(0); - } - - dbp->set_errfile(dbp, stderr); - dbp->set_errpfx(dbp, "bdbmap"); - - r = dbp->open(dbp, NULL, fn, NULL, DB_BTREE, DB_CREATE, 0644); - if (r != 0) { - dbp->err(dbp, r, "%s", fn); - } - assert(r == 0); - return 0; - } - void close() { - if (dbp) { - dbp->close(dbp,0); - dbp = 0; - } - } - void remove(const char *fn) { - if (!dbp) open(fn); - if (dbp) { - dbp->remove(dbp, fn, 0, 0); - dbp = 0; - } else { - ::unlink(fn); - } - } - - // accessors - int put(K key, - D data) { - DBT k; - memset(&k, 0, sizeof(k)); - k.data = &key; - k.size = sizeof(K); - DBT d; - memset(&d, 0, sizeof(d)); - d.data = &data; - d.size = sizeof(data); - return dbp->put(dbp, NULL, &k, &d, 0); - } - - int get(K key, - D& data) { - DBT k; - memset(&k, 0, sizeof(k)); - k.data = &key; - k.size = sizeof(key); - DBT d; - memset(&d, 0, sizeof(d)); - d.data = &data; - d.size = sizeof(data); - int r = dbp->get(dbp, NULL, &k, &d, 0); - return r; - } - - int del(K key) { - DBT k; - memset(&k, 0, sizeof(k)); - k.data = &key; - k.size = sizeof(key); - return dbp->del(dbp, NULL, &k, 0); - } - - int list_keys(list& ls) { - DBC *cursor = 0; - int r = dbp->cursor(dbp, NULL, &cursor, 0); - assert(r == 0); - - DBT k,d; - memset(&k, 0, sizeof(k)); - memset(&d, 0, sizeof(d)); - - while ((r = cursor->c_get(cursor, &k, &d, DB_NEXT)) == 0) { - K key; - assert(k.size == sizeof(key)); - memcpy(&key, k.data, k.size); - ls.push_back(key); - } - if (r != DB_NOTFOUND) { - dbp->err(dbp, r, "DBcursor->get"); - assert(r == DB_NOTFOUND); - } - - cursor->c_close(cursor); - return 0; - } - -}; - -#endif diff --git a/branches/sage/ebofs2/osd/Fake.h b/branches/sage/ebofs2/osd/Fake.h deleted file mode 100644 index 342c153c25cfd..0000000000000 --- a/branches/sage/ebofs2/osd/Fake.h +++ /dev/null @@ -1,262 +0,0 @@ 
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __FAKE_H -#define __FAKE_H - -#include "include/types.h" - -#include -#include -#include -using namespace std; -using namespace __gnu_cxx; - -class FakeStoreCollections { - private: - Mutex faker_lock; - ObjectStore *store; - hash_map > fakecollections; - - public: - FakeStoreCollections(ObjectStore *s) : store(s) {} - - // faked collections - int list_collections(list& ls) { - faker_lock.Lock(); - int r = 0; - for (hash_map< coll_t, set >::iterator p = fakecollections.begin(); - p != fakecollections.end(); - p++) { - r++; - ls.push_back(p->first); - } - faker_lock.Unlock(); - return r; - } - - int create_collection(coll_t c, - Context *onsafe=0) { - faker_lock.Lock(); - fakecollections[c].size(); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return 0; - } - - int destroy_collection(coll_t c, - Context *onsafe=0) { - int r = 0; - faker_lock.Lock(); - if (fakecollections.count(c)) { - fakecollections.erase(c); - //fakecattr.erase(c); - if (onsafe) store->sync(onsafe); - } else - r = -1; - faker_lock.Unlock(); - return r; - } - - int collection_stat(coll_t c, struct stat *st) { - return collection_exists(c) ? 
0:-1; - } - - bool collection_exists(coll_t c) { - faker_lock.Lock(); - int r = fakecollections.count(c); - faker_lock.Unlock(); - return r; - } - - int collection_add(coll_t c, object_t o, - Context *onsafe=0) { - faker_lock.Lock(); - fakecollections[c].insert(o); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return 0; - } - - int collection_remove(coll_t c, object_t o, - Context *onsafe=0) { - faker_lock.Lock(); - fakecollections[c].erase(o); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return 0; - } - - int collection_list(coll_t c, list& o) { - faker_lock.Lock(); - int r = 0; - for (set::iterator p = fakecollections[c].begin(); - p != fakecollections[c].end(); - p++) { - o.push_back(*p); - r++; - } - faker_lock.Unlock(); - return r; - } - -}; - -class FakeStoreAttrs { - private: - - class FakeAttrSet { - public: - map attrs; - - int getattr(const char *name, void *value, size_t size) { - string n = name; - if (attrs.count(n)) { - size_t l = MIN( attrs[n].length(), size ); - bufferlist bl; - bl.append(attrs[n]); - bl.copy(0, l, (char*)value); - return l; - } - return -1; - } - int getattrs(map& aset) { - aset = attrs; - return 0; - } - int setattrs(map& aset) { - attrs = aset; - return 0; - } - - int setattr(const char *name, const void *value, size_t size) { - string n = name; - bufferptr bp = buffer::copy((char*)value, size); - attrs[n] = bp; - return 0; - } - - int listattr(char *attrs, size_t size) { - assert(0); - return 0; - } - - int rmattr(const char *name) { - string n = name; - attrs.erase(n); - return 0; - } - - bool empty() { return attrs.empty(); } - }; - - Mutex faker_lock; - ObjectStore *store; - hash_map fakeoattrs; - hash_map fakecattrs; - - public: - FakeStoreAttrs(ObjectStore *s) : store(s) {} - - int setattr(object_t oid, const char *name, - const void *value, size_t size, - Context *onsafe=0) { - faker_lock.Lock(); - int r = fakeoattrs[oid].setattr(name, value, size); - if (onsafe) store->sync(onsafe); - 
faker_lock.Unlock(); - return r; - } - int setattrs(object_t oid, map& aset) { - faker_lock.Lock(); - int r = fakeoattrs[oid].setattrs(aset); - faker_lock.Unlock(); - return r; - } - int getattr(object_t oid, const char *name, - void *value, size_t size) { - faker_lock.Lock(); - int r = fakeoattrs[oid].getattr(name, value, size); - faker_lock.Unlock(); - return r; - } - int getattrs(object_t oid, map& aset) { - faker_lock.Lock(); - int r = fakeoattrs[oid].getattrs(aset); - faker_lock.Unlock(); - return r; - } - int rmattr(object_t oid, const char *name, - Context *onsafe=0) { - faker_lock.Lock(); - int r = fakeoattrs[oid].rmattr(name); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return r; - } - - int listattr(object_t oid, char *attrs, size_t size) { - faker_lock.Lock(); - int r = fakeoattrs[oid].listattr(attrs,size); - faker_lock.Unlock(); - return r; - } - - int collection_setattr(coll_t c, const char *name, - void *value, size_t size, - Context *onsafe=0) { - faker_lock.Lock(); - int r = fakecattrs[c].setattr(name, value, size); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return r; - } - int collection_setattrs(coll_t cid, map& aset) { - faker_lock.Lock(); - int r = fakecattrs[cid].setattrs(aset); - faker_lock.Unlock(); - return r; - } - int collection_getattrs(coll_t cid, map& aset) { - faker_lock.Lock(); - int r = fakecattrs[cid].getattrs(aset); - faker_lock.Unlock(); - return r; - } - int collection_rmattr(coll_t c, const char *name, - Context *onsafe=0) { - faker_lock.Lock(); - int r = fakecattrs[c].rmattr(name); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return r; - } - int collection_getattr(coll_t c, const char *name, - void *value, size_t size) { - faker_lock.Lock(); - int r = fakecattrs[c].getattr(name, value, size); - faker_lock.Unlock(); - return r; - } - int collection_listattr(coll_t c, char *attrs, size_t size) { - faker_lock.Lock(); - int r = fakecattrs[c].listattr(attrs,size); - 
faker_lock.Unlock(); - return r; - } - -}; - -#endif diff --git a/branches/sage/ebofs2/osd/FakeStore.cc b/branches/sage/ebofs2/osd/FakeStore.cc deleted file mode 100644 index e7c77f3eab558..0000000000000 --- a/branches/sage/ebofs2/osd/FakeStore.cc +++ /dev/null @@ -1,742 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "FakeStore.h" -#include "include/types.h" - -#include "common/Timer.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifndef __CYGWIN__ -# include -#endif -//#include - -#ifdef DARWIN -#include -#include -#endif // DARWIN - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug) *_dout << dbeginl << g_clock.now() << " fakestore(" << basedir << ") " -#define derr(l) if (l<=g_conf.debug) *_derr << dbeginl << g_clock.now() << " fakestore(" << basedir << ") " - -#include "include/buffer.h" - -#include - - -// crap-a-crap hash -//#define HASH_DIRS 0x80 -//#define HASH_MASK 0x7f -// end crap hash - - - - -int FakeStore::statfs(struct statfs *buf) -{ - return ::statfs(basedir.c_str(), buf); -} - - -/* - * sorry, these are sentitive to the object_t and coll_t typing. 
- */ -void FakeStore::get_oname(object_t oid, char *s) -{ - //static hash H; - assert(sizeof(oid) == 16); -#ifdef __LP64__ - //sprintf(s, "%s/objects/%02lx/%016lx.%016lx", basedir.c_str(), H(oid) & HASH_MASK, - sprintf(s, "%s/objects/%016lx.%016lx", basedir.c_str(), - *((uint64_t*)&oid), - *(((uint64_t*)&oid) + 1)); -#else - //sprintf(s, "%s/objects/%02x/%016llx.%016llx", basedir.c_str(), H(oid) & HASH_MASK, - sprintf(s, "%s/objects/%016llx.%016llx", basedir.c_str(), - *((uint64_t*)&oid), - *(((uint64_t*)&oid) + 1)); -#endif -} - -void FakeStore::get_cdir(coll_t cid, char *s) -{ - assert(sizeof(cid) == 8); -#ifdef __LP64__ - sprintf(s, "%s/collections/%016lx", basedir.c_str(), - cid); -#else - sprintf(s, "%s/collections/%016llx", basedir.c_str(), - cid); -#endif -} - -void FakeStore::get_coname(coll_t cid, object_t oid, char *s) -{ - assert(sizeof(oid) == 16); -#ifdef __LP64__ - sprintf(s, "%s/collections/%016lx/%016lx.%016lx", basedir.c_str(), cid, - *((uint64_t*)&oid), - *(((uint64_t*)&oid) + 1)); -#else - sprintf(s, "%s/collections/%016llx/%016llx.%016llx", basedir.c_str(), cid, - *((uint64_t*)&oid), - *(((uint64_t*)&oid) + 1)); -#endif -} - - - - -int FakeStore::mkfs() -{ - char cmd[200]; - if (g_conf.fakestore_dev) { - dout(0) << "mounting" << dendl; - sprintf(cmd,"mount %s", g_conf.fakestore_dev); - system(cmd); - } - - dout(1) << "mkfs in " << basedir << dendl; - - // wipe - sprintf(cmd, "test -d %s && rm -r %s ; mkdir -p %s/collections && mkdir -p %s/objects", - basedir.c_str(), basedir.c_str(), basedir.c_str(), basedir.c_str()); - - dout(5) << "wipe: " << cmd << dendl; - system(cmd); - - // hashed bits too - /* - for (int i=0; i 0) bl.push_back( bptr ); // put it in the target bufferlist - } - ::flock(fd, LOCK_UN); - ::close(fd); - return got; -} - - -int FakeStore::write(object_t oid, - off_t offset, size_t len, - const bufferlist& bl, - Context *onsafe) -{ - char fn[200]; - get_oname(oid,fn); - - dout(20) << "write " << fn << " len " << len << " off " << 
offset << dendl; - - - ::mknod(fn, 0644, 0); // in case it doesn't exist yet. - - int flags = O_WRONLY;//|O_CREAT; - int fd = ::open(fn, flags); - if (fd < 0) { - derr(0) << "write couldn't open " << fn << " flags " << flags << " errno " << errno << " " << strerror(errno) << dendl; - return fd; - } - ::fchmod(fd, 0664); - ::flock(fd, LOCK_EX); // lock for safety - - // seek - off_t actual = ::lseek(fd, offset, SEEK_SET); - int did = 0; - assert(actual == offset); - - // write buffers - for (list::const_iterator it = bl.buffers().begin(); - it != bl.buffers().end(); - it++) { - int r = ::write(fd, (char*)(*it).c_str(), (*it).length()); - if (r > 0) - did += r; - else { - derr(0) << "couldn't write to " << fn << " len " << len << " off " << offset << " errno " << errno << " " << strerror(errno) << dendl; - } - } - - if (did < 0) { - derr(0) << "couldn't write to " << fn << " len " << len << " off " << offset << " errno " << errno << " " << strerror(errno) << dendl; - } - - ::flock(fd, LOCK_UN); - - // schedule sync - if (onsafe) sync(onsafe); - - ::close(fd); - - return did; -} - - -class C_FakeSync : public Context { - Context *c; - int *n; - Mutex *lock; - Cond *cond; - -public: - C_FakeSync(Context *c_, int *n_, Mutex *lo, Cond *co) : - c(c_), n(n_), - lock(lo), cond(co) { - lock->Lock(); - ++*n; - lock->Unlock(); - } - void finish(int r) { - c->finish(r); - - lock->Lock(); - --(*n); - if (*n == 0) cond->Signal(); - lock->Unlock(); - } -}; - -void FakeStore::sync() -{ - synclock.Lock(); - while (unsync > 0) { - dout(0) << "sync waiting for " << unsync << " items to (fake) sync" << dendl; - synccond.Wait(synclock); - } - synclock.Unlock(); -} - -void FakeStore::sync(Context *onsafe) -{ - if (g_conf.fakestore_fake_sync > 0.0) { - g_timer.add_event_after((float)g_conf.fakestore_fake_sync, - new C_FakeSync(onsafe, &unsync, &synclock, &synccond)); - - } else { - assert(0); // der..no implemented anymore - } -} - - -// ------------------------------- -// attributes - 
-// objects - -int FakeStore::setattr(object_t oid, const char *name, - const void *value, size_t size, - Context *onsafe) -{ - if (fake_attrs) return attrs.setattr(oid, name, value, size, onsafe); - - int r = 0; -#ifndef __CYGWIN__ - char fn[100]; - get_oname(oid, fn); - r = ::setxattr(fn, name, value, size, 0); -#endif - return r; -} - -int FakeStore::setattrs(object_t oid, map& aset) -{ - if (fake_attrs) return attrs.setattrs(oid, aset); - - char fn[100]; - get_oname(oid, fn); - int r = 0; -#ifndef __CYGWIN__ - for (map::iterator p = aset.begin(); - p != aset.end(); - ++p) { - r = ::setxattr(fn, p->first.c_str(), p->second.c_str(), p->second.length(), 0); - if (r < 0) { - cerr << "error setxattr " << strerror(errno) << std::endl; - break; - } - } -#endif - return r; -} - -int FakeStore::getattr(object_t oid, const char *name, - void *value, size_t size) -{ - if (fake_attrs) return attrs.getattr(oid, name, value, size); - int r = 0; -#ifndef __CYGWIN__ - char fn[100]; - get_oname(oid, fn); - r = ::getxattr(fn, name, value, size); -#endif - return r; -} - -int FakeStore::getattrs(object_t oid, map& aset) -{ - if (fake_attrs) return attrs.getattrs(oid, aset); - -#ifndef __CYGWIN__ - char fn[100]; - get_oname(oid, fn); - - char val[1000]; - char names[1000]; - int num = ::listxattr(fn, names, 1000); - - char *name = names; - for (int i=0; i& aset) -{ - if (fake_attrs) return attrs.collection_setattrs(cid, aset); - - char fn[100]; - get_cdir(cid, fn); - int r = 0; -#ifndef __CYGWIN__ - for (map::iterator p = aset.begin(); - p != aset.end(); - ++p) { - r = ::setxattr(fn, p->first.c_str(), p->second.c_str(), p->second.length(), 0); - if (r < 0) break; - } -#endif - return r; -} - -int FakeStore::collection_getattrs(coll_t cid, map& aset) -{ - if (fake_attrs) return attrs.collection_getattrs(cid, aset); - -#ifndef __CYGWIN__ - char fn[100]; - get_cdir(cid, fn); - - char val[1000]; - char names[1000]; - int num = ::listxattr(fn, names, 1000); - - char *name = names; - 
for (int i=0; i& ls) -{ - char fn[200]; - sprintf(fn, "%s/objects", basedir.c_str()); - - DIR *dir = ::opendir(fn); - assert(dir); - - struct dirent *de; - while ((de = ::readdir(dir)) != 0) { - if (de->d_name[0] == '.') continue; - // parse - object_t o; - assert(sizeof(o) == 16); - //cout << " got object " << de->d_name << std::endl; - *(((uint64_t*)&o) + 0) = strtoll(de->d_name, 0, 16); - assert(de->d_name[16] == '.'); - *(((uint64_t*)&o) + 1) = strtoll(de->d_name+17, 0, 16); - //dout(0) << " got " << o << " errno " << errno << " on " << de->d_name << dendl; - if (errno) continue; - ls.push_back(o); - } - - ::closedir(dir); - return 0; -} - - -// -------------------------- -// collections - -int FakeStore::list_collections(list& ls) -{ - if (fake_collections) return collections.list_collections(ls); - - char fn[200]; - sprintf(fn, "%s/collections", basedir.c_str()); - - DIR *dir = ::opendir(fn); - assert(dir); - - struct dirent *de; - while ((de = ::readdir(dir)) != 0) { - // parse - errno = 0; - coll_t c = strtoll(de->d_name, 0, 16); - if (c) ls.push_back(c); - } - - ::closedir(dir); - return 0; -} - -int FakeStore::create_collection(coll_t c, - Context *onsafe) -{ - if (fake_collections) return collections.create_collection(c, onsafe); - - char fn[200]; - get_cdir(c, fn); - - int r = ::mkdir(fn, 0755); - - if (onsafe) sync(onsafe); - return r; -} - -int FakeStore::destroy_collection(coll_t c, - Context *onsafe) -{ - if (fake_collections) return collections.destroy_collection(c, onsafe); - - char fn[200]; - get_cdir(c, fn); - char cmd[200]; - sprintf(cmd, "test -d %s && rm -r %s", fn, fn); - system(cmd); - - if (onsafe) sync(onsafe); - return 0; -} - -int FakeStore::collection_stat(coll_t c, struct stat *st) -{ - if (fake_collections) return collections.collection_stat(c, st); - - char fn[200]; - get_cdir(c, fn); - return ::lstat(fn, st); -} - -bool FakeStore::collection_exists(coll_t c) -{ - if (fake_collections) return collections.collection_exists(c); - - 
struct stat st; - return collection_stat(c, &st) == 0; -} - - -int FakeStore::collection_add(coll_t c, object_t o, - Context *onsafe) -{ - if (fake_collections) return collections.collection_add(c, o, onsafe); - - char cof[200]; - get_coname(c, o, cof); - char of[200]; - get_oname(o, of); - - int r = ::link(of, cof); - if (onsafe) sync(onsafe); - return r; -} - -int FakeStore::collection_remove(coll_t c, object_t o, - Context *onsafe) -{ - if (fake_collections) return collections.collection_remove(c, o, onsafe); - - char cof[200]; - get_coname(c, o, cof); - - int r = ::unlink(cof); - if (onsafe) sync(onsafe); - return r; -} - -int FakeStore::collection_list(coll_t c, list& ls) -{ - if (fake_collections) return collections.collection_list(c, ls); - - char fn[200]; - get_cdir(c, fn); - - DIR *dir = ::opendir(fn); - assert(dir); - - struct dirent *de; - while ((de = ::readdir(dir)) != 0) { - // parse - if (de->d_name[0] == '.') continue; - //cout << " got object " << de->d_name << std::endl; - object_t o; - assert(sizeof(o) == 16); - *(((uint64_t*)&o) + 0) = strtoll(de->d_name, 0, 16); - assert(de->d_name[16] == '.'); - *(((uint64_t*)&o) + 1) = strtoll(de->d_name+17, 0, 16); - dout(0) << " got " << o << " errno " << errno << " on " << de->d_name << dendl; - if (errno) continue; - ls.push_back(o); - } - - ::closedir(dir); - return 0; -} - -// eof. diff --git a/branches/sage/ebofs2/osd/FakeStore.h b/branches/sage/ebofs2/osd/FakeStore.h deleted file mode 100644 index 5828c27c14d96..0000000000000 --- a/branches/sage/ebofs2/osd/FakeStore.h +++ /dev/null @@ -1,114 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __FAKESTORE_H -#define __FAKESTORE_H - -#include "ObjectStore.h" -#include "common/ThreadPool.h" -#include "common/Mutex.h" - -#include "Fake.h" -//#include "FakeStoreBDBCollections.h" - - -#include -using namespace std; - -#include -using namespace __gnu_cxx; - - -// fake attributes in memory, if we need to. - -class FakeStore : public ObjectStore { - string basedir; - - Mutex synclock; - Cond synccond; - int unsync; - - // fake attrs? - FakeStoreAttrs attrs; - bool fake_attrs; - - // fake collections? - FakeStoreCollections collections; - bool fake_collections; - - // helper fns - void get_oname(object_t oid, char *s); - void get_cdir(coll_t cid, char *s); - void get_coname(coll_t cid, object_t oid, char *s); - - public: - FakeStore(char *base) : - basedir(base), - unsync(0), - attrs(this), fake_attrs(false), - collections(this), fake_collections(false) { } - - int mount(); - int umount(); - int mkfs(); - - int statfs(struct statfs *buf); - - // ------------------ - // objects - int pick_object_revision_lt(object_t& oid) { - return 0; - } - bool exists(object_t oid); - int stat(object_t oid, struct stat *st); - int remove(object_t oid, Context *onsafe); - int truncate(object_t oid, off_t size, Context *onsafe); - int read(object_t oid, off_t offset, size_t len, bufferlist& bl); - int write(object_t oid, off_t offset, size_t len, const bufferlist& bl, Context *onsafe); - - void sync(); - void sync(Context *onsafe); - - int list_objects(list& ls); - - // attrs - int setattr(object_t oid, const char *name, const void *value, size_t size, Context *onsafe=0); - int setattrs(object_t oid, map& aset); - int getattr(object_t oid, const char *name, void *value, size_t size); - int getattrs(object_t oid, map& aset); - int rmattr(object_t oid, const char *name, Context *onsafe=0); - //int listattr(object_t oid, char *attrs, size_t size); - int collection_setattr(coll_t c, const char *name, void *value, size_t size, Context *onsafe=0); - int 
collection_rmattr(coll_t c, const char *name, Context *onsafe=0); - int collection_getattr(coll_t c, const char *name, void *value, size_t size); - //int collection_listattr(coll_t c, char *attrs, size_t size); - int collection_getattrs(coll_t cid, map &aset); - int collection_setattrs(coll_t cid, map &aset); - - // collections - int list_collections(list& ls); - int create_collection(coll_t c, Context *onsafe=0); - int destroy_collection(coll_t c, Context *onsafe=0); - int collection_stat(coll_t c, struct stat *st); - bool collection_exists(coll_t c); - int collection_add(coll_t c, object_t o, Context *onsafe=0); - int collection_remove(coll_t c, object_t o, Context *onsafe=0); - int collection_list(coll_t c, list& o); - - - -}; - -#endif diff --git a/branches/sage/ebofs2/osd/FakeStoreBDBCollections.h b/branches/sage/ebofs2/osd/FakeStoreBDBCollections.h deleted file mode 100644 index a779a2a57972c..0000000000000 --- a/branches/sage/ebofs2/osd/FakeStoreBDBCollections.h +++ /dev/null @@ -1,169 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __FAKESTOREBDBCOLLECTIONS_H -#define __FAKESTOREBDBCOLLECTIONS_H - -#include "BDBMap.h" -#include "ObjectStore.h" -#include "common/Mutex.h" - -#define BDBHASH_DIRS 128LL -#define BDBHASH_FUNC(x) (((x) ^ ((x)>>30) ^ ((x)>>18) ^ ((x)>>45) ^ 0xdead1234) * 884811 % BDBHASH_DIRS) - -class FakeStoreBDBCollections { - private: - int whoami; - string basedir; - - Mutex bdblock; - - // collection dbs - BDBMap collections; - map*> collection_map; - - // dirs - void get_dir(string& dir) { - char s[30]; - sprintf(s, "%d", whoami); - dir = basedir + "/" + s; - } - void get_collfn(coll_t c, string &fn) { - char s[100]; - sprintf(s, "%d/%02llx/%016llx.co", whoami, BDBHASH_FUNC(c), c); - fn = basedir + "/" + s; - } - - void open_collections() { - string cfn; - get_dir(cfn); - cfn += "/collections"; - collections.open(cfn.c_str()); - list ls; - collections.list_keys(ls); - } - void close_collections() { - if (collections.is_open()) - collections.close(); - - for (map*>::iterator it = collection_map.begin(); - it != collection_map.end(); - it++) { - it->second->close(); - } - collection_map.clear(); - } - - int open_collection(coll_t c) { - if (collection_map.count(c)) - return 0; // already open. 
- - string fn; - get_collfn(c,fn); - collection_map[c] = new BDBMap; - int r = collection_map[c]->open(fn.c_str()); - if (r != 0) - collection_map.erase(c); // failed - return r; - } - - public: - FakeStoreBDBCollections(int w, string& bd) : whoami(w), basedir(bd) {} - ~FakeStoreBDBCollections() { - close_collections(); - } - - int list_collections(list& ls) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - ls.clear(); - collections.list_keys(ls); - bdblock.Unlock(); - return 0; - } - int create_collection(coll_t c) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - collections.put(c, 1); - open_collection(c); - bdblock.Unlock(); - return 0; - } - int destroy_collection(coll_t c) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - collections.del(c); - - open_collection(c); - collection_map[c]->close(); - - string fn; - get_collfn(c,fn); - collection_map[c]->remove(fn.c_str()); - delete collection_map[c]; - collection_map.erase(c); - bdblock.Unlock(); - return 0; - } - int collection_stat(coll_t c, struct stat *st) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - string fn; - get_collfn(c,fn); - int r = ::stat(fn.c_str(), st); - bdblock.Unlock(); - return r; - } - bool collection_exists(coll_t c) { - bdblock.Lock(); - struct stat st; - int r = collection_stat(c, &st) == 0; - bdblock.Unlock(); - return r; - } - int collection_add(coll_t c, object_t o) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - open_collection(c); - collection_map[c]->put(o,1); - bdblock.Unlock(); - return 0; - } - int collection_remove(coll_t c, object_t o) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - open_collection(c); - collection_map[c]->del(o); - bdblock.Unlock(); - return 0; - } - int collection_list(coll_t c, list& o) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - open_collection(c); - 
collection_map[c]->list_keys(o); - bdblock.Unlock(); - return 0; - } -}; - -#endif diff --git a/branches/sage/ebofs2/osd/OSD.cc b/branches/sage/ebofs2/osd/OSD.cc deleted file mode 100644 index ab57f0c603302..0000000000000 --- a/branches/sage/ebofs2/osd/OSD.cc +++ /dev/null @@ -1,2377 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "include/types.h" - -#include "OSD.h" -#include "OSDMap.h" - -#include "FakeStore.h" - -#include "ebofs/Ebofs.h" - -#ifdef USE_OSBDB -#include "osbdb/OSBDB.h" -#endif // USE_OSBDB - - -#include "ReplicatedPG.h" -//#include "RAID4PG.h" - -#include "Ager.h" - - -#include "msg/Messenger.h" -#include "msg/Message.h" - -#include "messages/MGenericMessage.h" -#include "messages/MPing.h" -#include "messages/MPingAck.h" -#include "messages/MOSDPing.h" -#include "messages/MOSDFailure.h" -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" -#include "messages/MOSDBoot.h" -#include "messages/MOSDIn.h" -#include "messages/MOSDOut.h" - -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" -#include "messages/MOSDPGNotify.h" -#include "messages/MOSDPGQuery.h" -#include "messages/MOSDPGLog.h" -#include "messages/MOSDPGRemove.h" -#include "messages/MOSDPGActivateSet.h" - -#include "messages/MPGStats.h" - -#include "common/Logger.h" -#include "common/LogType.h" -#include "common/Timer.h" -#include "common/ThreadPool.h" - -#include -#include -#include -#include - - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) *_dout << dbeginl << g_clock.now() << " osd" << whoami << " " << (osdmap ? 
osdmap->get_epoch():0) << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) *_derr << dbeginl << g_clock.now() << " osd" << whoami << " " << (osdmap ? osdmap->get_epoch():0) << " " - -char *osd_base_path = "./osddata"; -char *ebofs_base_path = "./dev"; - -static const object_t SUPERBLOCK_OBJECT(0,0); - -// force remount hack for performance testing FakeStore -class C_Remount : public Context { - OSD *osd; -public: - C_Remount(OSD *o) : osd(o) {} - void finish(int) { - osd->force_remount(); - } -}; - -void OSD::force_remount() -{ - dout(0) << "forcing remount" << dendl; - osd_lock.Lock(); - { - store->umount(); - store->mount(); - } - osd_lock.Unlock(); - dout(0) << "finished remount" << dendl; -} -// - - -// cons/des - -LogType osd_logtype; - -OSD::OSD(int id, Messenger *m, MonMap *mm, char *dev) : - timer(osd_lock), - stat_oprate(5.0), - read_latency_calc(g_conf.osd_max_opq<1 ? 1:g_conf.osd_max_opq), - qlen_calc(3), - iat_averager(g_conf.osd_flash_crowd_iat_alpha) -{ - whoami = id; - messenger = m; - monmap = mm; - - osdmap = 0; - boot_epoch = 0; - - last_tid = 0; - num_pulling = 0; - - state = STATE_BOOTING; - - stat_ops = 0; - stat_qlen = 0; - stat_rd_ops = stat_rd_ops_shed_in = stat_rd_ops_shed_out = 0; - stat_rd_ops_in_queue = 0; - - pending_ops = 0; - waiting_for_no_ops = false; - - if (g_conf.osd_remount_at) - timer.add_event_after(g_conf.osd_remount_at, new C_Remount(this)); - - - // init object store - // try in this order: - // dev/osd$num - // dev/osd.$hostname - // dev/osd.all - - if (dev) { - strcpy(dev_path,dev); - } else { - char hostname[100]; - hostname[0] = 0; - gethostname(hostname,100); - - sprintf(dev_path, "%s/osd%d", ebofs_base_path, whoami); - - struct stat sta; - if (::lstat(dev_path, &sta) != 0) - sprintf(dev_path, "%s/osd.%s", ebofs_base_path, hostname); - - if (::lstat(dev_path, &sta) != 0) - sprintf(dev_path, "%s/osd.all", ebofs_base_path); - } - - if (g_conf.ebofs) { - store = new Ebofs(dev_path); - 
//store->_fake_writes(true); - } -#ifdef USE_OSBDB - else if (g_conf.bdbstore) { - store = new OSBDB(dev_path); - } -#endif // USE_OSBDB - else { - sprintf(dev_path, "osddata/osd%d", whoami); - store = new FakeStore(dev_path); - } - -} - -OSD::~OSD() -{ - if (threadpool) { delete threadpool; threadpool = 0; } - if (osdmap) { delete osdmap; osdmap = 0; } - //if (monitor) { delete monitor; monitor = 0; } - if (messenger) { delete messenger; messenger = 0; } - if (logger) { delete logger; logger = 0; } - if (store) { delete store; store = 0; } -} - -int OSD::init() -{ - Mutex::Locker lock(osd_lock); - - // mkfs? - if (g_conf.osd_mkfs) { - dout(2) << "mkfs on local store" << dendl; - if (store->mkfs() < 0) - return -1; - - // make up a superblock - //superblock.fsid = ???; - superblock.whoami = whoami; - } - - // mount. - dout(2) << "mounting " << dev_path << dendl; - int r = store->mount(); - if (r < 0) return -1; - - if (g_conf.osd_mkfs) { - // age? - if (g_conf.osd_age_time != 0) { - dout(2) << "age" << dendl; - Ager ager(store); - if (g_conf.osd_age_time < 0) - ager.load_freelist(); - else - ager.age(g_conf.osd_age_time, - g_conf.osd_age, - g_conf.osd_age - .05, - 50000, - g_conf.osd_age - .05); - } - - if (g_conf.osd_auto_weight) { - // benchmark - bufferlist bl; - bufferptr bp(1048576); - bp.zero(); - bl.push_back(bp); - utime_t start = g_clock.now(); - for (int i=0; i<1000; i++) - store->write(object_t(999,i), 0, bl.length(), bl, 0); - store->sync(); - utime_t end = g_clock.now(); - end -= start; - dout(0) << "measured " << (1000.0 / (double)end) << " mb/sec" << dendl; - for (int i=0; i<1000; i++) - store->remove(object_t(999,i), 0); - - // set osd weight - superblock.weight = (1000.0 / (double)end); - } - } - else { - dout(2) << "boot" << dendl; - - // read superblock - read_superblock(); - - // load up pgs (as they previously existed) - load_pgs(); - - dout(2) << "superblock: i am osd" << superblock.whoami << dendl; - assert(whoami == superblock.whoami); - } - 
- - - - // log - char name[80]; - sprintf(name, "osd%d", whoami); - logger = new Logger(name, (LogType*)&osd_logtype); - osd_logtype.add_set("opq"); - osd_logtype.add_inc("op"); - osd_logtype.add_inc("c_rd"); - osd_logtype.add_inc("c_rdb"); - osd_logtype.add_inc("c_wr"); - osd_logtype.add_inc("c_wrb"); - - osd_logtype.add_inc("r_push"); - osd_logtype.add_inc("r_pushb"); - osd_logtype.add_inc("r_wr"); - osd_logtype.add_inc("r_wrb"); - - osd_logtype.add_set("qlen"); - osd_logtype.add_set("rqlen"); - osd_logtype.add_set("rdlat"); - osd_logtype.add_set("rdlatm"); - osd_logtype.add_set("fshdin"); - osd_logtype.add_set("fshdout"); - osd_logtype.add_inc("shdout"); - osd_logtype.add_inc("shdin"); - - osd_logtype.add_set("loadavg"); - - osd_logtype.add_inc("rlsum"); - osd_logtype.add_inc("rlnum"); - - osd_logtype.add_set("numpg"); - osd_logtype.add_set("pingset"); - - osd_logtype.add_set("buf"); - - osd_logtype.add_inc("map"); - osd_logtype.add_inc("mapi"); - osd_logtype.add_inc("mapidup"); - osd_logtype.add_inc("mapf"); - osd_logtype.add_inc("mapfdup"); - - // request thread pool - { - char name[80]; - sprintf(name,"osd%d.threadpool", whoami); - threadpool = new ThreadPool(name, g_conf.osd_maxthreads, - static_dequeueop, - this); - } - - // i'm ready! - messenger->set_dispatcher(this); - - // announce to monitor i exist and have booted. 
- int mon = monmap->pick_mon(); - messenger->send_message(new MOSDBoot(messenger->get_myinst(), superblock), monmap->get_inst(mon)); - - // start the heart - timer.add_event_after(g_conf.osd_heartbeat_interval, new C_Heartbeat(this)); - - // and stat beacon - timer.add_event_after(g_conf.osd_pg_stats_interval, new C_Stats(this)); - - return 0; -} - -int OSD::shutdown() -{ - dout(1) << "shutdown" << dendl; - - state = STATE_STOPPING; - - // cancel timers - timer.cancel_all(); - timer.join(); - - // finish ops - wait_for_no_ops(); - - // stop threads - delete threadpool; - threadpool = 0; - - // close pgs - for (hash_map::iterator p = pg_map.begin(); - p != pg_map.end(); - p++) { - delete p->second; - } - pg_map.clear(); - - // shut everything else down - //monitor->shutdown(); - messenger->shutdown(); - - osd_lock.Unlock(); - int r = store->umount(); - osd_lock.Lock(); - return r; -} - - - -void OSD::write_superblock(ObjectStore::Transaction& t) -{ - dout(10) << "write_superblock " << superblock << dendl; - - bufferlist bl; - bl.append((char*)&superblock, sizeof(superblock)); - t.write(SUPERBLOCK_OBJECT, 0, sizeof(superblock), bl); -} - -int OSD::read_superblock() -{ - bufferlist bl; - int r = store->read(SUPERBLOCK_OBJECT, 0, sizeof(superblock), bl); - if (bl.length() != sizeof(superblock)) { - dout(10) << "read_superblock failed, r = " << r << ", i got " << bl.length() << " bytes, not " << sizeof(superblock) << dendl; - return -1; - } - - bl.copy(0, sizeof(superblock), (char*)&superblock); - - dout(10) << "read_superblock " << superblock << dendl; - - // load up "current" osdmap - assert(!osdmap); - osdmap = new OSDMap; - bl.clear(); - get_map_bl(superblock.current_epoch, bl); - osdmap->decode(bl); - - assert(whoami == superblock.whoami); // fixme! 
- return 0; -} - - - - - -// ====================================================== -// PG's - -PG *OSD::_new_lock_pg(pg_t pgid) -{ - // create - PG *pg; - if (pgid.is_rep()) - pg = new ReplicatedPG(this, pgid); - //else if (pgid.is_raid4()) - //pg = new RAID4PG(this, pgid); - else - assert(0); - - assert(pg_map.count(pgid) == 0); - pg_map[pgid] = pg; - - pg->lock(); // always lock. - pg->get(); // because it's in pg_map - return pg; -} - - -PG *OSD::_create_lock_pg(pg_t pgid, ObjectStore::Transaction& t) -{ - dout(10) << "_create_lock_pg " << pgid << dendl; - - if (pg_map.count(pgid)) - dout(0) << "_create_lock_pg on " << pgid << ", already have " << *pg_map[pgid] << dendl; - - // open - PG *pg = _new_lock_pg(pgid); - - // create collection - assert(!store->collection_exists(pgid)); - t.create_collection(pgid); - - return pg; -} - -bool OSD::_have_pg(pg_t pgid) -{ - return pg_map.count(pgid); -} - -PG *OSD::_lookup_lock_pg(pg_t pgid) -{ - assert(pg_map.count(pgid)); - PG *pg = pg_map[pgid]; - pg->lock(); - return pg; -} - - -void OSD::_remove_unlock_pg(PG *pg) -{ - pg_t pgid = pg->info.pgid; - - dout(10) << "_remove_unlock_pg " << pgid << dendl; - - // remove from store - list olist; - store->collection_list(pgid, olist); - - ObjectStore::Transaction t; - { - for (list::iterator p = olist.begin(); - p != olist.end(); - p++) - t.remove(*p); - t.remove_collection(pgid); - t.remove(pgid.to_object()); // log too - } - store->apply_transaction(t); - - // mark deleted - pg->mark_deleted(); - - // remove from map - pg_map.erase(pgid); - - // unlock, and probably delete - pg->put_unlock(); // will delete, if last reference -} - - -void OSD::try_create_pg(pg_t pgid, ObjectStore::Transaction& t) -{ - vector acting; - int nrep = osdmap->pg_to_acting_osds(pgid, acting); - int role = osdmap->calc_pg_role(whoami, acting, nrep); - if (role < 0) return; - - PG *pg = _create_lock_pg(pgid, t); - pg->set_role(role); - pg->acting.swap(acting); - pg->last_epoch_started_any = - 
pg->info.last_epoch_started = - pg->info.history.same_since = - pg->info.history.same_primary_since = - pg->info.history.same_acker_since = osdmap->get_epoch(); - pg->write_log(t); - if (g_conf.osd_hack_fast_startup) - pg->activate(t); - - dout(7) << "created " << *pg << dendl; - pg->unlock(); -} - -void OSD::load_pgs() -{ - dout(10) << "load_pgs" << dendl; - assert(pg_map.empty()); - - list ls; - store->list_collections(ls); - - for (list::iterator it = ls.begin(); - it != ls.end(); - it++) { - pg_t pgid = *it; - PG *pg = _new_lock_pg(pgid); - - // read pg info - store->collection_getattr(pgid, "info", &pg->info, sizeof(pg->info)); - - // read pg log - pg->read_log(store); - - // generate state for current mapping - int nrep = osdmap->pg_to_acting_osds(pgid, pg->acting); - int role = osdmap->calc_pg_role(whoami, pg->acting, nrep); - pg->set_role(role); - - dout(10) << "load_pgs loaded " << *pg << " " << pg->log << dendl; - pg->unlock(); - } -} - - - -/** - * check epochs starting from start to verify the pg acting set hasn't changed - * up until now - */ -void OSD::project_pg_history(pg_t pgid, PG::Info::History& h, epoch_t from, - vector& last) -{ - dout(15) << "project_pg_history " << pgid - << " from " << from << " to " << osdmap->get_epoch() - << ", start " << h - << dendl; - - for (epoch_t e = osdmap->get_epoch()-1; - e >= from; - e--) { - // verify during intermediate epoch - OSDMap oldmap; - get_map(e, oldmap); - - vector acting; - oldmap.pg_to_acting_osds(pgid, acting); - - // acting set change? - if (acting != last && - e > h.same_since) { - dout(15) << "project_pg_history " << pgid << " changed in " << e+1 - << " from " << acting << " -> " << last << dendl; - h.same_since = e+1; - } - - // primary change? 
- if (!(!acting.empty() && !last.empty() && acting[0] == last[0]) && - e > h.same_primary_since) { - dout(15) << "project_pg_history " << pgid << " primary changed in " << e+1 << dendl; - h.same_primary_since = e+1; - - if (g_conf.osd_rep == OSD_REP_PRIMARY) - h.same_acker_since = h.same_primary_since; - } - - // acker change? - if (g_conf.osd_rep != OSD_REP_PRIMARY) { - if (!(!acting.empty() && !last.empty() && acting[acting.size()-1] == last[last.size()-1]) && - e > h.same_acker_since) { - dout(15) << "project_pg_history " << pgid << " acker changed in " << e+1 << dendl; - h.same_acker_since = e+1; - } - } - - if (h.same_since > e && - h.same_primary_since > e && - h.same_acker_since > e) break; - } - - dout(15) << "project_pg_history end " << h << dendl; -} - -void OSD::activate_pg(pg_t pgid, epoch_t epoch) -{ - osd_lock.Lock(); - { - if (pg_map.count(pgid)) { - PG *pg = _lookup_lock_pg(pgid); - if (pg->is_crashed() && - pg->is_replay() && - pg->get_role() == 0 && - pg->info.history.same_primary_since <= epoch) { - ObjectStore::Transaction t; - pg->activate(t); - store->apply_transaction(t); - } - pg->unlock(); - } - } - - // finishers? - finished_lock.Lock(); - if (finished.empty()) { - finished_lock.Unlock(); - osd_lock.Unlock(); - } else { - list waiting; - waiting.splice(waiting.begin(), finished); - - finished_lock.Unlock(); - osd_lock.Unlock(); - - for (list::iterator it = waiting.begin(); - it != waiting.end(); - it++) { - dispatch(*it); - } - } -} - - -// ------------------------------------- - -void OSD::_refresh_my_stat(utime_t now) -{ - assert(peer_stat_lock.is_locked()); - - // refresh? - if (now - my_stat.stamp > g_conf.osd_stat_refresh_interval || - pending_ops > 2*my_stat.qlen) { - - my_stat.stamp = now; - my_stat.oprate = stat_oprate.get(now); - - //read_latency_calc.set_size( 20 ); // hrm. 
- - // qlen - my_stat.qlen = 0; - if (stat_ops) my_stat.qlen = (float)stat_qlen / (float)stat_ops; //get_average(); - - // rd ops shed in - float frac_rd_ops_shed_in = 0; - float frac_rd_ops_shed_out = 0; - if (stat_rd_ops) { - frac_rd_ops_shed_in = (float)stat_rd_ops_shed_in / (float)stat_rd_ops; - frac_rd_ops_shed_out = (float)stat_rd_ops_shed_out / (float)stat_rd_ops; - } - my_stat.frac_rd_ops_shed_in = (my_stat.frac_rd_ops_shed_in + frac_rd_ops_shed_in) / 2.0; - my_stat.frac_rd_ops_shed_out = (my_stat.frac_rd_ops_shed_out + frac_rd_ops_shed_out) / 2.0; - - // recent_qlen - qlen_calc.add(my_stat.qlen); - my_stat.recent_qlen = qlen_calc.get_average(); - - // read latency - if (stat_rd_ops) { - my_stat.read_latency = read_latency_calc.get_average(); - if (my_stat.read_latency < 0) my_stat.read_latency = 0; - } else { - my_stat.read_latency = 0; - } - - my_stat.read_latency_mine = my_stat.read_latency * (1.0 - frac_rd_ops_shed_in); - - logger->fset("qlen", my_stat.qlen); - logger->fset("rqlen", my_stat.recent_qlen); - logger->fset("rdlat", my_stat.read_latency); - logger->fset("rdlatm", my_stat.read_latency_mine); - logger->fset("fshdin", my_stat.frac_rd_ops_shed_in); - logger->fset("fshdout", my_stat.frac_rd_ops_shed_out); - dout(12) << "_refresh_my_stat " << my_stat << dendl; - - stat_rd_ops = 0; - stat_rd_ops_shed_in = 0; - stat_rd_ops_shed_out = 0; - stat_ops = 0; - stat_qlen = 0; - } -} - -osd_peer_stat_t OSD::get_my_stat_for(utime_t now, int peer) -{ - Mutex::Locker lock(peer_stat_lock); - _refresh_my_stat(now); - my_stat_on_peer[peer] = my_stat; - return my_stat; -} - -void OSD::take_peer_stat(int peer, const osd_peer_stat_t& stat) -{ - Mutex::Locker lock(peer_stat_lock); - dout(10) << "take_peer_stat peer osd" << peer << " " << stat << dendl; - peer_stat[peer] = stat; -} - -void OSD::heartbeat() -{ - utime_t now = g_clock.now(); - utime_t since = now; - since.sec_ref() -= g_conf.osd_heartbeat_interval; - - // get CPU load avg - ifstream in("/proc/loadavg"); 
- if (in.is_open()) { - float oneminavg; - in >> oneminavg; - logger->fset("loadavg", oneminavg); - in.close(); - } - - // calc my stats - Mutex::Locker lock(peer_stat_lock); - _refresh_my_stat(now); - - dout(5) << "heartbeat: " << my_stat << dendl; - - //load_calc.set_size(stat_ops); - - // send pings - set pingset; - for (hash_map::iterator i = pg_map.begin(); - i != pg_map.end(); - i++) { - PG *pg = i->second; - - // we want to ping the primary. - if (pg->get_role() <= 0) continue; - if (pg->acting.size() < 1) continue; - - if (pg->last_heartbeat < since) { - pg->last_heartbeat = now; - pingset.insert(pg->acting[0]); - } - } - my_stat_on_peer.clear(); - for (set::iterator i = pingset.begin(); - i != pingset.end(); - i++) { - _share_map_outgoing( osdmap->get_inst(*i) ); - my_stat_on_peer[*i] = my_stat; - messenger->send_message(new MOSDPing(osdmap->get_epoch(), my_stat), - osdmap->get_inst(*i)); - } - - if (logger) logger->set("pingset", pingset.size()); - - // hack: fake reorg? - if (osdmap && g_conf.fake_osdmap_updates) { - int mon = monmap->pick_mon(); - if ((rand() % g_conf.fake_osdmap_updates) == 0) { - //if ((rand() % (g_conf.num_osd / g_conf.fake_osdmap_updates)) == whoami / g_conf.fake_osdmap_updates) { - messenger->send_message(new MOSDIn(osdmap->get_epoch()), - monmap->get_inst(mon)); - } - /* - if (osdmap->is_out(whoami)) { - messenger->send_message(new MOSDIn(osdmap->get_epoch()), - MSG_ADDR_MON(mon), monmap->get_inst(mon)); - } - else if ((rand() % g_conf.fake_osdmap_updates) == 0) { - //messenger->send_message(new MOSDOut(osdmap->get_epoch()), - //MSG_ADDR_MON(mon), monmap->get_inst(mon)); - } - } - */ - } - - // schedule next! randomly. 
- float wait = .5 + ((float)(rand() % 10)/10.0) * (float)g_conf.osd_heartbeat_interval; - timer.add_event_after(wait, new C_Heartbeat(this)); -} - - - -void OSD::send_pg_stats() -{ - //dout(-10) << "send_pg_stats" << dendl; - - // grab queue - set q; - pg_stat_queue_lock.Lock(); - q.swap(pg_stat_queue); - pg_stat_queue_lock.Unlock(); - - if (!q.empty()) { - dout(1) << "send_pg_stats - " << q.size() << " pgs updated" << dendl; - - MPGStats *m = new MPGStats; - while (!q.empty()) { - pg_t pgid = *q.begin(); - q.erase(q.begin()); - - if (!pg_map.count(pgid)) continue; - PG *pg = pg_map[pgid]; - pg->pg_stats_lock.Lock(); - m->pg_stat[pgid] = pg->pg_stats; - dout(20) << " sending " << pgid << " " << pg->pg_stats.state << dendl; - pg->pg_stats_lock.Unlock(); - } - - // fill in osd stats too - struct statfs stbuf; - store->statfs(&stbuf); - m->osd_stat.num_blocks = stbuf.f_blocks; - m->osd_stat.num_blocks_avail = stbuf.f_bavail; - m->osd_stat.num_objects = stbuf.f_files; - - int mon = monmap->pick_mon(); - messenger->send_message(m, monmap->get_inst(mon)); - } - - // reschedule - timer.add_event_after(g_conf.osd_pg_stats_interval, new C_Stats(this)); -} - - - - -// -------------------------------------- -// dispatch - -bool OSD::_share_map_incoming(const entity_inst_t& inst, epoch_t epoch) -{ - bool shared = false; - dout(20) << "_share_map_incoming " << inst << " " << epoch << dendl; - assert(osd_lock.is_locked()); - - // does client have old map? - if (inst.name.is_client()) { - if (epoch < osdmap->get_epoch()) { - dout(10) << inst.name << " has old map " << epoch << " < " << osdmap->get_epoch() << dendl; - send_incremental_map(epoch, inst, true); - shared = true; - } - } - - // does peer have old map? - if (inst.name.is_osd()) { - // remember - if (peer_map_epoch[inst.name] < epoch) { - dout(20) << "peer " << inst.name << " has " << epoch << dendl; - peer_map_epoch[inst.name] = epoch; - } - - // older? 
- if (peer_map_epoch[inst.name] < osdmap->get_epoch()) { - dout(10) << inst.name << " has old map " << epoch << " < " << osdmap->get_epoch() << dendl; - send_incremental_map(epoch, inst, true); - peer_map_epoch[inst.name] = osdmap->get_epoch(); // so we don't send it again. - shared = true; - } - } - - return shared; -} - - -void OSD::_share_map_outgoing(const entity_inst_t& inst) -{ - assert(inst.name.is_osd()); - - if (inst.name.is_osd()) { - // send map? - if (peer_map_epoch.count(inst.name)) { - epoch_t pe = peer_map_epoch[inst.name]; - if (pe < osdmap->get_epoch()) { - send_incremental_map(pe, inst, true); - peer_map_epoch[inst.name] = osdmap->get_epoch(); - } - } else { - // no idea about peer's epoch. - // ??? send recent ??? - // do nothing. - } - } -} - - - -void OSD::dispatch(Message *m) -{ - // lock! - osd_lock.Lock(); - dout(20) << "dispatch " << m << dendl; - - switch (m->get_type()) { - - // -- don't need lock -- - case MSG_PING: - dout(10) << "ping from " << m->get_source() << dendl; - delete m; - break; - - // -- don't need OSDMap -- - - // map and replication - case MSG_OSD_MAP: - handle_osd_map((MOSDMap*)m); - break; - - // osd - case MSG_SHUTDOWN: - shutdown(); - delete m; - break; - - - - // -- need OSDMap -- - - default: - { - // no map? starting up? - if (!osdmap) { - dout(7) << "no OSDMap, not booted" << dendl; - waiting_for_osdmap.push_back(m); - break; - } - - // down? - if (osdmap->is_down(whoami)) { - dout(7) << "i am marked down, dropping " << *m << dendl; - delete m; - break; - } - - - - - // need OSDMap - switch (m->get_type()) { - - case MSG_OSD_PING: - // take note. 
- handle_osd_ping((MOSDPing*)m); - break; - - case MSG_OSD_PG_NOTIFY: - handle_pg_notify((MOSDPGNotify*)m); - break; - case MSG_OSD_PG_QUERY: - handle_pg_query((MOSDPGQuery*)m); - break; - case MSG_OSD_PG_LOG: - handle_pg_log((MOSDPGLog*)m); - break; - case MSG_OSD_PG_REMOVE: - handle_pg_remove((MOSDPGRemove*)m); - break; - case MSG_OSD_PG_ACTIVATE_SET: - handle_pg_activate_set((MOSDPGActivateSet*)m); - break; - - case MSG_OSD_OP: - handle_op((MOSDOp*)m); - break; - - // for replication etc. - case MSG_OSD_OPREPLY: - handle_op_reply((MOSDOpReply*)m); - break; - - - default: - dout(1) << " got unknown message " << m->get_type() << dendl; - assert(0); - } - } - } - - // finishers? - finished_lock.Lock(); - if (!finished.empty()) { - list waiting; - waiting.splice(waiting.begin(), finished); - - finished_lock.Unlock(); - osd_lock.Unlock(); - - while (!waiting.empty()) { - dout(20) << "doing finished " << waiting.front() << dendl; - dispatch(waiting.front()); - waiting.pop_front(); - } - return; - } - - finished_lock.Unlock(); - osd_lock.Unlock(); -} - - -void OSD::ms_handle_failure(Message *m, const entity_inst_t& inst) -{ - entity_name_t dest = inst.name; - - if (g_conf.ms_die_on_failure) { - dout(0) << "ms_handle_failure " << inst << " on " << *m << dendl; - exit(0); - } - - if (is_stopping()) { - delete m; - return; - } - - if (dest.is_osd()) { - // failed osd. drop message, report to mon. - int mon = monmap->pick_mon(); - dout(1) << "ms_handle_failure " << inst - << ", dropping and reporting to mon" << mon - << " " << *m - << dendl; - messenger->send_message(new MOSDFailure(messenger->get_myinst(), inst, osdmap->get_epoch()), - monmap->get_inst(mon)); - delete m; - } else if (dest.is_mon()) { - // resend to a different monitor. - int mon = monmap->pick_mon(true); - dout(1) << "ms_handle_failure " << inst - << ", resending to mon" << mon - << " " << *m - << dendl; - messenger->send_message(m, monmap->get_inst(mon)); - } - else { - // client? 
- dout(1) << "ms_handle_failure " << inst - << ", dropping " << *m << dendl; - delete m; - } -} - - - - -void OSD::handle_osd_ping(MOSDPing *m) -{ - dout(20) << "osdping from " << m->get_source() << " got stat " << m->peer_stat << dendl; - - _share_map_incoming(m->get_source_inst(), ((MOSDPing*)m)->map_epoch); - - int from = m->get_source().num(); - take_peer_stat(from, m->peer_stat); - - delete m; -} - - - - -// ===================================================== -// MAP - -void OSD::wait_for_new_map(Message *m) -{ - // ask - if (waiting_for_osdmap.empty()) { - int mon = monmap->pick_mon(); - messenger->send_message(new MOSDGetMap(osdmap->get_epoch()+1), - monmap->get_inst(mon)); - } - - waiting_for_osdmap.push_back(m); -} - - -/** update_map - * assimilate new OSDMap(s). scan pgs, etc. - */ -void OSD::handle_osd_map(MOSDMap *m) -{ - wait_for_no_ops(); - - assert(osd_lock.is_locked()); - - ObjectStore::Transaction t; - - if (osdmap) { - dout(3) << "handle_osd_map epochs [" - << m->get_first() << "," << m->get_last() - << "], i have " << osdmap->get_epoch() - << dendl; - } else { - dout(3) << "handle_osd_map epochs [" - << m->get_first() << "," << m->get_last() - << "], i have none" - << dendl; - osdmap = new OSDMap; - boot_epoch = m->get_last(); // hrm...? - } - - logger->inc("mapmsg"); - - // store them? - for (map::iterator p = m->maps.begin(); - p != m->maps.end(); - p++) { - object_t oid = get_osdmap_object_name(p->first); - if (store->exists(oid)) { - dout(10) << "handle_osd_map already had full map epoch " << p->first << dendl; - logger->inc("mapfdup"); - bufferlist bl; - get_map_bl(p->first, bl); - dout(10) << " .. it is " << bl.length() << " bytes" << dendl; - continue; - } - - dout(10) << "handle_osd_map got full map epoch " << p->first << dendl; - store->write(oid, 0, p->second.length(), p->second, 0); // store _outside_ transaction; activate_map reads it. 
- - if (p->first > superblock.newest_map) - superblock.newest_map = p->first; - if (p->first < superblock.oldest_map || - superblock.oldest_map == 0) - superblock.oldest_map = p->first; - - logger->inc("mapf"); - } - for (map::iterator p = m->incremental_maps.begin(); - p != m->incremental_maps.end(); - p++) { - object_t oid = get_inc_osdmap_object_name(p->first); - if (store->exists(oid)) { - dout(10) << "handle_osd_map already had incremental map epoch " << p->first << dendl; - logger->inc("mapidup"); - bufferlist bl; - get_inc_map_bl(p->first, bl); - dout(10) << " .. it is " << bl.length() << " bytes" << dendl; - continue; - } - - dout(10) << "handle_osd_map got incremental map epoch " << p->first << dendl; - store->write(oid, 0, p->second.length(), p->second, 0); // store _outside_ transaction; activate_map reads it. - - if (p->first > superblock.newest_map) - superblock.newest_map = p->first; - if (p->first < superblock.oldest_map || - superblock.oldest_map == 0) - superblock.oldest_map = p->first; - - logger->inc("mapi"); - } - - // advance if we can - bool advanced = false; - - epoch_t cur = superblock.current_epoch; - while (cur < superblock.newest_map) { - dout(10) << "cur " << cur << " < newest " << superblock.newest_map << dendl; - - if (m->incremental_maps.count(cur+1) || - store->exists(get_inc_osdmap_object_name(cur+1))) { - dout(10) << "handle_osd_map decoding inc map epoch " << cur+1 << dendl; - - bufferlist bl; - if (m->incremental_maps.count(cur+1)) { - dout(10) << " using provided inc map" << dendl; - bl = m->incremental_maps[cur+1]; - } else { - dout(10) << " using my locally stored inc map" << dendl; - get_inc_map_bl(cur+1, bl); - } - - OSDMap::Incremental inc; - int off = 0; - inc.decode(bl, off); - - osdmap->apply_incremental(inc); - - // archive the full map - bl.clear(); - osdmap->encode(bl); - t.write( get_osdmap_object_name(cur+1), 0, bl.length(), bl); - - // notify messenger - for (map >::iterator i = inc.new_down.begin(); - i != 
inc.new_down.end(); - i++) { - int osd = i->first; - if (osd == whoami) continue; - messenger->mark_down(i->second.first.addr); - peer_map_epoch.erase(i->second.first.name); - - // kick any replica ops - for (hash_map::iterator it = pg_map.begin(); - it != pg_map.end(); - it++) { - PG *pg = it->second; - - pg->lock(); - pg->note_failed_osd(osd); - pg->unlock(); - } - } - for (map::iterator i = inc.new_up.begin(); - i != inc.new_up.end(); - i++) { - if (i->first == whoami) continue; - peer_map_epoch.erase(i->second.name); - } - } - else if (m->maps.count(cur+1) || - store->exists(get_osdmap_object_name(cur+1))) { - dout(10) << "handle_osd_map decoding full map epoch " << cur+1 << dendl; - bufferlist bl; - if (m->maps.count(cur+1)) - bl = m->maps[cur+1]; - else - get_map_bl(cur+1, bl); - osdmap->decode(bl); - - // FIXME BUG: need to notify messenger of ups/downs!! - } - else { - dout(10) << "handle_osd_map missing epoch " << cur+1 << dendl; - int mon = monmap->pick_mon(); - messenger->send_message(new MOSDGetMap(cur+1), monmap->get_inst(mon)); - break; - } - - cur++; - superblock.current_epoch = cur; - advance_map(t); - advanced = true; - } - - // all the way? - if (advanced && cur == superblock.newest_map) { - // yay! - activate_map(t); - - // process waiters - take_waiters(waiting_for_osdmap); - } - - // write updated pg state to store - for (hash_map::iterator i = pg_map.begin(); - i != pg_map.end(); - i++) { - pg_t pgid = i->first; - PG *pg = i->second; - t.collection_setattr( pgid, "info", &pg->info, sizeof(pg->info)); - } - - // superblock and commit - write_superblock(t); - store->apply_transaction(t); - - //if (osdmap->get_epoch() == 1) store->sync(); // in case of early death, blah - - delete m; -} - - -/** - * scan placement groups, initiate any replication - * activities. 
- */ -void OSD::advance_map(ObjectStore::Transaction& t) -{ - dout(7) << "advance_map epoch " << osdmap->get_epoch() - << " " << pg_map.size() << " pgs" - << dendl; - - if (osdmap->is_mkfs()) { - ps_t numps = osdmap->get_pg_num(); - ps_t numlps = osdmap->get_localized_pg_num(); - dout(1) << "mkfs on " << numps << " normal, " << numlps << " localized pg sets" << dendl; - int minrep = 1; - int maxrep = MIN(g_conf.num_osd, g_conf.osd_max_rep); - int minraid = g_conf.osd_min_raid_width; - int maxraid = g_conf.osd_max_raid_width; - dout(1) << "mkfs " << minrep << ".." << maxrep << " replicas, " - << minraid << ".." << maxraid << " osd raid groups" << dendl; - - //derr(0) << "osdmap " << osdmap->get_ctime() << " logger start " << logger->get_start() << dendl; - logger->set_start( osdmap->get_ctime() ); - - assert(g_conf.osd_mkfs); // make sure we did a mkfs! - - // create PGs - // replicated - for (int nrep = 1; nrep <= maxrep; nrep++) { - for (ps_t ps = 0; ps < numps; ++ps) - try_create_pg(pg_t(pg_t::TYPE_REP, nrep, ps, -1), t); - for (ps_t ps = 0; ps < numlps; ++ps) - try_create_pg(pg_t(pg_t::TYPE_REP, nrep, ps, whoami), t); - } - - // raided - /* - for (int size = minraid; size <= maxraid; size++) { - for (ps_t ps = 0; ps < numps; ++ps) - try_create_pg(pg_t(pg_t::TYPE_RAID4, size, ps, -1), t); - for (ps_t ps = 0; ps < numlps; ++ps) - try_create_pg(pg_t(pg_t::TYPE_RAID4, size, ps, whoami), t); - } - */ - dout(1) << "mkfs done, created " << pg_map.size() << " pgs" << dendl; - - } else { - // scan existing pg's - for (hash_map::iterator it = pg_map.begin(); - it != pg_map.end(); - it++) { - pg_t pgid = it->first; - PG *pg = it->second; - - // did i finish this epoch? - if (pg->is_active()) { - pg->info.last_epoch_finished = osdmap->get_epoch()-1; - } - - // get new acting set - vector tacting; - int nrep = osdmap->pg_to_acting_osds(pgid, tacting); - int role = osdmap->calc_pg_role(whoami, tacting, nrep); - - // no change? 
- if (tacting == pg->acting) - continue; - - // -- there was a change! -- - pg->lock(); - - int oldrole = pg->get_role(); - int oldprimary = pg->get_primary(); - int oldacker = pg->get_acker(); - vector oldacting = pg->acting; - - // update PG - pg->acting.swap(tacting); - pg->set_role(role); - - // did primary|acker change? - pg->info.history.same_since = osdmap->get_epoch(); - if (oldprimary != pg->get_primary()) { - pg->info.history.same_primary_since = osdmap->get_epoch(); - pg->cancel_recovery(); - } - if (oldacker != pg->get_acker()) { - pg->info.history.same_acker_since = osdmap->get_epoch(); - } - - // deactivate. - pg->state_clear(PG::STATE_ACTIVE); - - // reset primary state? - if (oldrole == 0 || pg->get_role() == 0) - pg->clear_primary_state(); - - // apply any repops in progress. - if (oldacker == whoami) { - pg->on_acker_change(); - } - - if (role != oldrole) { - // old primary? - if (oldrole == 0) { - pg->state_clear(PG::STATE_CLEAN); - - // take replay queue waiters - list ls; - for (map::iterator it = pg->replay_queue.begin(); - it != pg->replay_queue.end(); - it++) - ls.push_back(it->second); - pg->replay_queue.clear(); - take_waiters(ls); - - // take active waiters - take_waiters(pg->waiting_for_active); - - pg->on_role_change(); - } - - // new primary? - if (role == 0) { - // i am new primary - pg->state_clear(PG::STATE_STRAY); - } else { - // i am now replica|stray. we need to send a notify. - pg->state_set(PG::STATE_STRAY); - - if (nrep == 0) { - // did they all shut down cleanly? - bool clean = true; - vector inset; - osdmap->pg_to_osds(pg->info.pgid, inset); - for (unsigned i=0; iis_down_clean(inset[i])) clean = false; - if (clean) { - dout(1) << *pg << " is cleanly inactive" << dendl; - } else { - pg->state_set(PG::STATE_CRASHED); - dout(1) << *pg << " is crashed" << dendl; - } - } - } - - // my role changed. 
- dout(10) << *pg << " " << oldacting << " -> " << pg->acting - << ", role " << oldrole << " -> " << role << dendl; - - } else { - // no role change. - // did primary change? - if (pg->get_primary() != oldprimary) { - // we need to announce - pg->state_set(PG::STATE_STRAY); - - dout(10) << *pg << " " << oldacting << " -> " << pg->acting - << ", acting primary " - << oldprimary << " -> " << pg->get_primary() - << dendl; - } else { - // primary is the same. - if (role == 0) { - // i am (still) primary. but my replica set changed. - pg->state_clear(PG::STATE_CLEAN); - pg->state_clear(PG::STATE_REPLAY); - - dout(10) << *pg << " " << oldacting << " -> " << pg->acting - << ", replicas changed" << dendl; - } - } - } - - pg->unlock(); - } - } -} - -void OSD::activate_map(ObjectStore::Transaction& t) -{ - dout(7) << "activate_map version " << osdmap->get_epoch() << dendl; - - map< int, list > notify_list; // primary -> list - map< int, map > query_map; // peer -> PG -> get_summary_since - map activator_map; // peer -> message - - // scan pg's - for (hash_map::iterator it = pg_map.begin(); - it != pg_map.end(); - it++) { - //pg_t pgid = it->first; - PG *pg = it->second; - pg->lock(); - if (pg->is_active()) { - // update started counter - pg->info.last_epoch_started = osdmap->get_epoch(); - } - else if (pg->get_role() == 0 && !pg->is_active()) { - // i am (inactive) primary - pg->build_prior(); - pg->peer(t, query_map, &activator_map); - } - else if (pg->is_stray() && - pg->get_primary() >= 0) { - // i am residual|replica - notify_list[pg->get_primary()].push_back(pg->info); - } - if (pg->is_primary()) - pg->update_stats(); - pg->unlock(); - } - - if (g_conf.osd_hack_fast_startup && - osdmap->is_mkfs()) // hack: skip the queries/summaries if it's a mkfs - return; - - do_notifies(notify_list); // notify? 
(residual|replica) - do_queries(query_map); - do_activators(activator_map); - - logger->set("numpg", pg_map.size()); -} - - -void OSD::send_incremental_map(epoch_t since, const entity_inst_t& inst, bool full) -{ - dout(10) << "send_incremental_map " << since << " -> " << osdmap->get_epoch() - << " to " << inst << dendl; - - MOSDMap *m = new MOSDMap; - - for (epoch_t e = osdmap->get_epoch(); - e > since; - e--) { - bufferlist bl; - if (get_inc_map_bl(e,bl)) { - m->incremental_maps[e].claim(bl); - } else if (get_map_bl(e,bl)) { - m->maps[e].claim(bl); - if (!full) break; - } - else { - assert(0); // we should have all maps. - } - } - - messenger->send_message(m, inst); -} - -bool OSD::get_map_bl(epoch_t e, bufferlist& bl) -{ - return store->read(get_osdmap_object_name(e), 0, 0, bl) >= 0; -} - -bool OSD::get_inc_map_bl(epoch_t e, bufferlist& bl) -{ - return store->read(get_inc_osdmap_object_name(e), 0, 0, bl) >= 0; -} - -void OSD::get_map(epoch_t epoch, OSDMap &m) -{ - // find a complete map - list incs; - epoch_t e; - for (e = epoch; e > 0; e--) { - bufferlist bl; - if (get_map_bl(e, bl)) { - //dout(10) << "get_map " << epoch << " full " << e << dendl; - m.decode(bl); - break; - } else { - OSDMap::Incremental inc; - bool got = get_inc_map(e, inc); - assert(got); - incs.push_front(inc); - } - } - assert(e >= 0); - - // apply incrementals - for (e++; e <= epoch; e++) { - //dout(10) << "get_map " << epoch << " inc " << e << dendl; - m.apply_incremental( incs.front() ); - incs.pop_front(); - } -} - - -bool OSD::get_inc_map(epoch_t e, OSDMap::Incremental &inc) -{ - bufferlist bl; - if (!get_inc_map_bl(e, bl)) - return false; - int off = 0; - inc.decode(bl, off); - return true; -} - - - - - -bool OSD::require_current_map(Message *m, epoch_t ep) -{ - // older map? - if (ep < osdmap->get_epoch()) { - dout(7) << "require_current_map epoch " << ep << " < " << osdmap->get_epoch() << dendl; - delete m; // discard and ignore. - return false; - } - - // newer map? 
- if (ep > osdmap->get_epoch()) { - dout(7) << "require_current_map epoch " << ep << " > " << osdmap->get_epoch() << dendl; - wait_for_new_map(m); - return false; - } - - assert(ep == osdmap->get_epoch()); - return true; -} - - -/* - * require that we have same (or newer) map, and that - * the source is the pg primary. - */ -bool OSD::require_same_or_newer_map(Message *m, epoch_t epoch) -{ - dout(15) << "require_same_or_newer_map " << epoch << " (i am " << osdmap->get_epoch() << ") " << m << dendl; - - // newer map? - if (epoch > osdmap->get_epoch()) { - dout(7) << "waiting for newer map epoch " << epoch << " > my " << osdmap->get_epoch() << " with " << m << dendl; - wait_for_new_map(m); - return false; - } - - if (epoch < boot_epoch) { - dout(7) << "from pre-boot epoch " << epoch << " < " << boot_epoch << dendl; - delete m; - return false; - } - - return true; -} - - - - - -/** do_notifies - * Send an MOSDPGNotify to a primary, with a list of PGs that I have - * content for, and they are primary for. 
- */ - -void OSD::do_notifies(map< int, list >& notify_list) -{ - for (map< int, list >::iterator it = notify_list.begin(); - it != notify_list.end(); - it++) { - if (it->first == whoami) { - dout(7) << "do_notify osd" << it->first << " is self, skipping" << dendl; - continue; - } - dout(7) << "do_notify osd" << it->first << " on " << it->second.size() << " PGs" << dendl; - MOSDPGNotify *m = new MOSDPGNotify(osdmap->get_epoch(), it->second); - _share_map_outgoing(osdmap->get_inst(it->first)); - messenger->send_message(m, osdmap->get_inst(it->first)); - } -} - - -/** do_queries - * send out pending queries for info | summaries - */ -void OSD::do_queries(map< int, map >& query_map) -{ - for (map< int, map >::iterator pit = query_map.begin(); - pit != query_map.end(); - pit++) { - int who = pit->first; - dout(7) << "do_queries querying osd" << who - << " on " << pit->second.size() << " PGs" << dendl; - - MOSDPGQuery *m = new MOSDPGQuery(osdmap->get_epoch(), - pit->second); - _share_map_outgoing(osdmap->get_inst(who)); - messenger->send_message(m, osdmap->get_inst(who)); - } -} - - -void OSD::do_activators(map& activator_map) -{ - for (map::iterator p = activator_map.begin(); - p != activator_map.end(); - ++p) - messenger->send_message(p->second, osdmap->get_inst(p->first)); - activator_map.clear(); -} - - - - - -/** PGNotify - * from non-primary to primary - * includes PG::Info. - * NOTE: called with opqueue active. - */ -void OSD::handle_pg_notify(MOSDPGNotify *m) -{ - dout(7) << "handle_pg_notify from " << m->get_source() << dendl; - int from = m->get_source().num(); - - if (!require_same_or_newer_map(m, m->get_epoch())) return; - - ObjectStore::Transaction t; - - // look for unknown PGs i'm primary for - map< int, map > query_map; - map activator_map; - - for (list::iterator it = m->get_pg_list().begin(); - it != m->get_pg_list().end(); - it++) { - pg_t pgid = it->pgid; - PG *pg; - - if (!_have_pg(pgid)) { - // same primary? 
- vector acting; - int nrep = osdmap->pg_to_acting_osds(pgid, acting); - int role = osdmap->calc_pg_role(whoami, acting, nrep); - - PG::Info::History history = it->history; - project_pg_history(pgid, history, m->get_epoch(), acting); - - if (m->get_epoch() < history.same_primary_since) { - dout(10) << "handle_pg_notify pg " << pgid << " dne, and primary changed in " - << history.same_primary_since << " (msg from " << m->get_epoch() << ")" << dendl; - continue; - } - - assert(role == 0); // otherwise, probably bug in project_pg_history. - - // ok, create PG! - pg = _create_lock_pg(pgid, t); - pg->acting.swap(acting); - pg->set_role(role); - pg->info.history = history; - pg->last_epoch_started_any = it->last_epoch_started; - pg->clear_primary_state(); // yep, notably, set hml=false - pg->build_prior(); - pg->write_log(t); - - dout(10) << *pg << " is new" << dendl; - - // kick any waiters - if (waiting_for_pg.count(pgid)) { - take_waiters(waiting_for_pg[pgid]); - waiting_for_pg.erase(pgid); - } - } else { - // already had it. am i (still) the primary? - pg = _lookup_lock_pg(pgid); - if (m->get_epoch() < pg->info.history.same_primary_since) { - dout(10) << *pg << " handle_pg_notify primary changed in " - << pg->info.history.same_primary_since - << " (msg from " << m->get_epoch() << ")" << dendl; - pg->unlock(); - continue; - } - } - - // ok! - - // stray? - bool acting = pg->is_acting(from); - if (!acting && (*it).last_epoch_started > 0) { - dout(10) << *pg << " osd" << from << " has stray content: " << *it << dendl; - pg->stray_set.insert(from); - pg->state_clear(PG::STATE_CLEAN); - } - - // save info. 
- bool had = pg->peer_info.count(from); - pg->peer_info[from] = *it; - - if (had) { - if (pg->is_active() && - (*it).is_uptodate() && acting) { - pg->uptodate_set.insert(from); - dout(10) << *pg << " osd" << from << " now uptodate (" << pg->uptodate_set - << "): " << *it << dendl; - if (pg->is_all_uptodate()) - pg->finish_recovery(); - } else { - // hmm, maybe keep an eye out for cases where we see this, but peer should happen. - dout(10) << *pg << " already had notify info from osd" << from << ": " << *it << dendl; - } - } else { - // adjust prior? - if (it->last_epoch_started > pg->last_epoch_started_any) - pg->adjust_prior(); - - // peer - pg->peer(t, query_map, &activator_map); - } - - pg->unlock(); - } - - unsigned tr = store->apply_transaction(t); - assert(tr == 0); - - do_queries(query_map); - do_activators(activator_map); - - delete m; -} - - - -/** PGLog - * from non-primary to primary - * includes log and info - * from primary to non-primary - * includes log for use in recovery - * NOTE: called with opqueue active. 
- */ - - -void OSD::_process_pg_info(epoch_t epoch, int from, - PG::Info &info, - PG::Log &log, - PG::Missing &missing, - map* activator_map) -{ - if (pg_map.count(info.pgid) == 0) { - dout(10) << "_process_pg_info " << info << " don't have pg" << dendl; - assert(epoch < osdmap->get_epoch()); - return; - } - - PG *pg = _lookup_lock_pg(info.pgid); - assert(pg); - - dout(10) << *pg << " got " << info << " " << log << " " << missing << dendl; - - if (epoch < pg->info.history.same_since) { - dout(10) << *pg << " got old info " << info << ", ignoring" << dendl; - pg->unlock(); - return; - } - - //m->log.print(cout); - - ObjectStore::Transaction t; - - if (pg->is_primary()) { - // i am PRIMARY - assert(pg->peer_log_requested.count(from) || - pg->peer_summary_requested.count(from)); - - pg->proc_replica_log(log, missing, from); - - // peer - map< int, map > query_map; - pg->peer(t, query_map, activator_map); - do_queries(query_map); - - } else { - // i am REPLICA - // merge log - pg->merge_log(log, missing, from); - pg->proc_missing(log, missing, from); - assert(pg->missing.num_lost() == 0); - - // ok activate! 
- pg->activate(t, activator_map); - } - - unsigned tr = store->apply_transaction(t); - assert(tr == 0); - - pg->unlock(); -} - - -void OSD::handle_pg_log(MOSDPGLog *m) -{ - dout(7) << "handle_pg_log " << *m << " from " << m->get_source() << dendl; - - int from = m->get_source().num(); - if (!require_same_or_newer_map(m, m->get_epoch())) return; - - _process_pg_info(m->get_epoch(), from, - m->info, m->log, m->missing, 0); - - delete m; -} - -void OSD::handle_pg_activate_set(MOSDPGActivateSet *m) -{ - dout(7) << "handle_pg_activate_set " << *m << " from " << m->get_source() << dendl; - - int from = m->get_source().num(); - if (!require_same_or_newer_map(m, m->get_epoch())) return; - - PG::Log empty_log; - PG::Missing empty_missing; - map activator_map; - - for (list::iterator p = m->pg_info.begin(); - p != m->pg_info.end(); - ++p) - _process_pg_info(m->get_epoch(), from, *p, empty_log, empty_missing, &activator_map); - - do_activators(activator_map); - - delete m; -} - - -/** PGQuery - * from primary to replica | stray - * NOTE: called with opqueue active. - */ -void OSD::handle_pg_query(MOSDPGQuery *m) -{ - dout(7) << "handle_pg_query from " << m->get_source() << " epoch " << m->get_epoch() << dendl; - int from = m->get_source().num(); - - if (!require_same_or_newer_map(m, m->get_epoch())) return; - - map< int, list > notify_list; - - for (map::iterator it = m->pg_list.begin(); - it != m->pg_list.end(); - it++) { - pg_t pgid = it->first; - PG *pg = 0; - - if (pg_map.count(pgid) == 0) { - // get active crush mapping - vector acting; - int nrep = osdmap->pg_to_acting_osds(pgid, acting); - int role = osdmap->calc_pg_role(whoami, acting, nrep); - - // same primary? 
- PG::Info::History history = it->second.history; - project_pg_history(pgid, history, m->get_epoch(), acting); - - if (m->get_epoch() < history.same_since) { - dout(10) << " pg " << pgid << " dne, and pg has changed in " - << history.same_primary_since << " (msg from " << m->get_epoch() << ")" << dendl; - continue; - } - - if (role < 0) { - dout(10) << " pg " << pgid << " dne, and i am not an active replica" << dendl; - PG::Info empty(pgid); - notify_list[from].push_back(empty); - continue; - } - assert(role > 0); - - ObjectStore::Transaction t; - pg = _create_lock_pg(pgid, t); - pg->acting.swap( acting ); - pg->set_role(role); - pg->info.history = history; - pg->write_log(t); - store->apply_transaction(t); - - dout(10) << *pg << " dne (before), but i am role " << role << dendl; - } else { - pg = _lookup_lock_pg(pgid); - - // same primary? - if (m->get_epoch() < pg->info.history.same_since) { - dout(10) << *pg << " handle_pg_query primary changed in " - << pg->info.history.same_since - << " (msg from " << m->get_epoch() << ")" << dendl; - pg->unlock(); - continue; - } - } - - // ok, process query! 
- assert(!pg->acting.empty()); - assert(from == pg->acting[0]); - - if (it->second.type == PG::Query::INFO) { - // info - dout(10) << *pg << " sending info" << dendl; - notify_list[from].push_back(pg->info); - } else { - MOSDPGLog *m = new MOSDPGLog(osdmap->get_epoch(), pg->info); - m->missing = pg->missing; - - if (it->second.type == PG::Query::LOG) { - dout(10) << *pg << " sending info+missing+log since split " << it->second.split - << " from floor " << it->second.floor - << dendl; - if (!m->log.copy_after_unless_divergent(pg->log, it->second.split, it->second.floor)) { - dout(10) << *pg << " divergent, sending backlog" << dendl; - it->second.type = PG::Query::BACKLOG; - } - } - - if (it->second.type == PG::Query::BACKLOG) { - dout(10) << *pg << " sending info+missing+backlog" << dendl; - if (pg->log.backlog) { - m->log = pg->log; - } else { - pg->generate_backlog(); - m->log = pg->log; - pg->drop_backlog(); - } - } - else if (it->second.type == PG::Query::FULLLOG) { - dout(10) << *pg << " sending info+missing+full log" << dendl; - m->log.copy_non_backlog(pg->log); - } - - dout(10) << *pg << " sending " << m->log << " " << m->missing << dendl; - //m->log.print(cout); - - _share_map_outgoing(osdmap->get_inst(from)); - messenger->send_message(m, osdmap->get_inst(from)); - } - - pg->unlock(); - } - - do_notifies(notify_list); - - delete m; -} - - -void OSD::handle_pg_remove(MOSDPGRemove *m) -{ - dout(7) << "handle_pg_remove from " << m->get_source() << dendl; - - if (!require_same_or_newer_map(m, m->get_epoch())) return; - - for (set::iterator it = m->pg_list.begin(); - it != m->pg_list.end(); - it++) { - pg_t pgid = *it; - PG *pg; - - if (pg_map.count(pgid) == 0) { - dout(10) << " don't have pg " << pgid << dendl; - continue; - } - - pg = _lookup_lock_pg(pgid); - - dout(10) << *pg << " removing." 
<< dendl; - assert(pg->get_role() == -1); - - _remove_unlock_pg(pg); - } - - delete m; -} - - - - - - -// ========================================================= -// OPS - -void OSD::handle_op(MOSDOp *op) -{ - // throttle? FIXME PROBABLY! - while (pending_ops > g_conf.osd_max_opq) { - dout(10) << "enqueue_op waiting for pending_ops " << pending_ops << " to drop to " << g_conf.osd_max_opq << dendl; - op_queue_cond.Wait(osd_lock); - } - - // get and lock *pg. - const pg_t pgid = op->get_pg(); - PG *pg = _have_pg(pgid) ? _lookup_lock_pg(pgid):0; - - logger->set("buf", buffer_total_alloc); - - utime_t now = g_clock.now(); - - // update qlen stats - stat_oprate.hit(now); - stat_ops++; - stat_qlen += pending_ops; - if (op->get_op() == OSD_OP_READ) { - stat_rd_ops++; - if (op->get_source().is_osd()) { - //derr(-10) << "shed in " << stat_rd_ops_shed_in << " / " << stat_rd_ops << dendl; - stat_rd_ops_shed_in++; - } - } - - // require same or newer map - if (!require_same_or_newer_map(op, op->get_map_epoch())) { - if (pg) pg->unlock(); - return; - } - - // share our map with sender, if they're old - _share_map_incoming(op->get_source_inst(), op->get_map_epoch()); - - - if (!op->get_source().is_osd()) { - // REGULAR OP (non-replication) - - // note original source - op->set_client_inst( op->get_source_inst() ); - op->clear_payload(); // and hose encoded payload (in case we forward) - - // have pg? - if (!pg) { - dout(7) << "hit non-existent pg " - << pgid - << ", waiting" << dendl; - waiting_for_pg[pgid].push_back(op); - return; - } - - // pg must be same-ish... - if (op->is_read()) { - // read - if (!pg->same_for_read_since(op->get_map_epoch())) { - dout(7) << "handle_rep_op pg changed " << pg->info.history - << " after " << op->get_map_epoch() - << ", dropping" << dendl; - assert(op->get_map_epoch() < osdmap->get_epoch()); - pg->unlock(); - delete op; - return; - } - - /* - if (read && op->get_oid().rev > 0) { - // versioned read. hrm. 
- // are we missing a revision that we might need? - object_t moid = op->get_oid(); - if (pick_missing_object_rev(moid, pg)) { - // is there a local revision we might use instead? - object_t loid = op->get_oid(); - if (store->pick_object_revision_lt(loid) && - moid <= loid) { - // we need moid. pull it. - dout(10) << "handle_op read on " << op->get_oid() - << ", have " << loid - << ", but need missing " << moid - << ", pulling" << dendl; - pull(pg, moid); - pg->waiting_for_missing_object[moid].push_back(op); - return; - } - - dout(10) << "handle_op read on " << op->get_oid() - << ", have " << loid - << ", don't need missing " << moid - << dendl; - } - } else { - // live revision. easy. - if (op->get_op() != OSD_OP_PUSH && - waitfor_missing_object(op, pg)) return; - } - */ - - } else { - // modify - if ((pg->get_primary() != whoami || - !pg->same_for_modify_since(op->get_map_epoch()))) { - dout(7) << "handle_rep_op pg changed " << pg->info.history - << " after " << op->get_map_epoch() - << ", dropping" << dendl; - assert(op->get_map_epoch() < osdmap->get_epoch()); - pg->unlock(); - delete op; - return; - } - } - - // pg must be active. - if (!pg->is_active()) { - // replay? - if (op->get_version().version > 0) { - if (op->get_version() > pg->info.last_update) { - dout(7) << *pg << " queueing replay at " << op->get_version() - << " for " << *op << dendl; - pg->replay_queue[op->get_version()] = op; - pg->unlock(); - return; - } else { - dout(7) << *pg << " replay at " << op->get_version() << " <= " << pg->info.last_update - << " for " << *op - << ", will queue for WRNOOP" << dendl; - } - } - - dout(7) << *pg << " not active (yet)" << dendl; - pg->waiting_for_active.push_back(op); - pg->unlock(); - return; - } - - // missing object? 
- if (pg->is_missing_object(op->get_oid())) { - pg->wait_for_missing_object(op->get_oid(), op); - pg->unlock(); - return; - } - - dout(10) << "handle_op " << *op << " in " << *pg << dendl; - - } else { - // REPLICATION OP (it's from another OSD) - - // have pg? - if (!pg) { - derr(-7) << "handle_rep_op " << *op - << " pgid " << pgid << " dne" << dendl; - delete op; - //assert(0); // wtf, shouldn't happen. - return; - } - - // check osd map: same set, or primary+acker? - if (!pg->same_for_rep_modify_since(op->get_map_epoch())) { - dout(10) << "handle_rep_op pg changed " << pg->info.history - << " after " << op->get_map_epoch() - << ", dropping" << dendl; - pg->unlock(); - delete op; - return; - } - - assert(pg->get_role() >= 0); - dout(7) << "handle_rep_op " << op << " in " << *pg << dendl; - } - - // proprocess op? - if (pg->preprocess_op(op, now)) { - pg->unlock(); - return; - } - - if (op->get_op() == OSD_OP_READ) { - Mutex::Locker lock(peer_stat_lock); - stat_rd_ops_in_queue++; - } - - if (g_conf.osd_maxthreads < 1) { - // do it now. - if (op->get_type() == MSG_OSD_OP) - pg->do_op((MOSDOp*)op); - else if (op->get_type() == MSG_OSD_OPREPLY) - pg->do_op_reply((MOSDOpReply*)op); - else - assert(0); - } else { - // queue for worker threads - enqueue_op(pg, op); - } - - pg->unlock(); -} - - -void OSD::handle_op_reply(MOSDOpReply *op) -{ - if (op->get_map_epoch() < boot_epoch) { - dout(3) << "replica op reply from before boot" << dendl; - delete op; - return; - } - - // must be a rep op. - assert(op->get_source().is_osd()); - - // make sure we have the pg - const pg_t pgid = op->get_pg(); - - // require same or newer map - if (!require_same_or_newer_map(op, op->get_map_epoch())) return; - - // share our map with sender, if they're old - _share_map_incoming(op->get_source_inst(), op->get_map_epoch()); - - if (!_have_pg(pgid)) { - // hmm. 
- delete op; - return; - } - - PG *pg = _lookup_lock_pg(pgid); - if (g_conf.osd_maxthreads < 1) { - pg->do_op_reply(op); // do it now - } else { - enqueue_op(pg, op); // queue for worker threads - } - pg->unlock(); -} - - -/* - * enqueue called with osd_lock held - */ -void OSD::enqueue_op(PG *pg, Message *op) -{ - // add to pg's op_queue - pg->op_queue.push_back(op); - pending_ops++; - logger->set("opq", pending_ops); - - // add pg to threadpool queue - pg->get(); // we're exposing the pointer, here. - threadpool->put_op(pg); -} - -/* - * NOTE: dequeue called in worker thread, without osd_lock - */ -void OSD::dequeue_op(PG *pg) -{ - Message *op = 0; - - osd_lock.Lock(); - { - // lock pg and get pending op - pg->lock(); - - assert(!pg->op_queue.empty()); - op = pg->op_queue.front(); - pg->op_queue.pop_front(); - - dout(10) << "dequeue_op " << *op << " pg " << *pg - << ", " << (pending_ops-1) << " more pending" - << dendl; - - // share map? - // do this preemptively while we hold osd_lock and pg->lock - // to avoid lock ordering issues later. 
- for (unsigned i=1; iacting.size(); i++) - _share_map_outgoing( osdmap->get_inst(pg->acting[i]) ); - } - osd_lock.Unlock(); - - // do it - if (op->get_type() == MSG_OSD_OP) - pg->do_op((MOSDOp*)op); // do it now - else if (op->get_type() == MSG_OSD_OPREPLY) - pg->do_op_reply((MOSDOpReply*)op); - else - assert(0); - - // unlock and put pg - pg->put_unlock(); - - // finish - osd_lock.Lock(); - { - dout(10) << "dequeue_op " << op << " finish" << dendl; - assert(pending_ops > 0); - - if (pending_ops > g_conf.osd_max_opq) - op_queue_cond.Signal(); - - pending_ops--; - logger->set("opq", pending_ops); - if (pending_ops == 0 && waiting_for_no_ops) - no_pending_ops.Signal(); - } - osd_lock.Unlock(); -} - - - - -void OSD::wait_for_no_ops() -{ - if (pending_ops > 0) { - dout(7) << "wait_for_no_ops - waiting for " << pending_ops << dendl; - waiting_for_no_ops = true; - while (pending_ops > 0) - no_pending_ops.Wait(osd_lock); - waiting_for_no_ops = false; - assert(pending_ops == 0); - } - dout(7) << "wait_for_no_ops - none" << dendl; -} - - - - diff --git a/branches/sage/ebofs2/osd/OSD.h b/branches/sage/ebofs2/osd/OSD.h deleted file mode 100644 index be6348eceb126..0000000000000 --- a/branches/sage/ebofs2/osd/OSD.h +++ /dev/null @@ -1,366 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __OSD_H -#define __OSD_H - -#include "msg/Dispatcher.h" - -#include "common/Mutex.h" -#include "common/ThreadPool.h" -#include "common/Timer.h" - -#include "mon/MonMap.h" - -#include "ObjectStore.h" -#include "PG.h" - -#include "common/DecayCounter.h" - - -#include -using namespace std; - -#include -#include -using namespace __gnu_cxx; - - -class Messenger; -class Message; -class Logger; -class ObjectStore; -class OSDMap; - -class OSD : public Dispatcher { -public: - // -- states -- - static const int STATE_BOOTING = 1; - static const int STATE_ACTIVE = 2; - static const int STATE_STOPPING = 3; - - - - /** OSD **/ -protected: - Mutex osd_lock; // global lock - SafeTimer timer; // safe timer - - Messenger *messenger; - Logger *logger; - ObjectStore *store; - MonMap *monmap; - - int whoami; - char dev_path[100]; - -public: - int get_nodeid() { return whoami; } - - static object_t get_osdmap_object_name(epoch_t epoch) { return object_t(0,epoch << 1); } - static object_t get_inc_osdmap_object_name(epoch_t epoch) { return object_t(0, (epoch << 1) + 1); } - - -private: - /** superblock **/ - OSDSuperblock superblock; - epoch_t boot_epoch; - - void write_superblock(); - void write_superblock(ObjectStore::Transaction& t); - int read_superblock(); - - - // -- state -- - int state; - -public: - bool is_booting() { return state == STATE_BOOTING; } - bool is_active() { return state == STATE_ACTIVE; } - bool is_stopping() { return state == STATE_STOPPING; } - -private: - - // heartbeat - void heartbeat(); - - class C_Heartbeat : public Context { - OSD *osd; - public: - C_Heartbeat(OSD *o) : osd(o) {} - void finish(int r) { - osd->heartbeat(); - } - }; - - - // -- stats -- - DecayCounter stat_oprate; - int stat_ops; // ops since last heartbeat - int stat_rd_ops; - int stat_rd_ops_shed_in; - int stat_rd_ops_shed_out; - int stat_qlen; // cumulative queue length since last refresh - int stat_rd_ops_in_queue; // in queue - - Mutex peer_stat_lock; - 
osd_peer_stat_t my_stat; - hash_map > peer_stat; - hash_map > my_stat_on_peer; // what the peer thinks of me - - void _refresh_my_stat(utime_t now); - osd_peer_stat_t get_my_stat_for(utime_t now, int peer); - void take_peer_stat(int peer, const osd_peer_stat_t& stat); - - // load calculation - //current implementation is moving averges. - class MovingAverager { - private: - Mutex lock; - deque m_Data; - unsigned m_Size; - double m_Total; - - public: - MovingAverager(unsigned size) : m_Size(size), m_Total(0) { } - - void set_size(unsigned size) { - m_Size = size; - } - - void add(double value) { - Mutex::Locker locker(lock); - - // add item - m_Data.push_back(value); - m_Total += value; - - // trim - while (m_Data.size() > m_Size) { - m_Total -= m_Data.front(); - m_Data.pop_front(); - } - } - - double get_average() { - Mutex::Locker locker(lock); - if (m_Data.empty()) return -1; - return m_Total / (double)m_Data.size(); - } - } read_latency_calc, qlen_calc; - - class IATAverager { - public: - struct iat_data { - double last_req_stamp; - double average_iat; - iat_data() : last_req_stamp(0), average_iat(0) {} - }; - private: - mutable Mutex lock; - double alpha; - hash_map iat_map; - - public: - IATAverager(double a) : alpha(a) {} - - void add_sample(object_t oid, double now) { - Mutex::Locker locker(lock); - iat_data &r = iat_map[oid]; - double iat = now - r.last_req_stamp; - r.last_req_stamp = now; - r.average_iat = r.average_iat*(1.0-alpha) + iat*alpha; - } - - bool have(object_t oid) const { - Mutex::Locker locker(lock); - return iat_map.count(oid); - } - - double get_average_iat(object_t oid) const { - Mutex::Locker locker(lock); - hash_map::const_iterator p = iat_map.find(oid); - assert(p != iat_map.end()); - return p->second.average_iat; - } - - bool is_flash_crowd_candidate(object_t oid) const { - Mutex::Locker locker(lock); - return get_average_iat(oid) <= g_conf.osd_flash_crowd_iat_threshold; - } - }; - - IATAverager iat_averager; - - - // -- waiters -- - 
list finished; - Mutex finished_lock; - - void take_waiters(list& ls) { - finished_lock.Lock(); - finished.splice(finished.end(), ls); - finished_lock.Unlock(); - } - - // -- op queue -- - class ThreadPool *threadpool; - - int pending_ops; - bool waiting_for_no_ops; - Cond no_pending_ops; - Cond op_queue_cond; - - void wait_for_no_ops(); - - void enqueue_op(PG *pg, Message *op); - void dequeue_op(PG *pg); - static void static_dequeueop(OSD *o, PG *pg) { - o->dequeue_op(pg); - }; - - - friend class PG; - friend class ReplicatedPG; - friend class RAID4PG; - - - protected: - - // -- osd map -- - OSDMap *osdmap; - list waiting_for_osdmap; - - hash_map peer_map_epoch; // FIXME types - bool _share_map_incoming(const entity_inst_t& inst, epoch_t epoch); - void _share_map_outgoing(const entity_inst_t& inst); - - void wait_for_new_map(Message *m); - void handle_osd_map(class MOSDMap *m); - - void advance_map(ObjectStore::Transaction& t); - void activate_map(ObjectStore::Transaction& t); - - void get_map(epoch_t e, OSDMap &m); - bool get_map_bl(epoch_t e, bufferlist& bl); - bool get_inc_map_bl(epoch_t e, bufferlist& bl); - bool get_inc_map(epoch_t e, OSDMap::Incremental &inc); - - void send_incremental_map(epoch_t since, const entity_inst_t& inst, bool full); - - - - // -- placement groups -- - hash_map pg_map; - hash_map > waiting_for_pg; - - bool _have_pg(pg_t pgid); - PG *_lookup_lock_pg(pg_t pgid); - PG *_new_lock_pg(pg_t pg); // create new PG (in memory) - PG *_create_lock_pg(pg_t pg, ObjectStore::Transaction& t); // create new PG - void _remove_unlock_pg(PG *pg); // remove from store and memory - - void try_create_pg(pg_t pgid, ObjectStore::Transaction& t); - - void load_pgs(); - void project_pg_history(pg_t pgid, PG::Info::History& h, epoch_t from, - vector& last); - void activate_pg(pg_t pgid, epoch_t epoch); - - class C_Activate : public Context { - OSD *osd; - pg_t pgid; - epoch_t epoch; - public: - C_Activate(OSD *o, pg_t p, epoch_t e) : osd(o), pgid(p), epoch(e) 
{} - void finish(int r) { - osd->activate_pg(pgid, epoch); - } - }; - - - // -- pg stats -- - Mutex pg_stat_queue_lock; - set pg_stat_queue; - - class C_Stats : public Context { - OSD *osd; - public: - C_Stats(OSD *o) : osd(o) {} - void finish(int r) { - osd->send_pg_stats(); - } - }; - void send_pg_stats(); - - - // -- tids -- - // for ops i issue - tid_t last_tid; - - Mutex tid_lock; - tid_t get_tid() { - tid_t t; - tid_lock.Lock(); - t = ++last_tid; - tid_lock.Unlock(); - return t; - } - - - // -- generic pg recovery -- - int num_pulling; - - void do_notifies(map< int, list >& notify_list); - void do_queries(map< int, map >& query_map); - void do_activators(map& activator_map); - void repeer(PG *pg, map< int, map >& query_map); - - bool require_current_map(Message *m, epoch_t v); - bool require_same_or_newer_map(Message *m, epoch_t e); - - void handle_pg_query(class MOSDPGQuery *m); - void handle_pg_notify(class MOSDPGNotify *m); - void handle_pg_log(class MOSDPGLog *m); - void handle_pg_activate_set(class MOSDPGActivateSet *m); - void handle_pg_remove(class MOSDPGRemove *m); - - // helper for handle_pg_log and handle_pg_activate_set - void _process_pg_info(epoch_t epoch, int from, - PG::Info &info, - PG::Log &log, - PG::Missing &missing, - map* activator_map); - - - public: - OSD(int id, Messenger *m, MonMap *mm, char *dev = 0); - ~OSD(); - - // startup/shutdown - int init(); - int shutdown(); - - // messages - virtual void dispatch(Message *m); - virtual void ms_handle_failure(Message *m, const entity_inst_t& inst); - - void handle_osd_ping(class MOSDPing *m); - void handle_op(class MOSDOp *m); - void handle_op_reply(class MOSDOpReply *m); - - void force_remount(); -}; - -#endif diff --git a/branches/sage/ebofs2/osd/OSDMap.h b/branches/sage/ebofs2/osd/OSDMap.h deleted file mode 100644 index 2b476e0456168..0000000000000 --- a/branches/sage/ebofs2/osd/OSDMap.h +++ /dev/null @@ -1,539 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
-// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __OSDMAP_H -#define __OSDMAP_H - -/* - * describe properties of the OSD cluster. - * disks, disk groups, total # osds, - * - */ -#include "config.h" -#include "include/types.h" -#include "osd_types.h" -#include "msg/Message.h" -#include "common/Mutex.h" -#include "common/Clock.h" - -#include "crush/CrushWrapper.h" - -#include -#include -#include -#include -using namespace std; - - -/* - * some system constants - */ - -// from LSB to MSB, -#define PG_PS_BITS 16 // max bits for placement seed/group portion of PG -#define PG_REP_BITS 6 // up to 64 replicas -#define PG_TYPE_BITS 2 -#define PG_PS_MASK ((1LL<>1)); -} - -inline int calc_bits_of(int t) { - int b = 0; - while (t) { - t = t >> 1; - b++; - } - return b; -} - - - -/** OSDMap - */ -class OSDMap { - -public: - class Incremental { - public: - epoch_t epoch; // new epoch; we are a diff from epoch-1 to epoch - epoch_t mon_epoch; // monitor epoch (election iteration) - utime_t ctime; - - // full (rare) - bufferlist fullmap; // in leiu of below. 
- bufferlist crush; - - // incremental - map new_up; - map > new_down; - list new_in; - list new_out; - map new_overload; // updated overload value - list old_overload; // no longer overload - - void encode(bufferlist& bl) { - ::_encode(epoch, bl); - ::_encode(mon_epoch, bl); - ::_encode(ctime, bl); - ::_encode(new_up, bl); - ::_encode(new_down, bl); - ::_encode(new_in, bl); - ::_encode(new_out, bl); - ::_encode(new_overload, bl); - ::_encode(fullmap, bl); - ::_encode(crush, bl); - } - void decode(bufferlist& bl, int& off) { - ::_decode(epoch, bl, off); - ::_decode(mon_epoch, bl, off); - ::_decode(ctime, bl, off); - ::_decode(new_up, bl, off); - ::_decode(new_down, bl, off); - ::_decode(new_in, bl, off); - ::_decode(new_out, bl, off); - ::_decode(new_overload, bl, off); - ::_decode(fullmap, bl, off); - ::_decode(crush, bl, off); - } - - Incremental(epoch_t e=0) : epoch(e), mon_epoch(0) {} - }; - -private: - epoch_t epoch; // what epoch of the osd cluster descriptor is this - epoch_t mon_epoch; // monitor epoch (election iteration) - utime_t ctime; // epoch start time - int32_t pg_num; // placement group count - int32_t pg_num_mask; // bitmask for above - int32_t localized_pg_num; // localized place group count - int32_t localized_pg_num_mask; // ditto - - set osds; // all osds - map down_osds; // list of down disks, -> clean shutdown (true/false) - set out_osds; // list of unmapped disks - map overload_osds; - map osd_inst; - - public: - CrushWrapper crush; // hierarchical map - - friend class OSDMonitor; - friend class MDS; - - public: - OSDMap() : epoch(0), mon_epoch(0), - pg_num(1<<5), - localized_pg_num(1<<3) { - calc_pg_masks(); - } - - // map info - epoch_t get_epoch() const { return epoch; } - void inc_epoch() { epoch++; } - - void calc_pg_masks() { - pg_num_mask = (1 << calc_bits_of(pg_num-1)) - 1; - localized_pg_num_mask = (1 << calc_bits_of(localized_pg_num-1)) - 1; - } - - int get_pg_num() const { return pg_num; } - void set_pg_num(int m) { pg_num = m; 
calc_pg_masks(); } - int get_localized_pg_num() const { return localized_pg_num; } - - const utime_t& get_ctime() const { return ctime; } - - bool is_mkfs() const { return epoch == 2; } - bool post_mkfs() const { return epoch > 2; } - - /***** cluster state *****/ - int num_osds() { return osds.size(); } - void get_all_osds(set& ls) { ls = osds; } - - const set& get_osds() { return osds; } - const map& get_down_osds() { return down_osds; } - const set& get_out_osds() { return out_osds; } - const map& get_overload_osds() { return overload_osds; } - - bool exists(int osd) { return osds.count(osd); } - bool is_down(int osd) { return down_osds.count(osd); } - bool is_down_clean(int osd) { return down_osds.count(osd) && down_osds[osd]; } - bool is_up(int osd) { return exists(osd) && !is_down(osd); } - bool is_out(int osd) { return out_osds.count(osd); } - bool is_in(int osd) { return exists(osd) && !is_out(osd); } - - bool have_inst(int osd) { - return osd_inst.count(osd); - } - const entity_inst_t& get_inst(int osd) { - assert(osd_inst.count(osd)); - return osd_inst[osd]; - } - bool get_inst(int osd, entity_inst_t& inst) { - if (osd_inst.count(osd)) { - inst = osd_inst[osd]; - return true; - } - return false; - } - - void mark_down(int o, bool clean) { down_osds[o] = clean; } - void mark_up(int o) { down_osds.erase(o); } - void mark_out(int o) { - out_osds.insert(o); - crush.update_offload_map(out_osds, overload_osds); - } - void mark_in(int o) { - out_osds.erase(o); - crush.update_offload_map(out_osds, overload_osds); - } - - void apply_incremental(Incremental &inc) { - assert(inc.epoch == epoch+1); - epoch++; - mon_epoch = inc.mon_epoch; - ctime = inc.ctime; - - // full map? - if (inc.fullmap.length()) { - decode(inc.fullmap); - return; - } - if (inc.crush.length()) { - bufferlist::iterator blp = inc.crush.begin(); - crush._decode(blp); - } - - // nope, incremental. 
- for (map >::iterator i = inc.new_down.begin(); - i != inc.new_down.end(); - i++) { - assert(down_osds.count(i->first) == 0); - down_osds[i->first] = i->second.second; - //assert(osd_inst.count(i->first) == 0 || osd_inst[i->first] == i->second.first); - osd_inst.erase(i->first); - //cout << "epoch " << epoch << " down osd" << i->first << endl; - } - for (list::iterator i = inc.new_out.begin(); - i != inc.new_out.end(); - i++) { - assert(out_osds.count(*i) == 0); - out_osds.insert(*i); - //cout << "epoch " << epoch << " out osd" << *i << endl; - } - for (list::iterator i = inc.old_overload.begin(); - i != inc.old_overload.end(); - i++) { - assert(overload_osds.count(*i)); - overload_osds.erase(*i); - } - - for (map::iterator i = inc.new_up.begin(); - i != inc.new_up.end(); - i++) { - assert(down_osds.count(i->first)); - down_osds.erase(i->first); - assert(osd_inst.count(i->first) == 0); - osd_inst[i->first] = i->second; - //cout << "epoch " << epoch << " up osd" << i->first << endl; - } - for (list::iterator i = inc.new_in.begin(); - i != inc.new_in.end(); - i++) { - assert(out_osds.count(*i)); - out_osds.erase(*i); - //cout << "epoch " << epoch << " in osd" << *i << endl; - } - for (map::iterator i = inc.new_overload.begin(); - i != inc.new_overload.end(); - i++) { - overload_osds[i->first] = i->second; - } - - crush.update_offload_map(out_osds, overload_osds); - } - - // serialize, unserialize - void encode(bufferlist& blist) { - ::_encode(epoch, blist); - ::_encode(mon_epoch, blist); - ::_encode(ctime, blist); - ::_encode(pg_num, blist); - ::_encode(localized_pg_num, blist); - - ::_encode(osds, blist); - ::_encode(down_osds, blist); - ::_encode(out_osds, blist); - ::_encode(overload_osds, blist); - ::_encode(osd_inst, blist); - - bufferlist cbl; - crush._encode(cbl); - ::_encode(cbl, blist); - } - - void decode(bufferlist& blist) { - int off = 0; - ::_decode(epoch, blist, off); - ::_decode(mon_epoch, blist, off); - ::_decode(ctime, blist, off); - 
::_decode(pg_num, blist, off); - ::_decode(localized_pg_num, blist, off); - calc_pg_masks(); - - ::_decode(osds, blist, off); - ::_decode(down_osds, blist, off); - ::_decode(out_osds, blist, off); - ::_decode(overload_osds, blist, off); - ::_decode(osd_inst, blist, off); - - bufferlist cbl; - ::_decode(cbl, blist, off); - bufferlist::iterator cblp = cbl.begin(); - crush._decode(cblp); - - crush.update_offload_map(out_osds, overload_osds); - } - - - - - /**** mapping facilities ****/ - - // oid -> pg - ObjectLayout file_to_object_layout(object_t oid, FileLayout& layout) { - return make_object_layout(oid, layout.fl_pg_type, layout.fl_pg_size, layout.fl_pg_preferred, layout.fl_object_stripe_unit); - } - - ObjectLayout make_object_layout(object_t oid, int pg_type, int pg_size, int preferred=-1, int object_stripe_unit = 0) { - int num = preferred >= 0 ? localized_pg_num:pg_num; - int num_mask = preferred >= 0 ? localized_pg_num_mask:pg_num_mask; - - // calculate ps (placement seed) - ps_t ps; - switch (g_conf.osd_object_layout) { - case CEPH_OBJECT_LAYOUT_LINEAR: - ps = stable_mod(oid.bno + oid.ino, num, num_mask); - break; - - case CEPH_OBJECT_LAYOUT_HASHINO: - //ps = stable_mod(oid.bno + H(oid.bno+oid.ino)^H(oid.ino>>32), num, num_mask); - ps = stable_mod(oid.bno + crush_hash32_2(oid.ino, oid.ino>>32), num, num_mask); - break; - - case CEPH_OBJECT_LAYOUT_HASH: - //ps = stable_mod(H( (oid.bno & oid.ino) ^ ((oid.bno^oid.ino) >> 32) ), num, num_mask); - //ps = stable_mod(H(oid.bno) + H(oid.ino)^H(oid.ino>>32), num, num_mask); - //ps = stable_mod(oid.bno + H(oid.bno+oid.ino)^H(oid.bno+oid.ino>>32), num, num_mask); - ps = stable_mod(oid.bno + crush_hash32_2(oid.ino, oid.ino>>32), num, num_mask); - break; - - default: - assert(0); - } - - //cout << "preferred " << preferred << " num " << num << " mask " << num_mask << " ps " << ps << endl; - - // construct object layout - return ObjectLayout(pg_t(pg_type, pg_size, ps, preferred), - object_stripe_unit); - } - - - // pg -> 
(osd list) - int pg_to_osds(pg_t pg, - vector& osds) { // list of osd addr's - // map to osds[] - switch (g_conf.osd_pg_layout) { - case CEPH_PG_LAYOUT_CRUSH: - { - // what crush rule? - int rule; - if (pg.is_rep()) rule = CRUSH_REP_RULE(pg.size()); - else if (pg.is_raid4()) rule = CRUSH_RAID_RULE(pg.size()); - else assert(0); - - // forcefeed? - int forcefeed = -1; - if (pg.preferred() >= 0 && - out_osds.count(pg.preferred()) == 0) - forcefeed = pg.preferred(); - crush.do_rule(rule, - pg.ps(), - osds, pg.size(), - forcefeed); - } - break; - - case CEPH_PG_LAYOUT_LINEAR: - for (int i=0; i= 0 && - g_conf.osd_pg_layout != CEPH_PG_LAYOUT_CRUSH) { - int osd = pg.preferred(); - - // already in there? - if (osds.empty()) { - osds.push_back(osd); - } else { - assert(pg.size() > 0); - for (int i=1; i (up osd list) - int pg_to_acting_osds(pg_t pg, - vector& osds) { // list of osd addr's - // get rush list - vector raw; - pg_to_osds(pg, raw); - - osds.clear(); - for (unsigned i=0; i primary osd - int get_pg_primary(pg_t pg) { - vector group; - int nrep = pg_to_osds(pg, group); - if (nrep) - return group[0]; - return -1; // we fail! - } - - // pg -> acting primary osd - int get_pg_acting_primary(pg_t pg) { - vector group; - int nrep = pg_to_acting_osds(pg, group); - if (nrep > 0) - return group[0]; - return -1; // we fail! - } - int get_pg_acting_tail(pg_t pg) { - vector group; - int nrep = pg_to_acting_osds(pg, group); - if (nrep > 0) - return group[group.size()-1]; - return -1; // we fail! - } - - - /* what replica # is a given osd? 0 primary, -1 for none. 
*/ - int calc_pg_rank(int osd, vector& acting, int nrep=0) { - if (!nrep) nrep = acting.size(); - for (int i=0; i& acting, int nrep=0) { - if (!nrep) nrep = acting.size(); - int rank = calc_pg_rank(osd, acting, nrep); - - if (rank < 0) return PG_ROLE_STRAY; - else if (rank == 0) return PG_ROLE_HEAD; - else if (rank == 1) return PG_ROLE_ACKER; - else return PG_ROLE_MIDDLE; - } - - int get_pg_role(pg_t pg, int osd) { - vector group; - int nrep = pg_to_osds(pg, group); - return calc_pg_role(osd, group, nrep); - } - - /* rank is -1 (stray), 0 (primary), 1,2,3,... (replica) */ - int get_pg_acting_rank(pg_t pg, int osd) { - vector group; - int nrep = pg_to_acting_osds(pg, group); - return calc_pg_rank(osd, group, nrep); - } - /* role is -1 (stray), 0 (primary), 1 (replica) */ - int get_pg_acting_role(pg_t pg, int osd) { - vector group; - int nrep = pg_to_acting_osds(pg, group); - return calc_pg_role(osd, group, nrep); - } - - - - -}; - - -#endif diff --git a/branches/sage/ebofs2/osd/ObjectStore.cc b/branches/sage/ebofs2/osd/ObjectStore.cc deleted file mode 100644 index 7aeab1d063d4d..0000000000000 --- a/branches/sage/ebofs2/osd/ObjectStore.cc +++ /dev/null @@ -1,152 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "ObjectStore.h" - -#include "config.h" -#include "common/Clock.h" - -#define dout(x) if (x < g_conf.debug) *_dout << dbeginl << g_clock.now() << " ager: " - -object_t ObjectStore::age_get_oid() { - if (!age_free_oids.empty()) { - object_t o = age_free_oids.front(); - age_free_oids.pop_front(); - return o; - } - return age_cur_oid++; - } - - ssize_t ObjectStore::age_pick_size() { - ssize_t max = file_size_distn.sample() * 1024; - return max/2 + (rand() % 100) * max/200 + 1; - } - - void ObjectStore::age_fill(float pc, utime_t until) { - bufferptr bp(1024*1024); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - while (1) { - if (g_clock.now() > until) break; - - struct statfs st; - 
statfs(&st); - float a = (float)(st.f_blocks-st.f_bavail) / (float)st.f_blocks; - if (a >= pc) { - dout(10) << "age_fill at " << a << " / " << pc << " stopping" << dendl; - break; - } - - object_t oid = age_get_oid(); - - int b = rand() % 10; - age_objects[b].push_back(oid); - - ssize_t s = age_pick_size(); - - dout(10) << "age_fill at " << a << " / " << pc << " creating " << hex << oid << dec << " sz " << s << dendl; - - off_t off = 0; - while (s) { - ssize_t t = MIN(s, 1024*1024); - write(oid, t, off, bl, false); - off += t; - s -= t; - } - oid++; - } - } - - void ObjectStore::age_empty(float pc) { - int nper = 20; - int n = nper; - while (1) { - struct statfs st; - statfs(&st); - float a = (float)(st.f_blocks-st.f_bavail) / (float)st.f_blocks; - if (a <= pc) { - dout(10) << "age_empty at " << a << " / " << pc << " stopping" << dendl; - break; - } - - int b = rand() % 10; - n--; - if (n == 0 || age_objects[b].empty()) { - dout(10) << "age_empty sync" << dendl; - //sync(); - sync(); - n = nper; - continue; - } - object_t oid = age_objects[b].front(); - age_objects[b].pop_front(); - - dout(10) << "age_empty at " << a << " / " << pc << " removing " << hex << oid << dec << dendl; - - remove(oid); - age_free_oids.push_back(oid); - } - } - - - void ObjectStore::age(int time, - float high_water, // fill to this % - float low_water, // then empty to this % - int count, // this many times - float final_water, // and end here ( <= low_water) - int fake_size_mb) { - utime_t until = g_clock.now(); - until.sec_ref() += time; - - while (age_objects.size() < 10) age_objects.push_back( list() ); - - if (fake_size_mb) { - int fake_bl = fake_size_mb * 256; - struct statfs st; - statfs(&st); - float f = (float)fake_bl / (float)st.f_blocks; - high_water = (float)high_water * f; - low_water = (float)low_water * f; - final_water = (float)final_water * f; - dout(10) << "fake " << fake_bl << " / " << st.f_blocks << " is " << f << ", high " << high_water << " low " << low_water << " 
final " << final_water << dendl; - } - - // init size distn (once) - if (!did_distn) { - did_distn = true; - age_cur_oid = 1; - file_size_distn.add(1, 19.0758125+0.65434375); - file_size_distn.add(512, 35.6566); - file_size_distn.add(1024, 27.7271875); - file_size_distn.add(2*1024, 16.63503125); - //file_size_distn.add(4*1024, 106.82384375); - //file_size_distn.add(8*1024, 81.493375); - //file_size_distn.add(16*1024, 14.13553125); - //file_size_distn.add(32*1024, 2.176); - //file_size_distn.add(256*1024, 0.655938); - //file_size_distn.add(512*1024, 0.1480625); - //file_size_distn.add(1*1024*1024, 0.020125); // actually 2, but 32bit - file_size_distn.normalize(); - } - - // clear - for (int i=0; i<10; i++) - age_objects[i].clear(); - - for (int c=1; c<=count; c++) { - if (g_clock.now() > until) break; - - dout(1) << "age " << c << "/" << count << " filling to " << high_water << dendl; - age_fill(high_water, until); - if (c == count) { - dout(1) << "age final empty to " << final_water << dendl; - age_empty(final_water); - } else { - dout(1) << "age " << c << "/" << count << " emptying to " << low_water << dendl; - age_empty(low_water); - } - } - dout(1) << "age finished" << dendl; - } - diff --git a/branches/sage/ebofs2/osd/ObjectStore.h b/branches/sage/ebofs2/osd/ObjectStore.h deleted file mode 100644 index c8df5d8218fed..0000000000000 --- a/branches/sage/ebofs2/osd/ObjectStore.h +++ /dev/null @@ -1,611 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __OBJECTSTORE_H -#define __OBJECTSTORE_H - -#include "include/types.h" -#include "osd_types.h" -#include "include/Context.h" -#include "include/buffer.h" - -#include "include/Distribution.h" - -#include - -#ifdef DARWIN -#include -#else -#include /* or */ -#endif /* DARWIN */ - -#include -using std::list; - -#ifndef MIN -# define MIN(a,b) ((a) < (b) ? (a):(b)) -#endif - -/* - * low-level interface to the local OSD file system - */ - - - -class ObjectStore { -public: - - - class FragmentationStat { - public: - int total; - int num_extent; - int avg_extent; - map extent_dist; // powers of two - map extent_dist_sum; // powers of two - - float avg_extent_per_object; - int avg_extent_jump; // avg distance bweteen consecutive extents - - int total_free; - int num_free_extent; - int avg_free_extent; - map free_extent_dist; // powers of two - map free_extent_dist_sum; // powers of two - }; - - - - /********************************* - * transaction - */ - class Transaction { - public: - static const int OP_READ = 1; // oid, offset, len, pbl - static const int OP_STAT = 2; // oid, pstat - static const int OP_GETATTR = 3; // oid, attrname, pattrval - static const int OP_GETATTRS = 4; // oid, pattrset - - static const int OP_WRITE = 10; // oid, offset, len, bl - static const int OP_TRUNCATE = 11; // oid, len - static const int OP_REMOVE = 13; // oid - static const int OP_SETATTR = 14; // oid, attrname, attrval - static const int OP_SETATTRS = 15; // oid, attrset - static const int OP_RMATTR = 16; // oid, attrname - static const int OP_CLONE = 17; // oid, newoid - - static const int OP_TRIMCACHE = 18; // oid, offset, len - - static const int OP_MKCOLL = 20; // cid - static const int OP_RMCOLL = 21; // cid - static const int OP_COLL_ADD = 22; // cid, oid - static const int OP_COLL_REMOVE = 23; // cid, oid - static const int OP_COLL_SETATTR = 24; // cid, attrname, attrval - static const int OP_COLL_RMATTR = 25; // cid, attrname - - private: - list ops; - 
list bls; - list oids; - list cids; - list lengths; - list attrnames; - list attrnames2; - - // for reads only (not encoded) - list pbls; - list psts; - list< pair > pattrvals; - list< map* > pattrsets; - - public: - bool have_op() { - return !ops.empty(); - } - int get_num_ops() { return ops.size(); } - int get_op() { - int op = ops.front(); - ops.pop_front(); - return op; - } - void get_bl(bufferlist& bl) { - bl.claim(bls.front()); - bls.pop_front(); - } - void get_oid(object_t& oid) { - oid = oids.front(); - oids.pop_front(); - } - void get_cid(coll_t& cid) { - cid = cids.front(); - cids.pop_front(); - } - void get_length(off_t& len) { - len = lengths.front(); - lengths.pop_front(); - } - void get_attrname(const char * &p) { - p = attrnames.front(); - attrnames.pop_front(); - } - void get_pbl(bufferlist* &pbl) { - pbl = pbls.front(); - pbls.pop_front(); - } - void get_pstat(struct stat* &pst) { - pst = psts.front(); - psts.pop_front(); - } - void get_pattrval(pair& p) { - p = pattrvals.front(); - pattrvals.pop_front(); - } - void get_pattrset(map* &ps) { - ps = pattrsets.front(); - pattrsets.pop_front(); - } - - - void read(object_t oid, off_t off, size_t len, bufferlist *pbl) { - int op = OP_READ; - ops.push_back(op); - oids.push_back(oid); - lengths.push_back(off); - lengths.push_back(len); - pbls.push_back(pbl); - } - void stat(object_t oid, struct stat *st) { - int op = OP_STAT; - ops.push_back(op); - oids.push_back(oid); - psts.push_back(st); - } - void getattr(object_t oid, const char* name, void* val, int *plen) { - int op = OP_GETATTR; - ops.push_back(op); - oids.push_back(oid); - attrnames.push_back(name); - pattrvals.push_back(pair(val,plen)); - } - void getattrs(object_t oid, map& aset) { - int op = OP_GETATTRS; - ops.push_back(op); - oids.push_back(oid); - pattrsets.push_back(&aset); - } - - void write(object_t oid, off_t off, size_t len, const bufferlist& bl) { - int op = OP_WRITE; - ops.push_back(op); - oids.push_back(oid); - 
lengths.push_back(off); - lengths.push_back(len); - bls.push_back(bl); - } - void trim_from_cache(object_t oid, off_t off, size_t len) { - int op = OP_TRIMCACHE; - ops.push_back(op); - oids.push_back(oid); - lengths.push_back(off); - lengths.push_back(len); - } - void truncate(object_t oid, off_t off) { - int op = OP_TRUNCATE; - ops.push_back(op); - oids.push_back(oid); - lengths.push_back(off); - } - void remove(object_t oid) { - int op = OP_REMOVE; - ops.push_back(op); - oids.push_back(oid); - } - void setattr(object_t oid, const char* name, const void* val, int len) { - int op = OP_SETATTR; - ops.push_back(op); - oids.push_back(oid); - attrnames.push_back(name); - //attrvals.push_back(pair(val,len)); - bufferlist bl; - bl.append((char*)val,len); - bls.push_back(bl); - } - void setattrs(object_t oid, map& attrset) { - int op = OP_SETATTRS; - ops.push_back(op); - oids.push_back(oid); - pattrsets.push_back(&attrset); - } - void rmattr(object_t oid, const char* name) { - int op = OP_RMATTR; - ops.push_back(op); - oids.push_back(oid); - attrnames.push_back(name); - } - void clone(object_t oid, object_t noid) { - int op = OP_CLONE; - ops.push_back(op); - oids.push_back(oid); - oids.push_back(noid); - } - void create_collection(coll_t cid) { - int op = OP_MKCOLL; - ops.push_back(op); - cids.push_back(cid); - } - void remove_collection(coll_t cid) { - int op = OP_RMCOLL; - ops.push_back(op); - cids.push_back(cid); - } - void collection_add(coll_t cid, object_t oid) { - int op = OP_COLL_ADD; - ops.push_back(op); - cids.push_back(cid); - oids.push_back(oid); - } - void collection_remove(coll_t cid, object_t oid) { - int op = OP_COLL_REMOVE; - ops.push_back(op); - cids.push_back(cid); - oids.push_back(oid); - } - void collection_setattr(coll_t cid, const char* name, const void* val, int len) { - int op = OP_COLL_SETATTR; - ops.push_back(op); - cids.push_back(cid); - attrnames.push_back(name); - bufferlist bl; - bl.append((char*)val, len); - bls.push_back(bl); - } - void 
collection_rmattr(coll_t cid, const char* name) { - int op = OP_COLL_RMATTR; - ops.push_back(op); - cids.push_back(cid); - attrnames.push_back(name); - } - - // etc. - - void _encode(bufferlist& bl) { - ::_encode(ops, bl); - ::_encode(bls, bl); - ::_encode(oids, bl); - ::_encode(cids, bl); - ::_encode(lengths, bl); - ::_encode(attrnames, bl); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(ops, bl, off); - ::_decode(bls, bl, off); - ::_decode(oids, bl, off); - ::_decode(cids, bl, off); - ::_decode(lengths, bl, off); - ::_decode(attrnames2, bl, off); - for (list::iterator p = attrnames2.begin(); - p != attrnames2.end(); - ++p) - attrnames.push_back((*p).c_str()); - } - }; - - - - /* this implementation is here only for naive ObjectStores that - * do not do atomic transactions natively. it is not atomic. - */ - virtual unsigned apply_transaction(Transaction& t, Context *onsafe=0) { - // non-atomic implementation - while (t.have_op()) { - int op = t.get_op(); - switch (op) { - case Transaction::OP_READ: - { - object_t oid; - off_t offset, len; - t.get_oid(oid); - t.get_length(offset); - t.get_length(len); - bufferlist *pbl; - t.get_pbl(pbl); - read(oid, offset, len, *pbl); - } - break; - case Transaction::OP_STAT: - { - object_t oid; - t.get_oid(oid); - struct stat *st; - t.get_pstat(st); - stat(oid, st); - } - break; - case Transaction::OP_GETATTR: - { - object_t oid; - t.get_oid(oid); - const char *attrname; - t.get_attrname(attrname); - pair pattrval; - t.get_pattrval(pattrval); - *pattrval.second = getattr(oid, attrname, pattrval.first, *pattrval.second); - } - break; - case Transaction::OP_GETATTRS: - { - object_t oid; - t.get_oid(oid); - map *pset; - t.get_pattrset(pset); - getattrs(oid, *pset); - } - break; - - case Transaction::OP_WRITE: - { - object_t oid; - t.get_oid(oid); - off_t offset, len; - t.get_length(offset); - t.get_length(len); - bufferlist bl; - t.get_bl(bl); - write(oid, offset, len, bl, 0); - } - break; - - case 
Transaction::OP_TRIMCACHE: - { - object_t oid; - t.get_oid(oid); - off_t offset, len; - t.get_length(offset); - t.get_length(len); - trim_from_cache(oid, offset, len); - } - break; - - case Transaction::OP_TRUNCATE: - { - object_t oid; - t.get_oid(oid); - off_t len; - t.get_length(len); - truncate(oid, len, 0); - } - break; - - case Transaction::OP_REMOVE: - { - object_t oid; - t.get_oid(oid); - remove(oid, 0); - } - break; - - case Transaction::OP_SETATTR: - { - object_t oid; - t.get_oid(oid); - const char *attrname; - t.get_attrname(attrname); - bufferlist bl; - t.get_bl(bl); - setattr(oid, attrname, bl.c_str(), bl.length(), 0); - } - break; - case Transaction::OP_SETATTRS: - { - object_t oid; - t.get_oid(oid); - map *pattrset; - t.get_pattrset(pattrset); - setattrs(oid, *pattrset, 0); - } - break; - - case Transaction::OP_RMATTR: - { - object_t oid; - t.get_oid(oid); - const char *attrname; - t.get_attrname(attrname); - rmattr(oid, attrname, 0); - } - break; - - case Transaction::OP_CLONE: - { - object_t oid; - t.get_oid(oid); - object_t noid; - t.get_oid(noid); - clone(oid, noid); - } - break; - - case Transaction::OP_MKCOLL: - { - coll_t cid; - t.get_cid(cid); - create_collection(cid, 0); - } - break; - - case Transaction::OP_RMCOLL: - { - coll_t cid; - t.get_cid(cid); - destroy_collection(cid, 0); - } - break; - - case Transaction::OP_COLL_ADD: - { - coll_t cid; - t.get_cid(cid); - object_t oid; - t.get_oid(oid); - collection_add(cid, oid, 0); - } - break; - - case Transaction::OP_COLL_REMOVE: - { - coll_t cid; - t.get_cid(cid); - object_t oid; - t.get_oid(oid); - collection_remove(cid, oid, 0); - } - break; - - case Transaction::OP_COLL_SETATTR: - { - coll_t cid; - t.get_cid(cid); - const char *attrname; - t.get_attrname(attrname); - bufferlist bl; - t.get_bl(bl); - collection_setattr(cid, attrname, bl.c_str(), bl.length(), 0); - } - break; - - case Transaction::OP_COLL_RMATTR: - { - coll_t cid; - t.get_cid(cid); - const char *attrname; - 
t.get_attrname(attrname); - collection_rmattr(cid, attrname, 0); - } - break; - - - default: - cerr << "bad op " << op << std::endl; - assert(0); - } - } - - if (onsafe) sync(onsafe); - - return 0; // FIXME count errors - } - - /*********************************************/ - - - - public: - ObjectStore() {} - virtual ~ObjectStore() {} - - // mgmt - virtual int mount() = 0; - virtual int umount() = 0; - virtual int mkfs() = 0; // wipe - - virtual int statfs(struct statfs *buf) = 0; - - // objects - virtual int pick_object_revision_lt(object_t& oid) = 0; - - virtual bool exists(object_t oid) = 0; // useful? - virtual int stat(object_t oid, struct stat *st) = 0; // struct stat? - - virtual int remove(object_t oid, - Context *onsafe=0) = 0; - - virtual int truncate(object_t oid, off_t size, - Context *onsafe=0) = 0; - - virtual int read(object_t oid, - off_t offset, size_t len, - bufferlist& bl) = 0; - virtual int write(object_t oid, - off_t offset, size_t len, - const bufferlist& bl, - Context *onsafe) = 0;//{ return -1; } - virtual void trim_from_cache(object_t oid, - off_t offset, size_t len) { } - virtual int is_cached(object_t oid, - off_t offset, - size_t len) { return -1; } - - virtual int setattr(object_t oid, const char *name, - const void *value, size_t size, - Context *onsafe=0) {return 0;} //= 0; - virtual int setattrs(object_t oid, map& aset, - Context *onsafe=0) {return 0;} //= 0; - virtual int getattr(object_t oid, const char *name, - void *value, size_t size) {return 0;} //= 0; - virtual int getattrs(object_t oid, map& aset) {return 0;}; - - virtual int rmattr(object_t oid, const char *name, - Context *onsafe=0) {return 0;} - - virtual int clone(object_t oid, object_t noid) { - return -1; - } - - virtual int list_objects(list& ls) = 0;//{ return -1; } - - virtual int get_object_collections(object_t oid, set& ls) { return -1; } - - //virtual int listattr(object_t oid, char *attrs, size_t size) {return 0;} //= 0; - - // collections - virtual int 
list_collections(list& ls) {return 0;}//= 0; - virtual int create_collection(coll_t c, - Context *onsafe=0) {return 0;}//= 0; - virtual int destroy_collection(coll_t c, - Context *onsafe=0) {return 0;}//= 0; - virtual bool collection_exists(coll_t c) {return 0;} - virtual int collection_stat(coll_t c, struct stat *st) {return 0;}//= 0; - virtual int collection_add(coll_t c, object_t o, - Context *onsafe=0) {return 0;}//= 0; - virtual int collection_remove(coll_t c, object_t o, - Context *onsafe=0) {return 0;}// = 0; - virtual int collection_list(coll_t c, list& o) {return 0;}//= 0; - - virtual int collection_setattr(coll_t cid, const char *name, - const void *value, size_t size, - Context *onsafe=0) {return 0;} //= 0; - virtual int collection_rmattr(coll_t cid, const char *name, - Context *onsafe=0) {return 0;} //= 0; - virtual int collection_getattr(coll_t cid, const char *name, - void *value, size_t size) {return 0;} //= 0; - - virtual int collection_getattrs(coll_t cid, map &aset) = 0;//{ return -1; } - virtual int collection_setattrs(coll_t cid, map &aset) = 0;//{ return -1; } - - - //virtual int collection_listattr(coll_t cid, char *attrs, size_t size) {return 0;} //= 0; - - virtual void sync(Context *onsync) {} - virtual void sync() {} - - - virtual void _fake_writes(bool b) {}; - - virtual void _get_frag_stat(FragmentationStat& st) {}; - -}; - - -#endif diff --git a/branches/sage/ebofs2/osd/PG.cc b/branches/sage/ebofs2/osd/PG.cc deleted file mode 100644 index 5b55c9a88e1de..0000000000000 --- a/branches/sage/ebofs2/osd/PG.cc +++ /dev/null @@ -1,1289 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "PG.h" -#include "config.h" -#include "OSD.h" - -#include "common/Timer.h" - -#include "messages/MOSDOp.h" -#include "messages/MOSDPGNotify.h" -#include "messages/MOSDPGLog.h" -#include "messages/MOSDPGRemove.h" -#include "messages/MOSDPGActivateSet.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) *_dout << dbeginl << g_clock.now() << " osd" << osd->whoami << " " << (osd->osdmap ? osd->osdmap->get_epoch():0) << " " << *this << " " - - -/******* PGLog ********/ - -void PG::Log::copy_after(const Log &other, eversion_t v) -{ - assert(v >= other.bottom); - top = bottom = other.top; - for (list::const_reverse_iterator i = other.log.rbegin(); - i != other.log.rend(); - i++) { - if (i->version == v) break; - assert(i->version > v); - log.push_front(*i); - } - bottom = v; -} - -bool PG::Log::copy_after_unless_divergent(const Log &other, eversion_t split, eversion_t floor) -{ - assert(split >= other.bottom); - assert(floor >= other.bottom); - assert(floor <= split); - top = bottom = other.top; - - /* runs on replica. split is primary's log.top. floor is how much they want. - split tell us if the primary is divergent.. e.g.: - -> i am A, B is primary, split is 2'6, floor is 2'2. -A B C -2'2 2'2 -2'3 2'3 2'3 -2'4 2'4 2'4 -3'5 | 2'5 2'5 -3'6 | 2'6 -3'7 | -3'8 | -3'9 | - -> i return full backlog. - */ - - for (list::const_reverse_iterator i = other.log.rbegin(); - i != other.log.rend(); - i++) { - // is primary divergent? - // e.g. my 3'6 vs their 2'6 split - if (i->version.version == split.version && i->version.epoch > split.epoch) { - clear(); - return false; // divergent! - } - if (i->version == floor) break; - assert(i->version > floor); - - // e.g. 
my 2'23 > '12 - log.push_front(*i); - } - bottom = floor; - return true; -} - -void PG::Log::copy_non_backlog(const Log &other) -{ - if (other.backlog) { - top = other.top; - bottom = other.bottom; - for (list::const_reverse_iterator i = other.log.rbegin(); - i != other.log.rend(); - i++) - if (i->version > bottom) - log.push_front(*i); - else - break; - } else { - *this = other; - } -} - - - -void PG::IndexedLog::trim(ObjectStore::Transaction& t, eversion_t s) -{ - if (backlog && s < bottom) - s = bottom; - - while (!log.empty()) { - Entry &e = *log.begin(); - - if (e.version > s) break; - - assert(complete_to != log.begin()); - assert(requested_to != log.begin()); - - // remove from index, - unindex(e); - - // from log - log.pop_front(); - } - - // raise bottom? - if (backlog) backlog = false; - if (bottom < s) bottom = s; -} - - -void PG::IndexedLog::trim_write_ahead(eversion_t last_update) -{ - while (!log.empty() && - log.rbegin()->version > last_update) { - // remove from index - unindex(*log.rbegin()); - - // remove - log.pop_back(); - } -} - -void PG::trim_write_ahead() -{ - if (info.last_update < log.top) { - dout(10) << "trim_write_ahead (" << info.last_update << "," << log.top << "]" << dendl; - log.trim_write_ahead(info.last_update); - } else { - assert(info.last_update == log.top); - dout(10) << "trim_write_ahead last_update=top=" << info.last_update << dendl; - } - -} - -void PG::proc_replica_log(Log &olog, Missing& omissing, int from) -{ - dout(10) << "proc_replica_log for osd" << from << ": " << olog << " " << omissing << dendl; - assert(!is_active()); - - if (!have_master_log) { - // i'm building master log. - // note peer's missing. - peer_missing[from] = omissing; - - // merge log into our own log - merge_log(olog, omissing, from); - proc_missing(olog, omissing, from); - } else { - // i'm just building missing lists. - peer_missing[from] = omissing; - - // iterate over peer log. in reverse. 
- list::reverse_iterator pp = olog.log.rbegin(); - eversion_t lu = peer_info[from].last_update; - while (pp != olog.log.rend()) { - if (!log.objects.count(pp->oid)) { - dout(10) << " divergent " << *pp << " not in our log, generating backlog" << dendl; - generate_backlog(); - } - - if (!log.objects.count(pp->oid)) { - dout(10) << " divergent " << *pp << " dne, must have been new, ignoring" << dendl; - ++pp; - continue; - } - - if (log.objects[pp->oid]->version == pp->version) { - break; // we're no longer divergent. - //++pp; - //continue; - } - - if (log.objects[pp->oid]->version > pp->version) { - dout(10) << " divergent " << *pp - << " superceded by " << log.objects[pp->oid] - << ", ignoring" << dendl; - } else { - dout(10) << " divergent " << *pp << ", adding to missing" << dendl; - peer_missing[from].add(pp->oid, pp->version); - } - - ++pp; - if (pp != olog.log.rend()) - lu = pp->version; - else - lu = olog.bottom; - } - - if (lu < peer_info[from].last_update) { - dout(10) << " peer osd" << from << " last_update now " << lu << dendl; - peer_info[from].last_update = lu; - if (lu < oldest_update) { - dout(10) << " oldest_update now " << lu << dendl; - oldest_update = lu; - } - } - - proc_missing(olog, peer_missing[from], from); - } -} - -void PG::merge_log(Log &olog, Missing &omissing, int fromosd) -{ - dout(10) << "merge_log " << olog << " from osd" << fromosd - << " into " << log << dendl; - - //dout(0) << "log" << dendl; - //log.print(cout); - //dout(0) << "olog" << dendl; - //olog.print(cout); - - if (log.empty() || - (olog.bottom > log.top && olog.backlog)) { // e.g. log=(0,20] olog=(40,50]+backlog) - - // swap and index - log.log.swap(olog.log); - log.index(); - - // find split point (old log.top) in new log - // add new items to missing along the way. - for (list::reverse_iterator p = log.log.rbegin(); - p != log.log.rend(); - p++) { - if (p->version <= log.top) { - // ok, p is at split point. - - // was our old log divergent? 
- if (log.top > p->version) { - dout(10) << "merge_log i was (possibly) divergent for (" << p->version << "," << log.top << "]" << dendl; - if (p->version < oldest_update) - oldest_update = p->version; - - while (!olog.log.empty() && - olog.log.rbegin()->version > p->version) { - Log::Entry &oe = *olog.log.rbegin(); // old entry (possibly divergent) - if (log.objects.count(oe.oid)) { - if (log.objects[oe.oid]->version < oe.version) { - dout(10) << "merge_log divergent entry " << oe - << " not superceded by " << *log.objects[oe.oid] - << ", adding to missing" << dendl; - missing.add(oe.oid, oe.version); - } else { - dout(10) << "merge_log divergent entry " << oe - << " superceded by " << *log.objects[oe.oid] - << ", ignoring" << dendl; - } - } else { - dout(10) << "merge_log divergent entry " << oe << ", adding to missing" << dendl; - missing.add(oe.oid, oe.version); - } - olog.log.pop_back(); // discard divergent entry - } - } - break; - } - - if (p->is_delete()) { - dout(10) << "merge_log merging " << *p << ", not missing" << dendl; - missing.rm(p->oid, p->version); - } else { - dout(10) << "merge_log merging " << *p << ", now missing" << dendl; - missing.add(p->oid, p->version); - } - } - - info.last_update = log.top = olog.top; - info.log_bottom = log.bottom = olog.bottom; - info.log_backlog = log.backlog = olog.backlog; - } - - else { - // i can merge the two logs! - - // extend on bottom? - // FIXME: what if we have backlog, but they have lower bottom? - if (olog.bottom < log.bottom && olog.top >= log.bottom && !log.backlog) { - dout(10) << "merge_log extending bottom to " << olog.bottom - << (olog.backlog ? " +backlog":"") - << dendl; - - // ok - list::iterator from = olog.log.begin(); - list::iterator to; - for (to = from; - to != olog.log.end(); - to++) { - if (to->version > log.bottom) break; - - // update our index while we're here - log.index(*to); - - dout(15) << *to << dendl; - - // new missing object? 
- if (to->version > info.last_complete) { - if (to->is_update()) - missing.add(to->oid, to->version); - else - missing.rm(to->oid, to->version); - } - } - assert(to != olog.log.end()); - - // splice into our log. - log.log.splice(log.log.begin(), - olog.log, from, to); - - info.log_bottom = log.bottom = olog.bottom; - info.log_backlog = log.backlog = olog.backlog; - } - - // extend on top? - if (olog.top > log.top && - olog.bottom <= log.top) { - dout(10) << "merge_log extending top to " << olog.top << dendl; - - list::iterator to = olog.log.end(); - list::iterator from = olog.log.end(); - while (1) { - if (from == olog.log.begin()) break; - from--; - //dout(0) << "? " << *from << dendl; - if (from->version < log.top) { - from++; - break; - } - - log.index(*from); - dout(10) << "merge_log " << *from << dendl; - - // add to missing - if (from->is_update()) { - missing.add(from->oid, from->version); - } else - missing.rm(from->oid, from->version); - } - - // remove divergent items - while (1) { - Log::Entry *oldtail = &(*log.log.rbegin()); - if (oldtail->version.version+1 == from->version.version) break; - - // divergent! - assert(oldtail->version.version >= from->version.version); - - if (log.objects[oldtail->oid]->version == oldtail->version) { - // and significant. - dout(10) << "merge_log had divergent " << *oldtail << ", adding to missing" << dendl; - //missing.add(oldtail->oid); - assert(0); - } else { - dout(10) << "merge_log had divergent " << *oldtail << ", already missing" << dendl; - assert(missing.is_missing(oldtail->oid)); - } - log.log.pop_back(); - } - - // splice - log.log.splice(log.log.end(), - olog.log, from, to); - - info.last_update = log.top = olog.top; - } - } - - dout(10) << "merge_log result " << log << " " << missing << dendl; - //log.print(cout); - -} - -void PG::proc_missing(Log &olog, Missing &omissing, int fromosd) -{ - // found items? 
- for (map::iterator p = missing.missing.begin(); - p != missing.missing.end(); - p++) { - if (omissing.is_missing(p->first)) { - assert(omissing.is_missing(p->first, p->second)); - if (omissing.loc.count(p->first)) { - dout(10) << "proc_missing missing " << p->first << " " << p->second - << " on osd" << omissing.loc[p->first] << dendl; - missing.loc[p->first] = omissing.loc[p->first]; - } else { - dout(10) << "proc_missing missing " << p->first << " " << p->second - << " also LOST on source, osd" << fromosd << dendl; - } - } - else if (p->second <= olog.top) { - dout(10) << "proc_missing missing " << p->first << " " << p->second - << " on source, osd" << fromosd << dendl; - missing.loc[p->first] = fromosd; - } else { - dout(10) << "proc_missing " << p->first << " " << p->second - << " > olog.top " << olog.top << ", not found...." - << dendl; - } - } - - dout(10) << "proc_missing missing " << missing.missing << dendl; -} - - - -void PG::generate_backlog() -{ - dout(10) << "generate_backlog to " << log << dendl; - assert(!log.backlog); - log.backlog = true; - - list olist; - osd->store->collection_list(info.pgid, olist); - - int local = 0; - map add; - for (list::iterator it = olist.begin(); - it != olist.end(); - it++) { - local++; - - if (log.logged_object(*it)) continue; // already have it logged. - - // add entry - Log::Entry e; - e.op = Log::Entry::MODIFY; // FIXME when we do smarter op codes! 
- e.oid = *it; - osd->store->getattr(*it, - "version", - &e.version, sizeof(e.version)); - add[e.version] = e; - dout(10) << "generate_backlog found " << e << dendl; - } - - for (map::reverse_iterator i = add.rbegin(); - i != add.rend(); - i++) { - log.log.push_front(i->second); - log.index( *log.log.begin() ); // index - } - - dout(10) << local << " local objects, " - << add.size() << " objects added to backlog, " - << log.objects.size() << " in pg" << dendl; - - //log.print(cout); -} - -void PG::drop_backlog() -{ - dout(10) << "drop_backlog for " << log << dendl; - //log.print(cout); - - assert(log.backlog); - log.backlog = false; - - while (!log.log.empty()) { - Log::Entry &e = *log.log.begin(); - if (e.version > log.bottom) break; - - dout(15) << "drop_backlog trimming " << e.version << dendl; - log.unindex(e); - log.log.pop_front(); - } -} - - - - - -ostream& PG::Log::print(ostream& out) const -{ - out << *this << dendl; - for (list::const_iterator p = log.begin(); - p != log.end(); - p++) - out << *p << dendl; - return out; -} - - - - - -/******* PG ***********/ -void PG::build_prior() -{ - // build prior set. 
- prior_set.clear(); - - // current - for (unsigned i=1; iosdmap->get_epoch(); - epoch++) { - OSDMap omap; - osd->get_map(epoch, omap); - - vector acting; - omap.pg_to_acting_osds(get_pgid(), acting); - - for (unsigned i=0; iosdmap->is_up(acting[i]) && // is up now - acting[i] != osd->whoami) // and is not me - prior_set.insert(acting[i]); - } - } - - dout(10) << "build_prior built " << prior_set << dendl; -} - -void PG::adjust_prior() -{ - assert(!prior_set.empty()); - - // raise last_epoch_started_any - epoch_t max = 0; - for (map::iterator it = peer_info.begin(); - it != peer_info.end(); - it++) { - if (it->second.last_epoch_started > max) - max = it->second.last_epoch_started; - } - - dout(10) << "adjust_prior last_epoch_started_any " - << last_epoch_started_any << " -> " << max << dendl; - assert(max > last_epoch_started_any); - last_epoch_started_any = max; - - // rebuild prior set - build_prior(); -} - - -void PG::clear_primary_state() -{ - dout(10) << "clear_primary_state" << dendl; - - // clear peering state - have_master_log = false; - prior_set.clear(); - stray_set.clear(); - uptodate_set.clear(); - peer_info_requested.clear(); - peer_log_requested.clear(); - peer_info.clear(); - peer_missing.clear(); - - stat_object_temp_rd.clear(); - - last_epoch_started_any = info.last_epoch_started; -} - -void PG::peer(ObjectStore::Transaction& t, - map< int, map >& query_map, - map *activator_map) -{ - dout(10) << "peer. acting is " << acting - << ", prior_set is " << prior_set << dendl; - - - /** GET ALL PG::Info *********/ - - // -- query info from everyone in prior_set. 
- bool missing_info = false; - for (set::iterator it = prior_set.begin(); - it != prior_set.end(); - it++) { - if (peer_info.count(*it)) { - dout(10) << " have info from osd" << *it - << ": " << peer_info[*it] - << dendl; - continue; - } - missing_info = true; - - if (peer_info_requested.count(*it)) { - dout(10) << " waiting for osd" << *it << dendl; - continue; - } - - dout(10) << " querying info from osd" << *it << dendl; - query_map[*it][info.pgid] = Query(Query::INFO, info.history); - peer_info_requested.insert(*it); - } - if (missing_info) return; - - - // -- ok, we have all (prior_set) info. (and maybe others.) - - // did we crash? - dout(10) << " last_epoch_started_any " << last_epoch_started_any << dendl; - if (last_epoch_started_any) { - OSDMap omap; - osd->get_map(last_epoch_started_any, omap); - - // start with the last active set of replicas - set last_started; - vector acting; - bool cleanly_down = true; - omap.pg_to_acting_osds(get_pgid(), acting); - for (unsigned i=0; iosdmap->get_epoch(); - e++) { - OSDMap omap; - osd->get_map(e, omap); - - set still_up; - - for (set::iterator i = last_started.begin(); - i != last_started.end(); - i++) { - //dout(10) << " down in epoch " << e << " is " << omap.get_down_osds() << dendl; - if (omap.is_up(*i)) - still_up.insert(*i); - else if (!omap.is_down_clean(*i)) - cleanly_down = false; - } - - last_started.swap(still_up); - //dout(10) << " still active as of epoch " << e << ": " << last_started << dendl; - } - - if (last_started.empty()) { - if (cleanly_down) { - dout(10) << " cleanly stopped since epoch " << last_epoch_started_any << dendl; - } else { - dout(10) << " crashed since epoch " << last_epoch_started_any << dendl; - state_set(STATE_CRASHED); - } - } else { - dout(10) << " still active from last started: " << last_started << dendl; - } - } else if (osd->osdmap->post_mkfs()) { - dout(10) << " crashed since epoch " << last_epoch_started_any << dendl; - state_set(STATE_CRASHED); - } - - dout(10) << " 
peers_complete_thru " << peers_complete_thru << dendl; - - - - - /** CREATE THE MASTER PG::Log *********/ - - // who (of all priors and active) has the latest PG version? - eversion_t newest_update = info.last_update; - int newest_update_osd = osd->whoami; - - oldest_update = info.last_update; // only of acting (current) osd set. - peers_complete_thru = info.last_complete; - - for (map::iterator it = peer_info.begin(); - it != peer_info.end(); - it++) { - if (it->second.last_update > newest_update) { - newest_update = it->second.last_update; - newest_update_osd = it->first; - } - if (is_acting(it->first)) { - if (it->second.last_update < oldest_update) - oldest_update = it->second.last_update; - if (it->second.last_complete < peers_complete_thru) - peers_complete_thru = it->second.last_complete; - } - } - - // gather log(+missing) from that person! - if (newest_update_osd != osd->whoami) { - if (peer_log_requested.count(newest_update_osd) || - peer_summary_requested.count(newest_update_osd)) { - dout(10) << " newest update on osd" << newest_update_osd - << " v " << newest_update - << ", already queried" - << dendl; - } else { - // we'd like it back to oldest_update, but will settle for log_bottom - eversion_t since = MAX(peer_info[newest_update_osd].log_bottom, - oldest_update); - if (peer_info[newest_update_osd].log_bottom < log.top) { - dout(10) << " newest update on osd" << newest_update_osd - << " v " << newest_update - << ", querying since " << since - << dendl; - query_map[newest_update_osd][info.pgid] = Query(Query::LOG, log.top, since, info.history); - peer_log_requested.insert(newest_update_osd); - } else { - dout(10) << " newest update on osd" << newest_update_osd - << " v " << newest_update - << ", querying entire summary/backlog" - << dendl; - assert((peer_info[newest_update_osd].last_complete >= - peer_info[newest_update_osd].log_bottom) || - peer_info[newest_update_osd].log_backlog); // or else we're in trouble. 
- query_map[newest_update_osd][info.pgid] = Query(Query::BACKLOG, info.history); - peer_summary_requested.insert(newest_update_osd); - } - } - return; - } else { - dout(10) << " newest_update " << info.last_update << " (me)" << dendl; - } - - dout(10) << " oldest_update " << oldest_update << dendl; - - have_master_log = true; - - - // -- do i need to generate backlog for any of my peers? - if (oldest_update < log.bottom && !log.backlog) { - dout(10) << "generating backlog for some peers, bottom " - << log.bottom << " > " << oldest_update - << dendl; - generate_backlog(); - } - - - /** COLLECT MISSING+LOG FROM PEERS **********/ - /* - we also detect divergent replicas here by pulling the full log - from everyone. - */ - - // gather missing from peers - for (unsigned i=1; i 0) { - dout(10) << "there are still " << missing.num_lost() << " lost objects" << dendl; - - // ***** - // FIXME: i don't think this actually accomplishes anything! - // ***** - - // ok, let's get more summaries! - bool waiting = false; - for (map::iterator it = peer_info.begin(); - it != peer_info.end(); - it++) { - int peer = it->first; - - if (peer_summary_requested.count(peer)) { - dout(10) << " already requested summary/backlog from osd" << peer << dendl; - waiting = true; - continue; - } - - dout(10) << " requesting summary/backlog from osd" << peer << dendl; - query_map[peer][info.pgid] = Query(Query::BACKLOG, info.history); - peer_summary_requested.insert(peer); - waiting = true; - } - - if (!waiting) { - dout(10) << missing.num_lost() << " objects are still lost, waiting+hoping for a notify from someone else!" << dendl; - } - return; - } - - // sanity check - assert(missing.num_lost() == 0); - assert(info.last_complete >= log.bottom || log.backlog); - - - // -- crash recovery? 
- if (is_crashed()) { - dout(10) << "crashed, allowing op replay for " << g_conf.osd_replay_window << dendl; - state_set(STATE_REPLAY); - osd->timer.add_event_after(g_conf.osd_replay_window, - new OSD::C_Activate(osd, info.pgid, osd->osdmap->get_epoch())); - } - else if (!is_active()) { - // -- ok, activate! - activate(t, activator_map); - } -} - - -void PG::activate(ObjectStore::Transaction& t, - map *activator_map) -{ - assert(!is_active()); - - // twiddle pg state - state_set(STATE_ACTIVE); - state_clear(STATE_STRAY); - if (is_crashed()) { - //assert(is_replay()); // HELP.. not on replica? - state_clear(STATE_CRASHED); - state_clear(STATE_REPLAY); - } - info.last_epoch_started = osd->osdmap->get_epoch(); - - if (role == 0) { // primary state - peers_complete_thru = 0; // we don't know (yet)! - } - - assert(info.last_complete >= log.bottom || log.backlog); - - // write pg info - t.collection_setattr(info.pgid, "info", (char*)&info, sizeof(info)); - - // write log - write_log(t); - - // clean up stray objects - clean_up_local(t); - - // init complete pointer - if (info.last_complete == info.last_update) { - dout(10) << "activate - complete" << dendl; - log.complete_to == log.log.end(); - log.requested_to = log.log.end(); - } - else if (true) { - dout(10) << "activate - not complete, " << missing << dendl; - - // init complete_to - log.complete_to = log.log.begin(); - while (log.complete_to->version < info.last_complete) { - log.complete_to++; - assert(log.complete_to != log.log.end()); - } - - if (is_primary()) { - // start recovery - dout(10) << "activate - starting recovery" << dendl; - log.requested_to = log.complete_to; - do_recovery(); - } - } else { - dout(10) << "activate - not complete, " << missing << dendl; - } - - // if primary.. - if (role == 0 && - (!g_conf.osd_hack_fast_startup || osd->osdmap->post_mkfs())) { - // who is clean? 
- uptodate_set.clear(); - if (info.is_uptodate()) - uptodate_set.insert(osd->whoami); - - // start up replicas - for (unsigned i=1; icount(peer) == 0) - (*activator_map)[peer] = new MOSDPGActivateSet(osd->osdmap->get_epoch()); - (*activator_map)[peer]->pg_info.push_back(info); - } else { - dout(10) << "activate - peer osd" << peer << " is up to date, but sending pg_log anyway" << dendl; - m = new MOSDPGLog(osd->osdmap->get_epoch(), info); - } - } - else { - m = new MOSDPGLog(osd->osdmap->get_epoch(), info); - if (peer_info[peer].last_update < log.bottom) { - // summary/backlog - assert(log.backlog); - m->log = log; - } else { - // incremental log - assert(peer_info[peer].last_update < info.last_update); - m->log.copy_after(log, peer_info[peer].last_update); - } - } - - // update local version of peer's missing list! - if (m) { - eversion_t plu = peer_info[peer].last_update; - Missing& pm = peer_missing[peer]; - for (list::iterator p = m->log.log.begin(); - p != m->log.log.end(); - p++) - if (p->version > plu) - pm.add(p->oid, p->version); - } - - if (m) { - dout(10) << "activate sending " << m->log << " " << m->missing - << " to osd" << peer << dendl; - //m->log.print(cout); - osd->messenger->send_message(m, osd->osdmap->get_inst(peer)); - } - - // update our missing - if (peer_missing[peer].num_missing() == 0) { - dout(10) << "activate peer osd" << peer << " already uptodate, " << peer_info[peer] << dendl; - assert(peer_info[peer].is_uptodate()); - uptodate_set.insert(peer); - } else { - dout(10) << "activate peer osd" << peer << " " << peer_info[peer] - << " missing " << peer_missing[peer] << dendl; - } - - } - - // discard unneeded peering state - //peer_log.clear(); // actually, do this carefully, in case peer() is called again. - - // all clean? 
- if (is_all_uptodate()) - finish_recovery(); - else { - dout(10) << "activate not all replicas are uptodate, starting recovery" << dendl; - do_recovery(); - } - } - - - // replay (queue them _before_ other waiting ops!) - if (!replay_queue.empty()) { - eversion_t c = info.last_update; - list replay; - for (map::iterator p = replay_queue.begin(); - p != replay_queue.end(); - p++) { - if (p->first <= info.last_update) { - dout(10) << "activate will WRNOOP " << p->first << " " << *p->second << dendl; - replay.push_back(p->second); - continue; - } - if (p->first.version != c.version+1) { - dout(10) << "activate replay " << p->first - << " skipping " << c.version+1 - p->first.version - << " ops" - << dendl; - } - dout(10) << "activate replay " << p->first << " " << *p->second << dendl; - replay.push_back(p->second); - c = p->first; - } - replay_queue.clear(); - osd->take_waiters(replay); - } - - if (is_primary()) - update_stats(); // update stats - - // waiters - osd->take_waiters(waiting_for_active); -} - - -void PG::finish_recovery() -{ - dout(10) << "finish_recovery" << dendl; - - state_set(PG::STATE_CLEAN); - purge_strays(); - update_stats(); -} - - - -void PG::update_stats() -{ - dout(15) << "update_stats" << dendl; - assert(is_primary()); - - // update our stat summary - pg_stats_lock.Lock(); - pg_stats.reported = info.last_update; - pg_stats.state = state; - pg_stats.size = stat_size; - pg_stats.num_blocks = stat_num_blocks; - pg_stats_lock.Unlock(); - - // put in osd stat_queue - osd->pg_stat_queue_lock.Lock(); - osd->pg_stat_queue.insert(info.pgid); - osd->pg_stat_queue_lock.Unlock(); -} - - -void PG::write_log(ObjectStore::Transaction& t) -{ - dout(10) << "write_log" << dendl; - - // assemble buffer - bufferlist bl; - - // build buffer - ondisklog.bottom = 0; - ondisklog.block_map.clear(); - for (list::iterator p = log.log.begin(); - p != log.log.end(); - p++) { - if (bl.length() % 4096 == 0) - ondisklog.block_map[bl.length()] = p->version; - 
bl.append((char*)&(*p), sizeof(*p)); - if (g_conf.osd_pad_pg_log) { // pad to 4k, until i fix ebofs reallocation crap. FIXME. - bufferptr bp(4096 - sizeof(*p)); - bl.push_back(bp); - } - } - ondisklog.top = bl.length(); - - // write it - t.remove( info.pgid.to_object() ); - t.write( info.pgid.to_object() , 0, bl.length(), bl); - t.collection_setattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom)); - t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); - - t.collection_setattr(info.pgid, "info", &info, sizeof(info)); -} - -void PG::trim_ondisklog_to(ObjectStore::Transaction& t, eversion_t v) -{ - dout(15) << " trim_ondisk_log_to v " << v << dendl; - - map::iterator p = ondisklog.block_map.begin(); - while (p != ondisklog.block_map.end()) { - dout(15) << " " << p->first << " -> " << p->second << dendl; - p++; - if (p == ondisklog.block_map.end() || - p->second > v) { // too far! - p--; // back up - break; - } - } - dout(15) << " * " << p->first << " -> " << p->second << dendl; - if (p == ondisklog.block_map.begin()) - return; // can't trim anything! - - // we can trim! 
- off_t trim = p->first; - dout(10) << " trimming ondisklog to [" << ondisklog.bottom << "," << ondisklog.top << ")" << dendl; - - assert(trim >= ondisklog.bottom); - ondisklog.bottom = trim; - - // adjust block_map - while (p != ondisklog.block_map.begin()) - ondisklog.block_map.erase(ondisklog.block_map.begin()); - - t.collection_setattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom)); - t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); -} - - -void PG::append_log(ObjectStore::Transaction& t, PG::Log::Entry& logentry, - eversion_t trim_to) -{ - dout(10) << "append_log " << ondisklog.top << " " << logentry << dendl; - - // write entry on disk - bufferlist bl; - bl.append( (char*)&logentry, sizeof(logentry) ); - if (g_conf.osd_pad_pg_log) { // pad to 4k, until i fix ebofs reallocation crap. FIXME. - bufferptr bp(4096 - sizeof(logentry)); - bl.push_back(bp); - } - t.write( info.pgid.to_object(), ondisklog.top, bl.length(), bl ); - - // update block map? - if (ondisklog.top % 4096 == 0) - ondisklog.block_map[ondisklog.top] = logentry.version; - - ondisklog.top += bl.length(); - t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); - - // trim? 
- if (trim_to > log.bottom) { - dout(10) << " trimming " << log << " to " << trim_to << dendl; - log.trim(t, trim_to); - info.log_bottom = log.bottom; - info.log_backlog = log.backlog; - trim_ondisklog_to(t, trim_to); - } - dout(10) << " ondisklog [" << ondisklog.bottom << "," << ondisklog.top << ")" << dendl; -} - -void PG::read_log(ObjectStore *store) -{ - int r; - // load bounds - ondisklog.bottom = ondisklog.top = 0; - r = store->collection_getattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom)); - assert(r == sizeof(ondisklog.bottom)); - r = store->collection_getattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); - assert(r == sizeof(ondisklog.top)); - - dout(10) << "read_log [" << ondisklog.bottom << "," << ondisklog.top << ")" << dendl; - - log.backlog = info.log_backlog; - log.bottom = info.log_bottom; - - if (ondisklog.top > 0) { - // read - bufferlist bl; - store->read(info.pgid.to_object(), ondisklog.bottom, ondisklog.top-ondisklog.bottom, bl); - if (bl.length() < ondisklog.top-ondisklog.bottom) { - dout(0) << "read_log data doesn't match attrs" << dendl; - assert(0); - } - - PG::Log::Entry e; - off_t pos = ondisklog.bottom; - assert(log.log.empty()); - while (pos < ondisklog.top) { - bl.copy(pos-ondisklog.bottom, sizeof(e), (char*)&e); - dout(10) << "read_log " << pos << " " << e << dendl; - - if (e.version > log.bottom || log.backlog) { // ignore items below log.bottom - if (pos % 4096 == 0) - ondisklog.block_map[pos] = e.version; - log.log.push_back(e); - } else { - dout(10) << "read_log ignoring entry at " << pos << dendl; - } - - if (g_conf.osd_pad_pg_log) // pad to 4k, until i fix ebofs reallocation crap. FIXME. 
- pos += 4096; - else - pos += sizeof(e); - } - } - log.top = info.last_update; - log.index(); - - // build missing - set did; - for (list::reverse_iterator i = log.log.rbegin(); - i != log.log.rend(); - i++) { - if (i->version <= info.last_complete) break; - if (did.count(i->oid)) continue; - did.insert(i->oid); - - if (i->is_delete()) continue; - - eversion_t v; - int r = osd->store->getattr(i->oid, "version", &v, sizeof(v)); - if (r < 0 || v < i->version) - missing.add(i->oid, i->version); - } -} - - - - -// ============================== -// Object locking - -// -// If the target object of the operation op is locked for writing by another client, the function puts op to the waiting queue waiting_for_wr_unlock -// returns true if object was locked, otherwise returns false -// -bool PG::block_if_wrlocked(MOSDOp* op) -{ - object_t oid = op->get_oid(); - - entity_name_t source; - int len = osd->store->getattr(oid, "wrlock", &source, sizeof(entity_name_t)); - //dout(0) << "getattr returns " << len << " on " << oid << dendl; - - if (len == sizeof(source) && - source != op->get_client()) { - //the object is locked for writing by someone else -- add the op to the waiting queue - waiting_for_wr_unlock[oid].push_back(op); - return true; - } - - return false; //the object wasn't locked, so the operation can be handled right away -} - - - - -// ======================= -// revisions - - -/* -int OSD::list_missing_revs(object_t oid, set& revs, PG *pg) -{ - int c = 0; - oid.rev = 0; - - map::iterator p = pg->missing.missing.lower_bound(oid); - if (p == pg->missing.missing.end()) - return 0; // clearly not - - while (p->first.ino == oid.ino && - p->first.bno == oid.bno) { - revs.insert(p->first); - c++; - } - return c; -}*/ - -bool PG::pick_missing_object_rev(object_t& oid) -{ - map::iterator p = missing.missing.upper_bound(oid); - if (p == missing.missing.end()) - return false; // clearly no candidate - - if (p->first.ino == oid.ino && p->first.bno == oid.bno) { - oid = 
p->first; // yes! it's an upper bound revision for me. - return true; - } - return false; -} - -bool PG::pick_object_rev(object_t& oid) -{ - object_t t = oid; - - if (!osd->store->pick_object_revision_lt(t)) - return false; // we have no revisions of this object! - - objectrev_t crev; - int r = osd->store->getattr(t, "crev", &crev, sizeof(crev)); - assert(r >= 0); - if (crev <= oid.rev) { - dout(10) << "pick_object_rev choosing " << t << " crev " << crev << " for " << oid << dendl; - oid = t; - return true; - } - - return false; -} - - - - - diff --git a/branches/sage/ebofs2/osd/PG.h b/branches/sage/ebofs2/osd/PG.h deleted file mode 100644 index 0e14ea3a2ed63..0000000000000 --- a/branches/sage/ebofs2/osd/PG.h +++ /dev/null @@ -1,754 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __PG_H -#define __PG_H - - -#include "include/types.h" -#include "osd_types.h" -#include "include/buffer.h" - -#include "OSDMap.h" -#include "ObjectStore.h" -#include "msg/Messenger.h" - -#include "common/DecayCounter.h" - -#include -#include -using namespace std; - -#include -#include -using namespace __gnu_cxx; - - -class OSD; -class MOSDOp; -class MOSDOpReply; -class MOSDPGActivateSet; - -/** PG - Replica Placement Group - * - */ - -class PG { -public: - - /* - * PG::Info - summary of PG statistics. - * - * some notes: - * - last_complete implies we have all objects that existed as of that - * stamp, OR a newer object, OR have already applied a later delete. - * - if last_complete >= log.bottom, then we know pg contents thru log.top. - * otherwise, we have no idea what the pg is supposed to contain. 
- */ - struct Info { - pg_t pgid; - eversion_t last_update; // last object version applied to store. - eversion_t last_complete; // last version pg was complete through. - - eversion_t log_bottom; // oldest log entry. - bool log_backlog; // do we store a complete log? - - epoch_t last_epoch_started; // last epoch started. - epoch_t last_epoch_finished; // last epoch finished. - - struct History { - epoch_t same_since; // same acting set since - epoch_t same_primary_since; // same primary at least back through this epoch. - epoch_t same_acker_since; // same acker at least back through this epoch. - History() : same_since(0), same_primary_since(0), same_acker_since(0) {} - } history; - - Info(pg_t p=0) : pgid(p), - log_backlog(false), - last_epoch_started(0), last_epoch_finished(0) {} - bool is_uptodate() const { return last_update == last_complete; } - bool is_empty() const { return last_update.version == 0; } - }; - - - /** - * Query - used to ask a peer for information about a pg. - * - * note: if version=0, type=LOG, then we just provide our full log. - * only if type=BACKLOG do we generate a backlog and provide that too. - */ - struct Query { - const static int INFO = 0; - const static int LOG = 1; - const static int BACKLOG = 2; - const static int FULLLOG = 3; - - int type; - eversion_t split, floor; - Info::History history; - - Query() : type(-1) {} - Query(int t, Info::History& h) : - type(t), history(h) { assert(t != LOG); } - Query(int t, eversion_t s, eversion_t f, Info::History& h) : - type(t), split(s), floor(f), history(h) { assert(t == LOG); } - }; - - - /* - * Missing - summary of missing objects. - * kept in memory, as a supplement to Log. - * also used to pass missing info in messages. - */ - class Missing { - public: - map missing; // oid -> v - map rmissing; // v -> oid - - map loc; // where i think i can get them. 
- - int num_lost() const { return missing.size() - loc.size(); } - int num_missing() const { return missing.size(); } - - bool is_missing(object_t oid) { - return missing.count(oid); - } - bool is_missing(object_t oid, eversion_t v) { - return missing.count(oid) && missing[oid] <= v; - } - void add(object_t oid) { - eversion_t z; - add(oid,z); - } - void add(object_t oid, eversion_t v) { - if (missing.count(oid)) { - if (missing[oid] > v) return; // already missing newer. - rmissing.erase(missing[oid]); - } - missing[oid] = v; - rmissing[v] = oid; - } - void rm(object_t oid, eversion_t when) { - if (missing.count(oid) && missing[oid] < when) { - rmissing.erase(missing[oid]); - missing.erase(oid); - loc.erase(oid); - } - } - void got(object_t oid, eversion_t v) { - assert(missing.count(oid)); - assert(missing[oid] <= v); - loc.erase(oid); - rmissing.erase(missing[oid]); - missing.erase(oid); - } - void got(object_t oid) { - assert(missing.count(oid)); - loc.erase(oid); - rmissing.erase(missing[oid]); - missing.erase(oid); - } - - void _encode(bufferlist& blist) { - ::_encode(missing, blist); - ::_encode(loc, blist); - } - void _decode(bufferlist& blist, int& off) { - ::_decode(missing, blist, off); - ::_decode(loc, blist, off); - - for (map::iterator it = missing.begin(); - it != missing.end(); - it++) - rmissing[it->second] = it->first; - } - }; - - - /* - * Log - incremental log of recent pg changes. - * also, serves as a recovery queue. - * - * when backlog is true, - * objects with versions <= bottom are in log. - * we do not have any deletion info before that time, however. - * log is a "summary" in that it contains all objects in the PG. - */ - class Log { - public: - /** top, bottom - * top - newest entry (update|delete) - * bottom - entry previous to oldest (update|delete) for which we have - * complete negative information. - * i.e. we can infer pg contents for any store whose last_update >= bottom. 
- */ - eversion_t top; // newest entry (update|delete) - eversion_t bottom; // version prior to oldest (update|delete) - - /** backlog - true if log is a complete summary of pg contents. - * updated will include all items in pg, but deleted will not include - * negative entries for items deleted prior to 'bottom'. - */ - bool backlog; - - /** Entry - * mapped from the eversion_t, so don't include that. - */ - class Entry { - public: - const static int LOST = 0; - const static int MODIFY = 1; - const static int CLONE = 2; - const static int DELETE = 3; - - int op; // write, zero, trunc, remove - object_t oid; - eversion_t version; - - osdreqid_t reqid; // caller+tid to uniquely identify request - - Entry() : op(0) {} - Entry(int _op, object_t _oid, const eversion_t& v, - const osdreqid_t& rid) : - op(_op), oid(_oid), version(v), reqid(rid) {} - - bool is_delete() const { return op == DELETE; } - bool is_clone() const { return op == CLONE; } - bool is_modify() const { return op == MODIFY; } - bool is_update() const { return is_clone() || is_modify(); } - }; - - list log; // the actual log. 
- - Log() : backlog(false) {} - - void clear() { - eversion_t z; - top = bottom = z; - backlog = false; - log.clear(); - } - bool empty() const { - return top.version == 0 && top.epoch == 0; - } - - void _encode(bufferlist& blist) const { - blist.append((char*)&top, sizeof(top)); - blist.append((char*)&bottom, sizeof(bottom)); - blist.append((char*)&backlog, sizeof(backlog)); - ::_encode(log, blist); - } - void _decode(bufferlist& blist, int& off) { - blist.copy(off, sizeof(top), (char*)&top); - off += sizeof(top); - blist.copy(off, sizeof(bottom), (char*)&bottom); - off += sizeof(bottom); - blist.copy(off, sizeof(backlog), (char*)&backlog); - off += sizeof(backlog); - - ::_decode(log, blist, off); - } - - void copy_after(const Log &other, eversion_t v); - bool copy_after_unless_divergent(const Log &other, eversion_t split, eversion_t floor); - void copy_non_backlog(const Log &other); - ostream& print(ostream& out) const; - }; - - /** - * IndexLog - adds in-memory index of the log, by oid. - * plus some methods to manipulate it all. - */ - class IndexedLog : public Log { - public: - hash_map objects; // ptrs into log. be careful! 
- hash_set caller_ops; - - // recovery pointers - list::iterator requested_to; // not inclusive of referenced item - list::iterator complete_to; // not inclusive of referenced item - - /****/ - IndexedLog() {} - - void clear() { - assert(0); - unindex(); - Log::clear(); - } - - bool logged_object(object_t oid) { - return objects.count(oid); - } - bool logged_req(const osdreqid_t &r) { - return caller_ops.count(r); - } - - void index() { - objects.clear(); - caller_ops.clear(); - for (list::iterator i = log.begin(); - i != log.end(); - i++) { - objects[i->oid] = &(*i); - caller_ops.insert(i->reqid); - } - } - - void index(Entry& e) { - if (objects.count(e.oid) == 0 || - objects[e.oid]->version < e.version) - objects[e.oid] = &e; - caller_ops.insert(e.reqid); - } - void unindex() { - objects.clear(); - caller_ops.clear(); - } - void unindex(Entry& e) { - // NOTE: this only works if we remove from the _bottom_ of the log! - assert(objects.count(e.oid)); - if (objects[e.oid]->version == e.version) - objects.erase(e.oid); - caller_ops.erase(e.reqid); - } - - - // accessors - Entry *is_updated(object_t oid) { - if (objects.count(oid) && objects[oid]->is_update()) return objects[oid]; - return 0; - } - Entry *is_deleted(object_t oid) { - if (objects.count(oid) && objects[oid]->is_delete()) return objects[oid]; - return 0; - } - - // actors - void add(Entry& e) { - // add to log - log.push_back(e); - assert(e.version > top); - assert(top.version == 0 || e.version.version > top.version); - top = e.version; - - // to our index - objects[e.oid] = &(log.back()); - caller_ops.insert(e.reqid); - } - - void trim(ObjectStore::Transaction &t, eversion_t s); - void trim_write_ahead(eversion_t last_update); - }; - - - /** - * OndiskLog - some info about how we store the log on disk. - */ - class OndiskLog { - public: - // ok - off_t bottom; // first byte of log. - off_t top; // byte following end of log. 
- map block_map; // block -> first stamp logged there - - OndiskLog() : bottom(0), top(0) {} - - bool trim_to(eversion_t v, ObjectStore::Transaction& t); - }; - - - /*** PG ****/ -public: - // any - static const int STATE_ACTIVE = 1; // i am active. (primary: replicas too) - - // primary - static const int STATE_CLEAN = 2; // peers are complete, clean of stray replicas. - static const int STATE_CRASHED = 4; // all replicas went down. - static const int STATE_REPLAY = 8; // crashed, waiting for replay - - // non-primary - static const int STATE_STRAY = 16; // i must notify the primary i exist. - - static std::string get_state_string(int state) { - std::string st; - if (state & STATE_ACTIVE) st += "active+"; - if (state & STATE_CLEAN) st += "clean+"; - if (state & STATE_CRASHED) st += "crashed+"; - if (state & STATE_REPLAY) st += "replay+"; - if (state & STATE_STRAY) st += "stray+"; - if (!st.length()) - st = "inactive"; - else - st.resize(st.length()-1); - return st; - } - -protected: - OSD *osd; - - /** locking and reference counting. - * I destroy myself when the reference count hits zero. - * lock() should be called before doing anything. - * get() should be called on pointer copy (to another thread, etc.). - * put() should be called on destruction of some previously copied pointer. - * put_unlock() when done with the current pointer (_most common_). - */ - Mutex _lock; - int ref; - bool deleted; - -public: - void lock() { - //cout << this << " " << info.pgid << " lock" << endl; - _lock.Lock(); - } - void unlock() { - //cout << this << " " << info.pgid << " unlock" << endl; - _lock.Unlock(); - } - void get() { - //cout << this << " " << info.pgid << " get " << ref << endl; - assert(_lock.is_locked()); - ++ref; - } - void put() { - //cout << this << " " << info.pgid << " put " << ref << endl; - assert(_lock.is_locked()); - --ref; - assert(ref > 0); // last put must be a put_unlock. 
- } - void put_unlock() { - //cout << this << " " << info.pgid << " put_unlock " << ref << endl; - assert(_lock.is_locked()); - --ref; - _lock.Unlock(); - if (ref == 0) delete this; - } - - - list op_queue; // op queue - - - void mark_deleted() { deleted = true; } - bool is_deleted() { return deleted; } - -public: - // pg state - Info info; - IndexedLog log; - OndiskLog ondisklog; - Missing missing; - utime_t last_heartbeat; // - -protected: - int role; // 0 = primary, 1 = replica, -1=none. - int state; // see bit defns above - - // primary state - public: - vector acting; - epoch_t last_epoch_started_any; - eversion_t last_complete_commit; - - // [primary only] content recovery state - eversion_t peers_complete_thru; - bool have_master_log; - protected: - set prior_set; // current+prior OSDs, as defined by last_epoch_started_any. - set stray_set; // non-acting osds that have PG data. - set uptodate_set; // current OSDs that are uptodate - eversion_t oldest_update; // lowest (valid) last_update in active set - map peer_info; // info from peers (stray or prior) - set peer_info_requested; - map peer_missing; - set peer_log_requested; // logs i've requested (and start stamps) - set peer_summary_requested; - friend class OSD; - - - // pg waiters - list waiting_for_active; - hash_map > waiting_for_missing_object; - map replay_queue; - - hash_map > waiting_for_wr_unlock; - - bool block_if_wrlocked(MOSDOp* op); - - - // recovery - map objects_pulling; // which objects are currently being pulled - - - - // stats - off_t stat_size; - off_t stat_num_blocks; - - hash_map stat_object_temp_rd; - - Mutex pg_stats_lock; - pg_stat_t pg_stats; - - void update_stats(); - -public: - void clear_primary_state(); - - public: - bool is_acting(int osd) const { - for (unsigned i=0; i peers_complete_thru) { - peers_complete_thru = t; - return true; - } - return false; - } - - void proc_replica_log(Log &olog, Missing& omissing, int from); - void merge_log(Log &olog, Missing& omissing, int 
from); - void proc_missing(Log &olog, Missing &omissing, int fromosd); - - void generate_backlog(); - void drop_backlog(); - - void trim_write_ahead(); - - void peer(ObjectStore::Transaction& t, - map< int, map >& query_map, - map *activator_map=0); - void activate(ObjectStore::Transaction& t, - map *activator_map=0); - - virtual void clean_up_local(ObjectStore::Transaction& t) = 0; - - virtual void cancel_recovery() = 0; - virtual bool do_recovery() = 0; - virtual void purge_strays() = 0; - - void finish_recovery(); - - off_t get_log_write_pos() { - return 0; - } - - friend class C_OSD_RepModify_Commit; - - public: - PG(OSD *o, pg_t p) : - osd(o), - ref(0), deleted(false), - info(p), - role(0), - state(0), - last_epoch_started_any(0), - last_complete_commit(0), - peers_complete_thru(0), - have_master_log(true), - stat_size(0), stat_num_blocks(0) - { } - virtual ~PG() { } - - pg_t get_pgid() const { return info.pgid; } - int get_nrep() const { return acting.size(); } - - int get_primary() { return acting.empty() ? -1:acting[0]; } - //int get_tail() { return acting.empty() ? -1:acting[ acting.size()-1 ]; } - //int get_acker() { return g_conf.osd_rep == OSD_REP_PRIMARY ? 
get_primary():get_tail(); } - int get_acker() { - if (g_conf.osd_rep == OSD_REP_PRIMARY || - acting.size() <= 1) - return get_primary(); - return acting[1]; - } - - int get_role() const { return role; } - void set_role(int r) { role = r; } - - bool is_primary() const { return role == PG_ROLE_HEAD; } - bool is_acker() const { - if (g_conf.osd_rep == OSD_REP_PRIMARY) - return is_primary(); - else - return role == PG_ROLE_ACKER; - } - bool is_head() const { return role == PG_ROLE_HEAD; } - bool is_middle() const { return role == PG_ROLE_MIDDLE; } - bool is_residual() const { return role == PG_ROLE_STRAY; } - - //int get_state() const { return state; } - bool state_test(int m) const { return (state & m) != 0; } - void state_set(int m) { state |= m; } - void state_clear(int m) { state &= ~m; } - - bool is_complete() const { return info.last_complete == info.last_update; } - - bool is_active() const { return state_test(STATE_ACTIVE); } - bool is_crashed() const { return state_test(STATE_CRASHED); } - bool is_replay() const { return state_test(STATE_REPLAY); } - //bool is_complete() { return state_test(STATE_COMPLETE); } - bool is_clean() const { return state_test(STATE_CLEAN); } - bool is_stray() const { return state_test(STATE_STRAY); } - - bool is_empty() const { return info.last_update == 0; } - - int num_active_ops() const { - return objects_pulling.size(); - } - - // pg on-disk state - void write_log(ObjectStore::Transaction& t); - void append_log(ObjectStore::Transaction& t, - PG::Log::Entry& logentry, - eversion_t trim_to); - void read_log(ObjectStore *store); - void trim_ondisklog_to(ObjectStore::Transaction& t, eversion_t v); - - - bool is_dup(osdreqid_t rid) { - return log.logged_req(rid); - } - - - bool pick_missing_object_rev(object_t& oid); - bool pick_object_rev(object_t& oid); - - - - // abstract bits - virtual bool preprocess_op(MOSDOp *op, utime_t now) { return false; } - virtual void do_op(MOSDOp *op) = 0; - virtual void do_op_reply(MOSDOpReply *op) = 
0; - - virtual bool same_for_read_since(epoch_t e) = 0; - virtual bool same_for_modify_since(epoch_t e) = 0; - virtual bool same_for_rep_modify_since(epoch_t e) = 0; - - virtual bool is_missing_object(object_t oid) = 0; - virtual void wait_for_missing_object(object_t oid, MOSDOp *op) = 0; - - virtual void note_failed_osd(int osd) = 0; - - virtual void on_acker_change() = 0; - virtual void on_role_change() = 0; -}; - - - -inline ostream& operator<<(ostream& out, const PG::Info::History& h) -{ - return out << h.same_since << "/" << h.same_primary_since << "/" << h.same_acker_since; -} - -inline ostream& operator<<(ostream& out, const PG::Info& pgi) -{ - out << pgi.pgid << "("; - if (pgi.is_empty()) - out << " empty"; - else - out << " v " << pgi.last_update << "/" << pgi.last_complete - << " (" << pgi.log_bottom << "," << pgi.last_update << "]" - << (pgi.log_backlog ? "+backlog":""); - out << " e " << pgi.last_epoch_started << "/" << pgi.last_epoch_finished - << " " << pgi.history - << ")"; - return out; -} - -inline ostream& operator<<(ostream& out, const PG::Log::Entry& e) -{ - return out << " " << e.version - << (e.is_delete() ? " - ": - (e.is_clone() ? " c ": - (e.is_modify() ? " m ": - " ? "))) - << e.oid << " by " << e.reqid; -} - -inline ostream& operator<<(ostream& out, const PG::Log& log) -{ - out << "log(" << log.bottom << "," << log.top << "]"; - if (log.backlog) out << "+backlog"; - return out; -} - -inline ostream& operator<<(ostream& out, const PG::Missing& missing) -{ - out << "missing(" << missing.num_missing(); - if (missing.num_lost()) out << ", " << missing.num_lost() << " lost"; - out << ")"; - return out; -} - -inline ostream& operator<<(ostream& out, const PG& pg) -{ - out << "pg[" << pg.info - << " r=" << pg.get_role(); - - if (pg.log.bottom != pg.info.log_bottom) - out << " (info mismatch, " << pg.log << ")"; - - if (pg.log.log.empty()) { - // shoudl it be? 
- if (pg.log.top.version - pg.log.bottom.version != 0) { - out << " (log bound mismatch, empty)"; - } - } else { - if (((pg.log.log.begin()->version.version - 1 != pg.log.bottom.version) && - !pg.log.backlog) || - (pg.log.log.rbegin()->version.version != pg.log.top.version)) { - out << " (log bound mismatch, actual=[" - << pg.log.log.begin()->version << "," - << pg.log.log.rbegin()->version << "] len=" << pg.log.log.size() << ")"; - } - } - - if (pg.get_role() == 0) out << " pct " << pg.peers_complete_thru; - if (!pg.have_master_log) out << " !hml"; - if (pg.is_active()) out << " active"; - if (pg.is_crashed()) out << " crashed"; - if (pg.is_replay()) out << " replay"; - if (pg.is_clean()) out << " clean"; - if (pg.is_stray()) out << " stray"; - //out << " (" << pg.log.bottom << "," << pg.log.top << "]"; - if (pg.missing.num_missing()) out << " m=" << pg.missing.num_missing(); - if (pg.missing.num_lost()) out << " l=" << pg.missing.num_lost(); - out << "]"; - - - return out; -} - - - -#endif diff --git a/branches/sage/ebofs2/osd/RAID4PG.cc b/branches/sage/ebofs2/osd/RAID4PG.cc deleted file mode 100644 index 20cd6d8ab416b..0000000000000 --- a/branches/sage/ebofs2/osd/RAID4PG.cc +++ /dev/null @@ -1,123 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "RAID4PG.h" -#include "OSD.h" - -#include "common/Logger.h" - -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" - -#include "messages/MOSDPGNotify.h" -#include "messages/MOSDPGRemove.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) *_dout << dbeginl << g_clock.now() << " osd" << osd->get_nodeid() << " " << (osd->osdmap ? osd->osdmap->get_epoch():0) << " " << *this << " " - -#include -#include - - - - -void RAID4PG::do_op(MOSDOp *op) -{ - - -} - - - -void RAID4PG::do_op_reply(MOSDOpReply *reply) -{ - -} - - - -// ----------------- -// pg changes - -bool RAID4PG::same_for_read_since(epoch_t e) -{ - return e >= info.history.same_since; // whole pg set same -} - -bool RAID4PG::same_for_modify_since(epoch_t e) -{ - return e >= info.history.same_since; // whole pg set same -} - -bool RAID4PG::same_for_rep_modify_since(epoch_t e) -{ - return e >= info.history.same_since; // whole pg set same -} - - -// ----------------- -// RECOVERY - -bool RAID4PG::is_missing_object(object_t oid) -{ - return false; -} - -void RAID4PG::wait_for_missing_object(object_t oid, MOSDOp *op) -{ - //assert(0); -} - -void RAID4PG::note_failed_osd(int o) -{ - dout(10) << "note_failed_osd osd" << o << dendl; - //assert(0); -} - -void RAID4PG::on_acker_change() -{ - dout(10) << "on_acker_change" << dendl; - //assert(0); -} - - -void RAID4PG::on_role_change() -{ - dout(10) << "on_role_change" << dendl; - //assert(0); -} - - -void RAID4PG::clean_up_local(ObjectStore::Transaction&) -{ -} - -void RAID4PG::cancel_recovery() -{ - //assert(0); -} - -bool RAID4PG::do_recovery() -{ - //assert(0); - return false; -} - -void RAID4PG::purge_strays() -{ - //assert(0); -} - - - diff --git a/branches/sage/ebofs2/osd/RAID4PG.h b/branches/sage/ebofs2/osd/RAID4PG.h deleted file mode 100644 index 98e4deab56895..0000000000000 --- a/branches/sage/ebofs2/osd/RAID4PG.h +++ /dev/null @@ -1,74 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; 
indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __RAID4PG_H -#define __RAID4PG_H - - -#include "PG.h" - -#include "messages/MOSDOp.h" - - -class RAID4PG : public PG { -public: - -protected: - - void prepare_log_transaction(ObjectStore::Transaction& t, - MOSDOp *op, eversion_t& version, - objectrev_t crev, objectrev_t rev, - eversion_t trim_to); - void prepare_op_transaction(ObjectStore::Transaction& t, - MOSDOp *op, eversion_t& version, - objectrev_t crev, objectrev_t rev); - - void op_stat(MOSDOp *op); - int op_read(MOSDOp *op); - void op_modify(MOSDOp *op); - void op_rep_modify(MOSDOp *op); - void op_push(MOSDOp *op); - void op_pull(MOSDOp *op); - - - -public: - RAID4PG(OSD *o, pg_t p) : PG(o,p) { } - - void do_op(MOSDOp *op); - void do_op_reply(MOSDOpReply *r); - - bool same_for_read_since(epoch_t e); - bool same_for_modify_since(epoch_t e); - bool same_for_rep_modify_since(epoch_t e); - - bool is_missing_object(object_t oid); - void wait_for_missing_object(object_t oid, MOSDOp *op); - - void note_failed_osd(int osd); - - void on_acker_change(); - void on_role_change(); - - void clean_up_local(ObjectStore::Transaction& t); - - void cancel_recovery(); - bool do_recovery(); - - void purge_strays(); - - -}; - - -#endif diff --git a/branches/sage/ebofs2/osd/ReplicatedPG.cc b/branches/sage/ebofs2/osd/ReplicatedPG.cc deleted file mode 100644 index 7b5bdf581d643..0000000000000 --- a/branches/sage/ebofs2/osd/ReplicatedPG.cc +++ /dev/null @@ -1,1972 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it 
and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "ReplicatedPG.h" -#include "OSD.h" - -#include "common/Logger.h" - -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" - -#include "messages/MOSDPGNotify.h" -#include "messages/MOSDPGRemove.h" - -#include "messages/MOSDPing.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) *_dout << dbeginl << g_clock.now() << " osd" << osd->get_nodeid() << " " << (osd->osdmap ? osd->osdmap->get_epoch():0) << " " << *this << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) *_derr << dbeginl << g_clock.now() << " osd" << osd->get_nodeid() << " " << (osd->osdmap ? osd->osdmap->get_epoch():0) << " " << *this << " " - -#include -#include - -static const int LOAD_LATENCY = 1; -static const int LOAD_QUEUE_SIZE = 2; -static const int LOAD_HYBRID = 3; - - -// ======================= -// pg changes - -bool ReplicatedPG::same_for_read_since(epoch_t e) -{ - return (e >= info.history.same_acker_since); -} - -bool ReplicatedPG::same_for_modify_since(epoch_t e) -{ - return (e >= info.history.same_primary_since); -} - -bool ReplicatedPG::same_for_rep_modify_since(epoch_t e) -{ - // check osd map: same set, or primary+acker? - - if (g_conf.osd_rep == OSD_REP_CHAIN) { - return e >= info.history.same_since; // whole pg set same - } else { - // primary, splay - return (e >= info.history.same_primary_since && - e >= info.history.same_acker_since); - } -} - -// ==================== -// missing objects - -bool ReplicatedPG::is_missing_object(object_t oid) -{ - return missing.missing.count(oid); -} - - -void ReplicatedPG::wait_for_missing_object(object_t oid, MOSDOp *op) -{ - assert(is_missing_object(oid)); - - // we don't have it (yet). 
- eversion_t v = missing.missing[oid]; - if (objects_pulling.count(oid)) { - dout(7) << "missing " - << oid - << " v " << v - << ", already pulling" - << dendl; - } else { - dout(7) << "missing " - << oid - << " v " << v - << ", pulling" - << dendl; - pull(oid); - } - waiting_for_missing_object[oid].push_back(op); -} - - - - -/** preprocess_op - preprocess an op (before it gets queued). - * fasttrack read - */ -bool ReplicatedPG::preprocess_op(MOSDOp *op, utime_t now) -{ - // we only care about reads here on out.. - if (!op->is_read()) - return false; - - object_t oid = op->get_oid(); - - // -- load balance reads -- - if (is_primary() && - g_conf.osd_rep == OSD_REP_PRIMARY) { - // -- read on primary+acker --- - - // test - if (false) { - if (acting.size() > 1) { - int peer = acting[1]; - dout(-10) << "preprocess_op fwd client read op to osd" << peer << " for " << op->get_client() << " " << op->get_client_inst() << dendl; - osd->messenger->send_message(op, osd->osdmap->get_inst(peer)); - return true; - } - } - - // -- balance reads? - if (g_conf.osd_balance_reads && - !op->get_source().is_osd()) { - // flash crowd? - bool is_flash_crowd_candidate = false; - if (g_conf.osd_flash_crowd_iat_threshold > 0) { - osd->iat_averager.add_sample( oid, (double)g_clock.now() ); - is_flash_crowd_candidate = osd->iat_averager.is_flash_crowd_candidate( oid ); - } - - // hot? - double temp = 0; - if (stat_object_temp_rd.count(oid)) - temp = stat_object_temp_rd[oid].get(op->get_recv_stamp()); - bool is_hotly_read = temp > g_conf.osd_balance_reads_temp; - - dout(20) << "balance_reads oid " << oid << " temp " << temp - << (is_hotly_read ? " hotly_read":"") - << (is_flash_crowd_candidate ? " flash_crowd_candidate":"") - << dendl; - - bool should_balance = is_flash_crowd_candidate || is_hotly_read; - bool is_balanced = false; - bool b; - // *** FIXME *** this may block, and we're in the fast path! 
*** - if (osd->store->getattr(oid, "balance-reads", &b, 1) >= 0) - is_balanced = true; - - if (!is_balanced && should_balance && - balancing_reads.count(oid) == 0) { - dout(-10) << "preprocess_op balance-reads on " << oid << dendl; - balancing_reads.insert(oid); - MOSDOp *pop = new MOSDOp(osd->messenger->get_myinst(), 0, osd->get_tid(), - oid, - ObjectLayout(info.pgid), - osd->osdmap->get_epoch(), - OSD_OP_BALANCEREADS); - do_op(pop); - } - if (is_balanced && !should_balance && - !unbalancing_reads.count(oid) == 0) { - dout(-10) << "preprocess_op unbalance-reads on " << oid << dendl; - unbalancing_reads.insert(oid); - MOSDOp *pop = new MOSDOp(osd->messenger->get_myinst(), 0, osd->get_tid(), - oid, - ObjectLayout(info.pgid), - osd->osdmap->get_epoch(), - OSD_OP_UNBALANCEREADS); - do_op(pop); - } - } - - // -- read shedding - if (g_conf.osd_shed_reads && - g_conf.osd_stat_refresh_interval > 0 && - !op->get_source().is_osd()) { // no re-shedding! - Mutex::Locker lock(osd->peer_stat_lock); - - osd->_refresh_my_stat(now); - - // check my load. - // TODO xxx we must also compare with our own load - // if i am x percentage higher than replica , - // redirect the read - - int shedto = -1; - double bestscore = 0.0; // highest positive score wins - - // we calculate score values such that we can interpret them as a probability. - - switch (g_conf.osd_shed_reads) { - case LOAD_LATENCY: - // above some minimum? - if (osd->my_stat.read_latency >= g_conf.osd_shed_reads_min_latency) { - for (unsigned i=1; ipeer_stat.count(peer) == 0) continue; - - // assume a read_latency of 0 (technically, undefined) is OK, since - // we'll be corrected soon enough if we're wrong. 
- - double plat = osd->peer_stat[peer].read_latency_mine; - - double diff = osd->my_stat.read_latency - plat; - if (diff < g_conf.osd_shed_reads_min_latency_diff) continue; - - double c = .002; // add in a constant to smooth it a bit - double latratio = - (c+osd->my_stat.read_latency) / - (c+plat); - double p = (latratio - 1.0) / 2.0 / latratio; - dout(15) << "preprocess_op " << op->get_reqid() - << " my read latency " << osd->my_stat.read_latency - << ", peer osd" << peer << " is " << plat << " (" << osd->peer_stat[peer].read_latency << ")" - << ", latratio " << latratio - << ", p=" << p - << dendl; - if (latratio > g_conf.osd_shed_reads_min_latency_ratio && - p > bestscore && - drand48() < p) { - shedto = peer; - bestscore = p; - } - } - } - break; - - case LOAD_HYBRID: - // dumb mostly - if (osd->my_stat.read_latency >= g_conf.osd_shed_reads_min_latency) { - for (unsigned i=1; ipeer_stat.count(peer) == 0/* || - osd->peer_stat[peer].read_latency <= 0*/) continue; - - if (osd->peer_stat[peer].qlen < osd->my_stat.qlen) { - - if (osd->my_stat.read_latency - osd->peer_stat[peer].read_latency > - g_conf.osd_shed_reads_min_latency_diff) continue; - - double qratio = osd->pending_ops / osd->peer_stat[peer].qlen; - - double c = .002; // add in a constant to smooth it a bit - double latratio = - (c+osd->my_stat.read_latency)/ - (c+osd->peer_stat[peer].read_latency); - double p = (latratio - 1.0) / 2.0 / latratio; - - dout(-15) << "preprocess_op " << op->get_reqid() - << " my qlen / rdlat " - << osd->pending_ops << " " << osd->my_stat.read_latency - << ", peer osd" << peer << " is " - << osd->peer_stat[peer].qlen << " " << osd->peer_stat[peer].read_latency - << ", qratio " << qratio - << ", latratio " << latratio - << ", p=" << p - << dendl; - if (latratio > g_conf.osd_shed_reads_min_latency_ratio && - p > bestscore && - drand48() < p) { - shedto = peer; - bestscore = p; - } - } - } - } - break; - - /* - case LOAD_QUEUE_SIZE: - // am i above my average? 
-- dumb - if (osd->pending_ops > osd->my_stat.qlen) { - // yes. is there a peer who is below my average? - for (unsigned i=1; ipeer_stat.count(peer) == 0) continue; - if (osd->peer_stat[peer].qlen < osd->my_stat.qlen) { - // calculate a probability that we should redirect - float p = (osd->my_stat.qlen - osd->peer_stat[peer].qlen) / osd->my_stat.qlen; // this is dumb. - float v = 1.0 - p; - - dout(10) << "my qlen " << osd->pending_ops << " > my_avg " << osd->my_stat.qlen - << ", peer osd" << peer << " has qlen " << osd->peer_stat[peer].qlen - << ", p=" << p - << ", v= "<< v - << dendl; - - if (v > bestscore) { - shedto = peer; - bestscore = v; - } - } - } - } - break;*/ - - } - - // shed? - if (shedto >= 0) { - dout(10) << "preprocess_op shedding read to peer osd" << shedto - << " " << op->get_reqid() - << dendl; - op->set_peer_stat(osd->my_stat); - osd->messenger->send_message(op, osd->osdmap->get_inst(shedto)); - osd->stat_rd_ops_shed_out++; - osd->logger->inc("shdout"); - return true; - } - } - } // endif balance reads - - - // -- fastpath read? - // if this is a read and the data is in the cache, do an immediate read.. - if ( g_conf.osd_immediate_read_from_cache ) { - if (osd->store->is_cached( oid , - op->get_offset(), - op->get_length() ) == 0) { - if (!is_primary() && !op->get_source().is_osd()) { - // am i allowed? - bool v; - if (osd->store->getattr(oid, "balance-reads", &v, 1) < 0) { - dout(-10) << "preprocess_op in-cache but no balance-reads on " << oid - << ", fwd to primary" << dendl; - osd->messenger->send_message(op, osd->osdmap->get_inst(get_primary())); - return true; - } - } - - // do it now - dout(10) << "preprocess_op data is in cache, reading from cache" << *op << dendl; - do_op(op); - return true; - } - } - - return false; -} - - -/** do_op - do an op - * pg lock will be held (if multithreaded) - * osd_lock NOT held. 
- */ -void ReplicatedPG::do_op(MOSDOp *op) -{ - //dout(15) << "do_op " << *op << dendl; - - osd->logger->inc("op"); - - switch (op->get_op()) { - - // reads - case OSD_OP_READ: - case OSD_OP_STAT: - op_read(op); - break; - - // rep stuff - case OSD_OP_PULL: - op_pull(op); - break; - case OSD_OP_PUSH: - op_push(op); - break; - - // writes - case OSD_OP_WRNOOP: - case OSD_OP_WRITE: - case OSD_OP_ZERO: - case OSD_OP_DELETE: - case OSD_OP_TRUNCATE: - case OSD_OP_WRLOCK: - case OSD_OP_WRUNLOCK: - case OSD_OP_RDLOCK: - case OSD_OP_RDUNLOCK: - case OSD_OP_UPLOCK: - case OSD_OP_DNLOCK: - case OSD_OP_BALANCEREADS: - case OSD_OP_UNBALANCEREADS: - if (op->get_source().is_osd()) { - op_rep_modify(op); - } else { - // go go gadget pg - op_modify(op); - } - break; - - default: - assert(0); - } -} - -void ReplicatedPG::do_op_reply(MOSDOpReply *r) -{ - if (r->get_op() == OSD_OP_PUSH) { - // continue peer recovery - op_push_reply(r); - } else { - // must be replication. - tid_t rep_tid = r->get_rep_tid(); - int fromosd = r->get_source().num(); - - osd->take_peer_stat(fromosd, r->get_peer_stat()); - - if (rep_gather.count(rep_tid)) { - // oh, good. - repop_ack(rep_gather[rep_tid], - r->get_result(), r->get_commit(), - fromosd, - r->get_pg_complete_thru()); - delete r; - } else { - // early ack. - waiting_for_repop[rep_tid].push_back(r); - } - } -} - - - - -// ======================================================================== -// READS - -void ReplicatedPG::op_read(MOSDOp *op) -{ - object_t oid = op->get_oid(); - - dout(10) << "op_read " << MOSDOp::get_opname(op->get_op()) - << " " << oid - << " " << op->get_offset() << "~" << op->get_length() - << dendl; - - // wrlocked? - if (block_if_wrlocked(op)) - return; - - // !primary and unbalanced? 
- // (ignore ops forwarded from the primary) - if (!is_primary()) { - if (op->get_source().is_osd() && - op->get_source().num() == get_primary()) { - // read was shed to me by the primary - int from = op->get_source().num(); - osd->take_peer_stat(from, op->get_peer_stat()); - dout(10) << "read shed IN from " << op->get_source() - << " " << op->get_reqid() - << ", me = " << osd->my_stat.read_latency_mine - << ", them = " << op->get_peer_stat().read_latency - << (osd->my_stat.read_latency_mine > op->get_peer_stat().read_latency ? " WTF":"") - << dendl; - osd->logger->inc("shdin"); - - // does it look like they were wrong to do so? - Mutex::Locker lock(osd->peer_stat_lock); - if (osd->my_stat.read_latency_mine > op->get_peer_stat().read_latency && - osd->my_stat_on_peer[from].read_latency_mine < op->get_peer_stat().read_latency) { - dout(-10) << "read shed IN from " << op->get_source() - << " " << op->get_reqid() - << " and me " << osd->my_stat.read_latency_mine - << " > them " << op->get_peer_stat().read_latency - << ", but they didn't know better, sharing" << dendl; - osd->my_stat_on_peer[from] = osd->my_stat; - osd->messenger->send_message(new MOSDPing(osd->osdmap->get_epoch(), osd->my_stat), - osd->osdmap->get_inst(from)); - } - } else { - // make sure i exist and am balanced, otherwise fw back to acker. - bool b; - if (!osd->store->exists(oid) || - osd->store->getattr(oid, "balance-reads", &b, 1) < 0) { - dout(-10) << "read on replica, object " << oid - << " dne or no balance-reads, fw back to primary" << dendl; - osd->messenger->send_message(op, osd->osdmap->get_inst(get_acker())); - return; - } - } - } - - - // set up reply - MOSDOpReply *reply = new MOSDOpReply(op, 0, osd->osdmap->get_epoch(), true); - long r = 0; - - // do it. - if (oid.rev && !pick_object_rev(oid)) { - // we have no revision for this request. 
- r = -EEXIST; - } else { - switch (op->get_op()) { - case OSD_OP_READ: - { - // read into a buffer - bufferlist bl; - r = osd->store->read(oid, - op->get_offset(), op->get_length(), - bl); - reply->set_data(bl); - reply->set_length(r); - dout(15) << " read got " << r << " / " << op->get_length() << " bytes from obj " << oid << dendl; - } - osd->logger->inc("c_rd"); - osd->logger->inc("c_rdb", op->get_length()); - break; - - case OSD_OP_STAT: - { - struct stat st; - memset(&st, sizeof(st), 0); - r = osd->store->stat(oid, &st); - if (r >= 0) - reply->set_object_size(st.st_size); - } - break; - - default: - assert(0); - } - } - - if (r >= 0) { - reply->set_result(0); - - utime_t now = g_clock.now(); - utime_t diff = now; - diff -= op->get_recv_stamp(); - dout(10) << "op_read " << op->get_reqid() << " total op latency " << diff << dendl; - Mutex::Locker lock(osd->peer_stat_lock); - osd->stat_rd_ops_in_queue--; - osd->read_latency_calc.add(diff); - - if (is_primary() && - g_conf.osd_balance_reads) - stat_object_temp_rd[oid].hit(now); // hit temp. - - } else { - reply->set_result(r); // error - } - - // send it - osd->messenger->send_message(reply, op->get_client_inst()); - - delete op; -} - - - - - - -// ======================================================================== -// MODIFY - -void ReplicatedPG::prepare_log_transaction(ObjectStore::Transaction& t, - MOSDOp *op, eversion_t& version, - objectrev_t crev, objectrev_t rev, - eversion_t trim_to) -{ - const object_t oid = op->get_oid(); - - // clone entry? 
- if (crev && rev && rev > crev) { - eversion_t cv = version; - cv.version--; - Log::Entry cloneentry(PG::Log::Entry::CLONE, oid, cv, op->get_reqid()); - log.add(cloneentry); - - dout(10) << "prepare_log_transaction " << op->get_op() - << " " << cloneentry - << dendl; - } - - // actual op - int opcode = Log::Entry::MODIFY; - if (op->get_op() == OSD_OP_DELETE) opcode = Log::Entry::DELETE; - Log::Entry logentry(opcode, oid, version, op->get_reqid()); - - dout(10) << "prepare_log_transaction " << op->get_op() - << " " << logentry - << dendl; - - // append to log - assert(version > log.top); - log.add(logentry); - assert(log.top == version); - dout(10) << "prepare_log_transaction appended" << dendl; - - // write to pg log on disk - append_log(t, logentry, trim_to); -} - - -/** prepare_op_transaction - * apply an op to the store wrapped in a transaction. - */ -void ReplicatedPG::prepare_op_transaction(ObjectStore::Transaction& t, - MOSDOp *op, eversion_t& version, - objectrev_t crev, objectrev_t rev) -{ - const object_t oid = op->get_oid(); - const pg_t pgid = op->get_pg(); - - bool did_clone = false; - - dout(10) << "prepare_op_transaction " << MOSDOp::get_opname( op->get_op() ) - << " " << oid - << " v " << version - << " crev " << crev - << " rev " << rev - << dendl; - - // WRNOOP does nothing. - if (op->get_op() == OSD_OP_WRNOOP) - return; - - // raise last_complete? - if (info.last_complete == info.last_update) - info.last_complete = version; - - // raise last_update. - assert(version > info.last_update); - info.last_update = version; - - // write pg info - t.collection_setattr(pgid, "info", &info, sizeof(info)); - - // clone? 
- if (crev && rev && rev > crev) { - object_t noid = oid; - noid.rev = rev; - dout(10) << "prepare_op_transaction cloning " << oid << " crev " << crev << " to " << noid << dendl; - t.clone(oid, noid); - did_clone = true; - } - - // apply the op - switch (op->get_op()) { - - // -- locking -- - - case OSD_OP_WRLOCK: - { // lock object - t.setattr(oid, "wrlock", &op->get_client(), sizeof(entity_name_t)); - } - break; - case OSD_OP_WRUNLOCK: - { // unlock objects - t.rmattr(oid, "wrlock"); - } - break; - - case OSD_OP_MININCLOCK: - { - uint32_t mininc = op->get_length(); - t.setattr(oid, "mininclock", &mininc, sizeof(mininc)); - } - break; - - case OSD_OP_BALANCEREADS: - { - bool bal = true; - t.setattr(oid, "balance-reads", &bal, sizeof(bal)); - } - break; - case OSD_OP_UNBALANCEREADS: - { - t.rmattr(oid, "balance-reads"); - } - break; - - - // -- modify -- - - case OSD_OP_WRITE: - { // write - assert(op->get_data().length() == op->get_length()); - bufferlist bl; - bl.claim( op->get_data() ); // give buffers to store; we keep *op in memory for a long time! - - //if (oid < 100000000000000ULL) // hack hack-- don't write client data - t.write( oid, op->get_offset(), op->get_length(), bl ); - } - break; - - case OSD_OP_ZERO: - { - // zero, remove, or truncate? - struct stat st; - int r = osd->store->stat(oid, &st); - if (r >= 0) { - if (op->get_length() == 0 || - op->get_offset() + (off_t)op->get_length() >= (off_t)st.st_size) { - if (op->get_offset()) - t.truncate(oid, op->get_length() + op->get_offset()); - else - t.remove(oid); - } else { - // zero. the dumb way. FIXME. - bufferptr bp(op->get_length()); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - t.write(oid, op->get_offset(), op->get_length(), bl); - } - } else { - // noop? - dout(10) << "apply_transaction zero on " << oid << ", but dne? 
stat returns " << r << dendl; - } - } - break; - - case OSD_OP_TRUNCATE: - { // truncate - t.truncate(oid, op->get_length() ); - } - break; - - case OSD_OP_DELETE: - { // delete - t.remove(oid); - } - break; - - default: - assert(0); - } - - // object collection, version - if (op->get_op() == OSD_OP_DELETE) { - // remove object from c - t.collection_remove(pgid, oid); - } else { - // add object to c - t.collection_add(pgid, oid); - - // object version - t.setattr(oid, "version", &version, sizeof(version)); - - // set object crev - if (crev == 0 || // new object - did_clone) // we cloned - t.setattr(oid, "crev", &rev, sizeof(rev)); - } -} - - - -// ======================================================================== -// rep op gather - -class C_OSD_ModifyCommit : public Context { -public: - ReplicatedPG *pg; - tid_t rep_tid; - eversion_t pg_last_complete; - C_OSD_ModifyCommit(ReplicatedPG *p, tid_t rt, eversion_t lc) : pg(p), rep_tid(rt), pg_last_complete(lc) { - pg->get(); // we're copying the pointer - } - void finish(int r) { - pg->lock(); - if (!pg->is_deleted()) - pg->op_modify_commit(rep_tid, pg_last_complete); - pg->put_unlock(); - } -}; - - -void ReplicatedPG::get_rep_gather(RepGather *repop) -{ - //repop->lock.Lock(); - dout(10) << "get_repop " << *repop << dendl; -} - -void ReplicatedPG::apply_repop(RepGather *repop) -{ - dout(10) << "apply_repop applying update on " << *repop << dendl; - assert(!repop->applied); - - Context *oncommit = new C_OSD_ModifyCommit(this, repop->rep_tid, repop->pg_local_last_complete); - unsigned r = osd->store->apply_transaction(repop->t, oncommit); - if (r) - dout(-10) << "apply_repop apply transaction return " << r << " on " << *repop << dendl; - - // discard my reference to the buffer - repop->op->get_data().clear(); - - repop->applied = true; - - - // any completion stuff to do here? 
- object_t oid = repop->op->get_oid(); - - switch (repop->op->get_op()) { - case OSD_OP_UNBALANCEREADS: - dout(-10) << "apply_repop completed unbalance-reads on " << oid << dendl; - unbalancing_reads.erase(oid); - if (waiting_for_unbalanced_reads.count(oid)) { - osd->take_waiters(waiting_for_unbalanced_reads[oid]); - waiting_for_unbalanced_reads.erase(oid); - } - break; - - case OSD_OP_BALANCEREADS: - dout(-10) << "apply_repop completed balance-reads on " << oid << dendl; - /* - if (waiting_for_balanced_reads.count(oid)) { - osd->take_waiters(waiting_for_balanced_reads[oid]); - waiting_for_balanced_reads.erase(oid); - } - */ - break; - - case OSD_OP_WRUNLOCK: - dout(-10) << "apply_repop completed wrunlock on " << oid << dendl; - if (waiting_for_wr_unlock.count(oid)) { - osd->take_waiters(waiting_for_wr_unlock[oid]); - waiting_for_wr_unlock.erase(oid); - } - break; - } - - -} - -void ReplicatedPG::put_rep_gather(RepGather *repop) -{ - dout(10) << "put_repop " << *repop << dendl; - - // commit? - if (repop->can_send_commit() && - repop->op->wants_commit()) { - // send commit. - if (repop->op->wants_reply()) { - MOSDOpReply *reply = new MOSDOpReply(repop->op, 0, osd->osdmap->get_epoch(), true); - dout(10) << "put_repop sending commit on " << *repop << " " << reply << dendl; - osd->messenger->send_message(reply, repop->op->get_client_inst()); - } - repop->sent_commit = true; - } - - // ack? - else if (repop->can_send_ack() && - repop->op->wants_ack()) { - // apply - apply_repop(repop); - - // send ack - if (repop->op->wants_reply()) { - MOSDOpReply *reply = new MOSDOpReply(repop->op, 0, osd->osdmap->get_epoch(), false); - dout(10) << "put_repop sending ack on " << *repop << " " << reply << dendl; - osd->messenger->send_message(reply, repop->op->get_client_inst()); - } - repop->sent_ack = true; - - utime_t now = g_clock.now(); - now -= repop->start; - osd->logger->finc("rlsum", now); - osd->logger->inc("rlnum", 1); - } - - // done. 
- if (repop->can_delete()) { - // adjust peers_complete_thru - if (!repop->pg_complete_thru.empty()) { - eversion_t min = info.last_complete; // hrm.... - for (unsigned i=0; ipg_complete_thru[acting[i]] < min) // note: if we haven't heard, it'll be zero, which is what we want. - min = repop->pg_complete_thru[acting[i]]; - } - - if (min > peers_complete_thru) { - dout(10) << "put_repop peers_complete_thru " - << peers_complete_thru << " -> " << min - << dendl; - peers_complete_thru = min; - } - } - - dout(10) << "put_repop deleting " << *repop << dendl; - - assert(rep_gather.count(repop->rep_tid)); - rep_gather.erase(repop->rep_tid); - - delete repop->op; - delete repop; - } -} - - -void ReplicatedPG::issue_repop(MOSDOp *op, int dest, utime_t now) -{ - object_t oid = op->get_oid(); - - dout(7) << " issue_repop rep_tid " << op->get_rep_tid() - << " o " << oid - << " to osd" << dest - << dendl; - - // forward the write/update/whatever - MOSDOp *wr = new MOSDOp(op->get_client_inst(), op->get_client_inc(), op->get_reqid().tid, - oid, - ObjectLayout(info.pgid), - osd->osdmap->get_epoch(), - op->get_op()); - wr->get_data() = op->get_data(); // _copy_ bufferlist - wr->set_length(op->get_length()); - wr->set_offset(op->get_offset()); - wr->set_version(op->get_version()); - - wr->set_rep_tid(op->get_rep_tid()); - wr->set_pg_trim_to(peers_complete_thru); - - wr->set_peer_stat(osd->get_my_stat_for(now, dest)); - - osd->messenger->send_message(wr, osd->osdmap->get_inst(dest)); -} - -ReplicatedPG::RepGather *ReplicatedPG::new_rep_gather(MOSDOp *op) -{ - dout(10) << "new_rep_gather rep_tid " << op->get_rep_tid() << " on " << *op << dendl; - int whoami = osd->get_nodeid(); - - RepGather *repop = new RepGather(op, op->get_rep_tid(), - op->get_version(), - info.last_complete); - - // osds. commits all come to me. - for (unsigned i=0; iosds.insert(osd); - repop->waitfor_commit.insert(osd); - } - - // acks vary: - if (g_conf.osd_rep == OSD_REP_CHAIN) { - // chain rep. 
- // there's my local ack... - repop->osds.insert(whoami); - repop->waitfor_ack.insert(whoami); - repop->waitfor_commit.insert(whoami); - - // also, the previous guy will ack to me - int myrank = osd->osdmap->calc_pg_rank(whoami, acting); - if (myrank > 0) { - int osd = acting[ myrank-1 ]; - repop->osds.insert(osd); - repop->waitfor_ack.insert(osd); - repop->waitfor_commit.insert(osd); - } - } else { - // primary, splay. all osds ack to me. - for (unsigned i=0; iwaitfor_ack.insert(osd); - } - } - - repop->start = g_clock.now(); - - rep_gather[ repop->rep_tid ] = repop; - - // anyone waiting? (acks that got here before the op did) - if (waiting_for_repop.count(repop->rep_tid)) { - osd->take_waiters(waiting_for_repop[repop->rep_tid]); - waiting_for_repop.erase(repop->rep_tid); - } - - return repop; -} - - -void ReplicatedPG::repop_ack(RepGather *repop, - int result, bool commit, - int fromosd, eversion_t pg_complete_thru) -{ - MOSDOp *op = repop->op; - - dout(7) << "repop_ack rep_tid " << repop->rep_tid << " op " << *op - << " result " << result << " commit " << commit << " from osd" << fromosd - << dendl; - - get_rep_gather(repop); - { - if (commit) { - // commit - assert(repop->waitfor_commit.count(fromosd)); - repop->waitfor_commit.erase(fromosd); - repop->waitfor_ack.erase(fromosd); - repop->pg_complete_thru[fromosd] = pg_complete_thru; - } else { - // ack - repop->waitfor_ack.erase(fromosd); - } - } - put_rep_gather(repop); -} - - - - - - - - - - - - - - - - - - - - - - - -/** op_modify_commit - * transaction commit on the acker. 
- */ -void ReplicatedPG::op_modify_commit(tid_t rep_tid, eversion_t pg_complete_thru) -{ - if (rep_gather.count(rep_tid)) { - RepGather *repop = rep_gather[rep_tid]; - - dout(10) << "op_modify_commit " << *repop->op << dendl; - get_rep_gather(repop); - { - assert(repop->waitfor_commit.count(osd->get_nodeid())); - repop->waitfor_commit.erase(osd->get_nodeid()); - repop->pg_complete_thru[osd->get_nodeid()] = pg_complete_thru; - } - put_rep_gather(repop); - dout(10) << "op_modify_commit done on " << repop << dendl; - } else { - dout(10) << "op_modify_commit rep_tid " << rep_tid << " dne" << dendl; - } -} - - - -objectrev_t ReplicatedPG::assign_version(MOSDOp *op) -{ - object_t oid = op->get_oid(); - - // check crev - objectrev_t crev = 0; - osd->store->getattr(oid, "crev", (char*)&crev, sizeof(crev)); - - // assign version - eversion_t clone_version; - eversion_t nv = log.top; - if (op->get_op() != OSD_OP_WRNOOP) { - nv.epoch = osd->osdmap->get_epoch(); - nv.version++; - assert(nv > info.last_update); - assert(nv > log.top); - - // will clone? - if (crev && op->get_rev() && op->get_rev() > crev) { - clone_version = nv; - nv.version++; - } - - if (op->get_version().version) { - // replay! - if (nv.version < op->get_version().version) { - nv.version = op->get_version().version; - - // clone? 
- if (crev && op->get_rev() && op->get_rev() > crev) { - // backstep clone - clone_version = nv; - clone_version.version--; - } - } - } - } - - // set version in op, for benefit of client and our eventual reply - op->set_version(nv); - - return crev; -} - - -// commit (to disk) callback -class C_OSD_RepModifyCommit : public Context { -public: - ReplicatedPG *pg; - MOSDOp *op; - int destosd; - - eversion_t pg_last_complete; - - Mutex lock; - Cond cond; - bool acked; - bool waiting; - - C_OSD_RepModifyCommit(ReplicatedPG *p, MOSDOp *oo, int dosd, eversion_t lc) : - pg(p), op(oo), destosd(dosd), pg_last_complete(lc), - acked(false), waiting(false) { - pg->get(); // we're copying the pointer. - } - void finish(int r) { - lock.Lock(); - assert(!waiting); - while (!acked) { - waiting = true; - cond.Wait(lock); - } - assert(acked); - lock.Unlock(); - - pg->lock(); - pg->op_rep_modify_commit(op, destosd, pg_last_complete); - pg->put_unlock(); - } - void ack() { - lock.Lock(); - assert(!acked); - acked = true; - if (waiting) cond.Signal(); - - // discard my reference to buffer - op->get_data().clear(); - - lock.Unlock(); - } -}; - - -void ReplicatedPG::op_modify(MOSDOp *op) -{ - int whoami = osd->get_nodeid(); - object_t oid = op->get_oid(); - const char *opname = MOSDOp::get_opname(op->get_op()); - - // --- locking --- - - // wrlock? - if (op->get_op() != OSD_OP_WRNOOP && // except WRNOOP; we just want to flush - block_if_wrlocked(op)) - return; // op will be handled later, after the object unlocks - - // balance-reads set? 
- char v; - if ((op->get_op() != OSD_OP_BALANCEREADS && op->get_op() != OSD_OP_UNBALANCEREADS) && - (osd->store->getattr(op->get_oid(), "balance-reads", &v, 1) >= 0 || - balancing_reads.count(op->get_oid()))) { - - if (!unbalancing_reads.count(op->get_oid())) { - // unbalance - dout(-10) << "preprocess_op unbalancing-reads on " << op->get_oid() << dendl; - unbalancing_reads.insert(op->get_oid()); - - MOSDOp *pop = new MOSDOp(osd->messenger->get_myinst(), 0, osd->get_tid(), - op->get_oid(), - ObjectLayout(info.pgid), - osd->osdmap->get_epoch(), - OSD_OP_UNBALANCEREADS); - do_op(pop); - } - - // add to wait queue - dout(-10) << "preprocess_op waiting for unbalance-reads on " << op->get_oid() << dendl; - waiting_for_unbalanced_reads[op->get_oid()].push_back(op); - return; - } - - - // dup op? - if (is_dup(op->get_reqid())) { - dout(3) << "op_modify " << opname << " dup op " << op->get_reqid() - << ", doing WRNOOP" << dendl; - op->set_op(OSD_OP_WRNOOP); - opname = MOSDOp::get_opname(op->get_op()); - } - - // assign the op a version - objectrev_t crev = assign_version(op); - eversion_t nv = op->get_version(); - - // are any peers missing this? - for (unsigned i=1; iget_rev() - << " " << op->get_offset() << "~" << op->get_length() - << dendl; - - if (op->get_op() == OSD_OP_WRITE) { - osd->logger->inc("c_wr"); - osd->logger->inc("c_wrb", op->get_length()); - } - - // note my stats - utime_t now = g_clock.now(); - - // issue replica writes - RepGather *repop = 0; - bool alone = (acting.size() == 1); - tid_t rep_tid = osd->get_tid(); - op->set_rep_tid(rep_tid); - - if (g_conf.osd_rep == OSD_REP_CHAIN && !alone) { - // chain rep. send to #2 only. - int next = acting[1]; - if (acting.size() > 2) - next = acting[2]; - issue_repop(op, next, now); - } - else if (g_conf.osd_rep == OSD_REP_SPLAY && !alone) { - // splay rep. send to rest. - for (unsigned i=1; i=1; --i) - issue_repop(op, acting[i], now); - } else { - // primary rep, or alone. 
- repop = new_rep_gather(op); - - // send to rest. - if (!alone) - for (unsigned i=1; iget_op() != OSD_OP_WRNOOP) { - // log and update later. - prepare_log_transaction(repop->t, op, nv, crev, op->get_rev(), peers_complete_thru); - prepare_op_transaction(repop->t, op, nv, crev, op->get_rev()); - } - - // (logical) local ack. - // (if alone, this will apply the update.) - get_rep_gather(repop); - { - assert(repop->waitfor_ack.count(whoami)); - repop->waitfor_ack.erase(whoami); - } - put_rep_gather(repop); - - } else { - // not acker. - // chain or splay. apply. - ObjectStore::Transaction t; - prepare_log_transaction(t, op, nv, crev, op->get_rev(), peers_complete_thru); - prepare_op_transaction(t, op, nv, crev, op->get_rev()); - - C_OSD_RepModifyCommit *oncommit = new C_OSD_RepModifyCommit(this, op, get_acker(), - info.last_complete); - unsigned r = osd->store->apply_transaction(t, oncommit); - if (r != 0 && // no errors - r != 2) { // or error on collection_add - derr(0) << "error applying transaction: r = " << r << dendl; - assert(r == 0); - } - - // lets evict the data from our cache to maintain a total large cache size - if (g_conf.osd_exclusive_caching) - osd->store->trim_from_cache(op->get_oid(), op->get_offset(), op->get_length()); - - oncommit->ack(); - } - -} - - - -// replicated - - - - -void ReplicatedPG::op_rep_modify(MOSDOp *op) -{ - object_t oid = op->get_oid(); - eversion_t nv = op->get_version(); - - const char *opname = MOSDOp::get_opname(op->get_op()); - - // check crev - objectrev_t crev = 0; - osd->store->getattr(oid, "crev", (char*)&crev, sizeof(crev)); - - dout(10) << "op_rep_modify " << opname - << " " << oid - << " v " << nv - << " " << op->get_offset() << "~" << op->get_length() - << dendl; - - // note peer's stat - int fromosd = op->get_source().num(); - osd->take_peer_stat(fromosd, op->get_peer_stat()); - - // we better not be missing this. 
- assert(!missing.is_missing(oid)); - - // prepare our transaction - ObjectStore::Transaction t; - - // am i acker? - RepGather *repop = 0; - int ackerosd = acting[0]; - - if ((g_conf.osd_rep == OSD_REP_CHAIN || g_conf.osd_rep == OSD_REP_SPLAY)) { - ackerosd = get_acker(); - - if (is_acker()) { - // i am tail acker. - if (rep_gather.count(op->get_rep_tid())) { - repop = rep_gather[ op->get_rep_tid() ]; - } else { - repop = new_rep_gather(op); - } - - // infer ack from source - get_rep_gather(repop); - { - //assert(repop->waitfor_ack.count(fromosd)); // no, we may come thru here twice. - repop->waitfor_ack.erase(fromosd); - } - put_rep_gather(repop); - - // prepare dest socket - //messenger->prepare_send_message(op->get_client()); - } - - // chain? forward? - if (g_conf.osd_rep == OSD_REP_CHAIN && !is_acker()) { - // chain rep, not at the tail yet. - int myrank = osd->osdmap->calc_pg_rank(osd->get_nodeid(), acting); - int next = myrank+1; - if (next == (int)acting.size()) - next = 1; - issue_repop(op, acting[next], g_clock.now()); - } - } - - // do op? - C_OSD_RepModifyCommit *oncommit = 0; - - osd->logger->inc("r_wr"); - osd->logger->inc("r_wrb", op->get_length()); - - if (repop) { - // acker. we'll apply later. - if (op->get_op() != OSD_OP_WRNOOP) { - prepare_log_transaction(repop->t, op, nv, crev, op->get_rev(), op->get_pg_trim_to()); - prepare_op_transaction(repop->t, op, nv, crev, op->get_rev()); - } - } else { - // middle|replica. - if (op->get_op() != OSD_OP_WRNOOP) { - prepare_log_transaction(t, op, nv, crev, op->get_rev(), op->get_pg_trim_to()); - prepare_op_transaction(t, op, nv, crev, op->get_rev()); - } - - oncommit = new C_OSD_RepModifyCommit(this, op, ackerosd, info.last_complete); - - // apply log update. and possibly update itself. 
- unsigned tr = osd->store->apply_transaction(t, oncommit); - if (tr != 0 && // no errors - tr != 2) { // or error on collection_add - derr(0) << "error applying transaction: r = " << tr << dendl; - assert(tr == 0); - } - } - - // ack? - if (repop) { - // (logical) local ack. this may induce the actual update. - get_rep_gather(repop); - { - assert(repop->waitfor_ack.count(osd->get_nodeid())); - repop->waitfor_ack.erase(osd->get_nodeid()); - } - put_rep_gather(repop); - } - else { - // send ack to acker? - if (g_conf.osd_rep != OSD_REP_CHAIN) { - MOSDOpReply *ack = new MOSDOpReply(op, 0, osd->osdmap->get_epoch(), false); - ack->set_peer_stat(osd->get_my_stat_for(g_clock.now(), ackerosd)); - osd->messenger->send_message(ack, osd->osdmap->get_inst(ackerosd)); - } - - // ack myself. - assert(oncommit); - oncommit->ack(); - } - -} - - -void ReplicatedPG::op_rep_modify_commit(MOSDOp *op, int ackerosd, eversion_t last_complete) -{ - // send commit. - dout(10) << "rep_modify_commit on op " << *op - << ", sending commit to osd" << ackerosd - << dendl; - if (osd->osdmap->is_up(ackerosd)) { - MOSDOpReply *commit = new MOSDOpReply(op, 0, osd->osdmap->get_epoch(), true); - commit->set_pg_complete_thru(last_complete); - commit->set_peer_stat(osd->get_my_stat_for(g_clock.now(), ackerosd)); - osd->messenger->send_message(commit, osd->osdmap->get_inst(ackerosd)); - delete op; - } -} - - - - - - - - - - -// =========================================================== - -/** pull - request object from a peer - */ -void ReplicatedPG::pull(object_t oid) -{ - assert(missing.loc.count(oid)); - eversion_t v = missing.missing[oid]; - int fromosd = missing.loc[oid]; - - dout(7) << "pull " << oid - << " v " << v - << " from osd" << fromosd - << dendl; - - // send op - tid_t tid = osd->get_tid(); - MOSDOp *op = new MOSDOp(osd->messenger->get_myinst(), 0, tid, - oid, info.pgid, - osd->osdmap->get_epoch(), - OSD_OP_PULL); - op->set_version(v); - osd->messenger->send_message(op, 
osd->osdmap->get_inst(fromosd)); - - // take note - assert(objects_pulling.count(oid) == 0); - num_pulling++; - objects_pulling[oid] = v; -} - - -/** push - send object to a peer - */ -void ReplicatedPG::push(object_t oid, int peer) -{ - // read data+attrs - bufferlist bl; - eversion_t v; - int vlen = sizeof(v); - map attrset; - - ObjectStore::Transaction t; - t.read(oid, 0, 0, &bl); - t.getattr(oid, "version", &v, &vlen); - t.getattrs(oid, attrset); - unsigned tr = osd->store->apply_transaction(t); - - assert(tr == 0); // !!! - - // ok - dout(7) << "push " << oid << " v " << v - << " size " << bl.length() - << " to osd" << peer - << dendl; - - osd->logger->inc("r_push"); - osd->logger->inc("r_pushb", bl.length()); - - // send - MOSDOp *op = new MOSDOp(osd->messenger->get_myinst(), 0, osd->get_tid(), - oid, info.pgid, osd->osdmap->get_epoch(), - OSD_OP_PUSH); - op->set_offset(0); - op->set_length(bl.length()); - op->set_data(bl); // note: claims bl, set length above here! - op->set_version(v); - op->set_attrset(attrset); - - osd->messenger->send_message(op, osd->osdmap->get_inst(peer)); - - if (is_primary()) { - peer_missing[peer].got(oid); - pushing[oid].insert(peer); - } -} - - - -/** op_pull - * process request to pull an entire object. - * NOTE: called from opqueue. - */ -void ReplicatedPG::op_pull(MOSDOp *op) -{ - const object_t oid = op->get_oid(); - const eversion_t v = op->get_version(); - int from = op->get_source().num(); - - dout(7) << "op_pull " << oid << " v " << op->get_version() - << " from " << op->get_source() - << dendl; - - // is a replica asking? are they missing it? - if (is_primary()) { - // primary - assert(peer_missing.count(from)); // we had better know this, from the peering process. - - if (!peer_missing[from].is_missing(oid)) { - dout(7) << "op_pull replica isn't actually missing it, we must have already pushed to them" << dendl; - delete op; - return; - } - - // do we have it yet? 
- if (is_missing_object(oid)) { - wait_for_missing_object(oid, op); - return; - } - } else { - // non-primary - if (missing.is_missing(oid)) { - dout(7) << "op_pull not primary, and missing " << oid << ", ignoring" << dendl; - delete op; - return; - } - } - - // push it back! - push(oid, op->get_source().num()); -} - - -/** op_push - * NOTE: called from opqueue. - */ -void ReplicatedPG::op_push(MOSDOp *op) -{ - object_t oid = op->get_oid(); - eversion_t v = op->get_version(); - - if (!is_missing_object(oid)) { - dout(7) << "op_push not missing " << oid << dendl; - return; - } - - dout(7) << "op_push " - << oid - << " v " << v - << " size " << op->get_length() << " " << op->get_data().length() - << dendl; - - assert(op->get_data().length() == op->get_length()); - - // write object and add it to the PG - ObjectStore::Transaction t; - t.remove(oid); // in case old version exists - t.write(oid, 0, op->get_length(), op->get_data()); - t.setattrs(oid, op->get_attrset()); - t.collection_add(info.pgid, oid); - - // close out pull op? - num_pulling--; - if (objects_pulling.count(oid)) - objects_pulling.erase(oid); - missing.got(oid, v); - - - // raise last_complete? - assert(log.complete_to != log.log.end()); - while (log.complete_to != log.log.end()) { - if (missing.missing.count(log.complete_to->oid)) break; - if (info.last_complete < log.complete_to->version) - info.last_complete = log.complete_to->version; - log.complete_to++; - } - dout(10) << "last_complete now " << info.last_complete << dendl; - - - // apply to disk! - t.collection_setattr(info.pgid, "info", &info, sizeof(info)); - unsigned r = osd->store->apply_transaction(t); - assert(r == 0); - - - - // am i primary? are others missing this too? - if (is_primary()) { - for (unsigned i=1; itake_waiters(waiting_for_missing_object[oid]); - waiting_for_missing_object.erase(oid); - } - - if (is_primary()) { - // continue recovery - do_recovery(); - } else { - // ack if i'm a replica and being pushed to. 
- MOSDOpReply *reply = new MOSDOpReply(op, 0, osd->osdmap->get_epoch(), true); - osd->messenger->send_message(reply, op->get_source_inst()); - } - - delete op; -} - - - - - - -void ReplicatedPG::note_failed_osd(int o) -{ - dout(10) << "note_failed_osd " << o << dendl; - // do async; repop_ack() may modify pg->repop_gather - list ls; - for (hash_map::iterator p = rep_gather.begin(); - p != rep_gather.end(); - p++) { - //dout(-1) << "checking repop tid " << p->first << dendl; - if (p->second->waitfor_ack.count(o) || - p->second->waitfor_commit.count(o)) - ls.push_back(p->second); - } - for (list::iterator p = ls.begin(); - p != ls.end(); - p++) - repop_ack(*p, -1, true, o); -} - - -void ReplicatedPG::on_acker_change() -{ - dout(10) << "on_acker_change" << dendl; - - if (g_conf.osd_rep == OSD_REP_PRIMARY) { - // we're fine. - // note that note_failed_osd() above shoudl ahve implicitly acked/committed - // from the failed guy. - } else { - // for splay or chain replication, any change is significant. - // apply repops - for (hash_map::iterator p = rep_gather.begin(); - p != rep_gather.end(); - p++) { - if (!p->second->applied) - apply_repop(p->second); - delete p->second->op; - delete p->second; - } - rep_gather.clear(); - - // and repop waiters - for (hash_map >::iterator p = waiting_for_repop.begin(); - p != waiting_for_repop.end(); - p++) - for (list::iterator pm = p->second.begin(); - pm != p->second.end(); - pm++) - delete *pm; - waiting_for_repop.clear(); - } -} - - -void ReplicatedPG::on_role_change() -{ - dout(10) << "on_role_change" << dendl; - - // take object waiters - for (hash_map >::iterator it = waiting_for_missing_object.begin(); - it != waiting_for_missing_object.end(); - it++) - osd->take_waiters(it->second); - waiting_for_missing_object.clear(); -} - - - - - - - - - -/** clean_up_local - * remove any objects that we're storing but shouldn't. - * as determined by log. 
- */ -void ReplicatedPG::clean_up_local(ObjectStore::Transaction& t) -{ - dout(10) << "clean_up_local" << dendl; - - assert(info.last_update >= log.bottom); // otherwise we need some help! - - if (log.backlog) { - // be thorough. - list ls; - osd->store->collection_list(info.pgid, ls); - set s; - - for (list::iterator i = ls.begin(); - i != ls.end(); - i++) - s.insert(*i); - - set did; - for (list::reverse_iterator p = log.log.rbegin(); - p != log.log.rend(); - p++) { - if (did.count(p->oid)) continue; - did.insert(p->oid); - - if (p->is_delete()) { - if (s.count(p->oid)) { - dout(10) << " deleting " << p->oid - << " when " << p->version << dendl; - t.remove(p->oid); - } - s.erase(p->oid); - } else { - // just leave old objects.. they're missing or whatever - s.erase(p->oid); - } - } - - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - dout(10) << " deleting stray " << *i << dendl; - t.remove(*i); - } - - } else { - // just scan the log. - set did; - for (list::reverse_iterator p = log.log.rbegin(); - p != log.log.rend(); - p++) { - if (did.count(p->oid)) continue; - did.insert(p->oid); - - if (p->is_delete()) { - dout(10) << " deleting " << p->oid - << " when " << p->version << dendl; - t.remove(p->oid); - } else { - // keep old(+missing) objects, just for kicks. - } - } - } -} - - - -void ReplicatedPG::cancel_recovery() -{ - // forget about where missing items are, or anything we're pulling - missing.loc.clear(); - osd->num_pulling -= objects_pulling.size(); - objects_pulling.clear(); - num_pulling = 0; - pushing.clear(); -} - -/** - * do one recovery op. - * return true if done, false if nothing left to do. - */ -bool ReplicatedPG::do_recovery() -{ - assert(is_primary()); - /*if (!is_primary()) { - dout(10) << "do_recovery not primary, doing nothing" << dendl; - return true; - } - */ - - if (info.is_uptodate()) { // am i up to date? 
- if (!is_all_uptodate()) { - dout(-10) << "do_recovery i'm clean but replicas aren't, starting peer recovery" << dendl; - do_peer_recovery(); - } else { - dout(-10) << "do_recovery all clean, nothing to do" << dendl; - } - return true; - } - - dout(-10) << "do_recovery pulling " << objects_pulling.size() << " in pg, " - << osd->num_pulling << "/" << g_conf.osd_max_pull << " total" - << dendl; - dout(10) << "do_recovery " << missing << dendl; - - // can we slow down on this PG? - if (osd->num_pulling >= g_conf.osd_max_pull && !objects_pulling.empty()) { - dout(-10) << "do_recovery already pulling max, waiting" << dendl; - return true; - } - - // look at log! - Log::Entry *latest = 0; - - while (log.requested_to != log.log.end()) { - assert(log.objects.count(log.requested_to->oid)); - latest = log.objects[log.requested_to->oid]; - assert(latest); - - dout(10) << "do_recovery " - << *log.requested_to - << (objects_pulling.count(latest->oid) ? " (pulling)":"") - << dendl; - - if (latest->is_update() && - !objects_pulling.count(latest->oid) && - missing.is_missing(latest->oid)) { - pull(latest->oid); - return true; - } - - log.requested_to++; - } - - if (!objects_pulling.empty()) { - dout(7) << "do_recovery requested everything, still waiting" << dendl; - return false; - } - - // done? - assert(missing.num_missing() == 0); - assert(info.last_complete == info.last_update); - - if (is_primary()) { - // i am primary - dout(-7) << "do_recovery complete, cleaning strays" << dendl; - uptodate_set.insert(osd->whoami); - if (is_all_uptodate()) - finish_recovery(); - } else { - // tell primary - dout(7) << "do_recovery complete, telling primary" << dendl; - list ls; - ls.push_back(info); - osd->messenger->send_message(new MOSDPGNotify(osd->osdmap->get_epoch(), - ls), - osd->osdmap->get_inst(get_primary())); - } - - return false; -} - -void ReplicatedPG::do_peer_recovery() -{ - dout(-10) << "do_peer_recovery" << dendl; - - // this is FAR from an optimal recovery order. 
pretty lame, really. - for (unsigned i=0; isecond; - eversion_t v = peer_missing[peer].rmissing.begin()->first; - - push(oid, peer); - - // do other peers need it too? - for (i++; iget_source() << " " << *reply << dendl; - - int peer = reply->get_source().num(); - object_t oid = reply->get_oid(); - - if (pushing.count(oid) && - pushing[oid].count(peer)) { - pushing[oid].erase(peer); - - if (peer_missing.count(peer) == 0 || - peer_missing[peer].num_missing() == 0) - uptodate_set.insert(peer); - - if (pushing[oid].empty()) { - dout(10) << "pushed " << oid << " to all replicas" << dendl; - do_peer_recovery(); - } else { - dout(10) << "pushed " << oid << ", still waiting for push ack from " - << pushing[oid] << dendl; - } - } else { - dout(10) << "huh, i wasn't pushing " << oid << dendl; - } - delete reply; -} - -void ReplicatedPG::purge_strays() -{ - dout(10) << "purge_strays " << stray_set << dendl; - - for (set::iterator p = stray_set.begin(); - p != stray_set.end(); - p++) { - dout(10) << "sending PGRemove to osd" << *p << dendl; - set ls; - ls.insert(info.pgid); - MOSDPGRemove *m = new MOSDPGRemove(osd->osdmap->get_epoch(), ls); - osd->messenger->send_message(m, osd->osdmap->get_inst(*p)); - } - - stray_set.clear(); -} - diff --git a/branches/sage/ebofs2/osd/ReplicatedPG.h b/branches/sage/ebofs2/osd/ReplicatedPG.h deleted file mode 100644 index ab44026b43fb2..0000000000000 --- a/branches/sage/ebofs2/osd/ReplicatedPG.h +++ /dev/null @@ -1,170 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __REPLICATEDPG_H -#define __REPLICATEDPG_H - - -#include "PG.h" - -#include "messages/MOSDOp.h" - - -class ReplicatedPG : public PG { -public: - /* - * gather state on the primary/head while replicating an osd op. - */ - class RepGather { - public: - class MOSDOp *op; - tid_t rep_tid; - - ObjectStore::Transaction t; - bool applied; - - set waitfor_ack; - set waitfor_commit; - - utime_t start; - - bool sent_ack, sent_commit; - - set osds; - eversion_t new_version; - - eversion_t pg_local_last_complete; - map pg_complete_thru; - - RepGather(MOSDOp *o, tid_t rt, eversion_t nv, eversion_t lc) : - op(o), rep_tid(rt), - applied(false), - sent_ack(false), sent_commit(false), - new_version(nv), - pg_local_last_complete(lc) { } - - bool can_send_ack() { - return !sent_ack && !sent_commit && - waitfor_ack.empty(); - } - bool can_send_commit() { - return !sent_commit && - waitfor_ack.empty() && waitfor_commit.empty(); - } - bool can_delete() { - return waitfor_ack.empty() && waitfor_commit.empty(); - } - }; - -protected: - // replica ops - // [primary|tail] - hash_map rep_gather; - hash_map > waiting_for_repop; - - // load balancing - set balancing_reads; - set unbalancing_reads; - hash_map > waiting_for_unbalanced_reads; // i.e. 
primary-lock - - void get_rep_gather(RepGather*); - void apply_repop(RepGather *repop); - void put_rep_gather(RepGather*); - void issue_repop(MOSDOp *op, int osd, utime_t now); - RepGather *new_rep_gather(MOSDOp *op); - void repop_ack(RepGather *repop, - int result, bool commit, - int fromosd, eversion_t pg_complete_thru=0); - - // push/pull - int num_pulling; - map > pushing; - - void push(object_t oid, int dest); - void pull(object_t oid); - - // modify - objectrev_t assign_version(MOSDOp *op); - void op_modify_commit(tid_t rep_tid, eversion_t pg_complete_thru); - void op_rep_modify_commit(MOSDOp *op, int ackerosd, eversion_t last_complete); - - void prepare_log_transaction(ObjectStore::Transaction& t, - MOSDOp *op, eversion_t& version, - objectrev_t crev, objectrev_t rev, - eversion_t trim_to); - void prepare_op_transaction(ObjectStore::Transaction& t, - MOSDOp *op, eversion_t& version, - objectrev_t crev, objectrev_t rev); - - friend class C_OSD_ModifyCommit; - friend class C_OSD_RepModifyCommit; - - - // pg on-disk content - void clean_up_local(ObjectStore::Transaction& t); - - void cancel_recovery(); - bool do_recovery(); - void do_peer_recovery(); - - void purge_strays(); - - - void op_read(MOSDOp *op); - void op_modify(MOSDOp *op); - void op_rep_modify(MOSDOp *op); - void op_push(MOSDOp *op); - void op_pull(MOSDOp *op); - - void op_push_reply(MOSDOpReply *reply); - - -public: - ReplicatedPG(OSD *o, pg_t p) : - PG(o,p), - num_pulling(0) - { } - ~ReplicatedPG() {} - - bool preprocess_op(MOSDOp *op, utime_t now); - void do_op(MOSDOp *op); - void do_op_reply(MOSDOpReply *r); - - bool same_for_read_since(epoch_t e); - bool same_for_modify_since(epoch_t e); - bool same_for_rep_modify_since(epoch_t e); - - bool is_missing_object(object_t oid); - void wait_for_missing_object(object_t oid, MOSDOp *op); - - void note_failed_osd(int o); - void on_acker_change(); - void on_role_change(); - -}; - - -inline ostream& operator<<(ostream& out, ReplicatedPG::RepGather& 
repop) -{ - out << "repgather(" << &repop << " rep_tid=" << repop.rep_tid - << " wfack=" << repop.waitfor_ack - << " wfcommit=" << repop.waitfor_commit; - out << " pct=" << repop.pg_complete_thru; - out << " op=" << *(repop.op); - out << " repop=" << &repop; - out << ")"; - return out; -} - - -#endif diff --git a/branches/sage/ebofs2/osd/osd_types.h b/branches/sage/ebofs2/osd/osd_types.h deleted file mode 100644 index 0ae9d0831b0d7..0000000000000 --- a/branches/sage/ebofs2/osd/osd_types.h +++ /dev/null @@ -1,321 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __OSD_TYPES_H -#define __OSD_TYPES_H - -#include "msg/msg_types.h" -#include "include/types.h" - -/* osdreqid_t - caller name + incarnation# + tid to unique identify this request - * use for metadata and osd ops. - */ -class osdreqid_t { -public: - entity_name_t name; // who - int32_t inc; // incarnation - tid_t tid; - osdreqid_t() : inc(0), tid(0) {} - osdreqid_t(const entity_name_t& a, int i, tid_t t) : name(a), inc(i), tid(t) {} -}; - -inline ostream& operator<<(ostream& out, const osdreqid_t& r) { - return out << r.name << "." 
<< r.inc << ":" << r.tid; -} - -inline bool operator==(const osdreqid_t& l, const osdreqid_t& r) { - return (l.name == r.name) && (l.inc == r.inc) && (l.tid == r.tid); -} -inline bool operator!=(const osdreqid_t& l, const osdreqid_t& r) { - return (l.name != r.name) || (l.inc != r.inc) || (l.tid != r.tid); -} -inline bool operator<(const osdreqid_t& l, const osdreqid_t& r) { - return (l.name < r.name) || (l.inc < r.inc) || - (l.name == r.name && l.inc == r.inc && l.tid < r.tid); -} -inline bool operator<=(const osdreqid_t& l, const osdreqid_t& r) { - return (l.name < r.name) || (l.inc < r.inc) || - (l.name == r.name && l.inc == r.inc && l.tid <= r.tid); -} -inline bool operator>(const osdreqid_t& l, const osdreqid_t& r) { return !(l <= r); } -inline bool operator>=(const osdreqid_t& l, const osdreqid_t& r) { return !(l < r); } - -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const osdreqid_t &r) const { - static blobhash H; - return H((const char*)&r, sizeof(r)); - } - }; -} - - - -// osd types -typedef uint64_t coll_t; // collection id - -// pg stuff - -#define PG_INO 4 // this should match mds/mdstypes.h MDS_INO_PG - -typedef uint16_t ps_t; -typedef uint8_t pruleset_t; - - -// crush rule ids -#define CRUSH_REP_RULE(nrep) (nrep) // replication -#define CRUSH_RAID_RULE(num) (10+num) // raid - - - -// placement group id -struct pg_t { -public: - static const int TYPE_REP = CEPH_PG_TYPE_REP; - static const int TYPE_RAID4 = CEPH_PG_TYPE_RAID4; - -private: - union ceph_pg u; - -public: - pg_t() { u.pg64 = 0; } - pg_t(const pg_t& o) { u.pg64 = o.u.pg64; } - pg_t(int type, int size, ps_t seed, int pref) {//, pruleset_t r=0) { - u.pg.type = type; - u.pg.size = size; - u.pg.ps = seed; - u.pg.preferred = pref; // hack: avoid negative. 
- //u.pg.ruleset = r; - assert(sizeof(u.pg) == sizeof(u.pg64)); - } - pg_t(uint64_t v) { u.pg64 = v; } - - int type() { return u.pg.type; } - bool is_rep() { return type() == TYPE_REP; } - bool is_raid4() { return type() == TYPE_RAID4; } - - int size() { return u.pg.size; } - ps_t ps() { return u.pg.ps; } - //pruleset_t ruleset() { return u.pg.ruleset; } - int preferred() { return u.pg.preferred; } // hack: avoid negative. - - /* - pg_t operator=(uint64_t v) { u.val = v; return *this; } - pg_t operator&=(uint64_t v) { u.val &= v; return *this; } - pg_t operator+=(pg_t o) { u.val += o.val; return *this; } - pg_t operator-=(pg_t o) { u.val -= o.val; return *this; } - pg_t operator++() { ++u.val; return *this; } - */ - operator uint64_t() const { return u.pg64; } - - object_t to_object() const { return object_t(PG_INO, u.pg64 >> 32, u.pg64 & 0xffffffff); } -}; - -inline ostream& operator<<(ostream& out, pg_t pg) -{ - if (pg.is_rep()) - out << pg.size() << 'x'; - else if (pg.is_raid4()) - out << pg.size() << 'r'; - else - out << pg.size() << '?'; - - //if (pg.ruleset()) - //out << (int)pg.ruleset() << 's'; - - out << hex << pg.ps() << dec; - - if (pg.preferred() >= 0) - out << 'p' << pg.preferred(); - - //out << "=" << hex << (__uint64_t)pg << dec; - return out; -} - -namespace __gnu_cxx { - template<> struct hash< pg_t > - { - size_t operator()( const pg_t& x ) const - { - static rjhash H; - return H(x); - } - }; -} - - - - - -/** ObjectLayout - * - * describes an object's placement and layout in the storage cluster. - * most importatly, which pg it belongs to. - * if that pg is raided, it also specifies the object's stripe_unit. 
- */ -struct ObjectLayout { - pg_t pgid; // what pg do i belong to - int32_t stripe_unit; // for object raid in raid pgs - - ObjectLayout() : pgid(0), stripe_unit(0) { } - ObjectLayout(pg_t p, int su=0) : pgid(p), stripe_unit(su) { } -}; - -inline ostream& operator<<(ostream& out, const ObjectLayout &ol) -{ - out << "pg" << ol.pgid; - if (ol.stripe_unit) - out << ".su=" << ol.stripe_unit; - return out; -} - - - -// compound rados version type -class eversion_t { -public: - epoch_t epoch; - version_t version; - eversion_t(epoch_t e=0, version_t v=0) : epoch(e), version(v) {} -}; - -inline bool operator==(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) && (l.version == r.version); -} -inline bool operator!=(const eversion_t& l, const eversion_t& r) { - return (l.epoch != r.epoch) || (l.version != r.version); -} -inline bool operator<(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version < r.version):(l.epoch < r.epoch); -} -inline bool operator<=(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version <= r.version):(l.epoch <= r.epoch); -} -inline bool operator>(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version > r.version):(l.epoch > r.epoch); -} -inline bool operator>=(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version >= r.version):(l.epoch >= r.epoch); -} -inline ostream& operator<<(ostream& out, const eversion_t e) { - return out << e.epoch << "'" << e.version; -} - - - -/** osd_stat - * aggregate stats for an osd - */ -struct osd_stat_t { - int64_t num_blocks; - int64_t num_blocks_avail; - int64_t num_objects; - - osd_stat_t() : num_blocks(0), num_blocks_avail(0), num_objects(0) {} -}; - - -/** pg_stat - * aggregate stats for a single PG. 
- */ -struct pg_stat_t { - eversion_t reported; - - int32_t state; - int64_t size; // in bytes - int64_t num_blocks; // in 4k blocks - int64_t num_objects; - - pg_stat_t() : reported(0), state(0), size(0), num_blocks(0), num_objects(0) {} -}; - - - -struct osd_peer_stat_t { - utime_t stamp; - double oprate; - double qlen; - double recent_qlen; - double read_latency; - double read_latency_mine; - double frac_rd_ops_shed_in; - double frac_rd_ops_shed_out; - osd_peer_stat_t() : oprate(0), qlen(0), recent_qlen(0), - read_latency(0), read_latency_mine(0), - frac_rd_ops_shed_in(0), frac_rd_ops_shed_out(0) {} -}; - -inline ostream& operator<<(ostream& out, const osd_peer_stat_t &stat) { - return out << "stat(" << stat.stamp - //<< " oprate=" << stat.oprate - // << " qlen=" << stat.qlen - // << " recent_qlen=" << stat.recent_qlen - << " rdlat=" << stat.read_latency_mine << " / " << stat.read_latency - << " fshedin=" << stat.frac_rd_ops_shed_in - << ")"; -} - -// ----------------------------------------- - -class ObjectExtent { - public: - object_t oid; // object id - off_t start; // in object - size_t length; // in object - - ObjectLayout layout; // object layout (pgid, etc.) - - map buffer_extents; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!) - - ObjectExtent() : start(0), length(0) {} - ObjectExtent(object_t o, off_t s=0, size_t l=0) : oid(o), start(s), length(l) { } -}; - -inline ostream& operator<<(ostream& out, ObjectExtent &ex) -{ - return out << "extent(" - << ex.oid << " in " << ex.layout - << " " << ex.start << "~" << ex.length - << ")"; -} - - - -// --------------------------------------- - -class OSDSuperblock { -public: - const static uint64_t MAGIC = 0xeb0f505dULL; - uint64_t magic; - uint64_t fsid; // unique fs id (random number) - int32_t whoami; // my role in this fs. - epoch_t current_epoch; // most recent epoch - epoch_t oldest_map, newest_map; // oldest/newest maps we have. 
- double weight; - OSDSuperblock(uint64_t f=0, int w=0) : - magic(MAGIC), fsid(f), whoami(w), - current_epoch(0), oldest_map(0), newest_map(0), weight(0) {} -}; - -inline ostream& operator<<(ostream& out, OSDSuperblock& sb) -{ - return out << "sb(fsid " << sb.fsid - << " osd" << sb.whoami - << " e" << sb.current_epoch - << " [" << sb.oldest_map << "," << sb.newest_map - << "])"; -} - - -#endif diff --git a/branches/sage/ebofs2/osdc/Blinker.h b/branches/sage/ebofs2/osdc/Blinker.h deleted file mode 100644 index e59c9629725ce..0000000000000 --- a/branches/sage/ebofs2/osdc/Blinker.h +++ /dev/null @@ -1,92 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __BLINKER_H -#define __BLINKER_H - -class Blinker { - - public: - - class Op { - int op; - static const int LOOKUP = 1; - static const int INSERT = 2; - static const int REMOVE = 3; - static const int CLEAR = 4; - Op(int o) : op(o) {} - }; - - class OpLookup : public Op { - public: - bufferptr key; - OpLookup(bufferptr& k) : Op(Op::LOOKUP), key(k) {} - }; - - class OpInsert : public Op { - bufferptr key; - bufferlist val; - OpInsert(bufferptr& k, bufferlist& v) : Op(Op::INSERT), key(k), val(v) {} - }; - - class OpRemove : public Op { - public: - bufferptr key; - OpRemove(bufferptr& k) : Op(Op::REMOVE), key(k) {} - }; - - class OpClear : public Op { - public: - OpClear() : Op(Op::CLEAR) {} - }; - - - -private: - Objecter *objecter; - - // in-flight operations. - - - // cache information about tree structure. 
- - - -public: - // public interface - - // simple accessors - void lookup(inode_t& inode, bufferptr& key, bufferlist *pval, Context *onfinish); - - // simple modifiers - void insert(inode_t& inode, bufferptr& key, bufferlist& val, Context *onack, Context *onsafe); - void remove(inode_t& inode, bufferptr& key, Context *onack, Context *onsafe); - void clear(inode_t& inode, Context *onack, Context *onsafe); - - // these are dangerous: the table may be large. - void listkeys(inode_t& inode, list* pkeys, Context *onfinish); - void listvals(inode_t& inode, list* pkeys, list* pvals, Context *onfinish); - - // fetch *at least* key, but also anything else that is convenient. - // include lexical bounds for which this is a complete result. - // (if *start and *end are empty, it's the entire table) - void prefetch(inode_t& inode, bufferptr& key, - list* pkeys, list* pvals, - bufferptr *start, bufferptr *end, - Context *onfinish); - - -}; - -#endif diff --git a/branches/sage/ebofs2/osdc/Filer.cc b/branches/sage/ebofs2/osdc/Filer.cc deleted file mode 100644 index 193089d3915b1..0000000000000 --- a/branches/sage/ebofs2/osdc/Filer.cc +++ /dev/null @@ -1,235 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#include - -#include "Filer.h" -#include "osd/OSDMap.h" - -//#include "messages/MOSDRead.h" -//#include "messages/MOSDReadReply.h" -//#include "messages/MOSDWrite.h" -//#include "messages/MOSDWriteReply.h" -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" -#include "messages/MOSDMap.h" - -#include "msg/Messenger.h" - -#include "include/Context.h" - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_filer) *_dout << dbeginl << g_clock.now() << " " << objecter->messenger->get_myname() << ".filer " - - -class Filer::C_Probe : public Context { -public: - Filer *filer; - Probe *probe; - object_t oid; - off_t size; - C_Probe(Filer *f, Probe *p, object_t o) : filer(f), probe(p), oid(o), size(0) {} - void finish(int r) { - filer->_probed(probe, oid, size); - } -}; - -int Filer::probe_fwd(inode_t& inode, - off_t start_from, - off_t *end, - Context *onfinish) -{ - dout(10) << "probe_fwd " << hex << inode.ino << dec << " starting from " << start_from << dendl; - - Probe *probe = new Probe(inode, start_from, end, onfinish); - - // period (bytes before we jump unto a new set of object(s)) - off_t period = ceph_file_layout_period(inode.layout); - - // start with 1+ periods. 
- probe->probing_len = period; - if (start_from % period) - probe->probing_len += period - (start_from % period); - - _probe(probe); - return 0; -} - -void Filer::_probe(Probe *probe) -{ - dout(10) << "_probe " << hex << probe->inode.ino << dec << " " << probe->from << "~" << probe->probing_len << dendl; - - // map range onto objects - file_to_extents(probe->inode, probe->from, probe->probing_len, probe->probing); - - for (list::iterator p = probe->probing.begin(); - p != probe->probing.end(); - p++) { - dout(10) << "_probe probing " << p->oid << dendl; - C_Probe *c = new C_Probe(this, probe, p->oid); - probe->ops[p->oid] = objecter->stat(p->oid, &c->size, p->layout, c); - } -} - -void Filer::_probed(Probe *probe, object_t oid, off_t size) -{ - dout(10) << "_probed " << probe->inode.ino << " object " << hex << oid << dec << " has size " << size << dendl; - - probe->known[oid] = size; - assert(probe->ops.count(oid)); - probe->ops.erase(oid); - - if (!probe->ops.empty()) - return; // waiting for more! - - // analyze! - off_t end = 0; - for (list::iterator p = probe->probing.begin(); - p != probe->probing.end(); - p++) { - off_t shouldbe = p->length+p->start; - dout(10) << "_probed " << probe->inode.ino << " object " << hex << p->oid << dec - << " should be " << shouldbe - << ", actual is " << probe->known[p->oid] - << dendl; - - if (probe->known[p->oid] < 0) { end = -1; break; } // error! - - assert(probe->known[p->oid] <= shouldbe); - if (shouldbe == probe->known[p->oid]) continue; // keep going - - // aha, we found the end! - // calc offset into buffer_extent to get distance from probe->from. 
- off_t oleft = probe->known[p->oid] - p->start; - for (map::iterator i = p->buffer_extents.begin(); - i != p->buffer_extents.end(); - i++) { - if (oleft <= (off_t)i->second) { - end = probe->from + i->first + oleft; - dout(10) << "_probed end is in buffer_extent " << i->first << "~" << i->second << " off " << oleft - << ", from was " << probe->from << ", end is " << end - << dendl; - break; - } - oleft -= i->second; - } - break; - } - - if (end == 0) { - // keep probing! - dout(10) << "_probed didn't find end, probing further" << dendl; - off_t period = probe->inode.layout.fl_object_size * probe->inode.layout.fl_stripe_count; - probe->from += probe->probing_len; - probe->probing_len = period; - _probe(probe); - return; - } - - if (end < 0) { - dout(10) << "_probed encountered an error while probing" << dendl; - *probe->end = -1; - } else { - // hooray! - dout(10) << "_probed found end at " << end << dendl; - *probe->end = end; - } - - // done! finish and clean up. - probe->onfinish->finish(end > 0 ? 0:-1); - delete probe->onfinish; - delete probe; -} - - -void Filer::file_to_extents(inode_t inode, - off_t offset, size_t len, - list& extents, - objectrev_t rev) -{ - dout(10) << "file_to_extents " << offset << "~" << len - << " on " << hex << inode.ino << dec - << dendl; - - /* we want only one extent per object! - * this means that each extent we read may map into different bits of the - * final read buffer.. 
hence OSDExtent.buffer_extents - */ - map< object_t, ObjectExtent > object_extents; - - assert(inode.layout.fl_object_size >= inode.layout.fl_stripe_unit); - off_t stripes_per_object = inode.layout.fl_object_size / inode.layout.fl_stripe_unit; - dout(20) << " stripes_per_object " << stripes_per_object << dendl; - - off_t cur = offset; - off_t left = len; - while (left > 0) { - // layout into objects - off_t blockno = cur / inode.layout.fl_stripe_unit; // which block - off_t stripeno = blockno / inode.layout.fl_stripe_count; // which horizontal stripe (Y) - off_t stripepos = blockno % inode.layout.fl_stripe_count; // which object in the object set (X) - off_t objectsetno = stripeno / stripes_per_object; // which object set - off_t objectno = objectsetno * inode.layout.fl_stripe_count + stripepos; // object id - - // find oid, extent - ObjectExtent *ex = 0; - object_t oid( inode.ino, objectno, rev ); - if (object_extents.count(oid)) - ex = &object_extents[oid]; - else { - ex = &object_extents[oid]; - ex->oid = oid; - ex->layout = objecter->osdmap->file_to_object_layout( oid, inode.layout ); - } - - // map range into object - off_t block_start = (stripeno % stripes_per_object)*inode.layout.fl_stripe_unit; - off_t block_off = cur % inode.layout.fl_stripe_unit; - off_t max = inode.layout.fl_stripe_unit - block_off; - - off_t x_offset = block_start + block_off; - off_t x_len; - if (left > max) - x_len = max; - else - x_len = left; - - if (ex->start + (off_t)ex->length == x_offset) { - // add to extent - ex->length += x_len; - } else { - // new extent - assert(ex->length == 0); - assert(ex->start == 0); - ex->start = x_offset; - ex->length = x_len; - } - ex->buffer_extents[cur-offset] = x_len; - - dout(15) << "file_to_extents " << *ex << " in " << ex->layout << dendl; - //dout(0) << "map: ino " << ino << " oid " << ex.oid << " osd " << ex.osd << " offset " << ex.offset << " len " << ex.len << " ... 
left " << left << dendl; - - left -= x_len; - cur += x_len; - } - - // make final list - for (map::iterator it = object_extents.begin(); - it != object_extents.end(); - it++) { - extents.push_back(it->second); - } -} diff --git a/branches/sage/ebofs2/osdc/Filer.h b/branches/sage/ebofs2/osdc/Filer.h deleted file mode 100644 index 0679a9b6ffef3..0000000000000 --- a/branches/sage/ebofs2/osdc/Filer.h +++ /dev/null @@ -1,165 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __FILER_H -#define __FILER_H - -/*** Filer - * - * stripe file ranges onto objects. - * build list for the objecter or objectcacher. - * - * also, provide convenience methods that call objecter for you. - * - * "files" are identified by ino. 
- */ - -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "include/types.h" - -#include "osd/OSDMap.h" -#include "Objecter.h" - -class Context; -class Messenger; -class OSDMap; - - -/**** Filer interface ***/ - -class Filer { - Objecter *objecter; - - // probes - struct Probe { - inode_t inode; - off_t from; - off_t *end; - Context *onfinish; - - list probing; - off_t probing_len; - - map known; - map ops; - - Probe(inode_t &i, off_t f, off_t *e, Context *c) : - inode(i), from(f), end(e), onfinish(c), probing_len(0) {} - }; - - class C_Probe; - //friend class C_Probe; - - void _probe(Probe *p); - void _probed(Probe *p, object_t oid, off_t size); - - public: - Filer(Objecter *o) : objecter(o) {} - ~Filer() {} - - bool is_active() { - return objecter->is_active(); // || (oc && oc->is_active()); - } - - /*** async file interface ***/ - Objecter::OSDRead *prepare_read(inode_t& inode, - off_t offset, - size_t len, - bufferlist *bl) { - Objecter::OSDRead *rd = new Objecter::OSDRead(bl); - file_to_extents(inode, offset, len, rd->extents); - return rd; - } - int read(inode_t& inode, - off_t offset, - size_t len, - bufferlist *bl, // ptr to data - Context *onfinish) { - Objecter::OSDRead *rd = prepare_read(inode, offset, len, bl); - return objecter->readx(rd, onfinish) > 0 ? 0:-1; - } - - int write(inode_t& inode, - off_t offset, - size_t len, - bufferlist& bl, - int flags, - Context *onack, - Context *oncommit, - objectrev_t rev=0) { - Objecter::OSDWrite *wr = new Objecter::OSDWrite(bl); - file_to_extents(inode, offset, len, wr->extents, rev); - return objecter->modifyx(wr, onack, oncommit) > 0 ? 0:-1; - } - - int zero(inode_t& inode, - off_t offset, - size_t len, - Context *onack, - Context *oncommit) { - Objecter::OSDModify *z = new Objecter::OSDModify(OSD_OP_ZERO); - file_to_extents(inode, offset, len, z->extents); - return objecter->modifyx(z, onack, oncommit) > 0 ? 
0:-1; - } - - int remove(inode_t& inode, - off_t offset, - size_t len, - Context *onack, - Context *oncommit) { - Objecter::OSDModify *z = new Objecter::OSDModify(OSD_OP_DELETE); - file_to_extents(inode, offset, len, z->extents); - return objecter->modifyx(z, onack, oncommit) > 0 ? 0:-1; - } - - int probe_fwd(inode_t& inode, - off_t start_from, - off_t *end, - Context *onfinish); - - - /***** mapping *****/ - - /* map (ino, ono) to an object name - (to be used on any osd in the proper replica group) */ - /*object_t file_to_object(inodeno_t ino, - size_t _ono) { - uint64_t ono = _ono; - assert(ino < (1ULL<& extents, - objectrev_t rev=0); - -}; - - - -#endif diff --git a/branches/sage/ebofs2/osdc/Journaler.cc b/branches/sage/ebofs2/osdc/Journaler.cc deleted file mode 100644 index 363b7c60de9aa..0000000000000 --- a/branches/sage/ebofs2/osdc/Journaler.cc +++ /dev/null @@ -1,666 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "Journaler.h" - -#include "include/Context.h" -#include "common/Logger.h" -#include "msg/Messenger.h" - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_journaler) *_dout << dbeginl << g_clock.now() << " " << objecter->messenger->get_myname() << ".journaler " -#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_journaler) *_derr << dbeginl << g_clock.now() << " " << objecter->messenger->get_myname() << ".journaler " - - - -void Journaler::reset() -{ - dout(1) << "reset to blank journal" << dendl; - state = STATE_ACTIVE; - write_pos = flush_pos = ack_pos = - read_pos = requested_pos = received_pos = - expire_pos = trimming_pos = trimmed_pos = ceph_file_layout_period(inode.layout); -} - - -/***************** HEADER *******************/ - -ostream& operator<<(ostream& out, Journaler::Header &h) -{ - return out << "loghead(trim " << h.trimmed_pos - << ", expire " << h.expire_pos - << ", read " << h.read_pos - << ", write " << h.write_pos - << ")"; -} - -class Journaler::C_ReadHead : public Context { - Journaler *ls; -public: - bufferlist bl; - C_ReadHead(Journaler *l) : ls(l) {} - void finish(int r) { - ls->_finish_read_head(r, bl); - } -}; - -class Journaler::C_ProbeEnd : public Context { - Journaler *ls; -public: - off_t end; - C_ProbeEnd(Journaler *l) : ls(l), end(-1) {} - void finish(int r) { - ls->_finish_probe_end(r, end); - } -}; - -void Journaler::recover(Context *onread) -{ - assert(state != STATE_ACTIVE); - - if (onread) - waitfor_recover.push_back(onread); - - if (state != STATE_UNDEF) { - dout(1) << "recover - already recoverying" << dendl; - return; - } - - dout(1) << "read_head" << dendl; - state = STATE_READHEAD; - C_ReadHead *fin = new C_ReadHead(this); - filer.read(inode, 0, sizeof(Header), &fin->bl, fin); -} - -void Journaler::_finish_read_head(int r, bufferlist& bl) -{ - assert(state == STATE_READHEAD); - - if (bl.length() == 0) { - dout(1) << "_finish_read_head r=" << r << " read 0 bytes, 
assuming empty log" << dendl; - state = STATE_ACTIVE; - list ls; - ls.swap(waitfor_recover); - finish_contexts(ls, 0); - return; - } - - // unpack header - Header h; - assert(bl.length() == sizeof(h)); - bl.copy(0, sizeof(h), (char*)&h); - - write_pos = flush_pos = ack_pos = h.write_pos; - read_pos = requested_pos = received_pos = h.read_pos; - expire_pos = h.expire_pos; - trimmed_pos = trimming_pos = h.trimmed_pos; - - dout(1) << "_finish_read_head " << h << ". probing for end of log (from " << write_pos << ")..." << dendl; - - // probe the log - state = STATE_PROBING; - C_ProbeEnd *fin = new C_ProbeEnd(this); - filer.probe_fwd(inode, h.write_pos, &fin->end, fin); -} - -void Journaler::_finish_probe_end(int r, off_t end) -{ - assert(state == STATE_PROBING); - - if (end == -1) { - end = write_pos; - dout(1) << "_finish_probe_end write_pos = " << end - << " (header had " << write_pos << "). log was empty. recovered." - << dendl; - assert(0); // hrm. - } else { - assert(end >= write_pos); - assert(r >= 0); - dout(1) << "_finish_probe_end write_pos = " << end - << " (header had " << write_pos << "). recovered." - << dendl; - } - - write_pos = flush_pos = ack_pos = end; - - // done. 
- list ls; - ls.swap(waitfor_recover); - finish_contexts(ls, 0); -} - - -// WRITING - -class Journaler::C_WriteHead : public Context { -public: - Journaler *ls; - Header h; - Context *oncommit; - C_WriteHead(Journaler *l, Header& h_, Context *c) : ls(l), h(h_), oncommit(c) {} - void finish(int r) { - ls->_finish_write_head(h, oncommit); - } -}; - -void Journaler::write_head(Context *oncommit) -{ - assert(state == STATE_ACTIVE); - last_written.trimmed_pos = trimmed_pos; - last_written.expire_pos = expire_pos; - last_written.read_pos = read_pos; - last_written.write_pos = ack_pos; //write_pos; - dout(10) << "write_head " << last_written << dendl; - - last_wrote_head = g_clock.now(); - - bufferlist bl; - bl.append((char*)&last_written, sizeof(last_written)); - filer.write(inode, 0, bl.length(), bl, 0, - 0, - new C_WriteHead(this, last_written, oncommit)); -} - -void Journaler::_finish_write_head(Header &wrote, Context *oncommit) -{ - dout(10) << "_finish_write_head " << wrote << dendl; - last_committed = wrote; - if (oncommit) { - oncommit->finish(0); - delete oncommit; - } - - trim(); // trim? -} - - -/***************** WRITING *******************/ - -class Journaler::C_Flush : public Context { - Journaler *ls; - off_t start; -public: - C_Flush(Journaler *l, off_t s) : ls(l), start(s) {} - void finish(int r) { ls->_finish_flush(r, start); } -}; - -void Journaler::_finish_flush(int r, off_t start) -{ - assert(r>=0); - - assert(start >= ack_pos); - assert(start < flush_pos); - assert(pending_flush.count(start)); - - // calc latency? 
- if (logger) { - utime_t lat = g_clock.now(); - lat -= pending_flush[start]; - logger->favg("jlat", lat); - } - - pending_flush.erase(start); - - // adjust ack_pos - if (pending_flush.empty()) - ack_pos = flush_pos; - else - ack_pos = pending_flush.begin()->first; - - dout(10) << "_finish_flush from " << start - << ", pending_flush now " << pending_flush - << ", write positions now " << write_pos << "/" << flush_pos << "/" << ack_pos - << dendl; - - // kick waiters <= ack_pos - while (!waitfor_flush.empty()) { - if (waitfor_flush.begin()->first > ack_pos) break; - finish_contexts(waitfor_flush.begin()->second); - waitfor_flush.erase(waitfor_flush.begin()); - } -} - - -off_t Journaler::append_entry(bufferlist& bl, Context *onsync) -{ - uint32_t s = bl.length(); - - if (!g_conf.journaler_allow_split_entries) { - // will we span a stripe boundary? - int p = inode.layout.fl_stripe_unit; - if (write_pos / p != (write_pos + (off_t)(bl.length() + sizeof(s))) / p) { - // yes. - // move write_pos forward. - off_t owp = write_pos; - write_pos += p; - write_pos -= (write_pos % p); - - // pad with zeros. - bufferptr bp(write_pos - owp); - bp.zero(); - assert(bp.length() >= 4); - write_buf.push_back(bp); - - // now flush. - flush(); - - dout(12) << "append_entry skipped " << (write_pos-owp) << " bytes to " << write_pos << " to avoid spanning stripe boundary" << dendl; - } - } - - dout(10) << "append_entry len " << bl.length() << " to " << write_pos << "~" << (bl.length() + sizeof(uint32_t)) << dendl; - - // cache? - // NOTE: this is a dumb thing to do; this is used for a benchmarking - // purposes only. 
- if (g_conf.journaler_cache && - write_pos == read_pos + read_buf.length()) { - dout(10) << "append_entry caching in read_buf too" << dendl; - assert(requested_pos == received_pos); - assert(requested_pos == read_pos + read_buf.length()); - read_buf.append((char*)&s, sizeof(s)); - read_buf.append(bl); - requested_pos = received_pos = write_pos + sizeof(s) + s; - } - - // append - write_buf.append((char*)&s, sizeof(s)); - write_buf.claim_append(bl); - write_pos += sizeof(s) + s; - - // flush now? - if (onsync) - flush(onsync); - - return write_pos; -} - - -void Journaler::_do_flush() -{ - if (write_pos == flush_pos) return; - assert(write_pos > flush_pos); - - // flush - unsigned len = write_pos - flush_pos; - assert(len == write_buf.length()); - dout(10) << "_do_flush flushing " << flush_pos << "~" << len << dendl; - - // submit write for anything pending - // flush _start_ pos to _finish_flush - filer.write(inode, flush_pos, len, write_buf, 0, - g_conf.journaler_safe ? 0:new C_Flush(this, flush_pos), // on ACK - g_conf.journaler_safe ? new C_Flush(this, flush_pos):0); // on COMMIT - pending_flush[flush_pos] = g_clock.now(); - - // adjust pointers - flush_pos = write_pos; - write_buf.clear(); - - dout(10) << "_do_flush write pointers now at " << write_pos << "/" << flush_pos << "/" << ack_pos << dendl; -} - - - -void Journaler::flush(Context *onsync) -{ - // all flushed and acked? - if (write_pos == ack_pos) { - assert(write_buf.length() == 0); - dout(10) << "flush nothing to flush, write pointers at " << write_pos << "/" << flush_pos << "/" << ack_pos << dendl; - if (onsync) { - onsync->finish(0); - delete onsync; - } - return; - } - - if (write_pos == flush_pos) { - assert(write_buf.length() == 0); - dout(10) << "flush nothing to flush, write pointers at " << write_pos << "/" << flush_pos << "/" << ack_pos << dendl; - } else { - if (1) { - // maybe buffer - if (write_buf.length() < g_conf.journaler_batch_max) { - // delay! schedule an event. 
- dout(20) << "flush delaying flush" << dendl; - if (delay_flush_event) timer.cancel_event(delay_flush_event); - delay_flush_event = new C_DelayFlush(this); - timer.add_event_after(g_conf.journaler_batch_interval, delay_flush_event); - } else { - dout(20) << "flush not delaying flush" << dendl; - _do_flush(); - } - } else { - // always flush - _do_flush(); - } - } - - // queue waiter (at _new_ write_pos; will go when reached by ack_pos) - if (onsync) - waitfor_flush[write_pos].push_back(onsync); - - // write head? - if (last_wrote_head.sec() + g_conf.journaler_write_head_interval < g_clock.now().sec()) { - write_head(); - } -} - - - -/***************** READING *******************/ - - -class Journaler::C_Read : public Context { - Journaler *ls; -public: - C_Read(Journaler *l) : ls(l) {} - void finish(int r) { ls->_finish_read(r); } -}; - -class Journaler::C_RetryRead : public Context { - Journaler *ls; -public: - C_RetryRead(Journaler *l) : ls(l) {} - void finish(int r) { ls->is_readable(); } // this'll kickstart. -}; - -void Journaler::_finish_read(int r) -{ - assert(r>=0); - - dout(10) << "_finish_read got " << received_pos << "~" << reading_buf.length() << dendl; - received_pos += reading_buf.length(); - read_buf.claim_append(reading_buf); - assert(received_pos <= requested_pos); - dout(10) << "_finish_read read_buf now " << read_pos << "~" << read_buf.length() - << ", read pointers " << read_pos << "/" << received_pos << "/" << requested_pos - << dendl; - - if (is_readable()) { // NOTE: this check may read more - // readable! - dout(10) << "_finish_read now readable" << dendl; - if (on_readable) { - Context *f = on_readable; - on_readable = 0; - f->finish(0); - delete f; - } - - if (read_bl) { - bool r = try_read_entry(*read_bl); - assert(r); // this should have worked. - - // clear state - Context *f = on_read_finish; - on_read_finish = 0; - read_bl = 0; - - // do callback - f->finish(0); - delete f; - } - } - - // prefetch? 
- _prefetch(); -} - -/* NOTE: this could be slightly smarter... we could allow - * multiple reads to be in progress. e.g., if we prefetch, but - * then discover we need even more for an especially large entry. - * i don't think that circumstance will arise particularly often. - */ -void Journaler::_issue_read(off_t len) -{ - // make sure we're fully flushed - _do_flush(); - - if (_is_reading()) { - dout(10) << "_issue_read " << len << " waiting, already reading " - << received_pos << "~" << (requested_pos-received_pos) << dendl; - return; - } - assert(requested_pos == received_pos); - - // stuck at ack_pos? - assert(requested_pos <= ack_pos); - if (requested_pos == ack_pos) { - dout(10) << "_issue_read requested_pos = ack_pos = " << ack_pos << ", waiting" << dendl; - assert(write_pos > requested_pos); - if (flush_pos == ack_pos) - flush(); - assert(flush_pos > ack_pos); - waitfor_flush[flush_pos].push_back(new C_RetryRead(this)); - return; - } - - // don't read too much - if (requested_pos + len > ack_pos) { - len = ack_pos - requested_pos; - dout(10) << "_issue_read reading only up to ack_pos " << ack_pos << dendl; - } - - // go. - dout(10) << "_issue_read reading " << requested_pos << "~" << len - << ", read pointers " << read_pos << "/" << received_pos << "/" << (requested_pos+len) - << dendl; - - filer.read(inode, requested_pos, len, &reading_buf, - new C_Read(this)); - requested_pos += len; -} - -void Journaler::_prefetch() -{ - // prefetch? - off_t left = requested_pos - read_pos; - if (left <= prefetch_from && // should read more, - !_is_reading() && // and not reading anything right now - write_pos > requested_pos) { // there's something more to read... - dout(10) << "_prefetch only " << left << " < " << prefetch_from - << ", prefetching " << dendl; - _issue_read(fetch_len); - } -} - - -void Journaler::read_entry(bufferlist *bl, Context *onfinish) -{ - // only one read at a time! 
- assert(read_bl == 0); - assert(on_read_finish == 0); - - if (is_readable()) { - dout(10) << "read_entry at " << read_pos << ", read_buf is " - << read_pos << "~" << read_buf.length() - << ", readable now" << dendl; - - // nice, just do it now. - bool r = try_read_entry(*bl); - assert(r); - - // callback - onfinish->finish(0); - delete onfinish; - } else { - dout(10) << "read_entry at " << read_pos << ", read_buf is " - << read_pos << "~" << read_buf.length() - << ", not readable now" << dendl; - - bl->clear(); - - // set it up - read_bl = bl; - on_read_finish = onfinish; - - // is_readable() will have already initiated a read (if it was possible) - } -} - - -/* is_readable() - * return true if next entry is ready. - * kickstart read as necessary. - */ -bool Journaler::is_readable() -{ - // anything to read? - if (read_pos == write_pos) return false; - - // have enough for entry size? - uint32_t s = 0; - if (read_buf.length() >= sizeof(s)) - read_buf.copy(0, sizeof(s), (char*)&s); - - // entry and payload? - if (read_buf.length() >= sizeof(s) && - read_buf.length() >= sizeof(s) + s) - return true; // yep, next entry is ready. - - // darn it! - - // partial fragment at the end? - if (received_pos == write_pos) { - dout(10) << "is_readable() detected partial entry at tail, adjusting write_pos to " << read_pos << dendl; - write_pos = flush_pos = ack_pos = read_pos; - assert(write_buf.length() == 0); - - // truncate? - // FIXME: how much? - - return false; - } - - // start reading some more? - if (!_is_reading()) { - if (s) - fetch_len = MAX(fetch_len, (off_t)(sizeof(s)+s-read_buf.length())); - _issue_read(fetch_len); - } - - return false; -} - - -/* try_read_entry(bl) - * read entry into bl if it's ready. - * otherwise, do nothing. (well, we'll start fetching it for good measure.) - */ -bool Journaler::try_read_entry(bufferlist& bl) -{ - if (!is_readable()) { // this may start a read. 
- dout(10) << "try_read_entry at " << read_pos << " not readable" << dendl; - return false; - } - - uint32_t s; - assert(read_buf.length() >= sizeof(s)); - read_buf.copy(0, sizeof(s), (char*)&s); - assert(read_buf.length() >= sizeof(s) + s); - - dout(10) << "try_read_entry at " << read_pos << " reading " - << read_pos << "~" << (sizeof(s)+s) << dendl; - - // do it - assert(bl.length() == 0); - read_buf.splice(0, sizeof(s)); - read_buf.splice(0, s, &bl); - read_pos += sizeof(s) + s; - - // prefetch? - _prefetch(); - return true; -} - -void Journaler::wait_for_readable(Context *onreadable) -{ - dout(10) << "wait_for_readable at " << read_pos << " onreadable " << onreadable << dendl; - assert(!is_readable()); - assert(on_readable == 0); - on_readable = onreadable; -} - - - - -/***************** TRIMMING *******************/ - - -class Journaler::C_Trim : public Context { - Journaler *ls; - off_t to; -public: - C_Trim(Journaler *l, off_t t) : ls(l), to(t) {} - void finish(int r) { - ls->_trim_finish(r, to); - } -}; - -void Journaler::trim() -{ - off_t trim_to = last_committed.expire_pos; - trim_to -= trim_to % ceph_file_layout_period(inode.layout); - dout(10) << "trim last_commited head was " << last_committed - << ", can trim to " << trim_to - << dendl; - if (trim_to == 0 || trim_to == trimming_pos) { - dout(10) << "trim already trimmed/trimming to " - << trimmed_pos << "/" << trimming_pos << dendl; - return; - } - - if (trimming_pos > trimmed_pos) { - dout(10) << "trim already trimming atm, try again later. 
trimmed/trimming is " - << trimmed_pos << "/" << trimming_pos << dendl; - return; - } - - // trim - assert(trim_to <= write_pos); - assert(trim_to > trimming_pos); - dout(10) << "trim trimming to " << trim_to - << ", trimmed/trimming/expire are " - << trimmed_pos << "/" << trimming_pos << "/" << expire_pos - << dendl; - - filer.remove(inode, trimming_pos, trim_to-trimming_pos, - 0, new C_Trim(this, trim_to)); - trimming_pos = trim_to; -} - -void Journaler::_trim_finish(int r, off_t to) -{ - dout(10) << "_trim_finish trimmed_pos was " << trimmed_pos - << ", trimmed/trimming/expire now " - << to << "/" << trimming_pos << "/" << expire_pos - << dendl; - assert(r >= 0); - - assert(to <= trimming_pos); - assert(to > trimmed_pos); - trimmed_pos = to; - - // finishers? - while (!waitfor_trim.empty() && - waitfor_trim.begin()->first <= trimmed_pos) { - finish_contexts(waitfor_trim.begin()->second, 0); - waitfor_trim.erase(waitfor_trim.begin()); - } -} - - -// eof. diff --git a/branches/sage/ebofs2/osdc/Journaler.h b/branches/sage/ebofs2/osdc/Journaler.h deleted file mode 100644 index a90ec5f9e348f..0000000000000 --- a/branches/sage/ebofs2/osdc/Journaler.h +++ /dev/null @@ -1,236 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -/* Journaler - * - * This class stripes a serial log over objects on the store. Four logical pointers: - * - * write_pos - where we're writing new entries - * read_pos - where we're reading old entires - * expire_pos - what is deemed "old" by user - * trimmed_pos - where we're expiring old items - * - * trimmed_pos <= expire_pos <= read_pos <= write_pos. 
- * - * Often, read_pos <= write_pos (as with MDS log). During recovery, write_pos is undefined - * until the end of the log is discovered. - * - * A "head" struct at the beginning of the log is used to store metadata at - * regular intervals. The basic invariants include: - * - * head.read_pos <= read_pos -- the head may "lag", since it's updated lazily. - * head.write_pos <= write_pos - * head.expire_pos <= expire_pos - * head.trimmed_pos <= trimmed_pos - * - * More significantly, - * - * head.expire_pos >= trimmed_pos -- this ensures we can find the "beginning" of the log - * as last recorded, before it is trimmed. trimming will - * block until a sufficiently current expire_pos is committed. - * - * To recover log state, we simply start at the last write_pos in the head, and probe the - * object sequence sizes until we read the end. - * - * Head struct is stored in the first object. Actual journal starts after layout.period() bytes. - * - */ - -#ifndef __JOURNALER_H -#define __JOURNALER_H - -#include "Objecter.h" -#include "Filer.h" - -#include -#include - -class Context; -class Logger; - -class Journaler { - - // this goes at the head of the log "file". 
- struct Header { - off_t trimmed_pos; - off_t expire_pos; - off_t read_pos; - off_t write_pos; - Header() : trimmed_pos(0), expire_pos(0), read_pos(0), write_pos(0) {} - } last_written, last_committed; - - friend ostream& operator<<(ostream& out, Header &h); - - - // me - inode_t inode; - Objecter *objecter; - Filer filer; - - Logger *logger; - - Mutex *lock; - SafeTimer timer; - - class C_DelayFlush : public Context { - Journaler *journaler; - public: - C_DelayFlush(Journaler *j) : journaler(j) {} - void finish(int r) { - journaler->delay_flush_event = 0; - journaler->_do_flush(); - } - } *delay_flush_event; - - - // my state - static const int STATE_UNDEF = 0; - static const int STATE_READHEAD = 1; - static const int STATE_PROBING = 2; - static const int STATE_ACTIVE = 2; - - int state; - - // header - utime_t last_wrote_head; - void _finish_write_head(Header &wrote, Context *oncommit); - class C_WriteHead; - friend class C_WriteHead; - - list waitfor_recover; - void _finish_read_head(int r, bufferlist& bl); - void _finish_probe_end(int r, off_t end); - class C_ReadHead; - friend class C_ReadHead; - class C_ProbeEnd; - friend class C_ProbeEnd; - - - - // writer - off_t write_pos; // logical write position, where next entry will go - off_t flush_pos; // where we will flush. if write_pos>flush_pos, we're buffering writes. - off_t ack_pos; // what has been acked. - bufferlist write_buf; // write buffer. flush_pos + write_buf.length() == write_pos. - - std::map pending_flush; // start offsets and times for pending flushes - std::map > waitfor_flush; // when flushed through given offset - - void _do_flush(); - void _finish_flush(int r, off_t start); - class C_Flush; - friend class C_Flush; - - // reader - off_t read_pos; // logical read position, where next entry starts. - off_t requested_pos; // what we've requested from OSD. - off_t received_pos; // what we've received from OSD. - bufferlist read_buf; // read buffer. read_pos + read_buf.length() == prefetch_pos. 
- bufferlist reading_buf; // what i'm reading into - - off_t fetch_len; // how much to read at a time - off_t prefetch_from; // how far from end do we read next chunk - - // for read_entry() in-progress read - bufferlist *read_bl; - Context *on_read_finish; - // for wait_for_readable() - Context *on_readable; - - bool _is_reading() { - return requested_pos > received_pos; - } - void _finish_read(int r); // we just read some (read completion callback) - void _issue_read(off_t len); // read some more - void _prefetch(); // maybe read ahead - class C_Read; - friend class C_Read; - class C_RetryRead; - friend class C_RetryRead; - - // trimmer - off_t expire_pos; // what we're allowed to trim to - off_t trimming_pos; // what we've requested to trim through - off_t trimmed_pos; // what has been trimmed - map > waitfor_trim; - - void _trim_finish(int r, off_t to); - class C_Trim; - friend class C_Trim; - -public: - Journaler(inode_t& inode_, Objecter *obj, Logger *l, Mutex *lk, off_t fl=0, off_t pff=0) : - inode(inode_), objecter(obj), filer(objecter), logger(l), - lock(lk), timer(*lk), delay_flush_event(0), - state(STATE_UNDEF), - write_pos(0), flush_pos(0), ack_pos(0), - read_pos(0), requested_pos(0), received_pos(0), - fetch_len(fl), prefetch_from(pff), - read_bl(0), on_read_finish(0), on_readable(0), - expire_pos(0), trimming_pos(0), trimmed_pos(0) - { - // prefetch intelligently. - // (watch out, this is big if you use big objects or weird striping) - if (!fetch_len) - fetch_len = inode.layout.fl_object_size*inode.layout.fl_stripe_count * - g_conf.journaler_prefetch_periods; - if (!prefetch_from) - prefetch_from = fetch_len / 2; - } - - // me - //void open(Context *onopen); - //void claim(Context *onclaim, msg_addr_t from); - - /* reset - * NOTE: we assume the caller knows/has ensured that any objects - * in our sequence do not exist.. e.g. after a MKFS. this is _not_ - * an "erase" method. 
- */ - void reset(); - void recover(Context *onfinish); - void write_head(Context *onsave=0); - - bool is_active() { return state == STATE_ACTIVE; } - - off_t get_write_pos() const { return write_pos; } - off_t get_read_pos() const { return read_pos; } - off_t get_expire_pos() const { return expire_pos; } - off_t get_trimmed_pos() const { return trimmed_pos; } - - // write - off_t append_entry(bufferlist& bl, Context *onsync = 0); - void flush(Context *onsync = 0); - - // read - void set_read_pos(off_t p) { - assert(requested_pos == received_pos); // we can't cope w/ in-progress read right now. - assert(read_bl == 0); // ... - read_pos = requested_pos = received_pos = p; - read_buf.clear(); - } - bool is_readable(); - bool try_read_entry(bufferlist& bl); - void wait_for_readable(Context *onfinish); - void read_entry(bufferlist* bl, Context *onfinish); - - // trim - void set_expire_pos(off_t ep) { expire_pos = ep; } - void trim(); - //bool is_trimmable() { return trimming_pos < expire_pos; } - //void trim(off_t trim_to=0, Context *c=0); -}; - - -#endif diff --git a/branches/sage/ebofs2/osdc/ObjectCacher.cc b/branches/sage/ebofs2/osdc/ObjectCacher.cc deleted file mode 100644 index d5f347d3863cb..0000000000000 --- a/branches/sage/ebofs2/osdc/ObjectCacher.cc +++ /dev/null @@ -1,1587 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "msg/Messenger.h" -#include "ObjectCacher.h" -#include "Objecter.h" - - - -/*** ObjectCacher::BufferHead ***/ - - -/*** ObjectCacher::Object ***/ - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_objectcacher) *_dout << dbeginl << g_clock.now() << " " << oc->objecter->messenger->get_myname() << ".objectcacher.object(" << oid << ") " - - -ObjectCacher::BufferHead *ObjectCacher::Object::split(BufferHead *left, off_t off) -{ - dout(20) << "split " << *left << " at " << off << dendl; - - // split off right - ObjectCacher::BufferHead *right = new BufferHead(this); - 
right->last_write_tid = left->last_write_tid; - right->set_state(left->get_state()); - - off_t newleftlen = off - left->start(); - right->set_start(off); - right->set_length(left->length() - newleftlen); - - // shorten left - oc->bh_stat_sub(left); - left->set_length(newleftlen); - oc->bh_stat_add(left); - - // add right - oc->bh_add(this, right); - - // split buffers too - bufferlist bl; - bl.claim(left->bl); - if (bl.length()) { - assert(bl.length() == (left->length() + right->length())); - right->bl.substr_of(bl, left->length(), right->length()); - left->bl.substr_of(bl, 0, left->length()); - } - - // move read waiters - if (!left->waitfor_read.empty()) { - map >::iterator o, p = left->waitfor_read.end(); - p--; - while (p != left->waitfor_read.begin()) { - if (p->first < right->start()) break; - dout(0) << "split moving waiters at byte " << p->first << " to right bh" << dendl; - right->waitfor_read[p->first].swap( p->second ); - o = p; - p--; - left->waitfor_read.erase(o); - } - } - - dout(20) << "split left is " << *left << dendl; - dout(20) << "split right is " << *right << dendl; - return right; -} - - -void ObjectCacher::Object::merge_left(BufferHead *left, BufferHead *right) -{ - assert(left->end() == right->start()); - assert(left->get_state() == right->get_state()); - - dout(10) << "merge_left " << *left << " + " << *right << dendl; - oc->bh_remove(this, right); - oc->bh_stat_sub(left); - left->set_length( left->length() + right->length()); - oc->bh_stat_add(left); - - // data - left->bl.claim_append(right->bl); - - // version - // note: this is sorta busted, but should only be used for dirty buffers - left->last_write_tid = MAX( left->last_write_tid, right->last_write_tid ); - left->last_write = MAX( left->last_write, right->last_write ); - - // waiters - for (map >::iterator p = right->waitfor_read.begin(); - p != right->waitfor_read.end(); - p++) - left->waitfor_read[p->first].splice( left->waitfor_read[p->first].begin(), - p->second ); - - // hose 
right - delete right; - - dout(10) << "merge_left result " << *left << dendl; -} - -void ObjectCacher::Object::try_merge_bh(BufferHead *bh) -{ - dout(10) << "try_merge_bh " << *bh << dendl; - - // to the left? - map::iterator p = data.find(bh->start()); - assert(p->second == bh); - if (p != data.begin()) { - p--; - if (p->second->end() == bh->start() && - p->second->get_state() == bh->get_state()) { - merge_left(p->second, bh); - bh = p->second; - } else - p++; - } - // to the right? - assert(p->second == bh); - p++; - if (p != data.end() && - p->second->start() == bh->end() && - p->second->get_state() == bh->get_state()) - merge_left(bh, p->second); -} - - -/* - * map a range of bytes into buffer_heads. - * - create missing buffer_heads as necessary. - */ -int ObjectCacher::Object::map_read(Objecter::OSDRead *rd, - map& hits, - map& missing, - map& rx) -{ - for (list::iterator ex_it = rd->extents.begin(); - ex_it != rd->extents.end(); - ex_it++) { - - if (ex_it->oid != oid) continue; - - dout(10) << "map_read " << ex_it->oid - << " " << ex_it->start << "~" << ex_it->length << dendl; - - map::iterator p = data.lower_bound(ex_it->start); - // p->first >= start - - off_t cur = ex_it->start; - off_t left = ex_it->length; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - - while (left > 0) { - // at end? - if (p == data.end()) { - // rest is a miss. - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( left ); - oc->bh_add(this, n); - missing[cur] = n; - dout(20) << "map_read miss " << left << " left, " << *n << dendl; - cur += left; - left -= left; - assert(left == 0); - assert(cur == ex_it->start + (off_t)ex_it->length); - break; // no more. - } - - if (p->first <= cur) { - // have it (or part of it) - BufferHead *e = p->second; - - if (e->is_clean() || - e->is_dirty() || - e->is_tx()) { - hits[cur] = e; // readable! 
- dout(20) << "map_read hit " << *e << dendl; - } - else if (e->is_rx()) { - rx[cur] = e; // missing, not readable. - dout(20) << "map_read rx " << *e << dendl; - } - else assert(0); - - off_t lenfromcur = MIN(e->end() - cur, left); - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; // more? - - } else if (p->first > cur) { - // gap.. miss - off_t next = p->first; - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( MIN(next - cur, left) ); - oc->bh_add(this,n); - missing[cur] = n; - cur += MIN(left, n->length()); - left -= MIN(left, n->length()); - dout(20) << "map_read gap " << *n << dendl; - continue; // more? - } - else - assert(0); - } - } - return(0); -} - -/* - * map a range of extents on an object's buffer cache. - * - combine any bh's we're writing into one - * - break up bufferheads that don't fall completely within the range - * //no! - return a bh that includes the write. may also include other dirty data to left and/or right. - */ -ObjectCacher::BufferHead *ObjectCacher::Object::map_write(Objecter::OSDWrite *wr) -{ - BufferHead *final = 0; - - for (list::iterator ex_it = wr->extents.begin(); - ex_it != wr->extents.end(); - ex_it++) { - - if (ex_it->oid != oid) continue; - - dout(10) << "map_write oex " << ex_it->oid - << " " << ex_it->start << "~" << ex_it->length << dendl; - - map::iterator p = data.lower_bound(ex_it->start); - // p->first >= start - - off_t cur = ex_it->start; - off_t left = ex_it->length; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap or butt up! - - /*// dirty and butts up? - if (p->first + p->second->length() == cur && - p->second->is_dirty()) { - dout(10) << "map_write will append to tail of " << *p->second << dendl; - final = p->second; - } - */ - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - - while (left > 0) { - off_t max = left; - - // at end ? 
- if (p == data.end()) { - if (final == NULL) { - final = new BufferHead(this); - final->set_start( cur ); - final->set_length( max ); - oc->bh_add(this, final); - dout(10) << "map_write adding trailing bh " << *final << dendl; - } else { - final->set_length( final->length() + max ); - } - left -= max; - cur += max; - continue; - } - - dout(10) << "p is " << *p->second << dendl; - - if (p->first <= cur) { - BufferHead *bh = p->second; - dout(10) << "map_write bh " << *bh << " intersected" << dendl; - - if (p->first < cur) { - assert(final == 0); - if (cur + max >= p->first + p->second->length()) { - // we want right bit (one splice) - final = split(bh, cur); // just split it, take right half. - p++; - assert(p->second == final); - } else { - // we want middle bit (two splices) - final = split(bh, cur); - p++; - assert(p->second == final); - split(final, cur+max); - } - } else if (p->first == cur) { - if (p->second->length() <= max) { - // whole bufferhead, piece of cake. - } else { - // we want left bit (one splice) - split(bh, cur + max); // just split - } - if (final) - merge_left(final, bh); - else - final = bh; - } - - // keep going. - off_t lenfromcur = final->end() - cur; - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; - } else { - // gap! - off_t next = p->first; - off_t glen = MIN(next - cur, max); - dout(10) << "map_write gap " << cur << "~" << glen << dendl; - if (final) { - final->set_length( final->length() + glen ); - } else { - final = new BufferHead(this); - final->set_start( cur ); - final->set_length( glen ); - oc->bh_add(this, final); - } - - cur += glen; - left -= glen; - continue; // more? 
- } - } - } - - // set versoin - assert(final); - dout(10) << "map_write final is " << *final << dendl; - - return final; -} - - -void ObjectCacher::Object::truncate(off_t s) -{ - dout(10) << "truncate to " << s << dendl; - - while (!data.empty()) { - BufferHead *bh = data.rbegin()->second; - if (bh->end() <= s) - break; - - // split bh at truncation point? - if (bh->start() < s) { - split(bh, s); - continue; - } - - // remove bh entirely - assert(bh->start() >= s); - oc->bh_remove(this, bh); - delete bh; - } -} - - - - - -/*** ObjectCacher ***/ - -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_objectcacher) *_dout << dbeginl << g_clock.now() << " " << objecter->messenger->get_myname() << ".objectcacher " - - - -/* private */ - -void ObjectCacher::close_object(Object *ob) -{ - dout(10) << "close_object " << *ob << dendl; - assert(ob->can_close()); - - // ok! - objects.erase(ob->get_oid()); - objects_by_ino[ob->get_ino()].erase(ob); - if (objects_by_ino[ob->get_ino()].empty()) - objects_by_ino.erase(ob->get_ino()); - delete ob; -} - - - - -void ObjectCacher::bh_read(BufferHead *bh) -{ - dout(7) << "bh_read on " << *bh << dendl; - - mark_rx(bh); - - // finisher - C_ReadFinish *onfinish = new C_ReadFinish(this, bh->ob->get_oid(), bh->start(), bh->length()); - - // go - objecter->read(bh->ob->get_oid(), bh->start(), bh->length(), bh->ob->get_layout(), &onfinish->bl, - onfinish); -} - -void ObjectCacher::bh_read_finish(object_t oid, off_t start, size_t length, bufferlist &bl) -{ - //lock.Lock(); - dout(7) << "bh_read_finish " - << oid - << " " << start << "~" << length - << " (bl is " << bl.length() << ")" - << dendl; - - if (bl.length() < length) { - bufferptr bp(length - bl.length()); - bp.zero(); - dout(7) << "bh_read_finish " << oid << " padding " << start << "~" << length - << " with " << bp.length() << " bytes of zeroes" << dendl; - bl.push_back(bp); - } - - if (objects.count(oid) == 0) { - dout(7) << "bh_read_finish no object cache" << dendl; 
- } else { - Object *ob = objects[oid]; - - // apply to bh's! - off_t opos = start; - map::iterator p = ob->data.lower_bound(opos); - - while (p != ob->data.end() && - opos < start+(off_t)length) { - BufferHead *bh = p->second; - - if (bh->start() > opos) { - dout(1) << "weirdness: gap when applying read results, " - << opos << "~" << bh->start() - opos - << dendl; - opos = bh->start(); - continue; - } - - if (!bh->is_rx()) { - dout(10) << "bh_read_finish skipping non-rx " << *bh << dendl; - opos = bh->end(); - p++; - continue; - } - - assert(opos >= bh->start()); - assert(bh->start() == opos); // we don't merge rx bh's... yet! - assert(bh->length() <= start+(off_t)length-opos); - - bh->bl.substr_of(bl, - opos-bh->start(), - bh->length()); - mark_clean(bh); - dout(10) << "bh_read_finish read " << *bh << dendl; - - opos = bh->end(); - p++; - - // finishers? - // called with lock held. - list ls; - for (map >::iterator p = bh->waitfor_read.begin(); - p != bh->waitfor_read.end(); - p++) - ls.splice(ls.end(), p->second); - bh->waitfor_read.clear(); - finish_contexts(ls); - - // clean up? 
- ob->try_merge_bh(bh); - } - } - //lock.Unlock(); -} - - -void ObjectCacher::bh_write(BufferHead *bh) -{ - dout(7) << "bh_write " << *bh << dendl; - - // finishers - C_WriteAck *onack = new C_WriteAck(this, bh->ob->get_oid(), bh->start(), bh->length()); - C_WriteCommit *oncommit = new C_WriteCommit(this, bh->ob->get_oid(), bh->start(), bh->length()); - - // go - tid_t tid = objecter->write(bh->ob->get_oid(), bh->start(), bh->length(), bh->ob->get_layout(), bh->bl, - onack, oncommit); - - // set bh last_write_tid - onack->tid = tid; - oncommit->tid = tid; - bh->ob->last_write_tid = tid; - bh->last_write_tid = tid; - - mark_tx(bh); -} - -void ObjectCacher::lock_ack(list& oids, tid_t tid) -{ - for (list::iterator i = oids.begin(); - i != oids.end(); - i++) { - object_t oid = *i; - - if (objects.count(oid) == 0) { - dout(7) << "lock_ack no object cache" << dendl; - assert(0); - } - - Object *ob = objects[oid]; - - list ls; - - assert(tid <= ob->last_write_tid); - if (ob->last_write_tid == tid) { - dout(10) << "lock_ack " << *ob - << " tid " << tid << dendl; - - switch (ob->lock_state) { - case Object::LOCK_RDUNLOCKING: - case Object::LOCK_WRUNLOCKING: - ob->lock_state = Object::LOCK_NONE; - break; - case Object::LOCK_RDLOCKING: - case Object::LOCK_DOWNGRADING: - ob->lock_state = Object::LOCK_RDLOCK; - ls.splice(ls.begin(), ob->waitfor_rd); - break; - case Object::LOCK_UPGRADING: - case Object::LOCK_WRLOCKING: - ob->lock_state = Object::LOCK_WRLOCK; - ls.splice(ls.begin(), ob->waitfor_wr); - ls.splice(ls.begin(), ob->waitfor_rd); - break; - - default: - assert(0); - } - - ob->last_ack_tid = tid; - - if (ob->can_close()) - close_object(ob); - } else { - dout(10) << "lock_ack " << *ob - << " tid " << tid << " obsolete" << dendl; - } - - // waiters? 
- if (ob->waitfor_ack.count(tid)) { - ls.splice(ls.end(), ob->waitfor_ack[tid]); - ob->waitfor_ack.erase(tid); - } - - finish_contexts(ls); - - } -} - -void ObjectCacher::bh_write_ack(object_t oid, off_t start, size_t length, tid_t tid) -{ - //lock.Lock(); - - dout(7) << "bh_write_ack " - << oid - << " tid " << tid - << " " << start << "~" << length - << dendl; - if (objects.count(oid) == 0) { - dout(7) << "bh_write_ack no object cache" << dendl; - assert(0); - } else { - Object *ob = objects[oid]; - - // apply to bh's! - for (map::iterator p = ob->data.lower_bound(start); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - - if (bh->start() > start+(off_t)length) break; - - if (bh->start() < start && - bh->end() > start+(off_t)length) { - dout(20) << "bh_write_ack skipping " << *bh << dendl; - continue; - } - - // make sure bh is tx - if (!bh->is_tx()) { - dout(10) << "bh_write_ack skipping non-tx " << *bh << dendl; - continue; - } - - // make sure bh tid matches - if (bh->last_write_tid != tid) { - assert(bh->last_write_tid > tid); - dout(10) << "bh_write_ack newer tid on " << *bh << dendl; - continue; - } - - // ok! mark bh clean. - mark_clean(bh); - dout(10) << "bh_write_ack clean " << *bh << dendl; - } - - // update object last_ack. - assert(ob->last_ack_tid < tid); - ob->last_ack_tid = tid; - - // waiters? - if (ob->waitfor_ack.count(tid)) { - list ls; - ls.splice(ls.begin(), ob->waitfor_ack[tid]); - ob->waitfor_ack.erase(tid); - finish_contexts(ls); - } - } - //lock.Unlock(); -} - -void ObjectCacher::bh_write_commit(object_t oid, off_t start, size_t length, tid_t tid) -{ - //lock.Lock(); - - // update object last_commit - dout(7) << "bh_write_commit " - << oid - << " tid " << tid - << " " << start << "~" << length - << dendl; - if (objects.count(oid) == 0) { - dout(7) << "bh_write_commit no object cache" << dendl; - //assert(0); - } else { - Object *ob = objects[oid]; - - // update last_commit. - ob->last_commit_tid = tid; - - // waiters? 
- if (ob->waitfor_commit.count(tid)) { - list ls; - ls.splice(ls.begin(), ob->waitfor_commit[tid]); - ob->waitfor_commit.erase(tid); - finish_contexts(ls); - } - } - - // lock.Unlock(); -} - - -void ObjectCacher::flush(off_t amount) -{ - utime_t cutoff = g_clock.now(); - //cutoff.sec_ref() -= g_conf.client_oc_max_dirty_age; - - dout(10) << "flush " << amount << dendl; - - /* - * NOTE: we aren't actually pulling things off the LRU here, just looking at the - * tail item. Then we call bh_write, which moves it to the other LRU, so that we - * can call lru_dirty.lru_get_next_expire() again. - */ - off_t did = 0; - while (amount == 0 || did < amount) { - BufferHead *bh = (BufferHead*) lru_dirty.lru_get_next_expire(); - if (!bh) break; - if (bh->last_write > cutoff) break; - - did += bh->length(); - bh_write(bh); - } -} - - -void ObjectCacher::trim(off_t max) -{ - if (max < 0) - max = g_conf.client_oc_size; - - dout(10) << "trim start: max " << max - << " clean " << get_stat_clean() - << dendl; - - while (get_stat_clean() > max) { - BufferHead *bh = (BufferHead*) lru_rest.lru_expire(); - if (!bh) break; - - dout(10) << "trim trimming " << *bh << dendl; - assert(bh->is_clean()); - - Object *ob = bh->ob; - bh_remove(ob, bh); - delete bh; - - if (ob->can_close()) { - dout(10) << "trim trimming " << *ob << dendl; - close_object(ob); - } - } - - dout(10) << "trim finish: max " << max - << " clean " << get_stat_clean() - << dendl; -} - - - -/* public */ - -/* - * returns # bytes read (if in cache). 
onfinish is untouched (caller must delete it) - * returns 0 if doing async read - */ -int ObjectCacher::readx(Objecter::OSDRead *rd, inodeno_t ino, Context *onfinish) -{ - bool success = true; - list hit_ls; - map stripe_map; // final buffer offset -> substring - - for (list::iterator ex_it = rd->extents.begin(); - ex_it != rd->extents.end(); - ex_it++) { - dout(10) << "readx " << *ex_it << dendl; - - // get Object cache - Object *o = get_object(ex_it->oid, ino, ex_it->layout); - - // map extent into bufferheads - map hits, missing, rx; - o->map_read(rd, hits, missing, rx); - - if (!missing.empty() || !rx.empty()) { - // read missing - for (map::iterator bh_it = missing.begin(); - bh_it != missing.end(); - bh_it++) { - bh_read(bh_it->second); - if (success) { - dout(10) << "readx missed, waiting on " << *bh_it->second - << " off " << bh_it->first << dendl; - success = false; - bh_it->second->waitfor_read[bh_it->first].push_back( new C_RetryRead(this, rd, ino, onfinish) ); - } - } - - // bump rx - for (map::iterator bh_it = rx.begin(); - bh_it != rx.end(); - bh_it++) { - touch_bh(bh_it->second); // bump in lru, so we don't lose it. - if (success) { - dout(10) << "readx missed, waiting on " << *bh_it->second - << " off " << bh_it->first << dendl; - success = false; - bh_it->second->waitfor_read[bh_it->first].push_back( new C_RetryRead(this, rd, ino, onfinish) ); - } - } - } else { - assert(!hits.empty()); - - // make a plain list - for (map::iterator bh_it = hits.begin(); - bh_it != hits.end(); - bh_it++) { - dout(10) << "readx hit bh " << *bh_it->second << dendl; - hit_ls.push_back(bh_it->second); - } - - // create reverse map of buffer offset -> object for the eventual result. 
- // this is over a single ObjectExtent, so we know that - // - the bh's are contiguous - // - the buffer frags need not be (and almost certainly aren't) - off_t opos = ex_it->start; - map::iterator bh_it = hits.begin(); - assert(bh_it->second->start() <= opos); - size_t bhoff = opos - bh_it->second->start(); - map::iterator f_it = ex_it->buffer_extents.begin(); - size_t foff = 0; - while (1) { - BufferHead *bh = bh_it->second; - assert(opos == (off_t)(bh->start() + bhoff)); - - dout(10) << "readx rmap opos " << opos - << ": " << *bh << " +" << bhoff - << " frag " << f_it->first << "~" << f_it->second << " +" << foff - << dendl; - - size_t len = MIN(f_it->second - foff, - bh->length() - bhoff); - bufferlist bit; // put substr here first, since substr_of clobbers, and - // we may get multiple bh's at this stripe_map position - bit.substr_of(bh->bl, - opos - bh->start(), - len); - stripe_map[f_it->first].claim_append(bit); - - opos += len; - bhoff += len; - foff += len; - if (opos == bh->end()) { - bh_it++; - bhoff = 0; - } - if (foff == f_it->second) { - f_it++; - foff = 0; - } - if (bh_it == hits.end()) break; - if (f_it == ex_it->buffer_extents.end()) break; - } - assert(f_it == ex_it->buffer_extents.end()); - assert(opos == ex_it->start + (off_t)ex_it->length); - } - } - - // bump hits in lru - for (list::iterator bhit = hit_ls.begin(); - bhit != hit_ls.end(); - bhit++) - touch_bh(*bhit); - - if (!success) return 0; // wait! - - // no misses... success! do the read. - assert(!hit_ls.empty()); - dout(10) << "readx has all buffers" << dendl; - - // ok, assemble into result buffer. 
- rd->bl->clear(); - size_t pos = 0; - for (map::iterator i = stripe_map.begin(); - i != stripe_map.end(); - i++) { - assert(pos == i->first); - dout(10) << "readx adding buffer len " << i->second.length() << " at " << pos << dendl; - pos += i->second.length(); - rd->bl->claim_append(i->second); - assert(rd->bl->length() == pos); - } - dout(10) << "readx result is " << rd->bl->length() << dendl; - - // done with read. - delete rd; - - trim(); - - return pos; -} - - -int ObjectCacher::writex(Objecter::OSDWrite *wr, inodeno_t ino) -{ - utime_t now = g_clock.now(); - - for (list::iterator ex_it = wr->extents.begin(); - ex_it != wr->extents.end(); - ex_it++) { - // get object cache - Object *o = get_object(ex_it->oid, ino, ex_it->layout); - - // map it all into a single bufferhead. - BufferHead *bh = o->map_write(wr); - - // adjust buffer pointers (ie "copy" data into my cache) - // this is over a single ObjectExtent, so we know that - // - there is one contiguous bh - // - the buffer frags need not be (and almost certainly aren't) - // note: i assume striping is monotonic... no jumps backwards, ever! - off_t opos = ex_it->start; - for (map::iterator f_it = ex_it->buffer_extents.begin(); - f_it != ex_it->buffer_extents.end(); - f_it++) { - dout(10) << "writex writing " << f_it->first << "~" << f_it->second << " into " << *bh << " at " << opos << dendl; - size_t bhoff = bh->start() - opos; - assert(f_it->second <= bh->length() - bhoff); - - // get the frag we're mapping in - bufferlist frag; - frag.substr_of(wr->bl, - f_it->first, f_it->second); - - // keep anything left of bhoff - bufferlist newbl; - if (bhoff) - newbl.substr_of(bh->bl, 0, bhoff); - newbl.claim_append(frag); - bh->bl.swap(newbl); - - opos += f_it->second; - } - - // ok, now bh is dirty. - mark_dirty(bh); - touch_bh(bh); - bh->last_write = now; - - o->try_merge_bh(bh); - } - - delete wr; - - trim(); - return 0; -} - - -// blocking wait for write. 
-void ObjectCacher::wait_for_write(size_t len, Mutex& lock) -{ - while (get_stat_dirty() + get_stat_tx() >= g_conf.client_oc_max_dirty) { - dout(10) << "wait_for_write waiting on " << len << ", dirty|tx " - << (get_stat_dirty() + get_stat_tx()) - << " >= " << g_conf.client_oc_max_dirty - << dendl; - flusher_cond.Signal(); - stat_waiter++; - stat_cond.Wait(lock); - stat_waiter--; - dout(10) << "wait_for_write woke up" << dendl; - } -} - -void ObjectCacher::flusher_entry() -{ - dout(10) << "flusher start" << dendl; - lock.Lock(); - while (!flusher_stop) { - while (!flusher_stop) { - off_t all = get_stat_tx() + get_stat_rx() + get_stat_clean() + get_stat_dirty(); - dout(11) << "flusher " - << all << " / " << g_conf.client_oc_size << ": " - << get_stat_tx() << " tx, " - << get_stat_rx() << " rx, " - << get_stat_clean() << " clean, " - << get_stat_dirty() << " / " << g_conf.client_oc_max_dirty << " dirty" - << dendl; - if (get_stat_dirty() > g_conf.client_oc_max_dirty) { - // flush some dirty pages - dout(10) << "flusher " - << get_stat_dirty() << " / " << g_conf.client_oc_max_dirty << " dirty," - << " flushing some dirty bhs" << dendl; - flush(get_stat_dirty() - g_conf.client_oc_max_dirty); - } - else { - // check tail of lru for old dirty items - utime_t cutoff = g_clock.now(); - cutoff.sec_ref()--; - BufferHead *bh = 0; - while ((bh = (BufferHead*)lru_dirty.lru_get_next_expire()) != 0 && - bh->last_write < cutoff) { - dout(10) << "flusher flushing aged dirty bh " << *bh << dendl; - bh_write(bh); - } - break; - } - } - if (flusher_stop) break; - flusher_cond.WaitInterval(lock, utime_t(1,0)); - } - lock.Unlock(); - dout(10) << "flusher finish" << dendl; -} - - - -// blocking. atomic+sync. -int ObjectCacher::atomic_sync_readx(Objecter::OSDRead *rd, inodeno_t ino, Mutex& lock) -{ - dout(10) << "atomic_sync_readx " << rd - << " in " << ino - << dendl; - - if (rd->extents.size() == 1) { - // single object. - // just write synchronously. 
- Cond cond; - bool done = false; - objecter->readx(rd, new C_SafeCond(&lock, &cond, &done)); - - // block - while (!done) cond.Wait(lock); - } else { - // spans multiple objects, or is big. - - // sort by object... - map by_oid; - for (list::iterator ex_it = rd->extents.begin(); - ex_it != rd->extents.end(); - ex_it++) - by_oid[ex_it->oid] = *ex_it; - - // lock - for (map::iterator i = by_oid.begin(); - i != by_oid.end(); - i++) { - Object *o = get_object(i->first, ino, i->second.layout); - rdlock(o); - } - - // readx will hose rd - list extents = rd->extents; - - // do the read, into our cache - Cond cond; - bool done = false; - readx(rd, ino, new C_SafeCond(&lock, &cond, &done)); - - // block - while (!done) cond.Wait(lock); - - // release the locks - for (list::iterator ex_it = extents.begin(); - ex_it != extents.end(); - ex_it++) { - assert(objects.count(ex_it->oid)); - Object *o = objects[ex_it->oid]; - rdunlock(o); - } - } - - return 0; -} - -int ObjectCacher::atomic_sync_writex(Objecter::OSDWrite *wr, inodeno_t ino, Mutex& lock) -{ - dout(10) << "atomic_sync_writex " << wr - << " in " << ino - << dendl; - - if (wr->extents.size() == 1 && - wr->extents.front().length <= g_conf.client_oc_max_sync_write) { - // single object. - - // make sure we aren't already locking/locked... - object_t oid = wr->extents.front().oid; - Object *o = 0; - if (objects.count(oid)) o = get_object(oid, ino, wr->extents.front().layout); - if (!o || - (o->lock_state != Object::LOCK_WRLOCK && - o->lock_state != Object::LOCK_WRLOCKING && - o->lock_state != Object::LOCK_UPGRADING)) { - // just write synchronously. - dout(10) << "atomic_sync_writex " << wr - << " in " << ino - << " doing sync write" - << dendl; - - Cond cond; - bool done = false; - objecter->modifyx(wr, new C_SafeCond(&lock, &cond, &done), 0); - - // block - while (!done) cond.Wait(lock); - return 0; - } - } - - // spans multiple objects, or is big. - // sort by object... 
- map by_oid; - for (list::iterator ex_it = wr->extents.begin(); - ex_it != wr->extents.end(); - ex_it++) - by_oid[ex_it->oid] = *ex_it; - - // wrlock - for (map::iterator i = by_oid.begin(); - i != by_oid.end(); - i++) { - Object *o = get_object(i->first, ino, i->second.layout); - wrlock(o); - } - - // writex will hose wr - list extents = wr->extents; - - // do the write, into our cache - writex(wr, ino); - - // flush - // ...and release the locks? - for (list::iterator ex_it = extents.begin(); - ex_it != extents.end(); - ex_it++) { - assert(objects.count(ex_it->oid)); - Object *o = objects[ex_it->oid]; - - wrunlock(o); - } - - return 0; -} - - - -// locking ----------------------------- - -void ObjectCacher::rdlock(Object *o) -{ - // lock? - if (o->lock_state == Object::LOCK_NONE || - o->lock_state == Object::LOCK_RDUNLOCKING || - o->lock_state == Object::LOCK_WRUNLOCKING) { - dout(10) << "rdlock rdlock " << *o << dendl; - - o->lock_state = Object::LOCK_RDLOCKING; - - C_LockAck *ack = new C_LockAck(this, o->get_oid()); - C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); - - commit->tid = - ack->tid = - o->last_write_tid = - objecter->lock(OSD_OP_RDLOCK, o->get_oid(), o->get_layout(), ack, commit); - } - - // stake our claim. - o->rdlock_ref++; - - // wait? - if (o->lock_state == Object::LOCK_RDLOCKING || - o->lock_state == Object::LOCK_WRLOCKING) { - dout(10) << "rdlock waiting for rdlock|wrlock on " << *o << dendl; - Cond cond; - bool done = false; - o->waitfor_rd.push_back(new C_SafeCond(&lock, &cond, &done)); - while (!done) cond.Wait(lock); - } - assert(o->lock_state == Object::LOCK_RDLOCK || - o->lock_state == Object::LOCK_WRLOCK || - o->lock_state == Object::LOCK_UPGRADING || - o->lock_state == Object::LOCK_DOWNGRADING); -} - -void ObjectCacher::wrlock(Object *o) -{ - // lock? 
- if (o->lock_state != Object::LOCK_WRLOCK && - o->lock_state != Object::LOCK_WRLOCKING && - o->lock_state != Object::LOCK_UPGRADING) { - dout(10) << "wrlock wrlock " << *o << dendl; - - int op = 0; - if (o->lock_state == Object::LOCK_RDLOCK) { - o->lock_state = Object::LOCK_UPGRADING; - op = OSD_OP_UPLOCK; - } else { - o->lock_state = Object::LOCK_WRLOCKING; - op = OSD_OP_WRLOCK; - } - - C_LockAck *ack = new C_LockAck(this, o->get_oid()); - C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); - - commit->tid = - ack->tid = - o->last_write_tid = - objecter->lock(op, o->get_oid(), o->get_layout(), ack, commit); - } - - // stake our claim. - o->wrlock_ref++; - - // wait? - if (o->lock_state == Object::LOCK_WRLOCKING || - o->lock_state == Object::LOCK_UPGRADING) { - dout(10) << "wrlock waiting for wrlock on " << *o << dendl; - Cond cond; - bool done = false; - o->waitfor_wr.push_back(new C_SafeCond(&lock, &cond, &done)); - while (!done) cond.Wait(lock); - } - assert(o->lock_state == Object::LOCK_WRLOCK); -} - - -void ObjectCacher::rdunlock(Object *o) -{ - dout(10) << "rdunlock " << *o << dendl; - assert(o->lock_state == Object::LOCK_RDLOCK || - o->lock_state == Object::LOCK_WRLOCK || - o->lock_state == Object::LOCK_UPGRADING || - o->lock_state == Object::LOCK_DOWNGRADING); - - assert(o->rdlock_ref > 0); - o->rdlock_ref--; - if (o->rdlock_ref > 0 || - o->wrlock_ref > 0) { - dout(10) << "rdunlock " << *o << " still has rdlock|wrlock refs" << dendl; - return; - } - - release(o); // release first - - o->lock_state = Object::LOCK_RDUNLOCKING; - - C_LockAck *lockack = new C_LockAck(this, o->get_oid()); - C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); - commit->tid = - lockack->tid = - o->last_write_tid = - objecter->lock(OSD_OP_RDUNLOCK, o->get_oid(), o->get_layout(), lockack, commit); -} - -void ObjectCacher::wrunlock(Object *o) -{ - dout(10) << "wrunlock " << *o << dendl; - assert(o->lock_state == Object::LOCK_WRLOCK); - - 
assert(o->wrlock_ref > 0); - o->wrlock_ref--; - if (o->wrlock_ref > 0) { - dout(10) << "wrunlock " << *o << " still has wrlock refs" << dendl; - return; - } - - flush(o); // flush first - - int op = 0; - if (o->rdlock_ref > 0) { - dout(10) << "wrunlock rdlock " << *o << dendl; - op = OSD_OP_DNLOCK; - o->lock_state = Object::LOCK_DOWNGRADING; - } else { - dout(10) << "wrunlock wrunlock " << *o << dendl; - op = OSD_OP_WRUNLOCK; - o->lock_state = Object::LOCK_WRUNLOCKING; - } - - C_LockAck *lockack = new C_LockAck(this, o->get_oid()); - C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); - commit->tid = - lockack->tid = - o->last_write_tid = - objecter->lock(op, o->get_oid(), o->get_layout(), lockack, commit); -} - - -// ------------------------------------------------- - - -bool ObjectCacher::set_is_cached(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) - return false; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - if (!ob->data.empty()) return true; - } - - return false; -} - -bool ObjectCacher::set_is_dirty_or_committing(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) - return false; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - for (map::iterator p = ob->data.begin(); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - if (bh->is_dirty() || bh->is_tx()) - return true; - } - } - - return false; -} - - -// purge. non-blocking. violently removes dirty buffers from cache. 
-void ObjectCacher::purge(Object *ob) -{ - dout(10) << "purge " << *ob << dendl; - - for (map::iterator p = ob->data.begin(); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - if (!bh->is_clean()) - dout(0) << "purge forcibly removing " << *ob << " " << *bh << dendl; - bh_remove(ob, bh); - delete bh; - } - - if (ob->can_close()) { - dout(10) << "trim trimming " << *ob << dendl; - close_object(ob); - } -} - -// flush. non-blocking. no callback. -// true if clean, already flushed. -// false if we wrote something. -bool ObjectCacher::flush(Object *ob) -{ - bool clean = true; - for (map::iterator p = ob->data.begin(); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - if (bh->is_tx()) { - clean = false; - continue; - } - if (!bh->is_dirty()) continue; - - bh_write(bh); - clean = false; - } - return clean; -} - -// flush. non-blocking, takes callback. -// returns true if already flushed -bool ObjectCacher::flush_set(inodeno_t ino, Context *onfinish) -{ - if (objects_by_ino.count(ino) == 0) { - dout(10) << "flush_set on " << ino << " dne" << dendl; - return true; - } - - dout(10) << "flush_set " << ino << dendl; - - C_Gather *gather = 0; // we'll need to wait for all objects to flush! - - set& s = objects_by_ino[ino]; - bool safe = true; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - if (!flush(ob)) { - // we'll need to gather... - if (!gather && onfinish) - gather = new C_Gather(onfinish); - safe = false; - - dout(10) << "flush_set " << ino << " will wait for ack tid " - << ob->last_write_tid - << " on " << *ob - << dendl; - if (gather) - ob->waitfor_ack[ob->last_write_tid].push_back(gather->new_sub()); - } - } - - if (safe) { - dout(10) << "flush_set " << ino << " has no dirty|tx bhs" << dendl; - return true; - } - return false; -} - - -// commit. non-blocking, takes callback. -// return true if already flushed. 
-bool ObjectCacher::commit_set(inodeno_t ino, Context *onfinish) -{ - assert(onfinish); // doesn't make any sense otherwise. - - if (objects_by_ino.count(ino) == 0) { - dout(10) << "commit_set on " << ino << " dne" << dendl; - return true; - } - - dout(10) << "commit_set " << ino << dendl; - - C_Gather *gather = 0; // we'll need to wait for all objects to commit - - set& s = objects_by_ino[ino]; - bool safe = true; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - // make sure it's flushing. - flush_set(ino); - - if (ob->last_write_tid > ob->last_commit_tid) { - dout(10) << "commit_set " << ino << " " << *ob - << " will finish on commit tid " << ob->last_write_tid - << dendl; - if (!gather && onfinish) gather = new C_Gather(onfinish); - safe = false; - if (gather) - ob->waitfor_commit[ob->last_write_tid].push_back( gather->new_sub() ); - } - } - - if (safe) { - dout(10) << "commit_set " << ino << " all committed" << dendl; - return true; - } - return false; -} - -void ObjectCacher::purge_set(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) { - dout(10) << "purge_set on " << ino << " dne" << dendl; - return; - } - - dout(10) << "purge_set " << ino << dendl; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - purge(ob); - } -} - - -off_t ObjectCacher::release(Object *ob) -{ - list clean; - off_t o_unclean = 0; - - for (map::iterator p = ob->data.begin(); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - if (bh->is_clean()) - clean.push_back(bh); - else - o_unclean += bh->length(); - } - - for (list::iterator p = clean.begin(); - p != clean.end(); - p++) { - bh_remove(ob, *p); - delete *p; - } - - if (ob->can_close()) { - dout(10) << "trim trimming " << *ob << dendl; - close_object(ob); - } - - return o_unclean; -} - -off_t ObjectCacher::release_set(inodeno_t ino) -{ - // return # bytes not clean (and thus not released). 
- off_t unclean = 0; - - if (objects_by_ino.count(ino) == 0) { - dout(10) << "release_set on " << ino << " dne" << dendl; - return 0; - } - - dout(10) << "release_set " << ino << dendl; - - set s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - off_t o_unclean = release(ob); - unclean += o_unclean; - - if (o_unclean) - dout(10) << "release_set " << ino << " " << *ob - << " has " << o_unclean << " bytes left" - << dendl; - - } - - if (unclean) { - dout(10) << "release_set " << ino - << ", " << unclean << " bytes left" << dendl; - } - - return unclean; -} - -void ObjectCacher::truncate_set(inodeno_t ino, list& exls) -{ - if (objects_by_ino.count(ino) == 0) { - dout(10) << "truncate_set on " << ino << " dne" << dendl; - return; - } - - dout(10) << "truncate_set " << ino << dendl; - - for (list::iterator p = exls.begin(); - p != exls.end(); - ++p) { - ObjectExtent &ex = *p; - if (objects.count(ex.oid) == 0) continue; - Object *ob = objects[ex.oid]; - - // purge or truncate? 
- if (ex.start == 0) { - dout(10) << "truncate_set purging " << *ob << dendl; - purge(ob); - } else { - // hrm, truncate object - dout(10) << "truncate_set truncating " << *ob << " at " << ex.start << dendl; - ob->truncate(ex.start); - - if (ob->can_close()) { - dout(10) << "truncate_set trimming " << *ob << dendl; - close_object(ob); - } - } - } -} - - -void ObjectCacher::kick_sync_writers(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) { - dout(10) << "kick_sync_writers on " << ino << " dne" << dendl; - return; - } - - dout(10) << "kick_sync_writers on " << ino << dendl; - - list ls; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - ls.splice(ls.begin(), ob->waitfor_wr); - } - - finish_contexts(ls); -} - -void ObjectCacher::kick_sync_readers(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) { - dout(10) << "kick_sync_readers on " << ino << " dne" << dendl; - return; - } - - dout(10) << "kick_sync_readers on " << ino << dendl; - - list ls; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - ls.splice(ls.begin(), ob->waitfor_rd); - } - - finish_contexts(ls); -} - - - diff --git a/branches/sage/ebofs2/osdc/ObjectCacher.h b/branches/sage/ebofs2/osdc/ObjectCacher.h deleted file mode 100644 index f1d057beef99c..0000000000000 --- a/branches/sage/ebofs2/osdc/ObjectCacher.h +++ /dev/null @@ -1,566 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -#ifndef __OBJECTCACHER_H_ -#define __OBJECTCACHER_H_ - -#include "include/types.h" -#include "include/lru.h" -#include "include/Context.h" - -#include "common/Cond.h" -#include "common/Thread.h" - -#include "Objecter.h" -#include "Filer.h" - -class Objecter; -class Objecter::OSDRead; -class Objecter::OSDWrite; - -class ObjectCacher { - public: - - class Object; - - // ******* BufferHead ********* - class BufferHead : public 
LRUObject { - public: - // states - static const int STATE_MISSING = 0; - static const int STATE_CLEAN = 1; - static const int STATE_DIRTY = 2; - static const int STATE_RX = 3; - static const int STATE_TX = 4; - - private: - // my fields - int state; - int ref; - struct { - off_t start, length; // bh extent in object - } ex; - - public: - Object *ob; - bufferlist bl; - tid_t last_write_tid; // version of bh (if non-zero) - utime_t last_write; - - map< off_t, list > waitfor_read; - - public: - // cons - BufferHead(Object *o) : - state(STATE_MISSING), - ref(0), - ob(o), - last_write_tid(0) {} - - // extent - off_t start() { return ex.start; } - void set_start(off_t s) { ex.start = s; } - off_t length() { return ex.length; } - void set_length(off_t l) { ex.length = l; } - off_t end() { return ex.start + ex.length; } - off_t last() { return end() - 1; } - - // states - void set_state(int s) { - if (s == STATE_RX || s == STATE_TX) get(); - if (state == STATE_RX || state == STATE_TX) put(); - state = s; - } - int get_state() { return state; } - - bool is_missing() { return state == STATE_MISSING; } - bool is_dirty() { return state == STATE_DIRTY; } - bool is_clean() { return state == STATE_CLEAN; } - bool is_tx() { return state == STATE_TX; } - bool is_rx() { return state == STATE_RX; } - - // reference counting - int get() { - assert(ref >= 0); - if (ref == 0) lru_pin(); - return ++ref; - } - int put() { - assert(ref > 0); - if (ref == 1) lru_unpin(); - --ref; - return ref; - } - }; - - - // ******* Object ********* - class Object { - private: - // ObjectCacher::Object fields - ObjectCacher *oc; - object_t oid; // this _always_ is oid.rev=0 - inodeno_t ino; - objectrev_t rev; // last rev we're written - ObjectLayout layout; - - public: - map data; - - tid_t last_write_tid; // version of bh (if non-zero) - tid_t last_ack_tid; // last update acked. - tid_t last_commit_tid; // last update commited. 
- - map< tid_t, list > waitfor_ack; - map< tid_t, list > waitfor_commit; - list waitfor_rd; - list waitfor_wr; - - // lock - static const int LOCK_NONE = 0; - static const int LOCK_WRLOCKING = 1; - static const int LOCK_WRLOCK = 2; - static const int LOCK_WRUNLOCKING = 3; - static const int LOCK_RDLOCKING = 4; - static const int LOCK_RDLOCK = 5; - static const int LOCK_RDUNLOCKING = 6; - static const int LOCK_UPGRADING = 7; // rd -> wr - static const int LOCK_DOWNGRADING = 8; // wr -> rd - int lock_state; - int wrlock_ref; // how many ppl want or are using a WRITE lock - int rdlock_ref; // how many ppl want or are using a READ lock - - public: - Object(ObjectCacher *_oc, object_t o, inodeno_t i, ObjectLayout& l) : - oc(_oc), - oid(o), ino(i), layout(l), - last_write_tid(0), last_ack_tid(0), last_commit_tid(0), - lock_state(LOCK_NONE), wrlock_ref(0), rdlock_ref(0) - {} - ~Object() { - assert(data.empty()); - } - - object_t get_oid() { return oid; } - inodeno_t get_ino() { return ino; } - - ObjectLayout& get_layout() { return layout; } - void set_layout(ObjectLayout& l) { layout = l; } - - bool can_close() { - return data.empty() && lock_state == LOCK_NONE && - waitfor_ack.empty() && waitfor_commit.empty() && - waitfor_rd.empty() && waitfor_wr.empty(); - } - - // bh - void add_bh(BufferHead *bh) { - // add to my map - assert(data.count(bh->start()) == 0); - - if (0) { // sanity check FIXME DEBUG - //cout << "add_bh " << bh->start() << "~" << bh->length() << endl; - map::iterator p = data.lower_bound(bh->start()); - if (p != data.end()) { - //cout << " after " << *p->second << endl; - //cout << " after starts at " << p->first << endl; - assert(p->first >= bh->end()); - } - if (p != data.begin()) { - p--; - //cout << " before starts at " << p->second->start() - //<< " and ends at " << p->second->end() << endl; - //cout << " before " << *p->second << endl; - assert(p->second->end() <= bh->start()); - } - } - - data[bh->start()] = bh; - } - void remove_bh(BufferHead *bh) 
{ - assert(data.count(bh->start())); - data.erase(bh->start()); - } - bool is_empty() { return data.empty(); } - - // mid-level - BufferHead *split(BufferHead *bh, off_t off); - void merge_left(BufferHead *left, BufferHead *right); - void try_merge_bh(BufferHead *bh); - - int map_read(Objecter::OSDRead *rd, - map& hits, - map& missing, - map& rx); - BufferHead *map_write(Objecter::OSDWrite *wr); - - void truncate(off_t s); - - }; - - // ******* ObjectCacher ********* - // ObjectCacher fields - public: - Objecter *objecter; - Filer filer; - - private: - Mutex& lock; - - hash_map objects; - hash_map > objects_by_ino; - - set dirty_bh; - LRU lru_dirty, lru_rest; - - Cond flusher_cond; - bool flusher_stop; - void flusher_entry(); - class FlusherThread : public Thread { - ObjectCacher *oc; - public: - FlusherThread(ObjectCacher *o) : oc(o) {} - void *entry() { - oc->flusher_entry(); - return 0; - } - } flusher_thread; - - - // objects - Object *get_object(object_t oid, inodeno_t ino, ObjectLayout &l) { - // have it? - if (objects.count(oid)) - return objects[oid]; - - // create it. 
- Object *o = new Object(this, oid, ino, l); - objects[oid] = o; - objects_by_ino[ino].insert(o); - return o; - } - void close_object(Object *ob); - - // bh stats - Cond stat_cond; - int stat_waiter; - - off_t stat_clean; - off_t stat_dirty; - off_t stat_rx; - off_t stat_tx; - off_t stat_missing; - - void bh_stat_add(BufferHead *bh) { - switch (bh->get_state()) { - case BufferHead::STATE_MISSING: stat_missing += bh->length(); break; - case BufferHead::STATE_CLEAN: stat_clean += bh->length(); break; - case BufferHead::STATE_DIRTY: stat_dirty += bh->length(); break; - case BufferHead::STATE_TX: stat_tx += bh->length(); break; - case BufferHead::STATE_RX: stat_rx += bh->length(); break; - } - if (stat_waiter) stat_cond.Signal(); - } - void bh_stat_sub(BufferHead *bh) { - switch (bh->get_state()) { - case BufferHead::STATE_MISSING: stat_missing -= bh->length(); break; - case BufferHead::STATE_CLEAN: stat_clean -= bh->length(); break; - case BufferHead::STATE_DIRTY: stat_dirty -= bh->length(); break; - case BufferHead::STATE_TX: stat_tx -= bh->length(); break; - case BufferHead::STATE_RX: stat_rx -= bh->length(); break; - } - } - off_t get_stat_tx() { return stat_tx; } - off_t get_stat_rx() { return stat_rx; } - off_t get_stat_dirty() { return stat_dirty; } - off_t get_stat_clean() { return stat_clean; } - - void touch_bh(BufferHead *bh) { - if (bh->is_dirty()) - lru_dirty.lru_touch(bh); - else - lru_rest.lru_touch(bh); - } - - // bh states - void bh_set_state(BufferHead *bh, int s) { - // move between lru lists? 
- if (s == BufferHead::STATE_DIRTY && bh->get_state() != BufferHead::STATE_DIRTY) { - lru_rest.lru_remove(bh); - lru_dirty.lru_insert_top(bh); - dirty_bh.insert(bh); - } - if (s != BufferHead::STATE_DIRTY && bh->get_state() == BufferHead::STATE_DIRTY) { - lru_dirty.lru_remove(bh); - lru_rest.lru_insert_mid(bh); - dirty_bh.erase(bh); - } - - // set state - bh_stat_sub(bh); - bh->set_state(s); - bh_stat_add(bh); - } - - void copy_bh_state(BufferHead *bh1, BufferHead *bh2) { - bh_set_state(bh2, bh1->get_state()); - } - - void mark_missing(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_MISSING); }; - void mark_clean(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_CLEAN); }; - void mark_rx(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_RX); }; - void mark_tx(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_TX); }; - void mark_dirty(BufferHead *bh) { - bh_set_state(bh, BufferHead::STATE_DIRTY); - lru_dirty.lru_touch(bh); - //bh->set_dirty_stamp(g_clock.now()); - }; - - void bh_add(Object *ob, BufferHead *bh) { - ob->add_bh(bh); - if (bh->is_dirty()) { - lru_dirty.lru_insert_top(bh); - dirty_bh.insert(bh); - } else { - lru_rest.lru_insert_top(bh); - } - bh_stat_add(bh); - } - void bh_remove(Object *ob, BufferHead *bh) { - ob->remove_bh(bh); - if (bh->is_dirty()) { - lru_dirty.lru_remove(bh); - dirty_bh.erase(bh); - } else { - lru_rest.lru_remove(bh); - } - bh_stat_sub(bh); - } - - // io - void bh_read(BufferHead *bh); - void bh_write(BufferHead *bh); - - void trim(off_t max=-1); - void flush(off_t amount=0); - - bool flush(Object *o); - off_t release(Object *o); - void purge(Object *o); - - void rdlock(Object *o); - void rdunlock(Object *o); - void wrlock(Object *o); - void wrunlock(Object *o); - - public: - void bh_read_finish(object_t oid, off_t offset, size_t length, bufferlist &bl); - void bh_write_ack(object_t oid, off_t offset, size_t length, tid_t t); - void bh_write_commit(object_t oid, off_t offset, size_t length, tid_t t); - void 
lock_ack(list& oids, tid_t tid); - - class C_ReadFinish : public Context { - ObjectCacher *oc; - object_t oid; - off_t start; - size_t length; - public: - bufferlist bl; - C_ReadFinish(ObjectCacher *c, object_t o, off_t s, size_t l) : oc(c), oid(o), start(s), length(l) {} - void finish(int r) { - oc->bh_read_finish(oid, start, length, bl); - } - }; - - class C_WriteAck : public Context { - ObjectCacher *oc; - object_t oid; - off_t start; - size_t length; - public: - tid_t tid; - C_WriteAck(ObjectCacher *c, object_t o, off_t s, size_t l) : oc(c), oid(o), start(s), length(l) {} - void finish(int r) { - oc->bh_write_ack(oid, start, length, tid); - } - }; - class C_WriteCommit : public Context { - ObjectCacher *oc; - object_t oid; - off_t start; - size_t length; - public: - tid_t tid; - C_WriteCommit(ObjectCacher *c, object_t o, off_t s, size_t l) : oc(c), oid(o), start(s), length(l) {} - void finish(int r) { - oc->bh_write_commit(oid, start, length, tid); - } - }; - - class C_LockAck : public Context { - ObjectCacher *oc; - public: - list oids; - tid_t tid; - C_LockAck(ObjectCacher *c, object_t o) : oc(c) { - oids.push_back(o); - } - void finish(int r) { - oc->lock_ack(oids, tid); - } - }; - - - - public: - ObjectCacher(Objecter *o, Mutex& l) : - objecter(o), filer(o), lock(l), - flusher_stop(false), flusher_thread(this), - stat_waiter(0), - stat_clean(0), stat_dirty(0), stat_rx(0), stat_tx(0), stat_missing(0) { - flusher_thread.create(); - } - ~ObjectCacher() { - // we should be empty. - assert(objects.empty()); - assert(lru_rest.lru_get_size() == 0); - assert(lru_dirty.lru_get_size() == 0); - assert(dirty_bh.empty()); - - assert(flusher_thread.is_started()); - lock.Lock(); // hmm.. watch out for deadlock! 
- flusher_stop = true; - flusher_cond.Signal(); - lock.Unlock(); - flusher_thread.join(); - } - - - class C_RetryRead : public Context { - ObjectCacher *oc; - Objecter::OSDRead *rd; - inodeno_t ino; - Context *onfinish; - public: - C_RetryRead(ObjectCacher *_oc, Objecter::OSDRead *r, inodeno_t i, Context *c) : oc(_oc), rd(r), ino(i), onfinish(c) {} - void finish(int) { - int r = oc->readx(rd, ino, onfinish); - if (r > 0) { - onfinish->finish(r); - delete onfinish; - } - } - }; - - // non-blocking. async. - int readx(Objecter::OSDRead *rd, inodeno_t ino, Context *onfinish); - int writex(Objecter::OSDWrite *wr, inodeno_t ino); - - // write blocking - void wait_for_write(size_t len, Mutex& lock); - - // blocking. atomic+sync. - int atomic_sync_readx(Objecter::OSDRead *rd, inodeno_t ino, Mutex& lock); - int atomic_sync_writex(Objecter::OSDWrite *wr, inodeno_t ino, Mutex& lock); - - bool set_is_cached(inodeno_t ino); - bool set_is_dirty_or_committing(inodeno_t ino); - - bool flush_set(inodeno_t ino, Context *onfinish=0); - void flush_all(Context *onfinish=0); - - bool commit_set(inodeno_t ino, Context *oncommit); - void commit_all(Context *oncommit=0); - - void purge_set(inodeno_t ino); - - off_t release_set(inodeno_t ino); // returns # of bytes not released (ie non-clean) - - void truncate_set(inodeno_t ino, list& ex); - - void kick_sync_writers(inodeno_t ino); - void kick_sync_readers(inodeno_t ino); - - - // file functions - - /*** async+caching (non-blocking) file interface ***/ - int file_read(inode_t& inode, - off_t offset, size_t len, - bufferlist *bl, - Context *onfinish) { - Objecter::OSDRead *rd = new Objecter::OSDRead(bl); - filer.file_to_extents(inode, offset, len, rd->extents); - return readx(rd, inode.ino, onfinish); - } - - int file_write(inode_t& inode, - off_t offset, size_t len, - bufferlist& bl, - objectrev_t rev=0) { - Objecter::OSDWrite *wr = new Objecter::OSDWrite(bl); - filer.file_to_extents(inode, offset, len, wr->extents); - return writex(wr, 
inode.ino); - } - - - - /*** sync+blocking file interface ***/ - - int file_atomic_sync_read(inode_t& inode, - off_t offset, size_t len, - bufferlist *bl, - Mutex &lock) { - Objecter::OSDRead *rd = new Objecter::OSDRead(bl); - filer.file_to_extents(inode, offset, len, rd->extents); - return atomic_sync_readx(rd, inode.ino, lock); - } - - int file_atomic_sync_write(inode_t& inode, - off_t offset, size_t len, - bufferlist& bl, - Mutex &lock, - objectrev_t rev=0) { - Objecter::OSDWrite *wr = new Objecter::OSDWrite(bl); - filer.file_to_extents(inode, offset, len, wr->extents); - return atomic_sync_writex(wr, inode.ino, lock); - } - -}; - - -inline ostream& operator<<(ostream& out, ObjectCacher::BufferHead &bh) -{ - out << "bh[" - << bh.start() << "~" << bh.length() - << " (" << bh.bl.length() << ")" - << " v " << bh.last_write_tid; - if (bh.is_tx()) out << " tx"; - if (bh.is_rx()) out << " rx"; - if (bh.is_dirty()) out << " dirty"; - if (bh.is_clean()) out << " clean"; - if (bh.is_missing()) out << " missing"; - if (bh.bl.length() > 0) out << " firstbyte=" << (int)bh.bl[0]; - out << "]"; - return out; -} - -inline ostream& operator<<(ostream& out, ObjectCacher::Object &ob) -{ - out << "object[" - << hex << ob.get_oid() << " ino " << ob.get_ino() << dec - << " wr " << ob.last_write_tid << "/" << ob.last_ack_tid << "/" << ob.last_commit_tid; - - switch (ob.lock_state) { - case ObjectCacher::Object::LOCK_WRLOCKING: out << " wrlocking"; break; - case ObjectCacher::Object::LOCK_WRLOCK: out << " wrlock"; break; - case ObjectCacher::Object::LOCK_WRUNLOCKING: out << " wrunlocking"; break; - case ObjectCacher::Object::LOCK_RDLOCKING: out << " rdlocking"; break; - case ObjectCacher::Object::LOCK_RDLOCK: out << " rdlock"; break; - case ObjectCacher::Object::LOCK_RDUNLOCKING: out << " rdunlocking"; break; - } - - out << "]"; - return out; -} - -#endif diff --git a/branches/sage/ebofs2/osdc/Objecter.cc b/branches/sage/ebofs2/osdc/Objecter.cc deleted file mode 100644 index 
84563b0af9720..0000000000000 --- a/branches/sage/ebofs2/osdc/Objecter.cc +++ /dev/null @@ -1,913 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "Objecter.h" -#include "osd/OSDMap.h" -#include "mon/MonMap.h" - -#include "msg/Messenger.h" -#include "msg/Message.h" - -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" - -#include "messages/MOSDFailure.h" - -#include - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) *_dout << dbeginl << g_clock.now() << " " << messenger->get_myname() << ".objecter " -#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) *_derr << dbeginl << g_clock.now() << " " << messenger->get_myname() << ".objecter " - - -// messages ------------------------------ - -void Objecter::init() -{ - assert(client_lock.is_locked()); // otherwise event cancellation is unsafe - timer.add_event_after(g_conf.objecter_tick_interval, new C_Tick(this)); -} - -void Objecter::shutdown() -{ - assert(client_lock.is_locked()); // otherwise event cancellation is unsafe - timer.cancel_all(); -} - - -void Objecter::dispatch(Message *m) -{ - switch (m->get_type()) { - case MSG_OSD_OPREPLY: - handle_osd_op_reply((MOSDOpReply*)m); - break; - - case MSG_OSD_MAP: - handle_osd_map((MOSDMap*)m); - break; - - default: - dout(1) << "don't know message type " << m->get_type() << dendl; - assert(0); - } -} - -void Objecter::handle_osd_map(MOSDMap *m) -{ - assert(osdmap); - - if (m->get_last() <= osdmap->get_epoch()) { - dout(3) << "handle_osd_map ignoring epochs 
[" - << m->get_first() << "," << m->get_last() - << "] <= " << osdmap->get_epoch() << dendl; - } - else { - dout(3) << "handle_osd_map got epochs [" - << m->get_first() << "," << m->get_last() - << "] > " << osdmap->get_epoch() - << dendl; - - set changed_pgs; - - for (epoch_t e = osdmap->get_epoch() + 1; - e <= m->get_last(); - e++) { - if (m->incremental_maps.count(e)) { - dout(3) << "handle_osd_map decoding incremental epoch " << e << dendl; - OSDMap::Incremental inc; - int off = 0; - inc.decode(m->incremental_maps[e], off); - osdmap->apply_incremental(inc); - - // notify messenger - for (map >::iterator i = inc.new_down.begin(); - i != inc.new_down.end(); - i++) - messenger->mark_down(i->second.first.addr); - - } - else if (m->maps.count(e)) { - dout(3) << "handle_osd_map decoding full epoch " << e << dendl; - osdmap->decode(m->maps[e]); - } - else { - dout(3) << "handle_osd_map requesting missing epoch " << osdmap->get_epoch()+1 << dendl; - int mon = monmap->pick_mon(); - messenger->send_message(new MOSDGetMap(osdmap->get_epoch()+1), - monmap->get_inst(mon)); - break; - } - - // scan pgs for changes - scan_pgs(changed_pgs); - - assert(e == osdmap->get_epoch()); - } - - // kick requests who might be timing out on the wrong osds - if (!changed_pgs.empty()) - kick_requests(changed_pgs); - } - - delete m; -} - - -void Objecter::maybe_request_map() -{ - utime_t now; - if (!osdmap) goto yes; - if (last_epoch_requested <= osdmap->get_epoch()) goto yes; - now = g_clock.now(); - if (now - last_epoch_requested_stamp > g_conf.objecter_map_request_interval) goto yes; - return; - - yes: - dout(10) << "maybe_request_map requesting next osd map" << dendl; - last_epoch_requested_stamp = now; - last_epoch_requested = osdmap->get_epoch()+1; - messenger->send_message(new MOSDGetMap(osdmap->get_epoch(), last_epoch_requested), - monmap->get_inst(monmap->pick_mon())); -} - - - -void Objecter::scan_pgs(set& changed_pgs) -{ - dout(10) << "scan_pgs" << dendl; - - for 
(hash_map::iterator i = pg_map.begin(); - i != pg_map.end(); - i++) { - pg_t pgid = i->first; - PG& pg = i->second; - - // calc new. - vector other; - osdmap->pg_to_acting_osds(pgid, other); - - if (other == pg.acting) - continue; // no change. - - other.swap(pg.acting); - - if (g_conf.osd_rep == OSD_REP_PRIMARY) { - // same primary? - if (!other.empty() && - !pg.acting.empty() && - other[0] == pg.acting[0]) - continue; - } - else if (g_conf.osd_rep == OSD_REP_SPLAY) { - // same primary and acker? - if (!other.empty() && - !pg.acting.empty() && - other[0] == pg.acting[0] && - other[other.size() > 1 ? 1:0] == pg.acting[pg.acting.size() > 1 ? 1:0]) - continue; - } - else if (g_conf.osd_rep == OSD_REP_CHAIN) { - // any change is significant. - } - - // changed significantly. - dout(10) << "scan_pgs pg " << pgid - << " (" << pg.active_tids << ")" - << " " << other << " -> " << pg.acting - << dendl; - changed_pgs.insert(pgid); - } -} - -void Objecter::kick_requests(set& changed_pgs) -{ - dout(10) << "kick_requests in pgs " << changed_pgs << dendl; - - for (set::iterator i = changed_pgs.begin(); - i != changed_pgs.end(); - i++) { - pg_t pgid = *i; - PG& pg = pg_map[pgid]; - - // resubmit ops! 
- set tids; - tids.swap( pg.active_tids ); - close_pg( pgid ); // will pbly reopen, unless it's just commits we're missing - - for (set::iterator p = tids.begin(); - p != tids.end(); - p++) { - tid_t tid = *p; - - if (op_modify.count(tid)) { - OSDModify *wr = op_modify[tid]; - op_modify.erase(tid); - - // WRITE - if (wr->tid_version.count(tid)) { - if (wr->op == OSD_OP_WRITE && - !g_conf.objecter_buffer_uncommitted) { - dout(0) << "kick_requests missing commit, cannot replay: objecter_buffer_uncommitted == FALSE" << dendl; - } else { - dout(3) << "kick_requests missing commit, replay write " << tid - << " v " << wr->tid_version[tid] << dendl; - modifyx_submit(wr, wr->waitfor_commit[tid], tid); - } - } - else if (wr->waitfor_ack.count(tid)) { - dout(3) << "kick_requests missing ack, resub write " << tid << dendl; - modifyx_submit(wr, wr->waitfor_ack[tid], tid); - } - } - - else if (op_read.count(tid)) { - // READ - OSDRead *rd = op_read[tid]; - op_read.erase(tid); - dout(3) << "kick_requests resub read " << tid << dendl; - - // resubmit - readx_submit(rd, rd->ops[tid], true); - rd->ops.erase(tid); - } - - else if (op_stat.count(tid)) { - OSDStat *st = op_stat[tid]; - op_stat.erase(tid); - - dout(3) << "kick_requests resub stat " << tid << dendl; - - // resubmit - stat_submit(st); - } - - else - assert(0); - } - } -} - - -void Objecter::tick() -{ - dout(10) << "tick" << dendl; - - // look for laggy pgs - utime_t cutoff = g_clock.now(); - cutoff -= g_conf.objecter_timeout; // timeout - for (hash_map::iterator i = pg_map.begin(); - i != pg_map.end(); - i++) { - if (!i->second.active_tids.empty() && - i->second.last < cutoff) { - dout(10) << "tick pg " << i->first << " is laggy" << dendl; - maybe_request_map(); - break; - } - } - - // reschedule - timer.add_event_after(g_conf.objecter_tick_interval, new C_Tick(this)); -} - - - -void Objecter::handle_osd_op_reply(MOSDOpReply *m) -{ - // read or modify? 
- switch (m->get_op()) { - case OSD_OP_READ: - handle_osd_read_reply(m); - break; - - case OSD_OP_STAT: - handle_osd_stat_reply(m); - break; - - case OSD_OP_WRNOOP: - case OSD_OP_WRITE: - case OSD_OP_ZERO: - case OSD_OP_DELETE: - case OSD_OP_WRUNLOCK: - case OSD_OP_WRLOCK: - case OSD_OP_RDLOCK: - case OSD_OP_RDUNLOCK: - case OSD_OP_UPLOCK: - case OSD_OP_DNLOCK: - handle_osd_modify_reply(m); - break; - - default: - assert(0); - } -} - - - -// stat ----------------------------------- - -tid_t Objecter::stat(object_t oid, off_t *size, ObjectLayout ol, Context *onfinish) -{ - OSDStat *st = new OSDStat(size); - st->extents.push_back(ObjectExtent(oid, 0, 0)); - st->extents.front().layout = ol; - st->onfinish = onfinish; - - return stat_submit(st); -} - -tid_t Objecter::stat_submit(OSDStat *st) -{ - // find OSD - ObjectExtent &ex = st->extents.front(); - PG &pg = get_pg( ex.layout.pgid ); - - // pick tid - last_tid++; - assert(client_inc >= 0); - - // add to gather set - st->tid = last_tid; - op_stat[last_tid] = st; - - pg.active_tids.insert(last_tid); - - // send? - - dout(10) << "stat_submit " << st << " tid " << last_tid - << " oid " << ex.oid - << " " << ex.layout - << " osd" << pg.acker() - << dendl; - - if (pg.acker() >= 0) { - MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, last_tid, - ex.oid, ex.layout, osdmap->get_epoch(), - OSD_OP_STAT); - - messenger->send_message(m, osdmap->get_inst(pg.acker())); - } - - return last_tid; -} - -void Objecter::handle_osd_stat_reply(MOSDOpReply *m) -{ - // get pio - tid_t tid = m->get_tid(); - - if (op_stat.count(tid) == 0) { - dout(7) << "handle_osd_stat_reply " << tid << " ... 
stray" << dendl; - delete m; - return; - } - - dout(7) << "handle_osd_stat_reply " << tid - << " r=" << m->get_result() - << " size=" << m->get_object_size() - << dendl; - OSDStat *st = op_stat[ tid ]; - op_stat.erase( tid ); - - // remove from osd/tid maps - PG& pg = get_pg( m->get_pg() ); - assert(pg.active_tids.count(tid)); - pg.active_tids.erase(tid); - if (pg.active_tids.empty()) close_pg( m->get_pg() ); - - // success? - if (m->get_result() == -EAGAIN) { - dout(7) << " got -EAGAIN, resubmitting" << dendl; - stat_submit(st); - delete m; - return; - } - //assert(m->get_result() >= 0); - - // ok! - if (m->get_result() < 0) { - *st->size = -1; - } else { - *st->size = m->get_object_size(); - } - - // finish, clean up - Context *onfinish = st->onfinish; - - // done - delete st; - if (onfinish) { - onfinish->finish(m->get_result()); - delete onfinish; - } - - delete m; -} - - -// read ----------------------------------- - - -tid_t Objecter::read(object_t oid, off_t off, size_t len, ObjectLayout ol, bufferlist *bl, - Context *onfinish) -{ - OSDRead *rd = new OSDRead(bl); - rd->extents.push_back(ObjectExtent(oid, off, len)); - rd->extents.front().layout = ol; - readx(rd, onfinish); - return last_tid; -} - - -tid_t Objecter::readx(OSDRead *rd, Context *onfinish) -{ - rd->onfinish = onfinish; - - // issue reads - for (list::iterator it = rd->extents.begin(); - it != rd->extents.end(); - it++) - readx_submit(rd, *it); - - return last_tid; -} - -tid_t Objecter::readx_submit(OSDRead *rd, ObjectExtent &ex, bool retry) -{ - // find OSD - PG &pg = get_pg( ex.layout.pgid ); - - // pick tid - last_tid++; - assert(client_inc >= 0); - - // add to gather set - rd->ops[last_tid] = ex; - op_read[last_tid] = rd; - - pg.active_tids.insert(last_tid); - pg.last = g_clock.now(); - - // send? 
- dout(10) << "readx_submit " << rd << " tid " << last_tid - << " oid " << ex.oid << " " << ex.start << "~" << ex.length - << " (" << ex.buffer_extents.size() << " buffer fragments)" - << " " << ex.layout - << " osd" << pg.acker() - << dendl; - - if (pg.acker() >= 0) { - MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, last_tid, - ex.oid, ex.layout, osdmap->get_epoch(), - OSD_OP_READ); - m->set_length(ex.length); - m->set_offset(ex.start); - m->set_retry_attempt(retry); - - int who = pg.acker(); - if (rd->balance_reads) { - int replica = messenger->get_myname().num() % pg.acting.size(); - who = pg.acting[replica]; - dout(-10) << "readx_submit reading from random replica " << replica - << " = osd" << who << dendl; - } - messenger->send_message(m, osdmap->get_inst(who)); - } else - maybe_request_map(); - - return last_tid; -} - - -void Objecter::handle_osd_read_reply(MOSDOpReply *m) -{ - // get pio - tid_t tid = m->get_tid(); - - if (op_read.count(tid) == 0) { - dout(7) << "handle_osd_read_reply " << tid << " ... stray" << dendl; - delete m; - return; - } - - dout(7) << "handle_osd_read_reply " << tid << dendl; - OSDRead *rd = op_read[ tid ]; - op_read.erase( tid ); - - // remove from osd/tid maps - PG& pg = get_pg( m->get_pg() ); - assert(pg.active_tids.count(tid)); - pg.active_tids.erase(tid); - if (pg.active_tids.empty()) close_pg( m->get_pg() ); - - // our op finished - rd->ops.erase(tid); - - // success? - if (m->get_result() == -EAGAIN) { - dout(7) << " got -EAGAIN, resubmitting" << dendl; - readx_submit(rd, rd->ops[tid], true); - delete m; - return; - } - //assert(m->get_result() >= 0); - - // what buffer offset are we? 
- dout(7) << " got frag from " << m->get_oid() << " " - << m->get_offset() << "~" << m->get_length() - << ", still have " << rd->ops.size() << " more ops" << dendl; - - if (rd->ops.empty()) { - // all done - size_t bytes_read = 0; - - if (rd->read_data.size()) { - dout(15) << " assembling frags" << dendl; - - /** FIXME This doesn't handle holes efficiently. - * It allocates zero buffers to fill whole buffer, and - * then discards trailing ones at the end. - * - * Actually, this whole thing is pretty messy with temporary bufferlist*'s all over - * the heap. - */ - - // we have other fragments, assemble them all... blech! - rd->read_data[m->get_oid()] = new bufferlist; - rd->read_data[m->get_oid()]->claim( m->get_data() ); - - // map extents back into buffer - map by_off; // buffer offset -> bufferlist - - // for each object extent... - for (list::iterator eit = rd->extents.begin(); - eit != rd->extents.end(); - eit++) { - bufferlist *ox_buf = rd->read_data[eit->oid]; - unsigned ox_len = ox_buf->length(); - unsigned ox_off = 0; - assert(ox_len <= eit->length); - - // for each buffer extent we're mapping into... 
- for (map::iterator bit = eit->buffer_extents.begin(); - bit != eit->buffer_extents.end(); - bit++) { - dout(21) << " object " << eit->oid << " extent " << eit->start << "~" << eit->length << " : ox offset " << ox_off << " -> buffer extent " << bit->first << "~" << bit->second << dendl; - by_off[bit->first] = new bufferlist; - - if (ox_off + bit->second <= ox_len) { - // we got the whole bx - by_off[bit->first]->substr_of(*ox_buf, ox_off, bit->second); - if (bytes_read < bit->first + bit->second) - bytes_read = bit->first + bit->second; - } else if (ox_off + bit->second > ox_len && ox_off < ox_len) { - // we got part of this bx - by_off[bit->first]->substr_of(*ox_buf, ox_off, (ox_len-ox_off)); - if (bytes_read < bit->first + ox_len-ox_off) - bytes_read = bit->first + ox_len-ox_off; - - // zero end of bx - dout(21) << " adding some zeros to the end " << ox_off + bit->second-ox_len << dendl; - bufferptr z(ox_off + bit->second - ox_len); - z.zero(); - by_off[bit->first]->append( z ); - } else { - // we got none of this bx. zero whole thing. - assert(ox_off >= ox_len); - dout(21) << " adding all zeros for this bit " << bit->second << dendl; - bufferptr z(bit->second); - z.zero(); - by_off[bit->first]->append( z ); - } - ox_off += bit->second; - } - assert(ox_off == eit->length); - } - - // sort and string bits together - for (map::iterator it = by_off.begin(); - it != by_off.end(); - it++) { - assert(it->second->length()); - if (it->first < (off_t)bytes_read) { - dout(21) << " concat buffer frag off " << it->first << " len " << it->second->length() << dendl; - rd->bl->claim_append(*(it->second)); - } else { - dout(21) << " NO concat zero buffer frag off " << it->first << " len " << it->second->length() << dendl; - } - delete it->second; - } - - // trim trailing zeros? - if (rd->bl->length() > bytes_read) { - dout(10) << " trimming off trailing zeros . 
bytes_read=" << bytes_read - << " len=" << rd->bl->length() << dendl; - rd->bl->splice(bytes_read, rd->bl->length() - bytes_read); - assert(bytes_read == rd->bl->length()); - } - - // hose p->read_data bufferlist*'s - for (map::iterator it = rd->read_data.begin(); - it != rd->read_data.end(); - it++) { - delete it->second; - } - } else { - dout(15) << " only one frag" << dendl; - - // only one fragment, easy - rd->bl->claim( m->get_data() ); - bytes_read = rd->bl->length(); - } - - // finish, clean up - Context *onfinish = rd->onfinish; - - dout(7) << " " << bytes_read << " bytes " - << rd->bl->length() - << dendl; - - // done - delete rd; - if (onfinish) { - onfinish->finish(bytes_read);// > 0 ? bytes_read:m->get_result()); - delete onfinish; - } - } else { - // store my bufferlist for later assembling - rd->read_data[m->get_oid()] = new bufferlist; - rd->read_data[m->get_oid()]->claim( m->get_data() ); - } - - delete m; -} - - - -// write ------------------------------------ - -tid_t Objecter::write(object_t oid, off_t off, size_t len, ObjectLayout ol, bufferlist &bl, - Context *onack, Context *oncommit) -{ - OSDWrite *wr = new OSDWrite(bl); - wr->extents.push_back(ObjectExtent(oid, off, len)); - wr->extents.front().layout = ol; - wr->extents.front().buffer_extents[0] = len; - modifyx(wr, onack, oncommit); - return last_tid; -} - - -// zero - -tid_t Objecter::zero(object_t oid, off_t off, size_t len, ObjectLayout ol, - Context *onack, Context *oncommit) -{ - OSDModify *z = new OSDModify(OSD_OP_ZERO); - z->extents.push_back(ObjectExtent(oid, off, len)); - z->extents.front().layout = ol; - modifyx(z, onack, oncommit); - return last_tid; -} - - -// lock ops - -tid_t Objecter::lock(int op, object_t oid, ObjectLayout ol, - Context *onack, Context *oncommit) -{ - OSDModify *l = new OSDModify(op); - l->extents.push_back(ObjectExtent(oid, 0, 0)); - l->extents.front().layout = ol; - modifyx(l, onack, oncommit); - return last_tid; -} - - - -// generic modify 
----------------------------------- - -tid_t Objecter::modifyx(OSDModify *wr, Context *onack, Context *oncommit) -{ - wr->onack = onack; - wr->oncommit = oncommit; - - // issue writes/whatevers - for (list::iterator it = wr->extents.begin(); - it != wr->extents.end(); - it++) - modifyx_submit(wr, *it); - - return last_tid; -} - - -tid_t Objecter::modifyx_submit(OSDModify *wr, ObjectExtent &ex, tid_t usetid) -{ - // find - PG &pg = get_pg( ex.layout.pgid ); - - // pick tid - tid_t tid; - if (usetid > 0) - tid = usetid; - else - tid = ++last_tid; - assert(client_inc >= 0); - - // add to gather set - wr->waitfor_ack[tid] = ex; - wr->waitfor_commit[tid] = ex; - op_modify[tid] = wr; - pg.active_tids.insert(tid); - pg.last = g_clock.now(); - - ++num_unacked; - ++num_uncommitted; - - // send? - dout(10) << "modifyx_submit " << MOSDOp::get_opname(wr->op) << " tid " << tid - << " oid " << ex.oid - << " " << ex.start << "~" << ex.length - << " " << ex.layout - << " osd" << pg.primary() - << dendl; - if (pg.primary() >= 0) { - MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, tid, - ex.oid, ex.layout, osdmap->get_epoch(), - wr->op); - m->set_length(ex.length); - m->set_offset(ex.start); - if (usetid > 0) - m->set_retry_attempt(true); - - if (wr->tid_version.count(tid)) - m->set_version(wr->tid_version[tid]); // we're replaying this op! - - // what type of op? 
- switch (wr->op) { - case OSD_OP_WRITE: - { - // map buffer segments into this extent - // (may be fragmented bc of striping) - bufferlist cur; - for (map::iterator bit = ex.buffer_extents.begin(); - bit != ex.buffer_extents.end(); - bit++) - ((OSDWrite*)wr)->bl.copy(bit->first, bit->second, cur); - assert(cur.length() == ex.length); - m->set_data(cur);//.claim(cur); - } - break; - } - - messenger->send_message(m, osdmap->get_inst(pg.primary())); - } else - maybe_request_map(); - - dout(5) << num_unacked << " unacked, " << num_uncommitted << " uncommitted" << dendl; - - return tid; -} - - - -void Objecter::handle_osd_modify_reply(MOSDOpReply *m) -{ - // get pio - tid_t tid = m->get_tid(); - - if (op_modify.count(tid) == 0) { - dout(7) << "handle_osd_modify_reply " << tid - << (m->get_commit() ? " commit":" ack") - << " ... stray" << dendl; - delete m; - return; - } - - dout(7) << "handle_osd_modify_reply " << tid - << (m->get_commit() ? " commit":" ack") - << " v " << m->get_version() - << dendl; - OSDModify *wr = op_modify[ tid ]; - - Context *onack = 0; - Context *oncommit = 0; - - PG &pg = get_pg( m->get_pg() ); - - // ignore? - if (pg.acker() != m->get_source().num()) { - dout(7) << " ignoring ack|commit from non-acker" << dendl; - delete m; - return; - } - - assert(m->get_result() >= 0); - - // ack or commit? - if (m->get_commit()) { - //dout(15) << " handle_osd_write_reply commit on " << tid << dendl; - assert(wr->tid_version.count(tid) == 0 || - m->get_version() == wr->tid_version[tid]); - - // remove from tid/osd maps - assert(pg.active_tids.count(tid)); - pg.active_tids.erase(tid); - if (pg.active_tids.empty()) close_pg( m->get_pg() ); - - // commit. - op_modify.erase( tid ); - wr->waitfor_ack.erase(tid); - wr->waitfor_commit.erase(tid); - - num_uncommitted--; - - if (wr->waitfor_commit.empty()) { - onack = wr->onack; - oncommit = wr->oncommit; - delete wr; - } - } else { - // ack. 
- //dout(15) << " handle_osd_write_reply ack on " << tid << dendl; - assert(wr->waitfor_ack.count(tid)); - wr->waitfor_ack.erase(tid); - - num_unacked--; - - if (wr->tid_version.count(tid) && - wr->tid_version[tid].version != m->get_version().version) { - dout(-10) << "handle_osd_modify_reply WARNING: replay of tid " << tid - << " did not achieve previous ordering" << dendl; - } - wr->tid_version[tid] = m->get_version(); - - if (wr->waitfor_ack.empty()) { - onack = wr->onack; - wr->onack = 0; // only do callback once - - // buffer uncommitted? - if (!g_conf.objecter_buffer_uncommitted && - wr->op == OSD_OP_WRITE) { - // discard buffer! - ((OSDWrite*)wr)->bl.clear(); - } - } - } - - dout(5) << num_unacked << " unacked, " << num_uncommitted << " uncommitted" << dendl; - - // do callbacks - if (onack) { - onack->finish(0); - delete onack; - } - if (oncommit) { - oncommit->finish(0); - delete oncommit; - } - - delete m; -} - - - -void Objecter::ms_handle_failure(Message *m, entity_name_t dest, const entity_inst_t& inst) -{ - if (dest.is_mon()) { - // try a new mon - int mon = monmap->pick_mon(true); - dout(0) << "ms_handle_failure " << dest << " inst " << inst - << ", resending to mon" << mon - << dendl; - messenger->send_message(m, monmap->get_inst(mon)); - } - else if (dest.is_osd()) { - int mon = monmap->pick_mon(); - dout(0) << "ms_handle_failure " << dest << " inst " << inst - << ", dropping and reporting to mon" << mon - << dendl; - messenger->send_message(new MOSDFailure(messenger->get_myinst(), inst, osdmap->get_epoch()), - monmap->get_inst(mon)); - delete m; - } else { - dout(0) << "ms_handle_failure " << dest << " inst " << inst - << ", dropping" << dendl; - delete m; - } -} diff --git a/branches/sage/ebofs2/osdc/Objecter.h b/branches/sage/ebofs2/osdc/Objecter.h deleted file mode 100644 index 82a437aa04f8d..0000000000000 --- a/branches/sage/ebofs2/osdc/Objecter.h +++ /dev/null @@ -1,230 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; 
indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __OBJECTER_H -#define __OBJECTER_H - -#include "include/types.h" -#include "include/buffer.h" - -#include "osd/OSDMap.h" -#include "messages/MOSDOp.h" - -#include "common/Timer.h" - -#include -#include -#include -using namespace std; -using namespace __gnu_cxx; - -class Context; -class Messenger; -class OSDMap; -class MonMap; -class Message; - -class Objecter { - public: - Messenger *messenger; - MonMap *monmap; - OSDMap *osdmap; - - private: - tid_t last_tid; - int client_inc; - int num_unacked; - int num_uncommitted; - - epoch_t last_epoch_requested; - utime_t last_epoch_requested_stamp; - - void maybe_request_map(); - - Mutex &client_lock; - SafeTimer timer; - - class C_Tick : public Context { - Objecter *ob; - public: - C_Tick(Objecter *o) : ob(o) {} - void finish(int r) { ob->tick(); } - }; - void tick(); - - - /*** track pending operations ***/ - // read - public: - class OSDOp { - public: - list extents; - virtual ~OSDOp() {} - }; - - class OSDRead : public OSDOp { - public: - bufferlist *bl; - Context *onfinish; - map ops; - map read_data; // bits of data as they come back - int balance_reads; // if non-zero, direct reads to a pseudo-random replica - - OSDRead(bufferlist *b) : bl(b), onfinish(0), balance_reads(0) { - bl->clear(); - } - }; - - class OSDStat : public OSDOp { - public: - tid_t tid; - off_t *size; // where the size goes. 
- Context *onfinish; - OSDStat(off_t *s) : tid(0), size(s), onfinish(0) { } - }; - - // generic modify - class OSDModify : public OSDOp { - public: - int op; - list extents; - Context *onack; - Context *oncommit; - map waitfor_ack; - map tid_version; - map waitfor_commit; - - OSDModify(int o) : op(o), onack(0), oncommit(0) {} - }; - - // write (includes the bufferlist) - class OSDWrite : public OSDModify { - public: - bufferlist bl; - OSDWrite(bufferlist &b) : OSDModify(OSD_OP_WRITE), bl(b) {} - }; - - - - private: - // pending ops - hash_map op_stat; - hash_map op_read; - hash_map op_modify; - - /** - * track pending ops by pg - * ...so we can cope with failures, map changes - */ - class PG { - public: - vector acting; - set active_tids; // active ops - utime_t last; - - PG() {} - - // primary - where i write - int primary() { - if (acting.empty()) return -1; - return acting[0]; - } - // acker - where i read, and receive acks from - int acker() { - if (acting.empty()) return -1; - if (g_conf.osd_rep == OSD_REP_PRIMARY) - return acting[0]; - else - return acting[acting.size() > 1 ? 
1:0]; - } - }; - - hash_map pg_map; - - - PG &get_pg(pg_t pgid) { - if (!pg_map.count(pgid)) - osdmap->pg_to_acting_osds(pgid, pg_map[pgid].acting); - return pg_map[pgid]; - } - void close_pg(pg_t pgid) { - assert(pg_map.count(pgid)); - assert(pg_map[pgid].active_tids.empty()); - pg_map.erase(pgid); - } - void scan_pgs(set& chnaged_pgs); - void kick_requests(set& changed_pgs); - - - public: - Objecter(Messenger *m, MonMap *mm, OSDMap *om, Mutex& l) : - messenger(m), monmap(mm), osdmap(om), - last_tid(0), client_inc(-1), - num_unacked(0), num_uncommitted(0), - last_epoch_requested(0), - client_lock(l), timer(l) - { } - ~Objecter() { } - - void init(); - void shutdown(); - - // messages - public: - void dispatch(Message *m); - void handle_osd_op_reply(class MOSDOpReply *m); - void handle_osd_stat_reply(class MOSDOpReply *m); - void handle_osd_read_reply(class MOSDOpReply *m); - void handle_osd_modify_reply(class MOSDOpReply *m); - void handle_osd_lock_reply(class MOSDOpReply *m); - void handle_osd_map(class MOSDMap *m); - - private: - tid_t readx_submit(OSDRead *rd, ObjectExtent& ex, bool retry=false); - tid_t modifyx_submit(OSDModify *wr, ObjectExtent& ex, tid_t tid=0); - tid_t stat_submit(OSDStat *st); - - // public interface - public: - bool is_active() { - return !(op_read.empty() && op_modify.empty()); - } - - int get_client_incarnation() { return client_inc; } - void set_client_incarnation(int inc) { - client_inc = inc; - } - - // med level - tid_t readx(OSDRead *read, Context *onfinish); - tid_t modifyx(OSDModify *wr, Context *onack, Context *oncommit); - //tid_t lockx(OSDLock *l, Context *onack, Context *oncommit); - - // even lazier - tid_t read(object_t oid, off_t off, size_t len, ObjectLayout ol, bufferlist *bl, - Context *onfinish); - tid_t write(object_t oid, off_t off, size_t len, ObjectLayout ol, bufferlist &bl, - Context *onack, Context *oncommit); - tid_t zero(object_t oid, off_t off, size_t len, ObjectLayout ol, - Context *onack, Context *oncommit); 
- tid_t stat(object_t oid, off_t *size, ObjectLayout ol, Context *onfinish); - - tid_t lock(int op, object_t oid, ObjectLayout ol, Context *onack, Context *oncommit); - - - void ms_handle_failure(Message *m, entity_name_t dest, const entity_inst_t& inst); - -}; - -#endif diff --git a/branches/sage/ebofs2/script/add_header.pl b/branches/sage/ebofs2/script/add_header.pl deleted file mode 100755 index 023c06e455fd1..0000000000000 --- a/branches/sage/ebofs2/script/add_header.pl +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/perl - -use strict; -my $fn = shift @ARGV; -my $old = `cat $fn`; - -my $header = `cat doc/header.txt`; - -# strip existing header -my $new = $old; -if ($new =~ /^(.*)\* Ceph - scalable distributed file system/s) { - my ($a,@b) = split(/\*\/\n/, $new); - $new = join("*/\n",@b); -} -$new = $header . $new; - -if ($new ne $old) { - open(O, ">$fn.new"); - print O $new; - close O; - system "diff $fn $fn.new"; - rename "$fn.new", $fn; - #unlink "$fn.new"; - -} - diff --git a/branches/sage/ebofs2/script/adjusttabs.pl b/branches/sage/ebofs2/script/adjusttabs.pl deleted file mode 100755 index 66edff2ac6c02..0000000000000 --- a/branches/sage/ebofs2/script/adjusttabs.pl +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/perl - -my $tablen = shift @ARGV; -my $fn = shift @ARGV; - -my $tab = ' ' x $tablen; -open(I, $fn); -my $f; -my $oldtab = ' ' x 4; -while () { - if (my ($oldlen) = /\-\*\- .*tab-width:(\d)/) { - print "old length was $oldlen\n"; - $oldtab = ' ' x $oldlen; - s/tab-width:\d/tab-width:$tablen/; - } - s/\t/$oldtab/g; - $f .= $_; -} -close I; -open(O, ">$fn.new"); -print O $f; -close O; - -rename "$fn.new", $fn; diff --git a/branches/sage/ebofs2/script/check_cache_dumps.pl b/branches/sage/ebofs2/script/check_cache_dumps.pl deleted file mode 100755 index 95bd28a474991..0000000000000 --- a/branches/sage/ebofs2/script/check_cache_dumps.pl +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/perl - -my $epoch = shift || die "specify epoch"; - -my %auth; # mds -> id -> replica 
-> nonce -my %replica; # mds -> id -> auth -> nonce - -print "reading\n"; -for (my $i=0; -e "cachedump.$epoch.mds$i"; $i++) { - open(O,"cachedump.$epoch.mds$i"); - while () { - my ($name,$s); - ($name,$s) = /^\[(inode \d+) \S+ (\S+)/; - ($name,$s) = /^\[(dir \d+) \S+ (\S+)/ unless $name; - ($name,$s) = /^\[dentry (\S+) (\S+)/ unless $name; - if ($name) { - if ($s =~ /^auth/) { - $auth{$i}->{$name} = {}; - my ($rl) = $s =~ /\{(.*)\}/; - for my $r (split(/,/,$rl)) { - my ($who,$nonce) = $r =~ /(\d+)\=(\d+)/; - $auth{$i}->{$name}->{$who} = $nonce; - #print "auth $name rep by $who $nonce $s\n"; - } - } - else { - my ($a,$b,$n) = $s =~ /rep@(\d+)\,([\-\d]+)\.(\d+)/; - die $_ unless $a >= 0; - $replica{$i}->{$name}->{$a} = $n; - if ($b >= 0) { - $replica{$i}->{$name}->{$b} = $n; - } - } - } - } -} - -print "verifying replicas\n"; -for my $mds (keys %replica) { - for my $name (keys %{$replica{$mds}}) { - for my $auth (keys %{$replica{$mds}->{$name}}) { - if ($auth{$auth}->{$name}->{$mds}) { - if ($auth{$auth}->{$name}->{$mds} < $replica{$mds}->{$name}->{$auth}) { - print "problem: mds$mds has $name from mds$auth nonce $replica{$mds}->{$name}->{$auth}, auth has nonce $auth{$auth}->{$name}->{$mds}\n"; - } else { - print "ok: mds$mds has $name from mds$auth nonce $replica{$mds}->{$name}->{$auth}, auth has nonce $auth{$auth}->{$name}->{$mds}\n"; - } - } else { - print "??: mds$mds has $name from mds$auth nonce $replica{$mds}->{$name}->{$auth}, auth has no nonce\n"; - } - - } - } -} - - diff --git a/branches/sage/ebofs2/script/clean_osd_cow.sh b/branches/sage/ebofs2/script/clean_osd_cow.sh deleted file mode 100755 index 1e443c95e7ebc..0000000000000 --- a/branches/sage/ebofs2/script/clean_osd_cow.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh - -rm osddata/*/*\.* diff --git a/branches/sage/ebofs2/script/clean_trace.pl b/branches/sage/ebofs2/script/clean_trace.pl deleted file mode 100755 index cb02ff7abe7c2..0000000000000 --- a/branches/sage/ebofs2/script/clean_trace.pl +++ 
/dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/perl - -my $n = 0; -while (<>) { - next unless /trace: /; - my $l = $'; $'; - print $l; -} diff --git a/branches/sage/ebofs2/script/comb.pl b/branches/sage/ebofs2/script/comb.pl deleted file mode 100755 index 1a0d4dcbe6c07..0000000000000 --- a/branches/sage/ebofs2/script/comb.pl +++ /dev/null @@ -1,113 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my $xaxis = shift @ARGV; -my @vars; -while (@ARGV) { - $_ = shift @ARGV; - last if ($_ eq '-'); - push(@vars, $_); -} -my @dirs; -while (@ARGV) { - $_ = shift @ARGV; - last if ($_ eq '-'); - push(@dirs, $_) if -d $_; -} -my @filt = @ARGV; -push( @filt, '.' ) unless @filt; - -print "#xaxis $xaxis -#vars @vars -#dirs @dirs -#filt @filt -"; - -sub load_sum { - my $fn = shift @_; - - open(I, "$fn"); - my $k = ; - chomp($k); - my @k = split(/\s+/,$k); - shift @k; - - my $s; - while () { - chomp; - s/^\#//; - next unless $_; - my @l = split(/\s+/,$_); - my $k = shift @l; - for my $f (@k) { - $s->{$k}->{$f} = shift @l; - } - - # clnode latency? 
- if ($fn =~ /cl/) { - $s->{$k}->{'wrlat'} = $s->{$k}->{'wrlsum'} / $s->{$k}->{'wrlnum'} if $s->{$k}->{'wrlnum'} > 0; - $s->{$k}->{'rlat'} = $s->{$k}->{'rlsum'} / $s->{$k}->{'rlnum'} if $s->{$k}->{'rlnum'} > 0; - $s->{$k}->{'lat'} = $s->{$k}->{'lsum'} / $s->{$k}->{'lnum'} if $s->{$k}->{'lnum'} > 0; - $s->{$k}->{'latw'} = $s->{$k}->{'lwsum'} / $s->{$k}->{'lwnum'} if $s->{$k}->{'lwnum'} > 0; - $s->{$k}->{'latr'} = $s->{$k}->{'lrsum'} / $s->{$k}->{'lrnum'} if $s->{$k}->{'lrnum'} > 0; - $s->{$k}->{'statlat'} = $s->{$k}->{'lstatsum'} / $s->{$k}->{'lstatnum'} if $s->{$k}->{'lstatnum'} > 0; - $s->{$k}->{'dirlat'} = $s->{$k}->{'ldirsum'} / $s->{$k}->{'ldirnum'} if $s->{$k}->{'ldirnum'} > 0; - } - } - return $s; -} - - -my %res; -my @key; -my %didkey; -for my $f (@filt) { - my @reg = split(/,/, $f); - #print "reg @reg\n"; - for my $d (@dirs) { - if ($f ne '.') { - my $r = (split(/\//,$d))[-1]; - my @db = split(/,/, $r); - #print "db @db\n"; - my $ok = 1; - for my $r (@reg) { - - $ok = 0 unless grep {$_ eq $r} @db; - } - next unless $ok; - } - #next if ($f ne '.' && $d !~ /$reg/); - #print "$d\n"; - my ($x) = $d =~ /$xaxis=([\d\.]+)/; - - for my $v (@vars) { - my ($what, $field) = $v =~ /^(.+)\.([^\.]+)$/; - #print "$what $field .. $v .. $f.$field\n"; - my $s = &load_sum("$d/sum.$what"); - - #print "\t$v"; - if ($field =~ /^sum=/) { - #warn "SUM field $field\n"; - push( @{$res{$x}}, $s->{'sum'}->{$'} ); #'}); - } else { - #warn "avg field $field\n"; - push( @{$res{$x}}, $s->{'avgval'}->{$field} ); - } - - push( @key, "$f.$field" ) unless $didkey{"$f.$field"}; - $didkey{"$f.$field"} = 1; - - if (0 && exists $s->{'avgvaldevt'}) { - push( @{$res{$x}}, $s->{'avgvaldevt'}->{$field} ); - push( @key, "$f.$field.dev" ) unless $didkey{"$f.$field.dev"}; - $didkey{"$f.$field.dev"} = 1; - } - } - } -} - -print join("\t", "#", @key) . "\n"; -for my $x (sort {$a <=> $b} keys %res) { - print join("\t", $x, @{$res{$x}}) . 
"\n"; -} diff --git a/branches/sage/ebofs2/script/convert_soe_trace.pl b/branches/sage/ebofs2/script/convert_soe_trace.pl deleted file mode 100755 index a6ec80312d0fe..0000000000000 --- a/branches/sage/ebofs2/script/convert_soe_trace.pl +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/perl - -# this reads in one of kristal's anonymized static traces from -# soe and makes it look like output from -# -# find . -exec ls -dilsn --time-style=+%s \{\} \; -# -# (which is what SyntheticClient likes to "import", and -# study_static.pl likes to analyze for hardlinks, dirsizes, etc.) - -while (<>) { - chomp; - my ($file, $ino, $size, $actime, $ctime, $mtime, $uid, $gid, $omode, $nlink) = split(/ /,substr($_,1)); - $file = '.' . $file; - my $nmode = oct($omode); - my $mode = '-...'; - $mode = 'd...' if (($nmode & 0170000) == 0040000); - $mode = 'f...' if (($nmode & 0170000) == 0100000); - $size = hex($size); - $mtime = hex($mtime); - $uid = hex($uid); - $gid = hex($gid); - print "$ino ? $mode ? $nlink $uid $gid $size $mtime $file\n"; -} - -__END__ - -soe format is -0. a space -1. full path of file name (MD5-ed and in base 64) -2. inode number -3. size of file in bytes (hex) -4. atime (hex) -5. ctime (hex) -6. mtime (hex) -7. uid (hex) -8. gid (hex) -9. mode (octal) -10. number of links diff --git a/branches/sage/ebofs2/script/find_auth_pins.pl b/branches/sage/ebofs2/script/find_auth_pins.pl deleted file mode 100755 index d37fb109a48da..0000000000000 --- a/branches/sage/ebofs2/script/find_auth_pins.pl +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/perl - -my %pin; -my %hist; -my $l = 1; -my @pins; -while (<>) { - - #cdir:adjust_nested_auth_pins on [dir 163 /foo/ rep@13 | child] count now 0 + 1 - - if (/adjust_nested_auth_pins/) { - my ($what) = / (\w+)\]/; - $what =~ s/ 0x/ /; - $hist{$what} .= "$l: $_" - if defined $pin{$what}; - } - - # cinode:auth_pin on inode [1000000002625 /gnu/blah_client_created. 
0x89b7700] count now 1 + 0 - - elsif (/auth_pin / && !/waiting/) { - #my ($what) = /\[(\w+ \w+) /; - my ($what) = / (\w+)\]/; - $what =~ s/ 0x/ /; - #print "$_ add_waiter $c $what\n"; - $pin{$what}++; - $hist{$what} .= "$l: $_"; - push( @pins, $what ) unless grep {$_ eq $what} @pins; - } - - # cinode:auth_unpin on inode [1000000002625 (dangling) 0x89b7700] count now 0 + 0 - - elsif (/auth_unpin/) { - #my ($what) = /\[(\w+ \w+) /;# / on (.*\])/; - my ($what) = / (\w+)\]/; - $what =~ s/ 0x/ /; - $pin{$what}--; - $hist{$what} .= "$l: $_"; - unless ($pin{$what}) { - delete $hist{$what}; - delete $pin{$what}; - @pins = grep {$_ ne $what} @pins; - } - } - $l++; -} - -for my $what (@pins) { - print "---- count $pin{$what} on $what -$hist{$what} -"; -} diff --git a/branches/sage/ebofs2/script/find_bufferleaks.pl b/branches/sage/ebofs2/script/find_bufferleaks.pl deleted file mode 100755 index 152515d5e788e..0000000000000 --- a/branches/sage/ebofs2/script/find_bufferleaks.pl +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/perl - -use strict; -my %buffers; -my %bufferlists; -my %ref; -my %mal; -my $l = 1; -while (<>) { - #print "$l: $_"; - - # cinode:auth_pin on inode [1000000002625 /gnu/blah_client_created. 
0x89b7700] count now 1 + 0 - - if (/^buffer\.cons /) { - my ($x) = /(0x\S+)/; - $buffers{$x} = 1; - } - if (/^buffer\.des /) { - my ($x) = /(0x\S+)/; - die "des without cons at $l: $_" unless $buffers{$x}; - delete $buffers{$x}; - die "des with ref>0 at $l: $_" unless $ref{$x} == 0; - delete $ref{$x}; - } - - if (/^bufferlist\.cons /) { - my ($x) = /(0x\S+)/; - $bufferlists{$x} = 1; - } - if (/^bufferlist\.des /) { - my ($x) = /(0x\S+)/; - warn "des without cons at $l: $_" unless $bufferlists{$x}; - delete $bufferlists{$x}; - } - - - if (/^buffer\.malloc /) { - my ($x) = /(0x\S+)/; - $mal{$x} = 1; - } - if (/^buffer\.free /) { - my ($x) = /(0x\S+)/; - die "free with malloc at $l: $_" unless $mal{$x}; - delete $mal{$x}; - } - - if (/^buffer\.get /) { - my ($x) = /(0x\S+)/; - $ref{$x}++; - } - if (/^buffer\.get /) { - my ($x) = /(0x\S+)/; - $ref{$x}--; - } - -$l++; -} - -for my $x (keys %bufferlists) { - print "leaked bufferlist $x\n"; -} - -for my $x (keys %buffers) { - print "leaked buffer $x ref $ref{$x}\n"; -} - -for my $x (keys %mal) { - print "leaked buffer dataptr $x ref $ref{$x}\n"; -} diff --git a/branches/sage/ebofs2/script/find_lost_bdev_ops.pl b/branches/sage/ebofs2/script/find_lost_bdev_ops.pl deleted file mode 100755 index ac1793b42dfac..0000000000000 --- a/branches/sage/ebofs2/script/find_lost_bdev_ops.pl +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/perl - -use strict; -my %op; - -my $line = 0; -while (<>) { - #print $line . 
$_ if /0x8d4f6a0/; - chomp; - $line++; - - #bdev(./ebofsdev/0)._submit_io bio(wr 269~1 usemap 0x4de33cc0) - if (my ($bio) = /_submit_io bio\(.*(0x\w+)\)/) { - $op{$bio} = $line; - } - - # cancel - #bdev(./ebofsdev/3)._cancel_io bio(wr 1525~1 bh_write 0x8a437b8) - if (my ($bio) = /_cancel_io bio\(.*(0x\w+)\)/ && - !(/FAILED/)) { - delete $op{$bio}; - } - - # finish - #bdev(./ebofsdev/3).complete_thread finishing bio(wr 1131~1 write_cnode 0x832c1f8) - if (my ($bio) = /complete_thread finishing bio\(.*(0x\w+)\)/) { - delete $op{$bio}; - } - -} - -for my $bio (keys %op) { - print "---- lost bio $bio\n"; -} diff --git a/branches/sage/ebofs2/script/find_lost_commit.pl b/branches/sage/ebofs2/script/find_lost_commit.pl deleted file mode 100755 index 73934248ad5c0..0000000000000 --- a/branches/sage/ebofs2/script/find_lost_commit.pl +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/perl - -use strict; -my %op; - -my $line = 0; -while (<>) { - #print "$line: $_"; - $line++; - - #osd3 do_op MOSDOp(client0.933 oid 100000000000008 0x84b4480) in pg[pginfo(4020000000d v 5662/0 e 2/1) r=0 active (0,5662]] - if (my ($from, $opno, $oid, $op) = /do_op MOSDOp\((\S+) op (\d+) oid (\d+) (\w+)\)/) { -# print "$op\n"; - if ($opno == 2 || $opno == 11 || $opno == 12 || $opno == 14 || $opno == 15) { - $op{$op} = $from; - } - } - - # commits - #osd1 op_modify_commit on op MOSDOp(client1.289 oid 100000100000002 0x51a2f788) - if (my ($op) = /op_modify_commit.* (\w+)\)/) { - delete $op{$op}; - } - #osd4 rep_modify_commit on op MOSDOp(osd3.289 oid 100000000000008 0x84b0980) - if (my ($op) = /rep_modify_commit.* (\w+)\)/) { - delete $op{$op}; - } - - # forwarded? 
- if (my ($op) = /sending (\w+) to osd/) { - delete $op{$op}; - } - -} - -for my $op (keys %op) { - print "---- lost op $op $op{$op}\n"; -} diff --git a/branches/sage/ebofs2/script/find_lost_objecter.pl b/branches/sage/ebofs2/script/find_lost_objecter.pl deleted file mode 100755 index a0c2089140e23..0000000000000 --- a/branches/sage/ebofs2/script/find_lost_objecter.pl +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/perl - -use strict; -my %ack; -my %commit; - -my $line = 0; -while (<>) { - #print "$line: $_"; - $line++; - - #client0.objecter writex_submit tid 21 osd0 oid 100000000000001 851424~100000 - if (my ($who, $tid) = /(\S+)\.objecter writex_submit tid\D+(\d+)\D+osd/) { -# print "$who.$tid\n"; - $ack{"$who.$tid"} = $line; - $commit{"$who.$tid"} = $line; - } - - #client1.objecter handle_osd_write_reply 304 commit 0 - #client1.objecter handle_osd_write_reply 777 commit 1 - if (my ($who, $tid, $commit) = /(\S+)\.objecter handle_osd_write_reply\D+(\d+)\D+commit\D+(\d)/) { -# print "$who.$tid\n"; - delete $ack{"$who.$tid"}; - delete $commit{"$who.$tid"} if $commit; - } - -} - -for my $op (keys %commit) { - print "---- lost commit $op $commit{$op}\n"; -} -for my $op (keys %ack) { - print "---- lost ack $op $commit{$op}\n"; -} diff --git a/branches/sage/ebofs2/script/find_pathpins.pl b/branches/sage/ebofs2/script/find_pathpins.pl deleted file mode 100755 index e4a7d81dfb7b7..0000000000000 --- a/branches/sage/ebofs2/script/find_pathpins.pl +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/perl - -my %pin; -my %hist; -my $l = 1; -my @pins; -while (<>) { - - # cinode:auth_pin on inode [1000000002625 /gnu/blah_client_created. 
0x89b7700] count now 1 + 0 - - if (/path_pinned /) { - my ($dname, $dir) = /\[dentry (\S+) .* in \[dir (\d+) /; - $what = "$dname $dir"; - #print "$l pin $what\n"; - $pin{$what}++; - $hist{$what} .= "$l: $_"; - push( @pins, $what ) unless grep {$_ eq $what} @pins; - } - - # cinode:auth_unpin on inode [1000000002625 (dangling) 0x89b7700] count now 0 + 0 - - if (/path_unpinned/) { - my ($dname, $dir) = /\[dentry (\S+) .* in \[dir (\d+) /; - $what = "$dname $dir"; - #print "$l unpin $what\n"; - $pin{$what}--; - $hist{$what} .= "$l: $_"; - unless ($pin{$what}) { - delete $hist{$what}; - delete $pin{$what}; - @pins = grep {$_ ne $what} @pins; - } - } - $l++; -} - -for my $what (@pins) { - print "---- count $pin{$what} on $what -$hist{$what} -"; -} diff --git a/branches/sage/ebofs2/script/find_requests.pl b/branches/sage/ebofs2/script/find_requests.pl deleted file mode 100755 index 5144896249413..0000000000000 --- a/branches/sage/ebofs2/script/find_requests.pl +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/perl - -my %waiting; # context => what where what is "inode ..." or "dir ..." -my %hist; # context => history since waited -my @waiting; - -my $line = 0; -while (<>) { - - #print $line . 
$_ if /0x8d4f6a0/; - $line++; - if (/request_start/) { - my ($c) = /(0x\w+)/; - my ($what) = $'; #'; - chomp $what; - #print "$line add_waiter $c $what\n" if /0x8d4f6a0/; - $waiting{$c} = $what - if $what && !$waiting{$c}; - $hist{$c} .= "$line: $_"; - unless (grep {$_ eq $c} @waiting) { - push( @waiting, $c ); - } - } - #if (/finish_waiting/) { - # my ($c) = /(0x\w+)/; - # $hist{$c} .= "$line: $_"; - #} - if (/request_finish/ || - /request_forward/) { - my ($c) = /(0x\w+)/; - #print "took\n" if /0x8d4f6a0/; - delete $waiting{$c}; - delete $hist{$c}; - @waiting = grep {$_ ne $c} @waiting; - } -} - -for my $c (@waiting) { - print "---- lost request $c $waiting{$c} -$hist{$c} -"; -} diff --git a/branches/sage/ebofs2/script/find_waiters.pl b/branches/sage/ebofs2/script/find_waiters.pl deleted file mode 100755 index c89d2b1a49db7..0000000000000 --- a/branches/sage/ebofs2/script/find_waiters.pl +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/perl - -my %waiting; # context => what where what is "inode ..." or "dir ..." -my %hist; # context => history since waited -my @waiting; - -my $line = 0; -while (<>) { - #print $line . 
$_ if /0x8d4f6a0/; - $line++; - if (/add_waiter/) { - my ($c) = /(0x\w+)/; - my ($what) = / on (.*\])/; - #print "$line add_waiter $c $what\n" if /0x8d4f6a0/; - $waiting{$c} = $what - if $what && !$waiting{$c}; - $hist{$c} .= "$line: $_"; - unless (grep {$_ eq $c} @waiting) { - push( @waiting, $c ); - } - } - #if (/finish_waiting/) { - # my ($c) = /(0x\w+)/; - # $hist{$c} .= "$line: $_"; - #} - if (/take_waiting/) { - my ($c) = /(0x\w+)/; - if (/SKIPPING/) { - #print "skipping\n" if /0x8d4f6a0/; - $hist{$c} .= "$line: $_"; - } elsif (/took/) { - #print "took\n" if /0x8d4f6a0/; - delete $waiting{$c}; - delete $hist{$c}; - @waiting = grep {$_ ne $c} @waiting; - } else { - die "i don't understand: $_"; - } - } -} - -for my $c (@waiting) { - print "---- lost waiter $c $waiting{$c} -$hist{$c} -"; -} diff --git a/branches/sage/ebofs2/script/fix_modeline.pl b/branches/sage/ebofs2/script/fix_modeline.pl deleted file mode 100755 index 8eadde9b54e56..0000000000000 --- a/branches/sage/ebofs2/script/fix_modeline.pl +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/perl - -use strict; -my $fn = shift @ARGV; -my $old = `cat $fn`; -my $header = `cat doc/modeline.txt`; - -# strip existing modeline -my $new = $old; -$new =~ s/^\/\/ \-\*\- ([^\n]+) \-\*\-([^\n]*)\n//s; # emacs -$new =~ s/^\/\/ vim: ([^\n]*)\n//s; # vim; -$new =~ s/^\/\/ \-\*\- ([^\n]+) \-\*\-([^\n]*)\n//s; # emacs -$new =~ s/^\/\/ vim: ([^\n]*)\n//s; # vim; -$new =~ s/^\/\/ \-\*\- ([^\n]+) \-\*\-([^\n]*)\n//s; # emacs -$new =~ s/^\/\/ vim: ([^\n]*)\n//s; # vim; - -# add correct header -$new = $header . 
$new; - -if ($new ne $old) { - print "$fn\n"; - open(O, ">$fn.new"); - print O $new; - close O; - system "diff $fn $fn.new"; - rename "$fn.new", $fn; - #unlink "$fn.new"; -} - diff --git a/branches/sage/ebofs2/script/gprofnewsyn b/branches/sage/ebofs2/script/gprofnewsyn deleted file mode 100755 index 5d352e4e9e52c..0000000000000 --- a/branches/sage/ebofs2/script/gprofnewsyn +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/perl - -my @ranks = @ARGV; -unless (@ranks) { - @ranks = split(/\n/,`/bin/ls gmon`); -} -print "will do @ranks\n"; -for my $r (@ranks) { - print "$r\n"; - system "test -e gmon.out && rm gmon.out ; ln -f gmon/$r/gmon.out ; gprof newsyn > gmon/$r/o"; -} - diff --git a/branches/sage/ebofs2/script/grepblock b/branches/sage/ebofs2/script/grepblock deleted file mode 100755 index f5acf95732abb..0000000000000 --- a/branches/sage/ebofs2/script/grepblock +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my $block = shift ARGV; -die unless int $block; - -while (<>) { - my $yes = 0; - for my $x (/(\d+\~\d+)/) { - my ($s,$l) = split(/\~/,$x); - $yes = 1 if ($block >= $s && $block < $s+$l); - } - print if $yes; -} diff --git a/branches/sage/ebofs2/script/merge_cdfs.pl b/branches/sage/ebofs2/script/merge_cdfs.pl deleted file mode 100755 index 98c22764fc8b3..0000000000000 --- a/branches/sage/ebofs2/script/merge_cdfs.pl +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/perl - -my %rows; # val -> [ count1, count2, ... ] - -my $filen = 0; -for my $file (@ARGV) { - open(I,"$file"); - while () { - next if /^\#/; - chomp; - my ($v, $c) = split(/\t/,$_); - $rows{$v}->[$filen] = $c; - } - $filen++; -} - -for my $v (sort {$a <=> $b} keys %rows) { - print "$v"; - for (my $i=0; $i < $filen; $i++) { - print "\t" . int($rows{$v}->[$i]); - } - print "\n"; - #print join("\t", $v, @{$rows{$v}}) . 
"\n"; -} diff --git a/branches/sage/ebofs2/script/merge_trace_rw.pl b/branches/sage/ebofs2/script/merge_trace_rw.pl deleted file mode 100644 index 378d629ef43f6..0000000000000 --- a/branches/sage/ebofs2/script/merge_trace_rw.pl +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my @file = <>; -sub get_op { - my @op = shift @file; - while (@file && - $file[0] !~ /^[a-z]+$/) { - push( @op, shift @file ); - } - #print "op = ( @op )\n"; - return @op; -} - -my $n = 0; -while (@file) { - my ($op, @args) = &get_op; - while ($op eq "read\n" || - $op eq "write\n") { - die unless scalar(@args) == 3; - my ($nop, @nargs) = &get_op; - if ($nop eq $op - && ($args[0] == $nargs[0] ) - && ($args[2] + $args[1] == $nargs[2]) - ) { - die unless scalar(@nargs) == 3; - $args[1] += $nargs[1]; - $args[1] .= "\n"; - die unless scalar(@args) == 3; - #print STDOUT "combining $n $op @args\n"; - $n++; - } else { -# print STDERR "not combinging\n"; - unshift( @file, $nop, @nargs ); - die unless scalar(@args) == 3; - last; - } - } - print $op; - print join('', @args); -} diff --git a/branches/sage/ebofs2/script/plot.pl b/branches/sage/ebofs2/script/plot.pl deleted file mode 100755 index 2d4e3002bbd4d..0000000000000 --- a/branches/sage/ebofs2/script/plot.pl +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my $dir = shift @ARGV; -my ($type,$subtype) = split(/\./, shift @ARGV); -$subtype = '.' . $subtype if $subtype; - -# list files -my @files; -my %fields; -for my $f (`ls $dir/$type*$subtype`) { - chomp $f; - next unless $f =~ /$type(\d+)$subtype$/; - push(@files, $f); - unless (%fields) { - open(I,$f); - while () { - next unless /^\#/; - my @f = split(/\t/,$_); - for (my $n=1; @f; $n++) { - my $f = shift @f; - $fields{$f} = $n; - #print "$f = $n\n"; - } - last; - } - close I; - } -} -#print "#files @files\n"; - -# get field names -my $var = shift @ARGV; -my $rest = join(' ', @ARGV); - -print "set style data lines\nset grid\n"; -print "set title \"$dir .. 
$var\"\n"; -if (scalar(@files) > 30) { print "set key off\n"; } -#for my $var (@ARGV) { - my @p; - for my $f (@files) { - my ($lastbit) = $f =~ /\/([^\/]+)$/; - push(@p, "\"$f\" u 1:$fields{$var} $rest t \"$lastbit\""); - } - print "plot " . join(',', @p) . "\n"; -#} -print "pause 60000\n"; diff --git a/branches/sage/ebofs2/script/profonly.pl b/branches/sage/ebofs2/script/profonly.pl deleted file mode 100755 index 6a05dec473ca0..0000000000000 --- a/branches/sage/ebofs2/script/profonly.pl +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/perl - -my $rank = shift @ARGV; -my $args = join(' ',@ARGV); -if ($rank == $ENV{MPD_JRANK}) { - $c = "LD_PRELOAD=$ENV{'HOME'}/csl/obsd/src/pmds/gprof-helper.so ./newsyn $args"; -} else { - $c = "./newsyn.nopg $args"; -} - -#print "$rank: $c\n"; -system $c; diff --git a/branches/sage/ebofs2/script/runjob.pl b/branches/sage/ebofs2/script/runjob.pl deleted file mode 100755 index c432675d33830..0000000000000 --- a/branches/sage/ebofs2/script/runjob.pl +++ /dev/null @@ -1,341 +0,0 @@ -#!/usr/bin/perl - -use strict; -use Data::Dumper; - - -my $usage = "script/runset.pl [--clean] jobs/some/job blah\n"; - -my $clean; -my $use_srun = 0; -my $nobg = '&'; -my $in = shift || die $usage; -if ($in eq '--clean') { - $clean = 1; - $in = shift || die $usage; -} -if ($in eq '--srun') { - $use_srun = 1; - $in = shift || die $usage; -} -if ($in eq '--nobg') { - $nobg = ''; - $in = shift || die $usage; -} -my $tag = shift || die $usage; -my $fake = shift; - - -my ($job) = $in =~ /^jobs\/(.*)/; -my ($jname) = $job =~ /\/(\w+)$/; -$jname ||= $job; -die "not jobs/?" 
unless defined $job; -my $out = "log/$job.$tag"; -my $relout = "$job.$tag"; - - -my $cwd = `/bin/pwd`; -chomp($cwd); - - - -print "# --- job $job, tag $tag ---\n"; - - -# get input -my $raw = `cat $in`; -my $sim = eval $raw; -unless (ref $sim) { - print "bad input: $in\n"; - system "perl -c $in"; - exit 1; -} - -# prep output -system "mkdir -p $out" unless -d "$out"; - -open(W, ">$out/in"); -print W $raw; -close W; - -my $comb = $sim->{'_comb'}; -delete $sim->{'_comb'}; -my %filters; -my @fulldirs; - - - -sub reset { - print "reset: restarting mpd in 3 seconds\n"; - system "sleep 3 && (mpiexec -l -n 32 killall newsyn ; restartmpd.sh)"; - print "reset: done\n"; -} - - -if (`hostname` =~ /alc/ && !$use_srun) { - print "# this looks like alc\n"; - $sim->{'_psub'} = 'jobs/alc.tp'; -} - - -sub iterate { - my $sim = shift @_; - my $fix = shift @_ || {}; - my $vary; - my @r; - - my $this; - for my $k (sort keys %$sim) { - #next if $k =~ /^_/; - if (defined $fix->{$k}) { - $this->{$k} = $fix->{$k}; - } - elsif (ref $sim->{$k} eq 'HASH') { - # nothing - } - elsif ($k =~ /^_/ || !(ref $sim->{$k})) { - $this->{$k} = $sim->{$k}; - } - else { - #print ref $sim->{$k}; - if (!(defined $vary)) { - $vary = $k; - } - } - } - - if ($vary) { - #print "vary $vary\n"; - for my $v (@{$sim->{$vary}}) { - $this->{$vary} = $v; - push(@r, &iterate($sim, $this)); - } - } else { - - if ($sim->{'_dep'}) { - my @s = @{$sim->{'_dep'}}; - while (@s) { - my $dv = shift @s; - my $eq = shift @s; - - $eq =~ s/\$(\w+)/"\$this->{'$1'}"/eg; - $this->{$dv} = eval $eq; - #print "$dv : $eq -> $this->{$dv}\n"; - } - } - - push(@r, $this); - } - return @r; -} - - - -sub run { - my $h = shift @_; - - my @fn; - my @filt; - my @vals; - for my $k (sort keys %$sim) { - next if $k =~ /^_/; - next unless ref $sim->{$k} eq 'ARRAY'; - push(@fn, "$k=$h->{$k}"); - push(@vals, $h->{$k}); - next if $comb && $k eq $comb->{'x'}; - push(@filt, "$k=$h->{$k}"); - } - my $keys = join(",", @fn); - $keys =~ s/ /_/g; - my $fn = 
$out . '/' . $keys; - my $name = $jname . '_' . join('_',@vals); #$tag . '_' . $keys; - - push( @fulldirs, "" . $fn ); - - - # filters - $filters{ join(',', @filt) } = 1; - - - #system "sh $fn/sh.post" if -e "$fn/sh.post";# && !(-e "$fn/.post"); - - if (-e "$fn/.done") { - print "already done.\n"; - return; - } - system "rm -r $fn" if $clean && -d "$fn"; - system "mkdir $fn" unless -d "$fn"; - system "mkdir $fn/out" unless -d "$fn/out"; - - my $e = './newsyn'; - #$e = './tcpsynobfs' if $h->{'fs'} eq 'obfs'; - my $c = "$e"; - $c .= " --mkfs" unless $h->{'_no_mkfs'}; - - for my $k (keys %$h) { - next if $k =~ /^_/; - next if $h->{'_noarg'} && grep {$k eq $_} @{$h->{'_noarg'}}; - next if $h->{'_subst'} && grep {$k eq $_} @{$h->{'_subst'}}; - $c .= " --$k $h->{$k}"; - } - - if ($h->{'_custom'}) { - if ($h->{'_subst'}) { - for my $var (@{$h->{'_subst'}}) { - $h->{'_custom'} =~ s/\$$var/$h->{$var}/g; - } - } - $c .= ' ' . $h->{'_custom'}; - } - - $c .= " --log_name $relout/$keys"; - $c .= " --doutdir log/$relout/$keys/out"; - - my $post = "#!/bin/sh -script/sum.pl -start $h->{'_start'} -end $h->{'_end'} $fn/osd\\* > $fn/sum.osd -script/sum.pl -start $h->{'_start'} -end $h->{'_end'} $fn/mds? $fn/mds?? > $fn/sum.mds -script/sum.pl -start $h->{'_start'} -end $h->{'_end'} $fn/mds*.log > $fn/sum.mds.log -script/sum.pl -start $h->{'_start'} -end $h->{'_end'} $fn/clnode* > $fn/sum.cl -touch $fn/.post -"; - open(O,">$fn/sh.post"); - print O $post; - close O; - - my $killmin; - if ($h->{'_kill_after'}) { - $killmin = 1 + int ($h->{'_kill_after'} / 60); - $killmin = "-t $killmin"; - } - - $c = "bash -c \"ulimit -c 0 ; $c\""; - #$c = "bash -c \"$c\""; - - #print "h keys are " . join(' ', sort keys %$h) . "\n"; - - my $srun = "srun --wait=600 -x jobs/ltest.ignore -l $killmin -N $h->{'_n'} -p ltest"; - my $mpiexec = "mpiexec -l -n $h->{'_n'}"; - my $launch; - if ($use_srun) { - $launch = $srun; - } else { - $launch = $mpiexec; - } - - if ($sim->{'_psub'}) { - # template! 
- my $tp = `cat $sim->{'_psub'}`; - $tp =~ s/\$CWD/$cwd/g; - $tp =~ s/\$NAME/$name/g; - $tp =~ s/\$NUM/$h->{'_n'}/g; - $tp =~ s/\$OUT/$fn\/o/g; - $tp =~ s/\$DONE/$fn\/.done/g; - $tp =~ s/\$CMD/$c/g; - open(O,">$out/$name"); - print O $tp; - close O; - print "\npsub $out/$name\n"; - return; - } else { - # run - my $cmd = "\n$launch $c > $fn/o && touch $fn/.done";# - #my $cmd = "\n$launch $c > $fn/o ; touch $fn/.done"; - print "$cmd $nobg\n"; - my $r = undef; - unless ($fake) { - if ($sim->{'_pre'}) { - print "pre: $launch $sim->{'_pre'}\n"; - system "$launch $sim->{'_pre'}"; - } - $r = system $cmd; - if ($sim->{'_post'}) { - print "post: $launch $sim->{'_post'}\n"; - system "$launch $sim->{'_post'}"; - } - if ($r) { - print "r = $r\n"; - #&reset; - } - system "sh $fn/sh.post"; - } - return $r; - } -} - - - -my @r = &iterate($sim); -my $n = scalar(@r); -my $c = 1; -my %r; -my $nfailed = 0; -for my $h (@r) { - my $d = `date`; - chomp($d); - $d =~ s/ P.T .*//; - print "# === $c/$n"; - print " ($nfailed failed)" if $nfailed; - print " $d: "; - my $r = &run($h); - - if (!(defined $r)) { - # already done - } else { - if ($r) { - $nfailed++; - } - print "sleep $h->{'_sleep'}\n"; - sleep $h->{'_sleep'}; - } - - $c++; -} -print "$nfailed failed\n"; - - -my @comb; -if ($comb) { - my $x = $comb->{'x'}; - my @vars = @{$comb->{'vars'}}; - - print "\n\n# post\n"; - for my $p (@fulldirs) { - print "sh $p/sh.post\n"; - } - - my @filters = sort keys %filters; - my $cmd = "script/comb.pl $x @vars - @fulldirs - @filters > $out/c"; - print "$cmd\n"; - open(O,">$out/comb"); - print O "$cmd\n"; - close O; - system $cmd; - - print "\n\n"; - - my $plot; - $plot .= "set data style linespoints;\n"; - my $s = 2; - for my $v (@vars) { - my $c = $s; - $s++; - my @p; - for my $f (@filters) { - my $t = $f; - if ($comb->{'maptitle'}) { - for my $a (keys %{$comb->{'maptitle'}}) { - my $b = $comb->{'maptitle'}->{$a}; - $t =~ s/$a/$b/; - } - } - push (@p, "\"$out/c\" u 1:$c t \"$t\"" ); - $c += 
scalar(@vars); - } - $plot .= "# $v\nplot " . join(", ", @p) . ";\n\n"; - } - print $plot; - open(O,">$out/plot"); - print O $plot; - close O; -} - diff --git a/branches/sage/ebofs2/script/runset.pl b/branches/sage/ebofs2/script/runset.pl deleted file mode 100755 index 966cf4e5100cb..0000000000000 --- a/branches/sage/ebofs2/script/runset.pl +++ /dev/null @@ -1,380 +0,0 @@ -#!/usr/bin/perl - -use strict; -use Data::Dumper; - -=item sample input file - -# hi there -{ - # startup - 'n' => 30, # mpi nodes - 'sleep' => 10, # seconds between runs - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => 400,#[10, 50, 100, 200, 400], - - # parameters - 'fs' => [ 'ebofs', 'fakestore' ], - 'until' => 150, # --syn until $n ... when to stop clients - 'writefile' => 1, - 'writefile_size' => [ 4096, 65526, 256000, 1024000, 2560000 ], - 'writefile_mb' => 1000, - - 'custom' => '--tcp_skip_rank0 --osd_maxthreads 0'; - - # for final summation (script/sum.pl) - 'start' => 30, - 'end' => 120, - - '_psub' => 'alc.tp' # switch to psub mode! -}; - -=cut - -my $usage = "script/runset.pl [--clean] jobs/some/job blah\n"; - -my $clean; -my $use_srun; -my $nobg = '&'; -my $in = shift || die $usage; -if ($in eq '--clean') { - $clean = 1; - $in = shift || die $usage; -} -if ($in eq '--srun') { - $use_srun = 1; - $in = shift || die $usage; -} -if ($in eq '--nobg') { - $nobg = ''; - $in = shift || die $usage; -} -my $tag = shift || die $usage; -my $fake = shift; - - -my ($job) = $in =~ /^jobs\/(.*)/; -my ($jname) = $job =~ /\/(\w+)$/; -$jname ||= $job; -die "not jobs/?" 
unless defined $job; -my $out = "log/$job.$tag"; -my $relout = "$job.$tag"; - - -my $cwd = `/bin/pwd`; -chomp($cwd); - - - -print "# --- job $job, tag $tag ---\n"; - - -# get input -my $raw = `cat $in`; -my $sim = eval $raw; -unless (ref $sim) { - print "bad input: $in\n"; - system "perl -c $in"; - exit 1; -} - -# prep output -system "mkdir -p $out" unless -d "$out"; - -open(W, ">$out/in"); -print W $raw; -close W; - -my $comb = $sim->{'comb'}; -delete $sim->{'comb'}; -my %filters; -my @fulldirs; - - - -sub reset { - print "reset: restarting mpd in 3 seconds\n"; - system "sleep 3 && (mpiexec -l -n 32 killall newsyn ; restartmpd.sh)"; - print "reset: done\n"; -} - - -if (`hostname` =~ /alc/ && !$use_srun) { - print "# this looks like alc\n"; - $sim->{'_psub'} = 'jobs/alc.tp'; -} - - -sub iterate { - my $sim = shift @_; - my $fix = shift @_ || {}; - my $vary; - my @r; - - my $this; - for my $k (sort keys %$sim) { - next if $k =~ /^_/; - if (defined $fix->{$k}) { - $this->{$k} = $fix->{$k}; - } - elsif (ref $sim->{$k} eq 'HASH') { - # nothing - } - elsif (!(ref $sim->{$k})) { - $this->{$k} = $sim->{$k}; - } - else { - #print ref $sim->{$k}; - if (!(defined $vary)) { - $vary = $k; - } - } - } - - if ($vary) { - #print "vary $vary\n"; - for my $v (@{$sim->{$vary}}) { - $this->{$vary} = $v; - push(@r, &iterate($sim, $this)); - } - } else { - - if ($sim->{'_dep'}) { - my @s = @{$sim->{'_dep'}}; - while (@s) { - my $dv = shift @s; - my $eq = shift @s; - - $eq =~ s/\$(\w+)/"\$this->{'$1'}"/eg; - $this->{$dv} = eval $eq; - #print "$dv : $eq -> $this->{$dv}\n"; - } - } - - push(@r, $this); - } - return @r; -} - - - -sub run { - my $h = shift @_; - - my @fn; - my @filt; - my @vals; - for my $k (sort keys %$sim) { - next if $k =~ /^_/; - next unless ref $sim->{$k} eq 'ARRAY'; - push(@fn, "$k=$h->{$k}"); - push(@vals, $h->{$k}); - next if $comb && $k eq $comb->{'x'}; - push(@filt, "$k=$h->{$k}"); - } - my $keys = join(",", @fn); - $keys =~ s/ /_/g; - my $fn = $out . '/' . 
$keys; - my $name = $jname . '_' . join('_',@vals); #$tag . '_' . $keys; - - push( @fulldirs, "" . $fn ); - - - # filters - $filters{ join(',', @filt) } = 1; - - - #system "sh $fn/sh.post" if -e "$fn/sh.post";# && !(-e "$fn/.post"); - if (-e "$fn/.done") { - print "already done.\n"; - return; - } - system "rm -r $fn" if $clean && -d "$fn"; - system "mkdir $fn" unless -d "$fn"; - - my $e = './newsyn'; - #$e = './tcpsynobfs' if $h->{'fs'} eq 'obfs'; - my $c = "$e"; - $c .= " --mkfs" unless $h->{'no_mkfs'}; - $c .= " --$h->{'fs'}" if $h->{'fs'}; - $c .= " --syn until $h->{'until'}" if $h->{'until'}; - - $c .= " --syn writefile $h->{'writefile_mb'} $h->{'writefile_size'}" if $h->{'writefile'}; - $c .= " --syn rw $h->{'rw_mb'} $h->{'rw_size'}" if $h->{'rw'}; - $c .= " --syn readfile $h->{'readfile_mb'} $h->{'readfile_size'}" if $h->{'readfile'}; - $c .= " --syn makedirs $h->{'makedirs_dirs'} $h->{'makedirs_files'} $h->{'makedirs_depth'}" if $h->{'makedirs'}; - - if ($h->{'ebofs_freelist'}) { - system "cp freelist/ebofs.freelist.$h->{'ebofs_freelist'} ebofs.freelist"; - $c .= " --osd_age_time -1"; - } - - for my $k ('nummds', 'numclient', 'numosd', 'kill_after', - 'osd_maxthreads', 'osd_object_layout', 'osd_pg_layout','osd_pg_bits', - 'mds_bal_rep', 'mds_bal_interval', 'mds_bal_max','mds_decay_halflife', - 'mds_bal_hash_rd','mds_bal_hash_wr','mds_bal_unhash_rd','mds_bal_unhash_wr', - 'mds_cache_size','mds_log_max_len', - 'mds_local_osd', - 'osd_age_time','osd_age', - 'osd_rep', - 'osd_pad_pg_log','ebofs_realloc', - 'osd_balance_reads', - 'tcp_multi_out', - 'client_cache_stat_ttl','client_cache_readdir_ttl', - 'client_oc', - 'fake_osdmap_updates', - 'bdev_el_bidir', 'ebofs_idle_commit_ms', 'ebofs_commit_ms', - 'ebofs_oc_size','ebofs_cc_size','ebofs_bc_size','ebofs_bc_max_dirty','ebofs_abp_max_alloc', - 'file_layout_ssize','file_layout_scount','file_layout_osize','file_layout_num_rep', - 
'meta_dir_layout_ssize','meta_dir_layout_scount','meta_dir_layout_osize','meta_dir_layout_num_rep', - 'meta_log_layout_ssize','meta_log_layout_scount','meta_log_layout_osize','meta_log_layout_num_rep') { - $c .= " --$k $h->{$k}" if defined $h->{$k}; - } - - $c .= ' ' . $h->{'custom'} if $h->{'custom'}; - - $c .= " --log_name $relout/$keys"; - - my $post = "#!/bin/sh -script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/osd\\* > $fn/sum.osd -script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/mds? $fn/mds?? > $fn/sum.mds -script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/mds*.log > $fn/sum.mds.log -script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/clnode* > $fn/sum.cl -touch $fn/.post -"; - open(O,">$fn/sh.post"); - print O $post; - close O; - - my $killmin = 1 + int ($h->{'kill_after'} / 60); - - $c = "bash -c \"ulimit -c 0 ; $c\""; - #$c = "bash -c \"$c\""; - - my $srun = "srun --wait=600 --exclude=jobs/ltest.ignore -l -t $killmin -N $h->{'n'} -p ltest"; - my $mpiexec = "mpiexec -l -n $h->{'n'}"; - my $launch; - if ($use_srun) { - $launch = $srun; - } else { - $launch = $mpiexec; - } - - if ($sim->{'_psub'}) { - # template! 
- my $tp = `cat $sim->{'_psub'}`; - $tp =~ s/\$CWD/$cwd/g; - $tp =~ s/\$NAME/$name/g; - $tp =~ s/\$NUM/$h->{'n'}/g; - $tp =~ s/\$OUT/$fn\/o/g; - $tp =~ s/\$DONE/$fn\/.done/g; - $tp =~ s/\$CMD/$c/g; - open(O,">$out/$name"); - print O $tp; - close O; - print "\npsub $out/$name\n"; - return; - } else { - # run - my $cmd = "\n$launch $c > $fn/o && touch $fn/.done";# - #my $cmd = "\n$launch $c > $fn/o ; touch $fn/.done"; - print "$cmd $nobg\n"; - my $r = undef; - unless ($fake) { - if ($sim->{'_pre'}) { - print "pre: $launch $sim->{'_pre'}\n"; - system "$launch $sim->{'_pre'}"; - } - $r = system $cmd; - if ($sim->{'_post'}) { - print "post: $launch $sim->{'_post'}\n"; - system "$launch $sim->{'_post'}"; - } - if ($r) { - print "r = $r\n"; - #&reset; - } - system "sh $fn/sh.post"; - } - return $r; - } -} - - - -my @r = &iterate($sim); -my $n = scalar(@r); -my $c = 1; -my %r; -my $nfailed = 0; -for my $h (@r) { - my $d = `date`; - chomp($d); - $d =~ s/ P.T .*//; - print "# === $c/$n"; - print " ($nfailed failed)" if $nfailed; - print " $d: "; - my $r = &run($h); - - if (!(defined $r)) { - # already done - } else { - if ($r) { - $nfailed++; - } - print "sleep $h->{'sleep'}\n"; - sleep $h->{'sleep'}; - } - - $c++; -} -print "$nfailed failed\n"; - - -my @comb; -if ($comb) { - my $x = $comb->{'x'}; - my @vars = @{$comb->{'vars'}}; - - print "\n\n# post\n"; - for my $p (@fulldirs) { - print "sh $p/sh.post\n"; - } - - my @filters = sort keys %filters; - my $cmd = "script/comb.pl $x @vars - @fulldirs - @filters > $out/c"; - print "$cmd\n"; - open(O,">$out/comb"); - print O "$cmd\n"; - close O; - system $cmd; - - print "\n\n"; - - my $plot; - $plot .= "set data style linespoints;\n"; - my $s = 2; - for my $v (@vars) { - my $c = $s; - $s++; - my @p; - for my $f (@filters) { - my $t = $f; - if ($comb->{'maptitle'}) { - for my $a (keys %{$comb->{'maptitle'}}) { - my $b = $comb->{'maptitle'}->{$a}; - $t =~ s/$a/$b/; - } - } - push (@p, "\"$out/c\" u 1:$c t \"$t\"" ); - $c += 
scalar(@vars); - } - $plot .= "# $v\nplot " . join(", ", @p) . ";\n\n"; - } - print $plot; - open(O,">$out/plot"); - print O $plot; - close O; -} - diff --git a/branches/sage/ebofs2/script/smooth.pl b/branches/sage/ebofs2/script/smooth.pl deleted file mode 100755 index 6cfbaf60ff921..0000000000000 --- a/branches/sage/ebofs2/script/smooth.pl +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/perl - -my $n = shift @ARGV || 2; - -my %v; # t -> [..] -while (<>) { - chomp; - my @l = split(/\t/,$_); - my $t = shift @l; - if (int $t) { - $v{$t} = \@l; - } else { - print "$_\n"; - } -} - -for my $t (sort {$a <=> $b} keys %v) { - my $s = $t - $n/2; - my @v; - my $c = 0; - for (my $a=0; $a < $n; $a++) { - my $x = $t + $a; - next unless ($v{$x}); - my @o = @{$v{$x}}; - #print "$t: $x o @o\n"; - if (@v) { - for (my $y=0; $y<=$#o; $y++) { - $v[$y] += $o[$y]; - } - } else { - @v = @o; - } - #print "$t: $x v @v\n"; - $c++; - } - print "$t"; - for my $sum (@v) { - print "\t" . ($sum / $c); - } - print "\n"; -} diff --git a/branches/sage/ebofs2/script/study_find.pl b/branches/sage/ebofs2/script/study_find.pl deleted file mode 100755 index 6e6cccdf37c89..0000000000000 --- a/branches/sage/ebofs2/script/study_find.pl +++ /dev/null @@ -1,224 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my $name = shift @ARGV || die; - -my $nfiles = 0; -my $ndirs = 0; -my $nreg = 0; -my $nhardlinks = 0; -my %nlinks; -my %ino_nlinks; -my %names; -my %dirsize; - -my %fnlen; - -my %hdepth; - -my $bytes; -my $ebytes; - -# -# output generated with -# -# find . -path ./.snapshot -prune -o -exec ls -dilsn --time-style=+%s \{\} \; -# -# find output looks like this: -#4495744 4 drwxrwxrwx 24 0 0 4096 1187290970 . 
-#2996320 8 drwxr-xr-x 189 0 1000 8192 1186594257 ./jangle -#28378499 4 drwxr-x--x 4 1068885 52673 4096 1162938122 ./jangle/cymcruise -#28378500 4 drwx--S--- 5 1068885 52673 4096 1162938122 ./jangle/cymcruise/Maildir -#28378501 4 drwx------ 2 1068885 52673 4096 1162938122 ./jangle/cymcruise/Maildir/tmp -#28378502 4 drwx------ 2 1068885 52673 4096 1162938122 ./jangle/cymcruise/Maildir/new -#28378503 4 drwx------ 2 1068885 52673 4096 1162938122 ./jangle/cymcruise/Maildir/cur -#28378504 4 -rw-r--r-- 1 1068885 52673 260 943743700 ./jangle/cymcruise/.alias -#999425 4 drwxr-xr-x 92 1125 100 4096 1186523060 . -#999426 0 lrwxrwxrwx 1 0 0 5 1177701093 ./root -> /root -#1015809 4 drwxr-xr-x 4 1289 1000 4096 1174584949 ./andrea -#541007 4 drwxr-xr-x 3 0 0 4096 1173111449 ./andrea/lux -#5014055 4 drwx--S--- 11 70228 51207 4096 1172250346 ./andrea/lux/Maildir - -# dirs we're currently counting in -my %numindir; - -sub finish_dir { - my $curdir = shift @_; - #print "finish_dir $numindir{$curdir} in $curdir\n"; - $dirsize{$numindir{$curdir}}++; - $ndirs++; - delete $numindir{$curdir}; -} - -my $curdir; -while (<>) { - #print; - chomp; - my ($ino, $blah, $mode, $nlink, $uid, $gid, $size, $mtime, @path) = split(/[ ]+/,$_); - my $file = join(' ',@path); - ($file) = split(/ \-\> /, $file); # ignore symlink dest - my @bits = split(/\//, $file); - my $depth = scalar(@bits); - my $f = pop @bits; - my $dir = join('/', @bits); - #print "file = '$file', dir = '$dir', curdir = '$curdir'\n"; - - if ($dir ne $curdir) { - for my $d (keys %numindir) { - #print "? $d vs $dir\n"; - &finish_dir($d) if ($d ne substr($dir, 0, length($d))); - } - $curdir = $dir; - } - - my $esize = 0; - $esize = int (($size-1)/4096)*4096 + 4096 if $size > 0; - $esize += 160; # for the inode? 
- $bytes += $size; - $ebytes += $esize; - - $nfiles++; - $numindir{$dir}++; - - $hdepth{$depth}++; - - my $fnlen = length($f); - $fnlen{$fnlen}++; - - if ($mode =~ /^d/) { - # find does depth-first search, so assume we descend, so that on empty dir we "back out" above and &finish_dir. - $numindir{$file} = 0; - $curdir = $file; - } else { - $nreg++ if $mode =~ /^f/; - if ($nlink > 1) { - #system "ls -aldi $file"; - $nhardlinks++; - $nlinks{$nlink}++; - $ino_nlinks{$ino} = $nlink; - push(@{$names{$ino}->{$dir}}, $file); - } - } -} -for my $d (keys %numindir) { - &finish_dir($d); -} - - - -my $nsamedir = 0; -open(LOG, ">$name.log"); -my %dirmap; # from dir -> to dir -for my $ino (keys %names) { - print LOG "# $ino\n"; - my @dirs = keys %{$names{$ino}}; - my $insamedir = 1 if scalar(@dirs) == 1; - for my $dir (@dirs) { - print LOG "#\t$dir\n"; - for my $fn (@{$names{$ino}->{$dir}}) { - print LOG "#\t\t$fn\n"; - $nsamedir++ if $insamedir; - } - } - - # stick in dirmap - for (my $i=0; $i<$#dirs; $i++) { - for (my $j=1; $j <= $#dirs; $j++) { - print LOG "# $dirs[$i] <-> $dirs[$j]\n"; - push(@{$dirmap{$dirs[$i]}->{$dirs[$j]}}, $ino); - push(@{$dirmap{$dirs[$j]}->{$dirs[$i]}}, $ino); - } - } -} - - -my $notherinsamedir = 0; -my $notherinsamedirs = 0; -for my $ino (keys %names) { - my @dirs = keys %{$names{$ino}}; - next unless (scalar(@dirs) > 1); - my $n = 0; - my $np = 0; - for (my $i=0; $i<$#dirs; $i++) { - for (my $j=$i+1; $j <= $#dirs; $j++) { - $np++; - if (scalar(@{$dirmap{$dirs[$i]}->{$dirs[$j]}}) > 1 || - scalar(@{$dirmap{$dirs[$j]}->{$dirs[$i]}}) > 1) { - $n++; - #print LOG "# $ino is not alone between $dirs[$i] and $dirs[$j] : @{$dirmap{$dirs[$j]}->{$dirs[$i]}}\n"; - } - } - } - if ($n) { - print LOG "# $ino\tfor $n / $np dir pairs, there is another hl between the same pair of dirs\n"; - $notherinsamedir += $ino_nlinks{$ino}; - $notherinsamedirs += ($n / $np) * $ino_nlinks{$ino}; - } else { - print LOG "# $ino is ALL ALONE\n"; - } -} -close LOG; 
-$notherinsamedirs = sprintf("%.1f",$notherinsamedirs); - - -sub do_cdf { - my $hash = shift @_; - my $num = shift @_; - my $fn = shift @_; - - open(CDF, ">$fn") if $fn; - print CDF "# $name\n"; - - my $median; - my $sum = 0; - my $p = 0; - my $lastv = 0; - for my $v (sort {$a <=> $b} keys %$hash) { - print CDF "$v\t$hash->{$v}\n"; - $p += $hash->{$v}; - $sum += $hash->{$v} * $v; - if (!(defined $median) && - $p >= ($num/2)) { - $median = $v; - } - } - if ($p != $num) { - warn "uh oh, BUG, $p != $num in cdf/median calculation\n"; - } - my $avg = sprintf("%.2f", $sum/$num); - print CDF "# avg $avg, median $median, sum $sum, num $num\n"; - return ($avg, $median); -} -close DSLOG; - - -# do cdfs -my ($avgdirsize, $mediandirsize) = &do_cdf(\%dirsize, $ndirs, "$name.ds"); -my ($avgfnlen, $medianfnlen) = &do_cdf(\%fnlen, $nfiles, "$name.fnlen"); -my ($avgdepth, $mediandepth) = &do_cdf(\%hdepth, $nfiles, "$name.hdepth"); - - -# stat fs -#my $df = `df $base`; -#my $line = (split(/\n/,$df))[1]; # second line -#my ($kb) = $df =~ /\s+\d+\s+(\d+)/; -my $gb = sprintf("%.1f",($ebytes / 1024 / 1024 / 1024)); - -open(O, ">$name.sum"); - -# final line -my $pad = '# ' . 
(' ' x (length($name)-2)); -print O "$pad\tgb\tfiles\tdirs\tdsavg\tdsmed\tfnavg\tfnmed\treg\tnl>1\tsmdr\tothers\totherss\tnlink=2\t=3\t=4\t...\n"; -print O "$name\t$gb\t$nfiles\t$ndirs\t$avgdirsize\t$mediandirsize\t$avgfnlen\t$medianfnlen\t$nreg\t$nhardlinks\t$nsamedir\t$notherinsamedir\t$notherinsamedirs"; -my $i = 2; -for (sort {$a <=> $b} keys %nlinks) { - while ($_ < $i) { - print O "\t0"; - } - print O "\t$nlinks{$_}"; - $i = $_ + 1; -} -print O "\n"; - -close O; diff --git a/branches/sage/ebofs2/script/study_hardlink_lifetimes.pl b/branches/sage/ebofs2/script/study_hardlink_lifetimes.pl deleted file mode 100755 index 012ef6009bb43..0000000000000 --- a/branches/sage/ebofs2/script/study_hardlink_lifetimes.pl +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my %ns; # parent -> fn -> ino -my %nlink; # num links to each ino -my %since; # when it got its second link - -my @ignore = ('ll_getattr','ll_setattr','ll_forget','ll_fsync','ll_readlink','ll_statfs','ll_opendir','ll_releasedir','ll_flush','ll_release','ll_open','ll_read','ll_write'); - -my $when; - -my $sumage; -my $numage; - -sub unlink { - my ($p,$n) = @_; - my $i = $ns{$p}->{$n}; - my $new = --$nlink{$i}; - if ($new == 1) { - my $age = $when - $since{$i}; - #print "$since{$i} to $when on $i\t$age\n"; - delete $since{$i}; - - $numage++; - $sumage += $age; - - } elsif ($new == 0) { - delete $nlink{$i}; - } - delete $ns{$p}->{$n}; -} - - -my ($sec, $usec, $cmd); -$_ = <>; -while (1) { - # read trace record - chomp; - last unless $_ eq '@'; - - chomp(my $sec = <>); - chomp(my $usec = <>); - $when = sprintf("%d.%06d",$sec,$usec);# + ($usec / 1000000); - #$when = "$sec.$usec"; - - chomp($cmd = <>); - - #print "cmd $cmd\n"; - - if ($cmd eq 'll_lookup') { - chomp(my $p = <>); - chomp(my $n = <>); - chomp(my $r = <>); - $ns{$p}->{$n} = $r; - } - - elsif ($cmd eq 'll_create') { - chomp(my $p = <>); - chomp(my $n = <>); - <>; <>; <>; - chomp(my $r = <>); - $ns{$p}->{$n} = $r; - $nlink{$r} = 1; - } 
- elsif ($cmd eq 'll_mknod') { - chomp(my $p = <>); - chomp(my $n = <>); - <>; <>; - chomp(my $r = <>); - $ns{$p}->{$n} = $r; - $nlink{$r} = 1; - } - elsif ($cmd eq 'll_mkdir') { - chomp(my $p = <>); - chomp(my $n = <>); - <>; - chomp(my $r = <>); - $ns{$p}->{$n} = $r; - $nlink{$r} = 1; - } - elsif ($cmd eq 'll_symlink') { - chomp(my $p = <>); - chomp(my $n = <>); - <>; - chomp(my $r = <>); - $ns{$p}->{$n} = $r; - $nlink{$r} = 1; - } - elsif ($cmd eq 'll_link') { - chomp(my $i = <>); - chomp(my $p = <>); - chomp(my $n = <>); - $ns{$p}->{$n} = $i; - if (++$nlink{$i} == 2) { - $since{$i} = $when; - } - } - elsif ($cmd eq 'll_unlink' || - $cmd eq 'll_rmdir') { - chomp(my $p = <>); - chomp(my $n = <>); - &unlink($p, $n); - } - elsif ($cmd eq 'll_rename') { - chomp(my $p = <>); - chomp(my $n = <>); - chomp(my $np = <>); - chomp(my $nn = <>); - if ($ns{$np}->{$nn}) { - &unlink($np, $nn); - } - $ns{$np}->{$nn} = $ns{$p}->{$n}; - delete $ns{$p}->{$n}; - } - - # skip to @ - while (<>) { - last if $_ eq "@\n"; - print "$cmd: $_" - unless grep {$_ eq $cmd} @ignore; - } -} - -print "num $numage .. sum $sumage .. avg lifetime " . ($sumage / $numage) . 
"\n"; - -# dump hard link inos -for my $ino (keys %nlink) { - next if $nlink{$ino} < 2; - print "$ino\t$nlink{$ino}\n"; -} diff --git a/branches/sage/ebofs2/script/study_lookups.pl b/branches/sage/ebofs2/script/study_lookups.pl deleted file mode 100644 index 7a0784f3210a4..0000000000000 --- a/branches/sage/ebofs2/script/study_lookups.pl +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my @buckets = (1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096); - -my %dir_miss; # dir(ino) -> last lookup miss -my %dir_has; # ino -> dentries we have - - -my %ns; # parent -> fn -> ino -my %nlink; # num links to each ino -my %since; # when it got its second link - -my @ignore = ('ll_getattr','ll_setattr','ll_forget','ll_fsync','ll_readlink','ll_statfs','ll_opendir','ll_releasedir','ll_flush','ll_release','ll_open','ll_read','ll_write'); - -my $when; - -my $sumage; -my $numage; - -sub unlink { - my ($p,$n) = @_; - my $i = $ns{$p}->{$n}; - my $new = --$nlink{$i}; - if ($new == 1) { - my $age = $when - $since{$i}; - #print "$since{$i} to $when on $i\t$age\n"; - delete $since{$i}; - - $numage++; - $sumage += $age; - - } elsif ($new == 0) { - delete $nlink{$i}; - } - delete $ns{$p}->{$n}; -} - - -my ($sec, $usec, $cmd); -$_ = <>; -while (1) { - # read trace record - chomp; - last unless $_ eq '@'; - - chomp(my $sec = <>); - chomp(my $usec = <>); - $when = sprintf("%d.%06d",$sec,$usec);# + ($usec / 1000000); - #$when = "$sec.$usec"; - - chomp($cmd = <>); - - #print "cmd $cmd\n"; - - if ($cmd eq 'll_lookup') { - chomp(my $p = <>); - chomp(my $n = <>); - chomp(my $r = <>); - $ns{$p}->{$n} = $r; - } - - elsif ($cmd eq 'll_create') { - chomp(my $p = <>); - chomp(my $n = <>); - <>; <>; <>; - chomp(my $r = <>); - $ns{$p}->{$n} = $r; - $nlink{$r} = 1; - } - elsif ($cmd eq 'll_mknod') { - chomp(my $p = <>); - chomp(my $n = <>); - <>; <>; - chomp(my $r = <>); - $ns{$p}->{$n} = $r; - $nlink{$r} = 1; - } - elsif ($cmd eq 'll_mkdir') { - chomp(my $p = <>); - chomp(my $n = 
<>); - <>; - chomp(my $r = <>); - $ns{$p}->{$n} = $r; - $nlink{$r} = 1; - } - elsif ($cmd eq 'll_symlink') { - chomp(my $p = <>); - chomp(my $n = <>); - <>; - chomp(my $r = <>); - $ns{$p}->{$n} = $r; - $nlink{$r} = 1; - } - elsif ($cmd eq 'll_link') { - chomp(my $i = <>); - chomp(my $p = <>); - chomp(my $n = <>); - $ns{$p}->{$n} = $i; - if (++$nlink{$i} == 2) { - $since{$i} = $when; - } - } - elsif ($cmd eq 'll_unlink' || - $cmd eq 'll_rmdir') { - chomp(my $p = <>); - chomp(my $n = <>); - &unlink($p, $n); - } - elsif ($cmd eq 'll_rename') { - chomp(my $p = <>); - chomp(my $n = <>); - chomp(my $np = <>); - chomp(my $nn = <>); - if ($ns{$np}->{$nn}) { - &unlink($np, $nn); - } - $ns{$np}->{$nn} = $ns{$p}->{$n}; - delete $ns{$p}->{$n}; - } - - # skip to @ - while (<>) { - last if $_ eq "@\n"; - print "$cmd: $_" - unless grep {$_ eq $cmd} @ignore; - } -} - -print "num $numage .. sum $sumage .. avg lifetime " . ($sumage / $numage) . "\n"; - -# dump hard link inos -for my $ino (keys %nlink) { - next if $nlink{$ino} < 2; - print "$ino\t$nlink{$ino}\n"; -} diff --git a/branches/sage/ebofs2/script/sum.pl b/branches/sage/ebofs2/script/sum.pl deleted file mode 100755 index 92ef9a9b222a8..0000000000000 --- a/branches/sage/ebofs2/script/sum.pl +++ /dev/null @@ -1,148 +0,0 @@ -#!/usr/bin/perl - -use strict; -my $starttime = 1; -my $endtime = -1; - -my $avgrows = 0; - -while ($ARGV[0] =~ /^-/) { - $_ = shift @ARGV; - if ($_ eq '-avg') { - $avgrows = 1; - } - elsif ($_ eq '-start') { - $starttime = shift @ARGV; - } - elsif ($_ eq '-end') { - $endtime = shift @ARGV; - } - else { - die "i don't understand arg $_"; - } -} -my @files = @ARGV; - -if (scalar(@files) == 1 && $files[0] =~ /\*/) { - my ($dir, $pat) = $files[0] =~ /^(.*)\/([^\/]+)$/; - @files = (); - $pat =~ s/\*//; -# print "dir $dir pat $pat\n"; - opendir(D,"$dir"); - for my $f (readdir(D)) { - # print "$f\n"; - next unless $f =~ /^$pat/; - push(@files, "$dir/$f"); - } - closedir(D); - -# print "files = @files\n"; -} - -my 
@data; -for my $f (@files) { - open(I,$f); - push( @data, ); - close I; -} - -my %sum; # time -> name -> val -my %col; # colnum -> name .. colnums start at 0 (time doesn't count) -my %min; -my %max; -my %avg; -my %tcount; -my $files; -for (@data) { - chomp; - my @r = split(/\s+/,$_); - my $r = shift @r; - - # column headings? - if ($r =~ /^\#/) { - my $num = 0; - while (my $name = shift @r) { - $col{$num} = $name; - $num++; - } - next; - } - - next unless int $r; - next if $r < $starttime; - next if $endtime > 0 && $r > $endtime; - - $tcount{$r}++; - $files = $tcount{$r} if $tcount{$r} > $files; - #print "$r: @r\n"; - my $i = 0; - while (@r) { - my $v = shift @r; - $sum{$r}->{$col{$i}} += $v; # if $v > 0; - - $min{$col{$i}} = $v - if ($min{$col{$i}} > $v || !(defined $min{$col{$i}})); - $max{$col{$i}} = $v - if ($max{$col{$i}} < $v); - - $avg{$col{$i}} += $v; - $i++; - } -} - -## dump -my @c = sort {$a <=> $b} keys %col; -# cols -print join("\t",'#', map { $col{$_} } @c) . "\n"; -my $n = 0; -for my $k (sort {$a <=> $b} keys %sum) { - if ($avgrows) { - print join("\t",$k, #map int, - map { $sum{$k}->{$col{$_}}/$tcount{$k} } @c ) . "\n"; - } else { - print join("\t",$k, map { $sum{$k}->{$col{$_}} } @c ) . "\n"; - } - $n++; -} - -my $rows = $n || 1; -#my $files = $tcount{$starttime}; -my %avgval; - -## devt -#warn "rows $rows, files $files\n"; -my %avgvalvart; # std dev of each col avg, over time -for my $k (keys %avg) { - my $av = $avgval{$k} = $avg{$k} / ($rows*$files); - - my $var = 0.0; - for my $t (sort {$a <=> $b} keys %sum) { - my $a = $sum{$t}->{$k} / $files; - $var += ($a - $av) * ($a - $av); - } - - $avgvalvart{$k} = $var / $rows; -} - - - - -print "\n"; -print join("\t",'#', map { $col{$_} } @c) . "\n"; -print join("\t", '#minval', map { $min{$col{$_}} } @c ) . "\n"; -print join("\t", '#maxval', map { $max{$col{$_}} } @c ) . "\n"; -print join("\t", '#rows', map { $rows } @c) . "\n"; -print join("\t", '#files', map { $files } @c) . 
"\n"; -print join("\t", '#sum', - map { $avg{$col{$_}} } @c ) . "\n"; -print join("\t", '#avgval', #map int, - map { $avgval{$col{$_}} } @c ) . "\n"; -# map { ($rows*$files) ? ($_ / ($rows*$files)):0 } map { $avg{$col{$_}} } @c ) . "\n"; - -print join("\t", '#avgvalvart', - map { $avgvalvart{$col{$_}} } @c ) . "\n"; -print join("\t", '#avgvaldevt', - map { sqrt($_) } map { $avgvalvart{$col{$_}} } @c ) . "\n"; - -print join("\t", '#avgsum', #map int, - map { $_ / $rows } map { $avg{$col{$_}} } @c ) . "\n"; diff --git a/branches/sage/ebofs2/test/fakemds.cc b/branches/sage/ebofs2/test/fakemds.cc deleted file mode 100644 index b75b62d58152c..0000000000000 --- a/branches/sage/ebofs2/test/fakemds.cc +++ /dev/null @@ -1,104 +0,0 @@ - - -#include -#include -#include - -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "fakeclient/FakeClient.h" - -#include "mds/MDCluster.h" -#include "mds/MDCache.h" -#include "mds/MDStore.h" - -#include "msg/FakeMessenger.h" - -#include "messages/MPing.h" - -using namespace std; - -__uint64_t ino = 1; - - - -#include "config.h" -#define NUMMDS g_conf.num_mds -#define NUMOSD g_conf.num_osd -#define NUMCLIENT g_conf.num_fakeclient - -// this parses find output -int play(); - -int main(int oargc, char **oargv) { - cerr << "hi there" << endl; - - int argc; - char **argv; - parse_config_options(oargc, oargv, - argc, argv); - - MDCluster *mdc = new MDCluster(NUMMDS, NUMOSD); - - // local config settings - g_conf.num_client = g_conf.num_fakeclient; // to fool mds, hack gross - - // create osds - OSD *osd[NUMOSD]; - for (int i=0; iinit(); - } - - // create mds - MDS *mds[NUMMDS]; - for (int i=0; iinit(); - } - - - // create clients - FakeClient *client[NUMCLIENT]; - for (int i=0; iinit(); - } - - // mount clients - for (int i=0; imount(); - - // loop - fakemessenger_do_loop(); - - //mds[0]->shutdown_start(); - //fakemessenger_do_loop(); - - // - if (argc > 1 && - strcmp(argv[1], "nocheck") == 0) { - cerr << "---- nocheck" << endl; - } else { - 
cout << "---- check ----" << endl; - for (int i=0; imdcache->shutdown_pass(); - } - - // cleanup - cout << "cleanup" << endl; - for (int i=0; i - * Daniel Jönsson - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the Do What The Fuck You Want To - * Public License as published by Banlu Kemiyatorn. See - * http://sam.zoy.org/projects/COPYING.WTFPL for more details. - * - * Compilation example: - * gcc -shared -fPIC gprof-helper.c -o gprof-helper.so -lpthread -ldl - * - * Usage example: - * LD_PRELOAD=./gprof-helper.so your_program - */ - -#define _GNU_SOURCE -#include -#include -#include -#include -#include - -static void * wrapper_routine(void *); - -/* Original pthread function */ -static int (*pthread_create_orig)(pthread_t *__restrict, - __const pthread_attr_t *__restrict, - void *(*)(void *), - void *__restrict) = NULL; - -/* Library initialization function */ -void wooinit(void) __attribute__((constructor)); - -void wooinit(void) -{ - pthread_create_orig = dlsym(RTLD_NEXT, "pthread_create"); - fprintf(stderr, "pthreads: using profiling hooks for gprof\n"); - if(pthread_create_orig == NULL) - { - char *error = dlerror(); - if(error == NULL) - { - error = "pthread_create is NULL"; - } - fprintf(stderr, "%s\n", error); - exit(EXIT_FAILURE); - } -} - -/* Our data structure passed to the wrapper */ -typedef struct wrapper_s -{ - void * (*start_routine)(void *); - void * arg; - - pthread_mutex_t lock; - pthread_cond_t wait; - - struct itimerval itimer; - -} wrapper_t; - -/* The wrapper function in charge for setting the itimer value */ -static void * wrapper_routine(void * data) -{ - /* Put user data in thread-local variables */ - void * (*start_routine)(void *) = ((wrapper_t*)data)->start_routine; - void * arg = ((wrapper_t*)data)->arg; - - /* Set the profile timer value */ - setitimer(ITIMER_PROF, &((wrapper_t*)data)->itimer, NULL); - - /* Tell the calling thread that we don't need its data anymore */ - 
pthread_mutex_lock(&((wrapper_t*)data)->lock); - pthread_cond_signal(&((wrapper_t*)data)->wait); - pthread_mutex_unlock(&((wrapper_t*)data)->lock); - - /* Call the real function */ - return start_routine(arg); -} - -/* Our wrapper function for the real pthread_create() */ -int pthread_create(pthread_t *__restrict thread, - __const pthread_attr_t *__restrict attr, - void * (*start_routine)(void *), - void *__restrict arg) -{ - wrapper_t wrapper_data; - int i_return; - - /* Initialize the wrapper structure */ - wrapper_data.start_routine = start_routine; - wrapper_data.arg = arg; - getitimer(ITIMER_PROF, &wrapper_data.itimer); - pthread_cond_init(&wrapper_data.wait, NULL); - pthread_mutex_init(&wrapper_data.lock, NULL); - pthread_mutex_lock(&wrapper_data.lock); - - /* The real pthread_create call */ - i_return = pthread_create_orig(thread, - attr, - &wrapper_routine, - &wrapper_data); - - /* If the thread was successfully spawned, wait for the data - * to be released */ - if(i_return == 0) - { - pthread_cond_wait(&wrapper_data.wait, &wrapper_data.lock); - } - - pthread_mutex_unlock(&wrapper_data.lock); - pthread_mutex_destroy(&wrapper_data.lock); - pthread_cond_destroy(&wrapper_data.wait); - - return i_return; -} - diff --git a/branches/sage/ebofs2/test/makedirs.cc b/branches/sage/ebofs2/test/makedirs.cc deleted file mode 100644 index 8fd74d996ef9f..0000000000000 --- a/branches/sage/ebofs2/test/makedirs.cc +++ /dev/null @@ -1,38 +0,0 @@ -#include -#include -using namespace std; - -int make_dirs(const char *basedir, int dirs, int files, int depth) -{ - //if (time_to_stop()) return 0; - - // make sure base dir exists - int r = mkdir(basedir, 0755); - if (r != 0) { - cout << "can't make base dir? 
" << basedir << endl; - return -1; - } - - // children - char d[500]; - cout << "make_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << endl; - for (int i=0; i -#include -#include -using namespace std; - -#include "mds/MDCluster.h" -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "fakeclient/FakeClient.h" - -#include "mds/MDCache.h" -#include "mds/MDStore.h" - -#include "msg/MPIMessenger.h" -//#include "msg/CheesySerializer.h" - -#include "messages/MPing.h" - - -__uint64_t ino = 1; - - - -#include "config.h" -#define NUMMDS g_conf.num_mds -#define NUMOSD g_conf.num_osd -#define NUMCLIENT g_conf.num_client - -// this parses find output -int play(); - -int main(int argc, char **argv) { - cout << "mpitest starting" << endl; - - int myrank = mpimessenger_init(argc, argv); - int world = mpimessenger_world(); - - - - MDCluster *mdc = new MDCluster(NUMMDS, NUMOSD); - - // create osds - OSD *osd[NUMOSD]; - for (int i=0; iinit(); - } - - // create mds - MDS *mds[NUMMDS]; - for (int i=0; iinit(); - } - - // create clients - FakeClient *client[NUMCLIENT]; - for (int i=0; iset_dispatcher(serializer); - - client[i] = new FakeClient(mdc, i, real, g_conf.fakeclient_requests); - client[i]->init(); - } - - // seed initial requests - for (int i=0; iissue_request(); - } - - mpimessenger_start(); // start message loop - mpimessenger_wait(); // wait for thread to finish - mpimessenger_shutdown(); // shutdown MPI - - // - /* - cout << "---- check ----" << endl; - for (int i=0; imdcache->shutdown_pass(); - } - */ - - // cleanup - //cout << "cleanup" << endl; - for (int i=0; i -#include "mpi.h" - -#include "messages/MClientRequest.h" -#include "msg/MTMessenger.h" -#include "include/error.h" - -#define SARG_SIZE 64 -#define SERVER_RANK 0 -#define NTHREADS 11 // number of threads per rank -#define NMESSAGES 31 // number of messages per thread - -static void server_loop(MTMessenger &msgr, int world_size) -{ - // we expect this many messages from 
clients, then we quit - // (world_size-1 since server is one of the processes). - int totmsg = NTHREADS * NMESSAGES * (world_size - 1); - int nmsg = 0; - - char buf[SARG_SIZE]; - - while(nmsg < totmsg) { - MClientRequest *req = (MClientRequest*)msgr.recvreq(); - ASSERT(req->get_type() == MSG_CLIENT_REQUEST); - - //cout << "Server acknowledging " << req->get_sarg() << endl; - - sprintf(buf, "%s reply", req->get_sarg().c_str()); - MClientRequest resp(0, 0); - resp.set_sarg(buf); - msgr.sendresp(req, &resp); - - delete req; - nmsg++; - } - - cout << "Server successful" << endl; -} - -// arguments for client thread start function (see pthread_create) -struct client_arg -{ - MTMessenger *msgr; - int rank; - int thread; -}; - -static void *client_session(void *_carg) -{ - client_arg *carg = (client_arg *)_carg; - - char buf[SARG_SIZE]; - - // repeat some number (arbitrary really) of rounds - for (int i = 0; i < NMESSAGES; i++) { - - // send the message, receive the reply and check reply is as - // expected - - MClientRequest request(0, 0); - sprintf(buf, "r%d:t%d:m%d", carg->rank, carg->thread, i); - request.set_sarg(buf); - - //cout << "Client sending " << request.get_sarg() << endl; - - MClientRequest *resp = - (MClientRequest*)carg->msgr->sendrecv(&request, SERVER_RANK); - - ASSERT(resp->get_type() == MSG_CLIENT_REQUEST); - sprintf(buf, "r%d:t%d:m%d reply", carg->rank, carg->thread, i); - ASSERT(strcmp(buf, resp->get_sarg().c_str()) == 0); - - //cout << "Client verified " << resp->get_sarg() << endl; - - delete resp; - } - - cout << "Client (" << carg->rank << "," << carg->thread - << ") successful" << endl; - - delete carg; - return NULL; -} - -static void launch_clients(MTMessenger &msgr, int rank) -{ - pthread_t tid[NTHREADS]; - - // launch some number (arbitrary really) of threads - for (int i = 0; i < NTHREADS; i++) { - - client_arg *carg = (client_arg*)malloc(sizeof(client_arg)); - ASSERT(carg); - carg->msgr = &msgr; - carg->rank = rank; - carg->thread = i; - - 
if (pthread_create(&tid[i], NULL, client_session, carg) < 0) - SYSERROR(); - } - - // we must wait for all the threads to exit before returning, - // otherwise we shutdown MPI before while the threads are - // chatting. - for (int i = 0; i < NTHREADS; i++) { - void *retval; - - if (pthread_join(tid[i], &retval) < 0) - SYSERROR(); - } -} - -int main(int argc, char **argv) -{ - MTMessenger msgr(argc, argv); - - int rank; - ASSERT(MPI_Comm_rank(MPI_COMM_WORLD, &rank) == MPI_SUCCESS); - int world_size; - ASSERT(MPI_Comm_size(MPI_COMM_WORLD, &world_size) == MPI_SUCCESS); - - if (rank == SERVER_RANK) - server_loop(msgr, world_size); - else - launch_clients(msgr, rank); - - return 0; -} diff --git a/branches/sage/ebofs2/test/rushconfig b/branches/sage/ebofs2/test/rushconfig deleted file mode 100644 index 40d82702ea0a5..0000000000000 --- a/branches/sage/ebofs2/test/rushconfig +++ /dev/null @@ -1,7 +0,0 @@ -6 -8 10.0 -4 20.0 -7 30.0 -9 10.0 -8 15.0 -5 11.0 diff --git a/branches/sage/ebofs2/test/rushtest.cc b/branches/sage/ebofs2/test/rushtest.cc deleted file mode 100644 index ecff83523e0c6..0000000000000 --- a/branches/sage/ebofs2/test/rushtest.cc +++ /dev/null @@ -1,49 +0,0 @@ -// -// $Id$ -// - -#include -#include -#include "../osd/rush.h" - -main (int argc, char *argv[]) -{ - Rush rush; - char buf[200]; - int i, j, k, numClusters; - int numKeys = 5; - int numReplicas = 4; - int curSize; - double curWeight; - int servers[1000]; - - if (argc > 1) { - numKeys = atoi (argv[1]); - } - if (argc > 2) { - numReplicas = atoi (argv[2]); - } - - fgets (buf, sizeof (buf) - 2, stdin); - sscanf (buf, "%d", &numClusters); - for (i = 0; i < numClusters; i++) { - fgets (buf, sizeof (buf) - 2, stdin); - sscanf (buf, "%d %lf", &curSize, &curWeight); - rush.AddCluster (curSize, curWeight); - if (rush.Servers () < numReplicas) { - fprintf (stderr, "ERROR: must have at least %d disks in the system!\n", - rush.Clusters ()); - exit (-1); - } - for (j = 0; j < numKeys; j++) { - 
rush.GetServersByKey (j, numReplicas, servers); -#if 0 - printf ("%-3d %-6d ", i, j); - for (k = 0; k < numReplicas; k++) { - printf ("%-5d ", servers[k]); - } - putchar ('\n'); -#endif - } - } -} diff --git a/branches/sage/ebofs2/test/rushtest.cc~ b/branches/sage/ebofs2/test/rushtest.cc~ deleted file mode 100644 index 0b9512ccd0c3d..0000000000000 --- a/branches/sage/ebofs2/test/rushtest.cc~ +++ /dev/null @@ -1,49 +0,0 @@ -// -// $Id$ -// - -#include -#include -#include "rush.h" - -main (int argc, char *argv[]) -{ - Rush rush; - char buf[200]; - int i, j, k, numClusters; - int numKeys = 5; - int numReplicas = 4; - int curSize; - double curWeight; - int servers[1000]; - - if (argc > 1) { - numKeys = atoi (argv[1]); - } - if (argc > 2) { - numReplicas = atoi (argv[2]); - } - - fgets (buf, sizeof (buf) - 2, stdin); - sscanf (buf, "%d", &numClusters); - for (i = 0; i < numClusters; i++) { - fgets (buf, sizeof (buf) - 2, stdin); - sscanf (buf, "%d %lf", &curSize, &curWeight); - rush.AddCluster (curSize, curWeight); - if (rush.Servers () < numReplicas) { - fprintf (stderr, "ERROR: must have at least %d disks in the system!\n", - rush.Clusters ()); - exit (-1); - } - for (j = 0; j < numKeys; j++) { - rush.GetServersByKey (j, numReplicas, servers); -#if 0 - printf ("%-3d %-6d ", i, j); - for (k = 0; k < numReplicas; k++) { - printf ("%-5d ", servers[k]); - } - putchar ('\n'); -#endif - } - } -} diff --git a/branches/sage/ebofs2/test/test_disk_bw.cc b/branches/sage/ebofs2/test/test_disk_bw.cc deleted file mode 100644 index fc36da74fadb2..0000000000000 --- a/branches/sage/ebofs2/test/test_disk_bw.cc +++ /dev/null @@ -1,59 +0,0 @@ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common/Clock.h" - -#include -using namespace std; - -int main(int argc, char **argv) -{ - void *buf; - int fd, count, loop = 0, ret; - - if (argc != 4) { - fprintf(stderr, "Usage: %s device bsize count\n", argv[0]); - exit (0); - } - - 
int bsize = atoi(argv[2]); - count = atoi(argv[3]); - - posix_memalign(&buf, sysconf(_SC_PAGESIZE), bsize); - - //if ((fd = open(argv[1], O_SYNC|O_RDWR)) < 0) { - if ((fd = open(argv[1], O_DIRECT|O_RDWR)) < 0) { - - fprintf(stderr, "Can't open device %s\n", argv[1]); - exit (4); - } - - - utime_t start = g_clock.now(); - while (loop++ < count) { - ret = ::write(fd, buf, bsize); - //if ((loop % 100) == 0) - //fprintf(stderr, "."); - } - ::fsync(fd); - ::close(fd); - utime_t end = g_clock.now(); - end -= start; - - - char hostname[80]; - gethostname(hostname, 80); - - double mb = bsize*count/1024/1024; - - cout << hostname << "\t" << mb << " MB\t" << end << " seconds\t" << (mb / (double)end) << " MB/sec" << std::endl; -} diff --git a/branches/sage/ebofs2/test/test_seek_read.c b/branches/sage/ebofs2/test/test_seek_read.c deleted file mode 100644 index 988ff1dcec88d..0000000000000 --- a/branches/sage/ebofs2/test/test_seek_read.c +++ /dev/null @@ -1,53 +0,0 @@ -#include "include/types.h" -#include "common/Clock.h" - -#include -#include -#include -#include -#include -#include - -int main(int argc, char **argv) -{ - char *fn = argv[1]; - uint64_t numblocks = atoll(argv[2]) / 4096; - int count = 400; - - cout << "fn " << fn << endl; - cout << "numblocks " << numblocks << endl; - - int blocks = 1; - while (blocks <= 1024) { - int fd = ::open(fn, O_RDWR|O_DIRECT);//|O_SYNC|O_DIRECT); - if (fd < 0) return 1; - //cout << "fd is " << fd << endl; - - void *buf; - ::posix_memalign(&buf, 4096, 4096*blocks); - - int s = blocks*4096; - - utime_t start = g_clock.now(); - for (int i=0; i -#include -using namespace std; - - -ostream& operator<<(ostream& out, vector& v) -{ - out << "["; - for (int i=0; i disks; - for (int i=0; i<20; i++) - disks.push_back(i); - - - /* - UniformBucket ub(1, 1, 0, 10, disks); - ub.make_primes(h); - cout << "primes are " << ub.primes << endl; - */ - - MixedBucket mb(2, 1); - for (int i=0;i<20;i++) - mb.add_item(i, 10); - - /* - MixedBucket b(3, 1); - 
b.add_item(1, ub.get_weight()); - b.add_item(2, mb.get_weight()); - */ - MixedBucket b= mb; - - vector ocount(disks.size()); - int numrep = 3; - - vector v(numrep); - for (int x=1; x<1000000; x++) { - //cout << H(x) << "\t" << h(x) << endl; - for (int i=0; i -using namespace std; - -#include "include/bufferlist.h" - - -int main() -{ - - bufferptr p1 = new buffer("123456",6); - bufferptr p2 = p1; - - cout << "it is '" << p1.c_str() << "'" << endl; - - bufferptr p3 = new buffer("abcdef",6); - - cout << "p3 is " << p3 << endl; - - bufferlist bl; - bl.push_back(p2); - bl.push_back(p1); - bl.push_back(p3); - - cout << "bl is " << bl << endl; - - cout << "len is " << bl.length() << endl; - - bufferlist took; - bl.splice(10,4,&took); - - cout << "took out " << took << "leftover is " << bl << endl; - //cout << "len is " << bl.length() << endl; - - bufferlist bl2; - bl2.substr_of(bl, 3, 5); - cout << "bl2 is " << bl2 << endl; - - -} diff --git a/branches/sage/ebofs2/test/testcounter.cc b/branches/sage/ebofs2/test/testcounter.cc deleted file mode 100644 index a3194489e4886..0000000000000 --- a/branches/sage/ebofs2/test/testcounter.cc +++ /dev/null @@ -1,70 +0,0 @@ - -#include "common/DecayCounter.h" - -#include -using namespace std; - -struct RealCounter { -public: - list hits; - - void hit(int ms) { - hits.push_back(ms); - } - - int get(double hl, int now) { - trim(now-hl); - return hits.size(); - } - - void trim(int to) { - while (!hits.empty() && - hits.front() < to) - hits.pop_front(); - } - - -}; - -int main(int argc, char **argv) -{ - int target; - double hl = atof(argv[1]); - cerr << "halflife " << hl << endl; - - DecayCounter dc(hl); - RealCounter rc; - - utime_t now = g_clock.now(); - - for (int ms=0; ms < 300*1000; ms++) { - if (ms % 30000 == 0) { - target = 1 + (rand() % 10) * 10; - if (ms > 200000) target = 0; - } - - if (target && - (rand() % (1000/target) == 0)) { - dc.hit(); - rc.hit(ms); - } - - if (ms % 500 == 0) dc.get(now); - if (ms % 100 == 0) { - 
//dc.get(now); - DecayCounter o = dc; - cout << ms << "\t" - << target*hl << "\t" - << rc.get(hl*1000, ms) << "\t" - << o.get(now) << "\t" - << dc.val << "\t" - // << dc.delta << "\t" - << o.get_last_vel() << "\t" - << o.get_last() + o.get_last_vel() << "\t" - << endl; - } - - now += .001; - } - -} diff --git a/branches/sage/ebofs2/test/testcrush.cc b/branches/sage/ebofs2/test/testcrush.cc deleted file mode 100644 index bd432b23ee95c..0000000000000 --- a/branches/sage/ebofs2/test/testcrush.cc +++ /dev/null @@ -1,266 +0,0 @@ - - -#include "../crush/crush.h" -using namespace crush; - -#include - -#include -#include -using namespace std; - -/* -ostream& operator<<(ostream& out, vector& v) -{ - out << "["; - for (int i=0; i& d) -{ - d.clear(); - while (n) { - d.push_back(no); - no++; - n--; - } -} - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks, int& nbuckets) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - MixedBucket *b = new MixedBucket(nbuckets--, h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks, int& nbuckets) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks, nbuckets); - return b->get_id(); -} - - - -int main() -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - vector disks; - int root = -1; - int nbuckets = -1; - int ndisks = 0; - - if (0) { - make_disks(12, ndisks, disks); - UniformBucket ub1(-1, 1, 0, 30, disks); - ub1.make_primes(h); - cout << "ub1 primes are " << ub1.primes << endl; - c.add_bucket(&ub1); - - make_disks(17, ndisks, disks); - UniformBucket ub2(-2, 1, 0, 30, disks); - ub2.make_primes(h); - cout << "ub2 primes are " << ub2.primes << endl; - c.add_bucket(&ub2); 
- - make_disks(4, ndisks, disks); - UniformBucket ub3(-3, 1, 0, 30, disks); - ub3.make_primes(h); - cout << "ub3 primes are " << ub3.primes << endl; - c.add_bucket(&ub3); - - make_disks(20, ndisks, disks); - MixedBucket umb1(-4, 1); - for (int i=0; i<20; i++) - umb1.add_item(disks[i], 30); - c.add_bucket(&umb1); - - MixedBucket b(-100, 1); - //b.add_item(-2, ub1.get_weight()); - b.add_item(-4, umb1.get_weight()); - //b.add_item(-2, ub2.get_weight()); - //b.add_item(-3, ub3.get_weight()); - } - - if (0) { - int bucket = -1; - MixedBucket *root = new MixedBucket(bucket--, 2); - - for (int i=0; i<5; i++) { - MixedBucket *b = new MixedBucket(bucket--, 1); - - int n = 5; - - if (1) { - // add n buckets of n disks - for (int j=0; jadd_item(disks[k], 10); - - //b->add_item(disks[j], 10); - c.add_bucket(d); - b->add_item(d->get_id(), d->get_weight()); - } - - c.add_bucket(b); - root->add_item(b->get_id(), b->get_weight()); - } else { - // add n*n disks - make_disks(n*n, ndisks, disks); - for (int k=0; kadd_item(disks[k], 10); - - c.add_bucket(b); - root->add_item(b->get_id(), b->get_weight()); - } - } - - c.add_bucket(root); - } - - - if (1) { - vector wid; - for (int d=0; d<5; d++) - wid.push_back(10); - root = make_hierarchy(c, wid, ndisks, nbuckets); - } - - - - // rule - int numrep = 1; - - Rule rule; - if (0) { - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, -100)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - } - if (1) { - /* - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, -4)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 2, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - */ - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 1, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - } - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - cout << ndisks << " disks, " << 1-nbuckets << " buckets" << endl; - 
cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - - - int place = 1000000; - int times = place / numpg; - if (!times) times = 1; - - cout << "looping " << times << " times" << endl; - - float tvar = 0; - int tvarnum = 0; - - int x = 0; - for (int t=0; t v(numrep); - - for (int z=0; z -using namespace std; - -int print(string s) { - filepath fp = s; - cout << "s = " << s << " filepath = " << fp << endl; - cout << " depth " << fp.depth() << endl; - for (int i=0; i -#include -#include -using namespace std; - -#include "config.h" -#include "messages/MPing.h" -#include "common/Mutex.h" - -#include "msg/MPIMessenger.h" - -class Pinger : public Dispatcher { -public: - Messenger *messenger; - Pinger(Messenger *m) : messenger(m) { - m->set_dispatcher(this); - } - void dispatch(Message *m) { - //dout(1) << "got incoming " << m << endl; - delete m; - - } -}; - -int main(int argc, char **argv) { - int num = 1000; - - int myrank = mpimessenger_init(argc, argv); - int world = mpimessenger_world(); - - Pinger *p = new Pinger( new MPIMessenger(myrank) ); - - mpimessenger_start(); - - //while (1) { - for (int i=0; i<10000; i++) { - - // ping random nodes - int d = rand() % world; - if (d != myrank) { - //cout << "sending " << i << " to " << d << endl; - p->messenger->send_message(new MPing(), d); - } - - } - - - //cout << "shutting down" << endl; - //p->messenger->shutdown(); - - mpimessenger_wait(); - mpimessenger_shutdown(); // shutdown MPI -} diff --git a/branches/sage/ebofs2/test/testnewbuffers.cc b/branches/sage/ebofs2/test/testnewbuffers.cc deleted file mode 100644 index 0fea7571a4572..0000000000000 --- a/branches/sage/ebofs2/test/testnewbuffers.cc +++ /dev/null @@ -1,91 +0,0 @@ - -#include -#include -using namespace std; - - -#include "include/newbuffer.h" -//#include "include/bufferlist.h" - -#include "common/Thread.h" - - - class Th : public Thread { - public: - bufferlist bl; - Th(bufferlist& 
o) : bl(o) { } - - void *entry() { - //cout << "start" << endl; - // thrash it a bit. - for (int n=0; n<10000; n++) { - bufferlist bl2; - unsigned off = rand() % (bl.length() -1); - unsigned len = 1 + rand() % (bl.length() - off - 1); - bl2.substr_of(bl, off, len); - bufferlist bl3; - bl3.append(bl); - bl3.append(bl2); - //cout << bl3 << endl; - bl2.clear(); - bl3.clear(); - } - //cout << "end" << endl; - } - }; - -int main() -{ - - bufferptr p1 = buffer::copy("123456",7); - //bufferptr p1 = new buffer("123456",7); - bufferptr p2 = p1; - - cout << "p1 is '" << p1.c_str() << "'" << " " << p1 << endl; - cout << "p2 is '" << p2.c_str() << "'" << " " << p2 << endl; - - bufferptr p3 = buffer::copy("abcdef",7); - //bufferptr p3 = new buffer("abcdef",7); - - cout << "p3 is " << p3.c_str() << " " << p3 << endl; - - bufferlist bl; - bl.push_back(p2); - bl.push_back(p1); - bl.push_back(p3); - - cout << "bl is " << bl << endl; - - bufferlist took; - bl.splice(10,4,&took); - - cout << "took out " << took << ", leftover is " << bl << endl; - //cout << "len is " << bl.length() << endl; - - bufferlist bl2; - bl2.substr_of(bl, 3, 5); - cout << "bl2 is " << bl2 << endl; - - - cout << "bl before " << bl << endl; - - list ls; - for (int t=0; t<40; t++) { - Th *t = new Th(bl); - cout << "create" << endl; - t->create(); - ls.push_back(t); - } - - bl.clear(); - - while (!ls.empty()) { - cout << "join" << endl; - ls.front()->join(); - delete ls.front(); - ls.pop_front(); - } - - cout << "bl after " << bl << endl; - -} diff --git a/branches/sage/ebofs2/test/testos.cc b/branches/sage/ebofs2/test/testos.cc deleted file mode 100644 index 24c81590d899c..0000000000000 --- a/branches/sage/ebofs2/test/testos.cc +++ /dev/null @@ -1,343 +0,0 @@ -/* testos.cc -- simple ObjectStore test harness. 
- Copyright (C) 2007 Casey Marshall - -Ceph - scalable distributed file system - -This is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License version 2.1, as published by the Free Software -Foundation. See file COPYING. */ - - -#include "osd/ObjectStore.h" -#include "ebofs/Ebofs.h" -#include "osbdb/OSBDB.h" -#include "include/buffer.h" - -#include -#include -#include - -#include -#include - -using namespace std; - -static inline unsigned long long -to_usec (struct timeval &time) -{ - return (((unsigned long long) time.tv_sec * 1000000) - + ((unsigned long long) time.tv_usec)); -} - -static inline unsigned long long -to_msec (struct timeval &time) -{ - return (((unsigned long long) time.tv_sec * 1000) - + ((unsigned long long) time.tv_usec / 1000)); -} - -int main (int argc, char **argv) -{ - vector args; - char *osd_name = "ebofs"; - unsigned object_size = 1024; - unsigned object_count = 1024; - unsigned write_iter = 64; - unsigned random_seed = ::time(NULL); - char *device = "/tmp/testos"; - char *mountcmd = "mount /tmp/testos"; - char *umountcmd = "umount /tmp/testos"; - - bool ebofs_raw_device = false; - bool inhibit_remount = (getenv("TESTOS_INHIBIT_REMOUNT") != NULL); - - if (argc > 1 - && (strcmp (argv[1], "-h") == 0 - || strcmp (argv[1], "-help") == 0 - || strcmp (argv[1], "--help") == 0)) - { - cout << "usage: " << argv[0] << " [store [object-size [object-count [iterations [seed]]]]]" << endl; - cout << endl; - cout << "Where the arguments are:" << endl << endl; - cout << " store -- store type; default \"ebofs\"" << endl; - cout << " object-size -- size of objects; default 1024" << endl; - cout << " object-count -- number of objects to write; default 1024" - << endl; - cout << " iterations -- write the objects that many times; default 5" - << endl; - cout << " seed -- random seed; default current time" << endl; - exit (0); - } - - argv_to_vec (argc, argv, args); - for (vector::iterator it = 
args.begin(); it != args.end(); - it++) - cout << *it << " "; - cout << endl; - parse_config_options (args); - for (vector::iterator it = args.begin(); it != args.end(); - it++) - cout << *it << " "; - cout << endl; - - argc = args.size(); - if (argc > 0) - osd_name = args[0]; - if (argc > 1) - object_size = (unsigned) atol (args[1]); - if (argc > 2) - object_count = (unsigned) atol (args[2]); - if (argc > 3) - write_iter = (unsigned) atol (args[3]); - if (argc > 4) - random_seed = (unsigned) atol (args[4]); - - // algin object size to 'long' - object_size = ((object_size + (sizeof (long) - 1)) / sizeof (long)) * sizeof (long); - - char *osd_file = new char[32]; - strcpy (osd_file, "/tmp/testos/testos.XXXXXX"); - mktemp (osd_file); - - if (strcasecmp (osd_name, "ebofs") == 0) - { - char *dev_env = getenv ("TESTOS_EBOFS_DEV"); - if (dev_env != NULL) - { - // Assume it is a true device. - strncpy (osd_file, dev_env, 32); - inhibit_remount = true; - ebofs_raw_device = true; - } - } - - if (!inhibit_remount) - { - if (system (mountcmd) != 0) - { - cerr << "mount failed" << endl; - exit (1); - } - } - - ObjectStore *os = NULL; - if (strcasecmp (osd_name, "ebofs") == 0) - { - if (!ebofs_raw_device) - { - FILE *f = fopen (osd_file, "w"); - if (f == NULL) - { - cerr << "failed to open " << osd_file << ": " << strerror (errno) - << endl; - exit (1); - } - // 1G file. 
- fseek (f, 1024 * 1024 * 1024, SEEK_SET); - fputc ('\0', f); - fclose (f); - } - os = new Ebofs (osd_file); - } - else if (strcasecmp (osd_name, "osbdb") == 0) - { - os = new OSBDB (osd_file); - } - else if (strcasecmp (osd_name, "osbdb-btree") == 0) - { - g_conf.bdbstore_btree = true; - os = new OSBDB (osd_file); - } - else - { - cerr << "I don't know about object store \"" << osd_name << "\"" - << endl; - exit (1); - } - - cout << "Writing " << object_count << " objects of size " - << object_size << " to " << osd_name << endl; - - char *val = (char *) malloc (object_size); - char *val2 = (char *) malloc (object_size); - auto_ptr valptr (val); - auto_ptr valptr2(val2); - if (getenv ("TESTOS_UNALIGNED") != NULL) - { - val = val + 1; - val2 = val2 + 1; - } - - for (unsigned i = 0; i < object_size; i++) - { - val[i] = (char) i; - val2[i] = (char) i; - } - object_t *oids = new object_t[object_count]; - - utime_t writes[write_iter]; - utime_t total_write; - utime_t reads[write_iter]; - utime_t total_read; - for (unsigned i = 0; i < write_iter; i++) - { - cerr << "Iteration " << i << endl; - - int ret = os->mkfs(); - if (ret != 0) - { - cerr << "mkfs(" << osd_file << "): " << strerror (-ret) << endl; - exit (1); - } - ret = os->mount(); - if (ret != 0) - { - cerr << "mount(): " << strerror (-ret) << endl; - exit (1); - } - - srandom (random_seed + i); - - for (unsigned j = 0; j < object_count; j++) - { - oids[j].ino = (uint64_t) random() << 32 | random(); - oids[j].bno = random(); - } - - utime_t begin = g_clock.now(); - for (unsigned o = 0; o < object_count; o++) - { - bufferptr bp (val, object_size); - bufferlist bl; - bl.push_back (bp); - int ret; - if ((ret = os->write (oids[o], 0L, object_size, bl, NULL)) < 0) - cerr << "write " << oids[o] << " failed: " - << strerror (-ret) << endl; - } - os->sync(); - - utime_t end = g_clock.now() - begin; - - cerr << "Write finished in " << end << endl; - total_write += end; - writes[i] = end; - - os->umount(); - sync(); - - if 
(!inhibit_remount) - { - if (system (umountcmd) != 0) - { - cerr << "umount failed" << endl; - exit (1); - } - - if (system (mountcmd) != 0) - { - cerr << "mount(2) failed" << endl; - exit (1); - } - } - - os->mount(); - - // Shuffle the OIDs. - for (int j = 0; j < object_count; j++) - { - int x = random() % object_count; - if (x < 0) - x = -x; - object_t o = oids[j]; - oids[j] = oids[x]; - oids[x] = o; - } - - begin = g_clock.now(); - for (unsigned o = 0; o < object_count; o++) - { - bufferptr bp (val2, object_size); - bufferlist bl; - bl.push_back (bp); - - if (os->read (oids[o], 0L, object_size, bl) < 0) - { - cerr << "object " << oids[o] << " not found!" << endl; - } - } - end = g_clock.now() - begin; - - cerr << "Read finished in " << end << endl; - total_read += end; - reads[i] = end; - - os->umount(); - sync(); - - if (!inhibit_remount) - { - if (system (umountcmd) != 0) - { - cerr << "umount(2) failed" << endl; - exit (1); - } - - if (system (mountcmd) != 0) - { - cerr << "mount(3) failed" << endl; - exit (1); - } - } - } - - cerr << "Finished in " << (total_write + total_read) << endl; - - double write_mean = ((double) total_write) / ((double) write_iter); - double write_sd = 0.0; - for (unsigned i = 0; i < write_iter; i++) - { - double x = ((double) writes[i]) - write_mean; - write_sd += x * x; - } - write_sd = sqrt (write_sd / ((double) write_iter)); - - double read_mean = ((double) total_read) / ((double) write_iter); - double read_sd = 0.0; - for (unsigned i = 0; i < write_iter; i++) - { - double x = ((double) reads[i]) - read_mean; - write_sd += x * x; - } - read_sd = sqrt (read_sd / ((double) write_iter)); - - cout << "TESTOS: write " << osd_name << ":" << object_size << ":" - << object_count << ":" << write_iter << ":" << random_seed - << " -- " << write_mean << " " << write_sd << endl; - - cout << "TESTOS: write.raw -- "; - for (int i = 0; i < write_iter; i++) - cout << ((double) writes[i]) << " "; - cout << endl; - - cout << "TESTOS: read " << 
osd_name << ":" << object_size << ":" - << object_count << ":" << write_iter << ":" << random_seed - << " -- " << read_mean << " " << read_sd << endl; - - cout << "TESTOS: read.raw -- "; - for (int i = 0; i < write_iter; i++) - cout << ((double) reads[i]) << " "; - cout << endl; - - unlink (osd_file); - if (!inhibit_remount) - { - if (system (umountcmd) != 0) - { - cerr << "umount(3) failed" << endl; - exit (1); - } - } - exit (0); -} diff --git a/branches/sage/ebofs2/test/testosbdb.cc b/branches/sage/ebofs2/test/testosbdb.cc deleted file mode 100644 index 19268e7587531..0000000000000 --- a/branches/sage/ebofs2/test/testosbdb.cc +++ /dev/null @@ -1,347 +0,0 @@ -/* testosbdb.cc -- test OSBDB. - Copyright (C) 2007 Casey Marshall */ - - -#include -#include "osbdb/OSBDB.h" - -using namespace std; - -int -main (int argc, char **argv) -{ - vector args; - argv_to_vec (argc, argv, args); - parse_config_options (args); - - g_conf.debug_bdbstore = 10; - //g_conf.bdbstore_btree = true; - char dbfile[256]; - strncpy (dbfile, "/tmp/testosbdb/db.XXXXXX", 256); - mktemp (dbfile); - OSBDB *os = new OSBDB(dbfile); - auto_ptr osPtr (os); - os->mkfs(); - os->mount(); - - // Put an object. - object_t oid (0xDEADBEEF00000000ULL, 0xFEEDFACE); - - cout << "sizeof oid_t is " << sizeof (oid_t) << endl; - cout << "offsetof oid_t.id " << offsetof (oid_t, id) << endl; - - cout << sizeof (object_t) << endl; - cout << sizeof (oid.ino) << endl; - cout << sizeof (oid.bno) << endl; - cout << sizeof (oid.rev) << endl; - - // Shouldn't be there. - if (os->exists (oid)) - { - cout << "FAIL: oid shouldn't be there " << oid << endl; - } - - // Write an object. - char *x = (char *) malloc (1024); - memset(x, 0xaa, 1024); - bufferptr bp (x, 1024); - bufferlist bl; - bl.push_back (bp); - - if (os->write (oid, 0L, 1024, bl, NULL) != 1024) - { - cout << "FAIL: writing object" << endl; - } - - os->sync(); - - // Should be there. 
- if (!os->exists (oid)) - { - cout << "FAIL: oid should be there: " << oid << endl; - } - - memset(x, 0, 1024); - if (os->read (oid, 0, 1024, bl) != 1024) - { - cout << "FAIL: reading object" << endl; - } - - for (int i = 0; i < 1024; i++) - { - if ((x[i] & 0xFF) != 0xaa) - { - cout << "FAIL: data read out is different" << endl; - break; - } - } - - // Set some attributes - if (os->setattr (oid, "alpha", "value", strlen ("value")) != 0) - { - cout << "FAIL: set attribute" << endl; - } - if (os->setattr (oid, "beta", "value", strlen ("value")) != 0) - { - cout << "FAIL: set attribute" << endl; - } - if (os->setattr (oid, "gamma", "value", strlen ("value")) != 0) - { - cout << "FAIL: set attribute" << endl; - } - if (os->setattr (oid, "fred", "value", strlen ("value")) != 0) - { - cout << "FAIL: set attribute" << endl; - } - - char *attrs = (char *) malloc (1024); - if (os->listattr (oid, attrs, 1024) != 0) - { - cout << "FAIL: listing attributes" << endl; - } - else - { - char *p = attrs; - if (strcmp (p, "alpha") != 0) - { - cout << "FAIL: should be \"alpha:\" \"" << p << "\"" << endl; - } - p = p + strlen (p) + 1; - if (strcmp (p, "beta") != 0) - { - cout << "FAIL: should be \"beta:\" \"" << p << "\"" << endl; - } - p = p + strlen (p) + 1; - if (strcmp (p, "fred") != 0) - { - cout << "FAIL: should be \"fred:\" \"" << p << "\"" << endl; - } - p = p + strlen (p) + 1; - if (strcmp (p, "gamma") != 0) - { - cout << "FAIL: should be \"gamma:\" \"" << p << "\"" << endl; - } - } - - char attrvalue[256]; - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "alpha", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr alpha" << endl; - } - else if (strncmp ("value", attrvalue, strlen("value")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "fred", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr fred" << endl; - } - else if (strncmp ("value", 
attrvalue, strlen("value")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "beta", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr beta" << endl; - } - else if (strncmp ("value", attrvalue, strlen("value")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "gamma", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr gamma" << endl; - } - else if (strncmp ("value", attrvalue, strlen("value")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - - if (os->setattr (oid, "alpha", "different", strlen("different")) != 0) - cout << "FAIL: setattr overwrite" << endl; - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "alpha", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr alpha" << endl; - } - else if (strncmp ("different", attrvalue, strlen("different")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - - if (os->rmattr (oid, "alpha") != 0) - { - cout << "FAIL: rmattr alpha" << endl; - } - if (os->rmattr (oid, "fred") != 0) - { - cout << "FAIL: rmattr fred" << endl; - } - if (os->rmattr (oid, "beta") != 0) - { - cout << "FAIL: rmattr beta" << endl; - } - if (os->rmattr (oid, "gamma") != 0) - { - cout << "FAIL: rmattr gamma" << endl; - } - - coll_t cid = 0xCAFEBABE; - if (os->create_collection (cid) != 0) - { - cout << "FAIL: create_collection" << endl; - } - if (os->create_collection (cid + 10) != 0) - { - cout << "FAIL: create_collection" << endl; - } - if (os->create_collection (cid + 5) != 0) - { - cout << "FAIL: create_collection" << endl; - } - if (os->create_collection (42) != 0) - { - cout << "FAIL: create_collection" << endl; - } - - if (os->collection_add (cid, oid) != 0) - { - cout << "FAIL: collection_add" << endl; - } - - list ls; - if (os->list_collections (ls) < 0) - { - cout << 
"FAIL: list_collections" << endl; - } - cout << "collections: "; - for (list::iterator it = ls.begin(); it != ls.end(); it++) - { - cout << *it << ", "; - } - cout << endl; - - if (os->destroy_collection (0xCAFEBABE + 10) != 0) - { - cout << "FAIL: destroy_collection" << endl; - } - - if (os->destroy_collection (0xCAFEBADE + 10) == 0) - { - cout << "FAIL: destroy_collection" << endl; - } - - object_t oid2 (12345, 12345); - for (int i = 0; i < 8; i++) - { - oid2.rev++; - if (os->collection_add (cid, oid2) != 0) - { - cout << "FAIL: collection_add" << endl; - } - } - for (int i = 0; i < 8; i++) - { - if (os->collection_remove (cid, oid2) != 0) - { - cout << "FAIL: collection_remove" << endl; - } - oid2.rev--; - } - - if (os->collection_setattr (cid, "alpha", "value", 5) != 0) - cout << "FAIL: collection_setattr" << endl; - if (os->collection_setattr (cid, "beta", "value", 5) != 0) - cout << "FAIL: collection_setattr" << endl; - if (os->collection_setattr (cid, "gamma", "value", 5) != 0) - cout << "FAIL: collection_setattr" << endl; - if (os->collection_setattr (cid, "fred", "value", 5) != 0) - cout << "FAIL: collection_setattr" << endl; - - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "alpha", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "beta", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "gamma", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << 
endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "fred", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - - if (os->collection_setattr (cid, "alpha", "eulavvalue", 10) != 0) - cout << "FAIL: collection setattr overwrite" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "alpha", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "eulavvalue", 10) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "beta", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "gamma", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "fred", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - - if (os->collection_rmattr (cid, "alpha") != 0) - cout << "FAIL: collection_rmattr" << endl; - if (os->collection_rmattr (cid, "fred") != 0) - cout << "FAIL: collection_rmattr" << endl; - if (os->collection_rmattr (cid, "beta") != 0) - cout << "FAIL: collection_rmattr" << endl; - if (os->collection_rmattr (cid, "gamma") != 0) - cout << "FAIL: collection_rmattr" << endl; - - if (os->collection_rmattr (cid, "alpha") == 0) - cout << "FAIL: 
collection_rmattr (nonexistent)" << endl; - - // Truncate the object. - if (os->truncate (oid, 512, NULL) != 0) - { - cout << "FAIL: truncate" << endl; - } - - // Expand the object. - if (os->truncate (oid, 1200, NULL) != 0) - { - cout << "FAIL: expand" << endl; - } - - // Delete the object. - if (os->remove (oid) != 0) - { - cout << "FAIL: could not remove object" << endl; - } - - // Shouldn't be there - if (os->exists (oid)) - { - cout << "FAIL: should not be there" << endl; - } - - os->sync(); - exit (0); -} diff --git a/branches/sage/ebofs2/test/testtree.cc b/branches/sage/ebofs2/test/testtree.cc deleted file mode 100644 index 2c21bcbe52e25..0000000000000 --- a/branches/sage/ebofs2/test/testtree.cc +++ /dev/null @@ -1,46 +0,0 @@ - - -#include "../crush/BinaryTree.h" -using namespace crush; - -#include -#include -using namespace std; - -int main() -{ - BinaryTree t; - - vector nodes; - - for (int i=0; i<30; i++) { - cout << "adding " << i << endl; - int n = t.add_node(1); - nodes.push_back(n); - //cout << t << endl; - } - cout << t << endl; - - for (int k=0; k<10000; k++) { - if (rand() % 2) { - cout << "adding" << endl; - nodes.push_back( t.add_node(1) ); - } else { - if (!nodes.empty()) { - //for (int i=0; i -using namespace std; - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -int main(int argc, char**argv) -{ - int a = 1; - int b = 2; - - mknod("test", 0600, 0); - - cout << "setxattr " << setxattr("test", "asdf", &a, sizeof(a), 0) << endl; - cout << "errno " << errno << " " << strerror(errno) << endl; - cout << "getxattr " << getxattr("test", "asdf", &b, sizeof(b)) << endl; - cout << "errno " << errno << " " << strerror(errno) << endl; - cout << "a is " << a << " and b is " << b << endl; - return 0; -} diff --git a/branches/sage/ebofs2/valgrind.supp b/branches/sage/ebofs2/valgrind.supp deleted file mode 100644 index 356df039050c4..0000000000000 --- a/branches/sage/ebofs2/valgrind.supp +++ 
/dev/null @@ -1,62 +0,0 @@ -# some valgrind suppressions -# to load these automagically, -# cat > ~/.valgrindrc -# --suppressions=valgrind.supp -# - - -# this one makes valgrind shut up about what appears to be a bug in libc's writev. -{ - writev uninit bytes thing -sage - Memcheck:Param - writev(vector[...]) - fun:writev - fun:_ZN11BlockDevice6_writeEijjRN6buffer4listE - fun:_ZN11BlockDevice5do_ioEiRSt4listIPNS_6biovecESaIS2_EE - fun:_ZN11BlockDevice15io_thread_entryEv - fun:_ZN11BlockDevice8IOThread5entryEv - fun:_ZN6Thread11_entry_funcEPv - fun:start_thread - fun:clone - obj:* - obj:* - obj:* - obj:* -} - -# gethostbyname -{ - gethostbyname on issdm - Memcheck:Param - socketcall.sendto(msg) - fun:send - fun:get_mapping - fun:__nscd_get_map_ref - fun:nscd_gethst_r - fun:__nscd_gethostbyname_r - fun:gethostbyname_r@@GLIBC_2.2.5 - fun:gethostbyname - fun:_ZN4Rank8Accepter5startEv - fun:_ZN4Rank10start_rankEv - fun:main -} - -# gethostbyname - -{ - gethostbyname on foil - Memcheck:Addr8 - obj:/lib/ld-2.6.1.so - obj:/lib/ld-2.6.1.so - obj:/lib/ld-2.6.1.so - obj:/lib/ld-2.6.1.so - obj:/lib/ld-2.6.1.so - obj:/lib/ld-2.6.1.so - obj:/lib/ld-2.6.1.so - obj:/lib/libc-2.6.1.so - obj:/lib/ld-2.6.1.so - fun:__libc_dlopen_mode - fun:__nss_lookup_function - obj:/lib/libc-2.6.1.so -} - diff --git a/branches/sage/mds/COPYING b/branches/sage/mds/COPYING deleted file mode 100644 index 5ab7695ab8cab..0000000000000 --- a/branches/sage/mds/COPYING +++ /dev/null @@ -1,504 +0,0 @@ - GNU LESSER GENERAL PUBLIC LICENSE - Version 2.1, February 1999 - - Copyright (C) 1991, 1999 Free Software Foundation, Inc. - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - -[This is the first released version of the Lesser GPL. It also counts - as the successor of the GNU Library Public License, version 2, hence - the version number 2.1.] 
- - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -Licenses are intended to guarantee your freedom to share and change -free software--to make sure the software is free for all its users. - - This license, the Lesser General Public License, applies to some -specially designated software packages--typically libraries--of the -Free Software Foundation and other authors who decide to use it. You -can use it too, but we suggest you first think carefully about whether -this license or the ordinary General Public License is the better -strategy to use in any particular case, based on the explanations below. - - When we speak of free software, we are referring to freedom of use, -not price. Our General Public Licenses are designed to make sure that -you have the freedom to distribute copies of free software (and charge -for this service if you wish); that you receive source code or can get -it if you want it; that you can change the software and use pieces of -it in new free programs; and that you are informed that you can do -these things. - - To protect your rights, we need to make restrictions that forbid -distributors to deny you these rights or to ask you to surrender these -rights. These restrictions translate to certain responsibilities for -you if you distribute copies of the library or if you modify it. - - For example, if you distribute copies of the library, whether gratis -or for a fee, you must give the recipients all the rights that we gave -you. You must make sure that they, too, receive or can get the source -code. If you link other code with the library, you must provide -complete object files to the recipients, so that they can relink them -with the library after making changes to the library and recompiling -it. And you must show them these terms so they know their rights. 
- - We protect your rights with a two-step method: (1) we copyright the -library, and (2) we offer you this license, which gives you legal -permission to copy, distribute and/or modify the library. - - To protect each distributor, we want to make it very clear that -there is no warranty for the free library. Also, if the library is -modified by someone else and passed on, the recipients should know -that what they have is not the original version, so that the original -author's reputation will not be affected by problems that might be -introduced by others. - - Finally, software patents pose a constant threat to the existence of -any free program. We wish to make sure that a company cannot -effectively restrict the users of a free program by obtaining a -restrictive license from a patent holder. Therefore, we insist that -any patent license obtained for a version of the library must be -consistent with the full freedom of use specified in this license. - - Most GNU software, including some libraries, is covered by the -ordinary GNU General Public License. This license, the GNU Lesser -General Public License, applies to certain designated libraries, and -is quite different from the ordinary General Public License. We use -this license for certain libraries in order to permit linking those -libraries into non-free programs. - - When a program is linked with a library, whether statically or using -a shared library, the combination of the two is legally speaking a -combined work, a derivative of the original library. The ordinary -General Public License therefore permits such linking only if the -entire combination fits its criteria of freedom. The Lesser General -Public License permits more lax criteria for linking other code with -the library. - - We call this license the "Lesser" General Public License because it -does Less to protect the user's freedom than the ordinary General -Public License. 
It also provides other free software developers Less -of an advantage over competing non-free programs. These disadvantages -are the reason we use the ordinary General Public License for many -libraries. However, the Lesser license provides advantages in certain -special circumstances. - - For example, on rare occasions, there may be a special need to -encourage the widest possible use of a certain library, so that it becomes -a de-facto standard. To achieve this, non-free programs must be -allowed to use the library. A more frequent case is that a free -library does the same job as widely used non-free libraries. In this -case, there is little to gain by limiting the free library to free -software only, so we use the Lesser General Public License. - - In other cases, permission to use a particular library in non-free -programs enables a greater number of people to use a large body of -free software. For example, permission to use the GNU C Library in -non-free programs enables many more people to use the whole GNU -operating system, as well as its variant, the GNU/Linux operating -system. - - Although the Lesser General Public License is Less protective of the -users' freedom, it does ensure that the user of a program that is -linked with the Library has the freedom and the wherewithal to run -that program using a modified version of the Library. - - The precise terms and conditions for copying, distribution and -modification follow. Pay close attention to the difference between a -"work based on the library" and a "work that uses the library". The -former contains code derived from the library, whereas the latter must -be combined with the library in order to run. - - GNU LESSER GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. 
This License Agreement applies to any software library or other -program which contains a notice placed by the copyright holder or -other authorized party saying it may be distributed under the terms of -this Lesser General Public License (also called "this License"). -Each licensee is addressed as "you". - - A "library" means a collection of software functions and/or data -prepared so as to be conveniently linked with application programs -(which use some of those functions and data) to form executables. - - The "Library", below, refers to any such software library or work -which has been distributed under these terms. A "work based on the -Library" means either the Library or any derivative work under -copyright law: that is to say, a work containing the Library or a -portion of it, either verbatim or with modifications and/or translated -straightforwardly into another language. (Hereinafter, translation is -included without limitation in the term "modification".) - - "Source code" for a work means the preferred form of the work for -making modifications to it. For a library, complete source code means -all the source code for all modules it contains, plus any associated -interface definition files, plus the scripts used to control compilation -and installation of the library. - - Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. The act of -running a program using the Library is not restricted, and output from -such a program is covered only if its contents constitute a work based -on the Library (independent of the use of the Library in a tool for -writing it). Whether that is true depends on what the Library does -and what the program that uses the Library does. - - 1. 
You may copy and distribute verbatim copies of the Library's -complete source code as you receive it, in any medium, provided that -you conspicuously and appropriately publish on each copy an -appropriate copyright notice and disclaimer of warranty; keep intact -all the notices that refer to this License and to the absence of any -warranty; and distribute a copy of this License along with the -Library. - - You may charge a fee for the physical act of transferring a copy, -and you may at your option offer warranty protection in exchange for a -fee. - - 2. You may modify your copy or copies of the Library or any portion -of it, thus forming a work based on the Library, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) The modified work must itself be a software library. - - b) You must cause the files modified to carry prominent notices - stating that you changed the files and the date of any change. - - c) You must cause the whole of the work to be licensed at no - charge to all third parties under the terms of this License. - - d) If a facility in the modified Library refers to a function or a - table of data to be supplied by an application program that uses - the facility, other than as an argument passed when the facility - is invoked, then you must make a good faith effort to ensure that, - in the event an application does not supply such function or - table, the facility still operates, and performs whatever part of - its purpose remains meaningful. - - (For example, a function in a library to compute square roots has - a purpose that is entirely well-defined independent of the - application. Therefore, Subsection 2d requires that any - application-supplied function or table used by this function must - be optional: if the application does not supply it, the square - root function must still compute square roots.) 
- -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Library, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Library, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote -it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Library. - -In addition, mere aggregation of another work not based on the Library -with the Library (or with a work based on the Library) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. You may opt to apply the terms of the ordinary GNU General Public -License instead of this License to a given copy of the Library. To do -this, you must alter all the notices that refer to this License, so -that they refer to the ordinary GNU General Public License, version 2, -instead of to this License. (If a newer version than version 2 of the -ordinary GNU General Public License has appeared, then you can specify -that version instead if you wish.) Do not make any other change in -these notices. - - Once this change is made in a given copy, it is irreversible for -that copy, so the ordinary GNU General Public License applies to all -subsequent copies and derivative works made from that copy. - - This option is useful when you wish to copy part of the code of -the Library into a program that is not a library. - - 4. 
You may copy and distribute the Library (or a portion or -derivative of it, under Section 2) in object code or executable form -under the terms of Sections 1 and 2 above provided that you accompany -it with the complete corresponding machine-readable source code, which -must be distributed under the terms of Sections 1 and 2 above on a -medium customarily used for software interchange. - - If distribution of object code is made by offering access to copy -from a designated place, then offering equivalent access to copy the -source code from the same place satisfies the requirement to -distribute the source code, even though third parties are not -compelled to copy the source along with the object code. - - 5. A program that contains no derivative of any portion of the -Library, but is designed to work with the Library by being compiled or -linked with it, is called a "work that uses the Library". Such a -work, in isolation, is not a derivative work of the Library, and -therefore falls outside the scope of this License. - - However, linking a "work that uses the Library" with the Library -creates an executable that is a derivative of the Library (because it -contains portions of the Library), rather than a "work that uses the -library". The executable is therefore covered by this License. -Section 6 states terms for distribution of such executables. - - When a "work that uses the Library" uses material from a header file -that is part of the Library, the object code for the work may be a -derivative work of the Library even though the source code is not. -Whether this is true is especially significant if the work can be -linked without the Library, or if the work is itself a library. The -threshold for this to be true is not precisely defined by law. 
- - If such an object file uses only numerical parameters, data -structure layouts and accessors, and small macros and small inline -functions (ten lines or less in length), then the use of the object -file is unrestricted, regardless of whether it is legally a derivative -work. (Executables containing this object code plus portions of the -Library will still fall under Section 6.) - - Otherwise, if the work is a derivative of the Library, you may -distribute the object code for the work under the terms of Section 6. -Any executables containing that work also fall under Section 6, -whether or not they are linked directly with the Library itself. - - 6. As an exception to the Sections above, you may also combine or -link a "work that uses the Library" with the Library to produce a -work containing portions of the Library, and distribute that work -under terms of your choice, provided that the terms permit -modification of the work for the customer's own use and reverse -engineering for debugging such modifications. - - You must give prominent notice with each copy of the work that the -Library is used in it and that the Library and its use are covered by -this License. You must supply a copy of this License. If the work -during execution displays copyright notices, you must include the -copyright notice for the Library among them, as well as a reference -directing the user to the copy of this License. Also, you must do one -of these things: - - a) Accompany the work with the complete corresponding - machine-readable source code for the Library including whatever - changes were used in the work (which must be distributed under - Sections 1 and 2 above); and, if the work is an executable linked - with the Library, with the complete machine-readable "work that - uses the Library", as object code and/or source code, so that the - user can modify the Library and then relink to produce a modified - executable containing the modified Library. 
(It is understood - that the user who changes the contents of definitions files in the - Library will not necessarily be able to recompile the application - to use the modified definitions.) - - b) Use a suitable shared library mechanism for linking with the - Library. A suitable mechanism is one that (1) uses at run time a - copy of the library already present on the user's computer system, - rather than copying library functions into the executable, and (2) - will operate properly with a modified version of the library, if - the user installs one, as long as the modified version is - interface-compatible with the version that the work was made with. - - c) Accompany the work with a written offer, valid for at - least three years, to give the same user the materials - specified in Subsection 6a, above, for a charge no more - than the cost of performing this distribution. - - d) If distribution of the work is made by offering access to copy - from a designated place, offer equivalent access to copy the above - specified materials from the same place. - - e) Verify that the user has already received a copy of these - materials or that you have already sent this user a copy. - - For an executable, the required form of the "work that uses the -Library" must include any data and utility programs needed for -reproducing the executable from it. However, as a special exception, -the materials to be distributed need not include anything that is -normally distributed (in either source or binary form) with the major -components (compiler, kernel, and so on) of the operating system on -which the executable runs, unless that component itself accompanies -the executable. - - It may happen that this requirement contradicts the license -restrictions of other proprietary libraries that do not normally -accompany the operating system. Such a contradiction means you cannot -use both them and the Library together in an executable that you -distribute. - - 7. 
You may place library facilities that are a work based on the -Library side-by-side in a single library together with other library -facilities not covered by this License, and distribute such a combined -library, provided that the separate distribution of the work based on -the Library and of the other library facilities is otherwise -permitted, and provided that you do these two things: - - a) Accompany the combined library with a copy of the same work - based on the Library, uncombined with any other library - facilities. This must be distributed under the terms of the - Sections above. - - b) Give prominent notice with the combined library of the fact - that part of it is a work based on the Library, and explaining - where to find the accompanying uncombined form of the same work. - - 8. You may not copy, modify, sublicense, link with, or distribute -the Library except as expressly provided under this License. Any -attempt otherwise to copy, modify, sublicense, link with, or -distribute the Library is void, and will automatically terminate your -rights under this License. However, parties who have received copies, -or rights, from you under this License will not have their licenses -terminated so long as such parties remain in full compliance. - - 9. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Library or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Library (or any work based on the -Library), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Library or works based on it. - - 10. 
Each time you redistribute the Library (or any work based on the -Library), the recipient automatically receives a license from the -original licensor to copy, distribute, link with or modify the Library -subject to these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. -You are not responsible for enforcing compliance by third parties with -this License. - - 11. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Library at all. For example, if a patent -license would not permit royalty-free redistribution of the Library by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Library. - -If any portion of this section is held invalid or unenforceable under any -particular circumstance, the balance of the section is intended to apply, -and the section as a whole is intended to apply in other circumstances. - -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system which is -implemented by public license practices. 
Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 12. If the distribution and/or use of the Library is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Library under this License may add -an explicit geographical distribution limitation excluding those countries, -so that distribution is permitted only in or among countries not thus -excluded. In such case, this License incorporates the limitation as if -written in the body of this License. - - 13. The Free Software Foundation may publish revised and/or new -versions of the Lesser General Public License from time to time. -Such new versions will be similar in spirit to the present version, -but may differ in detail to address new problems or concerns. - -Each version is given a distinguishing version number. If the Library -specifies a version number of this License which applies to it and -"any later version", you have the option of following the terms and -conditions either of that version or of any later version published by -the Free Software Foundation. If the Library does not specify a -license version number, you may choose any version ever published by -the Free Software Foundation. - - 14. If you wish to incorporate parts of the Library into other free -programs whose distribution conditions are incompatible with these, -write to the author to ask for permission. For software which is -copyrighted by the Free Software Foundation, write to the Free -Software Foundation; we sometimes make exceptions for this. 
Our -decision will be guided by the two goals of preserving the free status -of all derivatives of our free software and of promoting the sharing -and reuse of software generally. - - NO WARRANTY - - 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO -WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. -EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR -OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY -KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE -LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME -THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN -WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY -AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU -FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR -CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE -LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING -RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A -FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF -SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH -DAMAGES. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Libraries - - If you develop a new library, and you want it to be of the greatest -possible use to the public, we recommend making it free software that -everyone can redistribute and change. You can do so by permitting -redistribution under these terms (or, alternatively, under the terms of the -ordinary General Public License). - - To apply these terms, attach the following notices to the library. 
It is -safest to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least the -"copyright" line and a pointer to where the full notice is found. - - - Copyright (C) - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - -Also add information on how to contact you by electronic and paper mail. - -You should also get your employer (if you work as a programmer) or your -school, if any, to sign a "copyright disclaimer" for the library, if -necessary. Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the - library `Frob' (a library for tweaking knobs) written by James Random Hacker. - - , 1 April 1990 - Ty Coon, President of Vice - -That's all there is to it! 
- - diff --git a/branches/sage/mds/Makefile b/branches/sage/mds/Makefile deleted file mode 100644 index fd0c3623f0a6a..0000000000000 --- a/branches/sage/mds/Makefile +++ /dev/null @@ -1,304 +0,0 @@ -# -# until autoconf is set up, here are the options i understand: -# -# darwin=yes -- build on darwin -# fuse=no -- don't build anything requiring FUSE -# mpi=no -- don't build newsyn (require MPI) -# use_ccpp=yes -- use Common C++ for buffer.h reference counting -# want_bdb=yes -- build berkelydb objectstore -# - -# mpicxx must be on your path to build newsyn. -# on googoo, this means that /usr/local/mpich2-1.0.2/bin must be on your path. -# on issdm, it's /usr/local/mpich2/bin. - -# Hook for extra -I options, etc. -EXTRA_CFLAGS = #-I${HOME}/include -L${HOME}/lib -EXTRA_CFLAGS += -g -EXTRA_CFLAGS += -pg -#EXTRA_CFLAGS += -O3 - -# base -CFLAGS = -Wall -I. -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE ${EXTRA_CFLAGS} -LDINC = ld -i -o -CC = g++ -LIBS = -pthread - -# darwin? -ifeq ($(target),darwin) -CFLAGS += -DDARWIN -D__FreeBSD__=10 -LDINC = ar -rc -endif - -# use Common C++ (for buffer.h)? 
-ifeq ($(use_ccpp),yes) -CFLAGS += -D_GNU_SOURCE -DBUFFER_USE_CCPP -LIBS += -lccgnu2 -ldl -endif - - -#for normal mpich2 machines -MPICC = mpicxx -MPICFLAGS = -DMPICH_IGNORE_CXX_SEEK ${CFLAGS} -MPILIBS = ${LIBS} - -#for LLNL boxes without mpicxx -#MPICC = g++ -#MPICFLAGS = ${CFLAGS} -I/usr/lib/mpi/mpi_gnu/include -L/usr/lib/mpi/mpi_gnu/lib -#MPILIBS = ${LIBS} -lelan -lmpi - -EBOFS_OBJS= \ - ebofs/BlockDevice.o\ - ebofs/BufferCache.o\ - ebofs/Ebofs.o\ - ebofs/Allocator.o\ - ebofs/FileJournal.o - -MDS_OBJS= \ - mds/MDS.o\ - mds/journal.o\ - mds/Server.o\ - mds/MDCache.o\ - mds/Locker.o\ - mds/Migrator.o\ - mds/MDBalancer.o\ - mds/CDentry.o\ - mds/CDir.o\ - mds/CInode.o\ - mds/AnchorTable.o\ - mds/AnchorClient.o\ - mds/LogEvent.o\ - mds/IdAllocator.o\ - mds/ClientMap.o\ - mds/MDLog.o - -OSD_OBJS= \ - osd/PG.o\ - osd/ReplicatedPG.o\ - osd/Ager.o\ - osd/FakeStore.o\ - osd/OSD.o -# osd/RAID4PG.o\ - -OSDC_OBJS= \ - osdc/Objecter.o\ - osdc/ObjectCacher.o\ - osdc/Filer.o\ - osdc/Journaler.o - -MON_OBJS= \ - mon/Monitor.o\ - mon/Paxos.o\ - mon/PaxosService.o\ - mon/OSDMonitor.o\ - mon/MDSMonitor.o\ - mon/ClientMonitor.o\ - mon/PGMonitor.o\ - mon/Elector.o\ - mon/MonitorStore.o - -COMMON_OBJS= \ - msg/Message.o\ - common/Logger.o\ - common/Clock.o\ - common/Timer.o\ - config.o - -CLIENT_OBJS= \ - client/FileCache.o\ - client/Client.o\ - client/SyntheticClient.o\ - client/Trace.o - - -# bdbstore? 
-ifeq ($(want_bdb),yes) -CFLAGS += -DUSE_OSBDB -LIBS = -ldb_cxx -OSD_OBJS += osbdb/OSBDB.o -OSBDB_OBJS = \ - osbdb/OSBDB.o -endif - - -# targets -TARGETS = cmon cosd cmds csyn mkmonmap cmonctl fakesyn dupstore -SRCS=*.cc */*.cc *.h */*.h */*/*.h - -ifneq ($(fuse),no) -TARGETS += cfuse fakefuse -endif - -ifneq ($(mpi),no) -TARGETS += newsyn -endif - -all: depend ${TARGETS} - -test: depend ${TEST_TARGETS} - - -# real bits -mkmonmap: mkmonmap.cc common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -extractosdmaps: extractosdmaps.cc common.o osd.o mon.o ebofs.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -cmon: cmon.o mon.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -cmonctl: cmonctl.cc msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -cosd: cosd.o osd.o ebofs.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -cmds: cmds.o mds.o osdc.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -csyn: csyn.o client.o osdc.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -cfuse: cfuse.o client.o osdc.o client/fuse.o client/fuse_ll.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@ - - -# code shipping experiments -activemaster: active/activemaster.cc client.o osdc.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -activeslave: active/activeslave.cc client.o osdc.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -echotestclient: active/echotestclient.cc - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -msgtestclient: active/msgtestclient.o client.o osdc.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -libtrivialtask.so: active/trivial_task.cc client.o osdc.o msg/SimpleMessenger.o common.o - ${CC} -fPIC -shared -Wl,-soname,$@.1 ${CFLAGS} ${LIBS} $^ -o $@ - - - -# IPC interface -ipc_server: ceph_ipc/ipc_server.cc client.o osdc.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -ipc_testclient: 
ceph_ipc/ipc_testclient.cc ceph_ipc/ipc_client.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - - -# fake* -fakefuse: fakefuse.o mon.o mds.o client.o osd.o osdc.o ebofs.o client/fuse.o client/fuse_ll.o msg/FakeMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@ - -fakesyn: fakesyn.o mon.o mds.o client.o osd.o ebofs.o osdc.o msg/FakeMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - - -# mpi startup -newsyn: newsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/SimpleMessenger.o common.o - ${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@ - - -# ebofs -mkfs.ebofs: ebofs/mkfs.ebofs.cc config.cc common/Clock.o ebofs.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -test.ebofs: ebofs/test.ebofs.cc config.cc common/Clock.o ebofs.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -dupstore: dupstore.cc config.cc ebofs.o common/Clock.o common/Timer.o osd/FakeStore.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - - -# hadoop -libhadoopcephfs.so: client/hadoop/CephFSInterface.cc client.o osdc.o msg/SimpleMessenger.o common.o - ${CC} -fPIC -shared -Wl,-soname,$@.1 ${CFLAGS} -I/usr/local/java/include -I/usr/local/java/include/linux ${LIBS} $^ -o $@ - -# libceph -libceph.o: client/ldceph.o client/Client.o msg/SimpleMessenger.o ${COMMON_OBJS} ${SYN_OBJS} ${OSDC_OBJS} - ${LDINC} $^ -o $@ - -# some benchmarking tools -bench/mdtest/mdtest.o: bench/mdtest/mdtest.c - mpicc -c $^ -o $@ - -mdtest: bench/mdtest/mdtest.o - ${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@ - -mdtest.ceph: bench/mdtest/mdtest.o libceph.o - ${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@ - -testos: test/testos.o ebofs.o osbdb.o common.o - ${CC} ${CFLAGS} ${LIBS} ${OSBDB_LIBS} -o $@ $^ - - -# misc -gprof-helper.so: test/gprof-helper.c - gcc -shared -fPIC test/gprof-helper.c -o gprof-helper.so -lpthread -ldl - -test_disk_bw: test/test_disk_bw.cc common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -# bits -common.o: ${COMMON_OBJS} - ${LDINC} $@ $^ - -ebofs.o: ${EBOFS_OBJS} - ${LDINC} $@ $^ - -client.o: ${CLIENT_OBJS} - ${LDINC} $@ $^ - -osd.o: ${OSD_OBJS} - 
${LDINC} $@ $^ - -osdc.o: ${OSDC_OBJS} - ${LDINC} $@ $^ - -mds.o: ${MDS_OBJS} - ${LDINC} $@ $^ - -mon.o: ${MON_OBJS} - ${LDINC} $@ $^ - -osbdb.o: ${OSBDB_OBJS} - ${LDINC} $@ $^ - - -# generic rules -%.so: %.cc - ${CC} -shared -fPIC ${CFLAGS} $< -o $@ - -%.o: %.cc - ${CC} ${CFLAGS} -c $< -o $@ - -%.po: %.cc - ${CC} -fPIC ${CFLAGS} -c $< -o $@ - - -# handy -clean: - rm -f *.o */*.o ${TARGETS} ${TEST_TARGETS} - -count: - cat ${SRCS} | wc -l - cat ${SRCS} | grep -c \; - -TAGS: - etags `find . -name "*.[h|cc]"` - -.depend: - touch .depend - -depend: - $(RM) .depend - makedepend -f- -- $(CFLAGS) -- $(SRCS) > .depend 2>/dev/null -# for f in $(SRCS) ; do cpp -MM $(CFLAGS) $$f 2> /dev/null >> .depend ; done - - -# now add a line to include the dependency list. -include .depend diff --git a/branches/sage/mds/README b/branches/sage/mds/README deleted file mode 100644 index aa016817cebf0..0000000000000 --- a/branches/sage/mds/README +++ /dev/null @@ -1,4 +0,0 @@ -Ceph - a scalable distributed file system ------------------------------------------ - -Please see http://ceph.sourceforge.net/ for current info. diff --git a/branches/sage/mds/TODO b/branches/sage/mds/TODO deleted file mode 100644 index 1b03b7f450243..0000000000000 --- a/branches/sage/mds/TODO +++ /dev/null @@ -1,265 +0,0 @@ - -some smallish projects: - -- crush rewrite in C - - generalize any memory management etc. to allow use in kernel and userspace -- userspace crush tools - - xml import/export? - - ? - -- pg monitor service - - to support statfs? - - general pg health - - some sort of (throttled) osd status reporting - - dynamic pg creation (eventually!) - -- SimpleMessenger - - clean up/merge Messenger/Dispatcher interfaces - - auto close idle connections - - delivery ack and buffering, and then reconnect - - take a look at RDS? http://oss.oracle.com/projects/rds/ - -- generalize monitor client? 
- - throttle message resend attempts - -- ENOSPC on client, OSD - - - - -code cleanup -- endian portability -- word size - - clean up all encoded structures - - - -mds bugs -- rename slave in-memory rollback on failure -- proper handling of cache expire messages during rejoin phase? - -> i think cache expires are fine; the rejoin_ack handler just has to behave if rejoining items go missing -- try_remove_unlinked_dn thing - -- rerun destro trace against latest, with various journal lengths - -mds -- stray reintegration -- extend/clean up filepath to allow paths relative to an ino - - fix path_traverse - - fix reconnect/rejoin open file weirdness -- real chdir (directory "open") - - relative metadata ops -- get rid of C*Discover objects for replicate_to .. encode to bufferlists directly? -- consistency points/snapshots - - dentry versions vs dirfrags... -- detect and deal with client failure - - failure during reconnect vs clientmap. although probalby the whole thing needs a larger overhaul... - -- inode.rmtime (recursive mtime) -- make inode.size reflect directory size (number of entries) - -- inode.max_size -- inode.allocated_size - -- osd needs a set_floor_and_read op for safe failover/STOGITH-like semantics. - -- could mark dir complete in EMetaBlob by counting how many dentries are dirtied in the current log epoch in CDir... - -- fix rmdir empty exported dirfrag race - - export all frags <= 1 item? then we ensure freezing before empty, avoiding any last unlink + export vs rmdir race. - - how to know full dir size (when trimming)? - - put frag size/mtime in fragmap in inode? we will need that anyway for stat on dirs - - will need to make inode discover/import_decode smart about dirfrag auth - - or, only put frag size/mtime in inode when frag is closed. otherwise, soft (journaled) state, possibly on another mds. - - need to move state from replicas to auth. simplelock doesn't currently support that. - - ScatterLock or something? hrm. 
- -- FIXME how to journal/store root and stray inode content? - - in particular, i care about dirfragtree.. get it on rejoin? - - and dir sizes, if i add that... also on rejoin? - -- efficient stat for single writers -- lstat vs stat? -- add FILE_CAP_EXTEND capability bit -- only share osdmap updates with clients holding capabilities -- delayed replica caps release... we need to set a timer event? (and cancel it when appropriate?) - - -client -- clean up client mds session vs mdsmap behavior? -- client caps migration races - - caps need a seq number; reap logic needs to be a bit smarter - - also needs cope with mds failures -- fstat - - -osdmon -- allow fresh replacement osds. add osd_created in osdmap, probably -- monitor needs to monitor some osds... -- monitor pg states, notify on out? -- watch osd utilization; adjust overload in cluster map - -journaler -- fix up for large events (e.g. imports) -- use set_floor_and_read for safe takeover from possibly-not-quite-dead otherguy. -- should we pad with zeros to avoid splitting individual entries? - - make it a g_conf flag? - - have to fix reader to skip over zeros (either <4 bytes for size, or zeroed sizes) -- need to truncate at detected (valid) write_pos to clear out any other partial trailing writes - - -crush -- xml import/export? -- crush tools - - -rados snapshots -- integrate revisions into ObjectCacher -- clean up oid.rev vs op.rev in osd+osdc - -- attr.crev is rev we were created in. -- oid.rev=0 is "live". defined for attr.crev <= rev. -- otherwise, defined for attr.crev <= rev < oid.rev (i.e. oid.rev is upper bound, non-inclusive.) - -- write|delete is tagged with op.rev - - if attr.crev < op.rev - - we clone to oid.rev=rev (clone keeps old crev) - - change live attr.crev=rev. - - apply update -- read is tagged with op.rev - - if 0, we read from 0 (if it exists). - - otherwise we choose object rev based on op.rev vs oid.rev, and then verifying attr.crev <= op.rev. 
- -- how to get usage feedback to monitor? - -- clean up mds caps release in exporter -- figure out client failure modes -- add connection retry. - - -objecter -- maybe_request_map should set a timer event to periodically re-request. -- transaction prepare/commit -- read+floor_lockout - -osd/rados -- transaction prepare/commit - - rollback - - rollback logging (to fix slow prepare vs rollback race) -- read+floor_lockout for clean STOGITH-like/fencing semantics after failover. - -- consider implications of nvram writeahead logs -- clean shutdown? -- pgmonitor should supplement failure detection - -- flag missing log entries on crash recovery --> WRNOOP? or WRLOST? - -- efficiently replicate clone() objects -- fix heartbeat wrt new replication -- mark residual pgs obsolete ??? -- rdlocks -- optimize remove wrt recovery pushes -- report crashed pgs? - -messenger -- fix messenger shutdown.. we shouldn't delete messenger, since the caller may be referencing it, etc. - -simplemessenger -- close idle connections -- buffer sent messages until a receive is acknowledged (handshake!) - - retry, timeout on connection or transmission failure -- exponential backoff on monitor resend attempts (actually, this should go outside the messenger!) - -objectcacher -- merge clean bh's -- ocacher caps transitions vs locks -- test read locks - -reliability -- heartbeat vs ping? -- osdmonitor, filter - -ebofs -- allow holes -- allow btree sets -- optionally scrub deallocated extents -- clone() - -- map ObjectStore - -- verify proper behavior of conflicting/overlapping reads of clones -- combine inodes and/or cnodes into same blocks -- nonblocking write on missing onodes? -- fix bug in node rotation on insert (and reenable) -- fix NEAR_LAST_FWD (?) - -- awareness of underlying software/hardware raid in allocator so that we - write full stripes _only_. - - hmm, that's basically just a large block size. - -- rewrite the btree code! 
- - multithreaded - - eliminate nodepools - - allow btree sets - - allow arbitrary embedded data? - - allow arbitrary btrees - - allow root node(s?) to be embedded in onode, or whereever. - - keys and values can be uniform (fixed-size) or non-uniform. - - fixed size (if any) is a value in the btree struct. - - negative indicates bytes of length value? (1 -> 255bytes, 2 -> 65535 bytes, etc.?) - - non-uniform records preceeded by length. - - keys sorted via a comparator defined in btree root. - - lexicographically, by default. - -- goal - - object btree key->value payload, not just a data blob payload. - - better threading behavior. - - with transactional goodness! - -- onode - - object attributes.. as a btree? - - blob stream - - map stream. - - allow blob values. - - - - - - -remaining hard problems -- how to cope with file size changes and read/write sharing - - -crush -- more efficient failure when all/too many osds are down -- allow forcefeed for more complicated rule structures. (e.g. make force_stack a list< set >) - - - - - - - - - - - -why qsync could be wrong (for very strict POSIX) : varying mds -> client message transit or processing times. -- mds -> 1,2 : qsync -- client1 writes at byte 100 -- client1 -> mds : qsync reply (size=100) -- client1 writes at byte 300 -- client1 -> client2 (outside channel) -- client2 writes at byte 200 -- client2 -> mds : qsync reply (size=200) --> stat results in size 200, even though at no single point in time was the max size 500. --> for correct result, need to _stop_ client writers while gathering metadata. 
- - -- dump active config in run output somewhere - - - - - - diff --git a/branches/sage/mds/active/README b/branches/sage/mds/active/README deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/branches/sage/mds/active/activemaster.cc b/branches/sage/mds/active/activemaster.cc deleted file mode 100644 index b4dc742c414ab..0000000000000 --- a/branches/sage/mds/active/activemaster.cc +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Startup executable for - * Ceph Active Storage. See README for details. - * - */ -#include "activemaster.h" - - -/* - * What main() must do: - * - * - start up a Ceph client - * - find the set of OSDs that the file is striped across - * - start up the Map task on each OSD, using ssh - * - eat lunch? - * - start up the Reduce task locally - */ - -int main(int argc, const char* argv[]) { - - if (argc < 4) { - usage(argv[0]); - exit(-1); - } - - const char* input_filename = argv[1]; - const char* map_command = argv[2]; - //const char* reduce_command = argv[3]; - - // fire up the client - Client* client = startCephClient(); - - // open the file as read_only - int fh = client->open(input_filename, O_RDONLY); - if (fh < 0) { - cout << "The input file " << input_filename << " could not be opened." << endl; - exit(-1); - } - - // How big is the file? 
- int filesize; - struct stat stbuf; - if (0 > client->lstat(input_filename, &stbuf)) { - cout << "Error: could not retrieve size of input file " << input_filename << endl; - exit(-1); - } - filesize = stbuf.st_size; - if (filesize < 1) { - cout << "Error: input file size is " << filesize << endl; - exit(-1); - } - - // retrieve all the object extents - list extents; - int offset = 0; - client->enumerate_layout(fh, extents, filesize, offset); - - // for each object extent, retrieve the OSD IP address and start up a Map task - list::iterator i; - map::iterator j; - int osd; - int start, length; - tcpaddr_t tcpaddr; - - for (i = extents.begin(); i != extents.end(); i++) - { - // find the primary and get its IP address - osd = client->osdmap->get_pg_primary(i->pgid); - entity_inst_t inst = client->osdmap->get_inst(osd); - entity_addr_t entity_addr = inst.addr; - entity_addr.make_addr(tcpaddr); - - // iterate through each buffer_extent in the ObjectExtent - for (j = i->buffer_extents.begin(); - j != i->buffer_extents.end(); j++) - { - // get the range of the buffer_extent - start = (*j).first; - length = (*j).second; - // fire up the Map task - start_map_task(map_command, input_filename, start, length, tcpaddr); - } - } - return 0; -} - -// Fires up the map task. -// For the moment, all it does is echo the command line, not run it. -int start_map_task(const char* command, const char* input_filename, - long start, long length, sockaddr_in ip_address) -{ - string ip_addr_string(inet_ntoa(ip_address.sin_addr)); - - - - - - cout << "ssh " << ip_addr_string << " " << command - << " " << input_filename << " " << start << " " << length << endl; - return 0; -} - - - -void usage(const char* name) { - cout << "usage: " << name << " inputfile map_task reduce_task" << endl; - cout << "inputfile must be a valid path in the running Ceph filesystem." << endl; - cout << "map_task should be given with an absolute path, and be present on "; - cout << "the REGULAR filesystem every node." 
<< endl; - cout << "reduce_task need be present on this node only." << endl; -} - - - - diff --git a/branches/sage/mds/active/activemaster.h b/branches/sage/mds/active/activemaster.h deleted file mode 100644 index 524138e253c7b..0000000000000 --- a/branches/sage/mds/active/activemaster.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * This is the master executable to start up - * a compute task across several nodes. - * - * - */ - - -//#include -#include "utility.h" - -int start_map_task(const char* command, const char* input_filename, - long start, long length, tcpaddr_t ip_address); - -void usage(const char* name); - -//Client* startCephClient(); -//void kill_client(Client* client); diff --git a/branches/sage/mds/active/activeslave.cc b/branches/sage/mds/active/activeslave.cc deleted file mode 100644 index d2953490f9d69..0000000000000 --- a/branches/sage/mds/active/activeslave.cc +++ /dev/null @@ -1,510 +0,0 @@ -/* - * This is a slave for receiving and executing commands for - * compute tasks on an OSD. This supersedes the daemon - * version in activetaskd.h/cc, because it's easier to debug - * if it's not a daemon. - * - * Networking code is based off examples from Stevens' UNIX Network Programming. - */ - -#include "activeslave.h" - -int main(int argc, const char* argv[]) { - - /* Set up TCP server */ - int sockfd, newsockfd, childpid; - socklen_t clilen; - struct sockaddr_in cli_addr, serv_addr; - - //const char *pname = argv[0]; // process name - - // Open a TCP socket - if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { - cerr << "slave: can't open TCP socket. Exiting." << endl; - exit(-1); - } - cerr << "slave: opened TCP socket." << endl; - - // set up the port - bzero((char*) &serv_addr, sizeof(serv_addr)); - serv_addr.sin_family = AF_INET; - serv_addr.sin_addr.s_addr = htonl(INADDR_ANY); - serv_addr.sin_port = htons(SERV_TCP_PORT); - - if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0) { - cerr << "slave: can't bind local address. 
Exiting." << endl; - exit(-1); - } - - if(listen(sockfd, SOMAXCONN) < 0) { - cerr << "slave: listening error. Exiting." << endl; - exit(-1); - } - - - /* The Big Loop */ - while (1) { - - // wait for a message and fork off a child process to handle it - clilen = sizeof(cli_addr); - newsockfd = accept(sockfd, - (struct sockaddr *) &cli_addr, - &clilen); - - if (newsockfd < 0) { - cerr << "slave: accept error. Exiting." << endl; - exit(-1); - } - - if ((childpid = fork()) < 0) { - cerr << "slave: fork error. Exiting." << endl; - exit(-1); - } - - else if (childpid == 0) { // child process - cerr << "Forked child process for incoming socket" << endl; - close(sockfd); - process_request(newsockfd); - cerr << "Finished processing request. Exiting child." << endl; - exit(0); - } - - close (newsockfd); // parent - - //sleep(30); /* wait 30 seconds */ - } - exit(EXIT_SUCCESS); -} - - -/* This will process requests from the master. - * The protocol in a nutshell: - * Master opens a socket to slave, and sends - * one message. - * Slave replies with one message. - * Socket is closed. - */ - -void process_request(int newsockfd) { - - // first, read the message type. - int msg_type = readmsgtype(newsockfd); - - - // Second, call some function based on the message type to process - // the rest of the message. The function is responsible for the rest - // of the message; this includes checking the message footer. - - switch(msg_type) { - - case PING: // ping - process_ping(newsockfd); - break; - case STARTTASK: // start_task - process_start_task(newsockfd); - break; - case RETRIEVELOCALFILE: // get_local - process_get_local(newsockfd); - break; - case SHIPCODE: - process_shipcode(newsockfd); - break; - - case PINGREPLY: - case FINISHEDTASK: - case TASKFAILED: - case SENDLOCALFILE: - case LOCALFILENOTFOUND: - case CODESAVED: - case SHIPFAILED: - cerr << "activeslave: BUG received message " << CMD_LIST[msg_type] << - " from master; master should never send this message." 
<< endl; - exit(-1); - break; - - - case -1: - cerr << "activeslave: message had an unidentifiable type. " << - "Closing socket and discarding rest of message." << endl; - default: - cerr << "activeslave: BUG! received unexpected return value of" << msg_type << - "from readmsgtype(). Closing socket and discarding rest of message." << endl; - - exit(-1); - } -} - - -// Just write a ping_reply to the socket. -void process_ping(int fd) { - - // make sure the footer is valid - if (!check_footer(fd)) { - cerr << "process_ping warning: ping message has invalid or missing footer." - << endl; - } - // Even if the footer's invalid, send the reply. - cerr << "Replying to ping..." << endl; - send_msg_header(fd, PINGREPLY); - send_msg_footer(fd); - cerr << "Ping processing completed." << endl; -} - - - -// Process a start_task message. This reads the incoming message and -// starts the corresponding task. - -// Parameter format: taskID(int) command(string) -// cephinputfile(string) offset(long) length(long) localoutputfile - -// WARNING: currently has the trivial task hardwired. It -// ignores the command and the output file. -void process_start_task(int fd) { - - char command[MAX_STRING_SIZE + 1]; - char cephinputfile[MAX_STRING_SIZE + 1]; - char localoutputfile[MAX_STRING_SIZE + 1]; - - cout << "in process_start_task: "; - int taskID = read_positive_int(fd); - cout << "read taskID " << taskID; - - read_string(fd, command); - cout << ", command " << command; - - read_string(fd, cephinputfile); - cout << ", cephinputfile " << cephinputfile; - off_t offset = read_off_t(fd); - cout << ", offset " << offset; - off_t length = read_off_t(fd); - cout << ", length " << length; - - read_string(fd, localoutputfile); - cout << ", localoutputfile " << localoutputfile << endl; - - // make sure the footer is valid - if (!check_footer(fd)) { - cerr << "process_start_task warning: message has invalid or missing footer. " - << "Discarding message." 
<< endl; - exit(-1); - } - - - // To do: modify to load the task from a library instead of just - // using the hardwired one. - - void (*task)(const char*, const char*, off_t, off_t) = 0; - task = start_trivial_task; - - - - // start a task; create an output filename that uses the task ID, 'cause we might - // end up with multiple pieces of a file on each OSD. - // WARNING: always does the trivial task; prints answer to stdout but - // does not write it to disk. - cerr << "starting task: " << endl; - //start_trivial_task(cephinputfile, localoutputfile, offset, length); - task(cephinputfile, localoutputfile, offset, length); - cerr << "returned from task! Sending reply:" << endl; - - - - // send the reply - send_msg_header(fd, FINISHEDTASK); - write_positive_int(fd, taskID); - send_msg_footer(fd); - - // done - cout << "Done sending reply for taskID " << taskID << endl; - return; -} - - - -void start_trivial_task (const char* ceph_filename, const char* local_filename, - off_t offset, off_t length) { - // Don't bother to copy the file to disk. Read the file directly from Ceph, - // and add up all the bytes. - // Write the total to the local file as a string. - Client * client = startCephClient(); - - bufferptr bp(CHUNK); - - // get the source file's size. Sanity-check the request range. 
- struct stat st; - int r = client->lstat(ceph_filename, &st); - assert (r == 0); - - off_t src_total_size = st.st_size; - if (src_total_size < offset + length) { - cerr << "Error in copy ExtentToLocalFile: offset + length = " << offset << " + " << length - << " = " + (offset + length) << ", source file size is only " << src_total_size << endl; - exit(-1); - } - off_t remaining = length; - - // open the file and seek to the start position - cerr << "start_trivial_task: opening the source file and seeking " << endl; - - int fh_ceph = client->open(ceph_filename, O_RDONLY); - assert (fh_ceph > -1); - r = client->lseek(fh_ceph, offset, SEEK_SET); - assert (r == offset); - - int counter = 0; - // read through the extent and add up the bytes - cerr << "start_trivial_task: counting up bytes" << endl; - char* bp_c = bp.c_str(); - while (remaining > 0) { - off_t got = client->read(fh_ceph, bp_c, MIN(remaining,CHUNK), -1); - assert(got > 0); - remaining -= got; - for (off_t i = 0; i < got; ++i) { - counter += (unsigned int)(bp_c[i]); - } - } - cerr << "start_trivial_task: Done! Answer is " << counter << endl; - client->close(fh_ceph); - - //assert(0); -} - - -// Starts a sloppy grep count of the hardwired search string over the -// given Ceph file extent. It's sloppy because it copies the given -// extent to a local file and runs "grep" on it, with no effort to take -// care of boundary issues. -void start_sloppy_grepcount (const char* ceph_filename, const char* local_filename, - long offset, long size) { - - Client* client = startCephClient(); - char* search_string = "the"; - // copy the file to a local file. - - copyExtentToLocalFile (client, ceph_filename, offset, size, local_filename); - // we want: grep -c search_string local_filename - // to get the number of occurrences of the string. - string command = ""; - command.append("grep -c "); - command.append(search_string); - command.append(local_filename); - - assert(0); - -} - - -// Processes a SHIPCODE message. 
The message will have a shared -// library attached to it, which must be stored locally. - -void process_shipcode(int fd) { - - - // get the size of the shared library - size_t library_size = read_size_t(fd); - - - // save the library to a file - cerr << "saving library..." << endl; - const char* libfile = "/tmp/libslavetask.so"; - int local_fd = ::open(libfile, O_WRONLY | O_CREAT | O_TRUNC); - if (local_fd < 0) { - cerr << "Error opening " << libfile << " for writing." << endl; - exit(-1); - } - - off_t remaining = library_size; - - bufferptr bp(CHUNK); - char* bp_c = bp.c_str(); - while (remaining > 0) { - off_t got = readn(fd, bp_c, MIN(remaining, CHUNK)); - assert(got > 0); - remaining -= got; - ssize_t written = ::write(local_fd, bp_c, got); - assert (written == got); - } - cerr << "Received shared library and stored as " << libfile << endl; - -} - - -// Processes a get_local message. The message -// specifies the filename of a local file to -// return to the sender. - -// Parameter format: requestID(int) localfilename(string) - -// INCOMPLETE: currently just reads the message. - - -void process_get_local(int fd) { - cout << "in process_get_local: "; - int taskID = read_positive_int(fd); - cout << "read taskID " << taskID; - - char localfilename[MAX_STRING_SIZE+1]; - read_string(fd, localfilename); - cout << ", localfilename " << localfilename << endl; - - - // make sure the footer is valid - if (!check_footer(fd)) { - cerr << "process_get_local warning: message has invalid or missing footer." - << endl; - } - - // not implemented - cerr << "Error: get_local command unimplemented." << endl; - assert(0); -} - - -// Retrieves a formatted message from the socket. -// At the moment, this just reads and prints a fixed- -// length message type. -// DEPRECATED. 
-void str_getmsg(int sockfd) { - - int n; - - // read message types until the connection dies - while(true) { - n = readmsgtype(sockfd); - if (n != 0) { - cerr << "from getmsg: some sort of error" << endl; - exit(-1); - } - } -} - -// Echo a stream socket message back to the sender. -// DEPRECATED. -void str_echo(int sockfd) { - - int n; - char line[MAXLINE]; - - while(true) { - - // read from the stream - cerr << "str_echo: waiting for a line" << endl; - n = readline(sockfd, line, MAXLINE); - cerr << "str_echo: read a line" << endl; - if (0 == n) { - cerr << "str_echo: connection terminated" << endl; - return; // connection is terminated - } - else if (n < 0) { - cerr << "str_echo: readline error" << endl; - exit(-1); - } - - // write back to the stream - if (n != writen(sockfd, line, n)) { - cerr << "str_echo: writen error" << endl; - exit(-1); - } - else - cerr << "Echoed line " << endl; - } -} - - -void str_ack(int sockfd) { - - int n; - char line[MAXLINE]; - //char *ack = "ack"; - - while(true) { - - // read from the stream - n = readline(sockfd, line, MAXLINE); - - if (0 == n) - return; // connection is terminated - else if (n < 0) - //err_dump("str_echo: readline error"); - exit(-1); - - // write back to the stream - if (4 != writen(sockfd, "ack\n", 4)) - //err_dump("str_echo: writen error"); - exit(-1); - } -} - - - -// Read command lines from the socket and execute them - -void str_run(int sockfd) { - - int n; - char line[MAXLINE]; - char* error_msg = "str_run: No command interpreter found\n"; - char* ack_msg = "Running command... 
"; - char* commit_msg = "Command executed!\n"; - - while(true) { - - // read from the stream - n = readline(sockfd, line, MAXLINE); - - if (0 == n) - return; // connection is terminated - else if (n < 0) - //err_dump("str_echo: readline error"); - exit(-1); - - if (system(NULL)) { - writen(sockfd, ack_msg, strlen(ack_msg)); - system(line); - writen(sockfd, commit_msg, strlen(commit_msg)); - } - else if ((int)strlen(error_msg) != writen(sockfd, error_msg, strlen(error_msg))) - //err_dump("str_echo: writen error"); - exit(-1); - } -} - - -// take a filename and copy it from Ceph to a local directory. -// Not completed. - -void str_copytolocal(int sockfd) { - - int n; - char line[MAXLINE]; - char* error_msg = "str_copy: No command interpreter found\n"; - char* ack_msg = "Running command... "; - char* commit_msg = "Command executed!\n"; - //char* temp_dir = "/tmp"; - - - while(true) { - - // read from the stream - n = readline(sockfd, line, MAXLINE); - - if (0 == n) - return; // connection is terminated - else if (n < 0) - //err_dump("str_echo: readline error"); - exit(-1); - - if (system(NULL)) { - writen(sockfd, ack_msg, strlen(ack_msg)); - system(line); - writen(sockfd, commit_msg, strlen(commit_msg)); - } - else if ((int)strlen(error_msg) != writen(sockfd, error_msg, strlen(error_msg))) - //err_dump("str_echo: writen error"); - exit(-1); - } -} - - - diff --git a/branches/sage/mds/active/activeslave.h b/branches/sage/mds/active/activeslave.h deleted file mode 100644 index 574824b0478f6..0000000000000 --- a/branches/sage/mds/active/activeslave.h +++ /dev/null @@ -1,23 +0,0 @@ -#include "inet.h" -#include "common.h" -#include "utility.h" -#include "client/Client.h" - - -// The port number is "osdd" on a telephone keypad. 
-#define SERV_TCP_PORT 6733 - -#define MAXLINE 512 - -void str_echo(int sockfd); -void str_ack(int sockfd); -void str_run(int sockfd); -void str_getmsg(int sockfd); -void process_request(int newsockfd); -void process_ping(int fd); -void process_start_task(int fd); -void process_get_local(int fd); -void process_shipcode(int fd); - -void start_trivial_task(const char* ceph_filename, const char* local_filename, - long offset, long length); diff --git a/branches/sage/mds/active/activetaskd.cc b/branches/sage/mds/active/activetaskd.cc deleted file mode 100644 index ec9f290543093..0000000000000 --- a/branches/sage/mds/active/activetaskd.cc +++ /dev/null @@ -1,241 +0,0 @@ -/* - * This is a daemon for receiving and executing commands for compute tasks on an OSD. - * - * The daemon uses skeleton code from - * http://www.linuxprofilm.com/articles/linux-daemon-howto.html. The - * site is no longer up, but can be seen through the archive.org. - * Networking code is based off examples from Stevens' UNIX Network Programming. - */ - -#include "activetaskd.h" - - -#define SERVER - -#undef SERVER - -int main(int argc, const char* argv[]) { - - /* Our process ID and Session ID */ - pid_t pid, sid; - - /* Fork off the parent process */ - pid = fork(); - if (pid < 0) { - exit(EXIT_FAILURE); - } - /* If we got a good PID, then - we can exit the parent process. 
*/ - if (pid > 0) { - exit(EXIT_SUCCESS); - } - - /* Change the file mode mask */ - umask(0); - - /* Open any logs here */ - - /* Create a new SID for the child process */ - sid = setsid(); - if (sid < 0) { - /* Log the failure */ - exit(EXIT_FAILURE); - } - - - /* Change the current working directory */ - if ((chdir("/")) < 0) { - /* Log the failure */ - exit(EXIT_FAILURE); - } - - /* Close out the standard file descriptors */ - close(STDIN_FILENO); - close(STDOUT_FILENO); - close(STDERR_FILENO); - - /* Daemon-specific initialization goes here */ - - - - /* Set up TCP server */ - int sockfd, newsockfd, childpid; - socklen_t clilen; - struct sockaddr_in cli_addr, serv_addr; - - const char *pname = argv[0]; // process name - - // Open a TCP socket - if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) - exit(-1); - //err_dump("server: can't open stream socket"); - - // set up the port - bzero((char*) &serv_addr, sizeof(serv_addr)); - serv_addr.sin_family = AF_INET; - serv_addr.sin_addr.s_addr = htonl(INADDR_ANY); - serv_addr.sin_port = htons(SERV_TCP_PORT); - - if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0) - exit(-1); - //err_dump("server: can't bind local address"); - - if(listen(sockfd, SOMAXCONN) < 0) - exit(-1); - //err_dump("server: listening error"); - - /* The Big Loop */ - while (1) { - - // wait for a message and fork off a child process to handle it - clilen = sizeof(cli_addr); - newsockfd = accept(sockfd, - (struct sockaddr *) &cli_addr, - &clilen); - - if (newsockfd < 0) - exit(-1); - //err_dump("server: accept error"); - - if ( (childpid = fork()) < 0) - exit(-1); - // err_dump("server: fork error"); - - else if (childpid == 0) { // child process - close(sockfd); - //str_echo(newsockfd); - str_run(newsockfd); - // insert code to process the request - exit(0); - } - - close (newsockfd); // parent - - //sleep(30); /* wait 30 seconds */ - } - exit(EXIT_SUCCESS); -} - - -// Echo a stream socket message back to the sender. 
- -void str_echo(int sockfd) { - - int n; - char line[MAXLINE]; - - while(true) { - - // read from the stream - n = readline(sockfd, line, MAXLINE); - - if (0 == n) - return; // connection is terminated - else if (n < 0) - //err_dump("str_echo: readline error"); - exit(-1); - - // write back to the stream - if (n != writen(sockfd, line, n)) - //err_dump("str_echo: writen error"); - exit(-1); - } -} - - -void str_ack(int sockfd) { - - int n; - char line[MAXLINE]; - char *ack = "ack"; - - while(true) { - - // read from the stream - n = readline(sockfd, line, MAXLINE); - - if (0 == n) - return; // connection is terminated - else if (n < 0) - //err_dump("str_echo: readline error"); - exit(-1); - - // write back to the stream - if (4 != writen(sockfd, "ack\n", 4)) - //err_dump("str_echo: writen error"); - exit(-1); - } -} - - - - -// Read command lines from the socket and execute them - -void str_run(int sockfd) { - - int n; - char line[MAXLINE]; - char* error_msg = "str_run: No command interpreter found\n"; - char* ack_msg = "Running command... "; - char* commit_msg = "Command executed!\n"; - - while(true) { - - // read from the stream - n = readline(sockfd, line, MAXLINE); - - if (0 == n) - return; // connection is terminated - else if (n < 0) - //err_dump("str_echo: readline error"); - exit(-1); - - if (system(NULL)) { - writen(sockfd, ack_msg, strlen(ack_msg)); - system(line); - writen(sockfd, commit_msg, strlen(commit_msg)); - } - else if (strlen(error_msg) != writen(sockfd, error_msg, strlen(error_msg))) - //err_dump("str_echo: writen error"); - exit(-1); - } -} - - -// take a filename and copy it from Ceph to a local directory - -void str_copytolocal(int sockfd) { - - int n; - char line[MAXLINE]; - char* error_msg = "str_copy: No command interpreter found\n"; - char* ack_msg = "Running command... 
"; - char* commit_msg = "Command executed!\n"; - char* temp_dir = "/tmp"; - - - while(true) { - - // read from the stream - n = readline(sockfd, line, MAXLINE); - - if (0 == n) - return; // connection is terminated - else if (n < 0) - //err_dump("str_echo: readline error"); - exit(-1); - - if (system(NULL)) { - writen(sockfd, ack_msg, strlen(ack_msg)); - system(line); - writen(sockfd, commit_msg, strlen(commit_msg)); - } - else if (strlen(error_msg) != writen(sockfd, error_msg, strlen(error_msg))) - //err_dump("str_echo: writen error"); - exit(-1); - } -} - - - diff --git a/branches/sage/mds/active/activetaskd.h b/branches/sage/mds/active/activetaskd.h deleted file mode 100644 index fc5cec923c4bc..0000000000000 --- a/branches/sage/mds/active/activetaskd.h +++ /dev/null @@ -1,14 +0,0 @@ -#include "inet.h" -#include "common.h" -#include "utility.h" -#include "client/Client.h" - - -// The port number is "osdd" on a telephone keypad. -#define SERV_TCP_PORT 6733 - -#define MAXLINE 512 - -void str_echo(int sockfd); -void str_ack(int sockfd); -void str_run(int sockfd); diff --git a/branches/sage/mds/active/client_init.cc b/branches/sage/mds/active/client_init.cc deleted file mode 100644 index 8b137891791fe..0000000000000 --- a/branches/sage/mds/active/client_init.cc +++ /dev/null @@ -1 +0,0 @@ - diff --git a/branches/sage/mds/active/client_init.h b/branches/sage/mds/active/client_init.h deleted file mode 100644 index 139597f9cb07c..0000000000000 --- a/branches/sage/mds/active/client_init.h +++ /dev/null @@ -1,2 +0,0 @@ - - diff --git a/branches/sage/mds/active/common.h b/branches/sage/mds/active/common.h deleted file mode 100644 index bf2c73ca4052a..0000000000000 --- a/branches/sage/mds/active/common.h +++ /dev/null @@ -1,94 +0,0 @@ -#ifndef COMMON_H -#define COMMON_H - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// a bunch of string constants -// for commands - - - -#define CMDLENGTH 10 -#define CMDCOUNT 11 - 
-#define MAX_STRING_SIZE 255 - -/* - * These are the various messages that can be sent between the master - * and slave. The slave sends one reply to each message from the master. - - * PING/PINGREPLY: just what it sounds like. - - * STARTTASK: starts a task. Needs to be reworked to allow code - * shipping. The slave attempts to perform the task, and replies with - * FINISHEDTASK or TASKFAILED. - * - * RETRIEVELOCALFILE: requests a file that the slave has stored - * locally. Slave replies with SENDLOCALFILE and the file, or with - * LOCALFILENOTFOUND. - * - * SHIPCODE: sends a shared library to the slave, containing a - * function that is to be executed later by the STARTTASK - * command. Slave replies with CODESAVED or SHIPFAILED. - * - */ - - -const off_t CHUNK = 1024 * 1024 * 4; - -#define PING 0 -#define STARTTASK 1 -#define RETRIEVELOCALFILE 2 -#define PINGREPLY 3 -#define FINISHEDTASK 4 -#define TASKFAILED 5 -#define SENDLOCALFILE 6 -#define LOCALFILENOTFOUND 7 -#define SHIPCODE 8 -#define CODESAVED 9 -#define SHIPFAILED 10 - - -#define FOOTER_LENGTH 7 - -const char* CMD_LIST[CMDCOUNT] = {"______PING", - "START_TASK", - "_GET_LOCAL", - "PING_REPLY", - "_TASK_DONE", - "TASKFAILED", - "SEND_LOCAL", - "LOCAL_GONE", - "_SHIP_CODE", - "CODE_SAVED", - "SHIPFAILED"}; - -const char FOOTER[FOOTER_LENGTH + 1] = "MSG_END"; - - -// const char* strArray[] = {"string1", "string2", "string3"}; -//const char commands[2][4] = {"foo", "bar"}; - - - -// error codes -#define ARGUMENTSINVALID 1001 -#define CEPHCLIENTSTARTUPFAILED 1002 -#define INPUTFILEREADFAILED 1003 - - -// const char* name = "Njal"; - - - -#endif //COMMON_H diff --git a/branches/sage/mds/active/echotestclient.cc b/branches/sage/mds/active/echotestclient.cc deleted file mode 100644 index 2b2d15e7ca5cb..0000000000000 --- a/branches/sage/mds/active/echotestclient.cc +++ /dev/null @@ -1,74 +0,0 @@ -/* - * This is merely a test of an echo server; it's an early step in - * building up the Ceph distributed compute 
service. This is - * discardable once the next stage is up and running. - * - * Code is based off examples in Stevens' "Unix Network Programming". - */ - -#include "echotestclient.h" - -int main(int argc, char* argv[]) { - - int sockfd; - struct sockaddr_in serv_addr; - - char* pname = argv[0]; - - bzero((char *) &serv_addr, sizeof(serv_addr)); - serv_addr.sin_family = AF_INET; - serv_addr.sin_addr.s_addr = inet_addr(SERV_HOST_ADDR); - serv_addr.sin_port = htons(SERV_TCP_PORT); - - - // open a TCP socket - if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { - printf("client: can't open stream socket"); - exit (-1); - } - - // connect to the server. - if (connect(sockfd, (struct sockaddr *) &serv_addr, - sizeof(serv_addr)) < 0) { - printf("client: can't connect to server"); - exit (-1); - } - - // start the test echoer - str_cli(stdin, sockfd); - - - close (sockfd); - exit(0); -} - - -void str_cli(FILE *fp, int sockfd) { - - int n; - char sendline[MAXLINE], recvline[MAXLINE + 1]; - - // read from the fp and write to the socket; - // then read from the socket and write to stdout - while (fgets(sendline, MAXLINE, fp) != NULL) { - - n = strlen(sendline); - if (writen(sockfd, sendline, n) != n) { - printf("str_cli: writen error on socket"); - exit(-1); - } - n = readline(sockfd, recvline, MAXLINE); - if (n < 0) { - printf("str_cli: readline error"); - exit(-1); - } - recvline[n] = 0; - fputs(recvline, stdout); - } - - if (ferror(fp)) { - printf("str_cli: error reading file"); - exit(-1); - } - -} diff --git a/branches/sage/mds/active/echotestclient.h b/branches/sage/mds/active/echotestclient.h deleted file mode 100644 index 9b26416640bc2..0000000000000 --- a/branches/sage/mds/active/echotestclient.h +++ /dev/null @@ -1,10 +0,0 @@ -#include "inet.h" -#include "common.h" -#include "socket_utility.h" - -#define SERV_HOST_ADDR "128.114.57.143" //issdm-8 -#define SERV_TCP_PORT 6733 -#define MAXLINE 512 - -void str_cli(FILE *fp, int sockfd); - diff --git 
a/branches/sage/mds/active/inet.h b/branches/sage/mds/active/inet.h deleted file mode 100644 index 385fa915f9dc7..0000000000000 --- a/branches/sage/mds/active/inet.h +++ /dev/null @@ -1,9 +0,0 @@ -/* - * Generic TCP/IP definitions - */ - -#include -#include -#include -#include -#include diff --git a/branches/sage/mds/active/msgtestclient.cc b/branches/sage/mds/active/msgtestclient.cc deleted file mode 100644 index 53650e730b387..0000000000000 --- a/branches/sage/mds/active/msgtestclient.cc +++ /dev/null @@ -1,418 +0,0 @@ -/* - * This test client tests the sending of message headers to the slave. - * - * Code is based off examples in Stevens' "Unix Network Programming". - */ -#include "msgtestclient.h" -#define REQUIRED_ARGS 2 - -int main(int argc, char* argv[]) { - - - // make sure we have all the arguments we need - if (argc < REQUIRED_ARGS) { usage(argv[0]); exit(-1); } - - // This file is rewired for running tests from a - // shell script. The first parameter specifies the - // name of the Ceph file that the test will be - // run on; the second parameter specifies which of - // four different tests will be run. - const char* input_filename = argv[1]; - int test_number = atoi(argv[2]); - assert (test_number > 0); - assert (test_number < 4); - - //const char* map_command = argv[2]; - // These two variables aren't really used yet. - const char* map_command = "map_foo"; - const char* output_filename = "out_foo"; - //const char* output_filename = argv[3]; - //const char* reduce_command = argv[4]; // not implemented yet - - // start up a Ceph client - Client* client = startCephClient(); - - // open the input file as read_only - int fh = client->open(input_filename, O_RDONLY); - if (fh < 0) { - cerr << "The input file " << input_filename << " could not be opened." << endl; - exit(-1); - } - - // How big is the file? 
- off_t filesize; - struct stat stbuf; - if (0 > client->lstat(input_filename, &stbuf)) { - cerr << "Error: could not retrieve size of input file " << input_filename << endl; - exit(-1); - } - filesize = stbuf.st_size; - if (filesize < 1) { - cerr << "Error: input file size is " << filesize << endl; - exit(-1); - } - - // retrieve all the object extents and close the file - list extents; - off_t offset = 0; - client->enumerate_layout(fh, extents, filesize, offset); - client->close(fh); - - list::iterator i; - map::iterator j; - int osd; - int taskID = 0; - - // Pull out all the extents, and make a vector of - // (ip_address, start, length). - - vector original_splits; - - for (i = extents.begin(); i != extents.end(); i++) { - - request_split split; - // find the primary and get its IP address - osd = client->osdmap->get_pg_primary(i->layout.pgid); - entity_inst_t inst = client->osdmap->get_inst(osd); - entity_addr_t entity_addr = inst.addr; - entity_addr.make_addr(split.ip_address); - - // iterate through each buffer_extent in the ObjectExtent - for (j = i->buffer_extents.begin(); - j != i->buffer_extents.end(); j++) { - - // get the range of the buffer_extent - split.start = (*j).first; - split.length = (*j).second; - // throw the split onto the vector - original_splits.push_back(split); - } - } - - // close the client - we're done with it - kill_client(client); - - // sanity check: display the splits - cerr << "Listing original splits:" << endl; - for (vector::iterator i = original_splits.begin(); - i != original_splits.end(); i++) { - cerr << "Split: IP " << i->ip_address << ", start " - << i->start << ", length " << i->length << endl; - } - - vector test_splits; - // Now, modify the splits as needed for the test type. - // There are three types of tests. - // Test 1: regular test. - // Test 2: put all the tasks on the "wrong" OSD. - // Test 3: do the entire job off one node. - - if (1 == test_number) { - cerr << "Test type 1: using original splits." 
<< endl; - test_splits = original_splits; - } - else if (2 == test_number) { - cerr << "Test type 2: rotating split IP addresses. " << endl; - int split_count = original_splits.size(); - for (int i = 0; i < split_count; ++i) { - request_split s; - s.start = original_splits.at(i).start; - s.length = original_splits.at(i).length; - s.ip_address = original_splits.at((i+1)%split_count).ip_address; - test_splits.push_back(s); - } - } - else if (3 == test_number) { - cerr << "Test type 3: one giant split." << endl; - request_split s; - s.start = 0; - s.length = filesize; - s.ip_address = original_splits.at(0).ip_address; - test_splits.push_back(s); - } - else { - cerr << "Error: received invalid test type " << test_number << endl; - exit(-1); - } - - cerr << "Listing test splits:" << endl; - for (vector::iterator i = test_splits.begin(); - i != test_splits.end(); i++) { - cerr << "Split: IP " << i->ip_address << ", start " - << i->start << ", length " << i->length << endl; - } - - // start the timer - utime_t start_time = g_clock.now(); - int pending_tasks = 0; - - // start up the tasks - for (vector::iterator i = test_splits.begin(); - i != test_splits.end(); i++) { - start_map_task(i->ip_address, taskID++, map_command, input_filename, - i->start, i->length, output_filename); - ++pending_tasks; - } - - cerr << "Waiting for " << pending_tasks << " tasks to return..." << endl; - - // wait for all the tasks to finish - while (pending_tasks > 0) { - int exit_status; - cerr << "Waiting for " << pending_tasks << " tasks to return..." << endl; - pid_t pid = wait(&exit_status); - if (pid < 0) { - cerr << "ERROR on wait(): result was " << pid << endl; - exit(-1); - } - --pending_tasks; - if (WIFEXITED(exit_status)) { - cerr << "Task with pid " << pid << " returned with exit status " << - WEXITSTATUS(exit_status) << endl; - } - else { cerr << "WARNING: Task with pid " << pid << " exited abnormally" << endl; } - } - - cerr << "All tasks have returned." 
<< endl; - // report the time - double elapsed_time; - elapsed_time = (g_clock.now() - start_time); - cerr << "Elapsed time: " << elapsed_time << endl; - cerr << elapsed_time << " " << endl; - // send the time to stdout for the shell script - cout << elapsed_time << " "; - exit(0); -} - - -// sends a complete ping message -// through the file descriptor -// and waits for a reply. This -// will hang if there's no reply. - -void ping_test(int fd) { - - // send the message header and footer. - // A ping message has no body. - send_msg_header(fd, PING); - send_msg_footer(fd); - - // receive the reply. - int msg_type = readmsgtype(fd); - if (msg_type < 0) { - cerr << "ping_test: Failed reading the ping reply. Exiting." << endl; - exit(-1); - } - if (PINGREPLY != msg_type) { - assert((msg_type <= 0) && (msg_type < CMDCOUNT) && - "readmsgtype return value out of range"); - cerr << "ping_test: slave sent invalid reply: replied to ping with message type" << - msg_type << ": " << CMD_LIST[msg_type] << ". Exiting. " << endl; - exit(-1); - } - else { - cerr << "Received valid ping reply!" << endl; - } - - if(!check_footer(fd)) { - cerr << "ping_test: message footer not found. Exiting." << endl; - exit(-1); - } -} - - - - -// send a test message for starting a task -void start_task_test(int fd) { - - // The test: - // TaskID 42 - // command: "Burninate" - // input file: "countryside" - // offset: 8764 (TROG) - // length: 367 (DOR) - - send_start_task_msg(fd, 42, strlen("Burninate"), "Burninate", - strlen("countryside"), "countryside", - 8764, 367, - strlen("toast"), "toast"); -} - - -// sends a message to the fd telling it to start a task. -// Remember: the message format requires any string to be -// prefixed by its (unterminated) length. 
-void send_start_task_msg(int fd, - int taskID, - int command_size, const char* command, - int inputfilenamesize, const char* inputfilename, - off_t offset, - off_t length, - int outputfilenamesize, const char* outputfilename) { - - // write the header and the message to the file descriptor. - - send_msg_header(fd, STARTTASK); - - write_positive_int(fd, taskID); - write_positive_int(fd, command_size); - write_string(fd, command); - write_positive_int(fd, inputfilenamesize); - write_string(fd, inputfilename); - //write_long(fd, offset); - write_off_t (fd, offset); - //write_long(fd, length); - write_off_t (fd, length); - write_positive_int(fd, outputfilenamesize); - write_string(fd, outputfilename); - - // terminate the message - send_msg_footer(fd); -} - - - - -// creates a new connection to the slave -// at the given IP address and port. -// Overloaded to take an IP address as a -// string or as an in_addr_t. - -int create_new_connection(const char* ip_address, uint16_t port) -{ - in_addr_t ip = inet_addr(ip_address); - if ((in_addr_t)-1 == ip) { - cerr << "Error creating new connection: \"" << ip_address << - "\" is not a valid IP address." << endl; - return -1; - } - else - //cerr << "Opening connection to " << ip_address << ":" << endl; - return create_new_connection(ip, port); -} - - -int create_new_connection(in_addr_t ip_address, uint16_t port) { - - struct sockaddr_in serv_addr; - int sockfd; - - bzero((char *) &serv_addr, sizeof(serv_addr)); - serv_addr.sin_family = AF_INET; - //serv_addr.sin_addr.s_addr = inet_addr(SERV_HOST_ADDR); - serv_addr.sin_addr.s_addr = ip_address; - serv_addr.sin_port = htons(SERV_TCP_PORT); - - // open a TCP socket - if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { - cerr << "msgtestclient: can't open stream socket. Exiting." << endl; - exit (-1); - } - - // connect to the server. - if (connect(sockfd, (struct sockaddr *) &serv_addr, - sizeof(serv_addr)) < 0) { - cerr << "msgtestclient: can't connect to server." 
<< endl; - exit (-1); - } - //cerr << "opened connection!" << endl; - return sockfd; -} - -void msg_type_sender(int sockfd) { - - for (int i = 0; i < CMDCOUNT; ++i) { - send_msg_header(sockfd, i); - } - -} - -// Fires up the map task. -// For the moment, all it does is echo the command line, not run it. -int start_map_task(sockaddr_in ip_address, int taskID, - const char* command, const char* input_filename, - off_t start, off_t length, - const char* output_filename) -{ - int childpid; - // fork off a child process to do the work, and return - if ((childpid = fork()) < 0) { - cerr << "start_map_task: fork error. Exiting." << endl; - exit(-1); - } - - else if (childpid != 0) { // parent - cerr << "start_map_task: forked child process " - << childpid << " to start task. " << endl; - return 0; - } - - - string ip_addr_string(inet_ntoa(ip_address.sin_addr)); - // cerr << "command: " << ip_addr_string << " taskID " - // << taskID << ": " << command - // << " " << input_filename << " " << start << " " << length - // << " " << output_filename << endl; - - // open a socket to the slave, and send the message - //cerr << "Sending message: " << endl; - int sockfd = create_new_connection(ip_addr_string.c_str(), SERV_TCP_PORT); - send_start_task_msg(sockfd, taskID, strlen(command), command, - strlen(input_filename), input_filename, - start, length, - strlen(output_filename), output_filename); - - // wait for a reply - cerr << "Sent message for taskID " << taskID << ". Waiting for reply..." << endl; - - // receive the reply. - int msg_type = readmsgtype(sockfd); - if (msg_type < 0) { - cerr << "start_map_task: Failed reading the reply. Exiting." << endl; - exit(-1); - } - if (FINISHEDTASK != msg_type) { - assert((msg_type <= 0) && (msg_type < CMDCOUNT)); - cerr << "start_map_task: slave sent invalid reply: replied with message type" << - msg_type << ": " << CMD_LIST[msg_type] << ". Exiting. 
" << endl; - exit(-1); - } - // read the taskID of the reply - - int reply_taskID = read_positive_int(sockfd); - - if(!check_footer(sockfd)) { - cerr << "ping_test: message footer not found. Exiting." << endl; - exit(-1); - } - - // done! - close(sockfd); - cerr << "Task " << taskID << "/" << reply_taskID << - " complete! Ending child process." << endl; - exit(0); - //_exit(0); - cerr << "exit(0) returned. Strange things are afoot." << endl; -} - - - - -void usage(const char* name) { - //cout << "usage: " << name << " inputfile map_task outputfile" << endl; - //cout << "inputfile must be a valid path in the running Ceph filesystem." << endl; - //cout << "map_task should be given with an absolute path, and be present on "; - //cout << "the REGULAR filesystem every node." << endl; - //cout << "output_file will be written locally to the node." << endl; - - cout << "usage: " << name << " inputfile test_number" << endl; - cout << "inputfile must be a valid path in the running Ceph filesystem." << endl; - cout << "test_number must be 1, 2, or 3." 
<< endl; - cout << " 1: run the test task normally (one slave per OSD)" << endl; - cout << " 2: run the test task on the \"wrong\" OSDs" << endl; - cout << " 3: run the entire task in a single process" << endl; -} - - - diff --git a/branches/sage/mds/active/msgtestclient.h b/branches/sage/mds/active/msgtestclient.h deleted file mode 100644 index 568c9057be250..0000000000000 --- a/branches/sage/mds/active/msgtestclient.h +++ /dev/null @@ -1,44 +0,0 @@ -#include "inet.h" -#include "common.h" -#include "utility.h" -#include "client/Client.h" - -// wait.h MUST NOT be #included before client/Client.h -#include -#include - - struct request_split { - tcpaddr_t ip_address; - off_t start; - off_t length; - }; - - -//#define SERV_HOST_ADDR "128.114.57.143" //issdm-8 -#define SERV_HOST_ADDR "128.114.57.166" //issdm-31 - -#define SERV_TCP_PORT 6733 -#define MAXLINE 512 - -void msg_type_sender(int sockfd); - - -int create_new_connection(const char* ip_address, uint16_t port); -int create_new_connection(in_addr_t ip_address, uint16_t port); -void usage(const char* name); -void ping_test(int fd); -void start_task_test(int fd); - -int start_map_task(sockaddr_in ip_address, int taskID, - const char* map_command, - const char* input_filename, - off_t start, off_t length, - const char* output_filename); - -void send_start_task_msg(int fd, - int taskID, - int command_size, const char* command, - int inputfilenamesize, const char* inputfilename, - off_t offset, - off_t length, - int outputfilenamesize, const char* outputfilename); diff --git a/branches/sage/mds/active/trivial_task.cc b/branches/sage/mds/active/trivial_task.cc deleted file mode 100644 index 7a72ecb277c4b..0000000000000 --- a/branches/sage/mds/active/trivial_task.cc +++ /dev/null @@ -1,50 +0,0 @@ -#include "trivial_task.h" - -void start_trivial_task (const char* ceph_filename, const char* local_filename, - off_t offset, off_t length) { - // Don't bother to copy the file to disk. 
Read the file directly from Ceph, - // and add up all the bytes. - // Write the total to the local file as a string. - Client * client = startCephClient(); - - bufferptr bp(CHUNK); - - // get the source file's size. Sanity-check the request range. - struct stat st; - int r = client->lstat(ceph_filename, &st); - assert (r == 0); - - off_t src_total_size = st.st_size; - if (src_total_size < offset + length) { - cerr << "Error in copy ExtentToLocalFile: offset + length = " << offset << " + " << length - << " = " + (offset + length) << ", source file size is only " << src_total_size << endl; - exit(-1); - } - off_t remaining = length; - - // open the file and seek to the start position - cerr << "start_trivial_task: opening the source file and seeking " << endl; - - int fh_ceph = client->open(ceph_filename, O_RDONLY); - assert (fh_ceph > -1); - r = client->lseek(fh_ceph, offset, SEEK_SET); - assert (r == offset); - - int counter = 0; - // read through the extent and add up the bytes - cerr << "start_trivial_task: counting up bytes" << endl; - char* bp_c = bp.c_str(); - while (remaining > 0) { - off_t got = client->read(fh_ceph, bp_c, MIN(remaining,CHUNK), -1); - assert(got > 0); - remaining -= got; - for (off_t i = 0; i < got; ++i) { - counter += (unsigned int)(bp_c[i]); - } - } - cerr << "start_trivial_task: Done! 
Answer is " << counter << endl; - client->close(fh_ceph); - - //assert(0); -} - diff --git a/branches/sage/mds/active/trivial_task.h b/branches/sage/mds/active/trivial_task.h deleted file mode 100644 index ce9b47c82ceb6..0000000000000 --- a/branches/sage/mds/active/trivial_task.h +++ /dev/null @@ -1,12 +0,0 @@ -// Shared library for the trivial task of adding up all the bytes in a file - -//#include "inet.h" -#include "common.h" -#include "utility.h" -#include "client/Client.h" - - -extern "C" void start_trivial_task (const char* ceph_filename, - const char* local_filename, - off_t offset, off_t length); - diff --git a/branches/sage/mds/active/utility.h b/branches/sage/mds/active/utility.h deleted file mode 100644 index 789398c0f4527..0000000000000 --- a/branches/sage/mds/active/utility.h +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Miscellaneous Active OSD helper functions. - * - */ - -//#include -#include "client/Client.h" -#include "common.h" -#include "config.h" -#include "common/Timer.h" -#include "msg/SimpleMessenger.h" -#include "socket_utility.h" - -Client* startCephClient(); -void kill_client(Client* client); - - -int send_msg_header(int fd, int header_ID); -int readmsgtype(int fd); -bool check_footer(int fd); -int send_msg_header(int fd, int header_ID); -int send_msg_footer(int fd); - -/* - * Fires up a Ceph client and returns a pointer to it. 
- */ - -Client* startCephClient() -{ - cout << "ActiveMaster: Initializing Ceph client:" << endl; - - // parse args from CEPH_ARGS, not command line - vector args; - env_to_vec(args); - parse_config_options(args); - - if (g_conf.clock_tare) g_clock.tare(); - - // be safe - g_conf.use_abspaths = true; - - // load monmap - MonMap* monmap = new MonMap(); - int r = monmap->read(".ceph_monmap"); - if (r < 0) { - cout << "ActiveMaster: could not find .ceph_monmap" << endl; - return 0; - } - assert(r >= 0); - - // start up network - rank.start_rank(); - - // start client - Client *client; - client = new Client(rank.register_entity(MSG_ADDR_CLIENT_NEW), monmap); - client->init(); - - // mount - client->mount(); - - return client; -} - -void kill_client (Client * client) -{ - client->unmount(); - client->shutdown(); - delete client; - - // wait for messenger to finish - rank.wait(); -} - - - -// read a message type from the socket, and print it. - -int readmsgtype(int fd) { - int rc; - char typebuf[CMDLENGTH + 1]; - - rc = read(fd, &typebuf, CMDLENGTH); - - // read a fixed-length text command - if (rc != CMDLENGTH) { - cerr << "in readmsgtype: read error: result is " << rc << endl; - return -1; - } - - // null-terminate the string - typebuf[CMDLENGTH] = 0; - - // print the command - //cerr << "readmsgtype: text type is " << typebuf << ", " ; - - // figure out which one it is, by number - for (int i = 0; i < CMDCOUNT; ++i) { - if (!strcmp(typebuf, CMD_LIST[i])) { - //cerr << "which is identified as type " << i << endl; - return i; - } - } - - // if we get here the type was invalid - cerr << "readmsgtype: unrecognized message type " << typebuf << endl; - return -1; -} - -// Attempt to read the message footer off -// the given stream. 
-bool check_footer(int fd) { - - // leave space for null termination - char footer_buf[FOOTER_LENGTH+1]; - - // read the footer - int rc = read(fd, &footer_buf, FOOTER_LENGTH); - if (rc != FOOTER_LENGTH) { - cerr << "in check_footer: read error: result is " << rc << endl; - return false; - } - - // null-terminate the string - footer_buf[FOOTER_LENGTH] = 0; - - // Is the footer correct? - if (0 == strcmp(footer_buf, FOOTER)) - return true; - else - return false; -} - - -// send a fixed-length message header -// given the header's ID. -int send_msg_header(int fd, int header_ID) { - if ((header_ID < 0) || (header_ID >= CMDCOUNT)) { - cerr << "In send_msg_header: received out-of-range header ID " << header_ID << - ". Exiting process." << endl; - exit(-1); - } - - //cerr << "attempting to send message " << CMD_LIST[header_ID] << - // " with ID " << header_ID << endl; - - if (CMDLENGTH != writen(fd, CMD_LIST[header_ID], CMDLENGTH)) { - cerr << "In send_msg_header: error writing header ID " << header_ID << - "to file descriptor " << fd << ". Exiting process." << endl; - exit(-1); - } - - return 0; -} - -// send the fixed-length message footer. -int send_msg_footer(int fd) { - //cerr << "attempting to send message footer: " << endl; - if (FOOTER_LENGTH != writen(fd, FOOTER, FOOTER_LENGTH)) { - cerr << "in send_msg_footer: error writing footer to file descriptor " << - fd << ". Exiting process." << endl; - exit(-1); - } else { - //cerr << "Sent message footer!" << endl; - } - return 0; -} - - - -// Copy a given extent of a Ceph file to the local disk. -// Requires a running Ceph client. -void copyExtentToLocalFile (Client* client, const char* ceph_source, - long offset, long length, - const char* local_destination) { - - // get the source file's size. Sanity-check the request range. 
- struct stat st; - int r = client->lstat(ceph_source, &st); - assert (r == 0); - - off_t src_total_size = st.st_size; - if (src_total_size < offset + length) { - cerr << "Error in copy ExtentToLocalFile: offset + size = " << offset << " + " << length - << " = " + (offset + length) << ", source file size is only " << src_total_size << endl; - exit(-1); - } - off_t remaining = length; - - // open the source and destination files. Advance the source - // file to the desired offset. - int fh_ceph = client->open(ceph_source, O_RDONLY); - assert (fh_ceph > -1); - r = client->lseek(fh_ceph, offset, SEEK_SET); - assert (r == offset); - - int fh_local = ::open(local_destination, O_WRONLY|O_CREAT|O_TRUNC, 0644); - assert (fh_local > -1); - - // copy the file 4 MB at a time - const int chunk = 4*1024*1024; - bufferptr bp(chunk); - - while (remaining > 0) { - off_t got = client->read(fh_ceph, bp.c_str(), MIN(remaining,chunk), -1); - assert(got > 0); - remaining -= got; - off_t wrote = ::write(fh_local, bp.c_str(), got); - assert (got == wrote); - } - // close the files - client->close(fh_ceph); - ::close(fh_local); -} diff --git a/branches/sage/mds/cfuse.cc b/branches/sage/mds/cfuse.cc deleted file mode 100644 index 3c157fefadf89..0000000000000 --- a/branches/sage/mds/cfuse.cc +++ /dev/null @@ -1,88 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "client/Client.h" -#include "client/fuse.h" -#include "client/fuse_ll.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - -#ifndef DARWIN -#include -#endif // DARWIN - -#include -#include -#include - -int main(int argc, char **argv, char *envp[]) { - - //cerr << "cfuse starting " << myrank << "/" << world << std::endl; - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - // args for fuse - vec_to_argv(args, argc, argv); - - // FUSE will chdir("/"); be ready. - g_conf.use_abspaths = true; - - if (g_conf.clock_tare) g_clock.tare(); - - // load monmap - MonMap monmap; - int r = monmap.read(".ceph_monmap"); - assert(r >= 0); - - // start up network - rank.start_rank(); - - // start client - Client *client = new Client(rank.register_entity(entity_name_t::CLIENT()), &monmap); - client->init(); - - // start up fuse - // use my argc, argv (make sure you pass a mount point!) 
- cout << "mounting" << std::endl; - client->mount(); - - //cerr << "starting fuse on pid " << getpid() << std::endl; - if (g_conf.fuse_ll) - ceph_fuse_ll_main(client, argc, argv); - else - ceph_fuse_main(client, argc, argv); - //cerr << "fuse finished on pid " << getpid() << std::endl; - - client->unmount(); - cout << "unmounted" << std::endl; - client->shutdown(); - - delete client; - - // wait for messenger to finish - rank.wait(); - - return 0; -} - diff --git a/branches/sage/mds/client/Client.cc b/branches/sage/mds/client/Client.cc deleted file mode 100644 index 08f431bf84835..0000000000000 --- a/branches/sage/mds/client/Client.cc +++ /dev/null @@ -1,3923 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -// unix-ey fs stuff -#include -#include -#include -#include -#include -#include - -#include - - -#include -using namespace std; - - -// ceph stuff -#include "Client.h" - -#include "messages/MStatfs.h" -#include "messages/MStatfsReply.h" - -#include "messages/MClientMount.h" -#include "messages/MClientUnmount.h" -#include "messages/MClientSession.h" -#include "messages/MClientReconnect.h" -#include "messages/MClientRequest.h" -#include "messages/MClientRequestForward.h" -#include "messages/MClientReply.h" -#include "messages/MClientFileCaps.h" - -#include "messages/MGenericMessage.h" - -#include "messages/MMDSGetMap.h" -#include "messages/MMDSMap.h" - -#include "osdc/Filer.h" -#include "osdc/Objecter.h" -#include "osdc/ObjectCacher.h" - -#include "common/Cond.h" -#include "common/Mutex.h" -#include "common/Logger.h" - - - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_client) *_dout << dbeginl << g_clock.now() << " client" << whoami /*<< "." << pthread_self() */ << " " - -#define tout if (g_conf.client_trace) traceout - - -// static logger -Mutex client_logger_lock; -LogType client_logtype; -Logger *client_logger = 0; - - - - -class C_Client_CloseRelease : public Context { - Client *cl; - Inode *in; -public: - C_Client_CloseRelease(Client *c, Inode *i) : cl(c), in(i) {} - void finish(int) { - cl->close_release(in); - } -}; - -class C_Client_CloseSafe : public Context { - Client *cl; - Inode *in; -public: - C_Client_CloseSafe(Client *c, Inode *i) : cl(c), in(i) {} - void finish(int) { - cl->close_safe(in); - } -}; - - - - - - -// cons/des - -Client::Client(Messenger *m, MonMap *mm, int in) : timer(client_lock) -{ - // which client am i? 
- whoami = m->get_myname().num(); - my_instance = in; - monmap = mm; - - mounted = false; - mount_timeout_event = 0; - unmounting = false; - - last_tid = 0; - unsafe_sync_write = 0; - - mdsmap = 0; - - // - root = 0; - - lru.lru_set_max(g_conf.client_cache_size); - - // file handles - free_fd_set.insert(10, 1<<30); - - // set up messengers - messenger = m; - - // osd interfaces - osdmap = new OSDMap(); // initially blank.. see mount() - objecter = new Objecter(messenger, monmap, osdmap, client_lock); - objecter->set_client_incarnation(0); // client always 0, for now. - objectcacher = new ObjectCacher(objecter, client_lock); - filer = new Filer(objecter); -} - - -Client::~Client() -{ - tear_down_cache(); - - if (objectcacher) { - delete objectcacher; - objectcacher = 0; - } - - if (filer) { delete filer; filer = 0; } - if (objecter) { delete objecter; objecter = 0; } - if (osdmap) { delete osdmap; osdmap = 0; } - if (mdsmap) { delete mdsmap; mdsmap = 0; } - - if (messenger) { delete messenger; messenger = 0; } -} - - -void Client::tear_down_cache() -{ - // fd's - for (hash_map::iterator it = fd_map.begin(); - it != fd_map.end(); - it++) { - Fh *fh = it->second; - dout(1) << "tear_down_cache forcing close of fh " << it->first << " ino " << fh->inode->inode.ino << dendl; - put_inode(fh->inode); - delete fh; - } - fd_map.clear(); - - // caps! 
- // *** FIXME *** - - // empty lru - lru.lru_set_max(0); - trim_cache(); - assert(lru.lru_get_size() == 0); - - // close root ino - assert(inode_map.size() <= 1); - if (root && inode_map.size() == 1) { - delete root; - root = 0; - inode_map.clear(); - } - - assert(inode_map.empty()); -} - - - -// debug crapola - -void Client::dump_inode(Inode *in, set& did) -{ - dout(1) << "dump_inode: inode " << in->ino() << " ref " << in->ref << " dir " << in->dir << dendl; - - if (in->dir) { - dout(1) << " dir size " << in->dir->dentries.size() << dendl; - //for (hash_map, eqstr>::iterator it = in->dir->dentries.begin(); - for (hash_map::iterator it = in->dir->dentries.begin(); - it != in->dir->dentries.end(); - it++) { - dout(1) << " dn " << it->first << " ref " << it->second->ref << dendl; - dump_inode(it->second->inode, did); - } - } -} - -void Client::dump_cache() -{ - set did; - - if (root) dump_inode(root, did); - - for (hash_map::iterator it = inode_map.begin(); - it != inode_map.end(); - it++) { - if (did.count(it->second)) continue; - - dout(1) << "dump_cache: inode " << it->first - << " ref " << it->second->ref - << " dir " << it->second->dir << dendl; - if (it->second->dir) { - dout(1) << " dir size " << it->second->dir->dentries.size() << dendl; - } - } - -} - - -void Client::init() -{ - - // logger? 
- client_logger_lock.Lock(); - if (client_logger == 0) { - client_logtype.add_inc("lsum"); - client_logtype.add_inc("lnum"); - client_logtype.add_inc("lwsum"); - client_logtype.add_inc("lwnum"); - client_logtype.add_inc("lrsum"); - client_logtype.add_inc("lrnum"); - client_logtype.add_inc("trsum"); - client_logtype.add_inc("trnum"); - client_logtype.add_inc("wrlsum"); - client_logtype.add_inc("wrlnum"); - client_logtype.add_inc("lstatsum"); - client_logtype.add_inc("lstatnum"); - client_logtype.add_inc("ldirsum"); - client_logtype.add_inc("ldirnum"); - client_logtype.add_inc("readdir"); - client_logtype.add_inc("stat"); - client_logtype.add_avg("owrlat"); - client_logtype.add_avg("ordlat"); - client_logtype.add_inc("owr"); - client_logtype.add_inc("ord"); - - char s[80]; - char hostname[80]; - gethostname(hostname, 79); - sprintf(s,"clients.%s.%d", hostname, getpid()); - client_logger = new Logger(s, &client_logtype); - } - client_logger_lock.Unlock(); - -} - -void Client::shutdown() -{ - dout(1) << "shutdown" << dendl; - messenger->shutdown(); -} - - - - -// =================== -// metadata cache stuff - -void Client::trim_cache() -{ - unsigned last = 0; - while (lru.lru_get_size() != last) { - last = lru.lru_get_size(); - - if (lru.lru_get_size() <= lru.lru_get_max()) break; - - // trim! - Dentry *dn = (Dentry*)lru.lru_expire(); - if (!dn) break; // done - - dout(15) << "trim_cache unlinking dn " << dn->name - << " in dir " << hex << dn->dir->parent_inode->inode.ino - << dendl; - unlink(dn); - } - - // hose root? - if (lru.lru_get_size() == 0 && root && root->ref == 0 && inode_map.size() == 1) { - dout(15) << "trim_cache trimmed root " << root << dendl; - delete root; - root = 0; - inode_map.clear(); - } -} - -/** insert_inode - * - * insert + link a single dentry + inode into the metadata cache. 
- */ -Inode* Client::insert_inode(Dir *dir, InodeStat *st, const string& dname) -{ - Dentry *dn = NULL; - if (dir->dentries.count(dname)) - dn = dir->dentries[dname]; - - dout(12) << "insert_inode " << dname << " ino " << st->inode.ino - << " size " << st->inode.size - << " mtime " << st->inode.mtime - << " mask " << st->mask - << " in dir " << dir->parent_inode->inode.ino - << dendl; - - if (dn) { - if (dn->inode->inode.ino == st->inode.ino) { - touch_dn(dn); - dout(12) << " had dentry " << dname - << " with correct ino " << dn->inode->inode.ino - << dendl; - } else { - dout(12) << " had dentry " << dname - << " with WRONG ino " << dn->inode->inode.ino - << dendl; - unlink(dn); - dn = NULL; - } - } - - if (!dn) { - // have inode linked elsewhere? -> unlink and relink! - if (inode_map.count(st->inode.ino)) { - Inode *in = inode_map[st->inode.ino]; - assert(in); - - if (in->dn) { - dout(12) << " had ino " << in->inode.ino - << " not linked or linked at the right position, relinking" - << dendl; - dn = relink(dir, dname, in); - } else { - // link - dout(12) << " had ino " << in->inode.ino - << " unlinked, linking" << dendl; - dn = link(dir, dname, in); - } - } - } - - if (!dn) { - Inode *in = new Inode(st->inode, objectcacher); - inode_map[st->inode.ino] = in; - dn = link(dir, dname, in); - dout(12) << " new dentry+node with ino " << st->inode.ino << dendl; - } else { - // actually update info - dout(12) << " stat inode mask is " << st->mask << dendl; - if (st->mask & STAT_MASK_BASE) { - dn->inode->inode = st->inode; - dn->inode->dirfragtree = st->dirfragtree; // FIXME look at the mask! - } - - // ...but don't clobber our mtime, size! - /* isn't this handled below? 
- if ((dn->inode->mask & STAT_MASK_SIZE) == 0 && - dn->inode->file_wr_size > dn->inode->inode.size) - dn->inode->inode.size = dn->inode->file_wr_size; - if ((dn->inode->mask & STAT_MASK_MTIME) == 0 && - dn->inode->file_wr_mtime > dn->inode->inode.mtime) - dn->inode->inode.mtime = dn->inode->file_wr_mtime; - */ - } - - // OK, we found it! - assert(dn && dn->inode); - - // save the mask - dn->inode->mask = st->mask; - - // or do we have newer size/mtime from writing? - if (dn->inode->file_caps() & CAP_FILE_WR) { - if (dn->inode->file_wr_size > dn->inode->inode.size) - dn->inode->inode.size = dn->inode->file_wr_size; - if (dn->inode->file_wr_mtime > dn->inode->inode.mtime) - dn->inode->inode.mtime = dn->inode->file_wr_mtime; - } - - // symlink? - if (dn->inode->inode.is_symlink()) { - if (!dn->inode->symlink) - dn->inode->symlink = new string; - *(dn->inode->symlink) = st->symlink; - } - - return dn->inode; -} - -/** update_inode_dist - * - * update MDS location cache for a single inode - */ -void Client::update_dir_dist(Inode *in, DirStat *dst) -{ - // auth - in->dir_auth = -1; - if (dst->frag == frag_t()) { - in->dir_auth = dst->auth; - } else { - dout(20) << "got dirfrag map for " << in->inode.ino << " frag " << dst->frag << " to mds " << dst->auth << dendl; - in->fragmap[dst->frag] = dst->auth; - } - - // replicated - in->dir_replicated = dst->is_rep; // FIXME that's just one frag! - - // dist - /* - if (!st->dirfrag_dist.empty()) { // FIXME - set dist = st->dirfrag_dist.begin()->second; - if (dist.empty() && !in->dir_contacts.empty()) - dout(9) << "lost dist spec for " << in->inode.ino - << " " << dist << dendl; - if (!dist.empty() && in->dir_contacts.empty()) - dout(9) << "got dist spec for " << in->inode.ino - << " " << dist << dendl; - in->dir_contacts = dist; - } - */ -} - - -/** insert_trace - * - * insert a trace from a MDS reply into the cache. 
- */ -Inode* Client::insert_trace(MClientReply *reply) -{ - Inode *cur = root; - utime_t now = g_clock.real_now(); - - dout(10) << "insert_trace got " << reply->get_trace_in().size() << " inodes" << dendl; - - list::const_iterator pdn = reply->get_trace_dn().begin(); - list::const_iterator pdir = reply->get_trace_dir().begin(); - - for (list::const_iterator pin = reply->get_trace_in().begin(); - pin != reply->get_trace_in().end(); - ++pin) { - - if (pin == reply->get_trace_in().begin()) { - // root - dout(10) << "insert_trace root" << dendl; - if (!root) { - // create - cur = root = new Inode((*pin)->inode, objectcacher); - dout(10) << "insert_trace new root is " << root << dendl; - inode_map[root->inode.ino] = root; - root->dir_auth = 0; - } - } else { - // not root. - Dir *dir = cur->open_dir(); - assert(pdn != reply->get_trace_dn().end()); - cur = this->insert_inode(dir, *pin, *pdn); - dout(10) << "insert_trace dn " << *pdn << " ino " << (*pin)->inode.ino << " -> " << cur << dendl; - ++pdn; - - // move to top of lru! 
- if (cur->dn) - lru.lru_touch(cur->dn); - } - - // set cache ttl - if (g_conf.client_cache_stat_ttl) { - cur->valid_until = now; - cur->valid_until += g_conf.client_cache_stat_ttl; - } - - // update dir dist info - if (pdir == reply->get_trace_dir().end()) break; - update_dir_dist(cur, *pdir); - ++pdir; - } - - return cur; -} - - - - -Dentry *Client::lookup(filepath& path) -{ - dout(14) << "lookup " << path << dendl; - - Inode *cur = root; - if (!cur) return NULL; - - Dentry *dn = 0; - for (unsigned i=0; iinode << " valid_until " << dn->inode->valid_until << dendl; - } else { - dout(14) << " dentry " << path[i] << " dne" << dendl; - return NULL; - } - cur = dn->inode; - assert(cur); - } else { - return NULL; // not a dir - } - } - - if (dn) { - dout(11) << "lookup '" << path << "' found " << dn->name << " inode " << dn->inode->inode.ino << " valid_until " << dn->inode->valid_until<< dendl; - } - - return dn; -} - -// ------- - -int Client::choose_target_mds(MClientRequest *req) -{ - int mds = 0; - - // find deepest known prefix - Inode *diri = root; // the deepest known containing dir - Inode *item = 0; // the actual item... if we know it - int missing_dn = -1; // which dn we miss on (if we miss) - - unsigned depth = req->get_filepath().depth(); - unsigned i; - for (i=0; iinode.is_dir() && diri->dir) { - Dir *dir = diri->dir; - - // do we have the next dentry? - if (dir->dentries.count( req->get_filepath()[i] ) == 0) { - missing_dn = i; // no. - break; - } - - dout(7) << " have path seg " << i << " on " << diri->dir_auth << " ino " << diri->inode.ino << " " << req->get_filepath()[i] << dendl; - - if (i == depth-1) { // last one! - item = dir->dentries[ req->get_filepath()[i] ]->inode; - break; - } - - // continue.. 
- diri = dir->dentries[ req->get_filepath()[i] ]->inode; - assert(diri); - } else { - missing_dn = i; - break; - } - } - - // pick mds - if (!diri || g_conf.client_use_random_mds) { - // no root info, pick a random MDS - mds = mdsmap->get_random_in_mds(); - dout(0) << "random mds" << mds << dendl; - if (mds < 0) mds = 0; - - if (0) { - mds = 0; - dout(0) << "hack: sending all requests to mds" << mds << dendl; - } - } else { - if (req->auth_is_best()) { - // pick the actual auth (as best we can) - if (item) { - mds = item->authority(); - } else { - mds = diri->authority(req->get_filepath()[missing_dn]); - } - } else { - // balance our traffic! - mds = diri->pick_replica(mdsmap); // for the _inode_ - dout(20) << "for " << req->get_filepath() << " diri " << diri->inode.ino << " rep " - << diri->dir_contacts - << " mds" << mds << dendl; - } - } - dout(20) << "mds is " << mds << dendl; - - return mds; -} - - - -MClientReply *Client::make_request(MClientRequest *req, - int use_mds) // this param is purely for debug hacking -{ - // time the call - utime_t start = g_clock.real_now(); - - bool nojournal = false; - int op = req->get_op(); - if (op == MDS_OP_STAT || - op == MDS_OP_LSTAT || - op == MDS_OP_READDIR || - op == MDS_OP_OPEN) - nojournal = true; - - - // -- request -- - // assign a unique tid - tid_t tid = ++last_tid; - req->set_tid(tid); - - if (!mds_requests.empty()) - req->set_oldest_client_tid(mds_requests.begin()->first); - else - req->set_oldest_client_tid(tid); // this one is the oldest. - - // make note - MetaRequest request(req, tid); - mds_requests[tid] = &request; - - // encode payload now, in case we have to resend (in case of mds failure) - req->encode_payload(); - request.request_payload = req->get_payload(); - - // note idempotency - request.idempotent = req->is_idempotent(); - - // hack target mds? 
- if (use_mds) - request.resend_mds = use_mds; - - // set up wait cond - Cond cond; - request.caller_cond = &cond; - - while (1) { - // choose mds - int mds; - // force use of a particular mds? - if (request.resend_mds >= 0) { - mds = request.resend_mds; - request.resend_mds = -1; - dout(10) << "target resend_mds specified as mds" << mds << dendl; - } else { - mds = choose_target_mds(req); - if (mds >= 0) { - dout(10) << "chose target mds" << mds << " based on hierarchy" << dendl; - } else { - mds = mdsmap->get_random_in_mds(); - if (mds < 0) mds = 0; // hrm. - dout(10) << "chose random target mds" << mds << " for lack of anything better" << dendl; - } - } - - // open a session? - if (mds_sessions.count(mds) == 0) { - Cond cond; - - if (!mdsmap->is_active(mds)) { - dout(10) << "no address for mds" << mds << ", requesting new mdsmap" << dendl; - int mon = monmap->pick_mon(); - messenger->send_message(new MMDSGetMap(mdsmap->get_epoch()), - monmap->get_inst(mon)); - waiting_for_mdsmap.push_back(&cond); - cond.Wait(client_lock); - - if (!mdsmap->is_active(mds)) { - dout(10) << "hmm, still have no address for mds" << mds << ", trying a random mds" << dendl; - request.resend_mds = mdsmap->get_random_in_mds(); - continue; - } - } - - if (waiting_for_session.count(mds) == 0) { - dout(10) << "opening session to mds" << mds << dendl; - messenger->send_message(new MClientSession(MClientSession::OP_REQUEST_OPEN), - mdsmap->get_inst(mds), MDS_PORT_SERVER); - } - - // wait - waiting_for_session[mds].push_back(&cond); - while (waiting_for_session.count(mds)) { - dout(10) << "waiting for session to mds" << mds << " to open" << dendl; - cond.Wait(client_lock); - } - } - - // send request. - send_request(&request, mds); - - // wait for signal - dout(20) << "awaiting kick on " << &cond << dendl; - cond.Wait(client_lock); - - // did we get a reply? - if (request.reply) - break; - } - - // got it! - MClientReply *reply = request.reply; - - // kick dispatcher (we've got it!) 
- assert(request.dispatch_cond); - request.dispatch_cond->Signal(); - dout(20) << "sendrecv kickback on tid " << tid << " " << request.dispatch_cond << dendl; - - // clean up. - mds_requests.erase(tid); - - - // -- log times -- - if (client_logger) { - utime_t lat = g_clock.real_now(); - lat -= start; - dout(20) << "lat " << lat << dendl; - client_logger->finc("lsum",(double)lat); - client_logger->inc("lnum"); - - if (nojournal) { - client_logger->finc("lrsum",(double)lat); - client_logger->inc("lrnum"); - } else { - client_logger->finc("lwsum",(double)lat); - client_logger->inc("lwnum"); - } - - if (op == MDS_OP_STAT) { - client_logger->finc("lstatsum",(double)lat); - client_logger->inc("lstatnum"); - } - else if (op == MDS_OP_READDIR) { - client_logger->finc("ldirsum",(double)lat); - client_logger->inc("ldirnum"); - } - - } - - return reply; -} - - -void Client::handle_client_session(MClientSession *m) -{ - dout(10) << "handle_client_session " << *m << dendl; - int from = m->get_source().num(); - - switch (m->op) { - case MClientSession::OP_OPEN: - assert(mds_sessions.count(from) == 0); - mds_sessions[from] = 0; - break; - - case MClientSession::OP_CLOSE: - mds_sessions.erase(from); - // FIXME: kick requests (hard) so that they are redirected. or fail. 
- break; - - default: - assert(0); - } - - // kick waiting threads - for (list::iterator p = waiting_for_session[from].begin(); - p != waiting_for_session[from].end(); - ++p) - (*p)->Signal(); - waiting_for_session.erase(from); - - delete m; -} - - -void Client::send_request(MetaRequest *request, int mds) -{ - MClientRequest *r = request->request; - if (!r) { - // make a new one - dout(10) << "send_request rebuilding request " << request->tid - << " for mds" << mds << dendl; - r = new MClientRequest; - r->copy_payload(request->request_payload); - r->decode_payload(); - r->set_retry_attempt(request->retry_attempt); - } - request->request = 0; - - r->set_mdsmap_epoch(mdsmap->get_epoch()); - - dout(10) << "send_request " << *r << " to mds" << mds << dendl; - messenger->send_message(r, mdsmap->get_inst(mds), MDS_PORT_SERVER); - - request->mds.insert(mds); -} - -void Client::handle_client_request_forward(MClientRequestForward *fwd) -{ - tid_t tid = fwd->get_tid(); - - if (mds_requests.count(tid) == 0) { - dout(10) << "handle_client_request_forward no pending request on tid " << tid << dendl; - delete fwd; - return; - } - - MetaRequest *request = mds_requests[tid]; - assert(request); - - // reset retry counter - request->retry_attempt = 0; - - if (request->idempotent && - mds_sessions.count(fwd->get_dest_mds())) { - // dest mds has a session, and request was forwarded for us. - - // note new mds set. - if (request->num_fwd < fwd->get_num_fwd()) { - // there are now exactly two mds's whose failure should trigger a resend - // of this request. 
- request->mds.clear(); - request->mds.insert(fwd->get_source().num()); - request->mds.insert(fwd->get_dest_mds()); - request->num_fwd = fwd->get_num_fwd(); - dout(10) << "handle_client_request tid " << tid - << " fwd " << fwd->get_num_fwd() - << " to mds" << fwd->get_dest_mds() - << ", mds set now " << request->mds - << dendl; - } else { - dout(10) << "handle_client_request tid " << tid - << " previously forwarded to mds" << fwd->get_dest_mds() - << ", mds still " << request->mds - << dendl; - } - } else { - // request not forwarded, or dest mds has no session. - // resend. - dout(10) << "handle_client_request tid " << tid - << " fwd " << fwd->get_num_fwd() - << " to mds" << fwd->get_dest_mds() - << ", non-idempotent, resending to " << fwd->get_dest_mds() - << dendl; - - request->mds.clear(); - request->num_fwd = fwd->get_num_fwd(); - request->resend_mds = fwd->get_dest_mds(); - request->caller_cond->Signal(); - } - - delete fwd; -} - -void Client::handle_client_reply(MClientReply *reply) -{ - tid_t tid = reply->get_tid(); - - if (mds_requests.count(tid) == 0) { - dout(10) << "handle_client_reply no pending request on tid " << tid << dendl; - delete reply; - return; - } - MetaRequest *request = mds_requests[tid]; - assert(request); - - // store reply - request->reply = reply; - - // wake up waiter - request->caller_cond->Signal(); - - // wake for kick back - Cond cond; - request->dispatch_cond = &cond; - while (mds_requests.count(tid)) { - dout(20) << "handle_client_reply awaiting kickback on tid " << tid << " " << &cond << dendl; - cond.Wait(client_lock); - } -} - - -// ------------------------ -// incoming messages - -void Client::dispatch(Message *m) -{ - client_lock.Lock(); - - switch (m->get_type()) { - // osd - case MSG_OSD_OPREPLY: - objecter->handle_osd_op_reply((MOSDOpReply*)m); - break; - - case MSG_OSD_MAP: - objecter->handle_osd_map((class MOSDMap*)m); - if (!mounted) mount_cond.Signal(); - break; - - // mounting and mds sessions - case MSG_MDS_MAP: - 
handle_mds_map((MMDSMap*)m); - break; - case MSG_CLIENT_UNMOUNT: - handle_unmount(m); - break; - case MSG_CLIENT_SESSION: - handle_client_session((MClientSession*)m); - break; - - // requests - case MSG_CLIENT_REQUEST_FORWARD: - handle_client_request_forward((MClientRequestForward*)m); - break; - case MSG_CLIENT_REPLY: - handle_client_reply((MClientReply*)m); - break; - - case MSG_CLIENT_FILECAPS: - handle_file_caps((MClientFileCaps*)m); - break; - - case MSG_STATFS_REPLY: - handle_statfs_reply((MStatfsReply*)m); - break; - - default: - dout(10) << "dispatch doesn't recognize message type " << m->get_type() << dendl; - assert(0); // fail loudly - break; - } - - // unmounting? - if (unmounting) { - dout(10) << "unmounting: trim pass, size was " << lru.lru_get_size() - << "+" << inode_map.size() << dendl; - trim_cache(); - if (lru.lru_get_size() == 0 && inode_map.empty()) { - dout(10) << "unmounting: trim pass, cache now empty, waking unmount()" << dendl; - mount_cond.Signal(); - } else { - dout(10) << "unmounting: trim pass, size still " << lru.lru_get_size() - << "+" << inode_map.size() << dendl; - dump_cache(); - } - } - - client_lock.Unlock(); -} - - -void Client::handle_mds_map(MMDSMap* m) -{ - int frommds = -1; - if (m->get_source().is_mds()) - frommds = m->get_source().num(); - - if (mdsmap == 0) { - mdsmap = new MDSMap; - - assert(m->get_source().is_mon()); - whoami = m->get_dest().num(); - dout(1) << "handle_mds_map i am now " << m->get_dest() << dendl; - - mount_cond.Signal(); // mount might be waiting for this. - } - - if (m->get_epoch() < mdsmap->get_epoch()) { - dout(1) << "handle_mds_map epoch " << m->get_epoch() << " is older than our " - << mdsmap->get_epoch() << dendl; - delete m; - return; - } - - dout(1) << "handle_mds_map epoch " << m->get_epoch() << dendl; - mdsmap->decode(m->get_encoded()); - - // send reconnect? - if (frommds >= 0 && - mdsmap->get_state(frommds) == MDSMap::STATE_RECONNECT) { - send_reconnect(frommds); - } - - // kick requests? 
- if (frommds >= 0 && - mdsmap->get_state(frommds) == MDSMap::STATE_ACTIVE) { - kick_requests(frommds); - //failed_mds.erase(from); - } - - // kick any waiting threads - list ls; - ls.swap(waiting_for_mdsmap); - for (list::iterator p = ls.begin(); p != ls.end(); ++p) - (*p)->Signal(); - - delete m; -} - -void Client::send_reconnect(int mds) -{ - dout(10) << "send_reconnect to mds" << mds << dendl; - - MClientReconnect *m = new MClientReconnect; - - if (mds_sessions.count(mds)) { - // i have an open session. - for (hash_map::iterator p = inode_map.begin(); - p != inode_map.end(); - p++) { - if (p->second->caps.count(mds)) { - dout(10) << " caps on " << p->first - << " " << cap_string(p->second->caps[mds].caps) - << " wants " << cap_string(p->second->file_caps_wanted()) - << dendl; - p->second->caps[mds].seq = 0; // reset seq. - m->add_inode_caps(p->first, // ino - p->second->file_caps_wanted(), // wanted - p->second->caps[mds].caps, // issued - p->second->inode.size, p->second->inode.mtime, p->second->inode.atime); - string path; - p->second->make_path(path); - dout(10) << " path on " << p->first << " is " << path << dendl; - m->add_inode_path(p->first, path); - } - if (p->second->stale_caps.count(mds)) { - dout(10) << " clearing stale caps on " << p->first << dendl; - p->second->stale_caps.erase(mds); // hrm, is this right? 
- } - } - - // reset my cap seq number - mds_sessions[mds] = 0; - } else { - dout(10) << " i had no session with this mds"; - m->closed = true; - } - - messenger->send_message(m, mdsmap->get_inst(mds), MDS_PORT_SERVER); -} - - -void Client::kick_requests(int mds) -{ - dout(10) << "kick_requests for mds" << mds << dendl; - - for (map::iterator p = mds_requests.begin(); - p != mds_requests.end(); - ++p) - if (p->second->mds.count(mds)) { - p->second->retry_attempt++; // inc retry counter - send_request(p->second, mds); - } -} - - -/**** - * caps - */ - - -class C_Client_ImplementedCaps : public Context { - Client *client; - MClientFileCaps *msg; - Inode *in; -public: - C_Client_ImplementedCaps(Client *c, MClientFileCaps *m, Inode *i) : client(c), msg(m), in(i) {} - void finish(int r) { - client->implemented_caps(msg,in); - } -}; - -/** handle_file_caps - * handle caps update from mds. including mds to mds caps transitions. - * do not block. - */ -void Client::handle_file_caps(MClientFileCaps *m) -{ - int mds = m->get_source().num(); - Inode *in = 0; - if (inode_map.count(m->get_ino())) in = inode_map[ m->get_ino() ]; - - m->clear_payload(); // for if/when we send back to MDS - - // note push seq increment - if (mds_sessions.count(mds) == 0) - dout(0) << "got file_caps without session from mds" << mds << " msg " << *m << dendl; - //assert(mds_sessions.count(mds)); // HACK FIXME SOON - mds_sessions[mds]++; - - // reap? - if (m->get_op() == MClientFileCaps::OP_IMPORT) { - int other = m->get_mds(); - - /* - * FIXME: there is a race here.. if the caps are exported twice in succession, - * you may get the second import before the first, in which case the middle MDS's - * import and then export won't be handled properly. - * there should be a sequence number attached to the cap, incremented each time - * it is exported... 
- */ - /* - * FIXME: handle mds failures - */ - - if (in && in->stale_caps.count(other)) { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " imported from mds" << other << dendl; - - // fresh from new mds? - if (!in->caps.count(mds)) { - if (in->caps.empty()) in->get(); - in->caps[mds].seq = m->get_seq(); - in->caps[mds].caps = m->get_caps(); - } - - assert(in->stale_caps.count(other)); - in->stale_caps.erase(other); - if (in->stale_caps.empty()) put_inode(in); // note: this will never delete *in - - // fall-thru! - } else { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " premature (!!) import from mds" << other << dendl; - // delay! - cap_reap_queue[in->ino()][other] = m; - return; - } - } - - assert(in); - - // stale? - if (m->get_op() == MClientFileCaps::OP_EXPORT) { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " seq " << m->get_seq() << " from mds" << mds << " now exported/stale" << dendl; - - // move to stale list - assert(in->caps.count(mds)); - if (in->stale_caps.empty()) in->get(); - in->stale_caps[mds] = in->caps[mds]; - - assert(in->caps.count(mds)); - in->caps.erase(mds); - if (in->caps.empty()) in->put(); - - // delayed reap? - if (cap_reap_queue.count(in->ino()) && - cap_reap_queue[in->ino()].count(mds)) { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " delayed reap on mds" << m->get_mds() << dendl; - - // process delayed reap - handle_file_caps( cap_reap_queue[in->ino()][mds] ); - - cap_reap_queue[in->ino()].erase(mds); - if (cap_reap_queue[in->ino()].empty()) - cap_reap_queue.erase(in->ino()); - } - delete m; - return; - } - - // release? 
- if (m->get_op() == MClientFileCaps::OP_RELEASE) { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " release" << dendl; - assert(in->caps.count(mds)); - in->caps.erase(mds); - for (map::iterator p = in->caps.begin(); - p != in->caps.end(); - p++) - dout(20) << " left cap " << p->first << " " - << cap_string(p->second.caps) << " " - << p->second.seq << dendl; - for (map::iterator p = in->stale_caps.begin(); - p != in->stale_caps.end(); - p++) - dout(20) << " left stale cap " << p->first << " " - << cap_string(p->second.caps) << " " - << p->second.seq << dendl; - - if (in->caps.empty()) { - //dout(0) << "did put_inode" << dendl; - put_inode(in); - } else { - //dout(0) << "didn't put_inode" << dendl; - } - delete m; - return; - } - - - // don't want? - if (in->file_caps_wanted() == 0) { - dout(5) << "handle_file_caps on ino " << m->get_ino() - << " seq " << m->get_seq() - << " " << cap_string(m->get_caps()) - << ", which we don't want caps for, releasing." << dendl; - m->set_op(MClientFileCaps::OP_ACK); - m->set_caps(0); - m->set_wanted(0); - messenger->send_message(m, m->get_source_inst(), MDS_PORT_LOCKER); - return; - } - - assert(in->caps.count(mds)); - - // update per-mds caps - const int old_caps = in->caps[mds].caps; - const int new_caps = m->get_caps(); - in->caps[mds].caps = new_caps; - in->caps[mds].seq = m->get_seq(); - dout(5) << "handle_file_caps on in " << m->get_ino() - << " mds" << mds << " seq " << m->get_seq() - << " caps now " << cap_string(new_caps) - << " was " << cap_string(old_caps) << dendl; - - // did file size decrease? - if ((old_caps & (CAP_FILE_RD|CAP_FILE_WR)) == 0 && - (new_caps & (CAP_FILE_RD|CAP_FILE_WR)) != 0 && - in->inode.size > m->get_inode().size) { - dout(10) << "*** file size decreased from " << in->inode.size << " to " << m->get_inode().size << dendl; - - // trim filecache? 
- if (g_conf.client_oc) - in->fc.truncate(in->inode.size, m->get_inode().size); - - in->inode.size = in->file_wr_size = m->get_inode().size; - } - - // update inode - in->inode = m->get_inode(); // might have updated size... FIXME this is overkill! - - // preserve our (possibly newer) file size, mtime - if (in->file_wr_size > in->inode.size) - m->get_inode().size = in->inode.size = in->file_wr_size; - if (in->file_wr_mtime > in->inode.mtime) - m->get_inode().mtime = in->inode.mtime = in->file_wr_mtime; - - - - if (g_conf.client_oc) { - // caching on, use FileCache. - Context *onimplement = 0; - if (old_caps & ~new_caps) { // this mds is revoking caps - if (in->fc.get_caps() & ~(in->file_caps())) // net revocation - onimplement = new C_Client_ImplementedCaps(this, m, in); - else { - implemented_caps(m, in); // ack now. - } - } - in->fc.set_caps(new_caps, onimplement); - } else { - // caching off. - - // wake up waiters? - if (new_caps & CAP_FILE_RD) { - for (list::iterator it = in->waitfor_read.begin(); - it != in->waitfor_read.end(); - it++) { - dout(5) << "signaling read waiter " << *it << dendl; - (*it)->Signal(); - } - in->waitfor_read.clear(); - } - if (new_caps & CAP_FILE_WR) { - for (list::iterator it = in->waitfor_write.begin(); - it != in->waitfor_write.end(); - it++) { - dout(5) << "signaling write waiter " << *it << dendl; - (*it)->Signal(); - } - in->waitfor_write.clear(); - } - if (new_caps & CAP_FILE_LAZYIO) { - for (list::iterator it = in->waitfor_lazy.begin(); - it != in->waitfor_lazy.end(); - it++) { - dout(5) << "signaling lazy waiter " << *it << dendl; - (*it)->Signal(); - } - in->waitfor_lazy.clear(); - } - - // ack? 
- if (old_caps & ~new_caps) { - if (in->sync_writes) { - // wait for sync writes to finish - dout(5) << "sync writes in progress, will ack on finish" << dendl; - in->waitfor_no_write.push_back(new C_Client_ImplementedCaps(this, m, in)); - } else { - // ok now - implemented_caps(m, in); - } - } else { - // discard - delete m; - } - } -} - -void Client::implemented_caps(MClientFileCaps *m, Inode *in) -{ - dout(5) << "implemented_caps " << cap_string(m->get_caps()) - << ", acking to " << m->get_source() << dendl; - - if (in->file_caps() == 0) { - in->file_wr_mtime = utime_t(); - in->file_wr_size = 0; - } - - messenger->send_message(m, m->get_source_inst(), MDS_PORT_LOCKER); -} - - -void Client::release_caps(Inode *in, - int retain) -{ - dout(5) << "releasing caps on ino " << in->inode.ino << dec - << " had " << cap_string(in->file_caps()) - << " retaining " << cap_string(retain) - << " want " << cap_string(in->file_caps_wanted()) - << dendl; - - for (map::iterator it = in->caps.begin(); - it != in->caps.end(); - it++) { - //if (it->second.caps & ~retain) { - if (1) { - // release (some of?) these caps - it->second.caps = retain & it->second.caps; - // note: tell mds _full_ wanted; it'll filter/behave based on what it is allowed to do - MClientFileCaps *m = new MClientFileCaps(MClientFileCaps::OP_ACK, - in->inode, - it->second.seq, - it->second.caps, - in->file_caps_wanted()); - messenger->send_message(m, mdsmap->get_inst(it->first), MDS_PORT_LOCKER); - } - } - - if (in->file_caps() == 0) { - in->file_wr_mtime = utime_t(); - in->file_wr_size = 0; - } -} - -void Client::update_caps_wanted(Inode *in) -{ - dout(5) << "updating caps wanted on ino " << in->inode.ino - << " to " << cap_string(in->file_caps_wanted()) - << dendl; - - // FIXME: pick a single mds and let the others off the hook.. 
- for (map::iterator it = in->caps.begin(); - it != in->caps.end(); - it++) { - MClientFileCaps *m = new MClientFileCaps(MClientFileCaps::OP_ACK, - in->inode, - it->second.seq, - it->second.caps, - in->file_caps_wanted()); - messenger->send_message(m, - mdsmap->get_inst(it->first), MDS_PORT_LOCKER); - } -} - - - -// ------------------- -// MOUNT - -void Client::_try_mount() -{ - dout(10) << "_try_mount" << dendl; - int mon = monmap->pick_mon(); - dout(2) << "sending client_mount to mon" << mon << " as instance " << my_instance << dendl; - messenger->send_first_message(this, // simultaneously go active (if we haven't already) - new MClientMount(messenger->get_myaddr(), my_instance), - monmap->get_inst(mon)); - - // schedule timeout? - assert(mount_timeout_event == 0); - mount_timeout_event = new C_MountTimeout(this); - timer.add_event_after(g_conf.client_mount_timeout, mount_timeout_event); -} - -void Client::_mount_timeout() -{ - dout(10) << "_mount_timeout" << dendl; - mount_timeout_event = 0; - _try_mount(); -} - -int Client::mount() -{ - client_lock.Lock(); - assert(!mounted); // caller is confused? - - objecter->init(); - - _try_mount(); - //messenger->set_dispatcher(this); // FIXME: there is still a race condition here! - - while (!mdsmap || - !osdmap || - osdmap->get_epoch() == 0) - mount_cond.Wait(client_lock); - - timer.cancel_event(mount_timeout_event); - mount_timeout_event = 0; - - mounted = true; - - dout(2) << "mounted: have osdmap " << osdmap->get_epoch() - << " and mdsmap " << mdsmap->get_epoch() - << dendl; - - // hack: get+pin root inode. - // fuse assumes it's always there. - Inode *root; - _do_lstat("/", STAT_MASK_ALL, &root); - _ll_get(root); - - // trace? 
- if (g_conf.client_trace) { - traceout.open(g_conf.client_trace); - if (traceout.is_open()) { - dout(1) << "opened trace file '" << g_conf.client_trace << "'" << dendl; - } else { - dout(1) << "FAILED to open trace file '" << g_conf.client_trace << "'" << dendl; - } - } - - client_lock.Unlock(); - - /* - dout(3) << "op: // client trace data structs" << dendl; - dout(3) << "op: struct stat st;" << dendl; - dout(3) << "op: struct utimbuf utim;" << dendl; - dout(3) << "op: int readlinkbuf_len = 1000;" << dendl; - dout(3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl; - dout(3) << "op: map dir_contents;" << dendl; - dout(3) << "op: map open_files;" << dendl; - dout(3) << "op: int fd;" << dendl; - */ - return 0; -} - - -// UNMOUNT - - -int Client::unmount() -{ - client_lock.Lock(); - - assert(mounted); // caller is confused? - - dout(2) << "unmounting" << dendl; - unmounting = true; - - // NOTE: i'm assuming all caches are already flushing (because all files are closed). - assert(fd_map.empty()); - - dout(10) << "a" << dendl; - - _ll_drop_pins(); - - dout(10) << "b" << dendl; - - // empty lru cache - lru.lru_set_max(0); - trim_cache(); - - if (g_conf.client_oc) { - // release any/all caps - for (hash_map::iterator p = inode_map.begin(); - p != inode_map.end(); - p++) { - Inode *in = p->second; - if (!in->caps.empty()) { - in->fc.release_clean(); - if (in->fc.is_dirty()) { - dout(10) << "unmount residual caps on " << in->ino() << ", flushing" << dendl; - in->fc.empty(new C_Client_CloseRelease(this, in)); - } else { - dout(10) << "unmount residual caps on " << in->ino() << ", releasing" << dendl; - release_caps(in); - } - } - } - } - - //if (0) {// hack - while (lru.lru_get_size() > 0 || - !inode_map.empty()) { - dout(2) << "cache still has " << lru.lru_get_size() - << "+" << inode_map.size() << " items" - << ", waiting (for caps to release?)" - << dendl; - dump_cache(); - mount_cond.Wait(client_lock); - } - assert(lru.lru_get_size() == 0); - 
assert(inode_map.empty()); - //} - - // unsafe writes - if (!g_conf.client_oc) { - while (unsafe_sync_write > 0) { - dout(0) << unsafe_sync_write << " unsafe_sync_writes, waiting" - << dendl; - mount_cond.Wait(client_lock); - } - } - - // stop tracing - if (g_conf.client_trace) { - dout(1) << "closing trace file '" << g_conf.client_trace << "'" << dendl; - traceout.close(); - } - - - // send session closes! - for (map::iterator p = mds_sessions.begin(); - p != mds_sessions.end(); - ++p) { - dout(2) << "sending client_session close to mds" << p->first << " seq " << p->second << dendl; - messenger->send_message(new MClientSession(MClientSession::OP_REQUEST_CLOSE, - p->second), - mdsmap->get_inst(p->first), MDS_PORT_SERVER); - } - - // send unmount! - int mon = monmap->pick_mon(); - dout(2) << "sending client_unmount to mon" << mon << dendl; - messenger->send_message(new MClientUnmount(messenger->get_myinst()), - monmap->get_inst(mon)); - - while (mounted) - mount_cond.Wait(client_lock); - - dout(2) << "unmounted." 
<< dendl; - - objecter->shutdown(); - - client_lock.Unlock(); - return 0; -} - -void Client::handle_unmount(Message* m) -{ - dout(1) << "handle_unmount got ack" << dendl; - - mounted = false; - - delete mdsmap; - mdsmap = 0; - - mount_cond.Signal(); - - delete m; -} - - -// =============================================================== -// high level (POSIXy) interface - - -// namespace ops - -int Client::link(const char *existing, const char *newname) -{ - Mutex::Locker lock(client_lock); - tout << "link" << std::endl; - tout << existing << std::endl; - tout << newname << std::endl; - return _link(existing, newname); -} - -int Client::_link(const char *existing, const char *newname) -{ - // main path arg is new link name - // sarg is target (existing file) - - MClientRequest *req = new MClientRequest(MDS_OP_LINK, messenger->get_myinst()); - req->set_path(newname); - req->set_path2(existing); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - - insert_trace(reply); - delete reply; - dout(10) << "link result is " << res << dendl; - - trim_cache(); - dout(3) << "link(\"" << existing << "\", \"" << newname << "\") = " << res << dendl; - return res; -} - - -int Client::unlink(const char *relpath) -{ - Mutex::Locker lock(client_lock); - tout << "unlink" << std::endl; - tout << relpath << std::endl; - - string abspath; - mkabspath(relpath, abspath); - return _unlink(abspath.c_str()); -} - -int Client::_unlink(const char *path) -{ - - MClientRequest *req = new MClientRequest(MDS_OP_UNLINK, messenger->get_myinst()); - req->set_path(path); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? 
- - MClientReply *reply = make_request(req); - int res = reply->get_result(); - if (res == 0) { - // remove from local cache - filepath fp(path); - Dentry *dn = lookup(fp); - if (dn) { - assert(dn->inode); - unlink(dn); - } - } - insert_trace(reply); - delete reply; - dout(10) << "unlink result is " << res << dendl; - - trim_cache(); - dout(3) << "unlink(\"" << path << "\") = " << res << dendl; - return res; -} - -int Client::rename(const char *relfrom, const char *relto) -{ - Mutex::Locker lock(client_lock); - tout << "rename" << std::endl; - tout << relfrom << std::endl; - tout << relto << std::endl; - - string absfrom, absto; - mkabspath(relfrom, absfrom); - mkabspath(relto, absto); - return _rename(absfrom.c_str(), absto.c_str()); -} - -int Client::_rename(const char *from, const char *to) -{ - MClientRequest *req = new MClientRequest(MDS_OP_RENAME, messenger->get_myinst()); - req->set_path(from); - req->set_path2(to); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? 
- - MClientReply *reply = make_request(req); - int res = reply->get_result(); - if (res == 0) { - // remove from local cache - filepath fp(to); - Dentry *dn = lookup(fp); - if (dn) { - assert(dn->inode); - unlink(dn); - } - } - insert_trace(reply); - delete reply; - dout(10) << "rename result is " << res << dendl; - - // renamed item from our cache - - trim_cache(); - dout(3) << "rename(\"" << from << "\", \"" << to << "\") = " << res << dendl; - return res; -} - -// dirs - -int Client::mkdir(const char *relpath, mode_t mode) -{ - Mutex::Locker lock(client_lock); - tout << "mkdir" << std::endl; - tout << relpath << std::endl; - tout << mode << std::endl; - - string abspath; - mkabspath(relpath, abspath); - return _mkdir(abspath.c_str(), mode); -} - -int Client::_mkdir(const char *path, mode_t mode) -{ - MClientRequest *req = new MClientRequest(MDS_OP_MKDIR, messenger->get_myinst()); - req->set_path(path); - req->args.mkdir.mode = mode; - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - dout(10) << "mkdir result is " << res << dendl; - - trim_cache(); - - dout(3) << "mkdir(\"" << path << "\", 0" << oct << mode << dec << ") = " << res << dendl; - return res; -} - -int Client::rmdir(const char *relpath) -{ - Mutex::Locker lock(client_lock); - tout << "rmdir" << std::endl; - tout << relpath << std::endl; - - string abspath; - mkabspath(relpath, abspath); - return _rmdir(abspath.c_str()); -} - -int Client::_rmdir(const char *path) -{ - MClientRequest *req = new MClientRequest(MDS_OP_RMDIR, messenger->get_myinst()); - req->set_path(path); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? 
- - MClientReply *reply = make_request(req); - int res = reply->get_result(); - if (res == 0) { - // remove from local cache - filepath fp(path); - Dentry *dn = lookup(fp); - if (dn) { - if (dn->inode->dir && dn->inode->dir->is_empty()) - close_dir(dn->inode->dir); // FIXME: maybe i shoudl proactively hose the whole subtree from cache? - unlink(dn); - } - } - insert_trace(reply); - delete reply; - - trim_cache(); - dout(3) << "rmdir(\"" << path << "\") = " << res << dendl; - return res; -} - -// symlinks - -int Client::symlink(const char *target, const char *rellink) -{ - Mutex::Locker lock(client_lock); - tout << "symlink" << std::endl; - tout << target << std::endl; - tout << rellink << std::endl; - - string link; - mkabspath(rellink, link); - return _symlink(target, link.c_str()); -} - -int Client::_symlink(const char *target, const char *link) -{ - MClientRequest *req = new MClientRequest(MDS_OP_SYMLINK, messenger->get_myinst()); - req->set_path(link); - req->set_path2(target); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? 
- - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); //FIXME assuming trace of link, not of target - delete reply; - - trim_cache(); - dout(3) << "symlink(\"" << target << "\", \"" << link << "\") = " << res << dendl; - return res; -} - -int Client::readlink(const char *path, char *buf, off_t size) -{ - Mutex::Locker lock(client_lock); - tout << "readlink" << std::endl; - tout << path << std::endl; - - string abspath; - mkabspath(path, abspath); - return _readlink(abspath.c_str(), buf, size); -} - -int Client::_readlink(const char *path, char *buf, off_t size) -{ - Inode *in; - int r = _do_lstat(path, STAT_MASK_BASE, &in); - if (r == 0 && !in->inode.is_symlink()) r = -EINVAL; - if (r == 0) { - // copy into buf (at most size bytes) - r = in->symlink->length(); - if (r > size) r = size; - memcpy(buf, in->symlink->c_str(), r); - } else { - buf[0] = 0; - } - trim_cache(); - - dout(3) << "readlink(\"" << path << "\", \"" << buf << "\", " << size << ") = " << r << dendl; - return r; -} - - - -// inode stuff - -int Client::_do_lstat(const char *path, int mask, Inode **in) -{ - MClientRequest *req = 0; - filepath fpath(path); - - // check whether cache content is fresh enough - int res = 0; - - Dentry *dn = lookup(fpath); - inode_t inode; - utime_t now = g_clock.real_now(); - - if (dn && - now <= dn->inode->valid_until) - dout(10) << "_lstat has inode " << path << " with mask " << dn->inode->mask << ", want " << mask << dendl; - - if (dn && dn->inode && - now <= dn->inode->valid_until && - ((mask & ~STAT_MASK_BASE) || now <= dn->inode->valid_until) && - ((dn->inode->mask & mask) == mask)) { - inode = dn->inode->inode; - dout(10) << "lstat cache hit w/ sufficient mask, valid until " << dn->inode->valid_until << dendl; - - if (g_conf.client_cache_stat_ttl == 0) - dn->inode->valid_until = utime_t(); // only one stat allowed after each readdir - - *in = dn->inode; - } else { - // FIXME where does FUSE maintain user information - 
//struct fuse_context *fc = fuse_get_context(); - //req->set_caller_uid(fc->uid); - //req->set_caller_gid(fc->gid); - - req = new MClientRequest(MDS_OP_LSTAT, messenger->get_myinst()); - req->args.stat.mask = mask; - req->set_filepath(fpath); - - MClientReply *reply = make_request(req); - res = reply->get_result(); - dout(10) << "lstat res is " << res << dendl; - if (res == 0) { - //Transfer information from reply to stbuf - inode = reply->get_inode(); - - //Update metadata cache - *in = insert_trace(reply); - } - - delete reply; - - if (res != 0) - *in = 0; // not a success. - } - - return res; -} - - -int Client::fill_stat(Inode *in, struct stat *st) -{ - dout(10) << "fill_stat on " << in->inode.ino << " mode 0" << oct << in->inode.mode << dec - << " mtime " << in->inode.mtime << " ctime " << in->inode.ctime << dendl; - memset(st, 0, sizeof(struct stat)); - st->st_ino = in->inode.ino; - st->st_mode = in->inode.mode; - st->st_rdev = in->inode.rdev; - st->st_nlink = in->inode.nlink; - st->st_uid = in->inode.uid; - st->st_gid = in->inode.gid; - st->st_ctime = MAX(in->inode.ctime, in->inode.mtime); - st->st_atime = in->inode.atime; - st->st_mtime = in->inode.mtime; - st->st_size = in->inode.size; - st->st_blocks = in->inode.size ? 
((in->inode.size - 1) / 4096 + 1):0; - st->st_blksize = 4096; - return in->mask; -} - - /* - S_REQUIREBLKSIZE(st->st_litemask); - if (inode.mask & INODE_MASK_BASE) S_REQUIRECTIME(st->st_litemask); - if (inode.mask & INODE_MASK_SIZE) { - S_REQUIRESIZE(st->st_litemask); - S_REQUIREBLOCKS(st->st_litemask); - } - if (inode.mask & INODE_MASK_MTIME) S_REQUIREMTIME(st->st_litemask); - if (inode.mask & INODE_MASK_ATIME) S_REQUIREATIME(st->st_litemask); - */ - - -int Client::lstat(const char *relpath, struct stat *stbuf) -{ - Mutex::Locker lock(client_lock); - tout << "lstat" << std::endl; - tout << relpath << std::endl; - - string abspath; - mkabspath(relpath, abspath); - return _lstat(abspath.c_str(), stbuf); -} - -int Client::_lstat(const char *path, struct stat *stbuf) -{ - Inode *in = 0; - int res = _do_lstat(path, STAT_MASK_ALL, &in); - if (res == 0) { - assert(in); - fill_stat(in, stbuf); - dout(10) << "stat sez size = " << in->inode.size << " mode = 0" << oct << stbuf->st_mode << dec << " ino = " << stbuf->st_ino << dendl; - } - - trim_cache(); - dout(3) << "lstat(\"" << path << "\", " << stbuf << ") = " << res << dendl; - return res; -} - - -/* -int Client::lstatlite(const char *relpath, struct statlite *stl) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->lstatlite(\"" << path << "\", &st);" << dendl; - tout << "lstatlite" << std::endl; - tout << path << std::endl; - - // make mask - // FIXME. 
- int mask = INODE_MASK_BASE | INODE_MASK_AUTH; - if (S_ISVALIDSIZE(stl->st_litemask) || - S_ISVALIDBLOCKS(stl->st_litemask)) mask |= INODE_MASK_SIZE; - if (S_ISVALIDMTIME(stl->st_litemask)) mask |= INODE_MASK_FILE; - if (S_ISVALIDATIME(stl->st_litemask)) mask |= INODE_MASK_FILE; - - Inode *in = 0; - int res = _lstat(path, mask, &in); - - if (res == 0) { - fill_statlite(in->inode,stl); - dout(10) << "stat sez size = " << in->inode.size << " ino = " << in->inode.ino << dendl; - } - - trim_cache(); - client_lock.Unlock(); - return res; -} -*/ - - -int Client::chmod(const char *relpath, mode_t mode) -{ - Mutex::Locker lock(client_lock); - tout << "chmod" << std::endl; - tout << relpath << std::endl; - tout << mode << std::endl; - - string abspath; - mkabspath(relpath, abspath); - return _chmod(abspath.c_str(), mode); -} - -int Client::_chmod(const char *path, mode_t mode) -{ - dout(3) << "_chmod(" << path << ", 0" << oct << mode << dec << ")" << dendl; - MClientRequest *req = new MClientRequest(MDS_OP_CHMOD, messenger->get_myinst()); - req->set_path(path); - req->args.chmod.mode = mode; - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - - trim_cache(); - dout(3) << "_chmod(\"" << path << "\", 0" << oct << mode << dec << ") = " << res << dendl; - return res; -} - -int Client::chown(const char *relpath, uid_t uid, gid_t gid) -{ - Mutex::Locker lock(client_lock); - tout << "chown" << std::endl; - tout << relpath << std::endl; - tout << uid << std::endl; - tout << gid << std::endl; - - string abspath; - mkabspath(relpath, abspath); - return _chown(abspath.c_str(), uid, gid); -} - -int Client::_chown(const char *path, uid_t uid, gid_t gid) -{ - dout(3) << "_chown(" << path << ", " << uid << ", " << gid << ")" << dendl; - MClientRequest *req = new MClientRequest(MDS_OP_CHOWN, 
messenger->get_myinst()); - req->set_path(path); - req->args.chown.uid = uid; - req->args.chown.gid = gid; - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - dout(10) << "chown result is " << res << dendl; - - trim_cache(); - dout(3) << "chown(\"" << path << "\", " << uid << ", " << gid << ") = " << res << dendl; - return res; -} - -int Client::utime(const char *relpath, struct utimbuf *buf) -{ - Mutex::Locker lock(client_lock); - tout << "utime" << std::endl; - tout << relpath << std::endl; - tout << buf->modtime << std::endl; - tout << buf->actime << std::endl; - - string abspath; - mkabspath(relpath, abspath); - return _utimes(abspath.c_str(), utime_t(buf->modtime,0), utime_t(buf->actime,0)); -} - -int Client::_utimes(const char *path, utime_t mtime, utime_t atime) -{ - dout(3) << "_utimes(" << path << ", " << mtime << ", " << atime << ")" << dendl; - MClientRequest *req = new MClientRequest(MDS_OP_UTIME, messenger->get_myinst()); - req->set_path(path); - req->args.utime.mtime = mtime.tv_ref(); - req->args.utime.atime = atime.tv_ref(); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - - dout(3) << "utimes(\"" << path << "\", " << mtime << ", " << atime << ") = " << res << dendl; - trim_cache(); - return res; -} - - - -int Client::mknod(const char *relpath, mode_t mode, dev_t rdev) -{ - Mutex::Locker lock(client_lock); - tout << "mknod" << std::endl; - tout << relpath << std::endl; - tout << mode << std::endl; - tout << rdev << std::endl; - - string abspath; - mkabspath(relpath, abspath); - return _mknod(abspath.c_str(), mode, rdev); -} - -int 
Client::_mknod(const char *path, mode_t mode, dev_t rdev) -{ - dout(3) << "_mknod(" << path << ", 0" << oct << mode << dec << ", " << rdev << ")" << dendl; - - MClientRequest *req = new MClientRequest(MDS_OP_MKNOD, messenger->get_myinst()); - req->set_path(path); - req->args.mknod.mode = mode; - req->args.mknod.rdev = rdev; - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); - - delete reply; - - trim_cache(); - - dout(3) << "mknod(\"" << path << "\", 0" << oct << mode << dec << ") = " << res << dendl; - return res; -} - - - - -int Client::getdir(const char *relpath, list& contents) -{ - dout(3) << "getdir(" << relpath << ")" << dendl; - { - Mutex::Locker lock(client_lock); - tout << "getdir" << std::endl; - tout << relpath << std::endl; - } - - DIR *d; - int r = opendir(relpath, &d); - if (r < 0) return r; - - struct dirent de; - int n = 0; - while (readdir_r(d, &de) == 0) { - contents.push_back(de.d_name); - n++; - } - closedir(d); - - return n; -} - -int Client::opendir(const char *name, DIR **dirpp) -{ - Mutex::Locker lock(client_lock); - tout << "opendir" << std::endl; - tout << name << std::endl; - - int r = _opendir(name, (DirResult**)dirpp); - tout << (unsigned long)*dirpp << std::endl; - return r; -} - -int Client::_opendir(const char *name, DirResult **dirpp) -{ - *dirpp = new DirResult(name); - - // do we have the inode in our cache? - // if so, should be we ask for a different dirfrag? 
- filepath path(name); - Dentry *dn = lookup(path); - if (dn && dn->inode) { - (*dirpp)->inode = dn->inode; - (*dirpp)->inode->get(); - dout(10) << "had inode " << dn->inode << " " << dn->inode->inode.ino << " ref now " << dn->inode->ref << dendl; - (*dirpp)->set_frag(dn->inode->dirfragtree[0]); - dout(10) << "_opendir " << name << ", our cache says the first dirfrag is " << (*dirpp)->frag() << dendl; - } - - // get the first frag - int r = _readdir_get_frag(*dirpp); - if (r < 0) { - _closedir(*dirpp); - *dirpp = 0; - } else { - r = 0; - } - dout(3) << "_opendir(" << name << ") = " << r << " (" << *dirpp << ")" << dendl; - - return r; -} - -void Client::_readdir_add_dirent(DirResult *dirp, const string& name, Inode *in) -{ - struct stat st; - int stmask = fill_stat(in, &st); - frag_t fg = dirp->frag(); - dirp->buffer[fg].push_back(DirEntry(name, st, stmask)); - dout(10) << "_readdir_add_dirent " << dirp << " added '" << name << "' -> " << in->inode.ino - << ", size now " << dirp->buffer[fg].size() << dendl; -} - -//struct dirent { -// ino_t d_ino; /* inode number */ -// off_t d_off; /* offset to the next dirent */ -// unsigned short d_reclen; /* length of this record */ -// unsigned char d_type; /* type of file */ -// char d_name[256]; /* filename */ -//}; -void Client::_readdir_fill_dirent(struct dirent *de, DirEntry *entry, off_t off) -{ - de->d_ino = entry->st.st_ino; - de->d_off = off + 1; - de->d_reclen = 1; - de->d_type = MODE_TO_DT(entry->st.st_mode); - strncpy(de->d_name, entry->d_name.c_str(), 256); - dout(10) << "_readdir_fill_dirent '" << de->d_name << "' -> " << de->d_ino - << " type " << (int)de->d_type << " at off " << off << dendl; -} - -void Client::_readdir_next_frag(DirResult *dirp) -{ - frag_t fg = dirp->frag(); - - // hose old data - assert(dirp->buffer.count(fg)); - dirp->buffer.erase(fg); - - // advance - dirp->next_frag(); - if (dirp->at_end()) { - dout(10) << "_readdir_next_frag advance from " << fg << " to END" << dendl; - } else { - 
dout(10) << "_readdir_next_frag advance from " << fg << " to " << dirp->frag() << dendl; - _readdir_rechoose_frag(dirp); - } -} - -void Client::_readdir_rechoose_frag(DirResult *dirp) -{ - assert(dirp->inode); - frag_t cur = dirp->frag(); - frag_t f = dirp->inode->dirfragtree[cur.value()]; - if (f != cur) { - dout(10) << "_readdir_rechoose_frag frag " << cur << " maps to " << f << dendl; - dirp->set_frag(f); - } -} - -int Client::_readdir_get_frag(DirResult *dirp) -{ - // get the current frag. - frag_t fg = dirp->frag(); - assert(dirp->buffer.count(fg) == 0); - - dout(10) << "_readdir_get_frag " << dirp << " on " << dirp->path << " fg " << fg << dendl; - - MClientRequest *req = new MClientRequest(MDS_OP_READDIR, messenger->get_myinst()); - req->set_path(dirp->path); - req->args.readdir.frag = fg; - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); - inodeno_t ino = reply->get_ino(); - - // did i get directory inode? - Inode *diri = 0; - if ((res == -EAGAIN || res == 0) && - inode_map.count(ino)) { - diri = inode_map[ino]; - dout(10) << "_readdir_get_frag got diri " << diri << " " << diri->inode.ino << dendl; - assert(diri); - assert(diri->inode.is_dir()); - } - - if (!dirp->inode && diri) { - dout(10) << "_readdir_get_frag attaching inode" << dendl; - dirp->inode = inode_map[ino]; - diri->get(); - } - - if (res == -EAGAIN) { - dout(10) << "_readdir_get_frag got EAGAIN, retrying" << dendl; - _readdir_rechoose_frag(dirp); - return _readdir_get_frag(dirp); - } - - if (res == 0) { - // stuff dir contents to cache, DirResult - assert(diri); - - // create empty result vector - dirp->buffer[fg].clear(); - - if (fg.is_leftmost()) { - // add . and ..? 
- string dot("."); - _readdir_add_dirent(dirp, dot, diri); - string dotdot(".."); - if (diri->dn) - _readdir_add_dirent(dirp, dotdot, diri->dn->dir->parent_inode); - //else - //_readdir_add_dirent(dirp, dotdot, DT_DIR); - } - - // the rest? - if (!reply->get_dir_dn().empty()) { - // only open dir if we're actually adding stuff to it! - Dir *dir = diri->open_dir(); - assert(dir); - utime_t now = g_clock.real_now(); - - list::const_iterator pin = reply->get_dir_in().begin(); - for (list::const_iterator pdn = reply->get_dir_dn().begin(); - pdn != reply->get_dir_dn().end(); - ++pdn, ++pin) { - // count entries - res++; - - // put in cache - Inode *in = this->insert_inode(dir, *pin, *pdn); - - if (g_conf.client_cache_stat_ttl) { - in->valid_until = now; - in->valid_until += g_conf.client_cache_stat_ttl; - } - else if (g_conf.client_cache_readdir_ttl) { - in->valid_until = now; - in->valid_until += g_conf.client_cache_readdir_ttl; - } else - in->valid_until = utime_t(); - - // contents to caller too! - dout(15) << "_readdir_get_frag got " << *pdn << " to " << in->inode.ino << dendl; - _readdir_add_dirent(dirp, *pdn, in); - } - - if (dir->is_empty()) - close_dir(dir); - } - - // FIXME: remove items in cache that weren't in my readdir? 
- // *** - } else { - dout(10) << "_readdir_get_frag got error " << res << ", setting end flag" << dendl; - dirp->set_end(); - } - - delete reply; - - return res; -} - -int Client::readdir_r(DIR *d, struct dirent *de) -{ - return readdirplus_r(d, de, 0, 0); -} - -int Client::readdirplus_r(DIR *d, struct dirent *de, struct stat *st, int *stmask) -{ - DirResult *dirp = (DirResult*)d; - - while (1) { - if (dirp->at_end()) return -1; - - if (dirp->buffer.count(dirp->frag()) == 0) { - Mutex::Locker lock(client_lock); - _readdir_get_frag(dirp); - if (dirp->at_end()) return -1; - } - - frag_t fg = dirp->frag(); - uint32_t pos = dirp->fragpos(); - assert(dirp->buffer.count(fg)); - vector &ent = dirp->buffer[fg]; - - if (ent.empty()) { - dout(10) << "empty frag " << fg << ", moving on to next" << dendl; - _readdir_next_frag(dirp); - continue; - } - - assert(pos < ent.size()); - _readdir_fill_dirent(de, &ent[pos], dirp->offset); - if (st) *st = ent[pos].st; - if (stmask) *stmask = ent[pos].stmask; - pos++; - dirp->offset++; - - if (pos == ent.size()) - _readdir_next_frag(dirp); - - break; - } - - return 0; -} - - -int Client::closedir(DIR *dir) -{ - Mutex::Locker lock(client_lock); - tout << "closedir" << std::endl; - tout << (unsigned long)dir << std::endl; - - dout(3) << "closedir(" << dir << ") = 0" << dendl; - _closedir((DirResult*)dir); - return 0; -} - -void Client::_closedir(DirResult *dirp) -{ - dout(10) << "_closedir(" << dirp << ")" << dendl; - if (dirp->inode) { - dout(10) << "_closedir detaching inode " << dirp->inode << dendl; - put_inode(dirp->inode); - dirp->inode = 0; - } - delete dirp; -} - -void Client::rewinddir(DIR *dirp) -{ - dout(3) << "rewinddir(" << dirp << ")" << dendl; - DirResult *d = (DirResult*)dirp; - d->offset = 0; - d->buffer.clear(); -} - -off_t Client::telldir(DIR *dirp) -{ - DirResult *d = (DirResult*)dirp; - dout(3) << "telldir(" << dirp << ") = " << d->offset << dendl; - return d->offset; -} - -void Client::seekdir(DIR *dirp, off_t 
offset) -{ - dout(3) << "seekdir(" << dirp << ", " << offset << ")" << dendl; - DirResult *d = (DirResult*)dirp; - d->offset = offset; -} - - - - - - - -/****** file i/o **********/ - -int Client::open(const char *relpath, int flags, mode_t mode) -{ - Mutex::Locker lock(client_lock); - tout << "open" << std::endl; - tout << relpath << std::endl; - tout << flags << std::endl; - - string abspath; - mkabspath(relpath, abspath); - - Fh *fh; - int r = _open(abspath.c_str(), flags, mode, &fh); - if (r >= 0) { - // allocate a integer file descriptor - assert(fh); - r = get_fd(); - assert(fd_map.count(r) == 0); - fd_map[r] = fh; - } - - tout << r << std::endl; - dout(3) << "open(" << relpath << ", " << flags << ") = " << r << dendl; - return r; -} - -int Client::_open(const char *path, int flags, mode_t mode, Fh **fhp) -{ - // go - MClientRequest *req = new MClientRequest(MDS_OP_OPEN, messenger->get_myinst()); - req->set_path(path); - req->args.open.flags = flags; - req->args.open.mode = mode; - - int cmode = req->get_open_file_mode(); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - // do i have the inode? - Dentry *dn = lookup(req->get_filepath()); - Inode *in = 0; - if (dn) { - in = dn->inode; - in->add_open(cmode); // make note of pending open, since it effects _wanted_ caps. - } - - MClientReply *reply = make_request(req); - assert(reply); - - insert_trace(reply); - int result = reply->get_result(); - - // success? - if (result >= 0) { - // yay - Fh *f = new Fh; - if (fhp) *fhp = f; - f->mode = cmode; - - // inode - f->inode = inode_map[reply->get_ino()]; - assert(f->inode); - f->inode->get(); - - if (!in) { - in = f->inode; - in->add_open(f->mode); - } - - // caps included? - int mds = reply->get_source().num(); - - if (in->caps.empty()) {// first caps? 
- dout(7) << " first caps on " << in->inode.ino << dendl; - in->get(); - } - - int new_caps = reply->get_file_caps(); - - assert(reply->get_file_caps_seq() >= in->caps[mds].seq); - if (reply->get_file_caps_seq() > in->caps[mds].seq) { - int old_caps = in->caps[mds].caps; - - dout(7) << "open got caps " << cap_string(new_caps) - << " (had " << cap_string(old_caps) << ")" - << " for " << in->ino() - << " seq " << reply->get_file_caps_seq() - << " from mds" << mds - << dendl; - - in->caps[mds].caps = new_caps; - in->caps[mds].seq = reply->get_file_caps_seq(); - - // we shouldn't ever lose caps at this point. - // actually, we might...? - assert((old_caps & ~in->caps[mds].caps) == 0); - - if (g_conf.client_oc) - in->fc.set_caps(new_caps); - - } else { - dout(7) << "open got SAME caps " << cap_string(new_caps) - << " for " << in->ino() - << " seq " << reply->get_file_caps_seq() - << " from mds" << mds - << dendl; - } - - dout(5) << "open success, fh is " << f << " combined caps " << cap_string(in->file_caps()) << dendl; - } - - delete reply; - - trim_cache(); - - return result; -} - - - - - -void Client::close_release(Inode *in) -{ - dout(10) << "close_release on " << in->ino() << dendl; - dout(10) << " wr " << in->num_open_wr << " rd " << in->num_open_rd - << " dirty " << in->fc.is_dirty() << " cached " << in->fc.is_cached() << dendl; - - if (!in->num_open_rd) - in->fc.release_clean(); - - int retain = 0; - if (in->num_open_wr || in->fc.is_dirty()) retain |= CAP_FILE_WR | CAP_FILE_WRBUFFER | CAP_FILE_WREXTEND; - if (in->num_open_rd || in->fc.is_cached()) retain |= CAP_FILE_RD | CAP_FILE_RDCACHE; - - release_caps(in, retain); // release caps now. 
-} - -void Client::close_safe(Inode *in) -{ - dout(10) << "close_safe on " << in->ino() << dendl; - put_inode(in); - if (unmounting) - mount_cond.Signal(); -} - - -int Client::close(int fd) -{ - Mutex::Locker lock(client_lock); - tout << "close" << std::endl; - tout << fd << std::endl; - - dout(3) << "close(" << fd << ")" << dendl; - assert(fd_map.count(fd)); - Fh *fh = fd_map[fd]; - _release(fh); - fd_map.erase(fd); - return 0; -} - -int Client::_release(Fh *f) -{ - //dout(3) << "op: client->close(open_files[ " << fh << " ]);" << dendl; - //dout(3) << "op: open_files.erase( " << fh << " );" << dendl; - dout(5) << "_release " << f << dendl; - Inode *in = f->inode; - - // update inode rd/wr counts - int before = in->file_caps_wanted(); - in->sub_open(f->mode); - int after = in->file_caps_wanted(); - - // does this change what caps we want? - if (before != after && after) - update_caps_wanted(in); - - // release caps right away? - dout(10) << "num_open_rd " << in->num_open_rd << " num_open_wr " << in->num_open_wr << dendl; - - if (g_conf.client_oc) { - // caching on. - if (in->num_open_rd == 0 && in->num_open_wr == 0) { - dout(20) << "calling empty" << dendl; - in->fc.empty(new C_Client_CloseRelease(this, in)); - } - else if (in->num_open_rd == 0) { - dout(20) << "calling release" << dendl; - in->fc.release_clean(); - close_release(in); - } - else if (in->num_open_wr == 0) { - dout(20) << "calling flush dirty" << dendl; - in->fc.flush_dirty(new C_Client_CloseRelease(this,in)); - } - - // pin until safe? - if (in->num_open_wr == 0 && !in->fc.all_safe()) { - dout(10) << "pinning ino " << in->ino() << " until safe" << dendl; - in->get(); - in->fc.add_safe_waiter(new C_Client_CloseSafe(this, in)); - } - } else { - // caching off. - if (in->num_open_rd == 0 && in->num_open_wr == 0) { - dout(10) << " releasing caps on " << in->ino() << dendl; - release_caps(in); // release caps now. 
- } - } - - put_inode( in ); - return 0; -} - - - -// ------------ -// read, write - - -off_t Client::lseek(int fd, off_t offset, int whence) -{ - Mutex::Locker lock(client_lock); - tout << "lseek" << std::endl; - tout << fd << std::endl; - tout << offset << std::endl; - tout << whence << std::endl; - - assert(fd_map.count(fd)); - Fh *f = fd_map[fd]; - Inode *in = f->inode; - - switch (whence) { - case SEEK_SET: - f->pos = offset; - break; - - case SEEK_CUR: - f->pos += offset; - break; - - case SEEK_END: - f->pos = in->inode.size + offset; - break; - - default: - assert(0); - } - - off_t pos = f->pos; - - dout(3) << "lseek(" << fd << ", " << offset << ", " << whence << ") = " << pos << dendl; - return pos; -} - - - -void Client::lock_fh_pos(Fh *f) -{ - dout(10) << "lock_fh_pos " << f << dendl; - - if (f->pos_locked || !f->pos_waiters.empty()) { - Cond cond; - f->pos_waiters.push_back(&cond); - dout(10) << "lock_fh_pos BLOCKING on " << f << dendl; - while (f->pos_locked || f->pos_waiters.front() != &cond) - cond.Wait(client_lock); - dout(10) << "lock_fh_pos UNBLOCKING on " << f << dendl; - assert(f->pos_waiters.front() == &cond); - f->pos_waiters.pop_front(); - } - - f->pos_locked = true; -} - -void Client::unlock_fh_pos(Fh *f) -{ - dout(10) << "unlock_fh_pos " << f << dendl; - f->pos_locked = false; -} - - - -//char *hackbuf = 0; - - -// blocking osd interface - -int Client::read(int fd, char *buf, off_t size, off_t offset) -{ - Mutex::Locker lock(client_lock); - tout << "read" << std::endl; - tout << fd << std::endl; - tout << size << std::endl; - tout << offset << std::endl; - - assert(fd_map.count(fd)); - Fh *f = fd_map[fd]; - bufferlist bl; - int r = _read(f, offset, size, &bl); - dout(3) << "read(" << fd << ", " << buf << ", " << size << ", " << offset << ") = " << r << dendl; - if (r >= 0) { - bl.copy(0, bl.length(), buf); - r = bl.length(); - } - return r; -} - -int Client::_read(Fh *f, off_t offset, off_t size, bufferlist *bl) -{ - Inode *in = f->inode; - 
- bool movepos = false; - if (offset < 0) { - lock_fh_pos(f); - offset = f->pos; - movepos = true; - } - - bool lazy = f->mode == FILE_MODE_LAZY; - - // determine whether read range overlaps with file - // ...ONLY if we're doing async io - if (!lazy && (in->file_caps() & (CAP_FILE_WRBUFFER|CAP_FILE_RDCACHE))) { - // we're doing buffered i/o. make sure we're inside the file. - // we can trust size info bc we get accurate info when buffering/caching caps are issued. - dout(10) << "file size: " << in->inode.size << dendl; - if (offset > 0 && offset >= in->inode.size) { - if (movepos) unlock_fh_pos(f); - return 0; - } - if (offset + size > (off_t)in->inode.size) - size = (off_t)in->inode.size - offset; - - if (size == 0) { - dout(10) << "read is size=0, returning 0" << dendl; - if (movepos) unlock_fh_pos(f); - return 0; - } - } else { - // unbuffered, synchronous file i/o. - // or lazy. - // defer to OSDs for file bounds. - } - - int r = 0; - int rvalue = 0; - - if (g_conf.client_oc) { - // object cache ON - rvalue = r = in->fc.read(offset, size, *bl, client_lock); // may block. - - /* - if (in->inode.ino == 0x10000000075 && hackbuf) { - int s = MIN(size, bl->length()); - char *v = bl->c_str(); - for (int a=0; afile_caps() & CAP_FILE_RD) == 0) { - dout(7) << " don't have read cap, waiting" << dendl; - Cond cond; - in->waitfor_read.push_back(&cond); - cond.Wait(client_lock); - } - // lazy cap? - while (lazy && (in->file_caps() & CAP_FILE_LAZYIO) == 0) { - dout(7) << " don't have lazy cap, waiting" << dendl; - Cond cond; - in->waitfor_lazy.push_back(&cond); - cond.Wait(client_lock); - } - - // do sync read - Cond cond; - bool done = false; - C_Cond *onfinish = new C_Cond(&cond, &done, &rvalue); - - Objecter::OSDRead *rd = filer->prepare_read(in->inode, offset, size, bl); - if (in->hack_balance_reads || - g_conf.client_hack_balance_reads) - rd->balance_reads = true; - r = objecter->readx(rd, onfinish); - assert(r >= 0); - - // wait! 
- while (!done) - cond.Wait(client_lock); - } - - if (movepos) { - // adjust fd pos - f->pos = offset+bl->length(); - unlock_fh_pos(f); - } - - // done! - return rvalue; -} - - - -/* - * hack -- - * until we properly implement synchronous writes wrt buffer cache, - * make sure we delay shutdown until they're all safe on disk! - */ -class C_Client_HackUnsafe : public Context { - Client *cl; -public: - C_Client_HackUnsafe(Client *c) : cl(c) {} - void finish(int) { - cl->hack_sync_write_safe(); - } -}; - -void Client::hack_sync_write_safe() -{ - client_lock.Lock(); - assert(unsafe_sync_write > 0); - unsafe_sync_write--; - dout(15) << "hack_sync_write_safe unsafe_sync_write = " << unsafe_sync_write << dendl; - if (unsafe_sync_write == 0 && unmounting) { - dout(10) << "hack_sync_write_safe -- no more unsafe writes, unmount can proceed" << dendl; - mount_cond.Signal(); - } - client_lock.Unlock(); -} - -int Client::write(int fd, const char *buf, off_t size, off_t offset) -{ - Mutex::Locker lock(client_lock); - tout << "write" << std::endl; - tout << fd << std::endl; - tout << size << std::endl; - tout << offset << std::endl; - - assert(fd_map.count(fd)); - Fh *fh = fd_map[fd]; - int r = _write(fh, offset, size, buf); - dout(3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl; - return r; -} - - -int Client::_write(Fh *f, off_t offset, off_t size, const char *buf) -{ - //dout(7) << "write fh " << fh << " size " << size << " offset " << offset << dendl; - Inode *in = f->inode; - - // use/adjust fd pos? - if (offset < 0) { - lock_fh_pos(f); - offset = f->pos; - f->pos = offset+size; - unlock_fh_pos(f); - } - - bool lazy = f->mode == FILE_MODE_LAZY; - - dout(10) << "cur file size is " << in->inode.size << " wr size " << in->file_wr_size << dendl; - - // time it. 
- utime_t start = g_clock.real_now(); - - // copy into fresh buffer (since our write may be resub, async) - bufferptr bp; - if (size > 0) bp = buffer::copy(buf, size); - bufferlist blist; - blist.push_back( bp ); - - if (g_conf.client_oc) { // buffer cache ON? - assert(objectcacher); - - // write (this may block!) - in->fc.write(offset, size, blist, client_lock); - - } else { - // legacy, inconsistent synchronous write. - dout(7) << "synchronous write" << dendl; - - // do we have write file cap? - while (!lazy && (in->file_caps() & CAP_FILE_WR) == 0) { - dout(7) << " don't have write cap, waiting" << dendl; - Cond cond; - in->waitfor_write.push_back(&cond); - cond.Wait(client_lock); - } - while (lazy && (in->file_caps() & CAP_FILE_LAZYIO) == 0) { - dout(7) << " don't have lazy cap, waiting" << dendl; - Cond cond; - in->waitfor_lazy.push_back(&cond); - cond.Wait(client_lock); - } - - // prepare write - Cond cond; - bool done = false; - C_Cond *onfinish = new C_Cond(&cond, &done); - C_Client_HackUnsafe *onsafe = new C_Client_HackUnsafe(this); - unsafe_sync_write++; - in->sync_writes++; - - dout(20) << " sync write start " << onfinish << dendl; - - filer->write(in->inode, offset, size, blist, 0, - onfinish, onsafe - //, 1+((int)g_clock.now()) / 10 //f->pos // hack hack test osd revision snapshots - ); - - while (!done) { - cond.Wait(client_lock); - dout(20) << " sync write bump " << onfinish << dendl; - } - - in->sync_writes--; - if (in->sync_writes == 0 && - !in->waitfor_no_write.empty()) { - for (list::iterator i = in->waitfor_no_write.begin(); - i != in->waitfor_no_write.end(); - i++) - (*i)->finish(0); - in->waitfor_no_write.clear(); - } - - dout(20) << " sync write done " << onfinish << dendl; - } - - // time - utime_t lat = g_clock.real_now(); - lat -= start; - if (client_logger) { - client_logger->finc("wrlsum",(double)lat); - client_logger->inc("wrlnum"); - } - - // assume success for now. FIXME. - off_t totalwritten = size; - - // extend file? 
- if (totalwritten + offset > in->inode.size) { - in->inode.size = in->file_wr_size = totalwritten + offset; - dout(7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl; - } else { - dout(7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->inode.size << dendl; - } - - // mtime - in->file_wr_mtime = in->inode.mtime = g_clock.real_now(); - - // ok! - return totalwritten; -} - -int Client::_flush(Fh *f) -{ - // no-op, for now. hrm. - return 0; -} - - -int Client::truncate(const char *relpath, off_t length) -{ - Mutex::Locker lock(client_lock); - tout << "truncate" << std::endl; - tout << relpath << std::endl; - tout << length << std::endl; - - string path; - mkabspath(relpath, path); - return _truncate(path.c_str(), length); -} - -int Client::_truncate(const char *file, off_t length) -{ - MClientRequest *req = new MClientRequest(MDS_OP_TRUNCATE, messenger->get_myinst()); - req->set_path(file); - req->args.truncate.length = length; - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - - dout(3) << "truncate(\"" << file << "\", " << length << ") = " << res << dendl; - return res; -} - -int Client::ftruncate(int fd, off_t length) -{ - Mutex::Locker lock(client_lock); - tout << "ftruncate" << std::endl; - tout << fd << std::endl; - tout << length << std::endl; - - assert(fd_map.count(fd)); - Fh *f = fd_map[fd]; - return _ftruncate(f, length); -} - -int Client::_ftruncate(Fh *fh, off_t length) -{ - MClientRequest *req = new MClientRequest(MDS_OP_TRUNCATE, messenger->get_myinst()); - req->args.truncate.ino = fh->inode->inode.ino; - req->args.truncate.length = length; - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req); - int res = 
reply->get_result(); - insert_trace(reply); - delete reply; - - dout(3) << "ftruncate(\"" << fh << "\", " << length << ") = " << res << dendl; - return res; -} - - -int Client::fsync(int fd, bool syncdataonly) -{ - Mutex::Locker lock(client_lock); - tout << "fsync" << std::endl; - tout << fd << std::endl; - tout << syncdataonly << std::endl; - - assert(fd_map.count(fd)); - Fh *f = fd_map[fd]; - int r = _fsync(f, syncdataonly); - dout(3) << "fsync(" << fd << ", " << syncdataonly << ") = " << r << dendl; - return r; -} - -int Client::_fsync(Fh *f, bool syncdataonly) -{ - int r = 0; - - Inode *in = f->inode; - - // metadata? - if (!syncdataonly) { - dout(0) << "fsync - not syncing metadata yet.. implement me" << dendl; - } - - // data? - Cond cond; - bool done = false; - if (!objectcacher->commit_set(in->ino(), - new C_Cond(&cond, &done))) { - // wait for callback - while (!done) cond.Wait(client_lock); - } - return r; -} - - -// not written yet, but i want to link! - -int Client::chdir(const char *path) -{ - Mutex::Locker lock(client_lock); - tout << "chdir" << std::endl; - tout << path << std::endl; - - // fake it for now! 
- string abs; - mkabspath(path, abs); - dout(3) << "chdir " << path << " -> cwd now " << abs << dendl; - cwd = abs; - return 0; -} - -int Client::statfs(const char *path, struct statvfs *stbuf) -{ - Mutex::Locker lock(client_lock); - tout << "statfs" << std::endl; - return _statfs(stbuf); -} - -int Client::ll_statfs(inodeno_t ino, struct statvfs *stbuf) -{ - Mutex::Locker lock(client_lock); - tout << "ll_statfs" << std::endl; - return _statfs(stbuf); -} - -int Client::_statfs(struct statvfs *stbuf) -{ - dout(3) << "_statfs" << dendl; - - Cond cond; - tid_t tid = ++last_tid; - StatfsRequest *req = new StatfsRequest(tid, &cond); - statfs_requests[tid] = req; - - int mon = monmap->pick_mon(); - messenger->send_message(new MStatfs(req->tid), monmap->get_inst(mon)); - - while (req->reply == 0) - cond.Wait(client_lock); - - // yay - memcpy(stbuf, &req->reply->stfs, sizeof(*stbuf)); - - statfs_requests.erase(req->tid); - delete req->reply; - delete req; - - int r = 0; - dout(3) << "_statfs = " << r << dendl; - return r; -} - -void Client::handle_statfs_reply(MStatfsReply *reply) -{ - if (statfs_requests.count(reply->tid) && - statfs_requests[reply->tid]->reply == 0) { - dout(10) << "handle_statfs_reply " << *reply << ", kicking waiter" << dendl; - statfs_requests[reply->tid]->reply = reply; - statfs_requests[reply->tid]->caller_cond->Signal(); - } else { - dout(10) << "handle_statfs_reply " << *reply << ", dup or old, dropping" << dendl; - delete reply; - } -} - - -int Client::lazyio_propogate(int fd, off_t offset, size_t count) -{ - client_lock.Lock(); - dout(3) << "op: client->lazyio_propogate(" << fd - << ", " << offset << ", " << count << ")" << dendl; - - assert(fd_map.count(fd)); - Fh *f = fd_map[fd]; - Inode *in = f->inode; - - if (f->mode & FILE_MODE_LAZY) { - // wait for lazy cap - while ((in->file_caps() & CAP_FILE_LAZYIO) == 0) { - dout(7) << " don't have lazy cap, waiting" << dendl; - Cond cond; - in->waitfor_lazy.push_back(&cond); - cond.Wait(client_lock); - 
} - - if (g_conf.client_oc) { - Cond cond; - bool done = false; - in->fc.flush_dirty(new C_SafeCond(&client_lock, &cond, &done)); - - while (!done) - cond.Wait(client_lock); - - } else { - // mmm, nothin to do. - } - } - - client_lock.Unlock(); - return 0; -} - -int Client::lazyio_synchronize(int fd, off_t offset, size_t count) -{ - client_lock.Lock(); - dout(3) << "op: client->lazyio_synchronize(" << fd - << ", " << offset << ", " << count << ")" << dendl; - - assert(fd_map.count(fd)); - Fh *f = fd_map[fd]; - Inode *in = f->inode; - - if (f->mode & FILE_MODE_LAZY) { - // wait for lazy cap - while ((in->file_caps() & CAP_FILE_LAZYIO) == 0) { - dout(7) << " don't have lazy cap, waiting" << dendl; - Cond cond; - in->waitfor_lazy.push_back(&cond); - cond.Wait(client_lock); - } - - if (g_conf.client_oc) { - in->fc.flush_dirty(0); // flush to invalidate. - in->fc.release_clean(); - } else { - // mm, nothin to do. - } - } - - client_lock.Unlock(); - return 0; -} - - - - -// ========================================= -// low level - -// ugly hack for ll -#define FUSE_SET_ATTR_MODE (1 << 0) -#define FUSE_SET_ATTR_UID (1 << 1) -#define FUSE_SET_ATTR_GID (1 << 2) -#define FUSE_SET_ATTR_SIZE (1 << 3) -#define FUSE_SET_ATTR_ATIME (1 << 4) -#define FUSE_SET_ATTR_MTIME (1 << 5) - -int Client::ll_lookup(inodeno_t parent, const char *name, struct stat *attr) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_lookup " << parent << " " << name << dendl; - tout << "ll_lookup" << std::endl; - tout << parent.val << std::endl; - tout << name << std::endl; - - string dname = name; - Inode *diri = 0; - Inode *in = 0; - int r = 0; - - if (inode_map.count(parent) == 0) { - dout(1) << "ll_lookup " << parent << " " << name << " -> ENOENT (parent DNE... WTF)" << dendl; - r = -ENOENT; - attr->st_ino = 0; - goto out; - } - diri = inode_map[parent]; - if (!diri->inode.is_dir()) { - dout(1) << "ll_lookup " << parent << " " << name << " -> ENOTDIR (parent not a dir... 
WTF)" << dendl; - r = -ENOTDIR; - attr->st_ino = 0; - goto out; - } - - // get the inode - if (diri->dir && - diri->dir->dentries.count(dname)) { - Dentry *dn = diri->dir->dentries[dname]; - touch_dn(dn); - in = dn->inode; - } else { - string path; - diri->make_path(path); - path += "/"; - path += name; - _do_lstat(path.c_str(), 0, &in); - } - if (in) { - fill_stat(in, attr); - _ll_get(in); - } else { - r = -ENOENT; - attr->st_ino = 0; - } - - out: - dout(3) << "ll_lookup " << parent << " " << name - << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl; - tout << attr->st_ino << std::endl; - return r; -} - -void Client::_ll_get(Inode *in) -{ - if (in->ll_ref == 0) - in->get(); - in->ll_get(); - dout(20) << "_ll_get " << in << " " << in->inode.ino << " -> " << in->ll_ref << dendl; -} - -int Client::_ll_put(Inode *in, int num) -{ - in->ll_put(num); - dout(20) << "_ll_put " << in << " " << in->inode.ino << " " << num << " -> " << in->ll_ref << dendl; - if (in->ll_ref == 0) { - put_inode(in); - return 0; - } else { - return in->ll_ref; - } -} - -void Client::_ll_drop_pins() -{ - dout(10) << "_ll_drop_pins" << dendl; - hash_map::iterator next; - for (hash_map::iterator it = inode_map.begin(); - it != inode_map.end(); - it = next) { - Inode *in = it->second; - next = it; - next++; - if (in->ll_ref) - _ll_put(in, in->ll_ref); - } -} - -bool Client::ll_forget(inodeno_t ino, int num) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_forget " << ino << " " << num << dendl; - tout << "ll_forget" << std::endl; - tout << ino.val << std::endl; - tout << num << std::endl; - - if (ino == 1) return true; // ignore forget on root. 
- - bool last = false; - if (inode_map.count(ino) == 0) { - dout(1) << "WARNING: ll_forget on " << ino << " " << num - << ", which I don't have" << dendl; - } else { - Inode *in = inode_map[ino]; - assert(in); - if (in->ll_ref < num) { - dout(1) << "WARNING: ll_forget on " << ino << " " << num << ", which only has ll_ref=" << in->ll_ref << dendl; - _ll_put(in, in->ll_ref); - last = true; - } else { - if (_ll_put(in, num) == 0) - last = true; - } - } - return last; -} - -Inode *Client::_ll_get_inode(inodeno_t ino) -{ - if (inode_map.count(ino) == 0) { - assert(ino == 1); // must be the root inode. - Inode *in; - int r = _do_lstat("/", 0, &in); - assert(r >= 0); - return in; - } else { - return inode_map[ino]; - } -} - - -int Client::ll_getattr(inodeno_t ino, struct stat *attr) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_getattr " << ino << dendl; - tout << "ll_getattr" << std::endl; - tout << ino.val << std::endl; - - Inode *in = _ll_get_inode(ino); - fill_stat(in, attr); - return 0; -} - -int Client::ll_setattr(inodeno_t ino, struct stat *attr, int mask) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_setattr " << ino << " mask " << hex << mask << dec << dendl; - tout << "ll_setattr" << std::endl; - tout << ino.val << std::endl; - tout << attr->st_mode << std::endl; - tout << attr->st_uid << std::endl; - tout << attr->st_gid << std::endl; - tout << attr->st_size << std::endl; - tout << attr->st_mtime << std::endl; - tout << attr->st_atime << std::endl; - tout << mask << std::endl; - - Inode *in = _ll_get_inode(ino); - - string path; - in->make_path(path); - - int r = 0; - if ((mask & FUSE_SET_ATTR_MODE) && - ((r = _chmod(path.c_str(), attr->st_mode)) < 0)) return r; - - if ((mask & FUSE_SET_ATTR_UID) && (mask & FUSE_SET_ATTR_GID) && - ((r = _chown(path.c_str(), attr->st_uid, attr->st_gid)) < 0)) return r; - //if ((mask & FUSE_SET_ATTR_GID) && - //(r = client->_chgrp(path.c_str(), attr->st_gid) < 0)) return r; - - if ((mask & FUSE_SET_ATTR_SIZE) && 
- ((r = _truncate(path.c_str(), attr->st_size)) < 0)) return r; - - if ((mask & FUSE_SET_ATTR_MTIME) && (mask & FUSE_SET_ATTR_ATIME)) { - if ((r = _utimes(path.c_str(), utime_t(attr->st_mtime,0), utime_t(attr->st_atime,0))) < 0) return r; - } else if (mask & FUSE_SET_ATTR_MTIME) { - if ((r = _utimes(path.c_str(), utime_t(attr->st_mtime,0), utime_t())) < 0) return r; - } else if (mask & FUSE_SET_ATTR_ATIME) { - if ((r = _utimes(path.c_str(), utime_t(), utime_t(attr->st_atime,0))) < 0) return r; - } - - assert(r == 0); - fill_stat(in, attr); - - dout(3) << "ll_setattr " << ino << " = " << r << dendl; - return 0; -} - -int Client::ll_readlink(inodeno_t ino, const char **value) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_readlink " << ino << dendl; - tout << "ll_readlink" << std::endl; - tout << ino.val << std::endl; - - Inode *in = _ll_get_inode(ino); - if (in->dn) touch_dn(in->dn); - - int r = 0; - if (in->inode.is_symlink()) { - *value = in->symlink->c_str(); - } else { - *value = ""; - r = -EINVAL; - } - dout(3) << "ll_readlink " << ino << " = " << r << " (" << *value << ")" << dendl; - return r; -} - -int Client::ll_mknod(inodeno_t parent, const char *name, mode_t mode, dev_t rdev, struct stat *attr) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_mknod " << parent << " " << name << dendl; - tout << "ll_mknod" << std::endl; - tout << parent.val << std::endl; - tout << name << std::endl; - tout << mode << std::endl; - tout << rdev << std::endl; - - Inode *diri = _ll_get_inode(parent); - - string path; - diri->make_path(path); - path += "/"; - path += name; - int r = _mknod(path.c_str(), mode, rdev); - if (r == 0) { - string dname(name); - Inode *in = diri->dir->dentries[dname]->inode; - fill_stat(in, attr); - _ll_get(in); - } - tout << attr->st_ino << std::endl; - dout(3) << "ll_mknod " << parent << " " << name - << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl; - return r; -} - -int Client::ll_mkdir(inodeno_t parent, const char 
*name, mode_t mode, struct stat *attr) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_mkdir " << parent << " " << name << dendl; - tout << "ll_mkdir" << std::endl; - tout << parent.val << std::endl; - tout << name << std::endl; - tout << mode << std::endl; - - Inode *diri = _ll_get_inode(parent); - - string path; - diri->make_path(path); - path += "/"; - path += name; - int r = _mkdir(path.c_str(), mode); - if (r == 0) { - string dname(name); - Inode *in = diri->dir->dentries[dname]->inode; - fill_stat(in, attr); - _ll_get(in); - } - tout << attr->st_ino << std::endl; - dout(3) << "ll_mkdir " << parent << " " << name - << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl; - return r; -} - -int Client::ll_symlink(inodeno_t parent, const char *name, const char *value, struct stat *attr) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_symlink " << parent << " " << name << " -> " << value << dendl; - tout << "ll_symlink" << std::endl; - tout << parent.val << std::endl; - tout << name << std::endl; - tout << value << std::endl; - - Inode *diri = _ll_get_inode(parent); - - string path; - diri->make_path(path); - path += "/"; - path += name; - int r = _symlink(value, path.c_str()); - if (r == 0) { - string dname(name); - Inode *in = diri->dir->dentries[dname]->inode; - fill_stat(in, attr); - _ll_get(in); - } - tout << attr->st_ino << std::endl; - dout(3) << "ll_symlink " << parent << " " << name - << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl; - return r; -} - -int Client::ll_unlink(inodeno_t ino, const char *name) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_unlink " << ino << " " << name << dendl; - tout << "ll_unlink" << std::endl; - tout << ino.val << std::endl; - tout << name << std::endl; - - Inode *diri = _ll_get_inode(ino); - - string path; - diri->make_path(path); - path += "/"; - path += name; - return _unlink(path.c_str()); -} - -int Client::ll_rmdir(inodeno_t ino, const char *name) -{ - 
Mutex::Locker lock(client_lock); - dout(3) << "ll_rmdir " << ino << " " << name << dendl; - tout << "ll_rmdir" << std::endl; - tout << ino.val << std::endl; - tout << name << std::endl; - - Inode *diri = _ll_get_inode(ino); - - string path; - diri->make_path(path); - path += "/"; - path += name; - return _rmdir(path.c_str()); -} - -int Client::ll_rename(inodeno_t parent, const char *name, inodeno_t newparent, const char *newname) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_rename " << parent << " " << name << " to " - << newparent << " " << newname << dendl; - tout << "ll_rename" << std::endl; - tout << parent.val << std::endl; - tout << name << std::endl; - tout << newparent.val << std::endl; - tout << newname << std::endl; - - Inode *diri = _ll_get_inode(parent); - string path; - diri->make_path(path); - path += "/"; - path += name; - - Inode *newdiri = _ll_get_inode(newparent); - string newpath; - newdiri->make_path(newpath); - newpath += "/"; - newpath += newname; - - return _rename(path.c_str(), newpath.c_str()); -} - -int Client::ll_link(inodeno_t ino, inodeno_t newparent, const char *newname, struct stat *attr) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_link " << ino << " to " << newparent << " " << newname << dendl; - tout << "ll_link" << std::endl; - tout << ino.val << std::endl; - tout << newparent << std::endl; - tout << newname << std::endl; - - Inode *old = _ll_get_inode(ino); - Inode *diri = _ll_get_inode(newparent); - - string path; - old->make_path(path); - - string newpath; - diri->make_path(newpath); - newpath += "/"; - newpath += newname; - - int r = _link(path.c_str(), newpath.c_str()); - if (r == 0) { - Inode *in = _ll_get_inode(ino); - fill_stat(in, attr); - _ll_get(in); - } - return r; -} - -int Client::ll_opendir(inodeno_t ino, void **dirpp) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_opendir " << ino << dendl; - tout << "ll_opendir" << std::endl; - tout << ino.val << std::endl; - - Inode *diri = 
inode_map[ino]; - assert(diri); - string path; - diri->make_path(path); - - int r = _opendir(path.c_str(), (DirResult**)dirpp); - - tout << (unsigned long)*dirpp << std::endl; - - dout(3) << "ll_opendir " << ino << " = " << r << " (" << *dirpp << ")" << dendl; - return r; -} - -void Client::ll_releasedir(void *dirp) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_releasedir " << dirp << dendl; - tout << "ll_releasedir" << std::endl; - tout << (unsigned long)dirp << std::endl; - _closedir((DirResult*)dirp); -} - -int Client::ll_open(inodeno_t ino, int flags, Fh **fhp) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_open " << ino << " " << flags << dendl; - tout << "ll_open" << std::endl; - tout << ino.val << std::endl; - tout << flags << std::endl; - - Inode *in = _ll_get_inode(ino); - string path; - in->make_path(path); - - int r = _open(path.c_str(), flags, 0, fhp); - - tout << (unsigned long)*fhp << std::endl; - dout(3) << "ll_open " << ino << " " << flags << " = " << r << " (" << *fhp << ")" << dendl; - return r; -} - -int Client::ll_create(inodeno_t parent, const char *name, mode_t mode, int flags, - struct stat *attr, Fh **fhp) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_create " << parent << " " << name << " 0" << oct << mode << dec << " " << flags << dendl; - tout << "ll_create" << std::endl; - tout << parent.val << std::endl; - tout << name << std::endl; - tout << mode << std::endl; - tout << flags << std::endl; - - Inode *pin = _ll_get_inode(parent); - string path; - pin->make_path(path); - path += "/"; - path += name; - - int r = _open(path.c_str(), flags|O_CREAT, mode, fhp); - if (r >= 0) { - Inode *in = (*fhp)->inode; - fill_stat(in, attr); - _ll_get(in); - } else { - attr->st_ino = 0; - } - tout << (unsigned long)*fhp << std::endl; - tout << attr->st_ino << std::endl; - dout(3) << "ll_create " << parent << " " << name << " 0" << oct << mode << dec << " " << flags - << " = " << r << " (" << *fhp << " " << hex << attr->st_ino << 
dec << ")" << dendl; - return 0; -} - -int Client::ll_read(Fh *fh, off_t off, off_t len, bufferlist *bl) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_read " << fh << " " << off << "~" << len << dendl; - tout << "ll_read" << std::endl; - tout << (unsigned long)fh << std::endl; - tout << off << std::endl; - tout << len << std::endl; - - return _read(fh, off, len, bl); -} - -int Client::ll_write(Fh *fh, off_t off, off_t len, const char *data) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_write " << fh << " " << off << "~" << len << dendl; - tout << "ll_write" << std::endl; - tout << (unsigned long)fh << std::endl; - tout << off << std::endl; - tout << len << std::endl; - - return _write(fh, off, len, data); -} - -int Client::ll_flush(Fh *fh) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_flush " << fh << dendl; - tout << "ll_flush" << std::endl; - tout << (unsigned long)fh << std::endl; - - return _flush(fh); -} - -int Client::ll_fsync(Fh *fh, bool syncdataonly) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_fsync " << fh << dendl; - tout << "ll_fsync" << std::endl; - tout << (unsigned long)fh << std::endl; - - return _fsync(fh, syncdataonly); -} - - -int Client::ll_release(Fh *fh) -{ - Mutex::Locker lock(client_lock); - dout(3) << "ll_release " << fh << dendl; - tout << "ll_release" << std::endl; - tout << (unsigned long)fh << std::endl; - - _release(fh); - return 0; -} - - - - - - -// ========================================= -// layout - - -int Client::describe_layout(int fd, FileLayout *lp) -{ - Mutex::Locker lock(client_lock); - - assert(fd_map.count(fd)); - Fh *f = fd_map[fd]; - Inode *in = f->inode; - - *lp = in->inode.layout; - - dout(3) << "describe_layout(" << fd << ") = 0" << dendl; - return 0; -} - -int Client::get_stripe_unit(int fd) -{ - FileLayout layout; - describe_layout(fd, &layout); - return layout.fl_stripe_unit; -} - -int Client::get_stripe_width(int fd) -{ - FileLayout layout; - describe_layout(fd, &layout); - 
return ceph_file_layout_stripe_width(layout); -} - -int Client::get_stripe_period(int fd) -{ - FileLayout layout; - describe_layout(fd, &layout); - return ceph_file_layout_period(layout); -} - -int Client::enumerate_layout(int fd, list& result, - off_t length, off_t offset) -{ - Mutex::Locker lock(client_lock); - - assert(fd_map.count(fd)); - Fh *f = fd_map[fd]; - Inode *in = f->inode; - - // map to a list of extents - filer->file_to_extents(in->inode, offset, length, result); - - dout(3) << "enumerate_layout(" << fd << ", " << length << ", " << offset << ") = 0" << dendl; - return 0; -} - - -void Client::ms_handle_failure(Message *m, const entity_inst_t& inst) -{ - entity_name_t dest = inst.name; - - if (dest.is_mon()) { - // resend to a different monitor. - int mon = monmap->pick_mon(true); - dout(0) << "ms_handle_failure " << *m << " to " << inst - << ", resending to mon" << mon - << dendl; - messenger->send_message(m, monmap->get_inst(mon)); - } - else if (dest.is_osd()) { - objecter->ms_handle_failure(m, dest, inst); - } - else if (dest.is_mds()) { - dout(0) << "ms_handle_failure " << *m << " to " << inst << dendl; - //failed_mds.insert(dest.num()); - } - else { - // client? - dout(0) << "ms_handle_failure " << *m << " to " << inst << ", dropping" << dendl; - delete m; - } -} - diff --git a/branches/sage/mds/client/Client.h b/branches/sage/mds/client/Client.h deleted file mode 100644 index fff9829c90ec5..0000000000000 --- a/branches/sage/mds/client/Client.h +++ /dev/null @@ -1,845 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __CLIENT_H -#define __CLIENT_H - - -#include "mds/MDSMap.h" -#include "osd/OSDMap.h" -#include "mon/MonMap.h" - -#include "msg/Message.h" -#include "msg/Dispatcher.h" -#include "msg/Messenger.h" - -#include "messages/MClientReply.h" - -#include "include/types.h" -#include "include/lru.h" -#include "include/filepath.h" -#include "include/interval_set.h" - -#include "common/Mutex.h" -#include "common/Timer.h" - -#include "FileCache.h" - - -// stl -#include -#include -#include -using std::set; -using std::map; -using std::fstream; - -#include -using namespace __gnu_cxx; - - - -class MStatfsReply; -class MClientSession; -class MClientRequest; -class MClientRequestForward; - -class Filer; -class Objecter; -class ObjectCacher; - -extern class LogType client_logtype; -extern class Logger *client_logger; - - - -// ============================================ -// types for my local metadata cache -/* basic structure: - - - Dentries live in an LRU loop. they get expired based on last access. - see include/lru.h. items can be bumped to "mid" or "top" of list, etc. - - Inode has ref count for each Fh, Dir, or Dentry that points to it. - - when Inode ref goes to 0, it's expired. - - when Dir is empty, it's removed (and it's Inode ref--) - -*/ - -class Dir; -class Inode; - -class Dentry : public LRUObject { - public: - string name; // sort of lame - //const char *name; - Dir *dir; - Inode *inode; - int ref; // 1 if there's a dir beneath me. 
- - void get() { assert(ref == 0); ref++; lru_pin(); } - void put() { assert(ref == 1); ref--; lru_unpin(); } - - Dentry() : dir(0), inode(0), ref(0) { } - - /*Dentry() : name(0), dir(0), inode(0), ref(0) { } - Dentry(string& n) : name(0), dir(0), inode(0), ref(0) { - name = new char[n.length()+1]; - strcpy((char*)name, n.c_str()); - } - ~Dentry() { - delete[] name; - }*/ -}; - -class Dir { - public: - Inode *parent_inode; // my inode - //hash_map, eqstr> dentries; - hash_map dentries; - - Dir(Inode* in) { parent_inode = in; } - - bool is_empty() { return dentries.empty(); } -}; - - -class InodeCap { - public: - int caps; - long seq; - InodeCap() : caps(0), seq(0) {} -}; - - -class Inode { - public: - inode_t inode; // the actual inode - utime_t valid_until; - int mask; - - // about the dir (if this is one!) - int dir_auth; - set dir_contacts; - bool dir_hashed, dir_replicated; - - // per-mds caps - map caps; // mds -> InodeCap - map stale_caps; // mds -> cap .. stale - - utime_t file_wr_mtime; // [writers] time of last write - off_t file_wr_size; // [writers] largest offset we've written to - int num_open_rd, num_open_wr, num_open_lazy; // num readers, writers - - int ref; // ref count. 1 for each dentry, fh that links to me. - int ll_ref; // separate ref count for ll client - Dir *dir; // if i'm a dir. - Dentry *dn; // if i'm linked to a dentry. 
- string *symlink; // symlink content, if it's a symlink - fragtree_t dirfragtree; - map fragmap; // known frag -> mds mappings - - // for caching i/o mode - FileCache fc; - - // for sync i/o mode - int sync_reads; // sync reads in progress - int sync_writes; // sync writes in progress - - list waitfor_write; - list waitfor_read; - list waitfor_lazy; - list waitfor_no_read, waitfor_no_write; - - // - bool hack_balance_reads; - // - - void make_path(string& p) { - if (dn) { - if (dn->dir && dn->dir->parent_inode) - dn->dir->parent_inode->make_path(p); - p += "/"; - p += dn->name; - } - } - - void get() { - ref++; - //cout << "inode.get on " << this << " " << hex << inode.ino << dec << " now " << ref << endl; - } - void put(int n=1) { - ref -= n; assert(ref >= 0); - //cout << "inode.put on " << this << " " << hex << inode.ino << dec << " now " << ref << endl; - } - - void ll_get() { - ll_ref++; - } - void ll_put(int n=1) { - assert(ll_ref >= n); - ll_ref -= n; - } - - Inode(inode_t _inode, ObjectCacher *_oc) : - inode(_inode), - valid_until(0, 0), - dir_auth(-1), dir_hashed(false), dir_replicated(false), - file_wr_mtime(0, 0), file_wr_size(0), - num_open_rd(0), num_open_wr(0), num_open_lazy(0), - ref(0), ll_ref(0), - dir(0), dn(0), symlink(0), - fc(_oc, _inode), - sync_reads(0), sync_writes(0), - hack_balance_reads(false) - { } - ~Inode() { - if (symlink) { delete symlink; symlink = 0; } - } - - inodeno_t ino() { return inode.ino; } - - bool is_dir() { return inode.is_dir(); } - - int file_caps() { - int c = 0; - for (map::iterator it = caps.begin(); - it != caps.end(); - it++) - c |= it->second.caps; - for (map::iterator it = stale_caps.begin(); - it != stale_caps.end(); - it++) - c |= it->second.caps; - return c; - } - - int file_caps_wanted() { - int w = 0; - if (num_open_rd) w |= CAP_FILE_RD|CAP_FILE_RDCACHE; - if (num_open_wr) w |= CAP_FILE_WR|CAP_FILE_WRBUFFER; - if (num_open_lazy) w |= CAP_FILE_LAZYIO; - if (fc.is_dirty()) w |= CAP_FILE_WRBUFFER; - if 
(fc.is_cached()) w |= CAP_FILE_RDCACHE; - return w; - } - - void add_open(int cmode) { - if (cmode & FILE_MODE_R) num_open_rd++; - if (cmode & FILE_MODE_W) num_open_wr++; - if (cmode & FILE_MODE_LAZY) num_open_lazy++; - } - void sub_open(int cmode) { - if (cmode & FILE_MODE_R) num_open_rd--; - if (cmode & FILE_MODE_W) num_open_wr--; - if (cmode & FILE_MODE_LAZY) num_open_lazy--; - } - - int authority(const string& dname) { - if (!dirfragtree.empty()) { - __gnu_cxx::hash H; - frag_t fg = dirfragtree[H(dname)]; - while (fg != frag_t()) { - if (fragmap.count(fg) && - fragmap[fg] >= 0) { - //cout << "picked frag ino " << inode.ino << " dname " << dname << " fg " << fg << " mds" << fragmap[fg] << std::endl; - return fragmap[fg]; - } - fg = frag_t(fg.value(), fg.bits()-1); // try more general... - } - } - return authority(); - } - - int authority() { - if (dir_auth >= 0) - return dir_auth; - - assert(dn); - return dn->dir->parent_inode->authority(dn->name); - } - - - int pick_replica(MDSMap *mdsmap) { - // replicas? 
- /* fixme - if (//ino() > 1ULL && - dir_contacts.size()) { - set::iterator it = dir_contacts.begin(); - if (dir_contacts.size() == 1) - return *it; - else { - //cout << "dir_contacts on " << inode.ino << " is " << dir_contacts << std::endl; - int r = 1 + (rand() % dir_contacts.size()); - int a = authority(); - while (r--) { - it++; - if (mdsmap->is_down(*it)) it++; - if (it == dir_contacts.end()) it = dir_contacts.begin(); - if (*it == a) it++; // skip the authority - if (it == dir_contacts.end()) it = dir_contacts.begin(); - } - return *it; - } - } - */ - - if (dir_replicated) {// || ino() == 1) { - // pick a random mds that isn't the auth - set s; - mdsmap->get_in_mds_set(s); - set::iterator it = s.begin(); - if (s.empty()) - return 0; - if (s.size() == 1) - return *it; - else { - //cout << "dir_contacts on " << inode.ino << " is " << dir_contacts << std::endl; - int r = 1 + (rand() % s.size()); - int a = authority(); - while (r--) { - it++; - if (mdsmap->is_down(*it)) it++; - if (it == s.end()) it = s.begin(); - if (*it == a) it++; // skip the authority - if (it == s.end()) it = s.begin(); - } - //if (inode.ino == 1) cout << "chose " << *it << " from " << s << std::endl; - return *it; - } - //cout << "num_mds is " << mdcluster->get_num_mds() << endl; - //return mdsmap->get_random_in_mds(); - //return rand() % mdsmap->get_num_mds(); // huh.. pick a random mds! - } - else - return authority(); - } - - - // open Dir for an inode. if it's not open, allocated it (and pin dentry in memory). 
- Dir *open_dir() { - if (!dir) { - if (dn) dn->get(); // pin dentry - get(); // pin inode - dir = new Dir(this); - } - return dir; - } - -}; - - - - -// file handle for any open file state - -struct Fh { - Inode *inode; - off_t pos; - int mds; // have to talk to mds we opened with (for now) - int mode; // the mode i opened the file with - - bool is_lazy() { return mode & O_LAZY; } - - bool pos_locked; // pos is currently in use - list pos_waiters; // waiters for pos - - Fh() : inode(0), pos(0), mds(0), mode(0), pos_locked(false) {} -}; - - - - - -// ======================================================== -// client interface - -class Client : public Dispatcher { - public: - - /* getdir result */ - struct DirEntry { - string d_name; - struct stat st; - int stmask; - DirEntry(const string &s) : d_name(s), stmask(0) {} - DirEntry(const string &n, struct stat& s, int stm) : d_name(n), st(s), stmask(stm) {} - }; - - struct DirResult { - static const int SHIFT = 28; - static const int64_t MASK = (1 << SHIFT) - 1; - static const off_t END = 1ULL << (SHIFT + 32); - - string path; - Inode *inode; - int64_t offset; // high bits: frag_t, low bits: an offset - map > buffer; - - DirResult(const char *p, Inode *in=0) : path(p), inode(in), offset(0) { - if (inode) inode->get(); - } - DirResult(const string &p, Inode *in=0) : path(p), inode(in), offset(0) { - if (inode) inode->get(); - } - - frag_t frag() { return frag_t(offset >> SHIFT); } - unsigned fragpos() { return offset & MASK; } - - void next_frag() { - frag_t fg = offset >> SHIFT; - if (fg.is_rightmost()) - set_end(); - else - set_frag(fg.next()); - } - void set_frag(frag_t f) { - offset = (uint64_t)f << SHIFT; - assert(sizeof(offset) == 8); - } - void set_end() { offset = END; } - bool at_end() { return (offset == END); } - }; - - - // cluster descriptors - MDSMap *mdsmap; - OSDMap *osdmap; - - SafeTimer timer; - - protected: - Messenger *messenger; - int whoami; - MonMap *monmap; - - // mds sessions - map 
mds_sessions; // mds -> push seq - map > waiting_for_session; - list waiting_for_mdsmap; - - void handle_client_session(MClientSession *m); - void send_reconnect(int mds); - - // mds requests - struct MetaRequest { - tid_t tid; - MClientRequest *request; - bufferlist request_payload; // in case i have to retry - - bool idempotent; // is request idempotent? - set mds; // who i am asking - int resend_mds; // someone wants you to (re)send the request here - int num_fwd; // # of times i've been forwarded - int retry_attempt; - - MClientReply *reply; // the reply - - Cond *caller_cond; // who to take up - Cond *dispatch_cond; // who to kick back - - MetaRequest(MClientRequest *req, tid_t t) : - tid(t), request(req), - idempotent(false), resend_mds(-1), num_fwd(0), retry_attempt(0), - reply(0), - caller_cond(0), dispatch_cond(0) { } - }; - tid_t last_tid; - map mds_requests; - set failed_mds; - - struct StatfsRequest { - tid_t tid; - MStatfsReply *reply; - Cond *caller_cond; - StatfsRequest(tid_t t, Cond *cc) : tid(t), reply(0), caller_cond(cc) {} - }; - map statfs_requests; - - MClientReply *make_request(MClientRequest *req, int use_auth=-1); - int choose_target_mds(MClientRequest *req); - void send_request(MetaRequest *request, int mds); - void kick_requests(int mds); - void handle_client_request_forward(MClientRequestForward *reply); - void handle_client_reply(MClientReply *reply); - void handle_statfs_reply(MStatfsReply *reply); - - bool mounted; - bool unmounting; - Cond mount_cond; - int my_instance; - - int unsafe_sync_write; -public: - entity_name_t get_myname() { return messenger->get_myname(); } - void hack_sync_write_safe(); - -protected: - Filer *filer; - ObjectCacher *objectcacher; - Objecter *objecter; // (non-blocking) osd interface - - // cache - hash_map inode_map; - Inode* root; - LRU lru; // lru list of Dentry's in our local metadata cache. - - // cap weirdness - map > cap_reap_queue; // ino -> mds -> msg .. 
set of (would-be) stale caps to reap - - - // file handles, etc. - string cwd; - interval_set free_fd_set; // unused fds - hash_map fd_map; - - int get_fd() { - int fd = free_fd_set.start(); - free_fd_set.erase(fd, 1); - return fd; - } - void put_fd(int fd) { - free_fd_set.insert(fd, 1); - } - - void mkabspath(const char *rel, string& abs) { - if (rel[0] == '/') { - abs = rel; - } else { - abs = cwd; - abs += "/"; - abs += rel; - } - } - - - // global client lock - // - protects Client and buffer cache both! - Mutex client_lock; - - - // -- metadata cache stuff - - // decrease inode ref. delete if dangling. - void put_inode(Inode *in, int n=1) { - //cout << "put_inode on " << in << " " << in->inode.ino << endl; - in->put(n); - if (in->ref == 0) { - //cout << "put_inode deleting " << in->inode.ino << endl; - inode_map.erase(in->inode.ino); - if (in == root) root = 0; - delete in; - } - } - - void close_dir(Dir *dir) { - assert(dir->is_empty()); - - Inode *in = dir->parent_inode; - if (in->dn) in->dn->put(); // unpin dentry - - delete in->dir; - in->dir = 0; - put_inode(in); // unpin inode - } - - //int get_cache_size() { return lru.lru_get_size(); } - //void set_cache_size(int m) { lru.lru_set_max(m); } - - Dentry* link(Dir *dir, const string& name, Inode *in) { - Dentry *dn = new Dentry; - dn->name = name; - - // link to dir - dn->dir = dir; - //cout << "link dir " << dir->parent_inode->inode.ino << " '" << name << "' -> inode " << in->inode.ino << endl; - dir->dentries[dn->name] = dn; - - // link to inode - dn->inode = in; - assert(in->dn == 0); - in->dn = dn; - in->get(); - - if (in->dir) dn->get(); // dir -> dn pin - - lru.lru_insert_mid(dn); // mid or top? 
- return dn; - } - - void unlink(Dentry *dn) { - Inode *in = dn->inode; - assert(in->dn == dn); - - // unlink from inode - if (dn->inode->dir) dn->put(); // dir -> dn pin - dn->inode = 0; - in->dn = 0; - put_inode(in); - - // unlink from dir - dn->dir->dentries.erase(dn->name); - if (dn->dir->is_empty()) - close_dir(dn->dir); - dn->dir = 0; - - // delete den - lru.lru_remove(dn); - delete dn; - } - - Dentry *relink(Dir *dir, const string& name, Inode *in) { - Dentry *olddn = in->dn; - Dir *olddir = olddn->dir; // note: might == dir! - - // newdn, attach to inode. don't touch inode ref. - Dentry *newdn = new Dentry; - newdn->dir = dir; - newdn->name = name; - newdn->inode = in; - in->dn = newdn; - - if (in->dir) { // dir -> dn pin - newdn->get(); - olddn->put(); - } - - // unlink old dn from dir - olddir->dentries.erase(olddn->name); - olddn->inode = 0; - olddn->dir = 0; - lru.lru_remove(olddn); - - // link new dn to dir - dir->dentries[name] = newdn; - lru.lru_insert_mid(newdn); - - // olddir now empty? (remember, olddir might == dir) - if (olddir->is_empty()) - close_dir(olddir); - - return newdn; - } - - // move dentry to top of lru - void touch_dn(Dentry *dn) { lru.lru_touch(dn); } - - // trim cache. 
- void trim_cache(); - void dump_inode(Inode *in, set& did); - void dump_cache(); // debug - - // find dentry based on filepath - Dentry *lookup(filepath& path); - - int fill_stat(Inode *in, struct stat *st); - - - // trace generation - ofstream traceout; - - - // friends - friend class SyntheticClient; - - public: - Client(Messenger *m, MonMap *mm, int i=0); - ~Client(); - void tear_down_cache(); - - int get_nodeid() { return whoami; } - - void init(); - void shutdown(); - - // messaging - void dispatch(Message *m); - - void handle_unmount(Message*); - void handle_mds_map(class MMDSMap *m); - - // file caps - void handle_file_caps(class MClientFileCaps *m); - void implemented_caps(class MClientFileCaps *m, Inode *in); - void release_caps(Inode *in, int retain=0); - void update_caps_wanted(Inode *in); - - void close_release(Inode *in); - void close_safe(Inode *in); - - void lock_fh_pos(Fh *f); - void unlock_fh_pos(Fh *f); - - // metadata cache - Inode* insert_inode(Dir *dir, InodeStat *in_info, const string& dn); - void update_dir_dist(Inode *in, DirStat *st); - Inode* insert_trace(MClientReply *reply); - - // ---------------------- - // fs ops. 
-private: - void _try_mount(); - void _mount_timeout(); - Context *mount_timeout_event; - - class C_MountTimeout : public Context { - Client *client; - public: - C_MountTimeout(Client *c) : client(c) { } - void finish(int r) { - if (r >= 0) client->_mount_timeout(); - } - }; - - // some helpers - int _do_lstat(const char *path, int mask, Inode **in); - int _opendir(const char *name, DirResult **dirpp); - void _readdir_add_dirent(DirResult *dirp, const string& name, Inode *in); - void _readdir_fill_dirent(struct dirent *de, DirEntry *entry, off_t); - bool _readdir_have_frag(DirResult *dirp); - void _readdir_next_frag(DirResult *dirp); - void _readdir_rechoose_frag(DirResult *dirp); - int _readdir_get_frag(DirResult *dirp); - void _closedir(DirResult *dirp); - void _ll_get(Inode *in); - int _ll_put(Inode *in, int num); - void _ll_drop_pins(); - - // internal interface - // call these with client_lock held! - int _link(const char *existing, const char *newname); - int _unlink(const char *path); - int _rename(const char *from, const char *to); - int _mkdir(const char *path, mode_t mode); - int _rmdir(const char *path); - int _readlink(const char *path, char *buf, off_t size); - int _symlink(const char *existing, const char *newname); - int _lstat(const char *path, struct stat *stbuf); - int _chmod(const char *relpath, mode_t mode); - int _chown(const char *relpath, uid_t uid, gid_t gid); - int _utimes(const char *relpath, utime_t mtime, utime_t atime); - int _mknod(const char *path, mode_t mode, dev_t rdev); - int _open(const char *path, int flags, mode_t mode, Fh **fhp); - int _release(Fh *fh); - int _read(Fh *fh, off_t offset, off_t size, bufferlist *bl); - int _write(Fh *fh, off_t offset, off_t size, const char *buf); - int _flush(Fh *fh); - int _truncate(const char *file, off_t length); - int _ftruncate(Fh *fh, off_t length); - int _fsync(Fh *fh, bool syncdataonly); - int _statfs(struct statvfs *stbuf); - - -public: - int mount(); - int unmount(); - - // these 
shoud (more or less) mirror the actual system calls. - int statfs(const char *path, struct statvfs *stbuf); - - // crap - int chdir(const char *s); - const string getcwd() { return cwd; } - - // namespace ops - int getdir(const char *relpath, list& names); // get the whole dir at once. - - int opendir(const char *name, DIR **dirpp); - int closedir(DIR *dirp); - int readdir_r(DIR *dirp, struct dirent *de); - int readdirplus_r(DIR *dirp, struct dirent *de, struct stat *st, int *stmask); - void rewinddir(DIR *dirp); - off_t telldir(DIR *dirp); - void seekdir(DIR *dirp, off_t offset); - - struct dirent_plus *readdirplus(DIR *dirp); - int readdirplus_r(DIR *dirp, struct dirent_plus *entry, struct dirent_plus **result); - struct dirent_lite *readdirlite(DIR *dirp); - int readdirlite_r(DIR *dirp, struct dirent_lite *entry, struct dirent_lite **result); - - int link(const char *existing, const char *newname); - int unlink(const char *path); - int rename(const char *from, const char *to); - - // dirs - int mkdir(const char *path, mode_t mode); - int rmdir(const char *path); - - // symlinks - int readlink(const char *path, char *buf, off_t size); - int symlink(const char *existing, const char *newname); - - // inode stuff - int lstat(const char *path, struct stat *stbuf); - int lstatlite(const char *path, struct statlite *buf); - - int chmod(const char *path, mode_t mode); - int chown(const char *path, uid_t uid, gid_t gid); - int utime(const char *path, struct utimbuf *buf); - - // file ops - int mknod(const char *path, mode_t mode, dev_t rdev=0); - int open(const char *path, int flags, mode_t mode=0); - int close(int fd); - off_t lseek(int fd, off_t offset, int whence); - int read(int fd, char *buf, off_t size, off_t offset=-1); - int write(int fd, const char *buf, off_t size, off_t offset=-1); - int fake_write_size(int fd, off_t size); - int truncate(const char *file, off_t size); - int ftruncate(int fd, off_t size); - int fsync(int fd, bool syncdataonly); - - // hpc 
lazyio - int lazyio_propogate(int fd, off_t offset, size_t count); - int lazyio_synchronize(int fd, off_t offset, size_t count); - - // expose file layout - int describe_layout(int fd, FileLayout* layout); - int get_stripe_unit(int fd); - int get_stripe_width(int fd); - int get_stripe_period(int fd); - int enumerate_layout(int fd, list& result, - off_t length, off_t offset); - - // low-level interface - int ll_lookup(inodeno_t parent, const char *name, struct stat *attr); - bool ll_forget(inodeno_t ino, int count); - Inode *_ll_get_inode(inodeno_t ino); - int ll_getattr(inodeno_t ino, struct stat *st); - int ll_setattr(inodeno_t ino, struct stat *st, int mask); - int ll_opendir(inodeno_t ino, void **dirpp); - void ll_releasedir(void *dirp); - int ll_readlink(inodeno_t ino, const char **value); - int ll_mknod(inodeno_t ino, const char *name, mode_t mode, dev_t rdev, struct stat *attr); - int ll_mkdir(inodeno_t ino, const char *name, mode_t mode, struct stat *attr); - int ll_symlink(inodeno_t ino, const char *name, const char *value, struct stat *attr); - int ll_unlink(inodeno_t ino, const char *name); - int ll_rmdir(inodeno_t ino, const char *name); - int ll_rename(inodeno_t parent, const char *name, inodeno_t newparent, const char *newname); - int ll_link(inodeno_t ino, inodeno_t newparent, const char *newname, struct stat *attr); - int ll_open(inodeno_t ino, int flags, Fh **fh); - int ll_create(inodeno_t parent, const char *name, mode_t mode, int flags, struct stat *attr, Fh **fh); - int ll_read(Fh *fh, off_t off, off_t len, bufferlist *bl); - int ll_write(Fh *fh, off_t off, off_t len, const char *data); - int ll_flush(Fh *fh); - int ll_fsync(Fh *fh, bool syncdataonly); - int ll_release(Fh *fh); - int ll_statfs(inodeno_t, struct statvfs *stbuf); - - - // failure - void ms_handle_failure(Message*, const entity_inst_t& inst); -}; - -#endif diff --git a/branches/sage/mds/client/FileCache.cc b/branches/sage/mds/client/FileCache.cc deleted file mode 100644 index 
1adec4aaabee7..0000000000000 --- a/branches/sage/mds/client/FileCache.cc +++ /dev/null @@ -1,266 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "include/types.h" - -#include "FileCache.h" -#include "osdc/ObjectCacher.h" - -#include "msg/Messenger.h" - -#include "config.h" -#define dout(x) if (x <= g_conf.debug_client) *_dout << dbeginl << g_clock.now() << " " << oc->objecter->messenger->get_myname() << ".filecache " -#define derr(x) if (x <= g_conf.debug_client) *_derr << dbeginl << g_clock.now() << " " << oc->objecter->messenger->get_myname() << ".filecache " - - - -// flush/release/clean - -void FileCache::flush_dirty(Context *onflush) -{ - if (oc->flush_set(inode.ino, onflush)) { - onflush->finish(0); - delete onflush; - } -} - -off_t FileCache::release_clean() -{ - return oc->release_set(inode.ino); -} - -bool FileCache::is_cached() -{ - return oc->set_is_cached(inode.ino); -} - -bool FileCache::is_dirty() -{ - return oc->set_is_dirty_or_committing(inode.ino); -} - -void FileCache::empty(Context *onempty) -{ - off_t unclean = release_clean(); - bool clean = oc->flush_set(inode.ino, onempty); - assert(!unclean == clean); - - if (clean) { - onempty->finish(0); - delete onempty; - } -} - - -void FileCache::tear_down() -{ - off_t unclean = release_clean(); - if (unclean) { - dout(0) << "tear_down " << unclean << " unclean bytes, purging" << dendl; - oc->purge_set(inode.ino); - } -} - -// truncate - -void FileCache::truncate(off_t olds, off_t news) -{ - dout(5) << "truncate " << olds << " -> " << news << dendl; - - // map range to objects - list ls; - oc->filer.file_to_extents(inode, 
news, olds-news, ls); - oc->truncate_set(inode.ino, ls); -} - -// caps - -class C_FC_CheckCaps : public Context { - FileCache *fc; -public: - C_FC_CheckCaps(FileCache *f) : fc(f) {} - void finish(int r) { - fc->check_caps(); - } -}; - -void FileCache::set_caps(int caps, Context *onimplement) -{ - if (onimplement) { - dout(10) << "set_caps setting onimplement context for " << cap_string(caps) << dendl; - assert(latest_caps & ~caps); // we should be losing caps. - caps_callbacks[caps].push_back(onimplement); - } - - latest_caps = caps; - check_caps(); - - // kick waiters? (did we gain caps?) - if (can_read() && !waitfor_read.empty()) - for (set::iterator p = waitfor_read.begin(); - p != waitfor_read.end(); - ++p) - (*p)->Signal(); - if (can_write() && !waitfor_write.empty()) - for (set::iterator p = waitfor_write.begin(); - p != waitfor_write.end(); - ++p) - (*p)->Signal(); - -} - -int FileCache::get_used_caps() -{ - int used = 0; - if (num_reading) used |= CAP_FILE_RD; - if (oc->set_is_cached(inode.ino)) used |= CAP_FILE_RDCACHE; - if (num_writing) used |= CAP_FILE_WR; - if (oc->set_is_dirty_or_committing(inode.ino)) used |= CAP_FILE_WRBUFFER; - return used; -} - -void FileCache::check_caps() -{ - // calc used - int used = get_used_caps(); - dout(10) << "check_caps used was " << cap_string(used) << dendl; - - // try to implement caps? - // BUG? latest_caps, not least caps i've seen? - if ((latest_caps & CAP_FILE_RDCACHE) == 0 && - (used & CAP_FILE_RDCACHE)) - release_clean(); - if ((latest_caps & CAP_FILE_WRBUFFER) == 0 && - (used & CAP_FILE_WRBUFFER)) - flush_dirty(new C_FC_CheckCaps(this)); - - used = get_used_caps(); - dout(10) << "check_caps used now " << cap_string(used) << dendl; - - // check callbacks - map >::iterator p = caps_callbacks.begin(); - while (p != caps_callbacks.end()) { - if (used == 0 || (~(p->first) & used) == 0) { - // implemented. 
- dout(10) << "used is " << cap_string(used) - << ", caps " << cap_string(p->first) << " implemented, doing callback(s)" << dendl; - finish_contexts(p->second); - map >::iterator o = p; - p++; - caps_callbacks.erase(o); - } else { - dout(10) << "used is " << cap_string(used) - << ", caps " << cap_string(p->first) << " not yet implemented" << dendl; - p++; - } - } -} - - - -// read/write - -int FileCache::read(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock) -{ - int r = 0; - - // can i read? - while ((latest_caps & CAP_FILE_RD) == 0) { - dout(10) << "read doesn't have RD cap, blocking" << dendl; - Cond c; - waitfor_read.insert(&c); - c.Wait(client_lock); - waitfor_read.erase(&c); - } - - // inc reading counter - num_reading++; - - if (latest_caps & CAP_FILE_RDCACHE) { - // read (and block) - Cond cond; - bool done = false; - int rvalue = 0; - C_Cond *onfinish = new C_Cond(&cond, &done, &rvalue); - - r = oc->file_read(inode, offset, size, &blist, onfinish); - - if (r == 0) { - // block - while (!done) - cond.Wait(client_lock); - r = rvalue; - } else { - // it was cached. - delete onfinish; - } - } else { - r = oc->file_atomic_sync_read(inode, offset, size, &blist, client_lock); - } - - // dec reading counter - num_reading--; - - if (num_reading == 0 && !caps_callbacks.empty()) - check_caps(); - - return r; -} - -void FileCache::write(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock) -{ - // can i write - while ((latest_caps & CAP_FILE_WR) == 0) { - dout(10) << "write doesn't have WR cap, blocking" << dendl; - Cond c; - waitfor_write.insert(&c); - c.Wait(client_lock); - waitfor_write.erase(&c); - } - - // inc writing counter - num_writing++; - - if (size > 0) { - if (latest_caps & CAP_FILE_WRBUFFER) { // caps buffered write? - // wait? (this may block!) - oc->wait_for_write(size, client_lock); - - // async, caching, non-blocking. - oc->file_write(inode, offset, size, blist); - } else { - // atomic, synchronous, blocking. 
- oc->file_atomic_sync_write(inode, offset, size, blist, client_lock); - } - } - - // dec writing counter - num_writing--; - if (num_writing == 0 && !caps_callbacks.empty()) - check_caps(); -} - -bool FileCache::all_safe() -{ - return !oc->set_is_dirty_or_committing(inode.ino); -} - -void FileCache::add_safe_waiter(Context *c) -{ - bool safe = oc->commit_set(inode.ino, c); - if (safe) { - c->finish(0); - delete c; - } -} diff --git a/branches/sage/mds/client/FileCache.h b/branches/sage/mds/client/FileCache.h deleted file mode 100644 index 8d6e08146b508..0000000000000 --- a/branches/sage/mds/client/FileCache.h +++ /dev/null @@ -1,85 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __FILECACHE_H -#define __FILECACHE_H - -#include -using std::iostream; - -#include "common/Cond.h" -#include "mds/Capability.h" - -class ObjectCacher; - -class FileCache { - ObjectCacher *oc; - inode_t inode; - - // caps - int latest_caps; - map > caps_callbacks; - - int num_reading; - int num_writing; - //int num_unsafe; - - // waiters - set waitfor_read; - set waitfor_write; - - bool waitfor_release; - - public: - FileCache(ObjectCacher *_oc, inode_t _inode) : - oc(_oc), - inode(_inode), - latest_caps(0), - num_reading(0), num_writing(0),// num_unsafe(0), - waitfor_release(false) {} - ~FileCache() { - tear_down(); - } - - // waiters/waiting - bool can_read() { return latest_caps & CAP_FILE_RD; } - bool can_write() { return latest_caps & CAP_FILE_WR; } - bool all_safe();// { return num_unsafe == 0; } - - void add_safe_waiter(Context *c); - - void truncate(off_t olds, off_t news); - - // ... 
- void flush_dirty(Context *onflush=0); - off_t release_clean(); - void empty(Context *onempty=0); - bool is_empty() { return !(is_cached() || is_dirty()); } - bool is_cached(); - bool is_dirty(); - - void tear_down(); - - int get_caps() { return latest_caps; } - int get_used_caps(); - void set_caps(int caps, Context *onimplement=0); - void check_caps(); - - int read(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock); // may block. - void write(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock); // may block. - -}; - - -#endif diff --git a/branches/sage/mds/client/SyntheticClient.cc b/branches/sage/mds/client/SyntheticClient.cc deleted file mode 100644 index 4ac6ab356081a..0000000000000 --- a/branches/sage/mds/client/SyntheticClient.cc +++ /dev/null @@ -1,2893 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include -using namespace std; - - - -#include "SyntheticClient.h" -#include "osdc/Objecter.h" - -#include "include/filepath.h" -#include "mds/mdstypes.h" -#include "common/Logger.h" - -#include -#include -#include -#include -#include -#include -#include - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_client) *_dout << dbeginl << g_clock.now() << " synthetic" << client->get_nodeid() << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_client) *_derr << dbeginl << g_clock.now() << " synthetic" << client->get_nodeid() << " " - -// traces -//void trace_include(SyntheticClient *syn, Client *cl, string& prefix); -//void trace_openssh(SyntheticClient *syn, Client *cl, string& prefix); - - -list syn_modes; -list syn_iargs; -list syn_sargs; - -void parse_syn_options(vector& args) -{ - vector nargs; - - for (unsigned i=0; iclient = client; - thread_id = 0; - - did_readdir = false; - - run_only = -1; - exclude = -1; - - this->modes = syn_modes; - this->iargs = syn_iargs; - this->sargs = syn_sargs; - - run_start = g_clock.now(); -} - - - - -#define DBL 2 - -void *synthetic_client_thread_entry(void *ptr) -{ - SyntheticClient *sc = (SyntheticClient*)ptr; - //int r = - sc->run(); - return 0;//(void*)r; -} - -string SyntheticClient::get_sarg(int seq) -{ - string a; - if (!sargs.empty()) { - a = sargs.front(); - sargs.pop_front(); - } - if (a.length() == 0 || a == "~") { - char s[20]; - sprintf(s,"/syn.%d.%d", client->whoami, seq); - a = s; - } - return a; -} - -int SyntheticClient::run() -{ - client->init(); - client->mount(); - - //run_start = g_clock.now(); - run_until = utime_t(0,0); - dout(5) << "run" << dendl; - - int seq = 0; - - for (list::iterator it = modes.begin(); - it != modes.end(); - it++) { - int mode = *it; - dout(3) << "mode " << mode << dendl; - - switch (mode) { - - - // WHO? 
- - case SYNCLIENT_MODE_ONLY: - { - run_only = iargs.front(); - iargs.pop_front(); - if (run_only == client->get_nodeid()) - dout(2) << "only " << run_only << dendl; - } - break; - case SYNCLIENT_MODE_ONLYRANGE: - { - int first = iargs.front(); - iargs.pop_front(); - int last = iargs.front(); - iargs.pop_front(); - if (first <= client->get_nodeid() && - last > client->get_nodeid()) { - run_only = client->get_nodeid(); - dout(2) << "onlyrange [" << first << ", " << last << ") includes me" << dendl; - } else - run_only = client->get_nodeid()+1; // not me - } - break; - case SYNCLIENT_MODE_EXCLUDE: - { - exclude = iargs.front(); - iargs.pop_front(); - if (exclude == client->get_nodeid()) { - run_only = client->get_nodeid() + 1; - dout(2) << "not running " << exclude << dendl; - } else - run_only = -1; - } - break; - - // HOW LONG? - - case SYNCLIENT_MODE_UNTIL: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (run_me()) { - if (iarg1) { - dout(2) << "until " << iarg1 << dendl; - utime_t dur(iarg1,0); - run_until = run_start + dur; - } else { - dout(2) << "until " << iarg1 << " (no limit)" << dendl; - run_until = utime_t(0,0); - } - } - } - break; - - - // ... 
- - case SYNCLIENT_MODE_FOO: - if (run_me()) { - foo(); - } - did_run_me(); - break; - - case SYNCLIENT_MODE_RANDOMSLEEP: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (run_me()) { - srand(time(0) + getpid() + client->whoami); - sleep(rand() % iarg1); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_SLEEP: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (run_me()) { - dout(2) << "sleep " << iarg1 << dendl; - sleep(iarg1); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_SLEEPUNTIL: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (iarg1 && run_me()) { - dout(2) << "sleepuntil " << iarg1 << dendl; - utime_t at = g_clock.now() - run_start; - if (at.sec() < iarg1) - sleep(iarg1 - at.sec()); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_RANDOMWALK: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (run_me()) { - dout(2) << "randomwalk " << iarg1 << dendl; - random_walk(iarg1); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_MAKEDIRMESS: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "makedirmess " << sarg1 << " " << iarg1 << dendl; - make_dir_mess(sarg1.c_str(), iarg1); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_MAKEDIRS: - { - string sarg1 = get_sarg(seq++); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - int iarg3 = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "makedirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << dendl; - make_dirs(sarg1.c_str(), iarg1, iarg2, iarg3); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_STATDIRS: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - int iarg3 = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "statdirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << dendl; - 
stat_dirs(sarg1.c_str(), iarg1, iarg2, iarg3); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_READDIRS: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - int iarg3 = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "readdirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << dendl; - read_dirs(sarg1.c_str(), iarg1, iarg2, iarg3); - } - did_run_me(); - } - break; - - - case SYNCLIENT_MODE_THRASHLINKS: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - int iarg3 = iargs.front(); iargs.pop_front(); - int iarg4 = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "thrashlinks " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << dendl; - thrash_links(sarg1.c_str(), iarg1, iarg2, iarg3, iarg4); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_LINKTEST: - { - if (run_me()) { - link_test(); - } - did_run_me(); - } - break; - - - case SYNCLIENT_MODE_MAKEFILES: - { - int num = iargs.front(); iargs.pop_front(); - int count = iargs.front(); iargs.pop_front(); - int priv = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "makefiles " << num << " " << count << " " << priv << dendl; - make_files(num, count, priv, false); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_MAKEFILES2: - { - int num = iargs.front(); iargs.pop_front(); - int count = iargs.front(); iargs.pop_front(); - int priv = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "makefiles2 " << num << " " << count << " " << priv << dendl; - make_files(num, count, priv, true); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_CREATESHARED: - { - string sarg1 = get_sarg(0); - int num = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "createshared " << num << dendl; - create_shared(num); - } - did_run_me(); - } - break; - case 
SYNCLIENT_MODE_OPENSHARED: - { - string sarg1 = get_sarg(0); - int num = iargs.front(); iargs.pop_front(); - int count = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "openshared " << num << dendl; - open_shared(num, count); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_CREATEOBJECTS: - { - int count = iargs.front(); iargs.pop_front(); - int size = iargs.front(); iargs.pop_front(); - int inflight = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "createobjects " << cout << " of " << size << " bytes" - << ", " << inflight << " in flight" << dendl; - create_objects(count, size, inflight); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_OBJECTRW: - { - int count = iargs.front(); iargs.pop_front(); - int size = iargs.front(); iargs.pop_front(); - int wrpc = iargs.front(); iargs.pop_front(); - int overlap = iargs.front(); iargs.pop_front(); - int rskew = iargs.front(); iargs.pop_front(); - int wskew = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "objectrw " << cout << " " << size << " " << wrpc - << " " << overlap << " " << rskew << " " << wskew << dendl; - object_rw(count, size, wrpc, overlap, rskew, wskew); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_FULLWALK: - { - string sarg1;// = get_sarg(0); - if (run_me()) { - dout(2) << "fullwalk" << sarg1 << dendl; - full_walk(sarg1); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_REPEATWALK: - { - string sarg1 = get_sarg(0); - if (run_me()) { - dout(2) << "repeatwalk " << sarg1 << dendl; - while (full_walk(sarg1) == 0) ; - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_WRITEFILE: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - dout(1) << "WRITING SYN CLIENT" << dendl; - if (run_me()) { - write_file(sarg1, iarg1, iarg2); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_WRSHARED: - { - string sarg1 = "shared"; - int iarg1 = iargs.front(); 
iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - if (run_me()) { - write_file(sarg1, iarg1, iarg2); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_READSHARED: - { - string sarg1 = "shared"; - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - if (run_me()) { - read_file(sarg1, iarg1, iarg2, true); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_WRITEBATCH: - { - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - int iarg3 = iargs.front(); iargs.pop_front(); - - if (run_me()) { - write_batch(iarg1, iarg2, iarg3); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_READFILE: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - - dout(1) << "READING SYN CLIENT" << dendl; - if (run_me()) { - read_file(sarg1, iarg1, iarg2); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_RDWRRANDOM: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - - dout(1) << "RANDOM READ WRITE SYN CLIENT" << dendl; - if (run_me()) { - read_random(sarg1, iarg1, iarg2); - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_RDWRRANDOM_EX: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - - dout(1) << "RANDOM READ WRITE SYN CLIENT" << dendl; - if (run_me()) { - read_random_ex(sarg1, iarg1, iarg2); - } - did_run_me(); - } - break; - case SYNCLIENT_MODE_TRACE: - { - string tfile = get_sarg(0); - sargs.push_front(string("~")); - int iarg1 = iargs.front(); iargs.pop_front(); - int playdata = iargs.front(); iargs.pop_front(); - string prefix = get_sarg(0); - char realtfile[100]; - sprintf(realtfile, tfile.c_str(), client->get_nodeid()); - - if (run_me()) { - dout(-2) << "trace " << tfile << " prefix=" << prefix << " count=" << 
iarg1 << " data=" << playdata << dendl; - - Trace t(realtfile); - - if (iarg1 == 0) iarg1 = 1; // play trace at least once! - - for (int i=0; i 1) clean_dir(prefix); // clean only if repeat - - utime_t lat = g_clock.now(); - lat -= start; - - dout(0) << " trace " << tfile << " loop " << (i+1) << "/" << iarg1 << " done in " << (double)lat << " seconds" << dendl; - if (client_logger - && i > 0 - && i < iarg1-1 - ) { - client_logger->finc("trsum", (double)lat); - client_logger->inc("trnum"); - } - } - dout(1) << "done " << dendl; - } - did_run_me(); - } - break; - - - case SYNCLIENT_MODE_OPENTEST: - { - int count = iargs.front(); iargs.pop_front(); - if (run_me()) { - for (int i=0; iopen("test", rand()%2 ? (O_WRONLY|O_CREAT):O_RDONLY); - if (fd > 0) client->close(fd); - } - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_OPTEST: - { - int count = iargs.front(); iargs.pop_front(); - if (run_me()) { - client->mknod("test", 0777); - struct stat st; - for (int i=0; ilstat("test", &st); - client->chmod("test", 0777); - } - } - did_run_me(); - } - break; - - case SYNCLIENT_MODE_TRUNCATE: - { - string file = get_sarg(0); - sargs.push_front(file); - int iarg1 = iargs.front(); iargs.pop_front(); - if (run_me()) { - client->truncate(file.c_str(), iarg1); - } - did_run_me(); - } - break; - - - case SYNCLIENT_MODE_IMPORTFIND: - { - string base = get_sarg(0); - string find = get_sarg(0); - int data = get_iarg(); - if (run_me()) { - import_find(base.c_str(), find.c_str(), data); - } - did_run_me(); - } - break; - - default: - assert(0); - } - } - dout(1) << "syn done, unmounting " << dendl; - - client->unmount(); - client->shutdown(); - return 0; -} - - -int SyntheticClient::start_thread() -{ - assert(!thread_id); - - pthread_create(&thread_id, NULL, synthetic_client_thread_entry, this); - assert(thread_id); - return 0; -} - -int SyntheticClient::join_thread() -{ - assert(thread_id); - void *rv; - pthread_join(thread_id, &rv); - return 0; -} - - -bool roll_die(float p) -{ - 
float r = (float)(rand() % 100000) / 100000.0; - if (r < p) - return true; - else - return false; -} - -void SyntheticClient::init_op_dist() -{ - op_dist.clear(); - op_dist.add( MDS_OP_STAT, g_conf.fakeclient_op_stat ); - op_dist.add( MDS_OP_UTIME, g_conf.fakeclient_op_utime ); - op_dist.add( MDS_OP_CHMOD, g_conf.fakeclient_op_chmod ); - op_dist.add( MDS_OP_CHOWN, g_conf.fakeclient_op_chown ); - - op_dist.add( MDS_OP_READDIR, g_conf.fakeclient_op_readdir ); - op_dist.add( MDS_OP_MKNOD, g_conf.fakeclient_op_mknod ); - op_dist.add( MDS_OP_LINK, g_conf.fakeclient_op_link ); - op_dist.add( MDS_OP_UNLINK, g_conf.fakeclient_op_unlink ); - op_dist.add( MDS_OP_RENAME, g_conf.fakeclient_op_rename ); - - op_dist.add( MDS_OP_MKDIR, g_conf.fakeclient_op_mkdir ); - op_dist.add( MDS_OP_RMDIR, g_conf.fakeclient_op_rmdir ); - op_dist.add( MDS_OP_SYMLINK, g_conf.fakeclient_op_symlink ); - - op_dist.add( MDS_OP_OPEN, g_conf.fakeclient_op_openrd ); - //op_dist.add( MDS_OP_READ, g_conf.fakeclient_op_read ); - //op_dist.add( MDS_OP_WRITE, g_conf.fakeclient_op_write ); - op_dist.add( MDS_OP_TRUNCATE, g_conf.fakeclient_op_truncate ); - op_dist.add( MDS_OP_FSYNC, g_conf.fakeclient_op_fsync ); - op_dist.add( MDS_OP_RELEASE, g_conf.fakeclient_op_close ); // actually, close() - op_dist.normalize(); -} - -void SyntheticClient::up() -{ - cwd = cwd.prefixpath(cwd.depth()-1); - dout(DBL) << "cd .. -> " << cwd << dendl; - clear_dir(); -} - - -int SyntheticClient::play_trace(Trace& t, string& prefix, bool metadata_only) -{ - dout(4) << "play trace prefix '" << prefix << "'" << dendl; - t.start(); - - char buf[1024]; - char buf2[1024]; - - utime_t start = g_clock.now(); - - hash_map open_files; - hash_map open_dirs; - - hash_map ll_files; - hash_map ll_dirs; - hash_map ll_inos; - - ll_inos[1] = 1; // root inode is known. - - // prefix? 
- const char *p = prefix.c_str(); - if (prefix.length()) { - client->mkdir(prefix.c_str(), 0755); - struct stat attr; - if (client->ll_lookup(1, prefix.c_str(), &attr) == 0) { - ll_inos[1] = attr.st_ino; - dout(5) << "'root' ino is " << inodeno_t(attr.st_ino) << dendl; - } else { - dout(0) << "warning: play_trace coudln't lookup up my per-client directory" << dendl; - } - } - - - utime_t last_status = start; - - int n = 0; - - // for object traces - Mutex &lock = client->client_lock; - Cond cond; - bool ack; - bool safe; - C_Gather *safeg = new C_Gather(new C_SafeCond(&lock, &cond, &safe)); - Context *safegref = safeg->new_sub(); // take a ref - - while (!t.end()) { - - if (++n == 100) { - n = 00; - utime_t now = last_status; - if (now - last_status > 1.0) { - last_status = now; - dout(1) << "play_trace at line " << t.get_line() << dendl; - } - } - - if (time_to_stop()) break; - - // op - const char *op = t.get_string(buf, 0); - dout(4) << (t.get_line()-1) << ": trace op " << op << dendl; - - if (op[0] == '@') { - // timestamp... ignore it! 
- t.get_int(); // sec - t.get_int(); // usec - op = t.get_string(buf, 0); - } - - // high level ops --------------------- - if (strcmp(op, "link") == 0) { - const char *a = t.get_string(buf, p); - const char *b = t.get_string(buf2, p); - client->link(a,b); - } else if (strcmp(op, "unlink") == 0) { - const char *a = t.get_string(buf, p); - client->unlink(a); - } else if (strcmp(op, "rename") == 0) { - const char *a = t.get_string(buf, p); - const char *b = t.get_string(buf2, p); - client->rename(a,b); - } else if (strcmp(op, "mkdir") == 0) { - const char *a = t.get_string(buf, p); - int64_t b = t.get_int(); - client->mkdir(a, b); - } else if (strcmp(op, "rmdir") == 0) { - const char *a = t.get_string(buf, p); - client->rmdir(a); - } else if (strcmp(op, "symlink") == 0) { - const char *a = t.get_string(buf, p); - const char *b = t.get_string(buf2, p); - client->symlink(a,b); - } else if (strcmp(op, "readlink") == 0) { - const char *a = t.get_string(buf, p); - char buf[100]; - client->readlink(a, buf, 100); - } else if (strcmp(op, "lstat") == 0) { - struct stat st; - const char *a = t.get_string(buf, p); - if (strcmp(a, p) != 0 && - strcmp(a, "/") != 0 && - strcmp(a, "/lib") != 0 && // or /lib.. that would be a lookup. hack. 
- a[0] != 0) // stop stating the root directory already - client->lstat(a, &st); - } else if (strcmp(op, "chmod") == 0) { - const char *a = t.get_string(buf, p); - int64_t b = t.get_int(); - client->chmod(a, b); - } else if (strcmp(op, "chown") == 0) { - const char *a = t.get_string(buf, p); - int64_t b = t.get_int(); - int64_t c = t.get_int(); - client->chown(a, b, c); - } else if (strcmp(op, "utime") == 0) { - const char *a = t.get_string(buf, p); - int64_t b = t.get_int(); - int64_t c = t.get_int(); - struct utimbuf u; - u.actime = b; - u.modtime = c; - client->utime(a, &u); - } else if (strcmp(op, "mknod") == 0) { - const char *a = t.get_string(buf, p); - int64_t b = t.get_int(); - int64_t c = t.get_int(); - client->mknod(a, b, c); - } else if (strcmp(op, "oldmknod") == 0) { - const char *a = t.get_string(buf, p); - int64_t b = t.get_int(); - client->mknod(a, b, 0); - } else if (strcmp(op, "getdir") == 0) { - const char *a = t.get_string(buf, p); - list contents; - client->getdir(a, contents); - } else if (strcmp(op, "getdir") == 0) { - const char *a = t.get_string(buf, p); - list contents; - client->getdir(a, contents); - } else if (strcmp(op, "opendir") == 0) { - const char *a = t.get_string(buf, p); - int64_t b = t.get_int(); - DIR *dirp; - client->opendir(a, &dirp); - if (dirp) open_dirs[b] = dirp; - } else if (strcmp(op, "closedir") == 0) { - int64_t a = t.get_int(); - client->closedir(open_dirs[a]); - open_dirs.erase(a); - } else if (strcmp(op, "open") == 0) { - const char *a = t.get_string(buf, p); - int64_t b = t.get_int(); - int64_t c = t.get_int(); - int64_t d = t.get_int(); - int64_t fd = client->open(a, b, c); - if (fd > 0) open_files[d] = fd; - } else if (strcmp(op, "oldopen") == 0) { - const char *a = t.get_string(buf, p); - int64_t b = t.get_int(); - int64_t d = t.get_int(); - int64_t fd = client->open(a, b, 0755); - if (fd > 0) open_files[d] = fd; - } else if (strcmp(op, "close") == 0) { - int64_t id = t.get_int(); - int64_t fh = open_files[id]; 
- if (fh > 0) client->close(fh); - open_files.erase(id); - } else if (strcmp(op, "lseek") == 0) { - int64_t f = t.get_int(); - int fd = open_files[f]; - int64_t off = t.get_int(); - int64_t whence = t.get_int(); - client->lseek(fd, off, whence); - } else if (strcmp(op, "read") == 0) { - int64_t f = t.get_int(); - int64_t size = t.get_int(); - int64_t off = t.get_int(); - int64_t fd = open_files[f]; - if (!metadata_only) { - char *b = new char[size]; - client->read(fd, b, size, off); - delete[] b; - } - } else if (strcmp(op, "write") == 0) { - int64_t f = t.get_int(); - int64_t fd = open_files[f]; - int64_t size = t.get_int(); - int64_t off = t.get_int(); - if (!metadata_only) { - char *b = new char[size]; - memset(b, 1, size); // let's write 1's! - client->write(fd, b, size, off); - delete[] b; - } else { - client->write(fd, NULL, 0, size+off); - } - } else if (strcmp(op, "truncate") == 0) { - const char *a = t.get_string(buf, p); - int64_t l = t.get_int(); - client->truncate(a, l); - } else if (strcmp(op, "ftruncate") == 0) { - int64_t f = t.get_int(); - int fd = open_files[f]; - int64_t l = t.get_int(); - client->ftruncate(fd, l); - } else if (strcmp(op, "fsync") == 0) { - int64_t f = t.get_int(); - int64_t b = t.get_int(); - int fd = open_files[f]; - client->fsync(fd, b); - } else if (strcmp(op, "chdir") == 0) { - const char *a = t.get_string(buf, p); - client->chdir(a); - } else if (strcmp(op, "statfs") == 0) { - struct statvfs stbuf; - client->statfs("/", &stbuf); - } - - // low level ops --------------------- - else if (strcmp(op, "ll_lookup") == 0) { - int64_t i = t.get_int(); - const char *name = t.get_string(buf, p); - int64_t r = t.get_int(); - struct stat attr; - if (ll_inos.count(i) && - client->ll_lookup(ll_inos[i], name, &attr) == 0) - ll_inos[r] = attr.st_ino; - } else if (strcmp(op, "ll_forget") == 0) { - int64_t i = t.get_int(); - int64_t n = t.get_int(); - if (ll_inos.count(i) && - client->ll_forget(ll_inos[i], n)) - ll_inos.erase(i); - } else if 
(strcmp(op, "ll_getattr") == 0) { - int64_t i = t.get_int(); - struct stat attr; - if (ll_inos.count(i)) - client->ll_getattr(ll_inos[i], &attr); - } else if (strcmp(op, "ll_setattr") == 0) { - int64_t i = t.get_int(); - struct stat attr; - memset(&attr, 0, sizeof(attr)); - attr.st_mode = t.get_int(); - attr.st_uid = t.get_int(); - attr.st_gid = t.get_int(); - attr.st_size = t.get_int(); - attr.st_mtime = t.get_int(); - attr.st_atime = t.get_int(); - int mask = t.get_int(); - if (ll_inos.count(i)) - client->ll_setattr(ll_inos[i], &attr, mask); - } else if (strcmp(op, "ll_readlink") == 0) { - int64_t i = t.get_int(); - const char *value; - if (ll_inos.count(i)) - client->ll_readlink(ll_inos[i], &value); - } else if (strcmp(op, "ll_mknod") == 0) { - int64_t i = t.get_int(); - const char *n = t.get_string(buf, p); - int m = t.get_int(); - int r = t.get_int(); - int64_t ri = t.get_int(); - struct stat attr; - if (ll_inos.count(i) && - client->ll_mknod(ll_inos[i], n, m, r, &attr) == 0) - ll_inos[ri] = attr.st_ino; - } else if (strcmp(op, "ll_mkdir") == 0) { - int64_t i = t.get_int(); - const char *n = t.get_string(buf, p); - int m = t.get_int(); - int64_t ri = t.get_int(); - struct stat attr; - if (ll_inos.count(i) && - client->ll_mkdir(ll_inos[i], n, m, &attr) == 0) - ll_inos[ri] = attr.st_ino; - } else if (strcmp(op, "ll_symlink") == 0) { - int64_t i = t.get_int(); - const char *n = t.get_string(buf, p); - const char *v = t.get_string(buf2, p); - int64_t ri = t.get_int(); - struct stat attr; - if (ll_inos.count(i) && - client->ll_symlink(ll_inos[i], n, v, &attr) == 0) - ll_inos[ri] = attr.st_ino; - } else if (strcmp(op, "ll_unlink") == 0) { - int64_t i = t.get_int(); - const char *n = t.get_string(buf, p); - if (ll_inos.count(i)) - client->ll_unlink(ll_inos[i], n); - } else if (strcmp(op, "ll_rmdir") == 0) { - int64_t i = t.get_int(); - const char *n = t.get_string(buf, p); - if (ll_inos.count(i)) - client->ll_rmdir(ll_inos[i], n); - } else if (strcmp(op, "ll_rename") 
== 0) { - int64_t i = t.get_int(); - const char *n = t.get_string(buf, p); - int64_t ni = t.get_int(); - const char *nn = t.get_string(buf2, p); - if (ll_inos.count(i) && - ll_inos.count(ni)) - client->ll_rename(ll_inos[i], n, ll_inos[ni], nn); - } else if (strcmp(op, "ll_link") == 0) { - int64_t i = t.get_int(); - int64_t ni = t.get_int(); - const char *nn = t.get_string(buf, p); - struct stat attr; - if (ll_inos.count(i) && - ll_inos.count(ni)) - client->ll_link(ll_inos[i], ll_inos[ni], nn, &attr); - } else if (strcmp(op, "ll_opendir") == 0) { - int64_t i = t.get_int(); - int64_t r = t.get_int(); - void *dirp; - if (ll_inos.count(i) && - client->ll_opendir(ll_inos[i], &dirp) == 0) - ll_dirs[r] = dirp; - } else if (strcmp(op, "ll_releasedir") == 0) { - int64_t f = t.get_int(); - if (ll_dirs.count(f)) { - client->ll_releasedir(ll_dirs[f]); - ll_dirs.erase(f); - } - } else if (strcmp(op, "ll_open") == 0) { - int64_t i = t.get_int(); - int64_t f = t.get_int(); - int64_t r = t.get_int(); - Fh *fhp; - if (ll_inos.count(i) && - client->ll_open(ll_inos[i], f, &fhp) == 0) - ll_files[r] = fhp; - } else if (strcmp(op, "ll_create") == 0) { - int64_t i = t.get_int(); - const char *n = t.get_string(buf, p); - int64_t m = t.get_int(); - int64_t f = t.get_int(); - int64_t r = t.get_int(); - int64_t ri = t.get_int(); - Fh *fhp; - struct stat attr; - if (ll_inos.count(i) && - client->ll_create(ll_inos[i], n, m, f, &attr, &fhp) == 0) { - ll_inos[ri] = attr.st_ino; - ll_files[r] = fhp; - } - } else if (strcmp(op, "ll_read") == 0) { - int64_t f = t.get_int(); - int64_t off = t.get_int(); - int64_t size = t.get_int(); - if (ll_files.count(f) && - !metadata_only) { - bufferlist bl; - client->ll_read(ll_files[f], off, size, &bl); - } - } else if (strcmp(op, "ll_write") == 0) { - int64_t f = t.get_int(); - int64_t off = t.get_int(); - int64_t size = t.get_int(); - if (ll_files.count(f)) { - if (!metadata_only) { - bufferlist bl; - bufferptr bp(size); - bl.push_back(bp); - bp.zero(); - 
client->ll_write(ll_files[f], off, size, bl.c_str()); - } else { - client->ll_write(ll_files[f], off+size, 0, NULL); - } - } - } else if (strcmp(op, "ll_flush") == 0) { - int64_t f = t.get_int(); - if (!metadata_only && - ll_files.count(f)) - client->ll_flush(ll_files[f]); - } else if (strcmp(op, "ll_fsync") == 0) { - int64_t f = t.get_int(); - if (!metadata_only && - ll_files.count(f)) - client->ll_fsync(ll_files[f], false); // FIXME dataonly param - } else if (strcmp(op, "ll_release") == 0) { - int64_t f = t.get_int(); - if (ll_files.count(f)) { - client->ll_release(ll_files[f]); - ll_files.erase(f); - } - } else if (strcmp(op, "ll_statfs") == 0) { - int64_t i = t.get_int(); - if (ll_inos.count(i)) - {} //client->ll_statfs(ll_inos[i]); - } - - - // object-level traces - - else if (strcmp(op, "o_stat") == 0) { - int64_t oh = t.get_int(); - int64_t ol = t.get_int(); - object_t oid(oh, ol); - lock.Lock(); - ObjectLayout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, 2); - off_t size; - client->objecter->stat(oid, &size, layout, new C_SafeCond(&lock, &cond, &ack)); - while (!ack) cond.Wait(lock); - lock.Unlock(); - } - else if (strcmp(op, "o_read") == 0) { - int64_t oh = t.get_int(); - int64_t ol = t.get_int(); - int64_t off = t.get_int(); - int64_t len = t.get_int(); - object_t oid(oh, ol); - lock.Lock(); - ObjectLayout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, 2); - bufferlist bl; - client->objecter->read(oid, off, len, layout, &bl, new C_SafeCond(&lock, &cond, &ack)); - while (!ack) cond.Wait(lock); - lock.Unlock(); - } - else if (strcmp(op, "o_write") == 0) { - int64_t oh = t.get_int(); - int64_t ol = t.get_int(); - int64_t off = t.get_int(); - int64_t len = t.get_int(); - object_t oid(oh, ol); - lock.Lock(); - ObjectLayout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, 2); - bufferptr bp(len); - bufferlist bl; - bl.push_back(bp); - client->objecter->write(oid, off, len, layout, bl, - new 
C_SafeCond(&lock, &cond, &ack), - safeg->new_sub()); - while (!ack) cond.Wait(lock); - lock.Unlock(); - } - else if (strcmp(op, "o_zero") == 0) { - int64_t oh = t.get_int(); - int64_t ol = t.get_int(); - int64_t off = t.get_int(); - int64_t len = t.get_int(); - object_t oid(oh, ol); - lock.Lock(); - ObjectLayout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, 2); - client->objecter->zero(oid, off, len, layout, - new C_SafeCond(&lock, &cond, &ack), - safeg->new_sub()); - while (!ack) cond.Wait(lock); - lock.Unlock(); - } - - - else { - dout(0) << (t.get_line()-1) << ": *** trace hit unrecognized symbol '" << op << "' " << dendl; - assert(0); - } - } - - dout(10) << "trace finished on line " << t.get_line() << dendl; - - // wait for safe after an object trace - safegref->finish(0); - delete safegref; - lock.Lock(); - while (!safe) { - dout(10) << "waiting for safe" << dendl; - cond.Wait(lock); - } - lock.Unlock(); - - // close open files - for (hash_map::iterator fi = open_files.begin(); - fi != open_files.end(); - fi++) { - dout(1) << "leftover close " << fi->second << dendl; - if (fi->second > 0) client->close(fi->second); - } - for (hash_map::iterator fi = open_dirs.begin(); - fi != open_dirs.end(); - fi++) { - dout(1) << "leftover closedir " << fi->second << dendl; - if (fi->second != 0) client->closedir(fi->second); - } - for (hash_map::iterator fi = ll_files.begin(); - fi != ll_files.end(); - fi++) { - dout(1) << "leftover ll_release " << fi->second << dendl; - if (fi->second > 0) client->ll_release(fi->second); - } - for (hash_map::iterator fi = ll_dirs.begin(); - fi != ll_dirs.end(); - fi++) { - dout(1) << "leftover ll_releasedir " << fi->second << dendl; - if (fi->second > 0) client->ll_releasedir(fi->second); - } - - return 0; -} - - - -int SyntheticClient::clean_dir(string& basedir) -{ - // read dir - list contents; - int r = client->getdir(basedir.c_str(), contents); - if (r < 0) { - dout(1) << "readdir on " << basedir << " returns " << r 
<< dendl; - return r; - } - - for (list::iterator it = contents.begin(); - it != contents.end(); - it++) { - if (*it == ".") continue; - if (*it == "..") continue; - string file = basedir + "/" + *it; - - if (time_to_stop()) break; - - struct stat st; - int r = client->lstat(file.c_str(), &st); - if (r < 0) { - dout(1) << "stat error on " << file << " r=" << r << dendl; - continue; - } - - if ((st.st_mode & S_IFMT) == S_IFDIR) { - clean_dir(file); - client->rmdir(file.c_str()); - } else { - client->unlink(file.c_str()); - } - } - - return 0; - -} - - -int SyntheticClient::full_walk(string& basedir) -{ - if (time_to_stop()) return -1; - - list dirq; - dirq.push_back(basedir); - - while (!dirq.empty()) { - string dir = dirq.front(); - dirq.pop_front(); - - // read dir - list contents; - int r = client->getdir(dir.c_str(), contents); - if (r < 0) { - dout(1) << "readdir on " << dir << " returns " << r << dendl; - continue; - } - - for (list::iterator it = contents.begin(); - it != contents.end(); - it++) { - if (*it == "." || - *it == "..") - continue; - string file = dir + "/" + *it; - - struct stat st; - int r = client->lstat(file.c_str(), &st); - if (r < 0) { - dout(1) << "stat error on " << file << " r=" << r << dendl; - continue; - } - - // print - char *tm = ctime(&st.st_mtime); - tm[strlen(tm)-1] = 0; - printf("%llx %c%c%c%c%c%c%c%c%c%c %2d %5d %5d %8d %12s %s\n", - (long long)st.st_ino, - S_ISDIR(st.st_mode) ? 'd':'-', - (st.st_mode & 0400) ? 'r':'-', - (st.st_mode & 0200) ? 'w':'-', - (st.st_mode & 0100) ? 'x':'-', - (st.st_mode & 040) ? 'r':'-', - (st.st_mode & 020) ? 'w':'-', - (st.st_mode & 010) ? 'x':'-', - (st.st_mode & 04) ? 'r':'-', - (st.st_mode & 02) ? 'w':'-', - (st.st_mode & 01) ? 
'x':'-', - (int)st.st_nlink, - st.st_uid, st.st_gid, - (int)st.st_size, - tm, - file.c_str()); - - - if ((st.st_mode & S_IFMT) == S_IFDIR) { - dirq.push_back(file); - } - } - } - - return 0; -} - -int SyntheticClient::make_dirs(const char *basedir, int dirs, int files, int depth) -{ - if (time_to_stop()) return 0; - - // make sure base dir exists - int r = client->mkdir(basedir, 0755); - if (r != 0) { - dout(1) << "can't make base dir? " << basedir << dendl; - return -1; - } - - // children - char d[500]; - dout(3) << "make_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << dendl; - for (int i=0; imknod(d, 0644); - } - - if (depth == 0) return 0; - - for (int i=0; ilstat(basedir, &st); - if (r != 0) { - dout(1) << "can't make base dir? " << basedir << dendl; - return -1; - } - - // children - char d[500]; - dout(3) << "stat_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << dendl; - for (int i=0; ilstat(d, &st); - } - - if (depth == 0) return 0; - - for (int i=0; i contents; - utime_t s = g_clock.now(); - int r = client->getdir(basedir, contents); - utime_t e = g_clock.now(); - e -= s; - if (client_logger) client_logger->finc("readdir", e); - if (r < 0) { - dout(0) << "read_dirs couldn't readdir " << basedir << ", stopping" << dendl; - return -1; - } - - for (int i=0; ilstat(d, &st) < 0) { - dout(2) << "read_dirs failed stat on " << d << ", stopping" << dendl; - return -1; - } - utime_t e = g_clock.now(); - e -= s; - if (client_logger) client_logger->finc("stat", e); - } - - if (depth > 0) - for (int i=0; iget_nodeid(); - char d[255]; - - if (priv) { - for (int c=0; cmkdir(d, 0755); - } - } else { - // shared - if (true || whoami == 0) { - for (int c=0; cmkdir(d, 0755); - } - } else { - sleep(2); - } - } - - // files - struct stat st; - utime_t start = g_clock.now(); - for (int c=0; cmknod(d, 0644); - - if (more) { - client->lstat(d, &st); - int fd = client->open(d, O_RDONLY); - client->unlink(d); 
- client->close(fd); - } - - if (time_to_stop()) return 0; - } - } - utime_t end = g_clock.now(); - end -= start; - dout(0) << "makefiles time is " << end << " or " << ((double)end / (double)num) <<" per file" << dendl; - - return 0; -} - -int SyntheticClient::link_test() -{ - char d[255]; - char e[255]; - - // create files - int num = 200; - - client->mkdir("orig", 0755); - client->mkdir("copy", 0755); - - utime_t start = g_clock.now(); - for (int i=0; imknod(d, 0755); - } - utime_t end = g_clock.now(); - end -= start; - - dout(0) << "orig " << end << dendl; - - // link - start = g_clock.now(); - for (int i=0; ilink(d, e); - } - end = g_clock.now(); - end -= start; - dout(0) << "copy " << end << dendl; - - return 0; -} - - -int SyntheticClient::create_shared(int num) -{ - // files - char d[255]; - for (int n=0; nmknod(d, 0644); - } - - return 0; -} - -int SyntheticClient::open_shared(int num, int count) -{ - // files - char d[255]; - for (int c=0; c fds; - for (int n=0; nopen(d,O_RDONLY); - if (fd > 0) fds.push_back(fd); - } - - if (false && client->get_nodeid() == 0) - for (int n=0; nunlink(d); - } - - while (!fds.empty()) { - int fd = fds.front(); - fds.pop_front(); - client->close(fd); - } - } - - return 0; -} - - -int SyntheticClient::write_file(string& fn, int size, int wrsize) // size is in MB, wrsize in bytes -{ - //uint64_t wrsize = 1024*256; - char *buf = new char[wrsize+100]; // 1 MB - memset(buf, 7, wrsize); - uint64_t chunks = (uint64_t)size * (uint64_t)(1024*1024) / (uint64_t)wrsize; - - int fd = client->open(fn.c_str(), O_RDWR|O_CREAT); - dout(5) << "writing to " << fn << " fd " << fd << dendl; - if (fd < 0) return fd; - - for (unsigned i=0; iget_nodeid(); - p++; - } - - client->write(fd, buf, wrsize, i*wrsize); - } - - client->close(fd); - delete[] buf; - - return 0; -} - -int SyntheticClient::write_batch(int nfile, int size, int wrsize) -{ - for (int i=0; iopen(fn.c_str(), O_RDONLY); - dout(5) << "reading from " << fn << " fd " << fd << dendl; - if 
(fd < 0) return fd; - - for (unsigned i=0; iread(fd, buf, rdsize, i*rdsize); - if (r < rdsize) { - dout(1) << "read_file got r = " << r << ", probably end of file" << dendl; - break; - } - - // verify fingerprint - int bad = 0; - uint64_t *p = (uint64_t*)buf; - uint64_t readoff; - int64_t readclient; - while ((char*)p + 32 < buf + rdsize) { - readoff = *p; - uint64_t wantoff = (uint64_t)i*(uint64_t)rdsize + (uint64_t)((char*)p - buf); - p++; - readclient = *p; - p++; - if (readoff != wantoff || - readclient != client->get_nodeid()) { - if (!bad && !ignoreprint) - dout(0) << "WARNING: wrong data from OSD, block says fileoffset=" << readoff << " client=" << readclient - << ", should be offset " << wantoff << " clietn " << client->get_nodeid() - << dendl; - bad++; - } - } - if (bad && !ignoreprint) - dout(0) << " + " << (bad-1) << " other bad 16-byte bits in this block" << dendl; - } - - client->close(fd); - delete[] buf; - - return 0; -} - - - - -class C_Ref : public Context { - Mutex& lock; - Cond& cond; - int *ref; -public: - C_Ref(Mutex &l, Cond &c, int *r) : lock(l), cond(c), ref(r) { - lock.Lock(); - (*ref)++; - lock.Unlock(); - } - void finish(int) { - lock.Lock(); - (*ref)--; - cond.Signal(); - lock.Unlock(); - } -}; - -int SyntheticClient::create_objects(int nobj, int osize, int inflight) -{ - // divy up - int numc = g_conf.num_client ? g_conf.num_client : 1; - - int start, inc, end; - - if (1) { - // strided - start = client->get_nodeid(); //nobjs % numc; - inc = numc; - end = start + nobj; - } else { - // segments - start = nobj * client->get_nodeid() / numc; - inc = 1; - end = nobj * (client->get_nodeid()+1) / numc; - } - - dout(5) << "create_objects " << nobj << " size=" << osize - << " .. 
doing [" << start << "," << end << ") inc " << inc - << dendl; - - bufferptr bp(osize); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - - Mutex lock; - Cond cond; - - int unack = 0; - int unsafe = 0; - - list starts; - - for (int i=start; iosdmap->make_object_layout(oid, pg_t::TYPE_REP, g_OSD_FileLayout.fl_pg_size); - - if (i % inflight == 0) { - dout(6) << "create_objects " << i << "/" << (nobj+1) << dendl; - } - dout(10) << "writing " << oid << dendl; - - starts.push_back(g_clock.now()); - client->client_lock.Lock(); - client->objecter->write(oid, 0, osize, layout, bl, - new C_Ref(lock, cond, &unack), - new C_Ref(lock, cond, &unsafe)); - client->client_lock.Unlock(); - - lock.Lock(); - while (unack > inflight) { - dout(20) << "waiting for " << unack << " unack" << dendl; - cond.Wait(lock); - } - lock.Unlock(); - - utime_t lat = g_clock.now(); - lat -= starts.front(); - starts.pop_front(); - if (client_logger) - client_logger->favg("owrlat", lat); - } - - lock.Lock(); - while (unack > 0) { - dout(20) << "waiting for " << unack << " unack" << dendl; - cond.Wait(lock); - } - while (unsafe > 0) { - dout(10) << "waiting for " << unsafe << " unsafe" << dendl; - cond.Wait(lock); - } - lock.Unlock(); - - dout(5) << "create_objects done" << dendl; - derr(0) << "create_objects done" << dendl; - return 0; -} - -int SyntheticClient::object_rw(int nobj, int osize, int wrpc, - int overlappc, - double rskew, double wskew) -{ - dout(5) << "object_rw " << nobj << " size=" << osize << " with " - << wrpc << "% writes" - << ", " << overlappc << "% overlap" - << ", rskew = " << rskew - << ", wskew = " << wskew - << dendl; - - bufferptr bp(osize); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - - // start with odd number > nobj - rjhash h; - unsigned prime = nobj + 1; // this is the minimum! 
- prime += h(nobj) % (3*nobj); // bump it up some - prime |= 1; // make it odd - - while (true) { - unsigned j; - for (j=2; j*j<=prime; j++) - if (prime % j == 0) break; - if (j*j > prime) { - break; - //cout << "prime " << prime << endl; - } - prime += 2; - } - - Mutex lock; - Cond cond; - - int unack = 0; - int unsafe = 0; - - while (1) { - if (time_to_stop()) break; - - // read or write? - bool write = (rand() % 100) < wrpc; - - // choose object - double r = drand48(); // [0..1) - long o; - if (write) { - o = (long)trunc(pow(r, wskew) * (double)nobj); // exponentially skew towards 0 - int pnoremap = (long)(r * 100.0); - if (pnoremap >= overlappc) - o = (o*prime) % nobj; // remap - } else { - o = (long)trunc(pow(r, rskew) * (double)nobj); // exponentially skew towards 0 - } - object_t oid(0x1000, o); - - ObjectLayout layout = client->osdmap->make_object_layout(oid, pg_t::TYPE_REP, g_OSD_FileLayout.fl_pg_size); - - client->client_lock.Lock(); - utime_t start = g_clock.now(); - if (write) { - dout(10) << "write to " << oid << dendl; - client->objecter->write(oid, 0, osize, layout, bl, - new C_Ref(lock, cond, &unack), - new C_Ref(lock, cond, &unsafe)); - } else { - dout(10) << "read from " << oid << dendl; - bufferlist inbl; - client->objecter->read(oid, 0, osize, layout, &inbl, - new C_Ref(lock, cond, &unack)); - } - client->client_lock.Unlock(); - - lock.Lock(); - while (unack > 0) { - dout(20) << "waiting for " << unack << " unack" << dendl; - cond.Wait(lock); - } - lock.Unlock(); - - utime_t lat = g_clock.now(); - lat -= start; - if (client_logger) { - if (write) - client_logger->favg("owrlat", lat); - else - client_logger->favg("ordlat", lat); - } - } - - - lock.Lock(); - while (unsafe > 0) { - dout(10) << "waiting for " << unsafe << " unsafe" << dendl; - cond.Wait(lock); - } - lock.Unlock(); - return 0; -} - - - - - -int SyntheticClient::read_random(string& fn, int size, int rdsize) // size is in MB, wrsize in bytes -{ - __uint64_t chunks = (__uint64_t)size * 
(__uint64_t)(1024*1024) / (__uint64_t)rdsize; - - int fd = client->open(fn.c_str(), O_RDWR); - dout(5) << "reading from " << fn << " fd " << fd << dendl; - - // dout(0) << "READING FROM " << fn << " fd " << fd << dendl; - - // dout(0) << "filename " << fn << " size:" << size << " read size|" << rdsize << "|" << "\ chunks: |" << chunks <<"|" << dendl; - - if (fd < 0) return fd; - int offset = 0; - char * buf = NULL; - - for (unsigned i=0; i<2000; i++) { - if (time_to_stop()) break; - - bool read=false; - - time_t seconds; - time( &seconds); - srand(seconds); - - // use rand instead ?? - double x = drand48(); - - //dout(0) << "RANDOM NUMBER RETURN |" << x << "|" << dendl; - - if ( x < 0.5) - { - //dout(0) << "DECIDED TO READ " << x << dendl; - buf = new char[rdsize]; - memset(buf, 1, rdsize); - read=true; - } - else - { - // dout(0) << "DECIDED TO WRITE " << x << dendl; - buf = new char[rdsize+100]; // 1 MB - memset(buf, 7, rdsize); - } - - //double y = drand48() ; - - //dout(0) << "OFFSET is |" << offset << "| chunks |" << chunks<< dendl; - - if ( read) - { - offset=(rand())%(chunks+1); - dout(2) << "reading block " << offset << "/" << chunks << dendl; - - int r = client->read(fd, buf, rdsize, - offset*rdsize); - if (r < rdsize) { - dout(1) << "read_file got r = " << r << ", probably end of file" << dendl; - } - } - else - { - dout(2) << "writing block " << offset << "/" << chunks << dendl; - - // fill buf with a 16 byte fingerprint - // 64 bits : file offset - // 64 bits : client id - // = 128 bits (16 bytes) - - //if (true ) - //{ - //int count = rand()%10; - - //for ( int j=0;jget_nodeid(); - p++; - } - - client->write(fd, buf, rdsize, - offset*rdsize); - //} - //} - } - - // verify fingerprint - if ( read ) - { - int bad = 0; - __int64_t *p = (__int64_t*)buf; - __int64_t readoff, readclient; - while ((char*)p + 32 < buf + rdsize) { - readoff = *p; - __int64_t wantoff = offset*rdsize + (__int64_t)((char*)p - buf); - p++; - readclient = *p; - p++; - if (readoff != 
wantoff || - readclient != client->get_nodeid()) { - if (!bad) - dout(0) << "WARNING: wrong data from OSD, block says fileoffset=" << readoff << " client=" << readclient - << ", should be offset " << wantoff << " clietn " << client->get_nodeid() - << dendl; - bad++; - } - } - if (bad) - dout(0) << " + " << (bad-1) << " other bad 16-byte bits in this block" << dendl; - } - } - - client->close(fd); - delete[] buf; - - return 0; -} - - -//#include -//#include - -int normdist(int min, int max, int stdev) /* specifies input values */; -//main() -//{ - // for ( int i=0; i < 10; i++ ) - // normdist ( 0 , 10, 1 ); - -//} - - -int normdist(int min, int max, int stdev) /* specifies input values */ -{ - /* min: Minimum value; max: Maximum value; stdev: degree of deviation */ - - //int min, max, stdev; { - time_t seconds; - time( &seconds); - srand(seconds); - - int range, iterate, result; - /* declare range, iterate and result as integers, to avoid the need for - floating point math*/ - - result = 0; - /* ensure result is initialized to 0 */ - - range = max -min; - /* calculate range of possible values between the max and min values */ - - iterate = range / stdev; - /* this number of iterations ensures the proper shape of the resulting - curve */ - - stdev += 1; /* compensation for integer vs. 
floating point math */ - for (int c = iterate; c != 0; c--) /* loop through iterations */ - { - // result += (uniform (1, 100) * stdev) / 100; /* calculate and - result += ( (rand()%100 + 1) * stdev) / 100; - // printf("result=%d\n", result ); - } - printf("\n final result=%d\n", result ); - return result + min; /* send final result back */ -} - -int SyntheticClient::read_random_ex(string& fn, int size, int rdsize) // size is in MB, wrsize in bytes -{ - __uint64_t chunks = (__uint64_t)size * (__uint64_t)(1024*1024) / (__uint64_t)rdsize; - - int fd = client->open(fn.c_str(), O_RDWR); - dout(5) << "reading from " << fn << " fd " << fd << dendl; - - // dout(0) << "READING FROM " << fn << " fd " << fd << dendl; - - // dout(0) << "filename " << fn << " size:" << size << " read size|" << rdsize << "|" << "\ chunks: |" << chunks <<"|" << dendl; - - if (fd < 0) return fd; - int offset = 0; - char * buf = NULL; - - for (unsigned i=0; i<2000; i++) { - if (time_to_stop()) break; - - bool read=false; - - time_t seconds; - time( &seconds); - srand(seconds); - - // use rand instead ?? 
- double x = drand48(); - - //dout(0) << "RANDOM NUMBER RETURN |" << x << "|" << dendl; - - if ( x < 0.5) - { - //dout(0) << "DECIDED TO READ " << x << dendl; - buf = new char[rdsize]; - memset(buf, 1, rdsize); - read=true; - } - else - { - // dout(0) << "DECIDED TO WRITE " << x << dendl; - buf = new char[rdsize+100]; // 1 MB - memset(buf, 7, rdsize); - } - - //double y = drand48() ; - - //dout(0) << "OFFSET is |" << offset << "| chunks |" << chunks<< dendl; - - if ( read) - { - //offset=(rand())%(chunks+1); - - /* if ( chunks > 10000 ) - offset= normdist( 0 , chunks/1000 , 5 )*1000; - else if ( chunks > 1000 ) - offset= normdist( 0 , chunks/100 , 5 )*100; - else if ( chunks > 100 ) - offset= normdist( 0 , chunks/20 , 5 )*20;*/ - - - dout(2) << "reading block " << offset << "/" << chunks << dendl; - - int r = client->read(fd, buf, rdsize, - offset*rdsize); - if (r < rdsize) { - dout(1) << "read_file got r = " << r << ", probably end of file" << dendl; - } - } - else - { - dout(2) << "writing block " << offset << "/" << chunks << dendl; - - // fill buf with a 16 byte fingerprint - // 64 bits : file offset - // 64 bits : client id - // = 128 bits (16 bytes) - - //if (true ) - //{ - int count = rand()%10; - - for ( int j=0;jget_nodeid(); - p++; - } - - client->write(fd, buf, rdsize, - offset*rdsize); - } - //} - } - - // verify fingerprint - if ( read ) - { - int bad = 0; - __int64_t *p = (__int64_t*)buf; - __int64_t readoff, readclient; - while ((char*)p + 32 < buf + rdsize) { - readoff = *p; - __int64_t wantoff = offset*rdsize + (__int64_t)((char*)p - buf); - p++; - readclient = *p; - p++; - if (readoff != wantoff || - readclient != client->get_nodeid()) { - if (!bad) - dout(0) << "WARNING: wrong data from OSD, block says fileoffset=" << readoff << " client=" << readclient - << ", should be offset " << wantoff << " clietn " << client->get_nodeid() - << dendl; - bad++; - } - } - if (bad) - dout(0) << " + " << (bad-1) << " other bad 16-byte bits in this block" << 
dendl; - } - } - - client->close(fd); - delete[] buf; - - return 0; -} - - -int SyntheticClient::random_walk(int num_req) -{ - int left = num_req; - - //dout(1) << "random_walk() will do " << left << " ops" << dendl; - - init_op_dist(); // set up metadata op distribution - - while (left > 0) { - left--; - - if (time_to_stop()) break; - - // ascend? - if (cwd.depth() && !roll_die(::pow((double).9, (double)cwd.depth()))) { - dout(DBL) << "die says up" << dendl; - up(); - continue; - } - - // descend? - if (.9*roll_die(::pow((double).9,(double)cwd.depth())) && subdirs.size()) { - string s = get_random_subdir(); - cwd.push_dentry( s ); - dout(DBL) << "cd " << s << " -> " << cwd << dendl; - clear_dir(); - continue; - } - - int op = 0; - filepath path; - - if (contents.empty() && roll_die(.3)) { - if (did_readdir) { - dout(DBL) << "empty dir, up" << dendl; - up(); - } else - op = MDS_OP_READDIR; - } else { - op = op_dist.sample(); - } - //dout(DBL) << "op is " << op << dendl; - - int r = 0; - - // do op - if (op == MDS_OP_UNLINK) { - if (contents.empty()) - op = MDS_OP_READDIR; - else - r = client->unlink( get_random_sub() ); // will fail on dirs - } - - if (op == MDS_OP_RENAME) { - if (contents.empty()) - op = MDS_OP_READDIR; - else { - r = client->rename( get_random_sub(), make_sub("ren") ); - } - } - - if (op == MDS_OP_MKDIR) { - r = client->mkdir( make_sub("mkdir"), 0755); - } - - if (op == MDS_OP_RMDIR) { - if (!subdirs.empty()) - r = client->rmdir( get_random_subdir() ); - else - r = client->rmdir( cwd.c_str() ); // will pbly fail - } - - if (op == MDS_OP_SYMLINK) { - } - - if (op == MDS_OP_CHMOD) { - if (contents.empty()) - op = MDS_OP_READDIR; - else - r = client->chmod( get_random_sub(), rand() & 0755 ); - } - - if (op == MDS_OP_CHOWN) { - if (contents.empty()) r = client->chown( cwd.c_str(), rand(), rand() ); - else - r = client->chown( get_random_sub(), rand(), rand() ); - } - - if (op == MDS_OP_LINK) { - } - - if (op == MDS_OP_UTIME) { - struct utimbuf b; - 
memset(&b, 1, sizeof(b)); - if (contents.empty()) - r = client->utime( cwd.c_str(), &b ); - else - r = client->utime( get_random_sub(), &b ); - } - - if (op == MDS_OP_MKNOD) { - r = client->mknod( make_sub("mknod"), 0644); - } - - if (op == MDS_OP_OPEN) { - if (contents.empty()) - op = MDS_OP_READDIR; - else { - r = client->open( get_random_sub(), O_RDONLY ); - if (r > 0) { - assert(open_files.count(r) == 0); - open_files.insert(r); - } - } - } - - if (op == MDS_OP_RELEASE) { // actually, close - if (open_files.empty()) - op = MDS_OP_STAT; - else { - int fh = get_random_fh(); - r = client->close( fh ); - if (r == 0) open_files.erase(fh); - } - } - - if (op == MDS_OP_STAT) { - struct stat st; - if (contents.empty()) { - if (did_readdir) { - if (roll_die(.1)) { - dout(DBL) << "stat in empty dir, up" << dendl; - up(); - } else { - op = MDS_OP_MKNOD; - } - } else - op = MDS_OP_READDIR; - } else - r = client->lstat(get_random_sub(), &st); - } - - if (op == MDS_OP_READDIR) { - clear_dir(); - - list c; - r = client->getdir( cwd.c_str(), c ); - - for (list::iterator it = c.begin(); - it != c.end(); - it++) { - //dout(DBL) << " got " << *it << dendl; - assert(0); - /*contents[*it] = it->second; - if (it->second && - S_ISDIR(it->second->st_mode)) - subdirs.insert(*it); - */ - } - - did_readdir = true; - } - - // errors? - if (r < 0) { - // reevaluate cwd. - //while (cwd.depth()) { - //if (client->lookup(cwd)) break; // it's in the cache - - //dout(DBL) << "r = " << r << ", client doesn't have " << cwd << ", cd .." << dendl; - dout(DBL) << "r = " << r << ", client may not have " << cwd << ", cd .." 
<< dendl; - up(); - //} - } - } - - // close files - dout(DBL) << "closing files" << dendl; - while (!open_files.empty()) { - int fh = get_random_fh(); - int r = client->close( fh ); - if (r == 0) open_files.erase(fh); - } - - dout(DBL) << "done" << dendl; - return 0; -} - - - - -void SyntheticClient::make_dir_mess(const char *basedir, int n) -{ - vector dirs; - - dirs.push_back(basedir); - dirs.push_back(basedir); - - client->mkdir(basedir, 0755); - - // motivation: - // P(dir) ~ subdirs_of(dir) + 2 - // from 5-year metadata workload paper in fast'07 - - // create dirs - for (int i=0; imkdir(dir.c_str(), 0755); - } - - -} - - - -void SyntheticClient::foo() -{ - if (1) { - // open some files - srand(0); - for (int i=0; i<20; i++) { - int s = 5; - int a = rand() % s; - int b = rand() % s; - int c = rand() % s; - char src[80]; - sprintf(src, "syn.0.0/dir.%d/dir.%d/file.%d", a, b, c); - //int fd = - client->open(src, O_RDONLY); - } - - return; - } - - if (0) { - // rename fun - for (int i=0; i<100; i++) { - int s = 5; - int a = rand() % s; - int b = rand() % s; - int c = rand() % s; - int d = rand() % s; - int e = rand() % s; - int f = rand() % s; - char src[80]; - char dst[80]; - sprintf(src, "syn.0.0/dir.%d/dir.%d/file.%d", a, b, c); - sprintf(dst, "syn.0.0/dir.%d/dir.%d/file.%d", d, e, f); - client->rename(src, dst); - } - return; - } - - if (1) { - // link fun - srand(0); - for (int i=0; i<100; i++) { - int s = 5; - int a = rand() % s; - int b = rand() % s; - int c = rand() % s; - int d = rand() % s; - int e = rand() % s; - int f = rand() % s; - char src[80]; - char dst[80]; - sprintf(src, "syn.0.0/dir.%d/dir.%d/file.%d", a, b, c); - sprintf(dst, "syn.0.0/dir.%d/dir.%d/newlink.%d", d, e, f); - client->link(src, dst); - } - srand(0); - for (int i=0; i<100; i++) { - int s = 5; - int a = rand() % s; - int b = rand() % s; - int c = rand() % s; - int d = rand() % s; - int e = rand() % s; - int f = rand() % s; - char src[80]; - char dst[80]; - sprintf(src, 
"syn.0.0/dir.%d/dir.%d/file.%d", a, b, c); - sprintf(dst, "syn.0.0/dir.%d/dir.%d/newlink.%d", d, e, f); - client->unlink(dst); - } - - - return; - } - - // link fun - client->mknod("one", 0755); - client->mknod("two", 0755); - client->link("one", "three"); - client->mkdir("dir", 0755); - client->link("two", "/dir/twolink"); - client->link("dir/twolink", "four"); - - // unlink fun - client->mknod("a", 0644); - client->unlink("a"); - client->mknod("b", 0644); - client->link("b", "c"); - client->unlink("c"); - client->mkdir("d", 0755); - client->unlink("d"); - client->rmdir("d"); - - // rename fun - client->mknod("p1", 0644); - client->mknod("p2", 0644); - client->rename("p1","p2"); - client->mknod("p3", 0644); - client->rename("p3","p4"); - - // check dest dir ambiguity thing - client->mkdir("dir1", 0755); - client->mkdir("dir2", 0755); - client->rename("p2","dir1/p2"); - client->rename("dir1/p2","dir2/p2"); - client->rename("dir2/p2","/p2"); - - // check primary+remote link merging - client->link("p2","p2.l"); - client->link("p4","p4.l"); - client->rename("p2.l","p2"); - client->rename("p4","p4.l"); - - // check anchor updates - client->mknod("dir1/a", 0644); - client->link("dir1/a", "da1"); - client->link("dir1/a", "da2"); - client->link("da2","da3"); - client->rename("dir1/a","dir2/a"); - client->rename("dir2/a","da2"); - client->rename("da1","da2"); - client->rename("da2","da3"); - - // check directory renames - client->mkdir("dir3", 0755); - client->mknod("dir3/asdf", 0644); - client->mkdir("dir4", 0755); - client->mkdir("dir5", 0755); - client->mknod("dir5/asdf", 0644); - client->rename("dir3","dir4"); // ok - client->rename("dir4","dir5"); // fail -} - -int SyntheticClient::thrash_links(const char *basedir, int dirs, int files, int depth, int n) -{ - dout(1) << "thrash_links " << basedir << " " << dirs << " " << files << " " << depth - << " links " << n - << dendl; - - if (time_to_stop()) return 0; - - for (int k=0; krename(dst.c_str(), "/tmp") == 0) { - 
client->rename(src.c_str(), dst.c_str()); - client->rename("/tmp", src.c_str()); - } - continue; - } - - // pick a dest dir - string src = basedir; - { - char t[80]; - for (int d=0; dmknod(src.c_str(), 0755); - client->rename(src.c_str(), dst.c_str()); - break; - case 1: - client->mknod(src.c_str(), 0755); - client->unlink(dst.c_str()); - client->link(src.c_str(), dst.c_str()); - break; - case 2: client->unlink(src.c_str()); break; - case 3: client->unlink(dst.c_str()); break; - //case 4: client->mknod(src.c_str(), 0755); break; - //case 5: client->mknod(dst.c_str(), 0755); break; - } - } - return 0; - - // now link shit up - for (int i=0; ilink(file.c_str(), ln.c_str()); - } - - return 0; -} - - - - -void SyntheticClient::import_find(const char *base, const char *find, bool data) -{ - dout(1) << "import_find " << base << " from " << find << " data=" << data << dendl; - - /* use this to gather the static trace: - * - * find . -exec ls -dilsn --time-style=+%s \{\} \; - * or if it's wafl, - * find . -path ./.snapshot -prune -o -exec ls -dilsn --time-style=+%s \{\} \; - * - */ - - if (base[0] != '-') - client->mkdir(base, 0755); - - ifstream f(find); - assert(f.is_open()); - - int dirnum = 0; - - while (!f.eof()) { - uint64_t ino; - int dunno, nlink; - string modestring; - int uid, gid; - off_t size; - time_t mtime; - string filename; - f >> ino; - if (f.eof()) break; - f >> dunno; - f >> modestring; - f >> nlink; - f >> uid; - f >> gid; - f >> size; - f >> mtime; - f.seekg(1, ios::cur); - getline(f, filename); - - // ignore "." - if (filename == ".") continue; - - // remove leading ./ - assert(filename[0] == '.' && filename[1] == '/'); - filename = filename.substr(2); - - // new leading dir? 
- int sp = filename.find("/"); - if (sp < 0) dirnum++; - - //dout(0) << "leading dir " << filename << " " << dirnum << dendl; - if (dirnum % g_conf.num_client != client->get_nodeid()) { - dout(20) << "skipping leading dir " << dirnum << " " << filename << dendl; - continue; - } - - // parse the mode - assert(modestring.length() == 10); - mode_t mode = 0; - switch (modestring[0]) { - case 'd': mode |= S_IFDIR; break; - case 'l': mode |= S_IFLNK; break; - default: - case '-': mode |= S_IFREG; break; - } - if (modestring[1] == 'r') mode |= 0400; - if (modestring[2] == 'w') mode |= 0200; - if (modestring[3] == 'x') mode |= 0100; - if (modestring[4] == 'r') mode |= 040; - if (modestring[5] == 'w') mode |= 020; - if (modestring[6] == 'x') mode |= 010; - if (modestring[7] == 'r') mode |= 04; - if (modestring[8] == 'w') mode |= 02; - if (modestring[9] == 'x') mode |= 01; - - dout(20) << " mode " << modestring << " to " << oct << mode << dec << dendl; - - if (S_ISLNK(mode)) { - // target vs destination - int pos = filename.find(" -> "); - assert(pos > 0); - string link; - if (base[0] != '-') { - link = base; - link += "/"; - } - link += filename.substr(0, pos); - string target; - if (filename[pos+4] == '/') { - if (base[0] != '-') - target = base; - target += filename.substr(pos + 4); - } else { - target = filename.substr(pos + 4); - } - dout(10) << "symlink from '" << link << "' -> '" << target << "'" << dendl; - client->symlink(target.c_str(), link.c_str()); - } else { - string f; - if (base[0] != '-') { - f = base; - f += "/"; - } - f += filename; - if (S_ISDIR(mode)) { - client->mkdir(f.c_str(), mode); - } else { - int fd = client->open(f.c_str(), O_WRONLY|O_CREAT, mode & 0777); - assert(fd > 0); - client->write(fd, "", 0, size); - client->close(fd); - - //client->chmod(f.c_str(), mode & 0777); - client->chown(f.c_str(), uid, gid); - - struct utimbuf ut; - ut.modtime = mtime; - ut.actime = mtime; - client->utime(f.c_str(), &ut); - } - } - } - - -} - diff --git 
a/branches/sage/mds/client/SyntheticClient.h b/branches/sage/mds/client/SyntheticClient.h deleted file mode 100644 index ce09b18addfb2..0000000000000 --- a/branches/sage/mds/client/SyntheticClient.h +++ /dev/null @@ -1,241 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __SYNTHETICCLIENT_H -#define __SYNTHETICCLIENT_H - -#include - -#include "Client.h" -#include "include/Distribution.h" - -#include "Trace.h" - -#define SYNCLIENT_MODE_RANDOMWALK 1 -#define SYNCLIENT_MODE_FULLWALK 2 -#define SYNCLIENT_MODE_REPEATWALK 3 - -#define SYNCLIENT_MODE_MAKEDIRMESS 7 -#define SYNCLIENT_MODE_MAKEDIRS 8 // dirs files depth -#define SYNCLIENT_MODE_STATDIRS 9 // dirs files depth -#define SYNCLIENT_MODE_READDIRS 10 // dirs files depth - -#define SYNCLIENT_MODE_MAKEFILES 11 // num count private -#define SYNCLIENT_MODE_MAKEFILES2 12 // num count private -#define SYNCLIENT_MODE_CREATESHARED 13 // num -#define SYNCLIENT_MODE_OPENSHARED 14 // num count - -#define SYNCLIENT_MODE_WRITEFILE 20 -#define SYNCLIENT_MODE_READFILE 21 -#define SYNCLIENT_MODE_WRITEBATCH 22 -#define SYNCLIENT_MODE_WRSHARED 23 -#define SYNCLIENT_MODE_READSHARED 24 -#define SYNCLIENT_MODE_RDWRRANDOM 25 -#define SYNCLIENT_MODE_RDWRRANDOM_EX 26 - -#define SYNCLIENT_MODE_LINKTEST 27 - -#define SYNCLIENT_MODE_TRACE 30 - -#define SYNCLIENT_MODE_CREATEOBJECTS 35 -#define SYNCLIENT_MODE_OBJECTRW 36 - -#define SYNCLIENT_MODE_OPENTEST 40 -#define SYNCLIENT_MODE_OPTEST 41 - -#define SYNCLIENT_MODE_ONLY 50 -#define SYNCLIENT_MODE_ONLYRANGE 51 -#define SYNCLIENT_MODE_EXCLUDE 52 -#define SYNCLIENT_MODE_EXCLUDERANGE 53 - -#define 
SYNCLIENT_MODE_UNTIL 55 -#define SYNCLIENT_MODE_SLEEPUNTIL 56 - -#define SYNCLIENT_MODE_RANDOMSLEEP 61 -#define SYNCLIENT_MODE_SLEEP 62 - -#define SYNCLIENT_MODE_TRUNCATE 200 - -#define SYNCLIENT_MODE_FOO 100 -#define SYNCLIENT_MODE_THRASHLINKS 101 - -#define SYNCLIENT_MODE_IMPORTFIND 300 - - - -void parse_syn_options(vector& args); - -class SyntheticClient { - Client *client; - - pthread_t thread_id; - - Distribution op_dist; - - void init_op_dist(); - int get_op(); - - - filepath cwd; - map contents; - set subdirs; - bool did_readdir; - set open_files; - - void up(); - - void clear_dir() { - contents.clear(); - subdirs.clear(); - did_readdir = false; - } - - int get_random_fh() { - int r = rand() % open_files.size(); - set::iterator it = open_files.begin(); - while (r--) it++; - return *it; - } - - - filepath n1; - const char *get_random_subdir() { - assert(!subdirs.empty()); - int r = ((rand() % subdirs.size()) + (rand() % subdirs.size())) / 2; // non-uniform distn - set::iterator it = subdirs.begin(); - while (r--) it++; - - n1 = cwd; - n1.push_dentry( *it ); - return n1.get_path().c_str(); - } - filepath n2; - const char *get_random_sub() { - assert(!contents.empty()); - int r = ((rand() % contents.size()) + (rand() % contents.size())) / 2; // non-uniform distn - if (cwd.depth() && cwd.last_dentry().length()) - r += cwd.last_dentry().c_str()[0]; // slightly permuted - r %= contents.size(); - - map::iterator it = contents.begin(); - while (r--) it++; - - n2 = cwd; - n2.push_dentry( it->first ); - return n2.get_path().c_str(); - } - - filepath sub; - char sub_s[50]; - const char *make_sub(char *base) { - sprintf(sub_s, "%s.%d", base, rand() % 100); - string f = sub_s; - sub = cwd; - sub.push_dentry(f); - return sub.c_str(); - } - - public: - SyntheticClient(Client *client); - - int start_thread(); - int join_thread(); - - int run(); - - bool run_me() { - if (run_only >= 0) { - if (run_only == client->get_nodeid()) - return true; - else - return false; - } - 
return true; - } - void did_run_me() { - run_only = -1; - run_until = utime_t(); - } - - // run() will do one of these things: - list modes; - list sargs; - list iargs; - utime_t run_start; - utime_t run_until; - - int run_only; - int exclude; - - string get_sarg(int seq); - int get_iarg() { - int i = iargs.front(); - iargs.pop_front(); - return i; - } - - bool time_to_stop() { - utime_t now = g_clock.now(); - if (0) cout << "time_to_stop .. now " << now - << " until " << run_until - << " start " << run_start - << std::endl; - if (run_until.sec() && now > run_until) - return true; - else - return false; - } - - string compose_path(string& prefix, char *rest) { - return prefix + rest; - } - - int full_walk(string& fromdir); - int random_walk(int n); - - int make_dirs(const char *basedir, int dirs, int files, int depth); - int stat_dirs(const char *basedir, int dirs, int files, int depth); - int read_dirs(const char *basedir, int dirs, int files, int depth); - int make_files(int num, int count, int priv, bool more); - int link_test(); - - int create_shared(int num); - int open_shared(int num, int count); - - int write_file(string& fn, int mb, int chunk); - int write_batch(int nfile, int mb, int chunk); - int read_file(string& fn, int mb, int chunk, bool ignoreprint=false); - - int create_objects(int nobj, int osize, int inflight); - int object_rw(int nobj, int osize, int wrpc, int overlap, - double rskew, double wskew); - - int read_random(string& fn, int mb, int chunk); - int read_random_ex(string& fn, int mb, int chunk); - - int clean_dir(string& basedir); - - int play_trace(Trace& t, string& prefix, bool metadata_only=false); - - void make_dir_mess(const char *basedir, int n); - void foo(); - - int thrash_links(const char *basedir, int dirs, int files, int depth, int n); - - void import_find(const char *basedir, const char *find, bool writedata); - -}; - -#endif diff --git a/branches/sage/mds/client/Trace.cc b/branches/sage/mds/client/Trace.cc deleted file mode 
100644 index 31bb1c4cf5c4a..0000000000000 --- a/branches/sage/mds/client/Trace.cc +++ /dev/null @@ -1,83 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "Trace.h" - -#include -#include -#include -#include -using namespace __gnu_cxx; - -#include "common/Mutex.h" - -#include "config.h" - -#include -#include -#include - - - - - -void Trace::start() -{ - //cout << "start" << std::endl; - delete fs; - - fs = new ifstream(); - fs->open(filename); - if (!fs->is_open()) { - generic_dout(0) << "** unable to open trace file " << filename << dendl; - assert(0); - } - generic_dout(2) << "opened traced file '" << filename << "'" << dendl; - - // read first line - getline(*fs, line); - //cout << "first line is " << line << std::endl; - - _line = 1; -} - -const char *Trace::peek_string(char *buf, const char *prefix) -{ - //if (prefix) cout << "prefix '" << prefix << "' line '" << line << "'" << std::endl; - if (prefix && - strstr(line.c_str(), "/prefix") == line.c_str()) { - strcpy(buf, prefix); - strcpy(buf + strlen(prefix), - line.c_str() + strlen("/prefix")); - } else { - strcpy(buf, line.c_str()); - } - return buf; -} - - -const char *Trace::get_string(char *buf, const char *prefix) -{ - peek_string(buf, prefix); - - //cout << "buf is " << buf << std::endl; - // read next line (and detect eof early) - _line++; - getline(*fs, line); - //cout << "next line is " << line << std::endl; - - return buf; -} diff --git a/branches/sage/mds/client/Trace.h b/branches/sage/mds/client/Trace.h deleted file mode 100644 index 97821f4e95e56..0000000000000 --- a/branches/sage/mds/client/Trace.h +++ /dev/null 
@@ -1,63 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __CLIENT_TRACE_H -#define __CLIENT_TRACE_H - -#include -#include -#include -#include -using std::list; -using std::string; -using std::ifstream; - -/* - - this class is more like an iterator over a constant tokenlist (which - is protected by a mutex, see Trace.cc) - - */ - -class Trace { - int _line; - const char *filename; - ifstream *fs; - string line; - - public: - Trace(const char* f) : filename(f), fs(0) {} - ~Trace() { - delete fs; - } - - int get_line() { return _line; } - - void start(); - - const char *peek_string(char *buf, const char *prefix); - const char *get_string(char *buf, const char *prefix); - - __int64_t get_int() { - char buf[20]; - return atoll(get_string(buf, 0)); - } - bool end() { - return !fs || fs->eof(); - //return _cur == _end; - } -}; - -#endif diff --git a/branches/sage/mds/client/fuse.cc b/branches/sage/mds/client/fuse.cc deleted file mode 100644 index 64198dc41df51..0000000000000 --- a/branches/sage/mds/client/fuse.cc +++ /dev/null @@ -1,306 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -/* - FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi - - This program can be distributed under the terms of the GNU GPL. - See the file COPYING. -*/ - - -// fuse crap -#ifdef linux -/* For pread()/pwrite() */ -#define _XOPEN_SOURCE 500 -#endif - -#define FUSE_USE_VERSION 26 - -#include -#include -#include -#include -#include -#include -#include -#include - - -// ceph stuff -#include "include/types.h" - -#include "Client.h" - -#include "config.h" - -// globals -static Client *client; // the ceph client - - - -// ------ -// fuse hooks - -static int ceph_getattr(const char *path, struct stat *stbuf) -{ - return client->lstat(path, stbuf); -} - -static int ceph_readlink(const char *path, char *buf, size_t size) -{ - int res; - - res = client->readlink(path, buf, size - 1); - if (res < 0) return res; - - buf[res] = '\0'; - return 0; -} - -static int ceph_mknod(const char *path, mode_t mode, dev_t rdev) -{ - return client->mknod(path, mode); -} - -static int ceph_mkdir(const char *path, mode_t mode) -{ - return client->mkdir(path, mode); -} - -static int ceph_unlink(const char *path) -{ - return client->unlink(path); -} - -static int ceph_rmdir(const char *path) -{ - return client->rmdir(path); -} - -static int ceph_symlink(const char *from, const char *to) -{ - return client->symlink(from, to); -} - -static int ceph_rename(const char *from, const char *to) -{ - return client->rename(from, to); -} - -static int ceph_link(const char *from, const char *to) -{ - return client->link(from, to); -} - -static int ceph_chmod(const char *path, mode_t mode) -{ - return client->chmod(path, mode); -} - -static int ceph_chown(const char *path, uid_t uid, gid_t gid) -{ - return client->chown(path, uid, gid); -} - -static int ceph_truncate(const char *path, off_t size) -{ - return client->truncate(path, size); -} - -static int ceph_utime(const char *path, struct utimbuf *buf) -{ - return client->utime(path, buf); -} - - -// ------------------ -// 
file i/o - -static int ceph_open(const char *path, struct fuse_file_info *fi) -{ - int res; - - res = client->open(path, fi->flags, 0); - if (res < 0) return res; - fi->fh = res; - return 0; // fuse wants 0 onsucess -} - -static int ceph_read(const char *path, char *buf, size_t size, off_t offset, - struct fuse_file_info *fi) -{ - int fd = fi->fh; - return client->read(fd, buf, size, offset); -} - -static int ceph_write(const char *path, const char *buf, size_t size, - off_t offset, struct fuse_file_info *fi) -{ - int fd = fi->fh; - return client->write(fd, buf, size, offset); -} - -static int ceph_flush(const char *path, struct fuse_file_info *fi) -{ - //int fh = fi->fh; - //return client->flush(fh); - return 0; -} - -static int ceph_statfs(const char *path, struct statvfs *stbuf) -{ - return client->statfs(path, stbuf); -} - -static int ceph_release(const char *path, struct fuse_file_info *fi) -{ - int fd = fi->fh; - int r = client->close(fd); // close the file - return r; -} - -static int ceph_fsync(const char *path, int isdatasync, - struct fuse_file_info *fi) -{ - int fd = fi->fh; - return client->fsync(fd, isdatasync ? true:false); -} - - -// --------------------- -// directory i/o - -static int ceph_opendir(const char *path, struct fuse_file_info *fi) -{ - DIR *dirp; - int r = client->opendir(path, &dirp); - if (r < 0) return r; - fi->fh = (uint64_t)(void*)dirp; - return 0; -} - -static int ceph_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t off, fuse_file_info *fi) -{ - DIR *dirp = (DIR*)fi->fh; - - client->seekdir(dirp, off); - - int res = 0; - struct dirent de; - struct stat st; - int stmask = 0; - while (res == 0) { - int r = client->readdirplus_r(dirp, &de, &st, &stmask); - if (r != 0) break; - int stneed = STAT_MASK_INO | STAT_MASK_TYPE; - res = filler(buf, - de.d_name, - ((stmask & stneed) == stneed) ? 
&st:0, - client->telldir(dirp)); - } - return 0; -} - -static int ceph_releasedir(const char *path, struct fuse_file_info *fi) -{ - DIR *dirp = (DIR*)fi->fh; - int r = client->closedir(dirp); // close the file - return r; -} - - - - - -static struct fuse_operations ceph_oper = { - getattr: ceph_getattr, - readlink: ceph_readlink, - getdir: 0, - mknod: ceph_mknod, - mkdir: ceph_mkdir, - unlink: ceph_unlink, - rmdir: ceph_rmdir, - symlink: ceph_symlink, - rename: ceph_rename, - link: ceph_link, - chmod: ceph_chmod, - chown: ceph_chown, - truncate: ceph_truncate, - utime: ceph_utime, - open: ceph_open, - read: ceph_read, - write: ceph_write, - statfs: ceph_statfs, - flush: ceph_flush, - release: ceph_release, - fsync: ceph_fsync, - setxattr: 0, - getxattr: 0, - listxattr: 0, - removexattr: 0, - opendir: ceph_opendir, - readdir: ceph_readdir, - releasedir: ceph_releasedir -}; - - -int ceph_fuse_main(Client *c, int argc, char *argv[]) -{ - // init client - client = c; - - // set up fuse argc/argv - int newargc = 0; - char **newargv = (char **) malloc((argc + 10) * sizeof(char *)); - newargv[newargc++] = argv[0]; - - // allow other (all!) users to see my file system - // NOTE: echo user_allow_other >> /etc/fuse.conf - // NB: seems broken on Darwin -#ifndef DARWIN - newargv[newargc++] = "-o"; - newargv[newargc++] = "allow_other"; -#endif // DARWIN - - // use inos - newargv[newargc++] = "-o"; - newargv[newargc++] = "use_ino"; - - // large reads, direct_io (no kernel cachine) - //newargv[newargc++] = "-o"; - //newargv[newargc++] = "large_read"; - if (g_conf.fuse_direct_io) { - newargv[newargc++] = "-o"; - newargv[newargc++] = "direct_io"; - } - - // disable stupid fuse unlink hiding thing - newargv[newargc++] = "-o"; - newargv[newargc++] = "hard_remove"; - - // force into foreground - // -> we can watch stdout this way!! - newargv[newargc++] = "-f"; - - // copy rest of cmdline (hopefully, the mount point!) 
- for (int argctr = 1; argctr < argc; argctr++) newargv[newargc++] = argv[argctr]; - - // go fuse go - cout << "ok, calling fuse_main" << std::endl; - int r = fuse_main(newargc, newargv, &ceph_oper, 0); - return r; -} diff --git a/branches/sage/mds/client/fuse.h b/branches/sage/mds/client/fuse.h deleted file mode 100644 index dfacbaa4fdd85..0000000000000 --- a/branches/sage/mds/client/fuse.h +++ /dev/null @@ -1,24 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -/* ceph_fuse_main - * - start up fuse glue, attached to Client* cl. - * - argc, argv should include a mount point, and - * any weird fuse options you want. by default, - * we will put fuse in the foreground so that it - * won't fork and we can see stdout. - */ -int ceph_fuse_main(Client *cl, int argc, char *argv[]); diff --git a/branches/sage/mds/client/fuse_ll.cc b/branches/sage/mds/client/fuse_ll.cc deleted file mode 100644 index f1f92b0cd01b3..0000000000000 --- a/branches/sage/mds/client/fuse_ll.cc +++ /dev/null @@ -1,397 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#define FUSE_USE_VERSION 26 - -#include -#include -#include -#include -#include -#include -#include -#include - -// ceph -#include "include/types.h" -#include "Client.h" -#include "config.h" - -static Client *client; - - -static void ceph_ll_lookup(fuse_req_t req, fuse_ino_t parent, const char *name) -{ - struct fuse_entry_param fe; - int stmask; - - memset(&fe, 0, sizeof(fe)); - stmask = client->ll_lookup(parent, name, &fe.attr); - if (stmask >= 0) { - fe.ino = fe.attr.st_ino; - fuse_reply_entry(req, &fe); - } else { - fuse_reply_err(req, ENOENT); - } -} - -static void ceph_ll_forget(fuse_req_t req, fuse_ino_t ino, long unsigned nlookup) -{ - client->ll_forget(ino, nlookup); - fuse_reply_none(req); -} - -static void ceph_ll_getattr(fuse_req_t req, fuse_ino_t ino, - struct fuse_file_info *fi) -{ - struct stat stbuf; - - (void) fi; - - if (client->ll_getattr(ino, &stbuf) == 0) - fuse_reply_attr(req, &stbuf, 0); - else - fuse_reply_err(req, ENOENT); -} - -static void ceph_ll_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, - int to_set, struct fuse_file_info *fi) -{ - int r = client->ll_setattr(ino, attr, to_set); - if (r == 0) - fuse_reply_attr(req, attr, 0); - else - fuse_reply_err(req, -r); -} - -static void ceph_ll_opendir(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) -{ - void *dirp; - int r = client->ll_opendir(ino, &dirp); - if (r >= 0) { - fi->fh = (long)dirp; - fuse_reply_open(req, fi); - } else { - fuse_reply_err(req, -r); - } -} - -static void ceph_ll_readlink(fuse_req_t req, fuse_ino_t ino) -{ - const char *value; - int r = client->ll_readlink(ino, &value); - if (r == 0) - fuse_reply_readlink(req, value); - else - fuse_reply_err(req, -r); -} - -static void ceph_ll_mknod(fuse_req_t req, fuse_ino_t parent, const char *name, - mode_t mode, dev_t rdev) -{ - struct fuse_entry_param fe; - memset(&fe, 0, sizeof(fe)); - - int r = client->ll_mknod(parent, name, mode, rdev, &fe.attr); - if (r == 0) { - fe.ino = fe.attr.st_ino; - 
fuse_reply_entry(req, &fe); - } else { - fuse_reply_err(req, -r); - } -} - -static void ceph_ll_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name, - mode_t mode) -{ - struct fuse_entry_param fe; - memset(&fe, 0, sizeof(fe)); - - int r = client->ll_mkdir(parent, name, mode, &fe.attr); - if (r == 0) { - fe.ino = fe.attr.st_ino; - fuse_reply_entry(req, &fe); - } else { - fuse_reply_err(req, -r); - } -} - -static void ceph_ll_unlink(fuse_req_t req, fuse_ino_t parent, const char *name) -{ - int r = client->ll_unlink(parent, name); - fuse_reply_err(req, -r); -} - -static void ceph_ll_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name) -{ - int r = client->ll_rmdir(parent, name); - fuse_reply_err(req, -r); -} - -static void ceph_ll_symlink(fuse_req_t req, const char *existing, fuse_ino_t parent, const char *name) -{ - struct fuse_entry_param fe; - memset(&fe, 0, sizeof(fe)); - - int r = client->ll_symlink(parent, name, existing, &fe.attr); - if (r == 0) { - fe.ino = fe.attr.st_ino; - fuse_reply_entry(req, &fe); - } else { - fuse_reply_err(req, -r); - } -} - -static void ceph_ll_rename(fuse_req_t req, fuse_ino_t parent, const char *name, - fuse_ino_t newparent, const char *newname) -{ - int r = client->ll_rename(parent, name, newparent, newname); - fuse_reply_err(req, -r); -} - -static void ceph_ll_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t newparent, - const char *newname) -{ - struct fuse_entry_param fe; - memset(&fe, 0, sizeof(fe)); - - int r = client->ll_link(ino, newparent, newname, &fe.attr); - if (r == 0) { - fe.ino = fe.attr.st_ino; - fuse_reply_entry(req, &fe); - } else { - fuse_reply_err(req, -r); - } -} - -static void ceph_ll_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) -{ - Fh *fh; - int r = client->ll_open(ino, fi->flags, &fh); - if (r == 0) { - fi->fh = (long)fh; - fuse_reply_open(req, fi); - } else { - fuse_reply_err(req, -r); - } -} - -static void ceph_ll_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, - 
struct fuse_file_info *fi) -{ - Fh *fh = (Fh*)fi->fh; - bufferlist bl; - int r = client->ll_read(fh, off, size, &bl); - if (r >= 0) - fuse_reply_buf(req, bl.c_str(), bl.length()); - else - fuse_reply_err(req, -r); -} - -static void ceph_ll_write(fuse_req_t req, fuse_ino_t ino, const char *buf, - size_t size, off_t off, struct fuse_file_info *fi) -{ - Fh *fh = (Fh*)fi->fh; - int r = client->ll_write(fh, off, size, buf); - if (r >= 0) - fuse_reply_write(req, r); - else - fuse_reply_err(req, -r); -} - -static void ceph_ll_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) -{ - // NOOP - fuse_reply_err(req, 0); -} - -static void ceph_ll_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) -{ - Fh *fh = (Fh*)fi->fh; - int r = client->ll_release(fh); - fuse_reply_err(req, -r); -} - -static void ceph_ll_fsync(fuse_req_t req, fuse_ino_t ino, int datasync, - struct fuse_file_info *fi) -{ - -} - -static void ceph_ll_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, - off_t off, struct fuse_file_info *fi) -{ - (void) fi; - - // buffer - char *buf; - size_t pos = 0; - - buf = new char[size]; - if (!buf) { - fuse_reply_err(req, ENOMEM); - return; - } - - DIR *dirp = (DIR*)fi->fh; - client->seekdir(dirp, off); - - struct dirent de; - struct stat st; - memset(&st, 0, sizeof(st)); - - while (1) { - int r = client->readdir_r(dirp, &de); - if (r < 0) break; - st.st_ino = de.d_ino; - st.st_mode = DT_TO_MODE(de.d_type); - - off_t off = client->telldir(dirp); - size_t entrysize = fuse_add_direntry(req, buf + pos, size - pos, - de.d_name, &st, off); - - /* - cout << "ceph_ll_readdir added " << de.d_name << " at " << pos << " len " << entrysize - << " (buffer size is " << size << ")" - << " .. off = " << off - << std::endl; - */ - - if (entrysize > size - pos) - break; // didn't fit, done for now. 
- pos += entrysize; - } - - fuse_reply_buf(req, buf, pos); - delete[] buf; -} - -static void ceph_ll_releasedir(fuse_req_t req, fuse_ino_t ino, - struct fuse_file_info *fi) -{ - DIR *dirp = (DIR*)fi->fh; - client->ll_releasedir(dirp); - fuse_reply_err(req, 0); -} - -static void ceph_ll_create(fuse_req_t req, fuse_ino_t parent, const char *name, - mode_t mode, struct fuse_file_info *fi) -{ - struct fuse_entry_param fe; - memset(&fe, 0, sizeof(fe)); - Fh *fh; - int r = client->ll_create(parent, name, mode, fi->flags, &fe.attr, &fh); - if (r == 0) { - fi->fh = (long)fh; - fe.ino = fe.attr.st_ino; - fuse_reply_create(req, &fe, fi); - } else { - fuse_reply_err(req, -r); - } -} - -static void ceph_ll_statfs(fuse_req_t req, fuse_ino_t ino) -{ - struct statvfs stbuf; - int r = client->ll_statfs(ino, &stbuf); - if (r == 0) - fuse_reply_statfs(req, &stbuf); - else - fuse_reply_err(req, -r); -} - -static struct fuse_lowlevel_ops ceph_ll_oper = { - init: 0, - destroy: 0, - lookup: ceph_ll_lookup, - forget: ceph_ll_forget, - getattr: ceph_ll_getattr, - setattr: ceph_ll_setattr, - readlink: ceph_ll_readlink, - mknod: ceph_ll_mknod, - mkdir: ceph_ll_mkdir, - unlink: ceph_ll_unlink, - rmdir: ceph_ll_rmdir, - symlink: ceph_ll_symlink, - rename: ceph_ll_rename, - link: ceph_ll_link, - open: ceph_ll_open, - read: ceph_ll_read, - write: ceph_ll_write, - flush: ceph_ll_flush, - release: ceph_ll_release, - fsync: ceph_ll_fsync, - opendir: ceph_ll_opendir, - readdir: ceph_ll_readdir, - releasedir: ceph_ll_releasedir, - fsyncdir: 0, - statfs: ceph_ll_statfs, - setxattr: 0, - getxattr: 0, - listxattr: 0, - removexattr: 0, - access: 0, - create: ceph_ll_create, - getlk: 0, - setlk: 0, - bmap: 0 -}; - -int ceph_fuse_ll_main(Client *c, int argc, char *argv[]) -{ - cout << "ceph_fuse_ll_main starting fuse on pid " << getpid() << std::endl; - - client = c; - - // set up fuse argc/argv - int newargc = 0; - char **newargv = (char **) malloc((argc + 10) * sizeof(char *)); - newargv[newargc++] = 
argv[0]; - newargv[newargc++] = "-f"; // stay in foreground - - newargv[newargc++] = "-o"; - newargv[newargc++] = "allow_other"; - - for (int argctr = 1; argctr < argc; argctr++) newargv[newargc++] = argv[argctr]; - - // go go gadget fuse - struct fuse_args args = FUSE_ARGS_INIT(newargc, newargv); - struct fuse_chan *ch; - char *mountpoint; - int err = -1; - - if (fuse_parse_cmdline(&args, &mountpoint, NULL, NULL) != -1 && - (ch = fuse_mount(mountpoint, &args)) != NULL) { - struct fuse_session *se; - - // init fuse - se = fuse_lowlevel_new(&args, &ceph_ll_oper, sizeof(ceph_ll_oper), - NULL); - if (se != NULL) { - if (fuse_set_signal_handlers(se) != -1) { - fuse_session_add_chan(se, ch); - err = fuse_session_loop(se); - fuse_remove_signal_handlers(se); - fuse_session_remove_chan(ch); - } - fuse_session_destroy(se); - } - fuse_unmount(mountpoint, ch); - } - fuse_opt_free_args(&args); - - cout << "ceph_fuse_ll_main done, err=" << err << std::endl; - return err ? 1 : 0; -} - diff --git a/branches/sage/mds/client/fuse_ll.h b/branches/sage/mds/client/fuse_ll.h deleted file mode 100644 index 068969c4f7487..0000000000000 --- a/branches/sage/mds/client/fuse_ll.h +++ /dev/null @@ -1,15 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -int ceph_fuse_ll_main(Client *c, int argc, char *argv[]); diff --git a/branches/sage/mds/client/hadoop/CephFSInterface.cc b/branches/sage/mds/client/hadoop/CephFSInterface.cc deleted file mode 100644 index 7aa8c133d370b..0000000000000 --- a/branches/sage/mds/client/hadoop/CephFSInterface.cc +++ /dev/null @@ -1,789 +0,0 @@ -#include "CephFSInterface.h" - -using namespace std; - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_initializeClient - * Signature: ()J - * Initializes a ceph client. - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1initializeClient - (JNIEnv *, jobject) -{ - - dout(3) << "CephFSInterface: Initializing Ceph client:" << endl; - - // parse args from CEPH_ARGS - vector args; - env_to_vec(args); - parse_config_options(args); - - if (g_conf.clock_tare) g_clock.tare(); - - // be safe - g_conf.use_abspaths = true; - - // load monmap - MonMap monmap; - // int r = monmap.read(".ceph_monmap"); - int r = monmap.read("/cse/grads/eestolan/ceph/trunk/ceph/.ceph_monmap"); - if (r < 0) { - dout(0) << "CephFSInterface: could not find .ceph_monmap" << endl; - assert(0 && "could not find .ceph_monmap"); - // return 0; - } - assert(r >= 0); - - // start up network - rank.start_rank(); - - // start client - Client *client; - client = new Client(rank.register_entity(MSG_ADDR_CLIENT_NEW), &monmap); - client->init(); - - // mount - client->mount(); - - jlong clientp = *(jlong*)&client; - return clientp; -} - - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_copyFromLocalFile - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1copyFromLocalFile -(JNIEnv * env, jobject obj, jlong clientp, jstring j_local_path, jstring j_ceph_path) { - - dout(10) << "CephFSInterface: In copyFromLocalFile" << endl; - Client* client; - //client = (Client*) clientp; - client = *(Client**)&clientp; - - const 
char* c_local_path = env->GetStringUTFChars(j_local_path, 0); - const char* c_ceph_path = env->GetStringUTFChars(j_ceph_path, 0); - - dout(10) << "CephFSInterface: Local source file is "<< c_local_path << " and Ceph destination file is " << c_ceph_path << endl; - struct stat st; - int r = ::stat(c_local_path, &st); - assert (r == 0); - - // open the files - int fh_local = ::open(c_local_path, O_RDONLY); - int fh_ceph = client->open(c_ceph_path, O_WRONLY|O_CREAT|O_TRUNC); - assert (fh_local > -1); - assert (fh_ceph > -1); - dout(10) << "CephFSInterface: local fd is " << fh_local << " and Ceph fd is " << fh_ceph << endl; - - // get the source file size - off_t remaining = st.st_size; - - // copy the file a MB at a time - const int chunk = 1048576; - bufferptr bp(chunk); - - while (remaining > 0) { - off_t got = ::read(fh_local, bp.c_str(), MIN(remaining,chunk)); - assert(got > 0); - remaining -= got; - off_t wrote = client->write(fh_ceph, bp.c_str(), got, -1); - assert (got == wrote); - } - client->close(fh_ceph); - ::close(fh_local); - - env->ReleaseStringUTFChars(j_local_path, c_local_path); - env->ReleaseStringUTFChars(j_ceph_path, c_ceph_path); - - return JNI_TRUE; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_copyToLocalFile - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1copyToLocalFile -(JNIEnv *env, jobject obj, jlong clientp, jstring j_ceph_path, jstring j_local_path) -{ - Client* client; - client = *(Client**)&clientp; - const char* c_ceph_path = env->GetStringUTFChars(j_ceph_path, 0); - const char* c_local_path = env->GetStringUTFChars(j_local_path, 0); - - dout(3) << "CephFSInterface: dout(3): In copyToLocalFile, copying from Ceph file " << c_ceph_path << - " to local file " << c_local_path << endl; - - cout << "CephFSInterface: cout: In copyToLocalFile, copying from Ceph file " << c_ceph_path << - " to local file " << c_local_path 
<< endl; - - - // get source file size - struct stat st; - //dout(10) << "Attempting lstat with file " << c_ceph_path << ":" << endl; - int r = client->lstat(c_ceph_path, &st); - assert (r == 0); - - dout(10) << "CephFSInterface: Opening Ceph source file for read: " << endl; - int fh_ceph = client->open(c_ceph_path, O_RDONLY); - assert (fh_ceph > -1); - - dout(10) << "CephFSInterface: Opened Ceph file! Opening local destination file: " << endl; - int fh_local = ::open(c_local_path, O_WRONLY|O_CREAT|O_TRUNC, 0644); - assert (fh_local > -1); - - // copy the file a chunk at a time - const int chunk = 1048576; - bufferptr bp(chunk); - - off_t remaining = st.st_size; - while (remaining > 0) { - off_t got = client->read(fh_ceph, bp.c_str(), MIN(remaining,chunk), -1); - assert(got > 0); - remaining -= got; - off_t wrote = ::write(fh_local, bp.c_str(), got); - assert (got == wrote); - } - client->close(fh_ceph); - ::close(fh_local); - - env->ReleaseStringUTFChars(j_local_path, c_local_path); - env->ReleaseStringUTFChars(j_ceph_path, c_ceph_path); - - return JNI_TRUE; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getcwd - * Signature: (J)Ljava/lang/String; - * Returns the current working directory. - */ -JNIEXPORT jstring JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getcwd - (JNIEnv *env, jobject obj, jlong clientp) -{ - dout(10) << "CephFSInterface: In getcwd" << endl; - - Client* client; - client = *(Client**)&clientp; - - return (env->NewStringUTF(client->getcwd().c_str())); -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_setcwd - * Signature: (JLjava/lang/String;)Z - * - * Changes the working directory. 
- */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1setcwd -(JNIEnv *env, jobject obj, jlong clientp, jstring j_path) -{ - dout(10) << "CephFSInterface: In setcwd" << endl; - - Client* client; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - return (0 <= client->chdir(c_path)) ? JNI_TRUE : JNI_FALSE; - env->ReleaseStringUTFChars(j_path, c_path); -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_rmdir - * Signature: (JLjava/lang/String;)Z - * Removes an empty directory. - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1rmdir - (JNIEnv *env, jobject, jlong clientp, jstring j_path) -{ - dout(10) << "CephFSInterface: In rmdir" << endl; - - Client* client; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - return (0 == client->rmdir(c_path)) ? JNI_TRUE : JNI_FALSE; - env->ReleaseStringUTFChars(j_path, c_path); -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_mkdir - * Signature: (JLjava/lang/String;)Z - * Creates a directory with full permissions. - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1mkdir - (JNIEnv * env, jobject, jlong clientp, jstring j_path) -{ - dout(10) << "CephFSInterface: In mkdir" << endl; - - Client* client; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - return (0 == client->mkdir(c_path, 0xFF)) ? JNI_TRUE : JNI_FALSE; - env->ReleaseStringUTFChars(j_path, c_path); -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_unlink - * Signature: (JLjava/lang/String;)Z - * Unlinks a path. 
- */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1unlink - (JNIEnv * env, jobject, jlong clientp, jstring j_path) -{ - Client* client; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - dout(10) << "CephFSInterface: In unlink for path " << c_path << ":" << endl; - - // is it a file or a directory? - struct stat stbuf; - int stat_result = client->lstat(c_path, &stbuf); - if (stat_result < 0) {// then the path doesn't even exist - dout(0) << "ceph_unlink: path " << c_path << " does not exist" << endl; - return false; - } - int result; - if (0 != S_ISDIR(stbuf.st_mode)) { // it's a directory - dout(10) << "ceph_unlink: path " << c_path << " is a directory. Calling client->rmdir()" << endl; - result = client->rmdir(c_path); - } - else if (0 != S_ISREG(stbuf.st_mode)) { // it's a file - dout(10) << "ceph_unlink: path " << c_path << " is a file. Calling client->unlink()" << endl; - result = client->unlink(c_path); - } - else { - dout(0) << "ceph_unlink: path " << c_path << " is not a file or a directory. Failing:" << endl; - result = -1; - } - - dout(10) << "In ceph_unlink for path " << c_path << - ": got result " - << result << ". Returning..."<< endl; - - env->ReleaseStringUTFChars(j_path, c_path); - return (0 == result) ? JNI_TRUE : JNI_FALSE; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_rename - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - * Renames a file. - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1rename - (JNIEnv *env, jobject, jlong clientp, jstring j_from, jstring j_to) -{ - dout(10) << "CephFSInterface: In rename" << endl; - - Client* client; - client = *(Client**)&clientp; - - const char* c_from = env->GetStringUTFChars(j_from, 0); - const char* c_to = env->GetStringUTFChars(j_to, 0); - - return (0 <= client->rename(c_from, c_to)) ? 
JNI_TRUE : JNI_FALSE; - env->ReleaseStringUTFChars(j_from, c_from); - env->ReleaseStringUTFChars(j_to, c_to); -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_exists - * Signature: (JLjava/lang/String;)Z - * Returns true if the path exists. - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1exists -(JNIEnv *env, jobject, jlong clientp, jstring j_path) -{ - - dout(10) << "CephFSInterface: In exists" << endl; - - Client* client; - struct stat stbuf; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - dout(10) << "Attempting lstat with file " << c_path << ":" ; - int result = client->lstat(c_path, &stbuf); - dout(10) << "result is " << result << endl; - env->ReleaseStringUTFChars(j_path, c_path); - if (result < 0) { - dout(10) << "Returning false (file does not exist)" << endl; - return JNI_FALSE; - } - else { - dout(10) << "Returning true (file exists)" << endl; - return JNI_TRUE; - } -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getblocksize - * Signature: (JLjava/lang/String;)J - * Returns the block size. Size is -1 if the file - * does not exist. 
- * TODO: see if Hadoop wants something more like stripe size - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getblocksize - (JNIEnv *env, jobject obj, jlong clientp, jstring j_path) -{ - dout(10) << "In getblocksize" << endl; - - Client* client; - //struct stat stbuf; - client = *(Client**)&clientp; - - jint result; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - - /* - if (0 > client->lstat(c_path, &stbuf)) - result = -1; - else - result = stbuf.st_blksize; - */ - - // we need to open the file to retrieve the stripe size - dout(10) << "CephFSInterface: getblocksize: opening file" << endl; - int fh = client->open(c_path, O_RDONLY); - if (fh < 0) - return -1; - - result = client->get_stripe_unit(fh); - - int close_result = client->close(fh); - assert (close_result > -1); - - - env->ReleaseStringUTFChars(j_path, c_path); - return result; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getfilesize - * Signature: (JLjava/lang/String;)J - * Returns the file size, or -1 on failure. 
- */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getfilesize - (JNIEnv *env, jobject, jlong clientp, jstring j_path) -{ - dout(10) << "In getfilesize" << endl; - - Client* client; - struct stat stbuf; - client = *(Client**)&clientp; - - jlong result; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - if (0 > client->lstat(c_path, &stbuf)) result = -1; - else result = stbuf.st_size; - env->ReleaseStringUTFChars(j_path, c_path); - - return result; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_isfile - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1isfile - (JNIEnv *env, jobject obj, jlong clientp, jstring j_path) -{ - dout(10) << "In isfile" << endl; - - Client* client; - struct stat stbuf; - client = *(Client**)&clientp; - - - const char* c_path = env->GetStringUTFChars(j_path, 0); - int result = client->lstat(c_path, &stbuf); - - env->ReleaseStringUTFChars(j_path, c_path); - - // if the stat call failed, it's definitely not a file... - if (0 > result) return JNI_FALSE; - - // check the stat result - return (0 == S_ISREG(stbuf.st_mode)) ? JNI_FALSE : JNI_TRUE; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_isdirectory - * Signature: (JLjava/lang/String;)Z - * Returns true if the path is a directory. - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1isdirectory - (JNIEnv *env, jobject, jlong clientp, jstring j_path) -{ - dout(10) << "In isdirectory" << endl; - - Client* client; - struct stat stbuf; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - int result = client->lstat(c_path, &stbuf); - env->ReleaseStringUTFChars(j_path, c_path); - - // if the stat call failed, it's definitely not a directory... - if (0 > result) return JNI_FALSE; - - // check the stat result - return (0 == S_ISDIR(stbuf.st_mode)) ? 
JNI_FALSE : JNI_TRUE; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getdir - * Signature: (JLjava/lang/String;)[Ljava/lang/String; - * Returns a Java array of Strings with the directory contents - */ -JNIEXPORT jobjectArray JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getdir -(JNIEnv *env, jobject obj, jlong clientp, jstring j_path) { - - dout(10) << "In getdir" << endl; - - Client* client; - client = *(Client**)&clientp; - - // get the directory listing - map contents; - const char* c_path = env->GetStringUTFChars(j_path, 0); - int result = client->getdir(c_path, contents); - env->ReleaseStringUTFChars(j_path, c_path); - - if (result < 0) return NULL; - - dout(10) << "checking for empty dir" << endl; - jint dir_size = contents.size(); - - // Hadoop freaks out if the listing contains "." or "..". Shrink - // the listing size by two, or by one if the directory is the root. - if(('/' == c_path[0]) && (0 == c_path[1])) - dir_size -= 1; - else - dir_size -= 2; - assert (dir_size >= 0); - - // Create a Java String array of the size of the directory listing - // jstring blankString = env->NewStringUTF(""); - jclass stringClass = env->FindClass("java/lang/String"); - if (NULL == stringClass) { - dout(0) << "ERROR: java String class not found; dying a horrible, painful death" << endl; - assert(0); - } - jobjectArray dirListingStringArray = (jobjectArray) env->NewObjectArray(dir_size, stringClass, NULL); - - // populate the array with the elements of the directory list, - // omitting . and .. - int i = 0; - string dot("."); - string dotdot (".."); - for (map::iterator it = contents.begin(); - it != contents.end(); - it++) { - // is it "."? - if (it->first == dot) continue; - if (it->first == dotdot) continue; - - if (0 == dir_size) - dout(0) << "CephFSInterface: WARNING: adding stuff to an empty array." 
<< endl; - assert (i < dir_size); - env->SetObjectArrayElement(dirListingStringArray, i, - env->NewStringUTF(it->first.c_str())); - ++i; - } - - return dirListingStringArray; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_open_for_read - * Signature: (JLjava/lang/String;)I - * Open a file for reading. - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1open_1for_1read - (JNIEnv *env, jobject obj, jlong clientp, jstring j_path) - -{ - dout(10) << "In open_for_read" << endl; - - Client* client; - client = *(Client**)&clientp; - - jint result; - - // open as read-only: flag = O_RDONLY - - const char* c_path = env->GetStringUTFChars(j_path, 0); - result = client->open(c_path, O_RDONLY); - env->ReleaseStringUTFChars(j_path, c_path); - - // returns file handle, or -1 on failure - return result; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_open_for_overwrite - * Signature: (JLjava/lang/String;)I - * Opens a file for overwriting; creates it if necessary. - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1open_1for_1overwrite - (JNIEnv *env, jobject obj, jlong clientp, jstring j_path) -{ - dout(10) << "In open_for_overwrite" << endl; - - Client* client; - client = *(Client**)&clientp; - - jint result; - - - const char* c_path = env->GetStringUTFChars(j_path, 0); - result = client->open(c_path, O_WRONLY|O_CREAT|O_TRUNC); - env->ReleaseStringUTFChars(j_path, c_path); - - // returns file handle, or -1 on failure - return result; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_kill_client - * Signature: (J)Z - * - * Closes the Ceph client. 
- */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1kill_1client - (JNIEnv *env, jobject obj, jlong clientp) -{ - Client* client; - client = *(Client**)&clientp; - - client->unmount(); - client->shutdown(); - delete client; - - // wait for messenger to finish - rank.wait(); - - return true; -} - - - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_read - * Signature: (JI[BII)I - * Reads into the given byte array from the current position. - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1read - (JNIEnv *env, jobject obj, jlong clientp, jint fh, jbyteArray j_buffer, jint buffer_offset, jint length) -{ - dout(10) << "In read" << endl; - - - // IMPORTANT NOTE: Hadoop read arguments are a bit different from POSIX so we - // have to convert. The read is *always* from the current position in the file, - // and buffer_offset is the location in the *buffer* where we start writing. - - - Client* client; - client = *(Client**)&clientp; - jint result; - - // Step 1: get a pointer to the buffer. - jbyte* j_buffer_ptr = env->GetByteArrayElements(j_buffer, NULL); - char* c_buffer = (char*) j_buffer_ptr; - - // Step 2: pointer arithmetic to start in the right buffer position - c_buffer += (int)buffer_offset; - - // Step 3: do the read - result = client->read((int)fh, c_buffer, length, -1); - - // Step 4: release the pointer to the buffer - env->ReleaseByteArrayElements(j_buffer, j_buffer_ptr, 0); - - return result; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_seek_from_start - * Signature: (JIJ)J - * Seeks to the given position. 
- */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1seek_1from_1start - (JNIEnv *env, jobject obj, jlong clientp, jint fh, jlong pos) -{ - dout(10) << "In CephInputStream::seek_from_start" << endl; - - Client* client; - client = *(Client**)&clientp; - jint result; - - result = client->lseek(fh, pos, SEEK_SET); - - return result; -} - -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1getpos - (JNIEnv *env, jobject obj, jlong clientp, jint fh) -{ - dout(10) << "In CephInputStream::ceph_getpos" << endl; - - Client* client; - client = *(Client**)&clientp; - jint result; - - // seek a distance of 0 to get current offset - result = client->lseek(fh, 0, SEEK_CUR); - - return result; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_close - * Signature: (JI)I - * Closes the file. - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1close - (JNIEnv *env, jobject obj, jlong clientp, jint fh) -{ - dout(10) << "In CephInputStream::ceph_close" << endl; - - Client* client; - client = *(Client**)&clientp; - jint result; - - result = client->close(fh); - - return result; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_seek_from_start - * Signature: (JIJ)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1seek_1from_1start - (JNIEnv *env, jobject obj, jlong clientp, jint fh, jlong pos) -{ - dout(10) << "In CephOutputStream::ceph_seek_from_start" << endl; - - Client* client; - client = *(Client**)&clientp; - jint result; - - result = client->lseek(fh, pos, SEEK_SET); - - return result; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_getpos - * Signature: (JI)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1getpos - (JNIEnv *env, jobject obj, jlong clientp, jint fh) -{ - dout(10) << "In CephOutputStream::ceph_getpos" << endl; - - 
Client* client; - client = *(Client**)&clientp; - jint result; - - // seek a distance of 0 to get current offset - result = client->lseek(fh, 0, SEEK_CUR); - - return result; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_close - * Signature: (JI)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1close - (JNIEnv *env, jobject obj, jlong clientp, jint fh) -{ - dout(10) << "In CephOutputStream::ceph_close" << endl; - - Client* client; - client = *(Client**)&clientp; - jint result; - - result = client->close(fh); - - return result; -} - - - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_write - * Signature: (JI[BII)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1write - (JNIEnv *env, jobject obj, jlong clientp, jint fh, jbyteArray j_buffer, jint buffer_offset, jint length) -{ - dout(10) << "In write" << endl; - - // IMPORTANT NOTE: Hadoop write arguments are a bit different from POSIX so we - // have to convert. The write is *always* from the current position in the file, - // and buffer_offset is the location in the *buffer* where we start writing. - - Client* client; - client = *(Client**)&clientp; - jint result; - - // Step 1: get a pointer to the buffer. 
- jbyte* j_buffer_ptr = env->GetByteArrayElements(j_buffer, NULL); - char* c_buffer = (char*) j_buffer_ptr; - - // Step 2: pointer arithmetic to start in the right buffer position - c_buffer += (int)buffer_offset; - - // Step 3: do the write - result = client->write((int)fh, c_buffer, length, -1); - - // Step 4: release the pointer to the buffer - env->ReleaseByteArrayElements(j_buffer, j_buffer_ptr, 0); - - return result; -} - diff --git a/branches/sage/mds/client/hadoop/CephFSInterface.h b/branches/sage/mds/client/hadoop/CephFSInterface.h deleted file mode 100644 index 549925aba6e64..0000000000000 --- a/branches/sage/mds/client/hadoop/CephFSInterface.h +++ /dev/null @@ -1,239 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* DO NOT EDIT THIS FILE - it is machine generated */ -#include -/* Header for class org_apache_hadoop_fs_ceph_CephFileSystem */ - -#include -#include "client/Client.h" -#include "config.h" -#include "client/fuse.h" -#include "msg/SimpleMessenger.h" -#include "common/Timer.h" - -#ifndef _Included_org_apache_hadoop_fs_ceph_CephFileSystem -#define _Included_org_apache_hadoop_fs_ceph_CephFileSystem -#ifdef __cplusplus -extern "C" { -#endif - -#undef org_apache_hadoop_fs_ceph_CephFileSystem_DEFAULT_BLOCK_SIZE -#define org_apache_hadoop_fs_ceph_CephFileSystem_DEFAULT_BLOCK_SIZE 1048576LL -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_initializeClient - * Signature: ()J - * Initializes a ceph client. 
- */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1initializeClient -(JNIEnv *, jobject); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_copyFromLocalFile - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1copyFromLocalFile - (JNIEnv *, jobject, jlong, jstring, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_copyToLocalFile - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1copyToLocalFile - (JNIEnv *, jobject, jlong, jstring, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getcwd - * Signature: (J)Ljava/lang/String; - */ -JNIEXPORT jstring JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getcwd - (JNIEnv *, jobject, jlong); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_setcwd - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1setcwd - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_rmdir - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1rmdir - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_mkdir - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1mkdir - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_unlink - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1unlink - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: 
org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_rename - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1rename - (JNIEnv *, jobject, jlong, jstring, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_exists - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1exists - (JNIEnv *, jobject, jlong, jstring); - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getblocksize - * Signature: (JLjava/lang/String;)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getblocksize - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getfilesize - * Signature: (JLjava/lang/String;)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getfilesize - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_isdirectory - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1isdirectory - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_isfile - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1isfile - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getdir - * Signature: (JLjava/lang/String;)[Ljava/lang/String; - */ -JNIEXPORT jobjectArray JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getdir - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_open_for_read - * Signature: (JLjava/lang/String;)I - */ -JNIEXPORT jint JNICALL 
Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1open_1for_1read - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_open_for_overwrite - * Signature: (JLjava/lang/String;)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1open_1for_1overwrite - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_kill_client - * Signature: (J)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1kill_1client - (JNIEnv *, jobject, jlong); - -#undef org_apache_hadoop_fs_ceph_CephInputStream_SKIP_BUFFER_SIZE -#define org_apache_hadoop_fs_ceph_CephInputStream_SKIP_BUFFER_SIZE 2048L -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_read - * Signature: (JI[BII)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1read - (JNIEnv *, jobject, jlong, jint, jbyteArray, jint, jint); - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_seek_from_start - * Signature: (JIJ)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1seek_1from_1start - (JNIEnv *, jobject, jlong, jint, jlong); - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_getpos - * Signature: (JI)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1getpos - (JNIEnv *, jobject, jlong, jint); - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_close - * Signature: (JI)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1close - (JNIEnv *, jobject, jlong, jint); - -/* Header for class org_apache_hadoop_fs_ceph_CephOutputStream */ - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_seek_from_start - * Signature: (JIJ)J - */ -JNIEXPORT jlong JNICALL 
Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1seek_1from_1start - (JNIEnv *, jobject, jlong, jint, jlong); - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_getpos - * Signature: (JI)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1getpos - (JNIEnv *, jobject, jlong, jint); - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_close - * Signature: (JI)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1close - (JNIEnv *, jobject, jlong, jint); - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_write - * Signature: (JI[BII)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1write - (JNIEnv *, jobject, jlong, jint, jbyteArray, jint, jint); - -#ifdef __cplusplus -} -#endif -#endif diff --git a/branches/sage/mds/client/ldceph.cc b/branches/sage/mds/client/ldceph.cc deleted file mode 100644 index b17133ee1e6f2..0000000000000 --- a/branches/sage/mds/client/ldceph.cc +++ /dev/null @@ -1,298 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include -using namespace std; - -// ceph stuff -#include "config.h" -#include "client/Client.h" -#include "msg/SimpleMessenger.h" - -// syscall fun -#include -#include -#include -//#include - -#define _FCNTL_H -#include - -#define CEPH_FD_OFF 50000 - - -/****** startup etc *******/ - -class LdCeph { -public: - // globals - bool started; - char *mount_point; - char *mount_point_parent; - int mount_point_len; - - Client *client; - - filepath fp_mount_point; - filepath cwd; - bool cwd_above_mp, cwd_in_mp; - - const char *get_ceph_path(const char *orig, char *buf) { - if (!started) return 0; - - // relative path? BUG: this won't catch "blah/../../asdf" - if (orig[0] && - orig[0] != '/' && - !(orig[0] == '.' && orig[1] == '.')) { - - if (cwd_in_mp) return orig; // inside mount point, definitely ceph - if (!cwd_above_mp) return 0; // not above mount point, definitely not ceph - - // relative, above mp. - filepath o = orig; - filepath p = cwd; - for (unsigned b = 0; b < o.depth(); b++) { - if (o[b] == "..") - p.pop_dentry(); - else - p.add_dentry(o[b]); - } - - // FIXME rewrite - if (strncmp(p.c_str(), mount_point, mount_point_len) == 0) { - if (p.c_str()[mount_point_len] == 0) - return "/"; - if (p.c_str()[mount_point_len] == '/') { - strcpy(buf, p.c_str() + mount_point_len); - return buf; - } - } - return 0; - } else { - // absolute - if (strncmp(orig, mount_point, mount_point_len) == 0) { - if (orig[mount_point_len] == 0) - return "/"; - if (orig[mount_point_len] == '/') - return orig + mount_point_len; - } - return 0; - } - } - - void refresh_cwd() { - char buf[255]; - syscall(SYS_getcwd, buf, 255); - cwd = buf; - - if (strncmp(buf, mount_point, mount_point_len) == 0 && - (buf[mount_point_len] == 0 || - buf[mount_point_len] == '/')) - cwd_in_mp = true; - else { - if (cwd.depth() > fp_mount_point.depth()) - cwd_above_mp = false; - else { - cwd_above_mp = true; - for (unsigned i=0; iget_myaddr() << endl; - - refresh_cwd(); - } - } - ~LdCeph() { - cout 
<< "ldceph fini" << endl; - if (false && client) { - client->unmount(); - client->shutdown(); - delete client; - client = 0; - tcpmessenger_wait(); - tcpmessenger_shutdown(); - } - } - -} ldceph; - - - -/****** original functions ****/ - - - -/****** captured functions ****/ - - -#define MYFD(f) ((fd) > CEPH_FD_OFF && ldceph.started) -#define TO_FD(fd) (fd > 0 ? fd+CEPH_FD_OFF:fd) -#define FROM_FD(fd) (fd - CEPH_FD_OFF) - -extern "C" { - - // open/close - //int open(const char *pathname, int flags) { - int open(const char *pathname, int flags, mode_t mode) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return TO_FD(ldceph.client->open(c, flags)); - else - return syscall(SYS_open, pathname, flags, mode); - } - - int creat(const char *pathname, mode_t mode) { - return open(pathname, O_CREAT|O_WRONLY|O_TRUNC, mode); - } - int close(int fd) { - if (MYFD(fd)) - return ldceph.client->close(FROM_FD(fd)); - else - return syscall(SYS_close, fd); - } - - - // read/write - ssize_t write(int fd, const void *buf, size_t count) { - if (MYFD(fd)) - return ldceph.client->write(FROM_FD(fd), (char*)buf, count); - else - return syscall(SYS_write, fd, buf, count); - } - - ssize_t read(int fd, void *buf, size_t count) { - if (MYFD(fd)) - return ldceph.client->read(FROM_FD(fd), (char*)buf, count); - else - return syscall(SYS_read, fd, buf, count); - } - - //int fsync(int fd); - //int fdatasync(int fd); - - - // namespace - int rmdir(const char *pathname) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return ldceph.client->rmdir(c); - else - return syscall(SYS_rmdir, pathname); - } - int mkdir(const char *pathname, mode_t mode) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return ldceph.client->mkdir(c, mode); - else - return syscall(SYS_mkdir, pathname, mode); - } - int unlink(const char *pathname) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return 
ldceph.client->unlink(c); - else - return syscall(SYS_unlink, pathname); - } - - int stat(const char *pathname, struct stat *st) { - //int __xstat64(int __ver, const char *pathname, struct stat64 *st64) { // stoopid GLIBC - //struct stat *st = (struct stat*)st64; - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return ldceph.client->lstat(c, st); // FIXME - else - return syscall(SYS_stat, pathname, st); - } - //int fstat(int filedes, struct stat *buf); - //int lstat(const char *file_name, struct stat *buf); - - int chdir(const char *pathname) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) { - int r = ldceph.client->chdir(c); - if (r == 0) { - if (!ldceph.cwd_in_mp) - syscall(SYS_chdir, ldceph.mount_point_parent); - ldceph.cwd_in_mp = true; - ldceph.cwd_above_mp = false; - ldceph.cwd = ldceph.mount_point; - filepath fpc = c; - ldceph.cwd.append(fpc); - } - return r; - } else { - int r = syscall(SYS_chdir, pathname); - if (r) { - ldceph.refresh_cwd(); - } - return r; - } - } - char *getcwd(char *buf, size_t size) { - strncpy(buf, ldceph.cwd.c_str(), size); - return buf; - } - //int fchdir(int fd); - - - - -} diff --git a/branches/sage/mds/cmds.cc b/branches/sage/mds/cmds.cc deleted file mode 100644 index 6e475ad4b588d..0000000000000 --- a/branches/sage/mds/cmds.cc +++ /dev/null @@ -1,108 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include -#include - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" -#include "mds/MDS.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << std::endl; - exit(1); - } -}; - -class C_Debug : public Context { - public: - void finish(int) { - int size = &g_conf.debug_after - &g_conf.debug; - memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size); - generic_dout(0) << "debug_after flipping debug settings" << dendl; - } -}; - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - parse_config_options(args); - - if (g_conf.kill_after) - g_timer.add_event_after(g_conf.kill_after, new C_Die); - if (g_conf.debug_after) - g_timer.add_event_after(g_conf.debug_after, new C_Debug); - - // mds specific args - int whoami = -1; - bool standby = false; // by default, i'll start active. - for (unsigned i=0; i= 0); - - // start up network - rank.start_rank(); - - // start mds - Messenger *m = rank.register_entity(entity_name_t::MDS(whoami)); - assert(m); - - MDS *mds = new MDS(whoami, m, &monmap); - mds->init(standby); - - // wait - rank.wait(); - - // yuck: grab the mds lock, so we can be sure that whoever in *mds - // called shutdown finishes what they were doing. 
- mds->mds_lock.Lock(); - mds->mds_lock.Unlock(); - - // done - //delete mds; - - return 0; -} - diff --git a/branches/sage/mds/cmon.cc b/branches/sage/mds/cmon.cc deleted file mode 100644 index f9ada45f7ef99..0000000000000 --- a/branches/sage/mds/cmon.cc +++ /dev/null @@ -1,129 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include -#include -#include - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" -#include "mon/Monitor.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << std::endl; - exit(1); - } -}; - -class C_Debug : public Context { - public: - void finish(int) { - int size = &g_conf.debug_after - &g_conf.debug; - memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size); - generic_dout(0) << "debug_after flipping debug settings" << dendl; - } -}; - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - parse_config_options(args); - - if (g_conf.kill_after) - g_timer.add_event_after(g_conf.kill_after, new C_Die); - if (g_conf.debug_after) - g_timer.add_event_after(g_conf.debug_after, new C_Debug); - - // args - int whoami = -1; - char *monmap_fn = ".ceph_monmap"; - for (unsigned i=0; i= 0); - } else { - // i am specific monitor. 
- - // read monmap - cout << "reading monmap from .ceph_monmap" << std::endl; - int r = monmap.read(monmap_fn); - assert(r >= 0); - - // bind to a specific port - cout << "starting mon" << whoami << " at " << monmap.get_inst(whoami) << std::endl; - g_my_addr = monmap.get_inst(whoami).addr; - rank.start_rank(); - } - - // start monitor - Messenger *m = rank.register_entity(entity_name_t::MON(whoami)); - Monitor *mon = new Monitor(whoami, m, &monmap); - mon->init(); - - // wait - cout << "waiting for shutdown ..." << std::endl; - rank.wait(); - - // done - delete mon; - - return 0; -} - diff --git a/branches/sage/mds/cmonctl.cc b/branches/sage/mds/cmonctl.cc deleted file mode 100644 index 85f4e1dc49392..0000000000000 --- a/branches/sage/mds/cmonctl.cc +++ /dev/null @@ -1,92 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" -#include "msg/SimpleMessenger.h" -#include "messages/MMonCommand.h" -#include "messages/MMonCommandAck.h" - -#include "common/Timer.h" - -#ifndef DARWIN -#include -#endif // DARWIN - -#include -#include -#include - - -Messenger *messenger = 0; - -class Admin : public Dispatcher { - void dispatch(Message *m) { - switch (m->get_type()) { - case MSG_MON_COMMAND_ACK: - generic_dout(0) << m->get_source() << " -> '" - << ((MMonCommandAck*)m)->rs << "' (" << ((MMonCommandAck*)m)->r << ")" - << dendl; - messenger->shutdown(); - break; - } - } -} dispatcher; - -int main(int argc, char **argv, char *envp[]) { - - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - // args for fuse - vec_to_argv(args, argc, argv); - - // load monmap - MonMap monmap; - int r = monmap.read(".ceph_monmap"); - assert(r >= 0); - - // start up network - rank.start_rank(); - messenger = rank.register_entity(entity_name_t::ADMIN()); - messenger->set_dispatcher(&dispatcher); - - // build command - MMonCommand *m = new MMonCommand(messenger->get_myinst()); - string cmd; - for (unsigned i=0; icmd.push_back(string(args[i])); - } - int mon = monmap.pick_mon(); - - generic_dout(0) << "mon" << mon << " <- '" << cmd << "'" << dendl; - - // send it - messenger->send_message(m, monmap.get_inst(mon)); - - // wait for messenger to finish - rank.wait(); - - return 0; -} - diff --git a/branches/sage/mds/common/Clock.cc b/branches/sage/mds/common/Clock.cc deleted file mode 100644 index 8b07f6d9eb15f..0000000000000 --- a/branches/sage/mds/common/Clock.cc +++ /dev/null @@ -1,20 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU 
Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "Clock.h" - -// public -Clock g_clock; - diff --git a/branches/sage/mds/common/Clock.h b/branches/sage/mds/common/Clock.h deleted file mode 100644 index 1ea7227adebd4..0000000000000 --- a/branches/sage/mds/common/Clock.h +++ /dev/null @@ -1,104 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __CLOCK_H -#define __CLOCK_H - -#include -#include - -#include -#include - -#include "Mutex.h" - -#include "include/utime.h" - - - -// -- clock -- -class Clock { - protected: - //utime_t start_offset; - //utime_t abs_last; - utime_t last; - utime_t zero; - - Mutex lock; - - public: - Clock() { - // set offset - //tare(); - } - - // real time. - utime_t real_now() { - utime_t realnow = now(); - realnow += zero; - //gettimeofday(&realnow.timeval(), NULL); - return realnow; - } - - // relative time (from startup) - void tare() { - gettimeofday(&zero.timeval(), NULL); - } - void tare(utime_t z) { - zero = z; - } - utime_t now() { - //lock.Lock(); - utime_t n; - gettimeofday(&n.timeval(), NULL); - n -= zero; - if (n < last) { - //std::cerr << "WARNING: clock jumped backwards from " << last << " to " << n << std::endl; - n = last; // clock jumped backwards! 
- } else - last = n; - //lock.Unlock(); - return n; - } - utime_t recent_now() { - return last; - } - - void realify(utime_t& t) { - t += zero; - } - - void make_timespec(utime_t& t, struct timespec *ts) { - utime_t real = t; - realify(real); - - memset(ts, 0, sizeof(*ts)); - ts->tv_sec = real.sec(); - ts->tv_nsec = real.nsec(); - } - - - - // absolute time - time_t gettime() { - return real_now().sec(); - } - -}; - -extern Clock g_clock; - -#endif diff --git a/branches/sage/mds/common/Cond.h b/branches/sage/mds/common/Cond.h deleted file mode 100644 index 4cb3d721b423f..0000000000000 --- a/branches/sage/mds/common/Cond.h +++ /dev/null @@ -1,119 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __COND_H -#define __COND_H - -#include - -#include "Mutex.h" -#include "Clock.h" - -#include "include/Context.h" - -#include -#include - -class Cond { - // my bits - pthread_cond_t _c; - - // don't allow copying. 
- void operator=(Cond &C) {} - Cond( const Cond &C ) {} - - public: - Cond() { - int r = pthread_cond_init(&_c,NULL); - assert(r == 0); - } - virtual ~Cond() { - pthread_cond_destroy(&_c); - } - - int Wait(Mutex &mutex) { - int r = pthread_cond_wait(&_c, &mutex._m); - return r; - } - - int Wait(Mutex &mutex, char* s) { - //cout << "Wait: " << s << endl; - int r = pthread_cond_wait(&_c, &mutex._m); - return r; - } - - int WaitUntil(Mutex &mutex, utime_t when) { - struct timespec ts; - g_clock.make_timespec(when, &ts); - //cout << "timedwait for " << ts.tv_sec << " sec " << ts.tv_nsec << " nsec" << endl; - int r = pthread_cond_timedwait(&_c, &mutex._m, &ts); - return r; - } - int WaitInterval(Mutex &mutex, utime_t interval) { - utime_t when = g_clock.now(); - when += interval; - return WaitUntil(mutex, when); - } - - int Signal() { - //int r = pthread_cond_signal(&_c); - int r = pthread_cond_broadcast(&_c); - return r; - } - int SignalOne() { - int r = pthread_cond_signal(&_c); - return r; - } - int SignalAll() { - //int r = pthread_cond_signal(&_c); - int r = pthread_cond_broadcast(&_c); - return r; - } -}; - -class C_Cond : public Context { - Cond *cond; - bool *done; - int *rval; -public: - C_Cond(Cond *c, bool *d, int *r=0) : cond(c), done(d), rval(r) { - *done = false; - } - void finish(int r) { - if (rval) *rval = r; - *done = true; - cond->Signal(); - } -}; - -class C_SafeCond : public Context { - Mutex *lock; - Cond *cond; - bool *done; - int *rval; -public: - C_SafeCond(Mutex *l, Cond *c, bool *d, int *r=0) : lock(l), cond(c), done(d), rval(r) { - *done = false; - } - void finish(int r) { - lock->Lock(); - if (rval) *rval = r; - *done = true; - cond->Signal(); - lock->Unlock(); - } -}; - -#endif diff --git a/branches/sage/mds/common/DecayCounter.h b/branches/sage/mds/common/DecayCounter.h deleted file mode 100644 index f431fb2073cd7..0000000000000 --- a/branches/sage/mds/common/DecayCounter.h +++ /dev/null @@ -1,138 +0,0 @@ -// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __DECAYCOUNTER_H -#define __DECAYCOUNTER_H - -#include -#include "Clock.h" - -#include "config.h" - -/** - * - * TODO: normalize value based on some fucntion of half_life, - * so that it can be interpreted as an approximation of a - * moving average of N seconds. currently, changing half-life - * skews the scale of the value, even at steady state. - * - */ - -class DecayCounter { - protected: -public: - double half_life; - double k; // k = ln(.5)/half_life - double val; // value - double delta; // delta since last decay - double vel; // recent velocity - utime_t last_decay; // time of last decay - - public: - DecayCounter() : val(0), delta(0), vel(0) { - set_halflife( g_conf.mds_decay_halflife ); - reset(); - } - DecayCounter(double hl) : val(0), delta(0), vel(0) { - set_halflife( hl ); - reset(); - } - - /** - * reading - */ - - double get() { - return get(g_clock.now()); - } - - double get(utime_t now) { - decay(now); - return val; - } - - double get_last() { - return val; - } - - double get_last_vel() { - return vel; - } - - utime_t get_last_decay() { - return last_decay; - } - - /** - * adjusting - */ - - double hit(utime_t now, double v = 1.0) { - decay(now); - delta += v; - return val+delta; - } - - void adjust(double a) { - val += a; - } - void adjust(utime_t now, double a) { - decay(now); - val += a; - } - void scale(double f) { - val *= f; - delta *= f; - vel *= f; - } - - /** - * decay etc. 
- */ - - void set_halflife(double hl) { - half_life = hl; - k = log(.5) / hl; - } - - void reset() { - reset(g_clock.now()); - } - void reset(utime_t now) { - last_decay = g_clock.now(); - val = delta = 0; - } - - void decay(utime_t now) { - utime_t el = now; - el -= last_decay; - - if (el.sec() >= 1) { - // calculate new value - double newval = (val+delta) * exp((double)el * k); - if (newval < .01) newval = 0.0; - - // calculate velocity approx - vel += (newval - val) * (double)el; - vel *= exp((double)el * k); - - val = newval; - delta = 0; - last_decay = now; - } - } -}; - - -#endif diff --git a/branches/sage/mds/common/LogType.h b/branches/sage/mds/common/LogType.h deleted file mode 100644 index a0889545acb6a..0000000000000 --- a/branches/sage/mds/common/LogType.h +++ /dev/null @@ -1,122 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __LOGTYPE_H -#define __LOGTYPE_H - -#include "include/types.h" - -#include -#include -using std::string; -using std::ofstream; - -#include -#include -using __gnu_cxx::hash_map; -using __gnu_cxx::hash_set; - -#include "Mutex.h" - - -class LogType { - protected: - hash_map keymap; - vector keys; - set inc_keys; - vector avg; - - int version; - - // HACK to avoid the hash table as often as possible... 
- // cache recent key name lookups in a small ring buffer - const static int cache_keys = 10; - intptr_t kc_ptr[cache_keys]; - int kc_val[cache_keys]; - int kc_pos; - - friend class Logger; - - public: - LogType() { - version = 1; - - for (int i=0;i= 0) return i; - - i = keys.size(); - keys.push_back(key); - avg.push_back(false); - - intptr_t p = (intptr_t)key; - keymap[p] = i; - if (is_inc) inc_keys.insert(i); - - version++; - return i; - } - int add_inc(const char* key) { - return add_key(key, true); - } - int add_set(const char *key) { - return add_key(key, false); - } - int add_avg(const char *key) { - int i = add_key(key, true); - avg[i] = true; - return i; - } - - bool have_key(const char* key) { - return lookup_key(key) < 0; - } - - int lookup_key(const char* key) { - intptr_t p = (intptr_t)key; - - if (keymap.count(p)) - return keymap[p]; - - // try kc ringbuffer - int pos = kc_pos-1; - for (int j=0; j - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include - -#include "LogType.h" -#include "Logger.h" - -#include -#include "Clock.h" - -#include "config.h" - -#include -#include - -#include "common/Timer.h" - -// per-process lock. lame, but this way I protect LogType too! 
-Mutex logger_lock; -SafeTimer logger_timer(logger_lock); -Context *logger_event = 0; -list logger_list; -utime_t start; -int last_flush; // in seconds since start - -static void flush_all_loggers(); - -class C_FlushLoggers : public Context { -public: - void finish(int r) { - if (logger_event == this) { - logger_event = 0; - flush_all_loggers(); - } - } -}; - -void Logger::set_start(utime_t s) -{ - logger_lock.Lock(); - - start = s; - - utime_t fromstart = g_clock.now(); - if (fromstart < start) { - cerr << "set_start: logger time jumped backwards from " << start << " to " << fromstart << std::endl; - fromstart = start; - } - fromstart -= start; - last_flush = fromstart.sec(); - - logger_lock.Unlock(); -} - -static void flush_all_loggers() -{ - generic_dout(20) << "flush_all_loggers" << dendl; - - utime_t now = g_clock.now(); - utime_t fromstart = now; - if (fromstart < start) { - cerr << "logger time jumped backwards from " << start << " to " << fromstart << std::endl; - //assert(0); - start = fromstart; - } - fromstart -= start; - int now_sec = fromstart.sec(); - - // do any catching up we need to - while (now_sec - last_flush >= g_conf.log_interval) { - generic_dout(20) << "fromstart " << fromstart << " last_flush " << last_flush << " flushign" << dendl; - for (list::iterator p = logger_list.begin(); - p != logger_list.end(); - ++p) - (*p)->_flush(); - last_flush += g_conf.log_interval; - } - - // schedule next flush event - utime_t next; - next.sec_ref() = start.sec() + last_flush + g_conf.log_interval; - next.usec_ref() = start.usec(); - generic_dout(20) << "logger now=" << now - << " start=" << start - << " next=" << next - << dendl; - logger_event = new C_FlushLoggers; - logger_timer.add_event_at(next, logger_event); -} - - - -// --------- - -Logger::Logger(string fn, LogType *type, bool append) -{ - logger_lock.Lock(); - { - filename = ""; - if (g_conf.use_abspaths) { - char *cwd = get_current_dir_name(); - filename = cwd; - free(cwd); - filename += "/"; - 
} - - filename = "log/"; - if (g_conf.log_name) { - filename += g_conf.log_name; - ::mkdir( filename.c_str(), 0755 ); // make sure dir exists - filename += "/"; - } - filename += fn; - - if (append) - out.open(filename.c_str(), ofstream::out|ofstream::app); - else - out.open(filename.c_str(), ofstream::out); - - this->type = type; - wrote_header = -1; - wrote_header_last = 0; - - version = 0; - - if (logger_list.empty()) { - // init logger - if (!g_conf.clock_tare) - start = g_clock.now(); // time 0! otherwise g_clock does it for us. - - last_flush = 0; - - // call manually the first time; then it'll schedule itself. - flush_all_loggers(); - } - logger_list.push_back(this); - } - logger_lock.Unlock(); -} - -Logger::~Logger() -{ - logger_lock.Lock(); - { - _flush(); - out.close(); - logger_list.remove(this); // slow, but rare. - if (logger_list.empty()) - logger_event = 0; // stop the timer events. - } - logger_lock.Unlock(); -} - - -/* -void Logger::flush() -{ - logger_lock.Lock(); - _flush(); - logger_lock.Unlock(); -} -*/ - -void Logger::_flush() -{ - // header? 
- wrote_header_last++; - if (wrote_header != type->version || - wrote_header_last > 10) { - out << "#" << type->keymap.size(); - for (unsigned i=0; ikeys.size(); i++) { - out << "\t" << type->keys[i]; - if (type->avg[i]) - out << "\t" << type->keys[i] << "*\t" << type->keys[i] << "~"; - } - out << std::endl; //out << "\t (" << type->keymap.size() << ")" << endl; - wrote_header = type->version; - wrote_header_last = 0; - } - - maybe_resize(type->keys.size()); - - // write line to log - out << last_flush; - for (unsigned i=0; ikeys.size(); i++) { - if (type->avg[i]) { - if (vals[i] > 0) { - double avg = (fvals[i] / (double)vals[i]); - double var = 0.0; - if (g_conf.logger_calc_variance) { - int n = vals[i]; - for (vector::iterator p = vals_to_avg[i].begin(); n--; ++p) - var += (avg - *p) * (avg - *p); - } - out << "\t" << avg << "\t" << vals[i] << "\t" << var; - } else - out << "\t0\t0\t0"; - } else { - if (fvals[i] > 0 && vals[i] == 0) - out << "\t" << fvals[i]; - else { - //cout << this << " p " << i << " and size is " << vals.size() << std::endl; - out << "\t" << vals[i]; - } - } - } - out << std::endl; - - // reset the counters - for (unsigned i=0; ikeys.size(); i++) { - if (type->inc_keys.count(i)) { - this->vals[i] = 0; - this->fvals[i] = 0; - } - } -} - - - -long Logger::inc(const char *key, long v) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - if (i < 0) i = type->add_inc(key); - maybe_resize(i+1); - - vals[i] += v; - long r = vals[i]; - logger_lock.Unlock(); - return r; -} - -double Logger::finc(const char *key, double v) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - if (i < 0) i = type->add_inc(key); - maybe_resize(i+1); - - fvals[i] += v; - double r = fvals[i]; - logger_lock.Unlock(); - return r; -} - -long Logger::set(const char *key, long v) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - if (i < 0) i = type->add_set(key); - 
maybe_resize(i+1); - - //cout << this << " set " << i << " to " << v << std::endl; - long r = vals[i] = v; - logger_lock.Unlock(); - return r; -} - - -double Logger::fset(const char *key, double v) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - if (i < 0) i = type->add_set(key); - maybe_resize(i+1); - - //cout << this << " fset " << i << " to " << v << std::endl; - double r = fvals[i] = v; - logger_lock.Unlock(); - return r; -} - -double Logger::favg(const char *key, double v) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - if (i < 0) i = type->add_avg(key); - maybe_resize(i+1); - - vals[i]++; - double r = fvals[i] = v; - if (g_conf.logger_calc_variance) - vals_to_avg[i].push_back(v); - logger_lock.Unlock(); - return r; -} - -long Logger::get(const char* key) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - maybe_resize(i+1); - - long r = 0; - if (i >= 0 && i < (int)vals.size()) - r = vals[i]; - logger_lock.Unlock(); - return r; -} - diff --git a/branches/sage/mds/common/Logger.h b/branches/sage/mds/common/Logger.h deleted file mode 100644 index 70fc1fa978024..0000000000000 --- a/branches/sage/mds/common/Logger.h +++ /dev/null @@ -1,77 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __LOGGER_H -#define __LOGGER_H - -#include "include/types.h" -#include "Clock.h" - -#include -#include -#include -using std::vector; -using std::string; -using std::ofstream; - -#include "LogType.h" - - -class Logger { - protected: - // values for this instance - vector vals; - vector fvals; - vector< vector > vals_to_avg; - - void maybe_resize(unsigned s) { - while (s >= vals.size()) { - vals.push_back(0); - fvals.push_back(0.0); - vals_to_avg.push_back(vector()); - } - } - - // my type - LogType *type; - int version; - - string filename; - ofstream out; - - // what i've written - //int last_logged; - int wrote_header; - int wrote_header_last; - - public: - Logger(string fn, LogType *type, bool append=false); - ~Logger(); - - long inc(const char *s, long v = 1); - long set(const char *s, long v); - long get(const char *s); - - double fset(const char *s, double v); - double finc(const char *s, double v); - double favg(const char *s, double v); - - //void flush(); - void _flush(); - - void set_start(utime_t s); -}; - -#endif diff --git a/branches/sage/mds/common/Mutex.h b/branches/sage/mds/common/Mutex.h deleted file mode 100755 index 724c4dbed2a76..0000000000000 --- a/branches/sage/mds/common/Mutex.h +++ /dev/null @@ -1,83 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MUTEX_H -#define __MUTEX_H - -#include -#include - -class Mutex { -private: - pthread_mutex_t _m; - int nlock; - bool recursive; - - // don't allow copying. 
- void operator=(Mutex &M) {} - Mutex( const Mutex &M ) {} - -public: - Mutex(bool r = true) : nlock(0), recursive(r) { - if (recursive) { - pthread_mutexattr_t attr; - pthread_mutexattr_init(&attr); - pthread_mutexattr_settype(&attr,PTHREAD_MUTEX_RECURSIVE); - pthread_mutex_init(&_m,&attr); - pthread_mutexattr_destroy(&attr); - } else { - pthread_mutex_init(&_m,NULL); - } - } - virtual ~Mutex() { - assert(nlock == 0); - pthread_mutex_destroy(&_m); - } - - bool is_locked() { - return (nlock > 0); - } - - void Lock() { - int r = pthread_mutex_lock(&_m); - assert(r == 0); - nlock++; - assert(nlock == 1 || recursive); - } - - void Unlock() { - assert(nlock > 0); - --nlock; - int r = pthread_mutex_unlock(&_m); - assert(r == 0); - } - - friend class Cond; - - -public: - class Locker { - Mutex &mutex; - - public: - Locker(Mutex& m) : mutex(m) { - mutex.Lock(); - } - ~Locker() { - mutex.Unlock(); - } - }; -}; - -#endif diff --git a/branches/sage/mds/common/RWLock.h b/branches/sage/mds/common/RWLock.h deleted file mode 100644 index 14e158a64ab97..0000000000000 --- a/branches/sage/mds/common/RWLock.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef _RWLock_Posix_ -#define _RWLock_Posix_ - -#include - -class RWLock -{ - mutable pthread_rwlock_t L; - - public: - - RWLock() { - pthread_rwlock_init(&L, NULL); - } - - virtual ~RWLock() { - pthread_rwlock_unlock(&L); - pthread_rwlock_destroy(&L); - } - - void unlock() { - pthread_rwlock_unlock(&L); - } - void get_read() { - pthread_rwlock_rdlock(&L); - } - void put_read() { unlock(); } - void get_write() { - pthread_rwlock_wrlock(&L); - } - void put_write() { unlock(); } -}; - -#endif // !_Mutex_Posix_ diff --git a/branches/sage/mds/common/Semaphore.h b/branches/sage/mds/common/Semaphore.h deleted file mode 100644 index bc0a9e60d7ffa..0000000000000 --- a/branches/sage/mds/common/Semaphore.h +++ /dev/null @@ -1,53 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef _Sem_Posix_ -#define _Sem_Posix_ - -#include - -class Semaphore -{ - Mutex m; - Cond c; - int count; - - public: - - Semaphore() - { - count = 0; - } - - void Put() - { - m.Lock(); - count++; - c.Signal(); - m.Unlock(); - } - - void Get() - { - m.Lock(); - while(count <= 0) { - c.Wait(m); - } - count--; - m.Unlock(); - } -}; - -#endif // !_Mutex_Posix_ diff --git a/branches/sage/mds/common/Thread.h b/branches/sage/mds/common/Thread.h deleted file mode 100644 index 06e20047da57f..0000000000000 --- a/branches/sage/mds/common/Thread.h +++ /dev/null @@ -1,81 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __THREAD_H -#define __THREAD_H - -#include -#include -#include - -class Thread { - private: - pthread_t thread_id; - - public: - Thread() : thread_id(0) {} - virtual ~Thread() {} - - protected: - virtual void *entry() = 0; - - private: - static void *_entry_func(void *arg) { - return ((Thread*)arg)->entry(); - } - - public: - pthread_t &get_thread_id() { return thread_id; } - bool is_started() { return thread_id != 0; } - bool am_self() { return (pthread_self() == thread_id); } - - int kill(int signal) { - return pthread_kill(thread_id, signal); - } - int create() { - return pthread_create( &thread_id, NULL, _entry_func, (void*)this ); - } - int join(void **prval = 0) { - if (thread_id == 0) { - generic_derr(0) << "WARNING: join on thread that was never started" << dendl; - //assert(0); - return -EINVAL; // never started. 
- } - - int status = pthread_join(thread_id, prval); - if (status != 0) { - switch (status) { - case -EINVAL: - generic_derr(0) << "thread " << thread_id << " join status = EINVAL" << dendl; - break; - case -ESRCH: - generic_derr(0) << "thread " << thread_id << " join status = ESRCH" << dendl; - assert(0); - break; - case -EDEADLK: - generic_derr(0) << "thread " << thread_id << " join status = EDEADLK" << dendl; - break; - default: - generic_derr(0) << "thread " << thread_id << " join status = " << status << dendl; - } - assert(0); // none of these should happen. - } - thread_id = 0; - return status; - } - -}; - -#endif diff --git a/branches/sage/mds/common/ThreadPool.h b/branches/sage/mds/common/ThreadPool.h deleted file mode 100644 index 62855a240cd0c..0000000000000 --- a/branches/sage/mds/common/ThreadPool.h +++ /dev/null @@ -1,139 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef THREADPOOL -#define THREADPOOL - -#include -using std::list; - - -#include -#include -#include -#include - - -// debug output -#include "config.h" -#define tpdout(x) if (x <= g_conf.debug) *_dout << myname -#define DBLVL 15 - - -using namespace std; - -#define MAX_THREADS 1000 - -template -class ThreadPool { - - private: - list q; - Mutex q_lock; - Semaphore q_sem; - - int num_ops; - int num_threads; - vector thread; - - U u; - void (*func)(U,T); - void (*prefunc)(U,T); - string myname; - - static void *foo(void *arg) - { - ThreadPool *t = (ThreadPool *)arg; - t->do_ops(arg); - return 0; - } - - void *do_ops(void *nothing) - { - tpdout(DBLVL) << ".do_ops thread " << pthread_self() << " starting" << std::endl; - while (1) { - q_sem.Get(); - if (q.empty()) break; - - T op = get_op(); - tpdout(DBLVL) << ".func thread "<< pthread_self() << " on " << op << std::endl; - func(u, op); - } - tpdout(DBLVL) << ".do_ops thread " << pthread_self() << " exiting" << std::endl; - return 0; - } - - - T get_op() - { - T op; - q_lock.Lock(); - { - op = q.front(); - q.pop_front(); - num_ops--; - - if (prefunc && op) { - tpdout(DBLVL) << ".prefunc thread "<< pthread_self() << " on " << op << std::endl; - prefunc(u, op); - } - } - q_lock.Unlock(); - - return op; - } - - public: - - ThreadPool(char *myname, int howmany, void (*f)(U,T), U obj, void (*pf)(U,T) = 0) : - num_ops(0), num_threads(howmany), - thread(num_threads), - u(obj), - func(f), prefunc(pf), - myname(myname) { - tpdout(DBLVL) << ".cons num_threads " << num_threads << std::endl; - - // start threads - int status; - for(int i = 0; i < howmany; i++) { - status = pthread_create(&thread[i], NULL, (void*(*)(void *))&ThreadPool::foo, this); - assert(status == 0); - } - } - - ~ThreadPool() { - // bump sem to make threads exit cleanly - for(int i = 0; i < num_threads; i++) - q_sem.Put(); - - // wait for them to die - for(int i = 0; i < num_threads; i++) { - tpdout(DBLVL) << ".des joining thread " << thread[i] 
<< std::endl; - void *rval = 0; // we don't actually care - pthread_join(thread[i], &rval); - } - } - - void put_op(T op) { - tpdout(DBLVL) << ".put_op " << op << std::endl; - q_lock.Lock(); - q.push_back(op); - num_ops++; - q_sem.Put(); - q_lock.Unlock(); - } - -}; -#endif diff --git a/branches/sage/mds/common/Timer.cc b/branches/sage/mds/common/Timer.cc deleted file mode 100644 index 1705bc759ac9f..0000000000000 --- a/branches/sage/mds/common/Timer.cc +++ /dev/null @@ -1,335 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - - -#include "Timer.h" -#include "Cond.h" - -#include "config.h" -#include "include/Context.h" - -#define dout(x) if (x <= g_conf.debug_timer) *_dout << dbeginl << g_clock.now() << " TIMER " -#define derr(x) if (x <= g_conf.debug_timer) *_derr << dbeginl << g_clock.now() << " TIMER " - -#define DBL 10 - -#include -#include -#include - -// single global instance -Timer g_timer; - - - -/**** thread solution *****/ - -bool Timer::get_next_due(utime_t& when) -{ - if (scheduled.empty()) { - dout(10) << "get_next_due - nothing scheduled" << dendl; - return false; - } else { - map< utime_t, set >::iterator it = scheduled.begin(); - when = it->first; - dout(10) << "get_next_due - " << when << dendl; - return true; - } -} - - -void Timer::timer_entry() -{ - lock.Lock(); - - while (!thread_stop) { - - // now - utime_t now = g_clock.now(); - - // any events due? 
- utime_t next; - bool next_due = get_next_due(next); - - if (next_due && now >= next) { - // move to pending list - list pending; - - map< utime_t, set >::iterator it = scheduled.begin(); - while (it != scheduled.end()) { - if (it->first > now) break; - - utime_t t = it->first; - dout(DBL) << "queueing event(s) scheduled at " << t << dendl; - - for (set::iterator cit = it->second.begin(); - cit != it->second.end(); - cit++) { - pending.push_back(*cit); - event_times.erase(*cit); - num_event--; - } - - map< utime_t, set >::iterator previt = it; - it++; - scheduled.erase(previt); - } - - if (!pending.empty()) { - sleeping = false; - lock.Unlock(); - { - // make sure we're not holding any locks while we do callbacks - // make the callbacks myself. - for (list::iterator cit = pending.begin(); - cit != pending.end(); - cit++) { - dout(DBL) << "start callback " << *cit << dendl; - (*cit)->finish(0); - dout(DBL) << "finish callback " << *cit << dendl; - delete *cit; - } - pending.clear(); - assert(pending.empty()); - } - lock.Lock(); - } - - } - else { - // sleep - if (next_due) { - dout(DBL) << "sleeping until " << next << dendl; - timed_sleep = true; - sleeping = true; - timeout_cond.WaitUntil(lock, next); // wait for waker or time - utime_t now = g_clock.now(); - dout(DBL) << "kicked or timed out at " << now << dendl; - } else { - dout(DBL) << "sleeping" << dendl; - timed_sleep = false; - sleeping = true; - sleep_cond.Wait(lock); // wait for waker - utime_t now = g_clock.now(); - dout(DBL) << "kicked at " << now << dendl; - } - } - } - - lock.Unlock(); -} - - - -/** - * Timer bits - */ - -void Timer::register_timer() -{ - if (timer_thread.is_started()) { - if (sleeping) { - dout(DBL) << "register_timer kicking thread" << dendl; - if (timed_sleep) - timeout_cond.SignalAll(); - else - sleep_cond.SignalAll(); - } else { - dout(DBL) << "register_timer doing nothing; thread is awake" << dendl; - // it's probably doing callbacks. 
- } - } else { - dout(DBL) << "register_timer starting thread" << dendl; - timer_thread.create(); - } -} - -void Timer::cancel_timer() -{ - // clear my callback pointers - if (timer_thread.is_started()) { - dout(10) << "setting thread_stop flag" << dendl; - lock.Lock(); - thread_stop = true; - if (timed_sleep) - timeout_cond.SignalAll(); - else - sleep_cond.SignalAll(); - lock.Unlock(); - - dout(10) << "waiting for thread to finish" << dendl; - void *ptr; - timer_thread.join(&ptr); - - dout(10) << "thread finished, exit code " << ptr << dendl; - } -} - - -/* - * schedule - */ - - -void Timer::add_event_after(double seconds, - Context *callback) -{ - utime_t when = g_clock.now(); - when += seconds; - add_event_at(when, callback); -} - -void Timer::add_event_at(utime_t when, - Context *callback) -{ - lock.Lock(); - - dout(DBL) << "add_event " << callback << " at " << when << dendl; - - // insert - scheduled[when].insert(callback); - assert(event_times.count(callback) == 0); - event_times[callback] = when; - - num_event++; - - // make sure i wake up on time - register_timer(); - - lock.Unlock(); -} - -bool Timer::cancel_event(Context *callback) -{ - lock.Lock(); - - dout(DBL) << "cancel_event " << callback << dendl; - - if (!event_times.count(callback)) { - dout(DBL) << "cancel_event " << callback << " isn't scheduled (probably executing)" << dendl; - lock.Unlock(); - return false; // wasn't scheduled. - } - - utime_t tp = event_times[callback]; - event_times.erase(callback); - - assert(scheduled.count(tp)); - assert(scheduled[tp].count(callback)); - scheduled[tp].erase(callback); - if (scheduled[tp].empty()) - scheduled.erase(tp); - - lock.Unlock(); - - // delete the canceled event. 
- delete callback; - - return true; -} - - -// ------------------------------- - -void SafeTimer::add_event_after(double seconds, Context *c) -{ - assert(lock.is_locked()); - Context *w = new EventWrapper(this, c); - dout(DBL) << "SafeTimer.add_event_after wrapping " << c << " with " << w << dendl; - scheduled[c] = w; - g_timer.add_event_after(seconds, w); -} - -void SafeTimer::add_event_at(utime_t when, Context *c) -{ - assert(lock.is_locked()); - Context *w = new EventWrapper(this, c); - dout(DBL) << "SafeTimer.add_event_at wrapping " << c << " with " << w << dendl; - scheduled[c] = w; - g_timer.add_event_at(when, w); -} - -void SafeTimer::EventWrapper::finish(int r) -{ - timer->lock.Lock(); - if (timer->scheduled.count(actual)) { - // still scheduled. execute. - actual->finish(r); - timer->scheduled.erase(actual); - } else { - // i was canceled. - assert(timer->canceled.count(actual)); - } - - // did i get canceled? - // (this can happen even if i just executed above. e.g., i may have canceled myself.) - if (timer->canceled.count(actual)) { - timer->canceled.erase(actual); - timer->cond.Signal(); - } - - // delete the original event - delete actual; - - timer->lock.Unlock(); -} - -void SafeTimer::cancel_event(Context *c) -{ - assert(lock.is_locked()); - assert(scheduled.count(c)); - - if (g_timer.cancel_event(scheduled[c])) { - // hosed wrapper. hose original event too. - delete c; - } else { - // clean up later. 
- canceled[c] = scheduled[c]; - } - scheduled.erase(c); -} - -void SafeTimer::cancel_all() -{ - assert(lock.is_locked()); - - while (!scheduled.empty()) - cancel_event(scheduled.begin()->first); -} - -void SafeTimer::join() -{ - assert(lock.is_locked()); - assert(scheduled.empty()); - - if (!canceled.empty()) { - while (!canceled.empty()) { - // wait - dout(2) << "SafeTimer.join waiting for " << canceled.size() << " to join: " << canceled << dendl; - cond.Wait(lock); - } - dout(2) << "SafeTimer.join done" << dendl; - } -} - -SafeTimer::~SafeTimer() -{ - if (!scheduled.empty() && !canceled.empty()) { - derr(0) << "SafeTimer.~SafeTimer " << scheduled.size() << " events scheduled, " - << canceled.size() << " canceled but unflushed" - << dendl; - } -} diff --git a/branches/sage/mds/common/Timer.h b/branches/sage/mds/common/Timer.h deleted file mode 100644 index 3574833c342c3..0000000000000 --- a/branches/sage/mds/common/Timer.h +++ /dev/null @@ -1,175 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __TIMER_H -#define __TIMER_H - -#include "include/types.h" -#include "include/Context.h" -#include "Clock.h" - -#include "Mutex.h" -#include "Cond.h" -#include "Thread.h" - -#include -#include -using std::map; -using std::set; - -#include -using namespace __gnu_cxx; - - -/*** Timer - * schedule callbacks - */ - -//class Messenger; - - -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const Context *p) const { - static hash H; - return H((unsigned long)p); - } - }; -} - - -class Timer { - private: - map< utime_t, set > scheduled; // time -> (context ...) 
- hash_map< Context*, utime_t > event_times; // event -> time - - bool get_next_due(utime_t &when); - - void register_timer(); // make sure i get a callback - void cancel_timer(); // make sure i get a callback - - bool thread_stop; - Mutex lock; - bool timed_sleep; - bool sleeping; - Cond sleep_cond; - Cond timeout_cond; - - public: - void timer_entry(); // waiter thread (that wakes us up) - - class TimerThread : public Thread { - Timer *t; - public: - void *entry() { - t->timer_entry(); - return 0; - } - TimerThread(Timer *_t) : t(_t) {} - } timer_thread; - - - int num_event; - - - public: - Timer() : - thread_stop(false), - timed_sleep(false), - sleeping(false), - timer_thread(this), - num_event(0) - { - } - ~Timer() { - // stop. - cancel_timer(); - - // scheduled - for (map< utime_t, set >::iterator it = scheduled.begin(); - it != scheduled.end(); - it++) { - for (set::iterator sit = it->second.begin(); - sit != it->second.end(); - sit++) - delete *sit; - } - scheduled.clear(); - } - - void init() { - register_timer(); - } - void shutdown() { - cancel_timer(); - } - - // schedule events - void add_event_after(double seconds, - Context *callback); - void add_event_at(utime_t when, - Context *callback); - bool cancel_event(Context *callback); - - // execute pending events - void execute_pending(); - -}; - - -/* - * SafeTimer is a wrapper around the raw Timer (or rather, g_timer, it's global - * instantiation) that protects event execution with an existing mutex. It - * provides for, among other things, reliable event cancellation on class - * destruction. The caller just needs to cancel each event (or cancel_all()), - * and then call join() to ensure any concurrently exectuting events (in other - * threads) get flushed. 
- */ -class SafeTimer { - Mutex& lock; - Cond cond; - map scheduled; // actual -> wrapper - map canceled; - - class EventWrapper : public Context { - SafeTimer *timer; - Context *actual; - public: - EventWrapper(SafeTimer *st, Context *c) : timer(st), - actual(c) {} - void finish(int r); - }; - -public: - SafeTimer(Mutex& l) : lock(l) { } - ~SafeTimer(); - - void add_event_after(double seconds, Context *c); - void add_event_at(utime_t when, Context *c); - void cancel_event(Context *c); - void cancel_all(); - void join(); - - int get_num_scheduled() { return scheduled.size(); } - int get_num_canceled() { return canceled.size(); } -}; - - -// single global instance -extern Timer g_timer; - - - -#endif diff --git a/branches/sage/mds/config.cc b/branches/sage/mds/config.cc deleted file mode 100644 index f037fe728dfe4..0000000000000 --- a/branches/sage/mds/config.cc +++ /dev/null @@ -1,1039 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#include "config.h" -#include "include/types.h" -#include - -//#define MDS_CACHE_SIZE 4*10000 -> <20mb -//#define MDS_CACHE_SIZE 80000 62mb - -#define AVG_PER_INODE_SIZE 450 -#define MDS_CACHE_MB_TO_INODES(x) ((x)*1000000/AVG_PER_INODE_SIZE) - -//#define MDS_CACHE_SIZE MDS_CACHE_MB_TO_INODES( 50 ) -//#define MDS_CACHE_SIZE 1500000 -#define MDS_CACHE_SIZE 150000 - - -// hack hack hack ugly FIXME -#include "common/Mutex.h" -long buffer_total_alloc = 0; -Mutex bufferlock; - -#include "osd/osd_types.h" - -// debug output -Mutex _dout_lock; -ostream *_dout = &std::cout; -ostream *_derr = &std::cerr; - -// file layouts -struct ceph_file_layout g_OSD_FileLayout = { - fl_stripe_unit: 1<<22, - fl_stripe_count: 1, - fl_object_size: 1<<22, - fl_object_stripe_unit: 0, - fl_pg_preferred: -1, - fl_pg_type: CEPH_PG_TYPE_REP, - fl_pg_size: 2 -}; - -struct ceph_file_layout g_OSD_MDDirLayout = { - fl_stripe_unit: 1<<22, - fl_stripe_count: 1, - fl_object_size: 1<<22, - fl_object_stripe_unit: 0, - fl_pg_preferred: -1, - fl_pg_type: CEPH_PG_TYPE_REP, - fl_pg_size: 2 -}; - -struct ceph_file_layout g_OSD_MDLogLayout = { - fl_stripe_unit: 1<<20, - fl_stripe_count: 1, - fl_object_size: 1<<20, - fl_object_stripe_unit: 0, - fl_pg_preferred: -1, - fl_pg_type: CEPH_PG_TYPE_REP, - fl_pg_size: 2 -}; - -struct ceph_file_layout g_OSD_MDAnchorTableLayout = { - fl_stripe_unit: 1<<20, - fl_stripe_count: 1, - fl_object_size: 1<<20, - fl_object_stripe_unit: 0, - fl_pg_preferred: -1, - fl_pg_type: CEPH_PG_TYPE_REP, - fl_pg_size: 2 -}; - -#include - -// fake osd failures: osd -> time -std::map g_fake_kill_after; -std::map g_fake_osd_down; -std::map g_fake_osd_out; - -entity_addr_t g_my_addr; - -md_config_t g_debug_after_conf; - -md_config_t g_conf = { - num_mon: 1, - num_mds: 1, - num_osd: 4, - num_client: 1, - - mkfs: false, - - // profiling and debugging - log: true, - log_interval: 1, - log_name: (char*)0, - - log_messages: true, - log_pins: true, - - logger_calc_variance: true, - - 
dout_dir: 0, - - fake_clock: false, - fakemessenger_serialize: true, - - fake_osdmap_expand: 0, - fake_osdmap_updates: 0, - fake_osd_mttf: 0, - fake_osd_mttr: 0, - - osd_remount_at: 0, - - kill_after: 0, - - tick: 0, - - debug: 0, - debug_mds: 1, - debug_mds_balancer: 1, - debug_mds_log: 1, - debug_mds_log_expire: 1, - debug_mds_migrator: 1, - debug_buffer: 0, - debug_timer: 0, - debug_filer: 0, - debug_objecter: 0, - debug_journaler: 0, - debug_objectcacher: 0, - debug_client: 0, - debug_osd: 0, - debug_ebofs: 1, - debug_bdev: 1, // block device - debug_ns: 0, - debug_ms: 0, - debug_mon: 1, - debug_paxos: 0, - - debug_after: 0, - - // -- misc -- - use_abspaths: false, // make monitorstore et al use absolute path (to workaround FUSE chdir("/")) - - // --- clock --- - clock_lock: false, - clock_tare: false, - - // --- messenger --- - ms_tcp_nodelay: true, - ms_single_dispatch: false, - ms_requeue_on_sender_fail: false, - - ms_stripe_osds: false, - ms_skip_rank0: false, - ms_overlay_clients: false, - - ms_die_on_failure: false, - - /*tcp_skip_rank0: false, - tcp_overlay_clients: false, // over osds! 
- tcp_log: false, - tcp_serial_marshall: true, - tcp_serial_out: false, - tcp_multi_out: true, - tcp_multi_dispatch: false, // not fully implemented yet - */ - - // --- mon --- - mon_tick_interval: 5, - mon_osd_down_out_interval: 5, // seconds - mon_lease: 5, // seconds // lease interval - mon_lease_renew_interval: 3, // on leader, to renew the lease - mon_lease_ack_timeout: 10.0, // on leader, if lease isn't acked by all peons - mon_lease_timeout: 10.0, // on peon, if lease isn't extended - mon_accept_timeout: 10.0, // on leader, if paxos update isn't accepted - mon_stop_on_last_unmount: false, - mon_stop_with_last_mds: false, - mon_allow_mds_bully: true, // allow a booting mds to (forcibly) claim an mds # - - paxos_propose_interval: 1.0, // gather updates for this long before proposing a map update - - // --- client --- - client_cache_size: 1000, - client_cache_mid: .5, - client_cache_stat_ttl: 0, // seconds until cached stat results become invalid - client_cache_readdir_ttl: 1, // 1 second only - client_use_random_mds: false, - - client_sync_writes: 0, - - client_mount_timeout: 10.0, // retry every N seconds - - client_hack_balance_reads: false, - - client_trace: 0, - fuse_direct_io: 0, - fuse_ll: true, - - // --- objectcacher --- - client_oc: true, - client_oc_size: 1024*1024* 10, // MB * n - client_oc_max_dirty: 1024*1024* 10, // MB * n (dirty OR tx) - client_oc_max_sync_write: 128*1024, // synx writes >= this use wrlock - - // --- objecter --- - objecter_buffer_uncommitted: true, // this must be true for proper failure handling - objecter_map_request_interval: 15.0, // request a new map every N seconds, if we have pending io - objecter_tick_interval: 5.0, - objecter_timeout: 10.0, // before we ask for a map - - // --- journaler --- - journaler_allow_split_entries: true, - journaler_safe: false, // wait for COMMIT on journal writes - journaler_write_head_interval: 15, - journaler_cache: false, // cache writes for later readback - journaler_prefetch_periods: 
50, // * journal object size (1~MB? see above) - journaler_batch_interval: .001, // seconds.. max add'l latency we artificially incur - journaler_batch_max: 16384, // max bytes we'll delay flushing - - // --- mds --- - mds_cache_size: 300000, //MDS_CACHE_SIZE, - mds_cache_mid: .7, - - mds_decay_halflife: 5, - - mds_beacon_interval: 4, //30.0, - mds_beacon_grace: 15, //60*60.0, - - mds_log: true, - mds_log_max_events: -1, //MDS_CACHE_SIZE / 3, - mds_log_max_segments: 100, - mds_log_max_expiring: 20, - mds_log_pad_entry: 128,//256,//64, - mds_log_eopen_size: 100, // # open inodes per log entry - - mds_bal_sample_interval: 3.0, // every 5 seconds - mds_bal_replicate_threshold: 8000, - mds_bal_unreplicate_threshold: 0,//500, - mds_bal_split_size: 10000, - mds_bal_split_rd: 25000, - mds_bal_split_wr: 10000, - mds_bal_merge_size: 50, - mds_bal_merge_rd: 1000, - mds_bal_merge_wr: 1000, - mds_bal_interval: 10, // seconds - mds_bal_fragment_interval: 2, // seconds - mds_bal_idle_threshold: 0, //.1, - mds_bal_max: -1, - mds_bal_max_until: -1, - - mds_bal_mode: 0, - mds_bal_min_rebalance: .1, // must be this much above average before we export anything - mds_bal_min_start: .2, // if we need less than this, we don't do anything - mds_bal_need_min: .8, // take within this range of what we need - mds_bal_need_max: 1.2, - mds_bal_midchunk: .3, // any sub bigger than this taken in full - mds_bal_minchunk: .001, // never take anything smaller than this - - mds_trim_on_rejoin: true, - mds_shutdown_check: 0, //30, - - mds_verify_export_dirauth: true, - - mds_local_osd: false, - mds_local_osd_offset: 1000, - - mds_thrash_exports: 0, - mds_thrash_fragments: 0, - mds_dump_cache_on_map: false, - mds_dump_cache_after_rejoin: true, - - mds_hack_log_expire_for_better_stats: false, - - // --- osd --- - osd_rep: OSD_REP_PRIMARY, - - osd_balance_reads: false, // send from client to replica - osd_flash_crowd_iat_threshold: 0,//100, - osd_flash_crowd_iat_alpha: 0.125, - osd_balance_reads_temp: 
100, - - osd_shed_reads: false, // forward from primary to replica - osd_shed_reads_min_latency: .01, // min local latency - osd_shed_reads_min_latency_diff: .01, // min latency difference - osd_shed_reads_min_latency_ratio: 1.5, // 1.2 == 20% higher than peer - - osd_immediate_read_from_cache: false,//true, // osds to read from the cache immediately? - osd_exclusive_caching: true, // replicas evict replicated writes - - osd_stat_refresh_interval: .5, - - osd_pg_bits: 4, // bits per osd - osd_object_layout: CEPH_OBJECT_LAYOUT_HASHINO,//LINEAR,//HASHINO, - osd_pg_layout: CEPH_PG_LAYOUT_CRUSH,//LINEAR,//CRUSH, - osd_max_rep: 4, - osd_min_raid_width: 4, - osd_max_raid_width: 3, //6, - - osd_maxthreads: 2, // 0 == no threading - osd_max_opq: 10, - osd_mkfs: false, - osd_age: .8, - osd_age_time: 0, - osd_heartbeat_interval: 1, - osd_pg_stats_interval: 5, - osd_replay_window: 5, - osd_max_pull: 2, - osd_pad_pg_log: false, - - osd_auto_weight: false, - - osd_hack_fast_startup: false, // this breaks localized pgs. - - - // --- fakestore --- - fakestore_fake_sync: .2, // seconds - fakestore_fsync: false,//true, - fakestore_writesync: false, - fakestore_syncthreads: 4, - fakestore_fake_attrs: false, - fakestore_fake_collections: false, - fakestore_dev: 0, - - // --- ebofs --- - ebofs: 1, - ebofs_cloneable: false, - ebofs_verify: false, - ebofs_commit_ms: 1000, // 0 = no forced commit timeout (for debugging/tracing) - ebofs_idle_commit_ms: 0, // 0 = no idle detection. UGLY HACK. use bdev_idle_kick_after_ms instead. - ebofs_oc_size: 10000, // onode cache - ebofs_cc_size: 10000, // cnode cache - ebofs_bc_size: (50 *256), // 4k blocks, *256 for MB - ebofs_bc_max_dirty: (30 *256), // before write() will block - ebofs_max_prefetch: 1000, // 4k blocks - ebofs_realloc: false, // hrm, this can cause bad fragmentation, don't use! 
- - ebofs_abp_zero: false, // zero newly allocated buffers (may shut up valgrind) - ebofs_abp_max_alloc: 4096*16, // max size of new buffers (larger -> more memory fragmentation) - - // --- block device --- - bdev_lock: true, - bdev_iothreads: 1, // number of ios to queue with kernel - bdev_idle_kick_after_ms: 100, // ms - bdev_el_fw_max_ms: 10000, // restart elevator at least once every 1000 ms - bdev_el_bw_max_ms: 3000, // restart elevator at least once every 300 ms - bdev_el_bidir: false, // bidirectional elevator? - bdev_iov_max: 512, // max # iov's to collect into a single readv()/writev() call - bdev_debug_check_io_overlap: true, // [DEBUG] check for any pending io overlaps - bdev_fake_mb: 0, - bdev_fake_max_mb: 0, - - // --- fakeclient (mds regression testing) (ancient history) --- - num_fakeclient: 100, - fakeclient_requests: 100, - fakeclient_deterministic: false, - - fakeclient_op_statfs: false, - - // loosely based on Roselli workload paper numbers - fakeclient_op_stat: 610, - fakeclient_op_lstat: false, - fakeclient_op_utime: 0, - fakeclient_op_chmod: 1, - fakeclient_op_chown: 1, - - fakeclient_op_readdir: 2, - fakeclient_op_mknod: 30, - fakeclient_op_link: false, - fakeclient_op_unlink: 20, - fakeclient_op_rename: 0,//40, - - fakeclient_op_mkdir: 10, - fakeclient_op_rmdir: 20, - fakeclient_op_symlink: 20, - - fakeclient_op_openrd: 200, - fakeclient_op_openwr: 0, - fakeclient_op_openwrc: 0, - fakeclient_op_read: false, // osd! - fakeclient_op_write: false, // osd! 
- fakeclient_op_truncate: false, - fakeclient_op_fsync: false, - fakeclient_op_close: 200 - -#ifdef USE_OSBDB - , - bdbstore: false, - debug_bdbstore: 1, - bdbstore_btree: false, - bdbstore_ffactor: 0, - bdbstore_nelem: 0, - bdbstore_pagesize: 0, - bdbstore_cachesize: 0, - bdbstore_transactional: false -#endif // USE_OSBDB -}; - - -#include -#include - - -void env_to_vec(std::vector& args) -{ - const char *p = getenv("CEPH_ARGS"); - if (!p) return; - - static char buf[1000]; - int len = strlen(p); - memcpy(buf, p, len); - buf[len] = 0; - //cout << "CEPH_ARGS " << buf << endl; - - int l = 0; - for (int i=0; i& args) -{ - for (int i=1; i& args, - int& argc, char **&argv) -{ - argv = (char**)malloc(sizeof(char*) * argc); - argc = 1; - argv[0] = "asdf"; - - for (unsigned i=0; i= '0' && *s <= '9') { - int digit = *s - '0'; - //cout << "digit " << digit << endl; - val *= 10; - val += digit; - numdigits++; - s++; off++; - } - //cout << "val " << val << endl; - - if (numdigits == 0) { - cerr << "no digits at off " << off << std::endl; - return false; // no digits - } - if (count < 3 && *s != '.') { - cerr << "should period at " << off << std::endl; - return false; // should have 3 periods - } - s++; off++; - - if (count <= 3) - a.v.ipq[count] = val; - else - a.v.port = val; - - count++; - if (count == 4 && *s != ':') break; - if (count == 5) break; - } - - return true; -} - - - -void parse_config_options(std::vector& args) -{ - std::vector nargs; - - for (unsigned i=0; iis_open()) { - std::cerr << "error opening output file " << fn << std::endl; - delete out; - } else { - _dout = out; - } - } - - args = nargs; -} diff --git a/branches/sage/mds/config.h b/branches/sage/mds/config.h deleted file mode 100644 index b5cdf6cbd586d..0000000000000 --- a/branches/sage/mds/config.h +++ /dev/null @@ -1,418 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 
2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __CONFIG_H -#define __CONFIG_H - -extern struct ceph_file_layout g_OSD_FileLayout; -extern struct ceph_file_layout g_OSD_MDDirLayout; -extern struct ceph_file_layout g_OSD_MDLogLayout; -extern struct ceph_file_layout g_OSD_MDAnchorTableLayout; - -#include -#include - -#include "common/Mutex.h" - -extern std::map g_fake_osd_down; -extern std::map g_fake_osd_out; - -#define OSD_REP_PRIMARY 0 -#define OSD_REP_SPLAY 1 -#define OSD_REP_CHAIN 2 - - -#include "msg/msg_types.h" - -extern entity_addr_t g_my_addr; - -struct md_config_t { - int num_mon; - int num_mds; - int num_osd; - int num_client; - - bool mkfs; - - // profiling - bool log; - int log_interval; - char *log_name; - - bool log_messages; - bool log_pins; - - bool logger_calc_variance; - - char *dout_dir; - - bool fake_clock; - bool fakemessenger_serialize; - - int fake_osdmap_expand; - int fake_osdmap_updates; - int fake_osd_mttf; - int fake_osd_mttr; - - int osd_remount_at; - - int kill_after; - - int tick; - - int debug; - int debug_mds; - int debug_mds_balancer; - int debug_mds_log; - int debug_mds_log_expire; - int debug_mds_migrator; - int debug_buffer; - int debug_timer; - int debug_filer; - int debug_objecter; - int debug_journaler; - int debug_objectcacher; - int debug_client; - int debug_osd; - int debug_ebofs; - int debug_bdev; - int debug_ns; - int debug_ms; - int debug_mon; - int debug_paxos; - - int debug_after; - - // misc - bool use_abspaths; - - // clock - bool clock_lock; - bool clock_tare; - - // messenger - - /*bool tcp_skip_rank0; - bool tcp_overlay_clients; - bool tcp_log; - bool tcp_serial_marshall; - bool tcp_serial_out; - bool tcp_multi_out; - bool tcp_multi_dispatch; - */ - - bool ms_tcp_nodelay; - bool ms_single_dispatch; - bool 
ms_requeue_on_sender_fail; - - bool ms_stripe_osds; - bool ms_skip_rank0; - bool ms_overlay_clients; - bool ms_die_on_failure; - - // mon - int mon_tick_interval; - int mon_osd_down_out_interval; - float mon_lease; - float mon_lease_renew_interval; - float mon_lease_ack_timeout; - float mon_lease_timeout; - float mon_accept_timeout; - bool mon_stop_on_last_unmount; - bool mon_stop_with_last_mds; - bool mon_allow_mds_bully; - - double paxos_propose_interval; - - // client - int client_cache_size; - float client_cache_mid; - int client_cache_stat_ttl; - int client_cache_readdir_ttl; - bool client_use_random_mds; // debug flag - - bool client_sync_writes; - - double client_mount_timeout; - - // hack - bool client_hack_balance_reads; - - - /* - bool client_bcache; - int client_bcache_alloc_minsize; - int client_bcache_alloc_maxsize; - int client_bcache_ttl; - off_t client_bcache_size; - int client_bcache_lowater; - int client_bcache_hiwater; - size_t client_bcache_align; - */ - - char *client_trace; - int fuse_direct_io; - bool fuse_ll; - - // objectcacher - bool client_oc; - int client_oc_size; - int client_oc_max_dirty; - size_t client_oc_max_sync_write; - - // objecter - bool objecter_buffer_uncommitted; - double objecter_map_request_interval; - double objecter_tick_interval; - double objecter_timeout; - - // journaler - bool journaler_allow_split_entries; - bool journaler_safe; - int journaler_write_head_interval; - bool journaler_cache; - int journaler_prefetch_periods; - double journaler_batch_interval; - size_t journaler_batch_max; - - // mds - int mds_cache_size; - float mds_cache_mid; - - float mds_decay_halflife; - - float mds_beacon_interval; - float mds_beacon_grace; - - bool mds_log; - int mds_log_max_events; - int mds_log_max_segments; - int mds_log_max_expiring; - int mds_log_pad_entry; - int mds_log_eopen_size; - - float mds_bal_sample_interval; - float mds_bal_replicate_threshold; - float mds_bal_unreplicate_threshold; - int mds_bal_split_size; - float 
mds_bal_split_rd; - float mds_bal_split_wr; - int mds_bal_merge_size; - float mds_bal_merge_rd; - float mds_bal_merge_wr; - int mds_bal_interval; - int mds_bal_fragment_interval; - float mds_bal_idle_threshold; - int mds_bal_max; - int mds_bal_max_until; - - int mds_bal_mode; - float mds_bal_min_rebalance; - float mds_bal_min_start; - float mds_bal_need_min; - float mds_bal_need_max; - float mds_bal_midchunk; - float mds_bal_minchunk; - - bool mds_trim_on_rejoin; - int mds_shutdown_check; - - bool mds_verify_export_dirauth; // debug flag - - bool mds_local_osd; - int mds_local_osd_offset; - - int mds_thrash_exports; - int mds_thrash_fragments; - bool mds_dump_cache_on_map; - bool mds_dump_cache_after_rejoin; - - bool mds_hack_log_expire_for_better_stats; - - // osd - int osd_rep; - - bool osd_balance_reads; - int osd_flash_crowd_iat_threshold; // flash crowd interarrival time threshold in ms - double osd_flash_crowd_iat_alpha; - double osd_balance_reads_temp; - - int osd_shed_reads; - double osd_shed_reads_min_latency; - double osd_shed_reads_min_latency_diff; - double osd_shed_reads_min_latency_ratio; - - bool osd_immediate_read_from_cache; - bool osd_exclusive_caching; - double osd_stat_refresh_interval; - - int osd_pg_bits; - int osd_object_layout; - int osd_pg_layout; - int osd_max_rep; - int osd_min_raid_width; - int osd_max_raid_width; - int osd_maxthreads; - int osd_max_opq; - bool osd_mkfs; - float osd_age; - int osd_age_time; - int osd_heartbeat_interval; - int osd_pg_stats_interval; - int osd_replay_window; - int osd_max_pull; - bool osd_pad_pg_log; - - bool osd_auto_weight; - - bool osd_hack_fast_startup; - - double fakestore_fake_sync; - bool fakestore_fsync; - bool fakestore_writesync; - int fakestore_syncthreads; // such crap - bool fakestore_fake_attrs; - bool fakestore_fake_collections; - char *fakestore_dev; - - // ebofs - int ebofs; - bool ebofs_cloneable; - bool ebofs_verify; - int ebofs_commit_ms; - int ebofs_idle_commit_ms; - int ebofs_oc_size; 
- int ebofs_cc_size; - off_t ebofs_bc_size; - off_t ebofs_bc_max_dirty; - unsigned ebofs_max_prefetch; - bool ebofs_realloc; - - bool ebofs_abp_zero; - size_t ebofs_abp_max_alloc; - - // block device - bool bdev_lock; - int bdev_iothreads; - int bdev_idle_kick_after_ms; - int bdev_el_fw_max_ms; - int bdev_el_bw_max_ms; - bool bdev_el_bidir; - int bdev_iov_max; - bool bdev_debug_check_io_overlap; - int bdev_fake_mb; - int bdev_fake_max_mb; - - // fake client - int num_fakeclient; - unsigned fakeclient_requests; - bool fakeclient_deterministic; // debug flag - - int fakeclient_op_statfs; - - int fakeclient_op_stat; - int fakeclient_op_lstat; - int fakeclient_op_utime; - int fakeclient_op_chmod; - int fakeclient_op_chown; - - int fakeclient_op_readdir; - int fakeclient_op_mknod; - int fakeclient_op_link; - int fakeclient_op_unlink; - int fakeclient_op_rename; - - int fakeclient_op_mkdir; - int fakeclient_op_rmdir; - int fakeclient_op_symlink; - - int fakeclient_op_openrd; - int fakeclient_op_openwr; - int fakeclient_op_openwrc; - int fakeclient_op_read; - int fakeclient_op_write; - int fakeclient_op_truncate; - int fakeclient_op_fsync; - int fakeclient_op_close; - -#ifdef USE_OSBDB - bool bdbstore; - int debug_bdbstore; - bool bdbstore_btree; - int bdbstore_ffactor; - int bdbstore_nelem; - int bdbstore_pagesize; - int bdbstore_cachesize; - bool bdbstore_transactional; -#endif // USE_OSBDB -}; - -extern md_config_t g_conf; -extern md_config_t g_debug_after_conf; - - -/** - * command line / environment argument parsing - */ -void env_to_vec(std::vector& args); -void argv_to_vec(int argc, char **argv, - std::vector& args); -void vec_to_argv(std::vector& args, - int& argc, char **&argv); - -void parse_config_options(std::vector& args); - -extern bool parse_ip_port(const char *s, entity_addr_t& addr); - - -/** - * for cleaner output, bracket each line with - * dbeginl (in the dout macro) and dendl (in place of endl). 
- */ -extern Mutex _dout_lock; -struct _dbeginl_t { _dbeginl_t(int) {} }; -struct _dendl_t { _dendl_t(int) {} }; -static const _dbeginl_t dbeginl = 0; -static const _dendl_t dendl = 0; - -// intentionally conflict with endl -class _bad_endl_use_dendl_t { public: _bad_endl_use_dendl_t(int) {} }; -static const _bad_endl_use_dendl_t endl = 0; - -inline ostream& operator<<(ostream& out, _dbeginl_t) { - _dout_lock.Lock(); - return out; -} -inline ostream& operator<<(ostream& out, _dendl_t) { - out << std::endl; - _dout_lock.Unlock(); - return out; -} -inline ostream& operator<<(ostream& out, _bad_endl_use_dendl_t) { - assert(0 && "you are using the wrong endl.. use std::endl or dendl"); - return out; -} - -// the streams -extern ostream *_dout; -extern ostream *_derr; - -// generic macros -#define generic_dout(x) if ((x) <= g_conf.debug) *_dout << dbeginl -#define generic_derr(x) if ((x) <= g_conf.debug) *_derr << dbeginl - -#define pdout(x,p) if ((x) <= (p)) *_dout << dbeginl - - -#endif diff --git a/branches/sage/mds/cosd.cc b/branches/sage/mds/cosd.cc deleted file mode 100644 index e575c72836e69..0000000000000 --- a/branches/sage/mds/cosd.cc +++ /dev/null @@ -1,135 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include -#include - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" - -#include "osd/OSD.h" -#include "ebofs/Ebofs.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << std::endl; - exit(1); - } -}; - -class C_Debug : public Context { - public: - void finish(int) { - int size = &g_conf.debug_after - &g_conf.debug; - memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size); - cout << "debug_after flipping debug settings" << std::endl; - } -}; - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - parse_config_options(args); - - if (g_conf.kill_after) - g_timer.add_event_after(g_conf.kill_after, new C_Die); - if (g_conf.debug_after) - g_timer.add_event_after(g_conf.debug_after, new C_Debug); - - if (g_conf.clock_tare) g_clock.tare(); - - // osd specific args - char *dev = 0; - char dev_default[20]; - int whoami = -1; - for (unsigned i=0; imount(); - int r = store->read(object_t(0,0), 0, sizeof(sb), bl); - if (r < 0) { - cerr << "couldn't read superblock object on " << dev << std::endl; - exit(0); - } - bl.copy(0, sizeof(sb), (char*)&sb); - store->umount(); - delete store; - whoami = sb.whoami; - - cout << "osd fs says i am osd" << whoami << std::endl; - } else { - cout << "command line arg says i am osd" << whoami << std::endl; - } - - // load monmap - MonMap monmap; - int r = monmap.read(".ceph_monmap"); - assert(r >= 0); - - // start up network - rank.start_rank(); - - // start osd - Messenger *m = rank.register_entity(entity_name_t::OSD(whoami)); - assert(m); - OSD *osd = new OSD(whoami, m, &monmap, dev); - osd->init(); - - // wait - rank.wait(); - - // done - delete osd; - - return 0; -} - diff --git a/branches/sage/mds/crush/BinaryTree.h b/branches/sage/mds/crush/BinaryTree.h deleted file mode 100644 index 7573fc02ed6dc..0000000000000 --- 
a/branches/sage/mds/crush/BinaryTree.h +++ /dev/null @@ -1,285 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __crush_BINARYTREE_H -#define __crush_BINARYTREE_H - -#include -#include -#include -#include -using std::map; -using std::vector; - -#include "include/buffer.h" - -namespace crush { - - class BinaryTree { - private: - // tree def - int root_node; // 0 for empty tree. - int alloc; - vector node_nested; // all existing nodes in this map - vector node_weight; // and this one - vector node_complete; // only nodes with all possible children - - public: - BinaryTree() : root_node(0), alloc(0) {} - - void _encode(bufferlist& bl) { - bl.append((char*)&root_node, sizeof(root_node)); - bl.append((char*)&alloc, sizeof(alloc)); - ::_encode(node_nested, bl); - ::_encode(node_weight, bl); - ::_encode(node_complete, bl); - } - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(root_node), (char*)&root_node); - off += sizeof(root_node); - bl.copy(off, sizeof(alloc), (char*)&alloc); - off += sizeof(alloc); - ::_decode(node_nested, bl, off); - ::_decode(node_weight, bl, off); - ::_decode(node_complete, bl, off); - } - - // accessors - bool empty() const { return root_node == 0; } - bool exists(int n) const { return n < alloc && node_nested[n]; } - int nested(int n) const { return exists(n) ? node_nested[n]:0; } - float weight(int n) const { return exists(n) ? node_weight[n]:0; } - bool complete(int n) const { return exists(n) ? 
node_complete[n]:false; } - - int root() const { return root_node; } - - void realloc(int n) { - /* - while (alloc <= n) { - node_nested.push_back(0); - node_weight.push_back(0); - node_complete.push_back(0); - alloc++; - } - */ - if (alloc <= n) { - int add = n - alloc + 1; - node_nested.insert(node_nested.end(), add, 0); - node_weight.insert(node_weight.end(), add, 0); - node_complete.insert(node_complete.end(), add, 0); - alloc = n+1; - } - } - - // tree navigation - bool terminal(int n) const { return n & 1; } // odd nodes are leaves. - int height(int n) const { - assert(n); - int h = 0; - while ((n & 1) == 0) { - assert(n > 0); - h++; n = n >> 1; - } - return h; - } - int left(int n) const { - int h = height(n); - //cout << "left of " << n << " is " << (n - (1 << h)) << std::endl; - return n - (1 << (h-1)); - } - int right(int n) const { - int h = height(n); - //cout << "right of " << n << " is " << (n + (1 << h)) << std::endl; - return n + (1 << (h-1)); - } - bool on_right(int n, int h = -1) const { - if (h < 0) h = height(n); - return n & (1 << (h+1)); - } - bool on_left(int n) const { return !on_right(n); } - int parent(int n) const { - int h = height(n); - if (on_right(n, h)) - return n - (1<0; t--) out << " "; - if (tree.root() == n) - out << "root "; - else { - if (tree.on_left(n)) - out << "left "; - else - out << "right "; - } - out << n << " : nested " << tree.nested(n) << " weight " << tree.weight(n); - if (tree.complete(n)) out << " complete"; - out << std::endl; - if (!tree.terminal(n)) { - if (tree.exists(tree.left(n))) - print_binary_tree_node(out, tree, tree.left(n), i+2); - if (tree.exists(tree.right(n))) - print_binary_tree_node(out, tree, tree.right(n), i+2); - } - } - - inline ostream& operator<<(ostream& out, const BinaryTree& tree) { - if (tree.empty()) - return out << "tree is empty"; - print_binary_tree_node(out, tree, tree.root(), 0); - return out; - } - -} - -#endif diff --git a/branches/sage/mds/crush/Bucket.h 
b/branches/sage/mds/crush/Bucket.h deleted file mode 100644 index 81a2576697bd7..0000000000000 --- a/branches/sage/mds/crush/Bucket.h +++ /dev/null @@ -1,632 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __crush_BUCKET_H -#define __crush_BUCKET_H - -#include "BinaryTree.h" -#include "Hash.h" - -#include -#include -#include -#include -using namespace std; - -#include - -#include "include/buffer.h" - -namespace crush { - - - const int CRUSH_BUCKET_UNIFORM = 1; - const int CRUSH_BUCKET_TREE = 2; - const int CRUSH_BUCKET_LIST = 3; - const int CRUSH_BUCKET_STRAW = 4; - - /** abstract bucket **/ - class Bucket { - protected: - int id; - int parent; - int type; - float weight; - - public: - Bucket(int _type, - float _weight) : - id(0), parent(0), - type(_type), - weight(_weight) { } - - Bucket(bufferlist& bl, int& off) { - bl.copy(off, sizeof(id), (char*)&id); - off += sizeof(id); - bl.copy(off, sizeof(parent), (char*)&parent); - off += sizeof(parent); - bl.copy(off, sizeof(type), (char*)&type); - off += sizeof(type); - bl.copy(off, sizeof(weight), (char*)&weight); - off += sizeof(weight); - } - - virtual ~Bucket() { } - - virtual const char *get_bucket_type() const = 0; - virtual bool is_uniform() const = 0; - - int get_id() const { return id; } - int get_type() const { return type; } - float get_weight() const { return weight; } - int get_parent() const { return parent; } - virtual int get_size() const = 0; - - void set_id(int i) { id = i; } - void set_parent(int p) { parent = p; } - void set_weight(float w) { weight = w; } - - virtual void get_items(vector& i) const = 0; - virtual float 
get_item_weight(int item) const = 0; - virtual void add_item(int item, float w, bool back=false) = 0; - virtual void adjust_item_weight(int item, float w) = 0; - virtual void set_item_weight(int item, float w) { - adjust_item_weight(item, w - get_item_weight(item)); - } - - virtual int choose_r(int x, int r, Hash& h) const = 0; - - virtual void _encode(bufferlist& bl) = 0; - }; - - - - - /** uniform bucket **/ - class UniformBucket : public Bucket { - protected: - public: - vector items; - int item_type; - float item_weight; - - // primes - vector primes; - - int get_prime(int j) const { - return primes[ j % primes.size() ]; - } - void make_primes() { - if (items.empty()) return; - - //cout << "make_primes " << get_id() << " " << items.size() << endl; - Hash h(123+get_id()); - primes.clear(); - - // start with odd number > num_items - unsigned x = items.size() + 1; // this is the minimum! - x += h(items.size()) % (3*items.size()); // bump it up some - x |= 1; // make it odd - - while (primes.size() < items.size()) { - unsigned j; - for (j=2; j*j<=x; j++) - if (x % j == 0) break; - if (j*j > x) { - primes.push_back(x); - //cout << "prime " << x << endl; - } - x += 2; - } - } - - public: - UniformBucket(int _type, int _item_type) : - Bucket(_type, 0), - item_type(_item_type) { } - UniformBucket(int _type, int _item_type, - float _item_weight, vector& _items) : - Bucket(_type, _item_weight*_items.size()), - item_type(_item_type), - item_weight(_item_weight) { - items = _items; - make_primes(); - } - - UniformBucket(bufferlist& bl, int& off) : Bucket(bl, off) { - bl.copy(off, sizeof(item_type), (char*)&item_type); - off += sizeof(item_type); - bl.copy(off, sizeof(item_weight), (char*)&item_weight); - off += sizeof(item_weight); - ::_decode(items, bl, off); - make_primes(); - } - - void _encode(bufferlist& bl) { - char t = CRUSH_BUCKET_UNIFORM; - bl.append((char*)&t, sizeof(t)); - bl.append((char*)&id, sizeof(id)); - bl.append((char*)&parent, sizeof(parent)); - 
bl.append((char*)&type, sizeof(type)); - bl.append((char*)&weight, sizeof(weight)); - - bl.append((char*)&item_type, sizeof(item_type)); - bl.append((char*)&item_weight, sizeof(item_weight)); - - ::_encode(items, bl); - } - - const char *get_bucket_type() const { return "uniform"; } - bool is_uniform() const { return true; } - - int get_size() const { return items.size(); } - - // items - void get_items(vector& i) const { - i = items; - } - int get_item_type() const { return item_type; } - float get_item_weight(int item) const { return item_weight; } - - void add_item(int item, float w, bool back=false) { - if (items.empty()) - item_weight = w; - items.push_back(item); - weight += item_weight; - make_primes(); - } - - void adjust_item_weight(int item, float w) { - assert(0); - } - - int choose_r(int x, int r, Hash& hash) const { - //cout << "uniformbucket.choose_r(" << x << ", " << r << ")" << endl; - //if (r >= get_size()) cout << "warning: r " << r << " >= " << get_size() << " uniformbucket.size" << endl; - - unsigned v = hash(x, get_id());// % get_size(); - unsigned p = get_prime( hash(get_id(), x) ); // choose a prime based on hash(x, get_id(), 2) - unsigned s = (x + v + (r+1)*p) % get_size(); - return items[s]; - } - - }; - - - - - - // list bucket.. 
RUSH_P sorta - - class ListBucket : public Bucket { - protected: - list items; - list item_weight; - list sum_weight; - - public: - ListBucket(int _type) : Bucket(_type, 0) { } - - ListBucket(bufferlist& bl, int& off) : Bucket(bl, off) { - ::_decode(items, bl, off); - ::_decode(item_weight, bl, off); - ::_decode(sum_weight, bl, off); - } - - void _encode(bufferlist& bl) { - char t = CRUSH_BUCKET_LIST; - bl.append((char*)&t, sizeof(t)); - bl.append((char*)&id, sizeof(id)); - bl.append((char*)&parent, sizeof(parent)); - bl.append((char*)&type, sizeof(type)); - bl.append((char*)&weight, sizeof(weight)); - - ::_encode(items, bl); - ::_encode(item_weight, bl); - ::_encode(sum_weight, bl); - } - - const char *get_bucket_type() const { return "list"; } - bool is_uniform() const { return false; } - - int get_size() const { return items.size(); } - - void get_items(vector& i) const { - for (list::const_iterator it = items.begin(); - it != items.end(); - it++) - i.push_back(*it); - } - float get_item_weight(int item) const { - list::const_iterator i = items.begin(); - list::const_iterator w = item_weight.begin(); - while (i != items.end()) { - if (*i == item) return *w; - i++; w++; - } - assert(0); - return 0; - } - - void add_item(int item, float w, bool back=false) { - if (back) { - items.push_back(item); - item_weight.push_back(w); - sum_weight.clear(); - float s = 0.0; - for (list::reverse_iterator i = item_weight.rbegin(); - i != item_weight.rend(); - i++) { - s += *i; - sum_weight.push_front(s); - } - weight += w; - assert(weight == s); - } else { - items.push_front(item); - item_weight.push_front(w); - weight += w; - sum_weight.push_front(weight); - } - } - - void adjust_item_weight(int item, float dw) { - // find it - list::iterator p = items.begin(); - list::iterator pw = item_weight.begin(); - list::iterator ps = sum_weight.begin(); - - while (*p != item) { - *ps += dw; - p++; pw++; ps++; // next! 
- assert(p != items.end()); - } - - assert(*p == item); - *pw += dw; - *ps += dw; - } - - - int choose_r(int x, int r, Hash& h) const { - //cout << "linearbucket.choose_r(" << x << ", " << r << ")" << endl; - - list::const_iterator p = items.begin(); - list::const_iterator pw = item_weight.begin(); - list::const_iterator ps = sum_weight.begin(); - - while (p != items.end()) { - const int item = *p; - const float iw = *pw; - const float tw = *ps; - const float f = (float)(h(x, item, r, get_id()) % 10000) * tw / 10000.0; - //cout << "item " << item << " iw = " << iw << " tw = " << tw << " f = " << f << endl; - if (f < iw) { - //cout << "linearbucket.choose_r(" << x << ", " << r << ") = " << item << endl; - return item; - } - p++; pw++; ps++; // next! - } - assert(0); - return 0; - } - - - }; - - - - - // mixed bucket, based on RUSH_T type binary tree - - class TreeBucket : public Bucket { - protected: - //vector item_weight; - - // public: - BinaryTree tree; - map node_item; // node id -> item - vector node_item_vec; // fast version of above - map item_node; // item -> node id - map item_weight; - - public: - TreeBucket(int _type) : Bucket(_type, 0) { } - - TreeBucket(bufferlist& bl, int& off) : Bucket(bl, off) { - tree._decode(bl, off); - - ::_decode(node_item, bl, off); - ::_decode(node_item_vec, bl, off); - ::_decode(item_node, bl, off); - ::_decode(item_weight, bl, off); - } - - void _encode(bufferlist& bl) { - char t = CRUSH_BUCKET_TREE; - bl.append((char*)&t, sizeof(t)); - bl.append((char*)&id, sizeof(id)); - bl.append((char*)&parent, sizeof(parent)); - bl.append((char*)&type, sizeof(type)); - bl.append((char*)&weight, sizeof(weight)); - - tree._encode(bl); - - ::_encode(node_item, bl); - ::_encode(node_item_vec, bl); - ::_encode(item_node, bl); - ::_encode(item_weight, bl); - } - - const char *get_bucket_type() const { return "tree"; } - bool is_uniform() const { return false; } - - int get_size() const { return node_item.size(); } - - // items - void 
get_items(vector& i) const { - for (map::const_iterator it = node_item.begin(); - it != node_item.end(); - it++) - i.push_back(it->second); - } - float get_item_weight(int i) const { - assert(item_weight.count(i)); - return ((map)item_weight)[i]; - } - - - void add_item(int item, float w, bool back=false) { - item_weight[item] = w; - weight += w; - - unsigned n = tree.add_node(w); - node_item[n] = item; - item_node[item] = n; - - while (node_item_vec.size() <= n) - node_item_vec.push_back(0); - node_item_vec[n] = item; - } - - void adjust_item_weight(int item, float dw) { - // adjust my weight - weight += dw; - item_weight[item] += dw; - - // adjust tree weights - tree.adjust_node_weight(item_node[item], dw); - } - - int choose_r(int x, int r, Hash& h) const { - //cout << "mixedbucket.choose_r(" << x << ", " << r << ")" << endl; - int n = tree.root(); - while (!tree.terminal(n)) { - // pick a point in [0,w) - float w = tree.weight(n); - float f = (float)(h(x, n, r, get_id()) % 10000) * w / 10000.0; - - // left or right? - int l = tree.left(n); - if (tree.exists(l) && - f < tree.weight(l)) - n = l; - else - n = tree.right(n); - } - //assert(node_item.count(n)); - //return ((map)node_item)[n]; - return node_item_vec[n]; - } - }; - - - - - - // straw bucket.. new thing! 
- - class StrawBucket : public Bucket { - protected: - map item_weight; - map item_straw; - - list _items; - list _straws; - - public: - StrawBucket(int _type) : Bucket(_type, 0) { } - - StrawBucket(bufferlist& bl, int& off) : Bucket(bl, off) { - ::_decode(item_weight, bl, off); - calc_straws(); - } - - void _encode(bufferlist& bl) { - char t = CRUSH_BUCKET_TREE; - bl.append((char*)&t, sizeof(t)); - bl.append((char*)&id, sizeof(id)); - bl.append((char*)&parent, sizeof(parent)); - bl.append((char*)&type, sizeof(type)); - bl.append((char*)&weight, sizeof(weight)); - - ::_encode(item_weight, bl); - } - - const char *get_bucket_type() const { return "straw"; } - bool is_uniform() const { return false; } - - int get_size() const { return item_weight.size(); } - - - // items - void get_items(vector& i) const { - for (map::const_iterator it = item_weight.begin(); - it != item_weight.end(); - it++) - i.push_back(it->first); - } - float get_item_weight(int item) const { - assert(item_weight.count(item)); - return ((map)item_weight)[item]; - } - - void add_item(int item, float w, bool back=false) { - item_weight[item] = w; - weight += w; - calc_straws(); - } - - void adjust_item_weight(int item, float dw) { - //cout << "adjust " << item << " " << dw << endl; - weight += dw; - item_weight[item] += dw; - calc_straws(); - } - - - /* calculate straw lengths. - this is kind of ugly. not sure if there's a closed form way to calculate this or not! 
- */ - void calc_straws() { - //cout << get_id() << ": calc_straws ============" << endl; - - item_straw.clear(); - _items.clear(); - _straws.clear(); - - // reverse sort by weight; skip zero weight items - map > reverse; - for (map::iterator p = item_weight.begin(); - p != item_weight.end(); - p++) { - //cout << get_id() << ":" << p->first << " " << p->second << endl; - if (p->second > 0) { - //p->second /= minw; - reverse[p->second].insert(p->first); - } - } - - /* 1:2:7 - item_straw[0] = 1.0; - item_straw[1] = item_straw[0]*sqrt(1.0/.6); - item_straw[2] = item_straw[1]*2.0; - */ - - // work from low to high weights - float straw = 1.0; - float numleft = item_weight.size(); - float wbelow = 0.0; - float lastw = 0.0; - - map >::iterator next = reverse.begin(); - //while (next != reverse.end()) { - while (1) { - //cout << "hi " << next->first << endl; - map >::iterator cur = next; - - // set straw length for this set - for (set::iterator s = cur->second.begin(); - s != cur->second.end(); - s++) { - item_straw[*s] = straw; - //cout << "straw " << *s << " w " << item_weight[*s] << " -> " << straw << endl; - _items.push_back(*s); - _straws.push_back(straw); - } - - next++; - if (next == reverse.end()) break; - - wbelow += (cur->first-lastw) * numleft; - //cout << "wbelow " << wbelow << endl; - - numleft -= 1.0 * (float)cur->second.size(); - //cout << "numleft now " << numleft << endl; - - float wnext = numleft * (next->first - cur->first); - //cout << "wnext " << wnext << endl; - - float pbelow = wbelow / (wbelow+wnext); - //cout << "pbelow " << pbelow << endl; - - straw *= pow((double)(1.0/pbelow), (double)1.0/numleft); - - lastw = cur->first; - } - //cout << "============" << endl; - } - - int choose_r(int x, int r, Hash& h) const { - //cout << "strawbucket.choose_r(" << x << ", " << r << ")" << endl; - - float high_draw = -1; - int high = 0; - - list::const_iterator pi = _items.begin(); - list::const_iterator ps = _straws.begin(); - while (pi != _items.end()) { - 
const int item = *pi; - const float rnd = (float)(h(x, item, r) % 1000000) / 1000000.0; - const float straw = *ps * rnd; - - if (high_draw < 0 || - straw > high_draw) { - high = *pi; - high_draw = straw; - } - - pi++; - ps++; - } - return high; - } - }; - - - - - - inline Bucket* decode_bucket(bufferlist& bl, int& off) { - char t; - bl.copy(off, sizeof(t), (char*)&t); - off += sizeof(t); - - switch (t) { - case CRUSH_BUCKET_UNIFORM: - return new UniformBucket(bl, off); - case CRUSH_BUCKET_LIST: - return new ListBucket(bl, off); - case CRUSH_BUCKET_TREE: - return new TreeBucket(bl, off); - case CRUSH_BUCKET_STRAW: - return new StrawBucket(bl, off); - default: - assert(0); - } - return 0; - } - - - -} - - - - - - - - -#endif diff --git a/branches/sage/mds/crush/Hash.h b/branches/sage/mds/crush/Hash.h deleted file mode 100644 index 2f0d9e4db918b..0000000000000 --- a/branches/sage/mds/crush/Hash.h +++ /dev/null @@ -1,301 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -// Robert Jenkins' function for mixing 32-bit values -// http://burtleburtle.net/bob/hash/evahash.html -// a, b = random bits, c = input and output -#define hashmix(a,b,c) \ - a=a-b; a=a-c; a=a^(c>>13); \ - b=b-c; b=b-a; b=b^(a<<8); \ - c=c-a; c=c-b; c=c^(b>>13); \ - a=a-b; a=a-c; a=a^(c>>12); \ - b=b-c; b=b-a; b=b^(a<<16); \ - c=c-a; c=c-b; c=c^(b>>5); \ - a=a-b; a=a-c; a=a^(c>>3); \ - b=b-c; b=b-a; b=b^(a<<10); \ - c=c-a; c=c-b; c=c^(b>>15); - -namespace crush { - - class Hash { - int seed; - - public: - int get_seed() { return seed; } - void set_seed(int s) { seed = s; } - - Hash(int s) { - unsigned int hash = 1315423911; - int x = 231232; - int y = 1232; - hashmix(s, x, hash); - hashmix(y, s, hash); - seed = s; - } - - inline int operator()(int a) { - unsigned int hash = seed ^ a; - int b = a; - int x = 231232; - int y = 1232; - hashmix(b, x, hash); - hashmix(y, a, hash); - return (hash & 0x7FFFFFFF); - } - - inline int operator()(int a, int b) { - unsigned int hash = seed ^ a ^ b; - int x = 231232; - int y = 1232; - hashmix(a, b, hash); - hashmix(x, a, hash); - hashmix(b, y, hash); - return (hash & 0x7FFFFFFF); - } - - inline int operator()(int a, int b, int c) { - unsigned int hash = seed ^ a ^ b ^ c; - int x = 231232; - int y = 1232; - hashmix(a, b, hash); - hashmix(c, x, hash); - hashmix(y, a, hash); - hashmix(b, x, hash); - hashmix(y, c, hash); - return (hash & 0x7FFFFFFF); - } - - inline int operator()(int a, int b, int c, int d) { - unsigned int hash = seed ^a ^ b ^ c ^ d; - int x = 231232; - int y = 1232; - hashmix(a, b, hash); - hashmix(c, d, hash); - hashmix(a, x, hash); - hashmix(y, b, hash); - hashmix(c, x, hash); - hashmix(y, d, hash); - return (hash & 0x7FFFFFFF); - } - - inline int operator()(int a, int b, int c, int d, int e) { - unsigned int hash = seed ^ a ^ b ^ c ^ d ^ e; - int x = 231232; - int y = 1232; - hashmix(a, b, hash); - hashmix(c, d, hash); - hashmix(e, x, hash); - hashmix(y, a, hash); - hashmix(b, x, hash); - 
hashmix(y, c, hash); - hashmix(d, x, hash); - hashmix(y, e, hash); - return (hash & 0x7FFFFFFF); - } - }; - -} - - - -#if 0 - - - //return myhash(a) ^ seed; - return myhash(a, seed); - } - int operator()(int a, int b) { - //return myhash( myhash(a) ^ myhash(b) ^ seed ); - return myhash(a, b, seed); - } - int operator()(int a, int b, int c) { - //return myhash( myhash(a ^ seed) ^ myhash(b ^ seed) ^ myhash(c ^ seed) ^ seed ); - return myhash(a, b, c, seed); - } - int operator()(int a, int b, int c, int d) { - //return myhash( myhash(a ^ seed) ^ myhash(b ^ seed) ^ myhash(c ^ seed) ^ myhash(d ^ seed) ^ seed ); - return myhash(a, b, c, d, seed); - } - - // ethan's rush hash? - if (0) - return (n ^ 0xdead1234) * (884811920 * 3 + 1); - - if (1) { - - // before - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - - //return hash; - return (hash & 0x7FFFFFFF); - } - - // JS - // a little better than RS - // + jenkin's mixing thing (which sucks on its own but helps tons here) - // best so far - if (1) { - unsigned int hash = 1315423911; - int a = 231232; - int b = 1232; - - for(unsigned int i = 0; i < 4; i++) - { - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - - // Robert jenkins' 96 bit mix - // sucks - if (0) { - int c = n; - int a = 12378912; - int b = 2982827; - a=a-b; a=a-c; a=a^(c>>13); - b=b-c; b=b-a; b=b^(a<<8); - c=c-a; c=c-b; c=c^(b>>13); - a=a-b; a=a-c; a=a^(c>>12); - b=b-c; b=b-a; b=b^(a<<16); - c=c-a; c=c-b; c=c^(b>>5); - a=a-b; a=a-c; a=a^(c>>3); - b=b-c; b=b-a; b=b^(a<<10); - c=c-a; c=c-b; c=c^(b>>15); - return c; - } - // robert jenkins 32-bit - // sucks - if (0) { - n += (n << 12); - n ^= 
(n >> 22); - n += (n << 4); - n ^= (n >> 9); - n += (n << 10); - n ^= (n >> 2); - n += (n << 7); - n ^= (n >> 12); - return n; - } - - // djb2 - if (0) { - unsigned int hash = 5381; - for (int i=0; i<4; i++) { - hash = ((hash << 5) + hash) + ((n&255) ^ 123); - n = n >> 8; - } - return hash; - } - - - // SDBM - if (1) { - unsigned int hash = 0; - - for(unsigned int i = 0; i < 4; i++) - { - hash = (n&255) + (hash << 6) + (hash << 16) - hash; - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - // PJW - // horrid - if (0) { - unsigned int BitsInUnsignedInt = (unsigned int)(sizeof(unsigned int) * 8); - unsigned int ThreeQuarters = (unsigned int)((BitsInUnsignedInt * 3) / 4); - unsigned int OneEighth = (unsigned int)(BitsInUnsignedInt / 8); - unsigned int HighBits = (unsigned int)(0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth); - unsigned int hash = 0; - unsigned int test = 0; - - for(unsigned int i = 0; i < 4; i++) - { - hash = (hash << OneEighth) + (n&255); - - if((test = hash & HighBits) != 0) - { - hash = (( hash ^ (test >> ThreeQuarters)) & (~HighBits)); - } - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - // RS Hash function, from Robert Sedgwicks Algorithms in C book, w/ some changes. - if (0) { - unsigned int b = 378551; - unsigned int a = 63689; - unsigned int hash = 0; - - for(unsigned int i=0; i<4; i++) - { - hash = hash * a + (n&0xff); - a = a * b; - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - // DJB - // worse than rs - if (0) { - unsigned int hash = 5381; - - for(unsigned int i = 0; i < 4; i++) - { - hash = ((hash << 5) + hash) + (n&255); - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - // AP - // even worse - if (1) { - unsigned int hash = 0; - - for(unsigned int i = 0; i < 4; i++) - { - hash ^= ((i & 1) == 0) ? 
( (hash << 7) ^ (n&255) ^ (hash >> 3)) : - (~((hash << 11) ^ (n&255) ^ (hash >> 5))); - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - -#endif diff --git a/branches/sage/mds/crush/crush.h b/branches/sage/mds/crush/crush.h deleted file mode 100644 index 376e7d9b3fc86..0000000000000 --- a/branches/sage/mds/crush/crush.h +++ /dev/null @@ -1,543 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __crush_CRUSH_H -#define __crush_CRUSH_H - -#include -#include -#include -#include -#include -using std::set; -using std::map; -using std::vector; -using std::list; -#include -#include -using namespace __gnu_cxx; - - -#include "Bucket.h" - -#include "include/buffer.h" - - -namespace crush { - - - // *** RULES *** - - class RuleStep { - public: - int cmd; - vector args; - - RuleStep(int c) : cmd(c) {} - RuleStep(int c, int a) : cmd(c) { - args.push_back(a); - } - RuleStep(int c, int a, int b) : cmd(c) { - args.push_back(a); - args.push_back(b); - } - RuleStep(int o, int a, int b, int c) : cmd(o) { - args.push_back(a); - args.push_back(b); - args.push_back(c); - } - - void _encode(bufferlist& bl) { - bl.append((char*)&cmd, sizeof(cmd)); - ::_encode(args, bl); - } - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(cmd), (char*)&cmd); - off += sizeof(cmd); - ::_decode(args, bl, off); - } - }; - - - // Rule operations - const int CRUSH_RULE_TAKE = 0; - const int CRUSH_RULE_CHOOSE = 1; // first n by default - const int CRUSH_RULE_CHOOSE_FIRSTN = 1; - const int CRUSH_RULE_CHOOSE_INDEP = 2; - const int CRUSH_RULE_EMIT = 3; - - class Rule { - public: - vector< RuleStep > steps; - - void 
_encode(bufferlist& bl) { - int n = steps.size(); - bl.append((char*)&n, sizeof(n)); - for (int i=0; i buckets; - int bucketno; - Hash h; - - hash_map parent_map; // what bucket each leaf/bucket lives in - - public: - map rules; - - //map collisions; - //map bumps; - - void _encode(bufferlist& bl) { - // buckets - int n = buckets.size(); - bl.append((char*)&n, sizeof(n)); - for (map::const_iterator it = buckets.begin(); - it != buckets.end(); - it++) { - bl.append((char*)&it->first, sizeof(it->first)); - it->second->_encode(bl); - } - bl.append((char*)&bucketno, sizeof(bucketno)); - - // hash - int s = h.get_seed(); - bl.append((char*)&s, sizeof(s)); - - //::_encode(out, bl); - //::_encode(overload, bl); - - // rules - n = rules.size(); - bl.append((char*)&n, sizeof(n)); - for(map::iterator it = rules.begin(); - it != rules.end(); - it++) { - bl.append((char*)&it->first, sizeof(it->first)); - it->second._encode(bl); - } - - } - - void _decode(bufferlist& bl, int& off) { - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i::iterator bp = buckets.begin(); - bp != buckets.end(); - ++bp) { - // index bucket items - vector items; - bp->second->get_items(items); - for (vector::iterator ip = items.begin(); - ip != items.end(); - ++ip) - parent_map[*ip] = bp->first; - } - } - - - - public: - Crush(int seed=123) : bucketno(-1), h(seed) {} - ~Crush() { - // hose buckets - for (map::iterator it = buckets.begin(); - it != buckets.end(); - it++) { - delete it->second; - } - } - - int print(ostream& out, int root, int indent=0) { - for (int i=0; iget_weight() << "\t" << b->get_id() << "\t"; - for (int i=0; iget_bucket_type() << ": "; - - vector items; - b->get_items(items); - - if (buckets.count(items[0])) { - out << std::endl; - for (unsigned i=0; iset_id(n); - buckets[n] = b; - return n; - } - - void add_item(int parent, int item, float w, bool back=false) { - // add item - assert(!buckets[parent]->is_uniform()); - Bucket *p = buckets[parent]; - 
- p->add_item(item, w, back); - - // set item's parent - Bucket *n = buckets[item]; - if (n) - n->set_parent(parent); - - // update weights - while (buckets.count(p->get_parent())) { - int child = p->get_id(); - p = buckets[p->get_parent()]; - p->adjust_item_weight(child, w); - } - } - - - /* - this is a hack, fix me! weights should be consistent throughout hierarchy! - - */ - void set_bucket_weight(int item, float w) { - Bucket *b = buckets[item]; - float adj = w - b->get_weight(); - - while (buckets.count(b->get_parent())) { - Bucket *p = buckets[b->get_parent()]; - p->adjust_item_weight(b->get_id(), adj); - b = p; - } - } - - - /* - * choose numrep distinct items of type type - */ - void choose(int x, - int numrep, - int type, - Bucket *inbucket, - vector& outvec, - bool firstn, - set& outset, map& overloadmap, - bool forcefeed=false, - int forcefeedval=-1) { - int off = outvec.size(); - - // for each replica - for (int rep=0; repis_uniform()) { - // uniform bucket; be careful! - if (firstn || numrep >= in->get_size()) { - // uniform bucket is too small; just walk thru elements - r += ftotal; // r' = r + f_total (first n) - } else { - // make sure numrep is not a multple of bucket size - int add = numrep*flocal; // r' = r + n*f_local - if (in->get_size() % numrep == 0) { - add += add/in->get_size(); // shift seq once per pass through the bucket - } - r += add; - } - } else { - // mixed bucket; just make a distinct-ish r sequence - if (firstn) - r += ftotal; // r' = r + f_total - else - r += numrep * flocal; // r' = r + n*f_local - } - - // choose - outv = in->choose_r(x, r, h); - - // did we get the type we want? - int itemtype = 0; // 0 is terminal type - Bucket *newin = 0; // remember bucket we hit - if (in->is_uniform()) { - itemtype = ((UniformBucket*)in)->get_item_type(); - } else { - if (buckets.count(outv)) { // another bucket - newin = buckets[outv]; - itemtype = newin->get_type(); - } - } - if (itemtype == type) { // this is what we want! 
- // collision? - bool collide = false; - for (int prep=0; prep overloadmap[outv]) - bad = true; - } - - if (collide || bad) { - ftotal++; - flocal++; - - if (collide && flocal < 3) - continue; // try locally a few times! - - if (ftotal >= 10) { - // ok fine, just ignore dup. FIXME. - skip_rep = true; - break; - } - - retry_rep = true; - } - - break; // ok then! - } - - // next - in = newin; - } - - if (retry_rep) continue; // try again - - break; - } - - // skip this rep? (e.g. too many collisions, we give up) - if (skip_rep) continue; - - // output this value - outvec.push_back(outv); - } // for rep - - // double check! - if (0) { - for (unsigned i=1; i& result, - set& outset, map& overloadmap, - int forcefeed=-1) { - //int numresult = 0; - result.clear(); - - // determine hierarchical context for forcefeed (if any) - list force_stack; - if (forcefeed >= 0 && parent_map.count(forcefeed)) { - int t = forcefeed; - while (1) { - force_stack.push_front(t); - //cout << "push " << t << " onto force_stack" << std::endl; - if (parent_map.count(t) == 0) break; // reached root, presumably. - //cout << " " << t << " parent is " << parent_map[t] << std::endl; - t = parent_map[t]; - } - } - - // working vector - vector w; // working variable - - // go through each statement - for (vector::iterator pc = rule.steps.begin(); - pc != rule.steps.end(); - pc++) { - // move input? 
- - // do it - switch (pc->cmd) { - case CRUSH_RULE_TAKE: - { - const int arg = pc->args[0]; - //cout << "take " << arg << std::endl; - - if (!force_stack.empty()) { - assert(force_stack.front() == arg); - force_stack.pop_front(); - } - - w.clear(); - w.push_back(arg); - } - break; - - case CRUSH_RULE_CHOOSE_FIRSTN: - case CRUSH_RULE_CHOOSE_INDEP: - { - const bool firstn = pc->cmd == CRUSH_RULE_CHOOSE_FIRSTN; - const int numrep = pc->args[0]; - const int type = pc->args[1]; - - //cout << "choose " << numrep << " of type " << type << std::endl; - - assert(!w.empty()); - - // reset output - vector out; - - // forcefeeding? - bool forcing = false; - int forceval = -1; - if (!force_stack.empty()) { - forceval = force_stack.front(); - force_stack.pop_front(); - //cout << "priming out with " << forceval << std::endl; - forcing = true; - } else if (forcefeed >= 0 && type == 0) { - //cout << "forcing context-less " << forcefeed << std::endl; - forceval = forcefeed; - forcefeed = -1; - forcing = true; - } - - // do each row independently - for (vector::iterator i = w.begin(); - i != w.end(); - i++) { - assert(buckets.count(*i)); - Bucket *b = buckets[*i]; - choose(x, numrep, type, b, out, firstn, - outset, overloadmap, - forcing, - forceval); - forcing = false; // only once - } // for inrow - - // put back into w - w.swap(out); - out.clear(); - } - break; - - case CRUSH_RULE_EMIT: - { - for (unsigned i=0; i - -#include -#include -using namespace std; - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int n, float f, int buckettype) -{ - Hash 
h(73232313); - - // crush - Crush c; - - int ndisks = 0; - - // bucket - Bucket *b; - if (buckettype == 0) - b = new TreeBucket(1); - else if (buckettype == 1 || buckettype == 2) - b = new ListBucket(1); - else if (buckettype == 3) - b = new StrawBucket(1); - else if (buckettype == 4) - b = new UniformBucket(0,0); - - for (int i=0; iadd_item(ndisks++,1); - - c.add_bucket(b); - int root = b->get_id(); - - //c.print(cout,root); - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 1000; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - - // ORIGINAL - place(c, rule, numpg, numrep, placement1); - - int olddisks = ndisks; - - // add item - if (buckettype == 2) { - // start over! 
- ndisks = 0; - b = new ListBucket(1); - for (int i=0; i<=n; i++) - b->add_item(ndisks++,1); - c.add_bucket(b); - root = b->get_id(); - - rule.steps.clear(); - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - } - else - b->add_item(ndisks++, 1); - - - // ADDED - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; x<=numpg; x++) - if (placement1[x] != placement2[x]) - for (int j=0; j - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - //Bucket *b = new MixedBucket(h+1); - Bucket *b = new StrawBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -float go(int dep) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - if (0) { - for (int d=0; dadd_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 10000, disks); - Hash h(123); - b->make_primes(h); - root = c.add_bucket(b); - } - - - - // rule - int numrep = 1; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = 
pg_per*ndisks/numrep; - - vector ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 100000; - int times = place / numpg; - if (!times) times = 1; - - cout << "#looping " << times << " times" << endl; - - float tvar = 0; - int tvarnum = 0; - - int x = 0; - for (int t=0; t v(numrep); - - for (int z=0; z - -#include -#include -using namespace std; - -int buckettype = 0; - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - //Bucket *b = new TreeBucket(h+1); - //Bucket *b = new ListBucket(h+1); - //Bucket *b = new StrawBucket(h+1); - Bucket *b; - if (buckettype == 0) - b = new TreeBucket(h+1); - else if (buckettype == 1 || buckettype == 2) - b = new ListBucket(h+1); - else if (buckettype == 3) - b = new StrawBucket(h+1); - - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != 
ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks, int add, int modifydepth) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); - for (int d=1; d > buckets; - - root = make_hierarchy(c, wid, buckets, ndisks); - - //c.print(cout,root); - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - - // ORIGINAL - place(c, rule, numpg, numrep, placement1); - - int olddisks = ndisks; - - // add disks - //cout << " adding " << add << " disks" << endl; - vector disks; - for (int i=0; imake_primes(h); - - //Bucket *o = buckets[2].back(); - Bucket *o; - if (buckettype == 2) - o = buckets[modifydepth].front(); - else - o = buckets[modifydepth].back(); - - c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - c.add_item(o->get_id(), b->get_id(), b->get_weight()); - //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); - //newbucket = b; - - - // ADDED - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; x<=numpg; x++) - if (placement1[x] != placement2[x]) - for (int j=0; j - -#include -#include -using namespace std; - - -int buckettype = 2; // 0 = mixed, 1 = linear, 2 = straw - -int big_one_skip = 255; -int big_one_size; -Bucket *big_one = 0; - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, 
int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - - int s = wid[h]; - if (big_one_skip > 0) - big_one_skip--; - if (!big_one_skip && !big_one) - s = big_one_size; - - - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks " << disks.size()<< endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - Bucket *b; - if (buckettype == 0) - b = new TreeBucket(h+1); - else if (buckettype == 1) - b = new ListBucket(h+1); - else if (buckettype == 2) - b = new StrawBucket(h+1); - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks, int add) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); - for (int d=1; d > buckets; - - big_one_size = add; - big_one = 0; - - //cout << "making tree" << endl; - root = make_hierarchy(c, wid, buckets, ndisks); - - //c.print(cout, root); - - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - 
rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - int olddisks = ndisks; - - - place(c, rule, numpg, numrep, placement1); - - if (1) { - // remove disks - assert(big_one); - c.adjust_item(big_one->get_id(), 0); - } - - int newdisks = ndisks - add; - - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; x<=numpg; x++) - if (placement1[x] != placement2[x]) - for (int j=0; j >::iterator i = r.begin(); - i != r.end(); - i++) { - cout << i->first; - for (map::iterator j = i->second.begin(); - j != i->second.end(); - j++) - cout << "\t" << j->first << "\t" << j->second; - cout << endl; - } - */ -} - diff --git a/branches/sage/mds/crush/test/cluster_movement_rush.cc b/branches/sage/mds/crush/test/cluster_movement_rush.cc deleted file mode 100644 index 90cc197c24f65..0000000000000 --- a/branches/sage/mds/crush/test/cluster_movement_rush.cc +++ /dev/null @@ -1,218 +0,0 @@ - - -#include "../crush.h" -using namespace crush; - -#include - -#include -#include -using namespace std; - -int buckettype = 0; - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - //Bucket *b = new TreeBucket(h+1); - //Bucket *b = new ListBucket(h+1); - //Bucket *b = new StrawBucket(h+1); - Bucket *b; - if (buckettype == 0) - b = new TreeBucket(h+1); 
- else if (buckettype == 1 || buckettype == 2) - b = new ListBucket(h+1); - else if (buckettype == 3) - b = new StrawBucket(h+1); - - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks, int add, int modifydepth) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); - for (int d=1; d > buckets; - - root = make_hierarchy(c, wid, buckets, ndisks); - - //c.print(cout,root); - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - - // ORIGINAL - place(c, rule, numpg, numrep, placement1); - - int olddisks = 
ndisks; - - // add disks - //cout << " adding " << add << " disks" << endl; - vector disks; - for (int i=0; imake_primes(h); - - //Bucket *o = buckets[2].back(); - Bucket *o; - if (buckettype == 2) - o = buckets[modifydepth].front(); - else - o = buckets[modifydepth].back(); - - c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - c.add_item(o->get_id(), b->get_id(), b->get_weight(), buckettype == 2); - //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); - //newbucket = b; - - - // ADDED - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; x<=numpg; x++) - if (placement1[x] != placement2[x]) - for (int j=0; j - -#include -#include -using namespace std; - - -Clock g_clock; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks weight " << w << endl; - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - - -float go(int dep, int failpc) -{ - Hash h(73232313); - - //int overloadcutoff = (int)((float)10000.0 / (float)utilization); - - //cout << "util " << utilization << " cutoff " << overloadcutoff << endl; - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - for (int d=0; d ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 1000000; - int times = place / numpg; - if (!times) times = 1; - - 
- //cout << "looping " << times << " times" << endl; - - float tavg[10]; - float tvar[10]; - for (int j=0;j<10;j++) { - tvar[j] = 0; - tavg[j] = 0; - } - int tvarnum = 0; - float trvar = 0.0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - float aslowdown = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.out.clear(); - - for (int z=0; z= ndisks) cout << "v[i] " << i << " is " << v[i] << " .. x = " << x << endl; - //assert(v[i] < ndisks); - ocount[v[i]]++; - } - } - utime_t t1b = g_clock.now(); - - // add in numf failed disks - for (int f = 0; f < numf; f++) { - int d = rand() % ndisks; - while (c.out.count(d)) d = rand() % ndisks; - c.out.insert(d); - } - - utime_t t3a = g_clock.now(); - for (int x=xs; x - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - MixedBucket *b = new MixedBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -Bucket *make_random(Crush& c, int wid, int height, int& ndisks) -{ - int w = rand() % (wid-1) + 2; - - if (height == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - int h = rand() % height + 1; - MixedBucket *b = new MixedBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; 
- return b; - } - -} - - -float go(int dep, int overloadcutoff) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - for (int d=0; dget_id(); - //c.print(cout, root); - } - if (0) { - MixedBucket *b = new MixedBucket(1); - for (int i=0; i<10000; i++) - b->add_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 10000, disks); - Hash h(123); - b->make_primes(h); - root = c.add_bucket(b); - } - //cout << ndisks << " disks" << endl; - - - - // rule - int numrep = 1; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 1000000; - int times = place / numpg; - if (!times) times = 1; - - - //cout << "looping " << times << " times" << endl; - - float tvar = 0; - int tvarnum = 0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.overload.clear(); - - for (int z=0; z overloadcutoff) - overloaded++; - - if (ocount[i] > 100+(overloadcutoff-100)/2) { - adjusted++; - c.overload[i] = 100.0 / (float)ocount[i]; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - //cout << overloaded << " overloaded" << endl; - overloadsum += (float)overloaded / (float)ndisks; - adjustsum += (float)adjusted / (float)ndisks; - - - for (int x=xs; x overloadcutoff) { - still++; - //c.overload[ocount[i]] = 100.0 / (float)ocount[i]; - //cout << 
"disk " << i << " has " << ocount[i] << endl; - } - } - //if (still) cout << "overload was " << overloaded << " now " << still << endl; - afteroverloadsum += (float)still / (float)ndisks; - - //cout << "collisions: " << c.collisions << endl; - //cout << "r bumps: " << c.bumps << endl; - - float avg = 0.0; - for (int i=0; i100; d -= 5) { - float var = go(3,d); - //cout << "## depth = " << d << endl; - //cout << d << "\t" << var << endl; - } -} diff --git a/branches/sage/mds/crush/test/depth_variance.cc b/branches/sage/mds/crush/test/depth_variance.cc deleted file mode 100644 index 7d60ebaae9501..0000000000000 --- a/branches/sage/mds/crush/test/depth_variance.cc +++ /dev/null @@ -1,185 +0,0 @@ - - -#include "../crush.h" -using namespace crush; - -#include - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -float go(int dep) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - if (1) { - for (int d=0; d ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 100000; - int times = place / numpg; - if (!times) times = 1; - - cout << "#looping " << times << " times" << endl; - - float tvar = 0; - int tvarnum = 0; - float tavg = 0; - - int x 
= 0; - for (int t=0; t v(numrep); - - for (int z=0; z - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks weight " << w << endl; - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - //Bucket *b = new StrawBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - - -float go(int dep, int overloadcutoff) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - for (int d=0; d ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 100000; - int times = place / numpg; - if (!times) times = 1; - - - //cout << "looping " << times << " times" << endl; - - float tavg[10]; - float tvar[10]; - for (int j=0;j<10;j++) { - tvar[j] = 0; - tavg[j] = 0; - } - int tvarnum = 0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.overload.clear(); - - for (int z=0; z cutoff) - overloaded++; - - if (ocount[i] > adjoff) { - adjusted++; - c.overload[i] = (float)target / (float)ocount[i]; - //cout << "setting overload " << i << " to " << c.overload[i] << endl; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - //cout << overloaded << " overloaded" << endl; - overloadsum += (float)overloaded / (float)ndisks; - adjustsum += 
(float)adjusted / (float)ndisks; - - - - if (1) { - // second pass - for (int x=xs; x= adjoff) { - adjusted++; - if (c.overload.count(i) == 0) { - c.overload[i] = 1.0; - adjusted++; - } - //else cout << "(re)adjusting " << i << endl; - c.overload[i] *= (float)target / (float)ocount[i]; - //cout << "setting overload " << i << " to " << c.overload[i] << endl; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - } - - for (int x=xs; x cutoff) { - still++; - //c.overload[ocount[i]] = 100.0 / (float)ocount[i]; - if (c.overload.count(i)) cout << "[adjusted] "; - cout << "disk " << i << " has " << ocount[i] << endl; - } - } - //if (still) cout << "overload was " << overloaded << " now " << still << endl; - afteroverloadsum += (float)still / (float)ndisks; - - //cout << "collisions: " << c.collisions << endl; - //cout << "r bumps: " << c.bumps << endl; - - int n = ndisks/10; - float avg[10]; - float var[10]; - for (int i=0;i<10;i++) { - int s = n*i; - avg[i] = 0.0; - for (int j=0; j100; d -= 5) { - float var = go(3,d); - //cout << "## depth = " << d << endl; - //cout << d << "\t" << var << endl; - } -} diff --git a/branches/sage/mds/crush/test/movement.cc b/branches/sage/mds/crush/test/movement.cc deleted file mode 100644 index 2621f09457fe6..0000000000000 --- a/branches/sage/mds/crush/test/movement.cc +++ /dev/null @@ -1,223 +0,0 @@ - - -#include "../crush.h" -using namespace crush; - -#include - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - MixedBucket *b = new MixedBucket(h+1); - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - 
buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); - for (int d=1; d > buckets; - - if (1) { - root = make_hierarchy(c, wid, buckets, ndisks); - } - if (0) { - MixedBucket *b = new MixedBucket(1); - for (int i=0; i<10000; i++) - b->add_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 1, disks); - Hash h(123); - b->make_primes(h); - root = c.add_bucket(b); - } - - - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - place(c, 
rule, numpg, numrep, placement1); - - if (1) { - // failed - - //for (int i=500; i<1000; i++) - //c.failed.insert(i); - c.failed.insert(0); - } - - int olddisks = ndisks; - - if (1) { - int n = udisks; - //cout << " adding " << n << " disks" << endl; - vector disks; - for (int i=0; imake_primes(h); - Bucket *o = buckets[1].back(); - c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - c.add_item(o->get_id(), b->get_id(), b->get_weight()); - //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); - } - - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; x<=numpg; x++) { - if (placement1[x] != placement2[x]) { - for (int j=0; j v; - cout << depth; - for (int branching = 3; branching < 16; branching += 1) { - float fac = testmovement(depth, branching, udisks); - v.push_back(fac); - int n = udisks * pow((float)branching, (float)depth-1); - cout << "\t" << n; - cout << "\t" << fac; - } - //for (int i=0; i - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - MixedBucket *b = new MixedBucket(h+1); - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - 
//cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); - for (int d=1; d > buckets; - - if (1) { - root = make_hierarchy(c, wid, buckets, ndisks); - } - if (0) { - MixedBucket *b = new MixedBucket(1); - for (int i=0; i<10000; i++) - b->add_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 1, disks); - Hash h(123); - b->make_primes(h); - root = c.add_bucket(b); - } - - - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - place(c, rule, numpg, numrep, placement1); - - float over = .5; - - if (1) { - // failed - - //for (int i=500; i<1000; i++) - //c.failed.insert(i); - //c.failed.insert(0); - c.overload[0] = over; - } - - int olddisks = ndisks; - - - - if (0) { - int n = udisks; - //cout << " adding " << n << " disks" << endl; - vector disks; - for (int i=0; imake_primes(h); - Bucket *o = buckets[1].back(); - c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - 
c.add_item(o->get_id(), b->get_id(), b->get_weight()); - //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); - } - - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - vector moved(ndisks); - - //int moved = 0; - for (int d=0; d::iterator it = placement1[d].begin(); - it != placement1[d].end(); - it++) { - placement2[d].erase(*it); - } - } - - float avg = 0; - for (int d=0; d v; - cout << depth; - for (int branching = 3; branching < 16; branching += 1) { - float fac = testmovement(depth, branching, udisks); - v.push_back(fac); - int n = udisks * pow((float)branching, (float)depth-1); - //cout << "\t" << n; - //cout << "\t" << fac; - } - //for (int i=0; i - -#include -#include -using namespace std; - - -Clock g_clock; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks weight " << w << endl; - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - - -float go(int dep, int utilization ) -{ - Hash h(73232313); - - int overloadcutoff = (int)((float)10000.0 / (float)utilization); - - //cout << "util " << utilization << " cutoff " << overloadcutoff << endl; - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - for (int d=0; d ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 100000; - int times = place / numpg; - if (!times) times = 1; - - - 
//cout << "looping " << times << " times" << endl; - - float tavg[10]; - float tvar[10]; - for (int j=0;j<10;j++) { - tvar[j] = 0; - tavg[j] = 0; - } - int tvarnum = 0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - float aslowdown = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.overload.clear(); - - for (int z=0; z cutoff) - overloaded++; - - if (ocount[i] > adjoff) { - adjusted++; - c.overload[i] = (float)target / (float)ocount[i]; - //cout << "setting overload " << i << " to " << c.overload[i] << endl; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - //cout << overloaded << " overloaded" << endl; - overloadsum += (float)overloaded / (float)ndisks; - adjustsum += (float)adjusted / (float)ndisks; - - - - // keep adjusting! - for (int bla=0; bla<5; bla++) { - utime_t t2a = g_clock.now(); - - // second pass - for (int x=xs; x= adjoff) { - numover++; - if (c.overload.count(i) == 0) { - c.overload[i] = 1.0; - adjusted++; - } - //else cout << "(re)adjusting " << i << endl; - c.overload[i] *= (float)target / (float)ocount[i]; - //cout << "setting overload " << i << " to " << c.overload[i] << endl; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - if (!numover) break; - cout << "readjusting" << endl; - } - - utime_t t3a = g_clock.now(); - - for (int x=xs; x cutoff) { - still++; - //c.overload[ocount[i]] = 100.0 / (float)ocount[i]; - if (c.overload.count(i)) cout << "[adjusted] "; - cout << "disk " << i << " has " << ocount[i] << endl; - } - } - //if (still) cout << "overload was " << overloaded << " now " << still << endl; - afteroverloadsum += (float)still / (float)ndisks; - - //cout << "collisions: " << c.collisions << endl; - //cout << "r bumps: " << c.bumps << endl; - - int n = ndisks/10; - float avg[10]; - float var[10]; - for (int i=0;i<10;i++) { - int s = n*i; - avg[i] = 0.0; - for (int j=0; j - -#include -#include -using 
namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - MixedBucket *b = new MixedBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -Bucket *make_random(Crush& c, int wid, int height, int& ndisks) -{ - int w = rand() % (wid-1) + 2; - - if (height == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - int h = rand() % height + 1; - MixedBucket *b = new MixedBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } - -} - - -float go(int dep, int overloadcutoff) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - for (int d=0; dget_id(); - //c.print(cout, root); - } - if (0) { - MixedBucket *b = new MixedBucket(1); - for (int i=0; i<10000; i++) - b->add_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 10000, disks); - Hash h(123); - b->make_primes(h); - root = c.add_bucket(b); - } - //cout << ndisks << " disks" << endl; - - - - // rule - int numrep = 1; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - 
rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 1000000; - int times = place / numpg; - if (!times) times = 1; - - - //cout << "looping " << times << " times" << endl; - - float tvar = 0; - int tvarnum = 0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.overload.clear(); - - for (int z=0; z overloadcutoff) - overloaded++; - - if (ocount[i] > 100+(overloadcutoff-100)/2) { - adjusted++; - c.overload[i] = 100.0 / (float)ocount[i]; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - //cout << overloaded << " overloaded" << endl; - overloadsum += (float)overloaded / (float)ndisks; - adjustsum += (float)adjusted / (float)ndisks; - - - for (int x=xs; x overloadcutoff) { - still++; - //c.overload[ocount[i]] = 100.0 / (float)ocount[i]; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - } - //if (still) cout << "overload was " << overloaded << " now " << still << endl; - afteroverloadsum += (float)still / (float)ndisks; - - //cout << "collisions: " << c.collisions << endl; - //cout << "r bumps: " << c.bumps << endl; - - float avg = 0.0; - for (int i=0; i100; d -= 5) { - float var = go(3,d); - //cout << "## depth = " << d << endl; - //cout << d << "\t" << var << endl; - } -} diff --git a/branches/sage/mds/crush/test/sizes.cc b/branches/sage/mds/crush/test/sizes.cc deleted file mode 100644 index cc5780218210a..0000000000000 --- a/branches/sage/mds/crush/test/sizes.cc +++ /dev/null @@ -1,131 +0,0 @@ - -#include "include/types.h" -#include "include/Distribution.h" -#include "osd/OSDMap.h" - - -Distribution file_size_distn; 
//kb - - -list object_queue; -int max_object_size = 1024*1024*100; //kb - -off_t no; - -int get_object() //kb -{ - if (object_queue.empty()) { - int max = file_size_distn.sample(); - no++; - int filesize = max/2 + (rand() % 100) * max/200 + 1; - //cout << "file " << filesize << endl; - while (filesize > max_object_size) { - object_queue.push_back(max_object_size); - filesize -= max_object_size; - } - object_queue.push_back(filesize); - } - int s = object_queue.front(); - object_queue.pop_front(); - //cout << "object " << s << endl; - return s; -} - -void getdist(vector& v, float& avg, float& var) -{ - avg = 0.0; - for (int i=0; i pgs(n); - off_t did = 0; - - no = 0; - while (did < dist) { - off_t s = get_object(); - pgs[rand()%n] += s; - did += s; - } - while (!object_queue.empty()) - pgs[rand()%n] += get_object(); - - numo = no; - //cout << did/n << endl; - - //for (int i=0; i - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, vector& ocount) -{ - vector v(numrep); - //map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << 
v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i wid; - wid.push_back(10); - wid.push_back(2); - - map< int, list > buckets; - root = make_hierarchy(c, wid, buckets, ndisks); - - // add small bucket - vector disks; - for (int i=0; i<3; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 1, disks); - b->make_primes(h); - Bucket *o = buckets[1].back(); - c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - c.add_item(o->get_id(), b->get_id(), b->get_weight()); - - - // rule - int numrep = 6; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - int pg_per = 10000; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - c.print(cout, root); - - place(c, rule, numpg, numrep, ocount); - - for (int i=0; i - -#include -#include -using namespace std; - - -int numrep = 1; - - -double go(int n, int bucket) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - Bucket *b; - vector items; - if (bucket == 0) b = new UniformBucket(1,0,10,items); - if (bucket == 1) b = new TreeBucket(1); - if (bucket == 2) b = new ListBucket(1); - if (bucket == 3) b = new StrawBucket(1); - - for (int d=0; dadd_item(ndisks++, 1); - - //if (!bucket) ((UniformBucket*)b)->make_primes(h); - - root = c.add_bucket(b); - - // rule - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - - int place = 1000000; - - - vector v(numrep); - set out; - map overload; - - utime_t start = g_clock.now(); - - for (int x=1; x <= place; x++) - c.do_rule(rule, x, v, out, overload); - - utime_t end = g_clock.now(); - - end -= start; - double el = (double)end; - - //cout << "\t" << ndisks; - - 
return el; -} - - -int main() -{ - - for (int n=4; n<=50; n += 4) { - cout << n; - for (int b=0; b<4; b++) { - double el = go(n,b); - cout << "\t" << el; - } - cout << endl; - } -} diff --git a/branches/sage/mds/crush/test/speed_depth.cc b/branches/sage/mds/crush/test/speed_depth.cc deleted file mode 100644 index 32275d16d2b31..0000000000000 --- a/branches/sage/mds/crush/test/speed_depth.cc +++ /dev/null @@ -1,174 +0,0 @@ - -#include "../../common/Clock.h" -#include "../crush.h" -using namespace crush; - - -Clock g_clock; - -#include - -#include -#include -using namespace std; - - -int uniform = 10; -int branching = 10; -int buckettype = 0; -int numrep = 1; - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - Bucket *b; - if (buckettype == 0) - b = new TreeBucket(h+1); - else if (buckettype == 1 || buckettype == 2) - b = new ListBucket(h+1); - else if (buckettype == 3) - b = new StrawBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -double go(int dep, int per) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - if (1) { - wid.push_back(uniform); - for (int d=1; d v(numrep); - - utime_t start = g_clock.now(); - - set out; - map overload; - - for (int x=1; x <= place; x++) - c.do_rule(rule, x, v, out, overload); - - utime_t end = g_clock.now(); - - end -= start; - double el = (double)end; - - //cout << "\t" << ndisks; - - return el; -} - - -int main() -{ - uniform = branching = 8; - - cout << "// 
dep\tuniform\tbranch\tndisks" << endl; - - for (int d=2; d<=5; d++) { - cout << d;// << "\t" << branching; - cout << "\t" << uniform; - cout << "\t" << branching; - - int n = 1; - for (int i=0; i - -#include -#include -using namespace std; - - -int branching = 10; -bool linear = false; -int numrep = 1; - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - Bucket *b; - if (linear) - b = new ListBucket(h+1); - else - b = new TreeBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -double go(int s) -{ - int dep = 2; - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - if (1) { - //for (int d=0; d v(numrep); - - utime_t start = g_clock.now(); - - for (int x=1; x <= place; x++) - c.do_rule(rule, x, v); - - utime_t end = g_clock.now(); - - end -= start; - double el = (double)end; - - cout << "\t" << ndisks; - - return el; -} - - -int main() -{ - branching = 8; - - int d = 2; - numrep = 2; - - for (int s = 64; s <= 32768; s *= 8) { - cout << "t"; - linear = false; - double el = go(s, d); - cout << "\t" << el; - - cout << "\tp"; - linear = true; - el = go(s, d); - cout << "\t" << el; - - cout << endl; - } -} diff --git a/branches/sage/mds/crush/test/t.cc b/branches/sage/mds/crush/test/t.cc deleted file mode 100644 index 0785ef47d6c04..0000000000000 --- a/branches/sage/mds/crush/test/t.cc +++ /dev/null @@ -1,25 +0,0 @@ - -#include "../../common/Clock.h" -#include "../crush.h" -using namespace crush; - - -Clock g_clock; 
- -#include - -#include -#include -using namespace std; - - -int branching = 10; -bool linear = false; -int numrep = 1; - -int main() { - - Bucket *b = new UniformBucket(1, 0); - //b = new TreeBucket(1); -} - diff --git a/branches/sage/mds/crush/test/testbucket.cc b/branches/sage/mds/crush/test/testbucket.cc deleted file mode 100644 index 065721c2c1967..0000000000000 --- a/branches/sage/mds/crush/test/testbucket.cc +++ /dev/null @@ -1,61 +0,0 @@ - - -#include "../Bucket.h" -using namespace crush; - -#include -#include -using namespace std; - - -ostream& operator<<(ostream& out, vector& v) -{ - out << "["; - for (int i=0; i ocount(ndisks); - - vector v(numrep); - int nplace = 0; - for (int x=1; x<1000000; x++) { - //cout << H(x) << "\t" << h(x) << endl; - for (int i=0; i -#include -using namespace std; - - -void getdist(vector& v, float& avg, float& var) -{ - avg = 0.0; - for (int i=0; i a(n); - vector b(n); - - for (int i=0; i c(n); - for (int i=0; i .depend 2>/dev/null - -include .depend diff --git a/branches/sage/mds/crush2/buckets.c b/branches/sage/mds/crush2/buckets.c deleted file mode 100644 index 2a2e170bbbb6c..0000000000000 --- a/branches/sage/mds/crush2/buckets.c +++ /dev/null @@ -1,56 +0,0 @@ - -#include "hash.h" -#include "buckets.h" - -int -crush_bucket_uniform_choose(struct crush_bucket_uniform *bucket, int x, int r) -{ - unsigned o, p, s; - o = crush_hash32_2(x, bucket->h.id); - p = bucket->primes[crush_hash32_2(bucket->h.id, x) % bucket->h.size]; - s = (x + o + (r+1)*p) % bucket->h.size; - return bucket->h.items[s]; -} - -int -crush_bucket_list_choose(struct crush_bucket_list *bucket, int x, int r) -{ - int i; - __u64 w; - - for (i=0; ih.size; i++) { - w = crush_hash32_4(x, bucket->h.items[i], r, bucket->h.id) & 0xffff; - w = (w * bucket->sum_weights[i]) >> 32; - if (w < bucket->item_weights[i]) - return bucket->h.items[i]; - } - - BUG_ON(1); - return 0; -} - -int -crush_bucket_tree_choose(struct crush_bucket_tree *bucket, int x, int r) -{ - return 0; 
-} - -int -crush_bucket_straw_choose(struct crush_bucket_straw *bucket, int x, int r) -{ - int i; - int high = 0; - unsigned high_draw = 0; - __u64 draw; - - for (i=0; ih.size; i++) { - draw = (crush_hash32_3(x, bucket->h.items[i], r) & 0xffff) * bucket->straws[i]; - draw = draw >> 32; - if (i == 0 || draw > high_draw) { - high = i; - high_draw = draw; - } - } - - return high; -} diff --git a/branches/sage/mds/crush2/buckets.h b/branches/sage/mds/crush2/buckets.h deleted file mode 100644 index c83d522159ffc..0000000000000 --- a/branches/sage/mds/crush2/buckets.h +++ /dev/null @@ -1,49 +0,0 @@ -#ifndef _CRUSH_BUCKETS_H -#define _CRUSH_BUCKETS_H - -#include "types.h" - -enum { - CRUSH_BUCKET_UNIFORM = 1, - CRUSH_BUCKET_LIST = 2, - CRUSH_BUCKET_TREE = 3, - CRUSH_BUCKET_STRAW = 4 -}; - -struct crush_bucket { - __u32 id; - __u32 type; - __u32 weight; /* 16-bit fixed point */ - __u32 size; /* num items */ - __s32 *items; -}; - -struct crush_bucket_uniform { - struct crush_bucket h; - __u32 item_weight; /* 16-bit fixed point */ - __u32 item_type; - __u32 *primes; -}; - -struct crush_bucket_list { - struct crush_bucket h; - __u32 *item_weights; /* 16-bit fixed point */ - __u32 *sum_weights; /* 16-bit fixed point */ -}; - -struct crush_bucket_tree { - struct crush_bucket h; - -}; - -struct crush_bucket_straw { - struct crush_bucket h; - __u32 *straws; /* 16-bit fixed point */ -}; - -extern int crush_bucket_uniform_choose(struct crush_bucket_uniform *bucket, int x, int r); -extern int crush_bucket_list_choose(struct crush_bucket_list *bucket, int x, int r); -extern int crush_bucket_tree_choose(struct crush_bucket_tree *bucket, int x, int r); -extern int crush_bucket_straw_choose(struct crush_bucket_straw *bucket, int x, int r); - -#endif diff --git a/branches/sage/mds/crush2/crush.c b/branches/sage/mds/crush2/crush.c deleted file mode 100644 index 3b420c43780d7..0000000000000 --- a/branches/sage/mds/crush2/crush.c +++ /dev/null @@ -1,236 +0,0 @@ - -#include "crush.h" 
-#include "hash.h" - -/* - * choose numrep distinct items of given type - */ -static int crush_choose(struct crush_map *map, - struct crush_bucket *bucket, - int x, int numrep, int type, - int *out, int firstn, - int *outmap) -{ - int rep; - int ftotal, flocal; - int retry_rep, skip_rep; - struct crush_bucket *in = bucket; - int r; - int i; - int item; - int itemtype; - int outpos; - int collide, bad; - - outpos = 0; - - for (rep = 0; rep < numrep; rep++) { - /* keep trying until we get a non-out, non-colliding item */ - ftotal = 0; - skip_rep = 0; - - while (1) { - in = bucket; /* initial bucket */ - - /* choose through intervening buckets */ - flocal = 0; - retry_rep = 0; - - while (1) { - r = rep; - if (in->type == CRUSH_BUCKET_UNIFORM) { - /* be careful */ - if (firstn || numrep >= in->size) { - r += ftotal; /* r' = r + f_total */ - } else { - r += numrep * flocal; /* r' = r + n*f_local */ - /* make sure numrep is not a multiple of bucket size */ - if (in->size % numrep == 0) - /* shift seq once per pass through the bucket */ - r += numrep * flocal / in->size; - } - } else { - if (firstn) - r += ftotal; /* r' = r + f_total */ - else - r += numrep * flocal; /* r' = r + n*f_local */ - } - - /* bucket choose */ - switch (in->type) { - case CRUSH_BUCKET_UNIFORM: - item = crush_bucket_uniform_choose((struct crush_bucket_uniform*)in, x, r); - break; - case CRUSH_BUCKET_LIST: - item = crush_bucket_list_choose((struct crush_bucket_list*)in, x, r); - break; - case CRUSH_BUCKET_TREE: - item = crush_bucket_tree_choose((struct crush_bucket_tree*)in, x, r); - break; - case CRUSH_BUCKET_STRAW: - item = crush_bucket_straw_choose((struct crush_bucket_straw*)in, x, r); - break; - default: - BUG_ON(1); - } - - /* desired type? */ - if (in->type == CRUSH_BUCKET_UNIFORM) - itemtype = ((struct crush_bucket_uniform*)in)->item_type; - else if (item < 0) - itemtype = map->buckets[-item].type; - else - itemtype = 0; - - /* keep going? 
*/ - if (itemtype != type) { - in = &map->buckets[-item]; - continue; - } - - /* collision? */ - collide = 0; - for (i=0; i out[item]) - bad = 1; - } - - if (bad || collide) { - ftotal++; - flocal++; - - if (collide && flocal < 3) - continue; /* locally a few times */ - if (ftotal >= 10) { - /* give up, ignore dup, fixme */ - skip_rep = 1; - break; - } - retry_rep = 1; - } - break; - } - - if (retry_rep) continue; - } - - if (skip_rep) continue; - - out[outpos] = item; - outpos++; - } - - return outpos; -} - - -int crush_do_rule(struct crush_map *map, - int ruleno, - int x, int *result, int result_max, - int *outmap, /* array of size max_devices, values 0...0xffff */ - int forcefeed) /* -1 for none */ -{ - int result_len; - int force_stack[CRUSH_MAX_DEPTH]; - int force_pos = -1; - int a[CRUSH_MAX_SET]; - int b[CRUSH_MAX_SET]; - int *w; - int wsize = 0; - int *o; - int osize; - int *tmp; - struct crush_rule *rule; - int step; - int i; - int numrep; - - rule = &map->rules[ruleno]; - result_len = 0; - w = a; - o = b; - - /* determine hierarchical context of forcefeed, if any */ - if (forcefeed >= 0) { - while (1) { - force_stack[++force_pos] = forcefeed; - if (forcefeed >= 0) - forcefeed = map->device_parent_map[forcefeed]; - else - forcefeed = map->bucket_parent_map[-forcefeed]; - if (forcefeed == 0) break; - } - } - - for (step = 0; step < rule->len; step++) { - switch (rule->steps[step].op) { - case CRUSH_RULE_TAKE: - if (force_pos >= 0) { - w[0] = force_stack[force_pos]; - force_pos--; - BUG_ON(w[0] != rule->steps[step].arg1); - } else { - w[0] = rule->steps[step].arg1; - } - wsize = 1; - break; - - case CRUSH_RULE_CHOOSE_FIRSTN: - case CRUSH_RULE_CHOOSE_INDEP: - BUG_ON(wsize == 0); - - /* reset output */ - osize = 0; - - for (i = 0; i < wsize; i++) { - numrep = rule->steps[step].arg1; - - if (force_pos >= 0) { - o[osize++] = force_stack[force_pos]; - force_pos--; - numrep--; - } - if (numrep) - crush_choose(map, - &map->buckets[-w[i]], - x, numrep, 
rule->steps[step].arg2, - o+osize, rule->steps[step].op == CRUSH_RULE_CHOOSE_FIRSTN, - outmap); - } - - /* swap t and w arrays */ - tmp = o; - o = w; - w = o; - wsize = osize; - break; - - - case CRUSH_RULE_EMIT: - for (i=0; i>13); \ - b=b-c; b=b-a; b=b^(a<<8); \ - c=c-a; c=c-b; c=c^(b>>13); \ - a=a-b; a=a-c; a=a^(c>>12); \ - b=b-c; b=b-a; b=b^(a<<16); \ - c=c-a; c=c-b; c=c^(b>>5); \ - a=a-b; a=a-c; a=a^(c>>3); \ - b=b-c; b=b-a; b=b^(a<<10); \ - c=c-a; c=c-b; c=c^(b>>15); - -#define crush_hash_seed 1315423911 - -static __inline__ unsigned crush_hash32(unsigned a) { - unsigned hash = crush_hash_seed ^ a; - unsigned b = a; - unsigned x = 231232; - unsigned y = 1232; - hashmix(b, x, hash); - hashmix(y, a, hash); - return (hash & 0xFFFFFFFF); -} - -static __inline__ unsigned crush_hash32_2(unsigned a, unsigned b) { - unsigned hash = crush_hash_seed ^ a ^ b; - unsigned x = 231232; - unsigned y = 1232; - hashmix(a, b, hash); - hashmix(x, a, hash); - hashmix(b, y, hash); - return (hash & 0xFFFFFFFF); -} - -static __inline__ unsigned crush_hash32_3(unsigned a, unsigned b, unsigned c) { - unsigned int hash = crush_hash_seed ^ a ^ b ^ c; - unsigned x = 231232; - unsigned y = 1232; - hashmix(a, b, hash); - hashmix(c, x, hash); - hashmix(y, a, hash); - hashmix(b, x, hash); - hashmix(y, c, hash); - return (hash & 0xFFFFFFFF); -} - -static __inline__ unsigned crush_hash32_4(unsigned a, unsigned b, unsigned c, unsigned d) { - unsigned int hash = crush_hash_seed ^a ^ b ^ c ^ d; - unsigned x = 231232; - unsigned y = 1232; - hashmix(a, b, hash); - hashmix(c, d, hash); - hashmix(a, x, hash); - hashmix(y, b, hash); - hashmix(c, x, hash); - hashmix(y, d, hash); - return (hash & 0xFFFFFFFF); -} - -static __inline__ unsigned crush_hash32_5(unsigned a, unsigned b, unsigned c, unsigned d, unsigned e) { - unsigned int hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e; - unsigned x = 231232; - unsigned y = 1232; - hashmix(a, b, hash); - hashmix(c, d, hash); - hashmix(e, x, hash); - hashmix(y, a, 
hash); - hashmix(b, x, hash); - hashmix(y, c, hash); - hashmix(d, x, hash); - hashmix(y, e, hash); - return (hash & 0xFFFFFFFF); -} - -#endif diff --git a/branches/sage/mds/crush2/types.h b/branches/sage/mds/crush2/types.h deleted file mode 100644 index ea682401e146b..0000000000000 --- a/branches/sage/mds/crush2/types.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef _CRUSH_TYPES_H -#define _CRUSH_TYPES_H - -#include /* just for int types */ - -#ifndef BUG_ON -# include -# define BUG_ON(x) assert(!(x)) -#endif - -#endif diff --git a/branches/sage/mds/csyn.cc b/branches/sage/mds/csyn.cc deleted file mode 100644 index 562f00e3f861b..0000000000000 --- a/branches/sage/mds/csyn.cc +++ /dev/null @@ -1,87 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "client/SyntheticClient.h" -#include "client/Client.h" -#include "client/fuse.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - -#ifndef DARWIN -#include -#endif // DARWIN - -#include -#include -#include - -int main(int argc, char **argv, char *envp[]) { - - //cerr << "cfuse starting " << myrank << "/" << world << std::endl; - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - parse_syn_options(args); // for SyntheticClient - - // args for fuse - vec_to_argv(args, argc, argv); - - if (g_conf.clock_tare) g_clock.tare(); - - // load monmap - MonMap monmap; - int r = monmap.read(".ceph_monmap"); - assert(r >= 0); - - // start up network - rank.start_rank(); - - list clients; - list synclients; - - cout << "mounting and starting " << g_conf.num_client << " syn client(s)" << std::endl; - for (int i=0; istart_thread(); - clients.push_back(client); - synclients.push_back(syn); - } - - cout << "waiting for client(s) to finish" << std::endl; - while (!clients.empty()) { - Client *client = clients.front(); - SyntheticClient *syn = synclients.front(); - clients.pop_front(); - synclients.pop_front(); - syn->join_thread(); - delete syn; - delete client; - } - - // wait for messenger to finish - rank.wait(); - - return 0; -} - diff --git a/branches/sage/mds/doc/Commitdir.txt b/branches/sage/mds/doc/Commitdir.txt deleted file mode 100644 index 05c727be60ae6..0000000000000 --- a/branches/sage/mds/doc/Commitdir.txt +++ /dev/null @@ -1,24 +0,0 @@ -OLD - - -How Directory Committing Works: - -Each CDir has: - version - current version of directory - committing_version - which version was sent to stable storage - last_committed_version - last version to be safely stored - -Each Inode has: - parent_dir_version - what dir version i was in when i was dirtied. 
(*) - - (*) note that if you change an inode, mark_dirty() again, even if it's already dirty! - - -How committing works: - -A call to commit_dir(dir, context) will ensure that the _current_ version is stored safely on disk before the context is finished. - -When a commit completes, inodes in the directory are checked. If they are dirty and belonged to the _committed_ (or earlier) version, then they are marked clean. If they belong to a newer version, then they are _still dirty_. - - - diff --git a/branches/sage/mds/doc/anchortable.txt b/branches/sage/mds/doc/anchortable.txt deleted file mode 100644 index d9c0fefc31e08..0000000000000 --- a/branches/sage/mds/doc/anchortable.txt +++ /dev/null @@ -1,54 +0,0 @@ - -ANCHOR TABLE PROTOCOL - -MDS sends an update PREPARE to the anchortable MDS. The prepare is -identified by the ino and operation type; only one for each type -(create, update, destroy) can be pending at any time. Both parties -may actually be the same local node, but for simplicity we treat that -situation the same. (That is, we act as if they may fail -independently, even if they can't.) - -The anchortable journals the proposed update, and responds with an -AGREE and a version number. This uniquely identifies the request. - -The MDS can then update the filesystem metadata however it sees fit. -When it is finished (and the results journaled), it sends a COMMIT to -the anchortable. The table journals the commit, frees any state from -the transaction, and sends an ACK. The initiating MDS should then -journal the ACK to complete the transaction. - - -ANCHOR TABLE FAILURE - -If the AT fails before journaling the PREPARE and sending the AGREE, -the initiating MDS will simply retry the request. - -If the AT fails after journaling PREPARE but before journaling COMMIT, -it will resend AGREE to the initiating MDS. - -If the AT fails after the COMMIT, the transaction has been closed, and it -takes no action.
If it receives a COMMIT for which it has no open -transaction, it will reply with ACK. - - -INITIATING MDS FAILURE - -If the MDS fails before the metadata update has been journaled, no -action is taken, since nothing is known about the previously proposed -transaction. If an AGREE message is received and there is no -corresponding PREPARE or pending-commit state, a ROLLBACK is sent to -the anchor table. - -If the MDS fails after journaling the metadata update but before -journaling the ACK, it resends COMMIT to the anchor table. If it -receives an AGREE after resending the COMMIT, it simply ignores the -AGREE. The anchortable will respond with an ACK, allowing the -initiating MDS to journal the final ACK and close out the transaction -locally. - -On journal replay, each metadata update (EMetaBlob) encountered that -includes an anchor transaction is noted in the AnchorClient by adding -it to the pending_commit list, and each journaled ACK is removed from -that list. Journal replay may encounter ACKs with no prior metadata -update; these are ignored. When recovery finishes, a COMMIT is sent -for all outstanding transactions. diff --git a/branches/sage/mds/doc/bdb.txt b/branches/sage/mds/doc/bdb.txt deleted file mode 100644 index 63e647f5bb3cc..0000000000000 --- a/branches/sage/mds/doc/bdb.txt +++ /dev/null @@ -1,48 +0,0 @@ -OBJECT STORE ON BERKELEY DB ---------------------------- - -OSBDB is an implementation of an object store that uses Berkeley DB as -the underlying storage. It is meant to be an alternative to EBOFS. - -BUILDING --------- - -You will need to have Berkeley DB installed, including the development -packages. We've tested this with Berkeley DB 4.4.20 on Ubuntu 6.10. - -To compile OSBDB support, you need to pass the argument "want_bdb=yes" -to "make." If you don't specify this, OSBDB and all its associated -support is not included in the executables. - -RUNNING -------- - -To use OSBDB in Ceph, simply pass the --bdbstore flag to programs.
You -don't need to create a "device" for OSBDB ahead of time; Berkeley DB -will take care of creating the files. You also *cannot* use a raw -device as your store -- it must be a regular file. - -OSBDB additionally accepts the following flags: - - --bdbstore-btree Configures OSBDB to use the "Btree" - database type for Berkeley DB. The default - database type is "Hash". - - --bdbstore-hash-ffactor Sets the "fill factor" for the hash - database type. Takes an integer argument. - - --bdbstore-hash-nelem Sets the "nelem" parameter for the hash - database type. Takes an integer argument. - - --bdbstore-hash-pagesize Sets the page size for the hash database - type. Takes an integer argument. - - --bdbstore-cachesize Sets the cache size. Takes an integer - argument, which must be a power of two, and - no less than 20 KiB. - - --bdbstore-transactional Enable (in-memory-only) transactions for - all operations in the OSBDB store. - - --debug-bdbstore Set the debug level. Takes an integer - argument. diff --git a/branches/sage/mds/doc/caching.txt b/branches/sage/mds/doc/caching.txt deleted file mode 100644 index 161eaf7428a53..0000000000000 --- a/branches/sage/mds/doc/caching.txt +++ /dev/null @@ -1,303 +0,0 @@ - -SPANNING TREE PROPERTY - -All metadata that exists in the cache is attached directly or -indirectly to the root inode. That is, if the /usr/bin/vi inode is in -the cache, then /usr/bin, /usr, and / are too, including the inodes, -directory objects, and dentries. - - -AUTHORITY - -The authority maintains a list of what nodes cache each inode. -Additionally, each replica is assigned a nonce (initially 0) to -disambiguate multiple replicas of the same item (see below). - - map<int,int> replicas; // maps replicating mds# to nonce - -The cached_by set _always_ includes all nodes that cache the -particular object, but may additionally include nodes that used to -cache it but no longer do. In those cases, an expire message should -be in transit.
That is, we have two invariants: - - 1) the authority's replica set will always include all actual - replicas, and - - 2) cache expiration notices will be reliably delivered to the - authority. - -The second invariant is particularly important because the presence of -replicas will pin the metadata object in memory on the authority, -preventing it from being trimmed from the cache. Notification of -expiration of the replicas is required to allow previously replicated -objects to eventually be trimmed from the cache as well. - -Each metadata object has an authority bit that indicates whether it is -authoritative or a replica. - - -REPLICA NONCE - -Each replicated object maintains a "nonce" value, issued by the -authority at the time the replica was created. If the authority has -already created a replica for the given MDS, the new replica will be -issued a new (incremented) nonce. This nonce is attached -to cache expirations, and allows the authority to disambiguate -expirations when multiple replicas of the same object are created and -cache expiration is coincident with replication. That is, when an -old replica is expired from the replicating MDS at the same time that -a new replica is issued by the authority and the resulting messages -cross paths, the authority can tell that it was the old replica that -was expired and effectively ignore the expiration message. The -replica is removed from the replicas map only if the nonce matches. - - -SUBTREE PARTITION - -Authority of the file system namespace is partitioned using a -subtree-based partitioning strategy. This strategy effectively -separates directory inodes from directory contents, such that the -directory contents are the unit of redelegation. That is, if / is -assigned to mds0 and /usr to mds1, the inode for /usr will be managed -by mds0 (it is part of the / directory), while the contents of /usr -(and everything nested beneath it) will be managed by mds1.
- -The description for this partition exists solely in the collective -memory of the MDS cluster and in the individual MDS journals. It is -not described in the regular on-disk metadata structures. This is -related to the fact that authority delegation is a property of the -{\it directory} and not the directory's {\it inode}. - -Subsequently, if an MDS is authoritative for a directory inode and does -not yet have any state associated with the directory in its cache, -then it can assume that it is also authoritative for the directory. - -Directory state consists of a data object that describes any cached -dentries contained in the directory, information about the -relationship between the cached contents and what appears on disk, and -any delegation of authority. That is, each CDir object has a dir_auth -element. Normally dir_auth has a value of AUTH_PARENT, meaning that -the authority for the directory is the same as the directory's inode. -When dir_auth specifies another metadata server, that directory is a -point of authority delegation and becomes a {\it subtree root}. A -CDir is a subtree root iff its dir_auth specifies an MDS id (and is not -AUTH_PARENT). - - - A dir is a subtree root iff dir_auth != AUTH_PARENT. - - - If dir_auth = AUTH_PARENT then the inode auth == dir auth, but the - converse may not be true. - -The authority for any metadata object in the cache can be determined -by following the parent pointers toward the root until a subtree root -CDir object is reached, at which point the authority is specified by -its dir_auth. - -Each MDS cache maintains a subtree data structure that describes the -subtree partition for all objects currently in the cache: - - map< CDir*, set<CDir*> > subtrees; - - - A dir will appear in the subtree map (as a key) IFF it is a subtree - root. - -Each subtree root will have an entry in the map. The map value is a -set of all other subtree roots nested beneath that point. Nested -subtree roots effectively bound or prune a subtree.
For example, if -we had the following partition: - - mds0 / - mds1 /usr - mds0 /usr/local - mds0 /home - -The subtree map on mds0 would be - - / -> (/usr, /home) - /usr/local -> () - /home -> () - -and on mds1: - - /usr -> (/usr/local) - - -AMBIGUOUS DIR_AUTH - -While metadata for a subtree is being migrated between two MDS nodes, -the dir_auth for the subtree root is allowed to be ambiguous. That -is, it will specify both the old and new MDS ids, indicating that a -migration is in progress. - -If a replicated metadata object is expired from the cache from a -subtree whose authority is ambiguous, the cache expiration is sent to -both potential authorities. This ensures that the message will be -reliably delivered, even if either of those nodes fails. A number of -alternative strategies were considered. Sending the expiration to the -old or new authority and having it forwarded if authority has been -delegated can result in message loss if the forwarding node fails. -Pinning ambiguous metadata in cache is computationally expensive for -implementation reasons, and delaying the transmission of expiration -messages is difficult to implement because the replicating node must send -the final expiration messages when the subtree authority is -disambiguated, forcing it to keep certain elements of its cache in -memory. Although duplicated expirations incur a small communications -overhead, the implementation is much simpler. - - -AUTH PINS - -Most operations that modify metadata must allow some amount of time to -pass in order for the operation to be journaled or for communication -to take place between the object's authority and any replicas. For -this reason it must not only be pinned in the authority's metadata -cache, but also be locked such that the object's authority is not -allowed to change until the operation completes.
This is accomplished -using {\it auth pins}, which increment a reference counter on the -object in question, as well as all parent metadata objects up to the -root of the subtree. As long as the pin is in place, it is impossible -for that subtree (or any fragment of it that contains one or more -pins) to be migrated to a different MDS node. Pins can be placed on -both inodes and directories. - -Auth pins can only exist for authoritative metadata, because they are -only created if the object is authoritative, and their presence -prevents the migration of authority. - - -FREEZING - -More specifically, auth pins prevent a subtree from being frozen. -When a subtree is frozen, all updates to metadata are forbidden. This -includes updates to the replicas map that describes which replicas -(and nonces) exist for each object. - -In order for metadata to be migrated between MDS nodes, it must first -be frozen. The root of the subtree is initially marked as {\it -freezing}. This prevents the creation of any new auth pins within the -subtree. After all existing auth pins are removed, the subtree is -then marked as {\it frozen}, at which point all updates are -forbidden. This allows metadata state to be packaged up in a message -and transmitted to the new authority, without worrying about -intervening updates. - -If the directory at the base of a freezing or frozen subtree is not -also a subtree root (that is, it has dir_auth == AUTH_PARENT), the -directory's parent inode is auth pinned. - - - a frozen tree root dir will auth_pin its inode IFF it is auth AND - not a subtree root. - -This prevents a parent directory from being concurrently frozen, and a -range of resulting implementation complications relating to metadata -migration. - - -CACHE EXPIRATION FOR FROZEN SUBTREES - -Cache expiration messages that are received for a subtree that is -frozen are temporarily set aside instead of being processed.
Only -when the subtree is unfrozen are the expirations either processed (if -the MDS is authoritative) or discarded (if it is not). Because either -the exporting or importing MDS can fail during the migration -process, the MDS cannot tell whether it will be authoritative or not -until the process completes. - -During a migration, the subtree will first be frozen on both the -exporter and importer, and then all other replicas will be informed of -the subtree's ambiguous authority. This ensures that all expirations -during migration will go to both parties, and nothing will be lost in -the event of a failure. - - - - -NORMAL MIGRATION - -The exporter begins by doing some checks in export_dir() to verify -that it is permissible to export the subtree at this time. In -particular, the cluster must not be degraded, the subtree root may not -be freezing or frozen, and the path must be pinned (\ie not conflicted -with a rename). If these conditions are met, the subtree root -directory is temporarily auth pinned, the subtree freeze is initiated, -and the exporter is committed to the subtree migration, barring an -intervening failure of the importer or itself. - -The MExportDiscover serves simply to ensure that the inode for the -base directory being exported is open on the destination node. It is -pinned by the importer to prevent it from being trimmed. This occurs -before the exporter completes the freeze of the subtree to ensure that -the importer is able to replicate the necessary metadata. When the -exporter receives the MDiscoverAck, it allows the freeze to proceed by -removing its temporary auth pin. - -The MExportPrep message then follows to populate the importer with a -spanning tree that includes all dirs, inodes, and dentries necessary -to reach any nested subtrees within the exported region. This -replicates metadata as well, but it is pushed out by the exporter, -avoiding deadlock with the regular discover and replication process.
-The importer is responsible for opening the bounding directories from -any third parties authoritative for those subtrees before -acknowledging. This ensures that the importer has correct dir_auth -information about where authority is redelegated for all points nested -beneath the subtree being migrated. While processing the MExportPrep, -the importer freezes the entire subtree region to prevent any new -replication or cache expiration. - -A warning stage occurs only if the base subtree directory is open by -nodes other than the importer and exporter. If it is not, then this -implies that no metadata within or nested beneath the subtree is -replicated by any node other than the importer and exporter. If it is, -then a MExportWarning message informs any bystanders that the -authority for the region is temporarily ambiguous, and lists both the -exporter and importer as authoritative MDS nodes. In particular, -bystanders who are trimming items from their cache must send -MCacheExpire messages to both the old and new authorities. This is -necessary to ensure that the surviving authority reliably receives all -expirations even if the importer or exporter fails. While the subtree -is frozen (on both the importer and exporter), expirations will not be -immediately processed; instead, they will be queued until the region -is unfrozen and it can be determined that the node is or is not -authoritative. - -The exporter walks the subtree hierarchy and packages up an MExport -message containing all metadata and important state (\eg, information -about metadata replicas). At the same time, the exporter's metadata -objects are flagged as non-authoritative. The MExport message sends -the actual subtree metadata to the importer. Upon receipt, the -importer inserts the data into its cache, marks all objects as -authoritative, and logs a copy of all metadata in an EImportStart -journal message. Once that has safely flushed, it replies with an -MExportAck.
The exporter can now log an EExport journal entry, which -ultimately specifies that the export was a success. In the presence -of failures, it is the existence of the EExport entry only that -disambiguates authority during recovery. - -Once logged, the exporter will send an MExportNotify to any -bystanders, informing them that the authority is no longer ambiguous -and cache expirations should be sent only to the new authority (the -importer). Once these are acknowledged back to the exporter, -implicitly flushing the bystander to exporter message streams of any -stray expiration notices, the exporter unfreezes the subtree, cleans -up its migration-related state, and sends a final MExportFinish to the -importer. Upon receipt, the importer logs an EImportFinish(true) -(noting locally that the export was indeed a success), unfreezes its -subtree, processes any queued cache expirations, and cleans up its -state. - - -PARTIAL FAILURE RECOVERY - - - - -RECOVERY FROM JOURNAL - - - - - - - - - diff --git a/branches/sage/mds/doc/dentries.txt b/branches/sage/mds/doc/dentries.txt deleted file mode 100644 index ab14765998b2f..0000000000000 --- a/branches/sage/mds/doc/dentries.txt +++ /dev/null @@ -1,4 +0,0 @@ - -null dentries only exist - - on auth - - on replica, if they are xlock \ No newline at end of file diff --git a/branches/sage/mds/doc/exports.txt b/branches/sage/mds/doc/exports.txt deleted file mode 100644 index 8e0e146bea2fe..0000000000000 --- a/branches/sage/mds/doc/exports.txt +++ /dev/null @@ -1,72 +0,0 @@ - -NORMAL MIGRATION - -The exporter begins by doing some checks in export_dir() to verify -that it is permissible to export the subtree at this time. In -particular, the cluster must not be degraded, the subtree root may not -be freezing or frozen (\ie already exporting, or nested beneath -something that is exporting), and the path must be pinned (\ie not -conflicted with a rename).
If these conditions are met, the subtree -freeze is initiated, and the exporter is committed to the subtree -migration, barring an intervening failure of the importer or itself. - -The MExportDiscover serves simply to ensure that the base directory -being exported is open on the destination node. It is pinned by the -importer to prevent it from being trimmed. This occurs before the -exporter completes the freeze of the subtree to ensure that the -importer is able to replicate the necessary metadata. When the -exporter receives the MDiscoverAck, it allows the freeze to proceed. - -The MExportPrep message then follows to populate a spanning tree that -includes all dirs, inodes, and dentries necessary to reach any nested -exports within the exported region. This replicates metadata as well, -but it is pushed out by the exporter, avoiding deadlock with the -regular discover and replication process. The importer is responsible -for opening the bounding directories from any third parties before -acknowledging. This ensures that the importer has correct dir_auth -information about where authority is delegated for all points nested -within the subtree being migrated. While processing the MExportPrep, -the importer freezes the entire subtree region to prevent any new -replication or cache expiration. - -The warning stage occurs only if the base subtree directory is open by -nodes other than the importer and exporter. If so, then a -MExportWarning message informs any bystanders that the authority for -the region is temporarily ambiguous. In particular, bystanders who -are trimming items from their cache must send MCacheExpire messages to -both the old and new authorities. This is necessary to ensure that -the surviving authority reliably receives all expirations even if the -importer or exporter fails. 
While the subtree is frozen (on both the -importer and exporter), expirations will not be immediately processed; -instead, they will be queued until the region is unfrozen and it can -be determined that the node is or is not authoritative for the region. - -The MExport message sends the actual subtree metadata to the importer. -Upon receipt, the importer inserts the data into its cache, logs a -copy in the EImportStart, and replies with an ExportAck. The exporter -can now log an EExportFinish(true), which ultimately specifies that -the export was a success. In the presence of failures, it is the -existence (and value) of the EExportFinish that disambiguates -authority during recovery. - -Once logged, the exporter will send an MExportNotify to any -bystanders, informing them that the authority is no longer ambiguous -and cache expirations should be sent only to the new authority (the -importer). Once these are acknowledged, implicitly flushing the -bystander to exporter message streams of any stray expiration notices, -the exporter unfreezes the subtree, cleans up its state, and sends a -final MExportFinish to the importer. Upon receipt, the importer logs -an EImportFinish(true), unfreezes its subtree, and cleans up its -state. - - -PARTIAL FAILURE RECOVERY - - - -RECOVERY FROM JOURNAL - - - - - diff --git a/branches/sage/mds/doc/file_modes.txt b/branches/sage/mds/doc/file_modes.txt deleted file mode 100644 index d4ceba4034e5f..0000000000000 --- a/branches/sage/mds/doc/file_modes.txt +++ /dev/null @@ -1,66 +0,0 @@ - -underlying client capabilities: - -- read + cache -- read sync -- write sync -- write + buffer - (...potentially eventually augmented by byte ranges) - -whatever system of modes, tokens, etc. has to satisfy the basic -constraint that no conflicting capabilities are ever in the -hands of clients. - - -questions: -- is there any use to clients writing to a replica? - - reading, yes.. 100,000 open same file.. 
- - ------- - -simplest approach: -- all readers, writers go through authority -- all open, close traffic at replicas forwarded to auth - -- fh state migrates with exports. - - - --------- - -less simple: -- all writers go through authority - - open, close traffic fw -- readers from any replica - - need token from auth -- weird auth <-> replica <-> client interactions ensue! - - --------- - -even more complex (and totally FLAWED, ignore this!) - -- clients can open a file with any replica (for read or write). -- replica gets a read or write token from the primary - - primary thus knows if it's all read, all write, mixed, or none. -- once replica has a token it can service as many clients (of given type(s)) as it wants. -- on export, tokens are moved too. - - primary give _itself_ a token too! much simpler. - -- clients maintain a mode for each open file: rdonly, wronly, rdwr, lock -- globally, the mode is controlled by the primary, based on the mixture of - read and write tokens issued - - - -- [optional] if a client has a file open rdwr and the mode is rdonly or wronly, it can - request to read or write from the mds (which might twiddle the mode for performance - reasons.. e.g. lots of ppl rdwr but no actual reading) - - - - --------- - - diff --git a/branches/sage/mds/doc/header.txt b/branches/sage/mds/doc/header.txt deleted file mode 100644 index bccdb81533b6f..0000000000000 --- a/branches/sage/mds/doc/header.txt +++ /dev/null @@ -1,13 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ diff --git a/branches/sage/mds/doc/inos.txt b/branches/sage/mds/doc/inos.txt deleted file mode 100644 index b5ab1db25ca60..0000000000000 --- a/branches/sage/mds/doc/inos.txt +++ /dev/null @@ -1,11 +0,0 @@ - -inodeno_t namespace - - relevant both for ino's, and for the (ino) input for Filer and object storage namespace... - -1 - root inode - -100+mds - mds log/journal -200+mds - mds ino, fh allocation tables -300+mds - mds inode files (for non-embedded inodes) - -1000+ - regular files and directories \ No newline at end of file diff --git a/branches/sage/mds/doc/journal.txt b/branches/sage/mds/doc/journal.txt deleted file mode 100644 index 22cb4fc9e21b2..0000000000000 --- a/branches/sage/mds/doc/journal.txt +++ /dev/null @@ -1,124 +0,0 @@ - - -- LogEvent.replay() is idempotent. we won't know whether the update is old or not. - - - - - - - - - - - - - - - -journal is distributed among different nodes. because authority changes over time, it's not immedicatley clear to a recoverying node relaying the journal whether the data is "real" or not (it might be exported later in the journal). - - -possibilities: - - -ONE.. bloat the journal! - -- journal entry includes full trace of dirty data (dentries, inodes) up until import point - - local renames implicit.. cache is reattached on replay - - exports are a list of exported dirs.. which are then dumped - ... - -recovery phase 1 -- each entry includes full trace (inodes + dentries) up until the import point -- cache during recovery is fragmetned/dangling beneath import points -- when export is encountered items are discarded (marked clean) - -recovery phase 2 -- import roots ping store to determine attachment points (if not already known) - - if it was imported during period, attachment point is already known. 
- - renames affecting imports are logged too -- import roots discovered from other nodes, attached to hierarchy - -then -- maybe resume normal operations -- if recovery is a background process on a takeover mds, "export" everything to that node. - - --> journal contains lots of clean data.. maybe 5+ times bigger as a result! - -possible fixes: - - collect dir traces into journal chunks so they aren't repeated as often - - each chunk summarizes traces in previous chunk - - hopefully next chunk will include many of the same traces - - if not, then the entry will include it - - - - -=== log entry types === -- all inode, dentry, dir items include a dirty flag. -- dirs are implicitly _never_ complete; even if they are, a fetch before commit is necessary to confirm - -ImportPath - log change in import path -Import - log import addition (w/ path, dirino) - -InoAlloc - allocate ino -InoRelease - release ino - -Inode - inode info, along with dentry+inode trace up to import point -Unlink - (null) dentry + trace, + flag (whether inode/dir is destroyed) -Link - (new) dentry + inode + trace - - ------------------------------ - -TWO.. -- directories in store contain path at time of commit (relative to import, and root) -- replay without attaching anything to heirarchy -- after replay, directories pinged in store to attach to hierarchy - --> phase 2 too slow! --> and nested dirs may reattach... that won't be apparent from journal. - - put just parent dir+dentry in dir store.. even worse on phase 2! - - -THREE -- - - - - - - - -metadata journal/log - - -event types: - -chown, chmod, utime - InodeUpdate - -mknod, mkdir, symlink - Mknod .. new inode + link - -unlink, rmdir - Unlink - -rename - Link + Unlink (foreign) -or Rename (local) - -link - Link .. link existing inode - - - - -InodeUpdate -DentryLink -DentryUnlink -InodeCreate -InodeDestroy -Mkdir? 
diff --git a/branches/sage/mds/doc/lazy_posix.txt b/branches/sage/mds/doc/lazy_posix.txt deleted file mode 100644 index 1d226cd03d8e4..0000000000000 --- a/branches/sage/mds/doc/lazy_posix.txt +++ /dev/null @@ -1,53 +0,0 @@ - -http://www.usenix.org/events/fast05/wips/slides/welch.pdf - - - --- STATLITE - statlite(const char *filename, struct statlite *buf); - fstatlite(int fd, struct statlite *buf); - lstatlite(const char *filename, struct statlite *buf); - - * file size, mtime are optionally not guaranteed to be correct - * mask field to specify which fields you need to be correct - - --- READDIR+ - - struct dirent_plus *readdirplus(DIR *dirp); - int readdirplus_r(DIR *dirp, struct dirent_plus *entry, struct dirent_plus **result); - struct dirent_lite *readdirlite(DIR *dirp); - int readdirlite_r(DIR *dirp, struct dirent_lite *entry, struct dirent_lite **result); - - * plus returns lstat - * lite returns lstatlite - - --- lazy i/o integrity - - O_LAZY to open(2) - - * relax data coherency - * writes may not be visible until lazyio_propagate, fsync, close - - lazyio_propagate(int fd, off_t offset, size_t count); - * my writes are safe - - lazyio_synchronize(int fd, off_t offset, size_t count); - * i will see everyone else's propagated writes - --- read/write non-serial vectors - - ssize_t readx(int fd, const struct iovec *iov, size_t iov_count, struct xtvec *xtv, size_t xtv_count); - ssize_t writex(int fd, const struct iovec *iov, size_t iov_count, struct xtvec *xtv, size_t xtv_count); - - * like readv/writev, but serial - * - - -int lockg(int fd, int cmd, lgid_t *lgid) - group locks - -int openg(char *path, int mode, fh_t *handle); - portable file handle -int sutoc(fh_t *fh); \ No newline at end of file diff --git a/branches/sage/mds/doc/mds_locks.txt b/branches/sage/mds/doc/mds_locks.txt deleted file mode 100644 index f41a89a9b31e5..0000000000000 --- a/branches/sage/mds/doc/mds_locks.txt +++ /dev/null @@ -1,66 +0,0 @@ - -new names - dentry_read (not path_pins) - 
dentry_xlock - - inode_read - inode_xlock (not inode_write) - -locks are always tied to active_requests. - -read locks can be placed on any node. -xlocks must be applied at the authority. - -for multi-lock operations (link, unlink, rename), we must acquire xlocks on a remote node. lock requests are associated with a reqid. the authoritative node keeps track of which remote xlocks it holds. when forwarded/restarted, it can drop remote locks. - -when restarting, drop all locks. -on remote, drop locks and state, and notify main req node. -recover dist request state on rejoin: - - surviving op initiator will assert read or xlock - - recovering op initiator will restart requests. (from initiator's perspective, ops have either happened or they haven't, depending on whether the event is journaled.) - - recovering or surviving op cohort will determine lock state during rejoin, or get a commit or rollback... - - - - ---- path_pin = read lock on /some/random/path - - blocks a dentry xlock - ---- dnxlock = exclusive lock on /some/random/path - - locking: prevents subsequent path pins. - - locked: prevents dn read - - on auth - --> grab _all_ path pins at onces; hold none while waiting. --> grab xlocks in order. - ---- auth_pin = pin to authority, on *dir, *in - - prevents freezing -> frozen. - - freezing blocks new auth pins, thus blocking other local auth_pins. (hangs up local export.) - - does not block remote auth_pins, because remote side is not auth (or frozen!) until after local subtree is frozen. - --> blocking on auth_pins is dangerous. _never_ block if we are holding other auth_pins on the same node (subtree?). --> grab _all_ auth pins at once; hold none while waiting. - ---- hard/file_wrlock = exlusive lock on inode content - - prevents inode read - - on auth - --> grab locks in order. - - -ORDERING -- namespace(dentries) < inodes -- order dentries on (dirino, dname) -- order inodes on (ino); -- need to order both read and write locks, esp with dentries. 
so, if we need to lock /usr/bin/foo with read on usr and bin and xwrite on foo, we need to acquire all of those locks using the same ordering. - - on same host, we can be 'nice' and check lockability of all items, then lock all, and drop everything while waiting. (actually, is there any use to this?) - - on mutiple hosts, we need to use full ordering (at least as things separate across host boundaries). and if needed lock set changes (such that the order of already acquired locks changes), we need to drop those locks and start over. - -- how do auth pins fit into all this? - - auth pin on xlocks only. no need on read locks. - - pre-grab all auth pins on a node the first time it is visiting during lock acquisition. - - what if things move? if we find we are missing a needed auth pin when we revisit a host at any point, and the item is not still authpinnable, we back off and restart. (we cannot block.) - - - - if we find we are not authpinnable, drop all locks and wait. - - diff --git a/branches/sage/mds/doc/modeline.txt b/branches/sage/mds/doc/modeline.txt deleted file mode 100644 index 1b3956f4d486b..0000000000000 --- a/branches/sage/mds/doc/modeline.txt +++ /dev/null @@ -1,2 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab diff --git a/branches/sage/mds/doc/osd_outline.txt b/branches/sage/mds/doc/osd_outline.txt deleted file mode 100644 index 2c6f3287aac5f..0000000000000 --- a/branches/sage/mds/doc/osd_outline.txt +++ /dev/null @@ -1,37 +0,0 @@ - -intro - -osd cluster map - requirements - desireable properties - (c)rush - -failure detection - distributed ping or heartbeat - central filter, notifier - -design - placement seed, class/superset, groups - -normal operation - reads - writes - -recovery - triggers: failed disk, or total cluster reorganization - - notify - peering - pull - push - clean - -writes during recovery - -graceful data loss + recovery? 
- - - - - - diff --git a/branches/sage/mds/doc/osd_replication.txt b/branches/sage/mds/doc/osd_replication.txt deleted file mode 100644 index 907d00e2050a2..0000000000000 --- a/branches/sage/mds/doc/osd_replication.txt +++ /dev/null @@ -1,226 +0,0 @@ - - -SOME GENERAL REQUIREMENTS - -- cluster expansion: - - any or all of the replicas may move to new OSDs. - -- cluster map may change frequently - - map change should translate into pending replication/migration - state quickly (or better yet, instantly), so that we could push - through a series of (say, botched) maps quickly and be fine, so long - as the final map is correct. - -- ideally, unordered osd<->osd, client<->osd communication - (mds<->mds, client<->mds communication is ordered, but file i/o - would be too slow that way?) - - - - -PRIMARY ONLY PICTURE - -let's completely ignore replication for a while, and see how -complicated the picture needs to be to reliably support cluster expansion. - -typedef __uint64_t version_t; - - -per-Object metadata: -- version #. incremented when an object is modified. - e.g. version_t version; -- on primary, keep list of stray replicas - e.g. map stray_replicas; // osds w/ stray replicas - includes old primary osd(s), until deletion is confirmed. used while rg - is importing. - - -per-RG metadata -- object list. well, a method to fetch it by querying a collection or whatever. -- negative list - e.g. map deleted_objects; - - used to enumerate deleted objects, when in "importing" state. -- a RG "state" (enum/int) - - - - - - -Normal RG state: -- role=primary - clean - i am primary, all is well. no stray copies. i can - discard my negative object list, since my local - object store tells me everything. - - -After a map change: -- new primary - undef - initially; i don't know RG exists. -- old primary - homeless - i was primary, still have unmolested data. new primary is not yet migrating - (presumably it's state=undef.) 
i need to contact new primary and tell them - this RG exists. - -- new primary - importing - i am migrating data from old primary. keep negative dir entries for deletions. - write locally. proxy reads (force immediately migration). do whole objects - initially (on write, block until i migrate the object). later we can do - sub-object state (where "live" object data is spread across new/old primaries.. -- old primary - exporting - primary is migrating my data. - undef - when it finishes. (i will forget this RG existed.) - - -After a second map change (scenario 1): - as above, if we were clean again. - -After a second map change (scenario 2): - we weren't clean yet. -- new primary - undef - initially (until i learn RG exists) -- old primary - importing - i'm still migrating from old old primary -- old old primary - exporting - ... -- old primary -?? importing+exporting - proxy reads as before. continue migrating from old old primary. - - -After a second map change (scenario 3): - we weren't clean yet, and old old primary is also new primary -- new primary (and old old primary) - exporting - change state to importing. be sure to compare object versions, and neg dir - entries (as we always should do, really!). -- old primary - importing - note that the old import source matches new primary, and change - state to exporting, and stop importing. (unlike scenario 2) - --> this approach could mean that a series of fast map changes could - force data to migrate down a "chain" of old primaries to reach the - new one. maybe old primary should go from importing -> exporting, - and pass along old old primary id to new primary such that the - import is a many-to-one thing, instead of one-to-one. version - numbers and neg entries will make it easy to pick out correct versions. - - - -For the importing process on a given RG: - -- metadata for each source - - each source has a state: - 'starting' - don't know anything about source yet. query source! 
- this probaby induces the source to change from - 'homeless' or something similar to 'exporting'. - 'importing' - i've fetched the source's object list (and neg - object list). i'm busy reading them! these lists - will shrink as the process continues. after i fetch - an object, i will erase it from the source. - (object metadata will include stray copy info - until i confirm that its removed.) - 'finishing' - i've read all my data, and i'm telling the old person - to discard any remaining RG metadata (RG contents - should already be gone) - - unmigrated object list - - migrated but not deleted object list - - stray osd is also listed in per-object MD during this stage - - negative object list - - i can remove these items if i see a newer object version (say, - from a different import source or something). - - i can remove any local objects or ignore imported ones if it is - older than deleted version - -- the lists should be sets or otherwise queryable so that while i'm - importing and a real op comes through I can quickly determine if a - given object_id is pending migration etc or if my local store is to - be trusted. - - - - - -SOME CODE BITS - - -typedef __uint64_t version_t; -class Object { - version_t version; - map stray_replicas; -}; - - -class ReplicaGroup { - int enumerate_objects(list& ls); - - int state; - - // for unstable states, - map deleted_objects; // locally - map exporters; // importing from these guys. -}; - -// primary -#define RG_STATE_CLEAN 1 -#define RG_STATE_IMPORTING 2 // pulling data - -// non-primary -#define RG_STATE_HOMELESS 5 // old primary; new primary not yet - // notified; not yet exporting. -#define RG_STATE_EXPORTING 6 // a newer primary is extracting my - // data. - - -struct RGExporter_t { - int import_state; - - set remaining_objects; // remote object list - set stray_objects; // imported but not deleted. - -}; - - - - - ----- -all crap from here on down - - - - -REPLICAS -- - - - - -OSD STATES -- primary, up to date. 
-- replica, up to date. - -- primary, proxy to old primary (primaries?) - -- replica, not up to date. - - -REPLICATION STUFF - -Per-RG metadata -- primary - - per-replica state: clean, catching up? -- replica - -Per-object metadata -- primary and replica - - version number/mtime - - rg (reverse indexed) -- primary - - replication level and state. - - commited to memory and/or disk, on which replicas (#1, #2, etc.) -- replica - - - - - --> \ No newline at end of file diff --git a/branches/sage/mds/doc/shared_write_states_nogo.txt b/branches/sage/mds/doc/shared_write_states_nogo.txt deleted file mode 100644 index f409617d82681..0000000000000 --- a/branches/sage/mds/doc/shared_write_states_nogo.txt +++ /dev/null @@ -1,39 +0,0 @@ - -// stable states // ------auth----- -----replica----- -#define LOCK_SYNC 0 // R . / . . . WB same ... for stat() -#define LOCK_LOCK 1 // R W / RC . . . . . / RC . . . ... for truncate(), fsync() -#define LOCK_RDONLY 2 // R . / RC R . . same -#define LOCK_MIXED 3 // . . / . R W . same -#define LOCK_WRONLY 4 // . . / . . W WB same - -// transition states -#define LOCK_GSYNCR 8 // R . / RC . . . same -#define LOCK_GSYNCMW 9 // . . / RC . . WB same -#define LOCK_GSYNCMW2 9 // . . / RC . . WB same - -#define LOCK_GLOCKSR 5 // R . / RC . . . . . / RC . . . -#define LOCK_GLOCKMW 7 // . . / RC . . . same - -#define LOCK_GRDONLYM 10 // . . / . R . . same -#define LOCK_GRDONLYM2 10 // --- . . / . R . . -#define LOCK_GRDONLYW 11 // . . / . . . . same -#define LOCK_GRDONLYW2 11 // --- . . / . . . . -#define LOCK_GRDONLYS 12 // R . / RC . . . same -#define LOCK_GRDONLYL 13 // R . / RC . . . --- - -#define LOCK_GMIXEDR 14 // R . / . R . . . . / . R . . -#define LOCK_GMIXEDR2 15 // --- . . / . R . . -#define LOCK_GMIXEDW 16 // . . / . . W . same -#define LOCK_GMIXEDW2 16 // --- . . / . . W . -#define LOCK_GMIXEDS 16 // R . / . . . . . . / . . . . -#define LOCK_GMIXEDS2 16 // --- . . / . . . . -#define LOCK_GMIXEDL 17 // R . / . . . . 
--- - -#define LOCK_GWRONLYR 18 // R . / . . . . same -#define LOCK_GWRONLYR2 18 // --- . . / . . . . -#define LOCK_GWRONLYM 19 // . . / . . . . same -#define LOCK_GWRONLYM2 19 // --- . . / . . . . -#define LOCK_GWRONLYS 20 // R . / . . . WB same -#define LOCK_GWRONLYS2 20 // --- . . / . . . . -#define LOCK_GWRONLYL 21 - diff --git a/branches/sage/mds/doc/shutdown.txt b/branches/sage/mds/doc/shutdown.txt deleted file mode 100644 index e5ccde3171004..0000000000000 --- a/branches/sage/mds/doc/shutdown.txt +++ /dev/null @@ -1,13 +0,0 @@ - -- mds0 triggers shutdown by sending a shutdown_start to all nodes. - -- from here on out, all client requests are discarded (unless they are a file close?) - -- each mds checks for outstanding inter-mds transations. e.g imports, discoveries, etc. once they're all done, send a shutdown_ready to mds0 - -- each mds successively disassembles its cache, flushing data to long-term storage, and sending inodeexpires, exporting imported dirs to parent (after they're clean + empty) - -- when the cache is empty, send shutdown_done to mds0 and exit. - -- mds0 exits when all mdss have finished. - diff --git a/branches/sage/mds/dupstore.cc b/branches/sage/mds/dupstore.cc deleted file mode 100644 index d43f935cb50cc..0000000000000 --- a/branches/sage/mds/dupstore.cc +++ /dev/null @@ -1,102 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include "ebofs/Ebofs.h" -#include "osd/FakeStore.h" - - -int dupstore(ObjectStore* src, ObjectStore* dst) -{ - if (src->mount() < 0) return 1; - if (dst->mkfs() < 0) return 1; - if (dst->mount() < 0) return 1; - - // objects - list objects; - src->list_objects(objects); - int num = objects.size(); - cout << num << " objects" << std::endl; - int i = 1; - for (list::iterator p = objects.begin(); p != objects.end(); ++p) { - bufferlist bl; - src->read(*p, 0, 0, bl); - cout << "object " << i++ << "/" << num << " " << *p << " = " << bl.length() << " bytes" << std::endl; - dst->write(*p, 0, bl.length(), bl, 0); - map attrs; - src->getattrs(*p, attrs); - dst->setattrs(*p, attrs); - } - - // collections - list collections; - src->list_collections(collections); - num = collections.size(); - cout << num << " collections" << std::endl; - i = 1; - for (list::iterator p = collections.begin(); - p != collections.end(); - ++p) { - dst->create_collection(*p, 0); - map attrs; - src->collection_getattrs(*p, attrs); - dst->collection_setattrs(*p, attrs); - list o; - src->collection_list(*p, o); - int numo = 0; - for (list::iterator q = o.begin(); q != o.end(); q++) { - dst->collection_add(*p, *q, 0); - numo++; - } - cout << "collection " << i++ << "/" << num << " " << hex << *p << dec << " = " << numo << " objects" << std::endl; - } - - - src->umount(); - dst->umount(); - return 0; -} - -void usage() -{ - cerr << "usage: dup.ebofs (ebofs|fakestore) src (ebofs|fakestore) dst" << std::endl; - exit(0); -} - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - // args - if (args.size() != 4) - usage(); - - ObjectStore *src, *dst; - - if (strcmp(args[0], "ebofs") == 0) - src = new Ebofs(args[1]); - else if (strcmp(args[0], "fakestore") == 0) - src = new FakeStore(args[1]); - else usage(); - - if (strcmp(args[2], "ebofs") == 0) - dst = new Ebofs(args[3]); - else if (strcmp(args[2], "fakestore") == 0) - 
dst = new FakeStore(args[3]); - else usage(); - - return dupstore(src, dst); -} diff --git a/branches/sage/mds/ebofs/Allocator.cc b/branches/sage/mds/ebofs/Allocator.cc deleted file mode 100644 index 35b0db16b84c2..0000000000000 --- a/branches/sage/mds/ebofs/Allocator.cc +++ /dev/null @@ -1,693 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "Allocator.h" -#include "Ebofs.h" - - -#undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) *_dout << dbeginl << g_clock.now() << " ebofs(" << fs->dev.get_device_name() << ").allocator." - - -void Allocator::dump_freelist() -{ - if (1) { - interval_set free; // validate too - - block_t n = 0; - for (int b=0; b<=EBOFS_NUM_FREE_BUCKETS; b++) { - Table *tab; - if (b < EBOFS_NUM_FREE_BUCKETS) { - tab = fs->free_tab[b]; - dout(0) << "dump bucket " << b << " " << tab->get_num_keys() << dendl; - } else { - tab = fs->limbo_tab; - dout(0) << "dump limbo " << tab->get_num_keys() << dendl;; - } - - if (tab->get_num_keys() > 0) { - Table::Cursor cursor(tab); - assert(tab->find(0, cursor) >= 0); - while (1) { - dout(0) << "dump ex " << cursor.current().key << "~" << cursor.current().value << dendl; - assert(cursor.current().value > 0); - - if (b < EBOFS_NUM_FREE_BUCKETS) - n += cursor.current().value; - - if (free.contains( cursor.current().key, cursor.current().value )) - dout(0) << "dump bad " << cursor.current().key << "~" << cursor.current().value << dendl; - assert(!free.contains( cursor.current().key, cursor.current().value )); - free.insert( cursor.current().key, cursor.current().value ); - if (cursor.move_right() <= 0) break; - } - } else { - 
//dout(0) << " empty" << dendl; - } - } - - assert(n == fs->free_blocks); - dout(0) << "dump combined freelist is " << free << dendl; - - - // alloc_tab - if (fs->alloc_tab->get_num_keys() > 0) { - Table >::Cursor cursor(fs->alloc_tab); - assert(fs->alloc_tab->find(0, cursor) >= 0); - while (1) { - dout(0) << "alloc ex " << cursor.current().key << "~" << cursor.current().value.first << " ref " - << cursor.current().value.second - << dendl; - assert(cursor.current().value.first > 0); - - if (cursor.move_right() <= 0) break; - } - } - } -} - - -int Allocator::find(Extent& ex, int bucket, block_t num, block_t near, int dir) -{ - Table::Cursor cursor(fs->free_tab[bucket]); - bool found = false; - - if ((dir == DIR_ANY || dir == DIR_FWD) && - fs->free_tab[bucket]->find( near, cursor ) >= 0) { - // look to the right - do { - if (cursor.current().value >= num) - found = true; - } while (!found && cursor.move_right() > 0); - } - - if ((dir == DIR_ANY || dir == DIR_BACK) && - !found) { - // look to the left - fs->free_tab[bucket]->find( near, cursor ); - - while (!found && cursor.move_left() >= 0) - if (cursor.current().value >= num) - found = true; - } - - if (found) { - ex.start = cursor.current().key; - ex.length = cursor.current().value; - return 0; - } - - return -1; -} - -int Allocator::allocate(Extent& ex, block_t num, block_t near) -{ - //dump_freelist(); - - int dir = DIR_ANY; // no dir - if (near == NEAR_LAST_FWD) { - near = last_pos; - dir = DIR_FWD; // fwd - } - else if (near == NEAR_LAST) - near = last_pos; - - int bucket; - - while (1) { // try twice, if fwd = true - - // look for contiguous extent - for (bucket = pick_bucket(num); bucket < EBOFS_NUM_FREE_BUCKETS; bucket++) { - if (find(ex, bucket, num, near, dir) >= 0) { - // yay! - - // remove original - fs->free_tab[bucket]->remove( ex.start ); - fs->free_blocks -= ex.length; - - if (ex.length > num) { - if (ex.start < near) { - // to the left - if (ex.start + ex.length - num <= near) { - // by a lot. 
take right-most portion. - Extent left; - left.start = ex.start; - left.length = ex.length - num; - ex.start += left.length; - ex.length -= left.length; - assert(ex.length == num); - _release_loner(left); - } else { - // take middle part. - Extent left,right; - left.start = ex.start; - left.length = near - ex.start; - ex.start = near; - right.start = ex.start + num; - right.length = ex.length - left.length - num; - ex.length = num; - _release_loner(left); - _release_loner(right); - } - } - else { - // to the right. take left-most part. - Extent right; - right.start = ex.start + num; - right.length = ex.length - num; - ex.length = num; - _release_loner(right); - } - } - - dout(20) << "allocate " << ex << " near " << near << dendl; - last_pos = ex.end(); - //dump_freelist(); - if (g_conf.ebofs_cloneable) - alloc_inc(ex); - return num; - } - } - - if (dir == DIR_BACK || dir == DIR_ANY) break; - dir = DIR_BACK; - } - - // ok, find partial extent instead. - for (block_t trysize = num/2; trysize >= 1; trysize /= 2) { - int bucket = pick_bucket(trysize); - if (find(ex, bucket, trysize, near) >= 0) { - // yay! - assert(ex.length < num); - - fs->free_tab[bucket]->remove(ex.start); - fs->free_blocks -= ex.length; - last_pos = ex.end(); - dout(20) << "allocate partial " << ex << " (wanted " << num << ") near " << near << dendl; - //dump_freelist(); - if (g_conf.ebofs_cloneable) - alloc_inc(ex); - return ex.length; - } - } - - dout(1) << "allocate failed, fs completely full! 
" << fs->free_blocks << dendl; - assert(0); - //dump_freelist(); - return -1; -} - -int Allocator::_release_into_limbo(Extent& ex) -{ - dout(10) << "_release_into_limbo " << ex << dendl; - dout(10) << "limbo is " << limbo << dendl; - assert(ex.length > 0); - limbo.insert(ex.start, ex.length); - fs->limbo_blocks += ex.length; - return 0; -} - -int Allocator::release(Extent& ex) -{ - if (g_conf.ebofs_cloneable) - return alloc_dec(ex); - - _release_into_limbo(ex); - return 0; -} - -int Allocator::commit_limbo() -{ - dout(20) << "commit_limbo" << dendl; - for (map::iterator i = limbo.m.begin(); - i != limbo.m.end(); - i++) { - fs->limbo_tab->insert(i->first, i->second); - //fs->free_blocks += i->second; - } - limbo.clear(); - //fs->limbo_blocks = 0; - //dump_freelist(); - return 0; -} - -int Allocator::release_limbo() -{ - //dump_freelist(); - if (fs->limbo_tab->get_num_keys() > 0) { - Table::Cursor cursor(fs->limbo_tab); - fs->limbo_tab->find(0, cursor); - while (1) { - Extent ex(cursor.current().key, cursor.current().value); - dout(20) << "release_limbo ex " << ex << dendl; - - fs->limbo_blocks -= ex.length; - _release_merge(ex); - - if (cursor.move_right() <= 0) break; - } - } - fs->limbo_tab->clear(); - //dump_freelist(); - return 0; -} - - - -/* -int Allocator::_alloc_loner_inc(Extent& ex) -{ - Table >::Cursor cursor(fs->alloc_tab); - - if (fs->alloc_tab->find( ex.start, cursor ) - == Table >::Cursor::MATCH) { - assert(cursor.current().value.first == ex.length); - pair& v = cursor.dirty_current_value(); - v.second++; - dout(10) << "_alloc_loner_inc " << ex << " " - << (v.second-1) << " -> " << v.second - << dendl; - } else { - // insert it, @1 - fs->alloc_tab->insert(ex.start, pair(ex.length,1)); - dout(10) << "_alloc_loner_inc " << ex << " 0 -> 1" << dendl; - } - return 0; -} - -int Allocator::_alloc_loner_dec(Extent& ex) -{ - Table >::Cursor cursor(fs->alloc_tab); - - if (fs->alloc_tab->find( ex.start, cursor ) - == Table >::Cursor::MATCH) { - 
assert(cursor.current().value.first == ex.length); - if (cursor.current().value.second == 1) { - dout(10) << "_alloc_loner_dec " << ex << " 1 -> 0" << dendl; - fs->alloc_tab->remove( cursor.current().key ); - } else { - pair& v = cursor.dirty_current_value(); - --v.second; - dout(10) << "_alloc_loner_dec " << ex << " " - << (v.second+1) << " -> " << v.second - << dendl; - } - } else { - assert(0); - } - return 0; -} -*/ - - -int Allocator::alloc_inc(Extent ex) -{ - dout(10) << "alloc_inc " << ex << dendl; - - // empty table? - if (fs->alloc_tab->get_num_keys() == 0) { - // easy. - fs->alloc_tab->insert(ex.start, pair(ex.length,1)); - dout(10) << "alloc_inc + " << ex << " 0 -> 1 (first entry)" << dendl; - return 0; - } - - Table >::Cursor cursor(fs->alloc_tab); - - // try to move to left (to check for overlap) - int r = fs->alloc_tab->find( ex.start, cursor ); - if (r == Table >::Cursor::OOB || - cursor.current().key > ex.start) { - r = cursor.move_left(); - dout(10) << "alloc_inc move_left r = " << r << dendl; - } - - while (1) { - dout(10) << "alloc_inc loop at " << cursor.current().key - << "~" << cursor.current().value.first - << " ref " << cursor.current().value.second - << dendl; - - // too far left? - if (cursor.current().key < ex.start && - cursor.current().key + cursor.current().value.first <= ex.start) { - // adjacent? - bool adjacent = false; - if (cursor.current().key + cursor.current().value.first == ex.start && - cursor.current().value.second == 1) - adjacent = true; - - // no overlap. - r = cursor.move_right(); - dout(10) << "alloc_inc move_right r = " << r << dendl; - - // at end? - if (r <= 0) { - // hmm! - if (adjacent) { - // adjust previous entry - cursor.move_left(); - pair &v = cursor.dirty_current_value(); - v.first += ex.length; // yay! - dout(10) << "alloc_inc + " << ex << " 0 -> 1 (adjust at end)" << dendl; - } else { - // insert at end, finish. 
- int r = fs->alloc_tab->insert(ex.start, pair(ex.length,1)); - dout(10) << "alloc_inc + " << ex << " 0 -> 1 (at end) .. r = " << r << dendl; - //dump_freelist(); - } - return 0; - } - } - - if (cursor.current().key > ex.start) { - // gap. - // oooooo - // nnnnn..... - block_t l = MIN(ex.length, cursor.current().key - ex.start); - - fs->alloc_tab->insert(ex.start, pair(l,1)); - dout(10) << "alloc_inc + " << ex.start << "~" << l << " 0 -> 1" << dendl; - ex.start += l; - ex.length -= l; - if (ex.length == 0) break; - fs->alloc_tab->find( ex.start, cursor ); - } - else if (cursor.current().key < ex.start) { - block_t end = cursor.current().value.first + cursor.current().key; - - if (end <= ex.end()) { - // single split - // oooooo - // nnnnn - pair& v = cursor.dirty_current_value(); - v.first = ex.start - cursor.current().key; - int ref = v.second; - - block_t l = end - ex.start; - fs->alloc_tab->insert(ex.start, pair(l, 1+ref)); - - dout(10) << "alloc_inc " << ex.start << "~" << l - << " " << ref << " -> " << ref+1 - << " (right split)" << dendl; - - ex.start += l; - ex.length -= l; - if (ex.length == 0) break; - fs->alloc_tab->find( ex.start, cursor ); - - } else { - // double split, finish. - // ------------- - // ------ - pair& v = cursor.dirty_current_value(); - v.first = ex.start - cursor.current().key; - int ref = v.second; - - fs->alloc_tab->insert(ex.start, pair(ex.length, 1+ref)); - - int rl = end - ex.end(); - fs->alloc_tab->insert(ex.end(), pair(rl, ref)); - - dout(10) << "alloc_inc " << ex - << " " << ref << " -> " << ref+1 - << " (double split finish)" - << dendl; - - break; - } - } - else { - assert(cursor.current().key == ex.start); - - if (cursor.current().value.first <= ex.length) { - // inc. 
- // oooooo - // nnnnnnnn - pair& v = cursor.dirty_current_value(); - v.second++; - dout(10) << "alloc_inc " << ex.start << "~" << cursor.current().value.first - << " " << cursor.current().value.second-1 << " -> " - << cursor.current().value.second - << " (left split)" << dendl; - ex.start += v.first; - ex.length -= v.first; - if (ex.length == 0) break; - cursor.move_right(); - } else { - // single split, finish. - // oooooo - // nnn - block_t l = cursor.current().value.first - ex.length; - int ref = cursor.current().value.second; - - pair& v = cursor.dirty_current_value(); - v.first = ex.length; - v.second++; - - fs->alloc_tab->insert(ex.end(), pair(l, ref)); - - dout(10) << "alloc_inc " << ex - << " " << ref << " -> " << ref+1 - << " (left split finish)" - << dendl; - - break; - } - } - } - - return 0; -} - - -int Allocator::alloc_dec(Extent ex) -{ - dout(10) << "alloc_dec " << ex << dendl; - - assert(fs->alloc_tab->get_num_keys() >= 0); - - Table >::Cursor cursor(fs->alloc_tab); - - // try to move to left (to check for overlap) - int r = fs->alloc_tab->find( ex.start, cursor ); - dout(10) << "alloc_dec find r = " << r << dendl; - - if (r == Table >::Cursor::OOB || - cursor.current().key > ex.start) { - r = cursor.move_left(); - dout(10) << "alloc_dec move_left r = " << r << dendl; - - // too far left? - if (cursor.current().key < ex.start && - cursor.current().key + cursor.current().value.first <= ex.start) { - // no overlap. - dump_freelist(); - assert(0); - } - } - - while (1) { - dout(10) << "alloc_dec ? " << cursor.current().key - << "~" << cursor.current().value.first - << " " << cursor.current().value.second - << ", ex is " << ex - << dendl; - - assert(cursor.current().key <= ex.start); // no gap allowed. 
- - if (cursor.current().key < ex.start) { - block_t end = cursor.current().value.first + cursor.current().key; - - if (end <= ex.end()) { - // single split - // oooooo - // ----- - pair& v = cursor.dirty_current_value(); - v.first = ex.start - cursor.current().key; - int ref = v.second; - dout(10) << "alloc_dec s " << cursor.current().key << "~" << cursor.current().value.first - << " " << ref - << " shortened left bit of single" << dendl; - - block_t l = end - ex.start; - if (ref > 1) { - fs->alloc_tab->insert(ex.start, pair(l, ref-1)); - dout(10) << "alloc_dec . " << ex.start << "~" << l - << " " << ref << " -> " << ref-1 - << dendl; - } else { - Extent r(ex.start, l); - _release_into_limbo(r); - } - - ex.start += l; - ex.length -= l; - if (ex.length == 0) break; - fs->alloc_tab->find( ex.start, cursor ); - - } else { - // double split, finish. - // ooooooooooooo - // ------ - pair& v = cursor.dirty_current_value(); - v.first = ex.start - cursor.current().key; - int ref = v.second; - dout(10) << "alloc_dec s " << cursor.current().key << "~" << cursor.current().value.first - << " " << ref - << " shorted left bit of double split" << dendl; - - if (ref > 1) { - fs->alloc_tab->insert(ex.start, pair(ex.length, ref-1)); - dout(10) << "alloc_inc s " << ex - << " " << ref << " -> " << ref-1 - << " reinserted middle bit of double split" - << dendl; - } else { - _release_into_limbo(ex); - } - - int rl = end - ex.end(); - fs->alloc_tab->insert(ex.end(), pair(rl, ref)); - dout(10) << "alloc_dec s " << ex.end() << "~" << rl - << " " << ref - << " reinserted right bit of double split" << dendl; - break; - } - } - else { - assert(cursor.current().key == ex.start); - - if (cursor.current().value.first <= ex.length) { - // inc. 
- // oooooo - // nnnnnnnn - if (cursor.current().value.second > 1) { - pair& v = cursor.dirty_current_value(); - v.second--; - dout(10) << "alloc_dec s " << ex.start << "~" << cursor.current().value.first - << " " << cursor.current().value.second+1 << " -> " << cursor.current().value.second - << dendl; - ex.start += v.first; - ex.length -= v.first; - if (ex.length == 0) break; - cursor.move_right(); - } else { - Extent r(cursor.current().key, cursor.current().value.first); - _release_into_limbo(r); - - ex.start += cursor.current().value.first; - ex.length -= cursor.current().value.first; - cursor.remove(); - - if (ex.length == 0) break; - fs->alloc_tab->find( ex.start, cursor ); - } - } else { - // single split, finish. - // oooooo - // nnn - block_t l = cursor.current().value.first - ex.length; - int ref = cursor.current().value.second; - - if (ref > 1) { - pair& v = cursor.dirty_current_value(); - v.first = ex.length; - v.second--; - dout(10) << "alloc_inc . " << ex - << " " << ref << " -> " << ref-1 - << dendl; - } else { - _release_into_limbo(ex); - cursor.remove(); - } - - dout(10) << "alloc_dec s " << ex.end() << "~" << l - << " " << ref - << " reinserted right bit of single split" << dendl; - fs->alloc_tab->insert(ex.end(), pair(l, ref)); - break; - } - } - - - } - - return 0; -} - - -/* - * release extent into freelist - * WARNING: *ONLY* use this if you _know_ there are no adjacent free extents - */ -int Allocator::_release_loner(Extent& ex) -{ - assert(ex.length > 0); - int b = pick_bucket(ex.length); - fs->free_tab[b]->insert(ex.start, ex.length); - fs->free_blocks += ex.length; - return 0; -} - -/* - * release extent into freelist - * look for any adjacent extents and merge with them! - */ -int Allocator::_release_merge(Extent& orig) -{ - dout(15) << "_release_merge " << orig << dendl; - assert(orig.length > 0); - - Extent newex = orig; - - // one after us? 
- for (int b=0; b::Cursor cursor(fs->free_tab[b]); - - if (fs->free_tab[b]->find( newex.start+newex.length, cursor ) - == Table::Cursor::MATCH) { - // add following extent to ours - newex.length += cursor.current().value; - - // remove it - fs->free_blocks -= cursor.current().value; - fs->free_tab[b]->remove( cursor.current().key ); - break; - } - } - - // one before us? - for (int b=0; b::Cursor cursor(fs->free_tab[b]); - fs->free_tab[b]->find( newex.start+newex.length, cursor ); - if (cursor.move_left() >= 0 && - (cursor.current().key + cursor.current().value == newex.start)) { - // merge - newex.start = cursor.current().key; - newex.length += cursor.current().value; - - // remove it - fs->free_blocks -= cursor.current().value; - fs->free_tab[b]->remove( cursor.current().key ); - break; - } - } - - // ok, insert newex - _release_loner(newex); - return 0; -} diff --git a/branches/sage/mds/ebofs/Allocator.h b/branches/sage/mds/ebofs/Allocator.h deleted file mode 100644 index c1898784d50a7..0000000000000 --- a/branches/sage/mds/ebofs/Allocator.h +++ /dev/null @@ -1,85 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __EBOFS_ALLOCATOR_H -#define __EBOFS_ALLOCATOR_H - -#include "types.h" - -#include "include/interval_set.h" - -class Ebofs; - -class Allocator { -public: - const static block_t NEAR_LAST = 0; - const static block_t NEAR_LAST_FWD = 1; - - const static int DIR_ANY = 0; - const static int DIR_FWD = 2; - const static int DIR_BACK = 1; - -protected: - Ebofs *fs; - block_t last_pos; - - - interval_set limbo; - - static int pick_bucket(block_t num) { - int b = 0; - while (num > 1) { - b++; - num = num >> EBOFS_FREE_BUCKET_BITS; - } - if (b >= EBOFS_NUM_FREE_BUCKETS) - b = EBOFS_NUM_FREE_BUCKETS-1; - return b; - } - - int find(Extent& ex, int bucket, block_t num, block_t near, int dir = DIR_ANY); - - void dump_freelist(); - - public: - int _release_into_limbo(Extent& ex); - - int _release_loner(Extent& ex); // release loner extent - int _release_merge(Extent& ex); // release any extent (searches for adjacent) - - //int _alloc_loner_inc(Extent& ex); - //int _alloc_loner_dec(Extent& ex); - - - public: - Allocator(Ebofs *f) : fs(f), last_pos(0) {} - - int allocate(Extent& ex, block_t num, block_t near=NEAR_LAST); - int release(Extent& ex); // alias for alloc_dec - - int alloc_inc(Extent ex); - int alloc_dec(Extent ex); - - - int unallocate(Extent& ex) { // skip limbo - return _release_merge(ex); - } - - int commit_limbo(); // limbo -> fs->limbo_tab - int release_limbo(); // fs->limbo_tab -> free_tabs - -}; - -#endif diff --git a/branches/sage/mds/ebofs/BlockDevice.cc b/branches/sage/mds/ebofs/BlockDevice.cc deleted file mode 100644 index 94c108db2612c..0000000000000 --- a/branches/sage/mds/ebofs/BlockDevice.cc +++ /dev/null @@ -1,846 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * 
License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "BlockDevice.h" - -#include "config.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -#ifndef __CYGWIN__ -#ifndef DARWIN -#include -#endif -#endif - - - -/******************************************* - * biovec - */ - -inline ostream& operator<<(ostream& out, BlockDevice::biovec &bio) -{ - out << "bio("; - if (bio.type == BlockDevice::biovec::IO_READ) out << "rd "; - if (bio.type == BlockDevice::biovec::IO_WRITE) out << "wr "; - out << bio.start << "~" << bio.length; - if (bio.note) out << " " << bio.note; - out << " " << &bio; - out << ")"; - return out; -} - - - -/******************************************* - * ElevatorQueue - */ - -#define dout(x) if (x <= g_conf.debug_bdev) *_dout << dbeginl << g_clock.now() << " bdev(" << dev << ").elevatorq." -#define derr(x) if (x <= g_conf.debug_bdev) *_derr << dbeginl << g_clock.now() << " bdev(" << dev << ").elevatorq." - - -int BlockDevice::ElevatorQueue::dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& block_lock) -{ - // queue empty? - assert(!io_map.empty()); - - dout(20) << "dequeue_io el_pos " << el_pos << " dir " << el_dir_forward << dendl; - - // find our position: i >= pos - map::iterator i; - - int tries = 2; - while (tries > 0) { - if (el_dir_forward) { - i = io_map.lower_bound(el_pos); - if (i != io_map.end()) { - break; // not at end. good. - } - } else { - i = io_map.upper_bound(el_pos); - if (i != io_map.begin()) { - i--; // and back down one (to get i <= pos). good. - break; - } - } - - // reverse (or initial startup)? 
- if (g_conf.bdev_el_bidir || !el_dir_forward) { - // dout(20) << "restart reversing" << dendl; - el_dir_forward = !el_dir_forward; - } - - if (el_dir_forward) { - // forward - el_pos = 0; - - if (g_conf.bdev_el_fw_max_ms) { - el_stop = g_clock.now(); - utime_t max(0, 1000*g_conf.bdev_el_fw_max_ms); // (s,us), convert ms -> us! - el_stop += max; - // dout(20) << "restart forward sweep for " << max << dendl; - } else { - // dout(20) << "restart fowrard sweep" << dendl; - } - } else { - // reverse - el_pos = bdev->get_num_blocks(); - - if (g_conf.bdev_el_bw_max_ms) { - el_stop = g_clock.now(); - utime_t max(0, 1000*g_conf.bdev_el_bw_max_ms); // (s,us), convert ms -> us! - el_stop += max; - // dout(20) << "restart reverse sweep for " << max << dendl; - } else { - // dout(20) << "restart reverse sweep" << dendl; - } - } - - tries--; - } - - assert(tries > 0); // this shouldn't happen if the queue is non-empty. - - // get some biovecs - int num_bio = 0; - - dout(20) << "dequeue_io starting with " << i->first << " " << *i->second << dendl; - - // merge contiguous ops - char type = i->second->type; // read or write - int num_iovs = 0; // count eventual iov's for readv/writev - - start = i->first; - length = 0; - - if (el_dir_forward) - el_pos = start; - else - el_pos = i->first + i->second->length; - - // while (contiguous) - while ((( el_dir_forward && el_pos == i->first) || - (!el_dir_forward && el_pos == i->first + i->second->length)) && - type == i->second->type) { - biovec *bio = i->second; - - // allowed? (not already submitted to kernel?) - if (block_lock.intersects(bio->start, bio->length)) { - dout(20) << "dequeue_io " << bio->start << "~" << bio->length - << " intersects block_lock " << block_lock << dendl; - break; // stop, or go with what we've got so far - } - - // add to biols - int nv = bio->bl.buffers().size(); // how many iov's in this bio's bufferlist? 
- if (num_bio && - num_iovs + nv >= IOV_MAX) break; // to many //g_conf.bdev_iov_max) break; // too many! - num_iovs += nv; - - start = MIN(start, bio->start); - length += bio->length; - - if (el_dir_forward) { - dout(20) << "dequeue_io fw dequeue io at " << el_pos << " " << *i->second << dendl; - biols.push_back(bio); // add at back - } else { - dout(20) << "dequeue_io bw dequeue io at " << el_pos << " " << *i->second << dendl; - biols.push_front(bio); // add at front - } - num_bio++; - - // move elevator pointer - bool at_end = false; - map::iterator prev = i; - if (el_dir_forward) { - el_pos += bio->length; // cont. next would start right after us - i++; - if (i == io_map.end()) { - at_end = true; - } - } else { - el_pos -= bio->length; - if (i == io_map.begin()) { - at_end = true; - } else { - i--; - } - } - - // dequeue - io_map.erase(prev); - bio->in_queue = 0; - - if (at_end) break; - } - - return num_bio; -} - - - -/******************************************* - * BarrierQueue - */ -#undef dout -#define dout(x) if (x <= g_conf.debug_bdev) *_dout << dbeginl << g_clock.now() << " bdev(" << dev << ").barrierq." - -void BlockDevice::BarrierQueue::barrier() -{ - if (!qls.empty() && qls.front()->empty()) { - assert(qls.size() == 1); - dout(10) << "barrier not adding new queue, front is empty" << dendl; - } else { - qls.push_back(new ElevatorQueue(bdev, dev)); - dout(10) << "barrier adding new elevator queue (now " << qls.size() << "), front queue has " - << qls.front()->size() << " ios left" << dendl; - } -} - -bool BlockDevice::BarrierQueue::bump() -{ - assert(!qls.empty()); - - // is the front queue empty? 
- if (qls.front()->empty() && - qls.front() != qls.back()) { - delete qls.front(); - qls.pop_front(); - dout(10) << "dequeue_io front empty, moving to next queue (" << qls.front()->size() << ")" << dendl; - return true; - } - - return false; -} - -int BlockDevice::BarrierQueue::dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& locked) -{ - assert(!qls.empty()); - int n = qls.front()->dequeue_io(biols, start, length, locked); - bump(); // in case we emptied the front queue - return n; -} - - - - -/******************************************* - * BlockDevice - */ - -#undef dout -#define dout(x) if (x <= g_conf.debug_bdev) *_dout << dbeginl << g_clock.now() << " bdev(" << dev << ")." - - - -block_t BlockDevice::get_num_blocks() -{ - if (!num_blocks) { - assert(fd > 0); - - int r; -#ifdef BLKGETSIZE64 - // ioctl block device - uint64_t bytes = 0; - r = ioctl(fd, BLKGETSIZE64, &bytes); - num_blocks = bytes / (uint64_t)EBOFS_BLOCK_SIZE; - if (r == 0) { - dout(10) << "get_num_blocks ioctl BLKGETSIZE64 reports " - << num_blocks << " 4k blocks, " - << bytes << " bytes" - << dendl; -#else - // hrm, try the 32 bit ioctl? - unsigned long sectors = 0; - r = ioctl(fd, BLKGETSIZE, §ors); - num_blocks = sectors/8ULL; - bytes = sectors*512ULL; - if (r == 0) { - dout(10) << "get_num_blocks ioctl BLKGETSIZE reports " << sectors << " sectors, " - << num_blocks << " 4k blocks, " << bytes << " bytes" << dendl; -#endif - } else { - // hmm, try stat! 
- dout(10) << "get_num_blocks ioctl(2) failed with " << errno << " " << strerror(errno) << ", using stat(2)" << dendl; - struct stat st; - fstat(fd, &st); - uint64_t bytes = st.st_size; - num_blocks = bytes / EBOFS_BLOCK_SIZE; - dout(10) << "get_num_blocks stat reports " << num_blocks << " 4k blocks, " << bytes << " bytes" << dendl; - } - - if (g_conf.bdev_fake_mb) { - num_blocks = g_conf.bdev_fake_mb * 256; - dout(0) << "faking dev size " << g_conf.bdev_fake_mb << " mb" << dendl; - } - if (g_conf.bdev_fake_max_mb && - num_blocks > (block_t)g_conf.bdev_fake_max_mb * 256ULL) { - dout(0) << "faking dev size " << g_conf.bdev_fake_max_mb << " mb" << dendl; - num_blocks = g_conf.bdev_fake_max_mb * 256; - } - - } - return num_blocks; -} - - - -/** io thread - * each worker thread dequeues ios from the root_queue and submits them to the kernel. - */ -void* BlockDevice::io_thread_entry() -{ - lock.Lock(); - - int whoami = io_threads_started++; - io_threads_running++; - assert(io_threads_running <= g_conf.bdev_iothreads); - dout(10) << "io_thread" << whoami << " start, " << io_threads_running << " now running" << dendl; - - // get my own fd (and file position pointer) - int fd = open_fd(); - assert(fd > 0); - - while (!io_stop) { - if (!root_queue.empty()) { - dout(20) << "io_thread" << whoami << "/" << io_threads_running << " going" << dendl; - - block_t start, length; - list biols; - int n = root_queue.dequeue_io(biols, start, length, io_block_lock); - - if (n == 0) { - // failed to dequeue a do-able op, sleep for now - dout(20) << "io_thread" << whoami << "/" << io_threads_running << " couldn't dequeue doable op, sleeping" << dendl; - assert(io_threads_running > 1); // there must be someone else, if we couldn't dequeue something doable. 
- } - else { - // lock blocks - assert(start == biols.front()->start); - io_block_lock.insert(start, length); - - // drop lock to do the io - lock.Unlock(); - do_io(fd, biols); - lock.Lock(); - - // unlock blocks - io_block_lock.erase(start, length); - - // someone might have blocked on our block_lock? - if (io_threads_running < g_conf.bdev_iothreads && - (int)root_queue.size() > io_threads_running) - io_wakeup.SignalAll(); - - // loop again (don't sleep) - continue; - } - } - - // sleep - io_threads_running--; - dout(20) << "io_thread" << whoami << " sleeping, " - << io_threads_running << " threads now running," - << " queue has " << root_queue.size() - << dendl; - - // first wait for signal | timeout? - if (g_conf.bdev_idle_kick_after_ms > 0 && - idle_kicker && - io_threads_running == 0 && !is_idle_waiting) { // only the last thread asleep needs to kick. - // sleep, but just briefly. - dout(20) << "io_thread" << whoami << " doing short wait, to see if i stay idle" << dendl; - is_idle_waiting = true; - int r = io_wakeup.WaitInterval(lock, utime_t(0, g_conf.bdev_idle_kick_after_ms*1000)); - is_idle_waiting = false; - - if (r == ETIMEDOUT) { - dout(20) << "io_thread" << whoami << " timeout expired, kicking ebofs" << dendl; - kicker_cond.Signal(); // signal kicker thread - } else { - dout(20) << "io_thread" << whoami << " signaled during short sleep, waking up" << dendl; - goto wake_up; - } - } - - // sleeeep - io_wakeup.Wait(lock); // and wait (if condition still holds) - - wake_up: - io_threads_running++; - assert(io_threads_running <= g_conf.bdev_iothreads); - dout(20) << "io_thread" << whoami << "/" << io_threads_running << " woke up, " << io_threads_running << " threads now running" << dendl; - } - - // clean up - ::close(fd); - io_threads_running--; - - lock.Unlock(); - - dout(10) << "io_thread" << whoami << " finish" << dendl; - return 0; -} - - - -/** do_io - * do a single io operation - * (lock is NOT held, but we own the *biovec) - */ -void 
BlockDevice::do_io(int fd, list& biols) -{ - int r; - assert(!biols.empty()); - - // get full range, type, bl - bufferlist bl; - bl.claim(biols.front()->bl); - block_t start = biols.front()->start; - block_t length = biols.front()->length; - char type = biols.front()->type; - - list::iterator p = biols.begin(); - int numbio = 1; - for (p++; p != biols.end(); p++) { - length += (*p)->length; - bl.claim_append((*p)->bl); - numbio++; - } - - // do it - dout(20) << "do_io start " << (type==biovec::IO_WRITE?"write":"read") - << " " << start << "~" << length - << " " << numbio << " bits" << dendl; - if (type == biovec::IO_WRITE) { - r = _write(fd, start, length, bl); - } else if (type == biovec::IO_READ) { - r = _read(fd, start, length, bl); - } else assert(0); - dout(20) << "do_io finish " << (type==biovec::IO_WRITE?"write":"read") - << " " << start << "~" << length << dendl; - - // set rval - for (p = biols.begin(); p != biols.end(); p++) - (*p)->rval = r; - - if (1) { - // put in completion queue - complete_lock.Lock(); - complete_queue.splice( complete_queue.end(), biols ); - complete_queue_len += numbio; - complete_wakeup.Signal(); - complete_lock.Unlock(); - dout(20) << "do_io kicked completer on " << (type==biovec::IO_WRITE?"write":"read") - << " " << start << "~" << length << dendl; - - } else { - // be slow and finish synchronously - for (p = biols.begin(); p != biols.end(); p++) - finish_io(*p); - } -} - - -/** finish_io - * - * finish an io by signaling the cond or performing a callback. - * called by completion thread, unless that's disabled above. - */ -void BlockDevice::finish_io(biovec *bio) -{ - bio->done = true; - if (bio->cond) { - lock.Lock(); // hmm? 
- bio->cond->Signal(); - lock.Unlock(); - } - else if (bio->cb) { - bio->cb->finish((ioh_t)bio, bio->rval); - delete bio->cb; - delete bio; - } -} - -/*** completion_thread - * handle Cond signals or callbacks for completed ios - */ -void* BlockDevice::complete_thread_entry() -{ - complete_lock.Lock(); - dout(10) << "complete_thread start" << dendl; - - while (!io_stop) { - - while (!complete_queue.empty()) { - list ls; - ls.swap(complete_queue); - dout(10) << "complete_thread grabbed " << complete_queue_len << " biovecs" << dendl; - complete_queue_len = 0; - - complete_lock.Unlock(); - - // finish - for (list::iterator p = ls.begin(); - p != ls.end(); - p++) { - biovec *bio = *p; - dout(20) << "complete_thread finishing " << *bio << dendl; - finish_io(bio); - } - - complete_lock.Lock(); - } - if (io_stop) break; - - dout(25) << "complete_thread sleeping" << dendl; - complete_wakeup.Wait(complete_lock); - } - - dout(10) << "complete_thread finish" << dendl; - complete_lock.Unlock(); - return 0; -} - - -/*** idle kicker thread - * kick ebofs when we're idle. we're a separate thread (yuck) - * because ebofs may be holding it's lock _and_ waiting for us - * to do useful work. that rules out io_thread and complete_thread! - */ -void* BlockDevice::kicker_thread_entry() -{ - lock.Lock(); - dout(10) << "kicker_thread start" << dendl; - - while (!io_stop) { - - if (io_threads_running == 0 && idle_kicker) { - dout(25) << "kicker_thread kicking ebofs" << dendl; - lock.Unlock(); - idle_kicker->kick(); - lock.Lock(); - dout(25) << "kicker_thread done kicking ebofs" << dendl; - } - if (io_stop) break; - - dout(25) << "kicker_thread sleeping" << dendl; - kicker_cond.Wait(lock); - } - - dout(10) << "kicker_thread finish" << dendl; - lock.Unlock(); - return 0; -} - - - - -// io queue - -void BlockDevice::_submit_io(biovec *b) -{ - // NOTE: lock must be held - dout(15) << "_submit_io " << *b << dendl; - - // wake up io_thread(s)? 
- if ((int)root_queue.size() == io_threads_running) - io_wakeup.SignalOne(); - else if ((int)root_queue.size() > io_threads_running) - io_wakeup.SignalAll(); - - // queue - root_queue.submit_io(b); - - /* - // [DEBUG] check for overlapping ios - // BUG: this doesn't detect all overlaps w/ the next queue thing. - if (g_conf.bdev_debug_check_io_overlap) { - // BUG: this doesn't catch everything! eg 1~10000000 will be missed.... - multimap::iterator p = io_queue.lower_bound(b->start); - if ((p != io_queue.end() && - p->first < b->start+b->length) || - (p != io_queue.begin() && - (p--, p->second->start + p->second->length > b->start))) { - dout(1) << "_submit_io new io " << *b - << " overlaps with existing " << *p->second << dendl; - cerr << "_submit_io new io " << *b - << " overlaps with existing " << *p->second << dendl; - } - } - */ - -} - -int BlockDevice::_cancel_io(biovec *bio) -{ - // NOTE: lock must be held - - if (bio->in_queue == 0) { - dout(15) << "_cancel_io " << *bio << " FAILED" << dendl; - return -1; - } else { - dout(15) << "_cancel_io " << *bio << dendl; - bio->in_queue->cancel_io(bio); - if (root_queue.bump()) - io_wakeup.SignalAll(); // something happened! 
- return 0; - } -} - - - -// low level io - -int BlockDevice::_read(int fd, block_t bno, unsigned num, bufferlist& bl) -{ - dout(10) << "_read " << bno << "~" << num << dendl; - - assert(fd > 0); - - off_t offset = bno * EBOFS_BLOCK_SIZE; - off_t actual = lseek(fd, offset, SEEK_SET); - assert(actual == offset); - - size_t len = num*EBOFS_BLOCK_SIZE; - assert(bl.length() >= len); - - struct iovec iov[ bl.buffers().size() ]; - int n = 0; - size_t left = len; - for (list::const_iterator i = bl.buffers().begin(); - i != bl.buffers().end(); - i++) { - assert(i->length() % EBOFS_BLOCK_SIZE == 0); - - iov[n].iov_base = (void*)i->c_str(); - iov[n].iov_len = MIN(left, i->length()); - - left -= iov[n].iov_len; - n++; - if (left == 0) break; - } - - int got = ::readv(fd, iov, n); - assert(got <= (int)len); - - return 0; -} - -int BlockDevice::_write(int fd, unsigned bno, unsigned num, bufferlist& bl) -{ - dout(10) << "_write " << bno << "~" << num << dendl; - - assert(fd > 0); - - while (1) { - off_t offset = (off_t)bno << EBOFS_BLOCK_BITS; - assert((off_t)bno * (off_t)EBOFS_BLOCK_SIZE == offset); - off_t actual = lseek(fd, offset, SEEK_SET); - assert(actual == offset); - - // write buffers - size_t len = num*EBOFS_BLOCK_SIZE; - - struct iovec iov[ bl.buffers().size() ]; - - int n = 0; - size_t left = len; - for (list::const_iterator i = bl.buffers().begin(); - i != bl.buffers().end(); - i++) { - assert(i->length() % EBOFS_BLOCK_SIZE == 0); - - iov[n].iov_base = (void*)i->c_str(); - iov[n].iov_len = MIN(left, i->length()); - - assert((((intptr_t)iov[n].iov_base) & ((intptr_t)4095ULL)) == 0); - assert((iov[n].iov_len & 4095) == 0); - - left -= iov[n].iov_len; - n++; - if (left == 0 || - n == IOV_MAX) break; - } - - int r = ::writev(fd, iov, n); - - if (r < 0) { - dout(1) << "couldn't write bno " << bno << " num " << num - << " (" << len << " bytes) in " << n << " iovs, r=" << r - << " errno " << errno << " " << strerror(errno) << dendl; - dout(1) << "bl is " << bl << dendl; - 
assert(0); - } else if (r < (int)len) { - // hrm, we didn't write _all_ of our data. WTF kind of FS is this? - dout(-1) << "bloody hell, writev only wrote " << r << " of " << len << " bytes, looping" << dendl; - assert(r % 4096 == 0); - int wrote = r / 4096; - bno += wrote; - num -= wrote; - bufferlist tail; - tail.substr_of(bl, r, len-r); - bl.claim(tail); - continue; - } else { - // yay - assert(r == (int)len); - break; - } - } - return 0; -} - - - -// open/close - -int BlockDevice::open_fd() -{ -#ifdef DARWIN - int fd = ::open(dev.c_str(), O_RDWR|O_SYNC, 0); - ::fcntl(fd, F_NOCACHE); - return fd; -#else - return ::open(dev.c_str(), O_RDWR|O_SYNC|O_DIRECT, 0); -#endif -} - -int BlockDevice::open(kicker *idle) -{ - assert(fd == 0); - - // open? - fd = open_fd(); - if (fd < 0) { - dout(1) << "open failed, r = " << fd << " " << strerror(errno) << dendl; - fd = 0; - return -1; - } - - // lock - if (g_conf.bdev_lock) { - int r = ::flock(fd, LOCK_EX|LOCK_NB); - if (r < 0) { - derr(1) << "open " << dev << " failed to get LOCK_EX" << dendl; - return -1; - } - } - - // figure size - block_t b = get_num_blocks(); - if (!b) { - dout(0) << "open can't determine size of device" << dendl; - assert(0); - } - dout(2) << "open " << b << " blocks, " << b*4096 << " bytes" << dendl; - - // start thread - io_threads_started = 0; - io_threads.clear(); - for (int i=0; icreate(); - } - complete_thread.create(); - kicker_thread.create(); - - // idle kicker? 
- idle_kicker = idle; - - return fd; -} - - -/* - * warning: ebofs shoudl drop it's lock before calling close(), - * or else deadlock against the idle kicker - */ -int BlockDevice::close() -{ - assert(fd>0); - - idle_kicker = 0; - - // shut down io thread - dout(10) << "close stopping io+complete threads" << dendl; - lock.Lock(); - complete_lock.Lock(); - io_stop = true; - io_wakeup.SignalAll(); - complete_wakeup.SignalAll(); - kicker_cond.Signal(); - complete_lock.Unlock(); - lock.Unlock(); - - for (int i=0; ijoin(); - delete io_threads[i]; - } - io_threads.clear(); - - complete_thread.join(); - kicker_thread.join(); - - io_stop = false; // in case we start again - - dout(2) << "close " << dendl; - - if (g_conf.bdev_lock) - ::flock(fd, LOCK_UN); - - ::close(fd); - fd = 0; - - return 0; -} - -int BlockDevice::cancel_io(ioh_t ioh) -{ - biovec *pbio = (biovec*)ioh; - - lock.Lock(); - int r = _cancel_io(pbio); - lock.Unlock(); - - // FIXME? - if (r == 0 && pbio->cb) { - //pbio->cb->finish(ioh, 0); - delete pbio->cb; - delete pbio; - } - - return r; -} - diff --git a/branches/sage/mds/ebofs/BlockDevice.h b/branches/sage/mds/ebofs/BlockDevice.h deleted file mode 100644 index 295ea6b55b75f..0000000000000 --- a/branches/sage/mds/ebofs/BlockDevice.h +++ /dev/null @@ -1,351 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __EBOFS_BLOCKDEVICE_H -#define __EBOFS_BLOCKDEVICE_H - -#include "include/buffer.h" -#include "include/interval_set.h" -#include "include/Context.h" -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/Thread.h" - -#include "types.h" - - -typedef void *ioh_t; // opaque handle to an io request. (in actuality, a biovec*) - - -class BlockDevice { - public: - // callback type for io completion notification - class callback { - public: - virtual ~callback() {} - virtual void finish(ioh_t ioh, int rval) = 0; - }; - - // kicker for idle notification - class kicker { - public: - virtual ~kicker() {} - virtual void kick() = 0; - }; - - - /********************************************************/ - - class Queue; - - // io item - // two variants: one with Cond*, one with callback*. - class biovec { - public: - static const char IO_WRITE = 1; - static const char IO_READ = 2; - - char type; - block_t start, length; - bufferlist bl; - callback *cb; - Cond *cond; - int rval; - char *note; - bool done; - - Queue *in_queue; - - biovec(char t, block_t s, block_t l, bufferlist& b, callback *c, char *n=0) : - type(t), start(s), length(l), bl(b), cb(c), cond(0), rval(0), note(n), done(false), in_queue(0) {} - biovec(char t, block_t s, block_t l, bufferlist& b, Cond *c, char *n=0) : - type(t), start(s), length(l), bl(b), cb(0), cond(c), rval(0), note(n), done(false), in_queue(0) {} - }; - friend ostream& operator<<(ostream& out, biovec &bio); - - - /********************************************************/ - - /* - * Queue -- abstract IO queue interface - */ - class Queue { - public: - virtual ~Queue() {} - virtual void submit_io(biovec *b) = 0; - virtual void cancel_io(biovec *b) = 0; - virtual int dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& locked) = 0; - virtual int size() = 0; - virtual bool empty() { return size() == 0; } - }; - - /* - * ElevatorQueue - simple elevator scheduler queue - */ - class ElevatorQueue 
: public Queue { - BlockDevice *bdev; - const char *dev; - map io_map; - bool el_dir_forward; - block_t el_pos; - utime_t el_stop; - - public: - ElevatorQueue(BlockDevice *bd, const char *d) : - bdev(bd), dev(d), - el_dir_forward(false), - el_pos(0) {} - void submit_io(biovec *b) { - b->in_queue = this; - assert(io_map.count(b->start) == 0); - io_map[b->start] = b; - } - void cancel_io(biovec *b) { - assert(b->in_queue == this); - assert(io_map.count(b->start) && - io_map[b->start] == b); - io_map.erase(b->start); - b->in_queue = 0; - } - int dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& locked); - int size() { - return io_map.size(); - } - }; - - /* - * BarrierQueue - lets you specify io "barriers" - * barrier() - force completion of all prior IOs before - * future ios are started. - * bump() - must be called after cancel_io to properly - * detect empty subqueue. - */ - class BarrierQueue : public Queue { - BlockDevice *bdev; - const char *dev; - list qls; - public: - BarrierQueue(BlockDevice *bd, const char *d) : bdev(bd), dev(d) { - barrier(); - } - ~BarrierQueue() { - for (list::iterator p = qls.begin(); - p != qls.end(); - ++p) - delete *p; - qls.clear(); - } - int size() { - // this isn't perfectly accurate. - if (!qls.empty()) - return qls.front()->size(); - return 0; - } - void submit_io(biovec *b) { - assert(!qls.empty()); - qls.back()->submit_io(b); - } - void cancel_io(biovec *b) { - assert(0); // shouldn't happen. - } - int dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& locked); - void barrier(); - bool bump(); - }; - - - private: - string dev; // my device file - int fd; - block_t num_blocks; - - Mutex lock; - - /** the root io queue. - * i current assumeit's a barrier queue,but this can be changed - * with some minor rearchitecting. 
- */ - BarrierQueue root_queue; - - /* io_block_lock - block ranges current dispatched to kernel - * once a bio is dispatched, it cannot be canceled, so an overlapping - * io and be submitted. the overlapping io cannot be dispatched - * to the kernel, however, until the original io finishes, or else - * there will be a race condition. - */ - interval_set io_block_lock; // blocks currently dispatched to kernel - - // io threads - Cond io_wakeup; - bool io_stop; - int io_threads_started, io_threads_running; - bool is_idle_waiting; - - void *io_thread_entry(); - - class IOThread : public Thread { - BlockDevice *dev; - public: - IOThread(BlockDevice *d) : dev(d) {} - void *entry() { return (void*)dev->io_thread_entry(); } - } ; - - vector io_threads; - - // private io interface - int open_fd(); // get an fd (for a thread) - - void _submit_io(biovec *b); - int _cancel_io(biovec *bio); - void do_io(int fd, list& biols); // called by an io thread - - // low level io - int _read(int fd, block_t bno, unsigned num, bufferlist& bl); - int _write(int fd, unsigned bno, unsigned num, bufferlist& bl); - - - // completion callback queue - Mutex complete_lock; - Cond complete_wakeup; - list complete_queue; - int complete_queue_len; - - void finish_io(biovec *bio); - - // complete thread - void *complete_thread_entry(); - class CompleteThread : public Thread { - BlockDevice *dev; - public: - CompleteThread(BlockDevice *d) : dev(d) {} - void *entry() { return (void*)dev->complete_thread_entry(); } - } complete_thread; - - // kicker - kicker *idle_kicker; // not used.. 
- Mutex kicker_lock; - Cond kicker_cond; - void *kicker_thread_entry(); - class KickerThread : public Thread { - BlockDevice *dev; - public: - KickerThread(BlockDevice *d) : dev(d) {} - void *entry() { return (void*)dev->complete_thread_entry(); } - } kicker_thread; - - - - public: - BlockDevice(const char *d) : - dev(d), fd(0), num_blocks(0), - root_queue(this, dev.c_str()), - io_stop(false), io_threads_started(0), io_threads_running(0), is_idle_waiting(false), - complete_queue_len(0), - complete_thread(this), - idle_kicker(0), kicker_thread(this) { } - ~BlockDevice() { - if (fd > 0) close(); - } - - // get size in blocks - block_t get_num_blocks(); - const char *get_device_name() const { return dev.c_str(); } - - // open/close - int open(kicker *idle = 0); - int close(); - - // state stuff - bool is_idle() { - lock.Lock(); - bool idle = (io_threads_running == 0) && root_queue.empty(); - lock.Unlock(); - return idle; - } - void barrier() { - lock.Lock(); - root_queue.barrier(); - lock.Unlock(); - } - - // ** blocking interface ** - - // read - int read(block_t bno, unsigned num, bufferptr& bptr, char *n=0) { - bufferlist bl; - bl.push_back(bptr); - return read(bno, num, bl, n); - } - int read(block_t bno, unsigned num, bufferlist& bl, char *n=0) { - Cond c; - biovec bio(biovec::IO_READ, bno, num, bl, &c, n); - - lock.Lock(); - _submit_io(&bio); - barrier(); // need this, to prevent starvation! - while (!bio.done) - c.Wait(lock); - lock.Unlock(); - return bio.rval; - } - - // write - int write(unsigned bno, unsigned num, bufferptr& bptr, char *n=0) { - bufferlist bl; - bl.push_back(bptr); - return write(bno, num, bl, n); - } - int write(unsigned bno, unsigned num, bufferlist& bl, char *n=0) { - Cond c; - biovec bio(biovec::IO_WRITE, bno, num, bl, &c, n); - - lock.Lock(); - _submit_io(&bio); - barrier(); // need this, to prevent starvation! 
- while (!bio.done) - c.Wait(lock); - lock.Unlock(); - return bio.rval; - } - - // ** non-blocking interface ** - ioh_t read(block_t bno, unsigned num, bufferlist& bl, callback *fin, char *n=0) { - biovec *pbio = new biovec(biovec::IO_READ, bno, num, bl, fin, n); - lock.Lock(); - _submit_io(pbio); - lock.Unlock(); - return (ioh_t)pbio; - } - ioh_t write(block_t bno, unsigned num, bufferlist& bl, callback *fin, char *n=0) { - biovec *pbio = new biovec(biovec::IO_WRITE, bno, num, bl, fin, n); - lock.Lock(); - _submit_io(pbio); - lock.Unlock(); - return (ioh_t)pbio; - } - int cancel_io(ioh_t ioh); - -}; - - - - -#endif diff --git a/branches/sage/mds/ebofs/BufferCache.cc b/branches/sage/mds/ebofs/BufferCache.cc deleted file mode 100644 index b1c98455f8278..0000000000000 --- a/branches/sage/mds/ebofs/BufferCache.cc +++ /dev/null @@ -1,1228 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "BufferCache.h" -#include "Onode.h" - - -/*********** BufferHead **************/ - - -#undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) *_dout << dbeginl << g_clock.now() << " ebofs.bh." - - - - - - -/************ ObjectCache **************/ - - -#undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) *_dout << dbeginl << g_clock.now() << " ebofs.oc." - - - -void ObjectCache::rx_finish(ioh_t ioh, block_t start, block_t length, bufferlist& bl) -{ - list waiters; - - dout(10) << "rx_finish " << start << "~" << length << dendl; - for (map::iterator p = data.lower_bound(start); - p != data.end(); - p++) { - BufferHead *bh = p->second; - dout(10) << "rx_finish ?" 
<< *bh << dendl; - assert(p->first == bh->start()); - - // past? - if (p->first >= start+length) break; - if (bh->end() > start+length) break; // past - - assert(p->first >= start); - assert(bh->end() <= start+length); - - dout(10) << "rx_finish !" << *bh << dendl; - - if (bh->rx_ioh == ioh) - bh->rx_ioh = 0; - - if (bh->is_rx()) { - assert(bh->get_version() == 0); - assert(bh->end() <= start+length); - assert(bh->start() >= start); - dout(10) << "rx_finish rx -> clean on " << *bh << dendl; - bh->data.substr_of(bl, (bh->start()-start)*EBOFS_BLOCK_SIZE, bh->length()*EBOFS_BLOCK_SIZE); - bc->mark_clean(bh); - } - else if (bh->is_partial()) { - dout(10) << "rx_finish partial -> tx on " << *bh << dendl; - - if (1) { - // double-check what block i am - vector exv; - on->map_extents(bh->start(), 1, exv); - assert(exv.size() == 1); - block_t cur_block = exv[0].start; - assert(cur_block == bh->partial_tx_to); - } - - // ok, cancel my low-level partial (since we're still here, and can bh_write ourselves) - bc->cancel_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial_tx_epoch ); - - // apply partial to myself - assert(bh->data.length() == 0); - bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - bh->data.push_back( bp ); - bh->data.copy_in(0, EBOFS_BLOCK_SIZE, bl); - bh->apply_partial(); - - // write "normally" - bc->mark_dirty(bh); - bc->bh_write(on, bh, bh->partial_tx_to);//cur_block); - - // clean up a bit - bh->partial_tx_to = 0; - bh->partial_tx_epoch = 0; - bh->partial.clear(); - } - else { - dout(10) << "rx_finish ignoring status on (dirty|tx|clean) " << *bh << dendl; - assert(bh->is_dirty() || // was overwritten - bh->is_tx() || // was overwritten and queued - bh->is_clean()); // was overwritten, queued, _and_ flushed to disk - } - - // trigger waiters - for (map >::iterator p = bh->waitfor_read.begin(); - p != bh->waitfor_read.end(); - p++) { - assert(p->first >= bh->start() && p->first < bh->end()); - waiters.splice(waiters.begin(), p->second); 
- } - bh->waitfor_read.clear(); - } - - finish_contexts(waiters); -} - - -void ObjectCache::tx_finish(ioh_t ioh, block_t start, block_t length, - version_t version, version_t epoch) -{ - dout(10) << "tx_finish " << start << "~" << length << " v" << version << dendl; - for (map::iterator p = data.lower_bound(start); - p != data.end(); - p++) { - BufferHead *bh = p->second; - dout(30) << "tx_finish ?bh " << *bh << dendl; - assert(p->first == bh->start()); - - // past? - if (p->first >= start+length) { - bh->oc->try_merge_bh_right(p); - break; - } - - if (bh->tx_ioh == ioh) - bh->tx_ioh = 0; - - if (!bh->is_tx()) { - dout(10) << "tx_finish bh not marked tx, skipping" << dendl; - continue; - } - assert(bh->is_tx()); - - if (version == bh->version) { - dout(10) << "tx_finish tx -> clean on " << *bh << dendl; - assert(bh->end() <= start+length); - bh->set_last_flushed(version); - bc->mark_clean(bh); - bh->oc->try_merge_bh_left(p); - } else { - dout(10) << "tx_finish leaving tx, " << bh->version << " > " << version - << " on " << *bh << dendl; - assert(bh->version > version); - } - } -} - - - -/* - * return any bh's that are (partially) in this range that are TX. - */ -int ObjectCache::find_tx(block_t start, block_t len, - list& tx) -{ - map::iterator p = data.lower_bound(start); - - block_t cur = start; - block_t left = len; - - /* don't care about overlap, we want things _fully_ in start~len. - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - */ - - while (left > 0) { - assert(cur+left == start+len); - - // at end? - if (p == data.end()) - break; - - if (p->first <= cur) { - // have it (or part of it) - BufferHead *e = p->second; - - if (e->end() <= start+len && - e->is_tx()) - tx.push_back(e); - - block_t lenfromcur = MIN(e->end() - cur, left); - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; // more? 
- } else if (p->first > cur) { - // gap.. miss - block_t next = p->first; - left -= (next-cur); - cur = next; - continue; - } - else - assert(0); - } - - return 0; -} - - -int ObjectCache::try_map_read(block_t start, block_t len) -{ - map::iterator p = data.lower_bound(start); - - block_t cur = start; - block_t left = len; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - - int num_missing = 0; - - while (left > 0) { - // at end? - if (p == data.end()) { - // rest is a miss. - vector exv; - on->map_extents(cur, - left, // no prefetch here! - exv); - - num_missing += exv.size(); - left = 0; - cur = start+len; - break; - } - - if (p->first <= cur) { - // have it (or part of it) - BufferHead *e = p->second; - - if (e->is_clean() || - e->is_dirty() || - e->is_tx()) { - dout(20) << "try_map_read hit " << *e << dendl; - } - else if (e->is_rx()) { - dout(20) << "try_map_read rx " << *e << dendl; - num_missing++; - } - else if (e->is_partial()) { - dout(-20) << "try_map_read partial " << *e << dendl; - num_missing++; - } - else { - dout(0) << "try_map_read got unexpected " << *e << dendl; - assert(0); - } - - block_t lenfromcur = MIN(e->end() - cur, left); - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; // more? - } else if (p->first > cur) { - // gap.. miss - block_t next = p->first; - vector exv; - on->map_extents(cur, - MIN(next-cur, left), // no prefetch - exv); - - dout(20) << "try_map_read gap of " << p->first-cur << " blocks, " - << exv.size() << " extents" << dendl; - num_missing += exv.size(); - left -= (p->first - cur); - cur = p->first; - continue; // more? - } - else - assert(0); - } - - assert(left == 0); - assert(cur == start+len); - return num_missing; -} - - - - - -/* - * map a range of blocks into buffer_heads. - * - create missing buffer_heads as necessary. 
- * - fragment along disk extent boundaries - */ -int ObjectCache::map_read(block_t start, block_t len, - map& hits, - map& missing, - map& rx, - map& partial) { - - map::iterator p = data.lower_bound(start); - - block_t cur = start; - block_t left = len; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - - while (left > 0) { - // at end? - if (p == data.end()) { - // rest is a miss. - vector exv; - //on->map_extents(cur, left, exv); // we might consider some prefetch here. - on->map_extents(cur, - //MIN(left + g_conf.ebofs_max_prefetch, // prefetch - //on->object_blocks-cur), - left, // no prefetch - exv); - for (unsigned i=0; i 0; i++) { - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( exv[i].length ); - bc->add_bh(n); - missing[cur] = n; - dout(20) << "map_read miss " << left << " left, " << *n << dendl; - cur += MIN(left,exv[i].length); - left -= MIN(left,exv[i].length); - } - assert(left == 0); - assert(cur == start+len); - break; - } - - if (p->first <= cur) { - // have it (or part of it) - BufferHead *e = p->second; - - if (e->is_clean() || - e->is_dirty() || - e->is_tx()) { - hits[cur] = e; // readable! - dout(20) << "map_read hit " << *e << dendl; - bc->touch(e); - } - else if (e->is_rx()) { - rx[cur] = e; // missing, not readable. - dout(20) << "map_read rx " << *e << dendl; - } - else if (e->is_partial()) { - partial[cur] = e; - dout(20) << "map_read partial " << *e << dendl; - } - else { - dout(0) << "map_read ??? got unexpected " << *e << dendl; - assert(0); - } - - block_t lenfromcur = MIN(e->end() - cur, left); - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; // more? - } else if (p->first > cur) { - // gap.. 
miss - block_t next = p->first; - vector exv; - on->map_extents(cur, - //MIN(next-cur, MIN(left + g_conf.ebofs_max_prefetch, // prefetch - // on->object_blocks-cur)), - MIN(next-cur, left), // no prefetch - exv); - - for (unsigned i=0; i0; i++) { - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( exv[i].length ); - bc->add_bh(n); - missing[cur] = n; - cur += MIN(left, n->length()); - left -= MIN(left, n->length()); - dout(20) << "map_read gap " << *n << dendl; - } - continue; // more? - } - else - assert(0); - } - - assert(left == 0); - assert(cur == start+len); - return 0; -} - - -/* - * map a range of pages on an object's buffer cache. - * - * - break up bufferheads that don't fall completely within the range - * - cancel rx ops we obsolete. - * - resubmit rx ops if we split bufferheads - * - * - leave potentially obsoleted tx ops alone (for now) - * - don't worry about disk extent boundaries (yet) - */ -int ObjectCache::map_write(block_t start, block_t len, - map& hits, - version_t super_epoch) -{ - map::iterator p; - - // hack speed up common cases - if (start == 0) { - p = data.begin(); - } else if (start + len == on->object_blocks && len == 1 && !data.empty()) { - // append hack. - p = data.end(); - p--; - if (p->first < start) p++; - } else { - p = data.lower_bound(start); - } - - dout(10) << "map_write " << *on << " " << start << "~" << len << dendl; - // p->first >= start - - block_t cur = start; - block_t left = len; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - - //dump(); - - while (left > 0) { - // max for this bh (bc of (re)alloc on disk) - block_t max = left; - - // based on disk extent boundary ... - vector exv; - on->map_extents(cur, max, exv); - if (exv.size() > 1) - max = exv[0].length; - - dout(10) << "map_write " << cur << "~" << max << dendl; - - // at end? 
- if (p == data.end()) { - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( max ); - bc->add_bh(n); - hits[cur] = n; - left -= max; - cur += max; - continue; - } - - dout(10) << "p is " << *p->second << dendl; - - - if (p->first <= cur) { - BufferHead *bh = p->second; - dout(10) << "map_write bh " << *bh << " intersected" << dendl; - - if (p->first < cur) { - if (cur+max >= p->first+p->second->length()) { - // we want right bit (one splice) - if (bh->is_rx() && bc->bh_cancel_read(bh)) { - BufferHead *right = bc->split(bh, cur); - bc->bh_read(on, bh); // reread left bit - bh = right; - } else if (bh->is_tx() && bh->epoch_modified == super_epoch && bc->bh_cancel_write(bh, super_epoch)) { - BufferHead *right = bc->split(bh, cur); - bc->bh_write(on, bh); // rewrite left bit - bh = right; - } else { - bh = bc->split(bh, cur); // just split it - } - p++; - assert(p->second == bh); - } else { - // we want middle bit (two splices) - if (bh->is_rx() && bc->bh_cancel_read(bh)) { - BufferHead *middle = bc->split(bh, cur); - bc->bh_read(on, bh); // reread left - p++; - assert(p->second == middle); - BufferHead *right = bc->split(middle, cur+max); - bc->bh_read(on, right); // reread right - bh = middle; - } else if (bh->is_tx() && bh->epoch_modified == super_epoch && bc->bh_cancel_write(bh, super_epoch)) { - BufferHead *middle = bc->split(bh, cur); - bc->bh_write(on, bh); // redo left - p++; - assert(p->second == middle); - BufferHead *right = bc->split(middle, cur+max); - bc->bh_write(on, right); // redo right - bh = middle; - } else { - BufferHead *middle = bc->split(bh, cur); - p++; - assert(p->second == middle); - bc->split(middle, cur+max); - bh = middle; - } - } - } else if (p->first == cur) { - if (p->second->length() <= max) { - // whole bufferhead, piece of cake. 
- } else { - // we want left bit (one splice) - if (bh->is_rx() && bc->bh_cancel_read(bh)) { - BufferHead *right = bc->split(bh, cur+max); - bc->bh_read(on, right); // re-rx the right bit - } else if (bh->is_tx() && bh->epoch_modified == super_epoch && bc->bh_cancel_write(bh, super_epoch)) { - BufferHead *right = bc->split(bh, cur+max); - bc->bh_write(on, right); // re-tx the right bit - } else { - bc->split(bh, cur+max); // just split - } - } - } - - // try to cancel tx? - if (bh->is_tx() && bh->epoch_modified == super_epoch) bc->bh_cancel_write(bh, super_epoch); - - // put in our map - hits[cur] = bh; - - // keep going. - block_t lenfromcur = bh->end() - cur; - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; - } else { - // gap! - block_t next = p->first; - block_t glen = MIN(next-cur, max); - dout(10) << "map_write gap " << cur << "~" << glen << dendl; - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( glen ); - bc->add_bh(n); - hits[cur] = n; - - cur += glen; - left -= glen; - continue; // more? - } - } - - assert(left == 0); - assert(cur == start+len); - return 0; -} - -/* don't need this. -int ObjectCache::scan_versions(block_t start, block_t len, - version_t& low, version_t& high) -{ - map::iterator p = data.lower_bound(start); - // p->first >= start - - if (p != data.begin() && p->first > start) { - p--; // might overlap? - if (p->first + p->second->length() <= start) - p++; // doesn't overlap. - } - if (p->first >= start+len) - return -1; // to the right. no hits. - - // start - low = high = p->second->get_version(); - - for (p++; p != data.end(); p++) { - // past? 
- if (p->first >= start+len) break; - - const version_t v = p->second->get_version(); - if (low > v) low = v; - if (high < v) high = v; - } - - return 0; -} -*/ - -void ObjectCache::touch_bottom(block_t bstart, block_t blast) -{ - for (map::iterator p = data.lower_bound(bstart); - p != data.end(); - ++p) { - BufferHead *bh = p->second; - - // don't trim unless it's entirely in our range - if (bh->start() < bstart) continue; - if (bh->end() > blast) break; - - dout(12) << "moving " << *bh << " to bottom of lru" << dendl; - bc->touch_bottom(bh); // move to bottom of lru list - } -} - - -void ObjectCache::truncate(block_t blocks, version_t super_epoch) -{ - dout(7) << "truncate " << object_id - << " " << blocks << " blocks" - << dendl; - - while (!data.empty()) { - block_t bhoff = data.rbegin()->first; - BufferHead *bh = data.rbegin()->second; - - if (bh->end() <= blocks) break; - - bool uncom = on->uncommitted.contains(bh->start(), bh->length()); - dout(10) << "truncate " << *bh << " uncom " << uncom - << " of " << on->uncommitted - << dendl; - - if (bhoff < blocks) { - // we want right bit (one splice) - if (bh->is_rx() && bc->bh_cancel_read(bh)) { - BufferHead *right = bc->split(bh, blocks); - bc->bh_read(on, bh); // reread left bit - bh = right; - } else if (bh->is_tx() && uncom && bh->epoch_modified == super_epoch && bc->bh_cancel_write(bh, super_epoch)) { - BufferHead *right = bc->split(bh, blocks); - bc->bh_write(on, bh); // rewrite left bit - bh = right; - } else { - bh = bc->split(bh, blocks); // just split it - } - // no worries about partials up here, they're always 1 block (and thus never split) - } else { - // whole thing - // cancel any pending/queued io, if possible. 
- if (bh->is_rx()) - bc->bh_cancel_read(bh); - if (bh->is_tx() && uncom && bh->epoch_modified == super_epoch) - bc->bh_cancel_write(bh, super_epoch); - if (bh->shadow_of) { - dout(10) << "truncate " << *bh << " unshadowing " << *bh->shadow_of << dendl; - // shadow - bh->shadow_of->remove_shadow(bh); - if (bh->is_partial()) - bc->cancel_shadow_partial(bh->rx_from.start, bh); - } else { - // normal - if (bh->is_partial() && uncom) - bc->bh_cancel_partial_write(bh); - } - } - - for (map >::iterator p = bh->waitfor_read.begin(); - p != bh->waitfor_read.end(); - p++) { - finish_contexts(p->second, -1); - } - - bc->remove_bh(bh); - delete bh; - } -} - - -void ObjectCache::clone_to(Onode *other) -{ - ObjectCache *ton = 0; - - for (map::iterator p = data.begin(); - p != data.end(); - p++) { - BufferHead *bh = p->second; - dout(10) << "clone_to ? " << *bh << dendl; - if (bh->is_dirty() || bh->is_tx() || bh->is_partial()) { - // dup dirty or tx bh's - if (!ton) - ton = other->get_oc(bc); - BufferHead *nbh = new BufferHead(ton); - nbh->set_start( bh->start() ); - nbh->set_length( bh->length() ); - nbh->data = bh->data; // just copy refs to underlying buffers. - bc->add_bh(nbh); - - if (bh->is_partial()) { - dout(0) << "clone_to PARTIAL FIXME NOT FULLY IMPLEMENTED ******" << dendl; - nbh->partial = bh->partial; - bc->mark_partial(nbh); - // register as shadow_partial - bc->add_shadow_partial(bh->rx_from.start, nbh); - } else { - // clean buffer will shadow - bh->add_shadow(nbh); - bc->mark_clean(nbh); - } - - dout(10) << "clone_to dup " << *bh << " -> " << *nbh << dendl; - } - } -} - - - -BufferHead *ObjectCache::merge_bh_left(BufferHead *left, BufferHead *right) -{ - dout(10) << "merge_bh_left " << *left << " " << *right << dendl; - assert(left->end() == right->start()); - assert(left->is_clean()); - assert(right->is_clean()); - assert(right->get_num_ref() == 0); - - // hrm, is this right? 
- if (right->version > left->version) left->version = right->version; - if (right->last_flushed > left->last_flushed) left->last_flushed = right->last_flushed; - - left->set_length(left->length() + right->length()); - left->data.claim_append(right->data); - - // remove right - remove_bh(right); - bc->lru_rest.lru_remove(right); - delete right; - dout(10) << "merge_bh_left result " << *left << dendl; - return left; -} - -/* wait until this has a user -void ObjectCache::try_merge_bh(BufferHead *bh) -{ - dout(-10) << "try_merge_bh " << *bh << dendl; - - map::iterator p = data.lower_bound(bh->start()); - assert(p->second == bh); - - try_merge_bh_left(p); - try_merge_bh_right(p); -} -*/ - - -void ObjectCache::try_merge_bh_left(map::iterator& p) -{ - BufferHead *bh = p->second; - dout(10) << "try_merge_bh_left " << *bh << dendl; - - // left? - if (p != data.begin()) { - p--; - if (p->second->end() == bh->start() && - p->second->is_clean() && - bh->is_clean() && - bh->get_num_ref() == 0 && - bh->data.buffers().size() < 8 && - p->second->data.buffers().size() < 8) - bh = merge_bh_left(p->second, bh); // yay! - else - p++; // nope. - } -} - -void ObjectCache::try_merge_bh_right(map::iterator& p) -{ - BufferHead *bh = p->second; - dout(10) << "try_merge_bh_right " << *bh << dendl; - - // right? - map::iterator o = p; - p++; - if (p != data.end() && - bh->end() == p->second->start() && - p->second->is_clean() && - bh->is_clean() && - p->second->get_num_ref() == 0 && - bh->data.buffers().size() < 8 && - p->second->data.buffers().size() < 8) { - BufferHead *right = p->second; - p--; - merge_bh_left(bh, right); - } else - p = o; -} - - - -/************** BufferCache ***************/ - -#undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) *_dout << dbeginl << g_clock.now() << " ebofs.bc." 
- - - -BufferHead *BufferCache::split(BufferHead *orig, block_t after) -{ - dout(20) << "split " << *orig << " at " << after << dendl; - - // split off right - BufferHead *right = new BufferHead(orig->get_oc()); - right->set_version(orig->get_version()); - right->epoch_modified = orig->epoch_modified; - right->last_flushed = orig->last_flushed; - right->set_state(orig->get_state()); - - block_t newleftlen = after - orig->start(); - right->set_start( after ); - right->set_length( orig->length() - newleftlen ); - - // shorten left - stat_sub(orig); - orig->set_length( newleftlen ); - stat_add(orig); - - // add right - add_bh(right); - - // adjust rx_from - if (orig->is_rx()) { - right->rx_from = orig->rx_from; - orig->rx_from.length = newleftlen; - right->rx_from.length -= newleftlen; - right->rx_from.start += newleftlen; - } - - // dup shadows - for (set::iterator p = orig->shadows.begin(); - p != orig->shadows.end(); - ++p) - right->add_shadow(*p); - - // split buffers too - bufferlist bl; - bl.claim(orig->data); - if (bl.length()) { - assert(bl.length() == (orig->length()+right->length())*EBOFS_BLOCK_SIZE); - right->data.substr_of(bl, orig->length()*EBOFS_BLOCK_SIZE, right->length()*EBOFS_BLOCK_SIZE); - orig->data.substr_of(bl, 0, orig->length()*EBOFS_BLOCK_SIZE); - } - - // move read waiters - if (!orig->waitfor_read.empty()) { - map >::iterator o, p = orig->waitfor_read.end(); - p--; - while (p != orig->waitfor_read.begin()) { - if (p->first < right->start()) break; - dout(0) << "split moving waiters at block " << p->first << " to right bh" << dendl; - right->waitfor_read[p->first].swap( p->second ); - o = p; - p--; - orig->waitfor_read.erase(o); - } - } - - dout(20) << "split left is " << *orig << dendl; - dout(20) << "split right is " << *right << dendl; - return right; -} - - - - -void BufferCache::bh_read(Onode *on, BufferHead *bh, block_t from) -{ - dout(10) << "bh_read " << *on << " on " << *bh << dendl; - - if (bh->is_missing()) { - mark_rx(bh); - } else 
{ - assert(bh->is_partial()); - } - - // get extent. there should be only one! - vector exv; - on->map_extents(bh->start(), bh->length(), exv); - assert(exv.size() == 1); - Extent ex = exv[0]; - - if (from) { // force behavior, used for reading partials - dout(10) << "bh_read forcing read from block " << from << " (for a partial)" << dendl; - ex.start = from; - ex.length = 1; - } - - // this should be empty!! - assert(bh->rx_ioh == 0); - - dout(20) << "bh_read " << *on << " " << *bh << " from " << ex << dendl; - - C_OC_RxFinish *fin = new C_OC_RxFinish(ebofs_lock, on->oc, - bh->start(), bh->length(), - ex.start); - - //bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), fin->bl); // new buffers! - fin->bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - - bh->rx_ioh = dev.read(ex.start, ex.length, fin->bl, - fin); - bh->rx_from = ex; - on->oc->get(); - -} - -bool BufferCache::bh_cancel_read(BufferHead *bh) -{ - if (bh->rx_ioh && dev.cancel_io(bh->rx_ioh) >= 0) { - dout(10) << "bh_cancel_read on " << *bh << dendl; - bh->rx_ioh = 0; - mark_missing(bh); - int l = bh->oc->put(); - assert(l); - return true; - } - return false; -} - -void BufferCache::bh_write(Onode *on, BufferHead *bh, block_t shouldbe) -{ - dout(10) << "bh_write " << *on << " on " << *bh << " in epoch " << bh->epoch_modified << dendl; - assert(bh->get_version() > 0); - - assert(bh->is_dirty()); - mark_tx(bh); - - // get extents - vector exv; - on->map_extents(bh->start(), bh->length(), exv); - assert(exv.size() == 1); - Extent ex = exv[0]; - - if (shouldbe) - assert(ex.length == 1 && ex.start == shouldbe); - - dout(20) << "bh_write " << *on << " " << *bh << " to " << ex << dendl; - - //assert(bh->tx_ioh == 0); - - assert(bh->get_last_flushed() < bh->get_version()); - - bh->tx_block = ex.start; - bh->tx_ioh = dev.write(ex.start, ex.length, bh->data, - new C_OC_TxFinish(ebofs_lock, on->oc, - bh->start(), bh->length(), - bh->get_version(), - bh->epoch_modified), - "bh_write"); - - 
on->oc->get(); - inc_unflushed( EBOFS_BC_FLUSH_BHWRITE, bh->epoch_modified ); - - /* - // assert: no partials on the same block - // hose any partial on the same block - if (bh->partial_write.count(ex.start)) { - dout(10) << "bh_write hosing parital write on same block " << ex.start << " " << *bh << dendl; - dec_unflushed( bh->partial_write[ex.start].epoch ); - bh->partial_write.erase(ex.start); - } - */ -} - - -bool BufferCache::bh_cancel_write(BufferHead *bh, version_t cur_epoch) -{ - assert(bh->is_tx()); - assert(bh->epoch_modified == cur_epoch); - assert(bh->epoch_modified > 0); - if (bh->tx_ioh && dev.cancel_io(bh->tx_ioh) >= 0) { - dout(10) << "bh_cancel_write on " << *bh << dendl; - bh->tx_ioh = 0; - mark_dirty(bh); - - dec_unflushed( EBOFS_BC_FLUSH_BHWRITE, bh->epoch_modified ); // assert.. this should be the same epoch! - - int l = bh->oc->put(); - assert(l); - return true; - } - return false; -} - -void BufferCache::tx_finish(ObjectCache *oc, - ioh_t ioh, block_t start, block_t length, - version_t version, version_t epoch) -{ - ebofs_lock.Lock(); - - // finish oc - if (oc->put() == 0) { - delete oc; - } else - oc->tx_finish(ioh, start, length, version, epoch); - - // update unflushed counter - assert(get_unflushed(EBOFS_BC_FLUSH_BHWRITE, epoch) > 0); - dec_unflushed(EBOFS_BC_FLUSH_BHWRITE, epoch); - - ebofs_lock.Unlock(); -} - -void BufferCache::rx_finish(ObjectCache *oc, - ioh_t ioh, block_t start, block_t length, - block_t diskstart, - bufferlist& bl) -{ - ebofs_lock.Lock(); - dout(10) << "rx_finish ioh " << ioh << " on " << start << "~" << length - << ", at device block " << diskstart << dendl; - - // oc - if (oc->put() == 0) - delete oc; - else - oc->rx_finish(ioh, start, length, bl); - - // finish any partials? 
- // note: these are partials that were re-written after a commit, - // or for whom the OC was destroyed (eg truncated after a commit) - map >::iterator sp = partial_write.lower_bound(diskstart); - while (sp != partial_write.end()) { - if (sp->first >= diskstart+length) break; - assert(sp->first >= diskstart); - - block_t pblock = sp->first; - map writes; - writes.swap( sp->second ); - - map >::iterator t = sp; - sp++; - partial_write.erase(t); - - for (map::iterator p = writes.begin(); - p != writes.end(); - p++) { - dout(10) << "rx_finish partial from " << pblock << " -> " << p->first - << " for epoch " << p->second.epoch - //<< " (bh.epoch_modified is now " << bh->epoch_modified << ")" - << dendl; - // this had better be a past epoch - //assert(p->epoch == epoch_modified - 1); // ?? - - // make the combined block - bufferlist combined; - bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - combined.push_back( bp ); - combined.copy_in((pblock-diskstart)*EBOFS_BLOCK_SIZE, (pblock-diskstart+1)*EBOFS_BLOCK_SIZE, bl); - BufferHead::apply_partial( combined, p->second.partial ); - - // write it! - dev.write( pblock, 1, combined, - new C_OC_PartialTxFinish( this, p->second.epoch ), - "finish_partials"); - } - } - - // shadow partials? 
- { - list waiters; - map >::iterator sp = shadow_partials.lower_bound(diskstart); - while (sp != shadow_partials.end()) { - if (sp->first >= diskstart+length) break; - assert(sp->first >= diskstart); - - block_t pblock = sp->first; - set ls; - ls.swap( sp->second ); - - map >::iterator t = sp; - sp++; - shadow_partials.erase(t); - - for (set::iterator p = ls.begin(); - p != ls.end(); - ++p) { - BufferHead *bh = *p; - dout(10) << "rx_finish applying shadow_partial for " << pblock - << " to " << *bh << dendl; - bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - bh->data.clear(); - bh->data.push_back( bp ); - bh->data.copy_in((pblock-diskstart)*EBOFS_BLOCK_SIZE, - (pblock-diskstart+1)*EBOFS_BLOCK_SIZE, - bl); - bh->apply_partial(); - bh->set_state(BufferHead::STATE_CLEAN); - - // trigger waiters - for (map >::iterator p = bh->waitfor_read.begin(); - p != bh->waitfor_read.end(); - p++) { - assert(p->first >= bh->start() && p->first < bh->end()); - waiters.splice(waiters.begin(), p->second); - } - bh->waitfor_read.clear(); - } - } - - // kick waiters - finish_contexts(waiters); - } - - // done. 
- ebofs_lock.Unlock(); -} - -void BufferCache::partial_tx_finish(version_t epoch) -{ - ebofs_lock.Lock(); - - dout(10) << "partial_tx_finish in epoch " << epoch << dendl; - - // update unflushed counter - assert(get_unflushed(EBOFS_BC_FLUSH_PARTIAL, epoch) > 0); - dec_unflushed(EBOFS_BC_FLUSH_PARTIAL, epoch); - - ebofs_lock.Unlock(); -} - - - - -void BufferCache::bh_queue_partial_write(Onode *on, BufferHead *bh) -{ - assert(bh->get_version() > 0); - - assert(bh->is_partial()); - assert(bh->length() == 1); - - // get the block no - vector exv; - on->map_extents(bh->start(), bh->length(), exv); - assert(exv.size() == 1); - block_t b = exv[0].start; - assert(exv[0].length == 1); - bh->partial_tx_to = exv[0].start; - bh->partial_tx_epoch = bh->epoch_modified; - - dout(10) << "bh_queue_partial_write " << *on << " on " << *bh << " block " << b << " epoch " << bh->epoch_modified << dendl; - - - // copy map state, queue for this block - assert(bh->rx_from.length == 1); - queue_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial, bh->partial_tx_epoch ); -} - -void BufferCache::bh_cancel_partial_write(BufferHead *bh) -{ - assert(bh->is_partial()); - assert(bh->length() == 1); - - cancel_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial_tx_epoch ); -} - - -void BufferCache::queue_partial(block_t from, block_t to, - map& partial, version_t epoch) -{ - dout(10) << "queue_partial " << from << " -> " << to - << " in epoch " << epoch - << dendl; - - if (partial_write[from].count(to)) { - // this should be in the same epoch. - assert( partial_write[from][to].epoch == epoch); - assert(0); // actually.. no! 
- } else { - inc_unflushed( EBOFS_BC_FLUSH_PARTIAL, epoch ); - } - - partial_write[from][to].partial = partial; - partial_write[from][to].epoch = epoch; -} - -void BufferCache::cancel_partial(block_t from, block_t to, version_t epoch) -{ - assert(partial_write.count(from)); - assert(partial_write[from].count(to)); - assert(partial_write[from][to].epoch == epoch); - - dout(10) << "cancel_partial " << from << " -> " << to - << " (was epoch " << partial_write[from][to].epoch << ")" - << dendl; - - partial_write[from].erase(to); - if (partial_write[from].empty()) - partial_write.erase(from); - - dec_unflushed( EBOFS_BC_FLUSH_PARTIAL, epoch ); -} - - -void BufferCache::add_shadow_partial(block_t from, BufferHead *bh) -{ - dout(10) << "add_shadow_partial from " << from << " " << *bh << dendl; - shadow_partials[from].insert(bh); -} - -void BufferCache::cancel_shadow_partial(block_t from, BufferHead *bh) -{ - dout(10) << "cancel_shadow_partial from " << from << " " << *bh << dendl; - shadow_partials[from].erase(bh); -} diff --git a/branches/sage/mds/ebofs/BufferCache.h b/branches/sage/mds/ebofs/BufferCache.h deleted file mode 100644 index 346a5cc785618..0000000000000 --- a/branches/sage/mds/ebofs/BufferCache.h +++ /dev/null @@ -1,723 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __EBOFS_BUFFERCACHE_H -#define __EBOFS_BUFFERCACHE_H - -#include "include/lru.h" -#include "include/Context.h" - -#include "common/Clock.h" - -#include "types.h" -#include "BlockDevice.h" - -#include "include/interval_set.h" -#include "include/xlist.h" - -class ObjectCache; -class BufferCache; -class Onode; - -class BufferHead : public LRUObject { - public: - /* - * - buffer_heads should always break across disk extent boundaries - * - partial buffer_heads are always 1 block. - */ - const static int STATE_MISSING = 0; // missing; data is on disk, but not loaded. - const static int STATE_CLEAN = 1; // Rw clean - const static int STATE_DIRTY = 2; // RW dirty - const static int STATE_TX = 3; // Rw flushing to disk - const static int STATE_RX = 4; // w reading from disk - const static int STATE_PARTIAL = 5; // reading from disk, + partial content map. always 1 block. - - public: - ObjectCache *oc; - - bufferlist data; - - ioh_t rx_ioh; // - Extent rx_from; - ioh_t tx_ioh; // - block_t tx_block; - block_t partial_tx_to; - version_t partial_tx_epoch; - - map partial; // partial dirty content overlayed onto incoming data - - map< block_t, list > waitfor_read; - - set shadows; // shadow bh's that clone()ed me. 
- BufferHead* shadow_of; - - - private: - int ref; - int state; - - public: - version_t epoch_modified; - - version_t version; // current version in cache - version_t last_flushed; // last version flushed to disk - - Extent object_loc; // block position _in_object_ - - utime_t dirty_stamp; - //xlist::item xlist_dirty; - - bool want_to_expire; // wants to be at bottom of lru - - public: - BufferHead(ObjectCache *o) : - oc(o), //cancellable_ioh(0), tx_epoch(0), - rx_ioh(0), tx_ioh(0), tx_block(0), partial_tx_to(0), partial_tx_epoch(0), - shadow_of(0), - ref(0), state(STATE_MISSING), epoch_modified(0), version(0), last_flushed(0), - //xlist_dirty(this), - want_to_expire(false) - {} - ~BufferHead() { - unpin_shadows(); - } - - ObjectCache *get_oc() { return oc; } - - int get() { - assert(ref >= 0); - if (ref == 0) lru_pin(); - return ++ref; - } - int put() { - assert(ref > 0); - if (ref == 1) lru_unpin(); - --ref; - return ref; - } - int get_num_ref() { return ref; } - - block_t start() { return object_loc.start; } - void set_start(block_t s) { object_loc.start = s; } - block_t length() { return object_loc.length; } - void set_length(block_t l) { object_loc.length = l; } - block_t end() { return start() + length(); } - block_t last() { return end()-1; } - - version_t get_version() { return version; } - void set_version(version_t v) { version = v; } - version_t get_last_flushed() { return last_flushed; } - void set_last_flushed(version_t v) { - if (v <= last_flushed) cout << "last_flushed begin set to " << v << ", was " << last_flushed << std::endl; - assert(v > last_flushed); - last_flushed = v; - } - - utime_t get_dirty_stamp() { return dirty_stamp; } - void set_dirty_stamp(utime_t t) { dirty_stamp = t; } - - void set_state(int s) { - if (s == STATE_PARTIAL || s == STATE_RX || s == STATE_TX) get(); - if (state == STATE_PARTIAL || state == STATE_RX || state == STATE_TX) put(); - - if ((state == STATE_TX && s != STATE_TX) || - (state == STATE_PARTIAL && s != 
STATE_PARTIAL)) - unpin_shadows(); - - state = s; - } - int get_state() { return state; } - - bool is_missing() { return state == STATE_MISSING; } - bool is_dirty() { return state == STATE_DIRTY; } - bool is_clean() { return state == STATE_CLEAN; } - bool is_tx() { return state == STATE_TX; } - bool is_rx() { return state == STATE_RX; } - bool is_partial() { return state == STATE_PARTIAL; } - - //bool is_partial_writes() { return !partial_write.empty(); } - //void finish_partials(); - //void cancel_partials(); - //void queue_partial_write(block_t b); - - void add_shadow(BufferHead *dup) { - shadows.insert(dup); - dup->shadow_of = this; - dup->get(); - } - void remove_shadow(BufferHead *dup) { - shadows.erase(dup); - dup->shadow_of = 0; - dup->put(); - } - void unpin_shadows() { - for (set::iterator p = shadows.begin(); - p != shadows.end(); - ++p) { - //cout << "unpin shadow " << *p << std::endl; - (*p)->shadow_of = 0; - (*p)->put(); - } - shadows.clear(); - } - - void copy_partial_substr(off_t start, off_t end, bufferlist& bl) { - map::iterator i = partial.begin(); - - // skip first bits (fully to left) - while ((i->first + i->second.length() < start) && - i != partial.end()) - i++; - assert(i != partial.end()); - assert(i->first <= start); - - // first - unsigned bhoff = MAX(start, i->first) - i->first; - unsigned bhlen = MIN(end-start, i->second.length()); - bl.substr_of( i->second, bhoff, bhlen ); - - off_t pos = i->first + i->second.length(); - - // have continuous to end? - for (i++; i != partial.end(); i++) { - if (pos >= end) break; - assert(pos == i->first); - - pos = i->first + i->second.length(); - - if (pos <= end) { // this whole frag - bl.append( i->second ); - } else { // partial end - unsigned bhlen = end-start-bl.length(); - bufferlist frag; - frag.substr_of( i->second, 0, bhlen ); - bl.claim_append(frag); - break; // done. 
- } - } - - assert(pos >= end); - assert(bl.length() == (unsigned)(end-start)); - } - - bool have_partial_range(off_t start, off_t end) { - map::iterator i = partial.begin(); - - // skip first bits (fully to left) - while ((i->first + i->second.length() < start) && - i != partial.end()) - i++; - if (i == partial.end()) return false; - - // have start? - if (i->first > start) return false; - off_t pos = i->first + i->second.length(); - - // have continuous to end? - for (i++; i != partial.end(); i++) { - assert(pos <= i->first); - if (pos < i->first) return false; - assert(pos == i->first); - pos = i->first + i->second.length(); - if (pos >= end) break; // gone far enough - } - - if (pos >= end) return true; - return false; - } - - bool partial_is_complete(off_t size) { - return have_partial_range( 0, MIN(size, EBOFS_BLOCK_SIZE) ); - //(off_t)(start()*EBOFS_BLOCK_SIZE), - //MIN( size, (off_t)(end()*EBOFS_BLOCK_SIZE) ) ); - } - void apply_partial() { - apply_partial(data, partial); - partial.clear(); - } - static void apply_partial(bufferlist& bl, map& pm) { - assert(bl.length() == (unsigned)EBOFS_BLOCK_SIZE); - //assert(partial_is_complete()); - //cout << "apply_partial" << std::endl; - for (map::iterator i = pm.begin(); - i != pm.end(); - i++) { - int pos = i->first; - //cout << " frag at opos " << i->first << " bhpos " << pos << " len " << i->second.length() << std::endl; - bl.copy_in(pos, i->second.length(), i->second); - } - pm.clear(); - } - void add_partial(off_t off, bufferlist& p) { - unsigned len = p.length(); - assert(len <= (unsigned)EBOFS_BLOCK_SIZE); - //assert(off >= (off_t)(start()*EBOFS_BLOCK_SIZE)); - //assert(off + len <= (off_t)(end()*EBOFS_BLOCK_SIZE)); - assert(off >= 0); - assert(off + len <= EBOFS_BLOCK_SIZE); - - // trim any existing that overlaps - map::iterator i = partial.begin(); - while (i != partial.end()) { - // is [off,off+len)... - // past i? - if (off >= i->first + i->second.length()) { - i++; - continue; - } - // before i? 
- if (i->first >= off+len) break; - - // does [off,off+len)... - // overlap all of i? - if (off <= i->first && off+len >= i->first + i->second.length()) { - // erase it and move on. - partial.erase(i++); - continue; - } - // overlap tail of i? - if (off > i->first && off+len >= i->first + i->second.length()) { - // shorten i. - bufferlist o; - o.claim( i->second ); - unsigned taillen = off - i->first; - i->second.substr_of(o, 0, taillen); - i++; - continue; - } - // overlap head of i? - if (off <= i->first && off+len < i->first + i->second.length()) { - // move i (make new tail). - off_t tailoff = off+len; - unsigned trim = tailoff - i->first; - partial[tailoff].substr_of(i->second, trim, i->second.length()-trim); - partial.erase(i++); // should now be at tailoff - i++; - continue; - } - // split i? - if (off > i->first && off+len < i->first + i->second.length()) { - bufferlist o; - o.claim( i->second ); - // shorten head - unsigned headlen = off - i->first; - i->second.substr_of(o, 0, headlen); - // new tail - unsigned tailoff = off+len - i->first; - unsigned taillen = o.length() - len - headlen; - partial[off+len].substr_of(o, tailoff, taillen); - break; - } - assert(0); - } - - // insert - partial[off] = p; - } - -}; - -inline ostream& operator<<(ostream& out, BufferHead& bh) -{ - out << "bufferhead(" << bh.start() << "~" << bh.length(); - out << " v" << bh.get_version() << "/" << bh.get_last_flushed(); - if (bh.is_missing()) out << " missing"; - if (bh.is_dirty()) out << " dirty"; - if (bh.is_clean()) out << " clean"; - if (bh.is_rx()) out << " rx"; - if (bh.is_tx()) out << " tx"; - if (bh.is_partial()) out << " partial"; - - // include epoch modified? 
- if (bh.is_dirty() || bh.is_tx() || bh.is_partial()) - out << "(e" << bh.epoch_modified << ")"; - - //out << " " << bh.data.length(); - out << " " << &bh; - out << ")"; - return out; -} - - -class ObjectCache { - public: - object_t object_id; - Onode *on; - BufferCache *bc; - - private: - map data; - int ref; - - public: - version_t write_count; - - - public: - ObjectCache(object_t o, Onode *_on, BufferCache *b) : - object_id(o), on(_on), bc(b), ref(0), - write_count(0) { } - ~ObjectCache() { - assert(data.empty()); - assert(ref == 0); - } - - int get() { - ++ref; - //cout << "oc.get " << object_id << " " << ref << std::endl; - return ref; - } - int put() { - assert(ref > 0); - --ref; - //cout << "oc.put " << object_id << " " << ref << std::endl; - return ref; - } - - object_t get_object_id() { return object_id; } - - void add_bh(BufferHead *bh) { - // add to my map - assert(data.count(bh->start()) == 0); - - if (0) { // sanity check FIXME DEBUG - //cout << "add_bh " << bh->start() << "~" << bh->length() << std::endl; - map::iterator p = data.lower_bound(bh->start()); - if (p != data.end()) { - //cout << " after " << *p->second << std::endl; - //cout << " after starts at " << p->first << std::endl; - assert(p->first >= bh->end()); - } - if (p != data.begin()) { - p--; - //cout << " before starts at " << p->second->start() - //<< " and ends at " << p->second->end() << std::endl; - //cout << " before " << *p->second << std::endl; - assert(p->second->end() <= bh->start()); - } - } - - data[bh->start()] = bh; - } - void remove_bh(BufferHead *bh) { - assert(data.count(bh->start())); - data.erase(bh->start()); - } - bool is_empty() { return data.empty(); } - - void try_merge_bh(BufferHead *bh); - void try_merge_bh_left(map::iterator& p); - void try_merge_bh_right(map::iterator& p); - BufferHead* merge_bh_left(BufferHead *left, BufferHead *right); - - int find_tx(block_t start, block_t len, - list& tx); - - int map_read(block_t start, block_t len, - map& hits, // hits - 
map& missing, // read these from disk - map& rx, // wait for these to finish reading from disk - map& partial); // (maybe) wait for these to read from disk - int try_map_read(block_t start, block_t len); // just tell us how many extents we're missing. - - - int map_write(block_t start, block_t len, - map& hits, - version_t super_epoch); // can write to these. - void touch_bottom(block_t bstart, block_t blast); - - BufferHead *split(BufferHead *bh, block_t off); - - /*int scan_versions(block_t start, block_t len, - version_t& low, version_t& high); - */ - - void rx_finish(ioh_t ioh, block_t start, block_t length, bufferlist& bl); - void tx_finish(ioh_t ioh, block_t start, block_t length, version_t v, version_t epoch); - - void truncate(block_t blocks, version_t super_epoch); - // void tear_down(); - - void clone_to(Onode *other); - - void dump() { - for (map::iterator i = data.begin(); - i != data.end(); - i++) - cout << "dump: " << i->first << ": " << *i->second << std::endl; - } - -}; - - - -class BufferCache { - public: - Mutex &ebofs_lock; // hack: this is a ref to global ebofs_lock - BlockDevice &dev; - - //xlist dirty_bh; - - LRU lru_dirty, lru_rest; - - private: - Cond stat_cond; - Cond flush_cond; - int stat_waiter; - - off_t stat_clean; - off_t stat_dirty; - off_t stat_rx; - off_t stat_tx; - off_t stat_partial; - off_t stat_missing; - -#define EBOFS_BC_FLUSH_BHWRITE 0 -#define EBOFS_BC_FLUSH_PARTIAL 1 - - map epoch_unflushed[2]; - - /* partial writes - incomplete blocks that can't be written until - * their prior content is read and overlayed with the new data. - * - * we put partial block management here because objects may be deleted - * before the read completes, but the write may have been committed in a - * prior epoch. - * - * we map: src block -> dest block -> PartialWrite - * - * really, at most there will only ever be two of these, for current+previous epochs. 
- */ - class PartialWrite { - public: - map partial; // partial dirty content overlayed onto incoming data - version_t epoch; - }; - - map > partial_write; // queued writes w/ partial content - map > shadow_partials; - - public: - BufferCache(BlockDevice& d, Mutex& el) : - ebofs_lock(el), dev(d), - stat_waiter(0), - stat_clean(0), stat_dirty(0), stat_rx(0), stat_tx(0), stat_partial(0), stat_missing(0) - {} - - - off_t get_size() { - return stat_clean+stat_dirty+stat_rx+stat_tx+stat_partial; - } - off_t get_trimmable() { - return stat_clean; - } - - - // bh's in cache - void add_bh(BufferHead *bh) { - bh->get_oc()->add_bh(bh); - if (bh->is_dirty()) { - lru_dirty.lru_insert_mid(bh); - //dirty_bh.push_back(&bh->xlist_dirty); - } else - lru_rest.lru_insert_mid(bh); - stat_add(bh); - } - void touch(BufferHead *bh) { - if (bh->is_dirty()) { - lru_dirty.lru_touch(bh); - } else - lru_rest.lru_touch(bh); - } - void touch_bottom(BufferHead *bh) { - if (bh->is_dirty()) { - bh->want_to_expire = true; - lru_dirty.lru_bottouch(bh); - } else - lru_rest.lru_bottouch(bh); - } - void remove_bh(BufferHead *bh) { - bh->get_oc()->remove_bh(bh); - stat_sub(bh); - if (bh->is_dirty()) { - lru_dirty.lru_remove(bh); - //dirty_bh.push_back(&bh->xlist_dirty); - } else - lru_rest.lru_remove(bh); - } - - // stats - void stat_add(BufferHead *bh) { - switch (bh->get_state()) { - case BufferHead::STATE_MISSING: stat_missing += bh->length(); break; - case BufferHead::STATE_CLEAN: stat_clean += bh->length(); break; - case BufferHead::STATE_DIRTY: stat_dirty += bh->length(); break; - case BufferHead::STATE_TX: stat_tx += bh->length(); break; - case BufferHead::STATE_RX: stat_rx += bh->length(); break; - case BufferHead::STATE_PARTIAL: stat_partial += bh->length(); break; - } - if (stat_waiter) stat_cond.Signal(); - } - void stat_sub(BufferHead *bh) { - switch (bh->get_state()) { - case BufferHead::STATE_MISSING: stat_missing -= bh->length(); break; - case BufferHead::STATE_CLEAN: stat_clean -= 
bh->length(); break; - case BufferHead::STATE_DIRTY: stat_dirty -= bh->length(); break; - case BufferHead::STATE_TX: stat_tx -= bh->length(); break; - case BufferHead::STATE_RX: stat_rx -= bh->length(); break; - case BufferHead::STATE_PARTIAL: stat_partial -= bh->length(); break; - } - } - off_t get_stat_tx() { return stat_tx; } - off_t get_stat_rx() { return stat_rx; } - off_t get_stat_dirty() { return stat_dirty; } - off_t get_stat_clean() { return stat_clean; } - off_t get_stat_partial() { return stat_partial; } - - - map &get_unflushed(int what) { - return epoch_unflushed[what]; - } - - int get_unflushed(int what, version_t epoch) { - return epoch_unflushed[what][epoch]; - } - void inc_unflushed(int what, version_t epoch) { - epoch_unflushed[what][epoch]++; - //cout << "inc_unflushed " << epoch << " now " << epoch_unflushed[epoch] << std::endl; - } - void dec_unflushed(int what, version_t epoch) { - epoch_unflushed[what][epoch]--; - //cout << "dec_unflushed " << epoch << " now " << epoch_unflushed[epoch] << std::endl; - if (epoch_unflushed[what][epoch] == 0) - flush_cond.Signal(); - } - - void waitfor_stat() { - stat_waiter++; - stat_cond.Wait(ebofs_lock); - stat_waiter--; - } - void waitfor_flush() { - flush_cond.Wait(ebofs_lock); - } - - - // bh state - void set_state(BufferHead *bh, int s) { - // move between lru lists? 
- if (s == BufferHead::STATE_DIRTY && bh->get_state() != BufferHead::STATE_DIRTY) { - lru_rest.lru_remove(bh); - lru_dirty.lru_insert_top(bh); - //dirty_bh.push_back(&bh->xlist_dirty); - } - if (s != BufferHead::STATE_DIRTY && bh->get_state() == BufferHead::STATE_DIRTY) { - lru_dirty.lru_remove(bh); - if (bh->want_to_expire) - lru_rest.lru_insert_bot(bh); - else - lru_rest.lru_insert_mid(bh); - //dirty_bh.remove(&bh->xlist_dirty); - } - - // set state - stat_sub(bh); - bh->set_state(s); - stat_add(bh); - } - - void copy_state(BufferHead *bh1, BufferHead *bh2) { - set_state(bh2, bh1->get_state()); - } - - void mark_missing(BufferHead *bh) { set_state(bh, BufferHead::STATE_MISSING); }; - void mark_clean(BufferHead *bh) { set_state(bh, BufferHead::STATE_CLEAN); }; - void mark_rx(BufferHead *bh) { set_state(bh, BufferHead::STATE_RX); }; - void mark_partial(BufferHead *bh) { set_state(bh, BufferHead::STATE_PARTIAL); }; - void mark_tx(BufferHead *bh) { set_state(bh, BufferHead::STATE_TX); }; - void mark_dirty(BufferHead *bh) { - set_state(bh, BufferHead::STATE_DIRTY); - bh->set_dirty_stamp(g_clock.now()); - }; - - - // io - void bh_read(Onode *on, BufferHead *bh, block_t from=0); - void bh_write(Onode *on, BufferHead *bh, block_t shouldbe=0); - - bool bh_cancel_read(BufferHead *bh); - bool bh_cancel_write(BufferHead *bh, version_t cur_epoch); - - void bh_queue_partial_write(Onode *on, BufferHead *bh); - void bh_cancel_partial_write(BufferHead *bh); - - void queue_partial(block_t from, block_t to, map& partial, version_t epoch); - void cancel_partial(block_t from, block_t to, version_t epoch); - - void add_shadow_partial(block_t from, BufferHead *bh); - void cancel_shadow_partial(block_t from, BufferHead *bh); - - void rx_finish(ObjectCache *oc, ioh_t ioh, block_t start, block_t len, block_t diskstart, bufferlist& bl); - void tx_finish(ObjectCache *oc, ioh_t ioh, block_t start, block_t len, version_t v, version_t e); - void partial_tx_finish(version_t epoch); - - friend 
class C_E_FlushPartial; - - // bh fun - BufferHead *split(BufferHead *orig, block_t after); -}; - - -class C_OC_RxFinish : public BlockDevice::callback { - Mutex &lock; - ObjectCache *oc; - block_t start, length; - block_t diskstart; -public: - bufferlist bl; - C_OC_RxFinish(Mutex &m, ObjectCache *o, block_t s, block_t l, block_t ds) : - lock(m), oc(o), start(s), length(l), diskstart(ds) {} - void finish(ioh_t ioh, int r) { - oc->bc->rx_finish(oc, ioh, start, length, diskstart, bl); - } -}; - -class C_OC_TxFinish : public BlockDevice::callback { - Mutex &lock; - ObjectCache *oc; - block_t start, length; - version_t version; - version_t epoch; - public: - C_OC_TxFinish(Mutex &m, ObjectCache *o, block_t s, block_t l, version_t v, version_t e) : - lock(m), oc(o), start(s), length(l), version(v), epoch(e) {} - void finish(ioh_t ioh, int r) { - oc->bc->tx_finish(oc, ioh, start, length, version, epoch); - } -}; - -class C_OC_PartialTxFinish : public BlockDevice::callback { - BufferCache *bc; - version_t epoch; -public: - C_OC_PartialTxFinish(BufferCache *b, version_t e) : - bc(b), epoch(e) {} - void finish(ioh_t ioh, int r) { - bc->partial_tx_finish(epoch); - } -}; - - -#endif diff --git a/branches/sage/mds/ebofs/Cnode.h b/branches/sage/mds/ebofs/Cnode.h deleted file mode 100644 index 8415978893fb5..0000000000000 --- a/branches/sage/mds/ebofs/Cnode.h +++ /dev/null @@ -1,101 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_CNODE_H -#define __EBOFS_CNODE_H - -#include "Onode.h" - -/* - * collection node - * - * holds attribute metadata for collections. 
- * colletion membership is stored in b+tree tables, independent of tte cnode. - */ - -class Cnode : public LRUObject -{ - private: - int ref; - bool dirty; - - public: - coll_t coll_id; - Extent cnode_loc; - - map attr; - - public: - Cnode(coll_t cid) : ref(0), dirty(false), coll_id(cid) { - cnode_loc.length = 0; - } - ~Cnode() { - } - - block_t get_cnode_id() { return cnode_loc.start; } - int get_cnode_len() { return cnode_loc.length; } - - void get() { - if (ref == 0) lru_pin(); - ref++; - } - void put() { - ref--; - if (ref == 0) lru_unpin(); - } - int get_ref_count() { return ref; } - - void mark_dirty() { - if (!dirty) { - dirty = true; - get(); - } - } - void mark_clean() { - if (dirty) { - dirty = false; - put(); - } - } - bool is_dirty() { return dirty; } - - - int get_attr_bytes() { - int s = 0; - for (map::iterator i = attr.begin(); - i != attr.end(); - i++) { - s += i->first.length() + 1; - s += i->second.length() + sizeof(int); - } - return s; - } - - // - //???void clear(); - - -}; - -inline ostream& operator<<(ostream& out, Cnode& cn) -{ - out << "cnode(" << hex << cn.coll_id << dec; - if (cn.is_dirty()) out << " dirty"; - //out << " " << &cn; - out << ")"; - return out; -} - -#endif diff --git a/branches/sage/mds/ebofs/Ebofs.cc b/branches/sage/mds/ebofs/Ebofs.cc deleted file mode 100644 index b1f6ab7539467..0000000000000 --- a/branches/sage/mds/ebofs/Ebofs.cc +++ /dev/null @@ -1,3628 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "Ebofs.h" - -#include "FileJournal.h" - -#include - -#ifndef DARWIN -#include -#else -#include -#include -#endif // DARWIN - -// ******************* - -#define dout(x) if (x <= g_conf.debug_ebofs) *_dout << dbeginl << g_clock.now() << " ebofs(" << dev.get_device_name() << ")." -#define derr(x) if (x <= g_conf.debug_ebofs) *_derr << dbeginl << g_clock.now() << " ebofs(" << dev.get_device_name() << ")." - - -char *nice_blocks(block_t b) -{ - static char s[20]; - float sz = b*4.0; - if (sz > (10 << 20)) - sprintf(s,"%.1f GB", sz / (1024.0*1024.0)); - else if (sz > (10 << 10)) - sprintf(s,"%.1f MB", sz / (1024.0)); - else - sprintf(s,"%llu KB", b*4ULL); - return s; -} - -int Ebofs::mount() -{ - ebofs_lock.Lock(); - assert(!mounted); - - // open dev - int r = dev.open(&idle_kicker); - if (r < 0) { - ebofs_lock.Unlock(); - return r; - } - - dout(2) << "mounting " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << dendl; - - // read super - bufferptr bp1 = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - bufferptr bp2 = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - dev.read(0, 1, bp1); - dev.read(1, 1, bp2); - - struct ebofs_super *sb1 = (struct ebofs_super*)bp1.c_str(); - struct ebofs_super *sb2 = (struct ebofs_super*)bp2.c_str(); - - // valid superblocks? 
- if (sb1->s_magic != EBOFS_MAGIC || - sb2->s_magic != EBOFS_MAGIC) { - derr(0) << "mount bad magic, not a valid EBOFS file system" << dendl; - return -EINVAL; - } - if (sb1->num_blocks > dev.get_num_blocks() || - sb2->num_blocks > dev.get_num_blocks()) { - derr(0) << "mount superblock size exceeds actual device size" << dendl; - return -EINVAL; - } - - dout(3) << "mount super @0 epoch " << sb1->epoch << dendl; - dout(3) << "mount super @1 epoch " << sb2->epoch << dendl; - - // pick newest super - struct ebofs_super *sb = 0; - if (sb1->epoch > sb2->epoch) - sb = sb1; - else - sb = sb2; - super_epoch = sb->epoch; - dout(3) << "mount epoch " << super_epoch << dendl; - assert(super_epoch == sb->epoch); - - super_fsid = sb->fsid; - - free_blocks = sb->free_blocks; - limbo_blocks = sb->limbo_blocks; - - // init node pools - dout(3) << "mount nodepool" << dendl; - nodepool.init( &sb->nodepool ); - nodepool.read_usemap_and_clean_nodes( dev, super_epoch ); - - // open tables - dout(3) << "mount opening tables" << dendl; - object_tab = new Table( nodepool, sb->object_tab ); - for (int i=0; i( nodepool, sb->free_tab[i] ); - limbo_tab = new Table( nodepool, sb->limbo_tab ); - alloc_tab = new Table >( nodepool, sb->alloc_tab ); - - collection_tab = new Table( nodepool, sb->collection_tab ); - co_tab = new Table( nodepool, sb->co_tab ); - - verify_tables(); - - allocator.release_limbo(); - - - // open journal? - if (journalfn) { - journal = new FileJournal(this, journalfn); - if (journal->open() < 0) { - dout(3) << "mount journal " << journalfn << " open failed" << dendl; - delete journal; - journal = 0; - } else { - dout(3) << "mount journal " << journalfn << " opened, replaying" << dendl; - - while (1) { - bufferlist bl; - epoch_t e; - if (!journal->read_entry(bl, e)) { - dout(3) << "mount replay: end of journal, done." 
<< dendl; - break; - } - - if (e < super_epoch) { - dout(3) << "mount replay: skipping old entry in epoch " << e << " < " << super_epoch << dendl; - continue; - } - if (e == super_epoch+1) { - super_epoch++; - dout(3) << "mount replay: jumped to next epoch " << super_epoch << dendl; - } - assert(e == super_epoch); - - dout(3) << "mount replay: applying transaction in epoch " << e << dendl; - Transaction t; - int off = 0; - t._decode(bl, off); - _apply_transaction(t); - } - - // done reading, make writeable. - journal->make_writeable(); - } - } - - dout(3) << "mount starting commit+finisher threads" << dendl; - commit_thread.create(); - finisher_thread.create(); - - dout(1) << "mounted " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) - << (journal ? ", with journal":", no journal") - << dendl; - mounted = true; - - - ebofs_lock.Unlock(); - return 0; -} - - -int Ebofs::mkfs() -{ - ebofs_lock.Lock(); - assert(!mounted); - - int r = dev.open(); - if (r < 0) { - ebofs_lock.Unlock(); - return r; - } - - block_t num_blocks = dev.get_num_blocks(); - - // make a super-random fsid - srand48(time(0) ^ getpid()); - super_fsid = ((uint64_t)lrand48() << 32) ^ mrand48(); - srand(time(0) ^ getpid()); - super_fsid ^= rand(); - super_fsid ^= (uint64_t)rand() << 32; - - free_blocks = 0; - limbo_blocks = 0; - - // create first noderegion - Extent nr; - nr.start = 2; - nr.length = 20+ (num_blocks / 1000); - if (nr.length < 10) nr.length = 10; - nodepool.add_region(nr); - dout(10) << "mkfs: first node region at " << nr << dendl; - - // allocate two usemaps - block_t usemap_len = nodepool.get_usemap_len(); - nodepool.usemap_even.start = nr.end(); - nodepool.usemap_even.length = usemap_len; - nodepool.usemap_odd.start = nodepool.usemap_even.end(); - nodepool.usemap_odd.length = usemap_len; - dout(10) << "mkfs: even usemap at " << nodepool.usemap_even << dendl; - dout(10) << "mkfs: odd usemap at " << nodepool.usemap_odd << dendl; 
- nodepool.init_usemap(); - - // init tables - struct ebofs_table empty; - empty.num_keys = 0; - empty.root = -1; - empty.depth = 0; - - object_tab = new Table( nodepool, empty ); - collection_tab = new Table( nodepool, empty ); - - for (int i=0; i( nodepool, empty ); - limbo_tab = new Table( nodepool, empty ); - alloc_tab = new Table >( nodepool, empty ); - - co_tab = new Table( nodepool, empty ); - - // add free space - Extent left; - left.start = nodepool.usemap_odd.end(); - left.length = num_blocks - left.start; - dout(10) << "mkfs: free data blocks at " << left << dendl; - allocator._release_into_limbo( left ); - if (g_conf.ebofs_cloneable) { - allocator.alloc_inc(nr); - allocator.alloc_inc(nodepool.usemap_even); - allocator.alloc_inc(nodepool.usemap_odd); - } - allocator.commit_limbo(); // -> limbo_tab - allocator.release_limbo(); // -> free_tab - - // write nodes, super, 2x - dout(10) << "mkfs: flushing nodepool and superblocks (2x)" << dendl; - - for (epoch_t e=0; e<2; e++) { - nodepool.commit_start(dev, e); - nodepool.commit_wait(); - bufferptr superbp; - prepare_super(e, superbp); - write_super(e, superbp); - } - - // free memory - dout(10) << "mkfs: cleaning up" << dendl; - close_tables(); - - dev.close(); - - - // create journal? 
- if (journalfn) { - Journal *journal = new FileJournal(this, journalfn); - if (journal->create() < 0) { - dout(3) << "mount journal " << journalfn << " created failed" << dendl; - } else { - dout(3) << "mount journal " << journalfn << " created" << dendl; - } - delete journal; - } - - dout(2) << "mkfs: " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << dendl; - ebofs_lock.Unlock(); - return 0; -} - -void Ebofs::close_tables() -{ - // close tables - delete object_tab; - for (int i=0; iverify("onmount"); - limbo_tab->verify("onmount"); - alloc_tab->verify("onmount"); - collection_tab->verify("onmount"); - co_tab->verify("onmount"); - for (int i=0; iverify("onmount"); - - g_conf.ebofs_verify = o; -} - -int Ebofs::umount() -{ - ebofs_lock.Lock(); - - // mark unmounting - dout(2) << "umount start" << dendl; - readonly = true; - unmounting = true; - - // kick commit thread - dout(5) << "umount stopping commit thread" << dendl; - commit_cond.Signal(); - ebofs_lock.Unlock(); - commit_thread.join(); - ebofs_lock.Lock(); - - // kick finisher thread - dout(5) << "umount stopping finisher thread" << dendl; - finisher_lock.Lock(); - finisher_stop = true; - finisher_cond.Signal(); - finisher_lock.Unlock(); - - finisher_thread.join(); - - trim_bc(0); - trim_inodes(0); - - for (hash_map::iterator i = onode_map.begin(); - i != onode_map.end(); - i++) { - dout(0) << "umount *** leftover: " << i->first << " " << *(i->second) << dendl; - } - - // free memory - dout(5) << "umount cleaning up" << dendl; - close_tables(); - dev.close(); - readonly = unmounting = mounted = false; - - dout(2) << "umount done on " << dev.get_device_name() << dendl; - ebofs_lock.Unlock(); - return 0; -} - - - -void Ebofs::prepare_super(version_t epoch, bufferptr& bp) -{ - struct ebofs_super sb; - - dout(10) << "prepare_super v" << epoch << dendl; - - // fill in super - memset(&sb, 0, sizeof(sb)); - sb.s_magic = EBOFS_MAGIC; - sb.fsid = 
super_fsid; - sb.epoch = epoch; - sb.num_blocks = dev.get_num_blocks(); - - sb.free_blocks = free_blocks; - sb.limbo_blocks = limbo_blocks; - - - // tables - sb.object_tab.num_keys = object_tab->get_num_keys(); - sb.object_tab.root = object_tab->get_root(); - sb.object_tab.depth = object_tab->get_depth(); - - for (int i=0; iget_num_keys(); - sb.free_tab[i].root = free_tab[i]->get_root(); - sb.free_tab[i].depth = free_tab[i]->get_depth(); - } - sb.limbo_tab.num_keys = limbo_tab->get_num_keys(); - sb.limbo_tab.root = limbo_tab->get_root(); - sb.limbo_tab.depth = limbo_tab->get_depth(); - - sb.alloc_tab.num_keys = alloc_tab->get_num_keys(); - sb.alloc_tab.root = alloc_tab->get_root(); - sb.alloc_tab.depth = alloc_tab->get_depth(); - - sb.collection_tab.num_keys = collection_tab->get_num_keys(); - sb.collection_tab.root = collection_tab->get_root(); - sb.collection_tab.depth = collection_tab->get_depth(); - - sb.co_tab.num_keys = co_tab->get_num_keys(); - sb.co_tab.root = co_tab->get_root(); - sb.co_tab.depth = co_tab->get_depth(); - - // pools - sb.nodepool.num_regions = nodepool.region_loc.size(); - for (unsigned i=0; i 0) { - // *** this is an ugly ugly hack **** - // do not use - // periodically check for idle block device - utime_t idle_wait(0, g_conf.ebofs_idle_commit_ms*1000); - dout(20) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms, " - << idle_wait << " ms if idle" << dendl; - utime_t now = g_clock.now(); - utime_t stop = now; - stop += (double)g_conf.ebofs_commit_ms / 1000.0; - do { - utime_t wait = MIN(stop - now, idle_wait); - if (commit_cond.WaitInterval(ebofs_lock, wait) != ETIMEDOUT) { - dout(20) << "commit_thread i got kicked" << dendl; - break; // we got kicked - } - if (dev.is_idle()) { - dout(20) << "commit_thread bdev is idle, early commit" << dendl; - break; // dev is idle - } - now = g_clock.now(); - dout(20) << "commit_thread now=" << now << ", stop at " << stop << dendl; - - // hack hack - //if (!left) g_conf.debug_ebofs 
= 10; - // /hack hack - } while (now < stop); - dout(20) << "commit_thread done with idle loop" << dendl; - - } else { - // normal wait+timeout - dout(20) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms" << dendl; - commit_cond.WaitInterval(ebofs_lock, utime_t(0, g_conf.ebofs_commit_ms*1000)); - } - - } else { - // DEBUG.. wait until kicked - dout(10) << "commit_thread no commit_ms, waiting until kicked" << dendl; - commit_cond.Wait(ebofs_lock); - } - - if (unmounting) { - dout(10) << "commit_thread unmounting: final commit pass" << dendl; - assert(readonly); - unmounting = false; - mounted = false; - dirty = true; - } - - if (!dirty && !limbo_blocks) { - dout(10) << "commit_thread not dirty" << dendl; - } - else { - super_epoch++; - dirty = false; - - derr(10) << "commit_thread commit start, new epoch " << super_epoch << dendl; - dout(10) << "commit_thread commit start, new epoch " << super_epoch << dendl; - dout(2) << "commit_thread data: " - << 100*(dev.get_num_blocks()-get_free_blocks())/dev.get_num_blocks() << "% used, " - << get_free_blocks() << " (" << 100*get_free_blocks()/dev.get_num_blocks() - << "%) free in " << get_free_extents() - << ", " << get_limbo_blocks() << " (" << 100*get_limbo_blocks()/dev.get_num_blocks() - << "%) limbo in " << get_limbo_extents() - << dendl; - dout(2) << "commit_thread nodes: " - << 100*nodepool.get_num_used()/nodepool.get_num_total() << "% used, " - << nodepool.get_num_free() << " (" << 100*nodepool.get_num_free()/nodepool.get_num_total() << "%) free, " - << nodepool.get_num_limbo() << " (" << 100*nodepool.get_num_limbo()/nodepool.get_num_total() << "%) limbo, " - << nodepool.get_num_total() << " total." 
<< dendl; - dout(2) << "commit_thread bc: " - << "size " << bc.get_size() - << ", trimmable " << bc.get_trimmable() - << ", max " << g_conf.ebofs_bc_size - << "; dirty " << bc.get_stat_dirty() - << ", tx " << bc.get_stat_tx() - << ", max dirty " << g_conf.ebofs_bc_max_dirty - << dendl; - - if (journal) journal->commit_epoch_start(); - - // (async) write onodes+condes (do this first; it currently involves inode reallocation) - commit_inodes_start(); - - allocator.commit_limbo(); // limbo -> limbo_tab - - // (async) write btree nodes - nodepool.commit_start( dev, super_epoch ); - - // blockdev barrier (prioritize our writes!) - dout(30) << "commit_thread barrier. flushing inodes " << inodes_flushing << dendl; - dev.barrier(); - - // prepare super (before any changes get made!) - bufferptr superbp; - prepare_super(super_epoch, superbp); - - // wait for it all to flush (drops global lock) - commit_bc_wait(super_epoch-1); - dout(30) << "commit_thread bc flushed" << dendl; - commit_inodes_wait(); - dout(30) << "commit_thread inodes flushed" << dendl; - nodepool.commit_wait(); - dout(30) << "commit_thread btree nodes flushed" << dendl; - - // ok, now (synchronously) write the prior super! - dout(10) << "commit_thread commit flushed, writing super for prior epoch" << dendl; - ebofs_lock.Unlock(); - write_super(super_epoch, superbp); - ebofs_lock.Lock(); - - dout(10) << "commit_thread wrote super" << dendl; - - // free limbo space now - // (since we're done allocating things, - // AND we've flushed all previous epoch data) - allocator.release_limbo(); // limbo_tab -> free_tabs - - // do we need more node space? - if (nodepool.get_num_free() < nodepool.get_num_total() / 3) { - dout(2) << "commit_thread running low on node space, allocating more." 
<< dendl; - alloc_more_node_space(); - } - - // signal journal - if (journal) journal->commit_epoch_finish(); - - // kick waiters - dout(10) << "commit_thread queueing commit + kicking sync waiters" << dendl; - - queue_finishers(commit_waiters[super_epoch-1]); - commit_waiters.erase(super_epoch-1); - - sync_cond.Signal(); - - dout(10) << "commit_thread commit finish" << dendl; - } - - // trim bc? - trim_bc(); - trim_inodes(); - - } - - dout(10) << "commit_thread finish" << dendl; - commit_thread_started = false; - ebofs_lock.Unlock(); - return 0; -} - - -void Ebofs::alloc_more_node_space() -{ - dout(1) << "alloc_more_node_space free " << nodepool.get_num_free() << "/" << nodepool.get_num_total() << dendl; - - if (nodepool.num_regions() < EBOFS_MAX_NODE_REGIONS) { - int want = nodepool.get_num_total(); - - Extent ex; - allocator.allocate(ex, want, 2); - dout(1) << "alloc_more_node_space wants " << want << " more, got " << ex << dendl; - - Extent even, odd; - unsigned ulen = nodepool.get_usemap_len(nodepool.get_num_total() + ex.length); - allocator.allocate(even, ulen, 2); - allocator.allocate(odd, ulen, 2); - dout(1) << "alloc_more_node_space maps need " << ulen << " x2, got " << even << " " << odd << dendl; - - if (even.length == ulen && odd.length == ulen) { - dout(1) << "alloc_more_node_space got " << ex << ", new usemaps at even " << even << " odd " << odd << dendl; - allocator.release(nodepool.usemap_even); - allocator.release(nodepool.usemap_odd); - nodepool.add_region(ex); - - // expand usemap? - nodepool.usemap_even = even; - nodepool.usemap_odd = odd; - nodepool.expand_usemap(); - } else { - dout (1) << "alloc_more_node_space failed to get space for new usemaps" << dendl; - allocator.release(ex); - allocator.release(even); - allocator.release(odd); - //assert(0); - } - } else { - dout(1) << "alloc_more_node_space already have max node regions!" 
<< dendl; - assert(0); - } -} - - -void *Ebofs::finisher_thread_entry() -{ - finisher_lock.Lock(); - dout(10) << "finisher_thread start" << dendl; - - while (!finisher_stop) { - while (!finisher_queue.empty()) { - list ls; - ls.swap(finisher_queue); - - finisher_lock.Unlock(); - - //ebofs_lock.Lock(); // um.. why lock this? -sage - finish_contexts(ls, 0); - //ebofs_lock.Unlock(); - - finisher_lock.Lock(); - } - if (finisher_stop) break; - - dout(30) << "finisher_thread sleeping" << dendl; - finisher_cond.Wait(finisher_lock); - } - - dout(10) << "finisher_thread start" << dendl; - finisher_lock.Unlock(); - return 0; -} - - -// *** onodes *** - -Onode* Ebofs::new_onode(object_t oid) -{ - Onode* on = new Onode(oid); - - assert(onode_map.count(oid) == 0); - onode_map[oid] = on; - onode_lru.lru_insert_top(on); - - assert(object_tab->lookup(oid) < 0); - object_tab->insert( oid, on->onode_loc ); // even tho i'm not placed yet - - on->get(); - on->onode_loc.start = 0; - on->onode_loc.length = 0; - - dirty_onode(on); - - dout(7) << "new_onode " << *on << dendl; - return on; -} - - -Onode* Ebofs::get_onode(object_t oid) -{ - while (1) { - // in cache? - if (have_onode(oid)) { - // yay - Onode *on = onode_map[oid]; - on->get(); - //dout(0) << "get_onode " << *on << dendl; - return on; - } - - // on disk? - Extent onode_loc; - if (object_tab->lookup(oid, onode_loc) < 0) { - dout(10) << "onode lookup failed on " << oid << dendl; - // object dne. - return 0; - } - - // already loading? - if (waitfor_onode.count(oid)) { - // yep, just wait. - Cond c; - waitfor_onode[oid].push_back(&c); - dout(10) << "get_onode " << oid << " already loading, waiting" << dendl; - c.Wait(ebofs_lock); - continue; - } - - dout(10) << "get_onode reading " << oid << " from " << onode_loc << dendl; - - assert(waitfor_onode.count(oid) == 0); - waitfor_onode[oid].clear(); // this should be empty initially. - - // read it! 
- bufferlist bl; - bl.push_back( buffer::create_page_aligned( EBOFS_BLOCK_SIZE*onode_loc.length ) ); - - ebofs_lock.Unlock(); - dev.read( onode_loc.start, onode_loc.length, bl ); - ebofs_lock.Lock(); - - // add onode - Onode *on = new Onode(oid); - onode_map[oid] = on; - onode_lru.lru_insert_top(on); - - // parse data block - struct ebofs_onode *eo = (struct ebofs_onode*)bl.c_str(); - if (eo->object_id != oid) { - dout(0) << " wrong oid in onode block: " << eo->object_id << " != " << oid << dendl; - dout(0) << " onode_loc is " << eo->onode_loc << dendl; - dout(0) << " object_size " << eo->object_size << dendl; - dout(0) << " object_blocks " << eo->object_blocks << dendl; - dout(0) << " " << eo->num_collections << " coll + " - << eo->num_attr << " attr + " - << eo->num_extents << " extents" << dendl; - assert(eo->object_id == oid); - } - on->readonly = eo->readonly; - on->onode_loc = eo->onode_loc; - on->object_size = eo->object_size; - on->object_blocks = eo->object_blocks; - - // parse - char *p = bl.c_str() + sizeof(*eo); - - // parse collection list - for (int i=0; inum_collections; i++) { - coll_t c = *((coll_t*)p); - p += sizeof(c); - on->collections.insert(c); - } - - // parse attributes - for (int i=0; inum_attr; i++) { - string key = p; - p += key.length() + 1; - int len = *(int*)(p); - p += sizeof(len); - on->attr[key] = buffer::copy(p, len); - p += len; - dout(15) << "get_onode " << *on << " attr " << key << " len " << len << dendl; - } - - // parse extents - on->extent_map.clear(); - block_t n = 0; - for (int i=0; inum_extents; i++) { - Extent ex = *((Extent*)p); - on->extent_map[n] = ex; - dout(15) << "get_onode " << *on << " ex " << i << ": " << ex << dendl; - n += ex.length; - p += sizeof(Extent); - } - assert(n == on->object_blocks); - - // wake up other waiters - for (list::iterator i = waitfor_onode[oid].begin(); - i != waitfor_onode[oid].end(); - i++) - (*i)->Signal(); - waitfor_onode.erase(oid); // remove Cond list - - on->get(); - //dout(0) << 
"get_onode " << *on << " (loaded)" << dendl; - return on; - } -} - - -class C_E_InodeFlush : public BlockDevice::callback { - Ebofs *ebofs; -public: - C_E_InodeFlush(Ebofs *e) : ebofs(e) {} - void finish(ioh_t ioh, int r) { - ebofs->flush_inode_finish(); - } -}; - - -void Ebofs::encode_onode(Onode *on, bufferlist& bl, unsigned& off) -{ - // onode - struct ebofs_onode eo; - eo.readonly = on->readonly; - eo.onode_loc = on->onode_loc; - eo.object_id = on->object_id; - eo.object_size = on->object_size; - eo.object_blocks = on->object_blocks; - eo.num_collections = on->collections.size(); - eo.num_attr = on->attr.size(); - eo.num_extents = on->extent_map.size(); - bl.copy_in(off, sizeof(eo), (char*)&eo); - off += sizeof(eo); - - // collections - for (set::iterator i = on->collections.begin(); - i != on->collections.end(); - i++) { - bl.copy_in(off, sizeof(*i), (char*)&(*i)); - off += sizeof(*i); - } - - // attr - for (map::iterator i = on->attr.begin(); - i != on->attr.end(); - i++) { - bl.copy_in(off, i->first.length()+1, i->first.c_str()); - off += i->first.length()+1; - int l = i->second.length(); - bl.copy_in(off, sizeof(int), (char*)&l); - off += sizeof(int); - bl.copy_in(off, l, i->second.c_str()); - off += l; - dout(15) << "write_onode " << *on << " attr " << i->first << " len " << l << dendl; - } - - // extents - for (map::iterator i = on->extent_map.begin(); - i != on->extent_map.end(); - i++) { - bl.copy_in(off, sizeof(Extent), (char*)&(i->second)); - off += sizeof(Extent); - dout(15) << "write_onode " << *on << " ex " << i->first << ": " << i->second << dendl; - } -} - -void Ebofs::write_onode(Onode *on) -{ - // buffer - unsigned bytes = sizeof(ebofs_onode) + on->get_collection_bytes() + on->get_attr_bytes() + on->get_extent_bytes(); - unsigned blocks = (bytes-1)/EBOFS_BLOCK_SIZE + 1; - - bufferlist bl; - bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*blocks) ); - - // (always) relocate onode - if (1) { - if (on->onode_loc.length) - 
allocator.release(on->onode_loc); - - block_t first = 0; - if (on->extent_map.size()) - first = on->extent_map.begin()->second.start; - - allocator.allocate(on->onode_loc, blocks, first); - object_tab->remove( on->object_id ); - object_tab->insert( on->object_id, on->onode_loc ); - //object_tab->verify(); - } - - dout(10) << "write_onode " << *on << " to " << on->onode_loc << dendl; - - unsigned off = 0; - encode_onode(on, bl, off); - assert(off == bytes); - - // write - dev.write( on->onode_loc.start, on->onode_loc.length, bl, - new C_E_InodeFlush(this), "write_onode" ); -} - -void Ebofs::remove_onode(Onode *on) -{ - dout(8) << "remove_onode " << *on << dendl; - - assert(on->get_ref_count() >= 1); // caller - - // tear down buffer cache - if (on->oc) { - on->oc->truncate(0, super_epoch); // this will kick readers along the way. - on->close_oc(); - } - - // remove from onode map, mark dangling/deleted - onode_map.erase(on->object_id); - onode_lru.lru_remove(on); - on->deleted = true; - on->dangling = true; - - // remove from object table - //dout(0) << "remove_onode on " << *on << dendl; - object_tab->remove(on->object_id); - - // free onode space - if (on->onode_loc.length) - allocator.release(on->onode_loc); - - // free data space - for (map::iterator i = on->extent_map.begin(); - i != on->extent_map.end(); - i++) - allocator.release(i->second); - on->extent_map.clear(); - - // remove from collections - for (set::iterator i = on->collections.begin(); - i != on->collections.end(); - i++) { - co_tab->remove(coll_object_t(*i,on->object_id)); - } - on->collections.clear(); - - // dirty -> clean? 
- if (on->is_dirty()) { - on->mark_clean(); // this unpins *on - dirty_onodes.erase(on); - } - - if (on->get_ref_count() > 1) dout(10) << "remove_onode **** will survive " << *on << dendl; - put_onode(on); - - dirty = true; -} - -void Ebofs::put_onode(Onode *on) -{ - on->put(); - //dout(0) << "put_onode " << *on << dendl; - - if (on->get_ref_count() == 0 && on->dangling) { - //dot(0) << " *** hosing on " << *on << dendl; - delete on; - } -} - -void Ebofs::dirty_onode(Onode *on) -{ - if (!on->is_dirty()) { - on->mark_dirty(); - dirty_onodes.insert(on); - } - dirty = true; -} - -void Ebofs::trim_inodes(int max) -{ - unsigned omax = onode_lru.lru_get_max(); - unsigned cmax = cnode_lru.lru_get_max(); - if (max >= 0) omax = cmax = max; - dout(10) << "trim_inodes start " << onode_lru.lru_get_size() << " / " << omax << " onodes, " - << cnode_lru.lru_get_size() << " / " << cmax << " cnodes" << dendl; - - // onodes - while (onode_lru.lru_get_size() > omax) { - // expire an item - Onode *on = (Onode*)onode_lru.lru_expire(); - if (on == 0) break; // nothing to expire - - // expire - dout(20) << "trim_inodes removing onode " << *on << dendl; - onode_map.erase(on->object_id); - on->dangling = true; - - if (on->get_ref_count() == 0) { - assert(on->oc == 0); // an open oc pins the onode! - delete on; - } else { - dout(-20) << "trim_inodes still active: " << *on << dendl; - assert(0); // huh? 
- } - } - - - // cnodes - while (cnode_lru.lru_get_size() > cmax) { - // expire an item - Cnode *cn = (Cnode*)cnode_lru.lru_expire(); - if (cn == 0) break; // nothing to expire - - // expire - dout(20) << "trim_inodes removing cnode " << *cn << dendl; - cnode_map.erase(cn->coll_id); - - delete cn; - } - - dout(10) << "trim_inodes finish " - << onode_lru.lru_get_size() << " / " << omax << " onodes, " - << cnode_lru.lru_get_size() << " / " << cmax << " cnodes" << dendl; -} - - - -// *** cnodes **** - -Cnode* Ebofs::new_cnode(coll_t cid) -{ - Cnode* cn = new Cnode(cid); - - assert(cnode_map.count(cid) == 0); - cnode_map[cid] = cn; - cnode_lru.lru_insert_top(cn); - - assert(collection_tab->lookup(cid) < 0); - collection_tab->insert( cid, cn->cnode_loc ); // even tho i'm not placed yet - - cn->get(); - cn->cnode_loc.start = 0; - cn->cnode_loc.length = 0; - - dirty_cnode(cn); - - return cn; -} - -Cnode* Ebofs::get_cnode(coll_t cid) -{ - while (1) { - // in cache? - if (cnode_map.count(cid)) { - // yay - Cnode *cn = cnode_map[cid]; - cn->get(); - return cn; - } - - // on disk? - Extent cnode_loc; - if (collection_tab->lookup(cid, cnode_loc) < 0) { - // object dne. - return 0; - } - - // already loading? - if (waitfor_cnode.count(cid)) { - // yep, just wait. - Cond c; - waitfor_cnode[cid].push_back(&c); - dout(10) << "get_cnode " << cid << " already loading, waiting" << dendl; - c.Wait(ebofs_lock); - continue; - } - - dout(10) << "get_cnode reading " << cid << " from " << cnode_loc << dendl; - - assert(waitfor_cnode.count(cid) == 0); - waitfor_cnode[cid].clear(); // this should be empty initially. - - // read it! 
- bufferlist bl; - //bufferpool.alloc( EBOFS_BLOCK_SIZE*cnode_loc.length, bl ); - bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*cnode_loc.length) ); - - ebofs_lock.Unlock(); - dev.read( cnode_loc.start, cnode_loc.length, bl ); - ebofs_lock.Lock(); - - // parse data block - Cnode *cn = new Cnode(cid); - - cnode_map[cid] = cn; - cnode_lru.lru_insert_top(cn); - - struct ebofs_cnode *ec = (struct ebofs_cnode*)bl.c_str(); - cn->cnode_loc = ec->cnode_loc; - - // parse attributes - char *p = bl.c_str() + sizeof(*ec); - for (int i=0; inum_attr; i++) { - string key = p; - p += key.length() + 1; - int len = *(int*)(p); - p += sizeof(len); - cn->attr[key] = buffer::copy(p, len); - p += len; - dout(15) << "get_cnode " << *cn << " attr " << key << " len " << len << dendl; - } - - // wake up other waiters - for (list::iterator i = waitfor_cnode[cid].begin(); - i != waitfor_cnode[cid].end(); - i++) - (*i)->Signal(); - waitfor_cnode.erase(cid); // remove Cond list - - cn->get(); - return cn; - } -} - -void Ebofs::encode_cnode(Cnode *cn, bufferlist& bl, unsigned& off) -{ - // cnode - struct ebofs_cnode ec; - ec.cnode_loc = cn->cnode_loc; - ec.coll_id = cn->coll_id; - ec.num_attr = cn->attr.size(); - bl.copy_in(off, sizeof(ec), (char*)&ec); - off += sizeof(ec); - - // attr - for (map::iterator i = cn->attr.begin(); - i != cn->attr.end(); - i++) { - bl.copy_in(off, i->first.length()+1, i->first.c_str()); - off += i->first.length()+1; - int len = i->second.length(); - bl.copy_in(off, sizeof(int), (char*)&len); - off += sizeof(int); - bl.copy_in(off, len, i->second.c_str()); - off += len; - - dout(15) << "write_cnode " << *cn << " attr " << i->first << " len " << len << dendl; - } -} - -void Ebofs::write_cnode(Cnode *cn) -{ - // allocate buffer - unsigned bytes = sizeof(ebofs_cnode) + cn->get_attr_bytes(); - unsigned blocks = (bytes-1)/EBOFS_BLOCK_SIZE + 1; - - bufferlist bl; - //bufferpool.alloc( EBOFS_BLOCK_SIZE*blocks, bl ); - bl.push_back( 
buffer::create_page_aligned(EBOFS_BLOCK_SIZE*blocks) ); - - // (always) relocate cnode! - if (1) { - if (cn->cnode_loc.length) - allocator.release(cn->cnode_loc); - - allocator.allocate(cn->cnode_loc, blocks, Allocator::NEAR_LAST_FWD); - collection_tab->remove( cn->coll_id ); - collection_tab->insert( cn->coll_id, cn->cnode_loc ); - } - - dout(10) << "write_cnode " << *cn << " to " << cn->cnode_loc << dendl; - - unsigned off = 0; - encode_cnode(cn, bl, off); - assert(off == bytes); - - // write - dev.write( cn->cnode_loc.start, cn->cnode_loc.length, bl, - new C_E_InodeFlush(this), "write_cnode" ); -} - -void Ebofs::remove_cnode(Cnode *cn) -{ - dout(10) << "remove_cnode " << *cn << dendl; - - // remove from table - collection_tab->remove(cn->coll_id); - - // free cnode space - if (cn->cnode_loc.length) - allocator.release(cn->cnode_loc); - - // remove from dirty list? - if (cn->is_dirty()) - dirty_cnodes.erase(cn); - - // remove from map and lru - cnode_map.erase(cn->coll_id); - cnode_lru.lru_remove(cn); - - // count down refs - cn->mark_clean(); - cn->put(); - assert(cn->get_ref_count() == 0); - - // hose. 
- delete cn; - - dirty = true; -} - -void Ebofs::put_cnode(Cnode *cn) -{ - cn->put(); -} - -void Ebofs::dirty_cnode(Cnode *cn) -{ - if (!cn->is_dirty()) { - cn->mark_dirty(); - dirty_cnodes.insert(cn); - } - dirty = true; -} - - - - - -void Ebofs::flush_inode_finish() -{ - ebofs_lock.Lock(); - { - inodes_flushing--; - if (inodes_flushing < 1000) - dout(20) << "flush_inode_finish, " << inodes_flushing << " left" << dendl; - if (inodes_flushing == 0) - inode_commit_cond.Signal(); - } - ebofs_lock.Unlock(); -} - -void Ebofs::commit_inodes_start() -{ - dout(10) << "commit_inodes_start" << dendl; - - assert(inodes_flushing == 0); - - // onodes - for (set::iterator i = dirty_onodes.begin(); - i != dirty_onodes.end(); - i++) { - Onode *on = *i; - inodes_flushing++; - write_onode(on); - on->mark_clean(); - on->uncommitted.clear(); // commit allocated blocks - on->commit_waiters.clear(); // these guys are gonna get taken care of, bc we committed. - } - dirty_onodes.clear(); - - // cnodes - for (set::iterator i = dirty_cnodes.begin(); - i != dirty_cnodes.end(); - i++) { - Cnode *cn = *i; - inodes_flushing++; - write_cnode(cn); - cn->mark_clean(); - } - dirty_cnodes.clear(); - - dout(10) << "commit_inodes_start writing " << inodes_flushing << " onodes+cnodes" << dendl; -} - -void Ebofs::commit_inodes_wait() -{ - // caller must hold ebofs_lock - while (inodes_flushing > 0) { - dout(10) << "commit_inodes_wait waiting for " << inodes_flushing << " onodes+cnodes to flush" << dendl; - inode_commit_cond.Wait(ebofs_lock); - } - dout(10) << "commit_inodes_wait all flushed" << dendl; -} - - - - - - - -// *** buffer cache *** - -void Ebofs::trim_buffer_cache() -{ - ebofs_lock.Lock(); - trim_bc(0); - ebofs_lock.Unlock(); -} - -void Ebofs::trim_bc(off_t max) -{ - if (max < 0) - max = g_conf.ebofs_bc_size; - dout(10) << "trim_bc start: size " << bc.get_size() << ", trimmable " << bc.get_trimmable() << ", max " << max << dendl; - - while (bc.get_size() > max && - bc.get_trimmable()) { - 
BufferHead *bh = (BufferHead*) bc.lru_rest.lru_expire(); - if (!bh) break; - - dout(25) << "trim_bc trimming " << *bh << dendl; - assert(bh->is_clean()); - - ObjectCache *oc = bh->oc; - bc.remove_bh(bh); - delete bh; - - if (oc->is_empty()) { - Onode *on = oc->on; - dout(10) << "trim_bc closing oc on " << *on << dendl; - on->close_oc(); - } - } - - dout(10) << "trim_bc finish: size " << bc.get_size() << ", trimmable " << bc.get_trimmable() << ", max " << max << dendl; -} - - -void Ebofs::kick_idle() -{ - dout(10) << "kick_idle" << dendl; - //commit_cond.Signal(); - - ebofs_lock.Lock(); - if (mounted && !unmounting && dirty) { - dout(10) << "kick_idle dirty, doing commit" << dendl; - commit_cond.Signal(); - } else { - dout(10) << "kick_idle !dirty or !mounted or unmounting, doing nothing" << dendl; - } - ebofs_lock.Unlock(); -} - -void Ebofs::sync(Context *onsafe) -{ - ebofs_lock.Lock(); - if (onsafe) { - dirty = true; - - while (1) { - if (journal) { - // journal empty transaction - Transaction t; - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - commit_waiters[super_epoch].push_back(onsafe); - break; - } - } - ebofs_lock.Unlock(); -} - -void Ebofs::sync() -{ - ebofs_lock.Lock(); - if (!dirty) { - dout(7) << "sync in " << super_epoch << ", not dirty" << dendl; - } else { - epoch_t start = super_epoch; - dout(7) << "sync start in " << start << dendl; - while (super_epoch == start) { - dout(7) << "sync kicking commit in " << super_epoch << dendl; - dirty = true; - commit_cond.Signal(); - sync_cond.Wait(ebofs_lock); - } - dout(10) << "sync finish in " << super_epoch << dendl; - } - ebofs_lock.Unlock(); -} - - - -void Ebofs::commit_bc_wait(version_t epoch) -{ - dout(10) << "commit_bc_wait on epoch " << epoch << dendl; - - while (bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE,epoch) > 0 || - bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL,epoch) > 0) { - //dout(10) << "commit_bc_wait " << bc.get_unflushed(epoch) << " unflushed in epoch " << epoch 
<< dendl; - dout(10) << "commit_bc_wait epoch " << epoch - << ", unflushed bhwrite " << bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE) - << ", unflushed partial " << bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL) - << dendl; - bc.waitfor_flush(); - } - - bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE).erase(epoch); - bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL).erase(epoch); - - dout(10) << "commit_bc_wait all flushed for epoch " << epoch - << "; " << bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE) - << " " << bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL) - << dendl; -} - - - -int Ebofs::statfs(struct statfs *buf) -{ - dout(7) << "statfs" << dendl; - - buf->f_type = EBOFS_MAGIC; /* type of filesystem */ - buf->f_bsize = 4096; /* optimal transfer block size */ - buf->f_blocks = dev.get_num_blocks(); /* total data blocks in file system */ - buf->f_bfree = get_free_blocks() - + get_limbo_blocks(); /* free blocks in fs */ - buf->f_bavail = get_free_blocks(); /* free blocks avail to non-superuser -- actually, for writing. */ - buf->f_files = nodepool.get_num_total(); /* total file nodes in file system */ - buf->f_ffree = nodepool.get_num_free(); /* free file nodes in fs */ - //buf->f_fsid = 0; /* file system id */ -#ifndef DARWIN - buf->f_namelen = 8; /* maximum length of filenames */ -#endif // DARWIN - - return 0; -} - - - - -/* - * allocate a write to blocks on disk. - * - take care to not overwrite any "safe" data blocks. - * - allocate/map new extents on disk as necessary - */ -void Ebofs::alloc_write(Onode *on, - block_t start, block_t len, - interval_set& alloc, - block_t& old_bfirst, block_t& old_blast) -{ - // first decide what pages to (re)allocate - alloc.insert(start, len); // start with whole range - - // figure out what bits are already uncommitted - interval_set already_uncom; - already_uncom.intersection_of(alloc, on->uncommitted); - - // subtract those off, so we're left with the committed bits (that must be reallocated). 
- alloc.subtract(already_uncom); - - dout(10) << "alloc_write must (re)alloc " << alloc << " on " << *on << dendl; - - // release it (into limbo) - for (map::iterator i = alloc.m.begin(); - i != alloc.m.end(); - i++) { - // get old region - vector old; - on->map_extents(i->first, i->second, old); - for (unsigned o=0; ofirst == start) { - old_bfirst = old[0].start; - dout(20) << "alloc_write old_bfirst " << old_bfirst << " of " << old[0] << dendl; - } - if (i->first+i->second == start+len) { - old_blast = old[old.size()-1].last(); - dout(20) << "alloc_write old_blast " << old_blast << " of " << old[old.size()-1] << dendl; - } - } - } - - // reallocate uncommitted too? - // ( --> yes. we can always make better allocation decisions later, with more information. ) - if (g_conf.ebofs_realloc) { - list tx; - - ObjectCache *oc = on->get_oc(&bc); - oc->find_tx(start, len, tx); - - for (list::reverse_iterator p = tx.rbegin(); - p != tx.rend(); - p++) { - BufferHead *bh = *p; - - // cancelable/moveable? - if (alloc.contains(bh->start(), bh->length())) { - dout(10) << "alloc_write " << *bh << " already in " << alloc << dendl; - continue; - } - - vector old; - on->map_extents(bh->start(), bh->length(), old); - assert(old.size() == 1); - - if (bh->start() >= start && bh->end() <= start+len) { - assert(bh->epoch_modified == super_epoch); - if (bc.bh_cancel_write(bh, super_epoch)) { - if (bh->length() == 1) - dout(10) << "alloc_write unallocated tx " << old[0] << ", canceled " << *bh << dendl; - // no, this isn't compatible with clone() and extent reference counting. - //allocator.unallocate(old[0]); // release (into free) - allocator.release(old[0]); // **FIXME** no cloning yet, my friend! 
- alloc.insert(bh->start(), bh->length()); - } else { - if (bh->length() == 1) - dout(10) << "alloc_write released tx " << old[0] << ", couldn't cancel " << *bh << dendl; - allocator.release(old[0]); // release (into limbo) - alloc.insert(bh->start(), bh->length()); - } - } else { - if (bh->length() == 1) - dout(10) << "alloc_write skipped tx " << old[0] << ", not entirely within " - << start << "~" << len - << " bh " << *bh << dendl; - } - } - - dout(10) << "alloc_write will (re)alloc " << alloc << " on " << *on << dendl; - } - - if (alloc.empty()) return; // no need to dirty the onode below! - - - // merge alloc into onode uncommitted map - //dout(10) << " union of " << on->uncommitted << " and " << alloc << dendl; - interval_set old = on->uncommitted; - on->uncommitted.union_of(alloc); - - dout(10) << "alloc_write onode.uncommitted is now " << on->uncommitted << dendl; - - if (0) { - // verify - interval_set ta; - ta.intersection_of(on->uncommitted, alloc); - dout(0) << " ta " << ta << dendl; - assert(alloc == ta); - - interval_set tb; - tb.intersection_of(on->uncommitted, old); - dout(0) << " tb " << tb << dendl; - assert(old == tb); - } - - dirty_onode(on); - - // allocate the space - for (map::iterator i = alloc.m.begin(); - i != alloc.m.end(); - i++) { - dout(15) << "alloc_write alloc " << i->first << "~" << i->second << " (of " << start << "~" << len << ")" << dendl; - - // allocate new space - block_t left = i->second; - block_t cur = i->first; - while (left > 0) { - Extent ex; - allocator.allocate(ex, left, Allocator::NEAR_LAST_FWD); - dout(10) << "alloc_write got " << ex << " for object offset " << cur << dendl; - on->set_extent(cur, ex); // map object to new region - left -= ex.length; - cur += ex.length; - } - } -} - - - - -void Ebofs::apply_write(Onode *on, off_t off, size_t len, const bufferlist& bl) -{ - ObjectCache *oc = on->get_oc(&bc); - - // map into blocks - off_t opos = off; // byte pos in object - size_t zleft = 0; // zeros left to write - 
size_t left = len; // bytes left - - block_t bstart = off / EBOFS_BLOCK_SIZE; - - if (off > on->object_size) { - zleft = off - on->object_size; - opos = on->object_size; - bstart = on->object_size / EBOFS_BLOCK_SIZE; - } - if (off+(off_t)len > on->object_size) { - dout(10) << "apply_write extending size on " << *on << ": " << on->object_size - << " -> " << off+len << dendl; - on->object_size = off+len; - dirty_onode(on); - } - if (bl.length() == 0) { - zleft += len; - left = 0; - } else { - assert(bl.length() == len); - } - if (zleft) - dout(10) << "apply_write zeroing " << zleft << " bytes before " << off << "~" << len - << " in " << *on << dendl; - - block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; - block_t blen = blast-bstart+1; - - // allocate write on disk. - interval_set alloc; - block_t old_bfirst = 0; // zero means not defined here (since we ultimately pass to bh_read) - block_t old_blast = 0; - alloc_write(on, bstart, blen, alloc, old_bfirst, old_blast); - dout(20) << "apply_write old_bfirst " << old_bfirst << ", old_blast " << old_blast << dendl; - - if (fake_writes) { - on->uncommitted.clear(); // worst case! - return; - } - - // map b range onto buffer_heads - map hits; - oc->map_write(bstart, blen, hits, super_epoch); - - // get current versions - //version_t lowv, highv; - //oc->scan_versions(bstart, blen, lowv, highv); - //highv++; - version_t highv = ++oc->write_count; - - // copy from bl into buffer cache - unsigned blpos = 0; // byte pos in input buffer - - // write data into buffers - for (map::iterator i = hits.begin(); - i != hits.end(); - i++) { - BufferHead *bh = i->second; - bh->set_version(highv); - bh->epoch_modified = super_epoch; - - // old write in progress? 
- if (bh->is_tx()) { // copy the buffer to avoid munging up in-flight write - dout(10) << "apply_write tx pending, copying buffer on " << *bh << dendl; - bufferlist temp; - temp.claim(bh->data); - //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); - bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - bh->data.copy_in(0, bh->length()*EBOFS_BLOCK_SIZE, temp); - } - - // need to split off partial? (partials can only be ONE block) - if ((bh->is_missing() || bh->is_rx()) && bh->length() > 1) { - if ((bh->start() == bstart && opos % EBOFS_BLOCK_SIZE != 0)) { - BufferHead *right = bc.split(bh, bh->start()+1); - hits[right->start()] = right; - dout(10) << "apply_write split off left block for partial write; rest is " << *right << dendl; - } - if ((bh->last() == blast && (len+off) % EBOFS_BLOCK_SIZE != 0) && - ((off_t)len+off < on->object_size)) { - BufferHead *right = bc.split(bh, bh->last()); - hits[right->start()] = right; - dout(10) << "apply_write split off right block for upcoming partial write; rest is " << *right << dendl; - } - } - - // partial at head or tail? - if ((bh->start() == bstart && opos % EBOFS_BLOCK_SIZE != 0) || // opos, not off, in case we're zeroing... 
- (bh->last() == blast && ((off_t)len+off) % EBOFS_BLOCK_SIZE != 0 && ((off_t)len+off) < on->object_size)) { - // locate ourselves in bh - unsigned off_in_bh = opos - bh->start()*EBOFS_BLOCK_SIZE; - assert(off_in_bh >= 0); - unsigned len_in_bh = MIN( (off_t)(zleft+left), - (off_t)(bh->end()*EBOFS_BLOCK_SIZE)-opos ); - - if (bh->is_partial() || bh->is_rx() || bh->is_missing()) { - assert(bh->is_partial() || bh->is_rx() || bh->is_missing()); - assert(bh->length() == 1); - - // add frag to partial - dout(10) << "apply_write writing into partial " << *bh << ":" - << " off_in_bh " << off_in_bh - << " len_in_bh " << len_in_bh - << dendl; - unsigned z = MIN( zleft, len_in_bh ); - if (z) { - bufferptr zp(z); - zp.zero(); - bufferlist zb; - zb.push_back(zp); - bh->add_partial(off_in_bh, zb); - zleft -= z; - opos += z; - } - - bufferlist sb; - sb.substr_of(bl, blpos, len_in_bh-z); // substr in existing buffer - bufferlist cp; - cp.append(sb.c_str(), len_in_bh-z); // copy the partial bit! - bh->add_partial(off_in_bh, cp); - left -= len_in_bh-z; - blpos += len_in_bh-z; - opos += len_in_bh-z; - - if (bh->partial_is_complete(on->object_size - bh->start()*EBOFS_BLOCK_SIZE)) { - dout(10) << "apply_write completed partial " << *bh << dendl; - //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); // new buffers! - bh->data.clear(); - bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - bh->data.zero(); - bh->apply_partial(); - bc.mark_dirty(bh); - bc.bh_write(on, bh); - } - else if (bh->is_rx()) { - dout(10) << "apply_write rx -> partial " << *bh << dendl; - assert(bh->length() == 1); - bc.mark_partial(bh); - bc.bh_queue_partial_write(on, bh); // queue the eventual write - } - else if (bh->is_missing()) { - dout(10) << "apply_write missing -> partial " << *bh << dendl; - assert(bh->length() == 1); - bc.mark_partial(bh); - - // take care to read from _old_ disk block locations! 
- if (bh->start() == bstart) - bc.bh_read(on, bh, old_bfirst); - else if (bh->start() == blast) - bc.bh_read(on, bh, old_blast); - else assert(0); - - bc.bh_queue_partial_write(on, bh); // queue the eventual write - } - else if (bh->is_partial()) { - dout(10) << "apply_write already partial, no need to submit rx on " << *bh << dendl; - if (bh->partial_tx_epoch == super_epoch) - bc.bh_cancel_partial_write(bh); - bc.bh_queue_partial_write(on, bh); // queue the eventual write - } - - - } else { - assert(bh->is_clean() || bh->is_dirty() || bh->is_tx()); - - // just write into the bh! - dout(10) << "apply_write writing leading/tailing partial into " << *bh << ":" - << " off_in_bh " << off_in_bh - << " len_in_bh " << len_in_bh - << dendl; - - // copy data into new buffers first (copy on write!) - // FIXME: only do the modified pages? this might be a big bh! - bufferlist temp; - temp.claim(bh->data); - //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); - bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - bh->data.copy_in(0, bh->length()*EBOFS_BLOCK_SIZE, temp); - - unsigned z = MIN( zleft, len_in_bh ); - if (z) { - bufferptr zp(z); - zp.zero(); - bufferlist zb; - zb.push_back(zp); - bh->data.copy_in(off_in_bh, z, zb); - zleft -= z; - opos += z; - } - - bufferlist sub; - sub.substr_of(bl, blpos, len_in_bh-z); - bh->data.copy_in(off_in_bh+z, len_in_bh-z, sub); - - blpos += len_in_bh-z; - left -= len_in_bh-z; - opos += len_in_bh-z; - - if (!bh->is_dirty()) - bc.mark_dirty(bh); - - bc.bh_write(on, bh); - } - continue; - } - - // ok, we're talking full block(s) now (modulo last block of the object) - assert(opos % EBOFS_BLOCK_SIZE == 0); - assert((off_t)(zleft+left) >= (off_t)(EBOFS_BLOCK_SIZE*bh->length()) || - opos+(off_t)(zleft+left) == on->object_size); - - unsigned len_in_bh = MIN(bh->length()*EBOFS_BLOCK_SIZE, zleft+left); - assert(len_in_bh <= zleft+left); - - dout(10) << "apply_write writing into " << *bh << ":" - << " 
len_in_bh " << len_in_bh - << dendl; - - // i will write: - unsigned z = MIN(len_in_bh, zleft); - bufferlist sub; - sub.substr_of(bl, blpos, len_in_bh-z); - - if (!z && - sub.is_page_aligned() && - sub.is_n_page_sized()) { - // assume caller isn't going to modify written buffers. - // just refrence them! - dout(10) << "apply_write yippee, written buffer already page aligned" << dendl; - bh->data.claim(sub); - } else { - // alloc new buffers. - bh->data.clear(); - bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - - if (z) { - bufferptr zp(z); - zp.zero(); - bufferlist zb; - zb.push_back(zp); - bh->data.copy_in(0, z, zb); - zleft -= z; - } - - bufferlist sub; - sub.substr_of(bl, blpos, len_in_bh-z); - bh->data.copy_in(z, len_in_bh-z, sub); - } - - blpos += len_in_bh-z; - left -= len_in_bh-z; - opos += len_in_bh; - - // old partial? - if (bh->is_partial() && - bh->partial_tx_epoch == super_epoch) - bc.bh_cancel_partial_write(bh); - - // mark dirty - if (!bh->is_dirty()) - bc.mark_dirty(bh); - - bc.bh_write(on, bh); - } - - assert(zleft == 0); - assert(left == 0); - assert(opos == off+(off_t)len); - //assert(blpos == bl.length()); -} - - - - -// *** file i/o *** - -bool Ebofs::attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, - Cond *will_wait_on, bool *will_wait_on_bool) -{ - dout(10) << "attempt_read " << *on << " " << off << "~" << len << dendl; - ObjectCache *oc = on->get_oc(&bc); - - // map - block_t bstart = off / EBOFS_BLOCK_SIZE; - block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; - block_t blen = blast-bstart+1; - - map hits; - map missing; // read these - map rx; // wait for these - map partials; // ?? - oc->map_read(bstart, blen, hits, missing, rx, partials); - - // missing buffers? 
- if (!missing.empty()) { - for (map::iterator i = missing.begin(); - i != missing.end(); - i++) { - dout(10) << "attempt_read missing buffer " << *(i->second) << dendl; - bc.bh_read(on, i->second); - } - BufferHead *wait_on = missing.begin()->second; - block_t b = MAX(wait_on->start(), bstart); - wait_on->waitfor_read[b].push_back(new C_Cond(will_wait_on, will_wait_on_bool)); - return false; - } - - // are partials sufficient? - bool partials_ok = true; - for (map::iterator i = partials.begin(); - i != partials.end(); - i++) { - BufferHead *bh = i->second; - off_t bhstart = (off_t)(bh->start()*EBOFS_BLOCK_SIZE); - off_t bhend = (off_t)(bh->end()*EBOFS_BLOCK_SIZE); - off_t start = MAX( off, bhstart ); - off_t end = MIN( off+(off_t)len, bhend ); - - if (!i->second->have_partial_range(start-bhstart, end-bhend)) { - if (partials_ok) { - // wait on this one - Context *c = new C_Cond(will_wait_on, will_wait_on_bool); - dout(10) << "attempt_read insufficient partial buffer " << *(i->second) << " c " << c << dendl; - i->second->waitfor_read[i->second->start()].push_back(c); - } - partials_ok = false; - } - } - if (!partials_ok) return false; - - // wait on rx? - if (!rx.empty()) { - BufferHead *wait_on = rx.begin()->second; - Context *c = new C_Cond(will_wait_on, will_wait_on_bool); - dout(20) << "attempt_read waiting for read to finish on " << *wait_on << " c " << c << dendl; - block_t b = MAX(wait_on->start(), bstart); - wait_on->waitfor_read[b].push_back(c); - return false; - } - - // yay, we have it all! - // concurrently walk thru hits, partials. 
- map::iterator h = hits.begin(); - map::iterator p = partials.begin(); - - bl.clear(); - off_t pos = off; - block_t curblock = bstart; - while (curblock <= blast) { - BufferHead *bh = 0; - if (h->first == curblock) { - bh = h->second; - h++; - } else if (p->first == curblock) { - bh = p->second; - p++; - } else assert(0); - - off_t bhstart = (off_t)(bh->start()*EBOFS_BLOCK_SIZE); - off_t bhend = (off_t)(bh->end()*EBOFS_BLOCK_SIZE); - off_t start = MAX( pos, bhstart ); - off_t end = MIN( off+(off_t)len, bhend ); - - if (bh->is_partial()) { - // copy from a partial block. yuck! - bufferlist frag; - bh->copy_partial_substr( start-bhstart, end-bhstart, frag ); - bl.claim_append( frag ); - pos += frag.length(); - } else { - // copy from a full block. - if (bhstart == start && bhend == end) { - bl.append( bh->data ); - pos += bh->data.length(); - } else { - bufferlist frag; - dout(10) << "substr " << (start-bhstart) << "~" << (end-start) << " of " << bh->data.length() << " in " << *bh << dendl; - frag.substr_of(bh->data, start-bhstart, end-start); - pos += frag.length(); - bl.claim_append( frag ); - } - } - - curblock = bh->end(); - /* this assert is more trouble than it's worth - assert((off_t)(curblock*EBOFS_BLOCK_SIZE) == pos || // should be aligned with next block - end != bhend || // or we ended midway through bh - (bh->last() == blast && end == bhend)); // ended last block ** FIXME WRONG??? - */ - } - - assert(bl.length() == len); - return true; -} - - -/* - * is_cached -- query whether a object extent is in our cache - * return value of -1 if onode isn't loaded. otherwise, the number - * of extents that need to be read (i.e. # of seeks) - */ -int Ebofs::is_cached(object_t oid, off_t off, size_t len) -{ - ebofs_lock.Lock(); - int r = _is_cached(oid, off, len); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_is_cached(object_t oid, off_t off, size_t len) -{ - if (!have_onode(oid)) { - dout(7) << "_is_cached " << oid << " " << off << "~" << len << " ... 
onode " << dendl; - return -1; // object dne? - } - Onode *on = get_onode(oid); - - if (!on->have_oc()) { - // nothing is cached. return # of extents in file. - dout(10) << "_is_cached have onode but no object cache, returning extent count" << dendl; - return on->extent_map.size(); - } - - // map - block_t bstart = off / EBOFS_BLOCK_SIZE; - block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; - block_t blen = blast-bstart+1; - - map hits; - map missing; // read these - map rx; // wait for these - map partials; // ?? - - int num_missing = on->get_oc(&bc)->try_map_read(bstart, blen); - dout(7) << "_is_cached try_map_read reports " << num_missing << " missing extents" << dendl; - return num_missing; - - // FIXME: actually, we should calculate if these extents are contiguous. - // and not using map_read, probably... - /* hrmpf - block_t dpos = 0; - block_t opos = bstart; - while (opos < blen) { - if (hits.begin()->first == opos) { - } else { - block_t d; - if (missing.begin()->first == opos) d = missing.begin()->second. - - } - */ -} - -void Ebofs::trim_from_cache(object_t oid, off_t off, size_t len) -{ - ebofs_lock.Lock(); - _trim_from_cache(oid, off, len); - ebofs_lock.Unlock(); -} - -void Ebofs::_trim_from_cache(object_t oid, off_t off, size_t len) -{ - // be careful not to load it if we don't have it - if (!have_onode(oid)) { - dout(7) << "_trim_from_cache " << oid << " " << off << "~" << len << " ... onode not in cache " << dendl; - return; - } - - // ok, we have it, get a pointer. - Onode *on = get_onode(oid); - - if (!on->have_oc()) - return; // nothing is cached. 
- - // map to blocks - block_t bstart = off / EBOFS_BLOCK_SIZE; - block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; - - ObjectCache *oc = on->get_oc(&bc); - oc->touch_bottom(bstart, blast); - - return; -} - - -int Ebofs::read(object_t oid, - off_t off, size_t len, - bufferlist& bl) -{ - ebofs_lock.Lock(); - int r = _read(oid, off, len, bl); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_read(object_t oid, off_t off, size_t len, bufferlist& bl) -{ - dout(7) << "_read " << oid << " " << off << "~" << len << dendl; - - Onode *on = get_onode(oid); - if (!on) { - dout(7) << "_read " << oid << " " << off << "~" << len << " ... dne " << dendl; - return -ENOENT; // object dne? - } - - // read data into bl. block as necessary. - Cond cond; - - int r = 0; - while (1) { - // check size bound - if (off >= on->object_size) { - dout(7) << "_read " << oid << " " << off << "~" << len << " ... off past eof " << on->object_size << dendl; - r = -ESPIPE; // FIXME better errno? - break; - } - - size_t try_len = len ? len:on->object_size; - size_t will_read = MIN(off+(off_t)try_len, on->object_size) - off; - - bool done; - if (attempt_read(on, off, will_read, bl, &cond, &done)) - break; // yay - - // wait - while (!done) - cond.Wait(ebofs_lock); - - if (on->deleted) { - dout(7) << "_read " << oid << " " << off << "~" << len << " ... object deleted" << dendl; - r = -ENOENT; - break; - } - } - - put_onode(on); - - trim_bc(); - - if (r < 0) return r; // return error, - dout(7) << "_read " << oid << " " << off << "~" << len << " ... got " << bl.length() << dendl; - return bl.length(); // or bytes read. 
-} - - -bool Ebofs::_write_will_block() -{ - return (bc.get_stat_dirty()+bc.get_stat_tx() > g_conf.ebofs_bc_max_dirty); -} - -bool Ebofs::write_will_block() -{ - ebofs_lock.Lock(); - bool b = _write_will_block(); - ebofs_lock.Unlock(); - return b; -} - - -unsigned Ebofs::apply_transaction(Transaction& t, Context *onsafe) -{ - ebofs_lock.Lock(); - dout(7) << "apply_transaction start (" << t.get_num_ops() << " ops)" << dendl; - - unsigned r = _apply_transaction(t); - - // journal, wait for commit - if (r != 0 && onsafe) { - delete onsafe; // kill callback, but still journal below (in case transaction had side effects) - onsafe = 0; - } - while (1) { - if (journal) { - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - - ebofs_lock.Unlock(); - return r; -} - -unsigned Ebofs::_apply_transaction(Transaction& t) -{ - // do ops - unsigned r = 0; // bit fields indicate which ops failed. - int bit = 1; - while (t.have_op()) { - int op = t.get_op(); - switch (op) { - case Transaction::OP_READ: - { - object_t oid; - t.get_oid(oid); - off_t offset, len; - t.get_length(offset); - t.get_length(len); - bufferlist *pbl; - t.get_pbl(pbl); - if (_read(oid, offset, len, *pbl) < 0) { - dout(7) << "apply_transaction fail on _read" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_STAT: - { - object_t oid; - t.get_oid(oid); - struct stat *st; - t.get_pstat(st); - if (_stat(oid, st) < 0) { - dout(7) << "apply_transaction fail on _stat" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_GETATTR: - { - object_t oid; - t.get_oid(oid); - const char *attrname; - t.get_attrname(attrname); - pair pattrval; - t.get_pattrval(pattrval); - if ((*(pattrval.second) = _getattr(oid, attrname, pattrval.first, *(pattrval.second))) < 0) { - dout(7) << "apply_transaction fail on _getattr" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_GETATTRS: - { - 
object_t oid; - t.get_oid(oid); - map *pset; - t.get_pattrset(pset); - if (_getattrs(oid, *pset) < 0) { - dout(7) << "apply_transaction fail on _getattrs" << dendl; - r &= bit; - } - } - break; - - - case Transaction::OP_WRITE: - { - object_t oid; - t.get_oid(oid); - off_t offset, len; - t.get_length(offset); - t.get_length(len); - bufferlist bl; - t.get_bl(bl); - if (_write(oid, offset, len, bl) < 0) { - dout(7) << "apply_transaction fail on _write" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_TRIMCACHE: - { - object_t oid; - t.get_oid(oid); - off_t offset, len; - t.get_length(offset); - t.get_length(len); - _trim_from_cache(oid, offset, len); - } - break; - - case Transaction::OP_TRUNCATE: - { - object_t oid; - t.get_oid(oid); - off_t len; - t.get_length(len); - if (_truncate(oid, len) < 0) { - dout(7) << "apply_transaction fail on _truncate" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_REMOVE: - { - object_t oid; - t.get_oid(oid); - if (_remove(oid) < 0) { - dout(7) << "apply_transaction fail on _remove" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_SETATTR: - { - object_t oid; - t.get_oid(oid); - const char *attrname; - t.get_attrname(attrname); - bufferlist bl; - t.get_bl(bl); - if (_setattr(oid, attrname, bl.c_str(), bl.length()) < 0) { - dout(7) << "apply_transaction fail on _setattr" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_SETATTRS: - { - object_t oid; - t.get_oid(oid); - map *pattrset; - t.get_pattrset(pattrset); - if (_setattrs(oid, *pattrset) < 0) { - dout(7) << "apply_transaction fail on _setattrs" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_RMATTR: - { - object_t oid; - t.get_oid(oid); - const char *attrname; - t.get_attrname(attrname); - if (_rmattr(oid, attrname) < 0) { - dout(7) << "apply_transaction fail on _rmattr" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_CLONE: - { - object_t oid; - t.get_oid(oid); - object_t noid; - 
t.get_oid(noid); - if (_clone(oid, noid) < 0) { - dout(7) << "apply_transaction fail on _clone" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_MKCOLL: - { - coll_t cid; - t.get_cid(cid); - if (_create_collection(cid) < 0) { - dout(7) << "apply_transaction fail on _create_collection" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_RMCOLL: - { - coll_t cid; - t.get_cid(cid); - if (_destroy_collection(cid) < 0) { - dout(7) << "apply_transaction fail on _destroy_collection" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_COLL_ADD: - { - coll_t cid; - t.get_cid(cid); - object_t oid; - t.get_oid(oid); - if (_collection_add(cid, oid) < 0) { - //dout(7) << "apply_transaction fail on _collection_add" << dendl; - //r &= bit; - } - } - break; - - case Transaction::OP_COLL_REMOVE: - { - coll_t cid; - t.get_cid(cid); - object_t oid; - t.get_oid(oid); - if (_collection_remove(cid, oid) < 0) { - dout(7) << "apply_transaction fail on _collection_remove" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_COLL_SETATTR: - { - coll_t cid; - t.get_cid(cid); - const char *attrname; - t.get_attrname(attrname); - bufferlist bl; - t.get_bl(bl); - if (_collection_setattr(cid, attrname, bl.c_str(), bl.length()) < 0) { - //if (_collection_setattr(cid, attrname, attrval.first, attrval.second) < 0) { - dout(7) << "apply_transaction fail on _collection_setattr" << dendl; - r &= bit; - } - } - break; - - case Transaction::OP_COLL_RMATTR: - { - coll_t cid; - t.get_cid(cid); - const char *attrname; - t.get_attrname(attrname); - if (_collection_rmattr(cid, attrname) < 0) { - dout(7) << "apply_transaction fail on _collection_rmattr" << dendl; - r &= bit; - } - } - break; - - default: - dout(0) << "bad op " << op << dendl; - assert(0); - } - - bit = bit << 1; - } - - dout(7) << "_apply_transaction finish (r = " << r << ")" << dendl; - return r; -} - - - -int Ebofs::_write(object_t oid, off_t offset, size_t length, const bufferlist& bl) -{ - 
dout(7) << "_write " << oid << " " << offset << "~" << length << dendl; - assert(bl.length() == length); - - // too much unflushed dirty data? (if so, block!) - if (_write_will_block()) { - dout(10) << "_write blocking " - << oid << " " << offset << "~" << length - << " bc: " - << "size " << bc.get_size() - << ", trimmable " << bc.get_trimmable() - << ", max " << g_conf.ebofs_bc_size - << "; dirty " << bc.get_stat_dirty() - << ", tx " << bc.get_stat_tx() - << ", max dirty " << g_conf.ebofs_bc_max_dirty - << dendl; - - while (_write_will_block()) - bc.waitfor_stat(); // waits on ebofs_lock - - dout(10) << "_write unblocked " - << oid << " " << offset << "~" << length - << " bc: " - << "size " << bc.get_size() - << ", trimmable " << bc.get_trimmable() - << ", max " << g_conf.ebofs_bc_size - << "; dirty " << bc.get_stat_dirty() - << ", tx " << bc.get_stat_tx() - << ", max dirty " << g_conf.ebofs_bc_max_dirty - << dendl; - } - - // out of space? - unsigned max = (length+offset) / EBOFS_BLOCK_SIZE + 10; // very conservative; assumes we have to rewrite - max += dirty_onodes.size() + dirty_cnodes.size(); - if (max >= free_blocks) { - dout(1) << "write failing, only " << free_blocks << " blocks free, may need up to " << max << dendl; - return -ENOSPC; - } - - // get|create inode - Onode *on = get_onode(oid); - if (!on) on = new_onode(oid); // new inode! - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - dirty_onode(on); // dirty onode! - - // apply write to buffer cache - if (length > 0) - apply_write(on, offset, length, bl); - - // done. 
- put_onode(on); - trim_bc(); - - return length; -} - - -int Ebofs::write(object_t oid, - off_t off, size_t len, - const bufferlist& bl, Context *onsafe) -{ - ebofs_lock.Lock(); - - // go - int r = _write(oid, off, len, bl); - - // commit waiter - if (r > 0) { - assert((size_t)r == len); - while (1) { - if (journal) { - Transaction t; - t.write(oid, off, len, bl); - bufferlist tbl; - t._encode(tbl); - if (journal->submit_entry(tbl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - - -int Ebofs::_remove(object_t oid) -{ - dout(7) << "_remove " << oid << dendl; - - // get inode - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - // ok remove it! - remove_onode(on); - - return 0; -} - - -int Ebofs::remove(object_t oid, Context *onsafe) -{ - ebofs_lock.Lock(); - - // do it - int r = _remove(oid); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.remove(oid); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_truncate(object_t oid, off_t size) -{ - dout(7) << "_truncate " << oid << " size " << size << dendl; - - Onode *on = get_onode(oid); - if (!on) - return -ENOENT; - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - int r = 0; - if (size > on->object_size) { - r = -EINVAL; // whatever - } - else if (size < on->object_size) { - // change size - on->object_size = size; - dirty_onode(on); - - // free blocks - block_t nblocks = 0; - if (size) nblocks = 1 + (size-1) / EBOFS_BLOCK_SIZE; - if (on->object_blocks > nblocks) { - vector extra; - on->truncate_extents(nblocks, extra); - for (unsigned i=0; ioc) { - on->oc->truncate(on->object_blocks, super_epoch); - if 
(on->oc->is_empty()) - on->close_oc(); - } - - // update uncommitted - interval_set uncom; - if (nblocks > 0) { - interval_set left; - left.insert(0, nblocks); - uncom.intersection_of(left, on->uncommitted); - } - dout(10) << "uncommitted was " << on->uncommitted << " now " << uncom << dendl; - on->uncommitted = uncom; - - } - else { - assert(size == on->object_size); - } - - put_onode(on); - return r; -} - - -int Ebofs::truncate(object_t oid, off_t size, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _truncate(oid, size); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.truncate(oid, size); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - - - -int Ebofs::clone(object_t from, object_t to, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _clone(from, to); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.clone(from, to); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_clone(object_t from, object_t to) -{ - dout(7) << "_clone " << from << " -> " << to << dendl; - - if (!g_conf.ebofs_cloneable) - return -1; // no! 
- - Onode *fon = get_onode(from); - if (!fon) return -ENOENT; - Onode *ton = get_onode(to); - if (ton) { - put_onode(fon); - put_onode(ton); - return -EEXIST; - } - ton = new_onode(to); - assert(ton); - - // copy easy bits - ton->readonly = true; - ton->object_size = fon->object_size; - ton->object_blocks = fon->object_blocks; - ton->attr = fon->attr; - - // collections - for (set::iterator p = fon->collections.begin(); - p != fon->collections.end(); - p++) - _collection_add(*p, to); - - // extents - ton->extent_map = fon->extent_map; - for (map::iterator p = ton->extent_map.begin(); - p != ton->extent_map.end(); - ++p) { - allocator.alloc_inc(p->second); - } - - // clear uncommitted - fon->uncommitted.clear(); - - // muck with ObjectCache - if (fon->oc) - fon->oc->clone_to( ton ); - - // ok! - put_onode(ton); - put_onode(fon); - return 0; -} - - - - -/* - * pick object revision with rev < specified rev. - * (oid.rev is a noninclusive upper bound.) - * - */ -int Ebofs::pick_object_revision_lt(object_t& oid) -{ - assert(oid.rev > 0); // this is only useful for non-zero oid.rev - - int r = -EEXIST; // return code - ebofs_lock.Lock(); - { - object_t orig = oid; - object_t live = oid; - live.rev = 0; - - if (object_tab->get_num_keys() > 0) { - Table::Cursor cursor(object_tab); - - object_tab->find(oid, cursor); // this will be just _past_ highest eligible rev - if (cursor.move_left() > 0) { - bool firstpass = true; - while (1) { - object_t t = cursor.current().key; - if (t.ino != oid.ino || - t.bno != oid.bno) // passed to previous object - break; - if (oid.rev < t.rev) { // rev < desired. possible match. - r = 0; - oid = t; - break; - } - if (firstpass && oid.rev >= t.rev) { // there is no old rev < desired. try live. 
- r = 0; - oid = live; - break; - } - if (cursor.move_left() <= 0) break; - firstpass = false; - } - } - } - - dout(8) << "find_object_revision " << orig << " -> " << oid - << " r=" << r << dendl; - } - ebofs_lock.Unlock(); - return r; -} - - - - -bool Ebofs::exists(object_t oid) -{ - ebofs_lock.Lock(); - dout(8) << "exists " << oid << dendl; - bool e = (object_tab->lookup(oid) == 0); - ebofs_lock.Unlock(); - return e; -} - -int Ebofs::stat(object_t oid, struct stat *st) -{ - ebofs_lock.Lock(); - int r = _stat(oid,st); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_stat(object_t oid, struct stat *st) -{ - dout(7) << "_stat " << oid << dendl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - // ?? - st->st_size = on->object_size; - - put_onode(on); - return 0; -} - - -int Ebofs::_setattr(object_t oid, const char *name, const void *value, size_t size) -{ - dout(8) << "setattr " << oid << " '" << name << "' len " << size << dendl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - string n(name); - on->attr[n] = buffer::copy((char*)value, size); - dirty_onode(on); - put_onode(on); - - dout(8) << "setattr " << oid << " '" << name << "' len " << size << " success" << dendl; - - return 0; -} - -int Ebofs::setattr(object_t oid, const char *name, const void *value, size_t size, Context *onsafe) -{ - ebofs_lock.Lock(); - int r = _setattr(oid, name, value, size); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.setattr(oid, name, value, size); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_setattrs(object_t oid, map& attrset) -{ - dout(8) << "setattrs " << oid << dendl; - - Onode *on = get_onode(oid); - if (!on) return 
-ENOENT; - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - on->attr = attrset; - dirty_onode(on); - put_onode(on); - return 0; -} - -int Ebofs::setattrs(object_t oid, map& attrset, Context *onsafe) -{ - ebofs_lock.Lock(); - int r = _setattrs(oid, attrset); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.setattrs(oid, attrset); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - - -int Ebofs::get_object_collections(object_t oid, set& ls) -{ - ebofs_lock.Lock(); - int r = _get_object_collections(oid, ls); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_get_object_collections(object_t oid, set& ls) -{ - dout(8) << "_get_object_collections " << oid << dendl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - ls = on->collections; - put_onode(on); - return 0; -} - -int Ebofs::getattr(object_t oid, const char *name, void *value, size_t size) -{ - ebofs_lock.Lock(); - int r = _getattr(oid, name, value, size); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_getattr(object_t oid, const char *name, void *value, size_t size) -{ - dout(8) << "_getattr " << oid << " '" << name << "' maxlen " << size << dendl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - string n(name); - int r = 0; - if (on->attr.count(n) == 0) { - dout(10) << "_getattr " << oid << " '" << name << "' dne" << dendl; - r = -1; - } else { - r = MIN( on->attr[n].length(), size ); - dout(10) << "_getattr " << oid << " '" << name << "' got len " << r << dendl; - memcpy(value, on->attr[n].c_str(), r ); - } - put_onode(on); - return r; -} - -int Ebofs::getattrs(object_t oid, map &aset) -{ - ebofs_lock.Lock(); - int r = _getattrs(oid, aset); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_getattrs(object_t oid, map &aset) 
-{ - dout(8) << "_getattrs " << oid << dendl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - aset = on->attr; - put_onode(on); - return 0; -} - - - -int Ebofs::_rmattr(object_t oid, const char *name) -{ - dout(8) << "_rmattr " << oid << " '" << name << "'" << dendl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - string n(name); - on->attr.erase(n); - dirty_onode(on); - put_onode(on); - return 0; -} - -int Ebofs::rmattr(object_t oid, const char *name, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _rmattr(oid, name); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.rmattr(oid, name); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::listattr(object_t oid, vector& attrs) -{ - ebofs_lock.Lock(); - dout(8) << "listattr " << oid << dendl; - - Onode *on = get_onode(oid); - if (!on) { - ebofs_lock.Unlock(); - return -ENOENT; - } - - attrs.clear(); - for (map::iterator i = on->attr.begin(); - i != on->attr.end(); - i++) { - attrs.push_back(i->first); - } - - put_onode(on); - ebofs_lock.Unlock(); - return 0; -} - -int Ebofs::list_objects(list& ls) -{ - ebofs_lock.Lock(); - dout(9) << "list_objects " << dendl; - - Table::Cursor cursor(object_tab); - - int num = 0; - if (object_tab->find(object_t(), cursor) >= 0) { - while (1) { - ls.push_back(cursor.current().key); - num++; - if (cursor.move_right() <= 0) break; - } - } - - ebofs_lock.Unlock(); - return num; -} - - -/***************** collections ******************/ - -int Ebofs::list_collections(list& ls) -{ - ebofs_lock.Lock(); - dout(9) << "list_collections " << dendl; - - Table::Cursor cursor(collection_tab); - - int num = 0; - if (collection_tab->find(0, cursor) >= 0) { 
- while (1) { - ls.push_back(cursor.current().key); - num++; - if (cursor.move_right() <= 0) break; - } - } - - ebofs_lock.Unlock(); - return num; -} - -int Ebofs::_create_collection(coll_t cid) -{ - dout(9) << "_create_collection " << hex << cid << dec << dendl; - - if (_collection_exists(cid)) - return -EEXIST; - - Cnode *cn = new_cnode(cid); - put_cnode(cn); - - return 0; -} - -int Ebofs::create_collection(coll_t cid, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _create_collection(cid); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.create_collection(cid); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_destroy_collection(coll_t cid) -{ - dout(9) << "_destroy_collection " << hex << cid << dec << dendl; - - if (!_collection_exists(cid)) - return -ENOENT; - - Cnode *cn = get_cnode(cid); - assert(cn); - - // hose mappings - list objects; - collection_list(cid, objects); - for (list::iterator i = objects.begin(); - i != objects.end(); - i++) { - co_tab->remove(coll_object_t(cid,*i)); - - Onode *on = get_onode(*i); - if (on) { - on->collections.erase(cid); - dirty_onode(on); - put_onode(on); - } - } - - remove_cnode(cn); - return 0; -} - -int Ebofs::destroy_collection(coll_t cid, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _destroy_collection(cid); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.remove_collection(cid); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -bool Ebofs::collection_exists(coll_t cid) -{ - ebofs_lock.Lock(); - dout(10) << 
"collection_exists " << hex << cid << dec << dendl; - bool r = _collection_exists(cid); - ebofs_lock.Unlock(); - return r; -} -bool Ebofs::_collection_exists(coll_t cid) -{ - return (collection_tab->lookup(cid) == 0); -} - -int Ebofs::_collection_add(coll_t cid, object_t oid) -{ - dout(9) << "_collection_add " << hex << cid << " object " << oid << dec << dendl; - - if (!_collection_exists(cid)) - return -ENOENT; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - int r = 0; - - if (on->collections.count(cid) == 0) { - on->collections.insert(cid); - dirty_onode(on); - co_tab->insert(coll_object_t(cid,oid), true); - } else { - r = -ENOENT; // FIXME? already in collection. - } - - put_onode(on); - return r; -} - -int Ebofs::collection_add(coll_t cid, object_t oid, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _collection_add(cid, oid); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.collection_add(cid, oid); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return 0; -} - -int Ebofs::_collection_remove(coll_t cid, object_t oid) -{ - dout(9) << "_collection_remove " << hex << cid << " object " << oid << dec << dendl; - - if (!_collection_exists(cid)) - return -ENOENT; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - int r = 0; - - if (on->collections.count(cid)) { - on->collections.erase(cid); - dirty_onode(on); - co_tab->remove(coll_object_t(cid,oid)); - } else { - r = -ENOENT; // FIXME? 
- } - - put_onode(on); - return r; -} - -int Ebofs::collection_remove(coll_t cid, object_t oid, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _collection_remove(cid, oid); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.collection_remove(cid, oid); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return 0; -} - -int Ebofs::collection_list(coll_t cid, list& ls) -{ - ebofs_lock.Lock(); - dout(9) << "collection_list " << hex << cid << dec << dendl; - - if (!_collection_exists(cid)) { - ebofs_lock.Unlock(); - return -ENOENT; - } - - Table::Cursor cursor(co_tab); - - int num = 0; - if (co_tab->find(coll_object_t(cid,object_t()), cursor) >= 0) { - while (1) { - const coll_t c = cursor.current().key.first; - const object_t o = cursor.current().key.second; - if (c != cid) break; // end! 
- dout(10) << "collection_list " << hex << cid << " includes " << o << dec << dendl; - ls.push_back(o); - num++; - if (cursor.move_right() < 0) break; - } - } - - ebofs_lock.Unlock(); - return num; -} - - -int Ebofs::_collection_setattr(coll_t cid, const char *name, const void *value, size_t size) -{ - dout(10) << "_collection_setattr " << hex << cid << dec << " '" << name << "' len " << size << dendl; - - Cnode *cn = get_cnode(cid); - if (!cn) return -ENOENT; - - string n(name); - cn->attr[n] = buffer::copy((char*)value, size); - dirty_cnode(cn); - put_cnode(cn); - - return 0; -} - -int Ebofs::collection_setattr(coll_t cid, const char *name, const void *value, size_t size, Context *onsafe) -{ - ebofs_lock.Lock(); - dout(10) << "collection_setattr " << hex << cid << dec << " '" << name << "' len " << size << dendl; - - int r = _collection_setattr(cid, name, value, size); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.collection_setattr(cid, name, value, size); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return 0; -} - -int Ebofs::collection_getattr(coll_t cid, const char *name, void *value, size_t size) -{ - ebofs_lock.Lock(); - dout(10) << "collection_setattr " << hex << cid << dec << " '" << name << "' maxlen " << size << dendl; - - Cnode *cn = get_cnode(cid); - if (!cn) { - ebofs_lock.Unlock(); - return -ENOENT; - } - - string n(name); - int r; - if (cn->attr.count(n) == 0) { - r = -1; - } else { - r = MIN( cn->attr[n].length(), size ); - memcpy(value, cn->attr[n].c_str(), r); - } - - put_cnode(cn); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::collection_getattrs(coll_t cid, map &aset) -{ - ebofs_lock.Lock(); - int r = _collection_getattrs(cid, aset); - ebofs_lock.Unlock(); - return r; -} - -int 
Ebofs::_collection_getattrs(coll_t cid, map &aset) -{ - dout(8) << "_collection_getattrs " << cid << dendl; - - Cnode *cn = get_cnode(cid); - if (!cn) return -ENOENT; - aset = cn->attr; - put_cnode(cn); - return 0; -} - -int Ebofs::collection_setattrs(coll_t cid, map &aset) -{ - ebofs_lock.Lock(); - int r = _collection_setattrs(cid, aset); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_collection_setattrs(coll_t cid, map &aset) -{ - dout(8) << "_collection_setattrs " << cid << dendl; - - Cnode *cn = get_cnode(cid); - if (!cn) return -ENOENT; - cn->attr = aset; - dirty_cnode(cn); - put_cnode(cn); - return 0; -} - - -int Ebofs::_collection_rmattr(coll_t cid, const char *name) -{ - dout(10) << "_collection_rmattr " << hex << cid << dec << " '" << name << "'" << dendl; - - Cnode *cn = get_cnode(cid); - if (!cn) return -ENOENT; - - string n(name); - cn->attr.erase(n); - - dirty_cnode(cn); - put_cnode(cn); - - return 0; -} - -int Ebofs::collection_rmattr(coll_t cid, const char *name, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _collection_rmattr(cid, name); - - // journal, wait for commit - if (r >= 0) { - while (1) { - if (journal) { - Transaction t; - t.collection_rmattr(cid, name); - bufferlist bl; - t._encode(bl); - if (journal->submit_entry(bl, onsafe)) break; - } - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - break; - } - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return 0; -} - -int Ebofs::collection_listattr(coll_t cid, vector& attrs) -{ - ebofs_lock.Lock(); - dout(10) << "collection_listattr " << hex << cid << dec << dendl; - - Cnode *cn = get_cnode(cid); - if (!cn) { - ebofs_lock.Unlock(); - return -ENOENT; - } - - attrs.clear(); - for (map::iterator i = cn->attr.begin(); - i != cn->attr.end(); - i++) { - attrs.push_back(i->first); - } - - put_cnode(cn); - ebofs_lock.Unlock(); - return 0; -} - - - -void Ebofs::_export_freelist(bufferlist& bl) -{ - for (int b=0; b<=EBOFS_NUM_FREE_BUCKETS; b++) { - Table 
*tab; - if (b < EBOFS_NUM_FREE_BUCKETS) { - tab = free_tab[b]; - } else { - tab = limbo_tab; - } - - if (tab->get_num_keys() > 0) { - Table::Cursor cursor(tab); - assert(tab->find(0, cursor) >= 0); - while (1) { - assert(cursor.current().value > 0); - - Extent ex(cursor.current().key, cursor.current().value); - dout(10) << "_export_freelist " << ex << dendl; - bl.append((char*)&ex, sizeof(ex)); - if (cursor.move_right() <= 0) break; - } - } - } -} - -void Ebofs::_import_freelist(bufferlist& bl) -{ - // clear - for (int b=0; bclear(); - limbo_tab->clear(); - - // import! - int num = bl.length() / sizeof(Extent); - Extent *p = (Extent*)bl.c_str(); - for (int i=0; i *tab; - if (b < EBOFS_NUM_FREE_BUCKETS) { - tab = free_tab[b]; - dout(30) << "dump bucket " << b << " " << tab->get_num_keys() << dendl; - } else { - tab = limbo_tab; - dout(30) << "dump limbo " << tab->get_num_keys() << dendl;; - } - - if (tab->get_num_keys() > 0) { - Table::Cursor cursor(tab); - assert(tab->find(0, cursor) >= 0); - while (1) { - assert(cursor.current().value > 0); - - block_t l = cursor.current().value; - tfree += l; - int b = 0; - do { - l = l >> 1; - b++; - } while (l); - st.free_extent_dist[b]++; - st.free_extent_dist_sum[b] += cursor.current().value; - st.num_free_extent++; - - if (cursor.move_right() <= 0) break; - } - } - } - st.avg_free_extent = tfree / st.num_free_extent; -*/ - - // used extents is harder. 
:( - st.num_extent = 0; - st.avg_extent = 0; - st.extent_dist.clear(); - st.extent_dist_sum.clear(); - st.avg_extent_per_object = 0; - st.avg_extent_jump = 0; - - Table::Cursor cursor(object_tab); - object_tab->find(object_t(), cursor); - int nobj = 0; - int njump = 0; - while (object_tab->get_num_keys() > 0) { - Onode *on = get_onode(cursor.current().key); - assert(on); - - nobj++; - st.avg_extent_per_object += on->extent_map.size(); - - for (map::iterator p = on->extent_map.begin(); - p != on->extent_map.end(); - p++) { - block_t l = p->second.length; - - st.num_extent++; - st.avg_extent += l; - if (p->first > 0) { - njump++; - st.avg_extent_jump += l; - } - - int b = 0; - do { - l = l >> 1; - b++; - } while (l); - st.extent_dist[b]++; - st.extent_dist_sum[b] += p->second.length; - } - put_onode(on); - if (cursor.move_right() <= 0) break; - } - if (njump) st.avg_extent_jump /= njump; - if (nobj) st.avg_extent_per_object /= (float)nobj; - if (st.num_extent) st.avg_extent /= st.num_extent; - - ebofs_lock.Unlock(); -} diff --git a/branches/sage/mds/ebofs/Ebofs.h b/branches/sage/mds/ebofs/Ebofs.h deleted file mode 100644 index 13eebabd93aad..0000000000000 --- a/branches/sage/mds/ebofs/Ebofs.h +++ /dev/null @@ -1,370 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "include/Context.h" -#include "include/buffer.h" -#include "include/hash.h" - -#include "types.h" -#include "Onode.h" -#include "Cnode.h" -#include "BlockDevice.h" -#include "nodes.h" -#include "Allocator.h" -#include "Table.h" -#include "Journal.h" - -#include "common/Mutex.h" -#include "common/Cond.h" - -#include "osd/ObjectStore.h" - -//typedef pair object_coll_t; -typedef pair coll_object_t; - - -class Ebofs : public ObjectStore { -protected: - Mutex ebofs_lock; // a beautiful global lock - - // ** debuggy ** - bool fake_writes; - - // ** super ** -public: - BlockDevice dev; -protected: - bool mounted, unmounting, dirty; - bool readonly; - version_t super_epoch; - bool commit_thread_started, mid_commit; - Cond commit_cond; // to wake up the commit thread - Cond sync_cond; - uint64_t super_fsid; - - map > commit_waiters; - - void prepare_super(version_t epoch, bufferptr& bp); - void write_super(version_t epoch, bufferptr& bp); - int commit_thread_entry(); - - class CommitThread : public Thread { - Ebofs *ebofs; - public: - CommitThread(Ebofs *e) : ebofs(e) {} - void *entry() { - ebofs->commit_thread_entry(); - return 0; - } - } commit_thread; - -public: - uint64_t get_fsid() { return super_fsid; } - epoch_t get_super_epoch() { return super_epoch; } -protected: - - - // ** journal ** - char *journalfn; - Journal *journal; - - // ** allocator ** - block_t free_blocks, limbo_blocks; - Allocator allocator; - friend class Allocator; - - block_t get_free_blocks() { return free_blocks; } - block_t get_limbo_blocks() { return limbo_blocks; } - block_t get_free_extents() { - int n = 0; - for (int i=0; iget_num_keys(); - return n; - } - block_t get_limbo_extents() { return limbo_tab->get_num_keys(); } - - - // ** tables and sets ** - // nodes - NodePool nodepool; // for all tables... 
- - // tables - Table *object_tab; - Table *free_tab[EBOFS_NUM_FREE_BUCKETS]; - Table *limbo_tab; - Table > *alloc_tab; - - // collections - Table *collection_tab; - Table *co_tab; - - void close_tables(); - void verify_tables(); - - - // ** onodes ** - hash_map onode_map; // onode cache - LRU onode_lru; - set dirty_onodes; - map > waitfor_onode; - - Onode* new_onode(object_t oid); // make new onode. ref++. - bool have_onode(object_t oid) { - return onode_map.count(oid); - } - Onode* get_onode(object_t oid); // get cached onode, or read from disk. ref++. - void remove_onode(Onode *on); - void put_onode(Onode* o); // put it back down. ref--. - void dirty_onode(Onode* o); - void encode_onode(Onode *on, bufferlist& bl, unsigned& off); - void write_onode(Onode *on); - - // ** cnodes ** - hash_map > cnode_map; - LRU cnode_lru; - set dirty_cnodes; - map > waitfor_cnode; - - Cnode* new_cnode(coll_t cid); - Cnode* get_cnode(coll_t cid); - void remove_cnode(Cnode *cn); - void put_cnode(Cnode *cn); - void dirty_cnode(Cnode *cn); - void encode_cnode(Cnode *cn, bufferlist& bl, unsigned& off); - void write_cnode(Cnode *cn); - - // ** onodes+cnodes = inodes ** - int inodes_flushing; - Cond inode_commit_cond; - - void flush_inode_finish(); - void commit_inodes_start(); - void commit_inodes_wait(); - friend class C_E_InodeFlush; - - void trim_inodes(int max = -1); - - // ** buffer cache ** - BufferCache bc; - pthread_t flushd_thread_id; - - version_t trigger_commit(); - void commit_bc_wait(version_t epoch); - void trim_bc(off_t max = -1); - - public: - void kick_idle(); - void sync(); - void sync(Context *onsafe); - void trim_buffer_cache(); - - class IdleKicker : public BlockDevice::kicker { - Ebofs *ebo; - public: - IdleKicker(Ebofs *t) : ebo(t) {} - void kick() { ebo->kick_idle(); } - } idle_kicker; - - - protected: - //void zero(Onode *on, size_t len, off_t off, off_t write_thru); - void alloc_write(Onode *on, - block_t start, block_t len, - interval_set& alloc, - block_t& 
old_bfirst, block_t& old_blast); - void apply_write(Onode *on, off_t off, size_t len, const bufferlist& bl); - bool attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, - Cond *will_wait_on, bool *will_wait_on_bool); - - // ** finisher ** - // async write notification to users - Mutex finisher_lock; - Cond finisher_cond; - bool finisher_stop; - list finisher_queue; - -public: - void queue_finisher(Context *c) { - finisher_lock.Lock(); - finisher_queue.push_back(c); - finisher_cond.Signal(); - finisher_lock.Unlock(); - } - void queue_finishers(list& ls) { - finisher_lock.Lock(); - finisher_queue.splice(finisher_queue.end(), ls); - finisher_cond.Signal(); - finisher_lock.Unlock(); - } -protected: - - void *finisher_thread_entry(); - class FinisherThread : public Thread { - Ebofs *ebofs; - public: - FinisherThread(Ebofs *e) : ebofs(e) {} - void* entry() { return (void*)ebofs->finisher_thread_entry(); } - } finisher_thread; - - - void alloc_more_node_space(); - - void do_csetattrs(map > > &cmods); - void do_setattrs(Onode *on, map > &setattrs); - - - public: - Ebofs(char *devfn, char *jfn=0) : - fake_writes(false), - dev(devfn), - mounted(false), unmounting(false), dirty(false), readonly(false), - super_epoch(0), commit_thread_started(false), mid_commit(false), - commit_thread(this), - journalfn(jfn), journal(0), - free_blocks(0), limbo_blocks(0), - allocator(this), - nodepool(ebofs_lock), - object_tab(0), limbo_tab(0), collection_tab(0), co_tab(0), - onode_lru(g_conf.ebofs_oc_size), - cnode_lru(g_conf.ebofs_cc_size), - inodes_flushing(0), - bc(dev, ebofs_lock), - idle_kicker(this), - finisher_stop(false), finisher_thread(this) { - for (int i=0; i& ls); - - // object attr - int setattr(object_t oid, const char *name, const void *value, size_t size, Context *onsafe=0); - int setattrs(object_t oid, map& attrset, Context *onsafe=0); - int getattr(object_t oid, const char *name, void *value, size_t size); - int getattrs(object_t oid, map &aset); - int 
rmattr(object_t oid, const char *name, Context *onsafe=0); - int listattr(object_t oid, vector& attrs); - - int get_object_collections(object_t oid, set& ls); - - // collections - int list_collections(list& ls); - bool collection_exists(coll_t c); - - int create_collection(coll_t c, Context *onsafe); - int destroy_collection(coll_t c, Context *onsafe); - int collection_add(coll_t c, object_t o, Context *onsafe); - int collection_remove(coll_t c, object_t o, Context *onsafe); - - int collection_list(coll_t c, list& o); - - int collection_setattr(coll_t cid, const char *name, const void *value, size_t size, Context *onsafe); - int collection_setattrs(coll_t cid, map &aset); - int collection_getattr(coll_t cid, const char *name, void *value, size_t size); - int collection_getattrs(coll_t cid, map &aset); - int collection_rmattr(coll_t cid, const char *name, Context *onsafe); - int collection_listattr(coll_t oid, vector& attrs); - - // maps - int map_lookup(object_t o, bufferlist& key, bufferlist& val); - int map_insert(object_t o, bufferlist& key, bufferlist& val); - int map_remove(object_t o, bufferlist& key); - int map_list(object_t o, list& keys); - int map_list(object_t o, map& vals); - int map_list(object_t o, - bufferlist& start, bufferlist& end, - map& vals); - - // crap - void _fake_writes(bool b) { fake_writes = b; } - void _get_frag_stat(FragmentationStat& st); - - void _import_freelist(bufferlist& bl); - void _export_freelist(bufferlist& bl); - - -private: - // private interface -- use if caller already holds lock - unsigned _apply_transaction(Transaction& t); - - int _read(object_t oid, off_t off, size_t len, bufferlist& bl); - int _is_cached(object_t oid, off_t off, size_t len); - int _stat(object_t oid, struct stat *st); - int _getattr(object_t oid, const char *name, void *value, size_t size); - int _getattrs(object_t oid, map &aset); - int _get_object_collections(object_t oid, set& ls); - - bool _write_will_block(); - int _write(object_t oid, off_t off, 
size_t len, const bufferlist& bl); - void _trim_from_cache(object_t oid, off_t off, size_t len); - int _truncate(object_t oid, off_t size); - int _truncate_front(object_t oid, off_t size); - int _remove(object_t oid); - int _clone(object_t from, object_t to); - int _setattr(object_t oid, const char *name, const void *value, size_t size); - int _setattrs(object_t oid, map& attrset); - int _rmattr(object_t oid, const char *name); - bool _collection_exists(coll_t c); - int _create_collection(coll_t c); - int _destroy_collection(coll_t c); - int _collection_add(coll_t c, object_t o); - int _collection_remove(coll_t c, object_t o); - int _collection_getattrs(coll_t oid, map &aset); - int _collection_setattr(coll_t oid, const char *name, const void *value, size_t size); - int _collection_setattrs(coll_t oid, map &aset); - int _collection_rmattr(coll_t cid, const char *name); - - -}; diff --git a/branches/sage/mds/ebofs/FileJournal.cc b/branches/sage/mds/ebofs/FileJournal.cc deleted file mode 100644 index 35a1e6f4127b6..0000000000000 --- a/branches/sage/mds/ebofs/FileJournal.cc +++ /dev/null @@ -1,456 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "FileJournal.h" -#include "Ebofs.h" - -#include -#include -#include -#include - - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug_ebofs) *_dout << dbeginl << g_clock.now() << " ebofs(" << ebofs->dev.get_device_name() << ").journal " -#define derr(x) if (x <= g_conf.debug_ebofs) *_derr << dbeginl << g_clock.now() << " ebofs(" << ebofs->dev.get_device_name() << ").journal " - - -int FileJournal::create() -{ - dout(2) << "create " << fn << dendl; - - // open/create - fd = ::open(fn.c_str(), O_RDWR|O_SYNC); - if (fd < 0) { - dout(2) << "create failed " << errno << " " << strerror(errno) << dendl; - return -errno; - } - assert(fd > 0); - - //::ftruncate(fd, 0); - //::fchmod(fd, 0644); - - // get size - struct stat st; - ::fstat(fd, &st); - dout(2) << "create " << fn << " " << st.st_size << " bytes" << dendl; - - // write empty header - memset(&header, 0, sizeof(header)); - header.clear(); - header.fsid = ebofs->get_fsid(); - header.max_size = st.st_size; - write_header(); - - // writeable. - read_pos = 0; - write_pos = queue_pos = sizeof(header); - - ::close(fd); - - return 0; -} - -int FileJournal::open() -{ - //dout(1) << "open " << fn << dendl; - - // open and file - assert(fd == 0); - fd = ::open(fn.c_str(), O_RDWR|O_SYNC); - if (fd < 0) { - dout(2) << "open failed " << errno << " " << strerror(errno) << dendl; - return -errno; - } - assert(fd > 0); - - // assume writeable, unless... - read_pos = 0; - write_pos = queue_pos = sizeof(header); - - // read header? - read_header(); - if (header.fsid != ebofs->get_fsid()) { - dout(2) << "open journal fsid doesn't match, invalid (someone else's?) 
journal" << dendl; - } - else if (header.num > 0) { - // valid header, pick an offset - for (int i=0; iget_super_epoch()) { - dout(2) << "using read_pos header pointer " - << header.epoch[i] << " at " << header.offset[i] - << dendl; - read_pos = header.offset[i]; - write_pos = queue_pos = 0; - break; - } - else if (header.epoch[i] < ebofs->get_super_epoch()) { - dout(2) << "super_epoch is " << ebofs->get_super_epoch() - << ", skipping old " << header.epoch[i] << " at " << header.offset[i] - << dendl; - } - else if (header.epoch[i] > ebofs->get_super_epoch()) { - dout(2) << "super_epoch is " << ebofs->get_super_epoch() - << ", but wtf, journal is later " << header.epoch[i] << " at " << header.offset[i] - << dendl; - break; - } - } - } - - start_writer(); - - return 0; -} - -void FileJournal::close() -{ - dout(1) << "close " << fn << dendl; - - // stop writer thread - stop_writer(); - - // close - assert(writeq.empty()); - assert(commitq.empty()); - assert(fd > 0); - ::close(fd); - fd = 0; -} - -void FileJournal::start_writer() -{ - write_stop = false; - write_thread.create(); -} - -void FileJournal::stop_writer() -{ - write_lock.Lock(); - { - write_stop = true; - write_cond.Signal(); - } - write_lock.Unlock(); - write_thread.join(); -} - - -void FileJournal::print_header() -{ - for (int i=0; i::const_iterator it = bl.buffers().begin(); - it != bl.buffers().end(); - it++) { - if ((*it).length() == 0) continue; // blank buffer. 
- ::write(fd, (char*)(*it).c_str(), (*it).length() ); - } - - ::write(fd, &h, sizeof(h)); - - // move position pointer - write_pos += 2*sizeof(entry_header_t) + bl.length(); - - if (oncommit) { - if (1) { - // queue callback - ebofs->queue_finisher(oncommit); - } else { - // callback now - oncommit->finish(0); - delete oncommit; - } - } - } - } - - write_lock.Unlock(); - dout(10) << "write_thread_entry finish" << dendl; -} - -bool FileJournal::submit_entry(bufferlist& e, Context *oncommit) -{ - assert(queue_pos != 0); // bad create(), or journal didn't replay to completion. - - // ** lock ** - Mutex::Locker locker(write_lock); - - // wrap? full? - off_t size = 2*sizeof(entry_header_t) + e.length(); - - if (full) return false; // already marked full. - - if (header.wrap) { - // we're wrapped. don't overwrite ourselves. - if (queue_pos + size >= header.offset[0]) { - dout(10) << "submit_entry JOURNAL FULL (and wrapped), " << queue_pos << "+" << size - << " >= " << header.offset[0] - << dendl; - full = true; - print_header(); - return false; - } - } else { - // we haven't wrapped. - if (queue_pos + size >= header.max_size) { - // is there room if we wrap? - if ((off_t)sizeof(header_t) + size < header.offset[0]) { - // yes! - dout(10) << "submit_entry wrapped from " << queue_pos << " to " << sizeof(header_t) << dendl; - header.wrap = queue_pos; - queue_pos = sizeof(header_t); - header.push(ebofs->get_super_epoch(), queue_pos); - } else { - // no room. 
- dout(10) << "submit_entry JOURNAL FULL (and can't wrap), " << queue_pos << "+" << size - << " >= " << header.max_size - << dendl; - full = true; - return false; - } - } - } - - dout(10) << "submit_entry " << queue_pos << " : " << e.length() - << " epoch " << ebofs->get_super_epoch() - << " " << oncommit << dendl; - - // dump on queue - writeq.push_back(pair(ebofs->get_super_epoch(), e)); - commitq.push_back(oncommit); - - queue_pos += size; - - // kick writer thread - write_cond.Signal(); - - return true; -} - - -void FileJournal::commit_epoch_start() -{ - dout(10) << "commit_epoch_start on " << ebofs->get_super_epoch()-1 - << " -- new epoch " << ebofs->get_super_epoch() - << dendl; - - Mutex::Locker locker(write_lock); - - // was full -> empty -> now usable? - if (full) { - if (header.num != 0) { - dout(1) << " journal FULL, ignoring this epoch" << dendl; - return; - } - - dout(1) << " clearing FULL flag, journal now usable" << dendl; - full = false; - } - - // note epoch boundary - header.push(ebofs->get_super_epoch(), queue_pos); // note: these entries may not yet be written. - //write_header(); // no need to write it now, though... -} - -void FileJournal::commit_epoch_finish() -{ - dout(10) << "commit_epoch_finish committed " << ebofs->get_super_epoch()-1 << dendl; - - write_lock.Lock(); - { - if (full) { - // full journal damage control. - dout(15) << " journal was FULL, contents now committed, clearing header. journal still not usable until next epoch." 
<< dendl; - header.clear(); - write_pos = queue_pos = sizeof(header_t); - } else { - // update header -- trim/discard old (committed) epochs - while (header.epoch[0] < ebofs->get_super_epoch()) - header.pop(); - } - write_header(); - - // discard any unwritten items in previous epoch, and do callbacks - epoch_t epoch = ebofs->get_super_epoch(); - list callbacks; - while (!writeq.empty() && writeq.front().first < epoch) { - dout(15) << " dropping unwritten and committed " - << write_pos << " : " << writeq.front().second.length() - << " epoch " << writeq.front().first - << dendl; - // finisher? - Context *oncommit = commitq.front(); - if (oncommit) callbacks.push_back(oncommit); - - write_pos += 2*sizeof(entry_header_t) + writeq.front().second.length(); - - // discard. - writeq.pop_front(); - commitq.pop_front(); - } - - // queue the finishers - ebofs->queue_finishers(callbacks); - } - write_lock.Unlock(); - -} - - -void FileJournal::make_writeable() -{ - if (read_pos) - write_pos = queue_pos = read_pos; - else - write_pos = queue_pos = sizeof(header_t); - read_pos = 0; -} - - -bool FileJournal::read_entry(bufferlist& bl, epoch_t& epoch) -{ - if (!read_pos) { - dout(2) << "read_entry -- not readable" << dendl; - return false; - } - - if (read_pos == header.wrap) { - // find wrap point - for (int i=1; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_FILEJOURNAL_H -#define __EBOFS_FILEJOURNAL_H - - -#include "Journal.h" -#include "common/Cond.h" -#include "common/Mutex.h" -#include "common/Thread.h" - -class FileJournal : public Journal { -public: - /** log header - * we allow 3 pointers: - * top/initial, - * one for an epoch boundary, - * and one for a wrap in the ring buffer/journal file. 
- * the epoch boundary one is useful only for speedier recovery in certain cases - * (i.e. when ebofs committed, but the journal didn't rollover ... very small window!) - */ - struct header_t { - uint64_t fsid; - int num; - off_t wrap; - off_t max_size; - epoch_t epoch[3]; - off_t offset[3]; - - header_t() : fsid(0), num(0), wrap(0), max_size(0) {} - - void clear() { - num = 0; - wrap = 0; - } - void pop() { - if (num >= 2 && offset[0] > offset[1]) - wrap = 0; // we're eliminating a wrap - num--; - for (int i=0; i > writeq; // currently journaling - list commitq; // currently journaling - - // write thread - Mutex write_lock; - Cond write_cond; - bool write_stop; - - void print_header(); - void read_header(); - void write_header(); - void start_writer(); - void stop_writer(); - void write_thread_entry(); - - class Writer : public Thread { - FileJournal *journal; - public: - Writer(FileJournal *fj) : journal(fj) {} - void *entry() { - journal->write_thread_entry(); - return 0; - } - } write_thread; - - public: - FileJournal(Ebofs *e, char *f) : - Journal(e), fn(f), - full(false), - write_pos(0), queue_pos(0), read_pos(0), - fd(0), - write_stop(false), write_thread(this) { } - ~FileJournal() {} - - int create(); - int open(); - void close(); - - void make_writeable(); - - // writes - bool submit_entry(bufferlist& e, Context *oncommit); // submit an item - void commit_epoch_start(); // mark epoch boundary - void commit_epoch_finish(); // mark prior epoch as committed (we can expire) - - bool read_entry(bufferlist& bl, epoch_t& e); - - // reads -}; - -#endif diff --git a/branches/sage/mds/ebofs/Journal.h b/branches/sage/mds/ebofs/Journal.h deleted file mode 100644 index 9bab0b7f3c109..0000000000000 --- a/branches/sage/mds/ebofs/Journal.h +++ /dev/null @@ -1,47 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This 
is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_JOURNAL_H -#define __EBOFS_JOURNAL_H - -class Ebofs; - -#include "include/buffer.h" -#include "include/Context.h" - -class Journal { -protected: - Ebofs *ebofs; - -public: - Journal(Ebofs *e) : ebofs(e) { } - virtual ~Journal() { } - - virtual int create() = 0; - virtual int open() = 0; - virtual void close() = 0; - - // writes - virtual void make_writeable() = 0; - virtual bool submit_entry(bufferlist& e, Context *oncommit) = 0;// submit an item - virtual void commit_epoch_start() = 0; // mark epoch boundary - virtual void commit_epoch_finish() = 0; // mark prior epoch as committed (we can expire) - virtual bool read_entry(bufferlist& bl, epoch_t &e) = 0; - - // reads/recovery - -}; - -#endif diff --git a/branches/sage/mds/ebofs/Onode.h b/branches/sage/mds/ebofs/Onode.h deleted file mode 100644 index 1d79d317dd96a..0000000000000 --- a/branches/sage/mds/ebofs/Onode.h +++ /dev/null @@ -1,408 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __EBOFS_ONODE_H -#define __EBOFS_ONODE_H - -#include "include/lru.h" - -#include "types.h" -#include "BufferCache.h" - -#include "include/interval_set.h" - - -/* - * object node (like an inode) - * - * holds object metadata, including - * size - * allocation (extent list) - * attributes - * - */ - -class Onode : public LRUObject { -private: - int ref; - -public: - object_t object_id; - version_t version; // incremented on each modify. - - // data - bool readonly; - Extent onode_loc; - off_t object_size; - unsigned object_blocks; - - // onode - set collections; - map attr; - //vector extents; - map extent_map; - - interval_set uncommitted; - - ObjectCache *oc; - - bool dirty; - bool dangling; // not in onode_map - bool deleted; // deleted - - list commit_waiters; - - public: - Onode(object_t oid) : ref(0), object_id(oid), version(0), - readonly(false), - object_size(0), object_blocks(0), - oc(0), - dirty(false), dangling(false), deleted(false) { - onode_loc.length = 0; - } - ~Onode() { - if (oc) delete oc; - } - - block_t get_onode_id() { return onode_loc.start; } - int get_onode_len() { return onode_loc.length; } - - int get_ref_count() { return ref; } - void get() { - if (ref == 0) lru_pin(); - ref++; - //cout << "ebofs.onode.get " << hex << object_id << dec << " " << ref << std::endl; - } - void put() { - ref--; - if (ref == 0) lru_unpin(); - //cout << "ebofs.onode.put " << hex << object_id << dec << " " << ref << std::endl; - } - - void mark_dirty() { - if (!dirty) { - dirty = true; - get(); - } - } - void mark_clean() { - if (dirty) { - dirty = false; - put(); - } - } - bool is_dirty() { return dirty; } - bool is_deleted() { return deleted; } - bool is_dangling() { return dangling; } - - - bool have_oc() { - return oc != 0; - } - ObjectCache *get_oc(BufferCache *bc) { - if (!oc) { - oc = new ObjectCache(object_id, this, bc); - oc->get(); - get(); - } - return oc; - } - void close_oc() { - if (oc) { - //cout << "close_oc on " << object_id 
<< std::endl; - assert(oc->is_empty()); - if (oc->put() == 0){ - //cout << "************************* hosing oc" << std::endl; - delete oc; - } - oc = 0; - put(); - } - } - - - // allocation - void verify_extents() { - if (0) { // do crazy stupid sanity checking - block_t count = 0; - interval_set is; - - set s; - cout << "verifying" << std::endl; - - for (map::iterator p = extent_map.begin(); - p != extent_map.end(); - p++) { - cout << " " << p->first << ": " << p->second << std::endl; - assert(count == p->first); - count += p->second.length; - for (unsigned j=0;jsecond.length;j++) { - assert(s.count(p->second.start+j) == 0); - s.insert(p->second.start+j); - } - } - - assert(s.size() == count); - assert(count == object_blocks); - } - } - void set_extent(block_t offset, Extent ex) { - //cout << "set_extent " << offset << " -> " << ex << " ... " << object_blocks << std::endl; - assert(offset <= object_blocks); - verify_extents(); - - // at the end? - if (offset == object_blocks) { - //cout << " appending " << ex << std::endl; - if (!extent_map.empty() && extent_map.rbegin()->second.end() == ex.start) { - //cout << "appending " << ex << " to " << extent_map.rbegin()->second << std::endl; - extent_map.rbegin()->second.length += ex.length; - } else - extent_map[object_blocks] = ex; - object_blocks += ex.length; - return; - } - - // removing any extent bits we overwrite - if (!extent_map.empty()) { - // preceeding extent? 
- map::iterator p = extent_map.lower_bound(offset); - if (p != extent_map.begin()) { - p--; - if (p->first + p->second.length > offset) { - //cout << " preceeding was " << p->second << std::endl; - if (p->first + p->second.length > offset+ex.length) { - // cutting chunk out of middle, add last bit - Extent &n = extent_map[offset+ex.length] = p->second; - n.start += offset+ex.length - p->first; - n.length -= offset+ex.length - p->first; - //cout << " tail frag is " << n << std::endl; - } - p->second.length = offset - p->first; // cut tail off preceeding extent - //cout << " preceeding now " << p->second << std::endl; - } - p++; - } - - // overlapping extents - while (p != extent_map.end() && - p->first < offset + ex.length) { - map::iterator next = p; - next++; - - // completely subsumed? - if (p->first + p->second.length <= offset+ex.length) { - //cout << " erasing " << p->second << std::endl; - extent_map.erase(p); - p = next; - continue; - } - - // spans new extent, cut off head - Extent &n = extent_map[ offset+ex.length ] = p->second; - //cout << " cut head off " << p->second; - n.start += offset+ex.length - p->first; - n.length -= offset+ex.length - p->first; - extent_map.erase(p); - //cout << ", now " << n << std::endl; - break; - } - } - - extent_map[ offset ] = ex; - - // extend object? - if (offset + ex.length > object_blocks) - object_blocks = offset+ex.length; - - verify_extents(); - } - - - /* map_extents(start, len, ls) - * map teh given page range into extents on disk. - */ - int map_extents(block_t start, block_t len, vector& ls) { - //cout << "map_extents " << start << " " << len << std::endl; - verify_extents(); - - //assert(start+len <= object_blocks); - - map::iterator p; - - // hack hack speed up common cases! - if (start == 0) { - p = extent_map.begin(); - } else if (start+len == object_blocks && len == 1 && !extent_map.empty()) { - // append hack. 
- p = extent_map.end(); - p--; - if (p->first < start) p++; - //while (p->first >= start) p--; - //p++; - } else { - // normal - p = extent_map.lower_bound(start); - } - - if (p != extent_map.begin() && - (p == extent_map.end() || p->first > start && p->first)) { - p--; - if (p->second.length > start - p->first) { - Extent ex; - ex.start = p->second.start + (start - p->first); - ex.length = MIN(len, p->second.length - (start - p->first)); - ls.push_back(ex); - - //cout << " got (tail of?) " << p->second << " : " << ex << std::endl; - - start += ex.length; - len -= ex.length; - } - p++; - } - - while (len > 0 && - p != extent_map.end()) { - assert(p->first == start); - Extent ex = p->second; - ex.length = MIN(len, ex.length); - ls.push_back(ex); - //cout << " got (head of?) " << p->second << " : " << ex << std::endl; - start += ex.length; - len -= ex.length; - p++; - } - - return 0; - } - - int truncate_extents(block_t len, vector& extra) { - verify_extents(); - - map::iterator p = extent_map.lower_bound(len); - if (p != extent_map.begin() && - (p == extent_map.end() || p->first > len && p->first)) { - p--; - if (p->second.length > len - p->first) { - Extent ex; - ex.start = p->second.start + (len - p->first); - ex.length = p->second.length - (len - p->first); - extra.push_back(ex); - - p->second.length = len - p->first; - assert(p->second.length > 0); - - //cout << " got (tail of?) " << p->second << " : " << ex << std::endl; - } - p++; - } - - while (p != extent_map.end()) { - assert(p->first >= len); - extra.push_back(p->second); - map::iterator n = p; - n++; - extent_map.erase(p); - p = n; - } - - object_blocks = len; - verify_extents(); - return 0; - } - - int truncate_front_extents(block_t len, vector& extra) { - verify_extents(); - - while (len > 0) { - Extent& ex = extent_map.begin()->second; // look, this is a reference! 
- if (ex.length > len) { - // partial first extent - Extent frontbit( ex.start, len ); - extra.push_back(frontbit); - ex.length -= len; - ex.start += len; - break; - } - - // pull off entire first extent. - assert(ex.length <= len); - len -= ex.length; - extra.push_back(ex); - extent_map.erase(extent_map.begin()); - } - - object_blocks -= len; - verify_extents(); - return 0; - } - - - - /* map_alloc_regions(start, len, map) - * map range into regions that need to be (re)allocated on disk - * because they overlap "safe" (or unallocated) parts of the object - */ - /* - void map_alloc_regions(block_t start, block_t len, - interval_set& alloc) { - interval_set already_uncom; - - alloc.insert(start, len); // start with whole range - already_uncom.intersection_of(alloc, uncommitted); - alloc.subtract(already_uncom); // take out the bits that aren't yet committed - } - */ - - - - // pack/unpack - int get_collection_bytes() { - return sizeof(coll_t) * collections.size(); - } - int get_attr_bytes() { - int s = 0; - for (map::iterator i = attr.begin(); - i != attr.end(); - i++) { - s += i->first.length() + 1; - s += i->second.length() + sizeof(int); - } - return s; - } - int get_extent_bytes() { - return sizeof(Extent) * extent_map.size(); - } - -}; - - -inline ostream& operator<<(ostream& out, Onode& on) -{ - out << "onode(" << hex << on.object_id << dec << " len=" << on.object_size; - out << " ref=" << on.get_ref_count(); - if (on.is_dirty()) out << " dirty"; - if (on.is_dangling()) out << " dangling"; - if (on.is_deleted()) out << " deleted"; - out << " uncom=" << on.uncommitted; - // out << " " << &on; - out << ")"; - return out; -} - - - -#endif diff --git a/branches/sage/mds/ebofs/Table.h b/branches/sage/mds/ebofs/Table.h deleted file mode 100644 index 041a55afa0c68..0000000000000 --- a/branches/sage/mds/ebofs/Table.h +++ /dev/null @@ -1,928 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - 
scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_TABLE_H -#define __EBOFS_TABLE_H - -#include "types.h" -#include "nodes.h" - -/** table **/ - -#define dbtout if (25 <= g_conf.debug_ebofs) cout << "ebofs.table(" << this << ")." - - -template -class Table { - private: - NodePool &pool; - - nodeid_t root; - int nkeys; - int depth; - - public: - Table(NodePool &p, - struct ebofs_table& bts) : - pool(p), - root(bts.root), nkeys(bts.num_keys), depth(bts.depth) { - dbtout << "cons" << std::endl; - } - - nodeid_t get_root() { return root; } - int get_num_keys() { return nkeys; } - int get_depth() { return depth; } - - - /* - */ - class _IndexItem { // i just need a struct size for below - K k; - nodeid_t n; - }; - class IndexItem { - public: - K key; - nodeid_t node; - static const int MAX = Node::ITEM_LEN / (sizeof(_IndexItem)); - static const int MIN = MAX/2; - }; - class _LeafItem { // i just need a struct size for below - K k; - V v; - }; - class LeafItem { - public: - K key; - V value; - static const int MAX = Node::ITEM_LEN / (sizeof(_LeafItem)); - static const int MIN = MAX/2; - }; - - class Nodeptr { - public: - Node *node; - - Nodeptr() : node(0) {} - Nodeptr(Node *n) : node(n) {} - Nodeptr(NodePool& pool, nodeid_t nid) { - open(pool, nid); - } - Nodeptr& operator=(Node *n) { - node = n; - return *this; - } - - void open(NodePool& pool, nodeid_t nid) { - node = pool.get_node(nid); - if (is_index() && node->children.empty()) init_index(pool); - } - - LeafItem& leaf_item(int i) { return (( LeafItem*)(node->item_ptr()))[i]; } - IndexItem& index_item(int i) { return ((IndexItem*)(node->item_ptr()))[i]; } - K key(int i) { - if (node->is_index()) - return index_item(i).key; - else - return 
leaf_item(i).key; - } - - bool is_leaf() { return node->is_leaf(); } - bool is_index() { return node->is_index(); } - void set_type(int t) { node->set_type(t); } - - int max_items() const { - if (node->is_leaf()) - return LeafItem::MAX; - else - return IndexItem::MAX; - } - int min_items() const { return max_items() / 2; } - - nodeid_t get_id() { return node->get_id(); } - - int size() { return node->size(); } - void set_size(int s) { node->set_size(s); } - - void init_index(NodePool& nodepool) { - /* - node->children = vector(max_items()); - for (int i=0; ichildren[i] = nodepool.get_node(index_item(i).node); - else - node->children[i] = 0; - */ - } - - - void remove_at_pos(int p) { - if (node->is_index()) { - for (int i=p; ichildren[i] = node->children[i+1]; - } - } else { - for (int i=p; iis_index() ? "index":"leaf") << std::endl; - } - void insert_at_leaf_pos(int p, K key, V value) { - assert(is_leaf()); - for (int i=size(); i>p; i--) - leaf_item(i) = leaf_item(i-1); - leaf_item(p).key = key; - leaf_item(p).value = value; - set_size(size() + 1); - } - void insert_at_index_pos(int p, K key, nodeid_t nid) { - assert(is_index()); - for (int i=size(); i>p; i--) { - index_item(i) = index_item(i-1); - //node->children[i] = node->children[i-1]; - } - index_item(p).key = key; - index_item(p).node = nid; - set_size(size() + 1); - } - - void append_item(LeafItem& i) { - leaf_item(size()) = i; - set_size(size() + 1); - } - void append_item(IndexItem& i) { - index_item(size()) = i; - set_size(size() + 1); - } - - void split(Nodeptr& right) { - if (node->is_index()) { - for (int i=min_items(); iis_index()) - for (int i=0; i open; // open nodes - vector pos; // position within the node - //Nodeptr open[20]; - //int pos[20]; - int level; - - Cursor(Table *t) : table(t), open(t->depth), pos(t->depth), level(0) {} - - public: - - const LeafItem& current() { - assert(open[level].is_leaf()); - return open[level].leaf_item(pos[level]); - } - V& dirty_current_value() { - 
assert(open[level].is_leaf()); - dirty(); - return open[level].leaf_item(pos[level]).value; - } - - // ** read-only bits ** - int move_left() { - if (table->depth == 0) return OOB; - - // work up around branch - int l; - for (l = level; l >= 0; l--) - if (pos[l] > 0) break; - if (l < 0) - return OOB; // we are the first item in the btree - - // move left one - pos[l]--; - - // work back down right side - for (; lpool, open[l].index_item(pos[l]).node); - pos[l+1] = open[l+1].size() - 1; - } - return 1; - } - int move_right() { - if (table->depth == 0) return OOB; - - // work up branch - int l; - for (l=level; l>=0; l--) - if (pos[l] < open[l].size() - 1) break; - if (l < 0) { - /* we are at last item in btree. */ - if (pos[level] < open[level].size()) { - pos[level]++; /* move into add position! */ - return 0; - } - return -1; - } - - /* move right one */ - assert( pos[l] < open[l].size() ); - pos[l]++; - - /* work back down */ - for (; lpool, open[l].index_item(pos[l]).node ); - pos[l+1] = 0; // furthest left - } - return 1; - } - - // ** modifications ** - void dirty() { - for (int l=level; l>=0; l--) { - if (open[l].node->is_dirty()) break; // already dirty! (and thus parents are too) - - table->pool.dirty_node(open[l].node); - if (l > 0) - open[l-1].index_item( pos[l-1] ).node = open[l].get_id(); - else - table->root = open[0].get_id(); - } - } - private: - void repair_parents() { - // did i make a change at the start of a node? - if (pos[level] == 0) { - K key = open[level].key(0); // new key parents should have - for (int j=level-1; j>=0; j--) { - if (open[j].index_item(pos[j]).key == key) - break; /* it's the same key, we can stop fixing */ - open[j].index_item(pos[j]).key = key; - if (pos[j] > 0) break; /* last in position 0.. */ - } - } - } - - public: - void remove() { - dirty(); - - // remove from node - open[level].remove_at_pos( pos[level] ); - repair_parents(); - - // was it a key? 
- if (level == table->depth-1) - table->nkeys--; - } - - void insert(K key, V value) { - dirty(); - - // insert - open[level].insert_at_leaf_pos(pos[level], key, value); - repair_parents(); - - // was it a key? - if (level == table->depth-1) - table->nkeys++; - } - - int rotate_left() { - if (level == 0) return -1; // i am root - if (pos[level-1] == 0) return -1; // nothing to left - - Nodeptr here = open[level]; - Nodeptr parent = open[level-1]; - Nodeptr left(table->pool, parent.index_item(pos[level-1] - 1).node ); - if (left.size() == left.max_items()) return -1; // it's full - - // make both dirty - dirty(); - if (!left.node->is_dirty()) { - table->pool.dirty_node(left.node); - parent.index_item(pos[level-1]-1).node = left.get_id(); - } - - dbtout << "rotating item " << here.key(0) << " left from " << here.get_id() << " to " << left.get_id() << std::endl; - - /* add */ - if (here.node->is_leaf()) - left.append_item(here.leaf_item(0)); - else - left.append_item(here.index_item(0)); - - /* remove */ - here.remove_at_pos(0); - - /* fix parent index for me */ - parent.index_item( pos[level-1] ).key = here.key(0); - // we never have to update past immediate parent, since we're not at pos 0 - - /* adjust cursor */ - if (pos[level] > 0) - pos[level]--; - //else - //assert(1); /* if we were positioned here, we're equal */ - /* if it was 0, then the shifted item == our key, and we can stay here safely. 
*/ - return 0; - } - int rotate_right() { - if (level == 0) return -1; // i am root - if (pos[level-1] + 1 >= open[level-1].size()) return -1; // nothing to right - - Nodeptr here = open[level]; - Nodeptr parent = open[level-1]; - Nodeptr right(table->pool, parent.index_item( pos[level-1] + 1 ).node ); - if (right.size() == right.max_items()) return -1; // it's full - - // make both dirty - dirty(); - if (!right.node->is_dirty()) { - table->pool.dirty_node(right.node); - parent.index_item( pos[level-1]+1 ).node = right.get_id(); - } - - if (pos[level] == here.size()) { - /* let's just move the cursor over! */ - //if (sizeof(K) == 8) - dbtout << "shifting cursor right from " << here.get_id() << " to less-full node " << right.get_id() << std::endl; - open[level] = right; - pos[level] = 0; - pos[level-1]++; - return 0; - } - - //if (sizeof(K) == 8) - dbtout << "rotating item " << hex << here.key(here.size()-1) << dec << " right from " - << here.get_id() << " to " << right.get_id() << std::endl; - - /* add */ - if (here.is_index()) - right.insert_at_index_pos(0, - here.index_item( here.size()-1 ).key, - here.index_item( here.size()-1 ).node); - else - right.insert_at_leaf_pos(0, - here.leaf_item( here.size()-1 ).key, - here.leaf_item( here.size()-1 ).value); - - /* remove */ - here.set_size(here.size() - 1); - - /* fix parent index for right */ - parent.index_item( pos[level-1] + 1 ).key = right.key(0); - - return 0; - } - }; - - - public: - bool almost_full() { - if (2*(depth+1) > pool.get_num_free()) // worst case, plus some. - return true; - return false; - } - - int find(K key, Cursor& cursor) { - dbtout << "find " << key << std::endl; - verify("find"); - - if (depth == 0) - return Cursor::OOB; - - // init - cursor.level = 0; - - // start at root - Nodeptr curnode(pool, root); - cursor.open[0] = curnode; - - if (curnode.size() == 0) return -1; // empty! 
- - // find leaf - for (cursor.level = 0; cursor.level < depth-1; cursor.level++) { - /* if key=5, we want 2 3 [4] 6 7, or 3 4 [5] 5 6 (err to the left) */ - int left = 0; /* i >= left */ - int right = curnode.size()-1; /* i < right */ - while (left < right) { - int i = left + (right - left) / 2; - if (curnode.index_item(i).key < key) { - left = i + 1; - } else if (i && curnode.index_item(i-1).key >= key) { - right = i; - } else { - left = right = i; - break; - } - } - int i = left; - if (i && curnode.index_item(i).key > key) i--; - -#ifdef EBOFS_DEBUG_BTREE - int j; - for (j=0; j key) break; - } - if (i != j) { - dbtout << "btree binary search failed" << std::endl; - i = j; - } -#endif - - cursor.pos[cursor.level] = i; - - /* get child node */ - curnode.open(pool, cursor.open[cursor.level].index_item(i).node ); - cursor.open[cursor.level+1] = curnode; - } - - /* search leaf */ - /* if key=5, we want 2 3 4 [6] 7, or 3 4 [5] 5 6 (err to the right) */ - int left = 0; /* i >= left */ - int right = curnode.size(); /* i < right */ - while (left < right) { - int i = left + (right - left) / 2; - if (curnode.leaf_item(i).key < key) { - left = i + 1; - } else if (i && curnode.leaf_item(i-1).key >= key) { - right = i; - } else { - left = right = i; - break; - } - } - int i = left; - -#ifdef EBOFS_DEBUG_BTREE - int j; - for (j=0; j= key) break; - } - if (i != j) { - dbtout << "btree binary search failed" << std::endl; - i = j; - } -#endif - - cursor.pos[cursor.level] = i; /* first key in this node, or key insertion point */ - - if (curnode.size() >= i+1) { - if (curnode.leaf_item(i).key == key) { - return Cursor::MATCH; /* it's the actual key */ - } else { - return Cursor::INSERT; /* it's an insertion point */ - } - } - return Cursor::OOB; /* it's the end of the btree (also a valid insertion point) */ - } - - int lookup(K key) { - dbtout << "lookup" << std::endl; - Cursor cursor(this); - if (find(key, cursor) == Cursor::MATCH) - return 0; - return -1; - } - - int lookup(K 
key, V& value) { - dbtout << "lookup" << std::endl; - Cursor cursor(this); - if (find(key, cursor) == Cursor::MATCH) { - value = cursor.current().value; - return 0; - } - return -1; - } - - int insert(K key, V value) { - verify("pre-insert"); - dbtout << "insert " << key << " -> " << value << std::endl; - if (almost_full()) return -1; - - // empty? - if (nkeys == 0) { - if (root == -1) { - // create a root node (leaf!) - assert(depth == 0); - Nodeptr newroot( pool.new_node(Node::TYPE_LEAF) ); - root = newroot.get_id(); - depth++; - } - assert(depth == 1); - assert(root >= 0); - } - - // start at/near key - Cursor cursor(this); - find(key, cursor); - - // insert loop - nodeid_t nodevalue = 0; - while (1) { - - /* room in this node? */ - if (cursor.open[cursor.level].size() < cursor.open[cursor.level].max_items()) { - if (cursor.open[cursor.level].is_leaf()) - cursor.insert( key, value ); // will dirty, etc. - else { - // indices are already dirty - cursor.open[cursor.level].insert_at_index_pos(cursor.pos[cursor.level], key, nodevalue); - } - verify("insert 1"); - return 0; - } - - /* this node is full. */ - assert( cursor.open[cursor.level].size() == cursor.open[cursor.level].max_items() ); - - /* can we rotate? */ - if (false) // NO! there's a bug in here somewhere, don't to it. - if (cursor.level > 0) { - if ((cursor.pos[cursor.level-1] > 0 - && cursor.rotate_left() >= 0) || - (cursor.pos[cursor.level-1] + 1 < cursor.open[cursor.level-1].size() - && cursor.rotate_right() >= 0)) { - - if (cursor.open[cursor.level].is_leaf()) - cursor.insert( key, value ); // will dirty, etc. 
- else { - // indices are already dirty - cursor.open[cursor.level].insert_at_index_pos(cursor.pos[cursor.level], key, nodevalue); - } - verify("insert 2"); - return 0; - } - } - - /** split node **/ - - if (cursor.level == depth-1) { - dbtout << "splitting leaf " << cursor.open[cursor.level].get_id() << std::endl; - } else { - dbtout << "splitting index " << cursor.open[cursor.level].get_id() << std::endl; - } - - cursor.dirty(); - - // split - Nodeptr leftnode = cursor.open[cursor.level]; - Nodeptr newnode( pool.new_node(leftnode.node->get_type()) ); - leftnode.split( newnode ); - - /* insert our item */ - if (cursor.pos[cursor.level] > leftnode.size()) { - // not with cursor, since this node isn't added yet! - if (newnode.is_leaf()) { - newnode.insert_at_leaf_pos( cursor.pos[cursor.level] - leftnode.size(), - key, value ); - nkeys++; - } else { - newnode.insert_at_index_pos( cursor.pos[cursor.level] - leftnode.size(), - key, nodevalue ); - } - } else { - // with cursor (if leaf) - if (leftnode.is_leaf()) - cursor.insert( key, value ); - else - leftnode.insert_at_index_pos( cursor.pos[cursor.level], - key, nodevalue ); - } - - /* are we at the root? */ - if (cursor.level == 0) { - /* split root. */ - dbtout << "that split was the root " << root << std::endl; - Nodeptr newroot( pool.new_node(Node::TYPE_INDEX) ); - - /* new root node */ - newroot.set_size(2); - newroot.index_item(0).key = leftnode.key(0); - newroot.index_item(0).node = root; - newroot.index_item(1).key = newnode.key(0); - newroot.index_item(1).node = newnode.get_id(); - - /* heighten tree */ - depth++; - root = newroot.get_id(); - verify("insert 3"); - return 0; - } - - /* now insert newindex in level-1 */ - nodevalue = newnode.get_id(); - key = newnode.key(0); - cursor.level--; - cursor.pos[cursor.level]++; // ...to the right of leftnode! 
- } - } - - - int remove(K key) { - verify("pre-remove"); - dbtout << "remove " << key << std::endl; - - if (almost_full()) { - cout << "table almost full, failing" << std::endl; - assert(0); - return -1; - } - - Cursor cursor(this); - if (find(key, cursor) <= 0) { - cerr << "remove " << key << " 0x" << hex << key << dec << " .. dne" << std::endl; - g_conf.debug_ebofs = 33; - g_conf.ebofs_verify = true; - verify("remove dne"); - assert(0); - return -1; // key dne - } - - - while (1) { - cursor.remove(); - verify("post-remove"); - - // balance + adjust - - if (cursor.level == 0) { - // useless root index? - if (cursor.open[0].size() == 1 && - depth > 1) { - depth--; - root = cursor.open[0].index_item(0).node; - pool.release( cursor.open[0].node ); - } - - // note: root can be small, but not empty - else if (nkeys == 0) { - assert(cursor.open[cursor.level].size() == 0); - assert(depth == 1); - root = -1; - depth = 0; - if (cursor.open[0].node) - pool.release(cursor.open[0].node); - } - verify("remove 1"); - return 0; - } - - if (cursor.open[cursor.level].size() > cursor.open[cursor.level].min_items()) { - verify("remove 2"); - return 0; - } - - // borrow from siblings? - Nodeptr left; - Nodeptr right; - - // left? 
- if (cursor.pos[cursor.level-1] > 0) { - int left_loc = cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1] - 1).node; - left.open(pool, left_loc); - - if (left.size() > left.min_items()) { - /* move cursor left, shift right */ - cursor.pos[cursor.level] = 0; - cursor.open[cursor.level] = left; - cursor.pos[cursor.level-1]--; - cursor.rotate_right(); - verify("remove 3"); - return 0; - } - - /* combine to left */ - right = cursor.open[cursor.level]; - } - else { - assert(cursor.pos[cursor.level-1] < cursor.open[cursor.level-1].size() - 1); - int right_loc = cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1] + 1 ).node; - right.open(pool, right_loc ); - - if (right.size() > right.min_items()) { - /* move cursor right, shift an item left */ - cursor.pos[cursor.level] = 1; - cursor.open[cursor.level] = right; - cursor.pos[cursor.level-1]++; - cursor.rotate_left(); - verify("remove 4"); - return 0; - } - - /* combine to left */ - left = cursor.open[cursor.level]; - cursor.pos[cursor.level-1]++; /* move cursor to (soon-to-be-empty) right side item */ - } - - // note: cursor now points to _right_ node. - - /* combine (towards left) - * (this makes it so our next delete will be in the index - * interior, which is less scary.) - */ - dbtout << "combining nodes " << left.get_id() << " and " << right.get_id() << std::endl; - - left.merge(right); - - // dirty left + right - cursor.dirty(); // right - if (!left.node->is_dirty()) { - pool.dirty_node(left.node); - cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1]-1 ).node = left.get_id(); - } - - pool.release(right.node); - - cursor.level--; // now point to the link to the obsolete (right-side) sib */ - } - - } - - void clear(Cursor& cursor, int node_loc, int level) { - dbtout << "clear" << std::endl; - - Nodeptr node(pool, node_loc); - cursor.open[level] = node; - - // hose children? 
- if (level < depth-1) { - for (int i=0; i max) - max = node.key(i); - - if (level < depth-1) { - // index - cursor.pos[level] = i; - err += verify_sub( cursor, cursor.open[level].index_item(i).node, level+1, count, last, on ); - } else { - // leaf - count++; - last = node.key(i); - } - } - - if (level) { - // verify that parent's keys are appropriate - if (min != cursor.open[level-1].index_item(cursor.pos[level-1]).key) { - dbtout << ":: key in index node " << cursor.open[level-1].get_id() - << " != min in child " << node_loc - << "(key is " << hex << cursor.open[level-1].index_item(cursor.pos[level-1]).key - << ", min is " << min << ")" << dec << std::endl; - err++; - } - if (cursor.pos[level-1] < cursor.open[level-1].size()-1) { - if (max > cursor.open[level-1].index_item(1+cursor.pos[level-1]).key) { - dbtout << ":: next key in index node " << cursor.open[level-1].get_id() - << " < max in child " << node_loc - << "(key is " << hex << cursor.open[level-1].index_item(1+cursor.pos[level-1]).key - << ", max is " << max << ")" << dec << std::endl; - err++; - } - } - } - - if (err == 0) return err; - - // print it - char s[1000]; - strcpy(s," "); - s[level+1] = 0; - if (1) { - if (root == node_loc) { - dbtout << s << "root " << node_loc << ": " - << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << std::endl; - } else if (level == depth-1) { - dbtout << s << "leaf " << node_loc << ": " - << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << std::endl; - } else { - dbtout << s << "indx " << node_loc << ": " - << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << std::endl; - } - - if (1) { - for (int i=0; i " << node.leaf_item(i).value << dec << std::endl; - } - } - } - } - - return err; - } - - void verify(const char *on) { - if (!g_conf.ebofs_verify) - return; - - if (root == -1 && depth == 0) { - return; // empty! 
- } - - int count = 0; - Cursor cursor(this); - K last; - - int before = g_conf.debug_ebofs; - g_conf.debug_ebofs = 0; - - int err = verify_sub(cursor, root, 0, count, last, on); - if (count != nkeys) { - cerr << "** count " << count << " != nkeys " << nkeys << std::endl; - err++; - } - - g_conf.debug_ebofs = before; - - // ok? - if (err) { - cerr << "verify failure, called by '" << on << "'" << std::endl; - g_conf.debug_ebofs = 30; - // do it again, so we definitely get the dump. - int count = 0; - Cursor cursor(this); - K last; - verify_sub(cursor, root, 0, count, last, on); - assert(err == 0); - } - } - -}; - - -#endif diff --git a/branches/sage/mds/ebofs/mkfs.ebofs.cc b/branches/sage/mds/ebofs/mkfs.ebofs.cc deleted file mode 100644 index d1d5975e7fd65..0000000000000 --- a/branches/sage/mds/ebofs/mkfs.ebofs.cc +++ /dev/null @@ -1,349 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include -#include "ebofs/Ebofs.h" - - -int main(int argc, char **argv) -{ - // args - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - if (args.size() < 1) { - cerr << "usage: mkfs.ebofs [options] " << std::endl; - return -1; - } - char *filename = args[0]; - - // mkfs - Ebofs mfs(filename); - int r = mfs.mkfs(); - if (r < 0) exit(r); - - if (args.size() > 1) { // pass an extra arg of some sort to trigger the test crapola - // test-o-rama! 
- Ebofs fs(filename); - fs.mount(); - - // zillion objects - if (1) { - char crap[1024*1024]; - memset(crap, 0, 1024*1024); - bufferlist bl; - int sz = 10000; - bl.append(crap, sz); - - int n = 100000; - utime_t start = g_clock.now(); - for (int i=0; i nsec - - while (1) { - cout << g_clock.now() << " writing " << pos << "~" << sz << std::endl; - fs.write(oid, pos, sz, bl, (Context*)0); - pos += sz; - nanosleep(&ts, 0); - } - - } - - /* - if (1) { - // partial write tests - char crap[1024*1024]; - memset(crap, 0, 1024*1024); - - bufferlist small; - small.append(crap, 10); - bufferlist med; - med.append(crap, 1000); - bufferlist big; - big.append(crap, 1024*1024); - - cout << "0" << std::endl; - fs.write(10, 0, 1024*1024, big, (Context*)0); - fs.sync(); - fs.trim_buffer_cache(); - - cout << "1" << std::endl; - fs.write(10, 10, 10, small, 0); - fs.write(10, 1, 1000, med, 0); - fs.sync(); - fs.trim_buffer_cache(); - - cout << "2" << std::endl; - fs.write(10, 10, 10, small, 0); - //fs.sync(); - fs.write(10, 1, 1000, med, 0); - fs.sync(); - fs.trim_buffer_cache(); - - cout << "3" << std::endl; - fs.write(10, 1, 1000, med, 0); - fs.write(10, 10000, 10, small, 0); - fs.truncate(10, 100, 0); - fs.sync(); - fs.trim_buffer_cache(); - - cout << "4" << std::endl; - fs.remove(10); - fs.sync(); - fs.write(10, 10, 10, small, 0); - fs.sync(); - fs.write(10, 1, 1000, med, 0); - fs.sync(); - fs.truncate(10, 100, 0); - fs.write(10, 10, 10, small, 0); - fs.trim_buffer_cache(); - - - - } - - if (0) { // onode write+read test - bufferlist bl; - char crap[1024*1024]; - memset(crap, 0, 1024*1024); - bl.append(crap, 10); - - fs.write(10, 10, 0, bl, (Context*)0); - fs.umount(); - - Ebofs fs2(filename); - fs2.mount(); - fs2.read(10, 10, 0, bl); - fs2.umount(); - - return 0; - } - - - if (0) { // small write + read test - bufferlist bl; - char crap[1024*1024]; - memset(crap, 0, 1024*1024); - - object_t oid = 10; - int n = 10000; - int l = 128; - bl.append(crap, l); - - - char *p = bl.c_str(); 
- off_t o = 0; - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_NODES_H -#define __EBOFS_NODES_H - -/** nodes, node regions **/ - -#include "types.h" -#include "BlockDevice.h" -#include "include/xlist.h" -#include "include/bitmapper.h" - -/* - - disk wire memory - - free free -> free can alloc - free used -> dirty can modify - - free used used -> clean - free used free -> limbo - - used used -> clean - used free -> limbo - - - // meaningless - used free free -> free can alloc - used free used __DNE__ - - -*/ - -#undef debofs -#define debofs(x) if (x < g_conf.debug_ebofs) cout << "ebofs.nodepool." - - -class Node { - public: - // bit fields - static const int STATE_CLEAN = 1; - static const int STATE_DIRTY = 2; - - static const int ITEM_LEN = EBOFS_NODE_BYTES - sizeof(int) - sizeof(int) - sizeof(int); - - static const int TYPE_INDEX = 1; - static const int TYPE_LEAF = 2; - - protected: - nodeid_t id; - int pos_in_bitmap; // position in bitmap - int state; // use bit fields above! 
- - bufferptr bptr; - - // in disk buffer - int *type; - int *nrecs; - - public: - xlist::item xlist; // dirty - - vector children; - - Node(nodeid_t i, int pib, bufferptr& b, int s) : - id(i), pos_in_bitmap(pib), - state(s), bptr(b), xlist(this) { - setup_pointers(); - } - - void setup_pointers() { - nrecs = (int*)(bptr.c_str()); - type = (int*)(bptr.c_str() + sizeof(*nrecs)); - } - - bool do_cow() { - if (bptr.do_cow()) { - setup_pointers(); - return true; - } - return false; - } - - - // id - nodeid_t get_id() const { return id; } - void set_id(nodeid_t n) { id = n; } - int get_pos_in_bitmap() const { return pos_in_bitmap; } - void set_pos_in_bitmap(int i) { pos_in_bitmap = i; } - - // buffer - bufferptr& get_buffer() { return bptr; } - - char *item_ptr() { return bptr.c_str() + sizeof(*nrecs) + sizeof(*type); } - - // size - int size() { return *nrecs; } - void set_size(int s) { *nrecs = s; } - - // type - int& get_type() { return *type; } - void set_type(int t) { *type = t; } - bool is_index() { return *type == TYPE_INDEX; } - bool is_leaf() { return *type == TYPE_LEAF; } - - - // state - bool is_dirty() { return state == STATE_DIRTY; } - bool is_clean() { return state == STATE_CLEAN; } - - void set_state(int s) { state = s; } - -}; - - - - - -class NodePool { - protected: - hash_map > node_map; // open node map - - public: - vector region_loc; // region locations - Extent usemap_even; - Extent usemap_odd; - - buffer::ptr usemap_data; - bitmapper usemap_bits; - - protected: - // on-disk block states - int num_nodes; - int num_dirty; - int num_clean; - int num_free; - int num_limbo; - - xlist dirty_ls; - interval_set free; - interval_set limbo; - - Mutex &ebofs_lock; - Cond commit_cond; - int flushing; - - nodeid_t make_nodeid(int region, int offset) { - return region_loc[region].start + (block_t)offset; - } - int nodeid_pos_in_bitmap(nodeid_t nid) { - unsigned region; - int num = 0; - for (region = 0; - (block_t)nid < region_loc[region].start || (block_t)nid > 
region_loc[region].end(); - region++) { - //generic_dout(-20) << "node " << nid << " not in " << region << " " << region_loc[region] << dendl; - num += region_loc[region].length; - } - num += nid - region_loc[region].start; - //generic_dout(-20) << "node " << nid << " is in " << region << ", overall bitmap pos is " << num << dendl; - return num; - } - - - public: - NodePool(Mutex &el) : - num_nodes(0), - num_dirty(0), num_clean(0), num_free(0), num_limbo(0), - ebofs_lock(el), - flushing(0) {} - ~NodePool() { - // nodes - release_all(); - } - - int get_num_free() { return num_free; } - int get_num_dirty() { return num_dirty; } - int get_num_limbo() { return num_limbo; } - int get_num_clean() { return num_clean; } - int get_num_total() { return num_nodes; } - int get_num_used() { return num_clean + num_dirty; } - - int get_usemap_len(int n=0) { - if (n == 0) n = num_nodes; - return ((n-1) / 8 / EBOFS_BLOCK_SIZE) + 1; - } - - unsigned num_regions() { return region_loc.size(); } - - // the caller had better adjust usemap locations... 
- void add_region(Extent ex) { - assert(region_loc.size() < EBOFS_MAX_NODE_REGIONS); - region_loc.push_back(ex); - free.insert(ex.start, ex.length); - num_free += ex.length; - num_nodes += ex.length; - } - - void init_usemap() { - usemap_data = buffer::create_page_aligned(EBOFS_BLOCK_SIZE*usemap_even.length); - usemap_data.zero(); - usemap_bits.set_data(usemap_data.c_str(), usemap_data.length()); - } - - void expand_usemap() { - block_t have = usemap_data.length() / EBOFS_BLOCK_SIZE; - if (have < usemap_even.length) { - // use bufferlist to copy/merge two chunks - bufferlist bl; - bl.push_back(usemap_data); - bufferptr newbit = buffer::create_page_aligned(EBOFS_BLOCK_SIZE*(usemap_even.length - have)); - newbit.zero(); - bl.push_back(newbit); - assert(bl.buffers().size() == 1); - usemap_data = bl.buffers().front(); - usemap_bits.set_data(usemap_data.c_str(), usemap_data.length()); - } - } - - - - int init(struct ebofs_nodepool *np) { - // regions - assert(region_loc.empty()); - num_nodes = 0; - for (int i=0; inum_regions; i++) { - debofs(3) << "init region " << i << " at " << np->region_loc[i] << std::endl; - region_loc.push_back( np->region_loc[i] ); - num_nodes += np->region_loc[i].length; - } - - // usemap - usemap_even = np->node_usemap_even; - usemap_odd = np->node_usemap_odd; - debofs(3) << "init even map at " << usemap_even << std::endl; - debofs(3) << "init odd map at " << usemap_odd << std::endl; - - init_usemap(); - return 0; - } - - void close() { - release_all(); - - region_loc.clear(); - - num_free = 0; - num_dirty = 0; - num_clean = 0; - num_limbo = 0; - dirty_ls.clear(); - - free.clear(); - limbo.clear(); - - flushing = 0; - node_map.clear(); - } - - - // *** blocking i/o routines *** - - int read_usemap_and_clean_nodes(BlockDevice& dev, version_t epoch) { - // read map - Extent loc; - if (epoch & 1) - loc = usemap_odd; - else - loc = usemap_even; - - // usemap - dev.read(loc.start, loc.length, usemap_data); - - // nodes - unsigned region = 0; - 
unsigned region_pos = 0; - for (int i=0; iflushed_usemap(); - } - }; - - void flushed_usemap() { - ebofs_lock.Lock(); - flushing--; - if (flushing == 0) - commit_cond.Signal(); - ebofs_lock.Unlock(); - } - - public: - int write_usemap(BlockDevice& dev, version_t version) { - // alloc - Extent loc; - if (version & 1) - loc = usemap_odd; - else - loc = usemap_even; - - // write - bufferlist bl; - bufferptr bp = usemap_data.clone(); - bl.append(bp); - dev.write(loc.start, loc.length, bl, - new C_NP_FlushUsemap(this), "usemap"); - return 0; - } - - - - // *** node commit *** - private: - - class C_NP_FlushNode : public BlockDevice::callback { - NodePool *pool; - nodeid_t nid; - public: - C_NP_FlushNode(NodePool *p, nodeid_t n) : - pool(p), nid(n) {} - void finish(ioh_t ioh, int r) { - pool->flushed_node(nid); - } - }; - - void flushed_node(nodeid_t nid) { - ebofs_lock.Lock(); - flushing--; - if (flushing == 0) - commit_cond.Signal(); - ebofs_lock.Unlock(); - } - - public: - void commit_start(BlockDevice& dev, version_t version) { - debofs(20) << "ebofs.nodepool.commit_start start dirty=" << dirty_ls.size() << std::endl; - - assert(flushing == 0); - /*if (0) - for (unsigned i=0; i clean (write to disk) - while (!dirty_ls.empty()) { - Node *n = dirty_ls.front(); - assert(n); - assert(n->is_dirty()); - n->set_state(Node::STATE_CLEAN); - dirty_ls.remove(&n->xlist); - num_dirty--; - num_clean++; - - debofs(20) << "ebofs.nodepool.commit_start writing node " << n->get_id() << std::endl; - - bufferlist bl; - if (0) { - bufferptr bp = n->get_buffer().clone(); // dup it now - bl.append(bp); - } else { - bl.append(n->get_buffer()); - } - dev.write(n->get_id(), EBOFS_NODE_BLOCKS, - bl, - new C_NP_FlushNode(this, n->get_id()), "node"); - flushing++; - } - - // limbo -> free - for (map::iterator i = limbo.m.begin(); - i != limbo.m.end(); - i++) { - num_free += i->second; - num_limbo -= i->second; - free.insert(i->first, i->second); - } - limbo.clear(); - - debofs(20) << 
"ebofs.nodepool.commit_start finish" << std::endl; - } - - void commit_wait() { - while (flushing > 0) - commit_cond.Wait(ebofs_lock); - debofs(20) << "ebofs.nodepool.commit_wait finish" << std::endl; - } - - - - - - - - - - // *** nodes *** - // opened node - Node* get_node(nodeid_t nid) { - //dbtout << "pool.get " << nid << std::endl; - assert(node_map.count(nid)); - return node_map[nid]; - } - - // allocate id/block on disk. always free -> dirty. - nodeid_t alloc_id() { - // pick node id - assert(!free.empty()); - nodeid_t nid = free.start(); - free.erase(nid); - num_free--; - return nid; - } - - // new node - Node* new_node(int type) { - nodeid_t nid = alloc_id(); - debofs(15) << "ebofs.nodepool.new_node " << nid << std::endl; - - // alloc node - bufferptr bp = buffer::create_page_aligned(EBOFS_NODE_BYTES); - bp.zero(); - Node *n = new Node(nid, nodeid_pos_in_bitmap(nid), bp, Node::STATE_DIRTY); - n->set_type(type); - n->set_size(0); - - usemap_bits.set(n->get_pos_in_bitmap()); - - n->set_state(Node::STATE_DIRTY); - dirty_ls.push_back(&n->xlist); - num_dirty++; - - assert(node_map.count(nid) == 0); - node_map[nid] = n; - - return n; - } - - void release(Node *n) { - const nodeid_t nid = n->get_id(); - debofs(15) << "ebofs.nodepool.release on " << nid << std::endl; - node_map.erase(nid); - - if (n->is_dirty()) { - dirty_ls.remove(&n->xlist); - num_dirty--; - free.insert(nid); - num_free++; - usemap_bits.clear(n->get_pos_in_bitmap()); - } else if (n->is_clean()) { - limbo.insert(nid); - num_limbo++; - num_clean--; - usemap_bits.clear(n->get_pos_in_bitmap()); - } - - delete n; - assert(num_clean + num_dirty + num_limbo + num_free == num_nodes); - } - - void release_all() { - while (!node_map.empty()) { - hash_map >::iterator i = node_map.begin(); - debofs(2) << "ebofs.nodepool.release_all leftover " << i->first << " " << i->second << std::endl; - release( i->second ); - } - assert(node_map.empty()); - } - - void dirty_node(Node *n) { - // get new node id? 
- nodeid_t oldid = n->get_id(); - nodeid_t newid = alloc_id(); - debofs(15) << "ebofs.nodepool.dirty_node on " << oldid << " now " << newid << std::endl; - - // dup data? - // this only does a memcpy if there are multiple references.. - // i.e. if we are still writing the old data - if (n->do_cow()) { - //assert(0); //i'm duping on write - debofs(15) << "ebofs.nodepool.dirty_node did cow on " << oldid << " now " << newid << std::endl; - //cerr << "ebofs.nodepool.dirty_node did cow on " << oldid << " now " << newid << std::endl; - } - - // release old block - assert(n->is_clean()); - num_clean--; - limbo.insert(oldid); - num_limbo++; - usemap_bits.clear(n->get_pos_in_bitmap()); - - // rename node - node_map.erase(oldid); - n->set_id(newid); - n->set_pos_in_bitmap(nodeid_pos_in_bitmap(newid)); - node_map[newid] = n; - - // new block - n->set_state(Node::STATE_DIRTY); - dirty_ls.push_back(&n->xlist); - debofs(15) << "ebofs.nodepool.dirty_node added to dirty list, len now " << dirty_ls.size() << std::endl; - num_dirty++; - usemap_bits.set(n->get_pos_in_bitmap()); - - assert(num_clean + num_dirty + num_limbo + num_free == num_nodes); - } - - -}; - -#endif diff --git a/branches/sage/mds/ebofs/test.ebofs.cc b/branches/sage/mds/ebofs/test.ebofs.cc deleted file mode 100644 index 9a8913a52d80d..0000000000000 --- a/branches/sage/mds/ebofs/test.ebofs.cc +++ /dev/null @@ -1,226 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include -#include "ebofs/Ebofs.h" - -bool stop = false; - - -int nt = 0; -class Tester : public Thread { - Ebofs &fs; - int t; - - char b[1024*1024]; - -public: - Tester(Ebofs &e) : fs(e), t(nt) { nt++; } - void *entry() { - - while (!stop) { - object_t oid; - oid.ino = (rand() % 10) + 0x10000000; - coll_t cid = rand() % 50; - off_t off = rand() % 10000;//0;//rand() % 1000000; - off_t len = 1+rand() % 100000; - char *a = "one"; - if (rand() % 2) a = "two"; - int l = 3;//rand() % 10; - - switch (rand() % 10) { - case 0: - { - oid.rev = rand() % 10; - cout << t << " read " << hex << oid << dec << " at " << off << " len " << len << std::endl; - bufferlist bl; - fs.read(oid, off, len, bl); - int l = MIN(len,bl.length()); - if (l) { - cout << t << " got " << l << std::endl; - bl.copy(0, l, b); - char *p = b; - while (l--) { - assert(*p == 0 || - *p == (char)(off ^ oid.ino)); - off++; - p++; - } - } - } - break; - - case 1: - { - cout << t << " write " << hex << oid << dec << " at " << off << " len " << len << std::endl; - for (int j=0;j args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - // args - if (args.size() != 3) return -1; - char *filename = args[0]; - int seconds = atoi(args[1]); - int threads = atoi(args[2]); - if (!threads) threads = 1; - - cout << "dev " << filename << " .. " << threads << " threads .. " << seconds << " seconds" << std::endl; - - Ebofs fs(filename); - if (fs.mount() < 0) return -1; - - - // explicit tests - if (0) { - // verify that clone() plays nice with partial writes - object_t oid(1,1); - bufferptr bp(10000); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - fs.write(oid, 0, 10000, bl, 0); - - fs.sync(); - fs.trim_buffer_cache(); - - // induce a partial write - bufferlist bl2; - bl2.substr_of(bl, 0, 100); - fs.write(oid, 100, 100, bl2, 0); - - // clone it - object_t oid2; - oid2 = oid; - oid2.rev = 1; - fs.clone(oid, oid2, 0); - - // ... 
- if (0) { - // make sure partial still behaves after orig is removed... - fs.remove(oid, 0); - - // or i read for oid2... - bufferlist rbl; - fs.read(oid2, 0, 200, rbl); - } - if (1) { - // make sure things behave if we remove the clone - fs.remove(oid2,0); - } - } - // /explicit tests - - list ls; - for (int i=0; icreate(); - ls.push_back(t); - } - - utime_t now = g_clock.now(); - utime_t dur(seconds,0); - utime_t end = now + dur; - cout << "stop at " << end << std::endl; - while (now < end) { - sleep(1); - now = g_clock.now(); - cout << now << std::endl; - } - - cout << "stopping" << std::endl; - stop = true; - - while (!ls.empty()) { - Tester *t = ls.front(); - ls.pop_front(); - t->join(); - delete t; - } - - fs.umount(); - return 0; -} - diff --git a/branches/sage/mds/ebofs/types.h b/branches/sage/mds/ebofs/types.h deleted file mode 100644 index 749ebddb3ccec..0000000000000 --- a/branches/sage/mds/ebofs/types.h +++ /dev/null @@ -1,171 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_TYPES_H -#define __EBOFS_TYPES_H - -#include -#include "include/buffer.h" -#include "include/Context.h" -#include "common/Cond.h" - -#include -#include -#include -#include -using namespace std; -using namespace __gnu_cxx; - - -#include "include/object.h" - - -#ifndef MIN -# define MIN(a,b) ((a)<=(b) ? (a):(b)) -#endif -#ifndef MAX -# define MAX(a,b) ((a)>=(b) ? 
(a):(b)) -#endif - - -/* -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(unsigned long long __x) const { - static hash H; - return H((__x >> 32) ^ (__x & 0xffffffff)); - } - }; - - template<> struct hash< std::string > - { - size_t operator()( const std::string& x ) const - { - static hash H; - return H(x.c_str()); - } - }; -} -*/ - - -// disk -typedef uint64_t block_t; // disk location/sector/block - -static const int EBOFS_BLOCK_SIZE = 4096; -static const int EBOFS_BLOCK_BITS = 12; // 1<<12 == 4096 - -class Extent { - public: - block_t start, length; - - Extent() : start(0), length(0) {} - Extent(block_t s, block_t l) : start(s), length(l) {} - - block_t last() const { return start + length - 1; } - block_t end() const { return start + length; } -}; - -inline ostream& operator<<(ostream& out, Extent& ex) -{ - return out << ex.start << "~" << ex.length; -} - - -// tree/set nodes -//typedef int nodeid_t; -typedef int64_t nodeid_t; // actually, a block number. FIXME. - -static const unsigned EBOFS_NODE_BLOCKS = 1; -static const unsigned EBOFS_NODE_BYTES = EBOFS_NODE_BLOCKS * EBOFS_BLOCK_SIZE; -static const unsigned EBOFS_MAX_NODE_REGIONS = 10; // pick a better value! - -struct ebofs_nodepool { - Extent node_usemap_even; // for even sb versions - Extent node_usemap_odd; // for odd sb versions - - int num_regions; - Extent region_loc[EBOFS_MAX_NODE_REGIONS]; -}; - - -// objects - -typedef uint64_t coll_t; - -struct ebofs_onode { - Extent onode_loc; /* this is actually the block we live in */ - - object_t object_id; /* for kicks */ - off_t object_size; /* file size in bytes. should this be 64-bit? */ - unsigned object_blocks; - bool readonly; - - int num_collections; - int num_attr; // num attr in onode - int num_extents; /* number of extents used. 
if 0, data is in the onode */ -}; - -struct ebofs_cnode { - Extent cnode_loc; /* this is actually the block we live in */ - coll_t coll_id; - int num_attr; // num attr in cnode -}; - - -// table -struct ebofs_table { - nodeid_t root; /* root node of btree */ - int num_keys; - int depth; -}; - - -// super -typedef uint64_t version_t; - -static const unsigned EBOFS_MAGIC = 0x000EB0F5; - -static const int EBOFS_NUM_FREE_BUCKETS = 5; /* see alloc.h for bucket constraints */ -static const int EBOFS_FREE_BUCKET_BITS = 2; - - -struct ebofs_super { - uint64_t s_magic; - uint64_t fsid; - - epoch_t epoch; // version of this superblock. - - uint64_t num_blocks; /* # blocks in filesystem */ - - // some basic stats, for kicks - uint64_t free_blocks; /* unused blocks */ - uint64_t limbo_blocks; /* limbo blocks */ - //unsigned num_objects; - //unsigned num_fragmented; - - struct ebofs_nodepool nodepool; - - // tables - struct ebofs_table free_tab[EBOFS_NUM_FREE_BUCKETS]; - struct ebofs_table limbo_tab; - struct ebofs_table alloc_tab; - struct ebofs_table object_tab; // object directory - struct ebofs_table collection_tab; // collection directory - struct ebofs_table co_tab; -}; - - -#endif diff --git a/branches/sage/mds/extractosdmaps.cc b/branches/sage/mds/extractosdmaps.cc deleted file mode 100644 index bc8ec91984d1e..0000000000000 --- a/branches/sage/mds/extractosdmaps.cc +++ /dev/null @@ -1,64 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include -#include - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" - -#include "ebofs/Ebofs.h" - -#include "osd/OSD.h" -#include "mon/MonitorStore.h" - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - Ebofs eb("dev/osd0"); - eb.mount(); - MonitorStore ms("mondata/mon0"); - ms.mount(); - - epoch_t e = 1; - while (1) { - bufferlist bl; - object_t oid = OSD::get_osdmap_object_name(e); - eb.read(oid, 0, 0, bl); - if (bl.length() == 0) break; - cout << "saving epoch " << e << std::endl; - - bufferlist ibl; - oid = OSD::get_inc_osdmap_object_name(e); - eb.read(oid, 0, 0, ibl); - - ms.put_bl_sn(ibl, "osdmap", e); - ms.put_bl_sn(bl, "osdmap_full", e); - e++; - } - - eb.umount(); - //ms.umount(); - - return 0; -} diff --git a/branches/sage/mds/fakefuse.cc b/branches/sage/mds/fakefuse.cc deleted file mode 100644 index b08d00d11a5d6..0000000000000 --- a/branches/sage/mds/fakefuse.cc +++ /dev/null @@ -1,168 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/Monitor.h" - -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "client/Client.h" -#include "client/fuse.h" -#include "client/fuse_ll.h" - -#include "common/Timer.h" - -#include "msg/FakeMessenger.h" - - - - -#define NUMMDS g_conf.num_mds -#define NUMOSD g_conf.num_osd -#define NUMCLIENT g_conf.num_client - - -class C_Test : public Context { -public: - void finish(int r) { - cout << "C_Test->finish(" << r << ")" << std::endl; - } -}; -class C_Test2 : public Context { -public: - void finish(int r) { - cout << "C_Test2->finish(" << r << ")" << std::endl; - g_timer.add_event_after(2, new C_Test); - } -}; - - - -int main(int argc, char **argv) { - cerr << "fakefuse starting" << std::endl; - - // stop on our own (by default) - g_conf.mon_stop_on_last_unmount = true; - g_conf.mon_stop_with_last_mds = true; - - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - // start messenger thread - fakemessenger_startthread(); - - //g_timer.add_event_after(5.0, new C_Test2); - //g_timer.add_event_after(10.0, new C_Test); - - vector nargs; - for (unsigned i=0; imon_inst[i] = entity_inst_t(entity_name_t::MON(i), a); // hack ; see FakeMessenger.cc - } - - Monitor *mon[g_conf.num_mon]; - for (int i=0; iinit(); - for (int i=0; iinit(); - for (int i=0; iinit(); - - - // create client - Client *client[NUMCLIENT]; - for (int i=0; iinit(); - - - // start up fuse - // use my argc, argv (make sure you pass a mount point!) 
- client[i]->mount(); - - char *oldcwd = get_current_dir_name(); // note previous wd - cout << "starting fuse on pid " << getpid() << std::endl; - if (g_conf.fuse_ll) - ceph_fuse_ll_main(client[i], argc, argv); - else - ceph_fuse_main(client[i], argc, argv); - cout << "fuse finished on pid " << getpid() << std::endl; - ::chdir(oldcwd); // return to previous wd - free(oldcwd); - - client[i]->unmount(); - client[i]->shutdown(); - } - - - - // wait for it to finish - cout << "DONE -----" << std::endl; - fakemessenger_wait(); // blocks until messenger stops - - - // cleanup - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "mon/Monitor.h" -#include "client/Client.h" - -#include "client/SyntheticClient.h" - -#include "msg/FakeMessenger.h" - -#include "common/Timer.h" - - -class C_Test : public Context { -public: - void finish(int r) { - cout << "C_Test->finish(" << r << ")" << std::endl; - } -}; - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << std::endl; - exit(1); - } -}; - - -int main(int argc, char **argv) -{ - cerr << "fakesyn start" << std::endl; - - //cerr << "inode_t " << sizeof(inode_t) << std::endl; - - vector args; - argv_to_vec(argc, argv, args); - - // stop on our own (by default) - g_conf.mon_stop_on_last_unmount = true; - g_conf.mon_stop_with_last_mds = true; - - parse_config_options(args); - - int start = 0; - - parse_syn_options(args); - - vector nargs; - - for (unsigned i=0; imon_inst[i] = entity_inst_t(entity_name_t::MON(i), a); // hack ; see FakeMessenger.cc - } - - char hostname[100]; - gethostname(hostname,100); - //int pid = getpid(); - - // create mon - Monitor *mon[g_conf.num_mon]; - for 
(int i=0; iinit(); - } - for (int i=0; iinit(); - } - for (int i=0; iinit(); - if (g_conf.mds_local_osd) - mdsosd[i]->init(); - } - - - // create client(s) - Client *client[g_conf.num_client]; - SyntheticClient *syn[g_conf.num_client]; - for (int i=0; istart_thread(); - start++; - } - - - for (int i=0; ijoin_thread(); - delete syn[i]; - } - - - // wait for it to finish - fakemessenger_wait(); - - // cleanup - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __CONTEXT_H -#define __CONTEXT_H - -#include "config.h" - -#include -#include -#include - -#include - - -/* - * Context - abstract callback class - */ -class Context { - public: - virtual ~Context() {} // we want a virtual destructor!!! - virtual void finish(int r) = 0; -}; - - -/* - * finish and destroy a list of Contexts - */ -inline void finish_contexts(std::list& finished, - int result = 0) -{ - using std::cout; - using std::endl; - - list ls; - if (finished.empty()) return; - - ls.swap(finished); // swap out of place to avoid weird loops - - generic_dout(10) << ls.size() << " contexts to finish with " << result << dendl; - for (std::list::iterator it = ls.begin(); - it != ls.end(); - it++) { - Context *c = *it; - generic_dout(10) << "---- " << c << dendl; - c->finish(result); - delete c; - } -} - -class C_NoopContext : public Context { -public: - void finish(int r) { } -}; - - -/* - * C_Contexts - set of Contexts - */ -class C_Contexts : public Context { -public: - std::list contexts; - - void add(Context* c) { - contexts.push_back(c); - } - void take(std::list& ls) { - contexts.splice(contexts.end(), ls); - } - void finish(int r) { - finish_contexts(contexts, r); - } -}; - - -/* - * C_Gather - * - * BUG: does not report errors. 
- */ -class C_Gather : public Context { -public: - bool sub_finish(int r) { - //cout << "C_Gather sub_finish " << this << " got " << r << " of " << waitfor << endl; - assert(waitfor.count(r)); - waitfor.erase(r); - if (!waitfor.empty()) - return false; // more subs left - - // last one - onfinish->finish(0); - delete onfinish; - onfinish = 0; - return true; - } - - class C_GatherSub : public Context { - C_Gather *gather; - int num; - public: - C_GatherSub(C_Gather *g, int n) : gather(g), num(n) {} - void finish(int r) { - if (gather->sub_finish(num)) - delete gather; // last one! - } - }; - -private: - Context *onfinish; - std::set waitfor; - int num; - -public: - C_Gather(Context *f=0) : onfinish(f), num(0) { - //cout << "C_Gather new " << this << endl; - } - ~C_Gather() { - //cout << "C_Gather delete " << this << endl; - assert(!onfinish); - } - - void set_finisher(Context *c) { - assert(!onfinish); - onfinish = c; - } - Context *new_sub() { - num++; - waitfor.insert(num); - return new C_GatherSub(this, num); - } - - bool empty() { return num == 0; } - int get_num() { return num; } - - void finish(int r) { - assert(0); // nobody should ever call me. - } - -}; - -#endif diff --git a/branches/sage/mds/include/Distribution.h b/branches/sage/mds/include/Distribution.h deleted file mode 100644 index efc0795a72fcb..0000000000000 --- a/branches/sage/mds/include/Distribution.h +++ /dev/null @@ -1,75 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __DISTRIBUTION_H -#define __DISTRIBUTION_H - -#include -#include -using namespace std; - -class Distribution { - vector p; - vector v; - - public: - //Distribution() { - //} - - unsigned get_width() { - return p.size(); - } - - void clear() { - p.clear(); - v.clear(); - } - void add(int val, float pr) { - p.push_back(pr); - v.push_back(val); - } - - void random() { - float sum = 0.0; - for (unsigned i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __BITMAPPER_H -#define __BITMAPPER_H - -class bitmapper { - char *_data; - int _len; - - public: - bitmapper() : _data(0), _len(0) { } - bitmapper(char *data, int len) : _data(data), _len(len) { } - - void set_data(char *data, int len) { _data = data; _len = len; } - - int bytes() const { return _len; } - int bits() const { return _len * 8; } - - bool operator[](int b) const { - return get(b); - } - bool get(int b) const { - return _data[b >> 3] & (1 << (b&7)); - } - void set(int b) { - _data[b >> 3] |= 1 << (b&7); - } - void clear(int b) { - _data[b >> 3] &= ~(1 << (b&7)); - } - void toggle(int b) { - _data[b >> 3] ^= 1 << (b&7); - } -}; - -#endif diff --git a/branches/sage/mds/include/blobhash.h b/branches/sage/mds/include/blobhash.h deleted file mode 100644 index a3703e46d67f5..0000000000000 --- a/branches/sage/mds/include/blobhash.h +++ /dev/null @@ -1,47 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __BLOBHASH_H -#define __BLOBHASH_H - -#include "hash.h" - -/* -- this is to make some of the STL types work with 64 bit values, string hash keys, etc. -- added when i was using an old STL.. maybe try taking these out and see if things - compile now? -*/ - -class blobhash { -public: - size_t operator()(const char *p, unsigned len) { - static rjhash H; - size_t acc = 0; - while (len >= sizeof(size_t)) { - acc ^= *(size_t*)p; - p += sizeof(size_t); - len -= sizeof(size_t); - } - int sh = 0; - while (len) { - acc ^= (size_t)*p << sh; - sh += 8; - len--; - p++; - } - return H(acc); - } -}; - - -#endif diff --git a/branches/sage/mds/include/buffer.h b/branches/sage/mds/include/buffer.h deleted file mode 100644 index 5e48b6ce91bf6..0000000000000 --- a/branches/sage/mds/include/buffer.h +++ /dev/null @@ -1,1205 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __BUFFER_H -#define __BUFFER_H - -#include - -#include "common/Mutex.h" - - -//#define BUFFER_USE_CCPP - -#ifdef BUFFER_USE_CCPP -# include "cc++/thread.h" -#endif - -#include -#include - -using std::cout; - -#ifndef __CYGWIN__ -# include -#endif - -#define BUFFER_PAGE_SIZE 4096 // FIXME - -// -// these are in config.o -extern Mutex bufferlock; -extern long buffer_total_alloc; -// - - - - -class buffer { -private: - - /* hack for memory utilization debugging. 
*/ - static void inc_total_alloc(unsigned len) { - bufferlock.Lock(); - buffer_total_alloc += len; - bufferlock.Unlock(); - } - static void dec_total_alloc(unsigned len) { - bufferlock.Lock(); - buffer_total_alloc -= len; - bufferlock.Unlock(); - } - - /* - * an abstract raw buffer. with a reference count. - */ - class raw { - public: - char *data; - unsigned len; -#ifdef BUFFER_USE_CCPP - mutable ost::AtomicCounter nref; // mutable for const-ness of operator<< -#else - int nref; - Mutex lock; // we'll make it non-recursive. -#endif - - raw(unsigned l) : len(l), nref(0) -#ifndef BUFFER_USE_CCPP - , lock(false) -#endif - { } - raw(char *c, unsigned l) : data(c), len(l), nref(0) -#ifndef BUFFER_USE_CCPP - , lock(false) -#endif - { } - virtual ~raw() {}; - - // no copying. - raw(const raw &other); - const raw& operator=(const raw &other); - - virtual raw* clone_empty() = 0; - raw *clone() { - raw *c = clone_empty(); - memcpy(c->data, data, len); - return c; - } - - bool is_page_aligned() { - return (long)data % BUFFER_PAGE_SIZE == 0; - } - }; - - friend std::ostream& operator<<(std::ostream& out, const raw &r); - - /* - * primitive buffer types - */ - class raw_char : public raw { - public: - raw_char(unsigned l) : raw(l) { - data = new char[len]; - inc_total_alloc(len); - } - ~raw_char() { - delete[] data; - dec_total_alloc(len); - } - raw* clone_empty() { - return new raw_char(len); - } - }; - - class raw_static : public raw { - public: - raw_static(const char *d, unsigned l) : raw((char*)d, l) { } - ~raw_static() {} - raw* clone_empty() { - return new raw_char(len); - } - }; - -#ifndef __CYGWIN__ - class raw_mmap_pages : public raw { - public: - raw_mmap_pages(unsigned l) : raw(l) { - data = (char*)::mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0); - inc_total_alloc(len); - } - ~raw_mmap_pages() { - ::munmap(data, len); - dec_total_alloc(len); - } - raw* clone_empty() { - return new raw_mmap_pages(len); - } - }; - - class raw_posix_aligned : 
public raw { - public: - raw_posix_aligned(unsigned l) : raw(l) { -#ifdef DARWIN - data = (char *) valloc (len); -#else - ::posix_memalign((void**)(void*)&data, BUFFER_PAGE_SIZE, len); -#endif /* DARWIN */ - inc_total_alloc(len); - } - ~raw_posix_aligned() { - ::free((void*)data); - dec_total_alloc(len); - } - raw* clone_empty() { - return new raw_posix_aligned(len); - } - }; -#endif - -#ifdef __CYGWIN__ - class raw_hack_aligned : public raw { - char *realdata; - public: - raw_hack_aligned(unsigned l) : raw(l) { - realdata = new char[len+BUFFER_PAGE_SIZE-1]; - unsigned off = ((unsigned)realdata) % BUFFER_PAGE_SIZE; - if (off) - data = realdata + BUFFER_PAGE_SIZE - off; - else - data = realdata; - inc_total_alloc(len+BUFFER_PAGE_SIZE-1); - //cout << "hack aligned " << (unsigned)data - //<< " in raw " << (unsigned)realdata - //<< " off " << off << std::endl; - assert(((unsigned)data & (BUFFER_PAGE_SIZE-1)) == 0); - } - ~raw_hack_aligned() { - delete[] realdata; - dec_total_alloc(len+BUFFER_PAGE_SIZE-1); - } - raw* clone_empty() { - return new raw_hack_aligned(len); - } - }; -#endif - -public: - - /* - * named constructors - */ - - static raw* copy(const char *c, unsigned len) { - raw* r = new raw_char(len); - memcpy(r->data, c, len); - return r; - } - static raw* create(unsigned len) { - return new raw_char(len); - } - - static raw* create_page_aligned(unsigned len) { -#ifndef __CYGWIN__ - //return new raw_mmap_pages(len); - return new raw_posix_aligned(len); -#else - return new raw_hack_aligned(len); -#endif - } - - - /* - * a buffer pointer. references (a subsequence of) a raw buffer. - */ - class ptr { - raw *_raw; - unsigned _off, _len; - - public: - ptr() : _raw(0), _off(0), _len(0) {} - ptr(raw *r) : _raw(r), _off(0), _len(r->len) { // no lock needed; this is an unref raw. - ++r->nref; - } - ptr(unsigned l) : _off(0), _len(l) { - _raw = create(l); - ++_raw->nref; - } - ptr(char *d, unsigned l) : _off(0), _len(l) { // ditto. 
- _raw = copy(d, l); - ++_raw->nref; - } - ptr(const ptr& p) : _raw(p._raw), _off(p._off), _len(p._len) { - if (_raw) { -#ifdef BUFFER_USE_CCPP - ++_raw->nref; -#else - _raw->lock.Lock(); - ++_raw->nref; - _raw->lock.Unlock(); -#endif - } - } - ptr(const ptr& p, unsigned o, unsigned l) : _raw(p._raw), _off(p._off + o), _len(l) { - assert(o+l <= p._len); - assert(_raw); -#ifdef BUFFER_USE_CCPP - ++_raw->nref; -#else - _raw->lock.Lock(); - ++_raw->nref; - _raw->lock.Unlock(); -#endif - } - ptr& operator= (const ptr& p) { - // be careful -- we need to properly handle self-assignment. - if (p._raw) { -#ifdef BUFFER_USE_CCPP - ++p._raw->nref; // inc new -#else - p._raw->lock.Lock(); - ++p._raw->nref; // inc new - p._raw->lock.Unlock(); -#endif - } - release(); // dec (+ dealloc) old (if any) - _raw = p._raw; // change my ref - _off = p._off; - _len = p._len; - return *this; - } - ~ptr() { - release(); - } - - raw *clone() { - return _raw->clone(); - } - - bool do_cow() { - if (_raw->nref > 1) { - //std::cout << "doing cow on " << _raw << " len " << _len << std::endl; - raw *newraw = _raw->clone(); - release(); - newraw->nref++; - _raw = newraw; - return true; - } else - return false; - } - - void swap(ptr& other) { - raw *r = _raw; - unsigned o = _off; - unsigned l = _len; - _raw = other._raw; - _off = other._off; - _len = other._len; - other._raw = r; - other._off = o; - other._len = l; - } - - void release() { - if (_raw) { -#ifndef BUFFER_USE_CCPP - _raw->lock.Lock(); -#endif - if (--_raw->nref == 0) { - //cout << "hosing raw " << (void*)_raw << " len " << _raw->len << std::endl; -#ifndef BUFFER_USE_CCPP - _raw->lock.Unlock(); -#endif - delete _raw; // dealloc old (if any) - } else { -#ifndef BUFFER_USE_CCPP - _raw->lock.Unlock(); -#endif - } - _raw = 0; - } - } - - // misc - bool at_buffer_head() const { return _off == 0; } - bool at_buffer_tail() const { return _off + _len == _raw->len; } - - bool is_page_aligned() const { return (long)c_str() % BUFFER_PAGE_SIZE == 
0; } - - // accessors - raw *get_raw() const { return _raw; } - const char *c_str() const { assert(_raw); return _raw->data + _off; } - char *c_str() { assert(_raw); return _raw->data + _off; } - unsigned length() const { return _len; } - unsigned offset() const { return _off; } - unsigned start() const { return _off; } - unsigned end() const { return _off + _len; } - unsigned unused_tail_length() const { - if (_raw) - return _raw->len - (_off+_len); - else - return 0; - } - const char& operator[](unsigned n) const { - assert(_raw); - assert(n < _len); - return _raw->data[_off + n]; - } - char& operator[](unsigned n) { - assert(_raw); - assert(n < _len); - return _raw->data[_off + n]; - } - - const char *raw_c_str() const { assert(_raw); return _raw->data; } - unsigned raw_length() const { assert(_raw); return _raw->len; } - int raw_nref() const { assert(_raw); return _raw->nref; } - - void copy_out(unsigned o, unsigned l, char *dest) const { - assert(_raw); - assert(o >= 0 && o <= _len); - assert(l >= 0 && o+l <= _len); - memcpy(dest, c_str()+o, l); - } - - unsigned wasted() { - assert(_raw); - return _raw->len - _len; - } - - // modifiers - void set_offset(unsigned o) { _off = o; } - void set_length(unsigned l) { _len = l; } - - void append(const char *p, unsigned l) { - assert(_raw); - assert(l <= unused_tail_length()); - memcpy(c_str() + _len, p, l); - _len += l; - } - - void copy_in(unsigned o, unsigned l, const char *src) { - assert(_raw); - assert(o >= 0 && o <= _len); - assert(l >= 0 && o+l <= _len); - memcpy(c_str()+o, src, l); - } - - void zero() { - memset(c_str(), 0, _len); - } - - void clean() { - //raw *newraw = _raw->makesib(_len); - } - }; - - friend std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp); - - /* - * list - the useful bit! - */ - - class list { - // my private bits - list *bl; - std::list _buffers; - unsigned _len; - - ptr append_buffer; // where i put small appends. 
- - public: - class iterator { - list *bl; - std::list &ls; - unsigned off; // in bl - std::list::iterator p; - unsigned p_off; // in *p - public: - // constructor. position. - iterator(list *l, unsigned o=0) : - bl(l), ls(bl->_buffers), off(0), p(ls.begin()), p_off(0) { - advance(o); - } - iterator(list *l, unsigned o, std::list::iterator ip, unsigned po) : - bl(l), ls(bl->_buffers), off(0), p(ip), p_off(po) { } - - iterator operator=(const iterator& other) { - return iterator(bl, off, p, p_off); - } - - unsigned get_off() { return off; } - - bool end() { - return p == ls.end(); - } - - void advance(unsigned o) { - //cout << this << " advance " << o << " from " << off << " (p_off " << p_off << " in " << p->length() << ")" << std::endl; - p_off += o; - while (p_off > 0) { - assert(p != ls.end()); - if (p_off >= p->length()) { - // skip this buffer - p_off -= p->length(); - p++; - } else { - // somewhere in this buffer! - break; - } - } - off += o; - } - - void seek(unsigned o) { - //cout << this << " seek " << o << std::endl; - p = ls.begin(); - off = p_off = 0; - advance(o); - } - - char operator*() { - assert(p != ls.end()); - return (*p)[p_off]; - } - iterator& operator++() { - assert(p != ls.end()); - advance(1); - return *this; - } - - // copy data out. - // note that these all _append_ to dest! 
- - void copy(unsigned len, char *dest) { - if (p == ls.end()) seek(off); - while (len > 0) { - assert(p != ls.end()); - - unsigned howmuch = p->length() - p_off; - if (len < howmuch) howmuch = len; - p->copy_out(p_off, howmuch, dest); - dest += howmuch; - - len -= howmuch; - advance(howmuch); - } - } - - void copy(unsigned len, list &dest) { - if (p == ls.end()) seek(off); - while (len > 0) { - assert(p != ls.end()); - - unsigned howmuch = p->length() - p_off; - if (len < howmuch) howmuch = len; - dest.append(*p, p_off, howmuch); - - len -= howmuch; - advance(howmuch); - } - } - - void copy(unsigned len, std::string &dest) { - if (p == ls.end()) seek(off); - while (len > 0) { - assert(p != ls.end()); - - unsigned howmuch = p->length() - p_off; - if (len < howmuch) howmuch = len; - dest.append(p->c_str() + p_off, howmuch); - - len -= howmuch; - advance(howmuch); - } - } - - // copy data in - - void copy_in(unsigned len, const char *src) { - // copy - if (p == ls.end()) seek(off); - while (len > 0) { - assert(p != ls.end()); - - unsigned howmuch = p->length() - p_off; - if (len < howmuch) howmuch = len; - p->copy_in(p_off, howmuch, src); - - src += howmuch; - len -= howmuch; - advance(howmuch); - } - } - - void copy_in(unsigned len, const list& otherl) { - if (p == ls.end()) seek(off); - unsigned left = len; - for (std::list::const_iterator i = otherl._buffers.begin(); - i != otherl._buffers.end(); - i++) { - unsigned l = (*i).length(); - if (left < l) l = left; - copy_in(l, i->c_str()); - left -= l; - if (left == 0) break; - } - } - - }; - - private: - mutable iterator last_p; - - public: - // cons/des - list() : _len(0), last_p(this) {} - list(const list& other) : _buffers(other._buffers), _len(other._len), last_p(this) { } - list(unsigned l) : _len(0), last_p(this) { - ptr bp(l); - push_back(bp); - } - ~list() {} - - list& operator= (const list& other) { - _buffers = other._buffers; - _len = other._len; - return *this; - } - - const std::list& buffers() const { 
return _buffers; } - - void swap(list& other) { - unsigned t = _len; - _len = other._len; - other._len = t; - _buffers.swap(other._buffers); - append_buffer.swap(other.append_buffer); - } - - unsigned length() const { -#if 0 - // DEBUG: verify _len - unsigned len = 0; - for (std::list::const_iterator it = _buffers.begin(); - it != _buffers.end(); - it++) { - len += (*it).length(); - } - assert(len == _len); -#endif - return _len; - } - - bool is_page_aligned() const { - for (std::list::const_iterator it = _buffers.begin(); - it != _buffers.end(); - it++) - if (!it->is_page_aligned()) return false; - return true; - } - bool is_n_page_sized() const { - return length() % BUFFER_PAGE_SIZE == 0; - } - - // modifiers - void clear() { - _buffers.clear(); - _len = 0; - } - void push_front(ptr& bp) { - _buffers.push_front(bp); - _len += bp.length(); - } - void push_front(raw *r) { - ptr bp(r); - _buffers.push_front(bp); - _len += bp.length(); - } - void push_back(const ptr& bp) { - _buffers.push_back(bp); - _len += bp.length(); - } - void push_back(raw *r) { - ptr bp(r); - _buffers.push_back(bp); - _len += bp.length(); - } - void zero() { - for (std::list::iterator it = _buffers.begin(); - it != _buffers.end(); - it++) - it->zero(); - } - - // sort-of-like-assignment-op - void claim(list& bl) { - // free my buffers - clear(); - claim_append(bl); - } - void claim_append(list& bl) { - // steal the other guy's buffers - _len += bl._len; - _buffers.splice( _buffers.end(), bl._buffers ); - bl._len = 0; - } - - - - - iterator begin() { - return iterator(this, 0); - } - iterator end() { - return iterator(this, _len, _buffers.end(), 0); - } - - - // crope lookalikes. - // **** WARNING: this are horribly inefficient for large bufferlists. 
**** - - // data OUT - - void copy(unsigned off, unsigned len, char *dest) const { - assert(off >= 0); - assert(off + len <= length()); - if (last_p.get_off() != off) - last_p.seek(off); - last_p.copy(len, dest); - } - - void copy(unsigned off, unsigned len, list &dest) const { - assert(off >= 0); - assert(off + len <= length()); - if (last_p.get_off() != off) - last_p.seek(off); - last_p.copy(len, dest); - } - - void copy(unsigned off, unsigned len, std::string& dest) const { - if (last_p.get_off() != off) - last_p.seek(off); - return last_p.copy(len, dest); - } - - void copy_in(unsigned off, unsigned len, const char *src) { - assert(off >= 0); - assert(off + len <= length()); - - if (last_p.get_off() != off) - last_p.seek(off); - last_p.copy_in(len, src); - } - - void copy_in(unsigned off, unsigned len, const list& src) { - if (last_p.get_off() != off) - last_p.seek(off); - last_p.copy_in(len, src); - } - - - void append(const char *data, unsigned len) { - while (len > 0) { - // put what we can into the existing append_buffer. - unsigned gap = append_buffer.unused_tail_length(); - if (gap > 0) { - if (gap > len) gap = len; - //cout << "append first char is " << data[0] << ", last char is " << data[len-1] << std::endl; - append_buffer.append(data, gap); - append(append_buffer, append_buffer.end() - gap, gap); // add segment to the list - len -= gap; - data += gap; - } - if (len == 0) break; // done! - - // make a new append_buffer! - unsigned alen = BUFFER_PAGE_SIZE * (((len-1) / BUFFER_PAGE_SIZE) + 1); - append_buffer = create_page_aligned(alen); - append_buffer.set_length(0); // unused, so far. - } - } - void append(const ptr& bp) { - push_back(bp); - } - void append(const ptr& bp, unsigned off, unsigned len) { - assert(len+off <= bp.length()); - if (!_buffers.empty() && - _buffers.back().get_raw() == bp.get_raw() && - _buffers.back().end() == bp.start() + off) { - // yay contiguous with tail bp! 
- _buffers.back().set_length(_buffers.back().length()+len); - _len += len; - } else { - // add new item to list - ptr tempbp(bp, off, len); - push_back(tempbp); - } - } - void append(const list& bl) { - _len += bl._len; - for (std::list::const_iterator p = bl._buffers.begin(); - p != bl._buffers.end(); - ++p) - _buffers.push_back(*p); - } - - - /* - * get a char - */ - const char& operator[](unsigned n) { - assert(n < _len); - for (std::list::iterator p = _buffers.begin(); - p != _buffers.end(); - p++) { - if (n >= p->length()) { - n -= p->length(); - continue; - } - return (*p)[n]; - } - assert(0); - } - - /* - * return a contiguous ptr to whole bufferlist contents. - */ - char *c_str() { - if (_buffers.size() == 1) { - return _buffers.front().c_str(); // good, we're already contiguous. - } - else if (_buffers.size() == 0) { - return 0; // no buffers - } - else { - ptr newbuf = create(length()); // make one new contiguous buffer. - copy(0, length(), newbuf.c_str()); // copy myself into it. - clear(); - push_back(newbuf); - return newbuf.c_str(); // now it'll work. - } - } - - void substr_of(const list& other, unsigned off, unsigned len) { - assert(off + len <= other.length()); - clear(); - - // skip off - std::list::const_iterator curbuf = other._buffers.begin(); - while (off > 0 && - off >= curbuf->length()) { - // skip this buffer - //cout << "skipping over " << *curbuf << std::endl; - off -= (*curbuf).length(); - curbuf++; - } - assert(len == 0 || curbuf != other._buffers.end()); - - while (len > 0) { - // partial? - if (off + len < curbuf->length()) { - //cout << "copying partial of " << *curbuf << std::endl; - _buffers.push_back( ptr( *curbuf, off, len ) ); - _len += len; - break; - } - - // through end - //cout << "copying end (all?) 
of " << *curbuf << std::endl; - unsigned howmuch = curbuf->length() - off; - _buffers.push_back( ptr( *curbuf, off, howmuch ) ); - _len += howmuch; - len -= howmuch; - off = 0; - curbuf++; - } - } - - - - // funky modifer - void splice(unsigned off, unsigned len, list *claim_by=0 /*, bufferlist& replace_with */) { // fixme? - assert(off < length()); - assert(len > 0); - //cout << "splice off " << off << " len " << len << " ... mylen = " << length() << std::endl; - - // skip off - std::list::iterator curbuf = _buffers.begin(); - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - //cout << "off = " << off << " skipping over " << *curbuf << std::endl; - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! - //cout << "off = " << off << " somewhere in " << *curbuf << std::endl; - break; - } - } - assert(off >= 0); - - if (off) { - // add a reference to the front bit - // insert it before curbuf (which we'll hose) - //cout << "keeping front " << off << " of " << *curbuf << std::endl; - _buffers.insert( curbuf, ptr( *curbuf, 0, off ) ); - _len += off; - } - - while (len > 0) { - // partial? - if (off + len < (*curbuf).length()) { - //cout << "keeping end of " << *curbuf << ", losing first " << off+len << std::endl; - if (claim_by) - claim_by->append( *curbuf, off, len ); - (*curbuf).set_offset( off+len + (*curbuf).offset() ); // ignore beginning big - (*curbuf).set_length( (*curbuf).length() - (len+off) ); - _len -= off+len; - //cout << " now " << *curbuf << std::endl; - break; - } - - // hose though the end - unsigned howmuch = (*curbuf).length() - off; - //cout << "discarding " << howmuch << " of " << *curbuf << std::endl; - if (claim_by) - claim_by->append( *curbuf, off, howmuch ); - _len -= (*curbuf).length(); - _buffers.erase( curbuf++ ); - len -= howmuch; - off = 0; - } - - // splice in *replace (implement me later?) 
- } - - }; - -}; - -typedef buffer::ptr bufferptr; -typedef buffer::list bufferlist; - - -inline bool operator>(bufferlist& l, bufferlist& r) { - for (unsigned p = 0; ; p++) { - if (l.length() > p && r.length() == p) return true; - if (l.length() == p) return false; - if (l[p] > r[p]) return true; - if (l[p] < r[p]) return false; - p++; - } -} -inline bool operator>=(bufferlist& l, bufferlist& r) { - for (unsigned p = 0; ; p++) { - if (l.length() > p && r.length() == p) return true; - if (r.length() == p && l.length() == p) return true; - if (l[p] > r[p]) return true; - if (l[p] < r[p]) return false; - p++; - } -} -inline bool operator<(bufferlist& l, bufferlist& r) { - return r > l; -} -inline bool operator<=(bufferlist& l, bufferlist& r) { - return r >= l; -} - - -inline std::ostream& operator<<(std::ostream& out, const buffer::raw &r) { - return out << "buffer::raw(" << (void*)r.data << " len " << r.len << " nref " << r.nref << ")"; -} - -inline std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp) { - out << "buffer::ptr(" << bp.offset() << "~" << bp.length() - << " " << (void*)bp.c_str() - << " in raw " << (void*)bp.raw_c_str() - << " len " << bp.raw_length() - << " nref " << bp.raw_nref() << ")"; - return out; -} - -inline std::ostream& operator<<(std::ostream& out, const buffer::list& bl) { - out << "buffer::list(len=" << bl.length() << "," << std::endl; - - std::list::const_iterator it = bl.buffers().begin(); - while (it != bl.buffers().end()) { - out << "\t" << *it; - if (++it == bl.buffers().end()) break; - out << "," << std::endl; - } - out << std::endl << ")"; - return out; -} - - - - -// ---------------------------------------------------------- -// encoders - -// DEPRECATED, please use _(en|de)code_(simple|complex) - -// raw -template -inline void _encoderaw(const T& t, bufferlist& bl) -{ - bl.append((char*)&t, sizeof(t)); -} -template -inline void _decoderaw(T& t, bufferlist& bl, int& off) -{ - bl.copy(off, sizeof(t), (char*)&t); - off 
+= sizeof(t); -} - -#include -#include -#include -#include -#include -#include - -// list -template -inline void _encode(const std::list& ls, bufferlist& bl) -{ - // should i pre- or post- count? - if (!ls.empty()) { - unsigned pos = bl.length(); - uint32_t n = 0; - _encoderaw(n, bl); - for (typename std::list::const_iterator p = ls.begin(); p != ls.end(); ++p) { - n++; - _encode(*p, bl); - } - bl.copy_in(pos, sizeof(n), (char*)&n); - } else { - uint32_t n = ls.size(); // FIXME: this is slow on a list. - _encoderaw(n, bl); - for (typename std::list::const_iterator p = ls.begin(); p != ls.end(); ++p) - _encode(*p, bl); - } -} -template -inline void _decode(std::list& ls, bufferlist& bl, int& off) -{ - uint32_t n; - _decoderaw(n, bl, off); - ls.clear(); - while (n--) { - T v; - _decode(v, bl, off); - ls.push_back(v); - } -} - -// deque -template -inline void _encode(const std::deque& ls, bufferlist& bl) -{ - uint32_t n = ls.size(); - _encoderaw(n, bl); - for (typename std::deque::const_iterator p = ls.begin(); p != ls.end(); ++p) - _encode(*p, bl); -} -template -inline void _decode(std::deque& ls, bufferlist& bl, int& off) -{ - uint32_t n; - _decoderaw(n, bl, off); - ls.clear(); - while (n--) { - T v; - _decode(v, bl, off); - ls.push_back(v); - } -} - -// set -template -inline void _encode(const std::set& s, bufferlist& bl) -{ - uint32_t n = s.size(); - _encoderaw(n, bl); - for (typename std::set::const_iterator p = s.begin(); p != s.end(); ++p) - _encode(*p, bl); -} -template -inline void _decode(std::set& s, bufferlist& bl, int& off) -{ - uint32_t n; - _decoderaw(n, bl, off); - s.clear(); - while (n--) { - T v; - _decode(v, bl, off); - s.insert(v); - } -} - -// vector -template -inline void _encode(const std::vector& v, bufferlist& bl) -{ - uint32_t n = v.size(); - _encoderaw(n, bl); - for (typename std::vector::const_iterator p = v.begin(); p != v.end(); ++p) - _encode(*p, bl); -} -template -inline void _decode(std::vector& v, bufferlist& bl, int& off) -{ - 
uint32_t n; - _decoderaw(n, bl, off); - v.resize(n); - for (uint32_t i=0; i -inline void _encode(const std::map& m, bufferlist& bl) -{ - uint32_t n = m.size(); - _encoderaw(n, bl); - for (typename std::map::const_iterator p = m.begin(); p != m.end(); ++p) { - _encode(p->first, bl); - _encode(p->second, bl); - } -} -template -inline void _decode(std::map& m, bufferlist& bl, int& off) -{ - uint32_t n; - _decoderaw(n, bl, off); - m.clear(); - while (n--) { - T k; - _decode(k, bl, off); - _decode(m[k], bl, off); - } -} - -// hash_map -template -inline void _encode(const __gnu_cxx::hash_map& m, bufferlist& bl) -{ - uint32_t n = m.size(); - _encoderaw(n, bl); - for (typename __gnu_cxx::hash_map::const_iterator p = m.begin(); p != m.end(); ++p) { - _encode(p->first, bl); - _encode(p->second, bl); - } -} -template -inline void _decode(__gnu_cxx::hash_map& m, bufferlist& bl, int& off) -{ - uint32_t n; - _decoderaw(n, bl, off); - m.clear(); - while (n--) { - T k; - _decode(k, bl, off); - _decode(m[k], bl, off); - } -} - -// string -inline void _encode(const std::string& s, bufferlist& bl) -{ - uint32_t len = s.length(); - _encoderaw(len, bl); - bl.append(s.data(), len); -} -inline void _decode(std::string& s, bufferlist& bl, int& off) -{ - uint32_t len; - _decoderaw(len, bl, off); - s.clear(); - bl.copy(off, len, s); - off += len; -} - -// const char* (encode only, string compatible) -inline void _encode(const char *s, bufferlist& bl) -{ - uint32_t len = strlen(s); - _encoderaw(len, bl); - bl.append(s, len); -} - -// bufferptr (encapsulated) -inline void _encode(const buffer::ptr& bp, bufferlist& bl) -{ - uint32_t len = bp.length(); - _encoderaw(len, bl); - bl.append(bp); -} -inline void _decode(buffer::ptr& bp, bufferlist& bl, int& off) -{ - uint32_t len; - _decoderaw(len, bl, off); - - bufferlist s; - bl.copy(off, len, s); - off += len; - - if (s.buffers().size() == 1) - bp = s.buffers().front(); - else - bp = buffer::copy(s.c_str(), s.length()); -} - -// bufferlist 
(encapsulated) -inline void _encode(const bufferlist& s, bufferlist& bl) -{ - uint32_t len = s.length(); - _encoderaw(len, bl); - bl.append(s); -} -inline void _encode_destructively(bufferlist& s, bufferlist& bl) -{ - uint32_t len = s.length(); - _encoderaw(len, bl); - bl.claim_append(s); -} -inline void _decode(bufferlist& s, bufferlist& bl, int& off) -{ - uint32_t len; - _decoderaw(len, bl, off); - s.clear(); - bl.copy(off, len, s); - off += len; -} - -// base -template -inline void _encode(const T& t, bufferlist& bl) -{ - _encoderaw(t, bl); -} -template -inline void _decode(T& t, bufferlist& bl, int& off) -{ - _decoderaw(t, bl, off); -} - - - -#endif diff --git a/branches/sage/mds/include/ceph_fs.h b/branches/sage/mds/include/ceph_fs.h deleted file mode 100644 index ede0663a79158..0000000000000 --- a/branches/sage/mds/include/ceph_fs.h +++ /dev/null @@ -1,163 +0,0 @@ -/* ceph_fs.h - * - * C data types to share between kernel and userspace - */ - -#ifndef _FS_CEPH_CEPH_FS_H -#define _FS_CEPH_CEPH_FS_H - -#include - - -typedef __u64 ceph_ino_t; - - -/** - * object id - */ -struct ceph_object { - ceph_ino_t ino; /* inode "file" identifier */ - __u32 bno; /* "block" (object) in that "file" */ - __u32 rev; /* revision. normally ctime (as epoch). */ -}; -typedef struct ceph_object ceph_object_t; - - - - -/** object layout - * how objects are mapped into PGs - */ -#define CEPH_OBJECT_LAYOUT_HASH 1 -#define CEPH_OBJECT_LAYOUT_LINEAR 2 -#define CEPH_OBJECT_LAYOUT_HASHINO 3 - -/** - * pg layout -- how PGs are mapped into (sets of) OSDs - */ -#define CEPH_PG_LAYOUT_CRUSH 0 -#define CEPH_PG_LAYOUT_HASH 1 -#define CEPH_PG_LAYOUT_LINEAR 2 -#define CEPH_PG_LAYOUT_HYBRID 3 - - -/** - * ceph_file_layout - describe data layout for a file/inode - */ -struct ceph_file_layout { - /* file -> object mapping */ - __u32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple of page size. 
*/ - __u32 fl_stripe_count; /* over this many objects */ - __u32 fl_object_size; /* until objects are this big, then move to new objects */ - - /* pg -> disk layout */ - __u32 fl_object_stripe_unit; /* for per-object raid */ - - /* object -> pg layout */ - __s32 fl_pg_preferred; /* preferred primary for pg */ - __u8 fl_pg_type; /* pg type; see PG_TYPE_* */ - __u8 fl_pg_size; /* pg size (num replicas, raid stripe width, etc. */ -}; - -#define ceph_file_layout_stripe_width(l) (l.fl_stripe_unit * l.fl_stripe_count) - -/* period = bytes before i start on a new set of objects */ -#define ceph_file_layout_period(l) (l.fl_object_size * l.fl_stripe_count) - - - -/** - * placement group id - */ -#define CEPH_PG_TYPE_REP 1 -#define CEPH_PG_TYPE_RAID4 2 - -union ceph_pg { - __u64 pg64; - struct { - __s32 preferred; /* preferred primary osd */ - __u16 ps; /* placement seed */ - __u8 type; - __u8 size; - } pg; -}; -typedef union ceph_pg ceph_pg_t; - -#define ceph_pg_is_rep(pg) (pg.pg.type == CEPH_PG_TYPE_REP) -#define ceph_pg_is_raid4(pg) (pg.pg.type == CEPH_PG_TYPE_RAID4) - -/** - * object layout - * - * describe how a given object should be stored. - */ -struct ceph_object_layout { - ceph_pg_t ol_pgid; - __u32 ol_stripe_unit; -}; - - - -/** - * object extent - */ -struct ceph_object_extent { - ceph_object_t oe_oid; - __u64 oe_start; - __u64 oe_length; - struct ceph_object_layout oe_object_layout; - - /* buffer extent reverse mapping? 
*/ -}; - - - - - -/********************************************* - * message types - */ - -/* - * entity_name - */ -struct ceph_entity_name { - __u32 type; - __u32 num; -}; - -#define CEPH_ENTITY_TYPE_MON 1 -#define CEPH_ENTITY_TYPE_MDS 2 -#define CEPH_ENTITY_TYPE_OSD 3 -#define CEPH_ENTITY_TYPE_CLIENT 4 -#define CEPH_ENTITY_TYPE_ADMIN 5 - - -/* - * entity_addr - * ipv4 only for now - */ -struct ceph_entity_addr { - __u64 nonce; - __u32 port; - __u8 ipq[4]; -}; - - -struct ceph_entity_inst { - struct ceph_entity_name name; - struct ceph_entity_addr addr; -}; - - -/* - * message header - */ -struct ceph_message_header { - __u32 type; - struct ceph_entity_inst src, dst; - __u32 source_port, dest_port; - __u32 nchunks; -}; - -#endif diff --git a/branches/sage/mds/include/encodable.h b/branches/sage/mds/include/encodable.h deleted file mode 100644 index 321361866ec9b..0000000000000 --- a/branches/sage/mds/include/encodable.h +++ /dev/null @@ -1,424 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __ENCODABLE_H -#define __ENCODABLE_H - -#include "buffer.h" - -#include -#include -#include -#include -#include -#include - - -// ================================================================== -// simple - - -// raw -template -inline void _encode_raw(const T& t, bufferlist& bl) -{ - bl.append((char*)&t, sizeof(t)); -} -template -inline void _decode_raw(T& t, bufferlist::iterator &p) -{ - p.copy(sizeof(t), (char*)&t); -} - -#include -#include -#include -#include -#include -#include - -// list -template -inline void _encode_simple(const std::list& ls, bufferlist& bl) -{ - // should i pre- or post- count? - if (!ls.empty()) { - unsigned pos = bl.length(); - uint32_t n = 0; - _encode_raw(n, bl); - for (typename std::list::const_iterator p = ls.begin(); p != ls.end(); ++p) { - n++; - _encode_simple(*p, bl); - } - bl.copy_in(pos, sizeof(n), (char*)&n); - } else { - uint32_t n = ls.size(); // FIXME: this is slow on a list. - _encode_raw(n, bl); - for (typename std::list::const_iterator p = ls.begin(); p != ls.end(); ++p) - _encode_simple(*p, bl); - } -} -template -inline void _decode_simple(std::list& ls, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - ls.clear(); - while (n--) { - T v; - _decode_simple(v, p); - ls.push_back(v); - } -} - -// deque -template -inline void _encode_simple(const std::deque& ls, bufferlist& bl) -{ - uint32_t n = ls.size(); - _encode_raw(n, bl); - for (typename std::deque::const_iterator p = ls.begin(); p != ls.end(); ++p) - _encode_simple(*p, bl); -} -template -inline void _decode_simple(std::deque& ls, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - ls.clear(); - while (n--) { - T v; - _decode_simple(v, p); - ls.push_back(v); - } -} - -// set -template -inline void _encode_simple(const std::set& s, bufferlist& bl) -{ - uint32_t n = s.size(); - _encode_raw(n, bl); - for (typename std::set::const_iterator p = s.begin(); p != s.end(); ++p) - _encode_simple(*p, bl); -} -template -inline 
void _decode_simple(std::set& s, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - s.clear(); - while (n--) { - T v; - _decode_simple(v, p); - s.insert(v); - } -} - -// vector -template -inline void _encode_simple(const std::vector& v, bufferlist& bl) -{ - uint32_t n = v.size(); - _encode_raw(n, bl); - for (typename std::vector::const_iterator p = v.begin(); p != v.end(); ++p) - _encode_simple(*p, bl); -} -template -inline void _decode_simple(std::vector& v, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - v.resize(n); - for (uint32_t i=0; i -inline void _encode_simple(const std::map& m, bufferlist& bl) -{ - uint32_t n = m.size(); - _encode_raw(n, bl); - for (typename std::map::const_iterator p = m.begin(); p != m.end(); ++p) { - _encode_simple(p->first, bl); - _encode_simple(p->second, bl); - } -} -template -inline void _decode_simple(std::map& m, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - m.clear(); - while (n--) { - T k; - _decode_simple(k, p); - _decode_simple(m[k], p); - } -} - -// hash_map -template -inline void _encode_simple(const __gnu_cxx::hash_map& m, bufferlist& bl) -{ - uint32_t n = m.size(); - _encode_raw(n, bl); - for (typename __gnu_cxx::hash_map::const_iterator p = m.begin(); p != m.end(); ++p) { - _encode_simple(p->first, bl); - _encode_simple(p->second, bl); - } -} -template -inline void _decode_simple(__gnu_cxx::hash_map& m, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - m.clear(); - while (n--) { - T k; - _decode_simple(k, p); - _decode_simple(m[k], p); - } -} - -// string -inline void _encode_simple(const std::string& s, bufferlist& bl) -{ - uint32_t len = s.length(); - _encode_raw(len, bl); - bl.append(s.data(), len); -} -inline void _decode_simple(std::string& s, bufferlist::iterator& p) -{ - uint32_t len; - _decode_raw(len, p); - s.clear(); - p.copy(len, s); -} - -// const char* (encode only, string compatible) -inline void _encode_simple(const char *s, bufferlist& bl) 
-{ - uint32_t len = strlen(s); - _encode_raw(len, bl); - bl.append(s, len); -} - -// bufferptr (encapsulated) -inline void _encode_simple(const buffer::ptr& bp, bufferlist& bl) -{ - uint32_t len = bp.length(); - _encode_raw(len, bl); - bl.append(bp); -} -inline void _decode_simple(buffer::ptr& bp, bufferlist::iterator& p) -{ - uint32_t len; - _decode_raw(len, p); - - bufferlist s; - p.copy(len, s); - - if (s.buffers().size() == 1) - bp = s.buffers().front(); - else - bp = buffer::copy(s.c_str(), s.length()); -} - -// bufferlist (encapsulated) -inline void _encode_simple(const bufferlist& s, bufferlist& bl) -{ - uint32_t len = s.length(); - _encode_raw(len, bl); - bl.append(s); -} -inline void _encode_simple_destructively(bufferlist& s, bufferlist& bl) -{ - uint32_t len = s.length(); - _encode_raw(len, bl); - bl.claim_append(s); -} -inline void _decode_simple(bufferlist& s, bufferlist::iterator& p) -{ - uint32_t len; - _decode_raw(len, p); - s.clear(); - p.copy(len, s); -} - -// base -template -inline void _encode_simple(const T& t, bufferlist& bl) -{ - _encode_raw(t, bl); -} -template -inline void _decode_simple(T& t, bufferlist::iterator& p) -{ - _decode_raw(t, p); -} - - - - -// ================================================================== -// complex - -// list -template -inline void _encode_complex(const std::list& ls, bufferlist& bl) -{ - uint32_t n = ls.size(); - _encode_raw(n, bl); - for (typename std::list::const_iterator p = ls.begin(); p != ls.end(); ++p) - _encode_complex(*p, bl); -} -template -inline void _decode_complex(std::list& ls, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - ls.clear(); - while (n--) { - T v; - _decode_complex(v, p); - ls.push_back(v); - } -} - -// deque -template -inline void _encode_complex(const std::deque& ls, bufferlist& bl) -{ - uint32_t n = ls.size(); - _encode_raw(n, bl); - for (typename std::deque::const_iterator p = ls.begin(); p != ls.end(); ++p) - _encode_complex(*p, bl); -} -template -inline 
void _decode_complex(std::deque& ls, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - ls.clear(); - while (n--) { - T v; - _decode_complex(v, p); - ls.push_back(v); - } -} - -// set -template -inline void _encode_complex(const std::set& s, bufferlist& bl) -{ - uint32_t n = s.size(); - _encode_raw(n, bl); - for (typename std::set::const_iterator p = s.begin(); p != s.end(); ++p) - _encode_complex(*p, bl); -} -template -inline void _decode_complex(std::set& s, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - s.clear(); - while (n--) { - T v; - _decode_complex(v, p); - s.insert(v); - } -} - -// vector -template -inline void _encode_complex(const std::vector& v, bufferlist& bl) -{ - uint32_t n = v.size(); - _encode_raw(n, bl); - for (typename std::vector::const_iterator p = v.begin(); p != v.end(); ++p) - _encode_complex(*p, bl); -} -template -inline void _decode_complex(std::vector& v, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - v.resize(n); - for (uint32_t i=0; i -inline void _encode_complex(const std::map& m, bufferlist& bl) -{ - uint32_t n = m.size(); - _encode_raw(n, bl); - for (typename std::map::const_iterator p = m.begin(); p != m.end(); ++p) { - _encode_simple(p->first, bl); - _encode_complex(p->second, bl); - } -} -template -inline void _decode_complex(std::map& m, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - m.clear(); - while (n--) { - T k; - _decode_simple(k, p); - _decode_complex(m[k], p); - } -} - -// hash_map -template -inline void _encode_complex(const __gnu_cxx::hash_map& m, bufferlist& bl) -{ - uint32_t n = m.size(); - _encode_raw(n, bl); - for (typename __gnu_cxx::hash_map::const_iterator p = m.begin(); p != m.end(); ++p) { - _encode_simple(p->first, bl); - _encode_complex(p->second, bl); - } -} -template -inline void _decode_complex(__gnu_cxx::hash_map& m, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - m.clear(); - while (n--) { - T k; - _decode_simple(k, p); 
- _decode_complex(m[k], p); - } -} - -// base case -template -inline void _encode_complex(const T& t, bufferlist& bl) -{ - t._encode(bl); -} -template -inline void _decode_complex(T& t, bufferlist::iterator& p) -{ - t._decode(p); -} - -#endif diff --git a/branches/sage/mds/include/error.h b/branches/sage/mds/include/error.h deleted file mode 100644 index a548d9756b9b8..0000000000000 --- a/branches/sage/mds/include/error.h +++ /dev/null @@ -1,41 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define SYSERROR() syserror("At %s:%d", __FILE__, __LINE__) - -#define ASSERT(c) \ - ((c) || (exiterror("Assertion failed at %s:%d", __FILE__, __LINE__), 1)) - -/* print usage error message and exit */ -extern void userror(const char *use, const char *fmt, ...); - -/* print system error message and exit */ -extern void syserror(const char *fmt, ...); - -/* print error message and exit */ -extern void exiterror(const char *fmt, ...); - -/* print error message */ -extern void error(const char *fmt, ...); - -#ifdef __cplusplus -} // extern "C" -#endif diff --git a/branches/sage/mds/include/frag.h b/branches/sage/mds/include/frag.h deleted file mode 100644 index eac9d5bfa9e36..0000000000000 --- a/branches/sage/mds/include/frag.h +++ /dev/null @@ -1,573 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU 
Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __FRAG_H -#define __FRAG_H - -#include -#include -#include -#include -#include "buffer.h" -#include "encodable.h" - -/* - * - * the goal here is to use a binary split strategy to partition a namespace. - * frag_t represents a particular fragment. bits() tells you the size of the - * fragment, and value() it's name. this is roughly analogous to an ip address - * and netmask. - * - * fragtree_t represents an entire namespace and it's partition. it essentially - * tells you where fragments are split into other fragments, and by how much - * (i.e. by how many bits, resulting in a power of 2 number of child fragments). - * - * this vaguely resembles a btree, in that when a fragment becomes large or small - * we can split or merge, except that there is no guarantee of being balanced. - * - * presumably we are partitioning the output of a (perhaps specialized) hash - * function. - */ - -/** - * frag_t - * - * description of an individual fragment. that is, a particular piece - * of the overall namespace. - * - * this is conceptually analogous to an ip address and netmask. - * - * a value v falls "within" fragment f iff (v & f.mask()) == f.value(). - * - * we write it as v/b, where v is a value and b is the number of bits. - * 0/0 (bits==0) corresponds to the entire namespace. if we bisect that, - * we get 0/1 and 1/1. quartering gives us 0/2, 1/2, 2/2, 3/2. and so on. - * - * this makes the right most bit of v the "most significant", which is the - * opposite of what we usually see. - */ - -/* - * TODO: - * - get_first_child(), next_sibling(int parent_bits) to make (possibly partial) - * iteration efficient (see, e.g., try_assimilate_children() - * - rework frag_t so that we mask the left-most (most significant) bits instead of - * the right-most (least significant) bits. 
just because it's more intutive, and - * matches the network/netmask concept. - */ - -typedef uint32_t _frag_t; - -class frag_t { - /* encoded value. - * 8 upper bits = "bits" - * 24 lower bits = "value" - */ - _frag_t _enc; - - public: - frag_t() : _enc(0) { } - frag_t(unsigned v, unsigned b) : _enc((b << 24) + - (v & (0xffffffffULL >> (32-b)))) { } - frag_t(_frag_t e) : _enc(e) { } - - // constructors - void from_unsigned(unsigned e) { _enc = e; } - - // accessors - unsigned value() const { return _enc & 0xffffff; } - unsigned bits() const { return _enc >> 24; } - unsigned mask() const { return 0xffffffffULL >> (32-bits()); } - - operator _frag_t() const { return _enc; } - - // tests - bool contains(unsigned v) const { - return (v & mask()) == value(); - } - bool contains(frag_t sub) const { - return (sub.bits() >= bits() && // they at least as specific as us, - (sub.value() & mask()) == value()); // and they are contained by us. - } - bool is_root() const { - return bits() == 0; - } - frag_t parent() const { - assert(bits() > 0); - return frag_t(value() & (mask() >> 1), bits()-1); - } - - // splitting - void split(int nb, std::list& fragments) const { - assert(nb > 0); - unsigned nway = 1 << nb; - for (unsigned i=0; i 0 && - (value() & (1 << (bits()-1)) == 0); - } - bool is_right() const { - return - bits() > 0 && - (value() & (1 << (bits()-1)) == 1); - } - frag_t left_child() const { - return frag_t(value(), bits()+1); - } - frag_t right_child() const { - return frag_t(value() | (1<: - // frag_t f is split by b bits. - // if child frag_t does not appear, it is not split. 
- std::map _splits; - - public: - // ------------- - // basics - void swap(fragtree_t& other) { - _splits.swap(other._splits); - } - - // ------------- - // accessors - bool empty() { - return _splits.empty(); - } - int get_split(const frag_t hb) const { - std::map::const_iterator p = _splits.find(hb); - if (p == _splits.end()) - return 0; - else - return p->second; - } - - - bool is_leaf(frag_t x) const { - std::list ls; - get_leaves_under(x, ls); - //cout << "is_leaf(" << x << ") -> " << ls << std::endl; - if (!ls.empty() && - ls.front() == x && - ls.size() == 1) - return true; - return false; - } - - /** - * get_leaves -- list all leaves - */ - void get_leaves(std::list& ls) const { - return get_leaves_under_split(frag_t(), ls); - } - - /** - * get_leaves_under_split -- list all leaves under a known split point (or root) - */ - void get_leaves_under_split(frag_t under, std::list& ls) const { - std::list q; - q.push_back(under); - while (!q.empty()) { - frag_t t = q.front(); - q.pop_front(); - int nb = get_split(t); - if (nb) - t.split(nb, q); // queue up children - else - ls.push_back(t); // not spit, it's a leaf. - } - } - - /** - * get_branch -- get branch point at OR above frag @x - * - may be @x itself, if @x is a split - * - may be root (frag_t()) - */ - frag_t get_branch(frag_t x) const { - while (1) { - if (x == frag_t()) return x; // root - if (get_split(x)) return x; // found it! - x = x.parent(); - } - } - - /** - * get_branch_above -- get a branch point above frag @x - * - may be root (frag_t()) - * - may NOT be @x, even if @x is a split. - */ - frag_t get_branch_above(frag_t x) const { - while (1) { - if (x == frag_t()) return x; // root - x = x.parent(); - if (get_split(x)) return x; // found it! 
- } - } - - - /** - * get_branch_or_leaf -- get branch or leaf point parent for frag @x - * - may be @x itself, if @x is a split or leaf - * - may be root (frag_t()) - */ - frag_t get_branch_or_leaf(frag_t x) const { - frag_t branch = get_branch(x); - int nb = get_split(branch); - if (nb > 0 && // if branch is a split, and - branch.bits() + nb <= x.bits()) // one of the children is or contains x - return frag_t(x.value(), branch.bits()+nb); // then return that child (it's a leaf) - else - return branch; - } - - /** - * get_leaves_under(x, ls) -- search for any leaves fully contained by x - */ - void get_leaves_under(frag_t x, std::list& ls) const { - std::list q; - q.push_back(get_branch(x)); - while (!q.empty()) { - frag_t t = q.front(); - q.pop_front(); - if (t.bits() >= x.bits() && // if t is more specific than x, and - !x.contains(t)) // x does not contain t, - continue; // then skip - int nb = get_split(t); - if (nb) - t.split(nb, q); // queue up children - else - ls.push_back(t); // not spit, it's a leaf. - } - } - - /** - * contains(fg) -- does fragtree contain the specific frag @x - */ - bool contains(frag_t x) const { - std::list q; - q.push_back(get_branch(x)); - while (!q.empty()) { - frag_t t = q.front(); - q.pop_front(); - if (t.bits() >= x.bits() && // if t is more specific than x, and - !x.contains(t)) // x does not contain t, - continue; // then skip - int nb = get_split(t); - if (nb) { - if (t == x) return false; // it's split. - t.split(nb, q); // queue up children - } else { - if (t == x) return true; // it's there. - } - } - return false; - } - - /** - * operator[] -- map a (hash?) value to a frag - */ - frag_t operator[](unsigned v) const { - frag_t t; - while (1) { - assert(t.contains(v)); - int nb = get_split(t); - - // is this a leaf? - if (nb == 0) return t; // done. - - // pick appropriate child fragment. 
- unsigned nway = 1 << nb; - unsigned i; - for (i=0; i children; - x.split(nb, children); - int childbits = 0; - for (std::list::iterator p = children.begin(); - p != children.end(); - ++p) { - int cb = get_split(*p); - if (!cb) return; // nope. - if (childbits && cb != childbits) return; // not the same - childbits = cb; - } - // all children are split with childbits! - for (std::list::iterator p = children.begin(); - p != children.end(); - ++p) - _splits.erase(*p); - _splits[x] += childbits; - } - - bool force_to_leaf(frag_t x) { - if (is_leaf(x)) - return false; - - cout << "force_to_leaf " << x << " on " << _splits << std::endl; - - frag_t parent = get_branch_or_leaf(x); - assert(parent.bits() <= x.bits()); - cout << "parent is " << parent << std::endl; - - // do we need to split from parent to x? - if (parent.bits() < x.bits()) { - int spread = x.bits() - parent.bits(); - int nb = get_split(parent); - cout << "spread " << spread << ", parent splits by " << nb << std::endl; - if (nb == 0) { - // easy: split parent (a leaf) by the difference - cout << "splitting parent " << parent << " by spread " << spread << std::endl; - split(parent, spread); - assert(is_leaf(x)); - return true; - } - assert(nb > spread); - - // add an intermediary split - merge(parent, nb); - split(parent, spread); - - std::list subs; - parent.split(spread, subs); - for (std::list::iterator p = subs.begin(); - p != subs.end(); - ++p) { - cout << "splitting intermediate " << *p << " by " << (nb-spread) << std::endl; - split(*p, nb - spread); - } - } - - // x is now a leaf or split. - // hoover up any children. 
- std::list q; - q.push_back(x); - while (!q.empty()) { - frag_t t = q.front(); - q.pop_front(); - int nb = get_split(t); - if (nb) { - cout << "merging child " << t << " by " << nb << std::endl; - merge(t, nb); // merge this point, and - t.split(nb, q); // queue up children - } - } - - cout << "force_to_leaf done" << std::endl; - assert(is_leaf(x)); - return true; - } - - // verify that we describe a legal partition of the namespace. - void verify() const { - std::map copy; - std::list q; - q.push_back(frag_t()); - - while (1) { - frag_t cur = q.front(); - q.pop_front(); - int b = get_split(cur); - if (!b) continue; - copy[cur] = b; - cur.split(b, q); - } - - assert(copy == _splits); - } - - // encoding - void _encode(bufferlist& bl) const { - ::_encode(_splits, bl); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(_splits, bl, off); - } - void _decode(bufferlist::iterator& p) { - ::_decode_simple(_splits, p); - } - - void print(std::ostream& out) { - out << "fragtree_t("; - std::list q; - q.push_back(frag_t()); - while (!q.empty()) { - frag_t t = q.front(); - q.pop_front(); - // newline + indent? 
- if (t.bits()) { - out << std::endl; - for (unsigned i=0; i q; - q.push_back(frag_t()); - while (!q.empty()) { - frag_t t = q.front(); - q.pop_front(); - int nb = ft.get_split(t); - if (nb) { - if (t.bits()) out << ' '; - out << t << '%' << nb; - t.split(nb, q); // queue up children - } - } - } - if (1) { - std::list leaves; - ft.get_leaves(leaves); - out << leaves; - } - return out << ")"; -} - - -/** - * fragset_t -- a set of fragments - */ -class fragset_t { - std::set _set; - -public: - std::set &get() { return _set; } - std::set::iterator begin() { return _set.begin(); } - std::set::iterator end() { return _set.end(); } - - bool empty() const { return _set.empty(); } - - bool contains(frag_t f) const { - while (1) { - if (_set.count(f)) return true; - if (f.bits() == 0) return false; - f = f.parent(); - } - } - - void insert(frag_t f) { - _set.insert(f); - simplify(); - } - - void simplify() { - while (1) { - bool clean = true; - std::set::iterator p = _set.begin(); - while (p != _set.end()) { - if (!p->is_root() && - _set.count(p->get_sibling())) { - _set.erase(p->get_sibling()); - _set.insert(p->parent()); - _set.erase(p++); - clean = false; - } else { - p++; - } - } - if (clean) - break; - } - } -}; - -inline std::ostream& operator<<(std::ostream& out, fragset_t& fs) -{ - return out << "fragset_t(" << fs.get() << ")"; -} - -#endif diff --git a/branches/sage/mds/include/hash.h b/branches/sage/mds/include/hash.h deleted file mode 100644 index 0c27d3535174f..0000000000000 --- a/branches/sage/mds/include/hash.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef __CEPHHASH_H -#define __CEPHHASH_H - -// Robert Jenkins' function for mixing 32-bit values -// http://burtleburtle.net/bob/hash/evahash.html -// a, b = random bits, c = input and output - -#define hashmix(a,b,c) \ - a=a-b; a=a-c; a=a^(c>>13); \ - b=b-c; b=b-a; b=b^(a<<8); \ - c=c-a; c=c-b; c=c^(b>>13); \ - a=a-b; a=a-c; a=a^(c>>12); \ - b=b-c; b=b-a; b=b^(a<<16); \ - c=c-a; c=c-b; c=c^(b>>5); \ - a=a-b; a=a-c; 
a=a^(c>>3); \ - b=b-c; b=b-a; b=b^(a<<10); \ - c=c-a; c=c-b; c=c^(b>>15); - - -//namespace ceph { - -template struct rjhash { }; - -inline uint64_t rjhash64(uint64_t key) { - key = (~key) + (key << 21); // key = (key << 21) - key - 1; - key = key ^ (key >> 24); - key = (key + (key << 3)) + (key << 8); // key * 265 - key = key ^ (key >> 14); - key = (key + (key << 2)) + (key << 4); // key * 21 - key = key ^ (key >> 28); - key = key + (key << 31); - return key; -} - -inline uint32_t rjhash32(uint32_t a) { - a = (a+0x7ed55d16) + (a<<12); - a = (a^0xc761c23c) ^ (a>>19); - a = (a+0x165667b1) + (a<<5); - a = (a+0xd3a2646c) ^ (a<<9); - a = (a+0xfd7046c5) + (a<<3); - a = (a^0xb55a4f09) ^ (a>>16); - return a; -} - - -template<> struct rjhash { - inline size_t operator()(const uint32_t x) const { -#ifdef __LP64__ - return rjhash64(x); -#else - return rjhash32(x); -#endif - } -}; - -template<> struct rjhash { - inline size_t operator()(const uint64_t x) const { -#ifdef __LP64__ - return rjhash64(x); -#else - return rjhash32(x) ^ rjhash32(x >> 32); -#endif - } -}; - -//} - - - -#endif diff --git a/branches/sage/mds/include/interval_set.h b/branches/sage/mds/include/interval_set.h deleted file mode 100644 index bc5edbc29441d..0000000000000 --- a/branches/sage/mds/include/interval_set.h +++ /dev/null @@ -1,315 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __INTERVAL_SET_H -#define __INTERVAL_SET_H - -#include -#include -#include -using namespace std; - -#ifndef MIN -# define MIN(a,b) ((a)<=(b) ? (a):(b)) -#endif -#ifndef MAX -# define MAX(a,b) ((a)>=(b) ? 
(a):(b)) -#endif - - -template -class interval_set { - public: - map m; // map start -> len - int _size; - - // helpers - private: - typename map::const_iterator find_inc(T start) const { - typename map::const_iterator p = m.lower_bound(start); // p->first >= start - if (p != m.begin() && - (p == m.end() || p->first > start)) { - p--; // might overlap? - if (p->first + p->second <= start) - p++; // it doesn't. - } - return p; - } - - typename map::iterator find_inc_m(T start) { - typename map::iterator p = m.lower_bound(start); - if (p != m.begin() && - (p == m.end() || p->first > start)) { - p--; // might overlap? - if (p->first + p->second <= start) - p++; // it doesn't. - } - return p; - } - - typename map::const_iterator find_adj(T start) const { - typename map::const_iterator p = m.lower_bound(start); - if (p != m.begin() && - (p == m.end() || p->first > start)) { - p--; // might touch? - if (p->first + p->second < start) - p++; // it doesn't. - } - return p; - } - - typename map::iterator find_adj_m(T start) { - typename map::iterator p = m.lower_bound(start); - if (p != m.begin() && - (p == m.end() || p->first > start)) { - p--; // might touch? - if (p->first + p->second < start) - p++; // it doesn't. 
- } - return p; - } - - public: - bool operator==(const interval_set& other) const { - return m == other.m; - } - - int size() { - return _size; - } - - void clear() { - m.clear(); - _size = 0; - } - - bool contains(T i) const { - typename map::const_iterator p = find_inc(i); - if (p == m.end()) return false; - if (p->first > i) return false; - if (p->first+p->second <= i) return false; - assert(p->first <= i && p->first+p->second > i); - return true; - } - bool contains(T start, T len) const { - typename map::const_iterator p = find_inc(start); - if (p == m.end()) return false; - if (p->first > start) return false; - if (p->first+p->second <= start) return false; - assert(p->first <= start && p->first+p->second > start); - if (p->first+p->second < start+len) return false; - return true; - } - bool intersects(T start, T len) const { - interval_set a; - a.insert(start, len); - interval_set i; - i.intersection_of( *this, a ); - if (i.empty()) return false; - return true; - } - - // outer range of set - bool empty() const { - return m.empty(); - } - T start() const { - assert(!empty()); - typename map::const_iterator p = m.begin(); - return p->first; - } - T end() const { - assert(!empty()); - typename map::const_iterator p = m.end(); - p--; - return p->first+p->second; - } - - // interval start after p (where p not in set) - bool starts_after(T i) const { - assert(!contains(i)); - typename map::const_iterator p = find_inc(i); - if (p == m.end()) return false; - return true; - } - T start_after(T i) const { - assert(!contains(i)); - typename map::const_iterator p = find_inc(i); - return p->first; - } - - // interval end that contains start - T end_after(T start) const { - assert(contains(start)); - typename map::const_iterator p = find_inc(start); - return p->first+p->second; - } - - void insert(T val) { - insert(val, 1); - } - - void insert(T start, T len) { - //cout << "insert " << start << "~" << len << endl; - assert(len > 0); - _size += len; - typename 
map::iterator p = find_adj_m(start); - if (p == m.end()) { - m[start] = len; // new interval - } else { - if (p->first < start) { - - if (p->first + p->second != start) { - //cout << "p is " << p->first << "~" << p->second << ", start is " << start << ", len is " << len << endl; - assert(0); - } - - assert(p->first + p->second == start); - p->second += len; // append to end - - typename map::iterator n = p; - n++; - if (n != m.end() && - start+len == n->first) { // combine with next, too! - p->second += n->second; - m.erase(n); - } - } else { - if (start+len == p->first) { - m[start] = len + p->second; // append to front - m.erase(p); - } else { - assert(p->first > start+len); - m[start] = len; // new interval - } - } - } - } - - void erase(T val) { - erase(val, 1); - } - - void erase(T start, T len) { - typename map::iterator p = find_inc_m(start); - - _size -= len; - - assert(p != m.end()); - assert(p->first <= start); - - T before = start - p->first; - assert(p->second >= before+len); - T after = p->second - before - len; - - if (before) - p->second = before; // shorten bit before - else - m.erase(p); - if (after) - m[start+len] = after; - } - - - void subtract(const interval_set &a) { - for (typename map::const_iterator p = a.m.begin(); - p != a.m.end(); - p++) - erase(p->first, p->second); - } - - void insert(const interval_set &a) { - for (typename map::const_iterator p = a.m.begin(); - p != a.m.end(); - p++) - insert(p->first, p->second); - } - - - void intersection_of(const interval_set &a, const interval_set &b) { - assert(&a != this); - assert(&b != this); - clear(); - - typename map::const_iterator pa = a.m.begin(); - typename map::const_iterator pb = b.m.begin(); - - while (pa != a.m.end() && pb != b.m.end()) { - // passing? 
- if (pa->first + pa->second <= pb->first) - { pa++; continue; } - if (pb->first + pb->second <= pa->first) - { pb++; continue; } - T start = MAX(pa->first, pb->first); - T end = MIN(pa->first+pa->second, pb->first+pb->second); - assert(end > start); - insert(start, end-start); - if (pa->first+pa->second > pb->first+pb->second) - pb++; - else - pa++; - } - } - - void union_of(const interval_set &a, const interval_set &b) { - assert(&a != this); - assert(&b != this); - clear(); - - //cout << "union_of" << endl; - - // a - m = a.m; - - // - (a*b) - interval_set ab; - ab.intersection_of(a, b); - subtract(ab); - - // + b - insert(b); - return; - } - void union_of(const interval_set &b) { - interval_set a; - a.m.swap(m); - union_of(a, b); - } - - bool subset_of(const interval_set &big) const { - for (typename map::const_iterator i = m.begin(); - i != m.end(); - i++) - if (!big.contains(i->first, i->second)) return false; - return true; - } - -}; - -template -inline ostream& operator<<(ostream& out, const interval_set &s) { - out << "["; - for (typename map::const_iterator i = s.m.begin(); - i != s.m.end(); - i++) { - if (i != s.m.begin()) out << ","; - out << i->first << "~" << i->second; - } - out << "]"; - return out; -} - - -#endif diff --git a/branches/sage/mds/include/lru.h b/branches/sage/mds/include/lru.h deleted file mode 100644 index 40dce1aa191ab..0000000000000 --- a/branches/sage/mds/include/lru.h +++ /dev/null @@ -1,341 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __LRU_H -#define __LRU_H - -#include -#include - -#include "config.h" - - - -class LRUObject { - private: - LRUObject *lru_next, *lru_prev; - bool lru_pinned; - class LRU *lru; - class LRUList *lru_list; - - public: - LRUObject() { - lru_next = lru_prev = NULL; - lru_list = 0; - lru_pinned = false; - lru = 0; - } - - // pin/unpin item in cache - void lru_pin(); - void lru_unpin(); - bool lru_is_expireable() { return !lru_pinned; } - - friend class LRU; - friend class LRUList; -}; - - -class LRUList { - private: - LRUObject *head, *tail; - uint32_t len; - - public: - LRUList() { - head = tail = 0; - len = 0; - } - - uint32_t get_length() { return len; } - - LRUObject *get_head() { - return head; - } - LRUObject *get_tail() { - return tail; - } - - void clear() { - while (len > 0) { - remove(get_head()); - } - } - - void insert_head(LRUObject *o) { - o->lru_next = head; - o->lru_prev = NULL; - if (head) { - head->lru_prev = o; - } else { - tail = o; - } - head = o; - o->lru_list = this; - len++; - } - void insert_tail(LRUObject *o) { - o->lru_next = NULL; - o->lru_prev = tail; - if (tail) { - tail->lru_next = o; - } else { - head = o; - } - tail = o; - o->lru_list = this; - len++; - } - - void remove(LRUObject *o) { - assert(o->lru_list == this); - if (o->lru_next) - o->lru_next->lru_prev = o->lru_prev; - else - tail = o->lru_prev; - if (o->lru_prev) - o->lru_prev->lru_next = o->lru_next; - else - head = o->lru_next; - o->lru_next = o->lru_prev = NULL; - o->lru_list = 0; - assert(len>0); - len--; - } - -}; - - -class LRU { - protected: - LRUList lru_top, lru_bot, lru_pintail; - uint32_t lru_num, lru_num_pinned; - uint32_t lru_max; // max items - double lru_midpoint; - - friend class LRUObject; - //friend class MDCache; // hack - - public: - LRU(int max = 0) { - lru_num = 0; - lru_num_pinned = 0; - lru_midpoint = .6; - lru_max = max; - } - - uint32_t lru_get_size() { return lru_num; } - uint32_t lru_get_top() { return lru_top.get_length(); } - 
uint32_t lru_get_bot() { return lru_bot.get_length(); } - uint32_t lru_get_pintail() { return lru_pintail.get_length(); } - uint32_t lru_get_max() { return lru_max; } - uint32_t lru_get_num_pinned() { return lru_num_pinned; } - - void lru_set_max(uint32_t m) { lru_max = m; } - void lru_set_midpoint(float f) { lru_midpoint = f; } - - void lru_clear() { - lru_top.clear(); - lru_bot.clear(); - lru_pintail.clear(); - } - - // insert at top of lru - void lru_insert_top(LRUObject *o) { - //assert(!o->lru_in_lru); - //o->lru_in_lru = true; - assert(!o->lru); - o->lru = this; - lru_top.insert_head( o ); - lru_num++; - if (o->lru_pinned) lru_num_pinned++; - lru_adjust(); - } - - // insert at mid point in lru - void lru_insert_mid(LRUObject *o) { - //assert(!o->lru_in_lru); - //o->lru_in_lru = true; - assert(!o->lru); - o->lru = this; - lru_bot.insert_head(o); - lru_num++; - if (o->lru_pinned) lru_num_pinned++; - } - - // insert at bottom of lru - void lru_insert_bot(LRUObject *o) { - assert(!o->lru); - o->lru = this; - lru_bot.insert_tail(o); - lru_num++; - if (o->lru_pinned) lru_num_pinned++; - } - - /* - // insert at bottom of lru - void lru_insert_pintail(LRUObject *o) { - assert(!o->lru); - o->lru = this; - - assert(o->lru_pinned); - - lru_pintail.insert_head(o); - lru_num++; - lru_num_pinned += o->lru_pinned; - } - */ - - - - - // adjust top/bot balance, as necessary - void lru_adjust() { - if (!lru_max) return; - - unsigned toplen = lru_top.get_length(); - unsigned topwant = (unsigned)(lru_midpoint * ((double)lru_max - lru_num_pinned)); - while (toplen > 0 && - toplen > topwant) { - // remove from tail of top, stick at head of bot - // FIXME: this could be way more efficient by moving a whole chain of items. 
- - LRUObject *o = lru_top.get_tail(); - lru_top.remove(o); - lru_bot.insert_head(o); - toplen--; - } - } - - - // remove an item - LRUObject *lru_remove(LRUObject *o) { - // not in list - //assert(o->lru_in_lru); - //if (!o->lru_in_lru) return o; // might have expired and been removed that way. - if (!o->lru) return o; - - - if (o->lru_list == &lru_top) - lru_top.remove(o); - else if (o->lru_list == &lru_bot) - lru_bot.remove(o); - else if (o->lru_list == &lru_pintail) - lru_pintail.remove(o); - else - assert(0); - - lru_num--; - if (o->lru_pinned) lru_num_pinned--; - o->lru = 0; - return o; - } - - // touch item -- move to head of lru - bool lru_touch(LRUObject *o) { - lru_remove(o); - lru_insert_top(o); - return true; - } - - // touch item -- move to midpoint (unless already higher) - bool lru_midtouch(LRUObject *o) { - if (o->lru_list == &lru_top) return false; - - lru_remove(o); - lru_insert_mid(o); - return true; - } - - // touch item -- move to bottom - bool lru_bottouch(LRUObject *o) { - lru_remove(o); - lru_insert_bot(o); - return true; - } - - void lru_touch_entire_pintail() { - // promote entire pintail to the top lru - while (lru_pintail.get_length() > 0) { - LRUObject *o = lru_pintail.get_head(); - lru_pintail.remove(o); - lru_top.insert_tail(o); - } - } - - - // expire -- expire a single item - LRUObject *lru_get_next_expire() { - LRUObject *p; - - // look through tail of bot - while (lru_bot.get_length()) { - p = lru_bot.get_tail(); - if (!p->lru_pinned) return p; - - // move to pintail - lru_bot.remove(p); - lru_pintail.insert_head(p); - } - - // ok, try head then - while (lru_top.get_length()) { - p = lru_top.get_tail(); - if (!p->lru_pinned) return p; - - // move to pintail - lru_top.remove(p); - lru_pintail.insert_head(p); - } - - // no luck! 
- return NULL; - } - - LRUObject *lru_expire() { - LRUObject *p = lru_get_next_expire(); - if (p) - return lru_remove(p); - return NULL; - } - - - void lru_status() { - generic_dout(10) << "lru: " << lru_num << " items, " << lru_top.get_length() << " top, " << lru_bot.get_length() << " bot, " << lru_pintail.get_length() << " pintail" << dendl; - } - -}; - - -inline void LRUObject::lru_pin() -{ - lru_pinned = true; - if (lru) lru->lru_num_pinned++; -} -inline void LRUObject::lru_unpin() { - lru_pinned = false; - if (lru) { - lru->lru_num_pinned--; - - // move from pintail -> bot - if (lru_list == &lru->lru_pintail) { - lru->lru_pintail.remove(this); - lru->lru_bot.insert_tail(this); - } - } -} - -#endif diff --git a/branches/sage/mds/include/object.h b/branches/sage/mds/include/object.h deleted file mode 100644 index 3b8ac05a86b38..0000000000000 --- a/branches/sage/mds/include/object.h +++ /dev/null @@ -1,99 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __OBJECT_H -#define __OBJECT_H - -#include - -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "hash.h" - -typedef uint32_t objectrev_t; - -struct object_t { - static const uint32_t MAXREV = 0xffffffffU; - - uint64_t ino; // "file" identifier - uint32_t bno; // "block" in that "file" - objectrev_t rev; // revision. normally ctime (as epoch). 
- - object_t() : ino(0), bno(0), rev(0) {} - object_t(uint64_t i, uint32_t b) : ino(i), bno(b), rev(0) {} - object_t(uint64_t i, uint32_t b, uint32_t r) : ino(i), bno(b), rev(r) {} -}; - - -inline bool operator==(const object_t l, const object_t r) { - return (l.ino == r.ino) && (l.bno == r.bno) && (l.rev == r.rev); -} -inline bool operator!=(const object_t l, const object_t r) { - return (l.ino != r.ino) || (l.bno != r.bno) || (l.rev != r.rev); -} -inline bool operator>(const object_t l, const object_t r) { - if (l.ino > r.ino) return true; - if (l.ino < r.ino) return false; - if (l.bno > r.bno) return true; - if (l.bno < r.bno) return false; - if (l.rev > r.rev) return true; - return false; -} -inline bool operator<(const object_t l, const object_t r) { - if (l.ino < r.ino) return true; - if (l.ino > r.ino) return false; - if (l.bno < r.bno) return true; - if (l.bno > r.bno) return false; - if (l.rev < r.rev) return true; - return false; -} -inline bool operator>=(const object_t l, const object_t r) { - return !(l < r); -} -inline bool operator<=(const object_t l, const object_t r) { - return !(l > r); -} -inline ostream& operator<<(ostream& out, const object_t o) { - out << hex << o.ino << '.'; - out.setf(ios::right); - out.fill('0'); - out << setw(8) << o.bno << dec; - out.unsetf(ios::right); - if (o.rev) - out << '.' 
<< o.rev; - return out; -} - - - - - -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const object_t &r) const { - static rjhash H; - static rjhash I; - //static hash H; - //static hash I; - return H(r.ino) ^ I(r.bno) ^ I(r.rev); - } - }; - -} -#endif diff --git a/branches/sage/mds/include/rangeset.h b/branches/sage/mds/include/rangeset.h deleted file mode 100644 index 547ea3ab72274..0000000000000 --- a/branches/sage/mds/include/rangeset.h +++ /dev/null @@ -1,253 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __RANGESET_H -#define __RANGESET_H - -/* - * - * my first container with iterator! it's pretty ugly. - * - */ - -#include -#include -#include -using namespace std; - -//typedef int T; - -template -struct _rangeset_base { - map ranges; // pair(first,last) (inclusive, e.g. [first,last]) - - typedef typename map::iterator mapit; - - // get iterator for range including val. or ranges.end(). 
- mapit get_range_for(T val) { - mapit it = ranges.lower_bound(val); - if (it == ranges.end()) { - // search backwards - typename map::reverse_iterator it = ranges.rbegin(); - if (it == ranges.rend()) return ranges.end(); - if (it->first <= val && it->second >= val) - return ranges.find(it->first); - return ranges.end(); - } else { - if (it->first == val) return - it--; - if (it->first <= val && it->second >= val) - return it; - return ranges.end(); - } - } - -}; - - -template -class rangeset_iterator : - public std::iterator -{ - //typedef typename map::iterator mapit; - - map ranges; - typename map::iterator it; - T current; - -public: - // cons - rangeset_iterator() {} - - rangeset_iterator(typename map::iterator& it, map& ranges) { - this->ranges = ranges; - this->it = it; - if (this->it != ranges.end()) - current = it->first; - } - - bool operator==(rangeset_iterator rit) { - return (it == rit.it && rit.current == current); - } - bool operator!=(rangeset_iterator rit) { - return (it != rit.it) || (rit.current != current); - } - - T& operator*() { - return current; - } - - rangeset_iterator operator++(int) { - if (current < it->second) - current++; - else { - it++; - if (it != ranges.end()) - current = it->first; - } - - return *this; - } -}; - - -template -class rangeset -{ - typedef typename map::iterator map_iterator; - - _rangeset_base theset; - inodeno_t _size; - -public: - rangeset() { _size = 0; } - typedef rangeset_iterator iterator; - - iterator begin() { - map_iterator it = theset.ranges.begin(); - return iterator(it, theset.ranges); - } - - iterator end() { - map_iterator it = theset.ranges.end(); - return iterator(it, theset.ranges); - } - - map_iterator map_begin() { - return theset.ranges.begin(); - } - map_iterator map_end() { - return theset.ranges.end(); - } - int map_size() { - return theset.ranges.size(); - } - - void map_insert(T v1, T v2) { - theset.ranges.insert(pair(v1,v2)); - _size += v2 - v1+1; - } - - - // ... 
- bool contains(T val) { - if (theset.get_range_for(val) == theset.ranges.end()) return false; - assert(!empty()); - return true; - } - - void insert(T val) { - assert(!contains(val)); - - map_iterator left = theset.get_range_for(val-1); - map_iterator right = theset.get_range_for(val+1); - - if (left != theset.ranges.end() && - right != theset.ranges.end()) { - // join! - left->second = right->second; - theset.ranges.erase(right); - _size++; - return; - } - - if (left != theset.ranges.end()) { - // add to left range - left->second = val; - _size++; - return; - } - - if (right != theset.ranges.end()) { - // add to right range - theset.ranges.insert(pair(val, right->second)); - theset.ranges.erase(val+1); - _size++; - return; - } - - // new range - theset.ranges.insert(pair(val,val)); - _size++; - return; - } - - unsigned size() { - return size(); - } - - bool empty() { - if (theset.ranges.empty()) { - assert(_size == 0); - return true; - } - assert(_size>0); - return false; - } - - - T first() { - assert(!empty()); - map_iterator it = theset.ranges.begin(); - return it->first; - } - - void erase(T val) { - assert(contains(val)); - map_iterator it = theset.get_range_for(val); - assert(it != theset.ranges.end()); - - // entire range - if (val == it->first && val == it->second) { - theset.ranges.erase(it); - _size--; - return; - } - - // beginning - if (val == it->first) { - theset.ranges.insert(pair(val+1, it->second)); - theset.ranges.erase(it); - _size--; - return; - } - - // end - if (val == it->second) { - it->second = val-1; - _size--; - return; - } - - // middle split - theset.ranges.insert(pair(it->first, val-1)); - theset.ranges.insert(pair(val+1, it->second)); - theset.ranges.erase(it); - _size--; - return; - } - - void dump() { - for (typename map::iterator it = theset.ranges.begin(); - it != theset.ranges.end(); - it++) { - cout << " " << it->first << "-" << it->second << endl; - } - } - -}; - - -#endif diff --git a/branches/sage/mds/include/statlite.h 
b/branches/sage/mds/include/statlite.h deleted file mode 100644 index a9c0433e4a4e8..0000000000000 --- a/branches/sage/mds/include/statlite.h +++ /dev/null @@ -1,72 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -#ifndef _STATLITE_H -#define _STATLITE_H - -extern "C" { - -#include -#include -#include -#include -#include - -struct statlite { - dev_t st_dev; /* device */ - ino_t st_ino; /* inode */ - mode_t st_mode; /* protection */ - nlink_t st_nlink; /* number of hard links */ - uid_t st_uid; /* user ID of owner */ - gid_t st_gid; /* group ID of owner */ - dev_t st_rdev; /* device type (if inode device)*/ - unsigned long st_litemask; /* bit mask for optional fields */ - /***************************************************************/ - /**** Remaining fields are optional according to st_litemask ***/ - off_t st_size; /* total size, in bytes */ - blksize_t st_blksize; /* blocksize for filesystem I/O */ - blkcnt_t st_blocks; /* number of blocks allocated */ - struct timespec st_atim; /* Time of last access. */ - struct timespec st_mtim; /* Time of last modification. */ - struct timespec st_ctim; /* Time of last status change. 
*/ - //time_t st_atime; /* time of last access */ - //time_t st_mtime; /* time of last modification */ - //time_t st_ctime; /* time of last change */ -}; - -#define S_STATLITE_SIZE 1 -#define S_STATLITE_BLKSIZE 2 -#define S_STATLITE_BLOCKS 4 -#define S_STATLITE_ATIME 8 -#define S_STATLITE_MTIME 16 -#define S_STATLITE_CTIME 32 - -#define S_REQUIRESIZE(m) (m | S_STATLITE_SIZE) -#define S_REQUIREBLKSIZE(m) (m | S_STATLITE_BLKSIZE) -#define S_REQUIREBLOCKS(m) (m | S_STATLITE_BLOCKS) -#define S_REQUIREATIME(m) (m | S_STATLITE_ATIME) -#define S_REQUIREMTIME(m) (m | S_STATLITE_MTIME) -#define S_REQUIRECTIME(m) (m | S_STATLITE_CTIME) - -#define S_ISVALIDSIZE(m) (m & S_STATLITE_SIZE) -#define S_ISVALIDBLKSIZE(m) (m & S_STATLITE_BLKSIZE) -#define S_ISVALIDBLOCKS(m) (m & S_STATLITE_BLOCKS) -#define S_ISVALIDATIME(m) (m & S_STATLITE_ATIME) -#define S_ISVALIDMTIME(m) (m & S_STATLITE_MTIME) -#define S_ISVALIDCTIME(m) (m & S_STATLITE_CTIME) - - -// readdirplus etc. - -struct dirent_plus { - struct dirent d_dirent; /* dirent struct for this entry */ - struct stat d_stat; /* attributes for this entry */ - int d_stat_err;/* errno for d_stat, or 0 */ -}; -struct dirent_lite { - struct dirent d_dirent; /* dirent struct for this entry */ - struct statlite d_stat; /* attributes for this entry */ - int d_stat_err;/* errno for d_stat, or 0 */ -}; - -} -#endif diff --git a/branches/sage/mds/include/triple.h b/branches/sage/mds/include/triple.h deleted file mode 100644 index e9f43b9315d21..0000000000000 --- a/branches/sage/mds/include/triple.h +++ /dev/null @@ -1,28 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __TRIPLE_H -#define __TRIPLE_H - -template -class triple { - public: - A first; - B second; - C third; - triple() {} - triple(A f, B s, C t) : first(f), second(s), third(t) {} -}; - -#endif diff --git a/branches/sage/mds/include/types.h b/branches/sage/mds/include/types.h deleted file mode 100644 index e394a27559636..0000000000000 --- a/branches/sage/mds/include/types.h +++ /dev/null @@ -1,294 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_TYPES_H -#define __MDS_TYPES_H - -extern "C" { -#include -#include -#include -#include -#include "statlite.h" -} - -#include -#include -#include -#include -#include -#include -#include - -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "ceph_fs.h" - - -#include "object.h" -#include "utime.h" - - -#ifndef MIN -# define MIN(a,b) ((a) < (b) ? (a):(b)) -#endif -#ifndef MAX -# define MAX(a,b) ((a) > (b) ? 
(a):(b)) -#endif - - -// -- stl crap -- - -namespace __gnu_cxx { - template<> struct hash< std::string > - { - size_t operator()( const std::string& x ) const - { - static hash H; - return H(x.c_str()); - } - }; - -#ifndef __LP64__ - template<> struct hash { - size_t operator()(int64_t __x) const { - static hash H; - return H((__x >> 32) ^ (__x & 0xffffffff)); - } - }; - template<> struct hash { - size_t operator()(uint64_t __x) const { - static hash H; - return H((__x >> 32) ^ (__x & 0xffffffff)); - } - }; -#endif - -} - - -/* - * comparators for stl containers - */ -// for hash_map: -// hash_map, eqstr> vals; -struct eqstr -{ - bool operator()(const char* s1, const char* s2) const - { - return strcmp(s1, s2) == 0; - } -}; - -// for set, map -struct ltstr -{ - bool operator()(const char* s1, const char* s2) const - { - return strcmp(s1, s2) < 0; - } -}; - - - -// ---------------------- -// some basic types - -typedef uint64_t tid_t; // transaction id -typedef uint64_t version_t; -typedef uint32_t epoch_t; // map epoch (32bits -> 13 epochs/second for 10 years) - - - -#define O_LAZY 01000000 - - - -typedef ceph_file_layout FileLayout; - - -// -------------------------------------- -// inode - -typedef __uint64_t _inodeno_t; - -struct inodeno_t { - _inodeno_t val; - inodeno_t() : val(0) {} - inodeno_t(_inodeno_t v) : val(v) {} - inodeno_t operator+=(inodeno_t o) { val += o.val; return *this; } - operator _inodeno_t() const { return val; } -}; - -inline ostream& operator<<(ostream& out, inodeno_t ino) { - return out << hex << ino.val << dec; -} - -namespace __gnu_cxx { - template<> struct hash< inodeno_t > - { - size_t operator()( const inodeno_t& x ) const - { - static rjhash H; - return H(x.val); - } - }; -} - - -#define FILE_MODE_R 1 -#define FILE_MODE_W 2 -#define FILE_MODE_RW (1|2) -#define FILE_MODE_LAZY 4 - -/** stat masks - */ -#define STAT_MASK_INO 1 // inode nmber -#define STAT_MASK_TYPE 2 // file type bits of the mode -#define STAT_MASK_BASE 4 // layout, 
symlink value -#define STAT_MASK_AUTH 8 // uid, gid, mode -#define STAT_MASK_LINK 16 // nlink, anchored -#define STAT_MASK_FILE 32 // mtime, size. - -#define STAT_MASK_ALL 63 - -#define STAT_MASK_SIZE STAT_MASK_FILE // size, blksize, blocks -#define STAT_MASK_MTIME STAT_MASK_FILE // mtime -#define STAT_MASK_ATIME STAT_MASK_FILE // atime -#define STAT_MASK_CTIME (STAT_MASK_FILE|STAT_MASK_AUTH|STAT_MASK_LINK) // ctime - -inline int DT_TO_MODE(int dt) { - return dt << 12; -} -inline unsigned char MODE_TO_DT(int mode) { - return mode >> 12; -} - -struct inode_t { - // base (immutable) - inodeno_t ino; - FileLayout layout; // ?immutable? - uint32_t rdev; // if special file - - // affected by any inode change... - utime_t ctime; // inode change time - - // perm (namespace permissions) - uint32_t mode; - uid_t uid; - gid_t gid; - - // nlink - int32_t nlink; - bool anchored; // auth only? - - // file (data access) - int64_t size, max_size, allocated_size; - utime_t mtime; // file data modify time. - utime_t atime; // file data access time. 
- utime_t rmtime; // recursive mtime - - // special stuff - version_t version; // auth only - version_t file_data_version; // auth only - - // file type - bool is_symlink() { return (mode & S_IFMT) == S_IFLNK; } - bool is_dir() { return (mode & S_IFMT) == S_IFDIR; } - bool is_file() { return (mode & S_IFMT) == S_IFREG; } -}; - - - - - - - -// dentries -#define MAX_DENTRY_LEN 255 - - - - -// -- io helpers -- - -template -inline ostream& operator<<(ostream& out, pair v) { - return out << v.first << "," << v.second; -} - -template -inline ostream& operator<<(ostream& out, vector& v) { - out << "["; - for (unsigned i=0; i -inline ostream& operator<<(ostream& out, const list& ilist) { - for (typename list::const_iterator it = ilist.begin(); - it != ilist.end(); - it++) { - if (it != ilist.begin()) out << ","; - out << *it; - } - return out; -} - -template -inline ostream& operator<<(ostream& out, const set& iset) { - for (typename set::const_iterator it = iset.begin(); - it != iset.end(); - it++) { - if (it != iset.begin()) out << ","; - out << *it; - } - return out; -} - -template -inline ostream& operator<<(ostream& out, const multiset& iset) { - for (typename multiset::const_iterator it = iset.begin(); - it != iset.end(); - it++) { - if (it != iset.begin()) out << ","; - out << *it; - } - return out; -} - -template -inline ostream& operator<<(ostream& out, const map& m) -{ - out << "{"; - for (typename map::const_iterator it = m.begin(); - it != m.end(); - it++) { - if (it != m.begin()) out << ","; - out << it->first << "=" << it->second; - } - out << "}"; - return out; -} - - - -#endif diff --git a/branches/sage/mds/include/uofs.h b/branches/sage/mds/include/uofs.h deleted file mode 100644 index a4673aaa616ea..0000000000000 --- a/branches/sage/mds/include/uofs.h +++ /dev/null @@ -1,51 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 
2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -/* - * uofs.h - * - * user-level object-based file system - */ - - #ifndef _UOFS_H_ - #define _UOFS_H_ - - #include - #include - #include - - - int device_open(char *path, int xflags); - void device_findsizes(int fd, long long *sz, int *bsz); - - int uofs_format(int bdev_id, int donode_size, int bd_ratio, int reg_size, int sb_size, int lb_size, - int nr_hash_table_buckets, int delay_allocation, int flush_interval); - - int uofs_mount(int bdev_id); - void uofs_shutdown(void); - - int uofs_read(long long oid, void *buf, off_t offset, size_t count); - int uofs_write(long long oid, void *buf, off_t offset, size_t count); - int uofs_del(long long oid); - int uofs_sync(long long oid); - int uofs_exist(long long oid); - - int uofs_get_size(long long oid); - - void uofs_superblock_printout(void); - int get_large_object_pages(void); - - int uofs_buffer_size(void); - #endif diff --git a/branches/sage/mds/include/utime.h b/branches/sage/mds/include/utime.h deleted file mode 100644 index 7fef5a7f930d2..0000000000000 --- a/branches/sage/mds/include/utime.h +++ /dev/null @@ -1,149 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __UTIME_H -#define __UTIME_H - -#include -#include -#include - -// -------- -// utime_t - -typedef struct timeval _utime_t; - -class utime_t { - private: - struct timeval tv; - - struct timeval& timeval() { return tv; } - friend class Clock; - - - public: - void normalize() { - if (tv.tv_usec > 1000*1000) { - tv.tv_sec += tv.tv_usec / (1000*1000); - tv.tv_usec %= 1000*1000; - } - } - - // cons - utime_t() { tv.tv_sec = 0; tv.tv_usec = 0; normalize(); } - //utime_t(time_t s) { tv.tv_sec = s; tv.tv_usec = 0; } - utime_t(time_t s, int u) { tv.tv_sec = s; tv.tv_usec = u; normalize(); } - utime_t(const _utime_t &v) : tv(v) {} - /* - utime_t(double d) { - tv.tv_sec = (time_t)trunc(d); - tv.tv_usec = (__suseconds_t)((d - tv.tv_sec) / (double)1000000.0); - } - */ - - // accessors - time_t sec() const { return tv.tv_sec; } - long usec() const { return tv.tv_usec; } - int nsec() const { return tv.tv_usec*1000; } - - // ref accessors/modifiers - time_t& sec_ref() { return tv.tv_sec; } - // FIXME: tv.tv_usec is a __darwin_suseconds_t on Darwin. - // is just casting it to long& OK? 
- long& usec_ref() { return (long&) tv.tv_usec; } - - struct timeval& tv_ref() { return tv; } - - // cast to double - operator double() { - return (double)sec() + ((double)usec() / 1000000.0L); - } -}; - -// arithmetic operators -inline utime_t operator+(const utime_t& l, const utime_t& r) { - return utime_t( l.sec() + r.sec() + (l.usec()+r.usec())/1000000L, - (l.usec()+r.usec())%1000000L ); -} -inline utime_t& operator+=(utime_t& l, const utime_t& r) { - l.sec_ref() += r.sec() + (l.usec()+r.usec())/1000000L; - l.usec_ref() += r.usec(); - l.usec_ref() %= 1000000L; - return l; -} -inline utime_t& operator+=(utime_t& l, double f) { - double fs = trunc(f); - double us = (f - fs) * (double)1000000.0; - l.sec_ref() += (long)fs; - l.usec_ref() += (long)us; - l.normalize(); - return l; -} - -inline utime_t operator-(const utime_t& l, const utime_t& r) { - return utime_t( l.sec() - r.sec() - (l.usec()= r.usec()) - l.usec_ref() -= r.usec(); - else { - l.usec_ref() += 1000000L - r.usec(); - l.sec_ref()--; - } - return l; -} -inline utime_t& operator-=(utime_t& l, double f) { - l += -f; - return l; -} - -inline bool operator>(const utime_t& a, const utime_t& b) -{ - return (a.sec() > b.sec()) || (a.sec() == b.sec() && a.usec() > b.usec()); -} -inline bool operator<(const utime_t& a, const utime_t& b) -{ - return (a.sec() < b.sec()) || (a.sec() == b.sec() && a.usec() < b.usec()); -} - -// ostream -inline std::ostream& operator<<(std::ostream& out, const utime_t& t) -{ - out.setf(std::ios::right); - out.fill('0'); - if (t.sec() < ((time_t)(60*60*24*365*10))) { - // raw seconds. this looks like a relative time. - out << (long)t.sec(); - } else { - // localtime. this looks like an absolute time. - struct tm bdt; - time_t tt = t.sec(); - localtime_r(&tt, &bdt); - out << std::setw(2) << (bdt.tm_year-100) // 2007 -> '07' - << std::setw(2) << (bdt.tm_mon+1) - << std::setw(2) << bdt.tm_mday - << "." 
- << std::setw(2) << bdt.tm_hour - << std::setw(2) << bdt.tm_min - << std::setw(2) << bdt.tm_sec; - } - out << "."; - out << std::setw(6) << t.usec(); - out.unsetf(std::ios::right); - return out; -} - -#endif diff --git a/branches/sage/mds/include/xlist.h b/branches/sage/mds/include/xlist.h deleted file mode 100644 index 2ea2cbec6c815..0000000000000 --- a/branches/sage/mds/include/xlist.h +++ /dev/null @@ -1,123 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __XLIST_H -#define __XLIST_H - -template -class xlist { -public: - struct item { - T _item; - item *_prev, *_next; - xlist *_head; - - item(T i) : _item(i), _prev(0), _next(0), _head(0) {} - ~item() { - remove_myself(); - } - - xlist* get_xlist() { return _head; } - void remove_myself() { - if (_head) - _head->remove(this); - assert(_head == 0); - } - }; - -private: - item *_front, *_back; - int _size; - -public: - xlist() : _front(0), _back(0), _size(0) {} - ~xlist() { - assert(_size == 0); - assert(_front == 0); - assert(_back == 0); - } - - int size() { return _size; } - bool empty() { - assert((bool)_front == (bool)_size); - return _front == 0; - } - - void clear() { - while (_front) remove(_front); - } - - void push_back(item *item) { - if (item->_head) - item->_head->remove(item); - - item->_head = this; - item->_next = 0; - item->_prev = _back; - if (_back) - _back->_next = item; - else - _front = item; - _back = item; - _size++; - } - void remove(item *item) { - assert(item->_head == this); - - if (item->_prev) - item->_prev->_next = item->_next; - else - _front = item->_next; - if (item->_next) - item->_next->_prev = 
item->_prev; - else - _back = item->_prev; - _size--; - - item->_head = 0; - item->_next = item->_prev = 0; - } - - T front() { return (T)_front->_item; } - T back() { return (T)_back->_item; } - - void pop_front() { - assert(!empty()); - remove(_front); - } - void pop_back() { - assert(!empty()); - remove(_back); - } - - class iterator { - private: - item *cur; - public: - iterator(item *i = 0) : cur(i) {} - T operator*() { return (T)cur->_item; } - iterator& operator++() { - assert(cur); - cur = cur->_next; - return *this; - } - bool end() { return cur == 0; } - }; - - iterator begin() { return iterator(_front); } - iterator end() { return iterator(NULL); } -}; - - -#endif diff --git a/branches/sage/mds/jobs/alc.tp b/branches/sage/mds/jobs/alc.tp deleted file mode 100644 index c600850c54be0..0000000000000 --- a/branches/sage/mds/jobs/alc.tp +++ /dev/null @@ -1,38 +0,0 @@ -#PSUB -s /bin/bash # Sets your shell in batch -#PSUB -c alc # Where to run the job - -#PSUB -eo # Send std error & std out to the same file - -#PSUB -ln $NUM # Number of nodes to use -#PSUB -g $NUM # Total Number of tasks to use -#PSUB -cpn 1 # cpus per node - -####PSUB -c 1024Mb # memory limit -#PSUB -lc 1500 # Core file size per process -#PSUB -nr # Do not automatically resubmit job -#PSUB -tM 20m # Select time limit. The default time limit - # is only 30 minutes! Time can be HH:MM:SS or HH:MM - -#PSUB -o $CWD/$OUT # filename for output - -# Put your commands here. Remember to 'cd' to the appropriate -# directory, because the job will initially be in your home directory. -# To run a parallel job, you need to use the srun. 
- - - -echo job $PSUB_JOBID nodes $NUM name $NAME - -# environment -cd $CWD -export LD_LIBRARY_PATH=/usr/lib/mpi/mpi_gnu/lib - -# create fakestore dirs -srun -l -N $NUM -ppbatch bash -c "test -d tmp/osddata || mkdir tmp/osddata || echo cant make osddata ; uptime" - -# go -srun -l -N $NUM -ppbatch $CMD && touch $DONE - -# clean up fakestore -srun -l -N $NUM -ppbatch bash -c 'uptime ; rm -r tmp/osddata/*' - diff --git a/branches/sage/mds/jobs/alcdat/makedirs b/branches/sage/mds/jobs/alcdat/makedirs deleted file mode 100644 index af5a098a254c9..0000000000000 --- a/branches/sage/mds/jobs/alcdat/makedirs +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192], - 'nummds' => [1, 2, 8, 16, 32, 48, 64, 80, 96, 112, 128],#144, 160, 192, 208], - - 'cper' => [15,20], - '_dep' => [ 'cnode' => '$nummds',# / 4 + 1', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds > 1 ? $nummds:2', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'mds_bal_rep' => 10000, # none of that! - 'mds_decay_halflife' => 30, - - 'mds_bal_interval' => 45, - 'mds_bal_max' => [2], - - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 100, - 'end' => 300, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 4, - - # --meta_log_layout_scount 32 --meta_log_layout_ssize 256 - # --osd_pg_layout linear - 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/mds/jobs/alcdat/makedirs.big b/branches/sage/mds/jobs/alcdat/makedirs.big deleted file mode 100644 index c67b2b93dd742..0000000000000 --- a/branches/sage/mds/jobs/alcdat/makedirs.big +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192], - 'nummds' => [160, 200],#[1, 2, 8, 16, 32, 48, 64, 80, 96, 112, 128],#144, 160, 192, 208], - - 'cper' => [15,20], - '_dep' => [ 'cnode' => '40',#$nummds',# / 4 + 1', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds * .8', - 'n' => '415'],#1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'mds_bal_rep' => 10000, # none of that! - 'mds_decay_halflife' => 30, - - 'mds_bal_interval' => 45, - 'mds_bal_max' => 2, - - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 100, - 'end' => 300, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 4, - - # --meta_log_layout_scount 32 --meta_log_layout_ssize 256 - # --osd_pg_layout linear - 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/mds/jobs/alcdat/makedirs.tput b/branches/sage/mds/jobs/alcdat/makedirs.tput deleted file mode 100644 index 8dd5ae4c47d8c..0000000000000 --- a/branches/sage/mds/jobs/alcdat/makedirs.tput +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192], - 'nummds' => [4, 16, 64],#[1, 16, 64, 128],#144, 160, 192, 208], - - #'cper' => [2, 5, 7, 10, 13, 16, 20, 30, 40, 50, 100, 150], - 'cper' => [13, 30, 40], # just for final run... - '_dep' => [ 'cnode' => '$nummds',# / 4 + 1', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'mds_bal_rep' => 10000, # none of that! - 'mds_decay_halflife' => 30, - - 'mds_bal_interval' => 45, - 'mds_bal_max' => 2, - - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 100, - 'end' => 300, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 4, - - # --meta_log_layout_scount 32 --meta_log_layout_ssize 256 - # --osd_pg_layout linear - 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - 'comb' => { - 'x' => 'cper',#nummds', - 'vars' => [ 'mds.req', 'cl.lat' ] - } -}; diff --git a/branches/sage/mds/jobs/alcdat/makefiles.shared b/branches/sage/mds/jobs/alcdat/makefiles.shared deleted file mode 100644 index ab96702c73289..0000000000000 --- a/branches/sage/mds/jobs/alcdat/makefiles.shared +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => [1, 2, 4, 8, 16, 32, 64, 96, 128], #2, 4, 8, 16, 32, 48, 64, 80, 96], - - 'cper' => [25, 50, 100, 150],# 100, 150, 200], - - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - - 'mds_bal_hash_wr' => 1000, - - 'until' => 180, # --syn until $n ... when to stop clients - 'kill_after' => 250, - 'start' => 30, - 'end' => 180, - - 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60 --syn makefiles 100000 1000 0', - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req', 'cl.lat' ] - } -}; diff --git a/branches/sage/mds/jobs/alcdat/openshared b/branches/sage/mds/jobs/alcdat/openshared deleted file mode 100644 index 5ed7ba95894b3..0000000000000 --- a/branches/sage/mds/jobs/alcdat/openshared +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => [1, 4, 16, 64, 128, 192 ], - - 'cper' => [10, 50, 100, 150], - '_dep' => [ 'cnode' => '$nummds',# > 30 ? 
30:$nummds', - 'numclient' => '$nummds*$cper', - 'numosd' => '$nummds > 30 ? 30:$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - - 'mds_bal_interval' => 10000, - 'mds_bal_hash_wr' => 1000, - - 'until' => 120, # --syn until $n ... when to stop clients - 'kill_after' => 180, - 'start' => 10, - 'end' => 120, - - 'custom' => '--tcp_skip_rank0 --debug_mds_balancer 10 --mds_shutdown_check 60 --syn only 0 --syn createshared 10 --syn sleep 5 --syn openshared 10 10000', - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req', 'cl.lat' ] - } -}; diff --git a/branches/sage/mds/jobs/alcdat/ossh.include b/branches/sage/mds/jobs/alcdat/ossh.include deleted file mode 100644 index c9a368ba5c60f..0000000000000 --- a/branches/sage/mds/jobs/alcdat/ossh.include +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 10, - - #'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - #'nummds' => [1, 2, 4, 6, 7], # googoo - 'nummds' => [ 2, 4, 8, 16, 32, 48, 64, 80, 96, 128 ], - - #'trace' => ['make.lib', 'make.include'], - - 'mds_bal_interval' => 45, - 'mds_bal_max' => 2,#6, #[ 2,4,6 ], - 'mds_decay_halflife' => 30, - 'mds_bal_rep' => 1500, - 'mds_bal_hash_rd' => 100000, - - 'cper' => [15, 20],#25, 50, 100], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ], - #'cper' => 125, #[30, 50, 75, 100, 125, 150], #50, #[10,50,100],# [ 50, 75, 100 ], - - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - 'custom' => '--tcp_skip_rank0 --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.include 1 --syn sleep 30 --syn trace traces/openssh/make.include 1000', - - # parameters - 'fs' => 'ebofs', - - #'until' => 500, - #'kill_after' => 600, - #'start' => 200, - #'end' => 500, - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 200, - 'end' => 300, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/mds/jobs/alcdat/ossh.include.big b/branches/sage/mds/jobs/alcdat/ossh.include.big deleted file mode 100644 index b92895a53a763..0000000000000 --- a/branches/sage/mds/jobs/alcdat/ossh.include.big +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 10, - - #'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - #'nummds' => [1, 2, 4, 6, 7], # googoo - #'nummds' => [ 2, 4, 8, 16, 32, 48, 64, 80, 96, 128 ], - 'nummds' => [160,200], - - #'trace' => ['make.lib', 'make.include'], - - 'mds_bal_interval' => 45, - 'mds_bal_max' => 2,#6, #[ 2,4,6 ], - 'mds_decay_halflife' => 30, - 'mds_bal_rep' => 1500, - 'mds_bal_hash_rd' => 100000, - - 'cper' => [25, 50], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ], - #'cper' => 125, #[30, 50, 75, 100, 125, 150], #50, #[10,50,100],# [ 50, 75, 100 ], - - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds * .6', - 'n' => '415'],#1 + $cnode + $nummds + $numosd' ], - - 'custom' => '--tcp_skip_rank0 --tcp_overlay_clients --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.include 1 --syn sleep 30 --syn trace traces/openssh/make.include 1000', - - # parameters - 'fs' => 'ebofs', - - #'until' => 500, - #'kill_after' => 600, - #'start' => 200, - #'end' => 500, - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 200, - 'end' => 300, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/mds/jobs/alcdat/ossh.lib b/branches/sage/mds/jobs/alcdat/ossh.lib deleted file mode 100644 index 73372866f051f..0000000000000 --- a/branches/sage/mds/jobs/alcdat/ossh.lib +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 10, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - 'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - - #'nummds' => [1, 2, 4, 6, 7], # googoo - #'trace' => ['make.lib', 'make.include'], - - 'mds_bal_interval' => 90, #[30, 60, 90], #$[60,90], #[60,90],#[30, 60, 90], - #'mds_bal_max' => [4, 10],#6,#[2,4,6,8], - - 'mds_decay_halflife' => 30, - 'mds_bal_rep' => 1500, - 'cper' => [10, 16], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ], - - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - - 'custom' => '--tcp_skip_rank0 --debug_mds_balancer 1 --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.lib 1 --syn sleep 10 --syn trace traces/openssh/make.lib 1000', - - # parameters - #'fs' => ['fakestore'], - 'fs' => 'ebofs', - - #'until' => 500, - #'kill_after' => 600, - #'start' => 200, - #'end' => 500, - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 150, - 'end' => 300, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/mds/jobs/alcdat/ossh.lib.big b/branches/sage/mds/jobs/alcdat/ossh.lib.big deleted file mode 100644 index b9e0dd1ff68cd..0000000000000 --- a/branches/sage/mds/jobs/alcdat/ossh.lib.big +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 10, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - #'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - 'nummds' => [160,200], - - #'nummds' => [1, 2, 4, 6, 7], # googoo - #'trace' => ['make.lib', 'make.include'], - - 'mds_bal_interval' => 90, #[30, 60, 90], #$[60,90], #[60,90],#[30, 60, 90], - #'mds_bal_max' => [4, 10],#6,#[2,4,6,8], - - 'mds_decay_halflife' => 30, - 'mds_bal_rep' => 1500, - 'cper' => [25, 50, 100], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ], - - '_dep' => [ 'cnode' => 0,#'30', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds * .6', - 'n' => '415'],#'1 + $cnode + $nummds + $numosd' ], - - - 'custom' => '--tcp_skip_rank0 --debug_mds_balancer 1 --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.lib 1 --syn sleep 10 --syn trace traces/openssh/make.lib 1000', - - # parameters - #'fs' => ['fakestore'], - 'fs' => 'ebofs', - - #'until' => 500, - #'kill_after' => 600, - #'start' => 200, - #'end' => 500, - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 150, - 'end' => 300, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/mds/jobs/alcdat/striping b/branches/sage/mds/jobs/alcdat/striping deleted file mode 100644 index de71828d12bde..0000000000000 --- a/branches/sage/mds/jobs/alcdat/striping +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => 10, - - 'cnode' => 10, - 'cper' => [ 10, 25, 50, 100 ], - - '_dep' => [ 'numclient' => '$cper * $cnode', - 'n' => '1 + $cnode + $nummds + $numosd', - 'file_layout_osize' => '$writefile_size' ], - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'until' => 160, # --syn until $n ... when to stop clients - 'kill_after' => 200, - 'start' => 100, - 'end' => 160, - - 'writefile' => 1, - 'writefile_size' => [ -# 4*1024*1024, - 1024*1024 ], -# 256*1024, -# 64*1024 - 'writefile_mb' => 100000, - - 'osd_pg_bits' => 10,#16, - #'osd_pg_bits' => [ 16, 20 ], - - #'osd_object_layout' => [ 'hash', 'hashino', 'linear' ], - 'osd_pg_layout' => [ 'crush', -# 'hash', - 'linear' ], - - 'custom' => '--tcp_skip_rank0 --file_layout_num_rep 1 --mds_shutdown_check 60', - - 'comb' => { - 'x' => 'cper',#writefile_size', - 'vars' => [ 'osd.c_wrb', 'osd.c_wr' ], - } -}; diff --git a/branches/sage/mds/jobs/example b/branches/sage/mds/jobs/example deleted file mode 100644 index 802a8b66e6332..0000000000000 --- a/branches/sage/mds/jobs/example +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/perl -# hi there -{ - # startup - 'n' => 30, # number of mpi nodes - 'sleep' => 3, # seconds to sleep between runs (so you have time to control-c out) - 'nummds' => 1, - 'numosd' => 6, - 'numclient' => 100, - - 'until' => 100, # --syn until $n ... synthetic client will stop itself after this many seconds. 
- 'kill_after' => 300, # seconds before everything commits suicide (in case something hangs) - - # stuff i want to vary - # here's a simple example: - - # do --syn writefile command - 'writefile' => 1, - # and very the write size - 'writefile_size' => [ # vary -# 2048*1024, - 1024*1024, - 512*1024, - 256*1024, - 128*1024, - 64*1024, - 48*1024, - 32*1024, - 28*1024, - 24*1024, - 16*1024, - 12*1024, - 8*1024, - 4096, -# 256, -# 16, -# 1 - ], - 'writefile_mb' => 1000, # each client shoudl write 1GB (or more likely, keep going until time runs out) - - 'file_layout_num_rep'=> [1,2], # also vary the replication level - - # pass some other random things to newsyn - 'custom' => '--', - - # for final summation (script/sum.pl) - # specify time period to look at the results - 'start' => 30, # skip first 30 seconds, so that caches are full etc. - 'end' => 90, # go for 60 seconds - - # what should i parse/plot? - 'comb' => { - 'x' => 'writefile_size', - 'vars' => [ 'osd.c_wrb', 'osd.r_wrb' ], - } -}; diff --git a/branches/sage/mds/jobs/mds/log_striping b/branches/sage/mds/jobs/mds/log_striping deleted file mode 100644 index 46242cdda4f00..0000000000000 --- a/branches/sage/mds/jobs/mds/log_striping +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - 'kill_after' => 300, - - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => 100, - 'n' => 16, - - # parameters - 'fs' => ['ebofs','fakestore'], - 'meta_log_ssize' => [ 128, 256, 1024, 1 << 15, 1 << 20 ], - 'meta_log_scount' => 4,#[ 1, 2, 4, 8 ], - - 'until' => 200, # --syn until $n ... 
when to stop clients - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 4, - - 'custom' => '--tcp_skip_rank0', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - 'start' => 100, - 'end' => 550, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/mds/jobs/mds/makedir_lat b/branches/sage/mds/jobs/mds/makedir_lat deleted file mode 100644 index 63374f52a36c0..0000000000000 --- a/branches/sage/mds/jobs/mds/makedir_lat +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => [1],#, 40, 80, 160 ], - 'n' => 20, - - 'fs' => 'ebofs', - - 'start' => 20, - 'end' => 40, - 'until' => 40, - 'kill_after' => 60, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 5, - - 'mds_local_osd' => [ 0, 1 ], - 'meta_log_layout_num_rep' => [ 0, 1, 2, 3, 4], - - 'custom' => '--tcp_skip_rank0', - - 'comb' => { - 'x' => 'meta_log_layout_num_rep', - 'vars' => [ 'mds.log.lat', 'cl.lat', 'osd.rlat' ] - } -}; diff --git a/branches/sage/mds/jobs/mds/makedirs b/branches/sage/mds/jobs/mds/makedirs deleted file mode 100644 index 4ca42d72fa37e..0000000000000 --- a/branches/sage/mds/jobs/mds/makedirs +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - '_psub' => 'jobs/alc.tp', - - 'sleep' => 3, - - 'nummds' => [1, 2, 4, 6, 8, 12, 16, 24, 32, 40, 48, 64], - - 'cper' => 50, - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$cnode * $cper', - 'numosd' => '$nummds * 2', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - #'fs' => 'ebofs', - 'fs' => 'fakestore', - - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 3, - - 'custom' => '--tcp_skip_rank0', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - 'start' => 100, - 'end' => 550, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/mds/jobs/mds/opensshlib b/branches/sage/mds/jobs/mds/opensshlib deleted file mode 100644 index d8b61ae52c655..0000000000000 --- a/branches/sage/mds/jobs/mds/opensshlib +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => [1, 2, 4, 7], # googoo - #'nummds' => [1, 2, 4, 6, 8, 12, 16, 24, 32, 40, 48, 64], # alc - - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'until' => 300, # --syn until $n ... when to stop clients - 'kill_after' => 400, - 'start' => 150, - 'end' => 300, - - 'mds_bal_interval' => 90,#[60, 90], - #'mds_bal_max' => [3,4,5], - 'mds_bal_max' => 4, - 'mds_decay_halflife' => 30,#[15, 25, 30, 45, 60], - 'mds_bal_rep' => 1500,#[1000, 1500, 2000], - - 'decay_hl' => 100,#[ 25, 50, 100, 150 ], - - 'cper' => 100, #[50, 75, 100, 125, 150, 200], - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds * 2', - 'n' => '1 + $cnode + $nummds + $numosd', - 'mds_bal_rep' => '$mds_decay_halflife * $decay_hl'], - - 'custom' => '--tcp_skip_rank0 --syn only 0 --syn trace traces/openssh/untar.lib 1 --syn sleep 10 --syn randomsleep 30 --syn trace traces/openssh/make.lib 100 --debug_mds_balancer 1 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - - 'comb' => { - 'x' => 'nummds',#decay_hl',#'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/mds/jobs/meta1 b/branches/sage/mds/jobs/meta1 deleted file mode 100644 index 743212f1c3009..0000000000000 
--- a/branches/sage/mds/jobs/meta1 +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/sh - -# makedirs for 300 seconds -# first bit in memory -# second bit is commiting from journal too -# then walk fs for 300 seconds -# this should all be in memory. - -JOB="meta1" -ARGS="--numosd 10 --fullmkfs --syn until 180 --syn makedirs 10 10 4 --syn until 360 --syn repeatwalk --mds_bal_max 1 --osd_fsync 0 --mds_log_max_len 200000 --mds_cache_size 500000" - -#rm core* ; make tcpsyn && mpiexec -l -n 17 ./tcpsyn $ARGS --nummds 1 --log_name $JOB/1 --numclient 25 > log/$JOB/o.1 -#rm core* ; make tcpsyn && mpiexec -l -n 18 ./tcpsyn $ARGS --nummds 2 --log_name $JOB/2 --numclient 50 > log/$JOB/o.2 -#rm core* ; make tcpsyn && mpiexec -l -n 20 ./tcpsyn $ARGS --nummds 4 --log_name $JOB/4 --numclient 100 > log/$JOB/o.4 -#rm core* ; make tcpsyn && mpiexec -l -n 24 ./tcpsyn $ARGS --nummds 8 --log_name $JOB/8 --numclient 200 > log/$JOB/o.8 -#rm core* ; make tcpsyn && mpiexec -l -n 28 ./tcpsyn $ARGS --nummds 12 --log_name $JOB/12 --numclient 300 > log/$JOB/o.12 -rm core* ; make tcpsyn && mpiexec -l -n 32 ./tcpsyn $ARGS --nummds 16 --log_name $JOB/16 --numclient 300 > log/$JOB/o.16 - - diff --git a/branches/sage/mds/jobs/meta1.proc.sh b/branches/sage/mds/jobs/meta1.proc.sh deleted file mode 100755 index 616acbefff619..0000000000000 --- a/branches/sage/mds/jobs/meta1.proc.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/sh - -for d in 1 2 4 8 12 -do - echo $d - cd $d - ../../../script/sum.pl mds? mds?? > mds.sum - ../../../script/sum.pl -avg mds? mds?? > mds.avg - - ../../../script/sum.pl -start 90 -end 180 mds? mds?? > mds.sum.makedirs - ../../../script/sum.pl -start 200 -end 300 mds? mds?? > mds.sum.walk - - cd .. 
-done diff --git a/branches/sage/mds/jobs/osd/ebofs b/branches/sage/mds/jobs/osd/ebofs deleted file mode 100644 index 5d11523f6f832..0000000000000 --- a/branches/sage/mds/jobs/osd/ebofs +++ /dev/null @@ -1,51 +0,0 @@ -# hi there -{ - # startup - 'n' => 30, # mpi nodes - 'sleep' => 3, # seconds between runs - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => 100,#[10, 50, 100, 200, 400], - -'kill_after' => 200, - - # parameters - 'fs' => 'ebofs',#[ -# 'obfs', -# 'fakestore', -# 'ebofs' -# ], - 'until' => 100, # --syn until $n ... when to stop clients - 'writefile' => 1, - 'writefile_size' => [ -# 2560000, - 1024000, - 262144, -# 131072, -# 98304, - 65536, -# 16384, -# 4096, - 256, -# 16, -# 1 - ], - 'writefile_mb' => 1000, - - 'ebofs_idle_commit_ms' => [ 100, 500 ], - 'ebofs_abp_max_alloc' => [ 4096*16, 4096*64 ], - -# 'custom' => '--tcp_skip_rank0',# --osd_maxthreads 0', - 'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - 'start' => 30, - 'end' => 90, - -'comb' => { - 'x' => 'writefile_size', - 'vars' => [ 'osd.c_wrb' ], -# 'maptitle' => { 'osd_object_layout=' => '', -# ',osd_pg_layout=' => ' + '} - } -}; diff --git a/branches/sage/mds/jobs/osd/mds_log b/branches/sage/mds/jobs/osd/mds_log deleted file mode 100644 index 0f99f6998dcfc..0000000000000 --- a/branches/sage/mds/jobs/osd/mds_log +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - #'_psub' => 'jobs/alc.tp', - 'sleep' => 3, - - 'nummds' => 1, - 'numclient' => [5, 10, 15, 25, 50, 100, 200, 300, 400], - #'numclient' => [ 50, 100, 200 ], - 'numosd' => [2,4],#[ 4, 8, 12, 16, 20, 24 ], - 'n' => 12, - - # parameters - 'fs' => 'fakestore',#['ebofs', 'fakestore','obfs'], - #'fs' => 'ebofs', - #'ebofs_commit_ms' => [ 1000, 5000 ], - #'osd_maxthreads' => [ 0, 1, 2, 4, 8 ], - - 'until' => 100, # --syn until $n ... 
when to stop clients - 'kill_after' => 300, - 'start' => 20, - 'end' => 90, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 3, - - - #'meta_log_layout_ssize' => [256, 512, 1024, 4096, 16384, 65536, 262400], - #'meta_log_layout_scount' => [2, 4, 8], - #'meta_log_layout_num_rep' => [1, 2], - #'meta_log_layout_num_rep' => 1, - - 'custom' => '--tcp_skip_rank0 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - 'comb' => { - 'x' => 'numclient',#'meta_log_layout_ssize', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/branches/sage/mds/jobs/osd/osd_threads b/branches/sage/mds/jobs/osd/osd_threads deleted file mode 100644 index ef271f9e88710..0000000000000 --- a/branches/sage/mds/jobs/osd/osd_threads +++ /dev/null @@ -1,33 +0,0 @@ -# hi there -{ - # startup - 'n' => 30, # mpi nodes - 'sleep' => 10, # seconds between runs - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => 50, - - # parameters - 'fs' => [ -# 'obfs', - 'fakestore', - 'ebofs' - ], - 'until' => 100, # --syn until $n ... 
when to stop clients - 'writefile' => 1, - 'writefile_size' => [ - 1024000, - 131072, - 65536, - 16 - ], - 'writefile_mb' => 1000, - - 'osd_maxthreads' => [0, 1, 2, 4, 8], - - 'custom' => '--tcp_skip_rank0', - - # for final summation (script/sum.pl) - 'start' => 30, - 'end' => 90 -}; diff --git a/branches/sage/mds/jobs/osd/striping b/branches/sage/mds/jobs/osd/striping deleted file mode 100644 index ea8cabe643274..0000000000000 --- a/branches/sage/mds/jobs/osd/striping +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/perl -# hi there -{ - # startup - #'n' => 28, # mpi nodes - - 'sleep' => 3, # seconds between runs - 'nummds' => 1, - - 'numosd' => [2,3,4,5,6,7,8,10,12], #[6, 8, 10, 12, 16], - 'numosd' => [14], - #'cper' => [4, 5, 6, 7, 8, 9, 10, 11, 12], #[1, 4, 6, 8, 16, 32, 64], - #'cper' => [4, 6, 8, 10, 12, 16, 24, 32 ], #[1, 4, 6, 8, 16, 32, 64], - 'cper' => [30], - - '_dep' => [ 'cnode' => '$numosd', - 'numclient' => '$cnode * $cper', - 'n' => 38],#'$nummds + $numosd + $cnode'], - #'numclient' => [5, 10, 20, 50, 75, 100, 150 ], - - 'start' => 30, - 'end' => 90, - 'until' => 100, # --syn until $n ... 
when to stop clients - 'kill_after' => 260, - - # parameters - 'fs' => 'ebofs', - 'writefile' => 1, - - 'writefile_size' => [# 4096, - # 16*1024, - # 64*1024, - # 256*1024, - 1024*1024 ], -# 'writefile_size' => [ -# 2048*1024, -# 1048576, -# 512*1024, -# 262144, -# 65536, -# 16384 -# ], - 'writefile_mb' => 1000, - - 'file_layout_num_rep'=> [1,2,3], - - 'osd_pg_bits' => 12,#[6, 8, 10, 12, 14], - - 'osd_object_layout' => [ 'hashino' ],#'hash', 'hashino', 'linear' ], - 'osd_pg_layout' => [ 'crush', 'linear' ],#, 'linear'],#, 'hash' ],#, 'linear' ],#, 'hash' ], - - #'custom' => '--tcp_skip_rank0', # --osd_maxthreads 0', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - - 'comb' => { - 'x' => 'numosd',#'writefile_size', - 'vars' => [ 'osd.c_wrb', 'cl.wrlat' ], -# 'maptitle' => { 'osd_object_layout=' => '', -# ',osd_pg_layout=' => ' + '} - } -}; - - -=item some googoo notes - -for 1mb 1x writes, - - with numosd=6, min cper=6 to saturate (cper_saturate) - googoo saturates at numosd=8. (osd_saturate) - - -> so, numosd=6 or 7 is a safe size! 
- - - - -=cut diff --git a/branches/sage/mds/jobs/osd/wr_lat2 b/branches/sage/mds/jobs/osd/wr_lat2 deleted file mode 100644 index 47053dd61f3ab..0000000000000 --- a/branches/sage/mds/jobs/osd/wr_lat2 +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => [12], - 'numclient' => [1],#, 40, 80, 160 ], - 'n' => 16, - - 'fs' => 'ebofs', - - 'start' => 10, - 'end' => 40, - 'until' => 40, - 'kill_after' => 90, - - 'writefile' => 1, - 'writefile_size' => [4096, - 8*1024, - 16*1024, - 32*1024, - 64*1024, - 128*1024, - 256*1024, - 512*1024, - 1024*1024], - 'writefile_mb' => 10000, - - #'tcp_multi_out' => [0,1], - -# 'mds_local_osd' => [ 0, 1 ], - 'file_layout_num_rep' => [1,2,3],#, 2, 3, 4], - - 'client_oc' => [0,1], - - 'custom' => '--tcp_skip_rank0', - - 'comb' => { - 'x' => 'writefile_size',#'file_layout_num_rep', - 'vars' => [ 'osd.c_wrb','cl.wrlat' ] - } -}; diff --git a/branches/sage/mds/jobs/osd/write_sizes b/branches/sage/mds/jobs/osd/write_sizes deleted file mode 100644 index 57369f3a97c50..0000000000000 --- a/branches/sage/mds/jobs/osd/write_sizes +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/perl -# hi there -{ - # startup - 'n' => 30, # mpi nodes - 'sleep' => 3, # seconds between runs - 'nummds' => 1, - 'numosd' => 6, - 'numclient' => 100,#[25,50,100,300],#100,#[10, 50, 100, 200, 400], - - 'until' => 100, # --syn until $n ... 
when to stop clients - 'kill_after' => 300, - - # parameters - 'fs' => [ -# 'obfs', - 'fakestore', -# 'ebofs' - ], - 'writefile' => 1, - 'writefile_size' => [ -# 2048*1024, - 1024*1024, - 512*1024, - 256*1024, - 128*1024, - 64*1024, - 48*1024, - 32*1024, - 28*1024, - 24*1024, - 16*1024, - 12*1024, - 8*1024, - 4096, -# 256, -# 16, -# 1 - ], - 'writefile_mb' => 1000, - - 'file_layout_num_rep'=> 1,#[1,2], - - -# 'ebofs_idle_commit_ms' => [ 100, 500 ], -# 'ebofs_abp_max_alloc' => [ 4096*16, 4096*64 ], - - 'custom' => '--debug_after 110 --debug_mds 15 --debug 5 --mds_shutdown_check 60', - - # for final summation (script/sum.pl) - 'start' => 30, - 'end' => 90, - - 'comb' => { - 'x' => 'writefile_size', - 'vars' => [ 'osd.c_wrb' ], -# 'maptitle' => { 'osd_object_layout=' => '', -# ',osd_pg_layout=' => ' + '} - } -}; diff --git a/branches/sage/mds/jobs/rados/map_dist b/branches/sage/mds/jobs/rados/map_dist deleted file mode 100644 index 39f16daa1cdc2..0000000000000 --- a/branches/sage/mds/jobs/rados/map_dist +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'osdbits' => [6,7,8],#,9],10,11], - 'pgperbits' => [3],#,4,5],#[4,6,8], - - 'nummds' => 1, - - '_dep' => [ 'numosd' => '1 << $osdbits', - 'osd_pg_bits' => '$pgperbits + $osdbits', - 'n' => '3 + $numosd / 32'], - 'numclient' => 0, - - 'fake_osdmap_updates' => [30], - - 'fs' => 'ebofs', - - 'start' => 30, - 'end' => 300, - 'kill_after' => 300, - - 'custom' => '--bdev_lock 0 --ms_stripe_osds --osd_maxthreads 0', - #'custom' => '--tcp_skip_rank0', - - 'comb' => { - 'x' => 'osdbits', - 'vars' => [ 'osd.sum=mapi', 'osd.sum=mapidup', 'osd.numpg', 'osd.pingset' ] - } -}; diff --git a/branches/sage/mds/jobs/rados/rep_lat b/branches/sage/mds/jobs/rados/rep_lat deleted file mode 100644 index 3f5ab0c8a7d87..0000000000000 --- a/branches/sage/mds/jobs/rados/rep_lat +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => 8, #[6], - 'numclient' 
=> 1,#, 40, 80, 160 ], - 'n' => 10, - - 'fs' => 'ebofs', - - 'start' => 10, - 'end' => 40, - 'until' => 40, - 'kill_after' => 45, - - 'writefile' => 1, - 'writefile_size' => [4096, -# 8*1024, -# 16*1024, -# 32*1024, - 64*1024, -# 128*1024, -# 256*1024, -# 512*1024, -# 1024*1024 -], - 'writefile_mb' => 10000, - - 'osd_rep' => [0,1,2], - - 'file_layout_num_rep' => [1,2,3,4,5,6],#, 2, 3, 4], - - 'osd_pg_bits' => 4, - 'custom' => '--osd_max_rep 8', - - 'comb' => { - 'x' => 'file_layout_num_rep', - 'vars' => [ 'cl.wrlat' ] - } -}; diff --git a/branches/sage/mds/jobs/rados/wr_sizes b/branches/sage/mds/jobs/rados/wr_sizes deleted file mode 100644 index 9b73477ea6142..0000000000000 --- a/branches/sage/mds/jobs/rados/wr_sizes +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => [8],#10,14,16], - 'numclient' => [10*16], - 'n' => 15, - - 'fs' => 'ebofs', - - 'start' => 60, - 'end' => 90, - 'until' => 90, - 'kill_after' => 190, - - 'writefile' => 1, - 'writefile_size' => [4096, - 8*1024, - 16*1024, - 32*1024, - 64*1024, - 128*1024, - 256*1024, - # 512*1024, -# 4*1024*1024, -# 2*1024*1024, -# 1024*1024 -], - 'writefile_mb' => 10000, - - 'file_layout_num_rep' => 1, - 'file_layout_ssize' => 4*1024*1024, - 'file_layout_osize' => 4*1024*1024, - - 'osd_pg_bits' => 12, - -# 'ebofs_freelist' => [0, 1080, 65400], - - 'custom' => '--objecter_buffer_uncommitted 0', - - #'custom' => '--tcp_skip_rank0', - - 'comb' => { - 'x' => 'writefile_size', - 'vars' => [ 'osd.c_wrb', 'cl.wrlat' ] - } -}; diff --git a/branches/sage/mds/jobs/runjobsample b/branches/sage/mds/jobs/runjobsample deleted file mode 100644 index 590be207771b2..0000000000000 --- a/branches/sage/mds/jobs/runjobsample +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - '_sleep' => 3, - - 'nummds' => 1, - 'numosd' => 16, #[8],#10,14,16], - 'numclient' => 32,#,4,10,20,40], #[10*16], - '_n' => 32, - - '_start' => 15, - '_end' => 45, - '_kill_after' => 190, - 
- 'osd_pg_bits' => [4, 6], - 'osd_auto_weight' => [0,1], - 'file_layout_pg_size' => [1,2], - - '_custom' => '--syn createobjects 1000000 1048576 2', - - '_comb' => { - 'x' => 'osd_pg_bits', - 'vars' => [ 'osd.c_wrb' ] - } -}; diff --git a/branches/sage/mds/kernel/Makefile b/branches/sage/mds/kernel/Makefile deleted file mode 100644 index 2ad658b5566c3..0000000000000 --- a/branches/sage/mds/kernel/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -# -# Makefile for CEPH filesystem. -# - -obj-$(CONFIG_CEPH_FS) += ceph.o - -ceph-objs := inode.o diff --git a/branches/sage/mds/kernel/bufferlist.h b/branches/sage/mds/kernel/bufferlist.h deleted file mode 100644 index 78e4c6f95216b..0000000000000 --- a/branches/sage/mds/kernel/bufferlist.h +++ /dev/null @@ -1,74 +0,0 @@ -#ifndef _FS_CEPH_BUFFERLIST_H -#define _FS_CEPH_BUFFERLIST_H - - - -#define CEPH_BUFFERLIST_START_IOVLEN 8 /* embed some statically, for fast normal case */ - -struct ceph_bufferlist { - struct iovec *b_iov; /* data payload */ - struct iovec b_iov_array[CEPH_BUFFERLIST_START_IOVLEN]; - int b_iovlen; /* used/defined elements in b_iov */ - int b_iovmax; /* allocated size of b_iov array */ - struct iovec b_append; /* preallocated memory for appending data to this bufferlist */ -}; - -struct ceph_bufferlist_iterator { - int i_iov; /* which iov */ - int i_off; /* offset in that iov */ -}; - -/* - * add referenced memory to the bufferlist. - * expand b_iov array if necessary. - * extend tail iovec if the added region is contiguous. 
- */ -void ceph_bufferlist_append_ref(struct ceph_bufferlist *bl, void *p, int len) -{ - struct iovec *tmpvec; - if (bl->b_iovlen == bl->b_iovmax) { - if (bl->b_iovmax) { - bl->b_iovmax *= 2; - tmpvec = kmalloc(bl->b_iovmax); - memcpy(tmpvec, bl->b_iov, sizeof(iovec)*bl->b_iovlen); - if (bl->b_iovlen > CEPH_BUFFERLIST_START_IOVLEN) - kfree(bl->b_iov); - bl->b_iov = tmpvec; - memset(tmpvec + bl->b_iovlen, 0, - sizeof(iovec)*(bl->b_iovmax - bl->b_iovlen)); - } else { - bl->b_iovmax = CEPH_BUFFERLIST_START_IOVLEN; - bl->b_iov = bl->b_iov_array; - } - } - - if (bl->b_iovlen && - p == bl->b_iov[bl->b_iovlen-1].iov_base + bl->b_iov[bl->b_iovlen-1].iov_base) { - bl->b_iov[bl->b_iovlen-1].iov_len += len; - } else { - bl->b_iov[bl->b_iovlen].iov_base = p; - bl->b_iov[bl->b_iovlen].iov_len = len; - bl->b_iovlen++; - } -} - -void ceph_bufferlist_append_copy(struct ceph_bufferlist *bl, void *p, int len) -{ - int s; - while (len > 0) { - /* allocate more space? */ - if (!bl->b_append.iov_len) { - bl->b_append.iov_len = (len + PAGE_SIZE - 1) & ~(PAGE_SIZE-1); - bl->b_append.iov_base = kmalloc(bl->b_append.iov_len, GFP_KERNEL); - } - - /* copy what we can */ - s = min(bl->b_append.iov_len, len); - memcpy(bl->b_append.iov_base, s); - ceph_bufferlist_append_ref(bl, b_append.iov_base, b_append.iov_len); - len -= s; - bl->b_append.iov_len -= s; - } -} - -#endif diff --git a/branches/sage/mds/kernel/inode.c b/branches/sage/mds/kernel/inode.c deleted file mode 100644 index f21fa58386935..0000000000000 --- a/branches/sage/mds/kernel/inode.c +++ /dev/null @@ -1,136 +0,0 @@ -#include -#include -#include -#include -#include "ceph_fs.h" - -MODULE_AUTHOR("Patience Warnick "); -MODULE_DESCRIPTION("Ceph filesystem for Linux"); -MODULE_LICENSE("GPL"); - - -static void ceph_read_inode(struct inode * inode) -{ - return; -} - -static int ceph_write_inode(struct inode * inode, int unused) -{ - lock_kernel(); - unlock_kernel(); - return 0; -} - -static void ceph_delete_inode(struct inode * inode) -{ 
- return; -} - -static void ceph_put_super(struct super_block *s) -{ - return; -} - -static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - return 0; -} - -static void ceph_write_super(struct super_block *s) -{ - lock_kernel(); - unlock_kernel(); - return; -} - -static struct kmem_cache *ceph_inode_cachep; - -static struct inode *ceph_alloc_inode(struct super_block *sb) -{ - struct ceph_inode_info *ci; - ci = kmem_cache_alloc(ceph_inode_cachep, GFP_KERNEL); - if (!ci) - return NULL; - return &ci->vfs_inode; -} - -static void ceph_destroy_inode(struct inode *inode) -{ - kmem_cache_free(ceph_inode_cachep, CEPH_I(inode)); -} - -static void init_once(void *foo, struct kmem_cache *cachep, unsigned long flags) -{ - struct ceph_inode_info *ci = (struct ceph_inode_info *) foo; - - if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == - SLAB_CTOR_CONSTRUCTOR) - inode_init_once(&ci->vfs_inode); -} - -static int init_inodecache(void) -{ - ceph_inode_cachep = kmem_cache_create("ceph_inode_cache", - sizeof(struct ceph_inode_info), - 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), - init_once, NULL); - if (ceph_inode_cachep == NULL) - return -ENOMEM; - return 0; -} - -static void destroy_inodecache(void) -{ - kmem_cache_destroy(ceph_inode_cachep); -} - -static const struct super_operations ceph_sops = { - .alloc_inode = ceph_alloc_inode, - .destroy_inode = ceph_destroy_inode, - .read_inode = ceph_read_inode, - .write_inode = ceph_write_inode, - .delete_inode = ceph_delete_inode, - .put_super = ceph_put_super, - .write_super = ceph_write_super, - .statfs = ceph_statfs, -}; - -static int ceph_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) -{ - printk(KERN_INFO "entered ceph_get_sb\n"); - return 0; -} - -static struct file_system_type ceph_fs_type = { - .owner = THIS_MODULE, - .name = "ceph", - .get_sb = ceph_get_sb, - .kill_sb = kill_block_super, -/* .fs_flags = */ -}; - -static int __init 
init_ceph(void) -{ - int ret = 0; - - printk(KERN_INFO "ceph init\n"); - if (!(ret = init_inodecache())) { - if ((ret = register_filesystem(&ceph_fs_type))) { - destroy_inodecache(); - } - } - return ret; -} - -static void __exit exit_ceph(void) -{ - printk(KERN_INFO "ceph exit\n"); - - unregister_filesystem(&ceph_fs_type); -} - - -module_init(init_ceph); -module_exit(exit_ceph); diff --git a/branches/sage/mds/kernel/kmsg.h b/branches/sage/mds/kernel/kmsg.h deleted file mode 100644 index cc44b9fd291e5..0000000000000 --- a/branches/sage/mds/kernel/kmsg.h +++ /dev/null @@ -1,51 +0,0 @@ -#ifndef __FS_CEPH_KMSG_H -#define __FS_CEPH_KMSG_H - -#include -#include -#include -#include "ceph_kthread.h" - - -struct ceph_kthreadpool *msg_threadpool; /* thread pool */ - -struct ceph_kmsgr { - void *m_parent; - struct radix_tree_root mpipes; /* other nodes talk to */ - struct client_thread_info cthread; /* listener thread info */ -}; - -struct ceph_message { - struct ceph_message_header *msghdr; /* header */ - struct kvec *m_iov; /* data storage */ - size_t m_iovlen; /* is this kvec.iov_len why need it in kvec? */ - struct list_head m_list_head; -}; - -struct ceph_kmsg_pipe { - int p_sd; /* socket descriptor */ - __u64 p_out_seq; /* last message sent */ - __u64 p_in_seq; /* last message received */ - - /* out queue */ - struct list_head p_out_queue; - struct ceph_message *p_out_partial; /* partially sent message */ - int p_out_partial_pos; - struct list_head p_out_sent; /* sent but unacked; may need resend if connection drops */ - - /* partially read message contents */ - struct kvec *p_in_partial_iov; /* hrm, this probably isn't what we want */ - size_t p_in_partial_iovlen; - size_t p_in_parital_iovmax; /* size of currently allocated m_iov array */ - /* .. or something like that? .. 
*/ - -}; - -/* - * function prototypes - */ -extern void ceph_read_message(struct ceph_message *message); -extern void ceph_write_message(struct ceph_message *message); -extern void ceph_client_dispatch(void *fs_client, struct ceph_message *message ); -extern void queue_message(struct ceph_message *message); -#endif diff --git a/branches/sage/mds/kernel/kmsgbits.h b/branches/sage/mds/kernel/kmsgbits.h deleted file mode 100644 index 730ff7f74f53b..0000000000000 --- a/branches/sage/mds/kernel/kmsgbits.h +++ /dev/null @@ -1,50 +0,0 @@ - - - -struct ceph_message { - struct ceph_message_header m_hdr; /* header */ - struct iovec *m_iov; /* payload */ - int m_iovlen; - struct list_head m_list_head; /* i'll sit in a queue */ -}; - - - -/* dispatch method type */ -typedef void (*ceph_kmsg_dispatch_t)(void *h, struct ceph_message *message); - -struct ceph_kmsg { - ceph_kmsg_dispatch_t m_dispatch; /* where incoming messages go */ - void *m_parent; /* passed to dispatch method */ - - struct ceph_kmsg_threadpool *m_threadpool; /* pool of threads */ - /* possibly shared among multiple kmsg instances? */ - - /* other nodes i talk to */ - struct radix_tree_root m_pipes; /* key: dest addr, value: ceph_kmsg_pipe */ - - /* ... */ -}; - - -struct ceph_kmsg_pipe { - int p_sd; /* socket descriptor */ - __u64 p_out_seq; /* last message sent */ - __u64 p_in_seq; /* last message received */ - - /* out queue */ - struct list_head p_out_queue; - struct ceph_message *p_out_partial; /* partially sent message */ - int p_out_partial_pos; - struct list_head p_out_sent; /* sent but unacked; may need resend if connection drops */ - - /* partially read message contents */ - struct iovec *p_in_partial_iov; /* hrm, this probably isn't what we want */ - int p_in_partial_iovlen; - int p_in_parital_iovmax; /* size of currently allocated m_iov array */ - /* .. or something like that? .. 
*/ - -}; - - - diff --git a/branches/sage/mds/kernel/mds_client.h b/branches/sage/mds/kernel/mds_client.h deleted file mode 100644 index 764d7ccd6bdf6..0000000000000 --- a/branches/sage/mds/kernel/mds_client.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef _FS_CEPH_MDS_CLIENT_H -#define _FS_CEPH_MDS_CLIENT_H - -#include -#include "kmsg.h" - -/* - * state associated with an individual MDS<->client session - */ -struct ceph_mds_session { - __u64 s_push_seq; - /* wait queue? */ -}; - -struct ceph_mds_request { - __u64 r_tid; - struct ceph_message *r_msg; - __u8 r_idempotent; - - __u32 r_mds[4]; /* set of mds's with whom request may be outstanding */ - __u32 r_num_mds; /* items in r_mds */ - - __u32 r_num_fwd; /* number of forward attempts */ - __s32 r_resend_mds; /* mds to resend to next, if any*/ - - /* waiter/callback? */ -}; - - -struct ceph_mds_client { - struct ceph_mdsmap *s_mdsmap; /* mds map */ - - /* mds sessions */ - struct ceph_mds_session **s_mds_sessions; /* sparse array; elements NULL if no session */ - int s_max_mds_sessions; /* size of s_mds_sessions array */ - - __u64 s_last_mds_tid; /* id of last mds request */ - struct radix_tree_root s_mds_requests; /* in-flight mds requests */ - -}; - -#endif diff --git a/branches/sage/mds/kernel/mdsmap.h b/branches/sage/mds/kernel/mdsmap.h deleted file mode 100644 index c5a970992c36c..0000000000000 --- a/branches/sage/mds/kernel/mdsmap.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef _FS_CEPH_MDSMAP_H -#define _FS_CEPH_MDSMAP_H - -/* see mds/MDSMap.h */ -#define CEPH_MDS_STATE_DNE 0 /* down, never existed. */ -#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees. empty log. */ -#define CEPH_MDS_STATE_FAILED 2 /* down, active subtrees needs to be recovered. */ - -#define CEPH_MDS_STATE_BOOT -3 /* up, boot announcement. destiny unknown. */ -#define CEPH_MDS_STATE_STANDBY -4 /* up, idle. waiting for assignment by monitor. 
*/ -#define CEPH_MDS_STATE_CREATING -5 /* up, creating MDS instance (new journal, idalloc..). */ -#define CEPH_MDS_STATE_STARTING -6 /* up, starting prior stopped MDS instance. */ - -#define CEPH_MDS_STATE_REPLAY 7 /* up, starting prior failed instance. scanning journal. */ -#define CEPH_MDS_STATE_RESOLVE 8 /* up, disambiguating distributed operations (import, rename, etc.) */ -#define CEPH_MDS_STATE_RECONNECT 9 /* up, reconnect to clients */ -#define CEPH_MDS_STATE_REJOIN 10 /* up, replayed journal, rejoining distributed cache */ -#define CEPH_MDS_STATE_ACTIVE 11 /* up, active */ -#define CEPH_MDS_STATE_STOPPING 12 /* up, exporting metadata (-> standby or out) */ - -/* - * mds map - * - * fields limited to those the client cares about - */ -struct ceph_mdsmap { - __u64 m_epoch; - __u64 m_same_in_set_since; - struct timeval m_created; - __u32 m_anchortable; - __u32 m_root; - struct ceph_entity_addr *m_addr; /* array of addresses */ - __u8 *m_state; /* array of states */ - __u32 m_max_mds; /* size of m_addr, m_state arrays */ -}; - -extern int ceph_mdsmap_get_random_mds(ceph_mdsmap *m); -extern int ceph_mdsmap_get_state(ceph_mdsmap *m, int w); -extern struct ceph_entity_addr *ceph_mdsmap_get_addr(ceph_mdsmap *m, int w); -extern int ceph_mdsmap_decode(ceph_mdsmap *m, iovec *v); - -#endif diff --git a/branches/sage/mds/kernel/monmap.h b/branches/sage/mds/kernel/monmap.h deleted file mode 100644 index 2f60c8a0c3436..0000000000000 --- a/branches/sage/mds/kernel/monmap.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef _FS_CEPH_MONMAP_H -#define _FS_CEPH_MONMAP_H - -#include - -/* - * monitor map - */ -struct ceph_monmap { - __u64 m_epoch; - __u32 m_num_mon; - __u32 m_last_mon; - struct ceph_entity_inst m_mon_inst; -}; - -extern int ceph_monmap_pick_mon(struct ceph_monmap *m); -extern int ceph_monmap_decode(struct ceph_monmap *m, struct kvec *v); - -#endif diff --git a/branches/sage/mds/kernel/osd_client.h b/branches/sage/mds/kernel/osd_client.h deleted file mode 100644 index 
6efa3b8f2ab25..0000000000000 --- a/branches/sage/mds/kernel/osd_client.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef _FS_CEPH_OSD_CLIENT_H -#define _FS_CEPH_OSD_CLIENT_H - -/* this will be equivalent to osdc/Objecter.h */ - - -/* do these later -#include "osdmap.h" -*/ -struct ceph_osdmap; - - -struct ceph_osd_client { - struct ceph_osdmap *s_osdmap; /* osd map */ - -}; - -#endif diff --git a/branches/sage/mds/kernel/super.h b/branches/sage/mds/kernel/super.h deleted file mode 100644 index 94418511ffa53..0000000000000 --- a/branches/sage/mds/kernel/super.h +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef _FS_CEPH_CEPH_H -#define _FS_CEPH_CEPH_H - -/* #include */ - -#include "kmsg.h" -#include "monmap.h" -#include "mds_client.h" -#include "osd_client.h" - - - -/* - * CEPH per-filesystem client state - * - * possibly shared by multiple mount points, if they are - * mounting the same ceph filesystem/cluster. - */ -struct ceph_fs_client { - __u64 s_fsid; /* hmm this should be part of the monmap? */ - - __u32 s_whoami; /* my client number */ - struct ceph_kmsg *s_kmsg; /* messenger instance */ - - struct ceph_monmap *s_monmap; /* monitor map */ - - struct ceph_mds_client *s_mds_client; - struct ceph_osd_client *s_osd_client; - - int s_ref; /* reference count (for each sb_info that points to me) */ -}; - -/* - * directory of filesystems mounted by this host - * - * key: fsid? ipquad of monitor? hmm! - * value: struct ceph_fs_client* - */ -extern struct radix_tree ceph_fs_clients; - - -/* - * CEPH per-mount superblock info - */ -struct ceph_sb_info { - struct ceph_fs_client *sb_client; - - /* FIXME: add my relative offset into the filesystem, - so we can appropriately mangle/adjust path names in requests, etc. 
*/ -}; - -/* - * CEPH file system in-core inode info - */ -struct ceph_inode_info { - struct ceph_file_layout i_layout; - struct inode vfs_inode; -}; - -static inline struct ceph_inode_info *CEPH_I(struct inode *inode) -{ - return list_entry(inode, struct ceph_inode_info, vfs_inode); -} - - -/* file.c */ -extern const struct inode_operations ceph_file_inops; -extern const struct file_operations ceph_file_operations; -extern const struct address_space_operations ceph_aops; - -/* dir.c */ -extern const struct inode_operations ceph_dir_inops; -extern const struct file_operations ceph_dir_operations; - -#endif /* _FS_CEPH_CEPH_H */ diff --git a/branches/sage/mds/mds/Anchor.h b/branches/sage/mds/mds/Anchor.h deleted file mode 100644 index 748091306a44d..0000000000000 --- a/branches/sage/mds/mds/Anchor.h +++ /dev/null @@ -1,108 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __ANCHOR_H -#define __ANCHOR_H - -#include -using std::string; - -#include "include/types.h" -#include "mdstypes.h" -#include "include/buffer.h" - - -// anchor ops -#define ANCHOR_OP_LOOKUP 1 -#define ANCHOR_OP_LOOKUP_REPLY 2 - -#define ANCHOR_OP_CREATE_PREPARE 11 -#define ANCHOR_OP_CREATE_AGREE 12 - -#define ANCHOR_OP_DESTROY_PREPARE 21 -#define ANCHOR_OP_DESTROY_AGREE 22 - -#define ANCHOR_OP_UPDATE_PREPARE 31 -#define ANCHOR_OP_UPDATE_AGREE 32 - -#define ANCHOR_OP_COMMIT 41 -#define ANCHOR_OP_ACK 42 -#define ANCHOR_OP_ROLLBACK 43 - - - -inline const char* get_anchor_opname(int o) { - switch (o) { - case ANCHOR_OP_LOOKUP: return "lookup"; - case ANCHOR_OP_LOOKUP_REPLY: return "lookup_reply"; - - case ANCHOR_OP_CREATE_PREPARE: return "create_prepare"; - case ANCHOR_OP_CREATE_AGREE: return "create_agree"; - case ANCHOR_OP_DESTROY_PREPARE: return "destroy_prepare"; - case ANCHOR_OP_DESTROY_AGREE: return "destroy_agree"; - case ANCHOR_OP_UPDATE_PREPARE: return "update_prepare"; - case ANCHOR_OP_UPDATE_AGREE: return "update_agree"; - - case ANCHOR_OP_COMMIT: return "commit"; - case ANCHOR_OP_ACK: return "ack"; - case ANCHOR_OP_ROLLBACK: return "rollback"; - default: assert(0); return 0; - } -} - - -// identifies a anchor table mutation - - - -// anchor type - -class Anchor { -public: - inodeno_t ino; // anchored ino - dirfrag_t dirfrag; // containing dirfrag - //string ref_dn; // referring dentry - int nref; // reference count - - Anchor() {} - Anchor(inodeno_t i, dirfrag_t df, - //string& rd, - int nr=0) : - ino(i), dirfrag(df), - //ref_dn(rd), - nref(nr) { } - - void _encode(bufferlist &bl) { - bl.append((char*)&ino, sizeof(ino)); - bl.append((char*)&dirfrag, sizeof(dirfrag)); - bl.append((char*)&nref, sizeof(nref)); - //::_encode(ref_dn, bl); - } - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - bl.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - bl.copy(off, 
sizeof(nref), (char*)&nref); - off += sizeof(nref); - //::_decode(ref_dn, bl, off); - } -}; - -inline ostream& operator<<(ostream& out, Anchor& a) -{ - return out << "a(" << a.ino << " " << a.dirfrag << " " << a.nref << ")"; -} - -#endif diff --git a/branches/sage/mds/mds/AnchorClient.cc b/branches/sage/mds/mds/AnchorClient.cc deleted file mode 100644 index b2fb1fb50d7bd..0000000000000 --- a/branches/sage/mds/mds/AnchorClient.cc +++ /dev/null @@ -1,379 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include -using std::cout; -using std::cerr; - -#include "Anchor.h" -#include "AnchorClient.h" -#include "MDSMap.h" - -#include "include/Context.h" -#include "msg/Messenger.h" - -#include "MDS.h" -#include "MDLog.h" -#include "LogSegment.h" - -#include "events/EAnchorClient.h" -#include "messages/MAnchor.h" - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " " << mds->messenger->get_myname() << ".anchorclient " -#define derr(x) if (x <= g_conf.debug_mds) *_derr << dbeginl << g_clock.now() << " " << mds->messenger->get_myname() << ".anchorclient " - - -void AnchorClient::dispatch(Message *m) -{ - switch (m->get_type()) { - case MSG_MDS_ANCHOR: - handle_anchor_reply((MAnchor*)m); - break; - - default: - assert(0); - } -} - -void AnchorClient::handle_anchor_reply(class MAnchor *m) -{ - inodeno_t ino = m->get_ino(); - version_t atid = m->get_atid(); - - dout(10) << "handle_anchor_reply " << *m << dendl; - - switch (m->get_op()) { - - // lookup - case ANCHOR_OP_LOOKUP_REPLY: - assert(pending_lookup.count(ino)); - { - *pending_lookup[ino].trace = 
m->get_trace(); - Context *onfinish = pending_lookup[ino].onfinish; - pending_lookup.erase(ino); - - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } - } - break; - - // prepare -> agree - case ANCHOR_OP_CREATE_AGREE: - if (pending_create_prepare.count(ino)) { - dout(10) << "got create_agree on " << ino << " atid " << atid << dendl; - Context *onfinish = pending_create_prepare[ino].onfinish; - *pending_create_prepare[ino].patid = atid; - pending_create_prepare.erase(ino); - - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } - } - else if (pending_commit.count(atid)) { - dout(10) << "stray create_agree on " << ino - << " atid " << atid - << ", already committing, resending COMMIT" - << dendl; - MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, atid); - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); - } - else { - dout(10) << "stray create_agree on " << ino - << " atid " << atid - << ", sending ROLLBACK" - << dendl; - MAnchor *req = new MAnchor(ANCHOR_OP_ROLLBACK, 0, atid); - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); - } - break; - - case ANCHOR_OP_DESTROY_AGREE: - if (pending_destroy_prepare.count(ino)) { - dout(10) << "got destroy_agree on " << ino << " atid " << atid << dendl; - Context *onfinish = pending_destroy_prepare[ino].onfinish; - *pending_destroy_prepare[ino].patid = atid; - pending_destroy_prepare.erase(ino); - - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } - } - else if (pending_commit.count(atid)) { - dout(10) << "stray destroy_agree on " << ino - << " atid " << atid - << ", already committing, resending COMMIT" - << dendl; - MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, atid); - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); - } - 
else { - dout(10) << "stray destroy_agree on " << ino - << " atid " << atid - << ", sending ROLLBACK" - << dendl; - MAnchor *req = new MAnchor(ANCHOR_OP_ROLLBACK, 0, atid); - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); - } - break; - - case ANCHOR_OP_UPDATE_AGREE: - if (pending_update_prepare.count(ino)) { - dout(10) << "got update_agree on " << ino << " atid " << atid << dendl; - Context *onfinish = pending_update_prepare[ino].onfinish; - *pending_update_prepare[ino].patid = atid; - pending_update_prepare.erase(ino); - - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } - } - else if (pending_commit.count(atid)) { - dout(10) << "stray update_agree on " << ino - << " atid " << atid - << ", already committing, resending COMMIT" - << dendl; - MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, atid); - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); - } - else { - dout(10) << "stray update_agree on " << ino - << " atid " << atid - << ", sending ROLLBACK" - << dendl; - MAnchor *req = new MAnchor(ANCHOR_OP_ROLLBACK, 0, atid); - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); - } - break; - - // commit -> ack - case ANCHOR_OP_ACK: - { - dout(10) << "got ack on atid " << atid << ", logging" << dendl; - - // remove from committing list - assert(pending_commit.count(atid)); - assert(pending_commit[atid]->pending_commit_atids.count(atid)); - - // log ACK. 
- mds->mdlog->submit_entry(new EAnchorClient(ANCHOR_OP_ACK, atid), - new C_LoggedAck(this, atid)); - } - break; - - default: - assert(0); - } - - delete m; -} - - -void AnchorClient::_logged_ack(version_t atid) -{ - dout(10) << "_logged_ack" << dendl; - - assert(pending_commit.count(atid)); - assert(pending_commit[atid]->pending_commit_atids.count(atid)); - - pending_commit[atid]->pending_commit_atids.erase(atid); - pending_commit.erase(atid); - - // kick any waiters (LogSegment trim) - if (ack_waiters.count(atid)) { - dout(15) << "kicking ack waiters on atid " << atid << dendl; - mds->queue_waiters(ack_waiters[atid]); - ack_waiters.erase(atid); - } -} - - -/* - * public async interface - */ - - -/* - * FIXME: we need to be able to resubmit messages if the anchortable mds fails. - */ - - -void AnchorClient::lookup(inodeno_t ino, vector& trace, Context *onfinish) -{ - // send message - MAnchor *req = new MAnchor(ANCHOR_OP_LOOKUP, ino); - - assert(pending_lookup.count(ino) == 0); - pending_lookup[ino].onfinish = onfinish; - pending_lookup[ino].trace = &trace; - - mds->send_message_mds(req, - mds->mdsmap->get_anchortable(), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); -} - - -// PREPARE - -void AnchorClient::prepare_create(inodeno_t ino, vector& trace, - version_t *patid, Context *onfinish) -{ - dout(10) << "prepare_create " << ino << " " << trace << dendl; - - // send message - MAnchor *req = new MAnchor(ANCHOR_OP_CREATE_PREPARE, ino); - req->set_trace(trace); - - pending_create_prepare[ino].trace = trace; - pending_create_prepare[ino].patid = patid; - pending_create_prepare[ino].onfinish = onfinish; - - mds->send_message_mds(req, - mds->mdsmap->get_anchortable(), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); -} - -void AnchorClient::prepare_destroy(inodeno_t ino, - version_t *patid, Context *onfinish) -{ - dout(10) << "prepare_destroy " << ino << dendl; - - // send message - MAnchor *req = new MAnchor(ANCHOR_OP_DESTROY_PREPARE, ino); - 
pending_destroy_prepare[ino].onfinish = onfinish; - pending_destroy_prepare[ino].patid = patid; - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); -} - - -void AnchorClient::prepare_update(inodeno_t ino, vector& trace, - version_t *patid, Context *onfinish) -{ - dout(10) << "prepare_update " << ino << " " << trace << dendl; - - // send message - MAnchor *req = new MAnchor(ANCHOR_OP_UPDATE_PREPARE, ino); - req->set_trace(trace); - - pending_update_prepare[ino].trace = trace; - pending_update_prepare[ino].patid = patid; - pending_update_prepare[ino].onfinish = onfinish; - - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); -} - - -// COMMIT - -void AnchorClient::commit(version_t atid, LogSegment *ls) -{ - dout(10) << "commit " << atid << dendl; - - assert(pending_commit.count(atid) == 0); - pending_commit[atid] = ls; - ls->pending_commit_atids.insert(atid); - - // send message - MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, atid); - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable()), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); -} - - - -// RECOVERY - -void AnchorClient::finish_recovery() -{ - dout(7) << "finish_recovery" << dendl; - - resend_commits(); -} - -void AnchorClient::resend_commits() -{ - for (map::iterator p = pending_commit.begin(); - p != pending_commit.end(); - ++p) { - dout(10) << "resending commit on " << p->first << dendl; - MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, p->first); - mds->send_message_mds(req, - mds->mdsmap->get_anchortable(), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); - } -} - -void AnchorClient::resend_prepares(hash_map& prepares, int op) -{ - for (hash_map::iterator p = prepares.begin(); - p != prepares.end(); - p++) { - dout(10) << "resending " << get_anchor_opname(op) << " on " << p->first << dendl; - 
MAnchor *req = new MAnchor(op, p->first); - req->set_trace(p->second.trace); - mds->send_message_mds(req, - mds->mdsmap->get_anchortable(), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); - } -} - - -void AnchorClient::handle_mds_recovery(int who) -{ - dout(7) << "handle_mds_recovery mds" << who << dendl; - - if (who != mds->mdsmap->get_anchortable()) - return; // do nothing. - - // resend any pending lookups. - for (hash_map::iterator p = pending_lookup.begin(); - p != pending_lookup.end(); - p++) { - dout(10) << "resending lookup on " << p->first << dendl; - mds->send_message_mds(new MAnchor(ANCHOR_OP_LOOKUP, p->first), - mds->mdsmap->get_anchortable(), - MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); - } - - // resend any pending prepares. - resend_prepares(pending_create_prepare, ANCHOR_OP_CREATE_PREPARE); - resend_prepares(pending_update_prepare, ANCHOR_OP_UPDATE_PREPARE); - resend_prepares(pending_destroy_prepare, ANCHOR_OP_DESTROY_PREPARE); - - // resend any pending commits. - resend_commits(); -} diff --git a/branches/sage/mds/mds/AnchorClient.h b/branches/sage/mds/mds/AnchorClient.h deleted file mode 100644 index fd790f39c399d..0000000000000 --- a/branches/sage/mds/mds/AnchorClient.h +++ /dev/null @@ -1,107 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __ANCHORCLIENT_H -#define __ANCHORCLIENT_H - -#include -using std::vector; -#include -using __gnu_cxx::hash_map; - -#include "include/types.h" -#include "msg/Dispatcher.h" - -#include "Anchor.h" - -class Context; -class MDS; -class LogSegment; - -class AnchorClient : public Dispatcher { - MDS *mds; - - // lookups - struct _pending_lookup { - vector *trace; - Context *onfinish; - }; - hash_map pending_lookup; - - // prepares - struct _pending_prepare { - vector trace; - Context *onfinish; - version_t *patid; // ptr to atid - }; - hash_map pending_create_prepare; - hash_map pending_destroy_prepare; - hash_map pending_update_prepare; - - // pending commits - map pending_commit; - map > ack_waiters; - - void handle_anchor_reply(class MAnchor *m); - - class C_LoggedAck : public Context { - AnchorClient *ac; - version_t atid; - public: - C_LoggedAck(AnchorClient *a, version_t t) : ac(a), atid(t) {} - void finish(int r) { - ac->_logged_ack(atid); - } - }; - void _logged_ack(version_t atid); - -public: - AnchorClient(MDS *m) : mds(m) {} - - void dispatch(Message *m); - - // async user interface - void lookup(inodeno_t ino, vector& trace, Context *onfinish); - - void prepare_create(inodeno_t ino, vector& trace, version_t *atid, Context *onfinish); - void prepare_destroy(inodeno_t ino, version_t *atid, Context *onfinish); - void prepare_update(inodeno_t ino, vector& trace, version_t *atid, Context *onfinish); - - void commit(version_t atid, LogSegment *ls); - - // for recovery (by other nodes) - void handle_mds_recovery(int mds); // called when someone else recovers - - void resend_commits(); - void resend_prepares(hash_map& prepares, int op); - - // for recovery (by me) - void got_journaled_agree(version_t atid, LogSegment *ls) { - pending_commit[atid] = ls; - } - void got_journaled_ack(version_t atid) { - pending_commit.erase(atid); - } - bool has_committed(version_t atid) { - return pending_commit.count(atid) == 0; - } - void wait_for_ack(version_t 
atid, Context *c) { - ack_waiters[atid].push_back(c); - } - void finish_recovery(); // called when i recover and go active - - -}; - -#endif diff --git a/branches/sage/mds/mds/AnchorTable.cc b/branches/sage/mds/mds/AnchorTable.cc deleted file mode 100644 index 65c09278c9850..0000000000000 --- a/branches/sage/mds/mds/AnchorTable.cc +++ /dev/null @@ -1,713 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "AnchorTable.h" -#include "MDS.h" - -#include "osdc/Filer.h" - -#include "msg/Messenger.h" -#include "messages/MAnchor.h" - -#include "common/Clock.h" - -#include "MDLog.h" -#include "events/EAnchor.h" - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " " << mds->messenger->get_myname() << ".anchortable " -#define derr(x) if (x <= g_conf.debug_mds) *_derr << dbeginl << g_clock.now() << " " << mds->messenger->get_myname() << ".anchortable " - - -void AnchorTable::dump() -{ - dout(7) << "dump v " << version << dendl; - for (hash_map::iterator it = anchor_map.begin(); - it != anchor_map.end(); - it++) - dout(15) << "dump " << it->second << dendl; -} - - -/* - * basic updates - */ - -bool AnchorTable::add(inodeno_t ino, dirfrag_t dirfrag) -{ - //dout(17) << "add " << ino << " dirfrag " << dirfrag << dendl; - - // parent should be there - assert(dirfrag.ino < MDS_INO_BASE || // system dirino - anchor_map.count(dirfrag.ino)); // have - - if (anchor_map.count(ino) == 0) { - // new item - anchor_map[ino] = Anchor(ino, dirfrag); - dout(7) << "add added " << anchor_map[ino] << dendl; - return true; - } else { - dout(7) << "add had " << 
anchor_map[ino] << dendl; - return false; - } -} - -void AnchorTable::inc(inodeno_t ino) -{ - dout(7) << "inc " << ino << dendl; - - assert(anchor_map.count(ino)); - - while (1) { - Anchor &anchor = anchor_map[ino]; - anchor.nref++; - - dout(10) << "inc now " << anchor << dendl; - ino = anchor.dirfrag.ino; - - if (ino == 0) break; - if (anchor_map.count(ino) == 0) break; - } -} - -void AnchorTable::dec(inodeno_t ino) -{ - dout(7) << "dec " << ino << dendl; - assert(anchor_map.count(ino)); - - while (true) { - Anchor &anchor = anchor_map[ino]; - anchor.nref--; - - if (anchor.nref == 0) { - dout(10) << "dec removing " << anchor << dendl; - dirfrag_t dirfrag = anchor.dirfrag; - anchor_map.erase(ino); - ino = dirfrag.ino; - } else { - dout(10) << "dec now " << anchor << dendl; - ino = anchor.dirfrag.ino; - } - - if (ino == 0) break; - if (anchor_map.count(ino) == 0) break; - } -} - - -/* - * high level - */ - - -// LOOKUP - -void AnchorTable::handle_lookup(MAnchor *req) -{ - inodeno_t curino = req->get_ino(); - dout(7) << "handle_lookup " << curino << dendl; - - vector trace; - while (true) { - assert(anchor_map.count(curino) == 1); - Anchor &anchor = anchor_map[curino]; - - dout(10) << "handle_lookup adding " << anchor << dendl; - trace.insert(trace.begin(), anchor); // lame FIXME - - if (anchor.dirfrag.ino < MDS_INO_BASE) break; - curino = anchor.dirfrag.ino; - } - - // reply - MAnchor *reply = new MAnchor(ANCHOR_OP_LOOKUP_REPLY, req->get_ino()); - reply->set_trace(trace); - mds->messenger->send_message(reply, req->get_source_inst(), req->get_source_port()); - - delete req; -} - - -// MIDLEVEL - -void AnchorTable::create_prepare(inodeno_t ino, vector& trace, int reqmds) -{ - // make sure trace is in table - for (unsigned i=0; i& trace, int reqmds) -{ - version++; - pending_update[version].first = ino; - pending_update[version].second = trace; - pending_reqmds[version] = reqmds; - //dump(); -} - -void AnchorTable::commit(version_t atid) -{ - if 
(pending_create.count(atid)) { - dout(7) << "commit " << atid << " create " << pending_create[atid] << dendl; - pending_create.erase(atid); - } - - else if (pending_destroy.count(atid)) { - inodeno_t ino = pending_destroy[atid]; - dout(7) << "commit " << atid << " destroy " << ino << dendl; - - dec(ino); // destroy - - pending_destroy.erase(atid); - } - - else if (pending_update.count(atid)) { - inodeno_t ino = pending_update[atid].first; - vector &trace = pending_update[atid].second; - - dout(7) << "commit " << atid << " update " << ino << dendl; - - // remove old - dec(ino); - - // add new - for (unsigned i=0; i_create_prepare_logged(req, atid); - } -}; - -void AnchorTable::handle_create_prepare(MAnchor *req) -{ - inodeno_t ino = req->get_ino(); - vector& trace = req->get_trace(); - - dout(7) << "handle_create_prepare " << ino << dendl; - - create_prepare(ino, trace, req->get_source().num()); - - // log it - EAnchor *le = new EAnchor(ANCHOR_OP_CREATE_PREPARE, ino, version, req->get_source().num()); - le->set_trace(trace); - mds->mdlog->submit_entry(le, - new C_AT_CreatePrepare(this, req, version)); -} - -void AnchorTable::_create_prepare_logged(MAnchor *req, version_t atid) -{ - inodeno_t ino = req->get_ino(); - dout(7) << "_create_prepare_logged " << ino << " atid " << atid << dendl; - - // reply - MAnchor *reply = new MAnchor(ANCHOR_OP_CREATE_AGREE, ino, atid); - mds->messenger->send_message(reply, req->get_source_inst(), req->get_source_port()); - - delete req; -} - - - - -// DESTROY - -class C_AT_DestroyPrepare : public Context { - AnchorTable *at; - MAnchor *req; - version_t atid; -public: - C_AT_DestroyPrepare(AnchorTable *a, MAnchor *r, version_t t) : - at(a), req(r), atid(t) { } - void finish(int r) { - at->_destroy_prepare_logged(req, atid); - } -}; - -void AnchorTable::handle_destroy_prepare(MAnchor *req) -{ - inodeno_t ino = req->get_ino(); - dout(7) << "handle_destroy_prepare " << ino << dendl; - - destroy_prepare(ino, req->get_source().num()); - - 
mds->mdlog->submit_entry(new EAnchor(ANCHOR_OP_DESTROY_PREPARE, ino, version, req->get_source().num()), - new C_AT_DestroyPrepare(this, req, version)); -} - -void AnchorTable::_destroy_prepare_logged(MAnchor *req, version_t atid) -{ - inodeno_t ino = req->get_ino(); - dout(7) << "_destroy_prepare_logged " << ino << " atid " << atid << dendl; - - // reply - MAnchor *reply = new MAnchor(ANCHOR_OP_DESTROY_AGREE, ino, atid); - mds->messenger->send_message(reply, req->get_source_inst(), req->get_source_port()); - delete req; -} - - - -// UPDATE - -class C_AT_UpdatePrepare : public Context { - AnchorTable *at; - MAnchor *req; - version_t atid; -public: - C_AT_UpdatePrepare(AnchorTable *a, MAnchor *r, version_t t) : - at(a), req(r), atid(t) { } - void finish(int r) { - at->_update_prepare_logged(req, atid); - } -}; - -void AnchorTable::handle_update_prepare(MAnchor *req) -{ - inodeno_t ino = req->get_ino(); - vector& trace = req->get_trace(); - - dout(7) << "handle_update_prepare " << ino << dendl; - - update_prepare(ino, trace, req->get_source().num()); - - // log it - EAnchor *le = new EAnchor(ANCHOR_OP_UPDATE_PREPARE, ino, version, req->get_source().num()); - le->set_trace(trace); - mds->mdlog->submit_entry(le, - new C_AT_UpdatePrepare(this, req, version)); -} - -void AnchorTable::_update_prepare_logged(MAnchor *req, version_t atid) -{ - inodeno_t ino = req->get_ino(); - dout(7) << "_update_prepare_logged " << ino << " atid " << atid << dendl; - - // reply - MAnchor *reply = new MAnchor(ANCHOR_OP_UPDATE_AGREE, ino, atid); - mds->messenger->send_message(reply, req->get_source_inst(), req->get_source_port()); - delete req; -} - - - -// COMMIT - -class C_AT_Commit : public Context { - AnchorTable *at; - MAnchor *req; -public: - C_AT_Commit(AnchorTable *a, MAnchor *r) : - at(a), req(r) { } - void finish(int r) { - at->_commit_logged(req); - } -}; - -void AnchorTable::handle_commit(MAnchor *req) -{ - version_t atid = req->get_atid(); - dout(7) << "handle_commit " << atid << 
dendl; - - if (pending_create.count(atid) || - pending_destroy.count(atid) || - pending_update.count(atid)) { - commit(atid); - mds->mdlog->submit_entry(new EAnchor(ANCHOR_OP_COMMIT, atid, version)); - } - else if (atid <= version) { - dout(0) << "got commit for atid " << atid << " <= " << version - << ", already committed, sending ack." - << dendl; - MAnchor *reply = new MAnchor(ANCHOR_OP_ACK, 0, atid); - mds->messenger->send_message(reply, req->get_source_inst(), req->get_source_port()); - delete req; - return; - } - else { - // wtf. - dout(0) << "got commit for atid " << atid << " > " << version << dendl; - assert(atid <= version); - } - - // wait for it to journal - mds->mdlog->wait_for_sync(new C_AT_Commit(this, req)); -} - - -void AnchorTable::_commit_logged(MAnchor *req) -{ - dout(7) << "_commit_logged, sending ACK" << dendl; - MAnchor *reply = new MAnchor(ANCHOR_OP_ACK, req->get_ino(), req->get_atid()); - mds->messenger->send_message(reply, req->get_source_inst(), req->get_source_port()); - delete req; -} - - - -// ROLLBACK - -void AnchorTable::handle_rollback(MAnchor *req) -{ - version_t atid = req->get_atid(); - dout(7) << "handle_rollback " << atid << dendl; - rollback(atid); - delete req; -} - - - -/* - * messages - */ - -void AnchorTable::dispatch(Message *m) -{ - switch (m->get_type()) { - case MSG_MDS_ANCHOR: - handle_anchor_request((MAnchor*)m); - break; - - default: - assert(0); - } -} - - -void AnchorTable::handle_anchor_request(class MAnchor *req) -{ - // make sure i'm open! 
- if (!opened) { - dout(7) << "not open yet" << dendl; - - waiting_for_open.push_back(new C_MDS_RetryMessage(mds, req)); - - if (!opening) { - opening = true; - load(0); - } - return; - } - - dout(10) << "handle_anchor_request " << *req << dendl; - - // go - switch (req->get_op()) { - - case ANCHOR_OP_LOOKUP: - handle_lookup(req); - break; - - case ANCHOR_OP_CREATE_PREPARE: - handle_create_prepare(req); - break; - case ANCHOR_OP_DESTROY_PREPARE: - handle_destroy_prepare(req); - break; - case ANCHOR_OP_UPDATE_PREPARE: - handle_update_prepare(req); - break; - - case ANCHOR_OP_COMMIT: - handle_commit(req); - break; - - case ANCHOR_OP_ROLLBACK: - handle_rollback(req); - break; - - default: - assert(0); - } - -} - - - - -// primitive load/save for now! - -// load/save entire table for now! - -class C_AT_Saved : public Context { - AnchorTable *at; - version_t version; -public: - C_AT_Saved(AnchorTable *a, version_t v) : at(a), version(v) {} - void finish(int r) { - at->_saved(version); - } -}; - -void AnchorTable::save(Context *onfinish) -{ - dout(7) << "save v " << version << dendl; - if (!opened) { - assert(!onfinish); - return; - } - - if (onfinish) - waiting_for_save[version].push_back(onfinish); - - if (committing_version == version) { - dout(7) << "save already committing v " << version << dendl; - return; - } - committing_version = version; - - // build up write - bufferlist bl; - - // version - bl.append((char*)&version, sizeof(version)); - - // # anchors - size_t size = anchor_map.size(); - bl.append((char*)&size, sizeof(size)); - - // anchors - for (hash_map::iterator it = anchor_map.begin(); - it != anchor_map.end(); - it++) { - it->second._encode(bl); - dout(15) << "save encoded " << it->second << dendl; - } - - // pending - ::_encode(pending_reqmds, bl); - ::_encode(pending_create, bl); - ::_encode(pending_destroy, bl); - - size_t s = pending_update.size(); - bl.append((char*)&s, sizeof(s)); - for (map > >::iterator p = pending_update.begin(); - p != 
pending_update.end(); - ++p) { - bl.append((char*)&p->first, sizeof(p->first)); - bl.append((char*)&p->second.first, sizeof(p->second.first)); - ::_encode(p->second.second, bl); - } - - // write! - object_t oid = object_t(MDS_INO_ANCHORTABLE+mds->get_nodeid(), 0); - mds->objecter->write(oid, - 0, bl.length(), - mds->objecter->osdmap->file_to_object_layout(oid, g_OSD_MDAnchorTableLayout), - bl, - NULL, new C_AT_Saved(this, version)); -} - -void AnchorTable::_saved(version_t v) -{ - dout(7) << "_saved v " << v << dendl; - - assert(v <= committing_version); - assert(committed_version < v); - committed_version = v; - - finish_contexts(waiting_for_save[v], 0); - waiting_for_save.erase(v); -} - - - -class C_AT_Load : public Context { - AnchorTable *at; -public: - bufferlist bl; - C_AT_Load(AnchorTable *a) : at(a) {} - void finish(int result) { - assert(result > 0); - at->_loaded(bl); - } -}; - -void AnchorTable::load(Context *onfinish) -{ - dout(7) << "load" << dendl; - assert(!opened); - - waiting_for_open.push_back(onfinish); - - C_AT_Load *fin = new C_AT_Load(this); - object_t oid = object_t(MDS_INO_ANCHORTABLE+mds->get_nodeid(), 0); - mds->objecter->read(oid, - 0, 0, - mds->objecter->osdmap->file_to_object_layout(oid, g_OSD_MDAnchorTableLayout), - &fin->bl, fin); -} - -void AnchorTable::_loaded(bufferlist& bl) -{ - dout(10) << "_loaded got " << bl.length() << " bytes" << dendl; - - int off = 0; - bl.copy(off, sizeof(version), (char*)&version); - off += sizeof(version); - - size_t size; - bl.copy(off, sizeof(size), (char*)&size); - off += sizeof(size); - - for (size_t n=0; n::iterator p = pending_reqmds.begin(); - p != pending_reqmds.end(); - p++) - resend_agree(p->first, p->second); -} - - -void AnchorTable::resend_agree(version_t v, int who) -{ - if (pending_create.count(v)) { - MAnchor *reply = new MAnchor(ANCHOR_OP_CREATE_AGREE, pending_create[v], v); - mds->send_message_mds(reply, who, MDS_PORT_ANCHORCLIENT); - } - else if (pending_destroy.count(v)) { - MAnchor 
*reply = new MAnchor(ANCHOR_OP_DESTROY_AGREE, pending_destroy[v], v); - mds->send_message_mds(reply, who, MDS_PORT_ANCHORCLIENT); - } - else { - assert(pending_update.count(v)); - MAnchor *reply = new MAnchor(ANCHOR_OP_UPDATE_AGREE, pending_update[v].first, v); - mds->send_message_mds(reply, who, MDS_PORT_ANCHORCLIENT); - } -} - -void AnchorTable::handle_mds_recovery(int who) -{ - dout(7) << "handle_mds_recovery mds" << who << dendl; - - // resend agrees for recovered mds - for (map::iterator p = pending_reqmds.begin(); - p != pending_reqmds.end(); - p++) { - if (p->second != who) continue; - resend_agree(p->first, p->second); - } -} diff --git a/branches/sage/mds/mds/AnchorTable.h b/branches/sage/mds/mds/AnchorTable.h deleted file mode 100644 index 64a2002ba7c85..0000000000000 --- a/branches/sage/mds/mds/AnchorTable.h +++ /dev/null @@ -1,127 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __ANCHORTABLE_H -#define __ANCHORTABLE_H - -#include "Anchor.h" -#include "include/Context.h" - -#include -using namespace __gnu_cxx; - -class MDS; -class MAnchor; - -class AnchorTable { - MDS *mds; - - // keep the entire table in memory. - hash_map anchor_map; - - // uncommitted operations - map pending_reqmds; - map pending_create; - map pending_destroy; - map > > pending_update; - - version_t version; // this includes anchor_map AND pending_* state. 
- version_t committing_version; - version_t committed_version; - - // load/save state - bool opening, opened; - - // waiters - list waiting_for_open; - map > waiting_for_save; - -protected: - - // basic updates - bool add(inodeno_t ino, dirfrag_t dirfrag); - void inc(inodeno_t ino); - void dec(inodeno_t ino); - - // mid-level - void create_prepare(inodeno_t ino, vector& trace, int reqmds); - void destroy_prepare(inodeno_t ino, int reqmds); - void update_prepare(inodeno_t ino, vector& trace, int reqmds); - void commit(version_t atid); - void rollback(version_t atid); - friend class EAnchor; // used for journal replay. - - // high level interface - void handle_lookup(MAnchor *req); - - void handle_create_prepare(MAnchor *req); - void _create_prepare_logged(MAnchor *req, version_t atid); - friend class C_AT_CreatePrepare; - - void handle_destroy_prepare(MAnchor *req); - void _destroy_prepare_logged(MAnchor *req, version_t atid); - friend class C_AT_DestroyPrepare; - - void handle_update_prepare(MAnchor *req); - void _update_prepare_logged(MAnchor *req, version_t atid); - friend class C_AT_UpdatePrepare; - - void handle_commit(MAnchor *req); - void _commit_logged(MAnchor *req); - friend class C_AT_Commit; - - void handle_rollback(MAnchor *req); - - // messages - void handle_anchor_request(MAnchor *m); - - void dump(); - -public: - AnchorTable(MDS *m) : - mds(m), - version(0), committing_version(0), committed_version(0), - opening(false), opened(false) { } - - void dispatch(class Message *m); - - version_t get_version() { return version; } - version_t get_committed_version() { return committed_version; } - - void create_fresh() { - // reset (i.e. on mkfs) to empty, but unsaved table. - version = 1; - opened = true; - opening = false; - anchor_map.clear(); - pending_create.clear(); - pending_destroy.clear(); - pending_update.clear(); - } - - // load/save entire table for now! 
- void save(Context *onfinish); - void _saved(version_t v); - void load(Context *onfinish); - void _loaded(bufferlist& bl); - - // recovery - void handle_mds_recovery(int who); - void finish_recovery(); - void resend_agree(version_t v, int who); - -}; - -#endif diff --git a/branches/sage/mds/mds/CDir.h b/branches/sage/mds/mds/CDir.h deleted file mode 100644 index 99bad3801e130..0000000000000 --- a/branches/sage/mds/mds/CDir.h +++ /dev/null @@ -1,540 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __CDIR_H -#define __CDIR_H - -#include "include/types.h" -#include "include/buffer.h" -#include "mdstypes.h" -#include "config.h" -#include "common/DecayCounter.h" - -#include -#include - -#include -#include -#include -#include -using namespace std; - -#include -using __gnu_cxx::hash_map; - - -#include "CInode.h" - -class CDentry; -class MDCache; -class MDCluster; -class Context; -class CDirDiscover; - - -ostream& operator<<(ostream& out, class CDir& dir); - - -class CDir : public MDSCacheObject { - public: - // -- pins -- - static const int PIN_DNWAITER = 1; - static const int PIN_INOWAITER = 2; - static const int PIN_CHILD = 3; - static const int PIN_FROZEN = 4; - static const int PIN_SUBTREE = 5; - static const int PIN_IMPORTING = 7; - static const int PIN_IMPORTBOUND = 9; - static const int PIN_EXPORTBOUND = 10; - static const int PIN_STICKY = 11; - static const int PIN_SUBTREETEMP = 12; // used by MDCache::trim_non_auth() - const char *pin_name(int p) { - switch (p) { - case PIN_DNWAITER: return "dnwaiter"; - case PIN_INOWAITER: return "inowaiter"; - case PIN_CHILD: return "child"; - 
case PIN_FROZEN: return "frozen"; - case PIN_SUBTREE: return "subtree"; - case PIN_IMPORTING: return "importing"; - case PIN_IMPORTBOUND: return "importbound"; - case PIN_EXPORTBOUND: return "exportbound"; - case PIN_STICKY: return "sticky"; - case PIN_SUBTREETEMP: return "subtreetemp"; - default: return generic_pin_name(p); - } - } - - // -- state -- - static const unsigned STATE_COMPLETE = (1<< 1); // the complete contents are in cache - static const unsigned STATE_FROZENTREE = (1<< 2); // root of tree (bounded by exports) - static const unsigned STATE_FREEZINGTREE = (1<< 3); // in process of freezing - static const unsigned STATE_FROZENDIR = (1<< 4); - static const unsigned STATE_FREEZINGDIR = (1<< 5); - static const unsigned STATE_COMMITTING = (1<< 6); // mid-commit - static const unsigned STATE_FETCHING = (1<< 7); // currenting fetching - static const unsigned STATE_IMPORTBOUND = (1<<10); - static const unsigned STATE_EXPORTBOUND = (1<<11); - static const unsigned STATE_EXPORTING = (1<<12); - static const unsigned STATE_IMPORTING = (1<<13); - static const unsigned STATE_FRAGMENTING = (1<<14); - static const unsigned STATE_STICKY = (1<<15); // sticky pin due to inode stickydirs - static const unsigned STATE_DNPINNEDFRAG = (1<<16); // dir is refragmenting - - // common states - static const unsigned STATE_CLEAN = 0; - static const unsigned STATE_INITIAL = 0; - - // these state bits are preserved by an import/export - // ...except if the directory is hashed, in which case none of them are! 
- static const unsigned MASK_STATE_EXPORTED = - (STATE_COMPLETE|STATE_DIRTY); - static const unsigned MASK_STATE_IMPORT_KEPT = - ( - STATE_IMPORTING - |STATE_IMPORTBOUND|STATE_EXPORTBOUND - |STATE_FROZENTREE - |STATE_STICKY); - static const unsigned MASK_STATE_EXPORT_KEPT = - (STATE_EXPORTING - |STATE_IMPORTBOUND|STATE_EXPORTBOUND - |STATE_FROZENTREE - |STATE_FROZENDIR - |STATE_STICKY); - static const unsigned MASK_STATE_FRAGMENT_KEPT = - (STATE_DIRTY | - STATE_COMPLETE | - STATE_EXPORTBOUND | - STATE_IMPORTBOUND); - - // -- rep spec -- - static const int REP_NONE = 0; - static const int REP_ALL = 1; - static const int REP_LIST = 2; - - - static const int NONCE_EXPORT = 1; - - - // -- wait masks -- - static const int WAIT_DENTRY = (1<<0); // wait for item to be in cache - static const int WAIT_COMPLETE = (1<<1); // wait for complete dir contents - static const int WAIT_FROZEN = (1<<2); // auth pins removed - - static const int WAIT_DNLOCK_OFFSET = 4; - - static const int WAIT_ANY = (0xffffffff); - static const int WAIT_ATFREEZEROOT = (WAIT_UNFREEZE); - static const int WAIT_ATSUBTREEROOT = (WAIT_SINGLEAUTH); - - - - - public: - // context - MDCache *cache; - - CInode *inode; // my inode - frag_t frag; // my frag - - bool is_lt(const MDSCacheObject *r) const { - return dirfrag() < ((const CDir*)r)->dirfrag(); - } - - //int hack_num_accessed; - -public: - //typedef hash_map map_t; // there is a bug somewhere, valgrind me. 
- typedef map map_t; -protected: - // contents - map_t items; // non-null AND null - unsigned nitems; // # non-null - unsigned nnull; // # null - - int num_dirty; - - - - // state - version_t version; - version_t committing_version; - version_t committed_version; - version_t committed_version_equivalent; // in case of, e.g., temporary file - version_t projected_version; - - xlist::item xlist_dirty; - - // lock nesting, freeze - int auth_pins; - int nested_auth_pins; - int request_pins; - - // cache control (defined for authority; hints for replicas) - int dir_rep; - set dir_rep_by; // if dir_rep == REP_LIST - - // popularity - dirfrag_load_vec_t pop_me; - dirfrag_load_vec_t pop_nested; - dirfrag_load_vec_t pop_auth_subtree; - dirfrag_load_vec_t pop_auth_subtree_nested; - - utime_t last_popularity_sample; - - load_spread_t pop_spread; - - // and to provide density - int num_dentries_nested; - int num_dentries_auth_subtree; - int num_dentries_auth_subtree_nested; - - - // friends - friend class Migrator; - friend class CInode; - friend class MDCache; - friend class MDiscover; - friend class MDBalancer; - - friend class CDirDiscover; - friend class CDirExport; - - public: - CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth); - - - - // -- accessors -- - inodeno_t ino() const { return inode->ino(); } // deprecate me? 
- frag_t get_frag() const { return frag; } - dirfrag_t dirfrag() const { return dirfrag_t(inode->ino(), frag); } - - CInode *get_inode() { return inode; } - CDir *get_parent_dir() { return inode->get_parent_dir(); } - - map_t::iterator begin() { return items.begin(); } - map_t::iterator end() { return items.end(); } - unsigned get_size() { - return nitems; - } - unsigned get_nitems() { return nitems; } - unsigned get_nnull() { return nnull; } - - void inc_num_dirty() { num_dirty++; } - void dec_num_dirty() { - assert(num_dirty > 0); - num_dirty--; - } - int get_num_dirty() { - return num_dirty; - } - - - // -- dentries and inodes -- - public: - CDentry* lookup(const string& n) { - map_t::iterator iter = items.find(n); - if (iter == items.end()) - return 0; - else - return iter->second; - } - - CDentry* add_null_dentry(const string& dname); - CDentry* add_primary_dentry(const string& dname, CInode *in); - CDentry* add_remote_dentry(const string& dname, inodeno_t ino, unsigned char d_type); - void remove_dentry( CDentry *dn ); // delete dentry - void link_remote_inode( CDentry *dn, inodeno_t ino, unsigned char d_type); - void link_primary_inode( CDentry *dn, CInode *in ); - void unlink_inode( CDentry *dn ); - void try_remove_unlinked_dn(CDentry *dn); -private: - void link_inode_work( CDentry *dn, CInode *in ); - void unlink_inode_work( CDentry *dn ); - void remove_null_dentries(); - -public: - void split(int bits, list& subs, list& waiters); - void merge(int bits, list& waiters); -private: - void steal_dentry(CDentry *dn); // from another dir. used by merge/split. 
- void purge_stolen(list& waiters); - void init_fragment_pins(); - - - // -- authority -- - /* - * normal: !subtree_root - * delegation: subtree_root - * ambiguous: subtree_root - * subtree_root - */ - pair dir_auth; - - public: - pair authority(); - pair get_dir_auth() { return dir_auth; } - void set_dir_auth(pair a); - void set_dir_auth(int a) { set_dir_auth(pair(a, CDIR_AUTH_UNKNOWN)); } - bool is_ambiguous_dir_auth() { - return dir_auth.second != CDIR_AUTH_UNKNOWN; - } - bool is_full_dir_auth() { - return is_auth() && !is_ambiguous_dir_auth(); - } - bool is_full_dir_nonauth() { - return !is_auth() && !is_ambiguous_dir_auth(); - } - - bool is_subtree_root() { - return dir_auth != CDIR_AUTH_DEFAULT; - } - - bool contains(CDir *x); // true if we are x or an ancestor of x - - - // for giving to clients - void get_dist_spec(set& ls, int auth) { - if (is_rep()) { - for (map::iterator p = replicas_begin(); - p != replicas_end(); - ++p) - ls.insert(p->first); - if (!ls.empty()) - ls.insert(auth); - } - } - - CDirDiscover *replicate_to(int mds); - - - // -- state -- - bool is_complete() { return state & STATE_COMPLETE; } - bool is_exporting() { return state & STATE_EXPORTING; } - bool is_importing() { return state & STATE_IMPORTING; } - - int get_dir_rep() { return dir_rep; } - bool is_rep() { - if (dir_rep == REP_NONE) return false; - return true; - } - - // -- fetch -- - object_t get_ondisk_object() { return object_t(ino(), frag); } - void fetch(Context *c, bool ignore_authpinnability=false); - void _fetched(bufferlist &bl); - - // -- commit -- - map > waiting_for_commit; - - void commit_to(version_t want); - void commit(version_t want, Context *c); - void _commit(version_t want); - void _committed(version_t v); - void wait_for_commit(Context *c, version_t v=0); - - // -- dirtyness -- - version_t get_version() { return version; } - void set_version(version_t v) { projected_version = version = v; } - version_t get_projected_version() { return projected_version; } - 
version_t get_committing_version() { return committing_version; } - version_t get_committed_version() { return committed_version; } - version_t get_committed_version_equivalent() { return committed_version_equivalent; } - void set_committed_version(version_t v) { committed_version = v; } - - version_t pre_dirty(version_t min=0); - void _mark_dirty(LogSegment *ls); - void mark_dirty(version_t pv, LogSegment *ls); - void mark_clean(); - void mark_complete() { state_set(STATE_COMPLETE); } - - - // -- reference counting -- - void first_get(); - void last_put(); - - void request_pin_get() { - if (request_pins == 0) get(PIN_REQUEST); - request_pins++; - } - void request_pin_put() { - request_pins--; - if (request_pins == 0) put(PIN_REQUEST); - } - - - // -- waiters -- -protected: - hash_map< string, list > waiting_on_dentry; - hash_map< inodeno_t, list > waiting_on_ino; - -public: - bool is_waiting_for_dentry(const string& dn) { - return waiting_on_dentry.count(dn); - } - void add_dentry_waiter(const string& dentry, Context *c); - void take_dentry_waiting(const string& dentry, list& ls); - - bool is_waiting_for_ino(inodeno_t ino) { - return waiting_on_ino.count(ino); - } - void add_ino_waiter(inodeno_t ino, Context *c); - void take_ino_waiting(inodeno_t ino, list& ls); - - void take_sub_waiting(list& ls); // dentry or ino - - void add_waiter(int mask, Context *c); - void take_waiting(int mask, list& ls); // may include dentry waiters - void finish_waiting(int mask, int result = 0); // ditto - - - // -- import/export -- - void encode_export(bufferlist& bl); - void finish_export(utime_t now); - void abort_export() { - put(PIN_TEMPEXPORTING); - } - void decode_import(bufferlist::iterator& blp); - - - // -- auth pins -- - bool can_auth_pin() { return is_auth() && !(is_frozen() || is_freezing()); } - int is_auth_pinned() { return auth_pins; } - int get_cum_auth_pins() { return auth_pins + nested_auth_pins; } - int get_auth_pins() { return auth_pins; } - int 
get_nested_auth_pins() { return nested_auth_pins; } - void auth_pin(); - void auth_unpin(); - void adjust_nested_auth_pins(int inc); - - // -- freezing -- - bool freeze_tree(); - void _freeze_tree(); - void unfreeze_tree(); - - bool freeze_dir(); - void _freeze_dir(); - void unfreeze_dir(); - - void maybe_finish_freeze() { - if (auth_pins != 1 || nested_auth_pins != 0) - return; - if (state_test(STATE_FREEZINGTREE)) { - _freeze_tree(); - auth_unpin(); - finish_waiting(WAIT_FROZEN); - } - if (state_test(STATE_FREEZINGDIR)) { - _freeze_dir(); - auth_unpin(); - finish_waiting(WAIT_FROZEN); - } - } - - bool is_freezing() { return is_freezing_tree() || is_freezing_dir(); } - bool is_freezing_tree(); - bool is_freezing_tree_root() { return state & STATE_FREEZINGTREE; } - bool is_freezing_dir() { return state & STATE_FREEZINGDIR; } - - bool is_frozen() { return is_frozen_dir() || is_frozen_tree(); } - bool is_frozen_tree(); - bool is_frozen_tree_root() { return state & STATE_FROZENTREE; } - bool is_frozen_dir() { return state & STATE_FROZENDIR; } - - bool is_freezeable(bool freezing=false) { - // no nested auth pins. - if ((auth_pins-freezing) > 0 || nested_auth_pins > 0) - return false; - - // inode must not be frozen. - if (!is_subtree_root() && inode->is_frozen()) - return false; - - return true; - } - bool is_freezeable_dir(bool freezing=false) { - if ((auth_pins-freezing) > 0) - return false; - - // if not subtree root, inode must not be frozen (tree--frozen_dir is okay). 
- if (!is_subtree_root() && inode->is_frozen() && !inode->is_frozen_dir()) - return false; - - return true; - } - - CDir *get_frozen_tree_root(); - - - - ostream& print_db_line_prefix(ostream& out); - void print(ostream& out); -}; - - - -// -- encoded state -- - -// discover - -class CDirDiscover { - dirfrag_t dirfrag; - int nonce; - int dir_rep; - set rep_by; - - public: - CDirDiscover() {} - CDirDiscover(CDir *dir, int nonce) { - dirfrag = dir->dirfrag(); - this->nonce = nonce; - dir_rep = dir->dir_rep; - rep_by = dir->dir_rep_by; - } - - void update_dir(CDir *dir) { - assert(dir->dirfrag() == dirfrag); - assert(!dir->is_auth()); - - dir->replica_nonce = nonce; - dir->dir_rep = dir_rep; - dir->dir_rep_by = rep_by; - } - - dirfrag_t get_dirfrag() { return dirfrag; } - - void _encode(bufferlist& bl) { - bl.append((char*)&dirfrag, sizeof(dirfrag)); - bl.append((char*)&nonce, sizeof(nonce)); - bl.append((char*)&dir_rep, sizeof(dir_rep)); - ::_encode(rep_by, bl); - } - - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - bl.copy(off, sizeof(nonce), (char*)&nonce); - off += sizeof(nonce); - bl.copy(off, sizeof(dir_rep), (char*)&dir_rep); - off += sizeof(dir_rep); - ::_decode(rep_by, bl, off); - } - -}; - - - -#endif diff --git a/branches/sage/mds/mds/Capability.h b/branches/sage/mds/mds/Capability.h deleted file mode 100644 index d7619d13ca156..0000000000000 --- a/branches/sage/mds/mds/Capability.h +++ /dev/null @@ -1,245 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __CAPABILITY_H -#define __CAPABILITY_H - -#include "include/buffer.h" - -#include -using namespace std; - -#include "config.h" - - -// definite caps -#define CAP_FILE_RDCACHE 1 // client can safely cache reads -#define CAP_FILE_RD 2 // client can read -#define CAP_FILE_WR 4 // client can write -#define CAP_FILE_WREXTEND 8 // client can extend file -#define CAP_FILE_WRBUFFER 16 // client can safely buffer writes -#define CAP_FILE_LAZYIO 32 // client can perform lazy io - - -// heuristics -//#define CAP_FILE_DELAYFLUSH 32 - -inline string cap_string(int cap) -{ - string s; - s = "["; - if (cap & CAP_FILE_RDCACHE) s += " rdcache"; - if (cap & CAP_FILE_RD) s += " rd"; - if (cap & CAP_FILE_WR) s += " wr"; - if (cap & CAP_FILE_WRBUFFER) s += " wrbuffer"; - if (cap & CAP_FILE_WRBUFFER) s += " wrextend"; - if (cap & CAP_FILE_LAZYIO) s += " lazyio"; - s += " ]"; - return s; -} - -typedef uint32_t capseq_t; - -class Capability { -public: - struct Export { - int wanted; - int issued; - int pending; - Export() {} - Export(int w, int i, int p) : wanted(w), issued(i), pending(p) {} - }; - -private: - int wanted_caps; // what the client wants (ideally) - - map cap_history; // seq -> cap - capseq_t last_sent, last_recv; - - bool suppress; - -public: - Capability(int want=0, capseq_t s=0) : - wanted_caps(want), - last_sent(s), - last_recv(s), - suppress(false) { - } - Capability(Export& other) : - wanted_caps(other.wanted), - last_sent(0), last_recv(0) { - // issued vs pending - if (other.issued & ~other.pending) - issue(other.issued); - issue(other.pending); - } - - bool is_suppress() { return suppress; } - void set_suppress(bool b) { suppress = b; } - - bool is_null() { return cap_history.empty() && wanted_caps == 0; } - - // most recently issued caps. 
- int pending() { - if (cap_history.count(last_sent)) - return cap_history[ last_sent ]; - return 0; - } - - // caps client has confirmed receipt of - int confirmed() { - if (cap_history.count(last_recv)) - return cap_history[ last_recv ]; - return 0; - } - - // caps potentially issued - int issued() { - int c = 0; - for (capseq_t seq = last_recv; seq <= last_sent; seq++) { - if (cap_history.count(seq)) { - c |= cap_history[seq]; - generic_dout(10) << " cap issued: seq " << seq << " " << cap_string(cap_history[seq]) << " -> " << cap_string(c) << dendl; - } - } - return c; - } - - // caps this client wants to hold - int wanted() { return wanted_caps; } - void set_wanted(int w) { - wanted_caps = w; - } - - // needed - static int needed(int from) { - // strip out wrbuffer, rdcache - return from & (CAP_FILE_WR|CAP_FILE_RD); - } - int needed() { return needed(wanted_caps); } - - // conflicts - static int conflicts(int from) { - int c = 0; - if (from & CAP_FILE_WRBUFFER) c |= CAP_FILE_RDCACHE|CAP_FILE_RD; - if (from & CAP_FILE_WR) c |= CAP_FILE_RDCACHE; - if (from & CAP_FILE_RD) c |= CAP_FILE_WRBUFFER; - if (from & CAP_FILE_RDCACHE) c |= CAP_FILE_WRBUFFER|CAP_FILE_WR; - return c; - } - int wanted_conflicts() { return conflicts(wanted()); } - int needed_conflicts() { return conflicts(needed()); } - int issued_conflicts() { return conflicts(issued()); } - - // issue caps; return seq number. - capseq_t issue(int c) { - //int was = pending(); - //no! if (c == was && last_sent) return -1; // repeat of previous? - - ++last_sent; - cap_history[last_sent] = c; - - /* no! - // not recalling, just adding? 
- if (c & ~was && - cap_history.count(last_sent-1)) { - cap_history.erase(last_sent-1); - } - */ - return last_sent; - } - capseq_t get_last_seq() { return last_sent; } - - Export make_export() { - return Export(wanted_caps, issued(), pending()); - } - void merge(Export& other) { - // issued + pending - int newpending = other.pending | pending(); - if (other.issued & ~newpending) - issue(other.issued | newpending); - issue(newpending); - - // wanted - wanted_caps = wanted_caps | other.wanted; - } - void merge(int otherwanted, int otherissued) { - // issued + pending - int newpending = pending(); - if (otherissued & ~newpending) - issue(otherissued | newpending); - issue(newpending); - - // wanted - wanted_caps = wanted_caps | otherwanted; - } - - // confirm receipt of a previous sent/issued seq. - int confirm_receipt(capseq_t seq, int caps) { - int r = 0; - - // old seqs - while (last_recv < seq) { - generic_dout(10) << " cap.confirm_receipt forgetting seq " << last_recv << " " << cap_string(cap_history[last_recv]) << dendl; - r |= cap_history[last_recv]; - cap_history.erase(last_recv); - ++last_recv; - } - - // release current? - if (cap_history.count(seq) && - cap_history[seq] != caps) { - generic_dout(10) << " cap.confirm_receipt revising seq " << seq << " " << cap_string(cap_history[seq]) << " -> " << cap_string(caps) << dendl; - // note what we're releasing.. - assert(cap_history[seq] & ~caps); - r |= cap_history[seq] & ~caps; - - cap_history[seq] = caps; // confirmed() now less than before.. - } - - // null? - if (caps == 0 && - cap_history.size() == 1 && - cap_history.count(seq)) { - cap_history.clear(); // viola, null! 
- } - - return r; - } - - // serializers - void _encode(bufferlist& bl) { - bl.append((char*)&wanted_caps, sizeof(wanted_caps)); - bl.append((char*)&last_sent, sizeof(last_sent)); - bl.append((char*)&last_recv, sizeof(last_recv)); - ::_encode(cap_history, bl); - } - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(wanted_caps), (char*)&wanted_caps); - off += sizeof(wanted_caps); - bl.copy(off, sizeof(last_sent), (char*)&last_sent); - off += sizeof(last_sent); - bl.copy(off, sizeof(last_recv), (char*)&last_recv); - off += sizeof(last_recv); - ::_decode(cap_history, bl, off); - } - -}; - - - - - -#endif diff --git a/branches/sage/mds/mds/ClientMap.cc b/branches/sage/mds/mds/ClientMap.cc deleted file mode 100644 index 1d781b9ba48c3..0000000000000 --- a/branches/sage/mds/mds/ClientMap.cc +++ /dev/null @@ -1,126 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#define DBLEVEL 20 - -#include "include/types.h" - -#include "MDS.h" -#include "ClientMap.h" - -#include "osdc/Filer.h" - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".clientmap " - - - -void ClientMap::init_inode() -{ - memset(&inode, 0, sizeof(inode)); - inode.ino = MDS_INO_CLIENTMAP_OFFSET + mds->get_nodeid(); - inode.layout = g_OSD_FileLayout; -} - - -// ---------------- -// LOAD - -class C_CM_Load : public Context { - ClientMap *clientmap; -public: - bufferlist bl; - C_CM_Load(ClientMap *cm) : clientmap(cm) {} - void finish(int r) { - clientmap->_load_finish(bl); - } -}; - -void ClientMap::load(Context *onload) -{ - dout(10) << "load" << dendl; - - init_inode(); - - if (onload) - waiting_for_load.push_back(onload); - - C_CM_Load *c = new C_CM_Load(this); - mds->filer->read(inode, - 0, inode.layout.fl_stripe_unit, - &c->bl, - c); - -} - -void ClientMap::_load_finish(bufferlist &bl) -{ - int off = 0; - decode(bl, off); - dout(10) << "_load_finish v " << version - << ", " << client_inst.size() << " clients, " - << bl.length() << " bytes" - << dendl; - projected = committing = committed = version; - finish_contexts(waiting_for_load); -} - - -// ---------------- -// SAVE - -class C_CM_Save : public Context { - ClientMap *clientmap; - version_t version; -public: - C_CM_Save(ClientMap *cm, version_t v) : clientmap(cm), version(v) {} - void finish(int r) { - clientmap->_save_finish(version); - } -}; - -void ClientMap::save(Context *onsave, version_t needv) -{ - dout(10) << "save needv " << needv << ", v " << version << dendl; - - if (needv && committing >= needv) { - assert(committing > committed); - commit_waiters[committing].push_back(onsave); - return; - } - - commit_waiters[version].push_back(onsave); - - bufferlist bl; - - init_inode(); - encode(bl); - committing = version; - mds->filer->write(inode, - 0, bl.length(), bl, - 0, - 0, new C_CM_Save(this, 
version)); -} - -void ClientMap::_save_finish(version_t v) -{ - dout(10) << "_save_finish v" << v << dendl; - committed = v; - - finish_contexts(commit_waiters[v]); - commit_waiters.erase(v); -} diff --git a/branches/sage/mds/mds/FileLock.h b/branches/sage/mds/mds/FileLock.h deleted file mode 100644 index 09868f7563fb6..0000000000000 --- a/branches/sage/mds/mds/FileLock.h +++ /dev/null @@ -1,227 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __FILELOCK_H -#define __FILELOCK_H - -#include -#include -using namespace std; - -#include "include/buffer.h" - -#include "SimpleLock.h" -#include "Capability.h" - -// states and such. -// C = cache reads, R = read, W = write, A = append, B = buffer writes, L = lazyio - -// -----auth-------- ---replica------- -#define LOCK_SYNC_ 1 // AR R . / C R . . . L R . / C R . . . L stat() -#define LOCK_GSYNCL -12 // A . . / C ? . . . L loner -> sync (*) -#define LOCK_GSYNCM -13 // A . . / . R . . . L - -#define LOCK_LOCK_ 2 // AR R W / C . . . . . . . / C . . . . . truncate() -#define LOCK_GLOCKR_ -3 // AR R . / C . . . . . . . / C . . . . . -#define LOCK_GLOCKL -4 // A . . / C . . . . . loner -> lock -#define LOCK_GLOCKM -5 // A . . / . . . . . . - -#define LOCK_MIXED 6 // AR . . / . R W A . L . . / . R . . . L -#define LOCK_GMIXEDR -7 // AR R . / . R . . . L . . / . R . . . L -#define LOCK_GMIXEDL -8 // A . . / . . . . . L loner -> mixed - -#define LOCK_LONER 9 // A . . / C R W A B L (lock) -#define LOCK_GLONERR -10 // A . . / . R . . . L -#define LOCK_GLONERM -11 // A . . / . R W A . 
L - -// (*) FIXME: how to let old loner keep R, somehow, during GSYNCL - -// 4 stable -// +9 transition -// 13 total - -inline const char *get_filelock_state_name(int n) { - switch (n) { - case LOCK_SYNC: return "sync"; - case LOCK_GSYNCL: return "gsyncl"; - case LOCK_GSYNCM: return "gsyncm"; - case LOCK_LOCK: return "lock"; - case LOCK_GLOCKR: return "glockr"; - case LOCK_GLOCKL: return "glockl"; - case LOCK_GLOCKM: return "glockm"; - case LOCK_MIXED: return "mixed"; - case LOCK_GMIXEDR: return "gmixedr"; - case LOCK_GMIXEDL: return "gmixedl"; - case LOCK_LONER: return "loner"; - case LOCK_GLONERR: return "glonerr"; - case LOCK_GLONERM: return "glonerm"; - default: assert(0); return 0; - } -} - - -/* no append scenarios: - -loner + truncate(): - - loner needs to lose A (?unless it's the loner doing the truncate?) -loner + statlite(size): - - loner needs to lose A - -any + statlite(size) - - all lose A - -any + statlite(mtime) - - all lose W - --> we need to add lonerfixed and mixedfixed states (and associated transitions) - in order to efficiently support statlite(size) and truncate(). until then, - we have to LOCK. - - */ - -// -- lock... hard or file - -class MDRequest; - -class FileLock : public SimpleLock { - public: - FileLock(MDSCacheObject *o, int t, int wo) : SimpleLock(o, t, wo) { } - - int get_replica_state() { - switch (state) { - case LOCK_LOCK: - case LOCK_GLOCKM: - case LOCK_GLOCKL: - case LOCK_GLOCKR: - case LOCK_LONER: - case LOCK_GLONERR: - case LOCK_GLONERM: - return LOCK_LOCK; - case LOCK_MIXED: - case LOCK_GMIXEDR: - return LOCK_MIXED; - case LOCK_SYNC: - return LOCK_SYNC; - - // after gather auth will bc LOCK_AC_MIXED or whatever - case LOCK_GSYNCM: - return LOCK_MIXED; - case LOCK_GSYNCL: - case LOCK_GMIXEDL: // ** LOCK isn't exact right state, but works. 
- return LOCK_LOCK; - - default: - assert(0); - } - return 0; - } - void export_twiddle() { - clear_gather(); - state = get_replica_state(); - } - - // read/write access - bool can_rdlock(MDRequest *mdr) { - if (!parent->is_auth()) return (state == LOCK_SYNC); - //if (state == LOCK_LOCK && mdr && xlock_by == mdr) return true; - if (state == LOCK_LOCK && !xlock_by) return true; - return - (state == LOCK_SYNC) || - (state == LOCK_GMIXEDR) || - (state == LOCK_GLOCKR); - } - bool can_rdlock_soon() { - if (parent->is_auth()) - return (state == LOCK_GLOCKL); - else - return false; - } - bool can_xlock_soon() { - if (parent->is_auth()) - return (state == LOCK_GLOCKR) || (state == LOCK_GLOCKL) - || (state == LOCK_GLOCKM); - else - return false; - } - - // client caps allowed - int caps_allowed_ever() { - if (parent->is_auth()) - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_WRBUFFER | CAP_FILE_LAZYIO; - else - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO; - } - int caps_allowed() { - if (parent->is_auth()) - switch (state) { - case LOCK_SYNC: - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO; - case LOCK_LOCK: - case LOCK_GLOCKR: - case LOCK_GLOCKL: - return CAP_FILE_RDCACHE; - - case LOCK_GLOCKM: - return 0; - - case LOCK_MIXED: - return CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_LAZYIO; - case LOCK_GMIXEDR: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - case LOCK_GMIXEDL: - return 0; - - case LOCK_LONER: // single client writer, of course. 
- return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_WRBUFFER | CAP_FILE_LAZYIO; - case LOCK_GLONERR: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - case LOCK_GLONERM: - return CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_LAZYIO; - - case LOCK_GSYNCL: - return CAP_FILE_RDCACHE | CAP_FILE_LAZYIO; - case LOCK_GSYNCM: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - } - else - switch (state) { - case LOCK_SYNC: - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO; - case LOCK_LOCK: - case LOCK_GLOCKR: - return CAP_FILE_RDCACHE; - case LOCK_GMIXEDR: - case LOCK_MIXED: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - } - assert(0); - return 0; - } - - void print(ostream& out) { - out << "("; - out << get_lock_type_name(get_type()) << " "; - out << get_filelock_state_name(get_state()); - if (!get_gather_set().empty()) out << " g=" << get_gather_set(); - if (is_rdlocked()) - out << " r=" << get_num_rdlocks(); - if (is_xlocked()) - out << " x=" << get_xlocked_by(); - out << ")"; - } -}; - - -#endif diff --git a/branches/sage/mds/mds/IdAllocator.cc b/branches/sage/mds/mds/IdAllocator.cc deleted file mode 100644 index 36a36ea9eb037..0000000000000 --- a/branches/sage/mds/mds/IdAllocator.cc +++ /dev/null @@ -1,205 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#define DBLEVEL 20 - -#include "IdAllocator.h" -#include "MDS.h" -#include "MDLog.h" - -#include "osdc/Filer.h" - -#include "include/types.h" - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".idalloc: " - - -void IdAllocator::init_inode() -{ - memset(&inode, 0, sizeof(inode)); - inode.ino = MDS_INO_IDS_OFFSET + mds->get_nodeid(); - inode.layout = g_OSD_FileLayout; -} - - -inodeno_t IdAllocator::alloc_id() -{ - assert(is_active()); - - // pick one - inodeno_t id = free.start(); - free.erase(id); - dout(10) << "idalloc " << this << ": alloc id " << id << dendl; - - version++; - - // log it - /* - if (!replay) - mds->mdlog->submit_entry(new EAlloc(IDTYPE_INO, id, EALLOC_EV_ALLOC, version)); - */ - - return id; -} - -void IdAllocator::reclaim_id(inodeno_t id) -{ - assert(is_active()); - - dout(10) << "idalloc " << this << ": reclaim id " << id << dendl; - free.insert(id); - - version++; - - /* - if (!replay) - mds->mdlog->submit_entry(new EAlloc(IDTYPE_INO, id, EALLOC_EV_FREE, version)); - */ -} - - - -class C_ID_Save : public Context { - IdAllocator *ida; - version_t version; -public: - C_ID_Save(IdAllocator *i, version_t v) : ida(i), version(v) {} - void finish(int r) { - ida->save_2(version); - } -}; - -void IdAllocator::save(Context *onfinish, version_t v) -{ - if (v > 0 && v <= committing_version) { - dout(10) << "save v " << version << " - already saving " - << committing_version << " >= needed " << v << dendl; - waitfor_save[v].push_back(onfinish); - return; - } - - dout(10) << "save v " << version << dendl; - assert(is_active()); - - bufferlist bl; - - bl.append((char*)&version, sizeof(version)); - ::_encode(free.m, bl); - - committing_version = version; - - if (onfinish) - waitfor_save[version].push_back(onfinish); - - // write (async) - mds->filer->write(inode, - 0, bl.length(), bl, - 0, - 0, new C_ID_Save(this, version)); -} - -void 
IdAllocator::save_2(version_t v) -{ - dout(10) << "save_2 v " << v << dendl; - - committed_version = v; - - list ls; - while (!waitfor_save.empty()) { - if (waitfor_save.begin()->first > v) break; - ls.splice(ls.end(), waitfor_save.begin()->second); - waitfor_save.erase(waitfor_save.begin()); - } - finish_contexts(ls,0); -} - - -void IdAllocator::reset() -{ - init_inode(); - - // use generic range. FIXME THIS IS CRAP - free.clear(); -#ifdef __LP64__ - uint64_t start = (uint64_t)(mds->get_nodeid()+1) << 40; - uint64_t end = ((uint64_t)(mds->get_nodeid()+2) << 40) - 1; -#else -# warning this looks like a 32-bit system, using small inode numbers. - uint64_t start = (uint64_t)(mds->get_nodeid()+1) << 25; - uint64_t end = ((uint64_t)(mds->get_nodeid()+2) << 25) - 1; -#endif - free.insert(start, end); - - state = STATE_ACTIVE; -} - - - -// ----------------------- - -class C_ID_Load : public Context { -public: - IdAllocator *ida; - Context *onfinish; - bufferlist bl; - C_ID_Load(IdAllocator *i, Context *o) : ida(i), onfinish(o) {} - void finish(int r) { - ida->load_2(r, bl, onfinish); - } -}; - -void IdAllocator::load(Context *onfinish) -{ - dout(10) << "load" << dendl; - - init_inode(); - - assert(is_undef()); - state = STATE_OPENING; - - C_ID_Load *c = new C_ID_Load(this, onfinish); - mds->filer->read(inode, - 0, inode.layout.fl_stripe_unit, - &c->bl, - c); -} - -void IdAllocator::load_2(int r, bufferlist& bl, Context *onfinish) -{ - assert(is_opening()); - state = STATE_ACTIVE; - - if (r > 0) { - dout(10) << "load_2 got " << bl.length() << " bytes" << dendl; - int off = 0; - bl.copy(off, sizeof(version), (char*)&version); - off += sizeof(version); - ::_decode(free.m, bl, off); - committed_version = version; - } - else { - dout(10) << "load_2 found no alloc file" << dendl; - assert(0); // this shouldn't happen if mkfs finished. 
- reset(); - } - - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } -} diff --git a/branches/sage/mds/mds/IdAllocator.h b/branches/sage/mds/mds/IdAllocator.h deleted file mode 100644 index 51001f2236627..0000000000000 --- a/branches/sage/mds/mds/IdAllocator.h +++ /dev/null @@ -1,78 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __IDALLOCATOR_H -#define __IDALLOCATOR_H - -#include "include/types.h" -#include "include/interval_set.h" -#include "include/buffer.h" -#include "include/Context.h" - -class MDS; - -class IdAllocator { - MDS *mds; - inode_t inode; - - static const int STATE_UNDEF = 0; - static const int STATE_OPENING = 1; - static const int STATE_ACTIVE = 2; - //static const int STATE_COMMITTING = 3; - int state; - - version_t version, committing_version, committed_version; - - interval_set free; // unused ids - - map > waitfor_save; - - public: - IdAllocator(MDS *m) : - mds(m), - state(STATE_UNDEF), - version(0), committing_version(0), committed_version(0) - { - } - - void init_inode(); - - // alloc or reclaim ids - inodeno_t alloc_id(); - void reclaim_id(inodeno_t ino); - - version_t get_version() { return version; } - version_t get_committed_version() { return committed_version; } - version_t get_committing_version() { return committing_version; } - - // load/save from disk (hack) - bool is_undef() { return state == STATE_UNDEF; } - bool is_active() { return state == STATE_ACTIVE; } - bool is_opening() { return state == STATE_OPENING; } - - void reset(); - void save(Context *onfinish=0, version_t need=0); - void save_2(version_t v); - - void shutdown() { - if 
(is_active()) save(0); - } - - void load(Context *onfinish); - void load_2(int, bufferlist&, Context *onfinish); - -}; - -#endif diff --git a/branches/sage/mds/mds/LocalLock.h b/branches/sage/mds/mds/LocalLock.h deleted file mode 100644 index 752fdcb4d3fd1..0000000000000 --- a/branches/sage/mds/mds/LocalLock.h +++ /dev/null @@ -1,61 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __LOCALLOCK_H -#define __LOCALLOCK_H - -#include "SimpleLock.h" - -class LocalLock : public SimpleLock { -protected: - int num_wrlock; - -public: - LocalLock(MDSCacheObject *o, int t, int wo) : - SimpleLock(o, t, wo), - num_wrlock(0) { - set_state(LOCK_LOCK); // always. 
- } - - bool can_wrlock() { - return !is_xlocked(); - } - void get_wrlock() { - assert(can_wrlock()); - if (num_wrlock == 0) parent->get(MDSCacheObject::PIN_LOCK); - ++num_wrlock; - } - void put_wrlock() { - --num_wrlock; - if (num_wrlock == 0) parent->put(MDSCacheObject::PIN_LOCK); - } - bool is_wrlocked() { return num_wrlock > 0; } - int get_num_wrlocks() { return num_wrlock; } - - - void print(ostream& out) { - out << "("; - out << get_lock_type_name(get_type()); - if (is_xlocked()) - out << " x=" << get_xlocked_by(); - if (is_wrlocked()) - out << " wr=" << get_num_wrlocks(); - out << ")"; - } - -}; - - -#endif diff --git a/branches/sage/mds/mds/Locker.cc b/branches/sage/mds/mds/Locker.cc deleted file mode 100644 index 902cfd79ed28e..0000000000000 --- a/branches/sage/mds/mds/Locker.cc +++ /dev/null @@ -1,2918 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#include "MDS.h" -#include "MDCache.h" -#include "Locker.h" -#include "CInode.h" -#include "CDir.h" -#include "CDentry.h" - -#include "MDLog.h" -#include "MDSMap.h" - -#include "include/filepath.h" - -#include "events/EString.h" -#include "events/EUpdate.h" - -#include "msg/Messenger.h" - -#include "messages/MGenericMessage.h" -#include "messages/MDiscover.h" -#include "messages/MDiscoverReply.h" - -#include "messages/MDirUpdate.h" - -#include "messages/MInodeFileCaps.h" - -#include "messages/MLock.h" -#include "messages/MDentryUnlink.h" - -#include "messages/MClientRequest.h" -#include "messages/MClientFileCaps.h" - -#include "messages/MMDSSlaveRequest.h" - -#include -#include - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".locker " - - - -void Locker::dispatch(Message *m) -{ - - switch (m->get_type()) { - - // locking - case MSG_MDS_LOCK: - handle_lock((MLock*)m); - break; - - // cache fun - case MSG_MDS_INODEFILECAPS: - handle_inode_file_caps((MInodeFileCaps*)m); - break; - - case MSG_CLIENT_FILECAPS: - handle_client_file_caps((MClientFileCaps*)m); - break; - - - - default: - assert(0); - } -} - - -void Locker::send_lock_message(SimpleLock *lock, int msg) -{ - for (map::iterator it = lock->get_parent()->replicas_begin(); - it != lock->get_parent()->replicas_end(); - it++) { - if (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN) - continue; - MLock *m = new MLock(lock, msg, mds->get_nodeid()); - mds->send_message_mds(m, it->first, MDS_PORT_LOCKER); - } -} - -void Locker::send_lock_message(SimpleLock *lock, int msg, const bufferlist &data) -{ - for (map::iterator it = lock->get_parent()->replicas_begin(); - it != lock->get_parent()->replicas_end(); - it++) { - if (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN) - continue; - MLock *m = new MLock(lock, msg, mds->get_nodeid()); - m->set_data(data); - mds->send_message_mds(m, 
it->first, MDS_PORT_LOCKER); - } -} - - - - - - - - - - - -bool Locker::acquire_locks(MDRequest *mdr, - set &rdlocks, - set &wrlocks, - set &xlocks) -{ - if (mdr->done_locking) { - dout(10) << "acquire_locks " << *mdr << " -- done locking" << dendl; - return true; // at least we had better be! - } - dout(10) << "acquire_locks " << *mdr << dendl; - - set sorted; // sort everything we will lock - set mustpin = xlocks; // items to authpin - - // xlocks - for (set::iterator p = xlocks.begin(); p != xlocks.end(); ++p) { - dout(20) << " must xlock " << **p << " " << *(*p)->get_parent() << dendl; - sorted.insert(*p); - - // augment xlock with a versionlock? - if ((*p)->get_type() > LOCK_OTYPE_IVERSION) { - // inode version lock? - CInode *in = (CInode*)(*p)->get_parent(); - if (mdr->is_master()) { - // master. wrlock versionlock so we can pipeline inode updates to journal. - wrlocks.insert(&in->versionlock); - } else { - // slave. exclusively lock the inode version (i.e. block other journal updates) - xlocks.insert(&in->versionlock); - sorted.insert(&in->versionlock); - } - } - } - - // wrlocks - for (set::iterator p = wrlocks.begin(); p != wrlocks.end(); ++p) { - dout(20) << " must wrlock " << **p << " " << *(*p)->get_parent() << dendl; - sorted.insert(*p); - if ((*p)->get_parent()->is_auth()) - mustpin.insert(*p); - else if ((*p)->get_type() == LOCK_OTYPE_IDIR && - !(*p)->get_parent()->is_auth() && !((ScatterLock*)(*p))->can_wrlock()) { // we might have to request a scatter - dout(15) << " will also auth_pin " << *(*p)->get_parent() << " in case we need to request a scatter" << dendl; - mustpin.insert(*p); - } - } - - // rdlocks - for (set::iterator p = rdlocks.begin(); - p != rdlocks.end(); - ++p) { - dout(20) << " must rdlock " << **p << " " << *(*p)->get_parent() << dendl; - sorted.insert(*p); - } - - - // AUTH PINS - map > mustpin_remote; // mds -> (object set) - - // can i auth pin them all now? 
- for (set::iterator p = mustpin.begin(); - p != mustpin.end(); - ++p) { - MDSCacheObject *object = (*p)->get_parent(); - - dout(10) << " must authpin " << *object << dendl; - - if (mdr->is_auth_pinned(object)) - continue; - - if (!object->is_auth()) { - if (object->is_ambiguous_auth()) { - // wait - dout(10) << " ambiguous auth, waiting to authpin " << *object << dendl; - object->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr)); - mds->locker->drop_locks(mdr); - mdr->drop_local_auth_pins(); - return false; - } - mustpin_remote[object->authority().first].insert(object); - continue; - } - if (!object->can_auth_pin()) { - // wait - dout(10) << " can't auth_pin (freezing?), waiting to authpin " << *object << dendl; - object->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); - mds->locker->drop_locks(mdr); - mdr->drop_local_auth_pins(); - return false; - } - } - - // ok, grab local auth pins - for (set::iterator p = mustpin.begin(); - p != mustpin.end(); - ++p) { - MDSCacheObject *object = (*p)->get_parent(); - if (mdr->is_auth_pinned(object)) { - dout(10) << " already auth_pinned " << *object << dendl; - } else if (object->is_auth()) { - dout(10) << " auth_pinning " << *object << dendl; - mdr->auth_pin(object); - } - } - - // request remote auth_pins - if (!mustpin_remote.empty()) { - for (map >::iterator p = mustpin_remote.begin(); - p != mustpin_remote.end(); - ++p) { - dout(10) << "requesting remote auth_pins from mds" << p->first << dendl; - - MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_AUTHPIN); - for (set::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - dout(10) << " req remote auth_pin of " << **q << dendl; - MDSCacheObjectInfo info; - (*q)->set_object_info(info); - req->get_authpins().push_back(info); - mdr->pin(*q); - } - mds->send_message_mds(req, p->first, MDS_PORT_SERVER); - - // put in waiting list - 
assert(mdr->more()->waiting_on_slave.count(p->first) == 0); - mdr->more()->waiting_on_slave.insert(p->first); - } - return false; - } - - // acquire locks. - // make sure they match currently acquired locks. - set::iterator existing = mdr->locks.begin(); - for (set::iterator p = sorted.begin(); - p != sorted.end(); - ++p) { - - // already locked? - if (existing != mdr->locks.end() && *existing == *p) { - // right kind? - SimpleLock *have = *existing; - existing++; - if (xlocks.count(*p) && mdr->xlocks.count(*p)) { - dout(10) << " already xlocked " << *have << " " << *have->get_parent() << dendl; - } - else if (wrlocks.count(*p) && mdr->wrlocks.count(*p)) { - dout(10) << " already wrlocked " << *have << " " << *have->get_parent() << dendl; - } - else if (rdlocks.count(*p) && mdr->rdlocks.count(*p)) { - dout(10) << " already rdlocked " << *have << " " << *have->get_parent() << dendl; - } - else assert(0); - continue; - } - - // hose any stray locks - while (existing != mdr->locks.end()) { - SimpleLock *stray = *existing; - existing++; - dout(10) << " unlocking out-of-order " << *stray << " " << *stray->get_parent() << dendl; - if (mdr->xlocks.count(stray)) - xlock_finish(stray, mdr); - else if (mdr->wrlocks.count(stray)) - wrlock_finish(stray, mdr); - else - rdlock_finish(stray, mdr); - } - - // lock - if (xlocks.count(*p)) { - if (!xlock_start(*p, mdr)) - return false; - dout(10) << " got xlock on " << **p << " " << *(*p)->get_parent() << dendl; - } else if (wrlocks.count(*p)) { - if (!wrlock_start(*p, mdr)) - return false; - dout(10) << " got wrlock on " << **p << " " << *(*p)->get_parent() << dendl; - } else { - if (!rdlock_start(*p, mdr)) - return false; - dout(10) << " got rdlock on " << **p << " " << *(*p)->get_parent() << dendl; - } - } - - // any extra unneeded locks? 
- while (existing != mdr->locks.end()) { - SimpleLock *stray = *existing; - existing++; - dout(10) << " unlocking extra " << *stray << " " << *stray->get_parent() << dendl; - if (mdr->xlocks.count(stray)) - xlock_finish(stray, mdr); - else if (mdr->wrlocks.count(stray)) - wrlock_finish(stray, mdr); - else - rdlock_finish(stray, mdr); - } - - return true; -} - - -void Locker::drop_locks(MDRequest *mdr) -{ - // leftover locks - while (!mdr->xlocks.empty()) - xlock_finish(*mdr->xlocks.begin(), mdr); - while (!mdr->rdlocks.empty()) - rdlock_finish(*mdr->rdlocks.begin(), mdr); - while (!mdr->wrlocks.empty()) - wrlock_finish(*mdr->wrlocks.begin(), mdr); -} - - -// generics - -bool Locker::rdlock_start(SimpleLock *lock, MDRequest *mdr) -{ - switch (lock->get_type()) { - case LOCK_OTYPE_IFILE: - return file_rdlock_start((FileLock*)lock, mdr); - case LOCK_OTYPE_IDIRFRAGTREE: - case LOCK_OTYPE_IDIR: - return scatter_rdlock_start((ScatterLock*)lock, mdr); - default: - return simple_rdlock_start(lock, mdr); - } -} - -void Locker::rdlock_finish(SimpleLock *lock, MDRequest *mdr) -{ - switch (lock->get_type()) { - case LOCK_OTYPE_IFILE: - return file_rdlock_finish((FileLock*)lock, mdr); - case LOCK_OTYPE_IDIRFRAGTREE: - case LOCK_OTYPE_IDIR: - return scatter_rdlock_finish((ScatterLock*)lock, mdr); - default: - return simple_rdlock_finish(lock, mdr); - } -} - -bool Locker::wrlock_start(SimpleLock *lock, MDRequest *mdr) -{ - switch (lock->get_type()) { - case LOCK_OTYPE_IDIRFRAGTREE: - case LOCK_OTYPE_IDIR: - return scatter_wrlock_start((ScatterLock*)lock, mdr); - case LOCK_OTYPE_IVERSION: - return local_wrlock_start((LocalLock*)lock, mdr); - default: - assert(0); - return false; - } -} - -void Locker::wrlock_finish(SimpleLock *lock, MDRequest *mdr) -{ - switch (lock->get_type()) { - case LOCK_OTYPE_IDIRFRAGTREE: - case LOCK_OTYPE_IDIR: - return scatter_wrlock_finish((ScatterLock*)lock, mdr); - case LOCK_OTYPE_IVERSION: - return local_wrlock_finish((LocalLock*)lock, mdr); - 
default: - assert(0); - } -} - -bool Locker::xlock_start(SimpleLock *lock, MDRequest *mdr) -{ - switch (lock->get_type()) { - case LOCK_OTYPE_IFILE: - return file_xlock_start((FileLock*)lock, mdr); - case LOCK_OTYPE_IVERSION: - return local_xlock_start((LocalLock*)lock, mdr); - case LOCK_OTYPE_IDIRFRAGTREE: - case LOCK_OTYPE_IDIR: - assert(0); - default: - return simple_xlock_start(lock, mdr); - } -} - -void Locker::xlock_finish(SimpleLock *lock, MDRequest *mdr) -{ - switch (lock->get_type()) { - case LOCK_OTYPE_IFILE: - return file_xlock_finish((FileLock*)lock, mdr); - case LOCK_OTYPE_IVERSION: - return local_xlock_finish((LocalLock*)lock, mdr); - case LOCK_OTYPE_IDIRFRAGTREE: - case LOCK_OTYPE_IDIR: - assert(0); - default: - return simple_xlock_finish(lock, mdr); - } -} - - - -/** rejoin_set_state - * @lock the lock - * @s the new state - * @waiters list for anybody waiting on this lock - */ -void Locker::rejoin_set_state(SimpleLock *lock, int s, list& waiters) -{ - if (!lock->is_stable()) { - lock->set_state(s); - lock->get_parent()->auth_unpin(); - } else { - lock->set_state(s); - } - lock->take_waiting(SimpleLock::WAIT_ALL, waiters); -} - - - - -// file i/o ----------------------------------------- - -version_t Locker::issue_file_data_version(CInode *in) -{ - dout(7) << "issue_file_data_version on " << *in << dendl; - return in->inode.file_data_version; -} - - -Capability* Locker::issue_new_caps(CInode *in, - int mode, - MClientRequest *req) -{ - dout(7) << "issue_new_caps for mode " << mode << " on " << *in << dendl; - - // my needs - int my_client = req->get_client().num(); - int my_want = 0; - if (mode & FILE_MODE_R) my_want |= CAP_FILE_RDCACHE | CAP_FILE_RD; - if (mode & FILE_MODE_W) my_want |= CAP_FILE_WRBUFFER | CAP_FILE_WR; - - // register a capability - Capability *cap = in->get_client_cap(my_client); - if (!cap) { - // new cap - Capability c(my_want); - in->add_client_cap(my_client, c); - cap = in->get_client_cap(my_client); - - // suppress file cap 
messages for new cap (we'll bundle with the open() reply) - cap->set_suppress(true); - } else { - // make sure it has sufficient caps - if (my_want & ~cap->wanted()) { - // augment wanted caps for this client - cap->set_wanted( cap->wanted() | my_want ); - } - } - - int before = cap->pending(); - - if (in->is_auth()) { - // [auth] twiddle mode? - if (in->filelock.is_stable()) - file_eval(&in->filelock); - } else { - // [replica] tell auth about any new caps wanted - request_inode_file_caps(in); - } - - // issue caps (pot. incl new one) - issue_caps(in); // note: _eval above may have done this already... - - // re-issue whatever we can - cap->issue(cap->pending()); - - // ok, stop suppressing. - cap->set_suppress(false); - - int now = cap->pending(); - if (before != now && - (before & CAP_FILE_WR) == 0 && - (now & CAP_FILE_WR)) { - // FIXME FIXME FIXME - } - - // twiddle file_data_version? - if ((before & CAP_FILE_WRBUFFER) == 0 && - (now & CAP_FILE_WRBUFFER)) { - in->inode.file_data_version++; - dout(7) << " incrementing file_data_version, now " << in->inode.file_data_version << " for " << *in << dendl; - } - - return cap; -} - - - -bool Locker::issue_caps(CInode *in) -{ - // allowed caps are determined by the lock mode. - int allowed = in->filelock.caps_allowed(); - dout(7) << "issue_caps filelock allows=" << cap_string(allowed) - << " on " << *in << dendl; - - // count conflicts with - int nissued = 0; - - // client caps - for (map::iterator it = in->client_caps.begin(); - it != in->client_caps.end(); - it++) { - if (it->second.pending() != (it->second.wanted() & allowed)) { - // issue - nissued++; - - int before = it->second.pending(); - long seq = it->second.issue(it->second.wanted() & allowed); - int after = it->second.pending(); - - // twiddle file_data_version? 
- if (!(before & CAP_FILE_WRBUFFER) && - (after & CAP_FILE_WRBUFFER)) { - dout(7) << " incrementing file_data_version for " << *in << dendl; - in->inode.file_data_version++; - } - - if (seq > 0 && - !it->second.is_suppress()) { - dout(7) << " sending MClientFileCaps to client" << it->first << " seq " << it->second.get_last_seq() << " new pending " << cap_string(it->second.pending()) << " was " << cap_string(before) << dendl; - mds->send_message_client(new MClientFileCaps(MClientFileCaps::OP_GRANT, - in->inode, - it->second.get_last_seq(), - it->second.pending(), - it->second.wanted()), - it->first); - } - } - } - - return (nissued == 0); // true if no re-issued, no callbacks -} - - -class C_MDL_RequestInodeFileCaps : public Context { - Locker *locker; - CInode *in; -public: - C_MDL_RequestInodeFileCaps(Locker *l, CInode *i) : locker(l), in(i) {} - void finish(int r) { - in->put(CInode::PIN_PTRWAITER); - if (!in->is_auth()) - locker->request_inode_file_caps(in); - } -}; - -void Locker::request_inode_file_caps(CInode *in) -{ - int wanted = in->get_caps_wanted(); - if (wanted != in->replica_caps_wanted) { - - if (wanted == 0) { - if (in->replica_caps_wanted_keep_until > g_clock.recent_now()) { - // ok, release them finally! 
- in->replica_caps_wanted_keep_until.sec_ref() = 0; - dout(7) << "request_inode_file_caps " << cap_string(wanted) - << " was " << cap_string(in->replica_caps_wanted) - << " no keeping anymore " - << " on " << *in - << dendl; - } - else if (in->replica_caps_wanted_keep_until.sec() == 0) { - in->replica_caps_wanted_keep_until = g_clock.recent_now(); - in->replica_caps_wanted_keep_until.sec_ref() += 2; - - dout(7) << "request_inode_file_caps " << cap_string(wanted) - << " was " << cap_string(in->replica_caps_wanted) - << " keeping until " << in->replica_caps_wanted_keep_until - << " on " << *in - << dendl; - return; - } else { - // wait longer - return; - } - } else { - in->replica_caps_wanted_keep_until.sec_ref() = 0; - } - assert(!in->is_auth()); - - // wait for single auth - if (in->is_ambiguous_auth()) { - in->get(CInode::PIN_PTRWAITER); - in->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, - new C_MDL_RequestInodeFileCaps(this, in)); - return; - } - - int auth = in->authority().first; - dout(7) << "request_inode_file_caps " << cap_string(wanted) - << " was " << cap_string(in->replica_caps_wanted) - << " on " << *in << " to mds" << auth << dendl; - assert(!in->is_auth()); - - in->replica_caps_wanted = wanted; - - if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) - mds->send_message_mds(new MInodeFileCaps(in->ino(), mds->get_nodeid(), - in->replica_caps_wanted), - auth, MDS_PORT_LOCKER); - } else { - in->replica_caps_wanted_keep_until.sec_ref() = 0; - } -} - -void Locker::handle_inode_file_caps(MInodeFileCaps *m) -{ - // nobody should be talking to us during recovery. 
- assert(mds->is_rejoin() || mds->is_active() || mds->is_stopping()); - - // ok - CInode *in = mdcache->get_inode(m->get_ino()); - assert(in); - assert(in->is_auth()); - - if (mds->is_rejoin() && - in->is_rejoining()) { - dout(7) << "handle_inode_file_caps still rejoining " << *in << ", dropping " << *m << dendl; - delete m; - return; - } - - - dout(7) << "handle_inode_file_caps replica mds" << m->get_from() << " wants caps " << cap_string(m->get_caps()) << " on " << *in << dendl; - - if (m->get_caps()) - in->mds_caps_wanted[m->get_from()] = m->get_caps(); - else - in->mds_caps_wanted.erase(m->get_from()); - - if (in->filelock.is_stable()) - try_file_eval(&in->filelock); // ** may or may not be auth_pinned ** - delete m; -} - - -/* - * note: we only get these from the client if - * - we are calling back previously issued caps (fewer than the client previously had) - * - or if the client releases (any of) its caps on its own - */ -void Locker::handle_client_file_caps(MClientFileCaps *m) -{ - int client = m->get_source().num(); - CInode *in = mdcache->get_inode(m->get_ino()); - Capability *cap = 0; - if (in) - cap = in->get_client_cap(client); - - if (!in || !cap) { - if (!in) { - dout(7) << "handle_client_file_caps on unknown ino " << m->get_ino() << ", dropping" << dendl; - } else { - dout(7) << "handle_client_file_caps no cap for client" << client << " on " << *in << dendl; - } - delete m; - return; - } - - assert(cap); - - // filter wanted based on what we could ever give out (given auth/replica status) - int wanted = m->get_wanted() & in->filelock.caps_allowed_ever(); - - dout(7) << "handle_client_file_caps seq " << m->get_seq() - << " confirms caps " << cap_string(m->get_caps()) - << " wants " << cap_string(wanted) - << " from client" << client - << " on " << *in - << dendl; - - // update wanted - if (cap->wanted() != wanted) { - if (m->get_seq() < cap->get_last_seq()) { - /* this is awkward. - client may be trying to release caps (i.e. inode closed, etc.) 
by setting reducing wanted - set. - but it may also be opening the same filename, not sure that it'll map to the same inode. - so, we don't want wanted reductions to clobber mds's notion of wanted unless we're - sure the client has seen all the latest caps. - */ - dout(10) << "handle_client_file_caps ignoring wanted " << cap_string(m->get_wanted()) - << " bc seq " << m->get_seq() << " < " << cap->get_last_seq() << dendl; - } else { - cap->set_wanted(wanted); - } - } - - // confirm caps - int had = cap->confirm_receipt(m->get_seq(), m->get_caps()); - int has = cap->confirmed(); - if (cap->is_null()) { - dout(7) << " cap for client" << client << " is now null, removing from " << *in << dendl; - in->remove_client_cap(client); - if (!in->is_any_caps()) - in->xlist_open_file.remove_myself(); // unpin logsegment - if (!in->is_auth()) - request_inode_file_caps(in); - - // tell client. - MClientFileCaps *r = new MClientFileCaps(MClientFileCaps::OP_RELEASE, - in->inode, - 0, 0, 0); - mds->send_message_client(r, m->get_source_inst()); - } - - // merge in atime? 
- if (m->get_inode().atime > in->inode.atime) { - dout(7) << " taking atime " << m->get_inode().atime << " > " - << in->inode.atime << " for " << *in << dendl; - in->inode.atime = m->get_inode().atime; - } - - if ((has|had) & CAP_FILE_WR) { - bool dirty = false; - - // mtime - if (m->get_inode().mtime > in->inode.mtime) { - dout(7) << " taking mtime " << m->get_inode().mtime << " > " - << in->inode.mtime << " for " << *in << dendl; - in->inode.mtime = m->get_inode().mtime; - dirty = true; - } - // size - if (m->get_inode().size > in->inode.size) { - dout(7) << " taking size " << m->get_inode().size << " > " - << in->inode.size << " for " << *in << dendl; - in->inode.size = m->get_inode().size; - dirty = true; - } - - if (dirty) - mds->mdlog->submit_entry(new EString("cap inode update dirty fixme")); - } - - // reevaluate, waiters - if (!in->filelock.is_stable()) - file_eval_gather(&in->filelock); - else if (in->is_auth()) - file_eval(&in->filelock); - - //in->finish_waiting(CInode::WAIT_CAPS, 0); // note: any users for this? - - delete m; -} - - - - - - - - - - -// locks ---------------------------------------------------------------- - -SimpleLock *Locker::get_lock(int lock_type, MDSCacheObjectInfo &info) -{ - switch (lock_type) { - case LOCK_OTYPE_DN: - { - // be careful; info.dirfrag may have incorrect frag; recalculate based on dname. 
- CInode *diri = mdcache->get_inode(info.dirfrag.ino); - frag_t fg; - CDir *dir = 0; - CDentry *dn = 0; - if (diri) { - fg = diri->pick_dirfrag(info.dname); - dir = diri->get_dirfrag(fg); - if (dir) - dn = dir->lookup(info.dname); - } - if (!dn) { - dout(7) << "get_lock don't have dn " << info.dirfrag.ino << " " << info.dname << dendl; - return 0; - } - return &dn->lock; - } - - case LOCK_OTYPE_IAUTH: - case LOCK_OTYPE_ILINK: - case LOCK_OTYPE_IDIRFRAGTREE: - case LOCK_OTYPE_IFILE: - case LOCK_OTYPE_IDIR: - { - CInode *in = mdcache->get_inode(info.ino); - if (!in) { - dout(7) << "get_lock don't have ino " << info.ino << dendl; - return 0; - } - switch (lock_type) { - case LOCK_OTYPE_IAUTH: return &in->authlock; - case LOCK_OTYPE_ILINK: return &in->linklock; - case LOCK_OTYPE_IDIRFRAGTREE: return &in->dirfragtreelock; - case LOCK_OTYPE_IFILE: return &in->filelock; - case LOCK_OTYPE_IDIR: return &in->dirlock; - } - } - - default: - dout(7) << "get_lock don't know lock_type " << lock_type << dendl; - assert(0); - break; - } - - return 0; -} - - -void Locker::handle_lock(MLock *m) -{ - // nobody should be talking to us during recovery. 
- assert(mds->is_rejoin() || mds->is_active() || mds->is_stopping()); - - SimpleLock *lock = get_lock(m->get_lock_type(), m->get_object_info()); - if (!lock) { - dout(10) << "don't have object " << m->get_object_info() << ", must have trimmed, dropping" << dendl; - delete m; - return; - } - - switch (lock->get_type()) { - case LOCK_OTYPE_DN: - case LOCK_OTYPE_IAUTH: - case LOCK_OTYPE_ILINK: - handle_simple_lock(lock, m); - break; - - case LOCK_OTYPE_IFILE: - handle_file_lock((FileLock*)lock, m); - break; - - case LOCK_OTYPE_IDIRFRAGTREE: - case LOCK_OTYPE_IDIR: - handle_scatter_lock((ScatterLock*)lock, m); - break; - - default: - dout(7) << "handle_lock got otype " << m->get_lock_type() << dendl; - assert(0); - break; - } -} - - - - - -// ========================================================================== -// simple lock - -void Locker::handle_simple_lock(SimpleLock *lock, MLock *m) -{ - int from = m->get_asker(); - - if (mds->is_rejoin()) { - if (lock->get_parent()->is_rejoining()) { - dout(7) << "handle_simple_lock still rejoining " << *lock->get_parent() - << ", dropping " << *m << dendl; - delete m; - return; - } - } - - switch (m->get_action()) { - // -- replica -- - case LOCK_AC_SYNC: - assert(lock->get_state() == LOCK_LOCK); - lock->decode_locked_state(m->get_data()); - lock->set_state(LOCK_SYNC); - lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); - - // special case: trim replica no-longer-null dentry? - if (lock->get_type() == LOCK_OTYPE_DN) { - CDentry *dn = (CDentry*)lock->get_parent(); - if (dn->is_null() && m->get_data().length() > 0) { - dout(10) << "handle_simple_lock replica dentry null -> non-null, must trim " - << *dn << dendl; - assert(dn->get_num_ref() == 0); - map expiremap; - mdcache->trim_dentry(dn, expiremap); - mdcache->send_expire_messages(expiremap); - } - } - break; - - case LOCK_AC_LOCK: - assert(lock->get_state() == LOCK_SYNC); - //|| lock->get_state() == LOCK_GLOCKR); - - // wait for readers to finish? 
- if (lock->is_rdlocked()) { - dout(7) << "handle_simple_lock has reader, waiting before ack on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->set_state(LOCK_GLOCKR); - } else { - // update lock and reply - lock->set_state(LOCK_LOCK); - mds->send_message_mds(new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid()), - from, MDS_PORT_LOCKER); - } - break; - - - // -- auth -- - case LOCK_AC_LOCKACK: - assert(lock->get_state() == LOCK_GLOCKR); - assert(lock->is_gathering(from)); - lock->remove_gather(from); - - if (lock->is_gathering()) { - dout(7) << "handle_simple_lock " << *lock << " on " << *lock->get_parent() << " from " << from - << ", still gathering " << lock->get_gather_set() << dendl; - } else { - dout(7) << "handle_simple_lock " << *lock << " on " << *lock->get_parent() << " from " << from - << ", last one" << dendl; - simple_eval_gather(lock); - } - break; - - } - - delete m; -} - -/* unused, currently. - -class C_Locker_SimpleEval : public Context { - Locker *locker; - SimpleLock *lock; -public: - C_Locker_SimpleEval(Locker *l, SimpleLock *lk) : locker(l), lock(lk) {} - void finish(int r) { - locker->try_simple_eval(lock); - } -}; - -void Locker::try_simple_eval(SimpleLock *lock) -{ - // unstable and ambiguous auth? 
- if (!lock->is_stable() && - lock->get_parent()->is_ambiguous_auth()) { - dout(7) << "simple_eval not stable and ambiguous auth, waiting on " << *lock->get_parent() << dendl; - //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_SimpleEval(this, lock)); - return; - } - - if (!lock->get_parent()->is_auth()) { - dout(7) << "try_simple_eval not auth for " << *lock->get_parent() << dendl; - return; - } - - if (!lock->get_parent()->can_auth_pin()) { - dout(7) << "try_simple_eval can't auth_pin, waiting on " << *lock->get_parent() << dendl; - //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - lock->get_parent()->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_Locker_SimpleEval(this, lock)); - return; - } - - if (lock->is_stable()) - simple_eval(lock); -} -*/ - -void Locker::simple_eval_gather(SimpleLock *lock) -{ - dout(10) << "simple_eval_gather " << *lock << " on " << *lock->get_parent() << dendl; - - // finished gathering? - if (lock->get_state() == LOCK_GLOCKR && - !lock->is_gathering() && - !lock->is_rdlocked()) { - dout(7) << "simple_eval finished gather on " << *lock << " on " << *lock->get_parent() << dendl; - - // replica: tell auth - if (!lock->get_parent()->is_auth()) { - int auth = lock->get_parent()->authority().first; - if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) - mds->send_message_mds(new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid()), - lock->get_parent()->authority().first, MDS_PORT_LOCKER); - } - - lock->set_state(LOCK_LOCK); - lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR); - - if (lock->get_parent()->is_auth()) { - lock->get_parent()->auth_unpin(); - - // re-eval? 
- simple_eval(lock); - } - } -} - -void Locker::simple_eval(SimpleLock *lock) -{ - dout(10) << "simple_eval " << *lock << " on " << *lock->get_parent() << dendl; - - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - if (lock->get_parent()->is_frozen()) return; - - // stable -> sync? - if (!lock->is_xlocked() && - lock->get_state() != LOCK_SYNC && - !lock->is_waiter_for(SimpleLock::WAIT_WR)) { - dout(7) << "simple_eval stable, syncing " << *lock - << " on " << *lock->get_parent() << dendl; - simple_sync(lock); - } - -} - - -// mid - -void Locker::simple_sync(SimpleLock *lock) -{ - dout(7) << "simple_sync on " << *lock << " on " << *lock->get_parent() << dendl; - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - // check state - if (lock->get_state() == LOCK_SYNC) - return; // already sync - assert(lock->get_state() == LOCK_LOCK); - - // sync. - if (lock->get_parent()->is_replicated()) { - // hard data - bufferlist data; - lock->encode_locked_state(data); - - // bcast to replicas - send_lock_message(lock, LOCK_AC_SYNC, data); - } - - // change lock - lock->set_state(LOCK_SYNC); - - // waiters? - lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); -} - -void Locker::simple_lock(SimpleLock *lock) -{ - dout(7) << "simple_lock on " << *lock << " on " << *lock->get_parent() << dendl; - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - // check state - if (lock->get_state() == LOCK_LOCK) return; - assert(lock->get_state() == LOCK_SYNC); - - if (lock->get_parent()->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_LOCK); - - // change lock - lock->set_state(LOCK_GLOCKR); - lock->init_gather(); - lock->get_parent()->auth_pin(); - } else { - lock->set_state(LOCK_LOCK); - } -} - - -// top - -bool Locker::simple_rdlock_try(SimpleLock *lock, Context *con) -{ - dout(7) << "simple_rdlock_try on " << *lock << " on " << *lock->get_parent() << dendl; - - // can read? grab ref. 
- if (lock->can_rdlock(0)) - return true; - - assert(!lock->get_parent()->is_auth()); - - // wait! - dout(7) << "simple_rdlock_try waiting on " << *lock << " on " << *lock->get_parent() << dendl; - if (con) lock->add_waiter(SimpleLock::WAIT_RD, con); - return false; -} - -bool Locker::simple_rdlock_start(SimpleLock *lock, MDRequest *mdr) -{ - dout(7) << "simple_rdlock_start on " << *lock << " on " << *lock->get_parent() << dendl; - - // can read? grab ref. - if (lock->can_rdlock(mdr)) { - lock->get_rdlock(); - mdr->rdlocks.insert(lock); - mdr->locks.insert(lock); - return true; - } - - // wait! - dout(7) << "simple_rdlock_start waiting on " << *lock << " on " << *lock->get_parent() << dendl; - lock->add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); - return false; -} - -void Locker::simple_rdlock_finish(SimpleLock *lock, MDRequest *mdr) -{ - // drop ref - lock->put_rdlock(); - if (mdr) { - mdr->rdlocks.erase(lock); - mdr->locks.erase(lock); - } - - dout(7) << "simple_rdlock_finish on " << *lock << " on " << *lock->get_parent() << dendl; - - // last one? - if (!lock->is_rdlocked()) - simple_eval_gather(lock); -} - -bool Locker::simple_xlock_start(SimpleLock *lock, MDRequest *mdr) -{ - dout(7) << "simple_xlock_start on " << *lock << " on " << *lock->get_parent() << dendl; - - // xlock by me? - if (lock->is_xlocked() && - lock->get_xlocked_by() == mdr) - return true; - - // auth? - if (lock->get_parent()->is_auth()) { - // auth - - // lock. - if (lock->get_state() == LOCK_SYNC) - simple_lock(lock); - - // already locked? - if (lock->get_state() == LOCK_LOCK) { - if (lock->is_xlocked()) { - // by someone else. - lock->add_waiter(SimpleLock::WAIT_WR, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - - // xlock. 
- lock->get_xlock(mdr); - mdr->xlocks.insert(lock); - mdr->locks.insert(lock); - return true; - } else { - // wait for lock - lock->add_waiter(SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - } else { - // replica - // this had better not be a remote xlock attempt! - assert(!mdr->slave_request); - - // wait for single auth - if (lock->get_parent()->is_ambiguous_auth()) { - lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, - new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - - // send lock request - int auth = lock->get_parent()->authority().first; - mdr->more()->slaves.insert(auth); - MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_XLOCK); - r->set_lock_type(lock->get_type()); - lock->get_parent()->set_object_info(r->get_object_info()); - mds->send_message_mds(r, auth, MDS_PORT_SERVER); - - // wait - lock->add_waiter(SimpleLock::WAIT_REMOTEXLOCK, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } -} - - -void Locker::simple_xlock_finish(SimpleLock *lock, MDRequest *mdr) -{ - dout(7) << "simple_xlock_finish on " << *lock << " on " << *lock->get_parent() << dendl; - - // drop ref - assert(lock->can_xlock(mdr)); - lock->put_xlock(); - assert(mdr); - mdr->xlocks.erase(lock); - mdr->locks.erase(lock); - - // remote xlock? - if (!lock->get_parent()->is_auth()) { - // tell auth - dout(7) << "simple_xlock_finish releasing remote xlock on " << *lock->get_parent() << dendl; - int auth = lock->get_parent()->authority().first; - if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) { - MMDSSlaveRequest *slavereq = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_UNXLOCK); - slavereq->set_lock_type(lock->get_type()); - lock->get_parent()->set_object_info(slavereq->get_object_info()); - mds->send_message_mds(slavereq, auth, MDS_PORT_SERVER); - } - } - - // others waiting? - lock->finish_waiters(SimpleLock::WAIT_WR, 0); - - // eval? 
- if (lock->get_parent()->is_auth()) - simple_eval(lock); -} - - - -// dentry specific helpers - -/** dentry_can_rdlock_trace - * see if we can _anonymously_ rdlock an entire trace. - * if not, and req is specified, wait and retry that message. - */ -bool Locker::dentry_can_rdlock_trace(vector& trace) -{ - // verify dentries are rdlockable. - // we do this because - // - we're being less aggressive about locks acquisition, and - // - we're not acquiring the locks in order! - for (vector::iterator it = trace.begin(); - it != trace.end(); - it++) { - CDentry *dn = *it; - if (!dn->lock.can_rdlock(0)) { - dout(10) << "can_rdlock_trace can't rdlock " << *dn << dendl; - return false; - } - } - return true; -} - -void Locker::dentry_anon_rdlock_trace_start(vector& trace) -{ - // grab dentry rdlocks - for (vector::iterator it = trace.begin(); - it != trace.end(); - it++) { - dout(10) << "dentry_anon_rdlock_trace_start rdlocking " << (*it)->lock << " " << **it << dendl; - (*it)->lock.get_rdlock(); - } -} - - -void Locker::dentry_anon_rdlock_trace_finish(vector& trace) -{ - for (vector::iterator it = trace.begin(); - it != trace.end(); - it++) - simple_rdlock_finish(&(*it)->lock, 0); -} - - - -// ========================================================================== -// scatter lock - -bool Locker::scatter_rdlock_start(ScatterLock *lock, MDRequest *mdr) -{ - dout(7) << "scatter_rdlock_start on " << *lock - << " on " << *lock->get_parent() << dendl; - - // read on stable scattered replica? - if (lock->get_state() == LOCK_SCATTER && - !lock->get_parent()->is_auth()) { - dout(7) << "scatter_rdlock_start scatterlock read on a stable scattered replica, fw to auth" << dendl; - mdcache->request_forward(mdr, lock->get_parent()->authority().first); - return false; - } - - // pre-twiddle? - if (lock->get_state() == LOCK_SCATTER && - lock->get_parent()->is_auth() && - !lock->get_parent()->is_replicated() && - !lock->is_wrlocked()) - scatter_sync(lock); - - // can rdlock? 
- if (lock->can_rdlock(mdr)) { - lock->get_rdlock(); - mdr->rdlocks.insert(lock); - mdr->locks.insert(lock); - return true; - } - - // wait for read. - lock->add_waiter(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); - - // initiate sync or tempsync? - if (lock->is_stable() && - lock->get_parent()->is_auth()) { - if (lock->get_parent()->is_replicated()) - scatter_tempsync(lock); - else - scatter_sync(lock); - } - - return false; -} - -void Locker::scatter_rdlock_finish(ScatterLock *lock, MDRequest *mdr) -{ - dout(7) << "scatter_rdlock_finish on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->put_rdlock(); - if (mdr) { - mdr->rdlocks.erase(lock); - mdr->locks.erase(lock); - } - - scatter_eval_gather(lock); -} - - -bool Locker::scatter_wrlock_start(ScatterLock *lock, MDRequest *mdr) -{ - dout(7) << "scatter_wrlock_start on " << *lock - << " on " << *lock->get_parent() << dendl; - - // pre-twiddle? - if (lock->get_parent()->is_auth() && - !lock->get_parent()->is_replicated() && - !lock->is_rdlocked() && - !lock->is_xlocked() && - lock->get_state() == LOCK_SYNC) - lock->set_state(LOCK_SCATTER); - //scatter_scatter(lock); - - // can wrlock? - if (lock->can_wrlock()) { - lock->get_wrlock(); - mdr->wrlocks.insert(lock); - mdr->locks.insert(lock); - return true; - } - - // wait for write. - lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE, - new C_MDS_RetryRequest(mdcache, mdr)); - - // initiate scatter or lock? - if (lock->is_stable()) { - if (lock->get_parent()->is_auth()) { - // auth. scatter or lock? - if (((CInode*)lock->get_parent())->has_subtree_root_dirfrag()) - scatter_scatter(lock); - else - scatter_lock(lock); - } else { - // replica. - // auth should be auth_pinned (see acquire_locks wrlock weird mustpin case). 
- int auth = lock->get_parent()->authority().first; - dout(10) << "requesting scatter from auth on " - << *lock << " on " << *lock->get_parent() << dendl; - mds->send_message_mds(new MLock(lock, LOCK_AC_REQSCATTER, mds->get_nodeid()), - auth, MDS_PORT_LOCKER); - } - } - - return false; -} - -void Locker::scatter_wrlock_finish(ScatterLock *lock, MDRequest *mdr) -{ - dout(7) << "scatter_wrlock_finish on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->put_wrlock(); - if (mdr) { - mdr->wrlocks.erase(lock); - mdr->locks.erase(lock); - } - - scatter_eval_gather(lock); -} - - -class C_Locker_ScatterEval : public Context { - Locker *locker; - ScatterLock *lock; -public: - C_Locker_ScatterEval(Locker *l, ScatterLock *lk) : locker(l), lock(lk) {} - void finish(int r) { - lock->get_parent()->put(CInode::PIN_PTRWAITER); - locker->try_scatter_eval(lock); - } -}; - - -void Locker::try_scatter_eval(ScatterLock *lock) -{ - // unstable and ambiguous auth? - if (!lock->is_stable() && - lock->get_parent()->is_ambiguous_auth()) { - dout(7) << "try_scatter_eval not stable and ambiguous auth, waiting on " << *lock->get_parent() << dendl; - //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - lock->get_parent()->get(CInode::PIN_PTRWAITER); - lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_ScatterEval(this, lock)); - return; - } - - if (!lock->get_parent()->is_auth()) { - dout(7) << "try_scatter_eval not auth for " << *lock->get_parent() << dendl; - return; - } - - if (!lock->get_parent()->can_auth_pin()) { - dout(7) << "try_scatter_eval can't auth_pin, waiting on " << *lock->get_parent() << dendl; - //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - lock->get_parent()->get(CInode::PIN_PTRWAITER); - lock->get_parent()->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_Locker_ScatterEval(this, lock)); - return; - } - - if (lock->is_stable()) - scatter_eval(lock); -} - - -void 
Locker::scatter_eval_gather(ScatterLock *lock) -{ - dout(10) << "scatter_eval_gather " << *lock << " on " << *lock->get_parent() << dendl; - - if (!lock->get_parent()->is_auth()) { - // REPLICA - - if (lock->get_state() == LOCK_GLOCKC && - !lock->is_wrlocked()) { - dout(10) << "scatter_eval no wrlocks, acking lock" << dendl; - int auth = lock->get_parent()->authority().first; - if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) { - bufferlist data; - lock->encode_locked_state(data); - mds->send_message_mds(new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid(), data), - auth, MDS_PORT_LOCKER); - } - lock->set_state(LOCK_LOCK); - } - - } else { - // AUTH - - // glocks|glockt -> lock? - if ((lock->get_state() == LOCK_GLOCKS || - lock->get_state() == LOCK_GLOCKT) && - !lock->is_gathering() && - !lock->is_rdlocked()) { - dout(7) << "scatter_eval finished lock gather/un-rdlock on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->set_state(LOCK_LOCK); - lock->finish_waiters(ScatterLock::WAIT_XLOCK|ScatterLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - } - - // glockc -> lock? - else if (lock->get_state() == LOCK_GLOCKC && - !lock->is_gathering() && - !lock->is_wrlocked()) { - if (lock->is_updated()) { - scatter_writebehind(lock); - } else { - dout(7) << "scatter_eval finished lock gather/un-wrlock on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->set_state(LOCK_LOCK); - lock->finish_waiters(ScatterLock::WAIT_XLOCK|ScatterLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - } - } - - // gSyncL -> sync? 
- else if (lock->get_state() == LOCK_GSYNCL && - !lock->is_wrlocked()) { - dout(7) << "scatter_eval finished sync un-wrlock on " << *lock - << " on " << *lock->get_parent() << dendl; - if (lock->get_parent()->is_replicated()) { - // encode and bcast - bufferlist data; - lock->encode_locked_state(data); - send_lock_message(lock, LOCK_AC_SYNC, data); - } - lock->set_state(LOCK_SYNC); - lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - } - - // gscattert|gscatters -> scatter? - else if ((lock->get_state() == LOCK_GSCATTERT || - lock->get_state() == LOCK_GSCATTERS) && - !lock->is_gathering() && - !lock->is_rdlocked()) { - dout(7) << "scatter_eval finished scatter un-rdlock(/gather) on " << *lock - << " on " << *lock->get_parent() << dendl; - if (lock->get_parent()->is_replicated()) { - // encode and bcast - bufferlist data; - lock->encode_locked_state(data); - send_lock_message(lock, LOCK_AC_SCATTER, data); - } - lock->set_state(LOCK_SCATTER); - lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - } - - // gTempsyncC|gTempsyncL -> tempsync - else if ((lock->get_state() == LOCK_GTEMPSYNCC || - lock->get_state() == LOCK_GTEMPSYNCL) && - !lock->is_gathering() && - !lock->is_wrlocked()) { - if (lock->is_updated()) { - scatter_writebehind(lock); - } else { - dout(7) << "scatter_eval finished tempsync gather/un-wrlock on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->set_state(LOCK_TEMPSYNC); - lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - } - } - - - // re-eval? 
- if (lock->is_stable()) // && lock->get_parent()->can_auth_pin()) - scatter_eval(lock); - } -} - -void Locker::scatter_writebehind(ScatterLock *lock) -{ - CInode *in = (CInode*)lock->get_parent(); - dout(10) << "scatter_writebehind " << in->inode.mtime << " on " << *lock << " on " << *in << dendl; - - // hack: - if (in->is_base()) { - dout(10) << "scatter_writebehind just clearing updated flag for base inode " << *in << dendl; - lock->clear_updated(); - scatter_eval_gather(lock); - return; - } - - // journal write-behind. - inode_t *pi = in->project_inode(); - pi->mtime = in->inode.mtime; // make sure an intermediate version isn't goofing us up - pi->version = in->pre_dirty(); - - EUpdate *le = new EUpdate(mds->mdlog, "scatter writebehind"); - le->metablob.add_dir_context(in->get_parent_dn()->get_dir()); - le->metablob.add_primary_dentry(in->get_parent_dn(), true, 0, pi); - - mds->mdlog->submit_entry(le); - mds->mdlog->wait_for_sync(new C_Locker_ScatterWB(this, lock, mds->mdlog->get_current_segment())); -} - -void Locker::scatter_writebehind_finish(ScatterLock *lock, LogSegment *ls) -{ - CInode *in = (CInode*)lock->get_parent(); - dout(10) << "scatter_writebehind_finish on " << *lock << " on " << *in << dendl; - in->pop_and_dirty_projected_inode(ls); - lock->clear_updated(); - scatter_eval_gather(lock); -} - -void Locker::scatter_eval(ScatterLock *lock) -{ - dout(10) << "scatter_eval " << *lock << " on " << *lock->get_parent() << dendl; - - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - if (lock->get_parent()->is_frozen()) return; - - CInode *in = (CInode*)lock->get_parent(); - if (in->has_subtree_root_dirfrag() && !in->is_base()) { - // i _should_ be scattered. 
- if (!lock->is_rdlocked() && - !lock->is_xlocked() && - lock->get_state() != LOCK_SCATTER) { - dout(10) << "scatter_eval no rdlocks|xlocks, am subtree root inode, scattering" << dendl; - scatter_scatter(lock); - autoscattered.push_back(&lock->xlistitem_autoscattered); - } - } else { - // i _should_ be sync. - lock->xlistitem_autoscattered.remove_myself(); - if (!lock->is_wrlocked() && - !lock->is_xlocked() && - lock->get_state() != LOCK_SYNC) { - dout(10) << "scatter_eval no wrlocks|xlocks, not subtree root inode, syncing" << dendl; - scatter_sync(lock); - } - } -} - -void Locker::note_autoscattered(ScatterLock *lock) -{ - dout(10) << "note_autoscattered " << *lock << " on " << *lock->get_parent() << dendl; - autoscattered.push_back(&lock->xlistitem_autoscattered); -} - - -/* - * this is called by LogSegment::try_to_trim() when trying to - * flush dirty scattered data (e.g. inode->dirlock mtime) back - * to the auth node. - */ -void Locker::scatter_try_unscatter(ScatterLock *lock, Context *c) -{ - dout(10) << "scatter_try_unscatter " << *lock << " on " << *lock->get_parent() << dendl; - assert(!lock->get_parent()->is_auth()); - assert(!lock->get_parent()->is_ambiguous_auth()); - - // request unscatter? - int auth = lock->get_parent()->authority().first; - if (lock->get_state() == LOCK_SCATTER && - mds->mdsmap->get_state(auth) >= MDSMap::STATE_ACTIVE) - mds->send_message_mds(new MLock(lock, LOCK_AC_REQUNSCATTER, mds->get_nodeid()), - auth, MDS_PORT_LOCKER); - - // wait... - lock->add_waiter(SimpleLock::WAIT_STABLE, c); -} - - -void Locker::scatter_sync(ScatterLock *lock) -{ - dout(10) << "scatter_sync " << *lock - << " on " << *lock->get_parent() << dendl; - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - switch (lock->get_state()) { - case LOCK_SYNC: - return; // already sync. - - case LOCK_TEMPSYNC: - break; // just do it. 
- - case LOCK_LOCK: - if (lock->is_wrlocked() || lock->is_xlocked()) { - lock->set_state(LOCK_GSYNCL); - lock->get_parent()->auth_pin(); - return; - } - break; // do it. - - case LOCK_SCATTER: - // lock first. this is the slow way, incidentally. - if (lock->get_parent()->is_replicated()) { - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - } else { - if (!lock->is_wrlocked()) { - break; // do it now, we're fine - } - } - lock->set_state(LOCK_GLOCKC); - lock->get_parent()->auth_pin(); - return; - - default: - assert(0); - } - - // do sync - if (lock->get_parent()->is_replicated()) { - // encode and bcast - bufferlist data; - lock->encode_locked_state(data); - send_lock_message(lock, LOCK_AC_SYNC, data); - } - - lock->set_state(LOCK_SYNC); - lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); -} - -void Locker::scatter_scatter(ScatterLock *lock) -{ - dout(10) << "scatter_scatter " << *lock - << " on " << *lock->get_parent() << dendl; - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - lock->set_last_scatter(g_clock.now()); - - switch (lock->get_state()) { - case LOCK_SYNC: - if (!lock->is_rdlocked() && - !lock->get_parent()->is_replicated()) - break; // do it - if (lock->get_parent()->is_replicated()) { - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - } - lock->set_state(LOCK_GSCATTERS); - lock->get_parent()->auth_pin(); - return; - - case LOCK_LOCK: - if (lock->is_xlocked()) - return; // sorry - break; // do it. - - case LOCK_SCATTER: - return; // did it. 
- - case LOCK_TEMPSYNC: - if (lock->is_rdlocked()) { - lock->set_state(LOCK_GSCATTERT); - lock->get_parent()->auth_pin(); - return; - } - break; // do it - - default: - assert(0); - } - - // do scatter - if (lock->get_parent()->is_replicated()) { - // encode and bcast - bufferlist data; - lock->encode_locked_state(data); - send_lock_message(lock, LOCK_AC_SCATTER, data); - } - lock->set_state(LOCK_SCATTER); - lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); -} - -void Locker::scatter_lock(ScatterLock *lock) -{ - dout(10) << "scatter_lock " << *lock - << " on " << *lock->get_parent() << dendl; - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - switch (lock->get_state()) { - case LOCK_SYNC: - if (!lock->is_rdlocked() && - !lock->get_parent()->is_replicated()) - break; // do it. - - if (lock->get_parent()->is_replicated()) { - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - } - lock->set_state(LOCK_GLOCKS); - lock->get_parent()->auth_pin(); - return; - - case LOCK_LOCK: - return; // done. - - case LOCK_SCATTER: - if (!lock->is_wrlocked() && - !lock->get_parent()->is_replicated()) { - break; // do it. - } - - if (lock->get_parent()->is_replicated()) { - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - } - lock->set_state(LOCK_GLOCKC); - lock->get_parent()->auth_pin(); - return; - - case LOCK_TEMPSYNC: - if (lock->is_rdlocked()) { - lock->set_state(LOCK_GLOCKT); - lock->get_parent()->auth_pin(); - return; - } - break; // do it. - } - - // do lock - lock->set_state(LOCK_LOCK); - lock->finish_waiters(ScatterLock::WAIT_XLOCK|ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); -} - -void Locker::scatter_tempsync(ScatterLock *lock) -{ - dout(10) << "scatter_tempsync " << *lock - << " on " << *lock->get_parent() << dendl; - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - switch (lock->get_state()) { - case LOCK_SYNC: - break; // do it. 
- - case LOCK_LOCK: - if (lock->is_wrlocked() || - lock->is_xlocked()) { - lock->set_state(LOCK_GTEMPSYNCL); - lock->get_parent()->auth_pin(); - return; - } - break; // do it. - - case LOCK_SCATTER: - if (!lock->is_wrlocked() && - !lock->get_parent()->is_replicated()) { - break; // do it. - } - - if (lock->get_parent()->is_replicated()) { - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - } - lock->set_state(LOCK_GTEMPSYNCC); - lock->get_parent()->auth_pin(); - return; - - case LOCK_TEMPSYNC: - return; // done - } - - // do tempsync - lock->set_state(LOCK_TEMPSYNC); - lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); -} - - - - -void Locker::handle_scatter_lock(ScatterLock *lock, MLock *m) -{ - int from = m->get_asker(); - dout(10) << "handle_scatter_lock " << *m << " on " << *lock << " on " << *lock->get_parent() << dendl; - - if (mds->is_rejoin()) { - if (lock->get_parent()->is_rejoining()) { - dout(7) << "handle_scatter_lock still rejoining " << *lock->get_parent() - << ", dropping " << *m << dendl; - delete m; - return; - } - } - - switch (m->get_action()) { - // -- replica -- - case LOCK_AC_SYNC: - assert(lock->get_state() == LOCK_LOCK); - lock->set_state(LOCK_SYNC); - lock->decode_locked_state(m->get_data()); - lock->clear_updated(); - lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); - break; - - case LOCK_AC_LOCK: - assert(lock->get_state() == LOCK_SCATTER || - lock->get_state() == LOCK_SYNC); - - // wait for wrlocks to close? 
- if (lock->is_wrlocked()) { - assert(lock->get_state() == LOCK_SCATTER); - dout(7) << "handle_scatter_lock has wrlocks, waiting on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->set_state(LOCK_GLOCKC); - } else if (lock->is_rdlocked()) { - assert(lock->get_state() == LOCK_SYNC); - dout(7) << "handle_scatter_lock has rdlocks, waiting on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->set_state(LOCK_GLOCKS); - } else { - dout(7) << "handle_scatter_lock has no rd|wrlocks, sending lockack for " << *lock - << " on " << *lock->get_parent() << dendl; - - // encode and reply - bufferlist data; - lock->encode_locked_state(data); - mds->send_message_mds(new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid(), data), - from, MDS_PORT_LOCKER); - lock->set_state(LOCK_LOCK); - } - break; - - case LOCK_AC_SCATTER: - assert(lock->get_state() == LOCK_LOCK); - lock->decode_locked_state(m->get_data()); - lock->clear_updated(); - lock->set_state(LOCK_SCATTER); - lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); - break; - - // -- for auth -- - case LOCK_AC_LOCKACK: - assert(lock->get_state() == LOCK_GLOCKS || - lock->get_state() == LOCK_GLOCKC || - lock->get_state() == LOCK_GSCATTERS || - lock->get_state() == LOCK_GTEMPSYNCC); - assert(lock->is_gathering(from)); - lock->remove_gather(from); - lock->decode_locked_state(m->get_data()); - - if (lock->is_gathering()) { - dout(7) << "handle_scatter_lock " << *lock << " on " << *lock->get_parent() - << " from " << from << ", still gathering " << lock->get_gather_set() - << dendl; - } else { - dout(7) << "handle_scatter_lock " << *lock << " on " << *lock->get_parent() - << " from " << from << ", last one" - << dendl; - scatter_eval_gather(lock); - } - break; - - case LOCK_AC_REQSCATTER: - if (lock->is_stable()) { - /* NOTE: we can do this _even_ if !can_auth_pin (i.e. 
freezing) - * because the replica should be holding an auth_pin if they're - * doing this (and thus, we are freezing, not frozen, and indefinite - * starvation isn't an issue). - */ - dout(7) << "handle_scatter_lock got scatter request on " << *lock - << " on " << *lock->get_parent() << dendl; - scatter_scatter(lock); - } else { - dout(7) << "handle_scatter_lock ignoring scatter request on " << *lock - << " on " << *lock->get_parent() << dendl; - } - break; - - case LOCK_AC_REQUNSCATTER: - if (!lock->is_stable()) { - dout(7) << "handle_scatter_lock ignoring now-unnecessary unscatter request on " << *lock - << " on " << *lock->get_parent() << dendl; - } else if (lock->get_parent()->can_auth_pin()) { - dout(7) << "handle_scatter_lock got unscatter request on " << *lock - << " on " << *lock->get_parent() << dendl; - scatter_lock(lock); - } else { - dout(7) << "handle_scatter_lock DROPPING unscatter request on " << *lock - << " on " << *lock->get_parent() << dendl; - /* FIXME: if we can't auth_pin here, this request is effectively lost... */ - } - } - - delete m; -} - - - -void Locker::scatter_unscatter_autoscattered() -{ - /* - * periodically unscatter autoscattered locks - */ - - dout(10) << "scatter_unscatter_autoscattered" << dendl; - - utime_t now = g_clock.now(); - int n = autoscattered.size(); - while (!autoscattered.empty()) { - ScatterLock *lock = autoscattered.front(); - - // stop? - if (lock->get_state() == LOCK_SCATTER && - now - lock->get_last_scatter() < 10.0) - break; - - autoscattered.pop_front(); - - if (lock->get_state() == LOCK_SCATTER && - lock->get_parent()->is_replicated()) { - if (((CInode*)lock->get_parent())->is_frozen() || - ((CInode*)lock->get_parent())->is_freezing()) { - // hrm.. requeue. 
- dout(10) << "last_scatter " << lock->get_last_scatter() - << ", now " << now << ", but frozen|freezing, requeueing" << dendl; - autoscattered.push_back(&lock->xlistitem_autoscattered); - } else { - dout(10) << "last_scatter " << lock->get_last_scatter() - << ", now " << now << ", locking" << dendl; - scatter_lock(lock); - } - } - if (--n == 0) break; - } -} - - - -// ========================================================================== -// local lock - - -bool Locker::local_wrlock_start(LocalLock *lock, MDRequest *mdr) -{ - dout(7) << "local_wrlock_start on " << *lock - << " on " << *lock->get_parent() << dendl; - - if (lock->can_wrlock()) { - lock->get_wrlock(); - mdr->wrlocks.insert(lock); - mdr->locks.insert(lock); - return true; - } else { - lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } -} - -void Locker::local_wrlock_finish(LocalLock *lock, MDRequest *mdr) -{ - dout(7) << "local_wrlock_finish on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->put_wrlock(); - mdr->wrlocks.erase(lock); - mdr->locks.erase(lock); -} - -bool Locker::local_xlock_start(LocalLock *lock, MDRequest *mdr) -{ - dout(7) << "local_xlock_start on " << *lock - << " on " << *lock->get_parent() << dendl; - - if (lock->is_xlocked_by_other(mdr)) { - lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - - lock->get_xlock(mdr); - mdr->xlocks.insert(lock); - mdr->locks.insert(lock); - return true; -} - -void Locker::local_xlock_finish(LocalLock *lock, MDRequest *mdr) -{ - dout(7) << "local_xlock_finish on " << *lock - << " on " << *lock->get_parent() << dendl; - lock->put_xlock(); - mdr->xlocks.erase(lock); - mdr->locks.erase(lock); - - lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR); -} - - - -// ========================================================================== -// file lock - - -bool 
Locker::file_rdlock_start(FileLock *lock, MDRequest *mdr) -{ - dout(7) << "file_rdlock_start " << *lock << " on " << *lock->get_parent() << dendl; - - // can read? grab ref. - if (lock->can_rdlock(mdr)) { - lock->get_rdlock(); - mdr->rdlocks.insert(lock); - mdr->locks.insert(lock); - return true; - } - - // can't read, and replicated. - if (lock->can_rdlock_soon()) { - // wait - dout(7) << "file_rdlock_start can_rdlock_soon " << *lock << " on " << *lock->get_parent() << dendl; - } else { - if (lock->get_parent()->is_auth()) { - // auth - - // FIXME or qsync? - - if (lock->is_stable()) { - file_lock(lock); // lock, bc easiest to back off ... FIXME - - if (lock->can_rdlock(mdr)) { - lock->get_rdlock(); - mdr->rdlocks.insert(lock); - mdr->locks.insert(lock); - - lock->finish_waiters(SimpleLock::WAIT_STABLE); - return true; - } - } else { - dout(7) << "file_rdlock_start waiting until stable on " << *lock << " on " << *lock->get_parent() << dendl; - lock->add_waiter(SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - } else { - // replica - if (lock->is_stable()) { - - // fw to auth - CInode *in = (CInode*)lock->get_parent(); - int auth = in->authority().first; - dout(7) << "file_rdlock_start " << *lock << " on " << *lock->get_parent() << " on replica and async, fw to auth " << auth << dendl; - assert(auth != mds->get_nodeid()); - mdcache->request_forward(mdr, auth); - return false; - - } else { - // wait until stable - dout(7) << "inode_file_rdlock_start waiting until stable on " << *lock << " on " << *lock->get_parent() << dendl; - lock->add_waiter(SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - } - } - - // wait - dout(7) << "file_rdlock_start waiting on " << *lock << " on " << *lock->get_parent() << dendl; - lock->add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); - - return false; -} - - - -void Locker::file_rdlock_finish(FileLock *lock, MDRequest *mdr) -{ - dout(7) << 
"rdlock_finish on " << *lock << " on " << *lock->get_parent() << dendl; - - // drop ref - lock->put_rdlock(); - mdr->rdlocks.erase(lock); - mdr->locks.erase(lock); - - if (!lock->is_rdlocked()) - file_eval_gather(lock); -} - - -bool Locker::file_xlock_start(FileLock *lock, MDRequest *mdr) -{ - dout(7) << "file_xlock_start on " << *lock << " on " << *lock->get_parent() << dendl; - - assert(lock->get_parent()->is_auth()); // remote file xlock not implemented - - // already xlocked by me? - if (lock->get_xlocked_by() == mdr) - return true; - - // can't write? - if (!lock->can_xlock(mdr)) { - - // auth - if (!lock->can_xlock_soon()) { - if (!lock->is_stable()) { - dout(7) << "file_xlock_start on auth, waiting for stable on " << *lock << " on " << *lock->get_parent() << dendl; - lock->add_waiter(SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - - // initiate lock - file_lock(lock); - - // fall-thru to below. - } - } - - // check again - if (lock->can_xlock(mdr)) { - assert(lock->get_parent()->is_auth()); - lock->get_xlock(mdr); - mdr->locks.insert(lock); - mdr->xlocks.insert(lock); - return true; - } else { - dout(7) << "file_xlock_start on auth, waiting for write on " << *lock << " on " << *lock->get_parent() << dendl; - lock->add_waiter(SimpleLock::WAIT_WR, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } -} - - -void Locker::file_xlock_finish(FileLock *lock, MDRequest *mdr) -{ - dout(7) << "file_xlock_finish on " << *lock << " on " << *lock->get_parent() << dendl; - - // drop ref - assert(lock->can_xlock(mdr)); - lock->put_xlock(); - mdr->locks.erase(lock); - mdr->xlocks.erase(lock); - - assert(lock->get_parent()->is_auth()); // or implement remote xlocks - - // others waiting? - lock->finish_waiters(SimpleLock::WAIT_WR, 0); - - if (lock->get_parent()->is_auth()) - file_eval(lock); -} - - -/* - * ... 
- * - * also called after client caps are acked to us - * - checks if we're in unstable sfot state and can now move on to next state - * - checks if soft state should change (eg bc last writer closed) - */ -class C_Locker_FileEval : public Context { - Locker *locker; - FileLock *lock; -public: - C_Locker_FileEval(Locker *l, FileLock *lk) : locker(l), lock(lk) {} - void finish(int r) { - lock->get_parent()->put(CInode::PIN_PTRWAITER); - locker->try_file_eval(lock); - } -}; - -void Locker::try_file_eval(FileLock *lock) -{ - CInode *in = (CInode*)lock->get_parent(); - - // unstable and ambiguous auth? - if (!lock->is_stable() && - in->is_ambiguous_auth()) { - dout(7) << "try_file_eval not stable and ambiguous auth, waiting on " << *in << dendl; - //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - in->get(CInode::PIN_PTRWAITER); - in->add_waiter(CInode::WAIT_SINGLEAUTH, new C_Locker_FileEval(this, lock)); - return; - } - - if (!lock->get_parent()->is_auth()) { - dout(7) << "try_file_eval not auth for " << *lock->get_parent() << dendl; - return; - } - - if (!lock->get_parent()->can_auth_pin()) { - dout(7) << "try_file_eval can't auth_pin, waiting on " << *in << dendl; - //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - in->get(CInode::PIN_PTRWAITER); - in->add_waiter(CInode::WAIT_UNFREEZE, new C_Locker_FileEval(this, lock)); - return; - } - - if (lock->is_stable()) - file_eval(lock); -} - - - -void Locker::file_eval_gather(FileLock *lock) -{ - CInode *in = (CInode*)lock->get_parent(); - int issued = in->get_caps_issued(); - - dout(7) << "file_eval_gather issued " << cap_string(issued) - << " vs " << cap_string(lock->caps_allowed()) - << " on " << *lock << " on " << *lock->get_parent() - << dendl; - - if (lock->is_stable()) - return; // nothing for us to do here! - - // [auth] finished gather? 
- if (in->is_auth() && - !lock->is_gathering() && - ((issued & ~lock->caps_allowed()) == 0)) { - dout(7) << "file_eval_gather finished gather" << dendl; - - switch (lock->get_state()) { - // to lock - case LOCK_GLOCKR: - case LOCK_GLOCKM: - case LOCK_GLOCKL: - lock->set_state(LOCK_LOCK); - - // waiters - lock->get_rdlock(); - lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR|SimpleLock::WAIT_RD); - lock->put_rdlock(); - lock->get_parent()->auth_unpin(); - break; - - // to mixed - case LOCK_GMIXEDR: - lock->set_state(LOCK_MIXED); - lock->finish_waiters(SimpleLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - break; - - case LOCK_GMIXEDL: - lock->set_state(LOCK_MIXED); - - if (in->is_replicated()) { - // data - bufferlist softdata; - lock->encode_locked_state(softdata); - - // bcast to replicas - send_lock_message(lock, LOCK_AC_MIXED, softdata); - } - - lock->finish_waiters(SimpleLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - break; - - // to loner - case LOCK_GLONERR: - lock->set_state(LOCK_LONER); - lock->finish_waiters(SimpleLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - break; - - case LOCK_GLONERM: - lock->set_state(LOCK_LONER); - lock->finish_waiters(SimpleLock::WAIT_STABLE); - lock->get_parent()->auth_unpin(); - break; - - // to sync - case LOCK_GSYNCL: - case LOCK_GSYNCM: - lock->set_state(LOCK_SYNC); - - { // bcast data to replicas - bufferlist softdata; - lock->encode_locked_state(softdata); - - send_lock_message(lock, LOCK_AC_SYNC, softdata); - } - - // waiters - lock->get_rdlock(); - lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); - lock->put_rdlock(); - lock->get_parent()->auth_unpin(); - break; - - default: - assert(0); - } - - issue_caps(in); - - // stable re-eval? - if (lock->is_stable()) //&& lock->get_parent()->can_auth_pin()) - file_eval(lock); - } - - // [replica] finished caps gather? 
- if (!in->is_auth() && - ((issued & ~lock->caps_allowed()) == 0)) { - switch (lock->get_state()) { - case LOCK_GMIXEDR: - { - lock->set_state(LOCK_MIXED); - - // ack - MLock *reply = new MLock(lock, LOCK_AC_MIXEDACK, mds->get_nodeid()); - mds->send_message_mds(reply, in->authority().first, MDS_PORT_LOCKER); - } - break; - - case LOCK_GLOCKR: - { - lock->set_state(LOCK_LOCK); - - // ack - MLock *reply = new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid()); - mds->send_message_mds(reply, in->authority().first, MDS_PORT_LOCKER); - } - break; - - default: - assert(0); - } - } - - -} - -void Locker::file_eval(FileLock *lock) -{ - CInode *in = (CInode*)lock->get_parent(); - int wanted = in->get_caps_wanted(); - bool loner = (in->client_caps.size() == 1) && in->mds_caps_wanted.empty(); - dout(7) << "file_eval wanted=" << cap_string(wanted) - << " filelock=" << *lock << " on " << *lock->get_parent() - << " loner=" << loner - << dendl; - - assert(lock->get_parent()->is_auth()); - assert(lock->is_stable()); - - // not xlocked! - if (lock->is_xlocked() || lock->get_parent()->is_frozen()) return; - - // * -> loner? - if (!lock->is_rdlocked() && - !lock->is_waiter_for(SimpleLock::WAIT_WR) && - (wanted & CAP_FILE_WR) && - loner && - lock->get_state() != LOCK_LONER) { - dout(7) << "file_eval stable, bump to loner " << *lock << " on " << *lock->get_parent() << dendl; - file_loner(lock); - } - - // * -> mixed? - else if (!lock->is_rdlocked() && - !lock->is_waiter_for(SimpleLock::WAIT_WR) && - (wanted & CAP_FILE_RD) && - (wanted & CAP_FILE_WR) && - !(loner && lock->get_state() == LOCK_LONER) && - lock->get_state() != LOCK_MIXED) { - dout(7) << "file_eval stable, bump to mixed " << *lock << " on " << *lock->get_parent() << dendl; - file_mixed(lock); - } - - // * -> sync? 
- else if (!in->filelock.is_waiter_for(SimpleLock::WAIT_WR) && - !(wanted & (CAP_FILE_WR|CAP_FILE_WRBUFFER)) && - ((wanted & CAP_FILE_RD) || - in->is_replicated() || - (!loner && lock->get_state() == LOCK_LONER)) && - lock->get_state() != LOCK_SYNC) { - dout(7) << "file_eval stable, bump to sync " << *lock << " on " << *lock->get_parent() << dendl; - file_sync(lock); - } - - // * -> lock? (if not replicated or open) - else if (!in->is_replicated() && - wanted == 0 && - lock->get_state() != LOCK_LOCK) { - file_lock(lock); - } -} - - -// mid - -bool Locker::file_sync(FileLock *lock) -{ - CInode *in = (CInode*)lock->get_parent(); - dout(7) << "file_sync " << *lock << " on " << *lock->get_parent() << dendl; - - assert(in->is_auth()); - assert(lock->is_stable()); - - int issued = in->get_caps_issued(); - - assert((in->get_caps_wanted() & CAP_FILE_WR) == 0); - - if (lock->get_state() == LOCK_LOCK) { - if (in->is_replicated()) { - bufferlist softdata; - lock->encode_locked_state(softdata); - send_lock_message(lock, LOCK_AC_SYNC, softdata); - } - - // change lock - lock->set_state(LOCK_SYNC); - - issue_caps(in); // reissue caps - return true; - } - - else if (lock->get_state() == LOCK_MIXED) { - // writers? - if (issued & CAP_FILE_WR) { - // gather client write caps - lock->set_state(LOCK_GSYNCM); - lock->get_parent()->auth_pin(); - issue_caps(in); - } else { - // no writers, go straight to sync - if (in->is_replicated()) { - bufferlist softdata; - lock->encode_locked_state(softdata); - send_lock_message(lock, LOCK_AC_SYNC, softdata); - } - - // change lock - lock->set_state(LOCK_SYNC); - } - return false; - } - - else if (lock->get_state() == LOCK_LONER) { - // writers? 
- if (issued & CAP_FILE_WR) { - // gather client write caps - lock->set_state(LOCK_GSYNCL); - lock->get_parent()->auth_pin(); - issue_caps(in); - } else { - // no writers, go straight to sync - if (in->is_replicated()) { - bufferlist softdata; - lock->encode_locked_state(softdata); - send_lock_message(lock, LOCK_AC_SYNC, softdata); - } - - // change lock - lock->set_state(LOCK_SYNC); - } - return false; - } - else - assert(0); // wtf. - - return false; -} - - - -void Locker::file_lock(FileLock *lock) -{ - CInode *in = (CInode*)lock->get_parent(); - dout(7) << "inode_file_lock " << *lock << " on " << *lock->get_parent() << dendl; - - assert(in->is_auth()); - assert(lock->is_stable()); - - int issued = in->get_caps_issued(); - - if (lock->get_state() == LOCK_SYNC) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - - // change lock - lock->set_state(LOCK_GLOCKR); - lock->get_parent()->auth_pin(); - - // call back caps - if (issued) - issue_caps(in); - } else { - if (issued) { - // call back caps - lock->set_state(LOCK_GLOCKR); - lock->get_parent()->auth_pin(); - issue_caps(in); - } else { - lock->set_state(LOCK_LOCK); - } - } - } - - else if (lock->get_state() == LOCK_MIXED) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - - // change lock - lock->set_state(LOCK_GLOCKM); - lock->get_parent()->auth_pin(); - - // call back caps - issue_caps(in); - } else { - //assert(issued); // ??? 
-sage 2/19/06 - if (issued) { - // change lock - lock->set_state(LOCK_GLOCKM); - lock->get_parent()->auth_pin(); - - // call back caps - issue_caps(in); - } else { - lock->set_state(LOCK_LOCK); - } - } - - } - else if (lock->get_state() == LOCK_LONER) { - if (issued & CAP_FILE_WR) { - // change lock - lock->set_state(LOCK_GLOCKL); - lock->get_parent()->auth_pin(); - - // call back caps - issue_caps(in); - } else { - lock->set_state(LOCK_LOCK); - } - } - else - assert(0); // wtf. -} - - -void Locker::file_mixed(FileLock *lock) -{ - dout(7) << "file_mixed " << *lock << " on " << *lock->get_parent() << dendl; - - CInode *in = (CInode*)lock->get_parent(); - assert(in->is_auth()); - assert(lock->is_stable()); - - int issued = in->get_caps_issued(); - - if (lock->get_state() == LOCK_SYNC) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_MIXED); - lock->init_gather(); - - lock->set_state(LOCK_GMIXEDR); - lock->get_parent()->auth_pin(); - - issue_caps(in); - } else { - if (issued) { - lock->set_state(LOCK_GMIXEDR); - lock->get_parent()->auth_pin(); - issue_caps(in); - } else { - lock->set_state(LOCK_MIXED); - } - } - } - - else if (lock->get_state() == LOCK_LOCK) { - if (in->is_replicated()) { - // data - bufferlist softdata; - lock->encode_locked_state(softdata); - - // bcast to replicas - send_lock_message(lock, LOCK_AC_MIXED, softdata); - } - - // change lock - lock->set_state(LOCK_MIXED); - issue_caps(in); - } - - else if (lock->get_state() == LOCK_LONER) { - if (issued & CAP_FILE_WRBUFFER) { - // gather up WRBUFFER caps - lock->set_state(LOCK_GMIXEDL); - lock->get_parent()->auth_pin(); - issue_caps(in); - } - else if (in->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_MIXED); - lock->set_state(LOCK_MIXED); - issue_caps(in); - } else { - lock->set_state(LOCK_MIXED); - issue_caps(in); - } - } - - else - assert(0); // wtf. 
-} - - -void Locker::file_loner(FileLock *lock) -{ - CInode *in = (CInode*)lock->get_parent(); - dout(7) << "inode_file_loner " << *lock << " on " << *lock->get_parent() << dendl; - - assert(in->is_auth()); - assert(lock->is_stable()); - - assert((in->client_caps.size() == 1) && in->mds_caps_wanted.empty()); - - if (lock->get_state() == LOCK_SYNC) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - - // change lock - lock->set_state(LOCK_GLONERR); - lock->get_parent()->auth_pin(); - } else { - // only one guy with file open, who gets it all, so - lock->set_state(LOCK_LONER); - issue_caps(in); - } - } - - else if (lock->get_state() == LOCK_LOCK) { - // change lock. ignore replicas; they don't know about LONER. - lock->set_state(LOCK_LONER); - issue_caps(in); - } - - else if (lock->get_state() == LOCK_MIXED) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(lock, LOCK_AC_LOCK); - lock->init_gather(); - - // change lock - lock->set_state(LOCK_GLONERM); - lock->get_parent()->auth_pin(); - } else { - lock->set_state(LOCK_LONER); - issue_caps(in); - } - } - - else - assert(0); -} - - - -// messenger - -void Locker::handle_file_lock(FileLock *lock, MLock *m) -{ - CInode *in = (CInode*)lock->get_parent(); - int from = m->get_asker(); - - if (mds->is_rejoin()) { - if (in->is_rejoining()) { - dout(7) << "handle_file_lock still rejoining " << *in - << ", dropping " << *m << dendl; - delete m; - return; - } - } - - - dout(7) << "handle_file_lock a=" << m->get_action() << " from " << from << " " - << *in << " filelock=" << *lock << dendl; - - int issued = in->get_caps_issued(); - - switch (m->get_action()) { - // -- replica -- - case LOCK_AC_SYNC: - assert(lock->get_state() == LOCK_LOCK || - lock->get_state() == LOCK_MIXED); - - lock->decode_locked_state(m->get_data()); - lock->set_state(LOCK_SYNC); - - // no need to reply. 
- - // waiters - lock->get_rdlock(); - lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); - lock->put_rdlock(); - file_eval_gather(lock); - break; - - case LOCK_AC_LOCK: - assert(lock->get_state() == LOCK_SYNC || - lock->get_state() == LOCK_MIXED); - - lock->set_state(LOCK_GLOCKR); - - // call back caps? - if (issued & CAP_FILE_RD) { - dout(7) << "handle_file_lock client readers, gathering caps on " << *in << dendl; - issue_caps(in); - break; - } - else if (lock->is_rdlocked()) { - dout(7) << "handle_file_lock rdlocked, waiting before ack on " << *in << dendl; - break; - } - - // nothing to wait for, lock and ack. - { - lock->set_state(LOCK_LOCK); - - MLock *reply = new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid()); - mds->send_message_mds(reply, from, MDS_PORT_LOCKER); - } - break; - - case LOCK_AC_MIXED: - assert(lock->get_state() == LOCK_SYNC || - lock->get_state() == LOCK_LOCK); - - if (lock->get_state() == LOCK_SYNC) { - // MIXED - if (issued & CAP_FILE_RD) { - // call back client caps - lock->set_state(LOCK_GMIXEDR); - issue_caps(in); - break; - } else { - // no clients, go straight to mixed - lock->set_state(LOCK_MIXED); - - // ack - MLock *reply = new MLock(lock, LOCK_AC_MIXEDACK, mds->get_nodeid()); - mds->send_message_mds(reply, from, MDS_PORT_LOCKER); - } - } else { - // LOCK - lock->set_state(LOCK_MIXED); - - // no ack needed. 
- } - - issue_caps(in); - - // waiters - lock->finish_waiters(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE); - file_eval_gather(lock); - break; - - - - - // -- auth -- - case LOCK_AC_LOCKACK: - assert(lock->get_state() == LOCK_GLOCKR || - lock->get_state() == LOCK_GLOCKM || - lock->get_state() == LOCK_GLONERM || - lock->get_state() == LOCK_GLONERR); - assert(lock->is_gathering(from)); - lock->remove_gather(from); - - if (lock->is_gathering()) { - dout(7) << "handle_lock_inode_file " << *in << " from " << from - << ", still gathering " << lock->get_gather_set() << dendl; - } else { - dout(7) << "handle_lock_inode_file " << *in << " from " << from - << ", last one" << dendl; - file_eval_gather(lock); - } - break; - - case LOCK_AC_SYNCACK: - assert(lock->get_state() == LOCK_GSYNCM); - assert(lock->is_gathering(from)); - lock->remove_gather(from); - - /* not used currently - { - // merge data (keep largest size, mtime, etc.) - int off = 0; - in->decode_merge_file_state(m->get_data(), off); - } - */ - - if (lock->is_gathering()) { - dout(7) << "handle_lock_inode_file " << *in << " from " << from - << ", still gathering " << lock->get_gather_set() << dendl; - } else { - dout(7) << "handle_lock_inode_file " << *in << " from " << from - << ", last one" << dendl; - file_eval_gather(lock); - } - break; - - case LOCK_AC_MIXEDACK: - assert(lock->get_state() == LOCK_GMIXEDR); - assert(lock->is_gathering(from)); - lock->remove_gather(from); - - if (lock->is_gathering()) { - dout(7) << "handle_lock_inode_file " << *in << " from " << from - << ", still gathering " << lock->get_gather_set() << dendl; - } else { - dout(7) << "handle_lock_inode_file " << *in << " from " << from - << ", last one" << dendl; - file_eval_gather(lock); - } - break; - - - default: - assert(0); - } - - delete m; -} - - - - - - diff --git a/branches/sage/mds/mds/Locker.h b/branches/sage/mds/mds/Locker.h deleted file mode 100644 index a69055f49449e..0000000000000 --- a/branches/sage/mds/mds/Locker.h +++ 
/dev/null @@ -1,195 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_LOCKER_H -#define __MDS_LOCKER_H - -#include "include/types.h" - -#include -#include -#include -using std::map; -using std::list; -using std::set; - -class MDS; -class CDir; -class CInode; -class CDentry; - -class Message; - -class MDiscover; -class MDiscoverReply; -class MCacheExpire; -class MDirUpdate; -class MDentryUnlink; -class MLock; - -class MClientRequest; - -class Anchor; -class Capability; -class LogSegment; - -class SimpleLock; -class FileLock; -class ScatterLock; -class LocalLock; - -class Locker { -private: - MDS *mds; - MDCache *mdcache; - - public: - Locker(MDS *m, MDCache *c) : mds(m), mdcache(c) {} - - SimpleLock *get_lock(int lock_type, MDSCacheObjectInfo &info); - - void dispatch(Message *m); - void handle_lock(MLock *m); - - -protected: - void send_lock_message(SimpleLock *lock, int msg); - void send_lock_message(SimpleLock *lock, int msg, const bufferlist &data); - - // -- locks -- -public: - bool acquire_locks(MDRequest *mdr, - set &rdlocks, - set &wrlocks, - set &xlocks); - - void drop_locks(MDRequest *mdr); - -protected: - bool rdlock_start(SimpleLock *lock, MDRequest *mdr); - void rdlock_finish(SimpleLock *lock, MDRequest *mdr); - bool xlock_start(SimpleLock *lock, MDRequest *mdr); -public: - void xlock_finish(SimpleLock *lock, MDRequest *mdr); // public for Server's slave UNXLOCK -protected: - bool wrlock_start(SimpleLock *lock, MDRequest *mdr); - void wrlock_finish(SimpleLock *lock, MDRequest *mdr); - -public: - void rejoin_set_state(SimpleLock *lock, int s, list& waiters); - - // 
simple -public: - void try_simple_eval(SimpleLock *lock); - void simple_eval_gather(SimpleLock *lock); - bool simple_rdlock_try(SimpleLock *lock, Context *con); -protected: - void simple_eval(SimpleLock *lock); - void handle_simple_lock(SimpleLock *lock, MLock *m); - void simple_sync(SimpleLock *lock); - void simple_lock(SimpleLock *lock); - bool simple_rdlock_start(SimpleLock *lock, MDRequest *mdr); - void simple_rdlock_finish(SimpleLock *lock, MDRequest *mdr); - bool simple_xlock_start(SimpleLock *lock, MDRequest *mdr); - void simple_xlock_finish(SimpleLock *lock, MDRequest *mdr); - -public: - bool dentry_can_rdlock_trace(vector& trace); - void dentry_anon_rdlock_trace_start(vector& trace); - void dentry_anon_rdlock_trace_finish(vector& trace); - - // scatter -protected: - xlist autoscattered; - -public: - void try_scatter_eval(ScatterLock *lock); - void scatter_eval(ScatterLock *lock); // public for MDCache::adjust_subtree_auth() - void scatter_eval_gather(ScatterLock *lock); - - void scatter_unscatter_autoscattered(); - void scatter_try_unscatter(ScatterLock *lock, Context *c); - void note_autoscattered(ScatterLock *lock); - - void scatter_lock(ScatterLock *lock); // called by LogSegment::try_to_expire - -protected: - void handle_scatter_lock(ScatterLock *lock, MLock *m); - void _scatter_replica_lock(ScatterLock *lock, int auth); - void scatter_sync(ScatterLock *lock); - void scatter_scatter(ScatterLock *lock); - void scatter_tempsync(ScatterLock *lock); - bool scatter_rdlock_start(ScatterLock *lock, MDRequest *mdr); - void scatter_rdlock_finish(ScatterLock *lock, MDRequest *mdr); - bool scatter_wrlock_start(ScatterLock *lock, MDRequest *mdr); - void scatter_wrlock_finish(ScatterLock *lock, MDRequest *mdr); - - void scatter_writebehind(ScatterLock *lock); - class C_Locker_ScatterWB : public Context { - Locker *locker; - ScatterLock *lock; - LogSegment *ls; - public: - C_Locker_ScatterWB(Locker *l, ScatterLock *sl, LogSegment *s) : locker(l), lock(sl), ls(s) {} 
- void finish(int r) { - locker->scatter_writebehind_finish(lock, ls); - } - }; - void scatter_writebehind_finish(ScatterLock *lock, LogSegment *ls); - - // local -protected: - bool local_wrlock_start(LocalLock *lock, MDRequest *mdr); - void local_wrlock_finish(LocalLock *lock, MDRequest *mdr); - bool local_xlock_start(LocalLock *lock, MDRequest *mdr); - void local_xlock_finish(LocalLock *lock, MDRequest *mdr); - - - // file -public: - void file_eval_gather(FileLock *lock); - void try_file_eval(FileLock *lock); -protected: - void file_eval(FileLock *lock); - void handle_file_lock(FileLock *lock, MLock *m); - bool file_sync(FileLock *lock); - void file_lock(FileLock *lock); - void file_mixed(FileLock *lock); - void file_loner(FileLock *lock); - bool file_rdlock_try(FileLock *lock, Context *con); - bool file_rdlock_start(FileLock *lock, MDRequest *mdr); - void file_rdlock_finish(FileLock *lock, MDRequest *mdr); - bool file_xlock_start(FileLock *lock, MDRequest *mdr); - void file_xlock_finish(FileLock *lock, MDRequest *mdr); - - - - // -- file i/o -- - public: - version_t issue_file_data_version(CInode *in); - Capability* issue_new_caps(CInode *in, int mode, MClientRequest *req); - bool issue_caps(CInode *in); - - protected: - void handle_client_file_caps(class MClientFileCaps *m); - - void request_inode_file_caps(CInode *in); - void handle_inode_file_caps(class MInodeFileCaps *m); - - friend class C_MDL_RequestInodeFileCaps; - -}; - - -#endif diff --git a/branches/sage/mds/mds/MDBalancer.cc b/branches/sage/mds/mds/MDBalancer.cc deleted file mode 100644 index 933c8306a7526..0000000000000 --- a/branches/sage/mds/mds/MDBalancer.cc +++ /dev/null @@ -1,1050 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General 
Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "mdstypes.h" - -#include "MDBalancer.h" -#include "MDS.h" -#include "MDSMap.h" -#include "CInode.h" -#include "CDir.h" -#include "MDCache.h" -#include "Migrator.h" - -#include "include/Context.h" -#include "msg/Messenger.h" -#include "messages/MHeartbeat.h" - -#include -#include -using std::map; -using std::vector; - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug_mds || l<=g_conf.debug_mds_balancer) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".bal " - -#define MIN_LOAD 50 // ?? -#define MIN_REEXPORT 5 // will automatically reexport -#define MIN_OFFLOAD 10 // point at which i stop trying, close enough - - - -int MDBalancer::proc_message(Message *m) -{ - switch (m->get_type()) { - - case MSG_MDS_HEARTBEAT: - handle_heartbeat((MHeartbeat*)m); - break; - - default: - dout(1) << " balancer unknown message " << m->get_type() << dendl; - assert(0); - break; - } - - return 0; -} - - - - -void MDBalancer::tick() -{ - static int num_bal_times = g_conf.mds_bal_max; - static utime_t first = g_clock.now(); - utime_t now = g_clock.now(); - utime_t elapsed = now; - elapsed -= first; - - // sample? - if ((double)now - (double)last_sample > g_conf.mds_bal_sample_interval) { - dout(15) << "tick last_sample now " << now << dendl; - last_sample = now; - } - - // balance? - if (last_heartbeat == utime_t()) last_heartbeat = now; - if (true && - mds->get_nodeid() == 0 && - g_conf.mds_bal_interval > 0 && - (num_bal_times || - (g_conf.mds_bal_max_until >= 0 && - elapsed.sec() > g_conf.mds_bal_max_until)) && - mds->is_active() && - now.sec() - last_heartbeat.sec() >= g_conf.mds_bal_interval) { - last_heartbeat = now; - send_heartbeat(); - num_bal_times--; - } - - // hash? 
- if (true && - now.sec() - last_fragment.sec() > g_conf.mds_bal_fragment_interval) { - last_fragment = now; - do_fragmenting(); - } -} - - - - -class C_Bal_SendHeartbeat : public Context { -public: - MDS *mds; - C_Bal_SendHeartbeat(MDS *mds) { - this->mds = mds; - } - virtual void finish(int f) { - mds->balancer->send_heartbeat(); - } -}; - - -double mds_load_t::mds_load() -{ - switch(g_conf.mds_bal_mode) { - case 0: - return - .8 * auth.meta_load() + - .2 * all.meta_load() + - req_rate + - 10.0 * queue_len; - - case 1: - return req_rate + 10.0*queue_len; - - case 2: - return cpu_load_avg; - - } - assert(0); - return 0; -} - -mds_load_t MDBalancer::get_load() -{ - mds_load_t load; - - if (mds->mdcache->get_root()) { - list ls; - mds->mdcache->get_root()->get_dirfrags(ls); - for (list::iterator p = ls.begin(); - p != ls.end(); - p++) { - load.auth += (*p)->pop_auth_subtree_nested; - load.all += (*p)->pop_nested; - } - } else { - dout(20) << "get_load no root, no load" << dendl; - } - - load.req_rate = mds->get_req_rate(); - load.queue_len = mds->messenger->get_dispatch_queue_len(); - - ifstream cpu("/proc/loadavg"); - if (cpu.is_open()) - cpu >> load.cpu_load_avg; - - dout(15) << "get_load " << load << dendl; - return load; -} - -void MDBalancer::send_heartbeat() -{ - utime_t now = g_clock.now(); - if (!mds->mdcache->get_root()) { - dout(5) << "no root on send_heartbeat" << dendl; - mds->mdcache->open_root(new C_Bal_SendHeartbeat(mds)); - return; - } - - mds_load.clear(); - if (mds->get_nodeid() == 0) - beat_epoch++; - - // my load - mds_load_t load = get_load(); - mds_load[ mds->get_nodeid() ] = load; - - // import_map -- how much do i import from whom - map import_map; - set authsubs; - mds->mdcache->get_auth_subtrees(authsubs); - for (set::iterator it = authsubs.begin(); - it != authsubs.end(); - it++) { - CDir *im = *it; - int from = im->inode->authority().first; - if (from == mds->get_nodeid()) continue; - if (im->get_inode()->is_stray()) continue; - 
import_map[from] += im->pop_auth_subtree.meta_load(now); - } - mds_import_map[ mds->get_nodeid() ] = import_map; - - - dout(5) << "mds" << mds->get_nodeid() << " epoch " << beat_epoch << " load " << load << dendl; - for (map::iterator it = import_map.begin(); - it != import_map.end(); - it++) { - dout(5) << " import_map from " << it->first << " -> " << it->second << dendl; - } - - - set up; - mds->get_mds_map()->get_in_mds_set(up); - for (set::iterator p = up.begin(); p != up.end(); ++p) { - if (*p == mds->get_nodeid()) continue; - MHeartbeat *hb = new MHeartbeat(load, beat_epoch); - hb->get_import_map() = import_map; - mds->messenger->send_message(hb, - mds->mdsmap->get_inst(*p), - MDS_PORT_BALANCER, MDS_PORT_BALANCER); - } -} - -void MDBalancer::handle_heartbeat(MHeartbeat *m) -{ - dout(25) << "=== got heartbeat " << m->get_beat() << " from " << m->get_source().num() << " " << m->get_load() << dendl; - - if (!mds->is_active()) - return; - - if (!mds->mdcache->get_root()) { - dout(10) << "opening root on handle_heartbeat" << dendl; - mds->mdcache->open_root(new C_MDS_RetryMessage(mds, m)); - return; - } - - int who = m->get_source().num(); - - if (who == 0) { - dout(20) << " from mds0, new epoch" << dendl; - beat_epoch = m->get_beat(); - send_heartbeat(); - - show_imports(); - } - - mds_load[ who ] = m->get_load(); - mds_import_map[ who ] = m->get_import_map(); - - //dout(0) << " load is " << load << " have " << mds_load.size() << dendl; - - unsigned cluster_size = mds->get_mds_map()->get_num_in_mds(); - if (mds_load.size() == cluster_size) { - // let's go! - //export_empties(); // no! 
- do_rebalance(m->get_beat()); - } - - // done - delete m; -} - - -void MDBalancer::export_empties() -{ - dout(5) << "export_empties checking for empty imports" << dendl; - - for (map >::iterator it = mds->mdcache->subtrees.begin(); - it != mds->mdcache->subtrees.end(); - it++) { - CDir *dir = it->first; - if (!dir->is_auth() || - dir->is_ambiguous_auth() || - dir->is_freezing() || - dir->is_frozen()) - continue; - - if (!dir->inode->is_root() && dir->get_size() == 0) - mds->mdcache->migrator->export_empty_import(dir); - } -} - - - -double MDBalancer::try_match(int ex, double& maxex, - int im, double& maxim) -{ - if (maxex <= 0 || maxim <= 0) return 0.0; - - double howmuch = MIN(maxex, maxim); - if (howmuch <= 0) return 0.0; - - dout(5) << " - mds" << ex << " exports " << howmuch << " to mds" << im << dendl; - - if (ex == mds->get_nodeid()) - my_targets[im] += howmuch; - - exported[ex] += howmuch; - imported[im] += howmuch; - - maxex -= howmuch; - maxim -= howmuch; - - return howmuch; -} - - - -void MDBalancer::do_fragmenting() -{ - if (split_queue.empty()) { - dout(20) << "do_fragmenting has nothing to do" << dendl; - return; - } - - dout(0) << "do_fragmenting " << split_queue.size() << " dirs marked for possible splitting" << dendl; - - for (set::iterator i = split_queue.begin(); - i != split_queue.end(); - i++) { - CDir *dir = mds->mdcache->get_dirfrag(*i); - if (!dir) continue; - if (!dir->is_auth()) continue; - - dout(0) << "do_fragmenting splitting " << *dir << dendl; - mds->mdcache->split_dir(dir, 4); - } - split_queue.clear(); -} - - - -void MDBalancer::do_rebalance(int beat) -{ - int cluster_size = mds->get_mds_map()->get_num_mds(); - int whoami = mds->get_nodeid(); - utime_t now = g_clock.now(); - - dump_pop_map(); - - // reset - my_targets.clear(); - imported.clear(); - exported.clear(); - - dout(5) << " do_rebalance: cluster loads are" << dendl; - - mds->mdcache->migrator->clear_export_queue(); - - // rescale! 
turn my mds_load back into meta_load units - double load_fac = 1.0; - if (mds_load[whoami].mds_load() > 0) { - double metald = mds_load[whoami].auth.meta_load(now); - double mdsld = mds_load[whoami].mds_load(); - load_fac = metald / mdsld; - dout(7) << " load_fac is " << load_fac - << " <- " << mds_load[whoami].auth << " " << metald - << " / " << mdsld - << dendl; - } - - double total_load = 0; - multimap load_map; - for (int i=0; i( l, i )); - } - - // target load - target_load = total_load / (double)cluster_size; - dout(5) << "do_rebalance: my load " << my_load - << " target " << target_load - << " total " << total_load - << dendl; - - // under or over? - if (my_load < target_load * (1.0 + g_conf.mds_bal_min_rebalance)) { - dout(5) << " i am underloaded or barely overloaded, doing nothing." << dendl; - last_epoch_under = beat_epoch; - show_imports(); - return; - } - - last_epoch_over = beat_epoch; - - // am i over long enough? - if (last_epoch_under && beat_epoch - last_epoch_under < 2) { - dout(5) << " i am overloaded, but only for " << (beat_epoch - last_epoch_under) << " epochs" << dendl; - return; - } - - dout(5) << " i am sufficiently overloaded" << dendl; - - - // first separate exporters and importers - multimap importers; - multimap exporters; - set importer_set; - set exporter_set; - - for (multimap::iterator it = load_map.begin(); - it != load_map.end(); - it++) { - if (it->first < target_load) { - dout(15) << " mds" << it->second << " is importer" << dendl; - importers.insert(pair(it->first,it->second)); - importer_set.insert(it->second); - } else { - dout(15) << " mds" << it->second << " is exporter" << dendl; - exporters.insert(pair(it->first,it->second)); - exporter_set.insert(it->second); - } - } - - - // determine load transfer mapping - - if (true) { - // analyze import_map; do any matches i can - - dout(15) << " matching exporters to import sources" << dendl; - - // big -> small exporters - for (multimap::reverse_iterator ex = 
exporters.rbegin(); - ex != exporters.rend(); - ex++) { - double maxex = get_maxex(ex->second); - if (maxex <= .001) continue; - - // check importers. for now, just in arbitrary order (no intelligent matching). - for (map::iterator im = mds_import_map[ex->second].begin(); - im != mds_import_map[ex->second].end(); - im++) { - double maxim = get_maxim(im->first); - if (maxim <= .001) continue; - try_match(ex->second, maxex, - im->first, maxim); - if (maxex <= .001) break;; - } - } - } - - - if (1) { - if (beat % 2 == 1) { - // old way - dout(15) << " matching big exporters to big importers" << dendl; - // big exporters to big importers - multimap::reverse_iterator ex = exporters.rbegin(); - multimap::iterator im = importers.begin(); - while (ex != exporters.rend() && - im != importers.end()) { - double maxex = get_maxex(ex->second); - double maxim = get_maxim(im->second); - if (maxex < .001 || maxim < .001) break; - try_match(ex->second, maxex, - im->second, maxim); - if (maxex <= .001) ex++; - if (maxim <= .001) im++; - } - } else { - // new way - dout(15) << " matching small exporters to big importers" << dendl; - // small exporters to big importers - multimap::iterator ex = exporters.begin(); - multimap::iterator im = importers.begin(); - while (ex != exporters.end() && - im != importers.end()) { - double maxex = get_maxex(ex->second); - double maxim = get_maxim(im->second); - if (maxex < .001 || maxim < .001) break; - try_match(ex->second, maxex, - im->second, maxim); - if (maxex <= .001) ex++; - if (maxim <= .001) im++; - } - } - } - - - - // make a sorted list of my imports - map import_pop_map; - multimap import_from_map; - set fullauthsubs; - - mds->mdcache->get_fullauth_subtrees(fullauthsubs); - for (set::iterator it = fullauthsubs.begin(); - it != fullauthsubs.end(); - it++) { - CDir *im = *it; - if (im->get_inode()->is_stray()) continue; - - double pop = im->pop_auth_subtree.meta_load(now); - if (g_conf.mds_bal_idle_threshold > 0 && - pop < 
g_conf.mds_bal_idle_threshold && - im->inode != mds->mdcache->get_root() && - im->inode->authority().first != mds->get_nodeid()) { - dout(-5) << " exporting idle (" << pop << ") import " << *im - << " back to mds" << im->inode->authority().first - << dendl; - mds->mdcache->migrator->export_dir_nicely(im, im->inode->authority().first); - continue; - } - - import_pop_map[ pop ] = im; - int from = im->inode->authority().first; - dout(15) << " map: i imported " << *im << " from " << from << dendl; - import_from_map.insert(pair(from, im)); - } - - - - // do my exports! - set already_exporting; - double total_sent = 0; - double total_goal = 0; - - for (map::iterator it = my_targets.begin(); - it != my_targets.end(); - it++) { - - /* - double fac = 1.0; - if (false && total_goal > 0 && total_sent > 0) { - fac = total_goal / total_sent; - dout(-5) << " total sent is " << total_sent << " / " << total_goal << " -> fac 1/ " << fac << dendl; - if (fac > 1.0) fac = 1.0; - } - fac = .9 - .4 * ((float)g_conf.num_mds / 128.0); // hack magic fixme - */ - - int target = (*it).first; - double amount = (*it).second; - total_goal += amount; - - if (amount < MIN_OFFLOAD) continue; - if (amount / target_load < .2) continue; - - dout(5) << "want to send " << amount << " to mds" << target - //<< " .. " << (*it).second << " * " << load_fac - << " -> " << amount - << dendl;//" .. 
fudge is " << fudge << dendl; - double have = 0; - - - show_imports(); - - // search imports from target - if (import_from_map.count(target)) { - dout(5) << " aha, looking through imports from target mds" << target << dendl; - pair::iterator, multimap::iterator> p = - import_from_map.equal_range(target); - while (p.first != p.second) { - CDir *dir = (*p.first).second; - dout(5) << "considering " << *dir << " from " << (*p.first).first << dendl; - multimap::iterator plast = p.first++; - - if (dir->inode->is_root()) continue; - if (dir->is_freezing() || dir->is_frozen()) continue; // export pbly already in progress - double pop = dir->pop_auth_subtree.meta_load(now); - assert(dir->inode->authority().first == target); // cuz that's how i put it in the map, dummy - - if (pop <= amount-have) { - dout(-5) << "reexporting " << *dir - << " pop " << pop - << " back to mds" << target << dendl; - mds->mdcache->migrator->export_dir_nicely(dir, target); - have += pop; - import_from_map.erase(plast); - import_pop_map.erase(pop); - } else { - dout(5) << "can't reexport " << *dir << ", too big " << pop << dendl; - } - if (amount-have < MIN_OFFLOAD) break; - } - } - if (amount-have < MIN_OFFLOAD) { - total_sent += have; - continue; - } - - // any other imports - if (false) - for (map::iterator import = import_pop_map.begin(); - import != import_pop_map.end(); - import++) { - CDir *imp = (*import).second; - if (imp->inode->is_root()) continue; - - double pop = (*import).first; - if (pop < amount-have || pop < MIN_REEXPORT) { - dout(-5) << "reexporting " << *imp - << " pop " << pop - << " back to mds" << imp->inode->authority() - << dendl; - have += pop; - mds->mdcache->migrator->export_dir_nicely(imp, imp->inode->authority().first); - } - if (amount-have < MIN_OFFLOAD) break; - } - if (amount-have < MIN_OFFLOAD) { - //fudge = amount-have; - total_sent += have; - continue; - } - - // okay, search for fragments of my workload - set candidates; - 
mds->mdcache->get_fullauth_subtrees(candidates); - - list exports; - - for (set::iterator pot = candidates.begin(); - pot != candidates.end(); - pot++) { - if ((*pot)->get_inode()->is_stray()) continue; - find_exports(*pot, amount, exports, have, already_exporting, now); - if (have > amount-MIN_OFFLOAD) - break; - } - //fudge = amount - have; - total_sent += have; - - for (list::iterator it = exports.begin(); it != exports.end(); it++) { - dout(-5) << " - exporting " - << (*it)->pop_auth_subtree - << " " - << (*it)->pop_auth_subtree.meta_load(now) - << " to mds" << target - << " " << **it - << dendl; - mds->mdcache->migrator->export_dir_nicely(*it, target); - } - } - - dout(5) << "rebalance done" << dendl; - show_imports(); - -} - - - -void MDBalancer::find_exports(CDir *dir, - double amount, - list& exports, - double& have, - set& already_exporting, - utime_t now) -{ - double need = amount - have; - if (need < amount * g_conf.mds_bal_min_start) - return; // good enough! - double needmax = need * g_conf.mds_bal_need_max; - double needmin = need * g_conf.mds_bal_need_min; - double midchunk = need * g_conf.mds_bal_midchunk; - double minchunk = need * g_conf.mds_bal_minchunk; - - list bigger_rep, bigger_unrep; - multimap smaller; - - double dir_pop = dir->pop_auth_subtree.meta_load(now); - dout(7) << " find_exports in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << dendl; - - double subdir_sum = 0; - for (CDir::map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CInode *in = it->second->get_inode(); - if (!in) continue; - if (!in->is_dir()) continue; - - list dfls; - in->get_dirfrags(dfls); - for (list::iterator p = dfls.begin(); - p != dfls.end(); - ++p) { - CDir *subdir = *p; - if (!subdir->is_auth()) continue; - if (already_exporting.count(subdir)) continue; - - if (subdir->is_frozen()) continue; // can't export this right now! - - // how popular? 
- double pop = subdir->pop_auth_subtree.meta_load(now); - subdir_sum += pop; - dout(15) << " subdir pop " << pop << " " << *subdir << dendl; - - if (pop < minchunk) continue; - - // lucky find? - if (pop > needmin && pop < needmax) { - exports.push_back(subdir); - already_exporting.insert(subdir); - have += pop; - return; - } - - if (pop > need) { - if (subdir->is_rep()) - bigger_rep.push_back(subdir); - else - bigger_unrep.push_back(subdir); - } else - smaller.insert(pair(pop, subdir)); - } - } - dout(15) << " sum " << subdir_sum << " / " << dir_pop << dendl; - - // grab some sufficiently big small items - multimap::reverse_iterator it; - for (it = smaller.rbegin(); - it != smaller.rend(); - it++) { - - if ((*it).first < midchunk) - break; // try later - - dout(7) << " taking smaller " << *(*it).second << dendl; - - exports.push_back((*it).second); - already_exporting.insert((*it).second); - have += (*it).first; - if (have > needmin) - return; - } - - // apprently not enough; drill deeper into the hierarchy (if non-replicated) - for (list::iterator it = bigger_unrep.begin(); - it != bigger_unrep.end(); - it++) { - dout(15) << " descending into " << **it << dendl; - find_exports(*it, amount, exports, have, already_exporting, now); - if (have > needmin) - return; - } - - // ok fine, use smaller bits - for (; - it != smaller.rend(); - it++) { - dout(7) << " taking (much) smaller " << it->first << " " << *(*it).second << dendl; - - exports.push_back((*it).second); - already_exporting.insert((*it).second); - have += (*it).first; - if (have > needmin) - return; - } - - // ok fine, drill into replicated dirs - for (list::iterator it = bigger_rep.begin(); - it != bigger_rep.end(); - it++) { - dout(7) << " descending into replicated " << **it << dendl; - find_exports(*it, amount, exports, have, already_exporting, now); - if (have > needmin) - return; - } - -} - - - - -void MDBalancer::hit_inode(utime_t now, CInode *in, int type, int who) -{ - // hit inode - 
in->pop.get(type).hit(now); - - if (in->get_parent_dn()) - hit_dir(now, in->get_parent_dn()->get_dir(), type, who); -} -/* - // hit me - in->popularity[MDS_POP_JUSTME].pop[type].hit(now); - in->popularity[MDS_POP_NESTED].pop[type].hit(now); - if (in->is_auth()) { - in->popularity[MDS_POP_CURDOM].pop[type].hit(now); - in->popularity[MDS_POP_ANYDOM].pop[type].hit(now); - - dout(20) << "hit_inode " << type << " pop " - << in->popularity[MDS_POP_JUSTME].pop[type].get(now) << " me, " - << in->popularity[MDS_POP_NESTED].pop[type].get(now) << " nested, " - << in->popularity[MDS_POP_CURDOM].pop[type].get(now) << " curdom, " - << in->popularity[MDS_POP_CURDOM].pop[type].get(now) << " anydom" - << " on " << *in - << dendl; - } else { - dout(20) << "hit_inode " << type << " pop " - << in->popularity[MDS_POP_JUSTME].pop[type].get(now) << " me, " - << in->popularity[MDS_POP_NESTED].pop[type].get(now) << " nested, " - << " on " << *in - << dendl; - } - - // hit auth up to import - CDir *dir = in->get_parent_dir(); - if (dir) hit_dir(now, dir, type); -*/ - - -void MDBalancer::hit_dir(utime_t now, CDir *dir, int type, int who, double amount) -{ - // hit me - double v = dir->pop_me.get(type).hit(now, amount); - - //if (dir->ino() == inodeno_t(0x10000000000)) - //dout(0) << "hit_dir " << type << " pop " << v << " in " << *dir << dendl; - - // hit modify counter, if this was a modify - if (g_conf.num_mds > 2 && // FIXME >2 thing - !dir->inode->is_root() && // not root (for now at least) - dir->is_auth() && - - ((g_conf.mds_bal_split_size > 0 && - dir->get_size() > (unsigned)g_conf.mds_bal_split_size) || - (v > g_conf.mds_bal_split_rd && type == META_POP_IRD) || - (v > g_conf.mds_bal_split_wr && type == META_POP_IWR)) && - split_queue.count(dir->dirfrag()) == 0) { - dout(0) << "hit_dir " << type << " pop is " << v << ", putting in split_queue: " << *dir << dendl; - split_queue.insert(dir->dirfrag()); - } - - // replicate? 
- if (type == META_POP_IRD && who >= 0) { - dir->pop_spread.hit(now, who); - } - - double rd_adj = 0; - if (type == META_POP_IRD && - dir->last_popularity_sample < last_sample) { - float dir_pop = dir->pop_auth_subtree.get(type).get(now); // hmm?? - dir->last_popularity_sample = last_sample; - float pop_sp = dir->pop_spread.get(now); - dir_pop += pop_sp * 10; - - //if (dir->ino() == inodeno_t(0x10000000002)) - if (pop_sp > 0) { - dout(20) << "hit_dir " << type << " pop " << dir_pop << " spread " << pop_sp - << " " << dir->pop_spread.last[0] - << " " << dir->pop_spread.last[1] - << " " << dir->pop_spread.last[2] - << " " << dir->pop_spread.last[3] - << " in " << *dir << dendl; - } - - if (dir->is_auth() && !dir->is_ambiguous_auth()) { - if (!dir->is_rep() && - dir_pop >= g_conf.mds_bal_replicate_threshold) { - // replicate - float rdp = dir->pop_me.get(META_POP_IRD).get(now); - rd_adj = rdp / mds->get_mds_map()->get_num_mds() - rdp; - rd_adj /= 2.0; // temper somewhat - - dout(0) << "replicating dir " << *dir << " pop " << dir_pop << " .. 
rdp " << rdp << " adj " << rd_adj << dendl; - - dir->dir_rep = CDir::REP_ALL; - mds->mdcache->send_dir_updates(dir, true); - - // fixme this should adjust the whole pop hierarchy - dir->pop_me.get(META_POP_IRD).adjust(rd_adj); - dir->pop_auth_subtree.get(META_POP_IRD).adjust(rd_adj); - } - - if (dir->ino() != 1 && - dir->is_rep() && - dir_pop < g_conf.mds_bal_unreplicate_threshold) { - // unreplicate - dout(0) << "unreplicating dir " << *dir << " pop " << dir_pop << dendl; - - dir->dir_rep = CDir::REP_NONE; - mds->mdcache->send_dir_updates(dir); - } - } - } - - // adjust ancestors - bool hit_subtree = dir->is_auth(); // current auth subtree (if any) - bool hit_subtree_nested = dir->is_auth(); // all nested auth subtrees - - while (1) { - dir->pop_nested.get(type).hit(now, amount); - if (rd_adj != 0.0) - dir->pop_nested.get(META_POP_IRD).adjust(now, rd_adj); - - if (hit_subtree) { - dir->pop_auth_subtree.get(type).hit(now, amount); - if (rd_adj != 0.0) - dir->pop_auth_subtree.get(META_POP_IRD).adjust(now, rd_adj); - } - - if (hit_subtree_nested) { - dir->pop_auth_subtree_nested.get(type).hit(now, amount); - if (rd_adj != 0.0) - dir->pop_auth_subtree_nested.get(META_POP_IRD).adjust(now, rd_adj); - } - - if (dir->is_subtree_root()) - hit_subtree = false; // end of auth domain, stop hitting auth counters. - - if (dir->inode->get_parent_dn() == 0) break; - dir = dir->inode->get_parent_dn()->get_dir(); - } -} - - -/* - * subtract off an exported chunk. - * this excludes *dir itself (encode_export_dir should have take care of that) - * we _just_ do the parents' nested counters. - * - * NOTE: call me _after_ forcing *dir into a subtree root, - * but _before_ doing the encode_export_dirs. 
- */ -void MDBalancer::subtract_export(CDir *dir) -{ - dirfrag_load_vec_t subload = dir->pop_auth_subtree; - - while (true) { - dir = dir->inode->get_parent_dir(); - if (!dir) break; - - dir->pop_nested -= subload; - dir->pop_auth_subtree_nested -= subload; - } -} - - -void MDBalancer::add_import(CDir *dir) -{ - dirfrag_load_vec_t subload = dir->pop_auth_subtree; - - while (true) { - dir = dir->inode->get_parent_dir(); - if (!dir) break; - - dir->pop_nested += subload; - dir->pop_auth_subtree_nested += subload; - } -} - - - - - - -void MDBalancer::show_imports(bool external) -{ - mds->mdcache->show_subtrees(); -} - - -void MDBalancer::dump_pop_map() -{ - return; // this is dumb - - - char fn[20]; - sprintf(fn, "popdump.%d.mds%d", beat_epoch, mds->get_nodeid()); - - dout(1) << "dump_pop_map to " << fn << dendl; - - ofstream myfile; - myfile.open(fn); - - list iq; - if (mds->mdcache->root) - iq.push_back(mds->mdcache->root); - - utime_t now = g_clock.now(); - while (!iq.empty()) { - CInode *in = iq.front(); - iq.pop_front(); - - // pop stats - /*for (int a=0; apopularity[a].pop[b].get(now) << "\t"; - */ - - // recurse, depth-first. - if (in->is_dir()) { - - list dirs; - in->get_dirfrags(dirs); - for (list::iterator p = dirs.begin(); - p != dirs.end(); - ++p) { - CDir *dir = *p; - - myfile << (int)dir->pop_me.meta_load(now) << "\t"; - myfile << (int)dir->pop_nested.meta_load(now) << "\t"; - myfile << (int)dir->pop_auth_subtree.meta_load(now) << "\t"; - myfile << (int)dir->pop_auth_subtree_nested.meta_load(now) << "\t"; - - // filename last - string p; - in->make_path_string(p); - myfile << "." << p; - if (dir->get_frag() != frag_t()) - myfile << "___" << (unsigned)dir->get_frag(); - myfile << std::endl; //"/" << dir->get_frag() << dendl; - - // add contents - for (CDir::map_t::iterator q = dir->items.begin(); - q != dir->items.end(); - q++) - if (q->second->is_primary()) - iq.push_front(q->second->get_inode()); - } - } - - } - - myfile.close(); -} - - - -/* replicate? 
- - float dir_pop = dir->get_popularity(); - - if (dir->is_auth()) { - if (!dir->is_rep() && - dir_pop >= g_conf.mds_bal_replicate_threshold) { - // replicate - dout(5) << "replicating dir " << *in << " pop " << dir_pop << dendl; - - dir->dir_rep = CDIR_REP_ALL; - mds->mdcache->send_dir_updates(dir); - } - - if (dir->is_rep() && - dir_pop < g_conf.mds_bal_unreplicate_threshold) { - // unreplicate - dout(5) << "unreplicating dir " << *in << " pop " << dir_pop << dendl; - - dir->dir_rep = CDIR_REP_NONE; - mds->mdcache->send_dir_updates(dir); - } - } - -*/ diff --git a/branches/sage/mds/mds/MDBalancer.h b/branches/sage/mds/mds/MDBalancer.h deleted file mode 100644 index 819c69f0616c0..0000000000000 --- a/branches/sage/mds/mds/MDBalancer.h +++ /dev/null @@ -1,118 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __MDBALANCER_H -#define __MDBALANCER_H - -#include -#include -using std::list; -using std::map; - -#include -using namespace __gnu_cxx; - -#include "include/types.h" -#include "common/Clock.h" -#include "CInode.h" - - -class MDS; -class Message; -class MHeartbeat; -class CInode; -class Context; -class CDir; - -class MDBalancer { - protected: - MDS *mds; - int beat_epoch; - - int last_epoch_under; - int last_epoch_over; - - utime_t last_heartbeat; - utime_t last_fragment; - utime_t last_sample; - - - // todo - set split_queue; - - // per-epoch scatter/gathered info - hash_map mds_load; - hash_map mds_meta_load; - map > mds_import_map; - - // per-epoch state - double my_load, target_load; - map my_targets; - map imported; - map exported; - - double try_match(int ex, double& maxex, - int im, double& maxim); - double get_maxim(int im) { - return target_load - mds_meta_load[im] - imported[im]; - } - double get_maxex(int ex) { - return mds_meta_load[ex] - target_load - exported[ex]; - } - - public: - MDBalancer(MDS *m) : - mds(m), - beat_epoch(0), - last_epoch_under(0), last_epoch_over(0) { } - - mds_load_t get_load(); - - int proc_message(Message *m); - - void send_heartbeat(); - void handle_heartbeat(MHeartbeat *m); - - void tick(); - - void do_fragmenting(); - - void export_empties(); - void do_rebalance(int beat); - void find_exports(CDir *dir, - double amount, - list& exports, - double& have, - set& already_exporting, - utime_t now); - - - void subtract_export(class CDir *ex); - void add_import(class CDir *im); - - void hit_inode(utime_t now, class CInode *in, int type, int who=-1); - void hit_dir(utime_t now, class CDir *dir, int type, int who, double amount=1.0); - void hit_recursive(utime_t now, class CDir *dir, int type, double amount, double rd_adj); - - - void show_imports(bool external=false); - void dump_pop_map(); - -}; - - - -#endif diff --git a/branches/sage/mds/mds/MDCache.cc b/branches/sage/mds/mds/MDCache.cc deleted file mode 
100644 index 2ac319a4d439a..0000000000000 --- a/branches/sage/mds/mds/MDCache.cc +++ /dev/null @@ -1,6444 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "MDCache.h" -#include "MDS.h" -#include "Server.h" -#include "Locker.h" -#include "MDLog.h" -#include "MDBalancer.h" -#include "AnchorClient.h" -#include "Migrator.h" - -#include "MDSMap.h" - -#include "CInode.h" -#include "CDir.h" - -#include "include/filepath.h" - -#include "msg/Message.h" -#include "msg/Messenger.h" - -#include "common/Logger.h" - -#include "osdc/Filer.h" - -#include "events/ESubtreeMap.h" -#include "events/EUpdate.h" -#include "events/ESlaveUpdate.h" -#include "events/EString.h" -#include "events/EPurgeFinish.h" -#include "events/EImportFinish.h" -#include "events/EFragment.h" - -#include "messages/MGenericMessage.h" - -#include "messages/MMDSResolve.h" -#include "messages/MMDSResolveAck.h" -#include "messages/MMDSCacheRejoin.h" - -#include "messages/MDiscover.h" -#include "messages/MDiscoverReply.h" - -//#include "messages/MInodeUpdate.h" -#include "messages/MDirUpdate.h" -#include "messages/MCacheExpire.h" - -#include "messages/MInodeFileCaps.h" - -#include "messages/MLock.h" -#include "messages/MDentryUnlink.h" - -#include "messages/MClientRequest.h" -#include "messages/MClientFileCaps.h" - -#include "messages/MMDSSlaveRequest.h" - -#include "messages/MMDSFragmentNotify.h" - - -#include "IdAllocator.h" - -#include "common/Timer.h" - -#include -#include -#include -#include -#include -using namespace std; - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) *_dout << 
dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".cache " - - - - -MDCache::MDCache(MDS *m) -{ - mds = m; - migrator = new Migrator(mds, this); - // renamer = new Renamer(mds, this); - root = NULL; - stray = NULL; - lru.lru_set_max(g_conf.mds_cache_size); - lru.lru_set_midpoint(g_conf.mds_cache_mid); - - did_shutdown_log_cap = false; -} - -MDCache::~MDCache() -{ - delete migrator; - //delete renamer; -} - - - -void MDCache::log_stat(Logger *logger) -{ - if (get_root()) { - utime_t now = g_clock.now(); - //logger->set("pop", (int)get_root()->pop_nested.meta_load(now)); - //logger->set("popauth", (int)get_root()->pop_auth_subtree_nested.meta_load(now)); - } - logger->set("c", lru.lru_get_size()); - logger->set("cpin", lru.lru_get_num_pinned()); - logger->set("ctop", lru.lru_get_top()); - logger->set("cbot", lru.lru_get_bot()); - logger->set("cptail", lru.lru_get_pintail()); -} - - -// - -bool MDCache::shutdown() -{ - if (lru.lru_get_size() > 0) { - dout(7) << "WARNING: mdcache shutodwn with non-empty cache" << dendl; - //show_cache(); - show_subtrees(); - //dump(); - } - return true; -} - - -// ==================================================================== -// some inode functions - -CInode *MDCache::create_inode() -{ - CInode *in = new CInode(this); - - // zero - memset(&in->inode, 0, sizeof(inode_t)); - - // assign ino - in->inode.ino = mds->idalloc->alloc_id(); - - in->inode.nlink = 1; // FIXME - - in->inode.layout = g_OSD_FileLayout; - - add_inode(in); // add - return in; -} - - -void MDCache::add_inode(CInode *in) -{ - // add to lru, inode map - assert(inode_map.count(in->ino()) == 0); // should be no dup inos! 
- inode_map[ in->ino() ] = in; - - if (in->ino() < MDS_INO_BASE) { - base_inodes.insert(in); - if (in->ino() == MDS_INO_ROOT) - set_root(in); - if (in->ino() == MDS_INO_STRAY(mds->get_nodeid())) - stray = in; - } -} - -void MDCache::remove_inode(CInode *o) -{ - dout(14) << "remove_inode " << *o << dendl; - - if (o->get_parent_dn()) { - // FIXME: multiple parents? - CDentry *dn = o->get_parent_dn(); - assert(!dn->is_dirty()); - dn->dir->unlink_inode(dn); // leave dentry ... FIXME? - } - - // remove from inode map - inode_map.erase(o->ino()); - - if (o->ino() < MDS_INO_BASE) { - assert(base_inodes.count(o)); - base_inodes.erase(o); - - if (o == root) root = 0; - if (o == stray) stray = 0; - } - - // delete it - delete o; -} - - - -CInode *MDCache::create_root_inode() -{ - CInode *root = new CInode(this); - memset(&root->inode, 0, sizeof(inode_t)); - root->inode.ino = MDS_INO_ROOT; - - // make it up (FIXME) - root->inode.mode = 0755 | S_IFDIR; - root->inode.size = 0; - root->inode.ctime = - root->inode.mtime = g_clock.now(); - - root->inode.nlink = 1; - root->inode.layout = g_OSD_MDDirLayout; - - root->inode_auth = pair(0, CDIR_AUTH_UNKNOWN); - - add_inode( root ); - - return root; -} - - -void MDCache::open_root(Context *c) -{ - int whoami = mds->get_nodeid(); - - // open root inode - if (whoami == 0) { - // i am root inode - CInode *root = create_root_inode(); - - // root directory too - CDir *dir = root->get_or_open_dirfrag(this, frag_t()); - adjust_subtree_auth(dir, 0); - dir->dir_rep = CDir::REP_ALL; //NONE; - - show_subtrees(); - - if (c) { - c->finish(0); - delete c; - } - } else { - // request inode from root mds - discover_base_ino(MDS_INO_ROOT, c, 0); - } -} - -CInode *MDCache::create_stray_inode(int whose) -{ - if (whose < 0) whose = mds->get_nodeid(); - - CInode *in = new CInode(this, whose == mds->get_nodeid()); - memset(&in->inode, 0, sizeof(inode_t)); - in->inode.ino = MDS_INO_STRAY(whose); - - // make it up (FIXME) - in->inode.mode = 0755 | S_IFDIR; - 
in->inode.size = 0; - in->inode.ctime = - in->inode.mtime = g_clock.now(); - - in->inode.nlink = 1; - in->inode.layout = g_OSD_MDDirLayout; - - add_inode( in ); - - return in; -} - -void MDCache::open_local_stray() -{ - create_stray_inode(); - CDir *dir = stray->get_or_open_dirfrag(this, frag_t()); - adjust_subtree_auth(dir, mds->get_nodeid()); -} - -void MDCache::open_foreign_stray(int who, Context *c) -{ - inodeno_t ino = MDS_INO_STRAY(who); - dout(10) << "open_foreign_stray mds" << who << " " << ino << dendl; - assert(!have_inode(ino)); - - discover_base_ino(ino, c, who); -} - - -CDentry *MDCache::get_or_create_stray_dentry(CInode *in) -{ - string straydname; - in->name_stray_dentry(straydname); - - if (!stray) create_stray_inode(mds->get_nodeid()); - - frag_t fg = stray->pick_dirfrag(straydname); - - CDir *straydir = stray->get_or_open_dirfrag(this, fg); - - CDentry *straydn = straydir->lookup(straydname); - if (!straydn) - straydn = straydir->add_null_dentry(straydname); - - return straydn; -} - - - -MDSCacheObject *MDCache::get_object(MDSCacheObjectInfo &info) -{ - // inode? - if (info.ino) - return get_inode(info.ino); - - // dir or dentry. - CDir *dir = get_dirfrag(info.dirfrag); - if (!dir) return 0; - - if (info.dname.length()) - return dir->lookup(info.dname); - else - return dir; -} - - - - -// ==================================================================== -// subtree management - -void MDCache::list_subtrees(list& ls) -{ - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) - ls.push_back(p->first); -} - -/* - * adjust the dir_auth of a subtree. - * merge with parent and/or child subtrees, if is it appropriate. - * merge can ONLY happen if both parent and child have unambiguous auth. 
- */ -void MDCache::adjust_subtree_auth(CDir *dir, pair auth) -{ - dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth - << " on " << *dir << dendl; - - show_subtrees(); - - CDir *root; - if (dir->ino() < MDS_INO_BASE) { - root = dir; // bootstrap hack. - if (subtrees.count(root) == 0) { - subtrees[root].clear(); - root->get(CDir::PIN_SUBTREE); - } - } else { - root = get_subtree_root(dir); // subtree root - } - assert(root); - assert(subtrees.count(root)); - dout(7) << " current root is " << *root << dendl; - - if (root == dir) { - // i am already a subtree. - dir->set_dir_auth(auth); - } else { - // i am a new subtree. - dout(10) << " new subtree at " << *dir << dendl; - assert(subtrees.count(dir) == 0); - subtrees[dir].clear(); // create empty subtree bounds list for me. - dir->get(CDir::PIN_SUBTREE); - - // set dir_auth - dir->set_dir_auth(auth); - - // move items nested beneath me, under me. - set::iterator p = subtrees[root].begin(); - while (p != subtrees[root].end()) { - set::iterator next = p; - next++; - if (get_subtree_root((*p)->get_parent_dir()) == dir) { - // move under me - dout(10) << " claiming child bound " << **p << dendl; - subtrees[dir].insert(*p); - subtrees[root].erase(p); - } - p = next; - } - - // i am a bound of the parent subtree. - subtrees[root].insert(dir); - - // i am now the subtree root. 
- root = dir; - - // adjust recursive pop counters - if (dir->is_auth()) { - CDir *p = dir->get_parent_dir(); - while (p) { - p->pop_auth_subtree -= dir->pop_auth_subtree; - if (p->is_subtree_root()) break; - p = p->inode->get_parent_dir(); - } - } - - eval_subtree_root(dir); - } - - show_subtrees(); -} - - -void MDCache::try_subtree_merge(CDir *dir) -{ - dout(7) << "try_subtree_merge " << *dir << dendl; - assert(subtrees.count(dir)); - set oldbounds = subtrees[dir]; - - // try merge at my root - try_subtree_merge_at(dir); - - // try merge at my old bounds - for (set::iterator p = oldbounds.begin(); - p != oldbounds.end(); - ++p) - try_subtree_merge_at(*p); -} - -class C_MDC_SubtreeMergeWB : public Context { - MDCache *mdcache; - CInode *in; - LogSegment *ls; -public: - C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i, LogSegment *s) : mdcache(mdc), in(i), ls(s) {} - void finish(int r) { - mdcache->subtree_merge_writebehind_finish(in, ls); - } -}; - -void MDCache::try_subtree_merge_at(CDir *dir) -{ - dout(10) << "try_subtree_merge_at " << *dir << dendl; - assert(subtrees.count(dir)); - - // merge with parent? - CDir *parent = dir; - if (dir->ino() >= MDS_INO_BASE) - parent = get_subtree_root(dir->get_parent_dir()); - - if (parent != dir && // we have a parent, - parent->dir_auth == dir->dir_auth && // auth matches, - dir->dir_auth.second == CDIR_AUTH_UNKNOWN && // auth is unambiguous, - !dir->state_test(CDir::STATE_EXPORTBOUND)) { // not an exportbound, - // merge with parent. - dout(10) << " subtree merge at " << *dir << dendl; - dir->set_dir_auth(CDIR_AUTH_DEFAULT); - - // move our bounds under the parent - for (set::iterator p = subtrees[dir].begin(); - p != subtrees[dir].end(); - ++p) - subtrees[parent].insert(*p); - - // we are no longer a subtree or bound - dir->put(CDir::PIN_SUBTREE); - subtrees.erase(dir); - subtrees[parent].erase(dir); - - // adjust popularity? 
- if (dir->is_auth()) { - CDir *p = dir->get_parent_dir(); - while (p) { - p->pop_auth_subtree += dir->pop_auth_subtree; - if (p->is_subtree_root()) break; - p = p->inode->get_parent_dir(); - } - } - - eval_subtree_root(dir); - - // journal inode? - // (this is a large hammer to ensure that dirfragtree updates will - // hit the disk before the relevant dirfrags ever close) - if (dir->inode->is_auth() && - dir->inode->can_auth_pin() && - (mds->is_active() || mds->is_stopping())) { - CInode *in = dir->inode; - dout(10) << "try_subtree_merge_at journaling merged bound " << *in << dendl; - - in->auth_pin(); - - // journal write-behind. - inode_t *pi = in->project_inode(); - pi->version = in->pre_dirty(); - - EUpdate *le = new EUpdate(mds->mdlog, "subtree merge writebehind"); - le->metablob.add_dir_context(in->get_parent_dn()->get_dir()); - le->metablob.add_primary_dentry(in->get_parent_dn(), true, 0, pi); - - mds->mdlog->submit_entry(le); - mds->mdlog->wait_for_sync(new C_MDC_SubtreeMergeWB(this, in, - mds->mdlog->get_current_segment())); - } - } - - show_subtrees(15); -} - -void MDCache::subtree_merge_writebehind_finish(CInode *in, LogSegment *ls) -{ - dout(10) << "subtree_merge_writebehind_finish on " << in << dendl; - in->pop_and_dirty_projected_inode(ls); - in->auth_unpin(); -} - -void MDCache::eval_subtree_root(CDir *dir) -{ - // evaluate subtree inode dirlock? 
- // (we should scatter the dirlock on subtree bounds) - if (dir->inode->is_auth() && - dir->inode->dirlock.is_stable()) { - // force the issue a bit - if (!dir->inode->is_frozen()) - mds->locker->scatter_eval(&dir->inode->dirlock); - else - mds->locker->try_scatter_eval(&dir->inode->dirlock); // ** may or may not be auth_pinned ** - } - -} - - -void MDCache::adjust_bounded_subtree_auth(CDir *dir, set& bounds, pair auth) -{ - dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth - << " on " << *dir - << " bounds " << bounds - << dendl; - - show_subtrees(); - - CDir *root; - if (dir->ino() < MDS_INO_BASE) { - root = dir; // bootstrap hack. - if (subtrees.count(root) == 0) { - subtrees[root].clear(); - root->get(CDir::PIN_SUBTREE); - } - } else { - root = get_subtree_root(dir); // subtree root - } - assert(root); - assert(subtrees.count(root)); - dout(7) << " current root is " << *root << dendl; - - pair oldauth = dir->authority(); - - if (root == dir) { - // i am already a subtree. - dir->set_dir_auth(auth); - } else { - // i am a new subtree. - dout(10) << " new subtree at " << *dir << dendl; - assert(subtrees.count(dir) == 0); - subtrees[dir].clear(); // create empty subtree bounds list for me. - dir->get(CDir::PIN_SUBTREE); - - // set dir_auth - dir->set_dir_auth(auth); - - // move items nested beneath me, under me. - set::iterator p = subtrees[root].begin(); - while (p != subtrees[root].end()) { - set::iterator next = p; - next++; - if (get_subtree_root((*p)->get_parent_dir()) == dir) { - // move under me - dout(10) << " claiming child bound " << **p << dendl; - subtrees[dir].insert(*p); - subtrees[root].erase(p); - } - p = next; - } - - // i am a bound of the parent subtree. - subtrees[root].insert(dir); - - // i am now the subtree root. - root = dir; - } - - // verify/adjust bounds. - // - these may be new, or - // - beneath existing ambiguous bounds (which will be collapsed), - // - but NOT beneath unambiguous bounds. 
- for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *bound = *p; - - // new bound? - if (subtrees[dir].count(bound) == 0) { - if (get_subtree_root(bound) == dir) { - dout(10) << " new bound " << *bound << ", adjusting auth back to old " << oldauth << dendl; - adjust_subtree_auth(bound, oldauth); // otherwise, adjust at bound. - } - else { - dout(10) << " want bound " << *bound << dendl; - // make sure it's nested beneath ambiguous subtree(s) - while (1) { - CDir *t = get_subtree_root(bound->get_parent_dir()); - if (t == dir) break; - while (subtrees[dir].count(t) == 0) - t = get_subtree_root(t->get_parent_dir()); - dout(10) << " swallowing intervening subtree at " << *t << dendl; - adjust_subtree_auth(t, auth); - try_subtree_merge_at(t); - } - } - } - else { - dout(10) << " already have bound " << *bound << dendl; - } - } - // merge stray bounds? - set::iterator p = subtrees[dir].begin(); - while (p != subtrees[dir].end()) { - set::iterator n = p; - n++; - if (bounds.count(*p) == 0) { - CDir *stray = *p; - dout(10) << " swallowing extra subtree at " << *stray << dendl; - adjust_subtree_auth(stray, auth); - try_subtree_merge_at(stray); - } - p = n; - } - - // bound should now match. 
- verify_subtree_bounds(dir, bounds); - - show_subtrees(); -} - - -void MDCache::adjust_bounded_subtree_auth(CDir *dir, list& bound_dfs, pair auth) -{ - dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth - << " on " << *dir - << " bound_dfs " << bound_dfs - << dendl; - - // make bounds list - set bounds; - for (list::iterator p = bound_dfs.begin(); - p != bound_dfs.end(); - ++p) { - CDir *bd = get_dirfrag(*p); - if (bd) - bounds.insert(bd); - } - - adjust_bounded_subtree_auth(dir, bounds, auth); -} - -void MDCache::map_dirfrag_set(list& dfs, set& result) -{ - // group by inode - map ino_fragset; - for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) - ino_fragset[p->ino].insert(p->frag); - - // get frags - for (map::iterator p = ino_fragset.begin(); - p != ino_fragset.end(); - ++p) { - CInode *in = get_inode(p->first); - if (!in) continue; - - list fglist; - for (set::iterator q = p->second.begin(); q != p->second.end(); ++q) - in->dirfragtree.get_leaves_under(*q, fglist); - - dout(15) << "map_dirfrag_set " << p->second << " -> " << fglist - << " on " << *in << dendl; - - for (list::iterator q = fglist.begin(); q != fglist.end(); ++q) { - CDir *dir = in->get_dirfrag(*q); - if (dir) result.insert(dir); - } - } -} - - - -CDir *MDCache::get_subtree_root(CDir *dir) -{ - // find the underlying dir that delegates (or is about to delegate) auth - while (true) { - if (dir->is_subtree_root()) - return dir; - dir = dir->get_parent_dir(); - if (!dir) - return 0; // none - } -} - -void MDCache::remove_subtree(CDir *dir) -{ - dout(10) << "remove_subtree " << *dir << dendl; - assert(subtrees.count(dir)); - assert(subtrees[dir].empty()); - subtrees.erase(dir); - dir->put(CDir::PIN_SUBTREE); - if (dir->get_parent_dir()) { - CDir *p = get_subtree_root(dir->get_parent_dir()); - assert(subtrees[p].count(dir)); - subtrees[p].erase(dir); - } -} - -void MDCache::get_subtree_bounds(CDir *dir, set& bounds) -{ - assert(subtrees.count(dir)); - bounds = 
subtrees[dir]; -} - -void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set& bounds) -{ - if (subtrees.count(dir)) { - // just copy them, dir is a subtree. - get_subtree_bounds(dir, bounds); - } else { - // find them - CDir *root = get_subtree_root(dir); - for (set::iterator p = subtrees[root].begin(); - p != subtrees[root].end(); - ++p) { - CDir *t = *p; - while (t != root) { - t = t->get_parent_dir(); - assert(t); - if (t == dir) { - bounds.insert(*p); - continue; - } - } - } - } -} - -void MDCache::verify_subtree_bounds(CDir *dir, const set& bounds) -{ - // for debugging only. - assert(subtrees.count(dir)); - if (bounds != subtrees[dir]) { - dout(0) << "verify_subtree_bounds failed" << dendl; - set b = bounds; - for (set::iterator p = subtrees[dir].begin(); - p != subtrees[dir].end(); - ++p) { - if (bounds.count(*p)) { - b.erase(*p); - continue; - } - dout(0) << " missing bound " << **p << dendl; - } - for (set::iterator p = b.begin(); - p != b.end(); - ++p) - dout(0) << " extra bound " << **p << dendl; - } - assert(bounds == subtrees[dir]); -} - -void MDCache::verify_subtree_bounds(CDir *dir, const list& bounds) -{ - // for debugging only. - assert(subtrees.count(dir)); - - // make sure that any bounds i do have are properly noted as such. 
- int failed = 0; - for (list::const_iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *bd = get_dirfrag(*p); - if (!bd) continue; - if (subtrees[dir].count(bd) == 0) { - dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl; - failed++; - } - } - assert(failed == 0); -} - -void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir) -{ - dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl; - - //show_subtrees(); - - list dfls; - diri->get_dirfrags(dfls); - for (list::iterator p = dfls.begin(); p != dfls.end(); ++p) { - CDir *dir = *p; - - dout(10) << "dirfrag " << *dir << dendl; - CDir *oldparent = get_subtree_root(olddir); - dout(10) << " old parent " << *oldparent << dendl; - CDir *newparent = get_subtree_root(diri->get_parent_dir()); - dout(10) << " new parent " << *newparent << dendl; - - if (oldparent == newparent) { - dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl; - continue; - } - - if (dir->is_subtree_root()) { - // children are fine. change parent. - dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl; - assert(subtrees[oldparent].count(dir)); - subtrees[oldparent].erase(dir); - assert(subtrees.count(newparent)); - subtrees[newparent].insert(dir); - } else { - // mid-subtree. - - // see if any old bounds move to the new parent. - list tomove; - for (set::iterator p = subtrees[oldparent].begin(); - p != subtrees[oldparent].end(); - ++p) { - CDir *bound = *p; - CDir *broot = get_subtree_root(bound->get_parent_dir()); - if (broot != oldparent) { - assert(broot == newparent); - tomove.push_back(bound); - } - } - for (list::iterator p = tomove.begin(); p != tomove.end(); ++p) { - CDir *bound = *p; - dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << dendl; - subtrees[oldparent].erase(bound); - subtrees[newparent].insert(bound); - } - - // did auth change? 
- if (oldparent->authority() != newparent->authority()) - adjust_subtree_auth(dir, oldparent->authority()); // caller is responsible for *diri. - } - } - - for (list::iterator p = dfls.begin(); p != dfls.end(); ++p) { - CDir *dir = *p; - - // un-force dir to subtree root - if (dir->dir_auth == pair(dir->dir_auth.first, dir->dir_auth.first)) { - adjust_subtree_auth(dir, dir->dir_auth.first); - try_subtree_merge_at(dir); - } - } - - show_subtrees(); -} - - -void MDCache::get_fullauth_subtrees(set& s) -{ - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *root = p->first; - if (root->is_full_dir_auth()) - s.insert(root); - } -} -void MDCache::get_auth_subtrees(set& s) -{ - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *root = p->first; - if (root->is_auth()) - s.insert(root); - } -} - - -// count. - -int MDCache::num_subtrees() -{ - return subtrees.size(); -} - -int MDCache::num_subtrees_fullauth() -{ - int n = 0; - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *root = p->first; - if (root->is_full_dir_auth()) - n++; - } - return n; -} - -int MDCache::num_subtrees_fullnonauth() -{ - int n = 0; - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *root = p->first; - if (root->is_full_dir_nonauth()) - n++; - } - return n; -} - - - - - - - -// ==================================================================== -// import map, recovery - - -ESubtreeMap *MDCache::create_subtree_map() -{ - dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, " - << num_subtrees_fullauth() << " fullauth" - << dendl; - - ESubtreeMap *le = new ESubtreeMap(); - - // include all auth subtrees, and their bounds. - // and a spanning tree to tie it to the root. 
- for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *dir = p->first; - if (!dir->is_auth()) continue; - - dout(15) << " subtree " << *dir << dendl; - le->subtrees[dir->dirfrag()].clear(); - le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT); - le->metablob.add_dir(dir, false); - - // bounds - for (set::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - CDir *bound = *q; - dout(15) << " subtree bound " << *bound << dendl; - le->subtrees[dir->dirfrag()].push_back(bound->dirfrag()); - le->metablob.add_dir_context(bound, EMetaBlob::TO_ROOT); - le->metablob.add_dir(bound, false); - } - } - - //le->metablob.print(cout); - return le; -} - - -void MDCache::send_resolve(int who) -{ - if (migrator->is_importing() || - migrator->is_exporting()) - send_resolve_later(who); - else - send_resolve_now(who); -} - -void MDCache::send_resolve_later(int who) -{ - dout(10) << "send_resolve_later to mds" << who << dendl; - wants_resolve.insert(who); -} - -void MDCache::maybe_send_pending_resolves() -{ - if (wants_resolve.empty()) - return; // nothing to send. - - // only if it's appropriate! - if (migrator->is_exporting() || - migrator->is_importing()) { - dout(7) << "maybe_send_pending_resolves waiting, imports/exports still in progress" << dendl; - migrator->show_importing(); - migrator->show_exporting(); - return; // not now - } - - // ok, send them. 
- for (set::iterator p = wants_resolve.begin(); - p != wants_resolve.end(); - p++) - send_resolve_now(*p); - wants_resolve.clear(); -} - - -class C_MDC_SendResolve : public Context { - MDCache *mdc; - int who; -public: - C_MDC_SendResolve(MDCache *c, int w) : mdc(c), who(w) { } - void finish(int r) { - mdc->send_resolve_now(who); - } -}; - -void MDCache::send_resolve_now(int who) -{ - dout(10) << "send_resolve_now to mds" << who << dendl; - MMDSResolve *m = new MMDSResolve; - - show_subtrees(); - - // known - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - p++) { - CDir *dir = p->first; - - // only our subtrees - if (dir->authority().first != mds->get_nodeid()) - continue; - - if (migrator->is_importing(dir->dirfrag())) { - // ambiguous (mid-import) - // NOTE: because we are first authority, import state is at least IMPORT_LOGGINSTART. - assert(migrator->get_import_state(dir->dirfrag()) >= Migrator::IMPORT_LOGGINGSTART); - set bounds; - get_subtree_bounds(dir, bounds); - list dfls; - for (set::iterator p = bounds.begin(); p != bounds.end(); ++p) - dfls.push_back((*p)->dirfrag()); - m->add_ambiguous_import(dir->dirfrag(), dfls); - } else { - // not ambiguous. 
- m->add_subtree(dir->dirfrag()); - - // bounds too - for (set::iterator q = subtrees[dir].begin(); - q != subtrees[dir].end(); - ++q) { - CDir *bound = *q; - m->add_subtree_bound(dir->dirfrag(), bound->dirfrag()); - } - } - } - - // ambiguous - for (map >::iterator p = my_ambiguous_imports.begin(); - p != my_ambiguous_imports.end(); - ++p) - m->add_ambiguous_import(p->first, p->second); - - - // list prepare requests lacking a commit - // [active survivor] - for (hash_map::iterator p = active_requests.begin(); - p != active_requests.end(); - ++p) { - if (p->second->is_slave() && p->second->slave_to_mds == who) { - dout(10) << " including uncommitted " << *p->second << dendl; - m->add_slave_request(p->first); - } - } - // [resolving] - if (uncommitted_slave_updates.count(who)) { - for (map::iterator p = uncommitted_slave_updates[who].begin(); - p != uncommitted_slave_updates[who].end(); - ++p) { - dout(10) << " including uncommitted " << p->first << dendl; - m->add_slave_request(p->first); - } - need_resolve_ack.insert(who); - } - - - // send - mds->send_message_mds(m, who, MDS_PORT_CACHE); -} - - -void MDCache::handle_mds_failure(int who) -{ - dout(7) << "handle_mds_failure mds" << who << dendl; - - // make note of recovery set - mds->mdsmap->get_recovery_mds_set(recovery_set); - recovery_set.erase(mds->get_nodeid()); - dout(1) << "handle_mds_failure mds" << who << " : recovery peers are " << recovery_set << dendl; - - // adjust my recovery lists - wants_resolve.erase(who); // MDS will ask again - got_resolve.erase(who); // i'll get another. - - rejoin_sent.erase(who); // i need to send another - rejoin_ack_gather.erase(who); // i'll need/get another. 
- - dout(10) << " wants_resolve " << wants_resolve << dendl; - dout(10) << " got_resolve " << got_resolve << dendl; - dout(10) << " rejoin_sent " << rejoin_sent << dendl; - dout(10) << " rejoin_gather " << rejoin_gather << dendl; - dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl; - - - // tell the migrator too. - migrator->handle_mds_failure_or_stop(who); - - // kick any discovers that are waiting - kick_discovers(who); - - // clean up any requests slave to/from this node - list finish; - for (hash_map::iterator p = active_requests.begin(); - p != active_requests.end(); - ++p) { - // slave to the failed node? - if (p->second->slave_to_mds == who) { - if (p->second->slave_did_prepare()) { - dout(10) << " slave request " << *p->second << " uncommitted, will resolve shortly" << dendl; - } else { - dout(10) << " slave request " << *p->second << " has no prepare, finishing up" << dendl; - if (p->second->slave_request) - p->second->aborted = true; - else - finish.push_back(p->second); - } - } - - // failed node is slave? - if (!p->second->committing) { - if (p->second->more()->witnessed.count(who)) { - dout(10) << " master request " << *p->second << " no longer witnessed by slave mds" << who - << dendl; - // discard this peer's prepare (if any) - p->second->more()->witnessed.erase(who); - } - - if (p->second->more()->waiting_on_slave.count(who)) { - dout(10) << " master request " << *p->second << " waiting for slave mds" << who - << " to recover" << dendl; - // retry request when peer recovers - p->second->more()->waiting_on_slave.erase(who); - mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, p->second)); - } - } - } - - while (!finish.empty()) { - dout(10) << "cleaning up slave request " << *finish.front() << dendl; - request_finish(finish.front()); - finish.pop_front(); - } - - show_subtrees(); -} - -/* - * handle_mds_recovery - called on another node's transition - * from resolve -> active. 
- */ -void MDCache::handle_mds_recovery(int who) -{ - dout(7) << "handle_mds_recovery mds" << who << dendl; - - list waiters; - - // wake up any waiters in their subtrees - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *dir = p->first; - - if (dir->authority().first != who) continue; - assert(!dir->is_auth()); - - // wake any waiters - list q; - q.push_back(dir); - - while (!q.empty()) { - CDir *d = q.front(); - q.pop_front(); - d->take_waiting(CDir::WAIT_ANY, waiters); - - // inode waiters too - for (CDir::map_t::iterator p = d->items.begin(); - p != d->items.end(); - ++p) { - CDentry *dn = p->second; - if (dn->is_primary()) { - dn->get_inode()->take_waiting(CInode::WAIT_ANY, waiters); - - // recurse? - list ls; - dn->get_inode()->get_dirfrags(ls); - for (list::iterator p = ls.begin(); - p != ls.end(); - ++p) { - CDir *subdir = *p; - if (!subdir->is_subtree_root()) - q.push_back(subdir); - } - } - } - } - } - - // queue them up. - mds->queue_waiters(waiters); -} - -void MDCache::set_recovery_set(set& s) -{ - dout(7) << "set_recovery_set " << s << dendl; - recovery_set = s; -} - - -/* - * during resolve state, we share resolves to determine who - * is authoritative for which trees. we expect to get an resolve - * from _everyone_ in the recovery_set (the mds cluster at the time of - * the first failure). - */ -void MDCache::handle_resolve(MMDSResolve *m) -{ - dout(7) << "handle_resolve from " << m->get_source() << dendl; - int from = m->get_source().num(); - - // ambiguous slave requests? 
- if (!m->slave_requests.empty()) { - MMDSResolveAck *ack = new MMDSResolveAck; - - for (list::iterator p = m->slave_requests.begin(); - p != m->slave_requests.end(); - ++p) { - if (mds->clientmap.have_completed_request(*p)) { - // COMMIT - dout(10) << " ambiguous slave request " << *p << " will COMMIT" << dendl; - ack->add_commit(*p); - } else { - // ABORT - dout(10) << " ambiguous slave request " << *p << " will ABORT" << dendl; - ack->add_abort(*p); - } - } - - mds->send_message_mds(ack, from, MDS_PORT_CACHE); - } - - // am i a surviving ambiguous importer? - if (mds->is_active() || mds->is_stopping()) { - // check for any import success/failure (from this node) - map >::iterator p = my_ambiguous_imports.begin(); - while (p != my_ambiguous_imports.end()) { - map >::iterator next = p; - next++; - CDir *dir = get_dirfrag(p->first); - assert(dir); - dout(10) << "checking ambiguous import " << *dir << dendl; - if (migrator->is_importing(dir->dirfrag()) && - migrator->get_import_peer(dir->dirfrag()) == from) { - assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING); - - // check if sender claims the subtree - bool claimed_by_sender = false; - for (map >::iterator q = m->subtrees.begin(); - q != m->subtrees.end(); - ++q) { - CDir *base = get_dirfrag(q->first); - if (!base || !base->contains(dir)) - continue; // base not dir or an ancestor of dir, clearly doesn't claim dir. - - bool inside = true; - for (list::iterator r = q->second.begin(); - r != q->second.end(); - ++r) { - CDir *bound = get_dirfrag(*r); - if (bound && bound->contains(dir)) { - inside = false; // nope, bound is dir or parent of dir, not inside. 
- break; - } - } - if (inside) - claimed_by_sender = true; - } - - if (claimed_by_sender) { - dout(7) << "ambiguous import failed on " << *dir << dendl; - migrator->import_reverse(dir); - } else { - dout(7) << "ambiguous import succeeded on " << *dir << dendl; - migrator->import_finish(dir); - } - my_ambiguous_imports.erase(p); // no longer ambiguous. - } - p = next; - } - } - - // update my dir_auth values - for (map >::iterator pi = m->subtrees.begin(); - pi != m->subtrees.end(); - ++pi) { - CInode *diri = get_inode(pi->first.ino); - if (!diri) continue; - bool forced = diri->dirfragtree.force_to_leaf(pi->first.frag); - if (forced) { - dout(10) << " forced frag " << pi->first.frag << " to leaf in " - << diri->dirfragtree - << " on " << pi->first << dendl; - } - - CDir *dir = diri->get_dirfrag(pi->first.frag); - if (!dir) continue; - - adjust_bounded_subtree_auth(dir, pi->second, from); - try_subtree_merge(dir); - } - - show_subtrees(); - - - // note ambiguous imports too - for (map >::iterator pi = m->ambiguous_imports.begin(); - pi != m->ambiguous_imports.end(); - ++pi) { - dout(10) << "noting ambiguous import on " << pi->first << " bounds " << pi->second << dendl; - other_ambiguous_imports[from][pi->first].swap( pi->second ); - } - - // did i get them all? - got_resolve.insert(from); - - maybe_resolve_finish(); - - delete m; -} - -void MDCache::maybe_resolve_finish() -{ - if (got_resolve != recovery_set) { - dout(10) << "maybe_resolve_finish still waiting for more resolves, got (" << got_resolve - << "), need (" << recovery_set << ")" << dendl; - } - else if (!need_resolve_ack.empty()) { - dout(10) << "maybe_resolve_finish still waiting for resolve_ack from (" << need_resolve_ack << ")" << dendl; - } - else { - dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." 
<< dendl; - disambiguate_imports(); - if (mds->is_resolve()) { - recalc_auth_bits(); - trim_non_auth(); - mds->resolve_done(); - } - } -} - -void MDCache::handle_resolve_ack(MMDSResolveAck *ack) -{ - dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl; - int from = ack->get_source().num(); - - for (list::iterator p = ack->commit.begin(); - p != ack->commit.end(); - ++p) { - dout(10) << " commit on slave " << *p << dendl; - - if (mds->is_resolve()) { - // replay - assert(uncommitted_slave_updates[from].count(*p)); - uncommitted_slave_updates[from][*p].commit.replay(mds); - uncommitted_slave_updates[from].erase(*p); - // log commit - mds->mdlog->submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", *p, from, ESlaveUpdate::OP_COMMIT)); - } else { - MDRequest *mdr = request_get(*p); - assert(mdr->slave_request == 0); // shouldn't be doing anything! - request_finish(mdr); - } - } - - for (list::iterator p = ack->abort.begin(); - p != ack->abort.end(); - ++p) { - dout(10) << " abort on slave " << *p << dendl; - - if (mds->is_resolve()) { - assert(uncommitted_slave_updates[from].count(*p)); - uncommitted_slave_updates[from][*p].rollback.replay(mds); - uncommitted_slave_updates[from].erase(*p); - mds->mdlog->submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", *p, from, ESlaveUpdate::OP_ROLLBACK)); - } else { - MDRequest *mdr = request_get(*p); - if (mdr->more()->slave_commit) { - mdr->more()->slave_commit->finish(-1); - delete mdr->more()->slave_commit; - mdr->more()->slave_commit = 0; - } - if (mdr->slave_request) - mdr->aborted = true; - else - request_finish(mdr); - } - } - - need_resolve_ack.erase(from); - - if (mds->is_resolve()) - maybe_resolve_finish(); - - delete ack; -} - - - -void MDCache::disambiguate_imports() -{ - dout(10) << "disambiguate_imports" << dendl; - - // other nodes' ambiguous imports - for (map > >::iterator p = other_ambiguous_imports.begin(); - p != other_ambiguous_imports.end(); - ++p) { - int who = p->first; - 
dout(10) << "ambiguous imports for mds" << who << dendl; - - for (map >::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - dout(10) << " ambiguous import " << q->first << " bounds " << q->second << dendl; - CDir *dir = get_dirfrag(q->first); - if (!dir) continue; - - if (dir->authority().first == CDIR_AUTH_UNKNOWN || // if i am resolving - dir->is_ambiguous_auth()) { // if i am a surviving bystander - dout(10) << " mds" << who << " did import " << *dir << dendl; - adjust_bounded_subtree_auth(dir, q->second, who); - try_subtree_merge(dir); - } else { - dout(10) << " mds" << who << " did not import " << *dir << dendl; - } - } - } - other_ambiguous_imports.clear(); - - // my ambiguous imports - while (!my_ambiguous_imports.empty()) { - map >::iterator q = my_ambiguous_imports.begin(); - - CDir *dir = get_dirfrag(q->first); - if (!dir) continue; - - if (dir->authority().first != CDIR_AUTH_UNKNOWN) { - dout(10) << "ambiguous import auth known, must not be me " << *dir << dendl; - cancel_ambiguous_import(q->first); - mds->mdlog->submit_entry(new EImportFinish(dir, false)); - } else { - dout(10) << "ambiguous import auth unknown, must be me " << *dir << dendl; - finish_ambiguous_import(q->first); - mds->mdlog->submit_entry(new EImportFinish(dir, true)); - } - } - assert(my_ambiguous_imports.empty()); - - if (mds->is_resolve()) { - // verify all my subtrees are unambiguous! 
- for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *dir = p->first; - if (dir->is_ambiguous_dir_auth()) { - dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl; - show_subtrees(); - } - assert(!dir->is_ambiguous_dir_auth()); - } - } - - show_subtrees(); -} - - -void MDCache::add_ambiguous_import(dirfrag_t base, list& bounds) -{ - assert(my_ambiguous_imports.count(base) == 0); - my_ambiguous_imports[base].swap( bounds ); -} - - -void MDCache::add_ambiguous_import(CDir *base, const set& bounds) -{ - // make a list - list binos; - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) - binos.push_back((*p)->dirfrag()); - - // note: this can get called twice if the exporter fails during recovery - if (my_ambiguous_imports.count(base->dirfrag())) - my_ambiguous_imports.erase(base->dirfrag()); - - add_ambiguous_import(base->dirfrag(), binos); -} - -void MDCache::cancel_ambiguous_import(dirfrag_t df) -{ - assert(my_ambiguous_imports.count(df)); - dout(10) << "cancel_ambiguous_import " << df - << " bounds " << my_ambiguous_imports[df] - << dendl; - my_ambiguous_imports.erase(df); -} - -void MDCache::finish_ambiguous_import(dirfrag_t df) -{ - assert(my_ambiguous_imports.count(df)); - list bounds; - bounds.swap(my_ambiguous_imports[df]); - my_ambiguous_imports.erase(df); - - dout(10) << "finish_ambiguous_import " << df - << " bounds " << bounds - << dendl; - CDir *dir = get_dirfrag(df); - assert(dir); - - // adjust dir_auth, import maps - adjust_bounded_subtree_auth(dir, bounds, mds->get_nodeid()); - try_subtree_merge(dir); -} - - -/** recalc_auth_bits() - * once subtree auth is disambiguated, we need to adjust all the - * auth and dirty bits in our cache before moving on. 
- */ -void MDCache::recalc_auth_bits() -{ - dout(7) << "recalc_auth_bits" << dendl; - - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - list dfq; // dirfrag queue - dfq.push_back(p->first); - - bool auth = p->first->authority().first == mds->get_nodeid(); - dout(10) << " subtree auth=" << auth << " for " << *p->first << dendl; - - while (!dfq.empty()) { - CDir *dir = dfq.front(); - dfq.pop_front(); - - // dir - if (auth) - dir->state_set(CDir::STATE_AUTH); - else { - dir->state_set(CDir::STATE_REJOINING); - dir->state_clear(CDir::STATE_AUTH); - if (dir->is_dirty()) - dir->mark_clean(); - } - - // dentries in this dir - for (CDir::map_t::iterator q = dir->items.begin(); - q != dir->items.end(); - ++q) { - // dn - CDentry *dn = q->second; - if (auth) - dn->state_set(CDentry::STATE_AUTH); - else { - dn->state_set(CDentry::STATE_REJOINING); - dn->state_clear(CDentry::STATE_AUTH); - if (dn->is_dirty()) - dn->mark_clean(); - } - - if (dn->is_primary()) { - // inode - if (auth) - dn->inode->state_set(CInode::STATE_AUTH); - else { - dn->inode->state_set(CInode::STATE_REJOINING); - dn->inode->state_clear(CInode::STATE_AUTH); - if (dn->inode->is_dirty()) - dn->inode->mark_clean(); - } - - // recurse? - if (dn->inode->is_dir()) - dn->inode->get_nested_dirfrags(dfq); - } - } - } - } - - show_subtrees(); - show_cache(); -} - - - -// =========================================================================== -// REJOIN - - -/* - * rejoin phase! - * - * this initiates rejoin. it shoudl be called before we get any - * rejoin or rejoin_ack messages (or else mdsmap distribution is broken). - * - * we start out by sending rejoins to everyone in the recovery set. - * - * if we are rejoin, send for all regions in our cache. - * if we are active|stopping, send only to nodes that are are rejoining. 
- */ -void MDCache::rejoin_send_rejoins() -{ - dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl; - - map rejoins; - - // encode cap list once. - bufferlist cap_export_bl; - if (mds->is_rejoin()) { - ::_encode(cap_exports, cap_export_bl); - ::_encode(cap_export_paths, cap_export_bl); - } - - // if i am rejoining, send a rejoin to everyone. - // otherwise, just send to others who are rejoining. - for (set::iterator p = recovery_set.begin(); - p != recovery_set.end(); - ++p) { - if (*p == mds->get_nodeid()) continue; // nothing to myself! - if (rejoin_sent.count(*p)) continue; // already sent a rejoin to this node! - if (mds->is_rejoin()) { - rejoin_gather.insert(*p); - rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_WEAK); - rejoins[*p]->copy_cap_exports(cap_export_bl); - } else if (mds->mdsmap->is_rejoin(*p)) - rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_STRONG); - } - - assert(!migrator->is_importing()); - assert(!migrator->is_exporting()); - - // check all subtrees - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *dir = p->first; - assert(dir->is_subtree_root()); - assert(!dir->is_ambiguous_dir_auth()); - - int auth = dir->get_dir_auth().first; - assert(auth >= 0); - - if (auth == mds->get_nodeid()) continue; // skip my own regions! 
- if (rejoins.count(auth) == 0) continue; // don't care about this node's regions - - rejoin_walk(dir, rejoins[auth]); - } - - // rejoin root inodes, too - for (map::iterator p = rejoins.begin(); - p != rejoins.end(); - ++p) { - if (mds->is_rejoin()) { - // weak - if (p->first == 0 && root) - p->second->add_weak_inode(root->ino()); - if (get_inode(MDS_INO_STRAY(p->first))) - p->second->add_weak_inode(MDS_INO_STRAY(p->first)); - } else { - // strong - if (p->first == 0 && root) { - p->second->add_weak_inode(root->ino()); - p->second->add_strong_inode(root->ino(), root->get_replica_nonce(), - root->get_caps_wanted(), - root->authlock.get_state(), - root->linklock.get_state(), - root->dirfragtreelock.get_state(), - root->filelock.get_state(), - root->dirlock.get_state()); - } - if (CInode *in = get_inode(MDS_INO_STRAY(p->first))) { - p->second->add_weak_inode(in->ino()); - p->second->add_strong_inode(in->ino(), in->get_replica_nonce(), - in->get_caps_wanted(), - in->authlock.get_state(), - in->linklock.get_state(), - in->dirfragtreelock.get_state(), - in->filelock.get_state(), - in->dirlock.get_state()); - } - } - } - - if (!mds->is_rejoin()) { - // i am survivor. send strong rejoin. 
- // note request authpins, xlocks - for (hash_map::iterator p = active_requests.begin(); - p != active_requests.end(); - ++p) { - // auth pins - for (set::iterator q = p->second->auth_pins.begin(); - q != p->second->auth_pins.end(); - ++q) { - if (!(*q)->is_auth()) { - int who = (*q)->authority().first; - if (rejoins.count(who) == 0) continue; - MMDSCacheRejoin *rejoin = rejoins[who]; - - dout(15) << " " << *p->second << " authpin on " << **q << dendl; - MDSCacheObjectInfo i; - (*q)->set_object_info(i); - if (i.ino) - rejoin->add_inode_authpin(i.ino, p->second->reqid); - else - rejoin->add_dentry_authpin(i.dirfrag, i.dname, p->second->reqid); - } - } - // xlocks - for (set::iterator q = p->second->xlocks.begin(); - q != p->second->xlocks.end(); - ++q) { - if (!(*q)->get_parent()->is_auth()) { - int who = (*q)->get_parent()->authority().first; - if (rejoins.count(who) == 0) continue; - MMDSCacheRejoin *rejoin = rejoins[who]; - - dout(15) << " " << *p->second << " xlock on " << **q << " " << *(*q)->get_parent() << dendl; - MDSCacheObjectInfo i; - (*q)->get_parent()->set_object_info(i); - if (i.ino) - rejoin->add_inode_xlock(i.ino, (*q)->get_type(), p->second->reqid); - else - rejoin->add_dentry_xlock(i.dirfrag, i.dname, p->second->reqid); - } - } - } - } - - // send the messages - for (map::iterator p = rejoins.begin(); - p != rejoins.end(); - ++p) { - assert(rejoin_sent.count(p->first) == 0); - assert(rejoin_ack_gather.count(p->first) == 0); - rejoin_sent.insert(p->first); - rejoin_ack_gather.insert(p->first); - mds->send_message_mds(p->second, p->first, MDS_PORT_CACHE); - } - - // nothing? 
- if (mds->is_rejoin() && rejoins.empty()) { - dout(10) << "nothing to rejoin" << dendl; - mds->rejoin_done(); - } -} - - -/** - * rejoin_walk - build rejoin declarations for a subtree - * - * @dir subtree root - * @rejoin rejoin message - * - * from a rejoining node: - * weak dirfrag - * weak dentries (w/ connectivity) - * - * from a surviving node: - * strong dirfrag - * strong dentries (no connectivity!) - * strong inodes - */ -void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin) -{ - dout(10) << "rejoin_walk " << *dir << dendl; - - list nested; // finish this dir, then do nested items - - if (mds->is_rejoin()) { - // WEAK - dout(15) << " add_weak_dirfrag " << *dir << dendl; - rejoin->add_weak_dirfrag(dir->dirfrag()); - - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - dout(15) << " add_weak_primary_dentry " << *dn << dendl; - assert(dn->is_primary()); - assert(dn->inode->is_dir()); - rejoin->add_weak_primary_dentry(dir->dirfrag(), p->first, dn->get_inode()->ino()); - dn->get_inode()->get_nested_dirfrags(nested); - - if (dn->get_inode()->dirlock.is_updated()) { - // include full inode to shed any dirtyscattered state - rejoin->add_full_inode(dn->get_inode()->inode, - dn->get_inode()->symlink, - dn->get_inode()->dirfragtree); - dn->get_inode()->dirlock.clear_updated(); - } - } - } else { - // STRONG - dout(15) << " add_strong_dirfrag " << *dir << dendl; - rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep()); - - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - dout(15) << " add_strong_dentry " << *dn << dendl; - rejoin->add_strong_dentry(dir->dirfrag(), p->first, - dn->is_primary() ? dn->get_inode()->ino():inodeno_t(0), - dn->is_remote() ? dn->get_remote_ino():inodeno_t(0), - dn->is_remote() ? 
dn->get_remote_d_type():0, - dn->get_replica_nonce(), - dn->lock.get_state()); - if (dn->is_primary()) { - CInode *in = dn->get_inode(); - dout(15) << " add_strong_inode " << *in << dendl; - rejoin->add_strong_inode(in->ino(), in->get_replica_nonce(), - in->get_caps_wanted(), - in->authlock.get_state(), - in->linklock.get_state(), - in->dirfragtreelock.get_state(), - in->filelock.get_state(), - in->dirlock.get_state()); - in->get_nested_dirfrags(nested); - } - } - } - - // recurse into nested dirs - for (list::iterator p = nested.begin(); - p != nested.end(); - ++p) - rejoin_walk(*p, rejoin); -} - - -/* - * i got a rejoin. - * - reply with the lockstate - * - * if i am active|stopping, - * - remove source from replica list for everything not referenced here. - */ -void MDCache::handle_cache_rejoin(MMDSCacheRejoin *m) -{ - dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source() - << " (" << m->get_payload().length() << " bytes)" - << dendl; - - switch (m->op) { - case MMDSCacheRejoin::OP_WEAK: - handle_cache_rejoin_weak(m); - break; - case MMDSCacheRejoin::OP_STRONG: - handle_cache_rejoin_strong(m); - break; - - case MMDSCacheRejoin::OP_ACK: - handle_cache_rejoin_ack(m); - break; - case MMDSCacheRejoin::OP_MISSING: - handle_cache_rejoin_missing(m); - break; - - case MMDSCacheRejoin::OP_FULL: - handle_cache_rejoin_full(m); - break; - - default: - assert(0); - } - delete m; -} - - -/* - * handle_cache_rejoin_weak - * - * the sender - * - is recovering from their journal. - * - may have incorrect (out of date) inode contents - * - will include full inodes IFF they contain dirty scatterlock content - * - * if the sender didn't trim_non_auth(), they - * - may have incorrect (out of date) dentry/inode linkage - * - may have deleted/purged inodes - * and i may have to go to disk to get accurate inode contents. yuck. 
- */ -void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) -{ - int from = weak->get_source().num(); - - // possible response(s) - MMDSCacheRejoin *ack = 0; // if survivor - bool survivor = false; // am i a survivor? - - if (mds->is_active() || mds->is_stopping()) { - survivor = true; - dout(10) << "i am a surivivor, and will ack immediately" << dendl; - ack = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK); - - // check cap exports - for (map >::iterator p = weak->cap_exports.begin(); - p != weak->cap_exports.end(); - ++p) { - CInode *in = get_inode(p->first); - if (!in || !in->is_auth()) continue; - for (map::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - dout(10) << " claiming cap import " << p->first << " client" << q->first << " on " << *in << dendl; - rejoin_import_cap(in, q->first, q->second, from); - } - } - } else { - assert(mds->is_rejoin()); - - // check cap exports. - for (map >::iterator p = weak->cap_exports.begin(); - p != weak->cap_exports.end(); - ++p) { - CInode *in = get_inode(p->first); - if (in && !in->is_auth()) continue; - if (!in) { - if (!path_is_mine(weak->cap_export_paths[p->first])) - continue; - cap_import_paths[p->first] = weak->cap_export_paths[p->first]; - dout(10) << " noting cap import " << p->first << " path " << weak->cap_export_paths[p->first] << dendl; - } - - // note - for (map::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - dout(10) << " claiming cap import " << p->first << " client" << q->first << dendl; - cap_imports[p->first][q->first][from] = q->second; - } - } - } - - // full inodes? - // dirty scatterlock content! 
- for (list::iterator p = weak->full_inodes.begin(); - p != weak->full_inodes.end(); - ++p) { - CInode *in = get_inode(p->inode.ino); - if (!in) continue; - if (p->inode.mtime > in->inode.mtime) in->inode.mtime = p->inode.mtime; - dout(10) << " got dirty inode scatterlock content " << *in << dendl; - in->dirlock.set_updated(); - } - - // walk weak map - for (map >::iterator p = weak->weak.begin(); - p != weak->weak.end(); - ++p) { - CDir *dir = get_dirfrag(p->first); - if (!dir) dout(0) << " missing dirfrag " << p->first << dendl; - assert(dir); - - int nonce = dir->add_replica(from); - dout(10) << " have " << *dir << dendl; - if (ack) - ack->add_strong_dirfrag(p->first, nonce, dir->dir_rep); - - // weak dentries - for (map::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - CDentry *dn = dir->lookup(q->first); - assert(dn); - assert(dn->is_primary()); - - if (survivor && dn->is_replica(from)) - dentry_remove_replica(dn, from); // this induces a lock gather completion - int dnonce = dn->add_replica(from); - dout(10) << " have " << *dn << dendl; - if (ack) - ack->add_strong_dentry(p->first, q->first, - dn->get_inode()->ino(), inodeno_t(0), 0, - dnonce, dn->lock.get_replica_state()); - - // inode - CInode *in = dn->get_inode(); - assert(in); - - if (survivor && in->is_replica(from)) - inode_remove_replica(in, from); // this induces a lock gather completion - int inonce = in->add_replica(from); - dout(10) << " have " << *in << dendl; - - // scatter the dirlock, just in case? - if (!survivor && in->is_dir()) - in->dirlock.set_state(LOCK_SCATTER); - - if (ack) { - ack->add_full_inode(in->inode, in->symlink, in->dirfragtree); - ack->add_strong_inode(in->ino(), - inonce, - 0, - in->authlock.get_replica_state(), - in->linklock.get_replica_state(), - in->dirfragtreelock.get_replica_state(), - in->filelock.get_replica_state(), - in->dirlock.get_replica_state()); - } - } - } - - // weak base inodes? (root, stray, etc.) 
- for (set::iterator p = weak->weak_inodes.begin(); - p != weak->weak_inodes.end(); - ++p) { - CInode *in = get_inode(*p); - assert(in); // hmm fixme wrt stray? - if (survivor && in->is_replica(from)) - inode_remove_replica(in, from); // this induces a lock gather completion - int inonce = in->add_replica(from); - dout(10) << " have base " << *in << dendl; - - if (ack) - ack->add_strong_inode(in->ino(), - inonce, - 0, - in->authlock.get_replica_state(), - in->linklock.get_replica_state(), - in->dirfragtreelock.get_replica_state(), - in->filelock.get_replica_state(), - in->dirlock.get_replica_state()); - } - - if (survivor) { - // survivor. do everything now. - rejoin_scour_survivor_replicas(from, ack); - mds->send_message_mds(ack, from, MDS_PORT_CACHE); - } else { - // done? - assert(rejoin_gather.count(from)); - rejoin_gather.erase(from); - if (rejoin_gather.empty()) { - rejoin_gather_finish(); - } else { - dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl; - } - } -} - - -/** - * parallel_fetch -- make a pass at fetching a bunch of paths in parallel - * - * @pathmap - map of inodeno to full pathnames. we remove items from this map - * as we discover we have them. - * - * returns a C_Gather* is there is work to do. caller is responsible for setting - * the C_Gather completer. 
- */ -C_Gather *MDCache::parallel_fetch(map& pathmap) -{ - dout(10) << "parallel_fetch on " << pathmap.size() << " paths" << dendl; - - // scan list - set fetch_queue; - map::iterator p = pathmap.begin(); - while (p != pathmap.end()) { - CInode *in = get_inode(p->first); - if (in) { - dout(15) << " have " << *in << dendl; - pathmap.erase(p++); - continue; - } - - // traverse - dout(17) << " missing " << p->first << " at " << p->second << dendl; - filepath path(p->second); - CDir *dir = path_traverse_to_dir(path); - assert(dir); - fetch_queue.insert(dir); - p++; - } - - if (pathmap.empty()) { - dout(10) << "parallel_fetch done" << dendl; - assert(fetch_queue.empty()); - return false; - } - - // do a parallel fetch - C_Gather *gather = new C_Gather; - for (set::iterator p = fetch_queue.begin(); - p != fetch_queue.end(); - ++p) { - dout(10) << "parallel_fetch fetching " << **p << dendl; - (*p)->fetch(gather->new_sub()); - } - - return gather; -} - - - -/* - * rejoin_scour_survivor_replica - remove source from replica list on unmentioned objects - * - * all validated replicas are acked with a strong nonce, etc. if that isn't in the - * ack, the replica dne, and we can remove it from our replica maps. - */ -void MDCache::rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack) -{ - dout(10) << "rejoin_scour_survivor_replicas from mds" << from << dendl; - - // FIXME: what about root and stray inodes. - - for (hash_map::iterator p = inode_map.begin(); - p != inode_map.end(); - ++p) { - CInode *in = p->second; - - // inode? 
- if (in->is_auth() && - in->is_replica(from) && - ack->strong_inodes.count(p->second->ino()) == 0) { - inode_remove_replica(in, from); - dout(10) << " rem " << *in << dendl; - } - - if (!in->is_dir()) continue; - - list dfs; - in->get_dirfrags(dfs); - for (list::iterator p = dfs.begin(); - p != dfs.end(); - ++p) { - CDir *dir = *p; - - if (dir->is_auth() && - dir->is_replica(from) && - ack->strong_dirfrags.count(dir->dirfrag()) == 0) { - dir->remove_replica(from); - dout(10) << " rem " << *dir << dendl; - } - - // dentries - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - - if (dn->is_replica(from) && - (ack->strong_dentries.count(dir->dirfrag()) == 0 || - ack->strong_dentries[dir->dirfrag()].count(dn->get_name()) == 0)) { - dentry_remove_replica(dn, from); - dout(10) << " rem " << *dn << dendl; - } - } - } - } -} - - -CInode *MDCache::rejoin_invent_inode(inodeno_t ino) -{ - CInode *in = new CInode(this); - memset(&in->inode, 0, sizeof(inode_t)); - in->inode.ino = ino; - in->state_set(CInode::STATE_REJOINUNDEF); - add_inode(in); - rejoin_undef_inodes.insert(in); - dout(10) << " invented " << *in << dendl; - return in; -} - - -void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong) -{ - int from = strong->get_source().num(); - - // only a recovering node will get a strong rejoin. - assert(mds->is_rejoin()); - - MMDSCacheRejoin *missing = 0; // if i'm missing something.. - - // strong dirfrags/dentries. - // also process auth_pins, xlocks. 
- for (map::iterator p = strong->strong_dirfrags.begin(); - p != strong->strong_dirfrags.end(); - ++p) { - CDir *dir = get_dirfrag(p->first); - if (!dir) { - CInode *in = get_inode(p->first.ino); - if (!in) in = rejoin_invent_inode(p->first.ino); - if (!in->is_dir()) { - assert(in->state_test(CInode::STATE_REJOINUNDEF)); - in->inode.mode = S_IFDIR; - } - dir = in->get_or_open_dirfrag(this, p->first.frag); - } else { - dout(10) << " have " << *dir << dendl; - } - dir->add_replica(from); - dir->dir_rep = p->second.dir_rep; - - for (map::iterator q = strong->strong_dentries[p->first].begin(); - q != strong->strong_dentries[p->first].end(); - ++q) { - CDentry *dn = dir->lookup(q->first); - if (!dn) { - if (q->second.is_remote()) { - dn = dir->add_remote_dentry(q->first, q->second.remote_ino, q->second.remote_d_type); - } else if (q->second.is_null()) { - dn = dir->add_null_dentry(q->first); - } else { - CInode *in = get_inode(q->second.ino); - if (!in) in = rejoin_invent_inode(q->second.ino); - dn = dir->add_primary_dentry(q->first, in); - - dout(10) << " missing " << q->second.ino << dendl; - if (!missing) missing = new MMDSCacheRejoin(MMDSCacheRejoin::OP_MISSING); - missing->add_weak_inode(q->second.ino); // we want it back! - } - dout(10) << " invented " << *dn << dendl; - } - - // dn auth_pin? - if (strong->authpinned_dentries.count(p->first) && - strong->authpinned_dentries[p->first].count(q->first)) { - metareqid_t ri = strong->authpinned_dentries[p->first][q->first]; - dout(10) << " dn authpin by " << ri << " on " << *dn << dendl; - - // get/create slave mdrequest - MDRequest *mdr; - if (have_request(ri)) - mdr = request_get(ri); - else - mdr = request_start_slave(ri, from); - mdr->auth_pin(dn); - } - - // dn xlock? 
- if (strong->xlocked_dentries.count(p->first) && - strong->xlocked_dentries[p->first].count(q->first)) { - metareqid_t ri = strong->xlocked_dentries[p->first][q->first]; - dout(10) << " dn xlock by " << ri << " on " << *dn << dendl; - MDRequest *mdr = request_get(ri); // should have this from auth_pin above. - assert(mdr->is_auth_pinned(dn)); - dn->lock.set_state(LOCK_LOCK); - dn->lock.get_xlock(mdr); - mdr->xlocks.insert(&dn->lock); - mdr->locks.insert(&dn->lock); - } - - dn->add_replica(from); - dout(10) << " have " << *dn << dendl; - - // inode? - if (dn->is_primary()) { - CInode *in = dn->get_inode(); - assert(in); - - if (strong->strong_inodes.count(in->ino())) { - MMDSCacheRejoin::inode_strong &is = strong->strong_inodes[in->ino()]; - - // caps_wanted - if (is.caps_wanted) { - in->mds_caps_wanted[from] = is.caps_wanted; - dout(15) << " inode caps_wanted " << cap_string(is.caps_wanted) - << " on " << *in << dendl; - } - - // scatterlock? - if (is.dirlock == LOCK_SCATTER || - is.dirlock == LOCK_GLOCKC) // replica still has wrlocks - in->dirlock.set_state(LOCK_SCATTER); - - // auth pin? - if (strong->authpinned_inodes.count(in->ino())) { - metareqid_t ri = strong->authpinned_inodes[in->ino()]; - dout(10) << " inode authpin by " << ri << " on " << *in << dendl; - - // get/create slave mdrequest - MDRequest *mdr; - if (have_request(ri)) - mdr = request_get(ri); - else - mdr = request_start_slave(ri, from); - mdr->auth_pin(in); - } - - // xlock(s)? - if (strong->xlocked_inodes.count(in->ino())) { - for (map::iterator r = strong->xlocked_inodes[in->ino()].begin(); - r != strong->xlocked_inodes[in->ino()].end(); - ++r) { - SimpleLock *lock = in->get_lock(r->first); - dout(10) << " inode xlock by " << r->second << " on " << *lock << " on " << *in << dendl; - MDRequest *mdr = request_get(r->second); // should have this from auth_pin above. 
- assert(mdr->is_auth_pinned(in)); - lock->set_state(LOCK_LOCK); - lock->get_xlock(mdr); - mdr->xlocks.insert(lock); - mdr->locks.insert(lock); - } - } - } else { - dout(10) << " sender has dentry but not inode, adding them as a replica" << dendl; - } - - in->add_replica(from); - dout(10) << " have " << *in << dendl; - } - } - } - - // base inodes? (root, stray, etc.) - for (set::iterator p = strong->weak_inodes.begin(); - p != strong->weak_inodes.end(); - ++p) { - CInode *in = get_inode(*p); - dout(10) << " have base " << *in << dendl; - in->add_replica(from); - } - - // send missing? - if (missing) { - // we expect a FULL soon. - mds->send_message_mds(missing, from, MDS_PORT_CACHE); - } else { - // done? - assert(rejoin_gather.count(from)); - rejoin_gather.erase(from); - if (rejoin_gather.empty()) { - rejoin_gather_finish(); - } else { - dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl; - } - } -} - - -void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) -{ - dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl; - int from = ack->get_source().num(); - - list waiters; - - // dirs - for (map::iterator p = ack->strong_dirfrags.begin(); - p != ack->strong_dirfrags.end(); - ++p) { - CDir *dir = get_dirfrag(p->first); - if (!dir) continue; // must have trimmed? - - dir->set_replica_nonce(p->second.nonce); - dir->state_clear(CDir::STATE_REJOINING); - dout(10) << " got " << *dir << dendl; - - // dentries - for (map::iterator q = ack->strong_dentries[p->first].begin(); - q != ack->strong_dentries[p->first].end(); - ++q) { - CDentry *dn = dir->lookup(q->first); - if (!dn) continue; // must have trimmed? - - // hmm, did we have the proper linkage here? 
- if (dn->is_null() && - !q->second.is_null()) { - dout(10) << " had bad (missing) linkage for " << *dn << dendl; - if (q->second.is_remote()) { - dn->dir->link_remote_inode(dn, q->second.remote_ino, q->second.remote_d_type); - } else { - CInode *in = get_inode(q->second.ino); - assert(in == 0); // a rename would have been caught be the resolve stage. - // barebones inode; the full inode loop below will clean up. - in = new CInode(this, false); - in->inode.ino = q->second.ino; - add_inode(in); - dn->dir->link_primary_inode(dn, in); - } - } - else if (!dn->is_null() && - q->second.is_null()) { - dout(-10) << " had bad linkage for " << *dn << dendl; - /* - * this should happen: - * if we're a survivor, any unlink should commit or rollback during - * the resolve stage. - * if we failed, we shouldn't have non-auth leaf dentries at all - */ - assert(0); // uh oh. - } - dn->set_replica_nonce(q->second.nonce); - mds->locker->rejoin_set_state(&dn->lock, q->second.lock, waiters); - dn->state_clear(CDentry::STATE_REJOINING); - dout(10) << " got " << *dn << dendl; - } - } - - // full inodes - for (list::iterator p = ack->full_inodes.begin(); - p != ack->full_inodes.end(); - ++p) { - CInode *in = get_inode(p->inode.ino); - if (!in) continue; - in->inode = p->inode; - in->symlink = p->symlink; - in->dirfragtree = p->dirfragtree; - dout(10) << " got inode content " << *in << dendl; - } - - // inodes - for (map::iterator p = ack->strong_inodes.begin(); - p != ack->strong_inodes.end(); - ++p) { - CInode *in = get_inode(p->first); - if (!in) continue; - in->set_replica_nonce(p->second.nonce); - mds->locker->rejoin_set_state(&in->authlock, p->second.authlock, waiters); - mds->locker->rejoin_set_state(&in->linklock, p->second.linklock, waiters); - mds->locker->rejoin_set_state(&in->dirfragtreelock, p->second.dirfragtreelock, waiters); - mds->locker->rejoin_set_state(&in->filelock, p->second.filelock, waiters); - mds->locker->rejoin_set_state(&in->dirlock, p->second.dirlock, waiters); 
- in->state_clear(CInode::STATE_REJOINING); - dout(10) << " got " << *in << dendl; - } - - // done? - assert(rejoin_ack_gather.count(from)); - rejoin_ack_gather.erase(from); - if (mds->is_rejoin() && - rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too. - rejoin_ack_gather.empty()) { - mds->rejoin_done(); - } else { - dout(7) << "still need rejoin from (" << rejoin_gather << ")" - << ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl; - } -} - - - - -void MDCache::handle_cache_rejoin_missing(MMDSCacheRejoin *missing) -{ - dout(7) << "handle_cache_rejoin_missing from " << missing->get_source() << dendl; - - MMDSCacheRejoin *full = new MMDSCacheRejoin(MMDSCacheRejoin::OP_FULL); - - // inodes - for (set::iterator p = missing->weak_inodes.begin(); - p != missing->weak_inodes.end(); - ++p) { - CInode *in = get_inode(*p); - if (!in) { - dout(10) << " don't have inode " << *p << dendl; - continue; // we must have trimmed it after the originalo rejoin - } - - dout(10) << " sending " << *in << dendl; - full->add_full_inode(in->inode, in->symlink, in->dirfragtree); - } - - mds->send_message_mds(full, missing->get_source().num(), MDS_PORT_CACHE); -} - -void MDCache::handle_cache_rejoin_full(MMDSCacheRejoin *full) -{ - dout(7) << "handle_cache_rejoin_full from " << full->get_source() << dendl; - int from = full->get_source().num(); - - // integrate full inodes - for (list::iterator p = full->full_inodes.begin(); - p != full->full_inodes.end(); - ++p) { - CInode *in = get_inode(p->inode.ino); - assert(in); - - set::iterator q = rejoin_undef_inodes.find(in); - if (q != rejoin_undef_inodes.end()) { - CInode *in = *q; - in->inode = p->inode; - in->symlink = p->symlink; - in->dirfragtree = p->dirfragtree; - in->state_clear(CInode::STATE_REJOINUNDEF); - dout(10) << " got full " << *in << dendl; - rejoin_undef_inodes.erase(q); - } else { - dout(10) << " had full " << *in << dendl; - } - } - - // done? 
- assert(rejoin_gather.count(from)); - rejoin_gather.erase(from); - if (rejoin_gather.empty()) { - rejoin_gather_finish(); - } else { - dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl; - } -} - - - -/** - * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes - * - * FIXME: wait, can this actually happen? a survivor should generate cache trim - * messages that clean these guys up... - */ -void MDCache::rejoin_trim_undef_inodes() -{ - dout(10) << "rejoin_trim_undef_inodes" << dendl; - - while (!rejoin_undef_inodes.empty()) { - set::iterator p = rejoin_undef_inodes.begin(); - CInode *in = *p; - rejoin_undef_inodes.erase(p); - - in->clear_replica_map(); - - // close out dirfrags - if (in->is_dir()) { - list dfls; - in->get_dirfrags(dfls); - for (list::iterator p = dfls.begin(); - p != dfls.end(); - ++p) { - CDir *dir = *p; - dir->clear_replica_map(); - - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - dn->clear_replica_map(); - - dout(10) << " trimming " << *dn << dendl; - dir->remove_dentry(dn); - } - - dout(10) << " trimming " << *dir << dendl; - in->close_dirfrag(dir->dirfrag().frag); - } - } - - CDentry *dn = in->get_parent_dn(); - if (dn) { - dn->clear_replica_map(); - dout(10) << " trimming " << *dn << dendl; - dn->dir->remove_dentry(dn); - } else { - dout(10) << " trimming " << *in << dendl; - remove_inode(in); - } - } - - assert(rejoin_undef_inodes.empty()); -} - -class C_MDC_RejoinGatherFinish : public Context { - MDCache *cache; -public: - C_MDC_RejoinGatherFinish(MDCache *c) : cache(c) {} - void finish(int r) { - cache->rejoin_gather_finish(); - } -}; - - - -void MDCache::rejoin_gather_finish() -{ - dout(10) << "rejoin_gather_finish" << dendl; - assert(mds->is_rejoin()); - - rejoin_trim_undef_inodes(); - - // fetch paths? - // do this before ack, since some inodes we may have already gotten - // from surviving MDSs. 
- if (!cap_import_paths.empty()) { - C_Gather *gather = parallel_fetch(cap_import_paths); - if (gather) { - gather->set_finisher(new C_MDC_RejoinGatherFinish(this)); - return; - } - } - - // process cap imports - // ino -> client -> frommds -> capex - for (map > >::iterator p = cap_imports.begin(); - p != cap_imports.end(); - ++p) { - CInode *in = get_inode(p->first); - assert(in); - mds->server->add_reconnected_cap_inode(in); - for (map >::iterator q = p->second.begin(); - q != p->second.end(); - ++q) - for (map::iterator r = q->second.begin(); - r != q->second.end(); - ++r) - if (r->first >= 0) - rejoin_import_cap(in, q->first, r->second, r->first); - } - - mds->server->process_reconnected_caps(); - - rejoin_send_acks(); - - // did we already get our acks too? - // this happens when the rejoin_gather has to wait on a MISSING/FULL exchange. - if (rejoin_ack_gather.empty()) - mds->rejoin_done(); -} - -void MDCache::rejoin_import_cap(CInode *in, int client, inode_caps_reconnect_t& icr, int frommds) -{ - dout(10) << "rejoin_import_cap for client" << client << " from mds" << frommds - << " on " << *in << dendl; - - // add cap - in->reconnect_cap(client, icr); - - // send REAP - // FIXME client session weirdness. - MClientFileCaps *reap = new MClientFileCaps(MClientFileCaps::OP_IMPORT, - in->inode, - in->client_caps[client].get_last_seq(), - in->client_caps[client].pending(), - in->client_caps[client].wanted()); - - reap->set_mds( frommds ); // reap from whom? 
- mds->messenger->send_message(reap, - mds->clientmap.get_inst(client), - 0, MDS_PORT_CACHE); -} - -void MDCache::rejoin_send_acks() -{ - dout(7) << "rejoin_send_acks" << dendl; - - // send acks to everyone in the recovery set - map ack; - for (set::iterator p = recovery_set.begin(); - p != recovery_set.end(); - ++p) - ack[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK); - - // walk subtrees - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - p++) { - CDir *dir = p->first; - if (!dir->is_auth()) continue; - dout(10) << "subtree " << *dir << dendl; - - // auth items in this subtree - list dq; - dq.push_back(dir); - - while (!dq.empty()) { - CDir *dir = dq.front(); - dq.pop_front(); - - // dir - for (map::iterator r = dir->replicas_begin(); - r != dir->replicas_end(); - ++r) - ack[r->first]->add_strong_dirfrag(dir->dirfrag(), r->second, dir->dir_rep); - - for (CDir::map_t::iterator q = dir->items.begin(); - q != dir->items.end(); - ++q) { - CDentry *dn = q->second; - - // dentry - for (map::iterator r = dn->replicas_begin(); - r != dn->replicas_end(); - ++r) - ack[r->first]->add_strong_dentry(dir->dirfrag(), dn->name, - dn->is_primary() ? dn->get_inode()->ino():inodeno_t(0), - dn->is_remote() ? dn->get_remote_ino():inodeno_t(0), - dn->is_remote() ? dn->get_remote_d_type():0, - r->second, - dn->lock.get_replica_state()); - - if (!dn->is_primary()) continue; - - // inode - CInode *in = dn->inode; - - for (map::iterator r = in->replicas_begin(); - r != in->replicas_end(); - ++r) { - ack[r->first]->add_full_inode(in->inode, in->symlink, in->dirfragtree); - ack[r->first]->add_strong_inode(in->ino(), r->second, 0, - in->authlock.get_replica_state(), - in->linklock.get_replica_state(), - in->dirfragtreelock.get_replica_state(), - in->filelock.get_replica_state(), - in->dirlock.get_replica_state()); - } - - // subdirs in this subtree? 
- in->get_nested_dirfrags(dq); - } - } - } - - // root inodes too - if (root) - for (map::iterator r = root->replicas_begin(); - r != root->replicas_end(); - ++r) { - ack[r->first]->add_full_inode(root->inode, root->symlink, root->dirfragtree); - ack[r->first]->add_strong_inode(root->ino(), r->second, 0, - root->authlock.get_replica_state(), - root->linklock.get_replica_state(), - root->dirfragtreelock.get_replica_state(), - root->filelock.get_replica_state(), - root->dirlock.get_replica_state()); - } - if (stray) - for (map::iterator r = stray->replicas_begin(); - r != stray->replicas_end(); - ++r) { - ack[r->first]->add_full_inode(stray->inode, stray->symlink, stray->dirfragtree); - ack[r->first]->add_strong_inode(stray->ino(), r->second, 0, - stray->authlock.get_replica_state(), - stray->linklock.get_replica_state(), - stray->dirfragtreelock.get_replica_state(), - stray->filelock.get_replica_state(), - stray->dirlock.get_replica_state()); - } - - // send acks - for (map::iterator p = ack.begin(); - p != ack.end(); - ++p) - mds->send_message_mds(p->second, p->first, MDS_PORT_CACHE); - -} - - - -// =============================================================================== - - -void MDCache::set_root(CInode *in) -{ - assert(root == 0); - root = in; - base_inodes.insert(in); -} - - - - - - -// ************** -// Inode purging -- reliably removing deleted file's objects - -class C_MDC_PurgeFinish : public Context { - MDCache *mdc; - CInode *in; - off_t newsize, oldsize; -public: - C_MDC_PurgeFinish(MDCache *c, CInode *i, off_t ns, off_t os) : - mdc(c), in(i), newsize(ns), oldsize(os) {} - void finish(int r) { - mdc->purge_inode_finish(in, newsize, oldsize); - } -}; -class C_MDC_PurgeFinish2 : public Context { - MDCache *mdc; - CInode *in; - off_t newsize, oldsize; -public: - C_MDC_PurgeFinish2(MDCache *c, CInode *i, off_t ns, off_t os) : - mdc(c), in(i), newsize(ns), oldsize(os) {} - void finish(int r) { - mdc->purge_inode_finish_2(in, newsize, oldsize); - } -}; 
- -/* purge_inode in - * will be called by on unlink or rmdir or truncate or purge - * caller responsible for journaling a matching EUpdate - */ -void MDCache::purge_inode(CInode *in, off_t newsize, off_t oldsize, LogSegment *ls) -{ - dout(10) << "purge_inode " << oldsize << " -> " << newsize - << " on " << *in - << dendl; - - assert(oldsize >= newsize); - - purging[in][newsize] = oldsize; - purging_ls[in][newsize] = ls; - ls->purging_inodes[in][newsize] = oldsize; - - _do_purge_inode(in, newsize, oldsize); -} - -void MDCache::_do_purge_inode(CInode *in, off_t newsize, off_t oldsize) -{ - in->get(CInode::PIN_PURGING); - - // remove - if (in->inode.size > 0) { - mds->filer->remove(in->inode, newsize, oldsize, - 0, new C_MDC_PurgeFinish(this, in, newsize, oldsize)); - } else { - // no need, empty file, just log it - purge_inode_finish(in, newsize, oldsize); - } -} - -void MDCache::purge_inode_finish(CInode *in, off_t newsize, off_t oldsize) -{ - dout(10) << "purge_inode_finish " << oldsize << " -> " << newsize - << " on " << *in << dendl; - - // log completion - mds->mdlog->submit_entry(new EPurgeFinish(in->ino(), newsize, oldsize), - new C_MDC_PurgeFinish2(this, in, newsize, oldsize)); -} - -void MDCache::purge_inode_finish_2(CInode *in, off_t newsize, off_t oldsize) -{ - dout(10) << "purge_inode_finish_2 " << oldsize << " -> " << newsize - << " on " << *in << dendl; - - // remove from purging list - LogSegment *ls = purging_ls[in][newsize]; - purging[in].erase(newsize); - purging_ls[in].erase(newsize); - if (purging[in].empty()) { - purging.erase(in); - purging_ls.erase(in); - } - - assert(ls->purging_inodes.count(in)); - assert(ls->purging_inodes[in].count(newsize)); - assert(ls->purging_inodes[in][newsize] == oldsize); - ls->purging_inodes[in].erase(newsize); - if (ls->purging_inodes[in].empty()) - ls->purging_inodes.erase(in); - - in->put(CInode::PIN_PURGING); - - // tell anyone who cares (log flusher?) 
- if (purging.count(in) == 0 || - purging[in].rbegin()->first < newsize) { - list ls; - ls.swap(waiting_for_purge[in][newsize]); - waiting_for_purge[in].erase(newsize); - if (waiting_for_purge[in].empty()) - waiting_for_purge.erase(in); - finish_contexts(ls, 0); - } - - // done with inode? - if (in->get_num_ref() == 0) - remove_inode(in); -} - -void MDCache::add_recovered_purge(CInode *in, off_t newsize, off_t oldsize, LogSegment *ls) -{ - assert(purging[in].count(newsize) == 0); - purging[in][newsize] = oldsize; - purging_ls[in][newsize] = ls; - ls->purging_inodes[in][newsize] = oldsize; -} - -void MDCache::remove_recovered_purge(CInode *in, off_t newsize, off_t oldsize) -{ - purging[in].erase(newsize); -} - -void MDCache::start_recovered_purges() -{ - dout(10) << "start_recovered_purges (" << purging.size() << " purges)" << dendl; - - for (map >::iterator p = purging.begin(); - p != purging.end(); - ++p) { - for (map::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - dout(10) << "start_recovered_purges " - << q->second << " -> " << q->first - << " on " << *p->first - << dendl; - _do_purge_inode(p->first, q->first, q->second); - } - } -} - - - -// ================================================================================ -// cache trimming - - -/* - * note: only called while MDS is active or stopping... NOT during recovery. - * however, we may expire a replica whose authority is recovering. - * - */ -bool MDCache::trim(int max) -{ - // trim LRU - if (max < 0) { - max = lru.lru_get_max(); - if (!max) return false; - } - dout(7) << "trim max=" << max << " cur=" << lru.lru_get_size() << dendl; - - map expiremap; - - // trim dentries from the LRU - while (lru.lru_get_size() > (unsigned)max) { - CDentry *dn = (CDentry*)lru.lru_expire(); - if (!dn) break; - trim_dentry(dn, expiremap); - } - - // trim base inodes? 
- if (max == 0) { - set::iterator p = base_inodes.begin(); - while (p != base_inodes.end()) { - CInode *in = *p++; - list ls; - in->get_dirfrags(ls); - for (list::iterator p = ls.begin(); p != ls.end(); ++p) { - CDir *dir = *p; - if (dir->get_num_ref() == 1) // subtree pin - trim_dirfrag(dir, 0, expiremap); - } - if (in->get_num_ref() == 0) - trim_inode(0, in, 0, expiremap); - } - } - - // send any expire messages - send_expire_messages(expiremap); - - return true; -} - -void MDCache::send_expire_messages(map& expiremap) -{ - // send expires - for (map::iterator it = expiremap.begin(); - it != expiremap.end(); - it++) { - dout(7) << "sending cache_expire to " << it->first << dendl; - mds->send_message_mds(it->second, it->first, MDS_PORT_CACHE); - } -} - - -void MDCache::trim_dentry(CDentry *dn, map& expiremap) -{ - dout(12) << "trim_dentry " << *dn << dendl; - - CDir *dir = dn->get_dir(); - assert(dir); - - CDir *con = get_subtree_root(dir); - assert(con); - - dout(12) << " in container " << *con << dendl; - - // notify dentry authority? - if (!dn->is_auth()) { - pair auth = dn->authority(); - - for (int p=0; p<2; p++) { - int a = auth.first; - if (p) a = auth.second; - if (a < 0 || (p == 1 && auth.second == auth.first)) break; - if (mds->get_nodeid() == auth.second && - con->is_importing()) break; // don't send any expire while importing. - if (a == mds->get_nodeid()) continue; // on export, ignore myself. - - dout(12) << " sending expire to mds" << a << " on " << *dn << dendl; - assert(a != mds->get_nodeid()); - if (expiremap.count(a) == 0) - expiremap[a] = new MCacheExpire(mds->get_nodeid()); - expiremap[a]->add_dentry(con->dirfrag(), dir->dirfrag(), dn->get_name(), dn->get_replica_nonce()); - } - } - - // adjust the dir state - // NOTE: we can safely remove a clean, null dentry without effecting - // directory completeness. - // (do this _before_ we unlink the inode, below!) 
- if (!(dn->is_null() && dn->is_clean())) - dir->state_clear(CDir::STATE_COMPLETE); - - // unlink the dentry - if (dn->is_remote()) { - // just unlink. - dir->unlink_inode(dn); - } - else if (dn->is_primary()) { - // expire the inode, too. - CInode *in = dn->get_inode(); - assert(in); - trim_inode(dn, in, con, expiremap); - } - else { - assert(dn->is_null()); - } - - // remove dentry - dir->remove_dentry(dn); - - // reexport? - if (dir->get_size() == 0 && dir->is_subtree_root()) - migrator->export_empty_import(dir); - - if (mds->logger) mds->logger->inc("cex"); -} - - -void MDCache::trim_dirfrag(CDir *dir, CDir *con, map& expiremap) -{ - dout(15) << "trim_dirfrag " << *dir << dendl; - - if (dir->is_subtree_root()) { - assert(!dir->is_auth() || - (!dir->is_replicated() && dir->inode->is_base())); - remove_subtree(dir); // remove from subtree map - } - assert(dir->get_num_ref() == 0); - - CInode *in = dir->get_inode(); - - if (!dir->is_auth()) { - pair auth = dir->authority(); - - // was this an auth delegation? (if so, slightly modified container) - dirfrag_t condf; - if (dir->is_subtree_root()) { - dout(12) << " subtree root, container is " << *dir << dendl; - con = dir; - condf = dir->dirfrag(); - } else { - condf = con->dirfrag(); - } - - for (int p=0; p<2; p++) { - int a = auth.first; - if (p) a = auth.second; - if (a < 0 || (p == 1 && auth.second == auth.first)) break; - if (mds->get_nodeid() == auth.second && - con->is_importing()) break; // don't send any expire while importing. - if (a == mds->get_nodeid()) continue; // on export, ignore myself. 
- - dout(12) << " sending expire to mds" << a << " on " << *dir << dendl; - assert(a != mds->get_nodeid()); - if (expiremap.count(a) == 0) - expiremap[a] = new MCacheExpire(mds->get_nodeid()); - expiremap[a]->add_dir(condf, dir->dirfrag(), dir->replica_nonce); - } - } - - in->close_dirfrag(dir->dirfrag().frag); -} - -void MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map& expiremap) -{ - dout(15) << "trim_inode " << *in << dendl; - assert(in->get_num_ref() == 0); - - // DIR - list dfls; - in->get_dirfrags(dfls); - for (list::iterator p = dfls.begin(); - p != dfls.end(); - ++p) - trim_dirfrag(*p, con ? con:*p, expiremap); // if no container (e.g. root dirfrag), use *p - - // INODE - if (!in->is_auth()) { - pair auth = in->authority(); - - dirfrag_t df; - if (con) - df = con->dirfrag(); - else - df = dirfrag_t(0,frag_t()); // must be a root or stray inode. - - for (int p=0; p<2; p++) { - int a = auth.first; - if (p) a = auth.second; - if (a < 0 || (p == 1 && auth.second == auth.first)) break; - if (con && mds->get_nodeid() == auth.second && - con->is_importing()) break; // don't send any expire while importing. - if (a == mds->get_nodeid()) continue; // on export, ignore myself. - - dout(12) << " sending expire to mds" << a << " on " << *in << dendl; - assert(a != mds->get_nodeid()); - if (expiremap.count(a) == 0) - expiremap[a] = new MCacheExpire(mds->get_nodeid()); - expiremap[a]->add_inode(df, in->ino(), in->get_replica_nonce()); - } - } - - /* - if (in->is_auth()) { - if (in->hack_accessed) - mds->logger->inc("outt"); - else { - mds->logger->inc("outut"); - mds->logger->favg("oututl", g_clock.now() - in->hack_load_stamp); - } - } - */ - - // unlink - if (dn) - dn->get_dir()->unlink_inode(dn); - remove_inode(in); -} - - -/** - * trim_non_auth - remove any non-auth items from our cache - * - * this reduces the amount of non-auth metadata in our cache, reducing the - * load incurred by the rejoin phase. 
- * - * the only non-auth items that remain are those that are needed to - * attach our own subtrees to the root. - * - * when we are done, all dentries will be in the top bit of the lru. - * - * why we have to do this: - * we may not have accurate linkage for non-auth items. which means we will - * know which subtree it falls into, and can not be sure to declare it to the - * correct authority. - */ -void MDCache::trim_non_auth() -{ - dout(7) << "trim_non_auth" << dendl; - - // temporarily pin all subtree roots - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - p++) - p->first->get(CDir::PIN_SUBTREETEMP); - - // note first auth item we see. - // when we see it the second time, stop. - CDentry *first_auth = 0; - - // trim non-auth items from the lru - while (lru.lru_get_size() > 0) { - CDentry *dn = (CDentry*)lru.lru_expire(); - if (!dn) break; - - if (dn->is_auth()) { - // add back into lru (at the top) - lru.lru_insert_top(dn); - - if (!first_auth) { - first_auth = dn; - } else { - if (first_auth == dn) - break; - } - } else { - // non-auth. expire. - CDir *dir = dn->get_dir(); - assert(dir); - - // unlink the dentry - dout(15) << "trim_non_auth removing " << *dn << dendl; - if (dn->is_remote()) { - dir->unlink_inode(dn); - } - else if (dn->is_primary()) { - CInode *in = dn->get_inode(); - list ls; - in->get_dirfrags(ls); - for (list::iterator p = ls.begin(); p != ls.end(); ++p) { - CDir *subdir = *p; - if (subdir->is_subtree_root()) - remove_subtree(subdir); - in->close_dirfrag(subdir->dirfrag().frag); - } - dir->unlink_inode(dn); - remove_inode(in); - } - else { - assert(dn->is_null()); - } - dir->remove_dentry(dn); - - // adjust the dir state - dir->state_clear(CDir::STATE_COMPLETE); // dir incomplete! - } - } - - if (lru.lru_get_size() == 0) { - // root, stray, etc.? 
- hash_map::iterator p = inode_map.begin(); - while (p != inode_map.end()) { - hash_map::iterator next = p; - ++next; - CInode *in = p->second; - if (!in->is_auth()) { - list ls; - in->get_dirfrags(ls); - for (list::iterator p = ls.begin(); - p != ls.end(); - ++p) { - assert((*p)->get_num_ref() == 0); - remove_subtree((*p)); - in->close_dirfrag((*p)->dirfrag().frag); - } - assert(in->get_num_ref() == 0); - remove_inode(in); - } - p = next; - } - } - - // move everything in the pintail to the top bit of the lru. - lru.lru_touch_entire_pintail(); - - // unpin all subtrees - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - p++) - p->first->put(CDir::PIN_SUBTREETEMP); - - show_subtrees(); -} - -void MDCache::handle_cache_expire(MCacheExpire *m) -{ - int from = m->get_from(); - - dout(7) << "cache_expire from mds" << from << dendl; - - if (mds->get_state() < MDSMap::STATE_REJOIN) { - delete m; - return; - } - - // loop over realms - for (map::iterator p = m->realms.begin(); - p != m->realms.end(); - ++p) { - // check container? - if (p->first.ino > 0) { - CInode *coni = get_inode(p->first.ino); - assert(coni); // we had better have this. - CDir *con = coni->get_approx_dirfrag(p->first.frag); - assert(con); - - if (!con->is_auth() || - (con->is_auth() && con->is_exporting() && - migrator->get_export_state(con) == Migrator::EXPORT_WARNING && - migrator->export_has_warned(con,from))) { - // not auth. 
- dout(7) << "delaying nonauth|warned expires for " << *con << dendl; - assert(con->is_frozen_tree_root()); - - // make a message container - if (delayed_expire[con].count(from) == 0) - delayed_expire[con][from] = new MCacheExpire(from); - - // merge these expires into it - delayed_expire[con][from]->add_realm(p->first, p->second); - continue; - } - dout(7) << "expires for " << *con << dendl; - } else { - dout(7) << "containerless expires (root, stray inodes)" << dendl; - } - - // INODES - for (map::iterator it = p->second.inodes.begin(); - it != p->second.inodes.end(); - it++) { - CInode *in = get_inode(it->first); - int nonce = it->second; - - if (!in) { - dout(0) << " inode expire on " << it->first << " from " << from - << ", don't have it" << dendl; - assert(in); - } - assert(in->is_auth()); - - // check nonce - if (nonce == in->get_replica_nonce(from)) { - // remove from our cached_by - dout(7) << " inode expire on " << *in << " from mds" << from - << " cached_by was " << in->get_replicas() << dendl; - inode_remove_replica(in, from); - } - else { - // this is an old nonce, ignore expire. - dout(7) << " inode expire on " << *in << " from mds" << from - << " with old nonce " << nonce - << " (current " << in->get_replica_nonce(from) << "), dropping" - << dendl; - assert(in->get_replica_nonce(from) > nonce); - } - } - - // DIRS - for (map::iterator it = p->second.dirs.begin(); - it != p->second.dirs.end(); - it++) { - CDir *dir = get_dirfrag(it->first); - int nonce = it->second; - - if (!dir) { - dout(0) << " dir expire on " << it->first << " from " << from - << ", don't have it" << dendl; - assert(dir); - } - assert(dir->is_auth()); - - // check nonce - if (nonce == dir->get_replica_nonce(from)) { - // remove from our cached_by - dout(7) << " dir expire on " << *dir << " from mds" << from - << " replicas was " << dir->replica_map << dendl; - dir->remove_replica(from); - } - else { - // this is an old nonce, ignore expire. 
- dout(7) << " dir expire on " << *dir << " from mds" << from - << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from) - << "), dropping" << dendl; - assert(dir->get_replica_nonce(from) > nonce); - } - } - - // DENTRIES - for (map >::iterator pd = p->second.dentries.begin(); - pd != p->second.dentries.end(); - ++pd) { - dout(10) << " dn expires in dir " << pd->first << dendl; - CInode *diri = get_inode(pd->first.ino); - assert(diri); - CDir *dir = diri->get_dirfrag(pd->first.frag); - - if (!dir) { - dout(0) << " dn expires on " << pd->first << " from " << from - << ", must have refragmented" << dendl; - } else { - assert(dir->is_auth()); - } - - for (map::iterator p = pd->second.begin(); - p != pd->second.end(); - ++p) { - int nonce = p->second; - CDentry *dn; - - if (dir) { - dn = dir->lookup(p->first); - } else { - // which dirfrag for this dentry? - CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p->first)); - assert(dir->is_auth()); - dn = dir->lookup(p->first); - } - - if (!dn) - dout(0) << " missing dentry for " << p->first << " in " << *dir << dendl; - assert(dn); - - if (nonce == dn->get_replica_nonce(from)) { - dout(7) << " dentry_expire on " << *dn << " from mds" << from << dendl; - dentry_remove_replica(dn, from); - } - else { - dout(7) << " dentry_expire on " << *dn << " from mds" << from - << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from) - << "), dropping" << dendl; - assert(dn->get_replica_nonce(from) > nonce); - } - } - } - } - - - // done - delete m; -} - -void MDCache::process_delayed_expire(CDir *dir) -{ - dout(7) << "process_delayed_expire on " << *dir << dendl; - for (map::iterator p = delayed_expire[dir].begin(); - p != delayed_expire[dir].end(); - ++p) - handle_cache_expire(p->second); - delayed_expire.erase(dir); -} - -void MDCache::discard_delayed_expire(CDir *dir) -{ - dout(7) << "discard_delayed_expire on " << *dir << dendl; - for (map::iterator p = delayed_expire[dir].begin(); - p != 
delayed_expire[dir].end(); - ++p) - delete p->second; - delayed_expire.erase(dir); -} - -void MDCache::inode_remove_replica(CInode *in, int from) -{ - in->remove_replica(from); - in->mds_caps_wanted.erase(from); - - // note: this code calls _eval more often than it needs to! - // fix lock - if (in->authlock.remove_replica(from)) mds->locker->simple_eval_gather(&in->authlock); - if (in->linklock.remove_replica(from)) mds->locker->simple_eval_gather(&in->linklock); - if (in->dirfragtreelock.remove_replica(from)) mds->locker->simple_eval_gather(&in->dirfragtreelock); - if (in->filelock.remove_replica(from)) mds->locker->file_eval_gather(&in->filelock); - if (in->dirlock.remove_replica(from)) mds->locker->scatter_eval_gather(&in->dirlock); - - // alone now? - /* - if (!in->is_replicated()) { - mds->locker->simple_eval_gather(&in->authlock); - mds->locker->simple_eval_gather(&in->linklock); - mds->locker->simple_eval_gather(&in->dirfragtreelock); - mds->locker->file_eval_gather(&in->filelock); - mds->locker->scatter_eval_gather(&in->dirlock); - } - */ -} - -void MDCache::dentry_remove_replica(CDentry *dn, int from) -{ - dn->remove_replica(from); - - // fix lock - if (dn->lock.remove_replica(from) || - !dn->is_replicated()) - mds->locker->simple_eval_gather(&dn->lock); -} - - - -// ========================================================================================= -// shutdown - -class C_MDC_ShutdownCheck : public Context { - MDCache *mdc; -public: - C_MDC_ShutdownCheck(MDCache *m) : mdc(m) {} - void finish(int) { - mdc->shutdown_check(); - } -}; - -void MDCache::shutdown_check() -{ - dout(0) << "shutdown_check at " << g_clock.now() << dendl; - - // cache - int o = g_conf.debug_mds; - g_conf.debug_mds = 10; - show_cache(); - g_conf.debug_mds = o; - mds->timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(this)); - - // this - dout(0) << "lru size now " << lru.lru_get_size() << dendl; - dout(0) << "log len " << mds->mdlog->get_num_events() << 
dendl; - - - if (mds->filer->is_active()) - dout(0) << "filer still active" << dendl; -} - -void MDCache::shutdown_start() -{ - dout(2) << "shutdown_start" << dendl; - - if (g_conf.mds_shutdown_check) - mds->timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(this)); - - // g_conf.debug_mds = 10; -} - - - -bool MDCache::shutdown_pass() -{ - dout(7) << "shutdown_pass" << dendl; - - if (mds->is_stopped()) { - dout(7) << " already shut down" << dendl; - show_cache(); - show_subtrees(); - return true; - } - - // flush batching eopens, so that we can properly expire them. - mds->server->journal_opens(); // hrm, this is sort of a hack. - - // flush what we can from the log - mds->mdlog->set_max_events(0); - mds->mdlog->trim(); - - if (mds->mdlog->get_num_segments() > 1) { - dout(7) << "still >1 segments, waiting for log to trim" << dendl; - return false; - } - - if (!shutdown_export_strays()) { - dout(7) << "waiting for strays to migrate" << dendl; - return false; - } - - // trim cache - trim(0); - dout(5) << "lru size now " << lru.lru_get_size() << dendl; - - // SUBTREES - if (!subtrees.empty() && - mds->get_nodeid() != 0 && - !migrator->is_exporting() //&& - //!migrator->is_importing() - ) { - dout(7) << "looking for subtrees to export to mds0" << dendl; - list ls; - for (map >::iterator it = subtrees.begin(); - it != subtrees.end(); - it++) { - CDir *dir = it->first; - if (dir->get_inode()->is_stray()) continue; - if (dir->is_frozen() || dir->is_freezing()) continue; - if (!dir->is_full_dir_auth()) continue; - ls.push_back(dir); - } - int max = 5; // throttle shutdown exports.. hack! 
- for (list::iterator p = ls.begin(); p != ls.end(); ++p) { - CDir *dir = *p; - int dest = dir->get_inode()->authority().first; - if (dest > 0 && !mds->mdsmap->is_active(dest)) dest = 0; - dout(7) << "sending " << *dir << " back to mds" << dest << dendl; - migrator->export_dir(dir, dest); - if (--max == 0) break; - } - } - - if (!shutdown_export_caps()) { - dout(7) << "waiting for residual caps to export" << dendl; - return false; - } - - // subtrees map not empty yet? - if (!subtrees.empty()) { - dout(7) << "still have " << num_subtrees() << " subtrees" << dendl; - show_subtrees(); - migrator->show_importing(); - migrator->show_exporting(); - if (!migrator->is_importing() && !migrator->is_exporting()) - show_cache(); - return false; - } - assert(subtrees.empty()); - assert(!migrator->is_exporting()); - assert(!migrator->is_importing()); - - - - // empty out stray contents - // FIXME - dout(7) << "FIXME: i need to empty out stray dir contents..." << dendl; - - // (only do this once!) - if (!mds->mdlog->is_capped()) { - dout(7) << "capping the log" << dendl; - mds->mdlog->cap(); - mds->mdlog->trim(); - } - - if (!mds->mdlog->empty()) { - dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events() - << " in " << mds->mdlog->get_num_segments() << " segments" << dendl; - return false; - } - - if (!did_shutdown_log_cap) { - // flush journal header - dout(7) << "writing header for (now-empty) journal" << dendl; - assert(mds->mdlog->empty()); - mds->mdlog->write_head(0); - // NOTE: filer active checker below will block us until this completes. - did_shutdown_log_cap = true; - return false; - } - - // filer active? - if (mds->filer->is_active()) { - dout(7) << "filer still active" << dendl; - return false; - } - - // trim what we can from the cache - if (lru.lru_get_size() > 0) { - dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << dendl; - show_cache(); - //dump(); - return false; - } - - // done! - dout(2) << "shutdown done." 
<< dendl; - return true; -} - -bool MDCache::shutdown_export_strays() -{ - if (mds->get_nodeid() == 0) return true; - if (!stray) return true; - - bool done = true; - static set exported_strays; - list dfs; - - stray->get_dirfrags(dfs); - while (!dfs.empty()) { - CDir *dir = dfs.front(); - dfs.pop_front(); - - if (!dir->is_complete()) { - dir->fetch(0); - done = false; - } - - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - p++) { - CDentry *dn = p->second; - if (dn->is_null()) continue; - done = false; - - // FIXME: we'll deadlock if a rename fails. - if (exported_strays.count(dn->get_inode()->ino()) == 0) { - exported_strays.insert(dn->get_inode()->ino()); - migrate_stray(dn, mds->get_nodeid(), 0); // send to root! - } - } - } - - return done; -} - -bool MDCache::shutdown_export_caps() -{ - // export caps? - // note: this runs more often than it should. - static bool exported_caps = false; - static set exported_caps_in; - if (!exported_caps) { - dout(7) << "searching for caps to export" << dendl; - exported_caps = true; - - list dirq; - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - if (exported_caps_in.count(p->first)) continue; - if (p->first->is_auth() || - p->first->is_ambiguous_auth()) - exported_caps = false; // we'll have to try again - else { - dirq.push_back(p->first); - exported_caps_in.insert(p->first); - } - } - while (!dirq.empty()) { - CDir *dir = dirq.front(); - dirq.pop_front(); - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - if (!dn->is_primary()) continue; - CInode *in = dn->get_inode(); - if (in->is_dir()) - in->get_nested_dirfrags(dirq); - if (in->is_any_caps() && !in->state_test(CInode::STATE_EXPORTINGCAPS)) - migrator->export_caps(in); - } - } - } - - return true; -} - - - - - -// ========= messaging ============== - - -void MDCache::dispatch(Message *m) -{ - switch (m->get_type()) { - - // RESOLVE - case 
MSG_MDS_RESOLVE: - handle_resolve((MMDSResolve*)m); - break; - case MSG_MDS_RESOLVEACK: - handle_resolve_ack((MMDSResolveAck*)m); - break; - - // REJOIN - case MSG_MDS_CACHEREJOIN: - handle_cache_rejoin((MMDSCacheRejoin*)m); - break; - - case MSG_MDS_DISCOVER: - handle_discover((MDiscover*)m); - break; - case MSG_MDS_DISCOVERREPLY: - handle_discover_reply((MDiscoverReply*)m); - break; - - /* - case MSG_MDS_INODEUPDATE: - handle_inode_update((MInodeUpdate*)m); - break; - */ - - case MSG_MDS_DIRUPDATE: - handle_dir_update((MDirUpdate*)m); - break; - - case MSG_MDS_CACHEEXPIRE: - handle_cache_expire((MCacheExpire*)m); - break; - - - - case MSG_MDS_DENTRYUNLINK: - handle_dentry_unlink((MDentryUnlink*)m); - break; - - - case MSG_MDS_FRAGMENTNOTIFY: - handle_fragment_notify((MMDSFragmentNotify*)m); - break; - - - - default: - dout(7) << "cache unknown message " << m->get_type() << dendl; - assert(0); - break; - } -} - - -/* path_traverse - * - * return values: - * <0 : traverse error (ENOTDIR, ENOENT, etc.) - * 0 : success - * >0 : delayed or forwarded - * - * onfail values: - * - * MDS_TRAVERSE_FORWARD - forward to auth (or best guess) - * MDS_TRAVERSE_DISCOVER - discover missing items. skip permission checks. - * MDS_TRAVERSE_DISCOVERXLOCK - discover XLOCKED items too (be careful!). 
- * MDS_TRAVERSE_FAIL - return an error - */ - -Context *MDCache::_get_waiter(MDRequest *mdr, Message *req) -{ - if (mdr) { - dout(20) << "_get_waiter retryrequest" << dendl; - return new C_MDS_RetryRequest(this, mdr); - } else { - dout(20) << "_get_waiter retrymessage" << dendl; - return new C_MDS_RetryMessage(mds, req); - } -} - -int MDCache::path_traverse(MDRequest *mdr, Message *req, // who - filepath& origpath, // what - vector& trace, // result - bool follow_trailing_symlink, // how - int onfail) -{ - assert(mdr || req); - bool null_okay = (onfail == MDS_TRAVERSE_DISCOVERXLOCK); - bool noperm = (onfail == MDS_TRAVERSE_DISCOVER || - onfail == MDS_TRAVERSE_DISCOVERXLOCK); - - // keep a list of symlinks we touch to avoid loops - set< pair > symlinks_resolved; - - // root - CInode *cur = get_inode(origpath.get_ino()); - if (cur == NULL) { - dout(7) << "traverse: opening base ino " << origpath.get_ino() << dendl; - if (origpath.get_ino() == MDS_INO_ROOT) - open_root(_get_waiter(mdr, req)); - else if (MDS_INO_IS_STRAY(origpath.get_ino())) - open_foreign_stray(origpath.get_ino() - MDS_INO_STRAY_OFFSET, _get_waiter(mdr, req)); - else { - assert(0); // hrm.. broken - return -EIO; - } - return 1; - } - - if (mds->logger) mds->logger->inc("t"); - - // start trace - trace.clear(); - - // make our own copy, since we'll modify when we hit symlinks - filepath path = origpath; - - unsigned depth = 0; - while (depth < path.depth()) { - dout(12) << "traverse: path seg depth " << depth << " = " << path[depth] << dendl; - - if (!cur->is_dir()) { - dout(7) << "traverse: " << *cur << " not a dir " << dendl; - return -ENOTDIR; - } - - // open dir - frag_t fg = cur->pick_dirfrag(path[depth]); - CDir *curdir = cur->get_dirfrag(fg); - if (!curdir) { - if (cur->is_auth()) { - // parent dir frozen_dir? 
- if (cur->is_frozen_dir()) { - dout(7) << "traverse: " << *cur->get_parent_dir() << " is frozen_dir, waiting" << dendl; - cur->get_parent_dn()->get_dir()->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req)); - return 1; - } - curdir = cur->get_or_open_dirfrag(this, fg); - } else { - // discover? - dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl; - discover_path(cur, path.postfixpath(depth), _get_waiter(mdr, req), - onfail == MDS_TRAVERSE_DISCOVERXLOCK); - if (mds->logger) mds->logger->inc("tdis"); - return 1; - } - } - assert(curdir); - - // frozen? - /* - if (curdir->is_frozen()) { - // doh! - // FIXME: traverse is allowed? - dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl; - curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req)); - if (onfinish) delete onfinish; - return 1; - } - */ - - // must read directory hard data (permissions, x bit) to traverse - if (!noperm && - !mds->locker->simple_rdlock_try(&cur->authlock, 0)) { - dout(7) << "traverse: waiting on authlock rdlock on " << *cur << dendl; - cur->authlock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req)); - return 1; - } - - // check permissions? - // XXX - - // ..? - if (path[depth] == "..") { - trace.pop_back(); - depth++; - cur = cur->get_parent_inode(); - dout(10) << "traverse: following .. back to " << *cur << dendl; - continue; - } - - - // dentry - CDentry *dn = curdir->lookup(path[depth]); - - // null and last_bit and xlocked by me? - if (dn && dn->is_null() && null_okay) { - dout(10) << "traverse: hit null dentry at tail of traverse, succeeding" << dendl; - trace.push_back(dn); - break; // done! - } - - if (dn && !dn->is_null()) { - // dentry exists. xlocked? 
- if (!noperm && dn->lock.is_xlocked() && dn->lock.get_xlocked_by() != mdr) { - dout(10) << "traverse: xlocked dentry at " << *dn << dendl; - dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req)); - if (mds->logger) mds->logger->inc("tlock"); - return 1; - } - - // do we have inode? - if (!dn->inode) { - assert(dn->is_remote()); - // do i have it? - CInode *in = get_inode(dn->get_remote_ino()); - if (in) { - dout(7) << "linking in remote in " << *in << dendl; - dn->link_remote(in); - } else { - dout(7) << "remote link to " << dn->get_remote_ino() << ", which i don't have" << dendl; - assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal! - open_remote_ino(dn->get_remote_ino(), mdr, _get_waiter(mdr, req)); - if (mds->logger) mds->logger->inc("trino"); - return 1; - } - } - - // symlink? - if (dn->inode->is_symlink() && - (follow_trailing_symlink || depth < path.depth()-1)) { - // symlink, resolve! - filepath sym = dn->inode->symlink; - dout(10) << "traverse: hit symlink " << *dn->inode << " to " << sym << dendl; - - // break up path components - // /head/symlink/tail - filepath head = path.prefixpath(depth); - filepath tail = path.postfixpath(depth+1); - dout(10) << "traverse: path head = " << head << dendl; - dout(10) << "traverse: path tail = " << tail << dendl; - - if (symlinks_resolved.count(pair(dn->inode, tail.get_path()))) { - dout(10) << "already hit this symlink, bailing to avoid the loop" << dendl; - return -ELOOP; - } - symlinks_resolved.insert(pair(dn->inode, tail.get_path())); - - // start at root? 
- if (dn->inode->symlink[0] == '/') { - // absolute - trace.clear(); - depth = 0; - path = dn->inode->symlink; - path.append(tail); - dout(10) << "traverse: absolute symlink, path now " << path << " depth " << depth << dendl; - } else { - // relative - path = head; - path.append(sym); - path.append(tail); - dout(10) << "traverse: relative symlink, path now " << path << " depth " << depth << dendl; - } - continue; - } - - // forwarder wants replicas? - if (mdr && mdr->client_request && - mdr->client_request->get_mds_wants_replica_in_dirino()) { - dout(30) << "traverse: REP is here, " - << mdr->client_request->get_mds_wants_replica_in_dirino() - << " vs " << curdir->dirfrag() << dendl; - - if (mdr->client_request->get_mds_wants_replica_in_dirino() == curdir->ino() && - curdir->is_auth() && - curdir->is_rep() && - curdir->is_replica(req->get_source().num()) && - dn->is_auth() - ) { - assert(req->get_source().is_mds()); - int from = req->get_source().num(); - - if (dn->is_replica(from)) { - dout(15) << "traverse: REP would replicate to mds" << from << ", but already cached_by " - << req->get_source() << " dn " << *dn << dendl; - } else { - dout(10) << "traverse: REP replicating to " << req->get_source() << " dn " << *dn << dendl; - MDiscoverReply *reply = new MDiscoverReply(curdir->dirfrag()); - reply->add_dentry( dn->replicate_to( from ) ); - if (dn->is_primary()) - reply->add_inode( dn->inode->replicate_to( from ) ); - mds->send_message_mds(reply, req->get_source().num(), MDS_PORT_CACHE); - } - } - } - - // add to trace, continue. - trace.push_back(dn); - cur = dn->inode; - touch_inode(cur); - depth++; - continue; - } - - // MISS. dentry doesn't exist. - dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl; - - if (curdir->is_auth()) { - // dentry is mine. 
- if (curdir->is_complete()) { - // file not found - return -ENOENT; - } else { - // directory isn't complete; reload - dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl; - touch_inode(cur); - curdir->fetch(_get_waiter(mdr, req)); - if (mds->logger) mds->logger->inc("tdirf"); - return 1; - } - } else { - // dirfrag/dentry is not mine. - pair dauth = curdir->authority(); - - if ((onfail == MDS_TRAVERSE_DISCOVER || - onfail == MDS_TRAVERSE_DISCOVERXLOCK)) { - dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl; - discover_path(curdir, path.postfixpath(depth), _get_waiter(mdr, req), - onfail == MDS_TRAVERSE_DISCOVERXLOCK); - if (mds->logger) mds->logger->inc("tdis"); - return 1; - } - if (onfail == MDS_TRAVERSE_FORWARD) { - // forward - dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl; - - if (curdir->is_ambiguous_auth()) { - // wait - dout(7) << "traverse: waiting for single auth in " << *curdir << dendl; - curdir->add_waiter(CDir::WAIT_SINGLEAUTH, _get_waiter(mdr, req)); - return 1; - } - - dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl; - - // request replication? - if (mdr && mdr->client_request && curdir->is_rep()) { - dout(15) << "traverse: REP fw to mds" << dauth << ", requesting rep under " - << *curdir << " req " << *(MClientRequest*)req << dendl; - mdr->client_request->set_mds_wants_replica_in_dirino(curdir->ino()); - req->clear_payload(); // reencode! - } - - if (mdr) - request_forward(mdr, dauth.first, req->get_dest_port()); - else - mds->forward_message_mds(req, dauth.first, req->get_dest_port()); - - if (mds->logger) mds->logger->inc("tfw"); - return 2; - } - if (onfail == MDS_TRAVERSE_FAIL) - return -ENOENT; // not necessarily exactly true.... - } - - assert(0); // i shouldn't get here - } - - // success. 
- if (mds->logger) mds->logger->inc("thit"); - return 0; -} - -bool MDCache::path_is_mine(filepath& path) -{ - dout(15) << "path_is_mine " << path << dendl; - - // start at root. FIXME. - CInode *cur = root; - assert(cur); - - for (unsigned i=0; ipick_dirfrag(path[i]); - CDir *dir = cur->get_dirfrag(fg); - if (!dir) return cur->is_auth(); - CDentry *dn = dir->lookup(path[i]); - if (!dn) return dir->is_auth(); - assert(dn->is_primary()); - cur = dn->get_inode(); - } - - return cur->is_auth(); -} - -/** - * path_traverse_to_dir -- traverse to deepest dir we have - * - * @path - path to traverse (as far as we can) - * - * assumes we _don't_ have the full path. (if we do, we return NULL.) - */ -CDir *MDCache::path_traverse_to_dir(filepath& path) -{ - CInode *cur = root; - assert(cur); - for (unsigned i=0; ipick_dirfrag(path[i]); - CDir *dir = cur->get_or_open_dirfrag(this, fg); - CDentry *dn = dir->lookup(path[i]); - if (!dn) return dir; - assert(dn->is_primary()); - cur = dn->get_inode(); - } - - return NULL; // oh, we have the full path. -} - - -/** - * open_remote_dir -- open up a remote dirfrag - * - * @diri - base inode - * @approxfg - approximate fragment. - * @fin - completion callback - */ -void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, Context *fin) -{ - dout(10) << "open_remote_dir on " << *diri << dendl; - - assert(diri->is_dir()); - assert(!diri->is_auth()); - assert(diri->get_dirfrag(approxfg) == 0); - - int auth = diri->authority().first; - - if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) { - discover_dir_frag(diri, approxfg, fin); - } else { - // mds is down or recovering. forge a replica! - forge_replica_dir(diri, approxfg, auth); - } -} - - -/** - * get_dentry_inode - get or open inode - * - * @dn the dentry - * @mdr current request - * - * will return inode for primary, or link up/open up remote link's inode as necessary. 
- */ -CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequest *mdr) -{ - assert(!dn->is_null()); - - if (dn->is_primary()) - return dn->inode; - - assert(dn->is_remote()); - CInode *in = get_inode(dn->get_remote_ino()); - if (in) { - dout(7) << "get_dentry_inode linking in remote in " << *in << dendl; - dn->link_remote(in); - return in; - } else { - dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl; - open_remote_ino(dn->get_remote_ino(), mdr, new C_MDS_RetryRequest(this, mdr)); - return 0; - } -} - -class C_MDC_RetryOpenRemoteIno : public Context { - MDCache *mdcache; - inodeno_t ino; - MDRequest *mdr; - Context *onfinish; -public: - C_MDC_RetryOpenRemoteIno(MDCache *mdc, inodeno_t i, MDRequest *r, Context *c) : - mdcache(mdc), ino(i), mdr(r), onfinish(c) {} - void finish(int r) { - mdcache->open_remote_ino(ino, mdr, onfinish); - } -}; - - -class C_MDC_OpenRemoteIno : public Context { - MDCache *mdcache; - inodeno_t ino; - MDRequest *mdr; - Context *onfinish; -public: - vector anchortrace; - - C_MDC_OpenRemoteIno(MDCache *mdc, inodeno_t i, MDRequest *r, Context *c) : - mdcache(mdc), ino(i), mdr(r), onfinish(c) {} - C_MDC_OpenRemoteIno(MDCache *mdc, inodeno_t i, vector& at, - MDRequest *r, Context *c) : - mdcache(mdc), ino(i), mdr(r), onfinish(c), anchortrace(at) {} - - void finish(int r) { - assert(r == 0); - if (r == 0) - mdcache->open_remote_ino_2(ino, mdr, anchortrace, onfinish); - else { - onfinish->finish(r); - delete onfinish; - } - } -}; - -void MDCache::open_remote_ino(inodeno_t ino, - MDRequest *mdr, - Context *onfinish) -{ - dout(7) << "open_remote_ino on " << ino << dendl; - - C_MDC_OpenRemoteIno *c = new C_MDC_OpenRemoteIno(this, ino, mdr, onfinish); - mds->anchorclient->lookup(ino, c->anchortrace, c); -} - -void MDCache::open_remote_ino_2(inodeno_t ino, - MDRequest *mdr, - vector& anchortrace, - Context *onfinish) -{ - dout(7) << "open_remote_ino_2 on " << ino - << ", trace depth is " << anchortrace.size() << dendl; - - 
// find deepest cached inode in prefix - unsigned i = anchortrace.size(); // i := array index + 1 - CInode *in = 0; - while (1) { - // inode? - dout(10) << " " << i << ": " << anchortrace[i-1] << dendl; - in = get_inode(anchortrace[i-1].ino); - if (in) break; - i--; - if (!i) { - in = get_inode(anchortrace[i].dirfrag.ino); - assert(in); // actually, we may need to open the root or a foreign stray inode, here. - break; - } - } - dout(10) << "deepest cached inode at " << i << " is " << *in << dendl; - - if (in->ino() == ino) { - // success - dout(10) << "open_remote_ino_2 have " << *in << dendl; - onfinish->finish(0); - delete onfinish; - return; - } - - // open dirfrag beneath *in - frag_t frag = anchortrace[i].dirfrag.frag; - - if (!in->dirfragtree.contains(frag)) { - dout(10) << "frag " << frag << " not valid, requerying anchortable" << dendl; - open_remote_ino(ino, mdr, onfinish); - return; - } - - CDir *dir = in->get_dirfrag(frag); - - if (!dir && !in->is_auth()) { - dout(10) << "opening remote dirfrag " << frag << " under " << *in << dendl; - /* FIXME: we re-query the anchortable just to avoid a fragtree update race */ - open_remote_dirfrag(in, frag, - new C_MDC_RetryOpenRemoteIno(this, ino, mdr, onfinish)); - return; - } - - if (!dir && in->is_auth()) - dir = in->get_or_open_dirfrag(this, frag); - - assert(dir); - if (dir->is_auth()) { - if (dir->is_complete()) { - // hrm. requery anchor table. - dout(10) << "expected ino " << anchortrace[i].ino - << " in complete dir " << *dir - << ", requerying anchortable" - << dendl; - open_remote_ino(ino, mdr, onfinish); - } else { - dout(10) << "need ino " << anchortrace[i].ino - << ", fetching incomplete dir " << *dir - << dendl; - dir->fetch(new C_MDC_OpenRemoteIno(this, ino, anchortrace, mdr, onfinish)); - } - } else { - // hmm, discover. 
- dout(10) << "have remote dirfrag " << *dir << ", discovering " - << anchortrace[i].ino << dendl; - discover_ino(dir, anchortrace[i].ino, - new C_MDC_OpenRemoteIno(this, ino, anchortrace, mdr, onfinish)); - } -} - - - - -void MDCache::make_trace(vector& trace, CInode *in) -{ - CInode *parent = in->get_parent_inode(); - if (parent) { - make_trace(trace, parent); - - CDentry *dn = in->get_parent_dn(); - dout(15) << "make_trace adding " << *dn << dendl; - trace.push_back(dn); - } -} - - -MDRequest *MDCache::request_start(MClientRequest *req) -{ - // did we win a forward race against a slave? - if (active_requests.count(req->get_reqid())) { - MDRequest *mdr = active_requests[req->get_reqid()]; - if (mdr->is_slave()) { - dout(10) << "request_start already had " << *mdr << ", cleaning up" << dendl; - request_cleanup(mdr); - delete mdr; - } else { - dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl; - delete req; - return 0; - } - } - - // register new client request - MDRequest *mdr = new MDRequest(req->get_reqid(), req); - active_requests[req->get_reqid()] = mdr; - dout(7) << "request_start " << *mdr << dendl; - return mdr; -} - -MDRequest *MDCache::request_start_slave(metareqid_t ri, int by) -{ - MDRequest *mdr = new MDRequest(ri, by); - assert(active_requests.count(mdr->reqid) == 0); - active_requests[mdr->reqid] = mdr; - dout(7) << "request_start_slave " << *mdr << " by mds" << by << dendl; - return mdr; -} - - -MDRequest *MDCache::request_get(metareqid_t rid) -{ - assert(active_requests.count(rid)); - dout(7) << "request_get " << rid << " " << *active_requests[rid] << dendl; - return active_requests[rid]; -} - -void MDCache::request_finish(MDRequest *mdr) -{ - dout(7) << "request_finish " << *mdr << dendl; - - // slave finisher? 
- if (mdr->more()->slave_commit) { - mdr->more()->slave_commit->finish(0); - delete mdr->more()->slave_commit; - mdr->more()->slave_commit = 0; - } - - if (mdr->client_request && mds->logger) { - mds->logger->inc("reply"); - mds->logger->favg("replyl", g_clock.now() - mdr->client_request->get_recv_stamp()); - } - - delete mdr->client_request; - delete mdr->slave_request; - request_cleanup(mdr); -} - - -void MDCache::request_forward(MDRequest *mdr, int who, int port) -{ - if (!port) port = MDS_PORT_SERVER; - dout(7) << "request_forward " << *mdr << " to mds" << who << " req " << *mdr << dendl; - - mds->forward_message_mds(mdr->client_request, who, port); - request_cleanup(mdr); - - if (mds->logger) mds->logger->inc("fw"); -} - - -void MDCache::dispatch_request(MDRequest *mdr) -{ - if (mdr->client_request) { - mds->server->dispatch_client_request(mdr); - } else if (mdr->slave_request) { - mds->server->dispatch_slave_request(mdr); - } else - assert(0); -} - - - -void MDCache::request_forget_foreign_locks(MDRequest *mdr) -{ - // xlocks - set::iterator p = mdr->xlocks.begin(); - while (p != mdr->xlocks.end()) { - if ((*p)->get_parent()->is_auth()) - p++; - else { - dout(10) << "request_forget_foreign_locks " << **p - << " on " << *(*p)->get_parent() << dendl; - (*p)->put_xlock(); - mdr->locks.erase(*p); - mdr->xlocks.erase(p++); - } - } -} - -void MDCache::request_cleanup(MDRequest *mdr) -{ - dout(15) << "request_cleanup " << *mdr << dendl; - metareqid_t ri = mdr->reqid; - - // clear ref, trace - mdr->ref = 0; - mdr->trace.clear(); - - // clean up slaves - // (will implicitly drop remote dn pins) - for (set::iterator p = mdr->more()->slaves.begin(); - p != mdr->more()->slaves.end(); - ++p) { - MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_FINISH); - mds->send_message_mds(r, *p, MDS_PORT_SERVER); - } - // strip foreign xlocks out of lock lists, since the OP_FINISH drops them implicitly. 
- request_forget_foreign_locks(mdr); - - - // drop locks - mds->locker->drop_locks(mdr); - - // drop (local) auth pins - mdr->drop_local_auth_pins(); - - // drop stickydirs - for (set::iterator p = mdr->stickydirs.begin(); - p != mdr->stickydirs.end(); - ++p) - (*p)->put_stickydirs(); - - // drop cache pins - for (set::iterator it = mdr->pins.begin(); - it != mdr->pins.end(); - it++) - (*it)->put(MDSCacheObject::PIN_REQUEST); - mdr->pins.clear(); - - // remove from map - active_requests.erase(mdr->reqid); - delete mdr; - - - - - // log some stats ***** - if (mds->logger) { - mds->logger->set("c", lru.lru_get_size()); - mds->logger->set("cpin", lru.lru_get_num_pinned()); - mds->logger->set("ctop", lru.lru_get_top()); - mds->logger->set("cbot", lru.lru_get_bot()); - mds->logger->set("cptail", lru.lru_get_pintail()); - //mds->logger->set("buf",buffer_total_alloc); - } - - if (g_conf.log_pins) { - // pin - /* -for (int i=0; ilogger2) mds->logger2->set(cinode_pin_names[i], - cinode_pins[i]); - } - */ - /* - for (map::iterator it = cdir_pins.begin(); - it != cdir_pins.end(); - it++) { - //string s = "D"; - //s += cdir_pin_names[it->first]; - if (mds->logger2) mds->logger2->set(//s, - cdir_pin_names[it->first], - it->second); - } - */ - } - -} - - -// -------------------------------------------------------------------- -// ANCHORS - -// CREATE - -class C_MDC_AnchorCreatePrepared : public Context { - MDCache *cache; - CInode *in; -public: - version_t atid; - C_MDC_AnchorCreatePrepared(MDCache *c, CInode *i) : cache(c), in(i) {} - void finish(int r) { - cache->_anchor_create_prepared(in, atid); - } -}; - -void MDCache::anchor_create(MDRequest *mdr, CInode *in, Context *onfinish) -{ - assert(in->is_auth()); - - // auth pin - if (!in->can_auth_pin() && - !mdr->is_auth_pinned(in)) { - dout(7) << "anchor_create not authpinnable, waiting on " << *in << dendl; - in->add_waiter(CInode::WAIT_UNFREEZE, onfinish); - return; - } - - // wait - in->add_waiter(CInode::WAIT_ANCHORED, 
onfinish); - - // already anchoring? - if (in->state_test(CInode::STATE_ANCHORING)) { - dout(7) << "anchor_create already anchoring " << *in << dendl; - return; - } - - dout(7) << "anchor_create " << *in << dendl; - - // auth: do it - in->state_set(CInode::STATE_ANCHORING); - in->get(CInode::PIN_ANCHORING); - in->auth_pin(); - - // make trace - vector trace; - in->make_anchor_trace(trace); - - // do it - C_MDC_AnchorCreatePrepared *fin = new C_MDC_AnchorCreatePrepared(this, in); - mds->anchorclient->prepare_create(in->ino(), trace, &fin->atid, fin); -} - -class C_MDC_AnchorCreateLogged : public Context { - MDCache *cache; - CInode *in; - version_t atid; - LogSegment *ls; -public: - C_MDC_AnchorCreateLogged(MDCache *c, CInode *i, version_t t, LogSegment *s) : - cache(c), in(i), atid(t), ls(s) {} - void finish(int r) { - cache->_anchor_create_logged(in, atid, ls); - } -}; - -void MDCache::_anchor_create_prepared(CInode *in, version_t atid) -{ - dout(10) << "_anchor_create_prepared " << *in << " atid " << atid << dendl; - assert(in->inode.anchored == false); - - // update the logged inode copy - inode_t *pi = in->project_inode(); - pi->anchored = true; - pi->version = in->pre_dirty(); - - // note anchor transaction - EUpdate *le = new EUpdate(mds->mdlog, "anchor_create"); - le->metablob.add_dir_context(in->get_parent_dir()); - le->metablob.add_primary_dentry(in->parent, true, 0, pi); - le->metablob.add_anchor_transaction(atid); - mds->mdlog->submit_entry(le, new C_MDC_AnchorCreateLogged(this, in, atid, - mds->mdlog->get_current_segment())); -} - - -void MDCache::_anchor_create_logged(CInode *in, version_t atid, LogSegment *ls) -{ - dout(10) << "_anchor_create_logged on " << *in << dendl; - - // unpin - assert(in->state_test(CInode::STATE_ANCHORING)); - in->state_clear(CInode::STATE_ANCHORING); - in->put(CInode::PIN_ANCHORING); - in->auth_unpin(); - - // apply update to cache - in->pop_and_dirty_projected_inode(ls); - - // tell the anchortable we've committed - 
mds->anchorclient->commit(atid, ls); - - // trigger waiters - in->finish_waiting(CInode::WAIT_ANCHORED, 0); -} - - -// DESTROY - -class C_MDC_AnchorDestroyPrepared : public Context { - MDCache *cache; - CInode *in; -public: - version_t atid; - C_MDC_AnchorDestroyPrepared(MDCache *c, CInode *i) : cache(c), in(i) {} - void finish(int r) { - cache->_anchor_destroy_prepared(in, atid); - } -}; - -void MDCache::anchor_destroy(CInode *in, Context *onfinish) -{ - assert(in->is_auth()); - - // auth pin - if (!in->can_auth_pin()/* && - !mdr->is_auth_pinned(in)*/) { - dout(7) << "anchor_destroy not authpinnable, waiting on " << *in << dendl; - in->add_waiter(CInode::WAIT_UNFREEZE, onfinish); - return; - } - - // wait - if (onfinish) - in->add_waiter(CInode::WAIT_UNANCHORED, onfinish); - - // already anchoring? - if (in->state_test(CInode::STATE_UNANCHORING)) { - dout(7) << "anchor_destroy already unanchoring " << *in << dendl; - return; - } - - dout(7) << "anchor_destroy " << *in << dendl; - - // auth: do it - in->state_set(CInode::STATE_UNANCHORING); - in->get(CInode::PIN_UNANCHORING); - in->auth_pin(); - - // do it - C_MDC_AnchorDestroyPrepared *fin = new C_MDC_AnchorDestroyPrepared(this, in); - mds->anchorclient->prepare_destroy(in->ino(), &fin->atid, fin); -} - -class C_MDC_AnchorDestroyLogged : public Context { - MDCache *cache; - CInode *in; - version_t atid; - LogSegment *ls; -public: - C_MDC_AnchorDestroyLogged(MDCache *c, CInode *i, version_t t, LogSegment *l) : - cache(c), in(i), atid(t), ls(l) {} - void finish(int r) { - cache->_anchor_destroy_logged(in, atid, ls); - } -}; - -void MDCache::_anchor_destroy_prepared(CInode *in, version_t atid) -{ - dout(10) << "_anchor_destroy_prepared " << *in << " atid " << atid << dendl; - - assert(in->inode.anchored == true); - - // update the logged inode copy - inode_t *pi = in->project_inode(); - pi->anchored = true; - pi->version = in->pre_dirty(); - - // log + wait - EUpdate *le = new EUpdate(mds->mdlog, "anchor_destroy"); - 
le->metablob.add_dir_context(in->get_parent_dir()); - le->metablob.add_primary_dentry(in->parent, true, 0, pi); - le->metablob.add_anchor_transaction(atid); - mds->mdlog->submit_entry(le, new C_MDC_AnchorDestroyLogged(this, in, atid, mds->mdlog->get_current_segment())); -} - - -void MDCache::_anchor_destroy_logged(CInode *in, version_t atid, LogSegment *ls) -{ - dout(10) << "_anchor_destroy_logged on " << *in << dendl; - - // unpin - assert(in->state_test(CInode::STATE_UNANCHORING)); - in->state_clear(CInode::STATE_UNANCHORING); - in->put(CInode::PIN_UNANCHORING); - in->auth_unpin(); - - // apply update to cache - in->pop_and_dirty_projected_inode(ls); - - // tell the anchortable we've committed - mds->anchorclient->commit(atid, ls); - - // trigger waiters - in->finish_waiting(CInode::WAIT_UNANCHORED, 0); -} - - -// ------------------------------------------------------------------------------- -// STRAYS - -void MDCache::eval_stray(CDentry *dn) -{ - dout(10) << "eval_stray " << *dn << dendl; - assert(dn->is_primary()); - CInode *in = dn->inode; - assert(in); - - if (!dn->is_auth()) return; // has to be mine - - // purge? - if (in->inode.nlink == 0) { - if (dn->is_replicated() || in->is_any_caps()) return; // wait - if (!in->dirfrags.empty()) return; // wait for dirs to close/trim - _purge_stray(dn); - } - else if (in->inode.nlink == 1) { - // trivial reintegrate? - if (!in->remote_parents.empty()) { - CDentry *rlink = *in->remote_parents.begin(); - if (rlink->is_auth() && rlink->dir->can_auth_pin()) - reintegrate_stray(dn, rlink); - - if (!rlink->is_auth() && dn->is_auth()) - migrate_stray(dn, mds->get_nodeid(), rlink->authority().first); - } - } else { - // wait for next use. - } -} - -void MDCache::eval_remote(CDentry *dn) -{ - dout(10) << "eval_remote " << *dn << dendl; - assert(dn->is_remote()); - CInode *in = dn->get_inode(); - if (!in) return; - - // refers to stray? 
- if (in->get_parent_dn()->get_dir()->get_inode()->is_stray()) { - if (in->is_auth()) - eval_stray(in->get_parent_dn()); - else - migrate_stray(in->get_parent_dn(), in->authority().first, mds->get_nodeid()); - } -} - - -class C_MDC_PurgeStray : public Context { - MDCache *cache; - CDentry *dn; - version_t pdv; - LogSegment *ls; -public: - C_MDC_PurgeStray(MDCache *c, CDentry *d, version_t v, LogSegment *s) : - cache(c), dn(d), pdv(v), ls(s) { } - void finish(int r) { - cache->_purge_stray_logged(dn, pdv, ls); - } -}; - -void MDCache::_purge_stray(CDentry *dn) -{ - dout(10) << "_purge_stray " << *dn << " " << *dn->inode << dendl; - assert(!dn->is_replicated()); - - // log removal - version_t pdv = dn->pre_dirty(); - - EUpdate *le = new EUpdate(mds->mdlog, "purge_stray"); - le->metablob.add_dir_context(dn->dir); - le->metablob.add_null_dentry(dn, true); - le->metablob.add_inode_truncate(dn->inode->ino(), 0, dn->inode->inode.size); - - mds->mdlog->submit_entry(le, new C_MDC_PurgeStray(this, dn, pdv, mds->mdlog->get_current_segment())); - - -} - -void MDCache::_purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls) -{ - dout(10) << "_purge_stray_logged " << *dn << " " << *dn->inode << dendl; - CInode *in = dn->inode; - - // dirty+unlink dentry - dn->dir->mark_dirty(pdv, ls); - dn->dir->unlink_inode(dn); - dn->dir->remove_dentry(dn); - - // purge+remove inode - in->mark_clean(); - purge_inode(in, 0, in->inode.size, ls); -} - - - -void MDCache::reintegrate_stray(CDentry *straydn, CDentry *rdn) -{ - dout(10) << "reintegrate_stray " << *straydn << " into " << *rdn << dendl; - - // rename it to another mds. 
- filepath src; - straydn->make_path(src); - filepath dst; - rdn->make_path(dst); - - MClientRequest *req = new MClientRequest(MDS_OP_RENAME, mds->messenger->get_myinst()); - req->set_filepath(src); - req->set_filepath2(dst); - req->set_tid(mds->issue_tid()); - - mds->send_message_mds(req, rdn->authority().first, MDS_PORT_SERVER); -} - - -void MDCache::migrate_stray(CDentry *dn, int from, int to) -{ - dout(10) << "migrate_stray from mds" << from << " to mds" << to - << " " << *dn << " " << *dn->inode << dendl; - - // rename it to another mds. - string dname; - dn->get_inode()->name_stray_dentry(dname); - filepath src(dname, MDS_INO_STRAY(from)); - filepath dst(dname, MDS_INO_STRAY(to)); - - MClientRequest *req = new MClientRequest(MDS_OP_RENAME, mds->messenger->get_myinst()); - req->set_filepath(src); - req->set_filepath2(dst); - req->set_tid(mds->issue_tid()); - - mds->send_message_mds(req, to, MDS_PORT_SERVER); -} - - - - -// ======================================================================================== -// DISCOVER -/* - - - for all discovers (except base_inos, e.g. root, stray), waiters are attached - to the parent metadata object in the cache (pinning it). - - - the discover is also registered under the per-mds discover_ hashes, so that - waiters can be kicked in the event of a failure. that is, every discover will - be followed by a reply, unless the remote node fails.. - - - each discover_reply must reliably decrement the discover_ counts. - - - base_inos are the exception. those waiters are under waiting_for_base_ino. 
- -*/ - -void MDCache::discover_base_ino(inodeno_t want_ino, - Context *onfinish, - int from) -{ - dout(7) << "discover_base_ino " << want_ino << " from mds" << from << dendl; - if (waiting_for_base_ino[from].count(want_ino) == 0) { - filepath want_path; - MDiscover *dis = new MDiscover(mds->get_nodeid(), - want_ino, - want_path, - false); - mds->send_message_mds(dis, from, MDS_PORT_CACHE); - } - - waiting_for_base_ino[from][want_ino].push_back(onfinish); -} - - -void MDCache::discover_dir_frag(CInode *base, - frag_t approx_fg, - Context *onfinish, - int from) -{ - if (from < 0) from = base->authority().first; - - dout(7) << "discover_dir_frag " << base->ino() << " " << approx_fg - << " from mds" << from << dendl; - - if (!base->is_waiter_for(CInode::WAIT_DIR) || !onfinish) { // this is overly conservative - filepath want_path; - MDiscover *dis = new MDiscover(mds->get_nodeid(), - base->ino(), - want_path, - true); // need the base dir open - dis->set_base_dir_frag(approx_fg); - mds->send_message_mds(dis, from, MDS_PORT_CACHE); - } - - // register + wait - if (onfinish) - base->add_waiter(CInode::WAIT_DIR, onfinish); - discover_dir[from][base->ino()]++; -} - -void MDCache::discover_path(CInode *base, - filepath want_path, - Context *onfinish, - bool want_xlocked, - int from) -{ - if (from < 0) from = base->authority().first; - - dout(7) << "discover_path " << base->ino() << " " << want_path << " from mds" << from - << (want_xlocked ? " want_xlocked":"") - << dendl; - - if (base->is_ambiguous_auth()) { - dout(10) << " waiting for single auth on " << *base << dendl; - base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish); - return; - } - - if (!base->is_waiter_for(CInode::WAIT_DIR) || !onfinish) { // this is overly conservative - MDiscover *dis = new MDiscover(mds->get_nodeid(), - base->ino(), - want_path, - true, // we want the base dir; we are relative to ino. 
- want_xlocked); - mds->send_message_mds(dis, from, MDS_PORT_CACHE); - } - - // register + wait - if (onfinish) base->add_waiter(CInode::WAIT_DIR, onfinish); - discover_dir[from][base->ino()]++; -} - -void MDCache::discover_path(CDir *base, - filepath want_path, - Context *onfinish, - bool want_xlocked) -{ - int from = base->authority().first; - - dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " from mds" << from - << (want_xlocked ? " want_xlocked":"") - << dendl; - - if (base->is_ambiguous_auth()) { - dout(7) << " waiting for single auth on " << *base << dendl; - base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish); - return; - } - - if (!base->is_waiting_for_dentry(want_path[0]) || !onfinish) { - MDiscover *dis = new MDiscover(mds->get_nodeid(), - base->ino(), - want_path, - false, // no base dir; we are relative to dir - want_xlocked); - mds->send_message_mds(dis, from, MDS_PORT_CACHE); - } - - // register + wait - if (onfinish) base->add_dentry_waiter(want_path[0], onfinish); - discover_dir_sub[from][base->dirfrag()]++; -} - -void MDCache::discover_ino(CDir *base, - inodeno_t want_ino, - Context *onfinish, - bool want_xlocked) -{ - int from = base->authority().first; - - dout(7) << "discover_ino " << base->dirfrag() << " " << want_ino << " from mds" << from - << (want_xlocked ? 
" want_xlocked":"") - << dendl; - - if (!base->is_waiting_for_ino(want_ino)) { - MDiscover *dis = new MDiscover(mds->get_nodeid(), - base->dirfrag(), - want_ino, - want_xlocked); - mds->send_message_mds(dis, from, MDS_PORT_CACHE); - } - - // register + wait - base->add_ino_waiter(want_ino, onfinish); - discover_dir_sub[from][base->dirfrag()]++; -} - - - -void MDCache::kick_discovers(int who) -{ - list waiters; - - for (hash_map >::iterator p = waiting_for_base_ino[who].begin(); - p != waiting_for_base_ino[who].end(); - ++p) { - dout(10) << "kick_discovers on base ino " << p->first << dendl; - mds->queue_waiters(p->second); - } - waiting_for_base_ino.erase(who); - - for (hash_map::iterator p = discover_dir[who].begin(); - p != discover_dir[who].end(); - ++p) { - CInode *in = get_inode(p->first); - if (!in) continue; - dout(10) << "kick_discovers dir waiters on " << *in << dendl; - in->take_waiting(CInode::WAIT_DIR, waiters); - } - discover_dir.erase(who); - - for (hash_map::iterator p = discover_dir_sub[who].begin(); - p != discover_dir_sub[who].end(); - ++p) { - CDir *dir = get_dirfrag(p->first); - if (!dir) continue; - dout(10) << "kick_discovers dentry+ino waiters on " << *dir << dendl; - dir->take_sub_waiting(waiters); - } - discover_dir_sub.erase(who); - - mds->queue_waiters(waiters); -} - - - -void MDCache::handle_discover(MDiscover *dis) -{ - int whoami = mds->get_nodeid(); - - assert(dis->get_asker() != whoami); - - /* - if (mds->get_state() < MDSMap::STATE_ACTIVE) { - dout(-7) << "discover_reply NOT ACTIVE YET" << dendl; - delete dis; - return; - } - */ - - - CInode *cur = 0; - MDiscoverReply *reply = new MDiscoverReply(dis); - - // get started. 
- if (dis->get_base_ino() == MDS_INO_ROOT) { - // wants root - dout(7) << "handle_discover from mds" << dis->get_asker() - << " wants root + " << dis->get_want().get_path() << dendl; - - assert(mds->get_nodeid() == 0); - assert(root->is_auth()); - - // add root - reply->add_inode( root->replicate_to( dis->get_asker() ) ); - dout(10) << "added root " << *root << dendl; - - cur = root; - } - else if (dis->get_base_ino() == MDS_INO_STRAY(whoami)) { - // wants root - dout(7) << "handle_discover from mds" << dis->get_asker() - << " wants stray + " << dis->get_want().get_path() << dendl; - - reply->add_inode( stray->replicate_to( dis->get_asker() ) ); - dout(10) << "added stray " << *stray << dendl; - - cur = stray; - } - else { - // there's a base inode - cur = get_inode(dis->get_base_ino()); - - if (!cur) { - dout(7) << "handle_discover mds" << dis->get_asker() - << " don't have base ino " << dis->get_base_ino() - << dendl; - reply->set_flag_error_dir(); - } - - if (dis->wants_base_dir()) { - dout(7) << "handle_discover mds" << dis->get_asker() - << " wants basedir+" << dis->get_want().get_path() - << " has " << *cur - << dendl; - } else { - dout(7) << "handle_discover mds" << dis->get_asker() - << " wants " << dis->get_want().get_path() - << " has " << *cur - << dendl; - } - } - - assert(reply); - - // add content - // do some fidgeting to include a dir if they asked for the base dir, or just root. - for (unsigned i = 0; - cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0); - i++) { - - // -- figure out the dir - - // is *cur even a dir at all? 
- if (!cur->is_dir()) { - dout(7) << *cur << " not a dir" << dendl; - reply->set_flag_error_dir(); - break; - } - - // pick frag - frag_t fg; - if (dis->get_want().depth()) { - // dentry specifies - fg = cur->pick_dirfrag(dis->get_dentry(i)); - } else { - // requester explicity specified the frag - fg = dis->get_base_dir_frag(); - assert(dis->wants_base_dir() || dis->get_want_ino() || dis->get_base_ino() < MDS_INO_BASE); - } - CDir *curdir = cur->get_dirfrag(fg); - - if ((!curdir && !cur->is_auth()) || - (curdir && !curdir->is_auth())) { - - /* before: - * ONLY set flag if empty!! - * otherwise requester will wake up waiter(s) _and_ continue with discover, - * resulting in duplicate discovers in flight, - * which can wreak havoc when discovering rename srcdn (which may move) - */ - - if (reply->is_empty()) { - // only hint if empty. - // someday this could be better, but right now the waiter logic isn't smart enough. - - // hint - if (curdir) { - dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl; - reply->set_dir_auth_hint(curdir->authority().first); - } else { - dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for " - << *cur << dendl; - reply->set_dir_auth_hint(cur->authority().first); - } - - // note error dentry, if any - // NOTE: important, as it allows requester to issue an equivalent discover - // to whomever we hint at. - if (dis->get_want().depth() > i) - reply->set_error_dentry(dis->get_dentry(i)); - } - - break; - } - - // open dir? - if (!curdir) - curdir = cur->get_or_open_dirfrag(this, fg); - assert(curdir); - assert(curdir->is_auth()); - - // is dir frozen? 
- if (curdir->is_frozen()) { - if (reply->is_empty()) { - dout(7) << *curdir << " is frozen, empty reply, waiting" << dendl; - curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis)); - delete reply; - return; - } else { - dout(7) << *curdir << " is frozen, non-empty reply, stopping" << dendl; - break; - } - } - - // add dir - if (reply->is_empty() && !dis->wants_base_dir()) { - dout(7) << "handle_discover not adding unwanted base dir " << *curdir << dendl; - } else { - assert(!curdir->is_ambiguous_auth()); // would be frozen. - reply->add_dir( curdir->replicate_to(dis->get_asker()) ); - dout(7) << "handle_discover added dir " << *curdir << dendl; - } - - // lookup - CDentry *dn = 0; - if (dis->get_want_ino()) { - // lookup by ino - CInode *in = get_inode(dis->get_want_ino()); - if (in && in->is_auth() && in->get_parent_dn()->get_dir() == curdir) - dn = in->get_parent_dn(); - } else if (dis->get_want().depth() > 0) { - // lookup dentry - dn = curdir->lookup( dis->get_dentry(i) ); - } else - break; // done! - - // incomplete dir? - if (!dn) { - if (!curdir->is_complete()) { - // readdir - dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << dendl; - if (reply->is_empty()) { - // fetch and wait - curdir->fetch(new C_MDS_RetryMessage(mds, dis)); - return; - } else { - // initiate fetch, but send what we have so far - curdir->fetch(0); - break; - } - } - - // don't have wanted ino in this dir? - if (dis->get_want_ino()) { - // set error flag in reply - dout(7) << "ino " << dis->get_want_ino() << " in this dir, flagging error in " - << *curdir << dendl; - reply->set_flag_error_ino(); - break; - } - - // send null dentry - dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in " - << *curdir << dendl; - dn = curdir->add_null_dentry(dis->get_dentry(i)); - } - assert(dn); - - // xlocked dentry? 
- // ...always block on non-tail items (they are unrelated) - // ...allow xlocked tail disocvery _only_ if explicitly requested - bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1); - if (dn->lock.is_xlocked()) { - // is this the last (tail) item in the discover traversal? - if (tailitem && dis->wants_xlocked()) { - dout(7) << "handle_discover allowing discovery of xlocked tail " << *dn << dendl; - } else { - dout(7) << "handle_discover blocking on xlocked " << *dn << dendl; - dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis)); - delete reply; - return; - } - } - - // frozen inode? - if (dn->is_primary() && dn->inode->is_frozen()) { - if (tailitem && dis->wants_xlocked()) { - dout(7) << "handle_discover allowing discovery of frozen tail " << *dn->inode << dendl; - } else if (reply->is_empty()) { - dout(7) << *dn->inode << " is frozen, empty reply, waiting" << dendl; - dn->inode->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis)); - delete reply; - return; - } else { - dout(7) << *dn->inode << " is frozen, non-empty reply, stopping" << dendl; - break; - } - } - - // add dentry - reply->add_dentry( dn->replicate_to( dis->get_asker() ) ); - dout(7) << "handle_discover added dentry " << *dn << dendl; - - if (!dn->is_primary()) break; // stop on null or remote link. - - // add inode - CInode *next = dn->inode; - assert(next->is_auth()); - - reply->add_inode( next->replicate_to( dis->get_asker() ) ); - dout(7) << "handle_discover added inode " << *next << dendl; - - // descend, keep going. - cur = next; - continue; - } - - // how did we do? 
- assert(!reply->is_empty()); - dout(7) << "handle_discover sending result back to asker mds" << dis->get_asker() << dendl; - mds->send_message_mds(reply, dis->get_asker(), MDS_PORT_CACHE); - - delete dis; -} - - -void MDCache::handle_discover_reply(MDiscoverReply *m) -{ - /* - if (mds->get_state() < MDSMap::STATE_ACTIVE) { - dout(-7) << "discover_reply NOT ACTIVE YET" << dendl; - delete m; - return; - } - */ - - list finished, error; - int from = m->get_source().num(); - - // starting point - CInode *cur = get_inode(m->get_base_ino()); - - if (m->has_base_inode()) { - assert(m->get_base_ino() < MDS_INO_BASE); - assert(!m->has_base_dentry()); - assert(!m->has_base_dir()); - - // add base inode - cur = add_replica_inode(m->get_inode(0), NULL, finished); - - dout(7) << "discover_reply got base inode " << *cur << dendl; - - // take waiters - finished.swap(waiting_for_base_ino[from][cur->ino()]); - waiting_for_base_ino[from].erase(cur->ino()); - } - assert(cur); - - dout(7) << "discover_reply " << *cur - << " + " << m->get_num_dentries() << " dn, " - << m->get_num_inodes() << " inodes" - << dendl; - - // fyi - if (m->is_flag_error_dir()) - dout(7) << " flag error, dir" << dendl; - if (m->is_flag_error_dn()) - dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl; - if (m->is_flag_error_ino()) - dout(7) << " flag error, ino = " << m->get_wanted_ino() << dendl; - - dout(10) << "depth = " << m->get_depth() - << ", has base_dir/base_dn/root = " - << m->has_base_dir() << " / " << m->has_base_dentry() << " / " << m->has_base_inode() - << ", num dirs/dentries/inodes = " - << m->get_num_dirs() << " / " << m->get_num_dentries() << " / " << m->get_num_inodes() - << dendl; - - // decrement discover counters - if (m->get_wanted_base_dir()) { - inodeno_t ino = m->get_base_ino(); - assert(discover_dir[from].count(ino)); - if (--discover_dir[from][ino] == 0) - discover_dir[from].erase(ino); - } else if (m->get_base_ino() >= MDS_INO_BASE) { - dirfrag_t 
df(m->get_base_ino(), m->get_base_dir_frag()); - assert(discover_dir_sub[from].count(df)); - if (--discover_dir_sub[from][df] == 0) - discover_dir_sub[from].erase(df); - } - - // loop over discover results. - // indexes follow each ([[dir] dentry] inode) - // can start, end with any type. - for (int i=m->has_base_inode(); iget_depth(); i++) { - dout(10) << "discover_reply i=" << i << " cur " << *cur << dendl; - - // dir - frag_t fg; - CDir *curdir = 0; - if (i > 0 || m->has_base_dir()) { - assert(m->get_dir(i).get_dirfrag().ino == cur->ino()); - fg = m->get_dir(i).get_dirfrag().frag; - curdir = add_replica_dir(cur, fg, m->get_dir(i), - m->get_source().num(), - finished); - } - if (!curdir) { - fg = cur->pick_dirfrag(m->get_dentry(i).get_dname()); - curdir = cur->get_dirfrag(fg); - } - - // dentry error? - if (i == m->get_depth()-1 && (m->is_flag_error_dn() || m->is_flag_error_ino())) { - // error! - assert(cur->is_dir()); - if (curdir) { - if (m->get_error_dentry().length()) { - dout(7) << " flag_error on dentry " << m->get_error_dentry() - << ", triggering dentry" << dendl; - curdir->take_dentry_waiting(m->get_error_dentry(), error); - } else { - dout(7) << " flag_error on ino " << m->get_wanted_ino() - << ", triggering ino" << dendl; - curdir->take_ino_waiting(m->get_wanted_ino(), error); - } - } else { - dout(7) << " flag_error on dentry " << m->get_error_dentry() - << ", triggering dir?" << dendl; - cur->take_waiting(CInode::WAIT_DIR, error); - } - break; - } - - assert(curdir); - - // dentry - CDentry *dn = 0; - if (i >= m->get_last_dentry()) break; - if (i > 0 || m->has_base_dentry()) - dn = add_replica_dentry(curdir, m->get_dentry(i), finished); - - // inode - if (i >= m->get_last_inode()) break; - cur = add_replica_inode(m->get_inode(i), dn, finished); - } - - // dir error? - // or dir_auth hint? - if (m->is_flag_error_dir() && !cur->is_dir()) { - // not a dir. 
- cur->take_waiting(CInode::WAIT_DIR, error); - } else if (m->is_flag_error_dir() || - (m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN && - m->get_dir_auth_hint() != mds->get_nodeid())) { - int who = m->get_dir_auth_hint(); - if (who == mds->get_nodeid()) who = -1; - if (who >= 0) - dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl; - - // try again? - if (m->get_error_dentry().length()) { - // wanted a dentry - frag_t fg = cur->pick_dirfrag(m->get_error_dentry()); - CDir *dir = cur->get_dirfrag(fg); - if (dir) { - // don't actaully need the hint, now - if (dir->lookup(m->get_error_dentry()) == 0 && - dir->is_waiting_for_dentry(m->get_error_dentry())) - discover_path(dir, m->get_error_dentry(), 0, m->get_wanted_xlocked()); - else - dout(7) << " doing nothing, have dir but nobody is waiting on dentry " - << m->get_error_dentry() << dendl; - } else { - if (cur->is_waiter_for(CInode::WAIT_DIR)) - discover_path(cur, m->get_error_dentry(), 0, m->get_wanted_xlocked(), who); - else - dout(7) << " doing nothing, nobody is waiting for dir" << dendl; - } - } else { - // wanted just the dir - frag_t fg = m->get_base_dir_frag(); - if (cur->get_dirfrag(fg) == 0 && cur->is_waiter_for(CInode::WAIT_DIR)) - discover_dir_frag(cur, fg, 0, who); - else - dout(7) << " doing nothing, nobody is waiting for dir" << dendl; - } - } - - // waiters - finish_contexts(error, -ENOENT); // finish errors directly - mds->queue_waiters(finished); - - // done - delete m; -} - - - -// ---------------------------- -// REPLICAS - -CDir *MDCache::add_replica_dir(CInode *diri, - frag_t fg, CDirDiscover &dis, int from, - list& finished) -{ - // add it (_replica_) - CDir *dir = diri->get_dirfrag(fg); - - if (dir) { - // had replica. update w/ new nonce. 
- dis.update_dir(dir); - dout(7) << "add_replica_dir had " << *dir << " nonce " << dir->replica_nonce << dendl; - } else { - // force frag to leaf in the diri tree - if (!diri->dirfragtree.is_leaf(fg)) { - dout(7) << "add_replica_dir forcing frag " << fg << " to leaf in the fragtree " - << diri->dirfragtree << dendl; - diri->dirfragtree.force_to_leaf(fg); - } - - // add replica. - dir = diri->add_dirfrag( new CDir(diri, fg, this, false) ); - dis.update_dir(dir); - - // is this a dir_auth delegation boundary? - if (from != diri->authority().first || - diri->is_ambiguous_auth() || - diri->ino() < MDS_INO_BASE) - adjust_subtree_auth(dir, from); - - dout(7) << "add_replica_dir added " << *dir << " nonce " << dir->replica_nonce << dendl; - - // get waiters - diri->take_waiting(CInode::WAIT_DIR, finished); - } - - return dir; -} - -CDir *MDCache::forge_replica_dir(CInode *diri, frag_t fg, int from) -{ - assert(mds->mdsmap->get_state(from) < MDSMap::STATE_REJOIN); - - // forge a replica. - CDir *dir = diri->add_dirfrag( new CDir(diri, fg, this, false) ); - - // i'm assuming this is a subtree root. - adjust_subtree_auth(dir, from); - - dout(7) << "forge_replica_dir added " << *dir << " while mds" << from << " is down" << dendl; - - return dir; -} - -CDentry *MDCache::add_replica_dentry(CDir *dir, CDentryDiscover &dis, list& finished) -{ - CDentry *dn = dir->lookup( dis.get_dname() ); - - // have it? - if (dn) { - dis.update_dentry(dn); - dout(7) << "add_replica_dentry had " << *dn << dendl; - } else { - dn = dir->add_null_dentry(dis.get_dname()); - dis.update_dentry(dn); - dis.init_dentry_lock(dn); - dout(7) << "add_replica_dentry added " << *dn << dendl; - } - - // remote_ino linkage? - if (dis.get_remote_ino()) { - if (dn->is_null()) - dir->link_remote_inode(dn, dis.get_remote_ino(), dis.get_remote_d_type()); - - // hrm. yeah. 
- assert(dn->is_remote() && dn->get_remote_ino() == dis.get_remote_ino()); - } - - dir->take_dentry_waiting(dis.get_dname(), finished); - - return dn; -} - -CInode *MDCache::add_replica_inode(CInodeDiscover& dis, CDentry *dn, list& finished) -{ - CInode *in = get_inode(dis.get_ino()); - if (!in) { - in = new CInode(this, false); - dis.update_inode(in); - dis.init_inode_locks(in); - add_inode(in); - if (in->is_base()) { - if (in->ino() == MDS_INO_ROOT) - in->inode_auth.first = 0; - else if (MDS_INO_IS_STRAY(in->ino())) - in->inode_auth.first = in->ino() - MDS_INO_STRAY_OFFSET; - else - assert(0); - } - dout(10) << "add_replica_inode added " << *in << dendl; - if (dn && dn->is_null()) - dn->dir->link_primary_inode(dn, in); - } else { - dis.update_inode(in); - dout(10) << "add_replica_inode had " << *in << dendl; - } - - if (dn) { - assert(dn->is_primary()); - assert(dn->inode == in); - - dn->get_dir()->take_ino_waiting(in->ino(), finished); - } - - return in; -} - - -CDentry *MDCache::add_replica_stray(bufferlist &bl, CInode *in, int from) -{ - list finished; - int off = 0; - - // inode - CInodeDiscover indis; - indis._decode(bl, off); - CInode *strayin = add_replica_inode(indis, NULL, finished); - dout(15) << "strayin " << *strayin << dendl; - - // dir - CDirDiscover dirdis; - dirdis._decode(bl, off); - CDir *straydir = add_replica_dir(strayin, dirdis.get_dirfrag().frag, dirdis, - from, finished); - dout(15) << "straydir " << *straydir << dendl; - - // dentry - CDentryDiscover dndis; - dndis._decode(bl, off); - - string straydname; - in->name_stray_dentry(straydname); - CDentry *straydn = add_replica_dentry(straydir, dndis, finished); - - mds->queue_waiters(finished); - - return straydn; -} - - - -/* -int MDCache::send_inode_updates(CInode *in) -{ - assert(in->is_auth()); - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - dout(7) << "sending inode_update on " << *in << " to " << *it << dendl; - assert(*it != mds->get_nodeid()); 
- mds->send_message_mds(new MInodeUpdate(in, in->get_cached_by_nonce(*it)), *it, MDS_PORT_CACHE); - } - - return 0; -} - - -void MDCache::handle_inode_update(MInodeUpdate *m) -{ - inodeno_t ino = m->get_ino(); - CInode *in = get_inode(m->get_ino()); - if (!in) { - //dout(7) << "inode_update on " << m->get_ino() << ", don't have it, ignoring" << dendl; - dout(7) << "inode_update on " << m->get_ino() << ", don't have it, sending expire" << dendl; - MCacheExpire *expire = new MCacheExpire(mds->get_nodeid()); - expire->add_inode(m->get_ino(), m->get_nonce()); - mds->send_message_mds(expire, m->get_source().num(), MDS_PORT_CACHE); - goto out; - } - - if (in->is_auth()) { - dout(7) << "inode_update on " << *in << ", but i'm the authority!" << dendl; - assert(0); // this should never happen - } - - dout(7) << "inode_update on " << *in << dendl; - - // update! NOTE dir_auth is unaffected by this. - in->decode_basic_state(m->get_payload()); - - out: - // done - delete m; -} -*/ - - - - - - -int MDCache::send_dir_updates(CDir *dir, bool bcast) -{ - // this is an FYI, re: replication - - set who; - if (bcast) { - mds->get_mds_map()->get_active_mds_set(who); - } else { - for (map::iterator p = dir->replicas_begin(); - p != dir->replicas_end(); - ++p) - who.insert(p->first); - } - - dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl; - - filepath path; - dir->inode->make_path(path); - - int whoami = mds->get_nodeid(); - for (set::iterator it = who.begin(); - it != who.end(); - it++) { - if (*it == whoami) continue; - //if (*it == except) continue; - dout(7) << "sending dir_update on " << *dir << " to " << *it << dendl; - - mds->send_message_mds(new MDirUpdate(dir->dirfrag(), - dir->dir_rep, - dir->dir_rep_by, - path, - bcast), - *it, MDS_PORT_CACHE); - } - - return 0; -} - - -void MDCache::handle_dir_update(MDirUpdate *m) -{ - CDir *dir = get_dirfrag(m->get_dirfrag()); - if (!dir) { - dout(5) << "dir_update on " << m->get_dirfrag() << ", 
don't have it" << dendl; - - // discover it? - if (m->should_discover()) { - // only try once! - // this is key to avoid a fragtree update race, among other things. - m->tried_discover(); - vector trace; - filepath path = m->get_path(); - - dout(5) << "trying discover on dir_update for " << path << dendl; - - int r = path_traverse(0, m, - path, trace, true, - MDS_TRAVERSE_DISCOVER); - if (r > 0) - return; - assert(r == 0); - - CInode *in = get_inode(m->get_dirfrag().ino); - assert(in); - open_remote_dirfrag(in, m->get_dirfrag().frag, - new C_MDS_RetryMessage(mds, m)); - return; - } - - delete m; - return; - } - - // update - dout(5) << "dir_update on " << *dir << dendl; - dir->dir_rep = m->get_dir_rep(); - dir->dir_rep_by = m->get_dir_rep_by(); - - // done - delete m; -} - - - - - - -// UNLINK - -void MDCache::handle_dentry_unlink(MDentryUnlink *m) -{ - CDir *dir = get_dirfrag(m->get_dirfrag()); - - if (!dir) { - dout(7) << "handle_dentry_unlink don't have dirfrag " << m->get_dirfrag() << dendl; - } - else { - CDentry *dn = dir->lookup(m->get_dn()); - if (!dn) { - dout(7) << "handle_dentry_unlink don't have dentry " << *dir << " dn " << m->get_dn() << dendl; - } else { - dout(7) << "handle_dentry_unlink on " << *dn << dendl; - - // move to stray? - CDentry *straydn = 0; - if (m->strayin) { - list finished; - CInode *in = add_replica_inode(*m->strayin, NULL, finished); - CDir *dir = add_replica_dir(in, m->straydir->get_dirfrag().frag, *m->straydir, - m->get_source().num(), finished); - straydn = add_replica_dentry(dir, *m->straydn, finished); - if (!finished.empty()) mds->queue_waiters(finished); - } - - // open inode? 
- if (dn->is_primary()) { - CInode *in = dn->inode; - dn->dir->unlink_inode(dn); - assert(straydn); - straydn->dir->link_primary_inode(straydn, in); - - // send caps to auth (if we're not already) - if (in->is_any_caps() && - !in->state_test(CInode::STATE_EXPORTINGCAPS)) - migrator->export_caps(in); - - lru.lru_bottouch(straydn); // move stray to end of lru - - } else { - assert(dn->is_remote()); - dn->dir->unlink_inode(dn); - } - assert(dn->is_null()); - - // move to bottom of lru - lru.lru_bottouch(dn); - } - } - - delete m; - return; -} - - - - - - -// =================================================================== -// FRAGMENT - - -/** - * adjust_dir_fragments -- adjust fragmentation for a directory - * - * @diri - directory inode - * @basefrag - base fragment - * @bits - bit adjustment. positive for split, negative for merge. - */ -void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits, - list& resultfrags, - list& waiters) -{ - dout(10) << "adjust_dir_fragments " << basefrag << " " << bits - << " on " << *diri << dendl; - - // yuck. we may have discovered the inode while it was being fragmented. - if (!diri->dirfragtree.is_leaf(basefrag)) - diri->dirfragtree.force_to_leaf(basefrag); - - CDir *base = diri->get_or_open_dirfrag(this, basefrag); - - // adjust fragtree - diri->dirfragtree.split(basefrag, bits); - dout(10) << " new fragtree is " << diri->dirfragtree << dendl; - - if (bits > 0) { - if (base) { - CDir *baseparent = base->get_parent_dir(); - - base->split(bits, resultfrags, waiters); - - // did i change the subtree map? - if (base->is_subtree_root()) { - // am i a bound? - if (baseparent) { - CDir *parent = get_subtree_root(baseparent); - assert(subtrees[parent].count(base)); - subtrees[parent].erase(base); - for (list::iterator p = resultfrags.begin(); - p != resultfrags.end(); - ++p) { - subtrees[parent].insert(*p); - subtrees[*p].clear(); // new frag is now its own subtree - } - } - - // adjust my bounds. 
- set bounds; - bounds.swap(subtrees[base]); - subtrees.erase(base); - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *frag = get_subtree_root((*p)->get_parent_dir()); - subtrees[frag].insert(*p); - } - - show_subtrees(10); - } - } - } else { - assert(base); - base->merge(bits, waiters); - resultfrags.push_back(base); - assert(0); // FIXME adjust subtree map! and clean up this code, probably. - } -} - -class C_MDC_FragmentGo : public Context { - MDCache *mdcache; - CInode *diri; - list dirs; - frag_t basefrag; - int bits; -public: - C_MDC_FragmentGo(MDCache *m, CInode *di, list& dls, frag_t bf, int b) : - mdcache(m), diri(di), dirs(dls), basefrag(bf), bits(b) { } - virtual void finish(int r) { - mdcache->fragment_go(diri, dirs, basefrag, bits); - } -}; - -void MDCache::split_dir(CDir *dir, int bits) -{ - dout(7) << "split_dir " << *dir << " bits " << bits << dendl; - assert(dir->is_auth()); - - if (mds->mdsmap->is_degraded()) { - dout(7) << "cluster degraded, no fragmenting for now" << dendl; - return; - } - if (dir->inode->is_root()) { - dout(7) << "i won't fragment root" << dendl; - //assert(0); - return; - } - if (dir->state_test(CDir::STATE_FRAGMENTING)) { - dout(7) << "already fragmenting" << dendl; - return; - } - if (!dir->can_auth_pin()) { - dout(7) << "not authpinnable on " << *dir << dendl; - return; - } - - list startfrags; - startfrags.push_back(dir); - - dir->state_set(CDir::STATE_FRAGMENTING); - - fragment_freeze(dir->get_inode(), startfrags, dir->get_frag(), bits); - fragment_mark_and_complete(dir->get_inode(), startfrags, dir->get_frag(), bits); -} - -/* - * initial the freeze, blocking with an auth_pin. - * - * some reason(s) we have to freeze: - * - on merge, version/projected version are unified from all fragments; - * concurrent pipelined updates in the directory will have divergent - * versioning... and that's no good. 
- */ -void MDCache::fragment_freeze(CInode *diri, list& frags, frag_t basefrag, int bits) -{ - C_Gather *gather = new C_Gather(new C_MDC_FragmentGo(this, diri, frags, basefrag, bits)); - - // freeze the dirs - for (list::iterator p = frags.begin(); - p != frags.end(); - ++p) { - CDir *dir = *p; - dir->auth_pin(); // this will block the freeze - dir->freeze_dir(); - assert(dir->is_freezing_dir()); - dir->add_waiter(CDir::WAIT_FROZEN, gather->new_sub()); - } -} - -class C_MDC_FragmentMarking : public Context { - MDCache *mdcache; - CInode *diri; - list dirs; - frag_t basefrag; - int bits; -public: - C_MDC_FragmentMarking(MDCache *m, CInode *di, list& dls, frag_t bf, int b) : - mdcache(m), diri(di), dirs(dls), basefrag(bf), bits(b) { } - virtual void finish(int r) { - mdcache->fragment_mark_and_complete(diri, dirs, basefrag, bits); - } -}; - -void MDCache::fragment_mark_and_complete(CInode *diri, - list& startfrags, - frag_t basefrag, int bits) -{ - dout(10) << "fragment_mark_and_complete " << basefrag << " by " << bits - << " on " << *diri << dendl; - - C_Gather *gather = 0; - - for (list::iterator p = startfrags.begin(); - p != startfrags.end(); - ++p) { - CDir *dir = *p; - - if (!dir->is_complete()) { - dout(15) << " fetching incomplete " << *dir << dendl; - if (!gather) gather = new C_Gather(new C_MDC_FragmentMarking(this, diri, startfrags, basefrag, bits)); - dir->fetch(gather->new_sub(), - true); // ignore authpinnability - } - else if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) { - dout(15) << " marking " << *dir << dendl; - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - p->second->get(CDentry::PIN_FRAGMENTING); - p->second->state_set(CDentry::STATE_FRAGMENTING); - } - dir->state_set(CDir::STATE_DNPINNEDFRAG); - dir->auth_unpin(); // allow our freeze to complete - } - else { - dout(15) << " marked " << *dir << dendl; - } - } -} - - -class C_MDC_FragmentStored : public Context { - MDCache *mdcache; - CInode *diri; - 
frag_t basefrag; - int bits; - list resultfrags; -public: - C_MDC_FragmentStored(MDCache *m, CInode *di, frag_t bf, int b, - list& rf) : - mdcache(m), diri(di), basefrag(bf), bits(b), resultfrags(rf) { } - virtual void finish(int r) { - mdcache->fragment_stored(diri, basefrag, bits, resultfrags); - } -}; - -void MDCache::fragment_go(CInode *diri, list& startfrags, frag_t basefrag, int bits) -{ - dout(10) << "fragment_go " << basefrag << " by " << bits - << " on " << *diri << dendl; - - // refragment - list resultfrags; - list waiters; - adjust_dir_fragments(diri, basefrag, bits, resultfrags, waiters); - mds->queue_waiters(waiters); - - C_Gather *gather = new C_Gather(new C_MDC_FragmentStored(this, diri, basefrag, bits, resultfrags)); - - // freeze, store resulting frags - for (list::iterator p = resultfrags.begin(); - p != resultfrags.end(); - p++) { - CDir *dir = *p; - dout(10) << " result frag " << *dir << dendl; - dir->state_set(CDir::STATE_FRAGMENTING); - dir->commit(0, gather->new_sub()); - dir->_freeze_dir(); - } -} - -class C_MDC_FragmentLogged : public Context { - MDCache *mdcache; - CInode *diri; - frag_t basefrag; - int bits; - list resultfrags; - vector pvs; - LogSegment *ls; -public: - C_MDC_FragmentLogged(MDCache *m, CInode *di, frag_t bf, int b, - list& rf, vector& p, - LogSegment *s) : - mdcache(m), diri(di), basefrag(bf), bits(b), ls(s) { - resultfrags.swap(rf); - pvs.swap(p); - } - virtual void finish(int r) { - mdcache->fragment_logged(diri, basefrag, bits, - resultfrags, pvs, - ls); - } -}; - -void MDCache::fragment_stored(CInode *diri, frag_t basefrag, int bits, - list& resultfrags) -{ - dout(10) << "fragment_stored " << basefrag << " by " << bits - << " on " << *diri << dendl; - - EFragment *le = new EFragment(mds->mdlog, diri->ino(), basefrag, bits); - - set peers; - vector pvs; - for (list::iterator p = resultfrags.begin(); - p != resultfrags.end(); - p++) { - CDir *dir = *p; - dout(10) << " result frag " << *dir << dendl; - - if (p == 
resultfrags.begin()) { - le->metablob.add_dir_context(dir); - // note peers - // only do this once: all frags have identical replica_maps. - if (peers.empty()) - for (map::iterator p = dir->replica_map.begin(); - p != dir->replica_map.end(); - ++p) - peers.insert(p->first); - } - - pvs.push_back(dir->pre_dirty()); - le->metablob.add_dir(dir, true); - } - - mds->mdlog->submit_entry(le, - new C_MDC_FragmentLogged(this, diri, basefrag, bits, - resultfrags, pvs, mds->mdlog->get_current_segment())); - - // announcelist& resultfrags, - for (set::iterator p = peers.begin(); - p != peers.end(); - ++p) { - MMDSFragmentNotify *notify = new MMDSFragmentNotify(diri->ino(), basefrag, bits); - if (bits < 0) { - // freshly replicate basedir to peer on merge - CDir *base = resultfrags.front(); - CDirDiscover *basedis = base->replicate_to(*p); - basedis->_encode(notify->basebl); - delete basedis; - } - mds->send_message_mds(notify, *p, MDS_PORT_CACHE); - } - -} - -void MDCache::fragment_logged(CInode *diri, frag_t basefrag, int bits, - list& resultfrags, - vector& pvs, - LogSegment *ls) -{ - dout(10) << "fragment_logged " << basefrag << " bits " << bits - << " on " << *diri << dendl; - - - // dirty resulting frags - set peers; - vector::iterator pv = pvs.begin(); - for (list::iterator p = resultfrags.begin(); - p != resultfrags.end(); - p++) { - CDir *dir = *p; - dout(10) << " result frag " << *dir << dendl; - - // dirty, unpin, unfreeze - dir->state_clear(CDir::STATE_FRAGMENTING); - dir->mark_dirty(*pv, ls); - pv++; - - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - if (dn->state_test(CDentry::STATE_FRAGMENTING)) - dn->put(CDentry::PIN_FRAGMENTING); - } - - dir->unfreeze_dir(); - } -} - - - -void MDCache::handle_fragment_notify(MMDSFragmentNotify *notify) -{ - dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl; - - CInode *diri = get_inode(notify->get_ino()); - if (diri) { 
- list waiters; - - // add replica dir (for merge)? - // (adjust_dir_fragments expects base to already exist, if non-auth) - if (notify->get_bits() < 0) { - CDirDiscover basedis; - int off = 0; - basedis._decode(notify->basebl, off); - add_replica_dir(diri, notify->get_basefrag(), basedis, - notify->get_source().num(), waiters); - } - - // refragment - list resultfrags; - adjust_dir_fragments(diri, notify->get_basefrag(), notify->get_bits(), - resultfrags, waiters); - mds->queue_waiters(waiters); - } - - delete notify; -} - - - - - -// ============================================================== -// debug crap - -void MDCache::show_subtrees(int dbl) -{ - //dout(10) << "show_subtrees" << dendl; - - if (dbl > g_conf.debug && dbl > g_conf.debug_mds) - return; // i won't print anything. - - if (subtrees.empty()) { - dout(dbl) << "show_subtrees - no subtrees" << dendl; - return; - } - - // root frags - list basefrags; - for (set::iterator p = base_inodes.begin(); - p != base_inodes.end(); - ++p) - (*p)->get_dirfrags(basefrags); - //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl; - dout(15) << "show_subtrees" << dendl; - - // queue stuff - list > q; - string indent; - set seen; - - // calc max depth - for (list::iterator p = basefrags.begin(); p != basefrags.end(); ++p) - q.push_back(pair(*p, 0)); - - set subtrees_seen; - - int depth = 0; - while (!q.empty()) { - CDir *dir = q.front().first; - int d = q.front().second; - q.pop_front(); - - if (subtrees.count(dir) == 0) continue; - - subtrees_seen.insert(dir); - - if (d > depth) depth = d; - - // sanity check - //dout(25) << "saw depth " << d << " " << *dir << dendl; - if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << dendl; - assert(seen.count(dir) == 0); - seen.insert(dir); - - // nested items? 
- if (!subtrees[dir].empty()) { - for (set::iterator p = subtrees[dir].begin(); - p != subtrees[dir].end(); - ++p) { - //dout(25) << " saw sub " << **p << dendl; - q.push_front(pair(*p, d+1)); - } - } - } - - - // print tree - for (list::iterator p = basefrags.begin(); p != basefrags.end(); ++p) - q.push_back(pair(*p, 0)); - - while (!q.empty()) { - CDir *dir = q.front().first; - int d = q.front().second; - q.pop_front(); - - if (subtrees.count(dir) == 0) continue; - - // adjust indenter - while ((unsigned)d < indent.size()) - indent.resize(d); - - // pad - string pad = "______________________________________"; - pad.resize(depth*2+1-indent.size()); - if (!subtrees[dir].empty()) - pad[0] = '.'; // parent - - - string auth; - if (dir->is_auth()) - auth = "auth "; - else - auth = " rep "; - - char s[10]; - if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN) - sprintf(s, "%2d ", dir->get_dir_auth().first); - else - sprintf(s, "%2d,%2d", dir->get_dir_auth().first, dir->get_dir_auth().second); - - // print - dout(dbl) << indent << "|_" << pad << s << " " << auth << *dir << dendl; - - if (dir->ino() == MDS_INO_ROOT) - assert(dir->inode == root); - if (dir->ino() == MDS_INO_STRAY(mds->get_nodeid())) - assert(dir->inode == stray); - - // nested items? - if (!subtrees[dir].empty()) { - // more at my level? 
- if (!q.empty() && q.front().second == d) - indent += "| "; - else - indent += " "; - - for (set::iterator p = subtrees[dir].begin(); - p != subtrees[dir].end(); - ++p) - q.push_front(pair(*p, d+2)); - } - } - - // verify there isn't stray crap in subtree map - int lost = 0; - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - if (subtrees_seen.count(p->first)) continue; - dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl; - lost++; - } - assert(lost == 0); -} - - -void MDCache::show_cache() -{ - dout(7) << "show_cache" << dendl; - - for (hash_map::iterator it = inode_map.begin(); - it != inode_map.end(); - it++) { - // unlinked? - if (!it->second->parent) - dout(7) << " unlinked " << *it->second << dendl; - - // dirfrags? - list dfs; - it->second->get_dirfrags(dfs); - for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) { - CDir *dir = *p; - dout(7) << " dirfrag " << *dir << dendl; - - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - dout(7) << " dentry " << *dn << dendl; - if (dn->is_primary() && dn->inode) - dout(7) << " inode " << *dn->inode << dendl; - } - } - } -} - - -void MDCache::dump_cache() -{ - if (g_conf.debug_mds < 2) return; - - char fn[20]; - sprintf(fn, "cachedump.%d.mds%d", mds->mdsmap->get_epoch(), mds->get_nodeid()); - - dout(1) << "dump_cache to " << fn << dendl; - - ofstream myfile; - myfile.open(fn); - - for (hash_map::iterator it = inode_map.begin(); - it != inode_map.end(); - it++) { - list dfs; - it->second->get_dirfrags(dfs); - for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) { - CDir *dir = *p; - myfile << *dir->inode << std::endl; - myfile << *dir << std::endl; - - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - myfile << *dn << std::endl; - } - } - } - - myfile.close(); -} diff --git a/branches/sage/mds/mds/MDLog.cc 
b/branches/sage/mds/mds/MDLog.cc deleted file mode 100644 index 52c50ff82a4fc..0000000000000 --- a/branches/sage/mds/mds/MDLog.cc +++ /dev/null @@ -1,511 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "MDLog.h" -#include "MDS.h" -#include "MDCache.h" -#include "LogEvent.h" - -#include "osdc/Journaler.h" - -#include "common/LogType.h" -#include "common/Logger.h" - -#include "events/ESubtreeMap.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".log " -#define derr(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) *_derr << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".log " - -// cons/des - -LogType mdlog_logtype; - - -MDLog::~MDLog() -{ - if (journaler) { delete journaler; journaler = 0; } - if (logger) { delete logger; logger = 0; } -} - - -void MDLog::reopen_logger(utime_t start, bool append) -{ - // logger - char name[80]; - sprintf(name, "mds%d.log", mds->get_nodeid()); - logger = new Logger(name, &mdlog_logtype, append); - logger->set_start(start); - - static bool didit = false; - if (!didit) { - didit = true; - mdlog_logtype.add_inc("evadd"); - mdlog_logtype.add_inc("evex"); - mdlog_logtype.add_inc("evtrm"); - mdlog_logtype.add_set("ev"); - mdlog_logtype.add_set("evexg"); - mdlog_logtype.add_set("evexd"); - - mdlog_logtype.add_inc("segadd"); - mdlog_logtype.add_inc("segex"); - mdlog_logtype.add_inc("segtrm"); - mdlog_logtype.add_set("seg"); - mdlog_logtype.add_set("segexg"); - mdlog_logtype.add_set("segexd"); - - 
mdlog_logtype.add_set("expos"); - mdlog_logtype.add_set("wrpos"); - - mdlog_logtype.add_avg("jlat"); - } - -} - -void MDLog::init_journaler() -{ - // inode - memset(&log_inode, 0, sizeof(log_inode)); - log_inode.ino = MDS_INO_LOG_OFFSET + mds->get_nodeid(); - log_inode.layout = g_OSD_MDLogLayout; - - if (g_conf.mds_local_osd) - log_inode.layout.fl_pg_preferred = mds->get_nodeid() + g_conf.mds_local_osd_offset; // hack - - // log streamer - if (journaler) delete journaler; - journaler = new Journaler(log_inode, mds->objecter, logger, &mds->mds_lock); -} - -void MDLog::write_head(Context *c) -{ - journaler->write_head(c); -} - -off_t MDLog::get_read_pos() -{ - return journaler->get_read_pos(); -} - -off_t MDLog::get_write_pos() -{ - return journaler->get_write_pos(); -} - - - -void MDLog::create(Context *c) -{ - dout(5) << "create empty log" << dendl; - init_journaler(); - journaler->reset(); - write_head(c); - - logger->set("expos", journaler->get_expire_pos()); - logger->set("wrpos", journaler->get_write_pos()); -} - -void MDLog::open(Context *c) -{ - dout(5) << "open discovering log bounds" << dendl; - init_journaler(); - journaler->recover(c); - - // either append() or replay() will follow. -} - -void MDLog::append() -{ - dout(5) << "append positioning at end" << dendl; - journaler->set_read_pos(journaler->get_write_pos()); - journaler->set_expire_pos(journaler->get_write_pos()); - - logger->set("expos", journaler->get_write_pos()); -} - - - -// ------------------------------------------------- - -void MDLog::submit_entry( LogEvent *le, Context *c ) -{ - if (!g_conf.mds_log) { - // hack: log is disabled. 
- if (c) { - c->finish(0); - delete c; - } - return; - } - - dout(5) << "submit_entry " << journaler->get_write_pos() << " : " << *le << dendl; - - // let the event register itself in the segment - assert(!segments.empty()); - le->_segment = segments.rbegin()->second; - le->_segment->num_events++; - le->update_segment(); - - num_events++; - assert(!capped); - - // encode it, with event type - { - bufferlist bl; - ::_encode(le->_type, bl); - le->encode_payload(bl); - - // journal it. - journaler->append_entry(bl); // bl is destroyed. - } - - le->_segment->end = journaler->get_write_pos(); - - delete le; - - if (logger) { - logger->inc("evadd"); - logger->set("ev", num_events); - logger->set("wrpos", journaler->get_write_pos()); - } - - if (c) { - unflushed = 0; - journaler->flush(c); - } - else - unflushed++; - - // start a new segment? - // FIXME: should this go elsewhere? - off_t last_seg = get_last_segment_offset(); - if (!segments.empty() && - !writing_subtree_map && - (journaler->get_write_pos() / ceph_file_layout_period(log_inode.layout) != (last_seg / ceph_file_layout_period(log_inode.layout)) && - (journaler->get_write_pos() - last_seg > ceph_file_layout_period(log_inode.layout)/2))) { - dout(10) << "submit_entry also starting new segment: last = " << last_seg - << ", cur pos = " << journaler->get_write_pos() << dendl; - start_new_segment(); - } -} - -void MDLog::wait_for_sync( Context *c ) -{ - if (g_conf.mds_log) { - // wait - journaler->flush(c); - } else { - // hack: bypass. 
- c->finish(0); - delete c; - } -} - -void MDLog::flush() -{ - if (unflushed) - journaler->flush(); - unflushed = 0; - - // trim - trim(); -} - -void MDLog::cap() -{ - dout(5) << "cap" << dendl; - capped = true; -} - - -// ----------------------------- -// segments - -void MDLog::start_new_segment(Context *onsync) -{ - dout(7) << "start_new_segment at " << journaler->get_write_pos() << dendl; - assert(!writing_subtree_map); - - segments[journaler->get_write_pos()] = new LogSegment(journaler->get_write_pos()); - - writing_subtree_map = true; - - ESubtreeMap *le = mds->mdcache->create_subtree_map(); - submit_entry(le, new C_MDL_WroteSubtreeMap(this, mds->mdlog->get_write_pos())); - if (onsync) - wait_for_sync(onsync); - - logger->inc("segadd"); - logger->set("seg", segments.size()); -} - -void MDLog::_logged_subtree_map(off_t off) -{ - dout(10) << "_logged_subtree_map at " << off << dendl; - writing_subtree_map = false; - - /* - list ls; - take_subtree_map_expire_waiters(ls); - mds->queue_waiters(ls); - */ -} - - - -void MDLog::trim() -{ - // trim! 
- dout(10) << "trim " - << segments.size() << " / " << max_segments << " segments, " - << num_events << " / " << max_events << " events" - << ", " << expiring_segments.size() << " (" << expiring_events << ") expiring" - << ", " << expired_segments.size() << " (" << expired_events << ") expired" - << dendl; - - if (segments.empty()) return; - - // hack: only trim for a few seconds at a time - utime_t stop = g_clock.now(); - stop += 2.0; - - map::iterator p = segments.begin(); - int left = num_events; - while (p != segments.end() && - ((max_events >= 0 && left-expiring_events-expired_events > max_events) || - (max_segments >= 0 && (int)(segments.size()-expiring_segments.size()-expired_segments.size()) > max_segments))) { - - if (stop < g_clock.now()) - break; - - if ((int)expiring_segments.size() >= g_conf.mds_log_max_expiring) - break; - - // look at first segment - LogSegment *ls = p->second; - assert(ls); - - p++; - - left -= ls->num_events; - - if (expiring_segments.count(ls)) { - dout(5) << "trim already expiring segment " << ls->offset << ", " << ls->num_events << " events" << dendl; - } else if (expired_segments.count(ls)) { - dout(5) << "trim already expired segment " << ls->offset << ", " << ls->num_events << " events" << dendl; - } else { - try_expire(ls); - } - } -} - - -void MDLog::try_expire(LogSegment *ls) -{ - C_Gather *exp = ls->try_to_expire(mds); - if (exp) { - assert(expiring_segments.count(ls) == 0); - expiring_segments.insert(ls); - expiring_events += ls->num_events; - dout(5) << "try_expire expiring segment " << ls->offset << dendl; - exp->set_finisher(new C_MaybeExpiredSegment(this, ls)); - } else { - dout(10) << "try_expire expired segment " << ls->offset << dendl; - _expired(ls); - } - - logger->set("segexg", expiring_segments.size()); - logger->set("evexg", expiring_events); -} - -void MDLog::_maybe_expired(LogSegment *ls) -{ - dout(10) << "_maybe_expired segment " << ls->offset << " " << ls->num_events << " events" << dendl; - 
assert(expiring_segments.count(ls)); - expiring_segments.erase(ls); - expiring_events -= ls->num_events; - try_expire(ls); -} - -void MDLog::_expired(LogSegment *ls) -{ - dout(5) << "_expired segment " << ls->offset << " " << ls->num_events << " events" << dendl; - - if (!capped && ls == get_current_segment()) { - dout(5) << "_expired not expiring " << ls->offset << ", last one and !capped" << dendl; - } else if (ls->end > journaler->get_write_ack_pos()) { - dout(5) << "_expired not expiring " << ls->offset << ", not fully flushed yet, ack " - << journaler->get_write_ack_pos() << " < end " << ls->end << dendl; - } else { - // expired. - expired_segments.insert(ls); - expired_events += ls->num_events; - - logger->inc("evex", ls->num_events); - logger->inc("segex"); - - // trim expired segments? - while (!segments.empty()) { - ls = segments.begin()->second; - if (!expired_segments.count(ls)) break; - - expired_events -= ls->num_events; - expired_segments.erase(ls); - num_events -= ls->num_events; - - journaler->set_expire_pos(ls->offset); // this was the oldest segment, adjust expire pos - journaler->write_head(0); - - logger->set("expos", ls->offset); - logger->inc("segtrm"); - logger->inc("evtrm", ls->num_events); - - segments.erase(ls->offset); - delete ls; - } - } - - logger->set("ev", num_events); - logger->set("evexd", expired_events); - logger->set("seg", segments.size()); - logger->set("segexd", expired_segments.size()); -} - - - -void MDLog::replay(Context *c) -{ - assert(journaler->is_active()); - - // start reading at the last known expire point. - journaler->set_read_pos( journaler->get_expire_pos() ); - - // empty? - if (journaler->get_read_pos() == journaler->get_write_pos()) { - dout(10) << "replay - journal empty, done." << dendl; - if (c) { - c->finish(0); - delete c; - } - return; - } - - // add waiter - if (c) - waitfor_replay.push_back(c); - - // go! 
- dout(10) << "replay start, from " << journaler->get_read_pos() - << " to " << journaler->get_write_pos() << dendl; - - assert(num_events == 0); - - replay_thread.create(); -} - -class C_MDL_Replay : public Context { - MDLog *mdlog; -public: - C_MDL_Replay(MDLog *l) : mdlog(l) {} - void finish(int r) { - mdlog->replay_cond.Signal(); - } -}; - - - -// i am a separate thread -void MDLog::_replay_thread() -{ - mds->mds_lock.Lock(); - dout(10) << "_replay_thread start" << dendl; - - // loop - off_t new_expire_pos = journaler->get_expire_pos(); - while (1) { - // wait for read? - while (!journaler->is_readable() && - journaler->get_read_pos() < journaler->get_write_pos()) { - journaler->wait_for_readable(new C_MDL_Replay(this)); - replay_cond.Wait(mds->mds_lock); - } - - if (!journaler->is_readable() && - journaler->get_read_pos() == journaler->get_write_pos()) - break; - - assert(journaler->is_readable()); - - // read it - off_t pos = journaler->get_read_pos(); - bufferlist bl; - bool r = journaler->try_read_entry(bl); - assert(r); - - // unpack event - LogEvent *le = LogEvent::decode(bl); - - // new segment? - if (le->get_type() == EVENT_SUBTREEMAP) { - segments[pos] = new LogSegment(pos); - logger->set("seg", segments.size()); - } - - // have we seen an import map yet? - if (segments.empty()) { - dout(10) << "_replay " << pos << " / " << journaler->get_write_pos() - << " -- waiting for subtree_map. (skipping " << *le << ")" << dendl; - } else { - dout(10) << "_replay " << pos << " / " << journaler->get_write_pos() - << " : " << *le << dendl; - le->_segment = get_current_segment(); // replay may need this - le->_segment->num_events++; - le->_segment->end = journaler->get_read_pos(); - num_events++; - - le->replay(mds); - - if (!new_expire_pos) - new_expire_pos = pos; - } - delete le; - - logger->set("rdpos", pos); - - // drop lock for a second, so other events/messages (e.g. beacon timer!) can go off - mds->mds_lock.Unlock(); - mds->mds_lock.Lock(); - } - - // done! 
- assert(journaler->get_read_pos() == journaler->get_write_pos()); - dout(10) << "_replay - complete, " << num_events << " events, new read/expire pos is " << new_expire_pos << dendl; - - // move read pointer _back_ to first subtree map we saw, for eventual trimming - journaler->set_read_pos(new_expire_pos); - journaler->set_expire_pos(new_expire_pos); - logger->set("expos", new_expire_pos); - - // kick waiter(s) - list ls; - ls.swap(waitfor_replay); - finish_contexts(ls,0); - - dout(10) << "_replay_thread finish" << dendl; - mds->mds_lock.Unlock(); -} - - - diff --git a/branches/sage/mds/mds/MDS.cc b/branches/sage/mds/mds/MDS.cc deleted file mode 100644 index 3a759518b3c47..0000000000000 --- a/branches/sage/mds/mds/MDS.cc +++ /dev/null @@ -1,1239 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "include/types.h" -#include "common/Clock.h" - -#include "msg/Messenger.h" - -#include "osd/OSDMap.h" -#include "osdc/Objecter.h" -#include "osdc/Filer.h" - -#include "MDSMap.h" - -#include "MDS.h" -#include "Server.h" -#include "Locker.h" -#include "MDCache.h" -#include "MDLog.h" -#include "MDBalancer.h" -#include "IdAllocator.h" -#include "Migrator.h" -//#include "Renamer.h" - -#include "AnchorTable.h" -#include "AnchorClient.h" - -#include "common/Logger.h" -#include "common/LogType.h" - -#include "common/Timer.h" - -#include "events/ESession.h" - -#include "messages/MMDSMap.h" -#include "messages/MMDSBeacon.h" - -#include "messages/MPing.h" -#include "messages/MPingAck.h" -#include "messages/MGenericMessage.h" - -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" - -#include "messages/MClientRequest.h" -#include "messages/MClientRequestForward.h" - - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " mds" << whoami << " " -#define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) *_derr << dbeginl << g_clock.now() << " mds" << whoami << " " - - - - - -// cons/des -MDS::MDS(int whoami, Messenger *m, MonMap *mm) : - timer(mds_lock), - clientmap(this) { - - this->whoami = whoami; - - last_tid = 0; - - monmap = mm; - messenger = m; - - mdsmap = new MDSMap; - osdmap = new OSDMap; - - objecter = new Objecter(messenger, monmap, osdmap, mds_lock); - filer = new Filer(objecter); - - mdcache = new MDCache(this); - mdlog = new MDLog(this); - balancer = new MDBalancer(this); - - anchorclient = new AnchorClient(this); - idalloc = new IdAllocator(this); - - anchortable = new AnchorTable(this); - - server = new Server(this); - locker = new Locker(this, mdcache); - - // clients - last_client_mdsmap_bcast = 0; - - // beacon - beacon_last_seq = 0; - beacon_sender = 0; - beacon_killer = 0; - - // tick - tick_event = 0; - - req_rate = 0; - - want_state = state = 
MDSMap::STATE_DNE; - - logger = logger2 = 0; - - // i'm ready! - messenger->set_dispatcher(this); -} - -MDS::~MDS() { - Mutex::Locker lock(mds_lock); - - if (mdcache) { delete mdcache; mdcache = NULL; } - if (mdlog) { delete mdlog; mdlog = NULL; } - if (balancer) { delete balancer; balancer = NULL; } - if (idalloc) { delete idalloc; idalloc = NULL; } - if (anchortable) { delete anchortable; anchortable = NULL; } - if (anchorclient) { delete anchorclient; anchorclient = NULL; } - if (osdmap) { delete osdmap; osdmap = 0; } - if (mdsmap) { delete mdsmap; mdsmap = 0; } - - if (server) { delete server; server = 0; } - if (locker) { delete locker; locker = 0; } - - if (filer) { delete filer; filer = 0; } - if (objecter) { delete objecter; objecter = 0; } - if (messenger) { delete messenger; messenger = NULL; } - - if (logger) { delete logger; logger = 0; } - if (logger2) { delete logger2; logger2 = 0; } - -} - - -void MDS::reopen_logger(utime_t start) -{ - static LogType mds_logtype, mds_cache_logtype; - static bool didit = false; - if (!didit) { - didit = true; - - //mds_logtype.add_inc("req"); - mds_logtype.add_inc("reply"); - mds_logtype.add_inc("fw"); - - mds_logtype.add_inc("dir_f"); - mds_logtype.add_inc("dir_c"); - //mds_logtype.add_inc("mkdir"); - - /* - mds_logtype.add_inc("newin"); // new inodes (pre)loaded - mds_logtype.add_inc("newt"); // inodes first touched/used - mds_logtype.add_inc("outt"); // trimmed touched - mds_logtype.add_inc("outut"); // trimmed untouched (wasted effort) - mds_logtype.add_avg("oututl"); // avg trim latency for untouched - - mds_logtype.add_inc("dirt1"); - mds_logtype.add_inc("dirt2"); - mds_logtype.add_inc("dirt3"); - mds_logtype.add_inc("dirt4"); - mds_logtype.add_inc("dirt5"); - */ - - mds_logtype.add_set("c"); - mds_logtype.add_set("ctop"); - mds_logtype.add_set("cbot"); - mds_logtype.add_set("cptail"); - mds_logtype.add_set("cpin"); - mds_logtype.add_inc("cex"); - mds_logtype.add_inc("dis"); - - mds_logtype.add_inc("t"); - 
mds_logtype.add_inc("thit"); - mds_logtype.add_inc("tfw"); - mds_logtype.add_inc("tdis"); - mds_logtype.add_inc("tdirf"); - mds_logtype.add_inc("trino"); - mds_logtype.add_inc("tlock"); - - mds_logtype.add_set("l"); - mds_logtype.add_set("q"); - mds_logtype.add_set("popanyd"); - mds_logtype.add_set("popnest"); - - mds_logtype.add_set("buf"); - - mds_logtype.add_set("sm"); - mds_logtype.add_inc("ex"); - mds_logtype.add_inc("iex"); - mds_logtype.add_inc("im"); - mds_logtype.add_inc("iim"); - /* - mds_logtype.add_inc("imex"); - mds_logtype.add_set("nex"); - mds_logtype.add_set("nim"); - */ - - mds_logtype.add_avg("replyl"); - - } - - if (whoami < 0) return; - - // flush+close old log - if (logger) delete logger; - if (logger2) delete logger2; - - // log - char name[80]; - sprintf(name, "mds%d", whoami); - - bool append = mdsmap->get_inc(whoami) > 1; - - logger = new Logger(name, (LogType*)&mds_logtype, append); - logger->set_start(start); - - char n[80]; - sprintf(n, "mds%d.cache", whoami); - logger2 = new Logger(n, (LogType*)&mds_cache_logtype, append); - logger2->set_start(start); - - mdlog->reopen_logger(start, append); - server->reopen_logger(start, append); -} - -void MDS::send_message_mds(Message *m, int mds, int port, int fromport) -{ - // send mdsmap first? - if (peer_mdsmap_epoch[mds] < mdsmap->get_epoch()) { - messenger->send_message(new MMDSMap(mdsmap), - mdsmap->get_inst(mds)); - peer_mdsmap_epoch[mds] = mdsmap->get_epoch(); - } - - // send message - if (port && !fromport) - fromport = port; - messenger->send_message(m, mdsmap->get_inst(mds), port, fromport); -} - -void MDS::forward_message_mds(Message *req, int mds, int port) -{ - // client request? 
- if (req->get_type() == MSG_CLIENT_REQUEST && - ((MClientRequest*)req)->get_client_inst().name.is_client()) { - MClientRequest *creq = (MClientRequest*)req; - creq->inc_num_fwd(); // inc forward counter - - // tell the client where it should go - messenger->send_message(new MClientRequestForward(creq->get_tid(), mds, creq->get_num_fwd()), - creq->get_client_inst()); - - if (!creq->is_idempotent()) { - /* don't actually forward if non-idempotent! - * client has to do it. although the MDS will ignore duplicate requests, - * the affected metadata may migrate, in which case the new authority - * won't have the metareq_id in the completed request map. - */ - delete req; - return; - } - } - - // forward - send_message_mds(req, mds, port); -} - - - -void MDS::send_message_client(Message *m, int client) -{ - version_t seq = clientmap.inc_push_seq(client); - dout(10) << "send_message_client client" << client << " seq " << seq << " " << *m << dendl; - messenger->send_message(m, clientmap.get_inst(client)); -} - -void MDS::send_message_client(Message *m, entity_inst_t clientinst) -{ - version_t seq = clientmap.inc_push_seq(clientinst.name.num()); - dout(10) << "send_message_client client" << clientinst.name.num() << " seq " << seq << " " << *m << dendl; - messenger->send_message(m, clientinst); -} - - - -int MDS::init(bool standby) -{ - mds_lock.Lock(); - - objecter->init(); - - want_state = MDSMap::STATE_BOOT; - - // starting beacon. 
this will induce an MDSMap from the monitor - beacon_start(); - - // schedule tick - reset_tick(); - - mds_lock.Unlock(); - return 0; -} - -void MDS::reset_tick() -{ - // cancel old - if (tick_event) timer.cancel_event(tick_event); - - // schedule - tick_event = new C_MDS_Tick(this); - timer.add_event_after(g_conf.mon_tick_interval, tick_event); -} - -void MDS::tick() -{ - tick_event = 0; - - // reschedule - reset_tick(); - - // log - mds_load_t load = balancer->get_load(); - - if (logger) { - req_rate = logger->get("req"); - - logger->fset("l", (int)load.mds_load()); - logger->set("q", messenger->get_dispatch_queue_len()); - logger->set("buf", buffer_total_alloc); - logger->set("sm", mdcache->num_subtrees()); - - mdcache->log_stat(logger); - } - - if (is_active() || is_stopping()) - locker->scatter_unscatter_autoscattered(); - - // booted? - if (is_active()) { - - // balancer - balancer->tick(); - - } -} - - - - -// ----------------------- -// beacons - -void MDS::beacon_start() -{ - beacon_send(); // send first beacon - - //reset_beacon_killer(); // schedule killer -} - - - -void MDS::beacon_send() -{ - ++beacon_last_seq; - dout(10) << "beacon_send " << MDSMap::get_state_name(want_state) - << " seq " << beacon_last_seq - << " (currently " << MDSMap::get_state_name(state) << ")" - << dendl; - - beacon_seq_stamp[beacon_last_seq] = g_clock.now(); - - int mon = monmap->pick_mon(); - messenger->send_message(new MMDSBeacon(messenger->get_myinst(), mdsmap->get_epoch(), - want_state, beacon_last_seq), - monmap->get_inst(mon)); - - // schedule next sender - if (beacon_sender) timer.cancel_event(beacon_sender); - beacon_sender = new C_MDS_BeaconSender(this); - timer.add_event_after(g_conf.mds_beacon_interval, beacon_sender); -} - -void MDS::handle_mds_beacon(MMDSBeacon *m) -{ - dout(10) << "handle_mds_beacon " << MDSMap::get_state_name(m->get_state()) - << " seq " << m->get_seq() << dendl; - version_t seq = m->get_seq(); - - // update lab - if (beacon_seq_stamp.count(seq)) 
{ - assert(beacon_seq_stamp[seq] > beacon_last_acked_stamp); - beacon_last_acked_stamp = beacon_seq_stamp[seq]; - - // clean up seq_stamp map - while (!beacon_seq_stamp.empty() && - beacon_seq_stamp.begin()->first <= seq) - beacon_seq_stamp.erase(beacon_seq_stamp.begin()); - - reset_beacon_killer(); - } - - delete m; -} - -void MDS::reset_beacon_killer() -{ - utime_t when = beacon_last_acked_stamp; - when += g_conf.mds_beacon_grace; - - dout(15) << "reset_beacon_killer last_acked_stamp at " << beacon_last_acked_stamp - << ", will die at " << when << dendl; - - if (beacon_killer) timer.cancel_event(beacon_killer); - - beacon_killer = new C_MDS_BeaconKiller(this, beacon_last_acked_stamp); - timer.add_event_at(when, beacon_killer); -} - -void MDS::beacon_kill(utime_t lab) -{ - if (lab == beacon_last_acked_stamp) { - dout(0) << "beacon_kill last_acked_stamp " << lab - << ", killing myself." - << dendl; - suicide(); - } else { - dout(20) << "beacon_kill last_acked_stamp " << beacon_last_acked_stamp - << " != my " << lab - << ", doing nothing." - << dendl; - } -} - - - -void MDS::handle_mds_map(MMDSMap *m) -{ - version_t hadepoch = mdsmap->get_epoch(); - version_t epoch = m->get_epoch(); - dout(5) << "handle_mds_map epoch " << epoch << " from " << m->get_source() << dendl; - - // note source's map version - if (m->get_source().is_mds() && - peer_mdsmap_epoch[m->get_source().num()] < epoch) { - dout(15) << " peer " << m->get_source() - << " has mdsmap epoch >= " << epoch - << dendl; - peer_mdsmap_epoch[m->get_source().num()] = epoch; - } - - // is it new? 
- if (epoch <= mdsmap->get_epoch()) { - dout(5) << " old map epoch " << epoch << " <= " << mdsmap->get_epoch() - << ", discarding" << dendl; - delete m; - return; - } - - // keep old map, for a moment - MDSMap *oldmap = mdsmap; - int oldwhoami = whoami; - int oldstate = state; - - // decode and process - mdsmap = new MDSMap; - mdsmap->decode(m->get_encoded()); - - // see who i am - whoami = mdsmap->get_addr_rank(messenger->get_myaddr()); - if (whoami < 0) { - dout(1) << "handle_mds_map i'm not in the mdsmap, killing myself" << dendl; - suicide(); - return; - } - - // open logger? - // note that fakesyn/newsyn starts knowing who they are - if (whoami >= 0 && - mdsmap->is_up(whoami) && !mdsmap->is_standby(whoami) && - (oldwhoami != whoami || !logger)) - reopen_logger(mdsmap->get_create()); // adopt mds cluster timeline - - if (oldwhoami != whoami) { - // update messenger. - dout(1) << "handle_mds_map i am now mds" << whoami - << " incarnation " << mdsmap->get_inc(whoami) - << dendl; - messenger->reset_myname(entity_name_t::MDS(whoami)); - - // do i need an osdmap? - if (oldwhoami < 0) { - // we need an osdmap too. - int mon = monmap->pick_mon(); - messenger->send_message(new MOSDGetMap(0), - monmap->get_inst(mon)); - } - } - - // tell objecter my incarnation - if (objecter->get_client_incarnation() < 0 && - mdsmap->have_inst(whoami)) { - assert(mdsmap->get_inc(whoami) > 0); - objecter->set_client_incarnation(mdsmap->get_inc(whoami)); - } - - // for debug - if (g_conf.mds_dump_cache_on_map) - mdcache->dump_cache(); - - // update my state - state = mdsmap->get_state(whoami); - - // did it change? - if (oldstate != state) { - if (state == want_state) { - dout(1) << "handle_mds_map new state " << mdsmap->get_state_name(state) << dendl; - } else { - dout(1) << "handle_mds_map new state " << mdsmap->get_state_name(state) - // << ", although i wanted " << mdsmap->get_state_name(want_state) - << dendl; - want_state = state; - } - - // now active? 
- if (is_active()) { - // did i just recover? - if (oldstate == MDSMap::STATE_REJOIN || - oldstate == MDSMap::STATE_RECONNECT) - recovery_done(); - finish_contexts(waiting_for_active); // kick waiters - } else if (is_replay()) { - replay_start(); - } else if (is_resolve()) { - resolve_start(); - } else if (is_reconnect()) { - reconnect_start(); - } else if (is_stopping()) { - assert(oldstate == MDSMap::STATE_ACTIVE); - stopping_start(); - } else if (is_stopped()) { - assert(oldstate == MDSMap::STATE_STOPPING); - suicide(); - return; - } - } - - - // RESOLVE - // is someone else newly resolving? - if (is_resolve() || is_rejoin() || is_active() || is_stopping()) { - set oldresolve, resolve; - oldmap->get_mds_set(oldresolve, MDSMap::STATE_RESOLVE); - mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE); - if (oldresolve != resolve) { - dout(10) << "resolve set is " << resolve << ", was " << oldresolve << dendl; - for (set::iterator p = resolve.begin(); p != resolve.end(); ++p) - if (*p != whoami && - oldresolve.count(*p) == 0) - mdcache->send_resolve(*p); // now or later. - } - } - - // REJOIN - // is everybody finally rejoining? - if (is_rejoin() || is_active() || is_stopping()) { - // did we start? - if (!oldmap->is_rejoining() && mdsmap->is_rejoining()) - rejoin_joint_start(); - - // did we finish? - if (g_conf.mds_dump_cache_after_rejoin && - oldmap->is_rejoining() && !mdsmap->is_rejoining()) - mdcache->dump_cache(); // for DEBUG only - } - if (oldmap->is_degraded() && !mdsmap->is_degraded() && state >= MDSMap::STATE_ACTIVE) - dout(1) << "cluster recovered." << dendl; - - // did someone go active? - if (is_active() || is_stopping()) { - set oldactive, active; - oldmap->get_mds_set(oldactive, MDSMap::STATE_ACTIVE); - mdsmap->get_mds_set(active, MDSMap::STATE_ACTIVE); - for (set::iterator p = active.begin(); p != active.end(); ++p) - if (*p != whoami && // not me - oldactive.count(*p) == 0) // newly so? - handle_mds_recovery(*p); - } - - // did someone fail or stop? 
- if (is_active() || is_stopping()) { - // new failed? - set oldfailed, failed; - oldmap->get_mds_set(oldfailed, MDSMap::STATE_FAILED); - mdsmap->get_mds_set(failed, MDSMap::STATE_FAILED); - for (set::iterator p = failed.begin(); p != failed.end(); ++p) - if (oldfailed.count(*p) == 0) - mdcache->handle_mds_failure(*p); - - // or down then up? - // did their addr/inst change? - set up; - mdsmap->get_up_mds_set(up); - for (set::iterator p = up.begin(); p != up.end(); ++p) - if (oldmap->have_inst(*p) && - oldmap->get_inst(*p) != mdsmap->get_inst(*p)) - mdcache->handle_mds_failure(*p); - - // did anyone stop? - set oldstopped, stopped; - oldmap->get_mds_set(oldstopped, MDSMap::STATE_STOPPED); - mdsmap->get_mds_set(stopped, MDSMap::STATE_STOPPED); - for (set::iterator p = stopped.begin(); p != stopped.end(); ++p) - if (oldstopped.count(*p) == 0) // newly so? - mdcache->migrator->handle_mds_failure_or_stop(*p); - } - - // just got mdsmap+osdmap? - if (hadepoch == 0 && - mdsmap->get_epoch() > 0 && - osdmap->get_epoch() > 0) { - boot(); - } else if (want_state != state) { - // resend beacon. - beacon_send(); - } - - delete m; - delete oldmap; -} - -void MDS::bcast_mds_map() -{ - dout(7) << "bcast_mds_map " << mdsmap->get_epoch() << dendl; - - // share the map with mounted clients - for (set::const_iterator p = clientmap.get_session_set().begin(); - p != clientmap.get_session_set().end(); - ++p) { - messenger->send_message(new MMDSMap(mdsmap), - clientmap.get_inst(*p)); - } - last_client_mdsmap_bcast = mdsmap->get_epoch(); -} - - -void MDS::handle_osd_map(MOSDMap *m) -{ - version_t hadepoch = osdmap->get_epoch(); - dout(10) << "handle_osd_map had " << hadepoch << dendl; - - // process - objecter->handle_osd_map(m); - - // just got mdsmap+osdmap? 
- if (hadepoch == 0 && - osdmap->get_epoch() > 0 && - mdsmap->get_epoch() > 0) - boot(); -} - - -void MDS::set_want_state(int s) -{ - dout(3) << "set_want_state " << MDSMap::get_state_name(s) << dendl; - want_state = s; - beacon_send(); -} - -void MDS::boot() -{ - if (is_creating()) - boot_create(); // new tables, journal - else if (is_starting() || is_replay()) - boot_start(); // start|replay, join - else - assert(is_standby()); -} - - -class C_MDS_CreateFinish : public Context { - MDS *mds; -public: - C_MDS_CreateFinish(MDS *m) : mds(m) {} - void finish(int r) { mds->creating_done(); } -}; - -void MDS::boot_create() -{ - dout(3) << "boot_create" << dendl; - - C_Gather *fin = new C_Gather(new C_MDS_CreateFinish(this)); - - CDir *rootdir = 0; - if (whoami == 0) { - dout(3) << "boot_create since i am also mds0, creating root inode and dir" << dendl; - - // create root inode. - mdcache->open_root(0); - CInode *root = mdcache->get_root(); - assert(root); - - // force empty root dir - rootdir = root->get_dirfrag(frag_t()); - rootdir->mark_complete(); - } - - // create my stray dir - CDir *straydir; - { - dout(10) << "boot_create creating local stray dir" << dendl; - mdcache->open_local_stray(); - CInode *stray = mdcache->get_stray(); - straydir = stray->get_dirfrag(frag_t()); - straydir->mark_complete(); - } - - // start with a fresh journal - dout(10) << "boot_create creating fresh journal" << dendl; - mdlog->create(fin->new_sub()); - - // write our first subtreemap - mdlog->start_new_segment(fin->new_sub()); - - // dirty, commit (root and) stray dir(s) - if (whoami == 0) { - rootdir->mark_dirty(rootdir->pre_dirty(), mdlog->get_current_segment()); - rootdir->commit(0, fin->new_sub()); - } - straydir->mark_dirty(straydir->pre_dirty(), mdlog->get_current_segment()); - straydir->commit(0, fin->new_sub()); - - // fixme: fake out idalloc (reset, pretend loaded) - dout(10) << "boot_create creating fresh idalloc table" << dendl; - idalloc->reset(); - 
idalloc->save(fin->new_sub()); - - // write empty clientmap - clientmap.save(fin->new_sub()); - - // fixme: fake out anchortable - if (mdsmap->get_anchortable() == whoami) { - dout(10) << "boot_create creating fresh anchortable" << dendl; - anchortable->create_fresh(); - anchortable->save(fin->new_sub()); - } -} - -void MDS::creating_done() -{ - dout(1)<< "creating_done" << dendl; - set_want_state(MDSMap::STATE_ACTIVE); -} - - -class C_MDS_BootStart : public Context { - MDS *mds; - int nextstep; -public: - C_MDS_BootStart(MDS *m, int n) : mds(m), nextstep(n) {} - void finish(int r) { mds->boot_start(nextstep); } -}; - -void MDS::boot_start(int step) -{ - switch (step) { - case 0: - step = 1; // fall-thru. - - case 1: - { - C_Gather *gather = new C_Gather(new C_MDS_BootStart(this, 2)); - dout(2) << "boot_start " << step << ": opening idalloc" << dendl; - idalloc->load(gather->new_sub()); - - dout(2) << "boot_start " << step << ": opening clientmap" << dendl; - clientmap.load(gather->new_sub()); - - if (mdsmap->get_anchortable() == whoami) { - dout(2) << "boot_start " << step << ": opening anchor table" << dendl; - anchortable->load(gather->new_sub()); - } - - dout(2) << "boot_start " << step << ": opening mds log" << dendl; - mdlog->open(gather->new_sub()); - } - break; - - case 2: - if (is_replay()) { - dout(2) << "boot_start " << step << ": replaying mds log" << dendl; - mdlog->replay(new C_MDS_BootStart(this, 3)); - break; - } else { - dout(2) << "boot_start " << step << ": positioning at end of old mds log" << dendl; - mdlog->append(); - step++; - } - - case 3: - if (is_replay()) { - replay_done(); - break; - } - - // starting only - assert(is_starting()); - if (mdsmap->get_root() == whoami) { - dout(2) << "boot_start " << step << ": opening root directory" << dendl; - mdcache->open_root(new C_MDS_BootStart(this, 4)); - break; - } - step++; - - case 4: - dout(2) << "boot_start " << step << ": opening local stray directory" << dendl; - 
mdcache->open_local_stray(); - - starting_done(); - break; - } -} - -void MDS::starting_done() -{ - dout(3) << "starting_done" << dendl; - assert(is_starting()); - set_want_state(MDSMap::STATE_ACTIVE); - - // start new segment - mdlog->start_new_segment(0); -} - - -void MDS::replay_start() -{ - dout(1) << "replay_start" << dendl; - - // initialize gather sets - set rs; - mdsmap->get_recovery_mds_set(rs); - rs.erase(whoami); - dout(1) << "now replay. my recovery peers are " << rs << dendl; - mdcache->set_recovery_set(rs); - - // start? - if (osdmap->get_epoch() > 0 && - mdsmap->get_epoch() > 0) - boot_start(); -} - -void MDS::replay_done() -{ - dout(1) << "replay_done" << dendl; - - if (mdsmap->get_num_in_mds() == 1 && - mdsmap->get_num_mds(MDSMap::STATE_FAILED) == 0) { // just me! - dout(2) << "i am alone, moving to state reconnect" << dendl; - set_want_state(MDSMap::STATE_RECONNECT); - } else { - dout(2) << "i am not alone, moving to state resolve" << dendl; - set_want_state(MDSMap::STATE_RESOLVE); - } - - // start new segment - mdlog->start_new_segment(0); -} - - -void MDS::resolve_start() -{ - dout(1) << "resolve_start" << dendl; - - set who; - mdsmap->get_mds_set(who, MDSMap::STATE_RESOLVE); - mdsmap->get_mds_set(who, MDSMap::STATE_REJOIN); - mdsmap->get_mds_set(who, MDSMap::STATE_ACTIVE); - mdsmap->get_mds_set(who, MDSMap::STATE_STOPPING); - for (set::iterator p = who.begin(); p != who.end(); ++p) { - if (*p == whoami) continue; - mdcache->send_resolve(*p); // now. - } -} -void MDS::resolve_done() -{ - dout(1) << "resolve_done" << dendl; - set_want_state(MDSMap::STATE_RECONNECT); -} - -void MDS::reconnect_start() -{ - dout(1) << "reconnect_start" << dendl; - server->reconnect_clients(); -} -void MDS::reconnect_done() -{ - dout(1) << "reconnect_done" << dendl; - set_want_state(MDSMap::STATE_REJOIN); // move to rejoin state - - /* - if (mdsmap->get_num_in_mds() == 1 && - mdsmap->get_num_mds(MDSMap::STATE_FAILED) == 0) { // just me! 
- - // finish processing caps (normally, this happens during rejoin, but we're skipping that...) - mdcache->rejoin_gather_finish(); - - set_want_state(MDSMap::STATE_ACTIVE); // go active - } else { - set_want_state(MDSMap::STATE_REJOIN); // move to rejoin state - } - */ -} - -void MDS::rejoin_joint_start() -{ - dout(1) << "rejoin_joint_start" << dendl; - mdcache->rejoin_send_rejoins(); -} -void MDS::rejoin_done() -{ - dout(1) << "rejoin_done" << dendl; - mdcache->show_subtrees(); - mdcache->show_cache(); - set_want_state(MDSMap::STATE_ACTIVE); -} - - -void MDS::recovery_done() -{ - dout(1) << "recovery_done -- successful recovery!" << dendl; - assert(is_active()); - - // kick anchortable (resent AGREEs) - if (mdsmap->get_anchortable() == whoami) - anchortable->finish_recovery(); - - // kick anchorclient (resent COMMITs) - anchorclient->finish_recovery(); - - mdcache->start_recovered_purges(); - - // tell connected clients - bcast_mds_map(); -} - -void MDS::handle_mds_recovery(int who) -{ - dout(5) << "handle_mds_recovery mds" << who << dendl; - - mdcache->handle_mds_recovery(who); - - if (anchortable) - anchortable->handle_mds_recovery(who); - anchorclient->handle_mds_recovery(who); - - queue_waiters(waiting_for_active_peer[who]); - waiting_for_active_peer.erase(who); -} - -void MDS::stopping_start() -{ - dout(2) << "stopping_start" << dendl; - - // start cache shutdown - mdcache->shutdown_start(); - - // terminate client sessions - server->terminate_sessions(); -} - -void MDS::stopping_done() -{ - dout(2) << "stopping_done" << dendl; - - // tell monitor we shut down cleanly. 
- set_want_state(MDSMap::STATE_STOPPED); -} - - - -void MDS::suicide() -{ - dout(1) << "suicide" << dendl; - - // stop timers - if (beacon_killer) { - timer.cancel_event(beacon_killer); - beacon_killer = 0; - } - if (beacon_sender) { - timer.cancel_event(beacon_sender); - beacon_sender = 0; - } - if (tick_event) { - timer.cancel_event(tick_event); - tick_event = 0; - } - timer.cancel_all(); - //timer.join(); // this will deadlock from beacon_kill -> suicide - - // shut down cache - mdcache->shutdown(); - - objecter->shutdown(); - - // shut down messenger - messenger->shutdown(); -} - - - - - -void MDS::dispatch(Message *m) -{ - mds_lock.Lock(); - my_dispatch(m); - mds_lock.Unlock(); -} - - - -void MDS::my_dispatch(Message *m) -{ - // from bad mds? - if (m->get_source().is_mds()) { - int from = m->get_source().num(); - if (!mdsmap->have_inst(from) || - mdsmap->get_inst(from) != m->get_source_inst() || - mdsmap->is_down(from)) { - // bogus mds? - if (m->get_type() == MSG_MDS_MAP) { - dout(5) << "got " << *m << " from old/bad/imposter mds " << m->get_source() - << ", but it's an mdsmap, looking at it" << dendl; - } else if (m->get_type() == MSG_MDS_CACHEEXPIRE && - mdsmap->get_inst(from) == m->get_source_inst()) { - dout(5) << "got " << *m << " from down mds " << m->get_source() - << ", but it's a cache_expire, looking at it" << dendl; - } else { - dout(5) << "got " << *m << " from down/old/bad/imposter mds " << m->get_source() - << ", dropping" << dendl; - delete m; - return; - } - } - } - - - switch (m->get_dest_port()) { - - case MDS_PORT_ANCHORTABLE: - anchortable->dispatch(m); - break; - case MDS_PORT_ANCHORCLIENT: - anchorclient->dispatch(m); - break; - - case MDS_PORT_CACHE: - mdcache->dispatch(m); - break; - case MDS_PORT_LOCKER: - locker->dispatch(m); - break; - - case MDS_PORT_MIGRATOR: - mdcache->migrator->dispatch(m); - break; - case MDS_PORT_RENAMER: - //mdcache->renamer->dispatch(m); - break; - - case MDS_PORT_BALANCER: - balancer->proc_message(m); - 
break; - - case MDS_PORT_MAIN: - proc_message(m); - break; - - case MDS_PORT_SERVER: - server->dispatch(m); - break; - - default: - dout(1) << "MDS dispatch unknown message port" << m->get_dest_port() << dendl; - assert(0); - } - - // finish any triggered contexts - if (finished_queue.size()) { - dout(7) << "mds has " << finished_queue.size() << " queued contexts" << dendl; - dout(10) << finished_queue << dendl; - list ls; - ls.splice(ls.begin(), finished_queue); - assert(finished_queue.empty()); - finish_contexts(ls); - } - - - // HACK FOR NOW - if (is_active() || is_stopping()) { - // flush log to disk after every op. for now. - mdlog->flush(); - - // trim cache - mdcache->trim(); - } - - - // hack: thrash exports - static utime_t start; - utime_t now = g_clock.now(); - if (start == utime_t()) - start = now; - double el = now - start; - if (el > 30.0 && - el < 60.0) - for (int i=0; i s; - if (!is_active()) break; - mdsmap->get_mds_set(s, MDSMap::STATE_ACTIVE); - if (s.size() < 2 || mdcache->get_num_inodes() < 10) - break; // need peers for this to work. - - dout(7) << "mds thrashing exports pass " << (i+1) << "/" << g_conf.mds_thrash_exports << dendl; - - // pick a random dir inode - CInode *in = mdcache->hack_pick_random_inode(); - - list ls; - in->get_dirfrags(ls); - if (ls.empty()) continue; // must be an open dir. - CDir *dir = ls.front(); - if (!dir->get_parent_dir()) continue; // must be linked. - if (!dir->is_auth()) continue; // must be auth. - - int dest; - do { - int k = rand() % s.size(); - set::iterator p = s.begin(); - while (k--) p++; - dest = *p; - } while (dest == whoami); - mdcache->migrator->export_dir_nicely(dir,dest); - } - // hack: thrash exports - for (int i=0; ihack_pick_random_inode(); - - list ls; - in->get_dirfrags(ls); - if (ls.empty()) continue; // must be an open dir. - CDir *dir = ls.front(); - if (!dir->get_parent_dir()) continue; // must be linked. - if (!dir->is_auth()) continue; // must be auth. 
- mdcache->split_dir(dir, 1);// + (rand() % 3)); - } - - // hack: force hash root? - /* - if (false && - mdcache->get_root() && - mdcache->get_root()->dir && - !(mdcache->get_root()->dir->is_hashed() || - mdcache->get_root()->dir->is_hashing())) { - dout(0) << "hashing root" << dendl; - mdcache->migrator->hash_dir(mdcache->get_root()->dir); - } - */ - - - - // shut down? - if (is_stopping()) { - if (mdcache->shutdown_pass()) { - dout(7) << "shutdown_pass=true, finished w/ shutdown, moving to down:stopped" << dendl; - stopping_done(); - } - } - -} - - -void MDS::proc_message(Message *m) -{ - switch (m->get_type()) { - - // OSD - case MSG_OSD_OPREPLY: - objecter->handle_osd_op_reply((class MOSDOpReply*)m); - return; - case MSG_OSD_MAP: - handle_osd_map((MOSDMap*)m); - return; - - - // MDS - case MSG_MDS_MAP: - handle_mds_map((MMDSMap*)m); - return; - case MSG_MDS_BEACON: - handle_mds_beacon((MMDSBeacon*)m); - return; - - default: - assert(0); - } - -} - - - -void MDS::ms_handle_failure(Message *m, const entity_inst_t& inst) -{ - mds_lock.Lock(); - dout(10) << "handle_ms_failure to " << inst << " on " << *m << dendl; - - if (m->get_type() == MSG_MDS_MAP && m->get_dest().is_client()) - server->client_reconnect_failure(m->get_dest().num()); - - delete m; - mds_lock.Unlock(); -} - diff --git a/branches/sage/mds/mds/MDS.h b/branches/sage/mds/mds/MDS.h deleted file mode 100644 index 89f6e6d278dd3..0000000000000 --- a/branches/sage/mds/mds/MDS.h +++ /dev/null @@ -1,299 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __MDS_H -#define __MDS_H - -#include -#include -#include -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "mdstypes.h" - -#include "msg/Dispatcher.h" -#include "include/types.h" -#include "include/Context.h" -#include "common/DecayCounter.h" -#include "common/Logger.h" -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/Timer.h" - -#include "mon/MonMap.h" -#include "MDSMap.h" - -#include "ClientMap.h" - - - - -class filepath; - -class OSDMap; -class Objecter; -class Filer; - -class Server; -class Locker; -class AnchorTable; -class AnchorClient; -class MDCache; -class MDLog; -class MDBalancer; -class IdAllocator; - -class CInode; -class CDir; -class CDentry; - -class Messenger; -class Message; - -class MClientRequest; -class MClientReply; -class MHashReaddir; -class MHashReaddirReply; - -class MMDSBeacon; - - -class MDS : public Dispatcher { - public: - Mutex mds_lock; - - SafeTimer timer; - - protected: - int whoami; - - public: - Messenger *messenger; - MDSMap *mdsmap; - MonMap *monmap; - OSDMap *osdmap; - Objecter *objecter; - Filer *filer; // for reading/writing to/from osds - - // sub systems - Server *server; - MDCache *mdcache; - Locker *locker; - MDLog *mdlog; - MDBalancer *balancer; - - IdAllocator *idalloc; - - AnchorTable *anchortable; - AnchorClient *anchorclient; - - Logger *logger, *logger2; - - - protected: - // -- MDS state -- - int state; // my confirmed state - int want_state; // the state i want - - list waiting_for_active; - map > waiting_for_active_peer; - - map peer_mdsmap_epoch; - - tid_t last_tid; // for mds-initiated requests (e.g. 
stray rename) - - public: - void wait_for_active(Context *c) { - waiting_for_active.push_back(c); - } - void wait_for_active_peer(int who, Context *c) { - waiting_for_active_peer[who].push_back(c); - } - - int get_state() { return state; } - bool is_dne() { return state == MDSMap::STATE_DNE; } - bool is_failed() { return state == MDSMap::STATE_FAILED; } - bool is_creating() { return state == MDSMap::STATE_CREATING; } - bool is_starting() { return state == MDSMap::STATE_STARTING; } - bool is_standby() { return state == MDSMap::STATE_STANDBY; } - bool is_replay() { return state == MDSMap::STATE_REPLAY; } - bool is_resolve() { return state == MDSMap::STATE_RESOLVE; } - bool is_reconnect() { return state == MDSMap::STATE_RECONNECT; } - bool is_rejoin() { return state == MDSMap::STATE_REJOIN; } - bool is_active() { return state == MDSMap::STATE_ACTIVE; } - bool is_stopping() { return state == MDSMap::STATE_STOPPING; } - bool is_stopped() { return state == MDSMap::STATE_STOPPED; } - - void set_want_state(int s); - - tid_t issue_tid() { return ++last_tid; } - - - // -- waiters -- - list finished_queue; - - void queue_waiter(Context *c) { - finished_queue.push_back(c); - } - void queue_waiters(list& ls) { - finished_queue.splice( finished_queue.end(), ls ); - } - - // -- keepalive beacon -- - version_t beacon_last_seq; // last seq sent to monitor - map beacon_seq_stamp; // seq # -> time sent - utime_t beacon_last_acked_stamp; // last time we sent a beacon that got acked - - class C_MDS_BeaconSender : public Context { - MDS *mds; - public: - C_MDS_BeaconSender(MDS *m) : mds(m) {} - void finish(int r) { - mds->beacon_sender = 0; - mds->beacon_send(); - } - } *beacon_sender; - class C_MDS_BeaconKiller : public Context { - MDS *mds; - utime_t lab; - public: - C_MDS_BeaconKiller(MDS *m, utime_t l) : mds(m), lab(l) {} - void finish(int r) { - if (mds->beacon_killer) { - mds->beacon_killer = 0; - mds->beacon_kill(lab); - } - // else mds is pbly already shutting down - } - } 
*beacon_killer; - - // tick and other timer fun - class C_MDS_Tick : public Context { - MDS *mds; - public: - C_MDS_Tick(MDS *m) : mds(m) {} - void finish(int r) { - mds->tick_event = 0; - mds->tick(); - } - } *tick_event; - void reset_tick(); - - // -- client map -- - ClientMap clientmap; - epoch_t last_client_mdsmap_bcast; - //void log_clientmap(Context *c); - - - // shutdown crap - int req_rate; - - // ino's and fh's - public: - - int get_req_rate() { return req_rate; } - - - public: - MDS(int whoami, Messenger *m, MonMap *mm); - ~MDS(); - - // who am i etc - int get_nodeid() { return whoami; } - MDSMap *get_mds_map() { return mdsmap; } - OSDMap *get_osd_map() { return osdmap; } - - void send_message_mds(Message *m, int mds, int port=0, int fromport=0); - void forward_message_mds(Message *req, int mds, int port=0); - - void send_message_client(Message *m, int client); - void send_message_client(Message *m, entity_inst_t clientinst); - - - // start up, shutdown - int init(bool standby=false); - void reopen_logger(utime_t start); - - void bcast_mds_map(); // to mounted clients - - void boot(); - void boot_create(); // i am new mds. 
- void boot_start(int step=0); // starting|replay - - void replay_start(); - void creating_done(); - void starting_done(); - void replay_done(); - - void resolve_start(); - void resolve_done(); - void reconnect_start(); - void reconnect_done(); - void rejoin_joint_start(); - void rejoin_done(); - void recovery_done(); - void handle_mds_recovery(int who); - - void stopping_start(); - void stopping_done(); - void suicide(); - - void tick(); - - void beacon_start(); - void beacon_send(); - void beacon_kill(utime_t lab); - void handle_mds_beacon(MMDSBeacon *m); - void reset_beacon_killer(); - - // messages - void proc_message(Message *m); - virtual void dispatch(Message *m); - void my_dispatch(Message *m); - - void ms_handle_failure(Message *m, const entity_inst_t& inst); - - // special message types - void handle_mds_map(class MMDSMap *m); - - // osds - void handle_osd_map(class MOSDMap *m); -}; - - - -class C_MDS_RetryMessage : public Context { - Message *m; - MDS *mds; -public: - C_MDS_RetryMessage(MDS *mds, Message *m) { - assert(m); - this->m = m; - this->mds = mds; - } - virtual void finish(int r) { - mds->my_dispatch(m); - } -}; - - - -#endif diff --git a/branches/sage/mds/mds/MDSMap.h b/branches/sage/mds/mds/MDSMap.h deleted file mode 100644 index f2b31ca0fd1c1..0000000000000 --- a/branches/sage/mds/mds/MDSMap.h +++ /dev/null @@ -1,357 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MDSMAP_H -#define __MDSMAP_H - -#include "common/Clock.h" -#include "msg/Message.h" - -#include "include/types.h" - -#include -#include -#include -using namespace std; - - -/* - - beautiful state diagram: - - STOPPED DNE FAILED - / | \ / | | - / | \________ _______/ | | -| v v v v | -| STARTING <--> STANDBY <--> CREATING | -| \ / | -| \____ ____________/ | - \ v v | - \ ACTIVE <-- REJOIN <-- RECONNECT <-- REPLAY - \ | - \ | - \ v - \-- STOPPING - - - - -*/ - - -class MDSMap { - public: - // mds states - static const int STATE_DNE = 0; // down, never existed. - static const int STATE_STOPPED = -1; // down, once existed, but no subtrees. empty log. - static const int STATE_FAILED = 2; // down, active subtrees; needs to be recovered. - - static const int STATE_BOOT = -3; // up, boot announcement. destiny unknown. - static const int STATE_STANDBY = -4; // up, idle. waiting for assignment by monitor. - static const int STATE_CREATING = -5; // up, creating MDS instance (new journal, idalloc..). - static const int STATE_STARTING = -6; // up, starting prior stopped MDS instance. - - static const int STATE_REPLAY = 7; // up, starting prior failed instance. scanning journal. - static const int STATE_RESOLVE = 8; // up, disambiguating distributed operations (import, rename, etc.) 
- static const int STATE_RECONNECT = 9; // up, reconnect to clients - static const int STATE_REJOIN = 10; // up, replayed journal, rejoining distributed cache - static const int STATE_ACTIVE = 11; // up, active - static const int STATE_STOPPING = 12; // up, exporting metadata (-> standby or out) - - static const char *get_state_name(int s) { - switch (s) { - // down and out - case STATE_DNE: return "down:dne"; - case STATE_STOPPED: return "down:stopped"; - // down and in - case STATE_FAILED: return "down:failed"; - // up and out - case STATE_BOOT: return "up:boot"; - case STATE_CREATING: return "up:creating"; - case STATE_STARTING: return "up:starting"; - case STATE_STANDBY: return "up:standby"; - // up and in - case STATE_REPLAY: return "up:replay"; - case STATE_RESOLVE: return "up:resolve"; - case STATE_RECONNECT: return "up:reconnect"; - case STATE_REJOIN: return "up:rejoin"; - case STATE_ACTIVE: return "up:active"; - case STATE_STOPPING: return "up:stopping"; - default: assert(0); - } - return 0; - } - - protected: - epoch_t epoch; - utime_t created; - epoch_t same_in_set_since; // note: this does not reflect exit-by-failure. - - int target_num; - int anchortable; // which MDS has anchortable (fixme someday) - int root; // which MDS has root directory - - set mds_created; // which mds ids have initialized journals and id tables. 
- map mds_state; // MDS state - map mds_state_seq; - map mds_inst; // up instances - map mds_inc; // incarnation count (monotonically increases) - - friend class MDSMonitor; - - public: - MDSMap() : epoch(0), same_in_set_since(0), anchortable(0), root(0) {} - - epoch_t get_epoch() const { return epoch; } - void inc_epoch() { epoch++; } - - const utime_t& get_create() const { return created; } - epoch_t get_same_in_set_since() const { return same_in_set_since; } - - int get_anchortable() const { return anchortable; } - int get_root() const { return root; } - - // counts - int get_num_mds() { - return get_num_in_mds(); - } - int get_num_mds(int state) { - int n = 0; - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (p->second == state) ++n; - return n; - } - - int get_num_in_mds() { - int n = 0; - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (p->second > 0) ++n; - return n; - } - - // sets - void get_mds_set(set& s) { - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - s.insert(p->first); - } - void get_mds_set(set& s, int state) { - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (p->second == state) - s.insert(p->first); - } - void get_up_mds_set(set& s) { - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (is_up(p->first)) s.insert(p->first); - } - void get_in_mds_set(set& s) { - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (is_in(p->first)) s.insert(p->first); - } - void get_active_mds_set(set& s) { - get_mds_set(s, MDSMap::STATE_ACTIVE); - } - void get_failed_mds_set(set& s) { - get_mds_set(s, MDSMap::STATE_FAILED); - } - void get_recovery_mds_set(set& s) { - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (is_failed(p->first) || - (p->second >= STATE_REPLAY && p->second <= STATE_STOPPING)) - s.insert(p->first); - 
} - - int get_random_in_mds() { - vector v; - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (p->second > 0) v.push_back(p->first); - if (v.empty()) - return -1; - else - return v[rand() % v.size()]; - } - - - // mds states - bool is_down(int m) { return is_dne(m) || is_stopped(m) || is_failed(m); } - bool is_up(int m) { return !is_down(m); } - bool is_in(int m) { return mds_state.count(m) && mds_state[m] > 0; } - bool is_out(int m) { return !mds_state.count(m) || mds_state[m] <= 0; } - - bool is_dne(int m) { return mds_state.count(m) == 0 || mds_state[m] == STATE_DNE; } - bool is_failed(int m) { return mds_state.count(m) && mds_state[m] == STATE_FAILED; } - - bool is_boot(int m) { return mds_state.count(m) && mds_state[m] == STATE_BOOT; } - bool is_standby(int m) { return mds_state.count(m) && mds_state[m] == STATE_STANDBY; } - bool is_creating(int m) { return mds_state.count(m) && mds_state[m] == STATE_CREATING; } - bool is_starting(int m) { return mds_state.count(m) && mds_state[m] == STATE_STARTING; } - bool is_replay(int m) { return mds_state.count(m) && mds_state[m] == STATE_REPLAY; } - bool is_resolve(int m) { return mds_state.count(m) && mds_state[m] == STATE_RESOLVE; } - bool is_reconnect(int m) { return mds_state.count(m) && mds_state[m] == STATE_RECONNECT; } - bool is_rejoin(int m) { return mds_state.count(m) && mds_state[m] == STATE_REJOIN; } - bool is_active(int m) { return mds_state.count(m) && mds_state[m] == STATE_ACTIVE; } - bool is_stopping(int m) { return mds_state.count(m) && mds_state[m] == STATE_STOPPING; } - bool is_active_or_stopping(int m) { return is_active(m) || is_stopping(m); } - bool is_stopped(int m) { return mds_state.count(m) && mds_state[m] == STATE_STOPPED; } - - bool has_created(int m) { return mds_created.count(m); } - - // cluster states - bool is_full() { - return get_num_in_mds() >= target_num; - } - bool is_degraded() { // degraded = some recovery in process. 
fixes active membership and recovery_set. - return - get_num_mds(STATE_REPLAY) + - get_num_mds(STATE_RESOLVE) + - get_num_mds(STATE_RECONNECT) + - get_num_mds(STATE_REJOIN) + - get_num_mds(STATE_FAILED); - } - bool is_rejoining() { - // nodes are rejoining cache state - return - get_num_mds(STATE_REJOIN) > 0 && - get_num_mds(STATE_REPLAY) == 0 && - get_num_mds(STATE_RECONNECT) == 0 && - get_num_mds(STATE_RESOLVE) == 0 && - get_num_mds(STATE_FAILED) == 0; - } - bool is_stopped() { - return - get_num_in_mds() == 0 && - get_num_mds(STATE_CREATING) == 0 && - get_num_mds(STATE_STARTING) == 0 && - get_num_mds(STATE_STANDBY) == 0; - } - - bool would_be_overfull_with(int mds) { - int in = 1; // mds! - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) { - if (p->first == mds) continue; - if (p->second > 0 || - p->second == STATE_STARTING || - p->second == STATE_CREATING) - in++; - } - return (in > target_num); - } - - int get_state(int m) { - if (mds_state.count(m)) - return mds_state[m]; - else - return STATE_DNE; - } - - // inst - bool have_inst(int m) { - return mds_inst.count(m); - } - const entity_inst_t& get_inst(int m) { - assert(mds_inst.count(m)); - return mds_inst[m]; - } - bool get_inst(int m, entity_inst_t& inst) { - if (mds_inst.count(m)) { - inst = mds_inst[m]; - return true; - } - return false; - } - - int get_addr_rank(const entity_addr_t& addr) { - for (map::iterator p = mds_inst.begin(); - p != mds_inst.end(); - ++p) { - if (p->second.addr == addr) return p->first; - } - /*else - for (map::iterator p = mds_inst.begin(); - p != mds_inst.end(); - ++p) { - if (memcmp(&p->second.addr,&inst.addr, sizeof(inst.addr)) == 0) return p->first; - } - */ - - return -1; - } - - int get_inc(int m) { - if (mds_inc.count(m)) - return mds_inc[m]; - return 0; - } - - - void remove_mds(int m) { - mds_inst.erase(m); - mds_state.erase(m); - mds_state_seq.erase(m); - } - - - // serialize, unserialize - void encode(bufferlist& bl) { - ::_encode(epoch, 
bl); - ::_encode(target_num, bl); - ::_encode(created, bl); - ::_encode(same_in_set_since, bl); - ::_encode(anchortable, bl); - ::_encode(root, bl); - ::_encode(mds_state, bl); - ::_encode(mds_state_seq, bl); - ::_encode(mds_inst, bl); - ::_encode(mds_inc, bl); - } - - void decode(bufferlist& bl) { - int off = 0; - ::_decode(epoch, bl, off); - ::_decode(target_num, bl, off); - ::_decode(created, bl, off); - ::_decode(same_in_set_since, bl, off); - ::_decode(anchortable, bl, off); - ::_decode(root, bl, off); - ::_decode(mds_state, bl, off); - ::_decode(mds_state_seq, bl, off); - ::_decode(mds_inst, bl, off); - ::_decode(mds_inc, bl, off); - } - - - /*** mapping functions ***/ - - int hash_dentry( inodeno_t dirino, const string& dn ); -}; - -#endif diff --git a/branches/sage/mds/mds/Migrator.cc b/branches/sage/mds/mds/Migrator.cc deleted file mode 100644 index ea1e9f1b216c2..0000000000000 --- a/branches/sage/mds/mds/Migrator.cc +++ /dev/null @@ -1,2315 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "MDS.h" -#include "MDCache.h" -#include "CInode.h" -#include "CDir.h" -#include "CDentry.h" -#include "Migrator.h" -#include "Locker.h" -#include "Migrator.h" -#include "Server.h" - -#include "MDBalancer.h" -#include "MDLog.h" -#include "MDSMap.h" - -#include "include/filepath.h" - -#include "events/EString.h" -#include "events/EExport.h" -#include "events/EImportStart.h" -#include "events/EImportFinish.h" -#include "events/ESessions.h" - -#include "msg/Messenger.h" - -#include "messages/MClientFileCaps.h" - -#include "messages/MExportDirDiscover.h" -#include "messages/MExportDirDiscoverAck.h" -#include "messages/MExportDirCancel.h" -#include "messages/MExportDirPrep.h" -#include "messages/MExportDirPrepAck.h" -#include "messages/MExportDir.h" -#include "messages/MExportDirAck.h" -#include "messages/MExportDirNotify.h" -#include "messages/MExportDirNotifyAck.h" -#include "messages/MExportDirFinish.h" - -#include "messages/MExportCaps.h" -#include "messages/MExportCapsAck.h" - - - - - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds || l <= g_conf.debug_mds_migrator) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".migrator " - - - -void Migrator::dispatch(Message *m) -{ - switch (m->get_type()) { - // import - case MSG_MDS_EXPORTDIRDISCOVER: - handle_export_discover((MExportDirDiscover*)m); - break; - case MSG_MDS_EXPORTDIRPREP: - handle_export_prep((MExportDirPrep*)m); - break; - case MSG_MDS_EXPORTDIR: - handle_export_dir((MExportDir*)m); - break; - case MSG_MDS_EXPORTDIRFINISH: - handle_export_finish((MExportDirFinish*)m); - break; - case MSG_MDS_EXPORTDIRCANCEL: - handle_export_cancel((MExportDirCancel*)m); - break; - - // export - case MSG_MDS_EXPORTDIRDISCOVERACK: - handle_export_discover_ack((MExportDirDiscoverAck*)m); - break; - case MSG_MDS_EXPORTDIRPREPACK: - handle_export_prep_ack((MExportDirPrepAck*)m); - break; - case MSG_MDS_EXPORTDIRACK: - 
handle_export_ack((MExportDirAck*)m); - break; - case MSG_MDS_EXPORTDIRNOTIFYACK: - handle_export_notify_ack((MExportDirNotifyAck*)m); - break; - - // export 3rd party (dir_auth adjustments) - case MSG_MDS_EXPORTDIRNOTIFY: - handle_export_notify((MExportDirNotify*)m); - break; - - // caps - case MSG_MDS_EXPORTCAPS: - handle_export_caps((MExportCaps*)m); - break; - case MSG_MDS_EXPORTCAPSACK: - handle_export_caps_ack((MExportCapsAck*)m); - break; - - default: - assert(0); - } -} - - -class C_MDC_EmptyImport : public Context { - Migrator *mig; - CDir *dir; -public: - C_MDC_EmptyImport(Migrator *m, CDir *d) : mig(m), dir(d) {} - void finish(int r) { - mig->export_empty_import(dir); - } -}; - - -void Migrator::export_empty_import(CDir *dir) -{ - dout(7) << "export_empty_import " << *dir << dendl; - assert(dir->is_subtree_root()); - - if (dir->inode->is_auth()) { - dout(7) << " inode is auth" << dendl; - return; - } - if (!dir->is_auth()) { - dout(7) << " not auth" << dendl; - return; - } - if (dir->is_freezing() || dir->is_frozen()) { - dout(7) << " freezing or frozen" << dendl; - return; - } - if (dir->get_size() > 0) { - dout(7) << " not actually empty" << dendl; - return; - } - if (dir->inode->is_root()) { - dout(7) << " root" << dendl; - return; - } - - int dest = dir->inode->authority().first; - //if (mds->is_shutting_down()) dest = 0; // this is more efficient. 
- - dout(7) << " really empty, exporting to " << dest << dendl; - assert (dest != mds->get_nodeid()); - - dout(7) << "exporting to mds" << dest - << " empty import " << *dir << dendl; - export_dir( dir, dest ); -} - - - - -// ========================================================== -// mds failure handling - -void Migrator::handle_mds_failure_or_stop(int who) -{ - dout(5) << "handle_mds_failure_or_stop mds" << who << dendl; - - // check my exports - map::iterator p = export_state.begin(); - while (p != export_state.end()) { - map::iterator next = p; - next++; - CDir *dir = p->first; - - // abort exports: - // - that are going to the failed node - // - that aren't frozen yet (to avoid auth_pin deadlock) - if (export_peer[dir] == who || - p->second == EXPORT_DISCOVERING || p->second == EXPORT_FREEZING) { - // the guy i'm exporting to failed, or we're just freezing. - dout(10) << "cleaning up export state " << p->second << " of " << *dir << dendl; - - switch (p->second) { - case EXPORT_DISCOVERING: - dout(10) << "export state=discovering : canceling freeze and removing auth_pin" << dendl; - dir->unfreeze_tree(); // cancel the freeze - dir->auth_unpin(); - export_state.erase(dir); // clean up - dir->state_clear(CDir::STATE_EXPORTING); - if (export_peer[dir] != who) // tell them. - mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir], MDS_PORT_MIGRATOR); - break; - - case EXPORT_FREEZING: - dout(10) << "export state=freezing : canceling freeze" << dendl; - dir->unfreeze_tree(); // cancel the freeze - export_state.erase(dir); // clean up - dir->state_clear(CDir::STATE_EXPORTING); - if (export_peer[dir] != who) // tell them. 
- mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir], MDS_PORT_MIGRATOR); - break; - - // NOTE: state order reversal, warning comes after loggingstart+prepping - case EXPORT_WARNING: - dout(10) << "export state=warning : unpinning bounds, unfreezing, notifying" << dendl; - // fall-thru - - //case EXPORT_LOGGINGSTART: - case EXPORT_PREPPING: - if (p->second != EXPORT_WARNING) - dout(10) << "export state=loggingstart|prepping : unpinning bounds, unfreezing" << dendl; - { - // unpin bounds - set bounds; - cache->get_subtree_bounds(dir, bounds); - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *bd = *p; - bd->put(CDir::PIN_EXPORTBOUND); - bd->state_clear(CDir::STATE_EXPORTBOUND); - } - } - dir->unfreeze_tree(); - cache->adjust_subtree_auth(dir, mds->get_nodeid()); - cache->try_subtree_merge(dir); - export_state.erase(dir); // clean up - dir->state_clear(CDir::STATE_EXPORTING); - break; - - case EXPORT_EXPORTING: - dout(10) << "export state=exporting : reversing, and unfreezing" << dendl; - export_reverse(dir); - export_state.erase(dir); // clean up - dir->state_clear(CDir::STATE_EXPORTING); - break; - - case EXPORT_LOGGINGFINISH: - case EXPORT_NOTIFYING: - dout(10) << "export state=loggingfinish|notifying : ignoring dest failure, we were successful." << dendl; - // leave export_state, don't clean up now. - break; - - default: - assert(0); - } - - // finish clean-up? - if (export_state.count(dir) == 0) { - export_peer.erase(dir); - export_warning_ack_waiting.erase(dir); - export_notify_ack_waiting.erase(dir); - - // unpin the path - vector trace; - cache->make_trace(trace, dir->inode); - mds->locker->dentry_anon_rdlock_trace_finish(trace); - - // wake up any waiters - mds->queue_waiters(export_finish_waiters[dir]); - export_finish_waiters.erase(dir); - - // send pending import_maps? (these need to go out when all exports have finished.) 
- cache->maybe_send_pending_resolves(); - - cache->show_subtrees(); - - maybe_do_queued_export(); - } - } else { - // bystander failed. - if (export_warning_ack_waiting.count(dir) && - export_warning_ack_waiting[dir].count(who)) { - export_warning_ack_waiting[dir].erase(who); - export_notify_ack_waiting[dir].erase(who); // they won't get a notify either. - if (p->second == EXPORT_WARNING) { - // exporter waiting for warning acks, let's fake theirs. - dout(10) << "faking export_warning_ack from mds" << who - << " on " << *dir << " to mds" << export_peer[dir] - << dendl; - if (export_warning_ack_waiting[dir].empty()) - export_go(dir); - } - } - if (export_notify_ack_waiting.count(dir) && - export_notify_ack_waiting[dir].count(who)) { - export_notify_ack_waiting[dir].erase(who); - if (p->second == EXPORT_NOTIFYING) { - // exporter is waiting for notify acks, fake it - dout(10) << "faking export_notify_ack from mds" << who - << " on " << *dir << " to mds" << export_peer[dir] - << dendl; - if (export_notify_ack_waiting[dir].empty()) - export_finish(dir); - } - } - } - - // next! 
- p = next; - } - - - // check my imports - map::iterator q = import_state.begin(); - while (q != import_state.end()) { - map::iterator next = q; - next++; - dirfrag_t df = q->first; - CInode *diri = mds->mdcache->get_inode(df.ino); - CDir *dir = mds->mdcache->get_dirfrag(df); - - if (import_peer[df] == who) { - switch (q->second) { - case IMPORT_DISCOVERING: - dout(10) << "import state=discovering : clearing state" << dendl; - import_state.erase(df); - import_peer.erase(df); - break; - - case IMPORT_DISCOVERED: - dout(10) << "import state=discovered : unpinning inode " << *diri << dendl; - assert(diri); - // unpin base - diri->put(CInode::PIN_IMPORTING); - import_state.erase(df); - import_peer.erase(df); - break; - - case IMPORT_PREPPING: - if (q->second == IMPORT_PREPPING) { - dout(10) << "import state=prepping : unpinning base+bounds " << *dir << dendl; - } - assert(dir); - { - set bounds; - cache->map_dirfrag_set(import_bound_ls[dir], bounds); - import_remove_pins(dir, bounds); - import_reverse_final(dir); - } - break; - - case IMPORT_PREPPED: - dout(10) << "import state=prepped : unpinning base+bounds, unfreezing " << *dir << dendl; - assert(dir); - { - set bounds; - cache->get_subtree_bounds(dir, bounds); - import_remove_pins(dir, bounds); - - // adjust auth back to me - cache->adjust_subtree_auth(dir, import_peer[df]); - cache->try_subtree_merge(dir); - - // bystanders? - if (import_bystanders[dir].empty()) { - import_reverse_unfreeze(dir); - } else { - // notify them; wait in aborting state - import_notify_abort(dir, bounds); - import_state[df] = IMPORT_ABORTING; - } - } - break; - - case IMPORT_LOGGINGSTART: - dout(10) << "import state=loggingstart : reversing import on " << *dir << dendl; - import_reverse(dir); - break; - - case IMPORT_ACKING: - // hrm. 
make this an ambiguous import, and wait for exporter recovery to disambiguate - dout(10) << "import state=acking : noting ambiguous import " << *dir << dendl; - { - set bounds; - cache->get_subtree_bounds(dir, bounds); - cache->add_ambiguous_import(dir, bounds); - } - break; - - case IMPORT_ABORTING: - dout(10) << "import state=aborting : ignoring repeat failure " << *dir << dendl; - break; - } - } else { - if (q->second == IMPORT_ABORTING && - import_bystanders[dir].count(who)) { - dout(10) << "faking export_notify_ack from mds" << who - << " on aborting import " << *dir << " from mds" << import_peer[df] - << dendl; - import_bystanders[dir].erase(who); - if (import_bystanders[dir].empty()) { - import_bystanders.erase(dir); - import_reverse_unfreeze(dir); - } - } - } - - // next! - q = next; - } -} - - - -void Migrator::show_importing() -{ - dout(10) << "show_importing" << dendl; - for (map::iterator p = import_state.begin(); - p != import_state.end(); - p++) { - CDir *dir = mds->mdcache->get_dirfrag(p->first); - if (dir) { - dout(10) << " importing from " << import_peer[p->first] - << ": (" << p->second << ") " << get_import_statename(p->second) - << " " << p->first - << " " << *dir - << dendl; - } else { - dout(10) << " importing from " << import_peer[p->first] - << ": (" << p->second << ") " << get_import_statename(p->second) - << " " << p->first - << dendl; - } - } -} - -void Migrator::show_exporting() -{ - dout(10) << "show_exporting" << dendl; - for (map::iterator p = export_state.begin(); - p != export_state.end(); - p++) - dout(10) << " exporting to " << export_peer[p->first] - << ": (" << p->second << ") " << get_export_statename(p->second) - << " " << p->first->dirfrag() - << " " << *p->first - << dendl; -} - - - -void Migrator::audit() -{ - if (g_conf.debug_mds < 5) return; // hrm. 
- - // import_state - show_importing(); - for (map::iterator p = import_state.begin(); - p != import_state.end(); - p++) { - if (p->second == IMPORT_DISCOVERING) - continue; - if (p->second == IMPORT_DISCOVERED) { - CInode *in = cache->get_inode(p->first.ino); - assert(in); - continue; - } - CDir *dir = cache->get_dirfrag(p->first); - assert(dir); - if (p->second == IMPORT_PREPPING) - continue; - assert(dir->is_ambiguous_dir_auth()); - assert(dir->authority().first == mds->get_nodeid() || - dir->authority().second == mds->get_nodeid()); - } - - // export_state - show_exporting(); - for (map::iterator p = export_state.begin(); - p != export_state.end(); - p++) { - CDir *dir = p->first; - if (p->second == EXPORT_DISCOVERING || - p->second == EXPORT_FREEZING) continue; - assert(dir->is_ambiguous_dir_auth()); - assert(dir->authority().first == mds->get_nodeid() || - dir->authority().second == mds->get_nodeid()); - } - - // ambiguous+me subtrees should be importing|exporting - - // write me -} - - - - - -// ========================================================== -// EXPORT - -void Migrator::export_dir_nicely(CDir *dir, int dest) -{ - // enqueue - dout(7) << "export_dir_nicely " << *dir << " to " << dest << dendl; - export_queue.push_back(pair(dir->dirfrag(), dest)); - - maybe_do_queued_export(); -} - -void Migrator::maybe_do_queued_export() -{ - while (!export_queue.empty() && - export_state.size() <= 4) { - dirfrag_t df = export_queue.front().first; - int dest = export_queue.front().second; - export_queue.pop_front(); - - CDir *dir = mds->mdcache->get_dirfrag(df); - if (!dir) continue; - if (!dir->is_auth()) continue; - - dout(-7) << "nicely exporting to mds" << dest << " " << *dir << dendl; - - export_dir(dir, dest); - } -} - - - - -class C_MDC_ExportFreeze : public Context { - Migrator *mig; - CDir *ex; // dir i'm exporting - -public: - C_MDC_ExportFreeze(Migrator *m, CDir *e) : - mig(m), ex(e) {} - virtual void finish(int r) { - if (r >= 0) - 
mig->export_frozen(ex); - } -}; - - -/** export_dir(dir, dest) - * public method to initiate an export. - * will fail if the directory is freezing, frozen, unpinnable, or root. - */ -void Migrator::export_dir(CDir *dir, int dest) -{ - dout(7) << "export_dir " << *dir << " to " << dest << dendl; - assert(dir->is_auth()); - assert(dest != mds->get_nodeid()); - - if (mds->mdsmap->is_degraded()) { - dout(7) << "cluster degraded, no exports for now" << dendl; - return; - } - - if (dir->inode->is_root()) { - dout(7) << "i won't export root" << dendl; - //assert(0); - return; - } - - if (dir->is_frozen() || - dir->is_freezing()) { - dout(7) << " can't export, freezing|frozen. wait for other exports to finish first." << dendl; - return; - } - if (dir->state_test(CDir::STATE_EXPORTING)) { - dout(7) << "already exporting" << dendl; - return; - } - - // pin path? - vector trace; - cache->make_trace(trace, dir->inode); - if (!mds->locker->dentry_can_rdlock_trace(trace)) { - dout(7) << "export_dir couldn't pin path, failing." << dendl; - return; - } - - // ok. - mds->locker->dentry_anon_rdlock_trace_start(trace); - assert(export_state.count(dir) == 0); - export_state[dir] = EXPORT_DISCOVERING; - export_peer[dir] = dest; - - dir->state_set(CDir::STATE_EXPORTING); - - // send ExportDirDiscover (ask target) - mds->send_message_mds(new MExportDirDiscover(dir), dest, MDS_PORT_MIGRATOR); - - // start the freeze, but hold it up with an auth_pin. - dir->auth_pin(); - dir->freeze_tree(); - assert(dir->is_freezing_tree()); - dir->add_waiter(CDir::WAIT_FROZEN, new C_MDC_ExportFreeze(this, dir)); -} - - -/* - * called on receipt of MExportDirDiscoverAck - * the importer now has the directory's _inode_ in memory, and pinned. 
- */ -void Migrator::handle_export_discover_ack(MExportDirDiscoverAck *m) -{ - CDir *dir = cache->get_dirfrag(m->get_dirfrag()); - assert(dir); - - dout(7) << "export_discover_ack from " << m->get_source() - << " on " << *dir << dendl; - - if (export_state.count(dir) == 0 || - export_state[dir] != EXPORT_DISCOVERING || - export_peer[dir] != m->get_source().num()) { - dout(7) << "must have aborted" << dendl; - } else { - // freeze the subtree - export_state[dir] = EXPORT_FREEZING; - dir->auth_unpin(); - } - - delete m; // done -} - -void Migrator::export_frozen(CDir *dir) -{ - dout(7) << "export_frozen on " << *dir << dendl; - assert(dir->is_frozen()); - assert(dir->get_cum_auth_pins() == 0); - - // ok! - int dest = export_peer[dir]; - - cache->show_subtrees(); - - // note the bounds. - // force it into a subtree by listing auth as . - cache->adjust_subtree_auth(dir, mds->get_nodeid(), mds->get_nodeid()); - set bounds; - cache->get_subtree_bounds(dir, bounds); - - // generate prep message, log entry. - MExportDirPrep *prep = new MExportDirPrep(dir->dirfrag()); - - // include list of bystanders - for (map::iterator p = dir->replicas_begin(); - p != dir->replicas_end(); - p++) { - if (p->first != dest) { - dout(10) << "bystander mds" << p->first << dendl; - prep->add_bystander(p->first); - } - } - - /* include spanning tree for all nested exports. - * these need to be on the destination _before_ the final export so that - * dir_auth updates on any nested exports are properly absorbed. - * this includes inodes and dirfrags included in the subtree, but - * only the inodes at the bounds. - */ - set inodes_added; - - // include base dirfrag - prep->add_dirfrag( new CDirDiscover(dir, dir->add_replica(dest)) ); - - // check bounds - for (set::iterator it = bounds.begin(); - it != bounds.end(); - it++) { - CDir *bound = *it; - - // pin it. 
- bound->get(CDir::PIN_EXPORTBOUND); - bound->state_set(CDir::STATE_EXPORTBOUND); - - dout(7) << " export bound " << *bound << dendl; - - prep->add_export( bound->dirfrag() ); - - /* first assemble each trace, in trace order, and put in message */ - list inode_trace; - - // trace to dir - CDir *cur = bound; - while (cur != dir) { - // don't repeat ourselves - if (inodes_added.count(cur->ino())) break; // did already! - inodes_added.insert(cur->ino()); - - // inode - assert(cur->inode->is_auth()); - inode_trace.push_front(cur->inode); - dout(7) << " will add " << *cur->inode << dendl; - - // include the dirfrag? only if it's not the bounding subtree root. - if (cur != bound) { - assert(cur->is_auth()); - prep->add_dirfrag( cur->replicate_to(dest) ); // yay! - dout(7) << " added " << *cur << dendl; - } - - cur = cur->get_parent_dir(); - } - - for (list::iterator it = inode_trace.begin(); - it != inode_trace.end(); - it++) { - CInode *in = *it; - dout(7) << " added " << *in->parent << dendl; - dout(7) << " added " << *in << dendl; - prep->add_inode( in->parent->get_dir()->dirfrag(), - in->parent->get_name(), - in->parent->replicate_to(dest), - in->replicate_to(dest) ); - } - - } - - // send. - export_state[dir] = EXPORT_PREPPING; - mds->send_message_mds(prep, dest, MDS_PORT_MIGRATOR); -} - -void Migrator::handle_export_prep_ack(MExportDirPrepAck *m) -{ - CDir *dir = cache->get_dirfrag(m->get_dirfrag()); - assert(dir); - - dout(7) << "export_prep_ack " << *dir << dendl; - - if (export_state.count(dir) == 0 || - export_state[dir] != EXPORT_PREPPING) { - // export must have aborted. 
- dout(7) << "export must have aborted" << dendl; - delete m; - return; - } - - // send warnings - int dest = export_peer[dir]; - set bounds; - cache->get_subtree_bounds(dir, bounds); - - assert(export_peer.count(dir)); - assert(export_warning_ack_waiting.count(dir) == 0); - assert(export_notify_ack_waiting.count(dir) == 0); - - for (map::iterator p = dir->replicas_begin(); - p != dir->replicas_end(); - ++p) { - if (p->first == dest) continue; - if (!mds->mdsmap->is_active_or_stopping(p->first)) - continue; // only if active - export_warning_ack_waiting[dir].insert(p->first); - export_notify_ack_waiting[dir].insert(p->first); // we'll eventually get a notifyack, too! - - MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(), true, - pair(mds->get_nodeid(),CDIR_AUTH_UNKNOWN), - pair(mds->get_nodeid(),export_peer[dir])); - notify->copy_bounds(bounds); - mds->send_message_mds(notify, p->first, MDS_PORT_MIGRATOR); - - } - export_state[dir] = EXPORT_WARNING; - - // nobody to warn? - if (export_warning_ack_waiting.count(dir) == 0) - export_go(dir); // start export. - - // done. - delete m; -} - - -class C_M_ExportGo : public Context { - Migrator *migrator; - CDir *dir; -public: - C_M_ExportGo(Migrator *m, CDir *d) : migrator(m), dir(d) {} - void finish(int r) { - migrator->export_go_synced(dir); - } -}; - -void Migrator::export_go(CDir *dir) -{ - assert(export_peer.count(dir)); - int dest = export_peer[dir]; - dout(7) << "export_go " << *dir << " to " << dest << dendl; - - // first sync log to flush out e.g. 
any cap imports - mds->mdlog->wait_for_sync(new C_M_ExportGo(this, dir)); -} - -void Migrator::export_go_synced(CDir *dir) -{ - assert(export_peer.count(dir)); - int dest = export_peer[dir]; - dout(7) << "export_go_synced " << *dir << " to " << dest << dendl; - - cache->show_subtrees(); - - export_warning_ack_waiting.erase(dir); - export_state[dir] = EXPORT_EXPORTING; - - assert(dir->get_cum_auth_pins() == 0); - - // set ambiguous auth - cache->adjust_subtree_auth(dir, dest, mds->get_nodeid()); - - // take away the popularity we're sending. - mds->balancer->subtract_export(dir); - - // fill export message with cache data - utime_t now = g_clock.now(); - map exported_client_map; - bufferlist export_data; - int num_exported_inodes = encode_export_dir( export_data, - dir, // recur start point - exported_client_map, - now ); - bufferlist bl; - ::_encode(exported_client_map, bl); - bl.claim_append(export_data); - export_data.claim(bl); - - // send the export data! - MExportDir *req = new MExportDir(dir->dirfrag()); - req->take_dirstate(export_data); - - // add bounds to message - set bounds; - cache->get_subtree_bounds(dir, bounds); - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) - req->add_export((*p)->dirfrag()); - - // send - mds->send_message_mds(req, dest, MDS_PORT_MIGRATOR); - - // stats - if (mds->logger) mds->logger->inc("ex"); - if (mds->logger) mds->logger->inc("iex", num_exported_inodes); - - cache->show_subtrees(); -} - - -/** encode_export_inode - * update our local state for this inode to export. - * encode relevant state to be sent over the wire. - * used by: encode_export_dir, file_rename (if foreign) - * - * FIXME: the separation between CInode.encode_export and these methods - * is pretty arbitrary and dumb. 
- */ -void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, - map& exported_client_map) -{ - dout(7) << "encode_export_inode " << *in << dendl; - assert(!in->is_replica(mds->get_nodeid())); - - ::_encode_simple(in->inode.ino, enc_state); - in->encode_export(enc_state); - - // caps - encode_export_inode_caps(in, enc_state, exported_client_map); -} - -void Migrator::encode_export_inode_caps(CInode *in, bufferlist& bl, - map& exported_client_map) -{ - // encode caps - map cap_map; - in->export_client_caps(cap_map); - ::_encode_simple(cap_map, bl); - - in->state_set(CInode::STATE_EXPORTINGCAPS); - - // make note of clients named by exported capabilities - for (map::iterator it = in->client_caps.begin(); - it != in->client_caps.end(); - it++) - exported_client_map[it->first] = mds->clientmap.get_inst(it->first); -} - -void Migrator::finish_export_inode_caps(CInode *in) -{ - in->state_clear(CInode::STATE_EXPORTINGCAPS); - - // tell (all) clients about migrating caps.. - for (map::iterator it = in->client_caps.begin(); - it != in->client_caps.end(); - it++) { - dout(7) << "finish_export_inode telling client" << it->first - << " exported caps on " << *in << dendl; - MClientFileCaps *m = new MClientFileCaps(MClientFileCaps::OP_EXPORT, - in->inode, - it->second.get_last_seq(), - it->second.pending(), - it->second.wanted()); - mds->send_message_client(m, it->first); - } - in->clear_client_caps(); -} - -void Migrator::finish_export_inode(CInode *in, utime_t now, list& finished) -{ - dout(12) << "finish_export_inode " << *in << dendl; - - in->finish_export(now); - - finish_export_inode_caps(in); - - // relax locks? 
- if (!in->is_replicated()) - in->replicate_relax_locks(); - - // clean - if (in->is_dirty()) in->mark_clean(); - - // clear/unpin cached_by (we're no longer the authority) - in->clear_replica_map(); - - // twiddle lock states for auth -> replica transition - in->authlock.export_twiddle(); - in->linklock.export_twiddle(); - in->dirfragtreelock.export_twiddle(); - in->filelock.export_twiddle(); - in->dirlock.export_twiddle(); - - // mark auth - assert(in->is_auth()); - in->state_clear(CInode::STATE_AUTH); - in->replica_nonce = CInode::EXPORT_NONCE; - - // waiters - in->take_waiting(CInode::WAIT_ANY, finished); - - // *** other state too? - - // move to end of LRU so we drop out of cache quickly! - if (in->get_parent_dn()) - cache->lru.lru_bottouch(in->get_parent_dn()); - -} - -int Migrator::encode_export_dir(bufferlist& exportbl, - CDir *dir, - map& exported_client_map, - utime_t now) -{ - int num_exported = 0; - - dout(7) << "encode_export_dir " << *dir << " " << dir->nitems << " items" << dendl; - - assert(dir->get_projected_version() == dir->get_version()); - - // dir - dirfrag_t df = dir->dirfrag(); - ::_encode_simple(df, exportbl); - dir->encode_export(exportbl); - - long nden = dir->items.size(); - ::_encode_simple(nden, exportbl); - - // dentries - list subdirs; - CDir::map_t::iterator it; - for (it = dir->begin(); it != dir->end(); it++) { - CDentry *dn = it->second; - CInode *in = dn->get_inode(); - - num_exported++; - - // -- dentry - dout(7) << "encode_export_dir exporting " << *dn << dendl; - - // dn name - ::_encode(it->first, exportbl); - - // state - dn->encode_export(exportbl); - - // points to... - - // null dentry? 
- if (dn->is_null()) { - exportbl.append("N", 1); // null dentry - continue; - } - - if (dn->is_remote()) { - // remote link - exportbl.append("L", 1); // remote link - - inodeno_t ino = dn->get_remote_ino(); - unsigned char d_type = dn->get_remote_d_type(); - ::_encode(ino, exportbl); - ::_encode(d_type, exportbl); - continue; - } - - // primary link - // -- inode - exportbl.append("I", 1); // inode dentry - - encode_export_inode(in, exportbl, exported_client_map); // encode, and (update state for) export - - // directory? - list dfs; - in->get_dirfrags(dfs); - for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) { - CDir *dir = *p; - if (!dir->state_test(CDir::STATE_EXPORTBOUND)) { - // include nested dirfrag - assert(dir->get_dir_auth().first == CDIR_AUTH_PARENT); - subdirs.push_back(dir); // it's ours, recurse (later) - } - } - } - - // subdirs - for (list::iterator it = subdirs.begin(); it != subdirs.end(); it++) - num_exported += encode_export_dir(exportbl, *it, exported_client_map, now); - - return num_exported; -} - -void Migrator::finish_export_dir(CDir *dir, list& finished, utime_t now) -{ - dout(10) << "finish_export_dir " << *dir << dendl; - - // release open_by - dir->clear_replica_map(); - - // mark - assert(dir->is_auth()); - dir->state_clear(CDir::STATE_AUTH); - dir->replica_nonce = CDir::NONCE_EXPORT; - - if (dir->is_dirty()) - dir->mark_clean(); - - // discard most dir state - dir->state &= CDir::MASK_STATE_EXPORT_KEPT; // i only retain a few things. - - // suck up all waiters - dir->take_waiting(CDir::WAIT_ANY, finished); // all dir waiters - - // pop - dir->finish_export(now); - - // dentries - list subdirs; - CDir::map_t::iterator it; - for (it = dir->begin(); it != dir->end(); it++) { - CDentry *dn = it->second; - CInode *in = dn->get_inode(); - - // dentry - dn->finish_export(); - - // inode? - if (dn->is_primary()) { - finish_export_inode(in, now, finished); - - // subdirs? 
- in->get_nested_dirfrags(subdirs); - } - } - - // subdirs - for (list::iterator it = subdirs.begin(); it != subdirs.end(); it++) - finish_export_dir(*it, finished, now); -} - -class C_MDS_ExportFinishLogged : public Context { - Migrator *migrator; - CDir *dir; -public: - C_MDS_ExportFinishLogged(Migrator *m, CDir *d) : migrator(m), dir(d) {} - void finish(int r) { - migrator->export_logged_finish(dir); - } -}; - - -/* - * i should get an export_ack from the export target. - */ -void Migrator::handle_export_ack(MExportDirAck *m) -{ - CDir *dir = cache->get_dirfrag(m->get_dirfrag()); - assert(dir); - assert(dir->is_frozen_tree_root()); // i'm exporting! - - // yay! - dout(7) << "handle_export_ack " << *dir << dendl; - - export_warning_ack_waiting.erase(dir); - - export_state[dir] = EXPORT_LOGGINGFINISH; - - set bounds; - cache->get_subtree_bounds(dir, bounds); - - // log completion. - // include export bounds, to ensure they're in the journal. - EExport *le = new EExport(mds->mdlog, dir); - le->metablob.add_dir_context(dir); - le->metablob.add_dir( dir, false ); - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *bound = *p; - le->get_bounds().insert(bound->dirfrag()); - le->metablob.add_dir_context(bound); - le->metablob.add_dir(bound, false); - } - - // log export completion, then finish (unfreeze, trigger finish context, etc.) - mds->mdlog->submit_entry(le, - new C_MDS_ExportFinishLogged(this, dir)); - - delete m; -} - - - - - -/* - * this happens if hte dest failes after i send teh export data but before it is acked - * that is, we don't know they safely received and logged it, so we reverse our changes - * and go on. - */ -void Migrator::export_reverse(CDir *dir) -{ - dout(7) << "export_reverse " << *dir << dendl; - - assert(export_state[dir] == EXPORT_EXPORTING); - - set bounds; - cache->get_subtree_bounds(dir, bounds); - - // adjust auth, with possible subtree merge. 
- cache->adjust_subtree_auth(dir, mds->get_nodeid()); - cache->try_subtree_merge(dir); - - // remove exporting pins - list rq; - rq.push_back(dir); - while (!rq.empty()) { - CDir *dir = rq.front(); - rq.pop_front(); - dir->abort_export(); - for (CDir::map_t::iterator p = dir->items.begin(); p != dir->items.end(); ++p) { - p->second->abort_export(); - if (!p->second->is_primary()) continue; - CInode *in = p->second->get_inode(); - in->abort_export(); - if (in->is_dir()) - in->get_nested_dirfrags(rq); - } - } - - // unpin bounds - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *bd = *p; - bd->put(CDir::PIN_EXPORTBOUND); - bd->state_clear(CDir::STATE_EXPORTBOUND); - } - - // process delayed expires - cache->process_delayed_expire(dir); - - // some clean up - export_warning_ack_waiting.erase(dir); - export_notify_ack_waiting.erase(dir); - - // unfreeze - dir->unfreeze_tree(); - - cache->show_cache(); -} - - -/* - * once i get the ack, and logged the EExportFinish(true), - * send notifies (if any), otherwise go straight to finish. - * - */ -void Migrator::export_logged_finish(CDir *dir) -{ - dout(7) << "export_logged_finish " << *dir << dendl; - - // send notifies - int dest = export_peer[dir]; - - set bounds; - cache->get_subtree_bounds(dir, bounds); - - for (set::iterator p = export_notify_ack_waiting[dir].begin(); - p != export_notify_ack_waiting[dir].end(); - ++p) { - MExportDirNotify *notify; - if (mds->mdsmap->is_active_or_stopping(export_peer[dir])) - // dest is still alive. - notify = new MExportDirNotify(dir->dirfrag(), true, - pair(mds->get_nodeid(), dest), - pair(dest, CDIR_AUTH_UNKNOWN)); - else - // dest is dead. 
bystanders will think i am only auth, as per mdcache->handle_mds_failure() - notify = new MExportDirNotify(dir->dirfrag(), true, - pair(mds->get_nodeid(), CDIR_AUTH_UNKNOWN), - pair(dest, CDIR_AUTH_UNKNOWN)); - - notify->copy_bounds(bounds); - - mds->send_message_mds(notify, *p, MDS_PORT_MIGRATOR); - } - - // wait for notifyacks - export_state[dir] = EXPORT_NOTIFYING; - - // no notifies to wait for? - if (export_notify_ack_waiting[dir].empty()) - export_finish(dir); // skip notify/notify_ack stage. -} - -/* - * warning: - * i'll get an ack from each bystander. - * when i get them all, do the export. - * notify: - * i'll get an ack from each bystander. - * when i get them all, unfreeze and send the finish. - */ -void Migrator::handle_export_notify_ack(MExportDirNotifyAck *m) -{ - CDir *dir = cache->get_dirfrag(m->get_dirfrag()); - assert(dir); - int from = m->get_source().num(); - - if (export_state.count(dir) && export_state[dir] == EXPORT_WARNING) { - // exporting. process warning. - dout(7) << "handle_export_notify_ack from " << m->get_source() - << ": exporting, processing warning on " - << *dir << dendl; - assert(export_warning_ack_waiting.count(dir)); - export_warning_ack_waiting[dir].erase(from); - - if (export_warning_ack_waiting[dir].empty()) - export_go(dir); // start export. - } - else if (export_state.count(dir) && export_state[dir] == EXPORT_NOTIFYING) { - // exporting. process notify. 
- dout(7) << "handle_export_notify_ack from " << m->get_source() - << ": exporting, processing notify on " - << *dir << dendl; - assert(export_notify_ack_waiting.count(dir)); - export_notify_ack_waiting[dir].erase(from); - - if (export_notify_ack_waiting[dir].empty()) - export_finish(dir); - } - else if (import_state.count(dir->dirfrag()) && import_state[dir->dirfrag()] == IMPORT_ABORTING) { - // reversing import - dout(7) << "handle_export_notify_ack from " << m->get_source() - << ": aborting import on " - << *dir << dendl; - assert(import_bystanders[dir].count(from)); - import_bystanders[dir].erase(from); - if (import_bystanders[dir].empty()) { - import_bystanders.erase(dir); - import_reverse_unfreeze(dir); - } - } - - delete m; -} - - -void Migrator::export_finish(CDir *dir) -{ - dout(5) << "export_finish " << *dir << dendl; - - if (export_state.count(dir) == 0) { - dout(7) << "target must have failed, not sending final commit message. export succeeded anyway." << dendl; - return; - } - - // send finish/commit to new auth - if (mds->mdsmap->is_active_or_stopping(export_peer[dir])) { - mds->send_message_mds(new MExportDirFinish(dir->dirfrag()), - export_peer[dir], MDS_PORT_MIGRATOR); - } else { - dout(7) << "not sending MExportDirFinish, dest has failed" << dendl; - } - - // finish export (adjust local cache state) - C_Contexts *fin = new C_Contexts; - finish_export_dir(dir, fin->contexts, g_clock.now()); - dir->add_waiter(CDir::WAIT_UNFREEZE, fin); - - // unfreeze - dout(7) << "export_finish unfreezing" << dendl; - dir->unfreeze_tree(); - - // unpin bounds - set bounds; - cache->get_subtree_bounds(dir, bounds); - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *bd = *p; - bd->put(CDir::PIN_EXPORTBOUND); - bd->state_clear(CDir::STATE_EXPORTBOUND); - } - - // adjust auth, with possible subtree merge. 
- // (we do this _after_ removing EXPORTBOUND pins, to allow merges) - cache->adjust_subtree_auth(dir, export_peer[dir]); - cache->try_subtree_merge(dir); - - // unpin path - dout(7) << "export_finish unpinning path" << dendl; - vector trace; - cache->make_trace(trace, dir->inode); - mds->locker->dentry_anon_rdlock_trace_finish(trace); - - // discard delayed expires - cache->discard_delayed_expire(dir); - - // remove from exporting list, clean up state - dir->state_clear(CDir::STATE_EXPORTING); - export_state.erase(dir); - export_peer.erase(dir); - export_notify_ack_waiting.erase(dir); - - // queue finishers - mds->queue_waiters(export_finish_waiters[dir]); - export_finish_waiters.erase(dir); - - cache->show_subtrees(); - audit(); - - // send pending import_maps? - mds->mdcache->maybe_send_pending_resolves(); - - maybe_do_queued_export(); -} - - - - - - - - -// ========================================================== -// IMPORT - -void Migrator::handle_export_discover(MExportDirDiscover *m) -{ - assert(m->get_source().num() != mds->get_nodeid()); - - dout(7) << "handle_export_discover on " << m->get_path() << dendl; - - // note import state - dirfrag_t df = m->get_dirfrag(); - - // only start discovering on this message once. - if (!m->started) { - m->started = true; - import_state[df] = IMPORT_DISCOVERING; - import_peer[df] = m->get_source().num(); - } - - // am i retrying after ancient path_traverse results? - if (import_state.count(df) == 0 && - import_state[df] != IMPORT_DISCOVERING) { - dout(7) << "hmm import_state is off, i must be obsolete lookup" << dendl; - delete m; - return; - } - - // do we have it? - CInode *in = cache->get_inode(m->get_dirfrag().ino); - if (!in) { - // must discover it! 
- filepath fpath(m->get_path()); - vector trace; - int r = cache->path_traverse(0, m, fpath, trace, true, MDS_TRAVERSE_DISCOVER); - if (r > 0) return; // wait - if (r < 0) { - dout(7) << "handle_export_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << dendl; - assert(0); // this shouldn't happen if the auth pins his path properly!!!! - } - - assert(0); // this shouldn't happen; the get_inode above would have succeeded. - } - - // yay - dout(7) << "handle_export_discover have " << df << " inode " << *in << dendl; - - import_state[m->get_dirfrag()] = IMPORT_DISCOVERED; - - // pin inode in the cache (for now) - assert(in->is_dir()); - in->get(CInode::PIN_IMPORTING); - - // reply - dout(7) << " sending export_discover_ack on " << *in << dendl; - mds->send_message_mds(new MExportDirDiscoverAck(df), - import_peer[df], MDS_PORT_MIGRATOR); -} - -void Migrator::handle_export_cancel(MExportDirCancel *m) -{ - dout(7) << "handle_export_cancel on " << m->get_dirfrag() << dendl; - - if (import_state[m->get_dirfrag()] == IMPORT_DISCOVERED) { - CInode *in = cache->get_inode(m->get_dirfrag().ino); - assert(in); - in->put(CInode::PIN_IMPORTING); - } else { - assert(import_state[m->get_dirfrag()] == IMPORT_DISCOVERING); - } - - import_state.erase(m->get_dirfrag()); - import_peer.erase(m->get_dirfrag()); - - delete m; -} - - -void Migrator::handle_export_prep(MExportDirPrep *m) -{ - int oldauth = m->get_source().num(); - assert(oldauth != mds->get_nodeid()); - - // make sure we didn't abort - if (import_state.count(m->get_dirfrag()) == 0 || - (import_state[m->get_dirfrag()] != IMPORT_DISCOVERED && - import_state[m->get_dirfrag()] != IMPORT_PREPPING) || - import_peer[m->get_dirfrag()] != oldauth) { - dout(10) << "handle_export_prep import has aborted, dropping" << dendl; - delete m; - return; - } - - CInode *diri = cache->get_inode(m->get_dirfrag().ino); - assert(diri); - - list finished; - - // assimilate root dir. 
- CDir *dir; - - if (!m->did_assim()) { - dir = cache->add_replica_dir(diri, - m->get_dirfrag().frag, *m->get_dirfrag_discover(m->get_dirfrag()), - oldauth, finished); - dout(7) << "handle_export_prep on " << *dir << " (first pass)" << dendl; - } else { - dir = cache->get_dirfrag(m->get_dirfrag()); - assert(dir); - dout(7) << "handle_export_prep on " << *dir << " (subsequent pass)" << dendl; - } - assert(dir->is_auth() == false); - - cache->show_subtrees(); - - // build import bound map - map import_bound_fragset; - for (list::iterator p = m->get_bounds().begin(); - p != m->get_bounds().end(); - ++p) { - dout(10) << " bound " << *p << dendl; - import_bound_fragset[p->ino].insert(p->frag); - } - - // assimilate contents? - if (!m->did_assim()) { - dout(7) << "doing assim on " << *dir << dendl; - m->mark_assim(); // only do this the first time! - - // move pin to dir - diri->put(CInode::PIN_IMPORTING); - dir->get(CDir::PIN_IMPORTING); - dir->state_set(CDir::STATE_IMPORTING); - - // change import state - import_state[dir->dirfrag()] = IMPORT_PREPPING; - import_bound_ls[dir] = m->get_bounds(); - - // bystander list - import_bystanders[dir] = m->get_bystanders(); - dout(7) << "bystanders are " << import_bystanders[dir] << dendl; - - // assimilate traces to exports - for (list::iterator it = m->get_inodes().begin(); - it != m->get_inodes().end(); - it++) { - // inode - CInode *in = cache->get_inode( (*it)->get_ino() ); - if (in) { - (*it)->update_inode(in); - dout(7) << " updated " << *in << dendl; - } else { - in = new CInode(mds->mdcache, false); - (*it)->update_inode(in); - - // link to the containing dir - CDir *condir = cache->get_dirfrag( m->get_containing_dirfrag(in->ino()) ); - assert(condir); - cache->add_inode( in ); - condir->add_primary_dentry( m->get_dentry(in->ino()), in ); - - dout(7) << " added " << *in << dendl; - } - - assert( in->get_parent_dir()->dirfrag() == m->get_containing_dirfrag(in->ino()) ); - - // dirs - for (list::iterator pf = 
m->get_inode_dirfrags(in->ino()).begin(); - pf != m->get_inode_dirfrags(in->ino()).end(); - ++pf) { - // add/update - cache->add_replica_dir(in, *pf, *m->get_dirfrag_discover(dirfrag_t(in->ino(), *pf)), - oldauth, finished); - } - } - - // make bound sticky - for (map::iterator p = import_bound_fragset.begin(); - p != import_bound_fragset.end(); - ++p) { - CInode *in = cache->get_inode(p->first); - assert(in); - in->get_stickydirs(); - dout(7) << " set stickydirs on bound inode " << *in << dendl; - } - - } else { - dout(7) << " not doing assim on " << *dir << dendl; - } - - if (!finished.empty()) - mds->queue_waiters(finished); - - - // open all bounds - set import_bounds; - for (map::iterator p = import_bound_fragset.begin(); - p != import_bound_fragset.end(); - ++p) { - CInode *in = cache->get_inode(p->first); - assert(in); - - // map fragset into a frag_t list, based on the inode fragtree - list fglist; - for (set::iterator q = p->second.begin(); q != p->second.end(); ++q) - in->dirfragtree.get_leaves_under(*q, fglist); - dout(10) << " bound inode " << p->first << " fragset " << p->second << " maps to " << fglist << dendl; - - for (list::iterator q = fglist.begin(); - q != fglist.end(); - ++q) { - CDir *bound = cache->get_dirfrag(dirfrag_t(p->first, *q)); - if (!bound) { - dout(7) << " opening bounding dirfrag " << *q << " on " << *in << dendl; - cache->open_remote_dirfrag(in, *q, - new C_MDS_RetryMessage(mds, m)); - return; - } - - if (!bound->state_test(CDir::STATE_IMPORTBOUND)) { - dout(7) << " pinning import bound " << *bound << dendl; - bound->get(CDir::PIN_IMPORTBOUND); - bound->state_set(CDir::STATE_IMPORTBOUND); - } else { - dout(7) << " already pinned import bound " << *bound << dendl; - } - import_bounds.insert(bound); - } - } - - dout(7) << " all ready, noting auth and freezing import region" << dendl; - - // note that i am an ambiguous auth for this subtree. - // specify bounds, since the exporter explicitly defines the region. 
- cache->adjust_bounded_subtree_auth(dir, import_bounds, - pair(oldauth, mds->get_nodeid())); - cache->verify_subtree_bounds(dir, import_bounds); - - // freeze. - dir->_freeze_tree(); - - // ok! - dout(7) << " sending export_prep_ack on " << *dir << dendl; - mds->send_message_mds(new MExportDirPrepAck(dir->dirfrag()), - m->get_source().num(), MDS_PORT_MIGRATOR); - - // note new state - import_state[dir->dirfrag()] = IMPORT_PREPPED; - - // done - delete m; - -} - - - - -class C_MDS_ImportDirLoggedStart : public Context { - Migrator *migrator; - CDir *dir; - int from; -public: - map imported_client_map; - - C_MDS_ImportDirLoggedStart(Migrator *m, CDir *d, int f) : - migrator(m), dir(d), from(f) { - } - void finish(int r) { - migrator->import_logged_start(dir, from, imported_client_map); - } -}; - -void Migrator::handle_export_dir(MExportDir *m) -{ - CDir *dir = cache->get_dirfrag(m->get_dirfrag()); - assert(dir); - - int oldauth = m->get_source().num(); - dout(7) << "handle_export_dir importing " << *dir << " from " << oldauth << dendl; - assert(dir->is_auth() == false); - - cache->show_subtrees(); - - C_MDS_ImportDirLoggedStart *onlogged = new C_MDS_ImportDirLoggedStart(this, dir, m->get_source().num()); - - // start the journal entry - EImportStart *le = new EImportStart(dir->dirfrag(), m->get_bounds()); - le->metablob.add_dir_context(dir); - - // adjust auth (list us _first_) - cache->adjust_subtree_auth(dir, mds->get_nodeid(), oldauth); - - // add this crap to my cache - bufferlist::iterator blp = m->get_dirstate().begin(); - - // new client sessions, open these after we journal - ::_decode_simple(onlogged->imported_client_map, blp); - mds->server->prepare_force_open_sessions(onlogged->imported_client_map); - - int num_imported_inodes = 0; - while (!blp.end()) { - num_imported_inodes += - decode_import_dir(blp, - oldauth, - dir, // import root - le, - mds->mdlog->get_current_segment(), - import_caps[dir], - import_updated_scatterlocks[dir]); - } - dout(10) << " " 
<< m->get_bounds().size() << " imported bounds" << dendl; - - // include imported sessions in EImportStart - le->client_map.claim(m->get_dirstate()); - - // include bounds in EImportStart - set import_bounds; - cache->get_subtree_bounds(dir, import_bounds); - for (set::iterator it = import_bounds.begin(); - it != import_bounds.end(); - it++) - le->metablob.add_dir(*it, false); // note that parent metadata is already in the event - - // adjust popularity - mds->balancer->add_import(dir); - - dout(7) << "handle_export_dir did " << *dir << dendl; - - // log it - mds->mdlog->submit_entry(le, onlogged); - - // note state - import_state[dir->dirfrag()] = IMPORT_LOGGINGSTART; - - // some stats - if (mds->logger) { - mds->logger->inc("im"); - mds->logger->inc("iim", num_imported_inodes); - } - - delete m; -} - - -/* - * this is an import helper - * called by import_finish, and import_reverse and friends. - */ -void Migrator::import_remove_pins(CDir *dir, set& bounds) -{ - // root - dir->put(CDir::PIN_IMPORTING); - dir->state_clear(CDir::STATE_IMPORTING); - - // bounds - set didinodes; - for (set::iterator it = bounds.begin(); - it != bounds.end(); - it++) { - CDir *bd = *it; - bd->put(CDir::PIN_IMPORTBOUND); - bd->state_clear(CDir::STATE_IMPORTBOUND); - CInode *bdi = bd->get_inode(); - if (didinodes.count(bdi) == 0) { - bdi->put_stickydirs(); - didinodes.insert(bdi); - } - } -} - - -/* - * note: this does teh full work of reversing and import and cleaning up - * state. - * called by both handle_mds_failure and by handle_resolve (if we are - * a survivor coping with an exporter failure+recovery). - */ -void Migrator::import_reverse(CDir *dir) -{ - dout(7) << "import_reverse " << *dir << dendl; - - set bounds; - cache->get_subtree_bounds(dir, bounds); - - // remove pins - import_remove_pins(dir, bounds); - - // update auth, with possible subtree merge. 
- assert(dir->is_subtree_root()); - cache->adjust_subtree_auth(dir, import_peer[dir->dirfrag()]); - cache->try_subtree_merge(dir); - - // adjust auth bits. - list q; - q.push_back(dir); - while (!q.empty()) { - CDir *cur = q.front(); - q.pop_front(); - - // dir - assert(cur->is_auth()); - cur->state_clear(CDir::STATE_AUTH); - cur->clear_replica_map(); - if (cur->is_dirty()) - cur->mark_clean(); - - CDir::map_t::iterator it; - for (it = cur->begin(); it != cur->end(); it++) { - CDentry *dn = it->second; - - // dentry - dn->state_clear(CDentry::STATE_AUTH); - dn->clear_replica_map(); - if (dn->is_dirty()) - dn->mark_clean(); - - // inode? - if (dn->is_primary()) { - CInode *in = dn->get_inode(); - in->state_clear(CDentry::STATE_AUTH); - in->clear_replica_map(); - if (in->is_dirty()) - in->mark_clean(); - in->authlock.clear_gather(); - in->linklock.clear_gather(); - in->dirfragtreelock.clear_gather(); - in->filelock.clear_gather(); - - // non-bounding dir? - list dfs; - in->get_dirfrags(dfs); - for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) - if (bounds.count(*p) == 0) - q.push_back(*p); - } - } - } - - // reexport caps - for (map >::iterator p = import_caps[dir].begin(); - p != import_caps[dir].end(); - ++p) { - CInode *in = p->first; - /* - * bleh.. just export all caps for this inode. the auth mds - * will pick them up during recovery. - */ - map cap_map; // throw this away - in->export_client_caps(cap_map); - finish_export_inode_caps(in); - } - - // log our failure - mds->mdlog->submit_entry(new EImportFinish(dir, false)); // log failure - - // bystanders? 
- if (import_bystanders[dir].empty()) { - dout(7) << "no bystanders, finishing reverse now" << dendl; - import_reverse_unfreeze(dir); - } else { - // notify them; wait in aborting state - dout(7) << "notifying bystanders of abort" << dendl; - import_notify_abort(dir, bounds); - import_state[dir->dirfrag()] = IMPORT_ABORTING; - } -} - -void Migrator::import_notify_abort(CDir *dir, set& bounds) -{ - dout(7) << "import_notify_abort " << *dir << dendl; - - for (set::iterator p = import_bystanders[dir].begin(); - p != import_bystanders[dir].end(); - ++p) { - // NOTE: the bystander will think i am _only_ auth, because they will have seen - // the exporter's failure and updated the subtree auth. see mdcache->handle_mds_failure(). - MExportDirNotify *notify = - new MExportDirNotify(dir->dirfrag(), true, - pair(mds->get_nodeid(), CDIR_AUTH_UNKNOWN), - pair(import_peer[dir->dirfrag()], CDIR_AUTH_UNKNOWN)); - notify->copy_bounds(bounds); - mds->send_message_mds(notify, *p, MDS_PORT_MIGRATOR); - } -} - -void Migrator::import_reverse_unfreeze(CDir *dir) -{ - dout(7) << "import_reverse_unfreeze " << *dir << dendl; - dir->unfreeze_tree(); - cache->discard_delayed_expire(dir); - import_reverse_final(dir); -} - -void Migrator::import_reverse_final(CDir *dir) -{ - dout(7) << "import_reverse_final " << *dir << dendl; - - // clean up - import_state.erase(dir->dirfrag()); - import_peer.erase(dir->dirfrag()); - import_bystanders.erase(dir); - import_bound_ls.erase(dir); - import_updated_scatterlocks.erase(dir); - import_caps.erase(dir); - - // send pending import_maps? 
- mds->mdcache->maybe_send_pending_resolves(); - - cache->show_subtrees(); - //audit(); // this fails, bc we munge up the subtree map during handle_import_map (resolve phase) -} - - - - -void Migrator::import_logged_start(CDir *dir, int from, - map& imported_client_map) -{ - dout(7) << "import_logged " << *dir << dendl; - - // note state - import_state[dir->dirfrag()] = IMPORT_ACKING; - - // force open client sessions and finish cap import - mds->server->finish_force_open_sessions(imported_client_map); - - for (map >::iterator p = import_caps[dir].begin(); - p != import_caps[dir].end(); - ++p) { - finish_import_inode_caps(p->first, from, p->second); - } - - // send notify's etc. - dout(7) << "sending ack for " << *dir << " to old auth mds" << from << dendl; - mds->send_message_mds(new MExportDirAck(dir->dirfrag()), - from, MDS_PORT_MIGRATOR); - - cache->show_subtrees(); -} - - -void Migrator::handle_export_finish(MExportDirFinish *m) -{ - CDir *dir = cache->get_dirfrag(m->get_dirfrag()); - assert(dir); - dout(7) << "handle_export_finish on " << *dir << dendl; - import_finish(dir); - delete m; -} - -void Migrator::import_finish(CDir *dir) -{ - dout(7) << "import_finish on " << *dir << dendl; - - // log finish - mds->mdlog->submit_entry(new EImportFinish(dir, true)); - - // clear updated scatterlocks - for (list::iterator p = import_updated_scatterlocks[dir].begin(); - p != import_updated_scatterlocks[dir].end(); - ++p) - (*p)->clear_updated(); - - // remove pins - set bounds; - cache->get_subtree_bounds(dir, bounds); - import_remove_pins(dir, bounds); - - // adjust auth, with possible subtree merge. - cache->adjust_subtree_auth(dir, mds->get_nodeid()); - cache->try_subtree_merge(dir); - - // clear import state (we're done!) 
- import_state.erase(dir->dirfrag()); - import_peer.erase(dir->dirfrag()); - import_bystanders.erase(dir); - import_bound_ls.erase(dir); - import_caps.erase(dir); - import_updated_scatterlocks.erase(dir); - - // process delayed expires - cache->process_delayed_expire(dir); - - // ok now unfreeze (and thus kick waiters) - dir->unfreeze_tree(); - - cache->show_subtrees(); - //audit(); // this fails, bc we munge up the subtree map during handle_import_map (resolve phase) - - // send pending import_maps? - mds->mdcache->maybe_send_pending_resolves(); - - // is it empty? - if (dir->get_size() == 0 && - !dir->inode->is_auth()) { - // reexport! - export_empty_import(dir); - } -} - - -void Migrator::decode_import_inode(CDentry *dn, bufferlist::iterator& blp, int oldauth, - LogSegment *ls, - map >& cap_imports, - list& updated_scatterlocks) -{ - dout(15) << "decode_import_inode on " << *dn << dendl; - - inodeno_t ino; - ::_decode_simple(ino, blp); - - bool added = false; - CInode *in = cache->get_inode(ino); - if (!in) { - in = new CInode(mds->mdcache); - added = true; - } else { - in->state_set(CInode::STATE_AUTH); - } - - // state after link -- or not! -sage - in->decode_import(blp, ls); // cap imports are noted for later action - - // caps - decode_import_inode_caps(in, blp, cap_imports); - - // link before state -- or not! -sage - if (dn->inode != in) { - assert(!dn->inode); - dn->dir->link_primary_inode(dn, in); - } - - // add inode? - if (added) { - cache->add_inode(in); - dout(10) << "added " << *in << dendl; - } else { - dout(10) << " had " << *in << dendl; - } - - // clear if dirtyscattered, since we're going to journal this - // but not until we _actually_ finish the import... - if (in->dirlock.is_updated()) - updated_scatterlocks.push_back(&in->dirlock); - - // put in autoscatter list? - // this is conservative, but safe. 
- if (in->dirlock.get_state() == LOCK_SCATTER) - mds->locker->note_autoscattered(&in->dirlock); - - // adjust replica list - //assert(!in->is_replica(oldauth)); // not true on failed export - in->add_replica(oldauth, CInode::EXPORT_NONCE); - if (in->is_replica(mds->get_nodeid())) - in->remove_replica(mds->get_nodeid()); - -} - -void Migrator::decode_import_inode_caps(CInode *in, - bufferlist::iterator &blp, - map >& cap_imports) -{ - map cap_map; - ::_decode_simple(cap_map, blp); - if (!cap_map.empty()) { - cap_imports[in].swap(cap_map); - in->get(CInode::PIN_IMPORTINGCAPS); - } -} - -void Migrator::finish_import_inode_caps(CInode *in, int from, - map &cap_map) -{ - assert(!cap_map.empty()); - - set new_caps; - in->merge_client_caps(cap_map, new_caps); - in->put(CInode::PIN_IMPORTINGCAPS); - - for (set::iterator it = new_caps.begin(); - it != new_caps.end(); - it++) { - dout(0) << "finish_import_inode_caps for client" << *it << " on " << *in << dendl; - MClientFileCaps *caps = new MClientFileCaps(MClientFileCaps::OP_IMPORT, - in->inode, - in->client_caps[*it].get_last_seq(), - in->client_caps[*it].pending(), - in->client_caps[*it].wanted()); - caps->set_mds(from); // from whom? 
- mds->send_message_client(caps, *it); - } -} - -int Migrator::decode_import_dir(bufferlist::iterator& blp, - int oldauth, - CDir *import_root, - EImportStart *le, - LogSegment *ls, - map >& cap_imports, - list& updated_scatterlocks) -{ - // set up dir - dirfrag_t df; - ::_decode_simple(df, blp); - - CInode *diri = cache->get_inode(df.ino); - assert(diri); - CDir *dir = diri->get_or_open_dirfrag(mds->mdcache, df.frag); - assert(dir); - - dout(7) << "decode_import_dir " << *dir << dendl; - - // assimilate state - dir->decode_import(blp); - - // mark (may already be marked from get_or_open_dir() above) - if (!dir->is_auth()) - dir->state_set(CDir::STATE_AUTH); - - // adjust replica list - //assert(!dir->is_replica(oldauth)); // not true on failed export - dir->add_replica(oldauth); - if (dir->is_replica(mds->get_nodeid())) - dir->remove_replica(mds->get_nodeid()); - - // add to journal entry - if (le) - le->metablob.add_dir(dir, - true, // Hmm: dirty=false would be okay in some cases - dir->is_complete()); - - int num_imported = 0; - - // take all waiters on this dir - // NOTE: a pass of imported data is guaranteed to get all of my waiters because - // a replica's presense in my cache implies/forces it's presense in authority's. 
- list waiters; - - dir->take_waiting(CDir::WAIT_ANY, waiters); - for (list::iterator it = waiters.begin(); - it != waiters.end(); - it++) - import_root->add_waiter(CDir::WAIT_UNFREEZE, *it); // UNFREEZE will get kicked both on success or failure - - dout(15) << "doing contents" << dendl; - - // contents - long nden; - ::_decode_simple(nden, blp); - - for (; nden>0; nden--) { - num_imported++; - - // dentry - string dname; - ::_decode_simple(dname, blp); - - CDentry *dn = dir->lookup(dname); - if (!dn) - dn = dir->add_null_dentry(dname); - - dn->decode_import(blp, ls); - - dn->add_replica(oldauth, CDentry::EXPORT_NONCE); - if (dn->is_replica(mds->get_nodeid())) - dn->remove_replica(mds->get_nodeid()); - - dout(15) << "decode_import_dir got " << *dn << dendl; - - // points to... - char icode; - ::_decode_simple(icode, blp); - - if (icode == 'N') { - // null dentry - assert(dn->is_null()); - - // fall thru - } - else if (icode == 'L') { - // remote link - inodeno_t ino; - unsigned char d_type; - ::_decode_simple(ino, blp); - ::_decode_simple(d_type, blp); - if (dn->is_remote()) { - assert(dn->get_remote_ino() == ino); - } else { - dir->link_remote_inode(dn, ino, d_type); - } - } - else if (icode == 'I') { - // inode - decode_import_inode(dn, blp, oldauth, ls, cap_imports, updated_scatterlocks); - } - - // add dentry to journal entry - if (le) - le->metablob.add_dentry(dn, dn->is_dirty()); - } - - dout(7) << "decode_import_dir done " << *dir << dendl; - return num_imported; -} - - - - - -// authority bystander - -void Migrator::handle_export_notify(MExportDirNotify *m) -{ - CDir *dir = cache->get_dirfrag(m->get_dirfrag()); - - int from = m->get_source().num(); - pair old_auth = m->get_old_auth(); - pair new_auth = m->get_new_auth(); - - if (!dir) { - dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth - << " on missing dir " << m->get_dirfrag() << dendl; - } else if (dir->authority() != old_auth) { - dout(7) << "handle_export_notify old_auth was " << 
dir->authority() - << " != " << old_auth << " -> " << new_auth - << " on " << *dir << dendl; - } else { - dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth - << " on " << *dir << dendl; - // adjust auth - set have; - cache->map_dirfrag_set(m->get_bounds(), have); - cache->adjust_bounded_subtree_auth(dir, have, new_auth); - - // induce a merge? - cache->try_subtree_merge(dir); - } - - // send ack - if (m->wants_ack()) { - mds->send_message_mds(new MExportDirNotifyAck(m->get_dirfrag()), - from, MDS_PORT_MIGRATOR); - } else { - // aborted. no ack. - dout(7) << "handle_export_notify no ack requested" << dendl; - } - - delete m; -} - - - - - - - - -/** cap exports **/ - - - -void Migrator::export_caps(CInode *in) -{ - int dest = in->authority().first; - dout(7) << "export_caps to mds" << dest << " " << *in << dendl; - - assert(in->is_any_caps()); - assert(!in->is_auth()); - assert(!in->is_ambiguous_auth()); - assert(!in->state_test(CInode::STATE_EXPORTINGCAPS)); - - MExportCaps *ex = new MExportCaps; - ex->ino = in->ino(); - - encode_export_inode_caps(in, ex->cap_bl, ex->client_map); - - mds->send_message_mds(ex, dest, MDS_PORT_MIGRATOR); -} - -void Migrator::handle_export_caps_ack(MExportCapsAck *ack) -{ - CInode *in = cache->get_inode(ack->ino); - assert(in); - dout(10) << "handle_export_caps_ack " << *ack << " from " << ack->get_source() - << " on " << *in - << dendl; - - finish_export_inode_caps(in); - delete ack; -} - - -class C_M_LoggedImportCaps : public Context { - Migrator *migrator; - CInode *in; - int from; -public: - map > cap_imports; - - C_M_LoggedImportCaps(Migrator *m, CInode *i, int f) : migrator(m), in(i), from(f) {} - void finish(int r) { - migrator->logged_import_caps(in, from, cap_imports); - } -}; - -void Migrator::handle_export_caps(MExportCaps *ex) -{ - dout(10) << "handle_export_caps " << *ex << " from " << ex->get_source() << dendl; - CInode *in = cache->get_inode(ex->ino); - - assert(in->is_auth()); - /* - * note: i may be 
frozen, but i won't have been encoded for export (yet)! - * see export_go() vs export_go_synced(). - */ - - C_M_LoggedImportCaps *finish = new C_M_LoggedImportCaps(this, in, ex->get_source().num()); - ESessions *le = new ESessions(mds->clientmap.inc_projected()); - - // decode new caps - bufferlist::iterator blp = ex->cap_bl.begin(); - decode_import_inode_caps(in, blp, finish->cap_imports); - assert(!finish->cap_imports.empty()); // thus, inode is pinned. - - // journal open client sessions - mds->server->prepare_force_open_sessions(ex->client_map); - le->client_map.swap(ex->client_map); - - mds->mdlog->submit_entry(le, finish); - - delete ex; -} - - -void Migrator::logged_import_caps(CInode *in, - int from, - map >& cap_imports) -{ - dout(10) << "logged_import_caps on " << *in << dendl; - assert(cap_imports.count(in)); - finish_import_inode_caps(in, from, cap_imports[in]); - - mds->send_message_mds(new MExportCapsAck(in->ino()), from, MDS_PORT_MIGRATOR); -} - - - - - diff --git a/branches/sage/mds/mds/ScatterLock.h b/branches/sage/mds/mds/ScatterLock.h deleted file mode 100644 index 24a1361f82d68..0000000000000 --- a/branches/sage/mds/mds/ScatterLock.h +++ /dev/null @@ -1,183 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __SCATTERLOCK_H -#define __SCATTERLOCK_H - -#include "SimpleLock.h" - - -// lock state machine states: -// Sync -- Lock -- sCatter -// Tempsync _/ -// auth repl -#define LOCK_SYNC__ // R . R . rdlocks allowed on auth and replicas -#define LOCK_GLOCKS -20 // r . r . 
waiting for replicas+rdlocks (auth), or rdlocks to release (replica) -#define LOCK_GSCATTERS -28 // r . r . - -#define LOCK_GSYNCL__ // . w LOCK on replica. -#define LOCK_LOCK__ // . W . . -#define LOCK_GTEMPSYNCL -21 // . w LOCK on replica. - -#define LOCK_GLOCKC -22 // . wp . wp waiting for replicas+wrlocks (auth), or wrlocks to release (replica) -#define LOCK_SCATTER 23 // . Wp . WP mtime updates on replicas allowed, no reads. stable here. -#define LOCK_GTEMPSYNCC -24 // . wp . wp GLOCKC|LOCK on replica - -#define LOCK_GSCATTERT -25 // r . LOCK on replica. -#define LOCK_GLOCKT -26 // r . LOCK on replica. -#define LOCK_TEMPSYNC 27 // R . LOCK on replica. - - -inline const char *get_scatterlock_state_name(int s) { - switch(s) { - case LOCK_SYNC: return "Sync"; - case LOCK_GLOCKS: return "gLockS"; - case LOCK_GSCATTERS: return "gScatterS"; - - case LOCK_GSYNCL: return "gSyncL"; - case LOCK_LOCK: return "Lock"; - case LOCK_GTEMPSYNCL: return "gTempsyncL"; - - case LOCK_GLOCKC: return "gLockC"; - case LOCK_SCATTER: return "sCatter"; - case LOCK_GTEMPSYNCC: return "gTempsyncC"; - - case LOCK_GSCATTERT: return "gsCatterT"; - case LOCK_GLOCKT: return "gLockT"; - case LOCK_TEMPSYNC: return "Tempsync"; - - default: assert(0); return 0; - } -} - -class ScatterLock : public SimpleLock { - int num_wrlock; - bool updated; - utime_t last_scatter; - -public: - xlist::item xlistitem_autoscattered; - - ScatterLock(MDSCacheObject *o, int t, int wo) : - SimpleLock(o, t, wo), - num_wrlock(0), - updated(false), - xlistitem_autoscattered(this) {} - - int get_replica_state() { - switch (state) { - case LOCK_SYNC: - return LOCK_SYNC; - - case LOCK_GSCATTERS: // hrm. 
- case LOCK_GLOCKS: - case LOCK_GSYNCL: - case LOCK_LOCK: - case LOCK_GTEMPSYNCL: - case LOCK_GLOCKC: - return LOCK_LOCK; - - case LOCK_SCATTER: - return LOCK_SCATTER; - - case LOCK_GTEMPSYNCC: - case LOCK_GSCATTERT: - case LOCK_GLOCKT: - case LOCK_TEMPSYNC: - return LOCK_LOCK; - default: - assert(0); - return 0; - } - } - - void set_updated() { - if (!updated) { - parent->get(MDSCacheObject::PIN_DIRTYSCATTERED); - updated = true; - } - } - void clear_updated() { - if (updated) { - parent->put(MDSCacheObject::PIN_DIRTYSCATTERED); - updated = false; - parent->clear_dirty_scattered(type); - } - } - bool is_updated() { return updated; } - - void set_last_scatter(utime_t t) { last_scatter = t; } - utime_t get_last_scatter() { return last_scatter; } - - void replicate_relax() { - } - - void export_twiddle() { - clear_gather(); - state = get_replica_state(); - } - - // rdlock - bool can_rdlock(MDRequest *mdr) { - return state == LOCK_SYNC || state == LOCK_TEMPSYNC; - } - bool can_rdlock_soon() { - return state == LOCK_GTEMPSYNCC; - } - - // xlock - bool can_xlock_soon() { - if (parent->is_auth()) - return (state == LOCK_GLOCKC || - state == LOCK_GLOCKS); - else - return false; - } - - // wrlock - bool can_wrlock() { - return state == LOCK_SCATTER || state == LOCK_LOCK; - } - void get_wrlock() { - assert(can_wrlock()); - if (num_wrlock == 0) parent->get(MDSCacheObject::PIN_LOCK); - ++num_wrlock; - } - void put_wrlock() { - --num_wrlock; - if (num_wrlock == 0) parent->put(MDSCacheObject::PIN_LOCK); - } - bool is_wrlocked() { return num_wrlock > 0; } - int get_num_wrlocks() { return num_wrlock; } - - void print(ostream& out) { - out << "("; - out << get_lock_type_name(get_type()) << " "; - out << get_scatterlock_state_name(get_state()); - if (!get_gather_set().empty()) out << " g=" << get_gather_set(); - if (is_rdlocked()) - out << " r=" << get_num_rdlocks(); - if (is_xlocked()) - out << " x=" << get_xlocked_by(); - if (is_wrlocked()) - out << " wr=" << get_num_wrlocks(); - 
if (updated) - out << " updated"; - out << ")"; - } - -}; - -#endif diff --git a/branches/sage/mds/mds/Server.cc b/branches/sage/mds/mds/Server.cc deleted file mode 100644 index b2fce5779d14e..0000000000000 --- a/branches/sage/mds/mds/Server.cc +++ /dev/null @@ -1,4057 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "MDS.h" -#include "Server.h" -#include "Locker.h" -#include "MDCache.h" -#include "MDLog.h" -#include "Migrator.h" -#include "MDBalancer.h" -#include "AnchorClient.h" -#include "IdAllocator.h" - -#include "msg/Messenger.h" - -#include "messages/MClientSession.h" -#include "messages/MClientRequest.h" -#include "messages/MClientReply.h" -#include "messages/MClientReconnect.h" -#include "messages/MClientFileCaps.h" - -#include "messages/MMDSSlaveRequest.h" - -#include "messages/MLock.h" - -#include "messages/MDentryUnlink.h" - -#include "events/EString.h" -#include "events/EUpdate.h" -#include "events/ESlaveUpdate.h" -#include "events/ESession.h" -#include "events/EOpen.h" - -#include "include/filepath.h" -#include "common/Timer.h" -#include "common/Logger.h" -#include "common/LogType.h" - -#include -#include - -#include -#include -using namespace std; - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".server " -#define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) *_derr << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".server " - - -void Server::reopen_logger(utime_t start, bool append) -{ - static LogType mdserver_logtype; - static bool didit = 
false; - if (!didit) { - didit = true; - mdserver_logtype.add_inc("hcreq"); // handle client req - mdserver_logtype.add_inc("hsreq"); // slave - mdserver_logtype.add_inc("hcsess"); // client session - mdserver_logtype.add_inc("dcreq"); // dispatch client req - mdserver_logtype.add_inc("dsreq"); // slave - } - - if (logger) - delete logger; - - // logger - char name[80]; - sprintf(name, "mds%d.server", mds->get_nodeid()); - logger = new Logger(name, &mdserver_logtype, append); - logger->set_start(start); -} - - -void Server::dispatch(Message *m) -{ - switch (m->get_type()) { - case MSG_CLIENT_RECONNECT: - handle_client_reconnect((MClientReconnect*)m); - return; - } - - // active? - if (!mds->is_active() && !mds->is_stopping()) { - dout(3) << "not active yet, waiting" << dendl; - mds->wait_for_active(new C_MDS_RetryMessage(mds, m)); - return; - } - - switch (m->get_type()) { - case MSG_CLIENT_SESSION: - handle_client_session((MClientSession*)m); - return; - case MSG_CLIENT_REQUEST: - handle_client_request((MClientRequest*)m); - return; - case MSG_MDS_SLAVE_REQUEST: - handle_slave_request((MMDSSlaveRequest*)m); - return; - } - - dout(1) << "server unknown message " << m->get_type() << dendl; - assert(0); -} - - - -// ---------------------------------------------------------- -// SESSION management - - -class C_MDS_session_finish : public Context { - MDS *mds; - entity_inst_t client_inst; - bool open; - version_t cmapv; -public: - C_MDS_session_finish(MDS *m, entity_inst_t ci, bool s, version_t mv) : - mds(m), client_inst(ci), open(s), cmapv(mv) { } - void finish(int r) { - assert(r == 0); - mds->server->_session_logged(client_inst, open, cmapv); - } -}; - - -void Server::handle_client_session(MClientSession *m) -{ - dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl; - int from = m->get_source().num(); - bool open = m->op == MClientSession::OP_REQUEST_OPEN; - - if (open) { - if (mds->clientmap.have_session(from)) { - dout(10) << "already 
open, dropping this req" << dendl; - delete m; - return; - } - if (mds->clientmap.is_opening(from)) { - dout(10) << "already opening, dropping this req" << dendl; - delete m; - return; - } - mds->clientmap.add_opening(from); - } else { - if (mds->clientmap.is_closing(from)) { - dout(10) << "already closing, dropping this req" << dendl; - delete m; - return; - } - if (m->seq < mds->clientmap.get_push_seq(from)) { - dout(10) << "old push seq " << m->seq << " < " << mds->clientmap.get_push_seq(from) - << ", dropping" << dendl; - delete m; - return; - } - assert(m->seq == mds->clientmap.get_push_seq(from)); - - mds->clientmap.add_closing(from); - } - - // journal it - version_t cmapv = mds->clientmap.inc_projected(); - dout(10) << " clientmap v " << mds->clientmap.get_version() << " pv " << cmapv << dendl; - mdlog->submit_entry(new ESession(m->get_source_inst(), open, cmapv), - new C_MDS_session_finish(mds, m->get_source_inst(), open, cmapv)); - delete m; - - if (logger) logger->inc("hcsess"); -} - -void Server::_session_logged(entity_inst_t client_inst, bool open, version_t cmapv) -{ - dout(10) << "_session_logged " << client_inst << " " << (open ? "open":"close") - << " " << cmapv - << dendl; - - // apply - int from = client_inst.name.num(); - if (open) { - assert(mds->clientmap.is_opening(from)); - mds->clientmap.open_session(client_inst); - } else if (mds->clientmap.is_closing(from)) { - mds->clientmap.close_session(from); - - // purge completed requests from clientmap - mds->clientmap.trim_completed_requests(client_inst.name, 0); - } else { - // close must have been canceled (by an import?) ... 
- assert(!open); - mds->clientmap.noop(); - } - - assert(cmapv == mds->clientmap.get_version()); - - // reply - if (open) - mds->messenger->send_message(new MClientSession(MClientSession::OP_OPEN), client_inst); - else - mds->messenger->send_message(new MClientSession(MClientSession::OP_CLOSE), client_inst); -} - -void Server::prepare_force_open_sessions(map& cm) -{ - version_t cmapv = mds->clientmap.inc_projected(); - dout(10) << "prepare_force_open_sessions " << cmapv - << " on " << cm.size() << " clients" - << dendl; - for (map::iterator p = cm.begin(); p != cm.end(); ++p) { - mds->clientmap.add_opening(p->first); - } -} - -void Server::finish_force_open_sessions(map& cm) -{ - version_t v = mds->clientmap.get_version(); - dout(10) << "finish_force_open_sessions on " << cm.size() << " clients, v " << v << " -> " << (v+1) << dendl; - for (map::iterator p = cm.begin(); p != cm.end(); ++p) { - if (mds->clientmap.is_closing(p->first)) { - dout(15) << "force_open_sessions canceling close on " << p->second << dendl; - mds->clientmap.remove_closing(p->first); - continue; - } - if (mds->clientmap.have_session(p->first)) { - dout(15) << "force_open_sessions have session " << p->second << dendl; - continue; - } - - dout(10) << "force_open_sessions opening " << p->second << dendl; - mds->clientmap.open_session(p->second); - mds->messenger->send_message(new MClientSession(MClientSession::OP_OPEN), p->second); - } - mds->clientmap.set_version(v+1); -} - - -void Server::terminate_sessions() -{ - dout(2) << "terminate_sessions" << dendl; - - // kill them off. clients will retry etc. 
- for (set::const_iterator p = mds->clientmap.get_session_set().begin(); - p != mds->clientmap.get_session_set().end(); - ++p) { - if (mds->clientmap.is_closing(*p)) - continue; - mds->clientmap.add_closing(*p); - version_t cmapv = mds->clientmap.inc_projected(); - mdlog->submit_entry(new ESession(mds->clientmap.get_inst(*p), false, cmapv), - new C_MDS_session_finish(mds, mds->clientmap.get_inst(*p), false, cmapv)); - } -} - - -void Server::reconnect_clients() -{ - // reconnect with clients - if (mds->clientmap.get_session_set().empty()) { - dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl; - reconnect_gather_finish(); - return; - } - - dout(7) << "reconnect_clients -- sending mdsmap to clients with sessions" << dendl; - - mds->bcast_mds_map(); // send mdsmap to all client sessions - - // init gather list - reconnect_start = g_clock.now(); - client_reconnect_gather = mds->clientmap.get_session_set(); - dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl; -} - -void Server::handle_client_reconnect(MClientReconnect *m) -{ - dout(7) << "handle_client_reconnect " << m->get_source() << dendl; - int from = m->get_source().num(); - - if (m->closed) { - dout(7) << " client had no session, removing from clientmap" << dendl; - - mds->clientmap.add_closing(from); - version_t cmapv = mds->clientmap.inc_projected(); - mdlog->submit_entry(new ESession(mds->clientmap.get_inst(from), false, cmapv), - new C_MDS_session_finish(mds, mds->clientmap.get_inst(from), false, cmapv)); - - } else { - - // caps - for (map::iterator p = m->inode_caps.begin(); - p != m->inode_caps.end(); - ++p) { - CInode *in = mdcache->get_inode(p->first); - if (in && in->is_auth()) { - // we recovered it, and it's ours. take note. 
- dout(15) << "open caps on " << *in << dendl; - in->reconnect_cap(from, p->second); - reconnected_caps.insert(in); - continue; - } - - filepath path = m->inode_path[p->first]; - if ((in && !in->is_auth()) || - !mds->mdcache->path_is_mine(path)) { - // not mine. - dout(0) << "non-auth " << p->first << " " << m->inode_path[p->first] - << ", will pass off to authority" << dendl; - - // mark client caps stale. - inode_t fake_inode; - fake_inode.ino = p->first; - MClientFileCaps *stale = new MClientFileCaps(MClientFileCaps::OP_EXPORT, - fake_inode, - 0, - 0, // doesn't matter. - p->second.wanted); // doesn't matter. - mds->send_message_client(stale, m->get_source_inst()); - - // add to cap export list. - mdcache->rejoin_export_caps(p->first, m->inode_path[p->first], from, p->second); - } else { - // mine. fetch later. - dout(0) << "missing " << p->first << " " << m->inode_path[p->first] - << " (mine), will load later" << dendl; - mdcache->rejoin_recovered_caps(p->first, m->inode_path[p->first], from, p->second, - -1); // "from" me. - } - } - } - - // remove from gather set - client_reconnect_gather.erase(from); - if (client_reconnect_gather.empty()) reconnect_gather_finish(); - - delete m; -} - -/* - * called by mdcache, late in rejoin (right before acks are sent) - */ -void Server::process_reconnected_caps() -{ - dout(10) << "process_reconnected_caps" << dendl; - - // adjust filelock state appropriately - for (set::iterator p = reconnected_caps.begin(); - p != reconnected_caps.end(); - ++p) { - CInode *in = *p; - int issued = in->get_caps_issued(); - if (in->is_auth()) { - // wr? - if (issued & (CAP_FILE_WR|CAP_FILE_WRBUFFER)) { - if (issued & (CAP_FILE_RDCACHE|CAP_FILE_WRBUFFER)) { - in->filelock.set_state(LOCK_LONER); - } else { - in->filelock.set_state(LOCK_MIXED); - } - } - } else { - // note that client should perform stale/reap cleanup during reconnect. - assert(issued & (CAP_FILE_WR|CAP_FILE_WRBUFFER) == 0); // ???? 
- if (in->filelock.is_xlocked()) - in->filelock.set_state(LOCK_LOCK); - else - in->filelock.set_state(LOCK_SYNC); // might have been lock, previously - } - dout(15) << " issued " << cap_string(issued) - << " chose " << in->filelock - << " on " << *in << dendl; - } - reconnected_caps.clear(); // clean up -} - - -void Server::client_reconnect_failure(int from) -{ - dout(5) << "client_reconnect_failure on client" << from << dendl; - if (mds->is_reconnect() && - client_reconnect_gather.count(from)) { - client_reconnect_gather.erase(from); - if (client_reconnect_gather.empty()) - reconnect_gather_finish(); - } -} - -void Server::reconnect_gather_finish() -{ - dout(7) << "reconnect_gather_finish" << dendl; - mds->reconnect_done(); -} - - - -/******* - * some generic stuff for finishing off requests - */ - - -/* - * send generic response (just and error code) - */ -void Server::reply_request(MDRequest *mdr, int r, CInode *tracei) -{ - reply_request(mdr, new MClientReply(mdr->client_request, r), tracei); -} - - -/* - * send given reply - * include a trace to tracei - */ -void Server::reply_request(MDRequest *mdr, MClientReply *reply, CInode *tracei) -{ - MClientRequest *req = mdr->client_request; - - dout(10) << "reply_request " << reply->get_result() - << " (" << strerror(-reply->get_result()) - << ") " << *req << dendl; - - // note result code in clientmap? 
- if (!req->is_idempotent()) - mds->clientmap.add_completed_request(mdr->reqid); - - /* - if (tracei && !tracei->hack_accessed) { - tracei->hack_accessed = true; - mds->logger->inc("newt"); - if (tracei->parent && - tracei->parent->dir->hack_num_accessed >= 0) { - tracei->parent->dir->hack_num_accessed++; - if (tracei->parent->dir->hack_num_accessed == 1) - mds->logger->inc("dirt1"); - if (tracei->parent->dir->hack_num_accessed == 2) - mds->logger->inc("dirt2"); - if (tracei->parent->dir->hack_num_accessed == 3) - mds->logger->inc("dirt3"); - if (tracei->parent->dir->hack_num_accessed == 4) - mds->logger->inc("dirt4"); - if (tracei->parent->dir->hack_num_accessed == 5) - mds->logger->inc("dirt5"); - } - } - */ - - // include trace - if (tracei) - reply->set_trace_dist( tracei, mds->get_nodeid() ); - - reply->set_mdsmap_epoch(mds->mdsmap->get_epoch()); - - // send reply - if (req->get_client_inst().name.is_mds()) - delete reply; // mds doesn't need a reply - else - messenger->send_message(reply, req->get_client_inst()); - - // finish request - mdcache->request_finish(mdr); - - if (tracei && - tracei->get_parent_dn() && - tracei->get_parent_dn()->is_remote()) - mdcache->eval_remote(tracei->get_parent_dn()); -} - - - - - -/*** - * process a client request - */ -void Server::handle_client_request(MClientRequest *req) -{ - dout(4) << "handle_client_request " << *req << dendl; - - if (logger) logger->inc("hcreq"); - - if (!mds->is_active() && - !(mds->is_stopping() && req->get_client_inst().name.is_mds())) { - dout(5) << " not active (or stopping+mds), discarding request." << dendl; - delete req; - return; - } - - if (!mdcache->get_root()) { - dout(5) << "need to open root" << dendl; - mdcache->open_root(new C_MDS_RetryMessage(mds, req)); - return; - } - - // active session? 
- if (req->get_client_inst().name.is_client() && - !mds->clientmap.have_session(req->get_client_inst().name.num())) { - dout(5) << "no session for " << req->get_client_inst().name << ", dropping" << dendl; - delete req; - return; - } - - // old mdsmap? - if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) { - // send it? hrm, this isn't ideal; they may get a lot of copies if - // they have a high request rate. - } - - // okay, i want - CInode *ref = 0; - - // retry? - if (req->get_retry_attempt()) { - if (mds->clientmap.have_completed_request(req->get_reqid())) { - dout(5) << "already completed " << req->get_reqid() << dendl; - mds->messenger->send_message(new MClientReply(req, 0), req->get_client_inst()); - delete req; - return; - } - } - // trim completed_request list - if (req->get_oldest_client_tid() > 0) { - dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl; - mds->clientmap.trim_completed_requests(req->get_client_inst().name, - req->get_oldest_client_tid()); - } - - - // ----- - // some ops are on ino's - switch (req->get_op()) { - case MDS_OP_FSTAT: - ref = mdcache->get_inode(req->args.fstat.ino); - assert(ref); - break; - - case MDS_OP_TRUNCATE: - if (!req->args.truncate.ino) - break; // can be called w/ either fh OR path - ref = mdcache->get_inode(req->args.truncate.ino); - assert(ref); - break; - - case MDS_OP_FSYNC: - ref = mdcache->get_inode(req->args.fsync.ino); // fixme someday no ino needed? 
- assert(ref); - break; - } - - // register + dispatch - MDRequest *mdr = mdcache->request_start(req); - if (!mdr) return; - - if (ref) { - dout(10) << "inode op on ref " << *ref << dendl; - mdr->ref = ref; - mdr->pin(ref); - } - - dispatch_client_request(mdr); - return; -} - - -void Server::dispatch_client_request(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - if (logger) logger->inc("dcreq"); - - if (mdr->ref) { - dout(7) << "dispatch_client_request " << *req << " ref " << *mdr->ref << dendl; - } else { - dout(7) << "dispatch_client_request " << *req << dendl; - } - - // we shouldn't be waiting on anyone. - assert(mdr->more()->waiting_on_slave.empty()); - - switch (req->get_op()) { - - // inodes ops. - case MDS_OP_STAT: - case MDS_OP_LSTAT: - handle_client_stat(mdr); - break; - case MDS_OP_UTIME: - handle_client_utime(mdr); - break; - case MDS_OP_CHMOD: - handle_client_chmod(mdr); - break; - case MDS_OP_CHOWN: - handle_client_chown(mdr); - break; - case MDS_OP_TRUNCATE: - handle_client_truncate(mdr); - break; - case MDS_OP_READDIR: - handle_client_readdir(mdr); - break; - case MDS_OP_FSYNC: - //handle_client_fsync(req, ref); - break; - - // funky. - case MDS_OP_OPEN: - if (req->args.open.flags & O_CREAT) - handle_client_openc(mdr); - else - handle_client_open(mdr); - break; - - // namespace. - // no prior locks. 
- case MDS_OP_MKNOD: - handle_client_mknod(mdr); - break; - case MDS_OP_LINK: - handle_client_link(mdr); - break; - case MDS_OP_UNLINK: - case MDS_OP_RMDIR: - handle_client_unlink(mdr); - break; - case MDS_OP_RENAME: - handle_client_rename(mdr); - break; - case MDS_OP_MKDIR: - handle_client_mkdir(mdr); - break; - case MDS_OP_SYMLINK: - handle_client_symlink(mdr); - break; - - - default: - dout(1) << " unknown client op " << req->get_op() << dendl; - assert(0); - } -} - - -// --------------------------------------- -// SLAVE REQUESTS - -void Server::handle_slave_request(MMDSSlaveRequest *m) -{ - dout(4) << "handle_slave_request " << m->get_reqid() << " from " << m->get_source() << dendl; - int from = m->get_source().num(); - - if (logger) logger->inc("hsreq"); - - // reply? - if (m->is_reply()) { - - switch (m->get_op()) { - case MMDSSlaveRequest::OP_XLOCKACK: - { - // identify lock, master request - SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(), - m->get_object_info()); - MDRequest *mdr = mdcache->request_get(m->get_reqid()); - mdr->more()->slaves.insert(from); - dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl; - mdr->xlocks.insert(lock); - mdr->locks.insert(lock); - lock->get_xlock(mdr); - lock->finish_waiters(SimpleLock::WAIT_REMOTEXLOCK); - } - break; - - case MMDSSlaveRequest::OP_AUTHPINACK: - { - MDRequest *mdr = mdcache->request_get(m->get_reqid()); - handle_slave_auth_pin_ack(mdr, m); - } - break; - - case MMDSSlaveRequest::OP_LINKPREPACK: - { - MDRequest *mdr = mdcache->request_get(m->get_reqid()); - handle_slave_link_prep_ack(mdr, m); - } - break; - - case MMDSSlaveRequest::OP_RENAMEPREPACK: - { - MDRequest *mdr = mdcache->request_get(m->get_reqid()); - handle_slave_rename_prep_ack(mdr, m); - } - break; - - default: - assert(0); - } - - // done with reply. - delete m; - return; - - } else { - // am i a new slave? - MDRequest *mdr; - if (mdcache->have_request(m->get_reqid())) { - // existing? 
- mdr = mdcache->request_get(m->get_reqid()); - if (mdr->slave_to_mds != from) { // may not even be a slave! (e.g. forward race) - dout(10) << "local request " << *mdr << " not slave to mds" << from - << ", ignoring " << *m << dendl; - delete m; - return; - } - } else { - // new? - if (m->get_op() == MMDSSlaveRequest::OP_FINISH) { - dout(10) << "missing slave request for " << m->get_reqid() - << " OP_FINISH, must have lost race with a forward" << dendl; - delete m; - return; - } - mdr = mdcache->request_start_slave(m->get_reqid(), m->get_source().num()); - } - assert(mdr->slave_request == 0); // only one at a time, please! - mdr->slave_request = m; - - dispatch_slave_request(mdr); - } -} - -void Server::dispatch_slave_request(MDRequest *mdr) -{ - dout(7) << "dispatch_slave_request " << *mdr << " " << *mdr->slave_request << dendl; - - if (mdr->aborted) { - dout(7) << " abort flag set, finishing" << dendl; - mdcache->request_finish(mdr); - return; - } - - if (logger) logger->inc("dsreq"); - - switch (mdr->slave_request->get_op()) { - case MMDSSlaveRequest::OP_XLOCK: - { - // identify object - SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(), - mdr->slave_request->get_object_info()); - - if (lock && lock->get_parent()->is_auth()) { - // xlock. - // use acquire_locks so that we get auth_pinning. 
- set rdlocks; - set wrlocks; - set xlocks = mdr->xlocks; - xlocks.insert(lock); - - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - // ack - MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_XLOCKACK); - r->set_lock_type(lock->get_type()); - lock->get_parent()->set_object_info(r->get_object_info()); - mds->send_message_mds(r, mdr->slave_request->get_source().num(), MDS_PORT_SERVER); - } else { - if (lock) { - dout(10) << "not auth for remote xlock attempt, dropping on " - << *lock << " on " << *lock->get_parent() << dendl; - } else { - dout(10) << "don't have object, dropping" << dendl; - assert(0); // can this happen, if we auth pinned properly. - } - } - - // done. - delete mdr->slave_request; - mdr->slave_request = 0; - } - break; - - case MMDSSlaveRequest::OP_UNXLOCK: - { - SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(), - mdr->slave_request->get_object_info()); - assert(lock); - mds->locker->xlock_finish(lock, mdr); - - // done. no ack necessary. - delete mdr->slave_request; - mdr->slave_request = 0; - } - break; - - case MMDSSlaveRequest::OP_AUTHPIN: - handle_slave_auth_pin(mdr); - break; - - case MMDSSlaveRequest::OP_LINKPREP: - case MMDSSlaveRequest::OP_UNLINKPREP: - handle_slave_link_prep(mdr); - break; - - case MMDSSlaveRequest::OP_RENAMEPREP: - handle_slave_rename_prep(mdr); - break; - - case MMDSSlaveRequest::OP_FINISH: - // finish off request. 
- mdcache->request_finish(mdr); - break; - - default: - assert(0); - } -} - - -void Server::handle_slave_auth_pin(MDRequest *mdr) -{ - dout(10) << "handle_slave_auth_pin " << *mdr << dendl; - - // build list of objects - list objects; - bool fail = false; - - for (list::iterator p = mdr->slave_request->get_authpins().begin(); - p != mdr->slave_request->get_authpins().end(); - ++p) { - MDSCacheObject *object = mdcache->get_object(*p); - if (!object) { - dout(10) << " don't have " << *p << dendl; - fail = true; - break; - } - - objects.push_back(object); - } - - // can we auth pin them? - if (!fail) { - for (list::iterator p = objects.begin(); - p != objects.end(); - ++p) { - if (!(*p)->is_auth()) { - dout(10) << " not auth for " << **p << dendl; - fail = true; - break; - } - if (!mdr->is_auth_pinned(*p) && - !(*p)->can_auth_pin()) { - // wait - dout(10) << " waiting for authpinnable on " << **p << dendl; - (*p)->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); - mdr->drop_local_auth_pins(); - return; - } - } - } - - // auth pin! - if (fail) { - mdr->drop_local_auth_pins(); // just in case - } else { - for (list::iterator p = objects.begin(); - p != objects.end(); - ++p) { - dout(10) << "auth_pinning " << **p << dendl; - mdr->auth_pin(*p); - } - } - - // ack! 
- MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_AUTHPINACK); - - // return list of my auth_pins (if any) - for (set::iterator p = mdr->auth_pins.begin(); - p != mdr->auth_pins.end(); - ++p) { - MDSCacheObjectInfo info; - (*p)->set_object_info(info); - reply->get_authpins().push_back(info); - } - - mds->send_message_mds(reply, mdr->slave_to_mds, MDS_PORT_SERVER); - - // clean up this request - delete mdr->slave_request; - mdr->slave_request = 0; - return; -} - -void Server::handle_slave_auth_pin_ack(MDRequest *mdr, MMDSSlaveRequest *ack) -{ - dout(10) << "handle_slave_auth_pin_ack on " << *mdr << " " << *ack << dendl; - int from = ack->get_source().num(); - - // added auth pins? - set pinned; - for (list::iterator p = ack->get_authpins().begin(); - p != ack->get_authpins().end(); - ++p) { - MDSCacheObject *object = mdcache->get_object(*p); - assert(object); // we pinned it - dout(10) << " remote has pinned " << *object << dendl; - if (!mdr->is_auth_pinned(object)) - mdr->remote_auth_pins.insert(object); - pinned.insert(object); - } - - // removed auth pins? - set::iterator p = mdr->remote_auth_pins.begin(); - while (p != mdr->remote_auth_pins.end()) { - if ((*p)->authority().first == from && - pinned.count(*p) == 0) { - dout(10) << " remote has unpinned " << **p << dendl; - set::iterator o = p; - ++p; - mdr->remote_auth_pins.erase(o); - } else { - ++p; - } - } - - // note slave - mdr->more()->slaves.insert(from); - - // clear from waiting list - assert(mdr->more()->waiting_on_slave.count(from)); - mdr->more()->waiting_on_slave.erase(from); - - // go again? - if (mdr->more()->waiting_on_slave.empty()) - dispatch_client_request(mdr); - else - dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl; -} - - -// --------------------------------------- -// HELPERS - - -/** validate_dentry_dir - * - * verify that the dir exists and would own the dname. - * do not check if the dentry exists. 
- */ -CDir *Server::validate_dentry_dir(MDRequest *mdr, CInode *diri, const string& dname) -{ - // make sure parent is a dir? - if (!diri->is_dir()) { - dout(7) << "validate_dentry_dir: not a dir" << dendl; - reply_request(mdr, -ENOTDIR); - return false; - } - - // which dirfrag? - frag_t fg = diri->pick_dirfrag(dname); - CDir *dir = try_open_auth_dirfrag(diri, fg, mdr); - if (!dir) - return 0; - - // frozen? - if (dir->is_frozen()) { - dout(7) << "dir is frozen " << *dir << dendl; - dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - - return dir; -} - - -/** prepare_null_dentry - * prepare a null (or existing) dentry in given dir. - * wait for any dn lock. - */ -CDentry* Server::prepare_null_dentry(MDRequest *mdr, CDir *dir, const string& dname, bool okexist) -{ - dout(10) << "prepare_null_dentry " << dname << " in " << *dir << dendl; - assert(dir->is_auth()); - - // does it already exist? - CDentry *dn = dir->lookup(dname); - if (dn) { - if (dn->lock.is_xlocked_by_other(mdr)) { - dout(10) << "waiting on xlocked dentry " << *dn << dendl; - dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - - if (!dn->is_null()) { - // name already exists - dout(10) << "dentry " << dname << " exists in " << *dir << dendl; - if (!okexist) { - reply_request(mdr, -EEXIST); - return 0; - } - } - - return dn; - } - - // make sure dir is complete - if (!dir->is_complete()) { - dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl; - dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - - // create - dn = dir->add_null_dentry(dname); - dn->mark_new(); - dout(10) << "prepare_null_dentry added " << *dn << dendl; - - return dn; -} - - -/** prepare_new_inode - * - * create a new inode. set c/m/atime. hit dir pop. 
- */ -CInode* Server::prepare_new_inode(MDRequest *mdr, CDir *dir) -{ - CInode *in = mdcache->create_inode(); - in->inode.uid = mdr->client_request->get_caller_uid(); - in->inode.gid = mdr->client_request->get_caller_gid(); - in->inode.ctime = in->inode.mtime = in->inode.atime = mdr->now; // now - dout(10) << "prepare_new_inode " << *in << dendl; - - return in; -} - - - -CDir *Server::traverse_to_auth_dir(MDRequest *mdr, vector &trace, filepath refpath) -{ - // figure parent dir vs dname - if (refpath.depth() == 0) { - dout(7) << "can't do that to root" << dendl; - reply_request(mdr, -EINVAL); - return 0; - } - string dname = refpath.last_dentry(); - refpath.pop_dentry(); - - dout(10) << "traverse_to_auth_dir dirpath " << refpath << " dname " << dname << dendl; - - // traverse to parent dir - int r = mdcache->path_traverse(mdr, mdr->client_request, - refpath, trace, true, - MDS_TRAVERSE_FORWARD); - if (r > 0) return 0; // delayed - if (r < 0) { - reply_request(mdr, r); - return 0; - } - - // open inode - CInode *diri; - if (trace.empty()) - diri = mdcache->get_inode(refpath.get_ino()); - else - diri = mdcache->get_dentry_inode(trace[trace.size()-1], mdr); - if (!diri) - return 0; // opening inode. - - // is it an auth dir? - CDir *dir = validate_dentry_dir(mdr, diri, dname); - if (!dir) - return 0; // forwarded or waiting for freeze - - dout(10) << "traverse_to_auth_dir " << *dir << dendl; - return dir; -} - - - -CInode* Server::rdlock_path_pin_ref(MDRequest *mdr, bool want_auth) -{ - // already got ref? 
- if (mdr->ref) - return mdr->ref; - - MClientRequest *req = mdr->client_request; - - // traverse - filepath refpath = req->get_filepath(); - vector trace; - int r = mdcache->path_traverse(mdr, req, - refpath, - trace, req->follow_trailing_symlink(), - MDS_TRAVERSE_FORWARD); - if (r > 0) return false; // delayed - if (r < 0) { // error - reply_request(mdr, r); - return 0; - } - - // open ref inode - CInode *ref = 0; - if (trace.empty()) - ref = mdcache->get_root(); - else { - CDentry *dn = trace[trace.size()-1]; - - // if no inode (null or unattached remote), fw to dentry auth? - if (want_auth && !dn->is_auth() && - (dn->is_null() || - (dn->is_remote() && dn->inode))) { - if (dn->is_ambiguous_auth()) { - dout(10) << "waiting for single auth on " << *dn << dendl; - dn->dir->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr)); - } else { - dout(10) << "fw to auth for " << *dn << dendl; - mdcache->request_forward(mdr, dn->authority().first); - return 0; - } - } - - // open ref inode - ref = mdcache->get_dentry_inode(dn, mdr); - if (!ref) return 0; - } - dout(10) << "ref is " << *ref << dendl; - - // fw to inode auth? - if (want_auth && !ref->is_auth()) { - if (ref->is_ambiguous_auth()) { - dout(10) << "waiting for single auth on " << *ref << dendl; - ref->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr)); - } else { - dout(10) << "fw to auth for " << *ref << dendl; - mdcache->request_forward(mdr, ref->authority().first); - } - return 0; - } - - // auth_pin? 
- if (want_auth) { - if (ref->is_frozen()) { - dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl; - ref->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - mdr->auth_pin(ref); - } - - // lock the path - set rdlocks, empty; - - for (int i=0; i<(int)trace.size(); i++) - rdlocks.insert(&trace[i]->lock); - - if (!mds->locker->acquire_locks(mdr, rdlocks, empty, empty)) - return 0; - - // set and pin ref - mdr->pin(ref); - mdr->ref = ref; - - // save the locked trace. - mdr->trace.swap(trace); - - return ref; -} - - -/** rdlock_path_xlock_dentry - * traverse path to the directory that could/would contain dentry. - * make sure i am auth for that dentry, forward as necessary. - * create null dentry in place (or use existing if okexist). - * get rdlocks on traversed dentries, xlock on new dentry. - */ -CDentry* Server::rdlock_path_xlock_dentry(MDRequest *mdr, bool okexist, bool mustexist) -{ - MClientRequest *req = mdr->client_request; - - vector trace; - CDir *dir = traverse_to_auth_dir(mdr, trace, req->get_filepath()); - if (!dir) return 0; - dout(10) << "rdlock_path_xlock_dentry dir " << *dir << dendl; - - // make sure we can auth_pin (or have already authpinned) dir - if (dir->is_frozen()) { - dout(7) << "waiting for !frozen/authpinnable on " << *dir << dendl; - dir->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - - // make a null dentry? - const string &dname = req->get_filepath().last_dentry(); - CDentry *dn; - if (mustexist) { - dn = dir->lookup(dname); - - // make sure dir is complete - if (!dn && !dir->is_complete()) { - dout(7) << " incomplete dir contents for " << *dir << ", fetching" << dendl; - dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - - // readable? 
- if (dn && dn->lock.is_xlocked_by_other(mdr)) { - dout(10) << "waiting on xlocked dentry " << *dn << dendl; - dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - - // exists? - if (!dn || dn->is_null()) { - dout(7) << "dentry " << dname << " dne in " << *dir << dendl; - reply_request(mdr, -ENOENT); - return 0; - } - } else { - dn = prepare_null_dentry(mdr, dir, dname, okexist); - if (!dn) - return 0; - } - - // -- lock -- - set rdlocks, wrlocks, xlocks; - - for (int i=0; i<(int)trace.size(); i++) - rdlocks.insert(&trace[i]->lock); - if (dn->is_null()) { - xlocks.insert(&dn->lock); // new dn, xlock - wrlocks.insert(&dn->dir->inode->dirlock); // also, wrlock on dir mtime - } else - rdlocks.insert(&dn->lock); // existing dn, rdlock - - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return 0; - - // save the locked trace. - mdr->trace.swap(trace); - - return dn; -} - - - - - -/** - * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth - * - * @diri base indoe - * @fg the exact frag we want - * @mdr request - */ -CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequest *mdr) -{ - CDir *dir = diri->get_dirfrag(fg); - - // not open and inode not mine? - if (!dir && !diri->is_auth()) { - int inauth = diri->authority().first; - dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds" << inauth << dendl; - mdcache->request_forward(mdr, inauth); - return 0; - } - - // not open and inode frozen? - if (!dir && diri->is_frozen_dir()) { - dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl; - assert(diri->get_parent_dir()); - diri->get_parent_dir()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); - return 0; - } - - // invent? - if (!dir) - dir = diri->get_or_open_dirfrag(mds->mdcache, fg); - - // am i auth for the dirfrag? 
- if (!dir->is_auth()) { - int auth = dir->authority().first; - dout(7) << "try_open_auth_dirfrag: not auth for " << *dir - << ", fw to mds" << auth << dendl; - mdcache->request_forward(mdr, auth); - return 0; - } - - return dir; -} - - - -/** predirty_dn_diri - * predirty the directory inode for a new dentry, if it is auth (and not root) - * BUG: root inode doesn't get dirtied properly, currently. blech. - */ -version_t Server::predirty_dn_diri(MDRequest *mdr, CDentry *dn, EMetaBlob *blob) -{ - version_t dirpv = 0; - CInode *diri = dn->dir->inode; - - if (diri->is_base()) return 0; - - if (diri->is_auth()) { - assert(mdr->wrlocks.count(&diri->dirlock)); - - dirpv = diri->pre_dirty(); - dout(10) << "predirty_dn_diri ctime/mtime " << mdr->now << " pv " << dirpv << " on " << *diri << dendl; - - // predirty+journal - inode_t *pi = diri->project_inode(); - if (dirpv) pi->version = dirpv; - pi->ctime = pi->mtime = mdr->now; - blob->add_dir_context(diri->get_parent_dn()->get_dir()); - blob->add_primary_dentry(diri->get_parent_dn(), true, 0, pi); - } else { - // journal the mtime change anyway. - inode_t *ji = blob->add_primary_dentry(diri->get_parent_dn(), true); - ji->ctime = ji->mtime = mdr->now; - - dout(10) << "predirty_dn_diri (non-auth) ctime/mtime " << mdr->now << " on " << *diri << dendl; - - blob->add_dirtied_inode_mtime(diri->ino(), mdr->now); - assert(mdr->ls); - mdr->ls->dirty_inode_mtimes.push_back(&diri->xlist_dirty_inode_mtime); - } - - return dirpv; -} - -/** dirty_dn_diri - * follow-up with actual dirty of inode after journal entry commits. - */ -void Server::dirty_dn_diri(MDRequest *mdr, CDentry *dn, version_t dirpv) -{ - CInode *diri = dn->dir->inode; - - if (diri->is_root()) return; - - if (dirpv) { - // we journaled and predirtied. 
- assert(diri->is_auth() && !diri->is_root()); - diri->pop_and_dirty_projected_inode(mdr->ls); - dout(10) << "dirty_dn_diri ctime/mtime " << mdr->now << " v " << diri->inode.version << " on " << *diri << dendl; - } else { - // dirlock scatterlock will propagate the update. - diri->inode.ctime = diri->inode.mtime = mdr->now; - diri->dirlock.set_updated(); - dout(10) << "dirty_dn_diri (non-dirty) ctime/mtime " << mdr->now << " on " << *diri << dendl; - } -} - - - - - - -// =============================================================================== -// STAT - -void Server::handle_client_stat(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - CInode *ref = rdlock_path_pin_ref(mdr, false); - if (!ref) return; - - // which inode locks do I want? - /* note: this works because we include existing locks in our lists, - and because all new locks are on inodes and sort to the right of - the dentry rdlocks previous acquired by rdlock_path_pin_ref(). */ - set rdlocks = mdr->rdlocks; - set wrlocks = mdr->wrlocks; - set xlocks = mdr->xlocks; - - int mask = req->args.stat.mask; - if (mask & STAT_MASK_LINK) rdlocks.insert(&ref->linklock); - if (mask & STAT_MASK_AUTH) rdlocks.insert(&ref->authlock); - if (ref->is_file() && - mask & STAT_MASK_FILE) rdlocks.insert(&ref->filelock); - if (ref->is_dir() && - mask & STAT_MASK_MTIME) rdlocks.insert(&ref->dirlock); - - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - mds->balancer->hit_inode(g_clock.now(), ref, META_POP_IRD, - mdr->client_request->get_client_inst().name.num()); - - // reply - dout(10) << "reply to stat on " << *req << dendl; - MClientReply *reply = new MClientReply(req); - reply_request(mdr, reply, ref); -} - - - - -// =============================================================================== -// INODE UPDATES - - -/* - * finisher for basic inode updates - */ -class C_MDS_inode_update_finish : public Context { - MDS *mds; - MDRequest *mdr; - CInode *in; -public: - 
C_MDS_inode_update_finish(MDS *m, MDRequest *r, CInode *i) : - mds(m), mdr(r), in(i) { } - void finish(int r) { - assert(r == 0); - - // apply - in->pop_and_dirty_projected_inode(mdr->ls); - - mds->balancer->hit_inode(mdr->now, in, META_POP_IWR); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply->set_result(0); - mds->server->reply_request(mdr, reply, in); - } -}; - - -// utime - -void Server::handle_client_utime(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - CInode *cur = rdlock_path_pin_ref(mdr, true); - if (!cur) return; - - if (cur->is_root()) { - reply_request(mdr, -EINVAL); // for now - return; - } - - // xlock inode - set rdlocks = mdr->rdlocks; - set wrlocks = mdr->wrlocks; - set xlocks = mdr->xlocks; - xlocks.insert(&cur->filelock); - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - // project update - inode_t *pi = cur->project_inode(); - pi->mtime = req->args.utime.mtime; - pi->atime = req->args.utime.atime; - pi->version = cur->pre_dirty(); - pi->ctime = g_clock.real_now(); - - // log + wait - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "utime"); - le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_dir_context(cur->get_parent_dir()); - le->metablob.add_primary_dentry(cur->parent, true, 0, pi); - - mdlog->submit_entry(le, new C_MDS_inode_update_finish(mds, mdr, cur)); -} - - -// chmod - -void Server::handle_client_chmod(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - CInode *cur = rdlock_path_pin_ref(mdr, true); - if (!cur) return; - - if (cur->is_root()) { - reply_request(mdr, -EINVAL); // for now - return; - } - - // write - set rdlocks = mdr->rdlocks; - set wrlocks = mdr->wrlocks; - set xlocks = mdr->xlocks; - xlocks.insert(&cur->authlock); - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - // project update - inode_t *pi = cur->project_inode(); - pi->mode = - (pi->mode & 
~04777) | - (req->args.chmod.mode & 04777); - pi->version = cur->pre_dirty(); - pi->ctime = g_clock.real_now(); - - // log + wait - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "chmod"); - le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_dir_context(cur->get_parent_dir()); - le->metablob.add_primary_dentry(cur->parent, true, 0, pi); - - mdlog->submit_entry(le, new C_MDS_inode_update_finish(mds, mdr, cur)); -} - - -// chown - -void Server::handle_client_chown(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - CInode *cur = rdlock_path_pin_ref(mdr, true); - if (!cur) return; - - if (cur->is_root()) { - reply_request(mdr, -EINVAL); // for now - return; - } - - // write - set rdlocks = mdr->rdlocks; - set wrlocks = mdr->wrlocks; - set xlocks = mdr->xlocks; - xlocks.insert(&cur->authlock); - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - // project update - inode_t *pi = cur->project_inode(); - pi->uid = MAX(req->args.chown.uid, 0); - pi->gid = MAX(req->args.chown.gid, 0); - pi->version = cur->pre_dirty(); - pi->ctime = g_clock.real_now(); - - // log + wait - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "chown"); - le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_dir_context(cur->get_parent_dir()); - le->metablob.add_primary_dentry(cur->parent, true, 0, pi); - - mdlog->submit_entry(le); - mdlog->wait_for_sync(new C_MDS_inode_update_finish(mds, mdr, cur)); -} - - - - -// ================================================================= -// DIRECTORY and NAMESPACE OPS - -// READDIR - -void Server::handle_client_readdir(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - CInode *diri = rdlock_path_pin_ref(mdr, false); - if (!diri) return; - - // it's a directory, right? 
- if (!diri->is_dir()) { - // not a dir - dout(10) << "reply to " << *req << " readdir -ENOTDIR" << dendl; - reply_request(mdr, -ENOTDIR, diri); - return; - } - - // which frag? - frag_t fg = req->args.readdir.frag; - - // does the frag exist? - if (diri->dirfragtree[fg] != fg) { - dout(10) << "frag " << fg << " doesn't appear in fragtree " << diri->dirfragtree << dendl; - reply_request(mdr, -EAGAIN, diri); - return; - } - - CDir *dir = try_open_auth_dirfrag(diri, fg, mdr); - if (!dir) return; - - // ok! - assert(dir->is_auth()); - - // check perm - /* - if (!mds->locker->inode_hard_rdlock_start(diri, mdr)) - return; - mds->locker->inode_hard_rdlock_finish(diri, mdr); - */ - - if (!dir->is_complete()) { - // fetch - dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl; - dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - - // build dir contents - bufferlist dirbl; - - DirStat::_encode(dirbl, dir, mds->get_nodeid()); - - int numfiles = 0; - for (CDir::map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - if (dn->is_null()) continue; - - CInode *in = dn->inode; - - // remote link? - // better for the MDS to do the work, if we think the client will stat any of these files. 
- if (dn->is_remote() && !in) { - in = mdcache->get_inode(dn->get_remote_ino()); - if (in) { - dn->link_remote(in); - } else { - mdcache->open_remote_ino(dn->get_remote_ino(), - mdr, - new C_MDS_RetryRequest(mdcache, mdr)); - - // touch everything i _do_ have - for (it = dir->begin(); - it != dir->end(); - it++) - if (!it->second->is_null()) - mdcache->lru.lru_touch(it->second); - return; - } - } - assert(in); - - dout(12) << "including inode " << *in << dendl; - - // add this dentry + inodeinfo - ::_encode(it->first, dirbl); - InodeStat::_encode(dirbl, in); - - // touch it - mdcache->lru.lru_touch(dn); - } - - // yay, reply - MClientReply *reply = new MClientReply(req); - reply->take_dir_items(dirbl); - - dout(10) << "reply to " << *req << " readdir " << numfiles << " files" << dendl; - reply->set_result(0); - - // bump popularity. NOTE: this doesn't quite capture it. - mds->balancer->hit_dir(g_clock.now(), dir, META_POP_IRD, -1, numfiles); - - // reply - reply_request(mdr, reply, diri); -} - - - -// ------------------------------------------------ - -// MKNOD - -class C_MDS_mknod_finish : public Context { - MDS *mds; - MDRequest *mdr; - CDentry *dn; - CInode *newi; - version_t dirpv; - version_t newdirpv; -public: - C_MDS_mknod_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ni, version_t dirpv_, version_t newdirpv_=0) : - mds(m), mdr(r), dn(d), newi(ni), - dirpv(dirpv_), newdirpv(newdirpv_) {} - void finish(int r) { - assert(r == 0); - - // link the inode - dn->get_dir()->link_primary_inode(dn, newi); - - // dirty inode, dn, dir - newi->mark_dirty(newi->inode.version + 1, mdr->ls); - - // mkdir? 
- if (newdirpv) { - CDir *dir = newi->get_dirfrag(frag_t()); - assert(dir); - dir->mark_dirty(newdirpv, mdr->ls); - } - - // dir inode's mtime - mds->server->dirty_dn_diri(mdr, dn, dirpv); - - // hit pop - mds->balancer->hit_inode(mdr->now, newi, META_POP_IWR); - //mds->balancer->hit_dir(mdr->now, dn->get_dir(), META_POP_DWR); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply->set_result(0); - mds->server->reply_request(mdr, reply, newi); - } -}; - - -void Server::handle_client_mknod(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - CDentry *dn = rdlock_path_xlock_dentry(mdr, false, false); - if (!dn) return; - - mdr->now = g_clock.real_now(); - CInode *newi = prepare_new_inode(mdr, dn->dir); - assert(newi); - - // it's a file. - newi->inode.rdev = req->args.mknod.rdev; - newi->inode.mode = req->args.mknod.mode; - newi->inode.mode &= ~S_IFMT; - newi->inode.mode |= S_IFREG; - newi->inode.version = dn->pre_dirty() - 1; - - // prepare finisher - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "mknod"); - le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version()); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too - le->metablob.add_dir_context(dn->dir); - le->metablob.add_primary_dentry(dn, true, newi, &newi->inode); - - // log + wait - mdlog->submit_entry(le, new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv)); -} - - - -// MKDIR - -void Server::handle_client_mkdir(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - CDentry *dn = rdlock_path_xlock_dentry(mdr, false, false); - if (!dn) return; - - // new inode - mdr->now = g_clock.real_now(); - CInode *newi = prepare_new_inode(mdr, dn->dir); - assert(newi); - - // it's a directory. 
- newi->inode.mode = req->args.mkdir.mode; - newi->inode.mode &= ~S_IFMT; - newi->inode.mode |= S_IFDIR; - newi->inode.layout = g_OSD_MDDirLayout; - newi->inode.version = dn->pre_dirty() - 1; - - // ...and that new dir is empty. - CDir *newdir = newi->get_or_open_dirfrag(mds->mdcache, frag_t()); - newdir->mark_complete(); - version_t newdirpv = newdir->pre_dirty(); - - //if (mds->logger) mds->logger->inc("mkdir"); - - // prepare finisher - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "mkdir"); - le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version()); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too - le->metablob.add_dir_context(dn->dir); - le->metablob.add_primary_dentry(dn, true, newi, &newi->inode); - le->metablob.add_dir(newdir, true, true); // dirty AND complete - - // log + wait - mdlog->submit_entry(le, new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv, newdirpv)); - - /* old export heuristic. pbly need to reimplement this at some point. 
- if ( - diri->dir->is_auth() && - diri->dir->is_rep() && - newdir->is_auth() && - !newdir->is_hashing()) { - int dest = rand() % mds->mdsmap->get_num_mds(); - if (dest != whoami) { - dout(10) << "exporting new dir " << *newdir << " in replicated parent " << *diri->dir << dendl; - mdcache->migrator->export_dir(newdir, dest); - } - } - */ -} - - -// SYMLINK - -void Server::handle_client_symlink(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - CDentry *dn = rdlock_path_xlock_dentry(mdr, false, false); - if (!dn) return; - - mdr->now = g_clock.real_now(); - - CInode *newi = prepare_new_inode(mdr, dn->dir); - assert(newi); - - // it's a symlink - newi->inode.mode &= ~S_IFMT; - newi->inode.mode |= S_IFLNK; - newi->symlink = req->get_path2(); - newi->inode.version = dn->pre_dirty() - 1; - - // prepare finisher - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "symlink"); - le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version()); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too - le->metablob.add_dir_context(dn->dir); - le->metablob.add_primary_dentry(dn, true, newi, &newi->inode); - - // log + wait - mdlog->submit_entry(le, new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv)); -} - - - - - -// LINK - -void Server::handle_client_link(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - dout(7) << "handle_client_link " << req->get_filepath() - << " to " << req->get_filepath2() - << dendl; - - // traverse to dest dir, make sure it's ours. 
- const filepath &linkpath = req->get_filepath(); - const string &dname = linkpath.last_dentry(); - vector linktrace; - CDir *dir = traverse_to_auth_dir(mdr, linktrace, linkpath); - if (!dir) return; - dout(7) << "handle_client_link link " << dname << " in " << *dir << dendl; - - // traverse to link target - filepath targetpath = req->get_filepath2(); - dout(7) << "handle_client_link discovering target " << targetpath << dendl; - vector targettrace; - int r = mdcache->path_traverse(mdr, req, - targetpath, targettrace, false, - MDS_TRAVERSE_DISCOVER); - if (r > 0) return; // wait - if (targettrace.empty()) r = -EINVAL; - if (r < 0) { - reply_request(mdr, r); - return; - } - - // identify target inode - CInode *targeti = targettrace[targettrace.size()-1]->inode; - assert(targeti); - - // dir? - dout(7) << "target is " << *targeti << dendl; - if (targeti->is_dir()) { - dout(7) << "target is a dir, failing..." << dendl; - reply_request(mdr, -EINVAL); - return; - } - - // get/make null link dentry - CDentry *dn = prepare_null_dentry(mdr, dir, dname, false); - if (!dn) return; - - // create lock lists - set rdlocks, wrlocks, xlocks; - - for (int i=0; i<(int)linktrace.size(); i++) - rdlocks.insert(&linktrace[i]->lock); - xlocks.insert(&dn->lock); - wrlocks.insert(&dn->dir->inode->dirlock); - for (int i=0; i<(int)targettrace.size(); i++) - rdlocks.insert(&targettrace[i]->lock); - xlocks.insert(&targeti->linklock); - - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - mdr->done_locking = true; // avoid wrlock moving target issues. - - // pick mtime - if (mdr->now == utime_t()) - mdr->now = g_clock.real_now(); - - // does the target need an anchor? 
- if (targeti->is_auth()) { - /*if (targeti->get_parent_dir() == dn->dir) { - dout(7) << "target is in the same dirfrag, sweet" << dendl; - } - else - */ - if (targeti->is_anchored() && !targeti->is_unanchoring()) { - dout(7) << "target anchored already (nlink=" << targeti->inode.nlink << "), sweet" << dendl; - } - else { - dout(7) << "target needs anchor, nlink=" << targeti->inode.nlink << ", creating anchor" << dendl; - - mdcache->anchor_create(mdr, targeti, - new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - } - - // go! - - // local or remote? - if (targeti->is_auth()) - _link_local(mdr, dn, targeti); - else - _link_remote(mdr, dn, targeti); -} - - -class C_MDS_link_local_finish : public Context { - MDS *mds; - MDRequest *mdr; - CDentry *dn; - CInode *targeti; - version_t dnpv; - version_t tipv; - version_t dirpv; -public: - C_MDS_link_local_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ti, - version_t dnpv_, version_t tipv_, version_t dirpv_) : - mds(m), mdr(r), dn(d), targeti(ti), - dnpv(dnpv_), tipv(tipv_), dirpv(dirpv_) { } - void finish(int r) { - assert(r == 0); - mds->server->_link_local_finish(mdr, dn, targeti, dnpv, tipv, dirpv); - } -}; - - -void Server::_link_local(MDRequest *mdr, CDentry *dn, CInode *targeti) -{ - dout(10) << "_link_local " << *dn << " to " << *targeti << dendl; - - mdr->ls = mdlog->get_current_segment(); - - // predirty NEW dentry - version_t dnpv = dn->pre_dirty(); - version_t tipv = targeti->pre_dirty(); - - // project inode update - inode_t *pi = targeti->project_inode(); - pi->nlink++; - pi->ctime = mdr->now; - pi->version = tipv; - - // log + wait - EUpdate *le = new EUpdate(mdlog, "link_local"); - le->metablob.add_client_req(mdr->reqid); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir inode's mtime - le->metablob.add_dir_context(dn->get_dir()); - le->metablob.add_remote_dentry(dn, true, targeti->ino()); // new remote - le->metablob.add_dir_context(targeti->get_parent_dir()); - 
le->metablob.add_primary_dentry(targeti->parent, true, targeti, pi); // update old primary - - mdlog->submit_entry(le, new C_MDS_link_local_finish(mds, mdr, dn, targeti, dnpv, tipv, dirpv)); -} - -void Server::_link_local_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, - version_t dnpv, version_t tipv, version_t dirpv) -{ - dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl; - - // link and unlock the NEW dentry - dn->dir->link_remote_inode(dn, targeti->ino(), MODE_TO_DT(targeti->inode.mode)); - dn->mark_dirty(dnpv, mdr->ls); - - // target inode - targeti->pop_and_dirty_projected_inode(mdr->ls); - - // new dentry dir mtime - dirty_dn_diri(mdr, dn, dirpv); - - // bump target popularity - mds->balancer->hit_inode(mdr->now, targeti, META_POP_IWR); - //mds->balancer->hit_dir(mdr->now, dn->get_dir(), META_POP_DWR); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply_request(mdr, reply, dn->get_dir()->get_inode()); // FIXME: imprecise ref -} - - -// remote - -class C_MDS_link_remote_finish : public Context { - MDS *mds; - MDRequest *mdr; - CDentry *dn; - CInode *targeti; - version_t dpv; - version_t dirpv; -public: - C_MDS_link_remote_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ti, version_t dirpv_) : - mds(m), mdr(r), dn(d), targeti(ti), - dpv(d->get_projected_version()), - dirpv(dirpv_) { } - void finish(int r) { - assert(r == 0); - mds->server->_link_remote_finish(mdr, dn, targeti, dpv, dirpv); - } -}; - -void Server::_link_remote(MDRequest *mdr, CDentry *dn, CInode *targeti) -{ - dout(10) << "_link_remote " << *dn << " to " << *targeti << dendl; - - // 1. 
send LinkPrepare to dest (journal nlink++ prepare) - int linkauth = targeti->authority().first; - if (mdr->more()->witnessed.count(linkauth) == 0) { - dout(10) << " targeti auth must prepare nlink++" << dendl; - - MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_LINKPREP); - targeti->set_object_info(req->get_object_info()); - req->now = mdr->now; - mds->send_message_mds(req, linkauth, MDS_PORT_SERVER); - - assert(mdr->more()->waiting_on_slave.count(linkauth) == 0); - mdr->more()->waiting_on_slave.insert(linkauth); - return; - } - dout(10) << " targeti auth has prepared nlink++" << dendl; - - // go. - // predirty dentry - dn->pre_dirty(); - - // add to event - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "link_remote"); - le->metablob.add_client_req(mdr->reqid); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir inode's mtime - le->metablob.add_dir_context(dn->get_dir()); - le->metablob.add_remote_dentry(dn, true, targeti->ino()); // new remote - - // mark committing (needed for proper recovery) - mdr->committing = true; - - // log + wait - mdlog->submit_entry(le, new C_MDS_link_remote_finish(mds, mdr, dn, targeti, dirpv)); -} - -void Server::_link_remote_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, - version_t dpv, version_t dirpv) -{ - dout(10) << "_link_remote_finish " << *dn << " to " << *targeti << dendl; - - // link the new dentry - dn->dir->link_remote_inode(dn, targeti->ino(), MODE_TO_DT(targeti->inode.mode)); - dn->mark_dirty(dpv, mdr->ls); - - // dir inode's mtime - dirty_dn_diri(mdr, dn, dirpv); - - // bump target popularity - mds->balancer->hit_inode(mdr->now, targeti, META_POP_IWR); - //mds->balancer->hit_dir(mdr->now, dn->get_dir(), META_POP_DWR); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply_request(mdr, reply, dn->get_dir()->get_inode()); // FIXME: imprecise ref -} - - -// remote linking/unlinking - -class C_MDS_SlaveLinkPrep 
: public Context { - Server *server; - MDRequest *mdr; - CInode *targeti; - utime_t old_ctime; - bool inc; -public: - C_MDS_SlaveLinkPrep(Server *s, MDRequest *r, CInode *t, utime_t oct, bool in) : - server(s), mdr(r), targeti(t), old_ctime(oct), inc(in) { } - void finish(int r) { - assert(r == 0); - server->_logged_slave_link(mdr, targeti, old_ctime, inc); - } -}; - -void Server::handle_slave_link_prep(MDRequest *mdr) -{ - dout(10) << "handle_slave_link_prep " << *mdr - << " on " << mdr->slave_request->get_object_info() - << dendl; - - CInode *targeti = mdcache->get_inode(mdr->slave_request->get_object_info().ino); - assert(targeti); - dout(10) << "targeti " << *targeti << dendl; - CDentry *dn = targeti->get_parent_dn(); - assert(dn->is_primary()); - - mdr->now = mdr->slave_request->now; - - // anchor? - if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) { - if (targeti->is_anchored() && !targeti->is_unanchoring()) { - dout(7) << "target anchored already (nlink=" << targeti->inode.nlink << "), sweet" << dendl; - } - else { - dout(7) << "target needs anchor, nlink=" << targeti->inode.nlink << ", creating anchor" << dendl; - mdcache->anchor_create(mdr, targeti, - new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - } - - // journal it - mdr->ls = mdlog->get_current_segment(); - ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_prep", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE); - - inode_t *pi = dn->inode->project_inode(); - - // rollback case - le->rollback.add_dir_context(targeti->get_parent_dir()); - le->rollback.add_primary_dentry(dn, true, targeti, pi); // update old primary - - // update journaled target inode - bool inc; - if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) { - inc = true; - pi->nlink++; - } else { - inc = false; - pi->nlink--; - } - utime_t old_ctime = pi->ctime; - pi->ctime = mdr->now; - pi->version = targeti->pre_dirty(); - - dout(10) << " projected inode " << pi << " v " << pi->version << 
dendl; - - // commit case - le->commit.add_dir_context(targeti->get_parent_dir()); - le->commit.add_primary_dentry(dn, true, targeti, pi); // update old primary - - mdlog->submit_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti, old_ctime, inc)); -} - -class C_MDS_SlaveLinkCommit : public Context { - Server *server; - MDRequest *mdr; - CInode *targeti; - utime_t old_ctime; - version_t old_version; - bool inc; -public: - C_MDS_SlaveLinkCommit(Server *s, MDRequest *r, CInode *t, utime_t oct, version_t ov, bool in) : - server(s), mdr(r), targeti(t), old_ctime(oct), old_version(ov), inc(in) { } - void finish(int r) { - server->_commit_slave_link(mdr, r, targeti, - old_ctime, old_version, inc); - } -}; - -void Server::_logged_slave_link(MDRequest *mdr, CInode *targeti, utime_t old_ctime, bool inc) -{ - dout(10) << "_logged_slave_link " << *mdr - << " inc=" << inc - << " " << *targeti << dendl; - - version_t old_version = targeti->inode.version; - - // update the target - targeti->pop_and_dirty_projected_inode(mdr->ls); - - // hit pop - mds->balancer->hit_inode(mdr->now, targeti, META_POP_IWR); - - // ack - MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_LINKPREPACK); - mds->send_message_mds(reply, mdr->slave_to_mds, MDS_PORT_SERVER); - - // set up commit waiter - mdr->more()->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti, old_ctime, old_version, inc); - - // done. 
- delete mdr->slave_request; - mdr->slave_request = 0; -} - - -void Server::_commit_slave_link(MDRequest *mdr, int r, CInode *targeti, - utime_t old_ctime, version_t old_version, bool inc) -{ - dout(10) << "_commit_slave_link " << *mdr - << " r=" << r - << " inc=" << inc - << " " << *targeti << dendl; - - ESlaveUpdate *le; - if (r == 0) { - // write a commit to the journal - le = new ESlaveUpdate(mdlog, "slave_link_commit", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT); - } else { - le = new ESlaveUpdate(mdlog, "slave_link_rollback", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_ROLLBACK); - - // -- rollback in memory -- - assert(targeti->inode.ctime == mdr->now); - assert(targeti->projected_inode.empty()); // we're holding the version lock. - - targeti->inode.ctime = old_ctime; - targeti->inode.version = old_version; - if (inc) - targeti->inode.nlink++; - else - targeti->inode.nlink--; - } - - mdlog->submit_entry(le); -} - - - -void Server::handle_slave_link_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m) -{ - dout(10) << "handle_slave_link_prep_ack " << *mdr - << " " << *m << dendl; - int from = m->get_source().num(); - - // note slave - mdr->more()->slaves.insert(from); - - // witnessed! - assert(mdr->more()->witnessed.count(from) == 0); - mdr->more()->witnessed.insert(from); - - // remove from waiting list - assert(mdr->more()->waiting_on_slave.count(from)); - mdr->more()->waiting_on_slave.erase(from); - - assert(mdr->more()->waiting_on_slave.empty()); - - dispatch_client_request(mdr); // go again! 
-} - - - - - -// UNLINK - -void Server::handle_client_unlink(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - // traverse to path - vector trace; - int r = mdcache->path_traverse(mdr, req, - req->get_filepath(), trace, false, - MDS_TRAVERSE_FORWARD); - if (r > 0) return; - if (trace.empty()) r = -EINVAL; // can't unlink root - if (r < 0) { - reply_request(mdr, r); - return; - } - - CDentry *dn = trace[trace.size()-1]; - assert(dn); - - // is it my dentry? - if (!dn->is_auth()) { - // fw to auth - mdcache->request_forward(mdr, dn->authority().first); - return; - } - - // rmdir or unlink? - bool rmdir = false; - if (req->get_op() == MDS_OP_RMDIR) rmdir = true; - - if (rmdir) { - dout(7) << "handle_client_rmdir on " << *dn << dendl; - } else { - dout(7) << "handle_client_unlink on " << *dn << dendl; - } - - // readable? - if (dn->lock.is_xlocked_by_other(mdr)) { - dout(10) << "waiting on xlocked dentry " << *dn << dendl; - dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - - // dn looks ok. - - // get/open inode. - mdr->trace.swap(trace); - CInode *in = mdcache->get_dentry_inode(dn, mdr); - if (!in) return; - dout(7) << "dn links to " << *in << dendl; - - // rmdir vs is_dir - if (in->is_dir()) { - if (rmdir) { - // do empty directory checks - if (!_verify_rmdir(mdr, in)) - return; - } else { - dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl; - reply_request(mdr, -EISDIR); - return; - } - } else { - if (rmdir) { - // unlink - dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl; - reply_request(mdr, -ENOTDIR); - return; - } - } - - // lock - set rdlocks, wrlocks, xlocks; - - for (int i=0; i<(int)trace.size()-1; i++) - rdlocks.insert(&trace[i]->lock); - xlocks.insert(&dn->lock); - wrlocks.insert(&dn->dir->inode->dirlock); - xlocks.insert(&in->linklock); - - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - // yay! 
- mdr->done_locking = true; // avoid wrlock racing - if (mdr->now == utime_t()) - mdr->now = g_clock.real_now(); - - // get stray dn ready? - CDentry *straydn = 0; - if (dn->is_primary()) { - straydn = mdcache->get_or_create_stray_dentry(dn->inode); - mdr->pin(straydn); // pin it. - dout(10) << " straydn is " << *straydn << dendl; - assert(straydn->is_null()); - - if (!mdr->more()->dst_reanchor_atid && - dn->inode->is_anchored()) { - dout(10) << "reanchoring to stray " << *dn->inode << dendl; - vector trace; - straydn->make_anchor_trace(trace, dn->inode); - mds->anchorclient->prepare_update(dn->inode->ino(), trace, &mdr->more()->dst_reanchor_atid, - new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - } - - // ok! - if (dn->is_remote() && !dn->inode->is_auth()) - _unlink_remote(mdr, dn); - else - _unlink_local(mdr, dn, straydn); -} - - - -class C_MDS_unlink_local_finish : public Context { - MDS *mds; - MDRequest *mdr; - CDentry *dn; - CDentry *straydn; - version_t dnpv; // deleted dentry - version_t dirpv; -public: - C_MDS_unlink_local_finish(MDS *m, MDRequest *r, CDentry *d, CDentry *sd, - version_t dirpv_) : - mds(m), mdr(r), dn(d), straydn(sd), - dnpv(d->get_projected_version()), dirpv(dirpv_) { } - void finish(int r) { - assert(r == 0); - mds->server->_unlink_local_finish(mdr, dn, straydn, dnpv, dirpv); - } -}; - - -void Server::_unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn) -{ - dout(10) << "_unlink_local " << *dn << dendl; - - // ok, let's do it. - mdr->ls = mdlog->get_current_segment(); - - // prepare log entry - EUpdate *le = new EUpdate(mdlog, "unlink_local"); - le->metablob.add_client_req(mdr->reqid); - - version_t ipv = 0; // dirty inode version - inode_t *ji = 0; // journaled projected inode - if (dn->is_primary()) { - // primary link. add stray dentry. 
- assert(straydn); - ipv = straydn->pre_dirty(dn->inode->inode.version); - le->metablob.add_dir_context(straydn->dir); - ji = le->metablob.add_primary_dentry(straydn, true, dn->inode); - } else { - // remote link. update remote inode. - ipv = dn->inode->pre_dirty(); - le->metablob.add_dir_context(dn->inode->get_parent_dir()); - ji = le->metablob.add_primary_dentry(dn->inode->parent, true, dn->inode); - } - - // update journaled target inode - inode_t *pi = dn->inode->project_inode(); - pi->nlink--; - pi->ctime = mdr->now; - pi->version = ipv; - *ji = *pi; // copy into journal - - // the unlinked dentry - dn->pre_dirty(); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); - le->metablob.add_dir_context(dn->get_dir()); - le->metablob.add_null_dentry(dn, true); - - if (mdr->more()->dst_reanchor_atid) - le->metablob.add_anchor_transaction(mdr->more()->dst_reanchor_atid); - - // log + wait - journal_opens(); // journal pending opens, just in case - mdlog->submit_entry(le, new C_MDS_unlink_local_finish(mds, mdr, dn, straydn, - dirpv)); -} - -void Server::_unlink_local_finish(MDRequest *mdr, - CDentry *dn, CDentry *straydn, - version_t dnpv, version_t dirpv) -{ - dout(10) << "_unlink_local_finish " << *dn << dendl; - - // unlink main dentry - CInode *in = dn->inode; - dn->dir->unlink_inode(dn); - - // relink as stray? (i.e. was primary link?) 
- if (straydn) { - dout(20) << " straydn is " << *straydn << dendl; - straydn->dir->link_primary_inode(straydn, in); - } - - // nlink--, dirty old dentry - in->pop_and_dirty_projected_inode(mdr->ls); - dn->mark_dirty(dnpv, mdr->ls); - - // dir inode's mtime - dirty_dn_diri(mdr, dn, dirpv); - - // share unlink news with replicas - for (map::iterator it = dn->replicas_begin(); - it != dn->replicas_end(); - it++) { - dout(7) << "_unlink_local_finish sending MDentryUnlink to mds" << it->first << dendl; - MDentryUnlink *unlink = new MDentryUnlink(dn->dir->dirfrag(), dn->name); - if (straydn) { - unlink->strayin = straydn->dir->inode->replicate_to(it->first); - unlink->straydir = straydn->dir->replicate_to(it->first); - unlink->straydn = straydn->replicate_to(it->first); - } - mds->send_message_mds(unlink, it->first, MDS_PORT_CACHE); - } - - // commit anchor update? - if (mdr->more()->dst_reanchor_atid) - mds->anchorclient->commit(mdr->more()->dst_reanchor_atid, mdr->ls); - - // bump pop - //mds->balancer->hit_dir(mdr->now, dn->dir, META_POP_DWR); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply_request(mdr, reply, dn->dir->get_inode()); // FIXME: imprecise ref - - // clean up? - if (straydn) - mdcache->eval_stray(straydn); - - // removing a new dn? - dn->dir->try_remove_unlinked_dn(dn); -} - - -class C_MDS_unlink_remote_finish : public Context { - MDS *mds; - MDRequest *mdr; - CDentry *dn; - version_t dnpv; // deleted dentry - version_t dirpv; -public: - C_MDS_unlink_remote_finish(MDS *m, MDRequest *r, CDentry *d, - version_t dirpv_) : - mds(m), mdr(r), dn(d), - dnpv(d->get_projected_version()), dirpv(dirpv_) { } - void finish(int r) { - assert(r == 0); - mds->server->_unlink_remote_finish(mdr, dn, dnpv, dirpv); - } -}; - -void Server::_unlink_remote(MDRequest *mdr, CDentry *dn) -{ - dout(10) << "_unlink_remote " << *dn << " " << *dn->inode << dendl; - - // 1. 
send LinkPrepare to dest (journal nlink-- prepare) - int inauth = dn->inode->authority().first; - if (mdr->more()->witnessed.count(inauth) == 0) { - dout(10) << " inode auth must prepare nlink--" << dendl; - - MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_UNLINKPREP); - dn->inode->set_object_info(req->get_object_info()); - req->now = mdr->now; - mds->send_message_mds(req, inauth, MDS_PORT_SERVER); - - assert(mdr->more()->waiting_on_slave.count(inauth) == 0); - mdr->more()->waiting_on_slave.insert(inauth); - return; - } - dout(10) << " inode auth has prepared nlink--" << dendl; - - // ok, let's do it. - // prepare log entry - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "unlink_remote"); - le->metablob.add_client_req(mdr->reqid); - - // the unlinked dentry - dn->pre_dirty(); - version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); - le->metablob.add_dir_context(dn->get_dir()); - le->metablob.add_null_dentry(dn, true); - - if (mdr->more()->dst_reanchor_atid) - le->metablob.add_anchor_transaction(mdr->more()->dst_reanchor_atid); - - // finisher - C_MDS_unlink_remote_finish *fin = new C_MDS_unlink_remote_finish(mds, mdr, dn, dirpv); - - journal_opens(); // journal pending opens, just in case - - // mark committing (needed for proper recovery) - mdr->committing = true; - - // log + wait - mdlog->submit_entry(le, fin); -} - -void Server::_unlink_remote_finish(MDRequest *mdr, - CDentry *dn, - version_t dnpv, version_t dirpv) -{ - dout(10) << "_unlink_remote_finish " << *dn << dendl; - - // unlink main dentry - dn->dir->unlink_inode(dn); - dn->mark_dirty(dnpv, mdr->ls); // dirty old dentry - - // dir inode's mtime - dirty_dn_diri(mdr, dn, dirpv); - - // share unlink news with replicas - for (map::iterator it = dn->replicas_begin(); - it != dn->replicas_end(); - it++) { - dout(7) << "_unlink_remote_finish sending MDentryUnlink to mds" << it->first << dendl; - MDentryUnlink *unlink = new 
MDentryUnlink(dn->dir->dirfrag(), dn->name); - mds->send_message_mds(unlink, it->first, MDS_PORT_CACHE); - } - - // commit anchor update? - if (mdr->more()->dst_reanchor_atid) - mds->anchorclient->commit(mdr->more()->dst_reanchor_atid, mdr->ls); - - //mds->balancer->hit_dir(mdr->now, dn->dir, META_POP_DWR); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply_request(mdr, reply, dn->dir->get_inode()); // FIXME: imprecise ref - - // removing a new dn? - dn->dir->try_remove_unlinked_dn(dn); -} - - - - -/** _verify_rmdir - * - * verify that a directory is empty (i.e. we can rmdir it), - * and make sure it is part of the same subtree (i.e. local) - * so that rmdir will occur locally. - * - * @param in is the inode being rmdir'd. - */ -bool Server::_verify_rmdir(MDRequest *mdr, CInode *in) -{ - dout(10) << "_verify_rmdir " << *in << dendl; - assert(in->is_auth()); - - list frags; - in->dirfragtree.get_leaves(frags); - - for (list::iterator p = frags.begin(); - p != frags.end(); - ++p) { - CDir *dir = in->get_or_open_dirfrag(mdcache, *p); - assert(dir); - - // dir looks empty but incomplete? - if (dir->is_auth() && - dir->get_size() == 0 && - !dir->is_complete()) { - dout(7) << "_verify_rmdir fetching incomplete dir " << *dir << dendl; - dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - - // does the frag _look_ empty? - if (dir->get_size()) { - dout(10) << "_verify_rmdir still " << dir->get_size() << " items in frag " << *dir << dendl; - reply_request(mdr, -ENOTEMPTY); - return false; - } - - // not dir auth? - if (!dir->is_auth()) { - dout(10) << "_verify_rmdir not auth for " << *dir << ", FIXME BUG" << dendl; - reply_request(mdr, -ENOTEMPTY); - return false; - } - } - - return true; -} -/* - // export sanity check - if (!in->is_auth()) { - // i should be exporting this now/soon, since the dir is empty. - dout(7) << "handle_client_rmdir dir is auth, but not inode." 
<< dendl; - mdcache->migrator->export_empty_import(in->dir); - in->dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mds, req, diri)); - return; - } -*/ - - - - -// ====================================================== - - -class C_MDS_rename_finish : public Context { - MDS *mds; - MDRequest *mdr; - CDentry *srcdn; - CDentry *destdn; - CDentry *straydn; -public: - C_MDS_rename_finish(MDS *m, MDRequest *r, - CDentry *sdn, CDentry *ddn, CDentry *stdn) : - mds(m), mdr(r), - srcdn(sdn), destdn(ddn), straydn(stdn) { } - void finish(int r) { - assert(r == 0); - mds->server->_rename_finish(mdr, srcdn, destdn, straydn); - } -}; - - -/** handle_client_rename - * - */ -void Server::handle_client_rename(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - dout(7) << "handle_client_rename " << *req << dendl; - - // traverse to dest dir (not dest) - // we do this FIRST, because the rename should occur on the - // destdn's auth. - const filepath &destpath = req->get_filepath2(); - const string &destname = destpath.last_dentry(); - vector desttrace; - CDir *destdir = traverse_to_auth_dir(mdr, desttrace, destpath); - if (!destdir) return; // fw or error out - dout(10) << "dest will be " << destname << " in " << *destdir << dendl; - assert(destdir->is_auth()); - - // traverse to src - filepath srcpath = req->get_filepath(); - vector srctrace; - int r = mdcache->path_traverse(mdr, req, - srcpath, srctrace, false, - MDS_TRAVERSE_DISCOVER); - if (r > 0) return; - if (srctrace.empty()) r = -EINVAL; // can't rename root - if (r < 0) { - reply_request(mdr, r); - return; - } - CDentry *srcdn = srctrace[srctrace.size()-1]; - dout(10) << " srcdn " << *srcdn << dendl; - CInode *srci = mdcache->get_dentry_inode(srcdn, mdr); - dout(10) << " srci " << *srci << dendl; - mdr->pin(srci); - - // -- some sanity checks -- - // src == dest? 
- if (srcdn->get_dir() == destdir && srcdn->name == destname) { - dout(7) << "rename src=dest, noop" << dendl; - reply_request(mdr, 0); - return; - } - - // dest a child of src? - // e.g. mv /usr /usr/foo - CDentry *pdn = destdir->inode->parent; - while (pdn) { - if (pdn == srcdn) { - dout(7) << "cannot rename item to be a child of itself" << dendl; - reply_request(mdr, -EINVAL); - return; - } - pdn = pdn->dir->inode->parent; - } - - - // identify/create dest dentry - CDentry *destdn = destdir->lookup(destname); - if (destdn && destdn->lock.is_xlocked_by_other(mdr)) { - destdn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - - CInode *oldin = 0; - if (destdn && !destdn->is_null()) { - //dout(10) << "dest dn exists " << *destdn << dendl; - oldin = mdcache->get_dentry_inode(destdn, mdr); - if (!oldin) return; - dout(10) << " oldin " << *oldin << dendl; - - // mv /some/thing /to/some/existing_other_thing - if (oldin->is_dir() && !srci->is_dir()) { - reply_request(mdr, -EISDIR); - return; - } - if (!oldin->is_dir() && srci->is_dir()) { - reply_request(mdr, -ENOTDIR); - return; - } - - // non-empty dir? - if (oldin->is_dir() && !_verify_rmdir(mdr, oldin)) - return; - } - if (!destdn) { - // mv /some/thing /to/some/non_existent_name - destdn = prepare_null_dentry(mdr, destdir, destname); - if (!destdn) return; - } - - dout(10) << " destdn " << *destdn << dendl; - - // -- locks -- - set rdlocks, wrlocks, xlocks; - - // rdlock sourcedir path, xlock src dentry - for (int i=0; i<(int)srctrace.size()-1; i++) - rdlocks.insert(&srctrace[i]->lock); - xlocks.insert(&srcdn->lock); - wrlocks.insert(&srcdn->dir->inode->dirlock); - /* - * no, this causes problems if the dftlock is scattered... - * and what was i thinking anyway? - * rdlocks.insert(&srcdn->dir->inode->dirfragtreelock); // rd lock on srci dirfragtree. 
- */ - - // rdlock destdir path, xlock dest dentry - for (int i=0; i<(int)desttrace.size(); i++) - rdlocks.insert(&desttrace[i]->lock); - xlocks.insert(&destdn->lock); - wrlocks.insert(&destdn->dir->inode->dirlock); - - // xlock versionlock on srci if remote? - // this ensures it gets safely remotely auth_pinned, avoiding deadlock; - // strictly speaking, having the slave node freeze the inode is - // otherwise sufficient for avoiding conflicts with inode locks, etc. - if (!srcdn->is_auth() && srcdn->is_primary()) - xlocks.insert(&srcdn->inode->versionlock); - - // xlock oldin (for nlink--) - if (oldin) xlocks.insert(&oldin->linklock); - - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - // set done_locking flag, to avoid problems with wrlock moving auth target - mdr->done_locking = true; - - // -- open all srcdn inode frags, if any -- - // we need these open so that auth can properly delegate from inode to dirfrags - // after the inode is _ours_. - if (srcdn->is_primary() && - !srcdn->is_auth() && - srci->is_dir()) { - dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl; - mdr->set_stickydirs(srci); - - list frags; - srci->dirfragtree.get_leaves(frags); - for (list::iterator p = frags.begin(); - p != frags.end(); - ++p) { - CDir *dir = srci->get_dirfrag(*p); - if (!dir) { - dout(10) << " opening " << *p << " under " << *srci << dendl; - mdcache->open_remote_dirfrag(srci, *p, new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - } - } - - // -- declare now -- - if (mdr->now == utime_t()) - mdr->now = g_clock.real_now(); - - // -- create stray dentry? -- - CDentry *straydn = 0; - if (destdn->is_primary()) { - straydn = mdcache->get_or_create_stray_dentry(destdn->inode); - mdr->pin(straydn); - dout(10) << "straydn is " << *straydn << dendl; - } - - // -- prepare witnesses -- - /* - * NOTE: we use _all_ replicas as witnesses. 
- * this probably isn't totally necessary (esp for file renames), - * but if/when we change that, we have to make sure rejoin is - * sufficiently robust to handle strong rejoins from survivors - * with totally wrong dentry->inode linkage. - * (currently, it can ignore rename effects, because the resolve - * stage will sort them out.) - */ - set witnesses = mdr->more()->extra_witnesses; - if (srcdn->is_auth()) - srcdn->list_replicas(witnesses); - else - witnesses.insert(srcdn->authority().first); - destdn->list_replicas(witnesses); - dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl; - - // do srcdn auth last - int last = -1; - if (!srcdn->is_auth()) - last = srcdn->authority().first; - - for (set::iterator p = witnesses.begin(); - p != witnesses.end(); - ++p) { - if (*p == last) continue; // do it last! - if (mdr->more()->witnessed.count(*p)) { - dout(10) << " already witnessed by mds" << *p << dendl; - } else if (mdr->more()->waiting_on_slave.count(*p)) { - dout(10) << " already waiting on witness mds" << *p << dendl; - } else { - _rename_prepare_witness(mdr, *p, srcdn, destdn, straydn); - } - } - if (!mdr->more()->waiting_on_slave.empty()) - return; // we're waiting for a witness. 
- - if (last >= 0 && - mdr->more()->witnessed.count(last) == 0 && - mdr->more()->waiting_on_slave.count(last) == 0) { - dout(10) << " preparing last witness (srcdn auth)" << dendl; - _rename_prepare_witness(mdr, last, srcdn, destdn, straydn); - return; - } - - // -- prepare anchor updates -- - bool linkmerge = (srcdn->inode == destdn->inode && - (srcdn->is_primary() || destdn->is_primary())); - - if (!linkmerge) { - C_Gather *anchorgather = 0; - - if (srcdn->is_primary() && srcdn->inode->is_anchored() && - srcdn->dir != destdn->dir && - !mdr->more()->src_reanchor_atid) { - dout(10) << "reanchoring src->dst " << *srcdn->inode << dendl; - vector trace; - destdn->make_anchor_trace(trace, srcdn->inode); - - anchorgather = new C_Gather(new C_MDS_RetryRequest(mdcache, mdr)); - mds->anchorclient->prepare_update(srcdn->inode->ino(), trace, &mdr->more()->src_reanchor_atid, - anchorgather->new_sub()); - } - if (destdn->is_primary() && - destdn->inode->is_anchored() && - !mdr->more()->dst_reanchor_atid) { - dout(10) << "reanchoring dst->stray " << *destdn->inode << dendl; - - assert(straydn); - vector trace; - straydn->make_anchor_trace(trace, destdn->inode); - - if (!anchorgather) - anchorgather = new C_Gather(new C_MDS_RetryRequest(mdcache, mdr)); - mds->anchorclient->prepare_update(destdn->inode->ino(), trace, &mdr->more()->dst_reanchor_atid, - anchorgather->new_sub()); - } - - if (anchorgather) - return; // waiting for anchor prepares - } - - // -- prepare journal entry -- - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "rename"); - le->metablob.add_client_req(mdr->reqid); - - _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, straydn); - - if (!srcdn->is_auth() && srcdn->is_primary()) { - // importing inode; also journal imported client map - - // ** DER FIXME ** - } - - // -- commit locally -- - C_MDS_rename_finish *fin = new C_MDS_rename_finish(mds, mdr, srcdn, destdn, straydn); - - journal_opens(); // journal pending 
opens, just in case - - // mark committing (needed for proper recovery) - mdr->committing = true; - - // log + wait - mdlog->submit_entry(le, fin); -} - - -void Server::_rename_finish(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn) -{ - dout(10) << "_rename_finish " << *mdr << dendl; - - // apply - _rename_apply(mdr, srcdn, destdn, straydn); - - // commit anchor updates? - if (mdr->more()->src_reanchor_atid) - mds->anchorclient->commit(mdr->more()->src_reanchor_atid, mdr->ls); - if (mdr->more()->dst_reanchor_atid) - mds->anchorclient->commit(mdr->more()->dst_reanchor_atid, mdr->ls); - - // bump popularity - //if (srcdn->is_auth()) - //mds->balancer->hit_dir(mdr->now, srcdn->get_dir(), META_POP_DWR); - // mds->balancer->hit_dir(mdr->now, destdn->get_dir(), META_POP_DWR); - if (destdn->is_remote() && - destdn->inode->is_auth()) - mds->balancer->hit_inode(mdr->now, destdn->get_inode(), META_POP_IWR); - - // reply - MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply_request(mdr, reply, destdn->get_inode()); // FIXME: imprecise ref - - // clean up? 
- if (straydn) - mdcache->eval_stray(straydn); -} - - - -// helpers - -void Server::_rename_prepare_witness(MDRequest *mdr, int who, CDentry *srcdn, CDentry *destdn, CDentry *straydn) -{ - dout(10) << "_rename_prepare_witness mds" << who << dendl; - MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREP); - srcdn->make_path(req->srcdnpath); - destdn->make_path(req->destdnpath); - req->now = mdr->now; - - if (straydn) { - CInodeDiscover *indis = straydn->dir->inode->replicate_to(who); - CDirDiscover *dirdis = straydn->dir->replicate_to(who); - CDentryDiscover *dndis = straydn->replicate_to(who); - indis->_encode(req->stray); - dirdis->_encode(req->stray); - dndis->_encode(req->stray); - delete indis; - delete dirdis; - delete dndis; - } - - // srcdn auth will verify our current witness list is sufficient - req->witnesses = mdr->more()->witnessed; - - mds->send_message_mds(req, who, MDS_PORT_SERVER); - - assert(mdr->more()->waiting_on_slave.count(who) == 0); - mdr->more()->waiting_on_slave.insert(who); -} - - -void Server::_rename_prepare(MDRequest *mdr, - EMetaBlob *metablob, bufferlist *client_map_bl, - CDentry *srcdn, CDentry *destdn, CDentry *straydn) -{ - dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl; - - // primary+remote link merge? 
- bool linkmerge = (srcdn->inode == destdn->inode && - (srcdn->is_primary() || destdn->is_primary())); - - if (mdr->is_master()) { - mdr->more()->pvmap[destdn->dir->inode] = predirty_dn_diri(mdr, destdn, metablob); - if (destdn->dir != srcdn->dir) - mdr->more()->pvmap[srcdn->dir->inode] = predirty_dn_diri(mdr, srcdn, metablob); - } - - inode_t *ji = 0; // journaled inode getting nlink-- - version_t ipv = 0; // it's version - - if (linkmerge) { - dout(10) << "will merge remote+primary links" << dendl; - - // destdn -> primary - metablob->add_dir_context(destdn->dir); - if (destdn->is_auth()) - ipv = mdr->more()->pvmap[destdn] = destdn->pre_dirty(destdn->inode->inode.version); - ji = metablob->add_primary_dentry(destdn, true, destdn->inode); - - // do src dentry - metablob->add_dir_context(srcdn->dir); - if (srcdn->is_auth()) - mdr->more()->pvmap[srcdn] = srcdn->pre_dirty(); - metablob->add_null_dentry(srcdn, true); - - } else { - // move to stray? - if (destdn->is_primary()) { - // primary. we'll move inode to stray dir. - assert(straydn); - - // link-- inode, move to stray dir. - metablob->add_dir_context(straydn->dir); - if (straydn->is_auth()) - ipv = mdr->more()->pvmap[straydn] = straydn->pre_dirty(destdn->inode->inode.version); - ji = metablob->add_primary_dentry(straydn, true, destdn->inode); - } - else if (destdn->is_remote()) { - // remote. 
- // nlink-- targeti - metablob->add_dir_context(destdn->inode->get_parent_dir()); - if (destdn->inode->is_auth()) - ipv = mdr->more()->pvmap[destdn->inode] = destdn->inode->pre_dirty(); - ji = metablob->add_primary_dentry(destdn->inode->parent, true, destdn->inode); // update primary - dout(10) << "remote targeti (nlink--) is " << *destdn->inode << dendl; - } - else { - assert(destdn->is_null()); - } - - // add dest dentry - metablob->add_dir_context(destdn->dir); - if (srcdn->is_primary()) { - dout(10) << "src is a primary dentry" << dendl; - if (destdn->is_auth()) { - version_t siv; - if (srcdn->is_auth()) - siv = srcdn->inode->get_projected_version(); - else { - siv = mdr->more()->inode_import_v; - - /* import node */ - bufferlist::iterator blp = mdr->more()->inode_import.begin(); - - // imported caps - ::_decode_simple(mdr->more()->imported_client_map, blp); - ::_encode_simple(mdr->more()->imported_client_map, *client_map_bl); - prepare_force_open_sessions(mdr->more()->imported_client_map); - - list updated_scatterlocks; // we clear_updated explicitly below - - mdcache->migrator->decode_import_inode(srcdn, blp, - srcdn->authority().first, - mdr->ls, - mdr->more()->cap_imports, updated_scatterlocks); - srcdn->inode->dirlock.clear_updated(); - - - // hack: force back to !auth and clean, temporarily - srcdn->inode->state_clear(CInode::STATE_AUTH); - srcdn->inode->mark_clean(); - } - mdr->more()->pvmap[destdn] = destdn->pre_dirty(siv+1); - } - metablob->add_primary_dentry(destdn, true, srcdn->inode); - - } else { - assert(srcdn->is_remote()); - dout(10) << "src is a remote dentry" << dendl; - if (destdn->is_auth()) - mdr->more()->pvmap[destdn] = destdn->pre_dirty(); - metablob->add_remote_dentry(destdn, true, srcdn->get_remote_ino()); - } - - // remove src dentry - metablob->add_dir_context(srcdn->dir); - if (srcdn->is_auth()) - mdr->more()->pvmap[srcdn] = srcdn->pre_dirty(); - metablob->add_null_dentry(srcdn, true); - - // new subtree? 
- if (srcdn->is_primary() && - srcdn->inode->is_dir()) { - list ls; - srcdn->inode->get_nested_dirfrags(ls); - int auth = srcdn->authority().first; - for (list::iterator p = ls.begin(); p != ls.end(); ++p) - mdcache->adjust_subtree_auth(*p, auth, auth); - } - } - - if (ipv) { - // update journaled target inode - inode_t *pi = destdn->inode->project_inode(); - pi->nlink--; - pi->ctime = mdr->now; - pi->version = ipv; - *ji = *pi; // copy into journal - } - - // anchor updates? - if (mdr->more()->src_reanchor_atid) - metablob->add_anchor_transaction(mdr->more()->src_reanchor_atid); - if (mdr->more()->dst_reanchor_atid) - metablob->add_anchor_transaction(mdr->more()->dst_reanchor_atid); -} - - -void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn) -{ - dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl; - dout(10) << " pvs " << mdr->more()->pvmap << dendl; - - CInode *oldin = destdn->inode; - - // primary+remote link merge? 
- bool linkmerge = (srcdn->inode == destdn->inode && - (srcdn->is_primary() || destdn->is_primary())); - - // dir mtimes - if (mdr->is_master()) { - dirty_dn_diri(mdr, destdn, mdr->more()->pvmap[destdn->dir->inode]); - if (destdn->dir != srcdn->dir) - dirty_dn_diri(mdr, srcdn, mdr->more()->pvmap[srcdn->dir->inode]); - } - - if (linkmerge) { - if (destdn->is_primary()) { - dout(10) << "merging remote onto primary link" << dendl; - - // nlink-- in place - destdn->inode->inode.nlink--; - destdn->inode->inode.ctime = mdr->now; - if (destdn->inode->is_auth()) - destdn->inode->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls); - - // unlink srcdn - srcdn->dir->unlink_inode(srcdn); - if (srcdn->is_auth()) - srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls); - } else { - dout(10) << "merging primary onto remote link" << dendl; - assert(srcdn->is_primary()); - - // move inode to dest - srcdn->dir->unlink_inode(srcdn); - destdn->dir->unlink_inode(destdn); - destdn->dir->link_primary_inode(destdn, oldin); - - // nlink-- - destdn->inode->inode.nlink--; - destdn->inode->inode.ctime = mdr->now; - if (destdn->inode->is_auth()) - destdn->inode->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls); - - // mark src dirty - if (srcdn->is_auth()) - srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls); - } - } - else { - // unlink destdn? - if (!destdn->is_null()) - destdn->dir->unlink_inode(destdn); - - if (straydn) { - dout(10) << "straydn is " << *straydn << dendl; - - // relink oldin to stray dir. destdn was primary. - assert(oldin); - straydn->dir->link_primary_inode(straydn, oldin); - //assert(straypv == ipv); - - // nlink-- in stray dir. - oldin->inode.nlink--; - oldin->inode.ctime = mdr->now; - if (oldin->is_auth()) - oldin->pop_and_dirty_projected_inode(mdr->ls); - } - else if (oldin) { - // nlink-- remote. destdn was remote. 
- oldin->inode.nlink--; - oldin->inode.ctime = mdr->now; - if (oldin->is_auth()) - oldin->pop_and_dirty_projected_inode(mdr->ls); - } - - CInode *in = srcdn->inode; - assert(in); - if (srcdn->is_remote()) { - // srcdn was remote. - srcdn->dir->unlink_inode(srcdn); - destdn->dir->link_remote_inode(destdn, in->ino(), MODE_TO_DT(in->inode.mode)); - destdn->link_remote(in); - if (destdn->is_auth()) - destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls); - } else { - // srcdn was primary. - srcdn->dir->unlink_inode(srcdn); - destdn->dir->link_primary_inode(destdn, in); - - // srcdn inode import? - if (!srcdn->is_auth() && destdn->is_auth()) { - assert(mdr->more()->inode_import.length() > 0); - - // finish cap imports - finish_force_open_sessions(mdr->more()->imported_client_map); - if (mdr->more()->cap_imports.count(destdn->inode)) - mds->mdcache->migrator->finish_import_inode_caps(destdn->inode, srcdn->authority().first, - mdr->more()->cap_imports[destdn->inode]); - - // hack: fix auth bit - destdn->inode->state_set(CInode::STATE_AUTH); - } - if (destdn->inode->is_auth()) - destdn->inode->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls); - } - - if (srcdn->is_auth()) - srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls); - } - - // update subtree map? - if (destdn->is_primary() && destdn->inode->is_dir()) - mdcache->adjust_subtree_after_rename(destdn->inode, srcdn->dir); - - // removing a new dn? 
- if (srcdn->is_auth()) - srcdn->dir->try_remove_unlinked_dn(srcdn); -} - - - - - -// ------------ -// SLAVE - -class C_MDS_SlaveRenamePrep : public Context { - Server *server; - MDRequest *mdr; - CDentry *srcdn, *destdn, *straydn; -public: - C_MDS_SlaveRenamePrep(Server *s, MDRequest *m, CDentry *sr, CDentry *de, CDentry *st) : - server(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {} - void finish(int r) { - server->_logged_slave_rename(mdr, srcdn, destdn, straydn); - } -}; - -class C_MDS_SlaveRenameCommit : public Context { - Server *server; - MDRequest *mdr; - CDentry *srcdn, *destdn, *straydn; -public: - C_MDS_SlaveRenameCommit(Server *s, MDRequest *m, CDentry *sr, CDentry *de, CDentry *st) : - server(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {} - void finish(int r) { - server->_commit_slave_rename(mdr, r, srcdn, destdn, straydn); - } -}; - -void Server::handle_slave_rename_prep(MDRequest *mdr) -{ - dout(10) << "handle_slave_rename_prep " << *mdr - << " " << mdr->slave_request->srcdnpath - << " to " << mdr->slave_request->destdnpath - << dendl; - - // discover destdn - filepath destpath(mdr->slave_request->destdnpath); - dout(10) << " dest " << destpath << dendl; - vector trace; - int r = mdcache->path_traverse(mdr, mdr->slave_request, - destpath, trace, false, - MDS_TRAVERSE_DISCOVERXLOCK); - if (r > 0) return; - assert(r == 0); // we shouldn't get an error here! - - CDentry *destdn = trace[trace.size()-1]; - dout(10) << " destdn " << *destdn << dendl; - mdr->pin(destdn); - - // discover srcdn - filepath srcpath(mdr->slave_request->srcdnpath); - dout(10) << " src " << srcpath << dendl; - r = mdcache->path_traverse(mdr, mdr->slave_request, - srcpath, trace, false, - MDS_TRAVERSE_DISCOVERXLOCK); - if (r > 0) return; - assert(r == 0); - - CDentry *srcdn = trace[trace.size()-1]; - dout(10) << " srcdn " << *srcdn << dendl; - mdr->pin(srcdn); - assert(srcdn->inode); - mdr->pin(srcdn->inode); - - // stray? 
- CDentry *straydn = 0; - if (destdn->is_primary()) { - assert(mdr->slave_request->stray.length() > 0); - straydn = mdcache->add_replica_stray(mdr->slave_request->stray, - destdn->inode, mdr->slave_to_mds); - assert(straydn); - mdr->pin(straydn); - } - - mdr->now = mdr->slave_request->now; - - // set up commit waiter (early, to clean up any freezing etc we do) - if (!mdr->more()->slave_commit) - mdr->more()->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn); - - // am i srcdn auth? - if (srcdn->is_auth()) { - if (srcdn->is_primary() && - !srcdn->inode->is_freezing_inode() && - !srcdn->inode->is_frozen_inode()) { - // srci auth. - // set ambiguous auth. - srcdn->inode->state_set(CInode::STATE_AMBIGUOUSAUTH); - - // freeze? - // we need this to - // - avoid conflicting lock state changes - // - avoid concurrent updates to the inode - // (this could also be accomplished with the versionlock) - int allowance = 1; // for the versionlock and possible linklock xlock (both are tied to mdr) - dout(10) << " freezing srci " << *srcdn->inode << " with allowance " << allowance << dendl; - if (!srcdn->inode->freeze_inode(allowance)) { - srcdn->inode->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr)); - return; - } - } - - // is witness list sufficient? - set srcdnrep; - srcdn->list_replicas(srcdnrep); - for (set::iterator p = srcdnrep.begin(); - p != srcdnrep.end(); - ++p) { - if (*p == mdr->slave_to_mds || - mdr->slave_request->witnesses.count(*p)) continue; - dout(10) << " witness list insufficient; providing srcdn replica list" << dendl; - MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREPACK); - reply->witnesses.swap(srcdnrep); - mds->send_message_mds(reply, mdr->slave_to_mds, MDS_PORT_SERVER); - delete mdr->slave_request; - mdr->slave_request = 0; - return; - } - dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl; - } - - // journal it? 
- if (srcdn->is_auth() || - (destdn->inode && destdn->inode->is_auth()) || - srcdn->inode->is_any_caps()) { - // journal. - mdr->ls = mdlog->get_current_segment(); - ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE); - - // rollback case - if (destdn->inode && destdn->inode->is_auth()) { - assert(destdn->is_remote()); - le->rollback.add_dir_context(destdn->dir); - le->rollback.add_dentry(destdn, true); - } - if (srcdn->is_auth() || - (srcdn->inode && srcdn->inode->is_auth())) { - le->rollback.add_dir_context(srcdn->dir); - le->rollback.add_dentry(srcdn, true); - } - - // commit case - bufferlist blah; - _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, straydn); - - mdlog->submit_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn)); - } else { - // don't journal. - dout(10) << "not journaling, i'm not auth for anything, and srci isn't open" << dendl; - - // prepare anyway; this may twiddle dir_auth - EMetaBlob blob; - bufferlist blah; - _rename_prepare(mdr, &blob, &blah, srcdn, destdn, straydn); - _logged_slave_rename(mdr, srcdn, destdn, straydn); - } -} - -void Server::_logged_slave_rename(MDRequest *mdr, - CDentry *srcdn, CDentry *destdn, CDentry *straydn) -{ - dout(10) << "_logged_slave_rename " << *mdr << dendl; - - // prepare ack - MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREPACK); - - // export srci? - if (srcdn->is_auth() && srcdn->is_primary()) { - list finished; - map exported_client_map; - bufferlist inodebl; - mdcache->migrator->encode_export_inode(srcdn->inode, inodebl, - exported_client_map); - mdcache->migrator->finish_export_inode(srcdn->inode, mdr->now, finished); - mds->queue_waiters(finished); // this includes SINGLEAUTH waiters. 
- ::_encode(exported_client_map, reply->inode_export); - reply->inode_export.claim_append(inodebl); - reply->inode_export_v = srcdn->inode->inode.version; - - // remove mdr auth pin - mdr->auth_unpin(srcdn->inode); - assert(!srcdn->inode->is_auth_pinned()); - - dout(10) << " exported srci " << *srcdn->inode << dendl; - } - - // apply - _rename_apply(mdr, srcdn, destdn, straydn); - - mds->send_message_mds(reply, mdr->slave_to_mds, MDS_PORT_SERVER); - - // bump popularity - //if (srcdn->is_auth()) - //mds->balancer->hit_dir(mdr->now, srcdn->get_dir(), META_POP_DWR); - if (destdn->inode && destdn->inode->is_auth()) - mds->balancer->hit_inode(mdr->now, destdn->inode, META_POP_IWR); - - // done. - delete mdr->slave_request; - mdr->slave_request = 0; -} - -void Server::_commit_slave_rename(MDRequest *mdr, int r, - CDentry *srcdn, CDentry *destdn, CDentry *straydn) -{ - dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl; - - // unfreeze+singleauth inode - // hmm, do i really need to delay this? 
- if (srcdn->is_auth() && destdn->is_primary()) { - dout(10) << " unfreezing exported inode " << *destdn->inode << dendl; - list finished; - - // singleauth - assert(destdn->inode->state_test(CInode::STATE_AMBIGUOUSAUTH)); - destdn->inode->state_clear(CInode::STATE_AMBIGUOUSAUTH); - destdn->inode->take_waiting(CInode::WAIT_SINGLEAUTH, finished); - - // unfreeze - assert(destdn->inode->is_frozen_inode() || - destdn->inode->is_freezing_inode()); - destdn->inode->unfreeze_inode(finished); - - mds->queue_waiters(finished); - } - - - ESlaveUpdate *le; - if (r == 0) { - // write a commit to the journal - le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT); - - } else { - // abort - le = new ESlaveUpdate(mdlog, "slave_rename_abort", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_ROLLBACK); - - // -- rollback in memory -- - - if (mdr->more()->was_link_merge) { - // link merge - CInode *in = destdn->inode; - in->inode.nlink++; - if (mdr->more()->destdn_was_remote_inode) { - destdn->dir->unlink_inode(destdn); - srcdn->dir->link_primary_inode(srcdn, in); - destdn->dir->link_remote_inode(destdn, in->ino(), MODE_TO_DT(in->inode.mode)); - } else { - srcdn->dir->link_remote_inode(srcdn, in->ino(), MODE_TO_DT(in->inode.mode)); - } - } else { - // normal - - // revert srcdn - if (destdn->is_remote()) { - srcdn->dir->link_remote_inode(srcdn, destdn->inode->ino(), MODE_TO_DT(destdn->inode->inode.mode)); - destdn->dir->unlink_inode(destdn); - } else { - // renamed a primary - CInode *in = destdn->inode; - destdn->dir->unlink_inode(destdn); - srcdn->dir->link_primary_inode(srcdn, in); - } - - // revert destdn - if (mdr->more()->destdn_was_remote_inode) { - destdn->dir->link_remote_inode(destdn, - mdr->more()->destdn_was_remote_inode->ino(), - MODE_TO_DT(mdr->more()->destdn_was_remote_inode->inode.mode)); - mdr->more()->destdn_was_remote_inode->inode.nlink++; - } else if (straydn && straydn->inode) { - CInode *in = straydn->inode; 
- straydn->dir->unlink_inode(straydn); - destdn->dir->link_primary_inode(destdn, in); - straydn->dir->remove_dentry(straydn); - } - } - - // FIXME: reverse srci export? - - dout(-10) << " srcdn back to " << *srcdn << dendl; - dout(-10) << " srci back to " << *srcdn->inode << dendl; - dout(-10) << " destdn back to " << *destdn << dendl; - if (destdn->inode) dout(-10) << " desti back to " << *destdn->inode << dendl; - - // *** WRITE ME *** - assert(0); - - } - - - - mdlog->submit_entry(le); -} - -void Server::handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *ack) -{ - dout(10) << "handle_slave_rename_prep_ack " << *mdr - << " witnessed by " << ack->get_source() - << " " << *ack << dendl; - int from = ack->get_source().num(); - - // note slave - mdr->more()->slaves.insert(from); - - // witnessed? or add extra witnesses? - assert(mdr->more()->witnessed.count(from) == 0); - if (ack->witnesses.empty()) { - mdr->more()->witnessed.insert(from); - } else { - dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl; - mdr->more()->extra_witnesses.swap(ack->witnesses); - mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me! - } - - // srci import? - if (ack->inode_export.length()) { - dout(10) << " got srci import" << dendl; - mdr->more()->inode_import.claim(ack->inode_export); - mdr->more()->inode_import_v = ack->inode_export_v; - } - - // remove from waiting list - assert(mdr->more()->waiting_on_slave.count(from)); - mdr->more()->waiting_on_slave.erase(from); - - if (mdr->more()->waiting_on_slave.empty()) - dispatch_client_request(mdr); // go again! 
- else - dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl; -} - - - - - -// =================================== -// TRUNCATE, FSYNC - -class C_MDS_truncate_purged : public Context { - MDS *mds; - MDRequest *mdr; - CInode *in; - version_t pv; - off_t size; - utime_t ctime; -public: - C_MDS_truncate_purged(MDS *m, MDRequest *r, CInode *i, version_t pdv, off_t sz, utime_t ct) : - mds(m), mdr(r), in(i), - pv(pdv), - size(sz), ctime(ct) { } - void finish(int r) { - assert(r == 0); - - // apply to cache - in->inode.size = size; - in->inode.ctime = ctime; - in->inode.mtime = ctime; - in->mark_dirty(pv, mdr->ls); - - // reply - mds->server->reply_request(mdr, 0); - } -}; - -class C_MDS_truncate_logged : public Context { - MDS *mds; - MDRequest *mdr; - CInode *in; - version_t pv; - off_t size; - utime_t ctime; -public: - C_MDS_truncate_logged(MDS *m, MDRequest *r, CInode *i, version_t pdv, off_t sz, utime_t ct) : - mds(m), mdr(r), in(i), - pv(pdv), - size(sz), ctime(ct) { } - void finish(int r) { - assert(r == 0); - - // purge - mds->mdcache->purge_inode(in, size, in->inode.size, mdr->ls); - mds->mdcache->wait_for_purge(in, size, - new C_MDS_truncate_purged(mds, mdr, in, pv, size, ctime)); - } -}; - -void Server::handle_client_truncate(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - CInode *cur = rdlock_path_pin_ref(mdr, true); - if (!cur) return; - - // check permissions? - - // xlock inode - set rdlocks = mdr->rdlocks; - set wrlocks = mdr->wrlocks; - set xlocks = mdr->xlocks; - xlocks.insert(&cur->filelock); - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - // already small enough? 
- if (cur->inode.size <= req->args.truncate.length) { - reply_request(mdr, 0); - return; - } - - // prepare - version_t pdv = cur->pre_dirty(); - utime_t ctime = g_clock.real_now(); - Context *fin = new C_MDS_truncate_logged(mds, mdr, cur, - pdv, req->args.truncate.length, ctime); - - // log + wait - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "truncate"); - le->metablob.add_client_req(mdr->reqid); - le->metablob.add_dir_context(cur->get_parent_dir()); - le->metablob.add_inode_truncate(cur->ino(), req->args.truncate.length, cur->inode.size); - inode_t *pi = le->metablob.add_dentry(cur->parent, true); - pi->mtime = ctime; - pi->ctime = ctime; - pi->version = pdv; - pi->size = req->args.truncate.length; - - - mdlog->submit_entry(le, fin); -} - - -// =========================== -// open, openc, close - -void Server::handle_client_open(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - int flags = req->args.open.flags; - int cmode = req->get_open_file_mode(); - bool need_auth = ((cmode != FILE_MODE_R && cmode != FILE_MODE_LAZY) || - (flags & O_TRUNC)); - dout(10) << "open flags = " << flags - << ", filemode = " << cmode - << ", need_auth = " << need_auth - << dendl; - - CInode *cur = rdlock_path_pin_ref(mdr, need_auth); - if (!cur) return; - - // regular file? - if (!cur->inode.is_file() && !cur->inode.is_dir()) { - dout(7) << "not a file or dir " << *cur << dendl; - reply_request(mdr, -EINVAL); // FIXME what error do we want? - return; - } - // can only open a dir rdonly, no flags. - if (cur->inode.is_dir() && (cmode != FILE_MODE_R || flags != 0)) { - reply_request(mdr, -EINVAL); - return; - } - - // hmm, check permissions or something. 
- - - // O_TRUNC - if (flags & O_TRUNC) { - assert(cur->is_auth()); - - // xlock file size - set rdlocks = mdr->rdlocks; - set wrlocks = mdr->wrlocks; - set xlocks = mdr->xlocks; - xlocks.insert(&cur->filelock); - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) - return; - - if (cur->inode.size > 0) { - handle_client_opent(mdr); - return; - } - } - - // do it - _do_open(mdr, cur); -} - -void Server::_do_open(MDRequest *mdr, CInode *cur) -{ - MClientRequest *req = mdr->client_request; - int cmode = req->get_open_file_mode(); - - // can we issue the caps they want? - version_t fdv = mds->locker->issue_file_data_version(cur); - Capability *cap = mds->locker->issue_new_caps(cur, cmode, req); - if (!cap) return; // can't issue (yet), so wait! - - dout(12) << "_do_open issuing caps " << cap_string(cap->pending()) - << " for " << req->get_source() - << " on " << *cur << dendl; - - // hit pop - mdr->now = g_clock.now(); - if (cmode == FILE_MODE_RW || - cmode == FILE_MODE_W) - mds->balancer->hit_inode(mdr->now, cur, META_POP_IWR); - else - mds->balancer->hit_inode(mdr->now, cur, META_POP_IRD, - mdr->client_request->get_client_inst().name.num()); - - // reply - MClientReply *reply = new MClientReply(req, 0); - reply->set_file_caps(cap->pending()); - reply->set_file_caps_seq(cap->get_last_seq()); - reply->set_file_data_version(fdv); - reply_request(mdr, reply, cur); - - // journal? 
- if (cur->last_open_journaled == 0) { - queue_journal_open(cur); - maybe_journal_opens(); - } - -} - -void Server::queue_journal_open(CInode *in) -{ - dout(10) << "queue_journal_open on " << *in << dendl; - - if (journal_open_queue.count(in) == 0) { - // pin so our pointer stays valid - in->get(CInode::PIN_BATCHOPENJOURNAL); - - // queue it up for a bit - journal_open_queue.insert(in); - } -} - - -void Server::journal_opens() -{ - dout(10) << "journal_opens " << journal_open_queue.size() << " inodes" << dendl; - if (journal_open_queue.empty()) return; - - EOpen *le = 0; - - // check queued inodes - LogSegment *ls = mdlog->get_current_segment(); - for (set::iterator p = journal_open_queue.begin(); - p != journal_open_queue.end(); - ++p) { - CInode *in = *p; - in->put(CInode::PIN_BATCHOPENJOURNAL); - if (in->is_any_caps()) { - if (!le) le = new EOpen(mdlog); - le->add_inode(in); - in->last_open_journaled = mds->mdlog->get_write_pos(); - ls->open_files.push_back(&in->xlist_open_file); - } - } - journal_open_queue.clear(); - - if (le) { - // journal - mdlog->submit_entry(le); - - // add waiters to journal entry - for (list::iterator p = journal_open_waiters.begin(); - p != journal_open_waiters.end(); - ++p) - mds->mdlog->wait_for_sync(*p); - journal_open_waiters.clear(); - } else { - // nothing worth journaling here, just kick the waiters. 
- mds->queue_waiters(journal_open_waiters); - } -} - - - - -class C_MDS_open_truncate_purged : public Context { - MDS *mds; - MDRequest *mdr; - CInode *in; - version_t pv; - utime_t ctime; -public: - C_MDS_open_truncate_purged(MDS *m, MDRequest *r, CInode *i, version_t pdv, utime_t ct) : - mds(m), mdr(r), in(i), - pv(pdv), - ctime(ct) { } - void finish(int r) { - assert(r == 0); - - // apply to cache - in->inode.size = 0; - in->inode.ctime = ctime; - in->inode.mtime = ctime; - in->mark_dirty(pv, mdr->ls); - - // do the open - mds->server->_do_open(mdr, in); - } -}; - -class C_MDS_open_truncate_logged : public Context { - MDS *mds; - MDRequest *mdr; - CInode *in; - version_t pv; - utime_t ctime; -public: - C_MDS_open_truncate_logged(MDS *m, MDRequest *r, CInode *i, version_t pdv, utime_t ct) : - mds(m), mdr(r), in(i), - pv(pdv), - ctime(ct) { } - void finish(int r) { - assert(r == 0); - - // hit pop - mds->balancer->hit_inode(mdr->now, in, META_POP_IWR); - - // purge also... - mds->mdcache->purge_inode(in, 0, in->inode.size, mdr->ls); - mds->mdcache->wait_for_purge(in, 0, - new C_MDS_open_truncate_purged(mds, mdr, in, pv, ctime)); - } -}; - - -void Server::handle_client_opent(MDRequest *mdr) -{ - CInode *cur = mdr->ref; - assert(cur); - - // prepare - version_t pdv = cur->pre_dirty(); - utime_t ctime = g_clock.real_now(); - Context *fin = new C_MDS_open_truncate_logged(mds, mdr, cur, - pdv, ctime); - - // log + wait - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "open_truncate"); - le->metablob.add_client_req(mdr->reqid); - le->metablob.add_dir_context(cur->get_parent_dir()); - le->metablob.add_inode_truncate(cur->ino(), 0, cur->inode.size); - inode_t *pi = le->metablob.add_dentry(cur->parent, true); - pi->mtime = ctime; - pi->ctime = ctime; - pi->version = pdv; - pi->size = 0; - - mdlog->submit_entry(le, fin); -} - - - -class C_MDS_openc_finish : public Context { - MDS *mds; - MDRequest *mdr; - CDentry *dn; - CInode *newi; - version_t 
pv; -public: - C_MDS_openc_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ni) : - mds(m), mdr(r), dn(d), newi(ni), - pv(d->get_projected_version()) {} - void finish(int r) { - assert(r == 0); - - // link the inode - dn->get_dir()->link_primary_inode(dn, newi); - - // dirty inode, dn, dir - newi->mark_dirty(pv, mdr->ls); - - // downgrade xlock to rdlock - //mds->locker->dentry_xlock_downgrade_to_rdlock(dn, mdr); - - // set/pin ref inode for open() - mdr->ref = newi; - mdr->pin(newi); - - // ok, do the open. - mds->server->handle_client_open(mdr); - } -}; - - -void Server::handle_client_openc(MDRequest *mdr) -{ - MClientRequest *req = mdr->client_request; - - dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl; - - bool excl = (req->args.open.flags & O_EXCL); - CDentry *dn = rdlock_path_xlock_dentry(mdr, !excl, false); - if (!dn) return; - - if (!dn->is_null()) { - // it existed. - if (req->args.open.flags & O_EXCL) { - dout(10) << "O_EXCL, target exists, failing with -EEXIST" << dendl; - reply_request(mdr, -EEXIST, dn->get_dir()->get_inode()); - return; - } - - // pass to regular open handler. - handle_client_open(mdr); - return; - } - - // created null dn. - - // create inode. - mdr->now = g_clock.real_now(); - CInode *in = prepare_new_inode(mdr, dn->dir); - assert(in); - - // it's a file. - in->inode.mode = req->args.open.mode; - in->inode.mode |= S_IFREG; - in->inode.version = dn->pre_dirty() - 1; - - // prepare finisher - C_MDS_openc_finish *fin = new C_MDS_openc_finish(mds, mdr, dn, in); - mdr->ls = mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mdlog, "openc"); - le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_allocated_ino(in->ino(), mds->idalloc->get_version()); - le->metablob.add_dir_context(dn->dir); - le->metablob.add_primary_dentry(dn, true, in, &in->inode); - - // log + wait - mdlog->submit_entry(le, fin); - - /* - FIXME. this needs to be rewritten when the write capability stuff starts - getting journaled. 
- */ -} - - - - - - - - - - - - - - diff --git a/branches/sage/mds/mds/SimpleLock.h b/branches/sage/mds/mds/SimpleLock.h deleted file mode 100644 index e785e2c36d50c..0000000000000 --- a/branches/sage/mds/mds/SimpleLock.h +++ /dev/null @@ -1,301 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __SIMPLELOCK_H -#define __SIMPLELOCK_H - -// -- lock types -- -// NOTE: this also defines the lock ordering! -#define LOCK_OTYPE_DN 1 - -#define LOCK_OTYPE_IVERSION 2 -#define LOCK_OTYPE_IFILE 3 -#define LOCK_OTYPE_IAUTH 4 -#define LOCK_OTYPE_ILINK 5 -#define LOCK_OTYPE_IDIRFRAGTREE 6 -#define LOCK_OTYPE_IDIR 7 - -//#define LOCK_OTYPE_DIR 7 // not used - -inline const char *get_lock_type_name(int t) { - switch (t) { - case LOCK_OTYPE_DN: return "dn"; - case LOCK_OTYPE_IVERSION: return "iversion"; - case LOCK_OTYPE_IFILE: return "ifile"; - case LOCK_OTYPE_IAUTH: return "iauth"; - case LOCK_OTYPE_ILINK: return "ilink"; - case LOCK_OTYPE_IDIRFRAGTREE: return "idft"; - case LOCK_OTYPE_IDIR: return "idir"; - default: assert(0); return 0; - } -} - -// -- lock states -- -// sync <-> lock -#define LOCK_UNDEF 0 -// auth rep -#define LOCK_SYNC 1 // AR R . R . -#define LOCK_LOCK 2 // AR R W . . -#define LOCK_GLOCKR -3 // AR R . . . 
-#define LOCK_REMOTEXLOCK -50 // on NON-auth - -inline const char *get_simplelock_state_name(int n) { - switch (n) { - case LOCK_UNDEF: return "UNDEF"; - case LOCK_SYNC: return "sync"; - case LOCK_LOCK: return "lock"; - case LOCK_GLOCKR: return "glockr"; - case LOCK_REMOTEXLOCK: return "remote_xlock"; - default: assert(0); return 0; - } -} - -class MDRequest; - -class SimpleLock { -public: - static const int WAIT_RD = (1<<0); // to read - static const int WAIT_WR = (1<<1); // to write - static const int WAIT_XLOCK = (1<<2); // to xlock (** dup) - static const int WAIT_STABLE = (1<<2); // for a stable state - static const int WAIT_REMOTEXLOCK = (1<<3); // for a remote xlock - static const int WAIT_BITS = 4; - static const int WAIT_ALL = ((1< gather_set; // auth - - // local state - int num_rdlock; - MDRequest *xlock_by; - -public: - SimpleLock(MDSCacheObject *o, int t, int wo) : - parent(o), type(t), wait_offset(wo), - state(LOCK_SYNC), - num_rdlock(0), xlock_by(0) { } - virtual ~SimpleLock() {} - - // parent - MDSCacheObject *get_parent() { return parent; } - int get_type() { return type; } - - struct ptr_lt { - bool operator()(const SimpleLock* l, const SimpleLock* r) const { - // first sort by object type (dn < inode) - if ((l->type>LOCK_OTYPE_DN) < (r->type>LOCK_OTYPE_DN)) return true; - if ((l->type>LOCK_OTYPE_DN) == (r->type>LOCK_OTYPE_DN)) { - // then sort by object - if (l->parent->is_lt(r->parent)) return true; - if (l->parent == r->parent) { - // then sort by (inode) lock type - if (l->type < r->type) return true; - } - } - return false; - } - }; - - void decode_locked_state(bufferlist& bl) { - parent->decode_lock_state(type, bl); - } - void encode_locked_state(bufferlist& bl) { - parent->encode_lock_state(type, bl); - } - void finish_waiters(int mask, int r=0) { - parent->finish_waiting(mask << wait_offset, r); - } - void take_waiting(int mask, list& ls) { - parent->take_waiting(mask << wait_offset, ls); - } - void add_waiter(int mask, Context *c) { - 
parent->add_waiter(mask << wait_offset, c); - } - bool is_waiter_for(int mask) { - return parent->is_waiter_for(mask << wait_offset); - } - - - - // state - int get_state() { return state; } - int set_state(int s) { - state = s; - assert(!is_stable() || gather_set.size() == 0); // gather should be empty in stable states. - return s; - }; - bool is_stable() { - return state >= 0; - } - - - // gather set - const set& get_gather_set() { return gather_set; } - void init_gather() { - for (map::const_iterator p = parent->replicas_begin(); - p != parent->replicas_end(); - ++p) - gather_set.insert(p->first); - } - bool is_gathering() { return !gather_set.empty(); } - bool is_gathering(int i) { - return gather_set.count(i); - } - void clear_gather() { - gather_set.clear(); - } - void remove_gather(int i) { - gather_set.erase(i); - } - - // ref counting - bool is_rdlocked() { return num_rdlock > 0; } - int get_rdlock() { - if (!num_rdlock) parent->get(MDSCacheObject::PIN_LOCK); - return ++num_rdlock; - } - int put_rdlock() { - assert(num_rdlock>0); - --num_rdlock; - if (num_rdlock == 0) parent->put(MDSCacheObject::PIN_LOCK); - return num_rdlock; - } - int get_num_rdlocks() { return num_rdlock; } - - void get_xlock(MDRequest *who) { - assert(xlock_by == 0); - parent->get(MDSCacheObject::PIN_LOCK); - xlock_by = who; - } - void put_xlock() { - assert(xlock_by); - parent->put(MDSCacheObject::PIN_LOCK); - xlock_by = 0; - } - bool is_xlocked() { return xlock_by ? 
true:false; } - bool is_xlocked_by_other(MDRequest *mdr) { - return is_xlocked() && xlock_by != mdr; - } - MDRequest *get_xlocked_by() { return xlock_by; } - bool is_used() { - return is_xlocked() || is_rdlocked(); - } - - // encode/decode - void _encode(bufferlist& bl) { - ::_encode_simple(state, bl); - ::_encode_simple(gather_set, bl); - } - void _decode(bufferlist::iterator& p) { - ::_decode_simple(state, p); - ::_decode_simple(gather_set, p); - } - - - // simplelock specifics - int get_replica_state() { - switch (state) { - case LOCK_LOCK: - case LOCK_GLOCKR: - return LOCK_LOCK; - case LOCK_SYNC: - return LOCK_SYNC; - default: - assert(0); - } - return 0; - } - void export_twiddle() { - clear_gather(); - state = get_replica_state(); - } - - /** replicate_relax - * called on first replica creation. - */ - void replicate_relax() { - assert(parent->is_auth()); - assert(!parent->is_replicated()); - if (state == LOCK_LOCK && !is_used()) - state = LOCK_SYNC; - } - bool remove_replica(int from) { - if (is_gathering(from)) { - remove_gather(from); - if (!is_gathering()) - return true; - } - return false; - } - bool do_import(int from, int to) { - if (!is_stable()) { - remove_gather(from); - remove_gather(to); - if (!is_gathering()) - return true; - } - if (!is_stable() && !is_gathering()) - return true; - return false; - } - - bool can_rdlock(MDRequest *mdr) { - //if (state == LOCK_LOCK && mdr && xlock_by == mdr) return true; // xlocked by me. (actually, is this right?) - //if (state == LOCK_LOCK && !xlock_by && parent->is_auth()) return true; - return (state == LOCK_SYNC); - } - bool can_xlock(MDRequest *mdr) { - if (mdr && xlock_by == mdr) { - assert(state == LOCK_LOCK); - return true; // auth or replica! xlocked by me. 
- } - if (state == LOCK_LOCK && parent->is_auth() && !xlock_by) return true; - return false; - } - bool can_xlock_soon() { - if (parent->is_auth()) - return (state == LOCK_GLOCKR); - else - return false; - } - - virtual void print(ostream& out) { - out << "("; - out << get_lock_type_name(get_type()) << " "; - out << get_simplelock_state_name(get_state()); - if (!get_gather_set().empty()) out << " g=" << get_gather_set(); - if (is_rdlocked()) - out << " r=" << get_num_rdlocks(); - if (is_xlocked()) - out << " x=" << get_xlocked_by(); - out << ")"; - } -}; - -inline ostream& operator<<(ostream& out, SimpleLock& l) -{ - l.print(out); - return out; -} - - -#endif diff --git a/branches/sage/mds/mds/events/EAnchor.h b/branches/sage/mds/mds/events/EAnchor.h deleted file mode 100644 index 97a21a36734be..0000000000000 --- a/branches/sage/mds/mds/events/EAnchor.h +++ /dev/null @@ -1,80 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_EANCHOR_H -#define __MDS_EANCHOR_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../LogEvent.h" -#include "../Anchor.h" - -class EAnchor : public LogEvent { -protected: - int op; - inodeno_t ino; - version_t atid; - vector trace; - version_t version; // anchor table version - int reqmds; - - public: - EAnchor() : LogEvent(EVENT_ANCHOR) { } - EAnchor(int o, inodeno_t i, version_t v, int rm) : - LogEvent(EVENT_ANCHOR), - op(o), ino(i), atid(0), version(v), reqmds(rm) { } - EAnchor(int o, version_t a, version_t v) : - LogEvent(EVENT_ANCHOR), - op(o), atid(a), version(v), reqmds(-1) { } - - void set_trace(vector& t) { trace = t; } - vector& get_trace() { return trace; } - - void encode_payload(bufferlist& bl) { - bl.append((char*)&op, sizeof(op)); - bl.append((char*)&ino, sizeof(ino)); - bl.append((char*)&atid, sizeof(atid)); - ::_encode(trace, bl); - bl.append((char*)&version, sizeof(version)); - bl.append((char*)&reqmds, sizeof(reqmds)); - } - void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(op), (char*)&op); - off += sizeof(op); - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - bl.copy(off, sizeof(atid), (char*)&atid); - off += sizeof(atid); - ::_decode(trace, bl, off); - bl.copy(off, sizeof(version), (char*)&version); - off += sizeof(version); - bl.copy(off, sizeof(reqmds), (char*)&reqmds); - off += sizeof(reqmds); - } - - void print(ostream& out) { - out << "EAnchor " << get_anchor_opname(op); - if (ino) out << " " << ino; - if (atid) out << " atid " << atid; - if (version) out << " v " << version; - if (reqmds >= 0) out << " by mds" << reqmds; - } - - void update_segment(); - void replay(MDS *mds); -}; - -#endif diff --git a/branches/sage/mds/mds/events/EAnchorClient.h b/branches/sage/mds/mds/events/EAnchorClient.h deleted file mode 100644 index 21f78369cae72..0000000000000 --- a/branches/sage/mds/mds/events/EAnchorClient.h +++ /dev/null @@ -1,56 +0,0 @@ -// -*- mode:C++; 
tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_EANCHORCLIENT_H -#define __MDS_EANCHORCLIENT_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../LogEvent.h" -#include "../Anchor.h" - -class EAnchorClient : public LogEvent { -protected: - int op; - version_t atid; - - public: - EAnchorClient() : LogEvent(EVENT_ANCHORCLIENT) { } - EAnchorClient(int o, version_t at) : - LogEvent(EVENT_ANCHORCLIENT), - op(o), atid(at) { } - - void encode_payload(bufferlist& bl) { - bl.append((char*)&op, sizeof(op)); - bl.append((char*)&atid, sizeof(atid)); - } - void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(op), (char*)&op); - off += sizeof(op); - bl.copy(off, sizeof(atid), (char*)&atid); - off += sizeof(atid); - } - - void print(ostream& out) { - out << "EAnchorClient " << get_anchor_opname(op); - if (atid) out << " atid " << atid; - } - - void replay(MDS *mds); - -}; - -#endif diff --git a/branches/sage/mds/mds/events/EExport.h b/branches/sage/mds/mds/events/EExport.h deleted file mode 100644 index 89534f12b51bf..0000000000000 --- a/branches/sage/mds/mds/events/EExport.h +++ /dev/null @@ -1,63 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __EEXPORT_H -#define __EEXPORT_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../MDS.h" - -#include "EMetaBlob.h" - -class EExport : public LogEvent { -public: - EMetaBlob metablob; // exported dir -protected: - dirfrag_t base; - set bounds; - -public: - EExport() : LogEvent(EVENT_EXPORT) { } - EExport(MDLog *mdlog, CDir *dir) : - LogEvent(EVENT_EXPORT), metablob(mdlog), - base(dir->dirfrag()) { } - - set &get_bounds() { return bounds; } - - void print(ostream& out) { - out << "EExport " << base << " " << metablob; - } - - virtual void encode_payload(bufferlist& bl) { - metablob._encode(bl); - bl.append((char*)&base, sizeof(base)); - ::_encode(bounds, bl); - } - void decode_payload(bufferlist& bl, int& off) { - metablob._decode(bl, off); - bl.copy(off, sizeof(base), (char*)&base); - off += sizeof(base); - ::_decode(bounds, bl, off); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/branches/sage/mds/mds/events/EFragment.h b/branches/sage/mds/mds/events/EFragment.h deleted file mode 100644 index 64969111193c0..0000000000000 --- a/branches/sage/mds/mds/events/EFragment.h +++ /dev/null @@ -1,54 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_EFRAGMENT_H -#define __MDS_EFRAGMENT_H - -#include "../LogEvent.h" -#include "EMetaBlob.h" - -class EFragment : public LogEvent { -public: - EMetaBlob metablob; - inodeno_t ino; - frag_t basefrag; - int bits; // positive for split (from basefrag), negative for merge (to basefrag) - - EFragment() : LogEvent(EVENT_FRAGMENT) { } - EFragment(MDLog *mdlog, inodeno_t i, frag_t bf, int b) : - LogEvent(EVENT_FRAGMENT), metablob(mdlog), - ino(i), basefrag(bf), bits(b) { } - void print(ostream& out) { - out << "EFragment " << ino << " " << basefrag << " by " << bits << " " << metablob; - } - - void encode_payload(bufferlist& bl) { - ::_encode(ino, bl); - ::_encode(basefrag, bl); - ::_encode(bits, bl); - metablob._encode(bl); - } - void decode_payload(bufferlist& bl, int& off) { - ::_decode(ino, bl, off); - ::_decode(basefrag, bl, off); - ::_decode(bits, bl, off); - metablob._decode(bl, off); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); -}; - -#endif diff --git a/branches/sage/mds/mds/events/EImportFinish.h b/branches/sage/mds/mds/events/EImportFinish.h deleted file mode 100644 index 0ee6d71ffdc13..0000000000000 --- a/branches/sage/mds/mds/events/EImportFinish.h +++ /dev/null @@ -1,60 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __EIMPORTFINISH_H -#define __EIMPORTFINISH_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../MDS.h" - -class EImportFinish : public LogEvent { - protected: - dirfrag_t base; // imported dir - bool success; - - public: - EImportFinish(CDir *dir, bool s) : LogEvent(EVENT_IMPORTFINISH), - base(dir->dirfrag()), - success(s) { } - EImportFinish() : LogEvent(EVENT_IMPORTFINISH) { } - - void print(ostream& out) { - out << "EImportFinish " << base; - if (success) - out << " success"; - else - out << " failed"; - } - - virtual void encode_payload(bufferlist& bl) { - bl.append((char*)&base, sizeof(base)); - bl.append((char*)&success, sizeof(success)); - } - void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(base), (char*)&base); - off += sizeof(base); - bl.copy(off, sizeof(success), (char*)&success); - off += sizeof(success); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/branches/sage/mds/mds/events/EMetaBlob.h b/branches/sage/mds/mds/events/EMetaBlob.h deleted file mode 100644 index 767521523f9fe..0000000000000 --- a/branches/sage/mds/mds/events/EMetaBlob.h +++ /dev/null @@ -1,501 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_EMETABLOB_H -#define __MDS_EMETABLOB_H - -#include -#include -using std::string; - -#include "../CInode.h" -#include "../CDir.h" -#include "../CDentry.h" - -#include "include/triple.h" - -class MDS; -class MDLog; -class LogSegment; - -/* - * a bunch of metadata in the journal - */ - -/* notes: - * - * - make sure you adjust the inode.version for any modified inode you - * journal. CDir and CDentry maintain a projected_version, but CInode - * doesn't, since the journaled inode usually has to be modifed - * manually anyway (to delay the change in the MDS's cache until after - * it is journaled). - * - */ - - -class EMetaBlob { - - /* fullbit - a regular dentry + inode - */ - struct fullbit { - string dn; // dentry - version_t dnv; - inode_t inode; // if it's not - fragtree_t dirfragtree; - string symlink; - bool dirty; - - fullbit(const string& d, version_t v, inode_t& i, fragtree_t dft, bool dr) : - dn(d), dnv(v), inode(i), dirfragtree(dft), dirty(dr) { } - fullbit(const string& d, version_t v, inode_t& i, fragtree_t dft, string& sym, bool dr) : - dn(d), dnv(v), inode(i), dirfragtree(dft), symlink(sym), dirty(dr) { } - fullbit(bufferlist& bl, int& off) { _decode(bl, off); } - void _encode(bufferlist& bl) { - ::_encode(dn, bl); - ::_encode(dnv, bl); - ::_encode(inode, bl); - dirfragtree._encode(bl); - if (inode.is_symlink()) - ::_encode(symlink, bl); - ::_encode(dirty, bl); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(dn, bl, off); - ::_decode(dnv, bl, off); - ::_decode(inode, bl, off); - dirfragtree._decode(bl, off); - if (inode.is_symlink()) - ::_decode(symlink, bl, off); - ::_decode(dirty, bl, off); - } - void print(ostream& out) { - out << " fullbit dn " << dn << " dnv " << dnv - << " inode " << inode.ino - << " dirty=" << dirty << std::endl; - } - }; - - /* remotebit - a dentry + remote inode link (i.e. 
just an ino) - */ - struct remotebit { - string dn; - version_t dnv; - inodeno_t ino; - unsigned char d_type; - bool dirty; - - remotebit(const string& d, version_t v, inodeno_t i, unsigned char dt, bool dr) : - dn(d), dnv(v), ino(i), d_type(dt), dirty(dr) { } - remotebit(bufferlist& bl, int& off) { _decode(bl, off); } - void _encode(bufferlist& bl) { - ::_encode(dn, bl); - ::_encode(dnv, bl); - ::_encode(ino, bl); - ::_encode(d_type, bl); - ::_encode(dirty, bl); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(dn, bl, off); - ::_decode(dnv, bl, off); - ::_decode(ino, bl, off); - ::_decode(d_type, bl, off); - ::_decode(dirty, bl, off); - } - void print(ostream& out) { - out << " remotebit dn " << dn << " dnv " << dnv - << " ino " << ino - << " dirty=" << dirty << std::endl; - } - }; - - /* - * nullbit - a null dentry - */ - struct nullbit { - string dn; - version_t dnv; - bool dirty; - nullbit(const string& d, version_t v, bool dr) : dn(d), dnv(v), dirty(dr) { } - nullbit(bufferlist& bl, int& off) { _decode(bl, off); } - void _encode(bufferlist& bl) { - ::_encode(dn, bl); - ::_encode(dnv, bl); - ::_encode(dirty, bl); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(dn, bl, off); - ::_decode(dnv, bl, off); - ::_decode(dirty, bl, off); - } - void print(ostream& out) { - out << " nullbit dn " << dn << " dnv " << dnv - << " dirty=" << dirty << std::endl; - } - }; - - - /* dirlump - contains metadata for any dir we have contents for. - */ -public: - struct dirlump { - static const int STATE_COMPLETE = (1<<1); - static const int STATE_DIRTY = (1<<2); // dirty due to THIS journal item, that is! 
- - version_t dirv; - int state; - int nfull, nremote, nnull; - - private: - bufferlist dnbl; - bool dn_decoded; - list dfull; - list dremote; - list dnull; - - public: - dirlump() : dirv(0), state(0), nfull(0), nremote(0), nnull(0), dn_decoded(true) { } - - bool is_complete() { return state & STATE_COMPLETE; } - void mark_complete() { state |= STATE_COMPLETE; } - bool is_dirty() { return state & STATE_DIRTY; } - void mark_dirty() { state |= STATE_DIRTY; } - - list &get_dfull() { return dfull; } - list &get_dremote() { return dremote; } - list &get_dnull() { return dnull; } - - void print(dirfrag_t dirfrag, ostream& out) { - out << "dirlump " << dirfrag << " dirv " << dirv - << " state " << state - << " num " << nfull << "/" << nremote << "/" << nnull - << std::endl; - _decode_bits(); - for (list::iterator p = dfull.begin(); p != dfull.end(); ++p) - p->print(out); - for (list::iterator p = dremote.begin(); p != dremote.end(); ++p) - p->print(out); - for (list::iterator p = dnull.begin(); p != dnull.end(); ++p) - p->print(out); - } - - void _encode_bits() { - for (list::iterator p = dfull.begin(); p != dfull.end(); ++p) - p->_encode(dnbl); - for (list::iterator p = dremote.begin(); p != dremote.end(); ++p) - p->_encode(dnbl); - for (list::iterator p = dnull.begin(); p != dnull.end(); ++p) - p->_encode(dnbl); - } - void _decode_bits() { - if (dn_decoded) return; - int off = 0; - for (int i=0; i lump_order; - map lump_map; - - // anchor transactions included in this update. - list atids; - - // inode dirlocks (scatterlocks) i've touched. - map dirty_inode_mtimes; - - // ino's i've allocated - list allocated_inos; - version_t alloc_tablev; - - // inodes i've destroyed. 
- list< triple > truncated_inodes; - - // idempotent op(s) - list client_reqs; - - public: - // soft state - off_t last_subtree_map; - off_t my_offset; - - // for replay, in certain cases - LogSegment *_segment; - - EMetaBlob() : last_subtree_map(0), my_offset(0), _segment(0) { } - EMetaBlob(MDLog *mdl); // defined in journal.cc - - void print(ostream& out) { - for (list::iterator p = lump_order.begin(); - p != lump_order.end(); - ++p) { - lump_map[*p].print(*p, out); - } - } - - void add_client_req(metareqid_t r) { - client_reqs.push_back(r); - } - - void add_anchor_transaction(version_t atid) { - atids.push_back(atid); - } - - void add_dirtied_inode_mtime(inodeno_t ino, utime_t ctime) { - dirty_inode_mtimes[ino] = ctime; - } - - void add_allocated_ino(inodeno_t ino, version_t tablev) { - allocated_inos.push_back(ino); - alloc_tablev = tablev; - } - - void add_inode_truncate(inodeno_t ino, off_t newsize, off_t oldsize) { - truncated_inodes.push_back(triple(ino, newsize, oldsize)); - } - - void add_null_dentry(CDentry *dn, bool dirty) { - add_null_dentry(add_dir(dn->get_dir(), false), dn, dirty); - } - void add_null_dentry(dirlump& lump, CDentry *dn, bool dirty) { - // add the dir - lump.nnull++; - if (dirty) - lump.get_dnull().push_front(nullbit(dn->get_name(), - dn->get_projected_version(), - dirty)); - else - lump.get_dnull().push_back(nullbit(dn->get_name(), - dn->get_projected_version(), - dirty)); - } - - void add_remote_dentry(CDentry *dn, bool dirty, inodeno_t rino=0) { - add_remote_dentry(add_dir(dn->get_dir(), false), - dn, dirty, rino); - } - void add_remote_dentry(dirlump& lump, CDentry *dn, bool dirty, - inodeno_t rino=0, unsigned char rdt=0) { - if (!rino) { - rino = dn->get_remote_ino(); - rdt = dn->get_remote_d_type(); - } - lump.nremote++; - if (dirty) - lump.get_dremote().push_front(remotebit(dn->get_name(), - dn->get_projected_version(), - rino, rdt, - dirty)); - else - lump.get_dremote().push_back(remotebit(dn->get_name(), - 
dn->get_projected_version(), - rino, rdt, - dirty)); - } - - // return remote pointer to to-be-journaled inode - inode_t *add_primary_dentry(CDentry *dn, bool dirty, - CInode *in=0, inode_t *pi=0, fragtree_t *pdft=0) { - return add_primary_dentry(add_dir(dn->get_dir(), false), - dn, dirty, in, pi, pdft); - } - inode_t *add_primary_dentry(dirlump& lump, CDentry *dn, bool dirty, - CInode *in=0, inode_t *pi=0, fragtree_t *pdft=0) { - if (!in) - in = dn->get_inode(); - - // make note of where this inode was last journaled - in->last_journaled = my_offset; - //cout << "journaling " << in->inode.ino << " at " << my_offset << std::endl; - - lump.nfull++; - if (dirty) { - lump.get_dfull().push_front(fullbit(dn->get_name(), - dn->get_projected_version(), - in->inode, in->dirfragtree, in->symlink, - dirty)); - if (pi) lump.get_dfull().front().inode = *pi; - return &lump.get_dfull().front().inode; - } else { - lump.get_dfull().push_back(fullbit(dn->get_name(), - dn->get_projected_version(), - in->inode, in->dirfragtree, in->symlink, - dirty)); - if (pi) lump.get_dfull().back().inode = *pi; - return &lump.get_dfull().back().inode; - } - } - - // convenience: primary or remote? figure it out. 
- inode_t *add_dentry(CDentry *dn, bool dirty) { - dirlump& lump = add_dir(dn->get_dir(), false); - return add_dentry(lump, dn, dirty); - } - inode_t *add_dentry(dirlump& lump, CDentry *dn, bool dirty) { - // primary or remote - if (dn->is_remote()) { - add_remote_dentry(dn, dirty); - return 0; - } else if (dn->is_null()) { - add_null_dentry(dn, dirty); - return 0; - } - assert(dn->is_primary()); - return add_primary_dentry(dn, dirty); - } - - - dirlump& add_dir(CDir *dir, bool dirty, bool complete=false) { - return add_dir(dir->dirfrag(), dir->get_projected_version(), dirty, complete); - } - dirlump& add_dir(dirfrag_t df, version_t pv, bool dirty, bool complete=false) { - if (lump_map.count(df) == 0) { - lump_order.push_back(df); - lump_map[df].dirv = pv; - } - dirlump& l = lump_map[df]; - if (complete) l.mark_complete(); - if (dirty) l.mark_dirty(); - return l; - } - - static const int TO_AUTH_SUBTREE_ROOT = 0; // default. - static const int TO_ROOT = 1; - - void add_dir_context(CDir *dir, int mode = TO_AUTH_SUBTREE_ROOT) { - // already have this dir? (we must always add in order) - if (lump_map.count(dir->dirfrag())) - return; - - if (mode == TO_AUTH_SUBTREE_ROOT) { - //return; // hack: for comparison purposes.. what if NO context? - - // subtree root? - if (dir->is_subtree_root() && dir->is_auth()) - return; - - // was the inode journaled since the last subtree_map? - if (//false && // for benchmarking - last_subtree_map && - dir->inode->last_journaled >= last_subtree_map) { - /* - cout << " inode " << dir->inode->inode.ino - << " last journaled at " << dir->inode->last_journaled - << " and last_subtree_map is " << last_subtree_map - << std::endl; - */ - return; - } - } - - // stop at root/stray - CInode *diri = dir->get_inode(); - if (!diri->get_parent_dn()) - return; - - // journaled? 
- - // add parent dn - CDentry *parent = diri->get_parent_dn(); - add_dir_context(parent->get_dir(), mode); - add_dentry(parent, false); - } - - - // encoding - - void _encode(bufferlist& bl) { - int32_t n = lump_map.size(); - ::_encode(n, bl); - for (list::iterator i = lump_order.begin(); - i != lump_order.end(); - ++i) { - dirfrag_t dirfrag = *i; - ::_encode(dirfrag, bl); - lump_map[*i]._encode(bl); - } - ::_encode(atids, bl); - ::_encode(dirty_inode_mtimes, bl); - ::_encode(allocated_inos, bl); - if (!allocated_inos.empty()) - ::_encode(alloc_tablev, bl); - ::_encode(truncated_inodes, bl); - ::_encode(client_reqs, bl); - } - void _decode(bufferlist& bl, int& off) { - int32_t n; - ::_decode(n, bl, off); - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_EOPEN_H -#define __MDS_EOPEN_H - -#include "../LogEvent.h" -#include "EMetaBlob.h" - -class EOpen : public LogEvent { -public: - EMetaBlob metablob; - list inos; - - EOpen() : LogEvent(EVENT_OPEN) { } - EOpen(MDLog *mdlog) : - LogEvent(EVENT_OPEN), metablob(mdlog) { } - - void print(ostream& out) { - out << "EOpen " << metablob; - } - - void add_inode(CInode *in) { - inos.push_back(in->ino()); - metablob.add_dir_context(in->get_parent_dn()->get_dir()); - metablob.add_primary_dentry(in->get_parent_dn(), false); - } - - void encode_payload(bufferlist& bl) { - ::_encode(inos, bl); - metablob._encode(bl); - } - void decode_payload(bufferlist& bl, int& off) { - ::_decode(inos, bl, off); - metablob._decode(bl, off); - } - - void update_segment(); - void replay(MDS *mds); -}; - -#endif diff --git a/branches/sage/mds/mds/events/EPurgeFinish.h b/branches/sage/mds/mds/events/EPurgeFinish.h deleted file mode 100644 index dff0101b7699a..0000000000000 --- a/branches/sage/mds/mds/events/EPurgeFinish.h +++ /dev/null @@ 
-1,54 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __EPURGE_H -#define __EPURGE_H - -#include -#include "config.h" -#include "include/types.h" - -class EPurgeFinish : public LogEvent { - protected: - inodeno_t ino; - off_t newsize, oldsize; - - public: - EPurgeFinish(inodeno_t i, off_t ns, off_t os) : - LogEvent(EVENT_PURGEFINISH), - ino(i), newsize(ns), oldsize(os) { } - EPurgeFinish() : LogEvent(EVENT_PURGEFINISH) { } - - void print(ostream& out) { - out << "purgefinish " << ino << " " << oldsize << " ->" << newsize; - } - - virtual void encode_payload(bufferlist& bl) { - bl.append((char*)&ino, sizeof(ino)); - bl.append((char*)&newsize, sizeof(newsize)); - bl.append((char*)&oldsize, sizeof(oldsize)); - } - void decode_payload(bufferlist& bl, int& off) { - ::_decode(ino, bl, off); - ::_decode(newsize, bl, off); - ::_decode(oldsize, bl, off); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void update_segment(); - void replay(MDS *mds); -}; - -#endif diff --git a/branches/sage/mds/mds/events/ESlaveUpdate.h b/branches/sage/mds/mds/events/ESlaveUpdate.h deleted file mode 100644 index 54eaef9c6a296..0000000000000 --- a/branches/sage/mds/mds/events/ESlaveUpdate.h +++ /dev/null @@ -1,79 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free 
Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_ESLAVEUPDATE_H -#define __MDS_ESLAVEUPDATE_H - -#include "../LogEvent.h" -#include "EMetaBlob.h" - -class ESlaveUpdate : public LogEvent { -public: - const static int OP_PREPARE = 1; - const static int OP_COMMIT = 2; - const static int OP_ROLLBACK = 3; - - /* - * we journal a rollback metablob that contains the unmodified metadata - * too, because we may be updating previously dirty metadata, which - * will allow old log segments to be trimmed. if we end of rolling back, - * those updates could be lost.. so we re-journal the unmodified metadata, - * and replay will apply _either_ commit or rollback. - */ - EMetaBlob commit, rollback; - string type; - metareqid_t reqid; - int master; - int op; // prepare, commit, abort - - ESlaveUpdate() : LogEvent(EVENT_SLAVEUPDATE) { } - ESlaveUpdate(MDLog *mdlog, const char *s, metareqid_t ri, int mastermds, int o) : - LogEvent(EVENT_SLAVEUPDATE), commit(mdlog), rollback(mdlog), - type(s), - reqid(ri), - master(mastermds), - op(o) { } - - void print(ostream& out) { - if (type.length()) - out << type << " "; - out << " " << op; - out << " " << reqid; - out << " for mds" << master; - out << commit << " " << rollback; - } - - void encode_payload(bufferlist& bl) { - ::_encode(type, bl); - ::_encode(reqid, bl); - ::_encode(master, bl); - ::_encode(op, bl); - commit._encode(bl); - rollback._encode(bl); - } - void decode_payload(bufferlist& bl, int& off) { - ::_decode(type, bl, off); - ::_decode(reqid, bl, off); - ::_decode(master, bl, off); - ::_decode(op, bl, off); - commit._decode(bl, off); - rollback._decode(bl, off); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); -}; - -#endif diff --git a/branches/sage/mds/mds/events/EString.h b/branches/sage/mds/mds/events/EString.h deleted file mode 100644 index b292f9927d76f..0000000000000 --- a/branches/sage/mds/mds/events/EString.h +++ /dev/null @@ -1,56 +0,0 @@ -// -*- 
mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __ESTRING_H -#define __ESTRING_H - -#include -#include -using namespace std; - -#include "../LogEvent.h" - -// generic log event -class EString : public LogEvent { - protected: - string event; - - public: - EString(string e) : - LogEvent(EVENT_STRING) { - event = e; - } - EString() : - LogEvent(EVENT_STRING) { - } - - void decode_payload(bufferlist& bl, int& off) { - ::_decode(event, bl, off); - } - void encode_payload(bufferlist& bl) { - ::_encode(event, bl); - } - - void print(ostream& out) { - out << '"' << event << '"'; - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/branches/sage/mds/mds/events/ESubtreeMap.h b/branches/sage/mds/mds/events/ESubtreeMap.h deleted file mode 100644 index cb6feb1d92ec6..0000000000000 --- a/branches/sage/mds/mds/events/ESubtreeMap.h +++ /dev/null @@ -1,47 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_ESUBTREEMAP_H -#define __MDS_ESUBTREEMAP_H - -#include "../LogEvent.h" -#include "EMetaBlob.h" - -class ESubtreeMap : public LogEvent { -public: - EMetaBlob metablob; - map > subtrees; - - ESubtreeMap() : LogEvent(EVENT_SUBTREEMAP) { } - - void print(ostream& out) { - out << "subtree_map " << subtrees.size() << " subtrees " - << metablob; - } - - void encode_payload(bufferlist& bl) { - metablob._encode(bl); - ::_encode(subtrees, bl); - } - void decode_payload(bufferlist& bl, int& off) { - metablob._decode(bl, off); - ::_decode(subtrees, bl, off); - } - - //bool has_expired(MDS *mds); - //void expire(MDS *mds, Context *c); - void replay(MDS *mds); -}; - -#endif diff --git a/branches/sage/mds/mds/mdstypes.h b/branches/sage/mds/mds/mdstypes.h deleted file mode 100644 index 84d59bfc01296..0000000000000 --- a/branches/sage/mds/mds/mdstypes.h +++ /dev/null @@ -1,690 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -#ifndef __MDSTYPES_H -#define __MDSTYPES_H - - -#include -#include -#include -#include -using namespace std; - -#include "config.h" -#include "common/DecayCounter.h" -#include "include/Context.h" - -#include - -#include "include/frag.h" -#include "include/xlist.h" - -#define MDS_REF_SET // define me for improved debug output, sanity checking - -#define MDS_PORT_MAIN 0 -#define MDS_PORT_SERVER 1 -#define MDS_PORT_CACHE 2 -#define MDS_PORT_LOCKER 3 -#define MDS_PORT_STORE 4 -#define MDS_PORT_BALANCER 5 -#define MDS_PORT_MIGRATOR 6 -#define MDS_PORT_RENAMER 7 -#define MDS_PORT_ANCHORCLIENT 10 -#define MDS_PORT_ANCHORTABLE 11 - -#define MAX_MDS 0x100 - -#define MDS_INO_ROOT 1 -#define MDS_INO_PGTABLE 2 -#define MDS_INO_ANCHORTABLE 3 -#define MDS_INO_PG 4 // *** WARNING: this should match osd/osd_types.h PG_INO *** -#define MDS_INO_LOG_OFFSET (1*MAX_MDS) -#define MDS_INO_IDS_OFFSET (2*MAX_MDS) -#define MDS_INO_CLIENTMAP_OFFSET (3*MAX_MDS) -#define MDS_INO_STRAY_OFFSET (4*MAX_MDS) 
-#define MDS_INO_BASE (5*MAX_MDS) - -#define MDS_INO_STRAY(x) (MDS_INO_STRAY_OFFSET+((unsigned)x)) -#define MDS_INO_IS_STRAY(i) ((i) >= MDS_INO_STRAY_OFFSET && (i) < MDS_INO_STRAY_OFFSET+MAX_MDS) - -#define MDS_TRAVERSE_FORWARD 1 -#define MDS_TRAVERSE_DISCOVER 2 // skips permissions checks etc. -#define MDS_TRAVERSE_DISCOVERXLOCK 3 // succeeds on (foreign?) null, xlocked dentries. -#define MDS_TRAVERSE_FAIL 4 - - -struct metareqid_t { - entity_name_t name; - uint64_t tid; - metareqid_t() : tid(0) {} - //metareqid_t(int c, tid_t t) : tid(t) { name = entity_name_t::CLIENT(c); } - metareqid_t(entity_name_t n, tid_t t) : name(n), tid(t) {} -}; - -inline ostream& operator<<(ostream& out, const metareqid_t& r) { - return out << r.name << ":" << r.tid; -} - -inline bool operator==(const metareqid_t& l, const metareqid_t& r) { - return (l.name == r.name) && (l.tid == r.tid); -} -inline bool operator!=(const metareqid_t& l, const metareqid_t& r) { - return (l.name != r.name) || (l.tid != r.tid); -} -inline bool operator<(const metareqid_t& l, const metareqid_t& r) { - return (l.name < r.name) || - (l.name == r.name && l.tid < r.tid); -} -inline bool operator<=(const metareqid_t& l, const metareqid_t& r) { - return (l.name < r.name) || - (l.name == r.name && l.tid <= r.tid); -} -inline bool operator>(const metareqid_t& l, const metareqid_t& r) { return !(l <= r); } -inline bool operator>=(const metareqid_t& l, const metareqid_t& r) { return !(l < r); } - -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const metareqid_t &r) const { - hash H; - return H(r.name.num()) ^ H(r.name.type()) ^ H(r.tid); - } - }; -} - - -// inode caps info for client reconnect -struct inode_caps_reconnect_t { - int32_t wanted; - int32_t issued; - off_t size; - utime_t mtime, atime; - - inode_caps_reconnect_t() {} - inode_caps_reconnect_t(int w, int i) : - wanted(w), issued(i), size(0) {} - inode_caps_reconnect_t(int w, int i, off_t sz, utime_t mt, utime_t at) : - wanted(w), 
issued(i), size(sz), mtime(mt), atime(at) {} -}; - - -// ================================================================ -// dir frag - -struct dirfrag_t { - inodeno_t ino; - frag_t frag; - uint32_t _pad; - - dirfrag_t() : ino(0), _pad(0) { } - dirfrag_t(inodeno_t i, frag_t f) : ino(i), frag(f), _pad(0) { } -}; - -inline ostream& operator<<(ostream& out, const dirfrag_t df) { - out << df.ino; - if (!df.frag.is_root()) out << "." << df.frag; - return out; -} -inline bool operator<(dirfrag_t l, dirfrag_t r) { - if (l.ino < r.ino) return true; - if (l.ino == r.ino && l.frag < r.frag) return true; - return false; -} -inline bool operator==(dirfrag_t l, dirfrag_t r) { - return l.ino == r.ino && l.frag == r.frag; -} - -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const dirfrag_t &df) const { - static rjhash H; - static rjhash I; - return H(df.ino) ^ I(df.frag); - } - }; -} - - - -// ================================================================ - -#define META_POP_IRD 0 -#define META_POP_IWR 1 -#define META_POP_READDIR 2 -#define META_POP_FETCH 3 -#define META_POP_STORE 4 -#define META_NPOP 5 - -class inode_load_vec_t { - static const int NUM = 2; - DecayCounter vec[NUM]; -public: - DecayCounter &get(int t) { - assert(t < NUM); - return vec[t]; - } - void zero(utime_t now) { - for (int i=0; i"; -} - -/* -inline mds_load_t& operator+=( mds_load_t& l, mds_load_t& r ) -{ - l.root_pop += r.root_pop; - l.req_rate += r.req_rate; - l.queue_len += r.queue_len; - return l; -} - -inline mds_load_t operator/( mds_load_t& a, double d ) -{ - mds_load_t r; - r.root_pop = a.root_pop / d; - r.req_rate = a.req_rate / d; - r.queue_len = a.queue_len / d; - return r; -} -*/ - - -class load_spread_t { -public: - static const int MAX = 4; - int last[MAX]; - int p, n; - DecayCounter count; - -public: - load_spread_t() : p(0), n(0) { - for (int i=0; i= 0 is the auth mds -#define CDIR_AUTH_PARENT -1 // default -#define CDIR_AUTH_UNKNOWN -2 -#define CDIR_AUTH_DEFAULT 
pair(-1, -2) -#define CDIR_AUTH_UNDEF pair(-2, -2) -//#define CDIR_AUTH_ROOTINODE pair( 0, -2) - - - -// print hack -struct mdsco_db_line_prefix { - MDSCacheObject *object; - mdsco_db_line_prefix(MDSCacheObject *o) : object(o) {} -}; -ostream& operator<<(ostream& out, mdsco_db_line_prefix o); - -// printer -ostream& operator<<(ostream& out, MDSCacheObject &o); - -class MDSCacheObjectInfo { -public: - inodeno_t ino; - dirfrag_t dirfrag; - string dname; - - MDSCacheObjectInfo() : ino(0) {} - - void _encode(bufferlist& bl) const { - ::_encode(ino, bl); - ::_encode(dirfrag, bl); - ::_encode(dname, bl); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(ino, bl, off); - ::_decode(dirfrag, bl, off); - ::_decode(dname, bl, off); - } - void _decode(bufferlist::iterator& p) { - ::_decode_simple(ino, p); - ::_decode_simple(dirfrag, p); - ::_decode_simple(dname, p); - } -}; - - -class MDSCacheObject { - public: - // -- pins -- - const static int PIN_REPLICATED = 1000; - const static int PIN_DIRTY = 1001; - const static int PIN_LOCK = -1002; - const static int PIN_REQUEST = -1003; - const static int PIN_WAITER = 1004; - const static int PIN_DIRTYSCATTERED = 1005; // make this neg if we start using multiple scatterlocks? 
- static const int PIN_AUTHPIN = 1006; - static const int PIN_PTRWAITER = -1007; - const static int PIN_TEMPEXPORTING = 1008; // temp pin between encode_ and finish_export - - const char *generic_pin_name(int p) { - switch (p) { - case PIN_REPLICATED: return "replicated"; - case PIN_DIRTY: return "dirty"; - case PIN_LOCK: return "lock"; - case PIN_REQUEST: return "request"; - case PIN_WAITER: return "waiter"; - case PIN_DIRTYSCATTERED: return "dirtyscattered"; - case PIN_AUTHPIN: return "authpin"; - case PIN_PTRWAITER: return "ptrwaiter"; - case PIN_TEMPEXPORTING: return "tempexporting"; - default: assert(0); return 0; - } - } - - // -- state -- - const static int STATE_AUTH = (1<<30); - const static int STATE_DIRTY = (1<<29); - const static int STATE_REJOINING = (1<<28); // replica has not joined w/ primary copy - - // -- wait -- - const static int WAIT_SINGLEAUTH = (1<<30); - const static int WAIT_UNFREEZE = (1<<29); // pka AUTHPINNABLE - - - // ============================================ - // cons - public: - MDSCacheObject() : - state(0), - ref(0), - replica_nonce(0) {} - virtual ~MDSCacheObject() {} - - // printing - virtual void print(ostream& out) = 0; - virtual ostream& print_db_line_prefix(ostream& out) { - return out << "mdscacheobject(" << this << ") "; - } - - // -------------------------------------------- - // state - protected: - unsigned state; // state bits - - public: - unsigned get_state() const { return state; } - unsigned state_test(unsigned mask) const { return (state & mask); } - void state_clear(unsigned mask) { state &= ~mask; } - void state_set(unsigned mask) { state |= mask; } - void state_reset(unsigned s) { state = s; } - - bool is_auth() const { return state_test(STATE_AUTH); } - bool is_dirty() const { return state_test(STATE_DIRTY); } - bool is_clean() const { return !is_dirty(); } - bool is_rejoining() const { return state_test(STATE_REJOINING); } - - // -------------------------------------------- - // authority - virtual pair 
authority() = 0; - bool is_ambiguous_auth() { - return authority().second != CDIR_AUTH_UNKNOWN; - } - - // -------------------------------------------- - // pins -protected: - int ref; // reference count -#ifdef MDS_REF_SET - multiset ref_set; -#endif - - public: - int get_num_ref() { return ref; } - virtual const char *pin_name(int by) = 0; - //bool is_pinned_by(int by) { return ref_set.count(by); } - //multiset& get_ref_set() { return ref_set; } - - virtual void last_put() {} - virtual void bad_put(int by) { -#ifdef MDS_REF_SET - assert(ref_set.count(by) > 0); -#endif - assert(ref > 0); - } - void put(int by) { -#ifdef MDS_REF_SET - if (ref == 0 || ref_set.count(by) == 0) { -#else - if (ref == 0) { -#endif - bad_put(by); - } else { - ref--; -#ifdef MDS_REF_SET - ref_set.erase(ref_set.find(by)); - assert(ref == (int)ref_set.size()); -#endif - if (ref == 0) - last_put(); - } - } - - virtual void first_get() {} - virtual void bad_get(int by) { -#ifdef MDS_REF_SET - assert(by < 0 || ref_set.count(by) == 0); -#endif - assert(0); - } - void get(int by) { -#ifdef MDS_REF_SET - if (by >= 0 && ref_set.count(by)) { - bad_get(by); - } else { -#endif - if (ref == 0) - first_get(); - ref++; -#ifdef MDS_REF_SET - ref_set.insert(by); - assert(ref == (int)ref_set.size()); - } -#endif - } - - void print_pin_set(ostream& out) { -#ifdef MDS_REF_SET - multiset::iterator it = ref_set.begin(); - while (it != ref_set.end()) { - out << " " << pin_name(*it); - int last = *it; - int c = 1; - do { - it++; - if (it == ref_set.end()) break; - } while (*it == last); - if (c > 1) - out << "*" << c; - } -#endif - } - - - // -------------------------------------------- - // auth pins - virtual bool can_auth_pin() = 0; - virtual void auth_pin() = 0; - virtual void auth_unpin() = 0; - virtual bool is_frozen() = 0; - - - // -------------------------------------------- - // replication - protected: - map replica_map; // [auth] mds -> nonce - int replica_nonce; // [replica] defined on replica - - 
public: - bool is_replicated() { return !replica_map.empty(); } - bool is_replica(int mds) { return replica_map.count(mds); } - int num_replicas() { return replica_map.size(); } - int add_replica(int mds) { - if (replica_map.count(mds)) - return ++replica_map[mds]; // inc nonce - if (replica_map.empty()) - get(PIN_REPLICATED); - return replica_map[mds] = 1; - } - void add_replica(int mds, int nonce) { - if (replica_map.empty()) - get(PIN_REPLICATED); - replica_map[mds] = nonce; - } - int get_replica_nonce(int mds) { - assert(replica_map.count(mds)); - return replica_map[mds]; - } - void remove_replica(int mds) { - assert(replica_map.count(mds)); - replica_map.erase(mds); - if (replica_map.empty()) - put(PIN_REPLICATED); - } - void clear_replica_map() { - if (!replica_map.empty()) - put(PIN_REPLICATED); - replica_map.clear(); - } - map::iterator replicas_begin() { return replica_map.begin(); } - map::iterator replicas_end() { return replica_map.end(); } - const map& get_replicas() { return replica_map; } - void list_replicas(set& ls) { - for (map::const_iterator p = replica_map.begin(); - p != replica_map.end(); - ++p) - ls.insert(p->first); - } - - int get_replica_nonce() { return replica_nonce;} - void set_replica_nonce(int n) { replica_nonce = n; } - - - // --------------------------------------------- - // waiting - protected: - multimap waiting; - - public: - bool is_waiter_for(int mask) { - return waiting.count(mask) > 0; // FIXME: not quite right. 
- } - virtual void add_waiter(int mask, Context *c) { - if (waiting.empty()) - get(PIN_WAITER); - waiting.insert(pair(mask, c)); - pdout(10,g_conf.debug_mds) << (mdsco_db_line_prefix(this)) - << "add_waiter " << hex << mask << dec << " " << c - << " on " << *this - << dendl; - - } - virtual void take_waiting(int mask, list& ls) { - if (waiting.empty()) return; - multimap::iterator it = waiting.begin(); - while (it != waiting.end()) { - if (it->first & mask) { - ls.push_back(it->second); - pdout(10,g_conf.debug_mds) << (mdsco_db_line_prefix(this)) - << "take_waiting mask " << hex << mask << dec << " took " << it->second - << " tag " << it->first - << " on " << *this - << dendl; - waiting.erase(it++); - } else { - pdout(10,g_conf.debug_mds) << "take_waiting mask " << hex << mask << dec << " SKIPPING " << it->second - << " tag " << it->first - << " on " << *this - << dendl; - it++; - } - } - if (waiting.empty()) - put(PIN_WAITER); - } - void finish_waiting(int mask, int result = 0) { - list finished; - take_waiting(mask, finished); - finish_contexts(finished, result); - } - - - // --------------------------------------------- - // locking - // noop unless overloaded. 
- virtual SimpleLock* get_lock(int type) { assert(0); return 0; } - virtual void set_object_info(MDSCacheObjectInfo &info) { assert(0); } - virtual void encode_lock_state(int type, bufferlist& bl) { assert(0); } - virtual void decode_lock_state(int type, bufferlist& bl) { assert(0); } - virtual void finish_lock_waiters(int type, int mask, int r=0) { assert(0); } - virtual void add_lock_waiter(int type, int mask, Context *c) { assert(0); } - virtual bool is_lock_waiting(int type, int mask) { assert(0); return false; } - - virtual void clear_dirty_scattered(int type) { assert(0); } - - // --------------------------------------------- - // ordering - virtual bool is_lt(const MDSCacheObject *r) const = 0; - struct ptr_lt { - bool operator()(const MDSCacheObject* l, const MDSCacheObject* r) const { - return l->is_lt(r); - } - }; - -}; - -inline ostream& operator<<(ostream& out, MDSCacheObject &o) { - o.print(out); - return out; -} - -inline ostream& operator<<(ostream& out, const MDSCacheObjectInfo &info) { - if (info.ino) return out << info.ino; - if (info.dname.length()) return out << info.dirfrag << "/" << info.dname; - return out << info.dirfrag; -} - -inline ostream& operator<<(ostream& out, mdsco_db_line_prefix o) { - o.object->print_db_line_prefix(out); - return out; -} - - -#endif diff --git a/branches/sage/mds/messages/MAnchor.h b/branches/sage/mds/messages/MAnchor.h deleted file mode 100644 index 6ceb8981244fa..0000000000000 --- a/branches/sage/mds/messages/MAnchor.h +++ /dev/null @@ -1,74 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MANCHORREQUEST_H -#define __MANCHORREQUEST_H - -#include - -#include "msg/Message.h" -#include "mds/Anchor.h" - - -class MAnchor : public Message { - int op; - inodeno_t ino; - vector trace; - version_t atid; // anchor table version. - - public: - MAnchor() {} - MAnchor(int o, inodeno_t i, version_t v=0) : - Message(MSG_MDS_ANCHOR), - op(o), ino(i), atid(v) { } - - virtual char *get_type_name() { return "anchor"; } - void print(ostream& o) { - o << "anchor(" << get_anchor_opname(op); - if (ino) o << " " << ino; - if (atid) o << " atid " << atid; - if (!trace.empty()) o << ' ' << trace; - o << ")"; - } - - void set_trace(vector& trace) { - this->trace = trace; - } - - int get_op() { return op; } - inodeno_t get_ino() { return ino; } - vector& get_trace() { return trace; } - version_t get_atid() { return atid; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(op), (char*)&op); - off += sizeof(op); - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - payload.copy(off, sizeof(atid), (char*)&atid); - off += sizeof(atid); - ::_decode(trace, payload, off); - } - - virtual void encode_payload() { - payload.append((char*)&op, sizeof(op)); - payload.append((char*)&ino, sizeof(ino)); - payload.append((char*)&atid, sizeof(atid)); - ::_encode(trace, payload); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MCacheExpire.h b/branches/sage/mds/messages/MCacheExpire.h deleted file mode 100644 index 015aa562038a7..0000000000000 --- a/branches/sage/mds/messages/MCacheExpire.h +++ /dev/null @@ -1,127 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. 
See file COPYING. - * - */ - -#ifndef __MCACHEEXPIRE_H -#define __MCACHEEXPIRE_H - -class MCacheExpire : public Message { - int from; - -public: - /* - group things by realm (auth delgation root), since that's how auth is determined. - that makes it less work to process when exports are in progress. - */ - struct realm { - map inodes; - map dirs; - map > dentries; - }; - map realms; - - int get_from() { return from; } - - MCacheExpire() {} - MCacheExpire(int f) : - Message(MSG_MDS_CACHEEXPIRE), - from(f) { } - - virtual char *get_type_name() { return "CEx";} - - void add_inode(dirfrag_t r, inodeno_t ino, int nonce) { - realms[r].inodes[ino] = nonce; - } - void add_dir(dirfrag_t r, dirfrag_t df, int nonce) { - realms[r].dirs[df] = nonce; - } - void add_dentry(dirfrag_t r, dirfrag_t df, const string& dn, int nonce) { - realms[r].dentries[df][dn] = nonce; - } - - void add_realm(dirfrag_t df, realm& r) { - realm& myr = realms[df]; - for (map::iterator p = r.inodes.begin(); - p != r.inodes.end(); - ++p) - myr.inodes[p->first] = p->second; - for (map::iterator p = r.dirs.begin(); - p != r.dirs.end(); - ++p) - myr.dirs[p->first] = p->second; - for (map >::iterator p = r.dentries.begin(); - p != r.dentries.end(); - ++p) - for (map::iterator q = p->second.begin(); - q != p->second.end(); - ++q) - myr.dentries[p->first][q->first] = q->second; - } - - void decode_payload() { - int off = 0; - - payload.copy(off, sizeof(from), (char*)&from); - off += sizeof(from); - - int nr; - payload.copy(off, sizeof(nr), (char*)&nr); - off += sizeof(nr); - - while (nr--) { - dirfrag_t r; - payload.copy(off, sizeof(r), (char*)&r); - off += sizeof(r); - - ::_decode(realms[r].inodes, payload, off); - ::_decode(realms[r].dirs, payload, off); - - int n; - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i::iterator q = realms.begin(); - q != realms.end(); - ++q) { - payload.append((char*)&q->first, sizeof(q->first)); - - ::_encode(q->second.inodes, payload); - 
::_encode(q->second.dirs, payload); - - int n = q->second.dentries.size(); - payload.append((char*)&n, sizeof(n)); - for (map >::iterator p = q->second.dentries.begin(); - p != q->second.dentries.end(); - ++p) { - payload.append((char*)&p->first, sizeof(p->first)); - ::_encode(p->second, payload); - } - } - } -}; - -#endif diff --git a/branches/sage/mds/messages/MClientFileCaps.h b/branches/sage/mds/messages/MClientFileCaps.h deleted file mode 100644 index 758205902b41b..0000000000000 --- a/branches/sage/mds/messages/MClientFileCaps.h +++ /dev/null @@ -1,115 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MCLIENTFILECAPS_H -#define __MCLIENTFILECAPS_H - -#include "msg/Message.h" -#include "mds/Capability.h" - - -class MClientFileCaps : public Message { - public: - static const int OP_GRANT = 0; // mds->client grant. - static const int OP_ACK = 1; // client->mds ack (if prior grant was a recall) - static const int OP_RELEASE = 2; // mds->client release cap (*) - static const int OP_EXPORT = 3; // mds has exported the cap - static const int OP_IMPORT = 4; // mds has imported the cap from get_mds() - /* - * (*) it's a bit counterintuitive, but the mds has to - * close the cap because the client isn't able to tell - * if a concurrent open() would map to the same inode. 
- */ - static const char* get_opname(int op) { - switch (op) { - case OP_GRANT: return "grant"; - case OP_ACK: return "ack"; - case OP_RELEASE: return "release"; - case OP_EXPORT: return "export"; - case OP_IMPORT: return "import"; - default: assert(0); return 0; - } - } - - private: - int32_t op; - inode_t inode; - capseq_t seq; - int32_t caps; - int32_t wanted; - - int32_t mds; - - public: - inodeno_t get_ino() { return inode.ino; } - inode_t& get_inode() { return inode; } - int get_caps() { return caps; } - int get_wanted() { return wanted; } - capseq_t get_seq() { return seq; } - - // for cap migration - int get_mds() { return mds; } - int get_op() { return op; } - - void set_caps(int c) { caps = c; } - void set_wanted(int w) { wanted = w; } - - void set_mds(int m) { mds = m; } - void set_op(int s) { op = s; } - - MClientFileCaps() {} - MClientFileCaps(int op_, - inode_t& inode_, - long seq_, - int caps_, - int wanted_, - int mds_=0) : - Message(MSG_CLIENT_FILECAPS), - op(op_), - inode(inode_), - seq(seq_), - caps(caps_), - wanted(wanted_), - mds(mds_) { } - - char *get_type_name() { return "Cfcap";} - void print(ostream& out) { - out << "client_file_caps(" << get_opname(op) - << " " << inode.ino - << " seq " << seq - << " caps " << cap_string(caps) - << " wanted" << cap_string(wanted) - << ")"; - } - - void decode_payload() { - int off = 0; - ::_decode(op, payload, off); - ::_decode(seq, payload, off); - ::_decode(inode, payload, off); - ::_decode(caps, payload, off); - ::_decode(wanted, payload, off); - ::_decode(mds, payload, off); - } - void encode_payload() { - ::_encode(op, payload); - ::_encode(seq, payload); - ::_encode(inode, payload); - ::_encode(caps, payload); - ::_encode(wanted, payload); - ::_encode(mds, payload); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MClientMount.h b/branches/sage/mds/messages/MClientMount.h deleted file mode 100644 index a49b558c7f040..0000000000000 --- a/branches/sage/mds/messages/MClientMount.h +++ 
/dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MCLIENTMOUNT_H -#define __MCLIENTMOUNT_H - -#include "msg/Message.h" - -class MClientMount : public Message { -public: - entity_addr_t addr; - int32_t instance; // on this node - - MClientMount() : Message(MSG_CLIENT_MOUNT) { } - MClientMount(entity_addr_t a, int i = 0) : - Message(MSG_CLIENT_MOUNT), - addr(a), instance(i) { } - - char *get_type_name() { return "client_mount"; } - - void decode_payload() { - int off = 0; - ::_decode(addr, payload, off); - ::_decode(instance, payload, off); - } - void encode_payload() { - ::_encode(addr, payload); - ::_encode(instance, payload); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MClientReconnect.h b/branches/sage/mds/messages/MClientReconnect.h deleted file mode 100644 index bf1fbacd4b75c..0000000000000 --- a/branches/sage/mds/messages/MClientReconnect.h +++ /dev/null @@ -1,59 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MCLIENTRECONNECT_H -#define __MCLIENTRECONNECT_H - -#include "msg/Message.h" -#include "mds/mdstypes.h" - -class MClientReconnect : public Message { -public: - map inode_caps; - map inode_path; - bool closed; // true if this session was closed by the client. 
- - MClientReconnect() : Message(MSG_CLIENT_RECONNECT), - closed(false) { } - - char *get_type_name() { return "client_reconnect"; } - void print(ostream& out) { - out << "client_reconnect(" << inode_caps.size() << " caps)"; - } - - void add_inode_caps(inodeno_t ino, - int wanted, int issued, - off_t sz, utime_t mt, utime_t at) { - inode_caps[ino] = inode_caps_reconnect_t(wanted, issued, sz, mt, at); - } - void add_inode_path(inodeno_t ino, const string& path) { - inode_path[ino] = path; - } - - void encode_payload() { - ::_encode(closed, payload); - ::_encode(inode_caps, payload); - ::_encode(inode_path, payload); - } - void decode_payload() { - int off = 0; - ::_decode(closed, payload, off); - ::_decode(inode_caps, payload, off); - ::_decode(inode_path, payload, off); - } - -}; - - -#endif diff --git a/branches/sage/mds/messages/MClientReply.h b/branches/sage/mds/messages/MClientReply.h deleted file mode 100644 index f50384000f2f5..0000000000000 --- a/branches/sage/mds/messages/MClientReply.h +++ /dev/null @@ -1,289 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MCLIENTREPLY_H -#define __MCLIENTREPLY_H - -#include "include/types.h" -#include "include/encodable.h" -#include "MClientRequest.h" - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "mds/CDir.h" -#include "mds/CDentry.h" - -#include -using namespace std; - -class CInode; - -/*** - * - * MClientReply - container message for MDS reply to a client's MClientRequest - * - * key fields: - * long tid - transaction id, so the client can match up with pending request - * int result - error code, or fh if it was open - * - * for most requests: - * trace is a vector of InodeStat's tracing from root to the file/dir/whatever - * the operation referred to, so that the client can update it's info about what - * metadata lives on what MDS. - * - * for readdir replies: - * dir_contents is a vector of InodeStat*'s. - * - * that's mostly it, i think! - * - */ - -struct DirStat { - // mds distribution hints - frag_t frag; - int auth; - set dist; - bool is_rep; - - DirStat() {} - DirStat(bufferlist::iterator& p) { - _decode(p); - } - - void _decode(bufferlist::iterator& p) { - ::_decode_simple(frag, p); - ::_decode_simple(auth, p); - ::_decode_simple(dist, p); - ::_decode_simple(is_rep, p); - } - - static void _encode(bufferlist& bl, CDir *dir, int whoami) { - frag_t frag = dir->get_frag(); - int auth; - set dist; - bool is_rep; - - auth = dir->get_dir_auth().first; - if (dir->is_auth()) - dir->get_dist_spec(dist, whoami); - is_rep = dir->is_rep(); - - ::_encode_simple(frag, bl); - ::_encode_simple(auth, bl); - ::_encode_simple(dist, bl); - ::_encode_simple(is_rep, bl); - } -}; - -struct InodeStat { - inode_t inode; - string symlink; // symlink content (if symlink) - fragtree_t dirfragtree; - uint32_t mask; - - public: - InodeStat() {} - InodeStat(bufferlist::iterator& p) { - _decode(p); - } - - void _decode(bufferlist::iterator &p) { - ::_decode_simple(mask, p); - ::_decode_simple(inode, p); - ::_decode_simple(symlink, p); - dirfragtree._decode(p); 
- } - - static void _encode(bufferlist &bl, CInode *in) { - int mask = STAT_MASK_INO|STAT_MASK_TYPE|STAT_MASK_BASE; - - // mask - if (in->authlock.can_rdlock(0)) mask |= STAT_MASK_AUTH; - if (in->linklock.can_rdlock(0)) mask |= STAT_MASK_LINK; - if (in->filelock.can_rdlock(0)) mask |= STAT_MASK_FILE; - - ::_encode_simple(mask, bl); - ::_encode_simple(in->inode, bl); - ::_encode_simple(in->symlink, bl); - in->dirfragtree._encode(bl); - } - -}; - - -class MClientReply : public Message { - // reply data - struct st_ { - long tid; - epoch_t mdsmap_epoch; - int op; - int result; // error code - unsigned char file_caps; // for open - long file_caps_seq; - uint64_t file_data_version; // for client buffercache consistency - } st; - - string path; - - list trace_in; - list trace_dir; - list trace_dn; - bufferlist trace_bl; - - DirStat *dir_dir; - list dir_in; - list dir_dn; - bufferlist dir_bl; - - public: - long get_tid() { return st.tid; } - int get_op() { return st.op; } - - void set_mdsmap_epoch(epoch_t e) { st.mdsmap_epoch = e; } - epoch_t get_mdsmap_epoch() { return st.mdsmap_epoch; } - - int get_result() { return st.result; } - const string& get_path() { return path; } - - inodeno_t get_ino() { return trace_in.back()->inode.ino; } - const inode_t& get_inode() { return trace_in.back()->inode; } - - unsigned char get_file_caps() { return st.file_caps; } - long get_file_caps_seq() { return st.file_caps_seq; } - uint64_t get_file_data_version() { return st.file_data_version; } - - void set_result(int r) { st.result = r; } - void set_file_caps(unsigned char c) { st.file_caps = c; } - void set_file_caps_seq(long s) { st.file_caps_seq = s; } - void set_file_data_version(uint64_t v) { st.file_data_version = v; } - - MClientReply() : dir_dir(0) {}; - MClientReply(MClientRequest *req, int result = 0) : - Message(MSG_CLIENT_REPLY), dir_dir(0) { - memset(&st, 0, sizeof(st)); - this->st.tid = req->get_tid(); - this->st.op = req->get_op(); - this->path = req->get_path(); - - 
this->st.result = result; - } - virtual ~MClientReply() { - list::iterator it; - - for (it = trace_in.begin(); it != trace_in.end(); ++it) - delete *it; - for (it = dir_in.begin(); it != dir_in.end(); ++it) - delete *it; - } - virtual char *get_type_name() { return "creply"; } - void print(ostream& o) { - o << "creply(" << env.dst.name << "." << st.tid; - o << " = " << st.result; - if (st.result <= 0) - o << " " << strerror(-st.result); - o << ")"; - } - - // serialization - virtual void decode_payload() { - bufferlist::iterator p = payload.begin(); - ::_decode_simple(st, p); - ::_decode_simple(path, p); - ::_decode_simple(trace_bl, p); - ::_decode_simple(dir_bl, p); - assert(p.end()); - } - virtual void encode_payload() { - ::_encode_simple(st, payload); - ::_encode_simple(path, payload); - ::_encode_simple(trace_bl, payload); - ::_encode_simple(dir_bl, payload); - } - - - // dir contents - void take_dir_items(bufferlist& bl) { - dir_bl.claim(bl); - } - void _decode_dir() { - bufferlist::iterator p = dir_bl.begin(); - dir_dir = new DirStat(p); - while (!p.end()) { - string dn; - ::_decode_simple(dn, p); - dir_dn.push_back(dn); - dir_in.push_back(new InodeStat(p)); - } - } - - const list& get_dir_in() { - if (dir_in.empty() && dir_bl.length()) _decode_dir(); - return dir_in; - } - const list& get_dir_dn() { - if (dir_dn.empty() && dir_bl.length()) _decode_dir(); - return dir_dn; - } - const DirStat* get_dir_dir() { - return dir_dir; - } - - - // trace - void set_trace_dist(CInode *in, int whoami) { - // inode, dentry, dir, ..., inode - while (in) { - InodeStat::_encode(trace_bl, in); - CDentry *dn = in->get_parent_dn(); - if (!dn) break; - ::_encode_simple(in->get_parent_dn()->get_name(), trace_bl); - DirStat::_encode(trace_bl, dn->get_dir(), whoami); - in = dn->get_dir()->get_inode(); - } - } - void _decode_trace() { - bufferlist::iterator p = trace_bl.begin(); - while (!p.end()) { - // inode - trace_in.push_front(new InodeStat(p)); - if (!p.end()) { - // dentry - 
string ref_dn; - ::_decode_simple(ref_dn, p); - trace_dn.push_front(ref_dn); - - // dir - trace_dir.push_front(new DirStat(p)); - } - } - } - - const list& get_trace_in() { - if (trace_in.empty() && trace_bl.length()) _decode_trace(); - return trace_in; - } - const list& get_trace_dir() { - if (trace_in.empty() && trace_bl.length()) _decode_trace(); - return trace_dir; - } - const list& get_trace_dn() { - if (trace_in.empty() && trace_bl.length()) _decode_trace(); - return trace_dn; - } - - -}; - -#endif diff --git a/branches/sage/mds/messages/MClientRequest.h b/branches/sage/mds/messages/MClientRequest.h deleted file mode 100644 index d0c4a57a4af8a..0000000000000 --- a/branches/sage/mds/messages/MClientRequest.h +++ /dev/null @@ -1,331 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MCLIENTREQUEST_H -#define __MCLIENTREQUEST_H - -/** - * - * MClientRequest - container for a client METADATA request. created/sent by clients. - * can be forwarded around between MDS's. - * - * int client - the originating client - * long tid - transaction id, unique among requests for that client. probably just a counter! - * -> the MDS passes the Request to the Reply constructor, so this always matches. - * - * int op - the metadata op code. MDS_OP_RENAME, etc. - * int caller_uid, _gid - guess - * - * fixed size arguments are in a union. - * there's also a string argument, for e.g. symlink(). - * - */ - -#include "msg/Message.h" -#include "include/filepath.h" -#include "mds/mdstypes.h" - -#include -#include -#include -#include -#include - - -// metadata ops. 
-// >=1000 --> an update, non-idempotent (i.e. an update) -#define MDS_OP_STATFS 1 - -#define MDS_OP_STAT 100 -#define MDS_OP_LSTAT 101 -#define MDS_OP_FSTAT 102 -#define MDS_OP_UTIME 1102 -#define MDS_OP_CHMOD 1104 -#define MDS_OP_CHOWN 1105 - -#define MDS_OP_READDIR 200 -#define MDS_OP_MKNOD 1201 -#define MDS_OP_LINK 1202 -#define MDS_OP_UNLINK 1203 -#define MDS_OP_RENAME 1204 - -#define MDS_OP_MKDIR 1220 -#define MDS_OP_RMDIR 1221 -#define MDS_OP_SYMLINK 1222 - -#define MDS_OP_OPEN 301 -#define MDS_OP_TRUNCATE 1306 -#define MDS_OP_FSYNC 307 - -#define MDS_OP_RELEASE 308 // used only by SyntheticClient op_dist thinger - - -class MClientRequest : public Message { - struct { - tid_t tid; - tid_t oldest_client_tid; - int num_fwd; - int retry_attempt; - inodeno_t mds_wants_replica_in_dirino; - - entity_inst_t client_inst; - epoch_t mdsmap_epoch; - - int op; - int caller_uid, caller_gid; - inodeno_t cwd_ino; - - } st; - - // path arguments - filepath path, path2; - - public: - // fixed size arguments. in a union. - // note: nothing with a constructor can go here; use underlying base - // types for _inodeno_t, _frag_t. 
- union { - struct { - int mask; - } stat; - struct { - _inodeno_t ino; - int mask; - } fstat; - struct { - _frag_t frag; - } readdir; - struct { - _utime_t mtime; - _utime_t atime; - } utime; - struct { - mode_t mode; - } chmod; - struct { - uid_t uid; - gid_t gid; - } chown; - struct { - mode_t mode; - dev_t rdev; - } mknod; - struct { - mode_t mode; - } mkdir; - struct { - int flags; - mode_t mode; - } open; - struct { - _inodeno_t ino; // optional - off_t length; - } truncate; - struct { - _inodeno_t ino; - } fsync; - } args; - - // cons - MClientRequest() : Message(MSG_CLIENT_REQUEST) {} - MClientRequest(int op, entity_inst_t ci) : Message(MSG_CLIENT_REQUEST) { - memset(&st, 0, sizeof(st)); - memset(&args, 0, sizeof(args)); - this->st.op = op; - this->st.client_inst = ci; - } - - void set_mdsmap_epoch(epoch_t e) { st.mdsmap_epoch = e; } - epoch_t get_mdsmap_epoch() { return st.mdsmap_epoch; } - - metareqid_t get_reqid() { - // FIXME: for now, assume clients always have 1 incarnation - return metareqid_t(st.client_inst.name, st.tid); - } - - int get_open_file_mode() { - if (args.open.flags & O_LAZY) - return FILE_MODE_LAZY; - if (args.open.flags & O_WRONLY) - return FILE_MODE_W; - if (args.open.flags & O_RDWR) - return FILE_MODE_RW; - if (args.open.flags & O_APPEND) - return FILE_MODE_W; - return FILE_MODE_R; - } - bool open_file_mode_is_readonly() { - return get_open_file_mode() == FILE_MODE_R; - } - bool is_idempotent() { - if (st.op == MDS_OP_OPEN) - return open_file_mode_is_readonly(); - return (st.op < 1000); - } - bool auth_is_best() { - if (!is_idempotent()) return true; - if (st.op == MDS_OP_READDIR) return true; - return false; - } - bool follow_trailing_symlink() { - switch (st.op) { - case MDS_OP_LSTAT: - case MDS_OP_FSTAT: - case MDS_OP_LINK: - case MDS_OP_UNLINK: - case MDS_OP_RENAME: - return false; - - case MDS_OP_STAT: - case MDS_OP_UTIME: - case MDS_OP_CHMOD: - case MDS_OP_CHOWN: - case MDS_OP_READDIR: - case MDS_OP_OPEN: - case 
MDS_OP_TRUNCATE: - - case MDS_OP_FSYNC: - case MDS_OP_MKNOD: - case MDS_OP_MKDIR: - case MDS_OP_RMDIR: - case MDS_OP_SYMLINK: - return true; - - default: - assert(0); - return false; - } - } - - - - // normal fields - void set_tid(tid_t t) { st.tid = t; } - void set_oldest_client_tid(tid_t t) { st.oldest_client_tid = t; } - void inc_num_fwd() { st.num_fwd++; } - void set_retry_attempt(int a) { st.retry_attempt = a; } - void set_path(string& p) { path.set_path(p); } - void set_path(const char *p) { path.set_path(p); } - void set_filepath(const filepath& fp) { path = fp; } - void set_path2(string& p) { path2.set_path(p); } - void set_path2(const char *p) { path2.set_path(p); } - void set_filepath2(const filepath& fp) { path2 = fp; } - void set_caller_uid(int u) { st.caller_uid = u; } - void set_caller_gid(int g) { st.caller_gid = g; } - void set_mds_wants_replica_in_dirino(inodeno_t dirino) { - st.mds_wants_replica_in_dirino = dirino; } - - void set_client_inst(const entity_inst_t& i) { st.client_inst = i; } - const entity_inst_t& get_client_inst() { return st.client_inst; } - entity_name_t get_client() { return st.client_inst.name; } - - tid_t get_tid() { return st.tid; } - tid_t get_oldest_client_tid() { return st.oldest_client_tid; } - int get_num_fwd() { return st.num_fwd; } - int get_retry_attempt() { return st.retry_attempt; } - int get_op() { return st.op; } - int get_caller_uid() { return st.caller_uid; } - int get_caller_gid() { return st.caller_gid; } - const string& get_path() { return path.get_path(); } - filepath& get_filepath() { return path; } - const string& get_path2() { return path.get_path(); } - filepath& get_filepath2() { return path2; } - - inodeno_t get_mds_wants_replica_in_dirino() { - return st.mds_wants_replica_in_dirino; } - - inodeno_t get_cwd_ino() { return st.cwd_ino ? 
st.cwd_ino:inodeno_t(MDS_INO_ROOT); } - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - payload.copy(off, sizeof(args), (char*)&args); - off += sizeof(args); - path._decode(payload, off); - path2._decode(payload, off); - } - - void encode_payload() { - payload.append((char*)&st, sizeof(st)); - payload.append((char*)&args, sizeof(args)); - path._encode(payload); - path2._encode(payload); - } - - char *get_type_name() { return "creq"; } - void print(ostream& out) { - out << "clientreq(" << get_client() - << "." << get_tid() - << " "; - switch(get_op()) { - case MDS_OP_STATFS: - out << "statfs"; break; - - case MDS_OP_STAT: - out << "stat"; break; - case MDS_OP_LSTAT: - out << "lstat"; break; - case MDS_OP_FSTAT: - out << "fstat"; break; - case MDS_OP_UTIME: - out << "utime"; break; - case MDS_OP_CHMOD: - out << "chmod"; break; - case MDS_OP_CHOWN: - out << "chown"; break; - - case MDS_OP_READDIR: - out << "readdir"; break; - case MDS_OP_MKNOD: - out << "mknod"; break; - case MDS_OP_LINK: - out << "link"; break; - case MDS_OP_UNLINK: - out << "unlink"; break; - case MDS_OP_RENAME: - out << "rename"; break; - - case MDS_OP_MKDIR: - out << "mkdir"; break; - case MDS_OP_RMDIR: - out << "rmdir"; break; - case MDS_OP_SYMLINK: - out << "symlink"; break; - - case MDS_OP_OPEN: - out << "open"; break; - case MDS_OP_TRUNCATE: - out << "truncate"; break; - case MDS_OP_FSYNC: - out << "fsync"; break; - // case MDS_OP_RELEASE: - //out << "release"; break; - default: - out << "unknown=" << get_op(); - assert(0); - } - if (!get_filepath().empty()) - out << " " << get_filepath(); - if (!get_filepath2().empty()) - out << " " << get_filepath2(); - if (st.retry_attempt) - out << " RETRY=" << st.retry_attempt; - out << ")"; - } - -}; - -#endif diff --git a/branches/sage/mds/messages/MClientRequestForward.h b/branches/sage/mds/messages/MClientRequestForward.h deleted file mode 100644 index 53fb5270d30a9..0000000000000 --- 
a/branches/sage/mds/messages/MClientRequestForward.h +++ /dev/null @@ -1,59 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MCLIENTREQUESTFORWARD_H -#define __MCLIENTREQUESTFORWARD_H - -class MClientRequestForward : public Message { - tid_t tid; - int32_t dest_mds; - int32_t num_fwd; - - public: - MClientRequestForward() : Message(MSG_CLIENT_REQUEST_FORWARD) {} - MClientRequestForward(tid_t t, int dm, int nf) : - Message(MSG_CLIENT_REQUEST_FORWARD), - tid(t), dest_mds(dm), num_fwd(nf) { } - - tid_t get_tid() { return tid; } - int get_dest_mds() { return dest_mds; } - int get_num_fwd() { return num_fwd; } - - char *get_type_name() { return "cfwd"; } - void print(ostream& o) { - o << "client_request_forward(" << tid - << " to " << dest_mds - << " num_fwd=" << num_fwd - << ")"; - } - - void encode_payload() { - payload.append((char*)&tid, sizeof(tid)); - payload.append((char*)&dest_mds, sizeof(dest_mds)); - payload.append((char*)&num_fwd, sizeof(num_fwd)); - } - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(tid), (char*)&tid); - off += sizeof(tid); - payload.copy(off, sizeof(dest_mds), (char*)&dest_mds); - off += sizeof(dest_mds); - payload.copy(off, sizeof(num_fwd), (char*)&num_fwd); - off += sizeof(num_fwd); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MClientSession.h b/branches/sage/mds/messages/MClientSession.h deleted file mode 100644 index dc4252ac73d8e..0000000000000 --- a/branches/sage/mds/messages/MClientSession.h +++ /dev/null @@ -1,62 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 
smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MCLIENTSESSION_H -#define __MCLIENTSESSION_H - -#include "msg/Message.h" - -class MClientSession : public Message { -public: - const static int OP_REQUEST_OPEN = 1; - const static int OP_OPEN = 2; - const static int OP_REQUEST_CLOSE = 3; - const static int OP_CLOSE = 4; - static const char *get_opname(int o) { - switch (o) { - case OP_REQUEST_OPEN: return "request_open"; - case OP_OPEN: return "open"; - case OP_REQUEST_CLOSE: return "request_close"; - case OP_CLOSE: return "close"; - default: assert(0); return 0; - } - } - - int32_t op; - version_t seq; - - MClientSession() : Message(MSG_CLIENT_SESSION) { } - MClientSession(int o, version_t s=0) : - Message(MSG_CLIENT_SESSION), - op(o), seq(s) { } - - char *get_type_name() { return "client_session"; } - void print(ostream& out) { - out << "client_session(" << get_opname(op); - if (seq) out << " seq " << seq; - out << ")"; - } - - void decode_payload() { - int off = 0; - ::_decode(op, payload, off); - ::_decode(seq, payload, off); - } - void encode_payload() { - ::_encode(op, payload); - ::_encode(seq, payload); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MClientUnmount.h b/branches/sage/mds/messages/MClientUnmount.h deleted file mode 100644 index 42fa07db7ba05..0000000000000 --- a/branches/sage/mds/messages/MClientUnmount.h +++ /dev/null @@ -1,40 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * 
License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MCLIENTUNMOUNT_H -#define __MCLIENTUNMOUNT_H - -#include "msg/Message.h" - -class MClientUnmount : public Message { -public: - entity_inst_t inst; - - MClientUnmount() : Message(MSG_CLIENT_UNMOUNT) { } - MClientUnmount(entity_inst_t i) : - Message(MSG_CLIENT_UNMOUNT), - inst(i) { } - - char *get_type_name() { return "client_unmount"; } - - void decode_payload() { - int off = 0; - ::_decode(inst, payload, off); - } - void encode_payload() { - ::_encode(inst, payload); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MDentryUnlink.h b/branches/sage/mds/messages/MDentryUnlink.h deleted file mode 100644 index 6e24d6f45410f..0000000000000 --- a/branches/sage/mds/messages/MDentryUnlink.h +++ /dev/null @@ -1,82 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MDENTRYUNLINK_H -#define __MDENTRYUNLINK_H - -class MDentryUnlink : public Message { - dirfrag_t dirfrag; - string dn; - - public: - dirfrag_t get_dirfrag() { return dirfrag; } - string& get_dn() { return dn; } - - CInodeDiscover *strayin; - CDirDiscover *straydir; - CDentryDiscover *straydn; - - MDentryUnlink() : - Message(MSG_MDS_DENTRYUNLINK), - strayin(0), straydir(0), straydn(0) { } - MDentryUnlink(dirfrag_t df, string& n) : - Message(MSG_MDS_DENTRYUNLINK), - dirfrag(df), - dn(n), - strayin(0), straydir(0), straydn(0) { } - ~MDentryUnlink() { - delete strayin; - delete straydir; - delete straydn; - } - - char *get_type_name() { return "dentry_unlink";} - void print(ostream& o) { - o << "dentry_unlink(" << dirfrag << " " << dn << ")"; - } - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - ::_decode(dn, payload, off); - - bool isstray; - payload.copy(off, sizeof(isstray), (char*)&isstray); - off += sizeof(isstray); - if (isstray) { - strayin = new CInodeDiscover; - strayin->_decode(payload, off); - straydir = new CDirDiscover; - straydir->_decode(payload, off); - straydn = new CDentryDiscover; - straydn->_decode(payload, off); - } - } - void encode_payload() { - payload.append((char*)&dirfrag,sizeof(dirfrag)); - ::_encode(dn, payload); - - bool isstray = strayin ? 
true:false; - payload.append((char*)&isstray, sizeof(isstray)); - if (isstray) { - strayin->_encode(payload); - straydir->_encode(payload); - straydn->_encode(payload); - } - } -}; - -#endif diff --git a/branches/sage/mds/messages/MDiscover.h b/branches/sage/mds/messages/MDiscover.h deleted file mode 100644 index 7294bad22d796..0000000000000 --- a/branches/sage/mds/messages/MDiscover.h +++ /dev/null @@ -1,108 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MDISCOVER_H -#define __MDISCOVER_H - -#include "msg/Message.h" -#include "mds/CDir.h" -#include "include/filepath.h" - -#include -#include -using namespace std; - - -class MDiscover : public Message { - int asker; - inodeno_t base_ino; // 1 -> root - frag_t base_dir_frag; - - filepath want; // ... 
[/]need/this/stuff - inodeno_t want_ino; - - bool want_base_dir; - bool want_xlocked; - - public: - int get_asker() { return asker; } - inodeno_t get_base_ino() { return base_ino; } - frag_t get_base_dir_frag() { return base_dir_frag; } - - filepath& get_want() { return want; } - inodeno_t get_want_ino() { return want_ino; } - const string& get_dentry(int n) { return want[n]; } - - bool wants_base_dir() { return want_base_dir; } - bool wants_xlocked() { return want_xlocked; } - - void set_base_dir_frag(frag_t f) { base_dir_frag = f; } - - MDiscover() { } - MDiscover(int asker_, - inodeno_t base_ino_, - filepath& want_, - bool want_base_dir_ = true, - bool discover_xlocks_ = false) : - Message(MSG_MDS_DISCOVER), - asker(asker_), - base_ino(base_ino_), - want(want_), - want_ino(0), - want_base_dir(want_base_dir_), - want_xlocked(discover_xlocks_) { } - MDiscover(int asker_, - dirfrag_t base_dirfrag, - inodeno_t want_ino_, - bool want_base_dir_ = true) : - Message(MSG_MDS_DISCOVER), - asker(asker_), - base_ino(base_dirfrag.ino), - base_dir_frag(base_dirfrag.frag), - want_ino(want_ino_), - want_base_dir(want_base_dir_), - want_xlocked(false) { } - - char *get_type_name() { return "Dis"; } - void print(ostream &out) { - out << "discover(" << base_ino << "." 
<< base_dir_frag - << " " << want; - if (want_ino) out << want_ino; - out << ")"; - } - - void decode_payload() { - int off = 0; - ::_decode(asker, payload, off); - ::_decode(base_ino, payload, off); - ::_decode(base_dir_frag, payload, off); - want._decode(payload, off); - ::_decode(want_ino, payload, off); - ::_decode(want_base_dir, payload, off); - ::_decode(want_xlocked, payload, off); - } - void encode_payload() { - ::_encode(asker, payload); - ::_encode(base_ino, payload); - ::_encode(base_dir_frag, payload); - want._encode(payload); - ::_encode(want_ino, payload); - ::_encode(want_base_dir, payload); - ::_encode(want_xlocked, payload); - } - -}; - -#endif diff --git a/branches/sage/mds/messages/MDiscoverReply.h b/branches/sage/mds/messages/MDiscoverReply.h deleted file mode 100644 index 67491049c0b8f..0000000000000 --- a/branches/sage/mds/messages/MDiscoverReply.h +++ /dev/null @@ -1,300 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MDISCOVERREPLY_H -#define __MDISCOVERREPLY_H - -#include "msg/Message.h" -#include "mds/CDir.h" -#include "mds/CInode.h" -#include "include/filepath.h" - -#include -#include -using namespace std; - -#define max(a,b) ((a)>(b) ? (a):(b)) - - -/** - * MDiscoverReply - return new replicas (of inodes, dirs, dentries) - * - * we group returned items by (dir, dentry, inode). each - * item in each set shares an index (it's "depth"). - * - * we can start and end with any type. 
- * no_base_dir = true if the first group has an inode but no dir - * no_base_dentry = true if the first group has an inode but no dentry - * they are false if there is no returned data, ie the first group is empty. - * - * we also return errors: - * error_flag_dn(string) - the specified dentry dne - * error_flag_dir - the last item wasn't a dir, so we couldn't continue. - * - * and sometimes, - * dir_auth_hint - where we think the dir auth is - * - * depth() gives us the number of depth units/indices for which we have - * information. this INCLUDES those for which we have errors but no data. - * - * see MDCache::handle_discover, handle_discover_reply. - * - * - * so basically, we get - * - * dir den ino i - * x 0 - * x x x 1 - * or - * x x 0 - * x x x 1 - * or - * x x x 0 - * x x x 1 - * ...and trail off however we want. - * - * - */ - -class MDiscoverReply : public Message { - // info about original request - inodeno_t base_ino; - frag_t base_dir_frag; - bool wanted_base_dir; - bool wanted_xlocked; - inodeno_t wanted_ino; - - // and the response - bool flag_error_dn; - bool flag_error_ino; - bool flag_error_dir; - bool no_base_dir; // no base dir (but IS dentry+inode) - bool no_base_dentry; // no base dentry (but IS inode) - string error_dentry; // dentry that was not found (to trigger waiters on asker) - - int dir_auth_hint; - - vector dirs; // not inode-aligned if no_base_dir = true. 
- vector dentries; // not inode-aligned if no_base_dentry = true - vector inodes; - - - public: - // accessors - inodeno_t get_base_ino() { return base_ino; } - frag_t get_base_dir_frag() { return base_dir_frag; } - bool get_wanted_base_dir() { return wanted_base_dir; } - bool get_wanted_xlocked() { return wanted_xlocked; } - inodeno_t get_wanted_ino() { return wanted_ino; } - - int get_num_inodes() { return inodes.size(); } - int get_num_dentries() { return dentries.size(); } - int get_num_dirs() { return dirs.size(); } - - int get_last_inode() { return inodes.size(); } - int get_last_dentry() { return dentries.size() + no_base_dentry; } - int get_last_dir() { return dirs.size() + no_base_dir; } - - int get_depth() { // return depth of deepest object (in dir/dentry/inode units) - return max( inodes.size(), // at least this many - max( no_base_dentry + dentries.size() + flag_error_dn, // inode start + path + possible error - dirs.size() + no_base_dir )); // dn/inode + dirs - } - - bool has_base_dir() { return !no_base_dir && dirs.size(); } - bool has_base_dentry() { return !no_base_dentry && dentries.size(); } - bool has_base_inode() { return no_base_dir && no_base_dentry; } - - bool is_flag_error_dn() { return flag_error_dn; } - bool is_flag_error_ino() { return flag_error_ino; } - bool is_flag_error_dir() { return flag_error_dir; } - string& get_error_dentry() { return error_dentry; } - - int get_dir_auth_hint() { return dir_auth_hint; } - - - // these index _arguments_ are aligned to each ([[dir, ] dentry, ] inode) set. 
- CInodeDiscover& get_inode(int n) { return *(inodes[n]); } - CDentryDiscover& get_dentry(int n) { return *(dentries[n - no_base_dentry]); } - CDirDiscover& get_dir(int n) { return *(dirs[n - no_base_dir]); } - inodeno_t get_ino(int n) { return inodes[n]->get_ino(); } - - // cons - MDiscoverReply() {} - MDiscoverReply(MDiscover *dis) : - Message(MSG_MDS_DISCOVERREPLY), - base_ino(dis->get_base_ino()), - base_dir_frag(dis->get_base_dir_frag()), - wanted_base_dir(dis->wants_base_dir()), - wanted_xlocked(dis->wants_xlocked()), - wanted_ino(dis->get_want_ino()), - flag_error_dn(false), - flag_error_ino(false), - flag_error_dir(false), - no_base_dir(false), no_base_dentry(false), - dir_auth_hint(CDIR_AUTH_UNKNOWN) { - } - MDiscoverReply(dirfrag_t df) : - Message(MSG_MDS_DISCOVERREPLY), - base_ino(df.ino), - base_dir_frag(df.frag), - wanted_base_dir(false), - wanted_xlocked(false), - wanted_ino(inodeno_t()), - flag_error_dn(false), - flag_error_ino(false), - flag_error_dir(false), - no_base_dir(false), no_base_dentry(false), - dir_auth_hint(CDIR_AUTH_UNKNOWN) { - } - ~MDiscoverReply() { - for (vector::iterator it = dirs.begin(); - it != dirs.end(); - it++) - delete *it; - for (vector::iterator it = dentries.begin(); - it != dentries.end(); - it++) - delete *it; - for (vector::iterator it = inodes.begin(); - it != inodes.end(); - it++) - delete *it; - } - virtual char *get_type_name() { return "DisR"; } - - // builders - bool is_empty() { - return dirs.empty() && dentries.empty() && inodes.empty() && - !flag_error_dn && - !flag_error_ino && - !flag_error_dir && - dir_auth_hint == CDIR_AUTH_UNKNOWN; - } - void add_dentry(CDentryDiscover* ddis) { - if (dentries.empty() && dirs.empty()) no_base_dir = true; - dentries.push_back(ddis); - } - - void add_inode(CInodeDiscover* din) { - if (inodes.empty() && dentries.empty()) no_base_dir = no_base_dentry = true; - inodes.push_back( din ); - } - - void add_dir(CDirDiscover* dir) { - dirs.push_back( dir ); - } - - - // void 
set_flag_forward() { flag_forward = true; } - void set_flag_error_dn(const string& dn) { - flag_error_dn = true; - error_dentry = dn; - } - void set_flag_error_ino() { - flag_error_ino = true; - } - void set_flag_error_dir() { - flag_error_dir = true; - } - void set_dir_auth_hint(int a) { - dir_auth_hint = a; - } - void set_error_dentry(const string& dn) { - error_dentry = dn; - } - - - // ... - virtual void decode_payload() { - int off = 0; - ::_decode(base_ino, payload, off); - ::_decode(base_dir_frag, payload, off); - ::_decode(wanted_base_dir, payload, off); - ::_decode(wanted_xlocked, payload, off); - ::_decode(flag_error_dn, payload, off); - ::_decode(flag_error_ino, payload, off); - ::_decode(flag_error_dir, payload, off); - ::_decode(no_base_dir, payload, off); - ::_decode(no_base_dentry, payload, off); - ::_decode(error_dentry, payload, off); - ::_decode(dir_auth_hint, payload, off); - - // dirs - int n; - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - } - - // inodes - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - } - - // dentries - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - } - } - void encode_payload() { - ::_encode(base_ino, payload); - ::_encode(base_dir_frag, payload); - ::_encode(wanted_base_dir, payload); - ::_encode(wanted_xlocked, payload); - ::_encode(flag_error_dn, payload); - ::_encode(flag_error_ino, payload); - ::_encode(flag_error_dir, payload); - ::_encode(no_base_dir, payload); - ::_encode(no_base_dentry, payload); - ::_encode(error_dentry, payload); - ::_encode(dir_auth_hint, payload); - - // dirs - int n = dirs.size(); - payload.append((char*)&n, sizeof(int)); - for (vector::iterator it = dirs.begin(); - it != dirs.end(); - it++) - (*it)->_encode( payload ); - - // inodes - n = inodes.size(); - payload.append((char*)&n, sizeof(int)); - for 
(vector::iterator it = inodes.begin(); - it != inodes.end(); - it++) - (*it)->_encode( payload ); - - // dentries - n = dentries.size(); - payload.append((char*)&n, sizeof(int)); - for (vector::iterator it = dentries.begin(); - it != dentries.end(); - it++) - (*it)->_encode( payload ); - } - -}; - -#endif diff --git a/branches/sage/mds/messages/MExportDir.h b/branches/sage/mds/messages/MExportDir.h deleted file mode 100644 index 9964a7059c1d2..0000000000000 --- a/branches/sage/mds/messages/MExportDir.h +++ /dev/null @@ -1,65 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MEXPORTDIR_H -#define __MEXPORTDIR_H - -#include "msg/Message.h" - - -class MExportDir : public Message { - dirfrag_t dirfrag; - - bufferlist dirstate; - list bounds; - - public: - MExportDir() {} - MExportDir(dirfrag_t df) : - Message(MSG_MDS_EXPORTDIR), - dirfrag(df) { - } - virtual char *get_type_name() { return "Ex"; } - void print(ostream& o) { - o << "export(" << dirfrag << ")"; - } - - dirfrag_t get_dirfrag() { return dirfrag; } - bufferlist& get_dirstate() { return dirstate; } - list& get_bounds() { return bounds; } - - void take_dirstate(bufferlist& bl) { - dirstate.claim(bl); - } - void add_export(dirfrag_t df) { - bounds.push_back(df); - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - ::_decode(bounds, payload, off); - ::_decode(dirstate, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - ::_encode(bounds, payload); - ::_encode(dirstate, payload); - } - -}; - 
-#endif diff --git a/branches/sage/mds/messages/MExportDirAck.h b/branches/sage/mds/messages/MExportDirAck.h deleted file mode 100644 index 1b9d683b4e36f..0000000000000 --- a/branches/sage/mds/messages/MExportDirAck.h +++ /dev/null @@ -1,46 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MEXPORTDIRACK_H -#define __MEXPORTDIRACK_H - -#include "MExportDir.h" - -class MExportDirAck : public Message { - dirfrag_t dirfrag; - - public: - dirfrag_t get_dirfrag() { return dirfrag; } - - MExportDirAck() {} - MExportDirAck(dirfrag_t i) : - Message(MSG_MDS_EXPORTDIRACK), dirfrag(i) { } - - virtual char *get_type_name() { return "ExAck"; } - void print(ostream& o) { - o << "export_ack(" << dirfrag << ")"; - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - } - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - } - -}; - -#endif diff --git a/branches/sage/mds/messages/MExportDirCancel.h b/branches/sage/mds/messages/MExportDirCancel.h deleted file mode 100644 index f13ee1a44fa21..0000000000000 --- a/branches/sage/mds/messages/MExportDirCancel.h +++ /dev/null @@ -1,49 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MEXPORTDIRCANCEL_H -#define __MEXPORTDIRCANCEL_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirCancel : public Message { - dirfrag_t dirfrag; - - public: - dirfrag_t get_dirfrag() { return dirfrag; } - - MExportDirCancel() {} - MExportDirCancel(dirfrag_t df) : - Message(MSG_MDS_EXPORTDIRCANCEL), - dirfrag(df) { } - - virtual char *get_type_name() { return "ExCancel"; } - void print(ostream& o) { - o << "export_cancel(" << dirfrag << ")"; - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - } - - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MExportDirDiscoverAck.h b/branches/sage/mds/messages/MExportDirDiscoverAck.h deleted file mode 100644 index 5e1924bc57e38..0000000000000 --- a/branches/sage/mds/messages/MExportDirDiscoverAck.h +++ /dev/null @@ -1,60 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MEXPORTDIRDISCOVERACK_H -#define __MEXPORTDIRDISCOVERACK_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirDiscoverAck : public Message { - dirfrag_t dirfrag; - bool success; - - public: - inodeno_t get_ino() { return dirfrag.ino; } - dirfrag_t get_dirfrag() { return dirfrag; } - bool is_success() { return success; } - - MExportDirDiscoverAck() {} - MExportDirDiscoverAck(dirfrag_t df, bool s=true) : - Message(MSG_MDS_EXPORTDIRDISCOVERACK), - dirfrag(df), - success(s) { } - - virtual char *get_type_name() { return "ExDisA"; } - void print(ostream& o) { - o << "export_discover_ack(" << dirfrag; - if (success) - o << " success)"; - else - o << " failure)"; - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - payload.copy(off, sizeof(success), (char*)&success); - off += sizeof(success); - } - - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - payload.append((char*)&success, sizeof(success)); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MExportDirFinish.h b/branches/sage/mds/messages/MExportDirFinish.h deleted file mode 100644 index 03f5e1fcc9ef3..0000000000000 --- a/branches/sage/mds/messages/MExportDirFinish.h +++ /dev/null @@ -1,46 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MEXPORTDIRFINISH_H -#define __MEXPORTDIRFINISH_H - -#include "msg/Message.h" - -class MExportDirFinish : public Message { - dirfrag_t dirfrag; - - public: - dirfrag_t get_dirfrag() { return dirfrag; } - - MExportDirFinish() {} - MExportDirFinish(dirfrag_t dirfrag) : - Message(MSG_MDS_EXPORTDIRFINISH) { - this->dirfrag = dirfrag; - } - virtual char *get_type_name() { return "ExFin"; } - void print(ostream& o) { - o << "export_finish(" << dirfrag << ")"; - } - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - } - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - } - -}; - -#endif diff --git a/branches/sage/mds/messages/MExportDirNotify.h b/branches/sage/mds/messages/MExportDirNotify.h deleted file mode 100644 index c7a79a64f9317..0000000000000 --- a/branches/sage/mds/messages/MExportDirNotify.h +++ /dev/null @@ -1,85 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MEXPORTDIRNOTIFY_H -#define __MEXPORTDIRNOTIFY_H - -#include "msg/Message.h" -#include -using namespace std; - -class MExportDirNotify : public Message { - dirfrag_t base; - bool ack; - pair old_auth, new_auth; - list bounds; // bounds; these dirs are _not_ included (tho the dirfragdes are) - - public: - dirfrag_t get_dirfrag() { return base; } - pair get_old_auth() { return old_auth; } - pair get_new_auth() { return new_auth; } - bool wants_ack() { return ack; } - list& get_bounds() { return bounds; } - - MExportDirNotify() {} - MExportDirNotify(dirfrag_t i, bool a, pair oa, pair na) : - Message(MSG_MDS_EXPORTDIRNOTIFY), - base(i), ack(a), old_auth(oa), new_auth(na) { } - - virtual char *get_type_name() { return "ExNot"; } - void print(ostream& o) { - o << "export_notify(" << base; - o << " " << old_auth << " -> " << new_auth; - if (ack) - o << " ack)"; - else - o << " no ack)"; - } - - void copy_bounds(list& ex) { - this->bounds = ex; - } - void copy_bounds(set& ex) { - for (set::iterator i = ex.begin(); - i != ex.end(); ++i) - bounds.push_back(*i); - } - void copy_bounds(set& ex) { - for (set::iterator i = ex.begin(); - i != ex.end(); ++i) - bounds.push_back((*i)->dirfrag()); - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(base), (char*)&base); - off += sizeof(base); - payload.copy(off, sizeof(ack), (char*)&ack); - off += sizeof(ack); - payload.copy(off, sizeof(old_auth), (char*)&old_auth); - off += sizeof(old_auth); - payload.copy(off, sizeof(new_auth), (char*)&new_auth); - off += sizeof(new_auth); - ::_decode(bounds, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&base, sizeof(base)); - payload.append((char*)&ack, sizeof(ack)); - payload.append((char*)&old_auth, sizeof(old_auth)); - payload.append((char*)&new_auth, sizeof(new_auth)); - ::_encode(bounds, payload); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MExportDirNotifyAck.h 
b/branches/sage/mds/messages/MExportDirNotifyAck.h deleted file mode 100644 index 6a41aee83b5f3..0000000000000 --- a/branches/sage/mds/messages/MExportDirNotifyAck.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MEXPORTDIRNOTIFYACK_H -#define __MEXPORTDIRNOTIFYACK_H - -#include "msg/Message.h" -#include -using namespace std; - -class MExportDirNotifyAck : public Message { - dirfrag_t dirfrag; - - public: - dirfrag_t get_dirfrag() { return dirfrag; } - - MExportDirNotifyAck() {} - MExportDirNotifyAck(dirfrag_t dirfrag) : - Message(MSG_MDS_EXPORTDIRNOTIFYACK) { - this->dirfrag = dirfrag; - } - virtual char *get_type_name() { return "ExNotA"; } - void print(ostream& o) { - o << "export_notify_ack(" << dirfrag << ")"; - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - } - - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - } - -}; - -#endif diff --git a/branches/sage/mds/messages/MExportDirPrep.h b/branches/sage/mds/messages/MExportDirPrep.h deleted file mode 100644 index 5789e301e8b11..0000000000000 --- a/branches/sage/mds/messages/MExportDirPrep.h +++ /dev/null @@ -1,205 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free 
Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MEXPORTDIRPREP_H -#define __MEXPORTDIRPREP_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirPrep : public Message { - dirfrag_t dirfrag; - - /* nested export discover payload. - not all inodes will have dirs; they may require a separate discover. - dentries are the links to each inode. - dirs map includes base dir (ino) - */ - list bounds; - - list inodes; - list dentries; - map inode_dirfrag; - map inode_dentry; - - map > frags_by_ino; - map dirfrags; - - set bystanders; - - bool b_did_assim; - - public: - dirfrag_t get_dirfrag() { return dirfrag; } - list& get_bounds() { return bounds; } - list& get_inodes() { return inodes; } - list& get_dentries() { return dentries; } - list& get_inode_dirfrags(inodeno_t ino) { - return frags_by_ino[ino]; - } - dirfrag_t get_containing_dirfrag(inodeno_t ino) { - return inode_dirfrag[ino]; - } - string& get_dentry(inodeno_t ino) { - return inode_dentry[ino]; - } - bool have_dirfrag(dirfrag_t df) { - return dirfrags.count(df); - } - CDirDiscover* get_dirfrag_discover(dirfrag_t df) { - return dirfrags[df]; - } - set &get_bystanders() { return bystanders; } - - bool did_assim() { return b_did_assim; } - void mark_assim() { b_did_assim = true; } - - MExportDirPrep() { - b_did_assim = false; - } - MExportDirPrep(dirfrag_t df) : - Message(MSG_MDS_EXPORTDIRPREP), - dirfrag(df), - b_did_assim(false) { } - ~MExportDirPrep() { - for (list::iterator iit = inodes.begin(); - iit != inodes.end(); - iit++) - delete *iit; - for (list::iterator p = dentries.begin(); - p != dentries.end(); - p++) - delete *p; - for (map::iterator dit = dirfrags.begin(); - dit != dirfrags.end(); - dit++) - delete dit->second; - } - - - virtual char *get_type_name() { return "ExP"; } - void print(ostream& o) { - o << "export_prep(" << dirfrag << ")"; - } - - void add_export(dirfrag_t df) { - bounds.push_back( df ); - } - void add_inode(dirfrag_t 
df, const string& name, CDentryDiscover *dn, CInodeDiscover *in) { - inodes.push_back(in); - dentries.push_back(dn); - inode_dirfrag[in->get_ino()] = df; - inode_dentry[in->get_ino()] = name; - } - void add_dirfrag(CDirDiscover *dir) { - dirfrags[dir->get_dirfrag()] = dir; - frags_by_ino[dir->get_dirfrag().ino].push_back(dir->get_dirfrag().frag); - } - void add_bystander(int who) { - bystanders.insert(who); - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - - ::_decode(bounds, payload, off); - - // inodes - int ni; - payload.copy(off, sizeof(int), (char*)&ni); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - inodes.push_back(in); - - // dentry - CDentryDiscover *dn = new CDentryDiscover; - dn->_decode(payload, off); - dentries.push_back(dn); - - // dentry - string d; - _decode(d, payload, off); - inode_dentry[in->get_ino()] = d; - - // dir ino - dirfrag_t df; - payload.copy(off, sizeof(df), (char*)&df); - off += sizeof(df); - inode_dirfrag[in->get_ino()] = df; - - // child frags - ::_decode(frags_by_ino[in->get_ino()], payload, off); - } - - // dirs - int nd; - payload.copy(off, sizeof(int), (char*)&nd); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - dirfrags[dir->get_dirfrag()] = dir; - } - - ::_decode(bystanders, payload, off); - } - - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - - ::_encode(bounds, payload); - - // inodes - int ni = inodes.size(); - payload.append((char*)&ni, sizeof(int)); - list::iterator dit = dentries.begin(); - list::iterator iit = inodes.begin(); - while (iit != inodes.end()) { - (*iit)->_encode(payload); - (*dit)->_encode(payload); - - // dentry name - _encode(inode_dentry[(*iit)->get_ino()], payload); - - // dir ino - dirfrag_t df = inode_dirfrag[(*iit)->get_ino()]; - payload.append((char*)&df, sizeof(df)); - - // child frags - ::_encode(frags_by_ino[(*iit)->get_ino()], payload); - 
- iit++; - dit++; - } - - // dirs - int nd = dirfrags.size(); - payload.append((char*)&nd, sizeof(int)); - for (map::iterator dit = dirfrags.begin(); - dit != dirfrags.end(); - dit++) - dit->second->_encode(payload); - - ::_encode(bystanders, payload); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MExportDirPrepAck.h b/branches/sage/mds/messages/MExportDirPrepAck.h deleted file mode 100644 index 355541e9f1b5c..0000000000000 --- a/branches/sage/mds/messages/MExportDirPrepAck.h +++ /dev/null @@ -1,47 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MEXPORTDIRPREPACK_H -#define __MEXPORTDIRPREPACK_H - -#include "msg/Message.h" -#include "include/types.h" - -class MExportDirPrepAck : public Message { - dirfrag_t dirfrag; - - public: - dirfrag_t get_dirfrag() { return dirfrag; } - - MExportDirPrepAck() {} - MExportDirPrepAck(dirfrag_t df) : - Message(MSG_MDS_EXPORTDIRPREPACK), - dirfrag(df) { } - - virtual char *get_type_name() { return "ExPAck"; } - void print(ostream& o) { - o << "export_prep_ack(" << dirfrag << ")"; - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - } - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MExportDirWarning.h b/branches/sage/mds/messages/MExportDirWarning.h deleted file mode 100644 index b59e2eb12251c..0000000000000 --- a/branches/sage/mds/messages/MExportDirWarning.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
-// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MEXPORTDIRWARNING_H -#define __MEXPORTDIRWARNING_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirWarning : public Message { - inodeno_t ino; - int new_dir_auth; - - public: - inodeno_t get_ino() { return ino; } - int get_new_dir_auth() { return new_dir_auth; } - - MExportDirWarning() {} - MExportDirWarning(inodeno_t i, int nda) : - Message(MSG_MDS_EXPORTDIRWARNING), - ino(i), new_dir_auth(nda) {} - - virtual char *get_type_name() { return "ExW"; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - payload.copy(off, sizeof(new_dir_auth), (char*)&new_dir_auth); - off += sizeof(new_dir_auth); - } - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - payload.append((char*)&new_dir_auth, sizeof(new_dir_auth)); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MExportDirWarningAck.h b/branches/sage/mds/messages/MExportDirWarningAck.h deleted file mode 100644 index 7ee3078e61973..0000000000000 --- a/branches/sage/mds/messages/MExportDirWarningAck.h +++ /dev/null @@ -1,45 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MEXPORTDIRWARNINGACK_H -#define __MEXPORTDIRWARNINGACK_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirWarningAck : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MExportDirWarningAck() {} - MExportDirWarningAck(inodeno_t i) : - Message(MSG_MDS_EXPORTDIRWARNINGACK), - ino(i) {} - - virtual char *get_type_name() { return "ExWAck"; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - } - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MGenericMessage.h b/branches/sage/mds/messages/MGenericMessage.h deleted file mode 100644 index fee4e014edaf8..0000000000000 --- a/branches/sage/mds/messages/MGenericMessage.h +++ /dev/null @@ -1,45 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MGENERICMESSAGE_H -#define __MGENERICMESSAGE_H - -#include "msg/Message.h" - -class MGenericMessage : public Message { - char tname[20]; - //long pcid; - - public: - MGenericMessage(int t) : Message(t) { - sprintf(tname, "generic%d", get_type()); - } - - //void set_pcid(long pcid) { this->pcid = pcid; } - //long get_pcid() { return pcid; } - - char *get_type_name() { return tname; } - - virtual void decode_payload() { - //int off = 0; - //payload.copy(off, sizeof(pcid), (char*)&pcid); - //off += sizeof(pcid); - } - virtual void encode_payload() { - //payload.append((char*)&pcid, sizeof(pcid)); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MHeartbeat.h b/branches/sage/mds/messages/MHeartbeat.h deleted file mode 100644 index 964f2a3bd49f2..0000000000000 --- a/branches/sage/mds/messages/MHeartbeat.h +++ /dev/null @@ -1,60 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MHEARTBEAT_H -#define __MHEARTBEAT_H - -#include "include/types.h" -#include "msg/Message.h" - -class MHeartbeat : public Message { - mds_load_t load; - int beat; - map import_map; - - public: - mds_load_t& get_load() { return load; } - int get_beat() { return beat; } - - map& get_import_map() { - return import_map; - } - - MHeartbeat() {} - MHeartbeat(mds_load_t& load, int beat) : - Message(MSG_MDS_HEARTBEAT) { - this->load = load; - this->beat = beat; - } - - virtual char *get_type_name() { return "HB"; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off,sizeof(load), (char*)&load); - off += sizeof(load); - payload.copy(off, sizeof(beat), (char*)&beat); - off += sizeof(beat); - ::_decode(import_map, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&load, sizeof(load)); - payload.append((char*)&beat, sizeof(beat)); - ::_encode(import_map, payload); - } - -}; - -#endif diff --git a/branches/sage/mds/messages/MInodeFileCaps.h b/branches/sage/mds/messages/MInodeFileCaps.h deleted file mode 100644 index 05ade1094c9c8..0000000000000 --- a/branches/sage/mds/messages/MInodeFileCaps.h +++ /dev/null @@ -1,57 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MINODEFILECAPS_H -#define __MINODEFILECAPS_H - -class MInodeFileCaps : public Message { - inodeno_t ino; - int from; - int caps; - - public: - inodeno_t get_ino() { return ino; } - int get_from() { return from; } - int get_caps() { return caps; } - - MInodeFileCaps() {} - // from auth - MInodeFileCaps(inodeno_t ino, int from, int caps) : - Message(MSG_MDS_INODEFILECAPS) { - - this->ino = ino; - this->from = from; - this->caps = caps; - } - - virtual char *get_type_name() { return "Icap";} - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(from), (char*)&from); - off += sizeof(from); - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - payload.copy(off, sizeof(caps), (char*)&caps); - off += sizeof(caps); - } - virtual void encode_payload() { - payload.append((char*)&from, sizeof(from)); - payload.append((char*)&ino, sizeof(ino)); - payload.append((char*)&caps, sizeof(caps)); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MLock.h b/branches/sage/mds/messages/MLock.h deleted file mode 100644 index 95c3e5f325212..0000000000000 --- a/branches/sage/mds/messages/MLock.h +++ /dev/null @@ -1,126 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MLOCK_H -#define __MLOCK_H - -#include "msg/Message.h" -#include "mds/SimpleLock.h" - -// for replicas -#define LOCK_AC_SYNC -1 -#define LOCK_AC_MIXED -2 -#define LOCK_AC_LOCK -3 - -#define LOCK_AC_SCATTER -6 - -// for auth -#define LOCK_AC_SYNCACK 1 -#define LOCK_AC_MIXEDACK 2 -#define LOCK_AC_LOCKACK 3 - -#define LOCK_AC_REQSCATTER 7 -#define LOCK_AC_REQUNSCATTER 8 - -#define LOCK_AC_FOR_REPLICA(a) ((a) < 0) -#define LOCK_AC_FOR_AUTH(a) ((a) > 0) - - -static const char *get_lock_action_name(int a) { - switch (a) { - case LOCK_AC_SYNC: return "sync"; - case LOCK_AC_MIXED: return "mixed"; - case LOCK_AC_LOCK: return "lock"; - case LOCK_AC_SCATTER: return "scatter"; - case LOCK_AC_SYNCACK: return "syncack"; - case LOCK_AC_MIXEDACK: return "mixedack"; - case LOCK_AC_LOCKACK: return "lockack"; - case LOCK_AC_REQSCATTER: return "reqscatter"; - case LOCK_AC_REQUNSCATTER: return "requnscatter"; - default: assert(0); return 0; - } -} - - -class MLock : public Message { - int32_t action; // action type - int32_t asker; // who is initiating this request - metareqid_t reqid; // for remote lock requests - - char lock_type; // lock object type - MDSCacheObjectInfo object_info; - - bufferlist data; // and possibly some data - - public: - bufferlist& get_data() { return data; } - int get_asker() { return asker; } - int get_action() { return action; } - metareqid_t get_reqid() { return reqid; } - - int get_lock_type() { return lock_type; } - MDSCacheObjectInfo &get_object_info() { return object_info; } - - MLock() {} - MLock(int ac, int as) : - Message(MSG_MDS_LOCK), - action(ac), asker(as), - lock_type(0) { } - MLock(SimpleLock *lock, int ac, int as) : - Message(MSG_MDS_LOCK), - action(ac), asker(as), - lock_type(lock->get_type()) { - lock->get_parent()->set_object_info(object_info); - } - MLock(SimpleLock *lock, int ac, int as, bufferlist& bl) : - Message(MSG_MDS_LOCK), - action(ac), asker(as), lock_type(lock->get_type()) { - 
lock->get_parent()->set_object_info(object_info); - data.claim(bl); - } - virtual char *get_type_name() { return "ILock"; } - void print(ostream& out) { - out << "lock(a=" << get_lock_action_name(action) - << " " << get_lock_type_name(lock_type) - << " " << object_info - << ")"; - } - - void set_reqid(metareqid_t ri) { reqid = ri; } - void set_data(const bufferlist& data) { - this->data = data; - } - - void decode_payload() { - int off = 0; - ::_decode(asker, payload, off); - ::_decode(action, payload, off); - ::_decode(reqid, payload, off); - ::_decode(lock_type, payload, off); - object_info._decode(payload, off); - ::_decode(data, payload, off); - } - virtual void encode_payload() { - ::_encode(asker, payload); - ::_encode(action, payload); - ::_encode(reqid, payload); - ::_encode(lock_type, payload); - object_info._encode(payload); - ::_encode(data, payload); - } - -}; - -#endif diff --git a/branches/sage/mds/messages/MMDSBeacon.h b/branches/sage/mds/messages/MMDSBeacon.h deleted file mode 100644 index c18a05e77f1a8..0000000000000 --- a/branches/sage/mds/messages/MMDSBeacon.h +++ /dev/null @@ -1,63 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMDSBEACON_H -#define __MMDSBEACON_H - -#include "msg/Message.h" - -#include "include/types.h" - -#include "mds/MDSMap.h" - -class MMDSBeacon : public Message { - entity_inst_t inst; - epoch_t last_epoch_seen; // include last mdsmap epoch mds has seen to avoid race with monitor decree - int state; - version_t seq; - - public: - MMDSBeacon() : Message(MSG_MDS_BEACON) {} - MMDSBeacon(entity_inst_t i, epoch_t les, int st, version_t se) : - Message(MSG_MDS_BEACON), - inst(i), last_epoch_seen(les), state(st), seq(se) { } - - entity_inst_t& get_mds_inst() { return inst; } - epoch_t get_last_epoch_seen() { return last_epoch_seen; } - int get_state() { return state; } - version_t get_seq() { return seq; } - char *get_type_name() { return "mdsbeacon"; } - - void print(ostream& out) { - out << "mdsbeacon(" << inst - << " " << MDSMap::get_state_name(state) - << " seq " << seq << ")"; - } - - void encode_payload() { - ::_encode(inst, payload); - ::_encode(last_epoch_seen, payload); - ::_encode(state, payload); - ::_encode(seq, payload); - } - void decode_payload() { - int off = 0; - ::_decode(inst, payload, off); - ::_decode(last_epoch_seen, payload, off); - ::_decode(state, payload, off); - ::_decode(seq, payload, off); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MMDSBoot.h b/branches/sage/mds/messages/MMDSBoot.h deleted file mode 100644 index 8529578e29d56..0000000000000 --- a/branches/sage/mds/messages/MMDSBoot.h +++ /dev/null @@ -1,39 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMDSBOOT_H -#define __MMDSBOOT_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMDSBoot : public Message { - public: - MMDSBoot() : Message(MSG_MDS_BOOT) { - } - - char *get_type_name() { return "mdsboot"; } - - void encode_payload() { - //payload.append((char*)&sb, sizeof(sb)); - } - void decode_payload() { - //int off = 0; - //payload.copy(off, sizeof(sb), (char*)&sb); - //off += sizeof(sb); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MMDSCacheRejoin.h b/branches/sage/mds/messages/MMDSCacheRejoin.h deleted file mode 100644 index 844ece02000ae..0000000000000 --- a/branches/sage/mds/messages/MMDSCacheRejoin.h +++ /dev/null @@ -1,230 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MMDSCACHEREJOIN_H -#define __MMDSCACHEREJOIN_H - -#include "msg/Message.h" - -#include "include/types.h" -#include "include/encodable.h" - -// sent from replica to auth - -class MMDSCacheRejoin : public Message { - public: - static const int OP_WEAK = 1; // replica -> auth, i exist, + maybe open files. - static const int OP_STRONG = 2; // replica -> auth, i exist, + open files and lock state. - static const int OP_ACK = 3; // auth -> replica, here is your lock state. - //static const int OP_PURGE = 4; // auth -> replica, remove these items, they are old/obsolete. - static const int OP_MISSING = 5; // auth -> replica, i am missing these items - static const int OP_FULL = 6; // replica -> auth, here is the full object. 
- static const char *get_opname(int op) { - switch (op) { - case OP_WEAK: return "weak"; - case OP_STRONG: return "strong"; - case OP_ACK: return "ack"; - case OP_MISSING: return "missing"; - case OP_FULL: return "full"; - default: assert(0); return 0; - } - } - - // -- types -- - struct inode_strong { - int32_t caps_wanted; - int32_t nonce; - int32_t authlock; - int32_t linklock; - int32_t dirfragtreelock; - int32_t filelock; - __int32_t dirlock; - inode_strong() {} - inode_strong(int n, int cw=0, int a=0, int l=0, int dft=0, int f=0, int dl=0) : - caps_wanted(cw), - nonce(n), - authlock(a), linklock(l), dirfragtreelock(dft), filelock(f), dirlock(dl) { } - }; - struct inode_full { - inode_t inode; - string symlink; - fragtree_t dirfragtree; - inode_full() {} - inode_full(const inode_t& i, const string& s, const fragtree_t& f) : - inode(i), symlink(s), dirfragtree(f) {} - - void _decode(bufferlist::iterator& p) { - ::_decode_simple(inode, p); - ::_decode_simple(symlink, p); - dirfragtree._decode(p); - } - void _encode(bufferlist& bl) const { - ::_encode(inode, bl); - ::_encode(symlink, bl); - dirfragtree._encode(bl); - } - }; - - struct dirfrag_strong { - int32_t nonce; - int8_t dir_rep; - dirfrag_strong() {} - dirfrag_strong(int n, int dr) : nonce(n), dir_rep(dr) {} - }; - struct dn_strong { - inodeno_t ino; - inodeno_t remote_ino; - unsigned char remote_d_type; - int32_t nonce; - int32_t lock; - dn_strong() : - ino(0), remote_ino(0), remote_d_type(0), nonce(0), lock(0) {} - dn_strong(inodeno_t pi, inodeno_t ri, unsigned char rdt, int n, int l) : - ino(pi), remote_ino(ri), remote_d_type(rdt), nonce(n), lock(l) {} - bool is_primary() { return ino > 0; } - bool is_remote() { return remote_ino > 0; } - bool is_null() { return ino == 0 && remote_ino == 0; } - }; - - struct dn_weak { - inodeno_t ino; - dn_weak() : ino(0) {} - dn_weak(inodeno_t pi) : ino(pi) {} - }; - - // -- data -- - int32_t op; - - // weak - map > weak; - set weak_inodes; - - // strong - map 
strong_dirfrags; - map > strong_dentries; - map strong_inodes; - - // open - bufferlist cap_export_bl; - map > cap_exports; - map cap_export_paths; - - // full - list full_inodes; - - // authpins, xlocks - map authpinned_inodes; - map > xlocked_inodes; - map > authpinned_dentries; - map > xlocked_dentries; - - MMDSCacheRejoin() : Message(MSG_MDS_CACHEREJOIN) {} - MMDSCacheRejoin(int o) : - Message(MSG_MDS_CACHEREJOIN), - op(o) {} - - char *get_type_name() { return "cache_rejoin"; } - void print(ostream& out) { - out << "cache_rejoin " << get_opname(op); - } - - // -- builders -- - // inodes - void add_weak_inode(inodeno_t i) { - weak_inodes.insert(i); - } - void add_strong_inode(inodeno_t i, int n, int cw, int a, int l, int dft, int f, int dl) { - strong_inodes[i] = inode_strong(n, cw, a, l, dft, f, dl); - } - void add_full_inode(inode_t &i, const string& s, const fragtree_t &f) { - full_inodes.push_back(inode_full(i, s, f)); - } - void add_inode_authpin(inodeno_t ino, const metareqid_t& ri) { - authpinned_inodes[ino] = ri; - } - void add_inode_xlock(inodeno_t ino, int lt, const metareqid_t& ri) { - xlocked_inodes[ino][lt] = ri; - } - - void copy_cap_exports(bufferlist &bl) { - cap_export_bl = bl; - } - - // dirfrags - void add_weak_dirfrag(dirfrag_t df) { - weak[df]; - } - void add_weak_dirfrag(dirfrag_t df, map& dnmap) { - weak[df] = dnmap; - } - void add_strong_dirfrag(dirfrag_t df, int n, int dr) { - strong_dirfrags[df] = dirfrag_strong(n, dr); - } - - // dentries - void add_weak_dentry(dirfrag_t df, const string& dname, dn_weak& dnw) { - weak[df][dname] = dnw; - } - void add_weak_primary_dentry(dirfrag_t df, const string& dname, inodeno_t ino) { - weak[df][dname] = dn_weak(ino); - } - void add_strong_dentry(dirfrag_t df, const string& dname, inodeno_t pi, inodeno_t ri, unsigned char rdt, int n, int ls) { - strong_dentries[df][dname] = dn_strong(pi, ri, rdt, n, ls); - } - void add_dentry_authpin(dirfrag_t df, const string& dname, const metareqid_t& ri) { - 
authpinned_dentries[df][dname] = ri; - } - void add_dentry_xlock(dirfrag_t df, const string& dname, const metareqid_t& ri) { - xlocked_dentries[df][dname] = ri; - } - - // -- encoding -- - void encode_payload() { - ::_encode(op, payload); - ::_encode(strong_inodes, payload); - ::_encode_complex(full_inodes, payload); - ::_encode(authpinned_inodes, payload); - ::_encode(xlocked_inodes, payload); - ::_encode(cap_export_bl, payload); - ::_encode(strong_dirfrags, payload); - ::_encode(weak, payload); - ::_encode(weak_inodes, payload); - ::_encode(strong_dentries, payload); - ::_encode(authpinned_dentries, payload); - ::_encode(xlocked_dentries, payload); - } - void decode_payload() { - bufferlist::iterator p = payload.begin(); - ::_decode_simple(op, p); - ::_decode_simple(strong_inodes, p); - ::_decode_complex(full_inodes, p); - ::_decode_simple(authpinned_inodes, p); - ::_decode_simple(xlocked_inodes, p); - ::_decode_simple(cap_export_bl, p); - if (cap_export_bl.length()) { - bufferlist::iterator q = cap_export_bl.begin(); - ::_decode_simple(cap_exports, q); - ::_decode_simple(cap_export_paths, q); - } - ::_decode_simple(strong_dirfrags, p); - ::_decode_simple(weak, p); - ::_decode_simple(weak_inodes, p); - ::_decode_simple(strong_dentries, p); - ::_decode_simple(authpinned_dentries, p); - ::_decode_simple(xlocked_dentries, p); - } - -}; - -#endif diff --git a/branches/sage/mds/messages/MMDSFragmentNotify.h b/branches/sage/mds/messages/MMDSFragmentNotify.h deleted file mode 100644 index 232cce92427bb..0000000000000 --- a/branches/sage/mds/messages/MMDSFragmentNotify.h +++ /dev/null @@ -1,60 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - 
* Foundation. See file COPYING. - * - */ - -#ifndef __MMDSFRAGMENTNOTIFY_H -#define __MMDSFRAGMENTNOTIFY_H - -#include "msg/Message.h" -#include -using namespace std; - -class MMDSFragmentNotify : public Message { - inodeno_t ino; - frag_t basefrag; - int8_t bits; - - public: - inodeno_t get_ino() { return ino; } - frag_t get_basefrag() { return basefrag; } - int get_bits() { return bits; } - - bufferlist basebl; - - MMDSFragmentNotify() {} - MMDSFragmentNotify(inodeno_t i, frag_t bf, int b) : - Message(MSG_MDS_FRAGMENTNOTIFY), - ino(i), basefrag(bf), bits(b) { } - - virtual char *get_type_name() { return "fragment_notify"; } - void print(ostream& o) { - o << "fragment_notify(" << ino << "#" << basefrag - << " " << (int)bits << ")"; - } - - virtual void decode_payload() { - int off = 0; - ::_decode(ino, payload, off); - ::_decode(basefrag, payload, off); - ::_decode(bits, payload, off); - ::_decode(basebl, payload, off); - } - virtual void encode_payload() { - ::_encode(ino, payload); - ::_encode(basefrag, payload); - ::_encode(bits, payload); - ::_encode(basebl, payload); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MMDSGetMap.h b/branches/sage/mds/messages/MMDSGetMap.h deleted file mode 100644 index e762760acf224..0000000000000 --- a/branches/sage/mds/messages/MMDSGetMap.h +++ /dev/null @@ -1,40 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMDSGETMAP_H -#define __MMDSGETMAP_H - -#include "msg/Message.h" - -#include "include/types.h" -#include "include/encodable.h" - -class MMDSGetMap : public Message { - public: - epoch_t have; - - MMDSGetMap(epoch_t h=0) : Message(MSG_MDS_GETMAP), have (h) { } - - char *get_type_name() { return "mdsgetmap"; } - - void encode_payload() { - ::_encode_simple(have, payload); - } - void decode_payload() { - bufferlist::iterator p = payload.begin(); - ::_decode_simple(have, p); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MMDSMap.h b/branches/sage/mds/messages/MMDSMap.h deleted file mode 100644 index 164e547cc513a..0000000000000 --- a/branches/sage/mds/messages/MMDSMap.h +++ /dev/null @@ -1,79 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMDSMAP_H -#define __MMDSMAP_H - -#include "msg/Message.h" -#include "mds/MDSMap.h" - -class MMDSMap : public Message { - public: - /* - map maps; - map incremental_maps; - - epoch_t get_first() { - epoch_t e = 0; - map::iterator i = maps.begin(); - if (i != maps.end()) e = i->first; - i = incremental_maps.begin(); - if (i != incremental_maps.end() && - (e == 0 || i->first < e)) e = i->first; - return e; - } - epoch_t get_last() { - epoch_t e = 0; - map::reverse_iterator i = maps.rbegin(); - if (i != maps.rend()) e = i->first; - i = incremental_maps.rbegin(); - if (i != incremental_maps.rend() && - (e == 0 || i->first > e)) e = i->first; - return e; - } - */ - - version_t epoch; - bufferlist encoded; - - version_t get_epoch() const { return epoch; } - bufferlist& get_encoded() { return encoded; } - - MMDSMap() : - Message(MSG_MDS_MAP) {} - MMDSMap(MDSMap *mm) : - Message(MSG_MDS_MAP) { - epoch = mm->get_epoch(); - mm->encode(encoded); - } - - char *get_type_name() { return "mdsmap"; } - void print(ostream& out) { - out << "mdsmap(e " << epoch << ")"; - } - - // marshalling - void decode_payload() { - int off = 0; - ::_decode(epoch, payload, off); - ::_decode(encoded, payload, off); - } - void encode_payload() { - ::_encode(epoch, payload); - ::_encode(encoded, payload); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MMDSResolve.h b/branches/sage/mds/messages/MMDSResolve.h deleted file mode 100644 index 2103a0115081d..0000000000000 --- a/branches/sage/mds/messages/MMDSResolve.h +++ /dev/null @@ -1,66 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMDSRESOLVE_H -#define __MMDSRESOLVE_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMDSResolve : public Message { - public: - map > subtrees; - map > ambiguous_imports; - list slave_requests; - - MMDSResolve() : Message(MSG_MDS_RESOLVE) {} - - char *get_type_name() { return "mds_resolve"; } - - void print(ostream& out) { - out << "mds_resolve(" << subtrees.size() - << "+" << ambiguous_imports.size() - << " subtrees +" << slave_requests.size() << " slave requests)"; - } - - void add_subtree(dirfrag_t im) { - subtrees[im].clear(); - } - void add_subtree_bound(dirfrag_t im, dirfrag_t ex) { - subtrees[im].push_back(ex); - } - - void add_ambiguous_import(dirfrag_t im, const list& m) { - ambiguous_imports[im] = m; - } - - void add_slave_request(metareqid_t reqid) { - slave_requests.push_back(reqid); - } - - void encode_payload() { - ::_encode(subtrees, payload); - ::_encode(ambiguous_imports, payload); - ::_encode(slave_requests, payload); - } - void decode_payload() { - int off = 0; - ::_decode(subtrees, payload, off); - ::_decode(ambiguous_imports, payload, off); - ::_decode(slave_requests, payload, off); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MMDSResolveAck.h b/branches/sage/mds/messages/MMDSResolveAck.h deleted file mode 100644 index 1870e226b4161..0000000000000 --- a/branches/sage/mds/messages/MMDSResolveAck.h +++ /dev/null @@ -1,56 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMDSRESOLVEACK_H -#define __MMDSRESOLVEACK_H - -#include "msg/Message.h" - -#include "include/types.h" - - -class MMDSResolveAck : public Message { - public: - list commit; - list abort; - - MMDSResolveAck() : Message(MSG_MDS_RESOLVEACK) {} - - char *get_type_name() { return "resolve_ack"; } - /*void print(ostream& out) { - out << "resolve_ack.size() - << "+" << ambiguous_imap.size() - << " imports +" << slave_requests.size() << " slave requests)"; - } - */ - - void add_commit(metareqid_t r) { - commit.push_back(r); - } - void add_abort(metareqid_t r) { - abort.push_back(r); - } - - void encode_payload() { - ::_encode(commit, payload); - ::_encode(abort, payload); - } - void decode_payload() { - int off = 0; - ::_decode(commit, payload, off); - ::_decode(abort, payload, off); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MMonCommand.h b/branches/sage/mds/messages/MMonCommand.h deleted file mode 100644 index 19d25dd7a4d77..0000000000000 --- a/branches/sage/mds/messages/MMonCommand.h +++ /dev/null @@ -1,54 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONCOMMAND_H -#define __MMONCOMMAND_H - -#include "msg/Message.h" - -#include -using std::vector; - -class MMonCommand : public Message { - public: - entity_inst_t inst; - vector cmd; - - MMonCommand() : Message(MSG_MON_COMMAND) {} - MMonCommand(entity_inst_t i) : - Message(MSG_MON_COMMAND), - inst(i) { } - - virtual char *get_type_name() { return "mon_command"; } - void print(ostream& o) { - o << "mon_command("; - for (unsigned i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MMONCOMMANDACK_H -#define __MMONCOMMANDACK_H - -#include "msg/Message.h" - -class MMonCommandAck : public Message { - public: - int r; - string rs; - - MMonCommandAck() : Message(MSG_MON_COMMAND_ACK) {} - MMonCommandAck(int _r, string s) : Message(MSG_MON_COMMAND_ACK), - r(_r), rs(s) { } - - virtual char *get_type_name() { return "mon_command"; } - void print(ostream& o) { - o << "mon_command_ack(" << r << " " << rs << ")"; - } - - void encode_payload() { - payload.append((char*)&r, sizeof(r)); - ::_encode(rs, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(r), (char*)&r); - off += sizeof(r); - ::_decode(rs, payload, off); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MMonElection.h b/branches/sage/mds/messages/MMonElection.h deleted file mode 100644 index 14a29af9140f9..0000000000000 --- a/branches/sage/mds/messages/MMonElection.h +++ /dev/null @@ -1,63 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - 
* Foundation. See file COPYING. - * - */ - - -#ifndef __MMONELECTION_H -#define __MMONELECTION_H - -#include "msg/Message.h" - - -class MMonElection : public Message { -public: - static const int OP_PROPOSE = 1; - static const int OP_ACK = 2; - static const int OP_NAK = 3; - static const int OP_VICTORY = 4; - static const char *get_opname(int o) { - switch (o) { - case OP_PROPOSE: return "propose"; - case OP_ACK: return "ack"; - case OP_NAK: return "nak"; - case OP_VICTORY: return "victory"; - default: assert(0); return 0; - } - } - - int32_t op; - epoch_t epoch; - - MMonElection() : Message(MSG_MON_ELECTION) {} - MMonElection(int o, epoch_t e) : - Message(MSG_MON_ELECTION), - op(o), epoch(e) {} - - char *get_type_name() { return "election"; } - void print(ostream& out) { - out << "election(" << get_opname(op) << " " << epoch << ")"; - } - - void encode_payload() { - ::_encode(op, payload); - ::_encode(epoch, payload); - } - void decode_payload() { - int off = 0; - ::_decode(op, payload, off); - ::_decode(epoch, payload, off); - } - -}; - -#endif diff --git a/branches/sage/mds/messages/MMonElectionCollect.h b/branches/sage/mds/messages/MMonElectionCollect.h deleted file mode 100644 index f9f0c12d1ac2e..0000000000000 --- a/branches/sage/mds/messages/MMonElectionCollect.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMONELECTIONCOLLECT_H -#define __MMONELECTIONCOLLECT_H - -#include "msg/Message.h" - - -class MMonElectionCollect : public Message { - public: - int read_num; - - MMonElectionCollect() {} - MMonElectionCollect(int n) : - Message(MSG_MON_ELECTION_COLLECT), - read_num(n) {} - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(read_num), (char*)&read_num); - off += sizeof(read_num); - } - void encode_payload() { - payload.append((char*)&read_num, sizeof(read_num)); - } - - virtual char *get_type_name() { return "MonElCollect"; } -}; - -#endif diff --git a/branches/sage/mds/messages/MMonElectionRefresh.h b/branches/sage/mds/messages/MMonElectionRefresh.h deleted file mode 100644 index bc0337b8720dc..0000000000000 --- a/branches/sage/mds/messages/MMonElectionRefresh.h +++ /dev/null @@ -1,52 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMONELECTIONREFRESH_H -#define __MMONELECTIONREFRESH_H - -#include "msg/Message.h" - -#include "mon/Elector.h" - -class MMonElectionRefresh : public Message { - public: - int p; - Elector::State state; - int refresh_num; - - MMonElectionRefresh() {} - MMonElectionRefresh(int _p, Elector::State& s, int r) : - Message(MSG_MON_ELECTION_REFRESH), - p(_p), state(s), refresh_num(r) {} - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(p), (char*)&p); - off += sizeof(p); - payload.copy(off, sizeof(state), (char*)&state); - off += sizeof(state); - payload.copy(off, sizeof(refresh_num), (char*)&refresh_num); - off += sizeof(refresh_num); - } - void encode_payload() { - payload.append((char*)&p, sizeof(p)); - payload.append((char*)&state, sizeof(state)); - payload.append((char*)&refresh_num, sizeof(refresh_num)); - } - - virtual char *get_type_name() { return "MonElRefresh"; } -}; - -#endif diff --git a/branches/sage/mds/messages/MMonElectionStatus.h b/branches/sage/mds/messages/MMonElectionStatus.h deleted file mode 100644 index f91e42d64b184..0000000000000 --- a/branches/sage/mds/messages/MMonElectionStatus.h +++ /dev/null @@ -1,51 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMONELECTIONSTATUS_H -#define __MMONELECTIONSTATUS_H - -#include "msg/Message.h" - -#include "mon/Elector.h" - -class MMonElectionStatus : public Message { - public: - int q; - int read_num; - map registry; - - MMonElectionStatus() {} - MMonElectionStatus(int _q, int r, map reg) : - Message(MSG_MON_ELECTION_STATUS), - q(_q), read_num(r), registry(reg) {} - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(q), (char*)&q); - off += sizeof(q); - payload.copy(off, sizeof(read_num), (char*)&read_num); - off += sizeof(read_num); - ::_decode(registry, payload, off); - } - void encode_payload() { - payload.append((char*)&q, sizeof(q)); - payload.append((char*)&read_num, sizeof(read_num)); - ::_encode(registry, payload); - } - - virtual char *get_type_name() { return "MonElStatus"; } -}; - -#endif diff --git a/branches/sage/mds/messages/MMonOSDMapInfo.h b/branches/sage/mds/messages/MMonOSDMapInfo.h deleted file mode 100644 index 329c05e657d46..0000000000000 --- a/branches/sage/mds/messages/MMonOSDMapInfo.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPINFO_H -#define __MMONOSDMAPINFO_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapInfo : public Message { - public: - epoch_t epoch; - epoch_t mon_epoch; - - epoch_t get_epoch() { return epoch; } - epoch_t get_mon_epoch() { return mon_epoch; } - - MMonOSDMapInfo(epoch_t e, epoch_t me) : - Message(MSG_MON_OSDMAP_UPDATE_PREPARE), - epoch(e), mon_epoch(me) { - } - - char *get_type_name() { return "omap_info"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - payload.append((char*)&mon_epoch, sizeof(mon_epoch)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - payload.copy(off, sizeof(mon_epoch), (char*)&mon_epoch); - off += sizeof(mon_epoch); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MMonOSDMapLease.h b/branches/sage/mds/messages/MMonOSDMapLease.h deleted file mode 100644 index 3f4ed8ea4db85..0000000000000 --- a/branches/sage/mds/messages/MMonOSDMapLease.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPLEASE_H -#define __MMONOSDMAPLEASE_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapLease : public Message { - epoch_t epoch; - utime_t lease_expire; - - public: - epoch_t get_epoch() { return epoch; } - const utime_t& get_lease_expire() { return lease_expire; } - - MMonOSDMapLease(epoch_t e, utime_t le) : - Message(MSG_MON_OSDMAP_LEASE), - epoch(e), lease_expire(le) { - } - - char *get_type_name() { return "omap_lease"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - payload.append((char*)&lease_expire, sizeof(lease_expire)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - payload.copy(off, sizeof(lease_expire), (char*)&lease_expire); - off += sizeof(lease_expire); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MMonOSDMapLeaseAck.h b/branches/sage/mds/messages/MMonOSDMapLeaseAck.h deleted file mode 100644 index 449a0ac61a84f..0000000000000 --- a/branches/sage/mds/messages/MMonOSDMapLeaseAck.h +++ /dev/null @@ -1,45 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPLEASEACK_H -#define __MMONOSDMAPLEASEACK_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapLeaseAck : public Message { - epoch_t epoch; - -public: - epoch_t get_epoch() { return epoch; } - - MMonOSDMapLeaseAck(epoch_t e) : - Message(MSG_MON_OSDMAP_LEASE_ACK), - epoch(e) { - } - - char *get_type_name() { return "omap_lease_ack"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MMonOSDMapUpdateAck.h b/branches/sage/mds/messages/MMonOSDMapUpdateAck.h deleted file mode 100644 index 9655548dfcb00..0000000000000 --- a/branches/sage/mds/messages/MMonOSDMapUpdateAck.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPUPDATEACK_H -#define __MMONOSDMAPUPDATEACK_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapUpdateAck : public Message { -public: - epoch_t epoch; - - MMonOSDMapUpdateAck(epoch_t e) : - Message(MSG_MON_OSDMAP_UPDATE_ACK), - epoch(e) { - } - - char *get_type_name() { return "omap_update_ack"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MMonOSDMapUpdateCommit.h b/branches/sage/mds/messages/MMonOSDMapUpdateCommit.h deleted file mode 100644 index 8aa6929c2ed9a..0000000000000 --- a/branches/sage/mds/messages/MMonOSDMapUpdateCommit.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPUPDATECOMMIT_H -#define __MMONOSDMAPUPDATECOMMIT_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapUpdateCommit : public Message { - public: - epoch_t epoch; - - MMonOSDMapUpdateCommit(epoch_t e) : - Message(MSG_MON_OSDMAP_UPDATE_COMMIT), - epoch(e) { - } - - char *get_type_name() { return "omap_update_commit"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MMonOSDMapUpdatePrepare.h b/branches/sage/mds/messages/MMonOSDMapUpdatePrepare.h deleted file mode 100644 index 8e908e2ed0664..0000000000000 --- a/branches/sage/mds/messages/MMonOSDMapUpdatePrepare.h +++ /dev/null @@ -1,53 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPUPDATEPREPARE_H -#define __MMONOSDMAPUPDATEPREPARE_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapUpdatePrepare : public Message { - public: - epoch_t epoch; - bufferlist map_bl; - bufferlist inc_map_bl; - - epoch_t get_epoch() { return epoch; } - - MMonOSDMapUpdatePrepare(epoch_t e, - bufferlist& mbl, bufferlist& incmbl) : - Message(MSG_MON_OSDMAP_UPDATE_PREPARE), - epoch(e), - map_bl(mbl), inc_map_bl(incmbl) { - } - - char *get_type_name() { return "omap_update_prepare"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - ::_encode(map_bl, payload); - ::_encode(inc_map_bl, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - ::_decode(map_bl, payload, off); - ::_decode(inc_map_bl, payload, off); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MMonPaxos.h b/branches/sage/mds/messages/MMonPaxos.h deleted file mode 100644 index 7210b179c9a42..0000000000000 --- a/branches/sage/mds/messages/MMonPaxos.h +++ /dev/null @@ -1,98 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMONPAXOS_H -#define __MMONPAXOS_H - -#include "msg/Message.h" -#include "mon/mon_types.h" - -class MMonPaxos : public Message { - public: - // op types - const static int OP_COLLECT = 1; // proposer: propose round - const static int OP_LAST = 2; // voter: accept proposed round - const static int OP_BEGIN = 3; // proposer: value proposed for this round - const static int OP_ACCEPT = 4; // voter: accept propsed value - const static int OP_COMMIT = 5; // proposer: notify learners of agreed value - const static int OP_LEASE = 6; // leader: extend peon lease - const static int OP_LEASE_ACK = 7; // peon: lease ack - const static char *get_opname(int op) { - switch (op) { - case OP_COLLECT: return "collect"; - case OP_LAST: return "last"; - case OP_BEGIN: return "begin"; - case OP_ACCEPT: return "accept"; - case OP_COMMIT: return "commit"; - case OP_LEASE: return "lease"; - case OP_LEASE_ACK: return "lease_ack"; - default: assert(0); return 0; - } - } - - epoch_t epoch; // monitor epoch - int op; // paxos op - int machine_id; // which state machine? 
- - version_t last_committed; // i've committed to - version_t pn_from; // i promise to accept after - version_t pn; // with with proposal - version_t uncommitted_pn; // previous pn, if we are a LAST with an uncommitted value - utime_t lease_expire; - - map values; - - MMonPaxos() : Message(MSG_MON_PAXOS) {} - MMonPaxos(epoch_t e, int o, int mid) : - Message(MSG_MON_PAXOS), - epoch(e), - op(o), machine_id(mid), - last_committed(0), pn_from(0), pn(0), uncommitted_pn(0) { } - - virtual char *get_type_name() { return "paxos"; } - - void print(ostream& out) { - out << "paxos(" << get_paxos_name(machine_id) - << " " << get_opname(op) << " lc " << last_committed - << " pn " << pn << " opn " << uncommitted_pn - << ")"; - } - - void encode_payload() { - ::_encode(epoch, payload); - ::_encode(op, payload); - ::_encode(machine_id, payload); - ::_encode(last_committed, payload); - ::_encode(pn_from, payload); - ::_encode(pn, payload); - ::_encode(uncommitted_pn, payload); - ::_encode(lease_expire, payload); - ::_encode(values, payload); - } - void decode_payload() { - int off = 0; - ::_decode(epoch, payload, off); - ::_decode(op, payload, off); - ::_decode(machine_id, payload, off); - ::_decode(last_committed, payload, off); - ::_decode(pn_from, payload, off); - ::_decode(pn, payload, off); - ::_decode(uncommitted_pn, payload, off); - ::_decode(lease_expire, payload, off); - ::_decode(values, payload, off); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MOSDBoot.h b/branches/sage/mds/messages/MOSDBoot.h deleted file mode 100644 index 00c94ad1a2a80..0000000000000 --- a/branches/sage/mds/messages/MOSDBoot.h +++ /dev/null @@ -1,51 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 
2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MOSDBOOT_H -#define __MOSDBOOT_H - -#include "msg/Message.h" - -#include "include/types.h" -#include "osd/osd_types.h" - -class MOSDBoot : public Message { - public: - entity_inst_t inst; - OSDSuperblock sb; - - MOSDBoot() {} - MOSDBoot(entity_inst_t i, OSDSuperblock& s) : - Message(MSG_OSD_BOOT), - inst(i), - sb(s) { - } - - char *get_type_name() { return "osd_boot"; } - void print(ostream& out) { - out << "osd_boot(" << inst << ")"; - } - - void encode_payload() { - ::_encode(inst, payload); - ::_encode(sb, payload); - } - void decode_payload() { - int off = 0; - ::_decode(inst, payload, off); - ::_decode(sb, payload, off); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MOSDFailure.h b/branches/sage/mds/messages/MOSDFailure.h deleted file mode 100644 index adc4e700a4f85..0000000000000 --- a/branches/sage/mds/messages/MOSDFailure.h +++ /dev/null @@ -1,55 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDFAILURE_H -#define __MOSDFAILURE_H - -#include "msg/Message.h" - - -class MOSDFailure : public Message { - public: - entity_inst_t from; - entity_inst_t failed; - epoch_t epoch; - - MOSDFailure() {} - MOSDFailure(entity_inst_t fr, entity_inst_t f, epoch_t e) : - Message(MSG_OSD_FAILURE), - from(fr), failed(f), epoch(e) {} - - entity_inst_t get_from() { return from; } - entity_inst_t get_failed() { return failed; } - epoch_t get_epoch() { return epoch; } - - void decode_payload() { - int off = 0; - ::_decode(from, payload, off); - ::_decode(failed, payload, off); - ::_decode(epoch, payload, off); - } - void encode_payload() { - ::_encode(from, payload); - ::_encode(failed, payload); - ::_encode(epoch, payload); - } - - virtual char *get_type_name() { return "osd_failure"; } - /*void print(ostream& out) { - out << "osd_failure(" << failed << " e" << epoch << ")"; - }*/ -}; - -#endif diff --git a/branches/sage/mds/messages/MOSDGetMap.h b/branches/sage/mds/messages/MOSDGetMap.h deleted file mode 100644 index 25f94ef3bcc92..0000000000000 --- a/branches/sage/mds/messages/MOSDGetMap.h +++ /dev/null @@ -1,51 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MOSDGETMAP_H -#define __MOSDGETMAP_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MOSDGetMap : public Message { - public: - epoch_t start, want; - - MOSDGetMap(epoch_t s=0, epoch_t w=0) : - Message(MSG_OSD_GETMAP), - start(s), want(w) { } - - epoch_t get_start_epoch() { return start; } - epoch_t get_want_epoch() { return want; } - - char *get_type_name() { return "get_osd_map"; } - void print(ostream& out) { - out << "get_osd_map(have " << start; - if (want) out << " want " << want; - out << ")"; - } - - void encode_payload() { - ::_encode(start, payload); - ::_encode(want, payload); - } - void decode_payload() { - int off = 0; - ::_decode(start, payload, off); - ::_decode(want, payload, off); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MOSDIn.h b/branches/sage/mds/messages/MOSDIn.h deleted file mode 100644 index 8f8cb4b7877ae..0000000000000 --- a/branches/sage/mds/messages/MOSDIn.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __MOSDIN_H -#define __MOSDIN_H - -#include "msg/Message.h" - - -class MOSDIn : public Message { - public: - epoch_t map_epoch; - - MOSDIn(epoch_t e) : Message(MSG_OSD_IN), map_epoch(e) { - } - MOSDIn() {} - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_epoch), (char*)&map_epoch); - off += sizeof(map_epoch); - } - virtual void encode_payload() { - payload.append((char*)&map_epoch, sizeof(map_epoch)); - } - - virtual char *get_type_name() { return "oin"; } -}; - -#endif diff --git a/branches/sage/mds/messages/MOSDMap.h b/branches/sage/mds/messages/MOSDMap.h deleted file mode 100644 index 525ed82ae5c29..0000000000000 --- a/branches/sage/mds/messages/MOSDMap.h +++ /dev/null @@ -1,71 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDGETMAPACK_H -#define __MOSDGETMAPACK_H - -#include "msg/Message.h" -#include "osd/OSDMap.h" - - -class MOSDMap : public Message { - public: - map maps; - map incremental_maps; - - epoch_t get_first() { - epoch_t e = 0; - map::iterator i = maps.begin(); - if (i != maps.end()) e = i->first; - i = incremental_maps.begin(); - if (i != incremental_maps.end() && - (e == 0 || i->first < e)) e = i->first; - return e; - } - epoch_t get_last() { - epoch_t e = 0; - map::reverse_iterator i = maps.rbegin(); - if (i != maps.rend()) e = i->first; - i = incremental_maps.rbegin(); - if (i != incremental_maps.rend() && - (e == 0 || i->first > e)) e = i->first; - return e; - } - - - MOSDMap() : Message(MSG_OSD_MAP) { } - MOSDMap(OSDMap *oc) : Message(MSG_OSD_MAP) { - oc->encode(maps[oc->get_epoch()]); - } - - - // marshalling - virtual void decode_payload() { - int off = 0; - ::_decode(maps, payload, off); - ::_decode(incremental_maps, payload, off); - } - virtual void encode_payload() { - ::_encode(maps, payload); - ::_encode(incremental_maps, payload); - } - - virtual char *get_type_name() { return "omap"; } - void print(ostream& out) { - out << "osd_map(" << get_first() << "," << get_last() << ")"; - } -}; - -#endif diff --git a/branches/sage/mds/messages/MOSDOp.h b/branches/sage/mds/messages/MOSDOp.h deleted file mode 100644 index 7ac401bd75a69..0000000000000 --- a/branches/sage/mds/messages/MOSDOp.h +++ /dev/null @@ -1,280 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDOP_H -#define __MOSDOP_H - -#include "msg/Message.h" -#include "osd/osd_types.h" - -/* - * OSD op - * - * oid - object id - * op - OSD_OP_DELETE, etc. - * - */ - -// osd client ops -#define OSD_OP_READ 1 -#define OSD_OP_STAT 2 - -#define OSD_OP_REPLICATE 3 -#define OSD_OP_UNREPLICATE 4 - -#define OSD_OP_WRNOOP 10 -#define OSD_OP_WRITE 11 -#define OSD_OP_DELETE 12 -#define OSD_OP_TRUNCATE 13 -#define OSD_OP_ZERO 14 - -#define OSD_OP_WRLOCK 20 -#define OSD_OP_WRUNLOCK 21 -#define OSD_OP_RDLOCK 22 -#define OSD_OP_RDUNLOCK 23 -#define OSD_OP_UPLOCK 24 -#define OSD_OP_DNLOCK 25 -#define OSD_OP_MININCLOCK 26 // minimum incarnation lock - -#define OSD_OP_PULL 30 -#define OSD_OP_PUSH 31 - -#define OSD_OP_BALANCEREADS 101 -#define OSD_OP_UNBALANCEREADS 102 - - - -class MOSDOp : public Message { -public: - static const char* get_opname(int op) { - switch (op) { - case OSD_OP_READ: return "read"; - case OSD_OP_STAT: return "stat"; - - case OSD_OP_WRNOOP: return "wrnoop"; - case OSD_OP_WRITE: return "write"; - case OSD_OP_ZERO: return "zero"; - case OSD_OP_DELETE: return "delete"; - case OSD_OP_TRUNCATE: return "truncate"; - case OSD_OP_WRLOCK: return "wrlock"; - case OSD_OP_WRUNLOCK: return "wrunlock"; - case OSD_OP_RDLOCK: return "rdlock"; - case OSD_OP_RDUNLOCK: return "rdunlock"; - case OSD_OP_UPLOCK: return "uplock"; - case OSD_OP_DNLOCK: return "dnlock"; - - case OSD_OP_MININCLOCK: return "mininclock"; - - case OSD_OP_BALANCEREADS: return "balance-reads"; - case OSD_OP_UNBALANCEREADS: return "unbalance-reads"; - - case OSD_OP_PULL: return "pull"; - case OSD_OP_PUSH: return "push"; - default: assert(0); - } - return 0; - } - -private: - struct st_ { - // who's asking? - entity_inst_t client; - osdreqid_t reqid; // minor weirdness: entity_name_t is in reqid_t too. 
- - // for replication - tid_t rep_tid; - - object_t oid; - objectrev_t rev; - ObjectLayout layout; - - epoch_t map_epoch; - - eversion_t pg_trim_to; // primary->replica: trim to here - - int32_t op; - off_t offset, length; - - eversion_t version; - eversion_t old_version; - - bool want_ack; - bool want_commit; - bool retry_attempt; - - int shed_count; - osd_peer_stat_t peer_stat; - } st; - - bufferlist data; - map attrset; - - - friend class MOSDOpReply; - -public: - const osdreqid_t& get_reqid() { return st.reqid; } - const tid_t get_client_tid() { return st.reqid.tid; } - int get_client_inc() { return st.reqid.inc; } - - const entity_name_t& get_client() { return st.client.name; } - const entity_inst_t& get_client_inst() { return st.client; } - void set_client_inst(const entity_inst_t& i) { st.client = i; } - - bool wants_reply() { - if (st.op < 100) return true; - return false; // no reply needed for primary-lock, -unlock. - } - - const tid_t get_rep_tid() { return st.rep_tid; } - void set_rep_tid(tid_t t) { st.rep_tid = t; } - - bool get_retry_attempt() const { return st.retry_attempt; } - void set_retry_attempt(bool a) { st.retry_attempt = a; } - - const object_t get_oid() { return st.oid; } - const pg_t get_pg() { return st.layout.pgid; } - const ObjectLayout& get_layout() { return st.layout; } - const epoch_t get_map_epoch() { return st.map_epoch; } - - //const int get_pg_role() { return st.pg_role; } // who am i asking for? 
- const eversion_t get_version() { return st.version; } - //const eversion_t get_old_version() { return st.old_version; } - - void set_rev(objectrev_t r) { st.rev = r; } - objectrev_t get_rev() { return st.rev; } - - const eversion_t get_pg_trim_to() { return st.pg_trim_to; } - void set_pg_trim_to(eversion_t v) { st.pg_trim_to = v; } - - const int get_op() { return st.op; } - void set_op(int o) { st.op = o; } - bool is_read() { - return st.op < 10; - } - - const off_t get_length() { return st.length; } - const off_t get_offset() { return st.offset; } - - map& get_attrset() { return attrset; } - void set_attrset(map &as) { attrset.swap(as); } - - const bool wants_ack() { return st.want_ack; } - const bool wants_commit() { return st.want_commit; } - - void set_peer_stat(const osd_peer_stat_t& stat) { st.peer_stat = stat; } - const osd_peer_stat_t& get_peer_stat() { return st.peer_stat; } - void inc_shed_count() { st.shed_count++; } - int get_shed_count() { return st.shed_count; } - - void set_data(bufferlist &d) { - data.claim(d); - } - bufferlist& get_data() { - return data; - } - off_t get_data_len() { return data.length(); } - - - MOSDOp(entity_inst_t asker, int inc, long tid, - object_t oid, ObjectLayout ol, epoch_t mapepoch, int op) : - Message(MSG_OSD_OP) { - memset(&st, 0, sizeof(st)); - this->st.client = asker; - this->st.reqid.name = asker.name; - this->st.reqid.inc = inc; - this->st.reqid.tid = tid; - - this->st.oid = oid; - this->st.layout = ol; - this->st.map_epoch = mapepoch; - this->st.op = op; - - this->st.rep_tid = 0; - - this->st.want_ack = true; - this->st.want_commit = true; - } - MOSDOp() {} - - //void set_pg_role(int r) { st.pg_role = r; } - //void set_rg_nrep(int n) { st.rg_nrep = n; } - - void set_layout(const ObjectLayout& l) { st.layout = l; } - - void set_length(off_t l) { st.length = l; } - void set_offset(off_t o) { st.offset = o; } - void set_version(eversion_t v) { st.version = v; } - void set_old_version(eversion_t ov) { st.old_version 
= ov; } - - void set_want_ack(bool b) { st.want_ack = b; } - void set_want_commit(bool b) { st.want_commit = b; } - - // marshalling - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - ::_decode(attrset, payload, off); - ::_decode(data, payload, off); - } - - static void add_payload_chunk_breaks(int from, int off, int len, - list& breaks) { - if (len > 0 && - len & 4095 == 0 && - off & 4095 == 0) { - // page-sized and aligned data? easy. - breaks.push_back(from); - } else if (len > 8192) { - // there is at least 1 full page in there. somewhere. - int p = 0; - - // leading partial page? - if (off & 4095 != 0) - p = 4096 - (off & 4095); - - // full page(s) - breaks.push_back(from + p); - p += (len - p) & (~4095); - - // tail bit? - if (p != len) - breaks.push_back(from + p); - } - } - - virtual void encode_payload() { - ::_encode(st, payload); - ::_encode(attrset, payload); - add_payload_chunk_breaks(payload.length() + 4, - st.offset, data.length(), - chunk_payload_at); - ::_encode(data, payload); - } - - virtual char *get_type_name() { return "osd_op"; } - void print(ostream& out) { - out << "osd_op(" << st.reqid - << " " << get_opname(st.op) - << " " << st.oid; - if (st.length) out << " " << st.offset << "~" << st.length; - if (st.retry_attempt) out << " RETRY"; - out << ")"; - } -}; - - -#endif diff --git a/branches/sage/mds/messages/MOSDOpReply.h b/branches/sage/mds/messages/MOSDOpReply.h deleted file mode 100644 index 3c567397e6a2d..0000000000000 --- a/branches/sage/mds/messages/MOSDOpReply.h +++ /dev/null @@ -1,164 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * 
Foundation. See file COPYING. - * - */ - - -#ifndef __MOSDOPREPLY_H -#define __MOSDOPREPLY_H - -#include "msg/Message.h" - -#include "MOSDOp.h" -#include "osd/ObjectStore.h" - -/* - * OSD op reply - * - * oid - object id - * op - OSD_OP_DELETE, etc. - * - */ - -class MOSDOpReply : public Message { - struct { - // req - osdreqid_t reqid; - - tid_t rep_tid; - - object_t oid; - ObjectLayout layout; // pgid, etc. - - int32_t op; - - // reply - int32_t result; - bool commit; - off_t length, offset; - off_t object_size; - eversion_t version; - - eversion_t pg_complete_thru; - - epoch_t map_epoch; - - osd_peer_stat_t peer_stat; - } st; - - bufferlist data; - map attrset; - - public: - const osdreqid_t& get_reqid() { return st.reqid; } - long get_tid() { return st.reqid.tid; } - long get_rep_tid() { return st.rep_tid; } - object_t get_oid() { return st.oid; } - pg_t get_pg() { return st.layout.pgid; } - int get_op() { return st.op; } - bool get_commit() { return st.commit; } - - int get_result() { return st.result; } - off_t get_length() { return st.length; } - off_t get_offset() { return st.offset; } - off_t get_object_size() { return st.object_size; } - eversion_t get_version() { return st.version; } - map& get_attrset() { return attrset; } - - eversion_t get_pg_complete_thru() { return st.pg_complete_thru; } - void set_pg_complete_thru(eversion_t v) { st.pg_complete_thru = v; } - - void set_result(int r) { st.result = r; } - void set_length(off_t s) { st.length = s; } - void set_offset(off_t o) { st.offset = o; } - void set_object_size(off_t s) { st.object_size = s; } - void set_version(eversion_t v) { st.version = v; } - void set_attrset(map &as) { attrset = as; } - - void set_op(int op) { st.op = op; } - void set_rep_tid(tid_t t) { st.rep_tid = t; } - - void set_peer_stat(const osd_peer_stat_t& stat) { st.peer_stat = stat; } - const osd_peer_stat_t& get_peer_stat() { return st.peer_stat; } - - // data payload - void set_data(bufferlist &d) { - data.claim(d); - } - 
bufferlist& get_data() { - return data; - } - - // osdmap - epoch_t get_map_epoch() { return st.map_epoch; } - - -public: - MOSDOpReply(MOSDOp *req, int result, epoch_t e, bool commit) : - Message(MSG_OSD_OPREPLY) { - memset(&st, 0, sizeof(st)); - this->st.reqid = req->st.reqid; - this->st.op = req->st.op; - this->st.rep_tid = req->st.rep_tid; - - this->st.oid = req->st.oid; - this->st.layout = req->st.layout; - this->st.result = result; - this->st.commit = commit; - - this->st.length = req->st.length; // speculative... OSD should ensure these are correct - this->st.offset = req->st.offset; - this->st.version = req->st.version; - - this->st.map_epoch = e; - } - MOSDOpReply() {} - - - // marshalling - virtual void decode_payload() { - payload.copy(0, sizeof(st), (char*)&st); - payload.splice(0, sizeof(st)); - int off = 0; - ::_decode(attrset, payload, off); - ::_decode(data, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&st, sizeof(st)); - ::_encode(attrset, payload); - MOSDOp::add_payload_chunk_breaks(payload.length() + 4, - st.offset, data.length(), - chunk_payload_at); - ::_encode(data, payload); - } - - virtual char *get_type_name() { return "osd_op_reply"; } - - void print(ostream& out) { - out << "osd_op_reply(" << st.reqid - << " " << MOSDOp::get_opname(st.op) - << " " << st.oid; - if (st.length) out << " " << st.offset << "~" << st.length; - if (st.op >= 10) { - if (st.commit) - out << " commit"; - else - out << " ack"; - } - out << " = " << st.result; - out << ")"; - } - -}; - - -#endif diff --git a/branches/sage/mds/messages/MOSDOut.h b/branches/sage/mds/messages/MOSDOut.h deleted file mode 100644 index 798356f663f9e..0000000000000 --- a/branches/sage/mds/messages/MOSDOut.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you 
can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __MOSDOUT_H -#define __MOSDOUT_H - -#include "msg/Message.h" - - -class MOSDOut : public Message { - public: - epoch_t map_epoch; - - MOSDOut(epoch_t e) : Message(MSG_OSD_OUT), map_epoch(e) { - } - MOSDOut() {} - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_epoch), (char*)&map_epoch); - off += sizeof(map_epoch); - } - virtual void encode_payload() { - payload.append((char*)&map_epoch, sizeof(map_epoch)); - } - - virtual char *get_type_name() { return "oout"; } -}; - -#endif diff --git a/branches/sage/mds/messages/MOSDPGActivateSet.h b/branches/sage/mds/messages/MOSDPGActivateSet.h deleted file mode 100644 index cdee7996e9647..0000000000000 --- a/branches/sage/mds/messages/MOSDPGActivateSet.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGACTIVATESET_H -#define __MOSDPGACTIVATESET_H - -#include "msg/Message.h" - -class MOSDPGActivateSet : public Message { - epoch_t epoch; - -public: - list pg_info; - - epoch_t get_epoch() { return epoch; } - - MOSDPGActivateSet() {} - MOSDPGActivateSet(version_t mv) : - Message(MSG_OSD_PG_ACTIVATE_SET), - epoch(mv) { } - - char *get_type_name() { return "pg_activate_set"; } - void print(ostream& out) { - out << "pg_activate_set(" << pg_info.size() << " pgs e" << epoch << ")"; - } - - void encode_payload() { - ::_encode(epoch, payload); - ::_encode(pg_info, payload); - } - void decode_payload() { - int off = 0; - ::_decode(epoch, payload, off); - ::_decode(pg_info, payload, off); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MOSDPGLog.h b/branches/sage/mds/messages/MOSDPGLog.h deleted file mode 100644 index 653bb9f10570c..0000000000000 --- a/branches/sage/mds/messages/MOSDPGLog.h +++ /dev/null @@ -1,59 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGLOG_H -#define __MOSDPGLOG_H - -#include "msg/Message.h" - -class MOSDPGLog : public Message { - epoch_t epoch; - -public: - PG::Info info; - PG::Log log; - PG::Missing missing; - - epoch_t get_epoch() { return epoch; } - pg_t get_pgid() { return info.pgid; } - - MOSDPGLog() {} - MOSDPGLog(version_t mv, PG::Info& i) : - Message(MSG_OSD_PG_LOG), - epoch(mv), info(i) { } - - char *get_type_name() { return "PGlog"; } - void print(ostream& out) { - out << "pg_log(" << info.pgid << " e" << epoch << ")"; - } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - payload.append((char*)&info, sizeof(info)); - log._encode(payload); - missing._encode(payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - payload.copy(off, sizeof(info), (char*)&info); - off += sizeof(info); - log._decode(payload, off); - missing._decode(payload, off); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MOSDPGNotify.h b/branches/sage/mds/messages/MOSDPGNotify.h deleted file mode 100644 index 76a984276b66b..0000000000000 --- a/branches/sage/mds/messages/MOSDPGNotify.h +++ /dev/null @@ -1,55 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MOSDPGPEERNOTIFY_H -#define __MOSDPGPEERNOTIFY_H - -#include "msg/Message.h" - -#include "osd/PG.h" - -/* - * PGNotify - notify primary of my PGs and versions. 
- */ - -class MOSDPGNotify : public Message { - epoch_t epoch; - list pg_list; // pgid -> version - - public: - version_t get_epoch() { return epoch; } - list& get_pg_list() { return pg_list; } - - MOSDPGNotify() {} - MOSDPGNotify(epoch_t e, list& l) : - Message(MSG_OSD_PG_NOTIFY) { - this->epoch = e; - pg_list.splice(pg_list.begin(),l); - } - - char *get_type_name() { return "PGnot"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - _encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - _decode(pg_list, payload, off); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MOSDPGPeer.h b/branches/sage/mds/messages/MOSDPGPeer.h deleted file mode 100644 index dd3164cdc1124..0000000000000 --- a/branches/sage/mds/messages/MOSDPGPeer.h +++ /dev/null @@ -1,58 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGPEER_H -#define __MOSDPGPEER_H - -#include "msg/Message.h" - - -class MOSDPGPeer : public Message { - uint64_t map_version; - list pg_list; - - bool complete; - - public: - uint64_t get_version() { return map_version; } - list& get_pg_list() { return pg_list; } - bool get_complete() { return complete; } - - MOSDPGPeer() {} - MOSDPGPeer(uint64_t v, list& l, bool c=false) : - Message(MSG_OSD_PG_PEER) { - this->map_version = v; - this->complete = c; - pg_list.splice(pg_list.begin(), l); - } - - char *get_type_name() { return "PGPeer"; } - - void encode_payload() { - payload.append((char*)&map_version, sizeof(map_version)); - payload.append((char*)&complete, sizeof(complete)); - _encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_version), (char*)&map_version); - off += sizeof(map_version); - payload.copy(off, sizeof(complete), (char*)&complete); - off += sizeof(complete); - _decode(pg_list, payload, off); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MOSDPGPeerAck.h b/branches/sage/mds/messages/MOSDPGPeerAck.h deleted file mode 100644 index dc4fac1a9436b..0000000000000 --- a/branches/sage/mds/messages/MOSDPGPeerAck.h +++ /dev/null @@ -1,70 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MOSDPGPEERACK_H -#define __MOSDPGPEERACK_H - -#include "msg/Message.h" -#include "osd/OSD.h" - -class MOSDPGPeerAck : public Message { - version_t map_version; - - public: - list pg_dne; // pg dne - map pg_state; // state, lists, etc. 
- - version_t get_version() { return map_version; } - - MOSDPGPeerAck() {} - MOSDPGPeerAck(version_t v) : - Message(MSG_OSD_PG_PEERACK) { - this->map_version = v; - } - - char *get_type_name() { return "PGPeer"; } - - void encode_payload() { - payload.append((char*)&map_version, sizeof(map_version)); - _encode(pg_dne, payload); - - int n = pg_state.size(); - payload.append((char*)&n, sizeof(n)); - for (map::iterator it = pg_state.begin(); - it != pg_state.end(); - it++) { - payload.append((char*)&it->first, sizeof(it->first)); - it->second._encode(payload); - } - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_version), (char*)&map_version); - off += sizeof(map_version); - _decode(pg_dne, payload, off); - - int n; - payload.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPEERREQUEST_H -#define __MOSDPEERREQUEST_H - -#include "msg/Message.h" - - -class MOSDPGPeerRequest : public Message { - version_t map_version; - list pg_list; - - public: - version_t get_version() { return map_version; } - list& get_pg_list() { return pg_list; } - - MOSDPGPeerRequest() {} - MOSDPGPeerRequest(version_t v, list& l) : - Message(MSG_OSD_PG_PEERREQUEST) { - this->map_version = v; - pg_list.splice(pg_list.begin(), l); - } - - char *get_type_name() { return "PGPR"; } - - void encode_payload() { - payload.append((char*)&map_version, sizeof(map_version)); - _encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_version), (char*)&map_version); - off += sizeof(map_version); - _decode(pg_list, payload, off); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MOSDPGQuery.h b/branches/sage/mds/messages/MOSDPGQuery.h deleted file mode 100644 index 70dbfdbb96fd7..0000000000000 --- a/branches/sage/mds/messages/MOSDPGQuery.h +++ /dev/null @@ -1,52 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGQUERY_H -#define __MOSDPGQUERY_H - -#include "msg/Message.h" - -/* - * PGQuery - query another OSD as to the contents of their PGs - */ - -class MOSDPGQuery : public Message { - version_t epoch; - - public: - version_t get_epoch() { return epoch; } - map pg_list; - - MOSDPGQuery() {} - MOSDPGQuery(epoch_t e, map& ls) : - Message(MSG_OSD_PG_QUERY), - epoch(e), pg_list(ls) { - } - - char *get_type_name() { return "PGq"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - ::_encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - ::_decode(pg_list, payload, off); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MOSDPGRemove.h b/branches/sage/mds/messages/MOSDPGRemove.h deleted file mode 100644 index 17cb28a3c95a1..0000000000000 --- a/branches/sage/mds/messages/MOSDPGRemove.h +++ /dev/null @@ -1,52 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGREMOVE_H -#define __MOSDPGREMOVE_H - -#include "msg/Message.h" - - -class MOSDPGRemove : public Message { - epoch_t epoch; - - public: - set pg_list; - - epoch_t get_epoch() { return epoch; } - - MOSDPGRemove() {} - MOSDPGRemove(epoch_t e, set& l) : - Message(MSG_OSD_PG_REMOVE) { - this->epoch = e; - pg_list = l; - } - - char *get_type_name() { return "PGrm"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - _encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - _decode(pg_list, payload, off); - } - -}; - -#endif diff --git a/branches/sage/mds/messages/MOSDPGSummary.h b/branches/sage/mds/messages/MOSDPGSummary.h deleted file mode 100644 index 0dcebffaf74da..0000000000000 --- a/branches/sage/mds/messages/MOSDPGSummary.h +++ /dev/null @@ -1,69 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGQUERYREPLY_H -#define __MOSDPGQUERYREPLY_H - -#include "msg/Message.h" - -class MOSDPGSummary : public Message { - epoch_t epoch; - pg_t pgid; - -public: - PG::PGInfo info; - bufferlist sumbl; - - epoch_t get_epoch() { return epoch; } - - MOSDPGSummary() {} - MOSDPGSummary(version_t mv, pg_t pgid, PG::PGSummary &summary) : - Message(MSG_OSD_PG_SUMMARY) { - this->epoch = mv; - this->pgid = pgid; - summary._encode(sumbl); - } - - pg_t get_pgid() { return pgid; } - bufferlist& get_summary_bl() { - return sumbl; - } - - char *get_type_name() { return "PGsum"; } - void print(ostream& out) { - out << "pg_summary(" << pgid << " e" << epoch << ")"; - } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - payload.append((char*)&pgid, sizeof(pgid)); - payload.append((char*)&info, sizeof(info)); - payload.claim_append(sumbl); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - payload.copy(off, sizeof(pgid), (char*)&pgid); - off += sizeof(pgid); - payload.copy(off, sizeof(info), (char*)&info); - off += sizeof(info); - - payload.splice(0, off); - sumbl.claim(payload); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MOSDPGUpdate.h b/branches/sage/mds/messages/MOSDPGUpdate.h deleted file mode 100644 index 869c02e18c156..0000000000000 --- a/branches/sage/mds/messages/MOSDPGUpdate.h +++ /dev/null @@ -1,71 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGUPDATE_H -#define __MOSDPGUPDATE_H - -#include "msg/Message.h" - -class MOSDPGUpdate : public Message { - version_t map_version; - pg_t pgid; - //pginfo_t info; - bool complete; - version_t last_any_complete; - - public: - version_t get_version() { return map_version; } - pg_t get_pgid() { return pgid; } - //pginfo_t& get_pginfo() { return info; } - bool is_complete() { return complete; } - version_t get_last_any_complete() { return last_any_complete; } - - MOSDPGUpdate() {} - MOSDPGUpdate(version_t mv, pg_t pgid, bool complete, version_t last_any_complete) : - Message(MSG_OSD_PG_UPDATE) { - this->map_version = mv; - this->pgid = pgid; - this->complete = complete; - this->last_any_complete = last_any_complete; - } - - char *get_type_name() { return "PGUp"; } - void print(ostream& out) { - out << "pg_update(" << pgid << " e" << map_version; - if (complete) out << " complete"; - out << " lac=" << last_any_complete; - out << ")"; - } - - void encode_payload() { - payload.append((char*)&map_version, sizeof(map_version)); - payload.append((char*)&pgid, sizeof(pgid)); - payload.append((char*)&complete, sizeof(complete)); - payload.append((char*)&last_any_complete, sizeof(last_any_complete)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_version), (char*)&map_version); - off += sizeof(map_version); - payload.copy(off, sizeof(pgid), (char*)&pgid); - off += sizeof(pgid); - payload.copy(off, sizeof(complete), (char*)&complete); - off += sizeof(complete); - payload.copy(off, sizeof(last_any_complete), (char*)&last_any_complete); - off += sizeof(last_any_complete); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MOSDPing.h b/branches/sage/mds/messages/MOSDPing.h deleted file mode 100644 index 37be289c0a923..0000000000000 --- a/branches/sage/mds/messages/MOSDPing.h +++ /dev/null @@ -1,49 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - 
scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MOSDPING_H -#define __MOSDPING_H - -#include "common/Clock.h" - -#include "msg/Message.h" -#include "osd/osd_types.h" - - -class MOSDPing : public Message { - public: - epoch_t map_epoch; - bool ack; - osd_peer_stat_t peer_stat; - - MOSDPing(epoch_t e, osd_peer_stat_t& ps, bool a=false) : - Message(MSG_OSD_PING), map_epoch(e), ack(a), peer_stat(ps) { } - MOSDPing() {} - - virtual void decode_payload() { - int off = 0; - ::_decode(map_epoch, payload, off); - ::_decode(ack, payload, off); - ::_decode(peer_stat, payload, off); - } - virtual void encode_payload() { - ::_encode(map_epoch, payload); - ::_encode(ack, payload); - ::_encode(peer_stat, payload); - } - - virtual char *get_type_name() { return "osd_ping"; } -}; - -#endif diff --git a/branches/sage/mds/messages/MPGStats.h b/branches/sage/mds/messages/MPGStats.h deleted file mode 100644 index a851eb103f07f..0000000000000 --- a/branches/sage/mds/messages/MPGStats.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MPGSTATS_H -#define __MPGSTATS_H - -#include "osd/osd_types.h" - -class MPGStats : public Message { -public: - map pg_stat; - osd_stat_t osd_stat; - - MPGStats() : Message(MSG_PGSTATS) {} - - char *get_type_name() { return "pg_stats"; } - void print(ostream& out) { - out << "pg_stats"; - } - - void encode_payload() { - ::_encode(osd_stat, payload); - ::_encode(pg_stat, payload); - } - void decode_payload() { - int off = 0; - ::_decode(osd_stat, payload, off); - ::_decode(pg_stat, payload, off); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MPing.h b/branches/sage/mds/messages/MPing.h deleted file mode 100644 index 6b569666ed377..0000000000000 --- a/branches/sage/mds/messages/MPing.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __MPING_H -#define __MPING_H - -#include "msg/Message.h" - - -class MPing : public Message { - public: - int seq; - MPing(int s) : Message(MSG_PING) { - seq = s; - } - MPing() : Message(MSG_PING) {} - - virtual void decode_payload() { - int off = 0; - payload.copy(0, sizeof(seq), (char*)&seq); - off += sizeof(seq); - } - virtual void encode_payload() { - payload.append((char*)&seq, sizeof(seq)); - } - - virtual char *get_type_name() { return "ping"; } -}; - -#endif diff --git a/branches/sage/mds/messages/MPingAck.h b/branches/sage/mds/messages/MPingAck.h deleted file mode 100644 index f8f32aee43ee0..0000000000000 --- a/branches/sage/mds/messages/MPingAck.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MPINGACK_H -#define __MPINGACK_H - -#include "MPing.h" - - -class MPingAck : public Message { - public: - int seq; - MPingAck() {} - MPingAck(MPing *p) : Message(MSG_PING_ACK) { - this->seq = p->seq; - } - - virtual void decode_payload() { - int off = 0; - payload.copy(0, sizeof(seq), (char*)&seq); - off += sizeof(seq); - } - virtual void encode_payload() { - payload.append((char*)&seq, sizeof(seq)); - } - - virtual char *get_type_name() { return "pinga"; } -}; - -#endif diff --git a/branches/sage/mds/messages/MStatfs.h b/branches/sage/mds/messages/MStatfs.h deleted file mode 100644 index 66e5847206a7b..0000000000000 --- a/branches/sage/mds/messages/MStatfs.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MSTATFS_H -#define __MSTATFS_H - -#include /* or */ - -class MStatfs : public Message { -public: - tid_t tid; - - MStatfs() : Message(MSG_STATFS) {} - MStatfs(tid_t t) : Message(MSG_STATFS), tid(t) {} - - char *get_type_name() { return "statfs"; } - void print(ostream& out) { - out << "statfs(" << tid << ")"; - } - - void encode_payload() { - ::_encode(tid, payload); - } - void decode_payload() { - int off = 0; - ::_decode(tid, payload, off); - } -}; - -#endif diff --git a/branches/sage/mds/messages/MStatfsReply.h b/branches/sage/mds/messages/MStatfsReply.h deleted file mode 100644 index f8e21ddcc2b31..0000000000000 --- a/branches/sage/mds/messages/MStatfsReply.h +++ /dev/null @@ -1,45 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MSTATFSREPLY_H -#define __MSTATFSREPLY_H - -#include /* or */ - -class MStatfsReply : public Message { -public: - tid_t tid; - struct statvfs stfs; - - MStatfsReply() : Message(MSG_STATFS_REPLY) {} - MStatfsReply(tid_t t) : Message(MSG_STATFS_REPLY), tid(t) {} - - char *get_type_name() { return "statfs_reply"; } - void print(ostream& out) { - out << "statfs_reply(" << tid << ")"; - } - - void encode_payload() { - ::_encode(tid, payload); - ::_encode(stfs, payload); - } - void decode_payload() { - int off = 0; - ::_decode(tid, payload, off); - ::_decode(stfs, payload, off); - } -}; - -#endif diff --git a/branches/sage/mds/mkmonmap.cc b/branches/sage/mds/mkmonmap.cc deleted file mode 100644 index 0a80e93c40bd2..0000000000000 --- a/branches/sage/mds/mkmonmap.cc +++ /dev/null @@ -1,68 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include -#include - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" - - - - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - MonMap monmap; - - char *outfn = ".ceph_monmap"; - - for (unsigned i=0; i= 0); - - return 0; -} diff --git a/branches/sage/mds/mon/ClientMonitor.cc b/branches/sage/mds/mon/ClientMonitor.cc deleted file mode 100644 index b7ac275b0afca..0000000000000 --- a/branches/sage/mds/mon/ClientMonitor.cc +++ /dev/null @@ -1,256 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "ClientMonitor.h" -#include "Monitor.h" -#include "MDSMonitor.h" -#include "OSDMonitor.h" -#include "MonitorStore.h" - -#include "messages/MClientMount.h" -#include "messages/MClientUnmount.h" - -#include "common/Timer.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".client v" << client_map.version << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? 
(const char*)"(peon)":(const char*)"(?\?)"))) << ".client v" << client_map.version << " " - - - -bool ClientMonitor::update_from_paxos() -{ - assert(paxos->is_active()); - - version_t paxosv = paxos->get_version(); - if (paxosv == client_map.version) return true; - assert(paxosv >= client_map.version); - - dout(10) << "update_from_paxos paxosv " << paxosv - << ", my v " << client_map.version << dendl; - - - if (client_map.version == 0 && paxosv > 1 && - mon->store->exists_bl_ss("clientmap","latest")) { - // starting up: load latest - dout(7) << "update_from_paxos startup: loading latest full clientmap" << dendl; - bufferlist bl; - mon->store->get_bl_ss(bl, "clientmap", "latest"); - int off = 0; - client_map._decode(bl, off); - } - - // walk through incrementals - while (paxosv > client_map.version) { - bufferlist bl; - bool success = paxos->read(client_map.version+1, bl); - if (success) { - dout(7) << "update_from_paxos applying incremental " << client_map.version+1 << dendl; - Incremental inc; - int off = 0; - inc._decode(bl, off); - client_map.apply_incremental(inc); - - dout(1) << client_map.client_addr.size() << " clients (+" - << inc.mount.size() << " -" << inc.unmount.size() << ")" - << dendl; - - } else { - dout(7) << "update_from_paxos couldn't read incremental " << client_map.version+1 << dendl; - return false; - } - } - - // save latest - bufferlist bl; - client_map._encode(bl); - mon->store->put_bl_ss(bl, "clientmap", "latest"); - - return true; -} - -void ClientMonitor::create_pending() -{ - assert(mon->is_leader()); - pending_inc = Incremental(); - pending_inc.version = client_map.version + 1; - pending_inc.next_client = client_map.next_client; - dout(10) << "create_pending v " << pending_inc.version - << ", next is " << pending_inc.next_client - << dendl; -} - -void ClientMonitor::create_initial() -{ - dout(1) << "create_initial -- creating initial map" << dendl; -} - -void ClientMonitor::committed() -{ - -} - - -void 
ClientMonitor::encode_pending(bufferlist &bl) -{ - assert(mon->is_leader()); - dout(10) << "encode_pending v " << pending_inc.version - << ", next is " << pending_inc.next_client - << dendl; - assert(paxos->get_version() + 1 == pending_inc.version); - pending_inc._encode(bl); -} - - -// ------- - - -bool ClientMonitor::preprocess_query(Message *m) -{ - dout(10) << "preprocess_query " << *m << " from " << m->get_source_inst() << dendl; - - switch (m->get_type()) { - case MSG_CLIENT_MOUNT: - { - // already mounted? - MClientMount *mount = (MClientMount*)m; - entity_addr_t addr = m->get_source_addr(); - pair addrinst(addr, mount->instance); - if (client_map.addr_client.count(addrinst)) { - int client = client_map.addr_client[addrinst]; - dout(7) << " client" << client << " already mounted" << dendl; - _mounted(client, (MClientMount*)m); - return true; - } - } - return false; - - case MSG_CLIENT_UNMOUNT: - { - // already unmounted? - int client = m->get_source().num(); - if (client_map.client_addr.count(client) == 0) { - dout(7) << " client" << client << " not mounted" << dendl; - _unmounted((MClientUnmount*)m); - return true; - } - } - return false; - - - default: - assert(0); - delete m; - return true; - } -} - -bool ClientMonitor::prepare_update(Message *m) -{ - dout(10) << "prepare_update " << *m << " from " << m->get_source_inst() << dendl; - - switch (m->get_type()) { - case MSG_CLIENT_MOUNT: - { - MClientMount *mount = (MClientMount*)m; - pair addrinst(mount->addr, mount->instance); - int client = -1; - if (mount->get_source().is_client()) - client = mount->get_source().num(); - - // choose a client id - if (client < 0) { - client = pending_inc.next_client; - dout(10) << "mount: assigned client" << client << " to " << mount->addr << dendl; - } else { - dout(10) << "mount: client" << client << " requested by " - << mount->addr << "i" << mount->instance - << dendl; - if (client_map.client_addr.count(client)) { - assert(client_map.client_addr[client] != addrinst); 
- dout(0) << "mount: WARNING: client" << client << " requested by " - << mount->addr << "." << mount->instance - << ", which used to be " - << client_map.client_addr[client].first << "i" << client_map.client_addr[client].second - << dendl; - } - } - - pending_inc.add_mount(client, mount->addr, mount->instance); - paxos->wait_for_commit(new C_Mounted(this, client, mount)); - } - return true; - - case MSG_CLIENT_UNMOUNT: - { - MClientUnmount *unmount = (MClientUnmount*)m; - assert(unmount->inst.name.is_client()); - int client = unmount->inst.name.num(); - - assert(client_map.client_addr.count(client)); - - pending_inc.add_unmount(client); - paxos->wait_for_commit(new C_Unmounted(this, unmount)); - } - return true; - - default: - assert(0); - delete m; - return false; - } - -} - - -// MOUNT - - -void ClientMonitor::_mounted(int client, MClientMount *m) -{ - entity_inst_t to; - to.addr = m->addr; - to.name = entity_name_t::CLIENT(client); - - dout(10) << "_mounted client" << client << " at " << to << dendl; - - // reply with latest mds, osd maps - mon->mdsmon->send_latest(to); - mon->osdmon->send_latest(to); - - delete m; -} - -void ClientMonitor::_unmounted(MClientUnmount *m) -{ - dout(10) << "_unmounted " << m->inst << dendl; - - // reply with (same) unmount message - mon->messenger->send_message(m, m->inst); - - // auto-shutdown? 
- // (hack for fakesyn/newsyn, mostly) - if (mon->is_leader() && - client_map.version > 1 && - client_map.client_addr.empty() && - g_conf.mon_stop_on_last_unmount && - !mon->is_stopping()) { - dout(1) << "last client unmounted" << dendl; - mon->do_stop(); - } -} - - diff --git a/branches/sage/mds/mon/ClientMonitor.h b/branches/sage/mds/mon/ClientMonitor.h deleted file mode 100644 index f36ee9f7c18bd..0000000000000 --- a/branches/sage/mds/mon/ClientMonitor.h +++ /dev/null @@ -1,178 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __CLIENTMONITOR_H -#define __CLIENTMONITOR_H - -#include -#include -using namespace std; - -#include "include/types.h" -#include "msg/Messenger.h" - -#include "mds/MDSMap.h" - -#include "PaxosService.h" - -class Monitor; -class Paxos; -class MClientMount; -class MClientUnmount; - -class ClientMonitor : public PaxosService { -public: - - struct Incremental { - version_t version; - uint32_t next_client; - map > mount; - set unmount; - - Incremental() : version(0), next_client() {} - - bool is_empty() { return mount.empty() && unmount.empty(); } - void add_mount(uint32_t client, entity_addr_t addr, int instance) { - next_client = MAX(next_client, client+1); - mount[client] = pair(addr, instance); - } - void add_unmount(uint32_t client) { - assert(client < next_client); - if (mount.count(client)) - mount.erase(client); - else - unmount.insert(client); - } - - void _encode(bufferlist &bl) { - ::_encode(version, bl); - ::_encode(next_client, bl); - ::_encode(mount, bl); - ::_encode(unmount, bl); - } - void _decode(bufferlist &bl, int& off) { - ::_decode(version, bl, 
off); - ::_decode(next_client, bl, off); - ::_decode(mount, bl, off); - ::_decode(unmount, bl, off); - } - }; - - struct Map { - version_t version; - uint32_t next_client; - map > client_addr; - map,uint32_t> addr_client; - - Map() : version(0), next_client(0) {} - - void reverse() { - addr_client.clear(); - for (map >::iterator p = client_addr.begin(); - p != client_addr.end(); - ++p) { - addr_client[p->second] = p->first; - } - } - void apply_incremental(Incremental &inc) { - assert(inc.version == version+1); - version = inc.version; - next_client = inc.next_client; - for (map >::iterator p = inc.mount.begin(); - p != inc.mount.end(); - ++p) { - client_addr[p->first] = p->second; - addr_client[p->second] = p->first; - } - - for (set::iterator p = inc.unmount.begin(); - p != inc.unmount.end(); - ++p) { - assert(client_addr.count(*p)); - addr_client.erase(client_addr[*p]); - client_addr.erase(*p); - } - } - - void _encode(bufferlist &bl) { - ::_encode(version, bl); - ::_encode(next_client, bl); - ::_encode(client_addr, bl); - } - void _decode(bufferlist &bl, int& off) { - ::_decode(version, bl, off); - ::_decode(next_client, bl, off); - ::_decode(client_addr, bl, off); - reverse(); - } - }; - - class C_Mounted : public Context { - ClientMonitor *cmon; - int client; - MClientMount *m; - public: - C_Mounted(ClientMonitor *cm, int c, MClientMount *m_) : - cmon(cm), client(c), m(m_) {} - void finish(int r) { - if (r >= 0) - cmon->_mounted(client, m); - else - cmon->dispatch((Message*)m); - } - }; - - class C_Unmounted : public Context { - ClientMonitor *cmon; - MClientUnmount *m; - public: - C_Unmounted(ClientMonitor *cm, MClientUnmount *m_) : - cmon(cm), m(m_) {} - void finish(int r) { - if (r >= 0) - cmon->_unmounted(m); - else - cmon->dispatch((Message*)m); - } - }; - - -private: - Map client_map; - - // leader - Incremental pending_inc; - - void create_initial(); - bool update_from_paxos(); - void create_pending(); // prepare a new pending - void 
encode_pending(bufferlist &bl); // propose pending update to peers - - void committed(); - - void _mounted(int c, MClientMount *m); - void _unmounted(MClientUnmount *m); - - bool preprocess_query(Message *m); // true if processed. - bool prepare_update(Message *m); - - - public: - ClientMonitor(Monitor *mn, Paxos *p) : PaxosService(mn, p) { } - - //void tick(); // check state, take actions - -}; - -#endif diff --git a/branches/sage/mds/mon/Elector.cc b/branches/sage/mds/mon/Elector.cc deleted file mode 100644 index 4a09b58ab5073..0000000000000 --- a/branches/sage/mds/mon/Elector.cc +++ /dev/null @@ -1,293 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "Elector.h" -#include "Monitor.h" - -#include "common/Timer.h" -#include "MonitorStore.h" -#include "messages/MMonElection.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".elector(" << epoch << ") " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? 
(const char*)"(peon)":(const char*)"(?\?)"))) << ".elector(" << epoch << ") " - - -void Elector::init() -{ - epoch = mon->store->get_int("mon_epoch"); - if (!epoch) - epoch = 1; - dout(1) << "init, last seen epoch " << epoch << dendl; -} - -void Elector::shutdown() -{ - if (expire_event) - mon->timer.cancel_event(expire_event); -} - -void Elector::bump_epoch(epoch_t e) -{ - dout(10) << "bump_epoch " << epoch << " to " << e << dendl; - assert(epoch < e); - epoch = e; - mon->store->put_int(epoch, "mon_epoch"); - - // clear up some state - electing_me = false; - acked_me.clear(); - leader_acked = -1; -} - - -void Elector::start() -{ - dout(5) << "start -- can i be leader?" << dendl; - - // start by trying to elect me - if (epoch % 2 == 0) - bump_epoch(epoch+1); // odd == election cycle - start_stamp = g_clock.now(); - electing_me = true; - acked_me.insert(whoami); - - // bcast to everyone else - for (int i=0; imonmap->num_mon; ++i) { - if (i == whoami) continue; - mon->messenger->send_message(new MMonElection(MMonElection::OP_PROPOSE, epoch), - mon->monmap->get_inst(i)); - } - - reset_timer(); -} - -void Elector::defer(int who) -{ - dout(5) << "defer to " << who << dendl; - - if (electing_me) { - // drop out - acked_me.clear(); - electing_me = false; - } - - // ack them - leader_acked = who; - ack_stamp = g_clock.now(); - mon->messenger->send_message(new MMonElection(MMonElection::OP_ACK, epoch), - mon->monmap->get_inst(who)); - - // set a timer - reset_timer(1.0); // give the leader some extra time to declare victory -} - - -void Elector::reset_timer(double plus) -{ - // set the timer - cancel_timer(); - expire_event = new C_ElectionExpire(this); - mon->timer.add_event_after(g_conf.mon_lease + plus, - expire_event); -} - - -void Elector::cancel_timer() -{ - if (expire_event) { - mon->timer.cancel_event(expire_event); - expire_event = 0; - } -} - -void Elector::expire() -{ - dout(5) << "election timer expired" << dendl; - - // did i win? 
- if (electing_me && - acked_me.size() > (unsigned)(mon->monmap->num_mon / 2)) { - // i win - victory(); - } else { - // whoever i deferred to didn't declare victory quickly enough. - start(); - } -} - - -void Elector::victory() -{ - leader_acked = -1; - electing_me = false; - set quorum = acked_me; - - cancel_timer(); - - assert(epoch % 2 == 1); // election - bump_epoch(epoch+1); // is over! - - // tell everyone - for (set::iterator p = quorum.begin(); - p != quorum.end(); - ++p) { - if (*p == whoami) continue; - mon->messenger->send_message(new MMonElection(MMonElection::OP_VICTORY, epoch), - mon->monmap->get_inst(*p)); - } - - // tell monitor - mon->win_election(epoch, quorum); -} - - -void Elector::handle_propose(MMonElection *m) -{ - dout(5) << "handle_propose from " << m->get_source() << dendl; - int from = m->get_source().num(); - - assert(m->epoch % 2 == 1); // election - if (m->epoch > epoch) { - bump_epoch(m->epoch); - } - else if (m->epoch < epoch && // got an "old" propose, - epoch % 2 == 0 && // in a non-election cycle - mon->quorum.count(from) == 0) { // from someone outside the quorum - // a mon just started up, call a new election so they can rejoin! - dout(5) << " got propose from old epoch, " << m->get_source() << " must have just started" << dendl; - start(); - } - - if (whoami < from) { - // i would win over them. - if (leader_acked >= 0) { // we already acked someone - assert(leader_acked < from); // and they still win, of course - dout(5) << "no, we already acked " << leader_acked << dendl; - } else { - // wait, i should win! - if (!electing_me) - start(); - } - } else { - // they would win over me - if (leader_acked < 0 || // haven't acked anyone yet, or - leader_acked > from || // they would win over who you did ack, or - leader_acked == from) { // this is the guy we're already deferring to - defer(from); - } else { - // ignore them! 
- dout(5) << "no, we already acked " << leader_acked << dendl; - } - } - - delete m; -} - -void Elector::handle_ack(MMonElection *m) -{ - dout(5) << "handle_ack from " << m->get_source() << dendl; - int from = m->get_source().num(); - - assert(m->epoch % 2 == 1); // election - if (m->epoch > epoch) { - dout(5) << "woah, that's a newer epoch, i must have rebooted. bumping and re-starting!" << dendl; - bump_epoch(m->epoch); - start(); - delete m; - return; - } - assert(m->epoch == epoch); - - if (electing_me) { - // thanks - acked_me.insert(from); - dout(5) << " so far i have " << acked_me << dendl; - - // is that _everyone_? - if (acked_me.size() == (unsigned)mon->monmap->num_mon) { - // if yes, shortcut to election finish - victory(); - } - } else { - // ignore, i'm deferring already. - assert(leader_acked >= 0); - } - - delete m; -} - - -void Elector::handle_victory(MMonElection *m) -{ - dout(5) << "handle_victory from " << m->get_source() << dendl; - int from = m->get_source().num(); - - assert(from < whoami); - assert(m->epoch % 2 == 0); - assert(m->epoch == epoch + 1); // i should have seen this election if i'm getting the victory. 
- bump_epoch(m->epoch); - - // they win - mon->lose_election(epoch, from); - - // cancel my timer - cancel_timer(); -} - - - - -void Elector::dispatch(Message *m) -{ - switch (m->get_type()) { - - case MSG_MON_ELECTION: - { - MMonElection *em = (MMonElection*)m; - - switch (em->op) { - case MMonElection::OP_PROPOSE: - handle_propose(em); - return; - } - - if (em->epoch < epoch) { - dout(5) << "old epoch, dropping" << dendl; - delete em; - break; - } - - switch (em->op) { - case MMonElection::OP_ACK: - handle_ack(em); - return; - case MMonElection::OP_VICTORY: - handle_victory(em); - return; - default: - assert(0); - } - } - break; - - default: - assert(0); - } -} - - - - diff --git a/branches/sage/mds/mon/Elector.h b/branches/sage/mds/mon/Elector.h deleted file mode 100644 index 9bfd7cb644fc7..0000000000000 --- a/branches/sage/mds/mon/Elector.h +++ /dev/null @@ -1,92 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MON_ELECTOR_H -#define __MON_ELECTOR_H - -#include -using namespace std; - -#include "include/types.h" -#include "msg/Message.h" - -#include "include/Context.h" - -#include "common/Timer.h" - -class Monitor; - - -class Elector { - private: - Monitor *mon; - int whoami; - - Context *expire_event; - - void reset_timer(double plus=0.0); - void cancel_timer(); - - epoch_t epoch; // latest epoch we've seen. 
odd == election, even == stable, - - // electing me - bool electing_me; - utime_t start_stamp; - set acked_me; - - // electing them - int leader_acked; // who i've acked - utime_t ack_stamp; // and when - - void bump_epoch(epoch_t e=0); // i just saw a larger epoch - - class C_ElectionExpire : public Context { - Elector *elector; - public: - C_ElectionExpire(Elector *e) : elector(e) { } - void finish(int r) { - elector->expire(); - } - }; - - void start(); // start an electing me - void defer(int who); - void expire(); // timer goes off - void victory(); - - void handle_propose(class MMonElection *m); - void handle_ack(class MMonElection *m); - void handle_victory(class MMonElection *m); - - public: - Elector(Monitor *m, int w) : mon(m), whoami(w), - expire_event(0), - epoch(0), - electing_me(false), - leader_acked(-1) { } - - void init(); - void shutdown(); - - void dispatch(Message *m); - - void call_election() { - start(); - } - -}; - - -#endif diff --git a/branches/sage/mds/mon/MDSMonitor.cc b/branches/sage/mds/mon/MDSMonitor.cc deleted file mode 100644 index 24c5fc76e75ef..0000000000000 --- a/branches/sage/mds/mon/MDSMonitor.cc +++ /dev/null @@ -1,633 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#include "MDSMonitor.h" -#include "Monitor.h" -#include "MonitorStore.h" -#include "OSDMonitor.h" - -#include "messages/MMDSMap.h" -#include "messages/MMDSGetMap.h" -#include "messages/MMDSBeacon.h" - -#include "messages/MMonCommand.h" -#include "messages/MMonCommandAck.h" - -#include "messages/MGenericMessage.h" - - -#include "common/Timer.h" - -#include - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".mds e" << mdsmap.get_epoch() << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".mds e" << mdsmap.get_epoch() << " " - - - -// my methods - -void MDSMonitor::print_map(MDSMap &m, int dbl) -{ - dout(7) << "print_map epoch " << m.get_epoch() << " target_num " << m.target_num << dendl; - entity_inst_t blank; - set all; - m.get_mds_set(all); - for (set::iterator p = all.begin(); - p != all.end(); - ++p) { - dout(dbl) << " mds" << *p << "." << m.mds_inc[*p] - << " : " << MDSMap::get_state_name(m.get_state(*p)) - << " : " << (m.have_inst(*p) ? 
m.get_inst(*p) : blank) - << dendl; - } -} - - - -// service methods - -void MDSMonitor::create_initial() -{ - dout(10) << "create_initial" << dendl; - pending_mdsmap.target_num = g_conf.num_mds; - pending_mdsmap.created = g_clock.now(); - print_map(pending_mdsmap); -} - -bool MDSMonitor::update_from_paxos() -{ - assert(paxos->is_active()); - - version_t paxosv = paxos->get_version(); - if (paxosv == mdsmap.epoch) return true; - assert(paxosv >= mdsmap.epoch); - - dout(10) << "update_from_paxos paxosv " << paxosv - << ", my e " << mdsmap.epoch << dendl; - - // read and decode - mdsmap_bl.clear(); - bool success = paxos->read(paxosv, mdsmap_bl); - assert(success); - dout(10) << "update_from_paxos got " << paxosv << dendl; - mdsmap.decode(mdsmap_bl); - - // new map - dout(0) << "new map" << dendl; - print_map(mdsmap, 0); - - // bcast map to mds, waiters - if (mon->is_leader()) - bcast_latest_mds(); - send_to_waiting(); - - return true; -} - -void MDSMonitor::create_pending() -{ - pending_mdsmap = mdsmap; - pending_mdsmap.epoch++; - dout(10) << "create_pending e" << pending_mdsmap.epoch << dendl; -} - -void MDSMonitor::encode_pending(bufferlist &bl) -{ - dout(10) << "encode_pending e" << pending_mdsmap.epoch << dendl; - - //print_map(pending_mdsmap); - - // apply to paxos - assert(paxos->get_version() + 1 == pending_mdsmap.epoch); - pending_mdsmap.encode(bl); -} - - -bool MDSMonitor::preprocess_query(Message *m) -{ - dout(10) << "preprocess_query " << *m << " from " << m->get_source_inst() << dendl; - - switch (m->get_type()) { - - case MSG_MDS_BEACON: - return preprocess_beacon((MMDSBeacon*)m); - - case MSG_MDS_GETMAP: - handle_mds_getmap((MMDSGetMap*)m); - return true; - - case MSG_MON_COMMAND: - return false; - - default: - assert(0); - delete m; - return true; - } -} - -void MDSMonitor::handle_mds_getmap(MMDSGetMap *m) -{ - if (m->have < mdsmap.get_epoch()) - send_full(m->get_source_inst()); - else - waiting_for_map.push_back(m->get_source_inst()); -} - - -bool 
MDSMonitor::preprocess_beacon(MMDSBeacon *m) -{ - dout(12) << "preprocess_beacon " << *m - << " from " << m->get_mds_inst() - << dendl; - - // fw to leader? - if (!mon->is_leader()) { - dout(10) << "fw to leader" << dendl; - mon->messenger->send_message(m, mon->monmap->get_inst(mon->get_leader())); - return true; - } - - // let's see. - int from = m->get_mds_inst().name.num(); - int state = m->get_state(); - version_t seq = m->get_seq(); - - // can i handle this query without a map update? - - // boot? - if (state == MDSMap::STATE_BOOT) { - // already booted? - int already = mdsmap.get_addr_rank(m->get_mds_inst().addr); - if (already < 0) - return false; // need to update map - - // already booted. just reply to beacon, as per usual. - from = already; - } - - // reply to beacon - if (mdsmap.mds_state_seq[from] > seq) { - dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << dendl; - delete m; - return true; - } - - // reply to beacon? - if (state != MDSMap::STATE_STOPPED) { - last_beacon[from] = g_clock.now(); // note time - mon->messenger->send_message(new MMDSBeacon(m->get_mds_inst(), mdsmap.get_epoch(), state, seq), - m->get_mds_inst()); - } - - // is there a state change here? - if (mdsmap.mds_state.count(from) == 0) { - if (state == MDSMap::STATE_BOOT) - return false; // need to add to map - dout(1) << "mds_beacon " << *m << " announcing non-boot state, ignoring" << dendl; - } else if (mdsmap.mds_state[from] != state) { - if (mdsmap.get_epoch() == m->get_last_epoch_seen()) - return false; // need to update map - dout(10) << "mds_beacon " << *m << " ignoring requested state, because mds hasn't seen latest map" << dendl; - } - - // we're done. 
- delete m; - return true; -} - - -bool MDSMonitor::prepare_update(Message *m) -{ - dout(7) << "prepare_update " << *m << dendl; - - switch (m->get_type()) { - - case MSG_MDS_BEACON: - return handle_beacon((MMDSBeacon*)m); - - case MSG_MON_COMMAND: - return handle_command((MMonCommand*)m); - - default: - assert(0); - delete m; - } - - return true; -} - - - -bool MDSMonitor::handle_beacon(MMDSBeacon *m) -{ - // -- this is an update -- - dout(12) << "handle_beacon " << *m - << " from " << m->get_mds_inst() - << dendl; - int from = m->get_mds_inst().name.num(); - int state = m->get_state(); - version_t seq = m->get_seq(); - - assert(state != mdsmap.get_state(from)); - - // boot? - if (state == MDSMap::STATE_BOOT) { - // assign a name. - if (from >= 0) { - // wants to be (or already is) a specific MDS. - if (!g_conf.mon_allow_mds_bully && - (!mdsmap.have_inst(from) || mdsmap.get_inst(from) != m->get_mds_inst())) { - dout(10) << "mds_beacon boot: mds" << from << " is someone else" << dendl; - from = -1; - } else { - switch (mdsmap.get_state(from)) { - case MDSMap::STATE_STOPPED: - case MDSMap::STATE_STARTING: - case MDSMap::STATE_STANDBY: - state = MDSMap::STATE_STARTING; - break; - case MDSMap::STATE_DNE: - case MDSMap::STATE_CREATING: - state = MDSMap::STATE_CREATING; - break; - case MDSMap::STATE_FAILED: - default: - state = MDSMap::STATE_REPLAY; - break; - } - dout(10) << "mds_beacon boot: mds" << from - << " was " << MDSMap::get_state_name(mdsmap.get_state(from)) - << ", " << MDSMap::get_state_name(state) - << dendl; - } - } - if (from < 0) { - from = pending_mdsmap.get_addr_rank(m->get_mds_inst().addr); - if (from >= 0) { - state = pending_mdsmap.mds_state[from]; - dout(10) << "mds_beacon boot: already pending mds" << from - << " " << MDSMap::get_state_name(state) << dendl; - delete m; - return false; - } - } - if (from < 0) { - // pick a failed mds? 
- set failed; - pending_mdsmap.get_failed_mds_set(failed); - if (!failed.empty()) { - from = *failed.begin(); - dout(10) << "mds_beacon boot: assigned failed mds" << from << dendl; - state = MDSMap::STATE_REPLAY; - } - } - if (from < 0) { - // ok, just pick any unused mds id. - for (from=0; ; ++from) { - if (pending_mdsmap.is_dne(from)) { - dout(10) << "mds_beacon boot: assigned new mds" << from << dendl; - state = MDSMap::STATE_CREATING; - break; - } else if (pending_mdsmap.is_stopped(from)) { - dout(10) << "mds_beacon boot: assigned stopped mds" << from << dendl; - state = MDSMap::STATE_STARTING; - break; - } - } - } - - assert(state == MDSMap::STATE_CREATING || - state == MDSMap::STATE_STARTING || - state == MDSMap::STATE_REPLAY); - - // put it in the map. - pending_mdsmap.mds_inst[from].addr = m->get_mds_inst().addr; - pending_mdsmap.mds_inst[from].name = entity_name_t::MDS(from); - pending_mdsmap.mds_inc[from]++; - - // reset the beacon timer - last_beacon[from] = g_clock.now(); - - // if starting|creating and degraded|full, go to standby - if ((state == MDSMap::STATE_CREATING || state == MDSMap::STATE_STARTING) && - (pending_mdsmap.would_be_overfull_with(from) || - pending_mdsmap.is_degraded())) { - dout(10) << "mds_beacon cluster full, mds" << from << " will be standby" << dendl; - state = MDSMap::STATE_STANDBY; - } - } - - // created? - if (state == MDSMap::STATE_ACTIVE && - mdsmap.is_creating(from)) { - pending_mdsmap.mds_created.insert(from); - dout(10) << "mds_beacon created mds" << from << dendl; - } - - // update the map - dout(10) << "mds_beacon mds" << from << " " << MDSMap::get_state_name(mdsmap.mds_state[from]) - << " -> " << MDSMap::get_state_name(state) - << dendl; - - // has someone join or leave the cluster? 
- if (state == MDSMap::STATE_REPLAY || - state == MDSMap::STATE_ACTIVE || - state == MDSMap::STATE_STOPPED) { - pending_mdsmap.same_in_set_since = pending_mdsmap.epoch; - } - - // change the state - pending_mdsmap.mds_state[from] = state; - if (pending_mdsmap.is_up(from)) - pending_mdsmap.mds_state_seq[from] = seq; - else - pending_mdsmap.mds_state_seq.erase(from); - - dout(7) << "pending map now:" << dendl; - print_map(pending_mdsmap); - - paxos->wait_for_commit(new C_Updated(this, from, m)); - - return true; -} - -bool MDSMonitor::should_propose(double& delay) -{ - delay = 0.0; - return true; -} - -void MDSMonitor::_updated(int from, MMDSBeacon *m) -{ - if (m->get_state() == MDSMap::STATE_BOOT) { - dout(10) << "_updated (booted) mds" << from << " " << *m << dendl; - mon->osdmon->send_latest(mdsmap.get_inst(from)); - } else { - dout(10) << "_updated mds" << from << " " << *m << dendl; - } - if (m->get_state() == MDSMap::STATE_STOPPED) { - // send the map manually (they're out of the map, so they won't get it automatic) - send_latest(m->get_mds_inst()); - } - - delete m; -} - - -void MDSMonitor::committed() -{ - // check for failed - set standby; - set failed; - mdsmap.get_mds_set(standby, MDSMap::STATE_STANDBY); - mdsmap.get_failed_mds_set(failed); - - if (!standby.empty() && !failed.empty()) { - while (!standby.empty() && !failed.empty()) { - int f = *failed.begin(); - int t = *standby.begin(); - failed.erase(failed.begin()); - standby.erase(standby.begin()); - - dout(0) << "mds" << t << " taking over for mds" << f << dendl; - - // send new map to old inst/name - waiting_for_map.push_back(mdsmap.mds_inst[t]); - - pending_mdsmap.mds_inst[f] = mdsmap.mds_inst[t]; - pending_mdsmap.mds_inst[f].name = entity_name_t::MDS(f); - pending_mdsmap.mds_inc[f]++; - pending_mdsmap.mds_state[f] = MDSMap::STATE_REPLAY; - pending_mdsmap.mds_state_seq[f] = mdsmap.mds_state_seq[t]; - - pending_mdsmap.mds_inst.erase(t); - pending_mdsmap.mds_state.erase(t); - 
pending_mdsmap.mds_state_seq.erase(t); - - last_beacon[f] = last_beacon[t]; - last_beacon.erase(t); - } - - dout(7) << "pending map now:" << dendl; - print_map(pending_mdsmap); - - propose_pending(); - } - - // hackish: did all mds's shut down? - if (mon->is_leader() && - g_conf.mon_stop_with_last_mds && - mdsmap.get_epoch() > 1 && - mdsmap.is_stopped()) - mon->messenger->send_message(new MGenericMessage(MSG_SHUTDOWN), - mon->monmap->get_inst(mon->whoami)); -} - - -bool MDSMonitor::handle_command(MMonCommand *m) -{ - int r = -EINVAL; - stringstream ss; - - if (m->cmd.size() > 1) { - if (m->cmd[1] == "stop" && m->cmd.size() > 2) { - int who = atoi(m->cmd[2].c_str()); - if (mdsmap.is_active(who)) { - r = 0; - ss << "telling mds" << who << " to stop"; - pending_mdsmap.mds_state[who] = MDSMap::STATE_STOPPING; - } else { - r = -EEXIST; - ss << "mds" << who << " not active (" << mdsmap.get_state_name(mdsmap.get_state(who)) << ")"; - } - } - else if (m->cmd[1] == "set_target_num" && m->cmd.size() > 2) { - pending_mdsmap.target_num = atoi(m->cmd[2].c_str()); - r = 0; - ss << "target_num = " << pending_mdsmap.target_num; - } - } - if (r == -EINVAL) { - ss << "unrecognized command"; - } - - // reply - string rs; - getline(ss,rs); - mon->messenger->send_message(new MMonCommandAck(r, rs), m->get_source_inst()); - delete m; - return r >= 0; -} - - - -void MDSMonitor::bcast_latest_mds() -{ - dout(10) << "bcast_latest_mds " << mdsmap.get_epoch() << dendl; - - // tell mds - set up; - mdsmap.get_up_mds_set(up); - for (set::iterator p = up.begin(); - p != up.end(); - p++) - send_full(mdsmap.get_inst(*p)); -} - -void MDSMonitor::send_full(entity_inst_t dest) -{ - dout(11) << "send_full to " << dest << dendl; - mon->messenger->send_message(new MMDSMap(&mdsmap), dest); -} - -void MDSMonitor::send_to_waiting() -{ - dout(10) << "send_to_waiting " << mdsmap.get_epoch() << dendl; - for (list::iterator i = waiting_for_map.begin(); - i != waiting_for_map.end(); - i++) - send_full(*i); - 
waiting_for_map.clear(); -} - -void MDSMonitor::send_latest(entity_inst_t dest) -{ - if (paxos->is_readable()) - send_full(dest); - else - waiting_for_map.push_back(dest); -} - - -void MDSMonitor::tick() -{ - // make sure mds's are still alive - utime_t now = g_clock.now(); - - // ...if i am an active leader - if (!mon->is_leader()) return; - if (!paxos->is_active()) return; - - if (now > g_conf.mds_beacon_grace) { - utime_t cutoff = now; - cutoff -= g_conf.mds_beacon_grace; - - bool changed = false; - - set up; - mdsmap.get_up_mds_set(up); - - for (set::iterator p = up.begin(); - p != up.end(); - ++p) { - if (last_beacon.count(*p)) { - if (last_beacon[*p] < cutoff) { - - // failure! - int newstate; - switch (mdsmap.get_state(*p)) { - case MDSMap::STATE_STANDBY: - if (mdsmap.has_created(*p)) - newstate = MDSMap::STATE_STOPPED; - else - newstate = MDSMap::STATE_DNE; - break; - - case MDSMap::STATE_CREATING: - // didn't finish creating - newstate = MDSMap::STATE_DNE; - break; - - case MDSMap::STATE_STARTING: - newstate = MDSMap::STATE_STOPPED; - break; - - case MDSMap::STATE_REPLAY: - case MDSMap::STATE_RESOLVE: - case MDSMap::STATE_RECONNECT: - case MDSMap::STATE_REJOIN: - case MDSMap::STATE_ACTIVE: - case MDSMap::STATE_STOPPING: - newstate = MDSMap::STATE_FAILED; - break; - - default: - assert(0); - } - - dout(10) << "no beacon from mds" << *p << " since " << last_beacon[*p] - << ", marking " << mdsmap.get_state_name(newstate) - << dendl; - - // update map - pending_mdsmap.mds_state[*p] = newstate; - pending_mdsmap.mds_state_seq.erase(*p); - changed = true; - } - } else { - dout(10) << "no beacons from mds" << *p << ", assuming one " << now << dendl; - last_beacon[*p] = now; - } - } - - if (changed) - propose_pending(); - } -} - - -void MDSMonitor::do_stop() -{ - // hrm... 
- if (!mon->is_leader() || - !paxos->is_active()) { - dout(-10) << "do_stop can't stop right now, mdsmap not writeable" << dendl; - return; - } - - dout(7) << "do_stop stopping active mds nodes" << dendl; - print_map(mdsmap); - - for (map::iterator p = mdsmap.mds_state.begin(); - p != mdsmap.mds_state.end(); - ++p) { - switch (p->second) { - case MDSMap::STATE_ACTIVE: - case MDSMap::STATE_STOPPING: - pending_mdsmap.mds_state[p->first] = MDSMap::STATE_STOPPING; - break; - case MDSMap::STATE_CREATING: - case MDSMap::STATE_STANDBY: - pending_mdsmap.mds_state[p->first] = MDSMap::STATE_DNE; - break; - case MDSMap::STATE_STARTING: - pending_mdsmap.mds_state[p->first] = MDSMap::STATE_STOPPED; - break; - case MDSMap::STATE_REPLAY: - case MDSMap::STATE_RESOLVE: - case MDSMap::STATE_RECONNECT: - case MDSMap::STATE_REJOIN: - // BUG: hrm, if this is the case, the STOPPING gusy won't be able to stop, will they? - pending_mdsmap.mds_state[p->first] = MDSMap::STATE_FAILED; - break; - } - } - - propose_pending(); -} diff --git a/branches/sage/mds/mon/MDSMonitor.h b/branches/sage/mds/mon/MDSMonitor.h deleted file mode 100644 index 49e8f680c7b41..0000000000000 --- a/branches/sage/mds/mon/MDSMonitor.h +++ /dev/null @@ -1,100 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDSMONITOR_H -#define __MDSMONITOR_H - -#include -#include -using namespace std; - -#include "include/types.h" -#include "msg/Messenger.h" - -#include "mds/MDSMap.h" - -#include "PaxosService.h" - -class MMDSBeacon; -class MMDSGetMap; - -class MDSMonitor : public PaxosService { - public: - // mds maps - MDSMap mdsmap; // current - bufferlist mdsmap_bl; // encoded - - MDSMap pending_mdsmap; // current + pending updates - - // my helpers - void print_map(MDSMap &m, int dbl=7); - - class C_Updated : public Context { - MDSMonitor *mm; - int mds; - MMDSBeacon *m; - public: - C_Updated(MDSMonitor *a, int b, MMDSBeacon *c) : - mm(a), mds(b), m(c) {} - void finish(int r) { - if (r >= 0) - mm->_updated(mds, m); // success - else - mm->dispatch((Message*)m); // try again - } - }; - - - // service methods - void create_initial(); - bool update_from_paxos(); - void create_pending(); - void encode_pending(bufferlist &bl); - - void _updated(int m, MMDSBeacon *m); - - bool preprocess_query(Message *m); // true if processed. 
- bool prepare_update(Message *m); - bool should_propose(double& delay); - - void committed(); - - bool preprocess_beacon(class MMDSBeacon *m); - bool handle_beacon(class MMDSBeacon *m); - bool handle_command(class MMonCommand *m); - void handle_mds_getmap(MMDSGetMap *m); - - // beacons - map last_beacon; - -public: - MDSMonitor(Monitor *mn, Paxos *p) : PaxosService(mn, p) { } - - // sending the map -private: - list waiting_for_map; - - void bcast_latest_mds(); - void send_full(entity_inst_t dest); - void send_to_waiting(); - -public: - void send_latest(entity_inst_t dest); - - void tick(); // check state, take actions - void do_stop(); - -}; - -#endif diff --git a/branches/sage/mds/mon/MonMap.h b/branches/sage/mds/mon/MonMap.h deleted file mode 100644 index dbe9c9b5ac5e9..0000000000000 --- a/branches/sage/mds/mon/MonMap.h +++ /dev/null @@ -1,101 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MONMAP_H -#define __MONMAP_H - -#include -#include -#include - -#include "msg/Message.h" -#include "include/types.h" - -class MonMap { - public: - epoch_t epoch; // what epoch/version of the monmap - int32_t num_mon; - vector mon_inst; - - int last_mon; // last mon i talked to - - MonMap(int s=0) : epoch(0), num_mon(s), mon_inst(s), last_mon(-1) {} - - void add_mon(entity_inst_t inst) { - mon_inst.push_back(inst); - num_mon++; - } - - // pick a mon. - // choice should be stable, unless we explicitly ask for a new one. 
- int pick_mon(bool newmon=false) { - if (newmon || (last_mon < 0)) { - last_mon = rand() % num_mon; - } - return last_mon; - } - - const entity_inst_t &get_inst(int m) { - assert(m < num_mon); - return mon_inst[m]; - } - - void encode(bufferlist& blist) { - ::_encode(epoch, blist); - ::_encode(num_mon, blist); - ::_encode(mon_inst, blist); - } - - void decode(bufferlist& blist) { - int off = 0; - ::_decode(epoch, blist, off); - ::_decode(num_mon, blist, off); - ::_decode(mon_inst, blist, off); - } - - // read from/write to a file - int write(char *fn) { - // encode - bufferlist bl; - encode(bl); - - // write - int fd = ::open(fn, O_RDWR|O_CREAT); - if (fd < 0) return fd; - ::fchmod(fd, 0644); - ::write(fd, (void*)bl.c_str(), bl.length()); - ::close(fd); - return 0; - } - - int read(char *fn) { - // read - bufferlist bl; - int fd = ::open(fn, O_RDONLY); - if (fd < 0) return fd; - struct stat st; - ::fstat(fd, &st); - bufferptr bp(st.st_size); - bl.append(bp); - ::read(fd, (void*)bl.c_str(), bl.length()); - ::close(fd); - - // decode - decode(bl); - return 0; - } - -}; - -#endif diff --git a/branches/sage/mds/mon/Monitor.cc b/branches/sage/mds/mon/Monitor.cc deleted file mode 100644 index 1db23b0270e57..0000000000000 --- a/branches/sage/mds/mon/Monitor.cc +++ /dev/null @@ -1,405 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -// TODO: missing run() method, which creates the two main timers, refreshTimer and readTimer - -#include "Monitor.h" - -#include "osd/OSDMap.h" - -#include "MonitorStore.h" - -#include "msg/Message.h" -#include "msg/Messenger.h" - -#include "messages/MPing.h" -#include "messages/MPingAck.h" -#include "messages/MGenericMessage.h" -#include "messages/MMonCommand.h" -#include "messages/MMonCommandAck.h" - -#include "messages/MMonPaxos.h" - -#include "common/Timer.h" -#include "common/Clock.h" - -#include "OSDMonitor.h" -#include "MDSMonitor.h" -#include "ClientMonitor.h" -#include "PGMonitor.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " mon" << whoami << (is_starting() ? (const char*)"(starting)":(is_leader() ? (const char*)"(leader)":(is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << whoami << (is_starting() ? (const char*)"(starting)":(is_leader() ? (const char*)"(leader)":(is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << " " - - - -void Monitor::init() -{ - lock.Lock(); - - dout(1) << "init" << dendl; - - // store - char s[80]; - sprintf(s, "mondata/mon%d", whoami); - store = new MonitorStore(s); - - if (g_conf.mkfs) - store->mkfs(); - - store->mount(); - - // create - osdmon = new OSDMonitor(this, &paxos_osdmap); - mdsmon = new MDSMonitor(this, &paxos_mdsmap); - clientmon = new ClientMonitor(this, &paxos_clientmap); - pgmon = new PGMonitor(this, &paxos_pgmap); - - // init paxos - paxos_test.init(); - paxos_osdmap.init(); - paxos_mdsmap.init(); - paxos_clientmap.init(); - paxos_pgmap.init(); - - // i'm ready! - messenger->set_dispatcher(this); - - // start ticker - reset_tick(); - - // call election? - if (monmap->num_mon > 1) { - assert(monmap->num_mon != 2); - call_election(); - } else { - // we're standalone. 
- set q; - q.insert(whoami); - win_election(1, q); - } - - lock.Unlock(); -} - -void Monitor::shutdown() -{ - dout(1) << "shutdown" << dendl; - - elector.shutdown(); - - if (is_leader()) { - // stop osds. - for (set::iterator it = osdmon->osdmap.get_osds().begin(); - it != osdmon->osdmap.get_osds().end(); - it++) { - if (osdmon->osdmap.is_down(*it)) continue; - dout(10) << "sending shutdown to osd" << *it << dendl; - messenger->send_message(new MGenericMessage(MSG_SHUTDOWN), - osdmon->osdmap.get_inst(*it)); - } - osdmon->mark_all_down(); - - // monitors too. - for (int i=0; inum_mon; i++) - if (i != whoami) - messenger->send_message(new MGenericMessage(MSG_SHUTDOWN), - monmap->get_inst(i)); - } - - // cancel all events - cancel_tick(); - timer.cancel_all(); - timer.join(); - - // unmount my local storage - if (store) - delete store; - - // clean up - if (osdmon) delete osdmon; - if (mdsmon) delete mdsmon; - if (clientmon) delete clientmon; - if (pgmon) delete pgmon; - - // die. - messenger->shutdown(); -} - - -void Monitor::call_election() -{ - if (monmap->num_mon == 1) return; - - dout(10) << "call_election" << dendl; - state = STATE_STARTING; - - // tell paxos - paxos_test.election_starting(); - paxos_mdsmap.election_starting(); - paxos_osdmap.election_starting(); - paxos_clientmap.election_starting(); - - // call a new election - elector.call_election(); -} - -void Monitor::win_election(epoch_t epoch, set& active) -{ - state = STATE_LEADER; - leader = whoami; - mon_epoch = epoch; - quorum = active; - dout(10) << "win_election, epoch " << mon_epoch << " quorum is " << quorum << dendl; - - // init paxos - paxos_test.leader_init(); - paxos_mdsmap.leader_init(); - paxos_osdmap.leader_init(); - paxos_clientmap.leader_init(); - paxos_pgmap.leader_init(); - - // init - osdmon->election_finished(); - mdsmon->election_finished(); - clientmon->election_finished(); - pgmon->election_finished(); -} - -void Monitor::lose_election(epoch_t epoch, int l) -{ - state = 
STATE_PEON; - mon_epoch = epoch; - leader = l; - dout(10) << "lose_election, epoch " << mon_epoch << " leader is mon" << leader << dendl; - - // init paxos - paxos_test.peon_init(); - paxos_mdsmap.peon_init(); - paxos_osdmap.peon_init(); - paxos_clientmap.peon_init(); - paxos_pgmap.peon_init(); - - // init - osdmon->election_finished(); - mdsmon->election_finished(); - clientmon->election_finished(); - pgmon->election_finished(); -} - - -void Monitor::handle_command(MMonCommand *m) -{ - dout(0) << "handle_command " << *m << dendl; - - int r = -1; - string rs = "unrecognized command"; - - if (!m->cmd.empty()) { - if (m->cmd[0] == "stop") { - r = 0; - rs = "stopping"; - do_stop(); - } - else if (m->cmd[0] == "mds") { - mdsmon->dispatch(m); - return; - } - else if (m->cmd[0] == "osd") { - - } - } - - // reply - messenger->send_message(new MMonCommandAck(r, rs), m->get_source_inst()); - delete m; -} - - -void Monitor::do_stop() -{ - dout(0) << "do_stop -- shutting down" << dendl; - stopping = true; - mdsmon->do_stop(); -} - - -void Monitor::dispatch(Message *m) -{ - lock.Lock(); - { - switch (m->get_type()) { - - // misc - case MSG_PING_ACK: - handle_ping_ack((MPingAck*)m); - break; - - case MSG_SHUTDOWN: - if (m->get_source().is_osd()) - osdmon->dispatch(m); - else - handle_shutdown(m); - break; - - case MSG_MON_COMMAND: - handle_command((MMonCommand*)m); - break; - - - // OSDs - case MSG_OSD_GETMAP: - case MSG_OSD_FAILURE: - case MSG_OSD_BOOT: - case MSG_OSD_IN: - case MSG_OSD_OUT: - osdmon->dispatch(m); - break; - - - // MDSs - case MSG_MDS_BEACON: - case MSG_MDS_GETMAP: - mdsmon->dispatch(m); - break; - - // clients - case MSG_CLIENT_MOUNT: - case MSG_CLIENT_UNMOUNT: - clientmon->dispatch(m); - break; - - // pg - case MSG_STATFS: - case MSG_PGSTATS: - pgmon->dispatch(m); - break; - - - // paxos - case MSG_MON_PAXOS: - { - MMonPaxos *pm = (MMonPaxos*)m; - - // sanitize - if (pm->epoch > mon_epoch) - call_election(); - if (pm->epoch != mon_epoch) { - delete pm; - 
break; - } - - // send it to the right paxos instance - switch (pm->machine_id) { - case PAXOS_TEST: - paxos_test.dispatch(m); - break; - case PAXOS_OSDMAP: - paxos_osdmap.dispatch(m); - break; - case PAXOS_MDSMAP: - paxos_mdsmap.dispatch(m); - break; - case PAXOS_CLIENTMAP: - paxos_clientmap.dispatch(m); - break; - default: - assert(0); - } - } - break; - - // elector messages - case MSG_MON_ELECTION: - elector.dispatch(m); - break; - - - default: - dout(0) << "unknown message " << m << " " << *m << " from " << m->get_source_inst() << dendl; - assert(0); - } - } - lock.Unlock(); -} - - -void Monitor::handle_shutdown(Message *m) -{ - assert(m->get_source().is_mon()); - if (m->get_source().num() == get_leader()) { - dout(1) << "shutdown from leader " << m->get_source() << dendl; - shutdown(); - } else { - dout(1) << "ignoring shutdown from non-leader " << m->get_source() << dendl; - } - delete m; -} - -void Monitor::handle_ping_ack(MPingAck *m) -{ - // ... - - delete m; -} - - - - -/************ TICK ***************/ - -class C_Mon_Tick : public Context { - Monitor *mon; -public: - C_Mon_Tick(Monitor *m) : mon(m) {} - void finish(int r) { - mon->tick(); - } -}; - -void Monitor::cancel_tick() -{ - if (tick_timer) timer.cancel_event(tick_timer); -} - -void Monitor::reset_tick() -{ - cancel_tick(); - tick_timer = new C_Mon_Tick(this); - timer.add_event_after(g_conf.mon_tick_interval, tick_timer); -} - - -void Monitor::tick() -{ - tick_timer = 0; - - // ok go. - dout(11) << "tick" << dendl; - - osdmon->tick(); - mdsmon->tick(); - - // next tick! 
- reset_tick(); -} - - - - - - - diff --git a/branches/sage/mds/mon/Monitor.h b/branches/sage/mds/mon/Monitor.h deleted file mode 100644 index bd278a2092308..0000000000000 --- a/branches/sage/mds/mon/Monitor.h +++ /dev/null @@ -1,154 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MONITOR_H -#define __MONITOR_H - -#include "include/types.h" -#include "msg/Messenger.h" - -#include "common/Timer.h" - -#include "MonMap.h" -#include "Elector.h" -#include "Paxos.h" - - -class MonitorStore; -class OSDMonitor; -class MDSMonitor; -class ClientMonitor; -class PGMonitor; - -class Monitor : public Dispatcher { -public: - // me - int whoami; - Messenger *messenger; - Mutex lock; - - MonMap *monmap; - - // timer. 
- SafeTimer timer; - Context *tick_timer; - void cancel_tick(); - void reset_tick(); - friend class C_Mon_Tick; - - // -- local storage -- -public: - MonitorStore *store; - - // -- monitor state -- -private: - const static int STATE_STARTING = 0; // electing - const static int STATE_LEADER = 1; - const static int STATE_PEON = 2; - int state; - bool stopping; - -public: - bool is_starting() { return state == STATE_STARTING; } - bool is_leader() { return state == STATE_LEADER; } - bool is_peon() { return state == STATE_PEON; } - bool is_stopping() { return stopping; } - - - // -- elector -- -private: - Elector elector; - friend class Elector; - - epoch_t mon_epoch; // monitor epoch (election instance) - int leader; // current leader (to best of knowledge) - set quorum; // current active set of monitors (if !starting) - utime_t last_called_election; // [starting] last time i called an election - -public: - epoch_t get_epoch() { return mon_epoch; } - int get_leader() { return leader; } - const set& get_quorum() { return quorum; } - - void call_election(); // initiate election - void win_election(epoch_t epoch, set& q); // end election (called by Elector) - void lose_election(epoch_t epoch, int l); // end election (called by Elector) - - - // -- paxos -- - Paxos paxos_test; - Paxos paxos_mdsmap; - Paxos paxos_osdmap; - Paxos paxos_clientmap; - Paxos paxos_pgmap; - friend class Paxos; - - - // -- services -- - OSDMonitor *osdmon; - MDSMonitor *mdsmon; - ClientMonitor *clientmon; - PGMonitor *pgmon; - - friend class OSDMonitor; - friend class MDSMonitor; - friend class ClientMonitor; - friend class PGMonitor; - - - // messages - void handle_shutdown(Message *m); - void handle_ping_ack(class MPingAck *m); - void handle_command(class MMonCommand *m); - - - - public: - Monitor(int w, Messenger *m, MonMap *mm) : - whoami(w), - messenger(m), - monmap(mm), - timer(lock), tick_timer(0), - store(0), - - state(STATE_STARTING), stopping(false), - - elector(this, w), - mon_epoch(0), 
- leader(0), - - paxos_test(this, w, PAXOS_TEST), - paxos_mdsmap(this, w, PAXOS_MDSMAP), - paxos_osdmap(this, w, PAXOS_OSDMAP), - paxos_clientmap(this, w, PAXOS_CLIENTMAP), - paxos_pgmap(this, w, PAXOS_PGMAP), - - osdmon(0), mdsmon(0), clientmon(0) - { - } - ~Monitor() { - delete messenger; - } - - void init(); - void shutdown(); - void dispatch(Message *m); - void tick(); - - void do_stop(); - -}; - -#endif diff --git a/branches/sage/mds/mon/MonitorStore.cc b/branches/sage/mds/mon/MonitorStore.cc deleted file mode 100644 index 86df22bcd6590..0000000000000 --- a/branches/sage/mds/mon/MonitorStore.cc +++ /dev/null @@ -1,222 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "MonitorStore.h" -#include "common/Clock.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " store(" << dir <<") " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " store(" << dir <<") " - -#include -#include -#include -#include -#include - -void MonitorStore::mount() -{ - dout(1) << "mount" << dendl; - // verify dir exists - DIR *d = ::opendir(dir.c_str()); - if (!d) { - derr(1) << "basedir " << dir << " dne" << dendl; - assert(0); - } - ::closedir(d); - - if (g_conf.use_abspaths) { - // combine it with the cwd, in case fuse screws things up (i.e. 
fakefuse) - string old = dir; - char *cwd = get_current_dir_name(); - dir = cwd; - free(cwd); - dir += "/"; - dir += old; - } -} - - -void MonitorStore::mkfs() -{ - dout(1) << "mkfs" << dendl; - - char cmd[200]; - sprintf(cmd, "test -d %s && /bin/rm -r %s ; mkdir -p %s", dir.c_str(), dir.c_str(), dir.c_str()); - dout(1) << cmd << dendl; - system(cmd); -} - - -version_t MonitorStore::get_int(const char *a, const char *b) -{ - char fn[200]; - if (b) - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - else - sprintf(fn, "%s/%s", dir.c_str(), a); - - FILE *f = ::fopen(fn, "r"); - if (!f) - return 0; - - char buf[20]; - ::fgets(buf, 20, f); - ::fclose(f); - - version_t val = atoi(buf); - - if (b) { - dout(15) << "get_int " << a << "/" << b << " = " << val << dendl; - } else { - dout(15) << "get_int " << a << " = " << val << dendl; - } - return val; -} - - -void MonitorStore::put_int(version_t val, const char *a, const char *b) -{ - char fn[200]; - sprintf(fn, "%s/%s", dir.c_str(), a); - if (b) { - ::mkdir(fn, 0755); - dout(15) << "set_int " << a << "/" << b << " = " << val << dendl; - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - } else { - dout(15) << "set_int " << a << " = " << val << dendl; - } - - char vs[30]; -#ifdef __LP64__ - sprintf(vs, "%ld\n", val); -#else - sprintf(vs, "%lld\n", val); -#endif - - char tfn[200]; - sprintf(tfn, "%s.new", fn); - - int fd = ::open(tfn, O_WRONLY|O_CREAT, 0644); - assert(fd > 0); - ::write(fd, vs, strlen(vs)); - ::close(fd); - ::rename(tfn, fn); -} - - -// ---------------------------------------- -// buffers - -bool MonitorStore::exists_bl_ss(const char *a, const char *b) -{ - char fn[200]; - if (b) { - dout(15) << "exists_bl " << a << "/" << b << dendl; - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - } else { - dout(15) << "exists_bl " << a << dendl; - sprintf(fn, "%s/%s", dir.c_str(), a); - } - - struct stat st; - int r = ::stat(fn, &st); - //dout(15) << "exists_bl stat " << fn << " r=" << r << " errno " << errno << " " << 
strerror(errno) << dendl; - return r == 0; -} - - -int MonitorStore::get_bl_ss(bufferlist& bl, const char *a, const char *b) -{ - char fn[200]; - if (b) { - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - } else { - sprintf(fn, "%s/%s", dir.c_str(), a); - } - - int fd = ::open(fn, O_RDONLY); - if (!fd) { - if (b) { - dout(15) << "get_bl " << a << "/" << b << " DNE" << dendl; - } else { - dout(15) << "get_bl " << a << " DNE" << dendl; - } - return 0; - } - - // get size - struct stat st; - int rc = ::fstat(fd, &st); - assert(rc == 0); - __int32_t len = st.st_size; - - // read buffer - bl.clear(); - bufferptr bp(len); - int off = 0; - while (off < len) { - dout(20) << "reading at off " << off << " of " << len << dendl; - int r = ::read(fd, bp.c_str()+off, len-off); - if (r < 0) derr(0) << "errno on read " << strerror(errno) << dendl; - assert(r>0); - off += r; - } - bl.append(bp); - ::close(fd); - - if (b) { - dout(15) << "get_bl " << a << "/" << b << " = " << bl.length() << " bytes" << dendl; - } else { - dout(15) << "get_bl " << a << " = " << bl.length() << " bytes" << dendl; - } - - return len; -} - -int MonitorStore::put_bl_ss(bufferlist& bl, const char *a, const char *b) -{ - char fn[200]; - sprintf(fn, "%s/%s", dir.c_str(), a); - if (b) { - ::mkdir(fn, 0755); - dout(15) << "put_bl " << a << "/" << b << " = " << bl.length() << " bytes" << dendl; - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - } else { - dout(15) << "put_bl " << a << " = " << bl.length() << " bytes" << dendl; - } - - char tfn[200]; - sprintf(tfn, "%s.new", fn); - int fd = ::open(tfn, O_WRONLY|O_CREAT, 0644); - assert(fd); - - // write data - for (list::const_iterator it = bl.buffers().begin(); - it != bl.buffers().end(); - it++) { - int r = ::write(fd, it->c_str(), it->length()); - if (r != (int)it->length()) - derr(0) << "put_bl_ss ::write() returned " << r << " not " << it->length() << dendl; - if (r < 0) - derr(0) << "put_bl_ss ::write() errored out, errno is " << strerror(errno) << dendl; - } - 
- ::fsync(fd); - ::close(fd); - ::rename(tfn, fn); - - return 0; -} diff --git a/branches/sage/mds/mon/MonitorStore.h b/branches/sage/mds/mon/MonitorStore.h deleted file mode 100644 index 485bf972551c4..0000000000000 --- a/branches/sage/mds/mon/MonitorStore.h +++ /dev/null @@ -1,82 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MON_MONITORSTORE_H -#define __MON_MONITORSTORE_H - -#include "include/types.h" -#include "include/buffer.h" - -#include - -class MonitorStore { - string dir; - -public: - MonitorStore(char *d) : dir(d) { - } - ~MonitorStore() { - } - - void mkfs(); // wipe - void mount(); - - // ints (stored as ascii) - version_t get_int(const char *a, const char *b=0); - void put_int(version_t v, const char *a, const char *b=0); - - // buffers - // ss and sn varieties. 
- bool exists_bl_ss(const char *a, const char *b=0); - int get_bl_ss(bufferlist& bl, const char *a, const char *b); - int put_bl_ss(bufferlist& bl, const char *a, const char *b); - bool exists_bl_sn(const char *a, version_t b) { - char bs[20]; -#ifdef __LP64__ - sprintf(bs, "%lu", b); -#else - sprintf(bs, "%llu", b); -#endif - return exists_bl_ss(a, bs); - } - int get_bl_sn(bufferlist& bl, const char *a, version_t b) { - char bs[20]; -#ifdef __LP64__ - sprintf(bs, "%lu", b); -#else - sprintf(bs, "%llu", b); -#endif - return get_bl_ss(bl, a, bs); - } - int put_bl_sn(bufferlist& bl, const char *a, version_t b) { - char bs[20]; -#ifdef __LP64__ - sprintf(bs, "%lu", b); -#else - sprintf(bs, "%llu", b); -#endif - return put_bl_ss(bl, a, bs); - } - - /* - version_t get_incarnation() { return get_int("incarnation"); } - void set_incarnation(version_t i) { set_int(i, "incarnation"); } - - version_t get_last_proposal() { return get_int("last_proposal"); } - void set_last_proposal(version_t i) { set_int(i, "last_proposal"); } - */ -}; - - -#endif diff --git a/branches/sage/mds/mon/OSDMonitor.cc b/branches/sage/mds/mon/OSDMonitor.cc deleted file mode 100644 index 200187510f698..0000000000000 --- a/branches/sage/mds/mon/OSDMonitor.cc +++ /dev/null @@ -1,829 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "OSDMonitor.h" -#include "Monitor.h" -#include "MDSMonitor.h" - -#include "MonitorStore.h" - -#include "messages/MOSDFailure.h" -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" -#include "messages/MOSDBoot.h" -#include "messages/MOSDIn.h" -#include "messages/MOSDOut.h" - -#include "messages/MMonOSDMapInfo.h" -#include "messages/MMonOSDMapLease.h" -#include "messages/MMonOSDMapLeaseAck.h" -#include "messages/MMonOSDMapUpdatePrepare.h" -#include "messages/MMonOSDMapUpdateAck.h" -#include "messages/MMonOSDMapUpdateCommit.h" - -#include "common/Timer.h" - -#include "config.h" - - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".osd e" << osdmap.get_epoch() << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? 
(const char*)"(peon)":(const char*)"(?\?)"))) << ".osd e" << osdmap.get_epoch() << " " - - -// FAKING - -class C_Mon_FakeOSDFailure : public Context { - OSDMonitor *mon; - int osd; - bool down; -public: - C_Mon_FakeOSDFailure(OSDMonitor *m, int o, bool d) : mon(m), osd(o), down(d) {} - void finish(int r) { - mon->fake_osd_failure(osd,down); - } -}; - -void OSDMonitor::fake_osd_failure(int osd, bool down) -{ - if (down) { - dout(1) << "fake_osd_failure DOWN osd" << osd << dendl; - pending_inc.new_down[osd].first = osdmap.osd_inst[osd]; - pending_inc.new_down[osd].second = false; - } else { - dout(1) << "fake_osd_failure OUT osd" << osd << dendl; - pending_inc.new_out.push_back(osd); - } - propose_pending(); - - // fixme - //bcast_latest_osd(); - //bcast_latest_mds(); -} - -void OSDMonitor::fake_osdmap_update() -{ - dout(1) << "fake_osdmap_update" << dendl; - propose_pending(); - - // tell a random osd - int osd = rand() % g_conf.num_osd; - send_latest(osdmap.get_inst(osd)); -} - - -void OSDMonitor::fake_reorg() -{ - int r = rand() % g_conf.num_osd; - - if (osdmap.is_out(r)) { - dout(1) << "fake_reorg marking osd" << r << " in" << dendl; - pending_inc.new_in.push_back(r); - } else { - dout(1) << "fake_reorg marking osd" << r << " out" << dendl; - pending_inc.new_out.push_back(r); - } - - propose_pending(); - send_latest(osdmap.get_inst(r)); // after -} - - - -/************ MAPS ****************/ - -void OSDMonitor::create_initial() -{ - assert(mon->is_leader()); - assert(paxos->get_version() == 0); - - dout(1) << "create_initial -- creating initial osdmap from g_conf" << dendl; - - // - OSDMap newmap; - newmap.mon_epoch = mon->mon_epoch; - newmap.ctime = g_clock.now(); - - newmap.set_pg_num(g_conf.num_osd << g_conf.osd_pg_bits); - - // start at epoch 1 until all osds boot - newmap.inc_epoch(); // = 1 - assert(newmap.get_epoch() == 1); - - map weights; - build_crush_map(newmap.crush, weights); - - for (int i=0; i - - // fake osd failures - for (map::iterator i = 
g_fake_osd_down.begin(); - i != g_fake_osd_down.end(); - i++) { - dout(0) << "will fake osd" << i->first << " DOWN after " << i->second << dendl; - mon->timer.add_event_after(i->second, new C_Mon_FakeOSDFailure(this, i->first, 1)); - } - for (map::iterator i = g_fake_osd_out.begin(); - i != g_fake_osd_out.end(); - i++) { - dout(0) << "will fake osd" << i->first << " OUT after " << i->second << dendl; - mon->timer.add_event_after(i->second, new C_Mon_FakeOSDFailure(this, i->first, 0)); - } - - // encode into pending incremental - newmap.encode(pending_inc.fullmap); -} - - -void OSDMonitor::build_crush_map(Crush& crush, - map& weights) -{ - - if (g_conf.num_osd >= 12) { - int ndom = g_conf.osd_max_rep; - UniformBucket *domain[ndom]; - int domid[ndom]; - for (int i=0; iadd_item(i, weights[i] ? weights[i]:1.0); - //derr(0) << "osd" << i << " in domain " << dom << dendl; - i++; - if (i == g_conf.num_osd) break; - } - } - - // root - Bucket *root = new ListBucket(2); - for (int i=0; iget_weight() << dendl; - root->add_item(domid[i], domain[i]->get_weight()); - } - int nroot = crush.add_bucket(root); - - // rules - // replication - for (int i=1; i<=ndom; i++) { - int r = CRUSH_REP_RULE(i); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, nroot)); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 1)); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 1, 0)); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - } - // raid - for (int i=g_conf.osd_min_raid_width; i <= g_conf.osd_max_raid_width; i++) { - int r = CRUSH_RAID_RULE(i); - if (ndom >= i) { - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, nroot)); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE_INDEP, i, 1)); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE_INDEP, 1, 0)); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - } else { - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, nroot)); - 
crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE_INDEP, i, 0)); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - } - } - - // test - //vector out; - //pg_to_osds(0x40200000110ULL, out); - - } else { - // one bucket - Bucket *b = new UniformBucket(1, 0); - int root = crush.add_bucket(b); - for (int i=0; iadd_item(i, weights[i] ? weights[i]:1.0); - } - - // rules - // replication - for (int i=1; i<=g_conf.osd_max_rep; i++) { - int r = CRUSH_REP_RULE(i); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 0)); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - } - // raid - for (int i=g_conf.osd_min_raid_width; i <= g_conf.osd_max_raid_width; i++) { - int r = CRUSH_RAID_RULE(i); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE_INDEP, i, 0)); - crush.rules[r].steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - } - } -} - - -bool OSDMonitor::update_from_paxos() -{ - assert(paxos->is_active()); - - version_t paxosv = paxos->get_version(); - if (paxosv == osdmap.epoch) return true; - assert(paxosv >= osdmap.epoch); - - dout(15) << "update_from_paxos paxos e " << paxosv - << ", my e " << osdmap.epoch << dendl; - - if (osdmap.epoch == 0 && paxosv > 1) { - // startup: just load latest full map - epoch_t lastfull = mon->store->get_int("osdmap_full","last_epoch"); - if (lastfull) { - dout(7) << "update_from_paxos startup: loading latest full map e" << lastfull << dendl; - bufferlist bl; - mon->store->get_bl_sn(bl, "osdmap_full", lastfull); - osdmap.decode(bl); - } - } - - // walk through incrementals - while (paxosv > osdmap.epoch) { - bufferlist bl; - bool success = paxos->read(osdmap.epoch+1, bl); - assert(success); - - dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1 << dendl; - OSDMap::Incremental inc; - int off = 0; - inc.decode(bl, off); - 
osdmap.apply_incremental(inc); - - // write out the full map, too. - bl.clear(); - osdmap.encode(bl); - mon->store->put_bl_sn(bl, "osdmap_full", osdmap.epoch); - - // share - dout(1) << osdmap.osds.size() << " osds, " - << osdmap.down_osds.size() << " down, " - << osdmap.out_osds.size() << " out" - << dendl; - } - mon->store->put_int(osdmap.epoch, "osdmap_full","last_epoch"); - - // new map! - bcast_latest_mds(); - send_to_waiting(); - - return true; -} - - -void OSDMonitor::create_pending() -{ - pending_inc = OSDMap::Incremental(osdmap.epoch+1); - dout(10) << "create_pending e " << pending_inc.epoch - << dendl; -} - -void OSDMonitor::encode_pending(bufferlist &bl) -{ - dout(10) << "encode_pending e " << pending_inc.epoch - << dendl; - - // finish up pending_inc - pending_inc.ctime = g_clock.now(); - pending_inc.mon_epoch = mon->mon_epoch; - - // tell me about it - for (map >::iterator i = pending_inc.new_down.begin(); - i != pending_inc.new_down.end(); - i++) { - dout(2) << " osd" << i->first << " DOWN " << i->second.first << " clean=" << i->second.second << dendl; - derr(0) << " osd" << i->first << " DOWN " << i->second.first << " clean=" << i->second.second << dendl; - mon->messenger->mark_down(i->second.first.addr); - } - for (map::iterator i = pending_inc.new_up.begin(); - i != pending_inc.new_up.end(); - i++) { - dout(2) << " osd" << i->first << " UP " << i->second << dendl; - derr(0) << " osd" << i->first << " UP " << i->second << dendl; - } - for (list::iterator i = pending_inc.new_out.begin(); - i != pending_inc.new_out.end(); - i++) { - dout(2) << " osd" << *i << " OUT" << dendl; - derr(0) << " osd" << *i << " OUT" << dendl; - } - for (list::iterator i = pending_inc.new_in.begin(); - i != pending_inc.new_in.end(); - i++) { - dout(2) << " osd" << *i << " IN" << dendl; - derr(0) << " osd" << *i << " IN" << dendl; - } - - // encode - assert(paxos->get_version() + 1 == pending_inc.epoch); - pending_inc.encode(bl); -} - - -void OSDMonitor::committed() -{ - -} 
- - -// ------------- - -bool OSDMonitor::preprocess_query(Message *m) -{ - dout(10) << "preprocess_query " << *m << " from " << m->get_source_inst() << dendl; - - switch (m->get_type()) { - // READs - case MSG_OSD_GETMAP: - handle_osd_getmap((MOSDGetMap*)m); - return true; - - // damp updates - case MSG_OSD_FAILURE: - return preprocess_failure((MOSDFailure*)m); - case MSG_OSD_BOOT: - return preprocess_boot((MOSDBoot*)m); - /* - case MSG_OSD_IN: - return preprocess_in((MOSDIn*)m); - case MSG_OSD_OUT: - return preprocess_out((MOSDOut*)m); - */ - - default: - assert(0); - delete m; - return true; - } -} - -bool OSDMonitor::prepare_update(Message *m) -{ - dout(7) << "prepare_update " << *m << " from " << m->get_source_inst() << dendl; - - switch (m->get_type()) { - // damp updates - case MSG_OSD_FAILURE: - return prepare_failure((MOSDFailure*)m); - case MSG_OSD_BOOT: - return prepare_boot((MOSDBoot*)m); - - /* - case MSG_OSD_IN: - return prepare_in((MOSDIn*)m); - case MSG_OSD_OUT: - return prepare_out((MOSDOut*)m); - */ - - default: - assert(0); - delete m; - } - - return false; -} - -bool OSDMonitor::should_propose(double& delay) -{ - if (osdmap.epoch == 1) { - if (pending_inc.new_up.size() == osdmap.get_osds().size()) { - delay = 0.0; - if (g_conf.osd_auto_weight) { - Crush crush; - build_crush_map(crush, osd_weight); - crush._encode(pending_inc.crush); - } - return true; - } else - return false; - } - return PaxosService::should_propose(delay); -} - - - -// --------------------------- -// READs - -void OSDMonitor::handle_osd_getmap(MOSDGetMap *m) -{ - dout(7) << "handle_osd_getmap from " << m->get_source() << " from " << m->get_start_epoch() << dendl; - - if (m->get_start_epoch()) { - if (m->get_want_epoch() <= osdmap.get_epoch()) - send_incremental(m->get_source_inst(), m->get_start_epoch()); - else - waiting_for_map[m->get_source_inst()] = pair(m->get_start_epoch(), - m->get_want_epoch()); - } else - send_full(m->get_source_inst()); - - delete m; -} - - - -// 
--------------------------- -// UPDATEs - -// failure -- - -bool OSDMonitor::preprocess_failure(MOSDFailure *m) -{ - int badboy = m->get_failed().name.num(); - - // weird? - if (!osdmap.have_inst(badboy)) { - dout(5) << "preprocess_failure dne(/dup?): " << m->get_failed() << ", from " << m->get_from() << dendl; - send_incremental(m->get_from(), m->get_epoch()+1); - return true; - } - if (osdmap.get_inst(badboy) != m->get_failed()) { - dout(5) << "preprocess_failure wrong osd: report " << m->get_failed() << " != map's " << osdmap.get_inst(badboy) - << ", from " << m->get_from() << dendl; - send_incremental(m->get_from(), m->get_epoch()+1); - return true; - } - // already reported? - if (osdmap.is_down(badboy)) { - dout(5) << "preprocess_failure dup: " << m->get_failed() << ", from " << m->get_from() << dendl; - send_incremental(m->get_from(), m->get_epoch()+1); - return true; - } - - dout(10) << "preprocess_failure new: " << m->get_failed() << ", from " << m->get_from() << dendl; - return false; -} - -bool OSDMonitor::prepare_failure(MOSDFailure *m) -{ - dout(1) << "prepare_failure " << m->get_failed() << " from " << m->get_from() << dendl; - - // FIXME - // take their word for it - int badboy = m->get_failed().name.num(); - assert(osdmap.is_up(badboy)); - assert(osdmap.osd_inst[badboy] == m->get_failed()); - - pending_inc.new_down[badboy].first = m->get_failed(); - pending_inc.new_down[badboy].second = false; - - if (osdmap.is_in(badboy)) - down_pending_out[badboy] = g_clock.now(); - - paxos->wait_for_commit(new C_Reported(this, m)); - - return true; -} - -void OSDMonitor::_reported_failure(MOSDFailure *m) -{ - dout(7) << "_reported_failure on " << m->get_failed() << ", telling " << m->get_from() << dendl; - send_latest(m->get_from(), m->get_epoch()); -} - - -// boot -- - -bool OSDMonitor::preprocess_boot(MOSDBoot *m) -{ - assert(m->inst.name.is_osd()); - int from = m->inst.name.num(); - - // already booted? 
- if (osdmap.is_up(from) && - osdmap.get_inst(from) == m->inst) { - // yup. - dout(7) << "preprocess_boot dup from " << m->inst << dendl; - _booted(m); - return true; - } - - dout(10) << "preprocess_boot from " << m->inst << dendl; - return false; -} - -bool OSDMonitor::prepare_boot(MOSDBoot *m) -{ - dout(7) << "prepare_boot from " << m->inst << dendl; - assert(m->inst.name.is_osd()); - int from = m->inst.name.num(); - - // does this osd exist? - if (!osdmap.exists(from)) { - dout(1) << "boot from non-existent osd" << from << dendl; - delete m; - return true; - } - - // already up? mark down first? - if (osdmap.is_up(from)) { - dout(7) << "prepare_boot was up, first marking down " << osdmap.get_inst(from) << dendl; - assert(osdmap.get_inst(from) != m->inst); // preproces should have caught it - - // mark previous guy down - pending_inc.new_down[from].first = osdmap.osd_inst[from]; - pending_inc.new_down[from].second = false; - - paxos->wait_for_commit(new C_RetryMessage(this, m)); - } else { - // mark new guy up. - down_pending_out.erase(from); // if any - pending_inc.new_up[from] = m->inst; - - // mark in? 
- if (osdmap.out_osds.count(from)) - pending_inc.new_in.push_back(from); - - osd_weight[from] = m->sb.weight; - - // wait - paxos->wait_for_commit(new C_Booted(this, m)); - } - return true; -} - -void OSDMonitor::_booted(MOSDBoot *m) -{ - dout(7) << "_booted " << m->inst << " w " << m->sb.weight << dendl; - send_latest(m->inst, m->sb.current_epoch); - delete m; -} - - - - - -// --------------- -// map helpers - -void OSDMonitor::send_to_waiting() -{ - dout(10) << "send_to_waiting " << osdmap.get_epoch() << dendl; - - map >::iterator i = waiting_for_map.begin(); - while (i != waiting_for_map.end()) { - if (i->second.first) { - if (i->second.second <= osdmap.get_epoch()) - send_incremental(i->first, i->second.first); - else { - dout(10) << "send_to_waiting skipping " << i->first - << " has " << i->second.first - << " wants " << i->second.second - << dendl; - i++; - continue; - } - } else - send_full(i->first); - - waiting_for_map.erase(i++); - } -} - -void OSDMonitor::send_latest(entity_inst_t who, epoch_t start) -{ - if (paxos->is_readable()) { - dout(5) << "send_latest to " << who << " now" << dendl; - if (start == 0) - send_full(who); - else - send_incremental(who, start); - } else { - dout(5) << "send_latest to " << who << " later" << dendl; - waiting_for_map[who] = pair(start, 0); - } -} - - -void OSDMonitor::send_full(entity_inst_t who) -{ - dout(5) << "send_full to " << who << dendl; - mon->messenger->send_message(new MOSDMap(&osdmap), who); -} - -void OSDMonitor::send_incremental(entity_inst_t dest, epoch_t from) -{ - dout(5) << "send_incremental from " << from << " -> " << osdmap.get_epoch() - << " to " << dest << dendl; - - MOSDMap *m = new MOSDMap; - - for (epoch_t e = osdmap.get_epoch(); - e >= from; - e--) { - bufferlist bl; - if (mon->store->get_bl_sn(bl, "osdmap", e) > 0) { - dout(20) << "send_incremental inc " << e << " " << bl.length() << " bytes" << dendl; - m->incremental_maps[e] = bl; - } - else if (mon->store->get_bl_sn(bl, "osdmap_full", e) > 0) 
{ - dout(20) << "send_incremental full " << e << dendl; - m->maps[e] = bl; - } - else { - assert(0); // we should have all maps. - } - } - - mon->messenger->send_message(m, dest); -} - - -void OSDMonitor::bcast_latest_mds() -{ - epoch_t e = osdmap.get_epoch(); - dout(1) << "bcast_latest_mds epoch " << e << dendl; - - // tell mds - set up; - mon->mdsmon->mdsmap.get_up_mds_set(up); - for (set::iterator i = up.begin(); - i != up.end(); - i++) { - send_incremental(mon->mdsmon->mdsmap.get_inst(*i), osdmap.get_epoch()); - } -} - -void OSDMonitor::bcast_latest_osd() -{ - epoch_t e = osdmap.get_epoch(); - dout(1) << "bcast_latest_osd epoch " << e << dendl; - - // tell osds - set osds; - osdmap.get_all_osds(osds); - for (set::iterator it = osds.begin(); - it != osds.end(); - it++) { - if (osdmap.is_down(*it)) continue; - - send_incremental(osdmap.get_inst(*it), osdmap.get_epoch()); - } -} - -void OSDMonitor::bcast_full_osd() -{ - epoch_t e = osdmap.get_epoch(); - dout(1) << "bcast_full_osd epoch " << e << dendl; - - // tell osds - set osds; - osdmap.get_all_osds(osds); - for (set::iterator it = osds.begin(); - it != osds.end(); - it++) { - if (osdmap.is_down(*it)) continue; - send_full(osdmap.get_inst(*it)); - } -} - - -// TICK - - -void OSDMonitor::tick() -{ - // mark down osds out? 
- utime_t now = g_clock.now(); - list mark_out; - for (map::iterator i = down_pending_out.begin(); - i != down_pending_out.end(); - i++) { - utime_t down = now; - down -= i->second; - - if (down.sec() >= g_conf.mon_osd_down_out_interval) { - dout(10) << "tick marking osd" << i->first << " OUT after " << down << " sec" << dendl; - mark_out.push_back(i->first); - } - } - for (list::iterator i = mark_out.begin(); - i != mark_out.end(); - i++) { - down_pending_out.erase(*i); - pending_inc.new_out.push_back( *i ); - } - if (!mark_out.empty()) { - propose_pending(); - } -} - - - - - -/* -void OSDMonitor::init() -{ - // start with blank map - - // load my last state from the store - bufferlist bl; - if (get_map_bl(0, bl)) { // FIXME - // yay! - osdmap.decode(bl); - dout(1) << "init got epoch " << osdmap.get_epoch() << " from store" << dendl; - - // set up pending_inc - pending_inc.epoch = osdmap.get_epoch()+1; - } -} -*/ - - - - -void OSDMonitor::mark_all_down() -{ - assert(mon->is_leader()); - - dout(7) << "mark_all_down" << dendl; - - for (set::iterator it = osdmap.get_osds().begin(); - it != osdmap.get_osds().end(); - it++) { - if (osdmap.is_down(*it)) continue; - pending_inc.new_down[*it].first = osdmap.get_inst(*it); - pending_inc.new_down[*it].second = true; // FIXME: am i sure it's clean? we need a proper osd shutdown sequence! 
- } - - propose_pending(); -} - - - - - - - - - - - - - - - -/* - - -void OSDMonitor::election_finished() -{ - dout(10) << "election_finished" << dendl; - - if (mon->is_leader()) { - if (g_conf.mkfs) { - create_initial(); - save_map(); - } else { - // - epoch_t epoch = mon->store->get_int("osd_epoch"); - dout(10) << " last epoch was " << epoch << dendl; - bufferlist bl, blinc; - int r = mon->store->get_bl_sn(bl, "osdmap_full", epoch); - assert(r>0); - osdmap.decode(bl); - - // pending_inc - pending_inc.epoch = epoch+1; - } - - } - -} - - - -*/ diff --git a/branches/sage/mds/mon/OSDMonitor.h b/branches/sage/mds/mon/OSDMonitor.h deleted file mode 100644 index afdd625ae04aa..0000000000000 --- a/branches/sage/mds/mon/OSDMonitor.h +++ /dev/null @@ -1,131 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __OSDMONITOR_H -#define __OSDMONITOR_H - -#include -#include -using namespace std; - -#include "include/types.h" -#include "msg/Messenger.h" - -#include "osd/OSDMap.h" - -#include "PaxosService.h" - -class Monitor; -class MOSDBoot; - -class OSDMonitor : public PaxosService { -public: - OSDMap osdmap; - -private: - map > waiting_for_map; // who -> (has, wants) - - // [leader] - OSDMap::Incremental pending_inc; - map down_pending_out; // osd down -> out - - map osd_weight; - - void build_crush_map(Crush& crush, - map& weights); - - // svc - void create_initial(); - bool update_from_paxos(); - void create_pending(); // prepare a new pending - void encode_pending(bufferlist &bl); - - void committed(); - - void handle_query(Message *m); - bool preprocess_query(Message *m); // true if processed. 
- bool prepare_update(Message *m); - bool should_propose(double &delay); - - // ... - bool get_map_bl(epoch_t epoch, bufferlist &bl); - bool get_inc_map_bl(epoch_t epoch, bufferlist &bl); - - void send_to_waiting(); // send current map to waiters. - void send_full(entity_inst_t dest); - void send_incremental(entity_inst_t dest, epoch_t since); - void bcast_latest_mds(); - void bcast_latest_osd(); - void bcast_full_osd(); - - void handle_osd_getmap(class MOSDGetMap *m); - - bool preprocess_failure(class MOSDFailure *m); - bool prepare_failure(class MOSDFailure *m); - void _reported_failure(MOSDFailure *m); - - bool preprocess_boot(class MOSDBoot *m); - bool prepare_boot(class MOSDBoot *m); - void _booted(MOSDBoot *m); - - class C_Booted : public Context { - OSDMonitor *cmon; - MOSDBoot *m; - public: - C_Booted(OSDMonitor *cm, MOSDBoot *m_) : - cmon(cm), m(m_) {} - void finish(int r) { - if (r >= 0) - cmon->_booted(m); - else - cmon->dispatch((Message*)m); - } - }; - class C_Reported : public Context { - OSDMonitor *cmon; - MOSDFailure *m; - public: - C_Reported(OSDMonitor *cm, MOSDFailure *m_) : - cmon(cm), m(m_) {} - void finish(int r) { - if (r >= 0) - cmon->_reported_failure(m); - else - cmon->dispatch((Message*)m); - } - }; - - bool preprocess_in(class MOSDIn *m); - bool prepare_in(class MOSDIn *m); - - bool preprocess_out(class MOSDOut *m); - bool prepare_out(class MOSDOut *m); - - public: - OSDMonitor(Monitor *mn, Paxos *p) : - PaxosService(mn, p) { } - - void tick(); // check state, take actions - - void mark_all_down(); - - void send_latest(entity_inst_t i, epoch_t start=0); - - void fake_osd_failure(int osd, bool down); - void fake_osdmap_update(); - void fake_reorg(); -}; - -#endif diff --git a/branches/sage/mds/mon/PGMap.h b/branches/sage/mds/mon/PGMap.h deleted file mode 100644 index b915c28cbd755..0000000000000 --- a/branches/sage/mds/mon/PGMap.h +++ /dev/null @@ -1,103 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// 
vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __PGMAP_H -#define __PGMAP_H - -#include "osd/osd_types.h" - -class PGMap { -public: - // the map - version_t version; - hash_map pg_stat; - hash_map osd_stat; - - class Incremental { - public: - version_t version; - map pg_stat_updates; - map osd_stat_updates; - - void _encode(bufferlist &bl) { - ::_encode(version, bl); - ::_encode(pg_stat_updates, bl); - ::_encode(osd_stat_updates, bl); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(version, bl, off); - ::_decode(pg_stat_updates, bl, off); - ::_decode(osd_stat_updates, bl, off); - } - - Incremental() : version(0) {} - }; - - void apply_incremental(Incremental& inc) { - assert(inc.version == version+1); - version++; - for (map::iterator p = inc.pg_stat_updates.begin(); - p != inc.pg_stat_updates.end(); - ++p) { - if (pg_stat.count(p->first)) - stat_sub(pg_stat[p->first]); - pg_stat[p->first] = p->second; - stat_add(p->second); - } - } - - // aggregate stats (soft state) - hash_map num_pg_by_state; - int64_t num_pg; - int64_t total_size; - int64_t total_num_blocks; - - void stat_zero() { - num_pg = 0; - num_pg_by_state.clear(); - total_size = 0; - total_num_blocks = 0; - } - void stat_add(pg_stat_t &s) { - num_pg++; - num_pg_by_state[s.state]++; - total_size += s.size; - total_num_blocks += s.num_blocks; - } - void stat_sub(pg_stat_t &s) { - num_pg--; - num_pg_by_state[s.state]--; - total_size -= s.size; - total_num_blocks -= s.num_blocks; - } - - PGMap() : version(0), - num_pg(0), total_size(0), total_num_blocks(0) {} - - void _encode(bufferlist &bl) { - ::_encode(version, bl); - ::_encode(pg_stat, bl); - } - void _decode(bufferlist& bl, int& off) { - 
::_decode(version, bl, off); - ::_decode(pg_stat, bl, off); - stat_zero(); - for (hash_map::iterator p = pg_stat.begin(); - p != pg_stat.end(); - ++p) - stat_add(p->second); - } -}; - -#endif diff --git a/branches/sage/mds/mon/PGMonitor.cc b/branches/sage/mds/mon/PGMonitor.cc deleted file mode 100644 index 6e571fea7f612..0000000000000 --- a/branches/sage/mds/mon/PGMonitor.cc +++ /dev/null @@ -1,219 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "PGMonitor.h" -#include "Monitor.h" -#include "MDSMonitor.h" -#include "OSDMonitor.h" -#include "MonitorStore.h" - -#include "messages/MPGStats.h" - -#include "messages/MStatfs.h" -#include "messages/MStatfsReply.h" - -#include "common/Timer.h" - -#include "osd/osd_types.h" -#include "osd/PG.h" // yuck - -#include "config.h" -#include - - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".pg " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? 
(const char*)"(peon)":(const char*)"(?\?)"))) << ".pg " - - - -void PGMonitor::create_initial() -{ - dout(1) << "create_initial -- creating initial map" << dendl; -} - -bool PGMonitor::update_from_paxos() -{ - version_t paxosv = paxos->get_version(); - if (paxosv == pg_map.version) return true; - assert(paxosv >= pg_map.version); - - if (pg_map.version == 0 && paxosv > 1 && - mon->store->exists_bl_ss("pgmap","latest")) { - // starting up: load latest - dout(7) << "update_from_paxos startup: loading latest full pgmap" << dendl; - bufferlist bl; - mon->store->get_bl_ss(bl, "pgmap", "latest"); - int off = 0; - pg_map._decode(bl, off); - } - - // walk through incrementals - while (paxosv > pg_map.version) { - bufferlist bl; - bool success = paxos->read(pg_map.version+1, bl); - if (success) { - dout(7) << "update_from_paxos applying incremental " << pg_map.version+1 << dendl; - PGMap::Incremental inc; - int off = 0; - inc._decode(bl, off); - pg_map.apply_incremental(inc); - - std::stringstream ss; - for (hash_map::iterator p = pg_map.num_pg_by_state.begin(); - p != pg_map.num_pg_by_state.end(); - ++p) { - if (p != pg_map.num_pg_by_state.begin()) - ss << ", "; - ss << p->second << " " << PG::get_state_string(p->first) << "(" << p->first << ")"; - } - string states = ss.str(); - dout(0) << "v" << pg_map.version << " " << states << dendl; - - } else { - dout(7) << "update_from_paxos couldn't read incremental " << pg_map.version+1 << dendl; - return false; - } - } - - // save latest - bufferlist bl; - pg_map._encode(bl); - mon->store->put_bl_ss(bl, "pgmap", "latest"); - - return true; -} - -void PGMonitor::create_pending() -{ - pending_inc = PGMap::Incremental(); - pending_inc.version = pg_map.version + 1; - dout(10) << "create_pending v " << pending_inc.version << dendl; -} - -void PGMonitor::encode_pending(bufferlist &bl) -{ - assert(mon->is_leader()); - dout(10) << "encode_pending v " << pending_inc.version << dendl; - assert(paxos->get_version() + 1 == 
pending_inc.version); - pending_inc._encode(bl); -} - -bool PGMonitor::preprocess_query(Message *m) -{ - dout(10) << "preprocess_query " << *m << " from " << m->get_source_inst() << dendl; - - switch (m->get_type()) { - case MSG_STATFS: - handle_statfs((MStatfs*)m); - return true; - - case MSG_PGSTATS: - { - MPGStats *stats = (MPGStats*)m; - for (map::iterator p = stats->pg_stat.begin(); - p != stats->pg_stat.end(); - p++) { - if (pg_map.pg_stat.count(p->first) == 0 || - pg_map.pg_stat[p->first].reported < p->second.reported) - return false; - } - dout(10) << " message contains no new pg stats" << dendl; - return true; - } - - default: - assert(0); - delete m; - return true; - } -} - -bool PGMonitor::prepare_update(Message *m) -{ - dout(10) << "prepare_update " << *m << " from " << m->get_source_inst() << dendl; - switch (m->get_type()) { - case MSG_PGSTATS: - return handle_pg_stats((MPGStats*)m); - - default: - assert(0); - delete m; - return false; - } -} - -void PGMonitor::committed() -{ - -} - -void PGMonitor::handle_statfs(MStatfs *statfs) -{ - dout(10) << "handle_statfs " << *statfs << " from " << statfs->get_source() << dendl; - - // fill out stfs - MStatfsReply *reply = new MStatfsReply(statfs->tid); - memset(&reply->stfs, 0, sizeof(reply->stfs)); - reply->stfs.f_bsize = 1024; - reply->stfs.f_frsize = 1024; - reply->stfs.f_blocks = 1024 * 1024; //pg_map.total_num_blocks; - reply->stfs.f_bfree = 1024 * 1024; - reply->stfs.f_bavail = 1024 * 1024; - reply->stfs.f_files = 1024 * 1024; - reply->stfs.f_ffree = 1024 * 1024; - reply->stfs.f_favail = 1024 * 1024; - reply->stfs.f_namemax = 1024; - reply->stfs.f_flag = ST_NOATIME|ST_NODIRATIME; // for now. 
- - // reply - mon->messenger->send_message(reply, statfs->get_source_inst()); - delete statfs; -} - -bool PGMonitor::handle_pg_stats(MPGStats *stats) -{ - dout(10) << "handle_pg_stats " << *stats << " from " << stats->get_source() << dendl; - - for (map::iterator p = stats->pg_stat.begin(); - p != stats->pg_stat.end(); - p++) { - pg_t pgid = p->first; - if ((pg_map.pg_stat.count(pgid) && - pg_map.pg_stat[pgid].reported > p->second.reported)) { - dout(15) << " had " << pgid << " from " << pg_map.pg_stat[pgid].reported << dendl; - continue; - } - if (pending_inc.pg_stat_updates.count(pgid) && - pending_inc.pg_stat_updates[pgid].reported > p->second.reported) { - dout(15) << " had " << pgid << " from " << pending_inc.pg_stat_updates[pgid].reported - << " (pending)" << dendl; - continue; - } - - dout(15) << " got " << pgid << " reported at " << p->second.reported - << " state " << PG::get_state_string(p->second.state) - << dendl; - pending_inc.pg_stat_updates[pgid] = p->second; - - // we don't care about consistency; apply to live map. - if (pg_map.pg_stat.count(pgid)) - pg_map.stat_sub(pg_map.pg_stat[pgid]); - pg_map.pg_stat[pgid] = p->second; - pg_map.stat_add(pg_map.pg_stat[pgid]); - } - - delete stats; - return true; -} diff --git a/branches/sage/mds/mon/PGMonitor.h b/branches/sage/mds/mon/PGMonitor.h deleted file mode 100644 index 7b6d44f814fd2..0000000000000 --- a/branches/sage/mds/mon/PGMonitor.h +++ /dev/null @@ -1,58 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __PGMONITOR_H -#define __PGMONITOR_H - -#include -#include -using namespace std; - -#include "include/types.h" -#include "msg/Messenger.h" -#include "PaxosService.h" - -#include "PGMap.h" - -class MPGStats; -class MStatfs; - -class PGMonitor : public PaxosService { -public: - -private: - PGMap pg_map; - PGMap::Incremental pending_inc; - - void create_initial(); - bool update_from_paxos(); - void create_pending(); // prepare a new pending - void encode_pending(bufferlist &bl); // propose pending update to peers - - void committed(); - - bool preprocess_query(Message *m); // true if processed. - bool prepare_update(Message *m); - - void handle_statfs(MStatfs *statfs); - bool handle_pg_stats(MPGStats *stats); - - public: - PGMonitor(Monitor *mn, Paxos *p) : PaxosService(mn, p) { } - - //void tick(); // check state, take actions - -}; - -#endif diff --git a/branches/sage/mds/mon/Paxos.cc b/branches/sage/mds/mon/Paxos.cc deleted file mode 100644 index c1f4472059ff5..0000000000000 --- a/branches/sage/mds/mon/Paxos.cc +++ /dev/null @@ -1,784 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "Paxos.h" -#include "Monitor.h" -#include "MonitorStore.h" - -#include "messages/MMonPaxos.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_paxos) *_dout << dbeginl << g_clock.now() << " mon" << whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? 
(const char*)"(peon)":(const char*)"(?\?)"))) << ".paxos(" << machine_name << " " << get_statename(state) << " lc " << last_committed << ") " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_paxos) *_derr << dbeginl << g_clock.now() << " mon" << whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".paxos(" << machine_name << " " << get_statename(state) << " lc " << last_committed << ") " - - -void Paxos::init() -{ - // load paxos variables from stable storage - last_pn = mon->store->get_int(machine_name, "last_pn"); - accepted_pn = mon->store->get_int(machine_name, "accepted_pn"); - last_committed = mon->store->get_int(machine_name, "last_committed"); - - dout(10) << "init" << dendl; -} - -// --------------------------------- - -// PHASE 1 - -// leader -void Paxos::collect(version_t oldpn) -{ - // we're recoverying, it seems! - state = STATE_RECOVERING; - assert(mon->is_leader()); - - // reset the number of lasts received - uncommitted_v = 0; - uncommitted_pn = 0; - uncommitted_value.clear(); - - // look for uncommitted value - if (mon->store->exists_bl_sn(machine_name, last_committed+1)) { - uncommitted_v = last_committed+1; - uncommitted_pn = accepted_pn; - mon->store->get_bl_sn(uncommitted_value, machine_name, last_committed+1); - dout(10) << "learned uncommitted " << (last_committed+1) - << " (" << uncommitted_value.length() << " bytes) from myself" - << dendl; - } - - // pick new pn - accepted_pn = get_new_proposal_number(MAX(accepted_pn, oldpn)); - accepted_pn_from = last_committed; - num_last = 1; - dout(10) << "collect with pn " << accepted_pn << dendl; - - // send collect - for (set::const_iterator p = mon->get_quorum().begin(); - p != mon->get_quorum().end(); - ++p) { - if (*p == whoami) continue; - - MMonPaxos *collect = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COLLECT, machine_id); - collect->last_committed = last_committed; - 
collect->pn = accepted_pn; - mon->messenger->send_message(collect, mon->monmap->get_inst(*p)); - } - -} - - -// peon -void Paxos::handle_collect(MMonPaxos *collect) -{ - dout(10) << "handle_collect " << *collect << dendl; - - assert(mon->is_peon()); // mon epoch filter should catch strays - - // we're recoverying, it seems! - state = STATE_RECOVERING; - - // reply - MMonPaxos *last = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_LAST, machine_id); - last->last_committed = last_committed; - - // do we have an accepted but uncommitted value? - // (it'll be at last_committed+1) - bufferlist bl; - if (mon->store->exists_bl_sn(machine_name, last_committed+1)) { - mon->store->get_bl_sn(bl, machine_name, last_committed+1); - assert(bl.length() > 0); - dout(10) << " sharing our accepted but uncommitted value for " << last_committed+1 - << " (" << bl.length() << " bytes)" << dendl; - last->values[last_committed+1] = bl; - last->uncommitted_pn = accepted_pn; - } - - // can we accept this pn? - if (collect->pn > accepted_pn) { - // ok, accept it - accepted_pn = collect->pn; - accepted_pn_from = collect->pn_from; - dout(10) << "accepting pn " << accepted_pn << " from " << accepted_pn_from << dendl; - mon->store->put_int(accepted_pn, machine_name, "accepted_pn"); - } else { - // don't accept! 
- dout(10) << "NOT accepting pn " << collect->pn << " from " << collect->pn_from - << ", we already accepted " << accepted_pn << " from " << accepted_pn_from - << dendl; - } - last->pn = accepted_pn; - last->pn_from = accepted_pn_from; - - // and share whatever data we have - for (version_t v = collect->last_committed+1; - v <= last_committed; - v++) { - if (mon->store->exists_bl_sn(machine_name, v)) { - mon->store->get_bl_sn(last->values[v], machine_name, v); - dout(10) << " sharing " << v << " (" - << last->values[v].length() << " bytes)" << dendl; - } - } - - // send reply - mon->messenger->send_message(last, collect->get_source_inst()); - delete collect; -} - - -// leader -void Paxos::handle_last(MMonPaxos *last) -{ - dout(10) << "handle_last " << *last << dendl; - - if (!mon->is_leader()) { - dout(10) << "not leader, dropping" << dendl; - delete last; - return; - } - - // share committed values? - if (last->last_committed < last_committed) { - // share committed values - dout(10) << "sending commit to " << last->get_source() << dendl; - MMonPaxos *commit = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COMMIT, machine_id); - for (version_t v = last->last_committed+1; - v <= last_committed; - v++) { - mon->store->get_bl_sn(commit->values[v], machine_name, v); - dout(10) << " sharing " << v << " (" - << commit->values[v].length() << " bytes)" << dendl; - } - commit->last_committed = last_committed; - mon->messenger->send_message(commit, last->get_source_inst()); - } - - // did we receive a committed value? 
- if (last->last_committed > last_committed) { - for (version_t v = last_committed+1; - v <= last->last_committed; - v++) { - mon->store->put_bl_sn(last->values[v], machine_name, v); - dout(10) << "committing " << v << " " - << last->values[v].length() << " bytes" << dendl; - } - last_committed = last->last_committed; - mon->store->put_int(last_committed, machine_name, "last_committed"); - dout(10) << "last_committed now " << last_committed << dendl; - } - - // do they accept your pn? - if (last->pn > accepted_pn) { - // no, try again. - dout(10) << " they had a higher pn than us, picking a new one." << dendl; - collect(last->pn); - } else { - // yes, they accepted our pn. great. - num_last++; - dout(10) << " they accepted our pn, we now have " - << num_last << " peons" << dendl; - - // did this person send back an accepted but uncommitted value? - if (last->uncommitted_pn && - last->uncommitted_pn > uncommitted_pn) { - uncommitted_v = last->last_committed+1; - uncommitted_pn = last->uncommitted_pn; - uncommitted_value = last->values[uncommitted_v]; - dout(10) << "we learned an uncommitted value for " << uncommitted_v - << " pn " << uncommitted_pn - << " " << uncommitted_value.length() << " bytes" - << dendl; - } - - // is that everyone? - if (num_last == mon->get_quorum().size()) { - // almost... - state = STATE_ACTIVE; - - // did we learn an old value? - if (uncommitted_v == last_committed+1 && - uncommitted_value.length()) { - dout(10) << "that's everyone. begin on old learned value" << dendl; - begin(uncommitted_value); - } else { - // active! - dout(10) << "that's everyone. active!" 
<< dendl; - extend_lease(); - - // wake people up - finish_contexts(waiting_for_active); - finish_contexts(waiting_for_readable); - finish_contexts(waiting_for_writeable); - } - } - } - - delete last; -} - - -// leader -void Paxos::begin(bufferlist& v) -{ - dout(10) << "begin for " << last_committed+1 << " " - << v.length() << " bytes" - << dendl; - - assert(mon->is_leader()); - assert(is_active()); - state = STATE_UPDATING; - - // we must already have a majority for this to work. - assert(mon->get_quorum().size() == 1 || - num_last > (unsigned)mon->monmap->num_mon/2); - - // and no value, yet. - assert(new_value.length() == 0); - - // accept it ourselves - accepted.clear(); - accepted.insert(whoami); - new_value = v; - mon->store->put_bl_sn(new_value, machine_name, last_committed+1); - - if (mon->get_quorum().size() == 1) { - // we're alone, take it easy - commit(); - state = STATE_ACTIVE; - finish_contexts(waiting_for_active); - finish_contexts(waiting_for_commit); - finish_contexts(waiting_for_readable); - finish_contexts(waiting_for_writeable); - return; - } - - // ask others to accept it to! - for (set::const_iterator p = mon->get_quorum().begin(); - p != mon->get_quorum().end(); - ++p) { - if (*p == whoami) continue; - - dout(10) << " sending begin to mon" << *p << dendl; - MMonPaxos *begin = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_BEGIN, machine_id); - begin->values[last_committed+1] = new_value; - begin->last_committed = last_committed; - begin->pn = accepted_pn; - - mon->messenger->send_message(begin, mon->monmap->get_inst(*p)); - } - - // set timeout event - accept_timeout_event = new C_AcceptTimeout(this); - mon->timer.add_event_after(g_conf.mon_accept_timeout, accept_timeout_event); -} - -// peon -void Paxos::handle_begin(MMonPaxos *begin) -{ - dout(10) << "handle_begin " << *begin << dendl; - - // can we accept this? 
- if (begin->pn < accepted_pn) { - dout(10) << " we accepted a higher pn " << accepted_pn << ", ignoring" << dendl; - delete begin; - return; - } - assert(begin->pn == accepted_pn); - assert(begin->last_committed == last_committed); - - // set state. - state = STATE_UPDATING; - lease_expire = utime_t(); // cancel lease - - // yes. - version_t v = last_committed+1; - dout(10) << "accepting value for " << v << " pn " << accepted_pn << dendl; - mon->store->put_bl_sn(begin->values[v], machine_name, v); - - // reply - MMonPaxos *accept = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_ACCEPT, machine_id); - accept->pn = accepted_pn; - accept->last_committed = last_committed; - mon->messenger->send_message(accept, begin->get_source_inst()); - - delete begin; -} - -// leader -void Paxos::handle_accept(MMonPaxos *accept) -{ - dout(10) << "handle_accept " << *accept << dendl; - int from = accept->get_source().num(); - - if (accept->pn != accepted_pn) { - // we accepted a higher pn, from some other leader - dout(10) << " we accepted a higher pn " << accepted_pn << ", ignoring" << dendl; - delete accept; - return; - } - if (last_committed > 0 && - accept->last_committed < last_committed-1) { - dout(10) << " this is from an old round, ignoring" << dendl; - delete accept; - return; - } - assert(accept->last_committed == last_committed || // not committed - accept->last_committed == last_committed-1); // committed - - assert(state == STATE_UPDATING); - assert(accepted.count(from) == 0); - accepted.insert(from); - dout(10) << " now " << accepted << " have accepted" << dendl; - - // new majority? - if (accepted.size() == (unsigned)mon->monmap->num_mon/2+1) { - // yay, commit! - // note: this may happen before the lease is reextended (below) - dout(10) << " got majority, committing" << dendl; - commit(); - } - - // done? 
- if (accepted == mon->get_quorum()) { - dout(10) << " got quorum, done with update" << dendl; - // cancel timeout event - mon->timer.cancel_event(accept_timeout_event); - accept_timeout_event = 0; - - // yay! - state = STATE_ACTIVE; - extend_lease(); - - // wake people up - finish_contexts(waiting_for_active); - finish_contexts(waiting_for_commit); - finish_contexts(waiting_for_readable); - finish_contexts(waiting_for_writeable); - } -} - -void Paxos::accept_timeout() -{ - dout(5) << "accept timeout, calling fresh election" << dendl; - accept_timeout_event = 0; - assert(mon->is_leader()); - assert(is_updating()); - cancel_events(); - mon->call_election(); -} - -void Paxos::commit() -{ - dout(10) << "commit " << last_committed+1 << dendl; - - // commit locally - last_committed++; - mon->store->put_int(last_committed, machine_name, "last_committed"); - - // tell everyone - for (set::const_iterator p = mon->get_quorum().begin(); - p != mon->get_quorum().end(); - ++p) { - if (*p == whoami) continue; - - dout(10) << " sending commit to mon" << *p << dendl; - MMonPaxos *commit = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COMMIT, machine_id); - commit->values[last_committed] = new_value; - commit->pn = accepted_pn; - - mon->messenger->send_message(commit, mon->monmap->get_inst(*p)); - } - - // get ready for a new round. - new_value.clear(); -} - - -void Paxos::handle_commit(MMonPaxos *commit) -{ - dout(10) << "handle_commit on " << commit->last_committed << dendl; - - if (!mon->is_peon()) { - dout(10) << "not a peon, dropping" << dendl; - assert(0); - delete commit; - return; - } - - // commit locally. 
- for (map::iterator p = commit->values.begin(); - p != commit->values.end(); - ++p) { - assert(p->first == last_committed+1); - last_committed = p->first; - dout(10) << " storing " << last_committed << " (" << p->second.length() << " bytes)" << dendl; - mon->store->put_bl_sn(p->second, machine_name, last_committed); - } - mon->store->put_int(last_committed, machine_name, "last_committed"); - - delete commit; -} - -void Paxos::extend_lease() -{ - assert(mon->is_leader()); - assert(is_active()); - - lease_expire = g_clock.now(); - lease_expire += g_conf.mon_lease; - acked_lease.clear(); - acked_lease.insert(whoami); - - dout(7) << "extend_lease now+" << g_conf.mon_lease << " (" << lease_expire << ")" << dendl; - - // bcast - for (set::const_iterator p = mon->get_quorum().begin(); - p != mon->get_quorum().end(); - ++p) { - if (*p == whoami) continue; - MMonPaxos *lease = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_LEASE, machine_id); - lease->last_committed = last_committed; - lease->lease_expire = lease_expire; - mon->messenger->send_message(lease, mon->monmap->get_inst(*p)); - } - - // set timeout event. - // if old timeout is still in place, leave it. 
- if (!lease_ack_timeout_event) { - lease_ack_timeout_event = new C_LeaseAckTimeout(this); - mon->timer.add_event_after(g_conf.mon_lease_ack_timeout, lease_ack_timeout_event); - } - - // set renew event - lease_renew_event = new C_LeaseRenew(this); - utime_t at = lease_expire; - at -= g_conf.mon_lease; - at += g_conf.mon_lease_renew_interval; - mon->timer.add_event_at(at, lease_renew_event); -} - - -// peon -void Paxos::handle_lease(MMonPaxos *lease) -{ - // sanity - if (!mon->is_peon() || - last_committed != lease->last_committed) { - dout(10) << "handle_lease i'm not a peon, or they're not the leader, or the last_committed doesn't match, dropping" << dendl; - delete lease; - return; - } - - // extend lease - if (lease_expire < lease->lease_expire) - lease_expire = lease->lease_expire; - - state = STATE_ACTIVE; - - dout(10) << "handle_lease on " << lease->last_committed - << " now " << lease_expire << dendl; - - // ack - MMonPaxos *ack = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_LEASE_ACK, machine_id); - ack->last_committed = last_committed; - ack->lease_expire = lease_expire; - mon->messenger->send_message(ack, lease->get_source_inst()); - - // (re)set timeout event. - if (lease_timeout_event) - mon->timer.cancel_event(lease_timeout_event); - lease_timeout_event = new C_LeaseTimeout(this); - mon->timer.add_event_after(g_conf.mon_lease_ack_timeout, lease_timeout_event); - - // kick waiters - finish_contexts(waiting_for_active); - if (is_readable()) - finish_contexts(waiting_for_readable); - - delete lease; -} - -void Paxos::handle_lease_ack(MMonPaxos *ack) -{ - int from = ack->get_source().num(); - - if (!lease_ack_timeout_event) { - dout(10) << "handle_lease_ack from " << ack->get_source() << " -- stray (probably since revoked)" << dendl; - } - else if (acked_lease.count(from) == 0) { - acked_lease.insert(from); - - if (acked_lease == mon->get_quorum()) { - // yay! 
- dout(10) << "handle_lease_ack from " << ack->get_source() - << " -- got everyone" << dendl; - mon->timer.cancel_event(lease_ack_timeout_event); - lease_ack_timeout_event = 0; - } else { - dout(10) << "handle_lease_ack from " << ack->get_source() - << " -- still need " - << mon->get_quorum().size() - acked_lease.size() - << " more" << dendl; - } - } else { - dout(10) << "handle_lease_ack from " << ack->get_source() - << " dup (lagging!), ignoring" << dendl; - } - - delete ack; -} - -void Paxos::lease_ack_timeout() -{ - dout(5) << "lease_ack_timeout -- calling new election" << dendl; - assert(mon->is_leader()); - assert(is_active()); - - lease_ack_timeout_event = 0; - cancel_events(); - mon->call_election(); -} - -void Paxos::lease_timeout() -{ - dout(5) << "lease_timeout -- calling new election" << dendl; - assert(mon->is_peon()); - - lease_timeout_event = 0; - cancel_events(); - mon->call_election(); -} - -void Paxos::lease_renew_timeout() -{ - lease_renew_event = 0; - extend_lease(); -} - - -/* - * return a globally unique, monotonically increasing proposal number - */ -version_t Paxos::get_new_proposal_number(version_t gt) -{ - if (last_pn < gt) - last_pn = gt; - - // update. make it unique among all monitors. 
- last_pn /= 100; - last_pn++; - last_pn *= 100; - last_pn += (version_t)whoami; - - // write - mon->store->put_int(last_pn, machine_name, "last_pn"); - - dout(10) << "get_new_proposal_number = " << last_pn << dendl; - return last_pn; -} - - -void Paxos::cancel_events() -{ - if (accept_timeout_event) { - mon->timer.cancel_event(accept_timeout_event); - accept_timeout_event = 0; - } - if (lease_renew_event) { - mon->timer.cancel_event(lease_renew_event); - lease_renew_event = 0; - } - if (lease_ack_timeout_event) { - mon->timer.cancel_event(lease_ack_timeout_event); - lease_ack_timeout_event = 0; - } - if (lease_timeout_event) { - mon->timer.cancel_event(lease_timeout_event); - lease_timeout_event = 0; - } -} - -void Paxos::leader_init() -{ - if (mon->get_quorum().size() == 1) { - state = STATE_ACTIVE; - return; - } - cancel_events(); - state = STATE_RECOVERING; - lease_expire = utime_t(); - dout(10) << "leader_init -- starting paxos recovery" << dendl; - collect(0); -} - -void Paxos::peon_init() -{ - cancel_events(); - state = STATE_RECOVERING; - lease_expire = utime_t(); - dout(10) << "peon_init -- i am a peon" << dendl; - - // no chance to write now! - finish_contexts(waiting_for_writeable, -1); - finish_contexts(waiting_for_commit, -1); -} - -void Paxos::election_starting() -{ - dout(10) << "election_starting -- canceling timeouts" << dendl; - cancel_events(); - new_value.clear(); - - finish_contexts(waiting_for_commit, -1); -} - - -void Paxos::dispatch(Message *m) -{ - // election in progress? 
- if (mon->is_starting()) { - dout(5) << "election in progress, dropping " << *m << dendl; - delete m; - return; - } - - // check sanity - assert(mon->is_leader() || - (mon->is_peon() && m->get_source().num() == mon->get_leader())); - - switch (m->get_type()) { - - case MSG_MON_PAXOS: - { - MMonPaxos *pm = (MMonPaxos*)m; - - // NOTE: these ops are defined in messages/MMonPaxos.h - switch (pm->op) { - // learner - case MMonPaxos::OP_COLLECT: - handle_collect(pm); - break; - case MMonPaxos::OP_LAST: - handle_last(pm); - break; - case MMonPaxos::OP_BEGIN: - handle_begin(pm); - break; - case MMonPaxos::OP_ACCEPT: - handle_accept(pm); - break; - case MMonPaxos::OP_COMMIT: - handle_commit(pm); - break; - case MMonPaxos::OP_LEASE: - handle_lease(pm); - break; - case MMonPaxos::OP_LEASE_ACK: - handle_lease_ack(pm); - break; - default: - assert(0); - } - } - break; - - default: - assert(0); - } -} - - - - -// ----------------- -// service interface - -// -- READ -- - -bool Paxos::is_readable() -{ - //dout(15) << "is_readable now=" << g_clock.now() << " lease_expire=" << lease_expire << dendl; - return - (mon->is_peon() || mon->is_leader()) && - is_active() && - last_committed > 0 && // must have a value - (mon->get_quorum().size() == 1 || // alone, or - g_clock.now() < lease_expire); // have lease -} - -bool Paxos::read(version_t v, bufferlist &bl) -{ - if (!is_readable()) - return false; - - if (!mon->store->get_bl_sn(bl, machine_name, v)) - return false; - return true; -} - -version_t Paxos::read_current(bufferlist &bl) -{ - if (!is_readable()) - return 0; - if (read(last_committed, bl)) - return last_committed; - return 0; -} - - - - -// -- WRITE -- - -bool Paxos::is_writeable() -{ - if (mon->get_quorum().size() == 1) return true; - return - mon->is_leader() && - is_active() && - g_clock.now() < lease_expire; -} - -bool Paxos::propose_new_value(bufferlist& bl, Context *oncommit) -{ - /* - // writeable? 
- if (!is_writeable()) { - dout(5) << "propose_new_value " << last_committed+1 << " " << bl.length() << " bytes" - << " -- not writeable" << dendl; - if (oncommit) { - oncommit->finish(-1); - delete oncommit; - } - return false; - } - */ - - assert(mon->is_leader() && is_active()); - - // cancel lease renewal and timeout events. - cancel_events(); - - // ok! - dout(5) << "propose_new_value " << last_committed+1 << " " << bl.length() << " bytes" << dendl; - if (oncommit) - waiting_for_commit.push_back(oncommit); - begin(bl); - - return true; -} - diff --git a/branches/sage/mds/mon/Paxos.h b/branches/sage/mds/mon/Paxos.h deleted file mode 100644 index a6d28dd1cea9a..0000000000000 --- a/branches/sage/mds/mon/Paxos.h +++ /dev/null @@ -1,251 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -/* -time----> - -cccccccccccccccccca???????????????????????????????????????? -cccccccccccccccccca???????????????????????????????????????? -cccccccccccccccccca???????????????????????????????????????? leader -cccccccccccccccccc????????????????????????????????????????? -ccccc?????????????????????????????????????????????????????? - -last_committed - -pn_from -pn - -a 12v -b 12v -c 14v -d -e 12v - - -*/ - - -/* - * NOTE: This libary is based on the Paxos algorithm, but varies in a few key ways: - * 1- Only a single new value is generated at a time, simplifying the recovery logic. - * 2- Nodes track "committed" values, and share them generously (and trustingly) - * 3- A 'leasing' mechism is built-in, allowing nodes to determine when it is safe to - * "read" their copy of the last committed value. 
- * - * This provides a simple replication substrate that services can be built on top of. - */ - -#ifndef __MON_PAXOS_H -#define __MON_PAXOS_H - -#include "include/types.h" -#include "mon_types.h" -#include "include/buffer.h" -#include "msg/Message.h" - -#include "include/Context.h" - -#include "common/Timer.h" - -class Monitor; -class MMonPaxos; - - -// i am one state machine. -class Paxos { - Monitor *mon; - int whoami; - - // my state machine info - int machine_id; - const char *machine_name; - - friend class PaxosService; - - // LEADER+PEON - - // -- generic state -- -public: - const static int STATE_RECOVERING = 1; // leader|peon: recovering paxos state - const static int STATE_ACTIVE = 2; // leader|peon: idle. peon may or may not have valid lease - const static int STATE_UPDATING = 3; // leader|peon: updating to new value - const char *get_statename(int s) { - switch (s) { - case STATE_RECOVERING: return "recovering"; - case STATE_ACTIVE: return "active"; - case STATE_UPDATING: return "updating"; - default: assert(0); return 0; - } - } - -private: - int state; - -public: - bool is_recovering() { return state == STATE_RECOVERING; } - bool is_active() { return state == STATE_ACTIVE; } - bool is_updating() { return state == STATE_UPDATING; } - -private: - // recovery (phase 1) - version_t last_pn; - version_t last_committed; - version_t accepted_pn; - version_t accepted_pn_from; - - // active (phase 2) - utime_t lease_expire; - list waiting_for_active; - list waiting_for_readable; - - - // -- leader -- - // recovery (paxos phase 1) - unsigned num_last; - version_t uncommitted_v; - version_t uncommitted_pn; - bufferlist uncommitted_value; - - // active - set acked_lease; - Context *lease_renew_event; - Context *lease_ack_timeout_event; - Context *lease_timeout_event; - - // updating (paxos phase 2) - bufferlist new_value; - set accepted; - - Context *accept_timeout_event; - - list waiting_for_writeable; - list waiting_for_commit; - - class C_AcceptTimeout : 
public Context { - Paxos *paxos; - public: - C_AcceptTimeout(Paxos *p) : paxos(p) {} - void finish(int r) { - paxos->accept_timeout(); - } - }; - - class C_LeaseAckTimeout : public Context { - Paxos *paxos; - public: - C_LeaseAckTimeout(Paxos *p) : paxos(p) {} - void finish(int r) { - paxos->lease_ack_timeout(); - } - }; - - class C_LeaseTimeout : public Context { - Paxos *paxos; - public: - C_LeaseTimeout(Paxos *p) : paxos(p) {} - void finish(int r) { - paxos->lease_timeout(); - } - }; - - class C_LeaseRenew : public Context { - Paxos *paxos; - public: - C_LeaseRenew(Paxos *p) : paxos(p) {} - void finish(int r) { - std::cout << "HI MOM" << std::endl; - paxos->lease_renew_timeout(); - } - }; - - - void collect(version_t oldpn); - void handle_collect(MMonPaxos*); - void handle_last(MMonPaxos*); - void begin(bufferlist& value); - void handle_begin(MMonPaxos*); - void handle_accept(MMonPaxos*); - void accept_timeout(); - void commit(); - void handle_commit(MMonPaxos*); - void extend_lease(); - void handle_lease(MMonPaxos*); - void handle_lease_ack(MMonPaxos*); - - void lease_ack_timeout(); // on leader, if lease isn't acked by all peons - void lease_renew_timeout(); // on leader, to renew the lease - void lease_timeout(); // on peon, if lease isn't extended - - void cancel_events(); - - version_t get_new_proposal_number(version_t gt=0); - -public: - Paxos(Monitor *m, int w, - int mid) : mon(m), whoami(w), - machine_id(mid), - machine_name(get_paxos_name(mid)), - state(STATE_RECOVERING), - lease_renew_event(0), - lease_ack_timeout_event(0), - lease_timeout_event(0), - accept_timeout_event(0) { } - - void dispatch(Message *m); - - void init(); - - void election_starting(); - void leader_init(); - void peon_init(); - - - // -- service interface -- - void wait_for_active(Context *c) { - assert(!is_active()); - waiting_for_active.push_back(c); - } - - // read - version_t get_version() { return last_committed; } - bool is_readable(); - bool read(version_t v, bufferlist 
&bl); - version_t read_current(bufferlist &bl); - void wait_for_readable(Context *onreadable) { - assert(!is_readable()); - waiting_for_readable.push_back(onreadable); - } - - // write - bool is_leader(); - bool is_writeable(); - void wait_for_writeable(Context *c) { - assert(!is_writeable()); - waiting_for_writeable.push_back(c); - } - - bool propose_new_value(bufferlist& bl, Context *oncommit=0); - void wait_for_commit(Context *oncommit) { - waiting_for_commit.push_back(oncommit); - } - void wait_for_commit_front(Context *oncommit) { - waiting_for_commit.push_front(oncommit); - } - -}; - - - -#endif - diff --git a/branches/sage/mds/mon/PaxosService.cc b/branches/sage/mds/mon/PaxosService.cc deleted file mode 100644 index 7b0eed20972a0..0000000000000 --- a/branches/sage/mds/mon/PaxosService.cc +++ /dev/null @@ -1,172 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "PaxosService.h" -#include "common/Clock.h" -#include "Monitor.h" - - - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_paxos) *_dout << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? 
(const char*)"(peon)":(const char*)"(?\?)"))) << ".paxosservice(" << get_paxos_name(paxos->machine_id) << ") " - - - - -void PaxosService::dispatch(Message *m) -{ - dout(10) << "dispatch " << *m << " from " << m->get_source_inst() << dendl; - - // make sure our map is readable and up to date - if (!paxos->is_readable()) { - dout(10) << " waiting for paxos -> readable" << dendl; - paxos->wait_for_readable(new C_RetryMessage(this, m)); - return; - } - - // make sure service has latest from paxos. - update_from_paxos(); - - // preprocess - if (preprocess_query(m)) - return; // easy! - - // leader? - if (!mon->is_leader()) { - // fw to leader - dout(10) << " fw to leader mon" << mon->get_leader() << dendl; - mon->messenger->send_message(m, mon->monmap->get_inst(mon->get_leader())); - return; - } - - // writeable? - if (!paxos->is_writeable()) { - dout(10) << " waiting for paxos -> writeable" << dendl; - paxos->wait_for_writeable(new C_RetryMessage(this, m)); - return; - } - - // update - if (prepare_update(m)) { - double delay; - if (should_propose(delay)) { - if (delay == 0.0) { - propose_pending(); - } else { - // delay a bit - if (!proposal_timer) { - dout(10) << " setting propose timer with dealy of " << delay << dendl; - proposal_timer = new C_Propose(this); - mon->timer.add_event_after(delay, proposal_timer); - } else { - dout(10) << " propose timer already set" << dendl; - } - } - } else { - dout(10) << " not proposing" << dendl; - } - } -} - -bool PaxosService::should_propose(double& delay) -{ - // simple default policy: quick startup, then some damping. 
- if (paxos->last_committed <= 1) - delay = 0.0; - else - delay = g_conf.paxos_propose_interval; - return true; -} - -void PaxosService::_commit() -{ - dout(7) << "_commit" << dendl; - update_from_paxos(); // notify service of new paxos state - - if (mon->is_leader()) { - dout(7) << "_commit creating new pending" << dendl; - assert(have_pending == false); - create_pending(); - have_pending = true; - - committed(); - } -} - - -void PaxosService::propose_pending() -{ - dout(10) << "propose_pending" << dendl; - assert(have_pending); - - if (proposal_timer) { - mon->timer.cancel_event(proposal_timer); - proposal_timer = 0; - } - - // finish and encode - bufferlist bl; - encode_pending(bl); - have_pending = false; - - // apply to paxos - paxos->wait_for_commit_front(new C_Commit(this)); - paxos->propose_new_value(bl); -} - - - - -void PaxosService::election_finished() -{ - dout(10) << "election_finished" << dendl; - - if (have_pending && - !mon->is_leader()) { - discard_pending(); - have_pending = false; - } - - // make sure we update our state - if (paxos->is_active()) - _active(); - else - paxos->wait_for_active(new C_Active(this)); -} - -void PaxosService::_active() -{ - dout(10) << "_active" << dendl; - assert(paxos->is_active()); - - // pull latest from paxos - update_from_paxos(); - - // create pending state? 
- if (mon->is_leader()) { - if (!have_pending) { - create_pending(); - have_pending = true; - } - - if (g_conf.mkfs && - paxos->get_version() == 0) { - create_initial(); - propose_pending(); - } - } -} - - diff --git a/branches/sage/mds/mon/PaxosService.h b/branches/sage/mds/mon/PaxosService.h deleted file mode 100644 index a0f39c7862273..0000000000000 --- a/branches/sage/mds/mon/PaxosService.h +++ /dev/null @@ -1,107 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __PAXOSSERVICE_H -#define __PAXOSSERVICE_H - -#include "msg/Dispatcher.h" -#include "include/Context.h" - -class Monitor; -class Paxos; - -class PaxosService : public Dispatcher { -protected: - Monitor *mon; - Paxos *paxos; - - class C_RetryMessage : public Context { - PaxosService *svc; - Message *m; - public: - C_RetryMessage(PaxosService *s, Message *m_) : svc(s), m(m_) {} - void finish(int r) { - svc->dispatch(m); - } - }; - class C_Active : public Context { - PaxosService *svc; - public: - C_Active(PaxosService *s) : svc(s) {} - void finish(int r) { - if (r >= 0) - svc->_active(); - } - }; - class C_Commit : public Context { - PaxosService *svc; - public: - C_Commit(PaxosService *s) : svc(s) {} - void finish(int r) { - if (r >= 0) - svc->_commit(); - } - }; - friend class C_Update; - - class C_Propose : public Context { - PaxosService *ps; - public: - C_Propose(PaxosService *p) : ps(p) { } - void finish(int r) { - ps->proposal_timer = 0; - ps->propose_pending(); - } - }; - friend class C_Propose; - - -private: - Context *proposal_timer; - bool have_pending; - -public: - PaxosService(Monitor *mn, Paxos *p) : mon(mn), 
paxos(p), - proposal_timer(0), - have_pending(false) { } - - // i implement and you ignore - void dispatch(Message *m); - void election_finished(); - -private: - void _active(); - void _commit(); - -public: - // i implement and you use - void propose_pending(); // propose current pending as new paxos state - - // you implement - virtual bool update_from_paxos() = 0; // assimilate latest paxos state - virtual void create_pending() = 0; // [leader] create new pending structures - virtual void create_initial() = 0; // [leader] populate pending with initial state (1) - virtual void encode_pending(bufferlist& bl) = 0; // [leader] finish and encode pending for next paxos state - virtual void discard_pending() { } // [leader] discard pending - - virtual bool preprocess_query(Message *m) = 0; // true if processed (e.g., read-only) - virtual bool prepare_update(Message *m) = 0; - virtual bool should_propose(double &delay); - - virtual void committed() = 0; - -}; - -#endif - diff --git a/branches/sage/mds/mon/mon_types.h b/branches/sage/mds/mon/mon_types.h deleted file mode 100644 index 8d1ac92822356..0000000000000 --- a/branches/sage/mds/mon/mon_types.h +++ /dev/null @@ -1,35 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MON_TYPES_H -#define __MON_TYPES_H - -#define PAXOS_TEST 0 -#define PAXOS_MDSMAP 1 -#define PAXOS_OSDMAP 2 -#define PAXOS_CLIENTMAP 3 -#define PAXOS_PGMAP 4 - -inline const char *get_paxos_name(int p) { - switch (p) { - case PAXOS_TEST: return "test"; - case PAXOS_MDSMAP: return "mdsmap"; - case PAXOS_OSDMAP: return "osdmap"; - case PAXOS_CLIENTMAP: return "clientmap"; - case PAXOS_PGMAP: return "pgmap"; - default: assert(0); return 0; - } -} - -#endif diff --git a/branches/sage/mds/msg/Dispatcher.cc b/branches/sage/mds/msg/Dispatcher.cc deleted file mode 100644 index 4fa04d7d4c92a..0000000000000 --- a/branches/sage/mds/msg/Dispatcher.cc +++ /dev/null @@ -1,28 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "Dispatcher.h" -#include "Messenger.h" - -#include "mds/MDS.h" - -/* -int Dispatcher::send_message(Message *m, msg_addr_t dest, int dest_port) -{ - assert(0); - //return dis_messenger->send_message(m, dest, dest_port, MDS_PORT_SERVER); // on my port! -} -*/ diff --git a/branches/sage/mds/msg/Dispatcher.h b/branches/sage/mds/msg/Dispatcher.h deleted file mode 100644 index 0a77de3d20369..0000000000000 --- a/branches/sage/mds/msg/Dispatcher.h +++ /dev/null @@ -1,34 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. 
See file COPYING. - * - */ - - -#ifndef __DISPATCHER_H -#define __DISPATCHER_H - -#include "Message.h" - -class Messenger; - -class Dispatcher { - public: - virtual ~Dispatcher() { } - - // how i receive messages - virtual void dispatch(Message *m) = 0; - - // how i deal with transmission failures. - virtual void ms_handle_failure(Message *m, const entity_inst_t& inst) { delete m; } -}; - -#endif diff --git a/branches/sage/mds/msg/FakeMessenger.cc b/branches/sage/mds/msg/FakeMessenger.cc deleted file mode 100644 index 77434c35e64da..0000000000000 --- a/branches/sage/mds/msg/FakeMessenger.cc +++ /dev/null @@ -1,413 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "Message.h" -#include "FakeMessenger.h" -#include "mds/MDS.h" - -#include "common/Timer.h" - -#include "common/LogType.h" -#include "common/Logger.h" - -#include "config.h" - -#define dout(x) if ((x) <= g_conf.debug_ms) *_dout << dbeginl << g_clock.now() << " " - - - -#include -#include -#include -#include -#include - -using namespace std; - -#include -using namespace __gnu_cxx; - - -#include "common/Cond.h" -#include "common/Mutex.h" -#include - - -// global queue. 
- -int nranks = 0; // this identify each entity_inst_t - -map directory; -hash_map loggers; -LogType fakemsg_logtype; - -set shutdown_set; - -Mutex lock; -Cond cond; - -bool awake = false; -bool fm_shutdown = false; -pthread_t thread_id; - -extern std::map g_fake_kill_after; // in config.cc -utime_t start_time; -map fail_queue; -list sent_to_failed_queue; - -void *fakemessenger_thread(void *ptr) -{ - start_time = g_clock.now(); - - lock.Lock(); - while (1) { - if (fm_shutdown) break; - fakemessenger_do_loop_2(); - - if (directory.empty() && nranks > 0) break; - - dout(20) << "thread waiting" << dendl; - if (fm_shutdown) break; - awake = false; - cond.Wait(lock); - awake = true; - dout(20) << "thread woke up" << dendl; - } - lock.Unlock(); - - dout(1) << "thread finish (i woke up but no messages, bye)" << dendl; - return 0; -} - - -void fakemessenger_startthread() { - pthread_create(&thread_id, NULL, fakemessenger_thread, 0); -} - -void fakemessenger_stopthread() { - dout(0) << "fakemessenger_stopthread setting stop flag" << dendl; - lock.Lock(); - fm_shutdown = true; - lock.Unlock(); - cond.Signal(); - - fakemessenger_wait(); -} - -void fakemessenger_wait() -{ - dout(0) << "fakemessenger_wait waiting" << dendl; - void *ptr; - pthread_join(thread_id, &ptr); -} - - -// fake failure - - - -// lame main looper - -int fakemessenger_do_loop() -{ - lock.Lock(); - fakemessenger_do_loop_2(); - lock.Unlock(); - - g_timer.shutdown(); - return 0; -} - - -int fakemessenger_do_loop_2() -{ - //lock.Lock(); - dout(18) << "do_loop begin." 
<< dendl; - - while (1) { - bool didone = false; - - dout(18) << "do_loop top" << dendl; - - // fail_queue - while (!fail_queue.empty() && - fail_queue.begin()->first < g_clock.now()) { - entity_name_t nm = fail_queue.begin()->second; - fail_queue.erase(fail_queue.begin()); - - dout(0) << "MUST FAKE KILL " << nm << dendl; - - for (map::iterator p = directory.begin(); - p != directory.end(); - ++p) { - if (p->second->get_myname() == nm) { - dout(0) << "FAKING FAILURE of " << nm << " at " << p->first << dendl; - directory.erase(p); - p->second->failed = true; - break; - } - } - } - - list ls; - ls.swap(sent_to_failed_queue); - for (list::iterator p = ls.begin(); - p != ls.end(); - ++p) { - Message *m = *p; - FakeMessenger *mgr = 0; - Dispatcher *dis = 0; - if (directory.count(m->get_source_addr())) { - mgr = directory[m->get_source_addr()]; - if (mgr) - dis = mgr->get_dispatcher(); - } - if (dis) { - dout(1) << "fail on " << *m - << " to " << m->get_dest() << " from " << m->get_source() - << ", passing back to sender." << dendl; - dis->ms_handle_failure(m, m->get_dest_inst()); - } else { - dout(1) << "fail on " << *m - << " to " << m->get_dest() << " from " << m->get_source() - << ", sender gone, dropping." 
<< dendl; - delete m; - } - } - - // messages - map::iterator it = directory.begin(); - while (it != directory.end()) { - FakeMessenger *mgr = it->second; - - dout(18) << "messenger " << mgr << " at " << mgr->get_myname() << " has " << mgr->num_incoming() << " queued" << dendl; - - if (!mgr->is_ready()) { - dout(18) << "messenger " << mgr << " at " << mgr->get_myname() << " has no dispatcher, skipping" << dendl; - it++; - continue; - } - - Message *m = mgr->get_message(); - it++; - - if (m) { - m->set_recv_stamp(g_clock.now()); - - //dout(18) << "got " << m << dendl; - dout(1) << "==== " << m->get_dest() - << " <- " << m->get_source() - << " ==== " << *m - << " ---- " << m - << dendl; - - if (g_conf.fakemessenger_serialize) { - // encode - if (m->empty_payload()) - m->encode_payload(); - ceph_message_header env = m->get_envelope(); - bufferlist bl; - bl.claim( m->get_payload() ); - //bl.c_str(); // condense into 1 buffer - - delete m; - - // decode - m = decode_message(env, bl); - assert(m); - } - - didone = true; - - lock.Unlock(); - mgr->dispatch(m); - lock.Lock(); - } - } - - // deal with shutdowns.. delayed to avoid concurrent directory modification - if (!shutdown_set.empty()) { - for (set::iterator it = shutdown_set.begin(); - it != shutdown_set.end(); - it++) { - dout(7) << "fakemessenger: removing " << *it << " from directory" << dendl; - assert(directory.count(*it)); - directory.erase(*it); - if (directory.empty()) { - dout(1) << "fakemessenger: last shutdown" << dendl; - ::fm_shutdown = true; - } - } - shutdown_set.clear(); - } - - if (!didone) - break; - } - - - dout(18) << "do_loop end (no more messages)." 
<< dendl; - //lock.Unlock(); - return 0; -} - - -FakeMessenger::FakeMessenger(entity_name_t me) : Messenger(me) -{ - failed = false; - - lock.Lock(); - { - // assign rank - _myinst.name = me; - _myinst.addr.v.port = nranks++; - //if (!me.is_mon()) - _myinst.addr.v.nonce = getpid(); - - // add to directory - directory[ _myinst.addr ] = this; - - // put myself in the fail queue? - if (g_fake_kill_after.count(me)) { - utime_t w = start_time; - w += g_fake_kill_after[me]; - dout(0) << "will fake failure of " << me << " at " << w << dendl; - fail_queue[w] = me; - } - } - lock.Unlock(); - - - dout(0) << "fakemessenger " << get_myname() << " messenger is " << this - << " at " << get_myaddr() << dendl; - - qlen = 0; - - /* - string name; - name = "m."; - name += MSG_ADDR_TYPE(myaddr); - int w = MSG_ADDR_NUM(myaddr); - if (w >= 1000) name += ('0' + ((w/1000)%10)); - if (w >= 100) name += ('0' + ((w/100)%10)); - if (w >= 10) name += ('0' + ((w/10)%10)); - name += ('0' + ((w/1)%10)); - - loggers[ myaddr ] = new Logger(name, (LogType*)&fakemsg_logtype); - */ -} - -FakeMessenger::~FakeMessenger() -{ - // hose any undelivered messages - for (list::iterator p = incoming.begin(); - p != incoming.end(); - ++p) - delete *p; -} - - -int FakeMessenger::shutdown() -{ - dout(2) << "shutdown on messenger " << this << " has " << num_incoming() << " queued" << dendl; - lock.Lock(); - assert(directory.count(_myinst.addr) == 1); - shutdown_set.insert(_myinst.addr); - - /* - if (loggers[myaddr]) { - delete loggers[myaddr]; - loggers.erase(myaddr); - } - */ - - lock.Unlock(); - return 0; -} - - -void FakeMessenger::reset_myname(entity_name_t m) -{ - dout(1) << "reset_myname from " << get_myname() << " to " << m << dendl; - _myinst.name = m; - - // put myself in the fail queue? 
- if (g_fake_kill_after.count(m)) { - utime_t w = start_time; - w += g_fake_kill_after[m]; - dout(0) << "will fake failure of " << m << " at " << w << dendl; - fail_queue[w] = m; - } - -} - - -int FakeMessenger::send_message(Message *m, entity_inst_t inst, int port, int fromport) -{ - entity_name_t dest = inst.name; - - m->set_source(get_myname(), fromport); - m->set_source_addr(get_myaddr()); - - m->set_dest_inst(inst); - m->set_dest_port(port); - - lock.Lock(); - -#ifdef LOG_MESSAGES - // stats - loggers[get_myaddr()]->inc("+send",1); - loggers[dest]->inc("-recv",1); - - char s[20]; - sprintf(s,"+%s", m->get_type_name()); - loggers[get_myaddr()]->inc(s); - sprintf(s,"-%s", m->get_type_name()); - loggers[dest]->inc(s); -#endif - - // queue - if (directory.count(inst.addr) && - shutdown_set.count(inst.addr) == 0) { - dout(1) << "--> " << get_myname() << " -> " << inst.name << " --- " << *m << " -- " << m - << dendl; - directory[inst.addr]->queue_incoming(m); - } else { - dout(0) << "--> " << get_myname() << " -> " << inst.name << " " << *m << " -- " << m - << " *** destination " << inst.addr << " DNE ***" - << dendl; - for (map::iterator p = directory.begin(); - p != directory.end(); - ++p) { - dout(20) << "** have " << p->first << " to " << p->second << dendl; - } - - // do the failure callback - sent_to_failed_queue.push_back(m); - } - - // wake up loop? 
- if (!awake) { - dout(10) << "waking up fakemessenger thread" << dendl; - cond.Signal(); - lock.Unlock(); - } else - lock.Unlock(); - - return 0; -} - - diff --git a/branches/sage/mds/msg/FakeMessenger.h b/branches/sage/mds/msg/FakeMessenger.h deleted file mode 100644 index 57379133812a9..0000000000000 --- a/branches/sage/mds/msg/FakeMessenger.h +++ /dev/null @@ -1,89 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __FAKEMESSENGER_H -#define __FAKEMESSENGER_H - -#include "Messenger.h" -#include "Dispatcher.h" - -#include -#include - -class Timer; - -class FakeMessenger : public Messenger { - protected: - class Logger *logger; - - int qlen; - list incoming; // incoming queue - - public: - bool failed; - - FakeMessenger(entity_name_t me); - ~FakeMessenger(); - - virtual int shutdown(); - - void reset_myname(entity_name_t m); - - // msg interface - virtual int send_message(Message *m, entity_inst_t dest, int port=0, int fromport=0); - - // events - //virtual void trigger_timer(Timer *t); - - int get_dispatch_queue_len() { return qlen; } - - // -- incoming queue -- - // (that nothing uses) - Message *get_message() { - if (!incoming.empty()) { - Message *m = incoming.front(); - incoming.pop_front(); - qlen--; - return m; - } - return NULL; - } - bool queue_incoming(Message *m) { - incoming.push_back(m); - qlen++; - return true; - } - int num_incoming() { - //return incoming.size(); - return qlen; - } - - void suicide() { - if (!failed) { - failed = true; - } - shutdown(); - } - -}; - -int fakemessenger_do_loop(); -int fakemessenger_do_loop_2(); -void fakemessenger_startthread(); -void 
fakemessenger_stopthread(); -void fakemessenger_wait(); - -#endif diff --git a/branches/sage/mds/msg/Message.cc b/branches/sage/mds/msg/Message.cc deleted file mode 100644 index e3de6ce0677ba..0000000000000 --- a/branches/sage/mds/msg/Message.cc +++ /dev/null @@ -1,383 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include -#include -using namespace std; - -#include "include/types.h" - -#include "Message.h" - -#include "messages/MGenericMessage.h" - -#include "messages/MPGStats.h" - -#include "messages/MStatfs.h" -#include "messages/MStatfsReply.h" - -#include "messages/MMonCommand.h" -#include "messages/MMonCommandAck.h" -#include "messages/MMonPaxos.h" - -#include "messages/MMonElection.h" - -#include "messages/MPing.h" -#include "messages/MPingAck.h" -//#include "messages/MFailure.h" -//#include "messages/MFailureAck.h" - -#include "messages/MOSDBoot.h" -#include "messages/MOSDIn.h" -#include "messages/MOSDOut.h" -#include "messages/MOSDFailure.h" -#include "messages/MOSDPing.h" -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" - -#include "messages/MOSDPGNotify.h" -#include "messages/MOSDPGQuery.h" -#include "messages/MOSDPGLog.h" -#include "messages/MOSDPGRemove.h" -#include "messages/MOSDPGActivateSet.h" - -#include "messages/MClientMount.h" -#include "messages/MClientUnmount.h" -#include "messages/MClientSession.h" -#include "messages/MClientReconnect.h" -#include "messages/MClientRequest.h" -#include "messages/MClientRequestForward.h" -#include "messages/MClientReply.h" -#include "messages/MClientFileCaps.h" - -#include "messages/MMDSSlaveRequest.h" - -#include "messages/MMDSGetMap.h" -#include "messages/MMDSMap.h" -#include "messages/MMDSBeacon.h" -#include "messages/MMDSResolve.h" -#include "messages/MMDSResolveAck.h" -#include "messages/MMDSCacheRejoin.h" -//#include "messages/MMDSCacheRejoinAck.h" - -#include 
"messages/MDirUpdate.h" -#include "messages/MDiscover.h" -#include "messages/MDiscoverReply.h" - -#include "messages/MMDSFragmentNotify.h" - -#include "messages/MExportDirDiscover.h" -#include "messages/MExportDirDiscoverAck.h" -#include "messages/MExportDirCancel.h" -#include "messages/MExportDirPrep.h" -#include "messages/MExportDirPrepAck.h" -#include "messages/MExportDirWarning.h" -#include "messages/MExportDirWarningAck.h" -#include "messages/MExportDir.h" -#include "messages/MExportDirAck.h" -#include "messages/MExportDirNotify.h" -#include "messages/MExportDirNotifyAck.h" -#include "messages/MExportDirFinish.h" - -#include "messages/MExportCaps.h" -#include "messages/MExportCapsAck.h" - - -#include "messages/MDentryUnlink.h" - -#include "messages/MHeartbeat.h" - -#include "messages/MAnchor.h" - -//#include "messages/MInodeUpdate.h" -#include "messages/MCacheExpire.h" -#include "messages/MInodeFileCaps.h" - -#include "messages/MLock.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug) *_dout << dbeginl << g_clock.now() << " MESSENGER: " -#define DEBUGLVL 10 // debug level of output - - - - - - - -Message * -decode_message(ceph_message_header& env, bufferlist& payload) -{ - // make message - Message *m = 0; - switch(env.type) { - - // -- with payload -- - - case MSG_PGSTATS: - m = new MPGStats; - break; - - case MSG_STATFS: - m = new MStatfs; - break; - case MSG_STATFS_REPLY: - m = new MStatfsReply; - break; - - case MSG_MON_COMMAND: - m = new MMonCommand; - break; - case MSG_MON_COMMAND_ACK: - m = new MMonCommandAck; - break; - case MSG_MON_PAXOS: - m = new MMonPaxos; - break; - - case MSG_MON_ELECTION: - m = new MMonElection; - break; - - case MSG_PING: - m = new MPing(); - break; - case MSG_PING_ACK: - m = new MPingAck(); - break; - /* - case MSG_FAILURE: - m = new MFailure(); - break; - case MSG_FAILURE_ACK: - m = new MFailureAck(); - break; - */ - - case MSG_OSD_BOOT: - m = new MOSDBoot(); - break; - case MSG_OSD_IN: - m = new MOSDIn(); - 
break; - case MSG_OSD_OUT: - m = new MOSDOut(); - break; - case MSG_OSD_FAILURE: - m = new MOSDFailure(); - break; - case MSG_OSD_PING: - m = new MOSDPing(); - break; - case MSG_OSD_OP: - m = new MOSDOp(); - break; - case MSG_OSD_OPREPLY: - m = new MOSDOpReply(); - break; - - case MSG_OSD_MAP: - m = new MOSDMap(); - break; - case MSG_OSD_GETMAP: - m = new MOSDGetMap(); - break; - - case MSG_OSD_PG_NOTIFY: - m = new MOSDPGNotify(); - break; - case MSG_OSD_PG_QUERY: - m = new MOSDPGQuery(); - break; - case MSG_OSD_PG_LOG: - m = new MOSDPGLog(); - break; - case MSG_OSD_PG_REMOVE: - m = new MOSDPGRemove(); - break; - case MSG_OSD_PG_ACTIVATE_SET: - m = new MOSDPGActivateSet(); - break; - - // clients - case MSG_CLIENT_MOUNT: - m = new MClientMount; - break; - case MSG_CLIENT_UNMOUNT: - m = new MClientUnmount; - break; - case MSG_CLIENT_SESSION: - m = new MClientSession; - break; - case MSG_CLIENT_RECONNECT: - m = new MClientReconnect; - break; - case MSG_CLIENT_REQUEST: - m = new MClientRequest; - break; - case MSG_CLIENT_REQUEST_FORWARD: - m = new MClientRequestForward; - break; - case MSG_CLIENT_REPLY: - m = new MClientReply; - break; - case MSG_CLIENT_FILECAPS: - m = new MClientFileCaps; - break; - - // mds - case MSG_MDS_SLAVE_REQUEST: - m = new MMDSSlaveRequest; - break; - - case MSG_MDS_GETMAP: - m = new MMDSGetMap(); - break; - case MSG_MDS_MAP: - m = new MMDSMap(); - break; - case MSG_MDS_BEACON: - m = new MMDSBeacon; - break; - case MSG_MDS_RESOLVE: - m = new MMDSResolve; - break; - case MSG_MDS_RESOLVEACK: - m = new MMDSResolveAck; - break; - case MSG_MDS_CACHEREJOIN: - m = new MMDSCacheRejoin; - break; - /* - case MSG_MDS_CACHEREJOINACK: - m = new MMDSCacheRejoinAck; - break; - */ - - case MSG_MDS_DIRUPDATE: - m = new MDirUpdate(); - break; - - case MSG_MDS_DISCOVER: - m = new MDiscover(); - break; - case MSG_MDS_DISCOVERREPLY: - m = new MDiscoverReply(); - break; - - case MSG_MDS_FRAGMENTNOTIFY: - m = new MMDSFragmentNotify; - break; - - case 
MSG_MDS_EXPORTDIRDISCOVER: - m = new MExportDirDiscover(); - break; - case MSG_MDS_EXPORTDIRDISCOVERACK: - m = new MExportDirDiscoverAck(); - break; - case MSG_MDS_EXPORTDIRCANCEL: - m = new MExportDirCancel(); - break; - - case MSG_MDS_EXPORTDIR: - m = new MExportDir; - break; - case MSG_MDS_EXPORTDIRACK: - m = new MExportDirAck; - break; - case MSG_MDS_EXPORTDIRFINISH: - m = new MExportDirFinish; - break; - - case MSG_MDS_EXPORTDIRNOTIFY: - m = new MExportDirNotify(); - break; - - case MSG_MDS_EXPORTDIRNOTIFYACK: - m = new MExportDirNotifyAck(); - break; - - case MSG_MDS_EXPORTDIRPREP: - m = new MExportDirPrep(); - break; - - case MSG_MDS_EXPORTDIRPREPACK: - m = new MExportDirPrepAck(); - break; - - case MSG_MDS_EXPORTDIRWARNING: - m = new MExportDirWarning; - break; - case MSG_MDS_EXPORTDIRWARNINGACK: - m = new MExportDirWarningAck; - break; - - - case MSG_MDS_EXPORTCAPS: - m = new MExportCaps; - break; - case MSG_MDS_EXPORTCAPSACK: - m = new MExportCapsAck; - break; - - - case MSG_MDS_DENTRYUNLINK: - m = new MDentryUnlink(); - break; - - case MSG_MDS_HEARTBEAT: - m = new MHeartbeat(); - break; - - case MSG_MDS_CACHEEXPIRE: - m = new MCacheExpire(); - break; - - case MSG_MDS_ANCHOR: - m = new MAnchor(); - break; - - /* case MSG_MDS_INODEUPDATE: - m = new MInodeUpdate(); - break; - */ - - case MSG_MDS_INODEFILECAPS: - m = new MInodeFileCaps(); - break; - - case MSG_MDS_LOCK: - m = new MLock(); - break; - - - // -- simple messages without payload -- - - case MSG_CLOSE: - case MSG_SHUTDOWN: - case MSG_MDS_SHUTDOWNSTART: - case MSG_MDS_SHUTDOWNFINISH: - case MSG_OSD_MKFS_ACK: - m = new MGenericMessage(env.type); - break; - - default: - dout(1) << "can't decode unknown message type " << env.type << dendl; - assert(0); - } - - // env - m->set_envelope(env); - - // decode - m->set_payload(payload); - m->decode_payload(); - - // done! 
- return m; -} - - diff --git a/branches/sage/mds/msg/Message.h b/branches/sage/mds/msg/Message.h deleted file mode 100644 index cb106251a425b..0000000000000 --- a/branches/sage/mds/msg/Message.h +++ /dev/null @@ -1,262 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MESSAGE_H -#define __MESSAGE_H - -#define MSG_CLOSE 0 - -#define MSG_STATFS 1 -#define MSG_STATFS_REPLY 2 -#define MSG_PGSTATS 3 - -#define MSG_PING 10 -#define MSG_PING_ACK 11 - -#define MSG_SHUTDOWN 99999 - -#define MSG_MON_COMMAND 13 -#define MSG_MON_COMMAND_ACK 14 - - -#define MSG_MON_ELECTION 15 - -#define MSG_MON_OSDMAP_INFO 20 -#define MSG_MON_OSDMAP_LEASE 21 -#define MSG_MON_OSDMAP_LEASE_ACK 22 -#define MSG_MON_OSDMAP_UPDATE_PREPARE 23 -#define MSG_MON_OSDMAP_UPDATE_ACK 24 -#define MSG_MON_OSDMAP_UPDATE_COMMIT 25 - -#define MSG_MON_PAXOS 30 - -#define MSG_OSD_OP 40 // delete, etc. -#define MSG_OSD_OPREPLY 41 // delete, etc. 
-#define MSG_OSD_PING 42 - -#define MSG_OSD_GETMAP 43 -#define MSG_OSD_MAP 44 - -#define MSG_OSD_BOOT 45 -#define MSG_OSD_MKFS_ACK 46 - -#define MSG_OSD_FAILURE 47 - -#define MSG_OSD_IN 48 -#define MSG_OSD_OUT 49 - - - -#define MSG_OSD_PG_NOTIFY 50 -#define MSG_OSD_PG_QUERY 51 -#define MSG_OSD_PG_SUMMARY 52 -#define MSG_OSD_PG_LOG 53 -#define MSG_OSD_PG_REMOVE 54 -#define MSG_OSD_PG_ACTIVATE_SET 55 - -// -- client -- -// to monitor -#define MSG_CLIENT_MOUNT 60 -#define MSG_CLIENT_UNMOUNT 61 - -// to mds -#define MSG_CLIENT_SESSION 70 // start or stop -#define MSG_CLIENT_RECONNECT 71 - -#define MSG_CLIENT_REQUEST 80 -#define MSG_CLIENT_REQUEST_FORWARD 81 -#define MSG_CLIENT_REPLY 82 -#define MSG_CLIENT_FILECAPS 83 - - - -// *** MDS *** - -#define MSG_MDS_GETMAP 102 -#define MSG_MDS_MAP 103 -#define MSG_MDS_HEARTBEAT 104 // for mds load balancer -#define MSG_MDS_BEACON 105 // to monitor - -#define MSG_MDS_RESOLVE 106 -#define MSG_MDS_RESOLVEACK 107 - -#define MSG_MDS_CACHEREJOIN 108 - -#define MSG_MDS_DISCOVER 110 -#define MSG_MDS_DISCOVERREPLY 111 - -#define MSG_MDS_INODEGETREPLICA 112 -#define MSG_MDS_INODEGETREPLICAACK 113 - -#define MSG_MDS_INODEFILECAPS 115 - -#define MSG_MDS_INODEUPDATE 120 -#define MSG_MDS_DIRUPDATE 121 -#define MSG_MDS_INODEEXPIRE 122 -#define MSG_MDS_DIREXPIRE 123 - -#define MSG_MDS_DIREXPIREREQ 124 - -#define MSG_MDS_CACHEEXPIRE 125 - -#define MSG_MDS_ANCHOR 130 - -#define MSG_MDS_FRAGMENTNOTIFY 140 - -#define MSG_MDS_EXPORTDIRDISCOVER 149 -#define MSG_MDS_EXPORTDIRDISCOVERACK 150 -#define MSG_MDS_EXPORTDIRCANCEL 151 -#define MSG_MDS_EXPORTDIRPREP 152 -#define MSG_MDS_EXPORTDIRPREPACK 153 -#define MSG_MDS_EXPORTDIRWARNING 154 -#define MSG_MDS_EXPORTDIRWARNINGACK 155 -#define MSG_MDS_EXPORTDIR 156 -#define MSG_MDS_EXPORTDIRACK 157 -#define MSG_MDS_EXPORTDIRNOTIFY 158 -#define MSG_MDS_EXPORTDIRNOTIFYACK 159 -#define MSG_MDS_EXPORTDIRFINISH 160 - -#define MSG_MDS_EXPORTCAPS 166 -#define MSG_MDS_EXPORTCAPSACK 167 - -#define 
MSG_MDS_SLAVE_REQUEST 170 - -#define MSG_MDS_DENTRYUNLINK 200 - -#define MSG_MDS_LOCK 500 - -#define MSG_MDS_SHUTDOWNSTART 900 -#define MSG_MDS_SHUTDOWNFINISH 901 - - -#include -#include - -#include -#include -using std::list; - -#include - - -#include "include/types.h" -#include "include/buffer.h" -#include "msg_types.h" - - - - -// ====================================================== - -// abstract Message class - - -class Message { - private: - - protected: - ceph_message_header env; // envelope - bufferlist payload; // payload - list chunk_payload_at; - - utime_t recv_stamp; - - friend class Messenger; -public: - - public: - Message() { - env.source_port = env.dest_port = 0; - env.nchunks = 0; - }; - Message(int t) { - env.source_port = env.dest_port = 0; - env.nchunks = 0; - env.type = t; - } - virtual ~Message() { - } - - - void clear_payload() { payload.clear(); } - bool empty_payload() { return payload.length() == 0; } - bufferlist& get_payload() { - return payload; - } - void set_payload(bufferlist& bl) { - payload.claim(bl); - } - void copy_payload(const bufferlist& bl) { - payload = bl; - } - const list& get_chunk_payload_at() const { return chunk_payload_at; } - void set_chunk_payload_at(list& o) { chunk_payload_at.swap(o); } - ceph_message_header& get_envelope() { - return env; - } - void set_envelope(ceph_message_header& env) { - this->env = env; - } - - - void set_recv_stamp(utime_t t) { recv_stamp = t; } - utime_t get_recv_stamp() { return recv_stamp; } - - // ENVELOPE ---- - - // type - int get_type() { return env.type; } - void set_type(int t) { env.type = t; } - virtual char *get_type_name() = 0; - - // source/dest - entity_inst_t& get_dest_inst() { return *(entity_inst_t*)&env.dst; } - void set_dest_inst(entity_inst_t& inst) { env.dst = *(ceph_entity_inst*)&inst; } - - entity_inst_t& get_source_inst() { return *(entity_inst_t*)&env.src; } - void set_source_inst(entity_inst_t& inst) { env.src = *(ceph_entity_inst*)&inst; } - - entity_name_t& 
get_dest() { return *(entity_name_t*)&env.dst.name; } - void set_dest(entity_name_t a, int p) { env.dst.name = *(ceph_entity_name*)&a; env.dest_port = p; } - int get_dest_port() { return env.dest_port; } - void set_dest_port(int p) { env.dest_port = p; } - - entity_name_t& get_source() { return *(entity_name_t*)&env.src.name; } - void set_source(entity_name_t a, int p) { env.src.name = *(ceph_entity_name*)&a; env.source_port = p; } - int get_source_port() { return env.source_port; } - - entity_addr_t& get_source_addr() { return *(entity_addr_t*)&env.src.addr; } - void set_source_addr(const entity_addr_t &i) { env.src.addr = *(ceph_entity_addr*)&i; } - - // PAYLOAD ---- - void reset_payload() { - payload.clear(); - } - - virtual void decode_payload() = 0; - virtual void encode_payload() = 0; - - virtual void print(ostream& out) { - out << get_type_name(); - } - -}; - -extern Message *decode_message(ceph_message_header &env, bufferlist& bl); -inline ostream& operator<<(ostream& out, Message& m) { - m.print(out); - return out; -} - -#endif diff --git a/branches/sage/mds/msg/Messenger.cc b/branches/sage/mds/msg/Messenger.cc deleted file mode 100644 index 5af83462b2995..0000000000000 --- a/branches/sage/mds/msg/Messenger.cc +++ /dev/null @@ -1,39 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include -#include "include/types.h" - -#include "Message.h" -#include "Messenger.h" -#include "messages/MGenericMessage.h" - -#include -#include -using namespace std; - - -// --------- -// incoming messages - -void Messenger::dispatch(Message *m) -{ - assert(dispatcher); - dispatcher->dispatch(m); -} - - - diff --git a/branches/sage/mds/msg/Messenger.h b/branches/sage/mds/msg/Messenger.h deleted file mode 100644 index 85ac2745ad0d7..0000000000000 --- a/branches/sage/mds/msg/Messenger.h +++ /dev/null @@ -1,101 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __MESSENGER_H -#define __MESSENGER_H - -#include -using namespace std; - -#include "Message.h" -#include "Dispatcher.h" -#include "common/Mutex.h" -#include "common/Cond.h" -#include "include/Context.h" - - - -class MDS; -class Timer; - -class Messenger { - private: - Dispatcher *dispatcher; - -protected: - entity_inst_t _myinst; - - public: - Messenger(entity_name_t w) : dispatcher(0) { - _myinst.name = w; - } - virtual ~Messenger() { } - - // accessors - entity_name_t get_myname() { return _myinst.name; } - const entity_addr_t& get_myaddr() { return _myinst.addr; } - const entity_inst_t& get_myinst() { return _myinst; } - - void _set_myname(entity_name_t m) { _myinst.name = m; } - virtual void reset_myname(entity_name_t m) = 0; - - // hrmpf. 
- virtual int get_dispatch_queue_len() { return 0; }; - - // setup - void set_dispatcher(Dispatcher *d) { - if (!dispatcher) { - dispatcher = d; - ready(); - } - } - Dispatcher *get_dispatcher() { return dispatcher; } - virtual void ready() { } - bool is_ready() { return dispatcher != 0; } - - // dispatch incoming messages - virtual void dispatch(Message *m) { - assert(dispatcher); - dispatcher->dispatch(m); - } - - // shutdown - virtual int shutdown() = 0; - virtual void suicide() = 0; - - // send message - virtual void prepare_dest(const entity_addr_t& addr) {} - virtual int send_message(Message *m, entity_inst_t dest, - int port=0, int fromport=0) = 0; - virtual int send_first_message(Dispatcher *d, - Message *m, entity_inst_t dest, - int port=0, int fromport=0) { - set_dispatcher(d); - return send_message(m, dest, port, fromport); - } - - // make a procedure call - //virtual Message* sendrecv(Message *m, msg_name_t dest, int port=0); - - virtual void mark_down(entity_addr_t a) {} - -}; - - - - - -#endif diff --git a/branches/sage/mds/msg/SimpleMessenger.cc b/branches/sage/mds/msg/SimpleMessenger.cc deleted file mode 100644 index 51ff661ebb7ec..0000000000000 --- a/branches/sage/mds/msg/SimpleMessenger.cc +++ /dev/null @@ -1,1410 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "SimpleMessenger.h" - -#include -#include -#include -#include -#include -#include -#include - -#include "config.h" - -#include "messages/MGenericMessage.h" - -#include - -#include -#include - -#define dout(l) if (l<=g_conf.debug_ms) *_dout << dbeginl << g_clock.now() << " " << pthread_self() << " -- " << rank.my_addr << " " -#define derr(l) if (l<=g_conf.debug_ms) *_derr << dbeginl << g_clock.now() << " " << pthread_self() << " -- " << rank.my_addr << " " - - - -#include "tcp.cc" - - -Rank rank; - - -sighandler_t old_sigint_handler = 0; - - -/******************************************** - * Accepter - */ - -void simplemessenger_sigint(int r) -{ - rank.sigint(); - if (old_sigint_handler) - old_sigint_handler(r); -} - -void Rank::sigint() -{ - lock.Lock(); - derr(0) << "got control-c, exiting" << dendl; - - // force close listener socket - ::close(accepter.listen_sd); - - // force close all pipe sockets, too - for (hash_map::iterator p = rank_pipe.begin(); - p != rank_pipe.end(); - ++p) - p->second->force_close(); - - lock.Unlock(); -} - - - -void noop_signal_handler(int s) -{ - //dout(0) << "blah_handler got " << s << dendl; -} - -int Rank::Accepter::start() -{ - // bind to a socket - dout(10) << "accepter.start" << dendl; - - char hostname[100]; - memset(hostname, 0, 100); - gethostname(hostname, 100); - dout(2) << "accepter.start my hostname is " << hostname << dendl; - - // is there a .ceph_hosts file? 
- { - ifstream fh; - fh.open(".ceph_hosts"); - if (fh.is_open()) { - while (1) { - string line; - getline(fh, line); - if (fh.eof()) break; - if (line[0] == '#' || line[0] == ';') continue; - int ospace = line.find(" "); - if (!ospace) continue; - string host = line.substr(0, ospace); - string addr = line.substr(ospace+1); - dout(15) << ".ceph_hosts: host '" << host << "' -> '" << addr << "'" << dendl; - if (host == hostname) { - parse_ip_port(addr.c_str(), g_my_addr); - dout(1) << ".ceph_hosts: my addr is " << g_my_addr << dendl; - break; - } - } - fh.close(); - } - } - - // use whatever user specified (if anything) - tcpaddr_t listen_addr; - g_my_addr.make_addr(listen_addr); - - /* socket creation */ - listen_sd = socket(AF_INET,SOCK_STREAM,0); - assert(listen_sd > 0); - - /* bind to port */ - int rc = bind(listen_sd, (struct sockaddr *) &listen_addr, sizeof(listen_addr)); - if (rc < 0) - derr(0) << "accepter.start unable to bind to " << listen_addr << dendl; - assert(rc >= 0); - - // what port did we get? - socklen_t llen = sizeof(listen_addr); - getsockname(listen_sd, (sockaddr*)&listen_addr, &llen); - - dout(10) << "accepter.start bound to " << listen_addr << dendl; - - // listen! - rc = ::listen(listen_sd, 1000); - assert(rc >= 0); - - // figure out my_addr - if (g_my_addr != entity_addr_t()) { - // user specified it, easy peasy. - rank.my_addr = g_my_addr; - } else { - // my IP is... HELP! - struct hostent *myhostname = gethostbyname(hostname); - - // look up my hostname. - listen_addr.sin_family = myhostname->h_addrtype; - memcpy((char*)&listen_addr.sin_addr.s_addr, - myhostname->h_addr_list[0], - myhostname->h_length); - rank.my_addr.set_addr(listen_addr); - rank.my_addr.v.port = 0; // see below - } - if (rank.my_addr.v.port == 0) { - entity_addr_t tmp; - tmp.set_addr(listen_addr); - rank.my_addr.v.port = tmp.v.port; - rank.my_addr.v.nonce = getpid(); // FIXME: pid might not be best choice here. 
- } - - dout(1) << "accepter.start my_addr is " << rank.my_addr << dendl; - - // set up signal handler - //old_sigint_handler = signal(SIGINT, simplemessenger_sigint); - - // set a harmless handle for SIGUSR1 (we'll use it to stop the accepter) - struct sigaction sa; - memset(&sa, 0, sizeof(sa)); - sa.sa_handler = noop_signal_handler; - sa.sa_flags = 0; - sigemptyset(&sa.sa_mask); - sigaction(SIGUSR1, &sa, NULL); - - // start thread - create(); - - return 0; -} - -void *Rank::Accepter::entry() -{ - dout(10) << "accepter starting" << dendl; - - fd_set fds; - while (!done) { - FD_ZERO(&fds); - FD_SET(listen_sd, &fds); - dout(20) << "accepter calling select" << dendl; - int r = ::select(listen_sd+1, &fds, 0, &fds, 0); - dout(20) << "accepter select got " << r << dendl; - - if (done) break; - - // accept - struct sockaddr_in addr; - socklen_t slen = sizeof(addr); - int sd = ::accept(listen_sd, (sockaddr*)&addr, &slen); - if (sd > 0) { - dout(10) << "accepted incoming on sd " << sd << dendl; - - rank.lock.Lock(); - if (!rank.local.empty()) { - Pipe *p = new Pipe(sd); - rank.pipes.insert(p); - } - rank.lock.Unlock(); - } else { - dout(10) << "no incoming connection?" << dendl; - } - } - - dout(20) << "accepter closing" << dendl; - ::close(listen_sd); - dout(10) << "accepter stopping" << dendl; - return 0; -} - -void Rank::Accepter::stop() -{ - done = true; - this->kill(SIGUSR1); - join(); -} - - -/************************************** - * Pipe - */ - -int Rank::Pipe::accept() -{ - // my creater gave me sd via accept() - - // announce myself. - int rc = tcp_write(sd, (char*)&rank.my_addr, sizeof(rank.my_addr)); - if (rc < 0) { - ::close(sd); - done = true; - return -1; - } - - // identify peer - rc = tcp_read(sd, (char*)&peer_addr, sizeof(peer_addr)); - if (rc < 0) { - dout(10) << "pipe(? " << this << ").accept couldn't read peer inst" << dendl; - ::close(sd); - done = true; - return -1; - } - - // register pipe. 
- rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_addr) == 0) { - // install as outgoing pipe! - dout(10) << "pipe(" << peer_addr << ' ' << this << ").accept peer is " << peer_addr << dendl; - rank.rank_pipe[peer_addr] = this; - - // create writer thread. - writer_running = true; - writer_thread.create(); - } else { - // hrm, this may affect message delivery order.. keep both pipes! - dout(10) << "pipe(" << peer_addr << ' ' << this << ").accept already have a pipe for this peer (" << rank.rank_pipe[peer_addr] << "), will receive on this pipe only" << dendl; - - // FIXME i could stop the receiver on the other pipe.. - - /* - // low ranks' Pipes "win" - if (peer_addr < rank.my_addr) { - dout(10) << "pipe(" << peer_addr << ' ' << this << ").accept peer is " << peer_addr - << ", already had pipe, but switching to this new one" << dendl; - // switch to this new Pipe - rank.rank_pipe[peer_addr]->unregister(); // close old one - rank.rank_pipe[peer_addr]->close(); // close old one - rank.rank_pipe[peer_addr] = this; - } else { - dout(10) << "pipe(" << peer_addr << ' ' << this << ").accept peer is " << peer_addr - << ", already had pipe, sticking with it" << dendl; - } - */ - } - } - rank.lock.Unlock(); - - return 0; // success. -} - -int Rank::Pipe::connect() -{ - dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect" << dendl; - - // create socket? - sd = socket(AF_INET,SOCK_STREAM,0); - assert(sd > 0); - - // bind any port - struct sockaddr_in myAddr; - myAddr.sin_family = AF_INET; - myAddr.sin_addr.s_addr = htonl(INADDR_ANY); - myAddr.sin_port = htons( 0 ); - - int rc = bind(sd, (struct sockaddr *) &myAddr, sizeof(myAddr)); - assert(rc>=0); - - // connect! - tcpaddr_t tcpaddr; - peer_addr.make_addr(tcpaddr); - rc = ::connect(sd, (sockaddr*)&tcpaddr, sizeof(myAddr)); - if (rc < 0) { - dout(10) << "connect error " << peer_addr - << ", " << errno << ": " << strerror(errno) << dendl; - return rc; - } - - // identify peer ..... 
FIXME - entity_addr_t paddr; - rc = tcp_read(sd, (char*)&paddr, sizeof(paddr)); - if (!rc) { // bool - dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect couldn't read peer addr" << dendl; - return -1; - } - if (peer_addr != paddr) { - dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect peer identifies itself as " << paddr << ", wrong guy!" << dendl; - ::close(sd); - sd = 0; - return -1; - } - - // identify myself - rc = tcp_write(sd, (char*)&rank.my_addr, sizeof(rank.my_addr)); - if (rc < 0) - return -1; - - // register pipe - /* - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_addr) == 0) { - dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect registering pipe" << dendl; - rank.rank_pipe[peer_addr] = this; - } else { - // this is normal. - dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect pipe already registered." << dendl; - } - } - rank.lock.Unlock(); - */ - - // start reader - reader_running = true; - reader_thread.create(); - - return 0; -} - - -void Rank::Pipe::unregister() -{ - assert(rank.lock.is_locked()); - if (rank.rank_pipe.count(peer_addr) && - rank.rank_pipe[peer_addr] == this) { - dout(10) << "pipe(" << peer_addr << ' ' << this - << ").unregister" << dendl; - rank.rank_pipe.erase(peer_addr); - } else { - dout(10) << "pipe(" << peer_addr << ' ' << this - << ").unregister - not registerd" << dendl; - } -} - -void Rank::Pipe::close() -{ - dout(10) << "pipe(" << peer_addr << ' ' << this << ").close" << dendl; - - // queue close message? 
- if (!need_to_send_close) { - dout(10) << "pipe(" << peer_addr << ' ' << this - << ").close already closing/closed" << dendl; - return; - } - - if (!writer_running) { - dout(10) << "pipe(" << peer_addr << ' ' << this - << ").close not queueing MSG_CLOSE, no writer running" << dendl; - } else { - dout(10) << "pipe(" << peer_addr << ' ' << this - << ").close queueing MSG_CLOSE" << dendl; - lock.Lock(); - q.push_back(new MGenericMessage(MSG_CLOSE)); - cond.Signal(); - need_to_send_close = false; - lock.Unlock(); - } -} - - -/* read msgs from socket. - * also, server. - * - */ -void Rank::Pipe::reader() -{ - if (server) - accept(); - - // loop. - while (!done) { - Message *m = read_message(); - if (!m || m->get_type() == 0) { - if (m) { - delete m; - dout(10) << "pipe(" << peer_addr << ' ' << this << ").reader read MSG_CLOSE message" << dendl; - need_to_send_close = false; - } else { - derr(10) << "pipe(" << peer_addr << ' ' << this << ").reader read null message" << dendl; - } - - rank.lock.Lock(); - unregister(); - rank.lock.Unlock(); - close(); - - done = true; - cond.Signal(); // wake up writer too. 
- break; - } - - dout(10) << "pipe(" << peer_addr << ' ' << this << ").reader got message " - << m << " " << *m - << " for " << m->get_dest() << dendl; - - // deliver - EntityMessenger *entity = 0; - - rank.lock.Lock(); - { - if (g_conf.ms_single_dispatch) { - // submit to single dispatch queue - rank._submit_single_dispatch(m); - } else { - if (rank.local.count(m->get_dest())) { - // find entity - entity = rank.local[m->get_dest()]; - } else { - entity = rank.find_unnamed(m->get_dest()); - if (entity) { - dout(3) << "pipe(" << peer_addr << ' ' << this << ").reader blessing " << m->get_dest() << dendl; - //entity->reset_myname(m->get_dest()); - rank.local.erase(entity->get_myname()); - rank.local[m->get_dest()] = entity; - entity->_set_myname(m->get_dest()); - - } else { - if (rank.stopped.count(m->get_dest())) { - // ignore it - } else { - derr(0) << "pipe(" << peer_addr << ' ' << this << ").reader got message " << *m << " for " << m->get_dest() << ", which isn't local" << dendl; - //assert(0); // FIXME do this differently - } - } - } - } - } - rank.lock.Unlock(); - - if (entity) - entity->queue_message(m); // queue - } - - - // reap? - bool reap = false; - lock.Lock(); - { - reader_running = false; - if (!writer_running) reap = true; - } - lock.Unlock(); - - if (reap) { - dout(20) << "pipe(" << peer_addr << ' ' << this << ").reader queueing for reap" << dendl; - ::close(sd); - rank.lock.Lock(); - { - rank.pipe_reap_queue.push_back(this); - rank.wait_cond.Signal(); - } - rank.lock.Unlock(); - } -} - - -/* write msgs to socket. - * also, client. - */ -void Rank::Pipe::writer() -{ - if (!server) { - int rc = connect(); - if (rc < 0) { - derr(1) << "pipe(" << peer_addr << ' ' << this << ").writer error connecting, " - << errno << ": " << strerror(errno) - << dendl; - done = true; - list out; - fail(out); - } - } - - // disable Nagle algorithm? 
- if (g_conf.ms_tcp_nodelay) { - int flag = 1; - int r = ::setsockopt(sd, IPPROTO_TCP, TCP_NODELAY, (char*)&flag, sizeof(flag)); - if (r < 0) - dout(0) << "pipe(" << peer_addr << ' ' << this << ").writer couldn't set TCP_NODELAY: " << strerror(errno) << dendl; - } - - // loop. - lock.Lock(); - while (!q.empty() || !done) { - - if (!q.empty()) { - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer grabbing message(s)" << dendl; - - // grab outgoing list - list out; - out.swap(q); - - // drop lock while i send these - lock.Unlock(); - - while (!out.empty()) { - Message *m = out.front(); - out.pop_front(); - - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer sending " << m << " " << *m << dendl; - - // stamp. - m->set_source_addr(rank.my_addr); - - // marshall - if (m->empty_payload()) - m->encode_payload(); - - if (write_message(m) < 0) { - // failed! - derr(1) << "pipe(" << peer_addr << ' ' << this << ").writer error sending " << *m << " to " << m->get_dest() - << ", " << errno << ": " << strerror(errno) - << dendl; - out.push_front(m); - fail(out); - done = true; - break; - } - - // did i just send a close? - if (m->get_type() == MSG_CLOSE) - done = true; - - // clean up - delete m; - } - - lock.Lock(); - continue; - } - - // wait - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer sleeping" << dendl; - cond.Wait(lock); - } - lock.Unlock(); - - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer finishing" << dendl; - - // reap? 
- bool reap = false; - lock.Lock(); - { - writer_running = false; - if (!reader_running) reap = true; - } - lock.Unlock(); - - if (reap) { - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer queueing for reap" << dendl; - ::close(sd); - rank.lock.Lock(); - { - rank.pipe_reap_queue.push_back(this); - rank.wait_cond.Signal(); - } - rank.lock.Unlock(); - } -} - - -Message *Rank::Pipe::read_message() -{ - // envelope - //dout(10) << "receiver.read_message from sd " << sd << dendl; - - ceph_message_header env; - if (!tcp_read( sd, (char*)&env, sizeof(env) )) { - need_to_send_close = false; - return 0; - } - - dout(20) << "pipe(" << peer_addr << ' ' << this << ").reader got envelope type=" << env.type - << " src " << env.src << " dst " << env.dst - << " nchunks=" << env.nchunks - << dendl; - - // payload - bufferlist blist; - int32_t pos = 0; - list chunk_at; - for (unsigned i=0; iset_chunk_payload_at(chunk_at); - - dout(20) << "pipe(" << peer_addr << ' ' << this << ").reader got " << s << " byte message from " - << m->get_source() << dendl; - - return m; -} - - -int Rank::Pipe::do_sendmsg(Message *m, struct msghdr *msg, int len) -{ - while (len > 0) { - if (0) { // sanity - int l = 0; - for (unsigned i=0; imsg_iovlen; i++) - l += msg->msg_iov[i].iov_len; - assert(l == len); - } - - int r = ::sendmsg(sd, msg, 0); - if (r < 0) { - assert(r == -1); - derr(1) << "pipe(" << peer_addr << ' ' << this << ").writer error on sendmsg for " << *m - << " to " << m->get_dest() - << ", " << strerror(errno) - << dendl; - need_to_send_close = false; - return -1; - } - len -= r; - if (len == 0) break; - - // hrmph. trim r bytes off the front of our message. 
- dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer partial sendmsg for " << *m - << " to " << m->get_dest() - << " did " << r << ", still have " << len - << dendl; - while (r > 0) { - if (msg->msg_iov[0].iov_len <= (size_t)r) { - // lose this whole item - //dout(30) << "skipping " << msg->msg_iov[0].iov_len << ", " << (msg->msg_iovlen-1) << " v, " << r << " left" << dendl; - r -= msg->msg_iov[0].iov_len; - msg->msg_iov++; - msg->msg_iovlen--; - } else { - // partial! - //dout(30) << "adjusting " << msg->msg_iov[0].iov_len << ", " << msg->msg_iovlen << " v, " << r << " left" << dendl; - msg->msg_iov[0].iov_base = (void*)((long)msg->msg_iov[0].iov_base + r); - msg->msg_iov[0].iov_len -= r; - break; - } - } - } - return 0; -} - - -int Rank::Pipe::write_message(Message *m) -{ - // get envelope, buffers - ceph_message_header *env = &m->get_envelope(); - bufferlist blist; - blist.claim( m->get_payload() ); - - // chunk out page aligned buffers? - if (blist.length() == 0) - env->nchunks = 0; - else { - env->nchunks = 1 + m->get_chunk_payload_at().size(); // header + explicit chunk points - if (!m->get_chunk_payload_at().empty()) - dout(20) << "chunking at " << m->get_chunk_payload_at() - << " in " << *m << " len " << blist.length() - << dendl; - } - - dout(20) << "pipe(" << peer_addr << ' ' << this << ").write_message " << m << " " << *m - << " to " << m->get_dest() - << " in " << env->nchunks - << dendl; - - // set up msghdr and iovecs - struct msghdr msg; - memset(&msg, 0, sizeof(msg)); - struct iovec msgvec[1 + blist.buffers().size() + env->nchunks*2]; // conservative upper bound - msg.msg_iov = msgvec; - int msglen = 0; - - // send envelope - msgvec[0].iov_base = (char*)env; - msgvec[0].iov_len = sizeof(*env); - msglen += sizeof(*env); - msg.msg_iovlen++; - - // payload - list::const_iterator pb = blist.buffers().begin(); - list::const_iterator pc = m->get_chunk_payload_at().begin(); - int b_off = 0; // carry-over buffer offset, if any - int bl_pos = 0; // 
blist pos - int nchunks = env->nchunks; - int32_t chunksizes[nchunks]; - - for (int curchunk=0; curchunk < nchunks; curchunk++) { - // start a chunk - int32_t size = blist.length() - bl_pos; - if (pc != m->get_chunk_payload_at().end()) { - assert(*pc > bl_pos); - size = *pc - bl_pos; - dout(30) << "pos " << bl_pos << " explicit chunk at " << *pc << " size " << size << " of " << blist.length() << dendl; - pc++; - } - assert(size > 0); - dout(30) << "chunk " << curchunk << " pos " << bl_pos << " size " << size << dendl; - - // chunk size - chunksizes[curchunk] = size; - msgvec[msg.msg_iovlen].iov_base = &chunksizes[curchunk]; - msgvec[msg.msg_iovlen].iov_len = sizeof(int32_t); - msglen += sizeof(int32_t); - msg.msg_iovlen++; - - // chunk contents - int left = size; - while (left > 0) { - int donow = MIN(left, (int)pb->length()-b_off); - assert(donow > 0); - dout(30) << " bl_pos " << bl_pos << " b_off " << b_off - << " leftinchunk " << left - << " buffer len " << pb->length() - << " writing " << donow - << dendl; - - if (msg.msg_iovlen >= IOV_MAX-1) { - if (do_sendmsg(m, &msg, msglen)) - return -1; - - // and restart the iov - msg.msg_iov = msgvec; - msg.msg_iovlen = 0; - msglen = 0; - } - - msgvec[msg.msg_iovlen].iov_base = (void*)(pb->c_str()+b_off); - msgvec[msg.msg_iovlen].iov_len = donow; - msglen += donow; - msg.msg_iovlen++; - - left -= donow; - assert(left >= 0); - b_off += donow; - bl_pos += donow; - if (b_off != (int)pb->length()) - break; - pb++; - b_off = 0; - } - assert(left == 0); - } - assert(pb == blist.buffers().end()); - - // send - if (do_sendmsg(m, &msg, msglen)) - return -1; - - return 0; -} - - -void Rank::Pipe::fail(list& out) -{ - derr(10) << "pipe(" << peer_addr << ' ' << this << ").fail" << dendl; - - // FIXME: possible race before i reclaim lock here? 
- - // deactivate myself - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_addr) && - rank.rank_pipe[peer_addr] == this) - rank.rank_pipe.erase(peer_addr); - } - rank.lock.Unlock(); - - // what do i do about reader()? FIXME - - // sort my messages by (source) dispatcher, dest. - map > > by_dis; - lock.Lock(); - { - // include out at front of queue - q.splice(q.begin(), out); - - // sort - while (!q.empty()) { - if (q.front()->get_type() == MSG_CLOSE) { - delete q.front(); - } - else if (rank.local.count(q.front()->get_source())) { - EntityMessenger *mgr = rank.local[q.front()->get_source()]; - Dispatcher *dis = mgr->get_dispatcher(); - if (mgr->is_stopped()) { - // ignore. - dout(1) << "pipe(" << peer_addr << ' ' << this << ").fail on " << *q.front() << ", dispatcher stopping, ignoring." << dendl; - delete q.front(); - } else { - by_dis[dis][q.front()->get_dest()].push_back(q.front()); - } - } - else { - // oh well. sending entity musta just shut down? - delete q.front(); - } - q.pop_front(); - } - } - lock.Unlock(); - - // report failure(s) to dispatcher(s) - for (map > >::iterator i = by_dis.begin(); - i != by_dis.end(); - ++i) - for (map >::iterator j = i->second.begin(); - j != i->second.end(); - ++j) - for (list::iterator k = j->second.begin(); - k != j->second.end(); - ++k) { - derr(1) << "pipe(" << peer_addr << ' ' << this << ").fail on " << **k << " to " << (*k)->get_dest_inst() << dendl; - if (i->first) - i->first->ms_handle_failure(*k, (*k)->get_dest_inst()); - } -} - - - - - - -/******************************************** - * Rank - */ - -Rank::Rank() : - single_dispatcher(this), - started(false) { -} -Rank::~Rank() -{ -} - -/* -void Rank::set_listen_addr(tcpaddr_t& a) -{ - dout(10) << "set_listen_addr " << a << dendl; - memcpy((char*)&listen_addr.sin_addr.s_addr, (char*)&a.sin_addr.s_addr, 4); - listen_addr.sin_port = a.sin_port; -} -*/ - -void Rank::_submit_single_dispatch(Message *m) -{ - assert(lock.is_locked()); - - if 
(local.count(m->get_dest()) && - local[m->get_dest()]->is_ready()) { - rank.single_dispatch_queue.push_back(m); - rank.single_dispatch_cond.Signal(); - } else { - waiting_for_ready[m->get_dest()].push_back(m); - } -} - - -void Rank::single_dispatcher_entry() -{ - lock.Lock(); - while (!single_dispatch_stop || !single_dispatch_queue.empty()) { - if (!single_dispatch_queue.empty()) { - list ls; - ls.swap(single_dispatch_queue); - - lock.Unlock(); - { - while (!ls.empty()) { - Message *m = ls.front(); - ls.pop_front(); - - dout(1) << m->get_dest() - << " <-- " << m->get_source_inst() - << " ---- " << *m - << " -- " << m - << dendl; - - assert(local.count(m->get_dest())); - local[m->get_dest()]->dispatch(m); - } - } - lock.Lock(); - continue; - } - single_dispatch_cond.Wait(lock); - } - lock.Unlock(); -} - - -/* - * note: assumes lock is held - */ -void Rank::reaper() -{ - dout(10) << "reaper" << dendl; - assert(lock.is_locked()); - - while (!pipe_reap_queue.empty()) { - Pipe *p = pipe_reap_queue.front(); - dout(10) << "reaper reaping pipe " << p->get_peer_addr() << dendl; - pipe_reap_queue.pop_front(); - assert(pipes.count(p)); - pipes.erase(p); - p->join(); - dout(10) << "reaper reaped pipe " << p->get_peer_addr() << dendl; - delete p; - } -} - - -int Rank::start_rank() -{ - lock.Lock(); - if (started) { - dout(10) << "start_rank already started" << dendl; - lock.Unlock(); - return 0; - } - dout(10) << "start_rank" << dendl; - lock.Unlock(); - - // bind to a socket - if (accepter.start() < 0) - return -1; - - // start single thread dispatcher? - if (g_conf.ms_single_dispatch) { - single_dispatch_stop = false; - single_dispatcher.create(); - } - - lock.Lock(); - - dout(1) << "start_rank at " << my_addr << dendl; - started = true; - lock.Unlock(); - return 0; -} - - - -/* connect_rank - * NOTE: assumes rank.lock held. 
- */ -Rank::Pipe *Rank::connect_rank(const entity_addr_t& addr) -{ - assert(rank.lock.is_locked()); - assert(addr != rank.my_addr); - - dout(10) << "connect_rank to " << addr << ", creating pipe and registering" << dendl; - - // create pipe - Pipe *pipe = new Pipe(addr); - rank.rank_pipe[addr] = pipe; - pipes.insert(pipe); - - // register - rank.rank_pipe[addr] = pipe; - - return pipe; -} - - - - - - -Rank::EntityMessenger *Rank::find_unnamed(entity_name_t a) -{ - // find an unnamed (and _ready_) local entity of the right type - for (map::iterator p = local.begin(); - p != local.end(); - ++p) { - if (p->first.type() == a.type() && p->first.is_new() && - p->second->is_ready()) - return p->second; - } - return 0; -} - - - - -/* register_entity - */ -Rank::EntityMessenger *Rank::register_entity(entity_name_t name) -{ - dout(10) << "register_entity " << name << dendl; - lock.Lock(); - - // create messenger - EntityMessenger *msgr = new EntityMessenger(name); - - // add to directory - assert(local.count(name) == 0); - local[name] = msgr; - - lock.Unlock(); - return msgr; -} - - -void Rank::unregister_entity(EntityMessenger *msgr) -{ - lock.Lock(); - dout(10) << "unregister_entity " << msgr->get_myname() << dendl; - - // remove from local directory. - entity_name_t name = msgr->get_myname(); - assert(local.count(name)); - local.erase(name); - - stopped.insert(name); - wait_cond.Signal(); - - lock.Unlock(); -} - - -void Rank::submit_message(Message *m, const entity_addr_t& dest_addr) -{ - const entity_name_t dest = m->get_dest(); - - // lookup - EntityMessenger *entity = 0; - Pipe *pipe = 0; - - lock.Lock(); - { - // local? 
- if (dest_addr == my_addr) { - if (local.count(dest)) { - // local - dout(20) << "submit_message " << *m << " dest " << dest << " local" << dendl; - if (g_conf.ms_single_dispatch) { - _submit_single_dispatch(m); - } else { - entity = local[dest]; - } - } else { - derr(0) << "submit_message " << *m << " dest " << dest << " " << dest_addr << " local but not in local map?" << dendl; - //assert(0); // hmpf, this is probably mds->mon beacon from newsyn. - } - } - else { - // remote. - if (rank_pipe.count( dest_addr )) { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_addr << ", already connected." << dendl; - // connected. - pipe = rank_pipe[ dest_addr ]; - } else { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_addr << ", connecting." << dendl; - // not connected. - pipe = connect_rank( dest_addr ); - } - } - } - - // do it - if (entity) { - // local! - dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << dendl; - entity->queue_message(m); - } - else if (pipe) { - // remote! - dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << dendl; - pipe->send(m); - } - - lock.Unlock(); -} - - - - - -void Rank::wait() -{ - lock.Lock(); - while (1) { - // reap dead pipes - reaper(); - - if (local.empty()) { - dout(10) << "wait: everything stopped" << dendl; - break; // everything stopped. - } else { - dout(10) << "wait: local still has " << local.size() << " items, waiting" << dendl; - } - - wait_cond.Wait(lock); - } - lock.Unlock(); - - // done! clean up. 
- dout(20) << "wait: stopping accepter thread" << dendl; - accepter.stop(); - dout(20) << "wait: stopped accepter thread" << dendl; - - // stop dispatch thread - if (g_conf.ms_single_dispatch) { - dout(10) << "wait: stopping dispatch thread" << dendl; - lock.Lock(); - single_dispatch_stop = true; - single_dispatch_cond.Signal(); - lock.Unlock(); - single_dispatcher.join(); - } - - // close+reap all pipes - lock.Lock(); - { - dout(10) << "wait: closing pipes" << dendl; - list toclose; - for (hash_map::iterator i = rank_pipe.begin(); - i != rank_pipe.end(); - i++) - toclose.push_back(i->second); - for (list::iterator i = toclose.begin(); - i != toclose.end(); - i++) { - (*i)->unregister(); - (*i)->close(); - } - - reaper(); - dout(10) << "wait: waiting for pipes " << pipes << " to close" << dendl; - while (!pipes.empty()) { - wait_cond.Wait(lock); - reaper(); - } - } - lock.Unlock(); - - dout(10) << "wait: done." << dendl; - dout(1) << "shutdown complete." << dendl; -} - - - - - - -/********************************** - * EntityMessenger - */ - -void Rank::EntityMessenger::dispatch_entry() -{ - lock.Lock(); - while (!stop) { - if (!dispatch_queue.empty() || !prio_dispatch_queue.empty()) { - list ls; - if (!prio_dispatch_queue.empty()) { - ls.swap(prio_dispatch_queue); - pqlen = 0; - } else { - if (0) { - ls.swap(dispatch_queue); - qlen = 0; - } else { - // limit how much low-prio stuff we grab, to avoid starving high-prio messages! 
- ls.push_back(dispatch_queue.front()); - dispatch_queue.pop_front(); - qlen--; - } - } - - lock.Unlock(); - { - // deliver - while (!ls.empty()) { - if (stop) { - dout(1) << "dispatch: stop=true, discarding " << ls.size() - << " messages in dispatch queue" << dendl; - break; - } - Message *m = ls.front(); - ls.pop_front(); - dout(1) << m->get_dest() - << " <== " << m->get_source_inst() - << " ==== " << *m - << " ==== " << m - << dendl; - dispatch(m); - dout(20) << "done calling dispatch on " << m << dendl; - } - } - lock.Lock(); - continue; - } - cond.Wait(lock); - } - lock.Unlock(); - - // deregister - rank.unregister_entity(this); -} - -void Rank::EntityMessenger::ready() -{ - dout(10) << "ready " << get_myaddr() << dendl; - assert(!dispatch_thread.is_started()); - - if (g_conf.ms_single_dispatch) { - rank.lock.Lock(); - if (rank.waiting_for_ready.count(get_myname())) { - rank.single_dispatch_queue.splice(rank.single_dispatch_queue.end(), - rank.waiting_for_ready[get_myname()]); - rank.waiting_for_ready.erase(get_myname()); - rank.single_dispatch_cond.Signal(); - } - rank.lock.Unlock(); - } else { - // start my dispatch thread - dispatch_thread.create(); - } -} - - -int Rank::EntityMessenger::shutdown() -{ - dout(10) << "shutdown " << get_myaddr() << dendl; - - // stop my dispatch thread - if (dispatch_thread.am_self()) { - dout(10) << "shutdown i am dispatch, setting stop flag" << dendl; - stop = true; - } else { - dout(10) << "shutdown i am not dispatch, setting stop flag and joining thread." << dendl; - lock.Lock(); - stop = true; - cond.Signal(); - lock.Unlock(); - } - - return 0; -} - -void Rank::EntityMessenger::suicide() -{ - dout(10) << "suicide " << get_myaddr() << dendl; - shutdown(); - // hmm, or exit(0)? 
-} - -void Rank::EntityMessenger::prepare_dest(const entity_addr_t& addr) -{ - rank.lock.Lock(); - { - if (rank.rank_pipe.count(addr) == 0) - rank.connect_rank(addr); - } - rank.lock.Unlock(); -} - -int Rank::EntityMessenger::send_message(Message *m, entity_inst_t dest, - int port, int fromport) -{ - // set envelope - m->set_source(get_myname(), fromport); - m->set_source_addr(rank.my_addr); - m->set_dest_inst(dest); - m->set_dest_port(port); - - dout(1) << m->get_source() - << " --> " << dest.name << " " << dest.addr - << " -- " << *m - << " -- " << m - << dendl; - - rank.submit_message(m, dest.addr); - - return 0; -} - -int Rank::EntityMessenger::send_first_message(Dispatcher *d, - Message *m, entity_inst_t dest, - int port, int fromport) -{ - /* hacky thing for csyn and newsyn: - * set dispatcher (go active) AND set sender for this - * message while holding rank.lock. this prevents any - * races against incoming unnamed messages naming us before - * we fire off our first message. - */ - rank.lock.Lock(); - set_dispatcher(d); - - // set envelope - m->set_source(get_myname(), fromport); - m->set_source_addr(rank.my_addr); - m->set_dest_inst(dest); - m->set_dest_port(port); - rank.lock.Unlock(); - - dout(1) << m->get_source() - << " --> " << dest.name << " " << dest.addr - << " -- " << *m - << " -- " << m - << dendl; - - rank.submit_message(m, dest.addr); - - return 0; -} - -void Rank::EntityMessenger::reset_myname(entity_name_t newname) -{ - rank.lock.Lock(); - { - entity_name_t oldname = get_myname(); - dout(10) << "reset_myname " << oldname << " to " << newname << dendl; - - rank.local.erase(oldname); - rank.local[newname] = this; - - _set_myname(newname); - } - rank.lock.Unlock(); -} - - -void Rank::EntityMessenger::mark_down(entity_addr_t a) -{ - rank.mark_down(a); -} - -void Rank::mark_down(entity_addr_t addr) -{ - lock.Lock(); - // FIXME - lock.Unlock(); -} - - diff --git a/branches/sage/mds/msg/SimpleMessenger.h b/branches/sage/mds/msg/SimpleMessenger.h 
deleted file mode 100644 index ae888a8f461b6..0000000000000 --- a/branches/sage/mds/msg/SimpleMessenger.h +++ /dev/null @@ -1,312 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __SIMPLEMESSENGER_H -#define __SIMPLEMESSENGER_H - - -#include -#include -using namespace std; -#include -#include -using namespace __gnu_cxx; - - -#include "include/types.h" - -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/Thread.h" - -#include "Messenger.h" -#include "Message.h" -#include "tcp.h" - - - -/* Rank - per-process - */ -class Rank { -public: - void sigint(); - -private: - class EntityMessenger; - class Pipe; - - // incoming - class Accepter : public Thread { - public: - bool done; - - int listen_sd; - - Accepter() : done(false) {} - - void *entry(); - void stop(); - int start(); - } accepter; - - void sigint(int r); - - - // pipe - class Pipe { - protected: - int sd; - bool done; - entity_addr_t peer_addr; - bool server; - bool need_to_send_close; - - bool reader_running; - bool writer_running; - - list q; - Mutex lock; - Cond cond; - - int accept(); // server handshake - int connect(); // client handshake - void reader(); - void writer(); - - Message *read_message(); - int write_message(Message *m); - int do_sendmsg(Message *m, struct msghdr *msg, int len); - void fail(list& ls); - - // threads - class Reader : public Thread { - Pipe *pipe; - public: - Reader(Pipe *p) : pipe(p) {} - void *entry() { pipe->reader(); return 0; } - } reader_thread; - friend class Reader; - - class Writer : public Thread { - Pipe *pipe; - public: - Writer(Pipe *p) : pipe(p) {} - void *entry() { 
pipe->writer(); return 0; } - } writer_thread; - friend class Writer; - - public: - Pipe(int s) : sd(s), - done(false), server(true), - need_to_send_close(true), - reader_running(false), writer_running(false), - reader_thread(this), writer_thread(this) { - // server - reader_running = true; - reader_thread.create(); - } - Pipe(const entity_addr_t &pi) : sd(0), - done(false), peer_addr(pi), server(false), - need_to_send_close(true), - reader_running(false), writer_running(false), - reader_thread(this), writer_thread(this) { - // client - writer_running = true; - writer_thread.create(); - } - - // public constructors - static const Pipe& Server(int s); - static const Pipe& Client(const entity_addr_t& pi); - - entity_addr_t& get_peer_addr() { return peer_addr; } - - void unregister(); - void close(); - void join() { - if (writer_thread.is_started()) writer_thread.join(); - if (reader_thread.is_started()) reader_thread.join(); - } - - void send(Message *m) { - lock.Lock(); - q.push_back(m); - cond.Signal(); - lock.Unlock(); - } - void send(list& ls) { - lock.Lock(); - q.splice(q.end(), ls); - cond.Signal(); - lock.Unlock(); - } - - void force_close() { - ::close(sd); - } - }; - - - - // messenger interface - class EntityMessenger : public Messenger { - Mutex lock; - Cond cond; - list dispatch_queue; - list prio_dispatch_queue; - bool stop; - int qlen, pqlen; - - class DispatchThread : public Thread { - EntityMessenger *m; - public: - DispatchThread(EntityMessenger *_m) : m(_m) {} - void *entry() { - m->dispatch_entry(); - return 0; - } - } dispatch_thread; - void dispatch_entry(); - - public: - void queue_message(Message *m) { - // set recv stamp - m->set_recv_stamp(g_clock.now()); - - lock.Lock(); - if (m->get_source().is_mon()) { - prio_dispatch_queue.push_back(m); - pqlen++; - } else { - qlen++; - dispatch_queue.push_back(m); - } - cond.Signal(); - lock.Unlock(); - } - - public: - EntityMessenger(entity_name_t name) : - Messenger(name), - stop(false), - qlen(0), 
pqlen(0), - dispatch_thread(this) { } - ~EntityMessenger() { - // join dispatch thread - if (dispatch_thread.is_started()) - dispatch_thread.join(); - } - - void ready(); - bool is_stopped() { return stop; } - - void wait() { - dispatch_thread.join(); - } - - int get_dispatch_queue_len() { return qlen + pqlen; } - - void reset_myname(entity_name_t m); - - int shutdown(); - void suicide(); - void prepare_dest(const entity_addr_t& addr); - int send_message(Message *m, entity_inst_t dest, - int port=0, int fromport=0); - int send_first_message(Dispatcher *d, - Message *m, entity_inst_t dest, - int port=0, int fromport=0); - - void mark_down(entity_addr_t a); - void mark_up(entity_name_t a, entity_addr_t& i); - }; - - - class SingleDispatcher : public Thread { - Rank *rank; - public: - SingleDispatcher(Rank *r) : rank(r) {} - void *entry() { - rank->single_dispatcher_entry(); - return 0; - } - } single_dispatcher; - - Cond single_dispatch_cond; - bool single_dispatch_stop; - list single_dispatch_queue; - - map > waiting_for_ready; - - void single_dispatcher_entry(); - void _submit_single_dispatch(Message *m); - - - // Rank stuff - public: - Mutex lock; - Cond wait_cond; // for wait() - bool started; - - // where i listen - entity_addr_t my_addr; - - // local - map local; - set stopped; - //hash_set entity_unstarted; - - // remote - hash_map rank_pipe; - - set pipes; - list pipe_reap_queue; - - Pipe *connect_rank(const entity_addr_t& addr); - - void mark_down(entity_addr_t addr); - //void mark_up(entity_name_t addr, entity_addr_t& i); - - entity_addr_t get_my_addr() { return my_addr; } - - void reaper(); - - EntityMessenger *find_unnamed(entity_name_t a); - -public: - Rank(); - ~Rank(); - - //void set_listen_addr(tcpaddr_t& a); - - int start_rank(); - void wait(); - - EntityMessenger *register_entity(entity_name_t addr); - void rename_entity(EntityMessenger *ms, entity_name_t newaddr); - void unregister_entity(EntityMessenger *ms); - - void submit_message(Message *m, 
const entity_addr_t& addr); - void prepare_dest(const entity_addr_t& addr); - - // create a new messenger - EntityMessenger *new_entity(entity_name_t addr); - -} ; - - - -extern Rank rank; - -#endif diff --git a/branches/sage/mds/msg/msg_types.h b/branches/sage/mds/msg/msg_types.h deleted file mode 100644 index d5ce108583564..0000000000000 --- a/branches/sage/mds/msg/msg_types.h +++ /dev/null @@ -1,192 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MSG_TYPES_H -#define __MSG_TYPES_H - -#include "include/types.h" -#include "include/blobhash.h" -#include "tcp.h" - -class entity_name_t { - struct ceph_entity_name v; - -public: - static const int TYPE_MON = CEPH_ENTITY_TYPE_MON; - static const int TYPE_MDS = CEPH_ENTITY_TYPE_MDS; - static const int TYPE_OSD = CEPH_ENTITY_TYPE_OSD; - static const int TYPE_CLIENT = CEPH_ENTITY_TYPE_CLIENT; - static const int TYPE_ADMIN = CEPH_ENTITY_TYPE_ADMIN; - - static const int NEW = -1; - - // cons - entity_name_t() { v.type = v.num = 0; } - entity_name_t(int t, int n) { v.type = t; v.num = n; } - - // static cons - static entity_name_t MON(int i=NEW) { return entity_name_t(TYPE_MON, i); } - static entity_name_t MDS(int i=NEW) { return entity_name_t(TYPE_MDS, i); } - static entity_name_t OSD(int i=NEW) { return entity_name_t(TYPE_OSD, i); } - static entity_name_t CLIENT(int i=NEW) { return entity_name_t(TYPE_CLIENT, i); } - static entity_name_t ADMIN(int i=NEW) { return entity_name_t(TYPE_ADMIN, i); } - - int num() const { return v.num; } - int type() const { return v.type; } - const char *type_str() const { - switch (type()) { - case TYPE_MDS: 
return "mds"; - case TYPE_OSD: return "osd"; - case TYPE_MON: return "mon"; - case TYPE_CLIENT: return "client"; - case TYPE_ADMIN: return "admin"; - default: return "unknown"; - } - } - - bool is_new() const { return num() < 0; } - - bool is_client() const { return type() == TYPE_CLIENT; } - bool is_mds() const { return type() == TYPE_MDS; } - bool is_osd() const { return type() == TYPE_OSD; } - bool is_mon() const { return type() == TYPE_MON; } - bool is_admin() const { return type() == TYPE_ADMIN; } -}; - -inline bool operator== (const entity_name_t& l, const entity_name_t& r) { - return (l.type() == r.type()) && (l.num() == r.num()); } -inline bool operator!= (const entity_name_t& l, const entity_name_t& r) { - return (l.type() != r.type()) || (l.num() != r.num()); } -inline bool operator< (const entity_name_t& l, const entity_name_t& r) { - return (l.type() < r.type()) || (l.type() == r.type() && l.num() < r.num()); } - -inline std::ostream& operator<<(std::ostream& out, const entity_name_t& addr) { - //if (addr.is_namer()) return out << "namer"; - if (addr.is_new() || addr.num() < 0) - return out << addr.type_str() << "?"; - else - return out << addr.type_str() << addr.num(); -} -inline std::ostream& operator<<(std::ostream& out, const ceph_entity_name& addr) { - return out << *(const entity_name_t*)&addr; -} - -namespace __gnu_cxx { - template<> struct hash< entity_name_t > - { - size_t operator()( const entity_name_t m ) const - { - static blobhash H; - return H((const char*)&m, sizeof(m)); - } - }; -} - - - -/* - * an entity's network address. - * includes a random value that prevents it from being reused. - * thus identifies a particular process instance. - * ipv4 for now. 
- */ -struct entity_addr_t { - struct ceph_entity_addr v; - - entity_addr_t() { - memset(&v, 0, sizeof(v)); - } - - void set_addr(tcpaddr_t a) { - memcpy((char*)v.ipq, (char*)&a.sin_addr.s_addr, 4); - v.port = ntohs(a.sin_port); - } - void make_addr(tcpaddr_t& a) const { - memset(&a, 0, sizeof(a)); - a.sin_family = AF_INET; - memcpy((char*)&a.sin_addr.s_addr, (char*)v.ipq, 4); - a.sin_port = htons(v.port); - } -}; - -inline ostream& operator<<(ostream& out, const entity_addr_t &addr) -{ - return out << (int)addr.v.ipq[0] - << '.' << (int)addr.v.ipq[1] - << '.' << (int)addr.v.ipq[2] - << '.' << (int)addr.v.ipq[3] - << ':' << addr.v.port - << '.' << addr.v.nonce; -} - -inline bool operator==(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) == 0; } -inline bool operator!=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) != 0; } -inline bool operator<(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) < 0; } -inline bool operator<=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) <= 0; } -inline bool operator>(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) > 0; } -inline bool operator>=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) >= 0; } - -namespace __gnu_cxx { - template<> struct hash< entity_addr_t > - { - size_t operator()( const entity_addr_t& x ) const - { - static blobhash H; - return H((const char*)&x, sizeof(x)); - } - }; -} - - -/* - * a particular entity instance - */ -struct entity_inst_t { - entity_name_t name; - entity_addr_t addr; - entity_inst_t() {} - entity_inst_t(entity_name_t n, const entity_addr_t& a) : name(n), addr(a) {} -}; - - -inline bool operator==(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) == 0; } -inline bool operator!=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, 
sizeof(a)) != 0; } -inline bool operator<(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) < 0; } -inline bool operator<=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) <= 0; } -inline bool operator>(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) > 0; } -inline bool operator>=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) >= 0; } - -namespace __gnu_cxx { - template<> struct hash< entity_inst_t > - { - size_t operator()( const entity_inst_t& x ) const - { - static blobhash H; - return H((const char*)&x, sizeof(x)); - } - }; -} - -inline ostream& operator<<(ostream& out, const entity_inst_t &i) -{ - return out << i.name << " " << i.addr; -} -inline ostream& operator<<(ostream& out, const ceph_entity_inst &i) -{ - return out << *(const entity_inst_t*)&i; -} - - - -#endif diff --git a/branches/sage/mds/msg/tcp.cc b/branches/sage/mds/msg/tcp.cc deleted file mode 100644 index a131e3d6dd7dc..0000000000000 --- a/branches/sage/mds/msg/tcp.cc +++ /dev/null @@ -1,93 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "tcp.h" - -/****************** - * tcp crap - */ - -/* -inlined, see tcp.h - - -bool tcp_read(int sd, char *buf, int len) -{ - while (len > 0) { - int got = ::recv( sd, buf, len, 0 ); - if (got == 0) { - generic_dout(18) << "tcp_read socket " << sd << " closed" << dendl; - return false; - } - if (got < 0) { - generic_dout(18) << "tcp_read bailing with " << got << dendl; - return false; - } - assert(got >= 0); - len -= got; - buf += got; - //generic_dout(DBL) << "tcp_read got " << got << ", " << len << " left" << dendl; - } - return true; -} - -int tcp_write(int sd, char *buf, int len) -{ - //generic_dout(DBL) << "tcp_write writing " << len << dendl; - assert(len > 0); - while (len > 0) { - int did = ::send( sd, buf, len, 0 ); - if (did < 0) { - 
generic_dout(1) << "tcp_write error did = " << did << " errno " << errno << " " << strerror(errno) << dendl; - //derr(0) << "tcp_write error did = " << did << " errno " << errno << " " << strerror(errno) << dendl; - } - //assert(did >= 0); - if (did < 0) return did; - len -= did; - buf += did; - //generic_dout(DBL) << "tcp_write did " << did << ", " << len << " left" << dendl; - } - return 0; -} -*/ - -int tcp_hostlookup(char *str, tcpaddr_t& ta) -{ - char *host = str; - char *port = 0; - - for (int i=0; str[i]; i++) { - if (str[i] == ':') { - port = str+i+1; - str[i] = 0; - break; - } - } - if (!port) { - cerr << "addr '" << str << "' doesn't look like 'host:port'" << std::endl; - return -1; - } - //cout << "host '" << host << "' port '" << port << "'" << std::endl; - - int iport = atoi(port); - - struct hostent *myhostname = gethostbyname( host ); - if (!myhostname) { - cerr << "host " << host << " not found" << std::endl; - return -1; - } - - memset(&ta, 0, sizeof(ta)); - - //cout << "addrtype " << myhostname->h_addrtype << " len " << myhostname->h_length << std::endl; - - ta.sin_family = myhostname->h_addrtype; - memcpy((char *)&ta.sin_addr, - myhostname->h_addr, - myhostname->h_length); - ta.sin_port = iport; - - cout << "lookup '" << host << ":" << port << "' -> " << ta << std::endl; - - return 0; -} diff --git a/branches/sage/mds/msg/tcp.h b/branches/sage/mds/msg/tcp.h deleted file mode 100644 index e234da400dfe4..0000000000000 --- a/branches/sage/mds/msg/tcp.h +++ /dev/null @@ -1,69 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -#ifndef __TCP_H -#define __TCP_H - -#include -#include -#include -#include - -typedef struct sockaddr_in tcpaddr_t; - -using std::ostream; - -inline ostream& operator<<(ostream& out, const tcpaddr_t &a) -{ - unsigned char addr[4]; - memcpy((char*)addr, (char*)&a.sin_addr.s_addr, 4); - out << (unsigned)addr[0] << "." - << (unsigned)addr[1] << "." 
- << (unsigned)addr[2] << "." - << (unsigned)addr[3] << ":" - << ntohs(a.sin_port); - return out; -} - -inline bool tcp_read(int sd, char *buf, int len) { - while (len > 0) { - int got = ::recv( sd, buf, len, 0 ); - if (got <= 0) { - //generic_dout(18) << "tcp_read socket " << sd << " closed" << dendl; - return false; - } - len -= got; - buf += got; - //generic_dout(DBL) << "tcp_read got " << got << ", " << len << " left" << dendl; - } - return true; -} - -inline int tcp_write(int sd, const char *buf, int len) { - //generic_dout(DBL) << "tcp_write writing " << len << dendl; - assert(len > 0); - while (len > 0) { - int did = ::send( sd, buf, len, 0 ); - if (did < 0) { - //generic_dout(1) << "tcp_write error did = " << did << " errno " << errno << " " << strerror(errno) << dendl; - //generic_derr(1) << "tcp_write error did = " << did << " errno " << errno << " " << strerror(errno) << dendl; - return did; - } - len -= did; - buf += did; - //generic_dout(DBL) << "tcp_write did " << did << ", " << len << " left" << dendl; - } - return 0; -} - - -extern int tcp_hostlookup(char *str, tcpaddr_t& ta); - -inline bool operator==(const tcpaddr_t& a, const tcpaddr_t& b) { - return strncmp((const char*)&a, (const char*)&b, sizeof(a)) == 0; -} -inline bool operator!=(const tcpaddr_t& a, const tcpaddr_t& b) { - return strncmp((const char*)&a, (const char*)&b, sizeof(a)) != 0; -} - - -#endif diff --git a/branches/sage/mds/newsyn.cc b/branches/sage/mds/newsyn.cc deleted file mode 100644 index e580e49a9b7e9..0000000000000 --- a/branches/sage/mds/newsyn.cc +++ /dev/null @@ -1,438 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. 
See file COPYING. - * - */ - -#define intabs(x) ((x) >= 0 ? (x):(-(x))) - -#include - -#include -#include -#include -using namespace std; - -#include - -#include "config.h" - -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "mon/Monitor.h" -#include "client/Client.h" -#include "client/SyntheticClient.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - -class C_Test : public Context { -public: - void finish(int r) { - cout << "C_Test->finish(" << r << ")" << std::endl; - } -}; - -extern std::map g_fake_kill_after; - - -/* - * start up NewMessenger via MPI. - */ - -pair mpi_bootstrap_new(int& argc, char**& argv, MonMap *monmap) -{ - MPI_Init(&argc, &argv); - - int mpi_world; - int mpi_rank; - MPI_Comm_size(MPI_COMM_WORLD, &mpi_world); - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - - // first, synchronize clocks. - if (g_conf.clock_tare) { - if (1) { - // use an MPI barrier. probably not terribly precise. - MPI_Barrier(MPI_COMM_WORLD); - g_clock.tare(); - } else { - // use wall clock; assume NTP has all nodes synchronized already. - // FIXME someday: this hangs for some reason. whatever. - utime_t z = g_clock.now(); - MPI_Bcast( &z, sizeof(z), MPI_CHAR, - 0, MPI_COMM_WORLD); - cout << "z is " << z << std::endl; - g_clock.tare(z); - } - } - - // start up all monitors at known addresses. - entity_inst_t moninst[mpi_world]; // only care about first g_conf.num_mon of these. 
- - rank.start_rank(); // bind and listen - - if (mpi_rank < g_conf.num_mon) { - moninst[mpi_rank].addr = rank.my_addr; - moninst[mpi_rank].name = entity_name_t(entity_name_t::TYPE_MON, mpi_rank); - - //cerr << mpi_rank << " at " << rank.get_listen_addr() << std::endl; - } - - MPI_Gather( &moninst[mpi_rank], sizeof(entity_inst_t), MPI_CHAR, - moninst, sizeof(entity_inst_t), MPI_CHAR, - 0, MPI_COMM_WORLD); - - if (mpi_rank == 0) { - for (int i=0; imon_inst[i] = moninst[i]; - } - } - - - // distribute monmap - bufferlist bl; - if (mpi_rank == 0) { - monmap->encode(bl); - monmap->write(".ceph_monmap"); - } else { - int l = g_conf.num_mon * 1000; // nice'n big. - bufferptr bp(l); - bl.append(bp); - } - - MPI_Bcast(bl.c_str(), bl.length(), MPI_CHAR, - 0, MPI_COMM_WORLD); - - if (mpi_rank > 0) { - monmap->decode(bl); - } - - // wait for everyone! - MPI_Barrier(MPI_COMM_WORLD); - - return pair(mpi_rank, mpi_world); -} - -utime_t tick_start; -int tick_count = 0; - -class C_Tick : public Context { -public: - void finish(int) { - utime_t now = g_clock.now() - tick_start; - cout << "tick +" << g_conf.tick << " -> " << now << " (" << tick_count << ")" << std::endl; - tick_count += g_conf.tick; - utime_t next = tick_start; - next.sec_ref() += tick_count; - g_timer.add_event_at(next, new C_Tick); - } -}; - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << std::endl; - _exit(1); - } -}; - -class C_Debug : public Context { - public: - void finish(int) { - int size = (long)&g_conf.debug_after - (long)&g_conf.debug; - memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size); - cout << "debug_after flipping debug settings" << std::endl; - //g_conf.debug_ms = 1; - } -}; - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - map kill_osd_after; - if (1) { - vector nargs; - for (unsigned i=0; i 0 ? g_conf.num_mon:0; - int start_mds = g_conf.num_mds > 0 ? 
g_conf.num_mds:0; - int start_osd = g_conf.num_osd > 0 ? g_conf.num_osd:0; - int start_client = g_conf.num_client > 0 ? g_conf.num_client:0; - - //g_conf.num_mon = intabs(g_conf.num_mon); - g_conf.num_mds = intabs(g_conf.num_mds); - g_conf.num_client = intabs(g_conf.num_client); - g_conf.num_osd = intabs(g_conf.num_osd); - - - if (g_conf.kill_after) - g_timer.add_event_after(g_conf.kill_after, new C_Die); - if (g_conf.debug_after) - g_timer.add_event_after(g_conf.debug_after, new C_Debug); - - if (g_conf.tick) { - tick_start = g_clock.now(); - g_timer.add_event_after(g_conf.tick, new C_Tick); - } - - vector nargs; - for (unsigned i=0; i mpiwho = mpi_bootstrap_new(argc, argv, monmap); - int myrank = mpiwho.first; - int world = mpiwho.second; - - int need = 0; - if (g_conf.ms_skip_rank0) need++; - need += start_mds; - if (g_conf.ms_stripe_osds) - need++; - else - need += start_osd; - if (start_client) { - if (!g_conf.ms_overlay_clients) - need += 1; - } - assert(need <= world); - - if (myrank == 0) - cerr << "nummds " << start_mds << " numosd " << start_osd << " numclient " << start_client << " .. need " << need << ", have " << world << std::endl; - - - char hostname[100]; - gethostname(hostname,100); - int pid = getpid(); - - int started = 0; - - //if (myrank == 0) g_conf.debug = 20; - - // courtesy symlinks - char ffrom[100]; - char fto[100]; - sprintf(fto, "%s.%d", hostname, pid); - - - // create mon - if (myrank < g_conf.num_mon) { - Monitor *mon = new Monitor(myrank, rank.register_entity(entity_name_t(entity_name_t::TYPE_MON, myrank)), monmap); - mon->init(); - if (g_conf.dout_dir) { - sprintf(ffrom, "%s/mon%d", g_conf.dout_dir, myrank); - ::symlink(fto, ffrom); - } - } - - // wait for monitors to start. - MPI_Barrier(MPI_COMM_WORLD); - - // okay, home free! 
- MPI_Finalize(); - - - // create mds - map mds; - map mdsosd; - for (int i=0; iinit(); - started++; - - if (g_conf.mds_local_osd) { - int n = i+g_conf.mds_local_osd_offset; - mdsosd[i] = new OSD(n, rank.register_entity(entity_name_t(entity_name_t::TYPE_OSD, n)), monmap); - mdsosd[i]->init(); - } - - if (g_fake_kill_after.count(entity_name_t::MDS(i))) { - cerr << "mds" << i << " will die after " << g_fake_kill_after[entity_name_t::MDS(i)] << std::endl; - g_timer.add_event_after(g_fake_kill_after[entity_name_t::MDS(i)], new C_Die); - } - } - - // create osd - map osd; - int max_osd_nodes = world - start_mds - g_conf.ms_skip_rank0; // assumes 0 clients, if we stripe. - int osds_per_node = (start_osd-1)/max_osd_nodes + 1; - for (int i=0; iinit() < 0) - return 1; - started++; - } - - if (g_conf.ms_overlay_clients) sleep(5); - - // create client - int skip_osd = start_osd; - if (g_conf.ms_overlay_clients) - skip_osd = 0; // put clients with osds too! - int client_nodes = world - start_mds - skip_osd - g_conf.ms_skip_rank0; - int clients_per_node = 1; - if (start_client && client_nodes > 0) clients_per_node = (start_client-1) / client_nodes + 1; - set clientlist; - map client;//[start_client]; - map syn;//[start_client]; - int nclients = 0; - for (int i=0; i::iterator it = clientlist.begin(); - it != clientlist.end(); - it++) { - int i = *it; - - //cerr << "starting synthetic client" << i << " on rank " << myrank << std::endl; - syn[i]->start_thread(); - } - if (nclients) { - cerr << nclients << " clients at " << rank.my_addr << " " << hostname << "." << pid << std::endl; - } - - for (set::iterator it = clientlist.begin(); - it != clientlist.end(); - it++) { - int i = *it; - // cout << "waiting for synthetic client" << i << " to finish" << std::endl; - syn[i]->join_thread(); - // fix simpelmeessenger race first! 
- //delete syn[i]; - //delete client[i]; - } - - - if (myrank && !started) { - //dout(1) << "IDLE" << dendl; - cerr << "idle at " << rank.my_addr << " rank " << myrank << " " << hostname << "." << pid << std::endl; - } - - // wait for everything to finish - rank.wait(); - - cerr << "newsyn done on " << hostname << "." << pid << std::endl; - - // cd on exit, so that gmon.out (if any) goes into a separate directory for each node. - char s[20]; - sprintf(s, "gmon/%d", myrank); - mkdir(s, 0755); - chdir(s); - - return 0; // whatever, cleanup hangs sometimes (stopping ebofs threads?). - - // cleanup - for (map::iterator i = mds.begin(); i != mds.end(); i++) - delete i->second; - for (map::iterator i = mdsosd.begin(); i != mdsosd.end(); i++) - delete i->second; - for (map::iterator i = osd.begin(); i != osd.end(); i++) - delete i->second; - /* - for (map::iterator i = client.begin(); i != client.end(); i++) - delete i->second; - for (map::iterator i = syn.begin(); i != syn.end(); i++) - delete i->second; - */ - /* - for (int i=0; i - -Ceph - scalable distributed file system - -This is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License version 2.1, as published by the Free Software -Foundation. See file COPYING. */ - - -#include -#include -#include -#include "OSBDB.h" -#include "common/Timer.h" - -using namespace std; - -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_bdbstore) *_dout << dbeginl << "bdbstore(" << device << ")@" << __LINE__ << "." -#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_bdbstore) *_derr << dbeginl << "bdbstore(" << device << ")@" << __LINE__ << "." - -#define CLEANUP(onsafe) do { \ - dout(6) << "DELETE " << hex << onsafe << dec << dendl; \ - delete onsafe; \ - } while (0) -#define COMMIT(onsafe) do { \ - dout(6) << "COMMIT " << hex << onsafe << dec << dendl; \ - sync(onsafe); \ - } while (0) - - // Have a lock, already. 
- -class scoped_lock -{ -private: - Mutex *m; -public: - scoped_lock(Mutex *m) : m(m) { m->Lock(); } - ~scoped_lock() { m->Unlock(); } -}; - - // Utilities. - -// Starting off with my own bsearch; mail reader to follow... - -// Perform a binary search on a sorted array, returning the insertion -// point for key, or key if it is exactly found. In other words, this -// will return a pointer to the element that will come after key if -// key were to be inserted into the sorted array. -// -// Requires that T have < and > operators defined. -template -uint32_t binary_search (T *array, size_t size, T key) -{ - int low = 0; - int high = size; - int p = (low + high) / 2; - - while (low < high - 1) - { - if (array[p] > key) - { - high = p; - } - else if (array[p] < key) - { - low = p; - } - else - return p; - - p = (low + high) / 2; - } - - if (array[p] < key) - p++; - else if (array[p] > key && p > 0) - p--; - return p; -} - - // Management. - -DbEnv *OSBDB::getenv () -{ - DbEnv *envp = new DbEnv (DB_CXX_NO_EXCEPTIONS); - if (g_conf.debug > 1 || g_conf.debug_bdbstore > 1) - envp->set_error_stream (&std::cerr); - if (g_conf.debug > 2 || g_conf.debug_bdbstore > 2) - envp->set_message_stream (&std::cout); - envp->set_flags (DB_LOG_INMEMORY, 1); - //env->set_flags (DB_DIRECT_DB, 1); - int env_flags = (DB_CREATE - | DB_THREAD - //| DB_INIT_LOCK - | DB_INIT_MPOOL - //| DB_INIT_TXN - //| DB_INIT_LOG - | DB_PRIVATE); - if (envp->open (NULL, env_flags, 0) != 0) - { - std::cerr << "failed to open environment " << std::endl; - assert(0); - } - return envp; -} - -int OSBDB::opendb(DBTYPE type, int flags, bool new_env) -{ - env = getenv(); - db = new Db(env, 0); - db->set_error_stream (&std::cerr); - db->set_message_stream (&std::cout); - db->set_flags (0); - if (!g_conf.bdbstore_btree) - { - if (g_conf.bdbstore_pagesize > 0) - db->set_pagesize (g_conf.bdbstore_pagesize); - if (g_conf.bdbstore_ffactor > 0 && g_conf.bdbstore_nelem > 0) - { - db->set_h_ffactor (g_conf.bdbstore_ffactor); 
- db->set_h_nelem (g_conf.bdbstore_nelem); - } - } - if (g_conf.bdbstore_cachesize > 0) - { - db->set_cachesize (0, g_conf.bdbstore_cachesize, 0); - } - - flags = flags | DB_THREAD; - if (transactional) - flags = flags | DB_AUTO_COMMIT; - - int ret; - if ((ret = db->open (NULL, device.c_str(), NULL, type, flags, 0)) != 0) - { - derr(1) << "failed to open database: " << device << ": " - << db_strerror(ret) << std::dendl; - return -EINVAL; - } - opened = true; - return 0; -} - -int OSBDB::mount() -{ - dout(2) << "mount " << device << dendl; - - if (mounted) - { - dout(4) << "..already mounted" << dendl; - return 0; - } - - if (!opened) - { - int ret; - if ((ret = opendb ()) != 0) - { - dout(4) << "..returns " << ret << dendl; - return ret; - } - } - - // XXX Do we want anything else in the superblock? - - Dbt key (OSBDB_SUPERBLOCK_KEY, 1); - stored_superblock super; - Dbt value (&super, sizeof (super)); - value.set_dlen (sizeof (super)); - value.set_ulen (sizeof (super)); - value.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - - if (db->get (NULL, &key, &value, 0) != 0) - { - dout(4) << "..get superblock fails" << dendl; - return -EINVAL; // XXX how to say "badly formed fs?" 
- } - - dout(3) << ".mount " << super << dendl; - - if (super.version != OSBDB_THIS_VERSION) - { - dout(4) << "version mismatch (" << super.version << ")" << dendl; - return -EINVAL; - } - - DBTYPE t; - db->get_type (&t); - - if (t == DB_BTREE) - { - u_int32_t minkey; - u_int32_t flags; - db->get_bt_minkey (&minkey); - db->get_flags (&flags); - dout(1) << "mounted version " << OSBDB_THIS_VERSION << "; Btree; " - << "min keys per page: " << minkey << "; flags: " - << hex << flags << dec << dendl; - cout << dec; - } - else - { - u_int32_t ffactor; - u_int32_t nelem; - u_int32_t flags; - db->get_h_ffactor (&ffactor); - db->get_h_nelem (&nelem); - db->get_flags (&flags); - dout(1) << "mounted version " << OSBDB_THIS_VERSION << "; Hash; " - << "fill factor: " << ffactor - << " table size: " << nelem << "; flags: " - << hex << flags << dec << dendl; - cout << dec; - } - - mounted = true; - dout(4) << "..mounted" << dendl; - return 0; -} - -int OSBDB::umount() -{ - if (!mounted) - return -EINVAL; - - dout(2) << "umount" << dendl; - - int ret; - if (opened) - { - if (transactional) - { - env->log_flush (NULL); - if ((ret = env->lsn_reset (device.c_str(), 0)) != 0) - { - derr(1) << "lsn_reset: " << db_strerror (ret) << dendl; - } - } - - db->sync (0); - - if ((ret = db->close (0)) != 0) - { - derr(1) << "close: " << db_strerror(ret) << dendl; - return -EINVAL; - } - delete db; - db = NULL; - - if (env) - { - env->close (0); - delete env; - env = NULL; - } - } - mounted = false; - opened = false; - dout(4) << "..unmounted" << dendl; - return 0; -} - -int OSBDB::mkfs() -{ - if (mounted) - return -EINVAL; - - dout(2) << "mkfs" << dendl; - - string d = env_dir; - d += device; - unlink (d.c_str()); - - int ret; - if ((ret = opendb((g_conf.bdbstore_btree ? 
DB_BTREE : DB_HASH), - DB_CREATE, true)) != 0) - { - derr(1) << "failed to open database: " << device << ": " - << db_strerror(ret) << std::dendl; - return -EINVAL; - } - opened = true; - dout(3) << "..opened " << device << dendl; - - uint32_t c; - ret = db->truncate (NULL, &c, 0); - if (ret != 0) - { - derr(1) << "db truncate failed: " << db_strerror (ret) << dendl; - return -EIO; // ??? - } - - Dbt key (OSBDB_SUPERBLOCK_KEY, 1); - struct stored_superblock sb; - sb.version = OSBDB_THIS_VERSION; - Dbt value (&sb, sizeof (sb)); - - dout(3) << "..writing superblock" << dendl; - if ((ret = db->put (NULL, &key, &value, 0)) != 0) - { - derr(1) << "failed to write superblock: " << db_strerror (ret) - << dendl; - return -EIO; - } - dout(3) << "..wrote superblock" << dendl; - dout(4) << "..mkfs done" << dendl; - return 0; -} - - // Objects. - -int OSBDB::pick_object_revision_lt(object_t& oid) -{ - // Not really needed. - dout(0) << "pick_object_revision_lt " << oid << dendl; - return -ENOSYS; -} - -bool OSBDB::exists(object_t oid) -{ - dout(2) << "exists " << oid << dendl; - struct stat st; - bool ret = (stat (oid, &st) == 0); - dout(4) << "..returns " << ret << dendl; - return ret; -} - -int OSBDB::statfs (struct statfs *st) -{ - // Hacky? - if (::statfs (device.c_str(), st) != 0) - { - int ret = -errno; - derr(1) << "statfs returns " << ret << dendl; - return ret; - } - st->f_type = OSBDB_MAGIC; - dout(4) << "..statfs OK" << dendl; - return 0; -} - -int OSBDB::stat(object_t oid, struct stat *st) -{ - if (!mounted) - { - dout(4) << "not mounted!" 
<< dendl; - return -EINVAL; - } - - dout(2) << "stat " << oid << dendl; - - object_inode_key ikey = new_object_inode_key(oid); - stored_object obj; - Dbt key (&ikey, sizeof_object_inode_key()); - Dbt value (&obj, sizeof (obj)); - value.set_flags (DB_DBT_USERMEM); - value.set_ulen (sizeof (obj)); - - dout(3) << " lookup " << ikey << dendl; - int ret; - if ((ret = db->get (NULL, &key, &value, 0)) != 0) - { - derr(1) << " get returned " << ret << dendl; - return -ENOENT; - } - - st->st_size = obj.length; - dout(3) << "stat length:" << obj.length << dendl; - dout(4) << "..stat OK" << dendl; - return 0; -} - -int OSBDB::remove(object_t oid, Context *onsafe) -{ - if (!mounted) - { - derr(1) << "not mounted!" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - dout(6) << "Context " << hex << onsafe << dec << dendl; - scoped_lock __lock(&lock); - dout(2) << "remove " << oid << dendl; - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - oid_t id; - mkoid(id, oid); - Dbt key (&id, sizeof (oid_t)); - int ret; - if ((ret = db->del (txn, &key, 0)) != 0) - { - derr(1) << ".del returned error: " << db_strerror (ret) << dendl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - object_inode_key _ikey = new_object_inode_key (oid); - Dbt ikey (&_ikey, sizeof_object_inode_key()); - if ((ret = db->del (txn, &ikey, 0)) != 0) - { - derr(1) << ".del returned error: " << db_strerror (ret) << dendl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - attrs_id aids = new_attrs_id (oid); - Dbt askey (&aids, sizeof_attrs_id()); - Dbt asval; - asval.set_flags (DB_DBT_MALLOC); - if (db->get (txn, &askey, &asval, 0) == 0) - { - // We have attributes; remove them. 
- stored_attrs *sap = (stored_attrs *) asval.get_data(); - auto_ptr sa (sap); - for (unsigned i = 0; i < sap->count; i++) - { - attr_id aid = new_attr_id (oid, sap->names[i].name); - Dbt akey (&aid, sizeof (aid)); - if ((ret = db->del (txn, &akey, 0)) != 0) - { - derr(1) << ".del returns error: " << db_strerror (ret) << dendl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - } - if ((ret = db->del (txn, &askey, 0)) != 0) - { - derr(1) << ".del returns error: " << db_strerror (ret) << dendl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - } - - // XXX check del return value - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - dout(4) << "..remove OK" << dendl; - return 0; -} - -int OSBDB::truncate(object_t oid, off_t size, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - - if (!mounted) - { - derr(1) << "not mounted!" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "truncate " << size << dendl; - - if (size > 0xFFFFFFFF) - { - derr(1) << "object size too big!" 
<< dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -ENOSPC; - } - - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - object_inode_key ikey = new_object_inode_key (oid); - stored_object obj; - Dbt key (&ikey, sizeof_object_inode_key()); - Dbt value (&obj, sizeof (obj)); - value.set_dlen (sizeof (obj)); - value.set_ulen (sizeof (obj)); - value.set_flags (DB_DBT_USERMEM); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - dout(4) << "..returns -ENOENT" << dendl; - return -ENOENT; - } - - if (obj.length < size) - { - oid_t id; - mkoid (id, oid); - Dbt okey (&id, sizeof (oid_t)); - char b[] = { '\0' }; - Dbt newVal (b, 1); - newVal.set_doff ((size_t) size); - newVal.set_dlen (1); - newVal.set_ulen (1); - newVal.set_flags (DB_DBT_PARTIAL); - if (db->put (txn, &okey, &newVal, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".updating object failed" << dendl; - return -EIO; - } - - obj.length = size; - value.set_ulen (sizeof (obj)); - if (db->put (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".updating object info failed" << dendl; - return -EIO; - } - } - else if (obj.length > size) - { - obj.length = size; - Dbt tval (&obj, sizeof (obj)); - tval.set_ulen (sizeof (obj)); - tval.set_flags (DB_DBT_USERMEM); - if (db->put (txn, &key, &tval, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".updating object info failed" << dendl; - return -EIO; - } - if (size == 0) - { - char x[1]; - oid_t id; - mkoid (id, oid); - Dbt okey (&id, sizeof (oid_t)); - Dbt oval (&x, 0); - if (db->put (txn, &okey, &oval, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".updating object failed" << dendl; - return -EIO; - } - } - else - { - oid_t id; - mkoid (id, oid); - Dbt okey (&id, sizeof 
(oid_t)); - Dbt oval; - oval.set_flags (DB_DBT_MALLOC); - if (db->get (txn, &okey, &oval, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".getting old object failed" << dendl; - return -EIO; - } - auto_ptr ovalPtr ((char *) oval.get_data()); - oval.set_size ((size_t) size); - oval.set_ulen ((size_t) size); - if (db->put (txn, &okey, &oval, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".putting new object failed" << dendl; - return -EIO; - } - } - } - - if (txn) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..truncate OK" << dendl; - return 0; -} - -int OSBDB::read(object_t oid, off_t offset, size_t len, bufferlist& bl) -{ - if (!mounted) - { - derr(1) << "not mounted!" << dendl; - return -EINVAL; - } - - dout(2) << "read " << oid << " " << offset << " " - << len << dendl; - - if (bl.length() < len) - { - int remain = len - bl.length(); - bufferptr ptr (remain); - bl.push_back(ptr); - } - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - object_inode_key _ikey = new_object_inode_key (oid); - stored_object obj; - Dbt ikey (&_ikey, sizeof_object_inode_key()); - Dbt ival (&obj, sizeof (obj)); - ival.set_flags (DB_DBT_USERMEM); - ival.set_ulen (sizeof(obj)); - - dout(3) << "..get " << _ikey << dendl; - int ret; - if ((ret = db->get (txn, &ikey, &ival, 0)) != 0) - { - if (txn) - txn->abort(); - derr(1) << "get returned " << db_strerror (ret) << dendl; - return -ENOENT; - } - - dout(3) << "..object has size " << obj.length << dendl; - - if (offset == 0 && len >= obj.length) - { - len = obj.length; - dout(3) << "..doing full read of " << len << dendl; - oid_t id; - mkoid (id, oid); - Dbt key (&id, sizeof (oid_t)); - Dbt value (bl.c_str(), len); - value.set_ulen (len); - value.set_flags (DB_DBT_USERMEM); - dout(3) << "..getting " << oid << dendl; - if ((ret = db->get (txn, &key, &value, 0)) != 0) - { - derr(1) << ".get 
returned " << db_strerror (ret) << dendl; - if (txn) - txn->abort(); - return -EIO; - } - } - else - { - if (offset > obj.length) - { - dout(2) << "..offset out of range" << dendl; - return 0; - } - if (offset + len > obj.length) - len = obj.length - (size_t) offset; - dout(3) << "..doing partial read of " << len << dendl; - oid_t id; - mkoid (id, oid); - Dbt key (&id, sizeof (oid)); - Dbt value; - char *data = bl.c_str(); - dout(3) << ".bufferlist c_str returned " << ((void*) data) << dendl; - value.set_data (data); - value.set_doff ((size_t) offset); - value.set_dlen (len); - value.set_ulen (len); - value.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - dout(3) << "..getting " << oid << dendl; - if ((ret = db->get (txn, &key, &value, 0)) != 0) - { - derr(1) << ".get returned " << db_strerror (ret) << dendl; - if (txn) - txn->abort(); - return -EIO; - } - } - - if (txn) - txn->commit (0); - dout(4) << "..read OK, returning " << len << dendl; - return len; -} - -int OSBDB::write(object_t oid, off_t offset, size_t len, - bufferlist& bl, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - derr(1) << "not mounted!" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "write " << oid << " " << offset << " " - << len << dendl; - - if (offset > 0xFFFFFFFFL || offset + len > 0xFFFFFFFFL) - { - derr(1) << "object too big" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -ENOSPC; - } - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (txn, &txn, 0); - - object_inode_key _ikey = new_object_inode_key (oid); - stored_object obj; - Dbt ikey (&_ikey, sizeof_object_inode_key()); - Dbt ival (&obj, sizeof (obj)); - ival.set_ulen (sizeof (obj)); - ival.set_flags (DB_DBT_USERMEM); - - int ret; - dout(3) << "..getting " << _ikey << dendl; - if (db->get (txn, &ikey, &ival, 0) != 0) - { - dout(3) << "..writing new object" << dendl; - - // New object. 
- obj.length = (size_t) offset + len; - dout(3) << "..mapping " << _ikey << " => " - << obj << dendl; - if ((ret = db->put (txn, &ikey, &ival, 0)) != 0) - { - derr(1) << "..put returned " << db_strerror (ret) << dendl; - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - oid_t id; - mkoid (id, oid); - Dbt key (&id, sizeof (oid_t)); - Dbt value (bl.c_str(), len); - if (offset == 0) // whole object - { - value.set_flags (DB_DBT_USERMEM); - value.set_ulen (len); - } - else - { - value.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - value.set_ulen (len); - value.set_doff ((size_t) offset); - value.set_dlen (len); - } - dout(3) << "..mapping " << oid << " => (" - << obj.length << " bytes)" << dendl; - if ((ret = db->put (txn, &key, &value, 0)) != 0) - { - derr(1) << "..put returned " << db_strerror (ret) << dendl; - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..write OK, returning " << len << dendl; - return len; - } - - if (offset == 0 && len >= obj.length) - { - if (len != obj.length) - { - obj.length = len; - if ((ret = db->put (txn, &ikey, &ival, 0)) != 0) - { - derr(1) << " put returned " << db_strerror (ret) << dendl; - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - } - oid_t id; - mkoid(id, oid); - Dbt key (&id, sizeof (oid_t)); - Dbt value (bl.c_str(), len); - if (db->put (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "..writing object failed!" << dendl; - return -EIO; - } - } - else - { - if (offset + len > obj.length) - { - obj.length = (size_t) offset + len; - if (db->put (txn, &ikey, &ival, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "..writing object info failed!" 
<< dendl; - return -EIO; - } - } - oid_t id; - mkoid(id, oid); - Dbt key (&id, sizeof (oid_t)); - Dbt value (bl.c_str(), len); - value.set_doff ((size_t) offset); - value.set_dlen (len); - value.set_ulen (len); - value.set_flags (DB_DBT_PARTIAL); - if (db->put (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "..writing object failed!" << dendl; - return -EIO; - } - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..write OK, returning " << len << dendl; - return len; -} - -int OSBDB::clone(object_t oid, object_t noid) -{ - if (!mounted) - { - derr(1) << "not mounted!" << dendl; - return -EINVAL; - } - - dout(2) << "clone " << oid << ", " << noid << dendl; - - if (exists (noid)) - { - dout(4) << "..target exists; returning -EEXIST" << dendl; - return -EEXIST; - } - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - object_inode_key _ikey = new_object_inode_key (oid); - object_inode_key _nikey = new_object_inode_key (noid); - stored_object obj; - Dbt ikey (&_ikey, sizeof_object_inode_key()); - Dbt ival (&obj, sizeof (obj)); - Dbt nikey (&_nikey, sizeof_object_inode_key()); - ival.set_ulen (sizeof (obj)); - ival.set_flags (DB_DBT_USERMEM); - - oid_t id, nid; - mkoid(id, oid); - mkoid(nid, noid); - Dbt key (&id, sizeof (oid_t)); - Dbt nkey (&oid, sizeof (oid_t)); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - if (db->get (txn, &ikey, &ival, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << "..getting object info failed!" 
<< dendl; - return -ENOENT; - } - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << "..getting original object failed" << dendl; - return -ENOENT; - } - auto_ptr valueptr ((char *) value.get_data()); - - if (db->put (txn, &nikey, &ival, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << "..putting object info failed" << dendl; - return -EIO; - } - if (db->put (txn, &nkey, &value, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << "..putting new object failed" << dendl; - return -EIO; - } - - if (txn) - txn->commit (0); - - dout(4) << "..clone OK" << dendl; - return 0; -} - - // Collections - -int OSBDB::list_collections(list& ls) -{ - if (!mounted) - { - derr(1) << "not mounted!" << dendl; - return -EINVAL; - } - - dout(2) << "list_collections" << dendl; - - Dbt key (COLLECTIONS_KEY, 1); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - if (db->get (NULL, &key, &value, 0) != 0) - { - dout(4) << "..no collections" << dendl; - return 0; // no collections. 
- } - - auto_ptr sc ((stored_colls *) value.get_data()); - stored_colls *scp = sc.get(); - for (uint32_t i = 0; i < sc->count; i++) - ls.push_back (scp->colls[i]); - - dout(4) << "..list_collections returns " << scp->count << dendl; - return scp->count; -} - -int OSBDB::create_collection(coll_t c, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - derr(1) << "not mounted" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "create_collection " << hex << c << dec << dendl; - - Dbt key (COLLECTIONS_KEY, 1); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - stored_colls *scp = NULL; - size_t sz = 0; - bool created = false; - if (db->get (txn, &key, &value, 0) != 0) - { - sz = sizeof (stored_colls) + sizeof (coll_t); - scp = (stored_colls *) malloc (sz); - scp->count = 0; - created = true; - } - else - { - scp = (stored_colls *) value.get_data(); - sz = value.get_size(); - } - - auto_ptr sc (scp); - int ins = 0; - if (scp->count > 0) - ins = binary_search (scp->colls, scp->count, c); - if (ins < scp->count && scp->colls[ins] == c) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".collection " << c << " already exists " << dendl; - return -EEXIST; - } - - dout(3) << "..insertion point: " << ins << dendl; - - // Make room for a new collection ID. 
- if (!created) - { - sz += sizeof (coll_t); - dout(3) << "..increase size to " << sz << dendl; - stored_colls *scp2 = (stored_colls *) realloc (scp, sz); - sc.release (); - sc.reset (scp2); - scp = scp2; - } - - int n = (scp->count - ins) * sizeof (coll_t); - if (n > 0) - { - dout(3) << "..moving " << n << " bytes up" << dendl; - memmove (&scp->colls[ins + 1], &scp->colls[ins], n); - } - scp->count++; - scp->colls[ins] = c; - - dout(3) << "..collections: " << scp << dendl; - - // Put the modified collection list back. - { - Dbt value2 (scp, sz); - if (db->put (txn, &key, &value2, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".writing new collections list failed" << dendl; - return -EIO; - } - } - - // Create the new collection. - { - stored_coll new_coll; - new_coll.count = 0; - Dbt coll_key (&c, sizeof (coll_t)); - Dbt coll_value (&new_coll, sizeof (stored_coll)); - if (db->put (txn, &coll_key, &coll_value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".writing new collection failed" << dendl; - return -EIO; - } - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..create_collection OK" << dendl; - return 0; -} - -int OSBDB::destroy_collection(coll_t c, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - derr(1) << "not mounted" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "destroy_collection " << hex << c << dec << dendl; - - Dbt key (COLLECTIONS_KEY, 1); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".collection list doesn't exist" << dendl; - return -ENOENT; // XXX - } 
- - stored_colls *scp = (stored_colls *) value.get_data(); - auto_ptr valueBuf (scp); - if (scp->count == 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".collection " << c << " not listed" << dendl; - return -ENOENT; - } - uint32_t ins = binary_search (scp->colls, scp->count, c); - dout(4) << "..insertion point is " << ins << dendl; - if (ins >= scp->count || scp->colls[ins] != c) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".collection " << c << " not listed" << dendl; - return -ENOENT; - } - - dout(4) << "..collections list is " << scp << dendl; - - // Move the rest of the list down in memory, if needed. - if (ins < scp->count) - { - size_t n = scp->count - ins - 1; - dout(4) << "..shift list down " << n << dendl; - memmove (&scp->colls[ins], &scp->colls[ins + 1], n); - } - - dout(4) << "..collections list is " << scp << dendl; - - // Modify the record size to be one less. - Dbt nvalue (scp, value.get_size() - sizeof (coll_t)); - nvalue.set_flags (DB_DBT_USERMEM); - if (db->put (txn, &key, &nvalue, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".putting modified collection list failed" << dendl; - return -EIO; - } - - // Delete the collection. 
- Dbt collKey (&c, sizeof (coll_t)); - if (db->del (txn, &collKey, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".deleting collection failed" << dendl; - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - dout(4) << "..destroy_collection OK" << dendl; - return 0; -} - -bool OSBDB::collection_exists(coll_t c) -{ - if (!mounted) - { - derr(1) << "not mounted" << dendl; - return -EINVAL; - } - - dout(2) << "collection_exists " << hex << c << dec << dendl; - - /*Dbt key (COLLECTIONS_KEY, 1); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - if (db->get (NULL, &key, &value, 0) != 0) - { - dout(4) << "..no collection list; return false" << dendl; - return false; - } - - stored_colls *scp = (stored_colls *) value.get_data(); - auto_ptr sc (scp); - dout(5) << "..collection list is " << scp << dendl; - if (scp->count == 0) - { - dout(4) << "..empty collection list; return false" << dendl; - return false; - } - uint32_t ins = binary_search (scp->colls, scp->count, c); - dout(4) << "..insertion point is " << ins << dendl; - - int ret = (scp->colls[ins] == c); - dout(4) << "..returns " << ret << dendl; - return ret;*/ - - Dbt key (&c, sizeof (coll_t)); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - if (db->get (NULL, &key, &value, 0) != 0) - { - dout(4) << "..no collection, return false" << dendl; - return false; - } - void *val = value.get_data(); - free (val); - dout(4) << "..collection exists; return true" << dendl; - return true; -} - -int OSBDB::collection_stat(coll_t c, struct stat *st) -{ - if (!mounted) - { - derr(1) << "not mounted" << dendl; - return -EINVAL; - } - - dout(2) << "collection_stat " << c << dendl; - // XXX is this needed? 
- return -ENOSYS; -} - -int OSBDB::collection_add(coll_t c, object_t o, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - dout(2) << "not mounted" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "collection_add " << hex << c << dec << " " << o << dendl; - - Dbt key (&c, sizeof (coll_t)); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "failed to find collection" << dendl; - return -ENOENT; - } - - size_t sz = value.get_size(); - stored_coll *scp = (stored_coll *) value.get_data(); - auto_ptr sc (scp); - - // Find the insertion point for the new object ID. - uint32_t ins = 0; - if (scp->count > 0) - { - ins = binary_search (scp->objects, scp->count, o); - // Already there? - if (ins < scp->count && scp->objects[ins] == o) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "collection already has object" << dendl; - return -EEXIST; - } - } - - // Make room for the new value, and add it. 
- sz += sizeof (object_t); - scp = (stored_coll *) realloc (scp, sz); - sc.release(); - sc.reset (scp); - dout(3) << "..current collection: " << scp << dendl; - if (ins < scp->count - 1) - { - size_t n = (scp->count - ins) * sizeof (object_t); - dout(3) << "..move up " << n << " bytes" << dendl; - memmove (&scp->objects[ins + 1], &scp->objects[ins], n); - } - scp->count++; - scp->objects[ins] = o; - - dout(3) << "..collection: " << scp << dendl; - - Dbt nvalue (scp, sz); - if (db->put (txn, &key, &nvalue, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "..putting modified collection failed" << dendl; - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - dout(4) << "..collection add OK" << dendl; - return 0; -} - -int OSBDB::collection_remove(coll_t c, object_t o, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - derr(1) << "not mounted" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "collection_remove " << hex << c << dec << " " << o << dendl; - - Dbt key (&c, sizeof (coll_t)); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - dout(1) << "..collection doesn't exist" << dendl; - return -ENOENT; - } - - stored_coll *scp = (stored_coll *) value.get_data(); - auto_ptr sc (scp); - - dout(5) << "..collection is " << scp << dendl; - if (scp->count == 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - dout(1) << "..collection is empty" << dendl; - return -ENOENT; - } - uint32_t ins = binary_search (scp->objects, scp->count, o); - dout(4) << "..insertion point is " << ins << dendl; - if (ins >= scp->count || 
scp->objects[ins] != o) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - dout(1) << "..object not in collection" << dendl; - return -ENOENT; - } - - if (ins < scp->count - 1) - { - size_t n = (scp->count - ins - 1) * sizeof (object_t); - dout(5) << "..moving " << n << " bytes down" << dendl; - memmove (&scp->objects[ins], &scp->objects[ins + 1], n); - } - scp->count--; - - dout(3) << "..collection " << scp << dendl; - - Dbt nval (scp, value.get_size() - sizeof (object_t)); - if (db->put (txn, &key, &nval, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "..putting modified collection failed" << dendl; - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - dout(4) << "..collection remove OK" << dendl; - return 0; -} - -int OSBDB::collection_list(coll_t c, list& o) -{ - if (!mounted) - { - derr(1) << "not mounted" << dendl; - return -EINVAL; - } - - Dbt key (&c, sizeof (coll_t)); - Dbt value; - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - return -ENOENT; - } - - stored_coll *scp = (stored_coll *) value.get_data(); - auto_ptr sc (scp); - for (uint32_t i = 0; i < scp->count; i++) - o.push_back (scp->objects[i]); - - if (txn != NULL) - txn->commit (0); - return 0; -} - - // Attributes - -int OSBDB::_setattr(object_t oid, const char *name, - const void *value, size_t size, Context *onsafe, - DbTxn *txn) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - if (strlen (name) >= OSBDB_MAX_ATTR_LEN) - { - derr(1) << "name too long: " << name << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -ENAMETOOLONG; - } - - scoped_lock __lock(&lock); - - // Add name to attribute list, if needed. 
- attrs_id aids = new_attrs_id (oid); - Dbt attrs_key (&aids, sizeof_attrs_id()); - Dbt attrs_val; - attrs_val.set_flags (DB_DBT_MALLOC); - stored_attrs *sap = NULL; - size_t sz = 0; - - dout(3) << " getting " << aids << dendl; - if (db->get (txn, &attrs_key, &attrs_val, 0) != 0) - { - dout(2) << " first attribute" << dendl; - sz = sizeof (stored_attrs); - sap = (stored_attrs *) malloc(sz); - sap->count = 0; - } - else - { - sz = attrs_val.get_size(); - sap = (stored_attrs *) attrs_val.get_data(); - dout(2) << "..add to list of " << sap->count << " attrs" << dendl; - } - auto_ptr sa (sap); - - attr_name _name; - strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN); - - int ins = 0; - if (sap->count > 0) - ins = binary_search (sap->names, sap->count, _name); - dout(3) << "..insertion point is " << ins << dendl; - if (sap->count == 0 || - (ins >= sap->count || strcmp (sap->names[ins].name, name) != 0)) - { - sz += sizeof (attr_name); - dout(3) << "..realloc " << ((void *) sap) << " to " - << dec << sz << dendl; - sap = (stored_attrs *) realloc (sap, sz); - dout(3) << "..returns " << ((void *) sap) << dendl; - sa.release (); - sa.reset (sap); - int n = (sap->count - ins) * sizeof (attr_name); - if (n > 0) - { - dout(3) << "..move " << n << " bytes from 0x" - << hex << (&sap->names[ins]) << " to 0x" - << hex << (&sap->names[ins+1]) << dec << dendl; - memmove (&sap->names[ins+1], &sap->names[ins], n); - } - memset (&sap->names[ins], 0, sizeof (attr_name)); - strncpy (sap->names[ins].name, name, OSBDB_MAX_ATTR_LEN); - sap->count++; - - Dbt newAttrs_val (sap, sz); - newAttrs_val.set_ulen (sz); - newAttrs_val.set_flags (DB_DBT_USERMEM); - dout(3) << "..putting " << aids << dendl; - if (db->put (txn, &attrs_key, &newAttrs_val, 0) != 0) - { - derr(1) << ".writing attributes list failed" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - } - else - { - dout(3) << "..attribute " << name << " already exists" << dendl; - } - - dout(5) << "..attributes list: " << 
sap << dendl; - - // Add the attribute. - attr_id aid = new_attr_id (oid, name); - Dbt attr_key (&aid, sizeof (aid)); - Dbt attr_val ((void *) value, size); - dout(3) << "..writing attribute key " << aid << dendl; - if (db->put (txn, &attr_key, &attr_val, 0) != 0) - { - derr(1) << ".writing attribute key failed" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - dout(4) << "..setattr OK" << dendl; - if (onsafe != NULL) - COMMIT(onsafe); - return 0; -} - -int OSBDB::setattr(object_t oid, const char *name, - const void *value, size_t size, - Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - dout(2) << "setattr " << oid << ":" << name << " => (" - << size << " bytes)" << dendl; - int ret = _setattr (oid, name, value, size, onsafe, txn); - if (ret == 0) - { - if (txn != NULL) - txn->commit (0); - } - else - { - if (txn != NULL) - txn->abort(); - } - return ret; -} - -int OSBDB::setattrs(object_t oid, map& aset, - Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - map::iterator it; - for (it = aset.begin(); it != aset.end(); it++) - { - string name = it->first; - bufferptr value = it->second; - int ret = _setattr (oid, name.c_str(), value.c_str(), - value.length(), onsafe, txn); - if (ret != 0) - { - if (txn != NULL) - txn->abort(); - return ret; - } - } - - if (txn != NULL) - txn->commit (0); - return 0; -} - -int OSBDB::_getattr (object_t oid, const char *name, void *value, size_t size) -{ - if (!mounted) - return -EINVAL; - - dout(2) << "_getattr " << oid << " " << name << " " << size << dendl; - - attr_id aid = new_attr_id (oid, name); - Dbt key (&aid, sizeof 
(aid)); - Dbt val (value, size); - val.set_ulen (size); - val.set_doff (0); - val.set_dlen (size); - val.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - - int ret; - if ((ret = db->get (NULL, &key, &val, 0)) != 0) - { - derr(1) << ".getting value failed: " << db_strerror (ret) << dendl; - return -ENOENT; - } - - dout(4) << ".._getattr OK; returns " << val.get_size() << dendl; - return val.get_size(); -} - -int OSBDB::getattr(object_t oid, const char *name, void *value, size_t size) -{ - if (!mounted) - return -EINVAL; - - return _getattr (oid, name, value, size); -} - -int OSBDB::getattrs(object_t oid, map& aset) -{ - if (!mounted) - return -EINVAL; - - for (map::iterator it = aset.begin(); - it != aset.end(); it++) - { - int ret = _getattr (oid, (*it).first.c_str(), - (*it).second.c_str(), - (*it).second.length()); - if (ret < 0) - return ret; - } - return 0; -} - -int OSBDB::rmattr(object_t oid, const char *name, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "rmattr " << oid << " " << name << dendl; - - attrs_id aids = new_attrs_id (oid); - Dbt askey (&aids, sizeof_attrs_id()); - Dbt asvalue; - asvalue.set_flags (DB_DBT_MALLOC); - - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &askey, &asvalue, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -ENOENT; - } - - stored_attrs *sap = (stored_attrs *) asvalue.get_data(); - auto_ptr sa (sap); - - dout(5) << "..attributes list " << sap << dendl; - - if (sap->count == 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".empty attribute list" << dendl; - return -ENOENT; - } - - attr_name _name; - memset(&_name, 0, sizeof (_name)); - strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN); - int ins = binary_search (sap->names, 
sap->count, _name); - dout(4) << "..insertion point is " << ins << dendl; - if (ins >= sap->count || strcmp (sap->names[ins].name, name) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".attribute not found in list" << dendl; - return -ENOENT; - } - - // Shift the later elements down by one, if needed. - int n = (sap->count - ins) * sizeof (attr_name); - if (n > 0) - { - dout(4) << "..shift down by " << n << dendl; - memmove (&(sap->names[ins]), &(sap->names[ins + 1]), n); - } - sap->count--; - dout(5) << "..attributes list now " << sap << dendl; - - asvalue.set_size(asvalue.get_size() - sizeof (attr_name)); - int ret; - if ((ret = db->put (txn, &askey, &asvalue, 0)) != 0) - { - derr(1) << "put stored_attrs " << db_strerror (ret) << dendl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - // Remove the attribute. - attr_id aid = new_attr_id (oid, name); - Dbt key (&aid, sizeof (aid)); - if ((ret = db->del (txn, &key, 0)) != 0) - { - derr(1) << "deleting " << aid << ": " << db_strerror(ret) << dendl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - dout(4) << "..rmattr OK" << dendl; - return 0; -} - -int OSBDB::listattr(object_t oid, char *attrs, size_t size) -{ - if (!mounted) - return -EINVAL; - - dout(2) << "listattr " << oid << dendl; - - attrs_id aids = new_attrs_id (oid); - Dbt key (&aids, sizeof_attrs_id()); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - // XXX Transactions for read atomicity??? 
- - int ret; - if ((ret = db->get (NULL, &key, &value, 0)) != 0) - { - derr(1) << "fetching " << aids << ": " << db_strerror (ret) - << dendl; - return -ENOENT; - } - - stored_attrs *attrsp = (stored_attrs *) value.get_data(); - auto_ptr _attrs (attrsp); - size_t s = 0; - char *p = attrs; - for (unsigned i = 0; i < attrsp->count && s < size; i++) - { - int n = MIN (OSBDB_MAX_ATTR_LEN, - MIN (strlen (attrsp->names[i].name), size - s - 1)); - strncpy (p, attrsp->names[i].name, n); - p[n] = '\0'; - p = p + n + 1; - } - - dout(4) << "listattr OK" << dendl; - return 0; -} - - // Collection attributes. - -int OSBDB::collection_setattr(coll_t cid, const char *name, - const void *value, size_t size, - Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "collection_setattr " << hex << cid << dec << " " << name - << " (" << size << " bytes)" << dendl; - if (strlen (name) >= OSBDB_MAX_ATTR_LEN) - { - derr(1) << "name too long" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -ENAMETOOLONG; - } - - // Add name to attribute list, if needed. 
- coll_attrs_id aids = new_coll_attrs_id (cid); - Dbt attrs_key (&aids, sizeof_coll_attrs_id()); - Dbt attrs_val; - attrs_val.set_flags (DB_DBT_MALLOC); - stored_attrs *sap = NULL; - size_t sz = 0; - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - dout(3) << " getting " << aids << dendl; - if (db->get (txn, &attrs_key, &attrs_val, 0) != 0) - { - dout(2) << " first attribute" << dendl; - sz = sizeof (stored_attrs); - sap = (stored_attrs *) malloc(sz); - sap->count = 0; - } - else - { - sz = attrs_val.get_size(); - sap = (stored_attrs *) attrs_val.get_data(); - dout(2) << " add to list of " << sap->count << " attrs" << dendl; - } - auto_ptr sa (sap); - - attr_name _name; - strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN); - - int ins = 0; - if (sap->count > 0) - ins = binary_search (sap->names, sap->count, _name); - dout(3) << " insertion point is " << ins << dendl; - if (ins >= sap->count || strcmp (sap->names[ins].name, name) != 0) - { - sz += sizeof (attr_name); - dout(3) << " realloc " << hex << ((void *) sap) << " to " - << dec << sz << dendl; - sap = (stored_attrs *) realloc (sap, sz); - dout(3) << " returns " << hex << ((void *) sap) << dec << dendl; - sa.release (); - sa.reset (sap); - int n = (sap->count - ins) * sizeof (attr_name); - if (n > 0) - { - dout(3) << " move " << n << " bytes from 0x" - << hex << (&sap->names[ins]) << " to 0x" - << hex << (&sap->names[ins+1]) << dec << dendl; - memmove (&sap->names[ins+1], &sap->names[ins], n); - } - memset (&sap->names[ins], 0, sizeof (attr_name)); - strncpy (sap->names[ins].name, name, OSBDB_MAX_ATTR_LEN); - sap->count++; - - Dbt newAttrs_val (sap, sz); - newAttrs_val.set_ulen (sz); - newAttrs_val.set_flags (DB_DBT_USERMEM); - dout(3) << " putting " << aids << dendl; - if (db->put (txn, &attrs_key, &newAttrs_val, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".putting new attributes failed" << dendl; - return -EIO; - } - } - 
else - { - dout(3) << "..attribute " << name << " already exists" << dendl; - } - - dout(3) << "..attributes list: " << sap << dendl; - - // Add the attribute. - coll_attr_id aid = new_coll_attr_id (cid, name); - Dbt attr_key (&aid, sizeof (aid)); - Dbt attr_val ((void *) value, size); - dout(3) << " writing attribute key " << aid << dendl; - if (db->put (txn, &attr_key, &attr_val, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".putting attribute failed" << dendl; - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..collection setattr OK" << dendl; - return 0; -} - -int OSBDB::collection_rmattr(coll_t cid, const char *name, - Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "collection_rmattr " << hex << cid << dec - << " " << name << dendl; - - coll_attrs_id aids = new_coll_attrs_id (cid); - Dbt askey (&aids, sizeof_coll_attrs_id()); - Dbt asvalue; - asvalue.set_flags (DB_DBT_MALLOC); - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &askey, &asvalue, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".no attributes list" << dendl; - return -ENOENT; - } - - stored_attrs *sap = (stored_attrs *) asvalue.get_data(); - auto_ptr sa (sap); - - dout(5) << "..attributes list " << sap << dendl; - if (sap->count == 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".empty attributes list" << dendl; - return -ENOENT; - } - - attr_name _name; - memset(&_name, 0, sizeof (_name)); - strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN); - int ins = binary_search (sap->names, sap->count, _name); - if (ins >= sap->count || strcmp (sap->names[ins].name, name) != 0) - { - 
if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".attribute not listed" << dendl; - return -ENOENT; - } - - // Shift the later elements down by one, if needed. - int n = (sap->count - ins) * sizeof (attr_name); - if (n > 0) - { - dout(4) << "..shift down by " << n << dendl; - memmove (&(sap->names[ins]), &(sap->names[ins + 1]), n); - } - sap->count--; - dout(5) << "..attributes list now " << sap << dendl; - - asvalue.set_size(asvalue.get_size() - sizeof (attr_name)); - int ret; - if ((ret = db->put (txn, &askey, &asvalue, 0)) != 0) - { - derr(1) << "put stored_attrs " << db_strerror (ret) << dendl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - // Remove the attribute. - coll_attr_id aid = new_coll_attr_id (cid, name); - Dbt key (&aid, sizeof (aid)); - if ((ret = db->del (txn, &key, 0)) != 0) - { - derr(1) << "deleting " << aid << ": " << db_strerror(ret) << dendl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..collection rmattr OK" << dendl; - return 0; -} - -int OSBDB::collection_getattr(coll_t cid, const char *name, - void *value, size_t size) -{ - if (!mounted) - return -EINVAL; - - dout(2) << "collection_getattr " << hex << cid << dec - << " " << name << dendl; - - // XXX transactions/read isolation? 
- - coll_attr_id caid = new_coll_attr_id (cid, name); - Dbt key (&caid, sizeof (caid)); - Dbt val (value, size); - val.set_ulen (size); - val.set_dlen (size); - val.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - - if (db->get (NULL, &key, &val, 0) != 0) - { - derr(1) << ".no attribute entry" << dendl; - return -ENOENT; - } - - dout(4) << "..collection getattr OK; returns " << val.get_size() << dendl; - return val.get_size(); -} - -int OSBDB::collection_listattr(coll_t cid, char *attrs, size_t size) -{ - if (!mounted) - return -EINVAL; - - dout(2) << "collection_listattr " << hex << cid << dec << dendl; - - // XXX transactions/read isolation? - - coll_attrs_id caids = new_coll_attrs_id (cid); - Dbt key (&caids, sizeof_coll_attrs_id()); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - int ret; - if ((ret = db->get (NULL, &key, &value, 0)) != 0) - { - derr(1) << "fetching " << caids << ": " << db_strerror (ret) - << dendl; - return -ENOENT; - } - - stored_attrs *attrsp = (stored_attrs *) value.get_data(); - auto_ptr _attrs (attrsp); - size_t s = 0; - char *p = attrs; - for (unsigned i = 0; i < attrsp->count && s < size; i++) - { - int n = MIN (OSBDB_MAX_ATTR_LEN, - MIN (strlen (attrsp->names[i].name), size - s - 1)); - strncpy (p, attrsp->names[i].name, n); - p[n] = '\0'; - p = p + n + 1; - } - return 0; -} - - // Sync. 
- -void OSBDB::sync (Context *onsync) -{ - if (!mounted) - return; - - sync(); - - if (onsync != NULL) - { - g_timer.add_event_after(0.1, onsync); - } -} - -void OSBDB::sync() -{ - if (!mounted) - return; - - if (transactional) - { - env->log_flush (NULL); - env->lsn_reset (device.c_str(), 0); - } - db->sync(0); -} diff --git a/branches/sage/mds/osbdb/OSBDB.h b/branches/sage/mds/osbdb/OSBDB.h deleted file mode 100644 index 8eb2004d3903f..0000000000000 --- a/branches/sage/mds/osbdb/OSBDB.h +++ /dev/null @@ -1,482 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* OSBDB.h -- ObjectStore on Berkeley DB. -*- c++ -*- - Copyright (C) 2007 Casey Marshall - -Ceph - scalable distributed file system - -This is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License version 2.1, as published by the Free Software -Foundation. See file COPYING. */ - - -#include -#include "osd/ObjectStore.h" - -#define OSBDB_MAGIC 0x05BDB - -/* - * Maximum length of an attribute name. - */ -#define OSBDB_MAX_ATTR_LEN 256 - -#define OSBDB_THIS_VERSION 1 - -#define OSBDB_SUPERBLOCK_KEY ((void *) "s") - -/* - * The "superblock" of the BDB object store. We store one of these in - * the DB, to store version and other information. We don't record - * anything special here, just the version number the database was - * written with. - * - * In principle, this structure is variable-length, depending on the - * software version writing the superblock. - */ -struct stored_superblock -{ - uint32_t version; -}; - -inline ostream& operator<<(ostream& out, const stored_superblock sb) -{ - out << "osbdb.super(" << sb.version << ")" << endl; - return out; -} - -/** - * An object identifier; we define this so we can have a POD object to - * work with. - */ -struct oid_t // POD -{ - char id[16]; -}; - -inline void mkoid (oid_t& id, object_t& oid) -{ - // XXX byte order? 
- memcpy (id.id, &oid, sizeof (oid_t)); -} - -inline ostream& operator<<(ostream& out, const oid_t id) -{ - for (int i = 0; i < 16; i++) - { - out.fill('0'); - out << setw(2) << hex << (id.id[i] & 0xFF); - if ((i & 3) == 3) - out << ':'; - } - out.unsetf(ios::right); - out << dec; - return out; -} - -/** - * An "inode" key. We map a 'stored_object' struct to this key for - * every object. - */ -struct object_inode_key // POD -{ - oid_t oid; - char tag; -}; - -/** - * "Constructor" for an object_inode_key. - */ -inline object_inode_key new_object_inode_key (object_t& oid) -{ - object_inode_key key; - memset(&key, 0, sizeof (object_inode_key)); - mkoid (key.oid, oid); - key.tag = 'i'; - return key; -} - -/* - * We use this, instead of sizeof(), to try and guarantee that we - * don't include the structure padding, if any. - * - * This *should* return 17: sizeof (oid_t) == 16; sizeof (char) == 1. - */ -inline size_t sizeof_object_inode_key() -{ - return offsetof(object_inode_key, tag) + sizeof (char); -} - - // Frank Poole: Unfortunately, that sounds a little - // like famous last words. - // -- 2001: A Space Odyssey - -inline ostream& operator<<(ostream& out, const object_inode_key o) -{ - out << o.tag << "/" << o.oid; - return out; -} - -/** - * A stored object. This is essentially the "inode" of the object, - * containing things like the object's length. The object itself is - * stored as-is, mapped by the 128-bit object ID. - */ -struct stored_object -{ - uint32_t length; -}; - -inline ostream& operator<<(ostream& out, const stored_object s) -{ - out << "inode(l:" << s.length << ")"; - return out; -} - -/* - * Key referencing the list of attribute names for an object. This is - * simply the object's ID, with an additional character 'a' appended. - */ -struct attrs_id // POD -{ - oid_t oid; - char tag; -}; - -/* - * "Construtor" for attrs_id. 
- */ -inline struct attrs_id new_attrs_id (object_t& oid) -{ - attrs_id aid; - memset (&aid, 0, sizeof (attrs_id)); - mkoid(aid.oid, oid); - aid.tag = 'a'; - return aid; -} - -/* - * See explanation for sizeof_object_inode_id. - */ -inline size_t sizeof_attrs_id() -{ - return offsetof(struct attrs_id, tag) + sizeof (char); -} - -inline ostream& operator<<(ostream& out, const attrs_id id) -{ - out << id.tag << "/" << id.oid; - return out; -} - -/* - * Encapsulation of a single attribute name. - */ -struct attr_name // POD -{ - char name[OSBDB_MAX_ATTR_LEN]; -}; - -inline ostream& operator<<(ostream& out, const attr_name n) -{ - out << n.name; - return out; -} - -inline bool operator<(const attr_name n1, const attr_name n2) -{ - return (strncmp (n1.name, n2.name, OSBDB_MAX_ATTR_LEN) < 0); -} - -inline bool operator>(const attr_name n1, const attr_name n2) -{ - return (strncmp (n1.name, n2.name, OSBDB_MAX_ATTR_LEN) > 0); -} - -inline bool operator==(const attr_name n1, const attr_name n2) -{ - std::cerr << n1.name << " == " << n2.name << "?" << endl; - return (strncmp (n1.name, n2.name, OSBDB_MAX_ATTR_LEN) == 0); -} - -inline bool operator!=(const attr_name n1, const attr_name n2) -{ - return !(n1 == n2); -} - -inline bool operator>=(const attr_name n1, const attr_name n2) -{ - return !(n1 < n2); -} - -inline bool operator<=(const attr_name n1, const attr_name n2) -{ - return !(n1 > n2); -} - -/* - * A list of an object or collection's attribute names. - */ -struct stored_attrs -{ - uint32_t count; - attr_name names[0]; // actually variable-length -}; - -inline ostream& operator<<(ostream& out, const stored_attrs *sa) -{ - out << sa->count << " [ "; - for (unsigned i = 0; i < sa->count; i++) - out << sa->names[i] << (i == sa->count - 1 ? " " : ", "); - out << "]"; - return out; -} - -/* - * An object attribute key. An object attribute is mapped simply by - * the object ID appended with the attribute name. 
Attribute names - * may not be empty, and must be less than 256 characters, in this - * implementation. - */ -struct attr_id // POD -{ - oid_t oid; - attr_name name; -}; - -inline attr_id new_attr_id (object_t& oid, const char *name) -{ - attr_id aid; - memset(&aid, 0, sizeof (attr_id)); - mkoid (aid.oid, oid); - strncpy (aid.name.name, name, OSBDB_MAX_ATTR_LEN); - return aid; -} - -inline ostream& operator<<(ostream &out, const attr_id id) -{ - out << id.oid << ":" << id.name; - return out; -} - -/* - * A key for a collection attributes list. - */ -struct coll_attrs_id // POD -{ - coll_t cid; - char tag; -}; - -inline coll_attrs_id new_coll_attrs_id (coll_t cid) -{ - coll_attrs_id catts; - memset(&catts, 0, sizeof (coll_attrs_id)); - catts.cid = cid; - catts.tag = 'C'; - return catts; -} - -inline size_t sizeof_coll_attrs_id() -{ - return offsetof(coll_attrs_id, tag) + sizeof (char); -} - -inline ostream& operator<<(ostream& out, coll_attrs_id id) -{ - out << id.tag << "/" << id.cid; - return out; -} - -/* - * A collection attribute key. Similar to - */ -struct coll_attr_id // POD -{ - coll_t cid; - attr_name name; -}; - -inline coll_attr_id new_coll_attr_id (coll_t cid, const char *name) -{ - coll_attr_id catt; - memset(&catt, 0, sizeof (coll_attr_id)); - catt.cid = cid; - strncpy (catt.name.name, name, OSBDB_MAX_ATTR_LEN); - return catt; -} - -inline ostream& operator<<(ostream& out, coll_attr_id id) -{ - out << id.cid << ":" << id.name; - return out; -} - -/* - * This is the key we store the master collections list under. - */ -#define COLLECTIONS_KEY ((void *) "c") - -/* - * The master list of collections. There should be one of these per - * OSD. The sole reason for this structure is to have the ability - * to enumerate all collections stored on this OSD. - */ -struct stored_colls -{ - // The number of collections. - uint32_t count; - - // The collection identifiers. This is a sorted list of coll_t - // values. 
- coll_t colls[0]; // actually variable-length -}; - -inline ostream& operator<<(ostream& out, stored_colls *c) -{ - out << c->count << " [ "; - for (unsigned i = 0; i < c->count; i++) - { - out << hex << c->colls[i]; - if (i < c->count - 1) - out << ", "; - } - out << " ]" << dec; - return out; -} - -/* - * A stored collection (a bag of object IDs). These are referenced by - * the bare collection identifier type, a coll_t (thus, a 32-bit - * integer). Internally this is stored as a sorted list of object IDs. - * - * Note, this structure places all collection items in a single - * record; this may be a memory burden for large collections. - */ -struct stored_coll -{ - // The size of this collection. - uint32_t count; - - // The object IDs in this collection. This is a sorted list of all - // object ID's in this collection. - object_t objects[0]; // actually variable-length -}; - -inline ostream& operator<<(ostream& out, stored_coll *c) -{ - out << c->count << " [ "; - for (unsigned i = 0; i < c->count; i++) - { - out << c->objects[i]; - if (i < c->count - 1) - out << ", "; - } - out << " ]"; - return out; -} - -class OSBDBException : public std::exception -{ - const char *msg; - -public: - OSBDBException(const char *msg) : msg(msg) { } - const char *what() { return msg; } -}; - -/* - * The object store interface for Berkeley DB. 
- */ -class OSBDB : public ObjectStore -{ - private: - Mutex lock; - DbEnv *env; - Db *db; - string device; - string env_dir; - bool mounted; - bool opened; - bool transactional; - - public: - - OSBDB(const char *dev) throw(OSBDBException) - : lock(true), env(0), db (0), device (dev), mounted(false), opened(false), - transactional(g_conf.bdbstore_transactional) - { - } - - ~OSBDB() - { - if (mounted) - { - umount(); - } - } - - int mount(); - int umount(); - int mkfs(); - - int statfs(struct statfs *buf); - - int pick_object_revision_lt(object_t& oid); - - bool exists(object_t oid); - int stat(object_t oid, struct stat *st); - - int remove(object_t oid, Context *onsafe=0); - - int truncate(object_t oid, off_t size, Context *onsafe=0); - - int read(object_t oid, off_t offset, size_t len, - bufferlist& bl); - int write(object_t oid, off_t offset, size_t len, - bufferlist& bl, Context *onsafe); - - int setattr(object_t oid, const char *name, - const void *value, size_t size, Context *onsafe=0); - int setattrs(object_t oid, map& aset, - Context *onsafe=0); - int getattr(object_t oid, const char *name, - void *value, size_t size); - int getattrs(object_t oid, map& aset); - int rmattr(object_t oid, const char *name, - Context *onsafe=0); - int listattr(object_t oid, char *attrs, size_t size); - - int clone(object_t oid, object_t noid); - - // Collections. 
- - int list_collections(list& ls); - int create_collection(coll_t c, Context *onsafe=0); - int destroy_collection(coll_t c, Context *onsafe=0); - bool collection_exists(coll_t c); - int collection_stat(coll_t c, struct stat *st); - int collection_add(coll_t c, object_t o, Context *onsafe=0); - int collection_remove(coll_t c, object_t o, Context *onsafe=0); - int collection_list(coll_t c, list& o); - - int collection_setattr(coll_t cid, const char *name, - const void *value, size_t size, - Context *onsafe=0); - int collection_rmattr(coll_t cid, const char *name, - Context *onsafe=0); - int collection_getattr(coll_t cid, const char *name, - void *value, size_t size); - int collection_listattr(coll_t cid, char *attrs, size_t size); - - void sync(Context *onsync); - void sync(); - -private: - int opendb (DBTYPE type=DB_UNKNOWN, int flags=0, bool new_env=false); - - int _setattr(object_t oid, const char *name, const void *value, - size_t size, Context *onsync, DbTxn *txn); - int _getattr(object_t oid, const char *name, void *value, size_t size); - DbEnv *getenv(); -}; diff --git a/branches/sage/mds/osd/Ager.cc b/branches/sage/mds/osd/Ager.cc deleted file mode 100644 index fb777238da8fb..0000000000000 --- a/branches/sage/mds/osd/Ager.cc +++ /dev/null @@ -1,333 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "include/types.h" - -#include "Ager.h" -#include "ObjectStore.h" - -#include "config.h" -#include "common/Clock.h" - -// ick -#include "ebofs/Ebofs.h" -#include -#include -#include - -#ifdef DARWIN -#include -#include -#endif // DARWIN - - -int myrand() -{ - if (0) - return rand(); - else { - static int n = 0; - srand(n++); - return rand(); - } -} - - -object_t Ager::age_get_oid() { - if (!age_free_oids.empty()) { - object_t o = age_free_oids.front(); - age_free_oids.pop_front(); - return o; - } - object_t last = age_cur_oid; - ++age_cur_oid.bno; - return last; -} - -ssize_t Ager::age_pick_size() 
{ - ssize_t max = file_size_distn.sample() * 1024; - return max/2 + (myrand() % 100) * max/200 + 1; -} - -bool start_debug = false; - -uint64_t Ager::age_fill(float pc, utime_t until) { - int max = 1024*1024; - bufferptr bp(max); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - uint64_t wrote = 0; - while (1) { - if (g_clock.now() > until) break; - - struct statfs st; - store->statfs(&st); - float free = 1.0 - ((float)(st.f_bfree) / (float)st.f_blocks); - float avail = 1.0 - ((float)(st.f_bavail) / (float)st.f_blocks); // to write to - //float a = (float)(st.f_bfree) / (float)st.f_blocks; - //dout(10) << "age_fill at " << a << " / " << pc << " .. " << st.f_blocks << " " << st.f_bavail << dendl; - if (free >= pc) { - generic_dout(2) << "age_fill at " << free << " / " << avail << " / " << " / " << pc << " stopping" << dendl; - break; - } - - // make sure we can write to it.. - if (avail > .98 || - avail - free > .02) - store->sync(); - - object_t oid = age_get_oid(); - - int b = myrand() % 10; - age_objects[b].push_back(oid); - - ssize_t s = age_pick_size(); - wrote += (s + 4095) / 4096; - - - - - generic_dout(2) << "age_fill at " << free << " / " << avail << " / " << pc << " creating " << hex << oid << dec << " sz " << s << dendl; - - - if (false && !g_conf.ebofs_verify && start_debug && wrote > 1000000ULL) { - /* - - - 1005700 -? -1005000 -1005700 - 1005710 - 1005725ULL - 1005750ULL - 1005800 - 1006000 - -// 99 1000500 ? 
1000750 1006000 -*/ - g_conf.debug_ebofs = 30; - g_conf.ebofs_verify = true; - } - - off_t off = 0; - while (s) { - ssize_t t = MIN(s, max); - bufferlist sbl; - sbl.substr_of(bl, 0, t); - store->write(oid, off, t, sbl, false); - off += t; - s -= t; - } - oid.bno++; - } - - return wrote*4; // KB -} - -void Ager::age_empty(float pc) { - int nper = 20; - int n = nper; - - //g_conf.ebofs_verify = true; - - while (1) { - struct statfs st; - store->statfs(&st); - float free = 1.0 - ((float)(st.f_bfree) / (float)st.f_blocks); - float avail = 1.0 - ((float)(st.f_bavail) / (float)st.f_blocks); // to write to - generic_dout(2) << "age_empty at " << free << " / " << avail << " / " << pc << dendl;//" stopping" << dendl; - if (free <= pc) { - generic_dout(2) << "age_empty at " << free << " / " << avail << " / " << pc << " stopping" << dendl; - break; - } - - int b = myrand() % 10; - n--; - if (n == 0 || age_objects[b].empty()) { - generic_dout(2) << "age_empty sync" << dendl; - //sync(); - //sync(); - n = nper; - continue; - } - object_t oid = age_objects[b].front(); - age_objects[b].pop_front(); - - generic_dout(2) << "age_empty at " << free << " / " << avail << " / " << pc << " removing " << hex << oid << dec << dendl; - - store->remove(oid); - age_free_oids.push_back(oid); - } - - g_conf.ebofs_verify = false; -} - -void pfrag(uint64_t written, ObjectStore::FragmentationStat &st) -{ - cout << "#gb wr\ttotal\tn x\tavg x\tavg per\tavg j\tfree\tn fr\tavg fr\tnum<2\tsum<2\tnum<4\tsum<4\t..." - << std::endl; - cout << written - << "\t" << st.total - << "\t" << st.num_extent - << "\t" << st.avg_extent - << "\t" << st.avg_extent_per_object - << "\t" << st.avg_extent_jump - << "\t" << st.total_free - << "\t" << st.num_free_extent - << "\t" << st.avg_free_extent; - - int n = st.num_extent; - for (uint64_t i=1; i <= 30; i += 1) { - cout << "\t" << st.extent_dist[i]; - cout << "\t" << st.extent_dist_sum[i]; - //cout << "\ta " << (st.extent_dist[i] ? 
(st.extent_dist_sum[i] / st.extent_dist[i]):0); - n -= st.extent_dist[i]; - if (n == 0) break; - } - cout << std::endl; -} - - -void Ager::age(int time, - float high_water, // fill to this % - float low_water, // then empty to this % - int count, // this many times - float final_water, // and end here ( <= low_water) - int fake_size_mb) { - - store->_fake_writes(true); - srand(0); - - utime_t start = g_clock.now(); - utime_t until = start; - until.sec_ref() += time; - - int elapsed = 0; - int freelist_inc = 60; - utime_t nextfl = start; - nextfl.sec_ref() += freelist_inc; - - while (age_objects.size() < 10) age_objects.push_back( list() ); - - if (fake_size_mb) { - int fake_bl = fake_size_mb * 256; - struct statfs st; - store->statfs(&st); - float f = (float)fake_bl / (float)st.f_blocks; - high_water = (float)high_water * f; - low_water = (float)low_water * f; - final_water = (float)final_water * f; - generic_dout(2) << "fake " << fake_bl << " / " << st.f_blocks << " is " << f << ", high " << high_water << " low " << low_water << " final " << final_water << dendl; - } - - // init size distn (once) - if (!did_distn) { - did_distn = true; - age_cur_oid = object_t(0,1); - file_size_distn.add(1, 19.0758125+0.65434375); - file_size_distn.add(512, 35.6566); - file_size_distn.add(1024, 27.7271875); - file_size_distn.add(2*1024, 16.63503125); - //file_size_distn.add(4*1024, 106.82384375); - //file_size_distn.add(8*1024, 81.493375); - //file_size_distn.add(16*1024, 14.13553125); - //file_size_distn.add(32*1024, 2.176); - //file_size_distn.add(256*1024, 0.655938); - //file_size_distn.add(512*1024, 0.1480625); - //file_size_distn.add(1*1024*1024, 0.020125); // actually 2, but 32bit - file_size_distn.normalize(); - } - - // clear - for (int i=0; i<10; i++) - age_objects[i].clear(); - - ObjectStore::FragmentationStat st; - - uint64_t wrote = 0; - - for (int c=1; c<=count; c++) { - if (g_clock.now() > until) break; - - //if (c == 7) start_debug = true; - - generic_dout(1) << 
"#age " << c << "/" << count << " filling to " << high_water << dendl; - uint64_t w = age_fill(high_water, until); - //dout(1) << "age wrote " << w << dendl; - wrote += w; - //store->sync(); - //store->_get_frag_stat(st); - //pfrag(st); - - - if (c == count) { - generic_dout(1) << "#age final empty to " << final_water << dendl; - age_empty(final_water); - } else { - generic_dout(1) << "#age " << c << "/" << count << " emptying to " << low_water << dendl; - age_empty(low_water); - } - //store->sync(); - //store->sync(); - - // show frag state - store->_get_frag_stat(st); - pfrag(wrote / (1024ULL*1024ULL) , // GB - st); - - // dump freelist? - if (g_clock.now() > nextfl) { - elapsed += freelist_inc; - save_freelist(elapsed); - nextfl.sec_ref() += freelist_inc; - } - } - - // dump the freelist - save_freelist(0); - exit(0); // hack - - // ok! - store->_fake_writes(false); - store->sync(); - store->sync(); - generic_dout(1) << "age finished" << dendl; -} - - -void Ager::load_freelist() -{ - generic_dout(1) << "load_freelist" << dendl; - - struct stat st; - - int r = ::stat("ebofs.freelist", &st); - assert(r == 0); - - bufferptr bp(st.st_size); - bufferlist bl; - bl.push_back(bp); - int fd = ::open("ebofs.freelist", O_RDONLY); - ::read(fd, bl.c_str(), st.st_size); - ::close(fd); - - ((Ebofs*)store)->_import_freelist(bl); - store->sync(); - store->sync(); -} - -void Ager::save_freelist(int el) -{ - generic_dout(1) << "save_freelist " << el << dendl; - char s[100]; - sprintf(s, "ebofs.freelist.%d", el); - bufferlist bl; - ((Ebofs*)store)->_export_freelist(bl); - ::unlink(s); - int fd = ::open(s, O_CREAT|O_WRONLY); - ::fchmod(fd, 0644); - ::write(fd, bl.c_str(), bl.length()); - ::close(fd); -} diff --git a/branches/sage/mds/osd/Ager.h b/branches/sage/mds/osd/Ager.h deleted file mode 100644 index ad160c0e9f9ff..0000000000000 --- a/branches/sage/mds/osd/Ager.h +++ /dev/null @@ -1,44 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 
sw=2 smarttab -#ifndef __AGER_H -#define __AGER_H - -#include "include/types.h" -#include "include/Distribution.h" -#include "ObjectStore.h" -#include "common/Clock.h" - -#include -#include -using namespace std; - -class Ager { - ObjectStore *store; - - private: - list age_free_oids; - object_t age_cur_oid; - vector< list > age_objects; - Distribution file_size_distn; //kb - bool did_distn; - - void age_empty(float pc); - uint64_t age_fill(float pc, utime_t until); - ssize_t age_pick_size(); - object_t age_get_oid(); - - public: - Ager(ObjectStore *s) : store(s), did_distn(false) {} - - void age(int time, - float high_water, // fill to this % - float low_water, // then empty to this % - int count, // this many times - float final_water, // and end here ( <= low_water) - int fake_size_mb=0); - - void save_freelist(int); - void load_freelist(); -}; - -#endif diff --git a/branches/sage/mds/osd/BDBMap.h b/branches/sage/mds/osd/BDBMap.h deleted file mode 100644 index a8e96a8a192f7..0000000000000 --- a/branches/sage/mds/osd/BDBMap.h +++ /dev/null @@ -1,137 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __BERKELEYDB_H -#define __BERKELEYDB_H - -#include -#include - -#include -using namespace std; - - -template -class BDBMap { - private: - DB *dbp; - - public: - BDBMap() : dbp(0) {} - ~BDBMap() { - close(); - } - - bool is_open() { return dbp ? 
true:false; } - - // open/close - int open(const char *fn) { - //cout << "open " << fn << endl; - - int r; - if ((r = db_create(&dbp, NULL, 0)) != 0) { - cerr << "db_create: " << db_strerror(r) << endl; - assert(0); - } - - dbp->set_errfile(dbp, stderr); - dbp->set_errpfx(dbp, "bdbmap"); - - r = dbp->open(dbp, NULL, fn, NULL, DB_BTREE, DB_CREATE, 0644); - if (r != 0) { - dbp->err(dbp, r, "%s", fn); - } - assert(r == 0); - return 0; - } - void close() { - if (dbp) { - dbp->close(dbp,0); - dbp = 0; - } - } - void remove(const char *fn) { - if (!dbp) open(fn); - if (dbp) { - dbp->remove(dbp, fn, 0, 0); - dbp = 0; - } else { - ::unlink(fn); - } - } - - // accessors - int put(K key, - D data) { - DBT k; - memset(&k, 0, sizeof(k)); - k.data = &key; - k.size = sizeof(K); - DBT d; - memset(&d, 0, sizeof(d)); - d.data = &data; - d.size = sizeof(data); - return dbp->put(dbp, NULL, &k, &d, 0); - } - - int get(K key, - D& data) { - DBT k; - memset(&k, 0, sizeof(k)); - k.data = &key; - k.size = sizeof(key); - DBT d; - memset(&d, 0, sizeof(d)); - d.data = &data; - d.size = sizeof(data); - int r = dbp->get(dbp, NULL, &k, &d, 0); - return r; - } - - int del(K key) { - DBT k; - memset(&k, 0, sizeof(k)); - k.data = &key; - k.size = sizeof(key); - return dbp->del(dbp, NULL, &k, 0); - } - - int list_keys(list& ls) { - DBC *cursor = 0; - int r = dbp->cursor(dbp, NULL, &cursor, 0); - assert(r == 0); - - DBT k,d; - memset(&k, 0, sizeof(k)); - memset(&d, 0, sizeof(d)); - - while ((r = cursor->c_get(cursor, &k, &d, DB_NEXT)) == 0) { - K key; - assert(k.size == sizeof(key)); - memcpy(&key, k.data, k.size); - ls.push_back(key); - } - if (r != DB_NOTFOUND) { - dbp->err(dbp, r, "DBcursor->get"); - assert(r == DB_NOTFOUND); - } - - cursor->c_close(cursor); - return 0; - } - -}; - -#endif diff --git a/branches/sage/mds/osd/Fake.h b/branches/sage/mds/osd/Fake.h deleted file mode 100644 index 342c153c25cfd..0000000000000 --- a/branches/sage/mds/osd/Fake.h +++ /dev/null @@ -1,262 +0,0 @@ -// -*- 
mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __FAKE_H -#define __FAKE_H - -#include "include/types.h" - -#include -#include -#include -using namespace std; -using namespace __gnu_cxx; - -class FakeStoreCollections { - private: - Mutex faker_lock; - ObjectStore *store; - hash_map > fakecollections; - - public: - FakeStoreCollections(ObjectStore *s) : store(s) {} - - // faked collections - int list_collections(list& ls) { - faker_lock.Lock(); - int r = 0; - for (hash_map< coll_t, set >::iterator p = fakecollections.begin(); - p != fakecollections.end(); - p++) { - r++; - ls.push_back(p->first); - } - faker_lock.Unlock(); - return r; - } - - int create_collection(coll_t c, - Context *onsafe=0) { - faker_lock.Lock(); - fakecollections[c].size(); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return 0; - } - - int destroy_collection(coll_t c, - Context *onsafe=0) { - int r = 0; - faker_lock.Lock(); - if (fakecollections.count(c)) { - fakecollections.erase(c); - //fakecattr.erase(c); - if (onsafe) store->sync(onsafe); - } else - r = -1; - faker_lock.Unlock(); - return r; - } - - int collection_stat(coll_t c, struct stat *st) { - return collection_exists(c) ? 
0:-1; - } - - bool collection_exists(coll_t c) { - faker_lock.Lock(); - int r = fakecollections.count(c); - faker_lock.Unlock(); - return r; - } - - int collection_add(coll_t c, object_t o, - Context *onsafe=0) { - faker_lock.Lock(); - fakecollections[c].insert(o); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return 0; - } - - int collection_remove(coll_t c, object_t o, - Context *onsafe=0) { - faker_lock.Lock(); - fakecollections[c].erase(o); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return 0; - } - - int collection_list(coll_t c, list& o) { - faker_lock.Lock(); - int r = 0; - for (set::iterator p = fakecollections[c].begin(); - p != fakecollections[c].end(); - p++) { - o.push_back(*p); - r++; - } - faker_lock.Unlock(); - return r; - } - -}; - -class FakeStoreAttrs { - private: - - class FakeAttrSet { - public: - map attrs; - - int getattr(const char *name, void *value, size_t size) { - string n = name; - if (attrs.count(n)) { - size_t l = MIN( attrs[n].length(), size ); - bufferlist bl; - bl.append(attrs[n]); - bl.copy(0, l, (char*)value); - return l; - } - return -1; - } - int getattrs(map& aset) { - aset = attrs; - return 0; - } - int setattrs(map& aset) { - attrs = aset; - return 0; - } - - int setattr(const char *name, const void *value, size_t size) { - string n = name; - bufferptr bp = buffer::copy((char*)value, size); - attrs[n] = bp; - return 0; - } - - int listattr(char *attrs, size_t size) { - assert(0); - return 0; - } - - int rmattr(const char *name) { - string n = name; - attrs.erase(n); - return 0; - } - - bool empty() { return attrs.empty(); } - }; - - Mutex faker_lock; - ObjectStore *store; - hash_map fakeoattrs; - hash_map fakecattrs; - - public: - FakeStoreAttrs(ObjectStore *s) : store(s) {} - - int setattr(object_t oid, const char *name, - const void *value, size_t size, - Context *onsafe=0) { - faker_lock.Lock(); - int r = fakeoattrs[oid].setattr(name, value, size); - if (onsafe) store->sync(onsafe); - 
faker_lock.Unlock(); - return r; - } - int setattrs(object_t oid, map& aset) { - faker_lock.Lock(); - int r = fakeoattrs[oid].setattrs(aset); - faker_lock.Unlock(); - return r; - } - int getattr(object_t oid, const char *name, - void *value, size_t size) { - faker_lock.Lock(); - int r = fakeoattrs[oid].getattr(name, value, size); - faker_lock.Unlock(); - return r; - } - int getattrs(object_t oid, map& aset) { - faker_lock.Lock(); - int r = fakeoattrs[oid].getattrs(aset); - faker_lock.Unlock(); - return r; - } - int rmattr(object_t oid, const char *name, - Context *onsafe=0) { - faker_lock.Lock(); - int r = fakeoattrs[oid].rmattr(name); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return r; - } - - int listattr(object_t oid, char *attrs, size_t size) { - faker_lock.Lock(); - int r = fakeoattrs[oid].listattr(attrs,size); - faker_lock.Unlock(); - return r; - } - - int collection_setattr(coll_t c, const char *name, - void *value, size_t size, - Context *onsafe=0) { - faker_lock.Lock(); - int r = fakecattrs[c].setattr(name, value, size); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return r; - } - int collection_setattrs(coll_t cid, map& aset) { - faker_lock.Lock(); - int r = fakecattrs[cid].setattrs(aset); - faker_lock.Unlock(); - return r; - } - int collection_getattrs(coll_t cid, map& aset) { - faker_lock.Lock(); - int r = fakecattrs[cid].getattrs(aset); - faker_lock.Unlock(); - return r; - } - int collection_rmattr(coll_t c, const char *name, - Context *onsafe=0) { - faker_lock.Lock(); - int r = fakecattrs[c].rmattr(name); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return r; - } - int collection_getattr(coll_t c, const char *name, - void *value, size_t size) { - faker_lock.Lock(); - int r = fakecattrs[c].getattr(name, value, size); - faker_lock.Unlock(); - return r; - } - int collection_listattr(coll_t c, char *attrs, size_t size) { - faker_lock.Lock(); - int r = fakecattrs[c].listattr(attrs,size); - 
faker_lock.Unlock(); - return r; - } - -}; - -#endif diff --git a/branches/sage/mds/osd/FakeStore.cc b/branches/sage/mds/osd/FakeStore.cc deleted file mode 100644 index e7c77f3eab558..0000000000000 --- a/branches/sage/mds/osd/FakeStore.cc +++ /dev/null @@ -1,742 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "FakeStore.h" -#include "include/types.h" - -#include "common/Timer.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifndef __CYGWIN__ -# include -#endif -//#include - -#ifdef DARWIN -#include -#include -#endif // DARWIN - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug) *_dout << dbeginl << g_clock.now() << " fakestore(" << basedir << ") " -#define derr(l) if (l<=g_conf.debug) *_derr << dbeginl << g_clock.now() << " fakestore(" << basedir << ") " - -#include "include/buffer.h" - -#include - - -// crap-a-crap hash -//#define HASH_DIRS 0x80 -//#define HASH_MASK 0x7f -// end crap hash - - - - -int FakeStore::statfs(struct statfs *buf) -{ - return ::statfs(basedir.c_str(), buf); -} - - -/* - * sorry, these are sentitive to the object_t and coll_t typing. 
- */ -void FakeStore::get_oname(object_t oid, char *s) -{ - //static hash H; - assert(sizeof(oid) == 16); -#ifdef __LP64__ - //sprintf(s, "%s/objects/%02lx/%016lx.%016lx", basedir.c_str(), H(oid) & HASH_MASK, - sprintf(s, "%s/objects/%016lx.%016lx", basedir.c_str(), - *((uint64_t*)&oid), - *(((uint64_t*)&oid) + 1)); -#else - //sprintf(s, "%s/objects/%02x/%016llx.%016llx", basedir.c_str(), H(oid) & HASH_MASK, - sprintf(s, "%s/objects/%016llx.%016llx", basedir.c_str(), - *((uint64_t*)&oid), - *(((uint64_t*)&oid) + 1)); -#endif -} - -void FakeStore::get_cdir(coll_t cid, char *s) -{ - assert(sizeof(cid) == 8); -#ifdef __LP64__ - sprintf(s, "%s/collections/%016lx", basedir.c_str(), - cid); -#else - sprintf(s, "%s/collections/%016llx", basedir.c_str(), - cid); -#endif -} - -void FakeStore::get_coname(coll_t cid, object_t oid, char *s) -{ - assert(sizeof(oid) == 16); -#ifdef __LP64__ - sprintf(s, "%s/collections/%016lx/%016lx.%016lx", basedir.c_str(), cid, - *((uint64_t*)&oid), - *(((uint64_t*)&oid) + 1)); -#else - sprintf(s, "%s/collections/%016llx/%016llx.%016llx", basedir.c_str(), cid, - *((uint64_t*)&oid), - *(((uint64_t*)&oid) + 1)); -#endif -} - - - - -int FakeStore::mkfs() -{ - char cmd[200]; - if (g_conf.fakestore_dev) { - dout(0) << "mounting" << dendl; - sprintf(cmd,"mount %s", g_conf.fakestore_dev); - system(cmd); - } - - dout(1) << "mkfs in " << basedir << dendl; - - // wipe - sprintf(cmd, "test -d %s && rm -r %s ; mkdir -p %s/collections && mkdir -p %s/objects", - basedir.c_str(), basedir.c_str(), basedir.c_str(), basedir.c_str()); - - dout(5) << "wipe: " << cmd << dendl; - system(cmd); - - // hashed bits too - /* - for (int i=0; i 0) bl.push_back( bptr ); // put it in the target bufferlist - } - ::flock(fd, LOCK_UN); - ::close(fd); - return got; -} - - -int FakeStore::write(object_t oid, - off_t offset, size_t len, - const bufferlist& bl, - Context *onsafe) -{ - char fn[200]; - get_oname(oid,fn); - - dout(20) << "write " << fn << " len " << len << " off " << 
offset << dendl; - - - ::mknod(fn, 0644, 0); // in case it doesn't exist yet. - - int flags = O_WRONLY;//|O_CREAT; - int fd = ::open(fn, flags); - if (fd < 0) { - derr(0) << "write couldn't open " << fn << " flags " << flags << " errno " << errno << " " << strerror(errno) << dendl; - return fd; - } - ::fchmod(fd, 0664); - ::flock(fd, LOCK_EX); // lock for safety - - // seek - off_t actual = ::lseek(fd, offset, SEEK_SET); - int did = 0; - assert(actual == offset); - - // write buffers - for (list::const_iterator it = bl.buffers().begin(); - it != bl.buffers().end(); - it++) { - int r = ::write(fd, (char*)(*it).c_str(), (*it).length()); - if (r > 0) - did += r; - else { - derr(0) << "couldn't write to " << fn << " len " << len << " off " << offset << " errno " << errno << " " << strerror(errno) << dendl; - } - } - - if (did < 0) { - derr(0) << "couldn't write to " << fn << " len " << len << " off " << offset << " errno " << errno << " " << strerror(errno) << dendl; - } - - ::flock(fd, LOCK_UN); - - // schedule sync - if (onsafe) sync(onsafe); - - ::close(fd); - - return did; -} - - -class C_FakeSync : public Context { - Context *c; - int *n; - Mutex *lock; - Cond *cond; - -public: - C_FakeSync(Context *c_, int *n_, Mutex *lo, Cond *co) : - c(c_), n(n_), - lock(lo), cond(co) { - lock->Lock(); - ++*n; - lock->Unlock(); - } - void finish(int r) { - c->finish(r); - - lock->Lock(); - --(*n); - if (*n == 0) cond->Signal(); - lock->Unlock(); - } -}; - -void FakeStore::sync() -{ - synclock.Lock(); - while (unsync > 0) { - dout(0) << "sync waiting for " << unsync << " items to (fake) sync" << dendl; - synccond.Wait(synclock); - } - synclock.Unlock(); -} - -void FakeStore::sync(Context *onsafe) -{ - if (g_conf.fakestore_fake_sync > 0.0) { - g_timer.add_event_after((float)g_conf.fakestore_fake_sync, - new C_FakeSync(onsafe, &unsync, &synclock, &synccond)); - - } else { - assert(0); // der..no implemented anymore - } -} - - -// ------------------------------- -// attributes - 
-// objects - -int FakeStore::setattr(object_t oid, const char *name, - const void *value, size_t size, - Context *onsafe) -{ - if (fake_attrs) return attrs.setattr(oid, name, value, size, onsafe); - - int r = 0; -#ifndef __CYGWIN__ - char fn[100]; - get_oname(oid, fn); - r = ::setxattr(fn, name, value, size, 0); -#endif - return r; -} - -int FakeStore::setattrs(object_t oid, map& aset) -{ - if (fake_attrs) return attrs.setattrs(oid, aset); - - char fn[100]; - get_oname(oid, fn); - int r = 0; -#ifndef __CYGWIN__ - for (map::iterator p = aset.begin(); - p != aset.end(); - ++p) { - r = ::setxattr(fn, p->first.c_str(), p->second.c_str(), p->second.length(), 0); - if (r < 0) { - cerr << "error setxattr " << strerror(errno) << std::endl; - break; - } - } -#endif - return r; -} - -int FakeStore::getattr(object_t oid, const char *name, - void *value, size_t size) -{ - if (fake_attrs) return attrs.getattr(oid, name, value, size); - int r = 0; -#ifndef __CYGWIN__ - char fn[100]; - get_oname(oid, fn); - r = ::getxattr(fn, name, value, size); -#endif - return r; -} - -int FakeStore::getattrs(object_t oid, map& aset) -{ - if (fake_attrs) return attrs.getattrs(oid, aset); - -#ifndef __CYGWIN__ - char fn[100]; - get_oname(oid, fn); - - char val[1000]; - char names[1000]; - int num = ::listxattr(fn, names, 1000); - - char *name = names; - for (int i=0; i& aset) -{ - if (fake_attrs) return attrs.collection_setattrs(cid, aset); - - char fn[100]; - get_cdir(cid, fn); - int r = 0; -#ifndef __CYGWIN__ - for (map::iterator p = aset.begin(); - p != aset.end(); - ++p) { - r = ::setxattr(fn, p->first.c_str(), p->second.c_str(), p->second.length(), 0); - if (r < 0) break; - } -#endif - return r; -} - -int FakeStore::collection_getattrs(coll_t cid, map& aset) -{ - if (fake_attrs) return attrs.collection_getattrs(cid, aset); - -#ifndef __CYGWIN__ - char fn[100]; - get_cdir(cid, fn); - - char val[1000]; - char names[1000]; - int num = ::listxattr(fn, names, 1000); - - char *name = names; - 
for (int i=0; i& ls) -{ - char fn[200]; - sprintf(fn, "%s/objects", basedir.c_str()); - - DIR *dir = ::opendir(fn); - assert(dir); - - struct dirent *de; - while ((de = ::readdir(dir)) != 0) { - if (de->d_name[0] == '.') continue; - // parse - object_t o; - assert(sizeof(o) == 16); - //cout << " got object " << de->d_name << std::endl; - *(((uint64_t*)&o) + 0) = strtoll(de->d_name, 0, 16); - assert(de->d_name[16] == '.'); - *(((uint64_t*)&o) + 1) = strtoll(de->d_name+17, 0, 16); - //dout(0) << " got " << o << " errno " << errno << " on " << de->d_name << dendl; - if (errno) continue; - ls.push_back(o); - } - - ::closedir(dir); - return 0; -} - - -// -------------------------- -// collections - -int FakeStore::list_collections(list& ls) -{ - if (fake_collections) return collections.list_collections(ls); - - char fn[200]; - sprintf(fn, "%s/collections", basedir.c_str()); - - DIR *dir = ::opendir(fn); - assert(dir); - - struct dirent *de; - while ((de = ::readdir(dir)) != 0) { - // parse - errno = 0; - coll_t c = strtoll(de->d_name, 0, 16); - if (c) ls.push_back(c); - } - - ::closedir(dir); - return 0; -} - -int FakeStore::create_collection(coll_t c, - Context *onsafe) -{ - if (fake_collections) return collections.create_collection(c, onsafe); - - char fn[200]; - get_cdir(c, fn); - - int r = ::mkdir(fn, 0755); - - if (onsafe) sync(onsafe); - return r; -} - -int FakeStore::destroy_collection(coll_t c, - Context *onsafe) -{ - if (fake_collections) return collections.destroy_collection(c, onsafe); - - char fn[200]; - get_cdir(c, fn); - char cmd[200]; - sprintf(cmd, "test -d %s && rm -r %s", fn, fn); - system(cmd); - - if (onsafe) sync(onsafe); - return 0; -} - -int FakeStore::collection_stat(coll_t c, struct stat *st) -{ - if (fake_collections) return collections.collection_stat(c, st); - - char fn[200]; - get_cdir(c, fn); - return ::lstat(fn, st); -} - -bool FakeStore::collection_exists(coll_t c) -{ - if (fake_collections) return collections.collection_exists(c); - - 
struct stat st; - return collection_stat(c, &st) == 0; -} - - -int FakeStore::collection_add(coll_t c, object_t o, - Context *onsafe) -{ - if (fake_collections) return collections.collection_add(c, o, onsafe); - - char cof[200]; - get_coname(c, o, cof); - char of[200]; - get_oname(o, of); - - int r = ::link(of, cof); - if (onsafe) sync(onsafe); - return r; -} - -int FakeStore::collection_remove(coll_t c, object_t o, - Context *onsafe) -{ - if (fake_collections) return collections.collection_remove(c, o, onsafe); - - char cof[200]; - get_coname(c, o, cof); - - int r = ::unlink(cof); - if (onsafe) sync(onsafe); - return r; -} - -int FakeStore::collection_list(coll_t c, list& ls) -{ - if (fake_collections) return collections.collection_list(c, ls); - - char fn[200]; - get_cdir(c, fn); - - DIR *dir = ::opendir(fn); - assert(dir); - - struct dirent *de; - while ((de = ::readdir(dir)) != 0) { - // parse - if (de->d_name[0] == '.') continue; - //cout << " got object " << de->d_name << std::endl; - object_t o; - assert(sizeof(o) == 16); - *(((uint64_t*)&o) + 0) = strtoll(de->d_name, 0, 16); - assert(de->d_name[16] == '.'); - *(((uint64_t*)&o) + 1) = strtoll(de->d_name+17, 0, 16); - dout(0) << " got " << o << " errno " << errno << " on " << de->d_name << dendl; - if (errno) continue; - ls.push_back(o); - } - - ::closedir(dir); - return 0; -} - -// eof. diff --git a/branches/sage/mds/osd/FakeStore.h b/branches/sage/mds/osd/FakeStore.h deleted file mode 100644 index 5828c27c14d96..0000000000000 --- a/branches/sage/mds/osd/FakeStore.h +++ /dev/null @@ -1,114 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __FAKESTORE_H -#define __FAKESTORE_H - -#include "ObjectStore.h" -#include "common/ThreadPool.h" -#include "common/Mutex.h" - -#include "Fake.h" -//#include "FakeStoreBDBCollections.h" - - -#include -using namespace std; - -#include -using namespace __gnu_cxx; - - -// fake attributes in memory, if we need to. - -class FakeStore : public ObjectStore { - string basedir; - - Mutex synclock; - Cond synccond; - int unsync; - - // fake attrs? - FakeStoreAttrs attrs; - bool fake_attrs; - - // fake collections? - FakeStoreCollections collections; - bool fake_collections; - - // helper fns - void get_oname(object_t oid, char *s); - void get_cdir(coll_t cid, char *s); - void get_coname(coll_t cid, object_t oid, char *s); - - public: - FakeStore(char *base) : - basedir(base), - unsync(0), - attrs(this), fake_attrs(false), - collections(this), fake_collections(false) { } - - int mount(); - int umount(); - int mkfs(); - - int statfs(struct statfs *buf); - - // ------------------ - // objects - int pick_object_revision_lt(object_t& oid) { - return 0; - } - bool exists(object_t oid); - int stat(object_t oid, struct stat *st); - int remove(object_t oid, Context *onsafe); - int truncate(object_t oid, off_t size, Context *onsafe); - int read(object_t oid, off_t offset, size_t len, bufferlist& bl); - int write(object_t oid, off_t offset, size_t len, const bufferlist& bl, Context *onsafe); - - void sync(); - void sync(Context *onsafe); - - int list_objects(list& ls); - - // attrs - int setattr(object_t oid, const char *name, const void *value, size_t size, Context *onsafe=0); - int setattrs(object_t oid, map& aset); - int getattr(object_t oid, const char *name, void *value, size_t size); - int getattrs(object_t oid, map& aset); - int rmattr(object_t oid, const char *name, Context *onsafe=0); - //int listattr(object_t oid, char *attrs, size_t size); - int collection_setattr(coll_t c, const char *name, void *value, size_t size, Context *onsafe=0); - int 
collection_rmattr(coll_t c, const char *name, Context *onsafe=0); - int collection_getattr(coll_t c, const char *name, void *value, size_t size); - //int collection_listattr(coll_t c, char *attrs, size_t size); - int collection_getattrs(coll_t cid, map &aset); - int collection_setattrs(coll_t cid, map &aset); - - // collections - int list_collections(list& ls); - int create_collection(coll_t c, Context *onsafe=0); - int destroy_collection(coll_t c, Context *onsafe=0); - int collection_stat(coll_t c, struct stat *st); - bool collection_exists(coll_t c); - int collection_add(coll_t c, object_t o, Context *onsafe=0); - int collection_remove(coll_t c, object_t o, Context *onsafe=0); - int collection_list(coll_t c, list& o); - - - -}; - -#endif diff --git a/branches/sage/mds/osd/FakeStoreBDBCollections.h b/branches/sage/mds/osd/FakeStoreBDBCollections.h deleted file mode 100644 index a779a2a57972c..0000000000000 --- a/branches/sage/mds/osd/FakeStoreBDBCollections.h +++ /dev/null @@ -1,169 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __FAKESTOREBDBCOLLECTIONS_H -#define __FAKESTOREBDBCOLLECTIONS_H - -#include "BDBMap.h" -#include "ObjectStore.h" -#include "common/Mutex.h" - -#define BDBHASH_DIRS 128LL -#define BDBHASH_FUNC(x) (((x) ^ ((x)>>30) ^ ((x)>>18) ^ ((x)>>45) ^ 0xdead1234) * 884811 % BDBHASH_DIRS) - -class FakeStoreBDBCollections { - private: - int whoami; - string basedir; - - Mutex bdblock; - - // collection dbs - BDBMap collections; - map*> collection_map; - - // dirs - void get_dir(string& dir) { - char s[30]; - sprintf(s, "%d", whoami); - dir = basedir + "/" + s; - } - void get_collfn(coll_t c, string &fn) { - char s[100]; - sprintf(s, "%d/%02llx/%016llx.co", whoami, BDBHASH_FUNC(c), c); - fn = basedir + "/" + s; - } - - void open_collections() { - string cfn; - get_dir(cfn); - cfn += "/collections"; - collections.open(cfn.c_str()); - list ls; - collections.list_keys(ls); - } - void close_collections() { - if (collections.is_open()) - collections.close(); - - for (map*>::iterator it = collection_map.begin(); - it != collection_map.end(); - it++) { - it->second->close(); - } - collection_map.clear(); - } - - int open_collection(coll_t c) { - if (collection_map.count(c)) - return 0; // already open. 
- - string fn; - get_collfn(c,fn); - collection_map[c] = new BDBMap; - int r = collection_map[c]->open(fn.c_str()); - if (r != 0) - collection_map.erase(c); // failed - return r; - } - - public: - FakeStoreBDBCollections(int w, string& bd) : whoami(w), basedir(bd) {} - ~FakeStoreBDBCollections() { - close_collections(); - } - - int list_collections(list& ls) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - ls.clear(); - collections.list_keys(ls); - bdblock.Unlock(); - return 0; - } - int create_collection(coll_t c) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - collections.put(c, 1); - open_collection(c); - bdblock.Unlock(); - return 0; - } - int destroy_collection(coll_t c) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - collections.del(c); - - open_collection(c); - collection_map[c]->close(); - - string fn; - get_collfn(c,fn); - collection_map[c]->remove(fn.c_str()); - delete collection_map[c]; - collection_map.erase(c); - bdblock.Unlock(); - return 0; - } - int collection_stat(coll_t c, struct stat *st) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - string fn; - get_collfn(c,fn); - int r = ::stat(fn.c_str(), st); - bdblock.Unlock(); - return r; - } - bool collection_exists(coll_t c) { - bdblock.Lock(); - struct stat st; - int r = collection_stat(c, &st) == 0; - bdblock.Unlock(); - return r; - } - int collection_add(coll_t c, object_t o) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - open_collection(c); - collection_map[c]->put(o,1); - bdblock.Unlock(); - return 0; - } - int collection_remove(coll_t c, object_t o) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - open_collection(c); - collection_map[c]->del(o); - bdblock.Unlock(); - return 0; - } - int collection_list(coll_t c, list& o) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - open_collection(c); - 
collection_map[c]->list_keys(o); - bdblock.Unlock(); - return 0; - } -}; - -#endif diff --git a/branches/sage/mds/osd/OSD.cc b/branches/sage/mds/osd/OSD.cc deleted file mode 100644 index ab57f0c603302..0000000000000 --- a/branches/sage/mds/osd/OSD.cc +++ /dev/null @@ -1,2377 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "include/types.h" - -#include "OSD.h" -#include "OSDMap.h" - -#include "FakeStore.h" - -#include "ebofs/Ebofs.h" - -#ifdef USE_OSBDB -#include "osbdb/OSBDB.h" -#endif // USE_OSBDB - - -#include "ReplicatedPG.h" -//#include "RAID4PG.h" - -#include "Ager.h" - - -#include "msg/Messenger.h" -#include "msg/Message.h" - -#include "messages/MGenericMessage.h" -#include "messages/MPing.h" -#include "messages/MPingAck.h" -#include "messages/MOSDPing.h" -#include "messages/MOSDFailure.h" -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" -#include "messages/MOSDBoot.h" -#include "messages/MOSDIn.h" -#include "messages/MOSDOut.h" - -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" -#include "messages/MOSDPGNotify.h" -#include "messages/MOSDPGQuery.h" -#include "messages/MOSDPGLog.h" -#include "messages/MOSDPGRemove.h" -#include "messages/MOSDPGActivateSet.h" - -#include "messages/MPGStats.h" - -#include "common/Logger.h" -#include "common/LogType.h" -#include "common/Timer.h" -#include "common/ThreadPool.h" - -#include -#include -#include -#include - - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) *_dout << dbeginl << g_clock.now() << " osd" << whoami << " " << (osdmap ? 
osdmap->get_epoch():0) << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) *_derr << dbeginl << g_clock.now() << " osd" << whoami << " " << (osdmap ? osdmap->get_epoch():0) << " " - -char *osd_base_path = "./osddata"; -char *ebofs_base_path = "./dev"; - -static const object_t SUPERBLOCK_OBJECT(0,0); - -// force remount hack for performance testing FakeStore -class C_Remount : public Context { - OSD *osd; -public: - C_Remount(OSD *o) : osd(o) {} - void finish(int) { - osd->force_remount(); - } -}; - -void OSD::force_remount() -{ - dout(0) << "forcing remount" << dendl; - osd_lock.Lock(); - { - store->umount(); - store->mount(); - } - osd_lock.Unlock(); - dout(0) << "finished remount" << dendl; -} -// - - -// cons/des - -LogType osd_logtype; - -OSD::OSD(int id, Messenger *m, MonMap *mm, char *dev) : - timer(osd_lock), - stat_oprate(5.0), - read_latency_calc(g_conf.osd_max_opq<1 ? 1:g_conf.osd_max_opq), - qlen_calc(3), - iat_averager(g_conf.osd_flash_crowd_iat_alpha) -{ - whoami = id; - messenger = m; - monmap = mm; - - osdmap = 0; - boot_epoch = 0; - - last_tid = 0; - num_pulling = 0; - - state = STATE_BOOTING; - - stat_ops = 0; - stat_qlen = 0; - stat_rd_ops = stat_rd_ops_shed_in = stat_rd_ops_shed_out = 0; - stat_rd_ops_in_queue = 0; - - pending_ops = 0; - waiting_for_no_ops = false; - - if (g_conf.osd_remount_at) - timer.add_event_after(g_conf.osd_remount_at, new C_Remount(this)); - - - // init object store - // try in this order: - // dev/osd$num - // dev/osd.$hostname - // dev/osd.all - - if (dev) { - strcpy(dev_path,dev); - } else { - char hostname[100]; - hostname[0] = 0; - gethostname(hostname,100); - - sprintf(dev_path, "%s/osd%d", ebofs_base_path, whoami); - - struct stat sta; - if (::lstat(dev_path, &sta) != 0) - sprintf(dev_path, "%s/osd.%s", ebofs_base_path, hostname); - - if (::lstat(dev_path, &sta) != 0) - sprintf(dev_path, "%s/osd.all", ebofs_base_path); - } - - if (g_conf.ebofs) { - store = new Ebofs(dev_path); - 
//store->_fake_writes(true); - } -#ifdef USE_OSBDB - else if (g_conf.bdbstore) { - store = new OSBDB(dev_path); - } -#endif // USE_OSBDB - else { - sprintf(dev_path, "osddata/osd%d", whoami); - store = new FakeStore(dev_path); - } - -} - -OSD::~OSD() -{ - if (threadpool) { delete threadpool; threadpool = 0; } - if (osdmap) { delete osdmap; osdmap = 0; } - //if (monitor) { delete monitor; monitor = 0; } - if (messenger) { delete messenger; messenger = 0; } - if (logger) { delete logger; logger = 0; } - if (store) { delete store; store = 0; } -} - -int OSD::init() -{ - Mutex::Locker lock(osd_lock); - - // mkfs? - if (g_conf.osd_mkfs) { - dout(2) << "mkfs on local store" << dendl; - if (store->mkfs() < 0) - return -1; - - // make up a superblock - //superblock.fsid = ???; - superblock.whoami = whoami; - } - - // mount. - dout(2) << "mounting " << dev_path << dendl; - int r = store->mount(); - if (r < 0) return -1; - - if (g_conf.osd_mkfs) { - // age? - if (g_conf.osd_age_time != 0) { - dout(2) << "age" << dendl; - Ager ager(store); - if (g_conf.osd_age_time < 0) - ager.load_freelist(); - else - ager.age(g_conf.osd_age_time, - g_conf.osd_age, - g_conf.osd_age - .05, - 50000, - g_conf.osd_age - .05); - } - - if (g_conf.osd_auto_weight) { - // benchmark - bufferlist bl; - bufferptr bp(1048576); - bp.zero(); - bl.push_back(bp); - utime_t start = g_clock.now(); - for (int i=0; i<1000; i++) - store->write(object_t(999,i), 0, bl.length(), bl, 0); - store->sync(); - utime_t end = g_clock.now(); - end -= start; - dout(0) << "measured " << (1000.0 / (double)end) << " mb/sec" << dendl; - for (int i=0; i<1000; i++) - store->remove(object_t(999,i), 0); - - // set osd weight - superblock.weight = (1000.0 / (double)end); - } - } - else { - dout(2) << "boot" << dendl; - - // read superblock - read_superblock(); - - // load up pgs (as they previously existed) - load_pgs(); - - dout(2) << "superblock: i am osd" << superblock.whoami << dendl; - assert(whoami == superblock.whoami); - } - 
- - - - // log - char name[80]; - sprintf(name, "osd%d", whoami); - logger = new Logger(name, (LogType*)&osd_logtype); - osd_logtype.add_set("opq"); - osd_logtype.add_inc("op"); - osd_logtype.add_inc("c_rd"); - osd_logtype.add_inc("c_rdb"); - osd_logtype.add_inc("c_wr"); - osd_logtype.add_inc("c_wrb"); - - osd_logtype.add_inc("r_push"); - osd_logtype.add_inc("r_pushb"); - osd_logtype.add_inc("r_wr"); - osd_logtype.add_inc("r_wrb"); - - osd_logtype.add_set("qlen"); - osd_logtype.add_set("rqlen"); - osd_logtype.add_set("rdlat"); - osd_logtype.add_set("rdlatm"); - osd_logtype.add_set("fshdin"); - osd_logtype.add_set("fshdout"); - osd_logtype.add_inc("shdout"); - osd_logtype.add_inc("shdin"); - - osd_logtype.add_set("loadavg"); - - osd_logtype.add_inc("rlsum"); - osd_logtype.add_inc("rlnum"); - - osd_logtype.add_set("numpg"); - osd_logtype.add_set("pingset"); - - osd_logtype.add_set("buf"); - - osd_logtype.add_inc("map"); - osd_logtype.add_inc("mapi"); - osd_logtype.add_inc("mapidup"); - osd_logtype.add_inc("mapf"); - osd_logtype.add_inc("mapfdup"); - - // request thread pool - { - char name[80]; - sprintf(name,"osd%d.threadpool", whoami); - threadpool = new ThreadPool(name, g_conf.osd_maxthreads, - static_dequeueop, - this); - } - - // i'm ready! - messenger->set_dispatcher(this); - - // announce to monitor i exist and have booted. 
- int mon = monmap->pick_mon(); - messenger->send_message(new MOSDBoot(messenger->get_myinst(), superblock), monmap->get_inst(mon)); - - // start the heart - timer.add_event_after(g_conf.osd_heartbeat_interval, new C_Heartbeat(this)); - - // and stat beacon - timer.add_event_after(g_conf.osd_pg_stats_interval, new C_Stats(this)); - - return 0; -} - -int OSD::shutdown() -{ - dout(1) << "shutdown" << dendl; - - state = STATE_STOPPING; - - // cancel timers - timer.cancel_all(); - timer.join(); - - // finish ops - wait_for_no_ops(); - - // stop threads - delete threadpool; - threadpool = 0; - - // close pgs - for (hash_map::iterator p = pg_map.begin(); - p != pg_map.end(); - p++) { - delete p->second; - } - pg_map.clear(); - - // shut everything else down - //monitor->shutdown(); - messenger->shutdown(); - - osd_lock.Unlock(); - int r = store->umount(); - osd_lock.Lock(); - return r; -} - - - -void OSD::write_superblock(ObjectStore::Transaction& t) -{ - dout(10) << "write_superblock " << superblock << dendl; - - bufferlist bl; - bl.append((char*)&superblock, sizeof(superblock)); - t.write(SUPERBLOCK_OBJECT, 0, sizeof(superblock), bl); -} - -int OSD::read_superblock() -{ - bufferlist bl; - int r = store->read(SUPERBLOCK_OBJECT, 0, sizeof(superblock), bl); - if (bl.length() != sizeof(superblock)) { - dout(10) << "read_superblock failed, r = " << r << ", i got " << bl.length() << " bytes, not " << sizeof(superblock) << dendl; - return -1; - } - - bl.copy(0, sizeof(superblock), (char*)&superblock); - - dout(10) << "read_superblock " << superblock << dendl; - - // load up "current" osdmap - assert(!osdmap); - osdmap = new OSDMap; - bl.clear(); - get_map_bl(superblock.current_epoch, bl); - osdmap->decode(bl); - - assert(whoami == superblock.whoami); // fixme! 
- return 0; -} - - - - - -// ====================================================== -// PG's - -PG *OSD::_new_lock_pg(pg_t pgid) -{ - // create - PG *pg; - if (pgid.is_rep()) - pg = new ReplicatedPG(this, pgid); - //else if (pgid.is_raid4()) - //pg = new RAID4PG(this, pgid); - else - assert(0); - - assert(pg_map.count(pgid) == 0); - pg_map[pgid] = pg; - - pg->lock(); // always lock. - pg->get(); // because it's in pg_map - return pg; -} - - -PG *OSD::_create_lock_pg(pg_t pgid, ObjectStore::Transaction& t) -{ - dout(10) << "_create_lock_pg " << pgid << dendl; - - if (pg_map.count(pgid)) - dout(0) << "_create_lock_pg on " << pgid << ", already have " << *pg_map[pgid] << dendl; - - // open - PG *pg = _new_lock_pg(pgid); - - // create collection - assert(!store->collection_exists(pgid)); - t.create_collection(pgid); - - return pg; -} - -bool OSD::_have_pg(pg_t pgid) -{ - return pg_map.count(pgid); -} - -PG *OSD::_lookup_lock_pg(pg_t pgid) -{ - assert(pg_map.count(pgid)); - PG *pg = pg_map[pgid]; - pg->lock(); - return pg; -} - - -void OSD::_remove_unlock_pg(PG *pg) -{ - pg_t pgid = pg->info.pgid; - - dout(10) << "_remove_unlock_pg " << pgid << dendl; - - // remove from store - list olist; - store->collection_list(pgid, olist); - - ObjectStore::Transaction t; - { - for (list::iterator p = olist.begin(); - p != olist.end(); - p++) - t.remove(*p); - t.remove_collection(pgid); - t.remove(pgid.to_object()); // log too - } - store->apply_transaction(t); - - // mark deleted - pg->mark_deleted(); - - // remove from map - pg_map.erase(pgid); - - // unlock, and probably delete - pg->put_unlock(); // will delete, if last reference -} - - -void OSD::try_create_pg(pg_t pgid, ObjectStore::Transaction& t) -{ - vector acting; - int nrep = osdmap->pg_to_acting_osds(pgid, acting); - int role = osdmap->calc_pg_role(whoami, acting, nrep); - if (role < 0) return; - - PG *pg = _create_lock_pg(pgid, t); - pg->set_role(role); - pg->acting.swap(acting); - pg->last_epoch_started_any = - 
pg->info.last_epoch_started = - pg->info.history.same_since = - pg->info.history.same_primary_since = - pg->info.history.same_acker_since = osdmap->get_epoch(); - pg->write_log(t); - if (g_conf.osd_hack_fast_startup) - pg->activate(t); - - dout(7) << "created " << *pg << dendl; - pg->unlock(); -} - -void OSD::load_pgs() -{ - dout(10) << "load_pgs" << dendl; - assert(pg_map.empty()); - - list ls; - store->list_collections(ls); - - for (list::iterator it = ls.begin(); - it != ls.end(); - it++) { - pg_t pgid = *it; - PG *pg = _new_lock_pg(pgid); - - // read pg info - store->collection_getattr(pgid, "info", &pg->info, sizeof(pg->info)); - - // read pg log - pg->read_log(store); - - // generate state for current mapping - int nrep = osdmap->pg_to_acting_osds(pgid, pg->acting); - int role = osdmap->calc_pg_role(whoami, pg->acting, nrep); - pg->set_role(role); - - dout(10) << "load_pgs loaded " << *pg << " " << pg->log << dendl; - pg->unlock(); - } -} - - - -/** - * check epochs starting from start to verify the pg acting set hasn't changed - * up until now - */ -void OSD::project_pg_history(pg_t pgid, PG::Info::History& h, epoch_t from, - vector& last) -{ - dout(15) << "project_pg_history " << pgid - << " from " << from << " to " << osdmap->get_epoch() - << ", start " << h - << dendl; - - for (epoch_t e = osdmap->get_epoch()-1; - e >= from; - e--) { - // verify during intermediate epoch - OSDMap oldmap; - get_map(e, oldmap); - - vector acting; - oldmap.pg_to_acting_osds(pgid, acting); - - // acting set change? - if (acting != last && - e > h.same_since) { - dout(15) << "project_pg_history " << pgid << " changed in " << e+1 - << " from " << acting << " -> " << last << dendl; - h.same_since = e+1; - } - - // primary change? 
- if (!(!acting.empty() && !last.empty() && acting[0] == last[0]) && - e > h.same_primary_since) { - dout(15) << "project_pg_history " << pgid << " primary changed in " << e+1 << dendl; - h.same_primary_since = e+1; - - if (g_conf.osd_rep == OSD_REP_PRIMARY) - h.same_acker_since = h.same_primary_since; - } - - // acker change? - if (g_conf.osd_rep != OSD_REP_PRIMARY) { - if (!(!acting.empty() && !last.empty() && acting[acting.size()-1] == last[last.size()-1]) && - e > h.same_acker_since) { - dout(15) << "project_pg_history " << pgid << " acker changed in " << e+1 << dendl; - h.same_acker_since = e+1; - } - } - - if (h.same_since > e && - h.same_primary_since > e && - h.same_acker_since > e) break; - } - - dout(15) << "project_pg_history end " << h << dendl; -} - -void OSD::activate_pg(pg_t pgid, epoch_t epoch) -{ - osd_lock.Lock(); - { - if (pg_map.count(pgid)) { - PG *pg = _lookup_lock_pg(pgid); - if (pg->is_crashed() && - pg->is_replay() && - pg->get_role() == 0 && - pg->info.history.same_primary_since <= epoch) { - ObjectStore::Transaction t; - pg->activate(t); - store->apply_transaction(t); - } - pg->unlock(); - } - } - - // finishers? - finished_lock.Lock(); - if (finished.empty()) { - finished_lock.Unlock(); - osd_lock.Unlock(); - } else { - list waiting; - waiting.splice(waiting.begin(), finished); - - finished_lock.Unlock(); - osd_lock.Unlock(); - - for (list::iterator it = waiting.begin(); - it != waiting.end(); - it++) { - dispatch(*it); - } - } -} - - -// ------------------------------------- - -void OSD::_refresh_my_stat(utime_t now) -{ - assert(peer_stat_lock.is_locked()); - - // refresh? - if (now - my_stat.stamp > g_conf.osd_stat_refresh_interval || - pending_ops > 2*my_stat.qlen) { - - my_stat.stamp = now; - my_stat.oprate = stat_oprate.get(now); - - //read_latency_calc.set_size( 20 ); // hrm. 
- - // qlen - my_stat.qlen = 0; - if (stat_ops) my_stat.qlen = (float)stat_qlen / (float)stat_ops; //get_average(); - - // rd ops shed in - float frac_rd_ops_shed_in = 0; - float frac_rd_ops_shed_out = 0; - if (stat_rd_ops) { - frac_rd_ops_shed_in = (float)stat_rd_ops_shed_in / (float)stat_rd_ops; - frac_rd_ops_shed_out = (float)stat_rd_ops_shed_out / (float)stat_rd_ops; - } - my_stat.frac_rd_ops_shed_in = (my_stat.frac_rd_ops_shed_in + frac_rd_ops_shed_in) / 2.0; - my_stat.frac_rd_ops_shed_out = (my_stat.frac_rd_ops_shed_out + frac_rd_ops_shed_out) / 2.0; - - // recent_qlen - qlen_calc.add(my_stat.qlen); - my_stat.recent_qlen = qlen_calc.get_average(); - - // read latency - if (stat_rd_ops) { - my_stat.read_latency = read_latency_calc.get_average(); - if (my_stat.read_latency < 0) my_stat.read_latency = 0; - } else { - my_stat.read_latency = 0; - } - - my_stat.read_latency_mine = my_stat.read_latency * (1.0 - frac_rd_ops_shed_in); - - logger->fset("qlen", my_stat.qlen); - logger->fset("rqlen", my_stat.recent_qlen); - logger->fset("rdlat", my_stat.read_latency); - logger->fset("rdlatm", my_stat.read_latency_mine); - logger->fset("fshdin", my_stat.frac_rd_ops_shed_in); - logger->fset("fshdout", my_stat.frac_rd_ops_shed_out); - dout(12) << "_refresh_my_stat " << my_stat << dendl; - - stat_rd_ops = 0; - stat_rd_ops_shed_in = 0; - stat_rd_ops_shed_out = 0; - stat_ops = 0; - stat_qlen = 0; - } -} - -osd_peer_stat_t OSD::get_my_stat_for(utime_t now, int peer) -{ - Mutex::Locker lock(peer_stat_lock); - _refresh_my_stat(now); - my_stat_on_peer[peer] = my_stat; - return my_stat; -} - -void OSD::take_peer_stat(int peer, const osd_peer_stat_t& stat) -{ - Mutex::Locker lock(peer_stat_lock); - dout(10) << "take_peer_stat peer osd" << peer << " " << stat << dendl; - peer_stat[peer] = stat; -} - -void OSD::heartbeat() -{ - utime_t now = g_clock.now(); - utime_t since = now; - since.sec_ref() -= g_conf.osd_heartbeat_interval; - - // get CPU load avg - ifstream in("/proc/loadavg"); 
- if (in.is_open()) { - float oneminavg; - in >> oneminavg; - logger->fset("loadavg", oneminavg); - in.close(); - } - - // calc my stats - Mutex::Locker lock(peer_stat_lock); - _refresh_my_stat(now); - - dout(5) << "heartbeat: " << my_stat << dendl; - - //load_calc.set_size(stat_ops); - - // send pings - set pingset; - for (hash_map::iterator i = pg_map.begin(); - i != pg_map.end(); - i++) { - PG *pg = i->second; - - // we want to ping the primary. - if (pg->get_role() <= 0) continue; - if (pg->acting.size() < 1) continue; - - if (pg->last_heartbeat < since) { - pg->last_heartbeat = now; - pingset.insert(pg->acting[0]); - } - } - my_stat_on_peer.clear(); - for (set::iterator i = pingset.begin(); - i != pingset.end(); - i++) { - _share_map_outgoing( osdmap->get_inst(*i) ); - my_stat_on_peer[*i] = my_stat; - messenger->send_message(new MOSDPing(osdmap->get_epoch(), my_stat), - osdmap->get_inst(*i)); - } - - if (logger) logger->set("pingset", pingset.size()); - - // hack: fake reorg? - if (osdmap && g_conf.fake_osdmap_updates) { - int mon = monmap->pick_mon(); - if ((rand() % g_conf.fake_osdmap_updates) == 0) { - //if ((rand() % (g_conf.num_osd / g_conf.fake_osdmap_updates)) == whoami / g_conf.fake_osdmap_updates) { - messenger->send_message(new MOSDIn(osdmap->get_epoch()), - monmap->get_inst(mon)); - } - /* - if (osdmap->is_out(whoami)) { - messenger->send_message(new MOSDIn(osdmap->get_epoch()), - MSG_ADDR_MON(mon), monmap->get_inst(mon)); - } - else if ((rand() % g_conf.fake_osdmap_updates) == 0) { - //messenger->send_message(new MOSDOut(osdmap->get_epoch()), - //MSG_ADDR_MON(mon), monmap->get_inst(mon)); - } - } - */ - } - - // schedule next! randomly. 
- float wait = .5 + ((float)(rand() % 10)/10.0) * (float)g_conf.osd_heartbeat_interval; - timer.add_event_after(wait, new C_Heartbeat(this)); -} - - - -void OSD::send_pg_stats() -{ - //dout(-10) << "send_pg_stats" << dendl; - - // grab queue - set q; - pg_stat_queue_lock.Lock(); - q.swap(pg_stat_queue); - pg_stat_queue_lock.Unlock(); - - if (!q.empty()) { - dout(1) << "send_pg_stats - " << q.size() << " pgs updated" << dendl; - - MPGStats *m = new MPGStats; - while (!q.empty()) { - pg_t pgid = *q.begin(); - q.erase(q.begin()); - - if (!pg_map.count(pgid)) continue; - PG *pg = pg_map[pgid]; - pg->pg_stats_lock.Lock(); - m->pg_stat[pgid] = pg->pg_stats; - dout(20) << " sending " << pgid << " " << pg->pg_stats.state << dendl; - pg->pg_stats_lock.Unlock(); - } - - // fill in osd stats too - struct statfs stbuf; - store->statfs(&stbuf); - m->osd_stat.num_blocks = stbuf.f_blocks; - m->osd_stat.num_blocks_avail = stbuf.f_bavail; - m->osd_stat.num_objects = stbuf.f_files; - - int mon = monmap->pick_mon(); - messenger->send_message(m, monmap->get_inst(mon)); - } - - // reschedule - timer.add_event_after(g_conf.osd_pg_stats_interval, new C_Stats(this)); -} - - - - -// -------------------------------------- -// dispatch - -bool OSD::_share_map_incoming(const entity_inst_t& inst, epoch_t epoch) -{ - bool shared = false; - dout(20) << "_share_map_incoming " << inst << " " << epoch << dendl; - assert(osd_lock.is_locked()); - - // does client have old map? - if (inst.name.is_client()) { - if (epoch < osdmap->get_epoch()) { - dout(10) << inst.name << " has old map " << epoch << " < " << osdmap->get_epoch() << dendl; - send_incremental_map(epoch, inst, true); - shared = true; - } - } - - // does peer have old map? - if (inst.name.is_osd()) { - // remember - if (peer_map_epoch[inst.name] < epoch) { - dout(20) << "peer " << inst.name << " has " << epoch << dendl; - peer_map_epoch[inst.name] = epoch; - } - - // older? 
- if (peer_map_epoch[inst.name] < osdmap->get_epoch()) { - dout(10) << inst.name << " has old map " << epoch << " < " << osdmap->get_epoch() << dendl; - send_incremental_map(epoch, inst, true); - peer_map_epoch[inst.name] = osdmap->get_epoch(); // so we don't send it again. - shared = true; - } - } - - return shared; -} - - -void OSD::_share_map_outgoing(const entity_inst_t& inst) -{ - assert(inst.name.is_osd()); - - if (inst.name.is_osd()) { - // send map? - if (peer_map_epoch.count(inst.name)) { - epoch_t pe = peer_map_epoch[inst.name]; - if (pe < osdmap->get_epoch()) { - send_incremental_map(pe, inst, true); - peer_map_epoch[inst.name] = osdmap->get_epoch(); - } - } else { - // no idea about peer's epoch. - // ??? send recent ??? - // do nothing. - } - } -} - - - -void OSD::dispatch(Message *m) -{ - // lock! - osd_lock.Lock(); - dout(20) << "dispatch " << m << dendl; - - switch (m->get_type()) { - - // -- don't need lock -- - case MSG_PING: - dout(10) << "ping from " << m->get_source() << dendl; - delete m; - break; - - // -- don't need OSDMap -- - - // map and replication - case MSG_OSD_MAP: - handle_osd_map((MOSDMap*)m); - break; - - // osd - case MSG_SHUTDOWN: - shutdown(); - delete m; - break; - - - - // -- need OSDMap -- - - default: - { - // no map? starting up? - if (!osdmap) { - dout(7) << "no OSDMap, not booted" << dendl; - waiting_for_osdmap.push_back(m); - break; - } - - // down? - if (osdmap->is_down(whoami)) { - dout(7) << "i am marked down, dropping " << *m << dendl; - delete m; - break; - } - - - - - // need OSDMap - switch (m->get_type()) { - - case MSG_OSD_PING: - // take note. 
- handle_osd_ping((MOSDPing*)m); - break; - - case MSG_OSD_PG_NOTIFY: - handle_pg_notify((MOSDPGNotify*)m); - break; - case MSG_OSD_PG_QUERY: - handle_pg_query((MOSDPGQuery*)m); - break; - case MSG_OSD_PG_LOG: - handle_pg_log((MOSDPGLog*)m); - break; - case MSG_OSD_PG_REMOVE: - handle_pg_remove((MOSDPGRemove*)m); - break; - case MSG_OSD_PG_ACTIVATE_SET: - handle_pg_activate_set((MOSDPGActivateSet*)m); - break; - - case MSG_OSD_OP: - handle_op((MOSDOp*)m); - break; - - // for replication etc. - case MSG_OSD_OPREPLY: - handle_op_reply((MOSDOpReply*)m); - break; - - - default: - dout(1) << " got unknown message " << m->get_type() << dendl; - assert(0); - } - } - } - - // finishers? - finished_lock.Lock(); - if (!finished.empty()) { - list waiting; - waiting.splice(waiting.begin(), finished); - - finished_lock.Unlock(); - osd_lock.Unlock(); - - while (!waiting.empty()) { - dout(20) << "doing finished " << waiting.front() << dendl; - dispatch(waiting.front()); - waiting.pop_front(); - } - return; - } - - finished_lock.Unlock(); - osd_lock.Unlock(); -} - - -void OSD::ms_handle_failure(Message *m, const entity_inst_t& inst) -{ - entity_name_t dest = inst.name; - - if (g_conf.ms_die_on_failure) { - dout(0) << "ms_handle_failure " << inst << " on " << *m << dendl; - exit(0); - } - - if (is_stopping()) { - delete m; - return; - } - - if (dest.is_osd()) { - // failed osd. drop message, report to mon. - int mon = monmap->pick_mon(); - dout(1) << "ms_handle_failure " << inst - << ", dropping and reporting to mon" << mon - << " " << *m - << dendl; - messenger->send_message(new MOSDFailure(messenger->get_myinst(), inst, osdmap->get_epoch()), - monmap->get_inst(mon)); - delete m; - } else if (dest.is_mon()) { - // resend to a different monitor. - int mon = monmap->pick_mon(true); - dout(1) << "ms_handle_failure " << inst - << ", resending to mon" << mon - << " " << *m - << dendl; - messenger->send_message(m, monmap->get_inst(mon)); - } - else { - // client? 
- dout(1) << "ms_handle_failure " << inst - << ", dropping " << *m << dendl; - delete m; - } -} - - - - -void OSD::handle_osd_ping(MOSDPing *m) -{ - dout(20) << "osdping from " << m->get_source() << " got stat " << m->peer_stat << dendl; - - _share_map_incoming(m->get_source_inst(), ((MOSDPing*)m)->map_epoch); - - int from = m->get_source().num(); - take_peer_stat(from, m->peer_stat); - - delete m; -} - - - - -// ===================================================== -// MAP - -void OSD::wait_for_new_map(Message *m) -{ - // ask - if (waiting_for_osdmap.empty()) { - int mon = monmap->pick_mon(); - messenger->send_message(new MOSDGetMap(osdmap->get_epoch()+1), - monmap->get_inst(mon)); - } - - waiting_for_osdmap.push_back(m); -} - - -/** update_map - * assimilate new OSDMap(s). scan pgs, etc. - */ -void OSD::handle_osd_map(MOSDMap *m) -{ - wait_for_no_ops(); - - assert(osd_lock.is_locked()); - - ObjectStore::Transaction t; - - if (osdmap) { - dout(3) << "handle_osd_map epochs [" - << m->get_first() << "," << m->get_last() - << "], i have " << osdmap->get_epoch() - << dendl; - } else { - dout(3) << "handle_osd_map epochs [" - << m->get_first() << "," << m->get_last() - << "], i have none" - << dendl; - osdmap = new OSDMap; - boot_epoch = m->get_last(); // hrm...? - } - - logger->inc("mapmsg"); - - // store them? - for (map::iterator p = m->maps.begin(); - p != m->maps.end(); - p++) { - object_t oid = get_osdmap_object_name(p->first); - if (store->exists(oid)) { - dout(10) << "handle_osd_map already had full map epoch " << p->first << dendl; - logger->inc("mapfdup"); - bufferlist bl; - get_map_bl(p->first, bl); - dout(10) << " .. it is " << bl.length() << " bytes" << dendl; - continue; - } - - dout(10) << "handle_osd_map got full map epoch " << p->first << dendl; - store->write(oid, 0, p->second.length(), p->second, 0); // store _outside_ transaction; activate_map reads it. 
- - if (p->first > superblock.newest_map) - superblock.newest_map = p->first; - if (p->first < superblock.oldest_map || - superblock.oldest_map == 0) - superblock.oldest_map = p->first; - - logger->inc("mapf"); - } - for (map::iterator p = m->incremental_maps.begin(); - p != m->incremental_maps.end(); - p++) { - object_t oid = get_inc_osdmap_object_name(p->first); - if (store->exists(oid)) { - dout(10) << "handle_osd_map already had incremental map epoch " << p->first << dendl; - logger->inc("mapidup"); - bufferlist bl; - get_inc_map_bl(p->first, bl); - dout(10) << " .. it is " << bl.length() << " bytes" << dendl; - continue; - } - - dout(10) << "handle_osd_map got incremental map epoch " << p->first << dendl; - store->write(oid, 0, p->second.length(), p->second, 0); // store _outside_ transaction; activate_map reads it. - - if (p->first > superblock.newest_map) - superblock.newest_map = p->first; - if (p->first < superblock.oldest_map || - superblock.oldest_map == 0) - superblock.oldest_map = p->first; - - logger->inc("mapi"); - } - - // advance if we can - bool advanced = false; - - epoch_t cur = superblock.current_epoch; - while (cur < superblock.newest_map) { - dout(10) << "cur " << cur << " < newest " << superblock.newest_map << dendl; - - if (m->incremental_maps.count(cur+1) || - store->exists(get_inc_osdmap_object_name(cur+1))) { - dout(10) << "handle_osd_map decoding inc map epoch " << cur+1 << dendl; - - bufferlist bl; - if (m->incremental_maps.count(cur+1)) { - dout(10) << " using provided inc map" << dendl; - bl = m->incremental_maps[cur+1]; - } else { - dout(10) << " using my locally stored inc map" << dendl; - get_inc_map_bl(cur+1, bl); - } - - OSDMap::Incremental inc; - int off = 0; - inc.decode(bl, off); - - osdmap->apply_incremental(inc); - - // archive the full map - bl.clear(); - osdmap->encode(bl); - t.write( get_osdmap_object_name(cur+1), 0, bl.length(), bl); - - // notify messenger - for (map >::iterator i = inc.new_down.begin(); - i != 
inc.new_down.end(); - i++) { - int osd = i->first; - if (osd == whoami) continue; - messenger->mark_down(i->second.first.addr); - peer_map_epoch.erase(i->second.first.name); - - // kick any replica ops - for (hash_map::iterator it = pg_map.begin(); - it != pg_map.end(); - it++) { - PG *pg = it->second; - - pg->lock(); - pg->note_failed_osd(osd); - pg->unlock(); - } - } - for (map::iterator i = inc.new_up.begin(); - i != inc.new_up.end(); - i++) { - if (i->first == whoami) continue; - peer_map_epoch.erase(i->second.name); - } - } - else if (m->maps.count(cur+1) || - store->exists(get_osdmap_object_name(cur+1))) { - dout(10) << "handle_osd_map decoding full map epoch " << cur+1 << dendl; - bufferlist bl; - if (m->maps.count(cur+1)) - bl = m->maps[cur+1]; - else - get_map_bl(cur+1, bl); - osdmap->decode(bl); - - // FIXME BUG: need to notify messenger of ups/downs!! - } - else { - dout(10) << "handle_osd_map missing epoch " << cur+1 << dendl; - int mon = monmap->pick_mon(); - messenger->send_message(new MOSDGetMap(cur+1), monmap->get_inst(mon)); - break; - } - - cur++; - superblock.current_epoch = cur; - advance_map(t); - advanced = true; - } - - // all the way? - if (advanced && cur == superblock.newest_map) { - // yay! - activate_map(t); - - // process waiters - take_waiters(waiting_for_osdmap); - } - - // write updated pg state to store - for (hash_map::iterator i = pg_map.begin(); - i != pg_map.end(); - i++) { - pg_t pgid = i->first; - PG *pg = i->second; - t.collection_setattr( pgid, "info", &pg->info, sizeof(pg->info)); - } - - // superblock and commit - write_superblock(t); - store->apply_transaction(t); - - //if (osdmap->get_epoch() == 1) store->sync(); // in case of early death, blah - - delete m; -} - - -/** - * scan placement groups, initiate any replication - * activities. 
- */ -void OSD::advance_map(ObjectStore::Transaction& t) -{ - dout(7) << "advance_map epoch " << osdmap->get_epoch() - << " " << pg_map.size() << " pgs" - << dendl; - - if (osdmap->is_mkfs()) { - ps_t numps = osdmap->get_pg_num(); - ps_t numlps = osdmap->get_localized_pg_num(); - dout(1) << "mkfs on " << numps << " normal, " << numlps << " localized pg sets" << dendl; - int minrep = 1; - int maxrep = MIN(g_conf.num_osd, g_conf.osd_max_rep); - int minraid = g_conf.osd_min_raid_width; - int maxraid = g_conf.osd_max_raid_width; - dout(1) << "mkfs " << minrep << ".." << maxrep << " replicas, " - << minraid << ".." << maxraid << " osd raid groups" << dendl; - - //derr(0) << "osdmap " << osdmap->get_ctime() << " logger start " << logger->get_start() << dendl; - logger->set_start( osdmap->get_ctime() ); - - assert(g_conf.osd_mkfs); // make sure we did a mkfs! - - // create PGs - // replicated - for (int nrep = 1; nrep <= maxrep; nrep++) { - for (ps_t ps = 0; ps < numps; ++ps) - try_create_pg(pg_t(pg_t::TYPE_REP, nrep, ps, -1), t); - for (ps_t ps = 0; ps < numlps; ++ps) - try_create_pg(pg_t(pg_t::TYPE_REP, nrep, ps, whoami), t); - } - - // raided - /* - for (int size = minraid; size <= maxraid; size++) { - for (ps_t ps = 0; ps < numps; ++ps) - try_create_pg(pg_t(pg_t::TYPE_RAID4, size, ps, -1), t); - for (ps_t ps = 0; ps < numlps; ++ps) - try_create_pg(pg_t(pg_t::TYPE_RAID4, size, ps, whoami), t); - } - */ - dout(1) << "mkfs done, created " << pg_map.size() << " pgs" << dendl; - - } else { - // scan existing pg's - for (hash_map::iterator it = pg_map.begin(); - it != pg_map.end(); - it++) { - pg_t pgid = it->first; - PG *pg = it->second; - - // did i finish this epoch? - if (pg->is_active()) { - pg->info.last_epoch_finished = osdmap->get_epoch()-1; - } - - // get new acting set - vector tacting; - int nrep = osdmap->pg_to_acting_osds(pgid, tacting); - int role = osdmap->calc_pg_role(whoami, tacting, nrep); - - // no change? 
- if (tacting == pg->acting) - continue; - - // -- there was a change! -- - pg->lock(); - - int oldrole = pg->get_role(); - int oldprimary = pg->get_primary(); - int oldacker = pg->get_acker(); - vector oldacting = pg->acting; - - // update PG - pg->acting.swap(tacting); - pg->set_role(role); - - // did primary|acker change? - pg->info.history.same_since = osdmap->get_epoch(); - if (oldprimary != pg->get_primary()) { - pg->info.history.same_primary_since = osdmap->get_epoch(); - pg->cancel_recovery(); - } - if (oldacker != pg->get_acker()) { - pg->info.history.same_acker_since = osdmap->get_epoch(); - } - - // deactivate. - pg->state_clear(PG::STATE_ACTIVE); - - // reset primary state? - if (oldrole == 0 || pg->get_role() == 0) - pg->clear_primary_state(); - - // apply any repops in progress. - if (oldacker == whoami) { - pg->on_acker_change(); - } - - if (role != oldrole) { - // old primary? - if (oldrole == 0) { - pg->state_clear(PG::STATE_CLEAN); - - // take replay queue waiters - list ls; - for (map::iterator it = pg->replay_queue.begin(); - it != pg->replay_queue.end(); - it++) - ls.push_back(it->second); - pg->replay_queue.clear(); - take_waiters(ls); - - // take active waiters - take_waiters(pg->waiting_for_active); - - pg->on_role_change(); - } - - // new primary? - if (role == 0) { - // i am new primary - pg->state_clear(PG::STATE_STRAY); - } else { - // i am now replica|stray. we need to send a notify. - pg->state_set(PG::STATE_STRAY); - - if (nrep == 0) { - // did they all shut down cleanly? - bool clean = true; - vector inset; - osdmap->pg_to_osds(pg->info.pgid, inset); - for (unsigned i=0; iis_down_clean(inset[i])) clean = false; - if (clean) { - dout(1) << *pg << " is cleanly inactive" << dendl; - } else { - pg->state_set(PG::STATE_CRASHED); - dout(1) << *pg << " is crashed" << dendl; - } - } - } - - // my role changed. 
- dout(10) << *pg << " " << oldacting << " -> " << pg->acting - << ", role " << oldrole << " -> " << role << dendl; - - } else { - // no role change. - // did primary change? - if (pg->get_primary() != oldprimary) { - // we need to announce - pg->state_set(PG::STATE_STRAY); - - dout(10) << *pg << " " << oldacting << " -> " << pg->acting - << ", acting primary " - << oldprimary << " -> " << pg->get_primary() - << dendl; - } else { - // primary is the same. - if (role == 0) { - // i am (still) primary. but my replica set changed. - pg->state_clear(PG::STATE_CLEAN); - pg->state_clear(PG::STATE_REPLAY); - - dout(10) << *pg << " " << oldacting << " -> " << pg->acting - << ", replicas changed" << dendl; - } - } - } - - pg->unlock(); - } - } -} - -void OSD::activate_map(ObjectStore::Transaction& t) -{ - dout(7) << "activate_map version " << osdmap->get_epoch() << dendl; - - map< int, list > notify_list; // primary -> list - map< int, map > query_map; // peer -> PG -> get_summary_since - map activator_map; // peer -> message - - // scan pg's - for (hash_map::iterator it = pg_map.begin(); - it != pg_map.end(); - it++) { - //pg_t pgid = it->first; - PG *pg = it->second; - pg->lock(); - if (pg->is_active()) { - // update started counter - pg->info.last_epoch_started = osdmap->get_epoch(); - } - else if (pg->get_role() == 0 && !pg->is_active()) { - // i am (inactive) primary - pg->build_prior(); - pg->peer(t, query_map, &activator_map); - } - else if (pg->is_stray() && - pg->get_primary() >= 0) { - // i am residual|replica - notify_list[pg->get_primary()].push_back(pg->info); - } - if (pg->is_primary()) - pg->update_stats(); - pg->unlock(); - } - - if (g_conf.osd_hack_fast_startup && - osdmap->is_mkfs()) // hack: skip the queries/summaries if it's a mkfs - return; - - do_notifies(notify_list); // notify? 
(residual|replica) - do_queries(query_map); - do_activators(activator_map); - - logger->set("numpg", pg_map.size()); -} - - -void OSD::send_incremental_map(epoch_t since, const entity_inst_t& inst, bool full) -{ - dout(10) << "send_incremental_map " << since << " -> " << osdmap->get_epoch() - << " to " << inst << dendl; - - MOSDMap *m = new MOSDMap; - - for (epoch_t e = osdmap->get_epoch(); - e > since; - e--) { - bufferlist bl; - if (get_inc_map_bl(e,bl)) { - m->incremental_maps[e].claim(bl); - } else if (get_map_bl(e,bl)) { - m->maps[e].claim(bl); - if (!full) break; - } - else { - assert(0); // we should have all maps. - } - } - - messenger->send_message(m, inst); -} - -bool OSD::get_map_bl(epoch_t e, bufferlist& bl) -{ - return store->read(get_osdmap_object_name(e), 0, 0, bl) >= 0; -} - -bool OSD::get_inc_map_bl(epoch_t e, bufferlist& bl) -{ - return store->read(get_inc_osdmap_object_name(e), 0, 0, bl) >= 0; -} - -void OSD::get_map(epoch_t epoch, OSDMap &m) -{ - // find a complete map - list incs; - epoch_t e; - for (e = epoch; e > 0; e--) { - bufferlist bl; - if (get_map_bl(e, bl)) { - //dout(10) << "get_map " << epoch << " full " << e << dendl; - m.decode(bl); - break; - } else { - OSDMap::Incremental inc; - bool got = get_inc_map(e, inc); - assert(got); - incs.push_front(inc); - } - } - assert(e >= 0); - - // apply incrementals - for (e++; e <= epoch; e++) { - //dout(10) << "get_map " << epoch << " inc " << e << dendl; - m.apply_incremental( incs.front() ); - incs.pop_front(); - } -} - - -bool OSD::get_inc_map(epoch_t e, OSDMap::Incremental &inc) -{ - bufferlist bl; - if (!get_inc_map_bl(e, bl)) - return false; - int off = 0; - inc.decode(bl, off); - return true; -} - - - - - -bool OSD::require_current_map(Message *m, epoch_t ep) -{ - // older map? - if (ep < osdmap->get_epoch()) { - dout(7) << "require_current_map epoch " << ep << " < " << osdmap->get_epoch() << dendl; - delete m; // discard and ignore. - return false; - } - - // newer map? 
- if (ep > osdmap->get_epoch()) { - dout(7) << "require_current_map epoch " << ep << " > " << osdmap->get_epoch() << dendl; - wait_for_new_map(m); - return false; - } - - assert(ep == osdmap->get_epoch()); - return true; -} - - -/* - * require that we have same (or newer) map, and that - * the source is the pg primary. - */ -bool OSD::require_same_or_newer_map(Message *m, epoch_t epoch) -{ - dout(15) << "require_same_or_newer_map " << epoch << " (i am " << osdmap->get_epoch() << ") " << m << dendl; - - // newer map? - if (epoch > osdmap->get_epoch()) { - dout(7) << "waiting for newer map epoch " << epoch << " > my " << osdmap->get_epoch() << " with " << m << dendl; - wait_for_new_map(m); - return false; - } - - if (epoch < boot_epoch) { - dout(7) << "from pre-boot epoch " << epoch << " < " << boot_epoch << dendl; - delete m; - return false; - } - - return true; -} - - - - - -/** do_notifies - * Send an MOSDPGNotify to a primary, with a list of PGs that I have - * content for, and they are primary for. 
- */ - -void OSD::do_notifies(map< int, list >& notify_list) -{ - for (map< int, list >::iterator it = notify_list.begin(); - it != notify_list.end(); - it++) { - if (it->first == whoami) { - dout(7) << "do_notify osd" << it->first << " is self, skipping" << dendl; - continue; - } - dout(7) << "do_notify osd" << it->first << " on " << it->second.size() << " PGs" << dendl; - MOSDPGNotify *m = new MOSDPGNotify(osdmap->get_epoch(), it->second); - _share_map_outgoing(osdmap->get_inst(it->first)); - messenger->send_message(m, osdmap->get_inst(it->first)); - } -} - - -/** do_queries - * send out pending queries for info | summaries - */ -void OSD::do_queries(map< int, map >& query_map) -{ - for (map< int, map >::iterator pit = query_map.begin(); - pit != query_map.end(); - pit++) { - int who = pit->first; - dout(7) << "do_queries querying osd" << who - << " on " << pit->second.size() << " PGs" << dendl; - - MOSDPGQuery *m = new MOSDPGQuery(osdmap->get_epoch(), - pit->second); - _share_map_outgoing(osdmap->get_inst(who)); - messenger->send_message(m, osdmap->get_inst(who)); - } -} - - -void OSD::do_activators(map& activator_map) -{ - for (map::iterator p = activator_map.begin(); - p != activator_map.end(); - ++p) - messenger->send_message(p->second, osdmap->get_inst(p->first)); - activator_map.clear(); -} - - - - - -/** PGNotify - * from non-primary to primary - * includes PG::Info. - * NOTE: called with opqueue active. - */ -void OSD::handle_pg_notify(MOSDPGNotify *m) -{ - dout(7) << "handle_pg_notify from " << m->get_source() << dendl; - int from = m->get_source().num(); - - if (!require_same_or_newer_map(m, m->get_epoch())) return; - - ObjectStore::Transaction t; - - // look for unknown PGs i'm primary for - map< int, map > query_map; - map activator_map; - - for (list::iterator it = m->get_pg_list().begin(); - it != m->get_pg_list().end(); - it++) { - pg_t pgid = it->pgid; - PG *pg; - - if (!_have_pg(pgid)) { - // same primary? 
- vector acting; - int nrep = osdmap->pg_to_acting_osds(pgid, acting); - int role = osdmap->calc_pg_role(whoami, acting, nrep); - - PG::Info::History history = it->history; - project_pg_history(pgid, history, m->get_epoch(), acting); - - if (m->get_epoch() < history.same_primary_since) { - dout(10) << "handle_pg_notify pg " << pgid << " dne, and primary changed in " - << history.same_primary_since << " (msg from " << m->get_epoch() << ")" << dendl; - continue; - } - - assert(role == 0); // otherwise, probably bug in project_pg_history. - - // ok, create PG! - pg = _create_lock_pg(pgid, t); - pg->acting.swap(acting); - pg->set_role(role); - pg->info.history = history; - pg->last_epoch_started_any = it->last_epoch_started; - pg->clear_primary_state(); // yep, notably, set hml=false - pg->build_prior(); - pg->write_log(t); - - dout(10) << *pg << " is new" << dendl; - - // kick any waiters - if (waiting_for_pg.count(pgid)) { - take_waiters(waiting_for_pg[pgid]); - waiting_for_pg.erase(pgid); - } - } else { - // already had it. am i (still) the primary? - pg = _lookup_lock_pg(pgid); - if (m->get_epoch() < pg->info.history.same_primary_since) { - dout(10) << *pg << " handle_pg_notify primary changed in " - << pg->info.history.same_primary_since - << " (msg from " << m->get_epoch() << ")" << dendl; - pg->unlock(); - continue; - } - } - - // ok! - - // stray? - bool acting = pg->is_acting(from); - if (!acting && (*it).last_epoch_started > 0) { - dout(10) << *pg << " osd" << from << " has stray content: " << *it << dendl; - pg->stray_set.insert(from); - pg->state_clear(PG::STATE_CLEAN); - } - - // save info. 
- bool had = pg->peer_info.count(from); - pg->peer_info[from] = *it; - - if (had) { - if (pg->is_active() && - (*it).is_uptodate() && acting) { - pg->uptodate_set.insert(from); - dout(10) << *pg << " osd" << from << " now uptodate (" << pg->uptodate_set - << "): " << *it << dendl; - if (pg->is_all_uptodate()) - pg->finish_recovery(); - } else { - // hmm, maybe keep an eye out for cases where we see this, but peer should happen. - dout(10) << *pg << " already had notify info from osd" << from << ": " << *it << dendl; - } - } else { - // adjust prior? - if (it->last_epoch_started > pg->last_epoch_started_any) - pg->adjust_prior(); - - // peer - pg->peer(t, query_map, &activator_map); - } - - pg->unlock(); - } - - unsigned tr = store->apply_transaction(t); - assert(tr == 0); - - do_queries(query_map); - do_activators(activator_map); - - delete m; -} - - - -/** PGLog - * from non-primary to primary - * includes log and info - * from primary to non-primary - * includes log for use in recovery - * NOTE: called with opqueue active. 
- */ - - -void OSD::_process_pg_info(epoch_t epoch, int from, - PG::Info &info, - PG::Log &log, - PG::Missing &missing, - map* activator_map) -{ - if (pg_map.count(info.pgid) == 0) { - dout(10) << "_process_pg_info " << info << " don't have pg" << dendl; - assert(epoch < osdmap->get_epoch()); - return; - } - - PG *pg = _lookup_lock_pg(info.pgid); - assert(pg); - - dout(10) << *pg << " got " << info << " " << log << " " << missing << dendl; - - if (epoch < pg->info.history.same_since) { - dout(10) << *pg << " got old info " << info << ", ignoring" << dendl; - pg->unlock(); - return; - } - - //m->log.print(cout); - - ObjectStore::Transaction t; - - if (pg->is_primary()) { - // i am PRIMARY - assert(pg->peer_log_requested.count(from) || - pg->peer_summary_requested.count(from)); - - pg->proc_replica_log(log, missing, from); - - // peer - map< int, map > query_map; - pg->peer(t, query_map, activator_map); - do_queries(query_map); - - } else { - // i am REPLICA - // merge log - pg->merge_log(log, missing, from); - pg->proc_missing(log, missing, from); - assert(pg->missing.num_lost() == 0); - - // ok activate! 
- pg->activate(t, activator_map); - } - - unsigned tr = store->apply_transaction(t); - assert(tr == 0); - - pg->unlock(); -} - - -void OSD::handle_pg_log(MOSDPGLog *m) -{ - dout(7) << "handle_pg_log " << *m << " from " << m->get_source() << dendl; - - int from = m->get_source().num(); - if (!require_same_or_newer_map(m, m->get_epoch())) return; - - _process_pg_info(m->get_epoch(), from, - m->info, m->log, m->missing, 0); - - delete m; -} - -void OSD::handle_pg_activate_set(MOSDPGActivateSet *m) -{ - dout(7) << "handle_pg_activate_set " << *m << " from " << m->get_source() << dendl; - - int from = m->get_source().num(); - if (!require_same_or_newer_map(m, m->get_epoch())) return; - - PG::Log empty_log; - PG::Missing empty_missing; - map activator_map; - - for (list::iterator p = m->pg_info.begin(); - p != m->pg_info.end(); - ++p) - _process_pg_info(m->get_epoch(), from, *p, empty_log, empty_missing, &activator_map); - - do_activators(activator_map); - - delete m; -} - - -/** PGQuery - * from primary to replica | stray - * NOTE: called with opqueue active. - */ -void OSD::handle_pg_query(MOSDPGQuery *m) -{ - dout(7) << "handle_pg_query from " << m->get_source() << " epoch " << m->get_epoch() << dendl; - int from = m->get_source().num(); - - if (!require_same_or_newer_map(m, m->get_epoch())) return; - - map< int, list > notify_list; - - for (map::iterator it = m->pg_list.begin(); - it != m->pg_list.end(); - it++) { - pg_t pgid = it->first; - PG *pg = 0; - - if (pg_map.count(pgid) == 0) { - // get active crush mapping - vector acting; - int nrep = osdmap->pg_to_acting_osds(pgid, acting); - int role = osdmap->calc_pg_role(whoami, acting, nrep); - - // same primary? 
- PG::Info::History history = it->second.history; - project_pg_history(pgid, history, m->get_epoch(), acting); - - if (m->get_epoch() < history.same_since) { - dout(10) << " pg " << pgid << " dne, and pg has changed in " - << history.same_primary_since << " (msg from " << m->get_epoch() << ")" << dendl; - continue; - } - - if (role < 0) { - dout(10) << " pg " << pgid << " dne, and i am not an active replica" << dendl; - PG::Info empty(pgid); - notify_list[from].push_back(empty); - continue; - } - assert(role > 0); - - ObjectStore::Transaction t; - pg = _create_lock_pg(pgid, t); - pg->acting.swap( acting ); - pg->set_role(role); - pg->info.history = history; - pg->write_log(t); - store->apply_transaction(t); - - dout(10) << *pg << " dne (before), but i am role " << role << dendl; - } else { - pg = _lookup_lock_pg(pgid); - - // same primary? - if (m->get_epoch() < pg->info.history.same_since) { - dout(10) << *pg << " handle_pg_query primary changed in " - << pg->info.history.same_since - << " (msg from " << m->get_epoch() << ")" << dendl; - pg->unlock(); - continue; - } - } - - // ok, process query! 
- assert(!pg->acting.empty()); - assert(from == pg->acting[0]); - - if (it->second.type == PG::Query::INFO) { - // info - dout(10) << *pg << " sending info" << dendl; - notify_list[from].push_back(pg->info); - } else { - MOSDPGLog *m = new MOSDPGLog(osdmap->get_epoch(), pg->info); - m->missing = pg->missing; - - if (it->second.type == PG::Query::LOG) { - dout(10) << *pg << " sending info+missing+log since split " << it->second.split - << " from floor " << it->second.floor - << dendl; - if (!m->log.copy_after_unless_divergent(pg->log, it->second.split, it->second.floor)) { - dout(10) << *pg << " divergent, sending backlog" << dendl; - it->second.type = PG::Query::BACKLOG; - } - } - - if (it->second.type == PG::Query::BACKLOG) { - dout(10) << *pg << " sending info+missing+backlog" << dendl; - if (pg->log.backlog) { - m->log = pg->log; - } else { - pg->generate_backlog(); - m->log = pg->log; - pg->drop_backlog(); - } - } - else if (it->second.type == PG::Query::FULLLOG) { - dout(10) << *pg << " sending info+missing+full log" << dendl; - m->log.copy_non_backlog(pg->log); - } - - dout(10) << *pg << " sending " << m->log << " " << m->missing << dendl; - //m->log.print(cout); - - _share_map_outgoing(osdmap->get_inst(from)); - messenger->send_message(m, osdmap->get_inst(from)); - } - - pg->unlock(); - } - - do_notifies(notify_list); - - delete m; -} - - -void OSD::handle_pg_remove(MOSDPGRemove *m) -{ - dout(7) << "handle_pg_remove from " << m->get_source() << dendl; - - if (!require_same_or_newer_map(m, m->get_epoch())) return; - - for (set::iterator it = m->pg_list.begin(); - it != m->pg_list.end(); - it++) { - pg_t pgid = *it; - PG *pg; - - if (pg_map.count(pgid) == 0) { - dout(10) << " don't have pg " << pgid << dendl; - continue; - } - - pg = _lookup_lock_pg(pgid); - - dout(10) << *pg << " removing." 
<< dendl; - assert(pg->get_role() == -1); - - _remove_unlock_pg(pg); - } - - delete m; -} - - - - - - -// ========================================================= -// OPS - -void OSD::handle_op(MOSDOp *op) -{ - // throttle? FIXME PROBABLY! - while (pending_ops > g_conf.osd_max_opq) { - dout(10) << "enqueue_op waiting for pending_ops " << pending_ops << " to drop to " << g_conf.osd_max_opq << dendl; - op_queue_cond.Wait(osd_lock); - } - - // get and lock *pg. - const pg_t pgid = op->get_pg(); - PG *pg = _have_pg(pgid) ? _lookup_lock_pg(pgid):0; - - logger->set("buf", buffer_total_alloc); - - utime_t now = g_clock.now(); - - // update qlen stats - stat_oprate.hit(now); - stat_ops++; - stat_qlen += pending_ops; - if (op->get_op() == OSD_OP_READ) { - stat_rd_ops++; - if (op->get_source().is_osd()) { - //derr(-10) << "shed in " << stat_rd_ops_shed_in << " / " << stat_rd_ops << dendl; - stat_rd_ops_shed_in++; - } - } - - // require same or newer map - if (!require_same_or_newer_map(op, op->get_map_epoch())) { - if (pg) pg->unlock(); - return; - } - - // share our map with sender, if they're old - _share_map_incoming(op->get_source_inst(), op->get_map_epoch()); - - - if (!op->get_source().is_osd()) { - // REGULAR OP (non-replication) - - // note original source - op->set_client_inst( op->get_source_inst() ); - op->clear_payload(); // and hose encoded payload (in case we forward) - - // have pg? - if (!pg) { - dout(7) << "hit non-existent pg " - << pgid - << ", waiting" << dendl; - waiting_for_pg[pgid].push_back(op); - return; - } - - // pg must be same-ish... - if (op->is_read()) { - // read - if (!pg->same_for_read_since(op->get_map_epoch())) { - dout(7) << "handle_rep_op pg changed " << pg->info.history - << " after " << op->get_map_epoch() - << ", dropping" << dendl; - assert(op->get_map_epoch() < osdmap->get_epoch()); - pg->unlock(); - delete op; - return; - } - - /* - if (read && op->get_oid().rev > 0) { - // versioned read. hrm. 
- // are we missing a revision that we might need? - object_t moid = op->get_oid(); - if (pick_missing_object_rev(moid, pg)) { - // is there a local revision we might use instead? - object_t loid = op->get_oid(); - if (store->pick_object_revision_lt(loid) && - moid <= loid) { - // we need moid. pull it. - dout(10) << "handle_op read on " << op->get_oid() - << ", have " << loid - << ", but need missing " << moid - << ", pulling" << dendl; - pull(pg, moid); - pg->waiting_for_missing_object[moid].push_back(op); - return; - } - - dout(10) << "handle_op read on " << op->get_oid() - << ", have " << loid - << ", don't need missing " << moid - << dendl; - } - } else { - // live revision. easy. - if (op->get_op() != OSD_OP_PUSH && - waitfor_missing_object(op, pg)) return; - } - */ - - } else { - // modify - if ((pg->get_primary() != whoami || - !pg->same_for_modify_since(op->get_map_epoch()))) { - dout(7) << "handle_rep_op pg changed " << pg->info.history - << " after " << op->get_map_epoch() - << ", dropping" << dendl; - assert(op->get_map_epoch() < osdmap->get_epoch()); - pg->unlock(); - delete op; - return; - } - } - - // pg must be active. - if (!pg->is_active()) { - // replay? - if (op->get_version().version > 0) { - if (op->get_version() > pg->info.last_update) { - dout(7) << *pg << " queueing replay at " << op->get_version() - << " for " << *op << dendl; - pg->replay_queue[op->get_version()] = op; - pg->unlock(); - return; - } else { - dout(7) << *pg << " replay at " << op->get_version() << " <= " << pg->info.last_update - << " for " << *op - << ", will queue for WRNOOP" << dendl; - } - } - - dout(7) << *pg << " not active (yet)" << dendl; - pg->waiting_for_active.push_back(op); - pg->unlock(); - return; - } - - // missing object? 
- if (pg->is_missing_object(op->get_oid())) { - pg->wait_for_missing_object(op->get_oid(), op); - pg->unlock(); - return; - } - - dout(10) << "handle_op " << *op << " in " << *pg << dendl; - - } else { - // REPLICATION OP (it's from another OSD) - - // have pg? - if (!pg) { - derr(-7) << "handle_rep_op " << *op - << " pgid " << pgid << " dne" << dendl; - delete op; - //assert(0); // wtf, shouldn't happen. - return; - } - - // check osd map: same set, or primary+acker? - if (!pg->same_for_rep_modify_since(op->get_map_epoch())) { - dout(10) << "handle_rep_op pg changed " << pg->info.history - << " after " << op->get_map_epoch() - << ", dropping" << dendl; - pg->unlock(); - delete op; - return; - } - - assert(pg->get_role() >= 0); - dout(7) << "handle_rep_op " << op << " in " << *pg << dendl; - } - - // proprocess op? - if (pg->preprocess_op(op, now)) { - pg->unlock(); - return; - } - - if (op->get_op() == OSD_OP_READ) { - Mutex::Locker lock(peer_stat_lock); - stat_rd_ops_in_queue++; - } - - if (g_conf.osd_maxthreads < 1) { - // do it now. - if (op->get_type() == MSG_OSD_OP) - pg->do_op((MOSDOp*)op); - else if (op->get_type() == MSG_OSD_OPREPLY) - pg->do_op_reply((MOSDOpReply*)op); - else - assert(0); - } else { - // queue for worker threads - enqueue_op(pg, op); - } - - pg->unlock(); -} - - -void OSD::handle_op_reply(MOSDOpReply *op) -{ - if (op->get_map_epoch() < boot_epoch) { - dout(3) << "replica op reply from before boot" << dendl; - delete op; - return; - } - - // must be a rep op. - assert(op->get_source().is_osd()); - - // make sure we have the pg - const pg_t pgid = op->get_pg(); - - // require same or newer map - if (!require_same_or_newer_map(op, op->get_map_epoch())) return; - - // share our map with sender, if they're old - _share_map_incoming(op->get_source_inst(), op->get_map_epoch()); - - if (!_have_pg(pgid)) { - // hmm. 
- delete op; - return; - } - - PG *pg = _lookup_lock_pg(pgid); - if (g_conf.osd_maxthreads < 1) { - pg->do_op_reply(op); // do it now - } else { - enqueue_op(pg, op); // queue for worker threads - } - pg->unlock(); -} - - -/* - * enqueue called with osd_lock held - */ -void OSD::enqueue_op(PG *pg, Message *op) -{ - // add to pg's op_queue - pg->op_queue.push_back(op); - pending_ops++; - logger->set("opq", pending_ops); - - // add pg to threadpool queue - pg->get(); // we're exposing the pointer, here. - threadpool->put_op(pg); -} - -/* - * NOTE: dequeue called in worker thread, without osd_lock - */ -void OSD::dequeue_op(PG *pg) -{ - Message *op = 0; - - osd_lock.Lock(); - { - // lock pg and get pending op - pg->lock(); - - assert(!pg->op_queue.empty()); - op = pg->op_queue.front(); - pg->op_queue.pop_front(); - - dout(10) << "dequeue_op " << *op << " pg " << *pg - << ", " << (pending_ops-1) << " more pending" - << dendl; - - // share map? - // do this preemptively while we hold osd_lock and pg->lock - // to avoid lock ordering issues later. 
- for (unsigned i=1; iacting.size(); i++) - _share_map_outgoing( osdmap->get_inst(pg->acting[i]) ); - } - osd_lock.Unlock(); - - // do it - if (op->get_type() == MSG_OSD_OP) - pg->do_op((MOSDOp*)op); // do it now - else if (op->get_type() == MSG_OSD_OPREPLY) - pg->do_op_reply((MOSDOpReply*)op); - else - assert(0); - - // unlock and put pg - pg->put_unlock(); - - // finish - osd_lock.Lock(); - { - dout(10) << "dequeue_op " << op << " finish" << dendl; - assert(pending_ops > 0); - - if (pending_ops > g_conf.osd_max_opq) - op_queue_cond.Signal(); - - pending_ops--; - logger->set("opq", pending_ops); - if (pending_ops == 0 && waiting_for_no_ops) - no_pending_ops.Signal(); - } - osd_lock.Unlock(); -} - - - - -void OSD::wait_for_no_ops() -{ - if (pending_ops > 0) { - dout(7) << "wait_for_no_ops - waiting for " << pending_ops << dendl; - waiting_for_no_ops = true; - while (pending_ops > 0) - no_pending_ops.Wait(osd_lock); - waiting_for_no_ops = false; - assert(pending_ops == 0); - } - dout(7) << "wait_for_no_ops - none" << dendl; -} - - - - diff --git a/branches/sage/mds/osd/OSD.h b/branches/sage/mds/osd/OSD.h deleted file mode 100644 index be6348eceb126..0000000000000 --- a/branches/sage/mds/osd/OSD.h +++ /dev/null @@ -1,366 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __OSD_H -#define __OSD_H - -#include "msg/Dispatcher.h" - -#include "common/Mutex.h" -#include "common/ThreadPool.h" -#include "common/Timer.h" - -#include "mon/MonMap.h" - -#include "ObjectStore.h" -#include "PG.h" - -#include "common/DecayCounter.h" - - -#include -using namespace std; - -#include -#include -using namespace __gnu_cxx; - - -class Messenger; -class Message; -class Logger; -class ObjectStore; -class OSDMap; - -class OSD : public Dispatcher { -public: - // -- states -- - static const int STATE_BOOTING = 1; - static const int STATE_ACTIVE = 2; - static const int STATE_STOPPING = 3; - - - - /** OSD **/ -protected: - Mutex osd_lock; // global lock - SafeTimer timer; // safe timer - - Messenger *messenger; - Logger *logger; - ObjectStore *store; - MonMap *monmap; - - int whoami; - char dev_path[100]; - -public: - int get_nodeid() { return whoami; } - - static object_t get_osdmap_object_name(epoch_t epoch) { return object_t(0,epoch << 1); } - static object_t get_inc_osdmap_object_name(epoch_t epoch) { return object_t(0, (epoch << 1) + 1); } - - -private: - /** superblock **/ - OSDSuperblock superblock; - epoch_t boot_epoch; - - void write_superblock(); - void write_superblock(ObjectStore::Transaction& t); - int read_superblock(); - - - // -- state -- - int state; - -public: - bool is_booting() { return state == STATE_BOOTING; } - bool is_active() { return state == STATE_ACTIVE; } - bool is_stopping() { return state == STATE_STOPPING; } - -private: - - // heartbeat - void heartbeat(); - - class C_Heartbeat : public Context { - OSD *osd; - public: - C_Heartbeat(OSD *o) : osd(o) {} - void finish(int r) { - osd->heartbeat(); - } - }; - - - // -- stats -- - DecayCounter stat_oprate; - int stat_ops; // ops since last heartbeat - int stat_rd_ops; - int stat_rd_ops_shed_in; - int stat_rd_ops_shed_out; - int stat_qlen; // cumulative queue length since last refresh - int stat_rd_ops_in_queue; // in queue - - Mutex peer_stat_lock; - 
osd_peer_stat_t my_stat; - hash_map > peer_stat; - hash_map > my_stat_on_peer; // what the peer thinks of me - - void _refresh_my_stat(utime_t now); - osd_peer_stat_t get_my_stat_for(utime_t now, int peer); - void take_peer_stat(int peer, const osd_peer_stat_t& stat); - - // load calculation - //current implementation is moving averges. - class MovingAverager { - private: - Mutex lock; - deque m_Data; - unsigned m_Size; - double m_Total; - - public: - MovingAverager(unsigned size) : m_Size(size), m_Total(0) { } - - void set_size(unsigned size) { - m_Size = size; - } - - void add(double value) { - Mutex::Locker locker(lock); - - // add item - m_Data.push_back(value); - m_Total += value; - - // trim - while (m_Data.size() > m_Size) { - m_Total -= m_Data.front(); - m_Data.pop_front(); - } - } - - double get_average() { - Mutex::Locker locker(lock); - if (m_Data.empty()) return -1; - return m_Total / (double)m_Data.size(); - } - } read_latency_calc, qlen_calc; - - class IATAverager { - public: - struct iat_data { - double last_req_stamp; - double average_iat; - iat_data() : last_req_stamp(0), average_iat(0) {} - }; - private: - mutable Mutex lock; - double alpha; - hash_map iat_map; - - public: - IATAverager(double a) : alpha(a) {} - - void add_sample(object_t oid, double now) { - Mutex::Locker locker(lock); - iat_data &r = iat_map[oid]; - double iat = now - r.last_req_stamp; - r.last_req_stamp = now; - r.average_iat = r.average_iat*(1.0-alpha) + iat*alpha; - } - - bool have(object_t oid) const { - Mutex::Locker locker(lock); - return iat_map.count(oid); - } - - double get_average_iat(object_t oid) const { - Mutex::Locker locker(lock); - hash_map::const_iterator p = iat_map.find(oid); - assert(p != iat_map.end()); - return p->second.average_iat; - } - - bool is_flash_crowd_candidate(object_t oid) const { - Mutex::Locker locker(lock); - return get_average_iat(oid) <= g_conf.osd_flash_crowd_iat_threshold; - } - }; - - IATAverager iat_averager; - - - // -- waiters -- - 
list finished; - Mutex finished_lock; - - void take_waiters(list& ls) { - finished_lock.Lock(); - finished.splice(finished.end(), ls); - finished_lock.Unlock(); - } - - // -- op queue -- - class ThreadPool *threadpool; - - int pending_ops; - bool waiting_for_no_ops; - Cond no_pending_ops; - Cond op_queue_cond; - - void wait_for_no_ops(); - - void enqueue_op(PG *pg, Message *op); - void dequeue_op(PG *pg); - static void static_dequeueop(OSD *o, PG *pg) { - o->dequeue_op(pg); - }; - - - friend class PG; - friend class ReplicatedPG; - friend class RAID4PG; - - - protected: - - // -- osd map -- - OSDMap *osdmap; - list waiting_for_osdmap; - - hash_map peer_map_epoch; // FIXME types - bool _share_map_incoming(const entity_inst_t& inst, epoch_t epoch); - void _share_map_outgoing(const entity_inst_t& inst); - - void wait_for_new_map(Message *m); - void handle_osd_map(class MOSDMap *m); - - void advance_map(ObjectStore::Transaction& t); - void activate_map(ObjectStore::Transaction& t); - - void get_map(epoch_t e, OSDMap &m); - bool get_map_bl(epoch_t e, bufferlist& bl); - bool get_inc_map_bl(epoch_t e, bufferlist& bl); - bool get_inc_map(epoch_t e, OSDMap::Incremental &inc); - - void send_incremental_map(epoch_t since, const entity_inst_t& inst, bool full); - - - - // -- placement groups -- - hash_map pg_map; - hash_map > waiting_for_pg; - - bool _have_pg(pg_t pgid); - PG *_lookup_lock_pg(pg_t pgid); - PG *_new_lock_pg(pg_t pg); // create new PG (in memory) - PG *_create_lock_pg(pg_t pg, ObjectStore::Transaction& t); // create new PG - void _remove_unlock_pg(PG *pg); // remove from store and memory - - void try_create_pg(pg_t pgid, ObjectStore::Transaction& t); - - void load_pgs(); - void project_pg_history(pg_t pgid, PG::Info::History& h, epoch_t from, - vector& last); - void activate_pg(pg_t pgid, epoch_t epoch); - - class C_Activate : public Context { - OSD *osd; - pg_t pgid; - epoch_t epoch; - public: - C_Activate(OSD *o, pg_t p, epoch_t e) : osd(o), pgid(p), epoch(e) 
{} - void finish(int r) { - osd->activate_pg(pgid, epoch); - } - }; - - - // -- pg stats -- - Mutex pg_stat_queue_lock; - set pg_stat_queue; - - class C_Stats : public Context { - OSD *osd; - public: - C_Stats(OSD *o) : osd(o) {} - void finish(int r) { - osd->send_pg_stats(); - } - }; - void send_pg_stats(); - - - // -- tids -- - // for ops i issue - tid_t last_tid; - - Mutex tid_lock; - tid_t get_tid() { - tid_t t; - tid_lock.Lock(); - t = ++last_tid; - tid_lock.Unlock(); - return t; - } - - - // -- generic pg recovery -- - int num_pulling; - - void do_notifies(map< int, list >& notify_list); - void do_queries(map< int, map >& query_map); - void do_activators(map& activator_map); - void repeer(PG *pg, map< int, map >& query_map); - - bool require_current_map(Message *m, epoch_t v); - bool require_same_or_newer_map(Message *m, epoch_t e); - - void handle_pg_query(class MOSDPGQuery *m); - void handle_pg_notify(class MOSDPGNotify *m); - void handle_pg_log(class MOSDPGLog *m); - void handle_pg_activate_set(class MOSDPGActivateSet *m); - void handle_pg_remove(class MOSDPGRemove *m); - - // helper for handle_pg_log and handle_pg_activate_set - void _process_pg_info(epoch_t epoch, int from, - PG::Info &info, - PG::Log &log, - PG::Missing &missing, - map* activator_map); - - - public: - OSD(int id, Messenger *m, MonMap *mm, char *dev = 0); - ~OSD(); - - // startup/shutdown - int init(); - int shutdown(); - - // messages - virtual void dispatch(Message *m); - virtual void ms_handle_failure(Message *m, const entity_inst_t& inst); - - void handle_osd_ping(class MOSDPing *m); - void handle_op(class MOSDOp *m); - void handle_op_reply(class MOSDOpReply *m); - - void force_remount(); -}; - -#endif diff --git a/branches/sage/mds/osd/OSDMap.h b/branches/sage/mds/osd/OSDMap.h deleted file mode 100644 index fda57d73ef99e..0000000000000 --- a/branches/sage/mds/osd/OSDMap.h +++ /dev/null @@ -1,531 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: 
ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __OSDMAP_H -#define __OSDMAP_H - -/* - * describe properties of the OSD cluster. - * disks, disk groups, total # osds, - * - */ -#include "config.h" -#include "include/types.h" -#include "osd_types.h" -#include "msg/Message.h" -#include "common/Mutex.h" -#include "common/Clock.h" - -#include "crush/crush.h" -using namespace crush; - -#include -#include -#include -#include -using namespace std; - - -/* - * some system constants - */ - -// from LSB to MSB, -#define PG_PS_BITS 16 // max bits for placement seed/group portion of PG -#define PG_REP_BITS 6 // up to 64 replicas -#define PG_TYPE_BITS 2 -#define PG_PS_MASK ((1LL<>1)); -} - -inline int calc_bits_of(int t) { - int b = 0; - while (t) { - t = t >> 1; - b++; - } - return b; -} - - - -/** OSDMap - */ -class OSDMap { - -public: - class Incremental { - public: - epoch_t epoch; // new epoch; we are a diff from epoch-1 to epoch - epoch_t mon_epoch; // monitor epoch (election iteration) - utime_t ctime; - - // full (rare) - bufferlist fullmap; // in leiu of below. 
- bufferlist crush; - - // incremental - map new_up; - map > new_down; - list new_in; - list new_out; - map new_overload; // updated overload value - list old_overload; // no longer overload - - void encode(bufferlist& bl) { - ::_encode(epoch, bl); - ::_encode(mon_epoch, bl); - ::_encode(ctime, bl); - ::_encode(new_up, bl); - ::_encode(new_down, bl); - ::_encode(new_in, bl); - ::_encode(new_out, bl); - ::_encode(new_overload, bl); - ::_encode(fullmap, bl); - ::_encode(crush, bl); - } - void decode(bufferlist& bl, int& off) { - ::_decode(epoch, bl, off); - ::_decode(mon_epoch, bl, off); - ::_decode(ctime, bl, off); - ::_decode(new_up, bl, off); - ::_decode(new_down, bl, off); - ::_decode(new_in, bl, off); - ::_decode(new_out, bl, off); - ::_decode(new_overload, bl, off); - ::_decode(fullmap, bl, off); - ::_decode(crush, bl, off); - } - - Incremental(epoch_t e=0) : epoch(e), mon_epoch(0) {} - }; - -private: - epoch_t epoch; // what epoch of the osd cluster descriptor is this - epoch_t mon_epoch; // monitor epoch (election iteration) - utime_t ctime; // epoch start time - int32_t pg_num; // placement group count - int32_t pg_num_mask; // bitmask for above - int32_t localized_pg_num; // localized place group count - int32_t localized_pg_num_mask; // ditto - - set osds; // all osds - map down_osds; // list of down disks, -> clean shutdown (true/false) - set out_osds; // list of unmapped disks - map overload_osds; - map osd_inst; - - public: - Crush crush; // hierarchical map - - friend class OSDMonitor; - friend class MDS; - - public: - OSDMap() : epoch(0), mon_epoch(0), - pg_num(1<<5), - localized_pg_num(1<<3) { - calc_pg_masks(); - } - - // map info - epoch_t get_epoch() const { return epoch; } - void inc_epoch() { epoch++; } - - void calc_pg_masks() { - pg_num_mask = (1 << calc_bits_of(pg_num-1)) - 1; - localized_pg_num_mask = (1 << calc_bits_of(localized_pg_num-1)) - 1; - } - - int get_pg_num() const { return pg_num; } - void set_pg_num(int m) { pg_num = m; 
calc_pg_masks(); } - int get_localized_pg_num() const { return localized_pg_num; } - - const utime_t& get_ctime() const { return ctime; } - - bool is_mkfs() const { return epoch == 2; } - bool post_mkfs() const { return epoch > 2; } - - /***** cluster state *****/ - int num_osds() { return osds.size(); } - void get_all_osds(set& ls) { ls = osds; } - - const set& get_osds() { return osds; } - const map& get_down_osds() { return down_osds; } - const set& get_out_osds() { return out_osds; } - const map& get_overload_osds() { return overload_osds; } - - bool exists(int osd) { return osds.count(osd); } - bool is_down(int osd) { return down_osds.count(osd); } - bool is_down_clean(int osd) { return down_osds.count(osd) && down_osds[osd]; } - bool is_up(int osd) { return exists(osd) && !is_down(osd); } - bool is_out(int osd) { return out_osds.count(osd); } - bool is_in(int osd) { return exists(osd) && !is_out(osd); } - - bool have_inst(int osd) { - return osd_inst.count(osd); - } - const entity_inst_t& get_inst(int osd) { - assert(osd_inst.count(osd)); - return osd_inst[osd]; - } - bool get_inst(int osd, entity_inst_t& inst) { - if (osd_inst.count(osd)) { - inst = osd_inst[osd]; - return true; - } - return false; - } - - void mark_down(int o, bool clean) { down_osds[o] = clean; } - void mark_up(int o) { down_osds.erase(o); } - void mark_out(int o) { out_osds.insert(o); } - void mark_in(int o) { out_osds.erase(o); } - - - void apply_incremental(Incremental &inc) { - assert(inc.epoch == epoch+1); - epoch++; - mon_epoch = inc.mon_epoch; - ctime = inc.ctime; - - // full map? - if (inc.fullmap.length()) { - decode(inc.fullmap); - return; - } - if (inc.crush.length()) { - int off = 0; - crush._decode(inc.crush, off); - } - - // nope, incremental. 
- for (map >::iterator i = inc.new_down.begin(); - i != inc.new_down.end(); - i++) { - assert(down_osds.count(i->first) == 0); - down_osds[i->first] = i->second.second; - //assert(osd_inst.count(i->first) == 0 || osd_inst[i->first] == i->second.first); - osd_inst.erase(i->first); - //cout << "epoch " << epoch << " down osd" << i->first << endl; - } - for (list::iterator i = inc.new_out.begin(); - i != inc.new_out.end(); - i++) { - assert(out_osds.count(*i) == 0); - out_osds.insert(*i); - //cout << "epoch " << epoch << " out osd" << *i << endl; - } - for (list::iterator i = inc.old_overload.begin(); - i != inc.old_overload.end(); - i++) { - assert(overload_osds.count(*i)); - overload_osds.erase(*i); - } - - for (map::iterator i = inc.new_up.begin(); - i != inc.new_up.end(); - i++) { - assert(down_osds.count(i->first)); - down_osds.erase(i->first); - assert(osd_inst.count(i->first) == 0); - osd_inst[i->first] = i->second; - //cout << "epoch " << epoch << " up osd" << i->first << endl; - } - for (list::iterator i = inc.new_in.begin(); - i != inc.new_in.end(); - i++) { - assert(out_osds.count(*i)); - out_osds.erase(*i); - //cout << "epoch " << epoch << " in osd" << *i << endl; - } - for (map::iterator i = inc.new_overload.begin(); - i != inc.new_overload.end(); - i++) { - overload_osds[i->first] = i->second; - } - } - - // serialize, unserialize - void encode(bufferlist& blist) { - ::_encode(epoch, blist); - ::_encode(mon_epoch, blist); - ::_encode(ctime, blist); - ::_encode(pg_num, blist); - ::_encode(localized_pg_num, blist); - - ::_encode(osds, blist); - ::_encode(down_osds, blist); - ::_encode(out_osds, blist); - ::_encode(overload_osds, blist); - ::_encode(osd_inst, blist); - - crush._encode(blist); - } - - void decode(bufferlist& blist) { - int off = 0; - ::_decode(epoch, blist, off); - ::_decode(mon_epoch, blist, off); - ::_decode(ctime, blist, off); - ::_decode(pg_num, blist, off); - ::_decode(localized_pg_num, blist, off); - calc_pg_masks(); - - 
::_decode(osds, blist, off); - ::_decode(down_osds, blist, off); - ::_decode(out_osds, blist, off); - ::_decode(overload_osds, blist, off); - ::_decode(osd_inst, blist, off); - - crush._decode(blist, off); - } - - - - - /**** mapping facilities ****/ - - // oid -> pg - ObjectLayout file_to_object_layout(object_t oid, FileLayout& layout) { - return make_object_layout(oid, layout.fl_pg_type, layout.fl_pg_size, layout.fl_pg_preferred, layout.fl_object_stripe_unit); - } - - ObjectLayout make_object_layout(object_t oid, int pg_type, int pg_size, int preferred=-1, int object_stripe_unit = 0) { - static crush::Hash H(777); - - int num = preferred >= 0 ? localized_pg_num:pg_num; - int num_mask = preferred >= 0 ? localized_pg_num_mask:pg_num_mask; - - // calculate ps (placement seed) - ps_t ps; - switch (g_conf.osd_object_layout) { - case CEPH_OBJECT_LAYOUT_LINEAR: - ps = stable_mod(oid.bno + oid.ino, num, num_mask); - break; - - case CEPH_OBJECT_LAYOUT_HASHINO: - //ps = stable_mod(oid.bno + H(oid.bno+oid.ino)^H(oid.ino>>32), num, num_mask); - ps = stable_mod(oid.bno + H(oid.ino)^H(oid.ino>>32), num, num_mask); - break; - - case CEPH_OBJECT_LAYOUT_HASH: - //ps = stable_mod(H( (oid.bno & oid.ino) ^ ((oid.bno^oid.ino) >> 32) ), num, num_mask); - //ps = stable_mod(H(oid.bno) + H(oid.ino)^H(oid.ino>>32), num, num_mask); - //ps = stable_mod(oid.bno + H(oid.bno+oid.ino)^H(oid.bno+oid.ino>>32), num, num_mask); - ps = stable_mod(oid.bno + H(oid.ino)^H(oid.ino>>32), num, num_mask); - break; - - default: - assert(0); - } - - //cout << "preferred " << preferred << " num " << num << " mask " << num_mask << " ps " << ps << endl; - - // construct object layout - return ObjectLayout(pg_t(pg_type, pg_size, ps, preferred), - object_stripe_unit); - } - - - // pg -> (osd list) - int pg_to_osds(pg_t pg, - vector& osds) { // list of osd addr's - // map to osds[] - switch (g_conf.osd_pg_layout) { - case CEPH_PG_LAYOUT_CRUSH: - { - // what crush rule? 
- int rule; - if (pg.is_rep()) rule = CRUSH_REP_RULE(pg.size()); - else if (pg.is_raid4()) rule = CRUSH_RAID_RULE(pg.size()); - else assert(0); - - // forcefeed? - int forcefeed = -1; - if (pg.preferred() >= 0 && - out_osds.count(pg.preferred()) == 0) - forcefeed = pg.preferred(); - crush.do_rule(crush.rules[rule], - pg.ps(), - osds, - out_osds, overload_osds, - forcefeed); - } - break; - - case CEPH_PG_LAYOUT_LINEAR: - for (int i=0; i= 0 && - g_conf.osd_pg_layout != CEPH_PG_LAYOUT_CRUSH) { - int osd = pg.preferred(); - - // already in there? - if (osds.empty()) { - osds.push_back(osd); - } else { - assert(pg.size() > 0); - for (int i=1; i (up osd list) - int pg_to_acting_osds(pg_t pg, - vector& osds) { // list of osd addr's - // get rush list - vector raw; - pg_to_osds(pg, raw); - - osds.clear(); - for (unsigned i=0; i primary osd - int get_pg_primary(pg_t pg) { - vector group; - int nrep = pg_to_osds(pg, group); - if (nrep) - return group[0]; - return -1; // we fail! - } - - // pg -> acting primary osd - int get_pg_acting_primary(pg_t pg) { - vector group; - int nrep = pg_to_acting_osds(pg, group); - if (nrep > 0) - return group[0]; - return -1; // we fail! - } - int get_pg_acting_tail(pg_t pg) { - vector group; - int nrep = pg_to_acting_osds(pg, group); - if (nrep > 0) - return group[group.size()-1]; - return -1; // we fail! - } - - - /* what replica # is a given osd? 0 primary, -1 for none. */ - int calc_pg_rank(int osd, vector& acting, int nrep=0) { - if (!nrep) nrep = acting.size(); - for (int i=0; i& acting, int nrep=0) { - if (!nrep) nrep = acting.size(); - int rank = calc_pg_rank(osd, acting, nrep); - - if (rank < 0) return PG_ROLE_STRAY; - else if (rank == 0) return PG_ROLE_HEAD; - else if (rank == 1) return PG_ROLE_ACKER; - else return PG_ROLE_MIDDLE; - } - - int get_pg_role(pg_t pg, int osd) { - vector group; - int nrep = pg_to_osds(pg, group); - return calc_pg_role(osd, group, nrep); - } - - /* rank is -1 (stray), 0 (primary), 1,2,3,... 
(replica) */ - int get_pg_acting_rank(pg_t pg, int osd) { - vector group; - int nrep = pg_to_acting_osds(pg, group); - return calc_pg_rank(osd, group, nrep); - } - /* role is -1 (stray), 0 (primary), 1 (replica) */ - int get_pg_acting_role(pg_t pg, int osd) { - vector group; - int nrep = pg_to_acting_osds(pg, group); - return calc_pg_role(osd, group, nrep); - } - - - - -}; - - -#endif diff --git a/branches/sage/mds/osd/ObjectStore.cc b/branches/sage/mds/osd/ObjectStore.cc deleted file mode 100644 index 7aeab1d063d4d..0000000000000 --- a/branches/sage/mds/osd/ObjectStore.cc +++ /dev/null @@ -1,152 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "ObjectStore.h" - -#include "config.h" -#include "common/Clock.h" - -#define dout(x) if (x < g_conf.debug) *_dout << dbeginl << g_clock.now() << " ager: " - -object_t ObjectStore::age_get_oid() { - if (!age_free_oids.empty()) { - object_t o = age_free_oids.front(); - age_free_oids.pop_front(); - return o; - } - return age_cur_oid++; - } - - ssize_t ObjectStore::age_pick_size() { - ssize_t max = file_size_distn.sample() * 1024; - return max/2 + (rand() % 100) * max/200 + 1; - } - - void ObjectStore::age_fill(float pc, utime_t until) { - bufferptr bp(1024*1024); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - while (1) { - if (g_clock.now() > until) break; - - struct statfs st; - statfs(&st); - float a = (float)(st.f_blocks-st.f_bavail) / (float)st.f_blocks; - if (a >= pc) { - dout(10) << "age_fill at " << a << " / " << pc << " stopping" << dendl; - break; - } - - object_t oid = age_get_oid(); - - int b = rand() % 10; - age_objects[b].push_back(oid); - - ssize_t s = age_pick_size(); - - dout(10) << "age_fill at " << a << " / " << pc << " creating " << hex << oid << dec << " sz " << s << dendl; - - off_t off = 0; - while (s) { - ssize_t t = MIN(s, 1024*1024); - write(oid, t, off, bl, false); - off += t; - s -= t; - } - oid++; - } - } - - void 
ObjectStore::age_empty(float pc) { - int nper = 20; - int n = nper; - while (1) { - struct statfs st; - statfs(&st); - float a = (float)(st.f_blocks-st.f_bavail) / (float)st.f_blocks; - if (a <= pc) { - dout(10) << "age_empty at " << a << " / " << pc << " stopping" << dendl; - break; - } - - int b = rand() % 10; - n--; - if (n == 0 || age_objects[b].empty()) { - dout(10) << "age_empty sync" << dendl; - //sync(); - sync(); - n = nper; - continue; - } - object_t oid = age_objects[b].front(); - age_objects[b].pop_front(); - - dout(10) << "age_empty at " << a << " / " << pc << " removing " << hex << oid << dec << dendl; - - remove(oid); - age_free_oids.push_back(oid); - } - } - - - void ObjectStore::age(int time, - float high_water, // fill to this % - float low_water, // then empty to this % - int count, // this many times - float final_water, // and end here ( <= low_water) - int fake_size_mb) { - utime_t until = g_clock.now(); - until.sec_ref() += time; - - while (age_objects.size() < 10) age_objects.push_back( list() ); - - if (fake_size_mb) { - int fake_bl = fake_size_mb * 256; - struct statfs st; - statfs(&st); - float f = (float)fake_bl / (float)st.f_blocks; - high_water = (float)high_water * f; - low_water = (float)low_water * f; - final_water = (float)final_water * f; - dout(10) << "fake " << fake_bl << " / " << st.f_blocks << " is " << f << ", high " << high_water << " low " << low_water << " final " << final_water << dendl; - } - - // init size distn (once) - if (!did_distn) { - did_distn = true; - age_cur_oid = 1; - file_size_distn.add(1, 19.0758125+0.65434375); - file_size_distn.add(512, 35.6566); - file_size_distn.add(1024, 27.7271875); - file_size_distn.add(2*1024, 16.63503125); - //file_size_distn.add(4*1024, 106.82384375); - //file_size_distn.add(8*1024, 81.493375); - //file_size_distn.add(16*1024, 14.13553125); - //file_size_distn.add(32*1024, 2.176); - //file_size_distn.add(256*1024, 0.655938); - //file_size_distn.add(512*1024, 0.1480625); - 
//file_size_distn.add(1*1024*1024, 0.020125); // actually 2, but 32bit - file_size_distn.normalize(); - } - - // clear - for (int i=0; i<10; i++) - age_objects[i].clear(); - - for (int c=1; c<=count; c++) { - if (g_clock.now() > until) break; - - dout(1) << "age " << c << "/" << count << " filling to " << high_water << dendl; - age_fill(high_water, until); - if (c == count) { - dout(1) << "age final empty to " << final_water << dendl; - age_empty(final_water); - } else { - dout(1) << "age " << c << "/" << count << " emptying to " << low_water << dendl; - age_empty(low_water); - } - } - dout(1) << "age finished" << dendl; - } - diff --git a/branches/sage/mds/osd/ObjectStore.h b/branches/sage/mds/osd/ObjectStore.h deleted file mode 100644 index c8df5d8218fed..0000000000000 --- a/branches/sage/mds/osd/ObjectStore.h +++ /dev/null @@ -1,611 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __OBJECTSTORE_H -#define __OBJECTSTORE_H - -#include "include/types.h" -#include "osd_types.h" -#include "include/Context.h" -#include "include/buffer.h" - -#include "include/Distribution.h" - -#include - -#ifdef DARWIN -#include -#else -#include /* or */ -#endif /* DARWIN */ - -#include -using std::list; - -#ifndef MIN -# define MIN(a,b) ((a) < (b) ? 
(a):(b)) -#endif - -/* - * low-level interface to the local OSD file system - */ - - - -class ObjectStore { -public: - - - class FragmentationStat { - public: - int total; - int num_extent; - int avg_extent; - map extent_dist; // powers of two - map extent_dist_sum; // powers of two - - float avg_extent_per_object; - int avg_extent_jump; // avg distance bweteen consecutive extents - - int total_free; - int num_free_extent; - int avg_free_extent; - map free_extent_dist; // powers of two - map free_extent_dist_sum; // powers of two - }; - - - - /********************************* - * transaction - */ - class Transaction { - public: - static const int OP_READ = 1; // oid, offset, len, pbl - static const int OP_STAT = 2; // oid, pstat - static const int OP_GETATTR = 3; // oid, attrname, pattrval - static const int OP_GETATTRS = 4; // oid, pattrset - - static const int OP_WRITE = 10; // oid, offset, len, bl - static const int OP_TRUNCATE = 11; // oid, len - static const int OP_REMOVE = 13; // oid - static const int OP_SETATTR = 14; // oid, attrname, attrval - static const int OP_SETATTRS = 15; // oid, attrset - static const int OP_RMATTR = 16; // oid, attrname - static const int OP_CLONE = 17; // oid, newoid - - static const int OP_TRIMCACHE = 18; // oid, offset, len - - static const int OP_MKCOLL = 20; // cid - static const int OP_RMCOLL = 21; // cid - static const int OP_COLL_ADD = 22; // cid, oid - static const int OP_COLL_REMOVE = 23; // cid, oid - static const int OP_COLL_SETATTR = 24; // cid, attrname, attrval - static const int OP_COLL_RMATTR = 25; // cid, attrname - - private: - list ops; - list bls; - list oids; - list cids; - list lengths; - list attrnames; - list attrnames2; - - // for reads only (not encoded) - list pbls; - list psts; - list< pair > pattrvals; - list< map* > pattrsets; - - public: - bool have_op() { - return !ops.empty(); - } - int get_num_ops() { return ops.size(); } - int get_op() { - int op = ops.front(); - ops.pop_front(); - return op; - 
} - void get_bl(bufferlist& bl) { - bl.claim(bls.front()); - bls.pop_front(); - } - void get_oid(object_t& oid) { - oid = oids.front(); - oids.pop_front(); - } - void get_cid(coll_t& cid) { - cid = cids.front(); - cids.pop_front(); - } - void get_length(off_t& len) { - len = lengths.front(); - lengths.pop_front(); - } - void get_attrname(const char * &p) { - p = attrnames.front(); - attrnames.pop_front(); - } - void get_pbl(bufferlist* &pbl) { - pbl = pbls.front(); - pbls.pop_front(); - } - void get_pstat(struct stat* &pst) { - pst = psts.front(); - psts.pop_front(); - } - void get_pattrval(pair& p) { - p = pattrvals.front(); - pattrvals.pop_front(); - } - void get_pattrset(map* &ps) { - ps = pattrsets.front(); - pattrsets.pop_front(); - } - - - void read(object_t oid, off_t off, size_t len, bufferlist *pbl) { - int op = OP_READ; - ops.push_back(op); - oids.push_back(oid); - lengths.push_back(off); - lengths.push_back(len); - pbls.push_back(pbl); - } - void stat(object_t oid, struct stat *st) { - int op = OP_STAT; - ops.push_back(op); - oids.push_back(oid); - psts.push_back(st); - } - void getattr(object_t oid, const char* name, void* val, int *plen) { - int op = OP_GETATTR; - ops.push_back(op); - oids.push_back(oid); - attrnames.push_back(name); - pattrvals.push_back(pair(val,plen)); - } - void getattrs(object_t oid, map& aset) { - int op = OP_GETATTRS; - ops.push_back(op); - oids.push_back(oid); - pattrsets.push_back(&aset); - } - - void write(object_t oid, off_t off, size_t len, const bufferlist& bl) { - int op = OP_WRITE; - ops.push_back(op); - oids.push_back(oid); - lengths.push_back(off); - lengths.push_back(len); - bls.push_back(bl); - } - void trim_from_cache(object_t oid, off_t off, size_t len) { - int op = OP_TRIMCACHE; - ops.push_back(op); - oids.push_back(oid); - lengths.push_back(off); - lengths.push_back(len); - } - void truncate(object_t oid, off_t off) { - int op = OP_TRUNCATE; - ops.push_back(op); - oids.push_back(oid); - lengths.push_back(off); - 
} - void remove(object_t oid) { - int op = OP_REMOVE; - ops.push_back(op); - oids.push_back(oid); - } - void setattr(object_t oid, const char* name, const void* val, int len) { - int op = OP_SETATTR; - ops.push_back(op); - oids.push_back(oid); - attrnames.push_back(name); - //attrvals.push_back(pair(val,len)); - bufferlist bl; - bl.append((char*)val,len); - bls.push_back(bl); - } - void setattrs(object_t oid, map& attrset) { - int op = OP_SETATTRS; - ops.push_back(op); - oids.push_back(oid); - pattrsets.push_back(&attrset); - } - void rmattr(object_t oid, const char* name) { - int op = OP_RMATTR; - ops.push_back(op); - oids.push_back(oid); - attrnames.push_back(name); - } - void clone(object_t oid, object_t noid) { - int op = OP_CLONE; - ops.push_back(op); - oids.push_back(oid); - oids.push_back(noid); - } - void create_collection(coll_t cid) { - int op = OP_MKCOLL; - ops.push_back(op); - cids.push_back(cid); - } - void remove_collection(coll_t cid) { - int op = OP_RMCOLL; - ops.push_back(op); - cids.push_back(cid); - } - void collection_add(coll_t cid, object_t oid) { - int op = OP_COLL_ADD; - ops.push_back(op); - cids.push_back(cid); - oids.push_back(oid); - } - void collection_remove(coll_t cid, object_t oid) { - int op = OP_COLL_REMOVE; - ops.push_back(op); - cids.push_back(cid); - oids.push_back(oid); - } - void collection_setattr(coll_t cid, const char* name, const void* val, int len) { - int op = OP_COLL_SETATTR; - ops.push_back(op); - cids.push_back(cid); - attrnames.push_back(name); - bufferlist bl; - bl.append((char*)val, len); - bls.push_back(bl); - } - void collection_rmattr(coll_t cid, const char* name) { - int op = OP_COLL_RMATTR; - ops.push_back(op); - cids.push_back(cid); - attrnames.push_back(name); - } - - // etc. 
- - void _encode(bufferlist& bl) { - ::_encode(ops, bl); - ::_encode(bls, bl); - ::_encode(oids, bl); - ::_encode(cids, bl); - ::_encode(lengths, bl); - ::_encode(attrnames, bl); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(ops, bl, off); - ::_decode(bls, bl, off); - ::_decode(oids, bl, off); - ::_decode(cids, bl, off); - ::_decode(lengths, bl, off); - ::_decode(attrnames2, bl, off); - for (list::iterator p = attrnames2.begin(); - p != attrnames2.end(); - ++p) - attrnames.push_back((*p).c_str()); - } - }; - - - - /* this implementation is here only for naive ObjectStores that - * do not do atomic transactions natively. it is not atomic. - */ - virtual unsigned apply_transaction(Transaction& t, Context *onsafe=0) { - // non-atomic implementation - while (t.have_op()) { - int op = t.get_op(); - switch (op) { - case Transaction::OP_READ: - { - object_t oid; - off_t offset, len; - t.get_oid(oid); - t.get_length(offset); - t.get_length(len); - bufferlist *pbl; - t.get_pbl(pbl); - read(oid, offset, len, *pbl); - } - break; - case Transaction::OP_STAT: - { - object_t oid; - t.get_oid(oid); - struct stat *st; - t.get_pstat(st); - stat(oid, st); - } - break; - case Transaction::OP_GETATTR: - { - object_t oid; - t.get_oid(oid); - const char *attrname; - t.get_attrname(attrname); - pair pattrval; - t.get_pattrval(pattrval); - *pattrval.second = getattr(oid, attrname, pattrval.first, *pattrval.second); - } - break; - case Transaction::OP_GETATTRS: - { - object_t oid; - t.get_oid(oid); - map *pset; - t.get_pattrset(pset); - getattrs(oid, *pset); - } - break; - - case Transaction::OP_WRITE: - { - object_t oid; - t.get_oid(oid); - off_t offset, len; - t.get_length(offset); - t.get_length(len); - bufferlist bl; - t.get_bl(bl); - write(oid, offset, len, bl, 0); - } - break; - - case Transaction::OP_TRIMCACHE: - { - object_t oid; - t.get_oid(oid); - off_t offset, len; - t.get_length(offset); - t.get_length(len); - trim_from_cache(oid, offset, len); - } - break; - - 
case Transaction::OP_TRUNCATE: - { - object_t oid; - t.get_oid(oid); - off_t len; - t.get_length(len); - truncate(oid, len, 0); - } - break; - - case Transaction::OP_REMOVE: - { - object_t oid; - t.get_oid(oid); - remove(oid, 0); - } - break; - - case Transaction::OP_SETATTR: - { - object_t oid; - t.get_oid(oid); - const char *attrname; - t.get_attrname(attrname); - bufferlist bl; - t.get_bl(bl); - setattr(oid, attrname, bl.c_str(), bl.length(), 0); - } - break; - case Transaction::OP_SETATTRS: - { - object_t oid; - t.get_oid(oid); - map *pattrset; - t.get_pattrset(pattrset); - setattrs(oid, *pattrset, 0); - } - break; - - case Transaction::OP_RMATTR: - { - object_t oid; - t.get_oid(oid); - const char *attrname; - t.get_attrname(attrname); - rmattr(oid, attrname, 0); - } - break; - - case Transaction::OP_CLONE: - { - object_t oid; - t.get_oid(oid); - object_t noid; - t.get_oid(noid); - clone(oid, noid); - } - break; - - case Transaction::OP_MKCOLL: - { - coll_t cid; - t.get_cid(cid); - create_collection(cid, 0); - } - break; - - case Transaction::OP_RMCOLL: - { - coll_t cid; - t.get_cid(cid); - destroy_collection(cid, 0); - } - break; - - case Transaction::OP_COLL_ADD: - { - coll_t cid; - t.get_cid(cid); - object_t oid; - t.get_oid(oid); - collection_add(cid, oid, 0); - } - break; - - case Transaction::OP_COLL_REMOVE: - { - coll_t cid; - t.get_cid(cid); - object_t oid; - t.get_oid(oid); - collection_remove(cid, oid, 0); - } - break; - - case Transaction::OP_COLL_SETATTR: - { - coll_t cid; - t.get_cid(cid); - const char *attrname; - t.get_attrname(attrname); - bufferlist bl; - t.get_bl(bl); - collection_setattr(cid, attrname, bl.c_str(), bl.length(), 0); - } - break; - - case Transaction::OP_COLL_RMATTR: - { - coll_t cid; - t.get_cid(cid); - const char *attrname; - t.get_attrname(attrname); - collection_rmattr(cid, attrname, 0); - } - break; - - - default: - cerr << "bad op " << op << std::endl; - assert(0); - } - } - - if (onsafe) sync(onsafe); - - return 0; // 
FIXME count errors - } - - /*********************************************/ - - - - public: - ObjectStore() {} - virtual ~ObjectStore() {} - - // mgmt - virtual int mount() = 0; - virtual int umount() = 0; - virtual int mkfs() = 0; // wipe - - virtual int statfs(struct statfs *buf) = 0; - - // objects - virtual int pick_object_revision_lt(object_t& oid) = 0; - - virtual bool exists(object_t oid) = 0; // useful? - virtual int stat(object_t oid, struct stat *st) = 0; // struct stat? - - virtual int remove(object_t oid, - Context *onsafe=0) = 0; - - virtual int truncate(object_t oid, off_t size, - Context *onsafe=0) = 0; - - virtual int read(object_t oid, - off_t offset, size_t len, - bufferlist& bl) = 0; - virtual int write(object_t oid, - off_t offset, size_t len, - const bufferlist& bl, - Context *onsafe) = 0;//{ return -1; } - virtual void trim_from_cache(object_t oid, - off_t offset, size_t len) { } - virtual int is_cached(object_t oid, - off_t offset, - size_t len) { return -1; } - - virtual int setattr(object_t oid, const char *name, - const void *value, size_t size, - Context *onsafe=0) {return 0;} //= 0; - virtual int setattrs(object_t oid, map& aset, - Context *onsafe=0) {return 0;} //= 0; - virtual int getattr(object_t oid, const char *name, - void *value, size_t size) {return 0;} //= 0; - virtual int getattrs(object_t oid, map& aset) {return 0;}; - - virtual int rmattr(object_t oid, const char *name, - Context *onsafe=0) {return 0;} - - virtual int clone(object_t oid, object_t noid) { - return -1; - } - - virtual int list_objects(list& ls) = 0;//{ return -1; } - - virtual int get_object_collections(object_t oid, set& ls) { return -1; } - - //virtual int listattr(object_t oid, char *attrs, size_t size) {return 0;} //= 0; - - // collections - virtual int list_collections(list& ls) {return 0;}//= 0; - virtual int create_collection(coll_t c, - Context *onsafe=0) {return 0;}//= 0; - virtual int destroy_collection(coll_t c, - Context *onsafe=0) {return 0;}//= 0; 
- virtual bool collection_exists(coll_t c) {return 0;} - virtual int collection_stat(coll_t c, struct stat *st) {return 0;}//= 0; - virtual int collection_add(coll_t c, object_t o, - Context *onsafe=0) {return 0;}//= 0; - virtual int collection_remove(coll_t c, object_t o, - Context *onsafe=0) {return 0;}// = 0; - virtual int collection_list(coll_t c, list& o) {return 0;}//= 0; - - virtual int collection_setattr(coll_t cid, const char *name, - const void *value, size_t size, - Context *onsafe=0) {return 0;} //= 0; - virtual int collection_rmattr(coll_t cid, const char *name, - Context *onsafe=0) {return 0;} //= 0; - virtual int collection_getattr(coll_t cid, const char *name, - void *value, size_t size) {return 0;} //= 0; - - virtual int collection_getattrs(coll_t cid, map &aset) = 0;//{ return -1; } - virtual int collection_setattrs(coll_t cid, map &aset) = 0;//{ return -1; } - - - //virtual int collection_listattr(coll_t cid, char *attrs, size_t size) {return 0;} //= 0; - - virtual void sync(Context *onsync) {} - virtual void sync() {} - - - virtual void _fake_writes(bool b) {}; - - virtual void _get_frag_stat(FragmentationStat& st) {}; - -}; - - -#endif diff --git a/branches/sage/mds/osd/PG.cc b/branches/sage/mds/osd/PG.cc deleted file mode 100644 index 5b55c9a88e1de..0000000000000 --- a/branches/sage/mds/osd/PG.cc +++ /dev/null @@ -1,1289 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "PG.h" -#include "config.h" -#include "OSD.h" - -#include "common/Timer.h" - -#include "messages/MOSDOp.h" -#include "messages/MOSDPGNotify.h" -#include "messages/MOSDPGLog.h" -#include "messages/MOSDPGRemove.h" -#include "messages/MOSDPGActivateSet.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) *_dout << dbeginl << g_clock.now() << " osd" << osd->whoami << " " << (osd->osdmap ? osd->osdmap->get_epoch():0) << " " << *this << " " - - -/******* PGLog ********/ - -void PG::Log::copy_after(const Log &other, eversion_t v) -{ - assert(v >= other.bottom); - top = bottom = other.top; - for (list::const_reverse_iterator i = other.log.rbegin(); - i != other.log.rend(); - i++) { - if (i->version == v) break; - assert(i->version > v); - log.push_front(*i); - } - bottom = v; -} - -bool PG::Log::copy_after_unless_divergent(const Log &other, eversion_t split, eversion_t floor) -{ - assert(split >= other.bottom); - assert(floor >= other.bottom); - assert(floor <= split); - top = bottom = other.top; - - /* runs on replica. split is primary's log.top. floor is how much they want. - split tell us if the primary is divergent.. e.g.: - -> i am A, B is primary, split is 2'6, floor is 2'2. -A B C -2'2 2'2 -2'3 2'3 2'3 -2'4 2'4 2'4 -3'5 | 2'5 2'5 -3'6 | 2'6 -3'7 | -3'8 | -3'9 | - -> i return full backlog. - */ - - for (list::const_reverse_iterator i = other.log.rbegin(); - i != other.log.rend(); - i++) { - // is primary divergent? - // e.g. my 3'6 vs their 2'6 split - if (i->version.version == split.version && i->version.epoch > split.epoch) { - clear(); - return false; // divergent! - } - if (i->version == floor) break; - assert(i->version > floor); - - // e.g. 
my 2'23 > '12 - log.push_front(*i); - } - bottom = floor; - return true; -} - -void PG::Log::copy_non_backlog(const Log &other) -{ - if (other.backlog) { - top = other.top; - bottom = other.bottom; - for (list::const_reverse_iterator i = other.log.rbegin(); - i != other.log.rend(); - i++) - if (i->version > bottom) - log.push_front(*i); - else - break; - } else { - *this = other; - } -} - - - -void PG::IndexedLog::trim(ObjectStore::Transaction& t, eversion_t s) -{ - if (backlog && s < bottom) - s = bottom; - - while (!log.empty()) { - Entry &e = *log.begin(); - - if (e.version > s) break; - - assert(complete_to != log.begin()); - assert(requested_to != log.begin()); - - // remove from index, - unindex(e); - - // from log - log.pop_front(); - } - - // raise bottom? - if (backlog) backlog = false; - if (bottom < s) bottom = s; -} - - -void PG::IndexedLog::trim_write_ahead(eversion_t last_update) -{ - while (!log.empty() && - log.rbegin()->version > last_update) { - // remove from index - unindex(*log.rbegin()); - - // remove - log.pop_back(); - } -} - -void PG::trim_write_ahead() -{ - if (info.last_update < log.top) { - dout(10) << "trim_write_ahead (" << info.last_update << "," << log.top << "]" << dendl; - log.trim_write_ahead(info.last_update); - } else { - assert(info.last_update == log.top); - dout(10) << "trim_write_ahead last_update=top=" << info.last_update << dendl; - } - -} - -void PG::proc_replica_log(Log &olog, Missing& omissing, int from) -{ - dout(10) << "proc_replica_log for osd" << from << ": " << olog << " " << omissing << dendl; - assert(!is_active()); - - if (!have_master_log) { - // i'm building master log. - // note peer's missing. - peer_missing[from] = omissing; - - // merge log into our own log - merge_log(olog, omissing, from); - proc_missing(olog, omissing, from); - } else { - // i'm just building missing lists. - peer_missing[from] = omissing; - - // iterate over peer log. in reverse. 
- list::reverse_iterator pp = olog.log.rbegin(); - eversion_t lu = peer_info[from].last_update; - while (pp != olog.log.rend()) { - if (!log.objects.count(pp->oid)) { - dout(10) << " divergent " << *pp << " not in our log, generating backlog" << dendl; - generate_backlog(); - } - - if (!log.objects.count(pp->oid)) { - dout(10) << " divergent " << *pp << " dne, must have been new, ignoring" << dendl; - ++pp; - continue; - } - - if (log.objects[pp->oid]->version == pp->version) { - break; // we're no longer divergent. - //++pp; - //continue; - } - - if (log.objects[pp->oid]->version > pp->version) { - dout(10) << " divergent " << *pp - << " superceded by " << log.objects[pp->oid] - << ", ignoring" << dendl; - } else { - dout(10) << " divergent " << *pp << ", adding to missing" << dendl; - peer_missing[from].add(pp->oid, pp->version); - } - - ++pp; - if (pp != olog.log.rend()) - lu = pp->version; - else - lu = olog.bottom; - } - - if (lu < peer_info[from].last_update) { - dout(10) << " peer osd" << from << " last_update now " << lu << dendl; - peer_info[from].last_update = lu; - if (lu < oldest_update) { - dout(10) << " oldest_update now " << lu << dendl; - oldest_update = lu; - } - } - - proc_missing(olog, peer_missing[from], from); - } -} - -void PG::merge_log(Log &olog, Missing &omissing, int fromosd) -{ - dout(10) << "merge_log " << olog << " from osd" << fromosd - << " into " << log << dendl; - - //dout(0) << "log" << dendl; - //log.print(cout); - //dout(0) << "olog" << dendl; - //olog.print(cout); - - if (log.empty() || - (olog.bottom > log.top && olog.backlog)) { // e.g. log=(0,20] olog=(40,50]+backlog) - - // swap and index - log.log.swap(olog.log); - log.index(); - - // find split point (old log.top) in new log - // add new items to missing along the way. - for (list::reverse_iterator p = log.log.rbegin(); - p != log.log.rend(); - p++) { - if (p->version <= log.top) { - // ok, p is at split point. - - // was our old log divergent? 
- if (log.top > p->version) { - dout(10) << "merge_log i was (possibly) divergent for (" << p->version << "," << log.top << "]" << dendl; - if (p->version < oldest_update) - oldest_update = p->version; - - while (!olog.log.empty() && - olog.log.rbegin()->version > p->version) { - Log::Entry &oe = *olog.log.rbegin(); // old entry (possibly divergent) - if (log.objects.count(oe.oid)) { - if (log.objects[oe.oid]->version < oe.version) { - dout(10) << "merge_log divergent entry " << oe - << " not superceded by " << *log.objects[oe.oid] - << ", adding to missing" << dendl; - missing.add(oe.oid, oe.version); - } else { - dout(10) << "merge_log divergent entry " << oe - << " superceded by " << *log.objects[oe.oid] - << ", ignoring" << dendl; - } - } else { - dout(10) << "merge_log divergent entry " << oe << ", adding to missing" << dendl; - missing.add(oe.oid, oe.version); - } - olog.log.pop_back(); // discard divergent entry - } - } - break; - } - - if (p->is_delete()) { - dout(10) << "merge_log merging " << *p << ", not missing" << dendl; - missing.rm(p->oid, p->version); - } else { - dout(10) << "merge_log merging " << *p << ", now missing" << dendl; - missing.add(p->oid, p->version); - } - } - - info.last_update = log.top = olog.top; - info.log_bottom = log.bottom = olog.bottom; - info.log_backlog = log.backlog = olog.backlog; - } - - else { - // i can merge the two logs! - - // extend on bottom? - // FIXME: what if we have backlog, but they have lower bottom? - if (olog.bottom < log.bottom && olog.top >= log.bottom && !log.backlog) { - dout(10) << "merge_log extending bottom to " << olog.bottom - << (olog.backlog ? " +backlog":"") - << dendl; - - // ok - list::iterator from = olog.log.begin(); - list::iterator to; - for (to = from; - to != olog.log.end(); - to++) { - if (to->version > log.bottom) break; - - // update our index while we're here - log.index(*to); - - dout(15) << *to << dendl; - - // new missing object? 
- if (to->version > info.last_complete) { - if (to->is_update()) - missing.add(to->oid, to->version); - else - missing.rm(to->oid, to->version); - } - } - assert(to != olog.log.end()); - - // splice into our log. - log.log.splice(log.log.begin(), - olog.log, from, to); - - info.log_bottom = log.bottom = olog.bottom; - info.log_backlog = log.backlog = olog.backlog; - } - - // extend on top? - if (olog.top > log.top && - olog.bottom <= log.top) { - dout(10) << "merge_log extending top to " << olog.top << dendl; - - list::iterator to = olog.log.end(); - list::iterator from = olog.log.end(); - while (1) { - if (from == olog.log.begin()) break; - from--; - //dout(0) << "? " << *from << dendl; - if (from->version < log.top) { - from++; - break; - } - - log.index(*from); - dout(10) << "merge_log " << *from << dendl; - - // add to missing - if (from->is_update()) { - missing.add(from->oid, from->version); - } else - missing.rm(from->oid, from->version); - } - - // remove divergent items - while (1) { - Log::Entry *oldtail = &(*log.log.rbegin()); - if (oldtail->version.version+1 == from->version.version) break; - - // divergent! - assert(oldtail->version.version >= from->version.version); - - if (log.objects[oldtail->oid]->version == oldtail->version) { - // and significant. - dout(10) << "merge_log had divergent " << *oldtail << ", adding to missing" << dendl; - //missing.add(oldtail->oid); - assert(0); - } else { - dout(10) << "merge_log had divergent " << *oldtail << ", already missing" << dendl; - assert(missing.is_missing(oldtail->oid)); - } - log.log.pop_back(); - } - - // splice - log.log.splice(log.log.end(), - olog.log, from, to); - - info.last_update = log.top = olog.top; - } - } - - dout(10) << "merge_log result " << log << " " << missing << dendl; - //log.print(cout); - -} - -void PG::proc_missing(Log &olog, Missing &omissing, int fromosd) -{ - // found items? 
- for (map::iterator p = missing.missing.begin(); - p != missing.missing.end(); - p++) { - if (omissing.is_missing(p->first)) { - assert(omissing.is_missing(p->first, p->second)); - if (omissing.loc.count(p->first)) { - dout(10) << "proc_missing missing " << p->first << " " << p->second - << " on osd" << omissing.loc[p->first] << dendl; - missing.loc[p->first] = omissing.loc[p->first]; - } else { - dout(10) << "proc_missing missing " << p->first << " " << p->second - << " also LOST on source, osd" << fromosd << dendl; - } - } - else if (p->second <= olog.top) { - dout(10) << "proc_missing missing " << p->first << " " << p->second - << " on source, osd" << fromosd << dendl; - missing.loc[p->first] = fromosd; - } else { - dout(10) << "proc_missing " << p->first << " " << p->second - << " > olog.top " << olog.top << ", not found...." - << dendl; - } - } - - dout(10) << "proc_missing missing " << missing.missing << dendl; -} - - - -void PG::generate_backlog() -{ - dout(10) << "generate_backlog to " << log << dendl; - assert(!log.backlog); - log.backlog = true; - - list olist; - osd->store->collection_list(info.pgid, olist); - - int local = 0; - map add; - for (list::iterator it = olist.begin(); - it != olist.end(); - it++) { - local++; - - if (log.logged_object(*it)) continue; // already have it logged. - - // add entry - Log::Entry e; - e.op = Log::Entry::MODIFY; // FIXME when we do smarter op codes! 
- e.oid = *it; - osd->store->getattr(*it, - "version", - &e.version, sizeof(e.version)); - add[e.version] = e; - dout(10) << "generate_backlog found " << e << dendl; - } - - for (map::reverse_iterator i = add.rbegin(); - i != add.rend(); - i++) { - log.log.push_front(i->second); - log.index( *log.log.begin() ); // index - } - - dout(10) << local << " local objects, " - << add.size() << " objects added to backlog, " - << log.objects.size() << " in pg" << dendl; - - //log.print(cout); -} - -void PG::drop_backlog() -{ - dout(10) << "drop_backlog for " << log << dendl; - //log.print(cout); - - assert(log.backlog); - log.backlog = false; - - while (!log.log.empty()) { - Log::Entry &e = *log.log.begin(); - if (e.version > log.bottom) break; - - dout(15) << "drop_backlog trimming " << e.version << dendl; - log.unindex(e); - log.log.pop_front(); - } -} - - - - - -ostream& PG::Log::print(ostream& out) const -{ - out << *this << dendl; - for (list::const_iterator p = log.begin(); - p != log.end(); - p++) - out << *p << dendl; - return out; -} - - - - - -/******* PG ***********/ -void PG::build_prior() -{ - // build prior set. 
- prior_set.clear(); - - // current - for (unsigned i=1; iosdmap->get_epoch(); - epoch++) { - OSDMap omap; - osd->get_map(epoch, omap); - - vector acting; - omap.pg_to_acting_osds(get_pgid(), acting); - - for (unsigned i=0; iosdmap->is_up(acting[i]) && // is up now - acting[i] != osd->whoami) // and is not me - prior_set.insert(acting[i]); - } - } - - dout(10) << "build_prior built " << prior_set << dendl; -} - -void PG::adjust_prior() -{ - assert(!prior_set.empty()); - - // raise last_epoch_started_any - epoch_t max = 0; - for (map::iterator it = peer_info.begin(); - it != peer_info.end(); - it++) { - if (it->second.last_epoch_started > max) - max = it->second.last_epoch_started; - } - - dout(10) << "adjust_prior last_epoch_started_any " - << last_epoch_started_any << " -> " << max << dendl; - assert(max > last_epoch_started_any); - last_epoch_started_any = max; - - // rebuild prior set - build_prior(); -} - - -void PG::clear_primary_state() -{ - dout(10) << "clear_primary_state" << dendl; - - // clear peering state - have_master_log = false; - prior_set.clear(); - stray_set.clear(); - uptodate_set.clear(); - peer_info_requested.clear(); - peer_log_requested.clear(); - peer_info.clear(); - peer_missing.clear(); - - stat_object_temp_rd.clear(); - - last_epoch_started_any = info.last_epoch_started; -} - -void PG::peer(ObjectStore::Transaction& t, - map< int, map >& query_map, - map *activator_map) -{ - dout(10) << "peer. acting is " << acting - << ", prior_set is " << prior_set << dendl; - - - /** GET ALL PG::Info *********/ - - // -- query info from everyone in prior_set. 
- bool missing_info = false; - for (set::iterator it = prior_set.begin(); - it != prior_set.end(); - it++) { - if (peer_info.count(*it)) { - dout(10) << " have info from osd" << *it - << ": " << peer_info[*it] - << dendl; - continue; - } - missing_info = true; - - if (peer_info_requested.count(*it)) { - dout(10) << " waiting for osd" << *it << dendl; - continue; - } - - dout(10) << " querying info from osd" << *it << dendl; - query_map[*it][info.pgid] = Query(Query::INFO, info.history); - peer_info_requested.insert(*it); - } - if (missing_info) return; - - - // -- ok, we have all (prior_set) info. (and maybe others.) - - // did we crash? - dout(10) << " last_epoch_started_any " << last_epoch_started_any << dendl; - if (last_epoch_started_any) { - OSDMap omap; - osd->get_map(last_epoch_started_any, omap); - - // start with the last active set of replicas - set last_started; - vector acting; - bool cleanly_down = true; - omap.pg_to_acting_osds(get_pgid(), acting); - for (unsigned i=0; iosdmap->get_epoch(); - e++) { - OSDMap omap; - osd->get_map(e, omap); - - set still_up; - - for (set::iterator i = last_started.begin(); - i != last_started.end(); - i++) { - //dout(10) << " down in epoch " << e << " is " << omap.get_down_osds() << dendl; - if (omap.is_up(*i)) - still_up.insert(*i); - else if (!omap.is_down_clean(*i)) - cleanly_down = false; - } - - last_started.swap(still_up); - //dout(10) << " still active as of epoch " << e << ": " << last_started << dendl; - } - - if (last_started.empty()) { - if (cleanly_down) { - dout(10) << " cleanly stopped since epoch " << last_epoch_started_any << dendl; - } else { - dout(10) << " crashed since epoch " << last_epoch_started_any << dendl; - state_set(STATE_CRASHED); - } - } else { - dout(10) << " still active from last started: " << last_started << dendl; - } - } else if (osd->osdmap->post_mkfs()) { - dout(10) << " crashed since epoch " << last_epoch_started_any << dendl; - state_set(STATE_CRASHED); - } - - dout(10) << " 
peers_complete_thru " << peers_complete_thru << dendl; - - - - - /** CREATE THE MASTER PG::Log *********/ - - // who (of all priors and active) has the latest PG version? - eversion_t newest_update = info.last_update; - int newest_update_osd = osd->whoami; - - oldest_update = info.last_update; // only of acting (current) osd set. - peers_complete_thru = info.last_complete; - - for (map::iterator it = peer_info.begin(); - it != peer_info.end(); - it++) { - if (it->second.last_update > newest_update) { - newest_update = it->second.last_update; - newest_update_osd = it->first; - } - if (is_acting(it->first)) { - if (it->second.last_update < oldest_update) - oldest_update = it->second.last_update; - if (it->second.last_complete < peers_complete_thru) - peers_complete_thru = it->second.last_complete; - } - } - - // gather log(+missing) from that person! - if (newest_update_osd != osd->whoami) { - if (peer_log_requested.count(newest_update_osd) || - peer_summary_requested.count(newest_update_osd)) { - dout(10) << " newest update on osd" << newest_update_osd - << " v " << newest_update - << ", already queried" - << dendl; - } else { - // we'd like it back to oldest_update, but will settle for log_bottom - eversion_t since = MAX(peer_info[newest_update_osd].log_bottom, - oldest_update); - if (peer_info[newest_update_osd].log_bottom < log.top) { - dout(10) << " newest update on osd" << newest_update_osd - << " v " << newest_update - << ", querying since " << since - << dendl; - query_map[newest_update_osd][info.pgid] = Query(Query::LOG, log.top, since, info.history); - peer_log_requested.insert(newest_update_osd); - } else { - dout(10) << " newest update on osd" << newest_update_osd - << " v " << newest_update - << ", querying entire summary/backlog" - << dendl; - assert((peer_info[newest_update_osd].last_complete >= - peer_info[newest_update_osd].log_bottom) || - peer_info[newest_update_osd].log_backlog); // or else we're in trouble. 
- query_map[newest_update_osd][info.pgid] = Query(Query::BACKLOG, info.history); - peer_summary_requested.insert(newest_update_osd); - } - } - return; - } else { - dout(10) << " newest_update " << info.last_update << " (me)" << dendl; - } - - dout(10) << " oldest_update " << oldest_update << dendl; - - have_master_log = true; - - - // -- do i need to generate backlog for any of my peers? - if (oldest_update < log.bottom && !log.backlog) { - dout(10) << "generating backlog for some peers, bottom " - << log.bottom << " > " << oldest_update - << dendl; - generate_backlog(); - } - - - /** COLLECT MISSING+LOG FROM PEERS **********/ - /* - we also detect divergent replicas here by pulling the full log - from everyone. - */ - - // gather missing from peers - for (unsigned i=1; i 0) { - dout(10) << "there are still " << missing.num_lost() << " lost objects" << dendl; - - // ***** - // FIXME: i don't think this actually accomplishes anything! - // ***** - - // ok, let's get more summaries! - bool waiting = false; - for (map::iterator it = peer_info.begin(); - it != peer_info.end(); - it++) { - int peer = it->first; - - if (peer_summary_requested.count(peer)) { - dout(10) << " already requested summary/backlog from osd" << peer << dendl; - waiting = true; - continue; - } - - dout(10) << " requesting summary/backlog from osd" << peer << dendl; - query_map[peer][info.pgid] = Query(Query::BACKLOG, info.history); - peer_summary_requested.insert(peer); - waiting = true; - } - - if (!waiting) { - dout(10) << missing.num_lost() << " objects are still lost, waiting+hoping for a notify from someone else!" << dendl; - } - return; - } - - // sanity check - assert(missing.num_lost() == 0); - assert(info.last_complete >= log.bottom || log.backlog); - - - // -- crash recovery? 
- if (is_crashed()) { - dout(10) << "crashed, allowing op replay for " << g_conf.osd_replay_window << dendl; - state_set(STATE_REPLAY); - osd->timer.add_event_after(g_conf.osd_replay_window, - new OSD::C_Activate(osd, info.pgid, osd->osdmap->get_epoch())); - } - else if (!is_active()) { - // -- ok, activate! - activate(t, activator_map); - } -} - - -void PG::activate(ObjectStore::Transaction& t, - map *activator_map) -{ - assert(!is_active()); - - // twiddle pg state - state_set(STATE_ACTIVE); - state_clear(STATE_STRAY); - if (is_crashed()) { - //assert(is_replay()); // HELP.. not on replica? - state_clear(STATE_CRASHED); - state_clear(STATE_REPLAY); - } - info.last_epoch_started = osd->osdmap->get_epoch(); - - if (role == 0) { // primary state - peers_complete_thru = 0; // we don't know (yet)! - } - - assert(info.last_complete >= log.bottom || log.backlog); - - // write pg info - t.collection_setattr(info.pgid, "info", (char*)&info, sizeof(info)); - - // write log - write_log(t); - - // clean up stray objects - clean_up_local(t); - - // init complete pointer - if (info.last_complete == info.last_update) { - dout(10) << "activate - complete" << dendl; - log.complete_to == log.log.end(); - log.requested_to = log.log.end(); - } - else if (true) { - dout(10) << "activate - not complete, " << missing << dendl; - - // init complete_to - log.complete_to = log.log.begin(); - while (log.complete_to->version < info.last_complete) { - log.complete_to++; - assert(log.complete_to != log.log.end()); - } - - if (is_primary()) { - // start recovery - dout(10) << "activate - starting recovery" << dendl; - log.requested_to = log.complete_to; - do_recovery(); - } - } else { - dout(10) << "activate - not complete, " << missing << dendl; - } - - // if primary.. - if (role == 0 && - (!g_conf.osd_hack_fast_startup || osd->osdmap->post_mkfs())) { - // who is clean? 
- uptodate_set.clear(); - if (info.is_uptodate()) - uptodate_set.insert(osd->whoami); - - // start up replicas - for (unsigned i=1; icount(peer) == 0) - (*activator_map)[peer] = new MOSDPGActivateSet(osd->osdmap->get_epoch()); - (*activator_map)[peer]->pg_info.push_back(info); - } else { - dout(10) << "activate - peer osd" << peer << " is up to date, but sending pg_log anyway" << dendl; - m = new MOSDPGLog(osd->osdmap->get_epoch(), info); - } - } - else { - m = new MOSDPGLog(osd->osdmap->get_epoch(), info); - if (peer_info[peer].last_update < log.bottom) { - // summary/backlog - assert(log.backlog); - m->log = log; - } else { - // incremental log - assert(peer_info[peer].last_update < info.last_update); - m->log.copy_after(log, peer_info[peer].last_update); - } - } - - // update local version of peer's missing list! - if (m) { - eversion_t plu = peer_info[peer].last_update; - Missing& pm = peer_missing[peer]; - for (list::iterator p = m->log.log.begin(); - p != m->log.log.end(); - p++) - if (p->version > plu) - pm.add(p->oid, p->version); - } - - if (m) { - dout(10) << "activate sending " << m->log << " " << m->missing - << " to osd" << peer << dendl; - //m->log.print(cout); - osd->messenger->send_message(m, osd->osdmap->get_inst(peer)); - } - - // update our missing - if (peer_missing[peer].num_missing() == 0) { - dout(10) << "activate peer osd" << peer << " already uptodate, " << peer_info[peer] << dendl; - assert(peer_info[peer].is_uptodate()); - uptodate_set.insert(peer); - } else { - dout(10) << "activate peer osd" << peer << " " << peer_info[peer] - << " missing " << peer_missing[peer] << dendl; - } - - } - - // discard unneeded peering state - //peer_log.clear(); // actually, do this carefully, in case peer() is called again. - - // all clean? 
- if (is_all_uptodate()) - finish_recovery(); - else { - dout(10) << "activate not all replicas are uptodate, starting recovery" << dendl; - do_recovery(); - } - } - - - // replay (queue them _before_ other waiting ops!) - if (!replay_queue.empty()) { - eversion_t c = info.last_update; - list replay; - for (map::iterator p = replay_queue.begin(); - p != replay_queue.end(); - p++) { - if (p->first <= info.last_update) { - dout(10) << "activate will WRNOOP " << p->first << " " << *p->second << dendl; - replay.push_back(p->second); - continue; - } - if (p->first.version != c.version+1) { - dout(10) << "activate replay " << p->first - << " skipping " << c.version+1 - p->first.version - << " ops" - << dendl; - } - dout(10) << "activate replay " << p->first << " " << *p->second << dendl; - replay.push_back(p->second); - c = p->first; - } - replay_queue.clear(); - osd->take_waiters(replay); - } - - if (is_primary()) - update_stats(); // update stats - - // waiters - osd->take_waiters(waiting_for_active); -} - - -void PG::finish_recovery() -{ - dout(10) << "finish_recovery" << dendl; - - state_set(PG::STATE_CLEAN); - purge_strays(); - update_stats(); -} - - - -void PG::update_stats() -{ - dout(15) << "update_stats" << dendl; - assert(is_primary()); - - // update our stat summary - pg_stats_lock.Lock(); - pg_stats.reported = info.last_update; - pg_stats.state = state; - pg_stats.size = stat_size; - pg_stats.num_blocks = stat_num_blocks; - pg_stats_lock.Unlock(); - - // put in osd stat_queue - osd->pg_stat_queue_lock.Lock(); - osd->pg_stat_queue.insert(info.pgid); - osd->pg_stat_queue_lock.Unlock(); -} - - -void PG::write_log(ObjectStore::Transaction& t) -{ - dout(10) << "write_log" << dendl; - - // assemble buffer - bufferlist bl; - - // build buffer - ondisklog.bottom = 0; - ondisklog.block_map.clear(); - for (list::iterator p = log.log.begin(); - p != log.log.end(); - p++) { - if (bl.length() % 4096 == 0) - ondisklog.block_map[bl.length()] = p->version; - 
bl.append((char*)&(*p), sizeof(*p)); - if (g_conf.osd_pad_pg_log) { // pad to 4k, until i fix ebofs reallocation crap. FIXME. - bufferptr bp(4096 - sizeof(*p)); - bl.push_back(bp); - } - } - ondisklog.top = bl.length(); - - // write it - t.remove( info.pgid.to_object() ); - t.write( info.pgid.to_object() , 0, bl.length(), bl); - t.collection_setattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom)); - t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); - - t.collection_setattr(info.pgid, "info", &info, sizeof(info)); -} - -void PG::trim_ondisklog_to(ObjectStore::Transaction& t, eversion_t v) -{ - dout(15) << " trim_ondisk_log_to v " << v << dendl; - - map::iterator p = ondisklog.block_map.begin(); - while (p != ondisklog.block_map.end()) { - dout(15) << " " << p->first << " -> " << p->second << dendl; - p++; - if (p == ondisklog.block_map.end() || - p->second > v) { // too far! - p--; // back up - break; - } - } - dout(15) << " * " << p->first << " -> " << p->second << dendl; - if (p == ondisklog.block_map.begin()) - return; // can't trim anything! - - // we can trim! 
- off_t trim = p->first; - dout(10) << " trimming ondisklog to [" << ondisklog.bottom << "," << ondisklog.top << ")" << dendl; - - assert(trim >= ondisklog.bottom); - ondisklog.bottom = trim; - - // adjust block_map - while (p != ondisklog.block_map.begin()) - ondisklog.block_map.erase(ondisklog.block_map.begin()); - - t.collection_setattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom)); - t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); -} - - -void PG::append_log(ObjectStore::Transaction& t, PG::Log::Entry& logentry, - eversion_t trim_to) -{ - dout(10) << "append_log " << ondisklog.top << " " << logentry << dendl; - - // write entry on disk - bufferlist bl; - bl.append( (char*)&logentry, sizeof(logentry) ); - if (g_conf.osd_pad_pg_log) { // pad to 4k, until i fix ebofs reallocation crap. FIXME. - bufferptr bp(4096 - sizeof(logentry)); - bl.push_back(bp); - } - t.write( info.pgid.to_object(), ondisklog.top, bl.length(), bl ); - - // update block map? - if (ondisklog.top % 4096 == 0) - ondisklog.block_map[ondisklog.top] = logentry.version; - - ondisklog.top += bl.length(); - t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); - - // trim? 
- if (trim_to > log.bottom) { - dout(10) << " trimming " << log << " to " << trim_to << dendl; - log.trim(t, trim_to); - info.log_bottom = log.bottom; - info.log_backlog = log.backlog; - trim_ondisklog_to(t, trim_to); - } - dout(10) << " ondisklog [" << ondisklog.bottom << "," << ondisklog.top << ")" << dendl; -} - -void PG::read_log(ObjectStore *store) -{ - int r; - // load bounds - ondisklog.bottom = ondisklog.top = 0; - r = store->collection_getattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom)); - assert(r == sizeof(ondisklog.bottom)); - r = store->collection_getattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); - assert(r == sizeof(ondisklog.top)); - - dout(10) << "read_log [" << ondisklog.bottom << "," << ondisklog.top << ")" << dendl; - - log.backlog = info.log_backlog; - log.bottom = info.log_bottom; - - if (ondisklog.top > 0) { - // read - bufferlist bl; - store->read(info.pgid.to_object(), ondisklog.bottom, ondisklog.top-ondisklog.bottom, bl); - if (bl.length() < ondisklog.top-ondisklog.bottom) { - dout(0) << "read_log data doesn't match attrs" << dendl; - assert(0); - } - - PG::Log::Entry e; - off_t pos = ondisklog.bottom; - assert(log.log.empty()); - while (pos < ondisklog.top) { - bl.copy(pos-ondisklog.bottom, sizeof(e), (char*)&e); - dout(10) << "read_log " << pos << " " << e << dendl; - - if (e.version > log.bottom || log.backlog) { // ignore items below log.bottom - if (pos % 4096 == 0) - ondisklog.block_map[pos] = e.version; - log.log.push_back(e); - } else { - dout(10) << "read_log ignoring entry at " << pos << dendl; - } - - if (g_conf.osd_pad_pg_log) // pad to 4k, until i fix ebofs reallocation crap. FIXME. 
- pos += 4096; - else - pos += sizeof(e); - } - } - log.top = info.last_update; - log.index(); - - // build missing - set did; - for (list::reverse_iterator i = log.log.rbegin(); - i != log.log.rend(); - i++) { - if (i->version <= info.last_complete) break; - if (did.count(i->oid)) continue; - did.insert(i->oid); - - if (i->is_delete()) continue; - - eversion_t v; - int r = osd->store->getattr(i->oid, "version", &v, sizeof(v)); - if (r < 0 || v < i->version) - missing.add(i->oid, i->version); - } -} - - - - -// ============================== -// Object locking - -// -// If the target object of the operation op is locked for writing by another client, the function puts op to the waiting queue waiting_for_wr_unlock -// returns true if object was locked, otherwise returns false -// -bool PG::block_if_wrlocked(MOSDOp* op) -{ - object_t oid = op->get_oid(); - - entity_name_t source; - int len = osd->store->getattr(oid, "wrlock", &source, sizeof(entity_name_t)); - //dout(0) << "getattr returns " << len << " on " << oid << dendl; - - if (len == sizeof(source) && - source != op->get_client()) { - //the object is locked for writing by someone else -- add the op to the waiting queue - waiting_for_wr_unlock[oid].push_back(op); - return true; - } - - return false; //the object wasn't locked, so the operation can be handled right away -} - - - - -// ======================= -// revisions - - -/* -int OSD::list_missing_revs(object_t oid, set& revs, PG *pg) -{ - int c = 0; - oid.rev = 0; - - map::iterator p = pg->missing.missing.lower_bound(oid); - if (p == pg->missing.missing.end()) - return 0; // clearly not - - while (p->first.ino == oid.ino && - p->first.bno == oid.bno) { - revs.insert(p->first); - c++; - } - return c; -}*/ - -bool PG::pick_missing_object_rev(object_t& oid) -{ - map::iterator p = missing.missing.upper_bound(oid); - if (p == missing.missing.end()) - return false; // clearly no candidate - - if (p->first.ino == oid.ino && p->first.bno == oid.bno) { - oid = 
p->first; // yes! it's an upper bound revision for me. - return true; - } - return false; -} - -bool PG::pick_object_rev(object_t& oid) -{ - object_t t = oid; - - if (!osd->store->pick_object_revision_lt(t)) - return false; // we have no revisions of this object! - - objectrev_t crev; - int r = osd->store->getattr(t, "crev", &crev, sizeof(crev)); - assert(r >= 0); - if (crev <= oid.rev) { - dout(10) << "pick_object_rev choosing " << t << " crev " << crev << " for " << oid << dendl; - oid = t; - return true; - } - - return false; -} - - - - - diff --git a/branches/sage/mds/osd/PG.h b/branches/sage/mds/osd/PG.h deleted file mode 100644 index 7c68ecec2b6e4..0000000000000 --- a/branches/sage/mds/osd/PG.h +++ /dev/null @@ -1,753 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __PG_H -#define __PG_H - - -#include "include/types.h" -#include "osd_types.h" -#include "include/buffer.h" - -#include "OSDMap.h" -#include "ObjectStore.h" -#include "msg/Messenger.h" - -#include "common/DecayCounter.h" - -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - - -class OSD; -class MOSDOp; -class MOSDOpReply; -class MOSDPGActivateSet; - -/** PG - Replica Placement Group - * - */ - -class PG { -public: - - /* - * PG::Info - summary of PG statistics. - * - * some notes: - * - last_complete implies we have all objects that existed as of that - * stamp, OR a newer object, OR have already applied a later delete. - * - if last_complete >= log.bottom, then we know pg contents thru log.top. - * otherwise, we have no idea what the pg is supposed to contain. 
- */ - struct Info { - pg_t pgid; - eversion_t last_update; // last object version applied to store. - eversion_t last_complete; // last version pg was complete through. - - eversion_t log_bottom; // oldest log entry. - bool log_backlog; // do we store a complete log? - - epoch_t last_epoch_started; // last epoch started. - epoch_t last_epoch_finished; // last epoch finished. - - struct History { - epoch_t same_since; // same acting set since - epoch_t same_primary_since; // same primary at least back through this epoch. - epoch_t same_acker_since; // same acker at least back through this epoch. - History() : same_since(0), same_primary_since(0), same_acker_since(0) {} - } history; - - Info(pg_t p=0) : pgid(p), - log_backlog(false), - last_epoch_started(0), last_epoch_finished(0) {} - bool is_uptodate() const { return last_update == last_complete; } - bool is_empty() const { return last_update.version == 0; } - }; - - - /** - * Query - used to ask a peer for information about a pg. - * - * note: if version=0, type=LOG, then we just provide our full log. - * only if type=BACKLOG do we generate a backlog and provide that too. - */ - struct Query { - const static int INFO = 0; - const static int LOG = 1; - const static int BACKLOG = 2; - const static int FULLLOG = 3; - - int type; - eversion_t split, floor; - Info::History history; - - Query() : type(-1) {} - Query(int t, Info::History& h) : - type(t), history(h) { assert(t != LOG); } - Query(int t, eversion_t s, eversion_t f, Info::History& h) : - type(t), split(s), floor(f), history(h) { assert(t == LOG); } - }; - - - /* - * Missing - summary of missing objects. - * kept in memory, as a supplement to Log. - * also used to pass missing info in messages. - */ - class Missing { - public: - map missing; // oid -> v - map rmissing; // v -> oid - - map loc; // where i think i can get them. 
- - int num_lost() const { return missing.size() - loc.size(); } - int num_missing() const { return missing.size(); } - - bool is_missing(object_t oid) { - return missing.count(oid); - } - bool is_missing(object_t oid, eversion_t v) { - return missing.count(oid) && missing[oid] <= v; - } - void add(object_t oid) { - eversion_t z; - add(oid,z); - } - void add(object_t oid, eversion_t v) { - if (missing.count(oid)) { - if (missing[oid] > v) return; // already missing newer. - rmissing.erase(missing[oid]); - } - missing[oid] = v; - rmissing[v] = oid; - } - void rm(object_t oid, eversion_t when) { - if (missing.count(oid) && missing[oid] < when) { - rmissing.erase(missing[oid]); - missing.erase(oid); - loc.erase(oid); - } - } - void got(object_t oid, eversion_t v) { - assert(missing.count(oid)); - assert(missing[oid] <= v); - loc.erase(oid); - rmissing.erase(missing[oid]); - missing.erase(oid); - } - void got(object_t oid) { - assert(missing.count(oid)); - loc.erase(oid); - rmissing.erase(missing[oid]); - missing.erase(oid); - } - - void _encode(bufferlist& blist) { - ::_encode(missing, blist); - ::_encode(loc, blist); - } - void _decode(bufferlist& blist, int& off) { - ::_decode(missing, blist, off); - ::_decode(loc, blist, off); - - for (map::iterator it = missing.begin(); - it != missing.end(); - it++) - rmissing[it->second] = it->first; - } - }; - - - /* - * Log - incremental log of recent pg changes. - * also, serves as a recovery queue. - * - * when backlog is true, - * objects with versions <= bottom are in log. - * we do not have any deletion info before that time, however. - * log is a "summary" in that it contains all objects in the PG. - */ - class Log { - public: - /** top, bottom - * top - newest entry (update|delete) - * bottom - entry previous to oldest (update|delete) for which we have - * complete negative information. - * i.e. we can infer pg contents for any store whose last_update >= bottom. 
- */ - eversion_t top; // newest entry (update|delete) - eversion_t bottom; // version prior to oldest (update|delete) - - /** backlog - true if log is a complete summary of pg contents. - * updated will include all items in pg, but deleted will not include - * negative entries for items deleted prior to 'bottom'. - */ - bool backlog; - - /** Entry - * mapped from the eversion_t, so don't include that. - */ - class Entry { - public: - const static int LOST = 0; - const static int MODIFY = 1; - const static int CLONE = 2; - const static int DELETE = 3; - - int op; // write, zero, trunc, remove - object_t oid; - eversion_t version; - - osdreqid_t reqid; // caller+tid to uniquely identify request - - Entry() : op(0) {} - Entry(int _op, object_t _oid, const eversion_t& v, - const osdreqid_t& rid) : - op(_op), oid(_oid), version(v), reqid(rid) {} - - bool is_delete() const { return op == DELETE; } - bool is_clone() const { return op == CLONE; } - bool is_modify() const { return op == MODIFY; } - bool is_update() const { return is_clone() || is_modify(); } - }; - - list log; // the actual log. 
- - Log() : backlog(false) {} - - void clear() { - eversion_t z; - top = bottom = z; - backlog = false; - log.clear(); - } - bool empty() const { - return top.version == 0 && top.epoch == 0; - } - - void _encode(bufferlist& blist) const { - blist.append((char*)&top, sizeof(top)); - blist.append((char*)&bottom, sizeof(bottom)); - blist.append((char*)&backlog, sizeof(backlog)); - ::_encode(log, blist); - } - void _decode(bufferlist& blist, int& off) { - blist.copy(off, sizeof(top), (char*)&top); - off += sizeof(top); - blist.copy(off, sizeof(bottom), (char*)&bottom); - off += sizeof(bottom); - blist.copy(off, sizeof(backlog), (char*)&backlog); - off += sizeof(backlog); - - ::_decode(log, blist, off); - } - - void copy_after(const Log &other, eversion_t v); - bool copy_after_unless_divergent(const Log &other, eversion_t split, eversion_t floor); - void copy_non_backlog(const Log &other); - ostream& print(ostream& out) const; - }; - - /** - * IndexLog - adds in-memory index of the log, by oid. - * plus some methods to manipulate it all. - */ - class IndexedLog : public Log { - public: - hash_map objects; // ptrs into log. be careful! 
- hash_set caller_ops; - - // recovery pointers - list::iterator requested_to; // not inclusive of referenced item - list::iterator complete_to; // not inclusive of referenced item - - /****/ - IndexedLog() {} - - void clear() { - assert(0); - unindex(); - Log::clear(); - } - - bool logged_object(object_t oid) { - return objects.count(oid); - } - bool logged_req(const osdreqid_t &r) { - return caller_ops.count(r); - } - - void index() { - objects.clear(); - caller_ops.clear(); - for (list::iterator i = log.begin(); - i != log.end(); - i++) { - objects[i->oid] = &(*i); - caller_ops.insert(i->reqid); - } - } - - void index(Entry& e) { - if (objects.count(e.oid) == 0 || - objects[e.oid]->version < e.version) - objects[e.oid] = &e; - caller_ops.insert(e.reqid); - } - void unindex() { - objects.clear(); - caller_ops.clear(); - } - void unindex(Entry& e) { - // NOTE: this only works if we remove from the _bottom_ of the log! - assert(objects.count(e.oid)); - if (objects[e.oid]->version == e.version) - objects.erase(e.oid); - caller_ops.erase(e.reqid); - } - - - // accessors - Entry *is_updated(object_t oid) { - if (objects.count(oid) && objects[oid]->is_update()) return objects[oid]; - return 0; - } - Entry *is_deleted(object_t oid) { - if (objects.count(oid) && objects[oid]->is_delete()) return objects[oid]; - return 0; - } - - // actors - void add(Entry& e) { - // add to log - log.push_back(e); - assert(e.version > top); - assert(top.version == 0 || e.version.version > top.version); - top = e.version; - - // to our index - objects[e.oid] = &(log.back()); - caller_ops.insert(e.reqid); - } - - void trim(ObjectStore::Transaction &t, eversion_t s); - void trim_write_ahead(eversion_t last_update); - }; - - - /** - * OndiskLog - some info about how we store the log on disk. - */ - class OndiskLog { - public: - // ok - off_t bottom; // first byte of log. - off_t top; // byte following end of log. 
- map block_map; // block -> first stamp logged there - - OndiskLog() : bottom(0), top(0) {} - - bool trim_to(eversion_t v, ObjectStore::Transaction& t); - }; - - - /*** PG ****/ -public: - // any - static const int STATE_ACTIVE = 1; // i am active. (primary: replicas too) - - // primary - static const int STATE_CLEAN = 2; // peers are complete, clean of stray replicas. - static const int STATE_CRASHED = 4; // all replicas went down. - static const int STATE_REPLAY = 8; // crashed, waiting for replay - - // non-primary - static const int STATE_STRAY = 16; // i must notify the primary i exist. - - static std::string get_state_string(int state) { - std::string st; - if (state & STATE_ACTIVE) st += "active+"; - if (state & STATE_CLEAN) st += "clean+"; - if (state & STATE_CRASHED) st += "crashed+"; - if (state & STATE_REPLAY) st += "replay+"; - if (state & STATE_STRAY) st += "stray+"; - if (!st.length()) - st = "inactive"; - else - st.resize(st.length()-1); - return st; - } - -protected: - OSD *osd; - - /** locking and reference counting. - * I destroy myself when the reference count hits zero. - * lock() should be called before doing anything. - * get() should be called on pointer copy (to another thread, etc.). - * put() should be called on destruction of some previously copied pointer. - * put_unlock() when done with the current pointer (_most common_). - */ - Mutex _lock; - int ref; - bool deleted; - -public: - void lock() { - //cout << this << " " << info.pgid << " lock" << endl; - _lock.Lock(); - } - void unlock() { - //cout << this << " " << info.pgid << " unlock" << endl; - _lock.Unlock(); - } - void get() { - //cout << this << " " << info.pgid << " get " << ref << endl; - assert(_lock.is_locked()); - ++ref; - } - void put() { - //cout << this << " " << info.pgid << " put " << ref << endl; - assert(_lock.is_locked()); - --ref; - assert(ref > 0); // last put must be a put_unlock. 
- } - void put_unlock() { - //cout << this << " " << info.pgid << " put_unlock " << ref << endl; - assert(_lock.is_locked()); - --ref; - _lock.Unlock(); - if (ref == 0) delete this; - } - - - list op_queue; // op queue - - - void mark_deleted() { deleted = true; } - bool is_deleted() { return deleted; } - -public: - // pg state - Info info; - IndexedLog log; - OndiskLog ondisklog; - Missing missing; - utime_t last_heartbeat; // - -protected: - int role; // 0 = primary, 1 = replica, -1=none. - int state; // see bit defns above - - // primary state - public: - vector acting; - epoch_t last_epoch_started_any; - eversion_t last_complete_commit; - - // [primary only] content recovery state - eversion_t peers_complete_thru; - bool have_master_log; - protected: - set prior_set; // current+prior OSDs, as defined by last_epoch_started_any. - set stray_set; // non-acting osds that have PG data. - set uptodate_set; // current OSDs that are uptodate - eversion_t oldest_update; // lowest (valid) last_update in active set - map peer_info; // info from peers (stray or prior) - set peer_info_requested; - map peer_missing; - set peer_log_requested; // logs i've requested (and start stamps) - set peer_summary_requested; - friend class OSD; - - - // pg waiters - list waiting_for_active; - hash_map > waiting_for_missing_object; - map replay_queue; - - hash_map > waiting_for_wr_unlock; - - bool block_if_wrlocked(MOSDOp* op); - - - // recovery - map objects_pulling; // which objects are currently being pulled - - - - // stats - off_t stat_size; - off_t stat_num_blocks; - - hash_map stat_object_temp_rd; - - Mutex pg_stats_lock; - pg_stat_t pg_stats; - - void update_stats(); - -public: - void clear_primary_state(); - - public: - bool is_acting(int osd) const { - for (unsigned i=0; i peers_complete_thru) { - peers_complete_thru = t; - return true; - } - return false; - } - - void proc_replica_log(Log &olog, Missing& omissing, int from); - void merge_log(Log &olog, Missing& omissing, int 
from); - void proc_missing(Log &olog, Missing &omissing, int fromosd); - - void generate_backlog(); - void drop_backlog(); - - void trim_write_ahead(); - - void peer(ObjectStore::Transaction& t, - map< int, map >& query_map, - map *activator_map=0); - void activate(ObjectStore::Transaction& t, - map *activator_map=0); - - virtual void clean_up_local(ObjectStore::Transaction& t) = 0; - - virtual void cancel_recovery() = 0; - virtual bool do_recovery() = 0; - virtual void purge_strays() = 0; - - void finish_recovery(); - - off_t get_log_write_pos() { - return 0; - } - - friend class C_OSD_RepModify_Commit; - - public: - PG(OSD *o, pg_t p) : - osd(o), - ref(0), deleted(false), - info(p), - role(0), - state(0), - last_epoch_started_any(0), - last_complete_commit(0), - peers_complete_thru(0), - have_master_log(true), - stat_size(0), stat_num_blocks(0) - { } - virtual ~PG() { } - - pg_t get_pgid() const { return info.pgid; } - int get_nrep() const { return acting.size(); } - - int get_primary() { return acting.empty() ? -1:acting[0]; } - //int get_tail() { return acting.empty() ? -1:acting[ acting.size()-1 ]; } - //int get_acker() { return g_conf.osd_rep == OSD_REP_PRIMARY ? 
get_primary():get_tail(); } - int get_acker() { - if (g_conf.osd_rep == OSD_REP_PRIMARY || - acting.size() <= 1) - return get_primary(); - return acting[1]; - } - - int get_role() const { return role; } - void set_role(int r) { role = r; } - - bool is_primary() const { return role == PG_ROLE_HEAD; } - bool is_acker() const { - if (g_conf.osd_rep == OSD_REP_PRIMARY) - return is_primary(); - else - return role == PG_ROLE_ACKER; - } - bool is_head() const { return role == PG_ROLE_HEAD; } - bool is_middle() const { return role == PG_ROLE_MIDDLE; } - bool is_residual() const { return role == PG_ROLE_STRAY; } - - //int get_state() const { return state; } - bool state_test(int m) const { return (state & m) != 0; } - void state_set(int m) { state |= m; } - void state_clear(int m) { state &= ~m; } - - bool is_complete() const { return info.last_complete == info.last_update; } - - bool is_active() const { return state_test(STATE_ACTIVE); } - bool is_crashed() const { return state_test(STATE_CRASHED); } - bool is_replay() const { return state_test(STATE_REPLAY); } - //bool is_complete() { return state_test(STATE_COMPLETE); } - bool is_clean() const { return state_test(STATE_CLEAN); } - bool is_stray() const { return state_test(STATE_STRAY); } - - bool is_empty() const { return info.last_update == 0; } - - int num_active_ops() const { - return objects_pulling.size(); - } - - // pg on-disk state - void write_log(ObjectStore::Transaction& t); - void append_log(ObjectStore::Transaction& t, - PG::Log::Entry& logentry, - eversion_t trim_to); - void read_log(ObjectStore *store); - void trim_ondisklog_to(ObjectStore::Transaction& t, eversion_t v); - - - bool is_dup(osdreqid_t rid) { - return log.logged_req(rid); - } - - - bool pick_missing_object_rev(object_t& oid); - bool pick_object_rev(object_t& oid); - - - - // abstract bits - virtual bool preprocess_op(MOSDOp *op, utime_t now) { return false; } - virtual void do_op(MOSDOp *op) = 0; - virtual void do_op_reply(MOSDOpReply *op) = 
0; - - virtual bool same_for_read_since(epoch_t e) = 0; - virtual bool same_for_modify_since(epoch_t e) = 0; - virtual bool same_for_rep_modify_since(epoch_t e) = 0; - - virtual bool is_missing_object(object_t oid) = 0; - virtual void wait_for_missing_object(object_t oid, MOSDOp *op) = 0; - - virtual void note_failed_osd(int osd) = 0; - - virtual void on_acker_change() = 0; - virtual void on_role_change() = 0; -}; - - - -inline ostream& operator<<(ostream& out, const PG::Info::History& h) -{ - return out << h.same_since << "/" << h.same_primary_since << "/" << h.same_acker_since; -} - -inline ostream& operator<<(ostream& out, const PG::Info& pgi) -{ - out << pgi.pgid << "("; - if (pgi.is_empty()) - out << " empty"; - else - out << " v " << pgi.last_update << "/" << pgi.last_complete - << " (" << pgi.log_bottom << "," << pgi.last_update << "]" - << (pgi.log_backlog ? "+backlog":""); - out << " e " << pgi.last_epoch_started << "/" << pgi.last_epoch_finished - << " " << pgi.history - << ")"; - return out; -} - -inline ostream& operator<<(ostream& out, const PG::Log::Entry& e) -{ - return out << " " << e.version - << (e.is_delete() ? " - ": - (e.is_clone() ? " c ": - (e.is_modify() ? " m ": - " ? "))) - << e.oid << " by " << e.reqid; -} - -inline ostream& operator<<(ostream& out, const PG::Log& log) -{ - out << "log(" << log.bottom << "," << log.top << "]"; - if (log.backlog) out << "+backlog"; - return out; -} - -inline ostream& operator<<(ostream& out, const PG::Missing& missing) -{ - out << "missing(" << missing.num_missing(); - if (missing.num_lost()) out << ", " << missing.num_lost() << " lost"; - out << ")"; - return out; -} - -inline ostream& operator<<(ostream& out, const PG& pg) -{ - out << "pg[" << pg.info - << " r=" << pg.get_role(); - - if (pg.log.bottom != pg.info.log_bottom) - out << " (info mismatch, " << pg.log << ")"; - - if (pg.log.log.empty()) { - // shoudl it be? 
- if (pg.log.top.version - pg.log.bottom.version != 0) { - out << " (log bound mismatch, empty)"; - } - } else { - if (((pg.log.log.begin()->version.version - 1 != pg.log.bottom.version) && - !pg.log.backlog) || - (pg.log.log.rbegin()->version.version != pg.log.top.version)) { - out << " (log bound mismatch, actual=[" - << pg.log.log.begin()->version << "," - << pg.log.log.rbegin()->version << "] len=" << pg.log.log.size() << ")"; - } - } - - if (pg.get_role() == 0) out << " pct " << pg.peers_complete_thru; - if (!pg.have_master_log) out << " !hml"; - if (pg.is_active()) out << " active"; - if (pg.is_crashed()) out << " crashed"; - if (pg.is_replay()) out << " replay"; - if (pg.is_clean()) out << " clean"; - if (pg.is_stray()) out << " stray"; - //out << " (" << pg.log.bottom << "," << pg.log.top << "]"; - if (pg.missing.num_missing()) out << " m=" << pg.missing.num_missing(); - if (pg.missing.num_lost()) out << " l=" << pg.missing.num_lost(); - out << "]"; - - - return out; -} - - - -#endif diff --git a/branches/sage/mds/osd/RAID4PG.cc b/branches/sage/mds/osd/RAID4PG.cc deleted file mode 100644 index 20cd6d8ab416b..0000000000000 --- a/branches/sage/mds/osd/RAID4PG.cc +++ /dev/null @@ -1,123 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "RAID4PG.h" -#include "OSD.h" - -#include "common/Logger.h" - -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" - -#include "messages/MOSDPGNotify.h" -#include "messages/MOSDPGRemove.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) *_dout << dbeginl << g_clock.now() << " osd" << osd->get_nodeid() << " " << (osd->osdmap ? 
osd->osdmap->get_epoch():0) << " " << *this << " " - -#include -#include - - - - -void RAID4PG::do_op(MOSDOp *op) -{ - - -} - - - -void RAID4PG::do_op_reply(MOSDOpReply *reply) -{ - -} - - - -// ----------------- -// pg changes - -bool RAID4PG::same_for_read_since(epoch_t e) -{ - return e >= info.history.same_since; // whole pg set same -} - -bool RAID4PG::same_for_modify_since(epoch_t e) -{ - return e >= info.history.same_since; // whole pg set same -} - -bool RAID4PG::same_for_rep_modify_since(epoch_t e) -{ - return e >= info.history.same_since; // whole pg set same -} - - -// ----------------- -// RECOVERY - -bool RAID4PG::is_missing_object(object_t oid) -{ - return false; -} - -void RAID4PG::wait_for_missing_object(object_t oid, MOSDOp *op) -{ - //assert(0); -} - -void RAID4PG::note_failed_osd(int o) -{ - dout(10) << "note_failed_osd osd" << o << dendl; - //assert(0); -} - -void RAID4PG::on_acker_change() -{ - dout(10) << "on_acker_change" << dendl; - //assert(0); -} - - -void RAID4PG::on_role_change() -{ - dout(10) << "on_role_change" << dendl; - //assert(0); -} - - -void RAID4PG::clean_up_local(ObjectStore::Transaction&) -{ -} - -void RAID4PG::cancel_recovery() -{ - //assert(0); -} - -bool RAID4PG::do_recovery() -{ - //assert(0); - return false; -} - -void RAID4PG::purge_strays() -{ - //assert(0); -} - - - diff --git a/branches/sage/mds/osd/RAID4PG.h b/branches/sage/mds/osd/RAID4PG.h deleted file mode 100644 index 98e4deab56895..0000000000000 --- a/branches/sage/mds/osd/RAID4PG.h +++ /dev/null @@ -1,74 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __RAID4PG_H -#define __RAID4PG_H - - -#include "PG.h" - -#include "messages/MOSDOp.h" - - -class RAID4PG : public PG { -public: - -protected: - - void prepare_log_transaction(ObjectStore::Transaction& t, - MOSDOp *op, eversion_t& version, - objectrev_t crev, objectrev_t rev, - eversion_t trim_to); - void prepare_op_transaction(ObjectStore::Transaction& t, - MOSDOp *op, eversion_t& version, - objectrev_t crev, objectrev_t rev); - - void op_stat(MOSDOp *op); - int op_read(MOSDOp *op); - void op_modify(MOSDOp *op); - void op_rep_modify(MOSDOp *op); - void op_push(MOSDOp *op); - void op_pull(MOSDOp *op); - - - -public: - RAID4PG(OSD *o, pg_t p) : PG(o,p) { } - - void do_op(MOSDOp *op); - void do_op_reply(MOSDOpReply *r); - - bool same_for_read_since(epoch_t e); - bool same_for_modify_since(epoch_t e); - bool same_for_rep_modify_since(epoch_t e); - - bool is_missing_object(object_t oid); - void wait_for_missing_object(object_t oid, MOSDOp *op); - - void note_failed_osd(int osd); - - void on_acker_change(); - void on_role_change(); - - void clean_up_local(ObjectStore::Transaction& t); - - void cancel_recovery(); - bool do_recovery(); - - void purge_strays(); - - -}; - - -#endif diff --git a/branches/sage/mds/osd/ReplicatedPG.cc b/branches/sage/mds/osd/ReplicatedPG.cc deleted file mode 100644 index 7b5bdf581d643..0000000000000 --- a/branches/sage/mds/osd/ReplicatedPG.cc +++ /dev/null @@ -1,1972 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#include "ReplicatedPG.h" -#include "OSD.h" - -#include "common/Logger.h" - -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" - -#include "messages/MOSDPGNotify.h" -#include "messages/MOSDPGRemove.h" - -#include "messages/MOSDPing.h" - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) *_dout << dbeginl << g_clock.now() << " osd" << osd->get_nodeid() << " " << (osd->osdmap ? osd->osdmap->get_epoch():0) << " " << *this << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) *_derr << dbeginl << g_clock.now() << " osd" << osd->get_nodeid() << " " << (osd->osdmap ? osd->osdmap->get_epoch():0) << " " << *this << " " - -#include -#include - -static const int LOAD_LATENCY = 1; -static const int LOAD_QUEUE_SIZE = 2; -static const int LOAD_HYBRID = 3; - - -// ======================= -// pg changes - -bool ReplicatedPG::same_for_read_since(epoch_t e) -{ - return (e >= info.history.same_acker_since); -} - -bool ReplicatedPG::same_for_modify_since(epoch_t e) -{ - return (e >= info.history.same_primary_since); -} - -bool ReplicatedPG::same_for_rep_modify_since(epoch_t e) -{ - // check osd map: same set, or primary+acker? - - if (g_conf.osd_rep == OSD_REP_CHAIN) { - return e >= info.history.same_since; // whole pg set same - } else { - // primary, splay - return (e >= info.history.same_primary_since && - e >= info.history.same_acker_since); - } -} - -// ==================== -// missing objects - -bool ReplicatedPG::is_missing_object(object_t oid) -{ - return missing.missing.count(oid); -} - - -void ReplicatedPG::wait_for_missing_object(object_t oid, MOSDOp *op) -{ - assert(is_missing_object(oid)); - - // we don't have it (yet). 
- eversion_t v = missing.missing[oid]; - if (objects_pulling.count(oid)) { - dout(7) << "missing " - << oid - << " v " << v - << ", already pulling" - << dendl; - } else { - dout(7) << "missing " - << oid - << " v " << v - << ", pulling" - << dendl; - pull(oid); - } - waiting_for_missing_object[oid].push_back(op); -} - - - - -/** preprocess_op - preprocess an op (before it gets queued). - * fasttrack read - */ -bool ReplicatedPG::preprocess_op(MOSDOp *op, utime_t now) -{ - // we only care about reads here on out.. - if (!op->is_read()) - return false; - - object_t oid = op->get_oid(); - - // -- load balance reads -- - if (is_primary() && - g_conf.osd_rep == OSD_REP_PRIMARY) { - // -- read on primary+acker --- - - // test - if (false) { - if (acting.size() > 1) { - int peer = acting[1]; - dout(-10) << "preprocess_op fwd client read op to osd" << peer << " for " << op->get_client() << " " << op->get_client_inst() << dendl; - osd->messenger->send_message(op, osd->osdmap->get_inst(peer)); - return true; - } - } - - // -- balance reads? - if (g_conf.osd_balance_reads && - !op->get_source().is_osd()) { - // flash crowd? - bool is_flash_crowd_candidate = false; - if (g_conf.osd_flash_crowd_iat_threshold > 0) { - osd->iat_averager.add_sample( oid, (double)g_clock.now() ); - is_flash_crowd_candidate = osd->iat_averager.is_flash_crowd_candidate( oid ); - } - - // hot? - double temp = 0; - if (stat_object_temp_rd.count(oid)) - temp = stat_object_temp_rd[oid].get(op->get_recv_stamp()); - bool is_hotly_read = temp > g_conf.osd_balance_reads_temp; - - dout(20) << "balance_reads oid " << oid << " temp " << temp - << (is_hotly_read ? " hotly_read":"") - << (is_flash_crowd_candidate ? " flash_crowd_candidate":"") - << dendl; - - bool should_balance = is_flash_crowd_candidate || is_hotly_read; - bool is_balanced = false; - bool b; - // *** FIXME *** this may block, and we're in the fast path! 
*** - if (osd->store->getattr(oid, "balance-reads", &b, 1) >= 0) - is_balanced = true; - - if (!is_balanced && should_balance && - balancing_reads.count(oid) == 0) { - dout(-10) << "preprocess_op balance-reads on " << oid << dendl; - balancing_reads.insert(oid); - MOSDOp *pop = new MOSDOp(osd->messenger->get_myinst(), 0, osd->get_tid(), - oid, - ObjectLayout(info.pgid), - osd->osdmap->get_epoch(), - OSD_OP_BALANCEREADS); - do_op(pop); - } - if (is_balanced && !should_balance && - !unbalancing_reads.count(oid) == 0) { - dout(-10) << "preprocess_op unbalance-reads on " << oid << dendl; - unbalancing_reads.insert(oid); - MOSDOp *pop = new MOSDOp(osd->messenger->get_myinst(), 0, osd->get_tid(), - oid, - ObjectLayout(info.pgid), - osd->osdmap->get_epoch(), - OSD_OP_UNBALANCEREADS); - do_op(pop); - } - } - - // -- read shedding - if (g_conf.osd_shed_reads && - g_conf.osd_stat_refresh_interval > 0 && - !op->get_source().is_osd()) { // no re-shedding! - Mutex::Locker lock(osd->peer_stat_lock); - - osd->_refresh_my_stat(now); - - // check my load. - // TODO xxx we must also compare with our own load - // if i am x percentage higher than replica , - // redirect the read - - int shedto = -1; - double bestscore = 0.0; // highest positive score wins - - // we calculate score values such that we can interpret them as a probability. - - switch (g_conf.osd_shed_reads) { - case LOAD_LATENCY: - // above some minimum? - if (osd->my_stat.read_latency >= g_conf.osd_shed_reads_min_latency) { - for (unsigned i=1; ipeer_stat.count(peer) == 0) continue; - - // assume a read_latency of 0 (technically, undefined) is OK, since - // we'll be corrected soon enough if we're wrong. 
- - double plat = osd->peer_stat[peer].read_latency_mine; - - double diff = osd->my_stat.read_latency - plat; - if (diff < g_conf.osd_shed_reads_min_latency_diff) continue; - - double c = .002; // add in a constant to smooth it a bit - double latratio = - (c+osd->my_stat.read_latency) / - (c+plat); - double p = (latratio - 1.0) / 2.0 / latratio; - dout(15) << "preprocess_op " << op->get_reqid() - << " my read latency " << osd->my_stat.read_latency - << ", peer osd" << peer << " is " << plat << " (" << osd->peer_stat[peer].read_latency << ")" - << ", latratio " << latratio - << ", p=" << p - << dendl; - if (latratio > g_conf.osd_shed_reads_min_latency_ratio && - p > bestscore && - drand48() < p) { - shedto = peer; - bestscore = p; - } - } - } - break; - - case LOAD_HYBRID: - // dumb mostly - if (osd->my_stat.read_latency >= g_conf.osd_shed_reads_min_latency) { - for (unsigned i=1; ipeer_stat.count(peer) == 0/* || - osd->peer_stat[peer].read_latency <= 0*/) continue; - - if (osd->peer_stat[peer].qlen < osd->my_stat.qlen) { - - if (osd->my_stat.read_latency - osd->peer_stat[peer].read_latency > - g_conf.osd_shed_reads_min_latency_diff) continue; - - double qratio = osd->pending_ops / osd->peer_stat[peer].qlen; - - double c = .002; // add in a constant to smooth it a bit - double latratio = - (c+osd->my_stat.read_latency)/ - (c+osd->peer_stat[peer].read_latency); - double p = (latratio - 1.0) / 2.0 / latratio; - - dout(-15) << "preprocess_op " << op->get_reqid() - << " my qlen / rdlat " - << osd->pending_ops << " " << osd->my_stat.read_latency - << ", peer osd" << peer << " is " - << osd->peer_stat[peer].qlen << " " << osd->peer_stat[peer].read_latency - << ", qratio " << qratio - << ", latratio " << latratio - << ", p=" << p - << dendl; - if (latratio > g_conf.osd_shed_reads_min_latency_ratio && - p > bestscore && - drand48() < p) { - shedto = peer; - bestscore = p; - } - } - } - } - break; - - /* - case LOAD_QUEUE_SIZE: - // am i above my average? 
-- dumb - if (osd->pending_ops > osd->my_stat.qlen) { - // yes. is there a peer who is below my average? - for (unsigned i=1; ipeer_stat.count(peer) == 0) continue; - if (osd->peer_stat[peer].qlen < osd->my_stat.qlen) { - // calculate a probability that we should redirect - float p = (osd->my_stat.qlen - osd->peer_stat[peer].qlen) / osd->my_stat.qlen; // this is dumb. - float v = 1.0 - p; - - dout(10) << "my qlen " << osd->pending_ops << " > my_avg " << osd->my_stat.qlen - << ", peer osd" << peer << " has qlen " << osd->peer_stat[peer].qlen - << ", p=" << p - << ", v= "<< v - << dendl; - - if (v > bestscore) { - shedto = peer; - bestscore = v; - } - } - } - } - break;*/ - - } - - // shed? - if (shedto >= 0) { - dout(10) << "preprocess_op shedding read to peer osd" << shedto - << " " << op->get_reqid() - << dendl; - op->set_peer_stat(osd->my_stat); - osd->messenger->send_message(op, osd->osdmap->get_inst(shedto)); - osd->stat_rd_ops_shed_out++; - osd->logger->inc("shdout"); - return true; - } - } - } // endif balance reads - - - // -- fastpath read? - // if this is a read and the data is in the cache, do an immediate read.. - if ( g_conf.osd_immediate_read_from_cache ) { - if (osd->store->is_cached( oid , - op->get_offset(), - op->get_length() ) == 0) { - if (!is_primary() && !op->get_source().is_osd()) { - // am i allowed? - bool v; - if (osd->store->getattr(oid, "balance-reads", &v, 1) < 0) { - dout(-10) << "preprocess_op in-cache but no balance-reads on " << oid - << ", fwd to primary" << dendl; - osd->messenger->send_message(op, osd->osdmap->get_inst(get_primary())); - return true; - } - } - - // do it now - dout(10) << "preprocess_op data is in cache, reading from cache" << *op << dendl; - do_op(op); - return true; - } - } - - return false; -} - - -/** do_op - do an op - * pg lock will be held (if multithreaded) - * osd_lock NOT held. 
- */ -void ReplicatedPG::do_op(MOSDOp *op) -{ - //dout(15) << "do_op " << *op << dendl; - - osd->logger->inc("op"); - - switch (op->get_op()) { - - // reads - case OSD_OP_READ: - case OSD_OP_STAT: - op_read(op); - break; - - // rep stuff - case OSD_OP_PULL: - op_pull(op); - break; - case OSD_OP_PUSH: - op_push(op); - break; - - // writes - case OSD_OP_WRNOOP: - case OSD_OP_WRITE: - case OSD_OP_ZERO: - case OSD_OP_DELETE: - case OSD_OP_TRUNCATE: - case OSD_OP_WRLOCK: - case OSD_OP_WRUNLOCK: - case OSD_OP_RDLOCK: - case OSD_OP_RDUNLOCK: - case OSD_OP_UPLOCK: - case OSD_OP_DNLOCK: - case OSD_OP_BALANCEREADS: - case OSD_OP_UNBALANCEREADS: - if (op->get_source().is_osd()) { - op_rep_modify(op); - } else { - // go go gadget pg - op_modify(op); - } - break; - - default: - assert(0); - } -} - -void ReplicatedPG::do_op_reply(MOSDOpReply *r) -{ - if (r->get_op() == OSD_OP_PUSH) { - // continue peer recovery - op_push_reply(r); - } else { - // must be replication. - tid_t rep_tid = r->get_rep_tid(); - int fromosd = r->get_source().num(); - - osd->take_peer_stat(fromosd, r->get_peer_stat()); - - if (rep_gather.count(rep_tid)) { - // oh, good. - repop_ack(rep_gather[rep_tid], - r->get_result(), r->get_commit(), - fromosd, - r->get_pg_complete_thru()); - delete r; - } else { - // early ack. - waiting_for_repop[rep_tid].push_back(r); - } - } -} - - - - -// ======================================================================== -// READS - -void ReplicatedPG::op_read(MOSDOp *op) -{ - object_t oid = op->get_oid(); - - dout(10) << "op_read " << MOSDOp::get_opname(op->get_op()) - << " " << oid - << " " << op->get_offset() << "~" << op->get_length() - << dendl; - - // wrlocked? - if (block_if_wrlocked(op)) - return; - - // !primary and unbalanced? 
- // (ignore ops forwarded from the primary) - if (!is_primary()) { - if (op->get_source().is_osd() && - op->get_source().num() == get_primary()) { - // read was shed to me by the primary - int from = op->get_source().num(); - osd->take_peer_stat(from, op->get_peer_stat()); - dout(10) << "read shed IN from " << op->get_source() - << " " << op->get_reqid() - << ", me = " << osd->my_stat.read_latency_mine - << ", them = " << op->get_peer_stat().read_latency - << (osd->my_stat.read_latency_mine > op->get_peer_stat().read_latency ? " WTF":"") - << dendl; - osd->logger->inc("shdin"); - - // does it look like they were wrong to do so? - Mutex::Locker lock(osd->peer_stat_lock); - if (osd->my_stat.read_latency_mine > op->get_peer_stat().read_latency && - osd->my_stat_on_peer[from].read_latency_mine < op->get_peer_stat().read_latency) { - dout(-10) << "read shed IN from " << op->get_source() - << " " << op->get_reqid() - << " and me " << osd->my_stat.read_latency_mine - << " > them " << op->get_peer_stat().read_latency - << ", but they didn't know better, sharing" << dendl; - osd->my_stat_on_peer[from] = osd->my_stat; - osd->messenger->send_message(new MOSDPing(osd->osdmap->get_epoch(), osd->my_stat), - osd->osdmap->get_inst(from)); - } - } else { - // make sure i exist and am balanced, otherwise fw back to acker. - bool b; - if (!osd->store->exists(oid) || - osd->store->getattr(oid, "balance-reads", &b, 1) < 0) { - dout(-10) << "read on replica, object " << oid - << " dne or no balance-reads, fw back to primary" << dendl; - osd->messenger->send_message(op, osd->osdmap->get_inst(get_acker())); - return; - } - } - } - - - // set up reply - MOSDOpReply *reply = new MOSDOpReply(op, 0, osd->osdmap->get_epoch(), true); - long r = 0; - - // do it. - if (oid.rev && !pick_object_rev(oid)) { - // we have no revision for this request. 
- r = -EEXIST; - } else { - switch (op->get_op()) { - case OSD_OP_READ: - { - // read into a buffer - bufferlist bl; - r = osd->store->read(oid, - op->get_offset(), op->get_length(), - bl); - reply->set_data(bl); - reply->set_length(r); - dout(15) << " read got " << r << " / " << op->get_length() << " bytes from obj " << oid << dendl; - } - osd->logger->inc("c_rd"); - osd->logger->inc("c_rdb", op->get_length()); - break; - - case OSD_OP_STAT: - { - struct stat st; - memset(&st, sizeof(st), 0); - r = osd->store->stat(oid, &st); - if (r >= 0) - reply->set_object_size(st.st_size); - } - break; - - default: - assert(0); - } - } - - if (r >= 0) { - reply->set_result(0); - - utime_t now = g_clock.now(); - utime_t diff = now; - diff -= op->get_recv_stamp(); - dout(10) << "op_read " << op->get_reqid() << " total op latency " << diff << dendl; - Mutex::Locker lock(osd->peer_stat_lock); - osd->stat_rd_ops_in_queue--; - osd->read_latency_calc.add(diff); - - if (is_primary() && - g_conf.osd_balance_reads) - stat_object_temp_rd[oid].hit(now); // hit temp. - - } else { - reply->set_result(r); // error - } - - // send it - osd->messenger->send_message(reply, op->get_client_inst()); - - delete op; -} - - - - - - -// ======================================================================== -// MODIFY - -void ReplicatedPG::prepare_log_transaction(ObjectStore::Transaction& t, - MOSDOp *op, eversion_t& version, - objectrev_t crev, objectrev_t rev, - eversion_t trim_to) -{ - const object_t oid = op->get_oid(); - - // clone entry? 
- if (crev && rev && rev > crev) { - eversion_t cv = version; - cv.version--; - Log::Entry cloneentry(PG::Log::Entry::CLONE, oid, cv, op->get_reqid()); - log.add(cloneentry); - - dout(10) << "prepare_log_transaction " << op->get_op() - << " " << cloneentry - << dendl; - } - - // actual op - int opcode = Log::Entry::MODIFY; - if (op->get_op() == OSD_OP_DELETE) opcode = Log::Entry::DELETE; - Log::Entry logentry(opcode, oid, version, op->get_reqid()); - - dout(10) << "prepare_log_transaction " << op->get_op() - << " " << logentry - << dendl; - - // append to log - assert(version > log.top); - log.add(logentry); - assert(log.top == version); - dout(10) << "prepare_log_transaction appended" << dendl; - - // write to pg log on disk - append_log(t, logentry, trim_to); -} - - -/** prepare_op_transaction - * apply an op to the store wrapped in a transaction. - */ -void ReplicatedPG::prepare_op_transaction(ObjectStore::Transaction& t, - MOSDOp *op, eversion_t& version, - objectrev_t crev, objectrev_t rev) -{ - const object_t oid = op->get_oid(); - const pg_t pgid = op->get_pg(); - - bool did_clone = false; - - dout(10) << "prepare_op_transaction " << MOSDOp::get_opname( op->get_op() ) - << " " << oid - << " v " << version - << " crev " << crev - << " rev " << rev - << dendl; - - // WRNOOP does nothing. - if (op->get_op() == OSD_OP_WRNOOP) - return; - - // raise last_complete? - if (info.last_complete == info.last_update) - info.last_complete = version; - - // raise last_update. - assert(version > info.last_update); - info.last_update = version; - - // write pg info - t.collection_setattr(pgid, "info", &info, sizeof(info)); - - // clone? 
- if (crev && rev && rev > crev) { - object_t noid = oid; - noid.rev = rev; - dout(10) << "prepare_op_transaction cloning " << oid << " crev " << crev << " to " << noid << dendl; - t.clone(oid, noid); - did_clone = true; - } - - // apply the op - switch (op->get_op()) { - - // -- locking -- - - case OSD_OP_WRLOCK: - { // lock object - t.setattr(oid, "wrlock", &op->get_client(), sizeof(entity_name_t)); - } - break; - case OSD_OP_WRUNLOCK: - { // unlock objects - t.rmattr(oid, "wrlock"); - } - break; - - case OSD_OP_MININCLOCK: - { - uint32_t mininc = op->get_length(); - t.setattr(oid, "mininclock", &mininc, sizeof(mininc)); - } - break; - - case OSD_OP_BALANCEREADS: - { - bool bal = true; - t.setattr(oid, "balance-reads", &bal, sizeof(bal)); - } - break; - case OSD_OP_UNBALANCEREADS: - { - t.rmattr(oid, "balance-reads"); - } - break; - - - // -- modify -- - - case OSD_OP_WRITE: - { // write - assert(op->get_data().length() == op->get_length()); - bufferlist bl; - bl.claim( op->get_data() ); // give buffers to store; we keep *op in memory for a long time! - - //if (oid < 100000000000000ULL) // hack hack-- don't write client data - t.write( oid, op->get_offset(), op->get_length(), bl ); - } - break; - - case OSD_OP_ZERO: - { - // zero, remove, or truncate? - struct stat st; - int r = osd->store->stat(oid, &st); - if (r >= 0) { - if (op->get_length() == 0 || - op->get_offset() + (off_t)op->get_length() >= (off_t)st.st_size) { - if (op->get_offset()) - t.truncate(oid, op->get_length() + op->get_offset()); - else - t.remove(oid); - } else { - // zero. the dumb way. FIXME. - bufferptr bp(op->get_length()); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - t.write(oid, op->get_offset(), op->get_length(), bl); - } - } else { - // noop? - dout(10) << "apply_transaction zero on " << oid << ", but dne? 
stat returns " << r << dendl; - } - } - break; - - case OSD_OP_TRUNCATE: - { // truncate - t.truncate(oid, op->get_length() ); - } - break; - - case OSD_OP_DELETE: - { // delete - t.remove(oid); - } - break; - - default: - assert(0); - } - - // object collection, version - if (op->get_op() == OSD_OP_DELETE) { - // remove object from c - t.collection_remove(pgid, oid); - } else { - // add object to c - t.collection_add(pgid, oid); - - // object version - t.setattr(oid, "version", &version, sizeof(version)); - - // set object crev - if (crev == 0 || // new object - did_clone) // we cloned - t.setattr(oid, "crev", &rev, sizeof(rev)); - } -} - - - -// ======================================================================== -// rep op gather - -class C_OSD_ModifyCommit : public Context { -public: - ReplicatedPG *pg; - tid_t rep_tid; - eversion_t pg_last_complete; - C_OSD_ModifyCommit(ReplicatedPG *p, tid_t rt, eversion_t lc) : pg(p), rep_tid(rt), pg_last_complete(lc) { - pg->get(); // we're copying the pointer - } - void finish(int r) { - pg->lock(); - if (!pg->is_deleted()) - pg->op_modify_commit(rep_tid, pg_last_complete); - pg->put_unlock(); - } -}; - - -void ReplicatedPG::get_rep_gather(RepGather *repop) -{ - //repop->lock.Lock(); - dout(10) << "get_repop " << *repop << dendl; -} - -void ReplicatedPG::apply_repop(RepGather *repop) -{ - dout(10) << "apply_repop applying update on " << *repop << dendl; - assert(!repop->applied); - - Context *oncommit = new C_OSD_ModifyCommit(this, repop->rep_tid, repop->pg_local_last_complete); - unsigned r = osd->store->apply_transaction(repop->t, oncommit); - if (r) - dout(-10) << "apply_repop apply transaction return " << r << " on " << *repop << dendl; - - // discard my reference to the buffer - repop->op->get_data().clear(); - - repop->applied = true; - - - // any completion stuff to do here? 
- object_t oid = repop->op->get_oid(); - - switch (repop->op->get_op()) { - case OSD_OP_UNBALANCEREADS: - dout(-10) << "apply_repop completed unbalance-reads on " << oid << dendl; - unbalancing_reads.erase(oid); - if (waiting_for_unbalanced_reads.count(oid)) { - osd->take_waiters(waiting_for_unbalanced_reads[oid]); - waiting_for_unbalanced_reads.erase(oid); - } - break; - - case OSD_OP_BALANCEREADS: - dout(-10) << "apply_repop completed balance-reads on " << oid << dendl; - /* - if (waiting_for_balanced_reads.count(oid)) { - osd->take_waiters(waiting_for_balanced_reads[oid]); - waiting_for_balanced_reads.erase(oid); - } - */ - break; - - case OSD_OP_WRUNLOCK: - dout(-10) << "apply_repop completed wrunlock on " << oid << dendl; - if (waiting_for_wr_unlock.count(oid)) { - osd->take_waiters(waiting_for_wr_unlock[oid]); - waiting_for_wr_unlock.erase(oid); - } - break; - } - - -} - -void ReplicatedPG::put_rep_gather(RepGather *repop) -{ - dout(10) << "put_repop " << *repop << dendl; - - // commit? - if (repop->can_send_commit() && - repop->op->wants_commit()) { - // send commit. - if (repop->op->wants_reply()) { - MOSDOpReply *reply = new MOSDOpReply(repop->op, 0, osd->osdmap->get_epoch(), true); - dout(10) << "put_repop sending commit on " << *repop << " " << reply << dendl; - osd->messenger->send_message(reply, repop->op->get_client_inst()); - } - repop->sent_commit = true; - } - - // ack? - else if (repop->can_send_ack() && - repop->op->wants_ack()) { - // apply - apply_repop(repop); - - // send ack - if (repop->op->wants_reply()) { - MOSDOpReply *reply = new MOSDOpReply(repop->op, 0, osd->osdmap->get_epoch(), false); - dout(10) << "put_repop sending ack on " << *repop << " " << reply << dendl; - osd->messenger->send_message(reply, repop->op->get_client_inst()); - } - repop->sent_ack = true; - - utime_t now = g_clock.now(); - now -= repop->start; - osd->logger->finc("rlsum", now); - osd->logger->inc("rlnum", 1); - } - - // done. 
- if (repop->can_delete()) { - // adjust peers_complete_thru - if (!repop->pg_complete_thru.empty()) { - eversion_t min = info.last_complete; // hrm.... - for (unsigned i=0; ipg_complete_thru[acting[i]] < min) // note: if we haven't heard, it'll be zero, which is what we want. - min = repop->pg_complete_thru[acting[i]]; - } - - if (min > peers_complete_thru) { - dout(10) << "put_repop peers_complete_thru " - << peers_complete_thru << " -> " << min - << dendl; - peers_complete_thru = min; - } - } - - dout(10) << "put_repop deleting " << *repop << dendl; - - assert(rep_gather.count(repop->rep_tid)); - rep_gather.erase(repop->rep_tid); - - delete repop->op; - delete repop; - } -} - - -void ReplicatedPG::issue_repop(MOSDOp *op, int dest, utime_t now) -{ - object_t oid = op->get_oid(); - - dout(7) << " issue_repop rep_tid " << op->get_rep_tid() - << " o " << oid - << " to osd" << dest - << dendl; - - // forward the write/update/whatever - MOSDOp *wr = new MOSDOp(op->get_client_inst(), op->get_client_inc(), op->get_reqid().tid, - oid, - ObjectLayout(info.pgid), - osd->osdmap->get_epoch(), - op->get_op()); - wr->get_data() = op->get_data(); // _copy_ bufferlist - wr->set_length(op->get_length()); - wr->set_offset(op->get_offset()); - wr->set_version(op->get_version()); - - wr->set_rep_tid(op->get_rep_tid()); - wr->set_pg_trim_to(peers_complete_thru); - - wr->set_peer_stat(osd->get_my_stat_for(now, dest)); - - osd->messenger->send_message(wr, osd->osdmap->get_inst(dest)); -} - -ReplicatedPG::RepGather *ReplicatedPG::new_rep_gather(MOSDOp *op) -{ - dout(10) << "new_rep_gather rep_tid " << op->get_rep_tid() << " on " << *op << dendl; - int whoami = osd->get_nodeid(); - - RepGather *repop = new RepGather(op, op->get_rep_tid(), - op->get_version(), - info.last_complete); - - // osds. commits all come to me. - for (unsigned i=0; iosds.insert(osd); - repop->waitfor_commit.insert(osd); - } - - // acks vary: - if (g_conf.osd_rep == OSD_REP_CHAIN) { - // chain rep. 
- // there's my local ack... - repop->osds.insert(whoami); - repop->waitfor_ack.insert(whoami); - repop->waitfor_commit.insert(whoami); - - // also, the previous guy will ack to me - int myrank = osd->osdmap->calc_pg_rank(whoami, acting); - if (myrank > 0) { - int osd = acting[ myrank-1 ]; - repop->osds.insert(osd); - repop->waitfor_ack.insert(osd); - repop->waitfor_commit.insert(osd); - } - } else { - // primary, splay. all osds ack to me. - for (unsigned i=0; iwaitfor_ack.insert(osd); - } - } - - repop->start = g_clock.now(); - - rep_gather[ repop->rep_tid ] = repop; - - // anyone waiting? (acks that got here before the op did) - if (waiting_for_repop.count(repop->rep_tid)) { - osd->take_waiters(waiting_for_repop[repop->rep_tid]); - waiting_for_repop.erase(repop->rep_tid); - } - - return repop; -} - - -void ReplicatedPG::repop_ack(RepGather *repop, - int result, bool commit, - int fromosd, eversion_t pg_complete_thru) -{ - MOSDOp *op = repop->op; - - dout(7) << "repop_ack rep_tid " << repop->rep_tid << " op " << *op - << " result " << result << " commit " << commit << " from osd" << fromosd - << dendl; - - get_rep_gather(repop); - { - if (commit) { - // commit - assert(repop->waitfor_commit.count(fromosd)); - repop->waitfor_commit.erase(fromosd); - repop->waitfor_ack.erase(fromosd); - repop->pg_complete_thru[fromosd] = pg_complete_thru; - } else { - // ack - repop->waitfor_ack.erase(fromosd); - } - } - put_rep_gather(repop); -} - - - - - - - - - - - - - - - - - - - - - - - -/** op_modify_commit - * transaction commit on the acker. 
- */ -void ReplicatedPG::op_modify_commit(tid_t rep_tid, eversion_t pg_complete_thru) -{ - if (rep_gather.count(rep_tid)) { - RepGather *repop = rep_gather[rep_tid]; - - dout(10) << "op_modify_commit " << *repop->op << dendl; - get_rep_gather(repop); - { - assert(repop->waitfor_commit.count(osd->get_nodeid())); - repop->waitfor_commit.erase(osd->get_nodeid()); - repop->pg_complete_thru[osd->get_nodeid()] = pg_complete_thru; - } - put_rep_gather(repop); - dout(10) << "op_modify_commit done on " << repop << dendl; - } else { - dout(10) << "op_modify_commit rep_tid " << rep_tid << " dne" << dendl; - } -} - - - -objectrev_t ReplicatedPG::assign_version(MOSDOp *op) -{ - object_t oid = op->get_oid(); - - // check crev - objectrev_t crev = 0; - osd->store->getattr(oid, "crev", (char*)&crev, sizeof(crev)); - - // assign version - eversion_t clone_version; - eversion_t nv = log.top; - if (op->get_op() != OSD_OP_WRNOOP) { - nv.epoch = osd->osdmap->get_epoch(); - nv.version++; - assert(nv > info.last_update); - assert(nv > log.top); - - // will clone? - if (crev && op->get_rev() && op->get_rev() > crev) { - clone_version = nv; - nv.version++; - } - - if (op->get_version().version) { - // replay! - if (nv.version < op->get_version().version) { - nv.version = op->get_version().version; - - // clone? 
- if (crev && op->get_rev() && op->get_rev() > crev) { - // backstep clone - clone_version = nv; - clone_version.version--; - } - } - } - } - - // set version in op, for benefit of client and our eventual reply - op->set_version(nv); - - return crev; -} - - -// commit (to disk) callback -class C_OSD_RepModifyCommit : public Context { -public: - ReplicatedPG *pg; - MOSDOp *op; - int destosd; - - eversion_t pg_last_complete; - - Mutex lock; - Cond cond; - bool acked; - bool waiting; - - C_OSD_RepModifyCommit(ReplicatedPG *p, MOSDOp *oo, int dosd, eversion_t lc) : - pg(p), op(oo), destosd(dosd), pg_last_complete(lc), - acked(false), waiting(false) { - pg->get(); // we're copying the pointer. - } - void finish(int r) { - lock.Lock(); - assert(!waiting); - while (!acked) { - waiting = true; - cond.Wait(lock); - } - assert(acked); - lock.Unlock(); - - pg->lock(); - pg->op_rep_modify_commit(op, destosd, pg_last_complete); - pg->put_unlock(); - } - void ack() { - lock.Lock(); - assert(!acked); - acked = true; - if (waiting) cond.Signal(); - - // discard my reference to buffer - op->get_data().clear(); - - lock.Unlock(); - } -}; - - -void ReplicatedPG::op_modify(MOSDOp *op) -{ - int whoami = osd->get_nodeid(); - object_t oid = op->get_oid(); - const char *opname = MOSDOp::get_opname(op->get_op()); - - // --- locking --- - - // wrlock? - if (op->get_op() != OSD_OP_WRNOOP && // except WRNOOP; we just want to flush - block_if_wrlocked(op)) - return; // op will be handled later, after the object unlocks - - // balance-reads set? 
- char v; - if ((op->get_op() != OSD_OP_BALANCEREADS && op->get_op() != OSD_OP_UNBALANCEREADS) && - (osd->store->getattr(op->get_oid(), "balance-reads", &v, 1) >= 0 || - balancing_reads.count(op->get_oid()))) { - - if (!unbalancing_reads.count(op->get_oid())) { - // unbalance - dout(-10) << "preprocess_op unbalancing-reads on " << op->get_oid() << dendl; - unbalancing_reads.insert(op->get_oid()); - - MOSDOp *pop = new MOSDOp(osd->messenger->get_myinst(), 0, osd->get_tid(), - op->get_oid(), - ObjectLayout(info.pgid), - osd->osdmap->get_epoch(), - OSD_OP_UNBALANCEREADS); - do_op(pop); - } - - // add to wait queue - dout(-10) << "preprocess_op waiting for unbalance-reads on " << op->get_oid() << dendl; - waiting_for_unbalanced_reads[op->get_oid()].push_back(op); - return; - } - - - // dup op? - if (is_dup(op->get_reqid())) { - dout(3) << "op_modify " << opname << " dup op " << op->get_reqid() - << ", doing WRNOOP" << dendl; - op->set_op(OSD_OP_WRNOOP); - opname = MOSDOp::get_opname(op->get_op()); - } - - // assign the op a version - objectrev_t crev = assign_version(op); - eversion_t nv = op->get_version(); - - // are any peers missing this? - for (unsigned i=1; iget_rev() - << " " << op->get_offset() << "~" << op->get_length() - << dendl; - - if (op->get_op() == OSD_OP_WRITE) { - osd->logger->inc("c_wr"); - osd->logger->inc("c_wrb", op->get_length()); - } - - // note my stats - utime_t now = g_clock.now(); - - // issue replica writes - RepGather *repop = 0; - bool alone = (acting.size() == 1); - tid_t rep_tid = osd->get_tid(); - op->set_rep_tid(rep_tid); - - if (g_conf.osd_rep == OSD_REP_CHAIN && !alone) { - // chain rep. send to #2 only. - int next = acting[1]; - if (acting.size() > 2) - next = acting[2]; - issue_repop(op, next, now); - } - else if (g_conf.osd_rep == OSD_REP_SPLAY && !alone) { - // splay rep. send to rest. - for (unsigned i=1; i=1; --i) - issue_repop(op, acting[i], now); - } else { - // primary rep, or alone. 
- repop = new_rep_gather(op); - - // send to rest. - if (!alone) - for (unsigned i=1; iget_op() != OSD_OP_WRNOOP) { - // log and update later. - prepare_log_transaction(repop->t, op, nv, crev, op->get_rev(), peers_complete_thru); - prepare_op_transaction(repop->t, op, nv, crev, op->get_rev()); - } - - // (logical) local ack. - // (if alone, this will apply the update.) - get_rep_gather(repop); - { - assert(repop->waitfor_ack.count(whoami)); - repop->waitfor_ack.erase(whoami); - } - put_rep_gather(repop); - - } else { - // not acker. - // chain or splay. apply. - ObjectStore::Transaction t; - prepare_log_transaction(t, op, nv, crev, op->get_rev(), peers_complete_thru); - prepare_op_transaction(t, op, nv, crev, op->get_rev()); - - C_OSD_RepModifyCommit *oncommit = new C_OSD_RepModifyCommit(this, op, get_acker(), - info.last_complete); - unsigned r = osd->store->apply_transaction(t, oncommit); - if (r != 0 && // no errors - r != 2) { // or error on collection_add - derr(0) << "error applying transaction: r = " << r << dendl; - assert(r == 0); - } - - // lets evict the data from our cache to maintain a total large cache size - if (g_conf.osd_exclusive_caching) - osd->store->trim_from_cache(op->get_oid(), op->get_offset(), op->get_length()); - - oncommit->ack(); - } - -} - - - -// replicated - - - - -void ReplicatedPG::op_rep_modify(MOSDOp *op) -{ - object_t oid = op->get_oid(); - eversion_t nv = op->get_version(); - - const char *opname = MOSDOp::get_opname(op->get_op()); - - // check crev - objectrev_t crev = 0; - osd->store->getattr(oid, "crev", (char*)&crev, sizeof(crev)); - - dout(10) << "op_rep_modify " << opname - << " " << oid - << " v " << nv - << " " << op->get_offset() << "~" << op->get_length() - << dendl; - - // note peer's stat - int fromosd = op->get_source().num(); - osd->take_peer_stat(fromosd, op->get_peer_stat()); - - // we better not be missing this. 
- assert(!missing.is_missing(oid)); - - // prepare our transaction - ObjectStore::Transaction t; - - // am i acker? - RepGather *repop = 0; - int ackerosd = acting[0]; - - if ((g_conf.osd_rep == OSD_REP_CHAIN || g_conf.osd_rep == OSD_REP_SPLAY)) { - ackerosd = get_acker(); - - if (is_acker()) { - // i am tail acker. - if (rep_gather.count(op->get_rep_tid())) { - repop = rep_gather[ op->get_rep_tid() ]; - } else { - repop = new_rep_gather(op); - } - - // infer ack from source - get_rep_gather(repop); - { - //assert(repop->waitfor_ack.count(fromosd)); // no, we may come thru here twice. - repop->waitfor_ack.erase(fromosd); - } - put_rep_gather(repop); - - // prepare dest socket - //messenger->prepare_send_message(op->get_client()); - } - - // chain? forward? - if (g_conf.osd_rep == OSD_REP_CHAIN && !is_acker()) { - // chain rep, not at the tail yet. - int myrank = osd->osdmap->calc_pg_rank(osd->get_nodeid(), acting); - int next = myrank+1; - if (next == (int)acting.size()) - next = 1; - issue_repop(op, acting[next], g_clock.now()); - } - } - - // do op? - C_OSD_RepModifyCommit *oncommit = 0; - - osd->logger->inc("r_wr"); - osd->logger->inc("r_wrb", op->get_length()); - - if (repop) { - // acker. we'll apply later. - if (op->get_op() != OSD_OP_WRNOOP) { - prepare_log_transaction(repop->t, op, nv, crev, op->get_rev(), op->get_pg_trim_to()); - prepare_op_transaction(repop->t, op, nv, crev, op->get_rev()); - } - } else { - // middle|replica. - if (op->get_op() != OSD_OP_WRNOOP) { - prepare_log_transaction(t, op, nv, crev, op->get_rev(), op->get_pg_trim_to()); - prepare_op_transaction(t, op, nv, crev, op->get_rev()); - } - - oncommit = new C_OSD_RepModifyCommit(this, op, ackerosd, info.last_complete); - - // apply log update. and possibly update itself. 
- unsigned tr = osd->store->apply_transaction(t, oncommit); - if (tr != 0 && // no errors - tr != 2) { // or error on collection_add - derr(0) << "error applying transaction: r = " << tr << dendl; - assert(tr == 0); - } - } - - // ack? - if (repop) { - // (logical) local ack. this may induce the actual update. - get_rep_gather(repop); - { - assert(repop->waitfor_ack.count(osd->get_nodeid())); - repop->waitfor_ack.erase(osd->get_nodeid()); - } - put_rep_gather(repop); - } - else { - // send ack to acker? - if (g_conf.osd_rep != OSD_REP_CHAIN) { - MOSDOpReply *ack = new MOSDOpReply(op, 0, osd->osdmap->get_epoch(), false); - ack->set_peer_stat(osd->get_my_stat_for(g_clock.now(), ackerosd)); - osd->messenger->send_message(ack, osd->osdmap->get_inst(ackerosd)); - } - - // ack myself. - assert(oncommit); - oncommit->ack(); - } - -} - - -void ReplicatedPG::op_rep_modify_commit(MOSDOp *op, int ackerosd, eversion_t last_complete) -{ - // send commit. - dout(10) << "rep_modify_commit on op " << *op - << ", sending commit to osd" << ackerosd - << dendl; - if (osd->osdmap->is_up(ackerosd)) { - MOSDOpReply *commit = new MOSDOpReply(op, 0, osd->osdmap->get_epoch(), true); - commit->set_pg_complete_thru(last_complete); - commit->set_peer_stat(osd->get_my_stat_for(g_clock.now(), ackerosd)); - osd->messenger->send_message(commit, osd->osdmap->get_inst(ackerosd)); - delete op; - } -} - - - - - - - - - - -// =========================================================== - -/** pull - request object from a peer - */ -void ReplicatedPG::pull(object_t oid) -{ - assert(missing.loc.count(oid)); - eversion_t v = missing.missing[oid]; - int fromosd = missing.loc[oid]; - - dout(7) << "pull " << oid - << " v " << v - << " from osd" << fromosd - << dendl; - - // send op - tid_t tid = osd->get_tid(); - MOSDOp *op = new MOSDOp(osd->messenger->get_myinst(), 0, tid, - oid, info.pgid, - osd->osdmap->get_epoch(), - OSD_OP_PULL); - op->set_version(v); - osd->messenger->send_message(op, 
osd->osdmap->get_inst(fromosd)); - - // take note - assert(objects_pulling.count(oid) == 0); - num_pulling++; - objects_pulling[oid] = v; -} - - -/** push - send object to a peer - */ -void ReplicatedPG::push(object_t oid, int peer) -{ - // read data+attrs - bufferlist bl; - eversion_t v; - int vlen = sizeof(v); - map attrset; - - ObjectStore::Transaction t; - t.read(oid, 0, 0, &bl); - t.getattr(oid, "version", &v, &vlen); - t.getattrs(oid, attrset); - unsigned tr = osd->store->apply_transaction(t); - - assert(tr == 0); // !!! - - // ok - dout(7) << "push " << oid << " v " << v - << " size " << bl.length() - << " to osd" << peer - << dendl; - - osd->logger->inc("r_push"); - osd->logger->inc("r_pushb", bl.length()); - - // send - MOSDOp *op = new MOSDOp(osd->messenger->get_myinst(), 0, osd->get_tid(), - oid, info.pgid, osd->osdmap->get_epoch(), - OSD_OP_PUSH); - op->set_offset(0); - op->set_length(bl.length()); - op->set_data(bl); // note: claims bl, set length above here! - op->set_version(v); - op->set_attrset(attrset); - - osd->messenger->send_message(op, osd->osdmap->get_inst(peer)); - - if (is_primary()) { - peer_missing[peer].got(oid); - pushing[oid].insert(peer); - } -} - - - -/** op_pull - * process request to pull an entire object. - * NOTE: called from opqueue. - */ -void ReplicatedPG::op_pull(MOSDOp *op) -{ - const object_t oid = op->get_oid(); - const eversion_t v = op->get_version(); - int from = op->get_source().num(); - - dout(7) << "op_pull " << oid << " v " << op->get_version() - << " from " << op->get_source() - << dendl; - - // is a replica asking? are they missing it? - if (is_primary()) { - // primary - assert(peer_missing.count(from)); // we had better know this, from the peering process. - - if (!peer_missing[from].is_missing(oid)) { - dout(7) << "op_pull replica isn't actually missing it, we must have already pushed to them" << dendl; - delete op; - return; - } - - // do we have it yet? 
- if (is_missing_object(oid)) { - wait_for_missing_object(oid, op); - return; - } - } else { - // non-primary - if (missing.is_missing(oid)) { - dout(7) << "op_pull not primary, and missing " << oid << ", ignoring" << dendl; - delete op; - return; - } - } - - // push it back! - push(oid, op->get_source().num()); -} - - -/** op_push - * NOTE: called from opqueue. - */ -void ReplicatedPG::op_push(MOSDOp *op) -{ - object_t oid = op->get_oid(); - eversion_t v = op->get_version(); - - if (!is_missing_object(oid)) { - dout(7) << "op_push not missing " << oid << dendl; - return; - } - - dout(7) << "op_push " - << oid - << " v " << v - << " size " << op->get_length() << " " << op->get_data().length() - << dendl; - - assert(op->get_data().length() == op->get_length()); - - // write object and add it to the PG - ObjectStore::Transaction t; - t.remove(oid); // in case old version exists - t.write(oid, 0, op->get_length(), op->get_data()); - t.setattrs(oid, op->get_attrset()); - t.collection_add(info.pgid, oid); - - // close out pull op? - num_pulling--; - if (objects_pulling.count(oid)) - objects_pulling.erase(oid); - missing.got(oid, v); - - - // raise last_complete? - assert(log.complete_to != log.log.end()); - while (log.complete_to != log.log.end()) { - if (missing.missing.count(log.complete_to->oid)) break; - if (info.last_complete < log.complete_to->version) - info.last_complete = log.complete_to->version; - log.complete_to++; - } - dout(10) << "last_complete now " << info.last_complete << dendl; - - - // apply to disk! - t.collection_setattr(info.pgid, "info", &info, sizeof(info)); - unsigned r = osd->store->apply_transaction(t); - assert(r == 0); - - - - // am i primary? are others missing this too? - if (is_primary()) { - for (unsigned i=1; itake_waiters(waiting_for_missing_object[oid]); - waiting_for_missing_object.erase(oid); - } - - if (is_primary()) { - // continue recovery - do_recovery(); - } else { - // ack if i'm a replica and being pushed to. 
- MOSDOpReply *reply = new MOSDOpReply(op, 0, osd->osdmap->get_epoch(), true); - osd->messenger->send_message(reply, op->get_source_inst()); - } - - delete op; -} - - - - - - -void ReplicatedPG::note_failed_osd(int o) -{ - dout(10) << "note_failed_osd " << o << dendl; - // do async; repop_ack() may modify pg->repop_gather - list ls; - for (hash_map::iterator p = rep_gather.begin(); - p != rep_gather.end(); - p++) { - //dout(-1) << "checking repop tid " << p->first << dendl; - if (p->second->waitfor_ack.count(o) || - p->second->waitfor_commit.count(o)) - ls.push_back(p->second); - } - for (list::iterator p = ls.begin(); - p != ls.end(); - p++) - repop_ack(*p, -1, true, o); -} - - -void ReplicatedPG::on_acker_change() -{ - dout(10) << "on_acker_change" << dendl; - - if (g_conf.osd_rep == OSD_REP_PRIMARY) { - // we're fine. - // note that note_failed_osd() above shoudl ahve implicitly acked/committed - // from the failed guy. - } else { - // for splay or chain replication, any change is significant. - // apply repops - for (hash_map::iterator p = rep_gather.begin(); - p != rep_gather.end(); - p++) { - if (!p->second->applied) - apply_repop(p->second); - delete p->second->op; - delete p->second; - } - rep_gather.clear(); - - // and repop waiters - for (hash_map >::iterator p = waiting_for_repop.begin(); - p != waiting_for_repop.end(); - p++) - for (list::iterator pm = p->second.begin(); - pm != p->second.end(); - pm++) - delete *pm; - waiting_for_repop.clear(); - } -} - - -void ReplicatedPG::on_role_change() -{ - dout(10) << "on_role_change" << dendl; - - // take object waiters - for (hash_map >::iterator it = waiting_for_missing_object.begin(); - it != waiting_for_missing_object.end(); - it++) - osd->take_waiters(it->second); - waiting_for_missing_object.clear(); -} - - - - - - - - - -/** clean_up_local - * remove any objects that we're storing but shouldn't. - * as determined by log. 
- */ -void ReplicatedPG::clean_up_local(ObjectStore::Transaction& t) -{ - dout(10) << "clean_up_local" << dendl; - - assert(info.last_update >= log.bottom); // otherwise we need some help! - - if (log.backlog) { - // be thorough. - list ls; - osd->store->collection_list(info.pgid, ls); - set s; - - for (list::iterator i = ls.begin(); - i != ls.end(); - i++) - s.insert(*i); - - set did; - for (list::reverse_iterator p = log.log.rbegin(); - p != log.log.rend(); - p++) { - if (did.count(p->oid)) continue; - did.insert(p->oid); - - if (p->is_delete()) { - if (s.count(p->oid)) { - dout(10) << " deleting " << p->oid - << " when " << p->version << dendl; - t.remove(p->oid); - } - s.erase(p->oid); - } else { - // just leave old objects.. they're missing or whatever - s.erase(p->oid); - } - } - - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - dout(10) << " deleting stray " << *i << dendl; - t.remove(*i); - } - - } else { - // just scan the log. - set did; - for (list::reverse_iterator p = log.log.rbegin(); - p != log.log.rend(); - p++) { - if (did.count(p->oid)) continue; - did.insert(p->oid); - - if (p->is_delete()) { - dout(10) << " deleting " << p->oid - << " when " << p->version << dendl; - t.remove(p->oid); - } else { - // keep old(+missing) objects, just for kicks. - } - } - } -} - - - -void ReplicatedPG::cancel_recovery() -{ - // forget about where missing items are, or anything we're pulling - missing.loc.clear(); - osd->num_pulling -= objects_pulling.size(); - objects_pulling.clear(); - num_pulling = 0; - pushing.clear(); -} - -/** - * do one recovery op. - * return true if done, false if nothing left to do. - */ -bool ReplicatedPG::do_recovery() -{ - assert(is_primary()); - /*if (!is_primary()) { - dout(10) << "do_recovery not primary, doing nothing" << dendl; - return true; - } - */ - - if (info.is_uptodate()) { // am i up to date? 
- if (!is_all_uptodate()) { - dout(-10) << "do_recovery i'm clean but replicas aren't, starting peer recovery" << dendl; - do_peer_recovery(); - } else { - dout(-10) << "do_recovery all clean, nothing to do" << dendl; - } - return true; - } - - dout(-10) << "do_recovery pulling " << objects_pulling.size() << " in pg, " - << osd->num_pulling << "/" << g_conf.osd_max_pull << " total" - << dendl; - dout(10) << "do_recovery " << missing << dendl; - - // can we slow down on this PG? - if (osd->num_pulling >= g_conf.osd_max_pull && !objects_pulling.empty()) { - dout(-10) << "do_recovery already pulling max, waiting" << dendl; - return true; - } - - // look at log! - Log::Entry *latest = 0; - - while (log.requested_to != log.log.end()) { - assert(log.objects.count(log.requested_to->oid)); - latest = log.objects[log.requested_to->oid]; - assert(latest); - - dout(10) << "do_recovery " - << *log.requested_to - << (objects_pulling.count(latest->oid) ? " (pulling)":"") - << dendl; - - if (latest->is_update() && - !objects_pulling.count(latest->oid) && - missing.is_missing(latest->oid)) { - pull(latest->oid); - return true; - } - - log.requested_to++; - } - - if (!objects_pulling.empty()) { - dout(7) << "do_recovery requested everything, still waiting" << dendl; - return false; - } - - // done? - assert(missing.num_missing() == 0); - assert(info.last_complete == info.last_update); - - if (is_primary()) { - // i am primary - dout(-7) << "do_recovery complete, cleaning strays" << dendl; - uptodate_set.insert(osd->whoami); - if (is_all_uptodate()) - finish_recovery(); - } else { - // tell primary - dout(7) << "do_recovery complete, telling primary" << dendl; - list ls; - ls.push_back(info); - osd->messenger->send_message(new MOSDPGNotify(osd->osdmap->get_epoch(), - ls), - osd->osdmap->get_inst(get_primary())); - } - - return false; -} - -void ReplicatedPG::do_peer_recovery() -{ - dout(-10) << "do_peer_recovery" << dendl; - - // this is FAR from an optimal recovery order. 
pretty lame, really. - for (unsigned i=0; isecond; - eversion_t v = peer_missing[peer].rmissing.begin()->first; - - push(oid, peer); - - // do other peers need it too? - for (i++; iget_source() << " " << *reply << dendl; - - int peer = reply->get_source().num(); - object_t oid = reply->get_oid(); - - if (pushing.count(oid) && - pushing[oid].count(peer)) { - pushing[oid].erase(peer); - - if (peer_missing.count(peer) == 0 || - peer_missing[peer].num_missing() == 0) - uptodate_set.insert(peer); - - if (pushing[oid].empty()) { - dout(10) << "pushed " << oid << " to all replicas" << dendl; - do_peer_recovery(); - } else { - dout(10) << "pushed " << oid << ", still waiting for push ack from " - << pushing[oid] << dendl; - } - } else { - dout(10) << "huh, i wasn't pushing " << oid << dendl; - } - delete reply; -} - -void ReplicatedPG::purge_strays() -{ - dout(10) << "purge_strays " << stray_set << dendl; - - for (set::iterator p = stray_set.begin(); - p != stray_set.end(); - p++) { - dout(10) << "sending PGRemove to osd" << *p << dendl; - set ls; - ls.insert(info.pgid); - MOSDPGRemove *m = new MOSDPGRemove(osd->osdmap->get_epoch(), ls); - osd->messenger->send_message(m, osd->osdmap->get_inst(*p)); - } - - stray_set.clear(); -} - diff --git a/branches/sage/mds/osd/ReplicatedPG.h b/branches/sage/mds/osd/ReplicatedPG.h deleted file mode 100644 index ab44026b43fb2..0000000000000 --- a/branches/sage/mds/osd/ReplicatedPG.h +++ /dev/null @@ -1,170 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __REPLICATEDPG_H -#define __REPLICATEDPG_H - - -#include "PG.h" - -#include "messages/MOSDOp.h" - - -class ReplicatedPG : public PG { -public: - /* - * gather state on the primary/head while replicating an osd op. - */ - class RepGather { - public: - class MOSDOp *op; - tid_t rep_tid; - - ObjectStore::Transaction t; - bool applied; - - set waitfor_ack; - set waitfor_commit; - - utime_t start; - - bool sent_ack, sent_commit; - - set osds; - eversion_t new_version; - - eversion_t pg_local_last_complete; - map pg_complete_thru; - - RepGather(MOSDOp *o, tid_t rt, eversion_t nv, eversion_t lc) : - op(o), rep_tid(rt), - applied(false), - sent_ack(false), sent_commit(false), - new_version(nv), - pg_local_last_complete(lc) { } - - bool can_send_ack() { - return !sent_ack && !sent_commit && - waitfor_ack.empty(); - } - bool can_send_commit() { - return !sent_commit && - waitfor_ack.empty() && waitfor_commit.empty(); - } - bool can_delete() { - return waitfor_ack.empty() && waitfor_commit.empty(); - } - }; - -protected: - // replica ops - // [primary|tail] - hash_map rep_gather; - hash_map > waiting_for_repop; - - // load balancing - set balancing_reads; - set unbalancing_reads; - hash_map > waiting_for_unbalanced_reads; // i.e. 
primary-lock - - void get_rep_gather(RepGather*); - void apply_repop(RepGather *repop); - void put_rep_gather(RepGather*); - void issue_repop(MOSDOp *op, int osd, utime_t now); - RepGather *new_rep_gather(MOSDOp *op); - void repop_ack(RepGather *repop, - int result, bool commit, - int fromosd, eversion_t pg_complete_thru=0); - - // push/pull - int num_pulling; - map > pushing; - - void push(object_t oid, int dest); - void pull(object_t oid); - - // modify - objectrev_t assign_version(MOSDOp *op); - void op_modify_commit(tid_t rep_tid, eversion_t pg_complete_thru); - void op_rep_modify_commit(MOSDOp *op, int ackerosd, eversion_t last_complete); - - void prepare_log_transaction(ObjectStore::Transaction& t, - MOSDOp *op, eversion_t& version, - objectrev_t crev, objectrev_t rev, - eversion_t trim_to); - void prepare_op_transaction(ObjectStore::Transaction& t, - MOSDOp *op, eversion_t& version, - objectrev_t crev, objectrev_t rev); - - friend class C_OSD_ModifyCommit; - friend class C_OSD_RepModifyCommit; - - - // pg on-disk content - void clean_up_local(ObjectStore::Transaction& t); - - void cancel_recovery(); - bool do_recovery(); - void do_peer_recovery(); - - void purge_strays(); - - - void op_read(MOSDOp *op); - void op_modify(MOSDOp *op); - void op_rep_modify(MOSDOp *op); - void op_push(MOSDOp *op); - void op_pull(MOSDOp *op); - - void op_push_reply(MOSDOpReply *reply); - - -public: - ReplicatedPG(OSD *o, pg_t p) : - PG(o,p), - num_pulling(0) - { } - ~ReplicatedPG() {} - - bool preprocess_op(MOSDOp *op, utime_t now); - void do_op(MOSDOp *op); - void do_op_reply(MOSDOpReply *r); - - bool same_for_read_since(epoch_t e); - bool same_for_modify_since(epoch_t e); - bool same_for_rep_modify_since(epoch_t e); - - bool is_missing_object(object_t oid); - void wait_for_missing_object(object_t oid, MOSDOp *op); - - void note_failed_osd(int o); - void on_acker_change(); - void on_role_change(); - -}; - - -inline ostream& operator<<(ostream& out, ReplicatedPG::RepGather& 
repop) -{ - out << "repgather(" << &repop << " rep_tid=" << repop.rep_tid - << " wfack=" << repop.waitfor_ack - << " wfcommit=" << repop.waitfor_commit; - out << " pct=" << repop.pg_complete_thru; - out << " op=" << *(repop.op); - out << " repop=" << &repop; - out << ")"; - return out; -} - - -#endif diff --git a/branches/sage/mds/osd/osd_types.h b/branches/sage/mds/osd/osd_types.h deleted file mode 100644 index 24dd9eca74234..0000000000000 --- a/branches/sage/mds/osd/osd_types.h +++ /dev/null @@ -1,321 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __OSD_TYPES_H -#define __OSD_TYPES_H - -#include "msg/msg_types.h" -#include "include/types.h" - -/* osdreqid_t - caller name + incarnation# + tid to unique identify this request - * use for metadata and osd ops. - */ -class osdreqid_t { -public: - entity_name_t name; // who - int32_t inc; // incarnation - tid_t tid; - osdreqid_t() : inc(0), tid(0) {} - osdreqid_t(const entity_name_t& a, int i, tid_t t) : name(a), inc(i), tid(t) {} -}; - -inline ostream& operator<<(ostream& out, const osdreqid_t& r) { - return out << r.name << "." 
<< r.inc << ":" << r.tid; -} - -inline bool operator==(const osdreqid_t& l, const osdreqid_t& r) { - return (l.name == r.name) && (l.inc == r.inc) && (l.tid == r.tid); -} -inline bool operator!=(const osdreqid_t& l, const osdreqid_t& r) { - return (l.name != r.name) || (l.inc != r.inc) || (l.tid != r.tid); -} -inline bool operator<(const osdreqid_t& l, const osdreqid_t& r) { - return (l.name < r.name) || (l.inc < r.inc) || - (l.name == r.name && l.inc == r.inc && l.tid < r.tid); -} -inline bool operator<=(const osdreqid_t& l, const osdreqid_t& r) { - return (l.name < r.name) || (l.inc < r.inc) || - (l.name == r.name && l.inc == r.inc && l.tid <= r.tid); -} -inline bool operator>(const osdreqid_t& l, const osdreqid_t& r) { return !(l <= r); } -inline bool operator>=(const osdreqid_t& l, const osdreqid_t& r) { return !(l < r); } - -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const osdreqid_t &r) const { - static blobhash H; - return H((const char*)&r, sizeof(r)); - } - }; -} - - - -// osd types -typedef uint64_t coll_t; // collection id - -// pg stuff - -#define PG_INO 4 // this should match mds/mdstypes.h MDS_INO_PG - -typedef uint16_t ps_t; -typedef uint8_t pruleset_t; - - -// crush rule ids -#define CRUSH_REP_RULE(nrep) (100+nrep) // replication -#define CRUSH_RAID_RULE(num) (200+num) // raid - - - -// placement group id -struct pg_t { -public: - static const int TYPE_REP = CEPH_PG_TYPE_REP; - static const int TYPE_RAID4 = CEPH_PG_TYPE_RAID4; - -private: - union ceph_pg u; - -public: - pg_t() { u.pg64 = 0; } - pg_t(const pg_t& o) { u.pg64 = o.u.pg64; } - pg_t(int type, int size, ps_t seed, int pref) {//, pruleset_t r=0) { - u.pg.type = type; - u.pg.size = size; - u.pg.ps = seed; - u.pg.preferred = pref; // hack: avoid negative. 
- //u.pg.ruleset = r; - assert(sizeof(u.pg) == sizeof(u.pg64)); - } - pg_t(uint64_t v) { u.pg64 = v; } - - int type() { return u.pg.type; } - bool is_rep() { return type() == TYPE_REP; } - bool is_raid4() { return type() == TYPE_RAID4; } - - int size() { return u.pg.size; } - ps_t ps() { return u.pg.ps; } - //pruleset_t ruleset() { return u.pg.ruleset; } - int preferred() { return u.pg.preferred; } // hack: avoid negative. - - /* - pg_t operator=(uint64_t v) { u.val = v; return *this; } - pg_t operator&=(uint64_t v) { u.val &= v; return *this; } - pg_t operator+=(pg_t o) { u.val += o.val; return *this; } - pg_t operator-=(pg_t o) { u.val -= o.val; return *this; } - pg_t operator++() { ++u.val; return *this; } - */ - operator uint64_t() const { return u.pg64; } - - object_t to_object() const { return object_t(PG_INO, u.pg64 >> 32, u.pg64 & 0xffffffff); } -}; - -inline ostream& operator<<(ostream& out, pg_t pg) -{ - if (pg.is_rep()) - out << pg.size() << 'x'; - else if (pg.is_raid4()) - out << pg.size() << 'r'; - else - out << pg.size() << '?'; - - //if (pg.ruleset()) - //out << (int)pg.ruleset() << 's'; - - out << hex << pg.ps() << dec; - - if (pg.preferred() >= 0) - out << 'p' << pg.preferred(); - - //out << "=" << hex << (__uint64_t)pg << dec; - return out; -} - -namespace __gnu_cxx { - template<> struct hash< pg_t > - { - size_t operator()( const pg_t& x ) const - { - static rjhash H; - return H(x); - } - }; -} - - - - - -/** ObjectLayout - * - * describes an object's placement and layout in the storage cluster. - * most importatly, which pg it belongs to. - * if that pg is raided, it also specifies the object's stripe_unit. 
- */ -struct ObjectLayout { - pg_t pgid; // what pg do i belong to - int32_t stripe_unit; // for object raid in raid pgs - - ObjectLayout() : pgid(0), stripe_unit(0) { } - ObjectLayout(pg_t p, int su=0) : pgid(p), stripe_unit(su) { } -}; - -inline ostream& operator<<(ostream& out, const ObjectLayout &ol) -{ - out << "pg" << ol.pgid; - if (ol.stripe_unit) - out << ".su=" << ol.stripe_unit; - return out; -} - - - -// compound rados version type -class eversion_t { -public: - epoch_t epoch; - version_t version; - eversion_t(epoch_t e=0, version_t v=0) : epoch(e), version(v) {} -}; - -inline bool operator==(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) && (l.version == r.version); -} -inline bool operator!=(const eversion_t& l, const eversion_t& r) { - return (l.epoch != r.epoch) || (l.version != r.version); -} -inline bool operator<(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version < r.version):(l.epoch < r.epoch); -} -inline bool operator<=(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version <= r.version):(l.epoch <= r.epoch); -} -inline bool operator>(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version > r.version):(l.epoch > r.epoch); -} -inline bool operator>=(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version >= r.version):(l.epoch >= r.epoch); -} -inline ostream& operator<<(ostream& out, const eversion_t e) { - return out << e.epoch << "'" << e.version; -} - - - -/** osd_stat - * aggregate stats for an osd - */ -struct osd_stat_t { - int64_t num_blocks; - int64_t num_blocks_avail; - int64_t num_objects; - - osd_stat_t() : num_blocks(0), num_blocks_avail(0), num_objects(0) {} -}; - - -/** pg_stat - * aggregate stats for a single PG. 
- */ -struct pg_stat_t { - eversion_t reported; - - int32_t state; - int64_t size; // in bytes - int64_t num_blocks; // in 4k blocks - int64_t num_objects; - - pg_stat_t() : reported(0), state(0), size(0), num_blocks(0), num_objects(0) {} -}; - - - -struct osd_peer_stat_t { - utime_t stamp; - double oprate; - double qlen; - double recent_qlen; - double read_latency; - double read_latency_mine; - double frac_rd_ops_shed_in; - double frac_rd_ops_shed_out; - osd_peer_stat_t() : oprate(0), qlen(0), recent_qlen(0), - read_latency(0), read_latency_mine(0), - frac_rd_ops_shed_in(0), frac_rd_ops_shed_out(0) {} -}; - -inline ostream& operator<<(ostream& out, const osd_peer_stat_t &stat) { - return out << "stat(" << stat.stamp - //<< " oprate=" << stat.oprate - // << " qlen=" << stat.qlen - // << " recent_qlen=" << stat.recent_qlen - << " rdlat=" << stat.read_latency_mine << " / " << stat.read_latency - << " fshedin=" << stat.frac_rd_ops_shed_in - << ")"; -} - -// ----------------------------------------- - -class ObjectExtent { - public: - object_t oid; // object id - off_t start; // in object - size_t length; // in object - - ObjectLayout layout; // object layout (pgid, etc.) - - map buffer_extents; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!) - - ObjectExtent() : start(0), length(0) {} - ObjectExtent(object_t o, off_t s=0, size_t l=0) : oid(o), start(s), length(l) { } -}; - -inline ostream& operator<<(ostream& out, ObjectExtent &ex) -{ - return out << "extent(" - << ex.oid << " in " << ex.layout - << " " << ex.start << "~" << ex.length - << ")"; -} - - - -// --------------------------------------- - -class OSDSuperblock { -public: - const static uint64_t MAGIC = 0xeb0f505dULL; - uint64_t magic; - uint64_t fsid; // unique fs id (random number) - int32_t whoami; // my role in this fs. - epoch_t current_epoch; // most recent epoch - epoch_t oldest_map, newest_map; // oldest/newest maps we have. 
- double weight; - OSDSuperblock(uint64_t f=0, int w=0) : - magic(MAGIC), fsid(f), whoami(w), - current_epoch(0), oldest_map(0), newest_map(0), weight(0) {} -}; - -inline ostream& operator<<(ostream& out, OSDSuperblock& sb) -{ - return out << "sb(fsid " << sb.fsid - << " osd" << sb.whoami - << " e" << sb.current_epoch - << " [" << sb.oldest_map << "," << sb.newest_map - << "])"; -} - - -#endif diff --git a/branches/sage/mds/osdc/Blinker.h b/branches/sage/mds/osdc/Blinker.h deleted file mode 100644 index e59c9629725ce..0000000000000 --- a/branches/sage/mds/osdc/Blinker.h +++ /dev/null @@ -1,92 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __BLINKER_H -#define __BLINKER_H - -class Blinker { - - public: - - class Op { - int op; - static const int LOOKUP = 1; - static const int INSERT = 2; - static const int REMOVE = 3; - static const int CLEAR = 4; - Op(int o) : op(o) {} - }; - - class OpLookup : public Op { - public: - bufferptr key; - OpLookup(bufferptr& k) : Op(Op::LOOKUP), key(k) {} - }; - - class OpInsert : public Op { - bufferptr key; - bufferlist val; - OpInsert(bufferptr& k, bufferlist& v) : Op(Op::INSERT), key(k), val(v) {} - }; - - class OpRemove : public Op { - public: - bufferptr key; - OpRemove(bufferptr& k) : Op(Op::REMOVE), key(k) {} - }; - - class OpClear : public Op { - public: - OpClear() : Op(Op::CLEAR) {} - }; - - - -private: - Objecter *objecter; - - // in-flight operations. - - - // cache information about tree structure. 
- - - -public: - // public interface - - // simple accessors - void lookup(inode_t& inode, bufferptr& key, bufferlist *pval, Context *onfinish); - - // simple modifiers - void insert(inode_t& inode, bufferptr& key, bufferlist& val, Context *onack, Context *onsafe); - void remove(inode_t& inode, bufferptr& key, Context *onack, Context *onsafe); - void clear(inode_t& inode, Context *onack, Context *onsafe); - - // these are dangerous: the table may be large. - void listkeys(inode_t& inode, list* pkeys, Context *onfinish); - void listvals(inode_t& inode, list* pkeys, list* pvals, Context *onfinish); - - // fetch *at least* key, but also anything else that is convenient. - // include lexical bounds for which this is a complete result. - // (if *start and *end are empty, it's the entire table) - void prefetch(inode_t& inode, bufferptr& key, - list* pkeys, list* pvals, - bufferptr *start, bufferptr *end, - Context *onfinish); - - -}; - -#endif diff --git a/branches/sage/mds/osdc/Filer.cc b/branches/sage/mds/osdc/Filer.cc deleted file mode 100644 index 193089d3915b1..0000000000000 --- a/branches/sage/mds/osdc/Filer.cc +++ /dev/null @@ -1,235 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#include - -#include "Filer.h" -#include "osd/OSDMap.h" - -//#include "messages/MOSDRead.h" -//#include "messages/MOSDReadReply.h" -//#include "messages/MOSDWrite.h" -//#include "messages/MOSDWriteReply.h" -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" -#include "messages/MOSDMap.h" - -#include "msg/Messenger.h" - -#include "include/Context.h" - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_filer) *_dout << dbeginl << g_clock.now() << " " << objecter->messenger->get_myname() << ".filer " - - -class Filer::C_Probe : public Context { -public: - Filer *filer; - Probe *probe; - object_t oid; - off_t size; - C_Probe(Filer *f, Probe *p, object_t o) : filer(f), probe(p), oid(o), size(0) {} - void finish(int r) { - filer->_probed(probe, oid, size); - } -}; - -int Filer::probe_fwd(inode_t& inode, - off_t start_from, - off_t *end, - Context *onfinish) -{ - dout(10) << "probe_fwd " << hex << inode.ino << dec << " starting from " << start_from << dendl; - - Probe *probe = new Probe(inode, start_from, end, onfinish); - - // period (bytes before we jump unto a new set of object(s)) - off_t period = ceph_file_layout_period(inode.layout); - - // start with 1+ periods. 
- probe->probing_len = period; - if (start_from % period) - probe->probing_len += period - (start_from % period); - - _probe(probe); - return 0; -} - -void Filer::_probe(Probe *probe) -{ - dout(10) << "_probe " << hex << probe->inode.ino << dec << " " << probe->from << "~" << probe->probing_len << dendl; - - // map range onto objects - file_to_extents(probe->inode, probe->from, probe->probing_len, probe->probing); - - for (list::iterator p = probe->probing.begin(); - p != probe->probing.end(); - p++) { - dout(10) << "_probe probing " << p->oid << dendl; - C_Probe *c = new C_Probe(this, probe, p->oid); - probe->ops[p->oid] = objecter->stat(p->oid, &c->size, p->layout, c); - } -} - -void Filer::_probed(Probe *probe, object_t oid, off_t size) -{ - dout(10) << "_probed " << probe->inode.ino << " object " << hex << oid << dec << " has size " << size << dendl; - - probe->known[oid] = size; - assert(probe->ops.count(oid)); - probe->ops.erase(oid); - - if (!probe->ops.empty()) - return; // waiting for more! - - // analyze! - off_t end = 0; - for (list::iterator p = probe->probing.begin(); - p != probe->probing.end(); - p++) { - off_t shouldbe = p->length+p->start; - dout(10) << "_probed " << probe->inode.ino << " object " << hex << p->oid << dec - << " should be " << shouldbe - << ", actual is " << probe->known[p->oid] - << dendl; - - if (probe->known[p->oid] < 0) { end = -1; break; } // error! - - assert(probe->known[p->oid] <= shouldbe); - if (shouldbe == probe->known[p->oid]) continue; // keep going - - // aha, we found the end! - // calc offset into buffer_extent to get distance from probe->from. 
- off_t oleft = probe->known[p->oid] - p->start; - for (map::iterator i = p->buffer_extents.begin(); - i != p->buffer_extents.end(); - i++) { - if (oleft <= (off_t)i->second) { - end = probe->from + i->first + oleft; - dout(10) << "_probed end is in buffer_extent " << i->first << "~" << i->second << " off " << oleft - << ", from was " << probe->from << ", end is " << end - << dendl; - break; - } - oleft -= i->second; - } - break; - } - - if (end == 0) { - // keep probing! - dout(10) << "_probed didn't find end, probing further" << dendl; - off_t period = probe->inode.layout.fl_object_size * probe->inode.layout.fl_stripe_count; - probe->from += probe->probing_len; - probe->probing_len = period; - _probe(probe); - return; - } - - if (end < 0) { - dout(10) << "_probed encountered an error while probing" << dendl; - *probe->end = -1; - } else { - // hooray! - dout(10) << "_probed found end at " << end << dendl; - *probe->end = end; - } - - // done! finish and clean up. - probe->onfinish->finish(end > 0 ? 0:-1); - delete probe->onfinish; - delete probe; -} - - -void Filer::file_to_extents(inode_t inode, - off_t offset, size_t len, - list& extents, - objectrev_t rev) -{ - dout(10) << "file_to_extents " << offset << "~" << len - << " on " << hex << inode.ino << dec - << dendl; - - /* we want only one extent per object! - * this means that each extent we read may map into different bits of the - * final read buffer.. 
hence OSDExtent.buffer_extents - */ - map< object_t, ObjectExtent > object_extents; - - assert(inode.layout.fl_object_size >= inode.layout.fl_stripe_unit); - off_t stripes_per_object = inode.layout.fl_object_size / inode.layout.fl_stripe_unit; - dout(20) << " stripes_per_object " << stripes_per_object << dendl; - - off_t cur = offset; - off_t left = len; - while (left > 0) { - // layout into objects - off_t blockno = cur / inode.layout.fl_stripe_unit; // which block - off_t stripeno = blockno / inode.layout.fl_stripe_count; // which horizontal stripe (Y) - off_t stripepos = blockno % inode.layout.fl_stripe_count; // which object in the object set (X) - off_t objectsetno = stripeno / stripes_per_object; // which object set - off_t objectno = objectsetno * inode.layout.fl_stripe_count + stripepos; // object id - - // find oid, extent - ObjectExtent *ex = 0; - object_t oid( inode.ino, objectno, rev ); - if (object_extents.count(oid)) - ex = &object_extents[oid]; - else { - ex = &object_extents[oid]; - ex->oid = oid; - ex->layout = objecter->osdmap->file_to_object_layout( oid, inode.layout ); - } - - // map range into object - off_t block_start = (stripeno % stripes_per_object)*inode.layout.fl_stripe_unit; - off_t block_off = cur % inode.layout.fl_stripe_unit; - off_t max = inode.layout.fl_stripe_unit - block_off; - - off_t x_offset = block_start + block_off; - off_t x_len; - if (left > max) - x_len = max; - else - x_len = left; - - if (ex->start + (off_t)ex->length == x_offset) { - // add to extent - ex->length += x_len; - } else { - // new extent - assert(ex->length == 0); - assert(ex->start == 0); - ex->start = x_offset; - ex->length = x_len; - } - ex->buffer_extents[cur-offset] = x_len; - - dout(15) << "file_to_extents " << *ex << " in " << ex->layout << dendl; - //dout(0) << "map: ino " << ino << " oid " << ex.oid << " osd " << ex.osd << " offset " << ex.offset << " len " << ex.len << " ... 
left " << left << dendl; - - left -= x_len; - cur += x_len; - } - - // make final list - for (map::iterator it = object_extents.begin(); - it != object_extents.end(); - it++) { - extents.push_back(it->second); - } -} diff --git a/branches/sage/mds/osdc/Filer.h b/branches/sage/mds/osdc/Filer.h deleted file mode 100644 index 0679a9b6ffef3..0000000000000 --- a/branches/sage/mds/osdc/Filer.h +++ /dev/null @@ -1,165 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __FILER_H -#define __FILER_H - -/*** Filer - * - * stripe file ranges onto objects. - * build list for the objecter or objectcacher. - * - * also, provide convenience methods that call objecter for you. - * - * "files" are identified by ino. 
- */ - -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "include/types.h" - -#include "osd/OSDMap.h" -#include "Objecter.h" - -class Context; -class Messenger; -class OSDMap; - - -/**** Filer interface ***/ - -class Filer { - Objecter *objecter; - - // probes - struct Probe { - inode_t inode; - off_t from; - off_t *end; - Context *onfinish; - - list probing; - off_t probing_len; - - map known; - map ops; - - Probe(inode_t &i, off_t f, off_t *e, Context *c) : - inode(i), from(f), end(e), onfinish(c), probing_len(0) {} - }; - - class C_Probe; - //friend class C_Probe; - - void _probe(Probe *p); - void _probed(Probe *p, object_t oid, off_t size); - - public: - Filer(Objecter *o) : objecter(o) {} - ~Filer() {} - - bool is_active() { - return objecter->is_active(); // || (oc && oc->is_active()); - } - - /*** async file interface ***/ - Objecter::OSDRead *prepare_read(inode_t& inode, - off_t offset, - size_t len, - bufferlist *bl) { - Objecter::OSDRead *rd = new Objecter::OSDRead(bl); - file_to_extents(inode, offset, len, rd->extents); - return rd; - } - int read(inode_t& inode, - off_t offset, - size_t len, - bufferlist *bl, // ptr to data - Context *onfinish) { - Objecter::OSDRead *rd = prepare_read(inode, offset, len, bl); - return objecter->readx(rd, onfinish) > 0 ? 0:-1; - } - - int write(inode_t& inode, - off_t offset, - size_t len, - bufferlist& bl, - int flags, - Context *onack, - Context *oncommit, - objectrev_t rev=0) { - Objecter::OSDWrite *wr = new Objecter::OSDWrite(bl); - file_to_extents(inode, offset, len, wr->extents, rev); - return objecter->modifyx(wr, onack, oncommit) > 0 ? 0:-1; - } - - int zero(inode_t& inode, - off_t offset, - size_t len, - Context *onack, - Context *oncommit) { - Objecter::OSDModify *z = new Objecter::OSDModify(OSD_OP_ZERO); - file_to_extents(inode, offset, len, z->extents); - return objecter->modifyx(z, onack, oncommit) > 0 ? 
0:-1; - } - - int remove(inode_t& inode, - off_t offset, - size_t len, - Context *onack, - Context *oncommit) { - Objecter::OSDModify *z = new Objecter::OSDModify(OSD_OP_DELETE); - file_to_extents(inode, offset, len, z->extents); - return objecter->modifyx(z, onack, oncommit) > 0 ? 0:-1; - } - - int probe_fwd(inode_t& inode, - off_t start_from, - off_t *end, - Context *onfinish); - - - /***** mapping *****/ - - /* map (ino, ono) to an object name - (to be used on any osd in the proper replica group) */ - /*object_t file_to_object(inodeno_t ino, - size_t _ono) { - uint64_t ono = _ono; - assert(ino < (1ULL<& extents, - objectrev_t rev=0); - -}; - - - -#endif diff --git a/branches/sage/mds/osdc/Journaler.cc b/branches/sage/mds/osdc/Journaler.cc deleted file mode 100644 index 363b7c60de9aa..0000000000000 --- a/branches/sage/mds/osdc/Journaler.cc +++ /dev/null @@ -1,666 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "Journaler.h" - -#include "include/Context.h" -#include "common/Logger.h" -#include "msg/Messenger.h" - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_journaler) *_dout << dbeginl << g_clock.now() << " " << objecter->messenger->get_myname() << ".journaler " -#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_journaler) *_derr << dbeginl << g_clock.now() << " " << objecter->messenger->get_myname() << ".journaler " - - - -void Journaler::reset() -{ - dout(1) << "reset to blank journal" << dendl; - state = STATE_ACTIVE; - write_pos = flush_pos = ack_pos = - read_pos = requested_pos = received_pos = - expire_pos = trimming_pos = trimmed_pos = ceph_file_layout_period(inode.layout); -} - - -/***************** HEADER *******************/ - -ostream& operator<<(ostream& out, Journaler::Header &h) -{ - return out << "loghead(trim " << h.trimmed_pos - << ", expire " << h.expire_pos - << ", read " << h.read_pos - << ", write " << h.write_pos - << ")"; -} - -class Journaler::C_ReadHead : public Context { - Journaler *ls; -public: - bufferlist bl; - C_ReadHead(Journaler *l) : ls(l) {} - void finish(int r) { - ls->_finish_read_head(r, bl); - } -}; - -class Journaler::C_ProbeEnd : public Context { - Journaler *ls; -public: - off_t end; - C_ProbeEnd(Journaler *l) : ls(l), end(-1) {} - void finish(int r) { - ls->_finish_probe_end(r, end); - } -}; - -void Journaler::recover(Context *onread) -{ - assert(state != STATE_ACTIVE); - - if (onread) - waitfor_recover.push_back(onread); - - if (state != STATE_UNDEF) { - dout(1) << "recover - already recoverying" << dendl; - return; - } - - dout(1) << "read_head" << dendl; - state = STATE_READHEAD; - C_ReadHead *fin = new C_ReadHead(this); - filer.read(inode, 0, sizeof(Header), &fin->bl, fin); -} - -void Journaler::_finish_read_head(int r, bufferlist& bl) -{ - assert(state == STATE_READHEAD); - - if (bl.length() == 0) { - dout(1) << "_finish_read_head r=" << r << " read 0 bytes, 
assuming empty log" << dendl; - state = STATE_ACTIVE; - list ls; - ls.swap(waitfor_recover); - finish_contexts(ls, 0); - return; - } - - // unpack header - Header h; - assert(bl.length() == sizeof(h)); - bl.copy(0, sizeof(h), (char*)&h); - - write_pos = flush_pos = ack_pos = h.write_pos; - read_pos = requested_pos = received_pos = h.read_pos; - expire_pos = h.expire_pos; - trimmed_pos = trimming_pos = h.trimmed_pos; - - dout(1) << "_finish_read_head " << h << ". probing for end of log (from " << write_pos << ")..." << dendl; - - // probe the log - state = STATE_PROBING; - C_ProbeEnd *fin = new C_ProbeEnd(this); - filer.probe_fwd(inode, h.write_pos, &fin->end, fin); -} - -void Journaler::_finish_probe_end(int r, off_t end) -{ - assert(state == STATE_PROBING); - - if (end == -1) { - end = write_pos; - dout(1) << "_finish_probe_end write_pos = " << end - << " (header had " << write_pos << "). log was empty. recovered." - << dendl; - assert(0); // hrm. - } else { - assert(end >= write_pos); - assert(r >= 0); - dout(1) << "_finish_probe_end write_pos = " << end - << " (header had " << write_pos << "). recovered." - << dendl; - } - - write_pos = flush_pos = ack_pos = end; - - // done. 
- list ls; - ls.swap(waitfor_recover); - finish_contexts(ls, 0); -} - - -// WRITING - -class Journaler::C_WriteHead : public Context { -public: - Journaler *ls; - Header h; - Context *oncommit; - C_WriteHead(Journaler *l, Header& h_, Context *c) : ls(l), h(h_), oncommit(c) {} - void finish(int r) { - ls->_finish_write_head(h, oncommit); - } -}; - -void Journaler::write_head(Context *oncommit) -{ - assert(state == STATE_ACTIVE); - last_written.trimmed_pos = trimmed_pos; - last_written.expire_pos = expire_pos; - last_written.read_pos = read_pos; - last_written.write_pos = ack_pos; //write_pos; - dout(10) << "write_head " << last_written << dendl; - - last_wrote_head = g_clock.now(); - - bufferlist bl; - bl.append((char*)&last_written, sizeof(last_written)); - filer.write(inode, 0, bl.length(), bl, 0, - 0, - new C_WriteHead(this, last_written, oncommit)); -} - -void Journaler::_finish_write_head(Header &wrote, Context *oncommit) -{ - dout(10) << "_finish_write_head " << wrote << dendl; - last_committed = wrote; - if (oncommit) { - oncommit->finish(0); - delete oncommit; - } - - trim(); // trim? -} - - -/***************** WRITING *******************/ - -class Journaler::C_Flush : public Context { - Journaler *ls; - off_t start; -public: - C_Flush(Journaler *l, off_t s) : ls(l), start(s) {} - void finish(int r) { ls->_finish_flush(r, start); } -}; - -void Journaler::_finish_flush(int r, off_t start) -{ - assert(r>=0); - - assert(start >= ack_pos); - assert(start < flush_pos); - assert(pending_flush.count(start)); - - // calc latency? 
- if (logger) { - utime_t lat = g_clock.now(); - lat -= pending_flush[start]; - logger->favg("jlat", lat); - } - - pending_flush.erase(start); - - // adjust ack_pos - if (pending_flush.empty()) - ack_pos = flush_pos; - else - ack_pos = pending_flush.begin()->first; - - dout(10) << "_finish_flush from " << start - << ", pending_flush now " << pending_flush - << ", write positions now " << write_pos << "/" << flush_pos << "/" << ack_pos - << dendl; - - // kick waiters <= ack_pos - while (!waitfor_flush.empty()) { - if (waitfor_flush.begin()->first > ack_pos) break; - finish_contexts(waitfor_flush.begin()->second); - waitfor_flush.erase(waitfor_flush.begin()); - } -} - - -off_t Journaler::append_entry(bufferlist& bl, Context *onsync) -{ - uint32_t s = bl.length(); - - if (!g_conf.journaler_allow_split_entries) { - // will we span a stripe boundary? - int p = inode.layout.fl_stripe_unit; - if (write_pos / p != (write_pos + (off_t)(bl.length() + sizeof(s))) / p) { - // yes. - // move write_pos forward. - off_t owp = write_pos; - write_pos += p; - write_pos -= (write_pos % p); - - // pad with zeros. - bufferptr bp(write_pos - owp); - bp.zero(); - assert(bp.length() >= 4); - write_buf.push_back(bp); - - // now flush. - flush(); - - dout(12) << "append_entry skipped " << (write_pos-owp) << " bytes to " << write_pos << " to avoid spanning stripe boundary" << dendl; - } - } - - dout(10) << "append_entry len " << bl.length() << " to " << write_pos << "~" << (bl.length() + sizeof(uint32_t)) << dendl; - - // cache? - // NOTE: this is a dumb thing to do; this is used for a benchmarking - // purposes only. 
- if (g_conf.journaler_cache && - write_pos == read_pos + read_buf.length()) { - dout(10) << "append_entry caching in read_buf too" << dendl; - assert(requested_pos == received_pos); - assert(requested_pos == read_pos + read_buf.length()); - read_buf.append((char*)&s, sizeof(s)); - read_buf.append(bl); - requested_pos = received_pos = write_pos + sizeof(s) + s; - } - - // append - write_buf.append((char*)&s, sizeof(s)); - write_buf.claim_append(bl); - write_pos += sizeof(s) + s; - - // flush now? - if (onsync) - flush(onsync); - - return write_pos; -} - - -void Journaler::_do_flush() -{ - if (write_pos == flush_pos) return; - assert(write_pos > flush_pos); - - // flush - unsigned len = write_pos - flush_pos; - assert(len == write_buf.length()); - dout(10) << "_do_flush flushing " << flush_pos << "~" << len << dendl; - - // submit write for anything pending - // flush _start_ pos to _finish_flush - filer.write(inode, flush_pos, len, write_buf, 0, - g_conf.journaler_safe ? 0:new C_Flush(this, flush_pos), // on ACK - g_conf.journaler_safe ? new C_Flush(this, flush_pos):0); // on COMMIT - pending_flush[flush_pos] = g_clock.now(); - - // adjust pointers - flush_pos = write_pos; - write_buf.clear(); - - dout(10) << "_do_flush write pointers now at " << write_pos << "/" << flush_pos << "/" << ack_pos << dendl; -} - - - -void Journaler::flush(Context *onsync) -{ - // all flushed and acked? - if (write_pos == ack_pos) { - assert(write_buf.length() == 0); - dout(10) << "flush nothing to flush, write pointers at " << write_pos << "/" << flush_pos << "/" << ack_pos << dendl; - if (onsync) { - onsync->finish(0); - delete onsync; - } - return; - } - - if (write_pos == flush_pos) { - assert(write_buf.length() == 0); - dout(10) << "flush nothing to flush, write pointers at " << write_pos << "/" << flush_pos << "/" << ack_pos << dendl; - } else { - if (1) { - // maybe buffer - if (write_buf.length() < g_conf.journaler_batch_max) { - // delay! schedule an event. 
- dout(20) << "flush delaying flush" << dendl; - if (delay_flush_event) timer.cancel_event(delay_flush_event); - delay_flush_event = new C_DelayFlush(this); - timer.add_event_after(g_conf.journaler_batch_interval, delay_flush_event); - } else { - dout(20) << "flush not delaying flush" << dendl; - _do_flush(); - } - } else { - // always flush - _do_flush(); - } - } - - // queue waiter (at _new_ write_pos; will go when reached by ack_pos) - if (onsync) - waitfor_flush[write_pos].push_back(onsync); - - // write head? - if (last_wrote_head.sec() + g_conf.journaler_write_head_interval < g_clock.now().sec()) { - write_head(); - } -} - - - -/***************** READING *******************/ - - -class Journaler::C_Read : public Context { - Journaler *ls; -public: - C_Read(Journaler *l) : ls(l) {} - void finish(int r) { ls->_finish_read(r); } -}; - -class Journaler::C_RetryRead : public Context { - Journaler *ls; -public: - C_RetryRead(Journaler *l) : ls(l) {} - void finish(int r) { ls->is_readable(); } // this'll kickstart. -}; - -void Journaler::_finish_read(int r) -{ - assert(r>=0); - - dout(10) << "_finish_read got " << received_pos << "~" << reading_buf.length() << dendl; - received_pos += reading_buf.length(); - read_buf.claim_append(reading_buf); - assert(received_pos <= requested_pos); - dout(10) << "_finish_read read_buf now " << read_pos << "~" << read_buf.length() - << ", read pointers " << read_pos << "/" << received_pos << "/" << requested_pos - << dendl; - - if (is_readable()) { // NOTE: this check may read more - // readable! - dout(10) << "_finish_read now readable" << dendl; - if (on_readable) { - Context *f = on_readable; - on_readable = 0; - f->finish(0); - delete f; - } - - if (read_bl) { - bool r = try_read_entry(*read_bl); - assert(r); // this should have worked. - - // clear state - Context *f = on_read_finish; - on_read_finish = 0; - read_bl = 0; - - // do callback - f->finish(0); - delete f; - } - } - - // prefetch? 
- _prefetch(); -} - -/* NOTE: this could be slightly smarter... we could allow - * multiple reads to be in progress. e.g., if we prefetch, but - * then discover we need even more for an especially large entry. - * i don't think that circumstance will arise particularly often. - */ -void Journaler::_issue_read(off_t len) -{ - // make sure we're fully flushed - _do_flush(); - - if (_is_reading()) { - dout(10) << "_issue_read " << len << " waiting, already reading " - << received_pos << "~" << (requested_pos-received_pos) << dendl; - return; - } - assert(requested_pos == received_pos); - - // stuck at ack_pos? - assert(requested_pos <= ack_pos); - if (requested_pos == ack_pos) { - dout(10) << "_issue_read requested_pos = ack_pos = " << ack_pos << ", waiting" << dendl; - assert(write_pos > requested_pos); - if (flush_pos == ack_pos) - flush(); - assert(flush_pos > ack_pos); - waitfor_flush[flush_pos].push_back(new C_RetryRead(this)); - return; - } - - // don't read too much - if (requested_pos + len > ack_pos) { - len = ack_pos - requested_pos; - dout(10) << "_issue_read reading only up to ack_pos " << ack_pos << dendl; - } - - // go. - dout(10) << "_issue_read reading " << requested_pos << "~" << len - << ", read pointers " << read_pos << "/" << received_pos << "/" << (requested_pos+len) - << dendl; - - filer.read(inode, requested_pos, len, &reading_buf, - new C_Read(this)); - requested_pos += len; -} - -void Journaler::_prefetch() -{ - // prefetch? - off_t left = requested_pos - read_pos; - if (left <= prefetch_from && // should read more, - !_is_reading() && // and not reading anything right now - write_pos > requested_pos) { // there's something more to read... - dout(10) << "_prefetch only " << left << " < " << prefetch_from - << ", prefetching " << dendl; - _issue_read(fetch_len); - } -} - - -void Journaler::read_entry(bufferlist *bl, Context *onfinish) -{ - // only one read at a time! 
- assert(read_bl == 0); - assert(on_read_finish == 0); - - if (is_readable()) { - dout(10) << "read_entry at " << read_pos << ", read_buf is " - << read_pos << "~" << read_buf.length() - << ", readable now" << dendl; - - // nice, just do it now. - bool r = try_read_entry(*bl); - assert(r); - - // callback - onfinish->finish(0); - delete onfinish; - } else { - dout(10) << "read_entry at " << read_pos << ", read_buf is " - << read_pos << "~" << read_buf.length() - << ", not readable now" << dendl; - - bl->clear(); - - // set it up - read_bl = bl; - on_read_finish = onfinish; - - // is_readable() will have already initiated a read (if it was possible) - } -} - - -/* is_readable() - * return true if next entry is ready. - * kickstart read as necessary. - */ -bool Journaler::is_readable() -{ - // anything to read? - if (read_pos == write_pos) return false; - - // have enough for entry size? - uint32_t s = 0; - if (read_buf.length() >= sizeof(s)) - read_buf.copy(0, sizeof(s), (char*)&s); - - // entry and payload? - if (read_buf.length() >= sizeof(s) && - read_buf.length() >= sizeof(s) + s) - return true; // yep, next entry is ready. - - // darn it! - - // partial fragment at the end? - if (received_pos == write_pos) { - dout(10) << "is_readable() detected partial entry at tail, adjusting write_pos to " << read_pos << dendl; - write_pos = flush_pos = ack_pos = read_pos; - assert(write_buf.length() == 0); - - // truncate? - // FIXME: how much? - - return false; - } - - // start reading some more? - if (!_is_reading()) { - if (s) - fetch_len = MAX(fetch_len, (off_t)(sizeof(s)+s-read_buf.length())); - _issue_read(fetch_len); - } - - return false; -} - - -/* try_read_entry(bl) - * read entry into bl if it's ready. - * otherwise, do nothing. (well, we'll start fetching it for good measure.) - */ -bool Journaler::try_read_entry(bufferlist& bl) -{ - if (!is_readable()) { // this may start a read. 
- dout(10) << "try_read_entry at " << read_pos << " not readable" << dendl; - return false; - } - - uint32_t s; - assert(read_buf.length() >= sizeof(s)); - read_buf.copy(0, sizeof(s), (char*)&s); - assert(read_buf.length() >= sizeof(s) + s); - - dout(10) << "try_read_entry at " << read_pos << " reading " - << read_pos << "~" << (sizeof(s)+s) << dendl; - - // do it - assert(bl.length() == 0); - read_buf.splice(0, sizeof(s)); - read_buf.splice(0, s, &bl); - read_pos += sizeof(s) + s; - - // prefetch? - _prefetch(); - return true; -} - -void Journaler::wait_for_readable(Context *onreadable) -{ - dout(10) << "wait_for_readable at " << read_pos << " onreadable " << onreadable << dendl; - assert(!is_readable()); - assert(on_readable == 0); - on_readable = onreadable; -} - - - - -/***************** TRIMMING *******************/ - - -class Journaler::C_Trim : public Context { - Journaler *ls; - off_t to; -public: - C_Trim(Journaler *l, off_t t) : ls(l), to(t) {} - void finish(int r) { - ls->_trim_finish(r, to); - } -}; - -void Journaler::trim() -{ - off_t trim_to = last_committed.expire_pos; - trim_to -= trim_to % ceph_file_layout_period(inode.layout); - dout(10) << "trim last_commited head was " << last_committed - << ", can trim to " << trim_to - << dendl; - if (trim_to == 0 || trim_to == trimming_pos) { - dout(10) << "trim already trimmed/trimming to " - << trimmed_pos << "/" << trimming_pos << dendl; - return; - } - - if (trimming_pos > trimmed_pos) { - dout(10) << "trim already trimming atm, try again later. 
trimmed/trimming is " - << trimmed_pos << "/" << trimming_pos << dendl; - return; - } - - // trim - assert(trim_to <= write_pos); - assert(trim_to > trimming_pos); - dout(10) << "trim trimming to " << trim_to - << ", trimmed/trimming/expire are " - << trimmed_pos << "/" << trimming_pos << "/" << expire_pos - << dendl; - - filer.remove(inode, trimming_pos, trim_to-trimming_pos, - 0, new C_Trim(this, trim_to)); - trimming_pos = trim_to; -} - -void Journaler::_trim_finish(int r, off_t to) -{ - dout(10) << "_trim_finish trimmed_pos was " << trimmed_pos - << ", trimmed/trimming/expire now " - << to << "/" << trimming_pos << "/" << expire_pos - << dendl; - assert(r >= 0); - - assert(to <= trimming_pos); - assert(to > trimmed_pos); - trimmed_pos = to; - - // finishers? - while (!waitfor_trim.empty() && - waitfor_trim.begin()->first <= trimmed_pos) { - finish_contexts(waitfor_trim.begin()->second, 0); - waitfor_trim.erase(waitfor_trim.begin()); - } -} - - -// eof. diff --git a/branches/sage/mds/osdc/ObjectCacher.cc b/branches/sage/mds/osdc/ObjectCacher.cc deleted file mode 100644 index d5f347d3863cb..0000000000000 --- a/branches/sage/mds/osdc/ObjectCacher.cc +++ /dev/null @@ -1,1587 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "msg/Messenger.h" -#include "ObjectCacher.h" -#include "Objecter.h" - - - -/*** ObjectCacher::BufferHead ***/ - - -/*** ObjectCacher::Object ***/ - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_objectcacher) *_dout << dbeginl << g_clock.now() << " " << oc->objecter->messenger->get_myname() << ".objectcacher.object(" << oid << ") " - - -ObjectCacher::BufferHead *ObjectCacher::Object::split(BufferHead *left, off_t off) -{ - dout(20) << "split " << *left << " at " << off << dendl; - - // split off right - ObjectCacher::BufferHead *right = new BufferHead(this); - right->last_write_tid = left->last_write_tid; - right->set_state(left->get_state()); - - off_t newleftlen = 
off - left->start(); - right->set_start(off); - right->set_length(left->length() - newleftlen); - - // shorten left - oc->bh_stat_sub(left); - left->set_length(newleftlen); - oc->bh_stat_add(left); - - // add right - oc->bh_add(this, right); - - // split buffers too - bufferlist bl; - bl.claim(left->bl); - if (bl.length()) { - assert(bl.length() == (left->length() + right->length())); - right->bl.substr_of(bl, left->length(), right->length()); - left->bl.substr_of(bl, 0, left->length()); - } - - // move read waiters - if (!left->waitfor_read.empty()) { - map >::iterator o, p = left->waitfor_read.end(); - p--; - while (p != left->waitfor_read.begin()) { - if (p->first < right->start()) break; - dout(0) << "split moving waiters at byte " << p->first << " to right bh" << dendl; - right->waitfor_read[p->first].swap( p->second ); - o = p; - p--; - left->waitfor_read.erase(o); - } - } - - dout(20) << "split left is " << *left << dendl; - dout(20) << "split right is " << *right << dendl; - return right; -} - - -void ObjectCacher::Object::merge_left(BufferHead *left, BufferHead *right) -{ - assert(left->end() == right->start()); - assert(left->get_state() == right->get_state()); - - dout(10) << "merge_left " << *left << " + " << *right << dendl; - oc->bh_remove(this, right); - oc->bh_stat_sub(left); - left->set_length( left->length() + right->length()); - oc->bh_stat_add(left); - - // data - left->bl.claim_append(right->bl); - - // version - // note: this is sorta busted, but should only be used for dirty buffers - left->last_write_tid = MAX( left->last_write_tid, right->last_write_tid ); - left->last_write = MAX( left->last_write, right->last_write ); - - // waiters - for (map >::iterator p = right->waitfor_read.begin(); - p != right->waitfor_read.end(); - p++) - left->waitfor_read[p->first].splice( left->waitfor_read[p->first].begin(), - p->second ); - - // hose right - delete right; - - dout(10) << "merge_left result " << *left << dendl; -} - -void 
ObjectCacher::Object::try_merge_bh(BufferHead *bh) -{ - dout(10) << "try_merge_bh " << *bh << dendl; - - // to the left? - map::iterator p = data.find(bh->start()); - assert(p->second == bh); - if (p != data.begin()) { - p--; - if (p->second->end() == bh->start() && - p->second->get_state() == bh->get_state()) { - merge_left(p->second, bh); - bh = p->second; - } else - p++; - } - // to the right? - assert(p->second == bh); - p++; - if (p != data.end() && - p->second->start() == bh->end() && - p->second->get_state() == bh->get_state()) - merge_left(bh, p->second); -} - - -/* - * map a range of bytes into buffer_heads. - * - create missing buffer_heads as necessary. - */ -int ObjectCacher::Object::map_read(Objecter::OSDRead *rd, - map& hits, - map& missing, - map& rx) -{ - for (list::iterator ex_it = rd->extents.begin(); - ex_it != rd->extents.end(); - ex_it++) { - - if (ex_it->oid != oid) continue; - - dout(10) << "map_read " << ex_it->oid - << " " << ex_it->start << "~" << ex_it->length << dendl; - - map::iterator p = data.lower_bound(ex_it->start); - // p->first >= start - - off_t cur = ex_it->start; - off_t left = ex_it->length; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - - while (left > 0) { - // at end? - if (p == data.end()) { - // rest is a miss. - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( left ); - oc->bh_add(this, n); - missing[cur] = n; - dout(20) << "map_read miss " << left << " left, " << *n << dendl; - cur += left; - left -= left; - assert(left == 0); - assert(cur == ex_it->start + (off_t)ex_it->length); - break; // no more. - } - - if (p->first <= cur) { - // have it (or part of it) - BufferHead *e = p->second; - - if (e->is_clean() || - e->is_dirty() || - e->is_tx()) { - hits[cur] = e; // readable! 
- dout(20) << "map_read hit " << *e << dendl; - } - else if (e->is_rx()) { - rx[cur] = e; // missing, not readable. - dout(20) << "map_read rx " << *e << dendl; - } - else assert(0); - - off_t lenfromcur = MIN(e->end() - cur, left); - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; // more? - - } else if (p->first > cur) { - // gap.. miss - off_t next = p->first; - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( MIN(next - cur, left) ); - oc->bh_add(this,n); - missing[cur] = n; - cur += MIN(left, n->length()); - left -= MIN(left, n->length()); - dout(20) << "map_read gap " << *n << dendl; - continue; // more? - } - else - assert(0); - } - } - return(0); -} - -/* - * map a range of extents on an object's buffer cache. - * - combine any bh's we're writing into one - * - break up bufferheads that don't fall completely within the range - * //no! - return a bh that includes the write. may also include other dirty data to left and/or right. - */ -ObjectCacher::BufferHead *ObjectCacher::Object::map_write(Objecter::OSDWrite *wr) -{ - BufferHead *final = 0; - - for (list::iterator ex_it = wr->extents.begin(); - ex_it != wr->extents.end(); - ex_it++) { - - if (ex_it->oid != oid) continue; - - dout(10) << "map_write oex " << ex_it->oid - << " " << ex_it->start << "~" << ex_it->length << dendl; - - map::iterator p = data.lower_bound(ex_it->start); - // p->first >= start - - off_t cur = ex_it->start; - off_t left = ex_it->length; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap or butt up! - - /*// dirty and butts up? - if (p->first + p->second->length() == cur && - p->second->is_dirty()) { - dout(10) << "map_write will append to tail of " << *p->second << dendl; - final = p->second; - } - */ - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - - while (left > 0) { - off_t max = left; - - // at end ? 
- if (p == data.end()) { - if (final == NULL) { - final = new BufferHead(this); - final->set_start( cur ); - final->set_length( max ); - oc->bh_add(this, final); - dout(10) << "map_write adding trailing bh " << *final << dendl; - } else { - final->set_length( final->length() + max ); - } - left -= max; - cur += max; - continue; - } - - dout(10) << "p is " << *p->second << dendl; - - if (p->first <= cur) { - BufferHead *bh = p->second; - dout(10) << "map_write bh " << *bh << " intersected" << dendl; - - if (p->first < cur) { - assert(final == 0); - if (cur + max >= p->first + p->second->length()) { - // we want right bit (one splice) - final = split(bh, cur); // just split it, take right half. - p++; - assert(p->second == final); - } else { - // we want middle bit (two splices) - final = split(bh, cur); - p++; - assert(p->second == final); - split(final, cur+max); - } - } else if (p->first == cur) { - if (p->second->length() <= max) { - // whole bufferhead, piece of cake. - } else { - // we want left bit (one splice) - split(bh, cur + max); // just split - } - if (final) - merge_left(final, bh); - else - final = bh; - } - - // keep going. - off_t lenfromcur = final->end() - cur; - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; - } else { - // gap! - off_t next = p->first; - off_t glen = MIN(next - cur, max); - dout(10) << "map_write gap " << cur << "~" << glen << dendl; - if (final) { - final->set_length( final->length() + glen ); - } else { - final = new BufferHead(this); - final->set_start( cur ); - final->set_length( glen ); - oc->bh_add(this, final); - } - - cur += glen; - left -= glen; - continue; // more? 
- } - } - } - - // set versoin - assert(final); - dout(10) << "map_write final is " << *final << dendl; - - return final; -} - - -void ObjectCacher::Object::truncate(off_t s) -{ - dout(10) << "truncate to " << s << dendl; - - while (!data.empty()) { - BufferHead *bh = data.rbegin()->second; - if (bh->end() <= s) - break; - - // split bh at truncation point? - if (bh->start() < s) { - split(bh, s); - continue; - } - - // remove bh entirely - assert(bh->start() >= s); - oc->bh_remove(this, bh); - delete bh; - } -} - - - - - -/*** ObjectCacher ***/ - -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_objectcacher) *_dout << dbeginl << g_clock.now() << " " << objecter->messenger->get_myname() << ".objectcacher " - - - -/* private */ - -void ObjectCacher::close_object(Object *ob) -{ - dout(10) << "close_object " << *ob << dendl; - assert(ob->can_close()); - - // ok! - objects.erase(ob->get_oid()); - objects_by_ino[ob->get_ino()].erase(ob); - if (objects_by_ino[ob->get_ino()].empty()) - objects_by_ino.erase(ob->get_ino()); - delete ob; -} - - - - -void ObjectCacher::bh_read(BufferHead *bh) -{ - dout(7) << "bh_read on " << *bh << dendl; - - mark_rx(bh); - - // finisher - C_ReadFinish *onfinish = new C_ReadFinish(this, bh->ob->get_oid(), bh->start(), bh->length()); - - // go - objecter->read(bh->ob->get_oid(), bh->start(), bh->length(), bh->ob->get_layout(), &onfinish->bl, - onfinish); -} - -void ObjectCacher::bh_read_finish(object_t oid, off_t start, size_t length, bufferlist &bl) -{ - //lock.Lock(); - dout(7) << "bh_read_finish " - << oid - << " " << start << "~" << length - << " (bl is " << bl.length() << ")" - << dendl; - - if (bl.length() < length) { - bufferptr bp(length - bl.length()); - bp.zero(); - dout(7) << "bh_read_finish " << oid << " padding " << start << "~" << length - << " with " << bp.length() << " bytes of zeroes" << dendl; - bl.push_back(bp); - } - - if (objects.count(oid) == 0) { - dout(7) << "bh_read_finish no object cache" << dendl; 
- } else { - Object *ob = objects[oid]; - - // apply to bh's! - off_t opos = start; - map::iterator p = ob->data.lower_bound(opos); - - while (p != ob->data.end() && - opos < start+(off_t)length) { - BufferHead *bh = p->second; - - if (bh->start() > opos) { - dout(1) << "weirdness: gap when applying read results, " - << opos << "~" << bh->start() - opos - << dendl; - opos = bh->start(); - continue; - } - - if (!bh->is_rx()) { - dout(10) << "bh_read_finish skipping non-rx " << *bh << dendl; - opos = bh->end(); - p++; - continue; - } - - assert(opos >= bh->start()); - assert(bh->start() == opos); // we don't merge rx bh's... yet! - assert(bh->length() <= start+(off_t)length-opos); - - bh->bl.substr_of(bl, - opos-bh->start(), - bh->length()); - mark_clean(bh); - dout(10) << "bh_read_finish read " << *bh << dendl; - - opos = bh->end(); - p++; - - // finishers? - // called with lock held. - list ls; - for (map >::iterator p = bh->waitfor_read.begin(); - p != bh->waitfor_read.end(); - p++) - ls.splice(ls.end(), p->second); - bh->waitfor_read.clear(); - finish_contexts(ls); - - // clean up? 
- ob->try_merge_bh(bh); - } - } - //lock.Unlock(); -} - - -void ObjectCacher::bh_write(BufferHead *bh) -{ - dout(7) << "bh_write " << *bh << dendl; - - // finishers - C_WriteAck *onack = new C_WriteAck(this, bh->ob->get_oid(), bh->start(), bh->length()); - C_WriteCommit *oncommit = new C_WriteCommit(this, bh->ob->get_oid(), bh->start(), bh->length()); - - // go - tid_t tid = objecter->write(bh->ob->get_oid(), bh->start(), bh->length(), bh->ob->get_layout(), bh->bl, - onack, oncommit); - - // set bh last_write_tid - onack->tid = tid; - oncommit->tid = tid; - bh->ob->last_write_tid = tid; - bh->last_write_tid = tid; - - mark_tx(bh); -} - -void ObjectCacher::lock_ack(list& oids, tid_t tid) -{ - for (list::iterator i = oids.begin(); - i != oids.end(); - i++) { - object_t oid = *i; - - if (objects.count(oid) == 0) { - dout(7) << "lock_ack no object cache" << dendl; - assert(0); - } - - Object *ob = objects[oid]; - - list ls; - - assert(tid <= ob->last_write_tid); - if (ob->last_write_tid == tid) { - dout(10) << "lock_ack " << *ob - << " tid " << tid << dendl; - - switch (ob->lock_state) { - case Object::LOCK_RDUNLOCKING: - case Object::LOCK_WRUNLOCKING: - ob->lock_state = Object::LOCK_NONE; - break; - case Object::LOCK_RDLOCKING: - case Object::LOCK_DOWNGRADING: - ob->lock_state = Object::LOCK_RDLOCK; - ls.splice(ls.begin(), ob->waitfor_rd); - break; - case Object::LOCK_UPGRADING: - case Object::LOCK_WRLOCKING: - ob->lock_state = Object::LOCK_WRLOCK; - ls.splice(ls.begin(), ob->waitfor_wr); - ls.splice(ls.begin(), ob->waitfor_rd); - break; - - default: - assert(0); - } - - ob->last_ack_tid = tid; - - if (ob->can_close()) - close_object(ob); - } else { - dout(10) << "lock_ack " << *ob - << " tid " << tid << " obsolete" << dendl; - } - - // waiters? 
- if (ob->waitfor_ack.count(tid)) { - ls.splice(ls.end(), ob->waitfor_ack[tid]); - ob->waitfor_ack.erase(tid); - } - - finish_contexts(ls); - - } -} - -void ObjectCacher::bh_write_ack(object_t oid, off_t start, size_t length, tid_t tid) -{ - //lock.Lock(); - - dout(7) << "bh_write_ack " - << oid - << " tid " << tid - << " " << start << "~" << length - << dendl; - if (objects.count(oid) == 0) { - dout(7) << "bh_write_ack no object cache" << dendl; - assert(0); - } else { - Object *ob = objects[oid]; - - // apply to bh's! - for (map::iterator p = ob->data.lower_bound(start); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - - if (bh->start() > start+(off_t)length) break; - - if (bh->start() < start && - bh->end() > start+(off_t)length) { - dout(20) << "bh_write_ack skipping " << *bh << dendl; - continue; - } - - // make sure bh is tx - if (!bh->is_tx()) { - dout(10) << "bh_write_ack skipping non-tx " << *bh << dendl; - continue; - } - - // make sure bh tid matches - if (bh->last_write_tid != tid) { - assert(bh->last_write_tid > tid); - dout(10) << "bh_write_ack newer tid on " << *bh << dendl; - continue; - } - - // ok! mark bh clean. - mark_clean(bh); - dout(10) << "bh_write_ack clean " << *bh << dendl; - } - - // update object last_ack. - assert(ob->last_ack_tid < tid); - ob->last_ack_tid = tid; - - // waiters? - if (ob->waitfor_ack.count(tid)) { - list ls; - ls.splice(ls.begin(), ob->waitfor_ack[tid]); - ob->waitfor_ack.erase(tid); - finish_contexts(ls); - } - } - //lock.Unlock(); -} - -void ObjectCacher::bh_write_commit(object_t oid, off_t start, size_t length, tid_t tid) -{ - //lock.Lock(); - - // update object last_commit - dout(7) << "bh_write_commit " - << oid - << " tid " << tid - << " " << start << "~" << length - << dendl; - if (objects.count(oid) == 0) { - dout(7) << "bh_write_commit no object cache" << dendl; - //assert(0); - } else { - Object *ob = objects[oid]; - - // update last_commit. - ob->last_commit_tid = tid; - - // waiters? 
- if (ob->waitfor_commit.count(tid)) { - list ls; - ls.splice(ls.begin(), ob->waitfor_commit[tid]); - ob->waitfor_commit.erase(tid); - finish_contexts(ls); - } - } - - // lock.Unlock(); -} - - -void ObjectCacher::flush(off_t amount) -{ - utime_t cutoff = g_clock.now(); - //cutoff.sec_ref() -= g_conf.client_oc_max_dirty_age; - - dout(10) << "flush " << amount << dendl; - - /* - * NOTE: we aren't actually pulling things off the LRU here, just looking at the - * tail item. Then we call bh_write, which moves it to the other LRU, so that we - * can call lru_dirty.lru_get_next_expire() again. - */ - off_t did = 0; - while (amount == 0 || did < amount) { - BufferHead *bh = (BufferHead*) lru_dirty.lru_get_next_expire(); - if (!bh) break; - if (bh->last_write > cutoff) break; - - did += bh->length(); - bh_write(bh); - } -} - - -void ObjectCacher::trim(off_t max) -{ - if (max < 0) - max = g_conf.client_oc_size; - - dout(10) << "trim start: max " << max - << " clean " << get_stat_clean() - << dendl; - - while (get_stat_clean() > max) { - BufferHead *bh = (BufferHead*) lru_rest.lru_expire(); - if (!bh) break; - - dout(10) << "trim trimming " << *bh << dendl; - assert(bh->is_clean()); - - Object *ob = bh->ob; - bh_remove(ob, bh); - delete bh; - - if (ob->can_close()) { - dout(10) << "trim trimming " << *ob << dendl; - close_object(ob); - } - } - - dout(10) << "trim finish: max " << max - << " clean " << get_stat_clean() - << dendl; -} - - - -/* public */ - -/* - * returns # bytes read (if in cache). 
onfinish is untouched (caller must delete it) - * returns 0 if doing async read - */ -int ObjectCacher::readx(Objecter::OSDRead *rd, inodeno_t ino, Context *onfinish) -{ - bool success = true; - list hit_ls; - map stripe_map; // final buffer offset -> substring - - for (list::iterator ex_it = rd->extents.begin(); - ex_it != rd->extents.end(); - ex_it++) { - dout(10) << "readx " << *ex_it << dendl; - - // get Object cache - Object *o = get_object(ex_it->oid, ino, ex_it->layout); - - // map extent into bufferheads - map hits, missing, rx; - o->map_read(rd, hits, missing, rx); - - if (!missing.empty() || !rx.empty()) { - // read missing - for (map::iterator bh_it = missing.begin(); - bh_it != missing.end(); - bh_it++) { - bh_read(bh_it->second); - if (success) { - dout(10) << "readx missed, waiting on " << *bh_it->second - << " off " << bh_it->first << dendl; - success = false; - bh_it->second->waitfor_read[bh_it->first].push_back( new C_RetryRead(this, rd, ino, onfinish) ); - } - } - - // bump rx - for (map::iterator bh_it = rx.begin(); - bh_it != rx.end(); - bh_it++) { - touch_bh(bh_it->second); // bump in lru, so we don't lose it. - if (success) { - dout(10) << "readx missed, waiting on " << *bh_it->second - << " off " << bh_it->first << dendl; - success = false; - bh_it->second->waitfor_read[bh_it->first].push_back( new C_RetryRead(this, rd, ino, onfinish) ); - } - } - } else { - assert(!hits.empty()); - - // make a plain list - for (map::iterator bh_it = hits.begin(); - bh_it != hits.end(); - bh_it++) { - dout(10) << "readx hit bh " << *bh_it->second << dendl; - hit_ls.push_back(bh_it->second); - } - - // create reverse map of buffer offset -> object for the eventual result. 
- // this is over a single ObjectExtent, so we know that - // - the bh's are contiguous - // - the buffer frags need not be (and almost certainly aren't) - off_t opos = ex_it->start; - map::iterator bh_it = hits.begin(); - assert(bh_it->second->start() <= opos); - size_t bhoff = opos - bh_it->second->start(); - map::iterator f_it = ex_it->buffer_extents.begin(); - size_t foff = 0; - while (1) { - BufferHead *bh = bh_it->second; - assert(opos == (off_t)(bh->start() + bhoff)); - - dout(10) << "readx rmap opos " << opos - << ": " << *bh << " +" << bhoff - << " frag " << f_it->first << "~" << f_it->second << " +" << foff - << dendl; - - size_t len = MIN(f_it->second - foff, - bh->length() - bhoff); - bufferlist bit; // put substr here first, since substr_of clobbers, and - // we may get multiple bh's at this stripe_map position - bit.substr_of(bh->bl, - opos - bh->start(), - len); - stripe_map[f_it->first].claim_append(bit); - - opos += len; - bhoff += len; - foff += len; - if (opos == bh->end()) { - bh_it++; - bhoff = 0; - } - if (foff == f_it->second) { - f_it++; - foff = 0; - } - if (bh_it == hits.end()) break; - if (f_it == ex_it->buffer_extents.end()) break; - } - assert(f_it == ex_it->buffer_extents.end()); - assert(opos == ex_it->start + (off_t)ex_it->length); - } - } - - // bump hits in lru - for (list::iterator bhit = hit_ls.begin(); - bhit != hit_ls.end(); - bhit++) - touch_bh(*bhit); - - if (!success) return 0; // wait! - - // no misses... success! do the read. - assert(!hit_ls.empty()); - dout(10) << "readx has all buffers" << dendl; - - // ok, assemble into result buffer. 
- rd->bl->clear(); - size_t pos = 0; - for (map::iterator i = stripe_map.begin(); - i != stripe_map.end(); - i++) { - assert(pos == i->first); - dout(10) << "readx adding buffer len " << i->second.length() << " at " << pos << dendl; - pos += i->second.length(); - rd->bl->claim_append(i->second); - assert(rd->bl->length() == pos); - } - dout(10) << "readx result is " << rd->bl->length() << dendl; - - // done with read. - delete rd; - - trim(); - - return pos; -} - - -int ObjectCacher::writex(Objecter::OSDWrite *wr, inodeno_t ino) -{ - utime_t now = g_clock.now(); - - for (list::iterator ex_it = wr->extents.begin(); - ex_it != wr->extents.end(); - ex_it++) { - // get object cache - Object *o = get_object(ex_it->oid, ino, ex_it->layout); - - // map it all into a single bufferhead. - BufferHead *bh = o->map_write(wr); - - // adjust buffer pointers (ie "copy" data into my cache) - // this is over a single ObjectExtent, so we know that - // - there is one contiguous bh - // - the buffer frags need not be (and almost certainly aren't) - // note: i assume striping is monotonic... no jumps backwards, ever! - off_t opos = ex_it->start; - for (map::iterator f_it = ex_it->buffer_extents.begin(); - f_it != ex_it->buffer_extents.end(); - f_it++) { - dout(10) << "writex writing " << f_it->first << "~" << f_it->second << " into " << *bh << " at " << opos << dendl; - size_t bhoff = bh->start() - opos; - assert(f_it->second <= bh->length() - bhoff); - - // get the frag we're mapping in - bufferlist frag; - frag.substr_of(wr->bl, - f_it->first, f_it->second); - - // keep anything left of bhoff - bufferlist newbl; - if (bhoff) - newbl.substr_of(bh->bl, 0, bhoff); - newbl.claim_append(frag); - bh->bl.swap(newbl); - - opos += f_it->second; - } - - // ok, now bh is dirty. - mark_dirty(bh); - touch_bh(bh); - bh->last_write = now; - - o->try_merge_bh(bh); - } - - delete wr; - - trim(); - return 0; -} - - -// blocking wait for write. 
-void ObjectCacher::wait_for_write(size_t len, Mutex& lock) -{ - while (get_stat_dirty() + get_stat_tx() >= g_conf.client_oc_max_dirty) { - dout(10) << "wait_for_write waiting on " << len << ", dirty|tx " - << (get_stat_dirty() + get_stat_tx()) - << " >= " << g_conf.client_oc_max_dirty - << dendl; - flusher_cond.Signal(); - stat_waiter++; - stat_cond.Wait(lock); - stat_waiter--; - dout(10) << "wait_for_write woke up" << dendl; - } -} - -void ObjectCacher::flusher_entry() -{ - dout(10) << "flusher start" << dendl; - lock.Lock(); - while (!flusher_stop) { - while (!flusher_stop) { - off_t all = get_stat_tx() + get_stat_rx() + get_stat_clean() + get_stat_dirty(); - dout(11) << "flusher " - << all << " / " << g_conf.client_oc_size << ": " - << get_stat_tx() << " tx, " - << get_stat_rx() << " rx, " - << get_stat_clean() << " clean, " - << get_stat_dirty() << " / " << g_conf.client_oc_max_dirty << " dirty" - << dendl; - if (get_stat_dirty() > g_conf.client_oc_max_dirty) { - // flush some dirty pages - dout(10) << "flusher " - << get_stat_dirty() << " / " << g_conf.client_oc_max_dirty << " dirty," - << " flushing some dirty bhs" << dendl; - flush(get_stat_dirty() - g_conf.client_oc_max_dirty); - } - else { - // check tail of lru for old dirty items - utime_t cutoff = g_clock.now(); - cutoff.sec_ref()--; - BufferHead *bh = 0; - while ((bh = (BufferHead*)lru_dirty.lru_get_next_expire()) != 0 && - bh->last_write < cutoff) { - dout(10) << "flusher flushing aged dirty bh " << *bh << dendl; - bh_write(bh); - } - break; - } - } - if (flusher_stop) break; - flusher_cond.WaitInterval(lock, utime_t(1,0)); - } - lock.Unlock(); - dout(10) << "flusher finish" << dendl; -} - - - -// blocking. atomic+sync. -int ObjectCacher::atomic_sync_readx(Objecter::OSDRead *rd, inodeno_t ino, Mutex& lock) -{ - dout(10) << "atomic_sync_readx " << rd - << " in " << ino - << dendl; - - if (rd->extents.size() == 1) { - // single object. - // just write synchronously. 
- Cond cond; - bool done = false; - objecter->readx(rd, new C_SafeCond(&lock, &cond, &done)); - - // block - while (!done) cond.Wait(lock); - } else { - // spans multiple objects, or is big. - - // sort by object... - map by_oid; - for (list::iterator ex_it = rd->extents.begin(); - ex_it != rd->extents.end(); - ex_it++) - by_oid[ex_it->oid] = *ex_it; - - // lock - for (map::iterator i = by_oid.begin(); - i != by_oid.end(); - i++) { - Object *o = get_object(i->first, ino, i->second.layout); - rdlock(o); - } - - // readx will hose rd - list extents = rd->extents; - - // do the read, into our cache - Cond cond; - bool done = false; - readx(rd, ino, new C_SafeCond(&lock, &cond, &done)); - - // block - while (!done) cond.Wait(lock); - - // release the locks - for (list::iterator ex_it = extents.begin(); - ex_it != extents.end(); - ex_it++) { - assert(objects.count(ex_it->oid)); - Object *o = objects[ex_it->oid]; - rdunlock(o); - } - } - - return 0; -} - -int ObjectCacher::atomic_sync_writex(Objecter::OSDWrite *wr, inodeno_t ino, Mutex& lock) -{ - dout(10) << "atomic_sync_writex " << wr - << " in " << ino - << dendl; - - if (wr->extents.size() == 1 && - wr->extents.front().length <= g_conf.client_oc_max_sync_write) { - // single object. - - // make sure we aren't already locking/locked... - object_t oid = wr->extents.front().oid; - Object *o = 0; - if (objects.count(oid)) o = get_object(oid, ino, wr->extents.front().layout); - if (!o || - (o->lock_state != Object::LOCK_WRLOCK && - o->lock_state != Object::LOCK_WRLOCKING && - o->lock_state != Object::LOCK_UPGRADING)) { - // just write synchronously. - dout(10) << "atomic_sync_writex " << wr - << " in " << ino - << " doing sync write" - << dendl; - - Cond cond; - bool done = false; - objecter->modifyx(wr, new C_SafeCond(&lock, &cond, &done), 0); - - // block - while (!done) cond.Wait(lock); - return 0; - } - } - - // spans multiple objects, or is big. - // sort by object... 
- map by_oid; - for (list::iterator ex_it = wr->extents.begin(); - ex_it != wr->extents.end(); - ex_it++) - by_oid[ex_it->oid] = *ex_it; - - // wrlock - for (map::iterator i = by_oid.begin(); - i != by_oid.end(); - i++) { - Object *o = get_object(i->first, ino, i->second.layout); - wrlock(o); - } - - // writex will hose wr - list extents = wr->extents; - - // do the write, into our cache - writex(wr, ino); - - // flush - // ...and release the locks? - for (list::iterator ex_it = extents.begin(); - ex_it != extents.end(); - ex_it++) { - assert(objects.count(ex_it->oid)); - Object *o = objects[ex_it->oid]; - - wrunlock(o); - } - - return 0; -} - - - -// locking ----------------------------- - -void ObjectCacher::rdlock(Object *o) -{ - // lock? - if (o->lock_state == Object::LOCK_NONE || - o->lock_state == Object::LOCK_RDUNLOCKING || - o->lock_state == Object::LOCK_WRUNLOCKING) { - dout(10) << "rdlock rdlock " << *o << dendl; - - o->lock_state = Object::LOCK_RDLOCKING; - - C_LockAck *ack = new C_LockAck(this, o->get_oid()); - C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); - - commit->tid = - ack->tid = - o->last_write_tid = - objecter->lock(OSD_OP_RDLOCK, o->get_oid(), o->get_layout(), ack, commit); - } - - // stake our claim. - o->rdlock_ref++; - - // wait? - if (o->lock_state == Object::LOCK_RDLOCKING || - o->lock_state == Object::LOCK_WRLOCKING) { - dout(10) << "rdlock waiting for rdlock|wrlock on " << *o << dendl; - Cond cond; - bool done = false; - o->waitfor_rd.push_back(new C_SafeCond(&lock, &cond, &done)); - while (!done) cond.Wait(lock); - } - assert(o->lock_state == Object::LOCK_RDLOCK || - o->lock_state == Object::LOCK_WRLOCK || - o->lock_state == Object::LOCK_UPGRADING || - o->lock_state == Object::LOCK_DOWNGRADING); -} - -void ObjectCacher::wrlock(Object *o) -{ - // lock? 
- if (o->lock_state != Object::LOCK_WRLOCK && - o->lock_state != Object::LOCK_WRLOCKING && - o->lock_state != Object::LOCK_UPGRADING) { - dout(10) << "wrlock wrlock " << *o << dendl; - - int op = 0; - if (o->lock_state == Object::LOCK_RDLOCK) { - o->lock_state = Object::LOCK_UPGRADING; - op = OSD_OP_UPLOCK; - } else { - o->lock_state = Object::LOCK_WRLOCKING; - op = OSD_OP_WRLOCK; - } - - C_LockAck *ack = new C_LockAck(this, o->get_oid()); - C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); - - commit->tid = - ack->tid = - o->last_write_tid = - objecter->lock(op, o->get_oid(), o->get_layout(), ack, commit); - } - - // stake our claim. - o->wrlock_ref++; - - // wait? - if (o->lock_state == Object::LOCK_WRLOCKING || - o->lock_state == Object::LOCK_UPGRADING) { - dout(10) << "wrlock waiting for wrlock on " << *o << dendl; - Cond cond; - bool done = false; - o->waitfor_wr.push_back(new C_SafeCond(&lock, &cond, &done)); - while (!done) cond.Wait(lock); - } - assert(o->lock_state == Object::LOCK_WRLOCK); -} - - -void ObjectCacher::rdunlock(Object *o) -{ - dout(10) << "rdunlock " << *o << dendl; - assert(o->lock_state == Object::LOCK_RDLOCK || - o->lock_state == Object::LOCK_WRLOCK || - o->lock_state == Object::LOCK_UPGRADING || - o->lock_state == Object::LOCK_DOWNGRADING); - - assert(o->rdlock_ref > 0); - o->rdlock_ref--; - if (o->rdlock_ref > 0 || - o->wrlock_ref > 0) { - dout(10) << "rdunlock " << *o << " still has rdlock|wrlock refs" << dendl; - return; - } - - release(o); // release first - - o->lock_state = Object::LOCK_RDUNLOCKING; - - C_LockAck *lockack = new C_LockAck(this, o->get_oid()); - C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); - commit->tid = - lockack->tid = - o->last_write_tid = - objecter->lock(OSD_OP_RDUNLOCK, o->get_oid(), o->get_layout(), lockack, commit); -} - -void ObjectCacher::wrunlock(Object *o) -{ - dout(10) << "wrunlock " << *o << dendl; - assert(o->lock_state == Object::LOCK_WRLOCK); - - 
assert(o->wrlock_ref > 0); - o->wrlock_ref--; - if (o->wrlock_ref > 0) { - dout(10) << "wrunlock " << *o << " still has wrlock refs" << dendl; - return; - } - - flush(o); // flush first - - int op = 0; - if (o->rdlock_ref > 0) { - dout(10) << "wrunlock rdlock " << *o << dendl; - op = OSD_OP_DNLOCK; - o->lock_state = Object::LOCK_DOWNGRADING; - } else { - dout(10) << "wrunlock wrunlock " << *o << dendl; - op = OSD_OP_WRUNLOCK; - o->lock_state = Object::LOCK_WRUNLOCKING; - } - - C_LockAck *lockack = new C_LockAck(this, o->get_oid()); - C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); - commit->tid = - lockack->tid = - o->last_write_tid = - objecter->lock(op, o->get_oid(), o->get_layout(), lockack, commit); -} - - -// ------------------------------------------------- - - -bool ObjectCacher::set_is_cached(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) - return false; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - if (!ob->data.empty()) return true; - } - - return false; -} - -bool ObjectCacher::set_is_dirty_or_committing(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) - return false; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - for (map::iterator p = ob->data.begin(); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - if (bh->is_dirty() || bh->is_tx()) - return true; - } - } - - return false; -} - - -// purge. non-blocking. violently removes dirty buffers from cache. 
-void ObjectCacher::purge(Object *ob) -{ - dout(10) << "purge " << *ob << dendl; - - for (map::iterator p = ob->data.begin(); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - if (!bh->is_clean()) - dout(0) << "purge forcibly removing " << *ob << " " << *bh << dendl; - bh_remove(ob, bh); - delete bh; - } - - if (ob->can_close()) { - dout(10) << "trim trimming " << *ob << dendl; - close_object(ob); - } -} - -// flush. non-blocking. no callback. -// true if clean, already flushed. -// false if we wrote something. -bool ObjectCacher::flush(Object *ob) -{ - bool clean = true; - for (map::iterator p = ob->data.begin(); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - if (bh->is_tx()) { - clean = false; - continue; - } - if (!bh->is_dirty()) continue; - - bh_write(bh); - clean = false; - } - return clean; -} - -// flush. non-blocking, takes callback. -// returns true if already flushed -bool ObjectCacher::flush_set(inodeno_t ino, Context *onfinish) -{ - if (objects_by_ino.count(ino) == 0) { - dout(10) << "flush_set on " << ino << " dne" << dendl; - return true; - } - - dout(10) << "flush_set " << ino << dendl; - - C_Gather *gather = 0; // we'll need to wait for all objects to flush! - - set& s = objects_by_ino[ino]; - bool safe = true; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - if (!flush(ob)) { - // we'll need to gather... - if (!gather && onfinish) - gather = new C_Gather(onfinish); - safe = false; - - dout(10) << "flush_set " << ino << " will wait for ack tid " - << ob->last_write_tid - << " on " << *ob - << dendl; - if (gather) - ob->waitfor_ack[ob->last_write_tid].push_back(gather->new_sub()); - } - } - - if (safe) { - dout(10) << "flush_set " << ino << " has no dirty|tx bhs" << dendl; - return true; - } - return false; -} - - -// commit. non-blocking, takes callback. -// return true if already flushed. 
-bool ObjectCacher::commit_set(inodeno_t ino, Context *onfinish) -{ - assert(onfinish); // doesn't make any sense otherwise. - - if (objects_by_ino.count(ino) == 0) { - dout(10) << "commit_set on " << ino << " dne" << dendl; - return true; - } - - dout(10) << "commit_set " << ino << dendl; - - C_Gather *gather = 0; // we'll need to wait for all objects to commit - - set& s = objects_by_ino[ino]; - bool safe = true; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - // make sure it's flushing. - flush_set(ino); - - if (ob->last_write_tid > ob->last_commit_tid) { - dout(10) << "commit_set " << ino << " " << *ob - << " will finish on commit tid " << ob->last_write_tid - << dendl; - if (!gather && onfinish) gather = new C_Gather(onfinish); - safe = false; - if (gather) - ob->waitfor_commit[ob->last_write_tid].push_back( gather->new_sub() ); - } - } - - if (safe) { - dout(10) << "commit_set " << ino << " all committed" << dendl; - return true; - } - return false; -} - -void ObjectCacher::purge_set(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) { - dout(10) << "purge_set on " << ino << " dne" << dendl; - return; - } - - dout(10) << "purge_set " << ino << dendl; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - purge(ob); - } -} - - -off_t ObjectCacher::release(Object *ob) -{ - list clean; - off_t o_unclean = 0; - - for (map::iterator p = ob->data.begin(); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - if (bh->is_clean()) - clean.push_back(bh); - else - o_unclean += bh->length(); - } - - for (list::iterator p = clean.begin(); - p != clean.end(); - p++) { - bh_remove(ob, *p); - delete *p; - } - - if (ob->can_close()) { - dout(10) << "trim trimming " << *ob << dendl; - close_object(ob); - } - - return o_unclean; -} - -off_t ObjectCacher::release_set(inodeno_t ino) -{ - // return # bytes not clean (and thus not released). 
- off_t unclean = 0; - - if (objects_by_ino.count(ino) == 0) { - dout(10) << "release_set on " << ino << " dne" << dendl; - return 0; - } - - dout(10) << "release_set " << ino << dendl; - - set s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - off_t o_unclean = release(ob); - unclean += o_unclean; - - if (o_unclean) - dout(10) << "release_set " << ino << " " << *ob - << " has " << o_unclean << " bytes left" - << dendl; - - } - - if (unclean) { - dout(10) << "release_set " << ino - << ", " << unclean << " bytes left" << dendl; - } - - return unclean; -} - -void ObjectCacher::truncate_set(inodeno_t ino, list& exls) -{ - if (objects_by_ino.count(ino) == 0) { - dout(10) << "truncate_set on " << ino << " dne" << dendl; - return; - } - - dout(10) << "truncate_set " << ino << dendl; - - for (list::iterator p = exls.begin(); - p != exls.end(); - ++p) { - ObjectExtent &ex = *p; - if (objects.count(ex.oid) == 0) continue; - Object *ob = objects[ex.oid]; - - // purge or truncate? 
- if (ex.start == 0) { - dout(10) << "truncate_set purging " << *ob << dendl; - purge(ob); - } else { - // hrm, truncate object - dout(10) << "truncate_set truncating " << *ob << " at " << ex.start << dendl; - ob->truncate(ex.start); - - if (ob->can_close()) { - dout(10) << "truncate_set trimming " << *ob << dendl; - close_object(ob); - } - } - } -} - - -void ObjectCacher::kick_sync_writers(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) { - dout(10) << "kick_sync_writers on " << ino << " dne" << dendl; - return; - } - - dout(10) << "kick_sync_writers on " << ino << dendl; - - list ls; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - ls.splice(ls.begin(), ob->waitfor_wr); - } - - finish_contexts(ls); -} - -void ObjectCacher::kick_sync_readers(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) { - dout(10) << "kick_sync_readers on " << ino << " dne" << dendl; - return; - } - - dout(10) << "kick_sync_readers on " << ino << dendl; - - list ls; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - ls.splice(ls.begin(), ob->waitfor_rd); - } - - finish_contexts(ls); -} - - - diff --git a/branches/sage/mds/osdc/ObjectCacher.h b/branches/sage/mds/osdc/ObjectCacher.h deleted file mode 100644 index f1d057beef99c..0000000000000 --- a/branches/sage/mds/osdc/ObjectCacher.h +++ /dev/null @@ -1,566 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -#ifndef __OBJECTCACHER_H_ -#define __OBJECTCACHER_H_ - -#include "include/types.h" -#include "include/lru.h" -#include "include/Context.h" - -#include "common/Cond.h" -#include "common/Thread.h" - -#include "Objecter.h" -#include "Filer.h" - -class Objecter; -class Objecter::OSDRead; -class Objecter::OSDWrite; - -class ObjectCacher { - public: - - class Object; - - // ******* BufferHead ********* - class BufferHead : public LRUObject { 
- public: - // states - static const int STATE_MISSING = 0; - static const int STATE_CLEAN = 1; - static const int STATE_DIRTY = 2; - static const int STATE_RX = 3; - static const int STATE_TX = 4; - - private: - // my fields - int state; - int ref; - struct { - off_t start, length; // bh extent in object - } ex; - - public: - Object *ob; - bufferlist bl; - tid_t last_write_tid; // version of bh (if non-zero) - utime_t last_write; - - map< off_t, list > waitfor_read; - - public: - // cons - BufferHead(Object *o) : - state(STATE_MISSING), - ref(0), - ob(o), - last_write_tid(0) {} - - // extent - off_t start() { return ex.start; } - void set_start(off_t s) { ex.start = s; } - off_t length() { return ex.length; } - void set_length(off_t l) { ex.length = l; } - off_t end() { return ex.start + ex.length; } - off_t last() { return end() - 1; } - - // states - void set_state(int s) { - if (s == STATE_RX || s == STATE_TX) get(); - if (state == STATE_RX || state == STATE_TX) put(); - state = s; - } - int get_state() { return state; } - - bool is_missing() { return state == STATE_MISSING; } - bool is_dirty() { return state == STATE_DIRTY; } - bool is_clean() { return state == STATE_CLEAN; } - bool is_tx() { return state == STATE_TX; } - bool is_rx() { return state == STATE_RX; } - - // reference counting - int get() { - assert(ref >= 0); - if (ref == 0) lru_pin(); - return ++ref; - } - int put() { - assert(ref > 0); - if (ref == 1) lru_unpin(); - --ref; - return ref; - } - }; - - - // ******* Object ********* - class Object { - private: - // ObjectCacher::Object fields - ObjectCacher *oc; - object_t oid; // this _always_ is oid.rev=0 - inodeno_t ino; - objectrev_t rev; // last rev we're written - ObjectLayout layout; - - public: - map data; - - tid_t last_write_tid; // version of bh (if non-zero) - tid_t last_ack_tid; // last update acked. - tid_t last_commit_tid; // last update commited. 
- - map< tid_t, list > waitfor_ack; - map< tid_t, list > waitfor_commit; - list waitfor_rd; - list waitfor_wr; - - // lock - static const int LOCK_NONE = 0; - static const int LOCK_WRLOCKING = 1; - static const int LOCK_WRLOCK = 2; - static const int LOCK_WRUNLOCKING = 3; - static const int LOCK_RDLOCKING = 4; - static const int LOCK_RDLOCK = 5; - static const int LOCK_RDUNLOCKING = 6; - static const int LOCK_UPGRADING = 7; // rd -> wr - static const int LOCK_DOWNGRADING = 8; // wr -> rd - int lock_state; - int wrlock_ref; // how many ppl want or are using a WRITE lock - int rdlock_ref; // how many ppl want or are using a READ lock - - public: - Object(ObjectCacher *_oc, object_t o, inodeno_t i, ObjectLayout& l) : - oc(_oc), - oid(o), ino(i), layout(l), - last_write_tid(0), last_ack_tid(0), last_commit_tid(0), - lock_state(LOCK_NONE), wrlock_ref(0), rdlock_ref(0) - {} - ~Object() { - assert(data.empty()); - } - - object_t get_oid() { return oid; } - inodeno_t get_ino() { return ino; } - - ObjectLayout& get_layout() { return layout; } - void set_layout(ObjectLayout& l) { layout = l; } - - bool can_close() { - return data.empty() && lock_state == LOCK_NONE && - waitfor_ack.empty() && waitfor_commit.empty() && - waitfor_rd.empty() && waitfor_wr.empty(); - } - - // bh - void add_bh(BufferHead *bh) { - // add to my map - assert(data.count(bh->start()) == 0); - - if (0) { // sanity check FIXME DEBUG - //cout << "add_bh " << bh->start() << "~" << bh->length() << endl; - map::iterator p = data.lower_bound(bh->start()); - if (p != data.end()) { - //cout << " after " << *p->second << endl; - //cout << " after starts at " << p->first << endl; - assert(p->first >= bh->end()); - } - if (p != data.begin()) { - p--; - //cout << " before starts at " << p->second->start() - //<< " and ends at " << p->second->end() << endl; - //cout << " before " << *p->second << endl; - assert(p->second->end() <= bh->start()); - } - } - - data[bh->start()] = bh; - } - void remove_bh(BufferHead *bh) 
{ - assert(data.count(bh->start())); - data.erase(bh->start()); - } - bool is_empty() { return data.empty(); } - - // mid-level - BufferHead *split(BufferHead *bh, off_t off); - void merge_left(BufferHead *left, BufferHead *right); - void try_merge_bh(BufferHead *bh); - - int map_read(Objecter::OSDRead *rd, - map& hits, - map& missing, - map& rx); - BufferHead *map_write(Objecter::OSDWrite *wr); - - void truncate(off_t s); - - }; - - // ******* ObjectCacher ********* - // ObjectCacher fields - public: - Objecter *objecter; - Filer filer; - - private: - Mutex& lock; - - hash_map objects; - hash_map > objects_by_ino; - - set dirty_bh; - LRU lru_dirty, lru_rest; - - Cond flusher_cond; - bool flusher_stop; - void flusher_entry(); - class FlusherThread : public Thread { - ObjectCacher *oc; - public: - FlusherThread(ObjectCacher *o) : oc(o) {} - void *entry() { - oc->flusher_entry(); - return 0; - } - } flusher_thread; - - - // objects - Object *get_object(object_t oid, inodeno_t ino, ObjectLayout &l) { - // have it? - if (objects.count(oid)) - return objects[oid]; - - // create it. 
- Object *o = new Object(this, oid, ino, l); - objects[oid] = o; - objects_by_ino[ino].insert(o); - return o; - } - void close_object(Object *ob); - - // bh stats - Cond stat_cond; - int stat_waiter; - - off_t stat_clean; - off_t stat_dirty; - off_t stat_rx; - off_t stat_tx; - off_t stat_missing; - - void bh_stat_add(BufferHead *bh) { - switch (bh->get_state()) { - case BufferHead::STATE_MISSING: stat_missing += bh->length(); break; - case BufferHead::STATE_CLEAN: stat_clean += bh->length(); break; - case BufferHead::STATE_DIRTY: stat_dirty += bh->length(); break; - case BufferHead::STATE_TX: stat_tx += bh->length(); break; - case BufferHead::STATE_RX: stat_rx += bh->length(); break; - } - if (stat_waiter) stat_cond.Signal(); - } - void bh_stat_sub(BufferHead *bh) { - switch (bh->get_state()) { - case BufferHead::STATE_MISSING: stat_missing -= bh->length(); break; - case BufferHead::STATE_CLEAN: stat_clean -= bh->length(); break; - case BufferHead::STATE_DIRTY: stat_dirty -= bh->length(); break; - case BufferHead::STATE_TX: stat_tx -= bh->length(); break; - case BufferHead::STATE_RX: stat_rx -= bh->length(); break; - } - } - off_t get_stat_tx() { return stat_tx; } - off_t get_stat_rx() { return stat_rx; } - off_t get_stat_dirty() { return stat_dirty; } - off_t get_stat_clean() { return stat_clean; } - - void touch_bh(BufferHead *bh) { - if (bh->is_dirty()) - lru_dirty.lru_touch(bh); - else - lru_rest.lru_touch(bh); - } - - // bh states - void bh_set_state(BufferHead *bh, int s) { - // move between lru lists? 
- if (s == BufferHead::STATE_DIRTY && bh->get_state() != BufferHead::STATE_DIRTY) { - lru_rest.lru_remove(bh); - lru_dirty.lru_insert_top(bh); - dirty_bh.insert(bh); - } - if (s != BufferHead::STATE_DIRTY && bh->get_state() == BufferHead::STATE_DIRTY) { - lru_dirty.lru_remove(bh); - lru_rest.lru_insert_mid(bh); - dirty_bh.erase(bh); - } - - // set state - bh_stat_sub(bh); - bh->set_state(s); - bh_stat_add(bh); - } - - void copy_bh_state(BufferHead *bh1, BufferHead *bh2) { - bh_set_state(bh2, bh1->get_state()); - } - - void mark_missing(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_MISSING); }; - void mark_clean(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_CLEAN); }; - void mark_rx(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_RX); }; - void mark_tx(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_TX); }; - void mark_dirty(BufferHead *bh) { - bh_set_state(bh, BufferHead::STATE_DIRTY); - lru_dirty.lru_touch(bh); - //bh->set_dirty_stamp(g_clock.now()); - }; - - void bh_add(Object *ob, BufferHead *bh) { - ob->add_bh(bh); - if (bh->is_dirty()) { - lru_dirty.lru_insert_top(bh); - dirty_bh.insert(bh); - } else { - lru_rest.lru_insert_top(bh); - } - bh_stat_add(bh); - } - void bh_remove(Object *ob, BufferHead *bh) { - ob->remove_bh(bh); - if (bh->is_dirty()) { - lru_dirty.lru_remove(bh); - dirty_bh.erase(bh); - } else { - lru_rest.lru_remove(bh); - } - bh_stat_sub(bh); - } - - // io - void bh_read(BufferHead *bh); - void bh_write(BufferHead *bh); - - void trim(off_t max=-1); - void flush(off_t amount=0); - - bool flush(Object *o); - off_t release(Object *o); - void purge(Object *o); - - void rdlock(Object *o); - void rdunlock(Object *o); - void wrlock(Object *o); - void wrunlock(Object *o); - - public: - void bh_read_finish(object_t oid, off_t offset, size_t length, bufferlist &bl); - void bh_write_ack(object_t oid, off_t offset, size_t length, tid_t t); - void bh_write_commit(object_t oid, off_t offset, size_t length, tid_t t); - void 
lock_ack(list& oids, tid_t tid); - - class C_ReadFinish : public Context { - ObjectCacher *oc; - object_t oid; - off_t start; - size_t length; - public: - bufferlist bl; - C_ReadFinish(ObjectCacher *c, object_t o, off_t s, size_t l) : oc(c), oid(o), start(s), length(l) {} - void finish(int r) { - oc->bh_read_finish(oid, start, length, bl); - } - }; - - class C_WriteAck : public Context { - ObjectCacher *oc; - object_t oid; - off_t start; - size_t length; - public: - tid_t tid; - C_WriteAck(ObjectCacher *c, object_t o, off_t s, size_t l) : oc(c), oid(o), start(s), length(l) {} - void finish(int r) { - oc->bh_write_ack(oid, start, length, tid); - } - }; - class C_WriteCommit : public Context { - ObjectCacher *oc; - object_t oid; - off_t start; - size_t length; - public: - tid_t tid; - C_WriteCommit(ObjectCacher *c, object_t o, off_t s, size_t l) : oc(c), oid(o), start(s), length(l) {} - void finish(int r) { - oc->bh_write_commit(oid, start, length, tid); - } - }; - - class C_LockAck : public Context { - ObjectCacher *oc; - public: - list oids; - tid_t tid; - C_LockAck(ObjectCacher *c, object_t o) : oc(c) { - oids.push_back(o); - } - void finish(int r) { - oc->lock_ack(oids, tid); - } - }; - - - - public: - ObjectCacher(Objecter *o, Mutex& l) : - objecter(o), filer(o), lock(l), - flusher_stop(false), flusher_thread(this), - stat_waiter(0), - stat_clean(0), stat_dirty(0), stat_rx(0), stat_tx(0), stat_missing(0) { - flusher_thread.create(); - } - ~ObjectCacher() { - // we should be empty. - assert(objects.empty()); - assert(lru_rest.lru_get_size() == 0); - assert(lru_dirty.lru_get_size() == 0); - assert(dirty_bh.empty()); - - assert(flusher_thread.is_started()); - lock.Lock(); // hmm.. watch out for deadlock! 
- flusher_stop = true; - flusher_cond.Signal(); - lock.Unlock(); - flusher_thread.join(); - } - - - class C_RetryRead : public Context { - ObjectCacher *oc; - Objecter::OSDRead *rd; - inodeno_t ino; - Context *onfinish; - public: - C_RetryRead(ObjectCacher *_oc, Objecter::OSDRead *r, inodeno_t i, Context *c) : oc(_oc), rd(r), ino(i), onfinish(c) {} - void finish(int) { - int r = oc->readx(rd, ino, onfinish); - if (r > 0) { - onfinish->finish(r); - delete onfinish; - } - } - }; - - // non-blocking. async. - int readx(Objecter::OSDRead *rd, inodeno_t ino, Context *onfinish); - int writex(Objecter::OSDWrite *wr, inodeno_t ino); - - // write blocking - void wait_for_write(size_t len, Mutex& lock); - - // blocking. atomic+sync. - int atomic_sync_readx(Objecter::OSDRead *rd, inodeno_t ino, Mutex& lock); - int atomic_sync_writex(Objecter::OSDWrite *wr, inodeno_t ino, Mutex& lock); - - bool set_is_cached(inodeno_t ino); - bool set_is_dirty_or_committing(inodeno_t ino); - - bool flush_set(inodeno_t ino, Context *onfinish=0); - void flush_all(Context *onfinish=0); - - bool commit_set(inodeno_t ino, Context *oncommit); - void commit_all(Context *oncommit=0); - - void purge_set(inodeno_t ino); - - off_t release_set(inodeno_t ino); // returns # of bytes not released (ie non-clean) - - void truncate_set(inodeno_t ino, list& ex); - - void kick_sync_writers(inodeno_t ino); - void kick_sync_readers(inodeno_t ino); - - - // file functions - - /*** async+caching (non-blocking) file interface ***/ - int file_read(inode_t& inode, - off_t offset, size_t len, - bufferlist *bl, - Context *onfinish) { - Objecter::OSDRead *rd = new Objecter::OSDRead(bl); - filer.file_to_extents(inode, offset, len, rd->extents); - return readx(rd, inode.ino, onfinish); - } - - int file_write(inode_t& inode, - off_t offset, size_t len, - bufferlist& bl, - objectrev_t rev=0) { - Objecter::OSDWrite *wr = new Objecter::OSDWrite(bl); - filer.file_to_extents(inode, offset, len, wr->extents); - return writex(wr, 
inode.ino); - } - - - - /*** sync+blocking file interface ***/ - - int file_atomic_sync_read(inode_t& inode, - off_t offset, size_t len, - bufferlist *bl, - Mutex &lock) { - Objecter::OSDRead *rd = new Objecter::OSDRead(bl); - filer.file_to_extents(inode, offset, len, rd->extents); - return atomic_sync_readx(rd, inode.ino, lock); - } - - int file_atomic_sync_write(inode_t& inode, - off_t offset, size_t len, - bufferlist& bl, - Mutex &lock, - objectrev_t rev=0) { - Objecter::OSDWrite *wr = new Objecter::OSDWrite(bl); - filer.file_to_extents(inode, offset, len, wr->extents); - return atomic_sync_writex(wr, inode.ino, lock); - } - -}; - - -inline ostream& operator<<(ostream& out, ObjectCacher::BufferHead &bh) -{ - out << "bh[" - << bh.start() << "~" << bh.length() - << " (" << bh.bl.length() << ")" - << " v " << bh.last_write_tid; - if (bh.is_tx()) out << " tx"; - if (bh.is_rx()) out << " rx"; - if (bh.is_dirty()) out << " dirty"; - if (bh.is_clean()) out << " clean"; - if (bh.is_missing()) out << " missing"; - if (bh.bl.length() > 0) out << " firstbyte=" << (int)bh.bl[0]; - out << "]"; - return out; -} - -inline ostream& operator<<(ostream& out, ObjectCacher::Object &ob) -{ - out << "object[" - << hex << ob.get_oid() << " ino " << ob.get_ino() << dec - << " wr " << ob.last_write_tid << "/" << ob.last_ack_tid << "/" << ob.last_commit_tid; - - switch (ob.lock_state) { - case ObjectCacher::Object::LOCK_WRLOCKING: out << " wrlocking"; break; - case ObjectCacher::Object::LOCK_WRLOCK: out << " wrlock"; break; - case ObjectCacher::Object::LOCK_WRUNLOCKING: out << " wrunlocking"; break; - case ObjectCacher::Object::LOCK_RDLOCKING: out << " rdlocking"; break; - case ObjectCacher::Object::LOCK_RDLOCK: out << " rdlock"; break; - case ObjectCacher::Object::LOCK_RDUNLOCKING: out << " rdunlocking"; break; - } - - out << "]"; - return out; -} - -#endif diff --git a/branches/sage/mds/osdc/Objecter.cc b/branches/sage/mds/osdc/Objecter.cc deleted file mode 100644 index 
84563b0af9720..0000000000000 --- a/branches/sage/mds/osdc/Objecter.cc +++ /dev/null @@ -1,913 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "Objecter.h" -#include "osd/OSDMap.h" -#include "mon/MonMap.h" - -#include "msg/Messenger.h" -#include "msg/Message.h" - -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" - -#include "messages/MOSDFailure.h" - -#include - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) *_dout << dbeginl << g_clock.now() << " " << messenger->get_myname() << ".objecter " -#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) *_derr << dbeginl << g_clock.now() << " " << messenger->get_myname() << ".objecter " - - -// messages ------------------------------ - -void Objecter::init() -{ - assert(client_lock.is_locked()); // otherwise event cancellation is unsafe - timer.add_event_after(g_conf.objecter_tick_interval, new C_Tick(this)); -} - -void Objecter::shutdown() -{ - assert(client_lock.is_locked()); // otherwise event cancellation is unsafe - timer.cancel_all(); -} - - -void Objecter::dispatch(Message *m) -{ - switch (m->get_type()) { - case MSG_OSD_OPREPLY: - handle_osd_op_reply((MOSDOpReply*)m); - break; - - case MSG_OSD_MAP: - handle_osd_map((MOSDMap*)m); - break; - - default: - dout(1) << "don't know message type " << m->get_type() << dendl; - assert(0); - } -} - -void Objecter::handle_osd_map(MOSDMap *m) -{ - assert(osdmap); - - if (m->get_last() <= osdmap->get_epoch()) { - dout(3) << "handle_osd_map ignoring epochs [" - 
<< m->get_first() << "," << m->get_last() - << "] <= " << osdmap->get_epoch() << dendl; - } - else { - dout(3) << "handle_osd_map got epochs [" - << m->get_first() << "," << m->get_last() - << "] > " << osdmap->get_epoch() - << dendl; - - set changed_pgs; - - for (epoch_t e = osdmap->get_epoch() + 1; - e <= m->get_last(); - e++) { - if (m->incremental_maps.count(e)) { - dout(3) << "handle_osd_map decoding incremental epoch " << e << dendl; - OSDMap::Incremental inc; - int off = 0; - inc.decode(m->incremental_maps[e], off); - osdmap->apply_incremental(inc); - - // notify messenger - for (map >::iterator i = inc.new_down.begin(); - i != inc.new_down.end(); - i++) - messenger->mark_down(i->second.first.addr); - - } - else if (m->maps.count(e)) { - dout(3) << "handle_osd_map decoding full epoch " << e << dendl; - osdmap->decode(m->maps[e]); - } - else { - dout(3) << "handle_osd_map requesting missing epoch " << osdmap->get_epoch()+1 << dendl; - int mon = monmap->pick_mon(); - messenger->send_message(new MOSDGetMap(osdmap->get_epoch()+1), - monmap->get_inst(mon)); - break; - } - - // scan pgs for changes - scan_pgs(changed_pgs); - - assert(e == osdmap->get_epoch()); - } - - // kick requests who might be timing out on the wrong osds - if (!changed_pgs.empty()) - kick_requests(changed_pgs); - } - - delete m; -} - - -void Objecter::maybe_request_map() -{ - utime_t now; - if (!osdmap) goto yes; - if (last_epoch_requested <= osdmap->get_epoch()) goto yes; - now = g_clock.now(); - if (now - last_epoch_requested_stamp > g_conf.objecter_map_request_interval) goto yes; - return; - - yes: - dout(10) << "maybe_request_map requesting next osd map" << dendl; - last_epoch_requested_stamp = now; - last_epoch_requested = osdmap->get_epoch()+1; - messenger->send_message(new MOSDGetMap(osdmap->get_epoch(), last_epoch_requested), - monmap->get_inst(monmap->pick_mon())); -} - - - -void Objecter::scan_pgs(set& changed_pgs) -{ - dout(10) << "scan_pgs" << dendl; - - for (hash_map::iterator i 
= pg_map.begin(); - i != pg_map.end(); - i++) { - pg_t pgid = i->first; - PG& pg = i->second; - - // calc new. - vector other; - osdmap->pg_to_acting_osds(pgid, other); - - if (other == pg.acting) - continue; // no change. - - other.swap(pg.acting); - - if (g_conf.osd_rep == OSD_REP_PRIMARY) { - // same primary? - if (!other.empty() && - !pg.acting.empty() && - other[0] == pg.acting[0]) - continue; - } - else if (g_conf.osd_rep == OSD_REP_SPLAY) { - // same primary and acker? - if (!other.empty() && - !pg.acting.empty() && - other[0] == pg.acting[0] && - other[other.size() > 1 ? 1:0] == pg.acting[pg.acting.size() > 1 ? 1:0]) - continue; - } - else if (g_conf.osd_rep == OSD_REP_CHAIN) { - // any change is significant. - } - - // changed significantly. - dout(10) << "scan_pgs pg " << pgid - << " (" << pg.active_tids << ")" - << " " << other << " -> " << pg.acting - << dendl; - changed_pgs.insert(pgid); - } -} - -void Objecter::kick_requests(set& changed_pgs) -{ - dout(10) << "kick_requests in pgs " << changed_pgs << dendl; - - for (set::iterator i = changed_pgs.begin(); - i != changed_pgs.end(); - i++) { - pg_t pgid = *i; - PG& pg = pg_map[pgid]; - - // resubmit ops! 
- set tids; - tids.swap( pg.active_tids ); - close_pg( pgid ); // will pbly reopen, unless it's just commits we're missing - - for (set::iterator p = tids.begin(); - p != tids.end(); - p++) { - tid_t tid = *p; - - if (op_modify.count(tid)) { - OSDModify *wr = op_modify[tid]; - op_modify.erase(tid); - - // WRITE - if (wr->tid_version.count(tid)) { - if (wr->op == OSD_OP_WRITE && - !g_conf.objecter_buffer_uncommitted) { - dout(0) << "kick_requests missing commit, cannot replay: objecter_buffer_uncommitted == FALSE" << dendl; - } else { - dout(3) << "kick_requests missing commit, replay write " << tid - << " v " << wr->tid_version[tid] << dendl; - modifyx_submit(wr, wr->waitfor_commit[tid], tid); - } - } - else if (wr->waitfor_ack.count(tid)) { - dout(3) << "kick_requests missing ack, resub write " << tid << dendl; - modifyx_submit(wr, wr->waitfor_ack[tid], tid); - } - } - - else if (op_read.count(tid)) { - // READ - OSDRead *rd = op_read[tid]; - op_read.erase(tid); - dout(3) << "kick_requests resub read " << tid << dendl; - - // resubmit - readx_submit(rd, rd->ops[tid], true); - rd->ops.erase(tid); - } - - else if (op_stat.count(tid)) { - OSDStat *st = op_stat[tid]; - op_stat.erase(tid); - - dout(3) << "kick_requests resub stat " << tid << dendl; - - // resubmit - stat_submit(st); - } - - else - assert(0); - } - } -} - - -void Objecter::tick() -{ - dout(10) << "tick" << dendl; - - // look for laggy pgs - utime_t cutoff = g_clock.now(); - cutoff -= g_conf.objecter_timeout; // timeout - for (hash_map::iterator i = pg_map.begin(); - i != pg_map.end(); - i++) { - if (!i->second.active_tids.empty() && - i->second.last < cutoff) { - dout(10) << "tick pg " << i->first << " is laggy" << dendl; - maybe_request_map(); - break; - } - } - - // reschedule - timer.add_event_after(g_conf.objecter_tick_interval, new C_Tick(this)); -} - - - -void Objecter::handle_osd_op_reply(MOSDOpReply *m) -{ - // read or modify? 
- switch (m->get_op()) { - case OSD_OP_READ: - handle_osd_read_reply(m); - break; - - case OSD_OP_STAT: - handle_osd_stat_reply(m); - break; - - case OSD_OP_WRNOOP: - case OSD_OP_WRITE: - case OSD_OP_ZERO: - case OSD_OP_DELETE: - case OSD_OP_WRUNLOCK: - case OSD_OP_WRLOCK: - case OSD_OP_RDLOCK: - case OSD_OP_RDUNLOCK: - case OSD_OP_UPLOCK: - case OSD_OP_DNLOCK: - handle_osd_modify_reply(m); - break; - - default: - assert(0); - } -} - - - -// stat ----------------------------------- - -tid_t Objecter::stat(object_t oid, off_t *size, ObjectLayout ol, Context *onfinish) -{ - OSDStat *st = new OSDStat(size); - st->extents.push_back(ObjectExtent(oid, 0, 0)); - st->extents.front().layout = ol; - st->onfinish = onfinish; - - return stat_submit(st); -} - -tid_t Objecter::stat_submit(OSDStat *st) -{ - // find OSD - ObjectExtent &ex = st->extents.front(); - PG &pg = get_pg( ex.layout.pgid ); - - // pick tid - last_tid++; - assert(client_inc >= 0); - - // add to gather set - st->tid = last_tid; - op_stat[last_tid] = st; - - pg.active_tids.insert(last_tid); - - // send? - - dout(10) << "stat_submit " << st << " tid " << last_tid - << " oid " << ex.oid - << " " << ex.layout - << " osd" << pg.acker() - << dendl; - - if (pg.acker() >= 0) { - MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, last_tid, - ex.oid, ex.layout, osdmap->get_epoch(), - OSD_OP_STAT); - - messenger->send_message(m, osdmap->get_inst(pg.acker())); - } - - return last_tid; -} - -void Objecter::handle_osd_stat_reply(MOSDOpReply *m) -{ - // get pio - tid_t tid = m->get_tid(); - - if (op_stat.count(tid) == 0) { - dout(7) << "handle_osd_stat_reply " << tid << " ... 
stray" << dendl; - delete m; - return; - } - - dout(7) << "handle_osd_stat_reply " << tid - << " r=" << m->get_result() - << " size=" << m->get_object_size() - << dendl; - OSDStat *st = op_stat[ tid ]; - op_stat.erase( tid ); - - // remove from osd/tid maps - PG& pg = get_pg( m->get_pg() ); - assert(pg.active_tids.count(tid)); - pg.active_tids.erase(tid); - if (pg.active_tids.empty()) close_pg( m->get_pg() ); - - // success? - if (m->get_result() == -EAGAIN) { - dout(7) << " got -EAGAIN, resubmitting" << dendl; - stat_submit(st); - delete m; - return; - } - //assert(m->get_result() >= 0); - - // ok! - if (m->get_result() < 0) { - *st->size = -1; - } else { - *st->size = m->get_object_size(); - } - - // finish, clean up - Context *onfinish = st->onfinish; - - // done - delete st; - if (onfinish) { - onfinish->finish(m->get_result()); - delete onfinish; - } - - delete m; -} - - -// read ----------------------------------- - - -tid_t Objecter::read(object_t oid, off_t off, size_t len, ObjectLayout ol, bufferlist *bl, - Context *onfinish) -{ - OSDRead *rd = new OSDRead(bl); - rd->extents.push_back(ObjectExtent(oid, off, len)); - rd->extents.front().layout = ol; - readx(rd, onfinish); - return last_tid; -} - - -tid_t Objecter::readx(OSDRead *rd, Context *onfinish) -{ - rd->onfinish = onfinish; - - // issue reads - for (list::iterator it = rd->extents.begin(); - it != rd->extents.end(); - it++) - readx_submit(rd, *it); - - return last_tid; -} - -tid_t Objecter::readx_submit(OSDRead *rd, ObjectExtent &ex, bool retry) -{ - // find OSD - PG &pg = get_pg( ex.layout.pgid ); - - // pick tid - last_tid++; - assert(client_inc >= 0); - - // add to gather set - rd->ops[last_tid] = ex; - op_read[last_tid] = rd; - - pg.active_tids.insert(last_tid); - pg.last = g_clock.now(); - - // send? 
- dout(10) << "readx_submit " << rd << " tid " << last_tid - << " oid " << ex.oid << " " << ex.start << "~" << ex.length - << " (" << ex.buffer_extents.size() << " buffer fragments)" - << " " << ex.layout - << " osd" << pg.acker() - << dendl; - - if (pg.acker() >= 0) { - MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, last_tid, - ex.oid, ex.layout, osdmap->get_epoch(), - OSD_OP_READ); - m->set_length(ex.length); - m->set_offset(ex.start); - m->set_retry_attempt(retry); - - int who = pg.acker(); - if (rd->balance_reads) { - int replica = messenger->get_myname().num() % pg.acting.size(); - who = pg.acting[replica]; - dout(-10) << "readx_submit reading from random replica " << replica - << " = osd" << who << dendl; - } - messenger->send_message(m, osdmap->get_inst(who)); - } else - maybe_request_map(); - - return last_tid; -} - - -void Objecter::handle_osd_read_reply(MOSDOpReply *m) -{ - // get pio - tid_t tid = m->get_tid(); - - if (op_read.count(tid) == 0) { - dout(7) << "handle_osd_read_reply " << tid << " ... stray" << dendl; - delete m; - return; - } - - dout(7) << "handle_osd_read_reply " << tid << dendl; - OSDRead *rd = op_read[ tid ]; - op_read.erase( tid ); - - // remove from osd/tid maps - PG& pg = get_pg( m->get_pg() ); - assert(pg.active_tids.count(tid)); - pg.active_tids.erase(tid); - if (pg.active_tids.empty()) close_pg( m->get_pg() ); - - // our op finished - rd->ops.erase(tid); - - // success? - if (m->get_result() == -EAGAIN) { - dout(7) << " got -EAGAIN, resubmitting" << dendl; - readx_submit(rd, rd->ops[tid], true); - delete m; - return; - } - //assert(m->get_result() >= 0); - - // what buffer offset are we? 
- dout(7) << " got frag from " << m->get_oid() << " " - << m->get_offset() << "~" << m->get_length() - << ", still have " << rd->ops.size() << " more ops" << dendl; - - if (rd->ops.empty()) { - // all done - size_t bytes_read = 0; - - if (rd->read_data.size()) { - dout(15) << " assembling frags" << dendl; - - /** FIXME This doesn't handle holes efficiently. - * It allocates zero buffers to fill whole buffer, and - * then discards trailing ones at the end. - * - * Actually, this whole thing is pretty messy with temporary bufferlist*'s all over - * the heap. - */ - - // we have other fragments, assemble them all... blech! - rd->read_data[m->get_oid()] = new bufferlist; - rd->read_data[m->get_oid()]->claim( m->get_data() ); - - // map extents back into buffer - map by_off; // buffer offset -> bufferlist - - // for each object extent... - for (list::iterator eit = rd->extents.begin(); - eit != rd->extents.end(); - eit++) { - bufferlist *ox_buf = rd->read_data[eit->oid]; - unsigned ox_len = ox_buf->length(); - unsigned ox_off = 0; - assert(ox_len <= eit->length); - - // for each buffer extent we're mapping into... 
- for (map::iterator bit = eit->buffer_extents.begin(); - bit != eit->buffer_extents.end(); - bit++) { - dout(21) << " object " << eit->oid << " extent " << eit->start << "~" << eit->length << " : ox offset " << ox_off << " -> buffer extent " << bit->first << "~" << bit->second << dendl; - by_off[bit->first] = new bufferlist; - - if (ox_off + bit->second <= ox_len) { - // we got the whole bx - by_off[bit->first]->substr_of(*ox_buf, ox_off, bit->second); - if (bytes_read < bit->first + bit->second) - bytes_read = bit->first + bit->second; - } else if (ox_off + bit->second > ox_len && ox_off < ox_len) { - // we got part of this bx - by_off[bit->first]->substr_of(*ox_buf, ox_off, (ox_len-ox_off)); - if (bytes_read < bit->first + ox_len-ox_off) - bytes_read = bit->first + ox_len-ox_off; - - // zero end of bx - dout(21) << " adding some zeros to the end " << ox_off + bit->second-ox_len << dendl; - bufferptr z(ox_off + bit->second - ox_len); - z.zero(); - by_off[bit->first]->append( z ); - } else { - // we got none of this bx. zero whole thing. - assert(ox_off >= ox_len); - dout(21) << " adding all zeros for this bit " << bit->second << dendl; - bufferptr z(bit->second); - z.zero(); - by_off[bit->first]->append( z ); - } - ox_off += bit->second; - } - assert(ox_off == eit->length); - } - - // sort and string bits together - for (map::iterator it = by_off.begin(); - it != by_off.end(); - it++) { - assert(it->second->length()); - if (it->first < (off_t)bytes_read) { - dout(21) << " concat buffer frag off " << it->first << " len " << it->second->length() << dendl; - rd->bl->claim_append(*(it->second)); - } else { - dout(21) << " NO concat zero buffer frag off " << it->first << " len " << it->second->length() << dendl; - } - delete it->second; - } - - // trim trailing zeros? - if (rd->bl->length() > bytes_read) { - dout(10) << " trimming off trailing zeros . 
bytes_read=" << bytes_read - << " len=" << rd->bl->length() << dendl; - rd->bl->splice(bytes_read, rd->bl->length() - bytes_read); - assert(bytes_read == rd->bl->length()); - } - - // hose p->read_data bufferlist*'s - for (map::iterator it = rd->read_data.begin(); - it != rd->read_data.end(); - it++) { - delete it->second; - } - } else { - dout(15) << " only one frag" << dendl; - - // only one fragment, easy - rd->bl->claim( m->get_data() ); - bytes_read = rd->bl->length(); - } - - // finish, clean up - Context *onfinish = rd->onfinish; - - dout(7) << " " << bytes_read << " bytes " - << rd->bl->length() - << dendl; - - // done - delete rd; - if (onfinish) { - onfinish->finish(bytes_read);// > 0 ? bytes_read:m->get_result()); - delete onfinish; - } - } else { - // store my bufferlist for later assembling - rd->read_data[m->get_oid()] = new bufferlist; - rd->read_data[m->get_oid()]->claim( m->get_data() ); - } - - delete m; -} - - - -// write ------------------------------------ - -tid_t Objecter::write(object_t oid, off_t off, size_t len, ObjectLayout ol, bufferlist &bl, - Context *onack, Context *oncommit) -{ - OSDWrite *wr = new OSDWrite(bl); - wr->extents.push_back(ObjectExtent(oid, off, len)); - wr->extents.front().layout = ol; - wr->extents.front().buffer_extents[0] = len; - modifyx(wr, onack, oncommit); - return last_tid; -} - - -// zero - -tid_t Objecter::zero(object_t oid, off_t off, size_t len, ObjectLayout ol, - Context *onack, Context *oncommit) -{ - OSDModify *z = new OSDModify(OSD_OP_ZERO); - z->extents.push_back(ObjectExtent(oid, off, len)); - z->extents.front().layout = ol; - modifyx(z, onack, oncommit); - return last_tid; -} - - -// lock ops - -tid_t Objecter::lock(int op, object_t oid, ObjectLayout ol, - Context *onack, Context *oncommit) -{ - OSDModify *l = new OSDModify(op); - l->extents.push_back(ObjectExtent(oid, 0, 0)); - l->extents.front().layout = ol; - modifyx(l, onack, oncommit); - return last_tid; -} - - - -// generic modify 
----------------------------------- - -tid_t Objecter::modifyx(OSDModify *wr, Context *onack, Context *oncommit) -{ - wr->onack = onack; - wr->oncommit = oncommit; - - // issue writes/whatevers - for (list::iterator it = wr->extents.begin(); - it != wr->extents.end(); - it++) - modifyx_submit(wr, *it); - - return last_tid; -} - - -tid_t Objecter::modifyx_submit(OSDModify *wr, ObjectExtent &ex, tid_t usetid) -{ - // find - PG &pg = get_pg( ex.layout.pgid ); - - // pick tid - tid_t tid; - if (usetid > 0) - tid = usetid; - else - tid = ++last_tid; - assert(client_inc >= 0); - - // add to gather set - wr->waitfor_ack[tid] = ex; - wr->waitfor_commit[tid] = ex; - op_modify[tid] = wr; - pg.active_tids.insert(tid); - pg.last = g_clock.now(); - - ++num_unacked; - ++num_uncommitted; - - // send? - dout(10) << "modifyx_submit " << MOSDOp::get_opname(wr->op) << " tid " << tid - << " oid " << ex.oid - << " " << ex.start << "~" << ex.length - << " " << ex.layout - << " osd" << pg.primary() - << dendl; - if (pg.primary() >= 0) { - MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, tid, - ex.oid, ex.layout, osdmap->get_epoch(), - wr->op); - m->set_length(ex.length); - m->set_offset(ex.start); - if (usetid > 0) - m->set_retry_attempt(true); - - if (wr->tid_version.count(tid)) - m->set_version(wr->tid_version[tid]); // we're replaying this op! - - // what type of op? 
- switch (wr->op) { - case OSD_OP_WRITE: - { - // map buffer segments into this extent - // (may be fragmented bc of striping) - bufferlist cur; - for (map::iterator bit = ex.buffer_extents.begin(); - bit != ex.buffer_extents.end(); - bit++) - ((OSDWrite*)wr)->bl.copy(bit->first, bit->second, cur); - assert(cur.length() == ex.length); - m->set_data(cur);//.claim(cur); - } - break; - } - - messenger->send_message(m, osdmap->get_inst(pg.primary())); - } else - maybe_request_map(); - - dout(5) << num_unacked << " unacked, " << num_uncommitted << " uncommitted" << dendl; - - return tid; -} - - - -void Objecter::handle_osd_modify_reply(MOSDOpReply *m) -{ - // get pio - tid_t tid = m->get_tid(); - - if (op_modify.count(tid) == 0) { - dout(7) << "handle_osd_modify_reply " << tid - << (m->get_commit() ? " commit":" ack") - << " ... stray" << dendl; - delete m; - return; - } - - dout(7) << "handle_osd_modify_reply " << tid - << (m->get_commit() ? " commit":" ack") - << " v " << m->get_version() - << dendl; - OSDModify *wr = op_modify[ tid ]; - - Context *onack = 0; - Context *oncommit = 0; - - PG &pg = get_pg( m->get_pg() ); - - // ignore? - if (pg.acker() != m->get_source().num()) { - dout(7) << " ignoring ack|commit from non-acker" << dendl; - delete m; - return; - } - - assert(m->get_result() >= 0); - - // ack or commit? - if (m->get_commit()) { - //dout(15) << " handle_osd_write_reply commit on " << tid << dendl; - assert(wr->tid_version.count(tid) == 0 || - m->get_version() == wr->tid_version[tid]); - - // remove from tid/osd maps - assert(pg.active_tids.count(tid)); - pg.active_tids.erase(tid); - if (pg.active_tids.empty()) close_pg( m->get_pg() ); - - // commit. - op_modify.erase( tid ); - wr->waitfor_ack.erase(tid); - wr->waitfor_commit.erase(tid); - - num_uncommitted--; - - if (wr->waitfor_commit.empty()) { - onack = wr->onack; - oncommit = wr->oncommit; - delete wr; - } - } else { - // ack. 
- //dout(15) << " handle_osd_write_reply ack on " << tid << dendl; - assert(wr->waitfor_ack.count(tid)); - wr->waitfor_ack.erase(tid); - - num_unacked--; - - if (wr->tid_version.count(tid) && - wr->tid_version[tid].version != m->get_version().version) { - dout(-10) << "handle_osd_modify_reply WARNING: replay of tid " << tid - << " did not achieve previous ordering" << dendl; - } - wr->tid_version[tid] = m->get_version(); - - if (wr->waitfor_ack.empty()) { - onack = wr->onack; - wr->onack = 0; // only do callback once - - // buffer uncommitted? - if (!g_conf.objecter_buffer_uncommitted && - wr->op == OSD_OP_WRITE) { - // discard buffer! - ((OSDWrite*)wr)->bl.clear(); - } - } - } - - dout(5) << num_unacked << " unacked, " << num_uncommitted << " uncommitted" << dendl; - - // do callbacks - if (onack) { - onack->finish(0); - delete onack; - } - if (oncommit) { - oncommit->finish(0); - delete oncommit; - } - - delete m; -} - - - -void Objecter::ms_handle_failure(Message *m, entity_name_t dest, const entity_inst_t& inst) -{ - if (dest.is_mon()) { - // try a new mon - int mon = monmap->pick_mon(true); - dout(0) << "ms_handle_failure " << dest << " inst " << inst - << ", resending to mon" << mon - << dendl; - messenger->send_message(m, monmap->get_inst(mon)); - } - else if (dest.is_osd()) { - int mon = monmap->pick_mon(); - dout(0) << "ms_handle_failure " << dest << " inst " << inst - << ", dropping and reporting to mon" << mon - << dendl; - messenger->send_message(new MOSDFailure(messenger->get_myinst(), inst, osdmap->get_epoch()), - monmap->get_inst(mon)); - delete m; - } else { - dout(0) << "ms_handle_failure " << dest << " inst " << inst - << ", dropping" << dendl; - delete m; - } -} diff --git a/branches/sage/mds/osdc/Objecter.h b/branches/sage/mds/osdc/Objecter.h deleted file mode 100644 index 82a437aa04f8d..0000000000000 --- a/branches/sage/mds/osdc/Objecter.h +++ /dev/null @@ -1,230 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
-// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __OBJECTER_H -#define __OBJECTER_H - -#include "include/types.h" -#include "include/buffer.h" - -#include "osd/OSDMap.h" -#include "messages/MOSDOp.h" - -#include "common/Timer.h" - -#include -#include -#include -using namespace std; -using namespace __gnu_cxx; - -class Context; -class Messenger; -class OSDMap; -class MonMap; -class Message; - -class Objecter { - public: - Messenger *messenger; - MonMap *monmap; - OSDMap *osdmap; - - private: - tid_t last_tid; - int client_inc; - int num_unacked; - int num_uncommitted; - - epoch_t last_epoch_requested; - utime_t last_epoch_requested_stamp; - - void maybe_request_map(); - - Mutex &client_lock; - SafeTimer timer; - - class C_Tick : public Context { - Objecter *ob; - public: - C_Tick(Objecter *o) : ob(o) {} - void finish(int r) { ob->tick(); } - }; - void tick(); - - - /*** track pending operations ***/ - // read - public: - class OSDOp { - public: - list extents; - virtual ~OSDOp() {} - }; - - class OSDRead : public OSDOp { - public: - bufferlist *bl; - Context *onfinish; - map ops; - map read_data; // bits of data as they come back - int balance_reads; // if non-zero, direct reads to a pseudo-random replica - - OSDRead(bufferlist *b) : bl(b), onfinish(0), balance_reads(0) { - bl->clear(); - } - }; - - class OSDStat : public OSDOp { - public: - tid_t tid; - off_t *size; // where the size goes. 
- Context *onfinish; - OSDStat(off_t *s) : tid(0), size(s), onfinish(0) { } - }; - - // generic modify - class OSDModify : public OSDOp { - public: - int op; - list extents; - Context *onack; - Context *oncommit; - map waitfor_ack; - map tid_version; - map waitfor_commit; - - OSDModify(int o) : op(o), onack(0), oncommit(0) {} - }; - - // write (includes the bufferlist) - class OSDWrite : public OSDModify { - public: - bufferlist bl; - OSDWrite(bufferlist &b) : OSDModify(OSD_OP_WRITE), bl(b) {} - }; - - - - private: - // pending ops - hash_map op_stat; - hash_map op_read; - hash_map op_modify; - - /** - * track pending ops by pg - * ...so we can cope with failures, map changes - */ - class PG { - public: - vector acting; - set active_tids; // active ops - utime_t last; - - PG() {} - - // primary - where i write - int primary() { - if (acting.empty()) return -1; - return acting[0]; - } - // acker - where i read, and receive acks from - int acker() { - if (acting.empty()) return -1; - if (g_conf.osd_rep == OSD_REP_PRIMARY) - return acting[0]; - else - return acting[acting.size() > 1 ? 
1:0]; - } - }; - - hash_map pg_map; - - - PG &get_pg(pg_t pgid) { - if (!pg_map.count(pgid)) - osdmap->pg_to_acting_osds(pgid, pg_map[pgid].acting); - return pg_map[pgid]; - } - void close_pg(pg_t pgid) { - assert(pg_map.count(pgid)); - assert(pg_map[pgid].active_tids.empty()); - pg_map.erase(pgid); - } - void scan_pgs(set& chnaged_pgs); - void kick_requests(set& changed_pgs); - - - public: - Objecter(Messenger *m, MonMap *mm, OSDMap *om, Mutex& l) : - messenger(m), monmap(mm), osdmap(om), - last_tid(0), client_inc(-1), - num_unacked(0), num_uncommitted(0), - last_epoch_requested(0), - client_lock(l), timer(l) - { } - ~Objecter() { } - - void init(); - void shutdown(); - - // messages - public: - void dispatch(Message *m); - void handle_osd_op_reply(class MOSDOpReply *m); - void handle_osd_stat_reply(class MOSDOpReply *m); - void handle_osd_read_reply(class MOSDOpReply *m); - void handle_osd_modify_reply(class MOSDOpReply *m); - void handle_osd_lock_reply(class MOSDOpReply *m); - void handle_osd_map(class MOSDMap *m); - - private: - tid_t readx_submit(OSDRead *rd, ObjectExtent& ex, bool retry=false); - tid_t modifyx_submit(OSDModify *wr, ObjectExtent& ex, tid_t tid=0); - tid_t stat_submit(OSDStat *st); - - // public interface - public: - bool is_active() { - return !(op_read.empty() && op_modify.empty()); - } - - int get_client_incarnation() { return client_inc; } - void set_client_incarnation(int inc) { - client_inc = inc; - } - - // med level - tid_t readx(OSDRead *read, Context *onfinish); - tid_t modifyx(OSDModify *wr, Context *onack, Context *oncommit); - //tid_t lockx(OSDLock *l, Context *onack, Context *oncommit); - - // even lazier - tid_t read(object_t oid, off_t off, size_t len, ObjectLayout ol, bufferlist *bl, - Context *onfinish); - tid_t write(object_t oid, off_t off, size_t len, ObjectLayout ol, bufferlist &bl, - Context *onack, Context *oncommit); - tid_t zero(object_t oid, off_t off, size_t len, ObjectLayout ol, - Context *onack, Context *oncommit); 
- tid_t stat(object_t oid, off_t *size, ObjectLayout ol, Context *onfinish); - - tid_t lock(int op, object_t oid, ObjectLayout ol, Context *onack, Context *oncommit); - - - void ms_handle_failure(Message *m, entity_name_t dest, const entity_inst_t& inst); - -}; - -#endif diff --git a/branches/sage/mds/script/add_header.pl b/branches/sage/mds/script/add_header.pl deleted file mode 100755 index 023c06e455fd1..0000000000000 --- a/branches/sage/mds/script/add_header.pl +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/perl - -use strict; -my $fn = shift @ARGV; -my $old = `cat $fn`; - -my $header = `cat doc/header.txt`; - -# strip existing header -my $new = $old; -if ($new =~ /^(.*)\* Ceph - scalable distributed file system/s) { - my ($a,@b) = split(/\*\/\n/, $new); - $new = join("*/\n",@b); -} -$new = $header . $new; - -if ($new ne $old) { - open(O, ">$fn.new"); - print O $new; - close O; - system "diff $fn $fn.new"; - rename "$fn.new", $fn; - #unlink "$fn.new"; - -} - diff --git a/branches/sage/mds/script/adjusttabs.pl b/branches/sage/mds/script/adjusttabs.pl deleted file mode 100755 index 66edff2ac6c02..0000000000000 --- a/branches/sage/mds/script/adjusttabs.pl +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/perl - -my $tablen = shift @ARGV; -my $fn = shift @ARGV; - -my $tab = ' ' x $tablen; -open(I, $fn); -my $f; -my $oldtab = ' ' x 4; -while () { - if (my ($oldlen) = /\-\*\- .*tab-width:(\d)/) { - print "old length was $oldlen\n"; - $oldtab = ' ' x $oldlen; - s/tab-width:\d/tab-width:$tablen/; - } - s/\t/$oldtab/g; - $f .= $_; -} -close I; -open(O, ">$fn.new"); -print O $f; -close O; - -rename "$fn.new", $fn; diff --git a/branches/sage/mds/script/check_cache_dumps.pl b/branches/sage/mds/script/check_cache_dumps.pl deleted file mode 100755 index 95bd28a474991..0000000000000 --- a/branches/sage/mds/script/check_cache_dumps.pl +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/perl - -my $epoch = shift || die "specify epoch"; - -my %auth; # mds -> id -> replica -> nonce -my %replica; # 
mds -> id -> auth -> nonce - -print "reading\n"; -for (my $i=0; -e "cachedump.$epoch.mds$i"; $i++) { - open(O,"cachedump.$epoch.mds$i"); - while () { - my ($name,$s); - ($name,$s) = /^\[(inode \d+) \S+ (\S+)/; - ($name,$s) = /^\[(dir \d+) \S+ (\S+)/ unless $name; - ($name,$s) = /^\[dentry (\S+) (\S+)/ unless $name; - if ($name) { - if ($s =~ /^auth/) { - $auth{$i}->{$name} = {}; - my ($rl) = $s =~ /\{(.*)\}/; - for my $r (split(/,/,$rl)) { - my ($who,$nonce) = $r =~ /(\d+)\=(\d+)/; - $auth{$i}->{$name}->{$who} = $nonce; - #print "auth $name rep by $who $nonce $s\n"; - } - } - else { - my ($a,$b,$n) = $s =~ /rep@(\d+)\,([\-\d]+)\.(\d+)/; - die $_ unless $a >= 0; - $replica{$i}->{$name}->{$a} = $n; - if ($b >= 0) { - $replica{$i}->{$name}->{$b} = $n; - } - } - } - } -} - -print "verifying replicas\n"; -for my $mds (keys %replica) { - for my $name (keys %{$replica{$mds}}) { - for my $auth (keys %{$replica{$mds}->{$name}}) { - if ($auth{$auth}->{$name}->{$mds}) { - if ($auth{$auth}->{$name}->{$mds} < $replica{$mds}->{$name}->{$auth}) { - print "problem: mds$mds has $name from mds$auth nonce $replica{$mds}->{$name}->{$auth}, auth has nonce $auth{$auth}->{$name}->{$mds}\n"; - } else { - print "ok: mds$mds has $name from mds$auth nonce $replica{$mds}->{$name}->{$auth}, auth has nonce $auth{$auth}->{$name}->{$mds}\n"; - } - } else { - print "??: mds$mds has $name from mds$auth nonce $replica{$mds}->{$name}->{$auth}, auth has no nonce\n"; - } - - } - } -} - - diff --git a/branches/sage/mds/script/clean_osd_cow.sh b/branches/sage/mds/script/clean_osd_cow.sh deleted file mode 100755 index 1e443c95e7ebc..0000000000000 --- a/branches/sage/mds/script/clean_osd_cow.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh - -rm osddata/*/*\.* diff --git a/branches/sage/mds/script/clean_trace.pl b/branches/sage/mds/script/clean_trace.pl deleted file mode 100755 index cb02ff7abe7c2..0000000000000 --- a/branches/sage/mds/script/clean_trace.pl +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/perl - -my 
$n = 0; -while (<>) { - next unless /trace: /; - my $l = $'; $'; - print $l; -} diff --git a/branches/sage/mds/script/comb.pl b/branches/sage/mds/script/comb.pl deleted file mode 100755 index 1a0d4dcbe6c07..0000000000000 --- a/branches/sage/mds/script/comb.pl +++ /dev/null @@ -1,113 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my $xaxis = shift @ARGV; -my @vars; -while (@ARGV) { - $_ = shift @ARGV; - last if ($_ eq '-'); - push(@vars, $_); -} -my @dirs; -while (@ARGV) { - $_ = shift @ARGV; - last if ($_ eq '-'); - push(@dirs, $_) if -d $_; -} -my @filt = @ARGV; -push( @filt, '.' ) unless @filt; - -print "#xaxis $xaxis -#vars @vars -#dirs @dirs -#filt @filt -"; - -sub load_sum { - my $fn = shift @_; - - open(I, "$fn"); - my $k = ; - chomp($k); - my @k = split(/\s+/,$k); - shift @k; - - my $s; - while () { - chomp; - s/^\#//; - next unless $_; - my @l = split(/\s+/,$_); - my $k = shift @l; - for my $f (@k) { - $s->{$k}->{$f} = shift @l; - } - - # clnode latency? - if ($fn =~ /cl/) { - $s->{$k}->{'wrlat'} = $s->{$k}->{'wrlsum'} / $s->{$k}->{'wrlnum'} if $s->{$k}->{'wrlnum'} > 0; - $s->{$k}->{'rlat'} = $s->{$k}->{'rlsum'} / $s->{$k}->{'rlnum'} if $s->{$k}->{'rlnum'} > 0; - $s->{$k}->{'lat'} = $s->{$k}->{'lsum'} / $s->{$k}->{'lnum'} if $s->{$k}->{'lnum'} > 0; - $s->{$k}->{'latw'} = $s->{$k}->{'lwsum'} / $s->{$k}->{'lwnum'} if $s->{$k}->{'lwnum'} > 0; - $s->{$k}->{'latr'} = $s->{$k}->{'lrsum'} / $s->{$k}->{'lrnum'} if $s->{$k}->{'lrnum'} > 0; - $s->{$k}->{'statlat'} = $s->{$k}->{'lstatsum'} / $s->{$k}->{'lstatnum'} if $s->{$k}->{'lstatnum'} > 0; - $s->{$k}->{'dirlat'} = $s->{$k}->{'ldirsum'} / $s->{$k}->{'ldirnum'} if $s->{$k}->{'ldirnum'} > 0; - } - } - return $s; -} - - -my %res; -my @key; -my %didkey; -for my $f (@filt) { - my @reg = split(/,/, $f); - #print "reg @reg\n"; - for my $d (@dirs) { - if ($f ne '.') { - my $r = (split(/\//,$d))[-1]; - my @db = split(/,/, $r); - #print "db @db\n"; - my $ok = 1; - for my $r (@reg) { - - $ok = 0 unless grep {$_ eq $r} @db; - } - 
next unless $ok; - } - #next if ($f ne '.' && $d !~ /$reg/); - #print "$d\n"; - my ($x) = $d =~ /$xaxis=([\d\.]+)/; - - for my $v (@vars) { - my ($what, $field) = $v =~ /^(.+)\.([^\.]+)$/; - #print "$what $field .. $v .. $f.$field\n"; - my $s = &load_sum("$d/sum.$what"); - - #print "\t$v"; - if ($field =~ /^sum=/) { - #warn "SUM field $field\n"; - push( @{$res{$x}}, $s->{'sum'}->{$'} ); #'}); - } else { - #warn "avg field $field\n"; - push( @{$res{$x}}, $s->{'avgval'}->{$field} ); - } - - push( @key, "$f.$field" ) unless $didkey{"$f.$field"}; - $didkey{"$f.$field"} = 1; - - if (0 && exists $s->{'avgvaldevt'}) { - push( @{$res{$x}}, $s->{'avgvaldevt'}->{$field} ); - push( @key, "$f.$field.dev" ) unless $didkey{"$f.$field.dev"}; - $didkey{"$f.$field.dev"} = 1; - } - } - } -} - -print join("\t", "#", @key) . "\n"; -for my $x (sort {$a <=> $b} keys %res) { - print join("\t", $x, @{$res{$x}}) . "\n"; -} diff --git a/branches/sage/mds/script/convert_soe_trace.pl b/branches/sage/mds/script/convert_soe_trace.pl deleted file mode 100755 index a6ec80312d0fe..0000000000000 --- a/branches/sage/mds/script/convert_soe_trace.pl +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/perl - -# this reads in one of kristal's anonymized static traces from -# soe and makes it look like output from -# -# find . -exec ls -dilsn --time-style=+%s \{\} \; -# -# (which is what SyntheticClient likes to "import", and -# study_static.pl likes to analyze for hardlinks, dirsizes, etc.) - -while (<>) { - chomp; - my ($file, $ino, $size, $actime, $ctime, $mtime, $uid, $gid, $omode, $nlink) = split(/ /,substr($_,1)); - $file = '.' . $file; - my $nmode = oct($omode); - my $mode = '-...'; - $mode = 'd...' if (($nmode & 0170000) == 0040000); - $mode = 'f...' if (($nmode & 0170000) == 0100000); - $size = hex($size); - $mtime = hex($mtime); - $uid = hex($uid); - $gid = hex($gid); - print "$ino ? $mode ? $nlink $uid $gid $size $mtime $file\n"; -} - -__END__ - -soe format is -0. a space -1. 
full path of file name (MD5-ed and in base 64) -2. inode number -3. size of file in bytes (hex) -4. atime (hex) -5. ctime (hex) -6. mtime (hex) -7. uid (hex) -8. gid (hex) -9. mode (octal) -10. number of links diff --git a/branches/sage/mds/script/find_auth_pins.pl b/branches/sage/mds/script/find_auth_pins.pl deleted file mode 100755 index d37fb109a48da..0000000000000 --- a/branches/sage/mds/script/find_auth_pins.pl +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/perl - -my %pin; -my %hist; -my $l = 1; -my @pins; -while (<>) { - - #cdir:adjust_nested_auth_pins on [dir 163 /foo/ rep@13 | child] count now 0 + 1 - - if (/adjust_nested_auth_pins/) { - my ($what) = / (\w+)\]/; - $what =~ s/ 0x/ /; - $hist{$what} .= "$l: $_" - if defined $pin{$what}; - } - - # cinode:auth_pin on inode [1000000002625 /gnu/blah_client_created. 0x89b7700] count now 1 + 0 - - elsif (/auth_pin / && !/waiting/) { - #my ($what) = /\[(\w+ \w+) /; - my ($what) = / (\w+)\]/; - $what =~ s/ 0x/ /; - #print "$_ add_waiter $c $what\n"; - $pin{$what}++; - $hist{$what} .= "$l: $_"; - push( @pins, $what ) unless grep {$_ eq $what} @pins; - } - - # cinode:auth_unpin on inode [1000000002625 (dangling) 0x89b7700] count now 0 + 0 - - elsif (/auth_unpin/) { - #my ($what) = /\[(\w+ \w+) /;# / on (.*\])/; - my ($what) = / (\w+)\]/; - $what =~ s/ 0x/ /; - $pin{$what}--; - $hist{$what} .= "$l: $_"; - unless ($pin{$what}) { - delete $hist{$what}; - delete $pin{$what}; - @pins = grep {$_ ne $what} @pins; - } - } - $l++; -} - -for my $what (@pins) { - print "---- count $pin{$what} on $what -$hist{$what} -"; -} diff --git a/branches/sage/mds/script/find_bufferleaks.pl b/branches/sage/mds/script/find_bufferleaks.pl deleted file mode 100755 index 152515d5e788e..0000000000000 --- a/branches/sage/mds/script/find_bufferleaks.pl +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/perl - -use strict; -my %buffers; -my %bufferlists; -my %ref; -my %mal; -my $l = 1; -while (<>) { - #print "$l: $_"; - - # cinode:auth_pin on inode 
[1000000002625 /gnu/blah_client_created. 0x89b7700] count now 1 + 0 - - if (/^buffer\.cons /) { - my ($x) = /(0x\S+)/; - $buffers{$x} = 1; - } - if (/^buffer\.des /) { - my ($x) = /(0x\S+)/; - die "des without cons at $l: $_" unless $buffers{$x}; - delete $buffers{$x}; - die "des with ref>0 at $l: $_" unless $ref{$x} == 0; - delete $ref{$x}; - } - - if (/^bufferlist\.cons /) { - my ($x) = /(0x\S+)/; - $bufferlists{$x} = 1; - } - if (/^bufferlist\.des /) { - my ($x) = /(0x\S+)/; - warn "des without cons at $l: $_" unless $bufferlists{$x}; - delete $bufferlists{$x}; - } - - - if (/^buffer\.malloc /) { - my ($x) = /(0x\S+)/; - $mal{$x} = 1; - } - if (/^buffer\.free /) { - my ($x) = /(0x\S+)/; - die "free with malloc at $l: $_" unless $mal{$x}; - delete $mal{$x}; - } - - if (/^buffer\.get /) { - my ($x) = /(0x\S+)/; - $ref{$x}++; - } - if (/^buffer\.get /) { - my ($x) = /(0x\S+)/; - $ref{$x}--; - } - -$l++; -} - -for my $x (keys %bufferlists) { - print "leaked bufferlist $x\n"; -} - -for my $x (keys %buffers) { - print "leaked buffer $x ref $ref{$x}\n"; -} - -for my $x (keys %mal) { - print "leaked buffer dataptr $x ref $ref{$x}\n"; -} diff --git a/branches/sage/mds/script/find_lost_bdev_ops.pl b/branches/sage/mds/script/find_lost_bdev_ops.pl deleted file mode 100755 index ac1793b42dfac..0000000000000 --- a/branches/sage/mds/script/find_lost_bdev_ops.pl +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/perl - -use strict; -my %op; - -my $line = 0; -while (<>) { - #print $line . 
$_ if /0x8d4f6a0/; - chomp; - $line++; - - #bdev(./ebofsdev/0)._submit_io bio(wr 269~1 usemap 0x4de33cc0) - if (my ($bio) = /_submit_io bio\(.*(0x\w+)\)/) { - $op{$bio} = $line; - } - - # cancel - #bdev(./ebofsdev/3)._cancel_io bio(wr 1525~1 bh_write 0x8a437b8) - if (my ($bio) = /_cancel_io bio\(.*(0x\w+)\)/ && - !(/FAILED/)) { - delete $op{$bio}; - } - - # finish - #bdev(./ebofsdev/3).complete_thread finishing bio(wr 1131~1 write_cnode 0x832c1f8) - if (my ($bio) = /complete_thread finishing bio\(.*(0x\w+)\)/) { - delete $op{$bio}; - } - -} - -for my $bio (keys %op) { - print "---- lost bio $bio\n"; -} diff --git a/branches/sage/mds/script/find_lost_commit.pl b/branches/sage/mds/script/find_lost_commit.pl deleted file mode 100755 index 73934248ad5c0..0000000000000 --- a/branches/sage/mds/script/find_lost_commit.pl +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/perl - -use strict; -my %op; - -my $line = 0; -while (<>) { - #print "$line: $_"; - $line++; - - #osd3 do_op MOSDOp(client0.933 oid 100000000000008 0x84b4480) in pg[pginfo(4020000000d v 5662/0 e 2/1) r=0 active (0,5662]] - if (my ($from, $opno, $oid, $op) = /do_op MOSDOp\((\S+) op (\d+) oid (\d+) (\w+)\)/) { -# print "$op\n"; - if ($opno == 2 || $opno == 11 || $opno == 12 || $opno == 14 || $opno == 15) { - $op{$op} = $from; - } - } - - # commits - #osd1 op_modify_commit on op MOSDOp(client1.289 oid 100000100000002 0x51a2f788) - if (my ($op) = /op_modify_commit.* (\w+)\)/) { - delete $op{$op}; - } - #osd4 rep_modify_commit on op MOSDOp(osd3.289 oid 100000000000008 0x84b0980) - if (my ($op) = /rep_modify_commit.* (\w+)\)/) { - delete $op{$op}; - } - - # forwarded? 
- if (my ($op) = /sending (\w+) to osd/) { - delete $op{$op}; - } - -} - -for my $op (keys %op) { - print "---- lost op $op $op{$op}\n"; -} diff --git a/branches/sage/mds/script/find_lost_objecter.pl b/branches/sage/mds/script/find_lost_objecter.pl deleted file mode 100755 index a0c2089140e23..0000000000000 --- a/branches/sage/mds/script/find_lost_objecter.pl +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/perl - -use strict; -my %ack; -my %commit; - -my $line = 0; -while (<>) { - #print "$line: $_"; - $line++; - - #client0.objecter writex_submit tid 21 osd0 oid 100000000000001 851424~100000 - if (my ($who, $tid) = /(\S+)\.objecter writex_submit tid\D+(\d+)\D+osd/) { -# print "$who.$tid\n"; - $ack{"$who.$tid"} = $line; - $commit{"$who.$tid"} = $line; - } - - #client1.objecter handle_osd_write_reply 304 commit 0 - #client1.objecter handle_osd_write_reply 777 commit 1 - if (my ($who, $tid, $commit) = /(\S+)\.objecter handle_osd_write_reply\D+(\d+)\D+commit\D+(\d)/) { -# print "$who.$tid\n"; - delete $ack{"$who.$tid"}; - delete $commit{"$who.$tid"} if $commit; - } - -} - -for my $op (keys %commit) { - print "---- lost commit $op $commit{$op}\n"; -} -for my $op (keys %ack) { - print "---- lost ack $op $commit{$op}\n"; -} diff --git a/branches/sage/mds/script/find_pathpins.pl b/branches/sage/mds/script/find_pathpins.pl deleted file mode 100755 index e4a7d81dfb7b7..0000000000000 --- a/branches/sage/mds/script/find_pathpins.pl +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/perl - -my %pin; -my %hist; -my $l = 1; -my @pins; -while (<>) { - - # cinode:auth_pin on inode [1000000002625 /gnu/blah_client_created. 
0x89b7700] count now 1 + 0 - - if (/path_pinned /) { - my ($dname, $dir) = /\[dentry (\S+) .* in \[dir (\d+) /; - $what = "$dname $dir"; - #print "$l pin $what\n"; - $pin{$what}++; - $hist{$what} .= "$l: $_"; - push( @pins, $what ) unless grep {$_ eq $what} @pins; - } - - # cinode:auth_unpin on inode [1000000002625 (dangling) 0x89b7700] count now 0 + 0 - - if (/path_unpinned/) { - my ($dname, $dir) = /\[dentry (\S+) .* in \[dir (\d+) /; - $what = "$dname $dir"; - #print "$l unpin $what\n"; - $pin{$what}--; - $hist{$what} .= "$l: $_"; - unless ($pin{$what}) { - delete $hist{$what}; - delete $pin{$what}; - @pins = grep {$_ ne $what} @pins; - } - } - $l++; -} - -for my $what (@pins) { - print "---- count $pin{$what} on $what -$hist{$what} -"; -} diff --git a/branches/sage/mds/script/find_requests.pl b/branches/sage/mds/script/find_requests.pl deleted file mode 100755 index 5144896249413..0000000000000 --- a/branches/sage/mds/script/find_requests.pl +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/perl - -my %waiting; # context => what where what is "inode ..." or "dir ..." -my %hist; # context => history since waited -my @waiting; - -my $line = 0; -while (<>) { - - #print $line . 
$_ if /0x8d4f6a0/; - $line++; - if (/request_start/) { - my ($c) = /(0x\w+)/; - my ($what) = $'; #'; - chomp $what; - #print "$line add_waiter $c $what\n" if /0x8d4f6a0/; - $waiting{$c} = $what - if $what && !$waiting{$c}; - $hist{$c} .= "$line: $_"; - unless (grep {$_ eq $c} @waiting) { - push( @waiting, $c ); - } - } - #if (/finish_waiting/) { - # my ($c) = /(0x\w+)/; - # $hist{$c} .= "$line: $_"; - #} - if (/request_finish/ || - /request_forward/) { - my ($c) = /(0x\w+)/; - #print "took\n" if /0x8d4f6a0/; - delete $waiting{$c}; - delete $hist{$c}; - @waiting = grep {$_ ne $c} @waiting; - } -} - -for my $c (@waiting) { - print "---- lost request $c $waiting{$c} -$hist{$c} -"; -} diff --git a/branches/sage/mds/script/find_waiters.pl b/branches/sage/mds/script/find_waiters.pl deleted file mode 100755 index c89d2b1a49db7..0000000000000 --- a/branches/sage/mds/script/find_waiters.pl +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/perl - -my %waiting; # context => what where what is "inode ..." or "dir ..." -my %hist; # context => history since waited -my @waiting; - -my $line = 0; -while (<>) { - #print $line . 
$_ if /0x8d4f6a0/; - $line++; - if (/add_waiter/) { - my ($c) = /(0x\w+)/; - my ($what) = / on (.*\])/; - #print "$line add_waiter $c $what\n" if /0x8d4f6a0/; - $waiting{$c} = $what - if $what && !$waiting{$c}; - $hist{$c} .= "$line: $_"; - unless (grep {$_ eq $c} @waiting) { - push( @waiting, $c ); - } - } - #if (/finish_waiting/) { - # my ($c) = /(0x\w+)/; - # $hist{$c} .= "$line: $_"; - #} - if (/take_waiting/) { - my ($c) = /(0x\w+)/; - if (/SKIPPING/) { - #print "skipping\n" if /0x8d4f6a0/; - $hist{$c} .= "$line: $_"; - } elsif (/took/) { - #print "took\n" if /0x8d4f6a0/; - delete $waiting{$c}; - delete $hist{$c}; - @waiting = grep {$_ ne $c} @waiting; - } else { - die "i don't understand: $_"; - } - } -} - -for my $c (@waiting) { - print "---- lost waiter $c $waiting{$c} -$hist{$c} -"; -} diff --git a/branches/sage/mds/script/fix_modeline.pl b/branches/sage/mds/script/fix_modeline.pl deleted file mode 100755 index 8eadde9b54e56..0000000000000 --- a/branches/sage/mds/script/fix_modeline.pl +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/perl - -use strict; -my $fn = shift @ARGV; -my $old = `cat $fn`; -my $header = `cat doc/modeline.txt`; - -# strip existing modeline -my $new = $old; -$new =~ s/^\/\/ \-\*\- ([^\n]+) \-\*\-([^\n]*)\n//s; # emacs -$new =~ s/^\/\/ vim: ([^\n]*)\n//s; # vim; -$new =~ s/^\/\/ \-\*\- ([^\n]+) \-\*\-([^\n]*)\n//s; # emacs -$new =~ s/^\/\/ vim: ([^\n]*)\n//s; # vim; -$new =~ s/^\/\/ \-\*\- ([^\n]+) \-\*\-([^\n]*)\n//s; # emacs -$new =~ s/^\/\/ vim: ([^\n]*)\n//s; # vim; - -# add correct header -$new = $header . 
$new; - -if ($new ne $old) { - print "$fn\n"; - open(O, ">$fn.new"); - print O $new; - close O; - system "diff $fn $fn.new"; - rename "$fn.new", $fn; - #unlink "$fn.new"; -} - diff --git a/branches/sage/mds/script/gprofnewsyn b/branches/sage/mds/script/gprofnewsyn deleted file mode 100755 index 5d352e4e9e52c..0000000000000 --- a/branches/sage/mds/script/gprofnewsyn +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/perl - -my @ranks = @ARGV; -unless (@ranks) { - @ranks = split(/\n/,`/bin/ls gmon`); -} -print "will do @ranks\n"; -for my $r (@ranks) { - print "$r\n"; - system "test -e gmon.out && rm gmon.out ; ln -f gmon/$r/gmon.out ; gprof newsyn > gmon/$r/o"; -} - diff --git a/branches/sage/mds/script/grepblock b/branches/sage/mds/script/grepblock deleted file mode 100755 index f5acf95732abb..0000000000000 --- a/branches/sage/mds/script/grepblock +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my $block = shift ARGV; -die unless int $block; - -while (<>) { - my $yes = 0; - for my $x (/(\d+\~\d+)/) { - my ($s,$l) = split(/\~/,$x); - $yes = 1 if ($block >= $s && $block < $s+$l); - } - print if $yes; -} diff --git a/branches/sage/mds/script/merge_cdfs.pl b/branches/sage/mds/script/merge_cdfs.pl deleted file mode 100755 index 98c22764fc8b3..0000000000000 --- a/branches/sage/mds/script/merge_cdfs.pl +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/perl - -my %rows; # val -> [ count1, count2, ... ] - -my $filen = 0; -for my $file (@ARGV) { - open(I,"$file"); - while () { - next if /^\#/; - chomp; - my ($v, $c) = split(/\t/,$_); - $rows{$v}->[$filen] = $c; - } - $filen++; -} - -for my $v (sort {$a <=> $b} keys %rows) { - print "$v"; - for (my $i=0; $i < $filen; $i++) { - print "\t" . int($rows{$v}->[$i]); - } - print "\n"; - #print join("\t", $v, @{$rows{$v}}) . 
"\n"; -} diff --git a/branches/sage/mds/script/merge_trace_rw.pl b/branches/sage/mds/script/merge_trace_rw.pl deleted file mode 100644 index 378d629ef43f6..0000000000000 --- a/branches/sage/mds/script/merge_trace_rw.pl +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my @file = <>; -sub get_op { - my @op = shift @file; - while (@file && - $file[0] !~ /^[a-z]+$/) { - push( @op, shift @file ); - } - #print "op = ( @op )\n"; - return @op; -} - -my $n = 0; -while (@file) { - my ($op, @args) = &get_op; - while ($op eq "read\n" || - $op eq "write\n") { - die unless scalar(@args) == 3; - my ($nop, @nargs) = &get_op; - if ($nop eq $op - && ($args[0] == $nargs[0] ) - && ($args[2] + $args[1] == $nargs[2]) - ) { - die unless scalar(@nargs) == 3; - $args[1] += $nargs[1]; - $args[1] .= "\n"; - die unless scalar(@args) == 3; - #print STDOUT "combining $n $op @args\n"; - $n++; - } else { -# print STDERR "not combinging\n"; - unshift( @file, $nop, @nargs ); - die unless scalar(@args) == 3; - last; - } - } - print $op; - print join('', @args); -} diff --git a/branches/sage/mds/script/plot.pl b/branches/sage/mds/script/plot.pl deleted file mode 100755 index 2d4e3002bbd4d..0000000000000 --- a/branches/sage/mds/script/plot.pl +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my $dir = shift @ARGV; -my ($type,$subtype) = split(/\./, shift @ARGV); -$subtype = '.' . $subtype if $subtype; - -# list files -my @files; -my %fields; -for my $f (`ls $dir/$type*$subtype`) { - chomp $f; - next unless $f =~ /$type(\d+)$subtype$/; - push(@files, $f); - unless (%fields) { - open(I,$f); - while () { - next unless /^\#/; - my @f = split(/\t/,$_); - for (my $n=1; @f; $n++) { - my $f = shift @f; - $fields{$f} = $n; - #print "$f = $n\n"; - } - last; - } - close I; - } -} -#print "#files @files\n"; - -# get field names -my $var = shift @ARGV; -my $rest = join(' ', @ARGV); - -print "set style data lines\nset grid\n"; -print "set title \"$dir .. 
$var\"\n"; -if (scalar(@files) > 30) { print "set key off\n"; } -#for my $var (@ARGV) { - my @p; - for my $f (@files) { - my ($lastbit) = $f =~ /\/([^\/]+)$/; - push(@p, "\"$f\" u 1:$fields{$var} $rest t \"$lastbit\""); - } - print "plot " . join(',', @p) . "\n"; -#} -print "pause 60000\n"; diff --git a/branches/sage/mds/script/profonly.pl b/branches/sage/mds/script/profonly.pl deleted file mode 100755 index 6a05dec473ca0..0000000000000 --- a/branches/sage/mds/script/profonly.pl +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/perl - -my $rank = shift @ARGV; -my $args = join(' ',@ARGV); -if ($rank == $ENV{MPD_JRANK}) { - $c = "LD_PRELOAD=$ENV{'HOME'}/csl/obsd/src/pmds/gprof-helper.so ./newsyn $args"; -} else { - $c = "./newsyn.nopg $args"; -} - -#print "$rank: $c\n"; -system $c; diff --git a/branches/sage/mds/script/runjob.pl b/branches/sage/mds/script/runjob.pl deleted file mode 100755 index c432675d33830..0000000000000 --- a/branches/sage/mds/script/runjob.pl +++ /dev/null @@ -1,341 +0,0 @@ -#!/usr/bin/perl - -use strict; -use Data::Dumper; - - -my $usage = "script/runset.pl [--clean] jobs/some/job blah\n"; - -my $clean; -my $use_srun = 0; -my $nobg = '&'; -my $in = shift || die $usage; -if ($in eq '--clean') { - $clean = 1; - $in = shift || die $usage; -} -if ($in eq '--srun') { - $use_srun = 1; - $in = shift || die $usage; -} -if ($in eq '--nobg') { - $nobg = ''; - $in = shift || die $usage; -} -my $tag = shift || die $usage; -my $fake = shift; - - -my ($job) = $in =~ /^jobs\/(.*)/; -my ($jname) = $job =~ /\/(\w+)$/; -$jname ||= $job; -die "not jobs/?" 
unless defined $job; -my $out = "log/$job.$tag"; -my $relout = "$job.$tag"; - - -my $cwd = `/bin/pwd`; -chomp($cwd); - - - -print "# --- job $job, tag $tag ---\n"; - - -# get input -my $raw = `cat $in`; -my $sim = eval $raw; -unless (ref $sim) { - print "bad input: $in\n"; - system "perl -c $in"; - exit 1; -} - -# prep output -system "mkdir -p $out" unless -d "$out"; - -open(W, ">$out/in"); -print W $raw; -close W; - -my $comb = $sim->{'_comb'}; -delete $sim->{'_comb'}; -my %filters; -my @fulldirs; - - - -sub reset { - print "reset: restarting mpd in 3 seconds\n"; - system "sleep 3 && (mpiexec -l -n 32 killall newsyn ; restartmpd.sh)"; - print "reset: done\n"; -} - - -if (`hostname` =~ /alc/ && !$use_srun) { - print "# this looks like alc\n"; - $sim->{'_psub'} = 'jobs/alc.tp'; -} - - -sub iterate { - my $sim = shift @_; - my $fix = shift @_ || {}; - my $vary; - my @r; - - my $this; - for my $k (sort keys %$sim) { - #next if $k =~ /^_/; - if (defined $fix->{$k}) { - $this->{$k} = $fix->{$k}; - } - elsif (ref $sim->{$k} eq 'HASH') { - # nothing - } - elsif ($k =~ /^_/ || !(ref $sim->{$k})) { - $this->{$k} = $sim->{$k}; - } - else { - #print ref $sim->{$k}; - if (!(defined $vary)) { - $vary = $k; - } - } - } - - if ($vary) { - #print "vary $vary\n"; - for my $v (@{$sim->{$vary}}) { - $this->{$vary} = $v; - push(@r, &iterate($sim, $this)); - } - } else { - - if ($sim->{'_dep'}) { - my @s = @{$sim->{'_dep'}}; - while (@s) { - my $dv = shift @s; - my $eq = shift @s; - - $eq =~ s/\$(\w+)/"\$this->{'$1'}"/eg; - $this->{$dv} = eval $eq; - #print "$dv : $eq -> $this->{$dv}\n"; - } - } - - push(@r, $this); - } - return @r; -} - - - -sub run { - my $h = shift @_; - - my @fn; - my @filt; - my @vals; - for my $k (sort keys %$sim) { - next if $k =~ /^_/; - next unless ref $sim->{$k} eq 'ARRAY'; - push(@fn, "$k=$h->{$k}"); - push(@vals, $h->{$k}); - next if $comb && $k eq $comb->{'x'}; - push(@filt, "$k=$h->{$k}"); - } - my $keys = join(",", @fn); - $keys =~ s/ /_/g; - my $fn = 
$out . '/' . $keys; - my $name = $jname . '_' . join('_',@vals); #$tag . '_' . $keys; - - push( @fulldirs, "" . $fn ); - - - # filters - $filters{ join(',', @filt) } = 1; - - - #system "sh $fn/sh.post" if -e "$fn/sh.post";# && !(-e "$fn/.post"); - - if (-e "$fn/.done") { - print "already done.\n"; - return; - } - system "rm -r $fn" if $clean && -d "$fn"; - system "mkdir $fn" unless -d "$fn"; - system "mkdir $fn/out" unless -d "$fn/out"; - - my $e = './newsyn'; - #$e = './tcpsynobfs' if $h->{'fs'} eq 'obfs'; - my $c = "$e"; - $c .= " --mkfs" unless $h->{'_no_mkfs'}; - - for my $k (keys %$h) { - next if $k =~ /^_/; - next if $h->{'_noarg'} && grep {$k eq $_} @{$h->{'_noarg'}}; - next if $h->{'_subst'} && grep {$k eq $_} @{$h->{'_subst'}}; - $c .= " --$k $h->{$k}"; - } - - if ($h->{'_custom'}) { - if ($h->{'_subst'}) { - for my $var (@{$h->{'_subst'}}) { - $h->{'_custom'} =~ s/\$$var/$h->{$var}/g; - } - } - $c .= ' ' . $h->{'_custom'}; - } - - $c .= " --log_name $relout/$keys"; - $c .= " --doutdir log/$relout/$keys/out"; - - my $post = "#!/bin/sh -script/sum.pl -start $h->{'_start'} -end $h->{'_end'} $fn/osd\\* > $fn/sum.osd -script/sum.pl -start $h->{'_start'} -end $h->{'_end'} $fn/mds? $fn/mds?? > $fn/sum.mds -script/sum.pl -start $h->{'_start'} -end $h->{'_end'} $fn/mds*.log > $fn/sum.mds.log -script/sum.pl -start $h->{'_start'} -end $h->{'_end'} $fn/clnode* > $fn/sum.cl -touch $fn/.post -"; - open(O,">$fn/sh.post"); - print O $post; - close O; - - my $killmin; - if ($h->{'_kill_after'}) { - $killmin = 1 + int ($h->{'_kill_after'} / 60); - $killmin = "-t $killmin"; - } - - $c = "bash -c \"ulimit -c 0 ; $c\""; - #$c = "bash -c \"$c\""; - - #print "h keys are " . join(' ', sort keys %$h) . "\n"; - - my $srun = "srun --wait=600 -x jobs/ltest.ignore -l $killmin -N $h->{'_n'} -p ltest"; - my $mpiexec = "mpiexec -l -n $h->{'_n'}"; - my $launch; - if ($use_srun) { - $launch = $srun; - } else { - $launch = $mpiexec; - } - - if ($sim->{'_psub'}) { - # template! 
- my $tp = `cat $sim->{'_psub'}`; - $tp =~ s/\$CWD/$cwd/g; - $tp =~ s/\$NAME/$name/g; - $tp =~ s/\$NUM/$h->{'_n'}/g; - $tp =~ s/\$OUT/$fn\/o/g; - $tp =~ s/\$DONE/$fn\/.done/g; - $tp =~ s/\$CMD/$c/g; - open(O,">$out/$name"); - print O $tp; - close O; - print "\npsub $out/$name\n"; - return; - } else { - # run - my $cmd = "\n$launch $c > $fn/o && touch $fn/.done";# - #my $cmd = "\n$launch $c > $fn/o ; touch $fn/.done"; - print "$cmd $nobg\n"; - my $r = undef; - unless ($fake) { - if ($sim->{'_pre'}) { - print "pre: $launch $sim->{'_pre'}\n"; - system "$launch $sim->{'_pre'}"; - } - $r = system $cmd; - if ($sim->{'_post'}) { - print "post: $launch $sim->{'_post'}\n"; - system "$launch $sim->{'_post'}"; - } - if ($r) { - print "r = $r\n"; - #&reset; - } - system "sh $fn/sh.post"; - } - return $r; - } -} - - - -my @r = &iterate($sim); -my $n = scalar(@r); -my $c = 1; -my %r; -my $nfailed = 0; -for my $h (@r) { - my $d = `date`; - chomp($d); - $d =~ s/ P.T .*//; - print "# === $c/$n"; - print " ($nfailed failed)" if $nfailed; - print " $d: "; - my $r = &run($h); - - if (!(defined $r)) { - # already done - } else { - if ($r) { - $nfailed++; - } - print "sleep $h->{'_sleep'}\n"; - sleep $h->{'_sleep'}; - } - - $c++; -} -print "$nfailed failed\n"; - - -my @comb; -if ($comb) { - my $x = $comb->{'x'}; - my @vars = @{$comb->{'vars'}}; - - print "\n\n# post\n"; - for my $p (@fulldirs) { - print "sh $p/sh.post\n"; - } - - my @filters = sort keys %filters; - my $cmd = "script/comb.pl $x @vars - @fulldirs - @filters > $out/c"; - print "$cmd\n"; - open(O,">$out/comb"); - print O "$cmd\n"; - close O; - system $cmd; - - print "\n\n"; - - my $plot; - $plot .= "set data style linespoints;\n"; - my $s = 2; - for my $v (@vars) { - my $c = $s; - $s++; - my @p; - for my $f (@filters) { - my $t = $f; - if ($comb->{'maptitle'}) { - for my $a (keys %{$comb->{'maptitle'}}) { - my $b = $comb->{'maptitle'}->{$a}; - $t =~ s/$a/$b/; - } - } - push (@p, "\"$out/c\" u 1:$c t \"$t\"" ); - $c += 
scalar(@vars); - } - $plot .= "# $v\nplot " . join(", ", @p) . ";\n\n"; - } - print $plot; - open(O,">$out/plot"); - print O $plot; - close O; -} - diff --git a/branches/sage/mds/script/runset.pl b/branches/sage/mds/script/runset.pl deleted file mode 100755 index 966cf4e5100cb..0000000000000 --- a/branches/sage/mds/script/runset.pl +++ /dev/null @@ -1,380 +0,0 @@ -#!/usr/bin/perl - -use strict; -use Data::Dumper; - -=item sample input file - -# hi there -{ - # startup - 'n' => 30, # mpi nodes - 'sleep' => 10, # seconds between runs - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => 400,#[10, 50, 100, 200, 400], - - # parameters - 'fs' => [ 'ebofs', 'fakestore' ], - 'until' => 150, # --syn until $n ... when to stop clients - 'writefile' => 1, - 'writefile_size' => [ 4096, 65526, 256000, 1024000, 2560000 ], - 'writefile_mb' => 1000, - - 'custom' => '--tcp_skip_rank0 --osd_maxthreads 0'; - - # for final summation (script/sum.pl) - 'start' => 30, - 'end' => 120, - - '_psub' => 'alc.tp' # switch to psub mode! -}; - -=cut - -my $usage = "script/runset.pl [--clean] jobs/some/job blah\n"; - -my $clean; -my $use_srun; -my $nobg = '&'; -my $in = shift || die $usage; -if ($in eq '--clean') { - $clean = 1; - $in = shift || die $usage; -} -if ($in eq '--srun') { - $use_srun = 1; - $in = shift || die $usage; -} -if ($in eq '--nobg') { - $nobg = ''; - $in = shift || die $usage; -} -my $tag = shift || die $usage; -my $fake = shift; - - -my ($job) = $in =~ /^jobs\/(.*)/; -my ($jname) = $job =~ /\/(\w+)$/; -$jname ||= $job; -die "not jobs/?" 
unless defined $job; -my $out = "log/$job.$tag"; -my $relout = "$job.$tag"; - - -my $cwd = `/bin/pwd`; -chomp($cwd); - - - -print "# --- job $job, tag $tag ---\n"; - - -# get input -my $raw = `cat $in`; -my $sim = eval $raw; -unless (ref $sim) { - print "bad input: $in\n"; - system "perl -c $in"; - exit 1; -} - -# prep output -system "mkdir -p $out" unless -d "$out"; - -open(W, ">$out/in"); -print W $raw; -close W; - -my $comb = $sim->{'comb'}; -delete $sim->{'comb'}; -my %filters; -my @fulldirs; - - - -sub reset { - print "reset: restarting mpd in 3 seconds\n"; - system "sleep 3 && (mpiexec -l -n 32 killall newsyn ; restartmpd.sh)"; - print "reset: done\n"; -} - - -if (`hostname` =~ /alc/ && !$use_srun) { - print "# this looks like alc\n"; - $sim->{'_psub'} = 'jobs/alc.tp'; -} - - -sub iterate { - my $sim = shift @_; - my $fix = shift @_ || {}; - my $vary; - my @r; - - my $this; - for my $k (sort keys %$sim) { - next if $k =~ /^_/; - if (defined $fix->{$k}) { - $this->{$k} = $fix->{$k}; - } - elsif (ref $sim->{$k} eq 'HASH') { - # nothing - } - elsif (!(ref $sim->{$k})) { - $this->{$k} = $sim->{$k}; - } - else { - #print ref $sim->{$k}; - if (!(defined $vary)) { - $vary = $k; - } - } - } - - if ($vary) { - #print "vary $vary\n"; - for my $v (@{$sim->{$vary}}) { - $this->{$vary} = $v; - push(@r, &iterate($sim, $this)); - } - } else { - - if ($sim->{'_dep'}) { - my @s = @{$sim->{'_dep'}}; - while (@s) { - my $dv = shift @s; - my $eq = shift @s; - - $eq =~ s/\$(\w+)/"\$this->{'$1'}"/eg; - $this->{$dv} = eval $eq; - #print "$dv : $eq -> $this->{$dv}\n"; - } - } - - push(@r, $this); - } - return @r; -} - - - -sub run { - my $h = shift @_; - - my @fn; - my @filt; - my @vals; - for my $k (sort keys %$sim) { - next if $k =~ /^_/; - next unless ref $sim->{$k} eq 'ARRAY'; - push(@fn, "$k=$h->{$k}"); - push(@vals, $h->{$k}); - next if $comb && $k eq $comb->{'x'}; - push(@filt, "$k=$h->{$k}"); - } - my $keys = join(",", @fn); - $keys =~ s/ /_/g; - my $fn = $out . '/' . 
$keys; - my $name = $jname . '_' . join('_',@vals); #$tag . '_' . $keys; - - push( @fulldirs, "" . $fn ); - - - # filters - $filters{ join(',', @filt) } = 1; - - - #system "sh $fn/sh.post" if -e "$fn/sh.post";# && !(-e "$fn/.post"); - if (-e "$fn/.done") { - print "already done.\n"; - return; - } - system "rm -r $fn" if $clean && -d "$fn"; - system "mkdir $fn" unless -d "$fn"; - - my $e = './newsyn'; - #$e = './tcpsynobfs' if $h->{'fs'} eq 'obfs'; - my $c = "$e"; - $c .= " --mkfs" unless $h->{'no_mkfs'}; - $c .= " --$h->{'fs'}" if $h->{'fs'}; - $c .= " --syn until $h->{'until'}" if $h->{'until'}; - - $c .= " --syn writefile $h->{'writefile_mb'} $h->{'writefile_size'}" if $h->{'writefile'}; - $c .= " --syn rw $h->{'rw_mb'} $h->{'rw_size'}" if $h->{'rw'}; - $c .= " --syn readfile $h->{'readfile_mb'} $h->{'readfile_size'}" if $h->{'readfile'}; - $c .= " --syn makedirs $h->{'makedirs_dirs'} $h->{'makedirs_files'} $h->{'makedirs_depth'}" if $h->{'makedirs'}; - - if ($h->{'ebofs_freelist'}) { - system "cp freelist/ebofs.freelist.$h->{'ebofs_freelist'} ebofs.freelist"; - $c .= " --osd_age_time -1"; - } - - for my $k ('nummds', 'numclient', 'numosd', 'kill_after', - 'osd_maxthreads', 'osd_object_layout', 'osd_pg_layout','osd_pg_bits', - 'mds_bal_rep', 'mds_bal_interval', 'mds_bal_max','mds_decay_halflife', - 'mds_bal_hash_rd','mds_bal_hash_wr','mds_bal_unhash_rd','mds_bal_unhash_wr', - 'mds_cache_size','mds_log_max_len', - 'mds_local_osd', - 'osd_age_time','osd_age', - 'osd_rep', - 'osd_pad_pg_log','ebofs_realloc', - 'osd_balance_reads', - 'tcp_multi_out', - 'client_cache_stat_ttl','client_cache_readdir_ttl', - 'client_oc', - 'fake_osdmap_updates', - 'bdev_el_bidir', 'ebofs_idle_commit_ms', 'ebofs_commit_ms', - 'ebofs_oc_size','ebofs_cc_size','ebofs_bc_size','ebofs_bc_max_dirty','ebofs_abp_max_alloc', - 'file_layout_ssize','file_layout_scount','file_layout_osize','file_layout_num_rep', - 
'meta_dir_layout_ssize','meta_dir_layout_scount','meta_dir_layout_osize','meta_dir_layout_num_rep', - 'meta_log_layout_ssize','meta_log_layout_scount','meta_log_layout_osize','meta_log_layout_num_rep') { - $c .= " --$k $h->{$k}" if defined $h->{$k}; - } - - $c .= ' ' . $h->{'custom'} if $h->{'custom'}; - - $c .= " --log_name $relout/$keys"; - - my $post = "#!/bin/sh -script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/osd\\* > $fn/sum.osd -script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/mds? $fn/mds?? > $fn/sum.mds -script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/mds*.log > $fn/sum.mds.log -script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/clnode* > $fn/sum.cl -touch $fn/.post -"; - open(O,">$fn/sh.post"); - print O $post; - close O; - - my $killmin = 1 + int ($h->{'kill_after'} / 60); - - $c = "bash -c \"ulimit -c 0 ; $c\""; - #$c = "bash -c \"$c\""; - - my $srun = "srun --wait=600 --exclude=jobs/ltest.ignore -l -t $killmin -N $h->{'n'} -p ltest"; - my $mpiexec = "mpiexec -l -n $h->{'n'}"; - my $launch; - if ($use_srun) { - $launch = $srun; - } else { - $launch = $mpiexec; - } - - if ($sim->{'_psub'}) { - # template! 
- my $tp = `cat $sim->{'_psub'}`; - $tp =~ s/\$CWD/$cwd/g; - $tp =~ s/\$NAME/$name/g; - $tp =~ s/\$NUM/$h->{'n'}/g; - $tp =~ s/\$OUT/$fn\/o/g; - $tp =~ s/\$DONE/$fn\/.done/g; - $tp =~ s/\$CMD/$c/g; - open(O,">$out/$name"); - print O $tp; - close O; - print "\npsub $out/$name\n"; - return; - } else { - # run - my $cmd = "\n$launch $c > $fn/o && touch $fn/.done";# - #my $cmd = "\n$launch $c > $fn/o ; touch $fn/.done"; - print "$cmd $nobg\n"; - my $r = undef; - unless ($fake) { - if ($sim->{'_pre'}) { - print "pre: $launch $sim->{'_pre'}\n"; - system "$launch $sim->{'_pre'}"; - } - $r = system $cmd; - if ($sim->{'_post'}) { - print "post: $launch $sim->{'_post'}\n"; - system "$launch $sim->{'_post'}"; - } - if ($r) { - print "r = $r\n"; - #&reset; - } - system "sh $fn/sh.post"; - } - return $r; - } -} - - - -my @r = &iterate($sim); -my $n = scalar(@r); -my $c = 1; -my %r; -my $nfailed = 0; -for my $h (@r) { - my $d = `date`; - chomp($d); - $d =~ s/ P.T .*//; - print "# === $c/$n"; - print " ($nfailed failed)" if $nfailed; - print " $d: "; - my $r = &run($h); - - if (!(defined $r)) { - # already done - } else { - if ($r) { - $nfailed++; - } - print "sleep $h->{'sleep'}\n"; - sleep $h->{'sleep'}; - } - - $c++; -} -print "$nfailed failed\n"; - - -my @comb; -if ($comb) { - my $x = $comb->{'x'}; - my @vars = @{$comb->{'vars'}}; - - print "\n\n# post\n"; - for my $p (@fulldirs) { - print "sh $p/sh.post\n"; - } - - my @filters = sort keys %filters; - my $cmd = "script/comb.pl $x @vars - @fulldirs - @filters > $out/c"; - print "$cmd\n"; - open(O,">$out/comb"); - print O "$cmd\n"; - close O; - system $cmd; - - print "\n\n"; - - my $plot; - $plot .= "set data style linespoints;\n"; - my $s = 2; - for my $v (@vars) { - my $c = $s; - $s++; - my @p; - for my $f (@filters) { - my $t = $f; - if ($comb->{'maptitle'}) { - for my $a (keys %{$comb->{'maptitle'}}) { - my $b = $comb->{'maptitle'}->{$a}; - $t =~ s/$a/$b/; - } - } - push (@p, "\"$out/c\" u 1:$c t \"$t\"" ); - $c += 
scalar(@vars); - } - $plot .= "# $v\nplot " . join(", ", @p) . ";\n\n"; - } - print $plot; - open(O,">$out/plot"); - print O $plot; - close O; -} - diff --git a/branches/sage/mds/script/smooth.pl b/branches/sage/mds/script/smooth.pl deleted file mode 100755 index 6cfbaf60ff921..0000000000000 --- a/branches/sage/mds/script/smooth.pl +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/perl - -my $n = shift @ARGV || 2; - -my %v; # t -> [..] -while (<>) { - chomp; - my @l = split(/\t/,$_); - my $t = shift @l; - if (int $t) { - $v{$t} = \@l; - } else { - print "$_\n"; - } -} - -for my $t (sort {$a <=> $b} keys %v) { - my $s = $t - $n/2; - my @v; - my $c = 0; - for (my $a=0; $a < $n; $a++) { - my $x = $t + $a; - next unless ($v{$x}); - my @o = @{$v{$x}}; - #print "$t: $x o @o\n"; - if (@v) { - for (my $y=0; $y<=$#o; $y++) { - $v[$y] += $o[$y]; - } - } else { - @v = @o; - } - #print "$t: $x v @v\n"; - $c++; - } - print "$t"; - for my $sum (@v) { - print "\t" . ($sum / $c); - } - print "\n"; -} diff --git a/branches/sage/mds/script/study_find.pl b/branches/sage/mds/script/study_find.pl deleted file mode 100755 index 6e6cccdf37c89..0000000000000 --- a/branches/sage/mds/script/study_find.pl +++ /dev/null @@ -1,224 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my $name = shift @ARGV || die; - -my $nfiles = 0; -my $ndirs = 0; -my $nreg = 0; -my $nhardlinks = 0; -my %nlinks; -my %ino_nlinks; -my %names; -my %dirsize; - -my %fnlen; - -my %hdepth; - -my $bytes; -my $ebytes; - -# -# output generated with -# -# find . -path ./.snapshot -prune -o -exec ls -dilsn --time-style=+%s \{\} \; -# -# find output looks like this: -#4495744 4 drwxrwxrwx 24 0 0 4096 1187290970 . 
-#2996320 8 drwxr-xr-x 189 0 1000 8192 1186594257 ./jangle -#28378499 4 drwxr-x--x 4 1068885 52673 4096 1162938122 ./jangle/cymcruise -#28378500 4 drwx--S--- 5 1068885 52673 4096 1162938122 ./jangle/cymcruise/Maildir -#28378501 4 drwx------ 2 1068885 52673 4096 1162938122 ./jangle/cymcruise/Maildir/tmp -#28378502 4 drwx------ 2 1068885 52673 4096 1162938122 ./jangle/cymcruise/Maildir/new -#28378503 4 drwx------ 2 1068885 52673 4096 1162938122 ./jangle/cymcruise/Maildir/cur -#28378504 4 -rw-r--r-- 1 1068885 52673 260 943743700 ./jangle/cymcruise/.alias -#999425 4 drwxr-xr-x 92 1125 100 4096 1186523060 . -#999426 0 lrwxrwxrwx 1 0 0 5 1177701093 ./root -> /root -#1015809 4 drwxr-xr-x 4 1289 1000 4096 1174584949 ./andrea -#541007 4 drwxr-xr-x 3 0 0 4096 1173111449 ./andrea/lux -#5014055 4 drwx--S--- 11 70228 51207 4096 1172250346 ./andrea/lux/Maildir - -# dirs we're currently counting in -my %numindir; - -sub finish_dir { - my $curdir = shift @_; - #print "finish_dir $numindir{$curdir} in $curdir\n"; - $dirsize{$numindir{$curdir}}++; - $ndirs++; - delete $numindir{$curdir}; -} - -my $curdir; -while (<>) { - #print; - chomp; - my ($ino, $blah, $mode, $nlink, $uid, $gid, $size, $mtime, @path) = split(/[ ]+/,$_); - my $file = join(' ',@path); - ($file) = split(/ \-\> /, $file); # ignore symlink dest - my @bits = split(/\//, $file); - my $depth = scalar(@bits); - my $f = pop @bits; - my $dir = join('/', @bits); - #print "file = '$file', dir = '$dir', curdir = '$curdir'\n"; - - if ($dir ne $curdir) { - for my $d (keys %numindir) { - #print "? $d vs $dir\n"; - &finish_dir($d) if ($d ne substr($dir, 0, length($d))); - } - $curdir = $dir; - } - - my $esize = 0; - $esize = int (($size-1)/4096)*4096 + 4096 if $size > 0; - $esize += 160; # for the inode? 
- $bytes += $size; - $ebytes += $esize; - - $nfiles++; - $numindir{$dir}++; - - $hdepth{$depth}++; - - my $fnlen = length($f); - $fnlen{$fnlen}++; - - if ($mode =~ /^d/) { - # find does depth-first search, so assume we descend, so that on empty dir we "back out" above and &finish_dir. - $numindir{$file} = 0; - $curdir = $file; - } else { - $nreg++ if $mode =~ /^f/; - if ($nlink > 1) { - #system "ls -aldi $file"; - $nhardlinks++; - $nlinks{$nlink}++; - $ino_nlinks{$ino} = $nlink; - push(@{$names{$ino}->{$dir}}, $file); - } - } -} -for my $d (keys %numindir) { - &finish_dir($d); -} - - - -my $nsamedir = 0; -open(LOG, ">$name.log"); -my %dirmap; # from dir -> to dir -for my $ino (keys %names) { - print LOG "# $ino\n"; - my @dirs = keys %{$names{$ino}}; - my $insamedir = 1 if scalar(@dirs) == 1; - for my $dir (@dirs) { - print LOG "#\t$dir\n"; - for my $fn (@{$names{$ino}->{$dir}}) { - print LOG "#\t\t$fn\n"; - $nsamedir++ if $insamedir; - } - } - - # stick in dirmap - for (my $i=0; $i<$#dirs; $i++) { - for (my $j=1; $j <= $#dirs; $j++) { - print LOG "# $dirs[$i] <-> $dirs[$j]\n"; - push(@{$dirmap{$dirs[$i]}->{$dirs[$j]}}, $ino); - push(@{$dirmap{$dirs[$j]}->{$dirs[$i]}}, $ino); - } - } -} - - -my $notherinsamedir = 0; -my $notherinsamedirs = 0; -for my $ino (keys %names) { - my @dirs = keys %{$names{$ino}}; - next unless (scalar(@dirs) > 1); - my $n = 0; - my $np = 0; - for (my $i=0; $i<$#dirs; $i++) { - for (my $j=$i+1; $j <= $#dirs; $j++) { - $np++; - if (scalar(@{$dirmap{$dirs[$i]}->{$dirs[$j]}}) > 1 || - scalar(@{$dirmap{$dirs[$j]}->{$dirs[$i]}}) > 1) { - $n++; - #print LOG "# $ino is not alone between $dirs[$i] and $dirs[$j] : @{$dirmap{$dirs[$j]}->{$dirs[$i]}}\n"; - } - } - } - if ($n) { - print LOG "# $ino\tfor $n / $np dir pairs, there is another hl between the same pair of dirs\n"; - $notherinsamedir += $ino_nlinks{$ino}; - $notherinsamedirs += ($n / $np) * $ino_nlinks{$ino}; - } else { - print LOG "# $ino is ALL ALONE\n"; - } -} -close LOG; 
-$notherinsamedirs = sprintf("%.1f",$notherinsamedirs); - - -sub do_cdf { - my $hash = shift @_; - my $num = shift @_; - my $fn = shift @_; - - open(CDF, ">$fn") if $fn; - print CDF "# $name\n"; - - my $median; - my $sum = 0; - my $p = 0; - my $lastv = 0; - for my $v (sort {$a <=> $b} keys %$hash) { - print CDF "$v\t$hash->{$v}\n"; - $p += $hash->{$v}; - $sum += $hash->{$v} * $v; - if (!(defined $median) && - $p >= ($num/2)) { - $median = $v; - } - } - if ($p != $num) { - warn "uh oh, BUG, $p != $num in cdf/median calculation\n"; - } - my $avg = sprintf("%.2f", $sum/$num); - print CDF "# avg $avg, median $median, sum $sum, num $num\n"; - return ($avg, $median); -} -close DSLOG; - - -# do cdfs -my ($avgdirsize, $mediandirsize) = &do_cdf(\%dirsize, $ndirs, "$name.ds"); -my ($avgfnlen, $medianfnlen) = &do_cdf(\%fnlen, $nfiles, "$name.fnlen"); -my ($avgdepth, $mediandepth) = &do_cdf(\%hdepth, $nfiles, "$name.hdepth"); - - -# stat fs -#my $df = `df $base`; -#my $line = (split(/\n/,$df))[1]; # second line -#my ($kb) = $df =~ /\s+\d+\s+(\d+)/; -my $gb = sprintf("%.1f",($ebytes / 1024 / 1024 / 1024)); - -open(O, ">$name.sum"); - -# final line -my $pad = '# ' . 
(' ' x (length($name)-2)); -print O "$pad\tgb\tfiles\tdirs\tdsavg\tdsmed\tfnavg\tfnmed\treg\tnl>1\tsmdr\tothers\totherss\tnlink=2\t=3\t=4\t...\n"; -print O "$name\t$gb\t$nfiles\t$ndirs\t$avgdirsize\t$mediandirsize\t$avgfnlen\t$medianfnlen\t$nreg\t$nhardlinks\t$nsamedir\t$notherinsamedir\t$notherinsamedirs"; -my $i = 2; -for (sort {$a <=> $b} keys %nlinks) { - while ($_ < $i) { - print O "\t0"; - } - print O "\t$nlinks{$_}"; - $i = $_ + 1; -} -print O "\n"; - -close O; diff --git a/branches/sage/mds/script/study_hardlink_lifetimes.pl b/branches/sage/mds/script/study_hardlink_lifetimes.pl deleted file mode 100755 index 012ef6009bb43..0000000000000 --- a/branches/sage/mds/script/study_hardlink_lifetimes.pl +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my %ns; # parent -> fn -> ino -my %nlink; # num links to each ino -my %since; # when it got its second link - -my @ignore = ('ll_getattr','ll_setattr','ll_forget','ll_fsync','ll_readlink','ll_statfs','ll_opendir','ll_releasedir','ll_flush','ll_release','ll_open','ll_read','ll_write'); - -my $when; - -my $sumage; -my $numage; - -sub unlink { - my ($p,$n) = @_; - my $i = $ns{$p}->{$n}; - my $new = --$nlink{$i}; - if ($new == 1) { - my $age = $when - $since{$i}; - #print "$since{$i} to $when on $i\t$age\n"; - delete $since{$i}; - - $numage++; - $sumage += $age; - - } elsif ($new == 0) { - delete $nlink{$i}; - } - delete $ns{$p}->{$n}; -} - - -my ($sec, $usec, $cmd); -$_ = <>; -while (1) { - # read trace record - chomp; - last unless $_ eq '@'; - - chomp(my $sec = <>); - chomp(my $usec = <>); - $when = sprintf("%d.%06d",$sec,$usec);# + ($usec / 1000000); - #$when = "$sec.$usec"; - - chomp($cmd = <>); - - #print "cmd $cmd\n"; - - if ($cmd eq 'll_lookup') { - chomp(my $p = <>); - chomp(my $n = <>); - chomp(my $r = <>); - $ns{$p}->{$n} = $r; - } - - elsif ($cmd eq 'll_create') { - chomp(my $p = <>); - chomp(my $n = <>); - <>; <>; <>; - chomp(my $r = <>); - $ns{$p}->{$n} = $r; - $nlink{$r} = 1; - } - elsif 
($cmd eq 'll_mknod') { - chomp(my $p = <>); - chomp(my $n = <>); - <>; <>; - chomp(my $r = <>); - $ns{$p}->{$n} = $r; - $nlink{$r} = 1; - } - elsif ($cmd eq 'll_mkdir') { - chomp(my $p = <>); - chomp(my $n = <>); - <>; - chomp(my $r = <>); - $ns{$p}->{$n} = $r; - $nlink{$r} = 1; - } - elsif ($cmd eq 'll_symlink') { - chomp(my $p = <>); - chomp(my $n = <>); - <>; - chomp(my $r = <>); - $ns{$p}->{$n} = $r; - $nlink{$r} = 1; - } - elsif ($cmd eq 'll_link') { - chomp(my $i = <>); - chomp(my $p = <>); - chomp(my $n = <>); - $ns{$p}->{$n} = $i; - if (++$nlink{$i} == 2) { - $since{$i} = $when; - } - } - elsif ($cmd eq 'll_unlink' || - $cmd eq 'll_rmdir') { - chomp(my $p = <>); - chomp(my $n = <>); - &unlink($p, $n); - } - elsif ($cmd eq 'll_rename') { - chomp(my $p = <>); - chomp(my $n = <>); - chomp(my $np = <>); - chomp(my $nn = <>); - if ($ns{$np}->{$nn}) { - &unlink($np, $nn); - } - $ns{$np}->{$nn} = $ns{$p}->{$n}; - delete $ns{$p}->{$n}; - } - - # skip to @ - while (<>) { - last if $_ eq "@\n"; - print "$cmd: $_" - unless grep {$_ eq $cmd} @ignore; - } -} - -print "num $numage .. sum $sumage .. avg lifetime " . ($sumage / $numage) . 
"\n"; - -# dump hard link inos -for my $ino (keys %nlink) { - next if $nlink{$ino} < 2; - print "$ino\t$nlink{$ino}\n"; -} diff --git a/branches/sage/mds/script/study_lookups.pl b/branches/sage/mds/script/study_lookups.pl deleted file mode 100644 index 7a0784f3210a4..0000000000000 --- a/branches/sage/mds/script/study_lookups.pl +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my @buckets = (1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096); - -my %dir_miss; # dir(ino) -> last lookup miss -my %dir_has; # ino -> dentries we have - - -my %ns; # parent -> fn -> ino -my %nlink; # num links to each ino -my %since; # when it got its second link - -my @ignore = ('ll_getattr','ll_setattr','ll_forget','ll_fsync','ll_readlink','ll_statfs','ll_opendir','ll_releasedir','ll_flush','ll_release','ll_open','ll_read','ll_write'); - -my $when; - -my $sumage; -my $numage; - -sub unlink { - my ($p,$n) = @_; - my $i = $ns{$p}->{$n}; - my $new = --$nlink{$i}; - if ($new == 1) { - my $age = $when - $since{$i}; - #print "$since{$i} to $when on $i\t$age\n"; - delete $since{$i}; - - $numage++; - $sumage += $age; - - } elsif ($new == 0) { - delete $nlink{$i}; - } - delete $ns{$p}->{$n}; -} - - -my ($sec, $usec, $cmd); -$_ = <>; -while (1) { - # read trace record - chomp; - last unless $_ eq '@'; - - chomp(my $sec = <>); - chomp(my $usec = <>); - $when = sprintf("%d.%06d",$sec,$usec);# + ($usec / 1000000); - #$when = "$sec.$usec"; - - chomp($cmd = <>); - - #print "cmd $cmd\n"; - - if ($cmd eq 'll_lookup') { - chomp(my $p = <>); - chomp(my $n = <>); - chomp(my $r = <>); - $ns{$p}->{$n} = $r; - } - - elsif ($cmd eq 'll_create') { - chomp(my $p = <>); - chomp(my $n = <>); - <>; <>; <>; - chomp(my $r = <>); - $ns{$p}->{$n} = $r; - $nlink{$r} = 1; - } - elsif ($cmd eq 'll_mknod') { - chomp(my $p = <>); - chomp(my $n = <>); - <>; <>; - chomp(my $r = <>); - $ns{$p}->{$n} = $r; - $nlink{$r} = 1; - } - elsif ($cmd eq 'll_mkdir') { - chomp(my $p = <>); - chomp(my $n = <>); - <>; 
- chomp(my $r = <>); - $ns{$p}->{$n} = $r; - $nlink{$r} = 1; - } - elsif ($cmd eq 'll_symlink') { - chomp(my $p = <>); - chomp(my $n = <>); - <>; - chomp(my $r = <>); - $ns{$p}->{$n} = $r; - $nlink{$r} = 1; - } - elsif ($cmd eq 'll_link') { - chomp(my $i = <>); - chomp(my $p = <>); - chomp(my $n = <>); - $ns{$p}->{$n} = $i; - if (++$nlink{$i} == 2) { - $since{$i} = $when; - } - } - elsif ($cmd eq 'll_unlink' || - $cmd eq 'll_rmdir') { - chomp(my $p = <>); - chomp(my $n = <>); - &unlink($p, $n); - } - elsif ($cmd eq 'll_rename') { - chomp(my $p = <>); - chomp(my $n = <>); - chomp(my $np = <>); - chomp(my $nn = <>); - if ($ns{$np}->{$nn}) { - &unlink($np, $nn); - } - $ns{$np}->{$nn} = $ns{$p}->{$n}; - delete $ns{$p}->{$n}; - } - - # skip to @ - while (<>) { - last if $_ eq "@\n"; - print "$cmd: $_" - unless grep {$_ eq $cmd} @ignore; - } -} - -print "num $numage .. sum $sumage .. avg lifetime " . ($sumage / $numage) . "\n"; - -# dump hard link inos -for my $ino (keys %nlink) { - next if $nlink{$ino} < 2; - print "$ino\t$nlink{$ino}\n"; -} diff --git a/branches/sage/mds/script/sum.pl b/branches/sage/mds/script/sum.pl deleted file mode 100755 index 92ef9a9b222a8..0000000000000 --- a/branches/sage/mds/script/sum.pl +++ /dev/null @@ -1,148 +0,0 @@ -#!/usr/bin/perl - -use strict; -my $starttime = 1; -my $endtime = -1; - -my $avgrows = 0; - -while ($ARGV[0] =~ /^-/) { - $_ = shift @ARGV; - if ($_ eq '-avg') { - $avgrows = 1; - } - elsif ($_ eq '-start') { - $starttime = shift @ARGV; - } - elsif ($_ eq '-end') { - $endtime = shift @ARGV; - } - else { - die "i don't understand arg $_"; - } -} -my @files = @ARGV; - -if (scalar(@files) == 1 && $files[0] =~ /\*/) { - my ($dir, $pat) = $files[0] =~ /^(.*)\/([^\/]+)$/; - @files = (); - $pat =~ s/\*//; -# print "dir $dir pat $pat\n"; - opendir(D,"$dir"); - for my $f (readdir(D)) { - # print "$f\n"; - next unless $f =~ /^$pat/; - push(@files, "$dir/$f"); - } - closedir(D); - -# print "files = @files\n"; -} - -my @data; -for my $f 
(@files) { - open(I,$f); - push( @data, ); - close I; -} - -my %sum; # time -> name -> val -my %col; # colnum -> name .. colnums start at 0 (time doesn't count) -my %min; -my %max; -my %avg; -my %tcount; -my $files; -for (@data) { - chomp; - my @r = split(/\s+/,$_); - my $r = shift @r; - - # column headings? - if ($r =~ /^\#/) { - my $num = 0; - while (my $name = shift @r) { - $col{$num} = $name; - $num++; - } - next; - } - - next unless int $r; - next if $r < $starttime; - next if $endtime > 0 && $r > $endtime; - - $tcount{$r}++; - $files = $tcount{$r} if $tcount{$r} > $files; - #print "$r: @r\n"; - my $i = 0; - while (@r) { - my $v = shift @r; - $sum{$r}->{$col{$i}} += $v; # if $v > 0; - - $min{$col{$i}} = $v - if ($min{$col{$i}} > $v || !(defined $min{$col{$i}})); - $max{$col{$i}} = $v - if ($max{$col{$i}} < $v); - - $avg{$col{$i}} += $v; - $i++; - } -} - -## dump -my @c = sort {$a <=> $b} keys %col; -# cols -print join("\t",'#', map { $col{$_} } @c) . "\n"; -my $n = 0; -for my $k (sort {$a <=> $b} keys %sum) { - if ($avgrows) { - print join("\t",$k, #map int, - map { $sum{$k}->{$col{$_}}/$tcount{$k} } @c ) . "\n"; - } else { - print join("\t",$k, map { $sum{$k}->{$col{$_}} } @c ) . "\n"; - } - $n++; -} - -my $rows = $n || 1; -#my $files = $tcount{$starttime}; -my %avgval; - -## devt -#warn "rows $rows, files $files\n"; -my %avgvalvart; # std dev of each col avg, over time -for my $k (keys %avg) { - my $av = $avgval{$k} = $avg{$k} / ($rows*$files); - - my $var = 0.0; - for my $t (sort {$a <=> $b} keys %sum) { - my $a = $sum{$t}->{$k} / $files; - $var += ($a - $av) * ($a - $av); - } - - $avgvalvart{$k} = $var / $rows; -} - - - - -print "\n"; -print join("\t",'#', map { $col{$_} } @c) . "\n"; -print join("\t", '#minval', map { $min{$col{$_}} } @c ) . "\n"; -print join("\t", '#maxval', map { $max{$col{$_}} } @c ) . "\n"; -print join("\t", '#rows', map { $rows } @c) . "\n"; -print join("\t", '#files', map { $files } @c) . 
"\n"; -print join("\t", '#sum', - map { $avg{$col{$_}} } @c ) . "\n"; -print join("\t", '#avgval', #map int, - map { $avgval{$col{$_}} } @c ) . "\n"; -# map { ($rows*$files) ? ($_ / ($rows*$files)):0 } map { $avg{$col{$_}} } @c ) . "\n"; - -print join("\t", '#avgvalvart', - map { $avgvalvart{$col{$_}} } @c ) . "\n"; -print join("\t", '#avgvaldevt', - map { sqrt($_) } map { $avgvalvart{$col{$_}} } @c ) . "\n"; - -print join("\t", '#avgsum', #map int, - map { $_ / $rows } map { $avg{$col{$_}} } @c ) . "\n"; diff --git a/branches/sage/mds/test/fakemds.cc b/branches/sage/mds/test/fakemds.cc deleted file mode 100644 index b75b62d58152c..0000000000000 --- a/branches/sage/mds/test/fakemds.cc +++ /dev/null @@ -1,104 +0,0 @@ - - -#include -#include -#include - -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "fakeclient/FakeClient.h" - -#include "mds/MDCluster.h" -#include "mds/MDCache.h" -#include "mds/MDStore.h" - -#include "msg/FakeMessenger.h" - -#include "messages/MPing.h" - -using namespace std; - -__uint64_t ino = 1; - - - -#include "config.h" -#define NUMMDS g_conf.num_mds -#define NUMOSD g_conf.num_osd -#define NUMCLIENT g_conf.num_fakeclient - -// this parses find output -int play(); - -int main(int oargc, char **oargv) { - cerr << "hi there" << endl; - - int argc; - char **argv; - parse_config_options(oargc, oargv, - argc, argv); - - MDCluster *mdc = new MDCluster(NUMMDS, NUMOSD); - - // local config settings - g_conf.num_client = g_conf.num_fakeclient; // to fool mds, hack gross - - // create osds - OSD *osd[NUMOSD]; - for (int i=0; iinit(); - } - - // create mds - MDS *mds[NUMMDS]; - for (int i=0; iinit(); - } - - - // create clients - FakeClient *client[NUMCLIENT]; - for (int i=0; iinit(); - } - - // mount clients - for (int i=0; imount(); - - // loop - fakemessenger_do_loop(); - - //mds[0]->shutdown_start(); - //fakemessenger_do_loop(); - - // - if (argc > 1 && - strcmp(argv[1], "nocheck") == 0) { - cerr << "---- nocheck" << endl; - } else { - cout << 
"---- check ----" << endl; - for (int i=0; imdcache->shutdown_pass(); - } - - // cleanup - cout << "cleanup" << endl; - for (int i=0; i - * Daniel Jönsson - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the Do What The Fuck You Want To - * Public License as published by Banlu Kemiyatorn. See - * http://sam.zoy.org/projects/COPYING.WTFPL for more details. - * - * Compilation example: - * gcc -shared -fPIC gprof-helper.c -o gprof-helper.so -lpthread -ldl - * - * Usage example: - * LD_PRELOAD=./gprof-helper.so your_program - */ - -#define _GNU_SOURCE -#include -#include -#include -#include -#include - -static void * wrapper_routine(void *); - -/* Original pthread function */ -static int (*pthread_create_orig)(pthread_t *__restrict, - __const pthread_attr_t *__restrict, - void *(*)(void *), - void *__restrict) = NULL; - -/* Library initialization function */ -void wooinit(void) __attribute__((constructor)); - -void wooinit(void) -{ - pthread_create_orig = dlsym(RTLD_NEXT, "pthread_create"); - fprintf(stderr, "pthreads: using profiling hooks for gprof\n"); - if(pthread_create_orig == NULL) - { - char *error = dlerror(); - if(error == NULL) - { - error = "pthread_create is NULL"; - } - fprintf(stderr, "%s\n", error); - exit(EXIT_FAILURE); - } -} - -/* Our data structure passed to the wrapper */ -typedef struct wrapper_s -{ - void * (*start_routine)(void *); - void * arg; - - pthread_mutex_t lock; - pthread_cond_t wait; - - struct itimerval itimer; - -} wrapper_t; - -/* The wrapper function in charge for setting the itimer value */ -static void * wrapper_routine(void * data) -{ - /* Put user data in thread-local variables */ - void * (*start_routine)(void *) = ((wrapper_t*)data)->start_routine; - void * arg = ((wrapper_t*)data)->arg; - - /* Set the profile timer value */ - setitimer(ITIMER_PROF, &((wrapper_t*)data)->itimer, NULL); - - /* Tell the calling thread that we don't need its data anymore */ - 
pthread_mutex_lock(&((wrapper_t*)data)->lock); - pthread_cond_signal(&((wrapper_t*)data)->wait); - pthread_mutex_unlock(&((wrapper_t*)data)->lock); - - /* Call the real function */ - return start_routine(arg); -} - -/* Our wrapper function for the real pthread_create() */ -int pthread_create(pthread_t *__restrict thread, - __const pthread_attr_t *__restrict attr, - void * (*start_routine)(void *), - void *__restrict arg) -{ - wrapper_t wrapper_data; - int i_return; - - /* Initialize the wrapper structure */ - wrapper_data.start_routine = start_routine; - wrapper_data.arg = arg; - getitimer(ITIMER_PROF, &wrapper_data.itimer); - pthread_cond_init(&wrapper_data.wait, NULL); - pthread_mutex_init(&wrapper_data.lock, NULL); - pthread_mutex_lock(&wrapper_data.lock); - - /* The real pthread_create call */ - i_return = pthread_create_orig(thread, - attr, - &wrapper_routine, - &wrapper_data); - - /* If the thread was successfully spawned, wait for the data - * to be released */ - if(i_return == 0) - { - pthread_cond_wait(&wrapper_data.wait, &wrapper_data.lock); - } - - pthread_mutex_unlock(&wrapper_data.lock); - pthread_mutex_destroy(&wrapper_data.lock); - pthread_cond_destroy(&wrapper_data.wait); - - return i_return; -} - diff --git a/branches/sage/mds/test/makedirs.cc b/branches/sage/mds/test/makedirs.cc deleted file mode 100644 index 8fd74d996ef9f..0000000000000 --- a/branches/sage/mds/test/makedirs.cc +++ /dev/null @@ -1,38 +0,0 @@ -#include -#include -using namespace std; - -int make_dirs(const char *basedir, int dirs, int files, int depth) -{ - //if (time_to_stop()) return 0; - - // make sure base dir exists - int r = mkdir(basedir, 0755); - if (r != 0) { - cout << "can't make base dir? 
" << basedir << endl; - return -1; - } - - // children - char d[500]; - cout << "make_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << endl; - for (int i=0; i -#include -#include -using namespace std; - -#include "mds/MDCluster.h" -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "fakeclient/FakeClient.h" - -#include "mds/MDCache.h" -#include "mds/MDStore.h" - -#include "msg/MPIMessenger.h" -//#include "msg/CheesySerializer.h" - -#include "messages/MPing.h" - - -__uint64_t ino = 1; - - - -#include "config.h" -#define NUMMDS g_conf.num_mds -#define NUMOSD g_conf.num_osd -#define NUMCLIENT g_conf.num_client - -// this parses find output -int play(); - -int main(int argc, char **argv) { - cout << "mpitest starting" << endl; - - int myrank = mpimessenger_init(argc, argv); - int world = mpimessenger_world(); - - - - MDCluster *mdc = new MDCluster(NUMMDS, NUMOSD); - - // create osds - OSD *osd[NUMOSD]; - for (int i=0; iinit(); - } - - // create mds - MDS *mds[NUMMDS]; - for (int i=0; iinit(); - } - - // create clients - FakeClient *client[NUMCLIENT]; - for (int i=0; iset_dispatcher(serializer); - - client[i] = new FakeClient(mdc, i, real, g_conf.fakeclient_requests); - client[i]->init(); - } - - // seed initial requests - for (int i=0; iissue_request(); - } - - mpimessenger_start(); // start message loop - mpimessenger_wait(); // wait for thread to finish - mpimessenger_shutdown(); // shutdown MPI - - // - /* - cout << "---- check ----" << endl; - for (int i=0; imdcache->shutdown_pass(); - } - */ - - // cleanup - //cout << "cleanup" << endl; - for (int i=0; i -#include "mpi.h" - -#include "messages/MClientRequest.h" -#include "msg/MTMessenger.h" -#include "include/error.h" - -#define SARG_SIZE 64 -#define SERVER_RANK 0 -#define NTHREADS 11 // number of threads per rank -#define NMESSAGES 31 // number of messages per thread - -static void server_loop(MTMessenger &msgr, int world_size) -{ - // we expect this many messages from 
clients, then we quit - // (world_size-1 since server is one of the processes). - int totmsg = NTHREADS * NMESSAGES * (world_size - 1); - int nmsg = 0; - - char buf[SARG_SIZE]; - - while(nmsg < totmsg) { - MClientRequest *req = (MClientRequest*)msgr.recvreq(); - ASSERT(req->get_type() == MSG_CLIENT_REQUEST); - - //cout << "Server acknowledging " << req->get_sarg() << endl; - - sprintf(buf, "%s reply", req->get_sarg().c_str()); - MClientRequest resp(0, 0); - resp.set_sarg(buf); - msgr.sendresp(req, &resp); - - delete req; - nmsg++; - } - - cout << "Server successful" << endl; -} - -// arguments for client thread start function (see pthread_create) -struct client_arg -{ - MTMessenger *msgr; - int rank; - int thread; -}; - -static void *client_session(void *_carg) -{ - client_arg *carg = (client_arg *)_carg; - - char buf[SARG_SIZE]; - - // repeat some number (arbitrary really) of rounds - for (int i = 0; i < NMESSAGES; i++) { - - // send the message, receive the reply and check reply is as - // expected - - MClientRequest request(0, 0); - sprintf(buf, "r%d:t%d:m%d", carg->rank, carg->thread, i); - request.set_sarg(buf); - - //cout << "Client sending " << request.get_sarg() << endl; - - MClientRequest *resp = - (MClientRequest*)carg->msgr->sendrecv(&request, SERVER_RANK); - - ASSERT(resp->get_type() == MSG_CLIENT_REQUEST); - sprintf(buf, "r%d:t%d:m%d reply", carg->rank, carg->thread, i); - ASSERT(strcmp(buf, resp->get_sarg().c_str()) == 0); - - //cout << "Client verified " << resp->get_sarg() << endl; - - delete resp; - } - - cout << "Client (" << carg->rank << "," << carg->thread - << ") successful" << endl; - - delete carg; - return NULL; -} - -static void launch_clients(MTMessenger &msgr, int rank) -{ - pthread_t tid[NTHREADS]; - - // launch some number (arbitrary really) of threads - for (int i = 0; i < NTHREADS; i++) { - - client_arg *carg = (client_arg*)malloc(sizeof(client_arg)); - ASSERT(carg); - carg->msgr = &msgr; - carg->rank = rank; - carg->thread = i; - - 
if (pthread_create(&tid[i], NULL, client_session, carg) < 0) - SYSERROR(); - } - - // we must wait for all the threads to exit before returning, - // otherwise we shutdown MPI before while the threads are - // chatting. - for (int i = 0; i < NTHREADS; i++) { - void *retval; - - if (pthread_join(tid[i], &retval) < 0) - SYSERROR(); - } -} - -int main(int argc, char **argv) -{ - MTMessenger msgr(argc, argv); - - int rank; - ASSERT(MPI_Comm_rank(MPI_COMM_WORLD, &rank) == MPI_SUCCESS); - int world_size; - ASSERT(MPI_Comm_size(MPI_COMM_WORLD, &world_size) == MPI_SUCCESS); - - if (rank == SERVER_RANK) - server_loop(msgr, world_size); - else - launch_clients(msgr, rank); - - return 0; -} diff --git a/branches/sage/mds/test/rushconfig b/branches/sage/mds/test/rushconfig deleted file mode 100644 index 40d82702ea0a5..0000000000000 --- a/branches/sage/mds/test/rushconfig +++ /dev/null @@ -1,7 +0,0 @@ -6 -8 10.0 -4 20.0 -7 30.0 -9 10.0 -8 15.0 -5 11.0 diff --git a/branches/sage/mds/test/rushtest.cc b/branches/sage/mds/test/rushtest.cc deleted file mode 100644 index ecff83523e0c6..0000000000000 --- a/branches/sage/mds/test/rushtest.cc +++ /dev/null @@ -1,49 +0,0 @@ -// -// $Id$ -// - -#include -#include -#include "../osd/rush.h" - -main (int argc, char *argv[]) -{ - Rush rush; - char buf[200]; - int i, j, k, numClusters; - int numKeys = 5; - int numReplicas = 4; - int curSize; - double curWeight; - int servers[1000]; - - if (argc > 1) { - numKeys = atoi (argv[1]); - } - if (argc > 2) { - numReplicas = atoi (argv[2]); - } - - fgets (buf, sizeof (buf) - 2, stdin); - sscanf (buf, "%d", &numClusters); - for (i = 0; i < numClusters; i++) { - fgets (buf, sizeof (buf) - 2, stdin); - sscanf (buf, "%d %lf", &curSize, &curWeight); - rush.AddCluster (curSize, curWeight); - if (rush.Servers () < numReplicas) { - fprintf (stderr, "ERROR: must have at least %d disks in the system!\n", - rush.Clusters ()); - exit (-1); - } - for (j = 0; j < numKeys; j++) { - rush.GetServersByKey (j, 
numReplicas, servers); -#if 0 - printf ("%-3d %-6d ", i, j); - for (k = 0; k < numReplicas; k++) { - printf ("%-5d ", servers[k]); - } - putchar ('\n'); -#endif - } - } -} diff --git a/branches/sage/mds/test/rushtest.cc~ b/branches/sage/mds/test/rushtest.cc~ deleted file mode 100644 index 0b9512ccd0c3d..0000000000000 --- a/branches/sage/mds/test/rushtest.cc~ +++ /dev/null @@ -1,49 +0,0 @@ -// -// $Id$ -// - -#include -#include -#include "rush.h" - -main (int argc, char *argv[]) -{ - Rush rush; - char buf[200]; - int i, j, k, numClusters; - int numKeys = 5; - int numReplicas = 4; - int curSize; - double curWeight; - int servers[1000]; - - if (argc > 1) { - numKeys = atoi (argv[1]); - } - if (argc > 2) { - numReplicas = atoi (argv[2]); - } - - fgets (buf, sizeof (buf) - 2, stdin); - sscanf (buf, "%d", &numClusters); - for (i = 0; i < numClusters; i++) { - fgets (buf, sizeof (buf) - 2, stdin); - sscanf (buf, "%d %lf", &curSize, &curWeight); - rush.AddCluster (curSize, curWeight); - if (rush.Servers () < numReplicas) { - fprintf (stderr, "ERROR: must have at least %d disks in the system!\n", - rush.Clusters ()); - exit (-1); - } - for (j = 0; j < numKeys; j++) { - rush.GetServersByKey (j, numReplicas, servers); -#if 0 - printf ("%-3d %-6d ", i, j); - for (k = 0; k < numReplicas; k++) { - printf ("%-5d ", servers[k]); - } - putchar ('\n'); -#endif - } - } -} diff --git a/branches/sage/mds/test/test_disk_bw.cc b/branches/sage/mds/test/test_disk_bw.cc deleted file mode 100644 index fc36da74fadb2..0000000000000 --- a/branches/sage/mds/test/test_disk_bw.cc +++ /dev/null @@ -1,59 +0,0 @@ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common/Clock.h" - -#include -using namespace std; - -int main(int argc, char **argv) -{ - void *buf; - int fd, count, loop = 0, ret; - - if (argc != 4) { - fprintf(stderr, "Usage: %s device bsize count\n", argv[0]); - exit (0); - } - - int bsize = atoi(argv[2]); - count = 
atoi(argv[3]); - - posix_memalign(&buf, sysconf(_SC_PAGESIZE), bsize); - - //if ((fd = open(argv[1], O_SYNC|O_RDWR)) < 0) { - if ((fd = open(argv[1], O_DIRECT|O_RDWR)) < 0) { - - fprintf(stderr, "Can't open device %s\n", argv[1]); - exit (4); - } - - - utime_t start = g_clock.now(); - while (loop++ < count) { - ret = ::write(fd, buf, bsize); - //if ((loop % 100) == 0) - //fprintf(stderr, "."); - } - ::fsync(fd); - ::close(fd); - utime_t end = g_clock.now(); - end -= start; - - - char hostname[80]; - gethostname(hostname, 80); - - double mb = bsize*count/1024/1024; - - cout << hostname << "\t" << mb << " MB\t" << end << " seconds\t" << (mb / (double)end) << " MB/sec" << std::endl; -} diff --git a/branches/sage/mds/test/test_seek_read.c b/branches/sage/mds/test/test_seek_read.c deleted file mode 100644 index 988ff1dcec88d..0000000000000 --- a/branches/sage/mds/test/test_seek_read.c +++ /dev/null @@ -1,53 +0,0 @@ -#include "include/types.h" -#include "common/Clock.h" - -#include -#include -#include -#include -#include -#include - -int main(int argc, char **argv) -{ - char *fn = argv[1]; - uint64_t numblocks = atoll(argv[2]) / 4096; - int count = 400; - - cout << "fn " << fn << endl; - cout << "numblocks " << numblocks << endl; - - int blocks = 1; - while (blocks <= 1024) { - int fd = ::open(fn, O_RDWR|O_DIRECT);//|O_SYNC|O_DIRECT); - if (fd < 0) return 1; - //cout << "fd is " << fd << endl; - - void *buf; - ::posix_memalign(&buf, 4096, 4096*blocks); - - int s = blocks*4096; - - utime_t start = g_clock.now(); - for (int i=0; i -#include -using namespace std; - - -ostream& operator<<(ostream& out, vector& v) -{ - out << "["; - for (int i=0; i disks; - for (int i=0; i<20; i++) - disks.push_back(i); - - - /* - UniformBucket ub(1, 1, 0, 10, disks); - ub.make_primes(h); - cout << "primes are " << ub.primes << endl; - */ - - MixedBucket mb(2, 1); - for (int i=0;i<20;i++) - mb.add_item(i, 10); - - /* - MixedBucket b(3, 1); - b.add_item(1, ub.get_weight()); - b.add_item(2, 
mb.get_weight()); - */ - MixedBucket b= mb; - - vector ocount(disks.size()); - int numrep = 3; - - vector v(numrep); - for (int x=1; x<1000000; x++) { - //cout << H(x) << "\t" << h(x) << endl; - for (int i=0; i -using namespace std; - -#include "include/bufferlist.h" - - -int main() -{ - - bufferptr p1 = new buffer("123456",6); - bufferptr p2 = p1; - - cout << "it is '" << p1.c_str() << "'" << endl; - - bufferptr p3 = new buffer("abcdef",6); - - cout << "p3 is " << p3 << endl; - - bufferlist bl; - bl.push_back(p2); - bl.push_back(p1); - bl.push_back(p3); - - cout << "bl is " << bl << endl; - - cout << "len is " << bl.length() << endl; - - bufferlist took; - bl.splice(10,4,&took); - - cout << "took out " << took << "leftover is " << bl << endl; - //cout << "len is " << bl.length() << endl; - - bufferlist bl2; - bl2.substr_of(bl, 3, 5); - cout << "bl2 is " << bl2 << endl; - - -} diff --git a/branches/sage/mds/test/testcounter.cc b/branches/sage/mds/test/testcounter.cc deleted file mode 100644 index a3194489e4886..0000000000000 --- a/branches/sage/mds/test/testcounter.cc +++ /dev/null @@ -1,70 +0,0 @@ - -#include "common/DecayCounter.h" - -#include -using namespace std; - -struct RealCounter { -public: - list hits; - - void hit(int ms) { - hits.push_back(ms); - } - - int get(double hl, int now) { - trim(now-hl); - return hits.size(); - } - - void trim(int to) { - while (!hits.empty() && - hits.front() < to) - hits.pop_front(); - } - - -}; - -int main(int argc, char **argv) -{ - int target; - double hl = atof(argv[1]); - cerr << "halflife " << hl << endl; - - DecayCounter dc(hl); - RealCounter rc; - - utime_t now = g_clock.now(); - - for (int ms=0; ms < 300*1000; ms++) { - if (ms % 30000 == 0) { - target = 1 + (rand() % 10) * 10; - if (ms > 200000) target = 0; - } - - if (target && - (rand() % (1000/target) == 0)) { - dc.hit(); - rc.hit(ms); - } - - if (ms % 500 == 0) dc.get(now); - if (ms % 100 == 0) { - //dc.get(now); - DecayCounter o = dc; - cout << ms << "\t" - << 
target*hl << "\t" - << rc.get(hl*1000, ms) << "\t" - << o.get(now) << "\t" - << dc.val << "\t" - // << dc.delta << "\t" - << o.get_last_vel() << "\t" - << o.get_last() + o.get_last_vel() << "\t" - << endl; - } - - now += .001; - } - -} diff --git a/branches/sage/mds/test/testcrush.cc b/branches/sage/mds/test/testcrush.cc deleted file mode 100644 index bd432b23ee95c..0000000000000 --- a/branches/sage/mds/test/testcrush.cc +++ /dev/null @@ -1,266 +0,0 @@ - - -#include "../crush/crush.h" -using namespace crush; - -#include - -#include -#include -using namespace std; - -/* -ostream& operator<<(ostream& out, vector& v) -{ - out << "["; - for (int i=0; i& d) -{ - d.clear(); - while (n) { - d.push_back(no); - no++; - n--; - } -} - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks, int& nbuckets) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - MixedBucket *b = new MixedBucket(nbuckets--, h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks, int& nbuckets) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks, nbuckets); - return b->get_id(); -} - - - -int main() -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - vector disks; - int root = -1; - int nbuckets = -1; - int ndisks = 0; - - if (0) { - make_disks(12, ndisks, disks); - UniformBucket ub1(-1, 1, 0, 30, disks); - ub1.make_primes(h); - cout << "ub1 primes are " << ub1.primes << endl; - c.add_bucket(&ub1); - - make_disks(17, ndisks, disks); - UniformBucket ub2(-2, 1, 0, 30, disks); - ub2.make_primes(h); - cout << "ub2 primes are " << ub2.primes << endl; - c.add_bucket(&ub2); - - make_disks(4, ndisks, disks); - UniformBucket ub3(-3, 1, 0, 30, 
disks); - ub3.make_primes(h); - cout << "ub3 primes are " << ub3.primes << endl; - c.add_bucket(&ub3); - - make_disks(20, ndisks, disks); - MixedBucket umb1(-4, 1); - for (int i=0; i<20; i++) - umb1.add_item(disks[i], 30); - c.add_bucket(&umb1); - - MixedBucket b(-100, 1); - //b.add_item(-2, ub1.get_weight()); - b.add_item(-4, umb1.get_weight()); - //b.add_item(-2, ub2.get_weight()); - //b.add_item(-3, ub3.get_weight()); - } - - if (0) { - int bucket = -1; - MixedBucket *root = new MixedBucket(bucket--, 2); - - for (int i=0; i<5; i++) { - MixedBucket *b = new MixedBucket(bucket--, 1); - - int n = 5; - - if (1) { - // add n buckets of n disks - for (int j=0; jadd_item(disks[k], 10); - - //b->add_item(disks[j], 10); - c.add_bucket(d); - b->add_item(d->get_id(), d->get_weight()); - } - - c.add_bucket(b); - root->add_item(b->get_id(), b->get_weight()); - } else { - // add n*n disks - make_disks(n*n, ndisks, disks); - for (int k=0; kadd_item(disks[k], 10); - - c.add_bucket(b); - root->add_item(b->get_id(), b->get_weight()); - } - } - - c.add_bucket(root); - } - - - if (1) { - vector wid; - for (int d=0; d<5; d++) - wid.push_back(10); - root = make_hierarchy(c, wid, ndisks, nbuckets); - } - - - - // rule - int numrep = 1; - - Rule rule; - if (0) { - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, -100)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - } - if (1) { - /* - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, -4)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 2, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - */ - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 1, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - } - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - cout << ndisks << " disks, " << 1-nbuckets << " buckets" << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical 
pgs" << endl; - cout << "numrep is " << numrep << endl; - - - int place = 1000000; - int times = place / numpg; - if (!times) times = 1; - - cout << "looping " << times << " times" << endl; - - float tvar = 0; - int tvarnum = 0; - - int x = 0; - for (int t=0; t v(numrep); - - for (int z=0; z -using namespace std; - -int print(string s) { - filepath fp = s; - cout << "s = " << s << " filepath = " << fp << endl; - cout << " depth " << fp.depth() << endl; - for (int i=0; i -#include -#include -using namespace std; - -#include "config.h" -#include "messages/MPing.h" -#include "common/Mutex.h" - -#include "msg/MPIMessenger.h" - -class Pinger : public Dispatcher { -public: - Messenger *messenger; - Pinger(Messenger *m) : messenger(m) { - m->set_dispatcher(this); - } - void dispatch(Message *m) { - //dout(1) << "got incoming " << m << endl; - delete m; - - } -}; - -int main(int argc, char **argv) { - int num = 1000; - - int myrank = mpimessenger_init(argc, argv); - int world = mpimessenger_world(); - - Pinger *p = new Pinger( new MPIMessenger(myrank) ); - - mpimessenger_start(); - - //while (1) { - for (int i=0; i<10000; i++) { - - // ping random nodes - int d = rand() % world; - if (d != myrank) { - //cout << "sending " << i << " to " << d << endl; - p->messenger->send_message(new MPing(), d); - } - - } - - - //cout << "shutting down" << endl; - //p->messenger->shutdown(); - - mpimessenger_wait(); - mpimessenger_shutdown(); // shutdown MPI -} diff --git a/branches/sage/mds/test/testnewbuffers.cc b/branches/sage/mds/test/testnewbuffers.cc deleted file mode 100644 index 0fea7571a4572..0000000000000 --- a/branches/sage/mds/test/testnewbuffers.cc +++ /dev/null @@ -1,91 +0,0 @@ - -#include -#include -using namespace std; - - -#include "include/newbuffer.h" -//#include "include/bufferlist.h" - -#include "common/Thread.h" - - - class Th : public Thread { - public: - bufferlist bl; - Th(bufferlist& o) : bl(o) { } - - void *entry() { - //cout << "start" << endl; - // thrash it a 
bit. - for (int n=0; n<10000; n++) { - bufferlist bl2; - unsigned off = rand() % (bl.length() -1); - unsigned len = 1 + rand() % (bl.length() - off - 1); - bl2.substr_of(bl, off, len); - bufferlist bl3; - bl3.append(bl); - bl3.append(bl2); - //cout << bl3 << endl; - bl2.clear(); - bl3.clear(); - } - //cout << "end" << endl; - } - }; - -int main() -{ - - bufferptr p1 = buffer::copy("123456",7); - //bufferptr p1 = new buffer("123456",7); - bufferptr p2 = p1; - - cout << "p1 is '" << p1.c_str() << "'" << " " << p1 << endl; - cout << "p2 is '" << p2.c_str() << "'" << " " << p2 << endl; - - bufferptr p3 = buffer::copy("abcdef",7); - //bufferptr p3 = new buffer("abcdef",7); - - cout << "p3 is " << p3.c_str() << " " << p3 << endl; - - bufferlist bl; - bl.push_back(p2); - bl.push_back(p1); - bl.push_back(p3); - - cout << "bl is " << bl << endl; - - bufferlist took; - bl.splice(10,4,&took); - - cout << "took out " << took << ", leftover is " << bl << endl; - //cout << "len is " << bl.length() << endl; - - bufferlist bl2; - bl2.substr_of(bl, 3, 5); - cout << "bl2 is " << bl2 << endl; - - - cout << "bl before " << bl << endl; - - list ls; - for (int t=0; t<40; t++) { - Th *t = new Th(bl); - cout << "create" << endl; - t->create(); - ls.push_back(t); - } - - bl.clear(); - - while (!ls.empty()) { - cout << "join" << endl; - ls.front()->join(); - delete ls.front(); - ls.pop_front(); - } - - cout << "bl after " << bl << endl; - -} diff --git a/branches/sage/mds/test/testos.cc b/branches/sage/mds/test/testos.cc deleted file mode 100644 index 24c81590d899c..0000000000000 --- a/branches/sage/mds/test/testos.cc +++ /dev/null @@ -1,343 +0,0 @@ -/* testos.cc -- simple ObjectStore test harness. - Copyright (C) 2007 Casey Marshall - -Ceph - scalable distributed file system - -This is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License version 2.1, as published by the Free Software -Foundation. See file COPYING. 
*/ - - -#include "osd/ObjectStore.h" -#include "ebofs/Ebofs.h" -#include "osbdb/OSBDB.h" -#include "include/buffer.h" - -#include -#include -#include - -#include -#include - -using namespace std; - -static inline unsigned long long -to_usec (struct timeval &time) -{ - return (((unsigned long long) time.tv_sec * 1000000) - + ((unsigned long long) time.tv_usec)); -} - -static inline unsigned long long -to_msec (struct timeval &time) -{ - return (((unsigned long long) time.tv_sec * 1000) - + ((unsigned long long) time.tv_usec / 1000)); -} - -int main (int argc, char **argv) -{ - vector args; - char *osd_name = "ebofs"; - unsigned object_size = 1024; - unsigned object_count = 1024; - unsigned write_iter = 64; - unsigned random_seed = ::time(NULL); - char *device = "/tmp/testos"; - char *mountcmd = "mount /tmp/testos"; - char *umountcmd = "umount /tmp/testos"; - - bool ebofs_raw_device = false; - bool inhibit_remount = (getenv("TESTOS_INHIBIT_REMOUNT") != NULL); - - if (argc > 1 - && (strcmp (argv[1], "-h") == 0 - || strcmp (argv[1], "-help") == 0 - || strcmp (argv[1], "--help") == 0)) - { - cout << "usage: " << argv[0] << " [store [object-size [object-count [iterations [seed]]]]]" << endl; - cout << endl; - cout << "Where the arguments are:" << endl << endl; - cout << " store -- store type; default \"ebofs\"" << endl; - cout << " object-size -- size of objects; default 1024" << endl; - cout << " object-count -- number of objects to write; default 1024" - << endl; - cout << " iterations -- write the objects that many times; default 5" - << endl; - cout << " seed -- random seed; default current time" << endl; - exit (0); - } - - argv_to_vec (argc, argv, args); - for (vector::iterator it = args.begin(); it != args.end(); - it++) - cout << *it << " "; - cout << endl; - parse_config_options (args); - for (vector::iterator it = args.begin(); it != args.end(); - it++) - cout << *it << " "; - cout << endl; - - argc = args.size(); - if (argc > 0) - osd_name = args[0]; - if 
(argc > 1) - object_size = (unsigned) atol (args[1]); - if (argc > 2) - object_count = (unsigned) atol (args[2]); - if (argc > 3) - write_iter = (unsigned) atol (args[3]); - if (argc > 4) - random_seed = (unsigned) atol (args[4]); - - // algin object size to 'long' - object_size = ((object_size + (sizeof (long) - 1)) / sizeof (long)) * sizeof (long); - - char *osd_file = new char[32]; - strcpy (osd_file, "/tmp/testos/testos.XXXXXX"); - mktemp (osd_file); - - if (strcasecmp (osd_name, "ebofs") == 0) - { - char *dev_env = getenv ("TESTOS_EBOFS_DEV"); - if (dev_env != NULL) - { - // Assume it is a true device. - strncpy (osd_file, dev_env, 32); - inhibit_remount = true; - ebofs_raw_device = true; - } - } - - if (!inhibit_remount) - { - if (system (mountcmd) != 0) - { - cerr << "mount failed" << endl; - exit (1); - } - } - - ObjectStore *os = NULL; - if (strcasecmp (osd_name, "ebofs") == 0) - { - if (!ebofs_raw_device) - { - FILE *f = fopen (osd_file, "w"); - if (f == NULL) - { - cerr << "failed to open " << osd_file << ": " << strerror (errno) - << endl; - exit (1); - } - // 1G file. 
- fseek (f, 1024 * 1024 * 1024, SEEK_SET); - fputc ('\0', f); - fclose (f); - } - os = new Ebofs (osd_file); - } - else if (strcasecmp (osd_name, "osbdb") == 0) - { - os = new OSBDB (osd_file); - } - else if (strcasecmp (osd_name, "osbdb-btree") == 0) - { - g_conf.bdbstore_btree = true; - os = new OSBDB (osd_file); - } - else - { - cerr << "I don't know about object store \"" << osd_name << "\"" - << endl; - exit (1); - } - - cout << "Writing " << object_count << " objects of size " - << object_size << " to " << osd_name << endl; - - char *val = (char *) malloc (object_size); - char *val2 = (char *) malloc (object_size); - auto_ptr valptr (val); - auto_ptr valptr2(val2); - if (getenv ("TESTOS_UNALIGNED") != NULL) - { - val = val + 1; - val2 = val2 + 1; - } - - for (unsigned i = 0; i < object_size; i++) - { - val[i] = (char) i; - val2[i] = (char) i; - } - object_t *oids = new object_t[object_count]; - - utime_t writes[write_iter]; - utime_t total_write; - utime_t reads[write_iter]; - utime_t total_read; - for (unsigned i = 0; i < write_iter; i++) - { - cerr << "Iteration " << i << endl; - - int ret = os->mkfs(); - if (ret != 0) - { - cerr << "mkfs(" << osd_file << "): " << strerror (-ret) << endl; - exit (1); - } - ret = os->mount(); - if (ret != 0) - { - cerr << "mount(): " << strerror (-ret) << endl; - exit (1); - } - - srandom (random_seed + i); - - for (unsigned j = 0; j < object_count; j++) - { - oids[j].ino = (uint64_t) random() << 32 | random(); - oids[j].bno = random(); - } - - utime_t begin = g_clock.now(); - for (unsigned o = 0; o < object_count; o++) - { - bufferptr bp (val, object_size); - bufferlist bl; - bl.push_back (bp); - int ret; - if ((ret = os->write (oids[o], 0L, object_size, bl, NULL)) < 0) - cerr << "write " << oids[o] << " failed: " - << strerror (-ret) << endl; - } - os->sync(); - - utime_t end = g_clock.now() - begin; - - cerr << "Write finished in " << end << endl; - total_write += end; - writes[i] = end; - - os->umount(); - sync(); - - if 
(!inhibit_remount) - { - if (system (umountcmd) != 0) - { - cerr << "umount failed" << endl; - exit (1); - } - - if (system (mountcmd) != 0) - { - cerr << "mount(2) failed" << endl; - exit (1); - } - } - - os->mount(); - - // Shuffle the OIDs. - for (int j = 0; j < object_count; j++) - { - int x = random() % object_count; - if (x < 0) - x = -x; - object_t o = oids[j]; - oids[j] = oids[x]; - oids[x] = o; - } - - begin = g_clock.now(); - for (unsigned o = 0; o < object_count; o++) - { - bufferptr bp (val2, object_size); - bufferlist bl; - bl.push_back (bp); - - if (os->read (oids[o], 0L, object_size, bl) < 0) - { - cerr << "object " << oids[o] << " not found!" << endl; - } - } - end = g_clock.now() - begin; - - cerr << "Read finished in " << end << endl; - total_read += end; - reads[i] = end; - - os->umount(); - sync(); - - if (!inhibit_remount) - { - if (system (umountcmd) != 0) - { - cerr << "umount(2) failed" << endl; - exit (1); - } - - if (system (mountcmd) != 0) - { - cerr << "mount(3) failed" << endl; - exit (1); - } - } - } - - cerr << "Finished in " << (total_write + total_read) << endl; - - double write_mean = ((double) total_write) / ((double) write_iter); - double write_sd = 0.0; - for (unsigned i = 0; i < write_iter; i++) - { - double x = ((double) writes[i]) - write_mean; - write_sd += x * x; - } - write_sd = sqrt (write_sd / ((double) write_iter)); - - double read_mean = ((double) total_read) / ((double) write_iter); - double read_sd = 0.0; - for (unsigned i = 0; i < write_iter; i++) - { - double x = ((double) reads[i]) - read_mean; - write_sd += x * x; - } - read_sd = sqrt (read_sd / ((double) write_iter)); - - cout << "TESTOS: write " << osd_name << ":" << object_size << ":" - << object_count << ":" << write_iter << ":" << random_seed - << " -- " << write_mean << " " << write_sd << endl; - - cout << "TESTOS: write.raw -- "; - for (int i = 0; i < write_iter; i++) - cout << ((double) writes[i]) << " "; - cout << endl; - - cout << "TESTOS: read " << 
osd_name << ":" << object_size << ":" - << object_count << ":" << write_iter << ":" << random_seed - << " -- " << read_mean << " " << read_sd << endl; - - cout << "TESTOS: read.raw -- "; - for (int i = 0; i < write_iter; i++) - cout << ((double) reads[i]) << " "; - cout << endl; - - unlink (osd_file); - if (!inhibit_remount) - { - if (system (umountcmd) != 0) - { - cerr << "umount(3) failed" << endl; - exit (1); - } - } - exit (0); -} diff --git a/branches/sage/mds/test/testosbdb.cc b/branches/sage/mds/test/testosbdb.cc deleted file mode 100644 index 19268e7587531..0000000000000 --- a/branches/sage/mds/test/testosbdb.cc +++ /dev/null @@ -1,347 +0,0 @@ -/* testosbdb.cc -- test OSBDB. - Copyright (C) 2007 Casey Marshall */ - - -#include -#include "osbdb/OSBDB.h" - -using namespace std; - -int -main (int argc, char **argv) -{ - vector args; - argv_to_vec (argc, argv, args); - parse_config_options (args); - - g_conf.debug_bdbstore = 10; - //g_conf.bdbstore_btree = true; - char dbfile[256]; - strncpy (dbfile, "/tmp/testosbdb/db.XXXXXX", 256); - mktemp (dbfile); - OSBDB *os = new OSBDB(dbfile); - auto_ptr osPtr (os); - os->mkfs(); - os->mount(); - - // Put an object. - object_t oid (0xDEADBEEF00000000ULL, 0xFEEDFACE); - - cout << "sizeof oid_t is " << sizeof (oid_t) << endl; - cout << "offsetof oid_t.id " << offsetof (oid_t, id) << endl; - - cout << sizeof (object_t) << endl; - cout << sizeof (oid.ino) << endl; - cout << sizeof (oid.bno) << endl; - cout << sizeof (oid.rev) << endl; - - // Shouldn't be there. - if (os->exists (oid)) - { - cout << "FAIL: oid shouldn't be there " << oid << endl; - } - - // Write an object. - char *x = (char *) malloc (1024); - memset(x, 0xaa, 1024); - bufferptr bp (x, 1024); - bufferlist bl; - bl.push_back (bp); - - if (os->write (oid, 0L, 1024, bl, NULL) != 1024) - { - cout << "FAIL: writing object" << endl; - } - - os->sync(); - - // Should be there. 
- if (!os->exists (oid)) - { - cout << "FAIL: oid should be there: " << oid << endl; - } - - memset(x, 0, 1024); - if (os->read (oid, 0, 1024, bl) != 1024) - { - cout << "FAIL: reading object" << endl; - } - - for (int i = 0; i < 1024; i++) - { - if ((x[i] & 0xFF) != 0xaa) - { - cout << "FAIL: data read out is different" << endl; - break; - } - } - - // Set some attributes - if (os->setattr (oid, "alpha", "value", strlen ("value")) != 0) - { - cout << "FAIL: set attribute" << endl; - } - if (os->setattr (oid, "beta", "value", strlen ("value")) != 0) - { - cout << "FAIL: set attribute" << endl; - } - if (os->setattr (oid, "gamma", "value", strlen ("value")) != 0) - { - cout << "FAIL: set attribute" << endl; - } - if (os->setattr (oid, "fred", "value", strlen ("value")) != 0) - { - cout << "FAIL: set attribute" << endl; - } - - char *attrs = (char *) malloc (1024); - if (os->listattr (oid, attrs, 1024) != 0) - { - cout << "FAIL: listing attributes" << endl; - } - else - { - char *p = attrs; - if (strcmp (p, "alpha") != 0) - { - cout << "FAIL: should be \"alpha:\" \"" << p << "\"" << endl; - } - p = p + strlen (p) + 1; - if (strcmp (p, "beta") != 0) - { - cout << "FAIL: should be \"beta:\" \"" << p << "\"" << endl; - } - p = p + strlen (p) + 1; - if (strcmp (p, "fred") != 0) - { - cout << "FAIL: should be \"fred:\" \"" << p << "\"" << endl; - } - p = p + strlen (p) + 1; - if (strcmp (p, "gamma") != 0) - { - cout << "FAIL: should be \"gamma:\" \"" << p << "\"" << endl; - } - } - - char attrvalue[256]; - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "alpha", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr alpha" << endl; - } - else if (strncmp ("value", attrvalue, strlen("value")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "fred", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr fred" << endl; - } - else if (strncmp ("value", 
attrvalue, strlen("value")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "beta", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr beta" << endl; - } - else if (strncmp ("value", attrvalue, strlen("value")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "gamma", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr gamma" << endl; - } - else if (strncmp ("value", attrvalue, strlen("value")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - - if (os->setattr (oid, "alpha", "different", strlen("different")) != 0) - cout << "FAIL: setattr overwrite" << endl; - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "alpha", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr alpha" << endl; - } - else if (strncmp ("different", attrvalue, strlen("different")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - - if (os->rmattr (oid, "alpha") != 0) - { - cout << "FAIL: rmattr alpha" << endl; - } - if (os->rmattr (oid, "fred") != 0) - { - cout << "FAIL: rmattr fred" << endl; - } - if (os->rmattr (oid, "beta") != 0) - { - cout << "FAIL: rmattr beta" << endl; - } - if (os->rmattr (oid, "gamma") != 0) - { - cout << "FAIL: rmattr gamma" << endl; - } - - coll_t cid = 0xCAFEBABE; - if (os->create_collection (cid) != 0) - { - cout << "FAIL: create_collection" << endl; - } - if (os->create_collection (cid + 10) != 0) - { - cout << "FAIL: create_collection" << endl; - } - if (os->create_collection (cid + 5) != 0) - { - cout << "FAIL: create_collection" << endl; - } - if (os->create_collection (42) != 0) - { - cout << "FAIL: create_collection" << endl; - } - - if (os->collection_add (cid, oid) != 0) - { - cout << "FAIL: collection_add" << endl; - } - - list ls; - if (os->list_collections (ls) < 0) - { - cout << 
"FAIL: list_collections" << endl; - } - cout << "collections: "; - for (list::iterator it = ls.begin(); it != ls.end(); it++) - { - cout << *it << ", "; - } - cout << endl; - - if (os->destroy_collection (0xCAFEBABE + 10) != 0) - { - cout << "FAIL: destroy_collection" << endl; - } - - if (os->destroy_collection (0xCAFEBADE + 10) == 0) - { - cout << "FAIL: destroy_collection" << endl; - } - - object_t oid2 (12345, 12345); - for (int i = 0; i < 8; i++) - { - oid2.rev++; - if (os->collection_add (cid, oid2) != 0) - { - cout << "FAIL: collection_add" << endl; - } - } - for (int i = 0; i < 8; i++) - { - if (os->collection_remove (cid, oid2) != 0) - { - cout << "FAIL: collection_remove" << endl; - } - oid2.rev--; - } - - if (os->collection_setattr (cid, "alpha", "value", 5) != 0) - cout << "FAIL: collection_setattr" << endl; - if (os->collection_setattr (cid, "beta", "value", 5) != 0) - cout << "FAIL: collection_setattr" << endl; - if (os->collection_setattr (cid, "gamma", "value", 5) != 0) - cout << "FAIL: collection_setattr" << endl; - if (os->collection_setattr (cid, "fred", "value", 5) != 0) - cout << "FAIL: collection_setattr" << endl; - - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "alpha", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "beta", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "gamma", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << 
endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "fred", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - - if (os->collection_setattr (cid, "alpha", "eulavvalue", 10) != 0) - cout << "FAIL: collection setattr overwrite" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "alpha", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "eulavvalue", 10) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "beta", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "gamma", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "fred", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - - if (os->collection_rmattr (cid, "alpha") != 0) - cout << "FAIL: collection_rmattr" << endl; - if (os->collection_rmattr (cid, "fred") != 0) - cout << "FAIL: collection_rmattr" << endl; - if (os->collection_rmattr (cid, "beta") != 0) - cout << "FAIL: collection_rmattr" << endl; - if (os->collection_rmattr (cid, "gamma") != 0) - cout << "FAIL: collection_rmattr" << endl; - - if (os->collection_rmattr (cid, "alpha") == 0) - cout << "FAIL: 
collection_rmattr (nonexistent)" << endl; - - // Truncate the object. - if (os->truncate (oid, 512, NULL) != 0) - { - cout << "FAIL: truncate" << endl; - } - - // Expand the object. - if (os->truncate (oid, 1200, NULL) != 0) - { - cout << "FAIL: expand" << endl; - } - - // Delete the object. - if (os->remove (oid) != 0) - { - cout << "FAIL: could not remove object" << endl; - } - - // Shouldn't be there - if (os->exists (oid)) - { - cout << "FAIL: should not be there" << endl; - } - - os->sync(); - exit (0); -} diff --git a/branches/sage/mds/test/testtree.cc b/branches/sage/mds/test/testtree.cc deleted file mode 100644 index 2c21bcbe52e25..0000000000000 --- a/branches/sage/mds/test/testtree.cc +++ /dev/null @@ -1,46 +0,0 @@ - - -#include "../crush/BinaryTree.h" -using namespace crush; - -#include -#include -using namespace std; - -int main() -{ - BinaryTree t; - - vector nodes; - - for (int i=0; i<30; i++) { - cout << "adding " << i << endl; - int n = t.add_node(1); - nodes.push_back(n); - //cout << t << endl; - } - cout << t << endl; - - for (int k=0; k<10000; k++) { - if (rand() % 2) { - cout << "adding" << endl; - nodes.push_back( t.add_node(1) ); - } else { - if (!nodes.empty()) { - //for (int i=0; i -using namespace std; - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -int main(int argc, char**argv) -{ - int a = 1; - int b = 2; - - mknod("test", 0600, 0); - - cout << "setxattr " << setxattr("test", "asdf", &a, sizeof(a), 0) << endl; - cout << "errno " << errno << " " << strerror(errno) << endl; - cout << "getxattr " << getxattr("test", "asdf", &b, sizeof(b)) << endl; - cout << "errno " << errno << " " << strerror(errno) << endl; - cout << "a is " << a << " and b is " << b << endl; - return 0; -} diff --git a/branches/sage/mds/valgrind.supp b/branches/sage/mds/valgrind.supp deleted file mode 100644 index 356df039050c4..0000000000000 --- a/branches/sage/mds/valgrind.supp +++ /dev/null @@ -1,62 
+0,0 @@ -# some valgrind suppressions -# to load these automagically, -# cat > ~/.valgrindrc -# --suppressions=valgrind.supp -# - - -# this one makes valgrind shut up about what appears to be a bug in libc's writev. -{ - writev uninit bytes thing -sage - Memcheck:Param - writev(vector[...]) - fun:writev - fun:_ZN11BlockDevice6_writeEijjRN6buffer4listE - fun:_ZN11BlockDevice5do_ioEiRSt4listIPNS_6biovecESaIS2_EE - fun:_ZN11BlockDevice15io_thread_entryEv - fun:_ZN11BlockDevice8IOThread5entryEv - fun:_ZN6Thread11_entry_funcEPv - fun:start_thread - fun:clone - obj:* - obj:* - obj:* - obj:* -} - -# gethostbyname -{ - gethostbyname on issdm - Memcheck:Param - socketcall.sendto(msg) - fun:send - fun:get_mapping - fun:__nscd_get_map_ref - fun:nscd_gethst_r - fun:__nscd_gethostbyname_r - fun:gethostbyname_r@@GLIBC_2.2.5 - fun:gethostbyname - fun:_ZN4Rank8Accepter5startEv - fun:_ZN4Rank10start_rankEv - fun:main -} - -# gethostbyname - -{ - gethostbyname on foil - Memcheck:Addr8 - obj:/lib/ld-2.6.1.so - obj:/lib/ld-2.6.1.so - obj:/lib/ld-2.6.1.so - obj:/lib/ld-2.6.1.so - obj:/lib/ld-2.6.1.so - obj:/lib/ld-2.6.1.so - obj:/lib/ld-2.6.1.so - obj:/lib/libc-2.6.1.so - obj:/lib/ld-2.6.1.so - fun:__libc_dlopen_mode - fun:__nss_lookup_function - obj:/lib/libc-2.6.1.so -} - diff --git a/trunk/fusetrace/Makefile b/fusetrace/Makefile similarity index 100% rename from trunk/fusetrace/Makefile rename to fusetrace/Makefile diff --git a/trunk/fusetrace/fusetrace_ll.cc b/fusetrace/fusetrace_ll.cc similarity index 100% rename from trunk/fusetrace/fusetrace_ll.cc rename to fusetrace/fusetrace_ll.cc diff --git a/branches/marnberg/quota/COPYING b/src/COPYING similarity index 100% rename from branches/marnberg/quota/COPYING rename to src/COPYING diff --git a/trunk/ceph/Makefile b/src/Makefile similarity index 100% rename from trunk/ceph/Makefile rename to src/Makefile diff --git a/branches/marnberg/quota/README b/src/README similarity index 100% rename from branches/marnberg/quota/README rename to 
src/README diff --git a/trunk/ceph/TODO b/src/TODO similarity index 100% rename from trunk/ceph/TODO rename to src/TODO diff --git a/trunk/ceph/active/README b/src/active/README similarity index 100% rename from trunk/ceph/active/README rename to src/active/README diff --git a/branches/sage/crush/active/activemaster.cc b/src/active/activemaster.cc similarity index 100% rename from branches/sage/crush/active/activemaster.cc rename to src/active/activemaster.cc diff --git a/branches/sage/crush/active/activemaster.h b/src/active/activemaster.h similarity index 100% rename from branches/sage/crush/active/activemaster.h rename to src/active/activemaster.h diff --git a/trunk/ceph/active/activeslave.cc b/src/active/activeslave.cc similarity index 100% rename from trunk/ceph/active/activeslave.cc rename to src/active/activeslave.cc diff --git a/trunk/ceph/active/activeslave.h b/src/active/activeslave.h similarity index 100% rename from trunk/ceph/active/activeslave.h rename to src/active/activeslave.h diff --git a/branches/sage/crush/active/activetaskd.cc b/src/active/activetaskd.cc similarity index 100% rename from branches/sage/crush/active/activetaskd.cc rename to src/active/activetaskd.cc diff --git a/branches/sage/crush/active/activetaskd.h b/src/active/activetaskd.h similarity index 100% rename from branches/sage/crush/active/activetaskd.h rename to src/active/activetaskd.h diff --git a/branches/sage/crush/active/client_init.cc b/src/active/client_init.cc similarity index 100% rename from branches/sage/crush/active/client_init.cc rename to src/active/client_init.cc diff --git a/branches/sage/crush/active/client_init.h b/src/active/client_init.h similarity index 100% rename from branches/sage/crush/active/client_init.h rename to src/active/client_init.h diff --git a/trunk/ceph/active/common.h b/src/active/common.h similarity index 100% rename from trunk/ceph/active/common.h rename to src/active/common.h diff --git a/branches/sage/crush/active/echotestclient.cc 
b/src/active/echotestclient.cc similarity index 100% rename from branches/sage/crush/active/echotestclient.cc rename to src/active/echotestclient.cc diff --git a/branches/sage/crush/active/echotestclient.h b/src/active/echotestclient.h similarity index 100% rename from branches/sage/crush/active/echotestclient.h rename to src/active/echotestclient.h diff --git a/branches/sage/crush/active/inet.h b/src/active/inet.h similarity index 100% rename from branches/sage/crush/active/inet.h rename to src/active/inet.h diff --git a/trunk/ceph/active/msgtestclient.cc b/src/active/msgtestclient.cc similarity index 100% rename from trunk/ceph/active/msgtestclient.cc rename to src/active/msgtestclient.cc diff --git a/trunk/ceph/active/msgtestclient.h b/src/active/msgtestclient.h similarity index 100% rename from trunk/ceph/active/msgtestclient.h rename to src/active/msgtestclient.h diff --git a/branches/sage/crush/active/trivial_task.cc b/src/active/trivial_task.cc similarity index 100% rename from branches/sage/crush/active/trivial_task.cc rename to src/active/trivial_task.cc diff --git a/branches/sage/crush/active/trivial_task.h b/src/active/trivial_task.h similarity index 100% rename from branches/sage/crush/active/trivial_task.h rename to src/active/trivial_task.h diff --git a/trunk/ceph/active/utility.h b/src/active/utility.h similarity index 100% rename from trunk/ceph/active/utility.h rename to src/active/utility.h diff --git a/branches/sage/crush/cfuse.cc b/src/cfuse.cc similarity index 100% rename from branches/sage/crush/cfuse.cc rename to src/cfuse.cc diff --git a/trunk/ceph/client/Client.cc b/src/client/Client.cc similarity index 100% rename from trunk/ceph/client/Client.cc rename to src/client/Client.cc diff --git a/trunk/ceph/client/Client.h b/src/client/Client.h similarity index 100% rename from trunk/ceph/client/Client.h rename to src/client/Client.h diff --git a/branches/sage/crush/client/FileCache.cc b/src/client/FileCache.cc similarity index 100% rename from 
branches/sage/crush/client/FileCache.cc rename to src/client/FileCache.cc diff --git a/branches/sage/crush/client/FileCache.h b/src/client/FileCache.h similarity index 100% rename from branches/sage/crush/client/FileCache.h rename to src/client/FileCache.h diff --git a/trunk/ceph/client/SyntheticClient.cc b/src/client/SyntheticClient.cc similarity index 100% rename from trunk/ceph/client/SyntheticClient.cc rename to src/client/SyntheticClient.cc diff --git a/trunk/ceph/client/SyntheticClient.h b/src/client/SyntheticClient.h similarity index 100% rename from trunk/ceph/client/SyntheticClient.h rename to src/client/SyntheticClient.h diff --git a/branches/sage/crush/client/Trace.cc b/src/client/Trace.cc similarity index 100% rename from branches/sage/crush/client/Trace.cc rename to src/client/Trace.cc diff --git a/branches/sage/crush/client/Trace.h b/src/client/Trace.h similarity index 100% rename from branches/sage/crush/client/Trace.h rename to src/client/Trace.h diff --git a/branches/sage/crush/client/fuse.cc b/src/client/fuse.cc similarity index 100% rename from branches/sage/crush/client/fuse.cc rename to src/client/fuse.cc diff --git a/branches/sage/crush/client/fuse.h b/src/client/fuse.h similarity index 100% rename from branches/sage/crush/client/fuse.h rename to src/client/fuse.h diff --git a/branches/sage/crush/client/fuse_ll.cc b/src/client/fuse_ll.cc similarity index 100% rename from branches/sage/crush/client/fuse_ll.cc rename to src/client/fuse_ll.cc diff --git a/branches/sage/crush/client/fuse_ll.h b/src/client/fuse_ll.h similarity index 100% rename from branches/sage/crush/client/fuse_ll.h rename to src/client/fuse_ll.h diff --git a/branches/sage/crush/client/hadoop/CephFSInterface.cc b/src/client/hadoop/CephFSInterface.cc similarity index 100% rename from branches/sage/crush/client/hadoop/CephFSInterface.cc rename to src/client/hadoop/CephFSInterface.cc diff --git a/branches/sage/crush/client/hadoop/CephFSInterface.h 
b/src/client/hadoop/CephFSInterface.h similarity index 100% rename from branches/sage/crush/client/hadoop/CephFSInterface.h rename to src/client/hadoop/CephFSInterface.h diff --git a/branches/sage/crush/client/ldceph.cc b/src/client/ldceph.cc similarity index 100% rename from branches/sage/crush/client/ldceph.cc rename to src/client/ldceph.cc diff --git a/branches/sage/crush/cmds.cc b/src/cmds.cc similarity index 100% rename from branches/sage/crush/cmds.cc rename to src/cmds.cc diff --git a/branches/sage/ebofs2/cmon.cc b/src/cmon.cc similarity index 100% rename from branches/sage/ebofs2/cmon.cc rename to src/cmon.cc diff --git a/branches/sage/crush/cmonctl.cc b/src/cmonctl.cc similarity index 100% rename from branches/sage/crush/cmonctl.cc rename to src/cmonctl.cc diff --git a/branches/sage/crush/common/Clock.cc b/src/common/Clock.cc similarity index 100% rename from branches/sage/crush/common/Clock.cc rename to src/common/Clock.cc diff --git a/trunk/ceph/common/Clock.h b/src/common/Clock.h similarity index 100% rename from trunk/ceph/common/Clock.h rename to src/common/Clock.h diff --git a/branches/sage/crush/common/Cond.h b/src/common/Cond.h similarity index 100% rename from branches/sage/crush/common/Cond.h rename to src/common/Cond.h diff --git a/branches/sage/crush/common/DecayCounter.h b/src/common/DecayCounter.h similarity index 100% rename from branches/sage/crush/common/DecayCounter.h rename to src/common/DecayCounter.h diff --git a/branches/sage/crush/common/LogType.h b/src/common/LogType.h similarity index 100% rename from branches/sage/crush/common/LogType.h rename to src/common/LogType.h diff --git a/trunk/ceph/common/Logger.cc b/src/common/Logger.cc similarity index 100% rename from trunk/ceph/common/Logger.cc rename to src/common/Logger.cc diff --git a/branches/sage/crush/common/Logger.h b/src/common/Logger.h similarity index 100% rename from branches/sage/crush/common/Logger.h rename to src/common/Logger.h diff --git 
a/branches/sage/crush/common/Mutex.h b/src/common/Mutex.h similarity index 100% rename from branches/sage/crush/common/Mutex.h rename to src/common/Mutex.h diff --git a/branches/sage/crush/common/RWLock.h b/src/common/RWLock.h similarity index 100% rename from branches/sage/crush/common/RWLock.h rename to src/common/RWLock.h diff --git a/branches/sage/crush/common/Semaphore.h b/src/common/Semaphore.h similarity index 100% rename from branches/sage/crush/common/Semaphore.h rename to src/common/Semaphore.h diff --git a/branches/sage/crush/common/Thread.h b/src/common/Thread.h similarity index 100% rename from branches/sage/crush/common/Thread.h rename to src/common/Thread.h diff --git a/branches/sage/crush/common/ThreadPool.h b/src/common/ThreadPool.h similarity index 100% rename from branches/sage/crush/common/ThreadPool.h rename to src/common/ThreadPool.h diff --git a/branches/sage/crush/common/Timer.cc b/src/common/Timer.cc similarity index 100% rename from branches/sage/crush/common/Timer.cc rename to src/common/Timer.cc diff --git a/branches/sage/crush/common/Timer.h b/src/common/Timer.h similarity index 100% rename from branches/sage/crush/common/Timer.h rename to src/common/Timer.h diff --git a/trunk/ceph/config.cc b/src/config.cc similarity index 100% rename from trunk/ceph/config.cc rename to src/config.cc diff --git a/trunk/ceph/config.h b/src/config.h similarity index 100% rename from trunk/ceph/config.h rename to src/config.h diff --git a/branches/sage/crush/cosd.cc b/src/cosd.cc similarity index 100% rename from branches/sage/crush/cosd.cc rename to src/cosd.cc diff --git a/branches/sage/crush/crush.old/BinaryTree.h b/src/crush.old/BinaryTree.h similarity index 100% rename from branches/sage/crush/crush.old/BinaryTree.h rename to src/crush.old/BinaryTree.h diff --git a/branches/sage/crush/crush.old/Bucket.h b/src/crush.old/Bucket.h similarity index 100% rename from branches/sage/crush/crush.old/Bucket.h rename to src/crush.old/Bucket.h diff --git 
a/branches/sage/crush/crush.old/Hash.h b/src/crush.old/Hash.h similarity index 100% rename from branches/sage/crush/crush.old/Hash.h rename to src/crush.old/Hash.h diff --git a/branches/sage/crush/crush.old/crush.h b/src/crush.old/crush.h similarity index 100% rename from branches/sage/crush/crush.old/crush.h rename to src/crush.old/crush.h diff --git a/branches/marnberg/quota/crush/test/bucket_movement.cc b/src/crush.old/test/bucket_movement.cc similarity index 100% rename from branches/marnberg/quota/crush/test/bucket_movement.cc rename to src/crush.old/test/bucket_movement.cc diff --git a/branches/marnberg/quota/crush/test/bucket_variance.cc b/src/crush.old/test/bucket_variance.cc similarity index 100% rename from branches/marnberg/quota/crush/test/bucket_variance.cc rename to src/crush.old/test/bucket_variance.cc diff --git a/branches/marnberg/quota/crush/test/cluster_movement.cc b/src/crush.old/test/cluster_movement.cc similarity index 100% rename from branches/marnberg/quota/crush/test/cluster_movement.cc rename to src/crush.old/test/cluster_movement.cc diff --git a/branches/marnberg/quota/crush/test/cluster_movement_remove.cc b/src/crush.old/test/cluster_movement_remove.cc similarity index 100% rename from branches/marnberg/quota/crush/test/cluster_movement_remove.cc rename to src/crush.old/test/cluster_movement_remove.cc diff --git a/branches/marnberg/quota/crush/test/cluster_movement_rush.cc b/src/crush.old/test/cluster_movement_rush.cc similarity index 100% rename from branches/marnberg/quota/crush/test/cluster_movement_rush.cc rename to src/crush.old/test/cluster_movement_rush.cc diff --git a/branches/marnberg/quota/crush/test/creeping_failure.cc b/src/crush.old/test/creeping_failure.cc similarity index 100% rename from branches/marnberg/quota/crush/test/creeping_failure.cc rename to src/crush.old/test/creeping_failure.cc diff --git a/branches/marnberg/quota/crush/test/creeping_failure_variance.cc b/src/crush.old/test/creeping_failure_variance.cc 
similarity index 100% rename from branches/marnberg/quota/crush/test/creeping_failure_variance.cc rename to src/crush.old/test/creeping_failure_variance.cc diff --git a/branches/marnberg/quota/crush/test/depth_variance.cc b/src/crush.old/test/depth_variance.cc similarity index 100% rename from branches/marnberg/quota/crush/test/depth_variance.cc rename to src/crush.old/test/depth_variance.cc diff --git a/branches/marnberg/quota/crush/test/mixed.cc b/src/crush.old/test/mixed.cc similarity index 100% rename from branches/marnberg/quota/crush/test/mixed.cc rename to src/crush.old/test/mixed.cc diff --git a/branches/marnberg/quota/crush/test/movement.cc b/src/crush.old/test/movement.cc similarity index 100% rename from branches/marnberg/quota/crush/test/movement.cc rename to src/crush.old/test/movement.cc diff --git a/branches/marnberg/quota/crush/test/movement_failed.cc b/src/crush.old/test/movement_failed.cc similarity index 100% rename from branches/marnberg/quota/crush/test/movement_failed.cc rename to src/crush.old/test/movement_failed.cc diff --git a/branches/marnberg/quota/crush/test/overload.cc b/src/crush.old/test/overload.cc similarity index 100% rename from branches/marnberg/quota/crush/test/overload.cc rename to src/crush.old/test/overload.cc diff --git a/branches/marnberg/quota/crush/test/overload_variance.cc b/src/crush.old/test/overload_variance.cc similarity index 100% rename from branches/marnberg/quota/crush/test/overload_variance.cc rename to src/crush.old/test/overload_variance.cc diff --git a/branches/marnberg/quota/crush/test/sizes.cc b/src/crush.old/test/sizes.cc similarity index 100% rename from branches/marnberg/quota/crush/test/sizes.cc rename to src/crush.old/test/sizes.cc diff --git a/branches/marnberg/quota/crush/test/smallbucket.cc b/src/crush.old/test/smallbucket.cc similarity index 100% rename from branches/marnberg/quota/crush/test/smallbucket.cc rename to src/crush.old/test/smallbucket.cc diff --git 
a/branches/marnberg/quota/crush/test/speed_bucket.cc b/src/crush.old/test/speed_bucket.cc similarity index 100% rename from branches/marnberg/quota/crush/test/speed_bucket.cc rename to src/crush.old/test/speed_bucket.cc diff --git a/branches/marnberg/quota/crush/test/speed_depth.cc b/src/crush.old/test/speed_depth.cc similarity index 100% rename from branches/marnberg/quota/crush/test/speed_depth.cc rename to src/crush.old/test/speed_depth.cc diff --git a/branches/marnberg/quota/crush/test/speed_rush.cc b/src/crush.old/test/speed_rush.cc similarity index 100% rename from branches/marnberg/quota/crush/test/speed_rush.cc rename to src/crush.old/test/speed_rush.cc diff --git a/branches/marnberg/quota/crush/test/t.cc b/src/crush.old/test/t.cc similarity index 100% rename from branches/marnberg/quota/crush/test/t.cc rename to src/crush.old/test/t.cc diff --git a/branches/marnberg/quota/crush/test/testbucket.cc b/src/crush.old/test/testbucket.cc similarity index 100% rename from branches/marnberg/quota/crush/test/testbucket.cc rename to src/crush.old/test/testbucket.cc diff --git a/branches/marnberg/quota/crush/test/testnormal.cc b/src/crush.old/test/testnormal.cc similarity index 100% rename from branches/marnberg/quota/crush/test/testnormal.cc rename to src/crush.old/test/testnormal.cc diff --git a/trunk/ceph/crush/CrushWrapper.h b/src/crush/CrushWrapper.h similarity index 100% rename from trunk/ceph/crush/CrushWrapper.h rename to src/crush/CrushWrapper.h diff --git a/trunk/ceph/crush/Makefile b/src/crush/Makefile similarity index 100% rename from trunk/ceph/crush/Makefile rename to src/crush/Makefile diff --git a/branches/sage/crush/crush/buckets.c b/src/crush/buckets.c similarity index 100% rename from branches/sage/crush/crush/buckets.c rename to src/crush/buckets.c diff --git a/trunk/ceph/crush/builder.c b/src/crush/builder.c similarity index 100% rename from trunk/ceph/crush/builder.c rename to src/crush/builder.c diff --git a/trunk/ceph/crush/builder.h 
b/src/crush/builder.h similarity index 100% rename from trunk/ceph/crush/builder.h rename to src/crush/builder.h diff --git a/trunk/ceph/crush/crush.c b/src/crush/crush.c similarity index 100% rename from trunk/ceph/crush/crush.c rename to src/crush/crush.c diff --git a/trunk/ceph/crush/crush.h b/src/crush/crush.h similarity index 100% rename from trunk/ceph/crush/crush.h rename to src/crush/crush.h diff --git a/branches/sage/crush/crush/hash.h b/src/crush/hash.h similarity index 100% rename from branches/sage/crush/crush/hash.h rename to src/crush/hash.h diff --git a/trunk/ceph/crush/mapper.c b/src/crush/mapper.c similarity index 100% rename from trunk/ceph/crush/mapper.c rename to src/crush/mapper.c diff --git a/branches/sage/crush/crush/mapper.h b/src/crush/mapper.h similarity index 100% rename from branches/sage/crush/crush/mapper.h rename to src/crush/mapper.h diff --git a/trunk/ceph/crush/test.c b/src/crush/test.c similarity index 100% rename from trunk/ceph/crush/test.c rename to src/crush/test.c diff --git a/branches/sage/crush/crush/types.h b/src/crush/types.h similarity index 100% rename from branches/sage/crush/crush/types.h rename to src/crush/types.h diff --git a/branches/sage/crush/csyn.cc b/src/csyn.cc similarity index 100% rename from branches/sage/crush/csyn.cc rename to src/csyn.cc diff --git a/branches/sage/crush/doc/Commitdir.txt b/src/doc/Commitdir.txt similarity index 100% rename from branches/sage/crush/doc/Commitdir.txt rename to src/doc/Commitdir.txt diff --git a/branches/sage/crush/doc/anchortable.txt b/src/doc/anchortable.txt similarity index 100% rename from branches/sage/crush/doc/anchortable.txt rename to src/doc/anchortable.txt diff --git a/branches/marnberg/quota/doc/bdb.txt b/src/doc/bdb.txt similarity index 100% rename from branches/marnberg/quota/doc/bdb.txt rename to src/doc/bdb.txt diff --git a/branches/sage/crush/doc/caching.txt b/src/doc/caching.txt similarity index 100% rename from branches/sage/crush/doc/caching.txt rename 
to src/doc/caching.txt diff --git a/branches/sage/crush/doc/exports.txt b/src/doc/exports.txt similarity index 100% rename from branches/sage/crush/doc/exports.txt rename to src/doc/exports.txt diff --git a/branches/sage/crush/doc/header.txt b/src/doc/header.txt similarity index 100% rename from branches/sage/crush/doc/header.txt rename to src/doc/header.txt diff --git a/branches/marnberg/quota/doc/inos.txt b/src/doc/inos.txt similarity index 100% rename from branches/marnberg/quota/doc/inos.txt rename to src/doc/inos.txt diff --git a/branches/marnberg/quota/doc/lazy_posix.txt b/src/doc/lazy_posix.txt similarity index 100% rename from branches/marnberg/quota/doc/lazy_posix.txt rename to src/doc/lazy_posix.txt diff --git a/branches/sage/crush/doc/mds_locks.txt b/src/doc/mds_locks.txt similarity index 100% rename from branches/sage/crush/doc/mds_locks.txt rename to src/doc/mds_locks.txt diff --git a/branches/sage/crush/doc/modeline.txt b/src/doc/modeline.txt similarity index 100% rename from branches/sage/crush/doc/modeline.txt rename to src/doc/modeline.txt diff --git a/branches/marnberg/quota/doc/shared_write_states_nogo.txt b/src/doc/shared_write_states_nogo.txt similarity index 100% rename from branches/marnberg/quota/doc/shared_write_states_nogo.txt rename to src/doc/shared_write_states_nogo.txt diff --git a/trunk/ceph/dupstore.cc b/src/dupstore.cc similarity index 100% rename from trunk/ceph/dupstore.cc rename to src/dupstore.cc diff --git a/branches/sage/crush/ebofs/Allocator.cc b/src/ebofs/Allocator.cc similarity index 100% rename from branches/sage/crush/ebofs/Allocator.cc rename to src/ebofs/Allocator.cc diff --git a/branches/sage/crush/ebofs/Allocator.h b/src/ebofs/Allocator.h similarity index 100% rename from branches/sage/crush/ebofs/Allocator.h rename to src/ebofs/Allocator.h diff --git a/trunk/ceph/ebofs/BlockDevice.cc b/src/ebofs/BlockDevice.cc similarity index 100% rename from trunk/ceph/ebofs/BlockDevice.cc rename to src/ebofs/BlockDevice.cc diff 
--git a/trunk/ceph/ebofs/BlockDevice.h b/src/ebofs/BlockDevice.h similarity index 100% rename from trunk/ceph/ebofs/BlockDevice.h rename to src/ebofs/BlockDevice.h diff --git a/trunk/ceph/ebofs/BufferCache.cc b/src/ebofs/BufferCache.cc similarity index 100% rename from trunk/ceph/ebofs/BufferCache.cc rename to src/ebofs/BufferCache.cc diff --git a/trunk/ceph/ebofs/BufferCache.h b/src/ebofs/BufferCache.h similarity index 100% rename from trunk/ceph/ebofs/BufferCache.h rename to src/ebofs/BufferCache.h diff --git a/trunk/ceph/ebofs/Cnode.h b/src/ebofs/Cnode.h similarity index 100% rename from trunk/ceph/ebofs/Cnode.h rename to src/ebofs/Cnode.h diff --git a/trunk/ceph/ebofs/Ebofs.cc b/src/ebofs/Ebofs.cc similarity index 100% rename from trunk/ceph/ebofs/Ebofs.cc rename to src/ebofs/Ebofs.cc diff --git a/trunk/ceph/ebofs/Ebofs.h b/src/ebofs/Ebofs.h similarity index 100% rename from trunk/ceph/ebofs/Ebofs.h rename to src/ebofs/Ebofs.h diff --git a/branches/sage/crush/ebofs/FileJournal.cc b/src/ebofs/FileJournal.cc similarity index 100% rename from branches/sage/crush/ebofs/FileJournal.cc rename to src/ebofs/FileJournal.cc diff --git a/trunk/ceph/ebofs/FileJournal.h b/src/ebofs/FileJournal.h similarity index 100% rename from trunk/ceph/ebofs/FileJournal.h rename to src/ebofs/FileJournal.h diff --git a/branches/sage/crush/ebofs/Journal.h b/src/ebofs/Journal.h similarity index 100% rename from branches/sage/crush/ebofs/Journal.h rename to src/ebofs/Journal.h diff --git a/trunk/ceph/ebofs/Onode.h b/src/ebofs/Onode.h similarity index 100% rename from trunk/ceph/ebofs/Onode.h rename to src/ebofs/Onode.h diff --git a/trunk/ceph/ebofs/Table.h b/src/ebofs/Table.h similarity index 100% rename from trunk/ceph/ebofs/Table.h rename to src/ebofs/Table.h diff --git a/trunk/ceph/ebofs/csum.h b/src/ebofs/csum.h similarity index 100% rename from trunk/ceph/ebofs/csum.h rename to src/ebofs/csum.h diff --git a/branches/sage/crush/ebofs/mkfs.ebofs.cc b/src/ebofs/mkfs.ebofs.cc similarity 
index 100% rename from branches/sage/crush/ebofs/mkfs.ebofs.cc rename to src/ebofs/mkfs.ebofs.cc diff --git a/trunk/ceph/ebofs/nodes.h b/src/ebofs/nodes.h similarity index 100% rename from trunk/ceph/ebofs/nodes.h rename to src/ebofs/nodes.h diff --git a/trunk/ceph/ebofs/test.ebofs.cc b/src/ebofs/test.ebofs.cc similarity index 100% rename from trunk/ceph/ebofs/test.ebofs.cc rename to src/ebofs/test.ebofs.cc diff --git a/trunk/ceph/ebofs/types.h b/src/ebofs/types.h similarity index 100% rename from trunk/ceph/ebofs/types.h rename to src/ebofs/types.h diff --git a/branches/sage/crush/extractosdmaps.cc b/src/extractosdmaps.cc similarity index 100% rename from branches/sage/crush/extractosdmaps.cc rename to src/extractosdmaps.cc diff --git a/trunk/ceph/fakefuse.cc b/src/fakefuse.cc similarity index 100% rename from trunk/ceph/fakefuse.cc rename to src/fakefuse.cc diff --git a/trunk/ceph/fakesyn.cc b/src/fakesyn.cc similarity index 100% rename from trunk/ceph/fakesyn.cc rename to src/fakesyn.cc diff --git a/branches/sage/crush/include/Context.h b/src/include/Context.h similarity index 100% rename from branches/sage/crush/include/Context.h rename to src/include/Context.h diff --git a/branches/sage/crush/include/Distribution.h b/src/include/Distribution.h similarity index 100% rename from branches/sage/crush/include/Distribution.h rename to src/include/Distribution.h diff --git a/trunk/ceph/include/atomic.h b/src/include/atomic.h similarity index 100% rename from trunk/ceph/include/atomic.h rename to src/include/atomic.h diff --git a/branches/sage/crush/include/bitmapper.h b/src/include/bitmapper.h similarity index 100% rename from branches/sage/crush/include/bitmapper.h rename to src/include/bitmapper.h diff --git a/branches/sage/crush/include/blobhash.h b/src/include/blobhash.h similarity index 100% rename from branches/sage/crush/include/blobhash.h rename to src/include/blobhash.h diff --git a/trunk/ceph/include/buffer.h b/src/include/buffer.h similarity index 100% 
rename from trunk/ceph/include/buffer.h rename to src/include/buffer.h diff --git a/trunk/ceph/include/ceph_fs.h b/src/include/ceph_fs.h similarity index 100% rename from trunk/ceph/include/ceph_fs.h rename to src/include/ceph_fs.h diff --git a/branches/sage/crush/include/encodable.h b/src/include/encodable.h similarity index 100% rename from branches/sage/crush/include/encodable.h rename to src/include/encodable.h diff --git a/branches/sage/crush/include/error.h b/src/include/error.h similarity index 100% rename from branches/sage/crush/include/error.h rename to src/include/error.h diff --git a/branches/sage/mds/include/filepath.h b/src/include/filepath.h similarity index 100% rename from branches/sage/mds/include/filepath.h rename to src/include/filepath.h diff --git a/trunk/ceph/include/frag.h b/src/include/frag.h similarity index 100% rename from trunk/ceph/include/frag.h rename to src/include/frag.h diff --git a/trunk/ceph/include/hash.h b/src/include/hash.h similarity index 100% rename from trunk/ceph/include/hash.h rename to src/include/hash.h diff --git a/trunk/ceph/include/interval_set.h b/src/include/interval_set.h similarity index 100% rename from trunk/ceph/include/interval_set.h rename to src/include/interval_set.h diff --git a/branches/sage/crush/include/lru.h b/src/include/lru.h similarity index 100% rename from branches/sage/crush/include/lru.h rename to src/include/lru.h diff --git a/trunk/ceph/include/object.h b/src/include/object.h similarity index 100% rename from trunk/ceph/include/object.h rename to src/include/object.h diff --git a/trunk/ceph/include/pobject.h b/src/include/pobject.h similarity index 100% rename from trunk/ceph/include/pobject.h rename to src/include/pobject.h diff --git a/branches/sage/crush/include/rangeset.h b/src/include/rangeset.h similarity index 100% rename from branches/sage/crush/include/rangeset.h rename to src/include/rangeset.h diff --git a/branches/sage/crush/include/statlite.h b/src/include/statlite.h similarity 
index 100% rename from branches/sage/crush/include/statlite.h rename to src/include/statlite.h diff --git a/branches/sage/crush/include/triple.h b/src/include/triple.h similarity index 100% rename from branches/sage/crush/include/triple.h rename to src/include/triple.h diff --git a/trunk/ceph/include/types.h b/src/include/types.h similarity index 100% rename from trunk/ceph/include/types.h rename to src/include/types.h diff --git a/branches/sage/crush/include/uofs.h b/src/include/uofs.h similarity index 100% rename from branches/sage/crush/include/uofs.h rename to src/include/uofs.h diff --git a/trunk/ceph/include/utime.h b/src/include/utime.h similarity index 100% rename from trunk/ceph/include/utime.h rename to src/include/utime.h diff --git a/branches/sage/crush/include/xlist.h b/src/include/xlist.h similarity index 100% rename from branches/sage/crush/include/xlist.h rename to src/include/xlist.h diff --git a/branches/marnberg/quota/jobs/alc.tp b/src/jobs/alc.tp similarity index 100% rename from branches/marnberg/quota/jobs/alc.tp rename to src/jobs/alc.tp diff --git a/branches/marnberg/quota/jobs/alcdat/makedirs b/src/jobs/alcdat/makedirs similarity index 100% rename from branches/marnberg/quota/jobs/alcdat/makedirs rename to src/jobs/alcdat/makedirs diff --git a/branches/marnberg/quota/jobs/alcdat/makedirs.big b/src/jobs/alcdat/makedirs.big similarity index 100% rename from branches/marnberg/quota/jobs/alcdat/makedirs.big rename to src/jobs/alcdat/makedirs.big diff --git a/branches/marnberg/quota/jobs/alcdat/makedirs.tput b/src/jobs/alcdat/makedirs.tput similarity index 100% rename from branches/marnberg/quota/jobs/alcdat/makedirs.tput rename to src/jobs/alcdat/makedirs.tput diff --git a/branches/marnberg/quota/jobs/alcdat/makefiles.shared b/src/jobs/alcdat/makefiles.shared similarity index 100% rename from branches/marnberg/quota/jobs/alcdat/makefiles.shared rename to src/jobs/alcdat/makefiles.shared diff --git 
a/branches/marnberg/quota/jobs/alcdat/openshared b/src/jobs/alcdat/openshared similarity index 100% rename from branches/marnberg/quota/jobs/alcdat/openshared rename to src/jobs/alcdat/openshared diff --git a/branches/marnberg/quota/jobs/alcdat/ossh.include b/src/jobs/alcdat/ossh.include similarity index 100% rename from branches/marnberg/quota/jobs/alcdat/ossh.include rename to src/jobs/alcdat/ossh.include diff --git a/branches/marnberg/quota/jobs/alcdat/ossh.include.big b/src/jobs/alcdat/ossh.include.big similarity index 100% rename from branches/marnberg/quota/jobs/alcdat/ossh.include.big rename to src/jobs/alcdat/ossh.include.big diff --git a/branches/marnberg/quota/jobs/alcdat/ossh.lib b/src/jobs/alcdat/ossh.lib similarity index 100% rename from branches/marnberg/quota/jobs/alcdat/ossh.lib rename to src/jobs/alcdat/ossh.lib diff --git a/branches/marnberg/quota/jobs/alcdat/ossh.lib.big b/src/jobs/alcdat/ossh.lib.big similarity index 100% rename from branches/marnberg/quota/jobs/alcdat/ossh.lib.big rename to src/jobs/alcdat/ossh.lib.big diff --git a/branches/marnberg/quota/jobs/alcdat/striping b/src/jobs/alcdat/striping similarity index 100% rename from branches/marnberg/quota/jobs/alcdat/striping rename to src/jobs/alcdat/striping diff --git a/branches/marnberg/quota/jobs/example b/src/jobs/example similarity index 100% rename from branches/marnberg/quota/jobs/example rename to src/jobs/example diff --git a/branches/marnberg/quota/jobs/mds/log_striping b/src/jobs/mds/log_striping similarity index 100% rename from branches/marnberg/quota/jobs/mds/log_striping rename to src/jobs/mds/log_striping diff --git a/branches/marnberg/quota/jobs/mds/makedir_lat b/src/jobs/mds/makedir_lat similarity index 100% rename from branches/marnberg/quota/jobs/mds/makedir_lat rename to src/jobs/mds/makedir_lat diff --git a/branches/marnberg/quota/jobs/mds/makedirs b/src/jobs/mds/makedirs similarity index 100% rename from branches/marnberg/quota/jobs/mds/makedirs rename to 
src/jobs/mds/makedirs diff --git a/branches/marnberg/quota/jobs/mds/opensshlib b/src/jobs/mds/opensshlib similarity index 100% rename from branches/marnberg/quota/jobs/mds/opensshlib rename to src/jobs/mds/opensshlib diff --git a/branches/marnberg/quota/jobs/meta1 b/src/jobs/meta1 similarity index 100% rename from branches/marnberg/quota/jobs/meta1 rename to src/jobs/meta1 diff --git a/branches/marnberg/quota/jobs/meta1.proc.sh b/src/jobs/meta1.proc.sh similarity index 100% rename from branches/marnberg/quota/jobs/meta1.proc.sh rename to src/jobs/meta1.proc.sh diff --git a/branches/marnberg/quota/jobs/osd/ebofs b/src/jobs/osd/ebofs similarity index 100% rename from branches/marnberg/quota/jobs/osd/ebofs rename to src/jobs/osd/ebofs diff --git a/branches/marnberg/quota/jobs/osd/mds_log b/src/jobs/osd/mds_log similarity index 100% rename from branches/marnberg/quota/jobs/osd/mds_log rename to src/jobs/osd/mds_log diff --git a/branches/marnberg/quota/jobs/osd/osd_threads b/src/jobs/osd/osd_threads similarity index 100% rename from branches/marnberg/quota/jobs/osd/osd_threads rename to src/jobs/osd/osd_threads diff --git a/branches/marnberg/quota/jobs/osd/striping b/src/jobs/osd/striping similarity index 100% rename from branches/marnberg/quota/jobs/osd/striping rename to src/jobs/osd/striping diff --git a/branches/marnberg/quota/jobs/osd/wr_lat2 b/src/jobs/osd/wr_lat2 similarity index 100% rename from branches/marnberg/quota/jobs/osd/wr_lat2 rename to src/jobs/osd/wr_lat2 diff --git a/branches/marnberg/quota/jobs/osd/write_sizes b/src/jobs/osd/write_sizes similarity index 100% rename from branches/marnberg/quota/jobs/osd/write_sizes rename to src/jobs/osd/write_sizes diff --git a/branches/marnberg/quota/jobs/rados/map_dist b/src/jobs/rados/map_dist similarity index 100% rename from branches/marnberg/quota/jobs/rados/map_dist rename to src/jobs/rados/map_dist diff --git a/branches/marnberg/quota/jobs/rados/rep_lat b/src/jobs/rados/rep_lat similarity index 100% rename 
from branches/marnberg/quota/jobs/rados/rep_lat rename to src/jobs/rados/rep_lat diff --git a/trunk/ceph/jobs/rados/wr_sizes b/src/jobs/rados/wr_sizes similarity index 100% rename from trunk/ceph/jobs/rados/wr_sizes rename to src/jobs/rados/wr_sizes diff --git a/branches/sage/crush/jobs/runjobsample b/src/jobs/runjobsample similarity index 100% rename from branches/sage/crush/jobs/runjobsample rename to src/jobs/runjobsample diff --git a/trunk/ceph/kernel/Makefile b/src/kernel/Makefile similarity index 100% rename from trunk/ceph/kernel/Makefile rename to src/kernel/Makefile diff --git a/trunk/ceph/kernel/README b/src/kernel/README similarity index 100% rename from trunk/ceph/kernel/README rename to src/kernel/README diff --git a/trunk/ceph/kernel/addr.c b/src/kernel/addr.c similarity index 100% rename from trunk/ceph/kernel/addr.c rename to src/kernel/addr.c diff --git a/trunk/ceph/kernel/client.c b/src/kernel/client.c similarity index 100% rename from trunk/ceph/kernel/client.c rename to src/kernel/client.c diff --git a/trunk/ceph/kernel/client.h b/src/kernel/client.h similarity index 100% rename from trunk/ceph/kernel/client.h rename to src/kernel/client.h diff --git a/trunk/ceph/kernel/crush/crush.c b/src/kernel/crush/crush.c similarity index 100% rename from trunk/ceph/kernel/crush/crush.c rename to src/kernel/crush/crush.c diff --git a/trunk/ceph/kernel/crush/crush.h b/src/kernel/crush/crush.h similarity index 100% rename from trunk/ceph/kernel/crush/crush.h rename to src/kernel/crush/crush.h diff --git a/trunk/ceph/kernel/crush/hash.h b/src/kernel/crush/hash.h similarity index 100% rename from trunk/ceph/kernel/crush/hash.h rename to src/kernel/crush/hash.h diff --git a/trunk/ceph/kernel/crush/mapper.c b/src/kernel/crush/mapper.c similarity index 100% rename from trunk/ceph/kernel/crush/mapper.c rename to src/kernel/crush/mapper.c diff --git a/trunk/ceph/kernel/crush/mapper.h b/src/kernel/crush/mapper.h similarity index 100% rename from 
trunk/ceph/kernel/crush/mapper.h rename to src/kernel/crush/mapper.h diff --git a/trunk/ceph/kernel/dir.c b/src/kernel/dir.c similarity index 100% rename from trunk/ceph/kernel/dir.c rename to src/kernel/dir.c diff --git a/trunk/ceph/kernel/file.c b/src/kernel/file.c similarity index 100% rename from trunk/ceph/kernel/file.c rename to src/kernel/file.c diff --git a/trunk/ceph/kernel/inode.c b/src/kernel/inode.c similarity index 100% rename from trunk/ceph/kernel/inode.c rename to src/kernel/inode.c diff --git a/trunk/ceph/kernel/kconfig.patch b/src/kernel/kconfig.patch similarity index 100% rename from trunk/ceph/kernel/kconfig.patch rename to src/kernel/kconfig.patch diff --git a/trunk/ceph/kernel/ktcp.c b/src/kernel/ktcp.c similarity index 100% rename from trunk/ceph/kernel/ktcp.c rename to src/kernel/ktcp.c diff --git a/trunk/ceph/kernel/ktcp.h b/src/kernel/ktcp.h similarity index 100% rename from trunk/ceph/kernel/ktcp.h rename to src/kernel/ktcp.h diff --git a/trunk/ceph/kernel/mds_client.c b/src/kernel/mds_client.c similarity index 100% rename from trunk/ceph/kernel/mds_client.c rename to src/kernel/mds_client.c diff --git a/trunk/ceph/kernel/mds_client.h b/src/kernel/mds_client.h similarity index 100% rename from trunk/ceph/kernel/mds_client.h rename to src/kernel/mds_client.h diff --git a/trunk/ceph/kernel/mdsmap.c b/src/kernel/mdsmap.c similarity index 100% rename from trunk/ceph/kernel/mdsmap.c rename to src/kernel/mdsmap.c diff --git a/trunk/ceph/kernel/mdsmap.h b/src/kernel/mdsmap.h similarity index 100% rename from trunk/ceph/kernel/mdsmap.h rename to src/kernel/mdsmap.h diff --git a/trunk/ceph/kernel/messenger.c b/src/kernel/messenger.c similarity index 100% rename from trunk/ceph/kernel/messenger.c rename to src/kernel/messenger.c diff --git a/trunk/ceph/kernel/messenger.h b/src/kernel/messenger.h similarity index 100% rename from trunk/ceph/kernel/messenger.h rename to src/kernel/messenger.h diff --git a/trunk/ceph/kernel/mon_client.c 
b/src/kernel/mon_client.c similarity index 100% rename from trunk/ceph/kernel/mon_client.c rename to src/kernel/mon_client.c diff --git a/trunk/ceph/kernel/mon_client.h b/src/kernel/mon_client.h similarity index 100% rename from trunk/ceph/kernel/mon_client.h rename to src/kernel/mon_client.h diff --git a/trunk/ceph/kernel/osd_client.c b/src/kernel/osd_client.c similarity index 100% rename from trunk/ceph/kernel/osd_client.c rename to src/kernel/osd_client.c diff --git a/trunk/ceph/kernel/osd_client.h b/src/kernel/osd_client.h similarity index 100% rename from trunk/ceph/kernel/osd_client.h rename to src/kernel/osd_client.h diff --git a/trunk/ceph/kernel/sample.uml.config b/src/kernel/sample.uml.config similarity index 100% rename from trunk/ceph/kernel/sample.uml.config rename to src/kernel/sample.uml.config diff --git a/trunk/ceph/kernel/super.c b/src/kernel/super.c similarity index 100% rename from trunk/ceph/kernel/super.c rename to src/kernel/super.c diff --git a/trunk/ceph/kernel/super.h b/src/kernel/super.h similarity index 100% rename from trunk/ceph/kernel/super.h rename to src/kernel/super.h diff --git a/trunk/ceph/kernel/test/Makefile b/src/kernel/test/Makefile similarity index 100% rename from trunk/ceph/kernel/test/Makefile rename to src/kernel/test/Makefile diff --git a/trunk/ceph/kernel/test/kernclient.c b/src/kernel/test/kernclient.c similarity index 100% rename from trunk/ceph/kernel/test/kernclient.c rename to src/kernel/test/kernclient.c diff --git a/trunk/ceph/kernel/test/kernserver.c b/src/kernel/test/kernserver.c similarity index 100% rename from trunk/ceph/kernel/test/kernserver.c rename to src/kernel/test/kernserver.c diff --git a/trunk/ceph/kernel/test/ktcp.c b/src/kernel/test/ktcp.c similarity index 100% rename from trunk/ceph/kernel/test/ktcp.c rename to src/kernel/test/ktcp.c diff --git a/trunk/ceph/kernel/test/ktcp.h b/src/kernel/test/ktcp.h similarity index 100% rename from trunk/ceph/kernel/test/ktcp.h rename to src/kernel/test/ktcp.h 
diff --git a/trunk/ceph/kernel/test/messenger.h b/src/kernel/test/messenger.h similarity index 100% rename from trunk/ceph/kernel/test/messenger.h rename to src/kernel/test/messenger.h diff --git a/trunk/ceph/kernel/test/messenger_mini.c b/src/kernel/test/messenger_mini.c similarity index 100% rename from trunk/ceph/kernel/test/messenger_mini.c rename to src/kernel/test/messenger_mini.c diff --git a/trunk/ceph/kernel/test/threadtest.c b/src/kernel/test/threadtest.c similarity index 100% rename from trunk/ceph/kernel/test/threadtest.c rename to src/kernel/test/threadtest.c diff --git a/trunk/ceph/kernel/test/userclient.c b/src/kernel/test/userclient.c similarity index 100% rename from trunk/ceph/kernel/test/userclient.c rename to src/kernel/test/userclient.c diff --git a/trunk/ceph/kernel/test/userserver.c b/src/kernel/test/userserver.c similarity index 100% rename from trunk/ceph/kernel/test/userserver.c rename to src/kernel/test/userserver.c diff --git a/branches/sage/ebofs2/mds/Anchor.h b/src/mds/Anchor.h similarity index 100% rename from branches/sage/ebofs2/mds/Anchor.h rename to src/mds/Anchor.h diff --git a/branches/sage/ebofs2/mds/AnchorClient.cc b/src/mds/AnchorClient.cc similarity index 100% rename from branches/sage/ebofs2/mds/AnchorClient.cc rename to src/mds/AnchorClient.cc diff --git a/branches/sage/crush/mds/AnchorClient.h b/src/mds/AnchorClient.h similarity index 100% rename from branches/sage/crush/mds/AnchorClient.h rename to src/mds/AnchorClient.h diff --git a/branches/sage/ebofs2/mds/AnchorTable.cc b/src/mds/AnchorTable.cc similarity index 100% rename from branches/sage/ebofs2/mds/AnchorTable.cc rename to src/mds/AnchorTable.cc diff --git a/branches/sage/crush/mds/AnchorTable.h b/src/mds/AnchorTable.h similarity index 100% rename from branches/sage/crush/mds/AnchorTable.h rename to src/mds/AnchorTable.h diff --git a/branches/sage/mds/mds/CDentry.cc b/src/mds/CDentry.cc similarity index 100% rename from branches/sage/mds/mds/CDentry.cc rename to 
src/mds/CDentry.cc diff --git a/branches/sage/mds/mds/CDentry.h b/src/mds/CDentry.h similarity index 100% rename from branches/sage/mds/mds/CDentry.h rename to src/mds/CDentry.h diff --git a/branches/sage/mds/mds/CDir.cc b/src/mds/CDir.cc similarity index 100% rename from branches/sage/mds/mds/CDir.cc rename to src/mds/CDir.cc diff --git a/branches/sage/crush/mds/CDir.h b/src/mds/CDir.h similarity index 100% rename from branches/sage/crush/mds/CDir.h rename to src/mds/CDir.h diff --git a/branches/sage/mds/mds/CInode.cc b/src/mds/CInode.cc similarity index 100% rename from branches/sage/mds/mds/CInode.cc rename to src/mds/CInode.cc diff --git a/branches/sage/mds/mds/CInode.h b/src/mds/CInode.h similarity index 100% rename from branches/sage/mds/mds/CInode.h rename to src/mds/CInode.h diff --git a/branches/sage/crush/mds/Capability.h b/src/mds/Capability.h similarity index 100% rename from branches/sage/crush/mds/Capability.h rename to src/mds/Capability.h diff --git a/branches/sage/crush/mds/ClientMap.cc b/src/mds/ClientMap.cc similarity index 100% rename from branches/sage/crush/mds/ClientMap.cc rename to src/mds/ClientMap.cc diff --git a/branches/sage/mds/mds/ClientMap.h b/src/mds/ClientMap.h similarity index 100% rename from branches/sage/mds/mds/ClientMap.h rename to src/mds/ClientMap.h diff --git a/branches/sage/crush/mds/FileLock.h b/src/mds/FileLock.h similarity index 100% rename from branches/sage/crush/mds/FileLock.h rename to src/mds/FileLock.h diff --git a/branches/sage/crush/mds/IdAllocator.cc b/src/mds/IdAllocator.cc similarity index 100% rename from branches/sage/crush/mds/IdAllocator.cc rename to src/mds/IdAllocator.cc diff --git a/branches/sage/crush/mds/IdAllocator.h b/src/mds/IdAllocator.h similarity index 100% rename from branches/sage/crush/mds/IdAllocator.h rename to src/mds/IdAllocator.h diff --git a/branches/sage/crush/mds/LocalLock.h b/src/mds/LocalLock.h similarity index 100% rename from branches/sage/crush/mds/LocalLock.h rename to 
src/mds/LocalLock.h diff --git a/trunk/ceph/mds/Locker.cc b/src/mds/Locker.cc similarity index 100% rename from trunk/ceph/mds/Locker.cc rename to src/mds/Locker.cc diff --git a/branches/sage/crush/mds/Locker.h b/src/mds/Locker.h similarity index 100% rename from branches/sage/crush/mds/Locker.h rename to src/mds/Locker.h diff --git a/branches/sage/mds/mds/LogEvent.cc b/src/mds/LogEvent.cc similarity index 100% rename from branches/sage/mds/mds/LogEvent.cc rename to src/mds/LogEvent.cc diff --git a/branches/sage/mds/mds/LogEvent.h b/src/mds/LogEvent.h similarity index 100% rename from branches/sage/mds/mds/LogEvent.h rename to src/mds/LogEvent.h diff --git a/branches/sage/mds/mds/LogSegment.h b/src/mds/LogSegment.h similarity index 100% rename from branches/sage/mds/mds/LogSegment.h rename to src/mds/LogSegment.h diff --git a/trunk/ceph/mds/MDBalancer.cc b/src/mds/MDBalancer.cc similarity index 100% rename from trunk/ceph/mds/MDBalancer.cc rename to src/mds/MDBalancer.cc diff --git a/branches/sage/crush/mds/MDBalancer.h b/src/mds/MDBalancer.h similarity index 100% rename from branches/sage/crush/mds/MDBalancer.h rename to src/mds/MDBalancer.h diff --git a/trunk/ceph/mds/MDCache.cc b/src/mds/MDCache.cc similarity index 100% rename from trunk/ceph/mds/MDCache.cc rename to src/mds/MDCache.cc diff --git a/branches/sage/mds/mds/MDCache.h b/src/mds/MDCache.h similarity index 100% rename from branches/sage/mds/mds/MDCache.h rename to src/mds/MDCache.h diff --git a/trunk/ceph/mds/MDLog.cc b/src/mds/MDLog.cc similarity index 100% rename from trunk/ceph/mds/MDLog.cc rename to src/mds/MDLog.cc diff --git a/branches/sage/mds/mds/MDLog.h b/src/mds/MDLog.h similarity index 100% rename from branches/sage/mds/mds/MDLog.h rename to src/mds/MDLog.h diff --git a/trunk/ceph/mds/MDS.cc b/src/mds/MDS.cc similarity index 100% rename from trunk/ceph/mds/MDS.cc rename to src/mds/MDS.cc diff --git a/trunk/ceph/mds/MDS.h b/src/mds/MDS.h similarity index 100% rename from trunk/ceph/mds/MDS.h 
rename to src/mds/MDS.h diff --git a/trunk/ceph/mds/MDSMap.h b/src/mds/MDSMap.h similarity index 100% rename from trunk/ceph/mds/MDSMap.h rename to src/mds/MDSMap.h diff --git a/trunk/ceph/mds/Migrator.cc b/src/mds/Migrator.cc similarity index 100% rename from trunk/ceph/mds/Migrator.cc rename to src/mds/Migrator.cc diff --git a/branches/sage/mds/mds/Migrator.h b/src/mds/Migrator.h similarity index 100% rename from branches/sage/mds/mds/Migrator.h rename to src/mds/Migrator.h diff --git a/branches/sage/crush/mds/ScatterLock.h b/src/mds/ScatterLock.h similarity index 100% rename from branches/sage/crush/mds/ScatterLock.h rename to src/mds/ScatterLock.h diff --git a/trunk/ceph/mds/Server.cc b/src/mds/Server.cc similarity index 100% rename from trunk/ceph/mds/Server.cc rename to src/mds/Server.cc diff --git a/branches/sage/mds/mds/Server.h b/src/mds/Server.h similarity index 100% rename from branches/sage/mds/mds/Server.h rename to src/mds/Server.h diff --git a/branches/sage/crush/mds/SimpleLock.h b/src/mds/SimpleLock.h similarity index 100% rename from branches/sage/crush/mds/SimpleLock.h rename to src/mds/SimpleLock.h diff --git a/branches/sage/crush/mds/events/EAnchor.h b/src/mds/events/EAnchor.h similarity index 100% rename from branches/sage/crush/mds/events/EAnchor.h rename to src/mds/events/EAnchor.h diff --git a/branches/sage/crush/mds/events/EAnchorClient.h b/src/mds/events/EAnchorClient.h similarity index 100% rename from branches/sage/crush/mds/events/EAnchorClient.h rename to src/mds/events/EAnchorClient.h diff --git a/branches/sage/crush/mds/events/EExport.h b/src/mds/events/EExport.h similarity index 100% rename from branches/sage/crush/mds/events/EExport.h rename to src/mds/events/EExport.h diff --git a/branches/sage/crush/mds/events/EFragment.h b/src/mds/events/EFragment.h similarity index 100% rename from branches/sage/crush/mds/events/EFragment.h rename to src/mds/events/EFragment.h diff --git a/branches/sage/crush/mds/events/EImportFinish.h 
b/src/mds/events/EImportFinish.h similarity index 100% rename from branches/sage/crush/mds/events/EImportFinish.h rename to src/mds/events/EImportFinish.h diff --git a/branches/sage/mds/mds/events/EImportStart.h b/src/mds/events/EImportStart.h similarity index 100% rename from branches/sage/mds/mds/events/EImportStart.h rename to src/mds/events/EImportStart.h diff --git a/branches/sage/crush/mds/events/EMetaBlob.h b/src/mds/events/EMetaBlob.h similarity index 100% rename from branches/sage/crush/mds/events/EMetaBlob.h rename to src/mds/events/EMetaBlob.h diff --git a/branches/sage/crush/mds/events/EOpen.h b/src/mds/events/EOpen.h similarity index 100% rename from branches/sage/crush/mds/events/EOpen.h rename to src/mds/events/EOpen.h diff --git a/branches/sage/crush/mds/events/EPurgeFinish.h b/src/mds/events/EPurgeFinish.h similarity index 100% rename from branches/sage/crush/mds/events/EPurgeFinish.h rename to src/mds/events/EPurgeFinish.h diff --git a/branches/sage/mds/mds/events/ESession.h b/src/mds/events/ESession.h similarity index 100% rename from branches/sage/mds/mds/events/ESession.h rename to src/mds/events/ESession.h diff --git a/branches/sage/mds/mds/events/ESessions.h b/src/mds/events/ESessions.h similarity index 100% rename from branches/sage/mds/mds/events/ESessions.h rename to src/mds/events/ESessions.h diff --git a/branches/sage/crush/mds/events/ESlaveUpdate.h b/src/mds/events/ESlaveUpdate.h similarity index 100% rename from branches/sage/crush/mds/events/ESlaveUpdate.h rename to src/mds/events/ESlaveUpdate.h diff --git a/branches/sage/crush/mds/events/EString.h b/src/mds/events/EString.h similarity index 100% rename from branches/sage/crush/mds/events/EString.h rename to src/mds/events/EString.h diff --git a/branches/sage/crush/mds/events/ESubtreeMap.h b/src/mds/events/ESubtreeMap.h similarity index 100% rename from branches/sage/crush/mds/events/ESubtreeMap.h rename to src/mds/events/ESubtreeMap.h diff --git 
a/branches/sage/mds/mds/events/EUpdate.h b/src/mds/events/EUpdate.h similarity index 100% rename from branches/sage/mds/mds/events/EUpdate.h rename to src/mds/events/EUpdate.h diff --git a/branches/sage/mds/mds/journal.cc b/src/mds/journal.cc similarity index 100% rename from branches/sage/mds/mds/journal.cc rename to src/mds/journal.cc diff --git a/trunk/ceph/mds/mdstypes.h b/src/mds/mdstypes.h similarity index 100% rename from trunk/ceph/mds/mdstypes.h rename to src/mds/mdstypes.h diff --git a/branches/sage/crush/messages/MAnchor.h b/src/messages/MAnchor.h similarity index 100% rename from branches/sage/crush/messages/MAnchor.h rename to src/messages/MAnchor.h diff --git a/branches/sage/crush/messages/MCacheExpire.h b/src/messages/MCacheExpire.h similarity index 100% rename from branches/sage/crush/messages/MCacheExpire.h rename to src/messages/MCacheExpire.h diff --git a/trunk/ceph/messages/MClientFileCaps.h b/src/messages/MClientFileCaps.h similarity index 100% rename from trunk/ceph/messages/MClientFileCaps.h rename to src/messages/MClientFileCaps.h diff --git a/trunk/ceph/messages/MClientMount.h b/src/messages/MClientMount.h similarity index 100% rename from trunk/ceph/messages/MClientMount.h rename to src/messages/MClientMount.h diff --git a/trunk/ceph/messages/MClientReconnect.h b/src/messages/MClientReconnect.h similarity index 100% rename from trunk/ceph/messages/MClientReconnect.h rename to src/messages/MClientReconnect.h diff --git a/trunk/ceph/messages/MClientReply.h b/src/messages/MClientReply.h similarity index 100% rename from trunk/ceph/messages/MClientReply.h rename to src/messages/MClientReply.h diff --git a/trunk/ceph/messages/MClientRequest.h b/src/messages/MClientRequest.h similarity index 100% rename from trunk/ceph/messages/MClientRequest.h rename to src/messages/MClientRequest.h diff --git a/trunk/ceph/messages/MClientRequestForward.h b/src/messages/MClientRequestForward.h similarity index 100% rename from 
trunk/ceph/messages/MClientRequestForward.h rename to src/messages/MClientRequestForward.h diff --git a/trunk/ceph/messages/MClientSession.h b/src/messages/MClientSession.h similarity index 100% rename from trunk/ceph/messages/MClientSession.h rename to src/messages/MClientSession.h diff --git a/trunk/ceph/messages/MClientUnmount.h b/src/messages/MClientUnmount.h similarity index 100% rename from trunk/ceph/messages/MClientUnmount.h rename to src/messages/MClientUnmount.h diff --git a/branches/sage/crush/messages/MDentryUnlink.h b/src/messages/MDentryUnlink.h similarity index 100% rename from branches/sage/crush/messages/MDentryUnlink.h rename to src/messages/MDentryUnlink.h diff --git a/branches/sage/mds/messages/MDirUpdate.h b/src/messages/MDirUpdate.h similarity index 100% rename from branches/sage/mds/messages/MDirUpdate.h rename to src/messages/MDirUpdate.h diff --git a/branches/sage/crush/messages/MDiscover.h b/src/messages/MDiscover.h similarity index 100% rename from branches/sage/crush/messages/MDiscover.h rename to src/messages/MDiscover.h diff --git a/branches/sage/crush/messages/MDiscoverReply.h b/src/messages/MDiscoverReply.h similarity index 100% rename from branches/sage/crush/messages/MDiscoverReply.h rename to src/messages/MDiscoverReply.h diff --git a/branches/sage/mds/messages/MExportCaps.h b/src/messages/MExportCaps.h similarity index 100% rename from branches/sage/mds/messages/MExportCaps.h rename to src/messages/MExportCaps.h diff --git a/branches/sage/mds/messages/MExportCapsAck.h b/src/messages/MExportCapsAck.h similarity index 100% rename from branches/sage/mds/messages/MExportCapsAck.h rename to src/messages/MExportCapsAck.h diff --git a/branches/sage/crush/messages/MExportDir.h b/src/messages/MExportDir.h similarity index 100% rename from branches/sage/crush/messages/MExportDir.h rename to src/messages/MExportDir.h diff --git a/branches/sage/crush/messages/MExportDirAck.h b/src/messages/MExportDirAck.h similarity index 100% rename from 
branches/sage/crush/messages/MExportDirAck.h rename to src/messages/MExportDirAck.h diff --git a/branches/sage/crush/messages/MExportDirCancel.h b/src/messages/MExportDirCancel.h similarity index 100% rename from branches/sage/crush/messages/MExportDirCancel.h rename to src/messages/MExportDirCancel.h diff --git a/branches/sage/mds/messages/MExportDirDiscover.h b/src/messages/MExportDirDiscover.h similarity index 100% rename from branches/sage/mds/messages/MExportDirDiscover.h rename to src/messages/MExportDirDiscover.h diff --git a/branches/sage/crush/messages/MExportDirDiscoverAck.h b/src/messages/MExportDirDiscoverAck.h similarity index 100% rename from branches/sage/crush/messages/MExportDirDiscoverAck.h rename to src/messages/MExportDirDiscoverAck.h diff --git a/branches/sage/crush/messages/MExportDirFinish.h b/src/messages/MExportDirFinish.h similarity index 100% rename from branches/sage/crush/messages/MExportDirFinish.h rename to src/messages/MExportDirFinish.h diff --git a/branches/sage/crush/messages/MExportDirNotify.h b/src/messages/MExportDirNotify.h similarity index 100% rename from branches/sage/crush/messages/MExportDirNotify.h rename to src/messages/MExportDirNotify.h diff --git a/branches/sage/crush/messages/MExportDirNotifyAck.h b/src/messages/MExportDirNotifyAck.h similarity index 100% rename from branches/sage/crush/messages/MExportDirNotifyAck.h rename to src/messages/MExportDirNotifyAck.h diff --git a/branches/sage/crush/messages/MExportDirPrep.h b/src/messages/MExportDirPrep.h similarity index 100% rename from branches/sage/crush/messages/MExportDirPrep.h rename to src/messages/MExportDirPrep.h diff --git a/branches/sage/crush/messages/MExportDirPrepAck.h b/src/messages/MExportDirPrepAck.h similarity index 100% rename from branches/sage/crush/messages/MExportDirPrepAck.h rename to src/messages/MExportDirPrepAck.h diff --git a/branches/sage/crush/messages/MExportDirWarning.h b/src/messages/MExportDirWarning.h similarity index 100% rename from 
branches/sage/crush/messages/MExportDirWarning.h rename to src/messages/MExportDirWarning.h diff --git a/branches/sage/crush/messages/MExportDirWarningAck.h b/src/messages/MExportDirWarningAck.h similarity index 100% rename from branches/sage/crush/messages/MExportDirWarningAck.h rename to src/messages/MExportDirWarningAck.h diff --git a/branches/sage/crush/messages/MGenericMessage.h b/src/messages/MGenericMessage.h similarity index 100% rename from branches/sage/crush/messages/MGenericMessage.h rename to src/messages/MGenericMessage.h diff --git a/branches/sage/crush/messages/MHeartbeat.h b/src/messages/MHeartbeat.h similarity index 100% rename from branches/sage/crush/messages/MHeartbeat.h rename to src/messages/MHeartbeat.h diff --git a/branches/sage/crush/messages/MInodeFileCaps.h b/src/messages/MInodeFileCaps.h similarity index 100% rename from branches/sage/crush/messages/MInodeFileCaps.h rename to src/messages/MInodeFileCaps.h diff --git a/trunk/ceph/messages/MLock.h b/src/messages/MLock.h similarity index 100% rename from trunk/ceph/messages/MLock.h rename to src/messages/MLock.h diff --git a/branches/sage/ebofs2/messages/MMDSBeacon.h b/src/messages/MMDSBeacon.h similarity index 100% rename from branches/sage/ebofs2/messages/MMDSBeacon.h rename to src/messages/MMDSBeacon.h diff --git a/branches/sage/crush/messages/MMDSBoot.h b/src/messages/MMDSBoot.h similarity index 100% rename from branches/sage/crush/messages/MMDSBoot.h rename to src/messages/MMDSBoot.h diff --git a/branches/sage/crush/messages/MMDSCacheRejoin.h b/src/messages/MMDSCacheRejoin.h similarity index 100% rename from branches/sage/crush/messages/MMDSCacheRejoin.h rename to src/messages/MMDSCacheRejoin.h diff --git a/branches/sage/crush/messages/MMDSFragmentNotify.h b/src/messages/MMDSFragmentNotify.h similarity index 100% rename from branches/sage/crush/messages/MMDSFragmentNotify.h rename to src/messages/MMDSFragmentNotify.h diff --git a/trunk/ceph/messages/MMDSGetMap.h 
b/src/messages/MMDSGetMap.h similarity index 100% rename from trunk/ceph/messages/MMDSGetMap.h rename to src/messages/MMDSGetMap.h diff --git a/trunk/ceph/messages/MMDSMap.h b/src/messages/MMDSMap.h similarity index 100% rename from trunk/ceph/messages/MMDSMap.h rename to src/messages/MMDSMap.h diff --git a/branches/sage/crush/messages/MMDSResolve.h b/src/messages/MMDSResolve.h similarity index 100% rename from branches/sage/crush/messages/MMDSResolve.h rename to src/messages/MMDSResolve.h diff --git a/branches/sage/crush/messages/MMDSResolveAck.h b/src/messages/MMDSResolveAck.h similarity index 100% rename from branches/sage/crush/messages/MMDSResolveAck.h rename to src/messages/MMDSResolveAck.h diff --git a/branches/sage/mds/messages/MMDSSlaveRequest.h b/src/messages/MMDSSlaveRequest.h similarity index 100% rename from branches/sage/mds/messages/MMDSSlaveRequest.h rename to src/messages/MMDSSlaveRequest.h diff --git a/branches/sage/crush/messages/MMonCommand.h b/src/messages/MMonCommand.h similarity index 100% rename from branches/sage/crush/messages/MMonCommand.h rename to src/messages/MMonCommand.h diff --git a/branches/sage/crush/messages/MMonCommandAck.h b/src/messages/MMonCommandAck.h similarity index 100% rename from branches/sage/crush/messages/MMonCommandAck.h rename to src/messages/MMonCommandAck.h diff --git a/branches/sage/crush/messages/MMonElection.h b/src/messages/MMonElection.h similarity index 100% rename from branches/sage/crush/messages/MMonElection.h rename to src/messages/MMonElection.h diff --git a/branches/sage/crush/messages/MMonElectionCollect.h b/src/messages/MMonElectionCollect.h similarity index 100% rename from branches/sage/crush/messages/MMonElectionCollect.h rename to src/messages/MMonElectionCollect.h diff --git a/branches/sage/crush/messages/MMonElectionRefresh.h b/src/messages/MMonElectionRefresh.h similarity index 100% rename from branches/sage/crush/messages/MMonElectionRefresh.h rename to src/messages/MMonElectionRefresh.h 
diff --git a/branches/sage/crush/messages/MMonElectionStatus.h b/src/messages/MMonElectionStatus.h similarity index 100% rename from branches/sage/crush/messages/MMonElectionStatus.h rename to src/messages/MMonElectionStatus.h diff --git a/trunk/ceph/messages/MMonMap.h b/src/messages/MMonMap.h similarity index 100% rename from trunk/ceph/messages/MMonMap.h rename to src/messages/MMonMap.h diff --git a/branches/sage/crush/messages/MMonOSDMapInfo.h b/src/messages/MMonOSDMapInfo.h similarity index 100% rename from branches/sage/crush/messages/MMonOSDMapInfo.h rename to src/messages/MMonOSDMapInfo.h diff --git a/branches/sage/crush/messages/MMonOSDMapLease.h b/src/messages/MMonOSDMapLease.h similarity index 100% rename from branches/sage/crush/messages/MMonOSDMapLease.h rename to src/messages/MMonOSDMapLease.h diff --git a/branches/sage/crush/messages/MMonOSDMapLeaseAck.h b/src/messages/MMonOSDMapLeaseAck.h similarity index 100% rename from branches/sage/crush/messages/MMonOSDMapLeaseAck.h rename to src/messages/MMonOSDMapLeaseAck.h diff --git a/branches/sage/crush/messages/MMonOSDMapUpdateAck.h b/src/messages/MMonOSDMapUpdateAck.h similarity index 100% rename from branches/sage/crush/messages/MMonOSDMapUpdateAck.h rename to src/messages/MMonOSDMapUpdateAck.h diff --git a/branches/sage/crush/messages/MMonOSDMapUpdateCommit.h b/src/messages/MMonOSDMapUpdateCommit.h similarity index 100% rename from branches/sage/crush/messages/MMonOSDMapUpdateCommit.h rename to src/messages/MMonOSDMapUpdateCommit.h diff --git a/branches/sage/crush/messages/MMonOSDMapUpdatePrepare.h b/src/messages/MMonOSDMapUpdatePrepare.h similarity index 100% rename from branches/sage/crush/messages/MMonOSDMapUpdatePrepare.h rename to src/messages/MMonOSDMapUpdatePrepare.h diff --git a/branches/sage/crush/messages/MMonPaxos.h b/src/messages/MMonPaxos.h similarity index 100% rename from branches/sage/crush/messages/MMonPaxos.h rename to src/messages/MMonPaxos.h diff --git 
a/branches/sage/crush/messages/MOSDBoot.h b/src/messages/MOSDBoot.h similarity index 100% rename from branches/sage/crush/messages/MOSDBoot.h rename to src/messages/MOSDBoot.h diff --git a/branches/sage/crush/messages/MOSDFailure.h b/src/messages/MOSDFailure.h similarity index 100% rename from branches/sage/crush/messages/MOSDFailure.h rename to src/messages/MOSDFailure.h diff --git a/trunk/ceph/messages/MOSDGetMap.h b/src/messages/MOSDGetMap.h similarity index 100% rename from trunk/ceph/messages/MOSDGetMap.h rename to src/messages/MOSDGetMap.h diff --git a/branches/sage/crush/messages/MOSDIn.h b/src/messages/MOSDIn.h similarity index 100% rename from branches/sage/crush/messages/MOSDIn.h rename to src/messages/MOSDIn.h diff --git a/trunk/ceph/messages/MOSDMap.h b/src/messages/MOSDMap.h similarity index 100% rename from trunk/ceph/messages/MOSDMap.h rename to src/messages/MOSDMap.h diff --git a/trunk/ceph/messages/MOSDOp.h b/src/messages/MOSDOp.h similarity index 100% rename from trunk/ceph/messages/MOSDOp.h rename to src/messages/MOSDOp.h diff --git a/trunk/ceph/messages/MOSDOpReply.h b/src/messages/MOSDOpReply.h similarity index 100% rename from trunk/ceph/messages/MOSDOpReply.h rename to src/messages/MOSDOpReply.h diff --git a/branches/sage/crush/messages/MOSDOut.h b/src/messages/MOSDOut.h similarity index 100% rename from branches/sage/crush/messages/MOSDOut.h rename to src/messages/MOSDOut.h diff --git a/branches/sage/crush/messages/MOSDPGActivateSet.h b/src/messages/MOSDPGActivateSet.h similarity index 100% rename from branches/sage/crush/messages/MOSDPGActivateSet.h rename to src/messages/MOSDPGActivateSet.h diff --git a/branches/sage/crush/messages/MOSDPGLog.h b/src/messages/MOSDPGLog.h similarity index 100% rename from branches/sage/crush/messages/MOSDPGLog.h rename to src/messages/MOSDPGLog.h diff --git a/branches/sage/crush/messages/MOSDPGNotify.h b/src/messages/MOSDPGNotify.h similarity index 100% rename from branches/sage/crush/messages/MOSDPGNotify.h 
rename to src/messages/MOSDPGNotify.h diff --git a/branches/sage/crush/messages/MOSDPGPeer.h b/src/messages/MOSDPGPeer.h similarity index 100% rename from branches/sage/crush/messages/MOSDPGPeer.h rename to src/messages/MOSDPGPeer.h diff --git a/branches/sage/crush/messages/MOSDPGPeerAck.h b/src/messages/MOSDPGPeerAck.h similarity index 100% rename from branches/sage/crush/messages/MOSDPGPeerAck.h rename to src/messages/MOSDPGPeerAck.h diff --git a/branches/sage/crush/messages/MOSDPGPeerRequest.h b/src/messages/MOSDPGPeerRequest.h similarity index 100% rename from branches/sage/crush/messages/MOSDPGPeerRequest.h rename to src/messages/MOSDPGPeerRequest.h diff --git a/trunk/ceph/messages/MOSDPGQuery.h b/src/messages/MOSDPGQuery.h similarity index 100% rename from trunk/ceph/messages/MOSDPGQuery.h rename to src/messages/MOSDPGQuery.h diff --git a/branches/sage/crush/messages/MOSDPGRemove.h b/src/messages/MOSDPGRemove.h similarity index 100% rename from branches/sage/crush/messages/MOSDPGRemove.h rename to src/messages/MOSDPGRemove.h diff --git a/branches/sage/crush/messages/MOSDPGSummary.h b/src/messages/MOSDPGSummary.h similarity index 100% rename from branches/sage/crush/messages/MOSDPGSummary.h rename to src/messages/MOSDPGSummary.h diff --git a/branches/sage/crush/messages/MOSDPGUpdate.h b/src/messages/MOSDPGUpdate.h similarity index 100% rename from branches/sage/crush/messages/MOSDPGUpdate.h rename to src/messages/MOSDPGUpdate.h diff --git a/branches/sage/crush/messages/MOSDPing.h b/src/messages/MOSDPing.h similarity index 100% rename from branches/sage/crush/messages/MOSDPing.h rename to src/messages/MOSDPing.h diff --git a/branches/sage/crush/messages/MPGStats.h b/src/messages/MPGStats.h similarity index 100% rename from branches/sage/crush/messages/MPGStats.h rename to src/messages/MPGStats.h diff --git a/trunk/ceph/messages/MPing.h b/src/messages/MPing.h similarity index 100% rename from trunk/ceph/messages/MPing.h rename to src/messages/MPing.h diff --git 
a/trunk/ceph/messages/MPingAck.h b/src/messages/MPingAck.h similarity index 100% rename from trunk/ceph/messages/MPingAck.h rename to src/messages/MPingAck.h diff --git a/trunk/ceph/messages/MStatfs.h b/src/messages/MStatfs.h similarity index 100% rename from trunk/ceph/messages/MStatfs.h rename to src/messages/MStatfs.h diff --git a/trunk/ceph/messages/MStatfsReply.h b/src/messages/MStatfsReply.h similarity index 100% rename from trunk/ceph/messages/MStatfsReply.h rename to src/messages/MStatfsReply.h diff --git a/trunk/ceph/mkmonmap.cc b/src/mkmonmap.cc similarity index 100% rename from trunk/ceph/mkmonmap.cc rename to src/mkmonmap.cc diff --git a/trunk/ceph/mon/ClientMonitor.cc b/src/mon/ClientMonitor.cc similarity index 100% rename from trunk/ceph/mon/ClientMonitor.cc rename to src/mon/ClientMonitor.cc diff --git a/trunk/ceph/mon/ClientMonitor.h b/src/mon/ClientMonitor.h similarity index 100% rename from trunk/ceph/mon/ClientMonitor.h rename to src/mon/ClientMonitor.h diff --git a/trunk/ceph/mon/Elector.cc b/src/mon/Elector.cc similarity index 100% rename from trunk/ceph/mon/Elector.cc rename to src/mon/Elector.cc diff --git a/branches/sage/crush/mon/Elector.h b/src/mon/Elector.h similarity index 100% rename from branches/sage/crush/mon/Elector.h rename to src/mon/Elector.h diff --git a/trunk/ceph/mon/MDSMonitor.cc b/src/mon/MDSMonitor.cc similarity index 100% rename from trunk/ceph/mon/MDSMonitor.cc rename to src/mon/MDSMonitor.cc diff --git a/trunk/ceph/mon/MDSMonitor.h b/src/mon/MDSMonitor.h similarity index 100% rename from trunk/ceph/mon/MDSMonitor.h rename to src/mon/MDSMonitor.h diff --git a/trunk/ceph/mon/MonMap.h b/src/mon/MonMap.h similarity index 100% rename from trunk/ceph/mon/MonMap.h rename to src/mon/MonMap.h diff --git a/trunk/ceph/mon/Monitor.cc b/src/mon/Monitor.cc similarity index 100% rename from trunk/ceph/mon/Monitor.cc rename to src/mon/Monitor.cc diff --git a/trunk/ceph/mon/Monitor.h b/src/mon/Monitor.h similarity index 100% rename from 
trunk/ceph/mon/Monitor.h rename to src/mon/Monitor.h diff --git a/trunk/ceph/mon/MonitorStore.cc b/src/mon/MonitorStore.cc similarity index 100% rename from trunk/ceph/mon/MonitorStore.cc rename to src/mon/MonitorStore.cc diff --git a/branches/sage/crush/mon/MonitorStore.h b/src/mon/MonitorStore.h similarity index 100% rename from branches/sage/crush/mon/MonitorStore.h rename to src/mon/MonitorStore.h diff --git a/trunk/ceph/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc similarity index 100% rename from trunk/ceph/mon/OSDMonitor.cc rename to src/mon/OSDMonitor.cc diff --git a/trunk/ceph/mon/OSDMonitor.h b/src/mon/OSDMonitor.h similarity index 100% rename from trunk/ceph/mon/OSDMonitor.h rename to src/mon/OSDMonitor.h diff --git a/branches/sage/crush/mon/PGMap.h b/src/mon/PGMap.h similarity index 100% rename from branches/sage/crush/mon/PGMap.h rename to src/mon/PGMap.h diff --git a/trunk/ceph/mon/PGMonitor.cc b/src/mon/PGMonitor.cc similarity index 100% rename from trunk/ceph/mon/PGMonitor.cc rename to src/mon/PGMonitor.cc diff --git a/branches/sage/crush/mon/PGMonitor.h b/src/mon/PGMonitor.h similarity index 100% rename from branches/sage/crush/mon/PGMonitor.h rename to src/mon/PGMonitor.h diff --git a/trunk/ceph/mon/Paxos.cc b/src/mon/Paxos.cc similarity index 100% rename from trunk/ceph/mon/Paxos.cc rename to src/mon/Paxos.cc diff --git a/trunk/ceph/mon/Paxos.h b/src/mon/Paxos.h similarity index 100% rename from trunk/ceph/mon/Paxos.h rename to src/mon/Paxos.h diff --git a/branches/sage/crush/mon/PaxosService.cc b/src/mon/PaxosService.cc similarity index 100% rename from branches/sage/crush/mon/PaxosService.cc rename to src/mon/PaxosService.cc diff --git a/trunk/ceph/mon/PaxosService.h b/src/mon/PaxosService.h similarity index 100% rename from trunk/ceph/mon/PaxosService.h rename to src/mon/PaxosService.h diff --git a/branches/sage/crush/mon/mon_types.h b/src/mon/mon_types.h similarity index 100% rename from branches/sage/crush/mon/mon_types.h rename to 
src/mon/mon_types.h diff --git a/branches/sage/crush/msg/Dispatcher.cc b/src/msg/Dispatcher.cc similarity index 100% rename from branches/sage/crush/msg/Dispatcher.cc rename to src/msg/Dispatcher.cc diff --git a/branches/sage/crush/msg/Dispatcher.h b/src/msg/Dispatcher.h similarity index 100% rename from branches/sage/crush/msg/Dispatcher.h rename to src/msg/Dispatcher.h diff --git a/trunk/ceph/msg/FakeMessenger.cc b/src/msg/FakeMessenger.cc similarity index 100% rename from trunk/ceph/msg/FakeMessenger.cc rename to src/msg/FakeMessenger.cc diff --git a/trunk/ceph/msg/FakeMessenger.h b/src/msg/FakeMessenger.h similarity index 100% rename from trunk/ceph/msg/FakeMessenger.h rename to src/msg/FakeMessenger.h diff --git a/trunk/ceph/msg/Message.cc b/src/msg/Message.cc similarity index 100% rename from trunk/ceph/msg/Message.cc rename to src/msg/Message.cc diff --git a/trunk/ceph/msg/Message.h b/src/msg/Message.h similarity index 100% rename from trunk/ceph/msg/Message.h rename to src/msg/Message.h diff --git a/branches/sage/crush/msg/Messenger.cc b/src/msg/Messenger.cc similarity index 100% rename from branches/sage/crush/msg/Messenger.cc rename to src/msg/Messenger.cc diff --git a/trunk/ceph/msg/Messenger.h b/src/msg/Messenger.h similarity index 100% rename from trunk/ceph/msg/Messenger.h rename to src/msg/Messenger.h diff --git a/trunk/ceph/msg/SimpleMessenger.cc b/src/msg/SimpleMessenger.cc similarity index 100% rename from trunk/ceph/msg/SimpleMessenger.cc rename to src/msg/SimpleMessenger.cc diff --git a/trunk/ceph/msg/SimpleMessenger.h b/src/msg/SimpleMessenger.h similarity index 100% rename from trunk/ceph/msg/SimpleMessenger.h rename to src/msg/SimpleMessenger.h diff --git a/trunk/ceph/msg/msg_types.h b/src/msg/msg_types.h similarity index 100% rename from trunk/ceph/msg/msg_types.h rename to src/msg/msg_types.h diff --git a/branches/sage/ebofs2/msg/tcp.cc b/src/msg/tcp.cc similarity index 100% rename from branches/sage/ebofs2/msg/tcp.cc rename to 
src/msg/tcp.cc diff --git a/trunk/ceph/msg/tcp.h b/src/msg/tcp.h similarity index 100% rename from trunk/ceph/msg/tcp.h rename to src/msg/tcp.h diff --git a/trunk/ceph/newsyn.cc b/src/newsyn.cc similarity index 100% rename from trunk/ceph/newsyn.cc rename to src/newsyn.cc diff --git a/branches/sage/crush/osbdb/OSBDB.cc b/src/osbdb/OSBDB.cc similarity index 100% rename from branches/sage/crush/osbdb/OSBDB.cc rename to src/osbdb/OSBDB.cc diff --git a/branches/sage/crush/osbdb/OSBDB.h b/src/osbdb/OSBDB.h similarity index 100% rename from branches/sage/crush/osbdb/OSBDB.h rename to src/osbdb/OSBDB.h diff --git a/branches/sage/crush/osd/Ager.cc b/src/osd/Ager.cc similarity index 100% rename from branches/sage/crush/osd/Ager.cc rename to src/osd/Ager.cc diff --git a/branches/sage/crush/osd/Ager.h b/src/osd/Ager.h similarity index 100% rename from branches/sage/crush/osd/Ager.h rename to src/osd/Ager.h diff --git a/branches/sage/crush/osd/BDBMap.h b/src/osd/BDBMap.h similarity index 100% rename from branches/sage/crush/osd/BDBMap.h rename to src/osd/BDBMap.h diff --git a/trunk/ceph/osd/Fake.h b/src/osd/Fake.h similarity index 100% rename from trunk/ceph/osd/Fake.h rename to src/osd/Fake.h diff --git a/trunk/ceph/osd/FakeStore.cc b/src/osd/FakeStore.cc similarity index 100% rename from trunk/ceph/osd/FakeStore.cc rename to src/osd/FakeStore.cc diff --git a/trunk/ceph/osd/FakeStore.h b/src/osd/FakeStore.h similarity index 100% rename from trunk/ceph/osd/FakeStore.h rename to src/osd/FakeStore.h diff --git a/branches/sage/crush/osd/FakeStoreBDBCollections.h b/src/osd/FakeStoreBDBCollections.h similarity index 100% rename from branches/sage/crush/osd/FakeStoreBDBCollections.h rename to src/osd/FakeStoreBDBCollections.h diff --git a/trunk/ceph/osd/OSD.cc b/src/osd/OSD.cc similarity index 100% rename from trunk/ceph/osd/OSD.cc rename to src/osd/OSD.cc diff --git a/trunk/ceph/osd/OSD.h b/src/osd/OSD.h similarity index 100% rename from trunk/ceph/osd/OSD.h rename to src/osd/OSD.h 
diff --git a/trunk/ceph/osd/OSDMap.h b/src/osd/OSDMap.h similarity index 100% rename from trunk/ceph/osd/OSDMap.h rename to src/osd/OSDMap.h diff --git a/branches/sage/crush/osd/ObjectStore.cc b/src/osd/ObjectStore.cc similarity index 100% rename from branches/sage/crush/osd/ObjectStore.cc rename to src/osd/ObjectStore.cc diff --git a/trunk/ceph/osd/ObjectStore.h b/src/osd/ObjectStore.h similarity index 100% rename from trunk/ceph/osd/ObjectStore.h rename to src/osd/ObjectStore.h diff --git a/trunk/ceph/osd/PG.cc b/src/osd/PG.cc similarity index 100% rename from trunk/ceph/osd/PG.cc rename to src/osd/PG.cc diff --git a/trunk/ceph/osd/PG.h b/src/osd/PG.h similarity index 100% rename from trunk/ceph/osd/PG.h rename to src/osd/PG.h diff --git a/trunk/ceph/osd/RAID4PG.cc b/src/osd/RAID4PG.cc similarity index 100% rename from trunk/ceph/osd/RAID4PG.cc rename to src/osd/RAID4PG.cc diff --git a/trunk/ceph/osd/RAID4PG.h b/src/osd/RAID4PG.h similarity index 100% rename from trunk/ceph/osd/RAID4PG.h rename to src/osd/RAID4PG.h diff --git a/trunk/ceph/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc similarity index 100% rename from trunk/ceph/osd/ReplicatedPG.cc rename to src/osd/ReplicatedPG.cc diff --git a/trunk/ceph/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h similarity index 100% rename from trunk/ceph/osd/ReplicatedPG.h rename to src/osd/ReplicatedPG.h diff --git a/trunk/ceph/osd/osd_types.h b/src/osd/osd_types.h similarity index 100% rename from trunk/ceph/osd/osd_types.h rename to src/osd/osd_types.h diff --git a/branches/sage/crush/osdc/Blinker.h b/src/osdc/Blinker.h similarity index 100% rename from branches/sage/crush/osdc/Blinker.h rename to src/osdc/Blinker.h diff --git a/branches/sage/crush/osdc/Filer.cc b/src/osdc/Filer.cc similarity index 100% rename from branches/sage/crush/osdc/Filer.cc rename to src/osdc/Filer.cc diff --git a/branches/sage/crush/osdc/Filer.h b/src/osdc/Filer.h similarity index 100% rename from branches/sage/crush/osdc/Filer.h rename to 
src/osdc/Filer.h diff --git a/branches/sage/crush/osdc/Journaler.cc b/src/osdc/Journaler.cc similarity index 100% rename from branches/sage/crush/osdc/Journaler.cc rename to src/osdc/Journaler.cc diff --git a/branches/sage/mds/osdc/Journaler.h b/src/osdc/Journaler.h similarity index 100% rename from branches/sage/mds/osdc/Journaler.h rename to src/osdc/Journaler.h diff --git a/branches/sage/crush/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc similarity index 100% rename from branches/sage/crush/osdc/ObjectCacher.cc rename to src/osdc/ObjectCacher.cc diff --git a/branches/sage/crush/osdc/ObjectCacher.h b/src/osdc/ObjectCacher.h similarity index 100% rename from branches/sage/crush/osdc/ObjectCacher.h rename to src/osdc/ObjectCacher.h diff --git a/trunk/ceph/osdc/Objecter.cc b/src/osdc/Objecter.cc similarity index 100% rename from trunk/ceph/osdc/Objecter.cc rename to src/osdc/Objecter.cc diff --git a/trunk/ceph/osdc/Objecter.h b/src/osdc/Objecter.h similarity index 100% rename from trunk/ceph/osdc/Objecter.h rename to src/osdc/Objecter.h diff --git a/branches/sage/crush/script/add_header.pl b/src/script/add_header.pl similarity index 100% rename from branches/sage/crush/script/add_header.pl rename to src/script/add_header.pl diff --git a/branches/marnberg/quota/script/adjusttabs.pl b/src/script/adjusttabs.pl similarity index 100% rename from branches/marnberg/quota/script/adjusttabs.pl rename to src/script/adjusttabs.pl diff --git a/branches/sage/crush/script/check_cache_dumps.pl b/src/script/check_cache_dumps.pl similarity index 100% rename from branches/sage/crush/script/check_cache_dumps.pl rename to src/script/check_cache_dumps.pl diff --git a/branches/marnberg/quota/script/clean_osd_cow.sh b/src/script/clean_osd_cow.sh similarity index 100% rename from branches/marnberg/quota/script/clean_osd_cow.sh rename to src/script/clean_osd_cow.sh diff --git a/branches/marnberg/quota/script/clean_trace.pl b/src/script/clean_trace.pl similarity index 100% rename from 
branches/marnberg/quota/script/clean_trace.pl rename to src/script/clean_trace.pl diff --git a/branches/sage/crush/script/comb.pl b/src/script/comb.pl similarity index 100% rename from branches/sage/crush/script/comb.pl rename to src/script/comb.pl diff --git a/branches/sage/crush/script/convert_soe_trace.pl b/src/script/convert_soe_trace.pl similarity index 100% rename from branches/sage/crush/script/convert_soe_trace.pl rename to src/script/convert_soe_trace.pl diff --git a/branches/sage/crush/script/find_auth_pins.pl b/src/script/find_auth_pins.pl similarity index 100% rename from branches/sage/crush/script/find_auth_pins.pl rename to src/script/find_auth_pins.pl diff --git a/branches/marnberg/quota/script/find_bufferleaks.pl b/src/script/find_bufferleaks.pl similarity index 100% rename from branches/marnberg/quota/script/find_bufferleaks.pl rename to src/script/find_bufferleaks.pl diff --git a/branches/marnberg/quota/script/find_lost_bdev_ops.pl b/src/script/find_lost_bdev_ops.pl similarity index 100% rename from branches/marnberg/quota/script/find_lost_bdev_ops.pl rename to src/script/find_lost_bdev_ops.pl diff --git a/branches/marnberg/quota/script/find_lost_commit.pl b/src/script/find_lost_commit.pl similarity index 100% rename from branches/marnberg/quota/script/find_lost_commit.pl rename to src/script/find_lost_commit.pl diff --git a/branches/marnberg/quota/script/find_lost_objecter.pl b/src/script/find_lost_objecter.pl similarity index 100% rename from branches/marnberg/quota/script/find_lost_objecter.pl rename to src/script/find_lost_objecter.pl diff --git a/branches/marnberg/quota/script/find_pathpins.pl b/src/script/find_pathpins.pl similarity index 100% rename from branches/marnberg/quota/script/find_pathpins.pl rename to src/script/find_pathpins.pl diff --git a/branches/marnberg/quota/script/find_requests.pl b/src/script/find_requests.pl similarity index 100% rename from branches/marnberg/quota/script/find_requests.pl rename to 
src/script/find_requests.pl diff --git a/branches/marnberg/quota/script/find_waiters.pl b/src/script/find_waiters.pl similarity index 100% rename from branches/marnberg/quota/script/find_waiters.pl rename to src/script/find_waiters.pl diff --git a/branches/sage/crush/script/fix_modeline.pl b/src/script/fix_modeline.pl similarity index 100% rename from branches/sage/crush/script/fix_modeline.pl rename to src/script/fix_modeline.pl diff --git a/branches/sage/crush/script/gprofnewsyn b/src/script/gprofnewsyn similarity index 100% rename from branches/sage/crush/script/gprofnewsyn rename to src/script/gprofnewsyn diff --git a/branches/marnberg/quota/script/grepblock b/src/script/grepblock similarity index 100% rename from branches/marnberg/quota/script/grepblock rename to src/script/grepblock diff --git a/branches/sage/crush/script/merge_cdfs.pl b/src/script/merge_cdfs.pl similarity index 100% rename from branches/sage/crush/script/merge_cdfs.pl rename to src/script/merge_cdfs.pl diff --git a/branches/marnberg/quota/script/merge_trace_rw.pl b/src/script/merge_trace_rw.pl similarity index 100% rename from branches/marnberg/quota/script/merge_trace_rw.pl rename to src/script/merge_trace_rw.pl diff --git a/branches/sage/crush/script/plot.pl b/src/script/plot.pl similarity index 100% rename from branches/sage/crush/script/plot.pl rename to src/script/plot.pl diff --git a/branches/marnberg/quota/script/profonly.pl b/src/script/profonly.pl similarity index 100% rename from branches/marnberg/quota/script/profonly.pl rename to src/script/profonly.pl diff --git a/branches/sage/crush/script/runjob.pl b/src/script/runjob.pl similarity index 100% rename from branches/sage/crush/script/runjob.pl rename to src/script/runjob.pl diff --git a/branches/marnberg/quota/script/runset.pl b/src/script/runset.pl similarity index 100% rename from branches/marnberg/quota/script/runset.pl rename to src/script/runset.pl diff --git a/branches/sage/crush/script/smooth.pl b/src/script/smooth.pl 
similarity index 100% rename from branches/sage/crush/script/smooth.pl rename to src/script/smooth.pl diff --git a/branches/sage/crush/script/study_find.pl b/src/script/study_find.pl similarity index 100% rename from branches/sage/crush/script/study_find.pl rename to src/script/study_find.pl diff --git a/branches/sage/crush/script/study_hardlink_lifetimes.pl b/src/script/study_hardlink_lifetimes.pl similarity index 100% rename from branches/sage/crush/script/study_hardlink_lifetimes.pl rename to src/script/study_hardlink_lifetimes.pl diff --git a/branches/sage/crush/script/study_lookups.pl b/src/script/study_lookups.pl similarity index 100% rename from branches/sage/crush/script/study_lookups.pl rename to src/script/study_lookups.pl diff --git a/branches/marnberg/quota/script/sum.pl b/src/script/sum.pl similarity index 100% rename from branches/marnberg/quota/script/sum.pl rename to src/script/sum.pl diff --git a/branches/marnberg/quota/test/fakemds.cc b/src/test/fakemds.cc similarity index 100% rename from branches/marnberg/quota/test/fakemds.cc rename to src/test/fakemds.cc diff --git a/branches/sage/crush/test/fg.cc b/src/test/fg.cc similarity index 100% rename from branches/sage/crush/test/fg.cc rename to src/test/fg.cc diff --git a/branches/marnberg/quota/test/gprof-helper.c b/src/test/gprof-helper.c similarity index 100% rename from branches/marnberg/quota/test/gprof-helper.c rename to src/test/gprof-helper.c diff --git a/branches/marnberg/quota/test/makedirs.cc b/src/test/makedirs.cc similarity index 100% rename from branches/marnberg/quota/test/makedirs.cc rename to src/test/makedirs.cc diff --git a/branches/marnberg/quota/test/mpitest.cc b/src/test/mpitest.cc similarity index 100% rename from branches/marnberg/quota/test/mpitest.cc rename to src/test/mpitest.cc diff --git a/branches/marnberg/quota/test/mttest.cc b/src/test/mttest.cc similarity index 100% rename from branches/marnberg/quota/test/mttest.cc rename to src/test/mttest.cc diff --git 
a/branches/marnberg/quota/test/rushconfig b/src/test/rushconfig similarity index 100% rename from branches/marnberg/quota/test/rushconfig rename to src/test/rushconfig diff --git a/branches/marnberg/quota/test/rushtest.cc b/src/test/rushtest.cc similarity index 100% rename from branches/marnberg/quota/test/rushtest.cc rename to src/test/rushtest.cc diff --git a/branches/marnberg/quota/test/rushtest.cc~ b/src/test/rushtest.cc~ similarity index 100% rename from branches/marnberg/quota/test/rushtest.cc~ rename to src/test/rushtest.cc~ diff --git a/branches/sage/crush/test/test_disk_bw.cc b/src/test/test_disk_bw.cc similarity index 100% rename from branches/sage/crush/test/test_disk_bw.cc rename to src/test/test_disk_bw.cc diff --git a/trunk/ceph/test/test_seek_read.c b/src/test/test_seek_read.c similarity index 100% rename from trunk/ceph/test/test_seek_read.c rename to src/test/test_seek_read.c diff --git a/trunk/ceph/test/test_short_seek_read.c b/src/test/test_short_seek_read.c similarity index 100% rename from trunk/ceph/test/test_short_seek_read.c rename to src/test/test_short_seek_read.c diff --git a/branches/marnberg/quota/test/testbucket.cc b/src/test/testbucket.cc similarity index 100% rename from branches/marnberg/quota/test/testbucket.cc rename to src/test/testbucket.cc diff --git a/branches/marnberg/quota/test/testbuffers.cc b/src/test/testbuffers.cc similarity index 100% rename from branches/marnberg/quota/test/testbuffers.cc rename to src/test/testbuffers.cc diff --git a/branches/sage/crush/test/testcounter.cc b/src/test/testcounter.cc similarity index 100% rename from branches/sage/crush/test/testcounter.cc rename to src/test/testcounter.cc diff --git a/branches/marnberg/quota/test/testcrush.cc b/src/test/testcrush.cc similarity index 100% rename from branches/marnberg/quota/test/testcrush.cc rename to src/test/testcrush.cc diff --git a/branches/marnberg/quota/test/testfilepath.cc b/src/test/testfilepath.cc similarity index 100% rename from 
branches/marnberg/quota/test/testfilepath.cc rename to src/test/testfilepath.cc diff --git a/branches/marnberg/quota/test/testmpi.cc b/src/test/testmpi.cc similarity index 100% rename from branches/marnberg/quota/test/testmpi.cc rename to src/test/testmpi.cc diff --git a/branches/marnberg/quota/test/testnewbuffers.cc b/src/test/testnewbuffers.cc similarity index 100% rename from branches/marnberg/quota/test/testnewbuffers.cc rename to src/test/testnewbuffers.cc diff --git a/branches/marnberg/quota/test/testos.cc b/src/test/testos.cc similarity index 100% rename from branches/marnberg/quota/test/testos.cc rename to src/test/testos.cc diff --git a/branches/marnberg/quota/test/testosbdb.cc b/src/test/testosbdb.cc similarity index 100% rename from branches/marnberg/quota/test/testosbdb.cc rename to src/test/testosbdb.cc diff --git a/branches/marnberg/quota/test/testtree.cc b/src/test/testtree.cc similarity index 100% rename from branches/marnberg/quota/test/testtree.cc rename to src/test/testtree.cc diff --git a/branches/marnberg/quota/test/testxattr.cc b/src/test/testxattr.cc similarity index 100% rename from branches/marnberg/quota/test/testxattr.cc rename to src/test/testxattr.cc diff --git a/trunk/ceph/valgrind.supp b/src/valgrind.supp similarity index 100% rename from trunk/ceph/valgrind.supp rename to src/valgrind.supp diff --git a/tags/20070517_before_mds_merge/COPYING b/tags/20070517_before_mds_merge/COPYING deleted file mode 100644 index 5ab7695ab8cab..0000000000000 --- a/tags/20070517_before_mds_merge/COPYING +++ /dev/null @@ -1,504 +0,0 @@ - GNU LESSER GENERAL PUBLIC LICENSE - Version 2.1, February 1999 - - Copyright (C) 1991, 1999 Free Software Foundation, Inc. - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - -[This is the first released version of the Lesser GPL. 
It also counts - as the successor of the GNU Library Public License, version 2, hence - the version number 2.1.] - - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -Licenses are intended to guarantee your freedom to share and change -free software--to make sure the software is free for all its users. - - This license, the Lesser General Public License, applies to some -specially designated software packages--typically libraries--of the -Free Software Foundation and other authors who decide to use it. You -can use it too, but we suggest you first think carefully about whether -this license or the ordinary General Public License is the better -strategy to use in any particular case, based on the explanations below. - - When we speak of free software, we are referring to freedom of use, -not price. Our General Public Licenses are designed to make sure that -you have the freedom to distribute copies of free software (and charge -for this service if you wish); that you receive source code or can get -it if you want it; that you can change the software and use pieces of -it in new free programs; and that you are informed that you can do -these things. - - To protect your rights, we need to make restrictions that forbid -distributors to deny you these rights or to ask you to surrender these -rights. These restrictions translate to certain responsibilities for -you if you distribute copies of the library or if you modify it. - - For example, if you distribute copies of the library, whether gratis -or for a fee, you must give the recipients all the rights that we gave -you. You must make sure that they, too, receive or can get the source -code. If you link other code with the library, you must provide -complete object files to the recipients, so that they can relink them -with the library after making changes to the library and recompiling -it. 
And you must show them these terms so they know their rights. - - We protect your rights with a two-step method: (1) we copyright the -library, and (2) we offer you this license, which gives you legal -permission to copy, distribute and/or modify the library. - - To protect each distributor, we want to make it very clear that -there is no warranty for the free library. Also, if the library is -modified by someone else and passed on, the recipients should know -that what they have is not the original version, so that the original -author's reputation will not be affected by problems that might be -introduced by others. - - Finally, software patents pose a constant threat to the existence of -any free program. We wish to make sure that a company cannot -effectively restrict the users of a free program by obtaining a -restrictive license from a patent holder. Therefore, we insist that -any patent license obtained for a version of the library must be -consistent with the full freedom of use specified in this license. - - Most GNU software, including some libraries, is covered by the -ordinary GNU General Public License. This license, the GNU Lesser -General Public License, applies to certain designated libraries, and -is quite different from the ordinary General Public License. We use -this license for certain libraries in order to permit linking those -libraries into non-free programs. - - When a program is linked with a library, whether statically or using -a shared library, the combination of the two is legally speaking a -combined work, a derivative of the original library. The ordinary -General Public License therefore permits such linking only if the -entire combination fits its criteria of freedom. The Lesser General -Public License permits more lax criteria for linking other code with -the library. - - We call this license the "Lesser" General Public License because it -does Less to protect the user's freedom than the ordinary General -Public License. 
It also provides other free software developers Less -of an advantage over competing non-free programs. These disadvantages -are the reason we use the ordinary General Public License for many -libraries. However, the Lesser license provides advantages in certain -special circumstances. - - For example, on rare occasions, there may be a special need to -encourage the widest possible use of a certain library, so that it becomes -a de-facto standard. To achieve this, non-free programs must be -allowed to use the library. A more frequent case is that a free -library does the same job as widely used non-free libraries. In this -case, there is little to gain by limiting the free library to free -software only, so we use the Lesser General Public License. - - In other cases, permission to use a particular library in non-free -programs enables a greater number of people to use a large body of -free software. For example, permission to use the GNU C Library in -non-free programs enables many more people to use the whole GNU -operating system, as well as its variant, the GNU/Linux operating -system. - - Although the Lesser General Public License is Less protective of the -users' freedom, it does ensure that the user of a program that is -linked with the Library has the freedom and the wherewithal to run -that program using a modified version of the Library. - - The precise terms and conditions for copying, distribution and -modification follow. Pay close attention to the difference between a -"work based on the library" and a "work that uses the library". The -former contains code derived from the library, whereas the latter must -be combined with the library in order to run. - - GNU LESSER GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. 
This License Agreement applies to any software library or other -program which contains a notice placed by the copyright holder or -other authorized party saying it may be distributed under the terms of -this Lesser General Public License (also called "this License"). -Each licensee is addressed as "you". - - A "library" means a collection of software functions and/or data -prepared so as to be conveniently linked with application programs -(which use some of those functions and data) to form executables. - - The "Library", below, refers to any such software library or work -which has been distributed under these terms. A "work based on the -Library" means either the Library or any derivative work under -copyright law: that is to say, a work containing the Library or a -portion of it, either verbatim or with modifications and/or translated -straightforwardly into another language. (Hereinafter, translation is -included without limitation in the term "modification".) - - "Source code" for a work means the preferred form of the work for -making modifications to it. For a library, complete source code means -all the source code for all modules it contains, plus any associated -interface definition files, plus the scripts used to control compilation -and installation of the library. - - Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. The act of -running a program using the Library is not restricted, and output from -such a program is covered only if its contents constitute a work based -on the Library (independent of the use of the Library in a tool for -writing it). Whether that is true depends on what the Library does -and what the program that uses the Library does. - - 1. 
You may copy and distribute verbatim copies of the Library's -complete source code as you receive it, in any medium, provided that -you conspicuously and appropriately publish on each copy an -appropriate copyright notice and disclaimer of warranty; keep intact -all the notices that refer to this License and to the absence of any -warranty; and distribute a copy of this License along with the -Library. - - You may charge a fee for the physical act of transferring a copy, -and you may at your option offer warranty protection in exchange for a -fee. - - 2. You may modify your copy or copies of the Library or any portion -of it, thus forming a work based on the Library, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) The modified work must itself be a software library. - - b) You must cause the files modified to carry prominent notices - stating that you changed the files and the date of any change. - - c) You must cause the whole of the work to be licensed at no - charge to all third parties under the terms of this License. - - d) If a facility in the modified Library refers to a function or a - table of data to be supplied by an application program that uses - the facility, other than as an argument passed when the facility - is invoked, then you must make a good faith effort to ensure that, - in the event an application does not supply such function or - table, the facility still operates, and performs whatever part of - its purpose remains meaningful. - - (For example, a function in a library to compute square roots has - a purpose that is entirely well-defined independent of the - application. Therefore, Subsection 2d requires that any - application-supplied function or table used by this function must - be optional: if the application does not supply it, the square - root function must still compute square roots.) 
- -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Library, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Library, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote -it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Library. - -In addition, mere aggregation of another work not based on the Library -with the Library (or with a work based on the Library) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. You may opt to apply the terms of the ordinary GNU General Public -License instead of this License to a given copy of the Library. To do -this, you must alter all the notices that refer to this License, so -that they refer to the ordinary GNU General Public License, version 2, -instead of to this License. (If a newer version than version 2 of the -ordinary GNU General Public License has appeared, then you can specify -that version instead if you wish.) Do not make any other change in -these notices. - - Once this change is made in a given copy, it is irreversible for -that copy, so the ordinary GNU General Public License applies to all -subsequent copies and derivative works made from that copy. - - This option is useful when you wish to copy part of the code of -the Library into a program that is not a library. - - 4. 
You may copy and distribute the Library (or a portion or -derivative of it, under Section 2) in object code or executable form -under the terms of Sections 1 and 2 above provided that you accompany -it with the complete corresponding machine-readable source code, which -must be distributed under the terms of Sections 1 and 2 above on a -medium customarily used for software interchange. - - If distribution of object code is made by offering access to copy -from a designated place, then offering equivalent access to copy the -source code from the same place satisfies the requirement to -distribute the source code, even though third parties are not -compelled to copy the source along with the object code. - - 5. A program that contains no derivative of any portion of the -Library, but is designed to work with the Library by being compiled or -linked with it, is called a "work that uses the Library". Such a -work, in isolation, is not a derivative work of the Library, and -therefore falls outside the scope of this License. - - However, linking a "work that uses the Library" with the Library -creates an executable that is a derivative of the Library (because it -contains portions of the Library), rather than a "work that uses the -library". The executable is therefore covered by this License. -Section 6 states terms for distribution of such executables. - - When a "work that uses the Library" uses material from a header file -that is part of the Library, the object code for the work may be a -derivative work of the Library even though the source code is not. -Whether this is true is especially significant if the work can be -linked without the Library, or if the work is itself a library. The -threshold for this to be true is not precisely defined by law. 
- - If such an object file uses only numerical parameters, data -structure layouts and accessors, and small macros and small inline -functions (ten lines or less in length), then the use of the object -file is unrestricted, regardless of whether it is legally a derivative -work. (Executables containing this object code plus portions of the -Library will still fall under Section 6.) - - Otherwise, if the work is a derivative of the Library, you may -distribute the object code for the work under the terms of Section 6. -Any executables containing that work also fall under Section 6, -whether or not they are linked directly with the Library itself. - - 6. As an exception to the Sections above, you may also combine or -link a "work that uses the Library" with the Library to produce a -work containing portions of the Library, and distribute that work -under terms of your choice, provided that the terms permit -modification of the work for the customer's own use and reverse -engineering for debugging such modifications. - - You must give prominent notice with each copy of the work that the -Library is used in it and that the Library and its use are covered by -this License. You must supply a copy of this License. If the work -during execution displays copyright notices, you must include the -copyright notice for the Library among them, as well as a reference -directing the user to the copy of this License. Also, you must do one -of these things: - - a) Accompany the work with the complete corresponding - machine-readable source code for the Library including whatever - changes were used in the work (which must be distributed under - Sections 1 and 2 above); and, if the work is an executable linked - with the Library, with the complete machine-readable "work that - uses the Library", as object code and/or source code, so that the - user can modify the Library and then relink to produce a modified - executable containing the modified Library. 
(It is understood - that the user who changes the contents of definitions files in the - Library will not necessarily be able to recompile the application - to use the modified definitions.) - - b) Use a suitable shared library mechanism for linking with the - Library. A suitable mechanism is one that (1) uses at run time a - copy of the library already present on the user's computer system, - rather than copying library functions into the executable, and (2) - will operate properly with a modified version of the library, if - the user installs one, as long as the modified version is - interface-compatible with the version that the work was made with. - - c) Accompany the work with a written offer, valid for at - least three years, to give the same user the materials - specified in Subsection 6a, above, for a charge no more - than the cost of performing this distribution. - - d) If distribution of the work is made by offering access to copy - from a designated place, offer equivalent access to copy the above - specified materials from the same place. - - e) Verify that the user has already received a copy of these - materials or that you have already sent this user a copy. - - For an executable, the required form of the "work that uses the -Library" must include any data and utility programs needed for -reproducing the executable from it. However, as a special exception, -the materials to be distributed need not include anything that is -normally distributed (in either source or binary form) with the major -components (compiler, kernel, and so on) of the operating system on -which the executable runs, unless that component itself accompanies -the executable. - - It may happen that this requirement contradicts the license -restrictions of other proprietary libraries that do not normally -accompany the operating system. Such a contradiction means you cannot -use both them and the Library together in an executable that you -distribute. - - 7. 
You may place library facilities that are a work based on the -Library side-by-side in a single library together with other library -facilities not covered by this License, and distribute such a combined -library, provided that the separate distribution of the work based on -the Library and of the other library facilities is otherwise -permitted, and provided that you do these two things: - - a) Accompany the combined library with a copy of the same work - based on the Library, uncombined with any other library - facilities. This must be distributed under the terms of the - Sections above. - - b) Give prominent notice with the combined library of the fact - that part of it is a work based on the Library, and explaining - where to find the accompanying uncombined form of the same work. - - 8. You may not copy, modify, sublicense, link with, or distribute -the Library except as expressly provided under this License. Any -attempt otherwise to copy, modify, sublicense, link with, or -distribute the Library is void, and will automatically terminate your -rights under this License. However, parties who have received copies, -or rights, from you under this License will not have their licenses -terminated so long as such parties remain in full compliance. - - 9. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Library or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Library (or any work based on the -Library), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Library or works based on it. - - 10. 
Each time you redistribute the Library (or any work based on the -Library), the recipient automatically receives a license from the -original licensor to copy, distribute, link with or modify the Library -subject to these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. -You are not responsible for enforcing compliance by third parties with -this License. - - 11. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Library at all. For example, if a patent -license would not permit royalty-free redistribution of the Library by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Library. - -If any portion of this section is held invalid or unenforceable under any -particular circumstance, the balance of the section is intended to apply, -and the section as a whole is intended to apply in other circumstances. - -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system which is -implemented by public license practices. 
Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 12. If the distribution and/or use of the Library is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Library under this License may add -an explicit geographical distribution limitation excluding those countries, -so that distribution is permitted only in or among countries not thus -excluded. In such case, this License incorporates the limitation as if -written in the body of this License. - - 13. The Free Software Foundation may publish revised and/or new -versions of the Lesser General Public License from time to time. -Such new versions will be similar in spirit to the present version, -but may differ in detail to address new problems or concerns. - -Each version is given a distinguishing version number. If the Library -specifies a version number of this License which applies to it and -"any later version", you have the option of following the terms and -conditions either of that version or of any later version published by -the Free Software Foundation. If the Library does not specify a -license version number, you may choose any version ever published by -the Free Software Foundation. - - 14. If you wish to incorporate parts of the Library into other free -programs whose distribution conditions are incompatible with these, -write to the author to ask for permission. For software which is -copyrighted by the Free Software Foundation, write to the Free -Software Foundation; we sometimes make exceptions for this. 
Our -decision will be guided by the two goals of preserving the free status -of all derivatives of our free software and of promoting the sharing -and reuse of software generally. - - NO WARRANTY - - 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO -WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. -EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR -OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY -KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE -LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME -THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN -WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY -AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU -FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR -CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE -LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING -RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A -FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF -SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH -DAMAGES. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Libraries - - If you develop a new library, and you want it to be of the greatest -possible use to the public, we recommend making it free software that -everyone can redistribute and change. You can do so by permitting -redistribution under these terms (or, alternatively, under the terms of the -ordinary General Public License). - - To apply these terms, attach the following notices to the library. 
It is -safest to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least the -"copyright" line and a pointer to where the full notice is found. - - - Copyright (C) - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - -Also add information on how to contact you by electronic and paper mail. - -You should also get your employer (if you work as a programmer) or your -school, if any, to sign a "copyright disclaimer" for the library, if -necessary. Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the - library `Frob' (a library for tweaking knobs) written by James Random Hacker. - - , 1 April 1990 - Ty Coon, President of Vice - -That's all there is to it! - - diff --git a/tags/20070517_before_mds_merge/Makefile b/tags/20070517_before_mds_merge/Makefile deleted file mode 100644 index e8fb11534a48b..0000000000000 --- a/tags/20070517_before_mds_merge/Makefile +++ /dev/null @@ -1,264 +0,0 @@ - -# mpicxx must be on your path to build newsyn. on googoo, this means -# that /usr/local/mpich2-1.0.2/bin must be on your path. - -# For now, use g++ most of the time. 
-# When compiling MPI stuff, specify myfile.cc instead of myfile.o so -# that ${MPICC} is invoked instead of the generic .o rule (or it'll -# use g++). This makes it less annoying to build on non-mpi hosts for -# dev work, and seems to behave just fine... change ${CC} back to -# mpicxx if you get paranoid. - -#CC = g++ -#CFLAGS = -g -fPIC -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE -#LIBS = -lpthread - -# Hook for extra -I options, etc. -EXTRA_CFLAGS = - -ifeq ($(target),darwin) -# For Darwin -CFLAGS = -g -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE -DDARWIN -D__FreeBSD__=10 ${EXTRA_CFLAGS} -LDINC = ar -rc -else -# For linux -CFLAGS = -g -fPIC -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE -LDINC = ld -i -o -endif - -CC = g++ -LIBS = -lpthread - -ifeq ($(want_bdb),yes) -CFLAGS += -DUSE_OSBDB -OSBDB_LIBS = -ldb_cxx -endif - -#for normal mpich2 machines -MPICC = mpicxx -MPICFLAGS = ${CFLAGS} -MPILIBS = ${LIBS} - -#for LLNL boxes without mpicxx -#MPICC = g++ -#MPICFLAGS = ${CFLAGS} -I/usr/lib/mpi/include -L/usr/lib/mpi/mpi_gnu/lib -#MPILIBS = ${LIBS} -lelan -lmpi - -EBOFS_OBJS= \ - ebofs/BlockDevice.o\ - ebofs/BufferCache.o\ - ebofs/Ebofs.o\ - ebofs/Allocator.o - -MDS_OBJS= \ - mds/MDS.o\ - mds/journal.o\ - mds/Server.o\ - mds/MDCache.o\ - mds/Locker.o\ - mds/Migrator.o\ - mds/Renamer.o\ - mds/MDBalancer.o\ - mds/CDentry.o\ - mds/CDir.o\ - mds/CInode.o\ - mds/AnchorTable.o\ - mds/AnchorClient.o\ - mds/MDStore.o\ - mds/LogEvent.o\ - mds/IdAllocator.o\ - mds/MDLog.o - -OSD_OBJS= \ - osd/PG.o\ - osd/Ager.o\ - osd/FakeStore.o\ - osd/OSD.o - -OSDC_OBJS= \ - osdc/Objecter.o\ - osdc/ObjectCacher.o\ - osdc/Filer.o\ - osdc/Journaler.o - -MON_OBJS= \ - mon/Monitor.o\ - mon/Paxos.o\ - mon/OSDMonitor.o\ - mon/MDSMonitor.o\ - mon/ClientMonitor.o\ - mon/Elector.o\ - mon/MonitorStore.o - -COMMON_OBJS= \ - msg/Message.o\ - common/Logger.o\ - common/Clock.o\ - 
common/Timer.o\ - config.o - -CLIENT_OBJS= \ - client/FileCache.o\ - client/Client.o\ - client/SyntheticClient.o\ - client/Trace.o - - -ifeq ($(want_bdb),yes) -OSBDB_OBJS = \ - osbdb/OSBDB.o - -OSBDB_OBJ = osbdb.o -endif - -TARGETS = cmon cosd cmds csyn newsyn fakesyn mkmonmap cfuse fakefuse -NO_FUSE = cmon cosd cmds csyn newsyn fakesyn mkmonmap - - -SRCS=*.cc */*.cc *.h */*.h */*/*.h - -all: depend ${TARGETS} - -nofuse: depend ${NO_FUSE} - -test: depend ${TEST_TARGETS} - -obfs: depend obfstest - - -# real bits -mkmonmap: mkmonmap.cc common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -cmon: cmon.cc mon.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -cosd: cosd.cc osd.o ebofs.o ${OSBDB_OBJ} msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} ${OSBDB_LIBS} $^ -o $@ - -cmds: cmds.cc mds.o osdc.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -csyn: csyn.cc client.o osdc.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ - -cfuse: cfuse.cc client.o osdc.o client/fuse.o msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@ - - -# misc -gprof-helper.so: test/gprof-helper.c - gcc -shared -fPIC test/gprof-helper.c -o gprof-helper.so -lpthread -ldl - - -# fake* -fakefuse: fakefuse.cc mon.o mds.o client.o osd.o osdc.o ebofs.o ${OSBDB_OBJ} client/fuse.o msg/FakeMessenger.o common.o - ${CC} -pg ${CFLAGS} ${LIBS} ${OSBDB_LIBS} -lfuse $^ -o $@ - -fakesyn: fakesyn.cc mon.o mds.o client.o osd.o ebofs.o ${OSBDB_OBJ} osdc.o msg/FakeMessenger.o common.o - ${CC} -pg ${CFLAGS} ${LIBS} ${OSBDB_LIBS} $^ -o $@ - - -# mpi startup -newsyn: newsyn.cc mon.o mds.o client.o osd.o ebofs.o ${OSBDB_OBJ} osdc.o msg/SimpleMessenger.o common.o - ${MPICC} -pg ${MPICFLAGS} ${MPILIBS} ${OSBDB_LIBS} $^ -o $@ - -newsyn.nopg: newsyn.cc mon.o mds.o client.o osd.o ebofs.o ${OSBDB_OBJ} osdc.o msg/SimpleMessenger.o common.o - ${MPICC} ${MPICFLAGS} ${MPILIBS} ${OSBDB_LIBS} $^ -o $@ - - -# ebofs -mkfs.ebofs: ebofs/mkfs.ebofs.cc 
config.cc common/Clock.o ebofs.o - ${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@ - -test.ebofs: ebofs/test.ebofs.cc config.cc common/Clock.o ebofs.o - ${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@ - - -# + obfs (old) -fakesynobfs: fakesyn.cc mds.o client.o osd_obfs.o msg/FakeMessenger.o common.o - ${CC} -DUSE_OBFS ${CFLAGS} ${LIBS} $^ -o $@ - -tcpsynobfs: tcpsyn.cc mds.o client.o osd_obfs.o ${TCP_OBJS} common.o - ${MPICC} -DUSE_OBFS ${MPICFLAGS} ${MPILIBS} $^ -o $@ - -osd_obfs.o: osd/OBFSStore.o osd/OSD.cc osd/PG.o osd/ObjectStore.o osd/FakeStore.o - ${MPICC} -DUSE_OBFS ${MPICFLAGS} ${MPILIBS} $^ -o $@ ../uofs/uofs.a - - -# hadoop -libhadoopcephfs.so: client/hadoop/CephFSInterface.cc client.o osdc.o msg/SimpleMessenger.o common.o - ${CC} -fPIC -shared -Wl,-soname,$@.1 ${CFLAGS} -I/usr/local/java/include -I/usr/local/java/include/linux ${LIBS} $^ -o $@ - -# libceph -libceph.o: client/ldceph.o client/Client.o msg/SimpleMessenger.o ${COMMON_OBJS} ${SYN_OBJS} ${OSDC_OBJS} - ${LDINC} $^ -o $@ - -bench/mdtest/mdtest.o: bench/mdtest/mdtest.c - mpicc -c $^ -o $@ - -mdtest: bench/mdtest/mdtest.o - ${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@ - -mdtest.ceph: bench/mdtest/mdtest.o libceph.o - ${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@ - -# OSD test - -testos: test/testos.o ebofs.o osbdb.o common.o - ${CC} ${CFLAGS} ${LIBS} ${OSBDB_LIBS} -o $@ $^ - -# - -%.so: %.cc - ${CC} -shared -fPIC ${CFLAGS} $< -o $@ - -clean: - rm -f *.o */*.o ${TARGETS} ${TEST_TARGETS} - -common.o: ${COMMON_OBJS} - ${LDINC} $@ $^ - -ebofs.o: ${EBOFS_OBJS} - ${LDINC} $@ $^ - -client.o: ${CLIENT_OBJS} - ${LDINC} $@ $^ - -osd.o: ${OSD_OBJS} - ${LDINC} $@ $^ - -osdc.o: ${OSDC_OBJS} - ${LDINC} $@ $^ - -mds.o: ${MDS_OBJS} - ${LDINC} $@ $^ - -mon.o: ${MON_OBJS} - ${LDINC} $@ $^ - -osbdb.o: ${OSBDB_OBJS} - ${LDINC} $@ $^ - -%.o: %.cc - ${CC} ${CFLAGS} -c $< -o $@ - -%.po: %.cc - ${CC} -fPIC ${CFLAGS} -c $< -o $@ - -count: - cat ${SRCS} | wc -l - cat ${SRCS} | grep -c \; - -TAGS: - etags `find . 
-name "*.[h|cc]"` - -.depend: - touch .depend - -depend: - $(RM) .depend - makedepend -f- -- $(CFLAGS) -- $(SRCS) > .depend 2>/dev/null - -# now add a line to include the dependency list. -include .depend diff --git a/tags/20070517_before_mds_merge/README b/tags/20070517_before_mds_merge/README deleted file mode 100644 index aa016817cebf0..0000000000000 --- a/tags/20070517_before_mds_merge/README +++ /dev/null @@ -1,4 +0,0 @@ -Ceph - a scalable distributed file system ------------------------------------------ - -Please see http://ceph.sourceforge.net/ for current info. diff --git a/tags/20070517_before_mds_merge/TODO b/tags/20070517_before_mds_merge/TODO deleted file mode 100644 index 8a64da39dfc8a..0000000000000 --- a/tags/20070517_before_mds_merge/TODO +++ /dev/null @@ -1,322 +0,0 @@ - - -monitor -- finish generic paxos - -osdmon -- distribute w/ paxos framework -- allow fresh replacement osds. add osd_created in osdmap, probably -- monitor needs to monitor some osds... -- monitor pg states, notify on out? -- watch osd utilization; adjust overload in cluster map - -mdsmon -- distribute w/ paxos framework - -journaler -- fix up for large events (e.g. imports) -- use set_floor_and_read for safe takeover from possibly-not-quite-dead otherguy. -- should we pad with zeros to avoid splitting individual entries? - - make it a g_conf flag? - - have to fix reader to skip over zeros (either <4 bytes for size, or zeroed sizes) -- need to truncate at detected (valid) write_pos to clear out any other partial trailing writes - - -crush -- xml import/export? -- crush tools - - -rados+ebofs -- purge replicated writes from cache. (with exception of partial tail blocks.) - -rados paper todo? -- better experiments - - berkeleydb objectstore? -- flush log only in response to subsequent read or write? -- better behaving recovery -- justify use of splay. 
- - dynamic replication -- snapshots - -rados snapshots -- integrate revisions into ObjectCacher -- clean up oid.rev vs op.rev in osd+osdc - -- attr.crev is rev we were created in. -- oid.rev=0 is "live". defined for attr.crev <= rev. -- otherwise, defined for attr.crev <= rev < oid.rev (i.e. oid.rev is upper bound, non-inclusive.) - -- write|delete is tagged with op.rev - - if attr.crev < op.rev - - we clone to oid.rev=rev (clone keeps old crev) - - change live attr.crev=rev. - - apply update -- read is tagged with op.rev - - if 0, we read from 0 (if it exists). - - otherwise we choose object rev based on op.rev vs oid.rev, and then verifying attr.crev <= op.rev. - -- how to get usage feedback to monitor? - -- change messenger entity_inst_t - - no more rank! make it a uniquish nonce? - -- clean up mds caps release in exporter -- figure out client failure modes -- clean up messenger failure modes. -- add connection retry. - - -objecter -- read+floor_lockout - -osd/rados -- read+floor_lockout for clean STOGITH-like/fencing semantics after failover. -- separate out replication code into a PG class, to pave way for RAID - -- efficiently replicate clone() objects -- pg_num instead of pg_bits -- flag missing log entries on crash recovery --> WRNOOP? or WRLOST? -- consider implications of nvram writeahead logs -- fix heartbeat wrt new replication -- mark residual pgs obsolete ??? -- rdlocks -- optimize remove wrt recovery pushes -- pg_bit/pg_num changes -- report crashed pgs? - -simplemessenger -- close idle connections -- retry, timeout on connection or transmission failure - -objectcacher -- ocacher caps transitions vs locks -- test read locks - -reliability -- heartbeat vs ping? -- osdmonitor, filter - -ebofs -- verify proper behavior of conflicting/overlapping reads of clones -- test(fix) sync() -- combine inodes and/or cnodes into same blocks -- allow btree sets instead of maps -- eliminate nodepools -- nonblocking write on missing onodes? 
-- fix bug in node rotation on insert (and reenable) -- fix NEAR_LAST_FWD (?) -- journaling? in NVRAM? -- metadata in nvram? flash? - - -remaining hard problems -- how to cope with file size changes and read/write sharing - - -crush -- more efficient failure when all/too many osds are down -- allow forcefeed for more complicated rule structures. (e.g. make force_stack a list< set >) - - -mds -- distributed client management -- anchormgr - - 2pc - - independent journal? - - distributed? -- link count management - - also 2pc -- chdir (directory opens!) -- rewrite logstream - - clean up - - be smart about rados ack vs reread - - log locking? root log object - - trimming, rotation - -- efficient stat for single writers -- lstat vs stat -- add FILE_CAP_EXTEND capability bit -- only share osdmap updates with clients holding capabilities -- delayed replica caps release... we need to set a timer event? (and cancel it when appropriate?) -- finish hard links! - - reclaim danglers from inode file on discover... - - fix rename wrt hard links -- interactive hash/unhash interface -- test hashed readdir -- make logstream.flush align itself to stripes - -- carefully define/document frozen wrt dir_auth vs hashing - - - -client -- fstat -- make_request: cope with mds failure -- mixed lazy and non-lazy io will clobber each others' caps in the buffer cache.. how to isolate.. -- test client caps migration w/ mds exports -- some heuristic behavior to consolidate caps to inode auth? - - - -MDS TODO -- fix hashed readdir: should (optionally) do a lock on dir namespace? -- fix hard links - - they mostly work, but they're fragile -- sync clients on stat - - will need to ditch 10s client metadata caching before this is useful - - implement truncate -- implement hashed directories -- statfs? -- rewrite journal + recovery -- figure out online failure recovery -- more distributed fh management? 
-- btree directories (for efficient large directories) -- consistency points/snapshots - -- fix MExportAck and others to use dir+dentry, not inode - (otherwise this all breaks with hard links.. altho it probably needs reworking already?) - - - - - -why qsync could be wrong (for very strict POSIX) : varying mds -> client message transit or processing times. -- mds -> 1,2 : qsync -- client1 writes at byte 100 -- client1 -> mds : qsync reply (size=100) -- client1 writes at byte 300 -- client1 -> client2 (outside channel) -- client2 writes at byte 200 -- client2 -> mds : qsync reply (size=200) --> stat results in size 200, even though at no single point in time was the max size 500. --> for correct result, need to _stop_ client writers while gathering metadata. - - -SAGE: - -- string table? - -- hard links - - fix MExportAck and others to use dir+dentry, not inode - (otherwise this all breaks with hard links.. altho it probably needs reworking already!) - -- do real permission checks? - - - - - - -ISSUES - - -- discover - - soft: authority selectively repicates, or sets a 'forward' flag in reply - - hard: authority always replicates (eg. discover for export) - - forward flag (see soft) - - error flag (if file not found, etc.) - - [what was i talking about?] make sure waiters are properly triggered, either upon dir_rep update, or (empty!) discover reply - - - -DOCUMENT -- cache, distributed cache structure and invariants -- export process -- hash/unhash process - - -TEST -- hashing - - test hash/unhash operation - - hash+export: encode list of replicated dir inodes so they can be discovered before import is procesed. - - test nauthitems (wrt hashing?) - - -IMPLEMENT - -- smarter balancing - - popularity calculation and management is inconsistent/wrong. - - does it work? - -- dump active config in run output somewhere - - - - - - - - - - -==== MDS RECOVERY ==== - -- how to reliably deliver cache expire messages? - - how should proxy behave? 
- - exporter failure - - all cacheexpire info has been passed on up until point where export is permanent. no impact. - - importer failure - - exporter collects expire info, so that it can reverse. - - ??? - - maybe hosts should double-up expires until after export is known to have committed? ---> just send expires to both nodes. dir_auth+dir_auth2. clean up export ack/notify process. :) - -*** dar... no, separate bystander dir_auth updates from the prepare/ack/commit cycle! -- expire should go to both old and new auth -- set_dir_auth should take optional second auth, and authority() should optionally set/return a second possible auth -- does inode need it's own replica list? no! -- dirslices. - - -/- exporter recovery if importer fails during EXPORT_EXPORTING stage -- importer recovery if exporter fails - -/?- delay response to sending import_map if export in progress? -/?- finish export before sending import_map? -/- ambiguous imports on active node should include in-progress imports! -/- how to effectively trim cache after resolve but before rejoin -/ - we need to eliminate unneed non-auth metadata, without hosing potentially useful auth metadata - -- osd needs a set_floor_and_read op for safe failover/STOGITH-like semantics. - -- failures during recovery stages (resolve, rejoin)... make sure rejoin still works! - -- fix mds initial osdmap weirdness (which will currently screw up on standby -> almost anything) - - -importmap only sent after exports have completed. -failures update export ack waitlists, so exports will compelte if unrelated nodes fail. -importmap can be sent regardless of import status -- pending import is just flagged ambiguous. -failure of exporter induces some cleanup on importer. importer will disambiguate when it gets an importmap on exporter recovery. -failure of importer induces cleanup on exporter. no ambiguity. - - -/- no new mds may join if cluster is in a recovery state. 
starting -> standby (unless failed) -/ - make sure creating -> standby, and are not included in recovery set? - - -mdsmap notes -- mds don't care about intervening states, except rejoin > active, and - that transition requires active involvement. thus, no need worry - about delivering/processing the full sequence of maps. - -blech: -- EMetablob should return 'expired' if they have - higher versions (and are thus described by a newer journal entry) - -mds -- mds falure vs clients - - clean up client op redirection - - idempotent ops - -- journal+recovery - - unlink - - open(wr cap), open+create - - file capabilities i/o - - link - - rename - -- should auth_pins really go to the root? - - FIXME: auth_pins on importer versus import beneath an authpinned region? - diff --git a/tags/20070517_before_mds_merge/cfuse.cc b/tags/20070517_before_mds_merge/cfuse.cc deleted file mode 100644 index a9b47f1270afb..0000000000000 --- a/tags/20070517_before_mds_merge/cfuse.cc +++ /dev/null @@ -1,83 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "client/Client.h" -#include "client/fuse.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - -#ifndef DARWIN -#include -#endif // DARWIN - -#include -#include -#include - -int main(int argc, char **argv, char *envp[]) { - - //cerr << "cfuse starting " << myrank << "/" << world << endl; - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - // args for fuse - vec_to_argv(args, argc, argv); - - // FUSE will chdir("/"); be ready. 
- g_conf.use_abspaths = true; - - if (g_conf.clock_tare) g_clock.tare(); - - // load monmap - MonMap monmap; - int r = monmap.read(".ceph_monmap"); - assert(r >= 0); - - // start up network - rank.start_rank(); - - // start client - Client *client = new Client(rank.register_entity(MSG_ADDR_CLIENT_NEW), &monmap); - client->init(); - - // start up fuse - // use my argc, argv (make sure you pass a mount point!) - cout << "mounting" << endl; - client->mount(); - - cerr << "starting fuse on pid " << getpid() << endl; - ceph_fuse_main(client, argc, argv); - cerr << "fuse finished on pid " << getpid() << endl; - - client->unmount(); - cout << "unmounted" << endl; - client->shutdown(); - - delete client; - - // wait for messenger to finish - rank.wait(); - - return 0; -} - diff --git a/tags/20070517_before_mds_merge/client/Client.cc b/tags/20070517_before_mds_merge/client/Client.cc deleted file mode 100644 index 5a80d7d38bc6f..0000000000000 --- a/tags/20070517_before_mds_merge/client/Client.cc +++ /dev/null @@ -1,2766 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -// unix-ey fs stuff -#include -#include -#include -#include -#include -#include - -#include - - -#include -using namespace std; - - -// ceph stuff -#include "Client.h" - - -#include "messages/MClientBoot.h" -#include "messages/MClientMount.h" -#include "messages/MClientMountAck.h" -#include "messages/MClientFileCaps.h" - -#include "messages/MGenericMessage.h" - -#include "messages/MMDSGetMap.h" -#include "messages/MMDSMap.h" - -#include "osdc/Filer.h" -#include "osdc/Objecter.h" -#include "osdc/ObjectCacher.h" - -#include "common/Cond.h" -#include "common/Mutex.h" -#include "common/Logger.h" - - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_client) cout << g_clock.now() << " client" << whoami << "." << pthread_self() << " " - -#define tout if (g_conf.client_trace) cout << "trace: " - - -// static logger -LogType client_logtype; -Logger *client_logger = 0; - - - -class C_Client_CloseRelease : public Context { - Client *cl; - Inode *in; -public: - C_Client_CloseRelease(Client *c, Inode *i) : cl(c), in(i) {} - void finish(int) { - cl->close_release(in); - } -}; - -class C_Client_CloseSafe : public Context { - Client *cl; - Inode *in; -public: - C_Client_CloseSafe(Client *c, Inode *i) : cl(c), in(i) {} - void finish(int) { - cl->close_safe(in); - } -}; - - - - - - -// cons/des - -Client::Client(Messenger *m, MonMap *mm) -{ - // which client am i? - whoami = m->get_myname().num(); - monmap = mm; - - mounted = false; - unmounting = false; - - last_tid = 0; - unsafe_sync_write = 0; - - mdsmap = 0; - - // - root = 0; - - set_cache_size(g_conf.client_cache_size); - - // file handles - free_fh_set.insert(10, 1<<30); - - // set up messengers - messenger = m; - messenger->set_dispatcher(this); - - // osd interfaces - osdmap = new OSDMap(); // initially blank.. 
see mount() - objecter = new Objecter(messenger, monmap, osdmap); - objectcacher = new ObjectCacher(objecter, client_lock); - filer = new Filer(objecter); -} - - -Client::~Client() -{ - tear_down_cache(); - - if (objectcacher) { - delete objectcacher; - objectcacher = 0; - } - - if (filer) { delete filer; filer = 0; } - if (objecter) { delete objecter; objecter = 0; } - if (osdmap) { delete osdmap; osdmap = 0; } - if (mdsmap) { delete mdsmap; mdsmap = 0; } - - if (messenger) { delete messenger; messenger = 0; } -} - - -void Client::tear_down_cache() -{ - // fh's - for (hash_map::iterator it = fh_map.begin(); - it != fh_map.end(); - it++) { - Fh *fh = it->second; - dout(1) << "tear_down_cache forcing close of fh " << it->first << " ino " << fh->inode->inode.ino << endl; - put_inode(fh->inode); - delete fh; - } - fh_map.clear(); - - // caps! - // *** FIXME *** - - // empty lru - lru.lru_set_max(0); - trim_cache(); - assert(lru.lru_get_size() == 0); - - // close root ino - assert(inode_map.size() <= 1); - if (root && inode_map.size() == 1) { - delete root; - root = 0; - inode_map.clear(); - } - - assert(inode_map.empty()); -} - - - -// debug crapola - -void Client::dump_inode(Inode *in, set& did) -{ - dout(1) << "dump_inode: inode " << in->ino() << " ref " << in->ref << " dir " << in->dir << endl; - - if (in->dir) { - dout(1) << " dir size " << in->dir->dentries.size() << endl; - //for (hash_map, eqstr>::iterator it = in->dir->dentries.begin(); - for (hash_map::iterator it = in->dir->dentries.begin(); - it != in->dir->dentries.end(); - it++) { - dout(1) << " dn " << it->first << " ref " << it->second->ref << endl; - dump_inode(it->second->inode, did); - } - } -} - -void Client::dump_cache() -{ - set did; - - if (root) dump_inode(root, did); - - for (hash_map::iterator it = inode_map.begin(); - it != inode_map.end(); - it++) { - if (did.count(it->second)) continue; - - dout(1) << "dump_cache: inode " << it->first - << " ref " << it->second->ref - << " dir " << 
it->second->dir << endl; - if (it->second->dir) { - dout(1) << " dir size " << it->second->dir->dentries.size() << endl; - } - } - -} - - -void Client::init() { - -} - -void Client::shutdown() { - dout(1) << "shutdown" << endl; - messenger->shutdown(); -} - - - - -// =================== -// metadata cache stuff - -void Client::trim_cache() -{ - unsigned last = 0; - while (lru.lru_get_size() != last) { - last = lru.lru_get_size(); - - if (lru.lru_get_size() <= lru.lru_get_max()) break; - - // trim! - Dentry *dn = (Dentry*)lru.lru_expire(); - if (!dn) break; // done - - //dout(10) << "trim_cache unlinking dn " << dn->name << " in dir " << hex << dn->dir->inode->inode.ino << endl; - unlink(dn); - } - - // hose root? - if (lru.lru_get_size() == 0 && root && inode_map.size() == 1) { - delete root; - root = 0; - inode_map.clear(); - } -} - -/** insert_inode - * - * insert + link a single dentry + inode into the metadata cache. - */ -Inode* Client::insert_inode(Dir *dir, InodeStat *st, const string& dname) -{ - Dentry *dn = NULL; - if (dir->dentries.count(dname)) - dn = dir->dentries[dname]; - - dout(12) << "insert_inode " << dname << " ino " << st->inode.ino - << " size " << st->inode.size - << " mtime " << st->inode.mtime - << " hashed " << st->hashed - << endl; - - if (dn) { - if (dn->inode->inode.ino == st->inode.ino) { - touch_dn(dn); - dout(12) << " had dentry " << dname - << " with correct ino " << dn->inode->inode.ino - << endl; - } else { - dout(12) << " had dentry " << dname - << " with WRONG ino " << dn->inode->inode.ino - << endl; - unlink(dn); - dn = NULL; - } - } - - if (!dn) { - // have inode linked elsewhere? -> unlink and relink! 
- if (inode_map.count(st->inode.ino)) { - Inode *in = inode_map[st->inode.ino]; - assert(in); - - if (in->dn) { - dout(12) << " had ino " << in->inode.ino - << " linked at wrong position, unlinking" - << endl; - dn = relink(in->dn, dir, dname); - } else { - // link - dout(12) << " had ino " << in->inode.ino - << " unlinked, linking" << endl; - dn = link(dir, dname, in); - } - } - } - - if (!dn) { - Inode *in = new Inode(st->inode, objectcacher); - inode_map[st->inode.ino] = in; - dn = link(dir, dname, in); - dout(12) << " new dentry+node with ino " << st->inode.ino << endl; - } else { - // actually update info - dout(12) << " stat inode mask is " << st->inode.mask << endl; - dn->inode->inode = st->inode; - - // ...but don't clobber our mtime, size! - if ((dn->inode->inode.mask & INODE_MASK_SIZE) == 0 && - dn->inode->file_wr_size > dn->inode->inode.size) - dn->inode->inode.size = dn->inode->file_wr_size; - if ((dn->inode->inode.mask & INODE_MASK_MTIME) == 0 && - dn->inode->file_wr_mtime > dn->inode->inode.mtime) - dn->inode->inode.mtime = dn->inode->file_wr_mtime; - } - - // OK, we found it! - assert(dn && dn->inode); - - // or do we have newer size/mtime from writing? - if (dn->inode->file_caps() & CAP_FILE_WR) { - if (dn->inode->file_wr_size > dn->inode->inode.size) - dn->inode->inode.size = dn->inode->file_wr_size; - if (dn->inode->file_wr_mtime > dn->inode->inode.mtime) - dn->inode->inode.mtime = dn->inode->file_wr_mtime; - } - - // symlink? 
- if ((dn->inode->inode.mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK) { - if (!dn->inode->symlink) - dn->inode->symlink = new string; - *(dn->inode->symlink) = st->symlink; - } - - return dn->inode; -} - -/** update_inode_dist - * - * update MDS location cache for a single inode - */ -void Client::update_inode_dist(Inode *in, InodeStat *st) -{ - // dir info - in->dir_auth = st->dir_auth; - in->dir_hashed = st->hashed; - in->dir_replicated = st->replicated; - - // dir replication - if (st->spec_defined) { - if (st->dist.empty() && !in->dir_contacts.empty()) - dout(9) << "lost dist spec for " << in->inode.ino - << " " << st->dist << endl; - if (!st->dist.empty() && in->dir_contacts.empty()) - dout(9) << "got dist spec for " << in->inode.ino - << " " << st->dist << endl; - in->dir_contacts = st->dist; - } -} - - -/** insert_trace - * - * insert a trace from a MDS reply into the cache. - */ -Inode* Client::insert_trace(MClientReply *reply) -{ - Inode *cur = root; - time_t now = time(NULL); - - dout(10) << "insert_trace got " << reply->get_trace_in().size() << " inodes" << endl; - - list::const_iterator pdn = reply->get_trace_dn().begin(); - - for (list::const_iterator pin = reply->get_trace_in().begin(); - pin != reply->get_trace_in().end(); - ++pin) { - - if (pin == reply->get_trace_in().begin()) { - // root - dout(10) << "insert_trace root" << endl; - if (!root) { - // create - cur = root = new Inode((*pin)->inode, objectcacher); - inode_map[root->inode.ino] = root; - } - } else { - // not root. - dout(10) << "insert_trace dn " << *pdn << " ino " << (*pin)->inode.ino << endl; - Dir *dir = cur->open_dir(); - cur = this->insert_inode(dir, *pin, *pdn); - ++pdn; - - // move to top of lru! 
- if (cur->dn) - lru.lru_touch(cur->dn); - } - - // update dist info - update_inode_dist(cur, *pin); - - // set cache ttl - if (g_conf.client_cache_stat_ttl) - cur->valid_until = now + g_conf.client_cache_stat_ttl; - } - - return cur; -} - - - - -Dentry *Client::lookup(filepath& path) -{ - dout(14) << "lookup " << path << endl; - - Inode *cur = root; - if (!cur) return NULL; - - Dentry *dn = 0; - for (unsigned i=0; iinode << " valid_until " << dn->inode->valid_until << endl; - } else { - dout(14) << " dentry " << path[i] << " dne" << endl; - return NULL; - } - cur = dn->inode; - assert(cur); - } else { - return NULL; // not a dir - } - } - - if (dn) { - dout(11) << "lookup '" << path << "' found " << dn->name << " inode " << dn->inode->inode.ino << " valid_until " << dn->inode->valid_until<< endl; - } - - return dn; -} - -// ------- - -MClientReply *Client::make_request(MClientRequest *req, - bool auth_best, - int use_mds) // this param is purely for debug hacking -{ - // assign a unique tid - req->set_tid(++last_tid); - - // find deepest known prefix - Inode *diri = root; // the deepest known containing dir - Inode *item = 0; // the actual item... if we know it - int missing_dn = -1; // which dn we miss on (if we miss) - - unsigned depth = req->get_filepath().depth(); - for (unsigned i=0; iinode.mode & INODE_MODE_DIR && diri->dir) { - Dir *dir = diri->dir; - - // do we have the next dentry? - if (dir->dentries.count( req->get_filepath()[i] ) == 0) { - missing_dn = i; // no. - break; - } - - dout(7) << " have path seg " << i << " on " << diri->dir_auth << " ino " << diri->inode.ino << " " << req->get_filepath()[i] << endl; - - if (i == depth-1) { // last one! - item = dir->dentries[ req->get_filepath()[i] ]->inode; - break; - } - - // continue.. 
- diri = dir->dentries[ req->get_filepath()[i] ]->inode; - assert(diri); - } else { - missing_dn = i; - break; - } - } - - // choose an mds - int mds = 0; - if (!diri || g_conf.client_use_random_mds) { - // no root info, pick a random MDS - mds = rand() % mdsmap->get_num_mds(); - } else { - if (auth_best) { - // pick the actual auth (as best we can) - if (item) { - mds = item->authority(mdsmap); - } else if (diri->dir_hashed && missing_dn >= 0) { - mds = diri->dentry_authority(req->get_filepath()[missing_dn].c_str(), - mdsmap); - } else { - mds = diri->authority(mdsmap); - } - } else { - // balance our traffic! - if (diri->dir_hashed && missing_dn >= 0) - mds = diri->dentry_authority(req->get_filepath()[missing_dn].c_str(), - mdsmap); - else - mds = diri->pick_replica(mdsmap); - } - } - dout(20) << "mds is " << mds << endl; - - // force use of a particular mds? - if (use_mds >= 0) mds = use_mds; - - - // time the call - utime_t start = g_clock.now(); - - bool nojournal = false; - int op = req->get_op(); - if (op == MDS_OP_STAT || - op == MDS_OP_LSTAT || - op == MDS_OP_READDIR || - op == MDS_OP_OPEN || - op == MDS_OP_RELEASE) - nojournal = true; - - MClientReply *reply = sendrecv(req, mds); - - if (client_logger) { - utime_t lat = g_clock.now(); - lat -= start; - dout(20) << "lat " << lat << endl; - client_logger->finc("lsum",(double)lat); - client_logger->inc("lnum"); - - if (nojournal) { - client_logger->finc("lrsum",(double)lat); - client_logger->inc("lrnum"); - } else { - client_logger->finc("lwsum",(double)lat); - client_logger->inc("lwnum"); - } - - if (op == MDS_OP_STAT) { - client_logger->finc("lstatsum",(double)lat); - client_logger->inc("lstatnum"); - } - else if (op == MDS_OP_READDIR) { - client_logger->finc("ldirsum",(double)lat); - client_logger->inc("ldirnum"); - } - - } - - return reply; -} - - -MClientReply* Client::sendrecv(MClientRequest *req, int mds) -{ - // NEW way. 
- Cond cond; - tid_t tid = req->get_tid(); - mds_rpc_cond[tid] = &cond; - - messenger->send_message(req, mdsmap->get_inst(mds), MDS_PORT_SERVER); - - // wait - while (mds_rpc_reply.count(tid) == 0) { - dout(20) << "sendrecv awaiting reply kick on " << &cond << endl; - cond.Wait(client_lock); - } - - // got it! - MClientReply *reply = mds_rpc_reply[tid]; - - // kick dispatcher (we've got it!) - assert(mds_rpc_dispatch_cond.count(tid)); - mds_rpc_dispatch_cond[tid]->Signal(); - dout(20) << "sendrecv kickback on tid " << tid << " " << mds_rpc_dispatch_cond[tid] << endl; - - // clean up. - mds_rpc_cond.erase(tid); - mds_rpc_reply.erase(tid); - - return reply; -} - -void Client::handle_client_reply(MClientReply *reply) -{ - tid_t tid = reply->get_tid(); - - // store reply - mds_rpc_reply[tid] = reply; - - // wake up waiter - assert(mds_rpc_cond.count(tid)); - dout(20) << "handle_client_reply kicking caller on " << mds_rpc_cond[tid] << endl; - mds_rpc_cond[tid]->Signal(); - - // wake for kick back - assert(mds_rpc_dispatch_cond.count(tid) == 0); - Cond cond; - mds_rpc_dispatch_cond[tid] = &cond; - while (mds_rpc_cond.count(tid)) { - dout(20) << "handle_client_reply awaiting kickback on tid " << tid << " " << &cond << endl; - cond.Wait(client_lock); - } - - // ok, clean up! 
- mds_rpc_dispatch_cond.erase(tid); -} - - -// ------------------------ -// incoming messages - -void Client::dispatch(Message *m) -{ - client_lock.Lock(); - - switch (m->get_type()) { - // osd - case MSG_OSD_OPREPLY: - objecter->handle_osd_op_reply((MOSDOpReply*)m); - break; - - case MSG_OSD_MAP: - objecter->handle_osd_map((class MOSDMap*)m); - break; - - // client - case MSG_MDS_MAP: - handle_mds_map((MMDSMap*)m); - break; - - case MSG_CLIENT_REPLY: - handle_client_reply((MClientReply*)m); - break; - - case MSG_CLIENT_FILECAPS: - handle_file_caps((MClientFileCaps*)m); - break; - - case MSG_CLIENT_MOUNTACK: - handle_mount_ack((MClientMountAck*)m); - break; - case MSG_CLIENT_UNMOUNT: - handle_unmount_ack(m); - break; - - - default: - cout << "dispatch doesn't recognize message type " << m->get_type() << endl; - assert(0); // fail loudly - break; - } - - // unmounting? - if (unmounting) { - dout(10) << "unmounting: trim pass, size was " << lru.lru_get_size() - << "+" << inode_map.size() << endl; - trim_cache(); - if (lru.lru_get_size() == 0 && inode_map.empty()) { - dout(10) << "unmounting: trim pass, cache now empty, waking unmount()" << endl; - mount_cond.Signal(); - } else { - dout(10) << "unmounting: trim pass, size still " << lru.lru_get_size() - << "+" << inode_map.size() << endl; - dump_cache(); - } - } - - client_lock.Unlock(); -} - - -void Client::handle_mds_map(MMDSMap* m) -{ - if (mdsmap == 0) - mdsmap = new MDSMap; - - if (whoami < 0) { - whoami = m->get_dest().num(); - dout(1) << "handle_mds_map i am now " << m->get_dest() << endl; - messenger->reset_myname(m->get_dest()); - } - - dout(1) << "handle_mds_map epoch " << m->get_epoch() << endl; - mdsmap->decode(m->get_encoded()); - - delete m; - - // note our inc # - objecter->set_client_incarnation(0); // fixme - - mount_cond.Signal(); // mount might be waiting for this. 
-} - - -/**** - * caps - */ - - -class C_Client_ImplementedCaps : public Context { - Client *client; - MClientFileCaps *msg; - Inode *in; -public: - C_Client_ImplementedCaps(Client *c, MClientFileCaps *m, Inode *i) : client(c), msg(m), in(i) {} - void finish(int r) { - client->implemented_caps(msg,in); - } -}; - -/** handle_file_caps - * handle caps update from mds. including mds to mds caps transitions. - * do not block. - */ -void Client::handle_file_caps(MClientFileCaps *m) -{ - int mds = m->get_source().num(); - Inode *in = 0; - if (inode_map.count(m->get_ino())) in = inode_map[ m->get_ino() ]; - - m->clear_payload(); // for if/when we send back to MDS - - // reap? - if (m->get_special() == MClientFileCaps::FILECAP_REAP) { - int other = m->get_mds(); - - if (in && in->stale_caps.count(other)) { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " reap on mds" << other << endl; - - // fresh from new mds? - if (!in->caps.count(mds)) { - if (in->caps.empty()) in->get(); - in->caps[mds].seq = m->get_seq(); - in->caps[mds].caps = m->get_caps(); - } - - assert(in->stale_caps.count(other)); - in->stale_caps.erase(other); - if (in->stale_caps.empty()) put_inode(in); // note: this will never delete *in - - // fall-thru! - } else { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " premature (!!) reap on mds" << other << endl; - // delay! - cap_reap_queue[in->ino()][other] = m; - return; - } - } - - assert(in); - - // stale? - if (m->get_special() == MClientFileCaps::FILECAP_STALE) { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " seq " << m->get_seq() << " from mds" << mds << " now stale" << endl; - - // move to stale list - assert(in->caps.count(mds)); - if (in->stale_caps.empty()) in->get(); - in->stale_caps[mds] = in->caps[mds]; - - assert(in->caps.count(mds)); - in->caps.erase(mds); - if (in->caps.empty()) in->put(); - - // delayed reap? 
- if (cap_reap_queue.count(in->ino()) && - cap_reap_queue[in->ino()].count(mds)) { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " delayed reap on mds" << m->get_mds() << endl; - - // process delayed reap - handle_file_caps( cap_reap_queue[in->ino()][mds] ); - - cap_reap_queue[in->ino()].erase(mds); - if (cap_reap_queue[in->ino()].empty()) - cap_reap_queue.erase(in->ino()); - } - delete m; - return; - } - - // release? - if (m->get_special() == MClientFileCaps::FILECAP_RELEASE) { - dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " release" << endl; - assert(in->caps.count(mds)); - in->caps.erase(mds); - for (map::iterator p = in->caps.begin(); - p != in->caps.end(); - p++) - dout(20) << " left cap " << p->first << " " - << cap_string(p->second.caps) << " " - << p->second.seq << endl; - for (map::iterator p = in->stale_caps.begin(); - p != in->stale_caps.end(); - p++) - dout(20) << " left stale cap " << p->first << " " - << cap_string(p->second.caps) << " " - << p->second.seq << endl; - - if (in->caps.empty()) { - //dout(0) << "did put_inode" << endl; - put_inode(in); - } else { - //dout(0) << "didn't put_inode" << endl; - } - delete m; - return; - } - - - // don't want? - if (in->file_caps_wanted() == 0) { - dout(5) << "handle_file_caps on ino " << m->get_ino() - << " seq " << m->get_seq() - << " " << cap_string(m->get_caps()) - << ", which we don't want caps for, releasing." 
<< endl; - m->set_caps(0); - m->set_wanted(0); - messenger->send_message(m, m->get_source_inst(), m->get_source_port()); - return; - } - - assert(in->caps.count(mds)); - - // update per-mds caps - const int old_caps = in->caps[mds].caps; - const int new_caps = m->get_caps(); - in->caps[mds].caps = new_caps; - in->caps[mds].seq = m->get_seq(); - dout(5) << "handle_file_caps on in " << m->get_ino() - << " mds" << mds << " seq " << m->get_seq() - << " caps now " << cap_string(new_caps) - << " was " << cap_string(old_caps) << endl; - - // did file size decrease? - if ((old_caps & (CAP_FILE_RD|CAP_FILE_WR)) == 0 && - (new_caps & (CAP_FILE_RD|CAP_FILE_WR)) != 0 && - in->inode.size > m->get_inode().size) { - dout(10) << "*** file size decreased from " << in->inode.size << " to " << m->get_inode().size << endl; - - // trim filecache? - if (g_conf.client_oc) - in->fc.truncate(in->inode.size, m->get_inode().size); - - in->inode.size = in->file_wr_size = m->get_inode().size; - } - - // update inode - in->inode = m->get_inode(); // might have updated size... FIXME this is overkill! - - // preserve our (possibly newer) file size, mtime - if (in->file_wr_size > in->inode.size) - m->get_inode().size = in->inode.size = in->file_wr_size; - if (in->file_wr_mtime > in->inode.mtime) - m->get_inode().mtime = in->inode.mtime = in->file_wr_mtime; - - - - if (g_conf.client_oc) { - // caching on, use FileCache. - Context *onimplement = 0; - if (old_caps & ~new_caps) { // this mds is revoking caps - if (in->fc.get_caps() & ~(in->file_caps())) // net revocation - onimplement = new C_Client_ImplementedCaps(this, m, in); - else { - implemented_caps(m, in); // ack now. - } - } - in->fc.set_caps(new_caps, onimplement); - } else { - // caching off. - - // wake up waiters? 
- if (new_caps & CAP_FILE_RD) { - for (list::iterator it = in->waitfor_read.begin(); - it != in->waitfor_read.end(); - it++) { - dout(5) << "signaling read waiter " << *it << endl; - (*it)->Signal(); - } - in->waitfor_read.clear(); - } - if (new_caps & CAP_FILE_WR) { - for (list::iterator it = in->waitfor_write.begin(); - it != in->waitfor_write.end(); - it++) { - dout(5) << "signaling write waiter " << *it << endl; - (*it)->Signal(); - } - in->waitfor_write.clear(); - } - if (new_caps & CAP_FILE_LAZYIO) { - for (list::iterator it = in->waitfor_lazy.begin(); - it != in->waitfor_lazy.end(); - it++) { - dout(5) << "signaling lazy waiter " << *it << endl; - (*it)->Signal(); - } - in->waitfor_lazy.clear(); - } - - // ack? - if (old_caps & ~new_caps) { - if (in->sync_writes) { - // wait for sync writes to finish - dout(5) << "sync writes in progress, will ack on finish" << endl; - in->waitfor_no_write.push_back(new C_Client_ImplementedCaps(this, m, in)); - } else { - // ok now - implemented_caps(m, in); - } - } else { - // discard - delete m; - } - } -} - -void Client::implemented_caps(MClientFileCaps *m, Inode *in) -{ - dout(5) << "implemented_caps " << cap_string(m->get_caps()) - << ", acking to " << m->get_source() << endl; - - if (in->file_caps() == 0) { - in->file_wr_mtime = 0; - in->file_wr_size = 0; - } - - messenger->send_message(m, m->get_source_inst(), m->get_source_port()); -} - - -void Client::release_caps(Inode *in, - int retain) -{ - dout(5) << "releasing caps on ino " << in->inode.ino << dec - << " had " << cap_string(in->file_caps()) - << " retaining " << cap_string(retain) - << " want " << cap_string(in->file_caps_wanted()) - << endl; - - for (map::iterator it = in->caps.begin(); - it != in->caps.end(); - it++) { - //if (it->second.caps & ~retain) { - if (1) { - // release (some of?) 
these caps - it->second.caps = retain & it->second.caps; - // note: tell mds _full_ wanted; it'll filter/behave based on what it is allowed to do - MClientFileCaps *m = new MClientFileCaps(in->inode, - it->second.seq, - it->second.caps, - in->file_caps_wanted()); - messenger->send_message(m, mdsmap->get_inst(it->first), MDS_PORT_LOCKER); - } - } - - if (in->file_caps() == 0) { - in->file_wr_mtime = 0; - in->file_wr_size = 0; - } -} - -void Client::update_caps_wanted(Inode *in) -{ - dout(5) << "updating caps wanted on ino " << in->inode.ino - << " to " << cap_string(in->file_caps_wanted()) - << endl; - - // FIXME: pick a single mds and let the others off the hook.. - for (map::iterator it = in->caps.begin(); - it != in->caps.end(); - it++) { - MClientFileCaps *m = new MClientFileCaps(in->inode, - it->second.seq, - it->second.caps, - in->file_caps_wanted()); - messenger->send_message(m, - mdsmap->get_inst(it->first), MDS_PORT_LOCKER); - } -} - - - -// ------------------- -// fs ops - -int Client::mount() -{ - client_lock.Lock(); - - assert(!mounted); // caller is confused? - - // FIXME mds map update race with mount. 
- - dout(2) << "sending boot msg to monitor" << endl; - if (mdsmap) - delete mdsmap; - int mon = monmap->pick_mon(); - messenger->send_message(new MClientBoot(), - monmap->get_inst(mon)); - - while (!mdsmap) - mount_cond.Wait(client_lock); - - dout(2) << "mounting" << endl; - MClientMount *m = new MClientMount(); - - int who = 0; // mdsmap->get_root(); // mount at root, for now - messenger->send_message(m, - mdsmap->get_inst(who), - MDS_PORT_SERVER); - - while (!mounted) - mount_cond.Wait(client_lock); - - client_lock.Unlock(); - - /* - dout(3) << "op: // client trace data structs" << endl; - dout(3) << "op: struct stat st;" << endl; - dout(3) << "op: struct utimbuf utim;" << endl; - dout(3) << "op: int readlinkbuf_len = 1000;" << endl; - dout(3) << "op: char readlinkbuf[readlinkbuf_len];" << endl; - dout(3) << "op: map dir_contents;" << endl; - dout(3) << "op: map open_files;" << endl; - dout(3) << "op: fh_t fh;" << endl; - */ - return 0; -} - -void Client::handle_mount_ack(MClientMountAck *m) -{ - // mdsmap! - if (!mdsmap) mdsmap = new MDSMap; - mdsmap->decode(m->get_mds_map_state()); - - // we got osdmap! - osdmap->decode(m->get_osd_map_state()); - - dout(2) << "mounted" << endl; - mounted = true; - mount_cond.Signal(); - - delete m; -} - - -int Client::unmount() -{ - client_lock.Lock(); - - assert(mounted); // caller is confused? - - dout(2) << "unmounting" << endl; - unmounting = true; - - // NOTE: i'm assuming all caches are already flushing (because all files are closed). 
- assert(fh_map.empty()); - - // empty lru cache - lru.lru_set_max(0); - trim_cache(); - - if (g_conf.client_oc) { - // release any/all caps - for (hash_map::iterator p = inode_map.begin(); - p != inode_map.end(); - p++) { - Inode *in = p->second; - if (!in->caps.empty()) { - in->fc.release_clean(); - if (in->fc.is_dirty()) { - dout(10) << "unmount residual caps on " << in->ino() << ", flushing" << endl; - in->fc.empty(new C_Client_CloseRelease(this, in)); - } else { - dout(10) << "unmount residual caps on " << in->ino() << ", releasing" << endl; - release_caps(in); - } - } - } - } - - while (lru.lru_get_size() > 0 || - !inode_map.empty()) { - dout(2) << "cache still has " << lru.lru_get_size() - << "+" << inode_map.size() << " items" - << ", waiting (presumably for safe or for caps to be released?)" - << endl; - dump_cache(); - mount_cond.Wait(client_lock); - } - assert(lru.lru_get_size() == 0); - assert(inode_map.empty()); - - // unsafe writes - if (!g_conf.client_oc) { - while (unsafe_sync_write > 0) { - dout(0) << unsafe_sync_write << " unsafe_sync_writes, waiting" - << endl; - mount_cond.Wait(client_lock); - } - } - - // send unmount! 
- Message *req = new MGenericMessage(MSG_CLIENT_UNMOUNT); - messenger->send_message(req, mdsmap->get_inst(0), MDS_PORT_SERVER); - - while (mounted) - mount_cond.Wait(client_lock); - - dout(2) << "unmounted" << endl; - - client_lock.Unlock(); - return 0; -} - -void Client::handle_unmount_ack(Message* m) -{ - dout(1) << "got unmount ack" << endl; - mounted = false; - mount_cond.Signal(); - delete m; -} - - - -// namespace ops - -int Client::link(const char *existing, const char *newname) -{ - client_lock.Lock(); - dout(3) << "op: client->link(\"" << existing << "\", \"" << newname << "\");" << endl; - tout << "link" << endl; - tout << existing << endl; - tout << newname << endl; - - - // main path arg is new link name - // sarg is target (existing file) - - - MClientRequest *req = new MClientRequest(MDS_OP_LINK, whoami); - req->set_path(newname); - req->set_sarg(existing); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req, true); - int res = reply->get_result(); - - insert_trace(reply); - delete reply; - dout(10) << "link result is " << res << endl; - - trim_cache(); - client_lock.Unlock(); - return res; -} - - -int Client::unlink(const char *relpath) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->unlink\(\"" << path << "\");" << endl; - tout << "unlink" << endl; - tout << path << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_UNLINK, whoami); - req->set_path(path); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? 
- - MClientReply *reply = make_request(req, true); - int res = reply->get_result(); - if (res == 0) { - // remove from local cache - filepath fp(path); - Dentry *dn = lookup(fp); - if (dn) { - assert(dn->inode); - unlink(dn); - } - } - insert_trace(reply); - delete reply; - dout(10) << "unlink result is " << res << endl; - - trim_cache(); - client_lock.Unlock(); - return res; -} - -int Client::rename(const char *relfrom, const char *relto) -{ - client_lock.Lock(); - - string absfrom; - mkabspath(relfrom, absfrom); - const char *from = absfrom.c_str(); - string absto; - mkabspath(relto, absto); - const char *to = absto.c_str(); - - dout(3) << "op: client->rename(\"" << from << "\", \"" << to << "\");" << endl; - tout << "rename" << endl; - tout << from << endl; - tout << to << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_RENAME, whoami); - req->set_path(from); - req->set_sarg(to); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? - - MClientReply *reply = make_request(req, true); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - dout(10) << "rename result is " << res << endl; - - trim_cache(); - client_lock.Unlock(); - return res; -} - -// dirs - -int Client::mkdir(const char *relpath, mode_t mode) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->mkdir(\"" << path << "\", " << mode << ");" << endl; - tout << "mkdir" << endl; - tout << path << endl; - tout << mode << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_MKDIR, whoami); - req->set_path(path); - req->set_iarg( (int)mode ); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? 
- - MClientReply *reply = make_request(req, true); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - dout(10) << "mkdir result is " << res << endl; - - trim_cache(); - client_lock.Unlock(); - return res; -} - -int Client::rmdir(const char *relpath) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->rmdir(\"" << path << "\");" << endl; - tout << "rmdir" << endl; - tout << path << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_RMDIR, whoami); - req->set_path(path); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? - - MClientReply *reply = make_request(req, true); - int res = reply->get_result(); - if (res == 0) { - // remove from local cache - filepath fp(path); - Dentry *dn = lookup(fp); - if (dn) { - if (dn->inode->dir && dn->inode->dir->is_empty()) - close_dir(dn->inode->dir); // FIXME: maybe i shoudl proactively hose the whole subtree from cache? 
- unlink(dn); - } - } - insert_trace(reply); - delete reply; - dout(10) << "rmdir result is " << res << endl; - - trim_cache(); - client_lock.Unlock(); - return res; -} - -// symlinks - -int Client::symlink(const char *reltarget, const char *rellink) -{ - client_lock.Lock(); - - string abstarget; - mkabspath(reltarget, abstarget); - const char *target = abstarget.c_str(); - string abslink; - mkabspath(rellink, abslink); - const char *link = abslink.c_str(); - - dout(3) << "op: client->symlink(\"" << target << "\", \"" << link << "\");" << endl; - tout << "symlink" << endl; - tout << target << endl; - tout << link << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_SYMLINK, whoami); - req->set_path(link); - req->set_sarg(target); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? - - MClientReply *reply = make_request(req, true); - int res = reply->get_result(); - insert_trace(reply); //FIXME assuming trace of link, not of target - delete reply; - dout(10) << "symlink result is " << res << endl; - - trim_cache(); - client_lock.Unlock(); - return res; -} - -int Client::readlink(const char *relpath, char *buf, off_t size) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->readlink(\"" << path << "\", readlinkbuf, readlinkbuf_len);" << endl; - tout << "readlink" << endl; - tout << path << endl; - client_lock.Unlock(); - - // stat first (FIXME, PERF access cache directly) **** - struct stat stbuf; - int r = this->lstat(path, &stbuf); - if (r != 0) return r; - - client_lock.Lock(); - - // pull symlink content from cache - Inode *in = inode_map[stbuf.st_ino]; - assert(in); // i just did a stat - - // copy into buf (at most size bytes) - unsigned res = in->symlink->length(); - if (res > size) res = size; - memcpy(buf, in->symlink->c_str(), res); - - trim_cache(); - 
client_lock.Unlock(); - return res; // return length in bytes (to mimic the system call) -} - - - -// inode stuff - -int Client::_lstat(const char *path, int mask, Inode **in) -{ - MClientRequest *req = 0; - filepath fpath(path); - - // check whether cache content is fresh enough - int res = 0; - - Dentry *dn = lookup(fpath); - inode_t inode; - time_t now = time(NULL); - if (dn && - now <= dn->inode->valid_until && - ((dn->inode->inode.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT)) { - inode = dn->inode->inode; - dout(10) << "lstat cache hit w/ sufficient inode.mask, valid until " << dn->inode->valid_until << endl; - - if (g_conf.client_cache_stat_ttl == 0) - dn->inode->valid_until = 0; // only one stat allowed after each readdir - - *in = dn->inode; - } else { - // FIXME where does FUSE maintain user information - //struct fuse_context *fc = fuse_get_context(); - //req->set_caller_uid(fc->uid); - //req->set_caller_gid(fc->gid); - - req = new MClientRequest(MDS_OP_LSTAT, whoami); - req->set_iarg(mask); - req->set_path(fpath); - - MClientReply *reply = make_request(req); - res = reply->get_result(); - dout(10) << "lstat res is " << res << endl; - if (res == 0) { - //Transfer information from reply to stbuf - inode = reply->get_inode(); - - //Update metadata cache - *in = insert_trace(reply); - } - - delete reply; - - if (res != 0) - *in = 0; // not a success. - } - - return res; -} - - -void Client::fill_stat(inode_t& inode, struct stat *st) -{ - memset(st, 0, sizeof(struct stat)); - st->st_ino = inode.ino; - st->st_mode = inode.mode; - st->st_nlink = inode.nlink; - st->st_uid = inode.uid; - st->st_gid = inode.gid; - st->st_ctime = inode.ctime; - st->st_atime = inode.atime; - st->st_mtime = inode.mtime; - st->st_size = inode.size; - st->st_blocks = inode.size ? 
((inode.size - 1) / 4096 + 1):0; - st->st_blksize = 4096; -} - -void Client::fill_statlite(inode_t& inode, struct statlite *st) -{ - memset(st, 0, sizeof(struct stat)); - st->st_ino = inode.ino; - st->st_mode = inode.mode; - st->st_nlink = inode.nlink; - st->st_uid = inode.uid; - st->st_gid = inode.gid; -#ifndef DARWIN - // FIXME what's going on here with darwin? - st->st_ctime = inode.ctime; - st->st_atime = inode.atime; - st->st_mtime = inode.mtime; -#endif - st->st_size = inode.size; - st->st_blocks = inode.size ? ((inode.size - 1) / 4096 + 1):0; - st->st_blksize = 4096; - - /* - S_REQUIREBLKSIZE(st->st_litemask); - if (inode.mask & INODE_MASK_BASE) S_REQUIRECTIME(st->st_litemask); - if (inode.mask & INODE_MASK_SIZE) { - S_REQUIRESIZE(st->st_litemask); - S_REQUIREBLOCKS(st->st_litemask); - } - if (inode.mask & INODE_MASK_MTIME) S_REQUIREMTIME(st->st_litemask); - if (inode.mask & INODE_MASK_ATIME) S_REQUIREATIME(st->st_litemask); - */ -} - - -int Client::lstat(const char *relpath, struct stat *stbuf) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->lstat(\"" << path << "\", &st);" << endl; - tout << "lstat" << endl; - tout << path << endl; - - Inode *in = 0; - - int res = _lstat(path, INODE_MASK_ALL_STAT, &in); - if (res == 0) { - assert(in); - fill_stat(in->inode,stbuf); - dout(10) << "stat sez size = " << in->inode.size << " ino = " << stbuf->st_ino << endl; - } - - trim_cache(); - client_lock.Unlock(); - return res; -} - - -int Client::lstatlite(const char *relpath, struct statlite *stl) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->lstatlite(\"" << path << "\", &st);" << endl; - tout << "lstatlite" << endl; - tout << path << endl; - - // make mask - int mask = INODE_MASK_BASE | INODE_MASK_PERM; - if (S_ISVALIDSIZE(stl->st_litemask) || - S_ISVALIDBLOCKS(stl->st_litemask)) 
mask |= INODE_MASK_SIZE; - if (S_ISVALIDMTIME(stl->st_litemask)) mask |= INODE_MASK_MTIME; - if (S_ISVALIDATIME(stl->st_litemask)) mask |= INODE_MASK_ATIME; - - Inode *in = 0; - int res = _lstat(path, mask, &in); - - if (res == 0) { - fill_statlite(in->inode,stl); - dout(10) << "stat sez size = " << in->inode.size << " ino = " << in->inode.ino << endl; - } - - trim_cache(); - client_lock.Unlock(); - return res; -} - - - -int Client::chmod(const char *relpath, mode_t mode) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->chmod(\"" << path << "\", " << mode << ");" << endl; - tout << "chmod" << endl; - tout << path << endl; - tout << mode << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_CHMOD, whoami); - req->set_path(path); - req->set_iarg( (int)mode ); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req, true); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - dout(10) << "chmod result is " << res << endl; - - trim_cache(); - client_lock.Unlock(); - return res; -} - -int Client::chown(const char *relpath, uid_t uid, gid_t gid) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->chown(\"" << path << "\", " << uid << ", " << gid << ");" << endl; - tout << "chown" << endl; - tout << path << endl; - tout << uid << endl; - tout << gid << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_CHOWN, whoami); - req->set_path(path); - req->set_iarg( (int)uid ); - req->set_iarg2( (int)gid ); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? 
- - MClientReply *reply = make_request(req, true); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - dout(10) << "chown result is " << res << endl; - - trim_cache(); - client_lock.Unlock(); - return res; -} - -int Client::utime(const char *relpath, struct utimbuf *buf) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: utim.actime = " << buf->actime << "; utim.modtime = " << buf->modtime << ";" << endl; - dout(3) << "op: client->utime(\"" << path << "\", &utim);" << endl; - tout << "utime" << endl; - tout << path << endl; - tout << buf->actime << endl; - tout << buf->modtime << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_UTIME, whoami); - req->set_path(path); - req->set_targ( buf->modtime ); - req->set_targ2( buf->actime ); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? - - MClientReply *reply = make_request(req, true); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - dout(10) << "utime result is " << res << endl; - - trim_cache(); - client_lock.Unlock(); - return res; -} - - - -int Client::mknod(const char *relpath, mode_t mode) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->mknod(\"" << path << "\", " << mode << ");" << endl; - tout << "mknod" << endl; - tout << path << endl; - tout << mode << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_MKNOD, whoami); - req->set_path(path); - req->set_iarg( mode ); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? 
- - MClientReply *reply = make_request(req, true); - int res = reply->get_result(); - insert_trace(reply); - - dout(10) << "mknod result is " << res << endl; - - delete reply; - - trim_cache(); - client_lock.Unlock(); - return res; -} - - - - -//readdir usually include inode info for each entry except of locked entries - -// -// getdir - -// fyi: typedef int (*dirfillerfunc_t) (void *handle, const char *name, int type, inodeno_t ino); - -int Client::getdir(const char *relpath, map& contents) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->getdir(\"" << path << "\", dir_contents);" << endl; - tout << "getdir" << endl; - tout << path << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_READDIR, whoami); - req->set_path(path); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? - - MClientReply *reply = make_request(req, true); - int res = reply->get_result(); - insert_trace(reply); - - if (res == 0) { - - // dir contents to cache! - inodeno_t ino = reply->get_ino(); - Inode *diri = inode_map[ ino ]; - assert(diri); - assert(diri->inode.mode & INODE_MODE_DIR); - - // add . and ..? - string dot("."); - contents[dot] = diri->inode; - if (diri != root) { - string dotdot(".."); - contents[dotdot] = diri->dn->dir->parent_inode->inode; - } - - if (!reply->get_dir_in().empty()) { - // only open dir if we're actually adding stuff to it! 
- Dir *dir = diri->open_dir(); - assert(dir); - time_t now = time(NULL); - - list::const_iterator pdn = reply->get_dir_dn().begin(); - for (list::const_iterator pin = reply->get_dir_in().begin(); - pin != reply->get_dir_in().end(); - ++pin, ++pdn) { - if (*pdn == ".") - continue; - - // count entries - res++; - - // put in cache - Inode *in = this->insert_inode(dir, *pin, *pdn); - - if (g_conf.client_cache_stat_ttl) - in->valid_until = now + g_conf.client_cache_stat_ttl; - else if (g_conf.client_cache_readdir_ttl) - in->valid_until = now + g_conf.client_cache_readdir_ttl; - - // contents to caller too! - contents[*pdn] = in->inode; - } - if (dir->is_empty()) - close_dir(dir); - } - - - // FIXME: remove items in cache that weren't in my readdir? - // *** - } - - delete reply; //fix thing above first - - client_lock.Unlock(); - return res; -} - - -/** POSIX stubs **/ - -DIR *Client::opendir(const char *name) -{ - DirResult *d = new DirResult; - d->size = getdir(name, d->contents); - d->p = d->contents.begin(); - d->off = 0; - return (DIR*)d; -} - -int Client::closedir(DIR *dir) -{ - DirResult *d = (DirResult*)dir; - delete d; - return 0; -} - -//struct dirent { -// ino_t d_ino; /* inode number */ -// off_t d_off; /* offset to the next dirent */ -// unsigned short d_reclen; /* length of this record */ -// unsigned char d_type; /* type of file */ -// char d_name[256]; /* filename */ -//}; - -struct dirent *Client::readdir(DIR *dirp) -{ - DirResult *d = (DirResult*)dirp; - - // end of dir? 
- if (d->p == d->contents.end()) - return 0; - - // fill the dirent - d->dp.d_dirent.d_ino = d->p->second.ino; -#ifndef __CYGWIN__ -#ifndef DARWIN - if (d->p->second.is_symlink()) - d->dp.d_dirent.d_type = DT_LNK; - else if (d->p->second.is_dir()) - d->dp.d_dirent.d_type = DT_DIR; - else if (d->p->second.is_file()) - d->dp.d_dirent.d_type = DT_REG; - else - d->dp.d_dirent.d_type = DT_UNKNOWN; - - d->dp.d_dirent.d_off = d->off; - d->dp.d_dirent.d_reclen = 1; // all records are length 1 (wrt offset, seekdir, telldir, etc.) -#endif // DARWIN -#endif - - strncpy(d->dp.d_dirent.d_name, d->p->first.c_str(), 256); - - // move up - ++d->off; - ++d->p; - - return &d->dp.d_dirent; -} - -void Client::rewinddir(DIR *dirp) -{ - DirResult *d = (DirResult*)dirp; - d->p = d->contents.begin(); - d->off = 0; -} - -off_t Client::telldir(DIR *dirp) -{ - DirResult *d = (DirResult*)dirp; - return d->off; -} - -void Client::seekdir(DIR *dirp, off_t offset) -{ - DirResult *d = (DirResult*)dirp; - - d->p = d->contents.begin(); - d->off = 0; - - if (offset >= d->size) offset = d->size-1; - while (offset > 0) { - ++d->p; - ++d->off; - --offset; - } -} - -struct dirent_plus *Client::readdirplus(DIR *dirp) -{ - DirResult *d = (DirResult*)dirp; - - // end of dir? - if (d->p == d->contents.end()) - return 0; - - // fill the dirent - d->dp.d_dirent.d_ino = d->p->second.ino; -#ifndef __CYGWIN__ -#ifndef DARWIN - if (d->p->second.is_symlink()) - d->dp.d_dirent.d_type = DT_LNK; - else if (d->p->second.is_dir()) - d->dp.d_dirent.d_type = DT_DIR; - else if (d->p->second.is_file()) - d->dp.d_dirent.d_type = DT_REG; - else - d->dp.d_dirent.d_type = DT_UNKNOWN; - - d->dp.d_dirent.d_off = d->off; - d->dp.d_dirent.d_reclen = 1; // all records are length 1 (wrt offset, seekdir, telldir, etc.) 
-#endif // DARWIN -#endif - - strncpy(d->dp.d_dirent.d_name, d->p->first.c_str(), 256); - - // plus - if ((d->p->second.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT) { - // have it - fill_stat(d->p->second, &d->dp.d_stat); - d->dp.d_stat_err = 0; - } else { - // don't have it, stat it - string path = d->path; - path += "/"; - path += d->p->first; - d->dp.d_stat_err = lstat(path.c_str(), &d->dp.d_stat); - } - - // move up - ++d->off; - ++d->p; - - return &d->dp; -} - -/* -struct dirent_lite *Client::readdirlite(DIR *dirp) -{ - DirResult *d = (DirResult*)dirp; - - // end of dir? - if (d->p == d->contents.end()) - return 0; - - // fill the dirent - d->dp.d_dirent.d_ino = d->p->second.ino; - if (d->p->second.is_symlink()) - d->dp.d_dirent.d_type = DT_LNK; - else if (d->p->second.is_dir()) - d->dp.d_dirent.d_type = DT_DIR; - else if (d->p->second.is_file()) - d->dp.d_dirent.d_type = DT_REG; - else - d->dp.d_dirent.d_type = DT_UNKNOWN; - strncpy(d->dp.d_dirent.d_name, d->p->first.c_str(), 256); - - d->dp.d_dirent.d_off = d->off; - d->dp.d_dirent.d_reclen = 1; // all records are length 1 (wrt offset, seekdir, telldir, etc.) 
- - // plus - if ((d->p->second.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT) { - // have it - fill_statlite(d->p->second,d->dp.d_stat); - d->dp.d_stat_err = 0; - } else { - // don't have it, stat it - string path = p->path; - path += "/"; - path += p->first; - d->dp.d_statlite - d->dp.d_stat_err = lstatlite(path.c_str(), &d->dp.d_statlite); - } - - // move up - ++d->off; - ++d->p; - - return &d->dp; -} -*/ - - - - - - -/****** file i/o **********/ - -int Client::open(const char *relpath, int flags) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: fh = client->open(\"" << path << "\", " << flags << ");" << endl; - tout << "open" << endl; - tout << path << endl; - tout << flags << endl; - - int cmode = 0; - bool tryauth = false; - if (flags & O_LAZY) - cmode = FILE_MODE_LAZY; - else if (flags & O_WRONLY) { - cmode = FILE_MODE_W; - tryauth = true; - } else if (flags & O_RDWR) { - cmode = FILE_MODE_RW; - tryauth = true; - } else if (flags & O_APPEND) { - cmode = FILE_MODE_W; - tryauth = true; - } else - cmode = FILE_MODE_R; - - // go - MClientRequest *req = new MClientRequest(MDS_OP_OPEN, whoami); - req->set_path(path); - req->set_iarg(flags); - req->set_iarg2(cmode); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req, tryauth); // try auth if writer - - assert(reply); - dout(3) << "op: open_files[" << reply->get_result() << "] = fh; // fh = " << reply->get_result() << endl; - tout << reply->get_result() << endl; - - insert_trace(reply); - int result = reply->get_result(); - - // success? 
- fh_t fh = 0; - if (result >= 0) { - // yay - Fh *f = new Fh; - f->mode = cmode; - - // inode - f->inode = inode_map[reply->get_ino()]; - assert(f->inode); - f->inode->get(); - - if (cmode & FILE_MODE_R) f->inode->num_open_rd++; - if (cmode & FILE_MODE_W) f->inode->num_open_wr++; - if (cmode & FILE_MODE_LAZY) f->inode->num_open_lazy++; - - // caps included? - int mds = reply->get_source().num(); - - if (f->inode->caps.empty()) {// first caps? - dout(7) << " first caps on " << f->inode->inode.ino << endl; - f->inode->get(); - } - - int new_caps = reply->get_file_caps(); - - assert(reply->get_file_caps_seq() >= f->inode->caps[mds].seq); - if (reply->get_file_caps_seq() > f->inode->caps[mds].seq) { - dout(7) << "open got caps " << cap_string(new_caps) - << " for " << f->inode->ino() - << " seq " << reply->get_file_caps_seq() - << " from mds" << mds - << endl; - - int old_caps = f->inode->caps[mds].caps; - f->inode->caps[mds].caps = new_caps; - f->inode->caps[mds].seq = reply->get_file_caps_seq(); - - // we shouldn't ever lose caps at this point. - // actually, we might...? 
- assert((old_caps & ~f->inode->caps[mds].caps) == 0); - - if (g_conf.client_oc) - f->inode->fc.set_caps(new_caps); - - } else { - dout(7) << "open got SAME caps " << cap_string(new_caps) - << " for " << f->inode->ino() - << " seq " << reply->get_file_caps_seq() - << " from mds" << mds - << endl; - } - - // put in map - result = fh = get_fh(); - assert(fh_map.count(fh) == 0); - fh_map[fh] = f; - - dout(3) << "open success, fh is " << fh << " combined caps " << cap_string(f->inode->file_caps()) << endl; - } else { - dout(0) << "open failure result " << result << endl; - } - - delete reply; - - trim_cache(); - client_lock.Unlock(); - - return result; -} - - - - - -void Client::close_release(Inode *in) -{ - dout(10) << "close_release on " << in->ino() << endl; - dout(10) << " wr " << in->num_open_wr << " rd " << in->num_open_rd - << " dirty " << in->fc.is_dirty() << " cached " << in->fc.is_cached() << endl; - - if (!in->num_open_rd) - in->fc.release_clean(); - - int retain = 0; - if (in->num_open_wr || in->fc.is_dirty()) retain |= CAP_FILE_WR | CAP_FILE_WRBUFFER | CAP_FILE_WREXTEND; - if (in->num_open_rd || in->fc.is_cached()) retain |= CAP_FILE_RD | CAP_FILE_RDCACHE; - - release_caps(in, retain); // release caps now. -} - -void Client::close_safe(Inode *in) -{ - dout(10) << "close_safe on " << in->ino() << endl; - put_inode(in); - if (unmounting) - mount_cond.Signal(); -} - -int Client::close(fh_t fh) -{ - client_lock.Lock(); - dout(3) << "op: client->close(open_files[ " << fh << " ]);" << endl; - dout(3) << "op: open_files.erase( " << fh << " );" << endl; - tout << "close" << endl; - tout << fh << endl; - - // get Fh, Inode - assert(fh_map.count(fh)); - Fh *f = fh_map[fh]; - Inode *in = f->inode; - - // update inode rd/wr counts - int before = in->file_caps_wanted(); - if (f->mode & FILE_MODE_R) - in->num_open_rd--; - if (f->mode & FILE_MODE_W) - in->num_open_wr--; - int after = in->file_caps_wanted(); - - // does this change what caps we want? 
- if (before != after && after) - update_caps_wanted(in); - - // hose fh - fh_map.erase(fh); - delete f; - - // release caps right away? - dout(10) << "num_open_rd " << in->num_open_rd << " num_open_wr " << in->num_open_wr << endl; - - if (g_conf.client_oc) { - // caching on. - if (in->num_open_rd == 0 && in->num_open_wr == 0) { - in->fc.empty(new C_Client_CloseRelease(this, in)); - } - else if (in->num_open_rd == 0) { - in->fc.release_clean(); - close_release(in); - } - else if (in->num_open_wr == 0) { - in->fc.flush_dirty(new C_Client_CloseRelease(this,in)); - } - - // pin until safe? - if (in->num_open_wr == 0 && !in->fc.all_safe()) { - dout(10) << "pinning ino " << in->ino() << " until safe" << endl; - in->get(); - in->fc.add_safe_waiter(new C_Client_CloseSafe(this, in)); - } - } else { - // caching off. - if (in->num_open_rd == 0 && in->num_open_wr == 0) { - dout(10) << " releasing caps on " << in->ino() << endl; - release_caps(in); // release caps now. - } - } - - put_inode( in ); - int result = 0; - - client_lock.Unlock(); - return result; -} - - - -// ------------ -// read, write - - -off_t Client::lseek(fh_t fh, off_t offset, int whence) -{ - client_lock.Lock(); - dout(3) << "op: client->lseek(" << fh << ", " << offset << ", " << whence << ");" << endl; - - assert(fh_map.count(fh)); - Fh *f = fh_map[fh]; - Inode *in = f->inode; - - switch (whence) { - case SEEK_SET: - f->pos = offset; - break; - - case SEEK_CUR: - f->pos += offset; - break; - - case SEEK_END: - f->pos = in->inode.size + offset; - break; - - default: - assert(0); - } - - off_t pos = f->pos; - client_lock.Unlock(); - - return pos; -} - - -// blocking osd interface - -int Client::read(fh_t fh, char *buf, off_t size, off_t offset) -{ - client_lock.Lock(); - - dout(3) << "op: client->read(" << fh << ", buf, " << size << ", " << offset << "); // that's " << offset << "~" << size << endl; - tout << "read" << endl; - tout << fh << endl; - tout << size << endl; - tout << offset << endl; - - 
assert(fh_map.count(fh)); - Fh *f = fh_map[fh]; - Inode *in = f->inode; - - bool movepos = false; - if (offset < 0) { - offset = f->pos; - movepos = true; - } - - bool lazy = f->mode == FILE_MODE_LAZY; - - // determine whether read range overlaps with file - // ...ONLY if we're doing async io - if (!lazy && (in->file_caps() & (CAP_FILE_WRBUFFER|CAP_FILE_RDCACHE))) { - // we're doing buffered i/o. make sure we're inside the file. - // we can trust size info bc we get accurate info when buffering/caching caps are issued. - dout(-10) << "file size: " << in->inode.size << endl; - if (offset > 0 && offset >= in->inode.size) { - client_lock.Unlock(); - return 0; - } - if (offset + size > (off_t)in->inode.size) - size = (off_t)in->inode.size - offset; - - if (size == 0) { - dout(-10) << "read is size=0, returning 0" << endl; - client_lock.Unlock(); - return 0; - } - } else { - // unbuffered, synchronous file i/o. - // or lazy. - // defer to OSDs for file bounds. - } - - bufferlist blist; // data will go here - int r = 0; - int rvalue = 0; - - if (g_conf.client_oc) { - // object cache ON - rvalue = r = in->fc.read(offset, size, blist, client_lock); // may block. - } else { - // object cache OFF -- legacy inconsistent way. - - // do we have read file cap? - while (!lazy && (in->file_caps() & CAP_FILE_RD) == 0) { - dout(7) << " don't have read cap, waiting" << endl; - Cond cond; - in->waitfor_read.push_back(&cond); - cond.Wait(client_lock); - } - // lazy cap? - while (lazy && (in->file_caps() & CAP_FILE_LAZYIO) == 0) { - dout(7) << " don't have lazy cap, waiting" << endl; - Cond cond; - in->waitfor_lazy.push_back(&cond); - cond.Wait(client_lock); - } - - // do sync read - Cond cond; - bool done = false; - C_Cond *onfinish = new C_Cond(&cond, &done, &rvalue); - - r = filer->read(in->inode, offset, size, &blist, onfinish); - - assert(r >= 0); - - // wait! 
- while (!done) - cond.Wait(client_lock); - } - - if (movepos) { - // adjust fd pos - f->pos = offset+blist.length(); - } - - // copy data into caller's char* buf - blist.copy(0, blist.length(), buf); - - //dout(10) << "i read '" << blist.c_str() << "'" << endl; - dout(10) << "read rvalue " << rvalue << ", r " << r << endl; - - // done! - client_lock.Unlock(); - return rvalue; -} - - - -/* - * hack -- - * until we properly implement synchronous writes wrt buffer cache, - * make sure we delay shutdown until they're all safe on disk! - */ -class C_Client_HackUnsafe : public Context { - Client *cl; -public: - C_Client_HackUnsafe(Client *c) : cl(c) {} - void finish(int) { - cl->hack_sync_write_safe(); - } -}; - -void Client::hack_sync_write_safe() -{ - client_lock.Lock(); - assert(unsafe_sync_write > 0); - unsafe_sync_write--; - if (unsafe_sync_write == 0 && unmounting) { - dout(10) << "hack_sync_write_safe -- no more unsafe writes, unmount can proceed" << endl; - mount_cond.Signal(); - } - client_lock.Unlock(); -} - -int Client::write(fh_t fh, const char *buf, off_t size, off_t offset) -{ - client_lock.Lock(); - - //dout(7) << "write fh " << fh << " size " << size << " offset " << offset << endl; - dout(3) << "op: client->write(" << fh << ", buf, " << size << ", " << offset << ");" << endl; - tout << "write" << endl; - tout << fh << endl; - tout << size << endl; - tout << offset << endl; - - assert(fh_map.count(fh)); - Fh *f = fh_map[fh]; - Inode *in = f->inode; - - if (offset < 0) { - offset = f->pos; - // adjust fd pos - f->pos = offset+size; - } - - bool lazy = f->mode == FILE_MODE_LAZY; - - dout(10) << "cur file size is " << in->inode.size << " wr size " << in->file_wr_size << endl; - - // time it. - utime_t start = g_clock.now(); - - // copy into fresh buffer (since our write may be resub, async) - bufferptr bp = buffer::copy(buf, size); - bufferlist blist; - blist.push_back( bp ); - - if (g_conf.client_oc) { // buffer cache ON? 
- assert(objectcacher); - - // write (this may block!) - in->fc.write(offset, size, blist, client_lock); - - } else { - // legacy, inconsistent synchronous write. - dout(7) << "synchronous write" << endl; - - // do we have write file cap? - while (!lazy && (in->file_caps() & CAP_FILE_WR) == 0) { - dout(7) << " don't have write cap, waiting" << endl; - Cond cond; - in->waitfor_write.push_back(&cond); - cond.Wait(client_lock); - } - while (lazy && (in->file_caps() & CAP_FILE_LAZYIO) == 0) { - dout(7) << " don't have lazy cap, waiting" << endl; - Cond cond; - in->waitfor_lazy.push_back(&cond); - cond.Wait(client_lock); - } - - // prepare write - Cond cond; - bool done = false; - C_Cond *onfinish = new C_Cond(&cond, &done); - C_Client_HackUnsafe *onsafe = new C_Client_HackUnsafe(this); - unsafe_sync_write++; - in->sync_writes++; - - dout(20) << " sync write start " << onfinish << endl; - - filer->write(in->inode, offset, size, blist, 0, - onfinish, onsafe - //, 1+((int)g_clock.now()) / 10 //f->pos // hack hack test osd revision snapshots - ); - - while (!done) { - cond.Wait(client_lock); - dout(20) << " sync write bump " << onfinish << endl; - } - - in->sync_writes--; - if (in->sync_writes == 0 && - !in->waitfor_no_write.empty()) { - for (list::iterator i = in->waitfor_no_write.begin(); - i != in->waitfor_no_write.end(); - i++) - (*i)->finish(0); - in->waitfor_no_write.clear(); - } - - dout(20) << " sync write done " << onfinish << endl; - } - - // time - utime_t lat = g_clock.now(); - lat -= start; - if (client_logger) { - client_logger->finc("wrlsum",(double)lat); - client_logger->inc("wrlnum"); - } - - // assume success for now. FIXME. - off_t totalwritten = size; - - // extend file? 
- if (totalwritten + offset > in->inode.size) { - in->inode.size = in->file_wr_size = totalwritten + offset; - dout(7) << "wrote to " << totalwritten+offset << ", extending file size" << endl; - } else { - dout(7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->inode.size << endl; - } - - // mtime - in->file_wr_mtime = in->inode.mtime = g_clock.gettime(); - - // ok! - client_lock.Unlock(); - return totalwritten; -} - - -int Client::truncate(const char *file, off_t size) -{ - client_lock.Lock(); - dout(3) << "op: client->truncate(\"" << file << "\", " << size << ");" << endl; - tout << "truncate" << endl; - tout << file << endl; - tout << size << endl; - - - MClientRequest *req = new MClientRequest(MDS_OP_TRUNCATE, whoami); - req->set_path(file); - req->set_sizearg( size ); - - // FIXME where does FUSE maintain user information - req->set_caller_uid(getuid()); - req->set_caller_gid(getgid()); - - MClientReply *reply = make_request(req, true); - int res = reply->get_result(); - insert_trace(reply); - delete reply; - - dout(10) << " truncate result is " << res << endl; - - client_lock.Unlock(); - return res; -} - - -int Client::fsync(fh_t fh, bool syncdataonly) -{ - client_lock.Lock(); - dout(3) << "op: client->fsync(open_files[ " << fh << " ], " << syncdataonly << ");" << endl; - tout << "fsync" << endl; - tout << fh << endl; - tout << syncdataonly << endl; - - int r = 0; - - assert(fh_map.count(fh)); - Fh *f = fh_map[fh]; - Inode *in = f->inode; - - dout(3) << "fsync fh " << fh << " ino " << in->inode.ino << " syncdataonly " << syncdataonly << endl; - - // metadata? - if (!syncdataonly) { - dout(0) << "fsync - not syncing metadata yet.. implement me" << endl; - } - - // data? - Cond cond; - bool done = false; - if (!objectcacher->commit_set(in->ino(), - new C_Cond(&cond, &done))) { - // wait for callback - while (!done) cond.Wait(client_lock); - } - - client_lock.Unlock(); - return r; -} - - -// not written yet, but i want to link! 
- -int Client::chdir(const char *path) -{ - // fake it for now! - string abs; - mkabspath(path, abs); - dout(3) << "chdir " << path << " -> cwd now " << abs << endl; - cwd = abs; - return 0; -} - -int Client::statfs(const char *path, struct statvfs *stbuf) -{ - bzero (stbuf, sizeof (struct statvfs)); - // FIXME - stbuf->f_bsize = 1024; - stbuf->f_frsize = 1024; - stbuf->f_blocks = 1024 * 1024; - stbuf->f_bfree = 1024 * 1024; - stbuf->f_bavail = 1024 * 1024; - stbuf->f_files = 1024 * 1024; - stbuf->f_ffree = 1024 * 1024; - stbuf->f_favail = 1024 * 1024; - stbuf->f_namemax = 1024; - - return 0; -} - - -int Client::lazyio_propogate(int fd, off_t offset, size_t count) -{ - client_lock.Lock(); - dout(3) << "op: client->lazyio_propogate(" << fd - << ", " << offset << ", " << count << ")" << endl; - - assert(fh_map.count(fd)); - Fh *f = fh_map[fd]; - Inode *in = f->inode; - - if (f->mode & FILE_MODE_LAZY) { - // wait for lazy cap - while ((in->file_caps() & CAP_FILE_LAZYIO) == 0) { - dout(7) << " don't have lazy cap, waiting" << endl; - Cond cond; - in->waitfor_lazy.push_back(&cond); - cond.Wait(client_lock); - } - - if (g_conf.client_oc) { - Cond cond; - bool done = false; - in->fc.flush_dirty(new C_SafeCond(&client_lock, &cond, &done)); - - while (!done) - cond.Wait(client_lock); - - } else { - // mmm, nothin to do. - } - } - - client_lock.Unlock(); - return 0; -} - -int Client::lazyio_synchronize(int fd, off_t offset, size_t count) -{ - client_lock.Lock(); - dout(3) << "op: client->lazyio_synchronize(" << fd - << ", " << offset << ", " << count << ")" << endl; - - assert(fh_map.count(fd)); - Fh *f = fh_map[fd]; - Inode *in = f->inode; - - if (f->mode & FILE_MODE_LAZY) { - // wait for lazy cap - while ((in->file_caps() & CAP_FILE_LAZYIO) == 0) { - dout(7) << " don't have lazy cap, waiting" << endl; - Cond cond; - in->waitfor_lazy.push_back(&cond); - cond.Wait(client_lock); - } - - if (g_conf.client_oc) { - in->fc.flush_dirty(0); // flush to invalidate. 
- in->fc.release_clean(); - } else { - // mm, nothin to do. - } - } - - client_lock.Unlock(); - return 0; -} - - -// ========================================= -// layout - - -int Client::describe_layout(int fh, FileLayout *lp) -{ - client_lock.Lock(); - dout(3) << "op: client->describe_layout(" << fh << ");" << endl; - - assert(fh_map.count(fh)); - Fh *f = fh_map[fh]; - Inode *in = f->inode; - - *lp = in->inode.layout; - - client_lock.Unlock(); - return 0; -} - -int Client::get_stripe_unit(int fd) -{ - FileLayout layout; - describe_layout(fd, &layout); - return layout.stripe_size; -} - -int Client::get_stripe_width(int fd) -{ - FileLayout layout; - describe_layout(fd, &layout); - return layout.stripe_size*layout.stripe_count; -} - -int Client::get_stripe_period(int fd) -{ - FileLayout layout; - describe_layout(fd, &layout); - return layout.period(); -} - -int Client::enumerate_layout(int fh, list& result, - off_t length, off_t offset) -{ - client_lock.Lock(); - dout(3) << "op: client->enumerate_layout(" << fh << ", " << length << ", " << offset << ");" << endl; - - assert(fh_map.count(fh)); - Fh *f = fh_map[fh]; - Inode *in = f->inode; - - // map to a list of extents - filer->file_to_extents(in->inode, offset, length, result); - - client_lock.Unlock(); - return 0; -} - - - -void Client::ms_handle_failure(Message *m, const entity_inst_t& inst) -{ - entity_name_t dest = inst.name; - - if (dest.is_mon()) { - // resend to a different monitor. - int mon = monmap->pick_mon(true); - dout(0) << "ms_handle_failure " << dest << " inst " << inst - << ", resending to mon" << mon - << endl; - messenger->send_message(m, monmap->get_inst(mon)); - } - else if (dest.is_osd()) { - objecter->ms_handle_failure(m, dest, inst); - } - else if (dest.is_mds()) { - dout(0) << "ms_handle_failure " << dest << " inst " << inst << endl; - // help! - assert(0); - } - else { - // client? 
- dout(0) << "ms_handle_failure " << dest << " inst " << inst - << ", dropping" << endl; - delete m; - } -} - diff --git a/tags/20070517_before_mds_merge/client/Client.h b/tags/20070517_before_mds_merge/client/Client.h deleted file mode 100644 index 513a840d62670..0000000000000 --- a/tags/20070517_before_mds_merge/client/Client.h +++ /dev/null @@ -1,600 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __CLIENT_H -#define __CLIENT_H - - -#include "mds/MDSMap.h" -#include "osd/OSDMap.h" -#include "mon/MonMap.h" - -#include "msg/Message.h" -#include "msg/Dispatcher.h" -#include "msg/Messenger.h" -#include "msg/SerialMessenger.h" - -#include "messages/MClientRequest.h" -#include "messages/MClientReply.h" - -//#include "msgthread.h" - -#include "include/types.h" -#include "include/lru.h" -#include "include/filepath.h" -#include "include/interval_set.h" - -#include "common/Mutex.h" - -#include "FileCache.h" - -// stl -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#define O_LAZY 01000000 - - -class Filer; -class Objecter; -class ObjectCacher; - -extern class LogType client_logtype; -extern class Logger *client_logger; - - - -// ============================================ -// types for my local metadata cache -/* basic structure: - - - Dentries live in an LRU loop. they get expired based on last access. - see include/lru.h. items can be bumped to "mid" or "top" of list, etc. - - Inode has ref count for each Fh, Dir, or Dentry that points to it. - - when Inode ref goes to 0, it's expired. 
- - when Dir is empty, it's removed (and it's Inode ref--) - -*/ - -typedef int fh_t; - -class Dir; -class Inode; - -class Dentry : public LRUObject { - public: - string name; // sort of lame - //const char *name; - Dir *dir; - Inode *inode; - int ref; // 1 if there's a dir beneath me. - - void get() { assert(ref == 0); ref++; lru_pin(); } - void put() { assert(ref == 1); ref--; lru_unpin(); } - - Dentry() : dir(0), inode(0), ref(0) { } - - /*Dentry() : name(0), dir(0), inode(0), ref(0) { } - Dentry(string& n) : name(0), dir(0), inode(0), ref(0) { - name = new char[n.length()+1]; - strcpy((char*)name, n.c_str()); - } - ~Dentry() { - delete[] name; - }*/ -}; - -class Dir { - public: - Inode *parent_inode; // my inode - //hash_map, eqstr> dentries; - hash_map dentries; - - Dir(Inode* in) { parent_inode = in; } - - bool is_empty() { return dentries.empty(); } -}; - - -class InodeCap { - public: - int caps; - long seq; - InodeCap() : caps(0), seq(0) {} -}; - - -class Inode { - public: - inode_t inode; // the actual inode - time_t valid_until; - - // about the dir (if this is one!) - int dir_auth; - set dir_contacts; - bool dir_hashed, dir_replicated; - - // per-mds caps - map caps; // mds -> InodeCap - map stale_caps; // mds -> cap .. stale - - time_t file_wr_mtime; // [writers] time of last write - off_t file_wr_size; // [writers] largest offset we've written to - int num_open_rd, num_open_wr, num_open_lazy; // num readers, writers - - int ref; // ref count. 1 for each dentry, fh that links to me. - Dir *dir; // if i'm a dir. - Dentry *dn; // if i'm linked to a dentry. 
- string *symlink; // symlink content, if it's a symlink - - // for caching i/o mode - FileCache fc; - - // for sync i/o mode - int sync_reads; // sync reads in progress - int sync_writes; // sync writes in progress - - list waitfor_write; - list waitfor_read; - list waitfor_lazy; - list waitfor_no_read, waitfor_no_write; - - void get() { - ref++; - //cout << "inode.get on " << hex << inode.ino << dec << " now " << ref << endl; - } - void put() { - ref--; assert(ref >= 0); - //cout << "inode.put on " << hex << inode.ino << dec << " now " << ref << endl; - } - - Inode(inode_t _inode, ObjectCacher *_oc) : - inode(_inode), - valid_until(0), - dir_auth(-1), dir_hashed(false), dir_replicated(false), - file_wr_mtime(0), file_wr_size(0), - num_open_rd(0), num_open_wr(0), num_open_lazy(0), - ref(0), dir(0), dn(0), symlink(0), - fc(_oc, _inode), - sync_reads(0), sync_writes(0) - { } - ~Inode() { - if (symlink) { delete symlink; symlink = 0; } - } - - inodeno_t ino() { return inode.ino; } - - bool is_dir() { - return (inode.mode & INODE_TYPE_MASK) == INODE_MODE_DIR; - } - - int file_caps() { - int c = 0; - for (map::iterator it = caps.begin(); - it != caps.end(); - it++) - c |= it->second.caps; - for (map::iterator it = stale_caps.begin(); - it != stale_caps.end(); - it++) - c |= it->second.caps; - return c; - } - - int file_caps_wanted() { - int w = 0; - if (num_open_rd) w |= CAP_FILE_RD|CAP_FILE_RDCACHE; - if (num_open_wr) w |= CAP_FILE_WR|CAP_FILE_WRBUFFER; - if (num_open_lazy) w |= CAP_FILE_LAZYIO; - if (fc.is_dirty()) w |= CAP_FILE_WRBUFFER; - if (fc.is_cached()) w |= CAP_FILE_RDCACHE; - return w; - } - - int authority(MDSMap *mdsmap) { - //cout << "authority on " << inode.ino << " .. dir_auth is " << dir_auth<< endl; - // parent? - if (dn && dn->dir && dn->dir->parent_inode) { - // parent hashed? 
- if (dn->dir->parent_inode->dir_hashed) { - // hashed - assert(0); - // fixme - //return mdcluster->hash_dentry( dn->dir->parent_inode->ino(), - //dn->name ); - } - - if (dir_auth >= 0) - return dir_auth; - else - return dn->dir->parent_inode->authority(mdsmap); - } - - if (dir_auth >= 0) - return dir_auth; - - assert(0); // !!! - return 0; - } - int dentry_authority(const char *dn, - MDSMap *mdsmap) { - assert(0); - return 0; - //return ->hash_dentry( ino(), - //dn ); - } - int pick_replica(MDSMap *mdsmap) { - // replicas? - if (ino() > 1ULL && dir_contacts.size()) { - //cout << "dir_contacts if " << dir_contacts << endl; - set::iterator it = dir_contacts.begin(); - if (dir_contacts.size() == 1) - return *it; - else { - int r = rand() % dir_contacts.size(); - while (r--) it++; - return *it; - } - } - - if (dir_replicated || ino() == 1) { - //cout << "num_mds is " << mdcluster->get_num_mds() << endl; - return rand() % mdsmap->get_num_mds(); // huh.. pick a random mds! - } - else - return authority(mdsmap); - } - - - // open Dir for an inode. if it's not open, allocated it (and pin dentry in memory). 
- Dir *open_dir() { - if (!dir) { - if (dn) dn->get(); // pin dentry - get(); - dir = new Dir(this); - } - return dir; - } - -}; - - - - -// file handle for any open file state - -struct Fh { - Inode *inode; - off_t pos; - int mds; // have to talk to mds we opened with (for now) - int mode; // the mode i opened the file with - - bool is_lazy() { return mode & O_LAZY; } - - Fh() : inode(0), pos(0), mds(0), mode(0) {} -}; - - - - - -// ======================================================== -// client interface - -class Client : public Dispatcher { - public: - - /* getdir result */ - struct DirResult { - string path; - map contents; - map::iterator p; - int off; - int size; - struct dirent_plus dp; - struct dirent_lite dl; - DirResult() : p(contents.end()), off(-1), size(0) {} - }; - - - protected: - Messenger *messenger; - int whoami; - MonMap *monmap; - - // mds fake RPC - tid_t last_tid; - map mds_rpc_cond; - map mds_rpc_reply; - map mds_rpc_dispatch_cond; - - // cluster descriptors - MDSMap *mdsmap; - OSDMap *osdmap; - - bool mounted; - bool unmounting; - Cond mount_cond; - - int unsafe_sync_write; -public: - entity_name_t get_myname() { return messenger->get_myname(); } - void hack_sync_write_safe(); - -protected: - Filer *filer; - ObjectCacher *objectcacher; - Objecter *objecter; // (non-blocking) osd interface - - // cache - hash_map inode_map; - Inode* root; - LRU lru; // lru list of Dentry's in our local metadata cache. - - // cap weirdness - map > cap_reap_queue; // ino -> mds -> msg .. set of (would-be) stale caps to reap - - - // file handles, etc. 
- string cwd; - interval_set free_fh_set; // unused fh's - hash_map fh_map; - - fh_t get_fh() { - fh_t fh = free_fh_set.start(); - free_fh_set.erase(fh, 1); - return fh; - } - void put_fh(fh_t fh) { - free_fh_set.insert(fh, 1); - } - - void mkabspath(const char *rel, string& abs) { - if (rel[0] == '/') { - abs = rel; - } else { - abs = cwd; - abs += "/"; - abs += rel; - } - } - - - // global client lock - // - protects Client and buffer cache both! - Mutex client_lock; - - - // -- metadata cache stuff - - // decrease inode ref. delete if dangling. - void put_inode(Inode *in) { - in->put(); - if (in->ref == 0) { - inode_map.erase(in->inode.ino); - if (in == root) root = 0; - delete in; - } - } - - void close_dir(Dir *dir) { - assert(dir->is_empty()); - - Inode *in = dir->parent_inode; - if (in->dn) in->dn->put(); // unpin dentry - - delete in->dir; - in->dir = 0; - put_inode(in); - } - - int get_cache_size() { return lru.lru_get_size(); } - void set_cache_size(int m) { lru.lru_set_max(m); } - - Dentry* link(Dir *dir, const string& name, Inode *in) { - Dentry *dn = new Dentry; - dn->name = name; - - // link to dir - dn->dir = dir; - dir->dentries[dn->name] = dn; - - // link to inode - dn->inode = in; - in->dn = dn; - in->get(); - - lru.lru_insert_mid(dn); // mid or top? 
- return dn; - } - - void unlink(Dentry *dn) { - Inode *in = dn->inode; - - // unlink from inode - dn->inode = 0; - in->dn = 0; - put_inode(in); - - // unlink from dir - dn->dir->dentries.erase(dn->name); - if (dn->dir->is_empty()) - close_dir(dn->dir); - dn->dir = 0; - - // delete den - lru.lru_remove(dn); - delete dn; - } - - Dentry *relink(Dentry *dn, Dir *dir, const string& name) { - // first link new dn to dir - /* - char *oldname = (char*)dn->name; - dn->name = new char[name.length()+1]; - strcpy((char*)dn->name, name.c_str()); - dir->dentries[dn->name] = dn; - */ - dir->dentries[name] = dn; - - // unlink from old dir - dn->dir->dentries.erase(dn->name); - //delete[] oldname; - if (dn->dir->is_empty()) - close_dir(dn->dir); - - // fix up dn - dn->name = name; - dn->dir = dir; - - return dn; - } - - // move dentry to top of lru - void touch_dn(Dentry *dn) { lru.lru_touch(dn); } - - // trim cache. - void trim_cache(); - void dump_inode(Inode *in, set& did); - void dump_cache(); // debug - - // find dentry based on filepath - Dentry *lookup(filepath& path); - - // make blocking mds request - MClientReply *make_request(MClientRequest *req, bool auth_best=false, int use_auth=-1); - MClientReply* sendrecv(MClientRequest *req, int mds); - void handle_client_reply(MClientReply *reply); - - void fill_stat(inode_t& inode, struct stat *st); - void fill_statlite(inode_t& inode, struct statlite *st); - - - // friends - friend class SyntheticClient; - - public: - Client(Messenger *m, MonMap *mm); - ~Client(); - void tear_down_cache(); - - int get_nodeid() { return whoami; } - - void init(); - void shutdown(); - - // messaging - void dispatch(Message *m); - - void handle_mount_ack(class MClientMountAck*); - void handle_unmount_ack(Message*); - void handle_mds_map(class MMDSMap *m); - - // file caps - void handle_file_caps(class MClientFileCaps *m); - void implemented_caps(class MClientFileCaps *m, Inode *in); - void release_caps(Inode *in, int retain=0); - void 
update_caps_wanted(Inode *in); - - void close_release(Inode *in); - void close_safe(Inode *in); - - // metadata cache - Inode* insert_inode(Dir *dir, InodeStat *in_info, const string& dn); - void update_inode_dist(Inode *in, InodeStat *st); - Inode* insert_trace(MClientReply *reply); - - // ---------------------- - // fs ops. - int mount(); - int unmount(); - - // these shoud (more or less) mirror the actual system calls. - int statfs(const char *path, struct statvfs *stbuf); - - // crap - int chdir(const char *s); - const string getcwd() { return cwd; } - - // namespace ops - int getdir(const char *path, list& contents); - int getdir(const char *path, map& contents); - - DIR *opendir(const char *name); - int closedir(DIR *dir); - struct dirent *readdir(DIR *dir); - void rewinddir(DIR *dir); - off_t telldir(DIR *dir); - void seekdir(DIR *dir, off_t offset); - - struct dirent_plus *readdirplus(DIR *dirp); - int readdirplus_r(DIR *dirp, struct dirent_plus *entry, struct dirent_plus **result); - struct dirent_lite *readdirlite(DIR *dirp); - int readdirlite_r(DIR *dirp, struct dirent_lite *entry, struct dirent_lite **result); - - - int link(const char *existing, const char *newname); - int unlink(const char *path); - int rename(const char *from, const char *to); - - // dirs - int mkdir(const char *path, mode_t mode); - int rmdir(const char *path); - - // symlinks - int readlink(const char *path, char *buf, off_t size); - int symlink(const char *existing, const char *newname); - - // inode stuff - int _lstat(const char *path, int mask, Inode **in); - int lstat(const char *path, struct stat *stbuf); - int lstatlite(const char *path, struct statlite *buf); - - int chmod(const char *path, mode_t mode); - int chown(const char *path, uid_t uid, gid_t gid); - int utime(const char *path, struct utimbuf *buf); - - // file ops - int mknod(const char *path, mode_t mode); - int open(const char *path, int mode); - int close(fh_t fh); - off_t lseek(fh_t fh, off_t offset, int 
whence); - int read(fh_t fh, char *buf, off_t size, off_t offset=-1); - int write(fh_t fh, const char *buf, off_t size, off_t offset=-1); - int truncate(const char *file, off_t size); - //int truncate(fh_t fh, long long size); - int fsync(fh_t fh, bool syncdataonly); - - - // hpc lazyio - int lazyio_propogate(int fd, off_t offset, size_t count); - int lazyio_synchronize(int fd, off_t offset, size_t count); - - // expose file layout - int describe_layout(int fd, FileLayout* layout); - int get_stripe_unit(int fd); - int get_stripe_width(int fd); - int get_stripe_period(int fd); - int enumerate_layout(int fd, list& result, - off_t length, off_t offset); - - // failure - void ms_handle_failure(Message*, const entity_inst_t& inst); -}; - -#endif diff --git a/tags/20070517_before_mds_merge/client/FileCache.cc b/tags/20070517_before_mds_merge/client/FileCache.cc deleted file mode 100644 index 6645bef09b6df..0000000000000 --- a/tags/20070517_before_mds_merge/client/FileCache.cc +++ /dev/null @@ -1,263 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "config.h" -#include "include/types.h" - -#include "FileCache.h" -#include "osdc/ObjectCacher.h" - -#include "msg/Messenger.h" - -#undef dout -#define dout(x) if (x <= g_conf.debug_client) cout << g_clock.now() << " " << oc->objecter->messenger->get_myname() << ".filecache " -#define derr(x) if (x <= g_conf.debug_client) cout << g_clock.now() << " " << oc->objecter->messenger->get_myname() << ".filecache " - - -// flush/release/clean - -void FileCache::flush_dirty(Context *onflush) -{ - if (oc->flush_set(inode.ino, onflush)) { - onflush->finish(0); - delete onflush; - } -} - -off_t FileCache::release_clean() -{ - return oc->release_set(inode.ino); -} - -bool FileCache::is_cached() -{ - return oc->set_is_cached(inode.ino); -} - -bool FileCache::is_dirty() -{ - return oc->set_is_dirty_or_committing(inode.ino); -} - -void FileCache::empty(Context *onempty) -{ - off_t unclean = release_clean(); - bool clean = oc->flush_set(inode.ino, onempty); - assert(!unclean == clean); - - if (clean) { - onempty->finish(0); - delete onempty; - } -} - - -void FileCache::tear_down() -{ - off_t unclean = release_clean(); - if (unclean) { - dout(0) << "tear_down " << unclean << " unclean bytes, purging" << endl; - oc->purge_set(inode.ino); - } -} - -// truncate - -void FileCache::truncate(off_t olds, off_t news) -{ - dout(5) << "truncate " << olds << " -> " << news << endl; - - // map range to objects - list ls; - oc->filer.file_to_extents(inode, news, olds-news, ls); - oc->truncate_set(inode.ino, ls); -} - -// caps - -class C_FC_CheckCaps : public Context { - FileCache *fc; -public: - C_FC_CheckCaps(FileCache *f) : fc(f) {} - void finish(int r) { - fc->check_caps(); - } -}; - -void FileCache::set_caps(int caps, Context *onimplement) -{ - if (onimplement) { - dout(10) << "set_caps setting onimplement context for " << cap_string(caps) << endl; - assert(latest_caps & ~caps); // we should be losing caps. 
- caps_callbacks[caps].push_back(onimplement); - } - - latest_caps = caps; - check_caps(); - - // kick waiters? (did we gain caps?) - if (can_read() && !waitfor_read.empty()) - for (set::iterator p = waitfor_read.begin(); - p != waitfor_read.end(); - ++p) - (*p)->Signal(); - if (can_write() && !waitfor_write.empty()) - for (set::iterator p = waitfor_write.begin(); - p != waitfor_write.end(); - ++p) - (*p)->Signal(); - -} - -int FileCache::get_used_caps() -{ - int used = 0; - if (num_reading) used |= CAP_FILE_RD; - if (oc->set_is_cached(inode.ino)) used |= CAP_FILE_RDCACHE; - if (num_writing) used |= CAP_FILE_WR; - if (oc->set_is_dirty_or_committing(inode.ino)) used |= CAP_FILE_WRBUFFER; - return used; -} - -void FileCache::check_caps() -{ - // calc used - int used = get_used_caps(); - dout(10) << "check_caps used was " << cap_string(used) << endl; - - // try to implement caps? - // BUG? latest_caps, not least caps i've seen? - if ((latest_caps & CAP_FILE_RDCACHE) == 0 && - (used & CAP_FILE_RDCACHE)) - release_clean(); - if ((latest_caps & CAP_FILE_WRBUFFER) == 0 && - (used & CAP_FILE_WRBUFFER)) - flush_dirty(new C_FC_CheckCaps(this)); - - used = get_used_caps(); - dout(10) << "check_caps used now " << cap_string(used) << endl; - - // check callbacks - map >::iterator p = caps_callbacks.begin(); - while (p != caps_callbacks.end()) { - if (used == 0 || (~(p->first) & used) == 0) { - // implemented. - dout(10) << "used is " << cap_string(used) - << ", caps " << cap_string(p->first) << " implemented, doing callback(s)" << endl; - finish_contexts(p->second); - map >::iterator o = p; - p++; - caps_callbacks.erase(o); - } else { - dout(10) << "used is " << cap_string(used) - << ", caps " << cap_string(p->first) << " not yet implemented" << endl; - p++; - } - } -} - - - -// read/write - -int FileCache::read(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock) -{ - int r = 0; - - // can i read? 
- while ((latest_caps & CAP_FILE_RD) == 0) { - dout(10) << "read doesn't have RD cap, blocking" << endl; - Cond c; - waitfor_read.insert(&c); - c.Wait(client_lock); - waitfor_read.erase(&c); - } - - // inc reading counter - num_reading++; - - if (latest_caps & CAP_FILE_RDCACHE) { - // read (and block) - Cond cond; - bool done = false; - int rvalue = 0; - C_Cond *onfinish = new C_Cond(&cond, &done, &rvalue); - - r = oc->file_read(inode, offset, size, &blist, onfinish); - - if (r == 0) { - // block - while (!done) - cond.Wait(client_lock); - r = rvalue; - } else { - // it was cached. - delete onfinish; - } - } else { - r = oc->file_atomic_sync_read(inode, offset, size, &blist, client_lock); - } - - // dec reading counter - num_reading--; - - if (num_reading == 0 && !caps_callbacks.empty()) - check_caps(); - - return r; -} - -void FileCache::write(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock) -{ - // can i write - while ((latest_caps & CAP_FILE_WR) == 0) { - dout(10) << "write doesn't have WR cap, blocking" << endl; - Cond c; - waitfor_write.insert(&c); - c.Wait(client_lock); - waitfor_write.erase(&c); - } - - // inc writing counter - num_writing++; - - if (latest_caps & CAP_FILE_WRBUFFER) { // caps buffered write? - // wait? (this may block!) - oc->wait_for_write(size, client_lock); - - // async, caching, non-blocking. - oc->file_write(inode, offset, size, blist); - } else { - // atomic, synchronous, blocking. 
- oc->file_atomic_sync_write(inode, offset, size, blist, client_lock); - } - - // dec writing counter - num_writing--; - if (num_writing == 0 && !caps_callbacks.empty()) - check_caps(); -} - -bool FileCache::all_safe() -{ - return !oc->set_is_dirty_or_committing(inode.ino); -} - -void FileCache::add_safe_waiter(Context *c) -{ - bool safe = oc->commit_set(inode.ino, c); - if (safe) { - c->finish(0); - delete c; - } -} diff --git a/tags/20070517_before_mds_merge/client/FileCache.h b/tags/20070517_before_mds_merge/client/FileCache.h deleted file mode 100644 index 9ba82f92eb1ab..0000000000000 --- a/tags/20070517_before_mds_merge/client/FileCache.h +++ /dev/null @@ -1,84 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __FILECACHE_H -#define __FILECACHE_H - -#include -using namespace std; - -#include "common/Cond.h" -#include "mds/Capability.h" - -class ObjectCacher; - -class FileCache { - ObjectCacher *oc; - inode_t inode; - - // caps - int latest_caps; - map > caps_callbacks; - - int num_reading; - int num_writing; - //int num_unsafe; - - // waiters - set waitfor_read; - set waitfor_write; - - bool waitfor_release; - - public: - FileCache(ObjectCacher *_oc, inode_t _inode) : - oc(_oc), - inode(_inode), - latest_caps(0), - num_reading(0), num_writing(0),// num_unsafe(0), - waitfor_release(false) {} - ~FileCache() { - tear_down(); - } - - // waiters/waiting - bool can_read() { return latest_caps & CAP_FILE_RD; } - bool can_write() { return latest_caps & CAP_FILE_WR; } - bool all_safe();// { return num_unsafe == 0; } - - void add_safe_waiter(Context *c); - - void truncate(off_t olds, off_t news); - - // ... 
- void flush_dirty(Context *onflush=0); - off_t release_clean(); - void empty(Context *onempty=0); - bool is_empty() { return !(is_cached() || is_dirty()); } - bool is_cached(); - bool is_dirty(); - - void tear_down(); - - int get_caps() { return latest_caps; } - int get_used_caps(); - void set_caps(int caps, Context *onimplement=0); - void check_caps(); - - int read(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock); // may block. - void write(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock); // may block. - -}; - - -#endif diff --git a/tags/20070517_before_mds_merge/client/SyntheticClient.cc b/tags/20070517_before_mds_merge/client/SyntheticClient.cc deleted file mode 100644 index 66c1c93dab996..0000000000000 --- a/tags/20070517_before_mds_merge/client/SyntheticClient.cc +++ /dev/null @@ -1,1325 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include -using namespace std; - - - -#include "SyntheticClient.h" - -#include "include/filepath.h" -#include "mds/MDS.h" - -#include -#include -#include -#include -#include -#include - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_client) cout << g_clock.now() << " synthetic" << client->get_nodeid() << " " - -// traces -//void trace_include(SyntheticClient *syn, Client *cl, string& prefix); -//void trace_openssh(SyntheticClient *syn, Client *cl, string& prefix); - - -list syn_modes; -list syn_iargs; -list syn_sargs; - -void parse_syn_options(vector& args) -{ - vector nargs; - - for (unsigned i=0; iclient = client; - thread_id = 0; - - did_readdir = false; - - run_only = -1; - - this->modes = syn_modes; - this->iargs = syn_iargs; - this->sargs = syn_sargs; - - run_start = g_clock.now(); -} - - - - -#define DBL 2 - -void *synthetic_client_thread_entry(void *ptr) -{ - SyntheticClient *sc = (SyntheticClient*)ptr; - //int r = - sc->run(); - return 0;//(void*)r; -} - -string SyntheticClient::get_sarg(int seq) -{ - string a; - if (!sargs.empty()) { - a = sargs.front(); - sargs.pop_front(); - } - if (a.length() == 0 || a == "~") { - char s[20]; - sprintf(s,"syn.%d.%d", client->whoami, seq); - a = s; - } - //cout << "a is " << a << endl; - return a; -} - -int SyntheticClient::run() -{ - //run_start = g_clock.now(); - run_until = utime_t(0,0); - dout(5) << "run" << endl; - - for (list::iterator it = modes.begin(); - it != modes.end(); - it++) { - int mode = *it; - dout(3) << "mode " << mode << endl; - - switch (mode) { - case SYNCLIENT_MODE_RANDOMSLEEP: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (run_me()) { - srand(time(0) + getpid() + client->whoami); - sleep(rand() % iarg1); - } - } - break; - - case SYNCLIENT_MODE_SLEEP: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (run_me()) { - dout(2) << "sleep " << iarg1 << endl; - sleep(iarg1); - } - } - break; - - case 
SYNCLIENT_MODE_ONLY: - { - run_only = iargs.front(); - iargs.pop_front(); - if (run_only == client->get_nodeid()) - dout(2) << "only " << run_only << endl; - } - break; - - case SYNCLIENT_MODE_UNTIL: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (iarg1) { - dout(2) << "until " << iarg1 << endl; - utime_t dur(iarg1,0); - run_until = run_start + dur; - } else { - dout(2) << "until " << iarg1 << " (no limit)" << endl; - run_until = utime_t(0,0); - } - } - break; - - case SYNCLIENT_MODE_SLEEPUNTIL: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (iarg1) { - dout(2) << "sleepuntil " << iarg1 << endl; - utime_t at = g_clock.now() - run_start; - if (at.sec() < iarg1) - sleep(iarg1 - at.sec()); - } - } - break; - - case SYNCLIENT_MODE_RANDOMWALK: - { - int iarg1 = iargs.front(); - iargs.pop_front(); - if (run_me()) { - dout(2) << "randomwalk " << iarg1 << endl; - random_walk(iarg1); - } - } - break; - - case SYNCLIENT_MODE_MAKEDIRMESS: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "makedirmess " << sarg1 << " " << iarg1 << endl; - make_dir_mess(sarg1.c_str(), iarg1); - } - } - break; - case SYNCLIENT_MODE_MAKEDIRS: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - int iarg3 = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "makedirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << endl; - make_dirs(sarg1.c_str(), iarg1, iarg2, iarg3); - } - } - break; - case SYNCLIENT_MODE_STATDIRS: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - int iarg3 = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "statdirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << endl; - stat_dirs(sarg1.c_str(), iarg1, iarg2, iarg3); - } - } - break; - case SYNCLIENT_MODE_READDIRS: - { - string 
sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - int iarg3 = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "readdirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << endl; - read_dirs(sarg1.c_str(), iarg1, iarg2, iarg3); - } - } - break; - - - case SYNCLIENT_MODE_MAKEFILES: - { - int num = iargs.front(); iargs.pop_front(); - int count = iargs.front(); iargs.pop_front(); - int priv = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "makefiles " << num << " " << count << " " << priv << endl; - make_files(num, count, priv, false); - } - } - break; - case SYNCLIENT_MODE_MAKEFILES2: - { - int num = iargs.front(); iargs.pop_front(); - int count = iargs.front(); iargs.pop_front(); - int priv = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "makefiles2 " << num << " " << count << " " << priv << endl; - make_files(num, count, priv, true); - } - } - break; - case SYNCLIENT_MODE_CREATESHARED: - { - string sarg1 = get_sarg(0); - int num = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "createshared " << num << endl; - create_shared(num); - } - } - break; - case SYNCLIENT_MODE_OPENSHARED: - { - string sarg1 = get_sarg(0); - int num = iargs.front(); iargs.pop_front(); - int count = iargs.front(); iargs.pop_front(); - if (run_me()) { - dout(2) << "openshared " << num << endl; - open_shared(num, count); - } - } - break; - - case SYNCLIENT_MODE_FULLWALK: - { - string sarg1;// = get_sarg(0); - if (run_me()) { - dout(2) << "fullwalk" << sarg1 << endl; - full_walk(sarg1); - } - } - break; - case SYNCLIENT_MODE_REPEATWALK: - { - string sarg1 = get_sarg(0); - if (run_me()) { - dout(2) << "repeatwalk " << sarg1 << endl; - while (full_walk(sarg1) == 0) ; - } - } - break; - - case SYNCLIENT_MODE_WRITEFILE: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - if 
(run_me()) - write_file(sarg1, iarg1, iarg2); - } - break; - case SYNCLIENT_MODE_WRSHARED: - { - string sarg1 = "shared"; - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - if (run_me()) - write_file(sarg1, iarg1, iarg2); - } - break; - case SYNCLIENT_MODE_WRITEBATCH: - { - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - int iarg3 = iargs.front(); iargs.pop_front(); - - if (run_me()) - write_batch(iarg1, iarg2, iarg3); - } - break; - - case SYNCLIENT_MODE_READFILE: - { - string sarg1 = get_sarg(0); - int iarg1 = iargs.front(); iargs.pop_front(); - int iarg2 = iargs.front(); iargs.pop_front(); - if (run_me()) - read_file(sarg1, iarg1, iarg2); - } - break; - - case SYNCLIENT_MODE_TRACE: - { - string tfile = get_sarg(0); - sargs.push_front(string("~")); - int iarg1 = iargs.front(); iargs.pop_front(); - string prefix = get_sarg(0); - - if (run_me()) { - dout(2) << "trace " << tfile << " prefix " << prefix << " ... " << iarg1 << " times" << endl; - - Trace t(tfile.c_str()); - - client->mkdir(prefix.c_str(), 0755); - - for (int i=0; i 0 - && i < iarg1-1 - ) { - client_logger->finc("trsum", (double)lat); - client_logger->inc("trnum"); - } - } - } - } - break; - - - case SYNCLIENT_MODE_OPENTEST: - { - int count = iargs.front(); iargs.pop_front(); - if (run_me()) { - for (int i=0; iopen("test", rand()%2 ? 
(O_WRONLY|O_CREAT):O_RDONLY); - if (fd > 0) client->close(fd); - } - } - } - break; - - case SYNCLIENT_MODE_OPTEST: - { - int count = iargs.front(); iargs.pop_front(); - if (run_me()) { - client->mknod("test",0777); - struct stat st; - for (int i=0; ilstat("test", &st); - client->chmod("test", 0777); - } - } - } - break; - - case SYNCLIENT_MODE_TRUNCATE: - { - string file = get_sarg(0); - sargs.push_front(file); - int iarg1 = iargs.front(); iargs.pop_front(); - if (run_me()) - client->truncate(file.c_str(), iarg1); - } - break; - - default: - assert(0); - } - } - return 0; -} - - -int SyntheticClient::start_thread() -{ - assert(!thread_id); - - pthread_create(&thread_id, NULL, synthetic_client_thread_entry, this); - assert(thread_id); - return 0; -} - -int SyntheticClient::join_thread() -{ - assert(thread_id); - void *rv; - pthread_join(thread_id, &rv); - return 0; -} - - -bool roll_die(float p) -{ - float r = (float)(rand() % 100000) / 100000.0; - if (r < p) - return true; - else - return false; -} - -void SyntheticClient::init_op_dist() -{ - op_dist.clear(); - op_dist.add( MDS_OP_STAT, g_conf.fakeclient_op_stat ); - op_dist.add( MDS_OP_UTIME, g_conf.fakeclient_op_utime ); - op_dist.add( MDS_OP_CHMOD, g_conf.fakeclient_op_chmod ); - op_dist.add( MDS_OP_CHOWN, g_conf.fakeclient_op_chown ); - - op_dist.add( MDS_OP_READDIR, g_conf.fakeclient_op_readdir ); - op_dist.add( MDS_OP_MKNOD, g_conf.fakeclient_op_mknod ); - op_dist.add( MDS_OP_LINK, g_conf.fakeclient_op_link ); - op_dist.add( MDS_OP_UNLINK, g_conf.fakeclient_op_unlink ); - op_dist.add( MDS_OP_RENAME, g_conf.fakeclient_op_rename ); - - op_dist.add( MDS_OP_MKDIR, g_conf.fakeclient_op_mkdir ); - op_dist.add( MDS_OP_RMDIR, g_conf.fakeclient_op_rmdir ); - op_dist.add( MDS_OP_SYMLINK, g_conf.fakeclient_op_symlink ); - - op_dist.add( MDS_OP_OPEN, g_conf.fakeclient_op_openrd ); - //op_dist.add( MDS_OP_READ, g_conf.fakeclient_op_read ); - //op_dist.add( MDS_OP_WRITE, g_conf.fakeclient_op_write ); - op_dist.add( 
MDS_OP_TRUNCATE, g_conf.fakeclient_op_truncate ); - op_dist.add( MDS_OP_FSYNC, g_conf.fakeclient_op_fsync ); - op_dist.add( MDS_OP_RELEASE, g_conf.fakeclient_op_close ); // actually, close() - op_dist.normalize(); -} - -void SyntheticClient::up() -{ - cwd = cwd.prefixpath(cwd.depth()-1); - dout(DBL) << "cd .. -> " << cwd << endl; - clear_dir(); -} - - -int SyntheticClient::play_trace(Trace& t, string& prefix) -{ - dout(4) << "play trace" << endl; - t.start(); - - utime_t start = g_clock.now(); - - const char *p = prefix.c_str(); - - map<__int64_t, __int64_t> open_files; - - while (!t.end()) { - - if (time_to_stop()) break; - - // op - const char *op = t.get_string(); - dout(4) << "trace op " << op << endl; - if (strcmp(op, "link") == 0) { - const char *a = t.get_string(p); - const char *b = t.get_string(p); - client->link(a,b); - } else if (strcmp(op, "unlink") == 0) { - const char *a = t.get_string(p); - client->unlink(a); - } else if (strcmp(op, "rename") == 0) { - const char *a = t.get_string(p); - const char *b = t.get_string(p); - client->rename(a,b); - } else if (strcmp(op, "mkdir") == 0) { - const char *a = t.get_string(p); - __int64_t b = t.get_int(); - client->mkdir(a, b); - } else if (strcmp(op, "rmdir") == 0) { - const char *a = t.get_string(p); - client->rmdir(a); - } else if (strcmp(op, "symlink") == 0) { - const char *a = t.get_string(p); - const char *b = t.get_string(p); - client->symlink(a,b); - } else if (strcmp(op, "readlink") == 0) { - const char *a = t.get_string(p); - char buf[100]; - client->readlink(a, buf, 100); - } else if (strcmp(op, "lstat") == 0) { - struct stat st; - const char *a = t.get_string(p); - client->lstat(a, &st); - } else if (strcmp(op, "chmod") == 0) { - const char *a = t.get_string(p); - __int64_t b = t.get_int(); - client->chmod(a, b); - } else if (strcmp(op, "chown") == 0) { - const char *a = t.get_string(p); - __int64_t b = t.get_int(); - __int64_t c = t.get_int(); - client->chown(a, b, c); - } else if (strcmp(op, 
"utime") == 0) { - const char *a = t.get_string(p); - __int64_t b = t.get_int(); - __int64_t c = t.get_int(); - struct utimbuf u; - u.actime = b; - u.modtime = c; - client->utime(a, &u); - } else if (strcmp(op, "mknod") == 0) { - const char *a = t.get_string(p); - __int64_t b = t.get_int(); - client->mknod(a, b); - } else if (strcmp(op, "getdir") == 0) { - const char *a = t.get_string(p); - map contents; - client->getdir(a, contents); - } else if (strcmp(op, "open") == 0) { - const char *a = t.get_string(p); - __int64_t b = t.get_int(); - __int64_t id = t.get_int(); - __int64_t fh = client->open(a, b); - open_files[id] = fh; - } else if (strcmp(op, "close") == 0) { - __int64_t id = t.get_int(); - __int64_t fh = open_files[id]; - if (fh > 0) client->close(fh); - open_files.erase(id); - } else if (strcmp(op, "truncate") == 0) { - const char *a = t.get_string(p); - __int64_t b = t.get_int(); - client->truncate(a,b); - } else if (strcmp(op, "read") == 0) { - __int64_t id = t.get_int(); - __int64_t fh = open_files[id]; - int size = t.get_int(); - int off = t.get_int(); - char *buf = new char[size]; - client->read(fh, buf, size, off); - delete[] buf; - } else if (strcmp(op, "lseek") == 0) { - __int64_t id = t.get_int(); - __int64_t fh = open_files[id]; - int off = t.get_int(); - int whence = t.get_int(); - client->lseek(fh, off, whence); - } else if (strcmp(op, "write") == 0) { - __int64_t id = t.get_int(); - __int64_t fh = open_files[id]; - int size = t.get_int(); - int off = t.get_int(); - char *buf = new char[size]; - memset(buf, 1, size); // let's write 1's! 
- client->write(fh, buf, size, off); - delete[] buf; - } else if (strcmp(op, "fsync") == 0) { - assert(0); - } else - assert(0); - } - - // close open files - for (map<__int64_t, __int64_t>::iterator fi = open_files.begin(); - fi != open_files.end(); - fi++) { - dout(1) << "leftover close " << fi->second << endl; - if (fi->second > 0) client->close(fi->second); - } - - return 0; -} - - -int SyntheticClient::clean_dir(string& basedir) -{ - // read dir - map contents; - int r = client->getdir(basedir.c_str(), contents); - if (r < 0) { - dout(1) << "readdir on " << basedir << " returns " << r << endl; - return r; - } - - for (map::iterator it = contents.begin(); - it != contents.end(); - it++) { - if (it->first == ".") continue; - if (it->first == "..") continue; - string file = basedir + "/" + it->first; - - if (time_to_stop()) break; - - struct stat st; - int r = client->lstat(file.c_str(), &st); - if (r < 0) { - dout(1) << "stat error on " << file << " r=" << r << endl; - continue; - } - - if ((st.st_mode & INODE_TYPE_MASK) == INODE_MODE_DIR) { - clean_dir(file); - client->rmdir(file.c_str()); - } else { - client->unlink(file.c_str()); - } - } - - return 0; - -} - - -int SyntheticClient::full_walk(string& basedir) -{ - if (time_to_stop()) return -1; - - list dirq; - dirq.push_back(basedir); - - while (!dirq.empty()) { - string dir = dirq.front(); - dirq.pop_front(); - - // read dir - map contents; - int r = client->getdir(dir.c_str(), contents); - if (r < 0) { - dout(1) << "readdir on " << dir << " returns " << r << endl; - continue; - } - - for (map::iterator it = contents.begin(); - it != contents.end(); - it++) { - if (it->first == ".") continue; - if (it->first == "..") continue; - string file = dir + "/" + it->first; - - struct stat st; - int r = client->lstat(file.c_str(), &st); - if (r < 0) { - dout(1) << "stat error on " << file << " r=" << r << endl; - continue; - } - - // print - char *tm = ctime(&st.st_mtime); - tm[strlen(tm)-1] = 0; - 
printf("%c%c%c%c%c%c%c%c%c%c %2d %5d %5d %8d %12s %s\n", - S_ISDIR(st.st_mode) ? 'd':'-', - (st.st_mode & 0400) ? 'r':'-', - (st.st_mode & 0200) ? 'w':'-', - (st.st_mode & 0100) ? 'x':'-', - (st.st_mode & 040) ? 'r':'-', - (st.st_mode & 020) ? 'w':'-', - (st.st_mode & 010) ? 'x':'-', - (st.st_mode & 04) ? 'r':'-', - (st.st_mode & 02) ? 'w':'-', - (st.st_mode & 01) ? 'x':'-', - (int)st.st_nlink, - st.st_uid, st.st_gid, - (int)st.st_size, - tm, - file.c_str()); - - - if ((st.st_mode & INODE_TYPE_MASK) == INODE_MODE_DIR) { - dirq.push_back(file); - } - } - } - - return 0; -} - -int SyntheticClient::make_dirs(const char *basedir, int dirs, int files, int depth) -{ - if (time_to_stop()) return 0; - - // make sure base dir exists - int r = client->mkdir(basedir, 0755); - if (r != 0) { - dout(1) << "can't make base dir? " << basedir << endl; - return -1; - } - - // children - char d[500]; - dout(3) << "make_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << endl; - for (int i=0; imknod(d, 0644); - } - - if (depth == 0) return 0; - - for (int i=0; ilstat(basedir, &st); - if (r != 0) { - dout(1) << "can't make base dir? 
" << basedir << endl; - return -1; - } - - // children - char d[500]; - dout(3) << "stat_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << endl; - for (int i=0; ilstat(d, &st); - } - - if (depth == 0) return 0; - - for (int i=0; i contents; - utime_t s = g_clock.now(); - int r = client->getdir(basedir, contents); - utime_t e = g_clock.now(); - e -= s; - if (client_logger) client_logger->finc("readdir", e); - if (r < 0) { - dout(0) << "read_dirs couldn't readdir " << basedir << ", stopping" << endl; - return -1; - } - - for (int i=0; ilstat(d, &st) < 0) { - dout(2) << "read_dirs failed stat on " << d << ", stopping" << endl; - return -1; - } - utime_t e = g_clock.now(); - e -= s; - if (client_logger) client_logger->finc("stat", e); - } - - if (depth > 0) - for (int i=0; iget_nodeid(); - char d[255]; - - if (priv) { - for (int c=0; cmkdir(d, 0755); - } - } else { - // shared - if (whoami == 0) { - for (int c=0; cmkdir(d, 0755); - } - } else { - sleep(5); - } - } - - // files - struct stat st; - for (int c=0; cmknod(d, 0644); - - if (more) { - client->lstat(d, &st); - int fd = client->open(d, O_RDONLY); - client->unlink(d); - client->close(fd); - } - - if (time_to_stop()) return 0; - } - } - - return 0; -} - - -int SyntheticClient::create_shared(int num) -{ - // files - char d[255]; - for (int n=0; nmknod(d, 0644); - } - - return 0; -} - -int SyntheticClient::open_shared(int num, int count) -{ - // files - char d[255]; - for (int c=0; c fds; - for (int n=0; nopen(d,O_RDONLY); - fds.push_back(fd); - } - - while (!fds.empty()) { - int fd = fds.front(); - fds.pop_front(); - client->close(fd); - } - } - - return 0; -} - - -int SyntheticClient::write_file(string& fn, int size, int wrsize) // size is in MB, wrsize in bytes -{ - //__uint64_t wrsize = 1024*256; - char *buf = new char[wrsize+100]; // 1 MB - memset(buf, 7, wrsize); - __uint64_t chunks = (__uint64_t)size * (__uint64_t)(1024*1024) / (__uint64_t)wrsize; - - int fd = 
client->open(fn.c_str(), O_RDWR|O_CREAT); - dout(5) << "writing to " << fn << " fd " << fd << endl; - if (fd < 0) return fd; - - for (unsigned i=0; iget_nodeid(); - p++; - } - - client->write(fd, buf, wrsize, i*wrsize); - } - - client->close(fd); - delete[] buf; - - return 0; -} - -int SyntheticClient::write_batch(int nfile, int size, int wrsize) -{ - for (int i=0; iopen(fn.c_str(), O_RDONLY); - dout(5) << "reading from " << fn << " fd " << fd << endl; - if (fd < 0) return fd; - - for (unsigned i=0; iread(fd, buf, rdsize, i*rdsize); - if (r < rdsize) { - dout(1) << "read_file got r = " << r << ", probably end of file" << endl; - break; - } - - // verify fingerprint - int bad = 0; - __int64_t *p = (__int64_t*)buf; - __int64_t readoff, readclient; - while ((char*)p + 32 < buf + rdsize) { - readoff = *p; - __int64_t wantoff = i*rdsize + (__int64_t)((char*)p - buf); - p++; - readclient = *p; - p++; - if (readoff != wantoff || - readclient != client->get_nodeid()) { - if (!bad) - dout(0) << "WARNING: wrong data from OSD, block says fileoffset=" << readoff << " client=" << readclient - << ", should be offset " << wantoff << " clietn " << client->get_nodeid() - << endl; - bad++; - } - } - if (bad) - dout(0) << " + " << (bad-1) << " other bad 16-byte bits in this block" << endl; - } - - client->close(fd); - delete[] buf; - - return 0; -} - - - -int SyntheticClient::random_walk(int num_req) -{ - int left = num_req; - - //dout(1) << "random_walk() will do " << left << " ops" << endl; - - init_op_dist(); // set up metadata op distribution - - while (left > 0) { - left--; - - if (time_to_stop()) break; - - // ascend? - if (cwd.depth() && !roll_die(::pow((double).9, (double)cwd.depth()))) { - dout(DBL) << "die says up" << endl; - up(); - continue; - } - - // descend? 
- if (.9*roll_die(::pow((double).9,(double)cwd.depth())) && subdirs.size()) { - string s = get_random_subdir(); - cwd.add_dentry( s ); - dout(DBL) << "cd " << s << " -> " << cwd << endl; - clear_dir(); - continue; - } - - int op = 0; - filepath path; - - if (contents.empty() && roll_die(.3)) { - if (did_readdir) { - dout(DBL) << "empty dir, up" << endl; - up(); - } else - op = MDS_OP_READDIR; - } else { - op = op_dist.sample(); - } - //dout(DBL) << "op is " << op << endl; - - int r = 0; - - // do op - if (op == MDS_OP_UNLINK) { - if (contents.empty()) - op = MDS_OP_READDIR; - else - r = client->unlink( get_random_sub() ); // will fail on dirs - } - - if (op == MDS_OP_RENAME) { - if (contents.empty()) - op = MDS_OP_READDIR; - else { - r = client->rename( get_random_sub(), make_sub("ren") ); - } - } - - if (op == MDS_OP_MKDIR) { - r = client->mkdir( make_sub("mkdir"), 0755); - } - - if (op == MDS_OP_RMDIR) { - if (!subdirs.empty()) - r = client->rmdir( get_random_subdir() ); - else - r = client->rmdir( cwd.c_str() ); // will pbly fail - } - - if (op == MDS_OP_SYMLINK) { - } - - if (op == MDS_OP_CHMOD) { - if (contents.empty()) - op = MDS_OP_READDIR; - else - r = client->chmod( get_random_sub(), rand() & 0755 ); - } - - if (op == MDS_OP_CHOWN) { - if (contents.empty()) r = client->chown( cwd.c_str(), rand(), rand() ); - else - r = client->chown( get_random_sub(), rand(), rand() ); - } - - if (op == MDS_OP_LINK) { - } - - if (op == MDS_OP_UTIME) { - struct utimbuf b; - memset(&b, 1, sizeof(b)); - if (contents.empty()) - r = client->utime( cwd.c_str(), &b ); - else - r = client->utime( get_random_sub(), &b ); - } - - if (op == MDS_OP_MKNOD) { - r = client->mknod( make_sub("mknod"), 0644); - } - - if (op == MDS_OP_OPEN) { - if (contents.empty()) - op = MDS_OP_READDIR; - else { - r = client->open( get_random_sub(), O_RDONLY ); - if (r > 0) { - assert(open_files.count(r) == 0); - open_files.insert(r); - } - } - } - - if (op == MDS_OP_RELEASE) { // actually, close - if 
(open_files.empty()) - op = MDS_OP_STAT; - else { - int fh = get_random_fh(); - r = client->close( fh ); - if (r == 0) open_files.erase(fh); - } - } - - if (op == MDS_OP_STAT) { - struct stat st; - if (contents.empty()) { - if (did_readdir) { - if (roll_die(.1)) { - dout(DBL) << "stat in empty dir, up" << endl; - up(); - } else { - op = MDS_OP_MKNOD; - } - } else - op = MDS_OP_READDIR; - } else - r = client->lstat(get_random_sub(), &st); - } - - if (op == MDS_OP_READDIR) { - clear_dir(); - - map c; - r = client->getdir( cwd.c_str(), c ); - - for (map::iterator it = c.begin(); - it != c.end(); - it++) { - //dout(DBL) << " got " << it->first << endl; - contents[it->first] = it->second; - if (it->second.is_dir()) - subdirs.insert(it->first); - } - - did_readdir = true; - } - - // errors? - if (r < 0) { - // reevaluate cwd. - //while (cwd.depth()) { - //if (client->lookup(cwd)) break; // it's in the cache - - //dout(DBL) << "r = " << r << ", client doesn't have " << cwd << ", cd .." << endl; - dout(DBL) << "r = " << r << ", client may not have " << cwd << ", cd .." 
<< endl; - up(); - //} - } - } - - // close files - dout(DBL) << "closing files" << endl; - while (!open_files.empty()) { - int fh = get_random_fh(); - int r = client->close( fh ); - if (r == 0) open_files.erase(fh); - } - - dout(DBL) << "done" << endl; - return 0; -} - - - - -void SyntheticClient::make_dir_mess(const char *basedir, int n) -{ - vector dirs; - - dirs.push_back(basedir); - dirs.push_back(basedir); - - client->mkdir(basedir, 0755); - - // motivation: - // P(dir) ~ subdirs_of(dir) + 2 - // from 5-year metadata workload paper in fast'07 - - // create dirs - for (int i=0; i> dir; - - // update dirs - dirs.push_back(parent); - dirs.push_back(dir); - dirs.push_back(dir); - - // do it - client->mkdir(dir.c_str(), 0755); - } - - -} - diff --git a/tags/20070517_before_mds_merge/client/SyntheticClient.h b/tags/20070517_before_mds_merge/client/SyntheticClient.h deleted file mode 100644 index adcf7584766e6..0000000000000 --- a/tags/20070517_before_mds_merge/client/SyntheticClient.h +++ /dev/null @@ -1,202 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __SYNTHETICCLIENT_H -#define __SYNTHETICCLIENT_H - -#include - -#include "Client.h" -#include "include/Distribution.h" - -#include "Trace.h" - -#define SYNCLIENT_MODE_RANDOMWALK 1 -#define SYNCLIENT_MODE_FULLWALK 2 -#define SYNCLIENT_MODE_REPEATWALK 3 - -#define SYNCLIENT_MODE_MAKEDIRMESS 7 -#define SYNCLIENT_MODE_MAKEDIRS 8 // dirs files depth -#define SYNCLIENT_MODE_STATDIRS 9 // dirs files depth -#define SYNCLIENT_MODE_READDIRS 10 // dirs files depth - -#define SYNCLIENT_MODE_MAKEFILES 11 // num count private -#define SYNCLIENT_MODE_MAKEFILES2 12 // num count private -#define SYNCLIENT_MODE_CREATESHARED 13 // num -#define SYNCLIENT_MODE_OPENSHARED 14 // num count - -#define SYNCLIENT_MODE_WRITEFILE 20 -#define SYNCLIENT_MODE_READFILE 21 -#define SYNCLIENT_MODE_WRITEBATCH 22 -#define SYNCLIENT_MODE_WRSHARED 23 - -#define SYNCLIENT_MODE_TRACE 30 - -#define SYNCLIENT_MODE_OPENTEST 40 -#define SYNCLIENT_MODE_OPTEST 41 - -#define SYNCLIENT_MODE_ONLY 50 -#define SYNCLIENT_MODE_UNTIL 51 -#define SYNCLIENT_MODE_SLEEPUNTIL 52 - -#define SYNCLIENT_MODE_RANDOMSLEEP 61 -#define SYNCLIENT_MODE_SLEEP 62 - -#define SYNCLIENT_MODE_TRUNCATE 200 - - - - -void parse_syn_options(vector& args); - -class SyntheticClient { - Client *client; - - pthread_t thread_id; - - Distribution op_dist; - - void init_op_dist(); - int get_op(); - - - filepath cwd; - map contents; - set subdirs; - bool did_readdir; - set open_files; - - void up(); - - void clear_dir() { - contents.clear(); - subdirs.clear(); - did_readdir = false; - } - - int get_random_fh() { - int r = rand() % open_files.size(); - set::iterator it = open_files.begin(); - while (r--) it++; - return *it; - } - - - filepath n1; - const char *get_random_subdir() { - assert(!subdirs.empty()); - int r = ((rand() % subdirs.size()) + (rand() % subdirs.size())) / 2; // non-uniform distn - set::iterator it = subdirs.begin(); - while (r--) it++; - - n1 = cwd; - n1.add_dentry( *it ); - return n1.get_path().c_str(); - } - 
filepath n2; - const char *get_random_sub() { - assert(!contents.empty()); - int r = ((rand() % contents.size()) + (rand() % contents.size())) / 2; // non-uniform distn - if (cwd.depth() && cwd.last_bit().length()) - r += cwd.last_bit().c_str()[0]; // slightly permuted - r %= contents.size(); - - map::iterator it = contents.begin(); - while (r--) it++; - - n2 = cwd; - n2.add_dentry( it->first ); - return n2.get_path().c_str(); - } - - filepath sub; - char sub_s[50]; - const char *make_sub(char *base) { - sprintf(sub_s, "%s.%d", base, rand() % 100); - string f = sub_s; - sub = cwd; - sub.add_dentry(f); - return sub.c_str(); - } - - public: - SyntheticClient(Client *client); - - int start_thread(); - int join_thread(); - - int run(); - - bool run_me() { - if (run_only >= 0) { - if (run_only == client->get_nodeid()) { - run_only = -1; - return true; - } - run_only = -1; - return false; - } - return true; - } - - // run() will do one of these things: - list modes; - list sargs; - list iargs; - utime_t run_start; - utime_t run_until; - - int run_only; - - string get_sarg(int seq); - - bool time_to_stop() { - utime_t now = g_clock.now(); - if (0) cout << "time_to_stop .. 
now " << now - << " until " << run_until - << " start " << run_start - << endl; - if (run_until.sec() && now > run_until) - return true; - else - return false; - } - - string compose_path(string& prefix, char *rest) { - return prefix + rest; - } - - int full_walk(string& fromdir); - int random_walk(int n); - - int make_dirs(const char *basedir, int dirs, int files, int depth); - int stat_dirs(const char *basedir, int dirs, int files, int depth); - int read_dirs(const char *basedir, int dirs, int files, int depth); - int make_files(int num, int count, int priv, bool more); - - int create_shared(int num); - int open_shared(int num, int count); - - int write_file(string& fn, int mb, int chunk); - int write_batch(int nfile, int mb, int chunk); - int read_file(string& fn, int mb, int chunk); - - int clean_dir(string& basedir); - - int play_trace(Trace& t, string& prefix); - - void make_dir_mess(const char *basedir, int n); -}; - -#endif diff --git a/tags/20070517_before_mds_merge/client/Trace.cc b/tags/20070517_before_mds_merge/client/Trace.cc deleted file mode 100644 index 43459653011a1..0000000000000 --- a/tags/20070517_before_mds_merge/client/Trace.cc +++ /dev/null @@ -1,125 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "Trace.h" - -#include -#include -#include -#include -using namespace __gnu_cxx; - -#include "common/Mutex.h" - -#include "config.h" - -#include -#include -#include - - -Mutex trace_lock; - -class TokenList { -public: - string filename; - char *data; - int len; - list tokens; - - int ref; - - TokenList() : data(0), ref(0) {} - ~TokenList() { - delete[] data; - } -}; - -map traces; - - -// -Trace::Trace(const char* f) -{ - string filename = f; - - trace_lock.Lock(); - - if (traces.count(filename)) - tl = traces[filename]; - else { - tl = new TokenList; - tl->filename = filename; - - // open file - crope cr; - int fd = open(filename.c_str(), O_RDONLY); - assert(fd > 0); - char buf[100]; - while (1) { - int r = read(fd, buf, 100); - if (r == 0) break; - assert(r > 0); - cr.append(buf, r); - } - close(fd); - - // copy - tl->len = cr.length()+1; - tl->data = new char[tl->len]; - memcpy(tl->data, cr.c_str(), cr.length()); - tl->data[tl->len-1] = '\n'; - - // index! - int o = 0; - while (o < tl->len) { - char *n = tl->data + o; - - // find newline - while (tl->data[o] != '\n') o++; - assert(tl->data[o] == '\n'); - tl->data[o] = 0; - - if (tl->data + o > n) tl->tokens.push_back(n); - o++; - } - - dout(1) << "trace " << filename << " loaded with " << tl->tokens.size() << " tokens" << endl; - traces[filename] = tl; - } - - tl->ref++; - - trace_lock.Unlock(); -} - -Trace::~Trace() -{ - trace_lock.Lock(); - - tl->ref--; - if (tl->ref == 0) { - traces.erase(tl->filename); - delete tl; - } - - trace_lock.Unlock(); -} - - -list& Trace::get_list() -{ - return tl->tokens; -} diff --git a/tags/20070517_before_mds_merge/client/Trace.h b/tags/20070517_before_mds_merge/client/Trace.h deleted file mode 100644 index 08b1fa8ff2722..0000000000000 --- a/tags/20070517_before_mds_merge/client/Trace.h +++ /dev/null @@ -1,75 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright 
(C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __CLIENT_TRACE_H -#define __CLIENT_TRACE_H - -#include -#include -#include -using namespace std; - -/* - - this class is more like an iterator over a constant tokenlist (which - is protected by a mutex, see Trace.cc) - - */ - -class Trace { - class TokenList *tl; - - public: - Trace(const char* filename); - ~Trace(); - - list& get_list(); - - list::iterator _cur; - list::iterator _end; - - void start() { - _cur = get_list().begin(); - _end = get_list().end(); - ns = 0; - } - - char strings[10][200]; - int ns; - const char *get_string(const char *prefix = 0) { - assert(_cur != _end); - const char *s = *_cur; - _cur++; - if (prefix) { - if (strstr(s, "/prefix") == s || - strstr(s, "/prefix") == s+1) { - strcpy(strings[ns], prefix); - strcpy(strings[ns] + strlen(prefix), - s + strlen("/prefix")); - s = (const char*)strings[ns]; - ns++; - if (ns == 10) ns = 0; - } - } - return s; - } - __int64_t get_int() { - return atoll(get_string()); - } - bool end() { - return _cur == _end; - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/client/fuse.cc b/tags/20070517_before_mds_merge/client/fuse.cc deleted file mode 100644 index 2feb7472d1c7b..0000000000000 --- a/tags/20070517_before_mds_merge/client/fuse.cc +++ /dev/null @@ -1,280 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -/* - FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi - - This program can be distributed under the terms of the GNU GPL. - See the file COPYING. -*/ - - -// fuse crap -#ifdef linux -/* For pread()/pwrite() */ -#define _XOPEN_SOURCE 500 -#endif - -#define FUSE_USE_VERSION 25 - -#include -#include -#include -#include -#include -#include -#include -#include - - -// ceph stuff -#include "include/types.h" - -#include "Client.h" - -#include "config.h" - -// stl -#include -using namespace std; - - -// globals -Client *client; // the ceph client - - - -// ------ -// fuse hooks - -static int ceph_getattr(const char *path, struct stat *stbuf) -{ - return client->lstat(path, stbuf); -} - -static int ceph_readlink(const char *path, char *buf, size_t size) -{ - int res; - - res = client->readlink(path, buf, size - 1); - if (res < 0) return res; - - buf[res] = '\0'; - return 0; -} - - -static int ceph_getdir(const char *path, fuse_dirh_t h, fuse_dirfil_t filler) -{ - map contents; - - int res = client->getdir(path, contents); - if (res < 0) return res; - - // return contents to fuse via callback - for (map::iterator it = contents.begin(); - it != contents.end(); - it++) { - // (immutable) inode contents too. - res = filler(h, // fuse's handle - it->first.c_str(), // dentry as char* - it->second.mode & INODE_TYPE_MASK, // mask type bits from mode - it->second.ino); // ino.. 64->32 bit issue here? 
FIXME - if (res != 0) break; // fuse has had enough - } - return res; -} - -static int ceph_mknod(const char *path, mode_t mode, dev_t rdev) -{ - return client->mknod(path, mode); -} - -static int ceph_mkdir(const char *path, mode_t mode) -{ - return client->mkdir(path, mode); -} - -static int ceph_unlink(const char *path) -{ - return client->unlink(path); -} - -static int ceph_rmdir(const char *path) -{ - return client->rmdir(path); -} - -static int ceph_symlink(const char *from, const char *to) -{ - return client->symlink(from, to); -} - -static int ceph_rename(const char *from, const char *to) -{ - return client->rename(from, to); -} - -static int ceph_link(const char *from, const char *to) -{ - return client->link(from, to); -} - -static int ceph_chmod(const char *path, mode_t mode) -{ - return client->chmod(path, mode); -} - -static int ceph_chown(const char *path, uid_t uid, gid_t gid) -{ - return client->chown(path, uid, gid); -} - -static int ceph_truncate(const char *path, off_t size) -{ - return client->truncate(path, size); -} - -static int ceph_utime(const char *path, struct utimbuf *buf) -{ - return client->utime(path, buf); -} - - -static int ceph_open(const char *path, struct fuse_file_info *fi) -{ - int res; - - res = client->open(path, fi->flags); - if (res < 0) return res; - fi->fh = res; - return 0; // fuse wants 0 onsucess -} - -static int ceph_read(const char *path, char *buf, size_t size, off_t offset, - struct fuse_file_info *fi) -{ - fh_t fh = fi->fh; - return client->read(fh, buf, size, offset); -} - -static int ceph_write(const char *path, const char *buf, size_t size, - off_t offset, struct fuse_file_info *fi) -{ - fh_t fh = fi->fh; - return client->write(fh, buf, size, offset); -} - -static int ceph_flush(const char *path, struct fuse_file_info *fi) -{ -//fh_t fh = fi->fh; - //return client->flush(fh); - return 0; -} - - -static int ceph_statfs(const char *path, struct statvfs *stbuf) -{ - return client->statfs(path, stbuf); -} - - - 
-static int ceph_release(const char *path, struct fuse_file_info *fi) -{ - fh_t fh = fi->fh; - int r = client->close(fh); // close the file - return r; -} - -static int ceph_fsync(const char *path, int isdatasync, - struct fuse_file_info *fi) -{ - fh_t fh = fi->fh; - return client->fsync(fh, isdatasync ? true:false); -} - - -static struct fuse_operations ceph_oper = { - getattr: ceph_getattr, - readlink: ceph_readlink, - getdir: ceph_getdir, - mknod: ceph_mknod, - mkdir: ceph_mkdir, - unlink: ceph_unlink, - rmdir: ceph_rmdir, - symlink: ceph_symlink, - rename: ceph_rename, - link: ceph_link, - chmod: ceph_chmod, - chown: ceph_chown, - truncate: ceph_truncate, - utime: ceph_utime, - open: ceph_open, - read: ceph_read, - write: ceph_write, - statfs: ceph_statfs, - flush: ceph_flush, - release: ceph_release, - fsync: ceph_fsync -}; - - -int ceph_fuse_main(Client *c, int argc, char *argv[]) -{ - // init client - client = c; - - // set up fuse argc/argv - int newargc = 0; - char **newargv = (char **) malloc((argc + 10) * sizeof(char *)); - newargv[newargc++] = argv[0]; - - // allow other (all!) users to see my file system - // NOTE: echo user_allow_other >> /etc/fuse.conf - // NB: seems broken on Darwin -#ifndef DARWIN - newargv[newargc++] = "-o"; - newargv[newargc++] = "allow_other"; -#endif // DARWIN - - // use inos - newargv[newargc++] = "-o"; - newargv[newargc++] = "use_ino"; - - // large reads, direct_io (no kernel cachine) - //newargv[newargc++] = "-o"; - //newargv[newargc++] = "large_read"; - if (g_conf.fuse_direct_io) { - newargv[newargc++] = "-o"; - newargv[newargc++] = "direct_io"; - } - - // disable stupid fuse unlink hiding thing - newargv[newargc++] = "-o"; - newargv[newargc++] = "hard_remove"; - - // force into foreground - // -> we can watch stdout this way!! - newargv[newargc++] = "-f"; - - // copy rest of cmdline (hopefully, the mount point!) 
- for (int argctr = 1; argctr < argc; argctr++) newargv[newargc++] = argv[argctr]; - - // go fuse go - cout << "ok, calling fuse_main" << endl; - int r = fuse_main(newargc, newargv, &ceph_oper); - return r; -} diff --git a/tags/20070517_before_mds_merge/client/fuse.h b/tags/20070517_before_mds_merge/client/fuse.h deleted file mode 100644 index d0b8dcb1154f5..0000000000000 --- a/tags/20070517_before_mds_merge/client/fuse.h +++ /dev/null @@ -1,23 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -/* ceph_fuse_main - * - start up fuse glue, attached to Client* cl. - * - argc, argv should include a mount point, and - * any weird fuse options you want. by default, - * we will put fuse in the foreground so that it - * won't fork and we can see stdout. - */ -int ceph_fuse_main(Client *cl, int argc, char *argv[]); diff --git a/tags/20070517_before_mds_merge/client/hadoop/CephFSInterface.cc b/tags/20070517_before_mds_merge/client/hadoop/CephFSInterface.cc deleted file mode 100644 index 64d6d76fe00e5..0000000000000 --- a/tags/20070517_before_mds_merge/client/hadoop/CephFSInterface.cc +++ /dev/null @@ -1,824 +0,0 @@ -#include "CephFSInterface.h" - -using namespace std; - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_initializeClient - * Signature: ()J - * Initializes a ceph client. 
- */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1initializeClient - (JNIEnv *, jobject) -{ - - cout << "Initializing Ceph client:" << endl; - - // parse args from CEPH_ARGS - vector args; - env_to_vec(args); - parse_config_options(args); - - if (g_conf.clock_tare) g_clock.tare(); - - // be safe - g_conf.use_abspaths = true; - - // load monmap - MonMap monmap; - int r = monmap.read(".ceph_monmap"); - if (r < 0) { - cout << "could not find .ceph_monmap" << endl; - return 0; - } - assert(r >= 0); - - // start up network - rank.start_rank(); - - // start client - Client *client; - client = new Client(rank.register_entity(MSG_ADDR_CLIENT_NEW), &monmap); - client->init(); - - // mount - client->mount(); - - jlong clientp = *(jlong*)&client; - return clientp; -} - - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_copyFromLocalFile - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1copyFromLocalFile -(JNIEnv * env, jobject obj, jlong clientp, jstring j_local_path, jstring j_ceph_path) { - - cout << "In copyFromLocalFile" << endl; - cout.flush(); - Client* client; - //client = (Client*) clientp; - client = *(Client**)&clientp; - - const char* c_local_path = env->GetStringUTFChars(j_local_path, 0); - const char* c_ceph_path = env->GetStringUTFChars(j_ceph_path, 0); - - cout << "Local source file is "<< c_local_path << " and Ceph destination file is " << c_ceph_path << endl; - struct stat st; - int r = ::stat(c_local_path, &st); - assert (r == 0); - - // open the files - int fh_local = ::open(c_local_path, O_RDONLY); - int fh_ceph = client->open(c_ceph_path, O_WRONLY|O_CREAT|O_TRUNC); - assert (fh_local > -1); - assert (fh_ceph > -1); - cout << "local fd is " << fh_local << " and Ceph fd is " << fh_ceph << endl; - - // get the source file size - off_t remaining = st.st_size; - - // copy the file a MB at a time - const int chunk 
= 1048576; - bufferptr bp(chunk); - - while (remaining > 0) { - off_t got = ::read(fh_local, bp.c_str(), MIN(remaining,chunk)); - assert(got > 0); - remaining -= got; - off_t wrote = client->write(fh_ceph, bp.c_str(), got, -1); - assert (got == wrote); - } - client->close(fh_ceph); - ::close(fh_local); - - env->ReleaseStringUTFChars(j_local_path, c_local_path); - env->ReleaseStringUTFChars(j_ceph_path, c_ceph_path); - - return JNI_TRUE; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_copyToLocalFile - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1copyToLocalFile -(JNIEnv *env, jobject obj, jlong clientp, jstring j_ceph_path, jstring j_local_path) -{ - - - Client* client; - client = *(Client**)&clientp; - const char* c_ceph_path = env->GetStringUTFChars(j_ceph_path, 0); - const char* c_local_path = env->GetStringUTFChars(j_local_path, 0); - - cout << "In copyToLocalFile, copying from Ceph file " << c_ceph_path << - " to local file " << c_local_path << endl; - cout.flush(); - - - // get source file size - struct stat st; - cout << "Attempting lstat with file " << c_ceph_path << ":" << endl; - int r = client->lstat(c_ceph_path, &st); - assert (r == 0); - - cout << "Opening Ceph source file for read: " << endl; - cout.flush(); - int fh_ceph = client->open(c_ceph_path, O_RDONLY); - assert (fh_ceph > -1); - - cout << " Opened Ceph file! 
Opening local destination file: " << endl; - cout.flush(); - int fh_local = ::open(c_local_path, O_WRONLY|O_CREAT|O_TRUNC, 0644); - assert (fh_local > -1); - - // copy the file a chunk at a time - const int chunk = 1048576; - bufferptr bp(chunk); - - off_t remaining = st.st_size; - while (remaining > 0) { - off_t got = client->read(fh_ceph, bp.c_str(), MIN(remaining,chunk), -1); - assert(got > 0); - remaining -= got; - off_t wrote = ::write(fh_local, bp.c_str(), got); - assert (got == wrote); - } - client->close(fh_ceph); - ::close(fh_local); - - env->ReleaseStringUTFChars(j_local_path, c_local_path); - env->ReleaseStringUTFChars(j_ceph_path, c_ceph_path); - - return JNI_TRUE; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getcwd - * Signature: (J)Ljava/lang/String; - * Returns the current working directory. - */ -JNIEXPORT jstring JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getcwd - (JNIEnv *env, jobject obj, jlong clientp) -{ - //cout << "In getcwd" << endl; - //cout.flush(); - - Client* client; - client = *(Client**)&clientp; - - return (env->NewStringUTF(client->getcwd().c_str())); -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_setcwd - * Signature: (JLjava/lang/String;)Z - * - * Changes the working directory. - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1setcwd -(JNIEnv *env, jobject obj, jlong clientp, jstring j_path) -{ - //cout << "In setcwd" << endl; - //cout.flush(); - - Client* client; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - return (0 <= client->chdir(c_path)) ? JNI_TRUE : JNI_FALSE; - env->ReleaseStringUTFChars(j_path, c_path); -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_rmdir - * Signature: (JLjava/lang/String;)Z - * Removes an empty directory. 
- */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1rmdir - (JNIEnv *env, jobject, jlong clientp, jstring j_path) -{ - cout << "In rmdir" << endl; - cout.flush(); - - Client* client; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - return (0 == client->rmdir(c_path)) ? JNI_TRUE : JNI_FALSE; - env->ReleaseStringUTFChars(j_path, c_path); -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_mkdir - * Signature: (JLjava/lang/String;)Z - * Creates a directory with full permissions. - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1mkdir - (JNIEnv * env, jobject, jlong clientp, jstring j_path) -{ - //cout << "In mkdir" << endl; - //cout.flush(); - - - Client* client; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - return (0 == client->mkdir(c_path, 0xFF)) ? JNI_TRUE : JNI_FALSE; - env->ReleaseStringUTFChars(j_path, c_path); -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_unlink - * Signature: (JLjava/lang/String;)Z - * Unlinks a path. - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1unlink - (JNIEnv * env, jobject, jlong clientp, jstring j_path) -{ - cout.flush(); - - Client* client; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - cout << "In unlink for path " << c_path << ":" << endl; - - // is it a file or a directory? - struct stat stbuf; - int stat_result = client->lstat(c_path, &stbuf); - if (stat_result < 0) {// then the path doesn't even exist - cout << "ceph_unlink: path " << c_path << " does not exist" << endl; - return false; - } - int result; - if (0 != S_ISDIR(stbuf.st_mode)) { // it's a directory - cout << "ceph_unlink: path " << c_path << " is a directory. 
Calling client->rmdir()" << endl; - result = client->rmdir(c_path); - } - else if (0 != S_ISREG(stbuf.st_mode)) { // it's a file - cout << "ceph_unlink: path " << c_path << " is a file. Calling client->unlink()" << endl; - result = client->unlink(c_path); - } - else { - cout << "ceph_unlink: path " << c_path << " is not a file or a directory. Failing:" << endl; - result = -1; - } - - cout << "In ceph_unlink for path " << c_path << - ": got result " - << result << ". Returning..."<< endl; - - env->ReleaseStringUTFChars(j_path, c_path); - return (0 == result) ? JNI_TRUE : JNI_FALSE; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_rename - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - * Renames a file. - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1rename - (JNIEnv *env, jobject, jlong clientp, jstring j_from, jstring j_to) -{ - cout << "In rename" << endl; - cout.flush(); - - - Client* client; - client = *(Client**)&clientp; - - const char* c_from = env->GetStringUTFChars(j_from, 0); - const char* c_to = env->GetStringUTFChars(j_to, 0); - - return (0 <= client->rename(c_from, c_to)) ? JNI_TRUE : JNI_FALSE; - env->ReleaseStringUTFChars(j_from, c_from); - env->ReleaseStringUTFChars(j_to, c_to); -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_exists - * Signature: (JLjava/lang/String;)Z - * Returns true if the path exists. 
- */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1exists -(JNIEnv *env, jobject, jlong clientp, jstring j_path) -{ - - //cout << "In exists" << endl; - //cout.flush(); - - Client* client; - struct stat stbuf; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - cout << "Attempting lstat with file " << c_path << ":" ; - //int i = (int) (*c_path); - //cout << "First character value is " << i; - // cout.flush(); - int result = client->lstat(c_path, &stbuf); - cout << "result is " << result << endl; - // cout << "Attempting to release string \"" << c_path << "\"" << endl; - //cout.flush(); - env->ReleaseStringUTFChars(j_path, c_path); - //cout << "String released!" << endl; - if (result < 0) { - //cout << "Returning false (file does not exist)" << endl; - //cout.flush(); - return JNI_FALSE; - } - else { - //cout << "Returning true (file exists)" << endl; - //cout.flush(); - return JNI_TRUE; - } - -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getblocksize - * Signature: (JLjava/lang/String;)J - * Returns the block size. Size is -1 if the file - * does not exist. - * TODO: see if Hadoop wants something more like stripe size - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getblocksize - (JNIEnv *env, jobject obj, jlong clientp, jstring j_path) -{ - cout << "In getblocksize" << endl; - cout.flush(); - - - Client* client; - struct stat stbuf; - client = *(Client**)&clientp; - - jint result; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - if (0 > client->lstat(c_path, &stbuf)) - result = -1; - else - result = stbuf.st_blksize; - - env->ReleaseStringUTFChars(j_path, c_path); - return result; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getfilesize - * Signature: (JLjava/lang/String;)J - * Returns the file size, or -1 on failure. 
- */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getfilesize - (JNIEnv *env, jobject, jlong clientp, jstring j_path) -{ - cout << "In getfilesize" << endl; - cout.flush(); - - Client* client; - struct stat stbuf; - client = *(Client**)&clientp; - - jlong result; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - if (0 > client->lstat(c_path, &stbuf)) result = -1; - else result = stbuf.st_size; - env->ReleaseStringUTFChars(j_path, c_path); - - return result; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_isfile - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1isfile - (JNIEnv *env, jobject obj, jlong clientp, jstring j_path) -{ - //cout << "In isfile" << endl; - //cout.flush(); - - Client* client; - struct stat stbuf; - client = *(Client**)&clientp; - - - const char* c_path = env->GetStringUTFChars(j_path, 0); - //cout << "Attempting lstat with file " << c_path << ":" << endl; - //cout.flush(); - int result = client->lstat(c_path, &stbuf); - //cout << "Got through lstat without crashing: result is " << result << endl; - //cout.flush(); - - env->ReleaseStringUTFChars(j_path, c_path); - - // if the stat call failed, it's definitely not a file... - if (0 > result) return JNI_FALSE; - - // check the stat result - //cout << "Stat call succeeded: attempting to look inside stbuf for result" << endl; - return (0 == S_ISREG(stbuf.st_mode)) ? JNI_FALSE : JNI_TRUE; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_isdirectory - * Signature: (JLjava/lang/String;)Z - * Returns true if the path is a directory. 
- */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1isdirectory - (JNIEnv *env, jobject, jlong clientp, jstring j_path) -{ - //cout << "In isdirectory" << endl; - //cout.flush(); - - Client* client; - struct stat stbuf; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - int result = client->lstat(c_path, &stbuf); - env->ReleaseStringUTFChars(j_path, c_path); - //cout << "String released!" << endl; - //cout.flush(); - - // if the stat call failed, it's definitely not a directory... - if (0 > result) return JNI_FALSE; - - // check the stat result - return (0 == S_ISDIR(stbuf.st_mode)) ? JNI_FALSE : JNI_TRUE; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getdir - * Signature: (JLjava/lang/String;)[Ljava/lang/String; - * Returns a Java array of Strings with the directory contents - */ -JNIEXPORT jobjectArray JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getdir -(JNIEnv *env, jobject obj, jlong clientp, jstring j_path) { - - //cout << "In getdir" << endl; - //cout.flush(); - - - Client* client; - client = *(Client**)&clientp; - - // get the directory listing - map contents; - const char* c_path = env->GetStringUTFChars(j_path, 0); - int result = client->getdir(c_path, contents); - //cout << "Releasing string" << endl; - env->ReleaseStringUTFChars(j_path, c_path); - - if (result < 0) return NULL; - - //cout << "checking for empty dir" << endl; - jint dir_size = contents.size(); - - // Hadoop doesn't want . or .. 
in the listing, so we shrink the - // listing size by two, or by one if the directory's root - if(('/' == c_path[0]) && (0 == c_path[1])) - dir_size -= 1; - else - dir_size -= 2; - assert (dir_size >= 0); - - // Create a Java String array of the size of the directory listing - // jstring blankString = env->NewStringUTF(""); - jclass stringClass = env->FindClass("java/lang/String"); - if (NULL == stringClass) { - cout << "ERROR: java String class not found; dying a horrible, painful death" << endl; - assert(0); - } - jobjectArray dirListingStringArray = (jobjectArray) env->NewObjectArray(dir_size, stringClass, NULL); - - // populate the array with the elements of the directory list, - // omitting . and .. - int i = 0; - string dot("."); - string dotdot (".."); - for (map::iterator it = contents.begin(); - it != contents.end(); - it++) { - // is it "."? - if (it->first == dot) continue; - if (it->first == dotdot) continue; - - if (0 == dir_size) - cout << "WARNING: adding stuff to an empty array" << endl; - assert (i < dir_size); - env->SetObjectArrayElement(dirListingStringArray, i, - env->NewStringUTF(it->first.c_str())); - ++i; - } - - return dirListingStringArray; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_open_for_read - * Signature: (JLjava/lang/String;)I - * Open a file for reading. 
- */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1open_1for_1read - (JNIEnv *env, jobject obj, jlong clientp, jstring j_path) - -{ - //cout << "In open_for_read" << endl; - //cout.flush(); - - - Client* client; - client = *(Client**)&clientp; - - jint result; - - // open as read-only: flag = O_RDONLY - - const char* c_path = env->GetStringUTFChars(j_path, 0); - result = client->open(c_path, O_RDONLY); - env->ReleaseStringUTFChars(j_path, c_path); - - // returns file handle, or -1 on failure - return result; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_open_for_overwrite - * Signature: (JLjava/lang/String;)I - * Opens a file for overwriting; creates it if necessary. - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1open_1for_1overwrite - (JNIEnv *env, jobject obj, jlong clientp, jstring j_path) -{ - //cout << "In open_for_overwrite" << endl; - //cout.flush(); - - Client* client; - client = *(Client**)&clientp; - - jint result; - - - const char* c_path = env->GetStringUTFChars(j_path, 0); - result = client->open(c_path, O_WRONLY|O_CREAT|O_TRUNC); - env->ReleaseStringUTFChars(j_path, c_path); - - // returns file handle, or -1 on failure - return result; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_kill_client - * Signature: (J)Z - * - * Closes the Ceph client. - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1kill_1client - (JNIEnv *env, jobject obj, jlong clientp) -{ - Client* client; - client = *(Client**)&clientp; - - client->unmount(); - client->shutdown(); - delete client; - - // wait for messenger to finish - rank.wait(); - - return true; -} - - - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_read - * Signature: (JI[BII)I - * Reads into the given byte array from the current position. 
- */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1read - (JNIEnv *env, jobject obj, jlong clientp, jint fh, jbyteArray j_buffer, jint buffer_offset, jint length) -{ - //cout << "In read" << endl; - //cout.flush(); - - - // IMPORTANT NOTE: Hadoop read arguments are a bit different from POSIX so we - // have to convert. The read is *always* from the current position in the file, - // and buffer_offset is the location in the *buffer* where we start writing. - - - Client* client; - client = *(Client**)&clientp; - jint result; - - // Step 1: get a pointer to the buffer. - jbyte* j_buffer_ptr = env->GetByteArrayElements(j_buffer, NULL); - char* c_buffer = (char*) j_buffer_ptr; - - // Step 2: pointer arithmetic to start in the right buffer position - c_buffer += (int)buffer_offset; - - // Step 3: do the read - result = client->read((int)fh, c_buffer, length, -1); - - // Step 4: release the pointer to the buffer - env->ReleaseByteArrayElements(j_buffer, j_buffer_ptr, 0); - - return result; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_seek_from_start - * Signature: (JIJ)J - * Seeks to the given position. 
- */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1seek_1from_1start - (JNIEnv *env, jobject obj, jlong clientp, jint fh, jlong pos) -{ - //cout << "In CephInputStream::seek_from_start" << endl; - //cout.flush(); - - - Client* client; - client = *(Client**)&clientp; - jint result; - - result = client->lseek(fh, pos, SEEK_SET); - - return result; -} - - -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1getpos - (JNIEnv *env, jobject obj, jlong clientp, jint fh) -{ - cout << "In CephInputStream::ceph_getpos" << endl; - cout.flush(); - - - Client* client; - client = *(Client**)&clientp; - jint result; - - // seek a distance of 0 to get current offset - result = client->lseek(fh, 0, SEEK_CUR); - - return result; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_close - * Signature: (JI)I - * Closes the file. - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1close - (JNIEnv *env, jobject obj, jlong clientp, jint fh) -{ - cout << "In CephInputStream::ceph_close" << endl; - cout.flush(); - - Client* client; - client = *(Client**)&clientp; - jint result; - - result = client->close(fh); - - return result; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_seek_from_start - * Signature: (JIJ)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1seek_1from_1start - (JNIEnv *env, jobject obj, jlong clientp, jint fh, jlong pos) -{ - cout << "In CephOutputStream::ceph_seek_from_start" << endl; - cout.flush(); - - Client* client; - client = *(Client**)&clientp; - jint result; - - result = client->lseek(fh, pos, SEEK_SET); - - return result; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_getpos - * Signature: (JI)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1getpos - (JNIEnv *env, jobject obj, jlong clientp, jint fh) -{ - cout 
<< "In CephOutputStream::ceph_getpos" << endl; - cout.flush(); - - Client* client; - client = *(Client**)&clientp; - jint result; - - // seek a distance of 0 to get current offset - result = client->lseek(fh, 0, SEEK_CUR); - - return result; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_close - * Signature: (JI)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1close - (JNIEnv *env, jobject obj, jlong clientp, jint fh) -{ - cout << "In CephOutputStream::ceph_close" << endl; - cout.flush(); - - Client* client; - client = *(Client**)&clientp; - jint result; - - result = client->close(fh); - - return result; -} - - - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_write - * Signature: (JI[BII)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1write - (JNIEnv *env, jobject obj, jlong clientp, jint fh, jbyteArray j_buffer, jint buffer_offset, jint length) -{ - //cout << "In write" << endl; - //cout.flush(); - - - // IMPORTANT NOTE: Hadoop write arguments are a bit different from POSIX so we - // have to convert. The write is *always* from the current position in the file, - // and buffer_offset is the location in the *buffer* where we start writing. - - Client* client; - client = *(Client**)&clientp; - jint result; - - // Step 1: get a pointer to the buffer. 
- jbyte* j_buffer_ptr = env->GetByteArrayElements(j_buffer, NULL); - char* c_buffer = (char*) j_buffer_ptr; - - // Step 2: pointer arithmetic to start in the right buffer position - c_buffer += (int)buffer_offset; - - // Step 3: do the write - result = client->write((int)fh, c_buffer, length, -1); - - // Step 4: release the pointer to the buffer - env->ReleaseByteArrayElements(j_buffer, j_buffer_ptr, 0); - - return result; -} - diff --git a/tags/20070517_before_mds_merge/client/hadoop/CephFSInterface.h b/tags/20070517_before_mds_merge/client/hadoop/CephFSInterface.h deleted file mode 100644 index 0930ecb25b8f1..0000000000000 --- a/tags/20070517_before_mds_merge/client/hadoop/CephFSInterface.h +++ /dev/null @@ -1,237 +0,0 @@ -/* DO NOT EDIT THIS FILE - it is machine generated */ -#include -/* Header for class org_apache_hadoop_fs_ceph_CephFileSystem */ - -#include -#include "client/Client.h" -#include "config.h" -#include "client/fuse.h" -#include "msg/SimpleMessenger.h" -#include "common/Timer.h" - -#ifndef _Included_org_apache_hadoop_fs_ceph_CephFileSystem -#define _Included_org_apache_hadoop_fs_ceph_CephFileSystem -#ifdef __cplusplus -extern "C" { -#endif - -#undef org_apache_hadoop_fs_ceph_CephFileSystem_DEFAULT_BLOCK_SIZE -#define org_apache_hadoop_fs_ceph_CephFileSystem_DEFAULT_BLOCK_SIZE 1048576LL -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_initializeClient - * Signature: ()J - * Initializes a ceph client. 
- */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1initializeClient -(JNIEnv *, jobject); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_copyFromLocalFile - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1copyFromLocalFile - (JNIEnv *, jobject, jlong, jstring, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_copyToLocalFile - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1copyToLocalFile - (JNIEnv *, jobject, jlong, jstring, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getcwd - * Signature: (J)Ljava/lang/String; - */ -JNIEXPORT jstring JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getcwd - (JNIEnv *, jobject, jlong); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_setcwd - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1setcwd - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_rmdir - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1rmdir - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_mkdir - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1mkdir - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_unlink - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1unlink - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: 
org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_rename - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1rename - (JNIEnv *, jobject, jlong, jstring, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_exists - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1exists - (JNIEnv *, jobject, jlong, jstring); - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getblocksize - * Signature: (JLjava/lang/String;)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getblocksize - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getfilesize - * Signature: (JLjava/lang/String;)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getfilesize - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_isdirectory - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1isdirectory - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_isfile - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1isfile - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getdir - * Signature: (JLjava/lang/String;)[Ljava/lang/String; - */ -JNIEXPORT jobjectArray JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getdir - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_open_for_read - * Signature: (JLjava/lang/String;)I - */ -JNIEXPORT jint JNICALL 
Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1open_1for_1read - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_open_for_overwrite - * Signature: (JLjava/lang/String;)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1open_1for_1overwrite - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_kill_client - * Signature: (J)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1kill_1client - (JNIEnv *, jobject, jlong); - -#undef org_apache_hadoop_fs_ceph_CephInputStream_SKIP_BUFFER_SIZE -#define org_apache_hadoop_fs_ceph_CephInputStream_SKIP_BUFFER_SIZE 2048L -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_read - * Signature: (JI[BII)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1read - (JNIEnv *, jobject, jlong, jint, jbyteArray, jint, jint); - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_seek_from_start - * Signature: (JIJ)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1seek_1from_1start - (JNIEnv *, jobject, jlong, jint, jlong); - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_getpos - * Signature: (JI)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1getpos - (JNIEnv *, jobject, jlong, jint); - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_close - * Signature: (JI)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1close - (JNIEnv *, jobject, jlong, jint); - -/* Header for class org_apache_hadoop_fs_ceph_CephOutputStream */ - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_seek_from_start - * Signature: (JIJ)J - */ -JNIEXPORT jlong JNICALL 
Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1seek_1from_1start - (JNIEnv *, jobject, jlong, jint, jlong); - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_getpos - * Signature: (JI)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1getpos - (JNIEnv *, jobject, jlong, jint); - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_close - * Signature: (JI)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1close - (JNIEnv *, jobject, jlong, jint); - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_write - * Signature: (JI[BII)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1write - (JNIEnv *, jobject, jlong, jint, jbyteArray, jint, jint); - -#ifdef __cplusplus -} -#endif -#endif diff --git a/tags/20070517_before_mds_merge/client/ldceph.cc b/tags/20070517_before_mds_merge/client/ldceph.cc deleted file mode 100644 index 9706fd49cad99..0000000000000 --- a/tags/20070517_before_mds_merge/client/ldceph.cc +++ /dev/null @@ -1,297 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include -using namespace std; - -// ceph stuff -#include "config.h" -#include "client/Client.h" -#include "msg/TCPMessenger.h" - -// syscall fun -#include -#include -#include -//#include - -#define _FCNTL_H -#include - -#define CEPH_FD_OFF 50000 - - -/****** startup etc *******/ - -class LdCeph { -public: - // globals - bool started; - char *mount_point; - char *mount_point_parent; - int mount_point_len; - - Client *client; - - filepath fp_mount_point; - filepath cwd; - bool cwd_above_mp, cwd_in_mp; - - const char *get_ceph_path(const char *orig, char *buf) { - if (!started) return 0; - - // relative path? BUG: this won't catch "blah/../../asdf" - if (orig[0] && - orig[0] != '/' && - !(orig[0] == '.' && orig[1] == '.')) { - - if (cwd_in_mp) return orig; // inside mount point, definitely ceph - if (!cwd_above_mp) return 0; // not above mount point, definitely not ceph - - // relative, above mp. - filepath o = orig; - filepath p = cwd; - for (unsigned b = 0; b < o.depth(); b++) { - if (o[b] == "..") - p.pop_dentry(); - else - p.add_dentry(o[b]); - } - - // FIXME rewrite - if (strncmp(p.c_str(), mount_point, mount_point_len) == 0) { - if (p.c_str()[mount_point_len] == 0) - return "/"; - if (p.c_str()[mount_point_len] == '/') { - strcpy(buf, p.c_str() + mount_point_len); - return buf; - } - } - return 0; - } else { - // absolute - if (strncmp(orig, mount_point, mount_point_len) == 0) { - if (orig[mount_point_len] == 0) - return "/"; - if (orig[mount_point_len] == '/') - return orig + mount_point_len; - } - return 0; - } - } - - void refresh_cwd() { - char buf[255]; - syscall(SYS_getcwd, buf, 255); - cwd = buf; - - if (strncmp(buf, mount_point, mount_point_len) == 0 && - (buf[mount_point_len] == 0 || - buf[mount_point_len] == '/')) - cwd_in_mp = true; - else { - if (cwd.depth() > fp_mount_point.depth()) - cwd_above_mp = false; - else { - cwd_above_mp = true; - for (unsigned i=0; iget_myaddr() << endl; - - refresh_cwd(); - } - } - ~LdCeph() { - cout << 
"ldceph fini" << endl; - if (false && client) { - client->unmount(); - client->shutdown(); - delete client; - client = 0; - tcpmessenger_wait(); - tcpmessenger_shutdown(); - } - } - -} ldceph; - - - -/****** original functions ****/ - - - -/****** captured functions ****/ - - -#define MYFD(f) ((fd) > CEPH_FD_OFF && ldceph.started) -#define TO_FD(fd) (fd > 0 ? fd+CEPH_FD_OFF:fd) -#define FROM_FD(fd) (fd - CEPH_FD_OFF) - -extern "C" { - - // open/close - //int open(const char *pathname, int flags) { - int open(const char *pathname, int flags, mode_t mode) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return TO_FD(ldceph.client->open(c, flags)); - else - return syscall(SYS_open, pathname, flags, mode); - } - - int creat(const char *pathname, mode_t mode) { - return open(pathname, O_CREAT|O_WRONLY|O_TRUNC, mode); - } - int close(int fd) { - if (MYFD(fd)) - return ldceph.client->close(FROM_FD(fd)); - else - return syscall(SYS_close, fd); - } - - - // read/write - ssize_t write(int fd, const void *buf, size_t count) { - if (MYFD(fd)) - return ldceph.client->write(FROM_FD(fd), (char*)buf, count); - else - return syscall(SYS_write, fd, buf, count); - } - - ssize_t read(int fd, void *buf, size_t count) { - if (MYFD(fd)) - return ldceph.client->read(FROM_FD(fd), (char*)buf, count); - else - return syscall(SYS_read, fd, buf, count); - } - - //int fsync(int fd); - //int fdatasync(int fd); - - - // namespace - int rmdir(const char *pathname) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return ldceph.client->rmdir(c); - else - return syscall(SYS_rmdir, pathname); - } - int mkdir(const char *pathname, mode_t mode) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return ldceph.client->mkdir(c, mode); - else - return syscall(SYS_mkdir, pathname, mode); - } - int unlink(const char *pathname) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return 
ldceph.client->unlink(c); - else - return syscall(SYS_unlink, pathname); - } - - int stat(const char *pathname, struct stat *st) { - //int __xstat64(int __ver, const char *pathname, struct stat64 *st64) { // stoopid GLIBC - //struct stat *st = (struct stat*)st64; - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return ldceph.client->lstat(c, st); // FIXME - else - return syscall(SYS_stat, pathname, st); - } - //int fstat(int filedes, struct stat *buf); - //int lstat(const char *file_name, struct stat *buf); - - int chdir(const char *pathname) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) { - int r = ldceph.client->chdir(c); - if (r == 0) { - if (!ldceph.cwd_in_mp) - syscall(SYS_chdir, ldceph.mount_point_parent); - ldceph.cwd_in_mp = true; - ldceph.cwd_above_mp = false; - ldceph.cwd = ldceph.mount_point; - filepath fpc = c; - ldceph.cwd.append(fpc); - } - return r; - } else { - int r = syscall(SYS_chdir, pathname); - if (r) { - ldceph.refresh_cwd(); - } - return r; - } - } - char *getcwd(char *buf, size_t size) { - strncpy(buf, ldceph.cwd.c_str(), size); - return buf; - } - //int fchdir(int fd); - - - - -} diff --git a/tags/20070517_before_mds_merge/client/msgthread.h b/tags/20070517_before_mds_merge/client/msgthread.h deleted file mode 100644 index 69d10be9f6a56..0000000000000 --- a/tags/20070517_before_mds_merge/client/msgthread.h +++ /dev/null @@ -1,25 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "msg/Message.h" - -// send the message, expecting no response. 
threads other than the -// MPI thread use this function; if the MPI thread uses this function -// it could deadlock: this function could wait for the out queue to be -// emptied, but only the MPI thread can empty it. -void obfsmpi_send(Message *m) - -// send the message to a server and wait for the response. threads -// other than the MPI thread use this function. -Message *obfsmpi_sendrecv(Message *m) diff --git a/tags/20070517_before_mds_merge/cmds.cc b/tags/20070517_before_mds_merge/cmds.cc deleted file mode 100644 index aeeb42e87488a..0000000000000 --- a/tags/20070517_before_mds_merge/cmds.cc +++ /dev/null @@ -1,102 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include -#include - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" -#include "mds/MDS.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << endl; - exit(1); - } -}; - -class C_Debug : public Context { - public: - void finish(int) { - int size = &g_conf.debug_after - &g_conf.debug; - memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size); - dout(0) << "debug_after flipping debug settings" << endl; - } -}; - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - parse_config_options(args); - - if (g_conf.kill_after) - g_timer.add_event_after(g_conf.kill_after, new C_Die); - if (g_conf.debug_after) - g_timer.add_event_after(g_conf.debug_after, new C_Debug); - - // mds specific args - int whoami = -1; - bool standby = false; // by default, i'll start active. - for (unsigned i=0; i= 0); - - // start up network - rank.start_rank(); - - // start mds - Messenger *m = rank.register_entity(MSG_ADDR_MDS(whoami)); - assert(m); - - MDS *mds = new MDS(whoami, m, &monmap); - mds->init(standby); - - // wait - rank.wait(); - - // done - delete mds; - - return 0; -} - diff --git a/tags/20070517_before_mds_merge/cmon.cc b/tags/20070517_before_mds_merge/cmon.cc deleted file mode 100644 index 8fd627986a240..0000000000000 --- a/tags/20070517_before_mds_merge/cmon.cc +++ /dev/null @@ -1,128 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include -#include - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" -#include "mon/Monitor.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << endl; - exit(1); - } -}; - -class C_Debug : public Context { - public: - void finish(int) { - int size = &g_conf.debug_after - &g_conf.debug; - memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size); - dout(0) << "debug_after flipping debug settings" << endl; - } -}; - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - parse_config_options(args); - - if (g_conf.kill_after) - g_timer.add_event_after(g_conf.kill_after, new C_Die); - if (g_conf.debug_after) - g_timer.add_event_after(g_conf.debug_after, new C_Debug); - - // args - int whoami = -1; - char *monmap_fn = ".ceph_monmap"; - for (unsigned i=0; i= 0); - } else { - // i am specific monitor. - - // read monmap - cout << "reading monmap from .ceph_monmap" << endl; - int r = monmap.read(monmap_fn); - assert(r >= 0); - - // bind to a specific port - cout << "starting mon" << whoami << " at " << monmap.get_inst(whoami) << endl; - g_my_addr = monmap.get_inst(whoami).addr; - rank.start_rank(); - } - - // start monitor - Messenger *m = rank.register_entity(MSG_ADDR_MON(whoami)); - Monitor *mon = new Monitor(whoami, m, &monmap); - mon->init(); - - // wait - cout << "waiting for shutdown ..." 
<< endl; - rank.wait(); - - // done - delete mon; - - return 0; -} - diff --git a/tags/20070517_before_mds_merge/common/Clock.cc b/tags/20070517_before_mds_merge/common/Clock.cc deleted file mode 100644 index c970a337826b6..0000000000000 --- a/tags/20070517_before_mds_merge/common/Clock.cc +++ /dev/null @@ -1,19 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "Clock.h" - -// public -Clock g_clock; - diff --git a/tags/20070517_before_mds_merge/common/Clock.h b/tags/20070517_before_mds_merge/common/Clock.h deleted file mode 100644 index 106e9e9f23701..0000000000000 --- a/tags/20070517_before_mds_merge/common/Clock.h +++ /dev/null @@ -1,206 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __CLOCK_H -#define __CLOCK_H - -#include -#include - -#include -#include -#include - -#include "Mutex.h" - - -// -------- -// utime_t - -class utime_t { - private: - struct timeval tv; - - struct timeval& timeval() { return tv; } - friend class Clock; - - - public: - void normalize() { - if (tv.tv_usec > 1000*1000) { - tv.tv_sec += tv.tv_usec / (1000*1000); - tv.tv_usec %= 1000*1000; - } - } - - // cons - utime_t() { tv.tv_sec = 0; tv.tv_usec = 0; normalize(); } - utime_t(time_t s, int u) { tv.tv_sec = s; tv.tv_usec = u; normalize(); } - - // accessors - time_t sec() const { return tv.tv_sec; } - long usec() const { return tv.tv_usec; } - int nsec() const { return tv.tv_usec*1000; } - - // ref accessors/modifiers - time_t& sec_ref() { return tv.tv_sec; } - // FIXME: tv.tv_usec is a __darwin_suseconds_t on Darwin. - // is just casting it to long& OK? - long& usec_ref() { return (long&) tv.tv_usec; } - - // cast to double - operator double() { - return (double)sec() + ((double)usec() / 1000000.0L); - } -}; - -// arithmetic operators -inline utime_t operator+(const utime_t& l, const utime_t& r) { - return utime_t( l.sec() + r.sec() + (l.usec()+r.usec())/1000000L, - (l.usec()+r.usec())%1000000L ); -} -inline utime_t& operator+=(utime_t& l, const utime_t& r) { - l.sec_ref() += r.sec() + (l.usec()+r.usec())/1000000L; - l.usec_ref() += r.usec(); - l.usec_ref() %= 1000000L; - return l; -} -inline utime_t& operator+=(utime_t& l, double f) { - double fs = trunc(f); - double us = (f - fs) / (double)1000000.0; - l.sec_ref() += (long)fs; - l.usec_ref() += (long)us; - l.normalize(); - return l; -} - -inline utime_t operator-(const utime_t& l, const utime_t& r) { - return utime_t( l.sec() - r.sec() - (l.usec()= r.usec()) - l.usec_ref() -= r.usec(); - else { - l.usec_ref() += 1000000L - r.usec(); - l.sec_ref()--; - } - return l; -} -inline utime_t& operator-=(utime_t& l, double f) { - l += -f; - return l; -} - -inline bool operator>(const utime_t& a, const 
utime_t& b) -{ - return (a.sec() > b.sec()) || (a.sec() == b.sec() && a.usec() > b.usec()); -} -inline bool operator<(const utime_t& a, const utime_t& b) -{ - return (a.sec() < b.sec()) || (a.sec() == b.sec() && a.usec() < b.usec()); -} - -// ostream -inline std::ostream& operator<<(std::ostream& out, const utime_t& t) -{ - //return out << t.sec() << "." << t.usec(); - out << (long)t.sec() << "."; - out.setf(std::ios::right); - out.fill('0'); - out << std::setw(6) << t.usec(); - out.unsetf(std::ios::right); - return out; - - //return out << (long)t.sec << "." << ios::setf(ios::right) << ios::fill('0') << t.usec() << ios::usetf(); -} - - - - -// -- clock -- -class Clock { - protected: - //utime_t start_offset; - //utime_t abs_last; - utime_t last; - utime_t zero; - - Mutex lock; - - public: - Clock() { - // set offset - //tare(); - } - - // real time. - utime_t real_now() { - utime_t realnow = now(); - realnow += zero; - //gettimeofday(&realnow.timeval(), NULL); - return realnow; - } - - // relative time (from startup) - void tare() { - gettimeofday(&zero.timeval(), NULL); - } - void tare(utime_t z) { - zero = z; - } - utime_t now() { - //lock.Lock(); - utime_t n; - gettimeofday(&n.timeval(), NULL); - n -= zero; - if (n < last) { - //std::cerr << "WARNING: clock jumped backwards from " << last << " to " << n << std::endl; - n = last; // clock jumped backwards! 
- } else - last = n; - //lock.Unlock(); - return n; - } - utime_t recent_now() { - return last; - } - - void realify(utime_t& t) { - t += zero; - } - - void make_timespec(utime_t& t, struct timespec *ts) { - utime_t real = t; - realify(real); - - memset(ts, 0, sizeof(*ts)); - ts->tv_sec = real.sec(); - ts->tv_nsec = real.nsec(); - } - - - - // absolute time - time_t gettime() { - return real_now().sec(); - } - -}; - -extern Clock g_clock; - -#endif diff --git a/tags/20070517_before_mds_merge/common/Cond.h b/tags/20070517_before_mds_merge/common/Cond.h deleted file mode 100644 index ed465ce3762d6..0000000000000 --- a/tags/20070517_before_mds_merge/common/Cond.h +++ /dev/null @@ -1,118 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __COND_H -#define __COND_H - -#include - -#include "Mutex.h" -#include "Clock.h" - -#include "include/Context.h" - -#include -#include - -class Cond { - // my bits - pthread_cond_t _c; - - // don't allow copying. 
- void operator=(Cond &C) {} - Cond( const Cond &C ) {} - - public: - Cond() { - int r = pthread_cond_init(&_c,NULL); - assert(r == 0); - } - virtual ~Cond() { - pthread_cond_destroy(&_c); - } - - int Wait(Mutex &mutex) { - int r = pthread_cond_wait(&_c, &mutex._m); - return r; - } - - int Wait(Mutex &mutex, char* s) { - //cout << "Wait: " << s << endl; - int r = pthread_cond_wait(&_c, &mutex._m); - return r; - } - - int WaitUntil(Mutex &mutex, utime_t when) { - struct timespec ts; - g_clock.make_timespec(when, &ts); - //cout << "timedwait for " << ts.tv_sec << " sec " << ts.tv_nsec << " nsec" << endl; - int r = pthread_cond_timedwait(&_c, &mutex._m, &ts); - return r; - } - int WaitInterval(Mutex &mutex, utime_t interval) { - utime_t when = g_clock.now(); - when += interval; - return WaitUntil(mutex, when); - } - - int Signal() { - //int r = pthread_cond_signal(&_c); - int r = pthread_cond_broadcast(&_c); - return r; - } - int SignalOne() { - int r = pthread_cond_signal(&_c); - return r; - } - int SignalAll() { - //int r = pthread_cond_signal(&_c); - int r = pthread_cond_broadcast(&_c); - return r; - } -}; - -class C_Cond : public Context { - Cond *cond; - bool *done; - int *rval; -public: - C_Cond(Cond *c, bool *d, int *r=0) : cond(c), done(d), rval(r) { - *done = false; - } - void finish(int r) { - if (rval) *rval = r; - *done = true; - cond->Signal(); - } -}; - -class C_SafeCond : public Context { - Mutex *lock; - Cond *cond; - bool *done; - int *rval; -public: - C_SafeCond(Mutex *l, Cond *c, bool *d, int *r=0) : lock(l), cond(c), done(d), rval(r) { - *done = false; - } - void finish(int r) { - lock->Lock(); - if (rval) *rval = r; - *done = true; - cond->Signal(); - lock->Unlock(); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/common/DecayCounter.h b/tags/20070517_before_mds_merge/common/DecayCounter.h deleted file mode 100644 index b95ebea815b7c..0000000000000 --- a/tags/20070517_before_mds_merge/common/DecayCounter.h +++ /dev/null @@ -1,94 +0,0 
@@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __DECAYCOUNTER_H -#define __DECAYCOUNTER_H - -#include -#include "Clock.h" - -#include "config.h" - -class DecayCounter { - protected: - double val; // value - - double half_life; // in seconds - double k; // k = ln(.5)/half_life - - utime_t last_decay; // time of last decay - - public: - DecayCounter() : val(0) { - set_halflife( g_conf.mds_decay_halflife ); - reset(); - } - /* - DecayCounter(double hl) : val(0) { - set_halflife(hl); - reset(); - } - */ - - void adjust(double a) { - decay(); - val += a; - } - void adjust_down(const DecayCounter& other) { - // assume other has same time stamp as us... 
- val -= other.val; - } - - void set_halflife(double hl) { - half_life = hl; - k = log(.5) / hl; - } - - void take(DecayCounter& other) { - *this = other; - other.reset(); - } - - void reset() { - last_decay.sec_ref() = 0; - last_decay.usec_ref() = 0; - val = 0; - } - - void decay() { - utime_t el = g_clock.recent_now(); - el -= last_decay; - if (el.sec() >= 1) { - val = val * exp((double)el * k); - if (val < .01) val = 0; - last_decay = g_clock.recent_now(); - } - } - - double get() { - decay(); - return val; - } - - double hit(double v = 1.0) { - decay(); - val += v; - return val; - } - -}; - - -#endif diff --git a/tags/20070517_before_mds_merge/common/LogType.h b/tags/20070517_before_mds_merge/common/LogType.h deleted file mode 100644 index 3de17751ec2f8..0000000000000 --- a/tags/20070517_before_mds_merge/common/LogType.h +++ /dev/null @@ -1,119 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __LOGTYPE_H -#define __LOGTYPE_H - -#include "include/types.h" - -#include -#include -using namespace std; -#include -#include -using namespace __gnu_cxx; - -#include "Mutex.h" - - -class LogType { - protected: - hash_map<__uint64_t, int> keymap; - vector keys; - set inc_keys; - - int version; - - // HACK to avoid the hash table as often as possible... 
- // cache recent key name lookups in a small ring buffer - const static int cache_keys = 10; - __uint64_t kc_ptr[cache_keys]; - int kc_val[cache_keys]; - int kc_pos; - - friend class Logger; - - public: - LogType() { - version = 1; - - for (int i=0;i= 0) return i; - - i = keys.size(); - keys.push_back(key); - -#ifdef __LP64__ - __uint64_t p = (__uint64_t)key; -#else - __uint64_t p = (__uint32_t)key; -#endif - keymap[p] = i; - if (is_inc) inc_keys.insert(i); - - version++; - return i; - } - int add_inc(const char* key) { - return add_key(key, true); - } - int add_set(const char *key) { - return add_key(key, false); - } - - bool have_key(const char* key) { - return lookup_key(key) < 0; - } - - int lookup_key(const char* key) { -#ifdef __LP64__ - __uint64_t p = (__uint64_t)key; -#else - __uint64_t p = (__uint32_t)key; -#endif - - if (keymap.count(p)) - return keymap[p]; - - // try kc ringbuffer - int pos = kc_pos-1; - for (int j=0; j - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include - -#include "LogType.h" -#include "Logger.h" - -#include -#include "Clock.h" - -#include "config.h" - -#include -#include - - -// per-process lock. lame, but this way I protect LogType too! -Mutex logger_lock; - -Logger::Logger(string fn, LogType *type) -{ - logger_lock.Lock(); - { - filename = ""; - if (g_conf.use_abspaths) { - char *cwd = get_current_dir_name(); - filename = cwd; - delete cwd; - filename += "/"; - } - - filename = "log/"; - if (g_conf.log_name) { - filename += g_conf.log_name; - ::mkdir( filename.c_str(), 0755 ); // make sure dir exists - filename += "/"; - } - filename += fn; - //cout << "log " << filename << endl; - interval = g_conf.log_interval; - - if (!g_conf.clock_tare) - start = g_clock.now(); // time 0! otherwise g_clock does it for us. 
- - last_logged = 0; - wrote_header = -1; - open = false; - this->type = type; - wrote_header_last = 0; - - version = 0; - } - logger_lock.Unlock(); - flush(false); -} - -Logger::~Logger() -{ - flush(true); - out.close(); -} - -long Logger::inc(const char *key, long v) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - if (i < 0) i = type->add_inc(key); - flush(); - vals[i] += v; - long r = vals[i]; - logger_lock.Unlock(); - return r; -} - -double Logger::finc(const char *key, double v) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - if (i < 0) i = type->add_inc(key); - flush(); - fvals[i] += v; - double r = fvals[i]; - logger_lock.Unlock(); - return r; -} - -long Logger::set(const char *key, long v) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - if (i < 0) i = type->add_set(key); - flush(); - long r = vals[i] = v; - logger_lock.Unlock(); - return r; -} - - -double Logger::fset(const char *key, double v) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - if (i < 0) i = type->add_set(key); - flush(); - double r = fvals[i] = v; - logger_lock.Unlock(); - return r; -} - -long Logger::get(const char* key) -{ - if (!g_conf.log) return 0; - logger_lock.Lock(); - int i = type->lookup_key(key); - long r = 0; - if (i >= 0 && (int)vals.size() > i) - r = vals[i]; - logger_lock.Unlock(); - return r; -} - -void Logger::flush(bool force) -{ - if (!g_conf.log) return; - logger_lock.Lock(); - - if (version != type->version) { - while (type->keys.size() > vals.size()) - vals.push_back(0); - while (type->keys.size() > fvals.size()) - fvals.push_back(0); - version = type->version; - } - - if (!open) { - out.open(filename.c_str(), ofstream::out); - open = true; - //cout << "opening log file " << filename << endl; - } - - utime_t fromstart = g_clock.now(); - if (fromstart < start) { - cerr << "logger time jumped backwards from " 
<< start << " to " << fromstart << endl; - assert(0); - start = fromstart; - } - fromstart -= start; - - while (force || - ((fromstart.sec() > last_logged) && - (fromstart.sec() - last_logged >= interval))) { - last_logged += interval; - force = false; - - //cout << "logger " << this << " advancing from " << last_logged << " now " << now << endl; - - if (!open) { - out.open(filename.c_str(), ofstream::out); - open = true; - //cout << "opening log file " << filename << endl; - } - - // header? - wrote_header_last++; - if (wrote_header != type->version || - wrote_header_last > 10) { - out << "#" << type->keymap.size(); - for (unsigned i=0; ikeys.size(); i++) - out << "\t" << type->keys[i]; - out << endl; //out << "\t (" << type->keymap.size() << ")" << endl; - wrote_header = type->version; - wrote_header_last = 0; - } - - // write line to log - out << last_logged; - for (unsigned i=0; ikeys.size(); i++) { - if (fvals[i] > 0 && vals[i] == 0) - out << "\t" << fvals[i]; - else - out << "\t" << vals[i]; - } - out << endl; - - // reset the counters - for (unsigned i=0; ikeys.size(); i++) { - if (type->inc_keys.count(i)) { - this->vals[i] = 0; - this->fvals[i] = 0; - } - } - } - - logger_lock.Unlock(); -} - - - - diff --git a/tags/20070517_before_mds_merge/common/Logger.h b/tags/20070517_before_mds_merge/common/Logger.h deleted file mode 100644 index 85102acd90370..0000000000000 --- a/tags/20070517_before_mds_merge/common/Logger.h +++ /dev/null @@ -1,74 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __LOGGER_H -#define __LOGGER_H - -#include "include/types.h" -#include "Clock.h" -#include "Mutex.h" - -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "LogType.h" - - - - -class Logger { - protected: - //hash_map, eqstr> vals; - //hash_map, eqstr> fvals; - vector vals; - vector fvals; - - //Mutex lock; - LogType *type; - - utime_t start; - int last_logged; - int interval; - int wrote_header; - int wrote_header_last; - - string filename; - - int version; - - ofstream out; - bool open; - - public: - Logger(string fn, LogType *type); - ~Logger(); - - void set_start(const utime_t& a) { start = a; } - utime_t& get_start() { return start; } - - long inc(const char *s, long v = 1); - long set(const char *s, long v); - long get(const char *s); - - double fset(const char *s, double v); - double finc(const char *s, double v); - - void flush(bool force = false); -}; - -#endif diff --git a/tags/20070517_before_mds_merge/common/Mutex.h b/tags/20070517_before_mds_merge/common/Mutex.h deleted file mode 100755 index 325ba2a0e11fc..0000000000000 --- a/tags/20070517_before_mds_merge/common/Mutex.h +++ /dev/null @@ -1,82 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MUTEX_H -#define __MUTEX_H - -#include -#include - -class Mutex { -private: - pthread_mutex_t _m; - int nlock; - bool recursive; - - // don't allow copying. 
- void operator=(Mutex &M) {} - Mutex( const Mutex &M ) {} - -public: - Mutex(bool r = true) : nlock(0), recursive(r) { - if (recursive) { - pthread_mutexattr_t attr; - pthread_mutexattr_init(&attr); - pthread_mutexattr_settype(&attr,PTHREAD_MUTEX_RECURSIVE); - pthread_mutex_init(&_m,&attr); - pthread_mutexattr_destroy(&attr); - } else { - pthread_mutex_init(&_m,NULL); - } - } - virtual ~Mutex() { - assert(nlock == 0); - pthread_mutex_destroy(&_m); - } - - bool is_locked() { - return (nlock > 0); - } - - void Lock() { - int r = pthread_mutex_lock(&_m); - assert(r == 0); - nlock++; - assert(nlock == 1 || recursive); - } - - void Unlock() { - assert(nlock > 0); - --nlock; - int r = pthread_mutex_unlock(&_m); - assert(r == 0); - } - - friend class Cond; - - -public: - class Locker { - Mutex &mutex; - - public: - Locker(Mutex& m) : mutex(m) { - mutex.Lock(); - } - ~Locker() { - mutex.Unlock(); - } - }; -}; - -#endif diff --git a/tags/20070517_before_mds_merge/common/Semaphore.h b/tags/20070517_before_mds_merge/common/Semaphore.h deleted file mode 100644 index 7526f5c1ec9c8..0000000000000 --- a/tags/20070517_before_mds_merge/common/Semaphore.h +++ /dev/null @@ -1,52 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef _Sem_Posix_ -#define _Sem_Posix_ - -#include - -class Semaphore -{ - Mutex m; - Cond c; - int count; - - public: - - Semaphore() - { - count = 0; - } - - void Put() - { - m.Lock(); - count++; - c.Signal(); - m.Unlock(); - } - - void Get() - { - m.Lock(); - while(count <= 0) { - c.Wait(m); - } - count--; - m.Unlock(); - } -}; - -#endif // !_Mutex_Posix_ diff --git a/tags/20070517_before_mds_merge/common/Thread.h b/tags/20070517_before_mds_merge/common/Thread.h deleted file mode 100644 index 43c5f57f4a96c..0000000000000 --- a/tags/20070517_before_mds_merge/common/Thread.h +++ /dev/null @@ -1,66 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __THREAD_H -#define __THREAD_H - -#include -#include - -class Thread { - private: - pthread_t thread_id; - - public: - Thread() : thread_id(0) {} - virtual ~Thread() {} - - pthread_t &get_thread_id() { return thread_id; } - bool is_started() { return thread_id != 0; } - - virtual void *entry() = 0; - - private: - static void *_entry_func(void *arg) { - return ((Thread*)arg)->entry(); - } - - public: - int create() { - return pthread_create( &thread_id, NULL, _entry_func, (void*)this ); - } - - bool am_self() { - return (pthread_self() == thread_id); - } - - int join(void **prval = 0) { - if (thread_id == 0) { - cerr << "WARNING: join on thread that was never started" << endl; - //assert(0); - return -EINVAL; // never started. 
- } - - int status = pthread_join(thread_id, prval); - if (status == 0) - thread_id = 0; - else { - cout << "join status = " << status << endl; - assert(0); - } - return status; - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/common/ThreadPool.h b/tags/20070517_before_mds_merge/common/ThreadPool.h deleted file mode 100644 index 674053bfe1087..0000000000000 --- a/tags/20070517_before_mds_merge/common/ThreadPool.h +++ /dev/null @@ -1,138 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef THREADPOOL -#define THREADPOOL - -#include -using namespace std; - - -#include -#include -#include -#include - - -// debug output -#include "config.h" -#define tpdout(x) if (x <= g_conf.debug) cout << myname -#define DBLVL 15 - - -using namespace std; - -#define MAX_THREADS 1000 - -template -class ThreadPool { - - private: - list q; - Mutex q_lock; - Semaphore q_sem; - - int num_ops; - int num_threads; - vector thread; - - U u; - void (*func)(U,T); - void (*prefunc)(U,T); - string myname; - - static void *foo(void *arg) - { - ThreadPool *t = (ThreadPool *)arg; - t->do_ops(arg); - return 0; - } - - void *do_ops(void *nothing) - { - tpdout(DBLVL) << ".do_ops thread " << pthread_self() << " starting" << endl; - while (1) { - q_sem.Get(); - if (q.empty()) break; - - T op = get_op(); - tpdout(DBLVL) << ".func thread "<< pthread_self() << " on " << op << endl; - func(u, op); - } - tpdout(DBLVL) << ".do_ops thread " << pthread_self() << " exiting" << endl; - return 0; - } - - - T get_op() - { - T op; - q_lock.Lock(); - { - op = q.front(); - q.pop_front(); - num_ops--; - - if (prefunc && op) { - tpdout(DBLVL) << ".prefunc 
thread "<< pthread_self() << " on " << op << endl; - prefunc(u, op); - } - } - q_lock.Unlock(); - - return op; - } - - public: - - ThreadPool(char *myname, int howmany, void (*f)(U,T), U obj, void (*pf)(U,T) = 0) : - num_ops(0), num_threads(howmany), - thread(num_threads), - u(obj), - func(f), prefunc(pf), - myname(myname) { - tpdout(DBLVL) << ".cons num_threads " << num_threads << endl; - - // start threads - int status; - for(int i = 0; i < howmany; i++) { - status = pthread_create(&thread[i], NULL, (void*(*)(void *))&ThreadPool::foo, this); - assert(status == 0); - } - } - - ~ThreadPool() { - // bump sem to make threads exit cleanly - for(int i = 0; i < num_threads; i++) - q_sem.Put(); - - // wait for them to die - for(int i = 0; i < num_threads; i++) { - tpdout(DBLVL) << ".des joining thread " << thread[i] << endl; - void *rval = 0; // we don't actually care - pthread_join(thread[i], &rval); - } - } - - void put_op(T op) { - tpdout(DBLVL) << ".put_op " << op << endl; - q_lock.Lock(); - q.push_back(op); - num_ops++; - q_sem.Put(); - q_lock.Unlock(); - } - -}; -#endif diff --git a/tags/20070517_before_mds_merge/common/Timer.cc b/tags/20070517_before_mds_merge/common/Timer.cc deleted file mode 100644 index 522a623d5ebac..0000000000000 --- a/tags/20070517_before_mds_merge/common/Timer.cc +++ /dev/null @@ -1,333 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - - -#include "Timer.h" -#include "Cond.h" - -#include "config.h" -#include "include/Context.h" - -#undef dout -#define dout(x) if (x <= g_conf.debug) cout << g_clock.now() << " TIMER " -#define derr(x) if (x <= g_conf.debug) cerr << g_clock.now() << " TIMER " - -#define DBL 10 - -#include -#include -#include - -// single global instance -Timer g_timer; - - - -/**** thread solution *****/ - -bool Timer::get_next_due(utime_t& when) -{ - if (scheduled.empty()) { - dout(10) << "get_next_due - nothing scheduled" << endl; - return false; - } else { - map< utime_t, set >::iterator it = scheduled.begin(); - when = it->first; - dout(10) << "get_next_due - " << when << endl; - return true; - } -} - - -void Timer::timer_entry() -{ - lock.Lock(); - - while (!thread_stop) { - - // now - utime_t now = g_clock.now(); - - // any events due? - utime_t next; - bool next_due = get_next_due(next); - - if (next_due && now >= next) { - // move to pending list - list pending; - - map< utime_t, set >::iterator it = scheduled.begin(); - while (it != scheduled.end()) { - if (it->first > now) break; - - utime_t t = it->first; - dout(DBL) << "queueing event(s) scheduled at " << t << endl; - - for (set::iterator cit = it->second.begin(); - cit != it->second.end(); - cit++) { - pending.push_back(*cit); - event_times.erase(*cit); - num_event--; - } - - map< utime_t, set >::iterator previt = it; - it++; - scheduled.erase(previt); - } - - if (!pending.empty()) { - sleeping = false; - lock.Unlock(); - { - // make sure we're not holding any locks while we do callbacks - // make the callbacks myself. 
- for (list::iterator cit = pending.begin(); - cit != pending.end(); - cit++) { - dout(DBL) << "start callback " << *cit << endl; - (*cit)->finish(0); - dout(DBL) << "finish callback " << *cit << endl; - delete *cit; - } - pending.clear(); - assert(pending.empty()); - } - lock.Lock(); - } - - } - else { - // sleep - if (next_due) { - dout(DBL) << "sleeping until " << next << endl; - timed_sleep = true; - sleeping = true; - timeout_cond.WaitUntil(lock, next); // wait for waker or time - utime_t now = g_clock.now(); - dout(DBL) << "kicked or timed out at " << now << endl; - } else { - dout(DBL) << "sleeping" << endl; - timed_sleep = false; - sleeping = true; - sleep_cond.Wait(lock); // wait for waker - utime_t now = g_clock.now(); - dout(DBL) << "kicked at " << now << endl; - } - } - } - - lock.Unlock(); -} - - - -/** - * Timer bits - */ - -void Timer::register_timer() -{ - if (timer_thread.is_started()) { - if (sleeping) { - dout(DBL) << "register_timer kicking thread" << endl; - if (timed_sleep) - timeout_cond.SignalAll(); - else - sleep_cond.SignalAll(); - } else { - dout(DBL) << "register_timer doing nothing; thread is awake" << endl; - // it's probably doing callbacks. 
- } - } else { - dout(DBL) << "register_timer starting thread" << endl; - timer_thread.create(); - } -} - -void Timer::cancel_timer() -{ - // clear my callback pointers - if (timer_thread.is_started()) { - dout(10) << "setting thread_stop flag" << endl; - lock.Lock(); - thread_stop = true; - if (timed_sleep) - timeout_cond.SignalAll(); - else - sleep_cond.SignalAll(); - lock.Unlock(); - - dout(10) << "waiting for thread to finish" << endl; - void *ptr; - timer_thread.join(&ptr); - - dout(10) << "thread finished, exit code " << ptr << endl; - } -} - - -/* - * schedule - */ - - -void Timer::add_event_after(float seconds, - Context *callback) -{ - utime_t when = g_clock.now(); - when.sec_ref() += (int)seconds; - add_event_at(when, callback); -} - -void Timer::add_event_at(utime_t when, - Context *callback) -{ - lock.Lock(); - - dout(DBL) << "add_event " << callback << " at " << when << endl; - - // insert - scheduled[when].insert(callback); - assert(event_times.count(callback) == 0); - event_times[callback] = when; - - num_event++; - - // make sure i wake up on time - register_timer(); - - lock.Unlock(); -} - -bool Timer::cancel_event(Context *callback) -{ - lock.Lock(); - - dout(DBL) << "cancel_event " << callback << endl; - - if (!event_times.count(callback)) { - dout(DBL) << "cancel_event " << callback << " isn't scheduled (probably executing)" << endl; - lock.Unlock(); - return false; // wasn't scheduled. - } - - utime_t tp = event_times[callback]; - event_times.erase(callback); - - assert(scheduled.count(tp)); - assert(scheduled[tp].count(callback)); - scheduled[tp].erase(callback); - if (scheduled[tp].empty()) - scheduled.erase(tp); - - lock.Unlock(); - - // delete the canceled event. 
- delete callback; - - return true; -} - - -// ------------------------------- - -void SafeTimer::add_event_after(float seconds, Context *c) -{ - assert(lock.is_locked()); - Context *w = new EventWrapper(this, c); - dout(DBL) << "SafeTimer.add_event_after wrapping " << c << " with " << w << endl; - scheduled[c] = w; - g_timer.add_event_after(seconds, w); -} - -void SafeTimer::add_event_at(utime_t when, Context *c) -{ - assert(lock.is_locked()); - Context *w = new EventWrapper(this, c); - dout(DBL) << "SafeTimer.add_event_at wrapping " << c << " with " << w << endl; - scheduled[c] = w; - g_timer.add_event_at(when, w); -} - -void SafeTimer::EventWrapper::finish(int r) -{ - timer->lock.Lock(); - if (timer->scheduled.count(actual)) { - // still scheduled. execute. - actual->finish(r); - timer->scheduled.erase(actual); - } else { - // i was canceled. - assert(timer->canceled.count(actual)); - } - - // did i get canceled? - // (this can happen even if i just executed above. e.g., i may have canceled myself.) - if (timer->canceled.count(actual)) { - timer->canceled.erase(actual); - timer->cond.Signal(); - } - - // delete the original event - delete actual; - - timer->lock.Unlock(); -} - -void SafeTimer::cancel_event(Context *c) -{ - assert(lock.is_locked()); - assert(scheduled.count(c)); - - if (g_timer.cancel_event(scheduled[c])) { - // hosed wrapper. hose original event too. - delete c; - } else { - // clean up later. 
- canceled[c] = scheduled[c]; - } - scheduled.erase(c); -} - -void SafeTimer::cancel_all() -{ - assert(lock.is_locked()); - - while (!scheduled.empty()) - cancel_event(scheduled.begin()->first); -} - -void SafeTimer::join() -{ - assert(lock.is_locked()); - assert(scheduled.empty()); - - while (!canceled.empty()) { - // wait - dout(-10) << "SafeTimer.join waiting for " << canceled.size() << " to join" << endl; - dout(-10) << canceled << endl; - cond.Wait(lock); - } -} - -SafeTimer::~SafeTimer() -{ - if (!scheduled.empty() && !canceled.empty()) { - derr(0) << "SafeTimer.~SafeTimer " << scheduled.size() << " events scheduled, " - << canceled.size() << " canceled but unflushed" - << endl; - } -} diff --git a/tags/20070517_before_mds_merge/common/Timer.h b/tags/20070517_before_mds_merge/common/Timer.h deleted file mode 100644 index 88d9929ac5ae1..0000000000000 --- a/tags/20070517_before_mds_merge/common/Timer.h +++ /dev/null @@ -1,177 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __TIMER_H -#define __TIMER_H - -#include "include/types.h" -#include "include/Context.h" -#include "Clock.h" - -#include "Mutex.h" -#include "Cond.h" -#include "Thread.h" - -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - - -/*** Timer - * schedule callbacks - */ - -//class Messenger; - - -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const Context *p) const { - static hash H; - return H((unsigned long)p); - } - }; -} - - -class Timer { - private: - map< utime_t, set > scheduled; // time -> (context ...) 
- hash_map< Context*, utime_t > event_times; // event -> time - - // get time of the next event - //Context* get_next_scheduled(utime_t& when); - - bool get_next_due(utime_t &when); - - void register_timer(); // make sure i get a callback - void cancel_timer(); // make sure i get a callback - - //pthread_t thread_id; - bool thread_stop; - Mutex lock; - bool timed_sleep; - bool sleeping; - Cond sleep_cond; - Cond timeout_cond; - - public: - void timer_entry(); // waiter thread (that wakes us up) - - class TimerThread : public Thread { - Timer *t; - public: - void *entry() { - t->timer_entry(); - return 0; - } - TimerThread(Timer *_t) : t(_t) {} - } timer_thread; - - - int num_event; - - - public: - Timer() : - thread_stop(false), - timed_sleep(false), - sleeping(false), - timer_thread(this), - num_event(0) - { - } - ~Timer() { - // stop. - cancel_timer(); - - // scheduled - for (map< utime_t, set >::iterator it = scheduled.begin(); - it != scheduled.end(); - it++) { - for (set::iterator sit = it->second.begin(); - sit != it->second.end(); - sit++) - delete *sit; - } - scheduled.clear(); - } - - void init() { - register_timer(); - } - void shutdown() { - cancel_timer(); - } - - // schedule events - void add_event_after(float seconds, - Context *callback); - void add_event_at(utime_t when, - Context *callback); - bool cancel_event(Context *callback); - - // execute pending events - void execute_pending(); - -}; - - -/* - * SafeTimer is a wrapper around the raw Timer (or rather, g_timer, it's global - * instantiation) that protects event execution with an existing mutex. It - * provides for, among other things, reliable event cancellation on class - * destruction. The caller just needs to cancel each event (or cancel_all()), - * and then call join() to ensure any concurrently exectuting events (in other - * threads) get flushed. 
- */ -class SafeTimer { - Mutex& lock; - Cond cond; - map scheduled; // actual -> wrapper - map canceled; - - class EventWrapper : public Context { - SafeTimer *timer; - Context *actual; - public: - EventWrapper(SafeTimer *st, Context *c) : timer(st), - actual(c) {} - void finish(int r); - }; - -public: - SafeTimer(Mutex& l) : lock(l) { } - ~SafeTimer(); - - void add_event_after(float seconds, Context *c); - void add_event_at(utime_t when, Context *c); - void cancel_event(Context *c); - void cancel_all(); - void join(); - - int get_num_scheduled() { return scheduled.size(); } - int get_num_canceled() { return canceled.size(); } -}; - - -// single global instance -extern Timer g_timer; - - - -#endif diff --git a/tags/20070517_before_mds_merge/config.cc b/tags/20070517_before_mds_merge/config.cc deleted file mode 100644 index 6820ffa327b9f..0000000000000 --- a/tags/20070517_before_mds_merge/config.cc +++ /dev/null @@ -1,838 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#include "config.h" -#include "include/types.h" - -//#define MDS_CACHE_SIZE 4*10000 -> <20mb -//#define MDS_CACHE_SIZE 80000 62mb - -#define AVG_PER_INODE_SIZE 450 -#define MDS_CACHE_MB_TO_INODES(x) ((x)*1000000/AVG_PER_INODE_SIZE) - -//#define MDS_CACHE_SIZE MDS_CACHE_MB_TO_INODES( 50 ) -//#define MDS_CACHE_SIZE 1500000 -#define MDS_CACHE_SIZE 150000 - - -// hack hack hack ugly FIXME -#include "common/Mutex.h" -long buffer_total_alloc = 0; -Mutex bufferlock; - -Mutex _dout_lock; - - -FileLayout g_OSD_FileLayout( 1<<20, 1, 1<<20, 2 ); // stripe over 1M objects, 2x replication -//FileLayout g_OSD_FileLayout( 1<<17, 4, 1<<20 ); // 128k stripes over sets of 4 - -// ?? -//FileLayout g_OSD_MDDirLayout( 1<<8, 1<<2, 1<<19, 3 ); // this is stupid, but can bring out an ebofs table bug? -FileLayout g_OSD_MDDirLayout( 1<<20, 1, 1<<20, 2 ); // 1M objects, 2x replication - -// stripe mds log over 128 byte bits (see mds_log_pad_entry below to match!) -FileLayout g_OSD_MDLogLayout( 1<<20, 1, 1<<20, 2 ); // 1M objects -//FileLayout g_OSD_MDLogLayout( 1<<8, 1<<2, 1<<19, 3 ); // 256 byte bits -//FileLayout g_OSD_MDLogLayout( 1<<7, 32, 1<<20, 3 ); // 128 byte stripes over 32 1M objects -//FileLayout g_OSD_MDLogLayout( 57, 32, 1<<20 ); // pathological case to test striping buffer mapping -//FileLayout g_OSD_MDLogLayout( 1<<20, 1, 1<<20 ); // old way - -// fake osd failures: osd -> time -std::map g_fake_osd_down; -std::map g_fake_osd_out; - -entity_addr_t g_my_addr; - -md_config_t g_debug_after_conf; - -md_config_t g_conf = { - num_mon: 1, - num_mds: 1, - num_osd: 4, - num_client: 1, - - mkfs: false, - - // profiling and debugging - log: true, - log_interval: 1, - log_name: (char*)0, - - log_messages: true, - log_pins: true, - - fake_clock: false, - fakemessenger_serialize: true, - - fake_osdmap_expand: 0, - fake_osdmap_updates: 0, - fake_osd_mttf: 0, - fake_osd_mttr: 0, - - osd_remount_at: 0, - - kill_after: 0, - - tick: 0, - - debug: 0, - debug_mds: 1, - 
debug_mds_balancer: 1, - debug_mds_log: 1, - debug_buffer: 0, - debug_filer: 0, - debug_objecter: 0, - debug_objectcacher: 0, - debug_client: 0, - debug_osd: 0, - debug_ebofs: 1, - debug_bdev: 1, // block device - debug_ns: 0, - debug_ms: 0, - debug_mon: 0, - - debug_after: 0, - - // -- misc -- - use_abspaths: false, // make monitorstore et al use absolute path (to workaround FUSE chdir("/")) - - // --- clock --- - clock_lock: false, - clock_tare: true, - - // --- messenger --- - ms_single_dispatch: false, - ms_requeue_on_sender_fail: false, - - ms_stripe_osds: false, - ms_skip_rank0: false, - ms_overlay_clients: false, - - ms_die_on_failure: false, - - /*tcp_skip_rank0: false, - tcp_overlay_clients: false, // over osds! - tcp_log: false, - tcp_serial_marshall: true, - tcp_serial_out: false, - tcp_multi_out: true, - tcp_multi_dispatch: false, // not fully implemented yet - */ - - // --- mon --- - mon_tick_interval: 5, - mon_osd_down_out_interval: 5, // seconds - mon_lease: 2.000, // seconds - mon_stop_with_last_mds: true, - - // --- client --- - client_cache_size: 300, - client_cache_mid: .5, - client_cache_stat_ttl: 0, // seconds until cached stat results become invalid - client_cache_readdir_ttl: 1, // 1 second only - client_use_random_mds: false, - - client_sync_writes: 0, - - client_oc: true, - client_oc_size: 1024*1024* 5, // MB * n - client_oc_max_dirty: 1024*1024* 5, // MB * n - client_oc_max_sync_write: 128*1024, // writes >= this use wrlock - - client_trace: 0, - fuse_direct_io: 0, - - // --- objecter --- - objecter_buffer_uncommitted: true, - - // --- journaler --- - journaler_allow_split_entries: true, - - // --- mds --- - mds_cache_size: MDS_CACHE_SIZE, - mds_cache_mid: .7, - - mds_decay_halflife: 30, - - mds_beacon_interval: 5.0, - mds_beacon_grace: 100.0, - - mds_log: true, - mds_log_max_len: MDS_CACHE_SIZE / 3, - mds_log_max_trimming: 10000, - mds_log_read_inc: 1<<20, - mds_log_pad_entry: 128,//256,//64, - mds_log_before_reply: true, - 
mds_log_flush_on_shutdown: true, - mds_log_import_map_interval: 1024*1024, // frequency (in bytes) of EImportMap in log - mds_bal_replicate_threshold: 2000, - mds_bal_unreplicate_threshold: 0,//500, - mds_bal_hash_rd: 10000, - mds_bal_unhash_rd: 1000, - mds_bal_hash_wr: 10000, - mds_bal_unhash_wr: 1000, - mds_bal_interval: 30, // seconds - mds_bal_hash_interval: 5, // seconds - mds_bal_idle_threshold: .1, - mds_bal_max: -1, - mds_bal_max_until: -1, - - mds_bal_mode: 0, - mds_bal_min_start: .2, // if we need less than this, we don't do anything - mds_bal_need_min: .8, // take within this range of what we need - mds_bal_need_max: 1.2, - mds_bal_midchunk: .3, // any sub bigger than this taken in full - mds_bal_minchunk: .001, // never take anything smaller than this - - mds_commit_on_shutdown: true, - mds_shutdown_check: 0, //30, - mds_shutdown_on_last_unmount: true, - - mds_verify_export_dirauth: true, - - mds_local_osd: false, - - - // --- osd --- - osd_rep: OSD_REP_PRIMARY, - osd_balance_reads: false, - osd_pg_bits: 0, // 0 == let osdmonitor decide - osd_object_layout: OBJECT_LAYOUT_HASHINO, - osd_pg_layout: PG_LAYOUT_CRUSH, - osd_max_rep: 4, - osd_maxthreads: 2, // 0 == no threading - osd_max_opq: 10, - osd_mkfs: false, - osd_age: .8, - osd_age_time: 0, - osd_heartbeat_interval: 5, // shut up while i'm debugging - osd_replay_window: 5, - osd_max_pull: 2, - osd_pad_pg_log: false, - - // --- fakestore --- - fakestore_fake_sync: 2, // 2 seconds - fakestore_fsync: false,//true, - fakestore_writesync: false, - fakestore_syncthreads: 4, - fakestore_fake_attrs: false, - fakestore_fake_collections: false, - fakestore_dev: 0, - - // --- ebofs --- - ebofs: 1, - ebofs_cloneable: false, - ebofs_verify: false, - ebofs_commit_ms: 2000, // 0 = no forced commit timeout (for debugging/tracing) - ebofs_idle_commit_ms: 100, // 0 = no idle detection. 
use this -or- bdev_idle_kick_after_ms - ebofs_oc_size: 10000, // onode cache - ebofs_cc_size: 10000, // cnode cache - ebofs_bc_size: (80 *256), // 4k blocks, *256 for MB - ebofs_bc_max_dirty: (60 *256), // before write() will block - ebofs_max_prefetch: 1000, // 4k blocks - ebofs_realloc: true, - - ebofs_abp_zero: false, // zero newly allocated buffers (may shut up valgrind) - ebofs_abp_max_alloc: 4096*16, // max size of new buffers (larger -> more memory fragmentation) - - // --- obfs --- - uofs: 0, - uofs_fake_sync: 2, // 2 seconds - uofs_cache_size: 1 << 28, //256MB - uofs_onode_size: (int)1024, - uofs_small_block_size: (int)4096, //4KB - uofs_large_block_size: (int)524288, //512KB - uofs_segment_size: (int)268435456, //256MB - uofs_block_meta_ratio: (int)10, - uofs_sync_write: (int)0, - uofs_nr_hash_buckets: (int)1023, - uofs_flush_interval: (int)5, //seconds - uofs_min_flush_pages: (int)1024, //4096 4k-pages - uofs_delay_allocation: (int)1, //true - - // --- block device --- - bdev_lock: true, - bdev_iothreads: 1, // number of ios to queue with kernel - bdev_idle_kick_after_ms: 0,//100, // ms ** FIXME ** this seems to break things, not sure why yet ** - bdev_el_fw_max_ms: 10000, // restart elevator at least once every 1000 ms - bdev_el_bw_max_ms: 3000, // restart elevator at least once every 300 ms - bdev_el_bidir: true, // bidirectional elevator? 
- bdev_iov_max: 512, // max # iov's to collect into a single readv()/writev() call - bdev_debug_check_io_overlap: true, // [DEBUG] check for any pending io overlaps - bdev_fake_mb: 0, - bdev_fake_max_mb: 0, - - // --- fakeclient (mds regression testing) (ancient history) --- - num_fakeclient: 100, - fakeclient_requests: 100, - fakeclient_deterministic: false, - - fakeclient_op_statfs: false, - - // loosely based on Roselli workload paper numbers - fakeclient_op_stat: 610, - fakeclient_op_lstat: false, - fakeclient_op_utime: 0, - fakeclient_op_chmod: 1, - fakeclient_op_chown: 1, - - fakeclient_op_readdir: 2, - fakeclient_op_mknod: 30, - fakeclient_op_link: false, - fakeclient_op_unlink: 20, - fakeclient_op_rename: 0,//40, - - fakeclient_op_mkdir: 10, - fakeclient_op_rmdir: 20, - fakeclient_op_symlink: 20, - - fakeclient_op_openrd: 200, - fakeclient_op_openwr: 0, - fakeclient_op_openwrc: 0, - fakeclient_op_read: false, // osd! - fakeclient_op_write: false, // osd! - fakeclient_op_truncate: false, - fakeclient_op_fsync: false, - fakeclient_op_close: 200 - -#ifdef USE_OSBDB - , - bdbstore: false, - debug_bdbstore: 1, - bdbstore_btree: false, - bdbstore_ffactor: 0, - bdbstore_nelem: 0, - bdbstore_pagesize: 0, - bdbstore_cachesize: 0, - bdbstore_transactional: false -#endif // USE_OSBDB -}; - - -#include -#include - - -void env_to_vec(std::vector& args) -{ - const char *p = getenv("CEPH_ARGS"); - if (!p) return; - - static char buf[1000]; - int len = strlen(p); - memcpy(buf, p, len); - buf[len] = 0; - //cout << "CEPH_ARGS " << buf << endl; - - int l = 0; - for (int i=0; i& args) -{ - for (int i=1; i& args, - int& argc, char **&argv) -{ - argv = (char**)malloc(sizeof(char*) * argc); - argc = 1; - argv[0] = "asdf"; - - for (unsigned i=0; i= '0' && *s <= '9') { - int digit = *s - '0'; - //cout << "digit " << digit << endl; - val *= 10; - val += digit; - numdigits++; - s++; off++; - } - //cout << "val " << val << endl; - - if (numdigits == 0) { - cerr << "no digits at off " 
<< off << endl; - return false; // no digits - } - if (count < 3 && *s != '.') { - cerr << "should period at " << off << endl; - return false; // should have 3 periods - } - if (count == 3 && *s != ':') { - cerr << "expected : at " << off << endl; - return false; // then a colon - } - s++; off++; - - if (count <= 3) - a.ipq[count] = val; - else - a.port = val; - - count++; - if (count == 5) break; - } - - return true; -} - - - -void parse_config_options(std::vector& args) -{ - std::vector nargs; - - for (unsigned i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __CONFIG_H -#define __CONFIG_H - -extern class FileLayout g_OSD_FileLayout; -extern class FileLayout g_OSD_MDDirLayout; -extern class FileLayout g_OSD_MDLogLayout; - -#include -#include - -#include "common/Mutex.h" - -extern std::map g_fake_osd_down; -extern std::map g_fake_osd_out; - -#define OSD_REP_PRIMARY 0 -#define OSD_REP_SPLAY 1 -#define OSD_REP_CHAIN 2 - - -#include "msg/msg_types.h" - -extern entity_addr_t g_my_addr; - -struct md_config_t { - int num_mon; - int num_mds; - int num_osd; - int num_client; - - bool mkfs; - - // profiling - bool log; - int log_interval; - char *log_name; - - bool log_messages; - bool log_pins; - - bool fake_clock; - bool fakemessenger_serialize; - - int fake_osdmap_expand; - int fake_osdmap_updates; - int fake_osd_mttf; - int fake_osd_mttr; - - int osd_remount_at; - - int kill_after; - - int tick; - - int debug; - int debug_mds; - int debug_mds_balancer; - int debug_mds_log; - int debug_buffer; - int debug_filer; - int debug_objecter; - int debug_objectcacher; - int debug_client; - int debug_osd; - int debug_ebofs; - int debug_bdev; - int debug_ns; - int debug_ms; - int debug_mon; - - int debug_after; - - // misc - bool use_abspaths; - - // clock - bool clock_lock; - bool 
clock_tare; - - // messenger - - /*bool tcp_skip_rank0; - bool tcp_overlay_clients; - bool tcp_log; - bool tcp_serial_marshall; - bool tcp_serial_out; - bool tcp_multi_out; - bool tcp_multi_dispatch; - */ - - bool ms_single_dispatch; - bool ms_requeue_on_sender_fail; - - bool ms_stripe_osds; - bool ms_skip_rank0; - bool ms_overlay_clients; - bool ms_die_on_failure; - - // mon - int mon_tick_interval; - int mon_osd_down_out_interval; - float mon_lease; - bool mon_stop_with_last_mds; - - // client - int client_cache_size; - float client_cache_mid; - int client_cache_stat_ttl; - int client_cache_readdir_ttl; - bool client_use_random_mds; // debug flag - - bool client_sync_writes; - - bool client_oc; - int client_oc_size; - int client_oc_max_dirty; - size_t client_oc_max_sync_write; - - - - /* - bool client_bcache; - int client_bcache_alloc_minsize; - int client_bcache_alloc_maxsize; - int client_bcache_ttl; - off_t client_bcache_size; - int client_bcache_lowater; - int client_bcache_hiwater; - size_t client_bcache_align; - */ - - int client_trace; - int fuse_direct_io; - - // objecter - bool objecter_buffer_uncommitted; - - // journaler - bool journaler_allow_split_entries; - - // mds - int mds_cache_size; - float mds_cache_mid; - - float mds_decay_halflife; - - float mds_beacon_interval; - float mds_beacon_grace; - - bool mds_log; - int mds_log_max_len; - int mds_log_max_trimming; - int mds_log_read_inc; - int mds_log_pad_entry; - bool mds_log_before_reply; - bool mds_log_flush_on_shutdown; - off_t mds_log_import_map_interval; - - float mds_bal_replicate_threshold; - float mds_bal_unreplicate_threshold; - float mds_bal_hash_rd; - float mds_bal_unhash_rd; - float mds_bal_hash_wr; - float mds_bal_unhash_wr; - int mds_bal_interval; - int mds_bal_hash_interval; - float mds_bal_idle_threshold; - int mds_bal_max; - int mds_bal_max_until; - - int mds_bal_mode; - float mds_bal_min_start; - float mds_bal_need_min; - float mds_bal_need_max; - float mds_bal_midchunk; - float 
mds_bal_minchunk; - - bool mds_commit_on_shutdown; - int mds_shutdown_check; - bool mds_shutdown_on_last_unmount; - bool mds_verify_export_dirauth; // debug flag - - bool mds_local_osd; - - - // osd - int osd_rep; - bool osd_balance_reads; - int osd_pg_bits; - int osd_object_layout; - int osd_pg_layout; - int osd_max_rep; - int osd_maxthreads; - int osd_max_opq; - bool osd_mkfs; - float osd_age; - int osd_age_time; - int osd_heartbeat_interval; - int osd_replay_window; - int osd_max_pull; - bool osd_pad_pg_log; - - int fakestore_fake_sync; - bool fakestore_fsync; - bool fakestore_writesync; - int fakestore_syncthreads; // such crap - bool fakestore_fake_attrs; - bool fakestore_fake_collections; - char *fakestore_dev; - - // ebofs - int ebofs; - bool ebofs_cloneable; - bool ebofs_verify; - int ebofs_commit_ms; - int ebofs_idle_commit_ms; - int ebofs_oc_size; - int ebofs_cc_size; - off_t ebofs_bc_size; - off_t ebofs_bc_max_dirty; - unsigned ebofs_max_prefetch; - bool ebofs_realloc; - - bool ebofs_abp_zero; - size_t ebofs_abp_max_alloc; - - int uofs; - int uofs_fake_sync; - int uofs_cache_size; - int uofs_onode_size; - int uofs_small_block_size; - int uofs_large_block_size; - int uofs_segment_size; - int uofs_block_meta_ratio; - int uofs_sync_write; - - int uofs_nr_hash_buckets; - int uofs_flush_interval; - int uofs_min_flush_pages; - int uofs_delay_allocation; - - // block device - bool bdev_lock; - int bdev_iothreads; - int bdev_idle_kick_after_ms; - int bdev_el_fw_max_ms; - int bdev_el_bw_max_ms; - bool bdev_el_bidir; - int bdev_iov_max; - bool bdev_debug_check_io_overlap; - int bdev_fake_mb; - int bdev_fake_max_mb; - - // fake client - int num_fakeclient; - unsigned fakeclient_requests; - bool fakeclient_deterministic; // debug flag - - int fakeclient_op_statfs; - - int fakeclient_op_stat; - int fakeclient_op_lstat; - int fakeclient_op_utime; - int fakeclient_op_chmod; - int fakeclient_op_chown; - - int fakeclient_op_readdir; - int fakeclient_op_mknod; - int 
fakeclient_op_link; - int fakeclient_op_unlink; - int fakeclient_op_rename; - - int fakeclient_op_mkdir; - int fakeclient_op_rmdir; - int fakeclient_op_symlink; - - int fakeclient_op_openrd; - int fakeclient_op_openwr; - int fakeclient_op_openwrc; - int fakeclient_op_read; - int fakeclient_op_write; - int fakeclient_op_truncate; - int fakeclient_op_fsync; - int fakeclient_op_close; - -#ifdef USE_OSBDB - bool bdbstore; - int debug_bdbstore; - bool bdbstore_btree; - int bdbstore_ffactor; - int bdbstore_nelem; - int bdbstore_pagesize; - int bdbstore_cachesize; - bool bdbstore_transactional; -#endif // USE_OSBDB -}; - -extern md_config_t g_conf; -extern md_config_t g_debug_after_conf; - - -/** - * debug output framework - */ -#define dout(x) if ((x) <= g_conf.debug) std::cout -#define dout2(x) if ((x) <= g_conf.debug) std::cout - -/** - * for cleaner output, bracket each line with - * dbeginl (in the dout macro) and dendl (in place of endl). - */ -extern Mutex _dout_lock; -struct _dbeginl_t { - _dbeginl_t(int) {} -}; -struct _dendl_t { - _dendl_t(int) {} -}; -static const _dbeginl_t dbeginl = 0; -static const _dendl_t dendl = 0; - -inline ostream& operator<<(ostream& out, _dbeginl_t) { - _dout_lock.Lock(); - return out; -} -inline ostream& operator<<(ostream& out, _dendl_t) { - out << endl; - _dout_lock.Unlock(); - return out; -} - - -/** - * command line / environment argument parsing - */ -void env_to_vec(std::vector& args); -void argv_to_vec(int argc, char **argv, - std::vector& args); -void vec_to_argv(std::vector& args, - int& argc, char **&argv); - -void parse_config_options(std::vector& args); - -extern bool parse_ip_port(const char *s, entity_addr_t& addr); - - - -#endif diff --git a/tags/20070517_before_mds_merge/cosd.cc b/tags/20070517_before_mds_merge/cosd.cc deleted file mode 100644 index ff1e24b63b905..0000000000000 --- a/tags/20070517_before_mds_merge/cosd.cc +++ /dev/null @@ -1,126 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; 
indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include -#include -#include - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" - -#include "osd/OSD.h" -#include "ebofs/Ebofs.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << endl; - exit(1); - } -}; - -class C_Debug : public Context { - public: - void finish(int) { - int size = &g_conf.debug_after - &g_conf.debug; - memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size); - dout(0) << "debug_after flipping debug settings" << endl; - } -}; - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - parse_config_options(args); - - if (g_conf.kill_after) - g_timer.add_event_after(g_conf.kill_after, new C_Die); - if (g_conf.debug_after) - g_timer.add_event_after(g_conf.debug_after, new C_Debug); - - if (g_conf.clock_tare) g_clock.tare(); - - // osd specific args - char *dev; - int whoami = -1; - for (unsigned i=0; imount(); - int r = store->read(object_t(0,0), 0, sizeof(sb), bl); - if (r < 0) { - cerr << "couldn't read superblock object on " << dev << endl; - exit(0); - } - bl.copy(0, sizeof(sb), (char*)&sb); - store->umount(); - delete store; - whoami = sb.whoami; - - cout << "osd fs says i am osd" << whoami << endl; - } else { - cout << "command line arg says i am osd" << whoami << endl; - } - - // load monmap - MonMap monmap; - int r = monmap.read(".ceph_monmap"); - assert(r >= 0); - - // start up network - rank.start_rank(); - - // start osd - Messenger *m = rank.register_entity(MSG_ADDR_OSD(whoami)); - assert(m); - OSD *osd = 
new OSD(whoami, m, &monmap, dev); - osd->init(); - - // wait - rank.wait(); - - // done - delete osd; - - return 0; -} - diff --git a/tags/20070517_before_mds_merge/crush/BinaryTree.h b/tags/20070517_before_mds_merge/crush/BinaryTree.h deleted file mode 100644 index f13f3f1e565ef..0000000000000 --- a/tags/20070517_before_mds_merge/crush/BinaryTree.h +++ /dev/null @@ -1,284 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __crush_BINARYTREE_H -#define __crush_BINARYTREE_H - -#include -#include -#include -#include -//#include -using namespace std; - -#include "include/buffer.h" - -namespace crush { - - class BinaryTree { - private: - // tree def - int root_node; // 0 for empty tree. - int alloc; - vector node_nested; // all existing nodes in this map - vector node_weight; // and this one - vector node_complete; // only nodes with all possible children - - public: - BinaryTree() : root_node(0), alloc(0) {} - - void _encode(bufferlist& bl) { - bl.append((char*)&root_node, sizeof(root_node)); - bl.append((char*)&alloc, sizeof(alloc)); - ::_encode(node_nested, bl); - ::_encode(node_weight, bl); - ::_encode(node_complete, bl); - } - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(root_node), (char*)&root_node); - off += sizeof(root_node); - bl.copy(off, sizeof(alloc), (char*)&alloc); - off += sizeof(alloc); - ::_decode(node_nested, bl, off); - ::_decode(node_weight, bl, off); - ::_decode(node_complete, bl, off); - } - - // accessors - bool empty() const { return root_node == 0; } - bool exists(int n) const { return n < alloc && node_nested[n]; } - int nested(int n) const { return exists(n) ? 
node_nested[n]:0; } - float weight(int n) const { return exists(n) ? node_weight[n]:0; } - bool complete(int n) const { return exists(n) ? node_complete[n]:false; } - - int root() const { return root_node; } - - void realloc(int n) { - /* - while (alloc <= n) { - node_nested.push_back(0); - node_weight.push_back(0); - node_complete.push_back(0); - alloc++; - } - */ - if (alloc <= n) { - int add = n - alloc + 1; - node_nested.insert(node_nested.end(), add, 0); - node_weight.insert(node_weight.end(), add, 0); - node_complete.insert(node_complete.end(), add, 0); - alloc = n+1; - } - } - - // tree navigation - bool terminal(int n) const { return n & 1; } // odd nodes are leaves. - int height(int n) const { - assert(n); - int h = 0; - while ((n & 1) == 0) { - assert(n > 0); - h++; n = n >> 1; - } - return h; - } - int left(int n) const { - int h = height(n); - //cout << "left of " << n << " is " << (n - (1 << h)) << endl; - return n - (1 << (h-1)); - } - int right(int n) const { - int h = height(n); - //cout << "right of " << n << " is " << (n + (1 << h)) << endl; - return n + (1 << (h-1)); - } - bool on_right(int n, int h = -1) const { - if (h < 0) h = height(n); - return n & (1 << (h+1)); - } - bool on_left(int n) const { return !on_right(n); } - int parent(int n) const { - int h = height(n); - if (on_right(n, h)) - return n - (1<0; t--) out << " "; - if (tree.root() == n) - out << "root "; - else { - if (tree.on_left(n)) - out << "left "; - else - out << "right "; - } - out << n << " : nested " << tree.nested(n) << " weight " << tree.weight(n); - if (tree.complete(n)) out << " complete"; - out << endl; - if (!tree.terminal(n)) { - if (tree.exists(tree.left(n))) - print_binary_tree_node(out, tree, tree.left(n), i+2); - if (tree.exists(tree.right(n))) - print_binary_tree_node(out, tree, tree.right(n), i+2); - } - } - - inline ostream& operator<<(ostream& out, const BinaryTree& tree) { - if (tree.empty()) - return out << "tree is empty"; - print_binary_tree_node(out, 
tree, tree.root(), 0); - return out; - } - -} - -#endif diff --git a/tags/20070517_before_mds_merge/crush/Bucket.h b/tags/20070517_before_mds_merge/crush/Bucket.h deleted file mode 100644 index 5b2d3259e09f8..0000000000000 --- a/tags/20070517_before_mds_merge/crush/Bucket.h +++ /dev/null @@ -1,631 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __crush_BUCKET_H -#define __crush_BUCKET_H - -#include "BinaryTree.h" -#include "Hash.h" - -#include -#include -#include -#include -using namespace std; - -#include - -#include "include/buffer.h" - -namespace crush { - - - const int CRUSH_BUCKET_UNIFORM = 1; - const int CRUSH_BUCKET_TREE = 2; - const int CRUSH_BUCKET_LIST = 3; - const int CRUSH_BUCKET_STRAW = 4; - - /** abstract bucket **/ - class Bucket { - protected: - int id; - int parent; - int type; - float weight; - - public: - Bucket(int _type, - float _weight) : - id(0), parent(0), - type(_type), - weight(_weight) { } - - Bucket(bufferlist& bl, int& off) { - bl.copy(off, sizeof(id), (char*)&id); - off += sizeof(id); - bl.copy(off, sizeof(parent), (char*)&parent); - off += sizeof(parent); - bl.copy(off, sizeof(type), (char*)&type); - off += sizeof(type); - bl.copy(off, sizeof(weight), (char*)&weight); - off += sizeof(weight); - } - - virtual ~Bucket() { } - - virtual const char *get_bucket_type() const = 0; - virtual bool is_uniform() const = 0; - - int get_id() const { return id; } - int get_type() const { return type; } - float get_weight() const { return weight; } - int get_parent() const { return parent; } - virtual int get_size() const = 0; - - void set_id(int i) { id = i; } - void set_parent(int p) { parent = 
p; } - void set_weight(float w) { weight = w; } - - virtual void get_items(vector& i) const = 0; - virtual float get_item_weight(int item) const = 0; - virtual void add_item(int item, float w, bool back=false) = 0; - virtual void adjust_item_weight(int item, float w) = 0; - virtual void set_item_weight(int item, float w) { - adjust_item_weight(item, w - get_item_weight(item)); - } - - virtual int choose_r(int x, int r, Hash& h) const = 0; - - virtual void _encode(bufferlist& bl) = 0; - }; - - - - - /** uniform bucket **/ - class UniformBucket : public Bucket { - protected: - public: - vector items; - int item_type; - float item_weight; - - // primes - vector primes; - - int get_prime(int j) const { - return primes[ j % primes.size() ]; - } - void make_primes() { - if (items.empty()) return; - - //cout << "make_primes " << get_id() << " " << items.size() << endl; - Hash h(123+get_id()); - primes.clear(); - - // start with odd number > num_items - unsigned x = items.size() + 1; // this is the minimum! 
- x += h(items.size()) % (3*items.size()); // bump it up some - x |= 1; // make it odd - - while (primes.size() < items.size()) { - unsigned j; - for (j=2; j*j<=x; j++) - if (x % j == 0) break; - if (j*j > x) { - primes.push_back(x); - //cout << "prime " << x << endl; - } - x += 2; - } - } - - public: - UniformBucket(int _type, int _item_type) : - Bucket(_type, 0), - item_type(_item_type) { } - UniformBucket(int _type, int _item_type, - float _item_weight, vector& _items) : - Bucket(_type, _item_weight*_items.size()), - item_type(_item_type), - item_weight(_item_weight) { - items = _items; - make_primes(); - } - - UniformBucket(bufferlist& bl, int& off) : Bucket(bl, off) { - bl.copy(off, sizeof(item_type), (char*)&item_type); - off += sizeof(item_type); - bl.copy(off, sizeof(item_weight), (char*)&item_weight); - off += sizeof(item_weight); - ::_decode(items, bl, off); - make_primes(); - } - - void _encode(bufferlist& bl) { - char t = CRUSH_BUCKET_UNIFORM; - bl.append((char*)&t, sizeof(t)); - bl.append((char*)&id, sizeof(id)); - bl.append((char*)&parent, sizeof(parent)); - bl.append((char*)&type, sizeof(type)); - bl.append((char*)&weight, sizeof(weight)); - - bl.append((char*)&item_type, sizeof(item_type)); - bl.append((char*)&item_weight, sizeof(item_weight)); - - ::_encode(items, bl); - } - - const char *get_bucket_type() const { return "uniform"; } - bool is_uniform() const { return true; } - - int get_size() const { return items.size(); } - - // items - void get_items(vector& i) const { - i = items; - } - int get_item_type() const { return item_type; } - float get_item_weight(int item) const { return item_weight; } - - void add_item(int item, float w, bool back=false) { - if (items.empty()) - item_weight = w; - items.push_back(item); - weight += item_weight; - make_primes(); - } - - void adjust_item_weight(int item, float w) { - assert(0); - } - - int choose_r(int x, int r, Hash& hash) const { - //cout << "uniformbucket.choose_r(" << x << ", " << r << ")" << 
endl; - //if (r >= get_size()) cout << "warning: r " << r << " >= " << get_size() << " uniformbucket.size" << endl; - - unsigned v = hash(x, get_id());// % get_size(); - unsigned p = get_prime( hash(get_id(), x) ); // choose a prime based on hash(x, get_id(), 2) - unsigned s = (x + v + (r+1)*p) % get_size(); - return items[s]; - } - - }; - - - - - - // list bucket.. RUSH_P sorta - - class ListBucket : public Bucket { - protected: - list items; - list item_weight; - list sum_weight; - - public: - ListBucket(int _type) : Bucket(_type, 0) { } - - ListBucket(bufferlist& bl, int& off) : Bucket(bl, off) { - ::_decode(items, bl, off); - ::_decode(item_weight, bl, off); - ::_decode(sum_weight, bl, off); - } - - void _encode(bufferlist& bl) { - char t = CRUSH_BUCKET_LIST; - bl.append((char*)&t, sizeof(t)); - bl.append((char*)&id, sizeof(id)); - bl.append((char*)&parent, sizeof(parent)); - bl.append((char*)&type, sizeof(type)); - bl.append((char*)&weight, sizeof(weight)); - - ::_encode(items, bl); - ::_encode(item_weight, bl); - ::_encode(sum_weight, bl); - } - - const char *get_bucket_type() const { return "list"; } - bool is_uniform() const { return false; } - - int get_size() const { return items.size(); } - - void get_items(vector& i) const { - for (list::const_iterator it = items.begin(); - it != items.end(); - it++) - i.push_back(*it); - } - float get_item_weight(int item) const { - list::const_iterator i = items.begin(); - list::const_iterator w = item_weight.begin(); - while (i != items.end()) { - if (*i == item) return *w; - i++; w++; - } - assert(0); - return 0; - } - - void add_item(int item, float w, bool back=false) { - if (back) { - items.push_back(item); - item_weight.push_back(w); - sum_weight.clear(); - float s = 0.0; - for (list::reverse_iterator i = item_weight.rbegin(); - i != item_weight.rend(); - i++) { - s += *i; - sum_weight.push_front(s); - } - weight += w; - assert(weight == s); - } else { - items.push_front(item); - item_weight.push_front(w); - 
weight += w; - sum_weight.push_front(weight); - } - } - - void adjust_item_weight(int item, float dw) { - // find it - list::iterator p = items.begin(); - list::iterator pw = item_weight.begin(); - list::iterator ps = sum_weight.begin(); - - while (*p != item) { - *ps += dw; - p++; pw++; ps++; // next! - assert(p != items.end()); - } - - assert(*p == item); - *pw += dw; - *ps += dw; - } - - - int choose_r(int x, int r, Hash& h) const { - //cout << "linearbucket.choose_r(" << x << ", " << r << ")" << endl; - - list::const_iterator p = items.begin(); - list::const_iterator pw = item_weight.begin(); - list::const_iterator ps = sum_weight.begin(); - - while (p != items.end()) { - const int item = *p; - const float iw = *pw; - const float tw = *ps; - const float f = (float)(h(x, item, r, get_id()) % 10000) * tw / 10000.0; - //cout << "item " << item << " iw = " << iw << " tw = " << tw << " f = " << f << endl; - if (f < iw) { - //cout << "linearbucket.choose_r(" << x << ", " << r << ") = " << item << endl; - return item; - } - p++; pw++; ps++; // next! 
- } - assert(0); - return 0; - } - - - }; - - - - - // mixed bucket, based on RUSH_T type binary tree - - class TreeBucket : public Bucket { - protected: - //vector item_weight; - - // public: - BinaryTree tree; - map node_item; // node id -> item - vector node_item_vec; // fast version of above - map item_node; // item -> node id - map item_weight; - - public: - TreeBucket(int _type) : Bucket(_type, 0) { } - - TreeBucket(bufferlist& bl, int& off) : Bucket(bl, off) { - tree._decode(bl, off); - - ::_decode(node_item, bl, off); - ::_decode(node_item_vec, bl, off); - ::_decode(item_node, bl, off); - ::_decode(item_weight, bl, off); - } - - void _encode(bufferlist& bl) { - char t = CRUSH_BUCKET_TREE; - bl.append((char*)&t, sizeof(t)); - bl.append((char*)&id, sizeof(id)); - bl.append((char*)&parent, sizeof(parent)); - bl.append((char*)&type, sizeof(type)); - bl.append((char*)&weight, sizeof(weight)); - - tree._encode(bl); - - ::_encode(node_item, bl); - ::_encode(node_item_vec, bl); - ::_encode(item_node, bl); - ::_encode(item_weight, bl); - } - - const char *get_bucket_type() const { return "tree"; } - bool is_uniform() const { return false; } - - int get_size() const { return node_item.size(); } - - // items - void get_items(vector& i) const { - for (map::const_iterator it = node_item.begin(); - it != node_item.end(); - it++) - i.push_back(it->second); - } - float get_item_weight(int i) const { - assert(item_weight.count(i)); - return ((map)item_weight)[i]; - } - - - void add_item(int item, float w, bool back=false) { - item_weight[item] = w; - weight += w; - - unsigned n = tree.add_node(w); - node_item[n] = item; - item_node[item] = n; - - while (node_item_vec.size() <= n) - node_item_vec.push_back(0); - node_item_vec[n] = item; - } - - void adjust_item_weight(int item, float dw) { - // adjust my weight - weight += dw; - item_weight[item] += dw; - - // adjust tree weights - tree.adjust_node_weight(item_node[item], dw); - } - - int choose_r(int x, int r, Hash& h) 
const { - //cout << "mixedbucket.choose_r(" << x << ", " << r << ")" << endl; - int n = tree.root(); - while (!tree.terminal(n)) { - // pick a point in [0,w) - float w = tree.weight(n); - float f = (float)(h(x, n, r, get_id()) % 10000) * w / 10000.0; - - // left or right? - int l = tree.left(n); - if (tree.exists(l) && - f < tree.weight(l)) - n = l; - else - n = tree.right(n); - } - //assert(node_item.count(n)); - //return ((map)node_item)[n]; - return node_item_vec[n]; - } - }; - - - - - - // straw bucket.. new thing! - - class StrawBucket : public Bucket { - protected: - map item_weight; - map item_straw; - - list _items; - list _straws; - - public: - StrawBucket(int _type) : Bucket(_type, 0) { } - - StrawBucket(bufferlist& bl, int& off) : Bucket(bl, off) { - ::_decode(item_weight, bl, off); - calc_straws(); - } - - void _encode(bufferlist& bl) { - char t = CRUSH_BUCKET_TREE; - bl.append((char*)&t, sizeof(t)); - bl.append((char*)&id, sizeof(id)); - bl.append((char*)&parent, sizeof(parent)); - bl.append((char*)&type, sizeof(type)); - bl.append((char*)&weight, sizeof(weight)); - - ::_encode(item_weight, bl); - } - - const char *get_bucket_type() const { return "straw"; } - bool is_uniform() const { return false; } - - int get_size() const { return item_weight.size(); } - - - // items - void get_items(vector& i) const { - for (map::const_iterator it = item_weight.begin(); - it != item_weight.end(); - it++) - i.push_back(it->first); - } - float get_item_weight(int item) const { - assert(item_weight.count(item)); - return ((map)item_weight)[item]; - } - - void add_item(int item, float w, bool back=false) { - item_weight[item] = w; - weight += w; - calc_straws(); - } - - void adjust_item_weight(int item, float dw) { - //cout << "adjust " << item << " " << dw << endl; - weight += dw; - item_weight[item] += dw; - calc_straws(); - } - - - /* calculate straw lengths. - this is kind of ugly. not sure if there's a closed form way to calculate this or not! 
- */ - void calc_straws() { - //cout << get_id() << ": calc_straws ============" << endl; - - item_straw.clear(); - _items.clear(); - _straws.clear(); - - // reverse sort by weight; skip zero weight items - map > reverse; - for (map::iterator p = item_weight.begin(); - p != item_weight.end(); - p++) { - //cout << get_id() << ":" << p->first << " " << p->second << endl; - if (p->second > 0) { - //p->second /= minw; - reverse[p->second].insert(p->first); - } - } - - /* 1:2:7 - item_straw[0] = 1.0; - item_straw[1] = item_straw[0]*sqrt(1.0/.6); - item_straw[2] = item_straw[1]*2.0; - */ - - // work from low to high weights - float straw = 1.0; - float numleft = item_weight.size(); - float wbelow = 0.0; - float lastw = 0.0; - - map >::iterator next = reverse.begin(); - //while (next != reverse.end()) { - while (1) { - //cout << "hi " << next->first << endl; - map >::iterator cur = next; - - // set straw length for this set - for (set::iterator s = cur->second.begin(); - s != cur->second.end(); - s++) { - item_straw[*s] = straw; - //cout << "straw " << *s << " w " << item_weight[*s] << " -> " << straw << endl; - _items.push_back(*s); - _straws.push_back(straw); - } - - next++; - if (next == reverse.end()) break; - - wbelow += (cur->first-lastw) * numleft; - //cout << "wbelow " << wbelow << endl; - - numleft -= 1.0 * (float)cur->second.size(); - //cout << "numleft now " << numleft << endl; - - float wnext = numleft * (next->first - cur->first); - //cout << "wnext " << wnext << endl; - - float pbelow = wbelow / (wbelow+wnext); - //cout << "pbelow " << pbelow << endl; - - straw *= pow((double)(1.0/pbelow), (double)1.0/numleft); - - lastw = cur->first; - } - //cout << "============" << endl; - } - - int choose_r(int x, int r, Hash& h) const { - //cout << "strawbucket.choose_r(" << x << ", " << r << ")" << endl; - - float high_draw = -1; - int high = 0; - - list::const_iterator pi = _items.begin(); - list::const_iterator ps = _straws.begin(); - while (pi != _items.end()) { - 
const int item = *pi; - const float rnd = (float)(h(x, item, r) % 1000000) / 1000000.0; - const float straw = *ps * rnd; - - if (high_draw < 0 || - straw > high_draw) { - high = *pi; - high_draw = straw; - } - - pi++; - ps++; - } - return high; - } - }; - - - - - - inline Bucket* decode_bucket(bufferlist& bl, int& off) { - char t; - bl.copy(off, sizeof(t), (char*)&t); - off += sizeof(t); - - switch (t) { - case CRUSH_BUCKET_UNIFORM: - return new UniformBucket(bl, off); - case CRUSH_BUCKET_LIST: - return new ListBucket(bl, off); - case CRUSH_BUCKET_TREE: - return new TreeBucket(bl, off); - case CRUSH_BUCKET_STRAW: - return new StrawBucket(bl, off); - default: - assert(0); - } - return 0; - } - - - -} - - - - - - - - -#endif diff --git a/tags/20070517_before_mds_merge/crush/Hash.h b/tags/20070517_before_mds_merge/crush/Hash.h deleted file mode 100644 index a321624925d95..0000000000000 --- a/tags/20070517_before_mds_merge/crush/Hash.h +++ /dev/null @@ -1,300 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -// Robert Jenkins' function for mixing 32-bit values -// http://burtleburtle.net/bob/hash/evahash.html -// a, b = random bits, c = input and output -#define hashmix(a,b,c) \ - a=a-b; a=a-c; a=a^(c>>13); \ - b=b-c; b=b-a; b=b^(a<<8); \ - c=c-a; c=c-b; c=c^(b>>13); \ - a=a-b; a=a-c; a=a^(c>>12); \ - b=b-c; b=b-a; b=b^(a<<16); \ - c=c-a; c=c-b; c=c^(b>>5); \ - a=a-b; a=a-c; a=a^(c>>3); \ - b=b-c; b=b-a; b=b^(a<<10); \ - c=c-a; c=c-b; c=c^(b>>15); - -namespace crush { - - class Hash { - int seed; - - public: - int get_seed() { return seed; } - void set_seed(int s) { seed = s; } - - Hash(int s) { - unsigned int hash = 1315423911; - int x = 231232; - int y = 1232; - hashmix(s, x, hash); - hashmix(y, s, hash); - seed = s; - } - - inline int operator()(int a) { - unsigned int hash = seed ^ a; - int b = a; - int x = 231232; - int y = 1232; - hashmix(b, x, hash); - hashmix(y, a, hash); - return (hash & 0x7FFFFFFF); - } - - inline int operator()(int a, int b) { - unsigned int hash = seed ^ a ^ b; - int x = 231232; - int y = 1232; - hashmix(a, b, hash); - hashmix(x, a, hash); - hashmix(b, y, hash); - return (hash & 0x7FFFFFFF); - } - - inline int operator()(int a, int b, int c) { - unsigned int hash = seed ^ a ^ b ^ c; - int x = 231232; - int y = 1232; - hashmix(a, b, hash); - hashmix(c, x, hash); - hashmix(y, a, hash); - hashmix(b, x, hash); - hashmix(y, c, hash); - return (hash & 0x7FFFFFFF); - } - - inline int operator()(int a, int b, int c, int d) { - unsigned int hash = seed ^a ^ b ^ c ^ d; - int x = 231232; - int y = 1232; - hashmix(a, b, hash); - hashmix(c, d, hash); - hashmix(a, x, hash); - hashmix(y, b, hash); - hashmix(c, x, hash); - hashmix(y, d, hash); - return (hash & 0x7FFFFFFF); - } - - inline int operator()(int a, int b, int c, int d, int e) { - unsigned int hash = seed ^ a ^ b ^ c ^ d ^ e; - int x = 231232; - int y = 1232; - hashmix(a, b, hash); - hashmix(c, d, hash); - hashmix(e, x, hash); - hashmix(y, a, hash); - hashmix(b, x, hash); - 
hashmix(y, c, hash); - hashmix(d, x, hash); - hashmix(y, e, hash); - return (hash & 0x7FFFFFFF); - } - }; - -} - - - -#if 0 - - - //return myhash(a) ^ seed; - return myhash(a, seed); - } - int operator()(int a, int b) { - //return myhash( myhash(a) ^ myhash(b) ^ seed ); - return myhash(a, b, seed); - } - int operator()(int a, int b, int c) { - //return myhash( myhash(a ^ seed) ^ myhash(b ^ seed) ^ myhash(c ^ seed) ^ seed ); - return myhash(a, b, c, seed); - } - int operator()(int a, int b, int c, int d) { - //return myhash( myhash(a ^ seed) ^ myhash(b ^ seed) ^ myhash(c ^ seed) ^ myhash(d ^ seed) ^ seed ); - return myhash(a, b, c, d, seed); - } - - // ethan's rush hash? - if (0) - return (n ^ 0xdead1234) * (884811920 * 3 + 1); - - if (1) { - - // before - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - - //return hash; - return (hash & 0x7FFFFFFF); - } - - // JS - // a little better than RS - // + jenkin's mixing thing (which sucks on its own but helps tons here) - // best so far - if (1) { - unsigned int hash = 1315423911; - int a = 231232; - int b = 1232; - - for(unsigned int i = 0; i < 4; i++) - { - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - - // Robert jenkins' 96 bit mix - // sucks - if (0) { - int c = n; - int a = 12378912; - int b = 2982827; - a=a-b; a=a-c; a=a^(c>>13); - b=b-c; b=b-a; b=b^(a<<8); - c=c-a; c=c-b; c=c^(b>>13); - a=a-b; a=a-c; a=a^(c>>12); - b=b-c; b=b-a; b=b^(a<<16); - c=c-a; c=c-b; c=c^(b>>5); - a=a-b; a=a-c; a=a^(c>>3); - b=b-c; b=b-a; b=b^(a<<10); - c=c-a; c=c-b; c=c^(b>>15); - return c; - } - // robert jenkins 32-bit - // sucks - if (0) { - n += (n << 12); - n ^= 
(n >> 22); - n += (n << 4); - n ^= (n >> 9); - n += (n << 10); - n ^= (n >> 2); - n += (n << 7); - n ^= (n >> 12); - return n; - } - - // djb2 - if (0) { - unsigned int hash = 5381; - for (int i=0; i<4; i++) { - hash = ((hash << 5) + hash) + ((n&255) ^ 123); - n = n >> 8; - } - return hash; - } - - - // SDBM - if (1) { - unsigned int hash = 0; - - for(unsigned int i = 0; i < 4; i++) - { - hash = (n&255) + (hash << 6) + (hash << 16) - hash; - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - // PJW - // horrid - if (0) { - unsigned int BitsInUnsignedInt = (unsigned int)(sizeof(unsigned int) * 8); - unsigned int ThreeQuarters = (unsigned int)((BitsInUnsignedInt * 3) / 4); - unsigned int OneEighth = (unsigned int)(BitsInUnsignedInt / 8); - unsigned int HighBits = (unsigned int)(0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth); - unsigned int hash = 0; - unsigned int test = 0; - - for(unsigned int i = 0; i < 4; i++) - { - hash = (hash << OneEighth) + (n&255); - - if((test = hash & HighBits) != 0) - { - hash = (( hash ^ (test >> ThreeQuarters)) & (~HighBits)); - } - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - // RS Hash function, from Robert Sedgwicks Algorithms in C book, w/ some changes. - if (0) { - unsigned int b = 378551; - unsigned int a = 63689; - unsigned int hash = 0; - - for(unsigned int i=0; i<4; i++) - { - hash = hash * a + (n&0xff); - a = a * b; - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - // DJB - // worse than rs - if (0) { - unsigned int hash = 5381; - - for(unsigned int i = 0; i < 4; i++) - { - hash = ((hash << 5) + hash) + (n&255); - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - // AP - // even worse - if (1) { - unsigned int hash = 0; - - for(unsigned int i = 0; i < 4; i++) - { - hash ^= ((i & 1) == 0) ? 
( (hash << 7) ^ (n&255) ^ (hash >> 3)) : - (~((hash << 11) ^ (n&255) ^ (hash >> 5))); - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - -#endif diff --git a/tags/20070517_before_mds_merge/crush/crush.h b/tags/20070517_before_mds_merge/crush/crush.h deleted file mode 100644 index aa93031beb51e..0000000000000 --- a/tags/20070517_before_mds_merge/crush/crush.h +++ /dev/null @@ -1,534 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __crush_CRUSH_H -#define __crush_CRUSH_H - -#include -#include -#include -#include -#include -using namespace std; -#include -#include -using namespace __gnu_cxx; - - -#include "Bucket.h" - -#include "include/buffer.h" - - -namespace crush { - - - // *** RULES *** - - class RuleStep { - public: - int cmd; - vector args; - - RuleStep(int c) : cmd(c) {} - RuleStep(int c, int a) : cmd(c) { - args.push_back(a); - } - RuleStep(int c, int a, int b) : cmd(c) { - args.push_back(a); - args.push_back(b); - } - RuleStep(int o, int a, int b, int c) : cmd(o) { - args.push_back(a); - args.push_back(b); - args.push_back(c); - } - - void _encode(bufferlist& bl) { - bl.append((char*)&cmd, sizeof(cmd)); - ::_encode(args, bl); - } - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(cmd), (char*)&cmd); - off += sizeof(cmd); - ::_decode(args, bl, off); - } - }; - - - // Rule operations - const int CRUSH_RULE_TAKE = 0; - const int CRUSH_RULE_CHOOSE = 1; // first n by default - const int CRUSH_RULE_CHOOSE_FIRSTN = 1; - const int CRUSH_RULE_CHOOSE_INDEP = 2; - const int CRUSH_RULE_EMIT = 3; - - class Rule { - public: - vector< RuleStep > steps; - - void _encode(bufferlist& bl) { - int n = 
steps.size(); - bl.append((char*)&n, sizeof(n)); - for (int i=0; i buckets; - int bucketno; - Hash h; - - hash_map parent_map; // what bucket each leaf/bucket lives in - - public: - map rules; - - //map collisions; - //map bumps; - - void _encode(bufferlist& bl) { - // buckets - int n = buckets.size(); - bl.append((char*)&n, sizeof(n)); - for (map::const_iterator it = buckets.begin(); - it != buckets.end(); - it++) { - bl.append((char*)&it->first, sizeof(it->first)); - it->second->_encode(bl); - } - bl.append((char*)&bucketno, sizeof(bucketno)); - - // hash - int s = h.get_seed(); - bl.append((char*)&s, sizeof(s)); - - //::_encode(out, bl); - //::_encode(overload, bl); - - // rules - n = rules.size(); - bl.append((char*)&n, sizeof(n)); - for(map::iterator it = rules.begin(); - it != rules.end(); - it++) { - bl.append((char*)&it->first, sizeof(it->first)); - it->second._encode(bl); - } - - } - - void _decode(bufferlist& bl, int& off) { - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i::iterator bp = buckets.begin(); - bp != buckets.end(); - ++bp) { - // index bucket items - vector items; - bp->second->get_items(items); - for (vector::iterator ip = items.begin(); - ip != items.end(); - ++ip) - parent_map[*ip] = bp->first; - } - } - - - - public: - Crush(int seed=123) : bucketno(-1), h(seed) {} - ~Crush() { - // hose buckets - for (map::iterator it = buckets.begin(); - it != buckets.end(); - it++) { - delete it->second; - } - } - - int print(ostream& out, int root, int indent=0) { - for (int i=0; iget_weight() << "\t" << b->get_id() << "\t"; - for (int i=0; iget_bucket_type() << ": "; - - vector items; - b->get_items(items); - - if (buckets.count(items[0])) { - out << endl; - for (unsigned i=0; iset_id(n); - buckets[n] = b; - return n; - } - - void add_item(int parent, int item, float w, bool back=false) { - // add item - assert(!buckets[parent]->is_uniform()); - Bucket *p = buckets[parent]; - - p->add_item(item, w, back); - - // set 
item's parent - Bucket *n = buckets[item]; - if (n) - n->set_parent(parent); - - // update weights - while (buckets.count(p->get_parent())) { - int child = p->get_id(); - p = buckets[p->get_parent()]; - p->adjust_item_weight(child, w); - } - } - - - /* - this is a hack, fix me! weights should be consistent throughout hierarchy! - - */ - void set_bucket_weight(int item, float w) { - Bucket *b = buckets[item]; - float adj = w - b->get_weight(); - - while (buckets.count(b->get_parent())) { - Bucket *p = buckets[b->get_parent()]; - p->adjust_item_weight(b->get_id(), adj); - b = p; - } - } - - - /* - * choose numrep distinct items of type type - */ - void choose(int x, - int numrep, - int type, - Bucket *inbucket, - vector& outvec, - bool firstn, - set& outset, map& overloadmap, - bool forcefeed=false, - int forcefeedval=-1) { - int off = outvec.size(); - - // for each replica - for (int rep=0; repis_uniform()) { - // uniform bucket; be careful! - if (firstn || numrep >= in->get_size()) { - // uniform bucket is too small; just walk thru elements - r += ftotal; // r' = r + f_total (first n) - } else { - // make sure numrep is not a multple of bucket size - int add = numrep*flocal; // r' = r + n*f_local - if (in->get_size() % numrep == 0) { - add += add/in->get_size(); // shift seq once per pass through the bucket - } - r += add; - } - } else { - // mixed bucket; just make a distinct-ish r sequence - if (firstn) - r += ftotal; // r' = r + f_total - else - r += numrep * flocal; // r' = r + n*f_local - } - - // choose - outv = in->choose_r(x, r, h); - - // did we get the type we want? - int itemtype = 0; // 0 is terminal type - Bucket *newin = 0; // remember bucket we hit - if (in->is_uniform()) { - itemtype = ((UniformBucket*)in)->get_item_type(); - } else { - if (buckets.count(outv)) { // another bucket - newin = buckets[outv]; - itemtype = newin->get_type(); - } - } - if (itemtype == type) { // this is what we want! - // collision? 
- bool collide = false; - for (int prep=0; prep overloadmap[outv]) - bad = true; - } - - if (collide || bad) { - ftotal++; - flocal++; - - if (collide && flocal < 3) - continue; // try locally a few times! - - if (ftotal >= 10) { - // ok fine, just ignore dup. FIXME. - skip_rep = true; - break; - } - - retry_rep = true; - } - - break; // ok then! - } - - // next - in = newin; - } - - if (retry_rep) continue; // try again - - break; - } - - // skip this rep? (e.g. too many collisions, we give up) - if (skip_rep) continue; - - // output this value - outvec.push_back(outv); - } // for rep - - // double check! - if (0) { - for (unsigned i=1; i& result, - set& outset, map& overloadmap, - int forcefeed=-1) { - //int numresult = 0; - result.clear(); - - // determine hierarchical context for first. - list force_stack; - if (forcefeed >= 0) { - int t = forcefeed; - while (1) { - force_stack.push_front(t); - if (parent_map.count(t) == 0) break; // reached root, presumably. - //cout << " " << t << " parent is " << parent_map[t] << endl; - t = parent_map[t]; - } - } - - // working vector - vector w; // working variable - - // go through each statement - for (vector::iterator pc = rule.steps.begin(); - pc != rule.steps.end(); - pc++) { - // move input? - - // do it - switch (pc->cmd) { - case CRUSH_RULE_TAKE: - { - const int arg = pc->args[0]; - //cout << "take " << arg << endl; - - if (!force_stack.empty()) { - int forceval = force_stack.front(); - force_stack.pop_front(); - assert(arg == forceval); - } - - w.clear(); - w.push_back(arg); - } - break; - - case CRUSH_RULE_CHOOSE_FIRSTN: - case CRUSH_RULE_CHOOSE_INDEP: - { - const bool firstn = pc->cmd == CRUSH_RULE_CHOOSE_FIRSTN; - const int numrep = pc->args[0]; - const int type = pc->args[1]; - - //cout << "choose " << numrep << " of type " << type << endl; - - assert(!w.empty()); - - // reset output - vector out; - - // forcefeeding? 
- bool forcing = false; - int forceval; - if (!force_stack.empty()) { - forceval = force_stack.front(); - force_stack.pop_front(); - //cout << "priming out with " << forceval << endl; - forcing = true; - } - - // do each row independently - for (vector::iterator i = w.begin(); - i != w.end(); - i++) { - assert(buckets.count(*i)); - Bucket *b = buckets[*i]; - choose(x, numrep, type, b, out, firstn, - outset, overloadmap, - forcing, - forceval); - forcing = false; // only once - } // for inrow - - // put back into w - w.swap(out); - out.clear(); - } - break; - - case CRUSH_RULE_EMIT: - { - for (unsigned i=0; i - -#include -#include -using namespace std; - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int n, float f, int buckettype) -{ - Hash h(73232313); - - // crush - Crush c; - - int ndisks = 0; - - // bucket - Bucket *b; - if (buckettype == 0) - b = new TreeBucket(1); - else if (buckettype == 1 || buckettype == 2) - b = new ListBucket(1); - else if (buckettype == 3) - b = new StrawBucket(1); - else if (buckettype == 4) - b = new UniformBucket(0,0); - - for (int i=0; iadd_item(ndisks++,1); - - c.add_bucket(b); - int root = b->get_id(); - - //c.print(cout,root); - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 1000; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - 
cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - - // ORIGINAL - place(c, rule, numpg, numrep, placement1); - - int olddisks = ndisks; - - // add item - if (buckettype == 2) { - // start over! - ndisks = 0; - b = new ListBucket(1); - for (int i=0; i<=n; i++) - b->add_item(ndisks++,1); - c.add_bucket(b); - root = b->get_id(); - - rule.steps.clear(); - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - } - else - b->add_item(ndisks++, 1); - - - // ADDED - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; x<=numpg; x++) - if (placement1[x] != placement2[x]) - for (int j=0; j - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - //Bucket *b = new MixedBucket(h+1); - Bucket *b = new StrawBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -float go(int dep) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - if (0) { - for (int d=0; dadd_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 10000, disks); - Hash h(123); - 
b->make_primes(h); - root = c.add_bucket(b); - } - - - - // rule - int numrep = 1; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 100000; - int times = place / numpg; - if (!times) times = 1; - - cout << "#looping " << times << " times" << endl; - - float tvar = 0; - int tvarnum = 0; - - int x = 0; - for (int t=0; t v(numrep); - - for (int z=0; z - -#include -#include -using namespace std; - -int buckettype = 0; - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - //Bucket *b = new TreeBucket(h+1); - //Bucket *b = new ListBucket(h+1); - //Bucket *b = new StrawBucket(h+1); - Bucket *b; - if (buckettype == 0) - b = new TreeBucket(h+1); - else if (buckettype == 1 || buckettype == 2) - b = new ListBucket(h+1); - else if (buckettype == 3) - b = new StrawBucket(h+1); - - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map 
>& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks, int add, int modifydepth) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); - for (int d=1; d > buckets; - - root = make_hierarchy(c, wid, buckets, ndisks); - - //c.print(cout,root); - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - - // ORIGINAL - place(c, rule, numpg, numrep, placement1); - - int olddisks = ndisks; - - // add disks - //cout << " adding " << add << " disks" << endl; - vector disks; - for (int i=0; imake_primes(h); - - //Bucket *o = buckets[2].back(); - Bucket *o; - if (buckettype == 2) - o = buckets[modifydepth].front(); - else - o = buckets[modifydepth].back(); - - c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - c.add_item(o->get_id(), b->get_id(), b->get_weight()); - //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); - //newbucket = b; - - - // ADDED - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; 
x<=numpg; x++) - if (placement1[x] != placement2[x]) - for (int j=0; j - -#include -#include -using namespace std; - - -int buckettype = 2; // 0 = mixed, 1 = linear, 2 = straw - -int big_one_skip = 255; -int big_one_size; -Bucket *big_one = 0; - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - - int s = wid[h]; - if (big_one_skip > 0) - big_one_skip--; - if (!big_one_skip && !big_one) - s = big_one_size; - - - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks " << disks.size()<< endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - Bucket *b; - if (buckettype == 0) - b = new TreeBucket(h+1); - else if (buckettype == 1) - b = new ListBucket(h+1); - else if (buckettype == 2) - b = new StrawBucket(h+1); - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks, int add) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); - for 
(int d=1; d > buckets; - - big_one_size = add; - big_one = 0; - - //cout << "making tree" << endl; - root = make_hierarchy(c, wid, buckets, ndisks); - - //c.print(cout, root); - - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - int olddisks = ndisks; - - - place(c, rule, numpg, numrep, placement1); - - if (1) { - // remove disks - assert(big_one); - c.adjust_item(big_one->get_id(), 0); - } - - int newdisks = ndisks - add; - - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; x<=numpg; x++) - if (placement1[x] != placement2[x]) - for (int j=0; j >::iterator i = r.begin(); - i != r.end(); - i++) { - cout << i->first; - for (map::iterator j = i->second.begin(); - j != i->second.end(); - j++) - cout << "\t" << j->first << "\t" << j->second; - cout << endl; - } - */ -} - diff --git a/tags/20070517_before_mds_merge/crush/test/cluster_movement_rush.cc b/tags/20070517_before_mds_merge/crush/test/cluster_movement_rush.cc deleted file mode 100644 index 90cc197c24f65..0000000000000 --- a/tags/20070517_before_mds_merge/crush/test/cluster_movement_rush.cc +++ /dev/null @@ -1,218 +0,0 @@ - - -#include "../crush.h" -using namespace crush; - -#include - -#include -#include -using namespace std; - -int buckettype = 0; - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - 
c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - //Bucket *b = new TreeBucket(h+1); - //Bucket *b = new ListBucket(h+1); - //Bucket *b = new StrawBucket(h+1); - Bucket *b; - if (buckettype == 0) - b = new TreeBucket(h+1); - else if (buckettype == 1 || buckettype == 2) - b = new ListBucket(h+1); - else if (buckettype == 3) - b = new StrawBucket(h+1); - - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks, int add, int modifydepth) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); - for (int d=1; d > buckets; - - root = make_hierarchy(c, wid, buckets, ndisks); - - //c.print(cout,root); - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); 
- - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - - // ORIGINAL - place(c, rule, numpg, numrep, placement1); - - int olddisks = ndisks; - - // add disks - //cout << " adding " << add << " disks" << endl; - vector disks; - for (int i=0; imake_primes(h); - - //Bucket *o = buckets[2].back(); - Bucket *o; - if (buckettype == 2) - o = buckets[modifydepth].front(); - else - o = buckets[modifydepth].back(); - - c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - c.add_item(o->get_id(), b->get_id(), b->get_weight(), buckettype == 2); - //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); - //newbucket = b; - - - // ADDED - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; x<=numpg; x++) - if (placement1[x] != placement2[x]) - for (int j=0; j - -#include -#include -using namespace std; - - -Clock g_clock; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks weight " << w << endl; - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - - -float go(int dep, int failpc) -{ - Hash h(73232313); - - //int overloadcutoff = (int)((float)10000.0 / (float)utilization); - - //cout << "util " << utilization << " cutoff " << overloadcutoff << endl; - // crush - Crush c; - - - // buckets - int root = -1; - int 
ndisks = 0; - - vector wid; - for (int d=0; d ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 1000000; - int times = place / numpg; - if (!times) times = 1; - - - //cout << "looping " << times << " times" << endl; - - float tavg[10]; - float tvar[10]; - for (int j=0;j<10;j++) { - tvar[j] = 0; - tavg[j] = 0; - } - int tvarnum = 0; - float trvar = 0.0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - float aslowdown = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.out.clear(); - - for (int z=0; z= ndisks) cout << "v[i] " << i << " is " << v[i] << " .. x = " << x << endl; - //assert(v[i] < ndisks); - ocount[v[i]]++; - } - } - utime_t t1b = g_clock.now(); - - // add in numf failed disks - for (int f = 0; f < numf; f++) { - int d = rand() % ndisks; - while (c.out.count(d)) d = rand() % ndisks; - c.out.insert(d); - } - - utime_t t3a = g_clock.now(); - for (int x=xs; x - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - MixedBucket *b = new MixedBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -Bucket *make_random(Crush& c, int wid, int height, int& ndisks) -{ - int w = rand() % (wid-1) + 2; - - if (height == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - 
c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - int h = rand() % height + 1; - MixedBucket *b = new MixedBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } - -} - - -float go(int dep, int overloadcutoff) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - for (int d=0; dget_id(); - //c.print(cout, root); - } - if (0) { - MixedBucket *b = new MixedBucket(1); - for (int i=0; i<10000; i++) - b->add_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 10000, disks); - Hash h(123); - b->make_primes(h); - root = c.add_bucket(b); - } - //cout << ndisks << " disks" << endl; - - - - // rule - int numrep = 1; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 1000000; - int times = place / numpg; - if (!times) times = 1; - - - //cout << "looping " << times << " times" << endl; - - float tvar = 0; - int tvarnum = 0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.overload.clear(); - - for (int z=0; z overloadcutoff) - overloaded++; - - if (ocount[i] > 100+(overloadcutoff-100)/2) { - adjusted++; - c.overload[i] = 100.0 / (float)ocount[i]; - //cout << "disk " 
<< i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - //cout << overloaded << " overloaded" << endl; - overloadsum += (float)overloaded / (float)ndisks; - adjustsum += (float)adjusted / (float)ndisks; - - - for (int x=xs; x overloadcutoff) { - still++; - //c.overload[ocount[i]] = 100.0 / (float)ocount[i]; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - } - //if (still) cout << "overload was " << overloaded << " now " << still << endl; - afteroverloadsum += (float)still / (float)ndisks; - - //cout << "collisions: " << c.collisions << endl; - //cout << "r bumps: " << c.bumps << endl; - - float avg = 0.0; - for (int i=0; i100; d -= 5) { - float var = go(3,d); - //cout << "## depth = " << d << endl; - //cout << d << "\t" << var << endl; - } -} diff --git a/tags/20070517_before_mds_merge/crush/test/depth_variance.cc b/tags/20070517_before_mds_merge/crush/test/depth_variance.cc deleted file mode 100644 index 7d60ebaae9501..0000000000000 --- a/tags/20070517_before_mds_merge/crush/test/depth_variance.cc +++ /dev/null @@ -1,185 +0,0 @@ - - -#include "../crush.h" -using namespace crush; - -#include - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -float go(int dep) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - if (1) { - for (int d=0; d ocount(ndisks); - //cout << 
ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 100000; - int times = place / numpg; - if (!times) times = 1; - - cout << "#looping " << times << " times" << endl; - - float tvar = 0; - int tvarnum = 0; - float tavg = 0; - - int x = 0; - for (int t=0; t v(numrep); - - for (int z=0; z - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks weight " << w << endl; - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - //Bucket *b = new StrawBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - - -float go(int dep, int overloadcutoff) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - for (int d=0; d ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 100000; - int times = place / numpg; - if (!times) times = 1; - - - //cout << "looping " << times << " times" << endl; - - float tavg[10]; - float tvar[10]; - for (int j=0;j<10;j++) { - tvar[j] = 0; - tavg[j] = 0; - } - int tvarnum = 0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.overload.clear(); - - for (int z=0; z cutoff) - overloaded++; - - if 
(ocount[i] > adjoff) { - adjusted++; - c.overload[i] = (float)target / (float)ocount[i]; - //cout << "setting overload " << i << " to " << c.overload[i] << endl; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - //cout << overloaded << " overloaded" << endl; - overloadsum += (float)overloaded / (float)ndisks; - adjustsum += (float)adjusted / (float)ndisks; - - - - if (1) { - // second pass - for (int x=xs; x= adjoff) { - adjusted++; - if (c.overload.count(i) == 0) { - c.overload[i] = 1.0; - adjusted++; - } - //else cout << "(re)adjusting " << i << endl; - c.overload[i] *= (float)target / (float)ocount[i]; - //cout << "setting overload " << i << " to " << c.overload[i] << endl; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - } - - for (int x=xs; x cutoff) { - still++; - //c.overload[ocount[i]] = 100.0 / (float)ocount[i]; - if (c.overload.count(i)) cout << "[adjusted] "; - cout << "disk " << i << " has " << ocount[i] << endl; - } - } - //if (still) cout << "overload was " << overloaded << " now " << still << endl; - afteroverloadsum += (float)still / (float)ndisks; - - //cout << "collisions: " << c.collisions << endl; - //cout << "r bumps: " << c.bumps << endl; - - int n = ndisks/10; - float avg[10]; - float var[10]; - for (int i=0;i<10;i++) { - int s = n*i; - avg[i] = 0.0; - for (int j=0; j100; d -= 5) { - float var = go(3,d); - //cout << "## depth = " << d << endl; - //cout << d << "\t" << var << endl; - } -} diff --git a/tags/20070517_before_mds_merge/crush/test/movement.cc b/tags/20070517_before_mds_merge/crush/test/movement.cc deleted file mode 100644 index 2621f09457fe6..0000000000000 --- a/tags/20070517_before_mds_merge/crush/test/movement.cc +++ /dev/null @@ -1,223 +0,0 @@ - - -#include "../crush.h" -using namespace crush; - -#include - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) 
{ - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - MixedBucket *b = new MixedBucket(h+1); - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); - for (int d=1; d > buckets; - - if (1) { - root = make_hierarchy(c, wid, buckets, ndisks); - } - if (0) { - MixedBucket *b = new MixedBucket(1); - for (int i=0; i<10000; i++) - b->add_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 1, disks); - Hash h(123); - b->make_primes(h); - root = c.add_bucket(b); - } - - - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - 
rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - place(c, rule, numpg, numrep, placement1); - - if (1) { - // failed - - //for (int i=500; i<1000; i++) - //c.failed.insert(i); - c.failed.insert(0); - } - - int olddisks = ndisks; - - if (1) { - int n = udisks; - //cout << " adding " << n << " disks" << endl; - vector disks; - for (int i=0; imake_primes(h); - Bucket *o = buckets[1].back(); - c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - c.add_item(o->get_id(), b->get_id(), b->get_weight()); - //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); - } - - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; x<=numpg; x++) { - if (placement1[x] != placement2[x]) { - for (int j=0; j v; - cout << depth; - for (int branching = 3; branching < 16; branching += 1) { - float fac = testmovement(depth, branching, udisks); - v.push_back(fac); - int n = udisks * pow((float)branching, (float)depth-1); - cout << "\t" << n; - cout << "\t" << fac; - } - //for (int i=0; i - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - MixedBucket *b = new MixedBucket(h+1); - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << 
b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); - for (int d=1; d > buckets; - - if (1) { - root = make_hierarchy(c, wid, buckets, ndisks); - } - if (0) { - MixedBucket *b = new MixedBucket(1); - for (int i=0; i<10000; i++) - b->add_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 1, disks); - Hash h(123); - b->make_primes(h); - root = c.add_bucket(b); - } - - - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - place(c, rule, numpg, numrep, placement1); - 
- float over = .5; - - if (1) { - // failed - - //for (int i=500; i<1000; i++) - //c.failed.insert(i); - //c.failed.insert(0); - c.overload[0] = over; - } - - int olddisks = ndisks; - - - - if (0) { - int n = udisks; - //cout << " adding " << n << " disks" << endl; - vector disks; - for (int i=0; imake_primes(h); - Bucket *o = buckets[1].back(); - c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - c.add_item(o->get_id(), b->get_id(), b->get_weight()); - //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); - } - - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - vector moved(ndisks); - - //int moved = 0; - for (int d=0; d::iterator it = placement1[d].begin(); - it != placement1[d].end(); - it++) { - placement2[d].erase(*it); - } - } - - float avg = 0; - for (int d=0; d v; - cout << depth; - for (int branching = 3; branching < 16; branching += 1) { - float fac = testmovement(depth, branching, udisks); - v.push_back(fac); - int n = udisks * pow((float)branching, (float)depth-1); - //cout << "\t" << n; - //cout << "\t" << fac; - } - //for (int i=0; i - -#include -#include -using namespace std; - - -Clock g_clock; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks weight " << w << endl; - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - - -float go(int dep, int utilization ) -{ - Hash h(73232313); - - int overloadcutoff = (int)((float)10000.0 / (float)utilization); - - //cout << "util " << utilization << " 
cutoff " << overloadcutoff << endl; - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - for (int d=0; d ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 100000; - int times = place / numpg; - if (!times) times = 1; - - - //cout << "looping " << times << " times" << endl; - - float tavg[10]; - float tvar[10]; - for (int j=0;j<10;j++) { - tvar[j] = 0; - tavg[j] = 0; - } - int tvarnum = 0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - float aslowdown = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.overload.clear(); - - for (int z=0; z cutoff) - overloaded++; - - if (ocount[i] > adjoff) { - adjusted++; - c.overload[i] = (float)target / (float)ocount[i]; - //cout << "setting overload " << i << " to " << c.overload[i] << endl; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - //cout << overloaded << " overloaded" << endl; - overloadsum += (float)overloaded / (float)ndisks; - adjustsum += (float)adjusted / (float)ndisks; - - - - // keep adjusting! 
- for (int bla=0; bla<5; bla++) { - utime_t t2a = g_clock.now(); - - // second pass - for (int x=xs; x= adjoff) { - numover++; - if (c.overload.count(i) == 0) { - c.overload[i] = 1.0; - adjusted++; - } - //else cout << "(re)adjusting " << i << endl; - c.overload[i] *= (float)target / (float)ocount[i]; - //cout << "setting overload " << i << " to " << c.overload[i] << endl; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - if (!numover) break; - cout << "readjusting" << endl; - } - - utime_t t3a = g_clock.now(); - - for (int x=xs; x cutoff) { - still++; - //c.overload[ocount[i]] = 100.0 / (float)ocount[i]; - if (c.overload.count(i)) cout << "[adjusted] "; - cout << "disk " << i << " has " << ocount[i] << endl; - } - } - //if (still) cout << "overload was " << overloaded << " now " << still << endl; - afteroverloadsum += (float)still / (float)ndisks; - - //cout << "collisions: " << c.collisions << endl; - //cout << "r bumps: " << c.bumps << endl; - - int n = ndisks/10; - float avg[10]; - float var[10]; - for (int i=0;i<10;i++) { - int s = n*i; - avg[i] = 0.0; - for (int j=0; j - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - MixedBucket *b = new MixedBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -Bucket *make_random(Crush& c, int wid, int height, int& ndisks) -{ - int w = rand() % (wid-1) + 2; - - if (height == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; 
imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - int h = rand() % height + 1; - MixedBucket *b = new MixedBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } - -} - - -float go(int dep, int overloadcutoff) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - for (int d=0; dget_id(); - //c.print(cout, root); - } - if (0) { - MixedBucket *b = new MixedBucket(1); - for (int i=0; i<10000; i++) - b->add_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 10000, disks); - Hash h(123); - b->make_primes(h); - root = c.add_bucket(b); - } - //cout << ndisks << " disks" << endl; - - - - // rule - int numrep = 1; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 1000000; - int times = place / numpg; - if (!times) times = 1; - - - //cout << "looping " << times << " times" << endl; - - float tvar = 0; - int tvarnum = 0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.overload.clear(); - - for (int z=0; z overloadcutoff) - overloaded++; - - if (ocount[i] > 100+(overloadcutoff-100)/2) { - adjusted++; - c.overload[i] = 100.0 / (float)ocount[i]; 
- //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - //cout << overloaded << " overloaded" << endl; - overloadsum += (float)overloaded / (float)ndisks; - adjustsum += (float)adjusted / (float)ndisks; - - - for (int x=xs; x overloadcutoff) { - still++; - //c.overload[ocount[i]] = 100.0 / (float)ocount[i]; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - } - //if (still) cout << "overload was " << overloaded << " now " << still << endl; - afteroverloadsum += (float)still / (float)ndisks; - - //cout << "collisions: " << c.collisions << endl; - //cout << "r bumps: " << c.bumps << endl; - - float avg = 0.0; - for (int i=0; i100; d -= 5) { - float var = go(3,d); - //cout << "## depth = " << d << endl; - //cout << d << "\t" << var << endl; - } -} diff --git a/tags/20070517_before_mds_merge/crush/test/sizes.cc b/tags/20070517_before_mds_merge/crush/test/sizes.cc deleted file mode 100644 index cc5780218210a..0000000000000 --- a/tags/20070517_before_mds_merge/crush/test/sizes.cc +++ /dev/null @@ -1,131 +0,0 @@ - -#include "include/types.h" -#include "include/Distribution.h" -#include "osd/OSDMap.h" - - -Distribution file_size_distn; //kb - - -list object_queue; -int max_object_size = 1024*1024*100; //kb - -off_t no; - -int get_object() //kb -{ - if (object_queue.empty()) { - int max = file_size_distn.sample(); - no++; - int filesize = max/2 + (rand() % 100) * max/200 + 1; - //cout << "file " << filesize << endl; - while (filesize > max_object_size) { - object_queue.push_back(max_object_size); - filesize -= max_object_size; - } - object_queue.push_back(filesize); - } - int s = object_queue.front(); - object_queue.pop_front(); - //cout << "object " << s << endl; - return s; -} - -void getdist(vector& v, float& avg, float& var) -{ - avg = 0.0; - for (int i=0; i pgs(n); - off_t did = 0; - - no = 0; - while (did < dist) { - off_t s = get_object(); - pgs[rand()%n] += s; - did += s; - } - while (!object_queue.empty()) - pgs[rand()%n] 
+= get_object(); - - numo = no; - //cout << did/n << endl; - - //for (int i=0; i - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, vector& ocount) -{ - vector v(numrep); - //map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i wid; - wid.push_back(10); - wid.push_back(2); - - map< int, list > buckets; - root = make_hierarchy(c, wid, buckets, ndisks); - - // add small bucket - vector disks; - for (int i=0; i<3; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 1, disks); - b->make_primes(h); - Bucket *o = buckets[1].back(); - c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - c.add_item(o->get_id(), b->get_id(), b->get_weight()); - - - // rule - int numrep = 6; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - int pg_per = 10000; 
- int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - c.print(cout, root); - - place(c, rule, numpg, numrep, ocount); - - for (int i=0; i - -#include -#include -using namespace std; - - -int numrep = 1; - - -double go(int n, int bucket) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - Bucket *b; - vector items; - if (bucket == 0) b = new UniformBucket(1,0,10,items); - if (bucket == 1) b = new TreeBucket(1); - if (bucket == 2) b = new ListBucket(1); - if (bucket == 3) b = new StrawBucket(1); - - for (int d=0; dadd_item(ndisks++, 1); - - //if (!bucket) ((UniformBucket*)b)->make_primes(h); - - root = c.add_bucket(b); - - // rule - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - - int place = 1000000; - - - vector v(numrep); - set out; - map overload; - - utime_t start = g_clock.now(); - - for (int x=1; x <= place; x++) - c.do_rule(rule, x, v, out, overload); - - utime_t end = g_clock.now(); - - end -= start; - double el = (double)end; - - //cout << "\t" << ndisks; - - return el; -} - - -int main() -{ - - for (int n=4; n<=50; n += 4) { - cout << n; - for (int b=0; b<4; b++) { - double el = go(n,b); - cout << "\t" << el; - } - cout << endl; - } -} diff --git a/tags/20070517_before_mds_merge/crush/test/speed_depth.cc b/tags/20070517_before_mds_merge/crush/test/speed_depth.cc deleted file mode 100644 index 32275d16d2b31..0000000000000 --- a/tags/20070517_before_mds_merge/crush/test/speed_depth.cc +++ /dev/null @@ -1,174 +0,0 @@ - -#include "../../common/Clock.h" -#include "../crush.h" -using namespace crush; - - -Clock g_clock; - -#include - -#include -#include -using namespace std; - - -int uniform = 10; -int branching = 10; -int buckettype = 0; -int numrep = 1; - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash 
hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - Bucket *b; - if (buckettype == 0) - b = new TreeBucket(h+1); - else if (buckettype == 1 || buckettype == 2) - b = new ListBucket(h+1); - else if (buckettype == 3) - b = new StrawBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -double go(int dep, int per) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - if (1) { - wid.push_back(uniform); - for (int d=1; d v(numrep); - - utime_t start = g_clock.now(); - - set out; - map overload; - - for (int x=1; x <= place; x++) - c.do_rule(rule, x, v, out, overload); - - utime_t end = g_clock.now(); - - end -= start; - double el = (double)end; - - //cout << "\t" << ndisks; - - return el; -} - - -int main() -{ - uniform = branching = 8; - - cout << "// dep\tuniform\tbranch\tndisks" << endl; - - for (int d=2; d<=5; d++) { - cout << d;// << "\t" << branching; - cout << "\t" << uniform; - cout << "\t" << branching; - - int n = 1; - for (int i=0; i - -#include -#include -using namespace std; - - -int branching = 10; -bool linear = false; -int numrep = 1; - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - Bucket *b; - if (linear) - b = new ListBucket(h+1); - else - b = new TreeBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h 
<< " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -double go(int s) -{ - int dep = 2; - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - if (1) { - //for (int d=0; d v(numrep); - - utime_t start = g_clock.now(); - - for (int x=1; x <= place; x++) - c.do_rule(rule, x, v); - - utime_t end = g_clock.now(); - - end -= start; - double el = (double)end; - - cout << "\t" << ndisks; - - return el; -} - - -int main() -{ - branching = 8; - - int d = 2; - numrep = 2; - - for (int s = 64; s <= 32768; s *= 8) { - cout << "t"; - linear = false; - double el = go(s, d); - cout << "\t" << el; - - cout << "\tp"; - linear = true; - el = go(s, d); - cout << "\t" << el; - - cout << endl; - } -} diff --git a/tags/20070517_before_mds_merge/crush/test/t.cc b/tags/20070517_before_mds_merge/crush/test/t.cc deleted file mode 100644 index 0785ef47d6c04..0000000000000 --- a/tags/20070517_before_mds_merge/crush/test/t.cc +++ /dev/null @@ -1,25 +0,0 @@ - -#include "../../common/Clock.h" -#include "../crush.h" -using namespace crush; - - -Clock g_clock; - -#include - -#include -#include -using namespace std; - - -int branching = 10; -bool linear = false; -int numrep = 1; - -int main() { - - Bucket *b = new UniformBucket(1, 0); - //b = new TreeBucket(1); -} - diff --git a/tags/20070517_before_mds_merge/crush/test/testbucket.cc b/tags/20070517_before_mds_merge/crush/test/testbucket.cc deleted file mode 100644 index 065721c2c1967..0000000000000 --- a/tags/20070517_before_mds_merge/crush/test/testbucket.cc +++ /dev/null @@ -1,61 +0,0 @@ - - -#include "../Bucket.h" -using namespace crush; - -#include -#include -using namespace std; - - -ostream& operator<<(ostream& out, vector& v) -{ - out << "["; - for (int i=0; i ocount(ndisks); - - vector v(numrep); - int nplace = 0; - for (int 
x=1; x<1000000; x++) { - //cout << H(x) << "\t" << h(x) << endl; - for (int i=0; i -#include -using namespace std; - - -void getdist(vector& v, float& avg, float& var) -{ - avg = 0.0; - for (int i=0; i a(n); - vector b(n); - - for (int i=0; i c(n); - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "client/SyntheticClient.h" -#include "client/Client.h" -#include "client/fuse.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - -#ifndef DARWIN -#include -#endif // DARWIN - -#include -#include -#include - -int main(int argc, char **argv, char *envp[]) { - - //cerr << "cfuse starting " << myrank << "/" << world << endl; - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - parse_syn_options(args); // for SyntheticClient - - // args for fuse - vec_to_argv(args, argc, argv); - - if (g_conf.clock_tare) g_clock.tare(); - - // load monmap - MonMap monmap; - int r = monmap.read(".ceph_monmap"); - assert(r >= 0); - - // start up network - rank.start_rank(); - - list clients; - list synclients; - - cout << "mounting and starting " << g_conf.num_client << " syn client(s)" << endl; - for (int i=0; iinit(); - - // start syntheticclient - SyntheticClient *syn = new SyntheticClient(client); - - client->mount(); - - syn->start_thread(); - - clients.push_back(client); - synclients.push_back(syn); - } - - cout << "waiting for client(s) to finish" << endl; - while (!clients.empty()) { - Client *client = clients.front(); - SyntheticClient *syn = synclients.front(); - clients.pop_front(); - synclients.pop_front(); - - // wait - syn->join_thread(); - - // unmount - client->unmount(); - client->shutdown(); - - delete syn; - delete client; - } - - // wait for messenger 
to finish - rank.wait(); - - return 0; -} - diff --git a/tags/20070517_before_mds_merge/doc/Commitdir.txt b/tags/20070517_before_mds_merge/doc/Commitdir.txt deleted file mode 100644 index 83c89bdcaef4a..0000000000000 --- a/tags/20070517_before_mds_merge/doc/Commitdir.txt +++ /dev/null @@ -1,22 +0,0 @@ - -How Directory Committing Works: - -Each CDir has: - version - current version of directory - committing_version - which version was sent to stable storage - last_committed_version - last version to be safely stored - -Each Inode has: - parent_dir_version - what dir version i was in when i was dirtied. (*) - - (*) note that if you change an inode, mark_dirty() again, even if it's already dirty! - - -How committing works: - -A call to commit_dir(dir, context) will ensure that the _current_ version is stored safely on disk before the context is finished. - -When a commit completes, inodes in the directory are checked. If they are dirty and belonged to the _committed_ (or earlier) version, then they are marked clean. If they belong to a newer version, then they are _still dirty_. - - - diff --git a/tags/20070517_before_mds_merge/doc/Replication.txt b/tags/20070517_before_mds_merge/doc/Replication.txt deleted file mode 100644 index 0f8d4c9079e4d..0000000000000 --- a/tags/20070517_before_mds_merge/doc/Replication.txt +++ /dev/null @@ -1,19 +0,0 @@ - -Primary copy replication. - -Inodes: - -- The primary's list of replicas (cached_by) is inclusive at all times. -- The primary's list never includes the local node. -- The primary's list of replicas will only include non-replicas when the relevant CacheExpire notifications are in-flight. - -- Replicas can be created in two ways: - - via a Discover + DiscoverReply - - via an export and import. (The old auth keeps a copy, and adds itself to the replica list as it exports.) - - -Directories (and their dentries): - -- The primary has an open_by list that is inclusive at all times.
-- ..Never includes local node -- No per-dentry replica lists. All dentry lock operations (for unlink, etc.) are sent to all nodes in open_by list. \ No newline at end of file diff --git a/tags/20070517_before_mds_merge/doc/bdb.txt b/tags/20070517_before_mds_merge/doc/bdb.txt deleted file mode 100644 index 63e647f5bb3cc..0000000000000 --- a/tags/20070517_before_mds_merge/doc/bdb.txt +++ /dev/null @@ -1,48 +0,0 @@ -OBJECT STORE ON BERKELEY DB ---------------------------- - -OSBDB is an implementation of an object store that uses Berkeley DB as -the underlying storage. It is meant to be an alternative to EBOFS. - -BUILDING --------- - -You will need to have Berkeley DB installed, including the development -packages. We've tested this with Berkeley DB 4.4.20 on Ubuntu 6.10. - -To compile OSBDB support, you need to pass the argument "want_bdb=yes" -to "make." If you don't specify this, OSBDB and all its associated -support is not included in the executables. - -RUNNING -------- - -To use OSBDB in Ceph, simply pass the --bdbstore flag to programs. You -don't need to create a "device" for OSBDB ahead of time; Berkeley DB -will take care of creating the files. You also *cannot* use a raw -device as your store -- it must be a regular file. - -OSBDB additionally accepts the following flags: - - --bdbstore-btree Configures OSBDB to use the "Btree" - database type for Berkeley DB. The default - database type is "Hash". - - --bdbstore-hash-ffactor Sets the "fill factor" for the hash - database type. Takes an integer argument. - - --bdbstore-hash-nelem Sets the "nelem" parameter for the hash - database type. Takes an integer argument. - - --bdbstore-hash-pagesize Sets the page size for the hash database - type. Takes an integer argument. - - --bdbstore-cachesize Sets the cache size. Takes an integer - argument, which must be a power of two, and - no less than 20 KiB. - - --bdbstore-transactional Enable (in-memory-only) transactions for - all operations in the OSBDB store.
- - --debug-bdbstore Set the debug level. Takes an integer - argument. diff --git a/tags/20070517_before_mds_merge/doc/caching.txt b/tags/20070517_before_mds_merge/doc/caching.txt deleted file mode 100644 index 77b02480bcd6e..0000000000000 --- a/tags/20070517_before_mds_merge/doc/caching.txt +++ /dev/null @@ -1,200 +0,0 @@ - - -AUTHORITY - -The authority maintains a list of what nodes cache each inode. -Additionally, each replica is assigned a serial (normally 0) to -disambiguate multiple replicas of the same item (see below). - - set cached_by; - map cached_by_serial; - -The cached_by set _always_ includes all nodes that cache the -particular inode, but may additionally include nodes that used to -cache it but no longer do. In those cases, an expire message should -be in transit. - - -REPLICA - -The replica maintains a notion of who it believes is the authority for -each replicated inode. There are two possibilities: - - - Ordinarily, this notion is correct. - - If the part of the file system in question was recently exported to - a new MDS, the inode's old authority is acting as a CACHEPROXY, - and will forward relevant messages on to the authority. - -When a replica is expired from cache, an expire is sent to the -authority. The expire includes the serial number issued when the -replica was originally created to disambiguate potentially concurrent -replication activity. - - -EXPORTS - -- The old authority suddenly becomes a replica. Its serial is well - defined. It also becomes a CACHEPROXY, which means its cached_by - remains defined (with an alternate meaning!). While a proxy, the - node will forward relevant messages from the replica to the - authority (but not the other way around--the authority knows all - replicas). - -- Once the export is acked, the old authority sends a - message to the replica notifying it of the new authority.
As soon - as all replicas acknowledge receipt of this notice, the old authority - can cease CACHEPROXY responsibilities and become a regular replica. - At this point its cached_by is no longer defined. - -- Replicas always know who the authority for the inode is, OR they - know prior owner acting as a CACHEPROXY. (They don't know which it - is.) - - -CACHED_BY - -The authority always has an inclusive list of nodes who cache an item. -As such it can confidently send updates to replicas for locking, -invalidating, etc. When a replica is expired from cache, an expire is -sent to the authority. If the serial matches, the node is removed -from the cached_by list. - - - - - -SUBTREE AUTHORITY DELEGATION: imports versus hashing - -Authority is generally defined recursively: an inode's authority -matches the containing directory, and a directory's authority matches -the directory inode's. Thus the authority delegation chain can be -broken/redefined in two ways: - - - Imports and exports redefine the directory inode -> directory - linkage, such that the directory authority is explicitly specified - via dir.dir_auth: - - dir.dir_auth == -1 -> directory matches its inode - dir.dir_auth >= 0 -> directory authority is dir.dir_auth - - - Hashed directories redefine the directory -> inode linkage. In - non-hashed directories, inodes match their containing directory. - In hashed directories, each dentry's authority is defined by a hash - function. - - inode.hash_seed == 0 -> inode matches containing directory - inode.hash_seed > 0 -> defined by hash(hash_seed, dentry) - -A directory's "containing_import" (bad name, FIXME) is either the -import or hashed directory that is responsible for delegating a -subtree. Note that the containing_import of a directory may be itself -because it is an import, but it cannot be itself because it is hashed. - -Thus: - - - Import and export operations' manipulation of dir_auth is - completely orthogonal to hashing operations.
Hashing methods can - ignore dir_auth, except when they create imports/exports (and break - the inode<->dir auth linkage). - - - Hashdirs act sort of like imports in that they bound an - authoritative region. That is, either hashdirs or imports can be - the key for nested_exports. In some cases, a dir may be both an - import and a hash. - - - Export_dir won't export a hashdir. This is because it's tricky - (tho not necessarily impossible) due to the way nested_exports is - used with imports versus hashdirs. - - - - -FREEZING - -There are two types of freezing: - - - TREE: recursively freezes everything nested beneath a directory, - until an export or edge of cache is reached. - - DIR: freezes the contents of a single directory. - -Some notes: - - - Occurs on the authoritative node only. - - - Used for suspending critical operations while migrating authority - between nodes or hashing/unhashing directories. - - - Freezes the contents of the cache such that items may not be added, - items cannot be auth pinned, and/or subsequently reexported. The - namespace of the affected portions of the hierarchy may not change. - The content of inodes and other orthogonal operations - (e.g. replication, inode locking and modification) are unaffected. - -Two states are defined: freezing and frozen. The freezing state is -used while waiting for auth_pins to be removed. Once all auth_pins -are gone, the state is changed to frozen. New auth_pins cannot be -added while freezing or frozen. - - -AUTH PINS - -An auth pin keeps a given item on the authoritative node until it is -removed. The pins are tracked recursively, so that a subtree cannot -be frozen if it contains any auth pins. - -If a pin is placed on a non-authoritative item, the item is allowed to -become authoritative; the specific restriction is it cannot be frozen, -which only happens during export-type operations.
- - -TYPES OF EXPORTS - -- Actual export of a subtree from one node to another -- A rename between directories on different nodes exports the renamed -_inode_. (If it is a directory, it becomes an export such that the -directory itself does not move.) -- A hash or unhash operation will migrate inodes within the directory -either to or from the directory's main authority. - -EXPORT PROCESS - - - - -HASHING - -- All nodes discover and open directory - -- Prep message distributes subdir inode replicas for exports so that - peers can open those dirs. This is necessary because subdirs are - converted into exports or imports as needed to avoid migrating - anything except the hashed dir itself. The prep is needed for the - same reasons it's important with exports: the inode authority must - always have the exported dir open so that it gets accurate dir - authority updates, and can keep the inode->dir_auth up to date. - -- MHashDir message distributes the directory contents. - -- While auth is frozen_dir, we can't get_or_open_dir. Otherwise the - Prep messages won't be inclusive of all dirs, and the - imports/exports won't get set up properly. - -TODO -readdir - - -- subtrees stop at hashed dir. hashed dir's dir_auth follows parent - subtree, unless the dir is also an explicit import. thus a hashed - dir can also be an import dir.
- - -bananas -apples -blueberries -green pepper -carrots -celery - - - - diff --git a/tags/20070517_before_mds_merge/doc/dentries.txt b/tags/20070517_before_mds_merge/doc/dentries.txt deleted file mode 100644 index ab14765998b2f..0000000000000 --- a/tags/20070517_before_mds_merge/doc/dentries.txt +++ /dev/null @@ -1,4 +0,0 @@ - -null dentries only exist - - on auth - - on replica, if they are xlock \ No newline at end of file diff --git a/tags/20070517_before_mds_merge/doc/file_modes.txt b/tags/20070517_before_mds_merge/doc/file_modes.txt deleted file mode 100644 index d4ceba4034e5f..0000000000000 --- a/tags/20070517_before_mds_merge/doc/file_modes.txt +++ /dev/null @@ -1,66 +0,0 @@ - -underlying client capabilities: - -- read + cache -- read sync -- write sync -- write + buffer - (...potentially eventually augmented by byte ranges) - -whatever system of modes, tokens, etc. has to satisfy the basic -constraint that no conflicting capabilities are ever in the -hands of clients. - - -questions: -- is there any use to clients writing to a replica? - - reading, yes.. 100,000 open same file.. - - ------- - -simplest approach: -- all readers, writers go through authority -- all open, close traffic at replicas forwarded to auth - -- fh state migrates with exports. - - - --------- - -less simple: -- all writers go through authority - - open, close traffic fw -- readers from any replica - - need token from auth -- weird auth <-> replica <-> client interactions ensue! - - --------- - -even more complex (and totally FLAWED, ignore this!) - -- clients can open a file with any replica (for read or write). -- replica gets a read or write token from the primary - - primary thus knows if it's all read, all write, mixed, or none. -- once replica has a token it can service as many clients (of given type(s)) as it wants. -- on export, tokens are moved too. - - primary gives _itself_ a token too! much simpler.
- -- clients maintain a mode for each open file: rdonly, wronly, rdwr, lock -- globally, the mode is controlled by the primary, based on the mixture of - read and write tokens issued - - - -- [optional] if a client has a file open rdwr and the mode is rdonly or wronly, it can - request to read or write from the mds (which might twiddle the mode for performance - reasons.. e.g. lots of ppl rdwr but no actual reading) - - - - --------- - - diff --git a/tags/20070517_before_mds_merge/doc/header.txt b/tags/20070517_before_mds_merge/doc/header.txt deleted file mode 100644 index 8a3c51280461d..0000000000000 --- a/tags/20070517_before_mds_merge/doc/header.txt +++ /dev/null @@ -1,12 +0,0 @@ -// -*- mode:C++; tab-width:4; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ diff --git a/tags/20070517_before_mds_merge/doc/inos.txt b/tags/20070517_before_mds_merge/doc/inos.txt deleted file mode 100644 index b5ab1db25ca60..0000000000000 --- a/tags/20070517_before_mds_merge/doc/inos.txt +++ /dev/null @@ -1,11 +0,0 @@ - -inodeno_t namespace - - relevant both for ino's, and for the (ino) input for Filer and object storage namespace... - -1 - root inode - -100+mds - mds log/journal -200+mds - mds ino, fh allocation tables -300+mds - mds inode files (for non-embedded inodes) - -1000+ - regular files and directories \ No newline at end of file diff --git a/tags/20070517_before_mds_merge/doc/journal.txt b/tags/20070517_before_mds_merge/doc/journal.txt deleted file mode 100644 index 22cb4fc9e21b2..0000000000000 --- a/tags/20070517_before_mds_merge/doc/journal.txt +++ /dev/null @@ -1,124 +0,0 @@ - - -- LogEvent.replay() is idempotent. we won't know whether the update is old or not. 
- - - - - - - - - - - - - - - -journal is distributed among different nodes. because authority changes over time, it's not immedicatley clear to a recoverying node relaying the journal whether the data is "real" or not (it might be exported later in the journal). - - -possibilities: - - -ONE.. bloat the journal! - -- journal entry includes full trace of dirty data (dentries, inodes) up until import point - - local renames implicit.. cache is reattached on replay - - exports are a list of exported dirs.. which are then dumped - ... - -recovery phase 1 -- each entry includes full trace (inodes + dentries) up until the import point -- cache during recovery is fragmetned/dangling beneath import points -- when export is encountered items are discarded (marked clean) - -recovery phase 2 -- import roots ping store to determine attachment points (if not already known) - - if it was imported during period, attachment point is already known. - - renames affecting imports are logged too -- import roots discovered from other nodes, attached to hierarchy - -then -- maybe resume normal operations -- if recovery is a background process on a takeover mds, "export" everything to that node. - - --> journal contains lots of clean data.. maybe 5+ times bigger as a result! - -possible fixes: - - collect dir traces into journal chunks so they aren't repeated as often - - each chunk summarizes traces in previous chunk - - hopefully next chunk will include many of the same traces - - if not, then the entry will include it - - - - -=== log entry types === -- all inode, dentry, dir items include a dirty flag. 
-- dirs are implicitly _never_ complete; even if they are, a fetch before commit is necessary to confirm - -ImportPath - log change in import path -Import - log import addition (w/ path, dirino) - -InoAlloc - allocate ino -InoRelease - release ino - -Inode - inode info, along with dentry+inode trace up to import point -Unlink - (null) dentry + trace, + flag (whether inode/dir is destroyed) -Link - (new) dentry + inode + trace - - ------------------------------ - -TWO.. -- directories in store contain path at time of commit (relative to import, and root) -- replay without attaching anything to heirarchy -- after replay, directories pinged in store to attach to hierarchy - --> phase 2 too slow! --> and nested dirs may reattach... that won't be apparent from journal. - - put just parent dir+dentry in dir store.. even worse on phase 2! - - -THREE -- - - - - - - - -metadata journal/log - - -event types: - -chown, chmod, utime - InodeUpdate - -mknod, mkdir, symlink - Mknod .. new inode + link - -unlink, rmdir - Unlink - -rename - Link + Unlink (foreign) -or Rename (local) - -link - Link .. link existing inode - - - - -InodeUpdate -DentryLink -DentryUnlink -InodeCreate -InodeDestroy -Mkdir? 
diff --git a/tags/20070517_before_mds_merge/doc/lazy_posix.txt b/tags/20070517_before_mds_merge/doc/lazy_posix.txt deleted file mode 100644 index 1d226cd03d8e4..0000000000000 --- a/tags/20070517_before_mds_merge/doc/lazy_posix.txt +++ /dev/null @@ -1,53 +0,0 @@ - -http://www.usenix.org/events/fast05/wips/slides/welch.pdf - - - --- STATLITE - statlite(const char *filename, struct statlite *buf); - fstatlite(int fd, struct statlite *buf); - lstatlite(const char *filename, struct statlite *buf); - - * file size, mtime are optionally not guaranteed to be correct - * mask field to specify which fields you need to be correct - - --- READDIR+ - - struct dirent_plus *readdirplus(DIR *dirp); - int readdirplus_r(DIR *dirp, struct dirent_plus *entry, struct dirent_plus **result); - struct dirent_lite *readdirlite(DIR *dirp); - int readdirlite_r(DIR *dirp, struct dirent_lite *entry, struct dirent_lite **result); - - * plus returns lstat - * lite returns lstatlite - - --- lazy i/o integrity - - O_LAZY to open(2) - - * relax data coherency - * writes may not be visible until lazyio_propagate, fsync, close - - lazyio_propagate(int fd, off_t offset, size_t count); - * my writes are safe - - lazyio_synchronize(int fd, off_t offset, size_t count); - * i will see everyone else's propagated writes - --- read/write non-serial vectors - - ssize_t readx(int fd, const struct iovec *iov, size_t iov_count, struct xtvec *xtv, size_t xtv_count); - ssize_t writex(int fd, const struct iovec *iov, size_t iov_count, struct xtvec *xtv, size_t xtv_count); - - * like readv/writev, but serial - * - - -int lockg(int fd, int cmd, lgid_t *lgid) - group locks - -int openg(char *path, int mode, fh_t *handle); - portable file handle -int sutoc(fh_t *fh); \ No newline at end of file diff --git a/tags/20070517_before_mds_merge/doc/osd_outline.txt b/tags/20070517_before_mds_merge/doc/osd_outline.txt deleted file mode 100644 index 2c6f3287aac5f..0000000000000 --- 
a/tags/20070517_before_mds_merge/doc/osd_outline.txt +++ /dev/null @@ -1,37 +0,0 @@ - -intro - -osd cluster map - requirements - desireable properties - (c)rush - -failure detection - distributed ping or heartbeat - central filter, notifier - -design - placement seed, class/superset, groups - -normal operation - reads - writes - -recovery - triggers: failed disk, or total cluster reorganization - - notify - peering - pull - push - clean - -writes during recovery - -graceful data loss + recovery? - - - - - - diff --git a/tags/20070517_before_mds_merge/doc/osd_replication.txt b/tags/20070517_before_mds_merge/doc/osd_replication.txt deleted file mode 100644 index 907d00e2050a2..0000000000000 --- a/tags/20070517_before_mds_merge/doc/osd_replication.txt +++ /dev/null @@ -1,226 +0,0 @@ - - -SOME GENERAL REQUIREMENTS - -- cluster expansion: - - any or all of the replicas may move to new OSDs. - -- cluster map may change frequently - - map change should translate into pending replication/migration - state quickly (or better yet, instantly), so that we could push - through a series of (say, botched) maps quickly and be fine, so long - as the final map is correct. - -- ideally, unordered osd<->osd, client<->osd communication - (mds<->mds, client<->mds communication is ordered, but file i/o - would be too slow that way?) - - - - -PRIMARY ONLY PICTURE - -let's completely ignore replication for a while, and see how -complicated the picture needs to be to reliably support cluster expansion. - -typedef __uint64_t version_t; - - -per-Object metadata: -- version #. incremented when an object is modified. - e.g. version_t version; -- on primary, keep list of stray replicas - e.g. map stray_replicas; // osds w/ stray replicas - includes old primary osd(s), until deletion is confirmed. used while rg - is importing. - - -per-RG metadata -- object list. well, a method to fetch it by querying a collection or whatever. -- negative list - e.g. 
map deleted_objects; - - used to enumerate deleted objects, when in "importing" state. -- a RG "state" (enum/int) - - - - - - -Normal RG state: -- role=primary - clean - i am primary, all is well. no stray copies. i can - discard my negative object list, since my local - object store tells me everything. - - -After a map change: -- new primary - undef - initially; i don't know RG exists. -- old primary - homeless - i was primary, still have unmolested data. new primary is not yet migrating - (presumably it's state=undef.) i need to contact new primary and tell them - this RG exists. - -- new primary - importing - i am migrating data from old primary. keep negative dir entries for deletions. - write locally. proxy reads (force immediately migration). do whole objects - initially (on write, block until i migrate the object). later we can do - sub-object state (where "live" object data is spread across new/old primaries.. -- old primary - exporting - primary is migrating my data. - undef - when it finishes. (i will forget this RG existed.) - - -After a second map change (scenario 1): - as above, if we were clean again. - -After a second map change (scenario 2): - we weren't clean yet. -- new primary - undef - initially (until i learn RG exists) -- old primary - importing - i'm still migrating from old old primary -- old old primary - exporting - ... -- old primary -?? importing+exporting - proxy reads as before. continue migrating from old old primary. - - -After a second map change (scenario 3): - we weren't clean yet, and old old primary is also new primary -- new primary (and old old primary) - exporting - change state to importing. be sure to compare object versions, and neg dir - entries (as we always should do, really!). -- old primary - importing - note that the old import source matches new primary, and change - state to exporting, and stop importing. 
(unlike scenario 2) - --> this approach could mean that a series of fast map changes could - force data to migrate down a "chain" of old primaries to reach the - new one. maybe old primary should go from importing -> exporting, - and pass along old old primary id to new primary such that the - import is a many-to-one thing, instead of one-to-one. version - numbers and neg entries will make it easy to pick out correct versions. - - - -For the importing process on a given RG: - -- metadata for each source - - each source has a state: - 'starting' - don't know anything about source yet. query source! - this probaby induces the source to change from - 'homeless' or something similar to 'exporting'. - 'importing' - i've fetched the source's object list (and neg - object list). i'm busy reading them! these lists - will shrink as the process continues. after i fetch - an object, i will erase it from the source. - (object metadata will include stray copy info - until i confirm that its removed.) - 'finishing' - i've read all my data, and i'm telling the old person - to discard any remaining RG metadata (RG contents - should already be gone) - - unmigrated object list - - migrated but not deleted object list - - stray osd is also listed in per-object MD during this stage - - negative object list - - i can remove these items if i see a newer object version (say, - from a different import source or something). - - i can remove any local objects or ignore imported ones if it is - older than deleted version - -- the lists should be sets or otherwise queryable so that while i'm - importing and a real op comes through I can quickly determine if a - given object_id is pending migration etc or if my local store is to - be trusted. 
- - - - - -SOME CODE BITS - - -typedef __uint64_t version_t; -class Object { - version_t version; - map stray_replicas; -}; - - -class ReplicaGroup { - int enumerate_objects(list& ls); - - int state; - - // for unstable states, - map deleted_objects; // locally - map exporters; // importing from these guys. -}; - -// primary -#define RG_STATE_CLEAN 1 -#define RG_STATE_IMPORTING 2 // pulling data - -// non-primary -#define RG_STATE_HOMELESS 5 // old primary; new primary not yet - // notified; not yet exporting. -#define RG_STATE_EXPORTING 6 // a newer primary is extracting my - // data. - - -struct RGExporter_t { - int import_state; - - set remaining_objects; // remote object list - set stray_objects; // imported but not deleted. - -}; - - - - - ----- -all crap from here on down - - - - -REPLICAS -- - - - - -OSD STATES -- primary, up to date. -- replica, up to date. - -- primary, proxy to old primary (primaries?) - -- replica, not up to date. - - -REPLICATION STUFF - -Per-RG metadata -- primary - - per-replica state: clean, catching up? -- replica - -Per-object metadata -- primary and replica - - version number/mtime - - rg (reverse indexed) -- primary - - replication level and state. - - commited to memory and/or disk, on which replicas (#1, #2, etc.) -- replica - - - - - --> \ No newline at end of file diff --git a/tags/20070517_before_mds_merge/doc/performance.txt b/tags/20070517_before_mds_merge/doc/performance.txt deleted file mode 100644 index 7ca278bd284b1..0000000000000 --- a/tags/20070517_before_mds_merge/doc/performance.txt +++ /dev/null @@ -1,36 +0,0 @@ - - -quick performance test 2005-05-11. 
fakemds, 100x100, asdf/asdf, debug 13 - -g marshalling -real 3m8.697s -user 2m53.282s -sys 0m6.291s - -real 3m3.337s -user 2m49.467s -sys 0m6.243s - - -g no marshalling -real 2m1.464s -user 1m42.680s -sys 0m8.128s - -real 1m49.469s -user 1m34.523s -sys 0m6.410s - - -O3 marshalling -real 1m29.833s -user 1m11.474s -sys 0m7.588s - -real 1m9.439s -user 0m56.071s -sys 0m5.643s - - - -O3 no marshalling -real 1m2.739s -user 0m46.578s -sys 0m7.882s - diff --git a/tags/20070517_before_mds_merge/doc/shared_write_states_nogo.txt b/tags/20070517_before_mds_merge/doc/shared_write_states_nogo.txt deleted file mode 100644 index f409617d82681..0000000000000 --- a/tags/20070517_before_mds_merge/doc/shared_write_states_nogo.txt +++ /dev/null @@ -1,39 +0,0 @@ - -// stable states // ------auth----- -----replica----- -#define LOCK_SYNC 0 // R . / . . . WB same ... for stat() -#define LOCK_LOCK 1 // R W / RC . . . . . / RC . . . ... for truncate(), fsync() -#define LOCK_RDONLY 2 // R . / RC R . . same -#define LOCK_MIXED 3 // . . / . R W . same -#define LOCK_WRONLY 4 // . . / . . W WB same - -// transition states -#define LOCK_GSYNCR 8 // R . / RC . . . same -#define LOCK_GSYNCMW 9 // . . / RC . . WB same -#define LOCK_GSYNCMW2 9 // . . / RC . . WB same - -#define LOCK_GLOCKSR 5 // R . / RC . . . . . / RC . . . -#define LOCK_GLOCKMW 7 // . . / RC . . . same - -#define LOCK_GRDONLYM 10 // . . / . R . . same -#define LOCK_GRDONLYM2 10 // --- . . / . R . . -#define LOCK_GRDONLYW 11 // . . / . . . . same -#define LOCK_GRDONLYW2 11 // --- . . / . . . . -#define LOCK_GRDONLYS 12 // R . / RC . . . same -#define LOCK_GRDONLYL 13 // R . / RC . . . --- - -#define LOCK_GMIXEDR 14 // R . / . R . . . . / . R . . -#define LOCK_GMIXEDR2 15 // --- . . / . R . . -#define LOCK_GMIXEDW 16 // . . / . . W . same -#define LOCK_GMIXEDW2 16 // --- . . / . . W . -#define LOCK_GMIXEDS 16 // R . / . . . . . . / . . . . -#define LOCK_GMIXEDS2 16 // --- . . / . . . . -#define LOCK_GMIXEDL 17 // R . / . . . . 
--- - -#define LOCK_GWRONLYR 18 // R . / . . . . same -#define LOCK_GWRONLYR2 18 // --- . . / . . . . -#define LOCK_GWRONLYM 19 // . . / . . . . same -#define LOCK_GWRONLYM2 19 // --- . . / . . . . -#define LOCK_GWRONLYS 20 // R . / . . . WB same -#define LOCK_GWRONLYS2 20 // --- . . / . . . . -#define LOCK_GWRONLYL 21 - diff --git a/tags/20070517_before_mds_merge/doc/shutdown.txt b/tags/20070517_before_mds_merge/doc/shutdown.txt deleted file mode 100644 index e5ccde3171004..0000000000000 --- a/tags/20070517_before_mds_merge/doc/shutdown.txt +++ /dev/null @@ -1,13 +0,0 @@ - -- mds0 triggers shutdown by sending a shutdown_start to all nodes. - -- from here on out, all client requests are discarded (unless they are a file close?) - -- each mds checks for outstanding inter-mds transations. e.g imports, discoveries, etc. once they're all done, send a shutdown_ready to mds0 - -- each mds successively disassembles its cache, flushing data to long-term storage, and sending inodeexpires, exporting imported dirs to parent (after they're clean + empty) - -- when the cache is empty, send shutdown_done to mds0 and exit. - -- mds0 exits when all mdss have finished. - diff --git a/tags/20070517_before_mds_merge/ebofs/Allocator.cc b/tags/20070517_before_mds_merge/ebofs/Allocator.cc deleted file mode 100644 index 805957f779a11..0000000000000 --- a/tags/20070517_before_mds_merge/ebofs/Allocator.cc +++ /dev/null @@ -1,692 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "Allocator.h" -#include "Ebofs.h" - - -#undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs(" << fs->dev.get_device_name() << ").allocator." - - -void Allocator::dump_freelist() -{ - if (1) { - interval_set free; // validate too - - block_t n = 0; - for (int b=0; b<=EBOFS_NUM_FREE_BUCKETS; b++) { - Table *tab; - if (b < EBOFS_NUM_FREE_BUCKETS) { - tab = fs->free_tab[b]; - dout(0) << "dump bucket " << b << " " << tab->get_num_keys() << endl; - } else { - tab = fs->limbo_tab; - dout(0) << "dump limbo " << tab->get_num_keys() << endl;; - } - - if (tab->get_num_keys() > 0) { - Table::Cursor cursor(tab); - assert(tab->find(0, cursor) >= 0); - while (1) { - dout(0) << "dump ex " << cursor.current().key << "~" << cursor.current().value << endl; - assert(cursor.current().value > 0); - - if (b < EBOFS_NUM_FREE_BUCKETS) - n += cursor.current().value; - - if (free.contains( cursor.current().key, cursor.current().value )) - dout(0) << "dump bad " << cursor.current().key << "~" << cursor.current().value << endl; - assert(!free.contains( cursor.current().key, cursor.current().value )); - free.insert( cursor.current().key, cursor.current().value ); - if (cursor.move_right() <= 0) break; - } - } else { - //cout << " empty" << endl; - } - } - - assert(n == fs->free_blocks); - dout(0) << "dump combined freelist is " << free << endl; - - - // alloc_tab - if (fs->alloc_tab->get_num_keys() > 0) { - Table >::Cursor cursor(fs->alloc_tab); - assert(fs->alloc_tab->find(0, cursor) >= 0); - while (1) { - dout(0) << "alloc ex " << cursor.current().key << "~" << cursor.current().value.first << " ref " - << cursor.current().value.second - << endl; - assert(cursor.current().value.first > 0); - - if (cursor.move_right() <= 0) break; - } - } - } -} - - -int Allocator::find(Extent& ex, int bucket, block_t num, block_t near, int dir) -{ - Table::Cursor cursor(fs->free_tab[bucket]); - bool found = false; - - if ((dir == DIR_ANY || dir == DIR_FWD) && - 
fs->free_tab[bucket]->find( near, cursor ) >= 0) { - // look to the right - do { - if (cursor.current().value >= num) - found = true; - } while (!found && cursor.move_right() > 0); - } - - if ((dir == DIR_ANY || dir == DIR_BACK) && - !found) { - // look to the left - fs->free_tab[bucket]->find( near, cursor ); - - while (!found && cursor.move_left() >= 0) - if (cursor.current().value >= num) - found = true; - } - - if (found) { - ex.start = cursor.current().key; - ex.length = cursor.current().value; - return 0; - } - - return -1; -} - -int Allocator::allocate(Extent& ex, block_t num, block_t near) -{ - //dump_freelist(); - - int dir = DIR_ANY; // no dir - if (near == NEAR_LAST_FWD) { - near = last_pos; - dir = DIR_FWD; // fwd - } - else if (near == NEAR_LAST) - near = last_pos; - - int bucket; - - while (1) { // try twice, if fwd = true - - // look for contiguous extent - for (bucket = pick_bucket(num); bucket < EBOFS_NUM_FREE_BUCKETS; bucket++) { - if (find(ex, bucket, num, near, dir) >= 0) { - // yay! - - // remove original - fs->free_tab[bucket]->remove( ex.start ); - fs->free_blocks -= ex.length; - - if (ex.length > num) { - if (ex.start < near) { - // to the left - if (ex.start + ex.length - num <= near) { - // by a lot. take right-most portion. - Extent left; - left.start = ex.start; - left.length = ex.length - num; - ex.start += left.length; - ex.length -= left.length; - assert(ex.length == num); - _release_loner(left); - } else { - // take middle part. - Extent left,right; - left.start = ex.start; - left.length = near - ex.start; - ex.start = near; - right.start = ex.start + num; - right.length = ex.length - left.length - num; - ex.length = num; - _release_loner(left); - _release_loner(right); - } - } - else { - // to the right. take left-most part. 
- Extent right; - right.start = ex.start + num; - right.length = ex.length - num; - ex.length = num; - _release_loner(right); - } - } - - dout(20) << "allocate " << ex << " near " << near << endl; - last_pos = ex.end(); - //dump_freelist(); - if (g_conf.ebofs_cloneable) - alloc_inc(ex); - return num; - } - } - - if (dir == DIR_BACK || dir == DIR_ANY) break; - dir = DIR_BACK; - } - - // ok, find partial extent instead. - for (block_t trysize = num/2; trysize >= 1; trysize /= 2) { - int bucket = pick_bucket(trysize); - if (find(ex, bucket, trysize, near) >= 0) { - // yay! - assert(ex.length < num); - - fs->free_tab[bucket]->remove(ex.start); - fs->free_blocks -= ex.length; - last_pos = ex.end(); - dout(20) << "allocate partial " << ex << " (wanted " << num << ") near " << near << endl; - //dump_freelist(); - if (g_conf.ebofs_cloneable) - alloc_inc(ex); - return ex.length; - } - } - - dout(1) << "allocate failed, fs completely full! " << fs->free_blocks << endl; - assert(0); - //dump_freelist(); - return -1; -} - -int Allocator::_release_into_limbo(Extent& ex) -{ - dout(10) << "_release_into_limbo " << ex << endl; - dout(10) << "limbo is " << limbo << endl; - assert(ex.length > 0); - limbo.insert(ex.start, ex.length); - fs->limbo_blocks += ex.length; - return 0; -} - -int Allocator::release(Extent& ex) -{ - if (g_conf.ebofs_cloneable) - return alloc_dec(ex); - - _release_into_limbo(ex); - return 0; -} - -int Allocator::commit_limbo() -{ - dout(20) << "commit_limbo" << endl; - for (map::iterator i = limbo.m.begin(); - i != limbo.m.end(); - i++) { - fs->limbo_tab->insert(i->first, i->second); - //fs->free_blocks += i->second; - } - limbo.clear(); - //fs->limbo_blocks = 0; - //dump_freelist(); - return 0; -} - -int Allocator::release_limbo() -{ - //dump_freelist(); - if (fs->limbo_tab->get_num_keys() > 0) { - Table::Cursor cursor(fs->limbo_tab); - fs->limbo_tab->find(0, cursor); - while (1) { - Extent ex(cursor.current().key, cursor.current().value); - dout(20) << 
"release_limbo ex " << ex << endl; - - fs->limbo_blocks -= ex.length; - _release_merge(ex); - - if (cursor.move_right() <= 0) break; - } - } - fs->limbo_tab->clear(); - //dump_freelist(); - return 0; -} - - - -/* -int Allocator::_alloc_loner_inc(Extent& ex) -{ - Table >::Cursor cursor(fs->alloc_tab); - - if (fs->alloc_tab->find( ex.start, cursor ) - == Table >::Cursor::MATCH) { - assert(cursor.current().value.first == ex.length); - pair& v = cursor.dirty_current_value(); - v.second++; - dout(10) << "_alloc_loner_inc " << ex << " " - << (v.second-1) << " -> " << v.second - << endl; - } else { - // insert it, @1 - fs->alloc_tab->insert(ex.start, pair(ex.length,1)); - dout(10) << "_alloc_loner_inc " << ex << " 0 -> 1" << endl; - } - return 0; -} - -int Allocator::_alloc_loner_dec(Extent& ex) -{ - Table >::Cursor cursor(fs->alloc_tab); - - if (fs->alloc_tab->find( ex.start, cursor ) - == Table >::Cursor::MATCH) { - assert(cursor.current().value.first == ex.length); - if (cursor.current().value.second == 1) { - dout(10) << "_alloc_loner_dec " << ex << " 1 -> 0" << endl; - fs->alloc_tab->remove( cursor.current().key ); - } else { - pair& v = cursor.dirty_current_value(); - --v.second; - dout(10) << "_alloc_loner_dec " << ex << " " - << (v.second+1) << " -> " << v.second - << endl; - } - } else { - assert(0); - } - return 0; -} -*/ - - -int Allocator::alloc_inc(Extent ex) -{ - dout(10) << "alloc_inc " << ex << endl; - - // empty table? - if (fs->alloc_tab->get_num_keys() == 0) { - // easy. 
- fs->alloc_tab->insert(ex.start, pair(ex.length,1)); - dout(10) << "alloc_inc + " << ex << " 0 -> 1 (first entry)" << endl; - return 0; - } - - Table >::Cursor cursor(fs->alloc_tab); - - // try to move to left (to check for overlap) - int r = fs->alloc_tab->find( ex.start, cursor ); - if (r == Table >::Cursor::OOB || - cursor.current().key > ex.start) { - r = cursor.move_left(); - dout(10) << "alloc_inc move_left r = " << r << endl; - } - - while (1) { - dout(10) << "alloc_inc loop at " << cursor.current().key - << "~" << cursor.current().value.first - << " ref " << cursor.current().value.second - << endl; - - // too far left? - if (cursor.current().key < ex.start && - cursor.current().key + cursor.current().value.first <= ex.start) { - // adjacent? - bool adjacent = false; - if (cursor.current().key + cursor.current().value.first == ex.start && - cursor.current().value.second == 1) - adjacent = true; - - // no overlap. - r = cursor.move_right(); - dout(10) << "alloc_inc move_right r = " << r << endl; - - // at end? - if (r <= 0) { - // hmm! - if (adjacent) { - // adjust previous entry - cursor.move_left(); - pair &v = cursor.dirty_current_value(); - v.first += ex.length; // yay! - dout(10) << "alloc_inc + " << ex << " 0 -> 1 (adjust at end)" << endl; - } else { - // insert at end, finish. - int r = fs->alloc_tab->insert(ex.start, pair(ex.length,1)); - dout(10) << "alloc_inc + " << ex << " 0 -> 1 (at end) .. r = " << r << endl; - //dump_freelist(); - } - return 0; - } - } - - if (cursor.current().key > ex.start) { - // gap. - // oooooo - // nnnnn..... 
- block_t l = MIN(ex.length, cursor.current().key - ex.start); - - fs->alloc_tab->insert(ex.start, pair(l,1)); - dout(10) << "alloc_inc + " << ex.start << "~" << l << " 0 -> 1" << endl; - ex.start += l; - ex.length -= l; - if (ex.length == 0) break; - fs->alloc_tab->find( ex.start, cursor ); - } - else if (cursor.current().key < ex.start) { - block_t end = cursor.current().value.first + cursor.current().key; - - if (end <= ex.end()) { - // single split - // oooooo - // nnnnn - pair& v = cursor.dirty_current_value(); - v.first = ex.start - cursor.current().key; - int ref = v.second; - - block_t l = end - ex.start; - fs->alloc_tab->insert(ex.start, pair(l, 1+ref)); - - dout(10) << "alloc_inc " << ex.start << "~" << l - << " " << ref << " -> " << ref+1 - << " (right split)" << endl; - - ex.start += l; - ex.length -= l; - if (ex.length == 0) break; - fs->alloc_tab->find( ex.start, cursor ); - - } else { - // double split, finish. - // ------------- - // ------ - pair& v = cursor.dirty_current_value(); - v.first = ex.start - cursor.current().key; - int ref = v.second; - - fs->alloc_tab->insert(ex.start, pair(ex.length, 1+ref)); - - int rl = end - ex.end(); - fs->alloc_tab->insert(ex.end(), pair(rl, ref)); - - dout(10) << "alloc_inc " << ex - << " " << ref << " -> " << ref+1 - << " (double split finish)" - << endl; - - break; - } - } - else { - assert(cursor.current().key == ex.start); - - if (cursor.current().value.first <= ex.length) { - // inc. - // oooooo - // nnnnnnnn - pair& v = cursor.dirty_current_value(); - v.second++; - dout(10) << "alloc_inc " << ex.start << "~" << cursor.current().value.first - << " " << cursor.current().value.second-1 << " -> " - << cursor.current().value.second - << " (left split)" << endl; - ex.start += v.first; - ex.length -= v.first; - if (ex.length == 0) break; - cursor.move_right(); - } else { - // single split, finish. 
- // oooooo - // nnn - block_t l = cursor.current().value.first - ex.length; - int ref = cursor.current().value.second; - - pair& v = cursor.dirty_current_value(); - v.first = ex.length; - v.second++; - - fs->alloc_tab->insert(ex.end(), pair(l, ref)); - - dout(10) << "alloc_inc " << ex - << " " << ref << " -> " << ref+1 - << " (left split finish)" - << endl; - - break; - } - } - } - - return 0; -} - - -int Allocator::alloc_dec(Extent ex) -{ - dout(10) << "alloc_dec " << ex << endl; - - assert(fs->alloc_tab->get_num_keys() >= 0); - - Table >::Cursor cursor(fs->alloc_tab); - - // try to move to left (to check for overlap) - int r = fs->alloc_tab->find( ex.start, cursor ); - dout(10) << "alloc_dec find r = " << r << endl; - - if (r == Table >::Cursor::OOB || - cursor.current().key > ex.start) { - r = cursor.move_left(); - dout(10) << "alloc_dec move_left r = " << r << endl; - - // too far left? - if (cursor.current().key < ex.start && - cursor.current().key + cursor.current().value.first <= ex.start) { - // no overlap. - dump_freelist(); - assert(0); - } - } - - while (1) { - dout(10) << "alloc_dec ? " << cursor.current().key - << "~" << cursor.current().value.first - << " " << cursor.current().value.second - << ", ex is " << ex - << endl; - - assert(cursor.current().key <= ex.start); // no gap allowed. - - if (cursor.current().key < ex.start) { - block_t end = cursor.current().value.first + cursor.current().key; - - if (end <= ex.end()) { - // single split - // oooooo - // ----- - pair& v = cursor.dirty_current_value(); - v.first = ex.start - cursor.current().key; - int ref = v.second; - dout(10) << "alloc_dec s " << cursor.current().key << "~" << cursor.current().value.first - << " " << ref - << " shortened left bit of single" << endl; - - block_t l = end - ex.start; - if (ref > 1) { - fs->alloc_tab->insert(ex.start, pair(l, ref-1)); - dout(10) << "alloc_dec . 
" << ex.start << "~" << l - << " " << ref << " -> " << ref-1 - << endl; - } else { - Extent r(ex.start, l); - _release_into_limbo(r); - } - - ex.start += l; - ex.length -= l; - if (ex.length == 0) break; - fs->alloc_tab->find( ex.start, cursor ); - - } else { - // double split, finish. - // ooooooooooooo - // ------ - pair& v = cursor.dirty_current_value(); - v.first = ex.start - cursor.current().key; - int ref = v.second; - dout(10) << "alloc_dec s " << cursor.current().key << "~" << cursor.current().value.first - << " " << ref - << " shorted left bit of double split" << endl; - - if (ref > 1) { - fs->alloc_tab->insert(ex.start, pair(ex.length, ref-1)); - dout(10) << "alloc_inc s " << ex - << " " << ref << " -> " << ref-1 - << " reinserted middle bit of double split" - << endl; - } else { - _release_into_limbo(ex); - } - - int rl = end - ex.end(); - fs->alloc_tab->insert(ex.end(), pair(rl, ref)); - dout(10) << "alloc_dec s " << ex.end() << "~" << rl - << " " << ref - << " reinserted right bit of double split" << endl; - break; - } - } - else { - assert(cursor.current().key == ex.start); - - if (cursor.current().value.first <= ex.length) { - // inc. - // oooooo - // nnnnnnnn - if (cursor.current().value.second > 1) { - pair& v = cursor.dirty_current_value(); - v.second--; - dout(10) << "alloc_dec s " << ex.start << "~" << cursor.current().value.first - << " " << cursor.current().value.second+1 << " -> " << cursor.current().value.second - << endl; - ex.start += v.first; - ex.length -= v.first; - if (ex.length == 0) break; - cursor.move_right(); - } else { - Extent r(cursor.current().key, cursor.current().value.first); - _release_into_limbo(r); - - ex.start += cursor.current().value.first; - ex.length -= cursor.current().value.first; - cursor.remove(); - - if (ex.length == 0) break; - fs->alloc_tab->find( ex.start, cursor ); - } - } else { - // single split, finish. 
- // oooooo - // nnn - block_t l = cursor.current().value.first - ex.length; - int ref = cursor.current().value.second; - - if (ref > 1) { - pair& v = cursor.dirty_current_value(); - v.first = ex.length; - v.second--; - dout(10) << "alloc_inc . " << ex - << " " << ref << " -> " << ref-1 - << endl; - } else { - _release_into_limbo(ex); - cursor.remove(); - } - - dout(10) << "alloc_dec s " << ex.end() << "~" << l - << " " << ref - << " reinserted right bit of single split" << endl; - fs->alloc_tab->insert(ex.end(), pair(l, ref)); - break; - } - } - - - } - - return 0; -} - - -/* - * release extent into freelist - * WARNING: *ONLY* use this if you _know_ there are no adjacent free extents - */ -int Allocator::_release_loner(Extent& ex) -{ - assert(ex.length > 0); - int b = pick_bucket(ex.length); - fs->free_tab[b]->insert(ex.start, ex.length); - fs->free_blocks += ex.length; - return 0; -} - -/* - * release extent into freelist - * look for any adjacent extents and merge with them! - */ -int Allocator::_release_merge(Extent& orig) -{ - dout(15) << "_release_merge " << orig << endl; - assert(orig.length > 0); - - Extent newex = orig; - - // one after us? - for (int b=0; b::Cursor cursor(fs->free_tab[b]); - - if (fs->free_tab[b]->find( newex.start+newex.length, cursor ) - == Table::Cursor::MATCH) { - // add following extent to ours - newex.length += cursor.current().value; - - // remove it - fs->free_blocks -= cursor.current().value; - fs->free_tab[b]->remove( cursor.current().key ); - break; - } - } - - // one before us? 
- for (int b=0; b::Cursor cursor(fs->free_tab[b]); - fs->free_tab[b]->find( newex.start+newex.length, cursor ); - if (cursor.move_left() >= 0 && - (cursor.current().key + cursor.current().value == newex.start)) { - // merge - newex.start = cursor.current().key; - newex.length += cursor.current().value; - - // remove it - fs->free_blocks -= cursor.current().value; - fs->free_tab[b]->remove( cursor.current().key ); - break; - } - } - - // ok, insert newex - _release_loner(newex); - return 0; -} diff --git a/tags/20070517_before_mds_merge/ebofs/Allocator.h b/tags/20070517_before_mds_merge/ebofs/Allocator.h deleted file mode 100644 index c53ff2a69fba1..0000000000000 --- a/tags/20070517_before_mds_merge/ebofs/Allocator.h +++ /dev/null @@ -1,85 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __EBOFS_ALLOCATOR_H -#define __EBOFS_ALLOCATOR_H - -#include "types.h" - -#include "include/interval_set.h" - -class Ebofs; - -class Allocator { -public: - const static block_t NEAR_LAST = 0; - const static block_t NEAR_LAST_FWD = 1; - - const static int DIR_ANY = 0; - const static int DIR_FWD = 2; - const static int DIR_BACK = 1; - -protected: - Ebofs *fs; - block_t last_pos; - - - interval_set limbo; - - static int pick_bucket(block_t num) { - int b = 0; - while (num > 1) { - b++; - num = num >> EBOFS_FREE_BUCKET_BITS; - } - if (b >= EBOFS_NUM_FREE_BUCKETS) - b = EBOFS_NUM_FREE_BUCKETS-1; - return b; - } - - int find(Extent& ex, int bucket, block_t num, block_t near, int dir = DIR_ANY); - - void dump_freelist(); - - public: - int _release_into_limbo(Extent& ex); - - int _release_loner(Extent& ex); // release loner extent - int _release_merge(Extent& ex); // release any extent (searches for adjacent) - - //int _alloc_loner_inc(Extent& ex); - //int _alloc_loner_dec(Extent& ex); - - - public: - Allocator(Ebofs *f) : fs(f), last_pos(0) {} - - int allocate(Extent& ex, block_t num, block_t near=NEAR_LAST); - int release(Extent& ex); // alias for alloc_dec - - int alloc_inc(Extent ex); - int alloc_dec(Extent ex); - - - /*int unallocate(Extent& ex) { // skip limbo - return _release_merge(ex); - } - */ - - int commit_limbo(); // limbo -> fs->limbo_tab - int release_limbo(); // fs->limbo_tab -> free_tabs - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/ebofs/BlockDevice.cc b/tags/20070517_before_mds_merge/ebofs/BlockDevice.cc deleted file mode 100644 index 7044e4ca38f27..0000000000000 --- a/tags/20070517_before_mds_merge/ebofs/BlockDevice.cc +++ /dev/null @@ -1,777 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser 
General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "BlockDevice.h" - -#include "config.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include - -#ifndef __CYGWIN__ -#ifndef DARWIN -#include -#endif -#endif - - -/******************************************* - * biovec - */ - -inline ostream& operator<<(ostream& out, BlockDevice::biovec &bio) -{ - out << "bio("; - if (bio.type == BlockDevice::biovec::IO_READ) out << "rd "; - if (bio.type == BlockDevice::biovec::IO_WRITE) out << "wr "; - out << bio.start << "~" << bio.length; - if (bio.note) out << " " << bio.note; - out << " " << &bio; - out << ")"; - return out; -} - - - -/******************************************* - * ElevatorQueue - */ - -#undef dout -#define dout(x) if (x <= g_conf.debug_bdev) cout << "bdev(" << dev << ").elevatorq." -#define derr(x) if (x <= g_conf.debug_bdev) cerr << "bdev(" << dev << ").elevatorq." - - -int BlockDevice::ElevatorQueue::dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& block_lock) -{ - // queue empty? - assert(!io_map.empty()); - - dout(20) << "dequeue_io el_pos " << el_pos << " dir " << el_dir_forward << endl; - - // find our position: i >= pos - map::iterator i; - - int tries = g_conf.bdev_el_bidir + 1; - while (tries > 0) { - if (el_dir_forward) { - i = io_map.lower_bound(el_pos); - if (i != io_map.end()) { - break; // not at end. good. - } - } else { - i = io_map.upper_bound(el_pos); - if (i != io_map.begin()) { - i--; // and back down one (to get i <= pos). good. - break; - } - } - - // reverse (or initial startup)? 
- if (g_conf.bdev_el_bidir || !el_dir_forward) { - // dout(20) << "restart reversing" << endl; - el_dir_forward = !el_dir_forward; - } - - if (el_dir_forward) { - // forward - el_pos = 0; - - if (g_conf.bdev_el_fw_max_ms) { - el_stop = g_clock.now(); - utime_t max(0, 1000*g_conf.bdev_el_fw_max_ms); // (s,us), convert ms -> us! - el_stop += max; - // dout(20) << "restart forward sweep for " << max << endl; - } else { - // dout(20) << "restart fowrard sweep" << endl; - } - } else { - // reverse - el_pos = bdev->get_num_blocks(); - - if (g_conf.bdev_el_bw_max_ms) { - el_stop = g_clock.now(); - utime_t max(0, 1000*g_conf.bdev_el_bw_max_ms); // (s,us), convert ms -> us! - el_stop += max; - // dout(20) << "restart reverse sweep for " << max << endl; - } else { - // dout(20) << "restart reverse sweep" << endl; - } - } - - tries--; - } - - assert(tries > 0); // this shouldn't happen if the queue is non-empty. - - // get some biovecs - int num_bio = 0; - - dout(20) << "dequeue_io starting with " << i->first << " " << *i->second << endl; - - // merge contiguous ops - char type = i->second->type; // read or write - int num_iovs = 0; // count eventual iov's for readv/writev - - start = i->first; - length = 0; - - if (el_dir_forward) - el_pos = start; - else - el_pos = i->first + i->second->length; - - // while (contiguous) - while ((( el_dir_forward && el_pos == i->first) || - (!el_dir_forward && el_pos == i->first + i->second->length)) && - type == i->second->type) { - biovec *bio = i->second; - - // allowed? (not already submitted to kernel?) - if (block_lock.intersects(bio->start, bio->length)) { - // dout(20) << "dequeue_io " << bio->start << "~" << bio->length - // << " intersects block_lock " << block_lock << endl; - break; // stop, or go with what we've got so far - } - - // add to biols - int nv = bio->bl.buffers().size(); // how many iov's in this bio's bufferlist? - if (num_iovs + nv >= g_conf.bdev_iov_max) break; // too many! 
- num_iovs += nv; - - start = MIN(start, bio->start); - length += bio->length; - - if (el_dir_forward) { - //dout(20) << "dequeue_io fw dequeue io at " << el_pos << " " << *i->second << endl; - biols.push_back(bio); // add at back - } else { - // dout(20) << "dequeue_io bw dequeue io at " << el_pos << " " << *i->second << endl; - biols.push_front(bio); // add at front - } - num_bio++; - - // move elevator pointer - bool at_end = false; - map::iterator prev = i; - if (el_dir_forward) { - el_pos += bio->length; // cont. next would start right after us - i++; - if (i == io_map.end()) { - at_end = true; - } - } else { - el_pos -= bio->length; - if (i == io_map.begin()) { - at_end = true; - } else { - i--; - } - } - - // dequeue - io_map.erase(prev); - bio->in_queue = 0; - - if (at_end) break; - } - - return num_bio; -} - - - -/******************************************* - * BarrierQueue - */ -#undef dout -#define dout(x) if (x <= g_conf.debug_bdev) cout << "bdev(" << dev << ").barrierq." - -void BlockDevice::BarrierQueue::barrier() -{ - if (!qls.empty() && qls.front()->empty()) { - assert(qls.size() == 1); - dout(10) << "barrier not adding new queue, front is empty" << endl; - } else { - qls.push_back(new ElevatorQueue(bdev, dev)); - dout(10) << "barrier adding new elevator queue (now " << qls.size() << "), front queue has " - << qls.front()->size() << " ios left" << endl; - } -} - -bool BlockDevice::BarrierQueue::bump() -{ - assert(!qls.empty()); - - // is the front queue empty? 
- if (qls.front()->empty() && - qls.front() != qls.back()) { - delete qls.front(); - qls.pop_front(); - dout(10) << "dequeue_io front empty, moving to next queue (" << qls.front()->size() << ")" << endl; - return true; - } - - return false; -} - -int BlockDevice::BarrierQueue::dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& locked) -{ - assert(!qls.empty()); - int n = qls.front()->dequeue_io(biols, start, length, locked); - bump(); // in case we emptied the front queue - return n; -} - - - - -/******************************************* - * BlockDevice - */ - -#undef dout -#define dout(x) if (x <= g_conf.debug_bdev) cout << "bdev(" << dev << ")." - - - -block_t BlockDevice::get_num_blocks() -{ - if (!num_blocks) { - assert(fd > 0); - -#ifdef BLKGETSIZE64 - // ioctl block device? - ioctl(fd, BLKGETSIZE64, &num_blocks); -#endif - - if (!num_blocks) { - // hmm, try stat! - struct stat st; - fstat(fd, &st); - num_blocks = st.st_size; - } - - num_blocks /= (__uint64_t)EBOFS_BLOCK_SIZE; - - if (g_conf.bdev_fake_mb) { - num_blocks = g_conf.bdev_fake_mb * 256; - dout(0) << "faking dev size " << g_conf.bdev_fake_mb << " mb" << endl; - } - if (g_conf.bdev_fake_max_mb && - num_blocks > (block_t)g_conf.bdev_fake_max_mb * 256ULL) { - dout(0) << "faking dev size " << g_conf.bdev_fake_max_mb << " mb" << endl; - num_blocks = g_conf.bdev_fake_max_mb * 256; - } - - } - return num_blocks; -} - - - -/** io thread - * each worker thread dequeues ios from the root_queue and submits them to the kernel. - */ -void* BlockDevice::io_thread_entry() -{ - lock.Lock(); - - int whoami = io_threads_started++; - io_threads_running++; - assert(io_threads_running <= g_conf.bdev_iothreads); - dout(10) << "io_thread" << whoami << " start, " << io_threads_running << " now running" << endl; - - // get my own fd (and file position pointer) - int fd = open_fd(); - assert(fd > 0); - - while (!io_stop) { - bool do_sleep = false; - - // queue empty? 
- if (root_queue.empty()) { - // sleep - do_sleep = true; - } else { - dout(20) << "io_thread" << whoami << " going" << endl; - - block_t start, length; - list biols; - int n = root_queue.dequeue_io(biols, start, length, io_block_lock); - - if (n == 0) { - // failed to dequeue a do-able op, sleep for now - dout(20) << "io_thread" << whoami << " couldn't dequeue doable op, sleeping" << endl; - assert(io_threads_running > 1); // there must be someone else, if we couldn't dequeue something doable. - do_sleep = true; - } - else { - // lock blocks - assert(start == biols.front()->start); - io_block_lock.insert(start, length); - - // drop lock to do the io - lock.Unlock(); - do_io(fd, biols); - lock.Lock(); - - // unlock blocks - io_block_lock.erase(start, length); - - // someone might have blocked on our block_lock? - if (io_threads_running < g_conf.bdev_iothreads && - (int)root_queue.size() > io_threads_running) - io_wakeup.SignalAll(); - } - } - - if (do_sleep) { - do_sleep = false; - - // sleep - io_threads_running--; - dout(20) << "io_thread" << whoami << " sleeping, " << io_threads_running << " threads now running," - << " queue has " << root_queue.size() << endl; - - if (g_conf.bdev_idle_kick_after_ms > 0 && - io_threads_running == 0 && - idle_kicker) { - // first wait for signal | timeout - io_wakeup.WaitInterval(lock, utime_t(0, g_conf.bdev_idle_kick_after_ms*1000)); - - // should we still be sleeping? (did we get woken up, or did timer expire? - if (root_queue.empty() && io_threads_running == 0) { - idle_kicker->kick(); // kick - io_wakeup.Wait(lock); // and wait - } - } else { - // normal, just wait. 
- io_wakeup.Wait(lock); - } - - io_threads_running++; - assert(io_threads_running <= g_conf.bdev_iothreads); - dout(20) << "io_thread" << whoami << " woke up, " << io_threads_running << " threads now running" << endl; - } - } - - // clean up - ::close(fd); - io_threads_running--; - - lock.Unlock(); - - dout(10) << "io_thread" << whoami << " finish" << endl; - return 0; -} - - - -/** do_io - * do a single io operation - * (lock is NOT held, but we own the *biovec) - */ -void BlockDevice::do_io(int fd, list& biols) -{ - int r; - assert(!biols.empty()); - - // get full range, type, bl - bufferlist bl; - bl.claim(biols.front()->bl); - block_t start = biols.front()->start; - block_t length = biols.front()->length; - char type = biols.front()->type; - - list::iterator p = biols.begin(); - int numbio = 1; - for (p++; p != biols.end(); p++) { - length += (*p)->length; - bl.claim_append((*p)->bl); - numbio++; - } - - // do it - dout(20) << "do_io start " << (type==biovec::IO_WRITE?"write":"read") - << " " << start << "~" << length - << " " << numbio << " bits" << endl; - if (type == biovec::IO_WRITE) { - r = _write(fd, start, length, bl); - } else if (type == biovec::IO_READ) { - r = _read(fd, start, length, bl); - } else assert(0); - dout(20) << "do_io finish " << (type==biovec::IO_WRITE?"write":"read") - << " " << start << "~" << length << endl; - - // set rval - for (p = biols.begin(); p != biols.end(); p++) - (*p)->rval = r; - - if (1) { - // put in completion queue - complete_lock.Lock(); - complete_queue.splice( complete_queue.end(), biols ); - complete_queue_len += numbio; - complete_wakeup.Signal(); - complete_lock.Unlock(); - } else { - // be slow and finish synchronously - for (p = biols.begin(); p != biols.end(); p++) - finish_io(*p); - } -} - - -/** finish_io - * - * finish an io by signaling the cond or performing a callback. - * called by completion thread, unless that's disabled above. 
- */ -void BlockDevice::finish_io(biovec *bio) -{ - bio->done = true; - if (bio->cond) { - bio->cond->Signal(); - } - else if (bio->cb) { - bio->cb->finish((ioh_t)bio, bio->rval); - delete bio->cb; - delete bio; - } -} - -/*** completion_thread - * handle Cond signals or callbacks for completed ios - */ -void* BlockDevice::complete_thread_entry() -{ - complete_lock.Lock(); - dout(10) << "complete_thread start" << endl; - - while (!io_stop) { - - while (!complete_queue.empty()) { - list ls; - ls.swap(complete_queue); - dout(10) << "complete_thread grabbed " << complete_queue_len << " biovecs" << endl; - complete_queue_len = 0; - - complete_lock.Unlock(); - - // finish - for (list::iterator p = ls.begin(); - p != ls.end(); - p++) { - biovec *bio = *p; - dout(20) << "complete_thread finishing " << *bio << endl; - finish_io(bio); - } - - complete_lock.Lock(); - } - if (io_stop) break; - - /* - if (io_threads_running == 0 && idle_kicker) { - complete_lock.Unlock(); - idle_kicker->kick(); - complete_lock.Lock(); - if (!complete_queue.empty() || io_stop) - continue; - } - */ - - dout(25) << "complete_thread sleeping" << endl; - complete_wakeup.Wait(complete_lock); - } - - dout(10) << "complete_thread finish" << endl; - complete_lock.Unlock(); - return 0; -} - - - - -// io queue - -void BlockDevice::_submit_io(biovec *b) -{ - // NOTE: lock must be held - dout(15) << "_submit_io " << *b << endl; - - // wake up io_thread(s)? - if ((int)root_queue.size() == io_threads_running) - io_wakeup.SignalOne(); - else if ((int)root_queue.size() > io_threads_running) - io_wakeup.SignalAll(); - - // queue - root_queue.submit_io(b); - - /* - // [DEBUG] check for overlapping ios - // BUG: this doesn't detect all overlaps w/ the next queue thing. - if (g_conf.bdev_debug_check_io_overlap) { - // BUG: this doesn't catch everything! eg 1~10000000 will be missed.... 
- multimap::iterator p = io_queue.lower_bound(b->start); - if ((p != io_queue.end() && - p->first < b->start+b->length) || - (p != io_queue.begin() && - (p--, p->second->start + p->second->length > b->start))) { - dout(1) << "_submit_io new io " << *b - << " overlaps with existing " << *p->second << endl; - cerr << "_submit_io new io " << *b - << " overlaps with existing " << *p->second << endl; - } - } - */ - -} - -int BlockDevice::_cancel_io(biovec *bio) -{ - // NOTE: lock must be held - - if (bio->in_queue == 0) { - dout(15) << "_cancel_io " << *bio << " FAILED" << endl; - return -1; - } else { - dout(15) << "_cancel_io " << *bio << endl; - bio->in_queue->cancel_io(bio); - if (root_queue.bump()) - io_wakeup.SignalAll(); // something happened! - return 0; - } -} - - - -// low level io - -int BlockDevice::_read(int fd, block_t bno, unsigned num, bufferlist& bl) -{ - dout(10) << "_read " << bno << "~" << num << endl; - - assert(fd > 0); - - off_t offset = bno * EBOFS_BLOCK_SIZE; - off_t actual = lseek(fd, offset, SEEK_SET); - assert(actual == offset); - - size_t len = num*EBOFS_BLOCK_SIZE; - assert(bl.length() >= len); - - struct iovec iov[ bl.buffers().size() ]; - int n = 0; - size_t left = len; - for (list::const_iterator i = bl.buffers().begin(); - i != bl.buffers().end(); - i++) { - assert(i->length() % EBOFS_BLOCK_SIZE == 0); - - iov[n].iov_base = (void*)i->c_str(); - iov[n].iov_len = MIN(left, i->length()); - - left -= iov[n].iov_len; - n++; - if (left == 0) break; - } - - int got = ::readv(fd, iov, n); - assert(got <= (int)len); - - return 0; -} - -int BlockDevice::_write(int fd, unsigned bno, unsigned num, bufferlist& bl) -{ - dout(10) << "_write " << bno << "~" << num << endl; - - assert(fd > 0); - - off_t offset = (off_t)bno << EBOFS_BLOCK_BITS; - assert((off_t)bno * (off_t)EBOFS_BLOCK_SIZE == offset); - off_t actual = lseek(fd, offset, SEEK_SET); - assert(actual == offset); - - // write buffers - size_t len = num*EBOFS_BLOCK_SIZE; - - struct iovec iov[ 
bl.buffers().size() ]; - - int n = 0; - size_t left = len; - for (list::const_iterator i = bl.buffers().begin(); - i != bl.buffers().end(); - i++) { - assert(i->length() % EBOFS_BLOCK_SIZE == 0); - - iov[n].iov_base = (void*)i->c_str(); - iov[n].iov_len = MIN(left, i->length()); - - assert((((unsigned long long)iov[n].iov_base) & 4095ULL) == 0); - assert((iov[n].iov_len & 4095) == 0); - - left -= iov[n].iov_len; - n++; - if (left == 0) break; - } - - int r = ::writev(fd, iov, n); - - if (r < 0) { - dout(1) << "couldn't write bno " << bno << " num " << num - << " (" << len << " bytes) in " << n << " iovs, r=" << r - << " errno " << errno << " " << strerror(errno) << endl; - dout(1) << "bl is " << bl << endl; - assert(0); - } else { - assert(r == (int)len); - } - - return 0; -} - - - -// open/close - -int BlockDevice::open_fd() -{ -#ifdef DARWIN - int fd = ::open(dev.c_str(), O_RDWR|O_SYNC, 0); - ::fcntl(fd, F_NOCACHE); - return fd; -#else - return ::open(dev.c_str(), O_RDWR|O_SYNC|O_DIRECT, 0); -#endif -} - -int BlockDevice::open(kicker *idle) -{ - assert(fd == 0); - - // open? - fd = open_fd(); - if (fd < 0) { - dout(1) << "open failed, r = " << fd << " " << strerror(errno) << endl; - fd = 0; - return -1; - } - - // lock - if (g_conf.bdev_lock) { - int r = ::flock(fd, LOCK_EX|LOCK_NB); - if (r < 0) { - derr(1) << "open " << dev << " failed to get LOCK_EX" << endl; - assert(0); - return -1; - } - } - - // figure size - __uint64_t bsize = get_num_blocks(); - - dout(2) << "open " << bsize << " bytes, " << num_blocks << " blocks" << endl; - - // start thread - io_threads_started = 0; - io_threads.clear(); - for (int i=0; icreate(); - } - complete_thread.create(); - - // idle kicker? 
- idle_kicker = idle; - - return fd; -} - - -int BlockDevice::close() -{ - assert(fd>0); - - idle_kicker = 0; - - // shut down io thread - dout(10) << "close stopping io+complete threads" << endl; - lock.Lock(); - complete_lock.Lock(); - io_stop = true; - io_wakeup.SignalAll(); - complete_wakeup.SignalAll(); - complete_lock.Unlock(); - lock.Unlock(); - - - for (int i=0; ijoin(); - delete io_threads[i]; - } - io_threads.clear(); - - complete_thread.join(); - - io_stop = false; // in case we start again - - dout(2) << "close " << endl; - - if (g_conf.bdev_lock) - ::flock(fd, LOCK_UN); - - ::close(fd); - fd = 0; - - return 0; -} - -int BlockDevice::cancel_io(ioh_t ioh) -{ - biovec *pbio = (biovec*)ioh; - - lock.Lock(); - int r = _cancel_io(pbio); - lock.Unlock(); - - // FIXME? - if (r == 0 && pbio->cb) { - //pbio->cb->finish(ioh, 0); - delete pbio->cb; - delete pbio; - } - - return r; -} - diff --git a/tags/20070517_before_mds_merge/ebofs/BlockDevice.h b/tags/20070517_before_mds_merge/ebofs/BlockDevice.h deleted file mode 100644 index 18f639f7176b6..0000000000000 --- a/tags/20070517_before_mds_merge/ebofs/BlockDevice.h +++ /dev/null @@ -1,338 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_BLOCKDEVICE_H -#define __EBOFS_BLOCKDEVICE_H - -#include "include/buffer.h" -#include "include/interval_set.h" -#include "include/Context.h" -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/Thread.h" - -#include "types.h" - - -typedef void *ioh_t; // opaque handle to an io request. 
(in actuality, a biovec*) - - -class BlockDevice { - public: - // callback type for io completion notification - class callback { - public: - virtual ~callback() {} - virtual void finish(ioh_t ioh, int rval) = 0; - }; - - // kicker for idle notification - class kicker { - public: - virtual ~kicker() {} - virtual void kick() = 0; - }; - - - /********************************************************/ - - class Queue; - - // io item - // two variants: one with Cond*, one with callback*. - class biovec { - public: - static const char IO_WRITE = 1; - static const char IO_READ = 2; - - char type; - block_t start, length; - bufferlist bl; - callback *cb; - Cond *cond; - int rval; - char *note; - bool done; - - Queue *in_queue; - - biovec(char t, block_t s, block_t l, bufferlist& b, callback *c, char *n=0) : - type(t), start(s), length(l), bl(b), cb(c), cond(0), rval(0), note(n), done(false), in_queue(0) {} - biovec(char t, block_t s, block_t l, bufferlist& b, Cond *c, char *n=0) : - type(t), start(s), length(l), bl(b), cb(0), cond(c), rval(0), note(n), done(false), in_queue(0) {} - }; - friend ostream& operator<<(ostream& out, biovec &bio); - - - /********************************************************/ - - /* - * Queue -- abstract IO queue interface - */ - class Queue { - public: - virtual ~Queue() {} - virtual void submit_io(biovec *b) = 0; - virtual void cancel_io(biovec *b) = 0; - virtual int dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& locked) = 0; - virtual int size() = 0; - virtual bool empty() { return size() == 0; } - }; - - /* - * ElevatorQueue - simple elevator scheduler queue - */ - class ElevatorQueue : public Queue { - BlockDevice *bdev; - const char *dev; - map io_map; - bool el_dir_forward; - block_t el_pos; - utime_t el_stop; - - public: - ElevatorQueue(BlockDevice *bd, const char *d) : - bdev(bd), dev(d), - el_dir_forward(false), - el_pos(0) {} - void submit_io(biovec *b) { - b->in_queue = this; - 
assert(io_map.count(b->start) == 0); - io_map[b->start] = b; - } - void cancel_io(biovec *b) { - assert(b->in_queue == this); - assert(io_map.count(b->start) && - io_map[b->start] == b); - io_map.erase(b->start); - b->in_queue = 0; - } - int dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& locked); - int size() { - return io_map.size(); - } - }; - - /* - * BarrierQueue - lets you specify io "barriers" - * barrier() - force completion of all prior IOs before - * future ios are started. - * bump() - must be called after cancel_io to properly - * detect empty subqueue. - */ - class BarrierQueue : public Queue { - BlockDevice *bdev; - const char *dev; - list qls; - public: - BarrierQueue(BlockDevice *bd, const char *d) : bdev(bd), dev(d) { - barrier(); - } - ~BarrierQueue() { - for (list::iterator p = qls.begin(); - p != qls.end(); - ++p) - delete *p; - qls.clear(); - } - int size() { - // this isn't perfectly accurate. - if (!qls.empty()) - return qls.front()->size(); - return 0; - } - void submit_io(biovec *b) { - assert(!qls.empty()); - qls.back()->submit_io(b); - } - void cancel_io(biovec *b) { - assert(0); // shouldn't happen. - } - int dequeue_io(list& biols, - block_t& start, block_t& length, - interval_set& locked); - void barrier(); - bool bump(); - }; - - - private: - string dev; // my device file - int fd; - block_t num_blocks; - - Mutex lock; - - /** the root io queue. - * i current assumeit's a barrier queue,but this can be changed - * with some minor rearchitecting. - */ - BarrierQueue root_queue; - - kicker *idle_kicker; // not used.. - - /* io_block_lock - block ranges current dispatched to kernel - * once a bio is dispatched, it cannot be canceled, so an overlapping - * io and be submitted. the overlapping io cannot be dispatched - * to the kernel, however, until the original io finishes, or else - * there will be a race condition. 
- */ - interval_set io_block_lock; // blocks currently dispatched to kernel - - // io threads - Cond io_wakeup; - bool io_stop; - int io_threads_started, io_threads_running; - - void *io_thread_entry(); - - class IOThread : public Thread { - BlockDevice *dev; - public: - IOThread(BlockDevice *d) : dev(d) {} - void *entry() { return (void*)dev->io_thread_entry(); } - } ; - - vector io_threads; - - // private io interface - int open_fd(); // get an fd (for a thread) - - void _submit_io(biovec *b); - int _cancel_io(biovec *bio); - void do_io(int fd, list& biols); // called by an io thread - - // low level io - int _read(int fd, block_t bno, unsigned num, bufferlist& bl); - int _write(int fd, unsigned bno, unsigned num, bufferlist& bl); - - - // completion callback queue - Mutex complete_lock; - Cond complete_wakeup; - list complete_queue; - int complete_queue_len; - - void finish_io(biovec *bio); - - // complete thread - void *complete_thread_entry(); - class CompleteThread : public Thread { - BlockDevice *dev; - public: - CompleteThread(BlockDevice *d) : dev(d) {} - void *entry() { return (void*)dev->complete_thread_entry(); } - } complete_thread; - - - public: - BlockDevice(const char *d) : - dev(d), fd(0), num_blocks(0), - root_queue(this, dev.c_str()), - idle_kicker(0), - io_stop(false), io_threads_started(0), io_threads_running(0), - complete_queue_len(0), - complete_thread(this) { } - ~BlockDevice() { - if (fd > 0) close(); - } - - // get size in blocks - block_t get_num_blocks(); - const char *get_device_name() const { return dev.c_str(); } - - // open/close - int open(kicker *idle = 0); - int close(); - - // state stuff - bool is_idle() { - lock.Lock(); - bool idle = (io_threads_running == 0) && root_queue.empty(); - lock.Unlock(); - return idle; - } - void barrier() { - lock.Lock(); - root_queue.barrier(); - lock.Unlock(); - } - - // ** blocking interface ** - - // read - int read(block_t bno, unsigned num, bufferptr& bptr, char *n=0) { - bufferlist bl; - 
bl.push_back(bptr); - return read(bno, num, bl, n); - } - int read(block_t bno, unsigned num, bufferlist& bl, char *n=0) { - Cond c; - biovec bio(biovec::IO_READ, bno, num, bl, &c, n); - - lock.Lock(); - _submit_io(&bio); - barrier(); // need this, to prevent starvation! - while (!bio.done) - c.Wait(lock); - lock.Unlock(); - return bio.rval; - } - - // write - int write(unsigned bno, unsigned num, bufferptr& bptr, char *n=0) { - bufferlist bl; - bl.push_back(bptr); - return write(bno, num, bl, n); - } - int write(unsigned bno, unsigned num, bufferlist& bl, char *n=0) { - Cond c; - biovec bio(biovec::IO_WRITE, bno, num, bl, &c, n); - - lock.Lock(); - _submit_io(&bio); - barrier(); // need this, to prevent starvation! - while (!bio.done) - c.Wait(lock); - lock.Unlock(); - return bio.rval; - } - - // ** non-blocking interface ** - ioh_t read(block_t bno, unsigned num, bufferlist& bl, callback *fin, char *n=0) { - biovec *pbio = new biovec(biovec::IO_READ, bno, num, bl, fin, n); - lock.Lock(); - _submit_io(pbio); - lock.Unlock(); - return (ioh_t)pbio; - } - ioh_t write(block_t bno, unsigned num, bufferlist& bl, callback *fin, char *n=0) { - biovec *pbio = new biovec(biovec::IO_WRITE, bno, num, bl, fin, n); - lock.Lock(); - _submit_io(pbio); - lock.Unlock(); - return (ioh_t)pbio; - } - int cancel_io(ioh_t ioh); - -}; - - - - -#endif diff --git a/tags/20070517_before_mds_merge/ebofs/BufferCache.cc b/tags/20070517_before_mds_merge/ebofs/BufferCache.cc deleted file mode 100644 index 4ad22b3a5d0fb..0000000000000 --- a/tags/20070517_before_mds_merge/ebofs/BufferCache.cc +++ /dev/null @@ -1,1147 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "BufferCache.h" -#include "Onode.h" - - -/*********** BufferHead **************/ - - -#undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs.bh." - - - - - - -/************ ObjectCache **************/ - - -#undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs.oc." - - - -void ObjectCache::rx_finish(ioh_t ioh, block_t start, block_t length, bufferlist& bl) -{ - list waiters; - - dout(10) << "rx_finish " << start << "~" << length << endl; - for (map::iterator p = data.lower_bound(start); - p != data.end(); - p++) { - BufferHead *bh = p->second; - dout(10) << "rx_finish ?" << *bh << endl; - assert(p->first == bh->start()); - - // past? - if (p->first >= start+length) break; - if (bh->end() > start+length) break; // past - - assert(p->first >= start); - assert(bh->end() <= start+length); - - dout(10) << "rx_finish !" << *bh << endl; - - if (bh->rx_ioh == ioh) - bh->rx_ioh = 0; - - if (bh->is_rx()) { - assert(bh->get_version() == 0); - assert(bh->end() <= start+length); - assert(bh->start() >= start); - dout(10) << "rx_finish rx -> clean on " << *bh << endl; - bh->data.substr_of(bl, (bh->start()-start)*EBOFS_BLOCK_SIZE, bh->length()*EBOFS_BLOCK_SIZE); - bc->mark_clean(bh); - } - else if (bh->is_partial()) { - dout(10) << "rx_finish partial -> tx on " << *bh << endl; - - if (1) { - // double-check what block i am - vector exv; - on->map_extents(bh->start(), 1, exv); - assert(exv.size() == 1); - block_t cur_block = exv[0].start; - assert(cur_block == bh->partial_tx_to); - } - - // ok, cancel my low-level partial (since we're still here, and can bh_write ourselves) - bc->cancel_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial_tx_epoch ); - - // apply partial to myself - assert(bh->data.length() == 0); - bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - bh->data.push_back( bp ); - bh->data.copy_in(0, EBOFS_BLOCK_SIZE, bl); - bh->apply_partial(); - - // write "normally" - 
bc->mark_dirty(bh); - bc->bh_write(on, bh, bh->partial_tx_to);//cur_block); - - // clean up a bit - bh->partial_tx_to = 0; - bh->partial_tx_epoch = 0; - bh->partial.clear(); - } - else { - dout(10) << "rx_finish ignoring status on (dirty|tx|clean) " << *bh << endl; - assert(bh->is_dirty() || // was overwritten - bh->is_tx() || // was overwritten and queued - bh->is_clean()); // was overwritten, queued, _and_ flushed to disk - } - - // trigger waiters - for (map >::iterator p = bh->waitfor_read.begin(); - p != bh->waitfor_read.end(); - p++) { - assert(p->first >= bh->start() && p->first < bh->end()); - waiters.splice(waiters.begin(), p->second); - } - bh->waitfor_read.clear(); - } - - finish_contexts(waiters); -} - - -void ObjectCache::tx_finish(ioh_t ioh, block_t start, block_t length, - version_t version, version_t epoch) -{ - dout(10) << "tx_finish " << start << "~" << length << " v" << version << endl; - for (map::iterator p = data.lower_bound(start); - p != data.end(); - p++) { - BufferHead *bh = p->second; - dout(30) << "tx_finish ?bh " << *bh << endl; - assert(p->first == bh->start()); - - // past? - if (p->first >= start+length) break; - - if (bh->tx_ioh == ioh) - bh->tx_ioh = 0; - - if (!bh->is_tx()) { - dout(10) << "tx_finish bh not marked tx, skipping" << endl; - continue; - } - assert(bh->is_tx()); - - if (version == bh->version) { - dout(10) << "tx_finish tx -> clean on " << *bh << endl; - assert(bh->end() <= start+length); - bh->set_last_flushed(version); - bc->mark_clean(bh); - } else { - dout(10) << "tx_finish leaving tx, " << bh->version << " > " << version - << " on " << *bh << endl; - assert(bh->version > version); - } - } -} - - - -/* - * return any bh's that are (partially) in this range that are TX. - */ -int ObjectCache::find_tx(block_t start, block_t len, - list& tx) -{ - map::iterator p = data.lower_bound(start); - - block_t cur = start; - block_t left = len; - - /* don't care about overlap, we want things _fully_ in start~len. 
- if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - */ - - while (left > 0) { - assert(cur+left == start+len); - - // at end? - if (p == data.end()) - break; - - if (p->first <= cur) { - // have it (or part of it) - BufferHead *e = p->second; - - if (e->end() <= start+len && - e->is_tx()) - tx.push_back(e); - - block_t lenfromcur = MIN(e->end() - cur, left); - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; // more? - } else if (p->first > cur) { - // gap.. miss - block_t next = p->first; - left -= (next-cur); - cur = next; - continue; - } - else - assert(0); - } - - return 0; -} - - -int ObjectCache::try_map_read(block_t start, block_t len) -{ - map::iterator p = data.lower_bound(start); - - block_t cur = start; - block_t left = len; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - - int num_missing = 0; - - while (left > 0) { - // at end? - if (p == data.end()) { - // rest is a miss. - vector exv; - on->map_extents(cur, - left, // no prefetch here! - exv); - - num_missing += exv.size(); - left = 0; - cur = start+len; - break; - } - - if (p->first <= cur) { - // have it (or part of it) - BufferHead *e = p->second; - - if (e->is_clean() || - e->is_dirty() || - e->is_tx()) { - dout(20) << "try_map_read hit " << *e << endl; - } - else if (e->is_rx()) { - dout(20) << "try_map_read rx " << *e << endl; - num_missing++; - } - else if (e->is_partial()) { - dout(-20) << "try_map_read partial " << *e << endl; - num_missing++; - } - else { - dout(0) << "try_map_read got unexpected " << *e << endl; - assert(0); - } - - block_t lenfromcur = MIN(e->end() - cur, left); - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; // more? - } else if (p->first > cur) { - // gap.. 
miss - block_t next = p->first; - vector exv; - on->map_extents(cur, - MIN(next-cur, left), // no prefetch - exv); - - dout(-20) << "try_map_read gap of " << p->first-cur << " blocks, " - << exv.size() << " extents" << endl; - num_missing += exv.size(); - left -= (p->first - cur); - cur = p->first; - continue; // more? - } - else - assert(0); - } - - assert(left == 0); - assert(cur == start+len); - return num_missing; -} - - - - - -/* - * map a range of blocks into buffer_heads. - * - create missing buffer_heads as necessary. - * - fragment along disk extent boundaries - */ -int ObjectCache::map_read(block_t start, block_t len, - map& hits, - map& missing, - map& rx, - map& partial) { - - map::iterator p = data.lower_bound(start); - - block_t cur = start; - block_t left = len; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - - while (left > 0) { - // at end? - if (p == data.end()) { - // rest is a miss. - vector exv; - //on->map_extents(cur, left, exv); // we might consider some prefetch here. - on->map_extents(cur, - //MIN(left + g_conf.ebofs_max_prefetch, // prefetch - //on->object_blocks-cur), - left, // no prefetch - exv); - for (unsigned i=0; i 0; i++) { - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( exv[i].length ); - bc->add_bh(n); - missing[cur] = n; - dout(20) << "map_read miss " << left << " left, " << *n << endl; - cur += MIN(left,exv[i].length); - left -= MIN(left,exv[i].length); - } - assert(left == 0); - assert(cur == start+len); - break; - } - - if (p->first <= cur) { - // have it (or part of it) - BufferHead *e = p->second; - - if (e->is_clean() || - e->is_dirty() || - e->is_tx()) { - hits[cur] = e; // readable! - dout(20) << "map_read hit " << *e << endl; - bc->touch(e); - } - else if (e->is_rx()) { - rx[cur] = e; // missing, not readable. 
- dout(20) << "map_read rx " << *e << endl; - } - else if (e->is_partial()) { - partial[cur] = e; - dout(20) << "map_read partial " << *e << endl; - } - else { - dout(0) << "map_read ??? got unexpected " << *e << endl; - assert(0); - } - - block_t lenfromcur = MIN(e->end() - cur, left); - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; // more? - } else if (p->first > cur) { - // gap.. miss - block_t next = p->first; - vector exv; - on->map_extents(cur, - //MIN(next-cur, MIN(left + g_conf.ebofs_max_prefetch, // prefetch - // on->object_blocks-cur)), - MIN(next-cur, left), // no prefetch - exv); - - for (unsigned i=0; i0; i++) { - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( exv[i].length ); - bc->add_bh(n); - missing[cur] = n; - cur += MIN(left, n->length()); - left -= MIN(left, n->length()); - dout(20) << "map_read gap " << *n << endl; - } - continue; // more? - } - else - assert(0); - } - - assert(left == 0); - assert(cur == start+len); - return 0; -} - - -/* - * map a range of pages on an object's buffer cache. - * - * - break up bufferheads that don't fall completely within the range - * - cancel rx ops we obsolete. - * - resubmit rx ops if we split bufferheads - * - * - leave potentially obsoleted tx ops alone (for now) - * - don't worry about disk extent boundaries (yet) - */ -int ObjectCache::map_write(block_t start, block_t len, - interval_set& alloc, - map& hits, - version_t super_epoch) -{ - map::iterator p = data.lower_bound(start); - - dout(10) << "map_write " << *on << " " << start << "~" << len << " ... alloc " << alloc << endl; - // p->first >= start - - block_t cur = start; - block_t left = len; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. 
- } - - //dump(); - - while (left > 0) { - // max for this bh (bc of (re)alloc on disk) - block_t max = left; - bool newalloc = false; - - // based on alloc/no-alloc boundary ... - if (alloc.contains(cur, left)) { - if (alloc.contains(cur)) { - block_t ends = alloc.end_after(cur); - max = MIN(left, ends-cur); - newalloc = true; - } else { - if (alloc.starts_after(cur)) { - block_t st = alloc.start_after(cur); - max = MIN(left, st-cur); - } - } - } - - // based on disk extent boundary ... - vector exv; - on->map_extents(cur, max, exv); - if (exv.size() > 1) - max = exv[0].length; - - if (newalloc) { - dout(10) << "map_write " << cur << "~" << max << " is new alloc on disk" << endl; - } else { - dout(10) << "map_write " << cur << "~" << max << " keeps old alloc on disk" << endl; - } - - // at end? - if (p == data.end()) { - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( max ); - bc->add_bh(n); - hits[cur] = n; - left -= max; - cur += max; - continue; - } - - dout(10) << "p is " << *p->second << endl; - - - if (p->first <= cur) { - BufferHead *bh = p->second; - dout(10) << "map_write bh " << *bh << " intersected" << endl; - - if (p->first < cur) { - if (cur+max >= p->first+p->second->length()) { - // we want right bit (one splice) - if (bh->is_rx() && bc->bh_cancel_read(bh)) { - BufferHead *right = bc->split(bh, cur); - bc->bh_read(on, bh); // reread left bit - bh = right; - } else if (bh->is_tx() && !newalloc && bc->bh_cancel_write(bh, super_epoch)) { - BufferHead *right = bc->split(bh, cur); - bc->bh_write(on, bh); // rewrite left bit - bh = right; - } else { - bh = bc->split(bh, cur); // just split it - } - p++; - assert(p->second == bh); - } else { - // we want middle bit (two splices) - if (bh->is_rx() && bc->bh_cancel_read(bh)) { - BufferHead *middle = bc->split(bh, cur); - bc->bh_read(on, bh); // reread left - p++; - assert(p->second == middle); - BufferHead *right = bc->split(middle, cur+max); - bc->bh_read(on, right); // reread 
right - bh = middle; - } else if (bh->is_tx() && !newalloc && bc->bh_cancel_write(bh, super_epoch)) { - BufferHead *middle = bc->split(bh, cur); - bc->bh_write(on, bh); // redo left - p++; - assert(p->second == middle); - BufferHead *right = bc->split(middle, cur+max); - bc->bh_write(on, right); // redo right - bh = middle; - } else { - BufferHead *middle = bc->split(bh, cur); - p++; - assert(p->second == middle); - bc->split(middle, cur+max); - bh = middle; - } - } - } else if (p->first == cur) { - if (p->second->length() <= max) { - // whole bufferhead, piece of cake. - } else { - // we want left bit (one splice) - if (bh->is_rx() && bc->bh_cancel_read(bh)) { - BufferHead *right = bc->split(bh, cur+max); - bc->bh_read(on, right); // re-rx the right bit - } else if (bh->is_tx() && !newalloc && bc->bh_cancel_write(bh, super_epoch)) { - BufferHead *right = bc->split(bh, cur+max); - bc->bh_write(on, right); // re-tx the right bit - } else { - bc->split(bh, cur+max); // just split - } - } - } - - // try to cancel tx? - if (bh->is_tx() && !newalloc) bc->bh_cancel_write(bh, super_epoch); - - // put in our map - hits[cur] = bh; - - // keep going. - block_t lenfromcur = bh->end() - cur; - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; - } else { - // gap! - block_t next = p->first; - block_t glen = MIN(next-cur, max); - dout(10) << "map_write gap " << cur << "~" << glen << endl; - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( glen ); - bc->add_bh(n); - hits[cur] = n; - - cur += glen; - left -= glen; - continue; // more? - } - } - - assert(left == 0); - assert(cur == start+len); - return 0; -} - -/* don't need this. -int ObjectCache::scan_versions(block_t start, block_t len, - version_t& low, version_t& high) -{ - map::iterator p = data.lower_bound(start); - // p->first >= start - - if (p != data.begin() && p->first > start) { - p--; // might overlap? - if (p->first + p->second->length() <= start) - p++; // doesn't overlap. 
- } - if (p->first >= start+len) - return -1; // to the right. no hits. - - // start - low = high = p->second->get_version(); - - for (p++; p != data.end(); p++) { - // past? - if (p->first >= start+len) break; - - const version_t v = p->second->get_version(); - if (low > v) low = v; - if (high < v) high = v; - } - - return 0; -} -*/ - -void ObjectCache::touch_bottom(block_t bstart, block_t blast) -{ - for (map::iterator p = data.lower_bound(bstart); - p != data.end(); - ++p) { - BufferHead *bh = p->second; - - // don't trim unless it's entirely in our range - if (bh->start() < bstart) continue; - if (bh->end() > blast) break; - - dout(12) << "moving " << *bh << " to bottom of lru" << endl; - bc->touch_bottom(bh); // move to bottom of lru list - } -} - - -void ObjectCache::truncate(block_t blocks, version_t super_epoch) -{ - dout(7) << "truncate " << object_id - << " " << blocks << " blocks" - << endl; - - while (!data.empty()) { - block_t bhoff = data.rbegin()->first; - BufferHead *bh = data.rbegin()->second; - - if (bh->end() <= blocks) break; - - bool uncom = on->uncommitted.contains(bh->start(), bh->length()); - dout(10) << "truncate " << *bh << " uncom " << uncom - << " of " << on->uncommitted - << endl; - - if (bhoff < blocks) { - // we want right bit (one splice) - if (bh->is_rx() && bc->bh_cancel_read(bh)) { - BufferHead *right = bc->split(bh, blocks); - bc->bh_read(on, bh); // reread left bit - bh = right; - } else if (bh->is_tx() && uncom && bc->bh_cancel_write(bh, super_epoch)) { - BufferHead *right = bc->split(bh, blocks); - bc->bh_write(on, bh); // rewrite left bit - bh = right; - } else { - bh = bc->split(bh, blocks); // just split it - } - // no worries about partials up here, they're always 1 block (and thus never split) - } else { - // whole thing - // cancel any pending/queued io, if possible. 
- if (bh->is_rx()) - bc->bh_cancel_read(bh); - if (bh->is_tx() && uncom) - bc->bh_cancel_write(bh, super_epoch); - if (bh->shadow_of) { - dout(10) << "truncate " << *bh << " unshadowing " << *bh->shadow_of << endl; - // shadow - bh->shadow_of->remove_shadow(bh); - if (bh->is_partial()) - bc->cancel_shadow_partial(bh->rx_from.start, bh); - } else { - // normal - if (bh->is_partial() && uncom) - bc->bh_cancel_partial_write(bh); - } - } - - for (map >::iterator p = bh->waitfor_read.begin(); - p != bh->waitfor_read.end(); - p++) { - finish_contexts(p->second, -1); - } - - bc->remove_bh(bh); - delete bh; - } -} - - -void ObjectCache::clone_to(Onode *other) -{ - ObjectCache *ton = 0; - - for (map::iterator p = data.begin(); - p != data.end(); - p++) { - BufferHead *bh = p->second; - dout(10) << "clone_to ? " << *bh << endl; - if (bh->is_dirty() || bh->is_tx() || bh->is_partial()) { - // dup dirty or tx bh's - if (!ton) - ton = other->get_oc(bc); - BufferHead *nbh = new BufferHead(ton); - nbh->set_start( bh->start() ); - nbh->set_length( bh->length() ); - nbh->data = bh->data; // just copy refs to underlying buffers. - bc->add_bh(nbh); - - if (bh->is_partial()) { - dout(0) << "clone_to PARTIAL FIXME NOT FULLY IMPLEMENTED ******" << endl; - nbh->partial = bh->partial; - bc->mark_partial(nbh); - // register as shadow_partial - bc->add_shadow_partial(bh->rx_from.start, nbh); - } else { - // clean buffer will shadow - bh->add_shadow(nbh); - bc->mark_clean(nbh); - } - - dout(10) << "clone_to dup " << *bh << " -> " << *nbh << endl; - } - } -} - - - -/************** BufferCache ***************/ - -#undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs.bc." 
- - - -BufferHead *BufferCache::split(BufferHead *orig, block_t after) -{ - dout(20) << "split " << *orig << " at " << after << endl; - - // split off right - BufferHead *right = new BufferHead(orig->get_oc()); - right->set_version(orig->get_version()); - right->epoch_modified = orig->epoch_modified; - right->last_flushed = orig->last_flushed; - right->set_state(orig->get_state()); - - block_t newleftlen = after - orig->start(); - right->set_start( after ); - right->set_length( orig->length() - newleftlen ); - - // shorten left - stat_sub(orig); - orig->set_length( newleftlen ); - stat_add(orig); - - // add right - add_bh(right); - - // adjust rx_from - if (orig->is_rx()) { - right->rx_from = orig->rx_from; - orig->rx_from.length = newleftlen; - right->rx_from.length -= newleftlen; - right->rx_from.start += newleftlen; - } - - // dup shadows - for (set::iterator p = orig->shadows.begin(); - p != orig->shadows.end(); - ++p) - right->add_shadow(*p); - - // split buffers too - bufferlist bl; - bl.claim(orig->data); - if (bl.length()) { - assert(bl.length() == (orig->length()+right->length())*EBOFS_BLOCK_SIZE); - right->data.substr_of(bl, orig->length()*EBOFS_BLOCK_SIZE, right->length()*EBOFS_BLOCK_SIZE); - orig->data.substr_of(bl, 0, orig->length()*EBOFS_BLOCK_SIZE); - } - - // move read waiters - if (!orig->waitfor_read.empty()) { - map >::iterator o, p = orig->waitfor_read.end(); - p--; - while (p != orig->waitfor_read.begin()) { - if (p->first < right->start()) break; - dout(0) << "split moving waiters at block " << p->first << " to right bh" << endl; - right->waitfor_read[p->first].swap( p->second ); - o = p; - p--; - orig->waitfor_read.erase(o); - } - } - - dout(20) << "split left is " << *orig << endl; - dout(20) << "split right is " << *right << endl; - return right; -} - - -void BufferCache::bh_read(Onode *on, BufferHead *bh, block_t from) -{ - dout(10) << "bh_read " << *on << " on " << *bh << endl; - - if (bh->is_missing()) { - mark_rx(bh); - } else { - 
assert(bh->is_partial()); - } - - // get extent. there should be only one! - vector exv; - on->map_extents(bh->start(), bh->length(), exv); - assert(exv.size() == 1); - Extent ex = exv[0]; - - if (from) { // force behavior, used for reading partials - dout(10) << "bh_read forcing read from block " << from << " (for a partial)" << endl; - ex.start = from; - ex.length = 1; - } - - // this should be empty!! - assert(bh->rx_ioh == 0); - - dout(20) << "bh_read " << *on << " " << *bh << " from " << ex << endl; - - C_OC_RxFinish *fin = new C_OC_RxFinish(ebofs_lock, on->oc, - bh->start(), bh->length(), - ex.start); - - //bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), fin->bl); // new buffers! - fin->bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - - bh->rx_ioh = dev.read(ex.start, ex.length, fin->bl, - fin); - bh->rx_from = ex; - on->oc->get(); - -} - -bool BufferCache::bh_cancel_read(BufferHead *bh) -{ - if (bh->rx_ioh && dev.cancel_io(bh->rx_ioh) >= 0) { - dout(10) << "bh_cancel_read on " << *bh << endl; - bh->rx_ioh = 0; - mark_missing(bh); - int l = bh->oc->put(); - assert(l); - return true; - } - return false; -} - -void BufferCache::bh_write(Onode *on, BufferHead *bh, block_t shouldbe) -{ - dout(10) << "bh_write " << *on << " on " << *bh << " in epoch " << bh->epoch_modified << endl; - assert(bh->get_version() > 0); - - assert(bh->is_dirty()); - mark_tx(bh); - - // get extents - vector exv; - on->map_extents(bh->start(), bh->length(), exv); - assert(exv.size() == 1); - Extent ex = exv[0]; - - if (shouldbe) - assert(ex.length == 1 && ex.start == shouldbe); - - dout(20) << "bh_write " << *on << " " << *bh << " to " << ex << endl; - - //assert(bh->tx_ioh == 0); - - assert(bh->get_last_flushed() < bh->get_version()); - - bh->tx_block = ex.start; - bh->tx_ioh = dev.write(ex.start, ex.length, bh->data, - new C_OC_TxFinish(ebofs_lock, on->oc, - bh->start(), bh->length(), - bh->get_version(), - bh->epoch_modified), - "bh_write"); - - 
on->oc->get(); - inc_unflushed( EBOFS_BC_FLUSH_BHWRITE, bh->epoch_modified ); - - /* - // assert: no partials on the same block - // hose any partial on the same block - if (bh->partial_write.count(ex.start)) { - dout(10) << "bh_write hosing parital write on same block " << ex.start << " " << *bh << endl; - dec_unflushed( bh->partial_write[ex.start].epoch ); - bh->partial_write.erase(ex.start); - } - */ -} - - -bool BufferCache::bh_cancel_write(BufferHead *bh, version_t cur_epoch) -{ - if (bh->tx_ioh && dev.cancel_io(bh->tx_ioh) >= 0) { - dout(10) << "bh_cancel_write on " << *bh << endl; - bh->tx_ioh = 0; - mark_dirty(bh); - - assert(bh->epoch_modified == cur_epoch); - assert(bh->epoch_modified > 0); - dec_unflushed( EBOFS_BC_FLUSH_BHWRITE, bh->epoch_modified ); // assert.. this should be the same epoch! - - int l = bh->oc->put(); - assert(l); - return true; - } - return false; -} - -void BufferCache::tx_finish(ObjectCache *oc, - ioh_t ioh, block_t start, block_t length, - version_t version, version_t epoch) -{ - ebofs_lock.Lock(); - - // finish oc - if (oc->put() == 0) { - delete oc; - } else - oc->tx_finish(ioh, start, length, version, epoch); - - // update unflushed counter - assert(get_unflushed(EBOFS_BC_FLUSH_BHWRITE, epoch) > 0); - dec_unflushed(EBOFS_BC_FLUSH_BHWRITE, epoch); - - ebofs_lock.Unlock(); -} - -void BufferCache::rx_finish(ObjectCache *oc, - ioh_t ioh, block_t start, block_t length, - block_t diskstart, - bufferlist& bl) -{ - ebofs_lock.Lock(); - dout(10) << "rx_finish ioh " << ioh << " on " << start << "~" << length - << ", at device block " << diskstart << endl; - - // oc - if (oc->put() == 0) - delete oc; - else - oc->rx_finish(ioh, start, length, bl); - - // finish any partials? 
- // note: these are partials that were re-written after a commit, - // or for whom the OC was destroyed (eg truncated after a commit) - map >::iterator sp = partial_write.lower_bound(diskstart); - while (sp != partial_write.end()) { - if (sp->first >= diskstart+length) break; - assert(sp->first >= diskstart); - - block_t pblock = sp->first; - map writes; - writes.swap( sp->second ); - - map >::iterator t = sp; - sp++; - partial_write.erase(t); - - for (map::iterator p = writes.begin(); - p != writes.end(); - p++) { - dout(10) << "rx_finish partial from " << pblock << " -> " << p->first - << " for epoch " << p->second.epoch - //<< " (bh.epoch_modified is now " << bh->epoch_modified << ")" - << endl; - // this had better be a past epoch - //assert(p->epoch == epoch_modified - 1); // ?? - - // make the combined block - bufferlist combined; - bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - combined.push_back( bp ); - combined.copy_in((pblock-diskstart)*EBOFS_BLOCK_SIZE, (pblock-diskstart+1)*EBOFS_BLOCK_SIZE, bl); - BufferHead::apply_partial( combined, p->second.partial ); - - // write it! - dev.write( pblock, 1, combined, - new C_OC_PartialTxFinish( this, p->second.epoch ), - "finish_partials"); - } - } - - // shadow partials? 
- { - list waiters; - map >::iterator sp = shadow_partials.lower_bound(diskstart); - while (sp != shadow_partials.end()) { - if (sp->first >= diskstart+length) break; - assert(sp->first >= diskstart); - - block_t pblock = sp->first; - set ls; - ls.swap( sp->second ); - - map >::iterator t = sp; - sp++; - shadow_partials.erase(t); - - for (set::iterator p = ls.begin(); - p != ls.end(); - ++p) { - BufferHead *bh = *p; - dout(10) << "rx_finish applying shadow_partial for " << pblock - << " to " << *bh << endl; - bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - bh->data.clear(); - bh->data.push_back( bp ); - bh->data.copy_in((pblock-diskstart)*EBOFS_BLOCK_SIZE, - (pblock-diskstart+1)*EBOFS_BLOCK_SIZE, - bl); - bh->apply_partial(); - bh->set_state(BufferHead::STATE_CLEAN); - - // trigger waiters - for (map >::iterator p = bh->waitfor_read.begin(); - p != bh->waitfor_read.end(); - p++) { - assert(p->first >= bh->start() && p->first < bh->end()); - waiters.splice(waiters.begin(), p->second); - } - bh->waitfor_read.clear(); - } - } - - // kick waiters - finish_contexts(waiters); - } - - // done. 
- ebofs_lock.Unlock(); -} - -void BufferCache::partial_tx_finish(version_t epoch) -{ - ebofs_lock.Lock(); - - dout(10) << "partial_tx_finish in epoch " << epoch << endl; - - // update unflushed counter - assert(get_unflushed(EBOFS_BC_FLUSH_PARTIAL, epoch) > 0); - dec_unflushed(EBOFS_BC_FLUSH_PARTIAL, epoch); - - ebofs_lock.Unlock(); -} - - - - -void BufferCache::bh_queue_partial_write(Onode *on, BufferHead *bh) -{ - assert(bh->get_version() > 0); - - assert(bh->is_partial()); - assert(bh->length() == 1); - - // get the block no - vector exv; - on->map_extents(bh->start(), bh->length(), exv); - assert(exv.size() == 1); - block_t b = exv[0].start; - assert(exv[0].length == 1); - bh->partial_tx_to = exv[0].start; - bh->partial_tx_epoch = bh->epoch_modified; - - dout(10) << "bh_queue_partial_write " << *on << " on " << *bh << " block " << b << " epoch " << bh->epoch_modified << endl; - - - // copy map state, queue for this block - assert(bh->rx_from.length == 1); - queue_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial, bh->partial_tx_epoch ); -} - -void BufferCache::bh_cancel_partial_write(BufferHead *bh) -{ - assert(bh->is_partial()); - assert(bh->length() == 1); - - cancel_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial_tx_epoch ); -} - - -void BufferCache::queue_partial(block_t from, block_t to, - map& partial, version_t epoch) -{ - dout(10) << "queue_partial " << from << " -> " << to - << " in epoch " << epoch - << endl; - - if (partial_write[from].count(to)) { - // this should be in the same epoch. - assert( partial_write[from][to].epoch == epoch); - assert(0); // actually.. no! 
- } else { - inc_unflushed( EBOFS_BC_FLUSH_PARTIAL, epoch ); - } - - partial_write[from][to].partial = partial; - partial_write[from][to].epoch = epoch; -} - -void BufferCache::cancel_partial(block_t from, block_t to, version_t epoch) -{ - assert(partial_write.count(from)); - assert(partial_write[from].count(to)); - assert(partial_write[from][to].epoch == epoch); - - dout(10) << "cancel_partial " << from << " -> " << to - << " (was epoch " << partial_write[from][to].epoch << ")" - << endl; - - partial_write[from].erase(to); - if (partial_write[from].empty()) - partial_write.erase(from); - - dec_unflushed( EBOFS_BC_FLUSH_PARTIAL, epoch ); -} - - -void BufferCache::add_shadow_partial(block_t from, BufferHead *bh) -{ - dout(10) << "add_shadow_partial from " << from << " " << *bh << endl; - shadow_partials[from].insert(bh); -} - -void BufferCache::cancel_shadow_partial(block_t from, BufferHead *bh) -{ - dout(10) << "cancel_shadow_partial from " << from << " " << *bh << endl; - shadow_partials[from].erase(bh); -} diff --git a/tags/20070517_before_mds_merge/ebofs/BufferCache.h b/tags/20070517_before_mds_merge/ebofs/BufferCache.h deleted file mode 100644 index 563b3e5791c21..0000000000000 --- a/tags/20070517_before_mds_merge/ebofs/BufferCache.h +++ /dev/null @@ -1,697 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __EBOFS_BUFFERCACHE_H -#define __EBOFS_BUFFERCACHE_H - -#include "include/lru.h" -#include "include/Context.h" - -#include "common/Clock.h" - -#include "types.h" -#include "BlockDevice.h" - -#include "include/interval_set.h" - -class ObjectCache; -class BufferCache; -class Onode; - -class BufferHead : public LRUObject { - public: - /* - * - buffer_heads should always break across disk extent boundaries - * - partial buffer_heads are always 1 block. - */ - const static int STATE_MISSING = 0; // missing; data is on disk, but not loaded. - const static int STATE_CLEAN = 1; // Rw clean - const static int STATE_DIRTY = 2; // RW dirty - const static int STATE_TX = 3; // Rw flushing to disk - const static int STATE_RX = 4; // w reading from disk - const static int STATE_PARTIAL = 5; // reading from disk, + partial content map. always 1 block. - - public: - ObjectCache *oc; - - bufferlist data; - - ioh_t rx_ioh; // - Extent rx_from; - ioh_t tx_ioh; // - block_t tx_block; - block_t partial_tx_to; - version_t partial_tx_epoch; - - map partial; // partial dirty content overlayed onto incoming data - - map< block_t, list > waitfor_read; - - set shadows; // shadow bh's that clone()ed me. 
- BufferHead* shadow_of; - - private: - int ref; - int state; - - public: - version_t epoch_modified; - - version_t version; // current version in cache - version_t last_flushed; // last version flushed to disk - - Extent object_loc; // block position _in_object_ - - utime_t dirty_stamp; - - bool want_to_expire; // wants to be at bottom of lru - - public: - BufferHead(ObjectCache *o) : - oc(o), //cancellable_ioh(0), tx_epoch(0), - rx_ioh(0), tx_ioh(0), tx_block(0), partial_tx_to(0), partial_tx_epoch(0), - shadow_of(0), - ref(0), state(STATE_MISSING), epoch_modified(0), version(0), last_flushed(0), - want_to_expire(false) - {} - ~BufferHead() { - unpin_shadows(); - } - - ObjectCache *get_oc() { return oc; } - - int get() { - assert(ref >= 0); - if (ref == 0) lru_pin(); - return ++ref; - } - int put() { - assert(ref > 0); - if (ref == 1) lru_unpin(); - --ref; - return ref; - } - - block_t start() { return object_loc.start; } - void set_start(block_t s) { object_loc.start = s; } - block_t length() { return object_loc.length; } - void set_length(block_t l) { object_loc.length = l; } - block_t end() { return start() + length(); } - block_t last() { return end()-1; } - - version_t get_version() { return version; } - void set_version(version_t v) { version = v; } - version_t get_last_flushed() { return last_flushed; } - void set_last_flushed(version_t v) { - if (v <= last_flushed) cout << "last_flushed begin set to " << v << ", was " << last_flushed << endl; - assert(v > last_flushed); - last_flushed = v; - } - - utime_t get_dirty_stamp() { return dirty_stamp; } - void set_dirty_stamp(utime_t t) { dirty_stamp = t; } - - void set_state(int s) { - if (s == STATE_PARTIAL || s == STATE_RX || s == STATE_TX) get(); - if (state == STATE_PARTIAL || state == STATE_RX || state == STATE_TX) put(); - - if ((state == STATE_TX && s != STATE_TX) || - (state == STATE_PARTIAL && s != STATE_PARTIAL)) - unpin_shadows(); - - state = s; - } - int get_state() { return state; } - - bool 
is_missing() { return state == STATE_MISSING; } - bool is_dirty() { return state == STATE_DIRTY; } - bool is_clean() { return state == STATE_CLEAN; } - bool is_tx() { return state == STATE_TX; } - bool is_rx() { return state == STATE_RX; } - bool is_partial() { return state == STATE_PARTIAL; } - - //bool is_partial_writes() { return !partial_write.empty(); } - //void finish_partials(); - //void cancel_partials(); - //void queue_partial_write(block_t b); - - void add_shadow(BufferHead *dup) { - shadows.insert(dup); - dup->shadow_of = this; - dup->get(); - } - void remove_shadow(BufferHead *dup) { - shadows.erase(dup); - dup->shadow_of = 0; - dup->put(); - } - void unpin_shadows() { - for (set::iterator p = shadows.begin(); - p != shadows.end(); - ++p) { - //cout << "unpin shadow " << *p << endl; - (*p)->shadow_of = 0; - (*p)->put(); - } - shadows.clear(); - } - - void copy_partial_substr(off_t start, off_t end, bufferlist& bl) { - map::iterator i = partial.begin(); - - // skip first bits (fully to left) - while ((i->first + i->second.length() < start) && - i != partial.end()) - i++; - assert(i != partial.end()); - assert(i->first <= start); - - // first - unsigned bhoff = MAX(start, i->first) - i->first; - unsigned bhlen = MIN(end-start, i->second.length()); - bl.substr_of( i->second, bhoff, bhlen ); - - off_t pos = i->first + i->second.length(); - - // have continuous to end? - for (i++; i != partial.end(); i++) { - if (pos >= end) break; - assert(pos == i->first); - - pos = i->first + i->second.length(); - - if (pos <= end) { // this whole frag - bl.append( i->second ); - } else { // partial end - unsigned bhlen = end-start-bl.length(); - bufferlist frag; - frag.substr_of( i->second, 0, bhlen ); - bl.claim_append(frag); - break; // done. 
- } - } - - assert(pos >= end); - assert(bl.length() == (unsigned)(end-start)); - } - - bool have_partial_range(off_t start, off_t end) { - map::iterator i = partial.begin(); - - // skip first bits (fully to left) - while ((i->first + i->second.length() < start) && - i != partial.end()) - i++; - if (i == partial.end()) return false; - - // have start? - if (i->first > start) return false; - off_t pos = i->first + i->second.length(); - - // have continuous to end? - for (i++; i != partial.end(); i++) { - assert(pos <= i->first); - if (pos < i->first) return false; - assert(pos == i->first); - pos = i->first + i->second.length(); - if (pos >= end) break; // gone far enough - } - - if (pos >= end) return true; - return false; - } - - bool partial_is_complete(off_t size) { - return have_partial_range( 0, MIN(size, EBOFS_BLOCK_SIZE) ); - //(off_t)(start()*EBOFS_BLOCK_SIZE), - //MIN( size, (off_t)(end()*EBOFS_BLOCK_SIZE) ) ); - } - void apply_partial() { - apply_partial(data, partial); - partial.clear(); - } - static void apply_partial(bufferlist& bl, map& pm) { - assert(bl.length() == (unsigned)EBOFS_BLOCK_SIZE); - //assert(partial_is_complete()); - //cout << "apply_partial" << endl; - for (map::iterator i = pm.begin(); - i != pm.end(); - i++) { - int pos = i->first; - //cout << " frag at opos " << i->first << " bhpos " << pos << " len " << i->second.length() << endl; - bl.copy_in(pos, i->second.length(), i->second); - } - pm.clear(); - } - void add_partial(off_t off, bufferlist& p) { - unsigned len = p.length(); - assert(len <= (unsigned)EBOFS_BLOCK_SIZE); - //assert(off >= (off_t)(start()*EBOFS_BLOCK_SIZE)); - //assert(off + len <= (off_t)(end()*EBOFS_BLOCK_SIZE)); - assert(off >= 0); - assert(off + len <= EBOFS_BLOCK_SIZE); - - // trim any existing that overlaps - for (map::iterator i = partial.begin(); - i != partial.end(); - ) { - if (i->first + i->second.length() <= off) { // before - i++; - continue; - } - if (i->first >= off+len) break; // past affected area. 
- - // overlap all? - if (off <= i->first && i->first + i->second.length() <= off+len) { - // erase it and move on. - off_t dead = i->first; - i++; - partial.erase(dead); - continue; - } - // overlap tail? - else if (i->first < off && off < i->first + i->second.length()) { - // shorten. - unsigned newlen = off - i->first; - bufferlist o; - o.claim( i->second ); - i->second.substr_of(o, 0, newlen); - i++; - continue; - } - // overlap head? - else if (off < i->first && off+len < i->first + i->second.length()) { - // move. - off_t oldoff = i->first; - off_t newoff = off+len; - unsigned trim = newoff - oldoff; - partial[newoff].substr_of(i->second, trim, i->second.length()-trim); - i++; // should be at newoff! - partial.erase( oldoff ); - i++; - continue; - } else - assert(0); - } - - // insert - partial[off] = p; - } - - -}; - -inline ostream& operator<<(ostream& out, BufferHead& bh) -{ - out << "bufferhead(" << bh.start() << "~" << bh.length(); - out << " v" << bh.get_version() << "/" << bh.get_last_flushed(); - if (bh.is_missing()) out << " missing"; - if (bh.is_dirty()) out << " dirty"; - if (bh.is_clean()) out << " clean"; - if (bh.is_rx()) out << " rx"; - if (bh.is_tx()) out << " tx"; - if (bh.is_partial()) out << " partial"; - //out << " " << bh.data.length(); - out << " " << &bh; - out << ")"; - return out; -} - - -class ObjectCache { - public: - object_t object_id; - Onode *on; - BufferCache *bc; - - private: - map data; - int ref; - - public: - version_t write_count; - - - public: - ObjectCache(object_t o, Onode *_on, BufferCache *b) : - object_id(o), on(_on), bc(b), ref(0), - write_count(0) { } - ~ObjectCache() { - assert(data.empty()); - assert(ref == 0); - } - - int get() { - ++ref; - //cout << "oc.get " << object_id << " " << ref << endl; - return ref; - } - int put() { - assert(ref > 0); - --ref; - //cout << "oc.put " << object_id << " " << ref << endl; - return ref; - } - - object_t get_object_id() { return object_id; } - - void add_bh(BufferHead *bh) { 
- // add to my map - assert(data.count(bh->start()) == 0); - - if (0) { // sanity check FIXME DEBUG - //cout << "add_bh " << bh->start() << "~" << bh->length() << endl; - map::iterator p = data.lower_bound(bh->start()); - if (p != data.end()) { - //cout << " after " << *p->second << endl; - //cout << " after starts at " << p->first << endl; - assert(p->first >= bh->end()); - } - if (p != data.begin()) { - p--; - //cout << " before starts at " << p->second->start() - //<< " and ends at " << p->second->end() << endl; - //cout << " before " << *p->second << endl; - assert(p->second->end() <= bh->start()); - } - } - - data[bh->start()] = bh; - } - void remove_bh(BufferHead *bh) { - assert(data.count(bh->start())); - data.erase(bh->start()); - } - bool is_empty() { return data.empty(); } - - int find_tx(block_t start, block_t len, - list& tx); - - int map_read(block_t start, block_t len, - map& hits, // hits - map& missing, // read these from disk - map& rx, // wait for these to finish reading from disk - map& partial); // (maybe) wait for these to read from disk - int try_map_read(block_t start, block_t len); // just tell us how many extents we're missing. - - - int map_write(block_t start, block_t len, - interval_set& alloc, - map& hits, - version_t super_epoch); // can write to these. 
- void touch_bottom(block_t bstart, block_t blast); - - BufferHead *split(BufferHead *bh, block_t off); - - /*int scan_versions(block_t start, block_t len, - version_t& low, version_t& high); - */ - - void rx_finish(ioh_t ioh, block_t start, block_t length, bufferlist& bl); - void tx_finish(ioh_t ioh, block_t start, block_t length, version_t v, version_t epoch); - - void truncate(block_t blocks, version_t super_epoch); - // void tear_down(); - - void clone_to(Onode *other); - - void dump() { - for (map::iterator i = data.begin(); - i != data.end(); - i++) - cout << "dump: " << i->first << ": " << *i->second << endl; - } - -}; - - - -class BufferCache { - public: - Mutex &ebofs_lock; // hack: this is a ref to global ebofs_lock - BlockDevice &dev; - - set dirty_bh; - - LRU lru_dirty, lru_rest; - - private: - Cond stat_cond; - Cond flush_cond; - int stat_waiter; - - off_t stat_clean; - off_t stat_dirty; - off_t stat_rx; - off_t stat_tx; - off_t stat_partial; - off_t stat_missing; - -#define EBOFS_BC_FLUSH_BHWRITE 0 -#define EBOFS_BC_FLUSH_PARTIAL 1 - - map epoch_unflushed[2]; - - /* partial writes - incomplete blocks that can't be written until - * their prior content is read and overlayed with the new data. - * - * we put partial block management here because objects may be deleted - * before the read completes, but the write may have been committed in a - * prior epoch. - * - * we map: src block -> dest block -> PartialWrite - * - * really, at most there will only ever be two of these, for current+previous epochs. 
- */ - class PartialWrite { - public: - map partial; // partial dirty content overlayed onto incoming data - version_t epoch; - }; - - map > partial_write; // queued writes w/ partial content - map > shadow_partials; - - public: - BufferCache(BlockDevice& d, Mutex& el) : - ebofs_lock(el), dev(d), - stat_waiter(0), - stat_clean(0), stat_dirty(0), stat_rx(0), stat_tx(0), stat_partial(0), stat_missing(0) - {} - - - off_t get_size() { - return stat_clean+stat_dirty+stat_rx+stat_tx+stat_partial; - } - off_t get_trimmable() { - return stat_clean; - } - - - // bh's in cache - void add_bh(BufferHead *bh) { - bh->get_oc()->add_bh(bh); - if (bh->is_dirty()) { - lru_dirty.lru_insert_mid(bh); - dirty_bh.insert(bh); - } else - lru_rest.lru_insert_mid(bh); - stat_add(bh); - } - void touch(BufferHead *bh) { - if (bh->is_dirty()) { - lru_dirty.lru_touch(bh); - } else - lru_rest.lru_touch(bh); - } - void touch_bottom(BufferHead *bh) { - if (bh->is_dirty()) { - bh->want_to_expire = true; - lru_dirty.lru_bottouch(bh); - } else - lru_rest.lru_bottouch(bh); - } - void remove_bh(BufferHead *bh) { - bh->get_oc()->remove_bh(bh); - stat_sub(bh); - if (bh->is_dirty()) { - lru_dirty.lru_remove(bh); - dirty_bh.erase(bh); - } else - lru_rest.lru_remove(bh); - } - - // stats - void stat_add(BufferHead *bh) { - switch (bh->get_state()) { - case BufferHead::STATE_MISSING: stat_missing += bh->length(); break; - case BufferHead::STATE_CLEAN: stat_clean += bh->length(); break; - case BufferHead::STATE_DIRTY: stat_dirty += bh->length(); break; - case BufferHead::STATE_TX: stat_tx += bh->length(); break; - case BufferHead::STATE_RX: stat_rx += bh->length(); break; - case BufferHead::STATE_PARTIAL: stat_partial += bh->length(); break; - } - if (stat_waiter) stat_cond.Signal(); - } - void stat_sub(BufferHead *bh) { - switch (bh->get_state()) { - case BufferHead::STATE_MISSING: stat_missing -= bh->length(); break; - case BufferHead::STATE_CLEAN: stat_clean -= bh->length(); break; - case 
BufferHead::STATE_DIRTY: stat_dirty -= bh->length(); break; - case BufferHead::STATE_TX: stat_tx -= bh->length(); break; - case BufferHead::STATE_RX: stat_rx -= bh->length(); break; - case BufferHead::STATE_PARTIAL: stat_partial -= bh->length(); break; - } - } - off_t get_stat_tx() { return stat_tx; } - off_t get_stat_rx() { return stat_rx; } - off_t get_stat_dirty() { return stat_dirty; } - off_t get_stat_clean() { return stat_clean; } - off_t get_stat_partial() { return stat_partial; } - - - map &get_unflushed(int what) { - return epoch_unflushed[what]; - } - - int get_unflushed(int what, version_t epoch) { - return epoch_unflushed[what][epoch]; - } - void inc_unflushed(int what, version_t epoch) { - epoch_unflushed[what][epoch]++; - //cout << "inc_unflushed " << epoch << " now " << epoch_unflushed[epoch] << endl; - } - void dec_unflushed(int what, version_t epoch) { - epoch_unflushed[what][epoch]--; - //cout << "dec_unflushed " << epoch << " now " << epoch_unflushed[epoch] << endl; - if (epoch_unflushed[what][epoch] == 0) - flush_cond.Signal(); - } - - void waitfor_stat() { - stat_waiter++; - stat_cond.Wait(ebofs_lock); - stat_waiter--; - } - void waitfor_flush() { - flush_cond.Wait(ebofs_lock); - } - - - // bh state - void set_state(BufferHead *bh, int s) { - // move between lru lists? 
- if (s == BufferHead::STATE_DIRTY && bh->get_state() != BufferHead::STATE_DIRTY) { - lru_rest.lru_remove(bh); - lru_dirty.lru_insert_top(bh); - dirty_bh.insert(bh); - } - if (s != BufferHead::STATE_DIRTY && bh->get_state() == BufferHead::STATE_DIRTY) { - lru_dirty.lru_remove(bh); - if (bh->want_to_expire) - lru_rest.lru_insert_bot(bh); - else - lru_rest.lru_insert_mid(bh); - dirty_bh.erase(bh); - } - - // set state - stat_sub(bh); - bh->set_state(s); - stat_add(bh); - } - - void copy_state(BufferHead *bh1, BufferHead *bh2) { - set_state(bh2, bh1->get_state()); - } - - void mark_missing(BufferHead *bh) { set_state(bh, BufferHead::STATE_MISSING); }; - void mark_clean(BufferHead *bh) { set_state(bh, BufferHead::STATE_CLEAN); }; - void mark_rx(BufferHead *bh) { set_state(bh, BufferHead::STATE_RX); }; - void mark_partial(BufferHead *bh) { set_state(bh, BufferHead::STATE_PARTIAL); }; - void mark_tx(BufferHead *bh) { set_state(bh, BufferHead::STATE_TX); }; - void mark_dirty(BufferHead *bh) { - set_state(bh, BufferHead::STATE_DIRTY); - bh->set_dirty_stamp(g_clock.now()); - }; - - - // io - void bh_read(Onode *on, BufferHead *bh, block_t from=0); - void bh_write(Onode *on, BufferHead *bh, block_t shouldbe=0); - - bool bh_cancel_read(BufferHead *bh); - bool bh_cancel_write(BufferHead *bh, version_t cur_epoch); - - void bh_queue_partial_write(Onode *on, BufferHead *bh); - void bh_cancel_partial_write(BufferHead *bh); - - void queue_partial(block_t from, block_t to, map& partial, version_t epoch); - void cancel_partial(block_t from, block_t to, version_t epoch); - - void add_shadow_partial(block_t from, BufferHead *bh); - void cancel_shadow_partial(block_t from, BufferHead *bh); - - void rx_finish(ObjectCache *oc, ioh_t ioh, block_t start, block_t len, block_t diskstart, bufferlist& bl); - void tx_finish(ObjectCache *oc, ioh_t ioh, block_t start, block_t len, version_t v, version_t e); - void partial_tx_finish(version_t epoch); - - friend class C_E_FlushPartial; - - // bh fun 
- BufferHead *split(BufferHead *orig, block_t after); -}; - - -class C_OC_RxFinish : public BlockDevice::callback { - Mutex &lock; - ObjectCache *oc; - block_t start, length; - block_t diskstart; -public: - bufferlist bl; - C_OC_RxFinish(Mutex &m, ObjectCache *o, block_t s, block_t l, block_t ds) : - lock(m), oc(o), start(s), length(l), diskstart(ds) {} - void finish(ioh_t ioh, int r) { - oc->bc->rx_finish(oc, ioh, start, length, diskstart, bl); - } -}; - -class C_OC_TxFinish : public BlockDevice::callback { - Mutex &lock; - ObjectCache *oc; - block_t start, length; - version_t version; - version_t epoch; - public: - C_OC_TxFinish(Mutex &m, ObjectCache *o, block_t s, block_t l, version_t v, version_t e) : - lock(m), oc(o), start(s), length(l), version(v), epoch(e) {} - void finish(ioh_t ioh, int r) { - oc->bc->tx_finish(oc, ioh, start, length, version, epoch); - } -}; - -class C_OC_PartialTxFinish : public BlockDevice::callback { - BufferCache *bc; - version_t epoch; -public: - C_OC_PartialTxFinish(BufferCache *b, version_t e) : - bc(b), epoch(e) {} - void finish(ioh_t ioh, int r) { - bc->partial_tx_finish(epoch); - } -}; - - -#endif diff --git a/tags/20070517_before_mds_merge/ebofs/Cnode.h b/tags/20070517_before_mds_merge/ebofs/Cnode.h deleted file mode 100644 index b906a6db24c57..0000000000000 --- a/tags/20070517_before_mds_merge/ebofs/Cnode.h +++ /dev/null @@ -1,100 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_CNODE_H -#define __EBOFS_CNODE_H - -#include "Onode.h" - -/* - * collection node - * - * holds attribute metadata for collections. 
- * colletion membership is stored in b+tree tables, independent of tte cnode. - */ - -class Cnode : public LRUObject -{ - private: - int ref; - bool dirty; - - public: - coll_t coll_id; - Extent cnode_loc; - - map attr; - - public: - Cnode(coll_t cid) : ref(0), dirty(false), coll_id(cid) { - cnode_loc.length = 0; - } - ~Cnode() { - } - - block_t get_cnode_id() { return cnode_loc.start; } - int get_cnode_len() { return cnode_loc.length; } - - void get() { - if (ref == 0) lru_pin(); - ref++; - } - void put() { - ref--; - if (ref == 0) lru_unpin(); - } - int get_ref_count() { return ref; } - - void mark_dirty() { - if (!dirty) { - dirty = true; - get(); - } - } - void mark_clean() { - if (dirty) { - dirty = false; - put(); - } - } - bool is_dirty() { return dirty; } - - - int get_attr_bytes() { - int s = 0; - for (map::iterator i = attr.begin(); - i != attr.end(); - i++) { - s += i->first.length() + 1; - s += i->second.length() + sizeof(int); - } - return s; - } - - // - //???void clear(); - - -}; - -inline ostream& operator<<(ostream& out, Cnode& cn) -{ - out << "cnode(" << hex << cn.coll_id << dec; - if (cn.is_dirty()) out << " dirty"; - //out << " " << &cn; - out << ")"; - return out; -} - -#endif diff --git a/tags/20070517_before_mds_merge/ebofs/Ebofs.cc b/tags/20070517_before_mds_merge/ebofs/Ebofs.cc deleted file mode 100644 index 2008d1961bfae..0000000000000 --- a/tags/20070517_before_mds_merge/ebofs/Ebofs.cc +++ /dev/null @@ -1,3270 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "Ebofs.h" - -#include - -#ifndef DARWIN -#include -#else -#include -#include -#endif // DARWIN - -// ******************* - -#undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs(" << dev.get_device_name() << ")." -#define derr(x) if (x <= g_conf.debug_ebofs) cerr << "ebofs(" << dev.get_device_name() << ")." - - -char *nice_blocks(block_t b) -{ - static char s[20]; - float sz = b*4.0; - if (sz > (10 << 20)) - sprintf(s,"%.1f GB", sz / (1024.0*1024.0)); - else if (sz > (10 << 10)) - sprintf(s,"%.1f MB", sz / (1024.0)); - else - sprintf(s,"%llu KB", b*4ULL); - return s; -} - -int Ebofs::mount() -{ - ebofs_lock.Lock(); - assert(!mounted); - - int r = dev.open(&idle_kicker); - if (r < 0) { - ebofs_lock.Unlock(); - return r; - } - - dout(2) << "mounting " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << endl; - - // read super - bufferptr bp1 = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - bufferptr bp2 = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); - dev.read(0, 1, bp1); - dev.read(1, 1, bp2); - - struct ebofs_super *sb1 = (struct ebofs_super*)bp1.c_str(); - struct ebofs_super *sb2 = (struct ebofs_super*)bp2.c_str(); - dout(3) << "mount super @0 epoch " << sb1->epoch << endl; - dout(3) << "mount super @1 epoch " << sb2->epoch << endl; - - // pick newest super - struct ebofs_super *sb = 0; - if (sb1->epoch > sb2->epoch) - sb = sb1; - else - sb = sb2; - super_epoch = sb->epoch; - dout(3) << "mount epoch " << super_epoch << endl; - assert(super_epoch == sb->epoch); - - free_blocks = sb->free_blocks; - limbo_blocks = sb->limbo_blocks; - - // init node pools - dout(3) << "mount nodepool" << endl; - nodepool.init( &sb->nodepool ); - nodepool.read_usemap( dev, super_epoch ); - nodepool.read_clean_nodes( dev ); - - // open tables - dout(3) << "mount opening tables" << endl; - object_tab = new Table( nodepool, sb->object_tab ); - for (int i=0; i( nodepool, 
sb->free_tab[i] ); - limbo_tab = new Table( nodepool, sb->limbo_tab ); - alloc_tab = new Table >( nodepool, sb->alloc_tab ); - - collection_tab = new Table( nodepool, sb->collection_tab ); - co_tab = new Table( nodepool, sb->co_tab ); - - allocator.release_limbo(); - - dout(3) << "mount starting commit+finisher threads" << endl; - commit_thread.create(); - finisher_thread.create(); - - dout(1) << "mounted " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << endl; - mounted = true; - - ebofs_lock.Unlock(); - return 0; -} - - -int Ebofs::mkfs() -{ - ebofs_lock.Lock(); - assert(!mounted); - - int r = dev.open(); - if (r < 0) { - ebofs_lock.Unlock(); - return r; - } - - block_t num_blocks = dev.get_num_blocks(); - - free_blocks = 0; - limbo_blocks = 0; - - // create first noderegion - Extent nr; - nr.start = 2; - nr.length = 20+ (num_blocks / 1000); - if (nr.length < 10) nr.length = 10; - nodepool.add_region(nr); - dout(10) << "mkfs: first node region at " << nr << endl; - - // allocate two usemaps - block_t usemap_len = nodepool.get_usemap_len(); - nodepool.usemap_even.start = nr.end(); - nodepool.usemap_even.length = usemap_len; - nodepool.usemap_odd.start = nodepool.usemap_even.end(); - nodepool.usemap_odd.length = usemap_len; - dout(10) << "mkfs: even usemap at " << nodepool.usemap_even << endl; - dout(10) << "mkfs: odd usemap at " << nodepool.usemap_odd << endl; - - // init tables - struct ebofs_table empty; - empty.num_keys = 0; - empty.root = -1; - empty.depth = 0; - - object_tab = new Table( nodepool, empty ); - collection_tab = new Table( nodepool, empty ); - - for (int i=0; i( nodepool, empty ); - limbo_tab = new Table( nodepool, empty ); - alloc_tab = new Table >( nodepool, empty ); - - co_tab = new Table( nodepool, empty ); - - // add free space - Extent left; - left.start = nodepool.usemap_odd.end(); - left.length = num_blocks - left.start; - dout(10) << "mkfs: free data blocks at " << left << 
endl; - allocator._release_into_limbo( left ); - if (g_conf.ebofs_cloneable) { - allocator.alloc_inc(nr); - allocator.alloc_inc(nodepool.usemap_even); - allocator.alloc_inc(nodepool.usemap_odd); - } - allocator.commit_limbo(); // -> limbo_tab - allocator.release_limbo(); // -> free_tab - - // write nodes, super, 2x - dout(10) << "mkfs: flushing nodepool and superblocks (2x)" << endl; - - nodepool.commit_start( dev, 0 ); - nodepool.commit_wait(); - bufferptr superbp0; - prepare_super(0, superbp0); - write_super(0, superbp0); - - nodepool.commit_start( dev, 1 ); - nodepool.commit_wait(); - bufferptr superbp1; - prepare_super(1, superbp1); - write_super(1, superbp1); - - // free memory - dout(10) << "mkfs: cleaning up" << endl; - close_tables(); - - dev.close(); - - dout(2) << "mkfs: " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << endl; - ebofs_lock.Unlock(); - return 0; -} - -void Ebofs::close_tables() -{ - // close tables - delete object_tab; - for (int i=0; i::iterator i = onode_map.begin(); - i != onode_map.end(); - i++) { - dout(0) << "umount *** leftover: " << i->first << " " << *(i->second) << endl; - } - - // free memory - dout(5) << "umount cleaning up" << endl; - close_tables(); - dev.close(); - readonly = unmounting = mounted = false; - - dout(1) << "umount done on " << dev.get_device_name() << endl; - ebofs_lock.Unlock(); - return 0; -} - - - -void Ebofs::prepare_super(version_t epoch, bufferptr& bp) -{ - struct ebofs_super sb; - - dout(10) << "prepare_super v" << epoch << endl; - - // fill in super - memset(&sb, 0, sizeof(sb)); - sb.s_magic = EBOFS_MAGIC; - sb.epoch = epoch; - sb.num_blocks = dev.get_num_blocks(); - - sb.free_blocks = free_blocks; - sb.limbo_blocks = limbo_blocks; - - - // tables - sb.object_tab.num_keys = object_tab->get_num_keys(); - sb.object_tab.root = object_tab->get_root(); - sb.object_tab.depth = object_tab->get_depth(); - - for (int i=0; iget_num_keys(); - 
sb.free_tab[i].root = free_tab[i]->get_root(); - sb.free_tab[i].depth = free_tab[i]->get_depth(); - } - sb.limbo_tab.num_keys = limbo_tab->get_num_keys(); - sb.limbo_tab.root = limbo_tab->get_root(); - sb.limbo_tab.depth = limbo_tab->get_depth(); - - sb.alloc_tab.num_keys = alloc_tab->get_num_keys(); - sb.alloc_tab.root = alloc_tab->get_root(); - sb.alloc_tab.depth = alloc_tab->get_depth(); - - sb.collection_tab.num_keys = collection_tab->get_num_keys(); - sb.collection_tab.root = collection_tab->get_root(); - sb.collection_tab.depth = collection_tab->get_depth(); - - sb.co_tab.num_keys = co_tab->get_num_keys(); - sb.co_tab.root = co_tab->get_root(); - sb.co_tab.depth = co_tab->get_depth(); - - // pools - sb.nodepool.num_regions = nodepool.region_loc.size(); - for (unsigned i=0; i 0) { - // periodically check for idle block device - dout(20) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms, " - << g_conf.ebofs_idle_commit_ms << " ms if idle" << endl; - long left = g_conf.ebofs_commit_ms; - while (left > 0) { - long next = MIN(left, g_conf.ebofs_idle_commit_ms); - if (commit_cond.WaitInterval(ebofs_lock, utime_t(0, next*1000)) != ETIMEDOUT) - break; // we got kicked - if (dev.is_idle()) { - dout(20) << "commit_thread bdev is idle, early commit" << endl; - break; // dev is idle - } - left -= next; - dout(20) << "commit_thread " << left << " ms left" << endl; - - // hack hack - //if (!left) g_conf.debug_ebofs = 10; - // /hack hack - } - } else { - // normal wait+timeout - dout(20) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms" << endl; - commit_cond.WaitInterval(ebofs_lock, utime_t(0, g_conf.ebofs_commit_ms*1000)); - } - - } else { - // DEBUG.. 
wait until kicked - dout(10) << "commit_thread no commit_ms, waiting until kicked" << endl; - commit_cond.Wait(ebofs_lock); - } - - if (unmounting) { - dout(10) << "commit_thread unmounting: final commit pass" << endl; - assert(readonly); - unmounting = false; - mounted = false; - dirty = true; - } - - if (!dirty && !limbo_blocks) { - dout(10) << "commit_thread not dirty" << endl; - } - else { - super_epoch++; - dirty = false; - - dout(10) << "commit_thread commit start, new epoch " << super_epoch << endl; - dout(2) << "commit_thread data: " - << 100*(dev.get_num_blocks()-get_free_blocks())/dev.get_num_blocks() << "% used, " - << get_free_blocks() << " (" << 100*get_free_blocks()/dev.get_num_blocks() - << "%) free in " << get_free_extents() - << ", " << get_limbo_blocks() << " (" << 100*get_limbo_blocks()/dev.get_num_blocks() - << "%) limbo in " << get_limbo_extents() - << endl; - dout(2) << "commit_thread nodes: " - << 100*nodepool.num_used()/nodepool.num_total() << "% used, " - << nodepool.num_free() << " (" << 100*nodepool.num_free()/nodepool.num_total() << "%) free, " - << nodepool.num_limbo() << " (" << 100*nodepool.num_limbo()/nodepool.num_total() << "%) limbo, " - << nodepool.num_total() << " total." << endl; - dout(2) << "commit_thread bc: " - << "size " << bc.get_size() - << ", trimmable " << bc.get_trimmable() - << ", max " << g_conf.ebofs_bc_size - << "; dirty " << bc.get_stat_dirty() - << ", tx " << bc.get_stat_tx() - << ", max dirty " << g_conf.ebofs_bc_max_dirty - << endl; - - - // (async) write onodes+condes (do this first; it currently involves inode reallocation) - commit_inodes_start(); - - allocator.commit_limbo(); // limbo -> limbo_tab - - // (async) write btree nodes - nodepool.commit_start( dev, super_epoch ); - - // blockdev barrier (prioritize our writes!) - dout(30) << "commit_thread barrier. flushing inodes " << inodes_flushing << endl; - dev.barrier(); - - // prepare super (before any changes get made!) 
- bufferptr superbp; - prepare_super(super_epoch, superbp); - - // wait for it all to flush (drops global lock) - commit_bc_wait(super_epoch-1); - dout(30) << "commit_thread bc flushed" << endl; - commit_inodes_wait(); - dout(30) << "commit_thread inodes flushed" << endl; - nodepool.commit_wait(); - dout(30) << "commit_thread btree nodes flushed" << endl; - - // ok, now (synchronously) write the prior super! - dout(10) << "commit_thread commit flushed, writing super for prior epoch" << endl; - ebofs_lock.Unlock(); - write_super(super_epoch, superbp); - ebofs_lock.Lock(); - - dout(10) << "commit_thread wrote super" << endl; - - // free limbo space now - // (since we're done allocating things, - // AND we've flushed all previous epoch data) - allocator.release_limbo(); // limbo_tab -> free_tabs - - // do we need more node space? - if (nodepool.num_free() < nodepool.num_total() / 3) { - dout(2) << "commit_thread running low on node space, allocating more." << endl; - alloc_more_node_space(); - } - - // kick waiters - dout(10) << "commit_thread queueing commit + kicking sync waiters" << endl; - - finisher_lock.Lock(); - finisher_queue.splice(finisher_queue.end(), commit_waiters[super_epoch-1]); - commit_waiters.erase(super_epoch-1); - finisher_cond.Signal(); - finisher_lock.Unlock(); - - sync_cond.Signal(); - - dout(10) << "commit_thread commit finish" << endl; - } - - // trim bc? 
- trim_bc(); - trim_inodes(); - - } - - dout(10) << "commit_thread finish" << endl; - commit_thread_started = false; - ebofs_lock.Unlock(); - return 0; -} - - -void Ebofs::alloc_more_node_space() -{ - dout(1) << "alloc_more_node_space free " << nodepool.num_free() << "/" << nodepool.num_total() << endl; - - if (nodepool.num_regions() < EBOFS_MAX_NODE_REGIONS) { - int want = nodepool.num_total(); - - Extent ex; - allocator.allocate(ex, want, 2); - dout(1) << "alloc_more_node_space wants " << want << " more, got " << ex << endl; - - Extent even, odd; - unsigned ulen = nodepool.get_usemap_len(nodepool.num_total() + ex.length); - allocator.allocate(even, ulen, 2); - allocator.allocate(odd, ulen, 2); - dout(1) << "alloc_more_node_space maps need " << ulen << " x2, got " << even << " " << odd << endl; - - if (even.length == ulen && odd.length == ulen) { - dout(1) << "alloc_more_node_space got " << ex << ", new usemaps at even " << even << " odd " << odd << endl; - allocator.release(nodepool.usemap_even); - allocator.release(nodepool.usemap_odd); - nodepool.add_region(ex); - nodepool.usemap_even = even; - nodepool.usemap_odd = odd; - } else { - dout (1) << "alloc_more_node_space failed to get space for new usemaps" << endl; - allocator.release(ex); - allocator.release(even); - allocator.release(odd); - //assert(0); - } - } else { - dout(1) << "alloc_more_node_space already have max node regions!" << endl; - assert(0); - } -} - - -void *Ebofs::finisher_thread_entry() -{ - finisher_lock.Lock(); - dout(10) << "finisher_thread start" << endl; - - while (!finisher_stop) { - while (!finisher_queue.empty()) { - list ls; - ls.swap(finisher_queue); - - finisher_lock.Unlock(); - - //ebofs_lock.Lock(); // um.. why lock this? 
-sage - finish_contexts(ls, 0); - //ebofs_lock.Unlock(); - - finisher_lock.Lock(); - } - if (finisher_stop) break; - - dout(30) << "finisher_thread sleeping" << endl; - finisher_cond.Wait(finisher_lock); - } - - dout(10) << "finisher_thread start" << endl; - finisher_lock.Unlock(); - return 0; -} - - -// *** onodes *** - -Onode* Ebofs::new_onode(object_t oid) -{ - Onode* on = new Onode(oid); - - assert(onode_map.count(oid) == 0); - onode_map[oid] = on; - onode_lru.lru_insert_top(on); - - assert(object_tab->lookup(oid) < 0); - object_tab->insert( oid, on->onode_loc ); // even tho i'm not placed yet - - on->get(); - on->onode_loc.start = 0; - on->onode_loc.length = 0; - - dirty_onode(on); - - dout(7) << "new_onode " << *on << endl; - return on; -} - - -Onode* Ebofs::get_onode(object_t oid) -{ - while (1) { - // in cache? - if (have_onode(oid)) { - // yay - Onode *on = onode_map[oid]; - on->get(); - //cout << "get_onode " << *on << endl; - return on; - } - - // on disk? - Extent onode_loc; - if (object_tab->lookup(oid, onode_loc) < 0) { - dout(10) << "onode lookup failed on " << oid << endl; - // object dne. - return 0; - } - - // already loading? - if (waitfor_onode.count(oid)) { - // yep, just wait. - Cond c; - waitfor_onode[oid].push_back(&c); - dout(10) << "get_onode " << oid << " already loading, waiting" << endl; - c.Wait(ebofs_lock); - continue; - } - - dout(10) << "get_onode reading " << oid << " from " << onode_loc << endl; - - assert(waitfor_onode.count(oid) == 0); - waitfor_onode[oid].clear(); // this should be empty initially. - - // read it! 
- bufferlist bl; - bl.push_back( buffer::create_page_aligned( EBOFS_BLOCK_SIZE*onode_loc.length ) ); - - ebofs_lock.Unlock(); - dev.read( onode_loc.start, onode_loc.length, bl ); - ebofs_lock.Lock(); - - // add onode - Onode *on = new Onode(oid); - onode_map[oid] = on; - onode_lru.lru_insert_top(on); - - // parse data block - struct ebofs_onode *eo = (struct ebofs_onode*)bl.c_str(); - if (eo->object_id != oid) { - cerr << " wrong oid in onode block: " << eo->object_id << " != " << oid << endl; - cerr << " onode_loc is " << eo->onode_loc << endl; - cerr << " object_size " << eo->object_size << endl; - cerr << " object_blocks " << eo->object_blocks << endl; - cerr << " " << eo->num_collections << " coll + " - << eo->num_attr << " attr + " - << eo->num_extents << " extents" << endl; - assert(eo->object_id == oid); - } - on->readonly = eo->readonly; - on->onode_loc = eo->onode_loc; - on->object_size = eo->object_size; - on->object_blocks = eo->object_blocks; - - // parse - char *p = bl.c_str() + sizeof(*eo); - - // parse collection list - for (int i=0; inum_collections; i++) { - coll_t c = *((coll_t*)p); - p += sizeof(c); - on->collections.insert(c); - } - - // parse attributes - for (int i=0; inum_attr; i++) { - string key = p; - p += key.length() + 1; - int len = *(int*)(p); - p += sizeof(len); - on->attr[key] = buffer::copy(p, len); - p += len; - dout(15) << "get_onode " << *on << " attr " << key << " len " << len << endl; - } - - // parse extents - on->extent_map.clear(); - block_t n = 0; - for (int i=0; inum_extents; i++) { - Extent ex = *((Extent*)p); - on->extent_map[n] = ex; - dout(15) << "get_onode " << *on << " ex " << i << ": " << ex << endl; - n += ex.length; - p += sizeof(Extent); - } - assert(n == on->object_blocks); - - // wake up other waiters - for (list::iterator i = waitfor_onode[oid].begin(); - i != waitfor_onode[oid].end(); - i++) - (*i)->Signal(); - waitfor_onode.erase(oid); // remove Cond list - - on->get(); - //cout << "get_onode " << *on << " 
(loaded)" << endl; - return on; - } -} - - -class C_E_InodeFlush : public BlockDevice::callback { - Ebofs *ebofs; -public: - C_E_InodeFlush(Ebofs *e) : ebofs(e) {} - void finish(ioh_t ioh, int r) { - ebofs->flush_inode_finish(); - } -}; - - -void Ebofs::encode_onode(Onode *on, bufferlist& bl, unsigned& off) -{ - // onode - struct ebofs_onode eo; - eo.readonly = on->readonly; - eo.onode_loc = on->onode_loc; - eo.object_id = on->object_id; - eo.object_size = on->object_size; - eo.object_blocks = on->object_blocks; - eo.num_collections = on->collections.size(); - eo.num_attr = on->attr.size(); - eo.num_extents = on->extent_map.size(); - bl.copy_in(off, sizeof(eo), (char*)&eo); - off += sizeof(eo); - - // collections - for (set::iterator i = on->collections.begin(); - i != on->collections.end(); - i++) { - bl.copy_in(off, sizeof(*i), (char*)&(*i)); - off += sizeof(*i); - } - - // attr - for (map::iterator i = on->attr.begin(); - i != on->attr.end(); - i++) { - bl.copy_in(off, i->first.length()+1, i->first.c_str()); - off += i->first.length()+1; - int l = i->second.length(); - bl.copy_in(off, sizeof(int), (char*)&l); - off += sizeof(int); - bl.copy_in(off, l, i->second.c_str()); - off += l; - dout(15) << "write_onode " << *on << " attr " << i->first << " len " << l << endl; - } - - // extents - for (map::iterator i = on->extent_map.begin(); - i != on->extent_map.end(); - i++) { - bl.copy_in(off, sizeof(Extent), (char*)&(i->second)); - off += sizeof(Extent); - dout(15) << "write_onode " << *on << " ex " << i->first << ": " << i->second << endl; - } -} - -void Ebofs::write_onode(Onode *on) -{ - // buffer - unsigned bytes = sizeof(ebofs_onode) + on->get_collection_bytes() + on->get_attr_bytes() + on->get_extent_bytes(); - unsigned blocks = (bytes-1)/EBOFS_BLOCK_SIZE + 1; - - bufferlist bl; - bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*blocks) ); - - // (always) relocate onode - if (1) { - if (on->onode_loc.length) - allocator.release(on->onode_loc); - - 
block_t first = 0; - if (on->extent_map.size()) - first = on->extent_map.begin()->second.start; - - allocator.allocate(on->onode_loc, blocks, first); - object_tab->remove( on->object_id ); - object_tab->insert( on->object_id, on->onode_loc ); - //object_tab->verify(); - } - - dout(10) << "write_onode " << *on << " to " << on->onode_loc << endl; - - unsigned off = 0; - encode_onode(on, bl, off); - assert(off == bytes); - - // write - dev.write( on->onode_loc.start, on->onode_loc.length, bl, - new C_E_InodeFlush(this), "write_onode" ); -} - -void Ebofs::remove_onode(Onode *on) -{ - dout(8) << "remove_onode " << *on << endl; - - assert(on->get_ref_count() >= 1); // caller - - // tear down buffer cache - if (on->oc) { - on->oc->truncate(0, super_epoch); // this will kick readers along the way. - on->close_oc(); - } - - // remove from onode map, mark dangling/deleted - onode_map.erase(on->object_id); - onode_lru.lru_remove(on); - on->deleted = true; - on->dangling = true; - - // remove from object table - //dout(0) << "remove_onode on " << *on << endl; - object_tab->remove(on->object_id); - - // free onode space - if (on->onode_loc.length) - allocator.release(on->onode_loc); - - // free data space - for (map::iterator i = on->extent_map.begin(); - i != on->extent_map.end(); - i++) - allocator.release(i->second); - on->extent_map.clear(); - - // remove from collections - for (set::iterator i = on->collections.begin(); - i != on->collections.end(); - i++) { - co_tab->remove(coll_object_t(*i,on->object_id)); - } - on->collections.clear(); - - // dirty -> clean? 
- if (on->is_dirty()) { - on->mark_clean(); // this unpins *on - dirty_onodes.erase(on); - } - - if (on->get_ref_count() > 1) cout << "remove_onode **** will survive " << *on << endl; - put_onode(on); - - dirty = true; -} - -void Ebofs::put_onode(Onode *on) -{ - on->put(); - //cout << "put_onode " << *on << endl; - - if (on->get_ref_count() == 0 && on->dangling) { - //cout << " *** hosing on " << *on << endl; - delete on; - } -} - -void Ebofs::dirty_onode(Onode *on) -{ - if (!on->is_dirty()) { - on->mark_dirty(); - dirty_onodes.insert(on); - } - dirty = true; -} - -void Ebofs::trim_inodes(int max) -{ - unsigned omax = onode_lru.lru_get_max(); - unsigned cmax = cnode_lru.lru_get_max(); - if (max >= 0) omax = cmax = max; - dout(10) << "trim_inodes start " << onode_lru.lru_get_size() << " / " << omax << " onodes, " - << cnode_lru.lru_get_size() << " / " << cmax << " cnodes" << endl; - - // onodes - while (onode_lru.lru_get_size() > omax) { - // expire an item - Onode *on = (Onode*)onode_lru.lru_expire(); - if (on == 0) break; // nothing to expire - - // expire - dout(20) << "trim_inodes removing onode " << *on << endl; - onode_map.erase(on->object_id); - on->dangling = true; - - if (on->get_ref_count() == 0) { - assert(on->oc == 0); // an open oc pins the onode! - delete on; - } else { - dout(-20) << "trim_inodes still active: " << *on << endl; - assert(0); // huh? 
- } - } - - - // cnodes - while (cnode_lru.lru_get_size() > cmax) { - // expire an item - Cnode *cn = (Cnode*)cnode_lru.lru_expire(); - if (cn == 0) break; // nothing to expire - - // expire - dout(20) << "trim_inodes removing cnode " << *cn << endl; - cnode_map.erase(cn->coll_id); - - delete cn; - } - - dout(10) << "trim_inodes finish " - << onode_lru.lru_get_size() << " / " << omax << " onodes, " - << cnode_lru.lru_get_size() << " / " << cmax << " cnodes" << endl; -} - - - -// *** cnodes **** - -Cnode* Ebofs::new_cnode(coll_t cid) -{ - Cnode* cn = new Cnode(cid); - - assert(cnode_map.count(cid) == 0); - cnode_map[cid] = cn; - cnode_lru.lru_insert_top(cn); - - assert(collection_tab->lookup(cid) < 0); - collection_tab->insert( cid, cn->cnode_loc ); // even tho i'm not placed yet - - cn->get(); - cn->cnode_loc.start = 0; - cn->cnode_loc.length = 0; - - dirty_cnode(cn); - - return cn; -} - -Cnode* Ebofs::get_cnode(coll_t cid) -{ - while (1) { - // in cache? - if (cnode_map.count(cid)) { - // yay - Cnode *cn = cnode_map[cid]; - cn->get(); - return cn; - } - - // on disk? - Extent cnode_loc; - if (collection_tab->lookup(cid, cnode_loc) < 0) { - // object dne. - return 0; - } - - // already loading? - if (waitfor_cnode.count(cid)) { - // yep, just wait. - Cond c; - waitfor_cnode[cid].push_back(&c); - dout(10) << "get_cnode " << cid << " already loading, waiting" << endl; - c.Wait(ebofs_lock); - continue; - } - - dout(10) << "get_cnode reading " << cid << " from " << cnode_loc << endl; - - assert(waitfor_cnode.count(cid) == 0); - waitfor_cnode[cid].clear(); // this should be empty initially. - - // read it! 
- bufferlist bl; - //bufferpool.alloc( EBOFS_BLOCK_SIZE*cnode_loc.length, bl ); - bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*cnode_loc.length) ); - - ebofs_lock.Unlock(); - dev.read( cnode_loc.start, cnode_loc.length, bl ); - ebofs_lock.Lock(); - - // parse data block - Cnode *cn = new Cnode(cid); - - cnode_map[cid] = cn; - cnode_lru.lru_insert_top(cn); - - struct ebofs_cnode *ec = (struct ebofs_cnode*)bl.c_str(); - cn->cnode_loc = ec->cnode_loc; - - // parse attributes - char *p = bl.c_str() + sizeof(*ec); - for (int i=0; inum_attr; i++) { - string key = p; - p += key.length() + 1; - int len = *(int*)(p); - p += sizeof(len); - cn->attr[key] = buffer::copy(p, len); - p += len; - dout(15) << "get_cnode " << *cn << " attr " << key << " len " << len << endl; - } - - // wake up other waiters - for (list::iterator i = waitfor_cnode[cid].begin(); - i != waitfor_cnode[cid].end(); - i++) - (*i)->Signal(); - waitfor_cnode.erase(cid); // remove Cond list - - cn->get(); - return cn; - } -} - -void Ebofs::encode_cnode(Cnode *cn, bufferlist& bl, unsigned& off) -{ - // cnode - struct ebofs_cnode ec; - ec.cnode_loc = cn->cnode_loc; - ec.coll_id = cn->coll_id; - ec.num_attr = cn->attr.size(); - bl.copy_in(off, sizeof(ec), (char*)&ec); - off += sizeof(ec); - - // attr - for (map::iterator i = cn->attr.begin(); - i != cn->attr.end(); - i++) { - bl.copy_in(off, i->first.length()+1, i->first.c_str()); - off += i->first.length()+1; - int len = i->second.length(); - bl.copy_in(off, sizeof(int), (char*)&len); - off += sizeof(int); - bl.copy_in(off, len, i->second.c_str()); - off += len; - - dout(15) << "write_cnode " << *cn << " attr " << i->first << " len " << len << endl; - } -} - -void Ebofs::write_cnode(Cnode *cn) -{ - // allocate buffer - unsigned bytes = sizeof(ebofs_cnode) + cn->get_attr_bytes(); - unsigned blocks = (bytes-1)/EBOFS_BLOCK_SIZE + 1; - - bufferlist bl; - //bufferpool.alloc( EBOFS_BLOCK_SIZE*blocks, bl ); - bl.push_back( 
buffer::create_page_aligned(EBOFS_BLOCK_SIZE*blocks) ); - - // (always) relocate cnode! - if (1) { - if (cn->cnode_loc.length) - allocator.release(cn->cnode_loc); - - allocator.allocate(cn->cnode_loc, blocks, Allocator::NEAR_LAST_FWD); - collection_tab->remove( cn->coll_id ); - collection_tab->insert( cn->coll_id, cn->cnode_loc ); - } - - dout(10) << "write_cnode " << *cn << " to " << cn->cnode_loc << endl; - - unsigned off = 0; - encode_cnode(cn, bl, off); - assert(off == bytes); - - // write - dev.write( cn->cnode_loc.start, cn->cnode_loc.length, bl, - new C_E_InodeFlush(this), "write_cnode" ); -} - -void Ebofs::remove_cnode(Cnode *cn) -{ - dout(10) << "remove_cnode " << *cn << endl; - - // remove from table - collection_tab->remove(cn->coll_id); - - // free cnode space - if (cn->cnode_loc.length) - allocator.release(cn->cnode_loc); - - // remove from dirty list? - if (cn->is_dirty()) - dirty_cnodes.erase(cn); - - // remove from map and lru - cnode_map.erase(cn->coll_id); - cnode_lru.lru_remove(cn); - - // count down refs - cn->mark_clean(); - cn->put(); - assert(cn->get_ref_count() == 0); - - // hose. 
- delete cn; - - dirty = true; -} - -void Ebofs::put_cnode(Cnode *cn) -{ - cn->put(); -} - -void Ebofs::dirty_cnode(Cnode *cn) -{ - if (!cn->is_dirty()) { - cn->mark_dirty(); - dirty_cnodes.insert(cn); - } - dirty = true; -} - - - - - -void Ebofs::flush_inode_finish() -{ - ebofs_lock.Lock(); - { - inodes_flushing--; - if (inodes_flushing < 1000) - dout(20) << "flush_inode_finish, " << inodes_flushing << " left" << endl; - if (inodes_flushing == 0) - inode_commit_cond.Signal(); - } - ebofs_lock.Unlock(); -} - -void Ebofs::commit_inodes_start() -{ - dout(10) << "commit_inodes_start" << endl; - - assert(inodes_flushing == 0); - - // onodes - for (set::iterator i = dirty_onodes.begin(); - i != dirty_onodes.end(); - i++) { - Onode *on = *i; - inodes_flushing++; - write_onode(on); - on->mark_clean(); - on->uncommitted.clear(); // commit allocated blocks - on->commit_waiters.clear(); // these guys are gonna get taken care of, bc we committed. - } - dirty_onodes.clear(); - - // cnodes - for (set::iterator i = dirty_cnodes.begin(); - i != dirty_cnodes.end(); - i++) { - Cnode *cn = *i; - inodes_flushing++; - write_cnode(cn); - cn->mark_clean(); - } - dirty_cnodes.clear(); - - dout(10) << "commit_inodes_start writing " << inodes_flushing << " onodes+cnodes" << endl; -} - -void Ebofs::commit_inodes_wait() -{ - // caller must hold ebofs_lock - while (inodes_flushing > 0) { - dout(10) << "commit_inodes_wait waiting for " << inodes_flushing << " onodes+cnodes to flush" << endl; - inode_commit_cond.Wait(ebofs_lock); - } - dout(10) << "commit_inodes_wait all flushed" << endl; -} - - - - - - - -// *** buffer cache *** - -void Ebofs::trim_buffer_cache() -{ - ebofs_lock.Lock(); - trim_bc(0); - ebofs_lock.Unlock(); -} - -void Ebofs::trim_bc(off_t max) -{ - if (max < 0) - max = g_conf.ebofs_bc_size; - dout(10) << "trim_bc start: size " << bc.get_size() << ", trimmable " << bc.get_trimmable() << ", max " << max << endl; - - while (bc.get_size() > max && - bc.get_trimmable()) { - 
BufferHead *bh = (BufferHead*) bc.lru_rest.lru_expire(); - if (!bh) break; - - dout(25) << "trim_bc trimming " << *bh << endl; - assert(bh->is_clean()); - - ObjectCache *oc = bh->oc; - bc.remove_bh(bh); - delete bh; - - if (oc->is_empty()) { - Onode *on = oc->on; - dout(10) << "trim_bc closing oc on " << *on << endl; - on->close_oc(); - } - } - - dout(10) << "trim_bc finish: size " << bc.get_size() << ", trimmable " << bc.get_trimmable() << ", max " << max << endl; -} - - -void Ebofs::kick_idle() -{ - dout(10) << "kick_idle" << endl; - commit_cond.Signal(); - - /* - ebofs_lock.Lock(); - if (mounted && !unmounting && dirty) { - dout(0) << "kick_idle dirty, doing commit" << endl; - commit_cond.Signal(); - } else { - dout(0) << "kick_idle !dirty or !mounted or unmounting, doing nothing" << endl; - } - ebofs_lock.Unlock(); - */ -} - -void Ebofs::sync(Context *onsafe) -{ - ebofs_lock.Lock(); - if (onsafe) { - dirty = true; - commit_waiters[super_epoch].push_back(onsafe); - } - ebofs_lock.Unlock(); -} - -void Ebofs::sync() -{ - ebofs_lock.Lock(); - if (!dirty) { - dout(7) << "sync in " << super_epoch << ", not dirty" << endl; - } else { - epoch_t start = super_epoch; - dout(7) << "sync start in " << start << endl; - while (super_epoch == start) { - dout(7) << "sync kicking commit in " << super_epoch << endl; - dirty = true; - commit_cond.Signal(); - sync_cond.Wait(ebofs_lock); - } - dout(10) << "sync finish in " << super_epoch << endl; - } - ebofs_lock.Unlock(); -} - - - -void Ebofs::commit_bc_wait(version_t epoch) -{ - dout(10) << "commit_bc_wait on epoch " << epoch << endl; - - while (bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE,epoch) > 0 || - bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL,epoch) > 0) { - //dout(10) << "commit_bc_wait " << bc.get_unflushed(epoch) << " unflushed in epoch " << epoch << endl; - dout(10) << "commit_bc_wait epoch " << epoch - << ", unflushed bhwrite " << bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE) - << ", unflushed partial " << 
bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL) - << endl; - bc.waitfor_flush(); - } - - bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE).erase(epoch); - bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL).erase(epoch); - - dout(10) << "commit_bc_wait all flushed for epoch " << epoch - << "; " << bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE) - << " " << bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL) - << endl; -} - - - -int Ebofs::statfs(struct statfs *buf) -{ - dout(7) << "statfs" << endl; - - buf->f_type = EBOFS_MAGIC; /* type of filesystem */ - buf->f_bsize = 4096; /* optimal transfer block size */ - buf->f_blocks = dev.get_num_blocks(); /* total data blocks in file system */ - buf->f_bfree = get_free_blocks() - + get_limbo_blocks(); /* free blocks in fs */ - buf->f_bavail = get_free_blocks(); /* free blocks avail to non-superuser -- actually, for writing. */ - buf->f_files = nodepool.num_total(); /* total file nodes in file system */ - buf->f_ffree = nodepool.num_free(); /* free file nodes in fs */ - //buf->f_fsid = 0; /* file system id */ -#ifndef DARWIN - buf->f_namelen = 8; /* maximum length of filenames */ -#endif // DARWIN - - return 0; -} - - - - -/* - * allocate a write to blocks on disk. - * - take care to not overwrite any "safe" data blocks. - * - allocate/map new extents on disk as necessary - */ -void Ebofs::alloc_write(Onode *on, - block_t start, block_t len, - interval_set& alloc, - block_t& old_bfirst, block_t& old_blast) -{ - // first decide what pages to (re)allocate - alloc.insert(start, len); // start with whole range - - // figure out what bits are already uncommitted - interval_set already_uncom; - already_uncom.intersection_of(alloc, on->uncommitted); - - // subtract those off, so we're left with the committed bits (that must be reallocated). 
- alloc.subtract(already_uncom); - - dout(10) << "alloc_write must (re)alloc " << alloc << " on " << *on << endl; - - // release it (into limbo) - for (map::iterator i = alloc.m.begin(); - i != alloc.m.end(); - i++) { - // get old region - vector old; - on->map_extents(i->first, i->second, old); - for (unsigned o=0; ofirst == start) { - old_bfirst = old[0].start; - dout(20) << "alloc_write old_bfirst " << old_bfirst << " of " << old[0] << endl; - } - if (i->first+i->second == start+len) { - old_blast = old[old.size()-1].last(); - dout(20) << "alloc_write old_blast " << old_blast << " of " << old[old.size()-1] << endl; - } - } - } - - // reallocate uncommitted too? - // ( --> yes. we can always make better allocation decisions later, with more information. ) - if (g_conf.ebofs_realloc) { - list tx; - - ObjectCache *oc = on->get_oc(&bc); - oc->find_tx(start, len, tx); - - for (list::reverse_iterator p = tx.rbegin(); - p != tx.rend(); - p++) { - BufferHead *bh = *p; - - // cancelable/moveable? - if (alloc.contains(bh->start(), bh->length())) { - dout(10) << "alloc_write " << *bh << " already in " << alloc << endl; - continue; - } - - vector old; - on->map_extents(bh->start(), bh->length(), old); - assert(old.size() == 1); - - if (bh->start() >= start && bh->end() <= start+len) { - assert(bh->epoch_modified == super_epoch); - if (bc.bh_cancel_write(bh, super_epoch)) { - if (bh->length() == 1) - dout(10) << "alloc_write unallocated tx " << old[0] << ", canceled " << *bh << endl; - // no, this isn't compatible with clone() and extent reference counting. 
- //allocator.unallocate(old[0]); // release (into free) - allocator.release(old[0]); - alloc.insert(bh->start(), bh->length()); - } else { - if (bh->length() == 1) - dout(10) << "alloc_write released tx " << old[0] << ", couldn't cancel " << *bh << endl; - allocator.release(old[0]); // release (into limbo) - alloc.insert(bh->start(), bh->length()); - } - } else { - if (bh->length() == 1) - dout(10) << "alloc_write skipped tx " << old[0] << ", not entirely within " - << start << "~" << len - << " bh " << *bh << endl; - } - } - - dout(10) << "alloc_write will (re)alloc " << alloc << " on " << *on << endl; - } - - if (alloc.empty()) return; // no need to dirty the onode below! - - - // merge alloc into onode uncommitted map - //dout(10) << " union of " << on->uncommitted << " and " << alloc << endl; - interval_set old = on->uncommitted; - on->uncommitted.union_of(alloc); - - dout(10) << "alloc_write onode.uncommitted is now " << on->uncommitted << endl; - - if (0) { - // verify - interval_set ta; - ta.intersection_of(on->uncommitted, alloc); - cout << " ta " << ta << endl; - assert(alloc == ta); - - interval_set tb; - tb.intersection_of(on->uncommitted, old); - cout << " tb " << tb << endl; - assert(old == tb); - } - - dirty_onode(on); - - // allocate the space - for (map::iterator i = alloc.m.begin(); - i != alloc.m.end(); - i++) { - dout(15) << "alloc_write alloc " << i->first << "~" << i->second << " (of " << start << "~" << len << ")" << endl; - - // allocate new space - block_t left = i->second; - block_t cur = i->first; - while (left > 0) { - Extent ex; - allocator.allocate(ex, left, Allocator::NEAR_LAST_FWD); - dout(10) << "alloc_write got " << ex << " for object offset " << cur << endl; - on->set_extent(cur, ex); // map object to new region - left -= ex.length; - cur += ex.length; - } - } -} - - - - -void Ebofs::apply_write(Onode *on, off_t off, size_t len, bufferlist& bl) -{ - ObjectCache *oc = on->get_oc(&bc); - - // map into blocks - off_t opos = off; // 
byte pos in object - size_t zleft = 0; // zeros left to write - size_t left = len; // bytes left - - block_t bstart = off / EBOFS_BLOCK_SIZE; - - if (off > on->object_size) { - zleft = off - on->object_size; - opos = on->object_size; - bstart = on->object_size / EBOFS_BLOCK_SIZE; - } - if (off+(off_t)len > on->object_size) { - dout(10) << "apply_write extending size on " << *on << ": " << on->object_size - << " -> " << off+len << endl; - on->object_size = off+len; - dirty_onode(on); - } - if (bl.length() == 0) { - zleft += len; - left = 0; - } - if (zleft) - dout(10) << "apply_write zeroing first " << zleft << " bytes of " << *on << endl; - - block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; - block_t blen = blast-bstart+1; - - // allocate write on disk. - interval_set alloc; - block_t old_bfirst = 0; // zero means not defined here (since we ultimately pass to bh_read) - block_t old_blast = 0; - alloc_write(on, bstart, blen, alloc, old_bfirst, old_blast); - dout(20) << "apply_write old_bfirst " << old_bfirst << ", old_blast " << old_blast << endl; - - if (fake_writes) { - on->uncommitted.clear(); // worst case! - return; - } - - // map b range onto buffer_heads - map hits; - oc->map_write(bstart, blen, alloc, hits, super_epoch); - - // get current versions - //version_t lowv, highv; - //oc->scan_versions(bstart, blen, lowv, highv); - //highv++; - version_t highv = ++oc->write_count; - - // copy from bl into buffer cache - unsigned blpos = 0; // byte pos in input buffer - - // write data into buffers - for (map::iterator i = hits.begin(); - i != hits.end(); - i++) { - BufferHead *bh = i->second; - bh->set_version(highv); - bh->epoch_modified = super_epoch; - - // old write in progress? 
- if (bh->is_tx()) { // copy the buffer to avoid munging up in-flight write - dout(10) << "apply_write tx pending, copying buffer on " << *bh << endl; - bufferlist temp; - temp.claim(bh->data); - //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); - bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - bh->data.copy_in(0, bh->length()*EBOFS_BLOCK_SIZE, temp); - } - - // need to split off partial? (partials can only be ONE block) - if ((bh->is_missing() || bh->is_rx()) && bh->length() > 1) { - if ((bh->start() == bstart && opos % EBOFS_BLOCK_SIZE != 0)) { - BufferHead *right = bc.split(bh, bh->start()+1); - hits[right->start()] = right; - dout(10) << "apply_write split off left block for partial write; rest is " << *right << endl; - } - if ((bh->last() == blast && (len+off) % EBOFS_BLOCK_SIZE != 0) && - ((off_t)len+off < on->object_size)) { - BufferHead *right = bc.split(bh, bh->last()); - hits[right->start()] = right; - dout(10) << "apply_write split off right block for upcoming partial write; rest is " << *right << endl; - } - } - - // partial at head or tail? - if ((bh->start() == bstart && opos % EBOFS_BLOCK_SIZE != 0) || // opos, not off, in case we're zeroing... 
- (bh->last() == blast && ((off_t)len+off) % EBOFS_BLOCK_SIZE != 0 && ((off_t)len+off) < on->object_size)) { - // locate ourselves in bh - unsigned off_in_bh = opos - bh->start()*EBOFS_BLOCK_SIZE; - assert(off_in_bh >= 0); - unsigned len_in_bh = MIN( (off_t)(zleft+left), - (off_t)(bh->end()*EBOFS_BLOCK_SIZE)-opos ); - - if (bh->is_partial() || bh->is_rx() || bh->is_missing()) { - assert(bh->is_partial() || bh->is_rx() || bh->is_missing()); - assert(bh->length() == 1); - - // add frag to partial - dout(10) << "apply_write writing into partial " << *bh << ":" - << " off_in_bh " << off_in_bh - << " len_in_bh " << len_in_bh - << endl; - unsigned z = MIN( zleft, len_in_bh ); - if (z) { - bufferptr zp(z); - zp.zero(); - bufferlist zb; - zb.push_back(zp); - bh->add_partial(off_in_bh, zb); - zleft -= z; - opos += z; - } - - bufferlist sb; - sb.substr_of(bl, blpos, len_in_bh-z); // substr in existing buffer - bufferlist cp; - cp.append(sb.c_str(), len_in_bh-z); // copy the partial bit! - bh->add_partial(off_in_bh, cp); - left -= len_in_bh-z; - blpos += len_in_bh-z; - opos += len_in_bh-z; - - if (bh->partial_is_complete(on->object_size - bh->start()*EBOFS_BLOCK_SIZE)) { - dout(10) << "apply_write completed partial " << *bh << endl; - //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); // new buffers! - bh->data.clear(); - bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - bh->data.zero(); - bh->apply_partial(); - bc.mark_dirty(bh); - bc.bh_write(on, bh); - } - else if (bh->is_rx()) { - dout(10) << "apply_write rx -> partial " << *bh << endl; - assert(bh->length() == 1); - bc.mark_partial(bh); - bc.bh_queue_partial_write(on, bh); // queue the eventual write - } - else if (bh->is_missing()) { - dout(10) << "apply_write missing -> partial " << *bh << endl; - assert(bh->length() == 1); - bc.mark_partial(bh); - - // take care to read from _old_ disk block locations! 
- if (bh->start() == bstart) - bc.bh_read(on, bh, old_bfirst); - else if (bh->start() == blast) - bc.bh_read(on, bh, old_blast); - else assert(0); - - bc.bh_queue_partial_write(on, bh); // queue the eventual write - } - else if (bh->is_partial()) { - dout(10) << "apply_write already partial, no need to submit rx on " << *bh << endl; - if (bh->partial_tx_epoch == super_epoch) - bc.bh_cancel_partial_write(bh); - bc.bh_queue_partial_write(on, bh); // queue the eventual write - } - - - } else { - assert(bh->is_clean() || bh->is_dirty() || bh->is_tx()); - - // just write into the bh! - dout(10) << "apply_write writing leading/tailing partial into " << *bh << ":" - << " off_in_bh " << off_in_bh - << " len_in_bh " << len_in_bh - << endl; - - // copy data into new buffers first (copy on write!) - // FIXME: only do the modified pages? this might be a big bh! - bufferlist temp; - temp.claim(bh->data); - //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); - bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - bh->data.copy_in(0, bh->length()*EBOFS_BLOCK_SIZE, temp); - - unsigned z = MIN( zleft, len_in_bh ); - if (z) { - bufferptr zp(z); - zp.zero(); - bufferlist zb; - zb.push_back(zp); - bh->data.copy_in(off_in_bh, z, zb); - zleft -= z; - opos += z; - } - - bufferlist sub; - sub.substr_of(bl, blpos, len_in_bh-z); - bh->data.copy_in(off_in_bh+z, len_in_bh-z, sub); - blpos += len_in_bh-z; - left -= len_in_bh-z; - opos += len_in_bh-z; - - if (!bh->is_dirty()) - bc.mark_dirty(bh); - - bc.bh_write(on, bh); - } - continue; - } - - // ok, we're talking full block(s) now (modulo last block of the object) - assert(opos % EBOFS_BLOCK_SIZE == 0); - assert((off_t)(zleft+left) >= (off_t)(EBOFS_BLOCK_SIZE*bh->length()) || - opos+(off_t)(zleft+left) == on->object_size); - - // alloc new buffers. 
- //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); - bh->data.clear(); - bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); - - // copy! - unsigned len_in_bh = MIN(bh->length()*EBOFS_BLOCK_SIZE, zleft+left); - assert(len_in_bh <= zleft+left); - - dout(10) << "apply_write writing into " << *bh << ":" - << " len_in_bh " << len_in_bh - << endl; - - unsigned z = MIN(len_in_bh, zleft); - if (z) { - bufferptr zp(z); - zp.zero(); - bufferlist zb; - zb.push_back(zp); - bh->data.copy_in(0, z, zb); - zleft -= z; - } - - bufferlist sub; - sub.substr_of(bl, blpos, len_in_bh-z); - bh->data.copy_in(z, len_in_bh-z, sub); - - blpos += len_in_bh-z; - left -= len_in_bh-z; - opos += len_in_bh; - - // old partial? - if (bh->is_partial() && - bh->partial_tx_epoch == super_epoch) - bc.bh_cancel_partial_write(bh); - - // mark dirty - if (!bh->is_dirty()) - bc.mark_dirty(bh); - - bc.bh_write(on, bh); - } - - assert(zleft == 0); - assert(left == 0); - assert(opos == off+(off_t)len); - //assert(blpos == bl.length()); -} - - - - -// *** file i/o *** - -bool Ebofs::attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, - Cond *will_wait_on, bool *will_wait_on_bool) -{ - dout(10) << "attempt_read " << *on << " " << off << "~" << len << endl; - ObjectCache *oc = on->get_oc(&bc); - - // map - block_t bstart = off / EBOFS_BLOCK_SIZE; - block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; - block_t blen = blast-bstart+1; - - map hits; - map missing; // read these - map rx; // wait for these - map partials; // ?? - oc->map_read(bstart, blen, hits, missing, rx, partials); - - // missing buffers? 
- if (!missing.empty()) { - for (map::iterator i = missing.begin(); - i != missing.end(); - i++) { - dout(10) << "attempt_read missing buffer " << *(i->second) << endl; - bc.bh_read(on, i->second); - } - BufferHead *wait_on = missing.begin()->second; - block_t b = MAX(wait_on->start(), bstart); - wait_on->waitfor_read[b].push_back(new C_Cond(will_wait_on, will_wait_on_bool)); - return false; - } - - // are partials sufficient? - bool partials_ok = true; - for (map::iterator i = partials.begin(); - i != partials.end(); - i++) { - BufferHead *bh = i->second; - off_t bhstart = (off_t)(bh->start()*EBOFS_BLOCK_SIZE); - off_t bhend = (off_t)(bh->end()*EBOFS_BLOCK_SIZE); - off_t start = MAX( off, bhstart ); - off_t end = MIN( off+(off_t)len, bhend ); - - if (!i->second->have_partial_range(start-bhstart, end-bhend)) { - if (partials_ok) { - // wait on this one - Context *c = new C_Cond(will_wait_on, will_wait_on_bool); - dout(10) << "attempt_read insufficient partial buffer " << *(i->second) << " c " << c << endl; - i->second->waitfor_read[i->second->start()].push_back(c); - } - partials_ok = false; - } - } - if (!partials_ok) return false; - - // wait on rx? - if (!rx.empty()) { - BufferHead *wait_on = rx.begin()->second; - Context *c = new C_Cond(will_wait_on, will_wait_on_bool); - dout(1) << "attempt_read waiting for read to finish on " << *wait_on << " c " << c << endl; - block_t b = MAX(wait_on->start(), bstart); - wait_on->waitfor_read[b].push_back(c); - return false; - } - - // yay, we have it all! - // concurrently walk thru hits, partials. 
- map::iterator h = hits.begin(); - map::iterator p = partials.begin(); - - bl.clear(); - off_t pos = off; - block_t curblock = bstart; - while (curblock <= blast) { - BufferHead *bh = 0; - if (h->first == curblock) { - bh = h->second; - h++; - } else if (p->first == curblock) { - bh = p->second; - p++; - } else assert(0); - - off_t bhstart = (off_t)(bh->start()*EBOFS_BLOCK_SIZE); - off_t bhend = (off_t)(bh->end()*EBOFS_BLOCK_SIZE); - off_t start = MAX( pos, bhstart ); - off_t end = MIN( off+(off_t)len, bhend ); - - if (bh->is_partial()) { - // copy from a partial block. yuck! - bufferlist frag; - bh->copy_partial_substr( start-bhstart, end-bhstart, frag ); - bl.claim_append( frag ); - pos += frag.length(); - } else { - // copy from a full block. - if (bhstart == start && bhend == end) { - bl.append( bh->data ); - pos += bh->data.length(); - } else { - bufferlist frag; - dout(10) << "substr " << (start-bhstart) << "~" << (end-start) << " of " << bh->data.length() << " in " << *bh << endl; - frag.substr_of(bh->data, start-bhstart, end-start); - pos += frag.length(); - bl.claim_append( frag ); - } - } - - curblock = bh->end(); - /* this assert is more trouble than it's worth - assert((off_t)(curblock*EBOFS_BLOCK_SIZE) == pos || // should be aligned with next block - end != bhend || // or we ended midway through bh - (bh->last() == blast && end == bhend)); // ended last block ** FIXME WRONG??? - */ - } - - assert(bl.length() == len); - return true; -} - - -/* - * is_cached -- query whether a object extent is in our cache - * return value of -1 if onode isn't loaded. otherwise, the number - * of extents that need to be read (i.e. # of seeks) - */ -int Ebofs::is_cached(object_t oid, off_t off, size_t len) -{ - ebofs_lock.Lock(); - int r = _is_cached(oid, off, len); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_is_cached(object_t oid, off_t off, size_t len) -{ - if (!have_onode(oid)) { - dout(7) << "_is_cached " << oid << " " << off << "~" << len << " ... 
onode " << endl; - return -1; // object dne? - } - Onode *on = get_onode(oid); - - if (!on->have_oc()) { - // nothing is cached. return # of extents in file. - dout(10) << "_is_cached have onode but no object cache, returning extent count" << endl; - return on->extent_map.size(); - } - - // map - block_t bstart = off / EBOFS_BLOCK_SIZE; - block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; - block_t blen = blast-bstart+1; - - map hits; - map missing; // read these - map rx; // wait for these - map partials; // ?? - - int num_missing = on->get_oc(&bc)->try_map_read(bstart, blen); - dout(7) << "_is_cached try_map_read reports " << num_missing << " missing extents" << endl; - return num_missing; - - // FIXME: actually, we should calculate if these extents are contiguous. - // and not using map_read, probably... - /* hrmpf - block_t dpos = 0; - block_t opos = bstart; - while (opos < blen) { - if (hits.begin()->first == opos) { - } else { - block_t d; - if (missing.begin()->first == opos) d = missing.begin()->second. - - } - */ -} - -void Ebofs::trim_from_cache(object_t oid, off_t off, size_t len) -{ - ebofs_lock.Lock(); - _trim_from_cache(oid, off, len); - ebofs_lock.Unlock(); -} - -void Ebofs::_trim_from_cache(object_t oid, off_t off, size_t len) -{ - // be careful not to load it if we don't have it - if (!have_onode(oid)) { - dout(7) << "_trim_from_cache " << oid << " " << off << "~" << len << " ... onode not in cache " << endl; - return; - } - - // ok, we have it, get a pointer. - Onode *on = get_onode(oid); - - if (!on->have_oc()) - return; // nothing is cached. 
- - // map to blocks - block_t bstart = off / EBOFS_BLOCK_SIZE; - block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; - - ObjectCache *oc = on->get_oc(&bc); - oc->touch_bottom(bstart, blast); - - return; -} - - -int Ebofs::read(object_t oid, - off_t off, size_t len, - bufferlist& bl) -{ - ebofs_lock.Lock(); - int r = _read(oid, off, len, bl); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_read(object_t oid, off_t off, size_t len, bufferlist& bl) -{ - dout(7) << "_read " << oid << " " << off << "~" << len << endl; - - Onode *on = get_onode(oid); - if (!on) { - dout(7) << "_read " << oid << " " << off << "~" << len << " ... dne " << endl; - return -ENOENT; // object dne? - } - - // read data into bl. block as necessary. - Cond cond; - - int r = 0; - while (1) { - // check size bound - if (off >= on->object_size) { - dout(7) << "_read " << oid << " " << off << "~" << len << " ... off past eof " << on->object_size << endl; - r = -ESPIPE; // FIXME better errno? - break; - } - - size_t try_len = len ? len:on->object_size; - size_t will_read = MIN(off+(off_t)try_len, on->object_size) - off; - - bool done; - if (attempt_read(on, off, will_read, bl, &cond, &done)) - break; // yay - - // wait - while (!done) - cond.Wait(ebofs_lock); - - if (on->deleted) { - dout(7) << "_read " << oid << " " << off << "~" << len << " ... object deleted" << endl; - r = -ENOENT; - break; - } - } - - put_onode(on); - - trim_bc(); - - if (r < 0) return r; // return error, - dout(7) << "_read " << oid << " " << off << "~" << len << " ... got " << bl.length() << endl; - return bl.length(); // or bytes read. 
-} - - -bool Ebofs::_write_will_block() -{ - return (bc.get_stat_dirty()+bc.get_stat_tx() > g_conf.ebofs_bc_max_dirty); -} - -bool Ebofs::write_will_block() -{ - ebofs_lock.Lock(); - bool b = _write_will_block(); - ebofs_lock.Unlock(); - return b; -} - - -unsigned Ebofs::apply_transaction(Transaction& t, Context *onsafe) -{ - ebofs_lock.Lock(); - dout(7) << "apply_transaction start (" << t.ops.size() << " ops)" << endl; - - // do ops - unsigned r = 0; // bit fields indicate which ops failed. - int bit = 1; - for (list::iterator p = t.ops.begin(); - p != t.ops.end(); - p++) { - switch (*p) { - case Transaction::OP_READ: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t offset = t.offsets.front(); t.offsets.pop_front(); - size_t len = t.lengths.front(); t.lengths.pop_front(); - bufferlist *pbl = t.pbls.front(); t.pbls.pop_front(); - if (_read(oid, offset, len, *pbl) < 0) { - dout(7) << "apply_transaction fail on _read" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_STAT: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - struct stat *st = t.psts.front(); t.psts.pop_front(); - if (_stat(oid, st) < 0) { - dout(7) << "apply_transaction fail on _stat" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_GETATTR: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); - pair pattrval = t.pattrvals.front(); t.pattrvals.pop_front(); - if ((*(pattrval.second) = _getattr(oid, attrname, pattrval.first, *(pattrval.second))) < 0) { - dout(7) << "apply_transaction fail on _getattr" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_GETATTRS: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - map *pset = t.pattrsets.front(); t.pattrsets.pop_front(); - if (_getattrs(oid, *pset) < 0) { - dout(7) << "apply_transaction fail on _getattrs" << endl; - r &= bit; - } - } - break; - - - case Transaction::OP_WRITE: - { - object_t oid = t.oids.front(); 
t.oids.pop_front(); - off_t offset = t.offsets.front(); t.offsets.pop_front(); - size_t len = t.lengths.front(); t.lengths.pop_front(); - bufferlist bl = t.bls.front(); t.bls.pop_front(); - if (_write(oid, offset, len, bl) < 0) { - dout(7) << "apply_transaction fail on _write" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_TRIMCACHE: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t offset = t.offsets.front(); t.offsets.pop_front(); - size_t len = t.lengths.front(); t.lengths.pop_front(); - _trim_from_cache(oid, offset, len); - } - break; - - case Transaction::OP_TRUNCATE: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t len = t.offsets.front(); t.offsets.pop_front(); - if (_truncate(oid, len) < 0) { - dout(7) << "apply_transaction fail on _truncate" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_REMOVE: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - if (_remove(oid) < 0) { - dout(7) << "apply_transaction fail on _remove" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_SETATTR: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); - //pair attrval = t.attrvals.front(); t.attrvals.pop_front(); - bufferlist bl; - bl.claim( t.attrbls.front() ); - t.attrbls.pop_front(); - if (_setattr(oid, attrname, bl.c_str(), bl.length()) < 0) { - dout(7) << "apply_transaction fail on _setattr" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_SETATTRS: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - map *pattrset = t.pattrsets.front(); t.pattrsets.pop_front(); - if (_setattrs(oid, *pattrset) < 0) { - dout(7) << "apply_transaction fail on _setattrs" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_RMATTR: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); - if (_rmattr(oid, attrname) < 0) { - dout(7) << 
"apply_transaction fail on _rmattr" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_CLONE: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - object_t noid = t.oids.front(); t.oids.pop_front(); - if (_clone(oid, noid) < 0) { - dout(7) << "apply_transaction fail on _clone" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_MKCOLL: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - if (_create_collection(cid) < 0) { - dout(7) << "apply_transaction fail on _create_collection" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_RMCOLL: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - if (_destroy_collection(cid) < 0) { - dout(7) << "apply_transaction fail on _destroy_collection" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_COLL_ADD: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - object_t oid = t.oids.front(); t.oids.pop_front(); - if (_collection_add(cid, oid) < 0) { - //dout(7) << "apply_transaction fail on _collection_add" << endl; - //r &= bit; - } - } - break; - - case Transaction::OP_COLL_REMOVE: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - object_t oid = t.oids.front(); t.oids.pop_front(); - if (_collection_remove(cid, oid) < 0) { - dout(7) << "apply_transaction fail on _collection_remove" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_COLL_SETATTR: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); - //pair attrval = t.attrvals.front(); t.attrvals.pop_front(); - bufferlist bl; - bl.claim( t.attrbls.front() ); - t.attrbls.pop_front(); - if (_collection_setattr(cid, attrname, bl.c_str(), bl.length()) < 0) { - //if (_collection_setattr(cid, attrname, attrval.first, attrval.second) < 0) { - dout(7) << "apply_transaction fail on _collection_setattr" << endl; - r &= bit; - } - } - break; - - case Transaction::OP_COLL_RMATTR: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - 
const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); - if (_collection_rmattr(cid, attrname) < 0) { - dout(7) << "apply_transaction fail on _collection_rmattr" << endl; - r &= bit; - } - } - break; - - default: - cerr << "bad op " << *p << endl; - assert(0); - } - - bit = bit << 1; - } - - dout(7) << "apply_transaction finish (r = " << r << ")" << endl; - - // set up commit waiter - //if (r == 0) { - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - //} else { - //if (onsafe) delete onsafe; - //} - - ebofs_lock.Unlock(); - return r; -} - - - -int Ebofs::_write(object_t oid, off_t offset, size_t length, bufferlist& bl) -{ - dout(7) << "_write " << oid << " " << offset << "~" << length << endl; - assert(bl.length() == length); - - // too much unflushed dirty data? (if so, block!) - if (_write_will_block()) { - dout(10) << "_write blocking " - << oid << " " << offset << "~" << length - << " bc: " - << "size " << bc.get_size() - << ", trimmable " << bc.get_trimmable() - << ", max " << g_conf.ebofs_bc_size - << "; dirty " << bc.get_stat_dirty() - << ", tx " << bc.get_stat_tx() - << ", max dirty " << g_conf.ebofs_bc_max_dirty - << endl; - - while (_write_will_block()) - bc.waitfor_stat(); // waits on ebofs_lock - - dout(10) << "_write unblocked " - << oid << " " << offset << "~" << length - << " bc: " - << "size " << bc.get_size() - << ", trimmable " << bc.get_trimmable() - << ", max " << g_conf.ebofs_bc_size - << "; dirty " << bc.get_stat_dirty() - << ", tx " << bc.get_stat_tx() - << ", max dirty " << g_conf.ebofs_bc_max_dirty - << endl; - } - - // out of space? 
- unsigned max = (length+offset) / EBOFS_BLOCK_SIZE + 10; // very conservative; assumes we have to rewrite - max += dirty_onodes.size() + dirty_cnodes.size(); - if (max >= free_blocks) { - dout(1) << "write failing, only " << free_blocks << " blocks free, may need up to " << max << endl; - return -ENOSPC; - } - - // get|create inode - Onode *on = get_onode(oid); - if (!on) on = new_onode(oid); // new inode! - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - dirty_onode(on); // dirty onode! - - // apply write to buffer cache - if (length > 0) - apply_write(on, offset, length, bl); - - // done. - put_onode(on); - trim_bc(); - - return length; -} - - -/*int Ebofs::write(object_t oid, - off_t off, size_t len, - bufferlist& bl, bool fsync) -{ - // wait? - if (fsync) { - // wait for flush. - Cond cond; - bool done; - int flush = 1; // write never returns positive - Context *c = new C_Cond(&cond, &done, &flush); - int r = write(oid, off, len, bl, c); - if (r < 0) return r; - - ebofs_lock.Lock(); - { - while (!done) - cond.Wait(ebofs_lock); - assert(flush <= 0); - } - ebofs_lock.Unlock(); - if (flush < 0) return flush; - return r; - } else { - // don't wait for flush. - return write(oid, off, len, bl, (Context*)0); - } -} -*/ - -int Ebofs::write(object_t oid, - off_t off, size_t len, - bufferlist& bl, Context *onsafe) -{ - ebofs_lock.Lock(); - assert(len > 0); - - // go - int r = _write(oid, off, len, bl); - - // commit waiter - if (r > 0) { - assert((size_t)r == len); - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - - -int Ebofs::_remove(object_t oid) -{ - dout(7) << "_remove " << oid << endl; - - // get inode - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - // ok remove it! 
- remove_onode(on); - - return 0; -} - - -int Ebofs::remove(object_t oid, Context *onsafe) -{ - ebofs_lock.Lock(); - - // do it - int r = _remove(oid); - - // set up commit waiter - if (r >= 0) { - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_truncate(object_t oid, off_t size) -{ - dout(7) << "_truncate " << oid << " size " << size << endl; - - Onode *on = get_onode(oid); - if (!on) - return -ENOENT; - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - int r = 0; - if (size > on->object_size) { - r = -EINVAL; // whatever - } - else if (size < on->object_size) { - // change size - on->object_size = size; - dirty_onode(on); - - // free blocks - block_t nblocks = 0; - if (size) nblocks = 1 + (size-1) / EBOFS_BLOCK_SIZE; - if (on->object_blocks > nblocks) { - vector extra; - on->truncate_extents(nblocks, extra); - for (unsigned i=0; ioc) { - on->oc->truncate(on->object_blocks, super_epoch); - if (on->oc->is_empty()) - on->close_oc(); - } - - // update uncommitted - interval_set uncom; - if (nblocks > 0) { - interval_set left; - left.insert(0, nblocks); - uncom.intersection_of(left, on->uncommitted); - } - dout(10) << "uncommitted was " << on->uncommitted << " now " << uncom << endl; - on->uncommitted = uncom; - - } - else { - assert(size == on->object_size); - } - - put_onode(on); - return r; -} - - -int Ebofs::truncate(object_t oid, off_t size, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _truncate(oid, size); - - // set up commit waiter - if (r >= 0) { - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - - - -int Ebofs::clone(object_t from, object_t to, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _clone(from, to); - - // set up commit waiter - if (r >= 0) { - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - } else { - if 
(onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_clone(object_t from, object_t to) -{ - dout(7) << "_clone " << from << " -> " << to << endl; - - if (!g_conf.ebofs_cloneable) - return -1; // no! - - Onode *fon = get_onode(from); - if (!fon) return -ENOENT; - Onode *ton = get_onode(to); - if (ton) { - put_onode(fon); - put_onode(ton); - return -EEXIST; - } - ton = new_onode(to); - assert(ton); - - // copy easy bits - ton->readonly = true; - ton->object_size = fon->object_size; - ton->object_blocks = fon->object_blocks; - ton->attr = fon->attr; - - // collections - for (set::iterator p = fon->collections.begin(); - p != fon->collections.end(); - p++) - _collection_add(*p, to); - - // extents - ton->extent_map = fon->extent_map; - for (map::iterator p = ton->extent_map.begin(); - p != ton->extent_map.end(); - ++p) { - allocator.alloc_inc(p->second); - } - - // clear uncommitted - fon->uncommitted.clear(); - - // muck with ObjectCache - if (fon->oc) - fon->oc->clone_to( ton ); - - // ok! - put_onode(ton); - put_onode(fon); - return 0; -} - - - - -/* - * pick object revision with rev < specified rev. - * (oid.rev is a noninclusive upper bound.) - * - */ -int Ebofs::pick_object_revision_lt(object_t& oid) -{ - assert(oid.rev > 0); // this is only useful for non-zero oid.rev - - int r = -EEXIST; // return code - ebofs_lock.Lock(); - { - object_t orig = oid; - object_t live = oid; - live.rev = 0; - - if (object_tab->get_num_keys() > 0) { - Table::Cursor cursor(object_tab); - - object_tab->find(oid, cursor); // this will be just _past_ highest eligible rev - if (cursor.move_left() > 0) { - bool firstpass = true; - while (1) { - object_t t = cursor.current().key; - if (t.ino != oid.ino || - t.bno != oid.bno) // passed to previous object - break; - if (oid.rev < t.rev) { // rev < desired. possible match. - r = 0; - oid = t; - break; - } - if (firstpass && oid.rev >= t.rev) { // there is no old rev < desired. try live. 
- r = 0; - oid = live; - break; - } - if (cursor.move_left() <= 0) break; - firstpass = false; - } - } - } - - dout(8) << "find_object_revision " << orig << " -> " << oid - << " r=" << r << endl; - } - ebofs_lock.Unlock(); - return r; -} - - - - -bool Ebofs::exists(object_t oid) -{ - ebofs_lock.Lock(); - dout(8) << "exists " << oid << endl; - bool e = (object_tab->lookup(oid) == 0); - ebofs_lock.Unlock(); - return e; -} - -int Ebofs::stat(object_t oid, struct stat *st) -{ - ebofs_lock.Lock(); - int r = _stat(oid,st); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_stat(object_t oid, struct stat *st) -{ - dout(7) << "_stat " << oid << endl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - // ?? - st->st_size = on->object_size; - - put_onode(on); - return 0; -} - - -int Ebofs::_setattr(object_t oid, const char *name, const void *value, size_t size) -{ - dout(8) << "setattr " << oid << " '" << name << "' len " << size << endl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - string n(name); - on->attr[n] = buffer::copy((char*)value, size); - dirty_onode(on); - put_onode(on); - - dout(8) << "setattr " << oid << " '" << name << "' len " << size << " success" << endl; - - return 0; -} - -int Ebofs::setattr(object_t oid, const char *name, const void *value, size_t size, Context *onsafe) -{ - ebofs_lock.Lock(); - int r = _setattr(oid, name, value, size); - - // set up commit waiter - if (r >= 0) { - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_setattrs(object_t oid, map& attrset) -{ - dout(8) << "setattrs " << oid << endl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - on->attr = attrset; - dirty_onode(on); - put_onode(on); - return 0; -} - -int Ebofs::setattrs(object_t oid, map& attrset, 
Context *onsafe) -{ - ebofs_lock.Lock(); - int r = _setattrs(oid, attrset); - - // set up commit waiter - if (r >= 0) { - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::getattr(object_t oid, const char *name, void *value, size_t size) -{ - ebofs_lock.Lock(); - int r = _getattr(oid, name, value, size); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_getattr(object_t oid, const char *name, void *value, size_t size) -{ - dout(8) << "_getattr " << oid << " '" << name << "' maxlen " << size << endl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - string n(name); - int r = 0; - if (on->attr.count(n) == 0) { - dout(10) << "_getattr " << oid << " '" << name << "' dne" << endl; - r = -1; - } else { - r = MIN( on->attr[n].length(), size ); - dout(10) << "_getattr " << oid << " '" << name << "' got len " << r << endl; - memcpy(value, on->attr[n].c_str(), r ); - } - put_onode(on); - return r; -} - -int Ebofs::getattrs(object_t oid, map &aset) -{ - ebofs_lock.Lock(); - int r = _getattrs(oid, aset); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_getattrs(object_t oid, map &aset) -{ - dout(8) << "_getattrs " << oid << endl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - aset = on->attr; - put_onode(on); - return 0; -} - - - -int Ebofs::_rmattr(object_t oid, const char *name) -{ - dout(8) << "_rmattr " << oid << " '" << name << "'" << endl; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - if (on->readonly) { - put_onode(on); - return -EACCES; - } - - string n(name); - on->attr.erase(n); - dirty_onode(on); - put_onode(on); - return 0; -} - -int Ebofs::rmattr(object_t oid, const char *name, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _rmattr(oid, name); - - // set up commit waiter - if (r >= 0) { - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - } else { - if (onsafe) delete onsafe; - } - - 
ebofs_lock.Unlock(); - return r; -} - -int Ebofs::listattr(object_t oid, vector& attrs) -{ - ebofs_lock.Lock(); - dout(8) << "listattr " << oid << endl; - - Onode *on = get_onode(oid); - if (!on) { - ebofs_lock.Unlock(); - return -ENOENT; - } - - attrs.clear(); - for (map::iterator i = on->attr.begin(); - i != on->attr.end(); - i++) { - attrs.push_back(i->first); - } - - put_onode(on); - ebofs_lock.Unlock(); - return 0; -} - - - -/***************** collections ******************/ - -int Ebofs::list_collections(list& ls) -{ - ebofs_lock.Lock(); - dout(9) << "list_collections " << endl; - - Table::Cursor cursor(collection_tab); - - int num = 0; - if (collection_tab->find(0, cursor) >= 0) { - while (1) { - ls.push_back(cursor.current().key); - num++; - if (cursor.move_right() <= 0) break; - } - } - - ebofs_lock.Unlock(); - return num; -} - -int Ebofs::_create_collection(coll_t cid) -{ - dout(9) << "_create_collection " << hex << cid << dec << endl; - - if (_collection_exists(cid)) - return -EEXIST; - - Cnode *cn = new_cnode(cid); - put_cnode(cn); - - return 0; -} - -int Ebofs::create_collection(coll_t cid, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _create_collection(cid); - - // set up commit waiter - if (r >= 0) { - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_destroy_collection(coll_t cid) -{ - dout(9) << "_destroy_collection " << hex << cid << dec << endl; - - if (!_collection_exists(cid)) - return -ENOENT; - - Cnode *cn = get_cnode(cid); - assert(cn); - - // hose mappings - list objects; - collection_list(cid, objects); - for (list::iterator i = objects.begin(); - i != objects.end(); - i++) { - co_tab->remove(coll_object_t(cid,*i)); - - Onode *on = get_onode(*i); - if (on) { - on->collections.erase(cid); - dirty_onode(on); - put_onode(on); - } - } - - remove_cnode(cn); - return 0; -} - -int Ebofs::destroy_collection(coll_t cid, Context 
*onsafe) -{ - ebofs_lock.Lock(); - - int r = _destroy_collection(cid); - - // set up commit waiter - if (r >= 0) { - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return r; -} - -bool Ebofs::collection_exists(coll_t cid) -{ - ebofs_lock.Lock(); - dout(10) << "collection_exists " << hex << cid << dec << endl; - bool r = _collection_exists(cid); - ebofs_lock.Unlock(); - return r; -} -bool Ebofs::_collection_exists(coll_t cid) -{ - return (collection_tab->lookup(cid) == 0); -} - -int Ebofs::_collection_add(coll_t cid, object_t oid) -{ - dout(9) << "_collection_add " << hex << cid << " object " << oid << dec << endl; - - if (!_collection_exists(cid)) - return -ENOENT; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - int r = 0; - - if (on->collections.count(cid) == 0) { - on->collections.insert(cid); - dirty_onode(on); - co_tab->insert(coll_object_t(cid,oid), true); - } else { - r = -ENOENT; // FIXME? already in collection. - } - - put_onode(on); - return r; -} - -int Ebofs::collection_add(coll_t cid, object_t oid, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _collection_add(cid, oid); - - // set up commit waiter - if (r >= 0) { - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return 0; -} - -int Ebofs::_collection_remove(coll_t cid, object_t oid) -{ - dout(9) << "_collection_remove " << hex << cid << " object " << oid << dec << endl; - - if (!_collection_exists(cid)) - return -ENOENT; - - Onode *on = get_onode(oid); - if (!on) return -ENOENT; - - int r = 0; - - if (on->collections.count(cid)) { - on->collections.erase(cid); - dirty_onode(on); - co_tab->remove(coll_object_t(cid,oid)); - } else { - r = -ENOENT; // FIXME? 
- } - - put_onode(on); - return r; -} - -int Ebofs::collection_remove(coll_t cid, object_t oid, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _collection_remove(cid, oid); - - // set up commit waiter - if (r >= 0) { - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return 0; -} - -int Ebofs::collection_list(coll_t cid, list& ls) -{ - ebofs_lock.Lock(); - dout(9) << "collection_list " << hex << cid << dec << endl; - - if (!_collection_exists(cid)) { - ebofs_lock.Unlock(); - return -ENOENT; - } - - Table::Cursor cursor(co_tab); - - int num = 0; - if (co_tab->find(coll_object_t(cid,object_t()), cursor) >= 0) { - while (1) { - const coll_t c = cursor.current().key.first; - const object_t o = cursor.current().key.second; - if (c != cid) break; // end! - dout(10) << "collection_list " << hex << cid << " includes " << o << dec << endl; - ls.push_back(o); - num++; - if (cursor.move_right() < 0) break; - } - } - - ebofs_lock.Unlock(); - return num; -} - - -int Ebofs::_collection_setattr(coll_t cid, const char *name, const void *value, size_t size) -{ - dout(10) << "_collection_setattr " << hex << cid << dec << " '" << name << "' len " << size << endl; - - Cnode *cn = get_cnode(cid); - if (!cn) return -ENOENT; - - string n(name); - cn->attr[n] = buffer::copy((char*)value, size); - dirty_cnode(cn); - put_cnode(cn); - - return 0; -} - -int Ebofs::collection_setattr(coll_t cid, const char *name, const void *value, size_t size, Context *onsafe) -{ - ebofs_lock.Lock(); - dout(10) << "collection_setattr " << hex << cid << dec << " '" << name << "' len " << size << endl; - - int r = _collection_setattr(cid, name, value, size); - - // set up commit waiter - if (r >= 0) { - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return 0; -} - -int Ebofs::collection_getattr(coll_t cid, const char *name, void *value, size_t 
size) -{ - ebofs_lock.Lock(); - dout(10) << "collection_setattr " << hex << cid << dec << " '" << name << "' maxlen " << size << endl; - - Cnode *cn = get_cnode(cid); - if (!cn) { - ebofs_lock.Unlock(); - return -ENOENT; - } - - string n(name); - int r; - if (cn->attr.count(n) == 0) { - r = -1; - } else { - r = MIN( cn->attr[n].length(), size ); - memcpy(value, cn->attr[n].c_str(), r); - } - - put_cnode(cn); - ebofs_lock.Unlock(); - return r; -} - -int Ebofs::_collection_rmattr(coll_t cid, const char *name) -{ - dout(10) << "_collection_rmattr " << hex << cid << dec << " '" << name << "'" << endl; - - Cnode *cn = get_cnode(cid); - if (!cn) return -ENOENT; - - string n(name); - cn->attr.erase(n); - - dirty_cnode(cn); - put_cnode(cn); - - return 0; -} - -int Ebofs::collection_rmattr(coll_t cid, const char *name, Context *onsafe) -{ - ebofs_lock.Lock(); - - int r = _collection_rmattr(cid, name); - - // set up commit waiter - if (r >= 0) { - if (onsafe) commit_waiters[super_epoch].push_back(onsafe); - } else { - if (onsafe) delete onsafe; - } - - ebofs_lock.Unlock(); - return 0; -} - -int Ebofs::collection_listattr(coll_t cid, vector& attrs) -{ - ebofs_lock.Lock(); - dout(10) << "collection_listattr " << hex << cid << dec << endl; - - Cnode *cn = get_cnode(cid); - if (!cn) { - ebofs_lock.Unlock(); - return -ENOENT; - } - - attrs.clear(); - for (map::iterator i = cn->attr.begin(); - i != cn->attr.end(); - i++) { - attrs.push_back(i->first); - } - - put_cnode(cn); - ebofs_lock.Unlock(); - return 0; -} - - - -void Ebofs::_export_freelist(bufferlist& bl) -{ - for (int b=0; b<=EBOFS_NUM_FREE_BUCKETS; b++) { - Table *tab; - if (b < EBOFS_NUM_FREE_BUCKETS) { - tab = free_tab[b]; - } else { - tab = limbo_tab; - } - - if (tab->get_num_keys() > 0) { - Table::Cursor cursor(tab); - assert(tab->find(0, cursor) >= 0); - while (1) { - assert(cursor.current().value > 0); - - Extent ex(cursor.current().key, cursor.current().value); - dout(10) << "_export_freelist " << ex << endl; - 
bl.append((char*)&ex, sizeof(ex)); - if (cursor.move_right() <= 0) break; - } - } - } -} - -void Ebofs::_import_freelist(bufferlist& bl) -{ - // clear - for (int b=0; bclear(); - limbo_tab->clear(); - - // import! - int num = bl.length() / sizeof(Extent); - Extent *p = (Extent*)bl.c_str(); - for (int i=0; i *tab; - if (b < EBOFS_NUM_FREE_BUCKETS) { - tab = free_tab[b]; - dout(30) << "dump bucket " << b << " " << tab->get_num_keys() << endl; - } else { - tab = limbo_tab; - dout(30) << "dump limbo " << tab->get_num_keys() << endl;; - } - - if (tab->get_num_keys() > 0) { - Table::Cursor cursor(tab); - assert(tab->find(0, cursor) >= 0); - while (1) { - assert(cursor.current().value > 0); - - block_t l = cursor.current().value; - tfree += l; - int b = 0; - do { - l = l >> 1; - b++; - } while (l); - st.free_extent_dist[b]++; - st.free_extent_dist_sum[b] += cursor.current().value; - st.num_free_extent++; - - if (cursor.move_right() <= 0) break; - } - } - } - st.avg_free_extent = tfree / st.num_free_extent; -*/ - - // used extents is harder. 
:( - st.num_extent = 0; - st.avg_extent = 0; - st.extent_dist.clear(); - st.extent_dist_sum.clear(); - st.avg_extent_per_object = 0; - st.avg_extent_jump = 0; - - Table::Cursor cursor(object_tab); - object_tab->find(object_t(), cursor); - int nobj = 0; - int njump = 0; - while (object_tab->get_num_keys() > 0) { - Onode *on = get_onode(cursor.current().key); - assert(on); - - nobj++; - st.avg_extent_per_object += on->extent_map.size(); - - for (map::iterator p = on->extent_map.begin(); - p != on->extent_map.end(); - p++) { - block_t l = p->second.length; - - st.num_extent++; - st.avg_extent += l; - if (p->first > 0) { - njump++; - st.avg_extent_jump += l; - } - - int b = 0; - do { - l = l >> 1; - b++; - } while (l); - st.extent_dist[b]++; - st.extent_dist_sum[b] += p->second.length; - } - put_onode(on); - if (cursor.move_right() <= 0) break; - } - if (njump) st.avg_extent_jump /= njump; - if (nobj) st.avg_extent_per_object /= (float)nobj; - if (st.num_extent) st.avg_extent /= st.num_extent; - - ebofs_lock.Unlock(); -} diff --git a/tags/20070517_before_mds_merge/ebofs/Ebofs.h b/tags/20070517_before_mds_merge/ebofs/Ebofs.h deleted file mode 100644 index 6d18b7a0204fa..0000000000000 --- a/tags/20070517_before_mds_merge/ebofs/Ebofs.h +++ /dev/null @@ -1,330 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "include/Context.h" -#include "include/buffer.h" - -template -inline ostream& operator<<(ostream& out, const pair& p) { - return out << p.first << "," << p.second; -} - -#include "types.h" -#include "Onode.h" -#include "Cnode.h" -#include "BlockDevice.h" -#include "nodes.h" -#include "Allocator.h" -#include "Table.h" - -#include "common/Mutex.h" -#include "common/Cond.h" - -#include "osd/ObjectStore.h" - -//typedef pair object_coll_t; -typedef pair coll_object_t; - - -class Ebofs : public ObjectStore { - protected: - Mutex ebofs_lock; // a beautiful global lock - - // ** debuggy ** - bool fake_writes; - - // ** super ** - BlockDevice dev; - bool mounted, unmounting, dirty; - bool readonly; - version_t super_epoch; - bool commit_thread_started, mid_commit; - Cond commit_cond; // to wake up the commit thread - Cond sync_cond; - - map > commit_waiters; - - void prepare_super(version_t epoch, bufferptr& bp); - void write_super(version_t epoch, bufferptr& bp); - int commit_thread_entry(); - - class CommitThread : public Thread { - Ebofs *ebofs; - public: - CommitThread(Ebofs *e) : ebofs(e) {} - void *entry() { - ebofs->commit_thread_entry(); - return 0; - } - } commit_thread; - - - - - // ** allocator ** - block_t free_blocks, limbo_blocks; - Allocator allocator; - friend class Allocator; - - block_t get_free_blocks() { return free_blocks; } - block_t get_limbo_blocks() { return limbo_blocks; } - block_t get_free_extents() { - int n = 0; - for (int i=0; iget_num_keys(); - return n; - } - block_t get_limbo_extents() { return limbo_tab->get_num_keys(); } - - - // ** tables and sets ** - // nodes - NodePool nodepool; // for all tables... 
- - // tables - Table *object_tab; - Table *free_tab[EBOFS_NUM_FREE_BUCKETS]; - Table *limbo_tab; - Table > *alloc_tab; - - // collections - Table *collection_tab; - Table *co_tab; - - void close_tables(); - - - // ** onodes ** - hash_map onode_map; // onode cache - LRU onode_lru; - set dirty_onodes; - map > waitfor_onode; - - Onode* new_onode(object_t oid); // make new onode. ref++. - bool have_onode(object_t oid) { - return onode_map.count(oid); - } - Onode* get_onode(object_t oid); // get cached onode, or read from disk. ref++. - void remove_onode(Onode *on); - void put_onode(Onode* o); // put it back down. ref--. - void dirty_onode(Onode* o); - void encode_onode(Onode *on, bufferlist& bl, unsigned& off); - void write_onode(Onode *on); - - // ** cnodes ** - hash_map cnode_map; - LRU cnode_lru; - set dirty_cnodes; - map > waitfor_cnode; - - Cnode* new_cnode(coll_t cid); - Cnode* get_cnode(coll_t cid); - void remove_cnode(Cnode *cn); - void put_cnode(Cnode *cn); - void dirty_cnode(Cnode *cn); - void encode_cnode(Cnode *cn, bufferlist& bl, unsigned& off); - void write_cnode(Cnode *cn); - - // ** onodes+cnodes = inodes ** - int inodes_flushing; - Cond inode_commit_cond; - - void flush_inode_finish(); - void commit_inodes_start(); - void commit_inodes_wait(); - friend class C_E_InodeFlush; - - void trim_inodes(int max = -1); - - // ** buffer cache ** - BufferCache bc; - pthread_t flushd_thread_id; - - version_t trigger_commit(); - void commit_bc_wait(version_t epoch); - void trim_bc(off_t max = -1); - - public: - void kick_idle(); - void sync(); - void sync(Context *onsafe); - void trim_buffer_cache(); - - class IdleKicker : public BlockDevice::kicker { - Ebofs *ebo; - public: - IdleKicker(Ebofs *t) : ebo(t) {} - void kick() { ebo->kick_idle(); } - } idle_kicker; - - - protected: - //void zero(Onode *on, size_t len, off_t off, off_t write_thru); - void alloc_write(Onode *on, - block_t start, block_t len, - interval_set& alloc, - block_t& old_bfirst, block_t& 
old_blast); - void apply_write(Onode *on, off_t off, size_t len, bufferlist& bl); - bool attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, - Cond *will_wait_on, bool *will_wait_on_bool); - - // ** finisher ** - // async write notification to users - Mutex finisher_lock; - Cond finisher_cond; - bool finisher_stop; - list finisher_queue; - - void *finisher_thread_entry(); - class FinisherThread : public Thread { - Ebofs *ebofs; - public: - FinisherThread(Ebofs *e) : ebofs(e) {} - void* entry() { return (void*)ebofs->finisher_thread_entry(); } - } finisher_thread; - - - void alloc_more_node_space(); - - void do_csetattrs(map > > &cmods); - void do_setattrs(Onode *on, map > &setattrs); - - - public: - Ebofs(char *devfn) : - fake_writes(false), - dev(devfn), - mounted(false), unmounting(false), dirty(false), readonly(false), - super_epoch(0), commit_thread_started(false), mid_commit(false), - commit_thread(this), - free_blocks(0), limbo_blocks(0), - allocator(this), - nodepool(ebofs_lock), - object_tab(0), limbo_tab(0), collection_tab(0), co_tab(0), - onode_lru(g_conf.ebofs_oc_size), - cnode_lru(g_conf.ebofs_cc_size), - inodes_flushing(0), - bc(dev, ebofs_lock), - idle_kicker(this), - finisher_stop(false), finisher_thread(this) { - for (int i=0; i& attrset, Context *onsafe=0); - int getattr(object_t oid, const char *name, void *value, size_t size); - int getattrs(object_t oid, map &aset); - int rmattr(object_t oid, const char *name, Context *onsafe=0); - int listattr(object_t oid, vector& attrs); - - // collections - int list_collections(list& ls); - bool collection_exists(coll_t c); - - int create_collection(coll_t c, Context *onsafe); - int destroy_collection(coll_t c, Context *onsafe); - int collection_add(coll_t c, object_t o, Context *onsafe); - int collection_remove(coll_t c, object_t o, Context *onsafe); - - int collection_list(coll_t c, list& o); - - int collection_setattr(coll_t oid, const char *name, const void *value, size_t size, Context 
*onsafe); - int collection_getattr(coll_t oid, const char *name, void *value, size_t size); - int collection_rmattr(coll_t cid, const char *name, Context *onsafe); - int collection_listattr(coll_t oid, vector& attrs); - - // maps - int map_lookup(object_t o, bufferlist& key, bufferlist& val); - int map_insert(object_t o, bufferlist& key, bufferlist& val); - int map_remove(object_t o, bufferlist& key); - int map_list(object_t o, list& keys); - int map_list(object_t o, map& vals); - int map_list(object_t o, - bufferlist& start, bufferlist& end, - map& vals); - - // crap - void _fake_writes(bool b) { fake_writes = b; } - void _get_frag_stat(FragmentationStat& st); - - void _import_freelist(bufferlist& bl); - void _export_freelist(bufferlist& bl); - - -private: - // private interface -- use if caller already holds lock - int _read(object_t oid, off_t off, size_t len, bufferlist& bl); - int _is_cached(object_t oid, off_t off, size_t len); - int _stat(object_t oid, struct stat *st); - int _getattr(object_t oid, const char *name, void *value, size_t size); - int _getattrs(object_t oid, map &aset); - - bool _write_will_block(); - int _write(object_t oid, off_t off, size_t len, bufferlist& bl); - void _trim_from_cache(object_t oid, off_t off, size_t len); - int _truncate(object_t oid, off_t size); - int _truncate_front(object_t oid, off_t size); - int _remove(object_t oid); - int _clone(object_t from, object_t to); - int _setattr(object_t oid, const char *name, const void *value, size_t size); - int _setattrs(object_t oid, map& attrset); - int _rmattr(object_t oid, const char *name); - bool _collection_exists(coll_t c); - int _create_collection(coll_t c); - int _destroy_collection(coll_t c); - int _collection_add(coll_t c, object_t o); - int _collection_remove(coll_t c, object_t o); - int _collection_setattr(coll_t oid, const char *name, const void *value, size_t size); - int _collection_rmattr(coll_t cid, const char *name); - - -}; diff --git 
a/tags/20070517_before_mds_merge/ebofs/Onode.h b/tags/20070517_before_mds_merge/ebofs/Onode.h deleted file mode 100644 index 233c97e7ae172..0000000000000 --- a/tags/20070517_before_mds_merge/ebofs/Onode.h +++ /dev/null @@ -1,390 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_ONODE_H -#define __EBOFS_ONODE_H - -#include "include/lru.h" - -#include "types.h" -#include "BufferCache.h" - -#include "include/interval_set.h" - - -/* - * object node (like an inode) - * - * holds object metadata, including - * size - * allocation (extent list) - * attributes - * - */ - -class Onode : public LRUObject { -private: - int ref; - -public: - object_t object_id; - version_t version; // incremented on each modify. 
- - // data - bool readonly; - Extent onode_loc; - off_t object_size; - unsigned object_blocks; - - // onode - set collections; - map attr; - //vector extents; - map extent_map; - - interval_set uncommitted; - - ObjectCache *oc; - - bool dirty; - bool dangling; // not in onode_map - bool deleted; // deleted - - list commit_waiters; - - public: - Onode(object_t oid) : ref(0), object_id(oid), version(0), - readonly(false), - object_size(0), object_blocks(0), oc(0), - dirty(false), dangling(false), deleted(false) { - onode_loc.length = 0; - } - ~Onode() { - if (oc) delete oc; - } - - block_t get_onode_id() { return onode_loc.start; } - int get_onode_len() { return onode_loc.length; } - - int get_ref_count() { return ref; } - void get() { - if (ref == 0) lru_pin(); - ref++; - //cout << "ebofs.onode.get " << hex << object_id << dec << " " << ref << endl; - } - void put() { - ref--; - if (ref == 0) lru_unpin(); - //cout << "ebofs.onode.put " << hex << object_id << dec << " " << ref << endl; - } - - void mark_dirty() { - if (!dirty) { - dirty = true; - get(); - } - } - void mark_clean() { - if (dirty) { - dirty = false; - put(); - } - } - bool is_dirty() { return dirty; } - bool is_deleted() { return deleted; } - bool is_dangling() { return dangling; } - - - bool have_oc() { - return oc != 0; - } - ObjectCache *get_oc(BufferCache *bc) { - if (!oc) { - oc = new ObjectCache(object_id, this, bc); - oc->get(); - get(); - } - return oc; - } - void close_oc() { - if (oc) { - //cout << "close_oc on " << object_id << endl; - assert(oc->is_empty()); - if (oc->put() == 0){ - //cout << "************************* hosing oc" << endl; - delete oc; - } - oc = 0; - put(); - } - } - - - // allocation - void verify_extents() { - if (0) { // do crazy stupid sanity checking - block_t count = 0; - interval_set is; - - set s; - cout << "verifying" << endl; - - for (map::iterator p = extent_map.begin(); - p != extent_map.end(); - p++) { - cout << " " << p->first << ": " << p->second << endl; - 
assert(count == p->first); - count += p->second.length; - for (unsigned j=0;jsecond.length;j++) { - assert(s.count(p->second.start+j) == 0); - s.insert(p->second.start+j); - } - } - - assert(s.size() == count); - assert(count == object_blocks); - } - } - void set_extent(block_t offset, Extent ex) { - //cout << "set_extent " << offset << " -> " << ex << " ... " << object_blocks << endl; - assert(offset <= object_blocks); - verify_extents(); - - // at the end? - if (offset == object_blocks) { - //cout << " appending " << ex << endl; - if (!extent_map.empty() && extent_map.rbegin()->second.end() == ex.start) { - //cout << "appending " << ex << " to " << extent_map.rbegin()->second << endl; - extent_map.rbegin()->second.length += ex.length; - } else - extent_map[object_blocks] = ex; - object_blocks += ex.length; - return; - } - - // removing any extent bits we overwrite - if (!extent_map.empty()) { - // preceeding extent? - map::iterator p = extent_map.lower_bound(offset); - if (p != extent_map.begin()) { - p--; - if (p->first + p->second.length > offset) { - //cout << " preceeding was " << p->second << endl; - if (p->first + p->second.length > offset+ex.length) { - // cutting chunk out of middle, add last bit - Extent &n = extent_map[offset+ex.length] = p->second; - n.start += offset+ex.length - p->first; - n.length -= offset+ex.length - p->first; - //cout << " tail frag is " << n << endl; - } - p->second.length = offset - p->first; // cut tail off preceeding extent - //cout << " preceeding now " << p->second << endl; - } - p++; - } - - // overlapping extents - while (p != extent_map.end() && - p->first < offset + ex.length) { - map::iterator next = p; - next++; - - // completely subsumed? 
- if (p->first + p->second.length <= offset+ex.length) { - //cout << " erasing " << p->second << endl; - extent_map.erase(p); - p = next; - continue; - } - - // spans new extent, cut off head - Extent &n = extent_map[ offset+ex.length ] = p->second; - //cout << " cut head off " << p->second; - n.start += offset+ex.length - p->first; - n.length -= offset+ex.length - p->first; - extent_map.erase(p); - //cout << ", now " << n << endl; - break; - } - } - - extent_map[ offset ] = ex; - - // extend object? - if (offset + ex.length > object_blocks) - object_blocks = offset+ex.length; - - verify_extents(); - } - - - /* map_extents(start, len, ls) - * map teh given page range into extents on disk. - */ - int map_extents(block_t start, block_t len, vector& ls) { - //cout << "map_extents " << start << " " << len << endl; - verify_extents(); - - //assert(start+len <= object_blocks); - - map::iterator p = extent_map.lower_bound(start); - if (p != extent_map.begin() && - (p == extent_map.end() || p->first > start && p->first)) { - p--; - if (p->second.length > start - p->first) { - Extent ex; - ex.start = p->second.start + (start - p->first); - ex.length = MIN(len, p->second.length - (start - p->first)); - ls.push_back(ex); - - //cout << " got (tail of?) " << p->second << " : " << ex << endl; - - start += ex.length; - len -= ex.length; - } - p++; - } - - while (len > 0 && - p != extent_map.end()) { - assert(p->first == start); - Extent ex = p->second; - ex.length = MIN(len, ex.length); - ls.push_back(ex); - //cout << " got (head of?) 
" << p->second << " : " << ex << endl; - start += ex.length; - len -= ex.length; - p++; - } - - return 0; - } - - int truncate_extents(block_t len, vector& extra) { - verify_extents(); - - map::iterator p = extent_map.lower_bound(len); - if (p != extent_map.begin() && - (p == extent_map.end() || p->first > len && p->first)) { - p--; - if (p->second.length > len - p->first) { - Extent ex; - ex.start = p->second.start + (len - p->first); - ex.length = p->second.length - (len - p->first); - extra.push_back(ex); - - p->second.length = len - p->first; - assert(p->second.length > 0); - - //cout << " got (tail of?) " << p->second << " : " << ex << endl; - } - p++; - } - - while (p != extent_map.end()) { - assert(p->first >= len); - extra.push_back(p->second); - map::iterator n = p; - n++; - extent_map.erase(p); - p = n; - } - - object_blocks = len; - verify_extents(); - return 0; - } - - int truncate_front_extents(block_t len, vector& extra) { - verify_extents(); - - while (len > 0) { - Extent& ex = extent_map.begin()->second; // look, this is a reference! - if (ex.length > len) { - // partial first extent - Extent frontbit( ex.start, len ); - extra.push_back(frontbit); - ex.length -= len; - ex.start += len; - break; - } - - // pull off entire first extent. 
- assert(ex.length <= len); - len -= ex.length; - extra.push_back(ex); - extent_map.erase(extent_map.begin()); - } - - object_blocks -= len; - verify_extents(); - return 0; - } - - - - /* map_alloc_regions(start, len, map) - * map range into regions that need to be (re)allocated on disk - * because they overlap "safe" (or unallocated) parts of the object - */ - /* - void map_alloc_regions(block_t start, block_t len, - interval_set& alloc) { - interval_set already_uncom; - - alloc.insert(start, len); // start with whole range - already_uncom.intersection_of(alloc, uncommitted); - alloc.subtract(already_uncom); // take out the bits that aren't yet committed - } - */ - - - - // pack/unpack - int get_collection_bytes() { - return sizeof(coll_t) * collections.size(); - } - int get_attr_bytes() { - int s = 0; - for (map::iterator i = attr.begin(); - i != attr.end(); - i++) { - s += i->first.length() + 1; - s += i->second.length() + sizeof(int); - } - return s; - } - int get_extent_bytes() { - return sizeof(Extent) * extent_map.size(); - } - -}; - - -inline ostream& operator<<(ostream& out, Onode& on) -{ - out << "onode(" << hex << on.object_id << dec << " len=" << on.object_size; - out << " ref=" << on.get_ref_count(); - if (on.is_dirty()) out << " dirty"; - if (on.is_dangling()) out << " dangling"; - if (on.is_deleted()) out << " deleted"; - out << " uncom=" << on.uncommitted; - // out << " " << &on; - out << ")"; - return out; -} - - - -#endif diff --git a/tags/20070517_before_mds_merge/ebofs/Table.h b/tags/20070517_before_mds_merge/ebofs/Table.h deleted file mode 100644 index f16e506a9dd63..0000000000000 --- a/tags/20070517_before_mds_merge/ebofs/Table.h +++ /dev/null @@ -1,898 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public 
- * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __EBOFS_TABLE_H -#define __EBOFS_TABLE_H - -#include "types.h" -#include "nodes.h" - -/** table **/ - -#define dbtout if (25 <= g_conf.debug_ebofs) cout << "ebofs.table(" << this << ")." - - -template -class Table { - private: - NodePool &pool; - - nodeid_t root; - int nkeys; - int depth; - - public: - Table(NodePool &p, - struct ebofs_table& bts) : - pool(p), - root(bts.root), nkeys(bts.num_keys), depth(bts.depth) { - dbtout << "cons" << endl; - } - - nodeid_t get_root() { return root; } - int get_num_keys() { return nkeys; } - int get_depth() { return depth; } - - - /* - */ - class _IndexItem { // i just need a struct size for below - K k; - nodeid_t n; - }; - class IndexItem { - public: - K key; - nodeid_t node; - static const int MAX = Node::ITEM_LEN / (sizeof(_IndexItem)); - static const int MIN = MAX/2; - }; - class _LeafItem { // i just need a struct size for below - K k; - V v; - }; - class LeafItem { - public: - K key; - V value; - static const int MAX = Node::ITEM_LEN / (sizeof(_LeafItem)); - static const int MIN = MAX/2; - }; - - class Nodeptr { - public: - Node *node; - - Nodeptr() : node(0) {} - Nodeptr(Node *n) : node(n) {} - Nodeptr& operator=(Node *n) { - node = n; - return *this; - } - - LeafItem& leaf_item(int i) { return (( LeafItem*)(node->item_ptr()))[i]; } - IndexItem& index_item(int i) { return ((IndexItem*)(node->item_ptr()))[i]; } - K key(int i) { - if (node->is_index()) - return index_item(i).key; - else - return leaf_item(i).key; - } - - bool is_leaf() { return node->is_leaf(); } - bool is_index() { return node->is_index(); } - void set_type(int t) { node->set_type(t); } - - int max_items() const { - if (node->is_leaf()) - return LeafItem::MAX; - else - return IndexItem::MAX; - } - int min_items() const { return max_items() / 2; } - - nodeid_t get_id() { return node->get_id(); } - - int size() { return node->size(); } - void 
set_size(int s) { node->set_size(s); } - - void remove_at_pos(int p) { - if (node->is_index()) { - for (int i=p; ip; i--) - leaf_item(i) = leaf_item(i-1); - leaf_item(p).key = key; - leaf_item(p).value = value; - set_size(size() + 1); - } - void insert_at_index_pos(int p, K key, nodeid_t node) { - assert(is_index()); - for (int i=size(); i>p; i--) - index_item(i) = index_item(i-1); - index_item(p).key = key; - index_item(p).node = node; - set_size(size() + 1); - } - - void append_item(LeafItem& i) { - leaf_item(size()) = i; - set_size(size() + 1); - } - void append_item(IndexItem& i) { - index_item(size()) = i; - set_size(size() + 1); - } - - void split(Nodeptr& right) { - if (node->is_index()) { - for (int i=min_items(); iis_index()) - for (int i=0; i open; // open nodes - vector pos; // position within the node - //Nodeptr open[20]; - //int pos[20]; - int level; - - Cursor(Table *t) : table(t), open(t->depth), pos(t->depth), level(0) {} - - public: - - const LeafItem& current() { - assert(open[level].is_leaf()); - return open[level].leaf_item(pos[level]); - } - V& dirty_current_value() { - assert(open[level].is_leaf()); - dirty(); - return open[level].leaf_item(pos[level]).value; - } - - // ** read-only bits ** - int move_left() { - if (table->depth == 0) return OOB; - - // work up around branch - int l; - for (l = level; l >= 0; l--) - if (pos[l] > 0) break; - if (l < 0) - return OOB; // we are the first item in the btree - - // move left one - pos[l]--; - - // work back down right side - for (; lpool.get_node( open[l].index_item(pos[l]).node ); - pos[l+1] = open[l+1].size() - 1; - } - return 1; - } - int move_right() { - if (table->depth == 0) return OOB; - - // work up branch - int l; - for (l=level; l>=0; l--) - if (pos[l] < open[l].size() - 1) break; - if (l < 0) { - /* we are at last item in btree. */ - if (pos[level] < open[level].size()) { - pos[level]++; /* move into add position! 
*/ - return 0; - } - return -1; - } - - /* move right one */ - assert( pos[l] < open[l].size() ); - pos[l]++; - - /* work back down */ - for (; lpool.get_node( open[l].index_item(pos[l]).node ); - pos[l+1] = 0; // furthest left - } - return 1; - } - - // ** modifications ** - void dirty() { - for (int l=level; l>=0; l--) { - if (open[l].node->is_dirty()) break; // already dirty! (and thus parents are too) - - table->pool.dirty_node(open[l].node); - if (l > 0) - open[l-1].index_item( pos[l-1] ).node = open[l].get_id(); - else - table->root = open[0].get_id(); - } - } - private: - void repair_parents() { - // did i make a change at the start of a node? - if (pos[level] == 0) { - K key = open[level].key(0); // new key parents should have - for (int j=level-1; j>=0; j--) { - if (open[j].index_item(pos[j]).key == key) - break; /* it's the same key, we can stop fixing */ - open[j].index_item(pos[j]).key = key; - if (pos[j] > 0) break; /* last in position 0.. */ - } - } - } - - public: - void remove() { - dirty(); - - // remove from node - open[level].remove_at_pos( pos[level] ); - repair_parents(); - - // was it a key? - if (level == table->depth-1) - table->nkeys--; - } - - void insert(K key, V value) { - dirty(); - - // insert - open[level].insert_at_leaf_pos(pos[level], key, value); - repair_parents(); - - // was it a key? 
- if (level == table->depth-1) - table->nkeys++; - } - - int rotate_left() { - if (level == 0) return -1; // i am root - if (pos[level-1] == 0) return -1; // nothing to left - - Nodeptr here = open[level]; - Nodeptr parent = open[level-1]; - Nodeptr left = table->pool.get_node( parent.index_item(pos[level-1] - 1).node ); - if (left.size() == left.max_items()) return -1; // it's full - - // make both dirty - dirty(); - if (!left.node->is_dirty()) { - table->pool.dirty_node(left.node); - parent.index_item(pos[level-1]-1).node = left.get_id(); - } - - dbtout << "rotating item " << here.key(0) << " left from " << here.get_id() << " to " << left.get_id() << endl; - - /* add */ - if (here.node->is_leaf()) - left.append_item(here.leaf_item(0)); - else - left.append_item(here.index_item(0)); - - /* remove */ - here.remove_at_pos(0); - - /* fix parent index for me */ - parent.index_item( pos[level-1] ).key = here.key(0); - // we never have to update past immediate parent, since we're not at pos 0 - - /* adjust cursor */ - if (pos[level] > 0) - pos[level]--; - //else - //assert(1); /* if we were positioned here, we're equal */ - /* if it was 0, then the shifted item == our key, and we can stay here safely. */ - return 0; - } - int rotate_right() { - if (level == 0) return -1; // i am root - if (pos[level-1] + 1 >= open[level-1].size()) return -1; // nothing to right - - Nodeptr here = open[level]; - Nodeptr parent = open[level-1]; - Nodeptr right = table->pool.get_node( parent.index_item( pos[level-1] + 1 ).node ); - if (right.size() == right.max_items()) return -1; // it's full - - // make both dirty - dirty(); - if (!right.node->is_dirty()) { - table->pool.dirty_node(right.node); - parent.index_item( pos[level-1]+1 ).node = right.get_id(); - } - - if (pos[level] == here.size()) { - /* let's just move the cursor over! 
*/ - //if (sizeof(K) == 8) - dbtout << "shifting cursor right from " << here.get_id() << " to less-full node " << right.get_id() << endl; - open[level] = right; - pos[level] = 0; - pos[level-1]++; - return 0; - } - - //if (sizeof(K) == 8) - dbtout << "rotating item " << hex << here.key(here.size()-1) << dec << " right from " - << here.get_id() << " to " << right.get_id() << endl; - - /* add */ - if (here.is_index()) - right.insert_at_index_pos(0, - here.index_item( here.size()-1 ).key, - here.index_item( here.size()-1 ).node); - else - right.insert_at_leaf_pos(0, - here.leaf_item( here.size()-1 ).key, - here.leaf_item( here.size()-1 ).value); - - /* remove */ - here.set_size(here.size() - 1); - - /* fix parent index for right */ - parent.index_item( pos[level-1] + 1 ).key = right.key(0); - - return 0; - } - }; - - - public: - bool almost_full() { - if (2*(depth+1) > pool.num_free()) // worst case, plus some. - return true; - return false; - } - - int find(K key, Cursor& cursor) { - dbtout << "find " << key << endl; - - if (depth == 0) - return Cursor::OOB; - - // init - cursor.level = 0; - - // start at root - Nodeptr curnode( pool.get_node(root) ); - cursor.open[0] = curnode; - - if (curnode.size() == 0) return -1; // empty! 
- - // find leaf - for (cursor.level = 0; cursor.level < depth-1; cursor.level++) { - /* if key=5, we want 2 3 [4] 6 7, or 3 4 [5] 5 6 (err to the left) */ - int left = 0; /* i >= left */ - int right = curnode.size()-1; /* i < right */ - while (left < right) { - int i = left + (right - left) / 2; - if (curnode.index_item(i).key < key) { - left = i + 1; - } else if (i && curnode.index_item(i-1).key >= key) { - right = i; - } else { - left = right = i; - break; - } - } - int i = left; - if (i && curnode.index_item(i).key > key) i--; - -#ifdef EBOFS_DEBUG_BTREE - int j; - for (j=0; j key) break; - } - if (i != j) { - dbtout << "btree binary search failed" << endl; - i = j; - } -#endif - - cursor.pos[cursor.level] = i; - - /* get child node */ - curnode = pool.get_node( cursor.open[cursor.level].index_item(i).node ); - cursor.open[cursor.level+1] = curnode; - } - - /* search leaf */ - /* if key=5, we want 2 3 4 [6] 7, or 3 4 [5] 5 6 (err to the right) */ - int left = 0; /* i >= left */ - int right = curnode.size(); /* i < right */ - while (left < right) { - int i = left + (right - left) / 2; - if (curnode.leaf_item(i).key < key) { - left = i + 1; - } else if (i && curnode.leaf_item(i-1).key >= key) { - right = i; - } else { - left = right = i; - break; - } - } - int i = left; - -#ifdef EBOFS_DEBUG_BTREE - int j; - for (j=0; j= key) break; - } - if (i != j) { - dbtout << "btree binary search failed" << endl; - i = j; - } -#endif - - cursor.pos[cursor.level] = i; /* first key in this node, or key insertion point */ - - if (curnode.size() >= i+1) { - if (curnode.leaf_item(i).key == key) { - return Cursor::MATCH; /* it's the actual key */ - } else { - return Cursor::INSERT; /* it's an insertion point */ - } - } - return Cursor::OOB; /* it's the end of the btree (also a valid insertion point) */ - } - - int lookup(K key) { - dbtout << "lookup" << endl; - Cursor cursor(this); - if (find(key, cursor) == Cursor::MATCH) - return 0; - return -1; - } - - int lookup(K key, V& 
value) { - dbtout << "lookup" << endl; - Cursor cursor(this); - if (find(key, cursor) == Cursor::MATCH) { - value = cursor.current().value; - return 0; - } - return -1; - } - - int insert(K key, V value) { - dbtout << "insert " << key << " -> " << value << endl; - if (almost_full()) return -1; - - // empty? - if (nkeys == 0) { - if (root == -1) { - // create a root node (leaf!) - assert(depth == 0); - Nodeptr newroot( pool.new_node(Node::TYPE_LEAF) ); - root = newroot.get_id(); - depth++; - } - assert(depth == 1); - assert(root >= 0); - } - - // start at/near key - Cursor cursor(this); - find(key, cursor); - - // insert loop - nodeid_t nodevalue = 0; - while (1) { - - /* room in this node? */ - if (cursor.open[cursor.level].size() < cursor.open[cursor.level].max_items()) { - if (cursor.open[cursor.level].is_leaf()) - cursor.insert( key, value ); // will dirty, etc. - else { - // indices are already dirty - cursor.open[cursor.level].insert_at_index_pos(cursor.pos[cursor.level], key, nodevalue); - } - verify("insert 1"); - return 0; - } - - /* this node is full. */ - assert( cursor.open[cursor.level].size() == cursor.open[cursor.level].max_items() ); - - /* can we rotate? */ - if (false) // NO! there's a bug in here somewhere, don't to it. - if (cursor.level > 0) { - if ((cursor.pos[cursor.level-1] > 0 - && cursor.rotate_left() >= 0) || - (cursor.pos[cursor.level-1] + 1 < cursor.open[cursor.level-1].size() - && cursor.rotate_right() >= 0)) { - - if (cursor.open[cursor.level].is_leaf()) - cursor.insert( key, value ); // will dirty, etc. 
- else { - // indices are already dirty - cursor.open[cursor.level].insert_at_index_pos(cursor.pos[cursor.level], key, nodevalue); - } - verify("insert 2"); - return 0; - } - } - - /** split node **/ - - if (cursor.level == depth-1) { - dbtout << "splitting leaf " << cursor.open[cursor.level].get_id() << endl; - } else { - dbtout << "splitting index " << cursor.open[cursor.level].get_id() << endl; - } - - cursor.dirty(); - - // split - Nodeptr leftnode = cursor.open[cursor.level]; - Nodeptr newnode( pool.new_node(leftnode.node->get_type()) ); - leftnode.split( newnode ); - - /* insert our item */ - if (cursor.pos[cursor.level] > leftnode.size()) { - // not with cursor, since this node isn't added yet! - if (newnode.is_leaf()) { - newnode.insert_at_leaf_pos( cursor.pos[cursor.level] - leftnode.size(), - key, value ); - nkeys++; - } else { - newnode.insert_at_index_pos( cursor.pos[cursor.level] - leftnode.size(), - key, nodevalue ); - } - } else { - // with cursor (if leaf) - if (leftnode.is_leaf()) - cursor.insert( key, value ); - else - leftnode.insert_at_index_pos( cursor.pos[cursor.level], - key, nodevalue ); - } - - /* are we at the root? */ - if (cursor.level == 0) { - /* split root. */ - dbtout << "that split was the root " << root << endl; - Nodeptr newroot( pool.new_node(Node::TYPE_INDEX) ); - - /* new root node */ - newroot.set_size(2); - newroot.index_item(0).key = leftnode.key(0); - newroot.index_item(0).node = root; - newroot.index_item(1).key = newnode.key(0); - newroot.index_item(1).node = newnode.get_id(); - - /* heighten tree */ - depth++; - root = newroot.get_id(); - verify("insert 3"); - return 0; - } - - /* now insert newindex in level-1 */ - nodevalue = newnode.get_id(); - key = newnode.key(0); - cursor.level--; - cursor.pos[cursor.level]++; // ...to the right of leftnode! 
- } - } - - - int remove(K key) { - dbtout << "remove " << key << endl; - - if (almost_full()) { - cout << "table almost full, failing" << endl; - assert(0); - return -1; - } - - Cursor cursor(this); - if (find(key, cursor) <= 0) { - cerr << "remove " << key << " 0x" << hex << key << dec << " .. dne" << endl; - g_conf.debug_ebofs = 33; - g_conf.ebofs_verify = true; - verify("remove dne"); - assert(0); - return -1; // key dne - } - - - while (1) { - cursor.remove(); - - // balance + adjust - - if (cursor.level == 0) { - // useless root index? - if (cursor.open[0].size() == 1 && - depth > 1) { - depth--; - root = cursor.open[0].index_item(0).node; - pool.release( cursor.open[0].node ); - } - - // note: root can be small, but not empty - else if (nkeys == 0) { - assert(cursor.open[cursor.level].size() == 0); - assert(depth == 1); - root = -1; - depth = 0; - if (cursor.open[0].node) - pool.release(cursor.open[0].node); - } - verify("remove 1"); - return 0; - } - - if (cursor.open[cursor.level].size() > cursor.open[cursor.level].min_items()) { - verify("remove 2"); - return 0; - } - - // borrow from siblings? - Nodeptr left; - Nodeptr right; - - // left? 
- if (cursor.pos[cursor.level-1] > 0) { - int left_loc = cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1] - 1).node; - left = pool.get_node( left_loc ); - - if (left.size() > left.min_items()) { - /* move cursor left, shift right */ - cursor.pos[cursor.level] = 0; - cursor.open[cursor.level] = left; - cursor.pos[cursor.level-1]--; - cursor.rotate_right(); - verify("remove 3"); - return 0; - } - - /* combine to left */ - right = cursor.open[cursor.level]; - } - else { - assert(cursor.pos[cursor.level-1] < cursor.open[cursor.level-1].size() - 1); - int right_loc = cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1] + 1 ).node; - right = pool.get_node( right_loc ); - - if (right.size() > right.min_items()) { - /* move cursor right, shift an item left */ - cursor.pos[cursor.level] = 1; - cursor.open[cursor.level] = right; - cursor.pos[cursor.level-1]++; - cursor.rotate_left(); - verify("remove 4"); - return 0; - } - - /* combine to left */ - left = cursor.open[cursor.level]; - cursor.pos[cursor.level-1]++; /* move cursor to (soon-to-be-empty) right side item */ - } - - // note: cursor now points to _right_ node. - - /* combine (towards left) - * (this makes it so our next delete will be in the index - * interior, which is less scary.) - */ - dbtout << "combining nodes " << left.get_id() << " and " << right.get_id() << endl; - - left.merge(right); - - // dirty left + right - cursor.dirty(); // right - if (!left.node->is_dirty()) { - pool.dirty_node(left.node); - cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1]-1 ).node = left.get_id(); - } - - pool.release(right.node); - - cursor.level--; // now point to the link to the obsolete (right-side) sib */ - } - - } - - void clear(Cursor& cursor, int node_loc, int level) { - dbtout << "clear" << endl; - - Nodeptr node = pool.get_node( node_loc ); - cursor.open[level] = node; - - // hose children? 
- if (level < depth-1) { - for (int i=0; i max) - max = node.key(i); - - if (level < depth-1) { - // index - cursor.pos[level] = i; - err += verify_sub( cursor, cursor.open[level].index_item(i).node, level+1, count, last, on ); - } else { - // leaf - count++; - last = node.key(i); - } - } - - if (level) { - // verify that parent's keys are appropriate - if (min != cursor.open[level-1].index_item(cursor.pos[level-1]).key) { - dbtout << ":: key in index node " << cursor.open[level-1].get_id() - << " != min in child " << node_loc - << "(key is " << hex << cursor.open[level-1].index_item(cursor.pos[level-1]).key - << ", min is " << min << ")" << dec << endl; - err++; - } - if (cursor.pos[level-1] < cursor.open[level-1].size()-1) { - if (max > cursor.open[level-1].index_item(1+cursor.pos[level-1]).key) { - dbtout << ":: next key in index node " << cursor.open[level-1].get_id() - << " < max in child " << node_loc - << "(key is " << hex << cursor.open[level-1].index_item(1+cursor.pos[level-1]).key - << ", max is " << max << ")" << dec << endl; - err++; - } - } - } - - //return err; - - // print it - char s[1000]; - strcpy(s," "); - s[level+1] = 0; - if (1) { - if (root == node_loc) { - dbtout << s << "root " << node_loc << ": " - << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << endl; - } else if (level == depth-1) { - dbtout << s << "leaf " << node_loc << ": " - << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << endl; - } else { - dbtout << s << "indx " << node_loc << ": " - << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << endl; - } - - if (0) { - for (int i=0; i " << node.leaf_item(i).value << dec << endl; - } - } - } - } - - return err; - } - - void verify(const char *on) { - if (!g_conf.ebofs_verify) - return; - - if (root == -1 && depth == 0) { - return; // empty! 
- } - - int count = 0; - Cursor cursor(this); - K last; - - int before = g_conf.debug_ebofs; - g_conf.debug_ebofs = 0; - - int err = verify_sub(cursor, root, 0, count, last, on); - if (count != nkeys) { - cerr << "** count " << count << " != nkeys " << nkeys << endl; - err++; - } - - g_conf.debug_ebofs = before; - - // ok? - if (err) { - cerr << "verify failure, called by '" << on << "'" << endl; - g_conf.debug_ebofs = 30; - // do it again, so we definitely get the dump. - int count = 0; - Cursor cursor(this); - K last; - verify_sub(cursor, root, 0, count, last, on); - assert(err == 0); - } - } - -}; - - -#endif diff --git a/tags/20070517_before_mds_merge/ebofs/mkfs.ebofs.cc b/tags/20070517_before_mds_merge/ebofs/mkfs.ebofs.cc deleted file mode 100644 index af5f57842068a..0000000000000 --- a/tags/20070517_before_mds_merge/ebofs/mkfs.ebofs.cc +++ /dev/null @@ -1,299 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include -#include "ebofs/Ebofs.h" - - -int main(int argc, char **argv) -{ - // args - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - if (args.size() < 1) { - cerr << "usage: mkfs.ebofs [options] " << endl; - return -1; - } - char *filename = args[0]; - - // mkfs - Ebofs mfs(filename); - int r = mfs.mkfs(); - if (r < 0) exit(r); - - if (args.size() > 1) { // pass an extra arg of some sort to trigger the test crapola - // test-o-rama! 
- Ebofs fs(filename); - fs.mount(); - - /* - if (1) { - // partial write tests - char crap[1024*1024]; - memset(crap, 0, 1024*1024); - - bufferlist small; - small.append(crap, 10); - bufferlist med; - med.append(crap, 1000); - bufferlist big; - big.append(crap, 1024*1024); - - cout << "0" << endl; - fs.write(10, 0, 1024*1024, big, (Context*)0); - fs.sync(); - fs.trim_buffer_cache(); - - cout << "1" << endl; - fs.write(10, 10, 10, small, 0); - fs.write(10, 1, 1000, med, 0); - fs.sync(); - fs.trim_buffer_cache(); - - cout << "2" << endl; - fs.write(10, 10, 10, small, 0); - //fs.sync(); - fs.write(10, 1, 1000, med, 0); - fs.sync(); - fs.trim_buffer_cache(); - - cout << "3" << endl; - fs.write(10, 1, 1000, med, 0); - fs.write(10, 10000, 10, small, 0); - fs.truncate(10, 100, 0); - fs.sync(); - fs.trim_buffer_cache(); - - cout << "4" << endl; - fs.remove(10); - fs.sync(); - fs.write(10, 10, 10, small, 0); - fs.sync(); - fs.write(10, 1, 1000, med, 0); - fs.sync(); - fs.truncate(10, 100, 0); - fs.write(10, 10, 10, small, 0); - fs.trim_buffer_cache(); - - - - } - - if (0) { // onode write+read test - bufferlist bl; - char crap[1024*1024]; - memset(crap, 0, 1024*1024); - bl.append(crap, 10); - - fs.write(10, 10, 0, bl, (Context*)0); - fs.umount(); - - Ebofs fs2(filename); - fs2.mount(); - fs2.read(10, 10, 0, bl); - fs2.umount(); - - return 0; - } - - - if (0) { // small write + read test - bufferlist bl; - char crap[1024*1024]; - memset(crap, 0, 1024*1024); - - object_t oid = 10; - int n = 10000; - int l = 128; - bl.append(crap, l); - - - char *p = bl.c_str(); - off_t o = 0; - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __EBOFS_NODES_H -#define __EBOFS_NODES_H - -/** nodes, node regions **/ - -#include "types.h" -#include "BlockDevice.h" - - -/* - - disk wire memory - - free free -> free can alloc - free used -> dirty can modify - - free used used -> tx - free used free -> limbo - - used used -> clean - used free -> limbo - - - // meaningless - used free free -> free can alloc - used free used __DNE__ - - -*/ - -#undef debofs -#define debofs(x) if (x < g_conf.debug_ebofs) cout << "ebofs.nodepool." - - -class Node { - public: - // bit fields - static const int STATE_CLEAN = 1; - static const int STATE_DIRTY = 2; - static const int STATE_TX = 3; - - static const int ITEM_LEN = EBOFS_NODE_BYTES - sizeof(int) - sizeof(int) - sizeof(int); - - static const int TYPE_INDEX = 1; - static const int TYPE_LEAF = 2; - - protected: - nodeid_t id; - int state; // use bit fields above! - - bufferptr bptr; - bufferptr shadow_bptr; - - // in disk buffer - int *type; - int *nrecs; - - public: - Node(nodeid_t i, bufferptr& b, int s) : id(i), state(s), bptr(b) { - nrecs = (int*)(bptr.c_str()); - type = (int*)(bptr.c_str() + sizeof(*nrecs)); - } - - - // id - nodeid_t get_id() const { return id; } - void set_id(nodeid_t n) { id = n; } - - // buffer - bufferptr& get_buffer() { return bptr; } - - char *item_ptr() { return bptr.c_str() + sizeof(*nrecs) + sizeof(*type); } - - // size - int size() { return *nrecs; } - void set_size(int s) { *nrecs = s; } - - // type - int& get_type() { return *type; } - void set_type(int t) { *type = t; } - bool is_index() { return *type == TYPE_INDEX; } - bool is_leaf() { return *type == TYPE_LEAF; } - - - // state - bool is_dirty() { return state == STATE_DIRTY; } - bool is_tx() { return state == STATE_TX; } - bool is_clean() { return state == STATE_CLEAN; } - - void set_state(int s) { state = s; } - - void make_shadow() { - assert(is_tx()); - - shadow_bptr = bptr; - - // new buffer - bptr = buffer::create_page_aligned(EBOFS_NODE_BYTES); - nrecs = 
(int*)(bptr.c_str()); - type = (int*)(bptr.c_str() + sizeof(*nrecs)); - - // copy contents! - memcpy(bptr.c_str(), shadow_bptr.c_str(), EBOFS_NODE_BYTES); - } - -}; - - - - - -class NodePool { - protected: - map node_map; // open node map - - public: - vector region_loc; // region locations - Extent usemap_even; - Extent usemap_odd; - - protected: - // on-disk block states - int num_nodes; - set free; - set dirty; - set tx; - set clean; // aka used - set limbo; - - Mutex &ebofs_lock; - Cond commit_cond; - int flushing; - - static int make_nodeid(int region, int offset) { - return (region << 24) | offset; - } - static int nodeid_region(nodeid_t nid) { - return nid >> 24; - } - static int nodeid_offset(nodeid_t nid) { - return nid & ((1 << 24) - 1); - } - - - public: - NodePool(Mutex &el) : - num_nodes(0), - ebofs_lock(el), - flushing(0) {} - ~NodePool() { - // nodes - release_all(); - } - - int num_free() { return free.size(); } - int num_dirty() { return dirty.size(); } - int num_limbo() { return limbo.size(); } - int num_tx() { return tx.size(); } - int num_clean() { return clean.size(); } - int num_total() { return num_nodes; } - int num_used() { return num_clean() + num_dirty() + num_tx(); } - - int get_usemap_len(int n=0) { - if (n == 0) n = num_nodes; - return ((n-1) / 8 / EBOFS_BLOCK_SIZE) + 1; - } - - int num_regions() { return region_loc.size(); } - - // the caller had better adjust usemap locations... 
- void add_region(Extent ex) { - int region = region_loc.size(); - assert(ex.length <= (1 << 24)); - region_loc.push_back(ex); - for (unsigned o = 0; o < ex.length; o++) { - free.insert( make_nodeid(region, o) ); - } - num_nodes += ex.length; - } - - int init(struct ebofs_nodepool *np) { - // regions - assert(region_loc.empty()); - num_nodes = 0; - for (int i=0; inum_regions; i++) { - debofs(3) << "init region " << i << " at " << np->region_loc[i] << endl; - region_loc.push_back( np->region_loc[i] ); - num_nodes += np->region_loc[i].length; - } - - // usemap - usemap_even = np->node_usemap_even; - usemap_odd = np->node_usemap_odd; - debofs(3) << "init even map at " << usemap_even << endl; - debofs(3) << "init odd map at " << usemap_odd << endl; - - return 0; - } - - void close() { - release_all(); - - region_loc.clear(); - free.clear(); - dirty.clear(); - tx.clear(); - clean.clear(); - limbo.clear(); - flushing = 0; - node_map.clear(); - } - - - // *** blocking i/o routines *** - - int read_usemap(BlockDevice& dev, version_t epoch) { - // read map - Extent loc; - if (epoch & 1) - loc = usemap_odd; - else - loc = usemap_even; - - bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE*loc.length); - dev.read(loc.start, loc.length, bp); - - // parse - unsigned region = 0; // current region - unsigned roff = 0; // offset in region - for (unsigned byte = 0; byte> 1; // move one bit right. - roff++; - if (roff == region_loc[region].length) { - // next region! - roff = 0; - region++; - break; - } - } - if (region == region_loc.size()) break; - } - return 0; - } - - int read_clean_nodes(BlockDevice& dev) { - /* - this relies on the clean set begin defined so that we know which nodes - to read. so it only really works when called from mount()! 
- */ - for (unsigned r=0; rflushed_usemap(); - } - }; - - void flushed_usemap() { - ebofs_lock.Lock(); - flushing--; - if (flushing == 0) - commit_cond.Signal(); - ebofs_lock.Unlock(); - } - - public: - int write_usemap(BlockDevice& dev, version_t version) { - // alloc - Extent loc; - if (version & 1) - loc = usemap_odd; - else - loc = usemap_even; - - bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE*loc.length); - - // fill in - unsigned region = 0; // current region - unsigned roff = 0; // offset in region - for (unsigned byte = 0; byte> 1; - if (roff == region_loc[region].length) { - // next region! - roff = 0; - region++; - break; - } - } - - *(unsigned char*)(bp.c_str() + byte) = x; - if (region == region_loc.size()) break; - } - - - // write - bufferlist bl; - bl.append(bp); - dev.write(loc.start, loc.length, bl, - new C_NP_FlushUsemap(this), "usemap"); - return 0; - } - - - - // *** node commit *** - private: - - class C_NP_FlushNode : public BlockDevice::callback { - NodePool *pool; - nodeid_t nid; - public: - C_NP_FlushNode(NodePool *p, nodeid_t n) : - pool(p), nid(n) {} - void finish(ioh_t ioh, int r) { - pool->flushed_node(nid); - } - }; - - void flushed_node(nodeid_t nid) { - ebofs_lock.Lock(); - - // mark nid clean|limbo - if (tx.count(nid)) { // tx -> clean - tx.erase(nid); - clean.insert(nid); - - // make node itself clean - node_map[nid]->set_state(Node::STATE_CLEAN); - } - else { // already limbo (was dirtied, or released) - assert(limbo.count(nid)); - } - - flushing--; - if (flushing == 0) - commit_cond.Signal(); - ebofs_lock.Unlock(); - } - - public: - void commit_start(BlockDevice& dev, version_t version) { - dout(20) << "ebofs.nodepool.commit_start start" << endl; - - assert(flushing == 0); - /*if (0) - for (unsigned i=0; i tx (write to disk) - assert(tx.empty()); - set didb; - for (set::iterator i = dirty.begin(); - i != dirty.end(); - i++) { - Node *n = get_node(*i); - assert(n); - assert(n->is_dirty()); - 
n->set_state(Node::STATE_TX); - - unsigned region = nodeid_region(*i); - block_t off = nodeid_offset(*i); - block_t b = region_loc[region].start + off; - - if (1) { // sanity check debug FIXME - assert(didb.count(b) == 0); - didb.insert(b); - } - - bufferlist bl; - bl.append(n->get_buffer()); - dev.write(b, EBOFS_NODE_BLOCKS, - bl, - new C_NP_FlushNode(this, *i), "node"); - flushing++; - - tx.insert(*i); - } - dirty.clear(); - - // limbo -> free - for (set::iterator i = limbo.begin(); - i != limbo.end(); - i++) { - free.insert(*i); - } - limbo.clear(); - - dout(20) << "ebofs.nodepool.commit_start finish" << endl; - } - - void commit_wait() { - while (flushing > 0) - commit_cond.Wait(ebofs_lock); - dout(20) << "ebofs.nodepool.commit_wait finish" << endl; - } - - - - - - - - - - // *** nodes *** - // opened node - Node* get_node(nodeid_t nid) { - //dbtout << "pool.get " << nid << endl; - assert(node_map.count(nid)); - return node_map[nid]; - } - - // unopened node - /* not implemented yet!! - Node* open_node(nodeid_t nid) { - Node *n = node_regions[ NodeRegion::nodeid_region(nid) ]->open_node(nid); - dbtout << "pool.open_node " << n->get_id() << endl; - node_map[n->get_id()] = n; - return n; - } - */ - - // allocate id/block on disk. always free -> dirty. 
- nodeid_t alloc_id() { - // pick node id - assert(!free.empty()); - nodeid_t nid = *(free.begin()); - free.erase(nid); - dirty.insert(nid); - return nid; - } - - // new node - Node* new_node(int type) { - nodeid_t nid = alloc_id(); - debofs(15) << "ebofs.nodepool.new_node " << nid << endl; - - // alloc node - bufferptr bp = buffer::create_page_aligned(EBOFS_NODE_BYTES); - Node *n = new Node(nid, bp, Node::STATE_DIRTY); - n->set_type(type); - n->set_size(0); - - assert(node_map.count(nid) == 0); - node_map[nid] = n; - return n; - } - - void release(Node *n) { - const nodeid_t nid = n->get_id(); - debofs(15) << "ebofs.nodepool.release on " << nid << endl; - node_map.erase(nid); - - if (n->is_dirty()) { - assert(dirty.count(nid)); - dirty.erase(nid); - free.insert(nid); - } else if (n->is_clean()) { - assert(clean.count(nid)); - clean.erase(nid); - limbo.insert(nid); - } else if (n->is_tx()) { - assert(tx.count(nid)); // i guess htis happens? -sage - tx.erase(nid); - limbo.insert(nid); - } - - delete n; - } - - void release_all() { - while (!node_map.empty()) { - map::iterator i = node_map.begin(); - debofs(2) << "ebofs.nodepool.release_all leftover " << i->first << " " << i->second << endl; - release( i->second ); - } - assert(node_map.empty()); - } - - void dirty_node(Node *n) { - // get new node id? - nodeid_t oldid = n->get_id(); - nodeid_t newid = alloc_id(); - debofs(15) << "ebofs.nodepool.dirty_node on " << oldid << " now " << newid << endl; - - // release old block - if (n->is_clean()) { - assert(clean.count(oldid)); - clean.erase(oldid); - } else { - assert(n->is_tx()); - assert(tx.count(oldid)); - tx.erase(oldid); - - // move/copy current -> shadow buffer as necessary - n->make_shadow(); - } - limbo.insert(oldid); - node_map.erase(oldid); - - n->set_state(Node::STATE_DIRTY); - - // move to new one! 
- n->set_id(newid); - node_map[newid] = n; - } - - - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/ebofs/test.ebofs.cc b/tags/20070517_before_mds_merge/ebofs/test.ebofs.cc deleted file mode 100644 index 0e6a7625c502a..0000000000000 --- a/tags/20070517_before_mds_merge/ebofs/test.ebofs.cc +++ /dev/null @@ -1,224 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include -#include "ebofs/Ebofs.h" - -bool stop = false; - - -int nt = 0; -class Tester : public Thread { - Ebofs &fs; - int t; - - char b[1024*1024]; - -public: - Tester(Ebofs &e) : fs(e), t(nt) { nt++; } - void *entry() { - - while (!stop) { - object_t oid; - oid.ino = (rand() % 10) + 0x10000000; - coll_t cid = rand() % 50; - off_t off = rand() % 10000;//0;//rand() % 1000000; - off_t len = 1+rand() % 100000; - char *a = "one"; - if (rand() % 2) a = "two"; - int l = 3;//rand() % 10; - - switch (rand() % 10) { - case 0: - { - oid.rev = rand() % 10; - cout << t << " read " << hex << oid << dec << " at " << off << " len " << len << endl; - bufferlist bl; - fs.read(oid, off, len, bl); - int l = MIN(len,bl.length()); - if (l) { - cout << t << " got " << l << endl; - bl.copy(0, l, b); - char *p = b; - while (l--) { - assert(*p == 0 || - *p == (char)(off ^ oid.ino)); - off++; - p++; - } - } - } - break; - - case 1: - { - cout << t << " write " << hex << oid << dec << " at " << off << " len " << len << endl; - for (int j=0;j args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - // args - if (args.size() != 3) return -1; - char *filename = args[0]; - int seconds = atoi(args[1]); - int threads = atoi(args[2]); - - cout << "dev " << filename 
<< " .. " << threads << " threads .. " << seconds << " seconds" << endl; - - Ebofs fs(filename); - if (fs.mount() < 0) return -1; - - - // explicit tests - if (1) { - // verify that clone() plays nice with partial writes - object_t oid(1,1); - bufferptr bp(10000); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - fs.write(oid, 0, 10000, bl, 0); - - fs.sync(); - fs.trim_buffer_cache(); - - // induce a partial write - bufferlist bl2; - bl2.substr_of(bl, 0, 100); - fs.write(oid, 100, 100, bl2, 0); - - // clone it - object_t oid2; - oid2 = oid; - oid2.rev = 1; - fs.clone(oid, oid2, 0); - - // ... - if (0) { - // make sure partial still behaves after orig is removed... - fs.remove(oid, 0); - - // or i read for oid2... - bufferlist rbl; - fs.read(oid2, 0, 200, rbl); - } - if (1) { - // make sure things behave if we remove the clone - fs.remove(oid2,0); - } - } - // /explicit tests - - list ls; - for (int i=0; icreate(); - ls.push_back(t); - } - - utime_t now = g_clock.now(); - utime_t dur(seconds,0); - utime_t end = now + dur; - cout << "stop at " << end << endl; - while (now < end) { - sleep(1); - now = g_clock.now(); - cout << now << endl; - } - - cout << "stopping" << endl; - stop = true; - - while (!ls.empty()) { - Tester *t = ls.front(); - ls.pop_front(); - t->join(); - delete t; - } - - fs.umount(); - return 0; -} - diff --git a/tags/20070517_before_mds_merge/ebofs/types.h b/tags/20070517_before_mds_merge/ebofs/types.h deleted file mode 100644 index 1b85d138ec342..0000000000000 --- a/tags/20070517_before_mds_merge/ebofs/types.h +++ /dev/null @@ -1,168 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __EBOFS_TYPES_H -#define __EBOFS_TYPES_H - -#include -#include "include/buffer.h" -#include "include/Context.h" -#include "common/Cond.h" - -#include -#include -#include -#include -using namespace std; -using namespace __gnu_cxx; - - -#include "include/object.h" - - -#ifndef MIN -# define MIN(a,b) ((a)<=(b) ? (a):(b)) -#endif -#ifndef MAX -# define MAX(a,b) ((a)>=(b) ? (a):(b)) -#endif - - -/* -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(unsigned long long __x) const { - static hash H; - return H((__x >> 32) ^ (__x & 0xffffffff)); - } - }; - - template<> struct hash< std::string > - { - size_t operator()( const std::string& x ) const - { - static hash H; - return H(x.c_str()); - } - }; -} -*/ - - -// disk -typedef __uint64_t block_t; // disk location/sector/block - -static const int EBOFS_BLOCK_SIZE = 4096; -static const int EBOFS_BLOCK_BITS = 12; // 1<<12 == 4096 - -class Extent { - public: - block_t start, length; - - Extent() : start(0), length(0) {} - Extent(block_t s, block_t l) : start(s), length(l) {} - - block_t last() const { return start + length - 1; } - block_t end() const { return start + length; } -}; - -inline ostream& operator<<(ostream& out, Extent& ex) -{ - return out << ex.start << "~" << ex.length; -} - - -// tree/set nodes -typedef int nodeid_t; - -static const int EBOFS_NODE_BLOCKS = 1; -static const int EBOFS_NODE_BYTES = EBOFS_NODE_BLOCKS * EBOFS_BLOCK_SIZE; -static const int EBOFS_MAX_NODE_REGIONS = 10; // pick a better value! - -struct ebofs_nodepool { - Extent node_usemap_even; // for even sb versions - Extent node_usemap_odd; // for odd sb versions - - int num_regions; - Extent region_loc[EBOFS_MAX_NODE_REGIONS]; -}; - - -// objects - -typedef __uint64_t coll_t; - -struct ebofs_onode { - Extent onode_loc; /* this is actually the block we live in */ - - object_t object_id; /* for kicks */ - off_t object_size; /* file size in bytes. should this be 64-bit? 
*/ - unsigned object_blocks; - bool readonly; - - int num_collections; - int num_attr; // num attr in onode - int num_extents; /* number of extents used. if 0, data is in the onode */ -}; - -struct ebofs_cnode { - Extent cnode_loc; /* this is actually the block we live in */ - coll_t coll_id; - int num_attr; // num attr in cnode -}; - - -// table -struct ebofs_table { - nodeid_t root; /* root node of btree */ - int num_keys; - int depth; -}; - - -// super -typedef __uint64_t version_t; - -static const unsigned EBOFS_MAGIC = 0x000EB0F5; - -static const int EBOFS_NUM_FREE_BUCKETS = 5; /* see alloc.h for bucket constraints */ -static const int EBOFS_FREE_BUCKET_BITS = 2; - - -struct ebofs_super { - unsigned s_magic; - - unsigned epoch; // version of this superblock. - - unsigned num_blocks; /* # blocks in filesystem */ - - // some basic stats, for kicks - unsigned free_blocks; /* unused blocks */ - unsigned limbo_blocks; /* limbo blocks */ - //unsigned num_objects; - //unsigned num_fragmented; - - struct ebofs_nodepool nodepool; - - // tables - struct ebofs_table free_tab[EBOFS_NUM_FREE_BUCKETS]; - struct ebofs_table limbo_tab; - struct ebofs_table alloc_tab; - struct ebofs_table object_tab; // object directory - struct ebofs_table collection_tab; // collection directory - struct ebofs_table co_tab; -}; - - -#endif diff --git a/tags/20070517_before_mds_merge/fakefuse.cc b/tags/20070517_before_mds_merge/fakefuse.cc deleted file mode 100644 index a9f98e9d5bb08..0000000000000 --- a/tags/20070517_before_mds_merge/fakefuse.cc +++ /dev/null @@ -1,156 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/Monitor.h" - -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "client/Client.h" -#include "client/fuse.h" - -#include "common/Timer.h" - -#include "msg/FakeMessenger.h" - - - - -#define NUMMDS g_conf.num_mds -#define NUMOSD g_conf.num_osd -#define NUMCLIENT g_conf.num_client - - -class C_Test : public Context { -public: - void finish(int r) { - cout << "C_Test->finish(" << r << ")" << endl; - } -}; -class C_Test2 : public Context { -public: - void finish(int r) { - cout << "C_Test2->finish(" << r << ")" << endl; - g_timer.add_event_after(2, new C_Test); - } -}; - - - -int main(int argc, char **argv) { - cerr << "fakefuse starting" << endl; - - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - // start messenger thread - fakemessenger_startthread(); - - //g_timer.add_event_after(5.0, new C_Test2); - //g_timer.add_event_after(10.0, new C_Test); - - vector nargs; - for (unsigned i=0; iinit(); - } - for (int i=0; iinit(); - } - - for (int i=0; iinit(); - } - - - // create client - Client *client[NUMCLIENT]; - for (int i=0; iinit(); - - - // start up fuse - // use my argc, argv (make sure you pass a mount point!) - cout << "starting fuse on pid " << getpid() << endl; - client[i]->mount(); - - char *oldcwd = get_current_dir_name(); // note previous wd - ceph_fuse_main(client[i], argc, argv); - ::chdir(oldcwd); // return to previous wd - - client[i]->unmount(); - cout << "fuse finished on pid " << getpid() << endl; - client[i]->shutdown(); - } - - - - // wait for it to finish - cout << "DONE -----" << endl; - fakemessenger_wait(); // blocks until messenger stops - - - // cleanup - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "mon/Monitor.h" -#include "client/Client.h" - -#include "client/SyntheticClient.h" - -#include "msg/FakeMessenger.h" - -#include "common/Timer.h" - - -class C_Test : public Context { -public: - void finish(int r) { - cout << "C_Test->finish(" << r << ")" << endl; - } -}; - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << endl; - exit(1); - } -}; - - -int main(int argc, char **argv) -{ - cerr << "fakesyn start" << endl; - - //cerr << "inode_t " << sizeof(inode_t) << endl; - - vector args; - argv_to_vec(argc, argv, args); - - parse_config_options(args); - - int start = 0; - - parse_syn_options(args); - - vector nargs; - - for (unsigned i=0; imon_inst[0] = entity_inst_t(MSG_ADDR_MON(0), a); // hack ; see FakeMessenger.cc - - char hostname[100]; - gethostname(hostname,100); - //int pid = getpid(); - - // create mon - Monitor *mon[g_conf.num_mon]; - for (int i=0; iinit(); - } - for (int i=0; iinit(); - if (g_conf.mds_local_osd) - mdsosd[i]->init(); - } - - for (int i=0; iinit(); - } - - - // create client(s) - for (int i=0; iinit(); - - // use my argc, argv (make sure you pass a mount point!) - //cout << "mounting" << endl; - client[i]->mount(); - - //cout << "starting synthetic client " << endl; - syn[i] = new SyntheticClient(client[i]); - - syn[i]->start_thread(); - } - - - for (int i=0; ijoin_thread(); - delete syn[i]; - - client[i]->unmount(); - //cout << "unmounted" << endl; - client[i]->shutdown(); - } - - - // wait for it to finish - fakemessenger_wait(); - - // cleanup - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __CONTEXT_H -#define __CONTEXT_H - -#include "config.h" - -#include -#include -#include - -#include - - -/* - * Context - abstract callback class - */ -class Context { - public: - virtual ~Context() {} // we want a virtual destructor!!! - virtual void finish(int r) = 0; -}; - - -/* - * finish and destroy a list of Contexts - */ -inline void finish_contexts(std::list& finished, - int result = 0) -{ - using std::cout; - using std::endl; - - if (finished.empty()) return; - - dout(10) << finished.size() << " contexts to finish with " << result << endl; - for (std::list::iterator it = finished.begin(); - it != finished.end(); - it++) { - Context *c = *it; - dout(10) << "---- " << c << endl; - c->finish(result); - delete c; - } -} - -/* - * C_Contexts - set of Contexts - */ -class C_Contexts : public Context { - std::list clist; - -public: - void add(Context* c) { - clist.push_back(c); - } - void take(std::list& ls) { - clist.splice(clist.end(), ls); - } - void finish(int r) { - finish_contexts(clist, r); - } -}; - - -/* - * C_Gather - * - * BUG: does not report errors. - */ -class C_Gather : public Context { -public: - bool sub_finish(int r) { - //cout << "C_Gather sub_finish " << this << endl; - assert(waitfor.count(r)); - waitfor.erase(r); - if (!waitfor.empty()) - return false; // more subs left - - // last one - onfinish->finish(0); - delete onfinish; - onfinish = 0; - return true; - } - - class C_GatherSub : public Context { - C_Gather *gather; - int num; - public: - C_GatherSub(C_Gather *g, int n) : gather(g), num(n) {} - void finish(int r) { - if (gather->sub_finish(num)) - delete gather; // last one! 
- } - }; - - Context *new_sub() { - num++; - waitfor.insert(num); - return new C_GatherSub(this, num); - } - -private: - Context *onfinish; - std::set waitfor; - int num; - -public: - C_Gather(Context *f) : onfinish(f), num(0) { - //cout << "C_Gather new " << this << endl; - } - ~C_Gather() { - //cout << "C_Gather delete " << this << endl; - assert(!onfinish); - } - void finish(int r) { - // nobody should ever call me. - assert(0); - } - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/include/Distribution.h b/tags/20070517_before_mds_merge/include/Distribution.h deleted file mode 100644 index 00f352d59efab..0000000000000 --- a/tags/20070517_before_mds_merge/include/Distribution.h +++ /dev/null @@ -1,74 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __DISTRIBUTION_H -#define __DISTRIBUTION_H - -#include -#include -using namespace std; - -class Distribution { - vector p; - vector v; - - public: - //Distribution() { - //} - - unsigned get_width() { - return p.size(); - } - - void clear() { - p.clear(); - v.clear(); - } - void add(int val, float pr) { - p.push_back(pr); - v.push_back(val); - } - - void random() { - float sum = 0.0; - for (unsigned i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __BUFFER_H -#define __BUFFER_H - -#include "common/Mutex.h" - -#include -#include - -using std::cout; -using std::endl; - -#ifndef __CYGWIN__ -# include -#endif - -#define BUFFER_PAGE_SIZE 4096 // fixme. 
- -// -// these are in config.o -extern Mutex bufferlock; -extern long buffer_total_alloc; -// - -class buffer { -private: - - /* hack for memory utilization debugging. */ - static void inc_total_alloc(unsigned len) { - bufferlock.Lock(); - buffer_total_alloc += len; - bufferlock.Unlock(); - } - static void dec_total_alloc(unsigned len) { - bufferlock.Lock(); - buffer_total_alloc -= len; - bufferlock.Unlock(); - } - - /* - * an abstract raw buffer. with a reference count. - */ - class raw { - public: - char *data; - unsigned len; - int nref; - Mutex lock; // we'll make it non-recursive. - - raw(unsigned l) : len(l), nref(0), lock(false) {} - raw(char *c, unsigned l) : data(c), len(l), nref(0), lock(false) {} - virtual ~raw() {}; - - // no copying. - raw(const raw &other); - const raw& operator=(const raw &other); - - virtual raw* clone_empty() = 0; - raw *clone() { - raw *c = clone_empty(); - memcpy(c->data, data, len); - return c; - } - }; - - friend std::ostream& operator<<(std::ostream& out, const raw &r); - - /* - * primitive buffer types - */ - class raw_char : public raw { - public: - raw_char(unsigned l) : raw(l) { - data = new char[len]; - inc_total_alloc(len); - } - ~raw_char() { - delete[] data; - dec_total_alloc(len); - } - raw* clone_empty() { - return new raw_char(len); - } - }; - - class raw_static : public raw { - public: - raw_static(const char *d, unsigned l) : raw((char*)d, l) { } - ~raw_static() {} - raw* clone_empty() { - return new raw_char(len); - } - }; - -#ifndef __CYGWIN__ - class raw_mmap_pages : public raw { - public: - raw_mmap_pages(unsigned l) : raw(l) { - data = (char*)::mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0); - inc_total_alloc(len); - } - ~raw_mmap_pages() { - ::munmap(data, len); - dec_total_alloc(len); - } - raw* clone_empty() { - return new raw_mmap_pages(len); - } - }; - - class raw_posix_aligned : public raw { - public: - raw_posix_aligned(unsigned l) : raw(l) { -#ifdef DARWIN - data = (char *) valloc 
(len); -#else - ::posix_memalign((void**)&data, BUFFER_PAGE_SIZE, len); -#endif /* DARWIN */ - inc_total_alloc(len); - } - ~raw_posix_aligned() { - ::free((void*)data); - dec_total_alloc(len); - } - raw* clone_empty() { - return new raw_posix_aligned(len); - } - }; -#endif - -#ifdef __CYGWIN__ - class raw_hack_aligned : public raw { - char *realdata; - public: - raw_hack_aligned(unsigned l) : raw(l) { - realdata = new char[len+4095]; - unsigned off = ((unsigned)realdata) % 4096; - if (off) - data = realdata + 4096 - off; - else - data = realdata; - inc_total_alloc(len+4095); - //cout << "hack aligned " << (unsigned)data - //<< " in raw " << (unsigned)realdata - //<< " off " << off << endl; - assert(((unsigned)data & 4095) == 0); - } - ~raw_hack_aligned() { - delete[] realdata; - dec_total_alloc(len+4095); - } - raw* clone_empty() { - return new raw_hack_aligned(len); - } - }; -#endif - -public: - - /* - * named constructors - */ - - static raw* copy(const char *c, unsigned len) { - raw* r = new raw_char(len); - memcpy(r->data, c, len); - return r; - } - static raw* create(unsigned len) { - return new raw_char(len); - } - - static raw* create_page_aligned(unsigned len) { -#ifndef __CYGWIN__ - return new raw_mmap_pages(len); -#else - return new raw_hack_aligned(len); -#endif - } - - - /* - * a buffer pointer. references (a subsequence of) a raw buffer. - */ - class ptr { - raw *_raw; - unsigned _off, _len; - - public: - ptr() : _raw(0), _off(0), _len(0) {} - ptr(raw *r) : _raw(r), _off(0), _len(r->len) { // no lock needed; this is an unref raw. - ++r->nref; - } - ptr(unsigned l) : _off(0), _len(l) { - _raw = create(l); - ++_raw->nref; - } - ptr(char *d, unsigned l) : _off(0), _len(l) { // ditto. 
- _raw = copy(d, l); - ++_raw->nref; - } - ptr(const ptr& p) : _raw(p._raw), _off(p._off), _len(p._len) { - if (_raw) { - _raw->lock.Lock(); - ++_raw->nref; - _raw->lock.Unlock(); - } - } - ptr(const ptr& p, unsigned o, unsigned l) : _raw(p._raw), _off(p._off + o), _len(l) { - assert(o+l <= p._len); - assert(_raw); - _raw->lock.Lock(); - ++_raw->nref; - _raw->lock.Unlock(); - } - ptr& operator= (const ptr& p) { - // be careful -- we need to properly handle self-assignment. - if (p._raw) { - p._raw->lock.Lock(); - ++p._raw->nref; // inc new - p._raw->lock.Unlock(); - } - release(); // dec (+ dealloc) old (if any) - _raw = p._raw; // change my ref - _off = p._off; - _len = p._len; - return *this; - } - ~ptr() { - release(); - } - - void release() { - if (_raw) { - _raw->lock.Lock(); - if (--_raw->nref == 0) { - //cout << "hosing raw " << (void*)_raw << " len " << _raw->len << std::endl; - _raw->lock.Unlock(); - delete _raw; // dealloc old (if any) - } else - _raw->lock.Unlock(); - _raw = 0; - } - } - - // misc - bool at_buffer_head() const { return _off == 0; } - bool at_buffer_tail() const { return _off + _len == _raw->len; } - - // accessors - const char *c_str() const { assert(_raw); return _raw->data + _off; } - char *c_str() { assert(_raw); return _raw->data + _off; } - unsigned length() const { return _len; } - unsigned offset() const { return _off; } - unsigned unused_tail_length() const { return _raw->len - (_off+_len); } - const char& operator[](unsigned n) const { - assert(_raw); - assert(n < _len); - return _raw->data[_off + n]; - } - char& operator[](unsigned n) { - assert(_raw); - assert(n < _len); - return _raw->data[_off + n]; - } - - const char *raw_c_str() const { assert(_raw); return _raw->data; } - unsigned raw_length() const { assert(_raw); return _raw->len; } - int raw_nref() const { assert(_raw); return _raw->nref; } - - void copy_out(unsigned o, unsigned l, char *dest) const { - assert(_raw); - assert(o >= 0 && o <= _len); - assert(l >= 0 && 
o+l <= _len); - memcpy(dest, c_str()+o, l); - } - - unsigned wasted() { - assert(_raw); - return _raw->len - _len; - } - - // modifiers - void set_offset(unsigned o) { _off = o; } - void set_length(unsigned l) { _len = l; } - - void append(const char *p, unsigned l) { - assert(_raw); - assert(l <= unused_tail_length()); - memcpy(c_str() + _len, p, l); - _len += l; - } - - void copy_in(unsigned o, unsigned l, const char *src) { - assert(_raw); - assert(o >= 0 && o <= _len); - assert(l >= 0 && o+l <= _len); - memcpy(c_str()+o, src, l); - } - - void zero() { - memset(c_str(), 0, _len); - } - - void clean() { - //raw *newraw = _raw->makesib(_len); - } - }; - - friend std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp); - - /* - * list - the useful bit! - */ - - class list { - // my private bits - std::list _buffers; - unsigned _len; - - public: - // cons/des - list() : _len(0) {} - list(const list& other) : _buffers(other._buffers), _len(other._len) { } - list(unsigned l) : _len(0) { - ptr bp(l); - push_back(bp); - } - ~list() {} - - list& operator= (const list& other) { - _buffers = other._buffers; - _len = other._len; - return *this; - } - - const std::list& buffers() const { return _buffers; } - - unsigned length() const { -#if 0 - // DEBUG: verify _len - unsigned len = 0; - for (std::list::iterator it = _buffers.begin(); - it != _buffers.end(); - it++) { - len += (*it).length(); - } - assert(len == _len); -#endif - return _len; - } - - - // modifiers - void clear() { - _buffers.clear(); - _len = 0; - } - void push_front(ptr& bp) { - _buffers.push_front(bp); - _len += bp.length(); - } - void push_front(raw *r) { - ptr bp(r); - _buffers.push_front(bp); - _len += bp.length(); - } - void push_back(ptr& bp) { - _buffers.push_back(bp); - _len += bp.length(); - } - void push_back(raw *r) { - ptr bp(r); - _buffers.push_back(bp); - _len += bp.length(); - } - void zero() { - for (std::list::iterator it = _buffers.begin(); - it != _buffers.end(); - it++) - 
it->zero(); - } - - // sort-of-like-assignment-op - void claim(list& bl) { - // free my buffers - clear(); - claim_append(bl); - } - void claim_append(list& bl) { - // steal the other guy's buffers - _len += bl._len; - _buffers.splice( _buffers.end(), bl._buffers ); - bl._len = 0; - } - - // crope lookalikes - void copy(unsigned off, unsigned len, char *dest) { - assert(off >= 0); - assert(off + len <= length()); - /*assert(off < length()); - if (off + len > length()) - len = length() - off; - */ - // advance to off - std::list::iterator curbuf = _buffers.begin(); - - // skip off - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! - break; - } - } - - // copy - while (len > 0) { - // is the rest ALL in this buffer? - if (off + len <= (*curbuf).length()) { - (*curbuf).copy_out(off, len, dest); // yup, last bit! - break; - } - - // get as much as we can from this buffer. - unsigned howmuch = (*curbuf).length() - off; - (*curbuf).copy_out(off, howmuch, dest); - - dest += howmuch; - len -= howmuch; - off = 0; - curbuf++; - assert(curbuf != _buffers.end()); - } - } - - void copy_in(unsigned off, unsigned len, const char *src) { - assert(off >= 0); - assert(off + len <= length()); - - // advance to off - std::list::iterator curbuf = _buffers.begin(); - - // skip off - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! - break; - } - } - - // copy - while (len > 0) { - // is the rest ALL in this buffer? - if (off + len <= (*curbuf).length()) { - (*curbuf).copy_in(off, len, src); // yup, last bit! - break; - } - - // get as much as we can from this buffer. 
- unsigned howmuch = (*curbuf).length() - off; - (*curbuf).copy_in(off, howmuch, src); - - src += howmuch; - len -= howmuch; - off = 0; - curbuf++; - assert(curbuf != _buffers.end()); - } - } - void copy_in(unsigned off, unsigned len, const list& bl) { - unsigned left = len; - for (std::list::const_iterator i = bl._buffers.begin(); - i != bl._buffers.end(); - i++) { - unsigned l = (*i).length(); - if (left < l) l = left; - copy_in(off, l, (*i).c_str()); - left -= l; - if (left == 0) break; - off += l; - } - } - - - void append(const char *data, unsigned len) { - if (len == 0) return; - - unsigned alen = 0; - - // copy into the tail buffer? - if (!_buffers.empty()) { - unsigned avail = _buffers.back().unused_tail_length(); - if (avail > 0) { - //std::cout << "copying up to " << len << " into tail " << avail << " bytes of tail buf " << _buffers.back() << std::endl; - if (avail > len) - avail = len; - _buffers.back().append(data, avail); - _len += avail; - data += avail; - len -= avail; - } - alen = _buffers.back().length(); - } - if (len == 0) return; - - // just add another buffer. - // alloc a bit extra, in case we do a bunch of appends. FIXME be smarter! - if (alen < 4096) alen = 4096; - ptr bp = create(alen); - bp.set_length(len); - bp.copy_in(0, len, data); - push_back(bp); - } - void append(ptr& bp) { - push_back(bp); - } - void append(ptr& bp, unsigned off, unsigned len) { - assert(len+off <= bp.length()); - ptr tempbp(bp, off, len); - push_back(tempbp); - } - void append(const list& bl) { - list temp(bl); // copy list - claim_append(temp); // and append - } - - - /* - * get a char - */ - const char& operator[](unsigned n) { - assert(n < _len); - for (std::list::iterator p = _buffers.begin(); - p != _buffers.end(); - p++) { - if (n >= p->length()) { - n -= p->length(); - continue; - } - return (*p)[n]; - } - assert(0); - } - - /* - * return a contiguous ptr to whole bufferlist contents. 
- */ - char *c_str() { - if (_buffers.size() == 1) { - return _buffers.front().c_str(); // good, we're already contiguous. - } - else if (_buffers.size() == 0) { - return 0; // no buffers - } - else { - ptr newbuf = create(length()); // make one new contiguous buffer. - copy(0, length(), newbuf.c_str()); // copy myself into it. - clear(); - push_back(newbuf); - return newbuf.c_str(); // now it'll work. - } - } - - void substr_of(list& other, unsigned off, unsigned len) { - assert(off + len <= other.length()); - clear(); - - // skip off - std::list::iterator curbuf = other._buffers.begin(); - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - //cout << "skipping over " << *curbuf << endl; - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! - //cout << "somewhere in " << *curbuf << endl; - break; - } - } - - while (len > 0) { - // partial? - if (off + len < (*curbuf).length()) { - //cout << "copying partial of " << *curbuf << endl; - _buffers.push_back( ptr( *curbuf, off, len ) ); - _len += len; - break; - } - - // through end - //cout << "copying end (all?) of " << *curbuf << endl; - unsigned howmuch = (*curbuf).length() - off; - _buffers.push_back( ptr( *curbuf, off, howmuch ) ); - _len += howmuch; - len -= howmuch; - off = 0; - curbuf++; - } - } - - - // funky modifer - void splice(unsigned off, unsigned len, list *claim_by=0 /*, bufferlist& replace_with */) { // fixme? - assert(off < length()); - assert(len > 0); - //cout << "splice off " << off << " len " << len << " ... mylen = " << length() << endl; - - // skip off - std::list::iterator curbuf = _buffers.begin(); - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - //cout << "off = " << off << " skipping over " << *curbuf << endl; - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! 
- //cout << "off = " << off << " somewhere in " << *curbuf << endl; - break; - } - } - assert(off >= 0); - - if (off) { - // add a reference to the front bit - // insert it before curbuf (which we'll hose) - //cout << "keeping front " << off << " of " << *curbuf << endl; - _buffers.insert( curbuf, ptr( *curbuf, 0, off ) ); - _len += off; - } - - while (len > 0) { - // partial? - if (off + len < (*curbuf).length()) { - //cout << "keeping end of " << *curbuf << ", losing first " << off+len << endl; - if (claim_by) - claim_by->append( *curbuf, off, len ); - (*curbuf).set_offset( off+len + (*curbuf).offset() ); // ignore beginning big - (*curbuf).set_length( (*curbuf).length() - (len+off) ); - _len -= off+len; - //cout << " now " << *curbuf << endl; - break; - } - - // hose though the end - unsigned howmuch = (*curbuf).length() - off; - //cout << "discarding " << howmuch << " of " << *curbuf << endl; - if (claim_by) - claim_by->append( *curbuf, off, howmuch ); - _len -= (*curbuf).length(); - _buffers.erase( curbuf++ ); - len -= howmuch; - off = 0; - } - - // splice in *replace (implement me later?) 
- } - - }; - -}; - -typedef buffer::ptr bufferptr; -typedef buffer::list bufferlist; - - -inline bool operator>(bufferlist& l, bufferlist& r) { - for (unsigned p = 0; ; p++) { - if (l.length() > p && r.length() == p) return true; - if (l.length() == p) return false; - if (l[p] > r[p]) return true; - if (l[p] < r[p]) return false; - p++; - } -} -inline bool operator>=(bufferlist& l, bufferlist& r) { - for (unsigned p = 0; ; p++) { - if (l.length() > p && r.length() == p) return true; - if (r.length() == p && l.length() == p) return true; - if (l[p] > r[p]) return true; - if (l[p] < r[p]) return false; - p++; - } -} -inline bool operator<(bufferlist& l, bufferlist& r) { - return r > l; -} -inline bool operator<=(bufferlist& l, bufferlist& r) { - return r >= l; -} - - -inline std::ostream& operator<<(std::ostream& out, const buffer::raw &r) { - return out << "buffer::raw(" << (void*)r.data << " len " << r.len << " nref " << r.nref << ")"; -} - -inline std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp) { - out << "buffer::ptr(" << bp.offset() << "~" << bp.length() - << " " << (void*)bp.c_str() - << " in raw " << (void*)bp.raw_c_str() - << " len " << bp.raw_length() - << " nref " << bp.raw_nref() << ")"; - return out; -} - -inline std::ostream& operator<<(std::ostream& out, const buffer::list& bl) { - out << "buffer::list(len=" << bl.length() << "," << std::endl; - - std::list::const_iterator it = bl.buffers().begin(); - while (it != bl.buffers().end()) { - out << "\t" << *it; - if (++it == bl.buffers().end()) break; - out << "," << std::endl; - } - out << std::endl << ")"; - return out; -} - - - - -// encoder/decode helpers - -// -- basic types -- -// string -inline void _encode(const std::string& s, bufferlist& bl) -{ - bl.append(s.c_str(), s.length()+1); -} -inline void _decode(std::string& s, bufferlist& bl, int& off) -{ - s = bl.c_str() + off; - off += s.length() + 1; -} - -// bufferptr (encapsulated) -inline void _encode(bufferptr& bp, bufferlist& 
bl) -{ - size_t len = bp.length(); - bl.append((char*)&len, sizeof(len)); - bl.append(bp); -} -inline void _decode(bufferptr& bp, bufferlist& bl, int& off) -{ - size_t len; - bl.copy(off, sizeof(len), (char*)&len); - off += sizeof(len); - bufferlist s; - s.substr_of(bl, off, len); - off += len; - - if (s.buffers().size() == 1) - bp = s.buffers().front(); - else - bp = buffer::copy(s.c_str(), s.length()); -} - -// bufferlist (encapsulated) -inline void _encode(const bufferlist& s, bufferlist& bl) -{ - size_t len = s.length(); - bl.append((char*)&len, sizeof(len)); - bl.append(s); -} -inline void _decode(bufferlist& s, bufferlist& bl, int& off) -{ - size_t len; - bl.copy(off, sizeof(len), (char*)&len); - off += sizeof(len); - s.substr_of(bl, off, len); - off += len; -} - - -#include -#include -#include -#include - -// set -inline void _encode(const std::set& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (std::set::const_iterator it = s.begin(); - it != s.end(); - it++) { - ::_encode(*it, bl); - n--; - } - assert(n==0); -} -inline void _decode(std::set& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -inline void _encode(const std::list& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (std::list::const_iterator it = s.begin(); - it != s.end(); - it++) { - ::_encode(*it, bl); - n--; - } - assert(n==0); -} -inline void _decode(std::list& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -template -inline void _encode(const std::set& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename std::set::const_iterator it = s.begin(); - it != s.end(); - it++) { - T v = *it; - bl.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); -} -template -inline void _decode(std::set& s, bufferlist& bl, int& off) -{ - 
s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -template -inline void _encode(std::vector& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename std::vector::iterator it = s.begin(); - it != s.end(); - it++) { - T v = *it; - bl.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); -} -template -inline void _decode(std::vector& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - s = std::vector(n); - for (int i=0; i -template -inline void _encode(const std::list& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename std::list::const_iterator it = s.begin(); - it != s.end(); - it++) { - T v = *it; - bl.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); -} -template -inline void _decode(std::list& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -inline void _encode(std::map& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (std::map::iterator it = s.begin(); - it != s.end(); - it++) { - _encode(it->first, bl); - _encode(it->second, bl); - n--; - } - assert(n==0); -} -inline void _decode(std::map& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -template -inline void _encode(const std::map& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - //std::cout << "n = " << n << std::endl; - for (typename std::map::const_iterator it = s.begin(); - it != s.end(); - it++) { - T k = it->first; - bl.append((char*)&k, sizeof(k)); - _encode(it->second, bl); - n--; - //std::cout << "--n = " << n << " after k " << k << std::endl; - } - assert(n==0); -} -template -inline void _decode(std::map& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, 
sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -template -inline void _encode(const std::map& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename std::map::const_iterator it = s.begin(); - it != s.end(); - it++) { - ::_encode(it->first, bl); - U v = it->second; - bl.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); -} -template -inline void _decode(std::map& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i> -template -inline void _encode(const std::map >& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename std::map >::const_iterator it = s.begin(); - it != s.end(); - it++) { - T k = it->first; - bl.append((char*)&k, sizeof(k)); - ::_encode(it->second, bl); - n--; - } - assert(n==0); -} -template -inline void _decode(std::map >& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -template -inline void _encode(const std::map& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename std::map::const_iterator it = s.begin(); - it != s.end(); - it++) { - T k = it->first; - U v = it->second; - bl.append((char*)&k, sizeof(k)); - bl.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); -} -template -inline void _decode(std::map& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define SYSERROR() syserror("At %s:%d", __FILE__, __LINE__) - -#define ASSERT(c) \ - ((c) || (exiterror("Assertion failed at %s:%d", __FILE__, __LINE__), 1)) - -/* print usage error message and exit */ -extern void userror(const char *use, const char *fmt, ...); - -/* print system error message and exit */ -extern void syserror(const char *fmt, ...); - -/* print error message and exit */ -extern void exiterror(const char *fmt, ...); - -/* print error message */ -extern void error(const char *fmt, ...); - -#ifdef __cplusplus -} // extern "C" -#endif diff --git a/tags/20070517_before_mds_merge/include/filepath.h b/tags/20070517_before_mds_merge/include/filepath.h deleted file mode 100644 index 5585e536b42db..0000000000000 --- a/tags/20070517_before_mds_merge/include/filepath.h +++ /dev/null @@ -1,206 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __FILEPATH_H -#define __FILEPATH_H - - -/* - * BUG: /a/b/c is equivalent to a/b/c in dentry-breakdown, but not string. - * -> should it be different? how? should this[0] be "", with depth 4? 
- * - */ - - -#include -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "buffer.h" - - -class filepath { - string path; - vector bits; - - void rebuild() { - if (absolute()) - path = "/"; - else - path.clear(); - for (unsigned i=0; i::iterator it = bits.begin(); - it != bits.end(); - it++) { - r.append((*it).c_str(), (*it).length()+1); - } - } - - void _unrope(crope& r, int& off) { - clear(); - - char n; - r.copy(off, sizeof(char), (char*)&n); - off += sizeof(char); - for (int i=0; i::iterator it = bits.begin(); - it != bits.end(); - it++) { - bl.append((*it).c_str(), (*it).length()+1); - } - } - - void _decode(bufferlist& bl, int& off) { - clear(); - - char n; - bl.copy(off, sizeof(char), (char*)&n); - off += sizeof(char); - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __INTERVAL_SET_H -#define __INTERVAL_SET_H - -#include -#include -#include -using namespace std; - -#ifndef MIN -# define MIN(a,b) ((a)<=(b) ? (a):(b)) -#endif -#ifndef MAX -# define MAX(a,b) ((a)>=(b) ? (a):(b)) -#endif - - -template -class interval_set { - public: - map m; // map start -> len - - // helpers - private: - typename map::const_iterator find_inc(T start) const { - typename map::const_iterator p = m.lower_bound(start); // p->first >= start - if (p != m.begin() && - (p == m.end() || p->first > start)) { - p--; // might overlap? - if (p->first + p->second <= start) - p++; // it doesn't. - } - return p; - } - - typename map::iterator find_inc_m(T start) { - typename map::iterator p = m.lower_bound(start); - if (p != m.begin() && - (p == m.end() || p->first > start)) { - p--; // might overlap? - if (p->first + p->second <= start) - p++; // it doesn't. 
- } - return p; - } - - typename map::const_iterator find_adj(T start) const { - typename map::const_iterator p = m.lower_bound(start); - if (p != m.begin() && - (p == m.end() || p->first > start)) { - p--; // might touch? - if (p->first + p->second < start) - p++; // it doesn't. - } - return p; - } - - typename map::iterator find_adj_m(T start) { - typename map::iterator p = m.lower_bound(start); - if (p != m.begin() && - (p == m.end() || p->first > start)) { - p--; // might touch? - if (p->first + p->second < start) - p++; // it doesn't. - } - return p; - } - - public: - bool operator==(const interval_set& other) const { - return m == other.m; - } - - void clear() { - m.clear(); - } - - bool contains(T i) const { - typename map::const_iterator p = find_inc(i); - if (p == m.end()) return false; - if (p->first > i) return false; - if (p->first+p->second <= i) return false; - assert(p->first <= i && p->first+p->second > i); - return true; - } - bool contains(T start, T len) const { - typename map::const_iterator p = find_inc(start); - if (p == m.end()) return false; - if (p->first > start) return false; - if (p->first+p->second <= start) return false; - assert(p->first <= start && p->first+p->second > start); - if (p->first+p->second < start+len) return false; - return true; - } - bool intersects(T start, T len) const { - interval_set a; - a.insert(start, len); - interval_set i; - i.intersection_of( *this, a ); - if (i.empty()) return false; - return true; - } - - // outer range of set - bool empty() const { - return m.empty(); - } - T start() const { - assert(!empty()); - typename map::const_iterator p = m.begin(); - return p->first; - } - T end() const { - assert(!empty()); - typename map::const_iterator p = m.end(); - p--; - return p->first+p->second; - } - - // interval start after p (where p not in set) - bool starts_after(T i) const { - assert(!contains(i)); - typename map::const_iterator p = find_inc(i); - if (p == m.end()) return false; - return true; - } - 
T start_after(T i) const { - assert(!contains(i)); - typename map::const_iterator p = find_inc(i); - return p->first; - } - - // interval end that contains start - T end_after(T start) const { - assert(contains(start)); - typename map::const_iterator p = find_inc(start); - return p->first+p->second; - } - - void insert(T val) { - insert(val, 1); - } - - void insert(T start, T len) { - //cout << "insert " << start << "~" << len << endl; - assert(len > 0); - typename map::iterator p = find_adj_m(start); - if (p == m.end()) { - m[start] = len; // new interval - } else { - if (p->first < start) { - - if (p->first + p->second != start) { - //cout << "p is " << p->first << "~" << p->second << ", start is " << start << ", len is " << len << endl; - assert(0); - } - - assert(p->first + p->second == start); - p->second += len; // append to end - - typename map::iterator n = p; - n++; - if (n != m.end() && - start+len == n->first) { // combine with next, too! - p->second += n->second; - m.erase(n); - } - } else { - if (start+len == p->first) { - m[start] = len + p->second; // append to front - m.erase(p); - } else { - assert(p->first > start+len); - m[start] = len; // new interval - } - } - } - } - - void erase(T val) { - erase(val, 1); - } - - void erase(T start, T len) { - typename map::iterator p = find_inc_m(start); - - assert(p != m.end()); - assert(p->first <= start); - - T before = start - p->first; - assert(p->second >= before+len); - T after = p->second - before - len; - - if (before) - p->second = before; // shorten bit before - else - m.erase(p); - if (after) - m[start+len] = after; - } - - - void subtract(const interval_set &a) { - for (typename map::const_iterator p = a.m.begin(); - p != a.m.end(); - p++) - erase(p->first, p->second); - } - - void insert(const interval_set &a) { - for (typename map::const_iterator p = a.m.begin(); - p != a.m.end(); - p++) - insert(p->first, p->second); - } - - - void intersection_of(const interval_set &a, const interval_set &b) 
{ - assert(&a != this); - assert(&b != this); - clear(); - - typename map::const_iterator pa = a.m.begin(); - typename map::const_iterator pb = b.m.begin(); - - while (pa != a.m.end() && pb != b.m.end()) { - // passing? - if (pa->first + pa->second <= pb->first) - { pa++; continue; } - if (pb->first + pb->second <= pa->first) - { pb++; continue; } - T start = MAX(pa->first, pb->first); - T end = MIN(pa->first+pa->second, pb->first+pb->second); - assert(end > start); - insert(start, end-start); - if (pa->first+pa->second > pb->first+pb->second) - pb++; - else - pa++; - } - } - - void union_of(const interval_set &a, const interval_set &b) { - assert(&a != this); - assert(&b != this); - clear(); - - //cout << "union_of" << endl; - - // a - m = a.m; - - // - (a*b) - interval_set ab; - ab.intersection_of(a, b); - subtract(ab); - - // + b - insert(b); - return; - } - void union_of(const interval_set &b) { - interval_set a; - a.m.swap(m); - union_of(a, b); - } - - bool subset_of(const interval_set &big) const { - for (typename map::const_iterator i = m.begin(); - i != m.end(); - i++) - if (!big.contains(i->first, i->second)) return false; - return true; - } - -}; - -template -inline ostream& operator<<(ostream& out, const interval_set &s) { - out << "["; - for (typename map::const_iterator i = s.m.begin(); - i != s.m.end(); - i++) { - if (i != s.m.begin()) out << ","; - out << i->first << "~" << i->second; - } - out << "]"; - return out; -} - - -#endif diff --git a/tags/20070517_before_mds_merge/include/lru.h b/tags/20070517_before_mds_merge/include/lru.h deleted file mode 100644 index 63096d0e32079..0000000000000 --- a/tags/20070517_before_mds_merge/include/lru.h +++ /dev/null @@ -1,321 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General 
Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __LRU_H -#define __LRU_H - -#include -#include -using namespace std; - -#include "config.h" - - - -class LRUObject { - private: - LRUObject *lru_next, *lru_prev; - bool lru_pinned; - class LRU *lru; - class LRUList *lru_list; - - public: - LRUObject() { - lru_next = lru_prev = NULL; - lru_list = 0; - lru_pinned = false; - lru = 0; - } - - // pin/unpin item in cache - void lru_pin(); - void lru_unpin(); - bool lru_is_expireable() { return !lru_pinned; } - - friend class LRU; - friend class LRUList; -}; - - -class LRUList { - private: - LRUObject *head, *tail; - __uint32_t len; - - public: - LRUList() { - head = tail = 0; - len = 0; - } - - __uint32_t get_length() { return len; } - - LRUObject *get_head() { - return head; - } - LRUObject *get_tail() { - return tail; - } - - void insert_head(LRUObject *o) { - o->lru_next = head; - o->lru_prev = NULL; - if (head) { - head->lru_prev = o; - } else { - tail = o; - } - head = o; - o->lru_list = this; - len++; - } - void insert_tail(LRUObject *o) { - o->lru_next = NULL; - o->lru_prev = tail; - if (tail) { - tail->lru_next = o; - } else { - head = o; - } - tail = o; - o->lru_list = this; - len++; - } - - void remove(LRUObject *o) { - assert(o->lru_list == this); - if (o->lru_next) - o->lru_next->lru_prev = o->lru_prev; - else - tail = o->lru_prev; - if (o->lru_prev) - o->lru_prev->lru_next = o->lru_next; - else - head = o->lru_next; - o->lru_next = o->lru_prev = NULL; - o->lru_list = 0; - assert(len>0); - len--; - } - -}; - - -class LRU { - protected: - LRUList lru_top, lru_bot, lru_pintail; - __uint32_t lru_num, lru_num_pinned; - __uint32_t lru_max; // max items - double lru_midpoint; - - friend class LRUObject; - //friend class MDCache; // hack - - public: - LRU(int max = 0) { - lru_num = 0; - lru_num_pinned = 0; - lru_midpoint = .9; - lru_max = max; - } - - __uint32_t lru_get_size() { return lru_num; 
} - __uint32_t lru_get_top() { return lru_top.get_length(); } - __uint32_t lru_get_bot() { return lru_bot.get_length(); } - __uint32_t lru_get_pintail() { return lru_pintail.get_length(); } - __uint32_t lru_get_max() { return lru_max; } - __uint32_t lru_get_num_pinned() { return lru_num_pinned; } - - void lru_set_max(__uint32_t m) { lru_max = m; } - void lru_set_midpoint(float f) { lru_midpoint = f; } - - - // insert at top of lru - void lru_insert_top(LRUObject *o) { - //assert(!o->lru_in_lru); - //o->lru_in_lru = true; - assert(!o->lru); - o->lru = this; - lru_top.insert_head( o ); - lru_num++; - if (o->lru_pinned) lru_num_pinned++; - lru_adjust(); - } - - // insert at mid point in lru - void lru_insert_mid(LRUObject *o) { - //assert(!o->lru_in_lru); - //o->lru_in_lru = true; - assert(!o->lru); - o->lru = this; - lru_bot.insert_head(o); - lru_num++; - if (o->lru_pinned) lru_num_pinned++; - } - - // insert at bottom of lru - void lru_insert_bot(LRUObject *o) { - assert(!o->lru); - o->lru = this; - lru_bot.insert_tail(o); - lru_num++; - if (o->lru_pinned) lru_num_pinned++; - } - - /* - // insert at bottom of lru - void lru_insert_pintail(LRUObject *o) { - assert(!o->lru); - o->lru = this; - - assert(o->lru_pinned); - - lru_pintail.insert_head(o); - lru_num++; - lru_num_pinned += o->lru_pinned; - } - */ - - - - - // adjust top/bot balance, as necessary - void lru_adjust() { - if (!lru_max) return; - - unsigned toplen = lru_top.get_length(); - unsigned topwant = (unsigned)(lru_midpoint * (double)lru_max); - while (toplen > 0 && - toplen > topwant) { - // remove from tail of top, stick at head of bot - // FIXME: this could be way more efficient by moving a whole chain of items. 
- - LRUObject *o = lru_top.get_tail(); - lru_top.remove(o); - lru_bot.insert_head(o); - toplen--; - } - } - - - // remove an item - LRUObject *lru_remove(LRUObject *o) { - // not in list - //assert(o->lru_in_lru); - //if (!o->lru_in_lru) return o; // might have expired and been removed that way. - if (!o->lru) return o; - - - if (o->lru_list == &lru_top) - lru_top.remove(o); - else if (o->lru_list == &lru_bot) - lru_bot.remove(o); - else if (o->lru_list == &lru_pintail) - lru_pintail.remove(o); - else - assert(0); - - lru_num--; - if (o->lru_pinned) lru_num_pinned--; - o->lru = 0; - return o; - } - - // touch item -- move to head of lru - bool lru_touch(LRUObject *o) { - lru_remove(o); - lru_insert_top(o); - return true; - } - - // touch item -- move to midpoint (unless already higher) - bool lru_midtouch(LRUObject *o) { - if (o->lru_list == &lru_top) return false; - - lru_remove(o); - lru_insert_mid(o); - return true; - } - - // touch item -- move to bottom - bool lru_bottouch(LRUObject *o) { - lru_remove(o); - lru_insert_bot(o); - return true; - } - - - // expire -- expire a single item - LRUObject *lru_get_next_expire() { - LRUObject *p; - - // look through tail of bot - while (lru_bot.get_length()) { - p = lru_bot.get_tail(); - if (!p->lru_pinned) return p; - - // move to pintail - lru_bot.remove(p); - lru_pintail.insert_head(p); - } - - // ok, try head then - while (lru_top.get_length()) { - p = lru_top.get_tail(); - if (!p->lru_pinned) return p; - - // move to pintail - lru_top.remove(p); - lru_pintail.insert_head(p); - } - - // no luck! 
- return NULL; - } - - LRUObject *lru_expire() { - LRUObject *p = lru_get_next_expire(); - if (p) - return lru_remove(p); - return NULL; - } - - - void lru_status() { - dout(10) << "lru: " << lru_num << " items, " << lru_top.get_length() << " top, " << lru_bot.get_length() << " bot, " << lru_pintail.get_length() << " pintail" << endl; - } - -}; - - -inline void LRUObject::lru_pin() -{ - lru_pinned = true; - if (lru) lru->lru_num_pinned++; -} -inline void LRUObject::lru_unpin() { - lru_pinned = false; - if (lru) { - lru->lru_num_pinned--; - - // move from pintail -> bot - if (lru_list == &lru->lru_pintail) { - lru->lru_pintail.remove(this); - lru->lru_bot.insert_tail(this); - } - } -} - -#endif diff --git a/tags/20070517_before_mds_merge/include/object.h b/tags/20070517_before_mds_merge/include/object.h deleted file mode 100644 index 5d5a87727e5ad..0000000000000 --- a/tags/20070517_before_mds_merge/include/object.h +++ /dev/null @@ -1,97 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __OBJECT_H -#define __OBJECT_H - -#include -#include -using namespace std; - - -typedef __uint32_t objectrev_t; - -struct object_t { - static const __uint32_t MAXREV = 0xffffffffU; - - __uint64_t ino; // "file" identifier - __uint32_t bno; // "block" in that "file" - objectrev_t rev; // revision. normally ctime (as epoch). 
- - object_t() : ino(0), bno(0), rev(0) {} - object_t(__uint64_t i, __uint32_t b) : ino(i), bno(b), rev(0) {} - object_t(__uint64_t i, __uint32_t b, __uint32_t r) : ino(i), bno(b), rev(r) {} -}; - - -inline bool operator==(const object_t l, const object_t r) { - return (l.ino == r.ino) && (l.bno == r.bno) && (l.rev == r.rev); -} -inline bool operator!=(const object_t l, const object_t r) { - return (l.ino != r.ino) || (l.bno != r.bno) || (l.rev != r.rev); -} -inline bool operator>(const object_t l, const object_t r) { - if (l.ino > r.ino) return true; - if (l.ino < r.ino) return false; - if (l.bno > r.bno) return true; - if (l.bno < r.bno) return false; - if (l.rev > r.rev) return true; - return false; -} -inline bool operator<(const object_t l, const object_t r) { - if (l.ino < r.ino) return true; - if (l.ino > r.ino) return false; - if (l.bno < r.bno) return true; - if (l.bno > r.bno) return false; - if (l.rev < r.rev) return true; - return false; -} -inline bool operator>=(const object_t l, const object_t r) { - return !(l < r); -} -inline bool operator<=(const object_t l, const object_t r) { - return !(l > r); -} -inline ostream& operator<<(ostream& out, const object_t o) { - out << hex << o.ino << '.'; - out.setf(ios::right); - out.fill('0'); - out << setw(8) << o.bno << dec; - out.unsetf(ios::right); - if (o.rev) - out << '.' 
<< o.rev; - return out; -} - - -namespace __gnu_cxx { -#ifndef __LP64__ - template<> struct hash<__uint64_t> { - size_t operator()(__uint64_t __x) const { - static hash<__uint32_t> H; - return H((__x >> 32) ^ (__x & 0xffffffff)); - } - }; -#endif - - template<> struct hash { - size_t operator()(const object_t &r) const { - static hash<__uint64_t> H; - static hash<__uint32_t> I; - return H(r.ino) ^ I(r.bno); - } - }; -} - - -#endif diff --git a/tags/20070517_before_mds_merge/include/oldbuffer.h b/tags/20070517_before_mds_merge/include/oldbuffer.h deleted file mode 100644 index fda7336bc6461..0000000000000 --- a/tags/20070517_before_mds_merge/include/oldbuffer.h +++ /dev/null @@ -1,357 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __BUFFER_H -#define __BUFFER_H - -#include -#include - -#include -using namespace std; - -// bit masks -#define BUFFER_MODE_NOCOPY 0 -#define BUFFER_MODE_COPY 1 // copy on create, my buffer - -#define BUFFER_MODE_NOFREE 0 -#define BUFFER_MODE_FREE 2 - -#define BUFFER_MODE_CUSTOMFREE 4 - -#define BUFFER_MODE_DEFAULT 3//(BUFFER_MODE_COPY|BUFFER_MODE_FREE) - - -// debug crap -#include "config.h" -#define bdbout(x) if (x <= g_conf.debug_buffer) cout - -#include "common/Mutex.h" - -// HACK: in config.cc -/* - * WARNING: bufferlock placements are tricky for efficiency. note that only bufferptr and - * buffer ever use buffer._ref, and only bufferptr should call ~buffer(). - * - * So, I only need to protect: - * - buffer()'s modification of buffer_total_alloc - * - ~bufferptr() check of buffer._ref, and ~buffer's mod of buffer_total_alloc - * - * I don't protect - * - buffer._get() .. 
increment is atomic on any sane architecture - * - buffer._put() .. only called by ~bufferptr. - * - ~buffer .. only called by ~bufferptr *** I HOPE!! - */ -extern Mutex bufferlock; -extern long buffer_total_alloc; - - -typedef void (buffer_free_func_t)(void*,char*,unsigned); - - -/* - * buffer - the underlying buffer container. with a reference count. - * - * the buffer never shrinks. - * - * some invariants: - * _len never shrinks - * _len <= _alloc_len - */ -class buffer { - protected: - //wtf - //static Mutex bufferlock; - //static long buffer_total_alloc;// = 0; - - private: - // raw buffer alloc - char *_dataptr; - bool _myptr; - unsigned _len; - unsigned _alloc_len; - - // ref counts - unsigned _ref; - int _get() { - bdbout(1) << "buffer.get " << *this << " get " << _ref+1 << endl; - return ++_ref; - } - int _put() { - bdbout(1) << "buffer.put " << *this << " put " << _ref-1 << endl; - assert(_ref > 0); - return --_ref; - } - - // custom (de!)allocator - buffer_free_func_t *free_func; - void *free_func_arg; - - friend class bufferptr; - - public: - // constructors - buffer() : _dataptr(0), _myptr(true), _len(0), _alloc_len(0), _ref(0), free_func(0), free_func_arg(0) { - bdbout(1) << "buffer.cons " << *this << endl; - } - buffer(unsigned a) : _dataptr(0), _myptr(true), _len(a), _alloc_len(a), _ref(0), free_func(0), free_func_arg(0) { - bdbout(1) << "buffer.cons " << *this << endl; - _dataptr = new char[a]; - bufferlock.Lock(); - buffer_total_alloc += _alloc_len; - bufferlock.Unlock(); - bdbout(1) << "buffer.malloc " << (void*)_dataptr << endl; - } - ~buffer() { - bdbout(1) << "buffer.des " << *this << " " << (void*)free_func << endl; - if (free_func) { - bdbout(1) << "buffer.custom_free_func " << free_func_arg << " " << (void*)_dataptr << endl; - free_func( free_func_arg, _dataptr, _alloc_len ); - } - else if (_dataptr && _myptr) { - bdbout(1) << "buffer.free " << (void*)_dataptr << endl; - delete[] _dataptr; - buffer_total_alloc -= _alloc_len; - } - } - - 
buffer(const char *p, int l, int mode=BUFFER_MODE_DEFAULT, int alloc_len=0, - buffer_free_func_t free_func=0, void* free_func_arg=0) : - _dataptr(0), - _myptr(false), - _len(l), - _ref(0), - free_func(0), free_func_arg(0) { - - if (alloc_len) - _alloc_len = alloc_len; - else - _alloc_len = l; - - _myptr = mode & BUFFER_MODE_FREE ? true:false; - bdbout(1) << "buffer.cons " << *this << " mode = " << mode << ", myptr=" << _myptr << endl; - if (mode & BUFFER_MODE_COPY) { - _dataptr = new char[_alloc_len]; - bdbout(1) << "buffer.malloc " << (void*)_dataptr << endl; - bufferlock.Lock(); - buffer_total_alloc += _alloc_len; - bufferlock.Unlock(); - memcpy(_dataptr, p, l); - bdbout(1) << "buffer.copy " << *this << endl; - } else { - _dataptr = (char*)p; // ugly - bdbout(1) << "buffer.claim " << *this << " myptr=" << _myptr << endl; - } - - if (mode & BUFFER_MODE_CUSTOMFREE && free_func) { - this->free_func = free_func; - this->free_func_arg = free_func_arg; - } - } - - // operators - buffer& operator=(buffer& other) { - assert(0); // not implemented, no reasonable assignment semantics. - return *this; - } - - char *c_str() { - return _dataptr; - } - - bool has_free_func() { return free_func != 0; } - - // accessor - unsigned alloc_length() { - return _alloc_len; - } - void set_length(unsigned l) { - assert(l <= _alloc_len); - _len = l; - } - unsigned length() { return _len; } - unsigned unused_tail_length() { return _alloc_len - _len; } - - friend ostream& operator<<(ostream& out, buffer& b); -}; - -inline ostream& operator<<(ostream& out, buffer& b) { - return out << "buffer(this=" << &b << " len=" << b._len << ", alloc=" << b._alloc_len << ", data=" << (void*)b._dataptr << " ref=" << b._ref << ")"; -} - - -/* - * smart pointer class for buffer - * - * we reference count the actual buffer. - * we also let you refer to a subset of a buffer. - * we implement the high-level buffer accessor methods. 
- * - * some invariants: - * _off < _buffer->_len - * _off + _len <= _buffer->_len - */ -class bufferptr { - private: - buffer *_buffer; - unsigned _len, _off; - - public: - // empty cons - bufferptr() : - _buffer(0), - _len(0), - _off(0) { } - // main cons - the entire buffer - bufferptr(buffer *b) : - _buffer(b), - _len(b->_len), - _off(0) { - assert(_buffer->_ref == 0); - _buffer->_get(); // this is always the first one. - } - // subset cons - a subset of another bufferptr (subset) - bufferptr(const bufferptr& bp, unsigned len, unsigned off) { - bufferlock.Lock(); - _buffer = bp._buffer; - _len = len; - _off = bp._off + off; - _buffer->_get(); - assert(_off < _buffer->_len); // sanity checks - assert(_off + _len <= _buffer->_len); - bufferlock.Unlock(); - } - - // copy cons - bufferptr(const bufferptr &other) { - bufferlock.Lock(); - _buffer = other._buffer; - _len = other._len; - _off = other._off; - if (_buffer) _buffer->_get(); - bufferlock.Unlock(); - } - - // assignment operator - bufferptr& operator=(const bufferptr& other) { - //assert(0); - // discard old - discard_buffer(); - - // point to other - bufferlock.Lock(); - _buffer = other._buffer; - _len = other._len; - _off = other._off; - if (_buffer) _buffer->_get(); - bufferlock.Unlock(); - return *this; - } - - ~bufferptr() { - discard_buffer(); - } - - void discard_buffer() { - if (_buffer) { - bufferlock.Lock(); - if (_buffer->_put() == 0) - delete _buffer; - _buffer = 0; - bufferlock.Unlock(); - } - } - - - // dereference to get the actual buffer - buffer& operator*() { - return *_buffer; - } - - - bool at_buffer_head() const { - return _off == 0; - } - bool at_buffer_tail() const { - return _off + _len == _buffer->_len; - } - - // accessors for my subset - char *c_str() { - return _buffer->c_str() + _off; - } - unsigned length() const { - return _len; - } - unsigned offset() const { - return _off; - } - unsigned unused_tail_length() { - if (!at_buffer_tail()) return 0; - return 
_buffer->unused_tail_length(); - } - - - - // modifiers - void set_offset(unsigned off) { - assert(off <= _buffer->_alloc_len); - _off = off; - } - void set_length(unsigned len) { - assert(len >= 0 && _off + len <= _buffer->_alloc_len); - if (_buffer->_len < _off + len) - _buffer->_len = _off + len; // set new buffer len (_IF_ i'm expanding it) - _len = len; // my len too - } - void zero() { - //bzero((void*)c_str(), _len); - memset((void*)c_str(), 0, _len); - } - - - // crope lookalikes - void append(const char *p, unsigned len) { - assert(len + _len + _off <= _buffer->_alloc_len); // FIXME later for auto-expansion? - - // copy - memcpy(c_str() + _len, p, len); - _buffer->_len += len; - _len += len; - } - void copy_out(unsigned off, unsigned len, char *dest) { - assert(off >= 0 && off <= _len); - assert(len >= 0 && off + len <= _len); - memcpy(dest, c_str() + off, len); - } - void copy_in(unsigned off, unsigned len, const char *src) { - assert(off >= 0 && off <= _len); - assert(len >= 0 && off + len <= _len); - memcpy(c_str() + off, src, len); - } - - friend ostream& operator<<(ostream& out, bufferptr& bp); -}; - - -inline ostream& operator<<(ostream& out, bufferptr& bp) { - return out << "bufferptr(len=" << bp._len << " off=" << bp._off - << " cstr=" << (void*)bp.c_str() - << " buf=" << *bp._buffer - << ")"; -} - - - -#endif diff --git a/tags/20070517_before_mds_merge/include/oldbufferlist.h b/tags/20070517_before_mds_merge/include/oldbufferlist.h deleted file mode 100644 index 466a5ead25d77..0000000000000 --- a/tags/20070517_before_mds_merge/include/oldbufferlist.h +++ /dev/null @@ -1,681 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. 
See file COPYING. - * - */ - - -#ifndef __BUFFERLIST_H -#define __BUFFERLIST_H - -#include "buffer.h" - -#include -#include -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - - -// debug crap -#include "config.h" -#define bdbout(x) if (x <= g_conf.debug_buffer) cout - - - -class bufferlist { - private: - /* local state limited to _buffers, and _len. - * we maintain _len ourselves, so we must be careful when fiddling with buffers! - */ - list _buffers; - unsigned _len; - - public: - // cons/des - bufferlist() : _len(0) { - bdbout(1) << "bufferlist.cons " << this << endl; - } - bufferlist(const bufferlist& bl) : _len(0) { - //assert(0); // o(n) and stupid - bdbout(1) << "bufferlist.cons " << this << endl; - _buffers = bl._buffers; - _len = bl._len; - } - ~bufferlist() { - bdbout(1) << "bufferlist.des " << this << endl; - } - - bufferlist& operator=(bufferlist& bl) { - //assert(0); // actually, this should be fine, just slow (O(n)) and stupid. - bdbout(1) << "bufferlist.= " << this << endl; - _buffers = bl._buffers; - _len = bl._len; - return *this; - } - - - // accessors - list& buffers() { - return _buffers; - } - //list::iterator begin() { return _buffers.begin(); } - //list::iterator end() { return _buffers.end(); } - - unsigned length() const { -#if 0 - { // DEBUG: verify _len - int len = 0; - for (list::iterator it = _buffers.begin(); - it != _buffers.end(); - it++) { - len += (*it).length(); - } - assert(len == _len); - } -#endif - return _len; - } - - void _rope(crope& r) { - for (list::iterator it = _buffers.begin(); - it != _buffers.end(); - it++) - r.append((*it).c_str(), (*it).length()); - } - - // modifiers - void clear() { - _buffers.clear(); - _len = 0; - } - void push_front(bufferptr& bp) { - _buffers.push_front(bp); - _len += bp.length(); - } - void push_front(buffer *b) { - bufferptr bp(b); - _buffers.push_front(bp); - _len += bp.length(); - } - void push_back(bufferptr& bp) { - _buffers.push_back(bp); - _len += 
bp.length(); - } - void push_back(buffer *b) { - bufferptr bp(b); - - _buffers.push_back(bp); - _len += bp.length(); - - } - void zero() { - for (list::iterator it = _buffers.begin(); - it != _buffers.end(); - it++) - it->zero(); - } - - // sort-of-like-assignment-op - void claim(bufferlist& bl) { - // free my buffers - clear(); - claim_append(bl); - } - void claim_append(bufferlist& bl) { - // steal the other guy's buffers - _len += bl._len; - _buffers.splice( _buffers.end(), bl._buffers ); - bl._len = 0; - } - - - - - // crope lookalikes - void copy(unsigned off, unsigned len, char *dest) { - assert(off >= 0); - assert(off + len <= length()); - /*assert(off < length()); - if (off + len > length()) - len = length() - off; - */ - // advance to off - list::iterator curbuf = _buffers.begin(); - - // skip off - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! - break; - } - } - - // copy - while (len > 0) { - // is the rest ALL in this buffer? - if (off + len <= (*curbuf).length()) { - (*curbuf).copy_out(off, len, dest); // yup, last bit! - break; - } - - // get as much as we can from this buffer. - unsigned howmuch = (*curbuf).length() - off; - (*curbuf).copy_out(off, howmuch, dest); - - dest += howmuch; - len -= howmuch; - off = 0; - curbuf++; - assert(curbuf != _buffers.end()); - } - } - - void copy_in(unsigned off, unsigned len, const char *src) { - assert(off >= 0); - assert(off + len <= length()); - - // advance to off - list::iterator curbuf = _buffers.begin(); - - // skip off - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! - break; - } - } - - // copy - while (len > 0) { - // is the rest ALL in this buffer? 
- if (off + len <= (*curbuf).length()) { - (*curbuf).copy_in(off, len, src); // yup, last bit! - break; - } - - // get as much as we can from this buffer. - unsigned howmuch = (*curbuf).length() - off; - (*curbuf).copy_in(off, howmuch, src); - - src += howmuch; - len -= howmuch; - off = 0; - curbuf++; - assert(curbuf != _buffers.end()); - } - } - void copy_in(unsigned off, unsigned len, bufferlist& bl) { - unsigned left = len; - for (list::iterator i = bl._buffers.begin(); - i != bl._buffers.end(); - i++) { - unsigned l = (*i).length(); - if (left < l) l = left; - copy_in(off, l, (*i).c_str()); - left -= l; - if (left == 0) break; - off += l; - } - } - - - void append(const char *data, unsigned len) { - if (len == 0) return; - - unsigned alen = 0; - - // copy into the tail buffer? - if (!_buffers.empty()) { - unsigned avail = _buffers.back().unused_tail_length(); - if (avail > 0) { - //cout << "copying up to " << len << " into tail " << avail << " bytes of tail buf" << endl; - if (avail > len) - avail = len; - unsigned blen = _buffers.back().length(); - memcpy(_buffers.back().c_str() + blen, data, avail); - blen += avail; - _buffers.back().set_length(blen); - _len += avail; - data += avail; - len -= avail; - } - alen = _buffers.back().length(); - } - if (len == 0) return; - - // just add another buffer. - // alloc a bit extra, in case we do a bunch of appends. FIXME be smarter! - if (alen < 1024) alen = 1024; - push_back(new buffer(data, len, BUFFER_MODE_DEFAULT, len+alen)); - } - void append(bufferptr& bp) { - push_back(bp); - } - void append(bufferptr& bp, unsigned len, unsigned off) { - bufferptr tempbp(bp, len, off); - push_back(tempbp); - } - void append(const bufferlist& bl) { - bufferlist temp = bl; // copy list - claim_append(temp); // and append - } - - - /* - * return a contiguous ptr to whole bufferlist contents. - */ - char *c_str() { - if (_buffers.size() == 1) { - return _buffers.front().c_str(); // good, we're already contiguous. 
- } - else if (_buffers.size() == 0) { - return 0; // no buffers - } - else { - // make one new contiguous buffer. - bufferptr newbuf = new buffer(length()); - unsigned off = 0; - - for (list::iterator it = _buffers.begin(); - it != _buffers.end(); - it++) { - //assert((*(*it)).has_free_func() == false); // not allowed if there's a funky free_func.. -sage ...for debugging at least! - memcpy(newbuf.c_str() + off, - (*it).c_str(), (*it).length()); - off += (*it).length(); - } - assert(off == newbuf.length()); - - _buffers.clear(); - _buffers.push_back( newbuf ); - - // now it'll work. - return c_str(); - } - } - - - void substr_of(bufferlist& other, unsigned off, unsigned len) { - assert(off + len <= other.length()); - clear(); - - // skip off - list::iterator curbuf = other._buffers.begin(); - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - //cout << "skipping over " << *curbuf << endl; - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! - //cout << "somewhere in " << *curbuf << endl; - break; - } - } - - while (len > 0) { - // partial? - if (off + len < (*curbuf).length()) { - //cout << "copying partial of " << *curbuf << endl; - _buffers.push_back( bufferptr( *curbuf, len, off ) ); - _len += len; - break; - } - - // through end - //cout << "copying end (all?) of " << *curbuf << endl; - unsigned howmuch = (*curbuf).length() - off; - _buffers.push_back( bufferptr( *curbuf, howmuch, off ) ); - _len += howmuch; - len -= howmuch; - off = 0; - curbuf++; - } - } - - // funky modifer - void splice(unsigned off, unsigned len, bufferlist *claim_by=0 /*, bufferlist& replace_with */) { // fixme? - assert(off < length()); - assert(len > 0); - //cout << "splice off " << off << " len " << len << " ... 
mylen = " << length() << endl; - - // skip off - list::iterator curbuf = _buffers.begin(); - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - //cout << "off = " << off << " skipping over " << *curbuf << endl; - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! - //cout << "off = " << off << " somewhere in " << *curbuf << endl; - break; - } - } - assert(off >= 0); - - if (off) { - // add a reference to the front bit - // insert it before curbuf (which we'll hose) - //cout << "keeping front " << off << " of " << *curbuf << endl; - _buffers.insert( curbuf, bufferptr( *curbuf, off, 0 ) ); - _len += off; - } - - while (len > 0) { - // partial? - if (off + len < (*curbuf).length()) { - //cout << "keeping end of " << *curbuf << ", losing first " << off+len << endl; - if (claim_by) - claim_by->append( *curbuf, len, off ); - (*curbuf).set_offset( off+len + (*curbuf).offset() ); // ignore beginning big - (*curbuf).set_length( (*curbuf).length() - (len+off) ); - _len -= off+len; - //cout << " now " << *curbuf << endl; - break; - } - - // hose though the end - unsigned howmuch = (*curbuf).length() - off; - //cout << "discarding " << howmuch << " of " << *curbuf << endl; - if (claim_by) - claim_by->append( *curbuf, howmuch, off ); - _len -= (*curbuf).length(); - _buffers.erase( curbuf++ ); - len -= howmuch; - off = 0; - } - - // splice in *replace (implement me later?) 
- } - - friend ostream& operator<<(ostream& out, bufferlist& bl); - -}; - -inline ostream& operator<<(ostream& out, bufferlist& bl) { - out << "bufferlist(len=" << bl.length() << endl; - for (list::iterator it = bl._buffers.begin(); - it != bl._buffers.end(); - it++) - out << "\t" << *it << endl; - out << ")" << endl; - return out; -} - - - -// encoder/decode helpers - -// string -inline void _encode(const string& s, bufferlist& bl) -{ - bl.append(s.c_str(), s.length()+1); -} -inline void _decode(string& s, bufferlist& bl, int& off) -{ - s = bl.c_str() + off; - off += s.length() + 1; -} - -// bufferptr (encapsulated) -inline void _encode(bufferptr& bp, bufferlist& bl) -{ - size_t len = bp.length(); - bl.append((char*)&len, sizeof(len)); - bl.append(bp); -} -inline void _decode(bufferptr& bp, bufferlist& bl, int& off) -{ - size_t len; - bl.copy(off, sizeof(len), (char*)&len); - off += sizeof(len); - bufferlist s; - s.substr_of(bl, off, len); - off += len; - - if (s.buffers().size() == 1) - bp = s.buffers().front(); - else - bp = new buffer(s.c_str(), s.length()); -} - -// bufferlist (encapsulated) -inline void _encode(const bufferlist& s, bufferlist& bl) -{ - size_t len = s.length(); - bl.append((char*)&len, sizeof(len)); - bl.append(s); -} -inline void _decode(bufferlist& s, bufferlist& bl, int& off) -{ - size_t len; - bl.copy(off, sizeof(len), (char*)&len); - off += sizeof(len); - s.substr_of(bl, off, len); - off += len; -} - - -// set -template -inline void _encode(set& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename set::iterator it = s.begin(); - it != s.end(); - it++) { - T v = *it; - bl.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); -} -template -inline void _decode(set& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -template -inline void _encode(vector& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, 
sizeof(n)); - for (typename vector::iterator it = s.begin(); - it != s.end(); - it++) { - T v = *it; - bl.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); -} -template -inline void _decode(vector& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - s = vector(n); - for (int i=0; i -template -inline void _encode(const list& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename list::const_iterator it = s.begin(); - it != s.end(); - it++) { - T v = *it; - bl.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); -} -template -inline void _decode(list& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -inline void _encode(map& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (map::iterator it = s.begin(); - it != s.end(); - it++) { - _encode(it->first, bl); - _encode(it->second, bl); - n--; - } - assert(n==0); -} -inline void _decode(map& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -template -inline void _encode(const map& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename map::const_iterator it = s.begin(); - it != s.end(); - it++) { - T k = it->first; - bl.append((char*)&k, sizeof(k)); - _encode(it->second, bl); - n--; - } - assert(n==0); -} -template -inline void _decode(map& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -template -inline void _encode(const map& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename map::const_iterator it = s.begin(); - it != s.end(); - it++) { - T k = it->first; - U v = it->second; - bl.append((char*)&k, sizeof(k)); - bl.append((char*)&v, sizeof(v)); - n--; - } - 
assert(n==0); -} -template -inline void _decode(map& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __RANGESET_H -#define __RANGESET_H - -/* - * - * my first container with iterator! it's pretty ugly. - * - */ - -#include -#include -#include -using namespace std; - -//typedef int T; - -template -struct _rangeset_base { - map ranges; // pair(first,last) (inclusive, e.g. [first,last]) - - typedef typename map::iterator mapit; - - // get iterator for range including val. or ranges.end(). - mapit get_range_for(T val) { - mapit it = ranges.lower_bound(val); - if (it == ranges.end()) { - // search backwards - typename map::reverse_iterator it = ranges.rbegin(); - if (it == ranges.rend()) return ranges.end(); - if (it->first <= val && it->second >= val) - return ranges.find(it->first); - return ranges.end(); - } else { - if (it->first == val) return - it--; - if (it->first <= val && it->second >= val) - return it; - return ranges.end(); - } - } - -}; - - -template -class rangeset_iterator : - public std::iterator -{ - //typedef typename map::iterator mapit; - - map ranges; - typename map::iterator it; - T current; - -public: - // cons - rangeset_iterator() {} - - rangeset_iterator(typename map::iterator& it, map& ranges) { - this->ranges = ranges; - this->it = it; - if (this->it != ranges.end()) - current = it->first; - } - - bool operator==(rangeset_iterator rit) { - return (it == rit.it && rit.current == current); - } - bool operator!=(rangeset_iterator rit) { - return (it != rit.it) || (rit.current != current); - } - - T& operator*() { - return current; - } - - rangeset_iterator operator++(int) { - if (current < it->second) - current++; - else { - 
it++; - if (it != ranges.end()) - current = it->first; - } - - return *this; - } -}; - - -template -class rangeset -{ - typedef typename map::iterator map_iterator; - - _rangeset_base theset; - inodeno_t _size; - -public: - rangeset() { _size = 0; } - typedef rangeset_iterator iterator; - - iterator begin() { - map_iterator it = theset.ranges.begin(); - return iterator(it, theset.ranges); - } - - iterator end() { - map_iterator it = theset.ranges.end(); - return iterator(it, theset.ranges); - } - - map_iterator map_begin() { - return theset.ranges.begin(); - } - map_iterator map_end() { - return theset.ranges.end(); - } - int map_size() { - return theset.ranges.size(); - } - - void map_insert(T v1, T v2) { - theset.ranges.insert(pair(v1,v2)); - _size += v2 - v1+1; - } - - - // ... - bool contains(T val) { - if (theset.get_range_for(val) == theset.ranges.end()) return false; - assert(!empty()); - return true; - } - - void insert(T val) { - assert(!contains(val)); - - map_iterator left = theset.get_range_for(val-1); - map_iterator right = theset.get_range_for(val+1); - - if (left != theset.ranges.end() && - right != theset.ranges.end()) { - // join! 
- left->second = right->second; - theset.ranges.erase(right); - _size++; - return; - } - - if (left != theset.ranges.end()) { - // add to left range - left->second = val; - _size++; - return; - } - - if (right != theset.ranges.end()) { - // add to right range - theset.ranges.insert(pair(val, right->second)); - theset.ranges.erase(val+1); - _size++; - return; - } - - // new range - theset.ranges.insert(pair(val,val)); - _size++; - return; - } - - unsigned size() { - return size(); - } - - bool empty() { - if (theset.ranges.empty()) { - assert(_size == 0); - return true; - } - assert(_size>0); - return false; - } - - - T first() { - assert(!empty()); - map_iterator it = theset.ranges.begin(); - return it->first; - } - - void erase(T val) { - assert(contains(val)); - map_iterator it = theset.get_range_for(val); - assert(it != theset.ranges.end()); - - // entire range - if (val == it->first && val == it->second) { - theset.ranges.erase(it); - _size--; - return; - } - - // beginning - if (val == it->first) { - theset.ranges.insert(pair(val+1, it->second)); - theset.ranges.erase(it); - _size--; - return; - } - - // end - if (val == it->second) { - it->second = val-1; - _size--; - return; - } - - // middle split - theset.ranges.insert(pair(it->first, val-1)); - theset.ranges.insert(pair(val+1, it->second)); - theset.ranges.erase(it); - _size--; - return; - } - - void dump() { - for (typename map::iterator it = theset.ranges.begin(); - it != theset.ranges.end(); - it++) { - cout << " " << it->first << "-" << it->second << endl; - } - } - -}; - - -#endif diff --git a/tags/20070517_before_mds_merge/include/reqid.h b/tags/20070517_before_mds_merge/include/reqid.h deleted file mode 100644 index 3c71fbae69ab6..0000000000000 --- a/tags/20070517_before_mds_merge/include/reqid.h +++ /dev/null @@ -1,64 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This 
is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __REQID_H -#define __REQID_H - - -#include "include/types.h" -#include "msg/msg_types.h" - -/* reqid_t - caller name + incarnation# + tid to unique identify this request - * use for metadata and osd ops. - */ -class reqid_t { -public: - entity_name_t name; // who - int inc; // incarnation - tid_t tid; - reqid_t() : inc(0), tid(0) {} - reqid_t(const entity_name_t& a, int i, tid_t t) : name(a), inc(i), tid(t) {} -}; - -inline ostream& operator<<(ostream& out, const reqid_t& r) { - return out << r.name << "." << r.inc << ":" << r.tid; -} - -inline bool operator==(const reqid_t& l, const reqid_t& r) { - return (l.name == r.name) && (l.inc == r.inc) && (l.tid == r.tid); -} -inline bool operator!=(const reqid_t& l, const reqid_t& r) { - return (l.name != r.name) || (l.inc != r.inc) || (l.tid != r.tid); -} -inline bool operator<(const reqid_t& l, const reqid_t& r) { - return (l.name < r.name) || (l.inc < r.inc) || - (l.name == r.name && l.inc == r.inc && l.tid < r.tid); -} -inline bool operator<=(const reqid_t& l, const reqid_t& r) { - return (l.name < r.name) || (l.inc < r.inc) || - (l.name == r.name && l.inc == r.inc && l.tid <= r.tid); -} -inline bool operator>(const reqid_t& l, const reqid_t& r) { return !(l <= r); } -inline bool operator>=(const reqid_t& l, const reqid_t& r) { return !(l < r); } - -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const reqid_t &r) const { - static blobhash H; - return H((const char*)&r, sizeof(r)); - } - }; -} - - -#endif diff --git a/tags/20070517_before_mds_merge/include/statlite.h b/tags/20070517_before_mds_merge/include/statlite.h deleted file mode 100644 index 60a977e49a499..0000000000000 --- a/tags/20070517_before_mds_merge/include/statlite.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef 
_STATLITE_H -#define _STATLITE_H - -extern "C" { - -#include -#include -#include -#include -#include - -struct statlite { - dev_t st_dev; /* device */ - ino_t st_ino; /* inode */ - mode_t st_mode; /* protection */ - nlink_t st_nlink; /* number of hard links */ - uid_t st_uid; /* user ID of owner */ - gid_t st_gid; /* group ID of owner */ - dev_t st_rdev; /* device type (if inode device)*/ - unsigned long st_litemask; /* bit mask for optional fields */ - /***************************************************************/ - /**** Remaining fields are optional according to st_litemask ***/ - off_t st_size; /* total size, in bytes */ - blksize_t st_blksize; /* blocksize for filesystem I/O */ - blkcnt_t st_blocks; /* number of blocks allocated */ - struct timespec st_atim; /* Time of last access. */ - struct timespec st_mtim; /* Time of last modification. */ - struct timespec st_ctim; /* Time of last status change. */ - //time_t st_atime; /* time of last access */ - //time_t st_mtime; /* time of last modification */ - //time_t st_ctime; /* time of last change */ -}; - -#define S_STATLITE_SIZE 1 -#define S_STATLITE_BLKSIZE 2 -#define S_STATLITE_BLOCKS 4 -#define S_STATLITE_ATIME 8 -#define S_STATLITE_MTIME 16 -#define S_STATLITE_CTIME 32 - -#define S_REQUIRESIZE(m) (m | S_STATLITE_SIZE) -#define S_REQUIREBLKSIZE(m) (m | S_STATLITE_BLKSIZE) -#define S_REQUIREBLOCKS(m) (m | S_STATLITE_BLOCKS) -#define S_REQUIREATIME(m) (m | S_STATLITE_ATIME) -#define S_REQUIREMTIME(m) (m | S_STATLITE_MTIME) -#define S_REQUIRECTIME(m) (m | S_STATLITE_CTIME) - -#define S_ISVALIDSIZE(m) (m & S_STATLITE_SIZE) -#define S_ISVALIDBLKSIZE(m) (m & S_STATLITE_BLKSIZE) -#define S_ISVALIDBLOCKS(m) (m & S_STATLITE_BLOCKS) -#define S_ISVALIDATIME(m) (m & S_STATLITE_ATIME) -#define S_ISVALIDMTIME(m) (m & S_STATLITE_MTIME) -#define S_ISVALIDCTIME(m) (m & S_STATLITE_CTIME) - - -// readdirplus etc. 
- -struct dirent_plus { - struct dirent d_dirent; /* dirent struct for this entry */ - struct stat d_stat; /* attributes for this entry */ - int d_stat_err;/* errno for d_stat, or 0 */ -}; -struct dirent_lite { - struct dirent d_dirent; /* dirent struct for this entry */ - struct statlite d_stat; /* attributes for this entry */ - int d_stat_err;/* errno for d_stat, or 0 */ -}; - -} -#endif diff --git a/tags/20070517_before_mds_merge/include/types.h b/tags/20070517_before_mds_merge/include/types.h deleted file mode 100644 index 72893cb62141b..0000000000000 --- a/tags/20070517_before_mds_merge/include/types.h +++ /dev/null @@ -1,367 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_TYPES_H -#define __MDS_TYPES_H - -extern "C" { -#include -#include -#include -#include "statlite.h" -} - -#include -#include -#include -#include -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "object.h" - -#ifndef MIN -# define MIN(a,b) ((a) < (b) ? (a):(b)) -#endif -#ifndef MAX -# define MAX(a,b) ((a) > (b) ? (a):(b)) -#endif - - -// -- stl crap -- - -/* -- this is to make some of the STL types work with 64 bit values, string hash keys, etc. -- added when i was using an old STL.. maybe try taking these out and see if things - compile now? 
-*/ - -class blobhash { -public: - size_t operator()(const char *p, unsigned len) { - static hash H; - long acc = 0; - while (len >= sizeof(long)) { - acc ^= *(long*)p; - p += sizeof(long); - len -= sizeof(long); - } - int sh = 0; - while (len) { - acc ^= (long)*p << sh; - sh += 8; - len--; - p++; - } - return H(acc); - } -}; - - -namespace __gnu_cxx { - template<> struct hash< std::string > - { - size_t operator()( const std::string& x ) const - { - static hash H; - return H(x.c_str()); - } - }; - -#ifndef __LP64__ - template<> struct hash<__int64_t> { - size_t operator()(__int64_t __x) const { - static hash<__int32_t> H; - return H((__x >> 32) ^ (__x & 0xffffffff)); - } - }; -#endif - -} - - -/* - * comparators for stl containers - */ -// for hash_map: -// hash_map, eqstr> vals; -struct eqstr -{ - bool operator()(const char* s1, const char* s2) const - { - return strcmp(s1, s2) == 0; - } -}; - -// for set, map -struct ltstr -{ - bool operator()(const char* s1, const char* s2) const - { - return strcmp(s1, s2) < 0; - } -}; - - - -// ---------------------- -// some basic types - -typedef __uint64_t tid_t; // transaction id -typedef __uint64_t version_t; -typedef __uint32_t epoch_t; // map epoch (32bits -> 13 epochs/second for 10 years) - - - - - -/** object layout - * how objects are mapped into PGs - */ -#define OBJECT_LAYOUT_DEFAULT 0 // see g_conf -#define OBJECT_LAYOUT_HASH 1 -#define OBJECT_LAYOUT_LINEAR 2 -#define OBJECT_LAYOUT_HASHINO 3 -#define OBJECT_LAYOUT_STARTOSD 4 - -/** pg layout - * how PGs are mapped into (sets of) OSDs - */ -#define PG_LAYOUT_CRUSH 0 -#define PG_LAYOUT_HASH 1 -#define PG_LAYOUT_LINEAR 2 -#define PG_LAYOUT_HYBRID 3 - -/** FileLayout - * specifies a striping and replication strategy - */ - -//#define FILE_LAYOUT_CRUSH 0 // stripe via crush -//#define FILE_LAYOUT_LINEAR 1 // stripe linearly across cluster - -struct FileLayout { - // layout - int object_layout; - - // FIXME: make this a union? 
- // rushstripe - int stripe_size; // stripe unit, in bytes - int stripe_count; // over this many objects - int object_size; // until objects are this big, then use a new set of objects. - - // period = bytes before i start on a new set of objects. - int period() { return object_size * stripe_count; } - - int osd; // osdlocal - - int num_rep; // replication - - FileLayout() { } - FileLayout(int ss, int sc, int os, int nr=2, int o=-1) : - object_layout(o < 0 ? OBJECT_LAYOUT_DEFAULT:OBJECT_LAYOUT_STARTOSD), - stripe_size(ss), stripe_count(sc), object_size(os), - osd(o), - num_rep(nr) { } - -}; - - - -// -- inode -- - -struct inodeno_t { - __uint64_t val; - inodeno_t() : val() {} - inodeno_t(__uint64_t v) : val(v) {} - inodeno_t operator+=(inodeno_t o) { val += o.val; return *this; } - operator __uint64_t() const { return val; } -}; - -inline ostream& operator<<(ostream& out, inodeno_t ino) { - return out << hex << ino.val << dec; -} - -namespace __gnu_cxx { - template<> struct hash< inodeno_t > - { - size_t operator()( const inodeno_t& x ) const - { - static hash<__uint64_t> H; - return H(x.val); - } - }; -} - - -#define INODE_MODE_FILE 0100000 // S_IFREG -#define INODE_MODE_SYMLINK 0120000 // S_IFLNK -#define INODE_MODE_DIR 0040000 // S_IFDIR -#define INODE_TYPE_MASK 0170000 - -#define FILE_MODE_R 1 -#define FILE_MODE_W 2 -#define FILE_MODE_RW (1|2) -#define FILE_MODE_LAZY 4 - -#define INODE_MASK_BASE 1 // ino, ctime, nlink -#define INODE_MASK_PERM 2 // uid, gid, mode -#define INODE_MASK_SIZE 4 // size, blksize, blocks -#define INODE_MASK_MTIME 8 // mtime -#define INODE_MASK_ATIME 16 // atime - -#define INODE_MASK_ALL_STAT (INODE_MASK_BASE|INODE_MASK_PERM|INODE_MASK_SIZE|INODE_MASK_MTIME) -//#define INODE_MASK_ALL_STAT (INODE_MASK_BASE|INODE_MASK_PERM|INODE_MASK_SIZE|INODE_MASK_MTIME|INODE_MASK_ATIME) - -struct inode_t { - // base (immutable) - inodeno_t ino; // NOTE: ino _must_ come first for MDStore.cc to behave!! 
- time_t ctime; - - // other - FileLayout layout; // ?immutable? - int nlink; // base, - - // hard/perm (namespace permissions) - mode_t mode; - uid_t uid; - gid_t gid; - - // file (data access) - off_t size; - time_t atime, mtime; // maybe atime different? "lazy"? - - int mask; - - // special stuff - version_t version; // auth only - unsigned char hash_seed; // only defined for dir; 0 if not hashed. - bool anchored; // auth only - version_t file_data_version; // auth only - - bool is_symlink() { return (mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK; } - bool is_dir() { return (mode & INODE_TYPE_MASK) == INODE_MODE_DIR; } - bool is_file() { return (mode & INODE_TYPE_MASK) == INODE_MODE_FILE; } -}; - - - - -// client types -typedef int fh_t; // file handle - - -// dentries -#define MAX_DENTRY_LEN 255 - - - - -// -- io helpers -- - -template -inline ostream& operator<<(ostream& out, vector& v) { - out << "["; - for (unsigned i=0; i -inline ostream& operator<<(ostream& out, const set& iset) { - for (typename set::const_iterator it = iset.begin(); - it != iset.end(); - it++) { - if (it != iset.begin()) out << ","; - out << *it; - } - return out; -} - -template -inline ostream& operator<<(ostream& out, const multiset& iset) { - for (typename multiset::const_iterator it = iset.begin(); - it != iset.end(); - it++) { - if (it != iset.begin()) out << ","; - out << *it; - } - return out; -} - -template -inline ostream& operator<<(ostream& out, const map& m) -{ - out << "{"; - for (typename map::const_iterator it = m.begin(); - it != m.end(); - it++) { - if (it != m.begin()) out << ","; - out << it->first << "=" << it->second; - } - out << "}"; - return out; -} - - - - -// -- rope helpers -- - -// string -inline void _rope(string& s, crope& r) -{ - r.append(s.c_str(), s.length()+1); -} -inline void _unrope(string& s, crope& r, int& off) -{ - s = r.c_str() + off; - off += s.length() + 1; -} - -// set -inline void _rope(set& s, crope& r) -{ - int n = s.size(); - 
r.append((char*)&n, sizeof(n)); - for (set::iterator it = s.begin(); - it != s.end(); - it++) { - int v = *it; - r.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); -} -inline void _unrope(set& s, crope& r, int& off) -{ - s.clear(); - int n; - r.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -/* - * uofs.h - * - * user-level object-based file system - */ - - #ifndef _UOFS_H_ - #define _UOFS_H_ - - #include - #include - #include - - - int device_open(char *path, int xflags); - void device_findsizes(int fd, long long *sz, int *bsz); - - int uofs_format(int bdev_id, int donode_size, int bd_ratio, int reg_size, int sb_size, int lb_size, - int nr_hash_table_buckets, int delay_allocation, int flush_interval); - - int uofs_mount(int bdev_id); - void uofs_shutdown(void); - - int uofs_read(long long oid, void *buf, off_t offset, size_t count); - int uofs_write(long long oid, void *buf, off_t offset, size_t count); - int uofs_del(long long oid); - int uofs_sync(long long oid); - int uofs_exist(long long oid); - - int uofs_get_size(long long oid); - - void uofs_superblock_printout(void); - int get_large_object_pages(void); - - int uofs_buffer_size(void); - #endif diff --git a/tags/20070517_before_mds_merge/jobs/alc.tp b/tags/20070517_before_mds_merge/jobs/alc.tp deleted file mode 100644 index c600850c54be0..0000000000000 --- a/tags/20070517_before_mds_merge/jobs/alc.tp +++ /dev/null @@ -1,38 +0,0 @@ -#PSUB -s /bin/bash # Sets your shell in batch -#PSUB -c alc # Where to run the job - -#PSUB -eo # Send std error & std out to the same file - -#PSUB -ln $NUM # Number of nodes to use -#PSUB -g $NUM # Total Number of tasks to use -#PSUB -cpn 1 # cpus per node - -####PSUB -c 1024Mb # memory limit -#PSUB -lc 1500 # Core file 
size per process -#PSUB -nr # Do not automatically resubmit job -#PSUB -tM 20m # Select time limit. The default time limit - # is only 30 minutes! Time can be HH:MM:SS or HH:MM - -#PSUB -o $CWD/$OUT # filename for output - -# Put your commands here. Remember to 'cd' to the appropriate -# directory, because the job will initially be in your home directory. -# To run a parallel job, you need to use the srun. - - - -echo job $PSUB_JOBID nodes $NUM name $NAME - -# environment -cd $CWD -export LD_LIBRARY_PATH=/usr/lib/mpi/mpi_gnu/lib - -# create fakestore dirs -srun -l -N $NUM -ppbatch bash -c "test -d tmp/osddata || mkdir tmp/osddata || echo cant make osddata ; uptime" - -# go -srun -l -N $NUM -ppbatch $CMD && touch $DONE - -# clean up fakestore -srun -l -N $NUM -ppbatch bash -c 'uptime ; rm -r tmp/osddata/*' - diff --git a/tags/20070517_before_mds_merge/jobs/alcdat/makedirs b/tags/20070517_before_mds_merge/jobs/alcdat/makedirs deleted file mode 100644 index af5a098a254c9..0000000000000 --- a/tags/20070517_before_mds_merge/jobs/alcdat/makedirs +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192], - 'nummds' => [1, 2, 8, 16, 32, 48, 64, 80, 96, 112, 128],#144, 160, 192, 208], - - 'cper' => [15,20], - '_dep' => [ 'cnode' => '$nummds',# / 4 + 1', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds > 1 ? $nummds:2', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'mds_bal_rep' => 10000, # none of that! - 'mds_decay_halflife' => 30, - - 'mds_bal_interval' => 45, - 'mds_bal_max' => [2], - - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 100, - 'end' => 300, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 4, - - # --meta_log_layout_scount 32 --meta_log_layout_ssize 256 - # --osd_pg_layout linear - 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/tags/20070517_before_mds_merge/jobs/alcdat/makedirs.big b/tags/20070517_before_mds_merge/jobs/alcdat/makedirs.big deleted file mode 100644 index c67b2b93dd742..0000000000000 --- a/tags/20070517_before_mds_merge/jobs/alcdat/makedirs.big +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192], - 'nummds' => [160, 200],#[1, 2, 8, 16, 32, 48, 64, 80, 96, 112, 128],#144, 160, 192, 208], - - 'cper' => [15,20], - '_dep' => [ 'cnode' => '40',#$nummds',# / 4 + 1', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds * .8', - 'n' => '415'],#1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'mds_bal_rep' => 10000, # none of that! - 'mds_decay_halflife' => 30, - - 'mds_bal_interval' => 45, - 'mds_bal_max' => 2, - - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 100, - 'end' => 300, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 4, - - # --meta_log_layout_scount 32 --meta_log_layout_ssize 256 - # --osd_pg_layout linear - 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/tags/20070517_before_mds_merge/jobs/alcdat/makedirs.tput b/tags/20070517_before_mds_merge/jobs/alcdat/makedirs.tput deleted file mode 100644 index 8dd5ae4c47d8c..0000000000000 --- a/tags/20070517_before_mds_merge/jobs/alcdat/makedirs.tput +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192], - 'nummds' => [4, 16, 64],#[1, 16, 64, 128],#144, 160, 192, 208], - - #'cper' => [2, 5, 7, 10, 13, 16, 20, 30, 40, 50, 100, 150], - 'cper' => [13, 30, 40], # just for final run... - '_dep' => [ 'cnode' => '$nummds',# / 4 + 1', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'mds_bal_rep' => 10000, # none of that! - 'mds_decay_halflife' => 30, - - 'mds_bal_interval' => 45, - 'mds_bal_max' => 2, - - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 100, - 'end' => 300, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 4, - - # --meta_log_layout_scount 32 --meta_log_layout_ssize 256 - # --osd_pg_layout linear - 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - 'comb' => { - 'x' => 'cper',#nummds', - 'vars' => [ 'mds.req', 'cl.lat' ] - } -}; diff --git a/tags/20070517_before_mds_merge/jobs/alcdat/makefiles.shared b/tags/20070517_before_mds_merge/jobs/alcdat/makefiles.shared deleted file mode 100644 index ab96702c73289..0000000000000 --- a/tags/20070517_before_mds_merge/jobs/alcdat/makefiles.shared +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => [1, 2, 4, 8, 16, 32, 64, 96, 128], #2, 4, 8, 16, 32, 48, 64, 80, 96], - - 'cper' => [25, 50, 100, 150],# 100, 150, 200], - - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - - 'mds_bal_hash_wr' => 1000, - - 'until' => 180, # --syn until $n ... 
when to stop clients - 'kill_after' => 250, - 'start' => 30, - 'end' => 180, - - 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60 --syn makefiles 100000 1000 0', - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req', 'cl.lat' ] - } -}; diff --git a/tags/20070517_before_mds_merge/jobs/alcdat/openshared b/tags/20070517_before_mds_merge/jobs/alcdat/openshared deleted file mode 100644 index 5ed7ba95894b3..0000000000000 --- a/tags/20070517_before_mds_merge/jobs/alcdat/openshared +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => [1, 4, 16, 64, 128, 192 ], - - 'cper' => [10, 50, 100, 150], - '_dep' => [ 'cnode' => '$nummds',# > 30 ? 30:$nummds', - 'numclient' => '$nummds*$cper', - 'numosd' => '$nummds > 30 ? 30:$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - - 'mds_bal_interval' => 10000, - 'mds_bal_hash_wr' => 1000, - - 'until' => 120, # --syn until $n ... 
when to stop clients - 'kill_after' => 180, - 'start' => 10, - 'end' => 120, - - 'custom' => '--tcp_skip_rank0 --debug_mds_balancer 10 --mds_shutdown_check 60 --syn only 0 --syn createshared 10 --syn sleep 5 --syn openshared 10 10000', - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req', 'cl.lat' ] - } -}; diff --git a/tags/20070517_before_mds_merge/jobs/alcdat/ossh.include b/tags/20070517_before_mds_merge/jobs/alcdat/ossh.include deleted file mode 100644 index c9a368ba5c60f..0000000000000 --- a/tags/20070517_before_mds_merge/jobs/alcdat/ossh.include +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 10, - - #'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - #'nummds' => [1, 2, 4, 6, 7], # googoo - 'nummds' => [ 2, 4, 8, 16, 32, 48, 64, 80, 96, 128 ], - - #'trace' => ['make.lib', 'make.include'], - - 'mds_bal_interval' => 45, - 'mds_bal_max' => 2,#6, #[ 2,4,6 ], - 'mds_decay_halflife' => 30, - 'mds_bal_rep' => 1500, - 'mds_bal_hash_rd' => 100000, - - 'cper' => [15, 20],#25, 50, 100], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ], - #'cper' => 125, #[30, 50, 75, 100, 125, 150], #50, #[10,50,100],# [ 50, 75, 100 ], - - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - 'custom' => '--tcp_skip_rank0 --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.include 1 --syn sleep 30 --syn trace traces/openssh/make.include 1000', - - # parameters - 'fs' => 'ebofs', - - #'until' => 500, - #'kill_after' => 600, - #'start' => 200, - #'end' => 500, - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 200, - 'end' => 300, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/tags/20070517_before_mds_merge/jobs/alcdat/ossh.include.big b/tags/20070517_before_mds_merge/jobs/alcdat/ossh.include.big deleted file mode 100644 index b92895a53a763..0000000000000 --- a/tags/20070517_before_mds_merge/jobs/alcdat/ossh.include.big +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 10, - - #'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - #'nummds' => [1, 2, 4, 6, 7], # googoo - #'nummds' => [ 2, 4, 8, 16, 32, 48, 64, 80, 96, 128 ], - 'nummds' => [160,200], - - #'trace' => ['make.lib', 'make.include'], - - 'mds_bal_interval' => 45, - 'mds_bal_max' => 2,#6, #[ 2,4,6 ], - 'mds_decay_halflife' => 30, - 'mds_bal_rep' => 1500, - 'mds_bal_hash_rd' => 100000, - - 'cper' => [25, 50], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ], - #'cper' => 125, #[30, 50, 75, 100, 125, 150], #50, #[10,50,100],# [ 50, 75, 100 ], - - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds * .6', - 'n' => '415'],#1 + $cnode + $nummds + $numosd' ], - - 'custom' => '--tcp_skip_rank0 --tcp_overlay_clients --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.include 1 --syn sleep 30 --syn trace traces/openssh/make.include 1000', - - # parameters - 'fs' => 'ebofs', - - #'until' => 500, - #'kill_after' => 600, - #'start' => 200, - #'end' => 500, - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 200, - 'end' => 300, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/tags/20070517_before_mds_merge/jobs/alcdat/ossh.lib b/tags/20070517_before_mds_merge/jobs/alcdat/ossh.lib deleted file mode 100644 index 73372866f051f..0000000000000 --- a/tags/20070517_before_mds_merge/jobs/alcdat/ossh.lib +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 10, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - 'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - - #'nummds' => [1, 2, 4, 6, 7], # googoo - #'trace' => ['make.lib', 'make.include'], - - 'mds_bal_interval' => 90, #[30, 60, 90], #$[60,90], #[60,90],#[30, 60, 90], - #'mds_bal_max' => [4, 10],#6,#[2,4,6,8], - - 'mds_decay_halflife' => 30, - 'mds_bal_rep' => 1500, - 'cper' => [10, 16], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ], - - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - - 'custom' => '--tcp_skip_rank0 --debug_mds_balancer 1 --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.lib 1 --syn sleep 10 --syn trace traces/openssh/make.lib 1000', - - # parameters - #'fs' => ['fakestore'], - 'fs' => 'ebofs', - - #'until' => 500, - #'kill_after' => 600, - #'start' => 200, - #'end' => 500, - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 150, - 'end' => 300, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/tags/20070517_before_mds_merge/jobs/alcdat/ossh.lib.big b/tags/20070517_before_mds_merge/jobs/alcdat/ossh.lib.big deleted file mode 100644 index b9e0dd1ff68cd..0000000000000 --- a/tags/20070517_before_mds_merge/jobs/alcdat/ossh.lib.big +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 10, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - #'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - 'nummds' => [160,200], - - #'nummds' => [1, 2, 4, 6, 7], # googoo - #'trace' => ['make.lib', 'make.include'], - - 'mds_bal_interval' => 90, #[30, 60, 90], #$[60,90], #[60,90],#[30, 60, 90], - #'mds_bal_max' => [4, 10],#6,#[2,4,6,8], - - 'mds_decay_halflife' => 30, - 'mds_bal_rep' => 1500, - 'cper' => [25, 50, 100], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ], - - '_dep' => [ 'cnode' => 0,#'30', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds * .6', - 'n' => '415'],#'1 + $cnode + $nummds + $numosd' ], - - - 'custom' => '--tcp_skip_rank0 --debug_mds_balancer 1 --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.lib 1 --syn sleep 10 --syn trace traces/openssh/make.lib 1000', - - # parameters - #'fs' => ['fakestore'], - 'fs' => 'ebofs', - - #'until' => 500, - #'kill_after' => 600, - #'start' => 200, - #'end' => 500, - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 150, - 'end' => 300, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/tags/20070517_before_mds_merge/jobs/alcdat/striping b/tags/20070517_before_mds_merge/jobs/alcdat/striping deleted file mode 100644 index de71828d12bde..0000000000000 --- a/tags/20070517_before_mds_merge/jobs/alcdat/striping +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => 10, - - 'cnode' => 10, - 'cper' => [ 10, 25, 50, 100 ], - - '_dep' => [ 'numclient' => '$cper * $cnode', - 'n' => '1 + $cnode + $nummds + $numosd', - 'file_layout_osize' => '$writefile_size' ], - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'until' => 160, # --syn until $n ... when to stop clients - 'kill_after' => 200, - 'start' => 100, - 'end' => 160, - - 'writefile' => 1, - 'writefile_size' => [ -# 4*1024*1024, - 1024*1024 ], -# 256*1024, -# 64*1024 - 'writefile_mb' => 100000, - - 'osd_pg_bits' => 10,#16, - #'osd_pg_bits' => [ 16, 20 ], - - #'osd_object_layout' => [ 'hash', 'hashino', 'linear' ], - 'osd_pg_layout' => [ 'crush', -# 'hash', - 'linear' ], - - 'custom' => '--tcp_skip_rank0 --file_layout_num_rep 1 --mds_shutdown_check 60', - - 'comb' => { - 'x' => 'cper',#writefile_size', - 'vars' => [ 'osd.c_wrb', 'osd.c_wr' ], - } -}; diff --git a/tags/20070517_before_mds_merge/jobs/example b/tags/20070517_before_mds_merge/jobs/example deleted file mode 100644 index 802a8b66e6332..0000000000000 --- a/tags/20070517_before_mds_merge/jobs/example +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/perl -# hi there -{ - # startup - 'n' => 30, # number of mpi nodes - 'sleep' => 3, # seconds to sleep between runs (so you have time to control-c out) - 'nummds' => 1, - 'numosd' => 6, - 'numclient' => 100, - - 'until' => 100, # --syn until $n ... synthetic client will stop itself after this many seconds. 
- 'kill_after' => 300, # seconds before everything commits suicide (in case something hangs) - - # stuff i want to vary - # here's a simple example: - - # do --syn writefile command - 'writefile' => 1, - # and very the write size - 'writefile_size' => [ # vary -# 2048*1024, - 1024*1024, - 512*1024, - 256*1024, - 128*1024, - 64*1024, - 48*1024, - 32*1024, - 28*1024, - 24*1024, - 16*1024, - 12*1024, - 8*1024, - 4096, -# 256, -# 16, -# 1 - ], - 'writefile_mb' => 1000, # each client shoudl write 1GB (or more likely, keep going until time runs out) - - 'file_layout_num_rep'=> [1,2], # also vary the replication level - - # pass some other random things to newsyn - 'custom' => '--', - - # for final summation (script/sum.pl) - # specify time period to look at the results - 'start' => 30, # skip first 30 seconds, so that caches are full etc. - 'end' => 90, # go for 60 seconds - - # what should i parse/plot? - 'comb' => { - 'x' => 'writefile_size', - 'vars' => [ 'osd.c_wrb', 'osd.r_wrb' ], - } -}; diff --git a/tags/20070517_before_mds_merge/jobs/mds/log_striping b/tags/20070517_before_mds_merge/jobs/mds/log_striping deleted file mode 100644 index 46242cdda4f00..0000000000000 --- a/tags/20070517_before_mds_merge/jobs/mds/log_striping +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - 'kill_after' => 300, - - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => 100, - 'n' => 16, - - # parameters - 'fs' => ['ebofs','fakestore'], - 'meta_log_ssize' => [ 128, 256, 1024, 1 << 15, 1 << 20 ], - 'meta_log_scount' => 4,#[ 1, 2, 4, 8 ], - - 'until' => 200, # --syn until $n ... 
when to stop clients - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 4, - - 'custom' => '--tcp_skip_rank0', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - 'start' => 100, - 'end' => 550, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/tags/20070517_before_mds_merge/jobs/mds/makedir_lat b/tags/20070517_before_mds_merge/jobs/mds/makedir_lat deleted file mode 100644 index 63374f52a36c0..0000000000000 --- a/tags/20070517_before_mds_merge/jobs/mds/makedir_lat +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => [1],#, 40, 80, 160 ], - 'n' => 20, - - 'fs' => 'ebofs', - - 'start' => 20, - 'end' => 40, - 'until' => 40, - 'kill_after' => 60, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 5, - - 'mds_local_osd' => [ 0, 1 ], - 'meta_log_layout_num_rep' => [ 0, 1, 2, 3, 4], - - 'custom' => '--tcp_skip_rank0', - - 'comb' => { - 'x' => 'meta_log_layout_num_rep', - 'vars' => [ 'mds.log.lat', 'cl.lat', 'osd.rlat' ] - } -}; diff --git a/tags/20070517_before_mds_merge/jobs/mds/makedirs b/tags/20070517_before_mds_merge/jobs/mds/makedirs deleted file mode 100644 index 4ca42d72fa37e..0000000000000 --- a/tags/20070517_before_mds_merge/jobs/mds/makedirs +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - '_psub' => 'jobs/alc.tp', - - 'sleep' => 3, - - 'nummds' => [1, 2, 4, 6, 8, 12, 16, 24, 32, 40, 48, 64], - - 'cper' => 50, - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$cnode * $cper', - 'numosd' => '$nummds * 2', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - #'fs' => 'ebofs', - 'fs' => 'fakestore', - - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 3, - - 'custom' => '--tcp_skip_rank0', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - 'start' => 100, - 'end' => 550, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/tags/20070517_before_mds_merge/jobs/mds/opensshlib b/tags/20070517_before_mds_merge/jobs/mds/opensshlib deleted file mode 100644 index d8b61ae52c655..0000000000000 --- a/tags/20070517_before_mds_merge/jobs/mds/opensshlib +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => [1, 2, 4, 7], # googoo - #'nummds' => [1, 2, 4, 6, 8, 12, 16, 24, 32, 40, 48, 64], # alc - - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'until' => 300, # --syn until $n ... when to stop clients - 'kill_after' => 400, - 'start' => 150, - 'end' => 300, - - 'mds_bal_interval' => 90,#[60, 90], - #'mds_bal_max' => [3,4,5], - 'mds_bal_max' => 4, - 'mds_decay_halflife' => 30,#[15, 25, 30, 45, 60], - 'mds_bal_rep' => 1500,#[1000, 1500, 2000], - - 'decay_hl' => 100,#[ 25, 50, 100, 150 ], - - 'cper' => 100, #[50, 75, 100, 125, 150, 200], - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds * 2', - 'n' => '1 + $cnode + $nummds + $numosd', - 'mds_bal_rep' => '$mds_decay_halflife * $decay_hl'], - - 'custom' => '--tcp_skip_rank0 --syn only 0 --syn trace traces/openssh/untar.lib 1 --syn sleep 10 --syn randomsleep 30 --syn trace traces/openssh/make.lib 100 --debug_mds_balancer 1 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - - 'comb' => { - 'x' => 'nummds',#decay_hl',#'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/tags/20070517_before_mds_merge/jobs/meta1 
b/tags/20070517_before_mds_merge/jobs/meta1 deleted file mode 100644 index 743212f1c3009..0000000000000 --- a/tags/20070517_before_mds_merge/jobs/meta1 +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/sh - -# makedirs for 300 seconds -# first bit in memory -# second bit is commiting from journal too -# then walk fs for 300 seconds -# this should all be in memory. - -JOB="meta1" -ARGS="--numosd 10 --fullmkfs --syn until 180 --syn makedirs 10 10 4 --syn until 360 --syn repeatwalk --mds_bal_max 1 --osd_fsync 0 --mds_log_max_len 200000 --mds_cache_size 500000" - -#rm core* ; make tcpsyn && mpiexec -l -n 17 ./tcpsyn $ARGS --nummds 1 --log_name $JOB/1 --numclient 25 > log/$JOB/o.1 -#rm core* ; make tcpsyn && mpiexec -l -n 18 ./tcpsyn $ARGS --nummds 2 --log_name $JOB/2 --numclient 50 > log/$JOB/o.2 -#rm core* ; make tcpsyn && mpiexec -l -n 20 ./tcpsyn $ARGS --nummds 4 --log_name $JOB/4 --numclient 100 > log/$JOB/o.4 -#rm core* ; make tcpsyn && mpiexec -l -n 24 ./tcpsyn $ARGS --nummds 8 --log_name $JOB/8 --numclient 200 > log/$JOB/o.8 -#rm core* ; make tcpsyn && mpiexec -l -n 28 ./tcpsyn $ARGS --nummds 12 --log_name $JOB/12 --numclient 300 > log/$JOB/o.12 -rm core* ; make tcpsyn && mpiexec -l -n 32 ./tcpsyn $ARGS --nummds 16 --log_name $JOB/16 --numclient 300 > log/$JOB/o.16 - - diff --git a/tags/20070517_before_mds_merge/jobs/meta1.proc.sh b/tags/20070517_before_mds_merge/jobs/meta1.proc.sh deleted file mode 100755 index 616acbefff619..0000000000000 --- a/tags/20070517_before_mds_merge/jobs/meta1.proc.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/sh - -for d in 1 2 4 8 12 -do - echo $d - cd $d - ../../../script/sum.pl mds? mds?? > mds.sum - ../../../script/sum.pl -avg mds? mds?? > mds.avg - - ../../../script/sum.pl -start 90 -end 180 mds? mds?? > mds.sum.makedirs - ../../../script/sum.pl -start 200 -end 300 mds? mds?? > mds.sum.walk - - cd .. 
-done diff --git a/tags/20070517_before_mds_merge/jobs/osd/ebofs b/tags/20070517_before_mds_merge/jobs/osd/ebofs deleted file mode 100644 index 5d11523f6f832..0000000000000 --- a/tags/20070517_before_mds_merge/jobs/osd/ebofs +++ /dev/null @@ -1,51 +0,0 @@ -# hi there -{ - # startup - 'n' => 30, # mpi nodes - 'sleep' => 3, # seconds between runs - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => 100,#[10, 50, 100, 200, 400], - -'kill_after' => 200, - - # parameters - 'fs' => 'ebofs',#[ -# 'obfs', -# 'fakestore', -# 'ebofs' -# ], - 'until' => 100, # --syn until $n ... when to stop clients - 'writefile' => 1, - 'writefile_size' => [ -# 2560000, - 1024000, - 262144, -# 131072, -# 98304, - 65536, -# 16384, -# 4096, - 256, -# 16, -# 1 - ], - 'writefile_mb' => 1000, - - 'ebofs_idle_commit_ms' => [ 100, 500 ], - 'ebofs_abp_max_alloc' => [ 4096*16, 4096*64 ], - -# 'custom' => '--tcp_skip_rank0',# --osd_maxthreads 0', - 'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - 'start' => 30, - 'end' => 90, - -'comb' => { - 'x' => 'writefile_size', - 'vars' => [ 'osd.c_wrb' ], -# 'maptitle' => { 'osd_object_layout=' => '', -# ',osd_pg_layout=' => ' + '} - } -}; diff --git a/tags/20070517_before_mds_merge/jobs/osd/mds_log b/tags/20070517_before_mds_merge/jobs/osd/mds_log deleted file mode 100644 index 0f99f6998dcfc..0000000000000 --- a/tags/20070517_before_mds_merge/jobs/osd/mds_log +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - #'_psub' => 'jobs/alc.tp', - 'sleep' => 3, - - 'nummds' => 1, - 'numclient' => [5, 10, 15, 25, 50, 100, 200, 300, 400], - #'numclient' => [ 50, 100, 200 ], - 'numosd' => [2,4],#[ 4, 8, 12, 16, 20, 24 ], - 'n' => 12, - - # parameters - 'fs' => 'fakestore',#['ebofs', 'fakestore','obfs'], - #'fs' => 'ebofs', - #'ebofs_commit_ms' => [ 1000, 5000 ], - #'osd_maxthreads' => [ 0, 1, 2, 4, 8 ], - - 'until' => 100, # --syn until $n ... 
when to stop clients - 'kill_after' => 300, - 'start' => 20, - 'end' => 90, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 3, - - - #'meta_log_layout_ssize' => [256, 512, 1024, 4096, 16384, 65536, 262400], - #'meta_log_layout_scount' => [2, 4, 8], - #'meta_log_layout_num_rep' => [1, 2], - #'meta_log_layout_num_rep' => 1, - - 'custom' => '--tcp_skip_rank0 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - 'comb' => { - 'x' => 'numclient',#'meta_log_layout_ssize', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/tags/20070517_before_mds_merge/jobs/osd/osd_threads b/tags/20070517_before_mds_merge/jobs/osd/osd_threads deleted file mode 100644 index ef271f9e88710..0000000000000 --- a/tags/20070517_before_mds_merge/jobs/osd/osd_threads +++ /dev/null @@ -1,33 +0,0 @@ -# hi there -{ - # startup - 'n' => 30, # mpi nodes - 'sleep' => 10, # seconds between runs - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => 50, - - # parameters - 'fs' => [ -# 'obfs', - 'fakestore', - 'ebofs' - ], - 'until' => 100, # --syn until $n ... 
when to stop clients - 'writefile' => 1, - 'writefile_size' => [ - 1024000, - 131072, - 65536, - 16 - ], - 'writefile_mb' => 1000, - - 'osd_maxthreads' => [0, 1, 2, 4, 8], - - 'custom' => '--tcp_skip_rank0', - - # for final summation (script/sum.pl) - 'start' => 30, - 'end' => 90 -}; diff --git a/tags/20070517_before_mds_merge/jobs/osd/striping b/tags/20070517_before_mds_merge/jobs/osd/striping deleted file mode 100644 index ea8cabe643274..0000000000000 --- a/tags/20070517_before_mds_merge/jobs/osd/striping +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/perl -# hi there -{ - # startup - #'n' => 28, # mpi nodes - - 'sleep' => 3, # seconds between runs - 'nummds' => 1, - - 'numosd' => [2,3,4,5,6,7,8,10,12], #[6, 8, 10, 12, 16], - 'numosd' => [14], - #'cper' => [4, 5, 6, 7, 8, 9, 10, 11, 12], #[1, 4, 6, 8, 16, 32, 64], - #'cper' => [4, 6, 8, 10, 12, 16, 24, 32 ], #[1, 4, 6, 8, 16, 32, 64], - 'cper' => [30], - - '_dep' => [ 'cnode' => '$numosd', - 'numclient' => '$cnode * $cper', - 'n' => 38],#'$nummds + $numosd + $cnode'], - #'numclient' => [5, 10, 20, 50, 75, 100, 150 ], - - 'start' => 30, - 'end' => 90, - 'until' => 100, # --syn until $n ... 
when to stop clients - 'kill_after' => 260, - - # parameters - 'fs' => 'ebofs', - 'writefile' => 1, - - 'writefile_size' => [# 4096, - # 16*1024, - # 64*1024, - # 256*1024, - 1024*1024 ], -# 'writefile_size' => [ -# 2048*1024, -# 1048576, -# 512*1024, -# 262144, -# 65536, -# 16384 -# ], - 'writefile_mb' => 1000, - - 'file_layout_num_rep'=> [1,2,3], - - 'osd_pg_bits' => 12,#[6, 8, 10, 12, 14], - - 'osd_object_layout' => [ 'hashino' ],#'hash', 'hashino', 'linear' ], - 'osd_pg_layout' => [ 'crush', 'linear' ],#, 'linear'],#, 'hash' ],#, 'linear' ],#, 'hash' ], - - #'custom' => '--tcp_skip_rank0', # --osd_maxthreads 0', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - - 'comb' => { - 'x' => 'numosd',#'writefile_size', - 'vars' => [ 'osd.c_wrb', 'cl.wrlat' ], -# 'maptitle' => { 'osd_object_layout=' => '', -# ',osd_pg_layout=' => ' + '} - } -}; - - -=item some googoo notes - -for 1mb 1x writes, - - with numosd=6, min cper=6 to saturate (cper_saturate) - googoo saturates at numosd=8. (osd_saturate) - - -> so, numosd=6 or 7 is a safe size! 
- - - - -=cut diff --git a/tags/20070517_before_mds_merge/jobs/osd/wr_lat2 b/tags/20070517_before_mds_merge/jobs/osd/wr_lat2 deleted file mode 100644 index 47053dd61f3ab..0000000000000 --- a/tags/20070517_before_mds_merge/jobs/osd/wr_lat2 +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => [12], - 'numclient' => [1],#, 40, 80, 160 ], - 'n' => 16, - - 'fs' => 'ebofs', - - 'start' => 10, - 'end' => 40, - 'until' => 40, - 'kill_after' => 90, - - 'writefile' => 1, - 'writefile_size' => [4096, - 8*1024, - 16*1024, - 32*1024, - 64*1024, - 128*1024, - 256*1024, - 512*1024, - 1024*1024], - 'writefile_mb' => 10000, - - #'tcp_multi_out' => [0,1], - -# 'mds_local_osd' => [ 0, 1 ], - 'file_layout_num_rep' => [1,2,3],#, 2, 3, 4], - - 'client_oc' => [0,1], - - 'custom' => '--tcp_skip_rank0', - - 'comb' => { - 'x' => 'writefile_size',#'file_layout_num_rep', - 'vars' => [ 'osd.c_wrb','cl.wrlat' ] - } -}; diff --git a/tags/20070517_before_mds_merge/jobs/osd/write_sizes b/tags/20070517_before_mds_merge/jobs/osd/write_sizes deleted file mode 100644 index 57369f3a97c50..0000000000000 --- a/tags/20070517_before_mds_merge/jobs/osd/write_sizes +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/perl -# hi there -{ - # startup - 'n' => 30, # mpi nodes - 'sleep' => 3, # seconds between runs - 'nummds' => 1, - 'numosd' => 6, - 'numclient' => 100,#[25,50,100,300],#100,#[10, 50, 100, 200, 400], - - 'until' => 100, # --syn until $n ... 
when to stop clients - 'kill_after' => 300, - - # parameters - 'fs' => [ -# 'obfs', - 'fakestore', -# 'ebofs' - ], - 'writefile' => 1, - 'writefile_size' => [ -# 2048*1024, - 1024*1024, - 512*1024, - 256*1024, - 128*1024, - 64*1024, - 48*1024, - 32*1024, - 28*1024, - 24*1024, - 16*1024, - 12*1024, - 8*1024, - 4096, -# 256, -# 16, -# 1 - ], - 'writefile_mb' => 1000, - - 'file_layout_num_rep'=> 1,#[1,2], - - -# 'ebofs_idle_commit_ms' => [ 100, 500 ], -# 'ebofs_abp_max_alloc' => [ 4096*16, 4096*64 ], - - 'custom' => '--debug_after 110 --debug_mds 15 --debug 5 --mds_shutdown_check 60', - - # for final summation (script/sum.pl) - 'start' => 30, - 'end' => 90, - - 'comb' => { - 'x' => 'writefile_size', - 'vars' => [ 'osd.c_wrb' ], -# 'maptitle' => { 'osd_object_layout=' => '', -# ',osd_pg_layout=' => ' + '} - } -}; diff --git a/tags/20070517_before_mds_merge/jobs/rados/map_dist b/tags/20070517_before_mds_merge/jobs/rados/map_dist deleted file mode 100644 index 39f16daa1cdc2..0000000000000 --- a/tags/20070517_before_mds_merge/jobs/rados/map_dist +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'osdbits' => [6,7,8],#,9],10,11], - 'pgperbits' => [3],#,4,5],#[4,6,8], - - 'nummds' => 1, - - '_dep' => [ 'numosd' => '1 << $osdbits', - 'osd_pg_bits' => '$pgperbits + $osdbits', - 'n' => '3 + $numosd / 32'], - 'numclient' => 0, - - 'fake_osdmap_updates' => [30], - - 'fs' => 'ebofs', - - 'start' => 30, - 'end' => 300, - 'kill_after' => 300, - - 'custom' => '--bdev_lock 0 --ms_stripe_osds --osd_maxthreads 0', - #'custom' => '--tcp_skip_rank0', - - 'comb' => { - 'x' => 'osdbits', - 'vars' => [ 'osd.sum=mapi', 'osd.sum=mapidup', 'osd.numpg', 'osd.pingset' ] - } -}; diff --git a/tags/20070517_before_mds_merge/jobs/rados/rep_lat b/tags/20070517_before_mds_merge/jobs/rados/rep_lat deleted file mode 100644 index 3f5ab0c8a7d87..0000000000000 --- a/tags/20070517_before_mds_merge/jobs/rados/rep_lat +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/perl - -# hi 
there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => 8, #[6], - 'numclient' => 1,#, 40, 80, 160 ], - 'n' => 10, - - 'fs' => 'ebofs', - - 'start' => 10, - 'end' => 40, - 'until' => 40, - 'kill_after' => 45, - - 'writefile' => 1, - 'writefile_size' => [4096, -# 8*1024, -# 16*1024, -# 32*1024, - 64*1024, -# 128*1024, -# 256*1024, -# 512*1024, -# 1024*1024 -], - 'writefile_mb' => 10000, - - 'osd_rep' => [0,1,2], - - 'file_layout_num_rep' => [1,2,3,4,5,6],#, 2, 3, 4], - - 'osd_pg_bits' => 4, - 'custom' => '--osd_max_rep 8', - - 'comb' => { - 'x' => 'file_layout_num_rep', - 'vars' => [ 'cl.wrlat' ] - } -}; diff --git a/tags/20070517_before_mds_merge/jobs/rados/wr_sizes b/tags/20070517_before_mds_merge/jobs/rados/wr_sizes deleted file mode 100644 index 9b73477ea6142..0000000000000 --- a/tags/20070517_before_mds_merge/jobs/rados/wr_sizes +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => [8],#10,14,16], - 'numclient' => [10*16], - 'n' => 15, - - 'fs' => 'ebofs', - - 'start' => 60, - 'end' => 90, - 'until' => 90, - 'kill_after' => 190, - - 'writefile' => 1, - 'writefile_size' => [4096, - 8*1024, - 16*1024, - 32*1024, - 64*1024, - 128*1024, - 256*1024, - # 512*1024, -# 4*1024*1024, -# 2*1024*1024, -# 1024*1024 -], - 'writefile_mb' => 10000, - - 'file_layout_num_rep' => 1, - 'file_layout_ssize' => 4*1024*1024, - 'file_layout_osize' => 4*1024*1024, - - 'osd_pg_bits' => 12, - -# 'ebofs_freelist' => [0, 1080, 65400], - - 'custom' => '--objecter_buffer_uncommitted 0', - - #'custom' => '--tcp_skip_rank0', - - 'comb' => { - 'x' => 'writefile_size', - 'vars' => [ 'osd.c_wrb', 'cl.wrlat' ] - } -}; diff --git a/tags/20070517_before_mds_merge/mds/Anchor.h b/tags/20070517_before_mds_merge/mds/Anchor.h deleted file mode 100644 index 8da2bbdb52cd5..0000000000000 --- a/tags/20070517_before_mds_merge/mds/Anchor.h +++ /dev/null @@ -1,55 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * 
Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __ANCHOR_H -#define __ANCHOR_H - -#include -using std::string; - -#include "include/types.h" -#include "include/buffer.h" - -class Anchor { -public: - inodeno_t ino; // my ino - inodeno_t dirino; // containing dir - string ref_dn; // referring dentry - int nref; // reference count - - Anchor() {} - Anchor(inodeno_t ino, inodeno_t dirino, string& ref_dn, int nref=0) { - this->ino = ino; - this->dirino = dirino; - this->ref_dn = ref_dn; - this->nref = nref; - } - - void _encode(bufferlist &bl) { - bl.append((char*)&ino, sizeof(ino)); - bl.append((char*)&dirino, sizeof(dirino)); - bl.append((char*)&nref, sizeof(nref)); - ::_encode(ref_dn, bl); - } - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - bl.copy(off, sizeof(dirino), (char*)&dirino); - off += sizeof(dirino); - bl.copy(off, sizeof(nref), (char*)&nref); - off += sizeof(nref); - ::_decode(ref_dn, bl, off); - } -} ; - -#endif diff --git a/tags/20070517_before_mds_merge/mds/AnchorClient.cc b/tags/20070517_before_mds_merge/mds/AnchorClient.cc deleted file mode 100644 index af84eb6c2448a..0000000000000 --- a/tags/20070517_before_mds_merge/mds/AnchorClient.cc +++ /dev/null @@ -1,149 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -using std::cout; -using std::cerr; -using std::endl; - -#include "Anchor.h" -#include "AnchorClient.h" -#include "MDSMap.h" - -#include "include/Context.h" -#include "msg/Messenger.h" - -#include "MDS.h" - -#include "messages/MAnchorRequest.h" -#include "messages/MAnchorReply.h" - -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug) cout << g_clock.now() << " " << messenger->get_myaddr() << ".anchorclient " -#define derr(x) if (x <= g_conf.debug) cout << g_clock.now() << " " << messenger->get_myaddr() << ".anchorclient " - - -void AnchorClient::dispatch(Message *m) -{ - switch (m->get_type()) { - case MSG_MDS_ANCHORREPLY: - handle_anchor_reply((MAnchorReply*)m); - break; - - default: - assert(0); - } -} - -void AnchorClient::handle_anchor_reply(class MAnchorReply *m) -{ - switch (m->get_op()) { - - case ANCHOR_OP_LOOKUP: - { - assert(pending_lookup_trace.count(m->get_ino()) == 1); - - *(pending_lookup_trace[ m->get_ino() ]) = m->get_trace(); - Context *onfinish = pending_lookup_context[ m->get_ino() ]; - - pending_lookup_trace.erase(m->get_ino()); - pending_lookup_context.erase(m->get_ino()); - - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } - } - break; - - case ANCHOR_OP_UPDATE: - case ANCHOR_OP_CREATE: - case ANCHOR_OP_DESTROY: - { - assert(pending_op.count(m->get_ino()) == 1); - - Context *onfinish = pending_op[m->get_ino()]; - pending_op.erase(m->get_ino()); - - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } - } - break; - - default: - assert(0); - } - -} - - - -/* - * public async interface - */ - -void AnchorClient::lookup(inodeno_t ino, vector& trace, Context *onfinish) -{ - // send message - MAnchorRequest *req = new MAnchorRequest(ANCHOR_OP_LOOKUP, ino); - - pending_lookup_trace[ino] = &trace; - pending_lookup_context[ino] = onfinish; - - messenger->send_message(req, - mdsmap->get_inst(mdsmap->get_anchortable()), - MDS_PORT_ANCHORMGR, MDS_PORT_ANCHORCLIENT); -} - -void 
AnchorClient::create(inodeno_t ino, vector& trace, Context *onfinish) -{ - // send message - MAnchorRequest *req = new MAnchorRequest(ANCHOR_OP_CREATE, ino); - req->set_trace(trace); - - pending_op[ino] = onfinish; - - messenger->send_message(req, - mdsmap->get_inst(mdsmap->get_anchortable()), - MDS_PORT_ANCHORMGR, MDS_PORT_ANCHORCLIENT); -} - -void AnchorClient::update(inodeno_t ino, vector& trace, Context *onfinish) -{ - // send message - MAnchorRequest *req = new MAnchorRequest(ANCHOR_OP_UPDATE, ino); - req->set_trace(trace); - - pending_op[ino] = onfinish; - - messenger->send_message(req, - mdsmap->get_inst(mdsmap->get_anchortable()), - MDS_PORT_ANCHORMGR, MDS_PORT_ANCHORCLIENT); -} - -void AnchorClient::destroy(inodeno_t ino, Context *onfinish) -{ - // send message - MAnchorRequest *req = new MAnchorRequest(ANCHOR_OP_DESTROY, ino); - - pending_op[ino] = onfinish; - - messenger->send_message(req, - mdsmap->get_inst(mdsmap->get_anchortable()), - MDS_PORT_ANCHORMGR, MDS_PORT_ANCHORCLIENT); -} - - diff --git a/tags/20070517_before_mds_merge/mds/AnchorClient.h b/tags/20070517_before_mds_merge/mds/AnchorClient.h deleted file mode 100644 index 80b736a4b65c7..0000000000000 --- a/tags/20070517_before_mds_merge/mds/AnchorClient.h +++ /dev/null @@ -1,55 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __ANCHORCLIENT_H -#define __ANCHORCLIENT_H - -#include -using std::vector; -#include -using __gnu_cxx::hash_map; - -#include "include/types.h" -#include "msg/Dispatcher.h" - -#include "Anchor.h" - -class Messenger; -class MDSMap; -class Context; - -class AnchorClient : public Dispatcher { - Messenger *messenger; - MDSMap *mdsmap; - - // remote state - hash_map pending_op; - hash_map pending_lookup_context; - hash_map*> pending_lookup_trace; - - void handle_anchor_reply(class MAnchorReply *m); - - -public: - AnchorClient(Messenger *ms, MDSMap *mm) : messenger(ms), mdsmap(mm) {} - - // async user interface - void lookup(inodeno_t ino, vector& trace, Context *onfinish); - void create(inodeno_t ino, vector& trace, Context *onfinish); - void update(inodeno_t ino, vector& trace, Context *onfinish); - void destroy(inodeno_t ino, Context *onfinish); - - void dispatch(Message *m); -}; - -#endif diff --git a/tags/20070517_before_mds_merge/mds/AnchorTable.cc b/tags/20070517_before_mds_merge/mds/AnchorTable.cc deleted file mode 100644 index 6f380b0908d8d..0000000000000 --- a/tags/20070517_before_mds_merge/mds/AnchorTable.cc +++ /dev/null @@ -1,358 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "AnchorTable.h" -#include "MDS.h" - -#include "osdc/Filer.h" - -#include "msg/Messenger.h" -#include "messages/MAnchorRequest.h" -#include "messages/MAnchorReply.h" - -#include "common/Clock.h" - -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug_mds) cout << g_clock.now() << " " << mds->messenger->get_myaddr() << ".anchortable " -#define derr(x) if (x <= g_conf.debug_mds) cerr << g_clock.now() << " " << mds->messenger->get_myaddr() << ".anchortable " - -AnchorTable::AnchorTable(MDS *mds) -{ - this->mds = mds; - opening = false; - opened = false; -} - -void AnchorTable::init_inode() -{ - memset(&table_inode, 0, sizeof(table_inode)); - table_inode.ino = MDS_INO_ANCHORTABLE+mds->get_nodeid(); - table_inode.layout = g_OSD_FileLayout; -} - -void AnchorTable::reset() -{ - init_inode(); - opened = true; - anchor_map.clear(); -} - -/* - * basic updates - */ - -bool AnchorTable::add(inodeno_t ino, inodeno_t dirino, string& ref_dn) -{ - dout(7) << "add " << std::hex << ino << " dirino " << dirino << std::dec << " ref_dn " << ref_dn << endl; - - // parent should be there - assert(dirino < 1000 || // system dirino - anchor_map.count(dirino)); // have - - if (anchor_map.count(ino) == 0) { - // new item - anchor_map[ ino ] = new Anchor(ino, dirino, ref_dn); - dout(10) << " add: added " << std::hex << ino << std::dec << endl; - return true; - } else { - dout(10) << " add: had " << std::hex << ino << std::dec << endl; - return false; - } -} - -void AnchorTable::inc(inodeno_t ino) -{ - dout(7) << "inc " << std::hex << ino << std::dec << endl; - - assert(anchor_map.count(ino) != 0); - Anchor *anchor = anchor_map[ino]; - assert(anchor); - - while (1) { - anchor->nref++; - - dout(10) << " inc: record " << std::hex << ino << std::dec << " now " << anchor->nref << endl; - ino = anchor->dirino; - - if (ino == 0) break; - if (anchor_map.count(ino) == 0) break; - anchor = anchor_map[ino]; - assert(anchor); - } -} - -void 
AnchorTable::dec(inodeno_t ino) -{ - dout(7) << "dec " << std::hex << ino << std::dec << endl; - - assert(anchor_map.count(ino) != 0); - Anchor *anchor = anchor_map[ino]; - assert(anchor); - - while (true) { - anchor->nref--; - - if (anchor->nref == 0) { - dout(10) << " dec: record " << std::hex << ino << std::dec << " now 0, removing" << endl; - inodeno_t dirino = anchor->dirino; - anchor_map.erase(ino); - delete anchor; - ino = dirino; - } else { - dout(10) << " dec: record " << std::hex << ino << std::dec << " now " << anchor->nref << endl; - ino = anchor->dirino; - } - - if (ino == 0) break; - if (anchor_map.count(ino) == 0) break; - anchor = anchor_map[ino]; - assert(anchor); - } -} - - -/* - * high level - */ - -void AnchorTable::lookup(inodeno_t ino, vector& trace) -{ - dout(7) << "lookup " << std::hex << ino << std::dec << endl; - - assert(anchor_map.count(ino) == 1); - Anchor *anchor = anchor_map[ino]; - assert(anchor); - - while (true) { - dout(10) << " record " << std::hex << anchor->ino << " dirino " << anchor->dirino << std::dec << " ref_dn " << anchor->ref_dn << endl; - trace.insert(trace.begin(), anchor); // lame FIXME - - if (anchor->dirino < MDS_INO_BASE) break; - - assert(anchor_map.count(anchor->dirino) == 1); - anchor = anchor_map[anchor->dirino]; - assert(anchor); - } -} - -void AnchorTable::create(inodeno_t ino, vector& trace) -{ - dout(7) << "create " << std::hex << ino << std::dec << endl; - - // make sure trace is in table - for (unsigned i=0; iino, trace[i]->dirino, trace[i]->ref_dn); - - inc(ino); // ok! -} - -void AnchorTable::destroy(inodeno_t ino) -{ - dec(ino); -} - - - -/* - * messages - */ - -void AnchorTable::dispatch(Message *m) -{ - switch (m->get_type()) { - case MSG_MDS_ANCHORREQUEST: - handle_anchor_request((MAnchorRequest*)m); - break; - - default: - assert(0); - } -} - - - -void AnchorTable::handle_anchor_request(class MAnchorRequest *m) -{ - // make sure i'm open! 
- if (!opened) { - dout(7) << "not open yet" << endl; - - waiting_for_open.push_back(new C_MDS_RetryMessage(mds,m)); - - if (!opening) { - opening = true; - load(0); - } - return; - } - - // go - MAnchorReply *reply = new MAnchorReply(m); - - switch (m->get_op()) { - - case ANCHOR_OP_LOOKUP: - lookup( m->get_ino(), reply->get_trace() ); - break; - - case ANCHOR_OP_UPDATE: - destroy( m->get_ino() ); - create( m->get_ino(), m->get_trace() ); - break; - - case ANCHOR_OP_CREATE: - create( m->get_ino(), m->get_trace() ); - break; - - case ANCHOR_OP_DESTROY: - destroy( m->get_ino() ); - break; - - default: - assert(0); - } - - // send reply - mds->messenger->send_message(reply, m->get_source_inst(), m->get_source_port()); - delete m; -} - - - - -// primitive load/save for now! - -// load/save entire table for now! - -void AnchorTable::save(Context *onfinish) -{ - dout(7) << "save" << endl; - if (!opened) return; - - // build up write - bufferlist tabbl; - - int num = anchor_map.size(); - tabbl.append((char*)&num, sizeof(int)); - - for (hash_map::iterator it = anchor_map.begin(); - it != anchor_map.end(); - it++) { - dout(14) << " saving anchor for " << std::hex << it->first << std::dec << endl; - Anchor *a = it->second; - assert(a); - a->_encode(tabbl); - } - - bufferlist bl; - size_t size = tabbl.length(); - bl.append((char*)&size, sizeof(size)); - bl.claim_append(tabbl); - - dout(7) << " " << num << " anchors, " << size << " bytes" << endl; - - // write! 
- mds->filer->write(table_inode, - 0, bl.length(), - bl, 0, - NULL, onfinish); -} - - - -class C_AT_Load : public Context { - AnchorTable *at; -public: - size_t size; - bufferlist bl; - C_AT_Load(size_t size, AnchorTable *at) { - this->size = size; - this->at = at; - } - void finish(int result) { - assert(result > 0); - - at->load_2(size, bl); - } -}; - -class C_AT_LoadSize : public Context { - AnchorTable *at; - MDS *mds; -public: - bufferlist bl; - C_AT_LoadSize(AnchorTable *at, MDS *mds) { - this->at = at; - this->mds = mds; - } - void finish(int r) { - size_t size = 0; - assert(bl.length() >= sizeof(size)); - bl.copy(0, sizeof(size), (char*)&size); - cout << "r is " << r << " size is " << size << endl; - if (r > 0 && size > 0) { - C_AT_Load *c = new C_AT_Load(size, at); - mds->filer->read(at->table_inode, - sizeof(size), size, - &c->bl, - c); - } else { - // fail - bufferlist empty; - at->load_2(0, empty); - } - } -}; - -void AnchorTable::load(Context *onfinish) -{ - dout(7) << "load" << endl; - init_inode(); - - assert(!opened); - - waiting_for_open.push_back(onfinish); - - C_AT_LoadSize *c = new C_AT_LoadSize(this, mds); - mds->filer->read(table_inode, - 0, sizeof(size_t), - &c->bl, - c); -} - -void AnchorTable::load_2(size_t size, bufferlist& bl) -{ - // num - int off = 0; - int num; - bl.copy(0, sizeof(num), (char*)&num); - off += sizeof(num); - - // parse anchors - for (int i=0; i_decode(bl, off); - dout(10) << "load_2 decoded " << std::hex << a->ino << " dirino " << a->dirino << std::dec << " ref_dn " << a->ref_dn << endl; - anchor_map[a->ino] = a; - } - - dout(7) << "load_2 got " << num << " anchors" << endl; - - opened = true; - opening = false; - - // finish - finish_contexts(waiting_for_open); -} - diff --git a/tags/20070517_before_mds_merge/mds/AnchorTable.h b/tags/20070517_before_mds_merge/mds/AnchorTable.h deleted file mode 100644 index 0b0af03af5b68..0000000000000 --- a/tags/20070517_before_mds_merge/mds/AnchorTable.h +++ /dev/null @@ -1,81 +0,0 
@@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __ANCHORTABLE_H -#define __ANCHORTABLE_H - -#include "Anchor.h" -#include "include/Context.h" - -#include -using namespace __gnu_cxx; - -class MDS; - - -class AnchorTable { - MDS *mds; - hash_map anchor_map; - - bool opening, opened; - list waiting_for_open; - - public: - inode_t table_inode; - - public: - AnchorTable(MDS *mds); - - protected: - void init_inode(); // call this before doing anything. - - // - bool have_ino(inodeno_t ino) { - return true; // always in memory for now. - } - void fetch_ino(inodeno_t ino, Context *onfinish) { - assert(!opened); - load(onfinish); - } - - // adjust table - bool add(inodeno_t ino, inodeno_t dirino, string& ref_dn); - void inc(inodeno_t ino); - void dec(inodeno_t ino); - - - // high level interface - void lookup(inodeno_t ino, vector& trace); - void create(inodeno_t ino, vector& trace); - void destroy(inodeno_t ino); - - // messages - public: - void dispatch(class Message *m); - protected: - void handle_anchor_request(class MAnchorRequest *m); - - - public: - - // load/save entire table for now! 
- void reset(); - void save(Context *onfinish); - void load(Context *onfinish); - void load_2(size_t size, bufferlist& bl); - - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/mds/CDentry.cc b/tags/20070517_before_mds_merge/mds/CDentry.cc deleted file mode 100644 index 22d292a001e33..0000000000000 --- a/tags/20070517_before_mds_merge/mds/CDentry.cc +++ /dev/null @@ -1,203 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "CDentry.h" -#include "CInode.h" -#include "CDir.h" - -#include "MDS.h" -#include "MDCache.h" - -#include - -#undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << dir->cache->mds->get_nodeid() << ".cache.den(" << dir->ino() << " " << name << ") " - - -// CDentry - -ostream& operator<<(ostream& out, CDentry& dn) -{ - string path; - dn.make_path(path); - - out << "[dentry " << path; - if (dn.is_auth()) { - out << " auth"; - if (dn.is_replicated()) - out << dn.get_replicas(); - } else { - out << " rep@" << dn.authority(); - out << "." 
<< dn.get_replica_nonce(); - assert(dn.get_replica_nonce() >= 0); - } - - if (dn.is_null()) out << " NULL"; - if (dn.is_remote()) out << " REMOTE"; - - if (dn.is_pinned()) out << " " << dn.num_pins() << " pathpins"; - - if (dn.get_lockstate() == DN_LOCK_UNPINNING) out << " unpinning"; - if (dn.get_lockstate() == DN_LOCK_PREXLOCK) out << " prexlock=" << dn.get_xlockedby() << " g=" << dn.get_gather_set(); - if (dn.get_lockstate() == DN_LOCK_XLOCK) out << " xlock=" << dn.get_xlockedby(); - - out << " v=" << dn.get_version(); - out << " pv=" << dn.get_projected_version(); - - out << " inode=" << dn.get_inode(); - - if (dn.get_num_ref()) { - out << " |"; - for(set::iterator it = dn.get_ref_set().begin(); - it != dn.get_ref_set().end(); - it++) - out << " " << CDentry::pin_name(*it); - } - - out << " " << &dn; - out << "]"; - return out; -} - -CDentry::CDentry(const CDentry& m) { - assert(0); //std::cerr << "copy cons called, implement me" << endl; -} - - -inodeno_t CDentry::get_ino() -{ - if (inode) - return inode->ino(); - return inodeno_t(); -} - - -int CDentry::authority() -{ - return dir->dentry_authority( name ); -} - - -version_t CDentry::pre_dirty() -{ - // NOTE: in the future, this will dirty a particular slice/subset of the dir. - projected_version = dir->pre_dirty(); - dout(10) << " pre_dirty " << *this << endl; - return projected_version; -} - - -void CDentry::_mark_dirty() -{ - // state+pin - if (!state_test(STATE_DIRTY)) { - state_set(STATE_DIRTY); - get(PIN_DIRTY); - } -} - -void CDentry::mark_dirty(version_t pv) -{ - dout(10) << " mark_dirty " << *this << endl; - - // i now live in this new dir version - assert(pv == projected_version); - version = pv; - _mark_dirty(); - - // mark dir too - dir->mark_dirty(pv); -} - -void CDentry::mark_clean() { - dout(10) << " mark_clean " << *this << endl; - assert(is_dirty()); - assert(version <= dir->get_version()); - - // this happens on export. 
- //assert(version <= dir->get_last_committed_version()); - - // state+pin - state_clear(STATE_DIRTY); - put(PIN_DIRTY); -} - - -void CDentry::make_path(string& s) -{ - if (dir) { - if (dir->inode->get_parent_dn()) - dir->inode->get_parent_dn()->make_path(s); - } else { - s = "???"; - } - s += "/"; - s += name; -} - - -void CDentry::link_remote(CInode *in) -{ - assert(is_remote()); - assert(in->ino() == remote_ino); - - inode = in; - in->add_remote_parent(this); -} - -void CDentry::unlink_remote() -{ - assert(is_remote()); - assert(inode); - - inode->remove_remote_parent(this); - inode = 0; -} - - -CDentryDiscover *CDentry::replicate_to(int who) -{ - int nonce = add_replica(who); - return new CDentryDiscover(this, nonce); -} - - - - -// = -const CDentry& CDentry::operator= (const CDentry& right) { - assert(0); //std::cerr << "copy op called, implement me" << endl; - return *this; -} - - // comparisons - bool CDentry::operator== (const CDentry& right) const { - return name == right.name; - } - bool CDentry::operator!= (const CDentry& right) const { - return name == right.name; - } - bool CDentry::operator< (const CDentry& right) const { - return name < right.name; - } - bool CDentry::operator> (const CDentry& right) const { - return name > right.name; - } - bool CDentry::operator>= (const CDentry& right) const { - return name >= right.name; - } - bool CDentry::operator<= (const CDentry& right) const { - return name <= right.name; - } diff --git a/tags/20070517_before_mds_merge/mds/CDentry.h b/tags/20070517_before_mds_merge/mds/CDentry.h deleted file mode 100644 index 65b9155ce69f9..0000000000000 --- a/tags/20070517_before_mds_merge/mds/CDentry.h +++ /dev/null @@ -1,288 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * 
License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __CDENTRY_H -#define __CDENTRY_H - -#include -#include -#include -using namespace std; - -#include "include/types.h" -#include "include/buffer.h" -#include "include/lru.h" -#include "mdstypes.h" - -class CInode; -class CDir; - -#define DN_LOCK_SYNC 0 -#define DN_LOCK_PREXLOCK 1 -#define DN_LOCK_XLOCK 2 -#define DN_LOCK_UNPINNING 3 // waiting for pins to go away .. FIXME REVIEW THIS CODE .. - -#define DN_XLOCK_FOREIGN ((Message*)0x1) // not 0, not a valid pointer. - -class Message; -class CDentryDiscover; - -// dentry -class CDentry : public MDSCacheObject, public LRUObject { - public: - // state - static const int STATE_AUTH = (1<<0); - static const int STATE_DIRTY = (1<<1); - - // pins - static const int PIN_INODEPIN = 0; // linked inode is pinned - static const int PIN_REPLICATED = 1; // replicated by another MDS - static const int PIN_DIRTY = 2; // - static const int PIN_PROXY = 3; // - static const char *pin_name(int p) { - switch (p) { - case PIN_INODEPIN: return "inodepin"; - case PIN_REPLICATED: return "replicated"; - case PIN_DIRTY: return "dirty"; - case PIN_PROXY: return "proxy"; - default: assert(0); - } - }; - - - protected: - string name; - CInode *inode; - CDir *dir; - - inodeno_t remote_ino; // if remote dentry - - version_t version; // dir version when last touched. - version_t projected_version; // what it will be when i unlock/commit. 
- - // locking - int lockstate; - Message *xlockedby; - set gather_set; - - // path pins - int npins; - multiset pinset; - - friend class Migrator; - friend class Locker; - friend class Renamer; - friend class Server; - friend class MDCache; - friend class MDS; - friend class CInode; - friend class C_MDC_XlockRequest; - - public: - // cons - CDentry() : - inode(0), - dir(0), - remote_ino(0), - version(0), - projected_version(0), - lockstate(DN_LOCK_SYNC), - xlockedby(0), - npins(0) { } - CDentry(const string& n, inodeno_t ino, CInode *in=0) : - name(n), - inode(in), - dir(0), - remote_ino(ino), - version(0), - projected_version(0), - lockstate(DN_LOCK_SYNC), - xlockedby(0), - npins(0) { } - CDentry(const string& n, CInode *in) : - name(n), - inode(in), - dir(0), - remote_ino(0), - version(0), - projected_version(0), - lockstate(DN_LOCK_SYNC), - xlockedby(0), - npins(0) { } - - CInode *get_inode() { return inode; } - CDir *get_dir() { return dir; } - const string& get_name() { return name; } - inodeno_t get_ino(); - inodeno_t get_remote_ino() { return remote_ino; } - - void set_remote_ino(inodeno_t ino) { remote_ino = ino; } - - - // ref counts: pin ourselves in the LRU when we're pinned. - void first_get() { - lru_pin(); - } - void last_put() { - lru_unpin(); - } - - - // dentry type is primary || remote || null - // inode ptr is required for primary, optional for remote, undefined for null - bool is_primary() { return remote_ino == 0 && inode != 0; } - bool is_remote() { return remote_ino > 0; } - bool is_null() { return (remote_ino == 0 && inode == 0) ? 
true:false; } - - // remote links - void link_remote(CInode *in); - void unlink_remote(); - - - // copy cons - CDentry(const CDentry& m); - const CDentry& operator= (const CDentry& right); - - // comparisons - bool operator== (const CDentry& right) const; - bool operator!= (const CDentry& right) const; - bool operator< (const CDentry& right) const; - bool operator> (const CDentry& right) const; - bool operator>= (const CDentry& right) const; - bool operator<= (const CDentry& right) const; - - // misc - void make_path(string& p); - - // -- state - version_t get_version() { return version; } - void set_version(version_t v) { projected_version = version = v; } - version_t get_projected_version() { return projected_version; } - void set_projected_version(version_t v) { projected_version = v; } - - int authority(); - - bool is_auth() { return state & STATE_AUTH; } - bool is_dirty() { return state & STATE_DIRTY; } - bool is_clean() { return !is_dirty(); } - - version_t pre_dirty(); - void _mark_dirty(); - void mark_dirty(version_t projected_dirv); - void mark_clean(); - - - // -- replication - CDentryDiscover *replicate_to(int rep); - - - // -- locking - int get_lockstate() { return lockstate; } - set& get_gather_set() { return gather_set; } - - bool is_sync() { return lockstate == DN_LOCK_SYNC; } - bool can_read() { return (lockstate == DN_LOCK_SYNC) || (lockstate == DN_LOCK_UNPINNING); } - bool can_read(Message *m) { return is_xlockedbyme(m) || can_read(); } - bool is_xlocked() { return lockstate == DN_LOCK_XLOCK; } - Message* get_xlockedby() { return xlockedby; } - bool is_xlockedbyother(Message *m) { return (lockstate == DN_LOCK_XLOCK) && m != xlockedby; } - bool is_xlockedbyme(Message *m) { return (lockstate == DN_LOCK_XLOCK) && m == xlockedby; } - bool is_prexlockbyother(Message *m) { - return (lockstate == DN_LOCK_PREXLOCK) && m != xlockedby; - } - - int get_replica_lockstate() { - switch (lockstate) { - case DN_LOCK_XLOCK: - case DN_LOCK_SYNC: - return lockstate; 
- case DN_LOCK_PREXLOCK: - return DN_LOCK_XLOCK; - case DN_LOCK_UNPINNING: - return DN_LOCK_SYNC; - } - assert(0); - return 0; - } - void set_lockstate(int s) { lockstate = s; } - - // path pins - void pin(Message *m) { - npins++; - pinset.insert(m); - assert(pinset.size() == (unsigned)npins); - } - void unpin(Message *m) { - npins--; - assert(npins >= 0); - assert(pinset.count(m) > 0); - pinset.erase(pinset.find(m)); - assert(pinset.size() == (unsigned)npins); - } - bool is_pinnable(Message *m) { - return (lockstate == DN_LOCK_SYNC) || - (lockstate == DN_LOCK_UNPINNING && pinset.count(m)); - } - bool is_pinned() { return npins>0; } - int num_pins() { return npins; } - - friend class CDir; -}; - -ostream& operator<<(ostream& out, CDentry& dn); - - -class CDentryDiscover { - string dname; - int replica_nonce; - int lockstate; - - inodeno_t ino; - inodeno_t remote_ino; - -public: - CDentryDiscover() {} - CDentryDiscover(CDentry *dn, int nonce) : - dname(dn->get_name()), replica_nonce(nonce), - lockstate(dn->get_replica_lockstate()), - ino(dn->get_ino()), - remote_ino(dn->get_remote_ino()) { } - - string& get_dname() { return dname; } - int get_nonce() { return replica_nonce; } - - void update_dentry(CDentry *dn) { - dn->set_replica_nonce( replica_nonce ); - dn->set_lockstate( lockstate ); - } - - void _encode(bufferlist& bl) { - ::_encode(dname, bl); - bl.append((char*)&replica_nonce, sizeof(replica_nonce)); - bl.append((char*)&lockstate, sizeof(lockstate)); - } - - void _decode(bufferlist& bl, int& off) { - ::_decode(dname, bl, off); - bl.copy(off, sizeof(replica_nonce), (char*)&replica_nonce); - off += sizeof(replica_nonce); - bl.copy(off, sizeof(lockstate), (char*)&lockstate); - off += sizeof(lockstate); - } - -}; - - -#endif diff --git a/tags/20070517_before_mds_merge/mds/CDir.cc b/tags/20070517_before_mds_merge/mds/CDir.cc deleted file mode 100644 index c9b9996d91c2d..0000000000000 --- a/tags/20070517_before_mds_merge/mds/CDir.cc +++ /dev/null @@ -1,890 +0,0 @@ 
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "CDir.h" -#include "CDentry.h" -#include "CInode.h" - -#include "MDS.h" -#include "MDCache.h" -#include "MDSMap.h" - -#include "include/Context.h" -#include "common/Clock.h" - -#include - -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache.dir(" << inode->inode.ino << ") " - - -// PINS -//int cdir_pins[CDIR_NUM_PINS] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; - - - -ostream& operator<<(ostream& out, CDir& dir) -{ - string path; - dir.get_inode()->make_path(path); - out << "[dir " << dir.ino() << " " << path << "/"; - if (dir.is_auth()) { - out << " auth"; - if (dir.is_replicated()) - out << dir.get_replicas(); - - out << " v=" << dir.get_version(); - out << " pv=" << dir.get_projected_version(); - out << " cv=" << dir.get_committing_version(); - out << " lastcv=" << dir.get_last_committed_version(); - } else { - out << " rep@" << dir.authority(); - if (dir.get_replica_nonce() > 1) - out << "." 
<< dir.get_replica_nonce(); - } - - if (dir.get_dir_auth() != CDIR_AUTH_PARENT) - out << " dir_auth=" << dir.get_dir_auth(); - - out << " state=" << dir.get_state(); - if (dir.state_test(CDIR_STATE_PROXY)) out << "|proxy"; - if (dir.state_test(CDIR_STATE_COMPLETE)) out << "|complete"; - if (dir.state_test(CDIR_STATE_FREEZINGTREE)) out << "|freezingtree"; - if (dir.state_test(CDIR_STATE_FROZENTREE)) out << "|frozentree"; - if (dir.state_test(CDIR_STATE_FROZENTREELEAF)) out << "|frozentreeleaf"; - if (dir.state_test(CDIR_STATE_FROZENDIR)) out << "|frozendir"; - if (dir.state_test(CDIR_STATE_FREEZINGDIR)) out << "|freezingdir"; - - out << " sz=" << dir.get_nitems() << "+" << dir.get_nnull(); - - if (dir.get_num_ref()) { - out << " |"; - for(set::iterator it = dir.get_ref_set().begin(); - it != dir.get_ref_set().end(); - it++) - out << " " << CDir::pin_name(*it); - } - - out << " " << &dir; - return out << "]"; -} - - -// ------------------------------------------------------------------- -// CDir - -CDir::CDir(CInode *in, MDCache *mdcache, bool auth) -{ - inode = in; - this->cache = mdcache; - - nitems = 0; - nnull = 0; - state = CDIR_STATE_INITIAL; - - projected_version = version = 0; - committing_version = 0; - last_committed_version = 0; - - ref = 0; - - // auth - dir_auth = -1; - assert(in->is_dir()); - if (auth) - state |= CDIR_STATE_AUTH; - /* - if (in->dir_is_hashed()) { - assert(0); // when does this happen? 
- state |= CDIR_STATE_HASHED; - } - */ - - auth_pins = 0; - nested_auth_pins = 0; - request_pins = 0; - - dir_rep = CDIR_REP_NONE; -} - - - - -/*** - * linking fun - */ - -CDentry* CDir::add_dentry( const string& dname, inodeno_t ino, bool auth) -{ - // foreign - assert(lookup(dname) == 0); - - // create dentry - CDentry* dn = new CDentry(dname, ino); - if (auth) - dn->state_set(CDentry::STATE_AUTH); - cache->lru.lru_insert_mid(dn); - - dn->dir = this; - dn->version = projected_version; - - // add to dir - assert(items.count(dn->name) == 0); - assert(null_items.count(dn->name) == 0); - - items[dn->name] = dn; - nitems++; - - dout(12) << "add_dentry " << *dn << endl; - - // pin? - if (nnull + nitems == 1) get(PIN_CHILD); - - assert(nnull + nitems == items.size()); - assert(nnull == null_items.size()); - return dn; -} - - -CDentry* CDir::add_dentry( const string& dname, CInode *in, bool auth ) -{ - // primary - assert(lookup(dname) == 0); - - // create dentry - CDentry* dn = new CDentry(dname, in); - if (auth) - dn->state_set(CDentry::STATE_AUTH); - cache->lru.lru_insert_mid(dn); - - dn->dir = this; - dn->version = projected_version; - - // add to dir - assert(items.count(dn->name) == 0); - assert(null_items.count(dn->name) == 0); - - items[dn->name] = dn; - - if (in) { - link_inode_work( dn, in ); - } else { - assert(dn->inode == 0); - null_items[dn->name] = dn; - nnull++; - } - - dout(12) << "add_dentry " << *dn << endl; - - // pin? 
- if (nnull + nitems == 1) get(PIN_CHILD); - - assert(nnull + nitems == items.size()); - assert(nnull == null_items.size()); - return dn; -} - - - -void CDir::remove_dentry(CDentry *dn) -{ - dout(12) << "remove_dentry " << *dn << endl; - - if (dn->inode) { - // detach inode and dentry - unlink_inode_work(dn); - } else { - // remove from null list - assert(null_items.count(dn->name) == 1); - null_items.erase(dn->name); - nnull--; - } - - // remove from list - assert(items.count(dn->name) == 1); - items.erase(dn->name); - - cache->lru.lru_remove(dn); - delete dn; - - // unpin? - if (nnull + nitems == 0) put(PIN_CHILD); - - assert(nnull + nitems == items.size()); - assert(nnull == null_items.size()); -} - -void CDir::link_inode( CDentry *dn, inodeno_t ino) -{ - dout(12) << "link_inode " << *dn << " remote " << ino << endl; - - assert(dn->is_null()); - dn->set_remote_ino(ino); - nitems++; - - assert(null_items.count(dn->name) == 1); - null_items.erase(dn->name); - nnull--; -} - -void CDir::link_inode( CDentry *dn, CInode *in ) -{ - dout(12) << "link_inode " << *dn << " " << *in << endl; - assert(!dn->is_remote()); - - link_inode_work(dn,in); - - // remove from null list - assert(null_items.count(dn->name) == 1); - null_items.erase(dn->name); - nnull--; - - assert(nnull + nitems == items.size()); - assert(nnull == null_items.size()); -} - -void CDir::link_inode_work( CDentry *dn, CInode *in ) -{ - dn->inode = in; - in->set_primary_parent(dn); - - nitems++; // adjust dir size - - // set dir version - in->inode.version = dn->get_version(); - - // clear dangling - in->state_clear(CInode::STATE_DANGLING); - - // pin dentry? 
- if (in->get_num_ref()) - dn->get(CDentry::PIN_INODEPIN); - - // adjust auth pin count - if (in->auth_pins + in->nested_auth_pins) - adjust_nested_auth_pins( in->auth_pins + in->nested_auth_pins ); -} - -void CDir::unlink_inode( CDentry *dn ) -{ - dout(12) << "unlink_inode " << *dn << " " << *dn->inode << endl; - - unlink_inode_work(dn); - - // add to null list - assert(null_items.count(dn->name) == 0); - null_items[dn->name] = dn; - nnull++; - - assert(nnull + nitems == items.size()); - assert(nnull == null_items.size()); -} - -void CDir::unlink_inode_work( CDentry *dn ) -{ - CInode *in = dn->inode; - - if (dn->is_remote()) { - // remote - if (in) - dn->unlink_remote(); - - dn->set_remote_ino(0); - } else { - // primary - assert(dn->is_primary()); - - // explicitly define auth - in->dangling_auth = in->authority(); - //dout(10) << "unlink_inode " << *in << " dangling_auth now " << in->dangling_auth << endl; - - // unpin dentry? - if (in->get_num_ref()) - dn->put(CDentry::PIN_INODEPIN); - - // unlink auth_pin count - if (in->auth_pins + in->nested_auth_pins) - adjust_nested_auth_pins( 0 - (in->auth_pins + in->nested_auth_pins) ); - - // set dangling flag - in->state_set(CInode::STATE_DANGLING); - - // detach inode - in->remove_primary_parent(dn); - dn->inode = 0; - } - - nitems--; // adjust dir size -} - -void CDir::remove_null_dentries() { - dout(12) << "remove_null_dentries " << *this << endl; - - list dns; - for (CDir_map_t::iterator it = null_items.begin(); - it != null_items.end(); - it++) { - dns.push_back(it->second); - } - - for (list::iterator it = dns.begin(); - it != dns.end(); - it++) { - CDentry *dn = *it; - assert(dn->is_sync()); - remove_dentry(dn); - } - assert(null_items.empty()); - assert(nnull == 0); - assert(nnull + nitems == items.size()); -} - - - -/**************************************** - * WAITING - */ - -bool CDir::waiting_for(int tag) -{ - return waiting.count(tag) > 0; -} - -bool CDir::waiting_for(int tag, const string& dn) -{ - if 
(!waiting_on_dentry.count(dn)) - return false; - return waiting_on_dentry[dn].count(tag) > 0; -} - -void CDir::add_waiter(int tag, - const string& dentry, - Context *c) { - if (waiting.empty() && waiting_on_dentry.size() == 0) - get(PIN_WAITER); - waiting_on_dentry[ dentry ].insert(pair(tag,c)); - dout(10) << "add_waiter dentry " << dentry << " tag " << tag << " " << c << " on " << *this << endl; -} - -void CDir::add_waiter(int tag, Context *c) { - // hierarchical? - if (tag & CDIR_WAIT_ATFREEZEROOT && (is_freezing() || is_frozen())) { - if (is_freezing_tree_root() || is_frozen_tree_root() || - is_freezing_dir() || is_frozen_dir()) { - // it's us, pin here. (fall thru) - } else { - // pin parent! - dout(10) << "add_waiter " << tag << " " << c << " should be ATFREEZEROOT, " << *this << " is not root, trying parent" << endl; - inode->parent->dir->add_waiter(tag, c); - return; - } - } - - // this dir. - if (waiting.empty() && waiting_on_dentry.size() == 0) - get(PIN_WAITER); - waiting.insert(pair(tag,c)); - dout(10) << "add_waiter " << tag << " " << c << " on " << *this << endl; -} - - -void CDir::take_waiting(int mask, - const string& dentry, - list& ls, - int num) -{ - if (waiting_on_dentry.empty()) return; - - multimap::iterator it = waiting_on_dentry[dentry].begin(); - while (it != waiting_on_dentry[dentry].end()) { - if (it->first & mask) { - ls.push_back(it->second); - dout(10) << "take_waiting dentry " << dentry << " mask " << mask << " took " << it->second << " tag " << it->first << " on " << *this << endl; - waiting_on_dentry[dentry].erase(it++); - - if (num) { - if (num == 1) break; - num--; - } - } else { - dout(10) << "take_waiting dentry " << dentry << " mask " << mask << " SKIPPING " << it->second << " tag " << it->first << " on " << *this << endl; - it++; - } - } - - // did we clear dentry? - if (waiting_on_dentry[dentry].empty()) - waiting_on_dentry.erase(dentry); - - // ...whole map? 
- if (waiting_on_dentry.size() == 0 && waiting.empty()) - put(PIN_WAITER); -} - -/* NOTE: this checks dentry waiters too */ -void CDir::take_waiting(int mask, - list& ls) -{ - if (waiting_on_dentry.size()) { - // try each dentry - hash_map >::iterator it = - waiting_on_dentry.begin(); - while (it != waiting_on_dentry.end()) { - take_waiting(mask, (it++)->first, ls); // not post-inc - } - } - - // waiting - if (!waiting.empty()) { - multimap::iterator it = waiting.begin(); - while (it != waiting.end()) { - if (it->first & mask) { - ls.push_back(it->second); - dout(10) << "take_waiting mask " << mask << " took " << it->second << " tag " << it->first << " on " << *this << endl; - waiting.erase(it++); - } else { - dout(10) << "take_waiting mask " << mask << " SKIPPING " << it->second << " tag " << it->first << " on " << *this<< endl; - it++; - } - } - - if (waiting_on_dentry.size() == 0 && waiting.empty()) - put(PIN_WAITER); - } -} - - -void CDir::finish_waiting(int mask, int result) -{ - dout(11) << "finish_waiting mask " << mask << " result " << result << " on " << *this << endl; - - list finished; - take_waiting(mask, finished); - finish_contexts(finished, result); -} - -void CDir::finish_waiting(int mask, const string& dn, int result) -{ - dout(11) << "finish_waiting mask " << mask << " dn " << dn << " result " << result << " on " << *this << endl; - - list finished; - take_waiting(mask, dn, finished); - finish_contexts(finished, result); -} - - -// dirty/clean - -version_t CDir::pre_dirty() -{ - ++projected_version; - dout(10) << "pre_dirty " << projected_version << endl; - return projected_version; -} - -void CDir::_mark_dirty() -{ - if (!state_test(CDIR_STATE_DIRTY)) { - state_set(CDIR_STATE_DIRTY); - dout(10) << "mark_dirty (was clean) " << *this << " version " << version << endl; - get(PIN_DIRTY); - } else { - dout(10) << "mark_dirty (already dirty) " << *this << " version " << version << endl; - } -} - -void CDir::mark_dirty(version_t pv) -{ - ++version; - 
assert(pv == version); - _mark_dirty(); -} - -void CDir::mark_clean() -{ - dout(10) << "mark_clean " << *this << " version " << version << endl; - if (state_test(CDIR_STATE_DIRTY)) { - state_clear(CDIR_STATE_DIRTY); - put(PIN_DIRTY); - } -} - - - - -void CDir::first_get() -{ - inode->get(CInode::PIN_DIR); -} - -void CDir::last_put() -{ - inode->put(CInode::PIN_DIR); -} - - - -/******************************** - * AUTHORITY - */ - -/* - * simple rule: if dir_auth isn't explicit, auth is the same as the inode. - */ -int CDir::authority() -{ - if (dir_auth == CDIR_AUTH_PARENT) - return inode->authority(); - return dir_auth; -} - -int CDir::dentry_authority(const string& dn ) -{ - // hashing -- subset of nodes have hashed the contents - if (is_hashing() && !hashed_subset.empty()) { - int hashauth = cache->hash_dentry( inode->ino(), dn ); // hashed - if (hashed_subset.count(hashauth)) - return hashauth; - } - - // hashed - if (is_hashed()) { - return cache->hash_dentry( inode->ino(), dn ); // hashed - } - - if (get_dir_auth() == CDIR_AUTH_PARENT) { - //dout(15) << "dir_auth = parent at " << *this << endl; - return inode->authority(); // same as my inode - } - - // it's explicit for this whole dir - //dout(15) << "dir_auth explicit " << dir_auth << " at " << *this << endl; - return get_dir_auth(); -} - -void CDir::set_dir_auth(int d) -{ - dout(10) << "setting dir_auth=" << d << " from " << dir_auth << " on " << *this << endl; - dir_auth = d; -} - - -/***************************************** - * AUTH PINS - */ - -void CDir::auth_pin() { - if (auth_pins == 0) - get(PIN_AUTHPIN); - auth_pins++; - - dout(7) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl; - - inode->nested_auth_pins++; - if (inode->parent) - inode->parent->dir->adjust_nested_auth_pins( 1 ); -} - -void CDir::auth_unpin() { - auth_pins--; - if (auth_pins == 0) - put(PIN_AUTHPIN); - - dout(7) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << 
nested_auth_pins << endl; - assert(auth_pins >= 0); - - // pending freeze? - if (auth_pins + nested_auth_pins == 0) - on_freezeable(); - - inode->nested_auth_pins--; - if (inode->parent) - inode->parent->dir->adjust_nested_auth_pins( -1 ); -} - -void CDir::adjust_nested_auth_pins(int inc) -{ - CDir *dir = this; - - while (1) { - // dir - dir->nested_auth_pins += inc; - - dout(10) << "adjust_nested_auth_pins on " << *dir << " count now " << dir->auth_pins << " + " << dir->nested_auth_pins << endl; - assert(dir->nested_auth_pins >= 0); - - // pending freeze? - if (dir->auth_pins + dir->nested_auth_pins == 0) - dir->on_freezeable(); - - // it's inode - dir->inode->nested_auth_pins += inc; - - if (dir->inode->parent) - dir = dir->inode->parent->dir; - else - break; - } -} - - - -/***************************************************************************** - * FREEZING - */ - -void CDir::on_freezeable() -{ - // check for anything pending freezeable - - /* NOTE: the first of these will likely freeze the dir, and unmark - FREEZING. additional ones will re-flag FREEZING. this isn't - particularly graceful, and might cause problems if the first one - needs to know about other waiters.... FIXME? 
*/ - - finish_waiting(CDIR_WAIT_FREEZEABLE); -} - -// FREEZE TREE - -class C_MDS_FreezeTree : public Context { - CDir *dir; - Context *con; -public: - C_MDS_FreezeTree(CDir *dir, Context *c) { - this->dir = dir; - this->con = c; - } - virtual void finish(int r) { - dir->freeze_tree_finish(con); - } -}; - -void CDir::freeze_tree(Context *c) -{ - assert(!is_frozen()); - assert(!is_freezing()); - - if (is_freezeable()) { - dout(10) << "freeze_tree " << *this << endl; - - state_set(CDIR_STATE_FROZENTREE); - inode->auth_pin(); // auth_pin for duration of freeze - - // easy, we're frozen - c->finish(0); - delete c; - - } else { - state_set(CDIR_STATE_FREEZINGTREE); - dout(10) << "freeze_tree + wait " << *this << endl; - - // need to wait for auth pins to expire - add_waiter(CDIR_WAIT_FREEZEABLE, new C_MDS_FreezeTree(this, c)); - } -} - -void CDir::freeze_tree_finish(Context *c) -{ - // freezeable now? - if (!is_freezeable()) { - // wait again! - dout(10) << "freeze_tree_finish still waiting " << *this << endl; - state_set(CDIR_STATE_FREEZINGTREE); - add_waiter(CDIR_WAIT_FREEZEABLE, new C_MDS_FreezeTree(this, c)); - return; - } - - dout(10) << "freeze_tree_finish " << *this << endl; - state_set(CDIR_STATE_FROZENTREE); - state_clear(CDIR_STATE_FREEZINGTREE); // actually, this may get set again by next context? - - inode->auth_pin(); // auth_pin for duration of freeze - - // continue to frozen land - if (c) { - c->finish(0); - delete c; - } -} - -void CDir::unfreeze_tree() -{ - dout(10) << "unfreeze_tree " << *this << endl; - - if (state_test(CDIR_STATE_FROZENTREE)) { - // frozen. unfreeze. - state_clear(CDIR_STATE_FROZENTREE); - - // unpin (may => FREEZEABLE) FIXME: is this order good? - inode->auth_unpin(); - - // waiters? - finish_waiting(CDIR_WAIT_UNFREEZE); - } else { - // freezing. stop it. 
- assert(state_test(CDIR_STATE_FREEZINGTREE)); - state_clear(CDIR_STATE_FREEZINGTREE); - - // cancel freeze waiters - finish_waiting(CDIR_WAIT_FREEZEABLE, -1); - } -} - -bool CDir::is_freezing_tree() -{ - CDir *dir = this; - while (1) { - if (dir->is_freezing_tree_root()) return true; - if (dir->is_import()) return false; - if (dir->is_hashed()) return false; - if (dir->inode->parent) - dir = dir->inode->parent->dir; - else - return false; // root on replica - } -} - -bool CDir::is_frozen_tree() -{ - CDir *dir = this; - while (1) { - if (dir->is_frozen_tree_root()) return true; - if (dir->is_import()) return false; - if (dir->is_hashed()) return false; - if (dir->is_frozen_tree_leaf()) return false; - if (dir->inode->parent) - dir = dir->inode->parent->dir; - else - return false; // root on replica - } -} - - - -// FREEZE DIR - -class C_MDS_FreezeDir : public Context { - CDir *dir; - Context *con; -public: - C_MDS_FreezeDir(CDir *dir, Context *c) { - this->dir = dir; - this->con = c; - } - virtual void finish(int r) { - dir->freeze_dir_finish(con); - } -}; - -void CDir::freeze_dir(Context *c) -{ - assert(!is_frozen()); - assert(!is_freezing()); - - if (is_freezeable_dir()) { - dout(10) << "freeze_dir " << *this << endl; - - state_set(CDIR_STATE_FROZENDIR); - inode->auth_pin(); // auth_pin for duration of freeze - - // easy, we're frozen - c->finish(0); - delete c; - - } else { - state_set(CDIR_STATE_FREEZINGDIR); - dout(10) << "freeze_dir + wait " << *this << endl; - - // need to wait for auth pins to expire - add_waiter(CDIR_WAIT_FREEZEABLE, new C_MDS_FreezeDir(this, c)); - } -} - -void CDir::freeze_dir_finish(Context *c) -{ - // freezeable now? - if (!is_freezeable_dir()) { - // wait again! 
- dout(10) << "freeze_dir_finish still waiting " << *this << endl; - state_set(CDIR_STATE_FREEZINGDIR); - add_waiter(CDIR_WAIT_FREEZEABLE, new C_MDS_FreezeDir(this, c)); - return; - } - - dout(10) << "freeze_dir_finish " << *this << endl; - state_set(CDIR_STATE_FROZENDIR); - state_clear(CDIR_STATE_FREEZINGDIR); // actually, this may get set again by next context? - - inode->auth_pin(); // auth_pin for duration of freeze - - // continue to frozen land - if (c) { - c->finish(0); - delete c; - } -} - -void CDir::unfreeze_dir() -{ - dout(10) << "unfreeze_dir " << *this << endl; - state_clear(CDIR_STATE_FROZENDIR); - - // unpin (may => FREEZEABLE) FIXME: is this order good? - inode->auth_unpin(); - - // waiters? - finish_waiting(CDIR_WAIT_UNFREEZE); -} - - - - - - - - - -// ----------------------------------------------------------------- -// debug shite - - -void CDir::dump(int depth) { - string ind(depth, '\t'); - - dout(10) << "dump:" << ind << *this << endl; - - map::iterator iter = items.begin(); - while (iter != items.end()) { - CDentry* d = iter->second; - if (d->inode) { - char isdir = ' '; - if (d->inode->dir != NULL) isdir = '/'; - dout(10) << "dump: " << ind << *d << " = " << *d->inode << endl; - d->inode->dump(depth+1); - } else { - dout(10) << "dump: " << ind << *d << " = [null]" << endl; - } - iter++; - } - - if (!(state_test(CDIR_STATE_COMPLETE))) - dout(10) << ind << "..." 
<< endl; - if (state_test(CDIR_STATE_DIRTY)) - dout(10) << ind << "[dirty]" << endl; - -} - diff --git a/tags/20070517_before_mds_merge/mds/CDir.h b/tags/20070517_before_mds_merge/mds/CDir.h deleted file mode 100644 index 6283bef7c0aff..0000000000000 --- a/tags/20070517_before_mds_merge/mds/CDir.h +++ /dev/null @@ -1,617 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __CDIR_H -#define __CDIR_H - -#include "include/types.h" -#include "include/buffer.h" -#include "mdstypes.h" -#include "config.h" -#include "common/DecayCounter.h" - -#include -#include - -#include -#include -#include -#include -using namespace std; - -#include -using __gnu_cxx::hash_map; - - -#include "CInode.h" - -class CDentry; -class MDCache; -class MDCluster; -class Context; - - -// directory authority types -// >= 0 is the auth mds -#define CDIR_AUTH_PARENT -1 // default -#define CDIR_AUTH_UNKNOWN -2 - - -#define CDIR_NONCE_EXPORT 1 - - -// state bits -#define CDIR_STATE_AUTH (1<<0) // auth for this dir (hashing doesn't count) -#define CDIR_STATE_PROXY (1<<1) // proxy auth - -#define CDIR_STATE_COMPLETE (1<<2) // the complete contents are in cache -#define CDIR_STATE_DIRTY (1<<3) // has been modified since last commit - -#define CDIR_STATE_FROZENTREE (1<<4) // root of tree (bounded by exports) -#define CDIR_STATE_FREEZINGTREE (1<<5) // in process of freezing -#define CDIR_STATE_FROZENTREELEAF (1<<6) // outer bound of frozen region (on import) -#define CDIR_STATE_FROZENDIR (1<<7) -#define CDIR_STATE_FREEZINGDIR (1<<8) - -#define CDIR_STATE_COMMITTING (1<<9) // mid-commit -#define CDIR_STATE_FETCHING (1<<10) // currenting fetching - 
-#define CDIR_STATE_DELETED (1<<11) - -#define CDIR_STATE_IMPORT (1<<12) // flag set if this is an import. -#define CDIR_STATE_EXPORT (1<<13) -#define CDIR_STATE_IMPORTINGEXPORT (1<<14) - -#define CDIR_STATE_HASHED (1<<15) // if hashed -#define CDIR_STATE_HASHING (1<<16) -#define CDIR_STATE_UNHASHING (1<<17) - - - - - -// these state bits are preserved by an import/export -// ...except if the directory is hashed, in which case none of them are! -#define CDIR_MASK_STATE_EXPORTED (CDIR_STATE_COMPLETE\ - |CDIR_STATE_DIRTY) -#define CDIR_MASK_STATE_IMPORT_KEPT (CDIR_STATE_IMPORT\ - |CDIR_STATE_EXPORT\ - |CDIR_STATE_IMPORTINGEXPORT\ - |CDIR_STATE_FROZENTREE\ - |CDIR_STATE_PROXY) - -#define CDIR_MASK_STATE_EXPORT_KEPT (CDIR_STATE_HASHED\ - |CDIR_STATE_FROZENTREE\ - |CDIR_STATE_FROZENDIR\ - |CDIR_STATE_EXPORT\ - |CDIR_STATE_PROXY) - -// common states -#define CDIR_STATE_CLEAN 0 -#define CDIR_STATE_INITIAL 0 - -// directory replication -#define CDIR_REP_ALL 1 -#define CDIR_REP_NONE 0 -#define CDIR_REP_LIST 2 - - - - - - -// wait reasons -#define CDIR_WAIT_DENTRY 1 // wait for item to be in cache - // waiters: path_traverse - // trigger: handle_discover, fetch_dir_2 -#define CDIR_WAIT_COMPLETE 2 // wait for complete dir contents - // waiters: fetch_dir, commit_dir - // trigger: fetch_dir_2 -#define CDIR_WAIT_FREEZEABLE 4 // hard_pins removed - // waiters: freeze, freeze_finish - // trigger: auth_unpin, adjust_nested_auth_pins -#define CDIR_WAIT_UNFREEZE 8 // unfreeze - // waiters: path_traverse, handle_discover, handle_inode_update, - // export_dir_frozen (mdcache) - // handle_client_readdir (mds) - // trigger: unfreeze -#define CDIR_WAIT_AUTHPINNABLE CDIR_WAIT_UNFREEZE - // waiters: commit_dir (mdstore) - // trigger: (see CDIR_WAIT_UNFREEZE) -#define CDIR_WAIT_COMMITTED 32 // did commit (who uses this?**) - // waiters: commit_dir (if already committing) - // trigger: commit_dir_2 -#define CDIR_WAIT_IMPORTED 64 // import finish - // waiters: import_dir_block - // triggers: 
handle_export_dir_finish - -#define CDIR_WAIT_EXPORTWARNING 8192 // on bystander. - // watiers: handle_export_dir_notify - // triggers: handle_export_dir_warning -#define CDIR_WAIT_EXPORTPREPACK 16384 - // waiter export_dir - // trigger handel_export_dir_prep_ack - -#define CDIR_WAIT_HASHED (1<<17) // hash finish -#define CDIR_WAIT_THISHASHEDREADDIR (1<<18) // current readdir lock -#define CDIR_WAIT_NEXTHASHEDREADDIR (1<<19) // after current readdir lock finishes - -#define CDIR_WAIT_DNREAD (1<<20) -#define CDIR_WAIT_DNLOCK (1<<21) -#define CDIR_WAIT_DNUNPINNED (1<<22) -#define CDIR_WAIT_DNPINNABLE (CDIR_WAIT_DNREAD|CDIR_WAIT_DNUNPINNED) - -#define CDIR_WAIT_DNREQXLOCK (1<<23) - -#define CDIR_WAIT_ANY (0xffffffff) - -#define CDIR_WAIT_ATFREEZEROOT (CDIR_WAIT_AUTHPINNABLE|\ - CDIR_WAIT_UNFREEZE) // hmm, same same - - -ostream& operator<<(ostream& out, class CDir& dir); - - -// CDir -typedef map CDir_map_t; - - -//extern int cdir_pins[CDIR_NUM_PINS]; - - -class CDir : public MDSCacheObject { - public: - // -- pins -- - static const int PIN_CHILD = 0; - static const int PIN_OPENED = 1; // open by another node - static const int PIN_WAITER = 2; // waiter(s) - static const int PIN_IMPORT = 3; - static const int PIN_EXPORT = 4; - //static const int PIN_FREEZE = 5; - static const int PIN_FREEZELEAF = 6; - static const int PIN_PROXY = 7; // auth just changed. 
- static const int PIN_AUTHPIN = 8; - static const int PIN_IMPORTING = 9; - static const int PIN_IMPORTINGEXPORT = 10; - static const int PIN_HASHED = 11; - static const int PIN_HASHING = 12; - static const int PIN_DIRTY = 13; - static const int PIN_REQUEST = 14; - static const char *pin_name(int p) { - switch (p) { - case PIN_CHILD: return "child"; - case PIN_OPENED: return "opened"; - case PIN_WAITER: return "waiter"; - case PIN_IMPORT: return "import"; - case PIN_EXPORT: return "export"; - //case PIN_FREEZE: return "freeze"; - case PIN_FREEZELEAF: return "freezeleaf"; - case PIN_PROXY: return "proxy"; - case PIN_AUTHPIN: return "authpin"; - case PIN_IMPORTING: return "importing"; - case PIN_IMPORTINGEXPORT: return "importingexport"; - case PIN_HASHED: return "hashed"; - case PIN_HASHING: return "hashing"; - case PIN_DIRTY: return "dirty"; - case PIN_REQUEST: return "request"; - default: assert(0); - } - } - - - public: - // context - MDCache *cache; - - // my inode - CInode *inode; - - protected: - // contents - CDir_map_t items; // non-null AND null - CDir_map_t null_items; // null and foreign - size_t nitems; // non-null - size_t nnull; // null - - // state - version_t version; - version_t committing_version; - version_t last_committed_version; // slight lie; we bump this on import. 
- version_t projected_version; - - // authority, replicas - int dir_auth; - - // lock nesting, freeze - int auth_pins; - int nested_auth_pins; - int request_pins; - - // hashed dirs - set hashed_subset; // HASHING: subset of mds's that are hashed - public: - // for class MDS - map, list > > hashed_readdir; - protected: - - - - // waiters - multimap waiting; // tag -> context - hash_map< string, multimap > - waiting_on_dentry; - - // cache control (defined for authority; hints for replicas) - int dir_rep; - set dir_rep_by; // if dir_rep == CDIR_REP_LIST - - // popularity - meta_load_t popularity[MDS_NPOP]; - - // friends - friend class Migrator; - friend class CInode; - friend class MDCache; - friend class MDiscover; - friend class MDBalancer; - - friend class CDirDiscover; - friend class CDirExport; - - public: - CDir(CInode *in, MDCache *mdcache, bool auth); - - - - // -- accessors -- - inodeno_t ino() { return inode->ino(); } - CInode *get_inode() { return inode; } - CDir *get_parent_dir() { return inode->get_parent_dir(); } - - CDir_map_t::iterator begin() { return items.begin(); } - CDir_map_t::iterator end() { return items.end(); } - size_t get_size() { - return nitems; - } - size_t get_nitems() { return nitems; } - size_t get_nnull() { return nnull; } - - /* - float get_popularity() { - return popularity[0].get(); - } - */ - - - // -- dentries and inodes -- - public: - CDentry* lookup(const string& n) { - map::iterator iter = items.find(n); - if (iter == items.end()) - return 0; - else - return iter->second; - } - - CDentry* add_dentry( const string& dname, CInode *in=0, bool auth=true ); - CDentry* add_dentry( const string& dname, inodeno_t ino, bool auth=true ); - void remove_dentry( CDentry *dn ); // delete dentry - void link_inode( CDentry *dn, inodeno_t ino ); - void link_inode( CDentry *dn, CInode *in ); - void unlink_inode( CDentry *dn ); - private: - void link_inode_work( CDentry *dn, CInode *in ); - void unlink_inode_work( CDentry *dn ); - - void 
remove_null_dentries(); // on empty, clean dir - - // -- authority -- - public: - int authority(); - int dentry_authority(const string& d); - int get_dir_auth() { return dir_auth; } - void set_dir_auth(int d); - - - - // for giving to clients - void get_dist_spec(set& ls, int auth) { - if (( popularity[MDS_POP_CURDOM].pop[META_POP_IRD].get() > g_conf.mds_bal_replicate_threshold)) { - //if (!cached_by.empty() && inode.ino > 1) dout(1) << "distributed spec for " << *this << endl; - for (map::iterator p = replicas_begin(); - p != replicas_end(); - ++p) - ls.insert(p->first); - if (!ls.empty()) - ls.insert(auth); - } - } - - - // -- state -- - bool is_complete() { return state & CDIR_STATE_COMPLETE; } - bool is_dirty() { return state_test(CDIR_STATE_DIRTY); } - - bool is_auth() { return state & CDIR_STATE_AUTH; } - bool is_proxy() { return state & CDIR_STATE_PROXY; } - bool is_import() { return state & CDIR_STATE_IMPORT; } - bool is_export() { return state & CDIR_STATE_EXPORT; } - - bool is_hashed() { return state & CDIR_STATE_HASHED; } - bool is_hashing() { return state & CDIR_STATE_HASHING; } - bool is_unhashing() { return state & CDIR_STATE_UNHASHING; } - - bool is_rep() { - if (dir_rep == CDIR_REP_NONE) return false; - return true; - } - - - - // -- dirtyness -- - version_t get_version() { return version; } - void set_version(version_t v) { projected_version = version = v; } - version_t get_projected_version() { return projected_version; } - - version_t get_committing_version() { return committing_version; } - version_t get_last_committed_version() { return last_committed_version; } - // as in, we're committing the current version. 
- void set_committing_version() { committing_version = version; } - void set_last_committed_version(version_t v) { last_committed_version = v; } - - version_t pre_dirty(); - void _mark_dirty(); - void mark_dirty(version_t pv); - void mark_clean(); - void mark_complete() { state_set(CDIR_STATE_COMPLETE); } - bool is_clean() { return !state_test(CDIR_STATE_DIRTY); } - - - - - // -- reference counting -- - void first_get(); - void last_put(); - - void request_pin_get() { - if (request_pins == 0) get(PIN_REQUEST); - request_pins++; - } - void request_pin_put() { - request_pins--; - if (request_pins == 0) put(PIN_REQUEST); - } - - - // -- waiters -- - bool waiting_for(int tag); - bool waiting_for(int tag, const string& dn); - void add_waiter(int tag, Context *c); - void add_waiter(int tag, - const string& dentry, - Context *c); - void take_waiting(int mask, list& ls); // includes dentry waiters - void take_waiting(int mask, - const string& dentry, - list& ls, - int num=0); - void finish_waiting(int mask, int result = 0); // ditto - void finish_waiting(int mask, const string& dn, int result = 0); // ditto - - - // -- auth pins -- - bool can_auth_pin() { return !(is_frozen() || is_freezing()); } - int is_auth_pinned() { return auth_pins; } - void auth_pin(); - void auth_unpin(); - void adjust_nested_auth_pins(int inc); - void on_freezeable(); - - // -- freezing -- - void freeze_tree(Context *c); - void freeze_tree_finish(Context *c); - void unfreeze_tree(); - - void freeze_dir(Context *c); - void freeze_dir_finish(Context *c); - void unfreeze_dir(); - - bool is_freezing() { return is_freezing_tree() || is_freezing_dir(); } - bool is_freezing_tree(); - bool is_freezing_tree_root() { return state & CDIR_STATE_FREEZINGTREE; } - bool is_freezing_dir() { return state & CDIR_STATE_FREEZINGDIR; } - - bool is_frozen() { return is_frozen_dir() || is_frozen_tree(); } - bool is_frozen_tree(); - bool is_frozen_tree_root() { return state & CDIR_STATE_FROZENTREE; } - bool 
is_frozen_tree_leaf() { return state & CDIR_STATE_FROZENTREELEAF; } - bool is_frozen_dir() { return state & CDIR_STATE_FROZENDIR; } - - bool is_freezeable() { - if (auth_pins == 0 && nested_auth_pins == 0) return true; - return false; - } - bool is_freezeable_dir() { - if (auth_pins == 0) return true; - return false; - } - - - - // debuggin bs - void dump(int d = 0); -}; - - - -// -- encoded state -- - -// discover - -class CDirDiscover { - inodeno_t ino; - int nonce; - int dir_auth; - int dir_rep; - set rep_by; - - public: - CDirDiscover() {} - CDirDiscover(CDir *dir, int nonce) { - ino = dir->ino(); - this->nonce = nonce; - dir_auth = dir->dir_auth; - dir_rep = dir->dir_rep; - rep_by = dir->dir_rep_by; - } - - void update_dir(CDir *dir) { - assert(dir->ino() == ino); - assert(!dir->is_auth()); - - dir->replica_nonce = nonce; - dir->dir_auth = dir_auth; - dir->dir_rep = dir_rep; - dir->dir_rep_by = rep_by; - } - - inodeno_t get_ino() { return ino; } - - - void _encode(bufferlist& bl) { - bl.append((char*)&ino, sizeof(ino)); - bl.append((char*)&nonce, sizeof(nonce)); - bl.append((char*)&dir_auth, sizeof(dir_auth)); - bl.append((char*)&dir_rep, sizeof(dir_rep)); - ::_encode(rep_by, bl); - } - - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - bl.copy(off, sizeof(nonce), (char*)&nonce); - off += sizeof(nonce); - bl.copy(off, sizeof(dir_auth), (char*)&dir_auth); - off += sizeof(dir_auth); - bl.copy(off, sizeof(dir_rep), (char*)&dir_rep); - off += sizeof(dir_rep); - ::_decode(rep_by, bl, off); - } - -}; - - -// export - -class CDirExport { - struct { - inodeno_t ino; - long nitems; // actual real entries - long nden; // num dentries (including null ones) - version_t version; - unsigned state; - meta_load_t popularity_justme; - meta_load_t popularity_curdom; - int dir_rep; - } st; - map replicas; - set rep_by; - - public: - CDirExport() {} - CDirExport(CDir *dir) { - memset(&st, 0, sizeof(st)); - - 
assert(dir->get_version() == dir->get_projected_version()); - - st.ino = dir->ino(); - st.nitems = dir->nitems; - st.nden = dir->items.size(); - st.version = dir->version; - st.state = dir->state; - st.dir_rep = dir->dir_rep; - - st.popularity_justme.take( dir->popularity[MDS_POP_JUSTME] ); - st.popularity_curdom.take( dir->popularity[MDS_POP_CURDOM] ); - dir->popularity[MDS_POP_ANYDOM] -= st.popularity_curdom; - dir->popularity[MDS_POP_NESTED] -= st.popularity_curdom; - - rep_by = dir->dir_rep_by; - replicas = dir->replicas; - } - - inodeno_t get_ino() { return st.ino; } - __uint64_t get_nden() { return st.nden; } - - void update_dir(CDir *dir) { - assert(dir->ino() == st.ino); - - //dir->nitems = st.nitems; - - // set last_committed_version at old version - dir->committing_version = dir->last_committed_version = st.version; - dir->projected_version = dir->version = st.version; // this is bumped, below, if dirty - - // twiddle state - if (dir->state & CDIR_STATE_HASHED) - dir->state_set( CDIR_STATE_AUTH ); // just inherit auth flag when hashed - else - dir->state = (dir->state & CDIR_MASK_STATE_IMPORT_KEPT) | // remember import flag, etc. 
- (st.state & CDIR_MASK_STATE_EXPORTED); - dir->dir_rep = st.dir_rep; - - dir->popularity[MDS_POP_JUSTME] += st.popularity_justme; - dir->popularity[MDS_POP_CURDOM] += st.popularity_curdom; - dir->popularity[MDS_POP_ANYDOM] += st.popularity_curdom; - dir->popularity[MDS_POP_NESTED] += st.popularity_curdom; - - dir->replica_nonce = 0; // no longer defined - - if (!dir->replicas.empty()) - dout(0) << "replicas not empty non import, " << *dir << ", " << dir->replicas << endl; - - dir->dir_rep_by = rep_by; - dir->replicas = replicas; - dout(12) << "replicas in export is " << replicas << ", dir now " << dir->replicas << endl; - if (!replicas.empty()) - dir->get(CDir::PIN_OPENED); - if (dir->is_dirty()) { - dir->get(CDir::PIN_DIRTY); - - // bump dir version + 1 if dirty - dir->projected_version = dir->version = st.version + 1; - } - } - - - void _encode(bufferlist& bl) { - bl.append((char*)&st, sizeof(st)); - ::_encode(replicas, bl); - ::_encode(rep_by, bl); - } - - int _decode(bufferlist& bl, int off = 0) { - bl.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - ::_decode(replicas, bl, off); - ::_decode(rep_by, bl, off); - return off; - } - -}; - - - -#endif diff --git a/tags/20070517_before_mds_merge/mds/CInode.cc b/tags/20070517_before_mds_merge/mds/CInode.cc deleted file mode 100644 index f431184fb199b..0000000000000 --- a/tags/20070517_before_mds_merge/mds/CInode.cc +++ /dev/null @@ -1,506 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "CInode.h" -#include "CDir.h" -#include "CDentry.h" - -#include "MDS.h" -#include "MDCache.h" -#include "AnchorTable.h" - -#include "common/Clock.h" - -#include - -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mdcache->mds->get_nodeid() << ".cache.ino(" << inode.ino << ") " - - -//int cinode_pins[CINODE_NUM_PINS]; // counts - - -ostream& operator<<(ostream& out, CInode& in) -{ - string path; - in.make_path(path); - out << "[inode " << in.inode.ino << " " << path << (in.is_dir() ? "/ ":" "); - if (in.is_auth()) { - out << "auth"; - if (in.is_replicated()) - out << in.get_replicas(); - } else { - out << "rep@" << in.authority(); - out << "." << in.get_replica_nonce(); - assert(in.get_replica_nonce() >= 0); - } - - if (in.is_symlink()) out << " symlink"; - - out << " v" << in.get_version(); - - out << " hard=" << in.hardlock; - out << " file=" << in.filelock; - - if (in.get_num_ref()) { - out << " |"; - for(set::iterator it = in.get_ref_set().begin(); - it != in.get_ref_set().end(); - it++) - out << " " << CInode::pin_name(*it); - } - - // hack: spit out crap on which clients have caps - if (!in.get_client_caps().empty()) { - out << " caps={"; - for (map::iterator it = in.get_client_caps().begin(); - it != in.get_client_caps().end(); - it++) { - if (it != in.get_client_caps().begin()) out << ","; - out << it->first; - } - out << "}"; - } - out << " " << ∈ - out << "]"; - return out; -} - - -// ====== CInode ======= -CInode::CInode(MDCache *c, bool auth) { - mdcache = c; - - ref = 0; - - num_parents = 0; - parent = NULL; - - dir = NULL; // CDir opened separately - - auth_pins = 0; - nested_auth_pins = 0; - num_request_pins = 0; - - state = 0; - - if (auth) state_set(STATE_AUTH); -} - -CInode::~CInode() { - if (dir) { delete dir; dir = 0; } -} - - -// pins - -void CInode::first_get() -{ - // pin my dentry? 
- if (parent) - parent->get(CDentry::PIN_INODEPIN); -} - -void CInode::last_put() -{ - // unpin my dentry? - if (parent) { - parent->put(CDentry::PIN_INODEPIN); - } - if (num_parents == 0 && get_num_ref() == 0) - mdcache->inode_expire_queue.push_back(this); // queue myself for garbage collection -} - -void CInode::get_parent() -{ - num_parents++; -} -void CInode::put_parent() -{ - num_parents--; - if (num_parents == 0 && get_num_ref() == 0) - mdcache->inode_expire_queue.push_back(this); // queue myself for garbage collection -} - - - -CDir *CInode::get_parent_dir() -{ - if (parent) - return parent->dir; - return NULL; -} -CInode *CInode::get_parent_inode() -{ - if (parent) - return parent->dir->inode; - return NULL; -} - -bool CInode::dir_is_auth() { - if (dir) - return dir->is_auth(); - else - return is_auth(); -} - -CDir *CInode::get_or_open_dir(MDCache *mdcache) -{ - assert(is_dir()); - - if (dir) return dir; - - // can't open a dir if we're frozen_dir, bc of hashing stuff. - assert(!is_frozen_dir()); - - // only auth can open dir alone. 
- assert(is_auth()); - set_dir( new CDir(this, mdcache, true) ); - dir->dir_auth = -1; - return dir; -} - -CDir *CInode::set_dir(CDir *newdir) -{ - assert(dir == 0); - dir = newdir; - return dir; -} - -void CInode::close_dir() -{ - assert(dir); - assert(dir->get_num_ref() == 0); - delete dir; - dir = 0; -} - - -void CInode::set_auth(bool a) -{ - if (!is_dangling() && !is_root() && - is_auth() != a) { - } - - if (a) state_set(STATE_AUTH); - else state_clear(STATE_AUTH); -} - - - -void CInode::make_path(string& s) -{ - if (parent) { - parent->make_path(s); - } - else if (is_root()) { - s = ""; // root - } - else { - s = "(dangling)"; // dangling - } -} - -void CInode::make_anchor_trace(vector& trace) -{ - if (parent) { - parent->dir->inode->make_anchor_trace(trace); - - dout(7) << "make_anchor_trace adding " << ino() << " dirino " << parent->dir->inode->ino() << " dn " << parent->name << endl; - trace.push_back( new Anchor(ino(), - parent->dir->inode->ino(), - parent->name) ); - } - else if (state_test(STATE_DANGLING)) { - dout(7) << "make_anchor_trace dangling " << ino() << " on mds " << dangling_auth << endl; - string ref_dn; - trace.push_back( new Anchor(ino(), - MDS_INO_INODEFILE_OFFSET+dangling_auth, - ref_dn) ); - } - else - assert(is_root()); -} - - - - -version_t CInode::pre_dirty() -{ - assert(parent); - return parent->pre_dirty(); -} - -void CInode::_mark_dirty() -{ - if (!state_test(STATE_DIRTY)) { - state_set(STATE_DIRTY); - get(PIN_DIRTY); - } -} - -void CInode::mark_dirty(version_t pv) { - - dout(10) << "mark_dirty " << *this << endl; - - assert(parent); - - /* - NOTE: I may already be dirty, but this fn _still_ needs to be called so that - the directory is (perhaps newly) dirtied, and so that parent_dir_version is - updated below. - */ - - // only auth can get dirty. "dirty" async data in replicas is relative to - // filelock state, not the dirty flag. 
- assert(is_auth()); - - // touch my private version - assert(inode.version < pv); - inode.version = pv; - _mark_dirty(); - - // mark dentry too - parent->mark_dirty(pv); -} - -void CInode::mark_clean() -{ - dout(10) << " mark_clean " << *this << endl; - if (state_test(STATE_DIRTY)) { - state_clear(STATE_DIRTY); - put(PIN_DIRTY); - } -} - -// state - - - - - -// new state encoders - -void CInode::encode_file_state(bufferlist& bl) -{ - bl.append((char*)&inode.size, sizeof(inode.size)); - bl.append((char*)&inode.mtime, sizeof(inode.mtime)); - bl.append((char*)&inode.atime, sizeof(inode.atime)); // ?? -} - -void CInode::decode_file_state(bufferlist& r, int& off) -{ - r.copy(off, sizeof(inode.size), (char*)&inode.size); - off += sizeof(inode.size); - r.copy(off, sizeof(inode.mtime), (char*)&inode.mtime); - off += sizeof(inode.mtime); - r.copy(off, sizeof(inode.atime), (char*)&inode.atime); - off += sizeof(inode.atime); -} - -/* not used currently -void CInode::decode_merge_file_state(crope& r, int& off) -{ - __uint64_t size; - r.copy(off, sizeof(size), (char*)&size); - off += sizeof(size); - if (size > inode.size) inode.size = size; - - time_t t; - r.copy(off, sizeof(t), (char*)&t); - off += sizeof(t); - if (t > inode.mtime) inode.mtime = t; - - r.copy(off, sizeof(t), (char*)&t); - off += sizeof(t); - if (t > inode.atime) inode.atime = t; -} -*/ - -void CInode::encode_hard_state(bufferlist& r) -{ - r.append((char*)&inode.mode, sizeof(inode.mode)); - r.append((char*)&inode.uid, sizeof(inode.uid)); - r.append((char*)&inode.gid, sizeof(inode.gid)); - r.append((char*)&inode.ctime, sizeof(inode.ctime)); -} - -void CInode::decode_hard_state(bufferlist& r, int& off) -{ - r.copy(off, sizeof(inode.mode), (char*)&inode.mode); - off += sizeof(inode.mode); - r.copy(off, sizeof(inode.uid), (char*)&inode.uid); - off += sizeof(inode.uid); - r.copy(off, sizeof(inode.gid), (char*)&inode.gid); - off += sizeof(inode.gid); - r.copy(off, sizeof(inode.ctime), (char*)&inode.ctime); - off += 
sizeof(inode.ctime); -} - - - -// waiting - -bool CInode::is_frozen() -{ - if (parent && parent->dir->is_frozen()) - return true; - return false; -} - -bool CInode::is_frozen_dir() -{ - if (parent && parent->dir->is_frozen_dir()) - return true; - return false; -} - -bool CInode::is_freezing() -{ - if (parent && parent->dir->is_freezing()) - return true; - return false; -} - -bool CInode::waiting_for(int tag) -{ - return waiting.count(tag) > 0; -} - -void CInode::add_waiter(int tag, Context *c) { - // waiting on hierarchy? - if (tag & CDIR_WAIT_ATFREEZEROOT && (is_freezing() || is_frozen())) { - parent->dir->add_waiter(tag, c); - return; - } - - // this inode. - if (waiting.size() == 0) - get(PIN_WAITER); - waiting.insert(pair(tag,c)); - dout(10) << "add_waiter " << tag << " " << c << " on " << *this << endl; - -} - -void CInode::take_waiting(int mask, list& ls) -{ - if (waiting.empty()) return; - - multimap::iterator it = waiting.begin(); - while (it != waiting.end()) { - if (it->first & mask) { - ls.push_back(it->second); - dout(10) << "take_waiting mask " << mask << " took " << it->second << " tag " << it->first << " on " << *this << endl; - - waiting.erase(it++); - } else { - dout(10) << "take_waiting mask " << mask << " SKIPPING " << it->second << " tag " << it->first << " on " << *this << endl; - it++; - } - } - - if (waiting.empty()) - put(PIN_WAITER); -} - -void CInode::finish_waiting(int mask, int result) -{ - dout(11) << "finish_waiting mask " << mask << " result " << result << " on " << *this << endl; - - list finished; - take_waiting(mask, finished); - finish_contexts(finished, result); -} - - -// auth_pins -bool CInode::can_auth_pin() { - if (parent) - return parent->dir->can_auth_pin(); - return true; -} - -void CInode::auth_pin() { - if (auth_pins == 0) - get(PIN_AUTHPIN); - auth_pins++; - - dout(7) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl; - - if (parent) - parent->dir->adjust_nested_auth_pins( 1 
); -} - -void CInode::auth_unpin() { - auth_pins--; - if (auth_pins == 0) - put(PIN_AUTHPIN); - - dout(7) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl; - - assert(auth_pins >= 0); - - if (parent) - parent->dir->adjust_nested_auth_pins( -1 ); -} - - - -// authority - -int CInode::authority() { - if (is_dangling()) - return dangling_auth; // explicit - - if (is_root()) { // i am root - if (dir) - return dir->get_dir_auth(); // bit of a chicken/egg issue here! - else - return CDIR_AUTH_UNKNOWN; - } - - if (parent) - return parent->dir->dentry_authority( parent->name ); - - return -1; // undefined (inode must not be linked yet!) -} - - -CInodeDiscover* CInode::replicate_to( int rep ) -{ - assert(is_auth()); - - // relax locks? - if (!is_replicated()) - replicate_relax_locks(); - - // return the thinger - int nonce = add_replica( rep ); - return new CInodeDiscover( this, nonce ); -} - - -// debug crap ----------------------------- - -void CInode::dump(int dep) -{ - string ind(dep, '\t'); - //cout << ind << "[inode " << this << "]" << endl; - - if (dir) - dir->dump(dep); -} - diff --git a/tags/20070517_before_mds_merge/mds/CInode.h b/tags/20070517_before_mds_merge/mds/CInode.h deleted file mode 100644 index d2292196a5ebc..0000000000000 --- a/tags/20070517_before_mds_merge/mds/CInode.h +++ /dev/null @@ -1,655 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __CINODE_H -#define __CINODE_H - -#include "config.h" -#include "include/types.h" -#include "include/lru.h" - -#include "mdstypes.h" - -#include "CDentry.h" -#include "Lock.h" -#include "Capability.h" - - -#include -#include -#include -#include -#include -#include -using namespace std; - - -// wait reasons -#define CINODE_WAIT_AUTHPINNABLE CDIR_WAIT_UNFREEZE - // waiters: write_hard_start, read_file_start, write_file_start (mdcache) - // handle_client_chmod, handle_client_touch (mds) - // trigger: (see CDIR_WAIT_UNFREEZE) -#define CINODE_WAIT_GETREPLICA (1<<11) // update/replicate individual inode - // waiters: import_dentry_inode - // trigger: handle_inode_replicate_ack - -#define CINODE_WAIT_DIR (1<<13) - // waiters: traverse_path - // triggers: handle_disocver_reply - -#define CINODE_WAIT_LINK (1<<14) // as in remotely nlink++ -#define CINODE_WAIT_ANCHORED (1<<15) -#define CINODE_WAIT_UNLINK (1<<16) // as in remotely nlink-- - -#define CINODE_WAIT_HARDR (1<<17) // 131072 -#define CINODE_WAIT_HARDW (1<<18) // 262... -#define CINODE_WAIT_HARDB (1<<19) -#define CINODE_WAIT_HARDRWB (CINODE_WAIT_HARDR|CINODE_WAIT_HARDW|CINODE_WAIT_HARDB) -#define CINODE_WAIT_HARDSTABLE (1<<20) -#define CINODE_WAIT_HARDNORD (1<<21) -#define CINODE_WAIT_FILER (1<<22) -#define CINODE_WAIT_FILEW (1<<23) -#define CINODE_WAIT_FILEB (1<<24) -#define CINODE_WAIT_FILERWB (CINODE_WAIT_FILER|CINODE_WAIT_FILEW|CINODE_WAIT_FILEB) -#define CINODE_WAIT_FILESTABLE (1<<25) -#define CINODE_WAIT_FILENORD (1<<26) -#define CINODE_WAIT_FILENOWR (1<<27) - -#define CINODE_WAIT_RENAMEACK (1<<28) -#define CINODE_WAIT_RENAMENOTIFYACK (1<<29) - -#define CINODE_WAIT_CAPS (1<<30) - -#define CINODE_WAIT_ANY 0xffffffff - - - -// misc -#define CINODE_EXPORT_NONCE 1 // nonce given to replicas created by export -#define CINODE_HASHREPLICA_NONCE 1 // hashed inodes that are duped ???FIXME??? 
- -class Context; -class CDentry; -class CDir; -class Message; -class CInode; -class CInodeDiscover; -class MDCache; - - -ostream& operator<<(ostream& out, CInode& in); - - -// cached inode wrapper -class CInode : public MDSCacheObject { - public: - // -- pins -- - static const int PIN_CACHED = 1; - static const int PIN_DIR = 2; - static const int PIN_DIRTY = 4; // must flush - static const int PIN_PROXY = 5; // can't expire yet - static const int PIN_WAITER = 6; // waiter - static const int PIN_CAPS = 7; // local fh's - static const int PIN_AUTHPIN = 8; - static const int PIN_IMPORTING = 9; // multipurpose, for importing - static const int PIN_REQUEST = 10; // request is logging, finishing - static const int PIN_RENAMESRC = 11; // pinned on dest for foreign rename - static const int PIN_ANCHORING = 12; - - static const int PIN_OPENINGDIR = 13; - - static const int PIN_DENTRYLOCK = 14; - - static const char *pin_name(int p) { - switch (p) { - case PIN_CACHED: return "cached"; - case PIN_DIR: return "dir"; - case PIN_DIRTY: return "dirty"; - case PIN_PROXY: return "proxy"; - case PIN_WAITER: return "waiter"; - case PIN_CAPS: return "caps"; - case PIN_AUTHPIN: return "authpin"; - case PIN_IMPORTING: return "importing"; - case PIN_REQUEST: return "request"; - case PIN_RENAMESRC: return "renamesrc"; - case PIN_ANCHORING: return "anchoring"; - case PIN_OPENINGDIR: return "openingdir"; - case PIN_DENTRYLOCK: return "dentrylock"; - default: assert(0); - } - } - - // state - static const int STATE_AUTH = (1<<0); - static const int STATE_ROOT = (1<<1); - static const int STATE_DIRTY = (1<<2); - static const int STATE_UNSAFE = (1<<3); // not logged yet - static const int STATE_DANGLING = (1<<4); // delete me when i expire; i have no dentry - static const int STATE_UNLINKING = (1<<5); - static const int STATE_PROXY = (1<<6); // can't expire yet - static const int STATE_EXPORTING = (1<<7); // on nonauth bystander. 
- static const int STATE_ANCHORING = (1<<8); - static const int STATE_OPENINGDIR = (1<<9); - //static const int STATE_RENAMING = (1<<8); // moving me - //static const int STATE_RENAMINGTO = (1<<9); // rename target (will be unlinked) - - - - - public: - MDCache *mdcache; - - inode_t inode; // the inode itself - - CDir *dir; // directory, if we have it opened. - string symlink; // symlink dest, if symlink - - protected: - // parent dentries in cache - int num_parents; - CDentry *parent; // primary link - set remote_parents; // if hard linked - - // -- distributed caching - int dangling_auth; // explicit auth, when dangling. - - int num_request_pins; - - // waiters - multimap waiting; - - - // -- distributed state -- -public: - // inode metadata locks - CLock hardlock; - CLock filelock; -protected: - // file capabilities - map client_caps; // client -> caps - map mds_caps_wanted; // [auth] mds -> caps wanted - int replica_caps_wanted; // [replica] what i've requested from auth - utime_t replica_caps_wanted_keep_until; - - - private: - // lock nesting - int auth_pins; - int nested_auth_pins; - - public: - meta_load_t popularity[MDS_NPOP]; - - // friends - friend class Server; - friend class Locker; - friend class Migrator; - friend class MDCache; - friend class CDir; - friend class CInodeExport; - friend class CInodeDiscover; - - public: - // --------------------------- - CInode(MDCache *c, bool auth=true); - ~CInode(); - - - // -- accessors -- - bool is_file() { return ((inode.mode & INODE_TYPE_MASK) == INODE_MODE_FILE) ? true:false; } - bool is_symlink() { return ((inode.mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK) ? true:false; } - bool is_dir() { return ((inode.mode & INODE_TYPE_MASK) == INODE_MODE_DIR) ? 
true:false; } - - bool is_anchored() { return inode.anchored; } - - bool is_root() { return state & STATE_ROOT; } - bool is_proxy() { return state & STATE_PROXY; } - - bool is_auth() { return state & STATE_AUTH; } - void set_auth(bool auth); - - inodeno_t ino() { return inode.ino; } - inode_t& get_inode() { return inode; } - CDentry* get_parent_dn() { return parent; } - CDir *get_parent_dir(); - CInode *get_parent_inode(); - CInode *get_realm_root(); // import, hash, or root - - CDir *get_or_open_dir(MDCache *mdcache); - CDir *set_dir(CDir *newdir); - void close_dir(); - - bool dir_is_auth(); - - - - // -- misc -- - void make_path(string& s); - void make_anchor_trace(vector& trace); - - - - // -- state -- - bool is_unsafe() { return state & STATE_UNSAFE; } - bool is_dangling() { return state & STATE_DANGLING; } - bool is_unlinking() { return state & STATE_UNLINKING; } - - void mark_unsafe() { state |= STATE_UNSAFE; } - void mark_safe() { state &= ~STATE_UNSAFE; } - - // -- state encoding -- - //void encode_basic_state(bufferlist& r); - //void decode_basic_state(bufferlist& r, int& off); - - - void encode_file_state(bufferlist& r); - void decode_file_state(bufferlist& r, int& off); - - void encode_hard_state(bufferlist& r); - void decode_hard_state(bufferlist& r, int& off); - - - // -- dirtyness -- - version_t get_version() { return inode.version; } - - bool is_dirty() { return state & STATE_DIRTY; } - bool is_clean() { return !is_dirty(); } - - version_t pre_dirty(); - void _mark_dirty(); - void mark_dirty(version_t projected_dirv); - void mark_clean(); - - - - - CInodeDiscover* replicate_to(int rep); - - - // -- waiting -- - bool waiting_for(int tag); - void add_waiter(int tag, Context *c); - void take_waiting(int tag, list& ls); - void finish_waiting(int mask, int result = 0); - - - bool is_hardlock_write_wanted() { - return waiting_for(CINODE_WAIT_HARDW); - } - bool is_filelock_write_wanted() { - return waiting_for(CINODE_WAIT_FILEW); - } - - // -- caps -- (new) 
- // client caps - map& get_client_caps() { return client_caps; } - void add_client_cap(int client, Capability& cap) { - if (client_caps.empty()) - get(PIN_CAPS); - assert(client_caps.count(client) == 0); - client_caps[client] = cap; - } - void remove_client_cap(int client) { - assert(client_caps.count(client) == 1); - client_caps.erase(client); - if (client_caps.empty()) - put(PIN_CAPS); - } - Capability* get_client_cap(int client) { - if (client_caps.count(client)) - return &client_caps[client]; - return 0; - } - /* - void set_client_caps(map& cl) { - if (client_caps.empty() && !cl.empty()) - get(PIN_CAPS); - client_caps.clear(); - client_caps = cl; - } - */ - void take_client_caps(map& cl) { - if (!client_caps.empty()) - put(PIN_CAPS); - cl = client_caps; - client_caps.clear(); - } - void merge_client_caps(map& cl, set& new_client_caps) { - if (client_caps.empty() && !cl.empty()) - get(PIN_CAPS); - for (map::iterator it = cl.begin(); - it != cl.end(); - it++) { - new_client_caps.insert(it->first); - if (client_caps.count(it->first)) { - // merge - client_caps[it->first].merge(it->second); - } else { - // new - client_caps[it->first] = it->second; - } - } - } - - // caps issued, wanted - int get_caps_issued() { - int c = 0; - for (map::iterator it = client_caps.begin(); - it != client_caps.end(); - it++) - c |= it->second.issued(); - return c; - } - int get_caps_wanted() { - int w = 0; - for (map::iterator it = client_caps.begin(); - it != client_caps.end(); - it++) { - w |= it->second.wanted(); - //cout << " get_caps_wanted client " << it->first << " " << cap_string(it->second.wanted()) << endl; - } - if (is_auth()) - for (map::iterator it = mds_caps_wanted.begin(); - it != mds_caps_wanted.end(); - it++) { - w |= it->second; - //cout << " get_caps_wanted mds " << it->first << " " << cap_string(it->second) << endl; - } - return w; - } - - - void replicate_relax_locks() { - assert(is_auth()); - assert(!is_replicated()); - dout(10) << " relaxing locks on " << *this 
<< endl; - - if (hardlock.get_state() == LOCK_LOCK && - !hardlock.is_used()) { - dout(10) << " hard now sync " << *this << endl; - hardlock.set_state(LOCK_SYNC); - } - if (filelock.get_state() == LOCK_LOCK) { - if (!filelock.is_used() && - (get_caps_issued() & CAP_FILE_WR) == 0) { - filelock.set_state(LOCK_SYNC); - dout(10) << " file now sync " << *this << endl; - } else { - dout(10) << " can't relax filelock on " << *this << endl; - } - } - } - - - // -- authority -- - int authority(); - - - // -- auth pins -- - int is_auth_pinned() { - return auth_pins; - } - int adjust_nested_auth_pins(int a); - bool can_auth_pin(); - void auth_pin(); - void auth_unpin(); - - - // -- freeze -- - bool is_frozen(); - bool is_frozen_dir(); - bool is_freezing(); - - - // -- reference counting -- - - /* these can be pinned any # of times, and are - linked to an active_request, so they're automatically cleaned - up when a request is finished. pin at will! */ - void request_pin_get() { - if (num_request_pins == 0) get(PIN_REQUEST); - num_request_pins++; - } - void request_pin_put() { - num_request_pins--; - if (num_request_pins == 0) put(PIN_REQUEST); - assert(num_request_pins >= 0); - } - - void bad_put(int by) { - dout(7) << " bad put " << *this << " by " << by << " " << pin_name(by) << " was " << ref << " (" << ref_set << ")" << endl; - assert(ref_set.count(by) == 1); - assert(ref > 0); - } - void bad_get(int by) { - dout(7) << " bad get " << *this << " by " << by << " " << pin_name(by) << " was " << ref << " (" << ref_set << ")" << endl; - assert(ref_set.count(by) == 0); - } - void first_get(); - void last_put(); - - - // -- hierarchy stuff -- -private: - void get_parent(); - void put_parent(); - -public: - void set_primary_parent(CDentry *p) { - assert(parent == 0); - parent = p; - get_parent(); - } - void remove_primary_parent(CDentry *dn) { - assert(dn == parent); - parent = 0; - put_parent(); - } - void add_remote_parent(CDentry *p) { - if (remote_parents.empty()) - 
get_parent(); - remote_parents.insert(p); - } - void remove_remote_parent(CDentry *p) { - remote_parents.erase(p); - if (remote_parents.empty()) - put_parent(); - } - int num_remote_parents() { - return remote_parents.size(); - } - - - /* - // for giving to clients - void get_dist_spec(set& ls, int auth, timepair_t& now) { - if (( is_dir() && popularity[MDS_POP_CURDOM].get(now) > g_conf.mds_bal_replicate_threshold) || - (!is_dir() && popularity[MDS_POP_JUSTME].get(now) > g_conf.mds_bal_replicate_threshold)) { - //if (!cached_by.empty() && inode.ino > 1) dout(1) << "distributed spec for " << *this << endl; - ls = cached_by; - } - } - */ - - // dbg - void dump(int d = 0); -}; - - - - -// -- encoded state - -// discover - -class CInodeDiscover { - - inode_t inode; - int replica_nonce; - - int hardlock_state; - int filelock_state; - - public: - CInodeDiscover() {} - CInodeDiscover(CInode *in, int nonce) { - inode = in->inode; - replica_nonce = nonce; - - hardlock_state = in->hardlock.get_replica_state(); - filelock_state = in->filelock.get_replica_state(); - } - - inodeno_t get_ino() { return inode.ino; } - int get_replica_nonce() { return replica_nonce; } - - void update_inode(CInode *in) { - in->inode = inode; - - in->replica_nonce = replica_nonce; - in->hardlock.set_state(hardlock_state); - in->filelock.set_state(filelock_state); - } - - void _encode(bufferlist& bl) { - bl.append((char*)&inode, sizeof(inode)); - bl.append((char*)&replica_nonce, sizeof(replica_nonce)); - bl.append((char*)&hardlock_state, sizeof(hardlock_state)); - bl.append((char*)&filelock_state, sizeof(filelock_state)); - } - - void _decode(bufferlist& bl, int& off) { - bl.copy(off,sizeof(inode_t), (char*)&inode); - off += sizeof(inode_t); - bl.copy(off, sizeof(int), (char*)&replica_nonce); - off += sizeof(int); - bl.copy(off, sizeof(hardlock_state), (char*)&hardlock_state); - off += sizeof(hardlock_state); - bl.copy(off, sizeof(filelock_state), (char*)&filelock_state); - off += 
sizeof(filelock_state); - } - -}; - - -// export - -class CInodeExport { - - struct { - inode_t inode; - meta_load_t popularity_justme; - meta_load_t popularity_curdom; - bool is_dirty; // dirty inode? - - int num_caps; - } st; - - map replicas; - map cap_map; - - CLock hardlock,filelock; - //int remaining_issued; - -public: - CInodeExport() {} - CInodeExport(CInode *in) { - st.inode = in->inode; - st.is_dirty = in->is_dirty(); - replicas = in->replicas; - - hardlock = in->hardlock; - filelock = in->filelock; - - st.popularity_justme.take( in->popularity[MDS_POP_JUSTME] ); - st.popularity_curdom.take( in->popularity[MDS_POP_CURDOM] ); - in->popularity[MDS_POP_ANYDOM] -= st.popularity_curdom; - in->popularity[MDS_POP_NESTED] -= st.popularity_curdom; - - // steal WRITER caps from inode - in->take_client_caps(cap_map); - //remaining_issued = in->get_caps_issued(); - } - ~CInodeExport() { - } - - inodeno_t get_ino() { return st.inode.ino; } - - void update_inode(CInode *in, set& new_client_caps) { - in->inode = st.inode; - - in->popularity[MDS_POP_JUSTME] += st.popularity_justme; - in->popularity[MDS_POP_CURDOM] += st.popularity_curdom; - in->popularity[MDS_POP_ANYDOM] += st.popularity_curdom; - in->popularity[MDS_POP_NESTED] += st.popularity_curdom; - - if (st.is_dirty) - in->_mark_dirty(); - - in->replicas = replicas; - if (!replicas.empty()) - in->get(CInode::PIN_CACHED); - - in->hardlock = hardlock; - in->filelock = filelock; - - // caps - in->merge_client_caps(cap_map, new_client_caps); - } - - void _encode(bufferlist& bl) { - st.num_caps = cap_map.size(); - bl.append((char*)&st, sizeof(st)); - - // cached_by + nonce - ::_encode(replicas, bl); - - hardlock.encode_state(bl); - filelock.encode_state(bl); - - // caps - for (map::iterator it = cap_map.begin(); - it != cap_map.end(); - it++) { - bl.append((char*)&it->first, sizeof(it->first)); - it->second._encode(bl); - } - } - - int _decode(bufferlist& bl, int off = 0) { - bl.copy(off, sizeof(st), (char*)&st); - off 
+= sizeof(st); - - ::_decode(replicas, bl, off); - - hardlock.decode_state(bl, off); - filelock.decode_state(bl, off); - - // caps - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __CAPABILITY_H -#define __CAPABILITY_H - -#include "include/buffer.h" - -#include -using namespace std; - -#include "config.h" - - -// definite caps -#define CAP_FILE_RDCACHE 1 // client can safely cache reads -#define CAP_FILE_RD 2 // client can read -#define CAP_FILE_WR 4 // client can write -#define CAP_FILE_WREXTEND 8 // client can extend file -#define CAP_FILE_WRBUFFER 16 // client can safely buffer writes -#define CAP_FILE_LAZYIO 32 // client can perform lazy io - - -// heuristics -//#define CAP_FILE_DELAYFLUSH 32 - -inline string cap_string(int cap) -{ - string s; - s = "["; - if (cap & CAP_FILE_RDCACHE) s += " rdcache"; - if (cap & CAP_FILE_RD) s += " rd"; - if (cap & CAP_FILE_WR) s += " wr"; - if (cap & CAP_FILE_WRBUFFER) s += " wrbuffer"; - if (cap & CAP_FILE_WRBUFFER) s += " wrextend"; - if (cap & CAP_FILE_LAZYIO) s += " lazyio"; - s += " ]"; - return s; -} - - -class Capability { - int wanted_caps; // what the client wants (ideally) - - map cap_history; // seq -> cap - long last_sent, last_recv; - - bool suppress; - -public: - Capability(int want=0) : - wanted_caps(want), - last_sent(0), - last_recv(0), - suppress(false) { - //cap_history[last_sent] = 0; - } - - - bool is_suppress() { return suppress; } - void set_suppress(bool b) { suppress = b; } - - bool is_null() { return cap_history.empty() && wanted_caps == 0; } - - // most recently issued caps. 
- int pending() { - if (cap_history.count(last_sent)) - return cap_history[ last_sent ]; - return 0; - } - - // caps client has confirmed receipt of - int confirmed() { - if (cap_history.count(last_recv)) - return cap_history[ last_recv ]; - return 0; - } - - // caps potentially issued - int issued() { - int c = 0; - for (long seq = last_recv; seq <= last_sent; seq++) { - if (cap_history.count(seq)) { - c |= cap_history[seq]; - dout(10) << " cap issued: seq " << seq << " " << cap_string(cap_history[seq]) << " -> " << cap_string(c) << endl; - } - } - return c; - } - - // caps this client wants to hold - int wanted() { return wanted_caps; } - void set_wanted(int w) { - wanted_caps = w; - } - - // needed - static int needed(int from) { - // strip out wrbuffer, rdcache - return from & (CAP_FILE_WR|CAP_FILE_RD); - } - int needed() { return needed(wanted_caps); } - - // conflicts - static int conflicts(int from) { - int c = 0; - if (from & CAP_FILE_WRBUFFER) c |= CAP_FILE_RDCACHE|CAP_FILE_RD; - if (from & CAP_FILE_WR) c |= CAP_FILE_RDCACHE; - if (from & CAP_FILE_RD) c |= CAP_FILE_WRBUFFER; - if (from & CAP_FILE_RDCACHE) c |= CAP_FILE_WRBUFFER|CAP_FILE_WR; - return c; - } - int wanted_conflicts() { return conflicts(wanted()); } - int needed_conflicts() { return conflicts(needed()); } - int issued_conflicts() { return conflicts(issued()); } - - // issue caps; return seq number. - long issue(int c) { - //int was = pending(); - //no! if (c == was && last_sent) return -1; // repeat of previous? - - ++last_sent; - cap_history[last_sent] = c; - - /* no! - // not recalling, just adding? 
- if (c & ~was && - cap_history.count(last_sent-1)) { - cap_history.erase(last_sent-1); - } - */ - return last_sent; - } - long get_last_seq() { return last_sent; } - - void merge(Capability& other) { - // issued + pending - int newpending = other.pending() | pending(); - if (other.issued() & ~newpending) - issue(other.issued() | newpending); - issue(newpending); - - // wanted - wanted_caps = wanted_caps | other.wanted(); - } - - // confirm receipt of a previous sent/issued seq. - int confirm_receipt(long seq, int caps) { - int r = 0; - - // old seqs - while (last_recv < seq) { - dout(10) << " cap.confirm_receipt forgetting seq " << last_recv << " " << cap_string(cap_history[last_recv]) << endl; - r |= cap_history[last_recv]; - cap_history.erase(last_recv); - ++last_recv; - } - - // release current? - if (cap_history.count(seq) && - cap_history[seq] != caps) { - dout(10) << " cap.confirm_receipt revising seq " << seq << " " << cap_string(cap_history[seq]) << " -> " << cap_string(caps) << endl; - // note what we're releasing.. - assert(cap_history[seq] & ~caps); - r |= cap_history[seq] & ~caps; - - cap_history[seq] = caps; // confirmed() now less than before.. - } - - // null? - if (caps == 0 && - cap_history.size() == 1 && - cap_history.count(seq)) { - cap_history.clear(); // viola, null! 
- } - - return r; - } - - // serializers - void _encode(bufferlist& bl) { - bl.append((char*)&wanted_caps, sizeof(wanted_caps)); - bl.append((char*)&last_sent, sizeof(last_sent)); - bl.append((char*)&last_recv, sizeof(last_recv)); - ::_encode(cap_history, bl); - } - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(wanted_caps), (char*)&wanted_caps); - off += sizeof(wanted_caps); - bl.copy(off, sizeof(last_sent), (char*)&last_sent); - off += sizeof(last_sent); - bl.copy(off, sizeof(last_recv), (char*)&last_recv); - off += sizeof(last_recv); - ::_decode(cap_history, bl, off); - } - -}; - - - - - -#endif diff --git a/tags/20070517_before_mds_merge/mds/ClientMap.h b/tags/20070517_before_mds_merge/mds/ClientMap.h deleted file mode 100644 index 7cd1e496debdd..0000000000000 --- a/tags/20070517_before_mds_merge/mds/ClientMap.h +++ /dev/null @@ -1,85 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __CLIENTMAP_H -#define __CLIENTMAP_H - -#include "msg/Message.h" - -#include -using namespace std; - -#include -using namespace __gnu_cxx; - - -/* - * this structure is used by the MDS purely so that - * it can remember client addresses (entity_inst_t) - * while processing request(s) on behalf of clients. - * as such it's only really a sort of short-term cache. - * - * it also remembers which clients mounted via this MDS, - * for the same reason (so that mounted clients can be - * contacted if necessary). 
- */ -class ClientMap { - hash_map client_inst; - set client_mount; - hash_map client_ref; - - void inc_ref(int client, const entity_inst_t& inst) { - if (client_inst.count(client)) { - assert(client_inst[client] == inst); - assert(client_ref.count(client)); - } else { - client_inst[client] = inst; - } - client_ref[client]++; - } - void dec_ref(int client) { - assert(client_ref.count(client)); - assert(client_ref[client] > 0); - client_ref[client]--; - if (client_ref[client] == 0) { - client_ref.erase(client); - client_inst.erase(client); - } - } - -public: - const entity_inst_t& get_inst(int client) { - assert(client_inst.count(client)); - return client_inst[client]; - } - const set& get_mount_set() { return client_mount; } - - void add_mount(int client, const entity_inst_t& inst) { - inc_ref(client, inst); - client_mount.insert(client); - } - void rem_mount(int client) { - dec_ref(client); - client_mount.erase(client); - } - - - void add_open(int client, const entity_inst_t& inst) { - inc_ref(client, inst); - } - void dec_open(int client) { - dec_ref(client); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/mds/IdAllocator.cc b/tags/20070517_before_mds_merge/mds/IdAllocator.cc deleted file mode 100644 index 671bd70a77c27..0000000000000 --- a/tags/20070517_before_mds_merge/mds/IdAllocator.cc +++ /dev/null @@ -1,200 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#define DBLEVEL 20 - -#include "IdAllocator.h" -#include "MDS.h" -#include "MDLog.h" -#include "events/EAlloc.h" - -#include "osdc/Filer.h" - -#include "include/types.h" - -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".idalloc: " - - -void IdAllocator::init_inode() -{ - memset(&inode, 0, sizeof(inode)); - inode.ino = MDS_INO_IDS_OFFSET + mds->get_nodeid(); - inode.layout = g_OSD_FileLayout; -} - - -idno_t IdAllocator::alloc_id(bool replay) -{ - assert(is_active()); - - // pick one - idno_t id = free.start(); - free.erase(id); - dout(10) << "idalloc " << this << ": alloc id " << id << endl; - - version++; - - // log it - if (!replay) - mds->mdlog->submit_entry(new EAlloc(IDTYPE_INO, id, EALLOC_EV_ALLOC, version)); - - return id; -} - -void IdAllocator::reclaim_id(idno_t id, bool replay) -{ - assert(is_active()); - - dout(10) << "idalloc " << this << ": reclaim id " << id << endl; - free.insert(id); - - version++; - - if (!replay) - mds->mdlog->submit_entry(new EAlloc(IDTYPE_INO, id, EALLOC_EV_FREE, version)); -} - - - -class C_ID_Save : public Context { - IdAllocator *ida; - version_t version; -public: - C_ID_Save(IdAllocator *i, version_t v) : ida(i), version(v) {} - void finish(int r) { - ida->save_2(version); - } -}; - -void IdAllocator::save(Context *onfinish, version_t v) -{ - if (v > 0 && v <= committing_version) { - dout(10) << "save v " << version << " - already saving " - << committing_version << " >= needed " << v << endl; - waitfor_save[v].push_back(onfinish); - return; - } - - dout(10) << "save v " << version << endl; - assert(is_active()); - - bufferlist bl; - - bl.append((char*)&version, sizeof(version)); - ::_encode(free.m, bl); - - committing_version = version; - - if (onfinish) - waitfor_save[version].push_back(onfinish); - - // write (async) - mds->filer->write(inode, - 0, bl.length(), bl, - 0, - 0, new C_ID_Save(this, version)); -} - -void 
IdAllocator::save_2(version_t v) -{ - dout(10) << "save_2 v " << v << endl; - - committed_version = v; - - list ls; - while (!waitfor_save.empty()) { - if (waitfor_save.begin()->first > v) break; - ls.splice(ls.end(), waitfor_save.begin()->second); - waitfor_save.erase(waitfor_save.begin()); - } - finish_contexts(ls,0); -} - - -void IdAllocator::reset() -{ - init_inode(); - - free.clear(); - - // use generic range FIXME THIS IS CRAP - free.insert((long long)0x1000000 * (long long)(mds->get_nodeid()+1), - (long long)0x1000000 * (long long)(mds->get_nodeid()+2) - 1LL); - //free[ID_INO].dump(); - - //free[ID_FH].map_insert(10000000LL * (mds->get_nodeid()+1), - //10000000LL * (mds->get_nodeid()+2) - 1); - //free[ID_FH].dump(); - - state = STATE_ACTIVE; -} - - - -// ----------------------- - -class C_ID_Load : public Context { -public: - IdAllocator *ida; - Context *onfinish; - bufferlist bl; - C_ID_Load(IdAllocator *i, Context *o) : ida(i), onfinish(o) {} - void finish(int r) { - ida->load_2(r, bl, onfinish); - } -}; - -void IdAllocator::load(Context *onfinish) -{ - dout(10) << "load" << endl; - - init_inode(); - - assert(is_undef()); - state = STATE_OPENING; - - C_ID_Load *c = new C_ID_Load(this, onfinish); - mds->filer->read(inode, - 0, inode.layout.stripe_size, - &c->bl, - c); -} - -void IdAllocator::load_2(int r, bufferlist& bl, Context *onfinish) -{ - assert(is_opening()); - state = STATE_ACTIVE; - - if (r > 0) { - dout(10) << "load_2 got " << bl.length() << " bytes" << endl; - int off = 0; - bl.copy(off, sizeof(version), (char*)&version); - off += sizeof(version); - ::_decode(free.m, bl, off); - committed_version = version; - } - else { - dout(10) << "load_2 found no alloc file" << endl; - assert(0); // this shouldn't happen if mkfs finished. 
- reset(); - } - - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } -} diff --git a/tags/20070517_before_mds_merge/mds/IdAllocator.h b/tags/20070517_before_mds_merge/mds/IdAllocator.h deleted file mode 100644 index c79266d3e71b6..0000000000000 --- a/tags/20070517_before_mds_merge/mds/IdAllocator.h +++ /dev/null @@ -1,79 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __IDALLOCATOR_H -#define __IDALLOCATOR_H - -#include "include/types.h" -#include "include/interval_set.h" -#include "include/buffer.h" -#include "include/Context.h" - -class MDS; - -#define IDTYPE_INO 1 -typedef inodeno_t idno_t; - -class IdAllocator { - MDS *mds; - inode_t inode; - - static const int STATE_UNDEF = 0; - static const int STATE_OPENING = 1; - static const int STATE_ACTIVE = 2; - //static const int STATE_COMMITTING = 3; - int state; - - version_t version, committing_version, committed_version; - - interval_set free; // unused ids - - map > waitfor_save; - - public: - IdAllocator(MDS *m) : - mds(m), - state(STATE_UNDEF), - version(0), committing_version(0), committed_version(0) - { - } - - void init_inode(); - - // alloc or reclaim ids - idno_t alloc_id(bool replay=false); - void reclaim_id(idno_t id, bool replay=false); - - version_t get_version() { return version; } - version_t get_committed_version() { return committed_version; } - - // load/save from disk (hack) - bool is_undef() { return state == STATE_UNDEF; } - bool is_active() { return state == STATE_ACTIVE; } - bool is_opening() { return state == STATE_OPENING; } - - void reset(); - void save(Context *onfinish=0, version_t need=0); - void save_2(version_t v); - - 
void shutdown() { - if (is_active()) save(0); - } - - void load(Context *onfinish); - void load_2(int, bufferlist&, Context *onfinish); - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/mds/Lock.h b/tags/20070517_before_mds_merge/mds/Lock.h deleted file mode 100644 index 59d04d5b66eb7..0000000000000 --- a/tags/20070517_before_mds_merge/mds/Lock.h +++ /dev/null @@ -1,321 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __LOCK_H -#define __LOCK_H - -#include -#include -using namespace std; - -#include "include/buffer.h" - -#include "Capability.h" - -// states and such. -// C = cache reads, R = read, W = write, A = append, B = buffer writes, L = lazyio - -// basic lock -----auth-------- ---replica------- -#define LOCK_SYNC 0 // AR R . / C R . . . L R . / C R . . . L stat() -#define LOCK_LOCK 1 // AR R W / C . . . . . . . / C . . . . . truncate() -#define LOCK_GLOCKR 2 // AR R . / C . . . . . . . / C . . . . . - -// file lock states -#define LOCK_GLOCKL 3 // A . . / C . . . . . loner -> lock -#define LOCK_GLOCKM 4 // A . . / . . . . . . -#define LOCK_MIXED 5 // AR . . / . R W A . L . . / . R . . . L -#define LOCK_GMIXEDR 6 // AR R . / . R . . . L . . / . R . . . L -#define LOCK_GMIXEDL 7 // A . . / . . . . . L loner -> mixed - -#define LOCK_LONER 8 // A . . / C R W A B L (lock) -#define LOCK_GLONERR 9 // A . . / . R . . . L -#define LOCK_GLONERM 10 // A . . / . R W A . L - -#define LOCK_GSYNCL 11 // A . . / C ? . . . L loner -> sync (*) FIXME: let old loner keep R, somehow... -#define LOCK_GSYNCM 12 // A . . / . R . . . 
L - -// 4 stable -// +9 transition -// 13 total - -/* no append scenarios: - -loner + truncate(): - - loner needs to lose A (?unless it's the loner doing the truncate?) -loner + statlite(size): - - loner needs to lose A - -any + statlite(size) - - all lose A - -any + statlite(mtime) - - all lose W - --> we need to add lonerfixed and mixedfixed states (and associated transitions) - in order to efficiently support statlite(size) and truncate(). until then, - we have to LOCK. - - */ - -// -- lock... hard or file - -class Message; - -class CLock { - protected: - // lock state - char state; - set gather_set; // auth - - // local state - int nread; - Message *wrlock_by; - - - public: - CLock() : - state(LOCK_SYNC), - nread(0), - wrlock_by(0) { - } - - // encode/decode - void encode_state(bufferlist& bl) { - bl.append((char*)&state, sizeof(state)); - _encode(gather_set, bl); - - //bl.append((char*)&nread, sizeof(nread)); - //bl.append((char*)&nwrite, sizeof(nwrite)); - } - void decode_state(bufferlist& bl, int& off) { - bl.copy(off, sizeof(state), (char*)&state); - off += sizeof(state); - _decode(gather_set, bl, off); - - //bl.copy(off, sizeof(nread), (char*)&nread); - //off += sizeof(nread); - //bl.copy(off, sizeof(nwrite), (char*)&nwrite); - //off += sizeof(nwrite); - } - - char get_state() { return state; } - char set_state(char s) { - state = s; - assert(!is_stable() || gather_set.size() == 0); // gather should be empty in stable states. - return s; - }; - - char get_replica_state() { - switch (state) { - case LOCK_LOCK: - case LOCK_GLOCKM: - case LOCK_GLOCKL: - case LOCK_GLOCKR: - case LOCK_LONER: - case LOCK_GLONERR: - case LOCK_GLONERM: - return LOCK_LOCK; - case LOCK_MIXED: - case LOCK_GMIXEDR: - return LOCK_MIXED; - case LOCK_SYNC: - return LOCK_SYNC; - - // after gather auth will bc LOCK_AC_MIXED or whatever - case LOCK_GSYNCM: - return LOCK_MIXED; - case LOCK_GSYNCL: - case LOCK_GMIXEDL: // ** LOCK isn't exact right state, but works. 
- return LOCK_LOCK; - - default: - assert(0); - } - return 0; - } - - // gather set - set& get_gather_set() { return gather_set; } - void init_gather(const map& i) { - for (map::const_iterator p = i.begin(); p != i.end(); ++p) - gather_set.insert(p->first); - } - bool is_gathering(int i) { - return gather_set.count(i); - } - void clear_gather() { - gather_set.clear(); - } - - // ref counting - int get_read() { return ++nread; } - int put_read() { - assert(nread>0); - return --nread; - } - int get_nread() { return nread; } - - void get_write(Message *who) { - assert(wrlock_by == 0); - wrlock_by = who; - } - void put_write() { - assert(wrlock_by); - wrlock_by = 0; - } - bool is_wrlocked() { return wrlock_by ? true:false; } - Message *get_wrlocked_by() { return wrlock_by; } - bool is_used() { - return (is_wrlocked() || (nread>0)) ? true:false; - } - - - // stable - bool is_stable() { - return (state == LOCK_SYNC) || - (state == LOCK_LOCK) || - (state == LOCK_MIXED) || - (state == LOCK_LONER); - } - - // read/write access - bool can_read(bool auth) { - if (auth) - return (state == LOCK_SYNC) || (state == LOCK_GMIXEDR) - || (state == LOCK_GLOCKR) || (state == LOCK_LOCK); - else - return (state == LOCK_SYNC); - } - bool can_read_soon(bool auth) { - if (auth) - return (state == LOCK_GLOCKL); - else - return false; - } - - bool can_write(bool auth) { - if (auth) - return (state == LOCK_LOCK) && !is_wrlocked(); - else - return false; - } - bool can_write_soon(bool auth) { - if (auth) - return (state == LOCK_GLOCKR) || (state == LOCK_GLOCKL) - || (state == LOCK_GLOCKM); - else - return false; - } - - // client caps allowed - int caps_allowed_ever(bool auth) { - if (auth) - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_WRBUFFER | CAP_FILE_LAZYIO; - else - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO; - } - int caps_allowed(bool auth) { - if (auth) - switch (state) { - case LOCK_SYNC: - return CAP_FILE_RDCACHE | CAP_FILE_RD | 
CAP_FILE_LAZYIO; - case LOCK_LOCK: - case LOCK_GLOCKR: - case LOCK_GLOCKL: - return CAP_FILE_RDCACHE; - - case LOCK_GLOCKM: - return 0; - - case LOCK_MIXED: - return CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_LAZYIO; - case LOCK_GMIXEDR: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - case LOCK_GMIXEDL: - return 0; - - case LOCK_LONER: // single client writer, of course. - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_WRBUFFER | CAP_FILE_LAZYIO; - case LOCK_GLONERR: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - case LOCK_GLONERM: - return CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_LAZYIO; - - case LOCK_GSYNCL: - return CAP_FILE_RDCACHE | CAP_FILE_LAZYIO; - case LOCK_GSYNCM: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - } - else - switch (state) { - case LOCK_SYNC: - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO; - case LOCK_LOCK: - case LOCK_GLOCKR: - return CAP_FILE_RDCACHE; - case LOCK_GMIXEDR: - case LOCK_MIXED: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - } - assert(0); - return 0; - } - - friend class MDCache; - friend class Locker; - friend class Migrator; -}; - -//ostream& operator<<(ostream& out, CLock& l); -inline ostream& operator<<(ostream& out, CLock& l) -{ - static char* __lock_states[] = { - "sync", - "lock", - "glockr", - "glockl", - "glockm", - "mixed", - "gmixedr", - "gmixedl", - "loner", - "glonerr", - "glonerm", - "gsyncl", - "gsyncm" - }; - - out << "(" << __lock_states[(int)l.get_state()]; - - if (!l.get_gather_set().empty()) out << " g=" << l.get_gather_set(); - - if (l.get_nread()) - out << " r=" << l.get_nread(); - if (l.is_wrlocked()) - out << " w=" << l.get_wrlocked_by(); - - // rw? 
- /* - out << " "; - if (l.can_read(true)) out << "r[" << l.get_nread() << "]"; - if (l.can_write(true)) out << "w[" << l.get_nwrite() << "]"; - out << "/"; - if (l.can_read(false)) out << "r[" << l.get_nread() << "]"; - if (l.can_write(false)) out << "w[" << l.get_nwrite() << "]"; - */ - out << ")"; - return out; -} - -#endif diff --git a/tags/20070517_before_mds_merge/mds/Locker.cc b/tags/20070517_before_mds_merge/mds/Locker.cc deleted file mode 100644 index f1ada7ea26913..0000000000000 --- a/tags/20070517_before_mds_merge/mds/Locker.cc +++ /dev/null @@ -1,2246 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "MDS.h" -#include "MDCache.h" -#include "Locker.h" -#include "Server.h" -#include "CInode.h" -#include "CDir.h" -#include "CDentry.h" -#include "Migrator.h" - -#include "MDBalancer.h" -#include "MDLog.h" -#include "MDSMap.h" - -#include "include/filepath.h" - -#include "events/EString.h" -#include "events/EUpdate.h" -#include "events/EUnlink.h" - -#include "msg/Messenger.h" - -#include "messages/MGenericMessage.h" -#include "messages/MDiscover.h" -#include "messages/MDiscoverReply.h" - -#include "messages/MDirUpdate.h" - -#include "messages/MInodeFileCaps.h" - -#include "messages/MInodeLink.h" -#include "messages/MInodeLinkAck.h" -#include "messages/MInodeUnlink.h" -#include "messages/MInodeUnlinkAck.h" - -#include "messages/MLock.h" -#include "messages/MDentryUnlink.h" - -#include "messages/MClientRequest.h" -#include "messages/MClientFileCaps.h" - -#include -#include - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << 
mds->get_nodeid() << ".locker " - - - -void Locker::dispatch(Message *m) -{ - switch (m->get_type()) { - - // locking - case MSG_MDS_LOCK: - handle_lock((MLock*)m); - break; - - // cache fun - case MSG_MDS_INODEFILECAPS: - handle_inode_file_caps((MInodeFileCaps*)m); - break; - - case MSG_CLIENT_FILECAPS: - handle_client_file_caps((MClientFileCaps*)m); - break; - - - - default: - assert(0); - } -} - - -void Locker::send_lock_message(CInode *in, int msg, int type) -{ - for (map::iterator it = in->replicas_begin(); - it != in->replicas_end(); - it++) { - MLock *m = new MLock(msg, mds->get_nodeid()); - m->set_ino(in->ino(), type); - mds->send_message_mds(m, it->first, MDS_PORT_LOCKER); - } -} - - -void Locker::send_lock_message(CInode *in, int msg, int type, bufferlist& data) -{ - for (map::iterator it = in->replicas_begin(); - it != in->replicas_end(); - it++) { - MLock *m = new MLock(msg, mds->get_nodeid()); - m->set_ino(in->ino(), type); - m->set_data(data); - mds->send_message_mds(m, it->first, MDS_PORT_LOCKER); - } -} - -void Locker::send_lock_message(CDentry *dn, int msg) -{ - for (map::iterator it = dn->replicas_begin(); - it != dn->replicas_end(); - it++) { - MLock *m = new MLock(msg, mds->get_nodeid()); - m->set_dn(dn->dir->ino(), dn->name); - mds->send_message_mds(m, it->first, MDS_PORT_LOCKER); - } -} - - - -// file i/o ----------------------------------------- - -__uint64_t Locker::issue_file_data_version(CInode *in) -{ - dout(7) << "issue_file_data_version on " << *in << endl; - return in->inode.file_data_version; -} - - -Capability* Locker::issue_new_caps(CInode *in, - int mode, - MClientRequest *req) -{ - dout(7) << "issue_new_caps for mode " << mode << " on " << *in << endl; - - // my needs - int my_client = req->get_client(); - int my_want = 0; - if (mode & FILE_MODE_R) my_want |= CAP_FILE_RDCACHE | CAP_FILE_RD; - if (mode & FILE_MODE_W) my_want |= CAP_FILE_WRBUFFER | CAP_FILE_WR; - - // register a capability - Capability *cap = 
in->get_client_cap(my_client); - if (!cap) { - // new cap - Capability c(my_want); - in->add_client_cap(my_client, c); - cap = in->get_client_cap(my_client); - - // note client addr - mds->clientmap.add_open(my_client, req->get_client_inst()); - - } else { - // make sure it has sufficient caps - if (cap->wanted() & ~my_want) { - // augment wanted caps for this client - cap->set_wanted( cap->wanted() | my_want ); - } - } - - // suppress file cap messages for this guy for a few moments (we'll bundle with the open() reply) - cap->set_suppress(true); - int before = cap->pending(); - - if (in->is_auth()) { - // [auth] twiddle mode? - inode_file_eval(in); - } else { - // [replica] tell auth about any new caps wanted - request_inode_file_caps(in); - } - - // issue caps (pot. incl new one) - issue_caps(in); // note: _eval above may have done this already... - - // re-issue whatever we can - cap->issue(cap->pending()); - - // ok, stop suppressing. - cap->set_suppress(false); - - int now = cap->pending(); - if (before != now && - (before & CAP_FILE_WR) == 0 && - (now & CAP_FILE_WR)) { - // FIXME FIXME FIXME - } - - // twiddle file_data_version? - if ((before & CAP_FILE_WRBUFFER) == 0 && - (now & CAP_FILE_WRBUFFER)) { - in->inode.file_data_version++; - dout(7) << " incrementing file_data_version, now " << in->inode.file_data_version << " for " << *in << endl; - } - - return cap; -} - - - -bool Locker::issue_caps(CInode *in) -{ - // allowed caps are determined by the lock mode. 
- int allowed = in->filelock.caps_allowed(in->is_auth()); - dout(7) << "issue_caps filelock allows=" << cap_string(allowed) - << " on " << *in << endl; - - // count conflicts with - int nissued = 0; - - // client caps - for (map::iterator it = in->client_caps.begin(); - it != in->client_caps.end(); - it++) { - if (it->second.issued() != (it->second.wanted() & allowed)) { - // issue - nissued++; - - int before = it->second.pending(); - long seq = it->second.issue(it->second.wanted() & allowed); - int after = it->second.pending(); - - // twiddle file_data_version? - if (!(before & CAP_FILE_WRBUFFER) && - (after & CAP_FILE_WRBUFFER)) { - dout(7) << " incrementing file_data_version for " << *in << endl; - in->inode.file_data_version++; - } - - if (seq > 0 && - !it->second.is_suppress()) { - dout(7) << " sending MClientFileCaps to client" << it->first << " seq " << it->second.get_last_seq() << " new pending " << cap_string(it->second.pending()) << " was " << cap_string(before) << endl; - mds->messenger->send_message(new MClientFileCaps(in->inode, - it->second.get_last_seq(), - it->second.pending(), - it->second.wanted()), - mds->clientmap.get_inst(it->first), - 0, MDS_PORT_LOCKER); - } - } - } - - return (nissued == 0); // true if no re-issued, no callbacks -} - - - -void Locker::request_inode_file_caps(CInode *in) -{ - int wanted = in->get_caps_wanted(); - if (wanted != in->replica_caps_wanted) { - - if (wanted == 0) { - if (in->replica_caps_wanted_keep_until > g_clock.recent_now()) { - // ok, release them finally! 
- in->replica_caps_wanted_keep_until.sec_ref() = 0; - dout(7) << "request_inode_file_caps " << cap_string(wanted) - << " was " << cap_string(in->replica_caps_wanted) - << " no keeping anymore " - << " on " << *in - << endl; - } - else if (in->replica_caps_wanted_keep_until.sec() == 0) { - in->replica_caps_wanted_keep_until = g_clock.recent_now(); - in->replica_caps_wanted_keep_until.sec_ref() += 2; - - dout(7) << "request_inode_file_caps " << cap_string(wanted) - << " was " << cap_string(in->replica_caps_wanted) - << " keeping until " << in->replica_caps_wanted_keep_until - << " on " << *in - << endl; - return; - } else { - // wait longer - return; - } - } else { - in->replica_caps_wanted_keep_until.sec_ref() = 0; - } - assert(!in->is_auth()); - - int auth = in->authority(); - dout(7) << "request_inode_file_caps " << cap_string(wanted) - << " was " << cap_string(in->replica_caps_wanted) - << " on " << *in << " to mds" << auth << endl; - assert(!in->is_auth()); - - in->replica_caps_wanted = wanted; - mds->send_message_mds(new MInodeFileCaps(in->ino(), mds->get_nodeid(), - in->replica_caps_wanted), - auth, MDS_PORT_LOCKER); - } else { - in->replica_caps_wanted_keep_until.sec_ref() = 0; - } -} - -void Locker::handle_inode_file_caps(MInodeFileCaps *m) -{ - CInode *in = mdcache->get_inode(m->get_ino()); - assert(in); - assert(in->is_auth() || in->is_proxy()); - - dout(7) << "handle_inode_file_caps replica mds" << m->get_from() << " wants caps " << cap_string(m->get_caps()) << " on " << *in << endl; - - if (in->is_proxy()) { - dout(7) << "proxy, fw" << endl; - mds->send_message_mds(m, in->authority(), MDS_PORT_LOCKER); - return; - } - - if (m->get_caps()) - in->mds_caps_wanted[m->get_from()] = m->get_caps(); - else - in->mds_caps_wanted.erase(m->get_from()); - - inode_file_eval(in); - delete m; -} - - -/* - * note: we only get these from the client if - * - we are calling back previously issued caps (fewer than the client previously had) - * - or if the client releases 
(any of) its caps on its own - */ -void Locker::handle_client_file_caps(MClientFileCaps *m) -{ - int client = m->get_source().num(); - CInode *in = mdcache->get_inode(m->get_ino()); - Capability *cap = 0; - if (in) - cap = in->get_client_cap(client); - - if (!in || !cap) { - if (!in) { - dout(7) << "handle_client_file_caps on unknown ino " << m->get_ino() << ", dropping" << endl; - } else { - dout(7) << "handle_client_file_caps no cap for client" << client << " on " << *in << endl; - } - delete m; - return; - } - - assert(cap); - - // filter wanted based on what we could ever give out (given auth/replica status) - int wanted = m->get_wanted() & in->filelock.caps_allowed_ever(in->is_auth()); - - dout(7) << "handle_client_file_caps seq " << m->get_seq() - << " confirms caps " << cap_string(m->get_caps()) - << " wants " << cap_string(wanted) - << " from client" << client - << " on " << *in - << endl; - - // update wanted - if (cap->wanted() != wanted) - cap->set_wanted(wanted); - - // confirm caps - int had = cap->confirm_receipt(m->get_seq(), m->get_caps()); - int has = cap->confirmed(); - if (cap->is_null()) { - dout(7) << " cap for client" << client << " is now null, removing from " << *in << endl; - in->remove_client_cap(client); - if (!in->is_auth()) - request_inode_file_caps(in); - - // dec client addr counter - mds->clientmap.dec_open(client); - - // tell client. - MClientFileCaps *r = new MClientFileCaps(in->inode, - 0, 0, 0, - MClientFileCaps::FILECAP_RELEASE); - mds->messenger->send_message(r, m->get_source_inst(), 0, MDS_PORT_LOCKER); - } - - // merge in atime? 
- if (m->get_inode().atime > in->inode.atime) { - dout(7) << " taking atime " << m->get_inode().atime << " > " - << in->inode.atime << " for " << *in << endl; - in->inode.atime = m->get_inode().atime; - } - - if ((has|had) & CAP_FILE_WR) { - bool dirty = false; - - // mtime - if (m->get_inode().mtime > in->inode.mtime) { - dout(7) << " taking mtime " << m->get_inode().mtime << " > " - << in->inode.mtime << " for " << *in << endl; - in->inode.mtime = m->get_inode().mtime; - dirty = true; - } - // size - if (m->get_inode().size > in->inode.size) { - dout(7) << " taking size " << m->get_inode().size << " > " - << in->inode.size << " for " << *in << endl; - in->inode.size = m->get_inode().size; - dirty = true; - } - - if (dirty) - mds->mdlog->submit_entry(new EString("cap inode update dirty fixme")); - } - - // reevaluate, waiters - inode_file_eval(in); - in->finish_waiting(CINODE_WAIT_CAPS, 0); - - delete m; -} - - - - - - - - - - -// locks ---------------------------------------------------------------- - -/* - - -INODES: - -= two types of inode metadata: - hard - uid/gid, mode - file - mtime, size - ? atime - atime (*) <-- we want a lazy update strategy? - -= correspondingly, two types of inode locks: - hardlock - hard metadata - filelock - file metadata - - -> These locks are completely orthogonal! - -= metadata ops and how they affect inode metadata: - sma=size mtime atime - HARD FILE OP - files: - R RRR stat - RW chmod/chown - R W touch ?ctime - R openr - W read atime - R openw - Wc openwc ?ctime - WW write size mtime - close - - dirs: - R W readdir atime - RRR ( + implied stats on files) - Rc WW mkdir (ctime on new dir, size+mtime on parent dir) - R WW link/unlink/rename/rmdir (size+mtime on dir) - - - -= relationship to client (writers): - - - ops in question are - - stat ... need reasonable value for mtime (+ atime?) - - maybe we want a "quicksync" type operation instead of full lock - - truncate ... 
need to stop writers for the atomic truncate operation - - need a full lock - - - - -= modes - - SYNC - Rauth Rreplica Wauth Wreplica - sync - - - - - -ALSO: - - dirlock - no dir changes (prior to unhashing) - denlock - dentry lock (prior to unlink, rename) - - -*/ - - -void Locker::handle_lock(MLock *m) -{ - switch (m->get_otype()) { - case LOCK_OTYPE_IHARD: - handle_lock_inode_hard(m); - break; - - case LOCK_OTYPE_IFILE: - handle_lock_inode_file(m); - break; - - case LOCK_OTYPE_DIR: - handle_lock_dir(m); - break; - - case LOCK_OTYPE_DN: - handle_lock_dn(m); - break; - - default: - dout(7) << "handle_lock got otype " << m->get_otype() << endl; - assert(0); - break; - } -} - - - -// =============================== -// hard inode metadata - -bool Locker::inode_hard_read_try(CInode *in, Context *con) -{ - dout(7) << "inode_hard_read_try on " << *in << endl; - - // can read? grab ref. - if (in->hardlock.can_read(in->is_auth())) - return true; - - assert(!in->is_auth()); - - // wait! - dout(7) << "inode_hard_read_try waiting on " << *in << endl; - in->add_waiter(CINODE_WAIT_HARDR, con); - return false; -} - -bool Locker::inode_hard_read_start(CInode *in, MClientRequest *m) -{ - dout(7) << "inode_hard_read_start on " << *in << endl; - - // can read? grab ref. - if (in->hardlock.can_read(in->is_auth())) { - in->hardlock.get_read(); - return true; - } - - // can't read, and replicated. - assert(!in->is_auth()); - - // wait! 
- dout(7) << "inode_hard_read_start waiting on " << *in << endl; - in->add_waiter(CINODE_WAIT_HARDR, new C_MDS_RetryRequest(mds, m, in)); - return false; -} - - -void Locker::inode_hard_read_finish(CInode *in) -{ - // drop ref - assert(in->hardlock.can_read(in->is_auth())); - in->hardlock.put_read(); - - dout(7) << "inode_hard_read_finish on " << *in << endl; - - //if (in->hardlock.get_nread() == 0) in->finish_waiting(CINODE_WAIT_HARDNORD); -} - - -bool Locker::inode_hard_write_start(CInode *in, MClientRequest *m) -{ - dout(7) << "inode_hard_write_start on " << *in << endl; - - // if not replicated, i can twiddle lock at will - if (in->is_auth() && - !in->is_replicated() && - in->hardlock.get_state() != LOCK_LOCK) - in->hardlock.set_state(LOCK_LOCK); - - // can write? grab ref. - if (in->hardlock.can_write(in->is_auth())) { - assert(in->is_auth()); - if (!in->can_auth_pin()) { - dout(7) << "inode_hard_write_start waiting for authpinnable on " << *in << endl; - in->add_waiter(CINODE_WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, m, in)); - return false; - } - - in->auth_pin(); // ugh, can't condition this on nwrite==0 bc we twiddle that in handle_lock_* - in->hardlock.get_write(m); - return true; - } - - // can't write, replicated. 
- if (in->is_auth()) { - // auth - if (in->hardlock.can_write_soon(in->is_auth())) { - // just wait - } else { - // initiate lock - inode_hard_lock(in); - } - - dout(7) << "inode_hard_write_start waiting on " << *in << endl; - in->add_waiter(CINODE_WAIT_HARDW, new C_MDS_RetryRequest(mds, m, in)); - - return false; - } else { - // replica - // fw to auth - int auth = in->authority(); - dout(7) << "inode_hard_write_start " << *in << " on replica, fw to auth " << auth << endl; - assert(auth != mds->get_nodeid()); - mdcache->request_forward(m, auth); - return false; - } -} - - -void Locker::inode_hard_write_finish(CInode *in) -{ - // drop ref - //assert(in->hardlock.can_write(in->is_auth())); - in->hardlock.put_write(); - in->auth_unpin(); - dout(7) << "inode_hard_write_finish on " << *in << endl; - - // others waiting? - if (in->is_hardlock_write_wanted()) { - // wake 'em up - in->take_waiting(CINODE_WAIT_HARDW, mds->finished_queue); - } else { - // auto-sync if alone. - if (in->is_auth() && - !in->is_replicated() && - in->hardlock.get_state() != LOCK_SYNC) - in->hardlock.set_state(LOCK_SYNC); - - inode_hard_eval(in); - } -} - - -void Locker::inode_hard_eval(CInode *in) -{ - // finished gather? - if (in->is_auth() && - !in->hardlock.is_stable() && - in->hardlock.gather_set.empty()) { - dout(7) << "inode_hard_eval finished gather on " << *in << endl; - switch (in->hardlock.get_state()) { - case LOCK_GLOCKR: - in->hardlock.set_state(LOCK_LOCK); - - // waiters - //in->hardlock.get_write(); - in->finish_waiting(CINODE_WAIT_HARDRWB|CINODE_WAIT_HARDSTABLE); - //in->hardlock.put_write(); - break; - - default: - assert(0); - } - } - if (!in->hardlock.is_stable()) return; - - if (in->is_auth()) { - - // sync? 
- if (in->is_replicated() && - in->is_hardlock_write_wanted() && - in->hardlock.get_state() != LOCK_SYNC) { - dout(7) << "inode_hard_eval stable, syncing " << *in << endl; - inode_hard_sync(in); - } - - } else { - // replica - } -} - - -// mid - -void Locker::inode_hard_sync(CInode *in) -{ - dout(7) << "inode_hard_sync on " << *in << endl; - assert(in->is_auth()); - - // check state - if (in->hardlock.get_state() == LOCK_SYNC) - return; // already sync - if (in->hardlock.get_state() == LOCK_GLOCKR) - assert(0); // um... hmm! - assert(in->hardlock.get_state() == LOCK_LOCK); - - // hard data - bufferlist harddata; - in->encode_hard_state(harddata); - - // bcast to replicas - send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IHARD, harddata); - - // change lock - in->hardlock.set_state(LOCK_SYNC); - - // waiters? - in->finish_waiting(CINODE_WAIT_HARDSTABLE); -} - -void Locker::inode_hard_lock(CInode *in) -{ - dout(7) << "inode_hard_lock on " << *in << " hardlock=" << in->hardlock << endl; - assert(in->is_auth()); - - // check state - if (in->hardlock.get_state() == LOCK_LOCK || - in->hardlock.get_state() == LOCK_GLOCKR) - return; // already lock or locking - assert(in->hardlock.get_state() == LOCK_SYNC); - - // bcast to replicas - send_lock_message(in, LOCK_AC_LOCK, LOCK_OTYPE_IHARD); - - // change lock - in->hardlock.set_state(LOCK_GLOCKR); - in->hardlock.init_gather(in->get_replicas()); -} - - - - - -// messenger - -void Locker::handle_lock_inode_hard(MLock *m) -{ - assert(m->get_otype() == LOCK_OTYPE_IHARD); - - if (mds->logger) mds->logger->inc("lih"); - - int from = m->get_asker(); - CInode *in = mdcache->get_inode(m->get_ino()); - - if (LOCK_AC_FOR_AUTH(m->get_action())) { - // auth - assert(in); - assert(in->is_auth() || in->is_proxy()); - dout(7) << "handle_lock_inode_hard " << *in << " hardlock=" << in->hardlock << endl; - - if (in->is_proxy()) { - // fw - int newauth = in->authority(); - assert(newauth >= 0); - if (from == newauth) { - dout(7) << "handle_lock " 
<< m->get_ino() << " from " << from << ": proxy, but from new auth, dropping" << endl; - delete m; - } else { - dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, fw to " << newauth << endl; - mds->send_message_mds(m, newauth, MDS_PORT_LOCKER); - } - return; - } - } else { - // replica - if (!in) { - dout(7) << "handle_lock_inode_hard " << m->get_ino() << ": don't have it anymore" << endl; - /* do NOT nak.. if we go that route we need to duplicate all the nonce funkiness - to keep gather_set a proper/correct subset of cached_by. better to use the existing - cacheexpire mechanism instead! - */ - delete m; - return; - } - - assert(!in->is_auth()); - } - - dout(7) << "handle_lock_inode_hard a=" << m->get_action() << " from " << from << " " << *in << " hardlock=" << in->hardlock << endl; - - CLock *lock = &in->hardlock; - - switch (m->get_action()) { - // -- replica -- - case LOCK_AC_SYNC: - assert(lock->get_state() == LOCK_LOCK); - - { // assim data - int off = 0; - in->decode_hard_state(m->get_data(), off); - } - - // update lock - lock->set_state(LOCK_SYNC); - - // no need to reply - - // waiters - in->finish_waiting(CINODE_WAIT_HARDR|CINODE_WAIT_HARDSTABLE); - break; - - case LOCK_AC_LOCK: - assert(lock->get_state() == LOCK_SYNC); - //|| lock->get_state() == LOCK_GLOCKR); - - // wait for readers to finish? - if (lock->get_nread() > 0) { - dout(7) << "handle_lock_inode_hard readers, waiting before ack on " << *in << endl; - lock->set_state(LOCK_GLOCKR); - in->add_waiter(CINODE_WAIT_HARDNORD, - new C_MDS_RetryMessage(mds,m)); - assert(0); // does this ever happen? (if so, fix hard_read_finish, and CInodeExport.update_inode!) 
- return; - } else { - - // update lock and reply - lock->set_state(LOCK_LOCK); - - { - MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid()); - reply->set_ino(in->ino(), LOCK_OTYPE_IHARD); - mds->send_message_mds(reply, from, MDS_PORT_LOCKER); - } - } - break; - - - // -- auth -- - case LOCK_AC_LOCKACK: - assert(lock->state == LOCK_GLOCKR); - assert(lock->gather_set.count(from)); - lock->gather_set.erase(from); - - if (lock->gather_set.size()) { - dout(7) << "handle_lock_inode_hard " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; - } else { - dout(7) << "handle_lock_inode_hard " << *in << " from " << from << ", last one" << endl; - inode_hard_eval(in); - } - } - delete m; -} - - - - -// ===================== -// soft inode metadata - - -bool Locker::inode_file_read_start(CInode *in, MClientRequest *m) -{ - dout(7) << "inode_file_read_start " << *in << " filelock=" << in->filelock << endl; - - // can read? grab ref. - if (in->filelock.can_read(in->is_auth())) { - in->filelock.get_read(); - return true; - } - - // can't read, and replicated. - if (in->filelock.can_read_soon(in->is_auth())) { - // wait - dout(7) << "inode_file_read_start can_read_soon " << *in << endl; - } else { - if (in->is_auth()) { - // auth - - // FIXME or qsync? 
- - if (in->filelock.is_stable()) { - inode_file_lock(in); // lock, bc easiest to back off - - if (in->filelock.can_read(in->is_auth())) { - in->filelock.get_read(); - - //in->filelock.get_write(); - in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE); - //in->filelock.put_write(); - return true; - } - } else { - dout(7) << "inode_file_read_start waiting until stable on " << *in << ", filelock=" << in->filelock << endl; - in->add_waiter(CINODE_WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, in)); - return false; - } - } else { - // replica - if (in->filelock.is_stable()) { - - // fw to auth - int auth = in->authority(); - dout(7) << "inode_file_read_start " << *in << " on replica and async, fw to auth " << auth << endl; - assert(auth != mds->get_nodeid()); - mdcache->request_forward(m, auth); - return false; - - } else { - // wait until stable - dout(7) << "inode_file_read_start waiting until stable on " << *in << ", filelock=" << in->filelock << endl; - in->add_waiter(CINODE_WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, in)); - return false; - } - } - } - - // wait - dout(7) << "inode_file_read_start waiting on " << *in << ", filelock=" << in->filelock << endl; - in->add_waiter(CINODE_WAIT_FILER, new C_MDS_RetryRequest(mds, m, in)); - - return false; -} - - -void Locker::inode_file_read_finish(CInode *in) -{ - // drop ref - assert(in->filelock.can_read(in->is_auth())); - in->filelock.put_read(); - - dout(7) << "inode_file_read_finish on " << *in << ", filelock=" << in->filelock << endl; - - if (in->filelock.get_nread() == 0) { - in->finish_waiting(CINODE_WAIT_FILENORD); - inode_file_eval(in); - } -} - - -bool Locker::inode_file_write_start(CInode *in, MClientRequest *m) -{ - dout(7) << "inode_file_write_start on " << *in << endl; - - // can't write? - if (!in->filelock.can_write(in->is_auth())) { - - // can't write. 
- if (in->is_auth()) { - // auth - if (!in->filelock.can_write_soon(in->is_auth())) { - if (!in->filelock.is_stable()) { - dout(7) << "inode_file_write_start on auth, waiting for stable on " << *in << endl; - in->add_waiter(CINODE_WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, in)); - return false; - } - - // initiate lock - inode_file_lock(in); - - // fall-thru to below. - } - } else { - // replica - // fw to auth - int auth = in->authority(); - dout(7) << "inode_file_write_start " << *in << " on replica, fw to auth " << auth << endl; - assert(auth != mds->get_nodeid()); - mdcache->request_forward(m, auth); - return false; - } - } - - // check again - if (in->filelock.can_write(in->is_auth())) { - // can i auth pin? - assert(in->is_auth()); - if (!in->can_auth_pin()) { - dout(7) << "inode_file_write_start waiting for authpinnable on " << *in << endl; - in->add_waiter(CINODE_WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, m, in)); - return false; - } - - in->auth_pin(); - in->filelock.get_write(m); - return true; - } else { - dout(7) << "inode_file_write_start on auth, waiting for write on " << *in << endl; - in->add_waiter(CINODE_WAIT_FILEW, new C_MDS_RetryRequest(mds, m, in)); - return false; - } -} - - -void Locker::inode_file_write_finish(CInode *in) -{ - // drop ref - //assert(in->filelock.can_write(in->is_auth())); - in->filelock.put_write(); - in->auth_unpin(); - dout(7) << "inode_file_write_finish on " << *in << ", filelock=" << in->filelock << endl; - - // drop lock? - if (!in->is_filelock_write_wanted()) { - in->finish_waiting(CINODE_WAIT_FILENOWR); - inode_file_eval(in); - } -} - - -/* - * ... - * - * also called after client caps are acked to us - * - checks if we're in unstable sfot state and can now move on to next state - * - checks if soft state should change (eg bc last writer closed) - */ - -void Locker::inode_file_eval(CInode *in) -{ - int issued = in->get_caps_issued(); - - // [auth] finished gather? 
- if (in->is_auth() && - !in->filelock.is_stable() && - in->filelock.gather_set.size() == 0) { - dout(7) << "inode_file_eval finished mds gather on " << *in << endl; - - switch (in->filelock.get_state()) { - // to lock - case LOCK_GLOCKR: - case LOCK_GLOCKM: - case LOCK_GLOCKL: - if ((issued & ~CAP_FILE_RDCACHE) == 0) { - in->filelock.set_state(LOCK_LOCK); - - // waiters - in->filelock.get_read(); - //in->filelock.get_write(); - in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE); - in->filelock.put_read(); - //in->filelock.put_write(); - } - break; - - // to mixed - case LOCK_GMIXEDR: - if ((issued & ~(CAP_FILE_RD)) == 0) { - in->filelock.set_state(LOCK_MIXED); - in->finish_waiting(CINODE_WAIT_FILESTABLE); - } - break; - - case LOCK_GMIXEDL: - if ((issued & ~(CAP_FILE_WR)) == 0) { - in->filelock.set_state(LOCK_MIXED); - - if (in->is_replicated()) { - // data - bufferlist softdata; - in->encode_file_state(softdata); - - // bcast to replicas - send_lock_message(in, LOCK_AC_MIXED, LOCK_OTYPE_IFILE, softdata); - } - - in->finish_waiting(CINODE_WAIT_FILESTABLE); - } - break; - - // to loner - case LOCK_GLONERR: - if (issued == 0) { - in->filelock.set_state(LOCK_LONER); - in->finish_waiting(CINODE_WAIT_FILESTABLE); - } - break; - - case LOCK_GLONERM: - if ((issued & ~CAP_FILE_WR) == 0) { - in->filelock.set_state(LOCK_LONER); - in->finish_waiting(CINODE_WAIT_FILESTABLE); - } - break; - - // to sync - case LOCK_GSYNCL: - case LOCK_GSYNCM: - if ((issued & ~(CAP_FILE_RD)) == 0) { - in->filelock.set_state(LOCK_SYNC); - - { // bcast data to replicas - bufferlist softdata; - in->encode_file_state(softdata); - - send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IFILE, softdata); - } - - // waiters - in->filelock.get_read(); - in->finish_waiting(CINODE_WAIT_FILER|CINODE_WAIT_FILESTABLE); - in->filelock.put_read(); - } - break; - - default: - assert(0); - } - - issue_caps(in); - } - - // [replica] finished caps gather? 
- if (!in->is_auth() && - !in->filelock.is_stable()) { - switch (in->filelock.get_state()) { - case LOCK_GMIXEDR: - if ((issued & ~(CAP_FILE_RD)) == 0) { - in->filelock.set_state(LOCK_MIXED); - - // ack - MLock *reply = new MLock(LOCK_AC_MIXEDACK, mds->get_nodeid()); - reply->set_ino(in->ino(), LOCK_OTYPE_IFILE); - mds->send_message_mds(reply, in->authority(), MDS_PORT_LOCKER); - } - break; - - case LOCK_GLOCKR: - if (issued == 0) { - in->filelock.set_state(LOCK_LOCK); - - // ack - MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid()); - reply->set_ino(in->ino(), LOCK_OTYPE_IFILE); - mds->send_message_mds(reply, in->authority(), MDS_PORT_LOCKER); - } - break; - - default: - assert(0); - } - } - - // !stable -> do nothing. - if (!in->filelock.is_stable()) return; - - - // stable. - assert(in->filelock.is_stable()); - - if (in->is_auth()) { - // [auth] - int wanted = in->get_caps_wanted(); - bool loner = (in->client_caps.size() == 1) && in->mds_caps_wanted.empty(); - dout(7) << "inode_file_eval wanted=" << cap_string(wanted) - << " filelock=" << in->filelock - << " loner=" << loner - << endl; - - // * -> loner? - if (in->filelock.get_nread() == 0 && - !in->is_filelock_write_wanted() && - (wanted & CAP_FILE_WR) && - loner && - in->filelock.get_state() != LOCK_LONER) { - dout(7) << "inode_file_eval stable, bump to loner " << *in << ", filelock=" << in->filelock << endl; - inode_file_loner(in); - } - - // * -> mixed? - else if (in->filelock.get_nread() == 0 && - !in->is_filelock_write_wanted() && - (wanted & CAP_FILE_RD) && - (wanted & CAP_FILE_WR) && - !(loner && in->filelock.get_state() == LOCK_LONER) && - in->filelock.get_state() != LOCK_MIXED) { - dout(7) << "inode_file_eval stable, bump to mixed " << *in << ", filelock=" << in->filelock << endl; - inode_file_mixed(in); - } - - // * -> sync? 
- else if (!in->is_filelock_write_wanted() && - !(wanted & CAP_FILE_WR) && - ((wanted & CAP_FILE_RD) || - in->is_replicated() || - (!loner && in->filelock.get_state() == LOCK_LONER)) && - in->filelock.get_state() != LOCK_SYNC) { - dout(7) << "inode_file_eval stable, bump to sync " << *in << ", filelock=" << in->filelock << endl; - inode_file_sync(in); - } - - // * -> lock? (if not replicated or open) - else if (!in->is_replicated() && - wanted == 0 && - in->filelock.get_state() != LOCK_LOCK) { - inode_file_lock(in); - } - - } else { - // replica - // recall? check wiaters? XXX - } -} - - -// mid - -bool Locker::inode_file_sync(CInode *in) -{ - dout(7) << "inode_file_sync " << *in << " filelock=" << in->filelock << endl; - - assert(in->is_auth()); - - // check state - if (in->filelock.get_state() == LOCK_SYNC || - in->filelock.get_state() == LOCK_GSYNCL || - in->filelock.get_state() == LOCK_GSYNCM) - return true; - - assert(in->filelock.is_stable()); - - int issued = in->get_caps_issued(); - - assert((in->get_caps_wanted() & CAP_FILE_WR) == 0); - - if (in->filelock.get_state() == LOCK_LOCK) { - if (in->is_replicated()) { - // soft data - bufferlist softdata; - in->encode_file_state(softdata); - - // bcast to replicas - send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IFILE, softdata); - } - - // change lock - in->filelock.set_state(LOCK_SYNC); - - // reissue caps - issue_caps(in); - return true; - } - - else if (in->filelock.get_state() == LOCK_MIXED) { - // writers? - if (issued & CAP_FILE_WR) { - // gather client write caps - in->filelock.set_state(LOCK_GSYNCM); - issue_caps(in); - } else { - // no writers, go straight to sync - - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IFILE); - } - - // change lock - in->filelock.set_state(LOCK_SYNC); - } - return false; - } - - else if (in->filelock.get_state() == LOCK_LONER) { - // writers? 
- if (issued & CAP_FILE_WR) { - // gather client write caps - in->filelock.set_state(LOCK_GSYNCL); - issue_caps(in); - } else { - // no writers, go straight to sync - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IFILE); - } - - // change lock - in->filelock.set_state(LOCK_SYNC); - } - return false; - } - else - assert(0); // wtf. - - return false; -} - - - -void Locker::inode_file_lock(CInode *in) -{ - dout(7) << "inode_file_lock " << *in << " filelock=" << in->filelock << endl; - - assert(in->is_auth()); - - // check state - if (in->filelock.get_state() == LOCK_LOCK || - in->filelock.get_state() == LOCK_GLOCKR || - in->filelock.get_state() == LOCK_GLOCKM || - in->filelock.get_state() == LOCK_GLOCKL) - return; // lock or locking - - assert(in->filelock.is_stable()); - - int issued = in->get_caps_issued(); - - if (in->filelock.get_state() == LOCK_SYNC) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(in, LOCK_AC_LOCK, LOCK_OTYPE_IFILE); - in->filelock.init_gather(in->get_replicas()); - - // change lock - in->filelock.set_state(LOCK_GLOCKR); - - // call back caps - if (issued) - issue_caps(in); - } else { - if (issued) { - // call back caps - in->filelock.set_state(LOCK_GLOCKR); - issue_caps(in); - } else { - in->filelock.set_state(LOCK_LOCK); - } - } - } - - else if (in->filelock.get_state() == LOCK_MIXED) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(in, LOCK_AC_LOCK, LOCK_OTYPE_IFILE); - in->filelock.init_gather(in->get_replicas()); - - // change lock - in->filelock.set_state(LOCK_GLOCKM); - - // call back caps - issue_caps(in); - } else { - //assert(issued); // ??? 
-sage 2/19/06 - if (issued) { - // change lock - in->filelock.set_state(LOCK_GLOCKM); - - // call back caps - issue_caps(in); - } else { - in->filelock.set_state(LOCK_LOCK); - } - } - - } - else if (in->filelock.get_state() == LOCK_LONER) { - if (issued & CAP_FILE_WR) { - // change lock - in->filelock.set_state(LOCK_GLOCKL); - - // call back caps - issue_caps(in); - } else { - in->filelock.set_state(LOCK_LOCK); - } - } - else - assert(0); // wtf. -} - - -void Locker::inode_file_mixed(CInode *in) -{ - dout(7) << "inode_file_mixed " << *in << " filelock=" << in->filelock << endl; - - assert(in->is_auth()); - - // check state - if (in->filelock.get_state() == LOCK_GMIXEDR || - in->filelock.get_state() == LOCK_GMIXEDL) - return; // mixed or mixing - - assert(in->filelock.is_stable()); - - int issued = in->get_caps_issued(); - - if (in->filelock.get_state() == LOCK_SYNC) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(in, LOCK_AC_MIXED, LOCK_OTYPE_IFILE); - in->filelock.init_gather(in->get_replicas()); - - in->filelock.set_state(LOCK_GMIXEDR); - issue_caps(in); - } else { - if (issued) { - in->filelock.set_state(LOCK_GMIXEDR); - issue_caps(in); - } else { - in->filelock.set_state(LOCK_MIXED); - } - } - } - - else if (in->filelock.get_state() == LOCK_LOCK) { - if (in->is_replicated()) { - // data - bufferlist softdata; - in->encode_file_state(softdata); - - // bcast to replicas - send_lock_message(in, LOCK_AC_MIXED, LOCK_OTYPE_IFILE, softdata); - } - - // change lock - in->filelock.set_state(LOCK_MIXED); - issue_caps(in); - } - - else if (in->filelock.get_state() == LOCK_LONER) { - if (issued & CAP_FILE_WRBUFFER) { - // gather up WRBUFFER caps - in->filelock.set_state(LOCK_GMIXEDL); - issue_caps(in); - } - else if (in->is_replicated()) { - // bcast to replicas - send_lock_message(in, LOCK_AC_MIXED, LOCK_OTYPE_IFILE); - in->filelock.set_state(LOCK_MIXED); - issue_caps(in); - } else { - in->filelock.set_state(LOCK_MIXED); - issue_caps(in); - } - } 
- - else - assert(0); // wtf. -} - - -void Locker::inode_file_loner(CInode *in) -{ - dout(7) << "inode_file_loner " << *in << " filelock=" << in->filelock << endl; - - assert(in->is_auth()); - - // check state - if (in->filelock.get_state() == LOCK_LONER || - in->filelock.get_state() == LOCK_GLONERR || - in->filelock.get_state() == LOCK_GLONERM) - return; - - assert(in->filelock.is_stable()); - assert((in->client_caps.size() == 1) && in->mds_caps_wanted.empty()); - - if (in->filelock.get_state() == LOCK_SYNC) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(in, LOCK_AC_LOCK, LOCK_OTYPE_IFILE); - in->filelock.init_gather(in->get_replicas()); - - // change lock - in->filelock.set_state(LOCK_GLONERR); - } else { - // only one guy with file open, who gets it all, so - in->filelock.set_state(LOCK_LONER); - issue_caps(in); - } - } - - else if (in->filelock.get_state() == LOCK_LOCK) { - // change lock. ignore replicas; they don't know about LONER. - in->filelock.set_state(LOCK_LONER); - issue_caps(in); - } - - else if (in->filelock.get_state() == LOCK_MIXED) { - if (in->is_replicated()) { - // bcast to replicas - send_lock_message(in, LOCK_AC_LOCK, LOCK_OTYPE_IFILE); - in->filelock.init_gather(in->get_replicas()); - - // change lock - in->filelock.set_state(LOCK_GLONERM); - } else { - in->filelock.set_state(LOCK_LONER); - issue_caps(in); - } - } - - else - assert(0); -} - -// messenger - -void Locker::handle_lock_inode_file(MLock *m) -{ - assert(m->get_otype() == LOCK_OTYPE_IFILE); - - if (mds->logger) mds->logger->inc("lif"); - - CInode *in = mdcache->get_inode(m->get_ino()); - int from = m->get_asker(); - - if (LOCK_AC_FOR_AUTH(m->get_action())) { - // auth - assert(in); - assert(in->is_auth() || in->is_proxy()); - dout(7) << "handle_lock_inode_file " << *in << " hardlock=" << in->hardlock << endl; - - if (in->is_proxy()) { - // fw - int newauth = in->authority(); - assert(newauth >= 0); - if (from == newauth) { - dout(7) << "handle_lock " << 
m->get_ino() << " from " << from << ": proxy, but from new auth, dropping" << endl; - delete m; - } else { - dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, fw to " << newauth << endl; - mds->send_message_mds(m, newauth, MDS_PORT_LOCKER); - } - return; - } - } else { - // replica - if (!in) { - // drop it. don't nak. - dout(7) << "handle_lock " << m->get_ino() << ": don't have it anymore" << endl; - delete m; - return; - } - - assert(!in->is_auth()); - } - - dout(7) << "handle_lock_inode_file a=" << m->get_action() << " from " << from << " " << *in << " filelock=" << in->filelock << endl; - - CLock *lock = &in->filelock; - int issued = in->get_caps_issued(); - - switch (m->get_action()) { - // -- replica -- - case LOCK_AC_SYNC: - assert(lock->get_state() == LOCK_LOCK || - lock->get_state() == LOCK_MIXED); - - { // assim data - int off = 0; - in->decode_file_state(m->get_data(), off); - } - - // update lock - lock->set_state(LOCK_SYNC); - - // no need to reply. - - // waiters - in->filelock.get_read(); - in->finish_waiting(CINODE_WAIT_FILER|CINODE_WAIT_FILESTABLE); - in->filelock.put_read(); - inode_file_eval(in); - break; - - case LOCK_AC_LOCK: - assert(lock->get_state() == LOCK_SYNC || - lock->get_state() == LOCK_MIXED); - - // call back caps? - if (issued & CAP_FILE_RD) { - dout(7) << "handle_lock_inode_file client readers, gathering caps on " << *in << endl; - issue_caps(in); - } - if (lock->get_nread() > 0) { - dout(7) << "handle_lock_inode_file readers, waiting before ack on " << *in << endl; - in->add_waiter(CINODE_WAIT_FILENORD, - new C_MDS_RetryMessage(mds,m)); - lock->set_state(LOCK_GLOCKR); - assert(0);// i am broken.. why retry message when state captures all the info i need? - return; - } - if (issued & CAP_FILE_RD) { - lock->set_state(LOCK_GLOCKR); - break; - } - - // nothing to wait for, lock and ack. 
- { - lock->set_state(LOCK_LOCK); - - MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid()); - reply->set_ino(in->ino(), LOCK_OTYPE_IFILE); - mds->send_message_mds(reply, from, MDS_PORT_LOCKER); - } - break; - - case LOCK_AC_MIXED: - assert(lock->get_state() == LOCK_SYNC || - lock->get_state() == LOCK_LOCK); - - if (lock->get_state() == LOCK_SYNC) { - // MIXED - if (issued & CAP_FILE_RD) { - // call back client caps - lock->set_state(LOCK_GMIXEDR); - issue_caps(in); - break; - } else { - // no clients, go straight to mixed - lock->set_state(LOCK_MIXED); - - // ack - MLock *reply = new MLock(LOCK_AC_MIXEDACK, mds->get_nodeid()); - reply->set_ino(in->ino(), LOCK_OTYPE_IFILE); - mds->send_message_mds(reply, from, MDS_PORT_LOCKER); - } - } else { - // LOCK - lock->set_state(LOCK_MIXED); - - // no ack needed. - } - - issue_caps(in); - - // waiters - //in->filelock.get_write(); - in->finish_waiting(CINODE_WAIT_FILEW|CINODE_WAIT_FILESTABLE); - //in->filelock.put_write(); - inode_file_eval(in); - break; - - - - - // -- auth -- - case LOCK_AC_LOCKACK: - assert(lock->state == LOCK_GLOCKR || - lock->state == LOCK_GLOCKM || - lock->state == LOCK_GLONERM || - lock->state == LOCK_GLONERR); - assert(lock->gather_set.count(from)); - lock->gather_set.erase(from); - - if (lock->gather_set.size()) { - dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; - } else { - dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl; - inode_file_eval(in); - } - break; - - case LOCK_AC_SYNCACK: - assert(lock->state == LOCK_GSYNCM); - assert(lock->gather_set.count(from)); - lock->gather_set.erase(from); - - /* not used currently - { - // merge data (keep largest size, mtime, etc.) 
- int off = 0; - in->decode_merge_file_state(m->get_data(), off); - } - */ - - if (lock->gather_set.size()) { - dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; - } else { - dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl; - inode_file_eval(in); - } - break; - - case LOCK_AC_MIXEDACK: - assert(lock->state == LOCK_GMIXEDR); - assert(lock->gather_set.count(from)); - lock->gather_set.erase(from); - - if (lock->gather_set.size()) { - dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; - } else { - dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl; - inode_file_eval(in); - } - break; - - - default: - assert(0); - } - - delete m; -} - - - - - - - - - - - - - - -void Locker::handle_lock_dir(MLock *m) -{ - -} - - - -// DENTRY - -bool Locker::dentry_xlock_start(CDentry *dn, Message *m, CInode *ref) -{ - dout(7) << "dentry_xlock_start on " << *dn << endl; - - // locked? - if (dn->lockstate == DN_LOCK_XLOCK) { - if (dn->xlockedby == m) return true; // locked by me! - - // not by me, wait - dout(7) << "dentry " << *dn << " xlock by someone else" << endl; - dn->dir->add_waiter(CDIR_WAIT_DNREAD, dn->name, - new C_MDS_RetryRequest(mds,m,ref)); - return false; - } - - // prelock? - if (dn->lockstate == DN_LOCK_PREXLOCK) { - if (dn->xlockedby == m) { - dout(7) << "dentry " << *dn << " prexlock by me" << endl; - dn->dir->add_waiter(CDIR_WAIT_DNLOCK, dn->name, - new C_MDS_RetryRequest(mds,m,ref)); - } else { - dout(7) << "dentry " << *dn << " prexlock by someone else" << endl; - dn->dir->add_waiter(CDIR_WAIT_DNREAD, dn->name, - new C_MDS_RetryRequest(mds,m,ref)); - } - return false; - } - - - // lockable! - assert(dn->lockstate == DN_LOCK_SYNC || - dn->lockstate == DN_LOCK_UNPINNING); - - // dir auth pinnable? 
- if (!dn->dir->can_auth_pin()) { - dout(7) << "dentry " << *dn << " dir not pinnable, waiting" << endl; - dn->dir->add_waiter(CDIR_WAIT_AUTHPINNABLE, - new C_MDS_RetryRequest(mds,m,ref)); - return false; - } - - // is dentry path pinned? - if (dn->is_pinned()) { - dout(7) << "dentry " << *dn << " pinned, waiting" << endl; - dn->lockstate = DN_LOCK_UNPINNING; - dn->dir->add_waiter(CDIR_WAIT_DNUNPINNED, - dn->name, - new C_MDS_RetryRequest(mds,m,ref)); - return false; - } - - // pin path up to dentry! (if success, point of no return) - CDentry *pdn = dn->dir->inode->get_parent_dn(); - if (pdn) { - if (mdcache->active_requests[m].traces.count(pdn)) { - dout(7) << "already path pinned parent dentry " << *pdn << endl; - } else { - dout(7) << "pinning parent dentry " << *pdn << endl; - vector trace; - mdcache->make_trace(trace, pdn->inode); - assert(trace.size()); - - if (!mdcache->path_pin(trace, m, new C_MDS_RetryRequest(mds, m, ref))) return false; - - mdcache->active_requests[m].traces[trace[trace.size()-1]] = trace; - } - } - - // pin dir! - dn->dir->auth_pin(); - - // mine! - dn->xlockedby = m; - - if (dn->is_replicated()) { - dn->lockstate = DN_LOCK_PREXLOCK; - - // xlock with whom? 
- set who; - for (map::iterator p = dn->replicas_begin(); - p != dn->replicas_end(); - ++p) - who.insert(p->first); - dn->gather_set = who; - - // make path - string path; - dn->make_path(path); - dout(10) << "path is " << path << " for " << *dn << endl; - - for (set::iterator it = who.begin(); - it != who.end(); - it++) { - MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid()); - m->set_dn(dn->dir->ino(), dn->name); - m->set_path(path); - mds->send_message_mds(m, *it, MDS_PORT_LOCKER); - } - - // wait - dout(7) << "dentry_xlock_start locking, waiting for replicas " << endl; - dn->dir->add_waiter(CDIR_WAIT_DNLOCK, dn->name, - new C_MDS_RetryRequest(mds, m, ref)); - return false; - } else { - dn->lockstate = DN_LOCK_XLOCK; - mdcache->active_requests[dn->xlockedby].xlocks.insert(dn); - return true; - } -} - -void Locker::dentry_xlock_finish(CDentry *dn, bool quiet) -{ - dout(7) << "dentry_xlock_finish on " << *dn << endl; - - assert(dn->xlockedby); - if (dn->xlockedby == DN_XLOCK_FOREIGN) { - dout(7) << "this was a foreign xlock" << endl; - } else { - // remove from request record - assert(mdcache->active_requests[dn->xlockedby].xlocks.count(dn) == 1); - mdcache->active_requests[dn->xlockedby].xlocks.erase(dn); - } - - dn->xlockedby = 0; - dn->lockstate = DN_LOCK_SYNC; - - // unpin parent dir? - // -> no? because we might have xlocked 2 things in this dir. - // instead, we let request_finish clean up the mess. - - // tell replicas? - if (!quiet) { - // tell even if dn is null. 
- if (dn->is_replicated()) { - send_lock_message(dn, LOCK_AC_SYNC); - } - } - - // unpin dir - dn->dir->auth_unpin(); - - // kick waiters - list finished; - dn->dir->take_waiting(CDIR_WAIT_DNREAD, finished); - mds->queue_finished(finished); -} - - -/* - * onfinish->finish() will be called with - * 0 on successful xlock, - * -1 on failure - */ - -class C_MDC_XlockRequest : public Context { - Locker *mdc; - CDir *dir; - string dname; - Message *req; - Context *finisher; -public: - C_MDC_XlockRequest(Locker *mdc, - CDir *dir, string& dname, - Message *req, - Context *finisher) { - this->mdc = mdc; - this->dir = dir; - this->dname = dname; - this->req = req; - this->finisher = finisher; - } - - void finish(int r) { - mdc->dentry_xlock_request_finish(r, dir, dname, req, finisher); - } -}; - -void Locker::dentry_xlock_request_finish(int r, - CDir *dir, string& dname, - Message *req, - Context *finisher) -{ - dout(10) << "dentry_xlock_request_finish r = " << r << endl; - if (r == 1) { // 1 for xlock request success - CDentry *dn = dir->lookup(dname); - if (dn && dn->xlockedby == 0) { - // success - dn->xlockedby = req; // our request was the winner - dout(10) << "xlock request success, now xlocked by req " << req << " dn " << *dn << endl; - - // remember! - mdcache->active_requests[req].foreign_xlocks.insert(dn); - } - } - - // retry request (or whatever) - finisher->finish(0); - delete finisher; -} - -void Locker::dentry_xlock_request(CDir *dir, string& dname, bool create, - Message *req, Context *onfinish) -{ - dout(10) << "dentry_xlock_request on dn " << dname << " create=" << create << " in " << *dir << endl; - // send request - int dauth = dir->dentry_authority(dname); - MLock *m = new MLock(create ? 
LOCK_AC_REQXLOCKC:LOCK_AC_REQXLOCK, mds->get_nodeid()); - m->set_dn(dir->ino(), dname); - mds->send_message_mds(m, dauth, MDS_PORT_LOCKER); - - // add waiter - dir->add_waiter(CDIR_WAIT_DNREQXLOCK, dname, - new C_MDC_XlockRequest(this, - dir, dname, req, - onfinish)); -} - - - - -void Locker::handle_lock_dn(MLock *m) -{ - assert(m->get_otype() == LOCK_OTYPE_DN); - - CInode *diri = mdcache->get_inode(m->get_ino()); // may be null - CDir *dir = 0; - if (diri) dir = diri->dir; // may be null - string dname = m->get_dn(); - int from = m->get_asker(); - CDentry *dn = 0; - - if (LOCK_AC_FOR_AUTH(m->get_action())) { - // auth - - // normally we have it always - if (diri && dir) { - int dauth = dir->dentry_authority(dname); - assert(dauth == mds->get_nodeid() || dir->is_proxy() || // mine or proxy, - m->get_action() == LOCK_AC_REQXLOCKACK || // or we did a REQXLOCK and this is our ack/nak - m->get_action() == LOCK_AC_REQXLOCKNAK); - - if (dir->is_proxy()) { - - assert(dauth >= 0); - - if (dauth == m->get_asker() && - (m->get_action() == LOCK_AC_REQXLOCK || - m->get_action() == LOCK_AC_REQXLOCKC)) { - dout(7) << "handle_lock_dn got reqxlock from " << dauth << " and they are auth.. dropping on floor (their import will have woken them up)" << endl; - if (mdcache->active_requests.count(m)) - mdcache->request_finish(m); - else - delete m; - return; - } - - dout(7) << "handle_lock_dn " << m << " " << m->get_ino() << " dname " << dname << " from " << from << ": proxy, fw to " << dauth << endl; - - // forward - if (mdcache->active_requests.count(m)) { - // xlock requests are requests, use request_* functions! 
- assert(m->get_action() == LOCK_AC_REQXLOCK || - m->get_action() == LOCK_AC_REQXLOCKC); - // forward as a request - mdcache->request_forward(m, dauth, MDS_PORT_LOCKER); - } else { - // not an xlock req, or it is and we just didn't register the request yet - // forward normally - mds->send_message_mds(m, dauth, MDS_PORT_LOCKER); - } - return; - } - - dn = dir->lookup(dname); - } - - // except with.. an xlock request? - if (!dn) { - assert(dir); // we should still have the dir, though! the requester has the dir open. - switch (m->get_action()) { - - case LOCK_AC_LOCK: - dout(7) << "handle_lock_dn xlock on " << dname << ", adding (null)" << endl; - dn = dir->add_dentry(dname); - break; - - case LOCK_AC_REQXLOCK: - // send nak - if (dir->state_test(CDIR_STATE_DELETED)) { - dout(7) << "handle_lock_dn reqxlock on deleted dir " << *dir << ", nak" << endl; - } else { - dout(7) << "handle_lock_dn reqxlock on " << dname << " in " << *dir << " dne, nak" << endl; - } - { - MLock *reply = new MLock(LOCK_AC_REQXLOCKNAK, mds->get_nodeid()); - reply->set_dn(dir->ino(), dname); - reply->set_path(m->get_path()); - mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER); - } - - // finish request (if we got that far) - if (mdcache->active_requests.count(m)) - mdcache->request_finish(m); - - delete m; - return; - - case LOCK_AC_REQXLOCKC: - dout(7) << "handle_lock_dn reqxlockc on " << dname << " in " << *dir << " dne (yet!)" << endl; - break; - - default: - assert(0); - } - } - } else { - // replica - if (dir) dn = dir->lookup(dname); - if (!dn) { - dout(7) << "handle_lock_dn " << m << " don't have " << m->get_ino() << " dname " << dname << endl; - - if (m->get_action() == LOCK_AC_REQXLOCKACK || - m->get_action() == LOCK_AC_REQXLOCKNAK) { - dout(7) << "handle_lock_dn got reqxlockack/nak, but don't have dn " << m->get_path() << ", discovering" << endl; - //assert(0); // how can this happen? tell me now! 
- - vector trace; - filepath path = m->get_path(); - int r = mdcache->path_traverse(path, trace, true, - m, new C_MDS_RetryMessage(mds,m), - MDS_TRAVERSE_DISCOVER); - assert(r>0); - return; - } - - if (m->get_action() == LOCK_AC_LOCK) { - if (0) { // not anymore - dout(7) << "handle_lock_dn don't have " << m->get_path() << ", discovering" << endl; - - vector trace; - filepath path = m->get_path(); - int r = mdcache->path_traverse(path, trace, true, - m, new C_MDS_RetryMessage(mds,m), - MDS_TRAVERSE_DISCOVER); - assert(r>0); - } - if (1) { - // NAK - MLock *reply = new MLock(LOCK_AC_LOCKNAK, mds->get_nodeid()); - reply->set_dn(m->get_ino(), dname); - mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER); - } - } else { - dout(7) << "safely ignoring." << endl; - delete m; - } - return; - } - - assert(dn); - } - - if (dn) { - dout(7) << "handle_lock_dn a=" << m->get_action() << " from " << from << " " << *dn << endl; - } else { - dout(7) << "handle_lock_dn a=" << m->get_action() << " from " << from << " " << dname << " in " << *dir << endl; - } - - switch (m->get_action()) { - // -- replica -- - case LOCK_AC_LOCK: - assert(dn->lockstate == DN_LOCK_SYNC || - dn->lockstate == DN_LOCK_UNPINNING || - dn->lockstate == DN_LOCK_XLOCK); // <-- bc the handle_lock_dn did the discover! - - if (dn->is_pinned()) { - dn->lockstate = DN_LOCK_UNPINNING; - - // wait - dout(7) << "dn pinned, waiting " << *dn << endl; - dn->dir->add_waiter(CDIR_WAIT_DNUNPINNED, - dn->name, - new C_MDS_RetryMessage(mds, m)); - return; - } else { - dn->lockstate = DN_LOCK_XLOCK; - dn->xlockedby = 0; - - // ack now - MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid()); - reply->set_dn(diri->ino(), dname); - mds->send_message_mds(reply, from, MDS_PORT_LOCKER); - } - - // wake up waiters - dir->finish_waiting(CDIR_WAIT_DNLOCK, dname); // ? will this happen on replica ? 
- break; - - case LOCK_AC_SYNC: - assert(dn->lockstate == DN_LOCK_XLOCK); - dn->lockstate = DN_LOCK_SYNC; - dn->xlockedby = 0; - - // null? hose it. - if (dn->is_null()) { - dout(7) << "hosing null (and now sync) dentry " << *dn << endl; - dir->remove_dentry(dn); - } - - // wake up waiters - dir->finish_waiting(CDIR_WAIT_DNREAD, dname); // will this happen either? YES: if a rename lock backs out - break; - - case LOCK_AC_REQXLOCKACK: - case LOCK_AC_REQXLOCKNAK: - { - dout(10) << "handle_lock_dn got ack/nak on a reqxlock for " << *dn << endl; - list finished; - dir->take_waiting(CDIR_WAIT_DNREQXLOCK, m->get_dn(), finished, 1); // TAKE ONE ONLY! - finish_contexts(finished, - (m->get_action() == LOCK_AC_REQXLOCKACK) ? 1:-1); - } - break; - - - // -- auth -- - case LOCK_AC_LOCKACK: - case LOCK_AC_LOCKNAK: - assert(dn->gather_set.count(from) == 1); - dn->gather_set.erase(from); - if (dn->gather_set.size() == 0) { - dout(7) << "handle_lock_dn finish gather, now xlock on " << *dn << endl; - dn->lockstate = DN_LOCK_XLOCK; - mdcache->active_requests[dn->xlockedby].xlocks.insert(dn); - dir->finish_waiting(CDIR_WAIT_DNLOCK, dname); - } - break; - - - case LOCK_AC_REQXLOCKC: - // make sure it's a _file_, if it exists. - if (dn && dn->inode && dn->inode->is_dir()) { - dout(7) << "handle_lock_dn failing, reqxlockc on dir " << *dn->inode << endl; - - // nak - string path; - dn->make_path(path); - - MLock *reply = new MLock(LOCK_AC_REQXLOCKNAK, mds->get_nodeid()); - reply->set_dn(dir->ino(), dname); - reply->set_path(path); - mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER); - - // done - if (mdcache->active_requests.count(m)) - mdcache->request_finish(m); - else - delete m; - return; - } - - case LOCK_AC_REQXLOCK: - if (dn) { - dout(7) << "handle_lock_dn reqxlock on " << *dn << endl; - } else { - dout(7) << "handle_lock_dn reqxlock on " << dname << " in " << *dir << endl; - } - - - // start request? 
- if (!mdcache->active_requests.count(m)) { - vector trace; - if (!mdcache->request_start(m, dir->inode, trace)) - return; // waiting for pin - } - - // try to xlock! - if (!dn) { - assert(m->get_action() == LOCK_AC_REQXLOCKC); - dn = dir->add_dentry(dname); - } - - if (dn->xlockedby != m) { - if (!dentry_xlock_start(dn, m, dir->inode)) { - // hose null dn if we're waiting on something - if (dn->is_clean() && dn->is_null() && dn->is_sync()) dir->remove_dentry(dn); - return; // waiting for xlock - } - } else { - // successfully xlocked! on behalf of requestor. - string path; - dn->make_path(path); - - dout(7) << "handle_lock_dn reqxlock success for " << m->get_asker() << " on " << *dn << ", acking" << endl; - - // ACK xlock request - MLock *reply = new MLock(LOCK_AC_REQXLOCKACK, mds->get_nodeid()); - reply->set_dn(dir->ino(), dname); - reply->set_path(path); - mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER); - - // note: keep request around in memory (to hold the xlock/pins on behalf of requester) - return; - } - break; - - case LOCK_AC_UNXLOCK: - dout(7) << "handle_lock_dn unxlock on " << *dn << endl; - { - string dname = dn->name; - Message *m = dn->xlockedby; - - // finish request - mdcache->request_finish(m); // this will drop the locks (and unpin paths!) - return; - } - break; - - default: - assert(0); - } - - delete m; -} - - - - - - - diff --git a/tags/20070517_before_mds_merge/mds/Locker.h b/tags/20070517_before_mds_merge/mds/Locker.h deleted file mode 100644 index d8dcb2c541a37..0000000000000 --- a/tags/20070517_before_mds_merge/mds/Locker.h +++ /dev/null @@ -1,127 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. 
See file COPYING. - * - */ - -#ifndef __MDS_LOCKER_H -#define __MDS_LOCKER_H - -#include "include/types.h" - -#include -#include -#include -using std::map; -using std::list; -using std::set; - -class MDS; -class CDir; -class CInode; -class CDentry; - -class Message; - -class MDiscover; -class MDiscoverReply; -class MCacheExpire; -class MDirUpdate; -class MDentryUnlink; -class MLock; - -class MClientRequest; - - -class Anchor; -class Capability; - - -class Locker { -private: - MDS *mds; - MDCache *mdcache; - - public: - Locker(MDS *m, MDCache *c) : mds(m), mdcache(c) {} - - void dispatch(Message *m); - - void send_lock_message(CInode *in, int msg, int type); - void send_lock_message(CInode *in, int msg, int type, bufferlist& data); - void send_lock_message(CDentry *dn, int msg); - - // -- locks -- - // high level interface - public: - bool inode_hard_read_try(CInode *in, Context *con); - bool inode_hard_read_start(CInode *in, MClientRequest *m); - void inode_hard_read_finish(CInode *in); - bool inode_hard_write_start(CInode *in, MClientRequest *m); - void inode_hard_write_finish(CInode *in); - bool inode_file_read_start(CInode *in, MClientRequest *m); - void inode_file_read_finish(CInode *in); - bool inode_file_write_start(CInode *in, MClientRequest *m); - void inode_file_write_finish(CInode *in); - - void inode_hard_eval(CInode *in); - void inode_file_eval(CInode *in); - - protected: - void inode_hard_mode(CInode *in, int mode); - void inode_file_mode(CInode *in, int mode); - - // low level triggers - void inode_hard_sync(CInode *in); - void inode_hard_lock(CInode *in); - bool inode_file_sync(CInode *in); - void inode_file_lock(CInode *in); - void inode_file_mixed(CInode *in); - void inode_file_loner(CInode *in); - - // messengers - void handle_lock(MLock *m); - void handle_lock_inode_hard(MLock *m); - void handle_lock_inode_file(MLock *m); - - // -- file i/o -- - public: - version_t issue_file_data_version(CInode *in); - Capability* issue_new_caps(CInode *in, int 
mode, MClientRequest *req); - bool issue_caps(CInode *in); - - protected: - void handle_client_file_caps(class MClientFileCaps *m); - - void request_inode_file_caps(CInode *in); - void handle_inode_file_caps(class MInodeFileCaps *m); - - - // dirs - void handle_lock_dir(MLock *m); - - // dentry locks - public: - bool dentry_xlock_start(CDentry *dn, - Message *m, CInode *ref); - void dentry_xlock_finish(CDentry *dn, bool quiet=false); - void handle_lock_dn(MLock *m); - void dentry_xlock_request(CDir *dir, string& dname, bool create, - Message *req, Context *onfinish); - void dentry_xlock_request_finish(int r, - CDir *dir, string& dname, - Message *req, - Context *finisher); - - -}; - - -#endif diff --git a/tags/20070517_before_mds_merge/mds/LogEvent.cc b/tags/20070517_before_mds_merge/mds/LogEvent.cc deleted file mode 100644 index 4a83902c5c6c4..0000000000000 --- a/tags/20070517_before_mds_merge/mds/LogEvent.cc +++ /dev/null @@ -1,67 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "LogEvent.h" - -#include "MDS.h" - -// events i know of -#include "events/EString.h" -#include "events/EImportMap.h" -#include "events/EMetaBlob.h" -#include "events/EUpdate.h" -#include "events/EUnlink.h" -#include "events/EAlloc.h" -#include "events/EPurgeFinish.h" -#include "events/EExportStart.h" -#include "events/EExportFinish.h" -#include "events/EImportStart.h" -#include "events/EImportFinish.h" - -LogEvent *LogEvent::decode(bufferlist& bl) -{ - // parse type, length - int off = 0; - int type; - bl.copy(off, sizeof(type), (char*)&type); - off += sizeof(type); - - int length = bl.length() - off; - dout(15) << "decode_log_event type " << type << ", size " << length << endl; - - assert(type > 0); - - // create event - LogEvent *le; - switch (type) { - case EVENT_STRING: le = new EString(); break; - case EVENT_IMPORTMAP: le = new EImportMap; break; - case EVENT_UPDATE: le = new EUpdate; break; - case EVENT_UNLINK: le = new EUnlink(); break; - case EVENT_PURGEFINISH: le = new EPurgeFinish(); break; - case EVENT_ALLOC: le = new EAlloc(); break; - case EVENT_EXPORTSTART: le = new EExportStart; break; - case EVENT_EXPORTFINISH: le = new EExportFinish; break; - case EVENT_IMPORTSTART: le = new EImportStart; break; - case EVENT_IMPORTFINISH: le = new EImportFinish; break; - default: - dout(1) << "uh oh, unknown log event type " << type << endl; - assert(0); - } - - // decode - le->decode_payload(bl, off); - - return le; -} - diff --git a/tags/20070517_before_mds_merge/mds/LogEvent.h b/tags/20070517_before_mds_merge/mds/LogEvent.h deleted file mode 100644 index 6895ed54074d4..0000000000000 --- a/tags/20070517_before_mds_merge/mds/LogEvent.h +++ /dev/null @@ -1,106 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General 
Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __LOGEVENT_H -#define __LOGEVENT_H - -#define EVENT_STRING 1 - -#define EVENT_INODEUPDATE 2 -#define EVENT_DIRUPDATE 3 - -#define EVENT_IMPORTMAP 4 -#define EVENT_UPDATE 5 - -#define EVENT_ALLOC 10 -#define EVENT_MKNOD 11 -#define EVENT_MKDIR 12 -#define EVENT_LINK 13 - -#define EVENT_UNLINK 20 -#define EVENT_RMDIR 21 -#define EVENT_PURGEFINISH 22 - -#define EVENT_EXPORTSTART 30 -#define EVENT_EXPORTFINISH 31 -#define EVENT_IMPORTSTART 32 -#define EVENT_IMPORTFINISH 33 - - - -#include -using namespace std; - -#include "include/buffer.h" -#include "include/Context.h" - -class MDS; - -// generic log event -class LogEvent { - private: - int _type; - off_t _start_off,_end_off; - friend class MDLog; - - public: - LogEvent(int t) : _type(t), _start_off(0), _end_off(0) { } - virtual ~LogEvent() { } - - int get_type() { return _type; } - off_t get_start_off() { return _start_off; } - off_t get_end_off() { return _end_off; } - - // encoding - virtual void encode_payload(bufferlist& bl) = 0; - virtual void decode_payload(bufferlist& bl, int& off) = 0; - static LogEvent *decode(bufferlist &bl); - - - virtual void print(ostream& out) { - out << "event(" << _type << ")"; - } - - - /*** live journal ***/ - - /* obsolete() - is this entry committed to primary store, such that - * we can expire it from the journal? - */ - virtual bool has_expired(MDS *m) { - return true; - } - - /* expire() - prod MDS into committing the relevant state so that this - * entry can be expired from the jorunal. - */ - virtual void expire(MDS *m, Context *c) { - assert(0); - c->finish(0); - delete c; - } - - - /*** recovery ***/ - /* replay() - replay given event. this is idempotent. 
- */ - virtual void replay(MDS *m) { assert(0); } - -}; - -inline ostream& operator<<(ostream& out, LogEvent& le) { - le.print(out); - return out; -} - -#endif diff --git a/tags/20070517_before_mds_merge/mds/MDBalancer.cc b/tags/20070517_before_mds_merge/mds/MDBalancer.cc deleted file mode 100644 index 57e79dcdf51fc..0000000000000 --- a/tags/20070517_before_mds_merge/mds/MDBalancer.cc +++ /dev/null @@ -1,878 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "mdstypes.h" - -#include "MDBalancer.h" -#include "MDS.h" -#include "MDSMap.h" -#include "CInode.h" -#include "CDir.h" -#include "MDCache.h" -#include "Migrator.h" - -#include "include/Context.h" -#include "msg/Messenger.h" -#include "messages/MHeartbeat.h" - -#include -#include -using namespace std; - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug_mds || l<=g_conf.debug_mds_balancer) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".bal " - -#define MIN_LOAD 50 // ?? -#define MIN_REEXPORT 5 // will automatically reexport -#define MIN_OFFLOAD 10 // point at which i stop trying, close enough - - - -int MDBalancer::proc_message(Message *m) -{ - switch (m->get_type()) { - - case MSG_MDS_HEARTBEAT: - handle_heartbeat((MHeartbeat*)m); - break; - - default: - dout(1) << " balancer unknown message " << m->get_type() << endl; - assert(0); - break; - } - - return 0; -} - - - - -void MDBalancer::tick() -{ - static int num_bal_times = g_conf.mds_bal_max; - static utime_t first = g_clock.now(); - utime_t now = g_clock.now(); - utime_t elapsed = now; - elapsed -= first; - - // balance? 
- if (true && - mds->get_nodeid() == 0 && - (num_bal_times || - (g_conf.mds_bal_max_until >= 0 && - elapsed.sec() > g_conf.mds_bal_max_until)) && - mds->is_active() && - now.sec() - last_heartbeat.sec() >= g_conf.mds_bal_interval) { - last_heartbeat = now; - send_heartbeat(); - num_bal_times--; - } - - // hash? - if (true && - g_conf.num_mds > 1 && - now.sec() - last_hash.sec() > g_conf.mds_bal_hash_interval) { - last_hash = now; - do_hashing(); - } -} - - - - -class C_Bal_SendHeartbeat : public Context { -public: - MDS *mds; - C_Bal_SendHeartbeat(MDS *mds) { - this->mds = mds; - } - virtual void finish(int f) { - mds->balancer->send_heartbeat(); - } -}; - -mds_load_t MDBalancer::get_load() -{ - mds_load_t load; - if (mds->mdcache->get_root()) - load.root = - mds->mdcache->get_root()->popularity[MDS_POP_ANYDOM]; - // + - // mds->mdcache->get_root()->popularity[MDS_POP_NESTED]; - - load.req_rate = mds->get_req_rate(); - load.queue_len = mds->messenger->get_dispatch_queue_len(); - return load; -} - -void MDBalancer::send_heartbeat() -{ - if (!mds->mdcache->get_root()) { - dout(5) << "no root on send_heartbeat" << endl; - mds->mdcache->open_root(new C_Bal_SendHeartbeat(mds)); - return; - } - - mds_load.clear(); - if (mds->get_nodeid() == 0) - beat_epoch++; - - // load - mds_load_t load = get_load(); - mds_load[ mds->get_nodeid() ] = load; - - // import_map - map import_map; - - for (set::iterator it = mds->mdcache->imports.begin(); - it != mds->mdcache->imports.end(); - it++) { - CDir *im = *it; - if (im->inode->is_root()) continue; - int from = im->inode->authority(); - import_map[from] += im->popularity[MDS_POP_CURDOM].meta_load(); - } - mds_import_map[ mds->get_nodeid() ] = import_map; - - - dout(5) << "mds" << mds->get_nodeid() << " sending heartbeat " << beat_epoch << " " << load << endl; - for (map::iterator it = import_map.begin(); - it != import_map.end(); - it++) { - dout(5) << " import_map from " << it->first << " -> " << it->second << endl; - } - - - set 
up; - mds->get_mds_map()->get_up_mds_set(up); - for (set::iterator p = up.begin(); p != up.end(); ++p) { - if (*p == mds->get_nodeid()) continue; - MHeartbeat *hb = new MHeartbeat(load, beat_epoch); - hb->get_import_map() = import_map; - mds->messenger->send_message(hb, - mds->mdsmap->get_inst(*p), - MDS_PORT_BALANCER, MDS_PORT_BALANCER); - } -} - -void MDBalancer::handle_heartbeat(MHeartbeat *m) -{ - dout(25) << "=== got heartbeat " << m->get_beat() << " from " << m->get_source().num() << " " << m->get_load() << endl; - - if (!mds->mdcache->get_root()) { - dout(10) << "no root on handle" << endl; - mds->mdcache->open_root(new C_MDS_RetryMessage(mds, m)); - return; - } - - int who = m->get_source().num(); - - if (who == 0) { - dout(20) << " from mds0, new epoch" << endl; - beat_epoch = m->get_beat(); - send_heartbeat(); - - show_imports(); - } - - mds_load[ who ] = m->get_load(); - mds_import_map[ who ] = m->get_import_map(); - - //cout << " load is " << load << " have " << mds_load.size() << endl; - - unsigned cluster_size = mds->get_mds_map()->get_num_mds(); - if (mds_load.size() == cluster_size) { - // let's go! - //export_empties(); // no! 
- do_rebalance(m->get_beat()); - } - - // done - delete m; -} - - -void MDBalancer::export_empties() -{ - dout(5) << "export_empties checking for empty imports" << endl; - - for (set::iterator it = mds->mdcache->imports.begin(); - it != mds->mdcache->imports.end(); - it++) { - CDir *dir = *it; - - if (!dir->inode->is_root() && dir->get_size() == 0) - mds->mdcache->migrator->export_empty_import(dir); - } -} - - - -double MDBalancer::try_match(int ex, double& maxex, - int im, double& maxim) -{ - if (maxex <= 0 || maxim <= 0) return 0.0; - - double howmuch = MIN(maxex, maxim); - if (howmuch <= 0) return 0.0; - - dout(5) << " - mds" << ex << " exports " << howmuch << " to mds" << im << endl; - - if (ex == mds->get_nodeid()) - my_targets[im] += howmuch; - - exported[ex] += howmuch; - imported[im] += howmuch; - - maxex -= howmuch; - maxim -= howmuch; - - return howmuch; -} - - - -void MDBalancer::do_hashing() -{ - if (hash_queue.empty()) { - dout(20) << "do_hashing has nothing to do" << endl; - return; - } - - dout(0) << "do_hashing " << hash_queue.size() << " dirs marked for possible hashing" << endl; - - for (set::iterator i = hash_queue.begin(); - i != hash_queue.end(); - i++) { - inodeno_t dirino = *i; - CInode *in = mds->mdcache->get_inode(dirino); - if (!in) continue; - CDir *dir = in->dir; - if (!dir) continue; - if (!dir->is_auth()) continue; - - dout(0) << "do_hashing hashing " << *dir << endl; - mds->mdcache->migrator->hash_dir(dir); - } - hash_queue.clear(); -} - - - -void MDBalancer::do_rebalance(int beat) -{ - int cluster_size = mds->get_mds_map()->get_num_mds(); - int whoami = mds->get_nodeid(); - - // reset - my_targets.clear(); - imported.clear(); - exported.clear(); - - dout(5) << " do_rebalance: cluster loads are" << endl; - - // rescale! 
turn my mds_load back into meta_load units - double load_fac = 1.0; - if (mds_load[whoami].mds_load() > 0) { - load_fac = mds_load[whoami].root.meta_load() / mds_load[whoami].mds_load(); - dout(7) << " load_fac is " << load_fac - << " <- " << mds_load[whoami].root.meta_load() << " / " << mds_load[whoami].mds_load() - << endl; - } - - double total_load = 0; - multimap load_map; - for (int i=0; i " << l << endl; - - if (whoami == i) my_load = l; - total_load += l; - - load_map.insert(pair( l, i )); - } - - // target load - target_load = total_load / (double)cluster_size; - dout(5) << "do_rebalance: my load " << my_load - << " target " << target_load - << " total " << total_load - << endl; - - // under or over? - if (my_load < target_load) { - dout(5) << " i am underloaded, doing nothing." << endl; - show_imports(); - return; - } - - dout(5) << " i am overloaded" << endl; - - - // first separate exporters and importers - multimap importers; - multimap exporters; - set importer_set; - set exporter_set; - - for (multimap::iterator it = load_map.begin(); - it != load_map.end(); - it++) { - if (it->first < target_load) { - dout(15) << " mds" << it->second << " is importer" << endl; - importers.insert(pair(it->first,it->second)); - importer_set.insert(it->second); - } else { - dout(15) << " mds" << it->second << " is exporter" << endl; - exporters.insert(pair(it->first,it->second)); - exporter_set.insert(it->second); - } - } - - - // determine load transfer mapping - - if (true) { - // analyze import_map; do any matches i can - - dout(5) << " matching exporters to import sources" << endl; - - // big -> small exporters - for (multimap::reverse_iterator ex = exporters.rbegin(); - ex != exporters.rend(); - ex++) { - double maxex = get_maxex(ex->second); - if (maxex <= .001) continue; - - // check importers. for now, just in arbitrary order (no intelligent matching). 
- for (map::iterator im = mds_import_map[ex->second].begin(); - im != mds_import_map[ex->second].end(); - im++) { - double maxim = get_maxim(im->first); - if (maxim <= .001) continue; - try_match(ex->second, maxex, - im->first, maxim); - if (maxex <= .001) break;; - } - } - } - - - if (1) { - if (beat % 2 == 1) { - // old way - dout(5) << " matching big exporters to big importers" << endl; - // big exporters to big importers - multimap::reverse_iterator ex = exporters.rbegin(); - multimap::iterator im = importers.begin(); - while (ex != exporters.rend() && - im != importers.end()) { - double maxex = get_maxex(ex->second); - double maxim = get_maxim(im->second); - if (maxex < .001 || maxim < .001) break; - try_match(ex->second, maxex, - im->second, maxim); - if (maxex <= .001) ex++; - if (maxim <= .001) im++; - } - } else { - // new way - dout(5) << " matching small exporters to big importers" << endl; - // small exporters to big importers - multimap::iterator ex = exporters.begin(); - multimap::iterator im = importers.begin(); - while (ex != exporters.end() && - im != importers.end()) { - double maxex = get_maxex(ex->second); - double maxim = get_maxim(im->second); - if (maxex < .001 || maxim < .001) break; - try_match(ex->second, maxex, - im->second, maxim); - if (maxex <= .001) ex++; - if (maxim <= .001) im++; - } - } - } - - - - // make a sorted list of my imports - map import_pop_map; - multimap import_from_map; - for (set::iterator it = mds->mdcache->imports.begin(); - it != mds->mdcache->imports.end(); - it++) { - if ((*it)->is_hashed()) continue; - double pop = (*it)->popularity[MDS_POP_CURDOM].meta_load(); - if (pop < g_conf.mds_bal_idle_threshold && - (*it)->inode != mds->mdcache->get_root()) { - dout(-5) << " exporting idle import " << **it - << " back to mds" << (*it)->inode->authority() - << endl; - mds->mdcache->migrator->export_dir(*it, (*it)->inode->authority()); - continue; - } - import_pop_map[ pop ] = *it; - int from = (*it)->inode->authority(); - 
dout(15) << " map: i imported " << **it << " from " << from << endl; - import_from_map.insert(pair(from, *it)); - } - - - - // do my exports! - set already_exporting; - double total_sent = 0; - double total_goal = 0; - - for (map::iterator it = my_targets.begin(); - it != my_targets.end(); - it++) { - - /* - double fac = 1.0; - if (false && total_goal > 0 && total_sent > 0) { - fac = total_goal / total_sent; - dout(-5) << " total sent is " << total_sent << " / " << total_goal << " -> fac 1/ " << fac << endl; - if (fac > 1.0) fac = 1.0; - } - fac = .9 - .4 * ((float)g_conf.num_mds / 128.0); // hack magic fixme - */ - - int target = (*it).first; - double amount = (*it).second;// * load_fac; - total_goal += amount; - - if (amount < MIN_OFFLOAD) continue; - - dout(-5) << " sending " << amount << " to mds" << target - //<< " .. " << (*it).second << " * " << load_fac - << " -> " << amount - << endl;//" .. fudge is " << fudge << endl; - double have = 0; - - show_imports(); - - // search imports from target - if (import_from_map.count(target)) { - dout(5) << " aha, looking through imports from target mds" << target << endl; - pair::iterator, multimap::iterator> p = - import_from_map.equal_range(target); - while (p.first != p.second) { - CDir *dir = (*p.first).second; - dout(5) << "considering " << *dir << " from " << (*p.first).first << endl; - multimap::iterator plast = p.first++; - - if (dir->inode->is_root()) continue; - if (dir->is_hashed()) continue; - if (dir->is_freezing() || dir->is_frozen()) continue; // export pbly already in progress - double pop = dir->popularity[MDS_POP_CURDOM].meta_load(); - assert(dir->inode->authority() == target); // cuz that's how i put it in the map, dummy - - if (pop <= amount-have) { - dout(-5) << "reexporting " << *dir - << " pop " << pop - << " back to mds" << target << endl; - mds->mdcache->migrator->export_dir(dir, target); - have += pop; - import_from_map.erase(plast); - import_pop_map.erase(pop); - } else { - dout(5) << "can't 
reexport " << *dir << ", too big " << pop << endl; - } - if (amount-have < MIN_OFFLOAD) break; - } - } - if (amount-have < MIN_OFFLOAD) { - total_sent += have; - continue; - } - - // any other imports - if (false) - for (map::iterator import = import_pop_map.begin(); - import != import_pop_map.end(); - import++) { - CDir *imp = (*import).second; - if (imp->inode->is_root()) continue; - - double pop = (*import).first; - if (pop < amount-have || pop < MIN_REEXPORT) { - dout(-5) << "reexporting " << *imp - << " pop " << pop - << " back to mds" << imp->inode->authority() - << endl; - have += pop; - mds->mdcache->migrator->export_dir(imp, imp->inode->authority()); - } - if (amount-have < MIN_OFFLOAD) break; - } - if (amount-have < MIN_OFFLOAD) { - //fudge = amount-have; - total_sent += have; - continue; - } - - // okay, search for fragments of my workload - set candidates = mds->mdcache->imports; - - list exports; - - for (set::iterator pot = candidates.begin(); - pot != candidates.end(); - pot++) { - find_exports(*pot, amount, exports, have, already_exporting); - if (have > amount-MIN_OFFLOAD) { - break; - } - } - //fudge = amount - have; - total_sent += have; - - for (list::iterator it = exports.begin(); it != exports.end(); it++) { - dout(-5) << " exporting to mds" << target - << " fragment " << **it - << " pop " << (*it)->popularity[MDS_POP_CURDOM].meta_load() - << endl; - mds->mdcache->migrator->export_dir(*it, target); - - // hack! only do one dir. - break; - } - } - - dout(5) << "rebalance done" << endl; - show_imports(); - -} - - - -void MDBalancer::find_exports(CDir *dir, - double amount, - list& exports, - double& have, - set& already_exporting) -{ - double need = amount - have; - if (need < amount * g_conf.mds_bal_min_start) - return; // good enough! 
- double needmax = need * g_conf.mds_bal_need_max; - double needmin = need * g_conf.mds_bal_need_min; - double midchunk = need * g_conf.mds_bal_midchunk; - double minchunk = need * g_conf.mds_bal_minchunk; - - list bigger; - multimap smaller; - - double dir_pop = dir->popularity[MDS_POP_CURDOM].meta_load(); - double dir_sum = 0; - dout(-7) << " find_exports in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << endl; - - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CInode *in = it->second->get_inode(); - if (!in) continue; - if (!in->is_dir()) continue; - if (!in->dir) continue; // clearly not popular - - if (in->dir->is_export()) continue; - if (in->dir->is_hashed()) continue; - if (already_exporting.count(in->dir)) continue; - - if (in->dir->is_frozen()) continue; // can't export this right now! - //if (in->dir->get_size() == 0) continue; // don't export empty dirs, even if they're not complete. for now! - - // how popular? - double pop = in->dir->popularity[MDS_POP_CURDOM].meta_load(); - dir_sum += pop; - dout(20) << " pop " << pop << " " << *in->dir << endl; - - if (pop < minchunk) continue; - - // lucky find? - if (pop > needmin && pop < needmax) { - exports.push_back(in->dir); - have += pop; - return; - } - - if (pop > need) - bigger.push_back(in->dir); - else - smaller.insert(pair(pop, in->dir)); - } - dout(7) << " .. 
sum " << dir_sum << " / " << dir_pop << endl; - - // grab some sufficiently big small items - multimap::reverse_iterator it; - for (it = smaller.rbegin(); - it != smaller.rend(); - it++) { - - if ((*it).first < midchunk) - break; // try later - - dout(7) << " taking smaller " << *(*it).second << endl; - - exports.push_back((*it).second); - already_exporting.insert((*it).second); - have += (*it).first; - if (have > needmin) - return; - } - - // apprently not enough; drill deeper into the hierarchy (if non-replicated) - for (list::iterator it = bigger.begin(); - it != bigger.end(); - it++) { - if ((*it)->is_rep()) continue; - dout(7) << " descending into " << **it << endl; - find_exports(*it, amount, exports, have, already_exporting); - if (have > needmin) - return; - } - - // ok fine, use smaller bits - for (; - it != smaller.rend(); - it++) { - - dout(7) << " taking (much) smaller " << it->first << " " << *(*it).second << endl; - - exports.push_back((*it).second); - already_exporting.insert((*it).second); - have += (*it).first; - if (have > needmin) - return; - } - - // ok fine, drill inot replicated dirs - for (list::iterator it = bigger.begin(); - it != bigger.end(); - it++) { - if (!(*it)->is_rep()) continue; - dout(7) << " descending into replicated " << **it << endl; - find_exports(*it, amount, exports, have, already_exporting); - if (have > needmin) - return; - } - -} - - - - -void MDBalancer::hit_inode(CInode *in, int type) -{ - // hit me - in->popularity[MDS_POP_JUSTME].pop[type].hit(); - in->popularity[MDS_POP_NESTED].pop[type].hit(); - if (in->is_auth()) { - in->popularity[MDS_POP_CURDOM].pop[type].hit(); - in->popularity[MDS_POP_ANYDOM].pop[type].hit(); - } - - // hit auth up to import - CDir *dir = in->get_parent_dir(); - if (dir) hit_dir(dir, type); -} - - -void MDBalancer::hit_dir(CDir *dir, int type) -{ - // hit me - float v = dir->popularity[MDS_POP_JUSTME].pop[type].hit(); - - // hit modify counter, if this was a modify - if (g_conf.num_mds > 2 && 
// FIXME >2 thing - !dir->inode->is_root() && // not root (for now at least) - dir->is_auth()) { - //dout(-20) << "hit_dir " << type << " pop is " << v << " " << *dir << endl; - - // hash this dir? (later?) - if (((v > g_conf.mds_bal_hash_rd && type == META_POP_IRD) || - //(v > g_conf.mds_bal_hash_wr && type == META_POP_IWR) || - (v > g_conf.mds_bal_hash_wr && type == META_POP_DWR)) && - !(dir->is_hashed() || dir->is_hashing()) && - hash_queue.count(dir->ino()) == 0) { - dout(0) << "hit_dir " << type << " pop is " << v << ", putting in hash_queue: " << *dir << endl; - hash_queue.insert(dir->ino()); - } - - } - - hit_recursive(dir, type); -} - - - -void MDBalancer::hit_recursive(CDir *dir, int type) -{ - bool anydom = dir->is_auth(); - bool curdom = dir->is_auth(); - - float rd_adj = 0.0; - - // replicate? - float dir_pop = dir->popularity[MDS_POP_CURDOM].pop[type].get(); // hmm?? - - if (dir->is_auth()) { - if (!dir->is_rep() && - dir_pop >= g_conf.mds_bal_replicate_threshold) { - // replicate - float rdp = dir->popularity[MDS_POP_JUSTME].pop[META_POP_IRD].get(); - rd_adj = rdp / mds->get_mds_map()->get_num_mds() - rdp; - rd_adj /= 2.0; // temper somewhat - - dout(1) << "replicating dir " << *dir << " pop " << dir_pop << " .. 
rdp " << rdp << " adj " << rd_adj << endl; - - dir->dir_rep = CDIR_REP_ALL; - mds->mdcache->send_dir_updates(dir, true); - - dir->popularity[MDS_POP_JUSTME].pop[META_POP_IRD].adjust(rd_adj); - dir->popularity[MDS_POP_CURDOM].pop[META_POP_IRD].adjust(rd_adj); - } - - if (!dir->ino() != 1 && - dir->is_rep() && - dir_pop < g_conf.mds_bal_unreplicate_threshold) { - // unreplicate - dout(1) << "unreplicating dir " << *dir << " pop " << dir_pop << endl; - - dir->dir_rep = CDIR_REP_NONE; - mds->mdcache->send_dir_updates(dir); - } - } - - - while (dir) { - CInode *in = dir->inode; - - dir->popularity[MDS_POP_NESTED].pop[type].hit(); - in->popularity[MDS_POP_NESTED].pop[type].hit(); - - if (rd_adj != 0.0) dir->popularity[MDS_POP_NESTED].pop[META_POP_IRD].adjust(rd_adj); - - if (anydom) { - dir->popularity[MDS_POP_ANYDOM].pop[type].hit(); - in->popularity[MDS_POP_ANYDOM].pop[type].hit(); - } - - if (curdom) { - dir->popularity[MDS_POP_CURDOM].pop[type].hit(); - in->popularity[MDS_POP_CURDOM].pop[type].hit(); - } - - if (dir->is_import()) - curdom = false; // end of auth domain, stop hitting auth counters. 
- dir = dir->inode->get_parent_dir(); - } -} - - -/* - * subtract off an exported chunk - */ -void MDBalancer::subtract_export(CDir *dir) -{ - meta_load_t curdom = dir->popularity[MDS_POP_CURDOM]; - - bool in_domain = !dir->is_import(); - - while (true) { - CInode *in = dir->inode; - - in->popularity[MDS_POP_ANYDOM] -= curdom; - if (in_domain) in->popularity[MDS_POP_CURDOM] -= curdom; - - dir = in->get_parent_dir(); - if (!dir) break; - - if (dir->is_import()) in_domain = false; - - dir->popularity[MDS_POP_ANYDOM] -= curdom; - if (in_domain) dir->popularity[MDS_POP_CURDOM] -= curdom; - } -} - - -void MDBalancer::add_import(CDir *dir) -{ - meta_load_t curdom = dir->popularity[MDS_POP_CURDOM]; - - bool in_domain = !dir->is_import(); - - while (true) { - CInode *in = dir->inode; - - in->popularity[MDS_POP_ANYDOM] += curdom; - if (in_domain) in->popularity[MDS_POP_CURDOM] += curdom; - - dir = in->get_parent_dir(); - if (!dir) break; - - if (dir->is_import()) in_domain = false; - - dir->popularity[MDS_POP_ANYDOM] += curdom; - if (in_domain) dir->popularity[MDS_POP_CURDOM] += curdom; - } - -} - - - - - - -void MDBalancer::show_imports(bool external) -{ - mds->mdcache->show_imports(); -} - - - -/* replicate? 
- - float dir_pop = dir->get_popularity(); - - if (dir->is_auth()) { - if (!dir->is_rep() && - dir_pop >= g_conf.mds_bal_replicate_threshold) { - // replicate - dout(5) << "replicating dir " << *in << " pop " << dir_pop << endl; - - dir->dir_rep = CDIR_REP_ALL; - mds->mdcache->send_dir_updates(dir); - } - - if (dir->is_rep() && - dir_pop < g_conf.mds_bal_unreplicate_threshold) { - // unreplicate - dout(5) << "unreplicating dir " << *in << " pop " << dir_pop << endl; - - dir->dir_rep = CDIR_REP_NONE; - mds->mdcache->send_dir_updates(dir); - } - } - -*/ diff --git a/tags/20070517_before_mds_merge/mds/MDBalancer.h b/tags/20070517_before_mds_merge/mds/MDBalancer.h deleted file mode 100644 index d84d6439dbccc..0000000000000 --- a/tags/20070517_before_mds_merge/mds/MDBalancer.h +++ /dev/null @@ -1,109 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __MDBALANCER_H -#define __MDBALANCER_H - -#include -#include -using namespace std; - -#include -#include -using namespace __gnu_cxx; - -#include "include/types.h" -#include "common/Clock.h" -#include "CInode.h" - - -class MDS; -class Message; -class MHeartbeat; -class CInode; -class Context; -class CDir; - -class MDBalancer { - protected: - MDS *mds; - int beat_epoch; - - utime_t last_heartbeat; - utime_t last_hash; - - // todo - set hash_queue; - - // per-epoch scatter/gathered info - hash_map mds_load; - hash_map mds_meta_load; - map > mds_import_map; - - // per-epoch state - double my_load, target_load; - map my_targets; - map imported; - map exported; - - double try_match(int ex, double& maxex, - int im, double& maxim); - double get_maxim(int im) { - return target_load - mds_meta_load[im] - imported[im]; - } - double get_maxex(int ex) { - return mds_meta_load[ex] - target_load - exported[ex]; - } - - public: - MDBalancer(MDS *m) : - mds(m), - beat_epoch(0) { } - - mds_load_t get_load(); - - int proc_message(Message *m); - - void send_heartbeat(); - void handle_heartbeat(MHeartbeat *m); - - void tick(); - - void do_hashing(); - - void export_empties(); - void do_rebalance(int beat); - void find_exports(CDir *dir, - double amount, - list& exports, - double& have, - set& already_exporting); - - - void subtract_export(class CDir *ex); - void add_import(class CDir *im); - - void hit_inode(class CInode *in, int type=0); - void hit_dir(class CDir *dir, int type=0); - void hit_recursive(class CDir *dir, int type=0); - - - void show_imports(bool external=false); - -}; - - - -#endif diff --git a/tags/20070517_before_mds_merge/mds/MDCache.cc b/tags/20070517_before_mds_merge/mds/MDCache.cc deleted file mode 100644 index eb8ad591d6a35..0000000000000 --- a/tags/20070517_before_mds_merge/mds/MDCache.cc +++ /dev/null @@ -1,3536 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system 
- * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "MDCache.h" -#include "MDStore.h" -#include "MDS.h" -#include "Server.h" -#include "Locker.h" -#include "MDLog.h" -#include "MDBalancer.h" -#include "AnchorClient.h" -#include "Migrator.h" -#include "Renamer.h" - -#include "MDSMap.h" - -#include "CInode.h" -#include "CDir.h" - -#include "include/filepath.h" - -#include "msg/Message.h" -#include "msg/Messenger.h" - -#include "common/Logger.h" - -#include "osdc/Filer.h" - -#include "events/EImportMap.h" -#include "events/EString.h" -#include "events/EUnlink.h" -#include "events/EPurgeFinish.h" - -#include "messages/MGenericMessage.h" - -#include "messages/MMDSImportMap.h" -#include "messages/MMDSCacheRejoin.h" -#include "messages/MMDSCacheRejoinAck.h" - -#include "messages/MDiscover.h" -#include "messages/MDiscoverReply.h" - -//#include "messages/MInodeUpdate.h" -#include "messages/MDirUpdate.h" -#include "messages/MCacheExpire.h" - -#include "messages/MInodeFileCaps.h" - -#include "messages/MInodeLink.h" -#include "messages/MInodeLinkAck.h" -#include "messages/MInodeUnlink.h" -#include "messages/MInodeUnlinkAck.h" - -#include "messages/MLock.h" -#include "messages/MDentryUnlink.h" - -#include "messages/MClientRequest.h" -#include "messages/MClientFileCaps.h" - -#include "IdAllocator.h" - -#include "common/Timer.h" - -#include -#include -#include -#include -#include -using namespace std; - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".cache " - - - - -MDCache::MDCache(MDS *m) -{ - mds = m; - migrator = new Migrator(mds, this); - renamer = new Renamer(mds, this); - root = NULL; - lru.lru_set_max(g_conf.mds_cache_size); - 
lru.lru_set_midpoint(g_conf.mds_cache_mid); - - did_shutdown_exports = false; - did_shutdown_log_cap = false; - shutdown_commits = 0; -} - -MDCache::~MDCache() -{ - delete migrator; - delete renamer; -} - - - -void MDCache::log_stat(Logger *logger) -{ - if (get_root()) { - logger->set("popanyd", (int)get_root()->popularity[MDS_POP_ANYDOM].meta_load()); - logger->set("popnest", (int)get_root()->popularity[MDS_POP_NESTED].meta_load()); - } - logger->set("c", lru.lru_get_size()); - logger->set("cpin", lru.lru_get_num_pinned()); - logger->set("ctop", lru.lru_get_top()); - logger->set("cbot", lru.lru_get_bot()); - logger->set("cptail", lru.lru_get_pintail()); -} - - -// - -bool MDCache::shutdown() -{ - if (lru.lru_get_size() > 0) { - dout(7) << "WARNING: mdcache shutodwn with non-empty cache" << endl; - //show_cache(); - show_imports(); - //dump(); - } - return true; -} - - -// MDCache - -CInode *MDCache::create_inode() -{ - CInode *in = new CInode(this); - - // zero - memset(&in->inode, 0, sizeof(inode_t)); - - // assign ino - in->inode.ino = mds->idalloc->alloc_id(); - - in->inode.nlink = 1; // FIXME - - in->inode.layout = g_OSD_FileLayout; - - add_inode(in); // add - return in; -} - -void MDCache::destroy_inode(CInode *in) -{ - mds->idalloc->reclaim_id(in->ino()); - remove_inode(in); -} - - -void MDCache::add_inode(CInode *in) -{ - // add to lru, inode map - assert(inode_map.count(in->ino()) == 0); // should be no dup inos! - inode_map[ in->ino() ] = in; -} - -void MDCache::remove_inode(CInode *o) -{ - dout(14) << "remove_inode " << *o << endl; - if (o->get_parent_dn()) { - // FIXME: multiple parents? 
- CDentry *dn = o->get_parent_dn(); - assert(!dn->is_dirty()); - if (dn->is_sync()) - dn->dir->remove_dentry(dn); // unlink inode AND hose dentry - else - dn->dir->unlink_inode(dn); // leave dentry - } - inode_map.erase(o->ino()); // remove from map -} - - -/* - * take note of where we write import_maps in the log, as we need - * to take care not to expire them until an updated map is safely flushed. - */ -class C_MDS_WroteImportMap : public Context { - MDLog *mdlog; - off_t end_off; -public: - C_MDS_WroteImportMap(MDLog *ml, off_t eo) : mdlog(ml), end_off(eo) { } - void finish(int r) { - // cout << "WroteImportMap at " << end_off << endl; - if (r >= 0) - mdlog->last_import_map = end_off; - mdlog->writing_import_map = false; - } -}; - - - -void MDCache::log_import_map(Context *onsync) -{ - dout(10) << "log_import_map " << imports.size() << " imports, " - << exports.size() << " exports" << endl; - - EImportMap *le = new EImportMap; - - // include import/export inodes, - // and a spanning tree to tie it to the root of the fs - for (set::iterator p = imports.begin(); - p != imports.end(); - p++) { - CDir *im = *p; - le->imports.insert(im->ino()); - le->metablob.add_dir_context(im, true); - le->metablob.add_dir(im, false); - - if (nested_exports.count(im)) { - for (set::iterator q = nested_exports[im].begin(); - q != nested_exports[im].end(); - ++q) { - CDir *ex = *q; - le->nested_exports[im->ino()].insert(ex->ino()); - le->exports.insert(ex->ino()); - le->metablob.add_dir_context(ex); - le->metablob.add_dir(ex, false); - } - } - } - - mds->mdlog->writing_import_map = true; - mds->mdlog->submit_entry(le); - mds->mdlog->wait_for_sync(new C_MDS_WroteImportMap(mds->mdlog, mds->mdlog->get_write_pos())); - if (onsync) - mds->mdlog->wait_for_sync(onsync); -} - - - - - -// ===================== -// recovery stuff - -void MDCache::send_pending_import_maps() -{ - if (wants_import_map.empty()) - return; // nothing to send. - - // only if it's appropriate! 
- if (migrator->is_exporting()) { - dout(7) << "send_pending_import_maps waiting, exports still in progress" << endl; - return; // not now - } - - // ok, send them. - for (set::iterator p = wants_import_map.begin(); - p != wants_import_map.end(); - p++) - send_import_map_now(*p); - wants_import_map.clear(); -} - -void MDCache::send_import_map(int who) -{ - if (migrator->is_exporting()) - send_import_map_later(who); - else - send_import_map_now(who); -} - -void MDCache::send_import_map_now(int who) -{ - dout(10) << "send_import_map to mds" << who << endl; - - MMDSImportMap *m = new MMDSImportMap; - - // known - for (set::iterator p = imports.begin(); - p != imports.end(); - p++) { - CDir *im = *p; - - if (migrator->is_importing(im->ino())) { - // ambiguous (mid-import) - m->add_ambiguous_import(im->ino(), - migrator->get_import_bounds(im->ino())); - } else { - // not ambiguous. - m->add_import(im->ino()); - - if (nested_exports.count(im)) { - for (set::iterator q = nested_exports[im].begin(); - q != nested_exports[im].end(); - ++q) { - CDir *ex = *q; - m->add_import_export(im->ino(), ex->ino()); - } - } - } - } - - // ambiguous - for (map >::iterator p = my_ambiguous_imports.begin(); - p != my_ambiguous_imports.end(); - ++p) - m->add_ambiguous_import(p->first, p->second); - - // second - mds->send_message_mds(m, who, MDS_PORT_CACHE); -} - - - -/* - * during resolve state, we share import_maps to determine who - * is authoritative for which trees. we expect to get an import_map - * from _everyone_ in the recovery_set (the mds cluster at the time of - * the first failure). 
- */ -void MDCache::handle_import_map(MMDSImportMap *m) -{ - dout(7) << "handle_import_map from " << m->get_source() << endl; - int from = m->get_source().num(); - - // FIXME: check if we are a surviving ambiguous importer - - // update my dir_auth values - for (map >::iterator pi = m->imap.begin(); - pi != m->imap.end(); - ++pi) { - CInode *imi = get_inode(pi->first); - if (!imi) continue; - CDir *im = imi->dir; - if (!im) continue; - - im->set_dir_auth(from); - - for (set::iterator pe = pi->second.begin(); - pe != pi->second.end(); - ++pe) { - CInode *exi = get_inode(*pe); - if (!exi) continue; - CDir *ex = exi->dir; - if (!ex) continue; - - if (ex->get_dir_auth() == CDIR_AUTH_PARENT) - ex->set_dir_auth(CDIR_AUTH_UNKNOWN); - } - } - - // note ambiguous imports too - for (map >::iterator pi = m->ambiguous_imap.begin(); - pi != m->ambiguous_imap.end(); - ++pi) - mds->mdcache->other_ambiguous_imports[from][pi->first].swap( pi->second ); - - // did i get them all? - got_import_map.insert(from); - - if (got_import_map == recovery_set) { - dout(10) << "got all import maps, ready to rejoin" << endl; - disambiguate_imports(); - recalc_auth_bits(); - trim_non_auth(); - - // move to rejoin state - mds->set_want_state(MDSMap::STATE_REJOIN); - - } else { - dout(10) << "still waiting for more importmaps, got " << got_import_map - << ", need " << recovery_set << endl; - } - - delete m; -} - - -void MDCache::disambiguate_imports() -{ - dout(10) << "disambiguate_imports" << endl; - - // other nodes' ambiguous imports - for (map > >::iterator p = other_ambiguous_imports.begin(); - p != other_ambiguous_imports.begin(); - ++p) { - int who = p->first; - - for (map >::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - CInode *diri = get_inode(q->first); - if (!diri) continue; - CDir *dir = diri->dir; - if (!dir) continue; - - if (dir->authority() >= CDIR_AUTH_UNKNOWN) { - dout(10) << "mds" << who << " did not import " << *dir << endl; - } else { - dout(10) << "mds" << 
who << " did import " << *dir << endl; - int was = dir->authority(); - dir->set_dir_auth(who); - - for (set::iterator r = q->second.begin(); - r != q->second.end(); - ++r) { - CInode *exi = get_inode(q->first); - if (!exi) continue; - CDir *ex = exi->dir; - if (!ex) continue; - if (ex->get_dir_auth() == CDIR_AUTH_PARENT) - ex->set_dir_auth(was); - dout(10) << " bound " << *ex << endl; - } - } - } - } - other_ambiguous_imports.clear(); - - // my ambiguous imports - while (!my_ambiguous_imports.empty()) { - map >::iterator q = my_ambiguous_imports.begin(); - - CInode *diri = get_inode(q->first); - if (!diri) continue; - CDir *dir = diri->dir; - if (!dir) continue; - - if (dir->authority() != CDIR_AUTH_UNKNOWN) { - dout(10) << "ambiguous import auth known, must not be me " << *dir << endl; - cancel_ambiguous_import(q->first); - } else { - dout(10) << "ambiguous import auth unknown, must be me " << *dir << endl; - finish_ambiguous_import(q->first); - } - } - assert(my_ambiguous_imports.empty()); - - show_imports(); -} - -void MDCache::cancel_ambiguous_import(inodeno_t dirino) -{ - assert(my_ambiguous_imports.count(dirino)); - dout(10) << "cancel_ambiguous_import " << dirino - << " bounds " << my_ambiguous_imports[dirino] - << endl; - my_ambiguous_imports.erase(dirino); -} - -void MDCache::finish_ambiguous_import(inodeno_t dirino) -{ - assert(my_ambiguous_imports.count(dirino)); - set bounds; - bounds.swap(my_ambiguous_imports[dirino]); - my_ambiguous_imports.erase(dirino); - - dout(10) << "finish_ambiguous_import " << dirino - << " bounds " << bounds - << endl; - - CInode *diri = get_inode(dirino); - assert(diri); - CDir *dir = diri->dir; - assert(dir); - - // adjust dir_auth - CDir *im = dir; - if (dir->get_inode()->authority() == mds->get_nodeid()) { - // parent is already me. adding to existing import. 
- im = get_auth_container(dir); - if (!im) im = dir; - nested_exports[im].erase(dir); - exports.erase(dir); - dir->set_dir_auth( CDIR_AUTH_PARENT ); - dir->state_clear(CDIR_STATE_EXPORT); - dir->put(CDir::PIN_EXPORT); - } else { - // parent isn't me. new import. - imports.insert(dir); - dir->set_dir_auth( mds->get_nodeid() ); - dir->state_set(CDIR_STATE_IMPORT); - dir->get(CDir::PIN_IMPORT); - } - - dout(10) << " base " << *dir << endl; - if (dir != im) - dout(10) << " under " << *im << endl; - - // bounds (exports, before) - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CInode *bi = get_inode(*p); - assert(bi); - CDir *bd = bi->dir; - assert(bd); - - if (bd->get_dir_auth() == mds->get_nodeid()) { - // still me. was an import. - imports.erase(bd); - bd->set_dir_auth( CDIR_AUTH_PARENT ); - bd->state_clear(CDIR_STATE_IMPORT); - bd->put(CDir::PIN_IMPORT); - // move nested exports. - for (set::iterator q = nested_exports[bd].begin(); - q != nested_exports[bd].end(); - ++q) - nested_exports[im].insert(*q); - nested_exports.erase(bd); - - } else { - // not me anymore. now an export. 
- exports.insert(bd); - nested_exports[im].insert(bd); - assert(bd->get_dir_auth() != CDIR_AUTH_PARENT); - bd->set_dir_auth( CDIR_AUTH_UNKNOWN ); - bd->state_set(CDIR_STATE_EXPORT); - bd->get(CDir::PIN_EXPORT); - } - - dout(10) << " bound " << *bd << endl; - } -} - -void MDCache::finish_ambiguous_export(inodeno_t dirino, set& bounds) -{ - CInode *diri = get_inode(dirino); - assert(diri); - CDir *dir = diri->dir; - assert(dir); - - dout(10) << "finish_ambiguous_export " << dirino - << " bounds " << bounds - << endl; - - // adjust dir_auth - CDir *im = get_auth_container(dir); - if (dir->get_inode()->authority() == CDIR_AUTH_UNKNOWN) { - // was an import, hose it - assert(im == dir); - assert(imports.count(dir)); - imports.erase(dir); - dir->set_dir_auth( CDIR_AUTH_PARENT ); - dir->state_clear(CDIR_STATE_IMPORT); - dir->put(CDir::PIN_IMPORT); - } else { - // i'm now an export - exports.insert(dir); - nested_exports[im].insert(dir); - dir->set_dir_auth( CDIR_AUTH_UNKNOWN ); // not me - dir->state_set(CDIR_STATE_EXPORT); - dir->get(CDir::PIN_EXPORT); - } - dout(10) << " base " << *dir << endl; - if (dir != im) - dout(10) << " under " << *im << endl; - - // bounds (there were exports, before) - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CInode *bi = get_inode(*p); - assert(bi); - CDir *bd = bi->dir; - assert(bd); - - // hose export - assert(exports.count(bd)); - exports.erase(bd); - nested_exports[im].erase(bd); - - // fix dir_auth - assert(bd->get_dir_auth() != CDIR_AUTH_PARENT); - bd->set_dir_auth( CDIR_AUTH_PARENT ); // not me - - bd->state_clear(CDIR_STATE_EXPORT); - bd->put(CDir::PIN_EXPORT); - - dout(10) << " bound " << *bd << endl; - } - - show_imports(); -} - - - - -/* - * rejoin phase! - * we start out by sending rejoins to everyone in the recovery set. - * - * if _were_ are rejoining, send for all regions in our cache. - * if we are active|stopping, send only to nodes that are are rejoining. 
- */ -void MDCache::send_cache_rejoins() -{ - dout(10) << "send_cache_rejoins " << endl; - - map rejoins; - - // if i am rejoining, send a rejoin to everyone. - // otherwise, just send to others who are rejoining. - for (set::iterator p = recovery_set.begin(); - p != recovery_set.end(); - ++p) { - if (*p == mds->get_nodeid()) continue; // nothing to myself! - if (mds->is_rejoin() || - mds->mdsmap->is_rejoin(*p)) - rejoins[*p] = new MMDSCacheRejoin; - } - - // build list of dir_auth regions - list dir_auth_regions; - for (hash_map::iterator p = inode_map.begin(); - p != inode_map.end(); - ++p) { - if (!p->second->is_dir()) continue; - if (!p->second->dir) continue; - if (p->second->dir->get_dir_auth() == CDIR_AUTH_PARENT) continue; - - int auth = p->second->dir->get_dir_auth(); - assert(auth >= 0); - - if (auth == mds->get_nodeid()) continue; // skip my own regions! - - if (rejoins.count(auth) == 0) - continue; // don't care about this node's regions - - // add to list - dout(10) << " on mds" << auth << " region " << *p->second << endl; - dir_auth_regions.push_back(p->second->dir); - } - - // walk the regions - for (list::iterator p = dir_auth_regions.begin(); - p != dir_auth_regions.end(); - ++p) { - CDir *dir = *p; - int to = dir->authority(); - cache_rejoin_walk(dir, rejoins[to]); - } - - // send the messages - assert(rejoin_ack_gather.empty()); - for (map::iterator p = rejoins.begin(); - p != rejoins.end(); - ++p) { - mds->send_message_mds(p->second, p->first, MDS_PORT_CACHE); - rejoin_ack_gather.insert(p->first); - } - - // nothing? 
- if (rejoins.empty()) { - dout(10) << "nothing to rejoin, going active" << endl; - mds->set_want_state(MDSMap::STATE_ACTIVE); - } -} - - - -void MDCache::cache_rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin) -{ - dout(10) << "cache_rejoin_walk " << *dir << endl; - rejoin->add_dir(dir->ino()); - - list nested; // finish this dir, then do nested items - - // walk dentries - for (map::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - // dentry - rejoin->add_dentry(dir->ino(), p->first); - - // inode? - if (p->second->is_primary() && p->second->get_inode()) { - CInode *in = p->second->get_inode(); - rejoin->add_inode(in->ino(), - in->get_caps_wanted()); - - // dir? - if (in->dir && - in->dir->get_dir_auth() == CDIR_AUTH_PARENT) - nested.push_back(in->dir); - } - } - - // recurse into nested dirs - for (list::iterator p = nested.begin(); - p != nested.end(); - ++p) - cache_rejoin_walk(*p, rejoin); -} - - -/* - * i got a rejoin. - * - * - reply with the lockstate - * - * if i am active|stopping, - * - remove source from replica list for everything not referenced here. 
- */ -void MDCache::handle_cache_rejoin(MMDSCacheRejoin *m) -{ - dout(7) << "handle_cache_rejoin from " << m->get_source() << endl; - int from = m->get_source().num(); - - MMDSCacheRejoinAck *ack = new MMDSCacheRejoinAck; - - if (mds->is_active() || mds->is_stopping()) { - dout(10) << "removing stale cache replicas" << endl; - // first, scour cache of replica references - for (hash_map::iterator p = inode_map.begin(); - p != inode_map.end(); - ++p) { - // inode - CInode *in = p->second; - if (in->is_replica(from) && m->inodes.count(p->first) == 0) { - inode_remove_replica(in, from); - dout(10) << " rem " << *in << endl; - } - - // dentry - if (in->parent) { - CDentry *dn = in->parent; - if (dn->is_replica(from) && - (m->dentries.count(dn->get_dir()->ino()) == 0 || - m->dentries[dn->get_dir()->ino()].count(dn->get_name()) == 0)) { - dn->remove_replica(from); - dout(10) << " rem " << *dn << endl; - } - } - - // dir - if (in->dir) { - CDir *dir = in->dir; - if (dir->is_replica(from) && m->dirs.count(p->first) == 0) { - dir->remove_replica(from); - dout(10) << " rem " << *dir << endl; - } - } - } - } else { - assert(mds->is_rejoin()); - } - - // dirs - for (set::iterator p = m->dirs.begin(); - p != m->dirs.end(); - ++p) { - CInode *diri = get_inode(*p); - assert(diri); - CDir *dir = diri->dir; - assert(dir); - int nonce = dir->add_replica(from); - dout(10) << " has " << *dir << endl; - ack->add_dir(*p, nonce); - - // dentries - for (set::iterator q = m->dentries[*p].begin(); - q != m->dentries[*p].end(); - ++q) { - CDentry *dn = dir->lookup(*q); - assert(dn); - int nonce = dn->add_replica(from); - dout(10) << " has " << *dn << endl; - ack->add_dentry(*p, *q, dn->get_lockstate(), nonce); - } - } - - // inodes - for (map::iterator p = m->inodes.begin(); - p != m->inodes.end(); - ++p) { - CInode *in = get_inode(p->first); - assert(in); - int nonce = in->add_replica(from); - if (p->second) - in->mds_caps_wanted[from] = p->second; - else - in->mds_caps_wanted.erase(from); - 
in->hardlock.gather_set.erase(from); // just in case - in->filelock.gather_set.erase(from); // just in case - dout(10) << " has " << *in << endl; - ack->add_inode(p->first, - in->hardlock.get_replica_state(), in->filelock.get_replica_state(), - nonce); - } - - // send ack - mds->send_message_mds(ack, from, MDS_PORT_CACHE); - - delete m; -} - - -void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoinAck *m) -{ - dout(7) << "handle_cache_rejoin from " << m->get_source() << endl; - int from = m->get_source().num(); - - // dirs - for (list::iterator p = m->dirs.begin(); - p != m->dirs.end(); - ++p) { - CInode *diri = get_inode(p->dirino); - CDir *dir = diri->dir; - assert(dir); - - dir->set_replica_nonce(p->nonce); - dout(10) << " got " << *dir << endl; - - // dentries - for (map::iterator q = m->dentries[p->dirino].begin(); - q != m->dentries[p->dirino].end(); - ++q) { - CDentry *dn = dir->lookup(q->first); - assert(dn); - dn->set_replica_nonce(q->second.nonce); - dn->set_lockstate(q->second.lock); - dout(10) << " got " << *dn << endl; - } - } - - // inodes - for (list::iterator p = m->inodes.begin(); - p != m->inodes.end(); - ++p) { - CInode *in = get_inode(p->ino); - assert(in); - in->set_replica_nonce(p->nonce); - in->hardlock.set_state(p->hardlock); - in->filelock.set_state(p->filelock); - dout(10) << " got " << *in << endl; - } - - delete m; - - // done? - rejoin_ack_gather.erase(from); - if (rejoin_ack_gather.empty()) { - dout(7) << "all done, going active!" << endl; - show_imports(); - show_cache(); - mds->set_want_state(MDSMap::STATE_ACTIVE); - } else { - dout(7) << "still need rejoin_ack from " << rejoin_ack_gather << endl; - } - -} - - - - - -// =============================================================================== - -void MDCache::rename_file(CDentry *srcdn, - CDentry *destdn) -{ - CInode *in = srcdn->inode; - - // unlink src - srcdn->dir->unlink_inode(srcdn); - - // unlink old inode? 
- if (destdn->inode) destdn->dir->unlink_inode(destdn); - - // link inode w/ dentry - destdn->dir->link_inode( destdn, in ); -} - - - -void MDCache::set_root(CInode *in) -{ - assert(root == 0); - root = in; - root->state_set(CInode::STATE_ROOT); -} - -void MDCache::add_import(CDir *dir) -{ - imports.insert(dir); - dir->state_set(CDIR_STATE_IMPORT); - dir->get(CDir::PIN_IMPORT); -} - - -void MDCache::recalc_auth_bits() -{ - dout(7) << "recalc_auth_bits" << endl; - - for (hash_map::iterator p = inode_map.begin(); - p != inode_map.end(); - ++p) { - CInode *in = p->second; - if (in->authority() == mds->get_nodeid()) - in->state_set(CInode::STATE_AUTH); - else { - in->state_clear(CInode::STATE_AUTH); - if (in->is_dirty()) - in->mark_clean(); - } - - if (in->parent) { - if (in->parent->authority() == mds->get_nodeid()) - in->parent->state_set(CDentry::STATE_AUTH); - else { - in->parent->state_clear(CDentry::STATE_AUTH); - if (in->parent->is_dirty()) - in->parent->mark_clean(); - } - } - - if (in->dir) { - if (in->dir->authority() == mds->get_nodeid()) - in->dir->state_set(CDIR_STATE_AUTH); - else { - in->dir->state_clear(CDIR_STATE_AUTH); - if (in->dir->is_dirty()) - in->dir->mark_clean(); - } - } - } - show_imports(); - show_cache(); -} - - - - - -// ************** -// Inode purging -- reliably removing deleted file's objects - -class C_MDC_PurgeFinish : public Context { - MDCache *mdc; - inodeno_t ino; -public: - C_MDC_PurgeFinish(MDCache *c, inodeno_t i) : mdc(c), ino(i) {} - void finish(int r) { - mdc->purge_inode_finish(ino); - } -}; -class C_MDC_PurgeFinish2 : public Context { - MDCache *mdc; - inodeno_t ino; -public: - C_MDC_PurgeFinish2(MDCache *c, inodeno_t i) : mdc(c), ino(i) {} - void finish(int r) { - mdc->purge_inode_finish_2(ino); - } -}; - -/* purge_inode in - * will be called by on unlink or rmdir - * caller responsible for journaling an appropriate EUnlink or ERmdir - */ -void MDCache::purge_inode(inode_t &inode) -{ - dout(10) << "purge_inode " << 
inode.ino << " size " << inode.size << endl; - - // take note - assert(purging.count(inode.ino) == 0); - purging[inode.ino] = inode; - - // remove - mds->filer->remove(inode, 0, inode.size, - 0, new C_MDC_PurgeFinish(this, inode.ino)); -} - -void MDCache::purge_inode_finish(inodeno_t ino) -{ - dout(10) << "purge_inode_finish " << ino << " - logging our completion" << endl; - - // log completion - mds->mdlog->submit_entry(new EPurgeFinish(ino), - new C_MDC_PurgeFinish2(this, ino)); -} - -void MDCache::purge_inode_finish_2(inodeno_t ino) -{ - dout(10) << "purge_inode_finish_2 " << ino << endl; - - // remove from purging list - purging.erase(ino); - - // tell anyone who cares (log flusher?) - list ls; - ls.swap(waiting_for_purge[ino]); - waiting_for_purge.erase(ino); - finish_contexts(ls, 0); - - // reclaim ino? - -} - -void MDCache::start_recovered_purges() -{ - for (map::iterator p = purging.begin(); - p != purging.end(); - ++p) { - dout(10) << "start_recovered_purges " << p->first << " size " << p->second.size << endl; - mds->filer->remove(p->second, 0, p->second.size, - 0, new C_MDC_PurgeFinish(this, p->first)); - } -} - - - -bool MDCache::trim(int max) -{ - // trim LRU - if (max < 0) { - max = lru.lru_get_max(); - if (!max) return false; - } - dout(7) << "trim max=" << max << " cur=" << lru.lru_get_size() << endl; - - map expiremap; - - while (lru.lru_get_size() > (unsigned)max) { - CDentry *dn = (CDentry*)lru.lru_expire(); - if (!dn) break; - - CDir *dir = dn->get_dir(); - assert(dir); - - // notify dentry authority? 
- if (!dn->is_auth()) { - int auth = dn->authority(); - dout(17) << "sending expire to mds" << auth << " on " << *dn << endl; - if (expiremap.count(auth) == 0) - expiremap[auth] = new MCacheExpire(mds->get_nodeid()); - expiremap[auth]->add_dentry(dir->ino(), dn->get_name(), dn->get_replica_nonce()); - } - - // unlink the dentry - dout(15) << "trim removing " << *dn << endl; - if (!dn->is_null()) - dir->unlink_inode(dn); - dir->remove_dentry(dn); - - // adjust the dir state - CInode *diri = dir->get_inode(); - diri->dir->state_clear(CDIR_STATE_COMPLETE); // dir incomplete! - - // reexport? - if (diri->dir->is_import() && // import - diri->dir->get_size() == 0 && // no children - !diri->is_root()) // not root - migrator->export_empty_import(diri->dir); - - if (mds->logger) mds->logger->inc("cex"); - } - - // inode expire_queue - while (!inode_expire_queue.empty()) { - CInode *in = inode_expire_queue.front(); - inode_expire_queue.pop_front(); - - assert(in->get_num_ref() == 0); - - int dirauth = -2; - if (in->dir) { - // notify dir authority? 
- dirauth = in->dir->authority(); - if (dirauth != mds->get_nodeid()) { - dout(17) << "sending expire to mds" << dirauth << " on " << *in->dir << endl; - if (expiremap.count(dirauth) == 0) - expiremap[dirauth] = new MCacheExpire(mds->get_nodeid()); - expiremap[dirauth]->add_dir(in->ino(), in->dir->replica_nonce); - } - - in->close_dir(); - } - - // notify inode authority - int auth = in->authority(); - if (auth == CDIR_AUTH_UNKNOWN) { - assert(in->ino() == 1); - assert(dirauth >= 0); - auth = dirauth; - } - if (auth != mds->get_nodeid()) { - assert(!in->is_auth()); - dout(17) << "sending expire to mds" << auth << " on " << *in << endl; - if (expiremap.count(auth) == 0) - expiremap[auth] = new MCacheExpire(mds->get_nodeid()); - expiremap[auth]->add_inode(in->ino(), in->get_replica_nonce()); - } else { - assert(in->is_auth()); - } - - dout(15) << "trim removing " << *in << endl; - if (in == root) root = 0; - remove_inode(in); - } - - // send expires - for (map::iterator it = expiremap.begin(); - it != expiremap.end(); - it++) { - dout(7) << "sending cache_expire to " << it->first << endl; - mds->send_message_mds(it->second, it->first, MDS_PORT_CACHE); - } - - - return true; -} - - -void MDCache::trim_non_auth() -{ - dout(7) << "trim_non_auth" << endl; - - CDentry *first_auth = 0; - - // trim non-auth items from the lru - while (lru.lru_get_size() > 0) { - CDentry *dn = (CDentry*)lru.lru_expire(); - if (!dn) break; - - if (dn->is_auth()) { - // add back into lru (at the top) - lru.lru_insert_top(dn); - - if (!first_auth) { - first_auth = dn; - } else { - if (first_auth == dn) - break; - } - } else { - // non-auth. expire. - CDir *dir = dn->get_dir(); - assert(dir); - - // unlink the dentry - dout(15) << "trim_non_auth removing " << *dn << endl; - if (!dn->is_null()) - dir->unlink_inode(dn); - dir->remove_dentry(dn); - - // adjust the dir state - CInode *diri = dir->get_inode(); - diri->dir->state_clear(CDIR_STATE_COMPLETE); // dir incomplete! 
- } - } - - // inode expire queue - while (!inode_expire_queue.empty()) { - CInode *in = inode_expire_queue.front(); - inode_expire_queue.pop_front(); - dout(15) << "trim_non_auth removing " << *in << endl; - if (in == root) root = 0; - remove_inode(in); - } -} - - - -class C_MDC_ShutdownCommit : public Context { - MDCache *mdc; -public: - C_MDC_ShutdownCommit(MDCache *mdc) { - this->mdc = mdc; - } - void finish(int r) { - mdc->shutdown_commits--; - } -}; - -class C_MDC_ShutdownCheck : public Context { - MDCache *mdc; -public: - C_MDC_ShutdownCheck(MDCache *m) : mdc(m) {} - void finish(int) { - mdc->shutdown_check(); - } -}; - -void MDCache::shutdown_check() -{ - dout(0) << "shutdown_check at " << g_clock.now() << endl; - - // cache - int o = g_conf.debug_mds; - g_conf.debug_mds = 10; - show_cache(); - g_conf.debug_mds = o; - mds->timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(this)); - - // this - dout(0) << "lru size now " << lru.lru_get_size() << endl; - dout(0) << "log len " << mds->mdlog->get_num_events() << endl; - - - if (exports.size()) - dout(0) << "still have " << exports.size() << " exports" << endl; - - if (mds->filer->is_active()) - dout(0) << "filer still active" << endl; -} - -void MDCache::shutdown_start() -{ - dout(1) << "shutdown_start" << endl; - - if (g_conf.mds_shutdown_check) - mds->timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(this)); -} - - - -bool MDCache::shutdown_pass() -{ - dout(7) << "shutdown_pass" << endl; - //assert(mds->is_shutting_down()); - if (mds->is_out()) { - dout(7) << " already shut down" << endl; - show_cache(); - show_imports(); - return true; - } - - // unhash dirs? - if (!hashdirs.empty()) { - // unhash any of my dirs? 
- for (set::iterator it = hashdirs.begin(); - it != hashdirs.end(); - it++) { - CDir *dir = *it; - if (!dir->is_auth()) continue; - if (dir->is_unhashing()) continue; - migrator->unhash_dir(dir); - } - - dout(7) << "waiting for dirs to unhash" << endl; - return false; - } - - // commit dirs? - if (g_conf.mds_commit_on_shutdown) { - - if (shutdown_commits < 0) { - dout(1) << "shutdown_pass committing all dirty dirs" << endl; - shutdown_commits = 0; - - for (hash_map::iterator it = inode_map.begin(); - it != inode_map.end(); - it++) { - CInode *in = it->second; - - // commit any dirty dir that's ours - if (in->is_dir() && in->dir && in->dir->is_auth() && in->dir->is_dirty()) { - mds->mdstore->commit_dir(in->dir, new C_MDC_ShutdownCommit(this)); - shutdown_commits++; - } - } - } - - // commits? - if (shutdown_commits > 0) { - dout(7) << "shutdown_commits still waiting for " << shutdown_commits << endl; - return false; - } - } - - // flush anything we can from the cache - trim(0); - dout(5) << "lru size now " << lru.lru_get_size() << endl; - - mds->mdlog->trim(0); - - // (wait for) flush log? - if (g_conf.mds_log_flush_on_shutdown) { - if (mds->mdlog->get_non_importmap_events()) { - dout(7) << "waiting for log to flush .. " << mds->mdlog->get_num_events() - << " (" << mds->mdlog->get_non_importmap_events() << ")" << endl; - return false; - } - } - - - // send all imports back to 0. - if (mds->get_nodeid() != 0 && !did_shutdown_exports) { - // flush what i can from the cache first.. - trim(0); - - // export to root - for (set::iterator it = imports.begin(); - it != imports.end(); - ) { - CDir *im = *it; - it++; - if (im->inode->is_root()) continue; - if (im->is_frozen() || im->is_freezing()) continue; - - dout(7) << "sending " << *im << " back to mds0" << endl; - migrator->export_dir(im,0); - } - did_shutdown_exports = true; - } - - - // waiting for imports? (e.g. root?) 
- if (exports.size()) { - dout(7) << "still have " << exports.size() << " exports" << endl; - //show_cache(); - return false; - } - - - // close root? - if (mds->get_nodeid() == 0 && - lru.lru_get_size() == 0 && - root && - root->dir && - root->dir->is_import() && - root->dir->get_num_ref() == 1) { // 1 is the import! - // un-import - dout(7) << "removing root import" << endl; - imports.erase(root->dir); - root->dir->state_clear(CDIR_STATE_IMPORT); - root->dir->put(CDir::PIN_IMPORT); - - if (root->is_pinned_by(CInode::PIN_DIRTY)) { - dout(7) << "clearing root inode dirty flag" << endl; - root->put(CInode::PIN_DIRTY); - } - - trim(0); - } - - // imports? - if (!imports.empty() || migrator->is_exporting()) { - dout(7) << "still have " << imports.size() << " imports, or still exporting" << endl; - show_cache(); - return false; - } - - // cap log? - if (g_conf.mds_log_flush_on_shutdown) { - - if (imports.empty() && exports.empty()) { - // (only do this once!) - if (!mds->mdlog->is_capped()) { - dout(7) << "capping the log" << endl; - mds->mdlog->cap(); - // note that this won't flush right away, so we'll make at least one more pass - } - } - - if (mds->mdlog->get_num_events()) { - dout(7) << "waiting for log to flush (including import_map, now) .. " << mds->mdlog->get_num_events() - << " (" << mds->mdlog->get_non_importmap_events() << ")" << endl; - return false; - } - - if (!did_shutdown_log_cap) { - // flush journal header - dout(7) << "writing header for (now-empty) journal" << endl; - assert(mds->mdlog->empty()); - mds->mdlog->write_head(0); - // NOTE: filer active checker below will block us until this completes. - did_shutdown_log_cap = true; - return false; - } - } - - // filer active? - if (mds->filer->is_active()) { - dout(7) << "filer still active" << endl; - return false; - } - - - // done? 
- if (lru.lru_get_size() > 0) { - dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << endl; - show_cache(); - //dump(); - return false; - } - - // done! - dout(1) << "shutdown done." << endl; - return true; -} - - - - - -CInode *MDCache::create_root_inode() -{ - CInode *root = new CInode(this); - memset(&root->inode, 0, sizeof(inode_t)); - root->inode.ino = 1; - root->inode.hash_seed = 0; // not hashed! - - // make it up (FIXME) - root->inode.mode = 0755 | INODE_MODE_DIR; - root->inode.size = 0; - root->inode.ctime = 0; - root->inode.mtime = g_clock.gettime(); - - root->inode.nlink = 1; - root->inode.layout = g_OSD_MDDirLayout; - - set_root( root ); - add_inode( root ); - - return root; -} - - -int MDCache::open_root(Context *c) -{ - int whoami = mds->get_nodeid(); - - // open root inode - if (whoami == 0) { - // i am root inode - CInode *root = create_root_inode(); - - // root directory too - assert(root->dir == NULL); - root->set_dir( new CDir(root, this, true) ); - root->dir->set_dir_auth( 0 ); // me! 
- root->dir->dir_rep = CDIR_REP_ALL; //NONE; - - // root is sort of technically an import (from a vacuum) - imports.insert( root->dir ); - root->dir->state_set(CDIR_STATE_IMPORT); - root->dir->get(CDir::PIN_IMPORT); - - if (c) { - c->finish(0); - delete c; - } - } else { - // request inode from root mds - if (waiting_for_root.empty()) { - dout(7) << "discovering root" << endl; - - filepath want; - MDiscover *req = new MDiscover(whoami, - 0, - want, - false); // there _is_ no base dir for the root inode - mds->send_message_mds(req, 0, MDS_PORT_CACHE); - } else { - dout(7) << "waiting for root" << endl; - } - - // wait - waiting_for_root.push_back(c); - - } - - return 0; -} - - - - - - - - -// ========= messaging ============== - - -void MDCache::dispatch(Message *m) -{ - switch (m->get_type()) { - - case MSG_MDS_IMPORTMAP: - handle_import_map((MMDSImportMap*)m); - break; - - case MSG_MDS_CACHEREJOIN: - handle_cache_rejoin((MMDSCacheRejoin*)m); - break; - case MSG_MDS_CACHEREJOINACK: - handle_cache_rejoin_ack((MMDSCacheRejoinAck*)m); - break; - - - case MSG_MDS_DISCOVER: - handle_discover((MDiscover*)m); - break; - case MSG_MDS_DISCOVERREPLY: - handle_discover_reply((MDiscoverReply*)m); - break; - - /* - case MSG_MDS_INODEUPDATE: - handle_inode_update((MInodeUpdate*)m); - break; - */ - - case MSG_MDS_INODELINK: - handle_inode_link((MInodeLink*)m); - break; - case MSG_MDS_INODELINKACK: - handle_inode_link_ack((MInodeLinkAck*)m); - break; - - case MSG_MDS_DIRUPDATE: - handle_dir_update((MDirUpdate*)m); - break; - - case MSG_MDS_CACHEEXPIRE: - handle_cache_expire((MCacheExpire*)m); - break; - - - - case MSG_MDS_DENTRYUNLINK: - handle_dentry_unlink((MDentryUnlink*)m); - break; - - - - - - default: - dout(7) << "cache unknown message " << m->get_type() << endl; - assert(0); - break; - } -} - - -/* path_traverse - * - * return values: - * <0 : traverse error (ENOTDIR, ENOENT) - * 0 : success - * >0 : delayed or forwarded - * - * Notes: - * onfinish context is only needed 
if you specify MDS_TRAVERSE_DISCOVER _and_ - * you aren't absolutely certain that the path actually exists. If it doesn't, - * the context is needed to pass a (failure) result code. - */ - -class C_MDC_TraverseDiscover : public Context { - Context *onfinish, *ondelay; - public: - C_MDC_TraverseDiscover(Context *onfinish, Context *ondelay) { - this->ondelay = ondelay; - this->onfinish = onfinish; - } - void finish(int r) { - //dout(10) << "TraverseDiscover r = " << r << endl; - if (r < 0 && onfinish) { // ENOENT on discover, pass back to caller. - onfinish->finish(r); - } else { - ondelay->finish(r); // retry as usual - } - delete onfinish; - delete ondelay; - } -}; - -int MDCache::path_traverse(filepath& origpath, - vector& trace, - bool follow_trailing_symlink, - Message *req, - Context *ondelay, - int onfail, - Context *onfinish, - bool is_client_req) // true if req is MClientRequest .. gross, FIXME -{ - int whoami = mds->get_nodeid(); - set< pair > symlinks_resolved; // keep a list of symlinks we touch to avoid loops - - bool noperm = false; - if (onfail == MDS_TRAVERSE_DISCOVER || - onfail == MDS_TRAVERSE_DISCOVERXLOCK) noperm = true; - - // root - CInode *cur = get_root(); - if (cur == NULL) { - dout(7) << "traverse: i don't have root" << endl; - open_root(ondelay); - if (onfinish) delete onfinish; - return 1; - } - - // start trace - trace.clear(); - - // make our own copy, since we'll modify when we hit symlinks - filepath path = origpath; - - unsigned depth = 0; - while (depth < path.depth()) { - dout(12) << "traverse: path seg depth " << depth << " = " << path[depth] << endl; - - // ENOTDIR? - if (!cur->is_dir()) { - dout(7) << "traverse: " << *cur << " not a dir " << endl; - delete ondelay; - if (onfinish) { - onfinish->finish(-ENOTDIR); - delete onfinish; - } - return -ENOTDIR; - } - - // open dir - if (!cur->dir) { - if (cur->dir_is_auth()) { - // parent dir frozen_dir? 
- if (cur->is_frozen_dir()) { - dout(7) << "traverse: " << *cur->get_parent_dir() << " is frozen_dir, waiting" << endl; - cur->get_parent_dir()->add_waiter(CDIR_WAIT_UNFREEZE, ondelay); - if (onfinish) delete onfinish; - return 1; - } - - cur->get_or_open_dir(this); - assert(cur->dir); - } else { - // discover dir from/via inode auth - assert(!cur->is_auth()); - if (cur->waiting_for(CINODE_WAIT_DIR)) { - dout(10) << "traverse: need dir for " << *cur << ", already doing discover" << endl; - } else { - filepath want = path.postfixpath(depth); - dout(10) << "traverse: need dir for " << *cur << ", doing discover, want " << want.get_path() << endl; - mds->send_message_mds(new MDiscover(mds->get_nodeid(), - cur->ino(), - want, - true), // need this dir too - cur->authority(), MDS_PORT_CACHE); - } - cur->add_waiter(CINODE_WAIT_DIR, ondelay); - if (onfinish) delete onfinish; - return 1; - } - } - - // frozen? - /* - if (cur->dir->is_frozen()) { - // doh! - // FIXME: traverse is allowed? - dout(7) << "traverse: " << *cur->dir << " is frozen, waiting" << endl; - cur->dir->add_waiter(CDIR_WAIT_UNFREEZE, ondelay); - if (onfinish) delete onfinish; - return 1; - } - */ - - // must read directory hard data (permissions, x bit) to traverse - if (!noperm && !mds->locker->inode_hard_read_try(cur, ondelay)) { - if (onfinish) delete onfinish; - return 1; - } - - // check permissions? - // XXX - - // ..? - if (path[depth] == "..") { - trace.pop_back(); - depth++; - cur = cur->get_parent_inode(); - dout(10) << "traverse: following .. back to " << *cur << endl; - continue; - } - - - // dentry - CDentry *dn = cur->dir->lookup(path[depth]); - - // null and last_bit and xlocked by me? - if (dn && dn->is_null() && - dn->is_xlockedbyme(req) && - depth == path.depth()-1) { - dout(10) << "traverse: hit (my) xlocked dentry at tail of traverse, succeeding" << endl; - trace.push_back(dn); - break; // done! - } - - if (dn && !dn->is_null()) { - // dentry exists. xlocked? 
- if (!noperm && dn->is_xlockedbyother(req)) { - dout(10) << "traverse: xlocked dentry at " << *dn << endl; - cur->dir->add_waiter(CDIR_WAIT_DNREAD, - path[depth], - ondelay); - if (onfinish) delete onfinish; - return 1; - } - - // do we have inode? - if (!dn->inode) { - assert(dn->is_remote()); - // do i have it? - CInode *in = get_inode(dn->get_remote_ino()); - if (in) { - dout(7) << "linking in remote in " << *in << endl; - dn->link_remote(in); - } else { - dout(7) << "remote link to " << dn->get_remote_ino() << ", which i don't have" << endl; - open_remote_ino(dn->get_remote_ino(), req, - ondelay); - return 1; - } - } - - // symlink? - if (dn->inode->is_symlink() && - (follow_trailing_symlink || depth < path.depth()-1)) { - // symlink, resolve! - filepath sym = dn->inode->symlink; - dout(10) << "traverse: hit symlink " << *dn->inode << " to " << sym << endl; - - // break up path components - // /head/symlink/tail - filepath head = path.prefixpath(depth); - filepath tail = path.postfixpath(depth+1); - dout(10) << "traverse: path head = " << head << endl; - dout(10) << "traverse: path tail = " << tail << endl; - - if (symlinks_resolved.count(pair(dn->inode, tail.get_path()))) { - dout(10) << "already hit this symlink, bailing to avoid the loop" << endl; - return -ELOOP; - } - symlinks_resolved.insert(pair(dn->inode, tail.get_path())); - - // start at root? - if (dn->inode->symlink[0] == '/') { - // absolute - trace.clear(); - depth = 0; - path = tail; - dout(10) << "traverse: absolute symlink, path now " << path << " depth " << depth << endl; - } else { - // relative - path = head; - path.append(sym); - path.append(tail); - dout(10) << "traverse: relative symlink, path now " << path << " depth " << depth << endl; - } - continue; - } else { - // keep going. - - // forwarder wants replicas? 
- if (is_client_req && ((MClientRequest*)req)->get_mds_wants_replica_in_dirino()) { - dout(30) << "traverse: REP is here, " << ((MClientRequest*)req)->get_mds_wants_replica_in_dirino() << " vs " << cur->dir->ino() << endl; - - if (((MClientRequest*)req)->get_mds_wants_replica_in_dirino() == cur->dir->ino() && - cur->dir->is_auth() && - cur->dir->is_rep() && - cur->dir->is_replica(req->get_source().num()) && - dn->get_inode()->is_auth() - ) { - assert(req->get_source().is_mds()); - int from = req->get_source().num(); - - if (dn->get_inode()->is_replica(from)) { - dout(15) << "traverse: REP would replicate to mds" << from << ", but already cached_by " - << req->get_source() << " dn " << *dn << endl; - } else { - dout(10) << "traverse: REP replicating to " << req->get_source() << " dn " << *dn << endl; - MDiscoverReply *reply = new MDiscoverReply(cur->dir->ino()); - reply->add_dentry( dn->replicate_to( from ) ); - reply->add_inode( dn->inode->replicate_to( from ) ); - mds->send_message_mds(reply, req->get_source().num(), MDS_PORT_CACHE); - } - } - } - - trace.push_back(dn); - cur = dn->inode; - touch_inode(cur); - depth++; - continue; - } - } - - // MISS. don't have it. - - int dauth = cur->dir->dentry_authority( path[depth] ); - dout(12) << "traverse: miss on dentry " << path[depth] << " dauth " << dauth << " in " << *cur->dir << endl; - - - if (dauth == whoami) { - // dentry is mine. - if (cur->dir->is_complete()) { - // file not found - delete ondelay; - if (onfinish) { - onfinish->finish(-ENOENT); - delete onfinish; - } - return -ENOENT; - } else { - - //wrong? - //if (onfail == MDS_TRAVERSE_DISCOVER) - // return -1; - - // directory isn't complete; reload - dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << endl; - touch_inode(cur); - mds->mdstore->fetch_dir(cur->dir, ondelay); - - if (mds->logger) mds->logger->inc("cmiss"); - - if (onfinish) delete onfinish; - return 1; - } - } else { - // dentry is not mine. 
- - /* no, let's let auth handle the discovery/replication .. - if (onfail == MDS_TRAVERSE_FORWARD && - onfinish == 0 && // no funnyness - cur->dir->is_rep()) { - dout(5) << "trying to discover in popular dir " << *cur->dir << endl; - onfail = MDS_TRAVERSE_DISCOVER; - } - */ - - if ((onfail == MDS_TRAVERSE_DISCOVER || - onfail == MDS_TRAVERSE_DISCOVERXLOCK)) { - // discover - - filepath want = path.postfixpath(depth); - if (cur->dir->waiting_for(CDIR_WAIT_DENTRY, path[depth])) { - dout(7) << "traverse: already waiting for discover on " << *cur << " for " << want.get_path() << " to mds" << dauth << endl; - } else { - dout(7) << "traverse: discover on " << *cur << " for " << want.get_path() << " to mds" << dauth << endl; - - touch_inode(cur); - - mds->send_message_mds(new MDiscover(mds->get_nodeid(), - cur->ino(), - want, - false), - dauth, MDS_PORT_CACHE); - if (mds->logger) mds->logger->inc("dis"); - } - - // delay processing of current request. - // delay finish vs ondelay until result of traverse, so that ENOENT can be - // passed to onfinish if necessary - cur->dir->add_waiter(CDIR_WAIT_DENTRY, - path[depth], - new C_MDC_TraverseDiscover(onfinish, ondelay)); - - if (mds->logger) mds->logger->inc("cmiss"); - return 1; - } - if (onfail == MDS_TRAVERSE_FORWARD) { - // forward - dout(7) << "traverse: not auth for " << path << " at " << path[depth] << ", fwd to mds" << dauth << endl; - - if (is_client_req && cur->dir->is_rep()) { - dout(15) << "traverse: REP fw to mds" << dauth << ", requesting rep under " << *cur->dir << " req " << *(MClientRequest*)req << endl; - ((MClientRequest*)req)->set_mds_wants_replica_in_dirino(cur->dir->ino()); - req->clear_payload(); // reencode! 
- } - - mds->send_message_mds(req, dauth, req->get_dest_port()); - //show_imports(); - - if (mds->logger) mds->logger->inc("cfw"); - if (onfinish) delete onfinish; - delete ondelay; - return 2; - } - if (onfail == MDS_TRAVERSE_FAIL) { - delete ondelay; - if (onfinish) { - onfinish->finish(-ENOENT); // -ENOENT, but only because i'm not the authority! - delete onfinish; - } - return -ENOENT; // not necessarily exactly true.... - } - } - - assert(0); // i shouldn't get here - } - - // success. - delete ondelay; - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } - return 0; -} - - - -void MDCache::open_remote_dir(CInode *diri, - Context *fin) -{ - dout(10) << "open_remote_dir on " << *diri << endl; - - assert(diri->is_dir()); - assert(!diri->dir_is_auth()); - assert(!diri->is_auth()); - assert(diri->dir == 0); - - filepath want; // no dentries, i just want the dir open - mds->send_message_mds(new MDiscover(mds->get_nodeid(), - diri->ino(), - want, - true), // need the dir open - diri->authority(), MDS_PORT_CACHE); - - diri->add_waiter(CINODE_WAIT_DIR, fin); -} - - - -class C_MDC_OpenRemoteInoLookup : public Context { - MDCache *mdc; - inodeno_t ino; - Message *req; - Context *onfinish; -public: - vector anchortrace; - C_MDC_OpenRemoteInoLookup(MDCache *mdc, inodeno_t ino, Message *req, Context *onfinish) { - this->mdc = mdc; - this->ino = ino; - this->req = req; - this->onfinish = onfinish; - } - void finish(int r) { - assert(r == 0); - if (r == 0) - mdc->open_remote_ino_2(ino, req, anchortrace, onfinish); - else { - onfinish->finish(r); - delete onfinish; - } - } -}; - -void MDCache::open_remote_ino(inodeno_t ino, - Message *req, - Context *onfinish) -{ - dout(7) << "open_remote_ino on " << ino << endl; - - C_MDC_OpenRemoteInoLookup *c = new C_MDC_OpenRemoteInoLookup(this, ino, req, onfinish); - mds->anchorclient->lookup(ino, c->anchortrace, c); -} - -void MDCache::open_remote_ino_2(inodeno_t ino, - Message *req, - vector& anchortrace, - Context 
*onfinish) -{ - dout(7) << "open_remote_ino_2 on " << ino << ", trace depth is " << anchortrace.size() << endl; - - // construct path - filepath path; - for (unsigned i=0; iref_dn); - - dout(7) << " path is " << path << endl; - - vector trace; - int r = path_traverse(path, trace, false, - req, - onfinish, // delay actually - MDS_TRAVERSE_DISCOVER); - if (r > 0) return; - - onfinish->finish(r); - delete onfinish; -} - - - - -// path pins - -bool MDCache::path_pin(vector& trace, - Message *m, - Context *c) -{ - // verify everything is pinnable - for (vector::iterator it = trace.begin(); - it != trace.end(); - it++) { - CDentry *dn = *it; - if (!dn->is_pinnable(m)) { - // wait - if (c) { - dout(10) << "path_pin can't pin " << *dn << ", waiting" << endl; - dn->dir->add_waiter(CDIR_WAIT_DNPINNABLE, - dn->name, - c); - } else { - dout(10) << "path_pin can't pin, no waiter, failing." << endl; - } - return false; - } - } - - // pin! - for (vector::iterator it = trace.begin(); - it != trace.end(); - it++) { - (*it)->pin(m); - dout(11) << "path_pinned " << *(*it) << endl; - } - - delete c; - return true; -} - - -void MDCache::path_unpin(vector& trace, - Message *m) -{ - for (vector::iterator it = trace.begin(); - it != trace.end(); - it++) { - CDentry *dn = *it; - dn->unpin(m); - dout(11) << "path_unpinned " << *dn << endl; - - // did we completely unpin a waiter? - if (dn->lockstate == DN_LOCK_UNPINNING && !dn->get_num_ref()) { - // return state to sync, in case the unpinner flails - dn->lockstate = DN_LOCK_SYNC; - - // run finisher right now to give them a fair shot. 
- dn->dir->finish_waiting(CDIR_WAIT_DNUNPINNED, dn->name); - } - } -} - - -void MDCache::make_trace(vector& trace, CInode *in) -{ - CInode *parent = in->get_parent_inode(); - if (parent) { - make_trace(trace, parent); - - CDentry *dn = in->get_parent_dn(); - dout(15) << "make_trace adding " << *dn << endl; - trace.push_back(dn); - } -} - - -bool MDCache::request_start(Message *req, - CInode *ref, - vector& trace) -{ - assert(active_requests.count(req) == 0); - - // pin path - if (trace.size()) { - if (!path_pin(trace, req, new C_MDS_RetryMessage(mds,req))) return false; - } - - dout(7) << "request_start " << *req << endl; - - // add to map - active_requests[req].ref = ref; - if (trace.size()) active_requests[req].traces[trace[trace.size()-1]] = trace; - - // request pins - request_pin_inode(req, ref); - - if (mds->logger) mds->logger->inc("req"); - - return true; -} - - -void MDCache::request_pin_inode(Message *req, CInode *in) -{ - if (active_requests[req].request_pins.count(in) == 0) { - in->request_pin_get(); - active_requests[req].request_pins.insert(in); - } -} - -void MDCache::request_pin_dir(Message *req, CDir *dir) -{ - if (active_requests[req].request_dir_pins.count(dir) == 0) { - dir->request_pin_get(); - active_requests[req].request_dir_pins.insert(dir); - } -} - - -void MDCache::request_cleanup(Message *req) -{ - assert(active_requests.count(req) == 1); - - // leftover xlocks? - if (active_requests[req].xlocks.size()) { - set dns = active_requests[req].xlocks; - - for (set::iterator it = dns.begin(); - it != dns.end(); - it++) { - CDentry *dn = *it; - - dout(7) << "request_cleanup leftover xlock " << *dn << endl; - - mds->locker->dentry_xlock_finish(dn); - - // queue finishers - dn->dir->take_waiting(CDIR_WAIT_ANY, dn->name, mds->finished_queue); - - // remove clean, null dentry? 
(from a failed rename or whatever) - if (dn->is_null() && dn->is_sync() && !dn->is_dirty()) { - dn->dir->remove_dentry(dn); - } - } - - assert(active_requests[req].xlocks.empty()); // we just finished finished them - } - - // foreign xlocks? - if (active_requests[req].foreign_xlocks.size()) { - set dns = active_requests[req].foreign_xlocks; - active_requests[req].foreign_xlocks.clear(); - - for (set::iterator it = dns.begin(); - it != dns.end(); - it++) { - CDentry *dn = *it; - - dout(7) << "request_cleanup sending unxlock for foreign xlock on " << *dn << endl; - assert(dn->is_xlocked()); - int dauth = dn->dir->dentry_authority(dn->name); - MLock *m = new MLock(LOCK_AC_UNXLOCK, mds->get_nodeid()); - m->set_dn(dn->dir->ino(), dn->name); - mds->send_message_mds(m, dauth, MDS_PORT_CACHE); - } - } - - // unpin paths - for (map< CDentry*, vector >::iterator it = active_requests[req].traces.begin(); - it != active_requests[req].traces.end(); - it++) { - path_unpin(it->second, req); - } - - // request pins - for (set::iterator it = active_requests[req].request_pins.begin(); - it != active_requests[req].request_pins.end(); - it++) { - (*it)->request_pin_put(); - } - for (set::iterator it = active_requests[req].request_dir_pins.begin(); - it != active_requests[req].request_dir_pins.end(); - it++) { - (*it)->request_pin_put(); - } - - // remove from map - active_requests.erase(req); - - - // log some stats ***** - if (mds->logger) { - mds->logger->set("c", lru.lru_get_size()); - mds->logger->set("cpin", lru.lru_get_num_pinned()); - mds->logger->set("ctop", lru.lru_get_top()); - mds->logger->set("cbot", lru.lru_get_bot()); - mds->logger->set("cptail", lru.lru_get_pintail()); - //mds->logger->set("buf",buffer_total_alloc); - } - - if (g_conf.log_pins) { - // pin - /* -for (int i=0; ilogger2) mds->logger2->set(cinode_pin_names[i], - cinode_pins[i]); - } - */ - /* - for (map::iterator it = cdir_pins.begin(); - it != cdir_pins.end(); - it++) { - //string s = "D"; - //s += 
cdir_pin_names[it->first]; - if (mds->logger2) mds->logger2->set(//s, - cdir_pin_names[it->first], - it->second); - } - */ - } - -} - -void MDCache::request_finish(Message *req) -{ - dout(7) << "request_finish " << *req << endl; - request_cleanup(req); - delete req; // delete req - - if (mds->logger) mds->logger->inc("reply"); - - - //dump(); -} - - -void MDCache::request_forward(Message *req, int who, int port) -{ - if (!port) port = MDS_PORT_SERVER; - - dout(7) << "request_forward to " << who << " req " << *req << endl; - request_cleanup(req); - mds->send_message_mds(req, who, port); - - if (mds->logger) mds->logger->inc("fw"); -} - - - -// ANCHORS - -class C_MDC_AnchorInode : public Context { - CInode *in; - -public: - C_MDC_AnchorInode(CInode *in) { - this->in = in; - } - void finish(int r) { - if (r == 0) { - assert(in->inode.anchored == false); - in->inode.anchored = true; - - in->state_clear(CInode::STATE_ANCHORING); - in->put(CInode::PIN_ANCHORING); - - in->_mark_dirty(); // fixme - } - - // trigger - in->finish_waiting(CINODE_WAIT_ANCHORED, r); - } -}; - -void MDCache::anchor_inode(CInode *in, Context *onfinish) -{ - assert(in->is_auth()); - - // already anchoring? 
- if (in->state_test(CInode::STATE_ANCHORING)) { - dout(7) << "anchor_inode already anchoring " << *in << endl; - - // wait - in->add_waiter(CINODE_WAIT_ANCHORED, - onfinish); - - } else { - dout(7) << "anchor_inode anchoring " << *in << endl; - - // auth: do it - in->state_set(CInode::STATE_ANCHORING); - in->get(CInode::PIN_ANCHORING); - - // wait - in->add_waiter(CINODE_WAIT_ANCHORED, - onfinish); - - // make trace - vector trace; - in->make_anchor_trace(trace); - - // do it - mds->anchorclient->create(in->ino(), trace, - new C_MDC_AnchorInode( in )); - } -} - - -void MDCache::handle_inode_link(MInodeLink *m) -{ - CInode *in = get_inode(m->get_ino()); - assert(in); - - if (!in->is_auth()) { - assert(in->is_proxy()); - dout(7) << "handle_inode_link not auth for " << *in << ", fw to auth" << endl; - mds->send_message_mds(m, in->authority(), MDS_PORT_CACHE); - return; - } - - dout(7) << "handle_inode_link on " << *in << endl; - - if (!in->is_anchored()) { - assert(in->inode.nlink == 1); - dout(7) << "needs anchor, nlink=" << in->inode.nlink << ", creating anchor" << endl; - - anchor_inode(in, - new C_MDS_RetryMessage(mds, m)); - return; - } - - in->inode.nlink++; - in->_mark_dirty(); // fixme - - // reply - dout(7) << " nlink++, now " << in->inode.nlink++ << endl; - - mds->send_message_mds(new MInodeLinkAck(m->get_ino(), true), m->get_from(), MDS_PORT_CACHE); - delete m; -} - - -void MDCache::handle_inode_link_ack(MInodeLinkAck *m) -{ - CInode *in = get_inode(m->get_ino()); - assert(in); - - dout(7) << "handle_inode_link_ack success = " << m->is_success() << " on " << *in << endl; - in->finish_waiting(CINODE_WAIT_LINK, - m->is_success() ? 1:-1); -} - - - -// REPLICAS - - -void MDCache::handle_discover(MDiscover *dis) -{ - int whoami = mds->get_nodeid(); - - // from me to me? - if (dis->get_asker() == whoami) { - dout(7) << "discover for " << dis->get_want().get_path() << " bounced back to me, dropping." 
<< endl; - delete dis; - return; - } - - CInode *cur = 0; - MDiscoverReply *reply = 0; - //filepath fullpath; - - // get started. - if (dis->get_base_ino() == 0) { - // wants root - dout(7) << "discover from mds" << dis->get_asker() << " wants root + " << dis->get_want().get_path() << endl; - - assert(mds->get_nodeid() == 0); - assert(root->is_auth()); - - //fullpath = dis->get_want(); - - - // add root - reply = new MDiscoverReply(0); - reply->add_inode( root->replicate_to( dis->get_asker() ) ); - dout(10) << "added root " << *root << endl; - - cur = root; - - } else { - // there's a base inode - cur = get_inode(dis->get_base_ino()); - assert(cur); - - if (dis->wants_base_dir()) { - dout(7) << "discover from mds" << dis->get_asker() << " has " << *cur << " wants dir+" << dis->get_want().get_path() << endl; - } else { - dout(7) << "discover from mds" << dis->get_asker() << " has " << *cur->dir << " wants " << dis->get_want().get_path() << endl; - } - - assert(cur->is_dir()); - - // crazyness? - if (!cur->dir && !cur->is_auth()) { - int iauth = cur->authority(); - dout(7) << "no dir and not inode auth; fwd to auth " << iauth << endl; - mds->send_message_mds( dis, iauth, MDS_PORT_CACHE); - return; - } - - // frozen_dir? 
- if (!cur->dir && cur->is_frozen_dir()) { - dout(7) << "is frozen_dir, waiting" << endl; - cur->get_parent_dir()->add_waiter(CDIR_WAIT_UNFREEZE, - new C_MDS_RetryMessage(mds, dis)); - return; - } - - if (!cur->dir) - cur->get_or_open_dir(this); - assert(cur->dir); - - dout(10) << "dir is " << *cur->dir << endl; - - // create reply - reply = new MDiscoverReply(cur->ino()); - } - - assert(reply); - assert(cur); - - /* - // first traverse and make sure we won't have to do any waiting - dout(10) << "traversing full discover path = " << fullpath << endl; - vector trav; - int r = path_traverse(fullpath, trav, dis, MDS_TRAVERSE_FAIL); - if (r > 0) - return; // fw or delay - dout(10) << "traverse finish w/o blocking, continuing" << endl; - // ok, now we know we won't block on dentry locks or readdir. - */ - - - // add content - // do some fidgeting to include a dir if they asked for the base dir, or just root. - for (unsigned i = 0; i < dis->get_want().depth() || dis->get_want().depth() == 0; i++) { - // add dir - if (reply->is_empty() && !dis->wants_base_dir()) { - dout(7) << "they don't want the base dir" << endl; - } else { - // is it actaully a dir at all? - if (!cur->is_dir()) { - dout(7) << "not a dir " << *cur << endl; - reply->set_flag_error_dir(); - break; - } - - // add dir - if (!cur->dir_is_auth()) { - dout(7) << *cur << " dir auth is someone else, i'm done" << endl; - break; - } - - // did we hit a frozen_dir? 
- if (!cur->dir && cur->is_frozen_dir()) { - dout(7) << *cur << " is frozen_dir, stopping" << endl; - break; - } - - if (!cur->dir) cur->get_or_open_dir(this); - - reply->add_dir( new CDirDiscover( cur->dir, - cur->dir->add_replica( dis->get_asker() ) ) ); - dout(7) << "added dir " << *cur->dir << endl; - } - if (dis->get_want().depth() == 0) break; - - // lookup dentry - int dentry_auth = cur->dir->dentry_authority( dis->get_dentry(i) ); - if (dentry_auth != mds->get_nodeid()) { - dout(7) << *cur->dir << "dentry " << dis->get_dentry(i) << " auth " << dentry_auth << ", i'm done." << endl; - break; // that's it for us! - } - - // get inode - CDentry *dn = cur->dir->lookup( dis->get_dentry(i) ); - - /* - if (dn && !dn->can_read()) { // xlocked? - dout(7) << "waiting on " << *dn << endl; - cur->dir->add_waiter(CDIR_WAIT_DNREAD, - dn->name, - new C_MDS_RetryMessage(mds, dis)); - return; - } - */ - - if (dn) { - if (!dn->inode && dn->is_sync()) { - dout(7) << "mds" << whoami << " dentry " << dis->get_dentry(i) << " null in " << *cur->dir << ", returning error" << endl; - reply->set_flag_error_dn( dis->get_dentry(i) ); - break; // don't replicate null but non-locked dentries. - } - - reply->add_dentry( dn->replicate_to( dis->get_asker() ) ); - dout(7) << "added dentry " << *dn << endl; - - if (!dn->inode) break; // we're done. - } - - if (dn && dn->inode) { - CInode *next = dn->inode; - assert(next->is_auth()); - - // add inode - //int nonce = next->cached_by_add(dis->get_asker()); - reply->add_inode( next->replicate_to( dis->get_asker() ) ); - dout(7) << "added inode " << *next << endl;// " nonce=" << nonce<< endl; - - // descend - cur = next; - } else { - // don't have inode? 
- if (cur->dir->is_complete()) { - // set error flag in reply - dout(7) << "mds" << whoami << " dentry " << dis->get_dentry(i) << " not found in " << *cur->dir << ", returning error" << endl; - reply->set_flag_error_dn( dis->get_dentry(i) ); - break; - } else { - // readdir - dout(7) << "mds" << whoami << " incomplete dir contents for " << *cur->dir << ", fetching" << endl; - - //mds->mdstore->fetch_dir(cur->dir, NULL); //new C_MDS_RetryMessage(mds, dis)); - //break; // send what we have so far - - mds->mdstore->fetch_dir(cur->dir, new C_MDS_RetryMessage(mds, dis)); - return; - } - } - } - - // how did we do. - if (reply->is_empty()) { - - // discard empty reply - delete reply; - - if ((cur->is_auth() || cur->is_proxy() || cur->dir->is_proxy()) && - !cur->dir->is_auth()) { - // fwd to dir auth - int dirauth = cur->dir->authority(); - if (dirauth == dis->get_asker()) { - dout(7) << "from (new?) dir auth, dropping (obsolete) discover on floor." << endl; // XXX FIXME is this right? - //assert(dis->get_asker() == dis->get_source()); //might be a weird other loop. either way, asker has it. - delete dis; - } else { - dout(7) << "fwd to dir auth " << dirauth << endl; - mds->send_message_mds( dis, dirauth, MDS_PORT_CACHE ); - } - return; - } - - dout(7) << "i'm not auth or proxy, dropping (this empty reply). i bet i just exported." << endl; - //assert(0); - - } else { - // send back to asker - dout(7) << "sending result back to asker mds" << dis->get_asker() << endl; - mds->send_message_mds(reply, dis->get_asker(), MDS_PORT_CACHE); - } - - // done. - delete dis; -} - - -void MDCache::handle_discover_reply(MDiscoverReply *m) -{ - // starting point - CInode *cur; - list finished, error; - - if (m->has_root()) { - // nowhere! 
- dout(7) << "discover_reply root + " << m->get_path() << " " << m->get_num_inodes() << " inodes" << endl; - assert(!root); - assert(m->get_base_ino() == 0); - assert(!m->has_base_dentry()); - assert(!m->has_base_dir()); - - // add in root - cur = new CInode(this, false); - - m->get_inode(0).update_inode(cur); - - // root - set_root( cur ); - add_inode( cur ); - dout(7) << " got root: " << *cur << endl; - - // take waiters - finished.swap(waiting_for_root); - } else { - // grab inode - cur = get_inode(m->get_base_ino()); - - if (!cur) { - dout(7) << "discover_reply don't have base ino " << m->get_base_ino() << ", dropping" << endl; - delete m; - return; - } - - dout(7) << "discover_reply " << *cur << " + " << m->get_path() << ", have " << m->get_num_inodes() << " inodes" << endl; - } - - // fyi - if (m->is_flag_error_dir()) dout(7) << " flag error, dir" << endl; - if (m->is_flag_error_dn()) dout(7) << " flag error, dentry = " << m->get_error_dentry() << endl; - dout(10) << "depth is " << m->get_depth() << ", has_root = " << m->has_root() << endl; - - // loop over discover results. - // indexese follow each ([[dir] dentry] inode) - // can start, end with any type. - - for (int i=m->has_root(); iget_depth(); i++) { - dout(10) << "discover_reply i=" << i << " cur " << *cur << endl; - - // dir - if ((i > 0) || - (i == 0 && m->has_base_dir())) { - if (cur->dir) { - // had it - /* this is strange, but it happens when: - we discover multiple dentries under a dir. - bc, no flag to indicate a dir discover is underway, (as there is w/ a dentry one). - this is actually good, since (dir aside) they're asking for different information. 
- */ - dout(7) << "had " << *cur->dir; - m->get_dir(i).update_dir(cur->dir); - dout2(7) << ", now " << *cur->dir << endl; - } else { - // add it (_replica_) - cur->set_dir( new CDir(cur, this, false) ); - m->get_dir(i).update_dir(cur->dir); - dout(7) << "added " << *cur->dir << " nonce " << cur->dir->replica_nonce << endl; - - // get waiters - cur->take_waiting(CINODE_WAIT_DIR, finished); - } - } - - // dentry error? - if (i == m->get_depth()-1 && - m->is_flag_error_dn()) { - // error! - assert(cur->is_dir()); - if (cur->dir) { - dout(7) << " flag_error on dentry " << m->get_error_dentry() << ", triggering dentry?" << endl; - cur->dir->take_waiting(CDIR_WAIT_DENTRY, - m->get_error_dentry(), - error); - } else { - dout(7) << " flag_error on dentry " << m->get_error_dentry() << ", triggering dir?" << endl; - cur->take_waiting(CINODE_WAIT_DIR, error); - } - break; - } - - if (i >= m->get_num_dentries()) break; - - // dentry - dout(7) << "i = " << i << " dentry is " << m->get_dentry(i).get_dname() << endl; - - CDentry *dn = 0; - if (i > 0 || - m->has_base_dentry()) { - dn = cur->dir->lookup( m->get_dentry(i).get_dname() ); - - if (dn) { - dout(7) << "had " << *dn << endl; - dn->replica_nonce = m->get_dentry(i).get_nonce(); // fix nonce. 
- } else { - dn = cur->dir->add_dentry( m->get_dentry(i).get_dname(), 0, false ); - m->get_dentry(i).update_dentry(dn); - dout(7) << "added " << *dn << endl; - } - - cur->dir->take_waiting(CDIR_WAIT_DENTRY, - m->get_dentry(i).get_dname(), - finished); - } - - if (i >= m->get_num_inodes()) break; - - // inode - dout(7) << "i = " << i << " ino is " << m->get_ino(i) << endl; - CInode *in = get_inode( m->get_inode(i).get_ino() ); - assert(dn); - - if (in) { - dout(7) << "had " << *in << endl; - - // fix nonce - dout(7) << " my nonce is " << in->replica_nonce << ", taking from discover, which has " << m->get_inode(i).get_replica_nonce() << endl; - in->replica_nonce = m->get_inode(i).get_replica_nonce(); - - if (dn && in != dn->inode) { - dout(7) << " but it's not linked via dentry " << *dn << endl; - // link - if (dn->inode) { - dout(7) << "dentry WAS linked to " << *dn->inode << endl; - assert(0); // WTF. - } - dn->dir->link_inode(dn, in); - } - } - else { - assert(dn->inode == 0); // better not be something else linked to this dentry... - - // didn't have it. - in = new CInode(this, false); - - m->get_inode(i).update_inode(in); - - // link in - add_inode( in ); - dn->dir->link_inode(dn, in); - - dout(7) << "added " << *in << " nonce " << in->replica_nonce << endl; - } - - // onward! - cur = in; - } - - // dir error at the end there? 
- if (m->is_flag_error_dir()) { - dout(7) << " flag_error on dir " << *cur << endl; - assert(!cur->is_dir()); - cur->take_waiting(CINODE_WAIT_DIR, error); - } - - // finish errors directly - finish_contexts(error, -ENOENT); - - mds->queue_finished(finished); - - // done - delete m; -} - - - - - - - - -/* -int MDCache::send_inode_updates(CInode *in) -{ - assert(in->is_auth()); - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - dout(7) << "sending inode_update on " << *in << " to " << *it << endl; - assert(*it != mds->get_nodeid()); - mds->send_message_mds(new MInodeUpdate(in, in->get_cached_by_nonce(*it)), *it, MDS_PORT_CACHE); - } - - return 0; -} - - -void MDCache::handle_inode_update(MInodeUpdate *m) -{ - inodeno_t ino = m->get_ino(); - CInode *in = get_inode(m->get_ino()); - if (!in) { - //dout(7) << "inode_update on " << m->get_ino() << ", don't have it, ignoring" << endl; - dout(7) << "inode_update on " << m->get_ino() << ", don't have it, sending expire" << endl; - MCacheExpire *expire = new MCacheExpire(mds->get_nodeid()); - expire->add_inode(m->get_ino(), m->get_nonce()); - mds->send_message_mds(expire, m->get_source().num(), MDS_PORT_CACHE); - goto out; - } - - if (in->is_auth()) { - dout(7) << "inode_update on " << *in << ", but i'm the authority!" << endl; - assert(0); // this should never happen - } - - dout(7) << "inode_update on " << *in << endl; - - // update! NOTE dir_auth is unaffected by this. 
- in->decode_basic_state(m->get_payload()); - - out: - // done - delete m; -} -*/ - - - -void MDCache::handle_cache_expire(MCacheExpire *m) -{ - int from = m->get_from(); - int source = m->get_source().num(); - map proxymap; - - if (m->get_from() == source) { - dout(7) << "cache_expire from mds" << from << endl; - } else { - dout(7) << "cache_expire from mds" << from << " via " << source << endl; - } - - // inodes - for (map::iterator it = m->get_inodes().begin(); - it != m->get_inodes().end(); - it++) { - CInode *in = get_inode(it->first); - int nonce = it->second; - - if (!in) { - dout(0) << "inode expire on " << it->first << " from " << from << ", don't have it" << endl; - assert(in); // i should be authority, or proxy .. and pinned - } - if (!in->is_auth()) { - int newauth = in->authority(); - dout(7) << "proxy inode expire on " << *in << " to " << newauth << endl; - assert(newauth >= 0); - if (!in->state_test(CInode::STATE_PROXY)) dout(0) << "missing proxy bit on " << *in << endl; - assert(in->state_test(CInode::STATE_PROXY)); - if (proxymap.count(newauth) == 0) proxymap[newauth] = new MCacheExpire(from); - proxymap[newauth]->add_inode(it->first, it->second); - continue; - } - - // check nonce - if (from == mds->get_nodeid()) { - // my cache_expire, and the export_dir giving auth back to me crossed paths! - // we can ignore this. no danger of confusion since the two parties are both me. - dout(7) << "inode expire on " << *in << " from mds" << from << " .. ME! ignoring." << endl; - } - else if (nonce == in->get_replica_nonce(from)) { - // remove from our cached_by - dout(7) << "inode expire on " << *in << " from mds" << from << " cached_by was " << in->get_replicas() << endl; - inode_remove_replica(in, from); - - } - else { - // this is an old nonce, ignore expire. 
- dout(7) << "inode expire on " << *in << " from mds" << from - << " with old nonce " << nonce << " (current " << in->get_replica_nonce(from) << "), dropping" - << endl; - assert(in->get_replica_nonce(from) > nonce); - } - } - - // dirs - for (map::iterator it = m->get_dirs().begin(); - it != m->get_dirs().end(); - it++) { - CInode *diri = get_inode(it->first); - assert(diri); - CDir *dir = diri->dir; - int nonce = it->second; - - if (!dir) { - dout(0) << "dir expire on " << it->first << " from " << from << ", don't have it" << endl; - assert(dir); // i should be authority, or proxy ... and pinned - } - if (!dir->is_auth()) { - int newauth = dir->authority(); - dout(7) << "proxy dir expire on " << *dir << " to " << newauth << endl; - if (!dir->is_proxy()) dout(0) << "nonproxy dir expire? " << *dir << " .. auth is " << newauth << " .. expire is from " << from << endl; - assert(dir->is_proxy()); - assert(newauth >= 0); - assert(dir->state_test(CDIR_STATE_PROXY)); - if (proxymap.count(newauth) == 0) proxymap[newauth] = new MCacheExpire(from); - proxymap[newauth]->add_dir(it->first, it->second); - continue; - } - - // check nonce - if (from == mds->get_nodeid()) { - dout(7) << "dir expire on " << *dir << " from mds" << from - << " .. ME! ignoring" << endl; - } - else if (nonce == dir->get_replica_nonce(from)) { - // remove from our cached_by - dout(7) << "dir expire on " << *dir << " from mds" << from - << " replicas was " << dir->replicas << endl; - dir->remove_replica(from); - } - else { - // this is an old nonce, ignore expire. 
- dout(7) << "dir expire on " << *dir << " from mds" << from - << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from) - << "), dropping" << endl; - assert(dir->get_replica_nonce(from) > nonce); - } - } - - // dentries - for (map >::iterator pd = m->get_dentries().begin(); - pd != m->get_dentries().end(); - ++pd) { - dout(0) << "dn expires in dir " << pd->first << endl; - CInode *diri = get_inode(pd->first); - CDir *dir = diri->dir; - assert(dir); - - if (!dir->is_auth()) { - int newauth = dir->authority(); - dout(7) << "proxy dentry expires on " << *dir << " to " << newauth << endl; - if (!dir->is_proxy()) - dout(0) << "nonproxy dentry expires? " << *dir << " .. auth is " << newauth - << " .. expire is from " << from << endl; - assert(dir->is_proxy()); - assert(newauth >= 0); - assert(dir->state_test(CDIR_STATE_PROXY)); - if (proxymap.count(newauth) == 0) proxymap[newauth] = new MCacheExpire(from); - proxymap[newauth]->add_dentries(pd->first, pd->second); - continue; - } - - for (map::iterator p = pd->second.begin(); - p != pd->second.end(); - ++p) { - int nonce = p->second; - - CDentry *dn = dir->lookup(p->first); - if (!dn) - dout(0) << "missing dentry for " << p->first << " in " << *dir << endl; - assert(dn); - - if (from == mds->get_nodeid()) { - dout(7) << "dentry_expire on " << *dn << " from mds" << from - << " .. ME! 
ignoring" << endl; - } - else if (nonce == dn->get_replica_nonce(from)) { - dout(7) << "dentry_expire on " << *dn << " from mds" << from << endl; - dn->remove_replica(from); - } - else { - dout(7) << "dentry_expire on " << *dn << " from mds" << from - << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from) - << "), dropping" << endl; - assert(dn->get_replica_nonce(from) > nonce); - } - } - } - - // send proxy forwards - for (map::iterator it = proxymap.begin(); - it != proxymap.end(); - it++) { - dout(7) << "sending proxy forward to " << it->first << endl; - mds->send_message_mds(it->second, it->first, MDS_PORT_CACHE); - } - - // done - delete m; -} - -void MDCache::inode_remove_replica(CInode *in, int from) -{ - in->remove_replica(from); - in->mds_caps_wanted.erase(from); - - // note: this code calls _eval more often than it needs to! - // fix lock - if (in->hardlock.is_gathering(from)) { - in->hardlock.gather_set.erase(from); - if (in->hardlock.gather_set.size() == 0) - mds->locker->inode_hard_eval(in); - } - if (in->filelock.is_gathering(from)) { - in->filelock.gather_set.erase(from); - if (in->filelock.gather_set.size() == 0) - mds->locker->inode_file_eval(in); - } - - // alone now? 
- if (!in->is_replicated()) { - mds->locker->inode_hard_eval(in); - mds->locker->inode_file_eval(in); - } -} - - -int MDCache::send_dir_updates(CDir *dir, bool bcast) -{ - // this is an FYI, re: replication - - set who; - if (bcast) { - mds->get_mds_map()->get_active_mds_set(who); - } else { - for (map::iterator p = dir->replicas_begin(); - p != dir->replicas_end(); - ++p) - who.insert(p->first); - } - - dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << endl; - - string path; - dir->inode->make_path(path); - - int whoami = mds->get_nodeid(); - for (set::iterator it = who.begin(); - it != who.end(); - it++) { - if (*it == whoami) continue; - //if (*it == except) continue; - dout(7) << "sending dir_update on " << *dir << " to " << *it << endl; - - mds->send_message_mds(new MDirUpdate(dir->ino(), - dir->dir_rep, - dir->dir_rep_by, - path, - bcast), - *it, MDS_PORT_CACHE); - } - - return 0; -} - - -void MDCache::handle_dir_update(MDirUpdate *m) -{ - CInode *in = get_inode(m->get_ino()); - if (!in || !in->dir) { - dout(5) << "dir_update on " << m->get_ino() << ", don't have it" << endl; - - // discover it? - if (m->should_discover()) { - m->tried_discover(); // only once! 
- vector trace; - filepath path = m->get_path(); - - dout(5) << "trying discover on dir_update for " << path << endl; - - int r = path_traverse(path, trace, true, - m, new C_MDS_RetryMessage(mds, m), - MDS_TRAVERSE_DISCOVER); - if (r > 0) - return; - if (r == 0) { - assert(in); - open_remote_dir(in, new C_MDS_RetryMessage(mds, m)); - return; - } - assert(0); - } - - goto out; - } - - // update - dout(5) << "dir_update on " << *in->dir << endl; - in->dir->dir_rep = m->get_dir_rep(); - in->dir->dir_rep_by = m->get_dir_rep_by(); - - // done - out: - delete m; -} - - - - - -class C_MDC_DentryUnlink : public Context { -public: - MDCache *mdc; - CDentry *dn; - CDir *dir; - Context *c; - C_MDC_DentryUnlink(MDCache *mdc, CDentry *dn, CDir *dir, Context *c) { - this->mdc = mdc; - this->dn = dn; - this->dir = dir; - this->c = c; - } - void finish(int r) { - assert(r == 0); - mdc->dentry_unlink_finish(dn, dir, c); - } -}; - - -// NAMESPACE FUN - -void MDCache::dentry_unlink(CDentry *dn, Context *c) -{ - CDir *dir = dn->dir; - string dname = dn->name; - - assert(dn->lockstate == DN_LOCK_XLOCK); - - // i need the inode to do any of this properly - assert(dn->inode); - - // log it - if (dn->inode) dn->inode->mark_unsafe(); // XXX ??? FIXME - mds->mdlog->submit_entry(new EString("unlink fixme fixme"),//EUnlink(dir, dn, dn->inode), - NULL); // FIXME FIXME FIXME - - // tell replicas - if (dir->is_replicated()) { - for (map::iterator it = dir->replicas_begin(); - it != dir->replicas_end(); - it++) { - dout(7) << "inode_unlink sending DentryUnlink to mds" << it->first << endl; - - mds->send_message_mds(new MDentryUnlink(dir->ino(), dn->name), it->first, MDS_PORT_CACHE); - } - - // don't need ack. - } - - - // inode deleted? - if (dn->is_primary()) { - assert(dn->inode->is_auth()); - dn->inode->inode.nlink--; - - if (dn->inode->is_dir()) assert(dn->inode->inode.nlink == 0); // no hard links on dirs - - // last link? 
- if (dn->inode->inode.nlink == 0) { - // truly dangling - if (dn->inode->dir) { - // mark dir clean too, since it now dne! - assert(dn->inode->dir->is_auth()); - dn->inode->dir->state_set(CDIR_STATE_DELETED); - dn->inode->dir->remove_null_dentries(); - dn->inode->dir->mark_clean(); - } - - // mark it clean, it's dead - if (dn->inode->is_dirty()) - dn->inode->mark_clean(); - - } else { - // migrate to inode file - dout(7) << "removed primary, but there are remote links, moving to inode file: " << *dn->inode << endl; - - // dangling but still linked. - assert(dn->inode->is_anchored()); - - // unlink locally - CInode *in = dn->inode; - dn->dir->unlink_inode( dn ); - dn->_mark_dirty(); // fixme - - // mark it dirty! - in->_mark_dirty(); // fixme - - // update anchor to point to inode file+mds - vector atrace; - in->make_anchor_trace(atrace); - assert(atrace.size() == 1); // it's dangling - mds->anchorclient->update(in->ino(), atrace, - new C_MDC_DentryUnlink(this, dn, dir, c)); - return; - } - } - else if (dn->is_remote()) { - // need to dec nlink on primary - if (dn->inode->is_auth()) { - // awesome, i can do it - dout(7) << "remote target is local, nlink--" << endl; - dn->inode->inode.nlink--; - dn->inode->_mark_dirty(); // fixme - - if (( dn->inode->state_test(CInode::STATE_DANGLING) && dn->inode->inode.nlink == 0) || - (!dn->inode->state_test(CInode::STATE_DANGLING) && dn->inode->inode.nlink == 1)) { - dout(7) << "nlink=1+primary or 0+dangling, removing anchor" << endl; - - // remove anchor (async) - mds->anchorclient->destroy(dn->inode->ino(), NULL); - } - } else { - int auth = dn->inode->authority(); - dout(7) << "remote target is remote, sending unlink request to " << auth << endl; - - mds->send_message_mds(new MInodeUnlink(dn->inode->ino(), mds->get_nodeid()), - auth, MDS_PORT_CACHE); - - // unlink locally - CInode *in = dn->inode; - dn->dir->unlink_inode( dn ); - dn->_mark_dirty(); // fixme - - // add waiter - in->add_waiter(CINODE_WAIT_UNLINK, c); - return; 
- } - } - else - assert(0); // unlink on null dentry?? - - // unlink locally - dn->dir->unlink_inode( dn ); - dn->_mark_dirty(); // fixme - - // finish! - dentry_unlink_finish(dn, dir, c); -} - - -void MDCache::dentry_unlink_finish(CDentry *dn, CDir *dir, Context *c) -{ - dout(7) << "dentry_unlink_finish on " << *dn << endl; - string dname = dn->name; - - // unpin dir / unxlock - mds->locker->dentry_xlock_finish(dn, true); // quiet, no need to bother replicas since they're already unlinking - - // did i empty out an imported dir? - if (dir->is_import() && !dir->inode->is_root() && dir->get_size() == 0) - migrator->export_empty_import(dir); - - // wake up any waiters - dir->take_waiting(CDIR_WAIT_ANY, dname, mds->finished_queue); - - c->finish(0); -} - - - - -void MDCache::handle_dentry_unlink(MDentryUnlink *m) -{ - CInode *diri = get_inode(m->get_dirino()); - CDir *dir = 0; - if (diri) dir = diri->dir; - - if (!diri || !dir) { - dout(7) << "handle_dentry_unlink don't have dir " << m->get_dirino() << endl; - } - else { - CDentry *dn = dir->lookup(m->get_dn()); - if (!dn) { - dout(7) << "handle_dentry_unlink don't have dentry " << *dir << " dn " << m->get_dn() << endl; - } else { - dout(7) << "handle_dentry_unlink on " << *dn << endl; - - // dir? - if (dn->inode) { - if (dn->inode->dir) { - dn->inode->dir->state_set(CDIR_STATE_DELETED); - dn->inode->dir->remove_null_dentries(); - } - } - - string dname = dn->name; - - // unlink - dn->dir->remove_dentry(dn); - - // wake up - //dir->finish_waiting(CDIR_WAIT_DNREAD, dname); - dir->take_waiting(CDIR_WAIT_DNREAD, dname, mds->finished_queue); - } - } - - delete m; - return; -} - - -void MDCache::handle_inode_unlink(MInodeUnlink *m) -{ - CInode *in = get_inode(m->get_ino()); - assert(in); - - // proxy? - if (in->is_proxy()) { - dout(7) << "handle_inode_unlink proxy on " << *in << endl; - mds->send_message_mds(m, in->authority(), MDS_PORT_CACHE); - return; - } - assert(in->is_auth()); - - // do it. 
- dout(7) << "handle_inode_unlink nlink=" << in->inode.nlink << " on " << *in << endl; - assert(in->inode.nlink > 0); - in->inode.nlink--; - - if (in->state_test(CInode::STATE_DANGLING)) { - // already dangling. - // last link? - if (in->inode.nlink == 0) { - dout(7) << "last link, marking clean and removing anchor" << endl; - - in->mark_clean(); // mark it clean. - - // remove anchor (async) - mds->anchorclient->destroy(in->ino(), NULL); - } - else { - in->_mark_dirty(); // fixme - } - } else { - // has primary link still. - assert(in->inode.nlink >= 1); - in->_mark_dirty(); // fixme - - if (in->inode.nlink == 1) { - dout(7) << "nlink=1, removing anchor" << endl; - - // remove anchor (async) - mds->anchorclient->destroy(in->ino(), NULL); - } - } - - // ack - mds->send_message_mds(new MInodeUnlinkAck(m->get_ino()), m->get_from(), MDS_PORT_CACHE); -} - -void MDCache::handle_inode_unlink_ack(MInodeUnlinkAck *m) -{ - CInode *in = get_inode(m->get_ino()); - assert(in); - - dout(7) << "handle_inode_unlink_ack on " << *in << endl; - in->finish_waiting(CINODE_WAIT_UNLINK, 0); -} - - - - - - - - - - -/* - * some import/export helpers - */ - -/** con = get_auth_container(dir) - * Returns the directory in which authority is delegated for *dir. - * This may be because a directory is an import, or because it is hashed - * and we are nested underneath an inode in that dir (that hashes to us). - * Thus do not assume result->is_auth()! It is_auth() || is_hashed(). 
- */ -CDir *MDCache::get_auth_container(CDir *dir) -{ - CDir *imp = dir; // might be *dir - - // find the underlying import or hash that delegates dir - while (true) { - if (imp->is_import()) break; // import - imp = imp->get_parent_dir(); - if (!imp) break; // none - if (imp->is_hashed()) break; // hash - } - - return imp; -} - -CDir *MDCache::get_export_container(CDir *dir) -{ - CDir *ex = dir; // might be *dir - assert(!ex->is_auth()); - - // find the underlying import or hash that delegates dir away - while (true) { - if (ex->is_export()) break; // import - ex = ex->get_parent_dir(); - assert(ex); - if (ex->is_hashed()) break; // hash - } - - return ex; -} - - -void MDCache::find_nested_exports(CDir *dir, set& s) -{ - CDir *import = get_auth_container(dir); - find_nested_exports_under(import, dir, s); -} - -void MDCache::find_nested_exports_under(CDir *import, CDir *dir, set& s) -{ - dout(10) << "find_nested_exports for " << *dir << endl; - dout(10) << "find_nested_exports_under import " << *import << endl; - - if (import == dir) { - // yay, my job is easy! - for (set::iterator p = nested_exports[import].begin(); - p != nested_exports[import].end(); - p++) { - CDir *nested = *p; - s.insert(nested); - dout(10) << "find_nested_exports " << *dir << " " << *nested << endl; - } - return; - } - - // ok, my job is annoying. 
- for (set::iterator p = nested_exports[import].begin(); - p != nested_exports[import].end(); - p++) { - CDir *nested = *p; - - dout(12) << "find_nested_exports checking " << *nested << endl; - - // trace back to import, or dir - CDir *cur = nested->get_parent_dir(); - while (!cur->is_import() || cur == dir) { - if (cur == dir) { - s.insert(nested); - dout(10) << "find_nested_exports " << *dir << " " << *nested << endl; - break; - } else { - cur = cur->get_parent_dir(); - } - } - } -} - - - - - - - - - - - - - - - - - - -// ============================================================== -// debug crap - - -void MDCache::show_imports() -{ - int db = 10; - - if (imports.empty() && - hashdirs.empty()) { - dout(db) << "show_imports: no imports/exports/hashdirs" << endl; - return; - } - dout(db) << "show_imports:" << endl; - - set ecopy = exports; - - set::iterator it = hashdirs.begin(); - while (1) { - if (it == hashdirs.end()) it = imports.begin(); - if (it == imports.end() ) break; - - CDir *im = *it; - - if (im->is_import()) { - dout(db) << " + import (" << im->popularity[MDS_POP_CURDOM] << "/" << im->popularity[MDS_POP_ANYDOM] << ") " << *im << endl; - //assert( im->is_auth() ); - } - else if (im->is_hashed()) { - if (im->is_import()) continue; // if import AND hash, list as import. 
- dout(db) << " + hash (" << im->popularity[MDS_POP_CURDOM] << "/" << im->popularity[MDS_POP_ANYDOM] << ") " << *im << endl; - } - - for (set::iterator p = nested_exports[im].begin(); - p != nested_exports[im].end(); - p++) { - CDir *exp = *p; - if (exp->is_hashed()) { - //assert(0); // we don't do it this way actually - dout(db) << " - hash (" << exp->popularity[MDS_POP_NESTED] << ", " << exp->popularity[MDS_POP_ANYDOM] << ") " << *exp << " to " << exp->dir_auth << endl; - //assert( !exp->is_auth() ); - } else { - dout(db) << " - ex (" << exp->popularity[MDS_POP_NESTED] << ", " << exp->popularity[MDS_POP_ANYDOM] << ") " << *exp << " to " << exp->dir_auth << endl; - assert( exp->is_export() ); - //assert( !exp->is_auth() ); - } - - if ( get_auth_container(exp) != im ) { - dout(1) << "uh oh, auth container is " << *get_auth_container(exp) << endl; - assert( get_auth_container(exp) == im ); - } - - if (ecopy.count(exp) != 1) { - dout(1) << "***** nested_export " << *exp << " not in exports" << endl; - assert(0); - } - ecopy.erase(exp); - } - - it++; - } - - if (ecopy.size()) { - for (set::iterator it = ecopy.begin(); - it != ecopy.end(); - it++) - dout(1) << "***** stray item in exports: " << **it << endl; - assert(ecopy.size() == 0); - } -} - - -void MDCache::show_cache() -{ - dout(7) << "show_cache" << endl; - - for (hash_map::iterator it = inode_map.begin(); - it != inode_map.end(); - it++) { - dout(7) << *((*it).second) << endl; - - CDentry *dn = (*it).second->get_parent_dn(); - if (dn) - dout(7) << " dn " << *dn << endl; - if ((*it).second->dir) - dout(7) << " subdir " << *(*it).second->dir << endl; - } -} - diff --git a/tags/20070517_before_mds_merge/mds/MDCache.h b/tags/20070517_before_mds_merge/mds/MDCache.h deleted file mode 100644 index 7b8825f073726..0000000000000 --- a/tags/20070517_before_mds_merge/mds/MDCache.h +++ /dev/null @@ -1,364 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file 
system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __MDCACHE_H -#define __MDCACHE_H - -#include -#include -#include -#include -#include - -#include "include/types.h" -#include "include/filepath.h" - -#include "CInode.h" -#include "CDentry.h" -#include "CDir.h" -#include "Lock.h" - - -class MDS; -class Migrator; -class Renamer; - -class Logger; - -class Message; - -class MMDSImportMap; -class MMDSCacheRejoin; -class MMDSCacheRejoinAck; -class MDiscover; -class MDiscoverReply; -class MCacheExpire; -class MDirUpdate; -class MDentryUnlink; -class MLock; - - -class MClientRequest; - - -// MDCache - -//typedef const char* pchar; - - - -/** active_request_t - * state we track for requests we are currently processing. - * mostly information about locks held, so that we can drop them all - * the request is finished or forwarded. see request_*(). - */ -typedef struct { - CInode *ref; // reference inode - set< CInode* > request_pins; - set< CDir* > request_dir_pins; - map< CDentry*, vector > traces; // path pins held - set< CDentry* > xlocks; // xlocks (local) - set< CDentry* > foreign_xlocks; // xlocks on foreign hosts -} active_request_t; - -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const Message *p) const { - static hash H; - return H((unsigned long)p); - } - }; -} - -class MDCache { - public: - // my master - MDS *mds; - - LRU lru; // dentry lru for expiring items from cache - - protected: - // the cache - CInode *root; // root inode - hash_map inode_map; // map of inodes by ino - - list inode_expire_queue; // inodes to delete - - - // root - list waiting_for_root; - - // imports, exports, and hashes. 
- set imports; // includes root (on mds0) - set exports; - set hashdirs; - map > nested_exports; // exports nested under imports _or_ hashdirs - - void adjust_export(int to, CDir *root, set& bounds); - void adjust_import(int from, CDir *root, set& bounds); - - - - // active MDS requests - hash_map active_requests; - - // inode purging - map purging; - map > waiting_for_purge; - - // shutdown crap - int shutdown_commits; - bool did_shutdown_exports; - bool did_shutdown_log_cap; - friend class C_MDC_ShutdownCommit; - - // recovery -protected: - // from EImportStart w/o EImportFinish during journal replay - map > my_ambiguous_imports; - // from MMDSImportMaps - map > > other_ambiguous_imports; - - set recovery_set; - set wants_import_map; // nodes i need to send my import map to - set got_import_map; // nodes i need to send my import map to (when exports finish) - set rejoin_ack_gather; // nodes i need a rejoin ack from - - void handle_import_map(MMDSImportMap *m); - void handle_cache_rejoin(MMDSCacheRejoin *m); - void handle_cache_rejoin_ack(MMDSCacheRejoinAck *m); - void disambiguate_imports(); - void cache_rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin); - void send_cache_rejoin_acks(); -public: - void send_import_map(int who); - void send_import_map_now(int who); - void send_import_map_later(int who) { - wants_import_map.insert(who); - } - void send_pending_import_maps(); // maybe. 
- void send_cache_rejoins(); - - void set_recovery_set(set& s) { - recovery_set = s; - } - - // ambiguous imports - void add_ambiguous_import(inodeno_t base, set& bounds) { - my_ambiguous_imports[base].swap(bounds); - } - void cancel_ambiguous_import(inodeno_t dirino); - void finish_ambiguous_import(inodeno_t dirino); - - void finish_ambiguous_export(inodeno_t dirino, set& bounds); - - - - - - friend class CInode; - friend class Locker; - friend class Migrator; - friend class Renamer; - friend class MDBalancer; - friend class EImportMap; - - - public: - - // subsystems - Migrator *migrator; - Renamer *renamer; - - public: - MDCache(MDS *m); - ~MDCache(); - - // debug - void log_stat(Logger *logger); - - // root inode - CInode *get_root() { return root; } - void set_root(CInode *r); - - int get_num_imports() { return imports.size(); } - void add_import(CDir *dir); - void remove_import(CDir *dir); - void recalc_auth_bits(); - - void log_import_map(Context *onsync=0); - - - // cache - void set_cache_size(size_t max) { lru.lru_set_max(max); } - size_t get_cache_size() { return lru.lru_get_size(); } - bool trim(int max = -1); // trim cache - void trim_non_auth(); // trim out trimmable non-auth items - - // shutdown - void shutdown_start(); - void shutdown_check(); - bool shutdown_pass(); - bool shutdown(); // clear cache (ie at shutodwn) - - // inode_map - bool have_inode( inodeno_t ino ) { return inode_map.count(ino) ? 
true:false; } - CInode* get_inode( inodeno_t ino ) { - if (have_inode(ino)) - return inode_map[ ino ]; - return NULL; - } - - - int hash_dentry(inodeno_t ino, const string& s) { - return 0; // fixme - } - - - public: - CInode *create_inode(); - void add_inode(CInode *in); - - protected: - void remove_inode(CInode *in); - void destroy_inode(CInode *in); - void touch_inode(CInode *in) { - if (in->get_parent_dn()) - touch_dentry(in->get_parent_dn()); - } - void touch_dentry(CDentry *dn) { - // touch ancestors - if (dn->get_dir()->get_inode()->get_parent_dn()) - touch_dentry(dn->get_dir()->get_inode()->get_parent_dn()); - - // touch me - if (dn->is_auth()) - lru.lru_touch(dn); - else - lru.lru_midtouch(dn); - } - - void inode_remove_replica(CInode *in, int rep); - - void rename_file(CDentry *srcdn, CDentry *destdn); - - public: - // inode purging - void purge_inode(inode_t& inode); - void purge_inode_finish(inodeno_t ino); - void purge_inode_finish_2(inodeno_t ino); - void waitfor_purge(inodeno_t ino, Context *c); - void start_recovered_purges(); - - - public: - CDir *get_auth_container(CDir *in); - CDir *get_export_container(CDir *dir); - void find_nested_exports(CDir *dir, set& s); - void find_nested_exports_under(CDir *import, CDir *dir, set& s); - - - public: - CInode *create_root_inode(); - int open_root(Context *c); - int path_traverse(filepath& path, vector& trace, bool follow_trailing_sym, - Message *req, Context *ondelay, - int onfail, - Context *onfinish=0, - bool is_client_req = false); - void open_remote_dir(CInode *diri, Context *fin); - void open_remote_ino(inodeno_t ino, Message *req, Context *fin); - void open_remote_ino_2(inodeno_t ino, Message *req, - vector& anchortrace, - Context *onfinish); - - bool path_pin(vector& trace, Message *m, Context *c); - void path_unpin(vector& trace, Message *m); - void make_trace(vector& trace, CInode *in); - - bool request_start(Message *req, - CInode *ref, - vector& trace); - void request_cleanup(Message *req); - 
void request_finish(Message *req); - void request_forward(Message *req, int mds, int port=0); - void request_pin_inode(Message *req, CInode *in); - void request_pin_dir(Message *req, CDir *dir); - - // anchors - void anchor_inode(CInode *in, Context *onfinish); - //void unanchor_inode(CInode *in, Context *c); - - void handle_inode_link(class MInodeLink *m); - void handle_inode_link_ack(class MInodeLinkAck *m); - - // == messages == - public: - void dispatch(Message *m); - - protected: - // -- replicas -- - void handle_discover(MDiscover *dis); - void handle_discover_reply(MDiscoverReply *m); - - - // -- namespace -- - // these handle logging, cache sync themselves. - // UNLINK - public: - void dentry_unlink(CDentry *in, Context *c); - protected: - void dentry_unlink_finish(CDentry *in, CDir *dir, Context *c); - void handle_dentry_unlink(MDentryUnlink *m); - void handle_inode_unlink(class MInodeUnlink *m); - void handle_inode_unlink_ack(class MInodeUnlinkAck *m); - friend class C_MDC_DentryUnlink; - - - - // -- misc auth -- - int ino_proxy_auth(inodeno_t ino, - int frommds, - map >& inomap); - void do_ino_proxy(CInode *in, Message *m); - void do_dir_proxy(CDir *dir, Message *m); - - - - - // -- updates -- - //int send_inode_updates(CInode *in); - //void handle_inode_update(MInodeUpdate *m); - - int send_dir_updates(CDir *in, bool bcast=false); - void handle_dir_update(MDirUpdate *m); - - void handle_cache_expire(MCacheExpire *m); - - - - // == crap fns == - public: - void dump() { - if (root) root->dump(); - } - - void show_imports(); - void show_cache(); - -}; - - -#endif diff --git a/tags/20070517_before_mds_merge/mds/MDLog.cc b/tags/20070517_before_mds_merge/mds/MDLog.cc deleted file mode 100644 index ba2011e092b08..0000000000000 --- a/tags/20070517_before_mds_merge/mds/MDLog.cc +++ /dev/null @@ -1,437 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 
Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "MDLog.h" -#include "MDS.h" -#include "MDCache.h" -#include "LogEvent.h" - -#include "osdc/Journaler.h" - -#include "common/LogType.h" -#include "common/Logger.h" - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".log " -#define derr(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".log " - -// cons/des - -LogType mdlog_logtype; - -MDLog::MDLog(MDS *m) -{ - mds = m; - num_events = 0; - waiting_for_read = false; - - last_import_map = 0; - writing_import_map = false; - seen_import_map = false; - - max_events = g_conf.mds_log_max_len; - - capped = false; - - unflushed = 0; - - journaler = 0; - logger = 0; -} - - -MDLog::~MDLog() -{ - if (journaler) { delete journaler; journaler = 0; } - if (logger) { delete logger; logger = 0; } -} - - -void MDLog::init_journaler() -{ - // logger - char name[80]; - sprintf(name, "mds%d.log", mds->get_nodeid()); - logger = new Logger(name, &mdlog_logtype); - - static bool didit = false; - if (!didit) { - mdlog_logtype.add_inc("add"); - mdlog_logtype.add_inc("expire"); - mdlog_logtype.add_inc("obs"); - mdlog_logtype.add_inc("trim"); - mdlog_logtype.add_set("size"); - mdlog_logtype.add_set("read"); - mdlog_logtype.add_set("append"); - mdlog_logtype.add_inc("lsum"); - mdlog_logtype.add_inc("lnum"); - } - - // inode - memset(&log_inode, 0, sizeof(log_inode)); - log_inode.ino = MDS_INO_LOG_OFFSET + mds->get_nodeid(); - log_inode.layout = g_OSD_MDLogLayout; - - if (g_conf.mds_local_osd) { - log_inode.layout.object_layout = OBJECT_LAYOUT_STARTOSD; - log_inode.layout.osd = mds->get_nodeid() + 10000; // hack - } - - // log 
streamer - if (journaler) delete journaler; - journaler = new Journaler(log_inode, mds->objecter, logger); -} - -void MDLog::flush_logger() -{ - if (logger) - logger->flush(true); -} - - - -void MDLog::reset() -{ - dout(5) << "reset to empty log" << endl; - init_journaler(); - journaler->reset(); -} - -void MDLog::open(Context *c) -{ - dout(5) << "open discovering log bounds" << endl; - init_journaler(); - journaler->recover(c); -} - -void MDLog::write_head(Context *c) -{ - journaler->write_head(c); -} - - -off_t MDLog::get_read_pos() -{ - return journaler->get_read_pos(); -} - -off_t MDLog::get_write_pos() -{ - return journaler->get_write_pos(); -} - - - -void MDLog::submit_entry( LogEvent *le, - Context *c ) -{ - if (g_conf.mds_log) { - dout(5) << "submit_entry " << journaler->get_write_pos() << " : " << *le << endl; - - // encode it, with event type - bufferlist bl; - bl.append((char*)&le->_type, sizeof(le->_type)); - le->encode_payload(bl); - - // journal it. - journaler->append_entry(bl); - - assert(!capped); - - delete le; - num_events++; - - logger->inc("add"); - logger->set("size", num_events); - logger->set("append", journaler->get_write_pos()); - - if (c) { - unflushed = 0; - journaler->flush(c); - } - else - unflushed++; - - // should we log a new import_map? - // FIXME: should this go elsewhere? - if (last_import_map && !writing_import_map && - journaler->get_write_pos() - last_import_map >= g_conf.mds_log_import_map_interval) { - // log import map - mds->mdcache->log_import_map(); - } - - } else { - // hack: log is disabled. - if (c) { - c->finish(0); - delete c; - } - } -} - -void MDLog::wait_for_sync( Context *c ) -{ - if (g_conf.mds_log) { - // wait - journaler->flush(c); - } else { - // hack: bypass. 
- c->finish(0); - delete c; - } -} - -void MDLog::flush() -{ - if (unflushed) - journaler->flush(); - unflushed = 0; - - // trim - trim(NULL); -} - - - - -// trim - -class C_MDL_Trimmed : public Context { -public: - MDLog *mdl; - LogEvent *le; - - C_MDL_Trimmed(MDLog *mdl, LogEvent *le) { - this->mdl = mdl; - this->le = le; - } - void finish(int res) { - mdl->_trimmed(le); - } -}; - -class C_MDL_Reading : public Context { -public: - MDLog *mdl; - C_MDL_Reading(MDLog *m) { - mdl = m; - } - void finish(int res) { - mdl->_did_read(); - } -}; - - -void MDLog::_did_read() -{ - dout(5) << "_did_read()" << endl; - waiting_for_read = false; - trim(0); -} - -void MDLog::_trimmed(LogEvent *le) -{ - dout(7) << "trimmed : " << le->get_start_off() << " : " << *le << endl; - assert(le->has_expired(mds)); - - if (trimming.begin()->first == le->_end_off) { - // we trimmed off the front! - // we can expire the log a bit. - journaler->set_expire_pos(le->_end_off); - } - - trimming.erase(le->_end_off); - delete le; - - logger->set("trim", trimming.size()); - logger->set("read", journaler->get_read_pos()); - - trim(0); -} - - - -void MDLog::trim(Context *c) -{ - // add waiter - if (c) - trim_waiters.push_back(c); - - // trim! - dout(10) << "trim " << num_events << " events / " << max_events << " max" << endl; - - while (num_events > max_events) { - - off_t gap = journaler->get_write_pos() - journaler->get_read_pos(); - dout(5) << "trim num_events " << num_events << " > max " << max_events - << ", trimming " << trimming.size() - << ", byte gap " << gap - << endl; - - if ((int)trimming.size() >= g_conf.mds_log_max_trimming) { - dout(7) << "trim already trimming max, waiting" << endl; - return; - } - - bufferlist bl; - off_t so = journaler->get_read_pos(); - if (journaler->try_read_entry(bl)) { - // decode logevent - LogEvent *le = LogEvent::decode(bl); - le->_start_off = so; - le->_end_off = journaler->get_read_pos(); - num_events--; - - // we just read an event. 
- if (le->has_expired(mds)) { - // obsolete - dout(7) << "trim obsolete : " << le->get_start_off() << " : " << *le << endl; - delete le; - logger->inc("obs"); - } else { - assert ((int)trimming.size() < g_conf.mds_log_max_trimming); - - // trim! - dout(7) << "trim expiring : " << le->get_start_off() << " : " << *le << endl; - trimming[le->_end_off] = le; - le->expire(mds, new C_MDL_Trimmed(this, le)); - logger->inc("expire"); - logger->set("trim", trimming.size()); - } - logger->set("read", journaler->get_read_pos()); - logger->set("size", num_events); - } else { - // need to read! - if (!waiting_for_read) { - waiting_for_read = true; - dout(7) << "trim waiting for read" << endl; - journaler->wait_for_readable(new C_MDL_Reading(this)); - } else { - dout(7) << "trim already waiting for read" << endl; - } - return; - } - } - - dout(5) << "trim num_events " << num_events << " <= max " << max_events - << ", trimming " << trimming.size() - << ", done for now." - << endl; - - // trimmed! - std::list finished; - finished.swap(trim_waiters); - finish_contexts(finished, 0); - - // hmm, are we at the end? - /* - if (journaler->get_read_pos() == journaler->get_write_pos() && - trimming.size() == import_map_expire_waiters.size()) { - dout(5) << "trim log is empty, allowing import_map to expire" << endl; - list ls; - ls.swap(import_map_expire_waiters); - finish_contexts(ls); - } - */ -} - - -void MDLog::replay(Context *c) -{ - assert(journaler->is_active()); - - // start reading at the last known expire point. - journaler->set_read_pos( journaler->get_expire_pos() ); - - // empty? - if (journaler->get_read_pos() == journaler->get_write_pos()) { - dout(10) << "replay - journal empty, done." << endl; - if (c) { - c->finish(0); - delete c; - } - return; - } - - // add waiter - if (c) - waitfor_replay.push_back(c); - - // go! 
- dout(10) << "replay start, from " << journaler->get_read_pos() - << " to " << journaler->get_write_pos() << endl; - - assert(num_events == 0); - - _replay(); -} - -class C_MDL_Replay : public Context { - MDLog *mdlog; -public: - C_MDL_Replay(MDLog *l) : mdlog(l) {} - void finish(int r) { mdlog->_replay(); } -}; - -void MDLog::_replay() -{ - // read what's buffered - while (journaler->is_readable() && - journaler->get_read_pos() < journaler->get_write_pos()) { - // read it - off_t pos = journaler->get_read_pos(); - bufferlist bl; - bool r = journaler->try_read_entry(bl); - assert(r); - - // unpack event - LogEvent *le = LogEvent::decode(bl); - num_events++; - - // have we seen an import map yet? - if (!seen_import_map && - le->get_type() != EVENT_IMPORTMAP) { - dout(10) << "_replay " << pos << " / " << journaler->get_write_pos() - << " -- waiting for import_map. (skipping " << *le << ")" << endl; - } else { - dout(10) << "_replay " << pos << " / " << journaler->get_write_pos() - << " : " << *le << endl; - le->replay(mds); - - if (le->get_type() == EVENT_IMPORTMAP) - seen_import_map = true; - } - delete le; - } - - // wait for read? - if (journaler->get_read_pos() < journaler->get_write_pos()) { - journaler->wait_for_readable(new C_MDL_Replay(this)); - return; - } - - // done! 
- assert(journaler->get_read_pos() == journaler->get_write_pos()); - dout(10) << "_replay - complete" << endl; - - // move read pointer _back_ to expire pos, for eventual trimming - journaler->set_read_pos(journaler->get_expire_pos()); - - // kick waiter(s) - list ls; - ls.swap(waitfor_replay); - finish_contexts(ls,0); -} - - diff --git a/tags/20070517_before_mds_merge/mds/MDLog.h b/tags/20070517_before_mds_merge/mds/MDLog.h deleted file mode 100644 index 384b72d02a4ff..0000000000000 --- a/tags/20070517_before_mds_merge/mds/MDLog.h +++ /dev/null @@ -1,128 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MDLOG_H -#define __MDLOG_H - -#include "include/types.h" -#include "include/Context.h" - -#include - -//#include -//using __gnu_cxx::hash_mapset; - -class Journaler; -class LogEvent; -class MDS; - -class Logger; - -/* -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const LogEvent *p) const { - static hash H; - return H((unsigned long)p); - } - }; -} -*/ - -class MDLog { - protected: - MDS *mds; - size_t num_events; // in events - size_t max_events; - - int unflushed; - - bool capped; - - inode_t log_inode; - Journaler *journaler; - - map trimming; - std::list trim_waiters; // contexts waiting for trim - bool trim_reading; - - bool waiting_for_read; - friend class C_MDL_Reading; - - Logger *logger; - - list waitfor_replay; - - // importmaps - off_t last_import_map; // offsets of last committed importmap. constrains trimming. 
- list import_map_expire_waiters; - bool writing_import_map; // one is being written now - bool seen_import_map; // for recovery - - friend class EImportMap; - friend class C_MDS_WroteImportMap; - friend class MDCache; - - void init_journaler(); - - - public: - // replay state - map > pending_exports; - - - - public: - MDLog(MDS *m); - ~MDLog(); - - - void flush_logger(); - - void set_max_events(size_t max) { max_events = max; } - size_t get_max_events() { return max_events; } - size_t get_num_events() { return num_events + trimming.size(); } - size_t get_non_importmap_events() { return num_events + trimming.size() - import_map_expire_waiters.size(); } - - off_t get_read_pos(); - off_t get_write_pos(); - bool empty() { - return get_read_pos() == get_write_pos(); - } - - bool is_capped() { return capped; } - void cap() { - capped = true; - list ls; - ls.swap(import_map_expire_waiters); - finish_contexts(ls); - } - - void submit_entry( LogEvent *e, Context *c = 0 ); - void wait_for_sync( Context *c ); - void flush(); - - void trim(Context *c); - void _did_read(); - void _trimmed(LogEvent *le); - - void reset(); // fresh, empty log! - void open(Context *onopen); - void write_head(Context *onfinish); - - void replay(Context *onfinish); - void _replay(); -}; - -#endif diff --git a/tags/20070517_before_mds_merge/mds/MDS.cc b/tags/20070517_before_mds_merge/mds/MDS.cc deleted file mode 100644 index 6d66b77cd95f2..0000000000000 --- a/tags/20070517_before_mds_merge/mds/MDS.cc +++ /dev/null @@ -1,1032 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "include/types.h" -#include "common/Clock.h" - -#include "msg/Messenger.h" - -#include "osd/OSDMap.h" -#include "osdc/Objecter.h" -#include "osdc/Filer.h" - -#include "MDSMap.h" - -#include "MDS.h" -#include "Server.h" -#include "Locker.h" -#include "MDCache.h" -#include "MDStore.h" -#include "MDLog.h" -#include "MDBalancer.h" -#include "IdAllocator.h" -#include "Migrator.h" -#include "Renamer.h" - -#include "AnchorTable.h" -#include "AnchorClient.h" - -#include "common/Logger.h" -#include "common/LogType.h" - -#include "common/Timer.h" - -#include "messages/MMDSMap.h" -#include "messages/MMDSBeacon.h" - -#include "messages/MPing.h" -#include "messages/MPingAck.h" -#include "messages/MGenericMessage.h" - -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" - - -LogType mds_logtype, mds_cache_logtype; - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << whoami << " " -#define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << whoami << " " - - - - - -// cons/des -MDS::MDS(int whoami, Messenger *m, MonMap *mm) : timer(mds_lock) { - this->whoami = whoami; - - monmap = mm; - messenger = m; - - mdsmap = new MDSMap; - osdmap = new OSDMap; - - objecter = new Objecter(messenger, monmap, osdmap); - filer = new Filer(objecter); - - mdcache = new MDCache(this); - mdstore = new MDStore(this); - mdlog = new MDLog(this); - balancer = new MDBalancer(this); - - anchorclient = new AnchorClient(messenger, mdsmap); - idalloc = new IdAllocator(this); - - anchormgr = new AnchorTable(this); - - server = new Server(this); - locker = new Locker(this, mdcache); - - - // beacon - beacon_last_seq = 0; - beacon_sender = 0; - beacon_killer = 0; - - // tick - tick_event = 0; - - req_rate = 0; - - want_state = state = MDSMap::STATE_DNE; - - - logger = logger2 = 0; - - // i'm ready! 
- messenger->set_dispatcher(this); -} - -MDS::~MDS() { - if (mdcache) { delete mdcache; mdcache = NULL; } - if (mdstore) { delete mdstore; mdstore = NULL; } - if (mdlog) { delete mdlog; mdlog = NULL; } - if (balancer) { delete balancer; balancer = NULL; } - if (idalloc) { delete idalloc; idalloc = NULL; } - if (anchormgr) { delete anchormgr; anchormgr = NULL; } - if (anchorclient) { delete anchorclient; anchorclient = NULL; } - if (osdmap) { delete osdmap; osdmap = 0; } - if (mdsmap) { delete mdsmap; mdsmap = 0; } - - if (server) { delete server; server = 0; } - if (locker) { delete locker; locker = 0; } - - if (filer) { delete filer; filer = 0; } - if (objecter) { delete objecter; objecter = 0; } - if (messenger) { delete messenger; messenger = NULL; } - - if (logger) { delete logger; logger = 0; } - if (logger2) { delete logger2; logger2 = 0; } - -} - - -void MDS::reopen_logger() -{ - // flush+close old log - if (logger) { - logger->flush(true); - delete logger; - } - if (logger2) { - logger2->flush(true); - delete logger2; - } - - - // log - string name; - name = "mds"; - int w = whoami; - if (w >= 1000) name += ('0' + ((w/1000)%10)); - if (w >= 100) name += ('0' + ((w/100)%10)); - if (w >= 10) name += ('0' + ((w/10)%10)); - name += ('0' + ((w/1)%10)); - - logger = new Logger(name, (LogType*)&mds_logtype); - - mds_logtype.add_inc("req"); - mds_logtype.add_inc("reply"); - mds_logtype.add_inc("fw"); - mds_logtype.add_inc("cfw"); - - mds_logtype.add_set("l"); - mds_logtype.add_set("q"); - mds_logtype.add_set("popanyd"); - mds_logtype.add_set("popnest"); - - mds_logtype.add_inc("lih"); - mds_logtype.add_inc("lif"); - - mds_logtype.add_set("c"); - mds_logtype.add_set("ctop"); - mds_logtype.add_set("cbot"); - mds_logtype.add_set("cptail"); - mds_logtype.add_set("cpin"); - mds_logtype.add_inc("cex"); - mds_logtype.add_inc("dis"); - mds_logtype.add_inc("cmiss"); - - mds_logtype.add_set("buf"); - mds_logtype.add_inc("cdir"); - mds_logtype.add_inc("fdir"); - - 
mds_logtype.add_inc("iex"); - mds_logtype.add_inc("iim"); - mds_logtype.add_inc("ex"); - mds_logtype.add_inc("im"); - mds_logtype.add_inc("imex"); - mds_logtype.add_set("nex"); - mds_logtype.add_set("nim"); - - - char n[80]; - sprintf(n, "mds%d.cache", whoami); - logger2 = new Logger(n, (LogType*)&mds_cache_logtype); -} - -void MDS::send_message_mds(Message *m, int mds, int port, int fromport) -{ - // send mdsmap first? - if (peer_mdsmap_epoch[mds] < mdsmap->get_epoch()) { - messenger->send_message(new MMDSMap(mdsmap), - mdsmap->get_inst(mds)); - peer_mdsmap_epoch[mds] = mdsmap->get_epoch(); - } - - // send message - if (port && !fromport) - fromport = port; - messenger->send_message(m, mdsmap->get_inst(mds), port, fromport); -} - - -class C_MDS_Tick : public Context { - MDS *mds; -public: - C_MDS_Tick(MDS *m) : mds(m) {} - void finish(int r) { - mds->tick(); - } -}; - - - -int MDS::init(bool standby) -{ - mds_lock.Lock(); - - if (standby) - want_state = MDSMap::STATE_STANDBY; - else - want_state = MDSMap::STATE_STARTING; - - // starting beacon. this will induce an MDSMap from the monitor - beacon_start(); - - // schedule tick - reset_tick(); - - // init logger - reopen_logger(); - - mds_lock.Unlock(); - return 0; -} - -void MDS::reset_tick() -{ - // cancel old - if (tick_event) timer.cancel_event(tick_event); - - // schedule - tick_event = new C_MDS_Tick(this); - timer.add_event_after(g_conf.mon_tick_interval, tick_event); -} - -void MDS::tick() -{ - // reschedule - reset_tick(); - - // log - mds_load_t load = balancer->get_load(); - - if (logger) { - req_rate = logger->get("req"); - - logger->set("l", (int)load.mds_load()); - logger->set("q", messenger->get_dispatch_queue_len()); - logger->set("buf", buffer_total_alloc); - - mdcache->log_stat(logger); - } - - // booted? 
- if (is_active()) { - - // balancer - balancer->tick(); - - // HACK to test hashing stuff - if (false) { - /* - static map didhash; - if (elapsed.sec() > 15 && !didhash[whoami]) { - CInode *in = mdcache->get_inode(100000010); - if (in && in->dir) { - if (in->dir->is_auth()) - mdcache->migrator->hash_dir(in->dir); - didhash[whoami] = 1; - } - } - if (0 && elapsed.sec() > 25 && didhash[whoami] == 1) { - CInode *in = mdcache->get_inode(100000010); - if (in && in->dir) { - if (in->dir->is_auth() && in->dir->is_hashed()) - mdcache->migrator->unhash_dir(in->dir); - didhash[whoami] = 2; - } - } - */ - } - } -} - - - - -// ----------------------- -// beacons - -void MDS::beacon_start() -{ - beacon_send(); // send first beacon - - //reset_beacon_killer(); // schedule killer -} - - -class C_MDS_BeaconSender : public Context { - MDS *mds; -public: - C_MDS_BeaconSender(MDS *m) : mds(m) {} - void finish(int r) { - mds->beacon_send(); - } -}; - -void MDS::beacon_send() -{ - ++beacon_last_seq; - dout(10) << "beacon_send " << MDSMap::get_state_name(want_state) - << " seq " << beacon_last_seq - << " (currently " << MDSMap::get_state_name(state) << ")" - << endl; - - beacon_seq_stamp[beacon_last_seq] = g_clock.now(); - - int mon = monmap->pick_mon(); - messenger->send_message(new MMDSBeacon(want_state, beacon_last_seq), - monmap->get_inst(mon)); - - // schedule next sender - if (beacon_sender) timer.cancel_event(beacon_sender); - beacon_sender = new C_MDS_BeaconSender(this); - timer.add_event_after(g_conf.mds_beacon_interval, beacon_sender); -} - -void MDS::handle_mds_beacon(MMDSBeacon *m) -{ - dout(10) << "handle_mds_beacon " << MDSMap::get_state_name(m->get_state()) - << " seq " << m->get_seq() << endl; - version_t seq = m->get_seq(); - - // update lab - if (beacon_seq_stamp.count(seq)) { - assert(beacon_seq_stamp[seq] > beacon_last_acked_stamp); - beacon_last_acked_stamp = beacon_seq_stamp[seq]; - - // clean up seq_stamp map - while (!beacon_seq_stamp.empty() && - 
beacon_seq_stamp.begin()->first <= seq) - beacon_seq_stamp.erase(beacon_seq_stamp.begin()); - - reset_beacon_killer(); - } - - delete m; -} - -class C_MDS_BeaconKiller : public Context { - MDS *mds; - utime_t lab; -public: - C_MDS_BeaconKiller(MDS *m, utime_t l) : mds(m), lab(l) {} - void finish(int r) { - mds->beacon_kill(lab); - } -}; - -void MDS::reset_beacon_killer() -{ - utime_t when = beacon_last_acked_stamp; - when += g_conf.mds_beacon_grace; - - dout(15) << "reset_beacon_killer last_acked_stamp at " << beacon_last_acked_stamp - << ", will die at " << when << endl; - - if (beacon_killer) timer.cancel_event(beacon_killer); - - beacon_killer = new C_MDS_BeaconKiller(this, beacon_last_acked_stamp); - timer.add_event_at(when, beacon_killer); -} - -void MDS::beacon_kill(utime_t lab) -{ - if (lab == beacon_last_acked_stamp) { - dout(0) << "beacon_kill last_acked_stamp " << lab - << ", killing myself." - << endl; - exit(0); - } else { - dout(20) << "beacon_kill last_acked_stamp " << beacon_last_acked_stamp - << " != my " << lab - << ", doing nothing." - << endl; - } -} - - - -void MDS::handle_mds_map(MMDSMap *m) -{ - version_t epoch = m->get_epoch(); - dout(1) << "handle_mds_map epoch " << epoch << " from " << m->get_source() << endl; - - // note source's map version - if (m->get_source().is_mds() && - peer_mdsmap_epoch[m->get_source().num()] < epoch) { - dout(15) << " peer " << m->get_source() - << " has mdsmap epoch >= " << epoch - << endl; - peer_mdsmap_epoch[m->get_source().num()] = epoch; - } - - // is it new? 
- if (epoch <= mdsmap->get_epoch()) { - dout(1) << " old map epoch " << epoch << " <= " << mdsmap->get_epoch() - << ", discarding" << endl; - delete m; - return; - } - - // note some old state - int oldwhoami = whoami; - int oldstate = state; - set oldresolve; - mdsmap->get_mds_set(oldresolve, MDSMap::STATE_RESOLVE); - bool wasrejoining = mdsmap->is_rejoining(); - set oldfailed; - mdsmap->get_mds_set(oldfailed, MDSMap::STATE_FAILED); - - // decode and process - mdsmap->decode(m->get_encoded()); - - // see who i am - whoami = mdsmap->get_inst_rank(messenger->get_myaddr()); - if (oldwhoami != whoami) { - // update messenger. - messenger->reset_myname(MSG_ADDR_MDS(whoami)); - - reopen_logger(); - dout(1) << "handle_mds_map i am now mds" << whoami - << " incarnation " << mdsmap->get_inc(whoami) - << endl; - - // do i need an osdmap? - if (oldwhoami < 0) { - // we need an osdmap too. - int mon = monmap->pick_mon(); - messenger->send_message(new MOSDGetMap(0), - monmap->get_inst(mon)); - } - } - - // tell objecter my incarnation - if (objecter->get_client_incarnation() < 0 && - mdsmap->have_inst(whoami)) { - assert(mdsmap->get_inc(whoami) > 0); - objecter->set_client_incarnation(mdsmap->get_inc(whoami)); - } - - // update my state - state = mdsmap->get_state(whoami); - - // did it change? - if (oldstate != state) { - if (state == want_state) { - dout(1) << "handle_mds_map new state " << mdsmap->get_state_name(state) << endl; - } else { - dout(1) << "handle_mds_map new state " << mdsmap->get_state_name(state) - << ", although i wanted " << mdsmap->get_state_name(want_state) - << endl; - want_state = state; - } - - // now active? - if (is_active()) { - dout(1) << "now active" << endl; - finish_contexts(waitfor_active); // kick waiters - } - - else if (is_replay()) { - // initialize gather sets - set rs; - mdsmap->get_recovery_mds_set(rs); - rs.erase(whoami); - dout(1) << "now replay. 
my recovery peers are " << rs << endl; - mdcache->set_recovery_set(rs); - } - - // now stopping? - else if (is_stopping()) { - assert(oldstate == MDSMap::STATE_ACTIVE); - dout(1) << "now stopping" << endl; - - mdcache->shutdown_start(); - - // save anchor table - if (mdsmap->get_anchortable() == whoami) - anchormgr->save(0); // FIXME? or detect completion via filer? - - if (idalloc) - idalloc->save(0); // FIXME? or detect completion via filer? - - // flush log - mdlog->set_max_events(0); - mdlog->trim(NULL); - } - - // now standby? - else if (is_stopped()) { - assert(oldstate == MDSMap::STATE_STOPPING); - dout(1) << "now stopped, sending down:out and exiting" << endl; - shutdown_final(); - } - } - - - // is anyone resolving? - if (is_resolve() || is_rejoin() || is_active() || is_stopping()) { - set resolve; - mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE); - if (oldresolve != resolve) - dout(10) << "resolve set is " << resolve << ", was " << oldresolve << endl; - for (set::iterator p = resolve.begin(); p != resolve.end(); ++p) { - if (*p == whoami) continue; - if (oldresolve.count(*p) == 0 || // if other guy newly resolve, or - oldstate == MDSMap::STATE_REPLAY) // if i'm newly resolve, - mdcache->send_import_map(*p); // share my import map (now or later) - } - } - - // is everybody finally rejoining? - if (is_rejoin() || is_active() || is_stopping()) { - if (!wasrejoining && mdsmap->is_rejoining()) { - mdcache->send_cache_rejoins(); - } - } - - // did anyone go down? - if (is_active() || is_stopping()) { - set failed; - mdsmap->get_mds_set(failed, MDSMap::STATE_FAILED); - for (set::iterator p = failed.begin(); p != failed.end(); ++p) { - // newly so? 
- if (oldfailed.count(*p)) continue; - - mdcache->migrator->handle_mds_failure(*p); - } - } - - delete m; -} - -void MDS::handle_osd_map(MOSDMap *m) -{ - version_t had = osdmap->get_epoch(); - - dout(10) << "handle_osd_map had " << had << endl; - - // pass on to clients - for (set::iterator it = clientmap.get_mount_set().begin(); - it != clientmap.get_mount_set().end(); - it++) { - MOSDMap *n = new MOSDMap; - n->maps = m->maps; - n->incremental_maps = m->incremental_maps; - messenger->send_message(n, clientmap.get_inst(*it)); - } - - // process locally - objecter->handle_osd_map(m); - - if (had == 0 && osdmap->get_epoch() > 0) { - if (is_creating()) - boot_create(); // new tables, journal - else if (is_starting()) - boot_start(); // old tables, empty journal - else if (is_replay()) - boot_replay(); // replay, join - else - assert(is_standby()); - } - -} - - -class C_MDS_BootFinish : public Context { - MDS *mds; -public: - C_MDS_BootFinish(MDS *m) : mds(m) {} - void finish(int r) { mds->boot_finish(); } -}; - -void MDS::boot_create() -{ - dout(3) << "boot_create" << endl; - - C_Gather *fin = new C_Gather(new C_MDS_BootFinish(this)); - - if (whoami == 0) { - dout(3) << "boot_create since i am also mds0, creating root inode and dir" << endl; - - // create root inode. 
- mdcache->open_root(0); - CInode *root = mdcache->get_root(); - assert(root); - - // force empty root dir - CDir *dir = root->dir; - dir->mark_complete(); - dir->mark_dirty(dir->pre_dirty()); - - // save it - mdstore->commit_dir(dir, fin->new_sub()); - } - - // start with a fresh journal - dout(10) << "boot_create creating fresh journal" << endl; - mdlog->reset(); - mdlog->write_head(fin->new_sub()); - - // write our first importmap - mdcache->log_import_map(fin->new_sub()); - - // fixme: fake out idalloc (reset, pretend loaded) - dout(10) << "boot_create creating fresh idalloc table" << endl; - idalloc->reset(); - idalloc->save(fin->new_sub()); - - // fixme: fake out anchortable - if (mdsmap->get_anchortable() == whoami) { - dout(10) << "boot_create creating fresh anchortable" << endl; - anchormgr->reset(); - anchormgr->save(fin->new_sub()); - } -} - -void MDS::boot_start() -{ - dout(2) << "boot_start" << endl; - - C_Gather *fin = new C_Gather(new C_MDS_BootFinish(this)); - - dout(2) << "boot_start opening idalloc" << endl; - idalloc->load(fin->new_sub()); - - if (mdsmap->get_anchortable() == whoami) { - dout(2) << "boot_start opening anchor table" << endl; - anchormgr->load(fin->new_sub()); - } else { - dout(2) << "boot_start i have no anchor table" << endl; - } - - dout(2) << "boot_start opening mds log" << endl; - mdlog->open(fin->new_sub()); - - if (mdsmap->get_root() == whoami) { - dout(2) << "boot_start opening root directory" << endl; - mdcache->open_root(fin->new_sub()); - } -} - -void MDS::boot_finish() -{ - dout(3) << "boot_finish" << endl; - - if (is_starting()) { - // make sure mdslog is empty - assert(mdlog->get_read_pos() == mdlog->get_write_pos()); - } - - set_want_state(MDSMap::STATE_ACTIVE); -} - - -class C_MDS_BootRecover : public Context { - MDS *mds; - int nextstep; -public: - C_MDS_BootRecover(MDS *m, int n) : mds(m), nextstep(n) {} - void finish(int r) { mds->boot_replay(nextstep); } -}; - -void MDS::boot_replay(int step) -{ - switch (step) 
{ - case 0: - step = 1; // fall-thru. - - case 1: - dout(2) << "boot_replay " << step << ": opening idalloc" << endl; - idalloc->load(new C_MDS_BootRecover(this, 2)); - break; - - case 2: - if (mdsmap->get_anchortable() == whoami) { - dout(2) << "boot_replay " << step << ": opening anchor table" << endl; - anchormgr->load(new C_MDS_BootRecover(this, 3)); - break; - } - dout(2) << "boot_replay " << step << ": i have no anchor table" << endl; - step++; // fall-thru - - case 3: - dout(2) << "boot_replay " << step << ": opening mds log" << endl; - mdlog->open(new C_MDS_BootRecover(this, 4)); - break; - - case 4: - dout(2) << "boot_replay " << step << ": replaying mds log" << endl; - mdlog->replay(new C_MDS_BootRecover(this, 5)); - break; - - case 5: - dout(2) << "boot_replay " << step << ": restarting any recovered purges" << endl; - mdcache->start_recovered_purges(); - - step++; // fall-thru - - case 6: - // done with replay! - if (mdsmap->get_num_mds(MDSMap::STATE_ACTIVE) == 0 && - mdsmap->get_num_mds(MDSMap::STATE_STOPPING) == 0 && - mdsmap->get_num_mds(MDSMap::STATE_RESOLVE) == 0 && - mdsmap->get_num_mds(MDSMap::STATE_REJOIN) == 0 && - mdsmap->get_num_mds(MDSMap::STATE_REPLAY) == 1 && // me - mdsmap->get_num_mds(MDSMap::STATE_FAILED) == 0) { - dout(2) << "boot_replay " << step << ": i am alone, moving to state active" << endl; - set_want_state(MDSMap::STATE_ACTIVE); - } else { - dout(2) << "boot_replay " << step << ": i am not alone, moving to state resolve" << endl; - set_want_state(MDSMap::STATE_RESOLVE); - } - break; - - } -} - - -void MDS::set_want_state(int s) -{ - dout(3) << "set_want_state " << MDSMap::get_state_name(s) << endl; - want_state = s; - beacon_send(); -} - - - - -int MDS::shutdown_start() -{ - dout(1) << "shutdown_start" << endl; - derr(0) << "mds shutdown start" << endl; - - // tell everyone to stop. 
- set active; - mdsmap->get_active_mds_set(active); - for (set::iterator p = active.begin(); - p != active.end(); - p++) { - if (mdsmap->is_up(*p)) { - dout(1) << "sending MShutdownStart to mds" << *p << endl; - send_message_mds(new MGenericMessage(MSG_MDS_SHUTDOWNSTART), - *p, MDS_PORT_MAIN); - } - } - - // go - set_want_state(MDSMap::STATE_STOPPING); - return 0; -} - - -void MDS::handle_shutdown_start(Message *m) -{ - dout(1) << " handle_shutdown_start" << endl; - - // set flag - set_want_state(MDSMap::STATE_STOPPING); - - delete m; -} - - - -int MDS::shutdown_final() -{ - dout(1) << "shutdown_final" << endl; - - // flush loggers - if (logger) logger->flush(true); - if (logger2) logger2->flush(true); - mdlog->flush_logger(); - - // send final down:out beacon (it doesn't matter if this arrives) - set_want_state(MDSMap::STATE_OUT); - - // stop timers - if (beacon_killer) { - timer.cancel_event(beacon_killer); - beacon_killer = 0; - } - if (beacon_sender) { - timer.cancel_event(beacon_sender); - beacon_sender = 0; - } - if (tick_event) { - timer.cancel_event(tick_event); - tick_event = 0; - } - timer.cancel_all(); - timer.join(); - - // shut down cache - mdcache->shutdown(); - - // shut down messenger - messenger->shutdown(); - - return 0; -} - - - - - -void MDS::dispatch(Message *m) -{ - mds_lock.Lock(); - my_dispatch(m); - mds_lock.Unlock(); -} - - - -void MDS::my_dispatch(Message *m) -{ - // from bad mds? - if (m->get_source().is_mds()) { - int from = m->get_source().num(); - if (!mdsmap->have_inst(from) || - mdsmap->get_inst(from) != m->get_source_inst()) { - // bogus mds? 
- if (m->get_type() != MSG_MDS_MAP) { - dout(5) << "got " << *m << " from old/bad/imposter mds " << m->get_source() - << ", dropping" << endl; - delete m; - return; - } else { - dout(5) << "got " << *m << " from old/bad/imposter mds " << m->get_source() - << ", but it's an mdsmap, looking at it" << endl; - } - } - } - - - switch (m->get_dest_port()) { - - case MDS_PORT_ANCHORMGR: - anchormgr->dispatch(m); - break; - case MDS_PORT_ANCHORCLIENT: - anchorclient->dispatch(m); - break; - - case MDS_PORT_CACHE: - mdcache->dispatch(m); - break; - case MDS_PORT_LOCKER: - locker->dispatch(m); - break; - - case MDS_PORT_MIGRATOR: - mdcache->migrator->dispatch(m); - break; - case MDS_PORT_RENAMER: - mdcache->renamer->dispatch(m); - break; - - case MDS_PORT_BALANCER: - balancer->proc_message(m); - break; - - case MDS_PORT_MAIN: - proc_message(m); - break; - - case MDS_PORT_SERVER: - server->dispatch(m); - break; - - default: - dout(1) << "MDS dispatch unknown message port" << m->get_dest_port() << endl; - assert(0); - } - - - // HACK FOR NOW - if (is_active()) { - // flush log to disk after every op. for now. - mdlog->flush(); - - // trim cache - mdcache->trim(); - } - - // finish any triggered contexts - if (finished_queue.size()) { - dout(7) << "mds has " << finished_queue.size() << " queued contexts" << endl; - list ls; - ls.splice(ls.begin(), finished_queue); - assert(finished_queue.empty()); - finish_contexts(ls); - } - - - - // hack: force hash root? 
- if (false && - mdcache->get_root() && - mdcache->get_root()->dir && - !(mdcache->get_root()->dir->is_hashed() || - mdcache->get_root()->dir->is_hashing())) { - dout(0) << "hashing root" << endl; - mdcache->migrator->hash_dir(mdcache->get_root()->dir); - } - - - - - // HACK to force export to test foreign renames - if (false && whoami == 0) { - static bool didit = false; - - // 7 to 1 - CInode *in = mdcache->get_inode(1001); - if (in && in->is_dir() && !didit) { - CDir *dir = in->get_or_open_dir(mdcache); - if (dir->is_auth()) { - dout(1) << "FORCING EXPORT" << endl; - mdcache->migrator->export_dir(dir,1); - didit = true; - } - } - } - - - - // shut down? - if (is_stopping()) { - if (mdcache->shutdown_pass()) { - dout(7) << "shutdown_pass=true, finished w/ shutdown, moving to up:stopped" << endl; - - // tell monitor we shut down cleanly. - set_want_state(MDSMap::STATE_STOPPED); - } - } - -} - - -void MDS::proc_message(Message *m) -{ - switch (m->get_type()) { - // OSD =============== - /* - case MSG_OSD_MKFS_ACK: - handle_osd_mkfs_ack(m); - return; - */ - case MSG_OSD_OPREPLY: - objecter->handle_osd_op_reply((class MOSDOpReply*)m); - return; - case MSG_OSD_MAP: - handle_osd_map((MOSDMap*)m); - return; - - - // MDS - case MSG_MDS_MAP: - handle_mds_map((MMDSMap*)m); - return; - - case MSG_MDS_BEACON: - handle_mds_beacon((MMDSBeacon*)m); - return; - - case MSG_MDS_SHUTDOWNSTART: // mds0 -> mds1+ - handle_shutdown_start(m); - return; - - case MSG_PING: - handle_ping((MPing*)m); - return; - - default: - assert(0); - } - -} - - - - - - -void MDS::handle_ping(MPing *m) -{ - dout(10) << " received ping from " << m->get_source() << " with seq " << m->seq << endl; - - messenger->send_message(new MPingAck(m), - m->get_source_inst()); - - delete m; -} - diff --git a/tags/20070517_before_mds_merge/mds/MDS.h b/tags/20070517_before_mds_merge/mds/MDS.h deleted file mode 100644 index 8b3ff1e4aa430..0000000000000 --- a/tags/20070517_before_mds_merge/mds/MDS.h +++ /dev/null @@ 
-1,269 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __MDS_H -#define __MDS_H - -#include -#include -#include -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "mdstypes.h" - -#include "msg/Dispatcher.h" -#include "include/types.h" -#include "include/Context.h" -#include "common/DecayCounter.h" -#include "common/Logger.h" -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/Timer.h" - -#include "mon/MonMap.h" -#include "MDSMap.h" - -#include "ClientMap.h" - - -#define MDS_PORT_MAIN 0 -#define MDS_PORT_SERVER 1 -#define MDS_PORT_CACHE 2 -#define MDS_PORT_LOCKER 3 -#define MDS_PORT_STORE 4 -#define MDS_PORT_BALANCER 5 -#define MDS_PORT_MIGRATOR 6 -#define MDS_PORT_RENAMER 7 - -#define MDS_PORT_ANCHORCLIENT 10 -#define MDS_PORT_ANCHORMGR 11 - - -#define MDS_INO_ROOT 1 -#define MDS_INO_PGTABLE 2 -#define MDS_INO_LOG_OFFSET 0x100 -#define MDS_INO_IDS_OFFSET 0x200 -#define MDS_INO_INODEFILE_OFFSET 0x300 -#define MDS_INO_ANCHORTABLE 0x400 -#define MDS_INO_BASE 0x1000 - -#define MDS_TRAVERSE_FORWARD 1 -#define MDS_TRAVERSE_DISCOVER 2 // skips permissions checks etc. -#define MDS_TRAVERSE_DISCOVERXLOCK 3 // succeeds on (foreign?) null, xlocked dentries. 
-#define MDS_TRAVERSE_FAIL 4 - - -class filepath; - -class OSDMap; -class Objecter; -class Filer; - -class Server; -class Locker; -class AnchorTable; -class AnchorClient; -class MDCache; -class MDStore; -class MDLog; -class MDBalancer; -class IdAllocator; - -class CInode; -class CDir; -class CDentry; - -class Messenger; -class Message; - -class MClientRequest; -class MClientReply; -class MHashReaddir; -class MHashReaddirReply; - -class MMDSBeacon; - - -class MDS : public Dispatcher { - public: - Mutex mds_lock; - - SafeTimer timer; - - protected: - int whoami; - - public: - Messenger *messenger; - MDSMap *mdsmap; - MonMap *monmap; - OSDMap *osdmap; - Objecter *objecter; - Filer *filer; // for reading/writing to/from osds - - ClientMap clientmap; - - // sub systems - Server *server; - MDCache *mdcache; - Locker *locker; - MDStore *mdstore; - MDLog *mdlog; - MDBalancer *balancer; - - IdAllocator *idalloc; - - AnchorTable *anchormgr; - AnchorClient *anchorclient; - - Logger *logger, *logger2; - - - protected: - // -- MDS state -- - int state; // my confirmed state - int want_state; // the state i want - list waitfor_active; - - map peer_mdsmap_epoch; - - public: - void queue_waitfor_active(Context *c) { waitfor_active.push_back(c); } - - bool is_dne() { return state == MDSMap::STATE_DNE; } - bool is_out() { return state == MDSMap::STATE_OUT; } - bool is_failed() { return state == MDSMap::STATE_FAILED; } - bool is_creating() { return state == MDSMap::STATE_CREATING; } - bool is_starting() { return state == MDSMap::STATE_STARTING; } - bool is_standby() { return state == MDSMap::STATE_STANDBY; } - bool is_replay() { return state == MDSMap::STATE_REPLAY; } - bool is_resolve() { return state == MDSMap::STATE_RESOLVE; } - bool is_rejoin() { return state == MDSMap::STATE_REJOIN; } - bool is_active() { return state == MDSMap::STATE_ACTIVE; } - bool is_stopping() { return state == MDSMap::STATE_STOPPING; } - bool is_stopped() { return state == MDSMap::STATE_STOPPED; } - - void 
set_want_state(int s); - - - // -- waiters -- - list finished_queue; - - void queue_finished(Context *c) { - finished_queue.push_back(c); - } - void queue_finished(list& ls) { - finished_queue.splice( finished_queue.end(), ls ); - } - - // -- keepalive beacon -- - version_t beacon_last_seq; // last seq sent to monitor - map beacon_seq_stamp; // seq # -> time sent - utime_t beacon_last_acked_stamp; // last time we sent a beacon that got acked - Context *beacon_sender; - Context *beacon_killer; // next scheduled time of death - - // tick and other timer fun - Context *tick_event; - void reset_tick(); - - - - // shutdown crap - int req_rate; - - // ino's and fh's - public: - - int get_req_rate() { return req_rate; } - - protected: - - friend class MDStore; - - - public: - MDS(int whoami, Messenger *m, MonMap *mm); - ~MDS(); - - // who am i etc - int get_nodeid() { return whoami; } - MDSMap *get_mds_map() { return mdsmap; } - OSDMap *get_osd_map() { return osdmap; } - - void send_message_mds(Message *m, int mds, int port=0, int fromport=0); - - // start up, shutdown - int init(bool standby=false); - void reopen_logger(); - - void boot_create(); // i am new mds. - void boot_start(); // i am old but empty (was down:out) mds. - void boot_replay(int step=0); // i am recovering existing (down:failed) mds. 
- void boot_finish(); - - int shutdown_start(); - int shutdown_final(); - - void tick(); - - void beacon_start(); - void beacon_send(); - void beacon_kill(utime_t lab); - void handle_mds_beacon(MMDSBeacon *m); - void reset_beacon_killer(); - - // messages - void proc_message(Message *m); - virtual void dispatch(Message *m); - void my_dispatch(Message *m); - - // special message types - void handle_ping(class MPing *m); - - void handle_mds_map(class MMDSMap *m); - - void handle_shutdown_start(Message *m); - - // osds - void handle_osd_getmap(Message *m); - void handle_osd_map(class MOSDMap *m); - -}; - - - -class C_MDS_RetryMessage : public Context { - Message *m; - MDS *mds; -public: - C_MDS_RetryMessage(MDS *mds, Message *m) { - assert(m); - this->m = m; - this->mds = mds; - } - virtual void finish(int r) { - mds->my_dispatch(m); - } -}; - - - -#endif diff --git a/tags/20070517_before_mds_merge/mds/MDSMap.h b/tags/20070517_before_mds_merge/mds/MDSMap.h deleted file mode 100644 index 66b086e5ea39f..0000000000000 --- a/tags/20070517_before_mds_merge/mds/MDSMap.h +++ /dev/null @@ -1,288 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MDSMAP_H -#define __MDSMAP_H - -#include "common/Clock.h" -#include "msg/Message.h" - -#include "include/types.h" - -#include -#include -#include -using namespace std; - -class MDSMap { - public: - // mds states - static const int STATE_DNE = 0; // down, never existed. - static const int STATE_OUT = 1; // down, once existed, but no imports, empty log. - static const int STATE_FAILED = 2; // down, holds (er, held) metadata; needs to be recovered. 
- - static const int STATE_STANDBY = 3; // up, but inactive. waiting for assignment by monitor. - static const int STATE_CREATING = 4; // up, creating MDS instance (new journal, idalloc..) - static const int STATE_STARTING = 5; // up, starting prior out MDS instance. - static const int STATE_REPLAY = 6; // up, scanning journal, recoverying any shared state - static const int STATE_RESOLVE = 7; // up, disambiguating partial distributed operations (import/export, ...rename?) - static const int STATE_REJOIN = 8; // up, replayed journal, rejoining distributed cache - static const int STATE_ACTIVE = 9; // up, active - static const int STATE_STOPPING = 10; // up, exporting metadata (-> standby or out) - static const int STATE_STOPPED = 11; // up, finished stopping. like standby, but not avail to takeover. - - static const char *get_state_name(int s) { - switch (s) { - // down - case STATE_DNE: return "down:dne"; - case STATE_OUT: return "down:out"; - case STATE_FAILED: return "down:failed"; - // up - case STATE_STANDBY: return "up:standby"; - case STATE_CREATING: return "up:creating"; - case STATE_STARTING: return "up:starting"; - case STATE_REPLAY: return "up:replay"; - case STATE_RESOLVE: return "up:resolve"; - case STATE_REJOIN: return "up:rejoin"; - case STATE_ACTIVE: return "up:active"; - case STATE_STOPPING: return "up:stopping"; - case STATE_STOPPED: return "up:stopped"; - default: assert(0); - } - return 0; - } - - protected: - epoch_t epoch; - utime_t ctime; - - int anchortable; // which MDS has anchortable (fixme someday) - int root; // which MDS has root directory - - set mds_created; // which mds ids have initialized journals and id tables. 
- map mds_state; // MDS state - map mds_state_seq; - map mds_inst; // up instances - map mds_inc; // incarnation count (monotonically increases) - - friend class MDSMonitor; - - public: - MDSMap() : epoch(0), anchortable(0), root(0) {} - - epoch_t get_epoch() const { return epoch; } - void inc_epoch() { epoch++; } - - const utime_t& get_ctime() const { return ctime; } - - int get_anchortable() const { return anchortable; } - int get_root() const { return root; } - - // counts - int get_num_mds() const { return mds_state.size(); } - int get_num_mds(int state) { - int n = 0; - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (p->second == state) ++n; - return n; - } - int get_num_up_mds() { - int n = 0; - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (is_up(p->first)) ++n; - return n; - } - int get_num_up_or_failed_mds() { - int n = 0; - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (is_up(p->first) || is_failed(p->first)) - ++n; - return n; - } - - // sets - void get_mds_set(set& s) { - s.clear(); - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - s.insert(p->first); - } - void get_up_mds_set(set& s) { - s.clear(); - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (is_up(p->first)) - s.insert(p->first); - } - void get_mds_set(set& s, int state) { - s.clear(); - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (p->second == state) - s.insert(p->first); - } - void get_active_mds_set(set& s) { - get_mds_set(s, MDSMap::STATE_ACTIVE); - } - void get_failed_mds_set(set& s) { - get_mds_set(s, MDSMap::STATE_FAILED); - } - void get_recovery_mds_set(set& s) { - s.clear(); - for (map::const_iterator p = mds_state.begin(); - p != mds_state.end(); - p++) - if (is_failed(p->first) || - is_replay(p->first) || is_resolve(p->first) || is_rejoin(p->first) || - 
is_active(p->first) || is_stopping(p->first)) - s.insert(p->first); - } - - - // mds states - bool is_down(int m) { return is_dne(m) || is_out(m) || is_failed(m); } - bool is_up(int m) { return !is_down(m); } - - bool is_dne(int m) { return mds_state.count(m) == 0 || mds_state[m] == STATE_DNE; } - bool is_out(int m) { return mds_state.count(m) && mds_state[m] == STATE_OUT; } - bool is_failed(int m) { return mds_state.count(m) && mds_state[m] == STATE_FAILED; } - - bool is_standby(int m) { return mds_state.count(m) && mds_state[m] == STATE_STANDBY; } - bool is_creating(int m) { return mds_state.count(m) && mds_state[m] == STATE_CREATING; } - bool is_starting(int m) { return mds_state.count(m) && mds_state[m] == STATE_STARTING; } - bool is_replay(int m) { return mds_state.count(m) && mds_state[m] == STATE_REPLAY; } - bool is_resolve(int m) { return mds_state.count(m) && mds_state[m] == STATE_RESOLVE; } - bool is_rejoin(int m) { return mds_state.count(m) && mds_state[m] == STATE_REJOIN; } - bool is_active(int m) { return mds_state.count(m) && mds_state[m] == STATE_ACTIVE; } - bool is_stopping(int m) { return mds_state.count(m) && mds_state[m] == STATE_STOPPING; } - bool is_stopped(int m) { return mds_state.count(m) && mds_state[m] == STATE_STOPPED; } - - bool has_created(int m) { return mds_created.count(m); } - - // cluster states - bool is_degraded() { // degraded = some recovery in process. fixes active membership and recovery_set. 
- return get_num_mds(STATE_REPLAY) + - get_num_mds(STATE_RESOLVE) + - get_num_mds(STATE_REJOIN) + - get_num_mds(STATE_FAILED); - } - /*bool is_resolving() { // nodes are resolving distributed ops - return get_num_mds(STATE_RESOLVE); - }*/ - bool is_rejoining() { - // nodes are rejoining cache state - return get_num_mds(STATE_REJOIN) > 0 && - get_num_mds(STATE_RESOLVE) == 0 && - get_num_mds(STATE_REPLAY) == 0 && - get_num_mds(STATE_FAILED) == 0; - } - - - int get_state(int m) { - if (mds_state.count(m)) return mds_state[m]; - return STATE_OUT; - } - - // inst - bool have_inst(int m) { - return mds_inst.count(m); - } - const entity_inst_t& get_inst(int m) { - assert(mds_inst.count(m)); - return mds_inst[m]; - } - bool get_inst(int m, entity_inst_t& inst) { - if (mds_inst.count(m)) { - inst = mds_inst[m]; - return true; - } - return false; - } - - int get_inst_rank(const entity_addr_t& addr) { - for (map::iterator p = mds_inst.begin(); - p != mds_inst.end(); - ++p) { - if (p->second.addr == addr) return p->first; - } - /*else - for (map::iterator p = mds_inst.begin(); - p != mds_inst.end(); - ++p) { - if (memcmp(&p->second.addr,&inst.addr, sizeof(inst.addr)) == 0) return p->first; - } - */ - - return -1; - } - - int get_inc(int m) { - assert(mds_inc.count(m)); - return mds_inc[m]; - } - - - void remove_mds(int m) { - mds_inst.erase(m); - mds_state.erase(m); - mds_state_seq.erase(m); - } - - - // serialize, unserialize - void encode(bufferlist& blist) { - blist.append((char*)&epoch, sizeof(epoch)); - blist.append((char*)&ctime, sizeof(ctime)); - blist.append((char*)&anchortable, sizeof(anchortable)); - blist.append((char*)&root, sizeof(root)); - - ::_encode(mds_state, blist); - ::_encode(mds_state_seq, blist); - ::_encode(mds_inst, blist); - ::_encode(mds_inc, blist); - } - - void decode(bufferlist& blist) { - int off = 0; - blist.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - blist.copy(off, sizeof(ctime), (char*)&ctime); - off += sizeof(ctime); - 
blist.copy(off, sizeof(anchortable), (char*)&anchortable); - off += sizeof(anchortable); - blist.copy(off, sizeof(root), (char*)&root); - off += sizeof(root); - - ::_decode(mds_state, blist, off); - ::_decode(mds_state_seq, blist, off); - ::_decode(mds_inst, blist, off); - ::_decode(mds_inc, blist, off); - } - - - /*** mapping functions ***/ - - int hash_dentry( inodeno_t dirino, const string& dn ); -}; - -#endif diff --git a/tags/20070517_before_mds_merge/mds/MDStore.cc b/tags/20070517_before_mds_merge/mds/MDStore.cc deleted file mode 100644 index 13aa270a2ee6c..0000000000000 --- a/tags/20070517_before_mds_merge/mds/MDStore.cc +++ /dev/null @@ -1,752 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "MDStore.h" -#include "MDS.h" -#include "MDCache.h" -#include "CInode.h" -#include "CDir.h" -#include "CDentry.h" -#include "MDSMap.h" - -#include "osd/OSDMap.h" -#include "osdc/Filer.h" - -#include "msg/Message.h" - -#include -#include -using namespace std; - - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".store " - - -/* - * separate hashed dir slices into "regions" - */ -size_t get_hash_offset(int hashcode) { - if (hashcode < 0) - return 0; // not hashed - else - return (size_t)(1<<30) * (size_t)(1+hashcode); -} - - - - -// ========================================================================== -// FETCH - - -class C_MDS_Fetch : public Context { - protected: - MDStore *ms; - inodeno_t ino; - - public: - C_MDS_Fetch(MDStore *ms, inodeno_t ino) : Context() { - this->ms = ms; - this->ino = ino; - } - - void finish(int result) { - ms->fetch_dir_2( result, ino ); - } -}; - -/** fetch_dir(dir, context) - * public call to fetch a dir. - */ -void MDStore::fetch_dir( CDir *dir, - Context *c ) -{ - dout(7) << "fetch_dir " << *dir << " context is " << c << endl; - assert(dir->is_auth() || - dir->is_hashed()); - - // wait - if (c) dir->add_waiter(CDIR_WAIT_COMPLETE, c); - - // already fetching? - if (dir->state_test(CDIR_STATE_FETCHING)) { - dout(7) << "already fetching " << *dir << "; waiting" << endl; - return; - } - - // state - dir->state_set(CDIR_STATE_FETCHING); - - // stats - if (mds->logger) mds->logger->inc("fdir"); - - // create return context - Context *fin = new C_MDS_Fetch( this, dir->ino() ); - if (dir->is_hashed()) - fetch_dir_hash( dir, fin, mds->get_nodeid()); // hashed - else - fetch_dir_hash( dir, fin ); // normal -} - -/* - * called by low level fn when it's fetched. - * fix up dir state. 
- */ -void MDStore::fetch_dir_2( int result, - inodeno_t ino) -{ - CInode *idir = mds->mdcache->get_inode(ino); - - if (!idir || result < 0) return; // hmm! nevermind i guess. - - assert(idir); - CDir *dir = idir->dir; - assert(dir); - - // dir is now complete - dir->state_set(CDIR_STATE_COMPLETE); - dir->state_clear(CDIR_STATE_FETCHING); - - // finish - list finished; - dir->take_waiting(CDIR_WAIT_COMPLETE|CDIR_WAIT_DENTRY, finished); - finish_contexts(finished, result); -} - - -/** low level methods **/ - -class C_MDS_FetchHash : public Context { -protected: - MDS *mds; - inode_t inode; - int hashcode; - Context *context; - -public: - bufferlist bl; - bufferlist bl2; - - C_MDS_FetchHash(MDS *mds, inode_t inode, Context *c, int hashcode) : Context() { - this->mds = mds; - this->inode = inode; - this->hashcode = hashcode; - this->context = c; - } - - void finish(int result) { - assert(result>0); - - // combine bufferlists bl + bl2 -> bl - bl.claim_append(bl2); - - // did i get the whole thing? - size_t size; - bl.copy(0, sizeof(size_t), (char*)&size); - size_t got = bl.length() - sizeof(size); - size_t left = size - got; - size_t from = bl.length(); - - // what part of dir are we getting? - from += get_hash_offset(hashcode); - - if (got >= size) { - // done. - mds->mdstore->fetch_dir_hash_2( bl, inode, context, hashcode ); - } - else { - // read the rest! - dout(12) << "fetch_dir_hash_2 dir size is " << size << ", got " << got << ", reading remaniing " << left << " from off " << from << endl; - - // create return context - C_MDS_FetchHash *fin = new C_MDS_FetchHash( mds, inode, context, hashcode ); - fin->bl.claim( bl ); - mds->filer->read(inode, - from, left, - &fin->bl2, - fin ); - return; - } - } -}; - -/** fetch_dir_hash - * low level method. - * fetch part of a dir. either the whole thing if hashcode is -1, or a specific - * hash segment. 
- */ -void MDStore::fetch_dir_hash( CDir *dir, - Context *c, - int hashcode) -{ - dout(11) << "fetch_dir_hash hashcode " << hashcode << " " << *dir << endl; - - // create return context - C_MDS_FetchHash *fin = new C_MDS_FetchHash( mds, dir->get_inode()->inode, c, hashcode ); - - // grab first stripe bit (which had better be more than 16 bytes!) - assert(dir->get_inode()->inode.layout.stripe_size >= 16); - mds->filer->read(dir->get_inode()->inode, - get_hash_offset(hashcode), dir->get_inode()->inode.layout.stripe_size, - &fin->bl, - fin ); -} - -void MDStore::fetch_dir_hash_2( bufferlist& bl, - inode_t& inode, - Context *c, - int hashcode) -{ - CInode *idir = mds->mdcache->get_inode(inode.ino); - if (!idir) { - dout(7) << "fetch_dir_hash_2 on ino " << inode.ino << " but no longer in our cache!" << endl; - c->finish(-1); - delete c; - return; - } - - if (!idir->dir_is_auth() || - !idir->dir) { - dout(7) << "fetch_dir_hash_2 on " << *idir << ", but i'm not auth, or dir not open" << endl; - c->finish(-1); - delete c; - return; - } - - // make sure we have a CDir - CDir *dir = idir->get_or_open_dir(mds->mdcache); - - // do it - dout(7) << "fetch_dir_hash_2 hashcode " << hashcode << " dir " << *dir << endl; - - // parse buffer contents into cache - dout(15) << "bl is " << bl << endl; - - int off = 0; - size_t size; - __uint32_t num; - version_t got_version; - int got_hashcode; - bl.copy(off, sizeof(size), (char*)&size); - off += sizeof(size); - assert(bl.length() >= size + sizeof(size)); - bl.copy(off, sizeof(num), (char*)&num); - off += sizeof(num); - bl.copy(off, sizeof(got_version), (char*)&got_version); - off += sizeof(got_version); - bl.copy(off, sizeof(got_hashcode), (char*)&got_hashcode); - off += sizeof(got_hashcode); - - assert(got_hashcode == hashcode); - - int buflen = bl.length(); - - dout(10) << " " << num << " items in " << size << " bytes" << endl; - - unsigned parsed = 0; - while (parsed < num) { - assert(off < buflen && num > 0); - parsed++; - - 
dout(24) << " " << parsed << "/" << num << " pos " << off << endl; - - // dentry - string dname; - ::_decode(dname, bl, off); - dout(24) << "parse filename '" << dname << "'" << endl; - - CDentry *dn = dir->lookup(dname); // existing dentry? - - char type = bl[off]; - ++off; - if (type == 'L') { - // hard link - inodeno_t ino; - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - - // what to do? - if (hashcode >= 0) { - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), dname ); - assert(dentryhashcode == hashcode); - } - - if (dn) { - if (dn->get_inode() == 0) { - // negative dentry? - dout(12) << "readdir had NEG dentry " << dname << endl; - } else { - // had dentry - dout(12) << "readdir had dentry " << dname << endl; - } - continue; - } - - // (remote) link - CDentry *dn = dir->add_dentry( dname, ino ); - - // link to inode? - CInode *in = mds->mdcache->get_inode(ino); // we may or may not have it. - if (in) { - dn->link_remote(in); - dout(12) << "readdir got remote link " << ino << " which we have " << *in << endl; - } else { - dout(12) << "readdir got remote link " << ino << " (dont' have it)" << endl; - } - } - else if (type == 'I') { - // inode - - // parse out inode - inode_t inode; - bl.copy(off, sizeof(inode), (char*)&inode); - off += sizeof(inode); - - string symlink; - if (inode.is_symlink()) - ::_decode(symlink, bl, off); - - // what to do? - if (hashcode >= 0) { - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), dname ); - assert(dentryhashcode == hashcode); - } - - if (dn) { - if (dn->get_inode() == 0) { - // negative dentry? - dout(12) << "readdir had NEG dentry " << dname << endl; - } else { - // had dentry - dout(12) << "readdir had dentry " << dname << endl; - - // under water? 
- if (dn->get_version() <= got_version) { - assert(dn->get_inode()->get_version() <= got_version); - dout(10) << "readdir had underwater dentry " << dname << " and inode, marking clean" << endl; - dn->mark_clean(); - dn->get_inode()->mark_clean(); - } - } - continue; - } - - // add inode - CInode *in = 0; - if (mds->mdcache->have_inode(inode.ino)) { - in = mds->mdcache->get_inode(inode.ino); - dout(12) << "readdir got (but i already had) " << *in - << " mode " << in->inode.mode - << " mtime " << in->inode.mtime << endl; - } else { - // inode - in = new CInode(mds->mdcache); - in->inode = inode; - - // symlink? - if (in->is_symlink()) { - in->symlink = symlink; - } - - // add - mds->mdcache->add_inode( in ); - } - - // link - dir->add_dentry( dname, in ); - dout(12) << "readdir got " << *in << " mode " << in->inode.mode << " mtime " << in->inode.mtime << endl; - } - else { - dout(1) << "corrupt directory, i got tag char '" << type << "' val " << (int)(type) - << " at pos " << off << endl; - assert(0); - } - } - dout(15) << "parsed " << parsed << endl; - - if (c) { - c->finish(0); - delete c; - } -} - - - - -// ================================================================== -// COMMIT - -class C_MDS_CommitDirVerify : public Context { -public: - MDS *mds; - inodeno_t ino; - version_t version; - Context *c; - - C_MDS_CommitDirVerify( MDS *mds, - inodeno_t ino, - version_t version, - Context *c) { - this->mds = mds; - this->c = c; - this->version = version; - this->ino = ino; - } - - virtual void finish(int r) { - - if (r >= 0) { - CInode *in = mds->mdcache->get_inode(ino); - assert(in && in->dir); - if (in && in->dir && in->dir->is_auth()) { - dout(7) << "CommitDirVerify: current = " << in->dir->get_version() - << ", last committed = " << in->dir->get_last_committed_version() - << ", required = " << version << endl; - - if (in->dir->get_last_committed_version() >= version) { - dout(7) << "my required version is safe, done." 
<< endl; - if (c) { - c->finish(0); - delete c; - } - } else { - dout(7) << "my required version is still not safe, committing again." << endl; - - // what was requested isn't committed yet. - mds->mdstore->commit_dir(in->dir, - version, - c); - } - return; - } - } - - // must have exported ors omethign! - dout(7) << "can't retry commit dir on " << ino << ", must have exported?" << endl; - - // finish. - if (c) { - c->finish(-1); - delete c; - } - } -}; - -class C_MDS_CommitDirFinish : public Context { - protected: - MDStore *ms; - CDir *dir; - version_t version; - - public: - - C_MDS_CommitDirFinish(MDStore *ms, CDir *dir) : Context() { - this->ms = ms; - this->dir = dir; - this->version = dir->get_version(); // just for sanity check later - } - - void finish(int result) { - ms->commit_dir_2( result, dir, version ); - } -}; - - -void MDStore::commit_dir( CDir *dir, - Context *c ) -{ - assert(dir->is_dirty()); - - // commit thru current version - commit_dir(dir, dir->get_version(), c); -} - -void MDStore::commit_dir( CDir *dir, - version_t version, - Context *c ) -{ - assert(dir->is_auth() || - dir->is_hashed()); - - // already committing? - if (dir->state_test(CDIR_STATE_COMMITTING)) { - // already mid-commit! - dout(7) << "commit_dir " << *dir << " mid-commit of " << dir->get_committing_version() << endl; - dout(7) << " current version = " << dir->get_version() << endl; - dout(7) << "requested version = " << version << endl; - - assert(version >= dir->get_last_committed_version()); // why would we request _old_ one? - - dir->add_waiter(CDIR_WAIT_COMMITTED, - new C_MDS_CommitDirVerify(mds, dir->ino(), version, c) ); - return; - } - - if (!dir->can_auth_pin()) { - // something must be frozen up the hiearchy! - dout(7) << "commit_dir " << *dir << " can't auth_pin, waiting" << endl; - dir->add_waiter(CDIR_WAIT_AUTHPINNABLE, - new C_MDS_CommitDirVerify(mds, dir->ino(), version, c) ); - return; - } - - - // is it complete? 
- if (!dir->is_complete()) { - dout(7) << "commit_dir " << *dir << " not complete, fetching first" << endl; - // fetch dir first - fetch_dir(dir, - new C_MDS_CommitDirVerify(mds, dir->ino(), version, c) ); - return; - } - - - // ok go - dout(7) << "commit_dir " << *dir << " version " << dir->get_version() << endl; - - // add waiter - if (c) dir->add_waiter(CDIR_WAIT_COMMITTED, c); - - // get continuation ready - Context *fin = new C_MDS_CommitDirFinish(this, dir); - - // state - dir->state_set(CDIR_STATE_COMMITTING); - dir->set_committing_version(); - - // stats - if (mds->logger) mds->logger->inc("cdir"); - - if (dir->is_hashed()) { - // hashed - commit_dir_slice( dir, fin, mds->get_nodeid() ); - } else { - // non-hashed - commit_dir_slice( dir, fin ); - } -} - -void MDStore::commit_dir_2( int result, - CDir *dir, - version_t committed_version) -{ - dout(5) << "commit_dir_2 " << *dir << " committed " << committed_version << ", current version " << dir->get_version() << endl; - assert(committed_version == dir->get_committing_version()); - - // remember which version is now safe - dir->set_last_committed_version(committed_version); - - // is the dir now clean? 
- if (committed_version == dir->get_version()) - dir->mark_clean(); - - dir->state_clear(CDIR_STATE_COMMITTING); - - // finish - dir->finish_waiting(CDIR_WAIT_COMMITTED); -} - - - - -// low-level committer (hashed or normal) - -class C_MDS_CommitSlice : public Context { - protected: - MDStore *ms; - CDir *dir; - Context *c; - int hashcode; - version_t version; - -public: - bufferlist bl; - - C_MDS_CommitSlice(MDStore *ms, CDir *dir, Context *c, int w) : Context() { - this->ms = ms; - this->dir = dir; - this->c = c; - this->hashcode = w; - version = dir->get_version(); - } - - void finish(int result) { - ms->commit_dir_slice_2( result, dir, c, version, hashcode ); - } -}; - - -void MDStore::commit_dir_slice( CDir *dir, - Context *c, - int hashcode) -{ - if (hashcode >= 0) { - assert(dir->is_hashed()); - dout(10) << "commit_dir_slice hashcode " << hashcode << " " << *dir << " version " << dir->get_version() << endl; - } else { - assert(dir->is_auth()); - dout(10) << "commit_dir_slice (whole dir) " << *dir << " version " << dir->get_version() << endl; - } - - // get continuation ready - C_MDS_CommitSlice *fin = new C_MDS_CommitSlice(this, dir, c, hashcode); - - // fill buffer - __uint32_t num = 0; - - bufferlist dirdata; - - version_t v = dir->get_version(); - dirdata.append((char*)&v, sizeof(v)); - dirdata.append((char*)&hashcode, sizeof(hashcode)); - - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - - if (hashcode >= 0) { - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); - if (dentryhashcode != hashcode) continue; - } - - if (dn->is_null()) continue; // skipping negative entry - - // primary or remote? 
- if (dn->is_remote()) { - - inodeno_t ino = dn->get_remote_ino(); - dout(14) << " pos " << dirdata.length() << " dn '" << it->first << "' remote ino " << ino << endl; - - // name, marker, ion - dirdata.append( it->first.c_str(), it->first.length() + 1); - dirdata.append( "L", 1 ); // remote link - dirdata.append((char*)&ino, sizeof(ino)); - - } else { - // primary link - CInode *in = dn->get_inode(); - assert(in); - - dout(14) << " pos " << dirdata.length() << " dn '" << it->first << "' inode " << *in << endl; - - // name, marker, inode, [symlink string] - dirdata.append( it->first.c_str(), it->first.length() + 1); - dirdata.append( "I", 1 ); // inode - dirdata.append( (char*) &in->inode, sizeof(inode_t)); - - if (in->is_symlink()) { - // include symlink destination! - dout(18) << " inlcuding symlink ptr " << in->symlink << endl; - dirdata.append( (char*) in->symlink.c_str(), in->symlink.length() + 1); - } - } - - num++; - } - dout(14) << "num " << num << endl; - - // put count in buffer - //bufferlist bl; - size_t size = sizeof(num) + dirdata.length(); - fin->bl.append((char*)&size, sizeof(size)); - fin->bl.append((char*)&num, sizeof(num)); - fin->bl.claim_append(dirdata); //.c_str(), dirdata.length()); - assert(fin->bl.length() == size + sizeof(size)); - - // pin inode - dir->auth_pin(); - - // submit to osd - mds->filer->write( dir->get_inode()->inode, - 0, fin->bl.length(), - fin->bl, - 0, //OSD_OP_FLAGS_TRUNCATE, // truncate file/object after end of this write - NULL, fin ); // on safe -} - - -void MDStore::commit_dir_slice_2( int result, - CDir *dir, - Context *c, - version_t committed_version, - int hashcode ) -{ - dout(11) << "commit_dir_slice_2 hashcode " << hashcode << " " << *dir << " v " << committed_version << endl; - - // mark inodes and dentries clean too (if we committed them!) 
- list null_clean; - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); ) { - CDentry *dn = it->second; - it++; - - if (hashcode >= 0) { - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), dn->get_name() ); - if (dentryhashcode != hashcode) continue; - } - - // dentry - if (committed_version >= dn->get_version()) { - if (dn->is_dirty()) { - dout(15) << " dir " << committed_version << " >= dn " << dn->get_version() << " now clean " << *dn << endl; - dn->mark_clean(); - } - } else { - dout(15) << " dir " << committed_version << " < dn " << dn->get_version() << " still dirty " << *dn << endl; - } - - // only do primary... - if (!dn->is_primary()) - continue; - - CInode *in = dn->get_inode(); - assert(in); - assert(in->is_auth()); - - if (committed_version >= in->get_version()) { - if (in->is_dirty()) { - dout(15) << " dir " << committed_version << " >= inode " << in->get_version() << " now clean " << *in << endl; - in->mark_clean(); - } - } else { - dout(15) << " dir " << committed_version << " < inode " << in->get_version() << " still dirty " << *in << endl; - assert(in->is_dirty()); - } - } - - // unpin - dir->auth_unpin(); - - // finish - if (c) { - c->finish(0); - delete c; - } -} - - - - - - - - - - - - diff --git a/tags/20070517_before_mds_merge/mds/MDStore.h b/tags/20070517_before_mds_merge/mds/MDStore.h deleted file mode 100644 index fe7553608a975..0000000000000 --- a/tags/20070517_before_mds_merge/mds/MDStore.h +++ /dev/null @@ -1,75 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __MDSTORE_H -#define __MDSTORE_H - -#include "include/types.h" -#include "include/buffer.h" - -class MDS; -class CDir; -class Context; - -class MDStore { - protected: - MDS *mds; - - - public: - MDStore(MDS *m) { - mds = m; - } - - - // fetch - public: - void fetch_dir( CDir *dir, Context *c ); - protected: - void fetch_dir_2( int result, inodeno_t ino ); - - void fetch_dir_hash( CDir *dir, - Context *c, - int hashcode = -1); - void fetch_dir_hash_2( bufferlist &bl, - inode_t& inode, - Context *c, - int which); - friend class C_MDS_Fetch; - friend class C_MDS_FetchHash; - - // commit - public: - void commit_dir( CDir *dir, Context *c ); // commit current dir version to disk. - void commit_dir( CDir *dir, __uint64_t version, Context *c ); // commit specified version to disk - protected: - void commit_dir_2( int result, CDir *dir, __uint64_t committed_version ); - - // low level committers - void commit_dir_slice( CDir *dir, - Context *c, - int hashcode = -1); - void commit_dir_slice_2( int result, - CDir *dir, - Context *c, - __uint64_t version, - int hashcode ); - - friend class C_MDS_CommitDirFinish; - friend class C_MDS_CommitSlice; -}; - - -#endif diff --git a/tags/20070517_before_mds_merge/mds/Migrator.cc b/tags/20070517_before_mds_merge/mds/Migrator.cc deleted file mode 100644 index 5d14bfbee4283..0000000000000 --- a/tags/20070517_before_mds_merge/mds/Migrator.cc +++ /dev/null @@ -1,3616 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#include "MDS.h" -#include "MDCache.h" -#include "CInode.h" -#include "CDir.h" -#include "CDentry.h" -#include "Migrator.h" -#include "Locker.h" -#include "MDStore.h" -#include "Migrator.h" - -#include "MDBalancer.h" -#include "MDLog.h" -#include "MDSMap.h" - -#include "include/filepath.h" - -#include "events/EString.h" -#include "events/EExportStart.h" -#include "events/EExportFinish.h" -#include "events/EImportStart.h" -#include "events/EImportFinish.h" - -#include "msg/Messenger.h" - -#include "messages/MClientFileCaps.h" - -#include "messages/MExportDirDiscover.h" -#include "messages/MExportDirDiscoverAck.h" -#include "messages/MExportDirPrep.h" -#include "messages/MExportDirPrepAck.h" -#include "messages/MExportDirWarning.h" -#include "messages/MExportDir.h" -#include "messages/MExportDirNotify.h" -#include "messages/MExportDirNotifyAck.h" -#include "messages/MExportDirFinish.h" - -#include "messages/MHashDirDiscover.h" -#include "messages/MHashDirDiscoverAck.h" -#include "messages/MHashDirPrep.h" -#include "messages/MHashDirPrepAck.h" -#include "messages/MHashDir.h" -#include "messages/MHashDirNotify.h" -#include "messages/MHashDirAck.h" - -#include "messages/MUnhashDirPrep.h" -#include "messages/MUnhashDirPrepAck.h" -#include "messages/MUnhashDir.h" -#include "messages/MUnhashDirAck.h" -#include "messages/MUnhashDirNotify.h" -#include "messages/MUnhashDirNotifyAck.h" - - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".migrator " - - - -void Migrator::dispatch(Message *m) -{ - switch (m->get_type()) { - // import - case MSG_MDS_EXPORTDIRDISCOVER: - handle_export_dir_discover((MExportDirDiscover*)m); - break; - case MSG_MDS_EXPORTDIRPREP: - handle_export_dir_prep((MExportDirPrep*)m); - break; - case MSG_MDS_EXPORTDIR: - handle_export_dir((MExportDir*)m); - break; - case MSG_MDS_EXPORTDIRFINISH: - 
handle_export_dir_finish((MExportDirFinish*)m); - break; - - // export - case MSG_MDS_EXPORTDIRDISCOVERACK: - handle_export_dir_discover_ack((MExportDirDiscoverAck*)m); - break; - case MSG_MDS_EXPORTDIRPREPACK: - handle_export_dir_prep_ack((MExportDirPrepAck*)m); - break; - case MSG_MDS_EXPORTDIRNOTIFYACK: - handle_export_dir_notify_ack((MExportDirNotifyAck*)m); - break; - - // export 3rd party (inode authority) - case MSG_MDS_EXPORTDIRWARNING: - handle_export_dir_warning((MExportDirWarning*)m); - break; - case MSG_MDS_EXPORTDIRNOTIFY: - handle_export_dir_notify((MExportDirNotify*)m); - break; - - - // hashing - case MSG_MDS_HASHDIRDISCOVER: - handle_hash_dir_discover((MHashDirDiscover*)m); - break; - case MSG_MDS_HASHDIRDISCOVERACK: - handle_hash_dir_discover_ack((MHashDirDiscoverAck*)m); - break; - case MSG_MDS_HASHDIRPREP: - handle_hash_dir_prep((MHashDirPrep*)m); - break; - case MSG_MDS_HASHDIRPREPACK: - handle_hash_dir_prep_ack((MHashDirPrepAck*)m); - break; - case MSG_MDS_HASHDIR: - handle_hash_dir((MHashDir*)m); - break; - case MSG_MDS_HASHDIRACK: - handle_hash_dir_ack((MHashDirAck*)m); - break; - case MSG_MDS_HASHDIRNOTIFY: - handle_hash_dir_notify((MHashDirNotify*)m); - break; - - // unhashing - case MSG_MDS_UNHASHDIRPREP: - handle_unhash_dir_prep((MUnhashDirPrep*)m); - break; - case MSG_MDS_UNHASHDIRPREPACK: - handle_unhash_dir_prep_ack((MUnhashDirPrepAck*)m); - break; - case MSG_MDS_UNHASHDIR: - handle_unhash_dir((MUnhashDir*)m); - break; - case MSG_MDS_UNHASHDIRACK: - handle_unhash_dir_ack((MUnhashDirAck*)m); - break; - case MSG_MDS_UNHASHDIRNOTIFY: - handle_unhash_dir_notify((MUnhashDirNotify*)m); - break; - case MSG_MDS_UNHASHDIRNOTIFYACK: - handle_unhash_dir_notify_ack((MUnhashDirNotifyAck*)m); - break; - - default: - assert(0); - } -} - - -class C_MDC_EmptyImport : public Context { - Migrator *mig; - CDir *dir; -public: - C_MDC_EmptyImport(Migrator *m, CDir *d) : mig(m), dir(d) {} - void finish(int r) { - mig->export_empty_import(dir); - } -}; - - 
-void Migrator::export_empty_import(CDir *dir) -{ - dout(7) << "export_empty_import " << *dir << endl; - - return; // hack fixme - - if (!dir->is_import()) { - dout(7) << "not import (anymore?)" << endl; - return; - } - if (dir->inode->is_root()) { - dout(7) << "root" << endl; - return; - } - - if (dir->get_size() > 0) { - dout(7) << "not actually empty" << endl; - return; - } - - // is it really empty? - if (!dir->is_complete()) { - dout(7) << "not complete, fetching." << endl; - mds->mdstore->fetch_dir(dir, - new C_MDC_EmptyImport(this,dir)); - return; - } - - int dest = dir->inode->authority(); - - // comment this out ot wreak havoc? - //if (mds->is_shutting_down()) dest = 0; // this is more efficient. - - dout(7) << "really empty, exporting to " << dest << endl; - assert (dest != mds->get_nodeid()); - - dout(-7) << "exporting to mds" << dest - << " empty import " << *dir << endl; - export_dir( dir, dest ); -} - - - - -// ========================================================== -// mds failure handling - -void Migrator::handle_mds_failure(int who) -{ - dout(5) << "handle_mds_failure mds" << who << endl; - - // check my exports - map::iterator p = export_state.begin(); - while (p != export_state.end()) { - map::iterator next = p; - next++; - CDir *dir = p->first; - - if (export_peer[dir] == who) { - // the guy i'm exporting to failed. - // clean up. 
- dout(10) << "cleaning up export state " << p->second << " of " << *dir << endl; - - switch (p->second) { - case EXPORT_DISCOVERING: - dout(10) << "state discovering : canceling freeze and removing auth_pin" << endl; - dir->unfreeze_tree(); // cancel the freeze - dir->auth_unpin(); // remove the auth_pin (that was holding up the freeze) - break; - - case EXPORT_FREEZING: - dout(10) << "state freezing : canceling freeze" << endl; - dir->unfreeze_tree(); // cancel the freeze - break; - - case EXPORT_LOGGINGSTART: - case EXPORT_PREPPING: - dout(10) << "state loggingstart|prepping : logging EExportFinish(false)" << endl; - mds->mdlog->submit_entry(new EExportFinish(dir,false)); - // logger will unfreeze. - break; - - case EXPORT_EXPORTING: - dout(10) << "state exporting : logging EExportFinish(false), reversing, and unfreezing" << endl; - mds->mdlog->submit_entry(new EExportFinish(dir,false)); - reverse_export(dir); - dir->unfreeze_tree(); - break; - - case EXPORT_LOGGINGFINISH: - dout(10) << "state loggingfinish : doing nothing, we were successful." << endl; - break; - - default: - assert(0); - } - - export_state.erase(dir); - export_peer.erase(dir); - - // unpin the path - vector trace; - cache->make_trace(trace, dir->inode); - cache->path_unpin(trace, 0); - - // wake up any waiters - mds->queue_finished(export_finish_waiters[dir]); - export_finish_waiters.erase(dir); - - // send pending import_maps? - mds->mdcache->send_pending_import_maps(); - - mds->mdcache->show_imports(); - mds->mdcache->show_cache(); - } else { - // third party failed. potential peripheral damage? - if (p->second == EXPORT_EXPORTING) { - // yeah, i'm waiting for acks, let's fake theirs. 
- if (export_notify_ack_waiting[dir].count(who)) { - dout(10) << "faking export_dir_notify_ack from mds" << who - << " on " << *dir << " to mds" << export_peer[dir] - << endl; - export_notify_ack_waiting[dir].erase(who); - if (export_notify_ack_waiting[dir].empty()) - export_dir_acked(dir); - } - } - } - - // next! - p = next; - } - - - // check my imports - map::iterator q = import_state.begin(); - while (q != import_state.end()) { - map::iterator next = q; - next++; - inodeno_t dirino = q->first; - CInode *diri = mds->mdcache->get_inode(dirino); - CDir *dir = 0; - if (diri) - dir = diri->dir; - - if (import_peer[dirino] == who) { - switch (import_peer[dirino]) { - case IMPORT_DISCOVERED: - - break; - - case IMPORT_PREPPING: - - break; - - case IMPORT_PREPPED: - - break; - - case IMPORT_LOGGINGSTART: - - break; - - case IMPORT_ACKING: - // hrm. make this an ambiguous import, and wait for exporter recovery to disambiguate - // ... - break; - - case IMPORT_LOGGINGFINISH: - // do nothing, exporter is no longer involved. - break; - } - } - - // next! - q = next; - } -} - - - - - - -// ========================================================== -// EXPORT - - -class C_MDC_ExportFreeze : public Context { - Migrator *mig; - CDir *ex; // dir i'm exporting - int dest; - -public: - C_MDC_ExportFreeze(Migrator *m, CDir *e, int d) : - mig(m), ex(e), dest(d) {} - virtual void finish(int r) { - if (r >= 0) - mig->export_dir_frozen(ex, dest); - } -}; - - - -/** export_dir(dir, dest) - * public method to initiate an export. - * will fail if the directory is freezing, frozen, unpinnable, or root. 
- */ -void Migrator::export_dir(CDir *dir, - int dest) -{ - dout(7) << "export_dir " << *dir << " to " << dest << endl; - assert(dest != mds->get_nodeid()); - assert(!dir->is_hashed()); - - if (mds->mdsmap->is_degraded()) { - dout(7) << "cluster degraded, no exports for now" << endl; - return; - } - - if (dir->inode->is_root()) { - dout(7) << "i won't export root" << endl; - assert(0); - return; - } - - if (dir->is_frozen() || - dir->is_freezing()) { - dout(7) << " can't export, freezing|frozen. wait for other exports to finish first." << endl; - return; - } - if (dir->is_hashed()) { - dout(7) << "can't export hashed dir right now. implement me carefully later." << endl; - return; - } - - - // pin path? - vector trace; - cache->make_trace(trace, dir->inode); - if (!cache->path_pin(trace, 0, 0)) { - dout(7) << "export_dir couldn't pin path, failing." << endl; - return; - } - - // ok, let's go. - assert(export_state.count(dir) == 0); - export_state[dir] = EXPORT_DISCOVERING; - export_peer[dir] = dest; - - // send ExportDirDiscover (ask target) - mds->send_message_mds(new MExportDirDiscover(dir->inode), dest, MDS_PORT_MIGRATOR); - dir->auth_pin(); // pin dir, to hang up our freeze (unpin on prep ack) - - // take away the popularity we're sending. FIXME: do this later? - mds->balancer->subtract_export(dir); - - // freeze the subtree - dir->freeze_tree(new C_MDC_ExportFreeze(this, dir, dest)); -} - - -/* - * called on receipt of MExportDirDiscoverAck - * the importer now has the directory's _inode_ in memory, and pinned. 
- */ -void Migrator::handle_export_dir_discover_ack(MExportDirDiscoverAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "export_dir_discover_ack from " << m->get_source() - << " on " << *dir << ", releasing auth_pin" << endl; - - export_state[dir] = EXPORT_FREEZING; - - dir->auth_unpin(); // unpin to allow freeze to complete - - delete m; // done -} - -class C_MDC_ExportStartLogged : public Context { - Migrator *mig; - CDir *ex; // dir i'm exporting - int dest; - MExportDirPrep *prep; - -public: - C_MDC_ExportStartLogged(Migrator *m, CDir *e, int d, MExportDirPrep *p) : - mig(m), ex(e), dest(d), prep(p) {} - virtual void finish(int r) { - mig->export_dir_frozen_logged(ex, prep, dest); - } -}; - -void Migrator::export_dir_frozen(CDir *dir, - int dest) -{ - // subtree is now frozen! - dout(7) << "export_dir_frozen on " << *dir << " to " << dest << endl; - export_state[dir] = EXPORT_LOGGINGSTART; - - show_imports(); - - EExportStart *le = new EExportStart(dir, dest); - MExportDirPrep *prep = new MExportDirPrep(dir->inode); - - // include spanning tree for all nested exports. - // these need to be on the destination _before_ the final export so that - // dir_auth updates on any nested exports are properly absorbed. - - set inodes_added; - - // include base dir - prep->add_dir( new CDirDiscover(dir, dir->add_replica(dest)) ); - le->metablob.add_dir( dir, false ); - - // also include traces to all nested exports. 
- set my_nested; - cache->find_nested_exports(dir, my_nested); - for (set::iterator it = my_nested.begin(); - it != my_nested.end(); - it++) { - CDir *exp = *it; - - dout(7) << " including nested export " << *exp << " in prep" << endl; - - prep->add_export( exp->ino() ); - le->get_bounds().insert(exp->ino()); - le->metablob.add_dir_context( exp ); - le->metablob.add_dir( exp, false ); - - /* first assemble each trace, in trace order, and put in message */ - list inode_trace; - - // trace to dir - CDir *cur = exp; - while (cur != dir) { - // don't repeat ourselves - if (inodes_added.count(cur->ino())) break; // did already! - inodes_added.insert(cur->ino()); - - CDir *parent_dir = cur->get_parent_dir(); - - // inode? - assert(cur->inode->is_auth()); - inode_trace.push_front(cur->inode); - dout(7) << " will add " << *cur->inode << endl; - - // include dir? note: this'll include everything except the nested exports themselves, - // since someone else is obviously auth. - if (cur->is_auth()) { - prep->add_dir( new CDirDiscover(cur, cur->add_replica(dest)) ); // yay! - dout(7) << " added " << *cur << endl; - } - - cur = parent_dir; - } - - for (list::iterator it = inode_trace.begin(); - it != inode_trace.end(); - it++) { - CInode *in = *it; - dout(7) << " added " << *in << endl; - prep->add_inode( in->parent->get_dir()->ino(), - in->parent->get_name(), - in->replicate_to(dest) ); - } - - } - - // log our intentions - dout(7) << " logging EExportStart" << endl; - mds->mdlog->submit_entry(le, new C_MDC_ExportStartLogged(this, dir, dest, prep)); -} - -void Migrator::export_dir_frozen_logged(CDir *dir, MExportDirPrep *prep, int dest) -{ - dout(7) << "export_dir_frozen_logged " << *dir << endl; - - if (export_state.count(dir) == 0 || - export_state[dir] != EXPORT_LOGGINGSTART) { - // export must have aborted. 
- dout(7) << "export must have aborted, unfreezing and deleting me old prep message" << endl; - delete prep; - dir->unfreeze_tree(); // cancel the freeze - return; - } - - export_state[dir] = EXPORT_PREPPING; - mds->send_message_mds(prep, dest, MDS_PORT_MIGRATOR); -} - -void Migrator::handle_export_dir_prep_ack(MExportDirPrepAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "export_dir_prep_ack " << *dir << ", starting export" << endl; - - if (export_state.count(dir) == 0 || - export_state[dir] != EXPORT_PREPPING) { - // export must have aborted. - dout(7) << "export must have aborted, unfreezing" << endl; - dir->unfreeze_tree(); - return; - } - - // start export. - export_state[dir] = EXPORT_EXPORTING; - export_dir_go(dir, m->get_source().num()); - - // done - delete m; -} - - -void Migrator::export_dir_go(CDir *dir, - int dest) -{ - dout(7) << "export_dir_go " << *dir << " to " << dest << endl; - - show_imports(); - - assert(export_bounds.count(dir) == 0); - assert(export_data.count(dir) == 0); - - // update imports/exports - CDir *containing_import = cache->get_auth_container(dir); - - if (containing_import == dir) { - dout(7) << " i'm rexporting a previous import" << endl; - assert(dir->is_import()); - cache->imports.erase(dir); - dir->state_clear(CDIR_STATE_IMPORT); - dir->put(CDir::PIN_IMPORT); // unpin, no longer an import - - // discard nested exports (that we're handing off - for (set::iterator p = cache->nested_exports[dir].begin(); - p != cache->nested_exports[dir].end(); ) { - CDir *nested = *p; - p++; - - // add to export message - export_bounds[dir].insert(nested); - - // nested beneath our new export *in; remove! 
- dout(7) << " export " << *nested << " was nested beneath us; removing from export list(s)" << endl; - assert(cache->exports.count(nested) == 1); - cache->nested_exports[dir].erase(nested); - } - - } else { - dout(7) << " i'm a subdir nested under import " << *containing_import << endl; - cache->exports.insert(dir); - cache->nested_exports[containing_import].insert(dir); - - dir->state_set(CDIR_STATE_EXPORT); - dir->get(CDir::PIN_EXPORT); // i must keep it pinned - - // discard nested exports (that we're handing off) - for (set::iterator p = cache->nested_exports[containing_import].begin(); - p != cache->nested_exports[containing_import].end(); ) { - CDir *nested = *p; - p++; - if (nested == dir) continue; // ignore myself - - // container of parent; otherwise we get ourselves. - CDir *containing_export = nested->get_parent_dir(); - while (containing_export && !containing_export->is_export()) - containing_export = containing_export->get_parent_dir(); - if (!containing_export) continue; - - if (containing_export == dir) { - // nested beneath our new export *in; remove! - dout(7) << " export " << *nested << " was nested beneath us; removing from nested_exports" << endl; - cache->nested_exports[containing_import].erase(nested); - // exports.erase(nested); _walk does this - - // add to msg - export_bounds[dir].insert(nested); - } else { - dout(12) << " export " << *nested << " is under other export " << *containing_export << ", which is unrelated" << endl; - assert(cache->get_auth_container(containing_export) != containing_import); - } - } - } - - // note new authority (locally) - if (dir->inode->authority() == dest) - dir->set_dir_auth( CDIR_AUTH_PARENT ); - else - dir->set_dir_auth( dest ); - - - // make list of nodes i expect an export_dir_notify_ack from - // (everyone w/ this dir open, but me!) 
- assert(export_notify_ack_waiting[dir].empty()); - for (map::iterator it = dir->replicas_begin(); - it != dir->replicas_end(); - it++) { - if (it->first == mds->get_nodeid()) continue; - export_notify_ack_waiting[dir].insert( it->first ); - - // send warning to all but dest - if (it->first != dest) { - dout(10) << " sending export_dir_warning to mds" << it->first << endl; - mds->send_message_mds(new MExportDirWarning( dir->ino() ), it->first, MDS_PORT_MIGRATOR); - } - } - assert(export_notify_ack_waiting[dir].count( dest )); - - // fill export message with cache data - C_Contexts *fin = new C_Contexts; // collect all the waiters - int num_exported_inodes = encode_export_dir( export_data[dir], - fin, - dir, // base - dir, // recur start point - dest ); - - // send the export data! - MExportDir *req = new MExportDir(dir->ino()); - - // export state - req->set_dirstate( export_data[dir] ); - - // add bounds - for (set::iterator p = export_bounds[dir].begin(); - p != export_bounds[dir].end(); - ++p) - req->add_export((*p)->ino()); - - //s end - mds->send_message_mds(req, dest, MDS_PORT_MIGRATOR); - - // queue up the finisher - dir->add_waiter( CDIR_WAIT_UNFREEZE, fin ); - - - // stats - if (mds->logger) mds->logger->inc("ex"); - if (mds->logger) mds->logger->inc("iex", num_exported_inodes); - - show_imports(); -} - - -/** encode_export_inode - * update our local state for this inode to export. - * encode relevant state to be sent over the wire. - * used by: export_dir_walk, file_rename (if foreign) - */ -void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, int new_auth) -{ - // tell (all) clients about migrating caps.. 
mark STALE - for (map::iterator it = in->client_caps.begin(); - it != in->client_caps.end(); - it++) { - dout(7) << "encode_export_inode " << *in << " telling client" << it->first << " stale caps" << endl; - MClientFileCaps *m = new MClientFileCaps(in->inode, - it->second.get_last_seq(), - it->second.pending(), - it->second.wanted(), - MClientFileCaps::FILECAP_STALE); - mds->messenger->send_message(m, mds->clientmap.get_inst(it->first), - 0, MDS_PORT_CACHE); - } - - // relax locks? - if (!in->is_replicated()) - in->replicate_relax_locks(); - - // add inode - assert(!in->is_replica(mds->get_nodeid())); - CInodeExport istate( in ); - istate._encode( enc_state ); - - // we're export this inode; fix inode state - dout(7) << "encode_export_inode " << *in << endl; - - if (in->is_dirty()) in->mark_clean(); - - // clear/unpin cached_by (we're no longer the authority) - in->clear_replicas(); - - // twiddle lock states for auth -> replica transition - // hard - in->hardlock.clear_gather(); - if (in->hardlock.get_state() == LOCK_GLOCKR) - in->hardlock.set_state(LOCK_LOCK); - - // file : we lost all our caps, so move to stable state! - in->filelock.clear_gather(); - if (in->filelock.get_state() == LOCK_GLOCKR || - in->filelock.get_state() == LOCK_GLOCKM || - in->filelock.get_state() == LOCK_GLOCKL || - in->filelock.get_state() == LOCK_GLONERR || - in->filelock.get_state() == LOCK_GLONERM || - in->filelock.get_state() == LOCK_LONER) - in->filelock.set_state(LOCK_LOCK); - if (in->filelock.get_state() == LOCK_GMIXEDR) - in->filelock.set_state(LOCK_MIXED); - // this looks like a step backwards, but it's what we want! 
- if (in->filelock.get_state() == LOCK_GSYNCM) - in->filelock.set_state(LOCK_MIXED); - if (in->filelock.get_state() == LOCK_GSYNCL) - in->filelock.set_state(LOCK_LOCK); - if (in->filelock.get_state() == LOCK_GMIXEDL) - in->filelock.set_state(LOCK_LOCK); - //in->filelock.set_state(LOCK_MIXED); - - // mark auth - assert(in->is_auth()); - in->set_auth(false); - in->replica_nonce = CINODE_EXPORT_NONCE; - - // *** other state too? - - // move to end of LRU so we drop out of cache quickly! - if (in->get_parent_dn()) - cache->lru.lru_bottouch(in->get_parent_dn()); -} - - -int Migrator::encode_export_dir(list& dirstatelist, - C_Contexts *fin, - CDir *basedir, - CDir *dir, - int newauth) -{ - int num_exported = 0; - - dout(7) << "export_dir_walk " << *dir << " " << dir->nitems << " items" << endl; - - assert(dir->get_projected_version() == dir->get_version()); - - // dir - bufferlist enc_dir; - - CDirExport dstate(dir); - dstate._encode( enc_dir ); - - // release open_by - dir->clear_replicas(); - - // mark - assert(dir->is_auth()); - dir->state_clear(CDIR_STATE_AUTH); - dir->replica_nonce = CDIR_NONCE_EXPORT; - - // proxy - dir->state_set(CDIR_STATE_PROXY); - dir->get(CDir::PIN_PROXY); - export_proxy_dirinos[basedir].push_back(dir->ino()); - - list subdirs; - - if (dir->is_hashed()) { - // fix state - dir->state_clear( CDIR_STATE_AUTH ); - - } else { - - if (dir->is_dirty()) - dir->mark_clean(); - - // discard most dir state - dir->state &= CDIR_MASK_STATE_EXPORT_KEPT; // i only retain a few things. 
- - // suck up all waiters - list waiting; - dir->take_waiting(CDIR_WAIT_ANY, waiting); // all dir waiters - fin->take(waiting); - - // inodes - - CDir_map_t::iterator it; - for (it = dir->begin(); it != dir->end(); it++) { - CDentry *dn = it->second; - CInode *in = dn->get_inode(); - - num_exported++; - - // -- dentry - dout(7) << "export_dir_walk exporting " << *dn << endl; - _encode(it->first, enc_dir); - - if (dn->is_dirty()) - enc_dir.append("D", 1); // dirty - else - enc_dir.append("C", 1); // clean - - version_t dnv = dn->get_version(); - enc_dir.append((char*)&dnv, sizeof(dnv)); - - // null dentry? - if (dn->is_null()) { - enc_dir.append("N", 1); // null dentry - assert(dn->is_sync()); - continue; - } - - if (dn->is_remote()) { - // remote link - enc_dir.append("L", 1); // remote link - - inodeno_t ino = dn->get_remote_ino(); - enc_dir.append((char*)&ino, sizeof(ino)); - continue; - } - - // primary link - // -- inode - enc_dir.append("I", 1); // inode dentry - - encode_export_inode(in, enc_dir, newauth); // encode, and (update state for) export - - // directory? - if (in->is_dir() && in->dir) { - if (in->dir->is_auth()) { - // nested subdir - assert(in->dir->get_dir_auth() == CDIR_AUTH_PARENT); - subdirs.push_back(in->dir); // it's ours, recurse (later) - - } else { - // nested export - assert(in->dir->get_dir_auth() >= 0); - dout(7) << " encountered nested export " << *in->dir << " dir_auth " << in->dir->get_dir_auth() << "; removing from exports" << endl; - assert(cache->exports.count(in->dir) == 1); - cache->exports.erase(in->dir); // discard nested export (nested_exports updated above) - - in->dir->state_clear(CDIR_STATE_EXPORT); - in->dir->put(CDir::PIN_EXPORT); - - // simplify dir_auth? 
- if (in->dir->get_dir_auth() == newauth) - in->dir->set_dir_auth( CDIR_AUTH_PARENT ); - } - } - - // add to proxy - export_proxy_inos[basedir].push_back(in->ino()); - in->state_set(CInode::STATE_PROXY); - in->get(CInode::PIN_PROXY); - - // waiters - list waiters; - in->take_waiting(CINODE_WAIT_ANY, waiters); - fin->take(waiters); - } - } - - // add to dirstatelist - bufferlist bl; - dirstatelist.push_back( bl ); - dirstatelist.back().claim( enc_dir ); - - // subdirs - for (list::iterator it = subdirs.begin(); it != subdirs.end(); it++) - num_exported += encode_export_dir(dirstatelist, fin, basedir, *it, newauth); - - return num_exported; -} - - -class C_MDS_ExportFinishLogged : public Context { - Migrator *migrator; - CDir *dir; -public: - C_MDS_ExportFinishLogged(Migrator *m, CDir *d) : migrator(m), dir(d) {} - void finish(int r) { - migrator->export_dir_finish(dir); - } -}; - - -/* - * i should get an export_dir_notify_ack from every mds that had me open, including the new auth (an ack) - */ -void Migrator::handle_export_dir_notify_ack(MExportDirNotifyAck *m) -{ - CInode *diri = cache->get_inode(m->get_ino()); - CDir *dir = diri->dir; - assert(dir); - assert(dir->is_frozen_tree_root()); // i'm exporting! - - // remove from waiting list - int from = m->get_source().num(); - assert(export_notify_ack_waiting[dir].count(from)); - export_notify_ack_waiting[dir].erase(from); - - dout(7) << "handle_export_dir_notify_ack on " << *dir << " from " << from - << ", still need (" << export_notify_ack_waiting[dir] << ")" << endl; - - // done? 
- if (export_notify_ack_waiting[dir].empty()) { - export_dir_acked(dir); - } else { - dout(7) << "handle_export_dir_notify_ack on " << *dir << " from " << from - << ", still waiting for " << export_notify_ack_waiting[dir] << endl; - } - - delete m; -} - - - -/* - * this happens if hte dest failes after i send teh export data but before it is acked - * that is, we don't know they safely received and logged it, so we reverse our changes - * and go on. - */ -void Migrator::reverse_export(CDir *dir) -{ - dout(7) << "reverse_export " << *dir << endl; - - assert(export_state[dir] == EXPORT_EXPORTING); - assert(export_bounds.count(dir)); - assert(export_data.count(dir)); - - // re-import it. - set bounds; - bounds.swap(export_bounds[dir]); - export_bounds.erase(dir); - - // -- adjust dir_auth -- - // base - CDir *im = dir; - if (dir->get_inode()->authority() == mds->get_nodeid()) { - // parent is already me. was export, adding back to existing import. - im = mds->mdcache->get_auth_container(dir); - assert(im); - mds->mdcache->nested_exports[im].erase(dir); - mds->mdcache->exports.erase(dir); - dir->set_dir_auth( CDIR_AUTH_PARENT ); - dir->state_clear(CDIR_STATE_EXPORT); - dir->put(CDir::PIN_EXPORT); - } else { - // parent isn't me. new import. - mds->mdcache->imports.insert(dir); - dir->set_dir_auth( mds->get_nodeid() ); - dir->state_set(CDIR_STATE_IMPORT); - dir->get(CDir::PIN_IMPORT); - } - - dout(10) << " base " << *dir << endl; - if (dir != im) - dout(10) << " under " << *im << endl; - - // bounds - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *bd = *p; - - if (bd->get_dir_auth() == mds->get_nodeid()) { - // still me. was an import. - mds->mdcache->imports.erase(bd); - bd->set_dir_auth( CDIR_AUTH_PARENT ); - bd->state_clear(CDIR_STATE_IMPORT); - bd->put(CDir::PIN_IMPORT); - // move nested exports. 
- for (set::iterator q = mds->mdcache->nested_exports[bd].begin(); - q != mds->mdcache->nested_exports[bd].end(); - ++q) - mds->mdcache->nested_exports[im].insert(*q); - mds->mdcache->nested_exports.erase(bd); - } else { - // not me anymore. now an export. - mds->mdcache->exports.insert(bd); - mds->mdcache->nested_exports[im].insert(bd); - assert(bd->get_dir_auth() != CDIR_AUTH_PARENT); - bd->set_dir_auth( CDIR_AUTH_UNKNOWN ); - bd->state_set(CDIR_STATE_EXPORT); - bd->get(CDir::PIN_EXPORT); - } - - dout(10) << " bound " << *bd << endl; - } - - - // reimport the dirs - list imported_subdirs; - int num_imported_inodes = 0; - - for (list::iterator p = export_data[dir].begin(); - p != export_data[dir].end(); - ++p) { - num_imported_inodes += - decode_import_dir(*p, - export_peer[dir], - dir, // import root - imported_subdirs, - 0); - } - - // remove proxy bits - clear_export_proxy_pins(dir); - - // some clean up - export_data.erase(dir); - export_bounds.erase(dir); - export_notify_ack_waiting.erase(dir); -} - - -void Migrator::export_dir_acked(CDir *dir) -{ - dout(7) << "export_dir_acked " << *dir << endl; - export_notify_ack_waiting.erase(dir); - - export_state[dir] = EXPORT_LOGGINGFINISH; - export_data.erase(dir); - export_bounds.erase(dir); - - // log export completion, then finish (unfreeze, trigger finish context, etc.) - mds->mdlog->submit_entry(new EExportFinish(dir, true), - new C_MDS_ExportFinishLogged(this, dir)); -} - - -/* - * once i get all teh notify_acks i can finish - */ -void Migrator::export_dir_finish(CDir *dir) -{ - dout(7) << "export_dir_finish " << *dir << endl; - - if (export_state.count(dir)) { - // send finish/commit to new auth - mds->send_message_mds(new MExportDirFinish(dir->ino()), dir->authority(), MDS_PORT_MIGRATOR); - - // remove from exporting list - export_state.erase(dir); - export_peer.erase(dir); - } else { - dout(7) << "target must have failed, not sending final commit message. export succeeded anyway." 
<< endl; - } - - // unfreeze - dout(7) << "export_dir_finish unfreezing" << endl; - dir->unfreeze_tree(); - - // unpin path - dout(7) << "export_dir_finish unpinning path" << endl; - vector trace; - cache->make_trace(trace, dir->inode); - cache->path_unpin(trace, 0); - - // unpin proxies - clear_export_proxy_pins(dir); - - // queue finishers - mds->queue_finished(export_finish_waiters[dir]); - export_finish_waiters.erase(dir); - - // stats - if (mds->logger) mds->logger->set("nex", cache->exports.size()); - - show_imports(); - - // send pending import_maps? - mds->mdcache->send_pending_import_maps(); -} - - -void Migrator::clear_export_proxy_pins(CDir *dir) -{ - dout(10) << "clear_export_proxy_pins " << *dir << endl; - - // inodes - for (list::iterator it = export_proxy_inos[dir].begin(); - it != export_proxy_inos[dir].end(); - it++) { - CInode *in = cache->get_inode(*it); - dout(15) << " " << *in << endl; - in->put(CInode::PIN_PROXY); - assert(in->state_test(CInode::STATE_PROXY)); - in->state_clear(CInode::STATE_PROXY); - } - export_proxy_inos.erase(dir); - - // dirs - for (list::iterator it = export_proxy_dirinos[dir].begin(); - it != export_proxy_dirinos[dir].end(); - it++) { - CDir *dir = cache->get_inode(*it)->dir; - dout(15) << " " << *dir << endl; - dir->put(CDir::PIN_PROXY); - assert(dir->state_test(CDIR_STATE_PROXY)); - dir->state_clear(CDIR_STATE_PROXY); - - // hose neg dentries, too, since we're no longer auth - CDir_map_t::iterator it; - for (it = dir->begin(); it != dir->end(); ) { - CDentry *dn = it->second; - it++; - if (dn->is_null()) { - assert(dn->is_sync()); - dir->remove_dentry(dn); - } else { - //dout(10) << "export_dir_notify_ack leaving xlocked neg " << *dn << endl; - if (dn->is_dirty()) - dn->mark_clean(); - } - } - } - export_proxy_dirinos.erase(dir); -} - - - - - - -// ========================================================== -// IMPORT - - -class C_MDC_ExportDirDiscover : public Context { - Migrator *mig; - MExportDirDiscover *m; 
-public: - vector trace; - C_MDC_ExportDirDiscover(Migrator *mig_, MExportDirDiscover *m_) : - mig(mig_), m(m_) {} - void finish(int r) { - CInode *in = 0; - if (r >= 0) in = trace[trace.size()-1]->get_inode(); - mig->handle_export_dir_discover_2(m, in, r); - } -}; - -void Migrator::handle_export_dir_discover(MExportDirDiscover *m) -{ - assert(m->get_source().num() != mds->get_nodeid()); - - dout(7) << "handle_export_dir_discover on " << m->get_path() << endl; - - // must discover it! - C_MDC_ExportDirDiscover *onfinish = new C_MDC_ExportDirDiscover(this, m); - filepath fpath(m->get_path()); - cache->path_traverse(fpath, onfinish->trace, true, - m, new C_MDS_RetryMessage(mds,m), // on delay/retry - MDS_TRAVERSE_DISCOVER, - onfinish); // on completion|error -} - -void Migrator::handle_export_dir_discover_2(MExportDirDiscover *m, CInode *in, int r) -{ - // yay! - if (in) { - dout(7) << "handle_export_dir_discover_2 has " << *in << endl; - } - - if (r < 0 || !in->is_dir()) { - dout(7) << "handle_export_dir_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << endl; - - assert(0); // this shouldn't happen if the auth pins his path properly!!!! - - mds->send_message_mds(new MExportDirDiscoverAck(m->get_ino(), false), - m->get_source().num(), MDS_PORT_MIGRATOR); - delete m; - return; - } - - assert(in->is_dir()); - - if (in->is_frozen()) { - dout(7) << "frozen, waiting." << endl; - in->add_waiter(CINODE_WAIT_AUTHPINNABLE, - new C_MDS_RetryMessage(mds,m)); - return; - } - - // pin inode in the cache (for now) - in->get(CInode::PIN_IMPORTING); - - // pin auth too, until the import completes. 
- in->auth_pin(); - - import_state[in->ino()] = IMPORT_DISCOVERED; - import_peer[in->ino()] = m->get_source().num(); - - - // reply - dout(7) << " sending export_dir_discover_ack on " << *in << endl; - mds->send_message_mds(new MExportDirDiscoverAck(in->ino()), - m->get_source().num(), MDS_PORT_MIGRATOR); - delete m; -} - - - -void Migrator::handle_export_dir_prep(MExportDirPrep *m) -{ - assert(m->get_source().num() != mds->get_nodeid()); - - CInode *diri = cache->get_inode(m->get_ino()); - assert(diri); - - list finished; - - // assimilate root dir. - CDir *dir = diri->dir; - if (dir) { - dout(7) << "handle_export_dir_prep on " << *dir << " (had dir)" << endl; - - if (!m->did_assim()) - m->get_dir(diri->ino())->update_dir(dir); - } else { - assert(!m->did_assim()); - - // open dir i'm importing. - diri->set_dir( new CDir(diri, mds->mdcache, false) ); - dir = diri->dir; - m->get_dir(diri->ino())->update_dir(dir); - - dout(7) << "handle_export_dir_prep on " << *dir << " (opening dir)" << endl; - - diri->take_waiting(CINODE_WAIT_DIR, finished); - } - assert(dir->is_auth() == false); - - show_imports(); - - // assimilate contents? - if (!m->did_assim()) { - dout(7) << "doing assim on " << *dir << endl; - m->mark_assim(); // only do this the first time! 
- - // move pin to dir - diri->put(CInode::PIN_IMPORTING); - dir->get(CDir::PIN_IMPORTING); - - // auth pin too - dir->auth_pin(); - diri->auth_unpin(); - - // change import state - import_state[diri->ino()] = IMPORT_PREPPING; - - // assimilate traces to exports - for (list::iterator it = m->get_inodes().begin(); - it != m->get_inodes().end(); - it++) { - // inode - CInode *in = cache->get_inode( (*it)->get_ino() ); - if (in) { - (*it)->update_inode(in); - dout(7) << " updated " << *in << endl; - } else { - in = new CInode(mds->mdcache, false); - (*it)->update_inode(in); - - // link to the containing dir - CInode *condiri = cache->get_inode( m->get_containing_dirino(in->ino()) ); - assert(condiri && condiri->dir); - cache->add_inode( in ); - condiri->dir->add_dentry( m->get_dentry(in->ino()), in ); - - dout(7) << " added " << *in << endl; - } - - assert( in->get_parent_dir()->ino() == m->get_containing_dirino(in->ino()) ); - - // dir - if (m->have_dir(in->ino())) { - if (in->dir) { - m->get_dir(in->ino())->update_dir(in->dir); - dout(7) << " updated " << *in->dir << endl; - } else { - in->set_dir( new CDir(in, mds->mdcache, false) ); - m->get_dir(in->ino())->update_dir(in->dir); - dout(7) << " added " << *in->dir << endl; - in->take_waiting(CINODE_WAIT_DIR, finished); - } - } - } - - // open export dirs? - for (list::iterator it = m->get_exports().begin(); - it != m->get_exports().end(); - it++) { - dout(7) << " checking dir " << hex << *it << dec << endl; - CInode *in = cache->get_inode(*it); - assert(in); - - // note bound. - import_bounds[dir->ino()].insert(*it); - - if (!in->dir) { - dout(7) << " opening nested export on " << *in << endl; - cache->open_remote_dir(in, - new C_MDS_RetryMessage(mds, m)); - - // pin it! 
- in->get(CInode::PIN_OPENINGDIR); - in->state_set(CInode::STATE_OPENINGDIR); - } - } - } else { - dout(7) << " not doing assim on " << *dir << endl; - } - - - // verify we have all exports - int waiting_for = 0; - for (list::iterator it = m->get_exports().begin(); - it != m->get_exports().end(); - it++) { - inodeno_t ino = *it; - CInode *in = cache->get_inode(ino); - if (!in) dout(0) << "** missing ino " << hex << ino << dec << endl; - assert(in); - if (in->dir) { - if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) { - dout(7) << " pinning nested export " << *in->dir << endl; - in->dir->get(CDir::PIN_IMPORTINGEXPORT); - in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT); - - if (in->state_test(CInode::STATE_OPENINGDIR)) { - in->put(CInode::PIN_OPENINGDIR); - in->state_clear(CInode::STATE_OPENINGDIR); - } - } else { - dout(7) << " already pinned nested export " << *in << endl; - } - } else { - dout(7) << " waiting for nested export dir on " << *in << endl; - waiting_for++; - } - } - if (waiting_for) { - dout(7) << " waiting for " << waiting_for << " nested export dir opens" << endl; - } else { - // ok! - dout(7) << " all ready, sending export_dir_prep_ack on " << *dir << endl; - mds->send_message_mds(new MExportDirPrepAck(dir->ino()), - m->get_source().num(), MDS_PORT_MIGRATOR); - - // note new state - import_state[diri->ino()] = IMPORT_PREPPED; - - // done - delete m; - } - - // finish waiters - finish_contexts(finished, 0); -} - - - - -/* this guy waits for the pre-import discovers on hashed directory dir inodes to finish. - * if it's the last one on the dir, it reprocessed the import. 
- */ -/* -class C_MDS_ImportPrediscover : public Context { -public: - MDS *mds; - MExportDir *m; - inodeno_t dir_ino; - string dentry; - C_MDS_ImportPrediscover(MDS *mds, MExportDir *m, inodeno_t dir_ino, const string& dentry) { - this->mds = mds; - this->m = m; - this->dir_ino = dir_ino; - this->dentry = dentry; - } - virtual void finish(int r) { - assert(r == 0); // should never fail! - - m->remove_prediscover(dir_ino, dentry); - - if (!m->any_prediscovers()) - mds->mdcache->handle_export_dir(m); - } -}; -*/ - -class C_MDS_ImportDirLoggedStart : public Context { - Migrator *migrator; - CDir *dir; - int from; - list imported_subdirs; - list exports; -public: - C_MDS_ImportDirLoggedStart(Migrator *m, CDir *d, int f, - list& is, list& e) : - migrator(m), dir(d), from(f) { - imported_subdirs.swap(is); - exports.swap(e); - } - void finish(int r) { - migrator->import_dir_logged_start(dir, from, imported_subdirs, exports); - } -}; - -void Migrator::handle_export_dir(MExportDir *m) -{ - CInode *diri = cache->get_inode(m->get_ino()); - assert(diri); - CDir *dir = diri->dir; - assert(dir); - - int oldauth = m->get_source().num(); - dout(7) << "handle_export_dir importing " << *dir << " from " << oldauth << endl; - assert(dir->is_auth() == false); - - show_imports(); - - // start the journal entry - EImportStart *le = new EImportStart(dir->ino(), m->get_exports()); - le->metablob.add_dir_context(dir); - - // note new authority (locally) - CDir *im = dir; - if (dir->inode->is_auth()) { - // parent is already me. was export, adding back to existing import. - im = mds->mdcache->get_auth_container(dir); - assert(im); - mds->mdcache->nested_exports[im].erase(dir); - mds->mdcache->exports.erase(dir); - dir->set_dir_auth( CDIR_AUTH_PARENT ); - dir->state_clear(CDIR_STATE_EXPORT); - dir->put(CDir::PIN_EXPORT); - } else { - // parent isn't me. new import. 
- mds->mdcache->imports.insert(dir); - dir->set_dir_auth( mds->get_nodeid() ); - dir->state_set(CDIR_STATE_IMPORT); - dir->get(CDir::PIN_IMPORT); - } - - // take out my temp pin - dir->put(CDir::PIN_IMPORTING); - - // mark import point frozen - // (note: this is a manual freeze.. hack hack hack!) - dir->get_inode()->auth_pin(); - dir->state_set(CDIR_STATE_FROZENTREE); - - dout(10) << " base " << *dir << endl; - if (dir != im) - dout(10) << " under " << *im << endl; - - // bounds - for (list::iterator it = m->get_exports().begin(); - it != m->get_exports().end(); - it++) { - CInode *bdi = cache->get_inode(*it); - CDir *bd = bdi->dir; - - if (bd->get_dir_auth() == mds->get_nodeid()) { - // still me. was an import. - assert(bd->is_import()); - mds->mdcache->imports.erase(bd); - bd->set_dir_auth( CDIR_AUTH_PARENT ); - bd->state_clear(CDIR_STATE_IMPORT); - bd->put(CDir::PIN_IMPORT); - // move nested exports. - for (set::iterator q = mds->mdcache->nested_exports[bd].begin(); - q != mds->mdcache->nested_exports[bd].end(); - ++q) - mds->mdcache->nested_exports[im].insert(*q); - mds->mdcache->nested_exports.erase(bd); - } else { - // not me anymore. now an export. 
- mds->mdcache->exports.insert(bd); - mds->mdcache->nested_exports[im].insert(bd); - assert(bd->get_dir_auth() != CDIR_AUTH_PARENT); - bd->set_dir_auth( CDIR_AUTH_UNKNOWN ); - bd->state_set(CDIR_STATE_EXPORT); - bd->get(CDir::PIN_EXPORT); - } - - // mark export point frozenleaf - bd->get(CDir::PIN_FREEZELEAF); - bd->state_set(CDIR_STATE_FROZENTREELEAF); - assert(import_bounds[dir->ino()].count(*it)); // we took note during prep stage - - // remove our pin - bd->put(CDir::PIN_IMPORTINGEXPORT); - bd->state_clear(CDIR_STATE_IMPORTINGEXPORT); - - dout(10) << " bound " << *bd << endl; - } - - // add this crap to my cache - list imported_subdirs; - int num_imported_inodes = 0; - - for (list::iterator p = m->get_dirstate().begin(); - p != m->get_dirstate().end(); - ++p) { - num_imported_inodes += - decode_import_dir(*p, - oldauth, - dir, // import root - imported_subdirs, - le); - } - dout(10) << " " << imported_subdirs.size() << " imported subdirs" << endl; - dout(10) << " " << m->get_exports().size() << " imported nested exports" << endl; - - - // adjust popularity - mds->balancer->add_import(dir); - - dout(7) << "handle_export_dir did " << *dir << endl; - - // log it - mds->mdlog->submit_entry(le, - new C_MDS_ImportDirLoggedStart(this, dir, m->get_source().num(), - imported_subdirs, m->get_exports())); - - // note state - import_state[dir->ino()] = IMPORT_LOGGINGSTART; - - // some stats - if (mds->logger) { - mds->logger->inc("im"); - mds->logger->inc("iim", num_imported_inodes); - mds->logger->set("nim", cache->imports.size()); - } - - delete m; -} - - -void Migrator::import_dir_logged_start(CDir *dir, int from, - list &imported_subdirs, - list &exports) -{ - dout(7) << "import_dir_logged " << *dir << endl; - - // note state - import_state[dir->ino()] = IMPORT_ACKING; - - // send notify's etc. 
- dout(7) << "sending notifyack for " << *dir << " to old auth mds" << from << endl; - mds->send_message_mds(new MExportDirNotifyAck(dir->inode->ino()), - from, MDS_PORT_MIGRATOR); - - dout(7) << "sending notify to others" << endl; - for (map::iterator it = dir->replicas_begin(); - it != dir->replicas_end(); - it++) { - assert( it->first != mds->get_nodeid() ); - if ( it->first == from ) continue; // not to old auth. - - MExportDirNotify *notify = new MExportDirNotify(dir->ino(), from, mds->get_nodeid()); - notify->copy_exports(exports); - - if (g_conf.mds_verify_export_dirauth) - notify->copy_subdirs(imported_subdirs); // copy subdir list (DEBUG) - - mds->send_message_mds(notify, it->first, MDS_PORT_MIGRATOR); - } - - show_imports(); -} - - -class C_MDS_ImportDirLoggedFinish : public Context { - Migrator *migrator; - CDir *dir; -public: - C_MDS_ImportDirLoggedFinish(Migrator *m, CDir *d) : migrator(m), dir(d) { } - void finish(int r) { - migrator->import_dir_logged_finish(dir); - } -}; - -void Migrator::handle_export_dir_finish(MExportDirFinish *m) -{ - CInode *diri = cache->get_inode(m->get_ino()); - CDir *dir = diri->dir; - assert(dir); - - dout(7) << "handle_export_dir_finish logging import_finish on " << *dir << endl; - assert(dir->is_auth()); - - // note state - import_state[dir->ino()] = IMPORT_LOGGINGFINISH; - - // log - mds->mdlog->submit_entry(new EImportFinish(dir, true), - new C_MDS_ImportDirLoggedFinish(this,dir)); - delete m; -} - -void Migrator::import_dir_logged_finish(CDir *dir) -{ - dout(7) << "import_dir_logged_finish " << *dir << endl; - - // un auth pin (other exports can now proceed) - dir->auth_unpin(); - - // unfreeze! 
- for (set::iterator p = import_bounds[dir->ino()].begin(); - p != import_bounds[dir->ino()].end(); - ++p) { - CInode *diri = mds->mdcache->get_inode(*p); - CDir *dir = diri->dir; - assert(dir->state_test(CDIR_STATE_FROZENTREELEAF)); - dir->put(CDir::PIN_FREEZELEAF); - dir->state_clear(CDIR_STATE_FROZENTREELEAF); - } - - dir->unfreeze_tree(); - - // clear import state (we're done!) - import_state.erase(dir->ino()); - import_peer.erase(dir->ino()); - import_bounds.erase(dir->ino()); - - // ok now finish contexts - dout(5) << "finishing any waiters on imported data" << endl; - dir->finish_waiting(CDIR_WAIT_IMPORTED); - - // log it - if (mds->logger) { - mds->logger->set("nex", cache->exports.size()); - mds->logger->set("nim", cache->imports.size()); - } - show_imports(); - - // is it empty? - if (dir->get_size() == 0 && - !dir->inode->is_auth()) { - // reexport! - export_empty_import(dir); - } -} - - -void Migrator::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int oldauth) -{ - CInodeExport istate; - off = istate._decode(bl, off); - dout(15) << "got a cinodeexport " << endl; - - bool added = false; - CInode *in = cache->get_inode(istate.get_ino()); - if (!in) { - in = new CInode(mds->mdcache); - added = true; - } else { - in->set_auth(true); - } - - // state after link -- or not! -sage - set merged_client_caps; - istate.update_inode(in, merged_client_caps); - - // link before state -- or not! -sage - if (dn->inode != in) { - assert(!dn->inode); - dn->dir->link_inode(dn, in); - } - - // add inode? 
- if (added) { - cache->add_inode(in); - dout(10) << "added " << *in << endl; - } else { - dout(10) << " had " << *in << endl; - } - - - // adjust replica list - //assert(!in->is_replica(oldauth)); // not true on failed export - in->add_replica( oldauth, CINODE_EXPORT_NONCE ); - if (in->is_replica(mds->get_nodeid())) - in->remove_replica(mds->get_nodeid()); - - // twiddle locks - // hard - if (in->hardlock.get_state() == LOCK_GLOCKR) { - in->hardlock.gather_set.erase(mds->get_nodeid()); - in->hardlock.gather_set.erase(oldauth); - if (in->hardlock.gather_set.empty()) - mds->locker->inode_hard_eval(in); - } - - // caps - for (set::iterator it = merged_client_caps.begin(); - it != merged_client_caps.end(); - it++) { - MClientFileCaps *caps = new MClientFileCaps(in->inode, - in->client_caps[*it].get_last_seq(), - in->client_caps[*it].pending(), - in->client_caps[*it].wanted(), - MClientFileCaps::FILECAP_REAP); - caps->set_mds( oldauth ); // reap from whom? - mds->messenger->send_message(caps, - mds->clientmap.get_inst(*it), - 0, MDS_PORT_CACHE); - } - - // filelock - if (!in->filelock.is_stable()) { - // take me and old auth out of gather set - in->filelock.gather_set.erase(mds->get_nodeid()); - in->filelock.gather_set.erase(oldauth); - if (in->filelock.gather_set.empty()) // necessary but not suffient... 
- mds->locker->inode_file_eval(in); - } -} - - -int Migrator::decode_import_dir(bufferlist& bl, - int oldauth, - CDir *import_root, - list& imported_subdirs, - EImportStart *le) -{ - int off = 0; - - // set up dir - CDirExport dstate; - off = dstate._decode(bl, off); - - CInode *diri = cache->get_inode(dstate.get_ino()); - assert(diri); - CDir *dir = diri->get_or_open_dir(mds->mdcache); - assert(dir); - - dout(7) << "decode_import_dir " << *dir << endl; - - // add to list - if (dir != import_root) - imported_subdirs.push_back(dir->ino()); - - // assimilate state - dstate.update_dir( dir ); - - // mark (may already be marked from get_or_open_dir() above) - if (!dir->is_auth()) - dir->state_set(CDIR_STATE_AUTH); - - // adjust replica list - //assert(!dir->is_replica(oldauth)); // not true on failed export - dir->add_replica(oldauth); - if (dir->is_replica(mds->get_nodeid())) - dir->remove_replica(mds->get_nodeid()); - - // add to journal entry - if (le) - le->metablob.add_dir(dir, true); // Hmm: false would be okay in some cases - - int num_imported = 0; - - if (dir->is_hashed()) { - - // do nothing; dir is hashed - } else { - // take all waiters on this dir - // NOTE: a pass of imported data is guaranteed to get all of my waiters because - // a replica's presense in my cache implies/forces it's presense in authority's. 
- list waiters; - - dir->take_waiting(CDIR_WAIT_ANY, waiters); - for (list::iterator it = waiters.begin(); - it != waiters.end(); - it++) - import_root->add_waiter(CDIR_WAIT_IMPORTED, *it); - - dout(15) << "doing contents" << endl; - - // contents - long nden = dstate.get_nden(); - - for (; nden>0; nden--) { - - num_imported++; - - // dentry - string dname; - _decode(dname, bl, off); - dout(15) << "dname is " << dname << endl; - - char dirty; - bl.copy(off, 1, &dirty); - off++; - - version_t dnv; - bl.copy(off, sizeof(dnv), (char*)&dnv); - off += sizeof(dnv); - - char icode; - bl.copy(off, 1, &icode); - off++; - - CDentry *dn = dir->lookup(dname); - if (!dn) - dn = dir->add_dentry(dname); // null - - // mark dentry dirty? - if (dirty == 'D') - dn->_mark_dirty(); - - dn->set_version( dnv ); - dn->set_projected_version( dnv ); - - if (icode == 'N') { - // null dentry - assert(dn->is_null()); - - // fall thru - } - else if (icode == 'L') { - // remote link - inodeno_t ino; - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - dir->link_inode(dn, ino); - } - else if (icode == 'I') { - // inode - decode_import_inode(dn, bl, off, oldauth); - } - - // add dentry to journal entry - if (le) - le->metablob.add_dentry(dn, true); // Hmm: might we do dn->is_dirty() here instead? - } - - } - - dout(7) << "decode_import_dir done " << *dir << endl; - return num_imported; -} - - - - - -// authority bystander - -void Migrator::handle_export_dir_warning(MExportDirWarning *m) -{ - // add to warning list - stray_export_warnings.insert( m->get_ino() ); - - // did i already see the notify? - if (stray_export_notifies.count(m->get_ino())) { - // i did, we're good. - dout(7) << "handle_export_dir_warning on " << m->get_ino() << ". already got notify." 
<< endl; - - // process the notify - map::iterator it = stray_export_notifies.find(m->get_ino()); - handle_export_dir_notify(it->second); - stray_export_notifies.erase(it); - } else { - dout(7) << "handle_export_dir_warning on " << m->get_ino() << ". waiting for notify." << endl; - } - - // done - delete m; -} - - -void Migrator::handle_export_dir_notify(MExportDirNotify *m) -{ - CDir *dir = 0; - CInode *in = cache->get_inode(m->get_ino()); - if (in) dir = in->dir; - - // did i see the warning yet? - if (!stray_export_warnings.count(m->get_ino())) { - // wait for it. - dout(7) << "export_dir_notify on " << m->get_ino() << ", waiting for warning." << endl; - stray_export_notifies.insert(pair( m->get_ino(), m )); - return; - } - - // i did, we're all good. - dout(7) << "export_dir_notify on " << m->get_ino() << ", already saw warning." << endl; - - // update dir_auth! - if (dir) { - dout(7) << "export_dir_notify on " << *dir << " new_auth " << m->get_new_auth() << " (old_auth " << m->get_old_auth() << ")" << endl; - - // update bounds first - for (list::iterator it = m->get_exports().begin(); - it != m->get_exports().end(); - it++) { - CInode *n = cache->get_inode(*it); - if (!n) continue; - CDir *ndir = n->dir; - if (!ndir) continue; - - int boundauth = ndir->authority(); - dout(7) << "export_dir_notify bound " << *ndir << " was dir_auth " << ndir->get_dir_auth() << " (" << boundauth << ")" << endl; - if (ndir->get_dir_auth() == CDIR_AUTH_PARENT) { - if (boundauth != m->get_new_auth()) - ndir->set_dir_auth( boundauth ); - else assert(dir->authority() == m->get_new_auth()); // apparently we already knew! 
- } else { - if (boundauth == m->get_new_auth()) - ndir->set_dir_auth( CDIR_AUTH_PARENT ); - } - } - - // update dir_auth - if (in->authority() == m->get_new_auth()) { - dout(7) << "handle_export_dir_notify on " << *in << ": inode auth is the same, setting dir_auth -1" << endl; - dir->set_dir_auth( CDIR_AUTH_PARENT ); - assert(!in->is_auth()); - assert(!dir->is_auth()); - } else { - dir->set_dir_auth( m->get_new_auth() ); - } - assert(dir->authority() != mds->get_nodeid()); - assert(!dir->is_auth()); - - // DEBUG: verify subdirs - if (g_conf.mds_verify_export_dirauth) { - - dout(7) << "handle_export_dir_notify on " << *dir << " checking " << m->num_subdirs() << " subdirs" << endl; - for (list::iterator it = m->subdirs_begin(); - it != m->subdirs_end(); - it++) { - CInode *diri = cache->get_inode(*it); - if (!diri) continue; // don't have it, don't care - if (!diri->dir) continue; - dout(10) << "handle_export_dir_notify checking subdir " << *diri->dir << " is auth " << diri->dir->get_dir_auth() << endl; - assert(diri->dir != dir); // base shouldn't be in subdir list - if (diri->dir->get_dir_auth() != CDIR_AUTH_PARENT) { - dout(7) << "*** weird value for dir_auth " << diri->dir->get_dir_auth() << " on " << *diri->dir << ", should have been -1 probably??? ******************" << endl; - assert(0); // bad news! 
- //dir->set_dir_auth( CDIR_AUTH_PARENT ); - } - assert(diri->dir->authority() == m->get_new_auth()); - } - } - } - - // send notify ack to old auth - dout(7) << "handle_export_dir_notify sending ack to old_auth " << m->get_old_auth() << endl; - mds->send_message_mds(new MExportDirNotifyAck(m->get_ino()), - m->get_old_auth(), MDS_PORT_MIGRATOR); - - - // done - stray_export_warnings.erase( m->get_ino() ); - delete m; -} - - - - - -// ======================================================================= -// HASHING - - -void Migrator::import_hashed_content(CDir *dir, bufferlist& bl, int nden, int oldauth) -{ - int off = 0; - - for (; nden>0; nden--) { - // dentry - string dname; - _decode(dname, bl, off); - dout(15) << "dname is " << dname << endl; - - char icode; - bl.copy(off, 1, &icode); - off++; - - CDentry *dn = dir->lookup(dname); - if (!dn) - dn = dir->add_dentry(dname); // null - - // mark dn dirty _after_ we link the inode (scroll down) - - if (icode == 'N') { - - // null dentry - assert(dn->is_null()); - - // fall thru - } - else if (icode == 'L') { - // remote link - inodeno_t ino; - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - dir->link_inode(dn, ino); - } - else if (icode == 'I') { - // inode - decode_import_inode(dn, bl, off, oldauth); - - // fix up subdir export? - if (dn->inode->dir) { - assert(dn->inode->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)); - dn->inode->dir->put(CDir::PIN_IMPORTINGEXPORT); - dn->inode->dir->state_clear(CDIR_STATE_IMPORTINGEXPORT); - - if (dn->inode->dir->is_auth()) { - // mine. must have been an import. 
- assert(dn->inode->dir->is_import()); - dout(7) << "unimporting subdir now that inode is mine " << *dn->inode->dir << endl; - dn->inode->dir->set_dir_auth( CDIR_AUTH_PARENT ); - cache->imports.erase(dn->inode->dir); - dn->inode->dir->put(CDir::PIN_IMPORT); - dn->inode->dir->state_clear(CDIR_STATE_IMPORT); - - // move nested under hashdir - for (set::iterator it = cache->nested_exports[dn->inode->dir].begin(); - it != cache->nested_exports[dn->inode->dir].end(); - it++) - cache->nested_exports[dir].insert(*it); - cache->nested_exports.erase(dn->inode->dir); - - // now it matches the inode - dn->inode->dir->set_dir_auth( CDIR_AUTH_PARENT ); - } - else { - // not mine. make it an export. - dout(7) << "making subdir into export " << *dn->inode->dir << endl; - dn->inode->dir->get(CDir::PIN_EXPORT); - dn->inode->dir->state_set(CDIR_STATE_EXPORT); - cache->exports.insert(dn->inode->dir); - cache->nested_exports[dir].insert(dn->inode->dir); - - if (dn->inode->dir->get_dir_auth() == CDIR_AUTH_PARENT) - dn->inode->dir->set_dir_auth( oldauth ); // no longer matches inode - assert(dn->inode->dir->get_dir_auth() >= 0); - } - } - } - - // mark dentry dirty? (only _after_ we link the inode!) - dn->_mark_dirty(); // fixme - } -} - -/* - - notes on interaction of hashing and export/import: - - - dir->is_auth() is completely independent of hashing. for a hashed dir, - - all nodes are partially authoritative - - all nodes dir->is_hashed() == true - - all nodes dir->inode->dir_is_hashed() == true - - one node dir->is_auth() == true, the rest == false - - dir_auth for all subdirs in a hashed dir will (likely?) be explicit. - - - remember simple rule: dir auth follows inode, unless dir_auth is explicit. - - - export_dir_walk and decode_import_dir take care with dir_auth: (for import/export) - - on export, -1 is changed to mds->get_nodeid() - - on import, nothing special, actually. 
- - - hashed dir files aren't included in export; subdirs are converted to imports - or exports as necessary. - - hashed dir subdirs are discovered on export. this is important - because dirs are needed to tie together auth hierarchy, for auth to know about - imports/exports, etc. - - - dir state is maintained on auth. - - COMPLETE and HASHED are transfered to importers. - - DIRTY is set everywhere. - - - hashed dir is like an import: hashed dir used for nested_exports map. - - nested_exports is updated appropriately on auth and replicas. - - a subtree terminates as a hashed dir, since the hashing explicitly - redelegates all inodes. thus export_dir_walk includes hashed dirs, but - not their inodes. -*/ - -// HASH on auth - -class C_MDC_HashFreeze : public Context { -public: - Migrator *mig; - CDir *dir; - C_MDC_HashFreeze(Migrator *m, CDir *d) : mig(m), dir(d) {} - virtual void finish(int r) { - mig->hash_dir_frozen(dir); - } -}; - -class C_MDC_HashComplete : public Context { -public: - Migrator *mig; - CDir *dir; - C_MDC_HashComplete(Migrator *mig, CDir *dir) { - this->mig = mig; - this->dir = dir; - } - virtual void finish(int r) { - mig->hash_dir_complete(dir); - } -}; - - -/** hash_dir(dir) - * start hashing a directory. - */ -void Migrator::hash_dir(CDir *dir) -{ - dout(-7) << "hash_dir " << *dir << endl; - - assert(!dir->is_hashed()); - assert(dir->is_auth()); - - if (dir->is_frozen() || - dir->is_freezing()) { - dout(7) << " can't hash, freezing|frozen." << endl; - return; - } - - // pin path? - vector trace; - cache->make_trace(trace, dir->inode); - if (!cache->path_pin(trace, 0, 0)) { - dout(7) << "hash_dir couldn't pin path, failing." 
<< endl; - return; - } - - // ok, go - dir->state_set(CDIR_STATE_HASHING); - dir->get(CDir::PIN_HASHING); - assert(dir->hashed_subset.empty()); - - // discover on all mds - assert(hash_gather.count(dir) == 0); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; // except me - hash_gather[dir].insert(i); - mds->send_message_mds(new MHashDirDiscover(dir->inode), i, MDS_PORT_MIGRATOR); - } - dir->auth_pin(); // pin until discovers are all acked. - - // start freeze - dir->freeze_dir(new C_MDC_HashFreeze(this, dir)); - - // make complete - if (!dir->is_complete()) { - dout(7) << "hash_dir " << *dir << " not complete, fetching" << endl; - mds->mdstore->fetch_dir(dir, - new C_MDC_HashComplete(this, dir)); - } else - hash_dir_complete(dir); -} - - -/* - * wait for everybody to discover and open the hashing dir - * then auth_unpin, to let the freeze happen - */ -void Migrator::handle_hash_dir_discover_ack(MHashDirDiscoverAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - int from = m->get_source().num(); - assert(hash_gather[dir].count(from)); - hash_gather[dir].erase(from); - - if (hash_gather[dir].empty()) { - hash_gather.erase(dir); - dout(7) << "hash_dir_discover_ack " << *dir << ", releasing auth_pin" << endl; - dir->auth_unpin(); // unpin to allow freeze to complete - } else { - dout(7) << "hash_dir_discover_ack " << *dir << ", still waiting for " << hash_gather[dir] << endl; - } - - delete m; // done -} - - - -/* - * once the dir is completely in memory, - * mark all migrating inodes dirty (to pin in cache) - */ -void Migrator::hash_dir_complete(CDir *dir) -{ - dout(7) << "hash_dir_complete " << *dir << ", dirtying inodes" << endl; - - assert(!dir->is_hashed()); - assert(dir->is_auth()); - - // mark dirty to pin in cache - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CInode *in = it->second->inode; - in->_mark_dirty(); // fixme - } 
- - if (dir->is_frozen_dir()) - hash_dir_go(dir); -} - - -/* - * once the dir is frozen, - * make sure it's complete - * send the prep messages! - */ -void Migrator::hash_dir_frozen(CDir *dir) -{ - dout(7) << "hash_dir_frozen " << *dir << endl; - - assert(!dir->is_hashed()); - assert(dir->is_auth()); - assert(dir->is_frozen_dir()); - - if (!dir->is_complete()) { - dout(7) << "hash_dir_frozen !complete, waiting still on " << *dir << endl; - return; - } - - // send prep messages w/ export directories to open - vector msgs(mds->get_mds_map()->get_num_mds()); - - // check for subdirs - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - CInode *in = dn->inode; - - if (!in->is_dir()) continue; - if (!in->dir) continue; - - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); - if (dentryhashcode == mds->get_nodeid()) continue; - - // msg? - if (msgs[dentryhashcode] == 0) { - msgs[dentryhashcode] = new MHashDirPrep(dir->ino()); - } - msgs[dentryhashcode]->add_inode(it->first, in->replicate_to(dentryhashcode)); - } - - // send them! - assert(hash_gather[dir].empty()); - for (unsigned i=0; isend_message_mds(msgs[i], i, MDS_PORT_MIGRATOR); - hash_gather[dir].insert(i); - } - } - - if (hash_gather[dir].empty()) { - // no subdirs! continue! - hash_gather.erase(dir); - hash_dir_go(dir); - } else { - // wait! 
- } -} - -/* - * wait for peers to open all subdirs - */ -void Migrator::handle_hash_dir_prep_ack(MHashDirPrepAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - int from = m->get_source().num(); - - assert(hash_gather[dir].count(from) == 1); - hash_gather[dir].erase(from); - - if (hash_gather[dir].empty()) { - hash_gather.erase(dir); - dout(7) << "handle_hash_dir_prep_ack on " << *dir << ", last one" << endl; - hash_dir_go(dir); - } else { - dout(7) << "handle_hash_dir_prep_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl; - } - - delete m; -} - - -/* - * once the dir is frozen, - * make sure it's complete - * do the hashing! - */ -void Migrator::hash_dir_go(CDir *dir) -{ - dout(7) << "hash_dir_go " << *dir << endl; - - assert(!dir->is_hashed()); - assert(dir->is_auth()); - assert(dir->is_frozen_dir()); - - // get messages to other nodes ready - vector msgs(mds->get_mds_map()->get_num_mds()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - msgs[i] = new MHashDir(dir->ino()); - } - - // pick a hash seed. - dir->inode->inode.hash_seed = 1;//dir->ino(); - - // suck up all waiters - C_Contexts *fin = new C_Contexts; - list waiting; - dir->take_waiting(CDIR_WAIT_ANY, waiting); // all dir waiters - fin->take(waiting); - - // get containing import. might be me. - CDir *containing_import = cache->get_auth_container(dir); - assert(containing_import != dir || dir->is_import()); - - // divy up contents - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - CInode *in = dn->inode; - - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); - if (dentryhashcode == mds->get_nodeid()) { - continue; // still mine! 
- } - - bufferlist *bl = msgs[dentryhashcode]->get_state_ptr(); - assert(bl); - - // -- dentry - dout(7) << "hash_dir_go sending to " << dentryhashcode << " dn " << *dn << endl; - _encode(it->first, *bl); - - // null dentry? - if (dn->is_null()) { - bl->append("N", 1); // null dentry - assert(dn->is_sync()); - continue; - } - - if (dn->is_remote()) { - // remote link - bl->append("L", 1); // remote link - - inodeno_t ino = dn->get_remote_ino(); - bl->append((char*)&ino, sizeof(ino)); - continue; - } - - // primary link - // -- inode - bl->append("I", 1); // inode dentry - - encode_export_inode(in, *bl, dentryhashcode); // encode, and (update state for) export - msgs[dentryhashcode]->inc_nden(); - - if (dn->is_dirty()) - dn->mark_clean(); - - // add to proxy - hash_proxy_inos[dir].push_back(in); - in->state_set(CInode::STATE_PROXY); - in->get(CInode::PIN_PROXY); - - // fix up subdirs - if (in->dir) { - if (in->dir->is_auth()) { - // mine. make it into an import. - dout(7) << "making subdir into import " << *in->dir << endl; - in->dir->set_dir_auth( mds->get_nodeid() ); - cache->imports.insert(in->dir); - in->dir->get(CDir::PIN_IMPORT); - in->dir->state_set(CDIR_STATE_IMPORT); - - // fix nested bits - for (set::iterator it = cache->nested_exports[containing_import].begin(); - it != cache->nested_exports[containing_import].end(); ) { - CDir *ex = *it; - it++; - if (cache->get_auth_container(ex) == in->dir) { - dout(10) << "moving nested export " << *ex << endl; - cache->nested_exports[containing_import].erase(ex); - cache->nested_exports[in->dir].insert(ex); - } - } - } - else { - // not mine. 
- dout(7) << "un-exporting subdir that's being hashed away " << *in->dir << endl; - assert(in->dir->is_export()); - in->dir->put(CDir::PIN_EXPORT); - in->dir->state_clear(CDIR_STATE_EXPORT); - cache->exports.erase(in->dir); - cache->nested_exports[containing_import].erase(in->dir); - if (in->dir->authority() == dentryhashcode) - in->dir->set_dir_auth( CDIR_AUTH_PARENT ); - else - in->dir->set_dir_auth( in->dir->authority() ); - } - } - - // waiters - list waiters; - in->take_waiting(CINODE_WAIT_ANY, waiters); - fin->take(waiters); - } - - // dir state - dir->state_set(CDIR_STATE_HASHED); - dir->get(CDir::PIN_HASHED); - cache->hashdirs.insert(dir); - dir->mark_dirty(dir->pre_dirty()); // fixme - mds->mdlog->submit_entry(new EString("dirty dir fixme")); - - // inode state - if (dir->inode->is_auth()) { - dir->inode->_mark_dirty(); // fixme - mds->mdlog->submit_entry(new EString("hash dirty fixme")); - } - - // fix up nested_exports? - if (containing_import != dir) { - dout(7) << "moving nested exports under hashed dir" << endl; - for (set::iterator it = cache->nested_exports[containing_import].begin(); - it != cache->nested_exports[containing_import].end(); ) { - CDir *ex = *it; - it++; - if (cache->get_auth_container(ex) == dir) { - dout(7) << " moving nested export under hashed dir: " << *ex << endl; - cache->nested_exports[containing_import].erase(ex); - cache->nested_exports[dir].insert(ex); - } else { - dout(7) << " NOT moving nested export under hashed dir: " << *ex << endl; - } - } - } - - // send hash messages - assert(hash_gather[dir].empty()); - assert(hash_notify_gather[dir].empty()); - assert(dir->hashed_subset.empty()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - // all nodes hashed locally.. 
- dir->hashed_subset.insert(i); - - if (i == mds->get_nodeid()) continue; - - // init hash_gather and hash_notify_gather sets - hash_gather[dir].insert(i); - - assert(hash_notify_gather[dir][i].empty()); - for (int j=0; jget_mds_map()->get_num_mds(); j++) { - if (j == mds->get_nodeid()) continue; - if (j == i) continue; - hash_notify_gather[dir][i].insert(j); - } - - mds->send_message_mds(msgs[i], i, MDS_PORT_MIGRATOR); - } - - // wait for all the acks. -} - - -void Migrator::handle_hash_dir_ack(MHashDirAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - assert(dir->is_hashed()); - assert(dir->is_hashing()); - - int from = m->get_source().num(); - assert(hash_gather[dir].count(from) == 1); - hash_gather[dir].erase(from); - - if (hash_gather[dir].empty()) { - dout(7) << "handle_hash_dir_ack on " << *dir << ", last one" << endl; - - if (hash_notify_gather[dir].empty()) { - dout(7) << "got notifies too, all done" << endl; - hash_dir_finish(dir); - } else { - dout(7) << "waiting on notifies " << endl; - } - - } else { - dout(7) << "handle_hash_dir_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl; - } - - delete m; -} - - -void Migrator::hash_dir_finish(CDir *dir) -{ - dout(7) << "hash_dir_finish finishing " << *dir << endl; - assert(dir->is_hashed()); - assert(dir->is_hashing()); - - // dir state - hash_gather.erase(dir); - dir->state_clear(CDIR_STATE_HASHING); - dir->put(CDir::PIN_HASHING); - dir->hashed_subset.clear(); - - // unproxy inodes - // this _could_ happen sooner, on a per-peer basis, but no harm in waiting a few more seconds. 
- for (list::iterator it = hash_proxy_inos[dir].begin(); - it != hash_proxy_inos[dir].end(); - it++) { - CInode *in = *it; - assert(in->state_test(CInode::STATE_PROXY)); - in->state_clear(CInode::STATE_PROXY); - in->put(CInode::PIN_PROXY); - } - hash_proxy_inos.erase(dir); - - // unpin path - vector trace; - cache->make_trace(trace, dir->inode); - cache->path_unpin(trace, 0); - - // unfreeze - dir->unfreeze_dir(); - - show_imports(); - assert(hash_gather.count(dir) == 0); - - // stats - //if (mds->logger) mds->logger->inc("nh", 1); - -} - - - - -// HASH on auth and non-auth - -void Migrator::handle_hash_dir_notify(MHashDirNotify *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - assert(dir->is_hashing()); - - dout(5) << "handle_hash_dir_notify " << *dir << endl; - int from = m->get_from(); - - int source = m->get_source().num(); - if (dir->is_auth()) { - // gather notifies - assert(dir->is_hashed()); - - assert( hash_notify_gather[dir][from].count(source) ); - hash_notify_gather[dir][from].erase(source); - - if (hash_notify_gather[dir][from].empty()) { - dout(7) << "last notify from " << from << endl; - hash_notify_gather[dir].erase(from); - - if (hash_notify_gather[dir].empty()) { - dout(7) << "last notify!" 
<< endl; - hash_notify_gather.erase(dir); - - if (hash_gather[dir].empty()) { - dout(7) << "got acks too, all done" << endl; - hash_dir_finish(dir); - } else { - dout(7) << "still waiting on acks from " << hash_gather[dir] << endl; - } - } else { - dout(7) << "still waiting for notify gathers from " << hash_notify_gather[dir].size() << " others" << endl; - } - } else { - dout(7) << "still waiting for notifies from " << from << " via " << hash_notify_gather[dir][from] << endl; - } - - // delete msg - delete m; - } else { - // update dir hashed_subset - assert(dir->hashed_subset.count(from) == 0); - dir->hashed_subset.insert(from); - - // update open subdirs - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - CInode *in = dn->get_inode(); - if (!in) continue; - if (!in->dir) continue; - - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); - if (dentryhashcode != from) continue; // we'll import these in a minute - - if (in->dir->authority() != dentryhashcode) - in->dir->set_dir_auth( in->dir->authority() ); - else - in->dir->set_dir_auth( CDIR_AUTH_PARENT ); - } - - // remove from notify gather set - assert(hash_gather[dir].count(from)); - hash_gather[dir].erase(from); - - // last notify? 
- if (hash_gather[dir].empty()) { - dout(7) << "gathered all the notifies, finishing hash of " << *dir << endl; - hash_gather.erase(dir); - - dir->state_clear(CDIR_STATE_HASHING); - dir->put(CDir::PIN_HASHING); - dir->hashed_subset.clear(); - } else { - dout(7) << "still waiting for notify from " << hash_gather[dir] << endl; - } - - // fw notify to auth - mds->send_message_mds(m, dir->authority(), MDS_PORT_MIGRATOR); - } -} - - - - -// HASH on non-auth - -/* - * discover step: - * each peer needs to open up the directory and pin it before we start - */ -class C_MDC_HashDirDiscover : public Context { - Migrator *mig; - MHashDirDiscover *m; -public: - vector trace; - C_MDC_HashDirDiscover(Migrator *mig, MHashDirDiscover *m) { - this->mig = mig; - this->m = m; - } - void finish(int r) { - CInode *in = 0; - if (r >= 0) { - if (trace.size()) - in = trace[trace.size()-1]->get_inode(); - else - in = mig->cache->get_root(); - } - mig->handle_hash_dir_discover_2(m, in, r); - } -}; - -void Migrator::handle_hash_dir_discover(MHashDirDiscover *m) -{ - assert(m->get_source().num() != mds->get_nodeid()); - - dout(7) << "handle_hash_dir_discover on " << m->get_path() << endl; - - // must discover it! - C_MDC_HashDirDiscover *onfinish = new C_MDC_HashDirDiscover(this, m); - filepath fpath(m->get_path()); - cache->path_traverse(fpath, onfinish->trace, true, - m, new C_MDS_RetryMessage(mds,m), // on delay/retry - MDS_TRAVERSE_DISCOVER, - onfinish); // on completion|error -} - -void Migrator::handle_hash_dir_discover_2(MHashDirDiscover *m, CInode *in, int r) -{ - // yay! - if (in) { - dout(7) << "handle_hash_dir_discover_2 has " << *in << endl; - } - - if (r < 0 || !in->is_dir()) { - dout(7) << "handle_hash_dir_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << endl; - assert(0); // this shouldn't happen if the auth pins his path properly!!!! - } - assert(in->is_dir()); - - // is dir open? 
- if (!in->dir) { - dout(7) << "handle_hash_dir_discover_2 opening dir " << *in << endl; - cache->open_remote_dir(in, - new C_MDS_RetryMessage(mds, m)); - return; - } - CDir *dir = in->dir; - - // pin dir, set hashing flag - dir->state_set(CDIR_STATE_HASHING); - dir->get(CDir::PIN_HASHING); - assert(dir->hashed_subset.empty()); - - // inode state - dir->inode->inode.hash_seed = 1;// dir->ino(); - if (dir->inode->is_auth()) { - dir->inode->_mark_dirty(); // fixme - mds->mdlog->submit_entry(new EString("hash dirty fixme")); - } - - // get gather set ready for notifies - assert(hash_gather[dir].empty()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - if (i == dir->authority()) continue; - hash_gather[dir].insert(i); - } - - // reply - dout(7) << " sending hash_dir_discover_ack on " << *dir << endl; - mds->send_message_mds(new MHashDirDiscoverAck(dir->ino()), - m->get_source().num(), MDS_PORT_MIGRATOR); - delete m; -} - -/* - * prep step: - * peers need to open up all subdirs of the hashed dir - */ - -void Migrator::handle_hash_dir_prep(MHashDirPrep *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_hash_dir_prep " << *dir << endl; - - if (!m->did_assim()) { - m->mark_assim(); // only do this the first time! - - // assimilate dentry+inodes for exports - for (map::iterator it = m->get_inodes().begin(); - it != m->get_inodes().end(); - it++) { - CInode *in = cache->get_inode( it->second->get_ino() ); - if (in) { - it->second->update_inode(in); - dout(5) << " updated " << *in << endl; - } else { - in = new CInode(mds->mdcache, false); - it->second->update_inode(in); - cache->add_inode(in); - - // link - dir->add_dentry( it->first, in ); - dout(5) << " added " << *in << endl; - } - - // open! 
- if (!in->dir) { - dout(5) << " opening nested export on " << *in << endl; - cache->open_remote_dir(in, - new C_MDS_RetryMessage(mds, m)); - } - } - } - - // verify! - int waiting_for = 0; - for (map::iterator it = m->get_inodes().begin(); - it != m->get_inodes().end(); - it++) { - CInode *in = cache->get_inode( it->second->get_ino() ); - assert(in); - - if (in->dir) { - if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) { - dout(5) << " pinning nested export " << *in->dir << endl; - in->dir->get(CDir::PIN_IMPORTINGEXPORT); - in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT); - } else { - dout(5) << " already pinned nested export " << *in << endl; - } - } else { - dout(5) << " waiting for nested export dir on " << *in << endl; - waiting_for++; - } - } - - if (waiting_for) { - dout(5) << "waiting for " << waiting_for << " dirs to open" << endl; - return; - } - - // ack! - mds->send_message_mds(new MHashDirPrepAck(dir->ino()), - m->get_source().num(), MDS_PORT_MIGRATOR); - - // done. - delete m; -} - - -/* - * hash step: - */ - -void Migrator::handle_hash_dir(MHashDir *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - assert(!dir->is_auth()); - assert(!dir->is_hashed()); - assert(dir->is_hashing()); - - dout(5) << "handle_hash_dir " << *dir << endl; - int oldauth = m->get_source().num(); - - // content - import_hashed_content(dir, m->get_state(), m->get_nden(), oldauth); - - // dir state - dir->state_set(CDIR_STATE_HASHED); - dir->get(CDir::PIN_HASHED); - cache->hashdirs.insert(dir); - dir->hashed_subset.insert(mds->get_nodeid()); - - // dir is complete - dir->mark_complete(); - dir->mark_dirty(dir->pre_dirty()); // fixme - mds->mdlog->submit_entry(new EString("dirty dir fixme")); - - // commit - mds->mdstore->commit_dir(dir, 0); - - // send notifies - dout(7) << "sending notifies" << endl; - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - if (i == 
m->get_source().num()) continue; - mds->send_message_mds(new MHashDirNotify(dir->ino(), mds->get_nodeid()), - i, MDS_PORT_MIGRATOR); - } - - // ack - dout(7) << "acking" << endl; - mds->send_message_mds(new MHashDirAck(dir->ino()), - m->get_source().num(), MDS_PORT_MIGRATOR); - - // done. - delete m; - - show_imports(); -} - - - - - -// UNHASH on auth - -class C_MDC_UnhashFreeze : public Context { -public: - Migrator *mig; - CDir *dir; - C_MDC_UnhashFreeze(Migrator *m, CDir *d) : mig(m), dir(d) {} - virtual void finish(int r) { - mig->unhash_dir_frozen(dir); - } -}; - -class C_MDC_UnhashComplete : public Context { -public: - Migrator *mig; - CDir *dir; - C_MDC_UnhashComplete(Migrator *m, CDir *d) : mig(m), dir(d) {} - virtual void finish(int r) { - mig->unhash_dir_complete(dir); - } -}; - - -void Migrator::unhash_dir(CDir *dir) -{ - dout(-7) << "unhash_dir " << *dir << endl; - - assert(dir->is_hashed()); - assert(!dir->is_unhashing()); - assert(dir->is_auth()); - assert(hash_gather.count(dir)==0); - - // pin path? - vector trace; - cache->make_trace(trace, dir->inode); - if (!cache->path_pin(trace, 0, 0)) { - dout(7) << "unhash_dir couldn't pin path, failing." << endl; - return; - } - - // twiddle state - dir->state_set(CDIR_STATE_UNHASHING); - - // first, freeze the dir. 
- dir->freeze_dir(new C_MDC_UnhashFreeze(this, dir)); - - // make complete - if (!dir->is_complete()) { - dout(7) << "unhash_dir " << *dir << " not complete, fetching" << endl; - mds->mdstore->fetch_dir(dir, - new C_MDC_UnhashComplete(this, dir)); - } else - unhash_dir_complete(dir); - -} - -void Migrator::unhash_dir_frozen(CDir *dir) -{ - dout(7) << "unhash_dir_frozen " << *dir << endl; - - assert(dir->is_hashed()); - assert(dir->is_auth()); - assert(dir->is_frozen_dir()); - - if (!dir->is_complete()) { - dout(7) << "unhash_dir_frozen !complete, waiting still on " << *dir << endl; - } else - unhash_dir_prep(dir); -} - - -/* - * ask peers to freeze and complete hashed dir - */ -void Migrator::unhash_dir_prep(CDir *dir) -{ - dout(7) << "unhash_dir_prep " << *dir << endl; - assert(dir->is_hashed()); - assert(dir->is_auth()); - assert(dir->is_frozen_dir()); - assert(dir->is_complete()); - - if (!hash_gather[dir].empty()) return; // already been here..freeze must have been instantaneous - - // send unhash prep to all peers - assert(hash_gather[dir].empty()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - hash_gather[dir].insert(i); - mds->send_message_mds(new MUnhashDirPrep(dir->ino()), - i, MDS_PORT_MIGRATOR); - } -} - -/* - * wait for peers to freeze and complete hashed dirs - */ -void Migrator::handle_unhash_dir_prep_ack(MUnhashDirPrepAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - int from = m->get_source().num(); - dout(7) << "handle_unhash_dir_prep_ack from " << from << " " << *dir << endl; - - if (!m->did_assim()) { - m->mark_assim(); // only do this the first time! 
- - // assimilate dentry+inodes for exports - for (map::iterator it = m->get_inodes().begin(); - it != m->get_inodes().end(); - it++) { - CInode *in = cache->get_inode( it->second->get_ino() ); - if (in) { - it->second->update_inode(in); - dout(5) << " updated " << *in << endl; - } else { - in = new CInode(mds->mdcache, false); - it->second->update_inode(in); - cache->add_inode(in); - - // link - dir->add_dentry( it->first, in ); - dout(5) << " added " << *in << endl; - } - - // open! - if (!in->dir) { - dout(5) << " opening nested export on " << *in << endl; - cache->open_remote_dir(in, - new C_MDS_RetryMessage(mds, m)); - } - } - } - - // verify! - int waiting_for = 0; - for (map::iterator it = m->get_inodes().begin(); - it != m->get_inodes().end(); - it++) { - CInode *in = cache->get_inode( it->second->get_ino() ); - assert(in); - - if (in->dir) { - if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) { - dout(5) << " pinning nested export " << *in->dir << endl; - in->dir->get(CDir::PIN_IMPORTINGEXPORT); - in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT); - } else { - dout(5) << " already pinned nested export " << *in << endl; - } - } else { - dout(5) << " waiting for nested export dir on " << *in << endl; - waiting_for++; - } - } - - if (waiting_for) { - dout(5) << "waiting for " << waiting_for << " dirs to open" << endl; - return; - } - - // ok, done with this PrepAck - assert(hash_gather[dir].count(from) == 1); - hash_gather[dir].erase(from); - - if (hash_gather[dir].empty()) { - hash_gather.erase(dir); - dout(7) << "handle_unhash_dir_prep_ack on " << *dir << ", last one" << endl; - unhash_dir_go(dir); - } else { - dout(7) << "handle_unhash_dir_prep_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl; - } - - delete m; -} - - -/* - * auth: - * send out MHashDir's to peers - */ -void Migrator::unhash_dir_go(CDir *dir) -{ - dout(7) << "unhash_dir_go " << *dir << endl; - assert(dir->is_hashed()); - assert(dir->is_auth()); - 
assert(dir->is_frozen_dir()); - assert(dir->is_complete()); - - // send unhash prep to all peers - assert(hash_gather[dir].empty()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - hash_gather[dir].insert(i); - mds->send_message_mds(new MUnhashDir(dir->ino()), - i, MDS_PORT_MIGRATOR); - } -} - -/* - * auth: - * assimilate unhashing content - */ -void Migrator::handle_unhash_dir_ack(MUnhashDirAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_unhash_dir_ack " << *dir << endl; - assert(dir->is_hashed()); - - // assimilate content - int from = m->get_source().num(); - import_hashed_content(dir, m->get_state(), m->get_nden(), from); - delete m; - - // done? - assert(hash_gather[dir].count(from)); - hash_gather[dir].erase(from); - - if (!hash_gather[dir].empty()) { - dout(7) << "still waiting for unhash acks from " << hash_gather[dir] << endl; - return; - } - - // done! - - // fix up nested_exports - CDir *containing_import = cache->get_auth_container(dir); - if (containing_import != dir) { - for (set::iterator it = cache->nested_exports[dir].begin(); - it != cache->nested_exports[dir].end(); - it++) { - dout(7) << "moving nested export out from under hashed dir : " << **it << endl; - cache->nested_exports[containing_import].insert(*it); - } - cache->nested_exports.erase(dir); - } - - // dir state - //dir->state_clear(CDIR_STATE_UNHASHING); //later - dir->state_clear(CDIR_STATE_HASHED); - dir->put(CDir::PIN_HASHED); - cache->hashdirs.erase(dir); - - // commit! 
- assert(dir->is_complete()); - //dir->mark_complete(); - dir->mark_dirty(dir->pre_dirty()); // fixme - mds->mdstore->commit_dir(dir, 0); - - // inode state - dir->inode->inode.hash_seed = 0; - if (dir->inode->is_auth()) { - dir->inode->_mark_dirty(); // fixme - mds->mdlog->submit_entry(new EString("hash inode dirty fixme")); - } - - // notify - assert(hash_gather[dir].empty()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - - hash_gather[dir].insert(i); - - mds->send_message_mds(new MUnhashDirNotify(dir->ino()), - i, MDS_PORT_MIGRATOR); - } -} - - -/* - * sent by peer to flush mds links. unfreeze when all gathered. - */ -void Migrator::handle_unhash_dir_notify_ack(MUnhashDirNotifyAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_unhash_dir_ack " << *dir << endl; - assert(!dir->is_hashed()); - assert(dir->is_unhashing()); - assert(dir->is_frozen_dir()); - - // done? - int from = m->get_source().num(); - assert(hash_gather[dir].count(from)); - hash_gather[dir].erase(from); - delete m; - - if (!hash_gather[dir].empty()) { - dout(7) << "still waiting for notifyack from " << hash_gather[dir] << " on " << *dir << endl; - } else { - unhash_dir_finish(dir); - } -} - - -/* - * all mds links are flushed. unfreeze dir! - */ -void Migrator::unhash_dir_finish(CDir *dir) -{ - dout(7) << "unhash_dir_finish " << *dir << endl; - hash_gather.erase(dir); - - // unpin path - vector trace; - cache->make_trace(trace, dir->inode); - cache->path_unpin(trace, 0); - - // state - dir->state_clear(CDIR_STATE_UNHASHING); - - // unfreeze - dir->unfreeze_dir(); - -} - - - -// UNHASH on all - -/* - * hashed dir is complete. 
- * mark all migrating inodes dirty (to pin in cache) - * if frozen too, then go to next step (depending on auth) - */ -void Migrator::unhash_dir_complete(CDir *dir) -{ - dout(7) << "unhash_dir_complete " << *dir << ", dirtying inodes" << endl; - - assert(dir->is_hashed()); - assert(dir->is_complete()); - - // mark dirty to pin in cache - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CInode *in = it->second->inode; - if (in->is_auth()) { - in->_mark_dirty(); // fixme - mds->mdlog->submit_entry(new EString("unhash dirty fixme")); - } - } - - if (!dir->is_frozen_dir()) { - dout(7) << "dir complete but !frozen, waiting " << *dir << endl; - } else { - if (dir->is_auth()) - unhash_dir_prep(dir); // auth - else - unhash_dir_prep_finish(dir); // nonauth - } -} - - -// UNHASH on non-auth - -class C_MDC_UnhashPrepFreeze : public Context { -public: - Migrator *mig; - CDir *dir; - C_MDC_UnhashPrepFreeze(Migrator *m, CDir *d) : mig(m), dir(d) {} - virtual void finish(int r) { - mig->unhash_dir_prep_frozen(dir); - } -}; - - -/* - * peers need to freeze their dir and make them complete - */ -void Migrator::handle_unhash_dir_prep(MUnhashDirPrep *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_unhash_dir_prep " << *dir << endl; - assert(dir->is_hashed()); - - // freeze - dir->freeze_dir(new C_MDC_UnhashPrepFreeze(this, dir)); - - // make complete - if (!dir->is_complete()) { - dout(7) << "unhash_dir " << *dir << " not complete, fetching" << endl; - mds->mdstore->fetch_dir(dir, - new C_MDC_UnhashComplete(this, dir)); - } else { - unhash_dir_complete(dir); - } - - delete m; -} - -/* - * peer has hashed dir frozen. - * complete too? 
- */ -void Migrator::unhash_dir_prep_frozen(CDir *dir) -{ - dout(7) << "unhash_dir_prep_frozen " << *dir << endl; - - assert(dir->is_hashed()); - assert(dir->is_frozen_dir()); - assert(!dir->is_auth()); - - if (!dir->is_complete()) { - dout(7) << "unhash_dir_prep_frozen !complete, waiting still on " << *dir << endl; - } else - unhash_dir_prep_finish(dir); -} - -/* - * peer has hashed dir complete and frozen. ack. - */ -void Migrator::unhash_dir_prep_finish(CDir *dir) -{ - dout(7) << "unhash_dir_prep_finish " << *dir << endl; - assert(dir->is_hashed()); - assert(!dir->is_auth()); - assert(dir->is_frozen()); - assert(dir->is_complete()); - - // twiddle state - if (dir->is_unhashing()) - return; // already replied. - dir->state_set(CDIR_STATE_UNHASHING); - - // send subdirs back to auth - MUnhashDirPrepAck *ack = new MUnhashDirPrepAck(dir->ino()); - int auth = dir->authority(); - - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - CInode *in = dn->inode; - - if (!in->is_dir()) continue; - if (!in->dir) continue; - - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); - if (dentryhashcode != mds->get_nodeid()) continue; - - // msg? - ack->add_inode(it->first, in->replicate_to(auth)); - } - - // ack - mds->send_message_mds(ack, auth, MDS_PORT_MIGRATOR); -} - - - -/* - * peer needs to send hashed dir content back to auth. - * unhash dir. - */ -void Migrator::handle_unhash_dir(MUnhashDir *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_unhash_dir " << *dir << endl;//" .. 
hash_seed is " << dir->inode->inode.hash_seed << endl; - assert(dir->is_hashed()); - assert(dir->is_unhashing()); - assert(!dir->is_auth()); - - // get message ready - bufferlist bl; - int nden = 0; - - // suck up all waiters - C_Contexts *fin = new C_Contexts; - list waiting; - dir->take_waiting(CDIR_WAIT_ANY, waiting); // all dir waiters - fin->take(waiting); - - // divy up contents - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - CInode *in = dn->inode; - - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); - if (dentryhashcode != mds->get_nodeid()) { - // not mine! - // twiddle dir_auth? - if (in->dir) { - if (in->dir->authority() != dir->authority()) - in->dir->set_dir_auth( in->dir->authority() ); - else - in->dir->set_dir_auth( CDIR_AUTH_PARENT ); - } - continue; - } - - // -- dentry - dout(7) << "unhash_dir_go sending to " << dentryhashcode << " dn " << *dn << endl; - _encode(it->first, bl); - - // null dentry? - if (dn->is_null()) { - bl.append("N", 1); // null dentry - assert(dn->is_sync()); - continue; - } - - if (dn->is_remote()) { - // remote link - bl.append("L", 1); // remote link - - inodeno_t ino = dn->get_remote_ino(); - bl.append((char*)&ino, sizeof(ino)); - continue; - } - - // primary link - // -- inode - bl.append("I", 1); // inode dentry - - encode_export_inode(in, bl, dentryhashcode); // encode, and (update state for) export - nden++; - - if (dn->is_dirty()) - dn->mark_clean(); - - // proxy - in->state_set(CInode::STATE_PROXY); - in->get(CInode::PIN_PROXY); - hash_proxy_inos[dir].push_back(in); - - if (in->dir) { - if (in->dir->is_auth()) { - // mine. make it into an import. - dout(7) << "making subdir into import " << *in->dir << endl; - in->dir->set_dir_auth( mds->get_nodeid() ); - cache->imports.insert(in->dir); - in->dir->get(CDir::PIN_IMPORT); - in->dir->state_set(CDIR_STATE_IMPORT); - } - else { - // not mine. 
- dout(7) << "un-exporting subdir that's being unhashed away " << *in->dir << endl; - assert(in->dir->is_export()); - in->dir->put(CDir::PIN_EXPORT); - in->dir->state_clear(CDIR_STATE_EXPORT); - cache->exports.erase(in->dir); - cache->nested_exports[dir].erase(in->dir); - } - } - - // waiters - list waiters; - in->take_waiting(CINODE_WAIT_ANY, waiters); - fin->take(waiters); - } - - // we should have no nested exports; we're not auth for the dir! - assert(cache->nested_exports[dir].empty()); - cache->nested_exports.erase(dir); - - // dir state - //dir->state_clear(CDIR_STATE_UNHASHING); // later - dir->state_clear(CDIR_STATE_HASHED); - dir->put(CDir::PIN_HASHED); - cache->hashdirs.erase(dir); - dir->mark_clean(); - - // inode state - dir->inode->inode.hash_seed = 0; - if (dir->inode->is_auth()) { - dir->inode->_mark_dirty(); // fixme - mds->mdlog->submit_entry(new EString("unhash inode dirty fixme")); - } - - // init gather set - mds->get_mds_map()->get_active_mds_set( hash_gather[dir] ); - hash_gather[dir].erase(mds->get_nodeid()); - - // send unhash message - mds->send_message_mds(new MUnhashDirAck(dir->ino(), bl, nden), - dir->authority(), MDS_PORT_MIGRATOR); -} - - -/* - * first notify comes from auth. - * send notifies to all other peers, with peer = self - * if we get notify from peer=other, remove from our gather list. - * when we've gotten notifies from everyone, - * unpin proxies, - * send notify_ack to auth. - * this ensures that all mds links are flushed of cache_expire type messages. - */ -void Migrator::handle_unhash_dir_notify(MUnhashDirNotify *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_unhash_dir_finish " << *dir << endl; - assert(!dir->is_hashed()); - assert(dir->is_unhashing()); - assert(!dir->is_auth()); - - int from = m->get_source().num(); - assert(hash_gather[dir].count(from) == 1); - hash_gather[dir].erase(from); - delete m; - - // did we send our shout out? 
- if (from == dir->authority()) { - // send notify to everyone else in weird chatter storm - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == from) continue; - if (i == mds->get_nodeid()) continue; - mds->send_message_mds(new MUnhashDirNotify(dir->ino()), i, MDS_PORT_MIGRATOR); - } - } - - // are we done? - if (!hash_gather[dir].empty()) { - dout(7) << "still waiting for notify from " << hash_gather[dir] << endl; - return; - } - hash_gather.erase(dir); - - // all done! - dout(7) << "all mds links flushed, unpinning unhash proxies" << endl; - - // unpin proxies - for (list::iterator it = hash_proxy_inos[dir].begin(); - it != hash_proxy_inos[dir].end(); - it++) { - CInode *in = *it; - assert(in->state_test(CInode::STATE_PROXY)); - in->state_clear(CInode::STATE_PROXY); - in->put(CInode::PIN_PROXY); - } - - // unfreeze - dir->unfreeze_dir(); - - // ack - dout(7) << "sending notify_ack to auth for unhash of " << *dir << endl; - mds->send_message_mds(new MUnhashDirNotifyAck(dir->ino()), dir->authority(), MDS_PORT_MIGRATOR); - -} - - - - -void Migrator::show_imports() -{ - mds->balancer->show_imports(); -} diff --git a/tags/20070517_before_mds_merge/mds/Migrator.h b/tags/20070517_before_mds_merge/mds/Migrator.h deleted file mode 100644 index dd2886008d163..0000000000000 --- a/tags/20070517_before_mds_merge/mds/Migrator.h +++ /dev/null @@ -1,265 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_MIGRATOR_H -#define __MDS_MIGRATOR_H - -#include "include/types.h" - -#include -#include -#include -using std::map; -using std::list; -using std::set; - - -class MDS; -class CDir; -class CInode; -class CDentry; - -class MExportDir; -class MExportDirDiscover; -class MExportDirDiscoverAck; -class MExportDirPrep; -class MExportDirPrepAck; -class MExportDirWarning; -class MExportDirNotify; -class MExportDirNotifyAck; -class MExportDirFinish; - -class MHashDirDiscover; -class MHashDirDiscoverAck; -class MHashDirPrep; -class MHashDirPrepAck; -class MHashDir; -class MHashDirAck; -class MHashDirNotify; - -class MUnhashDirPrep; -class MUnhashDirPrepAck; -class MUnhashDir; -class MUnhashDirAck; -class MUnhashDirNotify; -class MUnhashDirNotifyAck; - -class EImportStart; - -class Migrator { -private: - MDS *mds; - MDCache *cache; - - // -- exports -- - // export stages. used to clean up intelligently if there's a failure. - const static int EXPORT_DISCOVERING = 1; // dest is disovering export dir - const static int EXPORT_FREEZING = 2; // we're freezing the dir tree - const static int EXPORT_LOGGINGSTART = 3; // we're logging EExportStart - const static int EXPORT_PREPPING = 4; // sending dest spanning tree to export bounds - const static int EXPORT_EXPORTING = 5; // sent actual export, waiting for acks - const static int EXPORT_LOGGINGFINISH = 6; // logging EExportFinish - - // export fun - map export_state; - map export_peer; - map > export_bounds; - map > export_data; // only during EXPORTING state - map > export_notify_ack_waiting; // nodes i am waiting to get export_notify_ack's from - map > export_proxy_inos; - map > export_proxy_dirinos; - - map > export_finish_waiters; - - set stray_export_warnings; // notifies i haven't seen - map stray_export_notifies; - - - // -- imports -- - const static int IMPORT_DISCOVERED = 1; // waiting for prep - const static int IMPORT_PREPPING = 2; // opening dirs on bounds - const static int IMPORT_PREPPED = 3; // 
opened bounds, waiting for import - const static int IMPORT_LOGGINGSTART = 4; // got import, logging EImportStart - const static int IMPORT_ACKING = 5; // logged, sent acks - const static int IMPORT_LOGGINGFINISH = 6; - - map import_state; - map import_peer; - map > import_bounds; - - - // -- hashing madness -- - multimap unhash_waiting; // nodes i am waiting for UnhashDirAck's from - multimap import_hashed_replicate_waiting; // nodes i am waiting to discover to complete my import of a hashed dir - // maps frozen_dir_ino's to waiting-for-discover ino's. - multimap import_hashed_frozen_waiting; // dirs i froze (for the above) - - - -public: - // -- cons -- - Migrator(MDS *m, MDCache *c) : mds(m), cache(c) {} - - void dispatch(Message*); - - - // -- status -- - int is_exporting(CDir *dir) { - if (export_state.count(dir)) return export_state[dir]; - return 0; - } - bool is_exporting() { return !export_state.empty(); } - int is_importing(inodeno_t dirino) { - if (import_state.count(dirino)) return import_state[dirino]; - return 0; - } - bool is_importing() { return !import_state.empty(); } - const set& get_import_bounds(inodeno_t base) { - assert(import_bounds.count(base)); - return import_bounds[base]; - } - - - // -- misc -- - void handle_mds_failure(int who); - void show_imports(); - - - // -- import/export -- - // exporter - public: - void export_dir(CDir *dir, - int mds); - void export_empty_import(CDir *dir); - - void encode_export_inode(CInode *in, bufferlist& enc_state, int newauth); - void decode_import_inode(CDentry *dn, bufferlist& bl, int &off, int oldauth); - - void add_export_finish_waiter(CDir *dir, Context *c) { - export_finish_waiters[dir].push_back(c); - } - void clear_export_proxy_pins(CDir *dir); - - protected: - void handle_export_dir_discover_ack(MExportDirDiscoverAck *m); - void export_dir_frozen(CDir *dir, int dest); - void export_dir_frozen_logged(CDir *dir, MExportDirPrep *prep, int dest); - void handle_export_dir_prep_ack(MExportDirPrepAck 
*m); - void export_dir_go(CDir *dir, - int dest); - int encode_export_dir(list& dirstatelist, - class C_Contexts *fin, - CDir *basedir, - CDir *dir, - int newauth); - void handle_export_dir_notify_ack(MExportDirNotifyAck *m); - void reverse_export(CDir *dir); - void export_dir_acked(CDir *dir); - void export_dir_finish(CDir *dir); - - friend class C_MDC_ExportFreeze; - friend class C_MDC_ExportStartLogged; - friend class C_MDS_ExportFinishLogged; - // importer - void handle_export_dir_discover(MExportDirDiscover *m); - void handle_export_dir_discover_2(MExportDirDiscover *m, CInode *in, int r); - void handle_export_dir_prep(MExportDirPrep *m); - void handle_export_dir(MExportDir *m); - void import_dir_logged_start(CDir *dir, int from, - list &imported_subdirs, - list &exports); - void import_dir_logged_finish(CDir *dir); - void handle_export_dir_finish(MExportDirFinish *m); - int decode_import_dir(bufferlist& bl, - int oldauth, - CDir *import_root, - list& imported_subdirs, - EImportStart *le); - void got_hashed_replica(CDir *import, - inodeno_t dir_ino, - inodeno_t replica_ino); - - friend class C_MDC_ExportDirDiscover; - friend class C_MDS_ImportDirLoggedStart; - friend class C_MDS_ImportDirLoggedFinish; - - // bystander - void handle_export_dir_warning(MExportDirWarning *m); - void handle_export_dir_notify(MExportDirNotify *m); - - - // -- hashed directories -- - - // HASH - public: - void hash_dir(CDir *dir); // on auth - protected: - map< CDir*, set > hash_gather; - map< CDir*, map< int, set > > hash_notify_gather; - map< CDir*, list > hash_proxy_inos; - - // hash on auth - void handle_hash_dir_discover_ack(MHashDirDiscoverAck *m); - void hash_dir_complete(CDir *dir); - void hash_dir_frozen(CDir *dir); - void handle_hash_dir_prep_ack(MHashDirPrepAck *m); - void hash_dir_go(CDir *dir); - void handle_hash_dir_ack(MHashDirAck *m); - void hash_dir_finish(CDir *dir); - friend class C_MDC_HashFreeze; - friend class C_MDC_HashComplete; - - // auth and non-auth - void 
handle_hash_dir_notify(MHashDirNotify *m); - - // hash on non-auth - void handle_hash_dir_discover(MHashDirDiscover *m); - void handle_hash_dir_discover_2(MHashDirDiscover *m, CInode *in, int r); - void handle_hash_dir_prep(MHashDirPrep *m); - void handle_hash_dir(MHashDir *m); - friend class C_MDC_HashDirDiscover; - - // UNHASH - public: - void unhash_dir(CDir *dir); // on auth - protected: - map< CDir*, list > unhash_content; - void import_hashed_content(CDir *dir, bufferlist& bl, int nden, int oldauth); - - // unhash on auth - void unhash_dir_frozen(CDir *dir); - void unhash_dir_prep(CDir *dir); - void handle_unhash_dir_prep_ack(MUnhashDirPrepAck *m); - void unhash_dir_go(CDir *dir); - void handle_unhash_dir_ack(MUnhashDirAck *m); - void handle_unhash_dir_notify_ack(MUnhashDirNotifyAck *m); - void unhash_dir_finish(CDir *dir); - friend class C_MDC_UnhashFreeze; - friend class C_MDC_UnhashComplete; - - // unhash on all - void unhash_dir_complete(CDir *dir); - - // unhash on non-auth - void handle_unhash_dir_prep(MUnhashDirPrep *m); - void unhash_dir_prep_frozen(CDir *dir); - void unhash_dir_prep_finish(CDir *dir); - void handle_unhash_dir(MUnhashDir *m); - void handle_unhash_dir_notify(MUnhashDirNotify *m); - friend class C_MDC_UnhashPrepFreeze; - - -}; - - -#endif diff --git a/tags/20070517_before_mds_merge/mds/Renamer.cc b/tags/20070517_before_mds_merge/mds/Renamer.cc deleted file mode 100644 index cf7d79170f479..0000000000000 --- a/tags/20070517_before_mds_merge/mds/Renamer.cc +++ /dev/null @@ -1,918 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "MDCache.h" -#include "MDStore.h" -#include "CInode.h" -#include "CDir.h" -#include "MDS.h" -#include "MDSMap.h" -#include "MDLog.h" -#include "AnchorClient.h" -#include "Migrator.h" -#include "Renamer.h" - -#include "include/filepath.h" - -#include "msg/Message.h" -#include "msg/Messenger.h" - -#include "events/EString.h" -#include "events/EUnlink.h" - -#include "messages/MRenameWarning.h" -#include "messages/MRenameNotify.h" -#include "messages/MRenameNotifyAck.h" -#include "messages/MRename.h" -#include "messages/MRenameAck.h" -#include "messages/MRenameReq.h" -#include "messages/MRenamePrep.h" - - - -void Renamer::dispatch(Message *m) -{ - switch (m->get_type()) { - case MSG_MDS_RENAMEWARNING: - handle_rename_warning((MRenameWarning*)m); - break; - case MSG_MDS_RENAMENOTIFY: - handle_rename_notify((MRenameNotify*)m); - break; - case MSG_MDS_RENAMENOTIFYACK: - handle_rename_notify_ack((MRenameNotifyAck*)m); - break; - case MSG_MDS_RENAME: - handle_rename((MRename*)m); - break; - case MSG_MDS_RENAMEREQ: - handle_rename_req((MRenameReq*)m); - break; - case MSG_MDS_RENAMEPREP: - handle_rename_prep((MRenamePrep*)m); - break; - case MSG_MDS_RENAMEACK: - handle_rename_ack((MRenameAck*)m); - break; - - default: - assert(0); - } -} - - -// renaming! 
- - -/* - fix_renamed_dir(): - - caller has already: - - relinked inode in new location - - fixed in->is_auth() - - set dir_auth, if appropriate - - caller has not: - - touched in->dir - - updated import/export tables -*/ -void Renamer::fix_renamed_dir(CDir *srcdir, - CInode *in, - CDir *destdir, - bool authchanged, // _inode_ auth - int dir_auth) // dir auth (for certain cases) -{ - dout(7) << "fix_renamed_dir on " << *in << endl; - dout(7) << "fix_renamed_dir on " << *in->dir << endl; - - if (in->dir->is_auth()) { - // dir ours - dout(7) << "dir is auth" << endl; - assert(!in->dir->is_export()); - - if (in->is_auth()) { - // inode now ours - - if (authchanged) { - // inode _was_ replica, now ours - dout(7) << "inode was replica, now ours. removing from import list." << endl; - assert(in->dir->is_import()); - - // not import anymore! - cache->imports.erase(in->dir); - in->dir->state_clear(CDIR_STATE_IMPORT); - in->dir->put(CDir::PIN_IMPORT); - - in->dir->set_dir_auth( CDIR_AUTH_PARENT ); - dout(7) << " fixing dir_auth to be " << in->dir->get_dir_auth() << endl; - - // move my nested imports to in's containing import - CDir *con = cache->get_auth_container(in->dir); - assert(con); - for (set::iterator p = cache->nested_exports[in->dir].begin(); - p != cache->nested_exports[in->dir].end(); - p++) { - dout(7) << "moving nested export under new container " << *con << endl; - cache->nested_exports[con].insert(*p); - } - cache->nested_exports.erase(in->dir); - - } else { - // inode was ours, still ours. - dout(7) << "inode was ours, still ours." << endl; - assert(!in->dir->is_import()); - assert(in->dir->get_dir_auth() == CDIR_AUTH_PARENT); - - // move any exports nested beneath me? 
- CDir *newcon = cache->get_auth_container(in->dir); - assert(newcon); - CDir *oldcon = cache->get_auth_container(srcdir); - assert(oldcon); - if (newcon != oldcon) { - dout(7) << "moving nested exports under new container" << endl; - set nested; - cache->find_nested_exports_under(oldcon, in->dir, nested); - for (set::iterator it = nested.begin(); - it != nested.end(); - it++) { - dout(7) << "moving nested export " << *it << " under new container" << endl; - cache->nested_exports[oldcon].erase(*it); - cache->nested_exports[newcon].insert(*it); - } - } - } - - } else { - // inode now replica - - if (authchanged) { - // inode was ours, but now replica - dout(7) << "inode was ours, now replica. adding to import list." << endl; - - // i am now an import - cache->imports.insert(in->dir); - in->dir->state_set(CDIR_STATE_IMPORT); - in->dir->get(CDir::PIN_IMPORT); - - in->dir->set_dir_auth( mds->get_nodeid() ); - dout(7) << " fixing dir_auth to be " << in->dir->get_dir_auth() << endl; - - // find old import - CDir *oldcon = cache->get_auth_container(srcdir); - assert(oldcon); - dout(7) << " oldcon is " << *oldcon << endl; - - // move nested exports under me - set nested; - cache->find_nested_exports_under(oldcon, in->dir, nested); - for (set::iterator it = nested.begin(); - it != nested.end(); - it++) { - dout(7) << "moving nested export " << *it << " under me" << endl; - cache->nested_exports[oldcon].erase(*it); - cache->nested_exports[in->dir].insert(*it); - } - - } else { - // inode was replica, still replica - dout(7) << "inode was replica, still replica. doing nothing." << endl; - assert(in->dir->is_import()); - - // verify dir_auth - assert(in->dir->get_dir_auth() == mds->get_nodeid()); // me, because i'm auth for dir. - assert(in->authority() != in->dir->get_dir_auth()); // inode not me. 
- } - - assert(in->dir->is_import()); - } - - } else { - // dir is not ours - dout(7) << "dir is not auth" << endl; - - if (in->is_auth()) { - // inode now ours - - if (authchanged) { - // inode was replica, now ours - dout(7) << "inode was replica, now ours. now an export." << endl; - assert(!in->dir->is_export()); - - // now export - cache->exports.insert(in->dir); - in->dir->state_set(CDIR_STATE_EXPORT); - in->dir->get(CDir::PIN_EXPORT); - - assert(dir_auth >= 0); // better be defined - in->dir->set_dir_auth( dir_auth ); - dout(7) << " fixing dir_auth to be " << in->dir->get_dir_auth() << endl; - - CDir *newcon = cache->get_auth_container(in->dir); - assert(newcon); - cache->nested_exports[newcon].insert(in->dir); - - } else { - // inode was ours, still ours - dout(7) << "inode was ours, still ours. did my import change?" << endl; - - // sanity - assert(in->dir->is_export()); - assert(in->dir->get_dir_auth() >= 0); - assert(in->dir->get_dir_auth() != in->authority()); - - // moved under new import? - CDir *oldcon = cache->get_auth_container(srcdir); - CDir *newcon = cache->get_auth_container(in->dir); - if (oldcon != newcon) { - dout(7) << "moving myself under new import " << *newcon << endl; - cache->nested_exports[oldcon].erase(in->dir); - cache->nested_exports[newcon].insert(in->dir); - } - } - - assert(in->dir->is_export()); - } else { - // inode now replica - - if (authchanged) { - // inode was ours, now replica - dout(7) << "inode was ours, now replica. removing from export list." 
<< endl; - assert(in->dir->is_export()); - - // remove from export list - cache->exports.erase(in->dir); - in->dir->state_clear(CDIR_STATE_EXPORT); - in->dir->put(CDir::PIN_EXPORT); - - CDir *oldcon = cache->get_auth_container(srcdir); - assert(oldcon); - assert(cache->nested_exports[oldcon].count(in->dir) == 1); - cache->nested_exports[oldcon].erase(in->dir); - - // simplify dir_auth - if (in->authority() == in->dir->authority()) { - in->dir->set_dir_auth( CDIR_AUTH_PARENT ); - dout(7) << "simplified dir_auth to -1, inode auth is (also) " << in->authority() << endl; - } else { - assert(in->dir->get_dir_auth() >= 0); // someone else's export, - } - - } else { - // inode was replica, still replica - dout(7) << "inode was replica, still replica. do nothing." << endl; - - // fix dir_auth? - if (in->authority() == dir_auth) - in->dir->set_dir_auth( CDIR_AUTH_PARENT ); - else - in->dir->set_dir_auth( dir_auth ); - dout(7) << " fixing dir_auth to be " << dir_auth << endl; - - // do nothing. - } - - assert(!in->dir->is_export()); - } - } - - cache->show_imports(); -} - -/* - * when initiator gets an ack back for a foreign rename - */ - -class C_MDC_RenameNotifyAck : public Context { - Renamer *rn; - CInode *in; - int initiator; - -public: - C_MDC_RenameNotifyAck(Renamer *r, - CInode *i, int init) : rn(r), in(i), initiator(init) {} - void finish(int r) { - rn->file_rename_ack(in, initiator); - } -}; - - - -/************** initiator ****************/ - -/* - * when we get MRenameAck (and rename is done, notifies gone out+acked, etc.) 
- */ -class C_MDC_RenameAck : public Context { - Renamer *mdc; - CDir *srcdir; - CInode *in; - Context *c; -public: - C_MDC_RenameAck(Renamer *mdc, CDir *srcdir, CInode *in, Context *c) { - this->mdc = mdc; - this->srcdir = srcdir; - this->in = in; - this->c = c; - } - void finish(int r) { - mdc->file_rename_finish(srcdir, in, c); - } -}; - - -void Renamer::file_rename(CDentry *srcdn, CDentry *destdn, Context *onfinish) -{ - assert(srcdn->is_xlocked()); // by me - assert(destdn->is_xlocked()); // by me - - CDir *srcdir = srcdn->dir; - string srcname = srcdn->name; - - CDir *destdir = destdn->dir; - string destname = destdn->name; - - CInode *in = srcdn->inode; - //Message *req = srcdn->xlockedby; - - - // determine the players - int srcauth = srcdir->dentry_authority(srcdn->name); - int destauth = destdir->dentry_authority(destname); - - - // FOREIGN rename? - if (srcauth != mds->get_nodeid() || - destauth != mds->get_nodeid()) { - dout(7) << "foreign rename. srcauth " << srcauth << ", destauth " << destauth << ", isdir " << srcdn->inode->is_dir() << endl; - - string destpath; - destdn->make_path(destpath); - - if (destauth != mds->get_nodeid()) { - // make sure dest has dir open. - dout(7) << "file_rename i'm not dest auth. sending MRenamePrep to " << destauth << endl; - - // prep dest first, they must have the dir open! rest will follow. - string srcpath; - srcdn->make_path(srcpath); - - MRenamePrep *m = new MRenamePrep(mds->get_nodeid(), // i'm the initiator - srcdir->ino(), srcname, srcpath, - destdir->ino(), destname, destpath, - srcauth); // tell dest who src is (maybe even me) - mds->send_message_mds(m, destauth, MDS_PORT_CACHE); - - cache->show_imports(); - - } - - else if (srcauth != mds->get_nodeid()) { - if (destauth == mds->get_nodeid()) { - dout(7) << "file_rename dest auth, not src auth. sending MRenameReq" << endl; - } else { - dout(7) << "file_rename neither src auth nor dest auth. 
sending MRenameReq" << endl; - } - - // srcdn not important on destauth, just request - MRenameReq *m = new MRenameReq(mds->get_nodeid(), // i'm the initiator - srcdir->ino(), srcname, - destdir->ino(), destname, destpath, destauth); // tell src who dest is (they may not know) - mds->send_message_mds(m, srcauth, MDS_PORT_CACHE); - } - - else - assert(0); - - // set waiter on the inode (is this the best place?) - in->add_waiter(CINODE_WAIT_RENAMEACK, - new C_MDC_RenameAck(this, - srcdir, in, onfinish)); - return; - } - - // LOCAL rename! - assert(srcauth == mds->get_nodeid() && destauth == mds->get_nodeid()); - dout(7) << "file_rename src and dest auth, renaming locally (easy!)" << endl; - - // update our cache - if (destdn->inode && destdn->inode->is_dirty()) - destdn->inode->mark_clean(); - - cache->rename_file(srcdn, destdn); - - // update imports/exports? - if (in->is_dir() && in->dir) - fix_renamed_dir(srcdir, in, destdir, false); // auth didnt change - - // mark dentries dirty - srcdn->_mark_dirty(); // fixme - destdn->_mark_dirty(); // fixme - in->_mark_dirty(); // fixme - - - // local, restrict notify to ppl with open dirs - set notify; - for (map::iterator it = srcdir->replicas_begin(); - it != srcdir->replicas_end(); - ++it) - notify.insert(it->first); - for (map::iterator it = destdir->replicas_begin(); - it != destdir->replicas_end(); - it++) - if (notify.count(it->first) == 0) notify.insert(it->first); - - if (notify.size()) { - // warn + notify - file_rename_warn(in, notify); - file_rename_notify(in, srcdir, srcname, destdir, destname, notify, mds->get_nodeid()); - - // wait for MRenameNotifyAck's - in->add_waiter(CINODE_WAIT_RENAMENOTIFYACK, - new C_MDC_RenameNotifyAck(this, in, mds->get_nodeid())); // i am initiator - - // wait for finish - in->add_waiter(CINODE_WAIT_RENAMEACK, - new C_MDC_RenameAck(this, srcdir, in, onfinish)); - } else { - // sweet, no notify necessary, we're done! 
- file_rename_finish(srcdir, in, onfinish); - } -} - -void Renamer::handle_rename_ack(MRenameAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - - dout(7) << "handle_rename_ack on " << *in << endl; - - // all done! - in->finish_waiting(CINODE_WAIT_RENAMEACK); - - delete m; -} - -void Renamer::file_rename_finish(CDir *srcdir, CInode *in, Context *c) -{ - dout(10) << "file_rename_finish on " << *in << endl; - - // did i empty out an imported dir? FIXME this check should go somewhere else??? - if (srcdir->is_import() && !srcdir->inode->is_root() && srcdir->get_size() == 0) - cache->migrator->export_empty_import(srcdir); - - // finish our caller - if (c) { - c->finish(0); - delete c; - } -} - - -/************* src **************/ - - -/** handle_rename_req - * received by auth of src dentry (from init, or destauth if dir). - * src may not have dest dir open. - * src will export inode, unlink|rename, and send MRename to dest. - */ -void Renamer::handle_rename_req(MRenameReq *m) -{ - // i am auth, i will have it. 
- CInode *srcdiri = cache->get_inode(m->get_srcdirino()); - CDir *srcdir = srcdiri->dir; - CDentry *srcdn = srcdir->lookup(m->get_srcname()); - assert(srcdn); - - // do it - file_rename_foreign_src(srcdn, - m->get_destdirino(), m->get_destname(), m->get_destpath(), m->get_destauth(), - m->get_initiator()); - delete m; -} - - -void Renamer::file_rename_foreign_src(CDentry *srcdn, - inodeno_t destdirino, string& destname, string& destpath, int destauth, - int initiator) -{ - dout(7) << "file_rename_foreign_src " << *srcdn << endl; - - CDir *srcdir = srcdn->dir; - string srcname = srcdn->name; - - // (we're basically exporting this inode) - CInode *in = srcdn->inode; - assert(in); - assert(in->is_auth()); - - if (in->is_dir()) cache->show_imports(); - - // encode and export inode state - bufferlist inode_state; - cache->migrator->encode_export_inode(in, inode_state, destauth); - - // send - MRename *m = new MRename(initiator, - srcdir->ino(), srcdn->name, destdirino, destname, - inode_state); - mds->send_message_mds(m, destauth, MDS_PORT_CACHE); - - // have dest? - CInode *destdiri = cache->get_inode(m->get_destdirino()); - CDir *destdir = 0; - if (destdiri) destdir = destdiri->dir; - CDentry *destdn = 0; - if (destdir) destdn = destdir->lookup(m->get_destname()); - - // discover src - if (!destdn) { - dout(7) << "file_rename_foreign_src doesn't have destdn, discovering " << destpath << endl; - - filepath destfilepath = destpath; - vector trace; - int r = cache->path_traverse(destfilepath, trace, true, - m, new C_MDS_RetryMessage(mds, m), - MDS_TRAVERSE_DISCOVER); - assert(r>0); - return; - } - - assert(destdn); - - // update our cache - cache->rename_file(srcdn, destdn); - - // update imports/exports? - if (in->is_dir() && in->dir) - fix_renamed_dir(srcdir, in, destdir, true); // auth changed - - srcdn->_mark_dirty(); // fixme - - // proxy! 
- in->state_set(CInode::STATE_PROXY); - in->get(CInode::PIN_PROXY); - - // generate notify list (everybody but src|dst) and send warnings - set notify; - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i != mds->get_nodeid() && // except the source - i != destauth) // and the dest - notify.insert(i); - } - file_rename_warn(in, notify); - - - // wait for MRenameNotifyAck's - in->add_waiter(CINODE_WAIT_RENAMENOTIFYACK, - new C_MDC_RenameNotifyAck(this, in, initiator)); -} - -void Renamer::file_rename_warn(CInode *in, - set& notify) -{ - // note gather list - rename_waiting_for_ack[in->ino()] = notify; - - // send - for (set::iterator it = notify.begin(); - it != notify.end(); - it++) { - dout(10) << "file_rename_warn to " << *it << " for " << *in << endl; - mds->send_message_mds(new MRenameWarning(in->ino()), *it, MDS_PORT_CACHE); - } -} - - -void Renamer::handle_rename_notify_ack(MRenameNotifyAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - dout(7) << "handle_rename_notify_ack on " << *in << endl; - - int source = m->get_source().num(); - rename_waiting_for_ack[in->ino()].erase(source); - if (rename_waiting_for_ack[in->ino()].empty()) { - // last one! - rename_waiting_for_ack.erase(in->ino()); - in->finish_waiting(CINODE_WAIT_RENAMENOTIFYACK, 0); - } else { - dout(7) << "still waiting for " << rename_waiting_for_ack[in->ino()] << endl; - } -} - - -void Renamer::file_rename_ack(CInode *in, int initiator) -{ - // we got all our MNotifyAck's. - - // was i proxy (if not, it's cuz this was a local rename) - if (in->state_test(CInode::STATE_PROXY)) { - dout(10) << "file_rename_ack clearing proxy bit on " << *in << endl; - in->state_clear(CInode::STATE_PROXY); - in->put(CInode::PIN_PROXY); - } - - // done! 
- if (initiator == mds->get_nodeid()) { - // it's me, finish - dout(7) << "file_rename_ack i am initiator, finishing" << endl; - in->finish_waiting(CINODE_WAIT_RENAMEACK); - } else { - // send ack - dout(7) << "file_rename_ack sending MRenameAck to initiator " << initiator << endl; - mds->send_message_mds(new MRenameAck(in->ino()), initiator, MDS_PORT_CACHE); - } -} - - - - -/************ dest *************/ - -/** handle_rename_prep - * received by auth of dest dentry to make sure they have src + dir open. - * this is so that when they get the inode and dir, they can update exports etc properly. - * will send MRenameReq to src. - */ -void Renamer::handle_rename_prep(MRenamePrep *m) -{ - // open src - filepath srcpath = m->get_srcpath(); - vector trace; - int r = cache->path_traverse(srcpath, trace, false, - m, new C_MDS_RetryMessage(mds, m), - MDS_TRAVERSE_DISCOVER); - - if (r>0) return; - - // ok! - CInode *srcin = trace[trace.size()-1]->inode; - assert(srcin); - - dout(7) << "handle_rename_prep have srcin " << *srcin << endl; - - if (srcin->is_dir()) { - if (!srcin->dir) { - dout(7) << "handle_rename_prep need to open dir" << endl; - cache->open_remote_dir(srcin, - new C_MDS_RetryMessage(mds,m)); - return; - } - - dout(7) << "handle_rename_prep have dir " << *srcin->dir << endl; - } - - // pin - srcin->get(CInode::PIN_RENAMESRC); - - // send rename request - MRenameReq *req = new MRenameReq(m->get_initiator(), // i'm the initiator - m->get_srcdirino(), m->get_srcname(), - m->get_destdirino(), m->get_destname(), m->get_destpath(), - mds->get_nodeid()); // i am dest - mds->send_message_mds(req, m->get_srcauth(), MDS_PORT_CACHE); - delete m; - return; -} - - - -/** handle_rename - * received by auth of dest dentry. includes exported inode info. - * dest may not have srcdir open. 
- */ -void Renamer::handle_rename(MRename *m) -{ - // srcdn (required) - CInode *srcdiri = cache->get_inode(m->get_srcdirino()); - CDir *srcdir = srcdiri->dir; - CDentry *srcdn = srcdir->lookup(m->get_srcname()); - string srcname = srcdn->name; - assert(srcdn && srcdn->inode); - - dout(7) << "handle_rename srcdn " << *srcdn << endl; - - // destdn (required). i am auth, so i will have it. - CInode *destdiri = cache->get_inode(m->get_destdirino()); - CDir *destdir = destdiri->dir; - CDentry *destdn = destdir->lookup(m->get_destname()); - string destname = destdn->name; - assert(destdn); - - dout(7) << "handle_rename destdn " << *destdn << endl; - - // note old dir auth - int old_dir_auth = -1; - if (srcdn->inode->dir) old_dir_auth = srcdn->inode->dir->authority(); - - // rename replica into position - if (destdn->inode && destdn->inode->is_dirty()) - destdn->inode->mark_clean(); - - cache->rename_file(srcdn, destdn); - - // decode + import inode (into new location start) - int off = 0; - // HACK - bufferlist bufstate; - bufstate.claim_append(m->get_inode_state()); - cache->migrator->decode_import_inode(destdn, bufstate, off, m->get_source().num()); - - CInode *in = destdn->inode; - assert(in); - - // update imports/exports? - if (in->is_dir()) { - assert(in->dir); // i had better already ahve it open.. see MRenamePrep - fix_renamed_dir(srcdir, in, destdir, true, // auth changed - old_dir_auth); // src is possibly new dir auth. - } - - // mark dirty - destdn->_mark_dirty(); // fixme - in->_mark_dirty(); // fixme - - // unpin - in->put(CInode::PIN_RENAMESRC); - - // ok, send notifies. 
- set notify; - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i != m->get_source().num() && // except the source - i != mds->get_nodeid()) // and the dest - notify.insert(i); - } - file_rename_notify(in, srcdir, srcname, destdir, destname, notify, m->get_source().num()); - - delete m; -} - - -void Renamer::file_rename_notify(CInode *in, - CDir *srcdir, string& srcname, CDir *destdir, string& destname, - set& notify, - int srcauth) -{ - /* NOTE: notify list might include myself */ - - // tell - string destdirpath; - destdir->inode->make_path(destdirpath); - - for (set::iterator it = notify.begin(); - it != notify.end(); - it++) { - dout(10) << "file_rename_notify to " << *it << " for " << *in << endl; - mds->send_message_mds(new MRenameNotify(in->ino(), - srcdir->ino(), - srcname, - destdir->ino(), - destdirpath, - destname, - srcauth), - *it, MDS_PORT_CACHE); - } -} - - - -/************** bystanders ****************/ - -void Renamer::handle_rename_warning(MRenameWarning *m) -{ - // add to warning list - stray_rename_warnings.insert( m->get_ino() ); - - // did i already see the notify? - if (stray_rename_notifies.count(m->get_ino())) { - // i did, we're good. - dout(7) << "handle_rename_warning on " << m->get_ino() << ". already got notify." << endl; - - handle_rename_notify(stray_rename_notifies[m->get_ino()]); - stray_rename_notifies.erase(m->get_ino()); - } else { - dout(7) << "handle_rename_warning on " << m->get_ino() << ". waiting for notify." << endl; - } - - // done - delete m; -} - - -void Renamer::handle_rename_notify(MRenameNotify *m) -{ - // FIXME: when we do hard links, i think we need to - // have srcdn and destdn both, or neither, always! - - // did i see the warning yet? - if (!stray_rename_warnings.count(m->get_ino())) { - // wait for it. - dout(7) << "handle_rename_notify on " << m->get_ino() << ", waiting for warning." 
<< endl; - stray_rename_notifies[m->get_ino()] = m; - return; - } - - dout(7) << "handle_rename_notify dir " << m->get_srcdirino() << " dn " << m->get_srcname() << " to dir " << m->get_destdirino() << " dname " << m->get_destname() << endl; - - // src - CInode *srcdiri = cache->get_inode(m->get_srcdirino()); - CDir *srcdir = 0; - if (srcdiri) srcdir = srcdiri->dir; - CDentry *srcdn = 0; - if (srcdir) srcdn = srcdir->lookup(m->get_srcname()); - - // dest - CInode *destdiri = cache->get_inode(m->get_destdirino()); - CDir *destdir = 0; - if (destdiri) destdir = destdiri->dir; - CDentry *destdn = 0; - if (destdir) destdn = destdir->lookup(m->get_destname()); - - // have both? - list finished; - if (srcdn && destdir) { - CInode *in = srcdn->inode; - - int old_dir_auth = -1; - if (in && in->dir) old_dir_auth = in->dir->authority(); - - if (!destdn) { - destdn = destdir->add_dentry(m->get_destname()); // create null dentry - destdn->lockstate = DN_LOCK_XLOCK; // that's xlocked! - } - - dout(7) << "handle_rename_notify renaming " << *srcdn << " to " << *destdn << endl; - - if (in) { - cache->rename_file(srcdn, destdn); - - // update imports/exports? 
- if (in && in->is_dir() && in->dir) { - fix_renamed_dir(srcdir, in, destdir, false, old_dir_auth); // auth didnt change - } - } else { - dout(7) << " i don't have the inode (just null dentries)" << endl; - } - - } - - else if (srcdn) { - dout(7) << "handle_rename_notify no dest, but have src" << endl; - dout(7) << "srcdn is " << *srcdn << endl; - - if (destdiri) { - dout(7) << "have destdiri, opening dir " << *destdiri << endl; - cache->open_remote_dir(destdiri, - new C_MDS_RetryMessage(mds,m)); - } else { - filepath destdirpath = m->get_destdirpath(); - dout(7) << "don't have destdiri even, doing traverse+discover on " << destdirpath << endl; - - vector trace; - int r = cache->path_traverse(destdirpath, trace, true, - m, new C_MDS_RetryMessage(mds, m), - MDS_TRAVERSE_DISCOVER); - assert(r>0); - } - return; - } - - else if (destdn) { - dout(7) << "handle_rename_notify unlinking dst only " << *destdn << endl; - if (destdn->inode) { - destdir->unlink_inode(destdn); - } - } - - else { - dout(7) << "handle_rename_notify didn't have srcdn or destdn" << endl; - assert(srcdn == 0 && destdn == 0); - } - - mds->queue_finished(finished); - - - // ack - dout(10) << "sending RenameNotifyAck back to srcauth " << m->get_srcauth() << endl; - MRenameNotifyAck *ack = new MRenameNotifyAck(m->get_ino()); - mds->send_message_mds(ack, m->get_srcauth(), MDS_PORT_CACHE); - - - stray_rename_warnings.erase( m->get_ino() ); - delete m; -} - - - - diff --git a/tags/20070517_before_mds_merge/mds/Renamer.h b/tags/20070517_before_mds_merge/mds/Renamer.h deleted file mode 100644 index 1005971df986f..0000000000000 --- a/tags/20070517_before_mds_merge/mds/Renamer.h +++ /dev/null @@ -1,98 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 
2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_RENAMER_H -#define __MDS_RENAMER_H - -#include "include/types.h" - -#include -#include -using std::map; -using std::set; - -class MDS; -class MDCache; -class CDentry; -class CInode; -class CDir; - -class Message; -class MRenameWarning; -class MRenameNotify; -class MRenameNotifyAck; -class MRename; -class MRenamePrep; -class MRenameReq; -class MRenameAck; - -class Renamer { - MDS *mds; - MDCache *cache; - - // rename fun - set stray_rename_warnings; // notifies i haven't seen - map stray_rename_notifies; - - map > rename_waiting_for_ack; - - - - void fix_renamed_dir(CDir *srcdir, - CInode *in, - CDir *destdir, - bool authchanged, // _inode_ auth changed - int dirauth=-1); // dirauth (for certain cases) - - -public: - Renamer(MDS *m, MDCache *c) : mds(m), cache(c) {} - - void dispatch(Message *m); - - // RENAME - // initiator - public: - void file_rename(CDentry *srcdn, CDentry *destdn, Context *c); - protected: - void handle_rename_ack(MRenameAck *m); // dest -> init (almost always) - void file_rename_finish(CDir *srcdir, CInode *in, Context *c); - friend class C_MDC_RenameAck; - - // src - void handle_rename_req(MRenameReq *m); // dest -> src - void file_rename_foreign_src(CDentry *srcdn, - inodeno_t destdirino, string& destname, string& destpath, int destauth, - int initiator); - void file_rename_warn(CInode *in, set& notify); - void handle_rename_notify_ack(MRenameNotifyAck *m); // bystanders -> src - void file_rename_ack(CInode *in, int initiator); - friend class C_MDC_RenameNotifyAck; - - // dest - void handle_rename_prep(MRenamePrep *m); // init -> dest - void handle_rename(MRename *m); // src -> dest - void file_rename_notify(CInode *in, - CDir *srcdir, string& srcname, CDir *destdir, string& destname, - set& notify, int srcauth); - - // bystander - void handle_rename_warning(MRenameWarning *m); // src -> bystanders - void handle_rename_notify(MRenameNotify *m); 
// dest -> bystanders - - -}; - -#endif - - diff --git a/tags/20070517_before_mds_merge/mds/Server.cc b/tags/20070517_before_mds_merge/mds/Server.cc deleted file mode 100644 index 736913f301cb1..0000000000000 --- a/tags/20070517_before_mds_merge/mds/Server.cc +++ /dev/null @@ -1,2389 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "MDS.h" -#include "Server.h" -#include "Locker.h" -#include "MDCache.h" -#include "MDLog.h" -#include "Migrator.h" -#include "MDBalancer.h" -#include "Renamer.h" -#include "MDStore.h" - -#include "msg/Messenger.h" - -#include "messages/MClientMount.h" -#include "messages/MClientMountAck.h" -#include "messages/MClientRequest.h" -#include "messages/MClientReply.h" -#include "messages/MHashReaddir.h" -#include "messages/MHashReaddirReply.h" - -#include "messages/MLock.h" - -#include "messages/MInodeLink.h" - -#include "events/EString.h" -#include "events/EUpdate.h" - -#include "include/filepath.h" -#include "common/Timer.h" -#include "common/Logger.h" -#include "common/LogType.h" - -#include -#include - -#include -#include -using namespace std; - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".server " -#define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".server " - - -void Server::dispatch(Message *m) -{ - // active? 
- if (!mds->is_active()) { - dout(3) << "not active yet, waiting" << endl; - mds->queue_waitfor_active(new C_MDS_RetryMessage(mds, m)); - return; - } - - switch (m->get_type()) { - case MSG_CLIENT_MOUNT: - handle_client_mount((MClientMount*)m); - return; - case MSG_CLIENT_UNMOUNT: - handle_client_unmount(m); - return; - } - - - switch (m->get_type()) { - case MSG_CLIENT_REQUEST: - handle_client_request((MClientRequest*)m); - return; - - case MSG_MDS_HASHREADDIR: - handle_hash_readdir((MHashReaddir*)m); - return; - case MSG_MDS_HASHREADDIRREPLY: - handle_hash_readdir_reply((MHashReaddirReply*)m); - return; - - } - - dout(1) << " main unknown message " << m->get_type() << endl; - assert(0); -} - - - - - -void Server::handle_client_mount(MClientMount *m) -{ - int n = m->get_source().num(); - dout(3) << "mount by client" << n << endl; - mds->clientmap.add_mount(n, m->get_source_inst()); - - assert(mds->get_nodeid() == 0); // mds0 mounts/unmounts - - // ack - messenger->send_message(new MClientMountAck(m, mds->mdsmap, mds->osdmap), - m->get_source_inst()); - delete m; -} - -void Server::handle_client_unmount(Message *m) -{ - int n = m->get_source().num(); - dout(3) << "unmount by client" << n << endl; - - assert(mds->get_nodeid() == 0); // mds0 mounts/unmounts - - mds->clientmap.rem_mount(n); - - if (g_conf.mds_shutdown_on_last_unmount && - mds->clientmap.get_mount_set().empty()) { - dout(3) << "all clients done, initiating shutdown" << endl; - mds->shutdown_start(); - } - - // ack by sending back to client - messenger->send_message(m, m->get_source_inst()); -} - - - -/******* - * some generic stuff for finishing off requests - */ - -/** C_MDS_CommitRequest - */ - -class C_MDS_CommitRequest : public Context { - Server *server; - MClientRequest *req; - MClientReply *reply; - CInode *tracei; // inode to include a trace for - LogEvent *event; - -public: - C_MDS_CommitRequest(Server *server, - MClientRequest *req, MClientReply *reply, CInode *tracei, - LogEvent *event=0) { 
- this->server = server; - this->req = req; - this->tracei = tracei; - this->reply = reply; - this->event = event; - } - void finish(int r) { - if (r != 0) { - // failure. set failure code and reply. - reply->set_result(r); - } - if (event) { - server->commit_request(req, reply, tracei, event); - } else { - // reply. - server->reply_request(req, reply, tracei); - } - } -}; - - -/* - * send generic response (just and error code) - */ -void Server::reply_request(MClientRequest *req, int r, CInode *tracei) -{ - reply_request(req, new MClientReply(req, r), tracei); -} - - -/* - * send given reply - * include a trace to tracei - */ -void Server::reply_request(MClientRequest *req, MClientReply *reply, CInode *tracei) { - dout(10) << "reply_request r=" << reply->get_result() << " " << *req << endl; - - // include trace - if (tracei) { - reply->set_trace_dist( tracei, mds->get_nodeid() ); - } - - // send reply - messenger->send_message(reply, - req->get_client_inst()); - - // discard request - mdcache->request_finish(req); - - // stupid stats crap (FIXME) - stat_ops++; -} - - -void Server::submit_update(MClientRequest *req, - CInode *wrlockedi, - LogEvent *event, - Context *oncommit) -{ - // log - mdlog->submit_entry(event); - - // pin - mdcache->request_pin_inode(req, wrlockedi); - - // wait - mdlog->wait_for_sync(oncommit); -} - - -/* - * commit event(s) to the metadata journal, then reply. - * or, be sloppy and do it concurrently (see g_conf.mds_log_before_reply) - * - * NOTE: this is old and bad (write-behind!) - */ -void Server::commit_request(MClientRequest *req, - MClientReply *reply, - CInode *tracei, - LogEvent *event, - LogEvent *event2) -{ - // log - if (event) mdlog->submit_entry(event); - if (event2) mdlog->submit_entry(event2); - - if (g_conf.mds_log_before_reply && g_conf.mds_log && event) { - // SAFE mode! - - // pin inode so it doesn't go away! 
- if (tracei) mdcache->request_pin_inode(req, tracei); - - // wait for log sync - mdlog->wait_for_sync(new C_MDS_CommitRequest(this, req, reply, tracei)); - return; - } - else { - // just reply - reply_request(req, reply, tracei); - } -} - - - -/*** - * process a client request - */ - -void Server::handle_client_request(MClientRequest *req) -{ - dout(4) << "req " << *req << endl; - - // note original client addr - if (req->get_source().is_client()) { - req->set_client_inst( req->get_source_inst() ); - req->clear_payload(); - } - - if (!mds->is_active()) { - dout(5) << " not active, discarding client request." << endl; - delete req; - return; - } - - if (!mdcache->get_root()) { - dout(5) << "need to open root" << endl; - mdcache->open_root(new C_MDS_RetryMessage(mds, req)); - return; - } - - // okay, i want - CInode *ref = 0; - vector trace; // might be blank, for fh guys - - bool follow_trailing_symlink = false; - - // operations on fh's or other non-files - switch (req->get_op()) { - /* - case MDS_OP_FSTAT: - reply = handle_client_fstat(req, cur); - break; ****** fiX ME *** - */ - - case MDS_OP_TRUNCATE: - if (!req->get_ino()) break; // can be called w/ either fh OR path - - case MDS_OP_RELEASE: - case MDS_OP_FSYNC: - ref = mdcache->get_inode(req->get_ino()); // fixme someday no ino needed? - - if (!ref) { - int next = mds->get_nodeid() + 1; - if (next >= mds->mdsmap->get_num_mds()) next = 0; - dout(10) << "got request on ino we don't have, passing buck to " << next << endl; - mds->send_message_mds(req, next, MDS_PORT_SERVER); - return; - } - } - - if (!ref) { - // we need to traverse a path - filepath refpath = req->get_filepath(); - - // ops on non-existing files --> directory paths - switch (req->get_op()) { - case MDS_OP_OPEN: - if (!(req->get_iarg() & O_CREAT)) break; - - case MDS_OP_MKNOD: - case MDS_OP_MKDIR: - case MDS_OP_SYMLINK: - case MDS_OP_LINK: - case MDS_OP_UNLINK: // also wrt parent dir, NOT the unlinked inode!! 
- case MDS_OP_RMDIR: - case MDS_OP_RENAME: - // remove last bit of path - refpath = refpath.prefixpath(refpath.depth()-1); - break; - } - dout(10) << "refpath = " << refpath << endl; - - Context *ondelay = new C_MDS_RetryMessage(mds, req); - - if (req->get_op() == MDS_OP_LSTAT) { - follow_trailing_symlink = false; - } - - // do trace - int r = mdcache->path_traverse(refpath, trace, follow_trailing_symlink, - req, ondelay, - MDS_TRAVERSE_FORWARD, - 0, - true); // is MClientRequest - - if (r > 0) return; // delayed - if (r == -ENOENT || - r == -ENOTDIR || - r == -EISDIR) { - // error! - dout(10) << " path traverse error " << r << ", replying" << endl; - - // send error - messenger->send_message(new MClientReply(req, r), - req->get_client_inst()); - - // - // is this a special debug command? - if (refpath.depth() - 1 == trace.size() && - refpath.last_bit().find(".ceph.") == 0) { - CDir *dir = 0; - if (trace.empty()) - dir = mdcache->get_root()->dir; - else - dir = trace[trace.size()-1]->get_inode()->dir; - - dout(1) << "** POSSIBLE CEPH DEBUG COMMAND '" << refpath.last_bit() << "' in " << *dir << endl; - - if (refpath.last_bit() == ".ceph.hash" && - refpath.depth() > 1) { - dout(1) << "got explicit hash command " << refpath << endl; - CDir *dir = trace[trace.size()-1]->get_inode()->dir; - if (!dir->is_hashed() && - !dir->is_hashing() && - dir->is_auth()) - mdcache->migrator->hash_dir(dir); - } - else if (refpath.last_bit() == ".ceph.commit") { - dout(1) << "got explicit commit command on " << *dir << endl; - mds->mdstore->commit_dir(dir, 0); - } - } - // - - - delete req; - return; - } - - if (trace.size()) - ref = trace[trace.size()-1]->inode; - else - ref = mdcache->get_root(); - } - - dout(10) << "ref is " << *ref << endl; - - // rename doesn't pin src path (initially) - if (req->get_op() == MDS_OP_RENAME) trace.clear(); - - // register - if (!mdcache->request_start(req, ref, trace)) - return; - - // process - dispatch_request(req, ref); -} - - - -void 
Server::dispatch_request(Message *m, CInode *ref) -{ - MClientRequest *req = 0; - - // MLock or MClientRequest? - /* this is a little weird. - client requests and mlocks both initial dentry xlocks, path pins, etc., - and thus both make use of the context C_MDS_RetryRequest. - */ - switch (m->get_type()) { - case MSG_CLIENT_REQUEST: - req = (MClientRequest*)m; - break; // continue below! - - case MSG_MDS_LOCK: - mds->locker->handle_lock_dn((MLock*)m); - return; // done - - default: - assert(0); // shouldn't get here - } - - // MClientRequest. - - switch(req->get_op()) { - - // files - case MDS_OP_OPEN: - if (req->get_iarg() & O_CREAT) - handle_client_openc(req, ref); - else - handle_client_open(req, ref); - break; - case MDS_OP_TRUNCATE: - handle_client_truncate(req, ref); - break; - /* - case MDS_OP_FSYNC: - handle_client_fsync(req, ref); - break; - */ - /* - case MDS_OP_RELEASE: - handle_client_release(req, ref); - break; - */ - - // inodes - case MDS_OP_STAT: - case MDS_OP_LSTAT: - handle_client_stat(req, ref); - break; - case MDS_OP_UTIME: - handle_client_utime(req, ref); - break; - case MDS_OP_CHMOD: - handle_client_chmod(req, ref); - break; - case MDS_OP_CHOWN: - handle_client_chown(req, ref); - break; - - // namespace - case MDS_OP_READDIR: - handle_client_readdir(req, ref); - break; - case MDS_OP_MKNOD: - handle_client_mknod(req, ref); - break; - case MDS_OP_LINK: - handle_client_link(req, ref); - break; - case MDS_OP_UNLINK: - handle_client_unlink(req, ref); - break; - case MDS_OP_RENAME: - handle_client_rename(req, ref); - break; - case MDS_OP_RMDIR: - handle_client_unlink(req, ref); - break; - case MDS_OP_MKDIR: - handle_client_mkdir(req, ref); - break; - case MDS_OP_SYMLINK: - handle_client_symlink(req, ref); - break; - - - - default: - dout(1) << " unknown client op " << req->get_op() << endl; - assert(0); - } - - return; -} - - -// FIXME: this probably should go somewhere else. 
- -bool Server::try_open_dir(CInode *in, MClientRequest *req) -{ - if (!in->dir && in->is_frozen_dir()) { - // doh! - dout(10) << " dir inode is frozen, can't open dir, waiting " << *in << endl; - assert(in->get_parent_dir()); - in->get_parent_dir()->add_waiter(CDIR_WAIT_UNFREEZE, - new C_MDS_RetryRequest(mds, req, in)); - return false; - } - - in->get_or_open_dir(mds->mdcache); - return true; -} - - - - - -// =============================================================================== -// STAT - -void Server::handle_client_stat(MClientRequest *req, - CInode *ref) -{ - // FIXME: this is really not the way to handle the statlite mask. - - // do I need file info? - int mask = req->get_iarg(); - if (mask & (INODE_MASK_SIZE|INODE_MASK_MTIME)) { - // yes. do a full stat. - if (!mds->locker->inode_file_read_start(ref, req)) - return; // syncing - mds->locker->inode_file_read_finish(ref); - } else { - // nope! easy peasy. - } - - mds->balancer->hit_inode(ref, META_POP_IRD); - - // reply - //dout(10) << "reply to " << *req << " stat " << ref->inode.mtime << endl; - MClientReply *reply = new MClientReply(req); - reply_request(req, reply, ref); -} - - - - -// =============================================================================== -// INODE UPDATES - - -/* - * finisher: do a inode_file_write_finish and reply. 
- */ -class C_MDS_utime_finish : public Context { - MDS *mds; - MClientRequest *req; - CInode *in; - version_t pv; - time_t mtime, atime; -public: - C_MDS_utime_finish(MDS *m, MClientRequest *r, CInode *i, version_t pdv, time_t mt, time_t at) : - mds(m), req(r), in(i), - pv(pdv), - mtime(mt), atime(at) { } - void finish(int r) { - assert(r == 0); - - // apply - in->inode.mtime = mtime; - in->inode.atime = atime; - in->mark_dirty(pv); - - // unlock - mds->locker->inode_file_write_finish(in); - - // reply - MClientReply *reply = new MClientReply(req, 0); - reply->set_result(0); - mds->server->reply_request(req, reply, in); - } -}; - - -// utime - -void Server::handle_client_utime(MClientRequest *req, - CInode *cur) -{ - // write - if (!mds->locker->inode_file_write_start(cur, req)) - return; // fw or (wait for) sync - - mds->balancer->hit_inode(cur, META_POP_IWR); - - // prepare - version_t pdv = cur->pre_dirty(); - time_t mtime = req->get_targ(); - time_t atime = req->get_targ2(); - C_MDS_utime_finish *fin = new C_MDS_utime_finish(mds, req, cur, pdv, - mtime, atime); - - // log + wait - EUpdate *le = new EUpdate("utime"); - le->metablob.add_dir_context(cur->get_parent_dir()); - inode_t *pi = le->metablob.add_dentry(cur->parent, true); - pi->mtime = mtime; - pi->atime = mtime; - pi->version = pdv; - - mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); -} - - -// -------------- - -/* - * finisher: do a inode_hard_write_finish and reply. 
- */ -class C_MDS_chmod_finish : public Context { - MDS *mds; - MClientRequest *req; - CInode *in; - version_t pv; - int mode; -public: - C_MDS_chmod_finish(MDS *m, MClientRequest *r, CInode *i, version_t pdv, int mo) : - mds(m), req(r), in(i), pv(pdv), mode(mo) { } - void finish(int r) { - assert(r == 0); - - // apply - in->inode.mode &= ~04777; - in->inode.mode |= (mode & 04777); - in->mark_dirty(pv); - - // unlock - mds->locker->inode_hard_write_finish(in); - - // reply - MClientReply *reply = new MClientReply(req, 0); - reply->set_result(0); - mds->server->reply_request(req, reply, in); - } -}; - - -// chmod - -void Server::handle_client_chmod(MClientRequest *req, - CInode *cur) -{ - // write - if (!mds->locker->inode_hard_write_start(cur, req)) - return; // fw or (wait for) lock - - mds->balancer->hit_inode(cur, META_POP_IWR); - - // prepare - version_t pdv = cur->pre_dirty(); - int mode = req->get_iarg(); - C_MDS_chmod_finish *fin = new C_MDS_chmod_finish(mds, req, cur, pdv, - mode); - - // log + wait - EUpdate *le = new EUpdate("chmod"); - le->metablob.add_dir_context(cur->get_parent_dir()); - inode_t *pi = le->metablob.add_dentry(cur->parent, true); - pi->mode = mode; - pi->version = pdv; - - mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); -} - - -// chown - -class C_MDS_chown_finish : public Context { - MDS *mds; - MClientRequest *req; - CInode *in; - version_t pv; - int uid, gid; -public: - C_MDS_chown_finish(MDS *m, MClientRequest *r, CInode *i, version_t pdv, int u, int g) : - mds(m), req(r), in(i), pv(pdv), uid(u), gid(g) { } - void finish(int r) { - assert(r == 0); - - // apply - if (uid >= 0) in->inode.uid = uid; - if (gid >= 0) in->inode.gid = gid; - in->mark_dirty(pv); - - // unlock - mds->locker->inode_hard_write_finish(in); - - // reply - MClientReply *reply = new MClientReply(req, 0); - reply->set_result(0); - mds->server->reply_request(req, reply, in); - } -}; - - -void Server::handle_client_chown(MClientRequest *req, - CInode *cur) -{ - 
// write - if (!mds->locker->inode_hard_write_start(cur, req)) - return; // fw or (wait for) lock - - mds->balancer->hit_inode(cur, META_POP_IWR); - - // prepare - version_t pdv = cur->pre_dirty(); - int uid = req->get_iarg(); - int gid = req->get_iarg2(); - C_MDS_chown_finish *fin = new C_MDS_chown_finish(mds, req, cur, pdv, - uid, gid); - - // log + wait - EUpdate *le = new EUpdate("chown"); - le->metablob.add_dir_context(cur->get_parent_dir()); - inode_t *pi = le->metablob.add_dentry(cur->parent, true); - if (uid >= 0) pi->uid = uid; - if (gid >= 0) pi->gid = gid; - pi->version = pdv; - - mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); -} - - - - - - - -// ================================================================= -// DIRECTORY and NAMESPACE OPS - -// READDIR - -int Server::encode_dir_contents(CDir *dir, - list& inls, - list& dnls) -{ - int numfiles = 0; - - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - - // hashed? - if (dir->is_hashed() && - mds->get_nodeid() != mds->mdcache->hash_dentry( dir->ino(), it->first )) - continue; - - if (dn->is_null()) continue; - - CInode *in = dn->inode; - if (!in) - continue; // hmm, fixme!, what about REMOTE links? - - dout(12) << "including inode " << *in << endl; - - // add this item - // note: InodeStat makes note of whether inode data is readable. - dnls.push_back( it->first ); - inls.push_back( new InodeStat(in, mds->get_nodeid()) ); - numfiles++; - } - return numfiles; -} - - -/* - * note: this is pretty sloppy, but should work just fine i think... - */ -void Server::handle_hash_readdir(MHashReaddir *m) -{ - CInode *cur = mdcache->get_inode(m->get_ino()); - assert(cur); - - if (!cur->dir || - !cur->dir->is_hashed()) { - assert(0); - dout(7) << "handle_hash_readdir don't have dir open, or not hashed. giving up!" << endl; - delete m; - return; - } - CDir *dir = cur->dir; - assert(dir); - assert(dir->is_hashed()); - - // complete? 
- if (!dir->is_complete()) { - dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << endl; - mds->mdstore->fetch_dir(dir, new C_MDS_RetryMessage(mds, m)); - return; - } - - // get content - list inls; - list dnls; - int num = encode_dir_contents(dir, inls, dnls); - - // sent it back! - messenger->send_message(new MHashReaddirReply(dir->ino(), inls, dnls, num), - m->get_source_inst(), MDS_PORT_CACHE); -} - - -void Server::handle_hash_readdir_reply(MHashReaddirReply *m) -{ - CInode *cur = mdcache->get_inode(m->get_ino()); - assert(cur); - - if (!cur->dir || - !cur->dir->is_hashed()) { - assert(0); - dout(7) << "handle_hash_readdir don't have dir open, or not hashed. giving up!" << endl; - delete m; - return; - } - CDir *dir = cur->dir; - assert(dir); - assert(dir->is_hashed()); - - // move items to hashed_readdir gather - int from = m->get_source().num(); - assert(dir->hashed_readdir.count(from) == 0); - dir->hashed_readdir[from].first.splice(dir->hashed_readdir[from].first.begin(), - m->get_in()); - dir->hashed_readdir[from].second.splice(dir->hashed_readdir[from].second.begin(), - m->get_dn()); - delete m; - - // gather finished? - if (dir->hashed_readdir.size() < (unsigned)mds->mdsmap->get_num_mds()) { - dout(7) << "still waiting for more hashed readdir bits" << endl; - return; - } - - dout(7) << "got last bit! finishing waiters" << endl; - - // do these finishers. they'll copy the results. - list finished; - dir->take_waiting(CDIR_WAIT_THISHASHEDREADDIR, finished); - finish_contexts(finished); - - // now discard these results - for (map, list > >::iterator it = dir->hashed_readdir.begin(); - it != dir->hashed_readdir.end(); - it++) { - for (list::iterator ci = it->second.first.begin(); - ci != it->second.first.end(); - ci++) - delete *ci; - } - dir->hashed_readdir.clear(); - - // unpin dir (we're done!) 
- dir->auth_unpin(); - - // trigger any waiters for next hashed readdir cycle - dir->take_waiting(CDIR_WAIT_NEXTHASHEDREADDIR, mds->finished_queue); -} - - -class C_MDS_HashReaddir : public Context { - Server *server; - MClientRequest *req; - CDir *dir; -public: - C_MDS_HashReaddir(Server *server, MClientRequest *req, CDir *dir) { - this->server = server; - this->req = req; - this->dir = dir; - } - void finish(int r) { - server->finish_hash_readdir(req, dir); - } -}; - -void Server::finish_hash_readdir(MClientRequest *req, CDir *dir) -{ - dout(7) << "finish_hash_readdir on " << *dir << endl; - - assert(dir->is_hashed()); - assert(dir->hashed_readdir.size() == (unsigned)mds->mdsmap->get_num_mds()); - - // reply! - MClientReply *reply = new MClientReply(req); - reply->set_result(0); - - for (int i=0; imdsmap->get_num_mds(); i++) { - reply->copy_dir_items(dir->hashed_readdir[i].first, - dir->hashed_readdir[i].second); - } - - // ok! - reply_request(req, reply, dir->inode); -} - - -void Server::handle_client_readdir(MClientRequest *req, - CInode *cur) -{ - // it's a directory, right? - if (!cur->is_dir()) { - // not a dir - dout(10) << "reply to " << *req << " readdir -ENOTDIR" << endl; - reply_request(req, -ENOTDIR); - return; - } - - // auth? - if (!cur->dir_is_auth()) { - int dirauth = cur->authority(); - if (cur->dir) - dirauth = cur->dir->authority(); - assert(dirauth >= 0); - assert(dirauth != mds->get_nodeid()); - - // forward to authority - dout(10) << " forwarding readdir to authority " << dirauth << endl; - mdcache->request_forward(req, dirauth); - return; - } - - if (!try_open_dir(cur, req)) - return; - assert(cur->dir->is_auth()); - - // unhashing? wait! 
- if (cur->dir->is_hashed() && - cur->dir->is_unhashing()) { - dout(10) << "unhashing, waiting" << endl; - cur->dir->add_waiter(CDIR_WAIT_UNFREEZE, - new C_MDS_RetryRequest(mds, req, cur)); - return; - } - - // check perm - if (!mds->locker->inode_hard_read_start(cur,req)) - return; - mds->locker->inode_hard_read_finish(cur); - - CDir *dir = cur->dir; - assert(dir); - - if (!dir->is_complete()) { - // fetch - dout(10) << " incomplete dir contents for readdir on " << *cur->dir << ", fetching" << endl; - mds->mdstore->fetch_dir(dir, new C_MDS_RetryRequest(mds, req, cur)); - return; - } - - if (dir->is_hashed()) { - // HASHED - dout(7) << "hashed dir" << endl; - if (!dir->can_auth_pin()) { - dout(7) << "can't auth_pin dir " << *dir << " waiting" << endl; - dir->add_waiter(CDIR_WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, req, cur)); - return; - } - - if (!dir->hashed_readdir.empty()) { - dout(7) << "another readdir gather in progres, waiting" << endl; - dir->add_waiter(CDIR_WAIT_NEXTHASHEDREADDIR, new C_MDS_RetryRequest(mds, req, cur)); - return; - } - - // start new readdir gather - dout(7) << "staring new hashed readdir gather" << endl; - - // pin auth for process! - dir->auth_pin(); - - // get local bits - encode_dir_contents(cur->dir, - dir->hashed_readdir[mds->get_nodeid()].first, - dir->hashed_readdir[mds->get_nodeid()].second); - - // request other bits - for (int i=0; imdsmap->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - mds->send_message_mds(new MHashReaddir(dir->ino()), i, MDS_PORT_SERVER); - } - - // wait - dir->add_waiter(CDIR_WAIT_THISHASHEDREADDIR, - new C_MDS_HashReaddir(this, req, dir)); - } else { - // NON-HASHED - // build dir contents - list inls; - list dnls; - int numfiles = encode_dir_contents(cur->dir, inls, dnls); - - // . 
too - dnls.push_back("."); - inls.push_back(new InodeStat(cur, mds->get_nodeid())); - ++numfiles; - - // yay, reply - MClientReply *reply = new MClientReply(req); - reply->take_dir_items(inls, dnls, numfiles); - - dout(10) << "reply to " << *req << " readdir " << numfiles << " files" << endl; - reply->set_result(0); - - //balancer->hit_dir(cur->dir); - - // reply - reply_request(req, reply, cur); - } -} - - - -// ------------------------------------------------ - -// MKNOD - -class C_MDS_mknod_finish : public Context { - MDS *mds; - MClientRequest *req; - CDentry *dn; - CInode *newi; - version_t pv; -public: - C_MDS_mknod_finish(MDS *m, MClientRequest *r, CDentry *d, CInode *ni) : - mds(m), req(r), dn(d), newi(ni), - pv(d->get_projected_version()) {} - void finish(int r) { - assert(r == 0); - - // link the inode - dn->get_dir()->link_inode(dn, newi); - - // dirty inode, dn, dir - newi->mark_dirty(pv); - - // unlock - mds->locker->dentry_xlock_finish(dn); - - // hit pop - mds->balancer->hit_inode(newi, META_POP_IWR); - - // reply - MClientReply *reply = new MClientReply(req, 0); - reply->set_result(0); - mds->server->reply_request(req, reply, newi); - } -}; - -void Server::handle_client_mknod(MClientRequest *req, CInode *diri) -{ - CInode *newi = 0; - CDentry *dn = 0; - - // make dentry and inode, xlock dentry. - if (!prepare_mknod(req, diri, &newi, &dn)) - return; - assert(newi); - assert(dn); - - // it's a file. - newi->inode.mode = req->get_iarg(); - newi->inode.mode &= ~INODE_TYPE_MASK; - newi->inode.mode |= INODE_MODE_FILE; - - // prepare finisher - C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, req, dn, newi); - EUpdate *le = new EUpdate("mknod"); - le->metablob.add_dir_context(diri->dir); - inode_t *pi = le->metablob.add_dentry(dn, true, newi); - pi->version = dn->get_projected_version(); - - // log + wait - mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); -} - - - -/* - * verify that the dir exists and would own the dname. 
- * do not check if the dentry exists. - */ -CDir *Server::validate_new_dentry_dir(MClientRequest *req, CInode *diri, string& name) -{ - // make sure parent is a dir? - if (!diri->is_dir()) { - dout(7) << "validate_new_dentry_dir: not a dir" << endl; - reply_request(req, -ENOTDIR); - return false; - } - - // am i not open, not auth? - if (!diri->dir && !diri->is_auth()) { - int dirauth = diri->authority(); - dout(7) << "validate_new_dentry_dir: don't know dir auth, not open, auth is i think mds" << dirauth << endl; - mdcache->request_forward(req, dirauth); - return false; - } - - if (!try_open_dir(diri, req)) - return false; - CDir *dir = diri->dir; - - // make sure it's my dentry - int dnauth = dir->dentry_authority(name); - if (dnauth != mds->get_nodeid()) { - // fw - dout(7) << "mknod on " << req->get_path() << ", dentry " << *dir - << " dn " << name - << " not mine, fw to " << dnauth << endl; - mdcache->request_forward(req, dnauth); - return false; - } - - // dir auth pinnable? - if (!dir->can_auth_pin()) { - dout(7) << "validate_new_dentry_dir: dir " << *dir << " not pinnable, waiting" << endl; - dir->add_waiter(CDIR_WAIT_AUTHPINNABLE, - new C_MDS_RetryRequest(mds, req, diri)); - return false; - } - - // frozen? - if (dir->is_frozen()) { - dout(7) << "dir is frozen " << *dir << endl; - dir->add_waiter(CDIR_WAIT_UNFREEZE, - new C_MDS_RetryRequest(mds, req, diri)); - return false; - } - - return dir; -} - -/* - * prepare a mknod-type operation (mknod, mkdir, symlink, open+create). - * create the inode and dentry, but do not link them. - * pre_dirty the dentry+dir. - * xlock the dentry. 
- * - * return val - * 0 - wait for something - * 1 - created - * 2 - already exists (only if okexist=true) - */ -int Server::prepare_mknod(MClientRequest *req, CInode *diri, - CInode **pin, CDentry **pdn, - bool okexist) -{ - dout(10) << "prepare_mknod " << req->get_filepath() << " in " << *diri << endl; - - // get containing directory (without last bit) - filepath dirpath = req->get_filepath().prefixpath(req->get_filepath().depth() - 1); - string name = req->get_filepath().last_bit(); - - CDir *dir = validate_new_dentry_dir(req, diri, name); - if (!dir) return 0; - - // make sure name doesn't already exist - *pdn = dir->lookup(name); - if (*pdn) { - if (!(*pdn)->can_read(req)) { - dout(10) << "waiting on (existing!) dentry " << **pdn << endl; - dir->add_waiter(CDIR_WAIT_DNREAD, name, new C_MDS_RetryRequest(mds, req, diri)); - return 0; - } - - if (!(*pdn)->is_null()) { - // name already exists - if (okexist) { - dout(10) << "dentry " << name << " exists in " << *dir << endl; - *pin = (*pdn)->inode; - return 2; - } else { - dout(10) << "dentry " << name << " exists in " << *dir << endl; - reply_request(req, -EEXIST); - return 0; - } - } - } - - // make sure dir is complete - if (!dir->is_complete()) { - dout(7) << " incomplete dir contents for " << *dir << ", fetching" << endl; - mds->mdstore->fetch_dir(dir, new C_MDS_RetryRequest(mds, req, diri)); - return 0; - } - - // make sure dir is pinnable - - - // create inode - *pin = mdcache->create_inode(); - (*pin)->inode.uid = req->get_caller_uid(); - (*pin)->inode.gid = req->get_caller_gid(); - (*pin)->inode.ctime = (*pin)->inode.mtime = (*pin)->inode.atime = g_clock.gettime(); // now - // note: inode.version will get set by finisher's mark_dirty. 
- - // create dentry - if (!*pdn) - *pdn = dir->add_dentry(name, 0); - - (*pdn)->pre_dirty(); - - // xlock dentry - bool res = mds->locker->dentry_xlock_start(*pdn, req, diri); - assert(res == true); - - // bump modify pop - mds->balancer->hit_dir(dir, META_POP_DWR); - - return 1; -} - - - - - -// MKDIR - -void Server::handle_client_mkdir(MClientRequest *req, CInode *diri) -{ - CInode *newi = 0; - CDentry *dn = 0; - - // make dentry and inode, xlock dentry. - if (!prepare_mknod(req, diri, &newi, &dn)) - return; - assert(newi); - assert(dn); - - // it's a directory. - newi->inode.mode = req->get_iarg(); - newi->inode.mode &= ~INODE_TYPE_MASK; - newi->inode.mode |= INODE_MODE_DIR; - newi->inode.layout = g_OSD_MDDirLayout; - - // ...and that new dir is empty. - CDir *newdir = newi->get_or_open_dir(mds->mdcache); - newdir->mark_complete(); - newdir->mark_dirty(newdir->pre_dirty()); - - // prepare finisher - C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, req, dn, newi); - EUpdate *le = new EUpdate("mkdir"); - le->metablob.add_dir_context(diri->dir); - inode_t *pi = le->metablob.add_dentry(dn, true, newi); - pi->version = dn->get_projected_version(); - le->metablob.add_dir(newi->dir, true); - - // log + wait - mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); - - - /* old export heuristic. pbly need to reimplement this at some point. - if ( - diri->dir->is_auth() && - diri->dir->is_rep() && - newdir->is_auth() && - !newdir->is_hashing()) { - int dest = rand() % mds->mdsmap->get_num_mds(); - if (dest != whoami) { - dout(10) << "exporting new dir " << *newdir << " in replicated parent " << *diri->dir << endl; - mdcache->migrator->export_dir(newdir, dest); - } - } - */ -} - - - -// SYMLINK - -void Server::handle_client_symlink(MClientRequest *req, CInode *diri) -{ - CInode *newi = 0; - CDentry *dn = 0; - - // make dentry and inode, xlock dentry. 
- if (!prepare_mknod(req, diri, &newi, &dn)) - return; - assert(newi); - assert(dn); - - // it's a symlink - newi->inode.mode &= ~INODE_TYPE_MASK; - newi->inode.mode |= INODE_MODE_SYMLINK; - newi->symlink = req->get_sarg(); - - // prepare finisher - C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, req, dn, newi); - EUpdate *le = new EUpdate("symlink"); - le->metablob.add_dir_context(diri->dir); - inode_t *pi = le->metablob.add_dentry(dn, true, newi); - pi->version = dn->get_projected_version(); - - // log + wait - mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); -} - - - - - -// LINK - -class C_MDS_LinkTraverse : public Context { - Server *server; - MClientRequest *req; - CInode *ref; -public: - vector trace; - C_MDS_LinkTraverse(Server *server, MClientRequest *req, CInode *ref) { - this->server = server; - this->req = req; - this->ref = ref; - } - void finish(int r) { - server->handle_client_link_2(r, req, ref, trace); - } -}; - -void Server::handle_client_link(MClientRequest *req, CInode *ref) -{ - // figure out name - string dname = req->get_filepath().last_bit(); - dout(7) << "handle_client_link dname is " << dname << endl; - - // validate dir - CDir *dir = validate_new_dentry_dir(req, ref, dname); - if (!dir) return; - - // dentry exists? 
- CDentry *dn = dir->lookup(dname); - if (dn && (!dn->is_null() || dn->is_xlockedbyother(req))) { - dout(7) << "handle_client_link dn exists " << *dn << endl; - reply_request(req, -EEXIST); - return; - } - - // xlock dentry - if (!dn->is_xlockedbyme(req)) { - if (!mds->locker->dentry_xlock_start(dn, req, ref)) - return; - } - - // discover link target - filepath target = req->get_sarg(); - dout(7) << "handle_client_link discovering target " << target << endl; - C_MDS_LinkTraverse *onfinish = new C_MDS_LinkTraverse(this, req, ref); - Context *ondelay = new C_MDS_RetryRequest(mds, req, ref); - - mdcache->path_traverse(target, onfinish->trace, false, - req, ondelay, - MDS_TRAVERSE_DISCOVER, //XLOCK, - onfinish); -} - - -class C_MDS_RemoteLink : public Context { - Server *server; - MClientRequest *req; - CInode *ref; - CDentry *dn; - CInode *targeti; -public: - C_MDS_RemoteLink(Server *server, MClientRequest *req, CInode *ref, CDentry *dn, CInode *targeti) { - this->server = server; - this->req = req; - this->ref = ref; - this->dn = dn; - this->targeti = targeti; - } - void finish(int r) { - if (r > 0) { // success - // yay - server->handle_client_link_finish(req, ref, dn, targeti); - } - else if (r == 0) { - // huh? retry! - assert(0); - server->dispatch_request(req, ref); - } else { - // link failed - server->reply_request(req, r); - } - } -}; - -void Server::handle_client_link_2(int r, MClientRequest *req, CInode *diri, vector& trace) -{ - // target dne? - if (r < 0) { - dout(7) << "target " << req->get_sarg() << " dne" << endl; - reply_request(req, r); - return; - } - assert(r == 0); - - CInode *targeti = mdcache->get_root(); - if (trace.size()) targeti = trace[trace.size()-1]->inode; - assert(targeti); - - // dir? - dout(7) << "target is " << *targeti << endl; - if (targeti->is_dir()) { - dout(7) << "target is a dir, failing" << endl; - reply_request(req, -EINVAL); - return; - } - - // what was the new dentry again? 
- CDir *dir = diri->dir; - assert(dir); - string dname = req->get_filepath().last_bit(); - CDentry *dn = dir->lookup(dname); - assert(dn); - assert(dn->is_xlockedbyme(req)); - - - // ok! - if (targeti->is_auth()) { - // mine - - // same dir? - if (targeti->get_parent_dir() == dn->get_dir()) { - dout(7) << "target is in the same dir, sweet" << endl; - } - else if (targeti->is_anchored()) { - dout(7) << "target anchored already (nlink=" << targeti->inode.nlink << "), sweet" << endl; - } else { - assert(targeti->inode.nlink == 1); - dout(7) << "target needs anchor, nlink=" << targeti->inode.nlink << ", creating anchor" << endl; - - mdcache->anchor_inode(targeti, - new C_MDS_RetryRequest(mds, req, diri)); - return; - } - - // ok, inc link! - targeti->inode.nlink++; - dout(7) << "nlink++, now " << targeti->inode.nlink << " on " << *targeti << endl; - targeti->_mark_dirty(); // fixme - - } else { - // remote: send nlink++ request, wait - dout(7) << "target is remote, sending InodeLink" << endl; - mds->send_message_mds(new MInodeLink(targeti->ino(), mds->get_nodeid()), targeti->authority(), MDS_PORT_CACHE); - - // wait - targeti->add_waiter(CINODE_WAIT_LINK, - new C_MDS_RemoteLink(this, req, diri, dn, targeti)); - return; - } - - handle_client_link_finish(req, diri, dn, targeti); -} - -void Server::handle_client_link_finish(MClientRequest *req, CInode *ref, - CDentry *dn, CInode *targeti) -{ - // create remote link - dn->dir->link_inode(dn, targeti->ino()); - dn->link_remote( targeti ); // since we have it - dn->_mark_dirty(); // fixme - - mds->balancer->hit_dir(dn->dir, META_POP_DWR); - - // done! 
- commit_request(req, new MClientReply(req, 0), ref, - 0); // FIXME i should log something -} - - -// UNLINK - -void Server::handle_client_unlink(MClientRequest *req, - CInode *diri) -{ - // rmdir or unlink - bool rmdir = false; - if (req->get_op() == MDS_OP_RMDIR) rmdir = true; - - // find it - if (req->get_filepath().depth() == 0) { - dout(7) << "can't rmdir root" << endl; - reply_request(req, -EINVAL); - return; - } - string name = req->get_filepath().last_bit(); - - // make sure parent is a dir? - if (!diri->is_dir()) { - dout(7) << "not a dir" << endl; - reply_request(req, -ENOTDIR); - return; - } - - // am i not open, not auth? - if (!diri->dir && !diri->is_auth()) { - int dirauth = diri->authority(); - dout(7) << "don't know dir auth, not open, auth is i think " << dirauth << endl; - mdcache->request_forward(req, dirauth); - return; - } - - if (!try_open_dir(diri, req)) return; - CDir *dir = diri->dir; - int dnauth = dir->dentry_authority(name); - - // does it exist? - CDentry *dn = dir->lookup(name); - if (!dn) { - if (dnauth == mds->get_nodeid()) { - dout(7) << "handle_client_rmdir/unlink dne " << name << " in " << *dir << endl; - reply_request(req, -ENOENT); - } else { - // send to authority! - dout(7) << "handle_client_rmdir/unlink fw, don't have " << name << " in " << *dir << endl; - mdcache->request_forward(req, dnauth); - } - return; - } - - // have it. locked? - if (!dn->can_read(req)) { - dout(10) << " waiting on " << *dn << endl; - dir->add_waiter(CDIR_WAIT_DNREAD, - name, - new C_MDS_RetryRequest(mds, req, diri)); - return; - } - - // null? - if (dn->is_null()) { - dout(10) << "unlink on null dn " << *dn << endl; - reply_request(req, -ENOENT); - return; - } - - // ok! - CInode *in = dn->inode; - assert(in); - if (rmdir) { - dout(7) << "handle_client_rmdir on dir " << *in << endl; - } else { - dout(7) << "handle_client_unlink on non-dir " << *in << endl; - } - - // dir stuff - if (in->is_dir()) { - if (rmdir) { - // rmdir - - // open dir? 
- if (in->is_auth() && !in->dir) { - if (!try_open_dir(in, req)) return; - } - - // not dir auth? (or not open, which implies the same!) - if (!in->dir) { - dout(7) << "handle_client_rmdir dir not open for " << *in << ", sending to dn auth " << dnauth << endl; - mdcache->request_forward(req, dnauth); - return; - } - if (!in->dir->is_auth()) { - int dirauth = in->dir->authority(); - dout(7) << "handle_client_rmdir not auth for dir " << *in->dir << ", sending to dir auth " << dnauth << endl; - mdcache->request_forward(req, dirauth); - return; - } - - assert(in->dir); - assert(in->dir->is_auth()); - - // dir size check on dir auth (but not necessarily dentry auth)? - - // should be empty - if (in->dir->get_size() == 0 && !in->dir->is_complete()) { - dout(7) << "handle_client_rmdir on dir " << *in->dir << ", empty but not complete, fetching" << endl; - mds->mdstore->fetch_dir(in->dir, - new C_MDS_RetryRequest(mds, req, diri)); - return; - } - if (in->dir->get_size() > 0) { - dout(7) << "handle_client_rmdir on dir " << *in->dir << ", not empty" << endl; - reply_request(req, -ENOTEMPTY); - return; - } - - dout(7) << "handle_client_rmdir dir is empty!" << endl; - - // export sanity check - if (!in->is_auth()) { - // i should be exporting this now/soon, since the dir is empty. - dout(7) << "handle_client_rmdir dir is auth, but not inode." 
<< endl; - if (!in->dir->is_freezing() && in->dir->is_frozen()) { - assert(in->dir->is_import()); - mdcache->migrator->export_empty_import(in->dir); - } else { - dout(7) << "apparently already exporting" << endl; - } - in->dir->add_waiter(CDIR_WAIT_UNFREEZE, - new C_MDS_RetryRequest(mds, req, diri)); - return; - } - - } else { - // unlink - dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << endl; - reply_request(req, -EISDIR); - return; - } - } else { - if (rmdir) { - // unlink - dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << endl; - reply_request(req, -ENOTDIR); - return; - } - } - - // am i dentry auth? - if (dnauth != mds->get_nodeid()) { - // not auth; forward! - dout(7) << "handle_client_unlink not auth for " << *dir << " dn " << dn->name << ", fwd to " << dnauth << endl; - mdcache->request_forward(req, dnauth); - return; - } - - dout(7) << "handle_client_unlink/rmdir on " << *in << endl; - - // xlock dentry - if (!mds->locker->dentry_xlock_start(dn, req, diri)) - return; - - // is this a remote link? - if (dn->is_remote() && !dn->inode) { - CInode *in = mdcache->get_inode(dn->get_remote_ino()); - if (in) { - dn->link_remote(in); - } else { - // open inode - dout(7) << "opening target inode first, ino is " << dn->get_remote_ino() << endl; - mdcache->open_remote_ino(dn->get_remote_ino(), req, - new C_MDS_RetryRequest(mds, req, diri)); - return; - } - } - - - mds->balancer->hit_dir(dn->dir, META_POP_DWR); - - // it's locked, unlink! 
- MClientReply *reply = new MClientReply(req,0); - mdcache->dentry_unlink(dn, - new C_MDS_CommitRequest(this, req, reply, diri, - new EString("unlink fixme"))); - return; -} - - - - - - -// RENAME - -class C_MDS_RenameTraverseDst : public Context { - Server *server; - MClientRequest *req; - CInode *ref; - CInode *srcdiri; - CDir *srcdir; - CDentry *srcdn; - filepath destpath; -public: - vector trace; - - C_MDS_RenameTraverseDst(Server *server, - MClientRequest *req, - CInode *ref, - CInode *srcdiri, - CDir *srcdir, - CDentry *srcdn, - filepath& destpath) { - this->server = server; - this->req = req; - this->ref = ref; - this->srcdiri = srcdiri; - this->srcdir = srcdir; - this->srcdn = srcdn; - this->destpath = destpath; - } - void finish(int r) { - server->handle_client_rename_2(req, ref, - srcdiri, srcdir, srcdn, destpath, - trace, r); - } -}; - - -/* - - weirdness iwith rename: - - ref inode is what was originally srcdiri, but that may change by the tiem - the rename actually happens. for all practical purpose, ref is useless except - for C_MDS_RetryRequest - - */ -void Server::handle_client_rename(MClientRequest *req, - CInode *ref) -{ - dout(7) << "handle_client_rename on " << *req << endl; - - // sanity checks - if (req->get_filepath().depth() == 0) { - dout(7) << "can't rename root" << endl; - reply_request(req, -EINVAL); - return; - } - // mv a/b a/b/c -- meaningless - if (req->get_sarg().compare( 0, req->get_path().length(), req->get_path()) == 0 && - req->get_sarg().c_str()[ req->get_path().length() ] == '/') { - dout(7) << "can't rename to underneath myself" << endl; - reply_request(req, -EINVAL); - return; - } - - // mv blah blah -- also meaningless - if (req->get_sarg() == req->get_path()) { - dout(7) << "can't rename something to itself (or into itself)" << endl; - reply_request(req, -EINVAL); - return; - } - - // traverse to source - /* - this is abnoraml, just for rename. 
since we don't pin source path - (because we don't want to screw up the lock ordering) the ref inode - (normally/initially srcdiri) may move, and this may fail. - -> so, re-traverse path. and make sure we request_finish in the case of a forward! - */ - filepath refpath = req->get_filepath(); - string srcname = refpath.last_bit(); - refpath = refpath.prefixpath(refpath.depth()-1); - - dout(7) << "handle_client_rename src traversing to srcdir " << refpath << endl; - vector trace; - int r = mdcache->path_traverse(refpath, trace, true, - req, new C_MDS_RetryRequest(mds, req, ref), - MDS_TRAVERSE_FORWARD); - if (r == 2) { - dout(7) << "path traverse forwarded, ending request, doing manual request_cleanup" << endl; - dout(7) << "(pseudo) request_forward to 9999 req " << *req << endl; - mdcache->request_cleanup(req); // not _finish (deletes) or _forward (path_traverse did that) - return; - } - if (r > 0) return; - if (r < 0) { // dne or something. got renamed out from under us, probably! - dout(7) << "traverse r=" << r << endl; - reply_request(req, r); - return; - } - - CInode *srcdiri; - if (trace.size()) - srcdiri = trace[trace.size()-1]->inode; - else - srcdiri = mdcache->get_root(); - - dout(7) << "handle_client_rename srcdiri is " << *srcdiri << endl; - - dout(7) << "handle_client_rename srcname is " << srcname << endl; - - // make sure parent is a dir? - if (!srcdiri->is_dir()) { - dout(7) << "srcdiri not a dir " << *srcdiri << endl; - reply_request(req, -EINVAL); - return; - } - - // am i not open, not auth? 
- if (!srcdiri->dir && !srcdiri->is_auth()) { - int dirauth = srcdiri->authority(); - dout(7) << "don't know dir auth, not open, srcdir auth is probably " << dirauth << endl; - mdcache->request_forward(req, dirauth); - return; - } - - if (!try_open_dir(srcdiri, req)) return; - CDir *srcdir = srcdiri->dir; - dout(7) << "handle_client_rename srcdir is " << *srcdir << endl; - - // make sure it's my dentry - int srcauth = srcdir->dentry_authority(srcname); - if (srcauth != mds->get_nodeid()) { - // fw - dout(7) << "rename on " << req->get_path() << ", dentry " << *srcdir << " dn " << srcname << " not mine, fw to " << srcauth << endl; - mdcache->request_forward(req, srcauth); - return; - } - // ok, done passing buck. - - // src dentry - CDentry *srcdn = srcdir->lookup(srcname); - - // xlocked? - if (srcdn && !srcdn->can_read(req)) { - dout(10) << " waiting on " << *srcdn << endl; - srcdir->add_waiter(CDIR_WAIT_DNREAD, - srcname, - new C_MDS_RetryRequest(mds, req, srcdiri)); - return; - } - - if ((srcdn && !srcdn->inode) || - (!srcdn && srcdir->is_complete())) { - dout(10) << "handle_client_rename src dne " << endl; - reply_request(req, -EEXIST); - return; - } - - if (!srcdn && !srcdir->is_complete()) { - dout(10) << "readding incomplete dir" << endl; - mds->mdstore->fetch_dir(srcdir, - new C_MDS_RetryRequest(mds, req, srcdiri)); - return; - } - assert(srcdn && srcdn->inode); - - - dout(10) << "handle_client_rename srcdn is " << *srcdn << endl; - dout(10) << "handle_client_rename srci is " << *srcdn->inode << endl; - - // pin src in cache (so it won't expire) - mdcache->request_pin_inode(req, srcdn->inode); - - // find the destination, normalize - // discover, etc. on the way... just get it on the local node. 
- filepath destpath = req->get_sarg(); - - C_MDS_RenameTraverseDst *onfinish = new C_MDS_RenameTraverseDst(this, req, ref, srcdiri, srcdir, srcdn, destpath); - Context *ondelay = new C_MDS_RetryRequest(mds, req, ref); - - /* - * use DISCOVERXLOCK mode: - * the dest may not exist, and may be xlocked from a remote host - * we want to succeed if we find the xlocked dentry - * ?? - */ - mdcache->path_traverse(destpath, onfinish->trace, false, - req, ondelay, - MDS_TRAVERSE_DISCOVER, //XLOCK, - onfinish); -} - -void Server::handle_client_rename_2(MClientRequest *req, - CInode *ref, - CInode *srcdiri, - CDir *srcdir, - CDentry *srcdn, - filepath& destpath, - vector& trace, - int r) -{ - dout(7) << "handle_client_rename_2 on " << *req << endl; - dout(12) << " r = " << r << " trace depth " << trace.size() << " destpath depth " << destpath.depth() << endl; - - CInode *srci = srcdn->inode; - assert(srci); - CDir* destdir = 0; - string destname; - - // what is the dest? (dir or file or complete filename) - // note: trace includes root, destpath doesn't (include leading /) - if (trace.size() && trace[trace.size()-1]->inode == 0) { - dout(10) << "dropping null dentry from tail of trace" << endl; - trace.pop_back(); // drop it! - } - - CInode *d; - if (trace.size()) - d = trace[trace.size()-1]->inode; - else - d = mdcache->get_root(); - assert(d); - dout(10) << "handle_client_rename_2 traced to " << *d << ", trace size = " << trace.size() << ", destpath = " << destpath.depth() << endl; - - // make sure i can open the dir? 
- if (d->is_dir() && !d->dir_is_auth() && !d->dir) { - // discover it - mdcache->open_remote_dir(d, - new C_MDS_RetryRequest(mds, req, ref)); - return; - } - - if (trace.size() == destpath.depth()) { - if (d->is_dir()) { - // mv /some/thing /to/some/dir - if (!try_open_dir(d, req)) return; - destdir = d->dir; // /to/some/dir - destname = req->get_filepath().last_bit(); // thing - destpath.add_dentry(destname); - } else { - // mv /some/thing /to/some/existing_filename - destdir = trace[trace.size()-1]->dir; // /to/some - destname = destpath.last_bit(); // existing_filename - } - } - else if (trace.size() == destpath.depth()-1) { - if (d->is_dir()) { - // mv /some/thing /to/some/place_that_maybe_dne (we might be replica) - if (!try_open_dir(d, req)) return; - destdir = d->dir; // /to/some - destname = destpath.last_bit(); // place_that_MAYBE_dne - } else { - dout(7) << "dest dne" << endl; - reply_request(req, -EINVAL); - return; - } - } - else { - assert(trace.size() < destpath.depth()-1); - // check traverse return value - if (r > 0) { - return; // discover, readdir, etc. - } - - // ?? - assert(r < 0 || trace.size() == 0); // musta been an error - - // error out - dout(7) << " rename dest " << destpath << " dne" << endl; - reply_request(req, -EINVAL); - return; - } - - string srcpath = req->get_path(); - dout(10) << "handle_client_rename_2 srcpath " << srcpath << endl; - dout(10) << "handle_client_rename_2 destpath " << destpath << endl; - - // src == dest? - if (srcdn->get_dir() == destdir && srcdn->name == destname) { - dout(7) << "rename src=dest, same file " << endl; - reply_request(req, -EINVAL); - return; - } - - // does destination exist? (is this an overwrite?) - CDentry *destdn = destdir->lookup(destname); - CInode *oldin = 0; - if (destdn) { - oldin = destdn->get_inode(); - - if (oldin) { - // make sure it's also a file! - // this can happen, e.g. "mv /some/thing /a/dir" where /a/dir/thing exists and is a dir. - if (oldin->is_dir()) { - // fail! 
- dout(7) << "dest exists and is dir" << endl; - reply_request(req, -EISDIR); - return; - } - - if (srcdn->inode->is_dir() && - !oldin->is_dir()) { - dout(7) << "cannot overwrite non-directory with directory" << endl; - reply_request(req, -EISDIR); - return; - } - } - - dout(7) << "dest exists " << *destdn << endl; - if (destdn->get_inode()) { - dout(7) << "destino is " << *destdn->get_inode() << endl; - } else { - dout(7) << "dest dn is a NULL stub" << endl; - } - } else { - dout(7) << "dest dn dne (yet)" << endl; - } - - - // local or remote? - int srcauth = srcdir->dentry_authority(srcdn->name); - int destauth = destdir->dentry_authority(destname); - dout(7) << "handle_client_rename_2 destname " << destname << " destdir " << *destdir << " auth " << destauth << endl; - - // - if (srcauth != mds->get_nodeid() || - destauth != mds->get_nodeid()) { - dout(7) << "rename has remote dest " << destauth << endl; - dout(7) << "FOREIGN RENAME" << endl; - - // punt? - if (false && srcdn->inode->is_dir()) { - reply_request(req, -EINVAL); - return; - } - - } else { - dout(7) << "rename is local" << endl; - } - - handle_client_rename_local(req, ref, - srcpath, srcdiri, srcdn, - destpath.get_path(), destdir, destdn, destname); - return; -} - - - - -void Server::handle_client_rename_local(MClientRequest *req, - CInode *ref, - string& srcpath, - CInode *srcdiri, - CDentry *srcdn, - string& destpath, - CDir *destdir, - CDentry *destdn, - string& destname) -{ - //bool everybody = false; - //if (true || srcdn->inode->is_dir()) { - /* overkill warning: lock w/ everyone for simplicity. FIXME someday! along with the foreign rename crap! - i could limit this to cases where something beneath me is exported. - could possibly limit the list. (maybe.) 
- Underlying constraint is that, regardless of the order i do the xlocks, and whatever - imports/exports might happen in the process, the destdir _must_ exist on any node - importing something beneath me when rename finishes, or else mayhem ensues when - their import is dangling in the cache. - */ - /* - having made a proper mess of this on the first pass, here is my plan: - - - xlocks of src, dest are done in lex order - - xlock is optional.. if you have the dentry, lock it, if not, don't. - - if you discover an xlocked dentry, you get the xlock. - - possible trouble: - - you have an import beneath the source, and don't have the dest dir. - - when the actual rename happens, you discover the dest - - actually, do this on any open dir, so we don't detach whole swaths - of our cache. - - notes: - - xlocks are initiated from authority, as are discover_replies, so replicas are - guaranteed to either not have dentry, or to have it xlocked. - - - - foreign xlocks are eventually unraveled by the initiator on success or failure. - - todo to make this work: - - hose bool everybody param crap - /- make handle_lock_dn not discover, clean up cases - /- put dest path in MRenameNotify - /- make rename_notify discover if its a dir - / - this will catch nested imports too, obviously - /- notify goes to merged list on local rename - /- notify goes to everybody on a foreign rename - /- handle_notify needs to gracefully ignore spurious notifies - */ - //dout(7) << "handle_client_rename_local: overkill? 
doing xlocks with _all_ nodes" << endl; - //everybody = true; - //} - - bool srclocal = srcdn->dir->dentry_authority(srcdn->name) == mds->get_nodeid(); - bool destlocal = destdir->dentry_authority(destname) == mds->get_nodeid(); - - dout(7) << "handle_client_rename_local: src local=" << srclocal << " " << *srcdn << endl; - if (destdn) { - dout(7) << "handle_client_rename_local: dest local=" << destlocal << " " << *destdn << endl; - } else { - dout(7) << "handle_client_rename_local: dest local=" << destlocal << " dn dne yet" << endl; - } - - /* lock source and dest dentries, in lexicographic order. - */ - bool dosrc = srcpath < destpath; - for (int i=0; i<2; i++) { - if (dosrc) { - - // src - if (srclocal) { - if (!srcdn->is_xlockedbyme(req) && - !mds->locker->dentry_xlock_start(srcdn, req, ref)) - return; - } else { - if (!srcdn || srcdn->xlockedby != req) { - mds->locker->dentry_xlock_request(srcdn->dir, srcdn->name, false, req, new C_MDS_RetryRequest(mds, req, ref)); - return; - } - } - dout(7) << "handle_client_rename_local: srcdn is xlock " << *srcdn << endl; - - } else { - - if (destlocal) { - // dest - if (!destdn) destdn = destdir->add_dentry(destname); - if (!destdn->is_xlockedbyme(req) && - !mds->locker->dentry_xlock_start(destdn, req, ref)) { - if (destdn->is_clean() && destdn->is_null() && destdn->is_sync()) destdir->remove_dentry(destdn); - return; - } - } else { - if (!destdn || destdn->xlockedby != req) { - /* NOTE: require that my xlocked item be a leaf/file, NOT a dir. in case - * my traverse and determination of dest vs dest/srcfilename was out of date. - */ - mds->locker->dentry_xlock_request(destdir, destname, true, req, new C_MDS_RetryRequest(mds, req, ref)); - return; - } - } - dout(7) << "handle_client_rename_local: destdn is xlock " << *destdn << endl; - - } - - dosrc = !dosrc; - } - - - // final check: verify if dest exists that src is a file - - // FIXME: is this necessary? 
- - if (destdn->inode) { - if (destdn->inode->is_dir()) { - dout(7) << "handle_client_rename_local failing, dest exists and is a dir: " << *destdn->inode << endl; - assert(0); - reply_request(req, -EINVAL); - return; - } - if (srcdn->inode->is_dir()) { - dout(7) << "handle_client_rename_local failing, dest exists and src is a dir: " << *destdn->inode << endl; - assert(0); - reply_request(req, -EINVAL); - return; - } - } else { - // if destdn->inode is null, then we know it's a non-existent dest, - // why? because if it's local, it dne. and if it's remote, we xlocked with - // REQXLOCKC, which will only allow you to lock a file. - // so we know dest is a file, or non-existent - if (!destlocal) { - if (srcdn->inode->is_dir()) { - // help: maybe the dest exists and is a file? ..... FIXME - } else { - // we're fine, src is file, dest is file|dne - } - } - } - - mds->balancer->hit_dir(srcdn->dir, META_POP_DWR); - mds->balancer->hit_dir(destdn->dir, META_POP_DWR); - - // we're golden. - // everything is xlocked by us, we rule, etc. 
- MClientReply *reply = new MClientReply(req, 0); - mdcache->renamer->file_rename( srcdn, destdn, - new C_MDS_CommitRequest(this, req, reply, srcdn->inode, - new EString("file rename fixme")) ); -} - - - - - - - - - - - -// =================================== -// TRUNCATE, FSYNC - -/* - * FIXME: this truncate implemention is WRONG WRONG WRONG - */ - -void Server::handle_client_truncate(MClientRequest *req, CInode *cur) -{ - // write - if (!mds->locker->inode_file_write_start(cur, req)) - return; // fw or (wait for) lock - - // check permissions - - // do update - cur->inode.size = req->get_sizearg(); - cur->_mark_dirty(); // fixme - - mds->locker->inode_file_write_finish(cur); - - mds->balancer->hit_inode(cur, META_POP_IWR); - - // start reply - MClientReply *reply = new MClientReply(req, 0); - - // commit - commit_request(req, reply, cur, - new EString("truncate fixme")); -} - - - -// =========================== -// open, openc, close - -void Server::handle_client_open(MClientRequest *req, - CInode *cur) -{ - int flags = req->get_iarg(); - int mode = req->get_iarg2(); - - dout(7) << "open " << flags << " on " << *cur << endl; - dout(10) << "open flags = " << flags << " mode = " << mode << endl; - - // is it a file? - if (!(cur->inode.mode & INODE_MODE_FILE)) { - dout(7) << "not a regular file" << endl; - reply_request(req, -EINVAL); // FIXME what error do we want? 
- return; - } - - // auth for write access - if (mode != FILE_MODE_R && mode != FILE_MODE_LAZY && - !cur->is_auth()) { - int auth = cur->authority(); - assert(auth != mds->get_nodeid()); - dout(9) << "open writeable on replica for " << *cur << " fw to auth " << auth << endl; - - mdcache->request_forward(req, auth); - return; - } - - // O_TRUNC - if (flags & O_TRUNC) { - // write - if (!mds->locker->inode_file_write_start(cur, req)) - return; // fw or (wait for) lock - - // do update - cur->inode.size = req->get_sizearg(); - cur->_mark_dirty(); // fixme - - mds->locker->inode_file_write_finish(cur); - } - - - // hmm, check permissions or something. - - - // can we issue the caps they want? - version_t fdv = mds->locker->issue_file_data_version(cur); - Capability *cap = mds->locker->issue_new_caps(cur, mode, req); - if (!cap) return; // can't issue (yet), so wait! - - dout(12) << "open gets caps " << cap_string(cap->pending()) << " for " << req->get_source() << " on " << *cur << endl; - - mds->balancer->hit_inode(cur, META_POP_IRD); - - // reply - MClientReply *reply = new MClientReply(req, 0); - reply->set_file_caps(cap->pending()); - reply->set_file_caps_seq(cap->get_last_seq()); - reply->set_file_data_version(fdv); - reply_request(req, reply, cur); -} - - -class C_MDS_openc_finish : public Context { - MDS *mds; - MClientRequest *req; - CDentry *dn; - CInode *newi; - version_t pv; -public: - C_MDS_openc_finish(MDS *m, MClientRequest *r, CDentry *d, CInode *ni) : - mds(m), req(r), dn(d), newi(ni), - pv(d->get_projected_version()) {} - void finish(int r) { - assert(r == 0); - - // link the inode - dn->get_dir()->link_inode(dn, newi); - - // dirty inode, dn, dir - newi->mark_dirty(pv); - - // unlock - mds->locker->dentry_xlock_finish(dn); - - // hit pop - mds->balancer->hit_inode(newi, META_POP_IWR); - - // ok, do the open. 
- mds->server->handle_client_open(req, newi); - } -}; - - -void Server::handle_client_openc(MClientRequest *req, CInode *diri) -{ - dout(7) << "open w/ O_CREAT on " << req->get_filepath() << endl; - - CInode *in = 0; - CDentry *dn = 0; - - // make dentry and inode, xlock dentry. - bool excl = req->get_iarg() & O_EXCL; - int r = prepare_mknod(req, diri, &in, &dn, !excl); - if (!r) - return; // wait on something - assert(in); - assert(dn); - - if (r == 1) { - // created. - // it's a file. - in->inode.mode = 0644; // FIXME req should have a umask - in->inode.mode |= INODE_MODE_FILE; - - // prepare finisher - C_MDS_openc_finish *fin = new C_MDS_openc_finish(mds, req, dn, in); - EUpdate *le = new EUpdate("openc"); - le->metablob.add_dir_context(diri->dir); - inode_t *pi = le->metablob.add_dentry(dn, true, in); - pi->version = dn->get_projected_version(); - - // log + wait - mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); - - /* - FIXME. this needs to be rewritten when the write capability stuff starts - getting journaled. - */ - } else { - // exists! - // FIXME: do i need to repin path based existant inode? hmm. - handle_client_open(req, in); - } -} - - - - - - - - - - - - - - diff --git a/tags/20070517_before_mds_merge/mds/Server.h b/tags/20070517_before_mds_merge/mds/Server.h deleted file mode 100644 index d4509f1418e07..0000000000000 --- a/tags/20070517_before_mds_merge/mds/Server.h +++ /dev/null @@ -1,156 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_SERVER_H -#define __MDS_SERVER_H - -#include "MDS.h" - -class LogEvent; - -class Server { - MDS *mds; - MDCache *mdcache; - MDLog *mdlog; - Messenger *messenger; - - __uint64_t stat_ops; - - -public: - Server(MDS *m) : - mds(m), - mdcache(mds->mdcache), mdlog(mds->mdlog), - messenger(mds->messenger), - stat_ops(0) { - } - - void dispatch(Message *m); - - // generic request helpers - void reply_request(MClientRequest *req, int r = 0, CInode *tracei = 0); - void reply_request(MClientRequest *req, MClientReply *reply, CInode *tracei); - - void submit_update(MClientRequest *req, CInode *wrlockedi, - LogEvent *event, - Context *oncommit); - - void commit_request(MClientRequest *req, - MClientReply *reply, - CInode *tracei, - LogEvent *event, - LogEvent *event2 = 0); - - bool try_open_dir(CInode *in, MClientRequest *req); - - - // clients - void handle_client_mount(class MClientMount *m); - void handle_client_unmount(Message *m); - - void handle_client_request(MClientRequest *m); - void handle_client_request_2(MClientRequest *req, - vector& trace, - int r); - - // fs ops - void handle_client_fstat(MClientRequest *req); - - // requests - void dispatch_request(Message *m, CInode *ref); - - // inode request *req, CInode *ref; - void handle_client_stat(MClientRequest *req, CInode *ref); - void handle_client_utime(MClientRequest *req, CInode *ref); - void handle_client_inode_soft_update_2(MClientRequest *req, - MClientReply *reply, - CInode *ref); - void handle_client_chmod(MClientRequest *req, CInode *ref); - void handle_client_chown(MClientRequest *req, CInode *ref); - void handle_client_inode_hard_update_2(MClientRequest *req, - MClientReply *reply, - CInode *ref); - - // readdir - void handle_client_readdir(MClientRequest *req, CInode *ref); - int encode_dir_contents(CDir *dir, - list& inls, - list& dnls); - void handle_hash_readdir(MHashReaddir *m); - void handle_hash_readdir_reply(MHashReaddirReply *m); - void finish_hash_readdir(MClientRequest 
*req, CDir *dir); - - // namespace changes - void handle_client_mknod(MClientRequest *req, CInode *ref); - void handle_client_link(MClientRequest *req, CInode *ref); - void handle_client_link_2(int r, MClientRequest *req, CInode *ref, vector& trace); - void handle_client_link_finish(MClientRequest *req, CInode *ref, - CDentry *dn, CInode *targeti); - - void handle_client_unlink(MClientRequest *req, CInode *ref); - void handle_client_rename(MClientRequest *req, CInode *ref); - void handle_client_rename_2(MClientRequest *req, - CInode *ref, - CInode *srcdiri, - CDir *srcdir, - CDentry *srcdn, - filepath& destpath, - vector& trace, - int r); - void handle_client_rename_local(MClientRequest *req, CInode *ref, - string& srcpath, CInode *srcdiri, CDentry *srcdn, - string& destpath, CDir *destdir, CDentry *destdn, string& name); - - void handle_client_mkdir(MClientRequest *req, CInode *ref); - void handle_client_rmdir(MClientRequest *req, CInode *ref); - void handle_client_symlink(MClientRequest *req, CInode *ref); - - // file - void handle_client_open(MClientRequest *req, CInode *ref); - void handle_client_openc(MClientRequest *req, CInode *ref); - void handle_client_release(MClientRequest *req, CInode *in); - void handle_client_truncate(MClientRequest *req, CInode *in); - void handle_client_fsync(MClientRequest *req, CInode *in); - - - // some helpers - CInode *mknod(MClientRequest *req, CInode *ref, bool okexist=false); // used by mknod, symlink, mkdir, openc - - CDir *validate_new_dentry_dir(MClientRequest *req, CInode *diri, string& dname); - int prepare_mknod(MClientRequest *req, CInode *diri, - CInode **pin, CDentry **pdn, - bool okexist=false); - - - -}; - -class C_MDS_RetryRequest : public Context { - MDS *mds; - Message *req; // MClientRequest or MLock - CInode *ref; - public: - C_MDS_RetryRequest(MDS *mds, Message *req, CInode *ref) { - assert(ref); - this->mds = mds; - this->req = req; - this->ref = ref; - } - virtual void finish(int r) { - 
mds->server->dispatch_request(req, ref); - } -}; - - - -#endif diff --git a/tags/20070517_before_mds_merge/mds/events/EAlloc.h b/tags/20070517_before_mds_merge/mds/events/EAlloc.h deleted file mode 100644 index 9360db4ab49bb..0000000000000 --- a/tags/20070517_before_mds_merge/mds/events/EAlloc.h +++ /dev/null @@ -1,76 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_EALLOC_H -#define __MDS_EALLOC_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../LogEvent.h" -#include "../IdAllocator.h" - -#define EALLOC_EV_ALLOC 1 -#define EALLOC_EV_FREE 2 - -class EAlloc : public LogEvent { - protected: - int idtype; - idno_t id; - int what; // alloc or dealloc - version_t table_version; - - public: - EAlloc() : LogEvent(EVENT_ALLOC) { } - EAlloc(int idtype, idno_t id, int what, version_t v) : - LogEvent(EVENT_ALLOC) { - this->idtype = idtype; - this->id = id; - this->what = what; - this->table_version = v; - } - - void encode_payload(bufferlist& bl) { - bl.append((char*)&idtype, sizeof(idtype)); - bl.append((char*)&id, sizeof(id)); - bl.append((char*)&what, sizeof(what)); - bl.append((char*)&table_version, sizeof(table_version)); - } - void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(idtype), (char*)&idtype); - off += sizeof(idtype); - bl.copy(off, sizeof(id), (char*)&id); - off += sizeof(id); - bl.copy(off, sizeof(what), (char*)&what); - off += sizeof(what); - bl.copy(off, sizeof(table_version), (char*)&table_version); - off += sizeof(table_version); - } - - - void print(ostream& out) { - if (what == EALLOC_EV_ALLOC) - out << "EAlloc alloc " << hex << id << dec << " 
tablev " << table_version; - else - out << "EAlloc dealloc " << hex << id << dec << " tablev " << table_version; - } - - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/mds/events/EExportFinish.h b/tags/20070517_before_mds_merge/mds/events/EExportFinish.h deleted file mode 100644 index 114d580b6a499..0000000000000 --- a/tags/20070517_before_mds_merge/mds/events/EExportFinish.h +++ /dev/null @@ -1,59 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __EEXPORTFINISH_H -#define __EEXPORTFINISH_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../MDS.h" - -class EExportFinish : public LogEvent { - protected: - inodeno_t dirino; // exported dir - bool success; - - public: - EExportFinish(CDir *dir, bool s) : LogEvent(EVENT_EXPORTFINISH), - dirino(dir->ino()), - success(s) { } - EExportFinish() : LogEvent(EVENT_EXPORTFINISH) { } - - void print(ostream& out) { - out << "export_finish " << dirino; - if (success) - out << " success"; - else - out << " failure"; - } - - virtual void encode_payload(bufferlist& bl) { - bl.append((char*)&dirino, sizeof(dirino)); - bl.append((char*)&success, sizeof(success)); - } - void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(dirino), (char*)&dirino); - off += sizeof(dirino); - bl.copy(off, sizeof(success), (char*)&success); - off += sizeof(success); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/mds/events/EExportStart.h 
b/tags/20070517_before_mds_merge/mds/events/EExportStart.h deleted file mode 100644 index 37ed92a7239c2..0000000000000 --- a/tags/20070517_before_mds_merge/mds/events/EExportStart.h +++ /dev/null @@ -1,68 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __EEXPORTSTART_H -#define __EEXPORTSTART_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../MDS.h" - -#include "EMetaBlob.h" - -class EExportStart : public LogEvent { - public: - EMetaBlob metablob; // exported dir - protected: - inodeno_t dirino; - int dest; // dest mds - set bounds; - - public: - EExportStart(CDir *dir, int d) : LogEvent(EVENT_EXPORTSTART), - dirino(dir->ino()), - dest(d) { - metablob.add_dir_context(dir); - } - EExportStart() : LogEvent(EVENT_EXPORTSTART) { } - - set &get_bounds() { return bounds; } - - void print(ostream& out) { - out << "export_start " << dirino << " -> " << dest; - } - - virtual void encode_payload(bufferlist& bl) { - metablob._encode(bl); - bl.append((char*)&dirino, sizeof(dirino)); - bl.append((char*)&dest, sizeof(dest)); - ::_encode(bounds, bl); - } - void decode_payload(bufferlist& bl, int& off) { - metablob._decode(bl, off); - bl.copy(off, sizeof(dirino), (char*)&dirino); - off += sizeof(dirino); - bl.copy(off, sizeof(dest), (char*)&dest); - off += sizeof(dest); - ::_decode(bounds, bl, off); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/mds/events/EImportFinish.h b/tags/20070517_before_mds_merge/mds/events/EImportFinish.h deleted file mode 100644 index 14a9ab6403af6..0000000000000 
--- a/tags/20070517_before_mds_merge/mds/events/EImportFinish.h +++ /dev/null @@ -1,59 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __EIMPORTFINISH_H -#define __EIMPORTFINISH_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../MDS.h" - -class EImportFinish : public LogEvent { - protected: - inodeno_t dirino; // imported dir - bool success; - - public: - EImportFinish(CDir *dir, bool s) : LogEvent(EVENT_IMPORTFINISH), - dirino(dir->ino()), - success(s) { } - EImportFinish() : LogEvent(EVENT_IMPORTFINISH) { } - - void print(ostream& out) { - out << "import_finish " << dirino; - if (success) - out << " success"; - else - out << " failed"; - } - - virtual void encode_payload(bufferlist& bl) { - bl.append((char*)&dirino, sizeof(dirino)); - bl.append((char*)&success, sizeof(success)); - } - void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(dirino), (char*)&dirino); - off += sizeof(dirino); - bl.copy(off, sizeof(success), (char*)&success); - off += sizeof(success); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/mds/events/EImportMap.h b/tags/20070517_before_mds_merge/mds/events/EImportMap.h deleted file mode 100644 index 50f366faaa9fa..0000000000000 --- a/tags/20070517_before_mds_merge/mds/events/EImportMap.h +++ /dev/null @@ -1,66 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - 
* modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_EIMPORTMAP_H -#define __MDS_EIMPORTMAP_H - -#include "../LogEvent.h" -#include "EMetaBlob.h" - -class EImportMap : public LogEvent { -public: - EMetaBlob metablob; - set imports; - set exports; - //set hashdirs; - map > nested_exports; - - EImportMap() : LogEvent(EVENT_IMPORTMAP) { } - - void print(ostream& out) { - out << "import_map " << imports.size() << " imports, " - << exports.size() << " exports" - << " " << metablob; - } - - void encode_payload(bufferlist& bl) { - metablob._encode(bl); - ::_encode(imports, bl); - ::_encode(exports, bl); - for (set::iterator p = imports.begin(); - p != imports.end(); - ++p) { - ::_encode(nested_exports[*p], bl); - if (nested_exports[*p].empty()) - nested_exports.erase(*p); - } - } - void decode_payload(bufferlist& bl, int& off) { - metablob._decode(bl, off); - ::_decode(imports, bl, off); - ::_decode(exports, bl, off); - for (set::iterator p = imports.begin(); - p != imports.end(); - ++p) { - ::_decode(nested_exports[*p], bl, off); - if (nested_exports[*p].empty()) - nested_exports.erase(*p); - } - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); -}; - -#endif diff --git a/tags/20070517_before_mds_merge/mds/events/EImportStart.h b/tags/20070517_before_mds_merge/mds/events/EImportStart.h deleted file mode 100644 index 59c074dec6f4f..0000000000000 --- a/tags/20070517_before_mds_merge/mds/events/EImportStart.h +++ /dev/null @@ -1,60 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. 
See file COPYING. - * - */ - -#ifndef __EIMPORTSTART_H -#define __EIMPORTSTART_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../MDS.h" - -#include "EMetaBlob.h" - -class EImportStart : public LogEvent { -protected: - inodeno_t dirino; - list bounds; - - public: - EMetaBlob metablob; - - EImportStart(inodeno_t di, - list& b) : LogEvent(EVENT_IMPORTSTART), - dirino(di), bounds(b) { } - EImportStart() : LogEvent(EVENT_IMPORTSTART) { } - - void print(ostream& out) { - out << "EImportStart " << metablob; - } - - virtual void encode_payload(bufferlist& bl) { - bl.append((char*)&dirino, sizeof(dirino)); - metablob._encode(bl); - ::_encode(bounds, bl); - } - void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(dirino), (char*)&dirino); - off += sizeof(dirino); - metablob._decode(bl, off); - ::_decode(bounds, bl, off); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/mds/events/EMetaBlob.h b/tags/20070517_before_mds_merge/mds/events/EMetaBlob.h deleted file mode 100644 index 800c6674c91a8..0000000000000 --- a/tags/20070517_before_mds_merge/mds/events/EMetaBlob.h +++ /dev/null @@ -1,339 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_EMETABLOB_H -#define __MDS_EMETABLOB_H - -#include -#include -using namespace std; - -#include "../CInode.h" -#include "../CDir.h" -#include "../CDentry.h" - - -class MDS; - -/* - * a bunch of metadata in the journal - */ - -/* notes: - * - * - make sure you adjust the inode.version for any modified inode you - * journal. 
CDir and CDentry maintain a projected_version, but CInode - * doesn't, since the journaled inode usually has to be modifed - * manually anyway (to delay the change in the MDS's cache until after - * it is journaled). - * - */ - - -class EMetaBlob { - - /* fullbit - a regular dentry + inode - */ - struct fullbit { - string dn; // dentry - version_t dnv; - inode_t inode; // if it's not - string symlink; - bool dirty; - - fullbit(const string& d, version_t v, inode_t& i, bool dr) : dn(d), dnv(v), inode(i), dirty(dr) { } - fullbit(const string& d, version_t v, inode_t& i, string& sym, bool dr) : dn(d), dnv(v), inode(i), symlink(sym), dirty(dr) { } - fullbit(bufferlist& bl, int& off) { _decode(bl, off); } - void _encode(bufferlist& bl) { - ::_encode(dn, bl); - bl.append((char*)&dnv, sizeof(dnv)); - bl.append((char*)&inode, sizeof(inode)); - if (inode.is_symlink()) - ::_encode(symlink, bl); - bl.append((char*)&dirty, sizeof(dirty)); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(dn, bl, off); - bl.copy(off, sizeof(dnv), (char*)&dnv); - off += sizeof(dnv); - bl.copy(off, sizeof(inode), (char*)&inode); - off += sizeof(inode); - if (inode.is_symlink()) - ::_decode(symlink, bl, off); - bl.copy(off, sizeof(dirty), (char*)&dirty); - off += sizeof(dirty); - } - }; - - /* remotebit - a dentry + remote inode link (i.e. 
just an ino) - */ - struct remotebit { - string dn; - version_t dnv; - inodeno_t ino; - bool dirty; - - remotebit(const string& d, version_t v, inodeno_t i, bool dr) : dn(d), dnv(v), ino(i), dirty(dr) { } - remotebit(bufferlist& bl, int& off) { _decode(bl, off); } - void _encode(bufferlist& bl) { - ::_encode(dn, bl); - bl.append((char*)&dnv, sizeof(dnv)); - bl.append((char*)&ino, sizeof(ino)); - bl.append((char*)&dirty, sizeof(dirty)); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(dn, bl, off); - bl.copy(off, sizeof(dnv), (char*)&dnv); - off += sizeof(dnv); - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - bl.copy(off, sizeof(dirty), (char*)&dirty); - off += sizeof(dirty); - } - }; - - /* - * nullbit - a null dentry - */ - struct nullbit { - string dn; - version_t dnv; - bool dirty; - nullbit(const string& d, version_t v, bool dr) : dn(d), dnv(v), dirty(dr) { } - nullbit(bufferlist& bl, int& off) { _decode(bl, off); } - void _encode(bufferlist& bl) { - ::_encode(dn, bl); - bl.append((char*)&dnv, sizeof(dnv)); - bl.append((char*)&dirty, sizeof(dirty)); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(dn, bl, off); - bl.copy(off, sizeof(dnv), (char*)&dnv); - off += sizeof(dnv); - bl.copy(off, sizeof(dirty), (char*)&dirty); - off += sizeof(dirty); - } - }; - - - /* dirlump - contains metadata for any dir we have contents for. - */ - struct dirlump { - static const int STATE_IMPORT = (1<<0); - static const int STATE_COMPLETE = (1<<1); - static const int STATE_DIRTY = (1<<2); // dirty due to THIS journal item, that is! 
- - dirslice_t dirslice; - version_t dirv; - int state; - int nfull, nremote, nnull; - bufferlist bfull, bremote, bnull; - - private: - bool dn_decoded; - list dfull; - list dremote; - list dnull; - - public: - dirlump() : state(0), nfull(0), nremote(0), nnull(0), dn_decoded(true) { } - - bool is_import() { return state & STATE_IMPORT; } - void mark_import() { state |= STATE_IMPORT; } - bool is_complete() { return state & STATE_COMPLETE; } - void mark_complete() { state |= STATE_COMPLETE; } - bool is_dirty() { return state & STATE_DIRTY; } - void mark_dirty() { state |= STATE_DIRTY; } - - list &get_dfull() { return dfull; } - list &get_dremote() { return dremote; } - list &get_dnull() { return dnull; } - - void _encode_bits() { - for (list::iterator p = dfull.begin(); p != dfull.end(); ++p) - p->_encode(bfull); - for (list::iterator p = dremote.begin(); p != dremote.end(); ++p) - p->_encode(bremote); - for (list::iterator p = dnull.begin(); p != dnull.end(); ++p) - p->_encode(bnull); - } - void _decode_bits() { - if (dn_decoded) return; - int off = 0; - for (int i=0; i lump_order; - map lump_map; - - public: - - // remote pointer to to-be-journaled inode iff it's a normal (non-remote) dentry - inode_t *add_dentry(CDentry *dn, bool dirty, CInode *in=0) { - CDir *dir = dn->get_dir(); - if (!in) in = dn->get_inode(); - - // add the dir - dirlump& lump = add_dir(dir, false); - - // add the dirbit - if (dn->is_remote()) { - lump.nremote++; - if (dirty) - lump.get_dremote().push_front(remotebit(dn->get_name(), - dn->get_projected_version(), - dn->get_remote_ino(), - dirty)); - else - lump.get_dremote().push_back(remotebit(dn->get_name(), - dn->get_projected_version(), - dn->get_remote_ino(), - dirty)); - } - else if (!in) { - lump.nnull++; - if (dirty) - lump.get_dnull().push_front(nullbit(dn->get_name(), - dn->get_projected_version(), - dirty)); - else - lump.get_dnull().push_back(nullbit(dn->get_name(), - dn->get_projected_version(), - dirty)); - } - else { - 
lump.nfull++; - if (dirty) { - lump.get_dfull().push_front(fullbit(dn->get_name(), - dn->get_projected_version(), - in->inode, in->symlink, - dirty)); - return &lump.get_dfull().front().inode; - } else { - lump.get_dfull().push_back(fullbit(dn->get_name(), - dn->get_projected_version(), - in->inode, in->symlink, - dirty)); - return &lump.get_dfull().back().inode; - } - } - return 0; - } - - dirlump& add_dir(CDir *dir, bool dirty) { - if (lump_map.count(dir->ino()) == 0) { - lump_order.push_back(dir->ino()); - lump_map[dir->ino()].dirv = dir->get_projected_version(); - } - dirlump& l = lump_map[dir->ino()]; - if (dir->is_complete()) l.mark_complete(); - if (dir->is_import()) l.mark_import(); - if (dirty) l.mark_dirty(); - return l; - } - - void add_dir_context(CDir *dir, bool toroot=false) { - // already have this dir? (we must always add in order) - if (lump_map.count(dir->ino())) - return; - - CInode *diri = dir->get_inode(); - if (!toroot && - (dir->is_import() || dir->is_hashed())) - return; // stop at import point - if (!dir->get_inode()->get_parent_dn()) - return; - - CDentry *parent = diri->get_parent_dn(); - add_dir_context(parent->get_dir(), toroot); - add_dentry(parent, false); - } - - - // encoding - - void _encode(bufferlist& bl) { - int n = lump_map.size(); - bl.append((char*)&n, sizeof(n)); - for (list::iterator i = lump_order.begin(); - i != lump_order.end(); - ++i) { - bl.append((char*)&(*i), sizeof(*i)); - lump_map[*i]._encode(bl); - } - } - void _decode(bufferlist& bl, int& off) { - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __EPURGE_H -#define __EPURGE_H - -#include -#include "config.h" -#include "include/types.h" - -class EPurgeFinish : public LogEvent { - protected: - inodeno_t ino; - - public: - EPurgeFinish(inodeno_t i) : - LogEvent(EVENT_PURGEFINISH), - ino(i) { } - EPurgeFinish() : LogEvent(EVENT_PURGEFINISH) { } - - void print(ostream& out) { - out << "purgefinish " << ino; - } - - virtual void encode_payload(bufferlist& bl) { - bl.append((char*)&ino, sizeof(ino)); - } - void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(ino), (char*)&ino); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/mds/events/EString.h b/tags/20070517_before_mds_merge/mds/events/EString.h deleted file mode 100644 index 0ef7577406454..0000000000000 --- a/tags/20070517_before_mds_merge/mds/events/EString.h +++ /dev/null @@ -1,56 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __ESTRING_H -#define __ESTRING_H - -#include -#include -using namespace std; - -#include "../LogEvent.h" - -// generic log event -class EString : public LogEvent { - protected: - string event; - - public: - EString(string e) : - LogEvent(EVENT_STRING) { - event = e; - } - EString() : - LogEvent(EVENT_STRING) { - } - - void decode_payload(bufferlist& bl, int& off) { - event = bl.c_str() + off; - off += event.length() + 1; - } - void encode_payload(bufferlist& bl) { - bl.append(event.c_str(), event.length()+1); - } - - void print(ostream& out) { - out << '"' << event << '"'; - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/mds/events/EUnlink.h b/tags/20070517_before_mds_merge/mds/events/EUnlink.h deleted file mode 100644 index 7d972488dab1b..0000000000000 --- a/tags/20070517_before_mds_merge/mds/events/EUnlink.h +++ /dev/null @@ -1,71 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __EUNLINK_H -#define __EUNLINK_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../LogEvent.h" -#include "EMetaBlob.h" - -#include "../CInode.h" -#include "../CDentry.h" -#include "../CDir.h" - -/// help rewrite me - -class EUnlink : public LogEvent { - protected: - version_t dirv; - string dname; - - public: - EMetaBlob metaglob; - - /* - EUnlink(CDir *dir, CDentry* dn, CInode *in) : - LogEvent(EVENT_UNLINK), - diritrace(dir->inode), - dirv(dir->get_version()), - dname(dn->get_name()), - inodetrace(in) {} - */ - EUnlink() : LogEvent(EVENT_UNLINK) { } - - virtual void encode_payload(bufferlist& bl) { - /* - diritrace.encode(bl); - bl.append((char*)&dirv, sizeof(dirv)); - ::_encode(dname, bl); - inodetrace.encode(bl); - */ - } - void decode_payload(bufferlist& bl, int& off) { - /* - diritrace.decode(bl,off); - bl.copy(off, sizeof(dirv), (char*)&dirv); - off += sizeof(dirv); - ::_decode(dname, bl, off); - inodetrace.decode(bl, off); - */ - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); -}; - -#endif diff --git a/tags/20070517_before_mds_merge/mds/events/EUpdate.h b/tags/20070517_before_mds_merge/mds/events/EUpdate.h deleted file mode 100644 index 4a8dad5876a62..0000000000000 --- a/tags/20070517_before_mds_merge/mds/events/EUpdate.h +++ /dev/null @@ -1,49 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_EUPDATE_H -#define __MDS_EUPDATE_H - -#include "../LogEvent.h" -#include "EMetaBlob.h" - -class EUpdate : public LogEvent { -public: - EMetaBlob metablob; - string type; - - EUpdate() : LogEvent(EVENT_UPDATE) { } - EUpdate(const char *s) : LogEvent(EVENT_UPDATE), - type(s) { } - - void print(ostream& out) { - if (type.length()) - out << type << " "; - out << metablob; - } - - void encode_payload(bufferlist& bl) { - ::_encode(type, bl); - metablob._encode(bl); - } - void decode_payload(bufferlist& bl, int& off) { - ::_decode(type, bl, off); - metablob._decode(bl, off); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); -}; - -#endif diff --git a/tags/20070517_before_mds_merge/mds/journal.cc b/tags/20070517_before_mds_merge/mds/journal.cc deleted file mode 100644 index 2182d33ffc878..0000000000000 --- a/tags/20070517_before_mds_merge/mds/journal.cc +++ /dev/null @@ -1,589 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "events/EString.h" - -#include "events/EMetaBlob.h" -#include "events/EAlloc.h" -#include "events/EUpdate.h" -#include "events/EImportMap.h" - -#include "events/EPurgeFinish.h" -#include "events/EUnlink.h" -#include "events/EExportStart.h" -#include "events/EExportFinish.h" -#include "events/EImportStart.h" -#include "events/EImportFinish.h" - -#include "MDS.h" -#include "MDLog.h" -#include "MDCache.h" -#include "MDStore.h" -#include "Migrator.h" - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " -#define derr(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " - - -// ----------------------- -// EString - -bool EString::has_expired(MDS *mds) { - dout(10) << "EString.has_expired " << event << endl; - return true; -} -void EString::expire(MDS *mds, Context *c) -{ - dout(10) << "EString.expire " << event << endl; -} -void EString::replay(MDS *mds) -{ - dout(10) << "EString.replay " << event << endl; -} - - - -// ----------------------- -// EMetaBlob - -/* - * we need to ensure that a journaled item has either - * - * - been safely committed to its dirslice. - * - * - has been safely exported. note that !is_auth() && !is_proxy() - * implies safely exported. if !is_auth() && is_proxy(), we need to - * add a waiter for the export to complete. 
- * - */ -bool EMetaBlob::has_expired(MDS *mds) -{ - // examine dirv's for my lumps - for (map::iterator lp = lump_map.begin(); - lp != lump_map.end(); - ++lp) { - CInode *diri = mds->mdcache->get_inode(lp->first); - if (!diri) - continue; // we expired it - CDir *dir = diri->dir; - if (!dir) - continue; // we expired it - - // FIXME: check the slice only - - if (dir->is_proxy()) { - dout(10) << "EMetaBlob.has_expired am proxy, needed dirv " << lp->second.dirv - << " for " << *dir << endl; - return false; // we need to wait until the export flushes! - } - if (!dir->is_auth()) { - dout(10) << "EMetaBlob.has_expired not auth, needed dirv " << lp->second.dirv - << " for " << *dir << endl; - continue; // not our problem - } - - if (dir->get_last_committed_version() < lp->second.dirv) { - dout(10) << "EMetaBlob.has_expired need dirv " << lp->second.dirv - << " for " << *dir << endl; - return false; // not committed. - } else { - dout(10) << "EMetaBlob.has_expired have dirv " << lp->second.dirv - << " for " << *dir << endl; - } - } - - return true; // all dirlumps expired. 
-} - -void EMetaBlob::expire(MDS *mds, Context *c) -{ - list commit; - list waitfor_export; - int ncommit = 0; - - // examine dirv's for my lumps - // make list of dir slices i need to commit - for (map::iterator lp = lump_map.begin(); - lp != lump_map.end(); - ++lp) { - CInode *diri = mds->mdcache->get_inode(lp->first); - if (!diri) - continue; // we expired it - CDir *dir = diri->dir; - if (!dir) - continue; // we expired it - - // FIXME: check the slice only - - if (dir->is_proxy()) { - // wait until export is acked (logged on remote) and committed (logged locally) - CDir *ex = mds->mdcache->get_export_container(dir); - dout(10) << "EMetaBlob.expire proxy for " << *dir - << ", waiting for export finish on " << *ex << endl; - waitfor_export.push_back(ex); - continue; - } - if (!dir->is_auth()) { - dout(10) << "EMetaBlob.expire not auth, needed dirv " << lp->second.dirv - << " for " << *dir << endl; - continue; // not our problem - } - if (dir->get_last_committed_version() < lp->second.dirv) { - dout(10) << "EMetaBlob.expire need dirv " << lp->second.dirv - << ", committing " << *dir << endl; - commit.push_back(dir); - ncommit++; - } else { - dout(10) << "EMetaBlob.expire have dirv " << lp->second.dirv - << " on " << *dir << endl; - } - } - - // commit - assert(!commit.empty()); - - if (ncommit == 1) { - mds->mdstore->commit_dir(commit.front(), c); - } else { - C_Gather *gather = new C_Gather(c); - for (list::iterator p = commit.begin(); - p != commit.end(); - ++p) - mds->mdstore->commit_dir(*p, gather->new_sub()); - for (list::iterator p = waitfor_export.begin(); - p != waitfor_export.end(); - ++p) - mds->mdcache->migrator->add_export_finish_waiter(*p, gather->new_sub()); - } -} - -void EMetaBlob::replay(MDS *mds) -{ - dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps" << endl; - - // walk through my dirs (in order!) 
- for (list::iterator lp = lump_order.begin(); - lp != lump_order.end(); - ++lp) { - dout(10) << "EMetaBlob.replay dir " << *lp << endl; - dirlump &lump = lump_map[*lp]; - - // the dir - CInode *diri = mds->mdcache->get_inode(*lp); - CDir *dir; - if (!diri) { - assert(*lp == 1); - diri = mds->mdcache->create_root_inode(); - dout(10) << "EMetaBlob.replay created root " << *diri << endl; - } - if (diri->dir) { - dir = diri->dir; - dout(20) << "EMetaBlob.replay had dir " << *dir << endl; - } else { - dir = diri->get_or_open_dir(mds->mdcache); - if (*lp == 1) - dir->set_dir_auth(CDIR_AUTH_UNKNOWN); - dout(10) << "EMetaBlob.replay added dir " << *dir << endl; - } - dir->set_version( lump.dirv ); - if (lump.is_dirty()) - dir->_mark_dirty(); - if (lump.is_complete()) - dir->mark_complete(); - - // decode bits - lump._decode_bits(); - - // full dentry+inode pairs - for (list::iterator p = lump.get_dfull().begin(); - p != lump.get_dfull().end(); - p++) { - CInode *in = mds->mdcache->get_inode(p->inode.ino); - if (!in) { - // inode - in = new CInode(mds->mdcache); - in->inode = p->inode; - if (in->inode.is_symlink()) in->symlink = p->symlink; - mds->mdcache->add_inode(in); - // dentry - CDentry *dn = dir->add_dentry( p->dn, in ); - dn->set_version(p->dnv); - dn->_mark_dirty(); - dout(10) << "EMetaBlob.replay added " << *dn << " " << *in << endl; - } else { - // inode - in->inode = p->inode; - if (in->inode.is_symlink()) in->symlink = p->symlink; - // dentry - CDentry *dn = in->get_parent_dn(); - dn->set_version(p->dnv); - dn->_mark_dirty(); - dout(10) << "EMetaBlob.replay had " << *in->get_parent_dn() << " " << *in << endl; - } - } - - // remote dentries - for (list::iterator p = lump.get_dremote().begin(); - p != lump.get_dremote().end(); - p++) { - CDentry *dn = dir->lookup(p->dn); - if (!dn) { - dn = dir->add_dentry(p->dn, p->ino); - dn->set_remote_ino(p->ino); - dn->set_version(p->dnv); - dn->_mark_dirty(); - dout(10) << "EMetaBlob.replay added " << *dn << endl; - } else 
{ - dn->set_remote_ino(p->ino); - dn->set_version(p->dnv); - dn->_mark_dirty(); - dout(10) << "EMetaBlob.replay had " << *dn << endl; - } - } - - // null dentries - for (list::iterator p = lump.get_dnull().begin(); - p != lump.get_dnull().end(); - p++) { - CDentry *dn = dir->lookup(p->dn); - if (!dn) { - dn = dir->add_dentry(p->dn); - dn->set_version(p->dnv); - dn->_mark_dirty(); - dout(10) << "EMetaBlob.replay added " << *dn << endl; - } else { - dn->set_version(p->dnv); - dn->_mark_dirty(); - dout(10) << "EMetaBlob.replay had " << *dn << endl; - } - } - } -} - - - -// ----------------------- -// EAlloc - -bool EAlloc::has_expired(MDS *mds) -{ - version_t cv = mds->idalloc->get_committed_version(); - if (cv < table_version) { - dout(10) << "EAlloc.has_expired v " << table_version << " > " << cv - << ", still dirty" << endl; - return false; // still dirty - } else { - dout(10) << "EAlloc.has_expired v " << table_version << " <= " << cv - << ", already flushed" << endl; - return true; // already flushed - } -} - -void EAlloc::expire(MDS *mds, Context *c) -{ - dout(10) << "EAlloc.expire saving idalloc table" << endl; - mds->idalloc->save(c, table_version); -} - -void EAlloc::replay(MDS *mds) -{ - if (mds->idalloc->get_version() >= table_version) { - dout(10) << "EAlloc.replay event " << table_version - << " <= table " << mds->idalloc->get_version() << endl; - } else { - dout(10) << " EAlloc.replay event " << table_version - << " - 1 == table " << mds->idalloc->get_version() << endl; - assert(table_version-1 == mds->idalloc->get_version()); - - if (what == EALLOC_EV_ALLOC) { - idno_t nid = mds->idalloc->alloc_id(true); - assert(nid == id); // this should match. 
- } - else if (what == EALLOC_EV_FREE) { - mds->idalloc->reclaim_id(id, true); - } - else - assert(0); - - assert(table_version == mds->idalloc->get_version()); - } -} - - -// ----------------------- -// EUpdate - -bool EUpdate::has_expired(MDS *mds) -{ - return metablob.has_expired(mds); -} - -void EUpdate::expire(MDS *mds, Context *c) -{ - metablob.expire(mds, c); -} - -void EUpdate::replay(MDS *mds) -{ - metablob.replay(mds); -} - - -// ----------------------- -// EImportMap - -bool EImportMap::has_expired(MDS *mds) -{ - if (mds->mdlog->last_import_map > get_end_off()) { - dout(10) << "EImportMap.has_expired -- there's a newer map" << endl; - return true; - } - else if (mds->mdlog->is_capped()) { - dout(10) << "EImportMap.has_expired -- log is capped, allowing map to expire" << endl; - return true; - } else { - dout(10) << "EImportMap.has_expired -- not until there's a newer map written" << endl; - return false; - } -} - -/* -class C_MDS_ImportMapFlush : public Context { - MDS *mds; - off_t end_off; -public: - C_MDS_ImportMapFlush(MDS *m, off_t eo) : mds(m), end_off(eo) { } - void finish(int r) { - // am i the last thing in the log? - if (mds->mdlog->get_write_pos() == end_off) { - // yes. we're good. - } else { - // no. submit another import_map so that we can go away. 
- } - } -}; -*/ - -void EImportMap::expire(MDS *mds, Context *c) -{ - dout(10) << "EImportMap.has_expire -- waiting for a newer map to be written (or for shutdown)" << endl; - mds->mdlog->import_map_expire_waiters.push_back(c); -} - -void EImportMap::replay(MDS *mds) -{ - dout(10) << "EImportMap.replay -- reconstructing import/export spanning tree" << endl; - assert(mds->mdcache->imports.empty()); - - // first, stick the spanning tree in my cache - metablob.replay(mds); - - // restore import/export maps - for (set::iterator p = imports.begin(); - p != imports.end(); - ++p) { - mds->mdcache->add_ambiguous_import(*p, nested_exports[*p]); - mds->mdcache->finish_ambiguous_import(*p); - } - - mds->mdcache->show_imports(); -} - - - -// ----------------------- -// EUnlink - -bool EUnlink::has_expired(MDS *mds) -{ - /* - // dir - CInode *diri = mds->mdcache->get_inode( diritrace.back().inode.ino ); - CDir *dir = 0; - if (diri) dir = diri->dir; - - if (dir && dir->get_last_committed_version() < dirv) return false; - - if (!inodetrace.trace.empty()) { - // inode - CInode *in = mds->mdcache->get_inode( inodetrace.back().inode.ino ); - if (in && in->get_last_committed_version() < inodetrace.back().inode.version) - return false; - } - */ - return true; -} - -void EUnlink::expire(MDS *mds, Context *c) -{ - /* - CInode *diri = mds->mdcache->get_inode( diritrace.back().inode.ino ); - CDir *dir = diri->dir; - assert(dir); - - // okay! 
- dout(7) << "commiting dirty (from unlink) dir " << *dir << endl; - mds->mdstore->commit_dir(dir, dirv, c); - */ -} - -void EUnlink::replay(MDS *mds) -{ -} - - - - -// ----------------------- -// EPurgeFinish - - -bool EPurgeFinish::has_expired(MDS *mds) -{ - return true; -} - -void EPurgeFinish::expire(MDS *mds, Context *c) -{ -} - -void EPurgeFinish::replay(MDS *mds) -{ -} - - - - - -// ========================================================================= - -// ----------------------- -// EExportStart - -bool EExportStart::has_expired(MDS *mds) -{ - CInode *diri = mds->mdcache->get_inode(dirino); - if (!diri) return true; - CDir *dir = diri->dir; - if (!dir) return true; - if (!mds->mdcache->migrator->is_exporting(dir)) - return true; - dout(10) << "EExportStart.has_expired still exporting " << *dir << endl; - return false; -} - -void EExportStart::expire(MDS *mds, Context *c) -{ - CInode *diri = mds->mdcache->get_inode(dirino); - assert(diri); - CDir *dir = diri->dir; - assert(dir); - assert(mds->mdcache->migrator->is_exporting(dir)); - - dout(10) << "EExportStart.expire waiting for export of " << *dir << endl; - mds->mdcache->migrator->add_export_finish_waiter(dir, c); -} - -void EExportStart::replay(MDS *mds) -{ - dout(10) << "EExportStart.replay " << dirino << " -> " << dest << endl; - metablob.replay(mds); - - // put in pending_exports lists - mds->mdlog->pending_exports[dirino] = bounds; -} - -// ----------------------- -// EExportFinish - -bool EExportFinish::has_expired(MDS *mds) -{ - // we can always expire. - return true; -} - -void EExportFinish::expire(MDS *mds, Context *c) -{ - assert(0); // should never happen. -} - -void EExportFinish::replay(MDS *mds) -{ - dout(10) << "EExportFinish.replay " << dirino << " success=" << success << endl; - - assert(mds->mdlog->pending_exports.count(dirino)); - - // finish? 
- if (success) - mds->mdcache->finish_ambiguous_export(dirino, mds->mdlog->pending_exports[dirino]); - - // remove from pending_exports list - mds->mdlog->pending_exports.erase(dirino); -} - - -// ----------------------- -// EImportStart - -bool EImportStart::has_expired(MDS *mds) -{ - return metablob.has_expired(mds); -} - -void EImportStart::expire(MDS *mds, Context *c) -{ - dout(10) << "EImportStart.expire " << dirino << endl; - metablob.expire(mds, c); -} - -void EImportStart::replay(MDS *mds) -{ - dout(10) << "EImportStart.replay " << dirino << endl; - metablob.replay(mds); - - // convert list -> set - set b; - for (list::iterator p = bounds.begin(); p != bounds.end(); ++p) - b.insert(*p); - - // put in ambiguous import list - mds->mdcache->add_ambiguous_import(dirino, b); -} - -// ----------------------- -// EImportFinish - -bool EImportFinish::has_expired(MDS *mds) -{ - return true; -} -void EImportFinish::expire(MDS *mds, Context *c) -{ - assert(0); // shouldn't ever happen -} - -void EImportFinish::replay(MDS *mds) -{ - dout(10) << "EImportFinish.replay " << dirino << " success=" << success << endl; - if (success) - mds->mdcache->finish_ambiguous_import(dirino); - else - mds->mdcache->cancel_ambiguous_import(dirino); -} - - - - - diff --git a/tags/20070517_before_mds_merge/mds/mdstypes.h b/tags/20070517_before_mds_merge/mds/mdstypes.h deleted file mode 100644 index 1ac4525e76559..0000000000000 --- a/tags/20070517_before_mds_merge/mds/mdstypes.h +++ /dev/null @@ -1,290 +0,0 @@ -#ifndef __MDSTYPES_H -#define __MDSTYPES_H - - -#include -#include -#include -#include -using namespace std; - -#include "config.h" -#include "common/DecayCounter.h" - -#include - - - -// md ops -#define MDS_OP_STATFS 1 - -#define MDS_OP_STAT 100 -#define MDS_OP_LSTAT 101 -#define MDS_OP_UTIME 102 -#define MDS_OP_CHMOD 103 -#define MDS_OP_CHOWN 104 - - -#define MDS_OP_READDIR 200 -#define MDS_OP_MKNOD 201 -#define MDS_OP_LINK 202 -#define MDS_OP_UNLINK 203 -#define MDS_OP_RENAME 204 
- -#define MDS_OP_MKDIR 220 -#define MDS_OP_RMDIR 221 -#define MDS_OP_SYMLINK 222 - -#define MDS_OP_OPEN 301 -#define MDS_OP_TRUNCATE 306 -#define MDS_OP_FSYNC 307 -//#define MDS_OP_CLOSE 310 -#define MDS_OP_RELEASE 308 - - - -// ================================================================ - -/* meta_load_t - * hierarchical load for an inode/dir and it's children - */ -#define META_POP_IRD 0 -#define META_POP_IWR 1 -#define META_POP_DWR 2 -//#define META_POP_LOG 3 -//#define META_POP_FDIR 4 -//#define META_POP_CDIR 4 -#define META_NPOP 3 - -class meta_load_t { - public: - DecayCounter pop[META_NPOP]; - - double meta_load() { - return pop[META_POP_IRD].get() + 2*pop[META_POP_IWR].get(); - } - - void take(meta_load_t& other) { - for (int i=0; i"; -} - - -inline meta_load_t& operator-=(meta_load_t& l, meta_load_t& r) -{ - for (int i=0; i"; -} - -/* -inline mds_load_t& operator+=( mds_load_t& l, mds_load_t& r ) -{ - l.root_pop += r.root_pop; - l.req_rate += r.req_rate; - l.queue_len += r.queue_len; - return l; -} - -inline mds_load_t operator/( mds_load_t& a, double d ) -{ - mds_load_t r; - r.root_pop = a.root_pop / d; - r.req_rate = a.req_rate / d; - r.queue_len = a.queue_len / d; - return r; -} -*/ - - -// ================================================================ -// dir slices - -struct dirslice_t { - short hash_mask; - short hash_val; -}; - - - -// ================================================================ - -#define MDS_PIN_REPLICATED 1 - -class MDSCacheObject { - protected: - unsigned state; // state bits - - int ref; // reference count - set ref_set; - - map replicas; // [auth] mds -> nonce - int replica_nonce; // [replica] defined on replica - - public: - MDSCacheObject() : - state(0), - ref(0), - replica_nonce(0) {} - virtual ~MDSCacheObject() {} - - // -------------------------------------------- - // state - unsigned get_state() { return state; } - void state_clear(unsigned mask) { state &= ~mask; } - void state_set(unsigned mask) { state |= 
mask; } - unsigned state_test(unsigned mask) { return state & mask; } - void state_reset(unsigned s) { state = s; } - - // -------------------------------------------- - // pins - int get_num_ref() { return ref; } - bool is_pinned_by(int by) { return ref_set.count(by); } - set& get_ref_set() { return ref_set; } - - virtual void last_put() {} - virtual void bad_put(int by) { - assert(ref_set.count(by) == 1); - assert(ref > 0); - } - void put(int by) { - if (ref == 0 || ref_set.count(by) != 1) { - bad_put(by); - } else { - ref--; - ref_set.erase(by); - assert(ref == (int)ref_set.size()); - if (ref == 0) - last_put(); - } - } - - virtual void first_get() {} - virtual void bad_get(int by) { - assert(ref_set.count(by) == 0); - assert(0); - } - void get(int by) { - if (ref_set.count(by)) { - bad_get(by); - } else { - if (ref == 0) - first_get(); - ref++; - ref_set.insert(by); - assert(ref == (int)ref_set.size()); - } - } - - - - // -------------------------------------------- - // replication - bool is_replicated() { return !replicas.empty(); } - bool is_replica(int mds) { return replicas.count(mds); } - int num_replicas() { return replicas.size(); } - int add_replica(int mds) { - if (replicas.count(mds)) - return ++replicas[mds]; // inc nonce - if (replicas.empty()) - get(MDS_PIN_REPLICATED); - return replicas[mds] = 1; - } - void add_replica(int mds, int nonce) { - if (replicas.empty()) - get(MDS_PIN_REPLICATED); - replicas[mds] = nonce; - } - int get_replica_nonce(int mds) { - assert(replicas.count(mds)); - return replicas[mds]; - } - void remove_replica(int mds) { - assert(replicas.count(mds)); - replicas.erase(mds); - if (replicas.empty()) - put(MDS_PIN_REPLICATED); - } - void clear_replicas() { - if (!replicas.empty()) - put(MDS_PIN_REPLICATED); - replicas.clear(); - } - map::iterator replicas_begin() { return replicas.begin(); } - map::iterator replicas_end() { return replicas.end(); } - const map& get_replicas() { return replicas; } - - int get_replica_nonce() { 
return replica_nonce;} - void set_replica_nonce(int n) { replica_nonce = n; } -}; - - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MAnchorReply.h b/tags/20070517_before_mds_merge/messages/MAnchorReply.h deleted file mode 100644 index 0186118f53260..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MAnchorReply.h +++ /dev/null @@ -1,74 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MANCHORREPLY_H -#define __MANCHORREPLY_H - -#include - -#include "msg/Message.h" -#include "mds/AnchorTable.h" - -#include "MAnchorRequest.h" - - -class MAnchorReply : public Message { - int op; - inodeno_t ino; - vector trace; - - public: - MAnchorReply() {} - MAnchorReply(MAnchorRequest *req) : Message(MSG_MDS_ANCHORREPLY) { - this->op = req->get_op(); - this->ino = req->get_ino(); - } - ~MAnchorReply() { - for (unsigned i=0; i& trace) { this->trace = trace; } - - int get_op() { return op; } - inodeno_t get_ino() { return ino; } - vector& get_trace() { return trace; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(op), (char*)&op); - off += sizeof(op); - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - int n; - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - trace.push_back(a); - } - } - - virtual void encode_payload() { - payload.append((char*)&op, sizeof(op)); - payload.append((char*)&ino, sizeof(ino)); - int n = trace.size(); - payload.append((char*)&n, sizeof(int)); - for (int i=0; i_encode(payload); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MAnchorRequest.h 
b/tags/20070517_before_mds_merge/messages/MAnchorRequest.h deleted file mode 100644 index 2a2d0088978b4..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MAnchorRequest.h +++ /dev/null @@ -1,76 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MANCHORREQUEST_H -#define __MANCHORREQUEST_H - -#include - -#include "msg/Message.h" -#include "mds/AnchorTable.h" - -#define ANCHOR_OP_CREATE 1 -#define ANCHOR_OP_DESTROY 2 -#define ANCHOR_OP_LOOKUP 3 -#define ANCHOR_OP_UPDATE 4 - -class MAnchorRequest : public Message { - int op; - inodeno_t ino; - vector trace; - - public: - MAnchorRequest() {} - MAnchorRequest(int op, inodeno_t ino) : Message(MSG_MDS_ANCHORREQUEST) { - this->op = op; - this->ino = ino; - } - ~MAnchorRequest() { - for (unsigned i=0; i& trace) { this->trace = trace; } - - int get_op() { return op; } - inodeno_t get_ino() { return ino; } - vector& get_trace() { return trace; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(op), (char*)&op); - off += sizeof(op); - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - int n; - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - trace.push_back(a); - } - } - - virtual void encode_payload() { - payload.append((char*)&op, sizeof(op)); - payload.append((char*)&ino, sizeof(ino)); - int n = trace.size(); - payload.append((char*)&n, sizeof(int)); - for (int i=0; i_encode(payload); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MCacheExpire.h b/tags/20070517_before_mds_merge/messages/MCacheExpire.h deleted file mode 100644 index 
461d283c23072..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MCacheExpire.h +++ /dev/null @@ -1,86 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MCACHEEXPIRE_H -#define __MCACHEEXPIRE_H - -class MCacheExpire : public Message { - int from; - map inodes; - map dirs; - map > dentries; - - public: - int get_from() { return from; } - map& get_inodes() { return inodes; } - map& get_dirs() { return dirs; } - map >& get_dentries() { return dentries; } - - MCacheExpire() {} - MCacheExpire(int f) : - Message(MSG_MDS_CACHEEXPIRE), - from(f) { } - - virtual char *get_type_name() { return "CEx";} - - void add_inode(inodeno_t ino, int nonce) { - inodes[ino] = nonce; - } - void add_dir(inodeno_t ino, int nonce) { - dirs[ino] = nonce; - } - void add_dentry(inodeno_t dirino, const string& dn, int nonce) { - dentries[dirino][dn] = nonce; - } - void add_dentries(inodeno_t dirino, map& dmap) { - dentries[dirino] = dmap; - } - - void decode_payload() { - int off = 0; - - payload.copy(off, sizeof(from), (char*)&from); - off += sizeof(from); - - ::_decode(inodes, payload, off); - ::_decode(dirs, payload, off); - - int n; - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i >::iterator p = dentries.begin(); - p != dentries.end(); - ++p) { - payload.append((char*)&p->first, sizeof(p->first)); - ::_encode(p->second, payload); - } - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MClientBoot.h b/tags/20070517_before_mds_merge/messages/MClientBoot.h deleted file mode 100644 index 460f9f02e27f4..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MClientBoot.h 
+++ /dev/null @@ -1,31 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MCLIENTBOOT_H -#define __MCLIENTBOOT_H - -#include "msg/Message.h" - -class MClientBoot : public Message { - - public: - MClientBoot() : Message(MSG_CLIENT_BOOT) { } - - char *get_type_name() { return "ClientBoot"; } - - void encode_payload() { } - void decode_payload() { } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MClientFileCaps.h b/tags/20070517_before_mds_merge/messages/MClientFileCaps.h deleted file mode 100644 index 7fde047b02655..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MClientFileCaps.h +++ /dev/null @@ -1,102 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MCLIENTFILECAPS_H -#define __MCLIENTFILECAPS_H - -#define CLIENT_FILECAP_RELEASE 1 // mds closed the cap -#define CLIENT_FILECAP_STALE 2 // mds has exported the cap -#define CLIENT_FILECAP_REAP 3 // mds has imported the cap from get_mds() - -class MClientFileCaps : public Message { - public: - static const int FILECAP_RELEASE = 1; - static const int FILECAP_STALE = 2; - static const int FILECAP_REAP = 3; - - - private: - inode_t inode; - int caps; - long seq; - int wanted; - //int client; - - int special; // stale || reap; in conjunction w/ mds value - int mds; - - public: - inodeno_t get_ino() { return inode.ino; } - inode_t& get_inode() { return inode; } - int get_caps() { return caps; } - int get_wanted() { return wanted; } - long get_seq() { return seq; } - //int get_client() { return client; } - - // for cap migration - int get_mds() { return mds; } - int get_special() { return special; } - - //void set_client(int c) { client = c; } - void set_caps(int c) { caps = c; } - void set_wanted(int w) { wanted = w; } - - void set_mds(int m) { mds = m; } - void set_special(int s) { special = s; } - - MClientFileCaps() {} - MClientFileCaps(inode_t& inode, - long seq, - int caps, - int wanted, - int special=0, - int mds=0) : - Message(MSG_CLIENT_FILECAPS) { - this->inode = inode; - this->seq = seq; - this->caps = caps; - this->wanted = wanted; - this->special = special; - this->mds = mds; - } - virtual char *get_type_name() { return "Cfcap";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(seq), (char*)&seq); - off += sizeof(seq); - s.copy(off, sizeof(inode), (char*)&inode); - off += sizeof(inode); - s.copy(off, sizeof(caps), (char*)&caps); - off += sizeof(caps); - s.copy(off, sizeof(wanted), (char*)&wanted); - off += sizeof(wanted); - //s.copy(off, sizeof(client), (char*)&client); - //off += sizeof(client); - s.copy(off, sizeof(mds), (char*)&mds); - off += sizeof(mds); - s.copy(off, sizeof(special), (char*)&special); - 
off += sizeof(special); - } - virtual void encode_payload(crope& s) { - s.append((char*)&seq, sizeof(seq)); - s.append((char*)&inode, sizeof(inode)); - s.append((char*)&caps, sizeof(caps)); - s.append((char*)&wanted, sizeof(wanted)); - //s.append((char*)&client, sizeof(client)); - s.append((char*)&mds,sizeof(mds)); - s.append((char*)&special,sizeof(special)); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MClientInodeAuthUpdate.h b/tags/20070517_before_mds_merge/messages/MClientInodeAuthUpdate.h deleted file mode 100644 index e9083f6abc575..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MClientInodeAuthUpdate.h +++ /dev/null @@ -1,46 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MCLIENTINODEAUTHUPDATE_H -#define __MCLIENTINODEAUTHUPDATE_H - -class MClientInodeAuthUpdate : public Message { - inodeno_t ino; - int newauth; - - public: - inodeno_t get_ino() { return ino; } - int get_auth() { return newauth; } - - MClientInodeAuthUpdate() {} - MClientInodeAuthUpdate(inodeno_t ino, int newauth) : - Message(MSG_CLIENT_INODEAUTHUPDATE) { - this->ino = ino; - this->newauth = newauth; - } - virtual char *get_type_name() { return "Ciau";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - s.copy(off, sizeof(newauth), (char*)&newauth); - off += sizeof(newauth); - } - virtual void encode_payload(crope& s) { - s.append((char*)&ino,sizeof(ino)); - s.append((char*)&newauth,sizeof(newauth)); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MClientMount.h b/tags/20070517_before_mds_merge/messages/MClientMount.h deleted file mode 100644 index 0684cea8d95c2..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MClientMount.h +++ /dev/null @@ -1,34 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MCLIENTMOUNT_H -#define __MCLIENTMOUNT_H - -#include "msg/Message.h" - -class MClientMount : public Message { - - public: - MClientMount() : Message(MSG_CLIENT_MOUNT) { - } - - char *get_type_name() { return "Cmnt"; } - - virtual void decode_payload(crope& s, int& off) { - } - virtual void encode_payload(crope& s) { - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MClientMountAck.h b/tags/20070517_before_mds_merge/messages/MClientMountAck.h deleted file mode 100644 index 6b1b7cb2a901b..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MClientMountAck.h +++ /dev/null @@ -1,59 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MCLIENTMOUNTACK_H -#define __MCLIENTMOUNTACK_H - -#include "msg/Message.h" -#include "MClientMount.h" -#include "mds/MDSMap.h" -#include "osd/OSDMap.h" - - -class MClientMountAck : public Message { - long pcid; - bufferlist osd_map_state; - bufferlist mds_map_state; - - public: - MClientMountAck() {} - MClientMountAck(MClientMount *mnt, MDSMap *mdsmap, OSDMap *osdmap) : Message(MSG_CLIENT_MOUNTACK) { - this->pcid = mnt->get_pcid(); - mdsmap->encode( mds_map_state ); - osdmap->encode( osd_map_state ); - } - - bufferlist& get_mds_map_state() { return mds_map_state; } - bufferlist& get_osd_map_state() { return osd_map_state; } - - void set_pcid(long pcid) { this->pcid = pcid; } - long get_pcid() { return pcid; } - - char *get_type_name() { return "CmntA"; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(pcid), (char*)&pcid); - off += sizeof(pcid); - ::_decode( mds_map_state, payload, off); - ::_decode( osd_map_state, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&pcid, sizeof(pcid)); - ::_encode( mds_map_state, payload ); - ::_encode( osd_map_state, payload ); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MClientReply.h b/tags/20070517_before_mds_merge/messages/MClientReply.h deleted file mode 100644 index 6206b909b0c05..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MClientReply.h +++ /dev/null @@ -1,302 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MCLIENTREPLY_H -#define __MCLIENTREPLY_H - -#include "include/types.h" - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "mds/CDir.h" -#include "mds/CDentry.h" - -#include -using namespace std; - -class CInode; - -/*** - * - * MClientReply - container message for MDS reply to a client's MClientRequest - * - * key fields: - * long tid - transaction id, so the client can match up with pending request - * int result - error code, or fh if it was open - * - * for most requests: - * trace is a vector of c_inoe_info's tracing from root to the file/dir/whatever - * the operation referred to, so that the client can update it's info about what - * metadata lives on what MDS. - * - * for readdir replies: - * dir_contents is a vector c_inode_info*'s. - * - * that's mostly it, i think! - * - */ - -class InodeStat { - - public: - inode_t inode; - string symlink; // symlink content (if symlink) - - - // mds distribution hints - int dir_auth; - bool hashed, replicated; - bool spec_defined; - set dist; // where am i replicated? - - public: - InodeStat() {} - InodeStat(CInode *in, int whoami) : - inode(in->inode) - { - // inode.mask - inode.mask = INODE_MASK_BASE; - if (in->filelock.can_read(in->is_auth())) - inode.mask |= INODE_MASK_PERM; - if (in->hardlock.can_read(in->is_auth())) - inode.mask |= INODE_MASK_SIZE | INODE_MASK_MTIME; // fixme when we separate this out. - - // symlink content? - if (in->is_symlink()) - symlink = in->symlink; - - // replicated where? - if (in->dir && in->dir->is_auth()) { - spec_defined = true; - in->dir->get_dist_spec(this->dist, whoami); - } else - spec_defined = false; - - if (in->dir) - dir_auth = in->dir->get_dir_auth(); - else - dir_auth = -1; - - // dir info - hashed = (in->dir && in->dir->is_hashed()); // FIXME not quite right. 
- replicated = (in->dir && in->dir->is_rep()); - } - - void _encode(bufferlist &bl) { - bl.append((char*)&inode, sizeof(inode)); - bl.append((char*)&spec_defined, sizeof(spec_defined)); - bl.append((char*)&dir_auth, sizeof(dir_auth)); - bl.append((char*)&hashed, sizeof(hashed)); - bl.append((char*)&replicated, sizeof(replicated)); - - ::_encode(symlink, bl); - ::_encode(dist, bl); // distn - } - - void _decode(bufferlist &bl, int& off) { - bl.copy(off, sizeof(inode), (char*)&inode); - off += sizeof(inode); - bl.copy(off, sizeof(spec_defined), (char*)&spec_defined); - off += sizeof(spec_defined); - bl.copy(off, sizeof(dir_auth), (char*)&dir_auth); - off += sizeof(dir_auth); - bl.copy(off, sizeof(hashed), (char*)&hashed); - off += sizeof(hashed); - bl.copy(off, sizeof(replicated), (char*)&replicated); - off += sizeof(replicated); - - ::_decode(symlink, bl, off); - ::_decode(dist, bl, off); - } -}; - - -typedef struct { - long pcid; - long tid; - int op; - int result; // error code - unsigned char file_caps; // for open - long file_caps_seq; - __uint64_t file_data_version; // for client buffercache consistency - - int _num_trace_in; - int _dir_size; -} MClientReply_st; - -class MClientReply : public Message { - // reply data - MClientReply_st st; - - string path; - list trace_in; - list trace_dn; - - list dir_in; - list dir_dn; - - public: - void set_pcid(long pcid) { this->st.pcid = pcid; } - long get_pcid() { return st.pcid; } - - long get_tid() { return st.tid; } - int get_op() { return st.op; } - - int get_result() { return st.result; } - const string& get_path() { return path; } - - inodeno_t get_ino() { return trace_in.back()->inode.ino; } - const inode_t& get_inode() { return trace_in.back()->inode; } - - const list& get_trace_in() { return trace_in; } - const list& get_trace_dn() { return trace_dn; } - - const list& get_dir_in() { return dir_in; } - const list& get_dir_dn() { return dir_dn; } - - unsigned char get_file_caps() { return st.file_caps; } - long 
get_file_caps_seq() { return st.file_caps_seq; } - __uint64_t get_file_data_version() { return st.file_data_version; } - - void set_result(int r) { st.result = r; } - void set_file_caps(unsigned char c) { st.file_caps = c; } - void set_file_caps_seq(long s) { st.file_caps_seq = s; } - void set_file_data_version(__uint64_t v) { st.file_data_version = v; } - - MClientReply() {}; - MClientReply(MClientRequest *req, int result = 0) : - Message(MSG_CLIENT_REPLY) { - memset(&st, 0, sizeof(st)); - this->st.pcid = req->get_pcid(); // match up procedure call id!!! - this->st.tid = req->get_tid(); - this->st.op = req->get_op(); - this->path = req->get_path(); - - this->st.result = result; - - st._dir_size = 0; - st._num_trace_in = 0; - } - virtual ~MClientReply() { - list::iterator it; - - for (it = trace_in.begin(); it != trace_in.end(); ++it) - delete *it; - for (it = dir_in.begin(); it != dir_in.end(); ++it) - delete *it; - } - virtual char *get_type_name() { return "creply"; } - - - // serialization - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - - _decode(path, payload, off); - - for (int i=0; i_decode(payload, off); - trace_in.push_back(ci); - } - - for (int i=0; i_decode(payload, off); - dir_in.push_back(ci); - string dn; - ::_decode(dn, payload, off); - dir_dn.push_back(dn); - } - } - virtual void encode_payload() { - payload.append((char*)&st, sizeof(st)); - _encode(path, payload); - - // trace - list::iterator pdn = trace_dn.begin(); - list::iterator pin; - for (pin = trace_in.begin(); - pin != trace_in.end(); - ++pin) { - if (pin != trace_in.begin()) { - ::_encode(*pdn, payload); - ++pdn; - } - (*pin)->_encode(payload); - } - - // dir contents - pdn = dir_dn.begin(); - for (pin = dir_in.begin(); - pin != dir_in.end(); - ++pin, ++pdn) { - (*pin)->_encode(payload); - ::_encode(*pdn, payload); - } - } - - // builders - /* - void add_dir_item(string& dn, InodeStat *in) { - dir_dn.push_back(dn); - 
dir_in.push_back(in); - ++st._dir_size; - }*/ - void take_dir_items(list& inls, - list& dnls, - int num) { - dir_in.swap(inls); - dir_dn.swap(dnls); - st._dir_size = num; - } - void copy_dir_items(const list& inls, - const list& dnls) { - list::const_iterator pdn = dnls.begin(); - list::const_iterator pin = inls.begin(); - while (pin != inls.end()) { - // copy! - InodeStat *i = new InodeStat; - *i = **pin; - dir_in.push_back(i); - dir_dn.push_back(*pdn); - ++pin; - ++pdn; - ++st._dir_size; - } - } - - void set_trace_dist(CInode *in, int whoami) { - st._num_trace_in = 0; - while (in) { - // add this inode to trace, along with referring dentry name - if (in->get_parent_dn()) - trace_dn.push_front(in->get_parent_dn()->get_name()); - trace_in.push_front(new InodeStat(in, whoami)); - ++st._num_trace_in; - - in = in->get_parent_inode(); - } - } - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MClientRequest.h b/tags/20070517_before_mds_merge/messages/MClientRequest.h deleted file mode 100644 index 9b9ac4e115cac..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MClientRequest.h +++ /dev/null @@ -1,202 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MCLIENTREQUEST_H -#define __MCLIENTREQUEST_H - -#include - -#include "msg/Message.h" -#include "include/filepath.h" -#include "mds/mdstypes.h" -#include "mds/MDS.h" - -/** - * - * MClientRequest - container for a client METADATA request. created/sent by clients. - * can be forwarded around between MDS's. - * - * int client - the originating client - * long pcid - procedure call id, used to match request+response. 
- * long tid - transaction id, unique among requests for that client. probably just a counter! - * -> the MDS passes the Request to the Reply constructor, so this always matches. - * - * int op - the metadata op code. MDS_OP_RENAME, etc. - * int caller_uid, _gid - guess - * - * arguments: one or more of these are defined, depending on the metadata op: - * inodeno ino - used by close(), along with fh. not strictly necessary except MDS is currently coded lame. - * filepath path - main file argument (almost everything) - * string sarg - string argument (if a second arg is needed, e.g. rename, symlink) - * int iarg - int arg... file mode for open, fh for close, mode for mkdir, etc. - * int iarg2 - second int arg... gid for chown (iarg is uid) - * time_t targ, targ2 - time args, used by utime - * - * That's basically it! - * - */ - - -typedef struct { - long tid; - int client; - int op; - - entity_inst_t client_inst; - - int caller_uid, caller_gid; - inodeno_t ino; - - int iarg, iarg2; - time_t targ, targ2; - - inodeno_t mds_wants_replica_in_dirino; - - size_t sizearg; -} MClientRequest_st; - - -class MClientRequest : public Message { - MClientRequest_st st; - filepath path; - string sarg; - string sarg2; - - - public: - MClientRequest() {} - MClientRequest(int op, int client) : Message(MSG_CLIENT_REQUEST) { - memset(&st, 0, sizeof(st)); - this->st.op = op; - this->st.client = client; - this->st.iarg = 0; - } - virtual char *get_type_name() { return "creq"; } - - // keep a pcid (procedure call id) to match up request+reply - //void set_pcid(long pcid) { this->st.pcid = pcid; } - //long get_pcid() { return st.pcid; } - - // normal fields - void set_tid(long t) { st.tid = t; } - void set_path(string& p) { path.set_path(p); } - void set_path(const char *p) { path.set_path(p); } - void set_path(const filepath& fp) { path = fp; } - void set_caller_uid(int u) { st.caller_uid = u; } - void set_caller_gid(int g) { st.caller_gid = g; } - void set_ino(inodeno_t ino) { st.ino = 
ino; } - void set_iarg(int i) { st.iarg = i; } - void set_iarg2(int i) { st.iarg2 = i; } - void set_targ(time_t& t) { st.targ = t; } - void set_targ2(time_t& t) { st.targ2 = t; } - void set_sarg(string& arg) { this->sarg = arg; } - void set_sarg(const char *arg) { this->sarg = arg; } - void set_sarg2(string& arg) { this->sarg2 = arg; } - void set_sizearg(size_t s) { st.sizearg = s; } - void set_mds_wants_replica_in_dirino(inodeno_t dirino) { - st.mds_wants_replica_in_dirino = dirino; } - - void set_client_inst(const entity_inst_t& i) { st.client_inst = i; } - const entity_inst_t& get_client_inst() { return st.client_inst; } - - int get_client() { return st.client; } - long get_tid() { return st.tid; } - int get_op() { return st.op; } - int get_caller_uid() { return st.caller_uid; } - int get_caller_gid() { return st.caller_gid; } - inodeno_t get_ino() { return st.ino; } - string& get_path() { return path.get_path(); } - filepath& get_filepath() { return path; } - int get_iarg() { return st.iarg; } - int get_iarg2() { return st.iarg2; } - time_t get_targ() { return st.targ; } - time_t get_targ2() { return st.targ2; } - string& get_sarg() { return sarg; } - string& get_sarg2() { return sarg2; } - size_t get_sizearg() { return st.sizearg; } - inodeno_t get_mds_wants_replica_in_dirino() { - return st.mds_wants_replica_in_dirino; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - path._decode(payload, off); - _decode(sarg, payload, off); - _decode(sarg2, payload, off); - } - - virtual void encode_payload() { - payload.append((char*)&st, sizeof(st)); - path._encode(payload); - _encode(sarg, payload); - _encode(sarg2, payload); - } - - void print(ostream& out) { - out << "clientreq(client" << get_client() - << "." 
<< get_tid() - //<< ".pcid=" << get_pcid() - << ":"; - switch(get_op()) { - case MDS_OP_STAT: - out << "stat"; break; - case MDS_OP_LSTAT: - out << "lstat"; break; - case MDS_OP_UTIME: - out << "utime"; break; - case MDS_OP_CHMOD: - out << "chmod"; break; - case MDS_OP_CHOWN: - out << "chown"; break; - - case MDS_OP_READDIR: - out << "readdir"; break; - case MDS_OP_MKNOD: - out << "mknod"; break; - case MDS_OP_LINK: - out << "link"; break; - case MDS_OP_UNLINK: - out << "unlink"; break; - case MDS_OP_RENAME: - out << "rename"; break; - - case MDS_OP_MKDIR: - out << "mkdir"; break; - case MDS_OP_RMDIR: - out << "rmdir"; break; - case MDS_OP_SYMLINK: - out << "symlink"; break; - - case MDS_OP_OPEN: - out << "open"; break; - case MDS_OP_TRUNCATE: - out << "truncate"; break; - case MDS_OP_FSYNC: - out << "fsync"; break; - case MDS_OP_RELEASE: - out << "release"; break; - default: - out << "unknown=" << get_op(); - } - if (get_path().length()) - out << "=" << get_path(); - if (get_sarg().length()) - out << " " << get_sarg(); - out << ")"; - } - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MDentryUnlink.h b/tags/20070517_before_mds_merge/messages/MDentryUnlink.h deleted file mode 100644 index ec1503eeadf00..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MDentryUnlink.h +++ /dev/null @@ -1,45 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MDENTRYUNLINK_H -#define __MDENTRYUNLINK_H - -class MDentryUnlink : public Message { - inodeno_t dirino; - string dn; - - public: - inodeno_t get_dirino() { return dirino; } - string& get_dn() { return dn; } - - MDentryUnlink() {} - MDentryUnlink(inodeno_t dirino, string& dn) : - Message(MSG_MDS_DENTRYUNLINK) { - this->dirino = dirino; - this->dn = dn; - } - virtual char *get_type_name() { return "Dun";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(dirino), (char*)&dirino); - off += sizeof(dirino); - _unrope(dn, s, off); - } - virtual void encode_payload(crope& s) { - s.append((char*)&dirino,sizeof(dirino)); - _rope(dn, s); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MDirExpire.h b/tags/20070517_before_mds_merge/messages/MDirExpire.h deleted file mode 100644 index a81de3d538365..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MDirExpire.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MDIREXPIRE_H -#define __MDIREXPIRE_H - -typedef struct { - inodeno_t ino; - int nonce; - int from; -} MDirExpire_st; - -class MDirExpire : public Message { - MDirExpire_st st; - - public: - inodeno_t get_ino() { return st.ino; } - int get_from() { return st.from; } - int get_nonce() { return st.nonce; } - - MDirExpire() {} - MDirExpire(inodeno_t ino, int from, int nonce) : - Message(MSG_MDS_DIREXPIRE) { - st.ino = ino; - st.from = from; - st.nonce = nonce; - } - virtual char *get_type_name() { return "DirEx";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - } - virtual void encode_payload(crope& s) { - s.append((char*)&st,sizeof(st)); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MDirExpireReq.h b/tags/20070517_before_mds_merge/messages/MDirExpireReq.h deleted file mode 100644 index 604a55265c723..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MDirExpireReq.h +++ /dev/null @@ -1,49 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MDIREXPIREREQ_H -#define __MDIREXPIREREQ_H - -typedef struct { - inodeno_t ino; - int nonce; - int from; -} MDirExpireReq_st; - -class MDirExpire : public Message { - MDirExpireReq_st st; - - public: - inodeno_t get_ino() { return st.ino; } - int get_from() { return st.from; } - int get_nonce() { return st.nonce; } - - MDirExpire() {} - MDirExpire(inodeno_t ino, int from, int nonce) : - Message(MSG_MDS_DIREXPIREREQ) { - st.ino = ino; - st.from = from; - st.nonce = nonce; - } - virtual char *get_type_name() { return "DirExR";} - - virtual void decode_payload(crope& s) { - s.copy(0, sizeof(st), (char*)&st); - } - virtual void encode_payload(crope& s) { - s.append((char*)&st,sizeof(st)); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MDirUpdate.h b/tags/20070517_before_mds_merge/messages/MDirUpdate.h deleted file mode 100644 index 9bac721654c22..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MDirUpdate.h +++ /dev/null @@ -1,71 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MDIRUPDATE_H -#define __MDIRUPDATE_H - -#include "msg/Message.h" - -typedef struct { - inodeno_t ino; - int dir_rep; - int discover; -} MDirUpdate_st; - -class MDirUpdate : public Message { - MDirUpdate_st st; - set dir_rep_by; - string path; - - public: - inodeno_t get_ino() { return st.ino; } - int get_dir_rep() { return st.dir_rep; } - set& get_dir_rep_by() { return dir_rep_by; } - bool should_discover() { return st.discover > 0; } - string& get_path() { return path; } - - void tried_discover() { - if (st.discover) st.discover--; - } - - MDirUpdate() {} - MDirUpdate(inodeno_t ino, - int dir_rep, - set& dir_rep_by, - string& path, - bool discover = false) : - Message(MSG_MDS_DIRUPDATE) { - this->st.ino = ino; - this->st.dir_rep = dir_rep; - this->dir_rep_by = dir_rep_by; - if (discover) this->st.discover = 5; - this->path = path; - } - virtual char *get_type_name() { return "dup"; } - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - _unrope(dir_rep_by, s, off); - _unrope(path, s, off); - } - - virtual void encode_payload(crope& r) { - r.append((char*)&st, sizeof(st)); - _rope(dir_rep_by, r); - _rope(path, r); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MDiscover.h b/tags/20070517_before_mds_merge/messages/MDiscover.h deleted file mode 100644 index d207ab28cc143..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MDiscover.h +++ /dev/null @@ -1,75 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MDISCOVER_H -#define __MDISCOVER_H - -#include "msg/Message.h" -#include "mds/CDir.h" -#include "include/filepath.h" - -#include -#include -using namespace std; - - -class MDiscover : public Message { - int asker; - inodeno_t base_ino; // 0 -> none, want root - bool want_base_dir; - bool want_root_inode; - - filepath want; // ... [/]need/this/stuff - - public: - int get_asker() { return asker; } - inodeno_t get_base_ino() { return base_ino; } - filepath& get_want() { return want; } - const string& get_dentry(int n) { return want[n]; } - bool wants_base_dir() { return want_base_dir; } - - MDiscover() { } - MDiscover(int asker, - inodeno_t base_ino, - filepath& want, - bool want_base_dir = true, - bool want_root_inode = false) : - Message(MSG_MDS_DISCOVER) { - this->asker = asker; - this->base_ino = base_ino; - this->want = want; - this->want_base_dir = want_base_dir; - this->want_root_inode = want_root_inode; - } - virtual char *get_type_name() { return "Dis"; } - - virtual void decode_payload(crope& r, int& off) { - r.copy(off, sizeof(asker), (char*)&asker); - off += sizeof(asker); - r.copy(off, sizeof(base_ino), (char*)&base_ino); - off += sizeof(base_ino); - r.copy(off, sizeof(bool), (char*)&want_base_dir); - off += sizeof(bool); - want._unrope(r, off); - } - virtual void encode_payload(crope& r) { - r.append((char*)&asker, sizeof(asker)); - r.append((char*)&base_ino, sizeof(base_ino)); - r.append((char*)&want_base_dir, sizeof(want_base_dir)); - want._rope(r); - } - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MDiscoverReply.h b/tags/20070517_before_mds_merge/messages/MDiscoverReply.h deleted file mode 100644 index c759bc9a76bd1..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MDiscoverReply.h +++ /dev/null @@ -1,254 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free 
software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MDISCOVERREPLY_H -#define __MDISCOVERREPLY_H - -#include "msg/Message.h" -#include "mds/CDir.h" -#include "mds/CInode.h" -#include "include/filepath.h" - -#include -#include -using namespace std; - -#define max(a,b) ((a)>(b) ? (a):(b)) - - -/** - * MDiscoverReply - return new replicas (of inodes, dirs, dentries) - * - * we group returned items by (dir, dentry, inode). each - * item in each set shares an index (it's "depth"). - * - * we can start and end with any type. - * no_base_dir = true if the first group has an inode but no dir - * no_base_dentry = true if the first group has an inode but no dentry - * they are false if there is no returned data, ie the first group is empty. - * - * we also return errors: - * error_flag_dn(string) - the specified dentry dne - * error_flag_dir - the last item wasn't a dir, so we couldn't continue. - * - * depth() gives us the number of depth units/indices for which we have - * information. this INCLUDES those for which we have errors but no data. - * - * see MDCache::handle_discover, handle_discover_reply. - * - - old crap, maybe not accurate: - - // dir [ + ... ] : discover want_base_dir=true - - // dentry [ + inode [ + ... ] ] : discover want_base_dir=false - // no_base_dir=true - // -> we only exclude inode if dentry is null+xlock - - // inode [ + ... 
], base_ino = 0 : discover base_ino=0, start w/ root ino, - // no_base_dir=no_base_dentry=true - - * - */ - -class MDiscoverReply : public Message { - inodeno_t base_ino; - bool no_base_dir; // no base dir (but IS dentry+inode) - bool no_base_dentry; // no base dentry (but IS inode) - bool flag_error_dn; - bool flag_error_dir; - string error_dentry; // dentry that was not found (to trigger waiters on asker) - - - vector dirs; // not inode-aligned if no_base_dir = true. - vector dentries; // not inode-aligned if no_base_dentry = true - vector inodes; - - string path; - - public: - // accessors - inodeno_t get_base_ino() { return base_ino; } - int get_num_inodes() { return inodes.size(); } - int get_num_dentries() { return dentries.size(); } - int get_num_dirs() { return dirs.size(); } - - int get_depth() { // return depth of deepest object (in dir/dentry/inode units) - return max( inodes.size(), // at least this many - max( no_base_dentry + dentries.size() + flag_error_dn, // inode start + path + possible error - dirs.size() + no_base_dir )); // dn/inode + dirs - } - - bool has_base_dir() { return !no_base_dir && dirs.size(); } - bool has_base_dentry() { return !no_base_dentry && dentries.size(); } - bool has_root() { - if (base_ino == 0) { - assert(no_base_dir && no_base_dentry); - return true; - } - return false; - } - - const string& get_path() { return path; } - - // bool is_flag_forward() { return flag_forward; } - bool is_flag_error_dn() { return flag_error_dn; } - bool is_flag_error_dir() { return flag_error_dir; } - string& get_error_dentry() { return error_dentry; } - - // these index _arguments_ are aligned to each ([[dir, ] dentry, ] inode) set. 
- CDirDiscover& get_dir(int n) { return *(dirs[n - no_base_dir]); } - CDentryDiscover& get_dentry(int n) { return *(dentries[n - no_base_dentry]); } - CInodeDiscover& get_inode(int n) { return *(inodes[n]); } - inodeno_t get_ino(int n) { return inodes[n]->get_ino(); } - - // cons - MDiscoverReply() {} - MDiscoverReply(inodeno_t base_ino) : - Message(MSG_MDS_DISCOVERREPLY) { - this->base_ino = base_ino; - flag_error_dn = false; - flag_error_dir = false; - no_base_dir = no_base_dentry = false; - } - ~MDiscoverReply() { - for (vector::iterator it = dirs.begin(); - it != dirs.end(); - it++) - delete *it; - for (vector::iterator it = inodes.begin(); - it != inodes.end(); - it++) - delete *it; - } - virtual char *get_type_name() { return "DisR"; } - - // builders - bool is_empty() { - return dirs.empty() && dentries.empty() && inodes.empty() && - !flag_error_dn && - !flag_error_dir; - } - void add_dentry(CDentryDiscover* ddis) { - if (dentries.empty() && dirs.empty()) no_base_dir = true; - dentries.push_back(ddis); - if (path.length()) path += "/"; - path += ddis->get_dname(); - } - - void add_inode(CInodeDiscover* din) { - if (inodes.empty() && dentries.empty()) no_base_dir = no_base_dentry = true; - inodes.push_back( din ); - } - - void add_dir(CDirDiscover* dir) { - dirs.push_back( dir ); - } - - // void set_flag_forward() { flag_forward = true; } - void set_flag_error_dn(const string& dn) { - flag_error_dn = true; - error_dentry = dn; - } - void set_flag_error_dir() { - flag_error_dir = true; - } - - - // ... 
- virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(base_ino), (char*)&base_ino); - off += sizeof(base_ino); - payload.copy(off, sizeof(bool), (char*)&no_base_dir); - off += sizeof(bool); - payload.copy(off, sizeof(bool), (char*)&no_base_dentry); - off += sizeof(bool); - // payload.copy(off, sizeof(bool), (char*)&flag_forward); - //off += sizeof(bool); - payload.copy(off, sizeof(bool), (char*)&flag_error_dn); - off += sizeof(bool); - - _decode(error_dentry, payload, off); - payload.copy(off, sizeof(bool), (char*)&flag_error_dir); - off += sizeof(bool); - - // dirs - int n; - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - } - //dout(12) << n << " dirs out" << endl; - - // inodes - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - } - //dout(12) << n << " inodes out" << endl; - - // dentries - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - } - } - void encode_payload() { - payload.append((char*)&base_ino, sizeof(base_ino)); - payload.append((char*)&no_base_dir, sizeof(bool)); - payload.append((char*)&no_base_dentry, sizeof(bool)); - // payload.append((char*)&flag_forward, sizeof(bool)); - payload.append((char*)&flag_error_dn, sizeof(bool)); - - _encode(error_dentry, payload); - payload.append((char*)&flag_error_dir, sizeof(bool)); - - // dirs - int n = dirs.size(); - payload.append((char*)&n, sizeof(int)); - for (vector::iterator it = dirs.begin(); - it != dirs.end(); - it++) - (*it)->_encode( payload ); - //dout(12) << n << " dirs in" << endl; - - // inodes - n = inodes.size(); - payload.append((char*)&n, sizeof(int)); - for (vector::iterator it = inodes.begin(); - it != inodes.end(); - it++) - (*it)->_encode( payload ); - //dout(12) << n << " inodes in" << endl; - - // dentries - n = dentries.size(); - payload.append((char*)&n, sizeof(int)); - for (vector::iterator it 
= dentries.begin(); - it != dentries.end(); - it++) - (*it)->_encode( payload ); - //dout(12) << n << " dentries in" << endl; - } - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MExportDir.h b/tags/20070517_before_mds_merge/messages/MExportDir.h deleted file mode 100644 index 8fdda89466b1e..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MExportDir.h +++ /dev/null @@ -1,64 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MEXPORTDIR_H -#define __MEXPORTDIR_H - -#include "msg/Message.h" - - -class MExportDir : public Message { - inodeno_t ino; - - list dirstate; // a bl for reach dir - list exports; - - public: - MExportDir() {} - MExportDir(inodeno_t dirino) : - Message(MSG_MDS_EXPORTDIR), - ino(dirino) { - } - virtual char *get_type_name() { return "Ex"; } - - inodeno_t get_ino() { return ino; } - list& get_dirstate() { return dirstate; } - list& get_exports() { return exports; } - - void add_dir(bufferlist& dir) { - dirstate.push_back(dir); - } - void set_dirstate(const list& ls) { - dirstate = ls; - } - void add_export(inodeno_t dirino) { - exports.push_back(dirino); - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - ::_decode(exports, payload, off); - ::_decode(dirstate, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - ::_encode(exports, payload); - ::_encode(dirstate, payload); - } - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MExportDirAck.h b/tags/20070517_before_mds_merge/messages/MExportDirAck.h deleted file mode 100644 
index 35691bf94e2a7..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MExportDirAck.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MEXPORTDIRACK_H -#define __MEXPORTDIRACK_H - -#include "MExportDir.h" - -class MExportDirAck : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MExportDirAck() {} - MExportDirAck(MExportDir *req) : - Message(MSG_MDS_EXPORTDIRACK) { - ino = req->get_ino(); - } - virtual char *get_type_name() { return "ExAck"; } - - virtual void decode_payload(crope& s) { - s.copy(0, sizeof(ino), (char*)&ino); - } - virtual void encode_payload(crope& s) { - s.append((char*)&ino, sizeof(ino)); - } - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MExportDirDiscover.h b/tags/20070517_before_mds_merge/messages/MExportDirDiscover.h deleted file mode 100644 index 24f77036455f4..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MExportDirDiscover.h +++ /dev/null @@ -1,51 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MEXPORTDIRDISCOVER_H -#define __MEXPORTDIRDISCOVER_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirDiscover : public Message { - inodeno_t ino; - string path; - - public: - inodeno_t get_ino() { return ino; } - string& get_path() { return path; } - - MExportDirDiscover() {} - MExportDirDiscover(CInode *in) : - Message(MSG_MDS_EXPORTDIRDISCOVER) { - in->make_path(path); - ino = in->ino(); - } - virtual char *get_type_name() { return "ExDis"; } - - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - _unrope(path, s, off); - } - - virtual void encode_payload(crope& s) { - s.append((char*)&ino, sizeof(ino)); - _rope(path, s); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MExportDirDiscoverAck.h b/tags/20070517_before_mds_merge/messages/MExportDirDiscoverAck.h deleted file mode 100644 index a25e3b46672e3..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MExportDirDiscoverAck.h +++ /dev/null @@ -1,52 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MEXPORTDIRDISCOVERACK_H -#define __MEXPORTDIRDISCOVERACK_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirDiscoverAck : public Message { - inodeno_t ino; - bool success; - - public: - inodeno_t get_ino() { return ino; } - bool is_success() { return success; } - - MExportDirDiscoverAck() {} - MExportDirDiscoverAck(inodeno_t ino, bool success=true) : - Message(MSG_MDS_EXPORTDIRDISCOVERACK) { - this->ino = ino; - this->success = false; - } - virtual char *get_type_name() { return "ExDisA"; } - - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - s.copy(off, sizeof(success), (char*)&success); - off += sizeof(success); - } - - virtual void encode_payload(crope& s) { - s.append((char*)&ino, sizeof(ino)); - s.append((char*)&success, sizeof(success)); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MExportDirFinish.h b/tags/20070517_before_mds_merge/messages/MExportDirFinish.h deleted file mode 100644 index 89c9e5290c4b2..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MExportDirFinish.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MEXPORTDIRFINISH_H -#define __MEXPORTDIRFINISH_H - -#include "MExportDir.h" - -class MExportDirFinish : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MExportDirFinish() {} - MExportDirFinish(inodeno_t ino) : - Message(MSG_MDS_EXPORTDIRFINISH) { - this->ino = ino; - } - virtual char *get_type_name() { return "ExFin"; } - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - } - virtual void encode_payload(crope& s) { - s.append((char*)&ino, sizeof(ino)); - } - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MExportDirNotify.h b/tags/20070517_before_mds_merge/messages/MExportDirNotify.h deleted file mode 100644 index 9d6532cad478c..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MExportDirNotify.h +++ /dev/null @@ -1,111 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MEXPORTDIRNOTIFY_H -#define __MEXPORTDIRNOTIFY_H - -#include "msg/Message.h" -#include -using namespace std; - -class MExportDirNotify : public Message { - int new_auth; - int old_auth; - inodeno_t ino; - - list exports; // bounds; these dirs are _not_ included (tho the inodes are) - list subdirs; - - public: - inodeno_t get_ino() { return ino; } - int get_new_auth() { return new_auth; } - int get_old_auth() { return old_auth; } - list& get_exports() { return exports; } - list::iterator subdirs_begin() { return subdirs.begin(); } - list::iterator subdirs_end() { return subdirs.end(); } - int num_subdirs() { return subdirs.size(); } - - MExportDirNotify() {} - MExportDirNotify(inodeno_t ino, int old_auth, int new_auth) : - Message(MSG_MDS_EXPORTDIRNOTIFY) { - this->ino = ino; - this->old_auth = old_auth; - this->new_auth = new_auth; - } - virtual char *get_type_name() { return "ExNot"; } - - void copy_subdirs(list& s) { - this->subdirs = s; - } - void copy_exports(list& ex) { - this->exports = ex; - } - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(int), (char*)&new_auth); - off += sizeof(int); - s.copy(off, sizeof(int), (char*)&old_auth); - off += sizeof(int); - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - - // notify - int n; - s.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i::iterator it = exports.begin(); - it != exports.end(); - it++) { - inodeno_t ino = *it; - s.append((char*)&ino, sizeof(ino)); - } - - // subdirs - n = subdirs.size(); - s.append((char*)&n, sizeof(int)); - for (list::iterator it = subdirs.begin(); - it != subdirs.end(); - it++) { - inodeno_t ino = *it; - s.append((char*)&ino, sizeof(ino)); - } - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MExportDirNotifyAck.h b/tags/20070517_before_mds_merge/messages/MExportDirNotifyAck.h deleted file mode 100644 index 3179fd4f544f1..0000000000000 --- 
a/tags/20070517_before_mds_merge/messages/MExportDirNotifyAck.h +++ /dev/null @@ -1,46 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MEXPORTDIRNOTIFYACK_H -#define __MEXPORTDIRNOTIFYACK_H - -#include "msg/Message.h" -#include -using namespace std; - -class MExportDirNotifyAck : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MExportDirNotifyAck() {} - MExportDirNotifyAck(inodeno_t ino) : - Message(MSG_MDS_EXPORTDIRNOTIFYACK) { - this->ino = ino; - } - virtual char *get_type_name() { return "ExNotA"; } - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - } - - virtual void encode_payload(crope& s) { - s.append((char*)&ino, sizeof(ino)); - } - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MExportDirPrep.h b/tags/20070517_before_mds_merge/messages/MExportDirPrep.h deleted file mode 100644 index 6967d950afad9..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MExportDirPrep.h +++ /dev/null @@ -1,186 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MEXPORTDIRPREP_H -#define __MEXPORTDIRPREP_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirPrep : public Message { - inodeno_t ino; - - /* nested export discover payload. - not all inodes will have dirs; they may require a separate discover. - dentries are the links to each inode. - dirs map includes base dir (ino) - */ - list exports; - - list inodes; - map inode_dirino; - map inode_dentry; - - map dirs; - - bool b_did_assim; - - public: - inodeno_t get_ino() { return ino; } - list& get_exports() { return exports; } - list& get_inodes() { return inodes; } - inodeno_t get_containing_dirino(inodeno_t ino) { - return inode_dirino[ino]; - } - string& get_dentry(inodeno_t ino) { - return inode_dentry[ino]; - } - bool have_dir(inodeno_t ino) { - return dirs.count(ino); - } - CDirDiscover* get_dir(inodeno_t ino) { - return dirs[ino]; - } - - bool did_assim() { return b_did_assim; } - void mark_assim() { b_did_assim = true; } - - MExportDirPrep() { - b_did_assim = false; - } - MExportDirPrep(CInode *in) : - Message(MSG_MDS_EXPORTDIRPREP) { - ino = in->ino(); - b_did_assim = false; - } - ~MExportDirPrep() { - for (list::iterator iit = inodes.begin(); - iit != inodes.end(); - iit++) - delete *iit; - for (map::iterator dit = dirs.begin(); - dit != dirs.end(); - dit++) - delete dit->second; - } - - - virtual char *get_type_name() { return "ExP"; } - - - - - void add_export(inodeno_t dirino) { - exports.push_back( dirino ); - } - void add_inode(inodeno_t dirino, const string& dentry, CInodeDiscover *in) { - inodes.push_back(in); - inode_dirino.insert(pair(in->get_ino(), dirino)); - inode_dentry.insert(pair(in->get_ino(), dentry)); - } - void add_dir(CDirDiscover *dir) { - dirs.insert(pair(dir->get_ino(), dir)); - } - - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - - // exports - int ne; - payload.copy(off, sizeof(int), 
(char*)&ne); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - inodes.push_back(in); - - // dentry - string d; - _decode(d, payload, off); - inode_dentry[in->get_ino()] = d; - - // dir ino - inodeno_t dino; - payload.copy(off, sizeof(dino), (char*)&dino); - off += sizeof(dino); - inode_dirino[in->get_ino()] = dino; - } - - // dirs - int nd; - payload.copy(off, sizeof(int), (char*)&nd); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - dirs[dir->get_ino()] = dir; - } - } - - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - - // exports - int ne = exports.size(); - payload.append((char*)&ne, sizeof(int)); - for (list::iterator it = exports.begin(); - it != exports.end(); - it++) { - inodeno_t ino = *it; - payload.append((char*)&ino, sizeof(ino)); - } - - // inodes - int ni = inodes.size(); - payload.append((char*)&ni, sizeof(int)); - for (list::iterator iit = inodes.begin(); - iit != inodes.end(); - iit++) { - (*iit)->_encode(payload); - - // dentry - _encode(inode_dentry[(*iit)->get_ino()], payload); - - // dir ino - inodeno_t ino = inode_dirino[(*iit)->get_ino()]; - payload.append((char*)&ino, sizeof(ino)); - } - - // dirs - int nd = dirs.size(); - payload.append((char*)&nd, sizeof(int)); - for (map::iterator dit = dirs.begin(); - dit != dirs.end(); - dit++) - dit->second->_encode(payload); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MExportDirPrepAck.h b/tags/20070517_before_mds_merge/messages/MExportDirPrepAck.h deleted file mode 100644 index c32d7255c5074..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MExportDirPrepAck.h +++ /dev/null @@ -1,44 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as 
published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MEXPORTDIRPREPACK_H -#define __MEXPORTDIRPREPACK_H - -#include "msg/Message.h" -#include "include/types.h" - -class MExportDirPrepAck : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MExportDirPrepAck() {} - MExportDirPrepAck(inodeno_t ino) : - Message(MSG_MDS_EXPORTDIRPREPACK) { - this->ino = ino; - } - - virtual char *get_type_name() { return "ExPAck"; } - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - } - virtual void encode_payload(crope& s) { - s.append((char*)&ino, sizeof(ino)); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MExportDirWarning.h b/tags/20070517_before_mds_merge/messages/MExportDirWarning.h deleted file mode 100644 index 6f2fdf55dde4f..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MExportDirWarning.h +++ /dev/null @@ -1,45 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MEXPORTDIRWARNING_H -#define __MEXPORTDIRWARNING_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirWarning : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MExportDirWarning() {} - MExportDirWarning(inodeno_t ino) : - Message(MSG_MDS_EXPORTDIRWARNING) { - this->ino = ino; - } - - virtual char *get_type_name() { return "ExW"; } - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - } - virtual void encode_payload(crope& s) { - s.append((char*)&ino, sizeof(ino)); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MFailure.h b/tags/20070517_before_mds_merge/messages/MFailure.h deleted file mode 100644 index 0ec53f6e36b18..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MFailure.h +++ /dev/null @@ -1,49 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MFAILURE_H -#define __MFAILURE_H - -#include "msg/Message.h" - - -class MFailure : public Message { - public: - entity_name_t failed; - entity_inst_t inst; - - MFailure() {} - MFailure(entity_name_t f, entity_inst_t& i) : - Message(MSG_FAILURE), - failed(f), inst(i) {} - - entity_name_t get_failed() { return failed; } - entity_inst_t& get_inst() { return inst; } - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(failed), (char*)&failed); - off += sizeof(failed); - payload.copy(off, sizeof(inst), (char*)&inst); - off += sizeof(inst); - } - void encode_payload() { - payload.append((char*)&failed, sizeof(failed)); - payload.append((char*)&inst, sizeof(inst)); - } - - virtual char *get_type_name() { return "fail"; } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MFailureAck.h b/tags/20070517_before_mds_merge/messages/MFailureAck.h deleted file mode 100644 index ec0036dcdac55..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MFailureAck.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MFAILUREACK_H -#define __MFAILUREACK_H - -#include "MFailure.h" - - -class MFailureAck : public Message { - public: - entity_name_t failed; - MFailureAck(MFailure *m) : Message(MSG_FAILURE_ACK) { - this->failed = m->get_failed(); - } - MFailureAck() {} - - entity_name_t get_failed() { return failed; } - - virtual void decode_payload(crope& s, int& off) { - s.copy(0, sizeof(failed), (char*)&failed); - off += sizeof(failed); - } - virtual void encode_payload(crope& s) { - s.append((char*)&failed, sizeof(failed)); - } - - virtual char *get_type_name() { return "faila"; } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MGenericMessage.h b/tags/20070517_before_mds_merge/messages/MGenericMessage.h deleted file mode 100644 index b2f39534e6e23..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MGenericMessage.h +++ /dev/null @@ -1,44 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MGENERICMESSAGE_H -#define __MGENERICMESSAGE_H - -#include "msg/Message.h" - -class MGenericMessage : public Message { - char tname[20]; - //long pcid; - - public: - MGenericMessage(int t) : Message(t) { - sprintf(tname, "generic%d", get_type()); - } - - //void set_pcid(long pcid) { this->pcid = pcid; } - //long get_pcid() { return pcid; } - - char *get_type_name() { return tname; } - - virtual void decode_payload() { - //int off = 0; - //payload.copy(off, sizeof(pcid), (char*)&pcid); - //off += sizeof(pcid); - } - virtual void encode_payload() { - //payload.append((char*)&pcid, sizeof(pcid)); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MHashDir.h b/tags/20070517_before_mds_merge/messages/MHashDir.h deleted file mode 100644 index ddf7e3ac2bbce..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MHashDir.h +++ /dev/null @@ -1,64 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MHASHDIR_H -#define __MHASHDIR_H - -#include "msg/Message.h" - -class MHashDir : public Message { - inodeno_t ino; - bufferlist state; - int nden; - - public: - MHashDir() {} - MHashDir(inodeno_t ino) : - Message(MSG_MDS_HASHDIR) { - this->ino = ino; - nden = 0; - } - virtual char *get_type_name() { return "Ha"; } - - inodeno_t get_ino() { return ino; } - bufferlist& get_state() { return state; } - bufferlist* get_state_ptr() { return &state; } - int get_nden() { return nden; } - - void set_nden(int n) { nden = n; } - void inc_nden() { nden++; } - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - payload.copy(off, sizeof(nden), (char*)&nden); - off += sizeof(nden); - - size_t len; - payload.copy(off, sizeof(len), (char*)&len); - off += sizeof(len); - state.substr_of(payload, off, len); - } - void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - payload.append((char*)&nden, sizeof(nden)); - size_t size = state.length(); - payload.append((char*)&size, sizeof(size)); - payload.claim_append(state); - } - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MHashDirAck.h b/tags/20070517_before_mds_merge/messages/MHashDirAck.h deleted file mode 100644 index cd6d4da8cf34f..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MHashDirAck.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MHASHDIRACK_H -#define __MHASHDIRACK_H - -#include "MHashDir.h" - -class MHashDirAck : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MHashDirAck() {} - MHashDirAck(inodeno_t ino) : - Message(MSG_MDS_HASHDIRACK) { - this->ino = ino; - } - virtual char *get_type_name() { return "HAck"; } - - virtual void decode_payload() { - payload.copy(0, sizeof(ino), (char*)&ino); - } - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - } - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MHashDirDiscover.h b/tags/20070517_before_mds_merge/messages/MHashDirDiscover.h deleted file mode 100644 index 0ea1ff8b79990..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MHashDirDiscover.h +++ /dev/null @@ -1,52 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MHASHDIRDISCOVER_H -#define __MHASHDIRDISCOVER_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MHashDirDiscover : public Message { - inodeno_t ino; - string path; - - public: - inodeno_t get_ino() { return ino; } - string& get_path() { return path; } - - MHashDirDiscover() {} - MHashDirDiscover(CInode *in) : - Message(MSG_MDS_HASHDIRDISCOVER) { - in->make_path(path); - ino = in->ino(); - } - virtual char *get_type_name() { return "HDis"; } - - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - _decode(path, payload, off); - } - - void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - _encode(path, payload); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MHashDirDiscoverAck.h b/tags/20070517_before_mds_merge/messages/MHashDirDiscoverAck.h deleted file mode 100644 index 34734af0f97ad..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MHashDirDiscoverAck.h +++ /dev/null @@ -1,53 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MHASHDIRDISCOVERACK_H -#define __MHASHDIRDISCOVERACK_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MHashDirDiscoverAck : public Message { - inodeno_t ino; - bool success; - - public: - inodeno_t get_ino() { return ino; } - bool is_success() { return success; } - - MHashDirDiscoverAck() {} - MHashDirDiscoverAck(inodeno_t ino, bool success=true) : - Message(MSG_MDS_HASHDIRDISCOVERACK) { - this->ino = ino; - this->success = false; - } - virtual char *get_type_name() { return "HDisA"; } - - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - payload.copy(off, sizeof(success), (char*)&success); - off += sizeof(success); - } - - void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - payload.append((char*)&success, sizeof(success)); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MHashDirNotify.h b/tags/20070517_before_mds_merge/messages/MHashDirNotify.h deleted file mode 100644 index ececc3ec2cc65..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MHashDirNotify.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MHASHDIRNOTIFY_H -#define __MHASHDIRNOTIFY_H - -#include "msg/Message.h" - -class MHashDirNotify : public Message { - inodeno_t ino; - int from; - - public: - inodeno_t get_ino() { return ino; } - int get_from() { return from; } - - MHashDirNotify() {} - MHashDirNotify(inodeno_t ino, int from) : - Message(MSG_MDS_HASHDIRNOTIFY) { - this->ino = ino; - this->from = from; - } - virtual char *get_type_name() { return "HN"; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - payload.copy(off, sizeof(from), (char*)&from); - off += sizeof(from); - } - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - payload.append((char*)&from, sizeof(from)); - } - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MHashDirPrep.h b/tags/20070517_before_mds_merge/messages/MHashDirPrep.h deleted file mode 100644 index 29a42217d6a4b..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MHashDirPrep.h +++ /dev/null @@ -1,93 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MHASHDIRPREP_H -#define __MHASHDIRPREP_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MHashDirPrep : public Message { - inodeno_t ino; - bool assim; - - // subdir dentry names + inodes - map inodes; - - public: - inodeno_t get_ino() { return ino; } - map& get_inodes() { return inodes; } - - bool did_assim() { return assim; } - void mark_assim() { assert(!assim); assim = true; } - - MHashDirPrep() : assim(false) { } - MHashDirPrep(inodeno_t ino) : - Message(MSG_MDS_HASHDIRPREP), - assim(false) { - this->ino = ino; - } - ~MHashDirPrep() { - for (map::iterator it = inodes.begin(); - it != inodes.end(); - it++) - delete it->second; - } - - - virtual char *get_type_name() { return "HP"; } - - void add_inode(const string& dentry, CInodeDiscover *in) { - inodes[dentry] = in; - } - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - - // inodes - int ni; - payload.copy(off, sizeof(int), (char*)&ni); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - - inodes[dname] = in; - } - } - - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - - // inodes - int ni = inodes.size(); - payload.append((char*)&ni, sizeof(int)); - for (map::iterator iit = inodes.begin(); - iit != inodes.end(); - iit++) { - _encode(iit->first, payload); // dentry - iit->second->_encode(payload); // inode - } - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MHashDirPrepAck.h b/tags/20070517_before_mds_merge/messages/MHashDirPrepAck.h deleted file mode 100644 index 1d0db35c10f88..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MHashDirPrepAck.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it 
under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MHASHDIRPREPACK_H -#define __MHASHDIRPREPACK_H - -#include "msg/Message.h" -#include "include/types.h" - -class MHashDirPrepAck : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MHashDirPrepAck() {} - MHashDirPrepAck(inodeno_t ino) : - Message(MSG_MDS_HASHDIRPREPACK) { - this->ino = ino; - } - - virtual char *get_type_name() { return "HPAck"; } - - void decode_payload() { - payload.copy(0, sizeof(ino), (char*)&ino); - } - void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MHashReaddir.h b/tags/20070517_before_mds_merge/messages/MHashReaddir.h deleted file mode 100644 index 864cb6944aeda..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MHashReaddir.h +++ /dev/null @@ -1,44 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MHASHREADDIR_H -#define __MHASHREADDIR_H - -#include "include/types.h" -#include "msg/Message.h" - -class MHashReaddir : public Message { - inodeno_t ino; - - public: - MHashReaddir() { } - MHashReaddir(inodeno_t ino) : - Message(MSG_MDS_HASHREADDIR) { - this->ino = ino; - } - - inodeno_t get_ino() { return ino; } - - virtual char *get_type_name() { return "Hls"; } - - virtual void decode_payload() { - payload.copy(0, sizeof(ino), (char*)&ino); - } - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - } - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MHashReaddirReply.h b/tags/20070517_before_mds_merge/messages/MHashReaddirReply.h deleted file mode 100644 index d9d73d8528f00..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MHashReaddirReply.h +++ /dev/null @@ -1,80 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MHASHREADDIRREPLY_H -#define __MHASHREADDIRREPLY_H - -#include "MClientReply.h" - -class MHashReaddirReply : public Message { - inodeno_t ino; - - list dir_in; - list dir_dn; - - int num; - - public: - MHashReaddirReply() { } - MHashReaddirReply(inodeno_t _ino, list& inls, list& dnls, int n) : - Message(MSG_MDS_HASHREADDIRREPLY), - ino(_ino), - num(n) { - dir_in.swap(inls); - dir_dn.swap(dnls); - } - ~MHashReaddirReply() { - for (list::iterator it = dir_in.begin(); it != dir_in.end(); it++) - delete *it; - } - - inodeno_t get_ino() { return ino; } - list& get_in() { return dir_in; } - list& get_dn() { return dir_dn; } - - virtual char *get_type_name() { return "Hls"; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - int n; - payload.copy(n, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i_decode(payload, off); - dir_in.push_back(ci); - } - } - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - int n = dir_in.size(); // FIXME? - payload.append((char*)&n, sizeof(n)); - list::iterator pdn = dir_dn.begin(); - for (list::iterator pin = dir_in.begin(); - pin != dir_in.end(); - ++pin, ++pdn) { - ::_encode(*pdn, payload); - (*pin)->_encode(payload); - } - } - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MHeartbeat.h b/tags/20070517_before_mds_merge/messages/MHeartbeat.h deleted file mode 100644 index 55455f406ef18..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MHeartbeat.h +++ /dev/null @@ -1,81 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MHEARTBEAT_H -#define __MHEARTBEAT_H - -#include "include/types.h" -#include "msg/Message.h" - -class MHeartbeat : public Message { - mds_load_t load; - int beat; - map import_map; - - public: - mds_load_t& get_load() { return load; } - int get_beat() { return beat; } - - map& get_import_map() { - return import_map; - } - - MHeartbeat() {} - MHeartbeat(mds_load_t& load, int beat) : - Message(MSG_MDS_HEARTBEAT) { - this->load = load; - this->beat = beat; - } - - virtual char *get_type_name() { return "HB"; } - - virtual void decode_payload(crope& s, int& off) { - s.copy(off,sizeof(load), (char*)&load); - off += sizeof(load); - s.copy(off, sizeof(beat), (char*)&beat); - off += sizeof(beat); - - int n; - s.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - while (n--) { - int f; - s.copy(off, sizeof(f), (char*)&f); - off += sizeof(f); - float v; - s.copy(off, sizeof(v), (char*)&v); - off += sizeof(v); - import_map[f] = v; - } - } - virtual void encode_payload(crope& s) { - s.append((char*)&load, sizeof(load)); - s.append((char*)&beat, sizeof(beat)); - - int n = import_map.size(); - s.append((char*)&n, sizeof(n)); - for (map::iterator it = import_map.begin(); - it != import_map.end(); - it++) { - int f = it->first; - s.append((char*)&f, sizeof(f)); - float v = it->second; - s.append((char*)&v, sizeof(v)); - } - - } - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MInodeExpire.h b/tags/20070517_before_mds_merge/messages/MInodeExpire.h deleted file mode 100644 index 637f378324022..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MInodeExpire.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software 
- * Foundation. See file COPYING. - * - */ - - -#ifndef __MINODEEXPIRE_H -#define __MINODEEXPIRE_H - -typedef struct { - inodeno_t ino; - int nonce; - int from; -} MInodeExpire_st; - -class MInodeExpire : public Message { - MInodeExpire_st st; - - public: - inodeno_t get_ino() { return st.ino; } - int get_from() { return st.from; } - int get_nonce() { return st.nonce; } - - MInodeExpire() {} - MInodeExpire(inodeno_t ino, int from, int nonce) : - Message(MSG_MDS_INODEEXPIRE) { - st.ino = ino; - st.from = from; - st.nonce = nonce; - } - virtual char *get_type_name() { return "InEx";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - } - virtual void encode_payload(crope& s) { - s.append((char*)&st,sizeof(st)); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MInodeFileCaps.h b/tags/20070517_before_mds_merge/messages/MInodeFileCaps.h deleted file mode 100644 index 5bd51be0e347b..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MInodeFileCaps.h +++ /dev/null @@ -1,55 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MINODEFILECAPS_H -#define __MINODEFILECAPS_H - -class MInodeFileCaps : public Message { - inodeno_t ino; - int from; - int caps; - - public: - inodeno_t get_ino() { return ino; } - int get_from() { return from; } - int get_caps() { return caps; } - - MInodeFileCaps() {} - // from auth - MInodeFileCaps(inodeno_t ino, int from, int caps) : - Message(MSG_MDS_INODEFILECAPS) { - - this->ino = ino; - this->from = from; - this->caps = caps; - } - - virtual char *get_type_name() { return "Icap";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(from), (char*)&from); - off += sizeof(from); - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - s.copy(off, sizeof(caps), (char*)&caps); - off += sizeof(caps); - } - virtual void encode_payload(crope& s) { - s.append((char*)&from, sizeof(from)); - s.append((char*)&ino, sizeof(ino)); - s.append((char*)&caps, sizeof(caps)); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MInodeLink.h b/tags/20070517_before_mds_merge/messages/MInodeLink.h deleted file mode 100644 index feefc4ea21c7b..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MInodeLink.h +++ /dev/null @@ -1,47 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MINODELINK_H -#define __MINODELINK_H - -typedef struct { - inodeno_t ino; - int from; -} MInodeLink_st; - -class MInodeLink : public Message { - MInodeLink_st st; - - public: - inodeno_t get_ino() { return st.ino; } - int get_from() { return st.from; } - - MInodeLink() {} - MInodeLink(inodeno_t ino, int from) : - Message(MSG_MDS_INODELINK) { - st.ino = ino; - st.from = from; - } - virtual char *get_type_name() { return "InL";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - } - virtual void encode_payload(crope& s) { - s.append((char*)&st,sizeof(st)); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MInodeLinkAck.h b/tags/20070517_before_mds_merge/messages/MInodeLinkAck.h deleted file mode 100644 index 987b70741edcb..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MInodeLinkAck.h +++ /dev/null @@ -1,47 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MINODELINKACK_H -#define __MINODELINKACK_H - -typedef struct { - inodeno_t ino; - bool success; -} MInodeLinkAck_st; - -class MInodeLinkAck : public Message { - MInodeLinkAck_st st; - - public: - inodeno_t get_ino() { return st.ino; } - bool is_success() { return st.success; } - - MInodeLinkAck() {} - MInodeLinkAck(inodeno_t ino, bool success) : - Message(MSG_MDS_INODELINKACK) { - st.ino = ino; - st.success = success; - } - virtual char *get_type_name() { return "InLA";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - } - virtual void encode_payload(crope& s) { - s.append((char*)&st,sizeof(st)); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MInodeUnlink.h b/tags/20070517_before_mds_merge/messages/MInodeUnlink.h deleted file mode 100644 index e1aa463153c26..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MInodeUnlink.h +++ /dev/null @@ -1,47 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MINODEUNLINK_H -#define __MINODEUNLINK_H - -typedef struct { - inodeno_t ino; - int from; -} MInodeUnlink_st; - -class MInodeUnlink : public Message { - MInodeUnlink_st st; - - public: - inodeno_t get_ino() { return st.ino; } - int get_from() { return st.from; } - - MInodeUnlink() {} - MInodeUnlink(inodeno_t ino, int from) : - Message(MSG_MDS_INODEUNLINK) { - st.ino = ino; - st.from = from; - } - virtual char *get_type_name() { return "InUl";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - } - virtual void encode_payload(crope& s) { - s.append((char*)&st,sizeof(st)); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MInodeUnlinkAck.h b/tags/20070517_before_mds_merge/messages/MInodeUnlinkAck.h deleted file mode 100644 index 283c016f2bec9..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MInodeUnlinkAck.h +++ /dev/null @@ -1,44 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MINODEUNLINKACK_H -#define __MINODEUNLINKACK_H - -typedef struct { - inodeno_t ino; -} MInodeUnlinkAck_st; - -class MInodeUnlinkAck : public Message { - MInodeUnlinkAck_st st; - - public: - inodeno_t get_ino() { return st.ino; } - - MInodeUnlinkAck() {} - MInodeUnlinkAck(inodeno_t ino) : - Message(MSG_MDS_INODEUNLINKACK) { - st.ino = ino; - } - virtual char *get_type_name() { return "InUlA";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - } - virtual void encode_payload(crope& s) { - s.append((char*)&st,sizeof(st)); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MInodeUpdate.h b/tags/20070517_before_mds_merge/messages/MInodeUpdate.h deleted file mode 100644 index bbab924089aa5..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MInodeUpdate.h +++ /dev/null @@ -1,61 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MINODEUPDATE_H -#define __MINODEUPDATE_H - -#include "msg/Message.h" - -#include -using namespace std; - -class MInodeUpdate : public Message { - int nonce; - crope inode_basic_state; - - public: - inodeno_t get_ino() { - inodeno_t ino; - inode_basic_state.copy(0, sizeof(inodeno_t), (char*)&ino); - return ino; - } - int get_nonce() { return nonce; } - - MInodeUpdate() {} - MInodeUpdate(CInode *in, int nonce) : - Message(MSG_MDS_INODEUPDATE) { - inode_basic_state = in->encode_basic_state(); - this->nonce = nonce; - } - virtual char *get_type_name() { return "Iup"; } - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(int), (char*)&nonce); - off += sizeof(int); - size_t len; - s.copy(off, sizeof(len), (char*)&len); - off += sizeof(len); - inode_basic_state = s.substr(off, len); - off += len; - } - virtual void encode_payload(crope& s) { - s.append((char*)&nonce, sizeof(int)); - size_t len = inode_basic_state.length(); - s.append((char*)&len, sizeof(len)); - s.append(inode_basic_state); - } - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MLock.h b/tags/20070517_before_mds_merge/messages/MLock.h deleted file mode 100644 index 1d22d297d79d4..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MLock.h +++ /dev/null @@ -1,128 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MLOCK_H -#define __MLOCK_H - -#include "msg/Message.h" - -#define LOCK_OTYPE_IHARD 1 -#define LOCK_OTYPE_IFILE 2 -#define LOCK_OTYPE_DIR 3 -#define LOCK_OTYPE_DN 4 - -// for replicas -#define LOCK_AC_SYNC 0 -#define LOCK_AC_MIXED 1 -#define LOCK_AC_LOCK 2 - -#define LOCK_AC_REQXLOCKACK 9 // req dentry xlock -#define LOCK_AC_REQXLOCKNAK 10 // req dentry xlock -#define LOCK_AC_LOCKNAK 12 // for dentry xlock - - -#define LOCK_AC_FOR_REPLICA(a) ((a) <= 10) -#define LOCK_AC_FOR_AUTH(a) ((a) >= 11) - -// for auth - -#define LOCK_AC_SYNCACK 13 -#define LOCK_AC_MIXEDACK 14 -#define LOCK_AC_LOCKACK 15 - - -#define LOCK_AC_REQREAD 19 -#define LOCK_AC_REQWRITE 20 - -#define LOCK_AC_REQXLOCK 21 -#define LOCK_AC_REQXLOCKC 22 // create if necessary -#define LOCK_AC_UNXLOCK 23 - -#define lock_ac_name(x) - - -class MLock : public Message { - int asker; // who is initiating this request - int action; // action type - - char otype; // lock object type - inodeno_t ino; // ino ref, or possibly - string dn; // dentry name - bufferlist data; // and possibly some data - string path; // possibly a path too (for dentry lock discovers) - - public: - inodeno_t get_ino() { return ino; } - string& get_dn() { return dn; } - bufferlist& get_data() { return data; } - int get_asker() { return asker; } - int get_action() { return action; } - int get_otype() { return otype; } - string& get_path() { return path; } - - MLock() {} - MLock(int action, int asker) : - Message(MSG_MDS_LOCK) { - this->action = action; - this->asker = asker; - } - virtual char *get_type_name() { return "ILock"; } - - void set_ino(inodeno_t ino, char ot) { - otype = ot; - this->ino = ino; - } - void set_dirino(inodeno_t dirino) { - otype = LOCK_OTYPE_DIR; - this->ino = ino; - } - void set_dn(inodeno_t dirino, string& dn) { - otype = LOCK_OTYPE_DN; - this->ino = dirino; - this->dn = dn; - } - void set_data(bufferlist& data) { - this->data.claim( data ); - } - void set_path(const string& p) { - path = p; 
- } - - void decode_payload() { - int off = 0; - payload.copy(off,sizeof(action), (char*)&action); - off += sizeof(action); - payload.copy(off,sizeof(asker), (char*)&asker); - off += sizeof(asker); - payload.copy(off,sizeof(otype), (char*)&otype); - off += sizeof(otype); - payload.copy(off,sizeof(ino), (char*)&ino); - off += sizeof(ino); - ::_decode(dn, payload, off); - ::_decode(path, payload, off); - ::_decode(data, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&action, sizeof(action)); - payload.append((char*)&asker, sizeof(asker)); - payload.append((char*)&otype, sizeof(otype)); - payload.append((char*)&ino, sizeof(inodeno_t)); - ::_encode(dn, payload); - ::_encode(path, payload); - ::_encode(data, payload); - } - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MMDSBeacon.h b/tags/20070517_before_mds_merge/messages/MMDSBeacon.h deleted file mode 100644 index 86eccc689d396..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MMDSBeacon.h +++ /dev/null @@ -1,54 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMDSBEACON_H -#define __MMDSBEACON_H - -#include "msg/Message.h" - -#include "include/types.h" - -#include "mds/MDSMap.h" - -class MMDSBeacon : public Message { - int state; - version_t seq; - - public: - MMDSBeacon() : Message(MSG_MDS_BEACON) {} - MMDSBeacon(int st, version_t se) : Message(MSG_MDS_BEACON), - state(st), seq(se) { } - - int get_state() { return state; } - version_t get_seq() { return seq; } - char *get_type_name() { return "mdsbeacon"; } - - void print(ostream& out) { - out << "mdsbeacon(" << MDSMap::get_state_name(state) - << " seq " << seq << ")"; - } - - void encode_payload() { - payload.append((char*)&state, sizeof(state)); - payload.append((char*)&seq, sizeof(seq)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(state), (char*)&state); - off += sizeof(state); - payload.copy(off, sizeof(seq), (char*)&seq); - off += sizeof(seq); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MMDSBoot.h b/tags/20070517_before_mds_merge/messages/MMDSBoot.h deleted file mode 100644 index c0c554152cc87..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MMDSBoot.h +++ /dev/null @@ -1,38 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMDSBOOT_H -#define __MMDSBOOT_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMDSBoot : public Message { - public: - MMDSBoot() : Message(MSG_MDS_BOOT) { - } - - char *get_type_name() { return "mdsboot"; } - - void encode_payload() { - //payload.append((char*)&sb, sizeof(sb)); - } - void decode_payload() { - //int off = 0; - //payload.copy(off, sizeof(sb), (char*)&sb); - //off += sizeof(sb); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MMDSCacheRejoin.h b/tags/20070517_before_mds_merge/messages/MMDSCacheRejoin.h deleted file mode 100644 index 2789e30844743..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MMDSCacheRejoin.h +++ /dev/null @@ -1,62 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MMDSCACHEREJOIN_H -#define __MMDSCACHEREJOIN_H - -#include "msg/Message.h" - -#include "include/types.h" - -// sent from replica to auth - -class MMDSCacheRejoin : public Message { - public: - map inodes; // ino -> caps_wanted - set dirs; - map > dentries; // dir -> (dentries...) 
- - MMDSCacheRejoin() : Message(MSG_MDS_CACHEREJOIN) {} - - char *get_type_name() { return "cache_rejoin"; } - - void print(ostream& out) { - out << "cache_rejoin" << endl; - } - - void add_dir(inodeno_t dirino) { - dirs.insert(dirino); - } - void add_dentry(inodeno_t dirino, const string& dn) { - dentries[dirino].insert(dn); - } - void add_inode(inodeno_t ino, int cw) { - inodes[ino] = cw; - } - - void encode_payload() { - ::_encode(inodes, payload); - ::_encode(dirs, payload); - for (set::iterator p = dirs.begin(); p != dirs.end(); ++p) - ::_encode(dentries[*p], payload); - } - void decode_payload() { - int off = 0; - ::_decode(inodes, payload, off); - ::_decode(dirs, payload, off); - for (set::iterator p = dirs.begin(); p != dirs.end(); ++p) - ::_decode(dentries[*p], payload, off); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MMDSCacheRejoinAck.h b/tags/20070517_before_mds_merge/messages/MMDSCacheRejoinAck.h deleted file mode 100644 index b8f0d23ebbba0..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MMDSCacheRejoinAck.h +++ /dev/null @@ -1,82 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMDSCACHEREJOINACK_H -#define __MMDSCACHEREJOINACK_H - -#include "msg/Message.h" - -#include "include/types.h" - -// sent from auth back to replica - -class MMDSCacheRejoinAck : public Message { - public: - struct inodeinfo { - inodeno_t ino; - int hardlock; - int filelock; - int nonce; - inodeinfo() {} - inodeinfo(inodeno_t i, int h, int f, int n) : ino(i), hardlock(h), filelock(f), nonce(n) {} - }; - struct dninfo { - int lock; - int nonce; - dninfo() {} - dninfo(int l, int n) : lock(l), nonce(n) {} - }; - struct dirinfo { - inodeno_t dirino; - int nonce; - dirinfo() {} - dirinfo(inodeno_t i, int n) : dirino(i), nonce(n) {} - }; - list inodes; - map > dentries; - list dirs; - - MMDSCacheRejoinAck() : Message(MSG_MDS_CACHEREJOINACK) {} - - char *get_type_name() { return "cache_rejoin_ack"; } - - void print(ostream& out) { - out << "cache_rejoin" << endl; - } - - void add_dir(inodeno_t dirino, int nonce) { - dirs.push_back(dirinfo(dirino,nonce)); - } - void add_dentry(inodeno_t dirino, const string& dn, int ls, int nonce) { - dentries[dirino][dn] = dninfo(ls, nonce); - } - void add_inode(inodeno_t ino, int hl, int fl, int nonce) { - inodes.push_back(inodeinfo(ino, hl, fl, nonce)); - } - - void encode_payload() { - ::_encode(inodes, payload); - ::_encode(dirs, payload); - for (list::iterator p = dirs.begin(); p != dirs.end(); ++p) - ::_encode(dentries[p->dirino], payload); - } - void decode_payload() { - int off = 0; - ::_decode(inodes, payload, off); - ::_decode(dirs, payload, off); - for (list::iterator p = dirs.begin(); p != dirs.end(); ++p) - ::_decode(dentries[p->dirino], payload, off); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MMDSGetMap.h b/tags/20070517_before_mds_merge/messages/MMDSGetMap.h deleted file mode 100644 index 6bb6b92c00ccd..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MMDSGetMap.h +++ /dev/null @@ -1,38 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t 
-*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MMDSGETMAP_H -#define __MMDSGETMAP_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMDSGetMap : public Message { - public: - MMDSGetMap() : Message(MSG_MDS_GETMAP) { - } - - char *get_type_name() { return "mdsgetmap"; } - - void encode_payload() { - //payload.append((char*)&sb, sizeof(sb)); - } - void decode_payload() { - //int off = 0; - //payload.copy(off, sizeof(sb), (char*)&sb); - //off += sizeof(sb); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MMDSImportMap.h b/tags/20070517_before_mds_merge/messages/MMDSImportMap.h deleted file mode 100644 index 22774cdabc2ec..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MMDSImportMap.h +++ /dev/null @@ -1,59 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMDSIMPORTMAP_H -#define __MMDSIMPORTMAP_H - -#include "msg/Message.h" - -#include "include/types.h" - - -class MMDSImportMap : public Message { - public: - map > imap; - map > ambiguous_imap; - - MMDSImportMap() : Message(MSG_MDS_IMPORTMAP) {} - - char *get_type_name() { return "mdsimportmap"; } - - void print(ostream& out) { - out << "mdsimportmap(" << imap.size() - << "+" << ambiguous_imap.size() - << " imports)"; - } - - void add_import(inodeno_t im) { - imap[im].clear(); - } - void add_import_export(inodeno_t im, inodeno_t ex) { - imap[im].insert(ex); - } - - void add_ambiguous_import(inodeno_t im, const set& m) { - ambiguous_imap[im] = m; - } - - void encode_payload() { - ::_encode(imap, payload); - ::_encode(ambiguous_imap, payload); - } - void decode_payload() { - int off = 0; - ::_decode(imap, payload, off); - ::_decode(ambiguous_imap, payload, off); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MMDSMap.h b/tags/20070517_before_mds_merge/messages/MMDSMap.h deleted file mode 100644 index 701ba9a050cc3..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MMDSMap.h +++ /dev/null @@ -1,78 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMDSMAP_H -#define __MMDSMAP_H - -#include "msg/Message.h" -#include "mds/MDSMap.h" - - -class MMDSMap : public Message { - public: - /* - map maps; - map incremental_maps; - - epoch_t get_first() { - epoch_t e = 0; - map::iterator i = maps.begin(); - if (i != maps.end()) e = i->first; - i = incremental_maps.begin(); - if (i != incremental_maps.end() && - (e == 0 || i->first < e)) e = i->first; - return e; - } - epoch_t get_last() { - epoch_t e = 0; - map::reverse_iterator i = maps.rbegin(); - if (i != maps.rend()) e = i->first; - i = incremental_maps.rbegin(); - if (i != incremental_maps.rend() && - (e == 0 || i->first > e)) e = i->first; - return e; - } - */ - - version_t epoch; - bufferlist encoded; - - version_t get_epoch() const { return epoch; } - bufferlist& get_encoded() { return encoded; } - - MMDSMap() : - Message(MSG_MDS_MAP) {} - MMDSMap(MDSMap *mm) : - Message(MSG_MDS_MAP) { - epoch = mm->get_epoch(); - mm->encode(encoded); - } - - - // marshalling - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - ::_decode(encoded, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - ::_encode(encoded, payload); - } - - virtual char *get_type_name() { return "mdsmap"; } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MMonElectionAck.h b/tags/20070517_before_mds_merge/messages/MMonElectionAck.h deleted file mode 100644 index 2399cca73d60c..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MMonElectionAck.h +++ /dev/null @@ -1,31 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. 
See file COPYING. - * - */ - - -#ifndef __MMONELECTIONACK_H -#define __MMONELECTIONACK_H - -#include "msg/Message.h" - - -class MMonElectionAck : public Message { - public: - MMonElectionAck() : Message(MSG_MON_ELECTION_ACK) {} - - virtual char *get_type_name() { return "election_ack"; } - - void encode_payload() {} - void decode_payload() {} -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MMonElectionCollect.h b/tags/20070517_before_mds_merge/messages/MMonElectionCollect.h deleted file mode 100644 index d91870dfce5c6..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MMonElectionCollect.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMONELECTIONCOLLECT_H -#define __MMONELECTIONCOLLECT_H - -#include "msg/Message.h" - - -class MMonElectionCollect : public Message { - public: - int read_num; - - MMonElectionCollect() {} - MMonElectionCollect(int n) : - Message(MSG_MON_ELECTION_COLLECT), - read_num(n) {} - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(read_num), (char*)&read_num); - off += sizeof(read_num); - } - void encode_payload() { - payload.append((char*)&read_num, sizeof(read_num)); - } - - virtual char *get_type_name() { return "MonElCollect"; } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MMonElectionPropose.h b/tags/20070517_before_mds_merge/messages/MMonElectionPropose.h deleted file mode 100644 index d9310f222bc7b..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MMonElectionPropose.h +++ /dev/null @@ -1,32 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMONELECTIONPROPOSE_H -#define __MMONELECTIONPROPOSE_H - -#include "msg/Message.h" - - -class MMonElectionPropose : public Message { - public: - MMonElectionPropose() : Message(MSG_MON_ELECTION_PROPOSE) {} - - virtual char *get_type_name() { return "election_propose"; } - - void encode_payload() {} - void decode_payload() {} - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MMonElectionRefresh.h b/tags/20070517_before_mds_merge/messages/MMonElectionRefresh.h deleted file mode 100644 index 497276f06b12f..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MMonElectionRefresh.h +++ /dev/null @@ -1,51 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMONELECTIONREFRESH_H -#define __MMONELECTIONREFRESH_H - -#include "msg/Message.h" - -#include "mon/Elector.h" - -class MMonElectionRefresh : public Message { - public: - int p; - Elector::State state; - int refresh_num; - - MMonElectionRefresh() {} - MMonElectionRefresh(int _p, Elector::State& s, int r) : - Message(MSG_MON_ELECTION_REFRESH), - p(_p), state(s), refresh_num(r) {} - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(p), (char*)&p); - off += sizeof(p); - payload.copy(off, sizeof(state), (char*)&state); - off += sizeof(state); - payload.copy(off, sizeof(refresh_num), (char*)&refresh_num); - off += sizeof(refresh_num); - } - void encode_payload() { - payload.append((char*)&p, sizeof(p)); - payload.append((char*)&state, sizeof(state)); - payload.append((char*)&refresh_num, sizeof(refresh_num)); - } - - virtual char *get_type_name() { return "MonElRefresh"; } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MMonElectionStatus.h b/tags/20070517_before_mds_merge/messages/MMonElectionStatus.h deleted file mode 100644 index 071d0fcc82e0a..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MMonElectionStatus.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMONELECTIONSTATUS_H -#define __MMONELECTIONSTATUS_H - -#include "msg/Message.h" - -#include "mon/Elector.h" - -class MMonElectionStatus : public Message { - public: - int q; - int read_num; - map registry; - - MMonElectionStatus() {} - MMonElectionStatus(int _q, int r, map reg) : - Message(MSG_MON_ELECTION_STATUS), - q(_q), read_num(r), registry(reg) {} - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(q), (char*)&q); - off += sizeof(q); - payload.copy(off, sizeof(read_num), (char*)&read_num); - off += sizeof(read_num); - ::_decode(registry, payload, off); - } - void encode_payload() { - payload.append((char*)&q, sizeof(q)); - payload.append((char*)&read_num, sizeof(read_num)); - ::_encode(registry, payload); - } - - virtual char *get_type_name() { return "MonElStatus"; } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MMonElectionVictory.h b/tags/20070517_before_mds_merge/messages/MMonElectionVictory.h deleted file mode 100644 index 8bdbf2f85a3aa..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MMonElectionVictory.h +++ /dev/null @@ -1,40 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMONELECTIONVICTORY_H -#define __MMONELECTIONVICTORY_H - -#include "msg/Message.h" - - -class MMonElectionVictory : public Message { - public: - //set active_set; - - MMonElectionVictory(/*set& as*/) : Message(MSG_MON_ELECTION_VICTORY)//, - //active_set(as) - {} - - virtual char *get_type_name() { return "election_victory"; } - - void encode_payload() { - //::_encode(active_set, payload); - } - void decode_payload() { - //int off = 0; - //::_decode(active_set, payload, off); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MMonOSDMapInfo.h b/tags/20070517_before_mds_merge/messages/MMonOSDMapInfo.h deleted file mode 100644 index 182b36f0a57cf..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MMonOSDMapInfo.h +++ /dev/null @@ -1,49 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPINFO_H -#define __MMONOSDMAPINFO_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapInfo : public Message { - public: - epoch_t epoch; - epoch_t mon_epoch; - - epoch_t get_epoch() { return epoch; } - epoch_t get_mon_epoch() { return mon_epoch; } - - MMonOSDMapInfo(epoch_t e, epoch_t me) : - Message(MSG_MON_OSDMAP_UPDATE_PREPARE), - epoch(e), mon_epoch(me) { - } - - char *get_type_name() { return "omap_info"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - payload.append((char*)&mon_epoch, sizeof(mon_epoch)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - payload.copy(off, sizeof(mon_epoch), (char*)&mon_epoch); - off += sizeof(mon_epoch); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MMonOSDMapLease.h b/tags/20070517_before_mds_merge/messages/MMonOSDMapLease.h deleted file mode 100644 index c6112bd898cae..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MMonOSDMapLease.h +++ /dev/null @@ -1,49 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPLEASE_H -#define __MMONOSDMAPLEASE_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapLease : public Message { - epoch_t epoch; - utime_t lease_expire; - - public: - epoch_t get_epoch() { return epoch; } - const utime_t& get_lease_expire() { return lease_expire; } - - MMonOSDMapLease(epoch_t e, utime_t le) : - Message(MSG_MON_OSDMAP_LEASE), - epoch(e), lease_expire(le) { - } - - char *get_type_name() { return "omap_lease"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - payload.append((char*)&lease_expire, sizeof(lease_expire)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - payload.copy(off, sizeof(lease_expire), (char*)&lease_expire); - off += sizeof(lease_expire); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MMonOSDMapLeaseAck.h b/tags/20070517_before_mds_merge/messages/MMonOSDMapLeaseAck.h deleted file mode 100644 index 85d5ea7c02809..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MMonOSDMapLeaseAck.h +++ /dev/null @@ -1,44 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPLEASEACK_H -#define __MMONOSDMAPLEASEACK_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapLeaseAck : public Message { - epoch_t epoch; - -public: - epoch_t get_epoch() { return epoch; } - - MMonOSDMapLeaseAck(epoch_t e) : - Message(MSG_MON_OSDMAP_LEASE_ACK), - epoch(e) { - } - - char *get_type_name() { return "omap_lease_ack"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MMonOSDMapUpdateAck.h b/tags/20070517_before_mds_merge/messages/MMonOSDMapUpdateAck.h deleted file mode 100644 index 8673788f0632f..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MMonOSDMapUpdateAck.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPUPDATEACK_H -#define __MMONOSDMAPUPDATEACK_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapUpdateAck : public Message { -public: - epoch_t epoch; - - MMonOSDMapUpdateAck(epoch_t e) : - Message(MSG_MON_OSDMAP_UPDATE_ACK), - epoch(e) { - } - - char *get_type_name() { return "omap_update_ack"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MMonOSDMapUpdateCommit.h b/tags/20070517_before_mds_merge/messages/MMonOSDMapUpdateCommit.h deleted file mode 100644 index 6f12a8e3c784d..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MMonOSDMapUpdateCommit.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPUPDATECOMMIT_H -#define __MMONOSDMAPUPDATECOMMIT_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapUpdateCommit : public Message { - public: - epoch_t epoch; - - MMonOSDMapUpdateCommit(epoch_t e) : - Message(MSG_MON_OSDMAP_UPDATE_COMMIT), - epoch(e) { - } - - char *get_type_name() { return "omap_update_commit"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MMonOSDMapUpdatePrepare.h b/tags/20070517_before_mds_merge/messages/MMonOSDMapUpdatePrepare.h deleted file mode 100644 index bc962ea2b3eb2..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MMonOSDMapUpdatePrepare.h +++ /dev/null @@ -1,52 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPUPDATEPREPARE_H -#define __MMONOSDMAPUPDATEPREPARE_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapUpdatePrepare : public Message { - public: - epoch_t epoch; - bufferlist map_bl; - bufferlist inc_map_bl; - - epoch_t get_epoch() { return epoch; } - - MMonOSDMapUpdatePrepare(epoch_t e, - bufferlist& mbl, bufferlist& incmbl) : - Message(MSG_MON_OSDMAP_UPDATE_PREPARE), - epoch(e), - map_bl(mbl), inc_map_bl(incmbl) { - } - - char *get_type_name() { return "omap_update_prepare"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - ::_encode(map_bl, payload); - ::_encode(inc_map_bl, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - ::_decode(map_bl, payload, off); - ::_decode(inc_map_bl, payload, off); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MMonPaxos.h b/tags/20070517_before_mds_merge/messages/MMonPaxos.h deleted file mode 100644 index b3f6e850a9c5d..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MMonPaxos.h +++ /dev/null @@ -1,80 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMONPAXOS_H -#define __MMONPAXOS_H - -#include "msg/Message.h" - -class MMonPaxos : public Message { - public: - // op types - const static int OP_COLLECT = 1; // proposer: propose round - const static int OP_LAST = 2; // voter: accept proposed round - const static int OP_OLDROUND = 3; // voter: notify proposer he proposed an old round - const static int OP_BEGIN = 4; // proposer: value proposed for this round - const static int OP_ACCEPT = 5; // voter: accept propsed value - const static int OP_SUCCESS = 7; // proposer: notify learners of agreed value - const static int OP_ACK = 8; // learner: notify proposer that new value has been saved - - int op; - int machine_id; - version_t proposal; - version_t n; - bufferlist value; - - MMonPaxos() : Message(MSG_MON_PAXOS) {} - MMonPaxos(int o, int mid, - version_t pn, version_t v) : Message(MSG_MON_PAXOS), - op(o), machine_id(mid), - proposal(pn), n(v) {} - MMonPaxos(int o, int mid, - version_t pn, version_t v, - bufferlist& b) : Message(MSG_MON_PAXOS), - op(o), machine_id(mid), - proposal(pn), n(v), - value(b) {} - - virtual char *get_type_name() { return "paxos"; } - - void print(ostream& out) { - out << "paxos(op " << op - << ", machine " << machine_id - << ", proposal " << proposal - << ", state " << n - << ", " << value.length() << " bytes)"; - } - - void encode_payload() { - payload.append((char*)&op, sizeof(op)); - payload.append((char*)&machine_id, sizeof(machine_id)); - payload.append((char*)&proposal, sizeof(proposal)); - payload.append((char*)&n, sizeof(n)); - ::_encode(value, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(op), (char*)&op); - off += sizeof(op); - payload.copy(off, sizeof(machine_id), (char*)&machine_id); - off += sizeof(machine_id); - payload.copy(off, sizeof(proposal), (char*)&proposal); - off += sizeof(proposal); - payload.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - ::_decode(value, payload, off); - } -}; - -#endif diff 
--git a/tags/20070517_before_mds_merge/messages/MNSConnect.h b/tags/20070517_before_mds_merge/messages/MNSConnect.h deleted file mode 100644 index 28150f79d8476..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MNSConnect.h +++ /dev/null @@ -1,45 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MNSCONNECT_H -#define __MNSCONNECT_H - -#include "msg/Message.h" -#include "msg/tcp.h" - -class MNSConnect : public Message { - tcpaddr_t tcpaddr; - - public: - MNSConnect() {} - MNSConnect(tcpaddr_t t) : - Message(MSG_NS_CONNECT) { - tcpaddr = t; - } - - char *get_type_name() { return "NSCon"; } - - tcpaddr_t& get_addr() { return tcpaddr; } - - void encode_payload() { - payload.append((char*)&tcpaddr, sizeof(tcpaddr)); - } - void decode_payload() { - payload.copy(0, sizeof(tcpaddr), (char*)&tcpaddr); - } -}; - - -#endif - diff --git a/tags/20070517_before_mds_merge/messages/MNSConnectAck.h b/tags/20070517_before_mds_merge/messages/MNSConnectAck.h deleted file mode 100644 index 696b13f2a41e6..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MNSConnectAck.h +++ /dev/null @@ -1,53 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MNSCONNECTACK_H -#define __MNSCONNECTACK_H - -#include "msg/Message.h" -#include "msg/TCPMessenger.h" - -class MNSConnectAck : public Message { - int rank; - int inst; - - public: - MNSConnectAck() {} - MNSConnectAck(int r, int g=0) : - Message(MSG_NS_CONNECTACK) { - rank = r; - inst = g; - } - - char *get_type_name() { return "NSConA"; } - - int get_rank() { return rank; } - int get_inst() { return inst; } - - void encode_payload() { - payload.append((char*)&rank, sizeof(rank)); - payload.append((char*)&inst, sizeof(inst)); - } - void decode_payload() { - unsigned off = 0; - payload.copy(off, sizeof(rank), (char*)&rank); - off += sizeof(rank); - payload.copy(off, sizeof(inst), (char*)&inst); - off += sizeof(inst); - } -}; - - -#endif - diff --git a/tags/20070517_before_mds_merge/messages/MNSFailure.h b/tags/20070517_before_mds_merge/messages/MNSFailure.h deleted file mode 100644 index 405bfcfd2dacb..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MNSFailure.h +++ /dev/null @@ -1,52 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MNSFAILURE_H -#define __MNSFAILURE_H - -#include "msg/Message.h" -#include "msg/tcp.h" - -class MNSFailure : public Message { - //msg_addr_t entity; - entity_inst_t inst; - - public: - MNSFailure() {} - MNSFailure(entity_inst_t& i) : - Message(MSG_NS_FAILURE), - //entity(w), - inst(i) {} - - char *get_type_name() { return "NSFail"; } - - //msg_addr_t &get_entity() { return entity; } - entity_inst_t &get_inst() { return inst; } - - void encode_payload() { - //payload.append((char*)&entity, sizeof(entity)); - payload.append((char*)&inst, sizeof(inst)); - } - void decode_payload() { - unsigned off = 0; - //payload.copy(off, sizeof(entity), (char*)&entity); - //off += sizeof(entity); - payload.copy(off, sizeof(inst), (char*)&inst); - off += sizeof(inst); - } -}; - - -#endif - diff --git a/tags/20070517_before_mds_merge/messages/MNSLookup.h b/tags/20070517_before_mds_merge/messages/MNSLookup.h deleted file mode 100644 index b6df663a15a88..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MNSLookup.h +++ /dev/null @@ -1,46 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MNSLOOKUP_H -#define __MNSLOOKUP_H - -#include "msg/Message.h" - -class MNSLookup : public Message { - entity_name_t entity; - - public: - MNSLookup() {} - MNSLookup(entity_name_t e) : - Message(MSG_NS_LOOKUP) { - entity = e; - } - - char *get_type_name() { return "NSLook"; } - - entity_name_t get_entity() { return entity; } - - void encode_payload() { - payload.append((char*)&entity, sizeof(entity)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(entity), (char*)&entity); - off += sizeof(entity); - } -}; - - -#endif - diff --git a/tags/20070517_before_mds_merge/messages/MNSLookupReply.h b/tags/20070517_before_mds_merge/messages/MNSLookupReply.h deleted file mode 100644 index e6720eba397d8..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MNSLookupReply.h +++ /dev/null @@ -1,44 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MNSLOOKUPREPLY_H -#define __MNSLOOKUPREPLY_H - -#include "msg/Message.h" -#include "msg/TCPMessenger.h" - -class MNSLookupReply : public Message { - public: - map entity_map; - - public: - MNSLookupReply() {} - MNSLookupReply(MNSLookup *m) : - Message(MSG_NS_LOOKUPREPLY) { - } - - char *get_type_name() { return "NSLookR"; } - - void encode_payload() { - ::_encode(entity_map, payload); - } - void decode_payload() { - int off = 0; - ::_decode(entity_map, payload, off); - } -}; - - -#endif - diff --git a/tags/20070517_before_mds_merge/messages/MNSRegister.h b/tags/20070517_before_mds_merge/messages/MNSRegister.h deleted file mode 100644 index 01d29a2315fa9..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MNSRegister.h +++ /dev/null @@ -1,59 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MNSREGISTER_H -#define __MNSREGISTER_H - -#include "msg/Message.h" -#include "msg/TCPMessenger.h" - -class MNSRegister : public Message { - entity_name_t addr; - int rank; - long tid; - - public: - MNSRegister() {} - MNSRegister(entity_name_t a, int r, int ti) : - Message(MSG_NS_REGISTER) { - addr = a; - rank = r; - tid = ti; - } - - char *get_type_name() { return "NSReg"; } - - entity_name_t get_entity() { return addr; } - int get_rank() { return rank; } - long get_tid() { return tid; } - - void encode_payload() { - payload.append((char*)&addr, sizeof(addr)); - payload.append((char*)&rank, sizeof(rank)); - payload.append((char*)&tid, sizeof(tid)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(addr), (char*)&addr); - off += sizeof(addr); - payload.copy(off, sizeof(rank), (char*)&rank); - off += sizeof(rank); - payload.copy(off, sizeof(tid), (char*)&tid); - off += sizeof(tid); - } -}; - - -#endif - diff --git a/tags/20070517_before_mds_merge/messages/MNSRegisterAck.h b/tags/20070517_before_mds_merge/messages/MNSRegisterAck.h deleted file mode 100644 index fa2f88ac10e82..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MNSRegisterAck.h +++ /dev/null @@ -1,53 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MNSREGISTERACK_H -#define __MNSREGISTERACK_H - -#include "msg/Message.h" -#include "msg/TCPMessenger.h" - -class MNSRegisterAck : public Message { - entity_name_t entity; - long tid; - - public: - MNSRegisterAck() {} - MNSRegisterAck(long t, entity_name_t e) : - Message(MSG_NS_REGISTERACK) { - entity = e; - tid = t; - } - - char *get_type_name() { return "NSRegA"; } - - entity_name_t get_entity() { return entity; } - long get_tid() { return tid; } - - void encode_payload() { - payload.append((char*)&entity, sizeof(entity)); - payload.append((char*)&tid, sizeof(tid)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(entity), (char*)&entity); - off += sizeof(entity); - payload.copy(off, sizeof(tid), (char*)&tid); - off += sizeof(tid); - } -}; - - -#endif - diff --git a/tags/20070517_before_mds_merge/messages/MOSDBoot.h b/tags/20070517_before_mds_merge/messages/MOSDBoot.h deleted file mode 100644 index cfff1869fbe51..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MOSDBoot.h +++ /dev/null @@ -1,44 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MOSDBOOT_H -#define __MOSDBOOT_H - -#include "msg/Message.h" - -#include "include/types.h" -#include "osd/osd_types.h" - -class MOSDBoot : public Message { - public: - OSDSuperblock sb; - - MOSDBoot() {} - MOSDBoot(OSDSuperblock& s) : - Message(MSG_OSD_BOOT), - sb(s) { - } - - char *get_type_name() { return "oboot"; } - - void encode_payload() { - payload.append((char*)&sb, sizeof(sb)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(sb), (char*)&sb); - off += sizeof(sb); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MOSDFailure.h b/tags/20070517_before_mds_merge/messages/MOSDFailure.h deleted file mode 100644 index c4a557856594a..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MOSDFailure.h +++ /dev/null @@ -1,49 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDFAILURE_H -#define __MOSDFAILURE_H - -#include "msg/Message.h" - - -class MOSDFailure : public Message { - public: - entity_inst_t failed; - epoch_t epoch; - - MOSDFailure() {} - MOSDFailure(entity_inst_t f, epoch_t e) : - Message(MSG_OSD_FAILURE), - failed(f), epoch(e) {} - - entity_inst_t get_failed() { return failed; } - epoch_t get_epoch() { return epoch; } - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(failed), (char*)&failed); - off += sizeof(failed); - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - } - void encode_payload() { - payload.append((char*)&failed, sizeof(failed)); - payload.append((char*)&epoch, sizeof(epoch)); - } - - virtual char *get_type_name() { return "osdfail"; } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MOSDGetMap.h b/tags/20070517_before_mds_merge/messages/MOSDGetMap.h deleted file mode 100644 index 58afd527bda93..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MOSDGetMap.h +++ /dev/null @@ -1,45 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MOSDGETMAP_H -#define __MOSDGETMAP_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MOSDGetMap : public Message { - public: - epoch_t since; - - //MOSDGetMap() : since(0) {} - MOSDGetMap(epoch_t s=0) : - Message(MSG_OSD_GETMAP), - since(s) { - } - - epoch_t get_since() { return since; } - - char *get_type_name() { return "getomap"; } - - void encode_payload() { - payload.append((char*)&since, sizeof(since)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(since), (char*)&since); - off += sizeof(since); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MOSDIn.h b/tags/20070517_before_mds_merge/messages/MOSDIn.h deleted file mode 100644 index 276a930d2e00b..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MOSDIn.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __MOSDIN_H -#define __MOSDIN_H - -#include "msg/Message.h" - - -class MOSDIn : public Message { - public: - epoch_t map_epoch; - - MOSDIn(epoch_t e) : Message(MSG_OSD_IN), map_epoch(e) { - } - MOSDIn() {} - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_epoch), (char*)&map_epoch); - off += sizeof(map_epoch); - } - virtual void encode_payload() { - payload.append((char*)&map_epoch, sizeof(map_epoch)); - } - - virtual char *get_type_name() { return "oin"; } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MOSDMap.h b/tags/20070517_before_mds_merge/messages/MOSDMap.h deleted file mode 100644 index dd231a831d63d..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MOSDMap.h +++ /dev/null @@ -1,69 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDGETMAPACK_H -#define __MOSDGETMAPACK_H - -#include "msg/Message.h" -#include "osd/OSDMap.h" - - -class MOSDMap : public Message { - public: - map maps; - map incremental_maps; - - epoch_t get_first() { - epoch_t e = 0; - map::iterator i = maps.begin(); - if (i != maps.end()) e = i->first; - i = incremental_maps.begin(); - if (i != incremental_maps.end() && - (e == 0 || i->first < e)) e = i->first; - return e; - } - epoch_t get_last() { - epoch_t e = 0; - map::reverse_iterator i = maps.rbegin(); - if (i != maps.rend()) e = i->first; - i = incremental_maps.rbegin(); - if (i != incremental_maps.rend() && - (e == 0 || i->first > e)) e = i->first; - return e; - } - - - MOSDMap() : - Message(MSG_OSD_MAP) {} - MOSDMap(OSDMap *oc) : - Message(MSG_OSD_MAP) { - oc->encode(maps[oc->get_epoch()]); - } - - - // marshalling - virtual void decode_payload() { - int off = 0; - ::_decode(maps, payload, off); - ::_decode(incremental_maps, payload, off); - } - virtual void encode_payload() { - ::_encode(maps, payload); - ::_encode(incremental_maps, payload); - } - - virtual char *get_type_name() { return "omap"; } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MOSDOp.h b/tags/20070517_before_mds_merge/messages/MOSDOp.h deleted file mode 100644 index d16b02e8aad51..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MOSDOp.h +++ /dev/null @@ -1,221 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MOSDOP_H -#define __MOSDOP_H - -#include "msg/Message.h" -#include "osd/osd_types.h" - -/* - * OSD op - * - * oid - object id - * op - OSD_OP_DELETE, etc. 
- * - */ - -// osd client ops -#define OSD_OP_READ 1 -#define OSD_OP_STAT 2 - -#define OSD_OP_WRNOOP 10 -#define OSD_OP_WRITE 11 -#define OSD_OP_DELETE 12 -#define OSD_OP_TRUNCATE 13 -#define OSD_OP_ZERO 14 - -#define OSD_OP_WRLOCK 20 -#define OSD_OP_WRUNLOCK 21 -#define OSD_OP_RDLOCK 22 -#define OSD_OP_RDUNLOCK 23 -#define OSD_OP_UPLOCK 24 -#define OSD_OP_DNLOCK 25 - -#define OSD_OP_PULL 30 -#define OSD_OP_PUSH 31 - - -class MOSDOp : public Message { -public: - static const char* get_opname(int op) { - switch (op) { - case OSD_OP_READ: return "read"; - case OSD_OP_STAT: return "stat"; - - case OSD_OP_WRNOOP: return "wrnoop"; - case OSD_OP_WRITE: return "write"; - case OSD_OP_ZERO: return "zero"; - case OSD_OP_DELETE: return "delete"; - case OSD_OP_TRUNCATE: return "truncate"; - case OSD_OP_WRLOCK: return "wrlock"; - case OSD_OP_WRUNLOCK: return "wrunlock"; - case OSD_OP_RDLOCK: return "rdlock"; - case OSD_OP_RDUNLOCK: return "rdunlock"; - case OSD_OP_UPLOCK: return "uplock"; - case OSD_OP_DNLOCK: return "dnlock"; - - case OSD_OP_PULL: return "pull"; - case OSD_OP_PUSH: return "push"; - default: assert(0); - } - return 0; - } - -private: - struct { - long pcid; - - // who's asking? - entity_inst_t client; - reqid_t reqid; // minor weirdness: entity_name_t is in reqid_t too. 
- - // for replication - tid_t rep_tid; - - object_t oid; - objectrev_t rev; - pg_t pg; - - epoch_t map_epoch; - - eversion_t pg_trim_to; // primary->replica: trim to here - - int op; - size_t length; - off_t offset; - - eversion_t version; - eversion_t old_version; - - bool want_ack; - bool want_commit; - } st; - - bufferlist data; - map attrset; - - friend class MOSDOpReply; - - public: - const reqid_t& get_reqid() { return st.reqid; } - const tid_t get_client_tid() { return st.reqid.tid; } - int get_client_inc() { return st.reqid.inc; } - - const entity_name_t& get_client() { return st.client.name; } - const entity_inst_t& get_client_inst() { return st.client; } - void set_client_inst(const entity_inst_t& i) { st.client = i; } - - const tid_t get_rep_tid() { return st.rep_tid; } - void set_rep_tid(tid_t t) { st.rep_tid = t; } - - const object_t get_oid() { return st.oid; } - const pg_t get_pg() { return st.pg; } - const epoch_t get_map_epoch() { return st.map_epoch; } - - //const int get_pg_role() { return st.pg_role; } // who am i asking for? 
- const eversion_t get_version() { return st.version; } - //const eversion_t get_old_version() { return st.old_version; } - - void set_rev(objectrev_t r) { st.rev = r; } - objectrev_t get_rev() { return st.rev; } - - const eversion_t get_pg_trim_to() { return st.pg_trim_to; } - void set_pg_trim_to(eversion_t v) { st.pg_trim_to = v; } - - const int get_op() { return st.op; } - void set_op(int o) { st.op = o; } - - const size_t get_length() { return st.length; } - const off_t get_offset() { return st.offset; } - - map& get_attrset() { return attrset; } - void set_attrset(map &as) { attrset = as; } - - const bool wants_ack() { return st.want_ack; } - const bool wants_commit() { return st.want_commit; } - - - void set_data(bufferlist &d) { - data.claim(d); - } - bufferlist& get_data() { - return data; - } - size_t get_data_len() { return data.length(); } - - - // keep a pcid (procedure call id) to match up request+reply - void set_pcid(long pcid) { this->st.pcid = pcid; } - long get_pcid() { return st.pcid; } - - MOSDOp(entity_inst_t asker, int inc, long tid, - object_t oid, pg_t pg, epoch_t mapepoch, int op) : - Message(MSG_OSD_OP) { - memset(&st, 0, sizeof(st)); - this->st.client = asker; - this->st.reqid.name = asker.name; - this->st.reqid.inc = inc; - this->st.reqid.tid = tid; - - this->st.oid = oid; - this->st.pg = pg; - this->st.map_epoch = mapepoch; - this->st.op = op; - - this->st.rep_tid = 0; - - this->st.want_ack = true; - this->st.want_commit = true; - } - MOSDOp() {} - - //void set_pg_role(int r) { st.pg_role = r; } - //void set_rg_nrep(int n) { st.rg_nrep = n; } - - void set_length(size_t l) { st.length = l; } - void set_offset(off_t o) { st.offset = o; } - void set_version(eversion_t v) { st.version = v; } - void set_old_version(eversion_t ov) { st.old_version = ov; } - - void set_want_ack(bool b) { st.want_ack = b; } - void set_want_commit(bool b) { st.want_commit = b; } - - // marshalling - virtual void decode_payload() { - int off = 0; - 
payload.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - ::_decode(attrset, payload, off); - ::_decode(data, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&st, sizeof(st)); - ::_encode(attrset, payload); - ::_encode(data, payload); - } - - virtual char *get_type_name() { return "oop"; } - - void print(ostream& out) { - out << "osd_op(" << st.reqid - << " " << get_opname(st.op) - << " " << st.oid - //<< " " << this - << ")"; - } -}; - - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MOSDOpReply.h b/tags/20070517_before_mds_merge/messages/MOSDOpReply.h deleted file mode 100644 index 05106e096d176..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MOSDOpReply.h +++ /dev/null @@ -1,148 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MOSDOPREPLY_H -#define __MOSDOPREPLY_H - -#include "msg/Message.h" - -#include "MOSDOp.h" -#include "osd/ObjectStore.h" - -/* - * OSD op reply - * - * oid - object id - * op - OSD_OP_DELETE, etc. 
- * - */ - -class MOSDOpReply : public Message { - struct { - // req - reqid_t reqid; - - tid_t rep_tid; - - object_t oid; - pg_t pg; - - int op; - - // reply - int result; - bool commit; - size_t length, offset; - size_t object_size; - eversion_t version; - - eversion_t pg_complete_thru; - - epoch_t map_epoch; - } st; - - bufferlist data; - map attrset; - - public: - const reqid_t& get_reqid() { return st.reqid; } - long get_tid() { return st.reqid.tid; } - long get_rep_tid() { return st.rep_tid; } - object_t get_oid() { return st.oid; } - pg_t get_pg() { return st.pg; } - int get_op() { return st.op; } - bool get_commit() { return st.commit; } - - int get_result() { return st.result; } - size_t get_length() { return st.length; } - size_t get_offset() { return st.offset; } - size_t get_object_size() { return st.object_size; } - eversion_t get_version() { return st.version; } - map& get_attrset() { return attrset; } - - eversion_t get_pg_complete_thru() { return st.pg_complete_thru; } - void set_pg_complete_thru(eversion_t v) { st.pg_complete_thru = v; } - - void set_result(int r) { st.result = r; } - void set_length(size_t s) { st.length = s; } - void set_offset(size_t o) { st.offset = o; } - void set_object_size(size_t s) { st.object_size = s; } - void set_version(eversion_t v) { st.version = v; } - void set_attrset(map &as) { attrset = as; } - - void set_op(int op) { st.op = op; } - void set_rep_tid(tid_t t) { st.rep_tid = t; } - - // data payload - void set_data(bufferlist &d) { - data.claim(d); - } - bufferlist& get_data() { - return data; - } - - // osdmap - epoch_t get_map_epoch() { return st.map_epoch; } - - -public: - MOSDOpReply(MOSDOp *req, int result, epoch_t e, bool commit) : - Message(MSG_OSD_OPREPLY) { - memset(&st, 0, sizeof(st)); - this->st.reqid = req->st.reqid; - this->st.op = req->st.op; - this->st.rep_tid = req->st.rep_tid; - - this->st.oid = req->st.oid; - this->st.pg = req->st.pg; - this->st.result = result; - this->st.commit = commit; - - 
this->st.length = req->st.length; // speculative... OSD should ensure these are correct - this->st.offset = req->st.offset; - this->st.version = req->st.version; - - this->st.map_epoch = e; - } - MOSDOpReply() {} - - - // marshalling - virtual void decode_payload() { - payload.copy(0, sizeof(st), (char*)&st); - payload.splice(0, sizeof(st)); - int off = 0; - ::_decode(attrset, payload, off); - ::_decode(data, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&st, sizeof(st)); - ::_encode(attrset, payload); - ::_encode(data, payload); - } - - virtual char *get_type_name() { return "oopr"; } - - void print(ostream& out) { - out << "osd_op_reply(" << st.reqid - << " " << MOSDOp::get_opname(st.op) - << " " << st.oid << " = " << st.result - //<< " " << this - << ")"; - } - -}; - - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MOSDOut.h b/tags/20070517_before_mds_merge/messages/MOSDOut.h deleted file mode 100644 index 61a594de3294a..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MOSDOut.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __MOSDOUT_H -#define __MOSDOUT_H - -#include "msg/Message.h" - - -class MOSDOut : public Message { - public: - epoch_t map_epoch; - - MOSDOut(epoch_t e) : Message(MSG_OSD_OUT), map_epoch(e) { - } - MOSDOut() {} - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_epoch), (char*)&map_epoch); - off += sizeof(map_epoch); - } - virtual void encode_payload() { - payload.append((char*)&map_epoch, sizeof(map_epoch)); - } - - virtual char *get_type_name() { return "oout"; } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MOSDPGLog.h b/tags/20070517_before_mds_merge/messages/MOSDPGLog.h deleted file mode 100644 index e4731c6037107..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MOSDPGLog.h +++ /dev/null @@ -1,61 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGLOG_H -#define __MOSDPGLOG_H - -#include "msg/Message.h" - -class MOSDPGLog : public Message { - epoch_t epoch; - pg_t pgid; - -public: - PG::Info info; - PG::Log log; - PG::Missing missing; - - epoch_t get_epoch() { return epoch; } - pg_t get_pgid() { return pgid; } - - MOSDPGLog() {} - MOSDPGLog(version_t mv, pg_t pgid) : - Message(MSG_OSD_PG_LOG) { - this->epoch = mv; - this->pgid = pgid; - } - - char *get_type_name() { return "PGlog"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - payload.append((char*)&pgid, sizeof(pgid)); - payload.append((char*)&info, sizeof(info)); - log._encode(payload); - missing._encode(payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - payload.copy(off, sizeof(pgid), (char*)&pgid); - off += sizeof(pgid); - payload.copy(off, sizeof(info), (char*)&info); - off += sizeof(info); - log._decode(payload, off); - missing._decode(payload, off); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MOSDPGNotify.h b/tags/20070517_before_mds_merge/messages/MOSDPGNotify.h deleted file mode 100644 index f6fe8ee88c170..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MOSDPGNotify.h +++ /dev/null @@ -1,54 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MOSDPGPEERNOTIFY_H -#define __MOSDPGPEERNOTIFY_H - -#include "msg/Message.h" - -#include "osd/PG.h" - -/* - * PGNotify - notify primary of my PGs and versions. 
- */ - -class MOSDPGNotify : public Message { - epoch_t epoch; - list pg_list; // pgid -> version - - public: - version_t get_epoch() { return epoch; } - list& get_pg_list() { return pg_list; } - - MOSDPGNotify() {} - MOSDPGNotify(epoch_t e, list& l) : - Message(MSG_OSD_PG_NOTIFY) { - this->epoch = e; - pg_list.splice(pg_list.begin(),l); - } - - char *get_type_name() { return "PGnot"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - _encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - _decode(pg_list, payload, off); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MOSDPGPeer.h b/tags/20070517_before_mds_merge/messages/MOSDPGPeer.h deleted file mode 100644 index ebe1cda485c4c..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MOSDPGPeer.h +++ /dev/null @@ -1,57 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGPEER_H -#define __MOSDPGPEER_H - -#include "msg/Message.h" - - -class MOSDPGPeer : public Message { - __uint64_t map_version; - list pg_list; - - bool complete; - - public: - __uint64_t get_version() { return map_version; } - list& get_pg_list() { return pg_list; } - bool get_complete() { return complete; } - - MOSDPGPeer() {} - MOSDPGPeer(__uint64_t v, list& l, bool c=false) : - Message(MSG_OSD_PG_PEER) { - this->map_version = v; - this->complete = c; - pg_list.splice(pg_list.begin(), l); - } - - char *get_type_name() { return "PGPeer"; } - - void encode_payload() { - payload.append((char*)&map_version, sizeof(map_version)); - payload.append((char*)&complete, sizeof(complete)); - _encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_version), (char*)&map_version); - off += sizeof(map_version); - payload.copy(off, sizeof(complete), (char*)&complete); - off += sizeof(complete); - _decode(pg_list, payload, off); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MOSDPGPeerAck.h b/tags/20070517_before_mds_merge/messages/MOSDPGPeerAck.h deleted file mode 100644 index e21a2607bb573..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MOSDPGPeerAck.h +++ /dev/null @@ -1,69 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MOSDPGPEERACK_H -#define __MOSDPGPEERACK_H - -#include "msg/Message.h" -#include "osd/OSD.h" - -class MOSDPGPeerAck : public Message { - __uint64_t map_version; - - public: - list pg_dne; // pg dne - map pg_state; // state, lists, etc. 
- - __uint64_t get_version() { return map_version; } - - MOSDPGPeerAck() {} - MOSDPGPeerAck(__uint64_t v) : - Message(MSG_OSD_PG_PEERACK) { - this->map_version = v; - } - - char *get_type_name() { return "PGPeer"; } - - void encode_payload() { - payload.append((char*)&map_version, sizeof(map_version)); - _encode(pg_dne, payload); - - int n = pg_state.size(); - payload.append((char*)&n, sizeof(n)); - for (map::iterator it = pg_state.begin(); - it != pg_state.end(); - it++) { - payload.append((char*)&it->first, sizeof(it->first)); - it->second._encode(payload); - } - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_version), (char*)&map_version); - off += sizeof(map_version); - _decode(pg_dne, payload, off); - - int n; - payload.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPEERREQUEST_H -#define __MOSDPEERREQUEST_H - -#include "msg/Message.h" - - -class MOSDPGPeerRequest : public Message { - __uint64_t map_version; - list pg_list; - - public: - __uint64_t get_version() { return map_version; } - list& get_pg_list() { return pg_list; } - - MOSDPGPeerRequest() {} - MOSDPGPeerRequest(__uint64_t v, list& l) : - Message(MSG_OSD_PG_PEERREQUEST) { - this->map_version = v; - pg_list.splice(pg_list.begin(), l); - } - - char *get_type_name() { return "PGPR"; } - - void encode_payload() { - payload.append((char*)&map_version, sizeof(map_version)); - _encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_version), (char*)&map_version); - off += sizeof(map_version); - _decode(pg_list, payload, off); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MOSDPGQuery.h b/tags/20070517_before_mds_merge/messages/MOSDPGQuery.h deleted file mode 100644 index 926acce81349d..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MOSDPGQuery.h +++ /dev/null @@ -1,51 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGQUERY_H -#define __MOSDPGQUERY_H - -#include "msg/Message.h" - -/* - * PGQuery - query another OSD as to the contents of their PGs - */ - -class MOSDPGQuery : public Message { - version_t epoch; - - public: - version_t get_epoch() { return epoch; } - map pg_list; - - MOSDPGQuery() {} - MOSDPGQuery(epoch_t e, map& ls) : - Message(MSG_OSD_PG_QUERY), - epoch(e), pg_list(ls) { - } - - char *get_type_name() { return "PGq"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - ::_encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - ::_decode(pg_list, payload, off); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MOSDPGRemove.h b/tags/20070517_before_mds_merge/messages/MOSDPGRemove.h deleted file mode 100644 index 9629a3782764b..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MOSDPGRemove.h +++ /dev/null @@ -1,51 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGREMOVE_H -#define __MOSDPGREMOVE_H - -#include "msg/Message.h" - - -class MOSDPGRemove : public Message { - epoch_t epoch; - - public: - set pg_list; - - epoch_t get_epoch() { return epoch; } - - MOSDPGRemove() {} - MOSDPGRemove(epoch_t e, set& l) : - Message(MSG_OSD_PG_REMOVE) { - this->epoch = e; - pg_list = l; - } - - char *get_type_name() { return "PGrm"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - _encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - _decode(pg_list, payload, off); - } - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MOSDPGSummary.h b/tags/20070517_before_mds_merge/messages/MOSDPGSummary.h deleted file mode 100644 index dc4af837209bb..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MOSDPGSummary.h +++ /dev/null @@ -1,65 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGQUERYREPLY_H -#define __MOSDPGQUERYREPLY_H - -#include "msg/Message.h" - -class MOSDPGSummary : public Message { - epoch_t epoch; - pg_t pgid; - -public: - PG::PGInfo info; - bufferlist sumbl; - - epoch_t get_epoch() { return epoch; } - - MOSDPGSummary() {} - MOSDPGSummary(version_t mv, pg_t pgid, PG::PGSummary &summary) : - Message(MSG_OSD_PG_SUMMARY) { - this->epoch = mv; - this->pgid = pgid; - summary._encode(sumbl); - } - - pg_t get_pgid() { return pgid; } - bufferlist& get_summary_bl() { - return sumbl; - } - - char *get_type_name() { return "PGsum"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - payload.append((char*)&pgid, sizeof(pgid)); - payload.append((char*)&info, sizeof(info)); - payload.claim_append(sumbl); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - payload.copy(off, sizeof(pgid), (char*)&pgid); - off += sizeof(pgid); - payload.copy(off, sizeof(info), (char*)&info); - off += sizeof(info); - - payload.splice(0, off); - sumbl.claim(payload); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MOSDPGUpdate.h b/tags/20070517_before_mds_merge/messages/MOSDPGUpdate.h deleted file mode 100644 index 93809d6820d21..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MOSDPGUpdate.h +++ /dev/null @@ -1,64 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGUPDATE_H -#define __MOSDPGUPDATE_H - -#include "msg/Message.h" - -class MOSDPGUpdate : public Message { - version_t map_version; - pg_t pgid; - //pginfo_t info; - bool complete; - version_t last_any_complete; - - public: - version_t get_version() { return map_version; } - pg_t get_pgid() { return pgid; } - //pginfo_t& get_pginfo() { return info; } - bool is_complete() { return complete; } - version_t get_last_any_complete() { return last_any_complete; } - - MOSDPGUpdate() {} - MOSDPGUpdate(version_t mv, pg_t pgid, bool complete, version_t last_any_complete) : - Message(MSG_OSD_PG_UPDATE) { - this->map_version = mv; - this->pgid = pgid; - this->complete = complete; - this->last_any_complete = last_any_complete; - } - - char *get_type_name() { return "PGUp"; } - - void encode_payload() { - payload.append((char*)&map_version, sizeof(map_version)); - payload.append((char*)&pgid, sizeof(pgid)); - payload.append((char*)&complete, sizeof(complete)); - payload.append((char*)&last_any_complete, sizeof(last_any_complete)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_version), (char*)&map_version); - off += sizeof(map_version); - payload.copy(off, sizeof(pgid), (char*)&pgid); - off += sizeof(pgid); - payload.copy(off, sizeof(complete), (char*)&complete); - off += sizeof(complete); - payload.copy(off, sizeof(last_any_complete), (char*)&last_any_complete); - off += sizeof(last_any_complete); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MOSDPing.h b/tags/20070517_before_mds_merge/messages/MOSDPing.h deleted file mode 100644 index fae80edd91cfc..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MOSDPing.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the 
terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MOSDPING_H -#define __MOSDPING_H - -#include "msg/Message.h" - - -class MOSDPing : public Message { - public: - epoch_t map_epoch; - bool ack; - float avg_qlen; - - MOSDPing(epoch_t e, - float aq, - bool a=false) : Message(MSG_OSD_PING), map_epoch(e), ack(a), avg_qlen(aq) { - } - MOSDPing() {} - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_epoch), (char*)&map_epoch); - off += sizeof(map_epoch); - payload.copy(off, sizeof(ack), (char*)&ack); - off += sizeof(ack); - payload.copy(off, sizeof(avg_qlen), (char*)&avg_qlen); - off += sizeof(avg_qlen); - } - virtual void encode_payload() { - payload.append((char*)&map_epoch, sizeof(map_epoch)); - payload.append((char*)&ack, sizeof(ack)); - payload.append((char*)&avg_qlen, sizeof(avg_qlen)); - } - - virtual char *get_type_name() { return "oping"; } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MPing.h b/tags/20070517_before_mds_merge/messages/MPing.h deleted file mode 100644 index 65b65a738cd66..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MPing.h +++ /dev/null @@ -1,41 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __MPING_H -#define __MPING_H - -#include "msg/Message.h" - - -class MPing : public Message { - public: - int seq; - MPing(int s) : Message(MSG_PING) { - seq = s; - } - MPing() : Message(MSG_PING) {} - - virtual void decode_payload(crope& s, int& off) { - s.copy(0, sizeof(seq), (char*)&seq); - off += sizeof(seq); - } - virtual void encode_payload(crope& s) { - s.append((char*)&seq, sizeof(seq)); - } - - virtual char *get_type_name() { return "ping"; } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MPingAck.h b/tags/20070517_before_mds_merge/messages/MPingAck.h deleted file mode 100644 index 0ee385b7a2b80..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MPingAck.h +++ /dev/null @@ -1,40 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MPINGACK_H -#define __MPINGACK_H - -#include "MPing.h" - - -class MPingAck : public Message { - public: - int seq; - MPingAck() {} - MPingAck(MPing *p) : Message(MSG_PING_ACK) { - this->seq = p->seq; - } - - virtual void decode_payload(crope& s, int& off) { - s.copy(0, sizeof(seq), (char*)&seq); - off += sizeof(seq); - } - virtual void encode_payload(crope& s) { - s.append((char*)&seq, sizeof(seq)); - } - - virtual char *get_type_name() { return "pinga"; } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MRename.h b/tags/20070517_before_mds_merge/messages/MRename.h deleted file mode 100644 index e648f3e652fc7..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MRename.h +++ /dev/null @@ -1,80 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MRENAME_H -#define __MRENAME_H - -class MRename : public Message { - inodeno_t srcdirino; - string srcname; - inodeno_t destdirino; - string destname; - int initiator; - - bufferlist inode_state; - - public: - int get_initiator() { return initiator; } - inodeno_t get_srcdirino() { return srcdirino; } - string& get_srcname() { return srcname; } - inodeno_t get_destdirino() { return destdirino; } - string& get_destname() { return destname; } - bufferlist& get_inode_state() { return inode_state; } - - MRename() {} - MRename(int initiator, - inodeno_t srcdirino, - const string& srcname, - inodeno_t destdirino, - const string& destname, - bufferlist& inode_state) : - Message(MSG_MDS_RENAME) { - this->initiator = initiator; - this->srcdirino = srcdirino; - this->srcname = srcname; - this->destdirino = destdirino; - this->destname = destname; - this->inode_state.claim( inode_state ); - } - virtual char *get_type_name() { return "Rn";} - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(initiator), (char*)&initiator); - off += sizeof(initiator); - payload.copy(off, sizeof(srcdirino), (char*)&srcdirino); - off += sizeof(srcdirino); - payload.copy(off, sizeof(destdirino), (char*)&destdirino); - off += sizeof(destdirino); - _decode(srcname, payload, off); - _decode(destname, payload, off); - size_t len; - payload.copy(off, sizeof(len), (char*)&len); - off += sizeof(len); - inode_state.substr_of(payload, off, len); - off += len; - } - virtual void encode_payload() { - payload.append((char*)&initiator,sizeof(initiator)); - payload.append((char*)&srcdirino,sizeof(srcdirino)); - payload.append((char*)&destdirino,sizeof(destdirino)); - _encode(srcname, payload); - _encode(destname, payload); - size_t len = inode_state.length(); - payload.append((char*)&len, sizeof(len)); - payload.claim_append(inode_state); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MRenameAck.h 
b/tags/20070517_before_mds_merge/messages/MRenameAck.h deleted file mode 100644 index 14843cef5f616..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MRenameAck.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MRENAMEACK_H -#define __MRENAMEACK_H - -/* FIXME: relateive to dn, not inode */ - -class MRenameAck : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MRenameAck() {} - MRenameAck(inodeno_t ino) : - Message(MSG_MDS_RENAMEACK) { - this->ino = ino; - } - virtual char *get_type_name() { return "RnAck";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - } - virtual void encode_payload(crope& s) { - s.append((char*)&ino,sizeof(ino)); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MRenameNotify.h b/tags/20070517_before_mds_merge/messages/MRenameNotify.h deleted file mode 100644 index bc32300b82e3a..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MRenameNotify.h +++ /dev/null @@ -1,80 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MRENAMENOTIFY_H -#define __MRENAMENOTIFY_H - -class MRenameNotify : public Message { - inodeno_t ino; - inodeno_t srcdirino; - string srcname; - inodeno_t destdirino; - string destname; - string destdirpath; - int srcauth; - - public: - inodeno_t get_ino() { return ino; } - inodeno_t get_srcdirino() { return srcdirino; } - string& get_srcname() { return srcname; } - inodeno_t get_destdirino() { return destdirino; } - string& get_destname() { return destname; } - string& get_destdirpath() { return destdirpath; } - int get_srcauth() { return srcauth; } - - MRenameNotify() {} - MRenameNotify(inodeno_t ino, - inodeno_t srcdirino, - const string& srcname, - inodeno_t destdirino, - const string& destdirpath, - const string& destname, - int srcauth - ) : - Message(MSG_MDS_RENAMENOTIFY) { - this->ino = ino; - this->srcdirino = srcdirino; - this->srcname = srcname; - this->destdirino = destdirino; - this->destname = destname; - this->destdirpath = destdirpath; - this->srcauth = srcauth; - } - virtual char *get_type_name() { return "Rnot";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - s.copy(off, sizeof(srcdirino), (char*)&srcdirino); - off += sizeof(srcdirino); - s.copy(off, sizeof(destdirino), (char*)&destdirino); - off += sizeof(destdirino); - _unrope(srcname, s, off); - _unrope(destname, s, off); - _unrope(destdirpath, s, off); - s.copy(off, sizeof(srcauth), (char*)&srcauth); - off += sizeof(srcauth); - } - virtual void encode_payload(crope& s) { - s.append((char*)&ino,sizeof(ino)); - s.append((char*)&srcdirino,sizeof(srcdirino)); - s.append((char*)&destdirino,sizeof(destdirino)); - _rope(srcname, s); - _rope(destname, s); - _rope(destdirpath, s); - s.append((char*)&srcauth, sizeof(srcauth)); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MRenameNotifyAck.h b/tags/20070517_before_mds_merge/messages/MRenameNotifyAck.h deleted file mode 100644 index 
d1a01339cd97a..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MRenameNotifyAck.h +++ /dev/null @@ -1,40 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MRENAMENOTIFYACK_H -#define __MRENAMENOTIFYACK_H - -class MRenameNotifyAck : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MRenameNotifyAck() {} - MRenameNotifyAck(inodeno_t ino) : - Message(MSG_MDS_RENAMENOTIFYACK) { - this->ino = ino; - } - virtual char *get_type_name() { return "RnotA";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - } - virtual void encode_payload(crope& s) { - s.append((char*)&ino,sizeof(ino)); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MRenamePrep.h b/tags/20070517_before_mds_merge/messages/MRenamePrep.h deleted file mode 100644 index 1af798c674489..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MRenamePrep.h +++ /dev/null @@ -1,85 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MRENAMEPREP_H -#define __MRENAMEPREP_H - -class MRenamePrep : public Message { - inodeno_t srcdirino; - string srcname; - string srcpath; - inodeno_t destdirino; - string destname; - string destpath; - int initiator; - int srcauth; - - public: - int get_initiator() { return initiator; } - inodeno_t get_srcdirino() { return srcdirino; } - string& get_srcname() { return srcname; } - string& get_srcpath() { return srcpath; } - int get_srcauth() { return srcauth; } - inodeno_t get_destdirino() { return destdirino; } - string& get_destname() { return destname; } - string& get_destpath() { return destpath; } - - MRenamePrep() {} - MRenamePrep(int initiator, - inodeno_t srcdirino, - const string& srcname, - const string& srcpath, - inodeno_t destdirino, - const string& destname, - const string& destpath, - int srcauth) : - Message(MSG_MDS_RENAMEPREP) { - this->initiator = initiator; - this->srcdirino = srcdirino; - this->srcname = srcname; - this->srcpath = srcpath; - this->destdirino = destdirino; - this->destname = destname; - this->destpath = destpath; - this->srcauth = srcauth; - } - virtual char *get_type_name() { return "RnP";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(initiator), (char*)&initiator); - off += sizeof(initiator); - s.copy(off, sizeof(srcdirino), (char*)&srcdirino); - off += sizeof(srcdirino); - s.copy(off, sizeof(destdirino), (char*)&destdirino); - off += sizeof(destdirino); - _unrope(srcname, s, off); - _unrope(srcpath, s, off); - _unrope(destname, s, off); - _unrope(destpath, s, off); - s.copy(off, sizeof(srcauth), (char*)&srcauth); - off += sizeof(srcauth); - } - virtual void encode_payload(crope& s) { - s.append((char*)&initiator,sizeof(initiator)); - s.append((char*)&srcdirino,sizeof(srcdirino)); - s.append((char*)&destdirino,sizeof(destdirino)); - _rope(srcname, s); - _rope(srcpath, s); - _rope(destname, s); - _rope(destpath, s); - s.append((char*)&srcauth, sizeof(srcauth)); - } -}; - 
-#endif diff --git a/tags/20070517_before_mds_merge/messages/MRenameReq.h b/tags/20070517_before_mds_merge/messages/MRenameReq.h deleted file mode 100644 index b70e96a38203b..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MRenameReq.h +++ /dev/null @@ -1,79 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MRENAMEREQ_H -#define __MRENAMEREQ_H - -class MRenameReq : public Message { - int initiator; - inodeno_t srcdirino; - string srcname; - inodeno_t destdirino; - string destname; - string destpath; - int destauth; - - public: - int get_initiator() { return initiator; } - inodeno_t get_srcdirino() { return srcdirino; } - string& get_srcname() { return srcname; } - inodeno_t get_destdirino() { return destdirino; } - string& get_destname() { return destname; } - string& get_destpath() { return destpath; } - int get_destauth() { return destauth; } - - MRenameReq() {} - MRenameReq(int initiator, - inodeno_t srcdirino, - const string& srcname, - inodeno_t destdirino, - const string& destname, - const string& destpath, - int destauth) : - Message(MSG_MDS_RENAMEREQ) { - this->initiator = initiator; - this->srcdirino = srcdirino; - this->srcname = srcname; - this->destdirino = destdirino; - this->destname = destname; - this->destpath = destpath; - this->destauth = destauth; - } - virtual char *get_type_name() { return "RnReq";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(initiator), (char*)&initiator); - off += sizeof(initiator); - s.copy(off, sizeof(srcdirino), (char*)&srcdirino); - off += sizeof(srcdirino); - s.copy(off, sizeof(destdirino), (char*)&destdirino); - off += 
sizeof(destdirino); - _unrope(srcname, s, off); - _unrope(destname, s, off); - _unrope(destpath, s, off); - s.copy(off, sizeof(destauth), (char*)&destauth); - off += sizeof(destauth); - } - virtual void encode_payload(crope& s) { - s.append((char*)&initiator,sizeof(initiator)); - s.append((char*)&srcdirino,sizeof(srcdirino)); - s.append((char*)&destdirino,sizeof(destdirino)); - _rope(srcname, s); - _rope(destname, s); - _rope(destpath, s); - s.append((char*)&destauth, sizeof(destauth)); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MRenameWarning.h b/tags/20070517_before_mds_merge/messages/MRenameWarning.h deleted file mode 100644 index 85463dfd2c179..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MRenameWarning.h +++ /dev/null @@ -1,40 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MRENAMEWARNING_H -#define __MRENAMEWARNING_H - -class MRenameWarning : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MRenameWarning() {} - MRenameWarning(inodeno_t ino) : - Message(MSG_MDS_RENAMEWARNING) { - this->ino = ino; - } - virtual char *get_type_name() { return "RnW";} - - virtual void decode_payload(crope& s, int& off) { - s.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - } - virtual void encode_payload(crope& s) { - s.append((char*)&ino,sizeof(ino)); - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MUnhashDir.h b/tags/20070517_before_mds_merge/messages/MUnhashDir.h deleted file mode 100644 index 911a14d9c9592..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MUnhashDir.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MUNHASHDIR_H -#define __MUNHASHDIR_H - -#include "msg/Message.h" - -class MUnhashDir : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MUnhashDir() {} - MUnhashDir(inodeno_t ino) : - Message(MSG_MDS_UNHASHDIR) { - this->ino = ino; - } - virtual char *get_type_name() { return "UH"; } - - virtual void decode_payload() { - payload.copy(0, sizeof(ino), (char*)&ino); - } - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - } - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MUnhashDirAck.h b/tags/20070517_before_mds_merge/messages/MUnhashDirAck.h deleted file mode 100644 index e052683e736c3..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MUnhashDirAck.h +++ /dev/null @@ -1,65 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MUNHASHDIRACK_H -#define __MUNHASHDIRACK_H - -#include "msg/Message.h" - -class MUnhashDirAck : public Message { - inodeno_t ino; - bufferlist state; - int nden; - - public: - MUnhashDirAck() {} - MUnhashDirAck(inodeno_t ino, bufferlist& bl, int nden) : - Message(MSG_MDS_UNHASHDIRACK) { - this->ino = ino; - state.claim(bl); - this->nden = nden; - } - virtual char *get_type_name() { return "UHaA"; } - - inodeno_t get_ino() { return ino; } - bufferlist& get_state() { return state; } - bufferlist* get_state_ptr() { return &state; } - int get_nden() { return nden; } - - //void set_nden(int n) { nden = n; } - //void inc_nden() { nden++; } - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - payload.copy(off, sizeof(nden), (char*)&nden); - off += sizeof(nden); - - size_t len; - payload.copy(off, sizeof(len), (char*)&len); - off += sizeof(len); - state.substr_of(payload, off, len); - } - void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - payload.append((char*)&nden, sizeof(nden)); - size_t size = state.length(); - payload.append((char*)&size, sizeof(size)); - payload.claim_append(state); - } - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MUnhashDirNotify.h b/tags/20070517_before_mds_merge/messages/MUnhashDirNotify.h deleted file mode 100644 index a9d6707a3aa25..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MUnhashDirNotify.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MUNHASHDIRNOTIFY_H -#define __MUNHASHDIRNOTIFY_H - -#include "msg/Message.h" - -class MUnhashDirNotify : public Message { - inodeno_t ino; - //int peer; - - public: - inodeno_t get_ino() { return ino; } - //int get_peer() { return peer; } - - MUnhashDirNotify() {} - MUnhashDirNotify(inodeno_t ino/*, int peer*/) : - Message(MSG_MDS_UNHASHDIRNOTIFY) { - this->ino = ino; - //this->peer = peer; - } - virtual char *get_type_name() { return "UHN"; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - //payload.copy(off, sizeof(peer), (char*)&peer); - //off += sizeof(peer); - } - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - //payload.append((char*)&peer, sizeof(peer)); - } - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MUnhashDirNotifyAck.h b/tags/20070517_before_mds_merge/messages/MUnhashDirNotifyAck.h deleted file mode 100644 index ad4843676f0fb..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MUnhashDirNotifyAck.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MUNHASHDIRNOTIFYACK_H -#define __MUNHASHDIRNOTIFYACK_H - -#include "msg/Message.h" - -class MUnhashDirNotifyAck : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MUnhashDirNotifyAck() {} - MUnhashDirNotifyAck(inodeno_t ino) : - Message(MSG_MDS_UNHASHDIRNOTIFYACK) { - this->ino = ino; - } - virtual char *get_type_name() { return "UHNa"; } - - virtual void decode_payload() { - payload.copy(0, sizeof(ino), (char*)&ino); - } - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - } - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MUnhashDirPrep.h b/tags/20070517_before_mds_merge/messages/MUnhashDirPrep.h deleted file mode 100644 index c4dc2ea422cd9..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MUnhashDirPrep.h +++ /dev/null @@ -1,42 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MUNHASHDIRPREP_H -#define __MUNHASHDIRPREP_H - -#include "msg/Message.h" - -class MUnhashDirPrep : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MUnhashDirPrep() {} - MUnhashDirPrep(inodeno_t ino) : - Message(MSG_MDS_UNHASHDIRPREP) { - this->ino = ino; - } - virtual char *get_type_name() { return "UHP"; } - - virtual void decode_payload() { - payload.copy(0, sizeof(ino), (char*)&ino); - } - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - } - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/messages/MUnhashDirPrepAck.h b/tags/20070517_before_mds_merge/messages/MUnhashDirPrepAck.h deleted file mode 100644 index bd7e93981964b..0000000000000 --- a/tags/20070517_before_mds_merge/messages/MUnhashDirPrepAck.h +++ /dev/null @@ -1,93 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MUNHASHDIRPREPACK_H -#define __MUNHASHDIRPREPACK_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MUnhashDirPrepAck : public Message { - inodeno_t ino; - bool assim; - - // subdir dentry names + inodes - map inodes; - - public: - inodeno_t get_ino() { return ino; } - map& get_inodes() { return inodes; } - - bool did_assim() { return assim; } - void mark_assim() { assert(!assim); assim = true; } - - MUnhashDirPrepAck() : assim(false) { } - MUnhashDirPrepAck(inodeno_t ino) : - Message(MSG_MDS_UNHASHDIRPREPACK), - assim(false) { - this->ino = ino; - } - ~MUnhashDirPrepAck() { - for (map::iterator it = inodes.begin(); - it != inodes.end(); - it++) - delete it->second; - } - - - virtual char *get_type_name() { return "HP"; } - - void add_inode(const string& dentry, CInodeDiscover *in) { - inodes[dentry] = in; - } - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - - // inodes - int ni; - payload.copy(off, sizeof(int), (char*)&ni); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - - inodes[dname] = in; - } - } - - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - - // inodes - int ni = inodes.size(); - payload.append((char*)&ni, sizeof(int)); - for (map::iterator iit = inodes.begin(); - iit != inodes.end(); - iit++) { - _encode(iit->first, payload); // dentry - iit->second->_encode(payload); // inode - } - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/mkmonmap.cc b/tags/20070517_before_mds_merge/mkmonmap.cc deleted file mode 100644 index 1ec4c808d6204..0000000000000 --- a/tags/20070517_before_mds_merge/mkmonmap.cc +++ /dev/null @@ -1,67 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the 
terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include -#include -#include - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" - - - - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - MonMap monmap; - - char *outfn = ".ceph_monmap"; - - for (unsigned i=0; i= 0); - - return 0; -} diff --git a/tags/20070517_before_mds_merge/mon/ClientMonitor.cc b/tags/20070517_before_mds_merge/mon/ClientMonitor.cc deleted file mode 100644 index 8ab59504d4bae..0000000000000 --- a/tags/20070517_before_mds_merge/mon/ClientMonitor.cc +++ /dev/null @@ -1,109 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "ClientMonitor.h" -#include "Monitor.h" -#include "MDSMonitor.h" - -#include "messages/MClientBoot.h" -#include "messages/MMDSMap.h" -//#include "messages/MMDSFailure.h" - -#include "common/Timer.h" - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".client " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? 
(const char*)"(peon)":(const char*)"(?\?)"))) << ".client " - - - - -void ClientMonitor::dispatch(Message *m) -{ - switch (m->get_type()) { - - case MSG_CLIENT_BOOT: - handle_client_boot((MClientBoot*)m); - break; - - /* - case MSG_client_FAILURE: - handle_client_failure((MClientFailure*)m); - break; - */ - - default: - assert(0); - } -} - -void ClientMonitor::handle_client_boot(MClientBoot *m) -{ - dout(7) << "client_boot from " << m->get_source() << " at " << m->get_source_inst() << endl; - assert(m->get_source().is_client()); - int from = m->get_source().num(); - - // choose a client id - if (from < 0 || - (client_map.count(m->get_source()) && client_map[m->get_source()] != m->get_source_addr())) { - from = ++num_clients; - dout(10) << "client_boot assigned client" << from << endl; - } - - client_map[MSG_ADDR_CLIENT(from)] = m->get_source_addr(); - - // reply with latest mds map - entity_inst_t to = m->get_source_inst(); - to.name = MSG_ADDR_CLIENT(from); - mon->mdsmon->send_latest(to); - delete m; -} - -/* -void ClientMonitor::handle_mds_shutdown(Message *m) -{ - assert(m->get_source().is_mds()); - int from = m->get_source().num(); - - mdsmap.mds_inst.erase(from); - mdsmap.all_mds.erase(from); - - dout(7) << "mds_shutdown from " << m->get_source() - << ", still have " << mdsmap.all_mds - << endl; - - // tell someone? 
- // fixme - - delete m; -} - -*/ - -/* -void ClientMonitor::bcast_latest_mds() -{ - dout(10) << "bcast_latest_mds " << mdsmap.get_epoch() << endl; - - // tell mds - for (set::iterator p = mdsmap.get_mds().begin(); - p != mdsmap.get_mds().end(); - p++) { - if (mdsmap.is_down(*p)) continue; - send_full(MSG_ADDR_MDS(*p), mdsmap.get_inst(*p)); - } -} - -*/ diff --git a/tags/20070517_before_mds_merge/mon/ClientMonitor.h b/tags/20070517_before_mds_merge/mon/ClientMonitor.h deleted file mode 100644 index c3ea253bafc48..0000000000000 --- a/tags/20070517_before_mds_merge/mon/ClientMonitor.h +++ /dev/null @@ -1,52 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __CLIENTMONITOR_H -#define __CLIENTMONITOR_H - -#include -#include -using namespace std; - -#include "include/types.h" -#include "msg/Messenger.h" - -#include "mds/MDSMap.h" - -class Monitor; - -class ClientMonitor : public Dispatcher { - Monitor *mon; - Messenger *messenger; - Mutex &lock; - - private: - int num_clients; - map client_map; - - void bcast_latest_mds(); - - //void accept_pending(); // accept pending, new map. 
- //void send_incremental(epoch_t since, msg_addr_t dest); - - void handle_client_boot(class MClientBoot *m); - - public: - ClientMonitor(Monitor *mn, Messenger *m, Mutex& l) : mon(mn), messenger(m), lock(l), - num_clients(0) { } - - void dispatch(Message *m); - void tick(); // check state, take actions -}; - -#endif diff --git a/tags/20070517_before_mds_merge/mon/Elector.cc b/tags/20070517_before_mds_merge/mon/Elector.cc deleted file mode 100644 index d3098ba065a47..0000000000000 --- a/tags/20070517_before_mds_merge/mon/Elector.cc +++ /dev/null @@ -1,219 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "Elector.h" -#include "Monitor.h" - -#include "common/Timer.h" - -#include "messages/MMonElectionPropose.h" -#include "messages/MMonElectionAck.h" -#include "messages/MMonElectionVictory.h" - -#include "config.h" -#undef dout -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".elector " -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".elector " - - -void Elector::start() -{ - dout(5) << "start -- can i be leader?" 
<< endl; - - leader_acked = -1; - - // start by trying to elect me - start_stamp = g_clock.now(); - acked_me.clear(); - acked_me.insert(whoami); - electing_me = true; - - // bcast to everyone else - for (int i=0; imonmap->num_mon; ++i) { - if (i == whoami) continue; - mon->messenger->send_message(new MMonElectionPropose, - mon->monmap->get_inst(i)); - } - - reset_timer(); -} - -void Elector::defer(int who) -{ - dout(5) << "defer to " << who << endl; - - if (electing_me) { - acked_me.clear(); - electing_me = false; - } - - // ack them - leader_acked = who; - ack_stamp = g_clock.now(); - mon->messenger->send_message(new MMonElectionAck, - mon->monmap->get_inst(who)); - - // set a timer - reset_timer(1.0); // give the leader some extra time to declare victory -} - - -class C_Mon_ElectionExpire : public Context { - Elector *elector; -public: - C_Mon_ElectionExpire(Elector *e) : elector(e) { } - void finish(int r) { - elector->expire(); - } -}; - -void Elector::reset_timer(double plus) -{ - // set the timer - cancel_timer(); - expire_event = new C_Mon_ElectionExpire(this); - g_timer.add_event_after(g_conf.mon_lease + plus, - expire_event); -} - - -void Elector::cancel_timer() -{ - if (expire_event) - g_timer.cancel_event(expire_event); -} - -void Elector::expire() -{ - dout(5) << "election timer expired" << endl; - - // did i win? - if (electing_me && - acked_me.size() > (unsigned)(mon->monmap->num_mon / 2)) { - // i win - victory(); - } else { - // whoever i deferred to didn't declare victory quickly enough. 
- start(); - } -} - - -void Elector::victory() -{ - leader_acked = -1; - electing_me = false; - - cancel_timer(); - - // tell everyone - for (int i=0; imonmap->num_mon; ++i) { - if (i == whoami) continue; - mon->messenger->send_message(new MMonElectionVictory, - mon->monmap->get_inst(i)); - } - - // tell monitor - mon->win_election(acked_me); -} - - -void Elector::handle_propose(MMonElectionPropose *m) -{ - dout(5) << "handle_propose from " << m->get_source() << endl; - int from = m->get_source().num(); - - if (from > whoami) { - // wait, i should win! - if (!electing_me) - start(); - } else { - // they would win over me - if (leader_acked < 0 || // haven't acked anyone yet, or - leader_acked > from || // they would win over who you did ack, or - leader_acked == from) { // this is the guy we're already deferring to - defer(from); - } else { - // ignore them! - dout(5) << "no, we already acked " << leader_acked << endl; - } - } - - delete m; -} - -void Elector::handle_ack(MMonElectionAck *m) -{ - dout(5) << "handle_ack from " << m->get_source() << endl; - int from = m->get_source().num(); - - if (electing_me) { - // thanks - acked_me.insert(from); - dout(5) << " so far i have " << acked_me << endl; - - // is that _everyone_? - if (acked_me.size() == (unsigned)mon->monmap->num_mon) { - // if yes, shortcut to election finish - victory(); - } - } else { - // ignore, i'm deferring already. - } - - delete m; -} - -void Elector::handle_victory(MMonElectionVictory *m) -{ - dout(5) << "handle_victory from " << m->get_source() << endl; - int from = m->get_source().num(); - - if (from < whoami) { - // ok, fine, they win - mon->lose_election(from); - - // cancel my timer - cancel_timer(); - } else { - // no, that makes no sense, i should win. start over! 
- start(); - } -} - - - - -void Elector::dispatch(Message *m) -{ - switch (m->get_type()) { - case MSG_MON_ELECTION_ACK: - handle_ack((MMonElectionAck*)m); - break; - - case MSG_MON_ELECTION_PROPOSE: - handle_propose((MMonElectionPropose*)m); - break; - - case MSG_MON_ELECTION_VICTORY: - handle_victory((MMonElectionVictory*)m); - break; - - default: - assert(0); - } -} - - - - diff --git a/tags/20070517_before_mds_merge/mon/Elector.h b/tags/20070517_before_mds_merge/mon/Elector.h deleted file mode 100644 index 67ed59945c46b..0000000000000 --- a/tags/20070517_before_mds_merge/mon/Elector.h +++ /dev/null @@ -1,72 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MON_ELECTOR_H -#define __MON_ELECTOR_H - -#include -using namespace std; - -#include "include/types.h" -#include "msg/Message.h" - -#include "include/Context.h" - -#include "common/Timer.h" - -class Monitor; - - -class Elector { - private: - Monitor *mon; - int whoami; - - Context *expire_event; - - void reset_timer(double plus=0.0); - void cancel_timer(); - - // electing me - bool electing_me; - utime_t start_stamp; - set acked_me; - - // electing them - int leader_acked; // who i've acked - utime_t ack_stamp; // and when - - public: - - void start(); // start an electing me - void defer(int who); - void expire(); // timer goes off - void victory(); - - void handle_propose(class MMonElectionPropose *m); - void handle_ack(class MMonElectionAck *m); - void handle_victory(class MMonElectionVictory *m); - - - public: - Elector(Monitor *m, int w) : mon(m), whoami(w) { - // initialize all those values! - // ... 
- } - - void dispatch(Message *m); -}; - - -#endif diff --git a/tags/20070517_before_mds_merge/mon/MDSMonitor.cc b/tags/20070517_before_mds_merge/mon/MDSMonitor.cc deleted file mode 100644 index 24beadf85e9f0..0000000000000 --- a/tags/20070517_before_mds_merge/mon/MDSMonitor.cc +++ /dev/null @@ -1,370 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "MDSMonitor.h" -#include "Monitor.h" -#include "MonitorStore.h" - -#include "messages/MMDSMap.h" -#include "messages/MMDSGetMap.h" -#include "messages/MMDSBeacon.h" - -#include "common/Timer.h" - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".mds e" << mdsmap.get_epoch() << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".mds e" << mdsmap.get_epoch() << " " - - - -/********* MDS map **************/ - -void MDSMonitor::dispatch(Message *m) -{ - switch (m->get_type()) { - - case MSG_MDS_BEACON: - handle_mds_beacon((MMDSBeacon*)m); - break; - - case MSG_MDS_GETMAP: - handle_mds_getmap((MMDSGetMap*)m); - break; - - default: - assert(0); - } -} - - - -void MDSMonitor::election_finished() -{ - if (mon->is_leader()) { - - // FIXME be smarter later. 
- - if (g_conf.mkfs) { - create_initial(); - save_map(); - } else { - load_map(); - } - } -} - - -void MDSMonitor::create_initial() -{ - mdsmap.epoch = 0; // until everyone boots - mdsmap.ctime = g_clock.now(); - - mdsmap.encode(encoded_map); - - print_map(); -} - -void MDSMonitor::load_map() -{ - int r = mon->store->get_bl_ss(encoded_map, "mdsmap", "current"); - assert(r > 0); - mdsmap.decode(encoded_map); - dout(7) << "load_map epoch " << mdsmap.get_epoch() << endl; -} - -void MDSMonitor::save_map() -{ - dout(7) << "save_map epoch " << mdsmap.get_epoch() << endl; - - int r = mon->store->put_bl_ss(encoded_map, "mdsmap", "current"); - assert(r>=0); -} - -void MDSMonitor::print_map() -{ - dout(7) << "print_map epoch " << mdsmap.get_epoch() << endl; - entity_inst_t blank; - set all; - mdsmap.get_mds_set(all); - for (set::iterator p = all.begin(); - p != all.end(); - ++p) { - dout(7) << " mds" << *p << "." << mdsmap.mds_inc[*p] - << " : " << MDSMap::get_state_name(mdsmap.get_state(*p)) - << " : " << (mdsmap.have_inst(*p) ? mdsmap.get_inst(*p) : blank) - << endl; - } -} - - -void MDSMonitor::issue_map() -{ - mdsmap.inc_epoch(); - encoded_map.clear(); - mdsmap.encode(encoded_map); - - dout(7) << "issue_map epoch " << mdsmap.get_epoch() << endl; - - save_map(); - print_map(); - - // bcast map - bcast_latest_mds(); - send_current(); -} - - -void MDSMonitor::handle_mds_beacon(MMDSBeacon *m) -{ - dout(7) << "mds_beacon " << *m - << " from " << m->get_source() - << " " << m->get_source_inst() - << endl; - int from = m->get_source().num(); - int state = m->get_state(); - version_t seq = m->get_seq(); - - // initial boot? - bool booted = false; - - // choose an MDS id - if (from >= 0) { - // wants to be (or already is) a specific MDS. 
- if (mdsmap.is_down(from)) { - dout(10) << "mds_beacon assigning requested mds" << from << endl; - booted = true; - } else if (mdsmap.get_inst(from) != m->get_source_inst()) { - dout(10) << "mds_beacon not assigning requested mds" << from - << ", that mds is up and someone else" << endl; - from = -1; - } - } - if (from < 0) { - // pick a failed mds? - set failed; - mdsmap.get_failed_mds_set(failed); - if (!failed.empty()) { - from = *failed.begin(); - dout(10) << "mds_beacon assigned failed mds" << from << endl; - booted = true; - } - } - if (from < 0) { - // ok, just pick any unused mds id. - for (from=0; ; ++from) { - if (mdsmap.is_dne(from) || - mdsmap.is_out(from)) { - dout(10) << "mds_beacon assigned out|dne mds" << from << endl; - booted = true; - break; - } - } - } - - - // old beacon? - if (mdsmap.mds_state_seq[from] > seq) { - dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << endl; - delete m; - return; - } - - // reply to beacon? - if (state != MDSMap::STATE_OUT) { - last_beacon[from] = g_clock.now(); // note time - messenger->send_message(new MMDSBeacon(state, seq), - m->get_source_inst()); - } - - - // make sure it's in the map - if (booted) { - mdsmap.mds_inst[from].addr = m->get_source_addr(); - mdsmap.mds_inst[from].name = MSG_ADDR_MDS(from); - mdsmap.mds_inc[from]++; - - // starting -> creating|starting|replay - if (mdsmap.is_degraded() && - !mdsmap.is_failed(from)) { - dout(10) << "mds_beacon currently degraded, mds" << from << " will be standby" << endl; - state = MDSMap::STATE_STANDBY; - } - else if (state == MDSMap::STATE_STARTING) { - if (mdsmap.is_failed(from)) { - dout(10) << "mds_beacon will recover mds" << from << endl; - state = MDSMap::STATE_REPLAY; - } - else if (mdsmap.is_out(from)) { - dout(10) << "mds_beacon will start mds" << from << endl; - state = MDSMap::STATE_STARTING; - } - else { - dout(10) << "mds_beacon will create mds" << from << endl; - state = MDSMap::STATE_CREATING; - } - } - } - - // if creating -> active, 
go to standby instead - if (state == MDSMap::STATE_ACTIVE && mdsmap.is_creating(from)) { - mdsmap.mds_created.insert(from); - dout(10) << "mds_beacon created mds" << from << endl; - - if (mdsmap.is_degraded()) { - dout(10) << "mds_beacon current degraded, marking mds" << from << " as standby" << endl; - state = MDSMap::STATE_STANDBY; - } - } - - - // did we update the map? - if (mdsmap.mds_state.count(from) == 0 || - mdsmap.mds_state[from] != state) { - // update mds state - dout(10) << "mds_beacon mds" << from << " " << MDSMap::get_state_name(mdsmap.mds_state[from]) - << " -> " << MDSMap::get_state_name(state) - << endl; - mdsmap.mds_state[from] = state; - if (mdsmap.is_up(from)) - mdsmap.mds_state_seq[from] = seq; - else - mdsmap.mds_state_seq.erase(from); - - issue_map(); - } - - delete m; -} - - -void MDSMonitor::handle_mds_getmap(MMDSGetMap *m) -{ - dout(7) << "mds_getmap from " << m->get_source() << " " << m->get_source_inst() << endl; - if (mdsmap.get_epoch() > 0) - send_full(m->get_source_inst()); - else - awaiting_map.push_back( m->get_source_inst() ); -} - - -void MDSMonitor::bcast_latest_mds() -{ - dout(10) << "bcast_latest_mds " << mdsmap.get_epoch() << endl; - - // tell mds - set up; - mdsmap.get_up_mds_set(up); - for (set::iterator p = up.begin(); - p != up.end(); - p++) - send_full(mdsmap.get_inst(*p)); -} - -void MDSMonitor::send_full(entity_inst_t dest) -{ - dout(11) << "send_full to " << dest << endl; - messenger->send_message(new MMDSMap(&mdsmap), dest); -} - -void MDSMonitor::send_current() -{ - dout(10) << "mds_send_current " << mdsmap.get_epoch() << endl; - for (list::iterator i = awaiting_map.begin(); - i != awaiting_map.end(); - i++) - send_full(*i); - awaiting_map.clear(); -} - -void MDSMonitor::send_latest(entity_inst_t dest) -{ - // FIXME: check if we're locked, etc. 
- if (mdsmap.get_epoch() > 0) - send_full(dest); - else - awaiting_map.push_back(dest); -} - - -void MDSMonitor::tick() -{ - // make sure mds's are still alive - utime_t now = g_clock.now(); - if (now > g_conf.mds_beacon_grace) { - utime_t cutoff = now; - cutoff -= g_conf.mds_beacon_grace; - - bool changed = false; - - set up; - mdsmap.get_up_mds_set(up); - - for (set::iterator p = up.begin(); - p != up.end(); - ++p) { - if (last_beacon.count(*p)) { - if (last_beacon[*p] < cutoff) { - - // failure! - int newstate; - switch (mdsmap.get_state(*p)) { - case MDSMap::STATE_CREATING: - // didn't finish creating - newstate = MDSMap::STATE_DNE; - break; - - case MDSMap::STATE_STANDBY: - if (mdsmap.has_created(*p)) - newstate = MDSMap::STATE_OUT; - else - newstate = MDSMap::STATE_DNE; - break; - - case MDSMap::STATE_REPLAY: - case MDSMap::STATE_REJOIN: - case MDSMap::STATE_ACTIVE: - case MDSMap::STATE_STOPPING: - newstate = MDSMap::STATE_FAILED; - break; - - case MDSMap::STATE_STARTING: - case MDSMap::STATE_STOPPED: - newstate = MDSMap::STATE_OUT; - break; - - default: - assert(0); - } - - dout(10) << "no beacon from mds" << *p << " since " << last_beacon[*p] - << ", marking " << mdsmap.get_state_name(newstate) - << endl; - - // update map - mdsmap.mds_state[*p] = newstate; - mdsmap.mds_state_seq.erase(*p); - changed = true; - } - } else { - dout(10) << "no beacons from mds" << *p << ", assuming one " << now << endl; - last_beacon[*p] = now; - } - } - - if (changed) { - issue_map(); - } - } -} diff --git a/tags/20070517_before_mds_merge/mon/MDSMonitor.h b/tags/20070517_before_mds_merge/mon/MDSMonitor.h deleted file mode 100644 index c3bc3d165883c..0000000000000 --- a/tags/20070517_before_mds_merge/mon/MDSMonitor.h +++ /dev/null @@ -1,87 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * 
modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDSMONITOR_H -#define __MDSMONITOR_H - -#include -#include -using namespace std; - -#include "include/types.h" -#include "msg/Messenger.h" - -#include "mds/MDSMap.h" - -class Monitor; - -class MDSMonitor : public Dispatcher { - Monitor *mon; - Messenger *messenger; - Mutex &lock; - - // mds maps - public: - MDSMap mdsmap; - - private: - bufferlist encoded_map; - - //map inc_maps; - //MDSMap::Incremental pending_inc; - - list awaiting_map; - - // beacons - map last_beacon; - - bool is_alive(int mds); - - - // maps - void create_initial(); - void send_current(); // send current map to waiters. - void send_full(entity_inst_t dest); - void bcast_latest_mds(); - - void issue_map(); - - void save_map(); - void load_map(); - void print_map(); - - //void accept_pending(); // accept pending, new map. - //void send_incremental(epoch_t since, msg_addr_t dest); - - void handle_mds_state(class MMDSState *m); - void handle_mds_beacon(class MMDSBeacon *m); - //void handle_mds_failure(class MMDSFailure *m); - void handle_mds_getmap(class MMDSGetMap *m); - - - - public: - MDSMonitor(Monitor *mn, Messenger *m, Mutex& l) : mon(mn), messenger(m), lock(l) { - } - - void dispatch(Message *m); - void tick(); // check state, take actions - - void election_starting(); - void election_finished(); - - void send_latest(entity_inst_t dest); - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/mon/MonMap.h b/tags/20070517_before_mds_merge/mon/MonMap.h deleted file mode 100644 index d8e66c51b589e..0000000000000 --- a/tags/20070517_before_mds_merge/mon/MonMap.h +++ /dev/null @@ -1,103 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * 
modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MONMAP_H -#define __MONMAP_H - -#include -#include -#include - -#include "msg/Message.h" -#include "include/types.h" - -class MonMap { - public: - epoch_t epoch; // what epoch of the osd cluster descriptor is this - int num_mon; - vector mon_inst; - - int last_mon; // last mon i talked to - - MonMap(int s=0) : epoch(0), num_mon(s), mon_inst(s), last_mon(-1) {} - - void add_mon(entity_inst_t inst) { - mon_inst.push_back(inst); - num_mon++; - } - - // pick a mon. - // choice should be stable, unless we explicitly ask for a new one. - int pick_mon(bool newmon=false) { - if (newmon || (last_mon < 0)) { - last_mon = 0; //last_mon = rand() % num_mon; - } - return last_mon; - } - - const entity_inst_t &get_inst(int m) { - assert(m < num_mon); - return mon_inst[m]; - } - - void encode(bufferlist& blist) { - blist.append((char*)&epoch, sizeof(epoch)); - blist.append((char*)&num_mon, sizeof(num_mon)); - - _encode(mon_inst, blist); - } - - void decode(bufferlist& blist) { - int off = 0; - blist.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - blist.copy(off, sizeof(num_mon), (char*)&num_mon); - off += sizeof(num_mon); - - _decode(mon_inst, blist, off); - } - - int write(char *fn) { - // encode - bufferlist bl; - encode(bl); - - // write - int fd = ::open(fn, O_RDWR|O_CREAT); - if (fd < 0) return fd; - ::fchmod(fd, 0644); - ::write(fd, (void*)bl.c_str(), bl.length()); - ::close(fd); - return 0; - } - - int read(char *fn) { - // read - bufferlist bl; - int fd = ::open(fn, O_RDONLY); - if (fd < 0) return fd; - struct stat st; - ::fstat(fd, &st); - bufferptr bp(st.st_size); - bl.append(bp); - ::read(fd, (void*)bl.c_str(), bl.length()); - ::close(fd); - - // decode - decode(bl); - return 0; - } - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/mon/Monitor.cc 
b/tags/20070517_before_mds_merge/mon/Monitor.cc deleted file mode 100644 index 8bf1d2f0cfe21..0000000000000 --- a/tags/20070517_before_mds_merge/mon/Monitor.cc +++ /dev/null @@ -1,303 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -// TODO: missing run() method, which creates the two main timers, refreshTimer and readTimer - -#include "Monitor.h" - -#include "osd/OSDMap.h" - -#include "MonitorStore.h" - -#include "msg/Message.h" -#include "msg/Messenger.h" - -#include "messages/MPing.h" -#include "messages/MPingAck.h" -#include "messages/MGenericMessage.h" - -#include "messages/MMonPaxos.h" - -#include "common/Timer.h" -#include "common/Clock.h" - -#include "OSDMonitor.h" -#include "MDSMonitor.h" -#include "ClientMonitor.h" - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << whoami << (is_starting() ? (const char*)"(starting)":(is_leader() ? (const char*)"(leader)":(is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << whoami << (is_starting() ? (const char*)"(starting)":(is_leader() ? (const char*)"(leader)":(is_peon() ? 
(const char*)"(peon)":(const char*)"(?\?)"))) << " " - - - -void Monitor::init() -{ - lock.Lock(); - - dout(1) << "init" << endl; - - // store - char s[80]; - sprintf(s, "mondata/mon%d", whoami); - store = new MonitorStore(s); - - if (g_conf.mkfs) - store->mkfs(); - - store->mount(); - - // create - osdmon = new OSDMonitor(this, messenger, lock); - mdsmon = new MDSMonitor(this, messenger, lock); - clientmon = new ClientMonitor(this, messenger, lock); - - // i'm ready! - messenger->set_dispatcher(this); - - // start ticker - reset_tick(); - - // call election? - if (monmap->num_mon > 1) { - assert(monmap->num_mon != 2); - call_election(); - } else { - // we're standalone. - set q; - q.insert(whoami); - win_election(q); - } - - lock.Unlock(); -} - -void Monitor::shutdown() -{ - dout(1) << "shutdown" << endl; - - // cancel all events - cancel_tick(); - timer.cancel_all(); - timer.join(); - - // stop osds. - for (set::iterator it = osdmon->osdmap.get_osds().begin(); - it != osdmon->osdmap.get_osds().end(); - it++) { - if (osdmon->osdmap.is_down(*it)) continue; - dout(10) << "sending shutdown to osd" << *it << endl; - messenger->send_message(new MGenericMessage(MSG_SHUTDOWN), - osdmon->osdmap.get_inst(*it)); - } - osdmon->mark_all_down(); - - // monitors too. - for (int i=0; inum_mon; i++) - if (i != whoami) - messenger->send_message(new MGenericMessage(MSG_SHUTDOWN), - monmap->get_inst(i)); - - // unmount my local storage - if (store) - delete store; - - // clean up - if (monmap) delete monmap; - if (osdmon) delete osdmon; - if (mdsmon) delete mdsmon; - if (clientmon) delete clientmon; - - // die. 
- messenger->shutdown(); - delete messenger; -} - - -void Monitor::call_election() -{ - if (monmap->num_mon == 1) return; - - dout(10) << "call_election" << endl; - state = STATE_STARTING; - - elector.start(); - - osdmon->election_starting(); - //mdsmon->election_starting(); -} - -void Monitor::win_election(set& active) -{ - state = STATE_LEADER; - leader = whoami; - quorum = active; - dout(10) << "win_election, quorum is " << quorum << endl; - - // init - osdmon->election_finished(); - mdsmon->election_finished(); - - // init paxos - test_paxos.leader_start(); -} - -void Monitor::lose_election(int l) -{ - state = STATE_PEON; - leader = l; - dout(10) << "lose_election, leader is mon" << leader << endl; -} - - - -void Monitor::dispatch(Message *m) -{ - lock.Lock(); - { - switch (m->get_type()) { - - // misc - case MSG_PING_ACK: - handle_ping_ack((MPingAck*)m); - break; - - case MSG_SHUTDOWN: - assert(m->get_source().is_osd()); - osdmon->dispatch(m); - break; - - - // OSDs - case MSG_OSD_GETMAP: - case MSG_OSD_FAILURE: - case MSG_OSD_BOOT: - case MSG_OSD_IN: - case MSG_OSD_OUT: - osdmon->dispatch(m); - break; - - - // MDSs - case MSG_MDS_BEACON: - case MSG_MDS_GETMAP: - mdsmon->dispatch(m); - - // hackish: did all mds's shut down? - if (g_conf.mon_stop_with_last_mds && - mdsmon->mdsmap.get_num_up_or_failed_mds() == 0) - shutdown(); - - break; - - // clients - case MSG_CLIENT_BOOT: - clientmon->dispatch(m); - break; - - - // paxos - case MSG_MON_PAXOS: - // send it to the right paxos instance - switch (((MMonPaxos*)m)->machine_id) { - case PAXOS_TEST: - test_paxos.dispatch(m); - break; - case PAXOS_OSDMAP: - //... 
- - default: - assert(0); - } - break; - - // elector messages - case MSG_MON_ELECTION_PROPOSE: - case MSG_MON_ELECTION_ACK: - case MSG_MON_ELECTION_VICTORY: - elector.dispatch(m); - break; - - - default: - dout(0) << "unknown message " << *m << endl; - assert(0); - } - } - lock.Unlock(); -} - - -void Monitor::handle_shutdown(Message *m) -{ - dout(1) << "shutdown from " << m->get_source() << endl; - - shutdown(); - delete m; -} - -void Monitor::handle_ping_ack(MPingAck *m) -{ - // ... - - delete m; -} - - - - -/************ TICK ***************/ - -class C_Mon_Tick : public Context { - Monitor *mon; -public: - C_Mon_Tick(Monitor *m) : mon(m) {} - void finish(int r) { - mon->tick(); - } -}; - -void Monitor::cancel_tick() -{ - if (tick_timer) timer.cancel_event(tick_timer); -} - -void Monitor::reset_tick() -{ - cancel_tick(); - tick_timer = new C_Mon_Tick(this); - timer.add_event_after(g_conf.mon_tick_interval, tick_timer); -} - - -void Monitor::tick() -{ - tick_timer = 0; - - // ok go. - dout(11) << "tick" << endl; - - osdmon->tick(); - mdsmon->tick(); - - // next tick! - reset_tick(); -} - - - - - - - diff --git a/tags/20070517_before_mds_merge/mon/Monitor.h b/tags/20070517_before_mds_merge/mon/Monitor.h deleted file mode 100644 index 6554ad36239b1..0000000000000 --- a/tags/20070517_before_mds_merge/mon/Monitor.h +++ /dev/null @@ -1,139 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MONITOR_H -#define __MONITOR_H - -#include "include/types.h" -#include "msg/Messenger.h" - -#include "common/Timer.h" - -#include "MonMap.h" -#include "Elector.h" -#include "Paxos.h" - - -class MonitorStore; -class OSDMonitor; -class MDSMonitor; -class ClientMonitor; - -#define PAXOS_TEST 0 -#define PAXOS_OSDMAP 1 -#define PAXOS_MDSMAP 2 -#define PAXOS_CLIENTMAP 3 - -class Monitor : public Dispatcher { -protected: - // me - int whoami; - Messenger *messenger; - Mutex lock; - - MonMap *monmap; - - // timer. - SafeTimer timer; - Context *tick_timer; - void cancel_tick(); - void reset_tick(); - friend class C_Mon_Tick; - - // my local store - //ObjectStore *store; - MonitorStore *store; - - const static int INO_ELECTOR = 1; - const static int INO_MON_MAP = 2; - const static int INO_OSD_MAP = 10; - const static int INO_OSD_INC_MAP = 11; - const static int INO_MDS_MAP = 20; - - // elector - Elector elector; - friend class Elector; - - epoch_t mon_epoch; // monitor epoch (election instance) - set quorum; // current active set of monitors (if !starting) - - //void call_election(); - - // paxos - Paxos test_paxos; - friend class Paxos; - - - // monitor state - const static int STATE_STARTING = 0; // electing - const static int STATE_LEADER = 1; - const static int STATE_PEON = 2; - int state; - - int leader; // current leader (to best of knowledge) - utime_t last_called_election; // [starting] last time i called an election - - bool is_starting() { return state == STATE_STARTING; } - bool is_leader() { return state == STATE_LEADER; } - bool is_peon() { return state == STATE_PEON; } - - // my public services - OSDMonitor *osdmon; - MDSMonitor *mdsmon; - ClientMonitor *clientmon; - - // messages - void handle_shutdown(Message *m); - void handle_ping_ack(class MPingAck *m); - - friend class OSDMonitor; - friend class MDSMonitor; - friend class ClientMonitor; - - // initiate election - void call_election(); - - // end election (called by Elector) - void 
win_election(set& q); - void lose_election(int l); - - - - public: - Monitor(int w, Messenger *m, MonMap *mm) : - whoami(w), - messenger(m), - monmap(mm), - timer(lock), tick_timer(0), - store(0), - elector(this, w), - mon_epoch(0), - - test_paxos(this, w, PAXOS_TEST, "tester"), // tester state machine - - state(STATE_STARTING), - leader(0), - osdmon(0), mdsmon(0), clientmon(0) - { - } - - - void init(); - void shutdown(); - void dispatch(Message *m); - void tick(); - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/mon/MonitorStore.cc b/tags/20070517_before_mds_merge/mon/MonitorStore.cc deleted file mode 100644 index f5a10696c7ada..0000000000000 --- a/tags/20070517_before_mds_merge/mon/MonitorStore.cc +++ /dev/null @@ -1,224 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "MonitorStore.h" -#include "common/Clock.h" - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " store(" << dir <<") " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " store(" << dir <<") " - -#include -#include -#include -#include -#include - -void MonitorStore::mount() -{ - dout(1) << "mount" << endl; - // verify dir exists - DIR *d = ::opendir(dir.c_str()); - if (!d) { - derr(1) << "basedir " << dir << " dne" << endl; - assert(0); - } - ::closedir(d); - - if (g_conf.use_abspaths) { - // combine it with the cwd, in case fuse screws things up (i.e. 
fakefuse) - string old = dir; - char *cwd = get_current_dir_name(); - dir = cwd; - delete cwd; - dir += "/"; - dir += old; - } -} - - -void MonitorStore::mkfs() -{ - dout(1) << "mkfs" << endl; - - char cmd[200]; - sprintf(cmd, "test -d %s && /bin/rm -r %s ; mkdir -p %s", dir.c_str(), dir.c_str(), dir.c_str()); - dout(1) << cmd << endl; - system(cmd); -} - - -version_t MonitorStore::get_int(const char *a, const char *b) -{ - char fn[200]; - if (b) - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - else - sprintf(fn, "%s/%s", dir.c_str(), a); - - FILE *f = ::fopen(fn, "r"); - if (!f) - return 0; - - char buf[20]; - ::fgets(buf, 20, f); - ::fclose(f); - - version_t val = atoi(buf); - - if (b) { - dout(15) << "get_int " << a << "/" << b << " = " << val << endl; - } else { - dout(15) << "get_int " << a << " = " << val << endl; - } - return val; -} - - -void MonitorStore::put_int(version_t val, const char *a, const char *b) -{ - char fn[200]; - sprintf(fn, "%s/%s", dir.c_str(), a); - if (b) { - ::mkdir(fn, 0755); - dout(15) << "set_int " << a << "/" << b << " = " << val << endl; - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - } else { - dout(15) << "set_int " << a << " = " << val << endl; - } - - char vs[30]; -#ifdef __LP64__ - sprintf(vs, "%ld\n", val); -#else - sprintf(vs, "%lld\n", val); -#endif - - char tfn[200]; - sprintf(tfn, "%s.new", fn); - - int fd = ::open(tfn, O_WRONLY|O_CREAT); - assert(fd > 0); - ::fchmod(fd, 0644); - ::write(fd, vs, strlen(vs)); - ::close(fd); - ::rename(tfn, fn); -} - - -// ---------------------------------------- -// buffers - -bool MonitorStore::exists_bl_ss(const char *a, const char *b) -{ - char fn[200]; - if (b) { - dout(15) << "exists_bl " << a << "/" << b << endl; - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - } else { - dout(15) << "exists_bl " << a << endl; - sprintf(fn, "%s/%s", dir.c_str(), a); - } - - struct stat st; - int r = ::stat(fn, &st); - return r == 0; -} - - -int MonitorStore::get_bl_ss(bufferlist& bl, const char *a, 
const char *b) -{ - char fn[200]; - if (b) { - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - } else { - sprintf(fn, "%s/%s", dir.c_str(), a); - } - - int fd = ::open(fn, O_RDONLY); - if (!fd) { - if (b) { - dout(15) << "get_bl " << a << "/" << b << " DNE" << endl; - } else { - dout(15) << "get_bl " << a << " DNE" << endl; - } - return 0; - } - - // get size - struct stat st; - int rc = ::fstat(fd, &st); - assert(rc == 0); - __int32_t len = st.st_size; - - // read buffer - bl.clear(); - bufferptr bp(len); - int off = 0; - while (off < len) { - dout(20) << "reading at off " << off << " of " << len << endl; - int r = ::read(fd, bp.c_str()+off, len-off); - if (r < 0) derr(0) << "errno on read " << strerror(errno) << endl; - assert(r>0); - off += r; - } - bl.append(bp); - ::close(fd); - - if (b) { - dout(15) << "get_bl " << a << "/" << b << " = " << bl.length() << " bytes" << endl; - } else { - dout(15) << "get_bl " << a << " = " << bl.length() << " bytes" << endl; - } - - return len; -} - -int MonitorStore::put_bl_ss(bufferlist& bl, const char *a, const char *b) -{ - char fn[200]; - sprintf(fn, "%s/%s", dir.c_str(), a); - if (b) { - ::mkdir(fn, 0755); - dout(15) << "put_bl " << a << "/" << b << " = " << bl.length() << " bytes" << endl; - sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); - } else { - dout(15) << "put_bl " << a << " = " << bl.length() << " bytes" << endl; - } - - char tfn[200]; - sprintf(tfn, "%s.new", fn); - int fd = ::open(tfn, O_WRONLY|O_CREAT); - assert(fd); - - // chmod - ::fchmod(fd, 0644); - - // write data - for (list::const_iterator it = bl.buffers().begin(); - it != bl.buffers().end(); - it++) { - int r = ::write(fd, it->c_str(), it->length()); - if (r != (int)it->length()) - derr(0) << "put_bl_ss ::write() returned " << r << " not " << it->length() << endl; - if (r < 0) - derr(0) << "put_bl_ss ::write() errored out, errno is " << strerror(errno) << endl; - } - - ::fsync(fd); - ::close(fd); - ::rename(tfn, fn); - - return 0; -} diff --git 
a/tags/20070517_before_mds_merge/mon/MonitorStore.h b/tags/20070517_before_mds_merge/mon/MonitorStore.h deleted file mode 100644 index 122118f33f556..0000000000000 --- a/tags/20070517_before_mds_merge/mon/MonitorStore.h +++ /dev/null @@ -1,81 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MON_MONITORSTORE_H -#define __MON_MONITORSTORE_H - -#include "include/types.h" -#include "include/buffer.h" - -#include - -class MonitorStore { - string dir; - -public: - MonitorStore(char *d) : dir(d) { - } - ~MonitorStore() { - } - - void mkfs(); // wipe - void mount(); - - // ints (stored as ascii) - version_t get_int(const char *a, const char *b=0); - void put_int(version_t v, const char *a, const char *b=0); - - // buffers - // ss and sn varieties. 
- bool exists_bl_ss(const char *a, const char *b=0); - int get_bl_ss(bufferlist& bl, const char *a, const char *b); - int put_bl_ss(bufferlist& bl, const char *a, const char *b); - bool exists_bl_sn(const char *a, version_t b) { - char bs[20]; -#ifdef __LP64__ - sprintf(bs, "%lu", b); -#else - sprintf(bs, "%llu", b); -#endif - return exists_bl_ss(a, bs); - } - int get_bl_sn(bufferlist& bl, const char *a, version_t b) { - char bs[20]; -#ifdef __LP64__ - sprintf(bs, "%lu", b); -#else - sprintf(bs, "%llu", b); -#endif - return get_bl_ss(bl, a, bs); - } - int put_bl_sn(bufferlist& bl, const char *a, version_t b) { - char bs[20]; -#ifdef __LP64__ - sprintf(bs, "%lu", b); -#else - sprintf(bs, "%llu", b); -#endif - return put_bl_ss(bl, a, bs); - } - - /* - version_t get_incarnation() { return get_int("incarnation"); } - void set_incarnation(version_t i) { set_int(i, "incarnation"); } - - version_t get_last_proposal() { return get_int("last_proposal"); } - void set_last_proposal(version_t i) { set_int(i, "last_proposal"); } - */ -}; - - -#endif diff --git a/tags/20070517_before_mds_merge/mon/OSDMonitor.cc b/tags/20070517_before_mds_merge/mon/OSDMonitor.cc deleted file mode 100644 index 8a1ff495515fd..0000000000000 --- a/tags/20070517_before_mds_merge/mon/OSDMonitor.cc +++ /dev/null @@ -1,902 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "OSDMonitor.h" -#include "Monitor.h" -#include "MDSMonitor.h" - -#include "MonitorStore.h" - -#include "messages/MOSDFailure.h" -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" -#include "messages/MOSDBoot.h" -#include "messages/MOSDIn.h" -#include "messages/MOSDOut.h" - -#include "messages/MMonOSDMapInfo.h" -#include "messages/MMonOSDMapLease.h" -#include "messages/MMonOSDMapLeaseAck.h" -#include "messages/MMonOSDMapUpdatePrepare.h" -#include "messages/MMonOSDMapUpdateAck.h" -#include "messages/MMonOSDMapUpdateCommit.h" - -#include "common/Timer.h" - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".osd(" << (state == STATE_INIT ? (const char*)"init":(state == STATE_SYNC ? (const char*)"sync":(state == STATE_LOCK ? (const char*)"lock":(state == STATE_UPDATING ? (const char*)"updating":(const char*)"?\?")))) << ") e" << osdmap.get_epoch() << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".osd(" << (state == STATE_INIT ? (const char*)"init":(state == STATE_SYNC ? (const char*)"sync":(state == STATE_LOCK ? (const char*)"lock":(state == STATE_UPDATING ? 
(const char*)"updating":(const char*)"?\?")))) << ") e" << osdmap.get_epoch() << " " - - -class C_Mon_FakeOSDFailure : public Context { - OSDMonitor *mon; - int osd; - bool down; -public: - C_Mon_FakeOSDFailure(OSDMonitor *m, int o, bool d) : mon(m), osd(o), down(d) {} - void finish(int r) { - mon->fake_osd_failure(osd,down); - } -}; - - -void OSDMonitor::fake_osdmap_update() -{ - dout(1) << "fake_osdmap_update" << endl; - accept_pending(); - - // tell a random osd - int osd = rand() % g_conf.num_osd; - send_incremental(osdmap.get_epoch()-1, // ick! FIXME - osdmap.get_inst(osd)); -} - - -void OSDMonitor::fake_reorg() -{ - int r = rand() % g_conf.num_osd; - - if (osdmap.is_out(r)) { - dout(1) << "fake_reorg marking osd" << r << " in" << endl; - pending_inc.new_in.push_back(r); - } else { - dout(1) << "fake_reorg marking osd" << r << " out" << endl; - pending_inc.new_out.push_back(r); - } - - accept_pending(); - - // tell him! - send_incremental(osdmap.get_epoch()-1, osdmap.get_inst(r)); - - // do it again? - /* - if (g_conf.num_osd - d > 4 && - g_conf.num_osd - d > g_conf.num_osd/2) - mon->timer.add_event_after(g_conf.fake_osdmap_expand, - new C_Mon_Faker(this)); - */ -} - - - -/* -void OSDMonitor::init() -{ - // start with blank map - - // load my last state from the store - bufferlist bl; - if (get_map_bl(0, bl)) { // FIXME - // yay! - osdmap.decode(bl); - dout(1) << "init got epoch " << osdmap.get_epoch() << " from store" << endl; - - // set up pending_inc - pending_inc.epoch = osdmap.get_epoch()+1; - } -} -*/ - - - - -/************ MAPS ****************/ - - -void OSDMonitor::create_initial() -{ - dout(1) << "create_initial generating osdmap from g_conf" << endl; - - // - osdmap.mon_epoch = mon->mon_epoch; - osdmap.ctime = g_clock.now(); - - if (g_conf.osd_pg_bits) { - osdmap.set_pg_bits(g_conf.osd_pg_bits); - } else { - // figure out how many bits worth of osds we have. 
- // 1 osd -> 0 bits - // <= 2 osds -> 1 bit - // <= 4 osds -> 2 bits - int osdbits = -1; - int n = g_conf.num_osd; - assert(n > 0); - while (n) { - n = n >> 1; - osdbits++; - } - - // 7 bits per osd. - osdmap.set_pg_bits(osdbits + 4); // FIXME - } - - // start at epoch 0 until all osds boot - //osdmap.inc_epoch(); // = 1 - //assert(osdmap.get_epoch() == 1); - - if (g_conf.num_osd >= 12) { - int ndom = g_conf.osd_max_rep; - UniformBucket *domain[ndom]; - int domid[ndom]; - for (int i=0; iadd_item(i, 1.0); - //cerr << "osd" << i << " in domain " << dom << endl; - i++; - if (i == g_conf.num_osd) break; - } - } - - // root - Bucket *root = new ListBucket(2); - for (int i=0; iget_weight() << endl; - root->add_item(domid[i], domain[i]->get_weight()); - } - int nroot = osdmap.crush.add_bucket(root); - - // rules - for (int i=1; i<=ndom; i++) { - osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_TAKE, nroot)); - osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 1)); - osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 1, 0)); - osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - } - - // test - //vector out; - //osdmap.pg_to_osds(0x40200000110ULL, out); - - } else { - // one bucket - Bucket *b = new UniformBucket(1, 0); - int root = osdmap.crush.add_bucket(b); - for (int i=0; iadd_item(i, 1.0); - } - - for (int i=1; i<=g_conf.osd_max_rep; i++) { - osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 0)); - osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - } - } - - if (g_conf.mds_local_osd) { - // add mds osds, but don't put them in the crush mapping func - for (int i=0; i - - // fake osd failures - for (map::iterator i = g_fake_osd_down.begin(); - i != g_fake_osd_down.end(); - i++) { - dout(0) << "will fake osd" << i->first << " DOWN after " << i->second << endl; - mon->timer.add_event_after(i->second, new 
C_Mon_FakeOSDFailure(this, i->first, 1)); - } - for (map::iterator i = g_fake_osd_out.begin(); - i != g_fake_osd_out.end(); - i++) { - dout(0) << "will fake osd" << i->first << " OUT after " << i->second << endl; - mon->timer.add_event_after(i->second, new C_Mon_FakeOSDFailure(this, i->first, 0)); - } -} - - -bool OSDMonitor::get_map_bl(epoch_t epoch, bufferlist& bl) -{ - if (!mon->store->exists_bl_sn("osdmap", epoch)) - return false; - int r = mon->store->get_bl_sn(bl, "osdmap", epoch); - assert(r > 0); - return true; -} - -bool OSDMonitor::get_inc_map_bl(epoch_t epoch, bufferlist& bl) -{ - if (!mon->store->exists_bl_sn("osdincmap", epoch)) - return false; - int r = mon->store->get_bl_sn(bl, "osdincmap", epoch); - assert(r > 0); - return true; -} - - -void OSDMonitor::save_map() -{ - bufferlist bl; - osdmap.encode(bl); - - mon->store->put_bl_sn(bl, "osdmap", osdmap.get_epoch()); - mon->store->put_int(osdmap.get_epoch(), "osd_epoch"); -} - -void OSDMonitor::save_inc_map(OSDMap::Incremental &inc) -{ - bufferlist bl; - osdmap.encode(bl); - - bufferlist incbl; - inc.encode(incbl); - - mon->store->put_bl_sn(bl, "osdmap", osdmap.get_epoch()); - mon->store->put_bl_sn(incbl, "osdincmap", osdmap.get_epoch()); - mon->store->put_int(osdmap.get_epoch(), "osd_epoch"); -} - - - -void OSDMonitor::dispatch(Message *m) -{ - switch (m->get_type()) { - - // services - case MSG_OSD_GETMAP: - handle_osd_getmap((MOSDGetMap*)m); - break; - case MSG_OSD_FAILURE: - handle_osd_failure((MOSDFailure*)m); - break; - case MSG_OSD_BOOT: - handle_osd_boot((MOSDBoot*)m); - break; - case MSG_OSD_IN: - handle_osd_in((MOSDIn*)m); - break; - case MSG_OSD_OUT: - handle_osd_out((MOSDOut*)m); - break; - - // replication - case MSG_MON_OSDMAP_INFO: - handle_info((MMonOSDMapInfo*)m); - break; - case MSG_MON_OSDMAP_LEASE: - handle_lease((MMonOSDMapLease*)m); - break; - case MSG_MON_OSDMAP_LEASE_ACK: - handle_lease_ack((MMonOSDMapLeaseAck*)m); - break; - case MSG_MON_OSDMAP_UPDATE_PREPARE: - 
handle_update_prepare((MMonOSDMapUpdatePrepare*)m); - break; - case MSG_MON_OSDMAP_UPDATE_ACK: - handle_update_ack((MMonOSDMapUpdateAck*)m); - break; - case MSG_MON_OSDMAP_UPDATE_COMMIT: - handle_update_commit((MMonOSDMapUpdateCommit*)m); - break; - - default: - assert(0); - } -} - - - -void OSDMonitor::handle_osd_failure(MOSDFailure *m) -{ - dout(1) << "osd failure: " << m->get_failed() << " from " << m->get_source() << endl; - - // FIXME - // take their word for it - int from = m->get_failed().name.num(); - if (osdmap.is_up(from) && - (osdmap.osd_inst.count(from) == 0 || - osdmap.osd_inst[from] == m->get_failed())) { - pending_inc.new_down[from] = m->get_failed(); - - if (osdmap.is_in(from)) - down_pending_out[from] = g_clock.now(); - - //awaiting_maps[pending_inc.epoch][m->get_source()] = - - accept_pending(); - - send_incremental(m->get_epoch(), m->get_source_inst()); - - send_waiting(); - bcast_latest_mds(); - } - - delete m; -} - - -void OSDMonitor::fake_osd_failure(int osd, bool down) -{ - if (down) { - dout(1) << "fake_osd_failure DOWN osd" << osd << endl; - pending_inc.new_down[osd] = osdmap.osd_inst[osd]; - } else { - dout(1) << "fake_osd_failure OUT osd" << osd << endl; - pending_inc.new_out.push_back(osd); - } - accept_pending(); - bcast_latest_osd(); - bcast_latest_mds(); -} - -void OSDMonitor::mark_all_down() -{ - dout(7) << "mark_all_down" << endl; - - for (set::iterator it = osdmap.get_osds().begin(); - it != osdmap.get_osds().end(); - it++) { - if (osdmap.is_down(*it)) continue; - pending_inc.new_down[*it] = osdmap.get_inst(*it); - } - accept_pending(); -} - - - - -void OSDMonitor::handle_osd_boot(MOSDBoot *m) -{ - dout(7) << "osd_boot from " << m->get_source() << endl; - assert(m->get_source().is_osd()); - int from = m->get_source().num(); - - if (osdmap.get_epoch() == 0) { - // waiting for boot! - osdmap.osd_inst[from] = m->get_source_inst(); - - if (osdmap.osd_inst.size() == osdmap.osds.size()) { - dout(-7) << "osd_boot all osds booted." 
<< endl; - osdmap.inc_epoch(); - - save_map(); - - pending_inc.epoch = osdmap.get_epoch()+1; // 2 - - bcast_latest_osd(); - bcast_latest_mds(); - } else { - dout(7) << "osd_boot waiting for " - << (osdmap.osds.size() - osdmap.osd_inst.size()) - << " osds to boot" << endl; - } - - delete m; - return; - } - - // already up? mark down first? - if (osdmap.is_up(from)) { - pending_inc.new_down[from] = osdmap.osd_inst[from]; - accept_pending(); - } - - // mark up. - down_pending_out.erase(from); - assert(osdmap.is_down(from)); - pending_inc.new_up[from] = m->get_source_inst(); - - // mark in? - if (osdmap.out_osds.count(from)) - pending_inc.new_in.push_back(from); - - accept_pending(); - - // the booting osd will spread word - send_incremental(m->sb.current_epoch, m->get_source_inst()); - delete m; - - // tell mds - bcast_latest_mds(); -} - -void OSDMonitor::handle_osd_in(MOSDIn *m) -{ - dout(7) << "osd_in from " << m->get_source() << endl; - int from = m->get_source().num(); - - if (osdmap.is_out(from)) - pending_inc.new_in.push_back(from); - accept_pending(); - send_incremental(m->map_epoch, m->get_source_inst()); -} - -void OSDMonitor::handle_osd_out(MOSDOut *m) -{ - dout(7) << "osd_out from " << m->get_source() << endl; - int from = m->get_source().num(); - if (osdmap.is_in(from)) { - pending_inc.new_out.push_back(from); - accept_pending(); - send_incremental(m->map_epoch, m->get_source_inst()); - } -} - -void OSDMonitor::handle_osd_getmap(MOSDGetMap *m) -{ - dout(7) << "osd_getmap from " << m->get_source() << " since " << m->get_since() << endl; - - if (osdmap.get_epoch() == 0) { - awaiting_map[m->get_source()].first = m->get_source_inst(); - awaiting_map[m->get_source()].second = m->get_since(); - } else { - //if (m->get_since()) - send_incremental(m->get_since(), m->get_source_inst()); - //else - //send_full(m->get_source(), m->get_source_inst()); - } - delete m; -} - - - -void OSDMonitor::accept_pending() -{ - dout(-10) << "accept_pending " << osdmap.get_epoch() 
<< " -> " << pending_inc.epoch << endl; - - // accept pending into a new map! - pending_inc.ctime = g_clock.now(); - pending_inc.mon_epoch = mon->mon_epoch; - - // advance! - osdmap.apply_incremental(pending_inc); - - // save it. - save_inc_map( pending_inc ); - - // tell me about it - for (map::iterator i = pending_inc.new_up.begin(); - i != pending_inc.new_up.end(); - i++) { - dout(0) << "osd" << i->first << " UP " << i->second << endl; - derr(0) << "osd" << i->first << " UP " << i->second << endl; - } - for (map::iterator i = pending_inc.new_down.begin(); - i != pending_inc.new_down.end(); - i++) { - dout(0) << "osd" << i->first << " DOWN " << i->second << endl; - derr(0) << "osd" << i->first << " DOWN " << i->second << endl; - messenger->mark_down(i->second.addr); - } - for (list::iterator i = pending_inc.new_in.begin(); - i != pending_inc.new_in.end(); - i++) { - dout(0) << "osd" << *i << " IN" << endl; - derr(0) << "osd" << *i << " IN" << endl; - } - for (list::iterator i = pending_inc.new_out.begin(); - i != pending_inc.new_out.end(); - i++) { - dout(0) << "osd" << *i << " OUT" << endl; - derr(0) << "osd" << *i << " OUT" << endl; - } - - // clear new pending - OSDMap::Incremental next(osdmap.get_epoch() + 1); - pending_inc = next; -} - -void OSDMonitor::send_waiting() -{ - dout(10) << "send_waiting " << osdmap.get_epoch() << endl; - - for (map >::iterator i = awaiting_map.begin(); - i != awaiting_map.end(); - i++) - send_incremental(i->second.second, i->second.first); -} - - -void OSDMonitor::send_full(entity_inst_t who) -{ - messenger->send_message(new MOSDMap(&osdmap), who); -} - -void OSDMonitor::send_incremental(epoch_t since, entity_inst_t dest) -{ - dout(5) << "osd_send_incremental " << since << " -> " << osdmap.get_epoch() - << " to " << dest << endl; - - MOSDMap *m = new MOSDMap; - - for (epoch_t e = osdmap.get_epoch(); - e > since; - e--) { - bufferlist bl; - if (get_inc_map_bl(e, bl)) { - dout(10) << "osd_send_incremental inc " << e << endl; - 
m->incremental_maps[e] = bl; - } - else if (get_map_bl(e, bl)) { - dout(10) << "osd_send_incremental full " << e << endl; - m->maps[e] = bl; - } - else { - assert(0); // we should have all maps. - } - } - - messenger->send_message(m, dest); -} - - - -void OSDMonitor::bcast_latest_mds() -{ - epoch_t e = osdmap.get_epoch(); - dout(1) << "bcast_latest_mds epoch " << e << endl; - - // tell mds - set up; - mon->mdsmon->mdsmap.get_up_mds_set(up); - for (set::iterator i = up.begin(); - i != up.end(); - i++) { - send_incremental(osdmap.get_epoch()-1, mon->mdsmon->mdsmap.get_inst(*i)); - } -} - -void OSDMonitor::bcast_latest_osd() -{ - epoch_t e = osdmap.get_epoch(); - dout(1) << "bcast_latest_osd epoch " << e << endl; - - // tell osds - set osds; - osdmap.get_all_osds(osds); - for (set::iterator it = osds.begin(); - it != osds.end(); - it++) { - if (osdmap.is_down(*it)) continue; - - send_incremental(osdmap.get_epoch()-1, osdmap.get_inst(*it)); - } -} - - - -void OSDMonitor::tick() -{ - // mark down osds out? - utime_t now = g_clock.now(); - list mark_out; - for (map::iterator i = down_pending_out.begin(); - i != down_pending_out.end(); - i++) { - utime_t down = now; - down -= i->second; - - if (down.sec() >= g_conf.mon_osd_down_out_interval) { - dout(10) << "tick marking osd" << i->first << " OUT after " << down << " sec" << endl; - mark_out.push_back(i->first); - } - } - for (list::iterator i = mark_out.begin(); - i != mark_out.end(); - i++) { - down_pending_out.erase(*i); - pending_inc.new_out.push_back( *i ); - } - if (!mark_out.empty()) { - accept_pending(); - - // hrmpf. bcast map for now. FIXME FIXME. 
- bcast_latest_osd(); - } -} - -void OSDMonitor::election_starting() -{ - dout(10) << "election_starting" << endl; -} - -void OSDMonitor::election_finished() -{ - dout(10) << "election_finished" << endl; - - if (mon->is_leader()) { - if (g_conf.mkfs) { - create_initial(); - save_map(); - } else { - // - epoch_t epoch = mon->store->get_int("osd_epoch"); - dout(10) << " last epoch was " << epoch << endl; - bufferlist bl, blinc; - int r = mon->store->get_bl_sn(bl, "osdmap", epoch); - assert(r>0); - osdmap.decode(bl); - - // pending_inc - pending_inc.epoch = epoch+1; - } - - } - - /* - state = STATE_INIT; - - // map? - if (osdmap.get_epoch() == 0 && - mon->is_leader()) { - create_initial(); - } - - - - if (mon->is_leader()) { - // leader. - if (mon->monmap->num_mon == 1) { - // hmm, it's just me! - state = STATE_SYNC; - } - } - else if (mon->is_peon()) { - // peon. send info - //messenger->send_message(new MMonOSDMapInfo(osdmap.epoch, osdmap.mon_epoch), - // mon->monmap->get_inst(mon->leader)); - } - */ -} - - - -void OSDMonitor::handle_info(MMonOSDMapInfo *m) -{ - dout(10) << "handle_info from " << m->get_source() - << " epoch " << m->get_epoch() << " in mon_epoch " << m->get_mon_epoch() - << endl; - - epoch_t epoch = m->get_epoch(); - - // did they have anything? - if (epoch > 0) { - // make sure it's current. 
- if (epoch == osdmap.get_epoch()) { - if (osdmap.mon_epoch != m->get_mon_epoch()) { - dout(10) << "handle_info had divergent epoch " << m->get_epoch() - << ", mon_epoch " << m->get_mon_epoch() << " != " << osdmap.mon_epoch << endl; - epoch--; - } - } else { - bufferlist bl; - get_map_bl(epoch, bl); - - OSDMap old; - old.decode(bl); - - if (old.mon_epoch != m->get_mon_epoch()) { - dout(10) << "handle_info had divergent epoch " << m->get_epoch() - << ", mon_epoch " << m->get_mon_epoch() << " != " << old.mon_epoch << endl; - epoch--; - } - } - } - - // bring up to date - if (epoch < osdmap.get_epoch()) - send_incremental(epoch, m->get_source_inst()); - - delete m; -} - - -void OSDMonitor::issue_leases() -{ - dout(10) << "issue_leases" << endl; - assert(mon->is_leader()); - - // set lease endpoint - lease_expire = g_clock.now(); - lease_expire += g_conf.mon_lease; - - pending_ack.clear(); - - for (set::iterator i = mon->quorum.begin(); - i != mon->quorum.end(); - i++) { - if (*i == mon->whoami) continue; - messenger->send_message(new MMonOSDMapLease(osdmap.get_epoch(), lease_expire), - mon->monmap->get_inst(*i)); - pending_ack.insert(*i); - } -} - -void OSDMonitor::handle_lease(MMonOSDMapLease *m) -{ - if (m->get_epoch() != osdmap.get_epoch() + 1) { - dout(10) << "map_lease from " << m->get_source() - << " on epoch " << m->get_epoch() << ", but i am " << osdmap.get_epoch() << endl; - assert(0); - delete m; - return; - } - - dout(10) << "map_lease from " << m->get_source() << " expires " << lease_expire << endl; - lease_expire = m->get_lease_expire(); - - delete m; -} - -void OSDMonitor::handle_lease_ack(MMonOSDMapLeaseAck *m) -{ - // right epoch? - if (m->get_epoch() != osdmap.get_epoch()) { - dout(10) << "map_lease_ack from " << m->get_source() - << " on old epoch " << m->get_epoch() << ", dropping" << endl; - delete m; - return; - } - - // within time limit? 
- if (g_clock.now() >= lease_expire) { - dout(10) << "map_lease_ack from " << m->get_source() - << ", but lease expired, calling election" << endl; - mon->call_election(); - delete m; - return; - } - - assert(m->get_source().is_mon()); - int from = m->get_source().num(); - - assert(pending_ack.count(from)); - pending_ack.erase(from); - - if (pending_ack.empty()) { - dout(10) << "map_lease_ack from " << m->get_source() - << ", last one" << endl; - } else { - dout(10) << "map_lease_ack from " << m->get_source() - << ", still waiting on " << pending_ack << endl; - } - - delete m; -} - - -void OSDMonitor::update_map() -{ - // lock map - state = STATE_UPDATING; - pending_ack.clear(); - - // set lease endpoint - lease_expire += g_conf.mon_lease; - - // send prepare - epoch_t epoch = osdmap.get_epoch(); - bufferlist map_bl, inc_map_bl; - if (!get_inc_map_bl(epoch, inc_map_bl)) - get_map_bl(epoch, map_bl); - - for (set::iterator i = mon->quorum.begin(); - i != mon->quorum.end(); - i++) { - if (*i == mon->whoami) continue; - messenger->send_message(new MMonOSDMapUpdatePrepare(epoch, - map_bl, inc_map_bl), - mon->monmap->get_inst(*i)); - pending_ack.insert(*i); - } -} - - - -void OSDMonitor::handle_update_prepare(MMonOSDMapUpdatePrepare *m) -{ - dout(10) << "map_update_prepare from " << m->get_source() << " epoch " << m->get_epoch() << endl; - // accept map - assert(m->get_epoch() == osdmap.get_epoch() + 1); - - if (m->inc_map_bl.length()) { - int off = 0; - pending_inc.decode(m->inc_map_bl, off); - accept_pending(); - } else { - osdmap.decode(m->map_bl); - } - - // state - state = STATE_LOCK; - //lease_expire = m->lease_expire; - - // ack - messenger->send_message(new MMonOSDMapUpdateAck(osdmap.get_epoch()), - m->get_source_inst()); - delete m; -} - -void OSDMonitor::handle_update_ack(MMonOSDMapUpdateAck *m) -{ - /* - // right epoch? 
- if (m->get_epoch() != osdmap.get_epoch()) { - dout(10) << "map_update_ack from " << m->get_source() - << " on old epoch " << m->get_epoch() << ", dropping" << endl; - delete m; - return; - } - - // within time limit? - if (g_clock.now() >= lease_expire) { - dout(10) << "map_update_ack from " << m->get_source() - << ", but lease expired, calling election" << endl; - state = STATE_SYNC; - mon->call_election(); - return; - } - - assert(m->get_source().is_mon()); - int from = m->get_source().num(); - - assert(pending_lease_ack.count(from)); - pending_lease_ack.erase(from); - - if (pending_lease_ack.empty()) { - dout(10) << "map_update_ack from " << m->get_source() - << ", last one" << endl; - state = STATE_SYNC; - - // send lease commit - for (map::iterator i = mon->quorum.begin(); - i != mon->quorum.end(); - i++) { - if (i == mon->whoami) continue; - messenger->send_message(new MMonOSDMapLeaseCommit(osdmap), - MSG_ADDR_MON(*i), mon->monmap->get_inst(*i)); - } - } else { - dout(10) << "map_update_ack from " << m->get_source() - << ", still waiting on " << pending_lease_ack << endl; - } -*/ -} - -void OSDMonitor::handle_update_commit(MMonOSDMapUpdateCommit *m) -{ -} diff --git a/tags/20070517_before_mds_merge/mon/OSDMonitor.h b/tags/20070517_before_mds_merge/mon/OSDMonitor.h deleted file mode 100644 index bf393f17d9f7a..0000000000000 --- a/tags/20070517_before_mds_merge/mon/OSDMonitor.h +++ /dev/null @@ -1,110 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __OSDMONITOR_H -#define __OSDMONITOR_H - -#include -#include -using namespace std; - -#include "include/types.h" -#include "msg/Messenger.h" - -#include "osd/OSDMap.h" - -class Monitor; - -class OSDMonitor : public Dispatcher { - Monitor *mon; - Messenger *messenger; - Mutex &lock; - - // osd maps -public: - OSDMap osdmap; - -private: - map > awaiting_map; - - void create_initial(); - bool get_map_bl(epoch_t epoch, bufferlist &bl); - bool get_inc_map_bl(epoch_t epoch, bufferlist &bl); - - void save_map(); - void save_inc_map(OSDMap::Incremental &inc); - - // [leader] - OSDMap::Incremental pending_inc; - map down_pending_out; // osd down -> out - - set pending_ack; - - // we are distributed - const static int STATE_INIT = 0; // startup - const static int STATE_SYNC = 1; // sync map copy (readonly) - const static int STATE_LOCK = 2; // [peon] map locked - const static int STATE_UPDATING = 3; // [leader] map locked, waiting for peon ack - - int state; - utime_t lease_expire; // when lease expires - - //void init(); - - // maps - void accept_pending(); // accept pending, new map. - void send_waiting(); // send current map to waiters. 
- void send_full(entity_inst_t dest); - void send_incremental(epoch_t since, entity_inst_t dest); - void bcast_latest_mds(); - void bcast_latest_osd(); - - void update_map(); - - void handle_osd_boot(class MOSDBoot *m); - void handle_osd_in(class MOSDIn *m); - void handle_osd_out(class MOSDOut *m); - void handle_osd_failure(class MOSDFailure *m); - void handle_osd_getmap(class MOSDGetMap *m); - - void handle_info(class MMonOSDMapInfo*); - void handle_lease(class MMonOSDMapLease*); - void handle_lease_ack(class MMonOSDMapLeaseAck*); - void handle_update_prepare(class MMonOSDMapUpdatePrepare*); - void handle_update_ack(class MMonOSDMapUpdateAck*); - void handle_update_commit(class MMonOSDMapUpdateCommit*); - - public: - OSDMonitor(Monitor *mn, Messenger *m, Mutex& l) : - mon(mn), messenger(m), lock(l), - state(STATE_SYNC) { - //init(); - } - - void dispatch(Message *m); - void tick(); // check state, take actions - - void election_starting(); // abort whatever. - void election_finished(); // reinitialize whatever. - - void issue_leases(); - - void mark_all_down(); - - void fake_osd_failure(int osd, bool down); - void fake_osdmap_update(); - void fake_reorg(); -}; - -#endif diff --git a/tags/20070517_before_mds_merge/mon/Paxos.cc b/tags/20070517_before_mds_merge/mon/Paxos.cc deleted file mode 100644 index 67c4e2e99e179..0000000000000 --- a/tags/20070517_before_mds_merge/mon/Paxos.cc +++ /dev/null @@ -1,182 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "Paxos.h" -#include "Monitor.h" -#include "MonitorStore.h" - -#include "messages/MMonPaxos.h" - -#include "config.h" -#undef dout -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".paxos(" << machine_name << ") " -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".paxos(" << machine_name << ") " - - -// --------------------------------- -// proposer -void Paxos::propose(version_t v, bufferlist& value) -{ -//todo high rf -} - -void Paxos::handle_last(MMonPaxos *m) -{ -//todo high rf - dout(10) << "handle_last " << *m << endl; - delete m; -} - -void Paxos::handle_accept(MMonPaxos *m) -{ -//todo high rf - dout(10) << "handle_accept " << *m << endl; - delete m; - -} - -void Paxos::handle_ack(MMonPaxos *m) -{ -//todo high rf - dout(10) << "handle_ack " << *m << endl; - delete m; -} - -void Paxos::handle_old_round(MMonPaxos *m) -{ -//todo high rf - dout(10) << "handle_old_round " << *m << endl; - delete m; -} - - -/* - * return a globally unique, monotonically increasing proposal number - */ -version_t Paxos::get_new_proposal_number(version_t gt) -{ - // read last - version_t last = mon->store->get_int("last_paxos_proposal"); - if (last < gt) - last = gt; - - // update - last /= 100; - last++; - - // make it unique among all monitors. 
- version_t pn = last*100 + (version_t)whoami; - - // write - mon->store->put_int(pn, "last_paxos_proposal"); - - dout(10) << "get_new_proposal_number = " << pn << endl; - return pn; -} - - -// --------------------------------- -// accepter -void Paxos::handle_collect(MMonPaxos *m) -{ -//todo high rf - // ... - - delete m; -} - - - - -// --------------------------------- -// learner -void Paxos::handle_success(MMonPaxos *m) -{ - //todo high rf - delete m; -} - -void Paxos::handle_begin(MMonPaxos *m) -{ - //todo high rf - delete m; -} - -// --------------------------------- - -void Paxos::leader_start() -{ - dout(10) << "i am the leader" << endl; - - // .. do something else too - version_t pn = get_new_proposal_number(); - for (int i=0; imonmap->num_mon; ++i) { - if (i == whoami) continue; - // todo high rf I pass the pn twice... what is the last parameter for? - mon->messenger->send_message(new MMonPaxos(MMonPaxos::OP_COLLECT, whoami, pn, pn), - mon->monmap->get_inst(i)); - } -} - - - -void Paxos::dispatch(Message *m) -{ - switch (m->get_type()) { - - case MSG_MON_PAXOS: - { - MMonPaxos *pm = (MMonPaxos*)m; - - // NOTE: these ops are defined in messages/MMonPaxos.h - switch (pm->op) { - // learner - case MMonPaxos::OP_COLLECT: - handle_collect(pm); - break; - - case MMonPaxos::OP_LAST: - handle_last(pm); - break; - - case MMonPaxos::OP_OLDROUND: - handle_old_round(pm); - break; - - case MMonPaxos::OP_BEGIN: - handle_begin(pm); - break; - - case MMonPaxos::OP_ACCEPT: - handle_accept(pm); - break; - - case MMonPaxos::OP_SUCCESS: - handle_success(pm); - break; - - case MMonPaxos::OP_ACK: - handle_ack(pm); - break; - - default: - assert(0); - } - } - break; - - default: - assert(0); - } -} - diff --git a/tags/20070517_before_mds_merge/mon/Paxos.h b/tags/20070517_before_mds_merge/mon/Paxos.h deleted file mode 100644 index 52a509d25aa76..0000000000000 --- a/tags/20070517_before_mds_merge/mon/Paxos.h +++ /dev/null @@ -1,73 +0,0 @@ -// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MON_PAXOS_H -#define __MON_PAXOS_H - -#include "include/types.h" -#include "include/buffer.h" -#include "msg/Message.h" - -#include "include/Context.h" - -#include "common/Timer.h" - -class Monitor; -class MMonPaxos; - -// i am one state machine. -class Paxos { - Monitor *mon; - int whoami; - - // my state machine info - int machine_id; - const char *machine_name; - map accepted_values; - map accepted_proposal_number; - - // proposer - void propose(version_t v, bufferlist& value); - - void handle_last(MMonPaxos*); - void handle_accept(MMonPaxos*); - void handle_ack(MMonPaxos*); - void handle_old_round(MMonPaxos*); - - version_t get_new_proposal_number(version_t gt=0); - - // accepter - void handle_collect(MMonPaxos*); - - // learner - void handle_success(MMonPaxos*); - void handle_begin(MMonPaxos*); - - -public: - Paxos(Monitor *m, int w, - int mid,const char *mnm) : mon(m), whoami(w), - machine_id(mid), machine_name(mnm) { - } - - void dispatch(Message *m); - - void leader_start(); - -}; - - - -#endif - diff --git a/tags/20070517_before_mds_merge/msg/Dispatcher.cc b/tags/20070517_before_mds_merge/msg/Dispatcher.cc deleted file mode 100644 index edee54a2c631f..0000000000000 --- a/tags/20070517_before_mds_merge/msg/Dispatcher.cc +++ /dev/null @@ -1,27 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - 
* Foundation. See file COPYING. - * - */ - - - -#include "Dispatcher.h" -#include "Messenger.h" - -#include "mds/MDS.h" - -/* -int Dispatcher::send_message(Message *m, msg_addr_t dest, int dest_port) -{ - assert(0); - //return dis_messenger->send_message(m, dest, dest_port, MDS_PORT_SERVER); // on my port! -} -*/ diff --git a/tags/20070517_before_mds_merge/msg/Dispatcher.h b/tags/20070517_before_mds_merge/msg/Dispatcher.h deleted file mode 100644 index 8b6fe92381427..0000000000000 --- a/tags/20070517_before_mds_merge/msg/Dispatcher.h +++ /dev/null @@ -1,33 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __DISPATCHER_H -#define __DISPATCHER_H - -#include "Message.h" - -class Messenger; - -class Dispatcher { - public: - virtual ~Dispatcher() { } - - // how i receive messages - virtual void dispatch(Message *m) = 0; - - // how i deal with transmission failures. - virtual void ms_handle_failure(Message *m, const entity_inst_t& inst) { delete m; } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/msg/FakeMessenger.cc b/tags/20070517_before_mds_merge/msg/FakeMessenger.cc deleted file mode 100644 index 2aa6c6b06b75b..0000000000000 --- a/tags/20070517_before_mds_merge/msg/FakeMessenger.cc +++ /dev/null @@ -1,338 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "Message.h" -#include "FakeMessenger.h" -#include "mds/MDS.h" - -#include "common/Timer.h" - -#include "common/LogType.h" -#include "common/Logger.h" - -#include "config.h" - -#undef dout -#define dout(x) if ((x) <= g_conf.debug_ms) cout << g_clock.now() << " " - - - -#include -#include -#include -#include -#include - -using namespace std; - -#include -using namespace __gnu_cxx; - - -#include "common/Cond.h" -#include "common/Mutex.h" -#include - - -// global queue. - -int nranks = 0; // this identify each entity_inst_t - -map directory; -hash_map loggers; -LogType fakemsg_logtype; - -set shutdown_set; - -Mutex lock; -Cond cond; - -bool awake = false; -bool fm_shutdown = false; -pthread_t thread_id; - - - - -void *fakemessenger_thread(void *ptr) -{ - lock.Lock(); - while (1) { - dout(20) << "thread waiting" << endl; - if (fm_shutdown) break; - awake = false; - cond.Wait(lock); - awake = true; - dout(20) << "thread woke up" << endl; - if (fm_shutdown) break; - - fakemessenger_do_loop_2(); - - if (directory.empty()) break; - } - lock.Unlock(); - - dout(1) << "thread finish (i woke up but no messages, bye)" << endl; - return 0; -} - - -void fakemessenger_startthread() { - pthread_create(&thread_id, NULL, fakemessenger_thread, 0); -} - -void fakemessenger_stopthread() { - cout << "fakemessenger_stopthread setting stop flag" << endl; - lock.Lock(); - fm_shutdown = true; - lock.Unlock(); - cond.Signal(); - - fakemessenger_wait(); -} - -void fakemessenger_wait() -{ - cout << "fakemessenger_wait waiting" << endl; - void *ptr; - pthread_join(thread_id, &ptr); -} - - - - -// lame main looper - -int fakemessenger_do_loop() -{ - lock.Lock(); - fakemessenger_do_loop_2(); - lock.Unlock(); - - g_timer.shutdown(); - return 0; -} - - -int fakemessenger_do_loop_2() -{ - //lock.Lock(); - dout(18) << "do_loop begin." 
<< endl; - - while (1) { - bool didone = false; - - dout(18) << "do_loop top" << endl; - - // messages - map::iterator it = directory.begin(); - while (it != directory.end()) { - FakeMessenger *mgr = it->second; - - dout(18) << "messenger " << mgr << " at " << mgr->get_myname() << " has " << mgr->num_incoming() << " queued" << endl; - - if (!mgr->is_ready()) { - dout(18) << "messenger " << mgr << " at " << mgr->get_myname() << " has no dispatcher, skipping" << endl; - it++; - continue; - } - - Message *m = mgr->get_message(); - it++; - - if (m) { - //dout(18) << "got " << m << endl; - dout(1) << "---- " << m->get_dest() - << " <- " << m->get_source() - << " ---- " << *m - << endl; - - if (g_conf.fakemessenger_serialize) { - // encode - if (m->empty_payload()) - m->encode_payload(); - msg_envelope_t env = m->get_envelope(); - bufferlist bl; - bl.claim( m->get_payload() ); - //bl.c_str(); // condense into 1 buffer - - delete m; - - // decode - m = decode_message(env, bl); - assert(m); - } - - didone = true; - - lock.Unlock(); - mgr->dispatch(m); - lock.Lock(); - } - } - - // deal with shutdowns.. dleayed to avoid concurrent directory modification - if (!shutdown_set.empty()) { - for (set::iterator it = shutdown_set.begin(); - it != shutdown_set.end(); - it++) { - dout(7) << "fakemessenger: removing " << *it << " from directory" << endl; - assert(directory.count(*it)); - directory.erase(*it); - if (directory.empty()) { - dout(1) << "fakemessenger: last shutdown" << endl; - ::fm_shutdown = true; - } - } - shutdown_set.clear(); - } - - if (!didone) - break; - } - - - dout(18) << "do_loop end (no more messages)." 
<< endl; - //lock.Unlock(); - return 0; -} - - -FakeMessenger::FakeMessenger(entity_name_t me) : Messenger(me) -{ - lock.Lock(); - { - // assign rank - _myinst.name = me; - _myinst.addr.port = nranks++; - //if (!me.is_mon()) - //_myinst.addr.nonce = getpid(); - - // add to directory - directory[ _myinst.addr ] = this; - } - lock.Unlock(); - - - cout << "fakemessenger " << get_myname() << " messenger is " << this << " at " << _myinst << endl; - - qlen = 0; - - /* - string name; - name = "m."; - name += MSG_ADDR_TYPE(myaddr); - int w = MSG_ADDR_NUM(myaddr); - if (w >= 1000) name += ('0' + ((w/1000)%10)); - if (w >= 100) name += ('0' + ((w/100)%10)); - if (w >= 10) name += ('0' + ((w/10)%10)); - name += ('0' + ((w/1)%10)); - - loggers[ myaddr ] = new Logger(name, (LogType*)&fakemsg_logtype); - */ -} - -FakeMessenger::~FakeMessenger() -{ - // hose any undelivered messages - for (list::iterator p = incoming.begin(); - p != incoming.end(); - ++p) - delete *p; -} - - -int FakeMessenger::shutdown() -{ - //cout << "shutdown on messenger " << this << " has " << num_incoming() << " queued" << endl; - lock.Lock(); - assert(directory.count(_myinst.addr) == 1); - shutdown_set.insert(_myinst.addr); - - /* - if (loggers[myaddr]) { - delete loggers[myaddr]; - loggers.erase(myaddr); - } - */ - - lock.Unlock(); - return 0; -} - - -void FakeMessenger::reset_myname(entity_name_t m) -{ - dout(1) << "reset_myname from " << get_myname() << " to " << m << endl; - _set_myname(m); - - directory.erase(_myinst.addr); - _myinst.name = m; - directory[_myinst.addr] = this; - -} - - -int FakeMessenger::send_message(Message *m, entity_inst_t inst, int port, int fromport) -{ - entity_name_t dest = inst.name; - - m->set_source(get_myname(), fromport); - m->set_source_addr(get_myaddr()); - - m->set_dest(inst.name, port); - - lock.Lock(); - -#ifdef LOG_MESSAGES - // stats - loggers[get_myaddr()]->inc("+send",1); - loggers[dest]->inc("-recv",1); - - char s[20]; - sprintf(s,"+%s", m->get_type_name()); - 
loggers[get_myaddr()]->inc(s); - sprintf(s,"-%s", m->get_type_name()); - loggers[dest]->inc(s); -#endif - - // queue - if (directory.count(inst.addr)) { - dout(1) << "--> " << get_myname() << " -> " << inst.name << " " << *m << endl; - directory[inst.addr]->queue_incoming(m); - } else { - dout(0) << "--> " << get_myname() << " -> " << inst.name << " " << *m - << " *** destination DNE ***" << endl; - for (map::iterator p = directory.begin(); - p != directory.end(); - ++p) { - dout(0) << "** have " << p->first << " to " << p->second << endl; - } - //assert(dm); - delete m; - } - - // wake up loop? - if (!awake) { - dout(10) << "waking up fakemessenger thread" << endl; - cond.Signal(); - lock.Unlock(); - } else - lock.Unlock(); - - return 0; -} - - diff --git a/tags/20070517_before_mds_merge/msg/FakeMessenger.h b/tags/20070517_before_mds_merge/msg/FakeMessenger.h deleted file mode 100644 index 13cd6f95326d1..0000000000000 --- a/tags/20070517_before_mds_merge/msg/FakeMessenger.h +++ /dev/null @@ -1,88 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __FAKEMESSENGER_H -#define __FAKEMESSENGER_H - -#include "Messenger.h" -#include "Dispatcher.h" - -#include -#include - -class Timer; - -class FakeMessenger : public Messenger { - protected: - class Logger *logger; - - int qlen; - list incoming; // incoming queue - - entity_inst_t _myinst; - - public: - FakeMessenger(entity_name_t me); - ~FakeMessenger(); - - virtual int shutdown(); - - const entity_inst_t& get_myinst() { - return _myinst; - }; - const entity_addr_t& get_myaddr() { - return _myinst.addr; - } - - void reset_myname(entity_name_t m); - - // msg interface - virtual int send_message(Message *m, entity_inst_t dest, int port=0, int fromport=0); - - // events - //virtual void trigger_timer(Timer *t); - - int get_dispatch_queue_len() { return qlen; } - - // -- incoming queue -- - // (that nothing uses) - Message *get_message() { - if (!incoming.empty()) { - Message *m = incoming.front(); - incoming.pop_front(); - qlen--; - return m; - } - return NULL; - } - bool queue_incoming(Message *m) { - incoming.push_back(m); - qlen++; - return true; - } - int num_incoming() { - //return incoming.size(); - return qlen; - } - -}; - -int fakemessenger_do_loop(); -int fakemessenger_do_loop_2(); -void fakemessenger_startthread(); -void fakemessenger_stopthread(); -void fakemessenger_wait(); - -#endif diff --git a/tags/20070517_before_mds_merge/msg/HostMonitor.cc b/tags/20070517_before_mds_merge/msg/HostMonitor.cc deleted file mode 100644 index 44ab35a9fcc10..0000000000000 --- a/tags/20070517_before_mds_merge/msg/HostMonitor.cc +++ /dev/null @@ -1,235 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "HostMonitor.h" - -#include "msg/Message.h" -#include "msg/Messenger.h" - -#include "messages/MPing.h" -#include "messages/MPingAck.h" -#include "messages/MFailure.h" -#include "messages/MFailureAck.h" - -#include "common/Timer.h" -#include "common/Clock.h" - -#define DBL 10 - - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug) cout << whoami << " hostmon: " - - -// timer contexts - -class C_HM_InitiateHeartbeat : public Context { - HostMonitor *hm; -public: - C_HM_InitiateHeartbeat(HostMonitor *hm) { - this->hm = hm; - } - void finish(int r) { - //cout << "HEARTBEAT" << endl; - hm->pending_events.erase(this); - hm->initiate_heartbeat(); - } -}; - -class C_HM_CheckHeartbeat : public Context { - HostMonitor *hm; -public: - C_HM_CheckHeartbeat(HostMonitor *hm) { - this->hm = hm; - } - void finish(int r) { - //cout << "CHECK" << endl; - hm->pending_events.erase(this); - hm->check_heartbeat(); - } -}; - - - -// startup/shutdown - -void HostMonitor::init() -{ - dout(DBL) << "init" << endl; - - // hack params for now - heartbeat_interval = 10; - max_ping_time = 2; - max_heartbeat_misses = 3; - notify_retry_interval = 10; - - // schedule first hb - schedule_heartbeat(); -} - - -void HostMonitor::shutdown() -{ - // cancel any events - for (set::iterator it = pending_events.begin(); - it != pending_events.end(); - it++) { - g_timer.cancel_event(*it); - delete *it; - } - pending_events.clear(); -} - - -// schedule next heartbeat - -void HostMonitor::schedule_heartbeat() -{ - dout(DBL) << "schedule_heartbeat" << endl; - Context *e = new C_HM_InitiateHeartbeat(this); - pending_events.insert(e); - g_timer.add_event_after(heartbeat_interval, e); -} - - -// take note of a live host - -void HostMonitor::host_is_alive(entity_name_t host) -{ - if (hosts.count(host)) - status[host].last_heard_from = g_clock.gettime(); -} - - -// do heartbeat - -void HostMonitor::initiate_heartbeat() -{ - time_t now = g_clock.gettime(); - - // send out 
pings - inflight_pings.clear(); - for (set::iterator it = hosts.begin(); - it != hosts.end(); - it++) { - // have i heard from them recently? - if (now - status[*it].last_heard_from < heartbeat_interval) { - dout(DBL) << "skipping " << *it << ", i heard from them recently" << endl; - } else { - dout(DBL) << "pinging " << *it << endl; - status[*it].last_pinged = now; - inflight_pings.insert(*it); - - messenger->send_message(new MPing(1), *it, 0); - } - } - - // set timer to check results - Context *e = new C_HM_CheckHeartbeat(this); - pending_events.insert(e); - g_timer.add_event_after(max_ping_time, e); - dout(10) << "scheduled check " << e << endl; - - schedule_heartbeat(); // schedule next heartbeat -} - - -// check results - -void HostMonitor::check_heartbeat() -{ - dout(DBL) << "check_heartbeat()" << endl; - - // check inflight pings - for (set::iterator it = inflight_pings.begin(); - it != inflight_pings.end(); - it++) { - status[*it].num_heartbeats_missed++; - - dout(DBL) << "no response from " << *it << " for " << status[*it].num_heartbeats_missed << " beats" << endl; - - if (status[*it].num_heartbeats_missed >= max_heartbeat_misses) { - if (acked_failures.count(*it)) { - dout(DBL) << *it << " is already failed" << endl; - } else { - if (unacked_failures.count(*it)) { - dout(DBL) << *it << " is already failed, but unacked, sending another failure message" << endl; - } else { - dout(DBL) << "failing " << *it << endl; - unacked_failures.insert(*it); - } - - /*if (false) // do this in NewMessenger for now! FIXME - for (set::iterator nit = notify.begin(); - nit != notify.end(); - nit++) { - messenger->send_message(new MFailure(*it, messenger->get_inst(*it)), - *nit, notify_port, 0); - } - */ - } - } - } - - // forget about the pings. 
- inflight_pings.clear(); -} - - -// incoming messages - -void HostMonitor::proc_message(Message *m) -{ - switch (m->get_type()) { - - case MSG_PING_ACK: - handle_ping_ack((MPingAck*)m); - break; - - case MSG_FAILURE_ACK: - handle_failure_ack((MFailureAck*)m); - break; - - } -} - -void HostMonitor::handle_ping_ack(MPingAck *m) -{ - entity_name_t from = m->get_source(); - - dout(DBL) << "ping ack from " << from << endl; - status[from].last_pinged = g_clock.gettime(); - status[from].num_heartbeats_missed = 0; - inflight_pings.erase(from); - - delete m; -} - -void HostMonitor::handle_failure_ack(MFailureAck *m) -{ - - // FIXME: this doesn't handle failed -> alive transitions gracefully at all.. - - // the higher-up's acknowledged our failure notification, we can stop resending it. - entity_name_t failed = m->get_failed(); - dout(DBL) << "handle_failure_ack " << failed << endl; - unacked_failures.erase(failed); - acked_failures.insert(failed); - - delete m; -} - - diff --git a/tags/20070517_before_mds_merge/msg/HostMonitor.h b/tags/20070517_before_mds_merge/msg/HostMonitor.h deleted file mode 100644 index fffe798b71450..0000000000000 --- a/tags/20070517_before_mds_merge/msg/HostMonitor.h +++ /dev/null @@ -1,97 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __HOSTMONITOR_H -#define __HOSTMONITOR_H - -#include - -#include -#include -using namespace std; - -#include "include/Context.h" -#include "msg/Message.h" - -class Message; -class Messenger; - -typedef struct { - time_t last_heard_from; - time_t last_pinged; - int num_heartbeats_missed; -} monitor_rec_t; - -class HostMonitor { - Messenger *messenger; - string whoami; - - // hosts i monitor - set hosts; - - // who i tell when they fail - set notify; - int notify_port; - - // their status - map status; - - set inflight_pings; // pings we sent that haven't replied yet - - set unacked_failures; // failed hosts that haven't been acked yet. - set acked_failures; // these failures have been acked. - - float heartbeat_interval; // how often to do a heartbeat - float max_ping_time; // how long before it's a miss - int max_heartbeat_misses; // how many misses before i tell - float notify_retry_interval; // how often to retry failure notification - - public: - set pending_events; - - private: - void schedule_heartbeat(); - - public: - HostMonitor(Messenger *m, string& whoami) { - this->messenger = m; - this->whoami = whoami; - notify_port = 0; - } - set& get_hosts() { return hosts; } - set& get_notify() { return notify; } - void set_notify_port(int p) { notify_port = p; } - - void remove_host(entity_name_t h) { - hosts.erase(h); - status.erase(h); - unacked_failures.erase(h); - acked_failures.erase(h); - } - - void init(); - void shutdown(); - - void host_is_alive(entity_name_t who); - - void proc_message(Message *m); - void handle_ping_ack(class MPingAck *m); - void handle_failure_ack(class MFailureAck *m); - - void initiate_heartbeat(); - void check_heartbeat(); - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/msg/MPIMessenger.cc b/tags/20070517_before_mds_merge/msg/MPIMessenger.cc deleted file mode 100644 index 6c4e65d063fc9..0000000000000 --- a/tags/20070517_before_mds_merge/msg/MPIMessenger.cc +++ /dev/null @@ -1,608 +0,0 @@ -// -*- 
mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "config.h" -#include "include/error.h" - -#include "common/Timer.h" -#include "common/Mutex.h" - -#include "MPIMessenger.h" -#include "Message.h" - -#include -#include -using namespace std; -#include -using namespace __gnu_cxx; - -#include -#include - -/* - * We make a directory, so that we can have multiple Messengers in the - * same process (rank). This is useful for benchmarking and creating lots of - * simulated clients, e.g. - */ - -hash_map directory; -list outgoing, incoming; -list unfinished_sends; -map unfinished_send_message; - -/* this process */ -int mpi_world; -int mpi_rank; -bool mpi_done = false; // set this flag to stop the event loop - - -#define FUNNEL_MPI // if we want to funnel mpi through a single thread -#define TAG_UNSOLICITED 0 -#define DBLVL 18 - -// the key used to fetch the tag for the current thread. -pthread_key_t tag_key; -pthread_t thread_id = 0; // thread id of the event loop. init value == nobody - -Mutex sender_lock; -Mutex out_queue_lock; - -bool pending_timer; - - -// our lock for any common data; it's okay to have only the one global mutex -// because our common data isn't a whole lot. -//static pthread_mutex_t mutex; - -// the number of distinct threads we've seen so far; used to generate -// a unique tag for each thread. -//static int nthreads = 10; - -//#define TAG_UNSOLICITED 0 - -// debug -#undef dout -#define dout(l) if (l<=g_conf.debug) cout << "[MPI " << mpi_rank << "/" << mpi_world << " " << getpid() << "." << pthread_self() << "] " - - - -/***** - * MPI global methods for process-wide startup, shutdown. 
- */ - -int mpimessenger_init(int& argc, char**& argv) -{ - MPI_Init(&argc, &argv); - - MPI_Comm_size(MPI_COMM_WORLD, &mpi_world); - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - - char hostname[100]; - gethostname(hostname,100); - int pid = getpid(); - - dout(12) << "init: i am " << hostname << " pid " << pid << endl; - - assert(mpi_world > g_conf.num_osd+g_conf.num_mds); - - return mpi_rank; -} - -int mpimessenger_shutdown() -{ - dout(5) << "mpimessenger_shutdown barrier waiting for all to finish" << endl; - MPI_Barrier (MPI_COMM_WORLD); - dout(1) << "mpimessenger_shutdown all done, MPI_Finalize()" << endl; - MPI_Finalize(); - return 0; -} - -int mpimessenger_world() -{ - return mpi_world; -} - - - -/*** - * internal send/recv - */ - - -/* - * get fresh MPI_Request* (on heap) for a new async MPI_Isend - */ - -MPI_Request *mpi_prep_send_req() { - MPI_Request *req = new MPI_Request; - unfinished_sends.push_back(req); - dout(DBLVL) << "prep_send_req " << req << endl; - return req; -} - - -/* - * clean up MPI_Request*'s for Isends that have completed. - * also, hose any associated Message*'s for Messages that are completely sent. - * - * if wait=true, block and wait for sends to finish. 
- */ - -void mpi_reap_sends(bool wait=false) { - sender_lock.Lock(); - - list::iterator it = unfinished_sends.begin(); - while (it != unfinished_sends.end()) { - MPI_Status status; - int flag; - - if (wait) { - MPI_Wait(*it, &status); - } else { - MPI_Test(*it, &flag, &status); - if (!flag) break; // not finished yet - } - - dout(DBLVL) << "send " << *it << " completed" << endl; - - if (unfinished_send_message.count(*it)) { - dout(DBLVL) << "send message " << unfinished_send_message[*it] << " completed" << endl; - delete unfinished_send_message[*it]; - unfinished_send_message.erase(*it); - } - - delete *it; - it++; - unfinished_sends.pop_front(); - } - - dout(DBLVL) << "reap has " << unfinished_sends.size() << " Isends outstanding, " << unfinished_send_message.size() << " messages" << endl; - - sender_lock.Unlock(); -} - - -void mpi_finish_sends() { - mpi_reap_sends(true); -} - - -/* - * recv a Message* - */ -Message *mpi_recv(int tag) -{ - // envelope - dout(DBLVL) << "mpi_recv waiting for message tag " << tag << endl; - - MPI_Status status; - msg_envelope_t env; - - ASSERT(MPI_Recv((void*)&env, - sizeof(env), - MPI_CHAR, - MPI_ANY_SOURCE,// status.MPI_SOURCE,//MPI_ANY_SOURCE, - tag, - MPI_COMM_WORLD, - &status/*, - &recv_env_req*/) == MPI_SUCCESS); - assert(status.count == MSG_ENVELOPE_LEN); - - if (env.type == 0) { - dout(DBLVL) << "mpi_recv got type 0 message, kicked!" << endl; - return 0; - } - - dout(DBLVL) << "mpi_recv got envelope " << status.count << ", type=" << env.type << " src " << env.source << " dst " << env.dest << " nchunks=" << env.nchunks << " from " << status.MPI_SOURCE << endl; - - // payload - bufferlist blist; - for (int i=0; iget_dest(), mpi_world); - - // local? 
- if (rank == mpi_rank) { - dout(DBLVL) << "queuing local delivery" << endl; - incoming.push_back(m); - return 0; - } - - // marshall - if (m->empty_payload()) - m->encode_payload(); - msg_envelope_t *env = &m->get_envelope(); - env->nchunks = m->get_payload().buffers().size(); - - dout(7) << "sending " << *m << " to " << MSG_ADDR_NICE(env->dest) << " (rank " << rank << ")" << endl; - -#ifndef FUNNEL_MPI - sender_lock.Lock(); -#endif - - // send envelope - ASSERT(MPI_Isend((void*)env, - sizeof(*env), - MPI_CHAR, - rank, - tag, - MPI_COMM_WORLD, - mpi_prep_send_req()) == MPI_SUCCESS); - - // payload - int i = 0; - for (list::iterator it = m->get_payload().buffers().begin(); - it != m->get_payload().buffers().end(); - it++) { - dout(DBLVL) << "mpi_sending frag " << i << " len " << (*it).length() << endl; - //MPI_Request *req = new MPI_Request; - ASSERT(MPI_Isend((void*)(*it).c_str(), - (*it).length(), - MPI_CHAR, - rank, - tag, - MPI_COMM_WORLD, - mpi_prep_send_req()) == MPI_SUCCESS); - i++; - } - - // attach message to last send, so we can free it later - MPI_Request *req = unfinished_sends.back(); - unfinished_send_message[req] = m; - - dout(DBLVL) << "mpi_send done, attached message to Isend " << req << endl; - -#ifndef FUNNEL_MPI - sender_lock.Unlock(); -#endif - return 0; -} - - - -// get the tag for this thread - -#ifndef FUNNEL_MPI -static int get_thread_tag() -{ - int tag = (int)pthread_getspecific(tag_key); - - if (tag == 0) { - // first time this thread has performed MPI messaging - - if (pthread_mutex_lock(&mutex) < 0) - SYSERROR(); - - tag = ++nthreads; - - if (pthread_mutex_unlock(&mutex) < 0) - SYSERROR(); - - if (pthread_setspecific(tag_key, (void*)tag) < 0) - SYSERROR(); - } - - return tag; -} -#endif - - - -// recv event loop, for unsolicited messages. 
- -void* mpimessenger_loop(void*) -{ - dout(5) << "mpimessenger_loop start pid " << getpid() << endl; - - while (1) { - - // outgoing - mpi_reap_sends(); - -#ifdef FUNNEL_MPI - // check outgoing queue - out_queue_lock.Lock(); - if (outgoing.size()) { - dout(10) << outgoing.size() << " outgoing messages" << endl; - for (list::iterator it = outgoing.begin(); - it != outgoing.end(); - it++) { - mpi_send(*it, TAG_UNSOLICITED); - } - } - outgoing.clear(); - out_queue_lock.Unlock(); -#endif - - - // timer events? - if (pending_timer) { - dout(DBLVL) << "pending timer" << endl; - g_timer.execute_pending(); - } - - // done? - if (mpi_done && - incoming.empty() && - outgoing.empty() && - !pending_timer) break; - - - // incoming - Message *m = 0; - - if (incoming.size()) { - dout(12) << "loop pulling message off incoming" << endl; - m = incoming.front(); - incoming.pop_front(); - } - else { - // check mpi - dout(12) << "loop waiting for incoming messages" << endl; - - // get message - m = mpi_recv(TAG_UNSOLICITED); - } - - // dispatch? - if (m) { - int dest = m->get_dest(); - if (directory.count(dest)) { - Messenger *who = directory[ dest ]; - - dout(4) << "---- '" << m->get_type_name() << - "' from " << MSG_ADDR_NICE(m->get_source()) << ':' << m->get_source_port() << - " to " << MSG_ADDR_NICE(m->get_dest()) << ':' << m->get_dest_port() << " ---- " - << m - << endl; - - who->dispatch(m); - } else { - dout (1) << "---- i don't know who " << dest << " is." 
<< endl; - assert(0); - break; - } - } - - } - - dout(5) << "finishing async sends" << endl; - mpi_finish_sends(); - - g_timer.shutdown(); - - dout(5) << "mpimessenger_loop exiting loop" << endl; - return 0; -} - - -// start/stop mpi receiver thread (for unsolicited messages) -int mpimessenger_start() -{ - dout(5) << "starting thread" << endl; - - // start a thread - pthread_create(&thread_id, - NULL, - mpimessenger_loop, - 0); - return 0; -} - - -/* - * kick and wake up _loop (to pick up new outgoing message, or quit) - */ - -MPI_Request kick_req; -msg_envelope_t kick_env; - -void mpimessenger_kick_loop() -{ - // if we're same thread as the loop, no kicking necessary - if (pthread_self() == thread_id) return; - - kick_env.type = 0; - - sender_lock.Lock(); - ASSERT(MPI_Isend(&kick_env, // kick sync for now, but ONLY because it makes me feel safer. - sizeof(kick_env), - MPI_CHAR, - mpi_rank, - TAG_UNSOLICITED, - MPI_COMM_WORLD, - mpi_prep_send_req()) == MPI_SUCCESS); - sender_lock.Unlock(); -} - - -// stop thread - -void mpimessenger_stop() -{ - dout(5) << "mpimessenger_stop stopping thread" << endl; - - if (mpi_done) { - dout(1) << "mpimessenger_stop called, but already done!" << endl; - assert(!mpi_done); - } - - // set finish flag - mpi_done = true; - mpimessenger_kick_loop(); - - // wait for thread to stop - mpimessenger_wait(); -} - - -// wait for thread to finish - -void mpimessenger_wait() -{ - void *returnval; - dout(10) << "mpimessenger_wait waiting for thread to finished." << endl; - pthread_join(thread_id, &returnval); - dout(10) << "mpimessenger_wait thread finished." 
<< endl; -} - - - - -/*********** - * MPIMessenger class implementation - */ - -class C_MPIKicker : public Context { - void finish(int r) { - dout(DBLVL) << "timer kick" << endl; - mpimessenger_kick_loop(); - } -}; - -MPIMessenger::MPIMessenger(entity_name_t myaddr) : Messenger(myaddr) -{ - // my address - this->myaddr = myaddr; - - // register myself in the messenger directory - directory[myaddr] = this; - - // register to execute timer events - g_timer.set_messenger_kicker(new C_MPIKicker()); - - // logger - /* - string name; - name = "m."; - name += MSG_ADDR_TYPE(whoami); - int w = MSG_ADDR_NUM(whoami); - if (w >= 1000) name += ('0' + ((w/1000)%10)); - if (w >= 100) name += ('0' + ((w/100)%10)); - if (w >= 10) name += ('0' + ((w/10)%10)); - name += ('0' + ((w/1)%10)); - - logger = new Logger(name, (LogType*)&mpimsg_logtype); - loggers[ whoami ] = logger; - */ -} - -MPIMessenger::~MPIMessenger() -{ - //delete logger; -} - - -int MPIMessenger::shutdown() -{ - // remove me from the directory - directory.erase(myaddr); - - // no more timer events - g_timer.unset_messenger_kicker(); - - // last one? - if (directory.empty()) { - dout(10) << "shutdown last mpimessenger on rank " << mpi_rank << " shut down" << endl; - pthread_t whoami = pthread_self(); - - dout(15) << "whoami = " << whoami << ", thread = " << thread_id << endl; - if (whoami == thread_id) { - // i am the event loop thread, just set flag! - dout(15) << " set mpi_done=true" << endl; - mpi_done = true; - } else { - // i am a different thread, tell the event loop to stop. 
- dout(15) << " calling mpimessenger_stop()" << endl; - mpimessenger_stop(); - } - } else { - dout(10) << "shutdown still " << directory.size() << " other messengers on rank " << mpi_rank << endl; - } - return 0; -} - - - - -/*** - * public messaging interface - */ - - -/* note: send_message _MUST_ be non-blocking */ -int MPIMessenger::send_message(Message *m, entity_name_t dest, int port, int fromport) -{ - // set envelope - m->set_source(myaddr, fromport); - m->set_dest(dest, port); - -#ifdef FUNNEL_MPI - - // queue up - out_queue_lock.Lock(); - dout(DBLVL) << "queuing outgoing message " << *m << endl; - outgoing.push_back(m); - out_queue_lock.Unlock(); - - mpimessenger_kick_loop(); - -#else - - // send in this thread - mpi_send(m, m->get_pcid()); - -#endif - return 0; -} - - - - - - diff --git a/tags/20070517_before_mds_merge/msg/MPIMessenger.h b/tags/20070517_before_mds_merge/msg/MPIMessenger.h deleted file mode 100644 index 88e753de89749..0000000000000 --- a/tags/20070517_before_mds_merge/msg/MPIMessenger.h +++ /dev/null @@ -1,56 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MPIMESSENGER_H -#define __MPIMESSENGER_H - -#include "Messenger.h" -#include "Dispatcher.h" - -#define NUMMDS g_conf.num_mds -#define NUMOSD g_conf.num_osd -#define MPI_DEST_TO_RANK(dest,world) ((dest)<(NUMMDS+NUMOSD) ? 
\ - (dest) : \ - ((NUMMDS+NUMOSD)+(((dest)-NUMMDS-NUMOSD) % ((world)-NUMMDS-NUMOSD)))) - -class Timer; - -class MPIMessenger : public Messenger { - protected: - entity_name_t myaddr; // my address - //class Logger *logger; // for logging - - public: - MPIMessenger(entity_name_t myaddr); - ~MPIMessenger(); - - // init, shutdown MPI and associated event loop thread. - virtual int shutdown(); - - // message interface - virtual int send_message(Message *m, entity_name_t dest, int port=0, int fromport=0); -}; - -/** - * these are all ONE per process. - */ -extern int mpimessenger_world(); // get world size -extern int mpimessenger_init(int& argc, char**& argv); // init mpi -extern int mpimessenger_start(); // start thread -extern void mpimessenger_stop(); // stop thread. -extern void mpimessenger_wait(); // wait for thread to finish. -extern int mpimessenger_shutdown(); // finalize MPI - - -#endif diff --git a/tags/20070517_before_mds_merge/msg/MTMessenger.cc b/tags/20070517_before_mds_merge/msg/MTMessenger.cc deleted file mode 100644 index 02ab9981ff353..0000000000000 --- a/tags/20070517_before_mds_merge/msg/MTMessenger.cc +++ /dev/null @@ -1,197 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include -#include "mpi.h" - -#include "include/config.h" -#include "include/error.h" -#include "Messenger.h" -#include "MTMessenger.h" - -// This module uses MPI to implement a blocking sendrecv function that -// feels more like a procedure call and less like event processesing. -// -// Threads are not independently addressable in MPI, only processes -// are. 
However, MPI does include a user defined tag in the message -// envelope, and a reader may selectively read only messages with a -// matching tag. The modules assign an integer to each thread to use -// as the tag. -// - -// our lock for any common data; it's okay to have only the one global mutex -// because our common data isn't a whole lot. -static pthread_mutex_t mutex; - -// the key used to fetch the tag for the current thread. -pthread_key_t tag_key; - -// the number of distinct threads we've seen so far; used to generate -// a unique tag for each thread. -static int nthreads; - -// the MPI identity of this process -static int mpi_rank; - - -// get the tag for this thread -static int get_tag() -{ - int tag = (int)pthread_getspecific(tag_key); - - if (tag == 0) { - // first time this thread has performed MPI messaging - - if (pthread_mutex_lock(&mutex) < 0) - SYSERROR(); - - tag = ++nthreads; - - if (pthread_mutex_unlock(&mutex) < 0) - SYSERROR(); - - if (pthread_setspecific(tag_key, (void*)tag) < 0) - SYSERROR(); - } - - return tag; -} - - -// marshall a message and send it over MPI -static void send(Message *m, int rank, int tag) -{ - // marshall the message - crope r; - m->encode(r); - int size = r.length(); - - char *buf = (char*)r.c_str(); - ASSERT(MPI_Send(buf, - size, - MPI_CHAR, - rank, - tag, - MPI_COMM_WORLD) == MPI_SUCCESS); -} - -// read a message from MPI and unmarshall it -static Message *receive(int tag) -{ - MPI_Status status; - - // get message size - ASSERT(MPI_Probe(MPI_ANY_SOURCE, - tag, - MPI_COMM_WORLD, - &status) == MPI_SUCCESS); - - // get message; there may be multiple messages on the queue, we - // need to be sure to read the one which corresponds to size - // obtained above. 
- char *buf = new char[status.count]; - ASSERT(MPI_Recv(buf, - status.count, - MPI_CHAR, - status.MPI_SOURCE, - status.MPI_TAG, - MPI_COMM_WORLD, - &status) == MPI_SUCCESS); - - // unmarshall message - crope r(buf, status.count); - delete[] buf; - Message *m = decode_message(r); - - return m; -} - -MTMessenger::MTMessenger(int& argc, char**& argv) -{ - // setup MPI; MPI errors will probably invoke the default MPI error - // handler, which aborts the program with a friendly message rather - // than returning from a function; just in case, we abort the - // program if we get an MPI error. - - int provided; - ASSERT(MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided) - == MPI_SUCCESS); - - ASSERT(MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank) == MPI_SUCCESS); - - if (pthread_mutex_init(&mutex, NULL) < 0) - SYSERROR(); - - if (pthread_key_create(&tag_key, NULL) < 0) - SYSERROR(); - - nthreads = 0; -} - -MTMessenger::~MTMessenger() -{ - // ignore shutdown errors - - pthread_key_delete(tag_key); - - pthread_mutex_destroy(&mutex); - - MPI_Finalize(); -} - -// send a request and wait for the response -Message *MTMessenger::sendrecv(Message *m, entity_name_t dest) -{ - int dest_tag = 0; // servers listen for any tag - int my_tag = get_tag(); - - // set our envelope (not to be confused with the MPI envelope) - m->set_source(mpi_rank, my_tag); - m->set_dest(dest, dest_tag); - - send(m, dest, dest_tag); - - return receive(my_tag); -} - -// receive a request from anyone -Message *MTMessenger::recvreq() -{ - return receive(MPI_ANY_TAG); -} - -// forward request, masquerading as original source -void MTMessenger::fwdreq(Message *req, int dest) -{ - int dest_tag = 0; // servers listen for any tag - - // set our envelope (not to be confused with the MPI envelope) - req->set_dest(dest, dest_tag); - - send(req, dest, dest_tag); -} - -// send a response to the originator of the request -void MTMessenger::sendresp(Message *req, Message *resp) -{ - int req_rank = req->get_source(); 
- int req_tag = req->get_source_port(); - int my_tag = get_tag(); - - // set our envelope (not to be confused with the MPI envelope) - resp->set_source(mpi_rank, my_tag); - resp->set_dest(req_rank, req_tag); - - send(resp, req_rank, req_tag); -} diff --git a/tags/20070517_before_mds_merge/msg/MTMessenger.h b/tags/20070517_before_mds_merge/msg/MTMessenger.h deleted file mode 100644 index 477a39c60561d..0000000000000 --- a/tags/20070517_before_mds_merge/msg/MTMessenger.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MTMESSENGER_H -#define __MTMESSENGER_H - -#include "Message.h" -#include "SerialMessenger.h" - -// Marshall and unmarshall OBFS messages, send and receive them over -// MPI. - -class MTMessenger -{ -public: - // sets up the queues and internal thread; the MPI initialization - // will scan argc/argv for MPI specific flags and remove them from - // argc/argv. 
- MTMessenger(int &argc, char **&argv); - - // tears it all down - ~MTMessenger(); - - // send a request to a server and wait (block) for the response; - virtual Message *sendrecv(Message *m, entity_name_t dest); - - // wait (block) for a request from anyone - Message *recvreq(); - - // forward request, masquerading as original source - void fwdreq(Message *req, int dest); - - // send the response to the originator of the request - virtual void sendresp(Message *req, Message *resp); - - -}; // class MTMessenger - -#endif // __MTMESSENGER_H diff --git a/tags/20070517_before_mds_merge/msg/Message.cc b/tags/20070517_before_mds_merge/msg/Message.cc deleted file mode 100644 index ae01d9106ddaf..0000000000000 --- a/tags/20070517_before_mds_merge/msg/Message.cc +++ /dev/null @@ -1,466 +0,0 @@ - -#include -#include -using namespace std; - -#include "include/types.h" - -#include "Message.h" - -#include "messages/MGenericMessage.h" - -/* -#include "messages/MNSConnect.h" -#include "messages/MNSConnectAck.h" -#include "messages/MNSRegister.h" -#include "messages/MNSRegisterAck.h" -#include "messages/MNSLookup.h" -#include "messages/MNSLookupReply.h" -#include "messages/MNSFailure.h" -*/ - -#include "messages/MMonPaxos.h" - -#include "messages/MMonElectionAck.h" -#include "messages/MMonElectionPropose.h" -#include "messages/MMonElectionVictory.h" - -#include "messages/MPing.h" -#include "messages/MPingAck.h" -//#include "messages/MFailure.h" -//#include "messages/MFailureAck.h" - -#include "messages/MOSDBoot.h" -#include "messages/MOSDIn.h" -#include "messages/MOSDOut.h" -#include "messages/MOSDFailure.h" -#include "messages/MOSDPing.h" -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" -#include "messages/MOSDPGNotify.h" -#include "messages/MOSDPGQuery.h" -#include "messages/MOSDPGLog.h" -#include "messages/MOSDPGRemove.h" - -#include "messages/MClientBoot.h" -#include "messages/MClientMount.h" 
-#include "messages/MClientMountAck.h" -#include "messages/MClientRequest.h" -#include "messages/MClientReply.h" -#include "messages/MClientFileCaps.h" - -#include "messages/MMDSGetMap.h" -#include "messages/MMDSMap.h" -#include "messages/MMDSBeacon.h" -#include "messages/MMDSImportMap.h" -#include "messages/MMDSCacheRejoin.h" -#include "messages/MMDSCacheRejoinAck.h" - -#include "messages/MDirUpdate.h" -#include "messages/MDiscover.h" -#include "messages/MDiscoverReply.h" - -#include "messages/MExportDirDiscover.h" -#include "messages/MExportDirDiscoverAck.h" -#include "messages/MExportDirPrep.h" -#include "messages/MExportDirPrepAck.h" -#include "messages/MExportDirWarning.h" -#include "messages/MExportDir.h" -#include "messages/MExportDirNotify.h" -#include "messages/MExportDirNotifyAck.h" -#include "messages/MExportDirFinish.h" - -#include "messages/MHashReaddir.h" -#include "messages/MHashReaddirReply.h" - -#include "messages/MHashDirDiscover.h" -#include "messages/MHashDirDiscoverAck.h" -#include "messages/MHashDirPrep.h" -#include "messages/MHashDirPrepAck.h" -#include "messages/MHashDir.h" -#include "messages/MHashDirAck.h" -#include "messages/MHashDirNotify.h" - -#include "messages/MUnhashDirPrep.h" -#include "messages/MUnhashDirPrepAck.h" -#include "messages/MUnhashDir.h" -#include "messages/MUnhashDirAck.h" -#include "messages/MUnhashDirNotify.h" -#include "messages/MUnhashDirNotifyAck.h" - -#include "messages/MRenameWarning.h" -#include "messages/MRenameNotify.h" -#include "messages/MRenameNotifyAck.h" -#include "messages/MRename.h" -#include "messages/MRenamePrep.h" -#include "messages/MRenameReq.h" -#include "messages/MRenameAck.h" -#include "messages/MDentryUnlink.h" - -#include "messages/MHeartbeat.h" - -#include "messages/MAnchorRequest.h" -#include "messages/MAnchorReply.h" -#include "messages/MInodeLink.h" -#include "messages/MInodeLinkAck.h" - -//#include "messages/MInodeUpdate.h" -#include "messages/MInodeExpire.h" -#include 
"messages/MDirExpire.h" -#include "messages/MCacheExpire.h" -#include "messages/MInodeFileCaps.h" - -#include "messages/MLock.h" - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug) cout << "messenger: " -#define DEBUGLVL 10 // debug level of output - - - - - - - -Message * -decode_message(msg_envelope_t& env, bufferlist& payload) -{ - // make message - Message *m = 0; - switch(env.type) { - - // -- with payload -- - - /* - case MSG_NS_CONNECT: - m = new MNSConnect(); - break; - case MSG_NS_CONNECTACK: - m = new MNSConnectAck(); - break; - case MSG_NS_REGISTER: - m = new MNSRegister(); - break; - case MSG_NS_REGISTERACK: - m = new MNSRegisterAck(); - break; - case MSG_NS_LOOKUP: - m = new MNSLookup(); - break; - case MSG_NS_LOOKUPREPLY: - m = new MNSLookupReply(); - break; - case MSG_NS_FAILURE: - m = new MNSFailure(); - break; - */ - - case MSG_MON_PAXOS: - m = new MMonPaxos; - break; - - case MSG_MON_ELECTION_PROPOSE: - m = new MMonElectionPropose; - break; - case MSG_MON_ELECTION_ACK: - m = new MMonElectionAck; - break; - case MSG_MON_ELECTION_VICTORY: - m = new MMonElectionVictory; - break; - - case MSG_PING: - m = new MPing(); - break; - case MSG_PING_ACK: - m = new MPingAck(); - break; - /* - case MSG_FAILURE: - m = new MFailure(); - break; - case MSG_FAILURE_ACK: - m = new MFailureAck(); - break; - */ - - case MSG_OSD_BOOT: - m = new MOSDBoot(); - break; - case MSG_OSD_IN: - m = new MOSDIn(); - break; - case MSG_OSD_OUT: - m = new MOSDOut(); - break; - case MSG_OSD_FAILURE: - m = new MOSDFailure(); - break; - case MSG_OSD_PING: - m = new MOSDPing(); - break; - case MSG_OSD_OP: - m = new MOSDOp(); - break; - case MSG_OSD_OPREPLY: - m = new MOSDOpReply(); - break; - - case MSG_OSD_MAP: - m = new MOSDMap(); - break; - case MSG_OSD_GETMAP: - m = new MOSDGetMap(); - break; - - case MSG_OSD_PG_NOTIFY: - m = new MOSDPGNotify(); - break; - case MSG_OSD_PG_QUERY: - m = new MOSDPGQuery(); - break; - case MSG_OSD_PG_LOG: - m = new MOSDPGLog(); - 
break; - case MSG_OSD_PG_REMOVE: - m = new MOSDPGRemove(); - break; - - // clients - case MSG_CLIENT_BOOT: - m = new MClientBoot(); - break; - case MSG_CLIENT_MOUNT: - m = new MClientMount(); - break; - case MSG_CLIENT_MOUNTACK: - m = new MClientMountAck(); - break; - case MSG_CLIENT_REQUEST: - m = new MClientRequest(); - break; - case MSG_CLIENT_REPLY: - m = new MClientReply(); - break; - case MSG_CLIENT_FILECAPS: - m = new MClientFileCaps(); - break; - - // mds - case MSG_MDS_GETMAP: - m = new MMDSGetMap(); - break; - case MSG_MDS_MAP: - m = new MMDSMap(); - break; - case MSG_MDS_BEACON: - m = new MMDSBeacon; - break; - case MSG_MDS_IMPORTMAP: - m = new MMDSImportMap; - break; - case MSG_MDS_CACHEREJOIN: - m = new MMDSCacheRejoin; - break; - case MSG_MDS_CACHEREJOINACK: - m = new MMDSCacheRejoinAck; - break; - - case MSG_MDS_DIRUPDATE: - m = new MDirUpdate(); - break; - - case MSG_MDS_DISCOVER: - m = new MDiscover(); - break; - case MSG_MDS_DISCOVERREPLY: - m = new MDiscoverReply(); - break; - - case MSG_MDS_EXPORTDIRDISCOVER: - m = new MExportDirDiscover(); - break; - case MSG_MDS_EXPORTDIRDISCOVERACK: - m = new MExportDirDiscoverAck(); - break; - - case MSG_MDS_EXPORTDIR: - m = new MExportDir(); - break; - - case MSG_MDS_EXPORTDIRFINISH: - m = new MExportDirFinish(); - break; - - case MSG_MDS_EXPORTDIRNOTIFY: - m = new MExportDirNotify(); - break; - - case MSG_MDS_EXPORTDIRNOTIFYACK: - m = new MExportDirNotifyAck(); - break; - - case MSG_MDS_EXPORTDIRPREP: - m = new MExportDirPrep(); - break; - - case MSG_MDS_EXPORTDIRPREPACK: - m = new MExportDirPrepAck(); - break; - - case MSG_MDS_EXPORTDIRWARNING: - m = new MExportDirWarning(); - break; - - - case MSG_MDS_HASHREADDIR: - m = new MHashReaddir(); - break; - case MSG_MDS_HASHREADDIRREPLY: - m = new MHashReaddirReply(); - break; - - case MSG_MDS_HASHDIRDISCOVER: - m = new MHashDirDiscover(); - break; - case MSG_MDS_HASHDIRDISCOVERACK: - m = new MHashDirDiscoverAck(); - break; - case MSG_MDS_HASHDIRPREP: - m = new 
MHashDirPrep(); - break; - case MSG_MDS_HASHDIRPREPACK: - m = new MHashDirPrepAck(); - break; - case MSG_MDS_HASHDIR: - m = new MHashDir(); - break; - case MSG_MDS_HASHDIRACK: - m = new MHashDirAck(); - break; - case MSG_MDS_HASHDIRNOTIFY: - m = new MHashDirNotify(); - break; - - case MSG_MDS_UNHASHDIRPREP: - m = new MUnhashDirPrep(); - break; - case MSG_MDS_UNHASHDIRPREPACK: - m = new MUnhashDirPrepAck(); - break; - case MSG_MDS_UNHASHDIR: - m = new MUnhashDir(); - break; - case MSG_MDS_UNHASHDIRACK: - m = new MUnhashDirAck(); - break; - case MSG_MDS_UNHASHDIRNOTIFY: - m = new MUnhashDirNotify(); - break; - case MSG_MDS_UNHASHDIRNOTIFYACK: - m = new MUnhashDirNotifyAck(); - break; - - case MSG_MDS_RENAMEWARNING: - m = new MRenameWarning(); - break; - case MSG_MDS_RENAMENOTIFY: - m = new MRenameNotify(); - break; - case MSG_MDS_RENAMENOTIFYACK: - m = new MRenameNotifyAck(); - break; - case MSG_MDS_RENAME: - m = new MRename(); - break; - case MSG_MDS_RENAMEPREP: - m = new MRenamePrep(); - break; - case MSG_MDS_RENAMEREQ: - m = new MRenameReq(); - break; - case MSG_MDS_RENAMEACK: - m = new MRenameAck(); - break; - - case MSG_MDS_DENTRYUNLINK: - m = new MDentryUnlink(); - break; - - case MSG_MDS_HEARTBEAT: - m = new MHeartbeat(); - break; - - case MSG_MDS_CACHEEXPIRE: - m = new MCacheExpire(); - break; - - case MSG_MDS_ANCHORREQUEST: - m = new MAnchorRequest(); - break; - case MSG_MDS_ANCHORREPLY: - m = new MAnchorReply(); - break; - - case MSG_MDS_INODELINK: - m = new MInodeLink(); - break; - case MSG_MDS_INODELINKACK: - m = new MInodeLinkAck(); - break; - - /* case MSG_MDS_INODEUPDATE: - m = new MInodeUpdate(); - break; - */ - - case MSG_MDS_INODEEXPIRE: - m = new MInodeExpire(); - break; - - case MSG_MDS_INODEFILECAPS: - m = new MInodeFileCaps(); - break; - - case MSG_MDS_DIREXPIRE: - m = new MDirExpire(); - break; - - case MSG_MDS_LOCK: - m = new MLock(); - break; - - - // -- simple messages without payload -- - - case MSG_CLOSE: - case MSG_NS_STARTED: - case 
MSG_NS_UNREGISTER: - case MSG_SHUTDOWN: - case MSG_MDS_SHUTDOWNSTART: - case MSG_MDS_SHUTDOWNFINISH: - case MSG_CLIENT_UNMOUNT: - case MSG_OSD_MKFS_ACK: - m = new MGenericMessage(env.type); - break; - - default: - dout(1) << "can't decode unknown message type " << env.type << endl; - assert(0); - } - - // env - m->set_envelope(env); - - // decode - m->set_payload(payload); - m->decode_payload(); - - // done! - return m; -} - - diff --git a/tags/20070517_before_mds_merge/msg/Message.h b/tags/20070517_before_mds_merge/msg/Message.h deleted file mode 100644 index 80e1b9feaac28..0000000000000 --- a/tags/20070517_before_mds_merge/msg/Message.h +++ /dev/null @@ -1,320 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MESSAGE_H -#define __MESSAGE_H - -#define MSG_CLOSE 0 - -#define MSG_NS_CONNECT 1 -#define MSG_NS_CONNECTACK 2 -#define MSG_NS_REGISTER 3 -#define MSG_NS_REGISTERACK 4 -#define MSG_NS_STARTED 5 -#define MSG_NS_UNREGISTER 6 -#define MSG_NS_LOOKUP 7 -#define MSG_NS_LOOKUPREPLY 8 -#define MSG_NS_FAILURE 9 - - -#define MSG_PING 10 -#define MSG_PING_ACK 11 - -#define MSG_FAILURE 12 -#define MSG_FAILURE_ACK 13 - -#define MSG_SHUTDOWN 99999 - - - -#define MSG_MON_ELECTION_ACK 15 -#define MSG_MON_ELECTION_PROPOSE 16 -#define MSG_MON_ELECTION_VICTORY 17 - -#define MSG_MON_OSDMAP_INFO 20 -#define MSG_MON_OSDMAP_LEASE 21 -#define MSG_MON_OSDMAP_LEASE_ACK 22 -#define MSG_MON_OSDMAP_UPDATE_PREPARE 23 -#define MSG_MON_OSDMAP_UPDATE_ACK 24 -#define MSG_MON_OSDMAP_UPDATE_COMMIT 25 - -#define MSG_MON_PAXOS 30 - -#define MSG_OSD_OP 40 // delete, etc. -#define MSG_OSD_OPREPLY 41 // delete, etc. 
-#define MSG_OSD_PING 42 - -#define MSG_OSD_GETMAP 43 -#define MSG_OSD_MAP 44 - -#define MSG_OSD_BOOT 45 -#define MSG_OSD_MKFS_ACK 46 - -#define MSG_OSD_FAILURE 47 - -#define MSG_OSD_IN 48 -#define MSG_OSD_OUT 49 - - - -#define MSG_OSD_PG_NOTIFY 50 -#define MSG_OSD_PG_QUERY 51 -#define MSG_OSD_PG_SUMMARY 52 -#define MSG_OSD_PG_LOG 53 -#define MSG_OSD_PG_REMOVE 54 - -#define MSG_CLIENT_REQUEST 60 -#define MSG_CLIENT_REPLY 61 -//#define MSG_CLIENT_DONE 62 -#define MSG_CLIENT_FILECAPS 63 -#define MSG_CLIENT_INODEAUTHUPDATE 64 - -#define MSG_CLIENT_BOOT 70 -#define MSG_CLIENT_MOUNT 71 -#define MSG_CLIENT_MOUNTACK 72 -#define MSG_CLIENT_UNMOUNT 73 - - -// *** MDS *** - -#define MSG_MDS_GETMAP 102 -#define MSG_MDS_MAP 103 -#define MSG_MDS_HEARTBEAT 104 // for mds load balancer -#define MSG_MDS_BEACON 105 // to monitor - -#define MSG_MDS_IMPORTMAP 106 -#define MSG_MDS_CACHEREJOIN 107 -#define MSG_MDS_CACHEREJOINACK 108 - -#define MSG_MDS_DISCOVER 110 -#define MSG_MDS_DISCOVERREPLY 111 - -#define MSG_MDS_INODEGETREPLICA 112 -#define MSG_MDS_INODEGETREPLICAACK 113 - -#define MSG_MDS_INODEFILECAPS 115 - -#define MSG_MDS_INODEUPDATE 120 -#define MSG_MDS_DIRUPDATE 121 -#define MSG_MDS_INODEEXPIRE 122 -#define MSG_MDS_DIREXPIRE 123 - -#define MSG_MDS_DIREXPIREREQ 124 - -#define MSG_MDS_CACHEEXPIRE 125 - -#define MSG_MDS_ANCHORREQUEST 130 -#define MSG_MDS_ANCHORREPLY 131 - -#define MSG_MDS_INODELINK 140 -#define MSG_MDS_INODELINKACK 141 -#define MSG_MDS_INODEUNLINK 142 -#define MSG_MDS_INODEUNLINKACK 143 - -#define MSG_MDS_EXPORTDIRDISCOVER 150 -#define MSG_MDS_EXPORTDIRDISCOVERACK 151 -#define MSG_MDS_EXPORTDIRPREP 152 -#define MSG_MDS_EXPORTDIRPREPACK 153 -#define MSG_MDS_EXPORTDIRWARNING 154 -#define MSG_MDS_EXPORTDIR 155 -#define MSG_MDS_EXPORTDIRNOTIFY 156 -#define MSG_MDS_EXPORTDIRNOTIFYACK 157 -#define MSG_MDS_EXPORTDIRFINISH 158 - - -#define MSG_MDS_HASHDIRDISCOVER 160 -#define MSG_MDS_HASHDIRDISCOVERACK 161 -#define MSG_MDS_HASHDIRPREP 162 -#define 
MSG_MDS_HASHDIRPREPACK 163 -#define MSG_MDS_HASHDIR 164 -#define MSG_MDS_HASHDIRACK 165 -#define MSG_MDS_HASHDIRNOTIFY 166 - -#define MSG_MDS_HASHREADDIR 168 -#define MSG_MDS_HASHREADDIRREPLY 169 - -#define MSG_MDS_UNHASHDIRPREP 170 -#define MSG_MDS_UNHASHDIRPREPACK 171 -#define MSG_MDS_UNHASHDIR 172 -#define MSG_MDS_UNHASHDIRACK 173 -#define MSG_MDS_UNHASHDIRNOTIFY 174 -#define MSG_MDS_UNHASHDIRNOTIFYACK 175 - -#define MSG_MDS_DENTRYUNLINK 200 - -#define MSG_MDS_RENAMEWARNING 300 // sent from src to bystanders -#define MSG_MDS_RENAMENOTIFY 301 // sent from dest to bystanders -#define MSG_MDS_RENAMENOTIFYACK 302 // sent back to src -#define MSG_MDS_RENAMEACK 303 // sent from src to initiator, to xlock_finish - -#define MSG_MDS_RENAMEPREP 304 // sent from initiator to dest auth (if dir) -#define MSG_MDS_RENAMEREQ 305 // sent from initiator (or dest if dir) to src auth -#define MSG_MDS_RENAME 306 // sent from src to dest, includes inode - -#define MSG_MDS_LOCK 500 - -#define MSG_MDS_SHUTDOWNSTART 900 -#define MSG_MDS_SHUTDOWNFINISH 901 - - -#include -#include - -#include -#include -using std::list; - -#include -#include - -using __gnu_cxx::crope; - -#include "include/types.h" -#include "include/buffer.h" -#include "msg_types.h" - - - - -// ====================================================== - -// abstract Message class - - - -typedef struct { - int type; - entity_inst_t src, dst; - int source_port, dest_port; - int nchunks; -} msg_envelope_t; - -#define MSG_ENVELOPE_LEN sizeof(msg_envelope_t) - - -class Message { - private: - - protected: - msg_envelope_t env; // envelope - bufferlist payload; // payload - - friend class Messenger; -public: - - public: - Message() { - env.source_port = env.dest_port = -1; - env.nchunks = 0; - }; - Message(int t) { - env.source_port = env.dest_port = -1; - env.nchunks = 0; - env.type = t; - } - virtual ~Message() { - } - - - // for rpc-type procedural messages (pcid = procedure call id) - virtual long get_pcid() { return 0; } - 
virtual void set_pcid(long t) { assert(0); } // overload me - - void clear_payload() { payload.clear(); } - bool empty_payload() { return payload.length() == 0; } - bufferlist& get_payload() { - return payload; - } - void set_payload(bufferlist& bl) { - payload.claim(bl); - } - msg_envelope_t& get_envelope() { - return env; - } - void set_envelope(msg_envelope_t& env) { - this->env = env; - } - - - // ENVELOPE ---- - - // type - int get_type() { return env.type; } - void set_type(int t) { env.type = t; } - virtual char *get_type_name() = 0; - - // source/dest - entity_inst_t& get_dest_inst() { return env.dst; } - void set_dest_inst(entity_inst_t& inst) { env.dst = inst; } - - entity_inst_t& get_source_inst() { return env.src; } - void set_source_inst(entity_inst_t& inst) { env.src = inst; } - - entity_name_t& get_dest() { return env.dst.name; } - void set_dest(entity_name_t a, int p) { env.dst.name = a; env.dest_port = p; } - int get_dest_port() { return env.dest_port; } - void set_dest_port(int p) { env.dest_port = p; } - - entity_name_t& get_source() { return env.src.name; } - void set_source(entity_name_t a, int p) { env.src.name = a; env.source_port = p; } - int get_source_port() { return env.source_port; } - - entity_addr_t& get_source_addr() { return env.src.addr; } - void set_source_addr(const entity_addr_t &i) { env.src.addr = i; } - - // PAYLOAD ---- - void reset_payload() { - payload.clear(); - } - - // overload either the rope version (easier!) - virtual void encode_payload(crope& s) { assert(0); } - virtual void decode_payload(crope& s, int& off) { assert(0); } - - // of the bufferlist versions (faster!) - virtual void decode_payload() { - // use a crope for convenience, small messages, etc. FIXME someday. 
- crope ser; - for (list::const_iterator it = payload.buffers().begin(); - it != payload.buffers().end(); - it++) - ser.append((*it).c_str(), (*it).length()); - - int off = 0; - decode_payload(ser, off); - assert((unsigned)off == payload.length()); - } - virtual void encode_payload() { - assert(payload.length() == 0); // caller should reset payload - - // use crope for convenience, small messages. FIXME someday. - crope r; - encode_payload(r); - - // copy payload - payload.push_back( buffer::copy(r.c_str(), r.length()) ); - } - - virtual void print(ostream& out) { - out << get_type_name(); - } - -}; - -extern Message *decode_message(msg_envelope_t &env, bufferlist& bl); -inline ostream& operator<<(ostream& out, Message& m) { - m.print(out); - return out; -} - -#endif diff --git a/tags/20070517_before_mds_merge/msg/Messenger.cc b/tags/20070517_before_mds_merge/msg/Messenger.cc deleted file mode 100644 index a6133260e9b9e..0000000000000 --- a/tags/20070517_before_mds_merge/msg/Messenger.cc +++ /dev/null @@ -1,38 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include -#include "include/types.h" - -#include "Message.h" -#include "Messenger.h" -#include "messages/MGenericMessage.h" - -#include -#include -using namespace std; - - -// --------- -// incoming messages - -void Messenger::dispatch(Message *m) -{ - assert(dispatcher); - dispatcher->dispatch(m); -} - - - diff --git a/tags/20070517_before_mds_merge/msg/Messenger.h b/tags/20070517_before_mds_merge/msg/Messenger.h deleted file mode 100644 index 991e80c839112..0000000000000 --- a/tags/20070517_before_mds_merge/msg/Messenger.h +++ /dev/null @@ -1,86 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __MESSENGER_H -#define __MESSENGER_H - -#include -using namespace std; - -#include "Message.h" -#include "Dispatcher.h" -#include "common/Mutex.h" -#include "common/Cond.h" -#include "include/Context.h" - - - -class MDS; -class Timer; - -class Messenger { - private: - Dispatcher *dispatcher; - entity_name_t _myname; - - public: - Messenger(entity_name_t w) : dispatcher(0), _myname(w) { } - virtual ~Messenger() { } - - // accessors - entity_name_t get_myname() { return _myname; } - void _set_myname(entity_name_t m) { _myname = m; } - - virtual void reset_myname(entity_name_t m) = 0; - - virtual const entity_addr_t &get_myaddr() = 0; - - entity_inst_t get_myinst() { return entity_inst_t(_myname, get_myaddr()); } - - // hrmpf. 
- virtual int get_dispatch_queue_len() { return 0; }; - - // setup - void set_dispatcher(Dispatcher *d) { dispatcher = d; ready(); } - Dispatcher *get_dispatcher() { return dispatcher; } - virtual void ready() { } - bool is_ready() { return dispatcher != 0; } - - // dispatch incoming messages - virtual void dispatch(Message *m) { - assert(dispatcher); - dispatcher->dispatch(m); - } - - // shutdown - virtual int shutdown() = 0; - - // send message - virtual void prepare_dest(const entity_addr_t& addr) {} - virtual int send_message(Message *m, entity_inst_t dest, - int port=0, int fromport=0) = 0; - - // make a procedure call - //virtual Message* sendrecv(Message *m, msg_name_t dest, int port=0); - - virtual void mark_down(entity_addr_t a) {} - -}; - - - - - -#endif diff --git a/tags/20070517_before_mds_merge/msg/NewMessenger.cc b/tags/20070517_before_mds_merge/msg/NewMessenger.cc deleted file mode 100644 index 1455c31724c68..0000000000000 --- a/tags/20070517_before_mds_merge/msg/NewMessenger.cc +++ /dev/null @@ -1,1714 +0,0 @@ - -#include "NewMessenger.h" - -#include -#include -#include -#include - -#include "config.h" - -#include "messages/MGenericMessage.h" -#include "messages/MNSConnect.h" -#include "messages/MNSConnectAck.h" -#include "messages/MNSRegister.h" -#include "messages/MNSRegisterAck.h" -#include "messages/MNSLookup.h" -#include "messages/MNSLookupReply.h" -#include "messages/MNSFailure.h" - -//#include "messages/MFailure.h" - -#include - - -#undef dout -#define dout(l) if (l<=g_conf.debug_ms) cout << g_clock.now() << " -- rank" << rank.my_rank << " " -#define derr(l) if (l<=g_conf.debug_ms) cerr << g_clock.now() << " -- rank" << rank.my_rank << " " - - - -#include "tcp.cc" - - -Rank rank; - - -/******************************************** - * Namer - */ - -Rank::Namer::Namer(EntityMessenger *msgr) : - messenger(msgr), - nrank(0), nclient(0), nmds(0), nosd(0), nmon(0) -{ - assert(rank.my_rank == 0); - nrank = g_conf.num_mon; - - // announce myself - /* 
- cerr << "ceph ns is " << rank.accepter.listen_addr << endl; - cout << "export CEPH_NAMESERVER=" << rank.accepter.listen_addr << endl; - int fd = ::open(".ceph_ns", O_WRONLY|O_CREAT); - ::write(fd, (void*)&rank.accepter.listen_addr, sizeof(tcpaddr_t)); - ::fchmod(fd, 0755); - ::close(fd); - */ - - // ok - messenger->set_dispatcher(this); -} - -Rank::Namer::~Namer() -{ - //::unlink(".ceph_ns"); -} - - -void Rank::Namer::dispatch(Message *m) -{ - rank.lock.Lock(); - int type = m->get_type(); - switch (type) { - case MSG_NS_CONNECT: - handle_connect((class MNSConnect*)m); - break; - case MSG_NS_REGISTER: - handle_register((class MNSRegister*)m); - break; - case MSG_NS_STARTED: - handle_started(m); - break; - case MSG_NS_UNREGISTER: - handle_unregister(m); - break; - case MSG_NS_LOOKUP: - handle_lookup((class MNSLookup*)m); - break; - case MSG_NS_FAILURE: - handle_failure((class MNSFailure*)m); - break; - - case MSG_FAILURE_ACK: - delete m; - break; - - default: - assert(0); - } - rank.lock.Unlock(); -} - -void Rank::Namer::handle_connect(MNSConnect *m) -{ - int newrank = nrank++; - dout(2) << "namer.handle_connect from new rank" << newrank << " " << m->get_addr() << endl; - - rank.entity_map[MSG_ADDR_RANK(newrank)].addr = m->get_addr(); - rank.entity_map[MSG_ADDR_RANK(newrank)].rank = newrank; - rank.entity_unstarted.insert(MSG_ADDR_RANK(newrank)); - - messenger->send_message(new MNSConnectAck(newrank), - MSG_ADDR_RANK(newrank), rank.entity_map[MSG_ADDR_RANK(newrank)]); - delete m; -} - -void Rank::Namer::manual_insert_inst(const entity_inst_t &inst) -{ - rank.entity_map[MSG_ADDR_RANK(inst.rank)] = inst; -} - -void Rank::Namer::handle_register(MNSRegister *m) -{ - dout(10) << "namer.handle_register from rank " << m->get_rank() - << " addr " << m->get_entity() << endl; - - // pick id - entity_name_t entity = m->get_entity(); - - if (entity.is_new()) { - // make up a new address! 
- switch (entity.type()) { - case MSG_ADDR_MDS_BASE: - entity = MSG_ADDR_MDS(nmds++); - break; - - case MSG_ADDR_OSD_BASE: - entity = MSG_ADDR_OSD(nosd++); - break; - - case MSG_ADDR_CLIENT_BASE: - entity = MSG_ADDR_CLIENT(nclient++); - break; - - default: - assert(0); - } - } else { - // specific address! - } - - - // register - if (rank.entity_map.count(entity)) { - dout(1) << "namer.handle_register re-registering " << entity - << " inst " << m->get_source_inst() - << " (was " << rank.entity_map[entity] << ")" - << endl; - } else { - dout(1) << "namer.handle_register registering " << entity - << " inst " << m->get_source_inst() - << endl; - } - rank.entity_map[entity] = m->get_source_inst(); - rank.entity_unstarted.insert(entity); - - // reply w/ new id - messenger->send_message(new MNSRegisterAck(m->get_tid(), entity), - m->get_source(), rank.entity_map[entity]); - - delete m; -} - -void Rank::Namer::handle_started(Message *m) -{ - entity_name_t who = m->get_source(); - dout(10) << "namer.handle_started from entity " << who << endl; - - assert(rank.entity_unstarted.count(who)); - rank.entity_unstarted.erase(who); - - // anybody waiting? - if (waiting.count(who)) { - list ls; - ls.swap(waiting[who]); - waiting.erase(who); - - dout(10) << "doing waiters on " << who << endl; - for (list::iterator it = ls.begin(); - it != ls.end(); - it++) - dispatch(*it); - } - -} - -void Rank::Namer::handle_unregister(Message *m) -{ - entity_name_t who = m->get_source(); - dout(1) << "namer.handle_unregister entity " << who << endl; - - rank.show_dir(); - - assert(rank.entity_map.count(who)); - rank.entity_map.erase(who); - - rank.show_dir(); - - // shut myself down? kick watcher. - if (rank.entity_map.size() == 2) { - dout(10) << "namer.handle_unregister stopping namer" << endl; - rank.lock.Unlock(); - messenger->shutdown(); - delete messenger; - rank.lock.Lock(); - } - - delete m; -} - - -void Rank::Namer::handle_lookup(MNSLookup *m) -{ - // have it? 
- if (rank.entity_map.count(m->get_entity()) == 0) { - dout(10) << "namer " << m->get_source() << " lookup '" << m->get_entity() << "' -> dne" << endl; - waiting[m->get_entity()].push_back(m); - return; - } - - if (rank.entity_unstarted.count(m->get_entity())) { - dout(10) << "namer " << m->get_source() << " lookup '" << m->get_entity() << "' -> unstarted" << endl; - waiting[m->get_entity()].push_back(m); - return; - } - - // look it up! - MNSLookupReply *reply = new MNSLookupReply(m); - - reply->entity_map[m->get_entity()] = rank.entity_map[m->get_entity()]; - - dout(10) << "namer " << m->get_source() - << " lookup '" << m->get_entity() - << "' -> " << rank.entity_map[m->get_entity()] << endl; - - messenger->send_message(reply, m->get_source(), m->get_source_inst()); - delete m; -} - -void Rank::Namer::handle_failure(MNSFailure *m) -{ - dout(10) << "namer.handle_failure inst " << m->get_inst() - << endl; - - // search for entities on this instance - list rm; - for (hash_map::iterator i = rank.entity_map.begin(); - i != rank.entity_map.end(); - i++) { - if (i->second != m->get_inst()) continue; - rm.push_back(i->first); - } - for (list::iterator i = rm.begin(); - i != rm.end(); - i++) { - dout(10) << "namer.handle_failure inst " << m->get_inst() - << ", removing " << *i << endl; - - rank.entity_map.erase(*i); - rank.entity_unstarted.erase(*i); - - /* - if ((*i).is_osd()) { - // tell the monitor - messenger->send_message(new MFailure(*i, m->get_inst()), MSG_ADDR_MON(0)); - } - */ - } - - delete m; -} - - - -/******************************************** - * Accepter - */ - -int Rank::Accepter::start() -{ - // bind to a socket - dout(10) << "accepter.start binding to listen " << endl; - - /* socket creation */ - listen_sd = socket(AF_INET,SOCK_STREAM,0); - assert(listen_sd > 0); - - /* bind to port */ - memset((char*)&listen_addr, 0, sizeof(listen_addr)); - listen_addr.sin_family = AF_INET; - listen_addr.sin_addr.s_addr = htonl(INADDR_ANY); - listen_addr.sin_port = 0; 
- - int rc = bind(listen_sd, (struct sockaddr *) &listen_addr, sizeof(listen_addr)); - assert(rc >= 0); - - socklen_t llen = sizeof(listen_addr); - getsockname(listen_sd, (sockaddr*)&listen_addr, &llen); - - int myport = listen_addr.sin_port; - - // listen! - rc = ::listen(listen_sd, 1000); - assert(rc >= 0); - - //dout(10) << "accepter.start listening on " << myport << endl; - - // my address is... - char host[100]; - bzero(host, 100); - gethostname(host, 100); - //dout(10) << "accepter.start my hostname is " << host << endl; - - struct hostent *myhostname = gethostbyname( host ); - - struct sockaddr_in my_addr; - memset(&my_addr, 0, sizeof(my_addr)); - - my_addr.sin_family = myhostname->h_addrtype; - memcpy((char *) &my_addr.sin_addr.s_addr, - myhostname->h_addr_list[0], - myhostname->h_length); - my_addr.sin_port = myport; - - listen_addr = my_addr; - - dout(10) << "accepter.start listen addr is " << listen_addr << endl; - - // start thread - create(); - - return 0; -} - -void *Rank::Accepter::entry() -{ - dout(10) << "accepter starting" << endl; - - while (!done) { - // accept - struct sockaddr_in addr; - socklen_t slen = sizeof(addr); - int sd = ::accept(listen_sd, (sockaddr*)&addr, &slen); - if (sd > 0) { - dout(10) << "accepted incoming on sd " << sd << endl; - - Receiver *r = new Receiver(sd); - r->create(); - - rank.lock.Lock(); - rank.receivers.insert(r); - rank.lock.Unlock(); - } else { - dout(10) << "no incoming connection?" 
<< endl; - break; - } - } - - return 0; -} - - -/************************************** - * Receiver - */ - -void *Rank::Receiver::entry() -{ - while (!done) { - Message *m = read_message(); - if (!m) { - ::close(sd); - break; - } - - dout(10) << "receiver.entry got message for " << m->get_dest() << endl; - - EntityMessenger *entity = 0; - - rank.lock.Lock(); - { - if (rank.down.count(m->get_dest())) { - dout(0) << "receiver.entry dest " << m->get_dest() << " down, dropping " << *m << endl; - delete m; - - if (rank.looking_up.count(m->get_dest()) == 0) - rank.lookup(m->get_dest()); - } - else if (rank.entity_map.count(m->get_source()) && - rank.entity_map[m->get_source()] > m->get_source_inst()) { - derr(0) << "receiver.entry source " << m->get_source() - << " inst " << m->get_source_inst() - << " < " << rank.entity_map[m->get_source()] - << ", dropping " << *m << endl; - delete m; - } - else { - if (rank.entity_map.count(m->get_source()) && - rank.entity_map[m->get_source()] > m->get_source_inst()) { - derr(0) << "receiver.entry source " << m->get_source() - << " inst " << m->get_source_inst() - << " > " << rank.entity_map[m->get_source()] - << ", WATCH OUT " << *m << endl; - rank.entity_map[m->get_source()] = m->get_source_inst(); - } - - if (m->get_dest().type() == MSG_ADDR_RANK_BASE) { - // ours. 
- rank.dispatch(m); - } else { - if (g_conf.ms_single_dispatch) { - // submit to single dispatch queue - rank._submit_single_dispatch(m); - } else { - if (rank.local.count(m->get_dest())) { - // find entity - entity = rank.local[m->get_dest()]; - } else { - derr(0) << "got message " << *m << " for " << m->get_dest() << ", which isn't local" << endl; - rank.waiting_for_lookup[m->get_dest()].push_back(m); - } - } - } - } - } - rank.lock.Unlock(); - - if (entity) - entity->queue_message(m); // queue - } - - // add to reap queue - rank.lock.Lock(); - rank.receiver_reap_queue.push_back(this); - rank.wait_cond.Signal(); - rank.lock.Unlock(); - - return 0; -} - -Message *Rank::Receiver::read_message() -{ - // envelope - //dout(10) << "receiver.read_message from sd " << sd << endl; - - msg_envelope_t env; - if (!tcp_read( sd, (char*)&env, sizeof(env) )) - return 0; - - if (env.type == 0) { - dout(10) << "receiver got dummy env, bailing" << endl; - return 0; - } - - dout(20) << "receiver got envelope type=" << env.type - << " src " << env.source << " dst " << env.dest - << " nchunks=" << env.nchunks - << endl; - - // payload - bufferlist blist; - for (int i=0; iget_source() << endl; - - return m; -} - - -/************************************** - * Sender - */ - -int Rank::Sender::connect() -{ - dout(10) << "sender(" << inst << ").connect" << endl; - - // create socket? - sd = socket(AF_INET,SOCK_STREAM,0); - assert(sd > 0); - - // bind any port - struct sockaddr_in myAddr; - myAddr.sin_family = AF_INET; - myAddr.sin_addr.s_addr = htonl(INADDR_ANY); - myAddr.sin_port = htons( 0 ); - - int rc = bind(sd, (struct sockaddr *) &myAddr, sizeof(myAddr)); - assert(rc>=0); - - // connect! - int r = ::connect(sd, (sockaddr*)&inst.addr, sizeof(myAddr)); - if (r < 0) return r; - - // identify myself - // FIXME - - return 0; -} - - -void Rank::Sender::finish() -{ - dout(10) << "sender(" << inst << ").finish" << endl; - - // make sure i get reaped. 
- rank.lock.Lock(); - rank.sender_reap_queue.push_back(this); - rank.wait_cond.Signal(); - rank.lock.Unlock(); -} - -void Rank::Sender::fail_and_requeue(list& out) -{ - dout(10) << "sender(" << inst << ").fail" << endl;// and requeue" << endl; - - // tell namer - if (!rank.messenger) { - derr(0) << "FATAL error: can't send failure to namer0, not connected yet" << endl; - assert(0); - } - - // old and unnecessary? - if (0) - rank.messenger->send_message(new MNSFailure(inst), - MSG_ADDR_NAMER(0)); - - - // FIXME: possible race before i reclaim lock here? - - Dispatcher *dis = 0; - entity_name_t dis_dest; - - list lost; - - // requeue my messages - rank.lock.Lock(); - lock.Lock(); - { - // include out at front of queue - q.splice(q.begin(), out); - dout(10) << "sender(" << inst << ").fail " - << q.size() << " messages" << endl; - - if (0) { - lost.swap(q); - } else { - - while (!q.empty()) { - // don't keep reconnecting.. - if (rank.entity_map.count(q.front()->get_dest()) && - rank.entity_map[q.front()->get_dest()] == inst) - rank.down.insert(q.front()->get_dest()); - //rank.entity_map.erase(q.front()->get_dest()); - - if (!dis && - rank.local.count(q.front()->get_source())) { - dis_dest = q.front()->get_dest(); - dis = rank.local[q.front()->get_source()]->get_dispatcher(); - } - - if (g_conf.ms_requeue_on_sender_fail) - rank.submit_message( q.front() ); - else - lost.push_back( q.front() ); - q.pop_front(); - } - } - - // deactivate myself - if (rank.rank_sender.count(inst.rank) && - rank.rank_sender[inst.rank] == this) - rank.rank_sender.erase(inst.rank); - - // stop sender loop - done = true; - } - lock.Unlock(); - - - // send special failure msg? 
- if (dis) { - for (list::iterator p = lost.begin(); - p != lost.end(); - p++) - dis->ms_handle_failure(*p, dis_dest, inst); - } - - rank.lock.Unlock(); -} - -void *Rank::Sender::entry() -{ - // connect - if (sd == 0) { - int rc = connect(); - if (rc < 0) { - list out; - derr(0) << "error connecting to " << inst << endl; - fail_and_requeue(out); - finish(); - return 0; - } - } - - lock.Lock(); - while (!q.empty() || !done) { - - if (!q.empty()) { - dout(20) << "sender(" << inst << ") grabbing message(s)" << endl; - - // grab outgoing list - list out; - out.swap(q); - - // drop lock while i send these - lock.Unlock(); - - while (!out.empty()) { - Message *m = out.front(); - out.pop_front(); - - dout(20) << "sender(" << inst << ") sending " << *m << endl; - - // stamp. - m->set_source_inst(rank.my_inst); - - // marshall - if (m->empty_payload()) - m->encode_payload(); - - if (write_message(m) < 0) { - // failed! - derr(0) << "error sending to " << m->get_dest() << " on " << inst << endl; - out.push_front(m); - fail_and_requeue(out); - break; - } - } - - lock.Lock(); - continue; - } - - // wait - dout(20) << "sender(" << inst << ") sleeping" << endl; - cond.Wait(lock); - } - lock.Unlock(); - - finish(); - return 0; -} - - -int Rank::Sender::write_message(Message *m) -{ - // get envelope, buffers - msg_envelope_t *env = &m->get_envelope(); - bufferlist blist; - blist.claim( m->get_payload() ); - -#ifdef TCP_KEEP_CHUNKS - env->nchunks = blist.buffers().size(); -#else - env->nchunks = 1; -#endif - - dout(20)// << g_clock.now() - << " sending " << m << " " << *m - << " to " << m->get_dest() - << endl; - - // send envelope - int r = tcp_write( sd, (char*)env, sizeof(*env) ); - if (r < 0) { - derr(20) << "error sending envelope for " << *m - << " to " << m->get_dest() << endl; - return -1; - } - - // payload -#ifdef TCP_KEEP_CHUNKS - // send chunk-wise - int i = 0; - for (list::iterator it = blist.buffers().begin(); - it != blist.buffers().end(); - it++) { - dout(10) << 
"tcp_sending frag " << i << " len " << (*it).length() << endl; - int size = (*it).length(); - r = tcp_write( sd, (char*)&size, sizeof(size) ); - if (r < 0) { - derr(20) << "error sending chunk len for " << *m << " to " << m->get_dest() << endl; - return -1; - } - r = tcp_write( sd, (*it).c_str(), size ); - if (r < 0) { - derr(20) << "error sending data chunk for " << *m << " to " << m->get_dest() << endl; - return -1; - } - i++; - } -#else - // one big chunk - int size = blist.length(); - r = tcp_write( sd, (char*)&size, sizeof(size) ); - if (r < 0) { - derr(20) << "error sending data len for " << *m << " to " << m->get_dest() << endl; - return -1; - } - for (list::iterator it = blist.buffers().begin(); - it != blist.buffers().end(); - it++) { - r = tcp_write( sd, (*it).c_str(), (*it).length() ); - if (r < 0) { - derr(20) << "error sending data megachunk for " << *m << " to " << m->get_dest() << " : len " << (*it).length() << endl; - return -1; - } - } -#endif - - // delete message - delete m; - return 0; -} - - - -/******************************************** - * Rank - */ - -Rank::Rank(int r) : - single_dispatcher(this), - my_rank(r), - namer(0) { -} -Rank::~Rank() -{ - //FIXME - if (namer) delete namer; -} - - -void Rank::_submit_single_dispatch(Message *m) -{ - assert(lock.is_locked()); - - if (local.count(m->get_dest()) && - local[m->get_dest()]->is_ready()) { - rank.single_dispatch_queue.push_back(m); - rank.single_dispatch_cond.Signal(); - } else { - waiting_for_ready[m->get_dest()].push_back(m); - } -} - - -void Rank::single_dispatcher_entry() -{ - lock.Lock(); - while (!single_dispatch_stop || !single_dispatch_queue.empty()) { - if (!single_dispatch_queue.empty()) { - list ls; - ls.swap(single_dispatch_queue); - - lock.Unlock(); - { - while (!ls.empty()) { - Message *m = ls.front(); - ls.pop_front(); - - dout(1) //<< g_clock.now() - << "---- " - << m->get_source() << ':' << m->get_source_port() - << " to " << m->get_dest() << ':' << m->get_dest_port() - << 
" ---- " << m->get_type_name() - << " ---- " << m - << endl; - - if (m->get_dest().type() == MSG_ADDR_RANK_BASE) - rank.dispatch(m); - else { - assert(local.count(m->get_dest())); - local[m->get_dest()]->dispatch(m); - } - } - } - lock.Lock(); - continue; - } - single_dispatch_cond.Wait(lock); - } - lock.Unlock(); -} - - -/* - * note: assumes lock is held - */ -void Rank::reaper() -{ - assert(lock.is_locked()); - - while (!receiver_reap_queue.empty()) { - Receiver *r = receiver_reap_queue.front(); - receiver_reap_queue.pop_front(); - //dout(10) << "reaper reaping receiver sd " << r->sd << endl; - receivers.erase(r); - r->join(); - dout(10) << "reaper reaped receiver sd " << r->sd << endl; - delete r; - } - - while (!sender_reap_queue.empty()) { - Sender *s = sender_reap_queue.front(); - sender_reap_queue.pop_front(); - //dout(10) << "reaper reaping sender rank " << s->dest_rank << " at " << s->tcpaddr << endl; - if (rank_sender.count(s->inst.rank) && - rank_sender[s->inst.rank] == s) - rank_sender.erase(s->inst.rank); - s->join(); - dout(10) << "reaper reaped sender " << s->inst << endl; - delete s; - } -} - - -int Rank::start_rank() -{ - dout(10) << "start_rank" << endl; - - // bind to a socket - if (accepter.start() < 0) - return -1; - - // start single thread dispatcher? 
- if (g_conf.ms_single_dispatch) { - single_dispatch_stop = false; - single_dispatcher.create(); - } - - lock.Lock(); - - if (my_rank < 0) { - dout(10) << "start_rank connecting to namer0" << endl; - - // connect to namer - assert(entity_map.count(MSG_ADDR_NAMER(0))); - Sender *sender = connect_rank(entity_map[MSG_ADDR_NAMER(0)]); - - // send - Message *m = new MNSConnect(accepter.listen_addr); - m->set_dest(MSG_ADDR_NAMER(0), 0); - sender->send(m); - - // wait - while (my_rank < 0) - waiting_for_rank.Wait(lock); - assert(my_rank >= 0); - - dout(10) << "start_rank got rank " << my_rank << endl; - - // create rank entity - entity_map[MSG_ADDR_RANK(my_rank)] = my_inst; - local[MSG_ADDR_RANK(my_rank)] = messenger = new EntityMessenger(MSG_ADDR_RANK(my_rank)); - messenger->set_dispatcher(this); - } else { - // my_inst - my_inst.addr = accepter.listen_addr; - my_inst.rank = my_rank; - - // create my rank - entity_name_t raddr = MSG_ADDR_RANK(my_rank); - entity_map[raddr] = my_inst; - entity_unstarted.insert(raddr); - local[raddr] = messenger = new EntityMessenger(raddr); - messenger->set_dispatcher(this); - - dout(1) << "start_rank " << my_rank << " at " << my_inst << endl; - } - - lock.Unlock(); - return 0; -} - -void Rank::start_namer() -{ - // create namer0 - entity_name_t naddr = MSG_ADDR_NAMER(0); - entity_map[naddr] = my_inst; - local[naddr] = new EntityMessenger(naddr); - namer = new Namer(local[naddr]); -} - -void Rank::set_namer(const tcpaddr_t& ns) -{ - entity_map[MSG_ADDR_NAMER(0)].addr = ns; - entity_map[MSG_ADDR_NAMER(0)].rank = 0; -} - -/* connect_rank - * NOTE: assumes rank.lock held. - */ -Rank::Sender *Rank::connect_rank(const entity_inst_t& inst) -{ - assert(rank.lock.is_locked()); - assert(inst != rank.my_inst); - - dout(10) << "connect_rank to " << inst << endl; - - // create sender - Sender *sender = new Sender(inst); - //int rc = sender->connect(); - //assert(rc >= 0); - - // start thread. - sender->create(); - - // old sender? 
- assert(rank.rank_sender.count(inst.rank) == 0); - //if (rank.rank_sender.count(r)) - //rank.rank_sender[r]->stop(); - - // ok! - rank.rank_sender[inst.rank] = sender; - return sender; -} - - - - - -void Rank::show_dir() -{ - dout(10) << "show_dir ---" << endl; - - for (hash_map::iterator i = entity_map.begin(); - i != entity_map.end(); - i++) { - if (local.count(i->first)) { - dout(10) << "show_dir entity_map " << i->first << " -> " << i->second << " local " << endl; - } else { - dout(10) << "show_dir entity_map " << i->first << " -> " << i->second << endl; - } - } -} - - -/* lookup - * NOTE: assumes directory.lock held - */ -void Rank::lookup(entity_name_t addr) -{ - dout(10) << "lookup " << addr << endl; - assert(lock.is_locked()); - - assert(looking_up.count(addr) == 0); - looking_up.insert(addr); - - MNSLookup *r = new MNSLookup(addr); - messenger->send_message(r, MSG_ADDR_DIRECTORY); -} - - - -/* register_entity - */ -Rank::EntityMessenger *Rank::register_entity(entity_name_t addr) -{ - dout(10) << "register_entity " << addr << endl; - lock.Lock(); - - // register with namer - static long reg_attempt = 0; - long id = ++reg_attempt; - - Message *reg = new MNSRegister(addr, my_rank, id); - reg->set_source(MSG_ADDR_RANK(my_rank), 0); - reg->set_source_inst(my_inst); - reg->set_dest(MSG_ADDR_DIRECTORY, 0); - - // prepare cond - Cond cond; - waiting_for_register_cond[id] = &cond; - - // send request - lock.Unlock(); - submit_message(reg); - lock.Lock(); - - // wait - while (!waiting_for_register_result.count(id)) - cond.Wait(lock); - - // grab result - addr = waiting_for_register_result[id]; - dout(10) << "register_entity got " << addr << endl; - - // clean up - waiting_for_register_cond.erase(id); - waiting_for_register_result.erase(id); - - // create messenger - EntityMessenger *msgr = new EntityMessenger(addr); - - // add to directory - entity_map[addr] = my_inst; - local[addr] = msgr; - - // was anyone waiting? 
- if (waiting_for_lookup.count(addr)) { - submit_messages(waiting_for_lookup[addr]); - waiting_for_lookup.erase(addr); - } - - lock.Unlock(); - return msgr; -} - -void Rank::unregister_entity(EntityMessenger *msgr) -{ - lock.Lock(); - dout(10) << "unregister_entity " << msgr->get_myaddr() << endl; - - // remove from local directory. - assert(local.count(msgr->get_myaddr())); - local.erase(msgr->get_myaddr()); - - if (my_rank > 0) { - assert(entity_map.count(msgr->get_myaddr())); - entity_map.erase(msgr->get_myaddr()); - } // else namer will do it. - - // tell namer. - if (msgr->get_myaddr() != MSG_ADDR_NAMER(0) && - msgr->get_myaddr() != MSG_ADDR_RANK(0)) - msgr->send_message(new MGenericMessage(MSG_NS_UNREGISTER), - MSG_ADDR_NAMER(0)); - - // kick wait()? - if (local.size() <= 2) - wait_cond.Signal(); - - lock.Unlock(); -} - - -void Rank::submit_messages(list& ls) -{ - for (list::iterator i = ls.begin(); i != ls.end(); i++) - submit_message(*i); - ls.clear(); -} - - -void Rank::prepare_dest(entity_name_t dest) -{ - lock.Lock(); - - if (entity_map.count( dest )) { - // remote, known rank addr. - entity_inst_t inst = entity_map[dest]; - - if (inst == my_inst) { - //dout(20) << "submit_message " << *m << " dest " << dest << " local but mid-register, waiting." << endl; - //waiting_for_lookup[dest].push_back(m); - } - else if (rank_sender.count( inst.rank ) && - rank_sender[inst.rank]->inst == inst) { - //dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connected." << endl; - // connected. - //sender = rank_sender[ inst.rank ]; - } else { - //dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connecting." << endl; - // not connected. - connect_rank( inst ); - } - } else { - // unknown dest rank or rank addr. 
- if (looking_up.count(dest) == 0) { - //dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, looking up" << endl; - lookup(dest); - } else { - //dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, already looking up" << endl; - } - } - - lock.Unlock(); -} - -void Rank::submit_message(Message *m, const entity_inst_t& dest_inst) -{ - const entity_name_t dest = m->get_dest(); - - // lookup - EntityMessenger *entity = 0; - Sender *sender = 0; - - lock.Lock(); - { - // local? - if (dest_inst.rank == my_inst.rank) { - if (local.count(dest)) { - // local - dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl; - if (g_conf.ms_single_dispatch) { - _submit_single_dispatch(m); - } else { - entity = local[dest]; - } - } else { - // mid-register - dout(20) << "submit_message " << *m << " dest " << dest << " local but mid-register, waiting." << endl; - assert(0); - waiting_for_lookup[dest].push_back(m); - } - } - else { - // remote. - if (rank_sender.count( dest_inst.rank )) { - //&& - //rank_sender[dest_inst.rank]->inst == dest_inst) { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", connected." << endl; - // connected. - sender = rank_sender[ dest_inst.rank ]; - } else { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", connecting." << endl; - // not connected. - sender = connect_rank( dest_inst ); - } - } - } - lock.Unlock(); - - // do it - if (entity) { - // local! - dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl; - entity->queue_message(m); - } - else if (sender) { - // remote! 
- dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl; - sender->send(m); - } -} - - -void Rank::submit_message(Message *m) -{ - const entity_name_t dest = m->get_dest(); - - // lookup - EntityMessenger *entity = 0; - Sender *sender = 0; - - lock.Lock(); - { - if (down.count(dest)) { - // black hole. - dout(0) << "submit_message " << *m << " dest " << dest << " down, dropping" << endl; - delete m; - - if (looking_up.count(dest) == 0) - lookup(dest); - - } else if (local.count(dest)) { - dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl; - - // local - if (g_conf.ms_single_dispatch) { - _submit_single_dispatch(m); - } else { - entity = local[dest]; - } - } else if (entity_map.count( dest )) { - // remote, known rank addr. - entity_inst_t inst = entity_map[dest]; - - if (inst == my_inst) { - dout(20) << "submit_message " << *m << " dest " << dest << " local but mid-register, waiting." << endl; - waiting_for_lookup[dest].push_back(m); - } - else if (rank_sender.count( inst.rank ) && - rank_sender[inst.rank]->inst == inst) { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connected." << endl; - // connected. - sender = rank_sender[ inst.rank ]; - } else { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connecting." << endl; - // not connected. - sender = connect_rank( inst ); - } - } else { - // unknown dest rank or rank addr. - if (looking_up.count(dest) == 0) { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, looking up" << endl; - lookup(dest); - } else { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, already looking up" << endl; - } - waiting_for_lookup[dest].push_back(m); - } - } - lock.Unlock(); - - // do it - if (entity) { - // local! 
- dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl; - entity->queue_message(m); - } - else if (sender) { - // remote! - dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl; - sender->send(m); - } -} - - - - -void Rank::dispatch(Message *m) -{ - lock.Lock(); - - dout(10) << "dispatching " << *m << endl; - - switch (m->get_type()) { - case MSG_NS_CONNECTACK: - handle_connect_ack((MNSConnectAck*)m); - break; - - case MSG_NS_REGISTERACK: - handle_register_ack((MNSRegisterAck*)m); - break; - - case MSG_NS_LOOKUPREPLY: - handle_lookup_reply((MNSLookupReply*)m); - break; - - default: - assert(0); - } - - lock.Unlock(); -} - -void Rank::handle_connect_ack(MNSConnectAck *m) -{ - dout(10) << "handle_connect_ack, my rank is " << m->get_rank() << endl; - my_rank = m->get_rank(); - - my_inst.addr = accepter.listen_addr; - my_inst.rank = my_rank; - - waiting_for_rank.SignalAll(); - delete m; - - // logger! - /*dout(10) << "logger" << endl; - char names[100]; - sprintf(names, "rank%d", my_rank); - string name = names; - - if (g_conf.tcp_log) { - logger = new Logger(name, (LogType*)&rank_logtype); - rank_logtype.add_set("num"); - rank_logtype.add_inc("in"); - rank_logtype.add_inc("inb"); - rank_logtype.add_inc("dis"); - rank_logtype.add_set("inq"); - rank_logtype.add_set("inqb"); - rank_logtype.add_set("outq"); - rank_logtype.add_set("outqb"); - } - */ -} - - -void Rank::handle_register_ack(MNSRegisterAck *m) -{ - dout(10) << "handle_register_ack " << m->get_entity() << endl; - - long tid = m->get_tid(); - waiting_for_register_result[tid] = m->get_entity(); - waiting_for_register_cond[tid]->Signal(); - delete m; -} - -void Rank::handle_lookup_reply(MNSLookupReply *m) -{ - list waiting; - dout(10) << "got lookup reply" << endl; - - for (map::iterator it = m->entity_map.begin(); - it != m->entity_map.end(); - it++) { - dout(10) << "lookup got " << it->first << " at " << it->second << endl; - entity_name_t addr = 
it->first; - entity_inst_t inst = it->second; - - if (down.count(addr)) { - // ignore - dout(10) << "ignoring lookup results for " << addr << ", who is down" << endl; - //assert(entity_map.count(addr) == 0); - continue; - } - - if (entity_map.count(addr) && - entity_map[addr] > inst) { - dout(10) << "ignoring lookup results for " << addr << ", " \ - << entity_map[addr] << " > " << inst << endl; - continue; - } - - // update map. - entity_map[addr] = inst; - - if (inst.rank == my_rank) { - // local - dout(10) << "delivering lookup results locally" << endl; - if (local.count(addr)) { - if (g_conf.ms_single_dispatch) { - single_dispatch_queue.splice(single_dispatch_queue.end(), - waiting_for_lookup[addr]); - } else { - local[addr]->queue_messages(waiting_for_lookup[addr]); - } - waiting_for_lookup.erase(addr); - } else - lookup(addr); // try again! - - } else { - // remote - if (rank_sender.count(inst.rank) == 0) - connect_rank(inst); - else if (rank_sender[inst.rank]->inst != inst) { - dout(0) << "lookup got rank addr change, WATCH OUT" << endl; - // FIXME BUG possible message loss weirdness? - rank_sender[inst.rank]->stop(); - rank_sender.erase(inst.rank); - connect_rank(inst); - } - - // take waiters - Sender *sender = rank_sender[inst.rank]; - assert(sender); - - if (waiting_for_lookup.count(addr)) { - sender->send(waiting_for_lookup[addr]); - waiting_for_lookup.erase(addr); - } - } - } - - delete m; -} - - -void Rank::wait() -{ - lock.Lock(); - while (1) { - // reap dead senders, receivers. - reaper(); - - if (local.size() == 0) { - dout(10) << "wait: everything stopped" << endl; - break; // everything stopped. - } - - if (local.size() == 1 && - !messenger->is_stopped()) { - dout(10) << "wait: stopping rank" << endl; - lock.Unlock(); - messenger->shutdown(); - delete messenger; - lock.Lock(); - continue; - } - - wait_cond.Wait(lock); - } - lock.Unlock(); - - // done! clean up. 
- - // stop dispatch thread - if (g_conf.ms_single_dispatch) { - dout(10) << "wait: stopping dispatch thread" << endl; - lock.Lock(); - single_dispatch_stop = true; - single_dispatch_cond.Signal(); - lock.Unlock(); - single_dispatcher.join(); - } - - // reap senders and receivers - lock.Lock(); - { - dout(10) << "wait: stopping senders" << endl; - for (hash_map::iterator i = rank_sender.begin(); - i != rank_sender.end(); - i++) - i->second->stop(); - while (!rank_sender.empty()) { - wait_cond.Wait(lock); - reaper(); - } - - if (0) { // stop() no worky on receivers! we leak, but who cares. - dout(10) << "wait: stopping receivers" << endl; - for (set::iterator i = receivers.begin(); - i != receivers.end(); - i++) - (*i)->stop(); - while (!receivers.empty()) { - wait_cond.Wait(lock); - reaper(); - } - } - - } - lock.Unlock(); - - dout(10) << "wait: done." << endl; -} - - - -int Rank::find_ns_addr(tcpaddr_t &nsa) -{ - // file? - int fd = ::open(".ceph_ns",O_RDONLY); - if (fd > 0) { - ::read(fd, (void*)&nsa, sizeof(nsa)); - ::close(fd); - cout << "ceph ns is " << nsa << endl; - return 0; - } - - // env var? 
- char *nsaddr = getenv("CEPH_NAMESERVER");////envz_entry(*envp, e_len, "CEPH_NAMESERVER"); - if (nsaddr) { - while (nsaddr[0] != '=') nsaddr++; - nsaddr++; - - if (tcp_hostlookup(nsaddr, nsa) < 0) { - cout << "can't resolve " << nsaddr << endl; - return -1; - } - - cout << "ceph ns is " << nsa << endl; - return 0; - } - - cerr << "i can't find ceph ns addr in .ceph_ns or CEPH_NAMESERVER" << endl; - return -1; -} - - - -/********************************** - * EntityMessenger - */ - -Rank::EntityMessenger::EntityMessenger(entity_name_t myaddr) : - Messenger(myaddr), - stop(false), - dispatch_thread(this) -{ -} -Rank::EntityMessenger::~EntityMessenger() -{ -} - -void Rank::EntityMessenger::dispatch_entry() -{ - lock.Lock(); - while (!stop) { - if (!dispatch_queue.empty()) { - list ls; - ls.swap(dispatch_queue); - - lock.Unlock(); - { - // deliver - while (!ls.empty()) { - Message *m = ls.front(); - ls.pop_front(); - dout(1) //<< g_clock.now() - << "---- " - << m->get_source() << ':' << m->get_source_port() - << " to " << m->get_dest() << ':' << m->get_dest_port() - << " ---- " << m->get_type_name() - << " ---- " << m->get_source_inst() - << " ---- " << m - << endl; - dispatch(m); - } - } - lock.Lock(); - continue; - } - cond.Wait(lock); - } - lock.Unlock(); -} - -void Rank::EntityMessenger::ready() -{ - dout(10) << "ready " << get_myaddr() << endl; - - if (g_conf.ms_single_dispatch) { - rank.lock.Lock(); - if (rank.waiting_for_ready.count(get_myaddr())) { - rank.single_dispatch_queue.splice(rank.single_dispatch_queue.end(), - rank.waiting_for_ready[get_myaddr()]); - rank.waiting_for_ready.erase(get_myaddr()); - rank.single_dispatch_cond.Signal(); - } - rank.lock.Unlock(); - } else { - // start my dispatch thread - dispatch_thread.create(); - } - - // tell namer - if (get_myaddr() != MSG_ADDR_NAMER(0)) - send_message(new MGenericMessage(MSG_NS_STARTED), MSG_ADDR_NAMER(0)); -} - - -int Rank::EntityMessenger::shutdown() -{ - dout(10) << "shutdown " << get_myaddr() << 
endl; - - // deregister - rank.unregister_entity(this); - - // stop my dispatch thread - if (dispatch_thread.am_self()) { - dout(1) << "shutdown i am dispatch, setting stop flag" << endl; - stop = true; - } else { - dout(1) << "shutdown i am not dispatch, setting stop flag and joining thread." << endl; - lock.Lock(); - stop = true; - cond.Signal(); - lock.Unlock(); - dispatch_thread.join(); - } - - return 0; -} - - -void Rank::EntityMessenger::prepare_send_message(entity_name_t dest) -{ - rank.prepare_dest(dest); -} - -int Rank::EntityMessenger::send_message(Message *m, entity_name_t dest, const entity_inst_t& inst) -{ - // set envelope - m->set_source(get_myaddr(), 0); - m->set_dest(dest, 0); - - m->set_source_inst(rank.my_inst); - - dout(1) << "--> " - << m->get_source() //<< ':' << m->get_source_port() - << " to " << m->get_dest() //<< ':' << m->get_dest_port() - << " ---- " << m->get_type_name() - << " ---- " << rank.my_inst << " --> " << inst - << " ---- " << m - << endl; - - rank.submit_message(m, inst); - - return 0; -} - - -int Rank::EntityMessenger::send_message(Message *m, entity_name_t dest, int port, int fromport) -{ - // set envelope - m->set_source(get_myaddr(), fromport); - m->set_dest(dest, port); - - m->set_source_inst(rank.my_inst); - - dout(1) << "--> " - << m->get_source() //<< ':' << m->get_source_port() - << " to " << m->get_dest() //<< ':' << m->get_dest_port() - << " ---- " << m->get_type_name() - << " ---- " << rank.my_inst << " --> ?" - << " ---- " << m - << endl; - - rank.submit_message(m); - - return 0; -} - - -void Rank::EntityMessenger::mark_down(entity_name_t a, entity_inst_t& i) -{ - assert(a != get_myaddr()); - rank.mark_down(a,i); -} - -void Rank::mark_down(entity_name_t a, entity_inst_t& inst) -{ - if (my_rank == 0) return; // ugh.. 
rank0 already handles this stuff in the namer - lock.Lock(); - if (down.count(a) == 0) { - if (entity_map.count(a) && - entity_map[a] > inst) { - dout(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl; - derr(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl; - // do nothing! - } else { - down.insert(a); - - if (entity_map.count(a) == 0) { - // don't know it - dout(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl; - derr(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl; - - waiting_for_lookup.erase(a); - looking_up.erase(a); - } else { - // know it - assert(entity_map[a] <= inst); - dout(10) << "mark_down " << a << " inst " << inst << endl; - derr(10) << "mark_down " << a << " inst " << inst << endl; - - entity_map.erase(a); - - if (rank_sender.count(inst.rank)) { - rank_sender[inst.rank]->stop(); - rank_sender.erase(inst.rank); - } - } - } - } - lock.Unlock(); -} - -void Rank::EntityMessenger::mark_up(entity_name_t a, entity_inst_t& i) -{ - assert(a != get_myaddr()); - rank.mark_up(a, i); -} - -void Rank::mark_up(entity_name_t a, entity_inst_t& i) -{ - if (my_rank == 0) return; - lock.Lock(); - { - dout(10) << "mark_up " << a << " inst " << i << endl; - derr(10) << "mark_up " << a << " inst " << i << endl; - - down.erase(a); - - assert(i.rank != my_rank); // hrm? - - if (entity_map.count(a) == 0 || - entity_map[a] < i) { - entity_map[a] = i; - connect_rank(i); - } else if (entity_map[a] == i) { - dout(10) << "mark_up " << a << " inst " << i << " ... knew it" << endl; - derr(10) << "mark_up " << a << " inst " << i << " ... 
knew it" << endl; - } else { - dout(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl; - derr(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl; - } - - //if (waiting_for_lookup.count(a)) - //lookup(a); - } - lock.Unlock(); -} - diff --git a/tags/20070517_before_mds_merge/msg/NewMessenger.h b/tags/20070517_before_mds_merge/msg/NewMessenger.h deleted file mode 100644 index 0e04315a10883..0000000000000 --- a/tags/20070517_before_mds_merge/msg/NewMessenger.h +++ /dev/null @@ -1,305 +0,0 @@ -#ifndef __NEWMESSENGER_H -#define __NEWMESSENGER_H - - -#include -#include -using namespace std; -#include -#include -using namespace __gnu_cxx; - - -#include "include/types.h" - -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/Thread.h" - -#include "Messenger.h" -#include "Message.h" -#include "tcp.h" - - - - -/* Rank - per-process - */ -class Rank : public Dispatcher { - - class EntityMessenger; - class Sender; - class Receiver; - - // namer - class Namer : public Dispatcher { - public: - EntityMessenger *messenger; // namerN - - int nrank; - int nclient, nmds, nosd, nmon; - - map > waiting; - - Namer(EntityMessenger *msgr); - ~Namer(); - - void handle_connect(class MNSConnect*); - void handle_register(class MNSRegister *m); - void handle_started(Message *m); - void handle_lookup(class MNSLookup *m); - void handle_unregister(Message *m); - void handle_failure(class MNSFailure *m); - - void dispatch(Message *m); - - void manual_insert_inst(const entity_inst_t &inst); - - }; - - // incoming - class Accepter : public Thread { - public: - bool done; - - tcpaddr_t listen_addr; - int listen_sd; - - Accepter() : done(false) {} - - void *entry(); - void stop() { - done = true; - ::close(listen_sd); - join(); - } - int start(); - } accepter; - - - class Receiver : public Thread { - public: - int sd; - bool done; - - Receiver(int _sd) : sd(_sd), done(false) {} - - void *entry(); - void stop() { - done = true; - 
::close(sd); - //join(); - } - Message *read_message(); - }; - - - // outgoing - class Sender : public Thread { - public: - entity_inst_t inst; - bool done; - int sd; - - set entities; - list q; - - Mutex lock; - Cond cond; - - Sender(const entity_inst_t& i, int s=0) : inst(i), done(false), sd(s) {} - virtual ~Sender() {} - - void *entry(); - - int connect(); - void fail_and_requeue(list& ls); - void finish(); - - void stop() { - lock.Lock(); - done = true; - cond.Signal(); - lock.Unlock(); - } - - void send(Message *m) { - lock.Lock(); - q.push_back(m); - cond.Signal(); - lock.Unlock(); - } - void send(list& ls) { - lock.Lock(); - q.splice(q.end(), ls); - cond.Signal(); - lock.Unlock(); - } - - int write_message(Message *m); - }; - - - - // messenger interface - class EntityMessenger : public Messenger { - Mutex lock; - Cond cond; - list dispatch_queue; - bool stop; - - class DispatchThread : public Thread { - EntityMessenger *m; - public: - DispatchThread(EntityMessenger *_m) : m(_m) {} - void *entry() { - m->dispatch_entry(); - return 0; - } - } dispatch_thread; - void dispatch_entry(); - - public: - void queue_message(Message *m) { - lock.Lock(); - dispatch_queue.push_back(m); - cond.Signal(); - lock.Unlock(); - } - void queue_messages(list ls) { - lock.Lock(); - dispatch_queue.splice(dispatch_queue.end(), ls); - cond.Signal(); - lock.Unlock(); - } - - public: - EntityMessenger(entity_name_t myaddr); - ~EntityMessenger(); - - void ready(); - bool is_stopped() { return stop; } - - void wait() { - dispatch_thread.join(); - } - - virtual void callback_kick() {} - virtual int shutdown(); - virtual void prepare_send_message(entity_name_t dest); - virtual int send_message(Message *m, entity_name_t dest, int port=0, int fromport=0); - virtual int send_message(Message *m, entity_name_t dest, const entity_inst_t& inst); - - virtual void mark_down(entity_name_t a, entity_inst_t& i); - virtual void mark_up(entity_name_t a, entity_inst_t& i); - //virtual void 
reset(msg_addr_t a); - }; - - - class SingleDispatcher : public Thread { - Rank *rank; - public: - SingleDispatcher(Rank *r) : rank(r) {} - void *entry() { - rank->single_dispatcher_entry(); - return 0; - } - } single_dispatcher; - - Cond single_dispatch_cond; - bool single_dispatch_stop; - list single_dispatch_queue; - - map > waiting_for_ready; - - void single_dispatcher_entry(); - void _submit_single_dispatch(Message *m); - - - // Rank stuff - public: - Mutex lock; - Cond wait_cond; // for wait() - - // my rank - int my_rank; - Cond waiting_for_rank; - - // my instance - entity_inst_t my_inst; - - // lookup - hash_map entity_map; - hash_set entity_unstarted; - - map > waiting_for_lookup; - set looking_up; - - hash_set down; - - // register - map waiting_for_register_cond; - map waiting_for_register_result; - - // local - map local; - - // remote - hash_map rank_sender; - - set receivers; - - list sender_reap_queue; - list receiver_reap_queue; - - EntityMessenger *messenger; // rankN - Namer *namer; - - - void show_dir(); - - void lookup(entity_name_t addr); - - void dispatch(Message *m); - void handle_connect_ack(class MNSConnectAck *m); - void handle_register_ack(class MNSRegisterAck *m); - void handle_lookup_reply(class MNSLookupReply *m); - - Sender *connect_rank(const entity_inst_t& inst); - - void mark_down(entity_name_t addr, entity_inst_t& i); - void mark_up(entity_name_t addr, entity_inst_t& i); - - tcpaddr_t get_listen_addr() { return accepter.listen_addr; } - - void reaper(); - - -public: - Rank(int r=-1); - ~Rank(); - - int find_ns_addr(tcpaddr_t &tcpaddr); - - void set_namer(const tcpaddr_t& ns); - void start_namer(); - - int start_rank(); - void wait(); - - EntityMessenger *register_entity(entity_name_t addr); - void unregister_entity(EntityMessenger *ms); - - void submit_message(Message *m, const entity_inst_t& inst); - void prepare_dest(entity_name_t dest); - void submit_message(Message *m); - void submit_messages(list& ls); - - // create a new 
messenger - EntityMessenger *new_entity(entity_name_t addr); - -} ; - -extern Rank rank; - -#endif diff --git a/tags/20070517_before_mds_merge/msg/NewerMessenger.cc b/tags/20070517_before_mds_merge/msg/NewerMessenger.cc deleted file mode 100644 index c277eea4b409b..0000000000000 --- a/tags/20070517_before_mds_merge/msg/NewerMessenger.cc +++ /dev/null @@ -1,1791 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "NewerMessenger.h" - -#include -#include -#include -#include - -#include "config.h" - -#include "messages/MGenericMessage.h" -#include "messages/MNSConnect.h" -#include "messages/MNSConnectAck.h" -#include "messages/MNSRegister.h" -#include "messages/MNSRegisterAck.h" -#include "messages/MNSLookup.h" -#include "messages/MNSLookupReply.h" -#include "messages/MNSFailure.h" - -//#include "messages/MFailure.h" - -#include - - -#undef dout -#define dout(l) if (l<=g_conf.debug_ms) cout << g_clock.now() << " -- rank" << rank.my_rank << " " -#define derr(l) if (l<=g_conf.debug_ms) cerr << g_clock.now() << " -- rank" << rank.my_rank << " " - - - -#include "tcp.cc" - - -Rank rank; - - -/******************************************** - * Namer - */ - -Rank::Namer::Namer(EntityMessenger *msgr) : - messenger(msgr), - nrank(0), nclient(0), nmds(0), nosd(0), nmon(0) -{ - assert(rank.my_rank == 0); - nrank = g_conf.num_mon; - - // announce myself - /* - cerr << "ceph ns is " << rank.accepter.listen_addr << endl; - cout << "export CEPH_NAMESERVER=" << rank.accepter.listen_addr << endl; - int fd = ::open(".ceph_ns", O_WRONLY|O_CREAT); - ::write(fd, (void*)&rank.accepter.listen_addr, sizeof(tcpaddr_t)); - ::fchmod(fd, 0755); - 
::close(fd); - */ - - // ok - messenger->set_dispatcher(this); -} - -Rank::Namer::~Namer() -{ - //::unlink(".ceph_ns"); -} - - -void Rank::Namer::dispatch(Message *m) -{ - rank.lock.Lock(); - int type = m->get_type(); - switch (type) { - case MSG_NS_CONNECT: - handle_connect((class MNSConnect*)m); - break; - case MSG_NS_REGISTER: - handle_register((class MNSRegister*)m); - break; - case MSG_NS_STARTED: - handle_started(m); - break; - case MSG_NS_UNREGISTER: - handle_unregister(m); - break; - case MSG_NS_LOOKUP: - handle_lookup((class MNSLookup*)m); - break; - case MSG_NS_FAILURE: - handle_failure((class MNSFailure*)m); - break; - - case MSG_FAILURE_ACK: - delete m; - break; - - default: - assert(0); - } - rank.lock.Unlock(); -} - -void Rank::Namer::handle_connect(MNSConnect *m) -{ - int newrank = nrank++; - dout(2) << "namer.handle_connect from new rank" << newrank << " " << m->get_addr() << endl; - - rank.entity_map[MSG_ADDR_RANK(newrank)].addr = m->get_addr(); - rank.entity_map[MSG_ADDR_RANK(newrank)].rank = newrank; - rank.entity_unstarted.insert(MSG_ADDR_RANK(newrank)); - - messenger->send_message(new MNSConnectAck(newrank), - MSG_ADDR_RANK(newrank), rank.entity_map[MSG_ADDR_RANK(newrank)]); - delete m; -} - -void Rank::Namer::manual_insert_inst(const entity_inst_t &inst) -{ - rank.entity_map[MSG_ADDR_RANK(inst.rank)] = inst; -} - -void Rank::Namer::handle_register(MNSRegister *m) -{ - dout(10) << "namer.handle_register from rank " << m->get_rank() - << " addr " << m->get_entity() << endl; - - // pick id - entity_name_t entity = m->get_entity(); - - if (entity.is_new()) { - // make up a new address! - switch (entity.type()) { - case MSG_ADDR_MDS_BASE: - entity = MSG_ADDR_MDS(nmds++); - break; - - case MSG_ADDR_OSD_BASE: - entity = MSG_ADDR_OSD(nosd++); - break; - - case MSG_ADDR_CLIENT_BASE: - entity = MSG_ADDR_CLIENT(nclient++); - break; - - default: - assert(0); - } - } else { - // specific address! 
- } - - - // register - if (rank.entity_map.count(entity)) { - dout(1) << "namer.handle_register re-registering " << entity - << " inst " << m->get_source_inst() - << " (was " << rank.entity_map[entity] << ")" - << endl; - } else { - dout(1) << "namer.handle_register registering " << entity - << " inst " << m->get_source_inst() - << endl; - } - rank.entity_map[entity] = m->get_source_inst(); - rank.entity_unstarted.insert(entity); - - // reply w/ new id - messenger->send_message(new MNSRegisterAck(m->get_tid(), entity), - m->get_source(), rank.entity_map[entity]); - - delete m; -} - -void Rank::Namer::handle_started(Message *m) -{ - entity_name_t who = m->get_source(); - dout(10) << "namer.handle_started from entity " << who << endl; - - assert(rank.entity_unstarted.count(who)); - rank.entity_unstarted.erase(who); - - // anybody waiting? - if (waiting.count(who)) { - list ls; - ls.swap(waiting[who]); - waiting.erase(who); - - dout(10) << "doing waiters on " << who << endl; - for (list::iterator it = ls.begin(); - it != ls.end(); - it++) - dispatch(*it); - } - -} - -void Rank::Namer::handle_unregister(Message *m) -{ - entity_name_t who = m->get_source(); - dout(1) << "namer.handle_unregister entity " << who << endl; - - rank.show_dir(); - - assert(rank.entity_map.count(who)); - rank.entity_map.erase(who); - - rank.show_dir(); - - // shut myself down? kick watcher. - if (rank.entity_map.size() == 2) { - dout(10) << "namer.handle_unregister stopping namer" << endl; - rank.lock.Unlock(); - messenger->shutdown(); - delete messenger; - rank.lock.Lock(); - } - - delete m; -} - - -void Rank::Namer::handle_lookup(MNSLookup *m) -{ - // have it? 
- if (rank.entity_map.count(m->get_entity()) == 0) { - dout(10) << "namer " << m->get_source() << " lookup '" << m->get_entity() << "' -> dne" << endl; - waiting[m->get_entity()].push_back(m); - return; - } - - if (rank.entity_unstarted.count(m->get_entity())) { - dout(10) << "namer " << m->get_source() << " lookup '" << m->get_entity() << "' -> unstarted" << endl; - waiting[m->get_entity()].push_back(m); - return; - } - - // look it up! - MNSLookupReply *reply = new MNSLookupReply(m); - - reply->entity_map[m->get_entity()] = rank.entity_map[m->get_entity()]; - - dout(10) << "namer " << m->get_source() - << " lookup '" << m->get_entity() - << "' -> " << rank.entity_map[m->get_entity()] << endl; - - messenger->send_message(reply, m->get_source(), m->get_source_inst()); - delete m; -} - -void Rank::Namer::handle_failure(MNSFailure *m) -{ - dout(10) << "namer.handle_failure inst " << m->get_inst() - << endl; - - // search for entities on this instance - list rm; - for (hash_map::iterator i = rank.entity_map.begin(); - i != rank.entity_map.end(); - i++) { - if (i->second != m->get_inst()) continue; - rm.push_back(i->first); - } - for (list::iterator i = rm.begin(); - i != rm.end(); - i++) { - dout(10) << "namer.handle_failure inst " << m->get_inst() - << ", removing " << *i << endl; - - rank.entity_map.erase(*i); - rank.entity_unstarted.erase(*i); - - /* - if ((*i).is_osd()) { - // tell the monitor - messenger->send_message(new MFailure(*i, m->get_inst()), MSG_ADDR_MON(0)); - } - */ - } - - delete m; -} - - - -/******************************************** - * Accepter - */ - -int Rank::Accepter::start() -{ - // bind to a socket - dout(10) << "accepter.start binding to listen " << endl; - - /* socket creation */ - listen_sd = socket(AF_INET,SOCK_STREAM,0); - assert(listen_sd > 0); - - /* bind to port */ - memset((char*)&listen_addr, 0, sizeof(listen_addr)); - listen_addr.sin_family = AF_INET; - listen_addr.sin_addr.s_addr = htonl(INADDR_ANY); - listen_addr.sin_port = 0; 
- - int rc = bind(listen_sd, (struct sockaddr *) &listen_addr, sizeof(listen_addr)); - assert(rc >= 0); - - socklen_t llen = sizeof(listen_addr); - getsockname(listen_sd, (sockaddr*)&listen_addr, &llen); - - int myport = listen_addr.sin_port; - - // listen! - rc = ::listen(listen_sd, 1000); - assert(rc >= 0); - - //dout(10) << "accepter.start listening on " << myport << endl; - - // my address is... - char host[100]; - bzero(host, 100); - gethostname(host, 100); - //dout(10) << "accepter.start my hostname is " << host << endl; - - struct hostent *myhostname = gethostbyname( host ); - - struct sockaddr_in my_addr; - memset(&my_addr, 0, sizeof(my_addr)); - - my_addr.sin_family = myhostname->h_addrtype; - memcpy((char *) &my_addr.sin_addr.s_addr, - myhostname->h_addr_list[0], - myhostname->h_length); - my_addr.sin_port = myport; - - listen_addr = my_addr; - - dout(10) << "accepter.start listen addr is " << listen_addr << endl; - - // start thread - create(); - - return 0; -} - -void *Rank::Accepter::entry() -{ - dout(10) << "accepter starting" << endl; - - while (!done) { - // accept - struct sockaddr_in addr; - socklen_t slen = sizeof(addr); - int sd = ::accept(listen_sd, (sockaddr*)&addr, &slen); - if (sd > 0) { - dout(10) << "accepted incoming on sd " << sd << endl; - - rank.lock.Lock(); - Pipe *p = new Pipe(sd); - rank.pipes.insert(p); - rank.lock.Unlock(); - } else { - dout(10) << "no incoming connection?" << endl; - break; - } - } - - return 0; -} - - - -/************************************** - * Pipe - */ - -int Rank::Pipe::accept() -{ - // my creater gave me sd via accept() - - // announce myself. - int rc = tcp_write(sd, (char*)&rank.my_inst, sizeof(rank.my_inst)); - if (rc < 0) { - ::close(sd); - done = true; - return -1; - } - - // identify peer - rc = tcp_read(sd, (char*)&peer_inst, sizeof(peer_inst)); - if (rc < 0) { - dout(10) << "pipe(? 
" << this << ").accept couldn't read peer inst" << endl; - ::close(sd); - done = true; - return -1; - } - - // create writer thread. - writer_running = true; - writer_thread.create(); - - // register pipe. - if (peer_inst.rank >= 0) { - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_inst.rank) == 0) { - // install a pipe! - dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst << endl; - rank.rank_pipe[peer_inst.rank] = this; - } else { - // low ranks' Pipes "win" - if (peer_inst.rank < rank.my_inst.rank || - rank.my_inst.rank < 0) { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst - << ", already had pipe, but switching to this new one" << endl; - // switch to this new Pipe - rank.rank_pipe[peer_inst.rank]->close(); // close old one - rank.rank_pipe[peer_inst.rank] = this; - } else { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst - << ", already had pipe, sticking with it" << endl; - } - } - } - rank.lock.Unlock(); - } else { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is unranked " << peer_inst << endl; - } - - return 0; // success. -} - -int Rank::Pipe::connect() -{ - dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect" << endl; - - // create socket? - sd = socket(AF_INET,SOCK_STREAM,0); - assert(sd > 0); - - // bind any port - struct sockaddr_in myAddr; - myAddr.sin_family = AF_INET; - myAddr.sin_addr.s_addr = htonl(INADDR_ANY); - myAddr.sin_port = htons( 0 ); - - int rc = bind(sd, (struct sockaddr *) &myAddr, sizeof(myAddr)); - assert(rc>=0); - - // connect! - rc = ::connect(sd, (sockaddr*)&peer_inst.addr, sizeof(myAddr)); - if (rc < 0) return rc; - - // identify peer - entity_inst_t inst; - rc = tcp_read(sd, (char*)&inst, sizeof(inst)); - if (inst.rank < 0) - inst = peer_inst; // i know better than they do. 
- if (peer_inst != inst && inst.rank > 0) { - derr(0) << "pipe(" << peer_inst << ' ' << this << ").connect peer is " << inst << ", wtf" << endl; - assert(0); - return -1; - } - - // identify myself - rc = tcp_write(sd, (char*)&rank.my_inst, sizeof(rank.my_inst)); - if (rc < 0) - return -1; - - // register pipe - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_inst.rank) == 0) { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect registering pipe" << endl; - rank.rank_pipe[peer_inst.rank] = this; - } else { - // this is normal. - dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect pipe already registered." << endl; - } - } - rank.lock.Unlock(); - - // start reader - reader_running = true; - reader_thread.create(); - - return 0; -} - - -void Rank::Pipe::close() -{ - if (sent_close) { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").close already closing" << endl; - return; - } - dout(10) << "pipe(" << peer_inst << ' ' << this << ").close" << endl; - - // unreg ourselves - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_inst.rank) && - rank.rank_pipe[peer_inst.rank] == this) { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").close unregistering pipe" << endl; - rank.rank_pipe.erase(peer_inst.rank); - } - } - rank.lock.Unlock(); - - // queue close message. - dout(10) << "pipe(" << peer_inst << ' ' << this << ").close queueing MSG_CLOSE" << endl; - lock.Lock(); - q.push_back(new MGenericMessage(MSG_CLOSE)); - cond.Signal(); - sent_close = true; - lock.Unlock(); -} - - -/* read msgs from socket. - * also, server. - * - */ -void Rank::Pipe::reader() -{ - if (server) - accept(); - - // loop. 
- while (!done) { - Message *m = read_message(); - if (!m || m->get_type() == 0) { - if (m) { - delete m; - dout(10) << "pipe(" << peer_inst << ' ' << this << ").reader read MSG_CLOSE message" << endl; - } else { - derr(10) << "pipe(" << peer_inst << ' ' << this << ").reader read null message" << endl; - } - - if (!sent_close) - close(); - - done = true; - cond.Signal(); // wake up writer too. - break; - } - - dout(10) << "pipe(" << peer_inst << ' ' << this << ").reader got message for " << m->get_dest() << endl; - - EntityMessenger *entity = 0; - - rank.lock.Lock(); - { - if (rank.entity_map.count(m->get_source()) && - rank.entity_map[m->get_source()] > m->get_source_inst()) { - derr(0) << "pipe(" << peer_inst << ' ' << this << ").reader source " << m->get_source() - << " inst " << m->get_source_inst() - << " > " << rank.entity_map[m->get_source()] - << ", WATCH OUT " << *m << endl; - assert(0); - } - - if (m->get_dest().type() == MSG_ADDR_RANK_BASE) { - // ours. - rank.dispatch(m); - } else { - if (g_conf.ms_single_dispatch) { - // submit to single dispatch queue - rank._submit_single_dispatch(m); - } else { - if (rank.local.count(m->get_dest())) { - // find entity - entity = rank.local[m->get_dest()]; - } else { - derr(0) << "pipe(" << peer_inst << ' ' << this << ").reader got message " << *m << " for " << m->get_dest() << ", which isn't local" << endl; - assert(0); // FIXME do this differently - //rank.waiting_for_lookup[m->get_dest()].push_back(m); - } - } - } - } - rank.lock.Unlock(); - - if (entity) - entity->queue_message(m); // queue - } - - - // reap? - bool reap = false; - lock.Lock(); - { - reader_running = false; - if (!writer_running) reap = true; - } - lock.Unlock(); - - if (reap) { - dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader queueing for reap" << endl; - ::close(sd); - rank.lock.Lock(); - { - rank.pipe_reap_queue.push_back(this); - rank.wait_cond.Signal(); - } - rank.lock.Unlock(); - } -} - - -/* write msgs to socket. 
- * also, client. - */ -void Rank::Pipe::writer() -{ - if (!server) { - int rc = connect(); - if (rc < 0) { - derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error connecting" << endl; - done = true; - list out; - fail(out); - } - } - - // loop. - lock.Lock(); - while (!q.empty() || !done) { - - if (!q.empty()) { - dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer grabbing message(s)" << endl; - - // grab outgoing list - list out; - out.swap(q); - - // drop lock while i send these - lock.Unlock(); - - while (!out.empty()) { - Message *m = out.front(); - out.pop_front(); - - dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer sending " << *m << endl; - - // stamp. - m->set_source_inst(rank.my_inst); - - // marshall - if (m->empty_payload()) - m->encode_payload(); - - if (write_message(m) < 0) { - // failed! - derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error sending " << *m << " to " << m->get_dest() << endl; - out.push_front(m); - fail(out); - done = true; - break; - } - - // did i just send a close? - if (m->get_type() == MSG_CLOSE) - done = true; - - // clean up - delete m; - } - - lock.Lock(); - continue; - } - - // wait - dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer sleeping" << endl; - cond.Wait(lock); - } - lock.Unlock(); - - dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer finishing" << endl; - - // reap? 
- bool reap = false; - lock.Lock(); - { - writer_running = false; - if (!reader_running) reap = true; - } - lock.Unlock(); - - if (reap) { - dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer queueing for reap" << endl; - ::close(sd); - rank.lock.Lock(); - { - rank.pipe_reap_queue.push_back(this); - rank.wait_cond.Signal(); - } - rank.lock.Unlock(); - } -} - - -Message *Rank::Pipe::read_message() -{ - // envelope - //dout(10) << "receiver.read_message from sd " << sd << endl; - - msg_envelope_t env; - if (!tcp_read( sd, (char*)&env, sizeof(env) )) - return 0; - - dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader got envelope type=" << env.type - << " src " << env.source << " dst " << env.dest - << " nchunks=" << env.nchunks - << endl; - - // payload - bufferlist blist; - for (int i=0; iget_source() << endl; - - return m; -} - - - -int Rank::Pipe::write_message(Message *m) -{ - // get envelope, buffers - msg_envelope_t *env = &m->get_envelope(); - bufferlist blist; - blist.claim( m->get_payload() ); - -#ifdef TCP_KEEP_CHUNKS - env->nchunks = blist.buffers().size(); -#else - env->nchunks = 1; -#endif - - dout(20)// << g_clock.now() - << "pipe(" << peer_inst << ' ' << this << ").writer sending " << m << " " << *m - << " to " << m->get_dest() - << endl; - - // send envelope - int r = tcp_write( sd, (char*)env, sizeof(*env) ); - if (r < 0) { - derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error sending envelope for " << *m - << " to " << m->get_dest() << endl; - return -1; - } - - // payload -#ifdef TCP_KEEP_CHUNKS - // send chunk-wise - int i = 0; - for (list::const_iterator it = blist.buffers().begin(); - it != blist.buffers().end(); - it++) { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").writer tcp_sending frag " << i << " len " << (*it).length() << endl; - int size = (*it).length(); - r = tcp_write( sd, (char*)&size, sizeof(size) ); - if (r < 0) { - derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error 
sending chunk len for " << *m << " to " << m->get_dest() << endl; - return -1; - } - r = tcp_write( sd, (*it).c_str(), size ); - if (r < 0) { - derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data chunk for " << *m << " to " << m->get_dest() << endl; - return -1; - } - i++; - } -#else - // one big chunk - int size = blist.length(); - r = tcp_write( sd, (char*)&size, sizeof(size) ); - if (r < 0) { - derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data len for " << *m << " to " << m->get_dest() << endl; - return -1; - } - dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer data len is " << size << " in " << blist.buffers().size() << " buffers" << endl; - - for (list::const_iterator it = blist.buffers().begin(); - it != blist.buffers().end(); - it++) { - if ((*it).length() == 0) continue; // blank buffer. - r = tcp_write( sd, (char*)(*it).c_str(), (*it).length() ); - if (r < 0) { - derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data megachunk for " << *m << " to " << m->get_dest() << " : len " << (*it).length() << endl; - return -1; - } - } -#endif - - return 0; -} - - -void Rank::Pipe::fail(list& out) -{ - derr(10) << "pipe(" << peer_inst << ' ' << this << ").fail" << endl; - - // tell namer - if (!rank.messenger) { - derr(0) << "FATAL error: can't send failure to namer0, not connected yet" << endl; - assert(0); - } - - // FIXME: possible race before i reclaim lock here? - - // deactivate myself - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_inst.rank) && - rank.rank_pipe[peer_inst.rank] == this) - rank.rank_pipe.erase(peer_inst.rank); - } - rank.lock.Unlock(); - - // what do i do about reader()? FIXME - - // sort my messages by (source) dispatcher, dest. 
- map > > by_dis; - lock.Lock(); - { - // include out at front of queue - q.splice(q.begin(), out); - - // sort - while (!q.empty()) { - if (q.front()->get_type() == MSG_CLOSE) { - delete q.front(); - } - else if (rank.local.count(q.front()->get_source())) { - Dispatcher *dis = rank.local[q.front()->get_source()]->get_dispatcher(); - by_dis[dis][q.front()->get_dest()].push_back(q.front()); - } - else { - // oh well. sending entity musta just shut down? - assert(0); - delete q.front(); - } - q.pop_front(); - } - } - lock.Unlock(); - - // report failure(s) to dispatcher(s) - for (map > >::iterator i = by_dis.begin(); - i != by_dis.end(); - ++i) - for (map >::iterator j = i->second.begin(); - j != i->second.end(); - ++j) - for (list::iterator k = j->second.begin(); - k != j->second.end(); - ++k) { - derr(1) << "pipe(" << peer_inst << ' ' << this << ").fail on " << **k << " to " << j->first << " inst " << peer_inst << endl; - i->first->ms_handle_failure(*k, j->first, peer_inst); - } -} - - - - - - -/******************************************** - * Rank - */ - -Rank::Rank(int r) : - single_dispatcher(this), - my_rank(r), - namer(0) { -} -Rank::~Rank() -{ - //FIXME - if (namer) delete namer; -} - - -void Rank::_submit_single_dispatch(Message *m) -{ - assert(lock.is_locked()); - - if (local.count(m->get_dest()) && - local[m->get_dest()]->is_ready()) { - rank.single_dispatch_queue.push_back(m); - rank.single_dispatch_cond.Signal(); - } else { - waiting_for_ready[m->get_dest()].push_back(m); - } -} - - -void Rank::single_dispatcher_entry() -{ - lock.Lock(); - while (!single_dispatch_stop || !single_dispatch_queue.empty()) { - if (!single_dispatch_queue.empty()) { - list ls; - ls.swap(single_dispatch_queue); - - lock.Unlock(); - { - while (!ls.empty()) { - Message *m = ls.front(); - ls.pop_front(); - - dout(1) //<< g_clock.now() - << "---- " - << m->get_source()// << ':' << m->get_source_port() - << " to " << m->get_dest()// << ':' << m->get_dest_port() - << " ---- " << 
m->get_type_name() - << " ---- " << m - << endl; - - if (m->get_dest().type() == MSG_ADDR_RANK_BASE) - rank.dispatch(m); - else { - assert(local.count(m->get_dest())); - local[m->get_dest()]->dispatch(m); - } - } - } - lock.Lock(); - continue; - } - single_dispatch_cond.Wait(lock); - } - lock.Unlock(); -} - - -/* - * note: assumes lock is held - */ -void Rank::reaper() -{ - dout(10) << "reaper" << endl; - assert(lock.is_locked()); - - while (!pipe_reap_queue.empty()) { - Pipe *p = pipe_reap_queue.front(); - dout(10) << "reaper reaping pipe " << p->get_peer_inst() << endl; - pipe_reap_queue.pop_front(); - assert(pipes.count(p)); - pipes.erase(p); - p->join(); - dout(10) << "reaper reaped pipe " << p->get_peer_inst() << endl; - delete p; - } -} - - -int Rank::start_rank() -{ - dout(10) << "start_rank" << endl; - - // bind to a socket - if (accepter.start() < 0) - return -1; - - // start single thread dispatcher? - if (g_conf.ms_single_dispatch) { - single_dispatch_stop = false; - single_dispatcher.create(); - } - - lock.Lock(); - - // my_inst - my_inst.addr = accepter.listen_addr; - my_inst.rank = my_rank; - - if (my_rank < 0) { - dout(10) << "start_rank connecting to namer0" << endl; - - // connect to namer - assert(entity_map.count(MSG_ADDR_NAMER(0))); - Pipe *pipe = connect_rank(entity_map[MSG_ADDR_NAMER(0)]); - - // send - Message *m = new MNSConnect(accepter.listen_addr); - m->set_dest(MSG_ADDR_NAMER(0), 0); - pipe->send(m); - - // wait - while (my_rank < 0) - waiting_for_rank.Wait(lock); - assert(my_rank >= 0); - - dout(10) << "start_rank got rank " << my_rank << endl; - - // create rank entity - entity_map[MSG_ADDR_RANK(my_rank)] = my_inst; - local[MSG_ADDR_RANK(my_rank)] = messenger = new EntityMessenger(MSG_ADDR_RANK(my_rank)); - messenger->set_dispatcher(this); - } else { - // create my rank - entity_name_t raddr = MSG_ADDR_RANK(my_rank); - entity_map[raddr] = my_inst; - entity_unstarted.insert(raddr); - local[raddr] = messenger = new 
EntityMessenger(raddr); - messenger->set_dispatcher(this); - - dout(1) << "start_rank " << my_rank << " at " << my_inst << endl; - } - - lock.Unlock(); - return 0; -} - -void Rank::start_namer() -{ - // create namer0 - entity_name_t naddr = MSG_ADDR_NAMER(0); - entity_map[naddr] = my_inst; - local[naddr] = new EntityMessenger(naddr); - namer = new Namer(local[naddr]); - namer_inst = my_inst; -} - -void Rank::set_namer(const tcpaddr_t& ns) -{ - namer_inst.addr = entity_map[MSG_ADDR_NAMER(0)].addr = ns; - namer_inst.rank = entity_map[MSG_ADDR_NAMER(0)].rank = 0; -} - -/* connect_rank - * NOTE: assumes rank.lock held. - */ -Rank::Pipe *Rank::connect_rank(const entity_inst_t& inst) -{ - assert(rank.lock.is_locked()); - assert(inst != rank.my_inst); - - dout(10) << "connect_rank to " << inst << endl; - - // create pipe - Pipe *pipe = new Pipe(inst); - rank.rank_pipe[inst.rank] = pipe; - pipes.insert(pipe); - - return pipe; -} - - - - - -void Rank::show_dir() -{ - dout(10) << "show_dir ---" << endl; - - for (hash_map::iterator i = entity_map.begin(); - i != entity_map.end(); - i++) { - if (local.count(i->first)) { - dout(10) << "show_dir entity_map " << i->first << " -> " << i->second << " local " << endl; - } else { - dout(10) << "show_dir entity_map " << i->first << " -> " << i->second << endl; - } - } -} - - -/* lookup - * NOTE: assumes directory.lock held - */ -void Rank::lookup(entity_name_t addr) -{ - dout(10) << "lookup " << addr << endl; - assert(lock.is_locked()); - - assert(looking_up.count(addr) == 0); - looking_up.insert(addr); - - MNSLookup *r = new MNSLookup(addr); - messenger->send_message(r, MSG_ADDR_NAMER(0), namer_inst); -} - - - -/* register_entity - */ -Rank::EntityMessenger *Rank::register_entity(entity_name_t addr) -{ - dout(10) << "register_entity " << addr << endl; - lock.Lock(); - - // register with namer - static long reg_attempt = 0; - long id = ++reg_attempt; - - Message *reg = new MNSRegister(addr, my_rank, id); - 
reg->set_source(MSG_ADDR_RANK(my_rank), 0); - reg->set_source_inst(my_inst); - reg->set_dest(MSG_ADDR_DIRECTORY, 0); - - // prepare cond - Cond cond; - waiting_for_register_cond[id] = &cond; - - // send request - lock.Unlock(); - submit_message(reg); - lock.Lock(); - - // wait - while (!waiting_for_register_result.count(id)) - cond.Wait(lock); - - // grab result - addr = waiting_for_register_result[id]; - dout(10) << "register_entity got " << addr << endl; - - // clean up - waiting_for_register_cond.erase(id); - waiting_for_register_result.erase(id); - - // create messenger - EntityMessenger *msgr = new EntityMessenger(addr); - - // add to directory - entity_map[addr] = my_inst; - local[addr] = msgr; - - // was anyone waiting? - if (waiting_for_lookup.count(addr)) { - submit_messages(waiting_for_lookup[addr]); - waiting_for_lookup.erase(addr); - } - - lock.Unlock(); - return msgr; -} - -void Rank::unregister_entity(EntityMessenger *msgr) -{ - lock.Lock(); - dout(10) << "unregister_entity " << msgr->get_myaddr() << endl; - - // remove from local directory. - assert(local.count(msgr->get_myaddr())); - local.erase(msgr->get_myaddr()); - - if (my_rank > 0) { - assert(entity_map.count(msgr->get_myaddr())); - entity_map.erase(msgr->get_myaddr()); - } // else namer will do it. - - // tell namer. - if (msgr->get_myaddr() != MSG_ADDR_NAMER(0) && - msgr->get_myaddr() != MSG_ADDR_RANK(0)) - msgr->send_message(new MGenericMessage(MSG_NS_UNREGISTER), - MSG_ADDR_NAMER(0), namer_inst); - - // kick wait()? - if (local.size() <= 2) - wait_cond.Signal(); - - lock.Unlock(); -} - - -void Rank::submit_messages(list& ls) -{ - for (list::iterator i = ls.begin(); i != ls.end(); i++) - submit_message(*i); - ls.clear(); -} - - - -void Rank::submit_message(Message *m, const entity_inst_t& dest_inst) -{ - const entity_name_t dest = m->get_dest(); - - // lookup - EntityMessenger *entity = 0; - Pipe *pipe = 0; - - lock.Lock(); - { - // local? 
- if (dest_inst.rank == my_inst.rank) { - if (local.count(dest)) { - // local - dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl; - if (g_conf.ms_single_dispatch) { - _submit_single_dispatch(m); - } else { - entity = local[dest]; - } - } else { - // mid-register - dout(20) << "submit_message " << *m << " dest " << dest << " " << dest_inst << " local but mid-register, waiting." << endl; - assert(0); // hmpf - waiting_for_lookup[dest].push_back(m); - } - } - else { - // remote. - if (rank_pipe.count( dest_inst.rank )) { - //&& - //rank_pipe[dest_inst.rank]->inst == dest_inst) { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", already connected." << endl; - // connected. - pipe = rank_pipe[ dest_inst.rank ]; - } else { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", connecting." << endl; - // not connected. - pipe = connect_rank( dest_inst ); - } - } - } - lock.Unlock(); - - // do it - if (entity) { - // local! - dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl; - entity->queue_message(m); - } - else if (pipe) { - // remote! - dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl; - pipe->send(m); - } -} - - -void Rank::submit_message(Message *m) -{ - const entity_name_t dest = m->get_dest(); - - // lookup - EntityMessenger *entity = 0; - Pipe *pipe = 0; - - lock.Lock(); - { - if (local.count(dest)) { - dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl; - - // local - if (g_conf.ms_single_dispatch) { - _submit_single_dispatch(m); - } else { - entity = local[dest]; - } - } else if (entity_map.count( dest )) { - // remote, known rank addr. - entity_inst_t inst = entity_map[dest]; - - if (inst == my_inst) { - dout(20) << "submit_message " << *m << " dest " << dest << " local but mid-register, waiting." 
<< endl; - waiting_for_lookup[dest].push_back(m); - } - else if (rank_pipe.count( inst.rank ) && - rank_pipe[inst.rank]->get_peer_inst() == inst) { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connected." << endl; - // connected. - pipe = rank_pipe[ inst.rank ]; - } else { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connecting." << endl; - // not connected. - pipe = connect_rank( inst ); - } - } else { - // unknown dest rank or rank addr. - if (looking_up.count(dest) == 0) { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, looking up" << endl; - lookup(dest); - } else { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, already looking up" << endl; - } - waiting_for_lookup[dest].push_back(m); - } - } - lock.Unlock(); - - // do it - if (entity) { - // local! - dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl; - entity->queue_message(m); - } - else if (pipe) { - // remote! - dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl; - pipe->send(m); - } -} - - - - -void Rank::dispatch(Message *m) -{ - lock.Lock(); - - dout(10) << "dispatching " << *m << endl; - - switch (m->get_type()) { - case MSG_NS_CONNECTACK: - handle_connect_ack((MNSConnectAck*)m); - break; - - case MSG_NS_REGISTERACK: - handle_register_ack((MNSRegisterAck*)m); - break; - - case MSG_NS_LOOKUPREPLY: - handle_lookup_reply((MNSLookupReply*)m); - break; - - default: - assert(0); - } - - lock.Unlock(); -} - -void Rank::handle_connect_ack(MNSConnectAck *m) -{ - dout(10) << "handle_connect_ack, my rank is " << m->get_rank() << endl; - my_rank = m->get_rank(); - - my_inst.addr = accepter.listen_addr; - my_inst.rank = my_rank; - - waiting_for_rank.SignalAll(); - delete m; - - // logger! 
- /*dout(10) << "logger" << endl; - char names[100]; - sprintf(names, "rank%d", my_rank); - string name = names; - - if (g_conf.tcp_log) { - logger = new Logger(name, (LogType*)&rank_logtype); - rank_logtype.add_set("num"); - rank_logtype.add_inc("in"); - rank_logtype.add_inc("inb"); - rank_logtype.add_inc("dis"); - rank_logtype.add_set("inq"); - rank_logtype.add_set("inqb"); - rank_logtype.add_set("outq"); - rank_logtype.add_set("outqb"); - } - */ -} - - -void Rank::handle_register_ack(MNSRegisterAck *m) -{ - dout(10) << "handle_register_ack " << m->get_entity() << endl; - - long tid = m->get_tid(); - waiting_for_register_result[tid] = m->get_entity(); - waiting_for_register_cond[tid]->Signal(); - delete m; -} - -void Rank::handle_lookup_reply(MNSLookupReply *m) -{ - list waiting; - dout(10) << "got lookup reply" << endl; - - for (map::iterator it = m->entity_map.begin(); - it != m->entity_map.end(); - it++) { - dout(10) << "lookup got " << it->first << " at " << it->second << endl; - entity_name_t addr = it->first; - entity_inst_t inst = it->second; - - if (entity_map.count(addr) && - entity_map[addr] > inst) { - dout(10) << "ignoring lookup results for " << addr << ", " \ - << entity_map[addr] << " > " << inst << endl; - continue; - } - - // update map. - entity_map[addr] = inst; - - if (inst.rank == my_rank) { - // local - dout(10) << "delivering lookup results locally" << endl; - if (local.count(addr)) { - if (g_conf.ms_single_dispatch) { - single_dispatch_queue.splice(single_dispatch_queue.end(), - waiting_for_lookup[addr]); - } else { - local[addr]->queue_messages(waiting_for_lookup[addr]); - } - waiting_for_lookup.erase(addr); - } else - lookup(addr); // try again! - - } else { - // remote - if (rank_pipe.count(inst.rank) == 0) - connect_rank(inst); - else if (rank_pipe[inst.rank]->get_peer_inst() != inst) { - dout(0) << "lookup got rank addr change, WATCH OUT" << endl; - // FIXME BUG possible message loss weirdness? 
- rank_pipe[inst.rank]->close(); - rank_pipe.erase(inst.rank); - connect_rank(inst); - } - - // take waiters - Pipe *pipe = rank_pipe[inst.rank]; - assert(pipe); - - if (waiting_for_lookup.count(addr)) { - pipe->send(waiting_for_lookup[addr]); - waiting_for_lookup.erase(addr); - } - } - } - - delete m; -} - - -void Rank::wait() -{ - lock.Lock(); - while (1) { - // reap dead pipes - reaper(); - - if (local.size() == 0) { - dout(10) << "wait: everything stopped" << endl; - break; // everything stopped. - } - - if (local.size() == 1 && - !messenger->is_stopped()) { - dout(10) << "wait: stopping rank" << endl; - lock.Unlock(); - messenger->shutdown(); - delete messenger; - lock.Lock(); - continue; - } - - wait_cond.Wait(lock); - } - lock.Unlock(); - - // done! clean up. - - // stop dispatch thread - if (g_conf.ms_single_dispatch) { - dout(10) << "wait: stopping dispatch thread" << endl; - lock.Lock(); - single_dispatch_stop = true; - single_dispatch_cond.Signal(); - lock.Unlock(); - single_dispatcher.join(); - } - - // reap pipes - lock.Lock(); - { - dout(10) << "wait: closing pipes" << endl; - list toclose; - for (hash_map::iterator i = rank_pipe.begin(); - i != rank_pipe.end(); - i++) - toclose.push_back(i->second); - for (list::iterator i = toclose.begin(); - i != toclose.end(); - i++) - (*i)->close(); - - dout(10) << "wait: waiting for pipes " << pipes << " to close" << endl; - while (!pipes.empty()) { - wait_cond.Wait(lock); - reaper(); - } - } - lock.Unlock(); - - dout(10) << "wait: done." << endl; -} - - - -int Rank::find_ns_addr(tcpaddr_t &nsa) -{ - // file? - int fd = ::open(".ceph_ns",O_RDONLY); - if (fd > 0) { - ::read(fd, (void*)&nsa, sizeof(nsa)); - ::close(fd); - cout << "ceph ns is " << nsa << endl; - return 0; - } - - // env var? 
- char *nsaddr = getenv("CEPH_NAMESERVER");////envz_entry(*envp, e_len, "CEPH_NAMESERVER"); - if (nsaddr) { - while (nsaddr[0] != '=') nsaddr++; - nsaddr++; - - if (tcp_hostlookup(nsaddr, nsa) < 0) { - cout << "can't resolve " << nsaddr << endl; - return -1; - } - - cout << "ceph ns is " << nsa << endl; - return 0; - } - - cerr << "i can't find ceph ns addr in .ceph_ns or CEPH_NAMESERVER" << endl; - return -1; -} - - - -/********************************** - * EntityMessenger - */ - -Rank::EntityMessenger::EntityMessenger(entity_name_t myaddr) : - Messenger(myaddr), - stop(false), - dispatch_thread(this) -{ -} -Rank::EntityMessenger::~EntityMessenger() -{ -} - -void Rank::EntityMessenger::dispatch_entry() -{ - lock.Lock(); - while (!stop) { - if (!dispatch_queue.empty()) { - list ls; - ls.swap(dispatch_queue); - - lock.Unlock(); - { - // deliver - while (!ls.empty()) { - Message *m = ls.front(); - ls.pop_front(); - dout(1) //<< g_clock.now() - << "---- " - << m->get_source()// << ':' << m->get_source_port() - << " to " << m->get_dest()// << ':' << m->get_dest_port() - << " ---- " << m->get_type_name() - << " ---- " << m->get_source_inst() - << " ---- " << m - << endl; - dispatch(m); - } - } - lock.Lock(); - continue; - } - cond.Wait(lock); - } - lock.Unlock(); -} - -void Rank::EntityMessenger::ready() -{ - dout(10) << "ready " << get_myaddr() << endl; - - if (g_conf.ms_single_dispatch) { - rank.lock.Lock(); - if (rank.waiting_for_ready.count(get_myaddr())) { - rank.single_dispatch_queue.splice(rank.single_dispatch_queue.end(), - rank.waiting_for_ready[get_myaddr()]); - rank.waiting_for_ready.erase(get_myaddr()); - rank.single_dispatch_cond.Signal(); - } - rank.lock.Unlock(); - } else { - // start my dispatch thread - dispatch_thread.create(); - } - - // tell namer - if (get_myaddr() != MSG_ADDR_NAMER(0) && - get_myaddr() != MSG_ADDR_RANK(0)) - send_message(new MGenericMessage(MSG_NS_STARTED), MSG_ADDR_NAMER(0), rank.namer_inst); -} - - -int 
Rank::EntityMessenger::shutdown() -{ - dout(10) << "shutdown " << get_myaddr() << endl; - - // deregister - rank.unregister_entity(this); - - // stop my dispatch thread - if (dispatch_thread.am_self()) { - dout(1) << "shutdown i am dispatch, setting stop flag" << endl; - stop = true; - } else { - dout(1) << "shutdown i am not dispatch, setting stop flag and joining thread." << endl; - lock.Lock(); - stop = true; - cond.Signal(); - lock.Unlock(); - dispatch_thread.join(); - } - - return 0; -} - - -void Rank::EntityMessenger::prepare_dest(const entity_inst_t& inst) -{ - rank.lock.Lock(); - { - if (rank.rank_pipe.count(inst.rank) == 0) - rank.connect_rank(inst); - } - rank.lock.Unlock(); -} - -int Rank::EntityMessenger::send_message(Message *m, entity_name_t dest, const entity_inst_t& inst, - int port, int fromport) -{ - // set envelope - m->set_source(get_myaddr(), fromport); - m->set_dest(dest, port); - - m->set_source_inst(rank.my_inst); - - dout(1) << "--> " - << m->get_source() //<< ':' << m->get_source_port() - << " to " << m->get_dest() //<< ':' << m->get_dest_port() - << " ---- " << m->get_type_name() - << " ---- " << rank.my_inst << " --> " << inst - << " ---- " << m - << endl; - - rank.submit_message(m, inst); - - return 0; -} - - -int Rank::EntityMessenger::send_message(Message *m, entity_name_t dest, int port, int fromport) -{ - // set envelope - m->set_source(get_myaddr(), fromport); - m->set_dest(dest, port); - - m->set_source_inst(rank.my_inst); - - dout(1) << "--> " - << m->get_source() //<< ':' << m->get_source_port() - << " to " << m->get_dest() //<< ':' << m->get_dest_port() - << " ---- " << m->get_type_name() - << " ---- " << rank.my_inst << " --> ? 
(DEPRECATED)" - << " ---- " << m - << endl; - - rank.submit_message(m); - - return 0; -} - - -void Rank::EntityMessenger::mark_down(entity_name_t a, entity_inst_t& i) -{ - assert(a != get_myaddr()); - rank.mark_down(a,i); -} - -void Rank::mark_down(entity_name_t a, entity_inst_t& inst) -{ - //if (my_rank == 0) return; // ugh.. rank0 already handles this stuff in the namer - lock.Lock(); - if (entity_map.count(a) && - entity_map[a] > inst) { - dout(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl; - derr(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl; - // do nothing! - } else { - if (entity_map.count(a) == 0) { - // don't know it - dout(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl; - derr(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl; - - waiting_for_lookup.erase(a); - looking_up.erase(a); - } else { - // know it - assert(entity_map[a] <= inst); - dout(10) << "mark_down " << a << " inst " << inst << endl; - derr(10) << "mark_down " << a << " inst " << inst << endl; - - entity_map.erase(a); - - if (rank_pipe.count(inst.rank)) { - rank_pipe[inst.rank]->close(); - rank_pipe.erase(inst.rank); - } - - // kill rank# too? only if i'm the namer. - if (my_rank == 0) { - entity_map.erase(MSG_ADDR_RANK(inst.rank)); - } - } - } - lock.Unlock(); -} - -void Rank::EntityMessenger::mark_up(entity_name_t a, entity_inst_t& i) -{ - assert(a != get_myaddr()); - rank.mark_up(a, i); -} - -void Rank::mark_up(entity_name_t a, entity_inst_t& i) -{ - if (my_rank == 0) return; - lock.Lock(); - { - dout(10) << "mark_up " << a << " inst " << i << endl; - derr(10) << "mark_up " << a << " inst " << i << endl; - - assert(i.rank != my_rank); // hrm? - - if (entity_map.count(a) == 0 || - entity_map[a] < i) { - entity_map[a] = i; - connect_rank(i); - } else if (entity_map[a] == i) { - dout(10) << "mark_up " << a << " inst " << i << " ... 
knew it" << endl; - derr(10) << "mark_up " << a << " inst " << i << " ... knew it" << endl; - } else { - dout(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl; - derr(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl; - } - - //if (waiting_for_lookup.count(a)) - //lookup(a); - } - lock.Unlock(); -} - diff --git a/tags/20070517_before_mds_merge/msg/NewerMessenger.h b/tags/20070517_before_mds_merge/msg/NewerMessenger.h deleted file mode 100644 index 29b885745df48..0000000000000 --- a/tags/20070517_before_mds_merge/msg/NewerMessenger.h +++ /dev/null @@ -1,343 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __NEWMESSENGER_H -#define __NEWMESSENGER_H - - -#include -#include -using namespace std; -#include -#include -using namespace __gnu_cxx; - - -#include "include/types.h" - -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/Thread.h" - -#include "Messenger.h" -#include "Message.h" -#include "tcp.h" - - - - -/* Rank - per-process - */ -class Rank : public Dispatcher { - - class EntityMessenger; - class Pipe; - - // namer - class Namer : public Dispatcher { - public: - EntityMessenger *messenger; // namerN - - int nrank; - int nclient, nmds, nosd, nmon; - - map > waiting; - - Namer(EntityMessenger *msgr); - ~Namer(); - - void handle_connect(class MNSConnect*); - void handle_register(class MNSRegister *m); - void handle_started(Message *m); - void handle_lookup(class MNSLookup *m); - void handle_unregister(Message *m); - void handle_failure(class MNSFailure *m); - - void dispatch(Message *m); - - void manual_insert_inst(const entity_inst_t &inst); - - }; - - // incoming - class Accepter : public Thread { - public: - bool done; - - tcpaddr_t listen_addr; - int listen_sd; - - Accepter() : done(false) {} - - void *entry(); - void stop() { - done = true; - ::close(listen_sd); - join(); - } - int start(); - } accepter; - - - - class Pipe { - protected: - int sd; - bool done; - entity_inst_t peer_inst; - bool server; - bool sent_close; - - bool reader_running; - bool writer_running; - - list q; - Mutex lock; - Cond cond; - - int accept(); // server handshake - int connect(); // client handshake - void reader(); - void writer(); - - Message *read_message(); - int write_message(Message *m); - void fail(list& ls); - - // threads - class Reader : public Thread { - Pipe *pipe; - public: - Reader(Pipe *p) : pipe(p) {} - void *entry() { pipe->reader(); return 0; } - } reader_thread; - friend class Reader; - - class Writer : public Thread { - Pipe *pipe; - public: - Writer(Pipe *p) : pipe(p) {} - void *entry() { pipe->writer(); return 0; } - } 
writer_thread; - friend class Writer; - - public: - Pipe(int s) : sd(s), - done(false), server(true), - sent_close(false), - reader_running(false), writer_running(false), - reader_thread(this), writer_thread(this) { - // server - reader_running = true; - reader_thread.create(); - } - Pipe(const entity_inst_t &pi) : sd(0), - done(false), peer_inst(pi), server(false), - sent_close(false), - reader_running(false), writer_running(false), - reader_thread(this), writer_thread(this) { - // client - writer_running = true; - writer_thread.create(); - } - - // public constructors - static const Pipe& Server(int s); - static const Pipe& Client(const entity_inst_t& pi); - - entity_inst_t& get_peer_inst() { return peer_inst; } - - void close(); - void join() { - writer_thread.join(); - reader_thread.join(); - } - - void send(Message *m) { - lock.Lock(); - q.push_back(m); - cond.Signal(); - lock.Unlock(); - } - void send(list& ls) { - lock.Lock(); - q.splice(q.end(), ls); - cond.Signal(); - lock.Unlock(); - } - }; - - - - // messenger interface - class EntityMessenger : public Messenger { - Mutex lock; - Cond cond; - list dispatch_queue; - bool stop; - - class DispatchThread : public Thread { - EntityMessenger *m; - public: - DispatchThread(EntityMessenger *_m) : m(_m) {} - void *entry() { - m->dispatch_entry(); - return 0; - } - } dispatch_thread; - void dispatch_entry(); - - public: - void queue_message(Message *m) { - lock.Lock(); - dispatch_queue.push_back(m); - cond.Signal(); - lock.Unlock(); - } - void queue_messages(list ls) { - lock.Lock(); - dispatch_queue.splice(dispatch_queue.end(), ls); - cond.Signal(); - lock.Unlock(); - } - - public: - EntityMessenger(entity_name_t myaddr); - ~EntityMessenger(); - - void ready(); - bool is_stopped() { return stop; } - - void wait() { - dispatch_thread.join(); - } - - virtual void callback_kick() {} - virtual int shutdown(); - virtual void prepare_dest(const entity_inst_t& inst); - virtual int send_message(Message *m, entity_name_t 
dest, int port=0, int fromport=0); - virtual int send_message(Message *m, entity_name_t dest, const entity_inst_t& inst, - int port=0, int fromport=0); - - virtual void mark_down(entity_name_t a, entity_inst_t& i); - virtual void mark_up(entity_name_t a, entity_inst_t& i); - //virtual void reset(msg_addr_t a); - }; - - - class SingleDispatcher : public Thread { - Rank *rank; - public: - SingleDispatcher(Rank *r) : rank(r) {} - void *entry() { - rank->single_dispatcher_entry(); - return 0; - } - } single_dispatcher; - - Cond single_dispatch_cond; - bool single_dispatch_stop; - list single_dispatch_queue; - - map > waiting_for_ready; - - void single_dispatcher_entry(); - void _submit_single_dispatch(Message *m); - - - // Rank stuff - public: - Mutex lock; - Cond wait_cond; // for wait() - - // my rank - int my_rank; - Cond waiting_for_rank; - - // my instance - entity_inst_t my_inst; - - // lookup - hash_map entity_map; - hash_set entity_unstarted; - - map > waiting_for_lookup; - set looking_up; - - // register - map waiting_for_register_cond; - map waiting_for_register_result; - - // local - map local; - - // remote - hash_map rank_pipe; - - set pipes; - list pipe_reap_queue; - - EntityMessenger *messenger; // rankN - Namer *namer; - - entity_inst_t namer_inst; - - void show_dir(); - - void lookup(entity_name_t addr); - - void dispatch(Message *m); - void handle_connect_ack(class MNSConnectAck *m); - void handle_register_ack(class MNSRegisterAck *m); - void handle_lookup_reply(class MNSLookupReply *m); - - Pipe *connect_rank(const entity_inst_t& inst); - - void mark_down(entity_name_t addr, entity_inst_t& i); - void mark_up(entity_name_t addr, entity_inst_t& i); - - tcpaddr_t get_listen_addr() { return accepter.listen_addr; } - - void reaper(); - - -public: - Rank(int r=-1); - ~Rank(); - - int find_ns_addr(tcpaddr_t &tcpaddr); - - void set_namer(const tcpaddr_t& ns); - void start_namer(); - - int start_rank(); - void wait(); - - EntityMessenger 
*register_entity(entity_name_t addr); - void unregister_entity(EntityMessenger *ms); - - void submit_message(Message *m, const entity_inst_t& inst); - void prepare_dest(const entity_inst_t& inst); - void submit_message(Message *m); - void submit_messages(list& ls); - - // create a new messenger - EntityMessenger *new_entity(entity_name_t addr); - -} ; - - - -extern Rank rank; - -#endif diff --git a/tags/20070517_before_mds_merge/msg/RWLock.h b/tags/20070517_before_mds_merge/msg/RWLock.h deleted file mode 100644 index 83b84c6faf370..0000000000000 --- a/tags/20070517_before_mds_merge/msg/RWLock.h +++ /dev/null @@ -1,49 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef _RWLock_Posix_ -#define _RWLock_Posix_ - -#include - -class RWLock -{ - mutable pthread_rwlock_t L; - - public: - - RWLock() { - pthread_rwlock_init(&L, NULL); - } - - virtual ~RWLock() { - pthread_rwlock_unlock(&L); - pthread_rwlock_destroy(&L); - } - - void unlock() { - pthread_rwlock_unlock(&L); - } - void get_read() { - pthread_rwlock_rdlock(&L); - } - void put_read() { unlock(); } - void get_write() { - pthread_rwlock_wrlock(&L); - } - void put_write() { unlock(); } -}; - -#endif // !_Mutex_Posix_ diff --git a/tags/20070517_before_mds_merge/msg/SerialMessenger.h b/tags/20070517_before_mds_merge/msg/SerialMessenger.h deleted file mode 100644 index 1c5c9e9c3961a..0000000000000 --- a/tags/20070517_before_mds_merge/msg/SerialMessenger.h +++ /dev/null @@ -1,28 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __SERIAL_MESSENGER_H -#define __SERIAL_MESSENGER_H - -#include "Dispatcher.h" -#include "Message.h" - -class SerialMessenger : public Dispatcher { - public: - virtual void dispatch(Message *m) = 0; // i receive my messages here - virtual void send(Message *m, entity_name_t dest, int port=0, int fromport=0) = 0; // doesn't block - virtual Message *sendrecv(Message *m, entity_name_t dest, int port=0, int fromport=0) = 0; // blocks for matching reply -}; - -#endif diff --git a/tags/20070517_before_mds_merge/msg/SimpleMessenger.cc b/tags/20070517_before_mds_merge/msg/SimpleMessenger.cc deleted file mode 100644 index de90acaafd6ac..0000000000000 --- a/tags/20070517_before_mds_merge/msg/SimpleMessenger.cc +++ /dev/null @@ -1,1197 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "SimpleMessenger.h" - -#include -#include -#include -#include -#include - -#include "config.h" - -#include "messages/MGenericMessage.h" - -//#include "messages/MFailure.h" - -#include - - -#undef dout -#define dout(l) if (l<=g_conf.debug_ms) cout << g_clock.now() << " -- " << rank.my_addr << " " -#define derr(l) if (l<=g_conf.debug_ms) cerr << g_clock.now() << " -- " << rank.my_addr << " " - - - -#include "tcp.cc" - - -Rank rank; - - - -/******************************************** - * Accepter - */ - -void simplemessenger_sigint(int r) -{ - rank.sigint(); -} - -void Rank::sigint() -{ - lock.Lock(); - derr(0) << "got control-c, exiting" << endl; - ::close(accepter.listen_sd); - _exit(-1); - lock.Unlock(); -} - - - - -int Rank::Accepter::start() -{ - // bind to a socket - dout(10) << "accepter.start binding to listen " << endl; - - // use whatever user specified.. - g_my_addr.make_addr(rank.listen_addr); - - /* socket creation */ - listen_sd = socket(AF_INET,SOCK_STREAM,0); - assert(listen_sd > 0); - - /* bind to port */ - int rc = bind(listen_sd, (struct sockaddr *) &rank.listen_addr, sizeof(rank.listen_addr)); - if (rc < 0) - derr(0) << "accepter.start unable to bind to " << rank.listen_addr << endl; - assert(rc >= 0); - - // what port did we get? - socklen_t llen = sizeof(rank.listen_addr); - getsockname(listen_sd, (sockaddr*)&rank.listen_addr, &llen); - - dout(10) << "accepter.start bound to " << rank.listen_addr << endl; - - // listen! - rc = ::listen(listen_sd, 1000); - assert(rc >= 0); - - // my address is... HELP HELP HELP! - char host[100]; - bzero(host, 100); - gethostname(host, 100); - //dout(10) << "accepter.start my hostname is " << host << endl; - - struct hostent *myhostname = gethostbyname( host ); - - // figure out my_addr - if (g_my_addr.port > 0) { - // user specified it, easy peasy. - rank.my_addr = g_my_addr; - } else { - // look up my hostname. blech! this sucks. 
- rank.listen_addr.sin_family = myhostname->h_addrtype; - memcpy((char *) &rank.listen_addr.sin_addr.s_addr, - myhostname->h_addr_list[0], - myhostname->h_length); - - // set up my_addr with a nonce - rank.my_addr.set_addr(rank.listen_addr); - rank.my_addr.nonce = getpid(); // FIXME: pid might not be best choice here. - } - - dout(10) << "accepter.start my addr is " << rank.my_addr << endl; - - // set up signal handler - signal(SIGINT, simplemessenger_sigint); - - // start thread - create(); - - return 0; -} - -void *Rank::Accepter::entry() -{ - dout(10) << "accepter starting" << endl; - - while (!done) { - // accept - struct sockaddr_in addr; - socklen_t slen = sizeof(addr); - int sd = ::accept(listen_sd, (sockaddr*)&addr, &slen); - if (sd > 0) { - dout(10) << "accepted incoming on sd " << sd << endl; - - rank.lock.Lock(); - if (!rank.local.empty()) { - Pipe *p = new Pipe(sd); - rank.pipes.insert(p); - } - rank.lock.Unlock(); - } else { - dout(10) << "no incoming connection?" << endl; - break; - } - } - - return 0; -} - - - -/************************************** - * Pipe - */ - -int Rank::Pipe::accept() -{ - // my creater gave me sd via accept() - - // announce myself. - int rc = tcp_write(sd, (char*)&rank.my_addr, sizeof(rank.my_addr)); - if (rc < 0) { - ::close(sd); - done = true; - return -1; - } - - // identify peer - rc = tcp_read(sd, (char*)&peer_addr, sizeof(peer_addr)); - if (rc < 0) { - dout(10) << "pipe(? " << this << ").accept couldn't read peer inst" << endl; - ::close(sd); - done = true; - return -1; - } - - // create writer thread. - writer_running = true; - writer_thread.create(); - - // register pipe. - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_addr) == 0) { - // install a pipe! 
- dout(10) << "pipe(" << peer_addr << ' ' << this << ").accept peer is " << peer_addr << endl; - rank.rank_pipe[peer_addr] = this; - } else { - // low ranks' Pipes "win" - if (peer_addr < rank.my_addr) { - dout(10) << "pipe(" << peer_addr << ' ' << this << ").accept peer is " << peer_addr - << ", already had pipe, but switching to this new one" << endl; - // switch to this new Pipe - rank.rank_pipe[peer_addr]->close(); // close old one - rank.rank_pipe[peer_addr] = this; - } else { - dout(10) << "pipe(" << peer_addr << ' ' << this << ").accept peer is " << peer_addr - << ", already had pipe, sticking with it" << endl; - } - } - } - rank.lock.Unlock(); - - return 0; // success. -} - -int Rank::Pipe::connect() -{ - dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect" << endl; - - // create socket? - sd = socket(AF_INET,SOCK_STREAM,0); - assert(sd > 0); - - // bind any port - struct sockaddr_in myAddr; - myAddr.sin_family = AF_INET; - myAddr.sin_addr.s_addr = htonl(INADDR_ANY); - myAddr.sin_port = htons( 0 ); - - int rc = bind(sd, (struct sockaddr *) &myAddr, sizeof(myAddr)); - assert(rc>=0); - - // connect! 
- tcpaddr_t tcpaddr; - peer_addr.make_addr(tcpaddr); - rc = ::connect(sd, (sockaddr*)&tcpaddr, sizeof(myAddr)); - if (rc < 0) { - dout(10) << "connect error " << peer_addr - << ", " << errno << ": " << strerror(errno) << endl; - return rc; - } - - // identify peer - entity_addr_t paddr; - rc = tcp_read(sd, (char*)&paddr, sizeof(paddr)); - if (!rc) { // bool - dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect couldn't read peer addr" << endl; - return -1; - } - if (peer_addr != paddr) { - derr(0) << "pipe(" << peer_addr << ' ' << this << ").connect peer is " << paddr << ", wtf" << endl; - assert(0); - return -1; - } - - // identify myself - rc = tcp_write(sd, (char*)&rank.my_addr, sizeof(rank.my_addr)); - if (rc < 0) - return -1; - - // register pipe - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_addr) == 0) { - dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect registering pipe" << endl; - rank.rank_pipe[peer_addr] = this; - } else { - // this is normal. - dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect pipe already registered." << endl; - } - } - rank.lock.Unlock(); - - // start reader - reader_running = true; - reader_thread.create(); - - return 0; -} - - -void Rank::Pipe::close() -{ - dout(10) << "pipe(" << peer_addr << ' ' << this << ").close" << endl; - - // unreg ourselves - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_addr) && - rank.rank_pipe[peer_addr] == this) { - dout(10) << "pipe(" << peer_addr << ' ' << this - << ").close unregistering pipe" << endl; - rank.rank_pipe.erase(peer_addr); - } - } - rank.lock.Unlock(); - - // queue close message? 
- if (!need_to_send_close) { - dout(10) << "pipe(" << peer_addr << ' ' << this - << ").close already closing/closed" << endl; - return; - } - - if (!writer_running) { - dout(10) << "pipe(" << peer_addr << ' ' << this - << ").close not queueing MSG_CLOSE, no writer running" << endl; - } else { - dout(10) << "pipe(" << peer_addr << ' ' << this - << ").close queueing MSG_CLOSE" << endl; - lock.Lock(); - q.push_back(new MGenericMessage(MSG_CLOSE)); - cond.Signal(); - need_to_send_close = false; - lock.Unlock(); - } -} - - -/* read msgs from socket. - * also, server. - * - */ -void Rank::Pipe::reader() -{ - if (server) - accept(); - - // loop. - while (!done) { - Message *m = read_message(); - if (!m || m->get_type() == 0) { - if (m) { - delete m; - dout(10) << "pipe(" << peer_addr << ' ' << this << ").reader read MSG_CLOSE message" << endl; - need_to_send_close = false; - } else { - derr(10) << "pipe(" << peer_addr << ' ' << this << ").reader read null message" << endl; - } - - close(); - - done = true; - cond.Signal(); // wake up writer too. - break; - } - - dout(10) << "pipe(" << peer_addr << ' ' << this << ").reader got message for " << m->get_dest() << endl; - - EntityMessenger *entity = 0; - - rank.lock.Lock(); - { - if (g_conf.ms_single_dispatch) { - // submit to single dispatch queue - rank._submit_single_dispatch(m); - } else { - if (rank.local.count(m->get_dest())) { - // find entity - entity = rank.local[m->get_dest()]; - } else { - entity = rank.find_unnamed(m->get_dest()); - if (!entity) { - if (rank.stopped.count(m->get_dest())) { - // ignore it - } else { - derr(0) << "pipe(" << peer_addr << ' ' << this << ").reader got message " << *m << " for " << m->get_dest() << ", which isn't local" << endl; - assert(0); // FIXME do this differently - } - } - } - } - } - rank.lock.Unlock(); - - if (entity) - entity->queue_message(m); // queue - } - - - // reap? 
- bool reap = false; - lock.Lock(); - { - reader_running = false; - if (!writer_running) reap = true; - } - lock.Unlock(); - - if (reap) { - dout(20) << "pipe(" << peer_addr << ' ' << this << ").reader queueing for reap" << endl; - ::close(sd); - rank.lock.Lock(); - { - rank.pipe_reap_queue.push_back(this); - rank.wait_cond.Signal(); - } - rank.lock.Unlock(); - } -} - - -/* write msgs to socket. - * also, client. - */ -void Rank::Pipe::writer() -{ - if (!server) { - int rc = connect(); - if (rc < 0) { - derr(1) << "pipe(" << peer_addr << ' ' << this << ").writer error connecting, " - << errno << ": " << strerror(errno) - << endl; - done = true; - list out; - fail(out); - } - } - - // loop. - lock.Lock(); - while (!q.empty() || !done) { - - if (!q.empty()) { - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer grabbing message(s)" << endl; - - // grab outgoing list - list out; - out.swap(q); - - // drop lock while i send these - lock.Unlock(); - - while (!out.empty()) { - Message *m = out.front(); - out.pop_front(); - - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer sending " << *m << endl; - - // stamp. - m->set_source_addr(rank.my_addr); - - // marshall - if (m->empty_payload()) - m->encode_payload(); - - if (write_message(m) < 0) { - // failed! - derr(1) << "pipe(" << peer_addr << ' ' << this << ").writer error sending " << *m << " to " << m->get_dest() - << ", " << errno << ": " << strerror(errno) - << endl; - out.push_front(m); - fail(out); - done = true; - break; - } - - // did i just send a close? - if (m->get_type() == MSG_CLOSE) - done = true; - - // clean up - delete m; - } - - lock.Lock(); - continue; - } - - // wait - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer sleeping" << endl; - cond.Wait(lock); - } - lock.Unlock(); - - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer finishing" << endl; - - // reap? 
- bool reap = false; - lock.Lock(); - { - writer_running = false; - if (!reader_running) reap = true; - } - lock.Unlock(); - - if (reap) { - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer queueing for reap" << endl; - ::close(sd); - rank.lock.Lock(); - { - rank.pipe_reap_queue.push_back(this); - rank.wait_cond.Signal(); - } - rank.lock.Unlock(); - } -} - - -Message *Rank::Pipe::read_message() -{ - // envelope - //dout(10) << "receiver.read_message from sd " << sd << endl; - - msg_envelope_t env; - if (!tcp_read( sd, (char*)&env, sizeof(env) )) { - need_to_send_close = false; - return 0; - } - - dout(20) << "pipe(" << peer_addr << ' ' << this << ").reader got envelope type=" << env.type - << " src " << env.src << " dst " << env.dst - << " nchunks=" << env.nchunks - << endl; - - // payload - bufferlist blist; - for (int i=0; iget_source() << endl; - - return m; -} - - - -int Rank::Pipe::write_message(Message *m) -{ - // get envelope, buffers - msg_envelope_t *env = &m->get_envelope(); - bufferlist blist; - blist.claim( m->get_payload() ); - -#ifdef TCP_KEEP_CHUNKS - env->nchunks = blist.buffers().size(); -#else - env->nchunks = 1; -#endif - - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer sending " << m << " " << *m - << " to " << m->get_dest() - << endl; - - // send envelope - int r = tcp_write( sd, (char*)env, sizeof(*env) ); - if (r < 0) { - derr(1) << "pipe(" << peer_addr << ' ' << this << ").writer error sending envelope for " << *m - << " to " << m->get_dest() << endl; - need_to_send_close = false; - return -1; - } - - // payload -#ifdef TCP_KEEP_CHUNKS - // send chunk-wise - int i = 0; - for (list::const_iterator it = blist.buffers().begin(); - it != blist.buffers().end(); - it++) { - dout(10) << "pipe(" << peer_addr << ' ' << this << ").writer tcp_sending frag " << i << " len " << (*it).length() << endl; - int size = (*it).length(); - r = tcp_write( sd, (char*)&size, sizeof(size) ); - if (r < 0) { - derr(10) << "pipe(" << peer_addr 
<< ' ' << this << ").writer error sending chunk len for " << *m << " to " << m->get_dest() << endl; - need_to_send_close = false; - return -1; - } - r = tcp_write( sd, (*it).c_str(), size ); - if (r < 0) { - derr(10) << "pipe(" << peer_addr << ' ' << this << ").writer error sending data chunk for " << *m << " to " << m->get_dest() << endl; - need_to_send_close = false; - return -1; - } - i++; - } -#else - // one big chunk - int size = blist.length(); - r = tcp_write( sd, (char*)&size, sizeof(size) ); - if (r < 0) { - derr(10) << "pipe(" << peer_addr << ' ' << this << ").writer error sending data len for " << *m << " to " << m->get_dest() << endl; - need_to_send_close = false; - return -1; - } - dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer data len is " << size << " in " << blist.buffers().size() << " buffers" << endl; - - for (list::const_iterator it = blist.buffers().begin(); - it != blist.buffers().end(); - it++) { - if ((*it).length() == 0) continue; // blank buffer. - r = tcp_write( sd, (char*)(*it).c_str(), (*it).length() ); - if (r < 0) { - derr(10) << "pipe(" << peer_addr << ' ' << this << ").writer error sending data megachunk for " << *m << " to " << m->get_dest() << " : len " << (*it).length() << endl; - need_to_send_close = false; - return -1; - } - } -#endif - - return 0; -} - - -void Rank::Pipe::fail(list& out) -{ - derr(10) << "pipe(" << peer_addr << ' ' << this << ").fail" << endl; - - // FIXME: possible race before i reclaim lock here? - - // deactivate myself - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_addr) && - rank.rank_pipe[peer_addr] == this) - rank.rank_pipe.erase(peer_addr); - } - rank.lock.Unlock(); - - // what do i do about reader()? FIXME - - // sort my messages by (source) dispatcher, dest. 
- map > > by_dis; - lock.Lock(); - { - // include out at front of queue - q.splice(q.begin(), out); - - // sort - while (!q.empty()) { - if (q.front()->get_type() == MSG_CLOSE) { - delete q.front(); - } - else if (rank.local.count(q.front()->get_source())) { - EntityMessenger *mgr = rank.local[q.front()->get_source()]; - Dispatcher *dis = mgr->get_dispatcher(); - if (mgr->is_stopped()) { - // ignore. - dout(1) << "pipe(" << peer_addr << ' ' << this << ").fail on " << *q.front() << ", dispatcher stopping, ignoring." << endl; - delete q.front(); - } else { - by_dis[dis][q.front()->get_dest()].push_back(q.front()); - } - } - else { - // oh well. sending entity musta just shut down? - assert(0); - delete q.front(); - } - q.pop_front(); - } - } - lock.Unlock(); - - // report failure(s) to dispatcher(s) - for (map > >::iterator i = by_dis.begin(); - i != by_dis.end(); - ++i) - for (map >::iterator j = i->second.begin(); - j != i->second.end(); - ++j) - for (list::iterator k = j->second.begin(); - k != j->second.end(); - ++k) { - derr(1) << "pipe(" << peer_addr << ' ' << this << ").fail on " << **k << " to " << (*k)->get_dest_inst() << endl; - i->first->ms_handle_failure(*k, (*k)->get_dest_inst()); - } -} - - - - - - -/******************************************** - * Rank - */ - -Rank::Rank() : - single_dispatcher(this), - started(false) { - // default to any listen_addr - memset((char*)&listen_addr, 0, sizeof(listen_addr)); - listen_addr.sin_family = AF_INET; - listen_addr.sin_addr.s_addr = htonl(INADDR_ANY); - listen_addr.sin_port = 0; -} -Rank::~Rank() -{ -} - -/* -void Rank::set_listen_addr(tcpaddr_t& a) -{ - dout(10) << "set_listen_addr " << a << endl; - memcpy((char*)&listen_addr.sin_addr.s_addr, (char*)&a.sin_addr.s_addr, 4); - listen_addr.sin_port = a.sin_port; -} -*/ - -void Rank::_submit_single_dispatch(Message *m) -{ - assert(lock.is_locked()); - - if (local.count(m->get_dest()) && - local[m->get_dest()]->is_ready()) { - rank.single_dispatch_queue.push_back(m); 
- rank.single_dispatch_cond.Signal(); - } else { - waiting_for_ready[m->get_dest()].push_back(m); - } -} - - -void Rank::single_dispatcher_entry() -{ - lock.Lock(); - while (!single_dispatch_stop || !single_dispatch_queue.empty()) { - if (!single_dispatch_queue.empty()) { - list ls; - ls.swap(single_dispatch_queue); - - lock.Unlock(); - { - while (!ls.empty()) { - Message *m = ls.front(); - ls.pop_front(); - - dout(1) << m->get_dest() - << " <-- " << m->get_source_inst() - << " ---- " << *m - << " -- " << m - << endl; - - assert(local.count(m->get_dest())); - local[m->get_dest()]->dispatch(m); - } - } - lock.Lock(); - continue; - } - single_dispatch_cond.Wait(lock); - } - lock.Unlock(); -} - - -/* - * note: assumes lock is held - */ -void Rank::reaper() -{ - dout(10) << "reaper" << endl; - assert(lock.is_locked()); - - while (!pipe_reap_queue.empty()) { - Pipe *p = pipe_reap_queue.front(); - dout(10) << "reaper reaping pipe " << p->get_peer_addr() << endl; - pipe_reap_queue.pop_front(); - assert(pipes.count(p)); - pipes.erase(p); - p->join(); - dout(10) << "reaper reaped pipe " << p->get_peer_addr() << endl; - delete p; - } -} - - -int Rank::start_rank() -{ - lock.Lock(); - if (started) { - dout(10) << "start_rank already started" << endl; - lock.Unlock(); - return 0; - } - dout(10) << "start_rank" << endl; - lock.Unlock(); - - // bind to a socket - if (accepter.start() < 0) - return -1; - - // start single thread dispatcher? - if (g_conf.ms_single_dispatch) { - single_dispatch_stop = false; - single_dispatcher.create(); - } - - lock.Lock(); - - dout(1) << "start_rank at " << listen_addr << endl; - started = true; - lock.Unlock(); - return 0; -} - - - -/* connect_rank - * NOTE: assumes rank.lock held. 
- */ -Rank::Pipe *Rank::connect_rank(const entity_addr_t& addr) -{ - assert(rank.lock.is_locked()); - assert(addr != rank.my_addr); - - dout(10) << "connect_rank to " << addr << endl; - - // create pipe - Pipe *pipe = new Pipe(addr); - rank.rank_pipe[addr] = pipe; - pipes.insert(pipe); - - return pipe; -} - - - - - - -Rank::EntityMessenger *Rank::find_unnamed(entity_name_t a) -{ - // find an unnamed local entity of the right type - for (map::iterator p = local.begin(); - p != local.end(); - ++p) { - if (p->first.type() == a.type() && p->first.is_new()) - return p->second; - } - return 0; -} - - - - -/* register_entity - */ -Rank::EntityMessenger *Rank::register_entity(entity_name_t name) -{ - dout(10) << "register_entity " << name << endl; - lock.Lock(); - - // create messenger - EntityMessenger *msgr = new EntityMessenger(name); - - // add to directory - assert(local.count(name) == 0); - local[name] = msgr; - - lock.Unlock(); - return msgr; -} - - -void Rank::unregister_entity(EntityMessenger *msgr) -{ - lock.Lock(); - dout(10) << "unregister_entity " << msgr->get_myname() << endl; - - // remove from local directory. - entity_name_t name = msgr->get_myname(); - assert(local.count(name)); - local.erase(name); - - stopped.insert(name); - wait_cond.Signal(); - - lock.Unlock(); -} - - -void Rank::submit_message(Message *m, const entity_addr_t& dest_addr) -{ - const entity_name_t dest = m->get_dest(); - - // lookup - EntityMessenger *entity = 0; - Pipe *pipe = 0; - - lock.Lock(); - { - // local? - if (dest_addr == my_addr) { - if (local.count(dest)) { - // local - dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl; - if (g_conf.ms_single_dispatch) { - _submit_single_dispatch(m); - } else { - entity = local[dest]; - } - } else { - derr(0) << "submit_message " << *m << " dest " << dest << " " << dest_addr << " local but not in local map?" << endl; - //assert(0); // hmpf, this is probably mds->mon beacon from newsyn. - } - } - else { - // remote. 
- if (rank_pipe.count( dest_addr )) { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_addr << ", already connected." << endl; - // connected. - pipe = rank_pipe[ dest_addr ]; - } else { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_addr << ", connecting." << endl; - // not connected. - pipe = connect_rank( dest_addr ); - } - } - } - lock.Unlock(); - - // do it - if (entity) { - // local! - dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl; - entity->queue_message(m); - } - else if (pipe) { - // remote! - dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl; - pipe->send(m); - } -} - - - - - -void Rank::wait() -{ - lock.Lock(); - while (1) { - // reap dead pipes - reaper(); - - if (local.empty()) { - dout(10) << "wait: everything stopped" << endl; - break; // everything stopped. - } else { - dout(10) << "wait: local still has " << local.size() << " items, waiting" << endl; - } - - wait_cond.Wait(lock); - } - lock.Unlock(); - - // done! clean up. - - //dout(10) << "wait: stopping accepter thread" << endl; - //accepter.stop(); - - // stop dispatch thread - if (g_conf.ms_single_dispatch) { - dout(10) << "wait: stopping dispatch thread" << endl; - lock.Lock(); - single_dispatch_stop = true; - single_dispatch_cond.Signal(); - lock.Unlock(); - single_dispatcher.join(); - } - - // reap pipes - lock.Lock(); - { - dout(10) << "wait: closing pipes" << endl; - list toclose; - for (hash_map::iterator i = rank_pipe.begin(); - i != rank_pipe.end(); - i++) - toclose.push_back(i->second); - for (list::iterator i = toclose.begin(); - i != toclose.end(); - i++) - (*i)->close(); - - dout(10) << "wait: waiting for pipes " << pipes << " to close" << endl; - while (!pipes.empty()) { - wait_cond.Wait(lock); - reaper(); - } - } - lock.Unlock(); - - dout(10) << "wait: done." 
<< endl; -} - - - - - - -/********************************** - * EntityMessenger - */ - -Rank::EntityMessenger::EntityMessenger(entity_name_t myaddr) : - Messenger(myaddr), - stop(false), - dispatch_thread(this) -{ -} -Rank::EntityMessenger::~EntityMessenger() -{ -} - -void Rank::EntityMessenger::dispatch_entry() -{ - lock.Lock(); - while (!stop) { - if (!dispatch_queue.empty()) { - list ls; - ls.swap(dispatch_queue); - - lock.Unlock(); - { - // deliver - while (!ls.empty()) { - if (stop) { - dout(1) << "dispatch: stop=true, discarding " << ls.size() - << " messages in dispatch queue" << endl; - break; - } - Message *m = ls.front(); - ls.pop_front(); - dout(1) << m->get_dest() - << " <-- " << m->get_source_inst() - << " ---- " << *m - << " -- " << m - << endl; - dispatch(m); - } - } - lock.Lock(); - continue; - } - cond.Wait(lock); - } - lock.Unlock(); - - // deregister - rank.unregister_entity(this); -} - -void Rank::EntityMessenger::ready() -{ - dout(10) << "ready " << get_myaddr() << endl; - - if (g_conf.ms_single_dispatch) { - rank.lock.Lock(); - if (rank.waiting_for_ready.count(get_myname())) { - rank.single_dispatch_queue.splice(rank.single_dispatch_queue.end(), - rank.waiting_for_ready[get_myname()]); - rank.waiting_for_ready.erase(get_myname()); - rank.single_dispatch_cond.Signal(); - } - rank.lock.Unlock(); - } else { - // start my dispatch thread - dispatch_thread.create(); - } -} - - -int Rank::EntityMessenger::shutdown() -{ - dout(10) << "shutdown " << get_myaddr() << endl; - - // stop my dispatch thread - if (dispatch_thread.am_self()) { - dout(1) << "shutdown i am dispatch, setting stop flag" << endl; - stop = true; - } else { - dout(1) << "shutdown i am not dispatch, setting stop flag and joining thread." 
<< endl; - lock.Lock(); - stop = true; - cond.Signal(); - lock.Unlock(); - dispatch_thread.join(); - } - - return 0; -} - - -void Rank::EntityMessenger::prepare_dest(const entity_addr_t& addr) -{ - rank.lock.Lock(); - { - if (rank.rank_pipe.count(addr) == 0) - rank.connect_rank(addr); - } - rank.lock.Unlock(); -} - -int Rank::EntityMessenger::send_message(Message *m, entity_inst_t dest, - int port, int fromport) -{ - // set envelope - m->set_source(get_myname(), fromport); - m->set_source_addr(rank.my_addr); - m->set_dest_inst(dest); - m->set_dest_port(port); - - dout(1) << m->get_source() - << " --> " << dest.name << " " << dest.addr - << " -- " << *m - << " -- " << m - << endl; - - rank.submit_message(m, dest.addr); - - return 0; -} - - - -const entity_addr_t &Rank::EntityMessenger::get_myaddr() -{ - return rank.my_addr; -} - - -void Rank::EntityMessenger::reset_myname(entity_name_t newname) -{ - entity_name_t oldname = get_myname(); - dout(10) << "reset_myname " << oldname << " to " << newname << endl; - - rank.local.erase(oldname); - rank.local[newname] = this; - - _set_myname(newname); -} - - - - -void Rank::EntityMessenger::mark_down(entity_addr_t a) -{ - rank.mark_down(a); -} - -void Rank::mark_down(entity_addr_t addr) -{ - //if (my_rank == 0) return; // ugh.. rank0 already handles this stuff in the namer - lock.Lock(); - /* - if (entity_map.count(a) && - entity_map[a] > inst) { - dout(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl; - derr(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl; - // do nothing! - } else { - if (entity_map.count(a) == 0) { - // don't know it - dout(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl; - derr(10) << "mark_down " << a << " inst " << inst << " ... 
unknown by me" << endl; - } else { - // know it - assert(entity_map[a] <= inst); - dout(10) << "mark_down " << a << " inst " << inst << endl; - derr(10) << "mark_down " << a << " inst " << inst << endl; - - entity_map.erase(a); - - if (rank_pipe.count(inst)) { - rank_pipe[inst]->close(); - rank_pipe.erase(inst); - } - } - } - */ - lock.Unlock(); -} - - diff --git a/tags/20070517_before_mds_merge/msg/SimpleMessenger.h b/tags/20070517_before_mds_merge/msg/SimpleMessenger.h deleted file mode 100644 index f4cdcd67a84eb..0000000000000 --- a/tags/20070517_before_mds_merge/msg/SimpleMessenger.h +++ /dev/null @@ -1,294 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __SIMPLEMESSENGER_H -#define __SIMPLEMESSENGER_H - - -#include -#include -using namespace std; -#include -#include -using namespace __gnu_cxx; - - -#include "include/types.h" - -#include "common/Mutex.h" -#include "common/Cond.h" -#include "common/Thread.h" - -#include "Messenger.h" -#include "Message.h" -#include "tcp.h" - - - - -/* Rank - per-process - */ -class Rank { -public: - void sigint(); - -private: - class EntityMessenger; - class Pipe; - - // incoming - class Accepter : public Thread { - public: - bool done; - - int listen_sd; - - Accepter() : done(false) {} - - void *entry(); - void stop() { - done = true; - ::close(listen_sd); - join(); - } - int start(); - } accepter; - - void sigint(int r); - - - // pipe - class Pipe { - protected: - int sd; - bool done; - entity_addr_t peer_addr; - bool server; - bool need_to_send_close; - - bool reader_running; - bool writer_running; - - list q; - Mutex lock; - Cond cond; - - int accept(); // server handshake - int connect(); // client handshake - void reader(); - void writer(); - - Message *read_message(); - int write_message(Message *m); - void fail(list& ls); - - // threads - class Reader : public Thread { - Pipe *pipe; - public: - Reader(Pipe *p) : pipe(p) {} - void *entry() { pipe->reader(); return 0; } - } reader_thread; - friend class Reader; - - class Writer : public Thread { - Pipe *pipe; - public: - Writer(Pipe *p) : pipe(p) {} - void *entry() { pipe->writer(); return 0; } - } writer_thread; - friend class Writer; - - public: - Pipe(int s) : sd(s), - done(false), server(true), - need_to_send_close(true), - reader_running(false), writer_running(false), - reader_thread(this), writer_thread(this) { - // server - reader_running = true; - reader_thread.create(); - } - Pipe(const entity_addr_t &pi) : sd(0), - done(false), peer_addr(pi), server(false), - need_to_send_close(true), - reader_running(false), writer_running(false), - reader_thread(this), writer_thread(this) { - // client - 
writer_running = true; - writer_thread.create(); - } - - // public constructors - static const Pipe& Server(int s); - static const Pipe& Client(const entity_addr_t& pi); - - entity_addr_t& get_peer_addr() { return peer_addr; } - - void close(); - void join() { - writer_thread.join(); - reader_thread.join(); - } - - void send(Message *m) { - lock.Lock(); - q.push_back(m); - cond.Signal(); - lock.Unlock(); - } - void send(list& ls) { - lock.Lock(); - q.splice(q.end(), ls); - cond.Signal(); - lock.Unlock(); - } - }; - - - - // messenger interface - class EntityMessenger : public Messenger { - Mutex lock; - Cond cond; - list dispatch_queue; - bool stop; - - class DispatchThread : public Thread { - EntityMessenger *m; - public: - DispatchThread(EntityMessenger *_m) : m(_m) {} - void *entry() { - m->dispatch_entry(); - return 0; - } - } dispatch_thread; - void dispatch_entry(); - - public: - void queue_message(Message *m) { - lock.Lock(); - dispatch_queue.push_back(m); - cond.Signal(); - lock.Unlock(); - } - void queue_messages(list ls) { - lock.Lock(); - dispatch_queue.splice(dispatch_queue.end(), ls); - cond.Signal(); - lock.Unlock(); - } - - public: - EntityMessenger(entity_name_t myaddr); - ~EntityMessenger(); - - void ready(); - bool is_stopped() { return stop; } - - void wait() { - dispatch_thread.join(); - } - - const entity_addr_t &get_myaddr(); - - void reset_myname(entity_name_t m); - - int shutdown(); - void prepare_dest(const entity_addr_t& addr); - int send_message(Message *m, entity_inst_t dest, - int port=0, int fromport=0); - - void mark_down(entity_addr_t a); - void mark_up(entity_name_t a, entity_addr_t& i); - }; - - - class SingleDispatcher : public Thread { - Rank *rank; - public: - SingleDispatcher(Rank *r) : rank(r) {} - void *entry() { - rank->single_dispatcher_entry(); - return 0; - } - } single_dispatcher; - - Cond single_dispatch_cond; - bool single_dispatch_stop; - list single_dispatch_queue; - - map > waiting_for_ready; - - void 
single_dispatcher_entry(); - void _submit_single_dispatch(Message *m); - - - // Rank stuff - public: - Mutex lock; - Cond wait_cond; // for wait() - bool started; - - // where i listen - tcpaddr_t listen_addr; - entity_addr_t my_addr; - - // local - map local; - set stopped; - //hash_set entity_unstarted; - - // remote - hash_map rank_pipe; - - set pipes; - list pipe_reap_queue; - - Pipe *connect_rank(const entity_addr_t& addr); - - void mark_down(entity_addr_t addr); - //void mark_up(entity_name_t addr, entity_addr_t& i); - - tcpaddr_t get_listen_addr() { return listen_addr; } - - void reaper(); - - EntityMessenger *find_unnamed(entity_name_t a); - -public: - Rank(); - ~Rank(); - - //void set_listen_addr(tcpaddr_t& a); - - int start_rank(); - void wait(); - - EntityMessenger *register_entity(entity_name_t addr); - void rename_entity(EntityMessenger *ms, entity_name_t newaddr); - void unregister_entity(EntityMessenger *ms); - - void submit_message(Message *m, const entity_addr_t& addr); - void prepare_dest(const entity_addr_t& addr); - - // create a new messenger - EntityMessenger *new_entity(entity_name_t addr); - -} ; - - - -extern Rank rank; - -#endif diff --git a/tags/20070517_before_mds_merge/msg/TCPDirectory.cc b/tags/20070517_before_mds_merge/msg/TCPDirectory.cc deleted file mode 100644 index 57000ac30d74c..0000000000000 --- a/tags/20070517_before_mds_merge/msg/TCPDirectory.cc +++ /dev/null @@ -1,178 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "TCPDirectory.h" - -#include "messages/MNSConnect.h" -#include "messages/MNSConnectAck.h" -#include "messages/MNSRegister.h" -#include "messages/MNSRegisterAck.h" -#include "messages/MNSLookup.h" -#include "messages/MNSLookupReply.h" -//#include "messages/MNSUnregister.h" - -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_ns) cout << "nameserver: " - -void tcp_open(int rank); - - -void TCPDirectory::handle_connect(MNSConnect *m) -{ - int rank = nrank++; - dout(2) << "connect from new rank " << rank << " " << m->get_addr() << endl; - - dir[MSG_ADDR_RANK(rank)] = rank; - messenger->map_entity_rank(MSG_ADDR_RANK(rank), rank); - - rank_addr[rank] = m->get_addr(); - messenger->map_rank_addr(rank, m->get_addr()); - - messenger->send_message(new MNSConnectAck(rank), - MSG_ADDR_RANK(rank)); - delete m; -} - - - -void TCPDirectory::handle_register(MNSRegister *m) -{ - dout(10) << "register from rank " << m->get_rank() << " addr " << MSG_ADDR_NICE(m->get_entity()) << endl; - - // pick id - int rank = m->get_rank(); - entity_name_t entity = m->get_entity(); - - if (entity.is_new()) { - // make up a new address! - switch (entity.type()) { - - case MSG_ADDR_RANK_BASE: // stupid client should be able to figure this out - entity = MSG_ADDR_RANK(rank); - break; - - case MSG_ADDR_MDS_BASE: - entity = MSG_ADDR_MDS(nmds++); - break; - - case MSG_ADDR_OSD_BASE: - entity = MSG_ADDR_OSD(nosd++); - break; - - case MSG_ADDR_CLIENT_BASE: - entity = MSG_ADDR_CLIENT(nclient++); - break; - - default: - assert(0); - } - } else { - // specific address! - assert(dir.count(entity) == 0); // make sure it doesn't exist yet. 
- } - - dout(2) << "registered " << MSG_ADDR_NICE(entity) << endl; - - // register - dir[entity] = rank; - - if (entity == MSG_ADDR_RANK(rank)) // map this locally now so we can reply - messenger->map_entity_rank(entity, rank); // otherwise wait until they send STARTED msg - - hold.insert(entity); - - ++version; - update_log[version] = entity; - - // reply w/ new id - messenger->send_message(new MNSRegisterAck(m->get_tid(), entity), - MSG_ADDR_RANK(rank)); - delete m; -} - -void TCPDirectory::handle_started(Message *m) -{ - entity_name_t entity = m->get_source(); - - dout(3) << "start signal from " << MSG_ADDR_NICE(entity) << endl; - hold.erase(entity); - messenger->map_entity_rank(entity, dir[entity]); - - // waiters? - if (waiting.count(entity)) { - list ls; - ls.splice(ls.begin(), waiting[entity]); - waiting.erase(entity); - - dout(10) << "doing waiter on " << MSG_ADDR_NICE(entity) << endl; - for (list::iterator it = ls.begin(); - it != ls.end(); - it++) { - dispatch(*it); - } - } -} - -void TCPDirectory::handle_unregister(Message *m) -{ - entity_name_t who = m->get_source(); - dout(2) << "unregister from entity " << MSG_ADDR_NICE(who) << endl; - - assert(dir.count(who)); - dir.erase(who); - - // shutdown? - if (dir.size() <= 2) { - dout(2) << "dir is empty except for me, shutting down" << endl; - tcpmessenger_stop_nameserver(); - } - else { - if (0) { - dout(10) << "dir size now " << dir.size() << endl; - for (hash_map::iterator it = dir.begin(); - it != dir.end(); - it++) { - dout(10) << " dir: " << MSG_ADDR_NICE(it->first) << " on rank " << it->second << endl; - } - } - } - -} - - -void TCPDirectory::handle_lookup(MNSLookup *m) -{ - // have it? - if (dir.count(m->get_entity()) == 0 || - hold.count(m->get_entity())) { - dout(2) << MSG_ADDR_NICE(m->get_source()) << " lookup '" << MSG_ADDR_NICE(m->get_entity()) << "' -> dne or on hold" << endl; - waiting[m->get_entity()].push_back(m); - return; - } - - // look it up! 
- MNSLookupReply *reply = new MNSLookupReply(m); - - int rank = dir[m->get_entity()]; - reply->entity_map[m->get_entity()] = rank; - reply->rank_addr[rank] = rank_addr[rank]; - - dout(2) << MSG_ADDR_NICE(m->get_source()) << " lookup '" << MSG_ADDR_NICE(m->get_entity()) << "' -> rank " << rank << endl; - - messenger->send_message(reply, - m->get_source(), m->get_source_port()); - delete m; -} diff --git a/tags/20070517_before_mds_merge/msg/TCPDirectory.h b/tags/20070517_before_mds_merge/msg/TCPDirectory.h deleted file mode 100644 index 7f450e9a64be5..0000000000000 --- a/tags/20070517_before_mds_merge/msg/TCPDirectory.h +++ /dev/null @@ -1,110 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __TCPDIRECTORY_H -#define __TCPDIRECTORY_H - -/* - * rank -- a process (listening on some host:port) - * entity -- a logical entity (osd123, mds3, client3245, etc.) - * - * multiple entities can coexist on a single rank. - */ - -#include "Dispatcher.h" -#include "TCPMessenger.h" - -#include -using namespace std; -#include -using namespace __gnu_cxx; - -#include -//#include -#include - -class TCPDirectory : public Dispatcher { - protected: - // how i communicate - TCPMessenger *messenger; - - // directory - hash_map dir; // entity -> rank - hash_map rank_addr; // rank -> ADDR (e.g. 
host:port) - - __uint64_t version; - map<__uint64_t, entity_name_t> update_log; - - int nrank; - int nclient, nmds, nosd; - - set hold; - map > waiting; - - // messages - void handle_connect(class MNSConnect*); - void handle_register(class MNSRegister *m); - void handle_started(Message *m); - void handle_lookup(class MNSLookup *m); - void handle_unregister(Message *m); - - public: - TCPDirectory(TCPMessenger *m) : - messenger(m), - version(0), - nrank(0), nclient(0), nmds(0), nosd(0) { - messenger->set_dispatcher(this); - - // i am rank 0! - dir[MSG_ADDR_DIRECTORY] = 0; - rank_addr[0] = m->get_tcpaddr(); - ++nrank; - - // announce nameserver - cout << "export CEPH_NAMESERVER=" << m->get_tcpaddr() << endl; - - int fd = ::open(".ceph_ns", O_WRONLY|O_CREAT); - ::write(fd, (void*)&m->get_tcpaddr(), sizeof(tcpaddr_t)); - ::fchmod(fd, 0755); - ::close(fd); - } - ~TCPDirectory() { - ::unlink(".ceph_ns"); - } - - void dispatch(Message *m) { - switch (m->get_type()) { - case MSG_NS_CONNECT: - handle_connect((class MNSConnect*)m); - break; - case MSG_NS_REGISTER: - handle_register((class MNSRegister*)m); - break; - case MSG_NS_STARTED: - handle_started(m); - break; - case MSG_NS_UNREGISTER: - handle_unregister(m); - break; - case MSG_NS_LOOKUP: - handle_lookup((class MNSLookup*)m); - break; - - default: - assert(0); - } - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/msg/TCPMessenger.cc b/tags/20070517_before_mds_merge/msg/TCPMessenger.cc deleted file mode 100644 index f40ea9b162e6b..0000000000000 --- a/tags/20070517_before_mds_merge/msg/TCPMessenger.cc +++ /dev/null @@ -1,1454 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "config.h" -#include "include/error.h" - -#include "common/Timer.h" -#include "common/Mutex.h" - -#include "TCPMessenger.h" -#include "Message.h" - -#include -#include -using namespace std; -#include -using namespace __gnu_cxx; - -#include -# include -# include -# include -# include -#include -#include -#include -#include - -#include - -#include "messages/MGenericMessage.h" -#include "messages/MNSConnect.h" -#include "messages/MNSConnectAck.h" -#include "messages/MNSRegister.h" -#include "messages/MNSRegisterAck.h" -#include "messages/MNSLookup.h" -#include "messages/MNSLookupReply.h" - -#include "TCPDirectory.h" - -#include "common/Logger.h" - -#define DBL 18 - -//#define TCP_SERIALMARSHALL // do NOT turn this off until you check messages/* encode_payload methods -//#define TCP_SERIALOUT // be paranoid/annoying and send messages in same thread - - -TCPMessenger *rankmessenger = 0; // - -TCPDirectory *nameserver = 0; // only defined on rank 0 -TCPMessenger *nsmessenger = 0; - - -/***************************/ -LogType rank_logtype; -Logger *logger; - -int stat_num = 0; -off_t stat_inq = 0, stat_inqb = 0; -off_t stat_disq = 0, stat_disqb = 0; -off_t stat_outq = 0, stat_outqb = 0; -/***************************/ - - -// local directory -hash_map directory; // local -hash_set directory_ready; -Mutex directory_lock; - -// connecting -struct sockaddr_in listen_addr; // my listen addr -int listen_sd = 0; -int my_rank = -1; -Cond waiting_for_rank; - -// register -long regid = 0; -map waiting_for_register_cond; -map waiting_for_register_result; - -// incoming messages -list incoming; -Mutex incoming_lock; -Cond incoming_cond; - -// outgoing messages -/* -list outgoing; -Mutex outgoing_lock; -Cond outgoing_cond; -*/ - -class OutThread : public Thread { -public: - Mutex lock; - Cond cond; - list q; - bool done; - - OutThread() : done(false) {} - virtual ~OutThread() {} - - void *entry(); - - void stop() { - lock.Lock(); - done = true; - cond.Signal(); 
- lock.Unlock(); - join(); - } - - void send(Message *m) { - lock.Lock(); - q.push_back(m); - cond.Signal(); - lock.Unlock(); - } -} single_out_thread; - -Mutex lookup_lock; // -hash_map entity_rank; // entity -> rank -hash_map rank_sd; // outgoing sockets, rank -> sd -hash_map rank_out; -hash_map rank_addr; // rank -> tcpaddr -map > waiting_for_lookup; - - -/* this process */ -bool tcp_done = false; // set this flag to stop the event loop - - -// threads -pthread_t dispatch_thread_id = 0; // thread id of the event loop. init value == nobody -pthread_t out_thread_id = 0; // thread id of the event loop. init value == nobody -pthread_t listen_thread_id = 0; -map in_threads; // sd -> threadid - -//bool pending_timer = false; - -// per-rank fun - - -// debug -#undef dout -#define dout(l) if (l<=g_conf.debug) cout << "[TCP " << my_rank /*(<< " " << getpid() << "." << pthread_self() */ << "] " - - -#include "tcp.cc" - -// some declarations -void tcp_open(int rank); -int tcp_send(Message *m); -void tcpmessenger_kick_dispatch_loop(); -OutThread *tcp_lookup(Message *m); - -int tcpmessenger_get_rank() -{ - return my_rank; -} - - -int tcpmessenger_findns(tcpaddr_t &nsa) -{ - char *nsaddr = 0; - bool have_nsa = false; - - // env var? - /*int e_len = 0; - for (int i=0; envp[i]; i++) - e_len += strlen(envp[i]) + 1; - */ - nsaddr = getenv("CEPH_NAMESERVER");////envz_entry(*envp, e_len, "CEPH_NAMESERVER"); - if (nsaddr) { - while (nsaddr[0] != '=') nsaddr++; - nsaddr++; - } - - else { - // file? - int fd = ::open(".ceph_ns",O_RDONLY); - if (fd > 0) { - ::read(fd, (void*)&nsa, sizeof(nsa)); - ::close(fd); - have_nsa = true; - nsaddr = "from .ceph_ns"; - } - } - - if (!nsaddr && !have_nsa) { - cerr << "i need ceph ns addr.. either CEPH_NAMESERVER env var or --ns blah" << endl; - return -1; - //exit(-1); - } - - // look up nsaddr? 
- if (!have_nsa && tcpmessenger_lookup(nsaddr, nsa) < 0) { - return -1; - } - - dout(2) << "ceph ns is " << nsaddr << " or " << nsa << endl; - return 0; -} - - - -/** rankserver - * - * one per rank. handles entity->rank lookup replies. - */ - -class RankServer : public Dispatcher { -public: - void dispatch(Message *m) { - lookup_lock.Lock(); - - dout(DBL) << "rankserver dispatching " << *m << endl; - - switch (m->get_type()) { - case MSG_NS_CONNECTACK: - handle_connect_ack((MNSConnectAck*)m); - break; - - case MSG_NS_REGISTERACK: - handle_register_ack((MNSRegisterAck*)m); - break; - - case MSG_NS_LOOKUPREPLY: - handle_lookup_reply((MNSLookupReply*)m); - break; - - default: - assert(0); - } - - lookup_lock.Unlock(); - } - - void handle_connect_ack(MNSConnectAck *m) { - dout(DBL) << "my rank is " << m->get_rank(); - my_rank = m->get_rank(); - - // now that i know my rank, - entity_rank[MSG_ADDR_RANK(my_rank)] = my_rank; - rank_addr[my_rank] = listen_addr; - - waiting_for_rank.SignalAll(); - - delete m; - - // logger! 
- dout(DBL) << "logger" << endl; - char names[100]; - sprintf(names, "rank%d", my_rank); - string name = names; - - if (g_conf.tcp_log) { - logger = new Logger(name, (LogType*)&rank_logtype); - rank_logtype.add_set("num"); - rank_logtype.add_inc("in"); - rank_logtype.add_inc("inb"); - rank_logtype.add_inc("dis"); - rank_logtype.add_set("inq"); - rank_logtype.add_set("inqb"); - rank_logtype.add_set("outq"); - rank_logtype.add_set("outqb"); - } - - } - - void handle_register_ack(MNSRegisterAck *m) { - long tid = m->get_tid(); - waiting_for_register_result[tid] = m->get_entity(); - waiting_for_register_cond[tid]->Signal(); - delete m; - } - - void handle_lookup_reply(MNSLookupReply *m) { - list waiting; - dout(DBL) << "got lookup reply" << endl; - - for (map::iterator it = m->entity_rank.begin(); - it != m->entity_rank.end(); - it++) { - dout(DBL) << "lookup got " << MSG_ADDR_NICE(it->first) << " on rank " << it->second << endl; - entity_rank[it->first] = it->second; - - if (it->second == my_rank) { - // deliver locally - dout(-DBL) << "delivering lookup results locally" << endl; - incoming_lock.Lock(); - - for (list::iterator i = waiting_for_lookup[it->first].begin(); - i != waiting_for_lookup[it->first].end(); - i++) { - stat_inq++; - stat_inqb += (*i)->get_payload().length(); - (*i)->decode_payload(); - incoming.push_back(*i); - } - incoming_cond.Signal(); - incoming_lock.Unlock(); - } else { - // take waiters - waiting.splice(waiting.begin(), waiting_for_lookup[it->first]); - } - waiting_for_lookup.erase(it->first); - - } - - for (map::iterator it = m->rank_addr.begin(); - it != m->rank_addr.end(); - it++) { - dout(DBL) << "lookup got rank " << it->first << " addr " << it->second << endl; - rank_addr[it->first] = it->second; - - // open it now - if (rank_sd.count(it->first) == 0) - tcp_open(it->first); - } - - // send waiting messages -#ifdef TCP_SERIALOUT - for (list::iterator it = waiting.begin(); - it != waiting.end(); - it++) { - OutThread *outt = 
tcp_lookup(*it); - assert(outt); - tcp_send(*it); - } -#else - for (list::iterator it = waiting.begin(); - it != waiting.end(); - it++) { - OutThread *outt = tcp_lookup(*it); - assert(outt); - outt->send(*it); -// dout(0) << "lookup done, splicing in " << *it << endl; - } -#endif - - delete m; - } - -} rankserver; - - -class C_TCPKicker : public Context { - void finish(int r) { - dout(DBL) << "timer kick" << endl; - tcpmessenger_kick_dispatch_loop(); - } -}; - -void TCPMessenger::callback_kick() -{ - tcpmessenger_kick_dispatch_loop(); -} - - -extern int tcpmessenger_lookup(char *str, tcpaddr_t& ta) -{ - char *host = str; - char *port = 0; - - for (int i=0; str[i]; i++) { - if (str[i] == ':') { - port = str+i+1; - str[i] = 0; - break; - } - } - if (!port) { - cerr << "addr '" << str << "' doesn't look like 'host:port'" << endl; - return -1; - } - //cout << "host '" << host << "' port '" << port << "'" << endl; - - int iport = atoi(port); - - struct hostent *myhostname = gethostbyname( host ); - if (!myhostname) { - cerr << "host " << host << " not found" << endl; - return -1; - } - - memset(&ta, 0, sizeof(ta)); - - //cout << "addrtype " << myhostname->h_addrtype << " len " << myhostname->h_length << endl; - - ta.sin_family = myhostname->h_addrtype; - memcpy((char *)&ta.sin_addr, - myhostname->h_addr, - myhostname->h_length); - ta.sin_port = iport; - - cout << "lookup '" << host << ":" << port << "' -> " << ta << endl; - - return 0; -} - - - -/***** - * global methods for process-wide startup, shutdown. 
- */ - -int tcpmessenger_init() -{ - // LISTEN - dout(DBL) << "binding to listen " << endl; - - /* socket creation */ - listen_sd = socket(AF_INET,SOCK_STREAM,0); - assert(listen_sd > 0); - - /* bind to port */ - memset((char*)&listen_addr, 0, sizeof(listen_addr)); - listen_addr.sin_family = AF_INET; - listen_addr.sin_addr.s_addr = htonl(INADDR_ANY); - listen_addr.sin_port = 0; - - int rc = bind(listen_sd, (struct sockaddr *) &listen_addr, sizeof(listen_addr)); - assert(rc >= 0); - - socklen_t llen = sizeof(listen_addr); - getsockname(listen_sd, (sockaddr*)&listen_addr, &llen); - - int myport = listen_addr.sin_port; - - // listen! - rc = ::listen(listen_sd, 1000); - assert(rc >= 0); - - dout(DBL) << "listening on " << myport << endl; - - // my address is... - char host[100]; - gethostname(host, 100); - dout(DBL) << "my hostname is " << host << endl; - - struct hostent *myhostname = gethostbyname( host ); - - struct sockaddr_in my_addr; - memset(&my_addr, 0, sizeof(my_addr)); - - my_addr.sin_family = myhostname->h_addrtype; - memcpy((char *) &my_addr.sin_addr.s_addr, - myhostname->h_addr_list[0], - myhostname->h_length); - my_addr.sin_port = myport; - - listen_addr = my_addr; - - dout(DBL) << "listen addr is " << listen_addr << endl; - - // register to execute timer events - //g_timer.set_messenger_kicker(new C_TCPKicker()); - - - dout(DBL) << "init done" << endl; - return 0; -} - - -// on first rank only -void tcpmessenger_start_nameserver(tcpaddr_t& diraddr) -{ - dout(DBL) << "starting nameserver on " << MSG_ADDR_NICE(MSG_ADDR_DIRECTORY) << endl; - - // i am rank 0. - nsmessenger = new TCPMessenger(MSG_ADDR_DIRECTORY); - - // start name server - nameserver = new TCPDirectory(nsmessenger); - - // diraddr is my addr! 
- diraddr = rank_addr[0] = listen_addr; - my_rank = 0; - entity_rank[MSG_ADDR_DIRECTORY] = 0; -} -void tcpmessenger_stop_nameserver() -{ - if (nsmessenger) { - dout(DBL) << "shutting down nsmessenger" << endl; - TCPMessenger *m = nsmessenger; - nsmessenger = 0; - m->shutdown(); - delete m; - } -} - -// on all ranks -void tcpmessenger_start_rankserver(tcpaddr_t& ns) -{ - // connect to nameserver - entity_rank[MSG_ADDR_DIRECTORY] = 0; - rank_addr[0] = ns; - tcp_open(0); - - if (my_rank >= 0) { - // i know my rank - rankmessenger = new TCPMessenger(MSG_ADDR_RANK(my_rank)); - } else { - // start rank messenger, and discover my rank. - rankmessenger = new TCPMessenger(MSG_ADDR_RANK_NEW); - } -} -void tcpmessenger_stop_rankserver() -{ - if (rankmessenger) { - dout(DBL) << "shutting down rankmessenger" << endl; - rankmessenger->shutdown(); - delete rankmessenger; - rankmessenger = 0; - } -} - - - - - - -int tcpmessenger_shutdown() -{ - dout(DBL) << "tcpmessenger_shutdown barrier" << endl; - - - dout(2) << "tcpmessenger_shutdown closing all sockets etc" << endl; - - // bleh - for (hash_map::iterator it = rank_sd.begin(); - it != rank_sd.end(); - it++) { - ::close(it->second); - } - - return 0; -} - - - - -/*** - * internal send/recv - */ - - - - -/* - * recv a Message* - */ - - - -Message *tcp_recv(int sd) -{ - // envelope - dout(DBL) << "tcp_recv receiving message from sd " << sd << endl; - - msg_envelope_t env; - if (!tcp_read( sd, (char*)&env, sizeof(env) )) - return 0; - - if (env.type == 0) { - dout(DBL) << "got dummy env, bailing" << endl; - return 0; - } - - dout(DBL) << "tcp_recv got envelope type=" << env.type << " src " << MSG_ADDR_NICE(env.source) << " dst " << MSG_ADDR_NICE(env.dest) << " nchunks=" << env.nchunks << endl; - - // payload - bufferlist blist; - for (int i=0; iinc("in"); - logger->inc("inb", s+sizeof(env)); - } - - dout(DBL) << "tcp_recv got " << s << " byte message from " << MSG_ADDR_NICE(m->get_source()) << endl; - - return m; -} - - - - -void 
tcp_open(int rank) -{ - dout(DBL) << "tcp_open to rank " << rank << " at " << rank_addr[rank] << endl; - - // create socket? - int sd = socket(AF_INET,SOCK_STREAM,0); - assert(sd > 0); - - // bind any port - struct sockaddr_in myAddr; - myAddr.sin_family = AF_INET; - myAddr.sin_addr.s_addr = htonl(INADDR_ANY); - myAddr.sin_port = htons( 0 ); - - int rc = bind(sd, (struct sockaddr *) &myAddr, sizeof(myAddr)); - assert(rc>=0); - - // connect! - int r = connect(sd, (sockaddr*)&rank_addr[rank], sizeof(myAddr)); - assert(r >= 0); - - //dout(DBL) << "tcp_open connected to " << who << endl; - assert(rank_sd.count(rank) == 0); - rank_sd[rank] = sd; - - if (g_conf.tcp_multi_out) { - rank_out[rank] = new OutThread(); - rank_out[rank]->create(); - } else { - rank_out[rank] = &single_out_thread; - if (!single_out_thread.is_started()) - single_out_thread.create(); - } -} - - -void tcp_marshall(Message *m) -{ - // marshall - if (m->empty_payload()) - m->encode_payload(); -} - -OutThread *tcp_lookup(Message *m) -{ - entity_name_t addr = m->get_dest(); - - if (!entity_rank.count(m->get_dest())) { - // lookup and wait. - if (waiting_for_lookup.count(addr)) { - dout(DBL) << "already looking up " << MSG_ADDR_NICE(addr) << endl; - } else { - dout(DBL) << "lookup on " << MSG_ADDR_NICE(addr) << " for " << m << endl; - MNSLookup *r = new MNSLookup(addr); - rankmessenger->send_message(r, MSG_ADDR_DIRECTORY); - } - - // add waiter - waiting_for_lookup[addr].push_back(m); - return 0; - } - - int rank = entity_rank[m->get_dest()]; - - if (rank_sd.count(rank) == 0) { // should only happen on rank0? - tcp_open(rank); - } - assert(rank_sd.count(rank)); - m->set_tcp_sd( rank_sd[rank] ); - return rank_out[rank]; -} - - -/* - * send a Message* over the wire. ** do not block **. 
- */ -int tcp_send(Message *m) -{ - /*int rank = entity_rank[m->get_dest()]; - //if (rank_sd.count(rank) == 0) tcp_open(rank); - assert(rank_sd.count(rank)); - - int sd = rank_sd[rank]; - assert(sd); - */ - int sd = m->get_tcp_sd(); - assert(sd); - - // get envelope, buffers - msg_envelope_t *env = &m->get_envelope(); - bufferlist blist; - blist.claim( m->get_payload() ); - -#ifdef TCP_KEEP_CHUNKS - env->nchunks = blist.buffers().size(); -#else - env->nchunks = 1; -#endif - - // HACK osd -> client only - //if (m->get_source() >= MSG_ADDR_OSD(0) && m->get_source() < MSG_ADDR_CLIENT(0) && - // m->get_dest() >= MSG_ADDR_CLIENT(0)) - dout(DBL) << g_clock.now() << " sending " << m << " " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) - //<< " rank " << rank - << " sd " << sd << endl; - - // send envelope - int r = tcp_write( sd, (char*)env, sizeof(*env) ); - if (r < 0) { cerr << "error sending envelope for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << endl; assert(0); } - - // payload -#ifdef TCP_KEEP_CHUNKS - // send chunk-wise - int i = 0; - for (list::iterator it = blist.buffers().begin(); - it != blist.buffers().end(); - it++) { - dout(DBL) << "tcp_sending frag " << i << " len " << (*it).length() << endl; - int size = (*it).length(); - r = tcp_write( sd, (char*)&size, sizeof(size) ); - if (r < 0) { cerr << "error sending chunk len for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << endl; assert(0); } - r = tcp_write( sd, (*it).c_str(), size ); - if (r < 0) { cerr << "error sending data chunk for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << endl; assert(0); } - i++; - } -#else - // one big chunk - int size = blist.length(); - r = tcp_write( sd, (char*)&size, sizeof(size) ); - if (r < 0) { cerr << "error sending data len for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << endl; assert(0); } - for (list::iterator it = blist.buffers().begin(); - it != blist.buffers().end(); - it++) { - r = tcp_write( sd, (*it).c_str(), (*it).length() ); - if (r 
< 0) { cerr << "error sending data megachunk for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << " : len " << (*it).length() << endl; assert(0); } - } -#endif - - // hose message - delete m; - return 0; -} - - - - - -/** tcp_outthread - * this thread watching the outgoing queue, and encodes+sends any queued messages - */ - -void* OutThread::entry() -{ - lock.Lock(); - while (!q.empty() || !done) { - - if (!q.empty()) { - dout(DBL) << "outthread grabbing message(s)" << endl; - - // grab outgoing list - list out; - out.splice(out.begin(), q); - - // drop lock while i send these - lock.Unlock(); - - while (!out.empty()) { - Message *m = out.front(); - out.pop_front(); - - dout(DBL) << "outthread sending " << m << endl; - - if (!g_conf.tcp_serial_marshall) - tcp_marshall(m); - - tcp_send(m); - } - - lock.Lock(); - continue; - } - - // wait - dout(DBL) << "outthread sleeping" << endl; - cond.Wait(lock); - } - dout(DBL) << "outthread done" << endl; - - lock.Unlock(); - return 0; -} - - - -/** tcp_inthread - * read incoming messages from a given peer. - * give received and decoded messages to dispatch loop. 
- */ -void *tcp_inthread(void *r) -{ - int sd = (int)r; - - dout(DBL) << "tcp_inthread reading on sd " << sd << endl; - - while (!tcp_done) { - Message *m = tcp_recv(sd); - if (!m) break; - entity_name_t who = m->get_source(); - - dout(20) << g_clock.now() << " inthread got " << m << " from sd " << sd << " who is " << who << endl; - - // give to dispatch loop - size_t sz = m->get_payload().length(); - - if (g_conf.tcp_multi_dispatch) { - const entity_name_t dest = m->get_dest(); - directory_lock.Lock(); - TCPMessenger *messenger = directory[ dest ]; - directory_lock.Unlock(); - - if (messenger) - messenger->dispatch_queue(m); - else - dout(0) << "dest " << dest << " dne" << endl; - - } else { - // single dispatch queue - incoming_lock.Lock(); - { - //dout(-20) << "in1 stat_inq " << stat_inq << ", incoming " << incoming.size() << endl; - //assert(stat_inq == incoming.size()); - incoming.push_back(m); - incoming_cond.Signal(); - - stat_inq++; - //assert(stat_inq == incoming.size()); - //dout(-20) << "in2 stat_inq " << stat_inq << ", incoming " << incoming.size() << endl; - stat_inqb += sz; - } - incoming_lock.Unlock(); - } - - if (logger) { - //logger->inc("in"); - //logger->inc("inb", sz); - } - } - - dout(DBL) << "tcp_inthread closing " << sd << endl; - - //::close(sd); - return 0; -} - -/** tcp_accepthread - * accept incoming connections from peers. - * start a tcp_inthread for each. - */ -void *tcp_acceptthread(void *) -{ - dout(DBL) << "tcp_acceptthread starting" << endl; - - while (!tcp_done) { - //dout(DBL) << "accepting, left = " << left << endl; - - struct sockaddr_in addr; - socklen_t slen = sizeof(addr); - int sd = ::accept(listen_sd, (sockaddr*)&addr, &slen); - if (sd > 0) { - dout(DBL) << "accepted incoming on sd " << sd << endl; - - pthread_t th; - pthread_create(&th, - NULL, - tcp_inthread, - (void*)sd); - in_threads[sd] = th; - } else { - dout(DBL) << "no incoming connection?" 
<< endl; - break; - } - } - return 0; -} - - - - -/** tcp_dispatchthread - * wait for pending timers, incoming messages. dispatch them. - */ -void TCPMessenger::dispatch_entry() -{ - incoming_lock.Lock(); - while (!incoming.empty() || !incoming_stop) { - if (!incoming.empty()) { - // grab incoming messages - list in; - in.splice(in.begin(), incoming); - - assert(stat_disq == 0); - stat_disq = stat_inq; - stat_disqb = stat_inqb; - stat_inq = 0; - stat_inqb = 0; - - // drop lock while we deliver - //assert(stat_inq == incoming.size()); - incoming_lock.Unlock(); - - // dispatch! - while (!in.empty()) { - Message *m = in.front(); - in.pop_front(); - - stat_disq--; - stat_disqb -= m->get_payload().length(); - if (logger) { - logger->set("inq", stat_inq+stat_disq); - logger->set("inqb", stat_inqb+stat_disq); - logger->inc("dis"); - } - - dout(4) << g_clock.now() << " ---- '" << m->get_type_name() << - "' from " << MSG_ADDR_NICE(m->get_source()) << ':' << m->get_source_port() << - " to " << MSG_ADDR_NICE(m->get_dest()) << ':' << m->get_dest_port() << " ---- " - << m - << endl; - - dispatch(m); - } - - continue; - } - - // sleep - dout(DBL) << "dispatch: waiting for incoming messages" << endl; - incoming_cond.Wait(incoming_lock); - dout(DBL) << "dispatch: woke up" << endl; - } - incoming_lock.Unlock(); -} - - -void* tcp_dispatchthread(void*) -{ - dout(5) << "tcp_dispatchthread start pid " << getpid() << endl; - - while (1) { - // inq? - incoming_lock.Lock(); - - // done? - if (tcp_done && incoming.empty()) { - incoming_lock.Unlock(); - break; - } - - // wait? 
- if (incoming.empty()) { - // wait - dout(DBL) << "dispatch: incoming empty, waiting for incoming messages" << endl; - incoming_cond.Wait(incoming_lock); - dout(DBL) << "dispatch: woke up" << endl; - } - - // grab incoming messages - //dout(-20) << "dis1 stat_inq " << stat_inq << ", incoming " << incoming.size() << endl; - //assert(stat_inq == incoming.size()); - - list in; - in.splice(in.begin(), incoming); - - assert(stat_disq == 0); - stat_disq = stat_inq; - stat_disqb = stat_inqb; - stat_inq = 0; - stat_inqb = 0; - //assert(stat_inq == incoming.size()); - //dout(-20) << "dis2 stat_inq " << stat_inq << ", incoming " << incoming.size() << endl; - - // drop lock while we deliver - incoming_lock.Unlock(); - - // dispatch! - while (!in.empty()) { - Message *m = in.front(); - in.pop_front(); - - stat_disq--; - stat_disqb -= m->get_payload().length(); - if (logger) { - logger->set("inq", stat_inq+stat_disq); - logger->set("inqb", stat_inqb+stat_disq); - logger->inc("dis"); - } - - dout(DBL) << "dispatch doing " << *m << endl; - - // for rankserver? - if (m->get_type() == MSG_NS_CONNECTACK || // i just connected - m->get_dest() == MSG_ADDR_RANK(my_rank)) { - dout(DBL) << " giving to rankserver" << endl; - rankserver.dispatch(m); - continue; - } - - // ok - entity_name_t dest = m->get_dest(); - directory_lock.Lock(); - if (directory.count(dest)) { - Messenger *who = directory[ dest ]; - directory_lock.Unlock(); - - dout(4) << g_clock.now() << " ---- '" << m->get_type_name() << - "' from " << MSG_ADDR_NICE(m->get_source()) << ':' << m->get_source_port() << - " to " << MSG_ADDR_NICE(m->get_dest()) << ':' << m->get_dest_port() << " ---- " - << *m - << endl; - - who->dispatch(m); - } else { - directory_lock.Unlock(); - dout (1) << "---- i don't know who " << MSG_ADDR_NICE(dest) << " " << dest << " is." 
<< endl; - assert(0); - } - } - assert(stat_disq == 0); - - } - - - g_timer.shutdown(); - - dout(5) << "tcp_dispatchthread exiting loop" << endl; - return 0; -} - - -// start/stop mpi receiver thread (for unsolicited messages) -int tcpmessenger_start() -{ - dout(5) << "starting accept thread" << endl; - pthread_create(&listen_thread_id, - NULL, - tcp_acceptthread, - 0); - - dout(5) << "starting dispatch thread" << endl; - - // start a thread - pthread_create(&dispatch_thread_id, - NULL, - tcp_dispatchthread, - 0); - - - /* - dout(5) << "starting outgoing thread" << endl; - pthread_create(&out_thread_id, - NULL, - tcp_outthread, - 0); - */ - if (!g_conf.tcp_multi_out) - single_out_thread.create(); - return 0; -} - - -/* - * kick and wake up _loop (to pick up new outgoing message, or quit) - */ - -void tcpmessenger_kick_dispatch_loop() -{ - if (g_conf.tcp_multi_dispatch) { - assert(0); - // all of them - /*for (hash_map::iterator i = directory.begin(); - i != directory.end(); - i++) - i->second->dispatch_kick(); - */ - } else { - // just one - dout(DBL) << "kicking" << endl; - incoming_lock.Lock(); - dout(DBL) << "prekick" << endl; - incoming_cond.Signal(); - incoming_lock.Unlock(); - dout(DBL) << "kicked" << endl; - } -} - -/* -void tcpmessenger_kick_outgoing_loop() -{ - outgoing_lock.Lock(); - outgoing_cond.Signal(); - outgoing_lock.Unlock(); -} -*/ - - -// wait for thread to finish - -void tcpmessenger_wait() -{ - if (g_conf.tcp_multi_dispatch) { - // new way - incoming_lock.Lock(); - while (!tcp_done) - incoming_cond.Wait(incoming_lock); - incoming_lock.Unlock(); - } else { - // old way - dout(10) << "tcpmessenger_wait waking up dispatch loop" << endl; - tcpmessenger_kick_dispatch_loop(); - - void *returnval; - dout(10) << "tcpmessenger_wait waiting for thread to finished." << endl; - pthread_join(dispatch_thread_id, &returnval); - dout(10) << "tcpmessenger_wait thread finished." 
<< endl; - } -} - - - - -entity_name_t register_entity(entity_name_t addr) -{ - lookup_lock.Lock(); - - // prepare to wait - long id = ++regid; - Cond cond; - waiting_for_register_cond[id] = &cond; - - if (my_rank < 0) { - dout(DBL) << "register_entity don't know my rank, connecting" << endl; - - // connect to nameserver; discover my rank. - Message *m = new MNSConnect(listen_addr); - m->set_dest(MSG_ADDR_DIRECTORY, 0); - tcp_marshall(m); - OutThread *outt = tcp_lookup(m); - assert(outt); - tcp_send(m); - - // wait for reply - while (my_rank < 0) - waiting_for_rank.Wait(lookup_lock); - assert(my_rank > 0); - } - - // send req - dout(DBL) << "register_entity " << MSG_ADDR_NICE(addr) << endl; - Message *m = new MNSRegister(addr, my_rank, id); - m->set_dest(MSG_ADDR_DIRECTORY, 0); - tcp_marshall(m); - OutThread *outt = tcp_lookup(m); - assert(outt); - tcp_send(m); - - // wait? - while (!waiting_for_register_result.count(id)) - cond.Wait(lookup_lock); - - // get result, clean up - entity_name_t entity = waiting_for_register_result[id]; - waiting_for_register_result.erase(id); - waiting_for_register_cond.erase(id); - - dout(DBL) << "register_entity got " << MSG_ADDR_NICE(entity) << endl; - - lookup_lock.Unlock(); - - // ok! - return entity; -} - - - -/*********** - * Tcpmessenger class implementation - */ - - -TCPMessenger::TCPMessenger(entity_name_t myaddr) : - Messenger(myaddr), - dispatch_thread(this) -{ - if (myaddr != MSG_ADDR_DIRECTORY) { - // register! 
- myaddr = register_entity(myaddr); - } - - - // my address - set_myaddr( myaddr ); - - // register myself in the messenger directory - directory_lock.Lock(); - { - directory[myaddr] = this; - - stat_num++; - if (logger) logger->set("num", stat_num); - } - directory_lock.Unlock(); - - // register to execute timer events - //g_timer.set_messenger_kicker(new C_TCPKicker()); - // g_timer.set_messenger(this); -} - - -void TCPMessenger::ready() -{ - directory_lock.Lock(); - directory_ready.insert(get_myaddr()); - directory_lock.Unlock(); - - if (get_myaddr() != MSG_ADDR_DIRECTORY) { - // started! tell namer we are up and running. - lookup_lock.Lock(); - { - Message *m = new MGenericMessage(MSG_NS_STARTED); - m->set_source(get_myaddr(), 0); - m->set_dest(MSG_ADDR_DIRECTORY, 0); - tcp_marshall(m); - OutThread *outt = tcp_lookup(m); - assert(outt); - tcp_send(m); - } - lookup_lock.Unlock(); - } -} - - -TCPMessenger::~TCPMessenger() -{ - //delete logger; -} - -tcpaddr_t& TCPMessenger::get_tcpaddr() -{ - return listen_addr; -} - -void TCPMessenger::map_entity_rank(entity_name_t e, int r) -{ - lookup_lock.Lock(); - entity_rank[e] = r; - lookup_lock.Unlock(); -} - -void TCPMessenger::map_rank_addr(int r, tcpaddr_t a) -{ - lookup_lock.Lock(); - rank_addr[r] = a; - lookup_lock.Unlock(); -} - - -int TCPMessenger::get_dispatch_queue_len() -{ - return stat_inq+stat_disq; -} - - -int TCPMessenger::shutdown() -{ - dout(DBL) << "shutdown by " << MSG_ADDR_NICE(get_myaddr()) << endl; - - // dont' send unregistery from nsmessenger shutdown! - if (this != nsmessenger && - (my_rank > 0 || nsmessenger)) { - dout(DBL) << "sending unregister from " << MSG_ADDR_NICE(get_myaddr()) << " to ns" << endl; - send_message(new MGenericMessage(MSG_NS_UNREGISTER), - MSG_ADDR_DIRECTORY); - } - - // remove me from the directory - directory_lock.Lock(); - directory.erase(get_myaddr()); - - // last one? - bool lastone = directory.empty(); - //dout(1) << "lastone = " << lastone << " .. 
" << directory.size() << endl; - - - // or almost last one? - if (rankmessenger && directory.size() == 1) { - directory_lock.Unlock(); - tcpmessenger_stop_rankserver(); - directory_lock.Lock(); - } - - stat_num--; - if (logger) logger->set("num", stat_num); - - directory_lock.Unlock(); - - // last one? - if (lastone) { - dout(2) << "shutdown last tcpmessenger on rank " << my_rank << " shut down" << endl; - //pthread_t whoami = pthread_self(); - - // no more timer events - //g_timer.unset_messenger(); - - // close incoming sockets - //void *r; - for (map::iterator it = in_threads.begin(); - it != in_threads.end(); - it++) { - dout(DBL) << "closing reader on sd " << it->first << endl; - ::close(it->first); - //pthread_join(it->second, &r); - } - - if (g_conf.tcp_multi_dispatch) { - // kill off dispatch threads - dout(DBL) << "killing dispatch threads" << endl; - for (hash_map::iterator it = directory.begin(); - it != directory.end(); - it++) - it->second->dispatch_stop(); - } - - dout(DBL) << "setting tcp_done" << endl; - - // kick/kill incoming thread - incoming_lock.Lock(); - tcp_done = true; - incoming_cond.Signal(); - incoming_lock.Unlock(); - - // finish off outgoing thread - dout(10) << "waiting for outgoing to finish" << endl; - if (g_conf.tcp_multi_out) { - for (hash_map::iterator it = rank_out.begin(); - it != rank_out.end(); - it++) { - it->second->stop(); - delete it->second; - } - } else { - single_out_thread.stop(); - } - - - /* - - dout(15) << "whoami = " << whoami << ", thread = " << dispatch_thread_id << endl; - if (whoami == thread_id) { - // i am the event loop thread, just set flag! 
- dout(15) << " set tcp_done=true" << endl; - tcp_done = true; - } - */ - } - return 0; -} - - - - -/*** - * public messaging interface - */ - - -/* note: send_message _MUST_ be non-blocking */ -int TCPMessenger::send_message(Message *m, entity_name_t dest, int port, int fromport) -{ - // set envelope - m->set_source(get_myaddr(), fromport); - m->set_dest(dest, port); - m->set_lamport_send_stamp( get_lamport() ); - - dout(4) << "--> " << m->get_type_name() - << " from " << MSG_ADDR_NICE(m->get_source()) << ':' << m->get_source_port() - << " to " << MSG_ADDR_NICE(m->get_dest()) << ':' << m->get_dest_port() - << " ---- " << m - << endl; - - // local? - TCPMessenger *entity = 0; - directory_lock.Lock(); - if (directory.count(dest) && - directory_ready.count(dest)) entity = directory[dest]; - directory_lock.Unlock(); - - if (entity) { - // local! - ::incoming_lock.Lock(); - { - dout(20) << " queueing locally for " << dest << " " << m << endl; //", stat_inq " << stat_inq << ", incomign " << ::incoming.size() << endl; - //assert(stat_inq == ::incoming.size()); - ::incoming.push_back(m); - ::incoming_cond.Signal(); - stat_inq++; - //assert(stat_inq == ::incoming.size()); - //dout(-20) << " stat_inq " << stat_inq << ", incoming " << ::incoming.size() << endl; - stat_inqb += m->get_payload().length(); - } - ::incoming_lock.Unlock(); - } else { - // remote! 
- - if (g_conf.tcp_serial_marshall) - tcp_marshall(m); - - if (g_conf.tcp_serial_out) { - lookup_lock.Lock(); - // send in this thread - if (tcp_lookup(m)) - tcp_send(m); - lookup_lock.Unlock(); - } else { - lookup_lock.Lock(); - OutThread *outt = tcp_lookup(m); - lookup_lock.Unlock(); - - if (outt) outt->send(m); - } - } - - return 0; -} - - - - diff --git a/tags/20070517_before_mds_merge/msg/TCPMessenger.h b/tags/20070517_before_mds_merge/msg/TCPMessenger.h deleted file mode 100644 index 414e50f5fef87..0000000000000 --- a/tags/20070517_before_mds_merge/msg/TCPMessenger.h +++ /dev/null @@ -1,115 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __TCPMESSENGER_H -#define __TCPMESSENGER_H - -#include "Messenger.h" -#include "Dispatcher.h" -#include "common/Thread.h" - -#include "tcp.h" - -class Timer; - - -class TCPMessenger : public Messenger { - protected: - - //class Logger *logger; // for logging - - bool incoming_stop; - Mutex incoming_lock; - list incoming; - Cond incoming_cond; - - class DispatchThread : public Thread { - TCPMessenger *m; - public: - DispatchThread(TCPMessenger *_m) : m(_m) {} - void *entry() { - m->dispatch_entry(); - return 0; - } - } dispatch_thread; - - void dispatch_entry(); - -public: - void dispatch_start() { - incoming_stop = false; - dispatch_thread.create(); - } - /* void dispatch_kick() { - incoming_lock.Lock(); - incoming_cond.Signal(); - incoming_lock.Unlock(); - }*/ - void dispatch_stop() { - incoming_lock.Lock(); - incoming_stop = true; - incoming_cond.Signal(); - incoming_lock.Unlock(); - dispatch_thread.join(); - } - void dispatch_queue(Message *m) { - 
incoming_lock.Lock(); - incoming.push_back(m); - incoming_cond.Signal(); - incoming_lock.Unlock(); - } - - public: - TCPMessenger(entity_name_t myaddr); - ~TCPMessenger(); - - void ready(); - - tcpaddr_t& get_tcpaddr(); - void map_entity_rank(entity_name_t e, int r); - void map_rank_addr(int r, tcpaddr_t a); - - int get_dispatch_queue_len(); - - void callback_kick(); - - // init, shutdown MPI and associated event loop thread. - virtual int shutdown(); - - // message interface - virtual int send_message(Message *m, entity_name_t dest, int port=0, int fromport=0); -}; - -/** - * these are all ONE per process. - */ - -extern int tcpmessenger_lookup(char *str, tcpaddr_t& ta); - -extern int tcpmessenger_findns(tcpaddr_t &nsa); - -extern int tcpmessenger_init(); -extern int tcpmessenger_start(); // start thread -extern void tcpmessenger_wait(); // wait for thread to finish. -extern int tcpmessenger_shutdown(); // finalize MPI - -extern void tcpmessenger_start_nameserver(tcpaddr_t& ta); // on rank 0 -extern void tcpmessenger_stop_nameserver(); // on rank 0 -extern void tcpmessenger_start_rankserver(tcpaddr_t& ta); // on all ranks -extern void tcpmessenger_stop_rankserver(); // on all ranks - -extern int tcpmessenger_get_rank(); - - -#endif diff --git a/tags/20070517_before_mds_merge/msg/error.c b/tags/20070517_before_mds_merge/msg/error.c deleted file mode 100644 index 15cd16a2ca9da..0000000000000 --- a/tags/20070517_before_mds_merge/msg/error.c +++ /dev/null @@ -1,77 +0,0 @@ -#include -#include -#include -#include - -#include "include/error.h" - -#define EXIT_USAGE_ERROR -1 /* error codes for program exit */ -#define EXIT_SYSTEM_ERROR -2 -#define EXIT_GENERIC_ERROR -3 -#define MSGSIZ 1024 /* maximum error message length */ - -/* print usage error message and exit */ -void userror(const char *use, const char *fmt, ...) 
-{ - char msg[MSGSIZ]; - int len; - - va_list ap; - va_start(ap, fmt); - - len = vsnprintf(msg+len, MSGSIZ-len, fmt, ap); - len += snprintf(msg+len, MSGSIZ-len, "\n"); - len += snprintf(msg+len, MSGSIZ-len, use); - fprintf(stderr, "%s\n", msg); - exit(EXIT_USAGE_ERROR); - - va_end(ap); -} - -/* print system error message and exit */ -void syserror(const char *fmt, ...) -{ - char msg[MSGSIZ]; - int len; - - va_list ap; - va_start(ap, fmt); - - len = vsnprintf(msg+len, MSGSIZ-len, fmt, ap); - len += snprintf(msg+len, MSGSIZ-len, ": %s\n", strerror(errno)); - fprintf(stderr, "%s", msg); - exit(EXIT_SYSTEM_ERROR); - - va_end(ap); -} - -/* print error message and exit */ -void exiterror(const char *fmt, ...) -{ - char msg[MSGSIZ]; - int len; - - va_list ap; - va_start(ap, fmt); - - len = vsnprintf(msg+len, MSGSIZ-len, fmt, ap); - fprintf(stderr, "%s\n", msg); - exit(EXIT_GENERIC_ERROR); - - va_end(ap); -} - -/* print error message */ -void error(const char *fmt, ...) -{ - char msg[MSGSIZ]; - int len; - - va_list ap; - va_start(ap, fmt); - - len = vsnprintf(msg+len, MSGSIZ-len, fmt, ap); - fprintf(stderr, "%s\n", msg); - - va_end(ap); -} diff --git a/tags/20070517_before_mds_merge/msg/mpistarter.cc b/tags/20070517_before_mds_merge/msg/mpistarter.cc deleted file mode 100644 index 79391f78210d2..0000000000000 --- a/tags/20070517_before_mds_merge/msg/mpistarter.cc +++ /dev/null @@ -1,62 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include - -#include "TCPMessenger.h" - -/* - * start up TCPMessenger via MPI. 
- */ - -pair mpi_bootstrap_tcp(int& argc, char**& argv) -{ - tcpmessenger_init(); - tcpmessenger_start(); - - // exchnage addresses with other nodes - MPI_Init(&argc, &argv); - - int mpi_world; - int mpi_rank; - MPI_Comm_size(MPI_COMM_WORLD, &mpi_world); - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - - //dout(1) << "i am " << mpi_rank << " of " << mpi_world << endl; - - // start up directory? - tcpaddr_t ta; - if (mpi_rank == 0) { - dout(30) << "i am rank 0, starting ns directory" << endl; - tcpmessenger_start_nameserver(ta); - } else { - memset(&ta, 0, sizeof(ta)); - } - - // distribute tcpaddr - int r = MPI_Bcast(&ta, sizeof(ta), MPI_CHAR, - 0, MPI_COMM_WORLD); - - dout(30) << "r = " << r << " ns tcpaddr is " << ta << endl; - tcpmessenger_start_rankserver(ta); - - MPI_Barrier(MPI_COMM_WORLD); - //g_clock.tare(); - MPI_Finalize(); - - return pair(mpi_rank, mpi_world); -} - - diff --git a/tags/20070517_before_mds_merge/msg/msg_types.h b/tags/20070517_before_mds_merge/msg/msg_types.h deleted file mode 100644 index 0b92df47020d0..0000000000000 --- a/tags/20070517_before_mds_merge/msg/msg_types.h +++ /dev/null @@ -1,186 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MSG_TYPES_H -#define __MSG_TYPES_H - -#include "include/types.h" -#include "tcp.h" - -// new typed msg_addr_t way! 
-class entity_name_t { - int _type; - int _num; - -public: - static const int TYPE_MON = 1; - static const int TYPE_MDS = 2; - static const int TYPE_OSD = 3; - static const int TYPE_CLIENT = 4; - - static const int NEW = -1; - - // cons - entity_name_t() : _type(0), _num(0) {} - entity_name_t(int t, int n) : _type(t), _num(n) {} - - int num() const { return _num; } - int type() const { return _type; } - const char *type_str() const { - switch (type()) { - case TYPE_MDS: return "mds"; - case TYPE_OSD: return "osd"; - case TYPE_MON: return "mon"; - case TYPE_CLIENT: return "client"; - default: return "unknown"; - } - } - - bool is_new() const { return num() == NEW; } - - bool is_client() const { return type() == TYPE_CLIENT; } - bool is_mds() const { return type() == TYPE_MDS; } - bool is_osd() const { return type() == TYPE_OSD; } - bool is_mon() const { return type() == TYPE_MON; } -}; - -inline bool operator== (const entity_name_t& l, const entity_name_t& r) { - return (l.type() == r.type()) && (l.num() == r.num()); } -inline bool operator!= (const entity_name_t& l, const entity_name_t& r) { - return (l.type() != r.type()) || (l.num() != r.num()); } -inline bool operator< (const entity_name_t& l, const entity_name_t& r) { - return (l.type() < r.type()) || (l.type() == r.type() && l.num() < r.num()); } - -inline std::ostream& operator<<(std::ostream& out, const entity_name_t& addr) { - //if (addr.is_namer()) return out << "namer"; - if (addr.is_new() || addr.num() < 0) - return out << addr.type_str() << "?"; - else - return out << addr.type_str() << addr.num(); -} - -namespace __gnu_cxx { - template<> struct hash< entity_name_t > - { - size_t operator()( const entity_name_t m ) const - { - static blobhash H; - return H((const char*)&m, sizeof(m)); - } - }; -} - -// get rid of these -#define MSG_ADDR_MDS(x) entity_name_t(entity_name_t::TYPE_MDS,x) -#define MSG_ADDR_OSD(x) entity_name_t(entity_name_t::TYPE_OSD,x) -#define MSG_ADDR_MON(x) 
entity_name_t(entity_name_t::TYPE_MON,x) -#define MSG_ADDR_CLIENT(x) entity_name_t(entity_name_t::TYPE_CLIENT,x) - -#define MSG_ADDR_RANK_NEW MSG_ADDR_RANK(entity_name_t::NEW) -#define MSG_ADDR_MDS_NEW MSG_ADDR_MDS(entity_name_t::NEW) -#define MSG_ADDR_OSD_NEW MSG_ADDR_OSD(entity_name_t::NEW) -#define MSG_ADDR_CLIENT_NEW MSG_ADDR_CLIENT(entity_name_t::NEW) - - -/* - * an entity's network address. - * includes a random value that prevents it from being reused. - * thus identifies a particular process instance. - * ipv4 for now. - */ -struct entity_addr_t { - __uint8_t ipq[4]; - __uint32_t port; - __uint32_t nonce; // bind time, or pid, or something unique! - - entity_addr_t() : port(0), nonce(0) { - ipq[0] = ipq[1] = ipq[2] = ipq[3] = 0; - } - - void set_addr(tcpaddr_t a) { - memcpy((char*)ipq, (char*)&a.sin_addr.s_addr, 4); - port = ntohs(a.sin_port); - } - void make_addr(tcpaddr_t& a) const { - memset(&a, 0, sizeof(a)); - a.sin_family = AF_INET; - memcpy((char*)&a.sin_addr.s_addr, (char*)ipq, 4); - a.sin_port = htons(port); - } -}; - -inline ostream& operator<<(ostream& out, const entity_addr_t &addr) -{ - return out << (int)addr.ipq[0] - << '.' << (int)addr.ipq[1] - << '.' << (int)addr.ipq[2] - << '.' << (int)addr.ipq[3] - << ':' << addr.port - << '.' 
<< addr.nonce; -} - -inline bool operator==(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) == 0; } -inline bool operator!=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) != 0; } -inline bool operator<(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) < 0; } -inline bool operator<=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) <= 0; } -inline bool operator>(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) > 0; } -inline bool operator>=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) >= 0; } - -namespace __gnu_cxx { - template<> struct hash< entity_addr_t > - { - size_t operator()( const entity_addr_t& x ) const - { - static blobhash H; - return H((const char*)&x, sizeof(x)); - } - }; -} - - -/* - * a particular entity instance - */ -struct entity_inst_t { - entity_name_t name; - entity_addr_t addr; - entity_inst_t() {} - entity_inst_t(entity_name_t n, const entity_addr_t& a) : name(n), addr(a) {} -}; - - -inline bool operator==(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) == 0; } -inline bool operator!=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) != 0; } -inline bool operator<(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) < 0; } -inline bool operator<=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) <= 0; } -inline bool operator>(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) > 0; } -inline bool operator>=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) >= 0; } - -namespace __gnu_cxx { - template<> struct hash< entity_inst_t > - { - size_t operator()( const entity_inst_t& x ) const - { - static blobhash H; - return H((const char*)&x, 
sizeof(x)); - } - }; -} - -inline ostream& operator<<(ostream& out, const entity_inst_t &i) -{ - return out << i.name << " " << i.addr; -} - - -#endif diff --git a/tags/20070517_before_mds_merge/msg/new_mpistarter.cc b/tags/20070517_before_mds_merge/msg/new_mpistarter.cc deleted file mode 100644 index fc9da720f19ee..0000000000000 --- a/tags/20070517_before_mds_merge/msg/new_mpistarter.cc +++ /dev/null @@ -1,43 +0,0 @@ -#include -#include "NewMessenger.h" - -/* - * start up NewMessenger via MPI. - */ - -pair mpi_bootstrap_new(int& argc, char**& argv) -{ - MPI_Init(&argc, &argv); - - int mpi_world; - int mpi_rank; - MPI_Comm_size(MPI_COMM_WORLD, &mpi_world); - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - - tcpaddr_t nsaddr; - memset(&nsaddr, 0, sizeof(nsaddr)); - - if (mpi_rank == 0) { - // i am root. - rank.my_rank = 0; - rank.start_rank(nsaddr); - nsaddr = rank.get_listen_addr(); - } - - int r = MPI_Bcast(&nsaddr, sizeof(nsaddr), MPI_CHAR, - 0, MPI_COMM_WORLD); - - dout(30) << "r = " << r << " ns tcpaddr is " << nsaddr << endl; - - if (mpi_rank != 0) { - rank.start_rank(nsaddr); - } - - MPI_Barrier(MPI_COMM_WORLD); - - //g_clock.tare(); - - MPI_Finalize(); - - return pair(mpi_rank, mpi_world); -} diff --git a/tags/20070517_before_mds_merge/msg/tcp.cc b/tags/20070517_before_mds_merge/msg/tcp.cc deleted file mode 100644 index 1a448a91cb2c6..0000000000000 --- a/tags/20070517_before_mds_merge/msg/tcp.cc +++ /dev/null @@ -1,87 +0,0 @@ - -#include "tcp.h" - -/****************** - * tcp crap - */ - -bool tcp_read(int sd, char *buf, int len) -{ - while (len > 0) { - int got = ::recv( sd, buf, len, 0 ); - if (got == 0) { - dout(18) << "tcp_read socket " << sd << " closed" << endl; - return false; - } - if (got < 0) { - dout(18) << "tcp_read bailing with " << got << endl; - return false; - } - assert(got >= 0); - len -= got; - buf += got; - //dout(DBL) << "tcp_read got " << got << ", " << len << " left" << endl; - } - return true; -} - -int tcp_write(int sd, char *buf, int 
len) -{ - //dout(DBL) << "tcp_write writing " << len << endl; - assert(len > 0); - while (len > 0) { - int did = ::send( sd, buf, len, 0 ); - if (did < 0) { - dout(1) << "tcp_write error did = " << did << " errno " << errno << " " << strerror(errno) << endl; - //cerr << "tcp_write error did = " << did << " errno " << errno << " " << strerror(errno) << endl; - } - //assert(did >= 0); - if (did < 0) return did; - len -= did; - buf += did; - //dout(DBL) << "tcp_write did " << did << ", " << len << " left" << endl; - } - return 0; -} - - -int tcp_hostlookup(char *str, tcpaddr_t& ta) -{ - char *host = str; - char *port = 0; - - for (int i=0; str[i]; i++) { - if (str[i] == ':') { - port = str+i+1; - str[i] = 0; - break; - } - } - if (!port) { - cerr << "addr '" << str << "' doesn't look like 'host:port'" << endl; - return -1; - } - //cout << "host '" << host << "' port '" << port << "'" << endl; - - int iport = atoi(port); - - struct hostent *myhostname = gethostbyname( host ); - if (!myhostname) { - cerr << "host " << host << " not found" << endl; - return -1; - } - - memset(&ta, 0, sizeof(ta)); - - //cout << "addrtype " << myhostname->h_addrtype << " len " << myhostname->h_length << endl; - - ta.sin_family = myhostname->h_addrtype; - memcpy((char *)&ta.sin_addr, - myhostname->h_addr, - myhostname->h_length); - ta.sin_port = iport; - - cout << "lookup '" << host << ":" << port << "' -> " << ta << endl; - - return 0; -} diff --git a/tags/20070517_before_mds_merge/msg/tcp.h b/tags/20070517_before_mds_merge/msg/tcp.h deleted file mode 100644 index 65043cda8e2ac..0000000000000 --- a/tags/20070517_before_mds_merge/msg/tcp.h +++ /dev/null @@ -1,37 +0,0 @@ -#ifndef __TCP_H -#define __TCP_H - -#include -#include -#include -#include - -typedef struct sockaddr_in tcpaddr_t; - -using std::ostream; - -inline ostream& operator<<(ostream& out, const tcpaddr_t &a) -{ - unsigned char addr[4]; - memcpy((char*)addr, (char*)&a.sin_addr.s_addr, 4); - out << (unsigned)addr[0] << "." 
- << (unsigned)addr[1] << "." - << (unsigned)addr[2] << "." - << (unsigned)addr[3] << ":" - << ntohs(a.sin_port); - return out; -} - -extern bool tcp_read(int sd, char *buf, int len); -extern int tcp_write(int sd, char *buf, int len); -extern int tcp_hostlookup(char *str, tcpaddr_t& ta); - -inline bool operator==(const tcpaddr_t& a, const tcpaddr_t& b) { - return strncmp((const char*)&a, (const char*)&b, sizeof(a)) == 0; -} -inline bool operator!=(const tcpaddr_t& a, const tcpaddr_t& b) { - return strncmp((const char*)&a, (const char*)&b, sizeof(a)) != 0; -} - - -#endif diff --git a/tags/20070517_before_mds_merge/newsyn.cc b/tags/20070517_before_mds_merge/newsyn.cc deleted file mode 100644 index 9ec409c9c3e7c..0000000000000 --- a/tags/20070517_before_mds_merge/newsyn.cc +++ /dev/null @@ -1,419 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include -#include -#include -using namespace std; - -#include - -#include "config.h" - -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "mon/Monitor.h" -#include "client/Client.h" -#include "client/SyntheticClient.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - -#define NUMMDS g_conf.num_mds -#define NUMOSD g_conf.num_osd -#define NUMCLIENT g_conf.num_client - -class C_Test : public Context { -public: - void finish(int r) { - cout << "C_Test->finish(" << r << ")" << endl; - } -}; - - -/* - * start up NewMessenger via MPI. 
- */ -#include - -pair mpi_bootstrap_new(int& argc, char**& argv, MonMap *monmap) -{ - MPI_Init(&argc, &argv); - - int mpi_world; - int mpi_rank; - MPI_Comm_size(MPI_COMM_WORLD, &mpi_world); - MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); - - // first, synchronize clocks. - if (g_conf.clock_tare) { - if (1) { - // use an MPI barrier. probably not terribly precise. - MPI_Barrier(MPI_COMM_WORLD); - g_clock.tare(); - } else { - // use wall clock; assume NTP has all nodes synchronized already. - // FIXME someday: this hangs for some reason. whatever. - utime_t z = g_clock.now(); - MPI_Bcast( &z, sizeof(z), MPI_CHAR, - 0, MPI_COMM_WORLD); - cout << "z is " << z << endl; - g_clock.tare(z); - } - } - - // start up all monitors at known addresses. - entity_inst_t moninst[mpi_world]; // only care about first g_conf.num_mon of these. - - rank.start_rank(); // bind and listen - - if (mpi_rank < g_conf.num_mon) { - moninst[mpi_rank].addr = rank.my_addr; - moninst[mpi_rank].name = MSG_ADDR_MON(mpi_rank); - - //cerr << mpi_rank << " at " << rank.get_listen_addr() << endl; - } - - MPI_Gather( &moninst[mpi_rank], sizeof(entity_inst_t), MPI_CHAR, - moninst, sizeof(entity_inst_t), MPI_CHAR, - 0, MPI_COMM_WORLD); - - if (mpi_rank == 0) { - for (int i=0; imon_inst[i] = moninst[i]; - } - } - - - // distribute monmap - bufferlist bl; - if (mpi_rank == 0) { - monmap->encode(bl); - monmap->write(".ceph_monmap"); - } else { - int l = g_conf.num_mon * 1000; // nice'n big. - bufferptr bp(l); - bl.append(bp); - } - - MPI_Bcast(bl.c_str(), bl.length(), MPI_CHAR, - 0, MPI_COMM_WORLD); - - if (mpi_rank > 0) { - monmap->decode(bl); - } - - // wait for everyone! 
- MPI_Barrier(MPI_COMM_WORLD); - - return pair(mpi_rank, mpi_world); -} - -utime_t tick_start; -int tick_count = 0; - -class C_Tick : public Context { -public: - void finish(int) { - utime_t now = g_clock.now() - tick_start; - dout(0) << "tick +" << g_conf.tick << " -> " << now << " (" << tick_count << ")" << endl; - tick_count += g_conf.tick; - utime_t next = tick_start; - next.sec_ref() += tick_count; - g_timer.add_event_at(next, new C_Tick); - } -}; - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << endl; - exit(1); - } -}; - -class C_Debug : public Context { - public: - void finish(int) { - int size = &g_conf.debug_after - &g_conf.debug; - memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size); - dout(0) << "debug_after flipping debug settings" << endl; - } -}; - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - map kill_osd_after; - if (1) { - vector nargs; - for (unsigned i=0; i nargs; - for (unsigned i=0; i mpiwho = mpi_bootstrap_new(argc, argv, monmap); - int myrank = mpiwho.first; - int world = mpiwho.second; - - int need = 0; - if (g_conf.ms_skip_rank0) need++; - need += NUMMDS; - if (g_conf.ms_stripe_osds) - need++; - else - need += NUMOSD; - if (NUMCLIENT) { - if (!g_conf.ms_overlay_clients) - need += 1; - } - assert(need <= world); - - if (myrank == 0) - cerr << "nummds " << NUMMDS << " numosd " << NUMOSD << " numclient " << NUMCLIENT << " .. need " << need << ", have " << world << endl; - - - char hostname[100]; - gethostname(hostname,100); - int pid = getpid(); - - int started = 0; - - //if (myrank == 0) g_conf.debug = 20; - - // create mon - if (myrank < g_conf.num_mon) { - Monitor *mon = new Monitor(myrank, rank.register_entity(MSG_ADDR_MON(myrank)), monmap); - mon->init(); - } - - - // wait for monitors to start. - MPI_Barrier(MPI_COMM_WORLD); - - // okay, home free! 
- MPI_Finalize(); - - - // create mds - map mds; - map mdsosd; - for (int i=0; iinit(); - started++; - - if (g_conf.mds_local_osd) { - mdsosd[i] = new OSD(i+10000, rank.register_entity(MSG_ADDR_OSD(i+10000)), monmap); - mdsosd[i]->init(); - } - } - - // create osd - map osd; - int max_osd_nodes = world - NUMMDS - g_conf.ms_skip_rank0; // assumes 0 clients, if we stripe. - int osds_per_node = (NUMOSD-1)/max_osd_nodes + 1; - for (int i=0; iinit(); - started++; - } - - if (g_conf.ms_overlay_clients) sleep(5); - - // create client - int skip_osd = NUMOSD; - if (g_conf.ms_overlay_clients) - skip_osd = 0; // put clients with osds too! - int client_nodes = world - NUMMDS - skip_osd - g_conf.ms_skip_rank0; - int clients_per_node = 1; - if (NUMCLIENT && client_nodes > 0) clients_per_node = (NUMCLIENT-1) / client_nodes + 1; - set clientlist; - map client;//[NUMCLIENT]; - map syn;//[NUMCLIENT]; - int nclients = 0; - for (int i=0; iinit(); - started++; - - syn[i] = new SyntheticClient(client[i]); - - client[i]->mount(); - nclients++; - } - - if (!clientlist.empty()) dout(2) << "i have " << clientlist << endl; - - for (set::iterator it = clientlist.begin(); - it != clientlist.end(); - it++) { - int i = *it; - - //cerr << "starting synthetic client" << i << " on rank " << myrank << endl; - syn[i]->start_thread(); - - } - if (nclients) { - cerr << nclients << " clients at " << rank.my_addr << " " << hostname << "." << pid << endl; - } - - for (set::iterator it = clientlist.begin(); - it != clientlist.end(); - it++) { - int i = *it; - - // cout << "waiting for synthetic client" << i << " to finish" << endl; - syn[i]->join_thread(); - delete syn[i]; - - client[i]->unmount(); - //cout << "client" << i << " unmounted" << endl; - client[i]->shutdown(); - - delete client[i]; - } - - - if (myrank && !started) { - //dout(1) << "IDLE" << endl; - cerr << "idle at " << rank.my_addr << " " << hostname << "." 
<< pid << endl; - //rank.stop_rank(); - } - - // wait for everything to finish - rank.wait(); - - if (started) cerr << "newsyn finishing" << endl; - - return 0; // whatever, cleanup hangs sometimes (stopping ebofs threads?). - - - // cleanup - for (map::iterator i = mds.begin(); i != mds.end(); i++) - delete i->second; - for (map::iterator i = mdsosd.begin(); i != mdsosd.end(); i++) - delete i->second; - for (map::iterator i = osd.begin(); i != osd.end(); i++) - delete i->second; - /* - for (map::iterator i = client.begin(); i != client.end(); i++) - delete i->second; - for (map::iterator i = syn.begin(); i != syn.end(); i++) - delete i->second; - */ - /* - for (int i=0; i - -Ceph - scalable distributed file system - -This is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License version 2.1, as published by the Free Software -Foundation. See file COPYING. */ - - -#include -#include -#include -#include "OSBDB.h" -#include "common/Timer.h" - -using namespace std; - -#undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_bdbstore) cout << "bdbstore(" << device << ")@" << __LINE__ << "." -#undef derr -#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_bdbstore) cerr << "bdbstore(" << device << ")@" << __LINE__ << "." - -#define CLEANUP(onsafe) do { \ - dout(6) << "DELETE " << hex << onsafe << dec << endl; \ - delete onsafe; \ - } while (0) -#define COMMIT(onsafe) do { \ - dout(6) << "COMMIT " << hex << onsafe << dec << endl; \ - sync(onsafe); \ - } while (0) - - // Have a lock, already. - -class scoped_lock -{ -private: - Mutex *m; -public: - scoped_lock(Mutex *m) : m(m) { m->Lock(); } - ~scoped_lock() { m->Unlock(); } -}; - - // Utilities. - -// Starting off with my own bsearch; mail reader to follow... - -// Perform a binary search on a sorted array, returning the insertion -// point for key, or key if it is exactly found. 
In other words, this -// will return a pointer to the element that will come after key if -// key were to be inserted into the sorted array. -// -// Requires that T have < and > operators defined. -template -uint32_t binary_search (T *array, size_t size, T key) -{ - int low = 0; - int high = size; - int p = (low + high) / 2; - - while (low < high - 1) - { - if (array[p] > key) - { - high = p; - } - else if (array[p] < key) - { - low = p; - } - else - return p; - - p = (low + high) / 2; - } - - if (array[p] < key) - p++; - else if (array[p] > key && p > 0) - p--; - return p; -} - - // Management. - -DbEnv *OSBDB::getenv () -{ - DbEnv *envp = new DbEnv (DB_CXX_NO_EXCEPTIONS); - if (g_conf.debug > 1 || g_conf.debug_bdbstore > 1) - envp->set_error_stream (&std::cerr); - if (g_conf.debug > 2 || g_conf.debug_bdbstore > 2) - envp->set_message_stream (&std::cout); - envp->set_flags (DB_LOG_INMEMORY, 1); - //env->set_flags (DB_DIRECT_DB, 1); - int env_flags = (DB_CREATE - | DB_THREAD - //| DB_INIT_LOCK - | DB_INIT_MPOOL - //| DB_INIT_TXN - //| DB_INIT_LOG - | DB_PRIVATE); - if (envp->open (NULL, env_flags, 0) != 0) - { - std::cerr << "failed to open environment " << std::endl; - assert(0); - } - return envp; -} - -int OSBDB::opendb(DBTYPE type, int flags, bool new_env) -{ - env = getenv(); - db = new Db(env, 0); - db->set_error_stream (&std::cerr); - db->set_message_stream (&std::cout); - db->set_flags (0); - if (!g_conf.bdbstore_btree) - { - if (g_conf.bdbstore_pagesize > 0) - db->set_pagesize (g_conf.bdbstore_pagesize); - if (g_conf.bdbstore_ffactor > 0 && g_conf.bdbstore_nelem > 0) - { - db->set_h_ffactor (g_conf.bdbstore_ffactor); - db->set_h_nelem (g_conf.bdbstore_nelem); - } - } - if (g_conf.bdbstore_cachesize > 0) - { - db->set_cachesize (0, g_conf.bdbstore_cachesize, 0); - } - - flags = flags | DB_THREAD; - if (transactional) - flags = flags | DB_AUTO_COMMIT; - - int ret; - if ((ret = db->open (NULL, device.c_str(), NULL, type, flags, 0)) != 0) - { - derr(1) << 
"failed to open database: " << device << ": " - << db_strerror(ret) << std::endl; - return -EINVAL; - } - opened = true; - return 0; -} - -int OSBDB::mount() -{ - dout(2) << "mount " << device << endl; - - if (mounted) - { - dout(4) << "..already mounted" << endl; - return 0; - } - - if (!opened) - { - int ret; - if ((ret = opendb ()) != 0) - { - dout(4) << "..returns " << ret << endl; - return ret; - } - } - - // XXX Do we want anything else in the superblock? - - Dbt key (OSBDB_SUPERBLOCK_KEY, 1); - stored_superblock super; - Dbt value (&super, sizeof (super)); - value.set_dlen (sizeof (super)); - value.set_ulen (sizeof (super)); - value.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - - if (db->get (NULL, &key, &value, 0) != 0) - { - dout(4) << "..get superblock fails" << endl; - return -EINVAL; // XXX how to say "badly formed fs?" - } - - dout(3) << ".mount " << super << endl; - - if (super.version != OSBDB_THIS_VERSION) - { - dout(4) << "version mismatch (" << super.version << ")" << endl; - return -EINVAL; - } - - DBTYPE t; - db->get_type (&t); - - if (t == DB_BTREE) - { - u_int32_t minkey; - u_int32_t flags; - db->get_bt_minkey (&minkey); - db->get_flags (&flags); - dout(1) << "mounted version " << OSBDB_THIS_VERSION << "; Btree; " - << "min keys per page: " << minkey << "; flags: " - << hex << flags << dec << endl; - cout << dec; - } - else - { - u_int32_t ffactor; - u_int32_t nelem; - u_int32_t flags; - db->get_h_ffactor (&ffactor); - db->get_h_nelem (&nelem); - db->get_flags (&flags); - dout(1) << "mounted version " << OSBDB_THIS_VERSION << "; Hash; " - << "fill factor: " << ffactor - << " table size: " << nelem << "; flags: " - << hex << flags << dec << endl; - cout << dec; - } - - mounted = true; - dout(4) << "..mounted" << endl; - return 0; -} - -int OSBDB::umount() -{ - if (!mounted) - return -EINVAL; - - dout(2) << "umount" << endl; - - int ret; - if (opened) - { - if (transactional) - { - env->log_flush (NULL); - if ((ret = env->lsn_reset 
(device.c_str(), 0)) != 0) - { - derr(1) << "lsn_reset: " << db_strerror (ret) << endl; - } - } - - db->sync (0); - - if ((ret = db->close (0)) != 0) - { - derr(1) << "close: " << db_strerror(ret) << endl; - return -EINVAL; - } - delete db; - db = NULL; - - if (env) - { - env->close (0); - delete env; - env = NULL; - } - } - mounted = false; - opened = false; - dout(4) << "..unmounted" << endl; - return 0; -} - -int OSBDB::mkfs() -{ - if (mounted) - return -EINVAL; - - dout(2) << "mkfs" << endl; - - string d = env_dir; - d += device; - unlink (d.c_str()); - - int ret; - if ((ret = opendb((g_conf.bdbstore_btree ? DB_BTREE : DB_HASH), - DB_CREATE, true)) != 0) - { - derr(1) << "failed to open database: " << device << ": " - << db_strerror(ret) << std::endl; - return -EINVAL; - } - opened = true; - dout(3) << "..opened " << device << endl; - - uint32_t c; - ret = db->truncate (NULL, &c, 0); - if (ret != 0) - { - derr(1) << "db truncate failed: " << db_strerror (ret) << endl; - return -EIO; // ??? - } - - Dbt key (OSBDB_SUPERBLOCK_KEY, 1); - struct stored_superblock sb; - sb.version = OSBDB_THIS_VERSION; - Dbt value (&sb, sizeof (sb)); - - dout(3) << "..writing superblock" << endl; - if ((ret = db->put (NULL, &key, &value, 0)) != 0) - { - derr(1) << "failed to write superblock: " << db_strerror (ret) - << endl; - return -EIO; - } - dout(3) << "..wrote superblock" << endl; - dout(4) << "..mkfs done" << endl; - return 0; -} - - // Objects. - -int OSBDB::pick_object_revision_lt(object_t& oid) -{ - // Not really needed. - dout(0) << "pick_object_revision_lt " << oid << endl; - return -ENOSYS; -} - -bool OSBDB::exists(object_t oid) -{ - dout(2) << "exists " << oid << endl; - struct stat st; - bool ret = (stat (oid, &st) == 0); - dout(4) << "..returns " << ret << endl; - return ret; -} - -int OSBDB::statfs (struct statfs *st) -{ - // Hacky? 
- if (::statfs (device.c_str(), st) != 0) - { - int ret = -errno; - derr(1) << "statfs returns " << ret << endl; - return ret; - } - st->f_type = OSBDB_MAGIC; - dout(4) << "..statfs OK" << endl; - return 0; -} - -int OSBDB::stat(object_t oid, struct stat *st) -{ - if (!mounted) - { - dout(4) << "not mounted!" << endl; - return -EINVAL; - } - - dout(2) << "stat " << oid << endl; - - object_inode_key ikey = new_object_inode_key(oid); - stored_object obj; - Dbt key (&ikey, sizeof_object_inode_key()); - Dbt value (&obj, sizeof (obj)); - value.set_flags (DB_DBT_USERMEM); - value.set_ulen (sizeof (obj)); - - dout(3) << " lookup " << ikey << endl; - int ret; - if ((ret = db->get (NULL, &key, &value, 0)) != 0) - { - derr(1) << " get returned " << ret << endl; - return -ENOENT; - } - - st->st_size = obj.length; - dout(3) << "stat length:" << obj.length << endl; - dout(4) << "..stat OK" << endl; - return 0; -} - -int OSBDB::remove(object_t oid, Context *onsafe) -{ - if (!mounted) - { - derr(1) << "not mounted!" 
<< endl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - dout(6) << "Context " << hex << onsafe << dec << endl; - scoped_lock __lock(&lock); - dout(2) << "remove " << oid << endl; - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - oid_t id; - mkoid(id, oid); - Dbt key (&id, sizeof (oid_t)); - int ret; - if ((ret = db->del (txn, &key, 0)) != 0) - { - derr(1) << ".del returned error: " << db_strerror (ret) << endl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - object_inode_key _ikey = new_object_inode_key (oid); - Dbt ikey (&_ikey, sizeof_object_inode_key()); - if ((ret = db->del (txn, &ikey, 0)) != 0) - { - derr(1) << ".del returned error: " << db_strerror (ret) << endl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - attrs_id aids = new_attrs_id (oid); - Dbt askey (&aids, sizeof_attrs_id()); - Dbt asval; - asval.set_flags (DB_DBT_MALLOC); - if (db->get (txn, &askey, &asval, 0) == 0) - { - // We have attributes; remove them. 
- stored_attrs *sap = (stored_attrs *) asval.get_data(); - auto_ptr sa (sap); - for (unsigned i = 0; i < sap->count; i++) - { - attr_id aid = new_attr_id (oid, sap->names[i].name); - Dbt akey (&aid, sizeof (aid)); - if ((ret = db->del (txn, &akey, 0)) != 0) - { - derr(1) << ".del returns error: " << db_strerror (ret) << endl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - } - if ((ret = db->del (txn, &askey, 0)) != 0) - { - derr(1) << ".del returns error: " << db_strerror (ret) << endl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - } - - // XXX check del return value - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - dout(4) << "..remove OK" << endl; - return 0; -} - -int OSBDB::truncate(object_t oid, off_t size, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << endl; - - if (!mounted) - { - derr(1) << "not mounted!" << endl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "truncate " << size << endl; - - if (size > 0xFFFFFFFF) - { - derr(1) << "object size too big!" 
<< endl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -ENOSPC; - } - - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - object_inode_key ikey = new_object_inode_key (oid); - stored_object obj; - Dbt key (&ikey, sizeof_object_inode_key()); - Dbt value (&obj, sizeof (obj)); - value.set_dlen (sizeof (obj)); - value.set_ulen (sizeof (obj)); - value.set_flags (DB_DBT_USERMEM); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - dout(4) << "..returns -ENOENT" << endl; - return -ENOENT; - } - - if (obj.length < size) - { - oid_t id; - mkoid (id, oid); - Dbt okey (&id, sizeof (oid_t)); - char b[] = { '\0' }; - Dbt newVal (b, 1); - newVal.set_doff ((size_t) size); - newVal.set_dlen (1); - newVal.set_ulen (1); - newVal.set_flags (DB_DBT_PARTIAL); - if (db->put (txn, &okey, &newVal, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".updating object failed" << endl; - return -EIO; - } - - obj.length = size; - value.set_ulen (sizeof (obj)); - if (db->put (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".updating object info failed" << endl; - return -EIO; - } - } - else if (obj.length > size) - { - obj.length = size; - Dbt tval (&obj, sizeof (obj)); - tval.set_ulen (sizeof (obj)); - tval.set_flags (DB_DBT_USERMEM); - if (db->put (txn, &key, &tval, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".updating object info failed" << endl; - return -EIO; - } - if (size == 0) - { - char x[1]; - oid_t id; - mkoid (id, oid); - Dbt okey (&id, sizeof (oid_t)); - Dbt oval (&x, 0); - if (db->put (txn, &okey, &oval, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".updating object failed" << endl; - return -EIO; - } - } - else - { - oid_t id; - mkoid (id, oid); - Dbt okey (&id, sizeof 
(oid_t)); - Dbt oval; - oval.set_flags (DB_DBT_MALLOC); - if (db->get (txn, &okey, &oval, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".getting old object failed" << endl; - return -EIO; - } - auto_ptr ovalPtr ((char *) oval.get_data()); - oval.set_size ((size_t) size); - oval.set_ulen ((size_t) size); - if (db->put (txn, &okey, &oval, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".putting new object failed" << endl; - return -EIO; - } - } - } - - if (txn) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..truncate OK" << endl; - return 0; -} - -int OSBDB::read(object_t oid, off_t offset, size_t len, bufferlist& bl) -{ - if (!mounted) - { - derr(1) << "not mounted!" << endl; - return -EINVAL; - } - - dout(2) << "read " << oid << " " << offset << " " - << len << endl; - - if (bl.length() < len) - { - int remain = len - bl.length(); - bufferptr ptr (remain); - bl.push_back(ptr); - } - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - object_inode_key _ikey = new_object_inode_key (oid); - stored_object obj; - Dbt ikey (&_ikey, sizeof_object_inode_key()); - Dbt ival (&obj, sizeof (obj)); - ival.set_flags (DB_DBT_USERMEM); - ival.set_ulen (sizeof(obj)); - - dout(3) << "..get " << _ikey << endl; - int ret; - if ((ret = db->get (txn, &ikey, &ival, 0)) != 0) - { - if (txn) - txn->abort(); - derr(1) << "get returned " << db_strerror (ret) << endl; - return -ENOENT; - } - - dout(3) << "..object has size " << obj.length << endl; - - if (offset == 0 && len >= obj.length) - { - len = obj.length; - dout(3) << "..doing full read of " << len << endl; - oid_t id; - mkoid (id, oid); - Dbt key (&id, sizeof (oid_t)); - Dbt value (bl.c_str(), len); - value.set_ulen (len); - value.set_flags (DB_DBT_USERMEM); - dout(3) << "..getting " << oid << endl; - if ((ret = db->get (txn, &key, &value, 0)) != 0) - { - derr(1) << ".get returned " 
<< db_strerror (ret) << endl; - if (txn) - txn->abort(); - return -EIO; - } - } - else - { - if (offset > obj.length) - { - dout(2) << "..offset out of range" << endl; - return 0; - } - if (offset + len > obj.length) - len = obj.length - (size_t) offset; - dout(3) << "..doing partial read of " << len << endl; - oid_t id; - mkoid (id, oid); - Dbt key (&id, sizeof (oid)); - Dbt value; - char *data = bl.c_str(); - dout(3) << ".bufferlist c_str returned " << ((void*) data) << endl; - value.set_data (data); - value.set_doff ((size_t) offset); - value.set_dlen (len); - value.set_ulen (len); - value.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - dout(3) << "..getting " << oid << endl; - if ((ret = db->get (txn, &key, &value, 0)) != 0) - { - derr(1) << ".get returned " << db_strerror (ret) << endl; - if (txn) - txn->abort(); - return -EIO; - } - } - - if (txn) - txn->commit (0); - dout(4) << "..read OK, returning " << len << endl; - return len; -} - -int OSBDB::write(object_t oid, off_t offset, size_t len, - bufferlist& bl, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << endl; - if (!mounted) - { - derr(1) << "not mounted!" << endl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "write " << oid << " " << offset << " " - << len << endl; - - if (offset > 0xFFFFFFFFL || offset + len > 0xFFFFFFFFL) - { - derr(1) << "object too big" << endl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -ENOSPC; - } - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (txn, &txn, 0); - - object_inode_key _ikey = new_object_inode_key (oid); - stored_object obj; - Dbt ikey (&_ikey, sizeof_object_inode_key()); - Dbt ival (&obj, sizeof (obj)); - ival.set_ulen (sizeof (obj)); - ival.set_flags (DB_DBT_USERMEM); - - int ret; - dout(3) << "..getting " << _ikey << endl; - if (db->get (txn, &ikey, &ival, 0) != 0) - { - dout(3) << "..writing new object" << endl; - - // New object. 
- obj.length = (size_t) offset + len; - dout(3) << "..mapping " << _ikey << " => " - << obj << endl; - if ((ret = db->put (txn, &ikey, &ival, 0)) != 0) - { - derr(1) << "..put returned " << db_strerror (ret) << endl; - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - oid_t id; - mkoid (id, oid); - Dbt key (&id, sizeof (oid_t)); - Dbt value (bl.c_str(), len); - if (offset == 0) // whole object - { - value.set_flags (DB_DBT_USERMEM); - value.set_ulen (len); - } - else - { - value.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - value.set_ulen (len); - value.set_doff ((size_t) offset); - value.set_dlen (len); - } - dout(3) << "..mapping " << oid << " => (" - << obj.length << " bytes)" << endl; - if ((ret = db->put (txn, &key, &value, 0)) != 0) - { - derr(1) << "..put returned " << db_strerror (ret) << endl; - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..write OK, returning " << len << endl; - return len; - } - - if (offset == 0 && len >= obj.length) - { - if (len != obj.length) - { - obj.length = len; - if ((ret = db->put (txn, &ikey, &ival, 0)) != 0) - { - derr(1) << " put returned " << db_strerror (ret) << endl; - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - } - oid_t id; - mkoid(id, oid); - Dbt key (&id, sizeof (oid_t)); - Dbt value (bl.c_str(), len); - if (db->put (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "..writing object failed!" << endl; - return -EIO; - } - } - else - { - if (offset + len > obj.length) - { - obj.length = (size_t) offset + len; - if (db->put (txn, &ikey, &ival, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "..writing object info failed!" 
<< endl; - return -EIO; - } - } - oid_t id; - mkoid(id, oid); - Dbt key (&id, sizeof (oid_t)); - Dbt value (bl.c_str(), len); - value.set_doff ((size_t) offset); - value.set_dlen (len); - value.set_ulen (len); - value.set_flags (DB_DBT_PARTIAL); - if (db->put (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "..writing object failed!" << endl; - return -EIO; - } - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..write OK, returning " << len << endl; - return len; -} - -int OSBDB::clone(object_t oid, object_t noid) -{ - if (!mounted) - { - derr(1) << "not mounted!" << endl; - return -EINVAL; - } - - dout(2) << "clone " << oid << ", " << noid << endl; - - if (exists (noid)) - { - dout(4) << "..target exists; returning -EEXIST" << endl; - return -EEXIST; - } - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - object_inode_key _ikey = new_object_inode_key (oid); - object_inode_key _nikey = new_object_inode_key (noid); - stored_object obj; - Dbt ikey (&_ikey, sizeof_object_inode_key()); - Dbt ival (&obj, sizeof (obj)); - Dbt nikey (&_nikey, sizeof_object_inode_key()); - ival.set_ulen (sizeof (obj)); - ival.set_flags (DB_DBT_USERMEM); - - oid_t id, nid; - mkoid(id, oid); - mkoid(nid, noid); - Dbt key (&id, sizeof (oid_t)); - Dbt nkey (&oid, sizeof (oid_t)); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - if (db->get (txn, &ikey, &ival, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << "..getting object info failed!" 
<< endl; - return -ENOENT; - } - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << "..getting original object failed" << endl; - return -ENOENT; - } - auto_ptr valueptr ((char *) value.get_data()); - - if (db->put (txn, &nikey, &ival, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << "..putting object info failed" << endl; - return -EIO; - } - if (db->put (txn, &nkey, &value, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << "..putting new object failed" << endl; - return -EIO; - } - - if (txn) - txn->commit (0); - - dout(4) << "..clone OK" << endl; - return 0; -} - - // Collections - -int OSBDB::list_collections(list& ls) -{ - if (!mounted) - { - derr(1) << "not mounted!" << endl; - return -EINVAL; - } - - dout(2) << "list_collections" << endl; - - Dbt key (COLLECTIONS_KEY, 1); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - if (db->get (NULL, &key, &value, 0) != 0) - { - dout(4) << "..no collections" << endl; - return 0; // no collections. - } - - auto_ptr sc ((stored_colls *) value.get_data()); - stored_colls *scp = sc.get(); - for (uint32_t i = 0; i < sc->count; i++) - ls.push_back (scp->colls[i]); - - dout(4) << "..list_collections returns " << scp->count << endl; - return scp->count; -} - -int OSBDB::create_collection(coll_t c, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << endl; - if (!mounted) - { - derr(1) << "not mounted" << endl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "create_collection " << hex << c << dec << endl; - - Dbt key (COLLECTIONS_KEY, 1); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - stored_colls *scp = NULL; - size_t sz = 0; - bool created = false; - if (db->get (txn, &key, &value, 0) != 0) - { - sz = sizeof (stored_colls) + sizeof (coll_t); - scp = (stored_colls *) malloc (sz); - scp->count = 0; - created = true; - } - else 
- { - scp = (stored_colls *) value.get_data(); - sz = value.get_size(); - } - - auto_ptr sc (scp); - int ins = 0; - if (scp->count > 0) - ins = binary_search (scp->colls, scp->count, c); - if (ins < scp->count && scp->colls[ins] == c) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".collection " << c << " already exists " << endl; - return -EEXIST; - } - - dout(3) << "..insertion point: " << ins << endl; - - // Make room for a new collection ID. - if (!created) - { - sz += sizeof (coll_t); - dout(3) << "..increase size to " << sz << endl; - stored_colls *scp2 = (stored_colls *) realloc (scp, sz); - sc.release (); - sc.reset (scp2); - scp = scp2; - } - - int n = (scp->count - ins) * sizeof (coll_t); - if (n > 0) - { - dout(3) << "..moving " << n << " bytes up" << endl; - memmove (&scp->colls[ins + 1], &scp->colls[ins], n); - } - scp->count++; - scp->colls[ins] = c; - - dout(3) << "..collections: " << scp << endl; - - // Put the modified collection list back. - { - Dbt value2 (scp, sz); - if (db->put (txn, &key, &value2, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".writing new collections list failed" << endl; - return -EIO; - } - } - - // Create the new collection. 
- { - stored_coll new_coll; - new_coll.count = 0; - Dbt coll_key (&c, sizeof (coll_t)); - Dbt coll_value (&new_coll, sizeof (stored_coll)); - if (db->put (txn, &coll_key, &coll_value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".writing new collection failed" << endl; - return -EIO; - } - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..create_collection OK" << endl; - return 0; -} - -int OSBDB::destroy_collection(coll_t c, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << endl; - if (!mounted) - { - derr(1) << "not mounted" << endl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "destroy_collection " << hex << c << dec << endl; - - Dbt key (COLLECTIONS_KEY, 1); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".collection list doesn't exist" << endl; - return -ENOENT; // XXX - } - - stored_colls *scp = (stored_colls *) value.get_data(); - auto_ptr valueBuf (scp); - if (scp->count == 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".collection " << c << " not listed" << endl; - return -ENOENT; - } - uint32_t ins = binary_search (scp->colls, scp->count, c); - dout(4) << "..insertion point is " << ins << endl; - if (ins >= scp->count || scp->colls[ins] != c) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".collection " << c << " not listed" << endl; - return -ENOENT; - } - - dout(4) << "..collections list is " << scp << endl; - - // Move the rest of the list down in memory, if needed. 
- if (ins < scp->count) - { - size_t n = scp->count - ins - 1; - dout(4) << "..shift list down " << n << endl; - memmove (&scp->colls[ins], &scp->colls[ins + 1], n); - } - - dout(4) << "..collections list is " << scp << endl; - - // Modify the record size to be one less. - Dbt nvalue (scp, value.get_size() - sizeof (coll_t)); - nvalue.set_flags (DB_DBT_USERMEM); - if (db->put (txn, &key, &nvalue, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".putting modified collection list failed" << endl; - return -EIO; - } - - // Delete the collection. - Dbt collKey (&c, sizeof (coll_t)); - if (db->del (txn, &collKey, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".deleting collection failed" << endl; - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - dout(4) << "..destroy_collection OK" << endl; - return 0; -} - -bool OSBDB::collection_exists(coll_t c) -{ - if (!mounted) - { - derr(1) << "not mounted" << endl; - return -EINVAL; - } - - dout(2) << "collection_exists " << hex << c << dec << endl; - - /*Dbt key (COLLECTIONS_KEY, 1); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - if (db->get (NULL, &key, &value, 0) != 0) - { - dout(4) << "..no collection list; return false" << endl; - return false; - } - - stored_colls *scp = (stored_colls *) value.get_data(); - auto_ptr sc (scp); - dout(5) << "..collection list is " << scp << endl; - if (scp->count == 0) - { - dout(4) << "..empty collection list; return false" << endl; - return false; - } - uint32_t ins = binary_search (scp->colls, scp->count, c); - dout(4) << "..insertion point is " << ins << endl; - - int ret = (scp->colls[ins] == c); - dout(4) << "..returns " << ret << endl; - return ret;*/ - - Dbt key (&c, sizeof (coll_t)); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - if (db->get (NULL, &key, &value, 0) != 0) - { - dout(4) << "..no collection, return 
false" << endl; - return false; - } - void *val = value.get_data(); - free (val); - dout(4) << "..collection exists; return true" << endl; - return true; -} - -int OSBDB::collection_stat(coll_t c, struct stat *st) -{ - if (!mounted) - { - derr(1) << "not mounted" << endl; - return -EINVAL; - } - - dout(2) << "collection_stat " << c << endl; - // XXX is this needed? - return -ENOSYS; -} - -int OSBDB::collection_add(coll_t c, object_t o, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << endl; - if (!mounted) - { - dout(2) << "not mounted" << endl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "collection_add " << hex << c << dec << " " << o << endl; - - Dbt key (&c, sizeof (coll_t)); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "failed to find collection" << endl; - return -ENOENT; - } - - size_t sz = value.get_size(); - stored_coll *scp = (stored_coll *) value.get_data(); - auto_ptr sc (scp); - - // Find the insertion point for the new object ID. - uint32_t ins = 0; - if (scp->count > 0) - { - ins = binary_search (scp->objects, scp->count, o); - // Already there? - if (ins < scp->count && scp->objects[ins] == o) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "collection already has object" << endl; - return -EEXIST; - } - } - - // Make room for the new value, and add it. 
- sz += sizeof (object_t); - scp = (stored_coll *) realloc (scp, sz); - sc.release(); - sc.reset (scp); - dout(3) << "..current collection: " << scp << endl; - if (ins < scp->count - 1) - { - size_t n = (scp->count - ins) * sizeof (object_t); - dout(3) << "..move up " << n << " bytes" << endl; - memmove (&scp->objects[ins + 1], &scp->objects[ins], n); - } - scp->count++; - scp->objects[ins] = o; - - dout(3) << "..collection: " << scp << endl; - - Dbt nvalue (scp, sz); - if (db->put (txn, &key, &nvalue, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "..putting modified collection failed" << endl; - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - dout(4) << "..collection add OK" << endl; - return 0; -} - -int OSBDB::collection_remove(coll_t c, object_t o, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << endl; - if (!mounted) - { - derr(1) << "not mounted" << endl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "collection_remove " << hex << c << dec << " " << o << endl; - - Dbt key (&c, sizeof (coll_t)); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - dout(1) << "..collection doesn't exist" << endl; - return -ENOENT; - } - - stored_coll *scp = (stored_coll *) value.get_data(); - auto_ptr sc (scp); - - dout(5) << "..collection is " << scp << endl; - if (scp->count == 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - dout(1) << "..collection is empty" << endl; - return -ENOENT; - } - uint32_t ins = binary_search (scp->objects, scp->count, o); - dout(4) << "..insertion point is " << ins << endl; - if (ins >= scp->count || scp->objects[ins] != o) - { 
- if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - dout(1) << "..object not in collection" << endl; - return -ENOENT; - } - - if (ins < scp->count - 1) - { - size_t n = (scp->count - ins - 1) * sizeof (object_t); - dout(5) << "..moving " << n << " bytes down" << endl; - memmove (&scp->objects[ins], &scp->objects[ins + 1], n); - } - scp->count--; - - dout(3) << "..collection " << scp << endl; - - Dbt nval (scp, value.get_size() - sizeof (object_t)); - if (db->put (txn, &key, &nval, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "..putting modified collection failed" << endl; - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - dout(4) << "..collection remove OK" << endl; - return 0; -} - -int OSBDB::collection_list(coll_t c, list& o) -{ - if (!mounted) - { - derr(1) << "not mounted" << endl; - return -EINVAL; - } - - Dbt key (&c, sizeof (coll_t)); - Dbt value; - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - return -ENOENT; - } - - stored_coll *scp = (stored_coll *) value.get_data(); - auto_ptr sc (scp); - for (uint32_t i = 0; i < scp->count; i++) - o.push_back (scp->objects[i]); - - if (txn != NULL) - txn->commit (0); - return 0; -} - - // Attributes - -int OSBDB::_setattr(object_t oid, const char *name, - const void *value, size_t size, Context *onsafe, - DbTxn *txn) -{ - dout(6) << "Context " << hex << onsafe << dec << endl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - if (strlen (name) >= OSBDB_MAX_ATTR_LEN) - { - derr(1) << "name too long: " << name << endl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -ENAMETOOLONG; - } - - scoped_lock __lock(&lock); - - // Add name to attribute list, if needed. 
- attrs_id aids = new_attrs_id (oid); - Dbt attrs_key (&aids, sizeof_attrs_id()); - Dbt attrs_val; - attrs_val.set_flags (DB_DBT_MALLOC); - stored_attrs *sap = NULL; - size_t sz = 0; - - dout(3) << " getting " << aids << endl; - if (db->get (txn, &attrs_key, &attrs_val, 0) != 0) - { - dout(2) << " first attribute" << endl; - sz = sizeof (stored_attrs); - sap = (stored_attrs *) malloc(sz); - sap->count = 0; - } - else - { - sz = attrs_val.get_size(); - sap = (stored_attrs *) attrs_val.get_data(); - dout(2) << "..add to list of " << sap->count << " attrs" << endl; - } - auto_ptr sa (sap); - - attr_name _name; - strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN); - - int ins = 0; - if (sap->count > 0) - ins = binary_search (sap->names, sap->count, _name); - dout(3) << "..insertion point is " << ins << endl; - if (sap->count == 0 || - (ins >= sap->count || strcmp (sap->names[ins].name, name) != 0)) - { - sz += sizeof (attr_name); - dout(3) << "..realloc " << ((void *) sap) << " to " - << dec << sz << endl; - sap = (stored_attrs *) realloc (sap, sz); - dout(3) << "..returns " << ((void *) sap) << endl; - sa.release (); - sa.reset (sap); - int n = (sap->count - ins) * sizeof (attr_name); - if (n > 0) - { - dout(3) << "..move " << n << " bytes from 0x" - << hex << (&sap->names[ins]) << " to 0x" - << hex << (&sap->names[ins+1]) << dec << endl; - memmove (&sap->names[ins+1], &sap->names[ins], n); - } - memset (&sap->names[ins], 0, sizeof (attr_name)); - strncpy (sap->names[ins].name, name, OSBDB_MAX_ATTR_LEN); - sap->count++; - - Dbt newAttrs_val (sap, sz); - newAttrs_val.set_ulen (sz); - newAttrs_val.set_flags (DB_DBT_USERMEM); - dout(3) << "..putting " << aids << endl; - if (db->put (txn, &attrs_key, &newAttrs_val, 0) != 0) - { - derr(1) << ".writing attributes list failed" << endl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - } - else - { - dout(3) << "..attribute " << name << " already exists" << endl; - } - - dout(5) << "..attributes list: " << sap << 
endl; - - // Add the attribute. - attr_id aid = new_attr_id (oid, name); - Dbt attr_key (&aid, sizeof (aid)); - Dbt attr_val ((void *) value, size); - dout(3) << "..writing attribute key " << aid << endl; - if (db->put (txn, &attr_key, &attr_val, 0) != 0) - { - derr(1) << ".writing attribute key failed" << endl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - dout(4) << "..setattr OK" << endl; - if (onsafe != NULL) - COMMIT(onsafe); - return 0; -} - -int OSBDB::setattr(object_t oid, const char *name, - const void *value, size_t size, - Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << endl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - dout(2) << "setattr " << oid << ":" << name << " => (" - << size << " bytes)" << endl; - int ret = _setattr (oid, name, value, size, onsafe, txn); - if (ret == 0) - { - if (txn != NULL) - txn->commit (0); - } - else - { - if (txn != NULL) - txn->abort(); - } - return ret; -} - -int OSBDB::setattrs(object_t oid, map& aset, - Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << endl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - map::iterator it; - for (it = aset.begin(); it != aset.end(); it++) - { - string name = it->first; - bufferptr value = it->second; - int ret = _setattr (oid, name.c_str(), value.c_str(), - value.length(), onsafe, txn); - if (ret != 0) - { - if (txn != NULL) - txn->abort(); - return ret; - } - } - - if (txn != NULL) - txn->commit (0); - return 0; -} - -int OSBDB::_getattr (object_t oid, const char *name, void *value, size_t size) -{ - if (!mounted) - return -EINVAL; - - dout(2) << "_getattr " << oid << " " << name << " " << size << endl; - - attr_id aid = new_attr_id (oid, name); - Dbt key (&aid, sizeof (aid)); - Dbt val 
(value, size); - val.set_ulen (size); - val.set_doff (0); - val.set_dlen (size); - val.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - - int ret; - if ((ret = db->get (NULL, &key, &val, 0)) != 0) - { - derr(1) << ".getting value failed: " << db_strerror (ret) << endl; - return -ENOENT; - } - - dout(4) << ".._getattr OK; returns " << val.get_size() << endl; - return val.get_size(); -} - -int OSBDB::getattr(object_t oid, const char *name, void *value, size_t size) -{ - if (!mounted) - return -EINVAL; - - return _getattr (oid, name, value, size); -} - -int OSBDB::getattrs(object_t oid, map& aset) -{ - if (!mounted) - return -EINVAL; - - for (map::iterator it = aset.begin(); - it != aset.end(); it++) - { - int ret = _getattr (oid, (*it).first.c_str(), - (*it).second.c_str(), - (*it).second.length()); - if (ret < 0) - return ret; - } - return 0; -} - -int OSBDB::rmattr(object_t oid, const char *name, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << endl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "rmattr " << oid << " " << name << endl; - - attrs_id aids = new_attrs_id (oid); - Dbt askey (&aids, sizeof_attrs_id()); - Dbt asvalue; - asvalue.set_flags (DB_DBT_MALLOC); - - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &askey, &asvalue, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -ENOENT; - } - - stored_attrs *sap = (stored_attrs *) asvalue.get_data(); - auto_ptr sa (sap); - - dout(5) << "..attributes list " << sap << endl; - - if (sap->count == 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".empty attribute list" << endl; - return -ENOENT; - } - - attr_name _name; - memset(&_name, 0, sizeof (_name)); - strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN); - int ins = binary_search (sap->names, sap->count, _name); - dout(4) 
<< "..insertion point is " << ins << endl; - if (ins >= sap->count || strcmp (sap->names[ins].name, name) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".attribute not found in list" << endl; - return -ENOENT; - } - - // Shift the later elements down by one, if needed. - int n = (sap->count - ins) * sizeof (attr_name); - if (n > 0) - { - dout(4) << "..shift down by " << n << endl; - memmove (&(sap->names[ins]), &(sap->names[ins + 1]), n); - } - sap->count--; - dout(5) << "..attributes list now " << sap << endl; - - asvalue.set_size(asvalue.get_size() - sizeof (attr_name)); - int ret; - if ((ret = db->put (txn, &askey, &asvalue, 0)) != 0) - { - derr(1) << "put stored_attrs " << db_strerror (ret) << endl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - // Remove the attribute. - attr_id aid = new_attr_id (oid, name); - Dbt key (&aid, sizeof (aid)); - if ((ret = db->del (txn, &key, 0)) != 0) - { - derr(1) << "deleting " << aid << ": " << db_strerror(ret) << endl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - dout(4) << "..rmattr OK" << endl; - return 0; -} - -int OSBDB::listattr(object_t oid, char *attrs, size_t size) -{ - if (!mounted) - return -EINVAL; - - dout(2) << "listattr " << oid << endl; - - attrs_id aids = new_attrs_id (oid); - Dbt key (&aids, sizeof_attrs_id()); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - // XXX Transactions for read atomicity??? 
- - int ret; - if ((ret = db->get (NULL, &key, &value, 0)) != 0) - { - derr(1) << "fetching " << aids << ": " << db_strerror (ret) - << endl; - return -ENOENT; - } - - stored_attrs *attrsp = (stored_attrs *) value.get_data(); - auto_ptr _attrs (attrsp); - size_t s = 0; - char *p = attrs; - for (unsigned i = 0; i < attrsp->count && s < size; i++) - { - int n = MIN (OSBDB_MAX_ATTR_LEN, - MIN (strlen (attrsp->names[i].name), size - s - 1)); - strncpy (p, attrsp->names[i].name, n); - p[n] = '\0'; - p = p + n + 1; - } - - dout(4) << "listattr OK" << endl; - return 0; -} - - // Collection attributes. - -int OSBDB::collection_setattr(coll_t cid, const char *name, - const void *value, size_t size, - Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << endl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "collection_setattr " << hex << cid << dec << " " << name - << " (" << size << " bytes)" << endl; - if (strlen (name) >= OSBDB_MAX_ATTR_LEN) - { - derr(1) << "name too long" << endl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -ENAMETOOLONG; - } - - // Add name to attribute list, if needed. 
- coll_attrs_id aids = new_coll_attrs_id (cid); - Dbt attrs_key (&aids, sizeof_coll_attrs_id()); - Dbt attrs_val; - attrs_val.set_flags (DB_DBT_MALLOC); - stored_attrs *sap = NULL; - size_t sz = 0; - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - dout(3) << " getting " << aids << endl; - if (db->get (txn, &attrs_key, &attrs_val, 0) != 0) - { - dout(2) << " first attribute" << endl; - sz = sizeof (stored_attrs); - sap = (stored_attrs *) malloc(sz); - sap->count = 0; - } - else - { - sz = attrs_val.get_size(); - sap = (stored_attrs *) attrs_val.get_data(); - dout(2) << " add to list of " << sap->count << " attrs" << endl; - } - auto_ptr sa (sap); - - attr_name _name; - strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN); - - int ins = 0; - if (sap->count > 0) - ins = binary_search (sap->names, sap->count, _name); - dout(3) << " insertion point is " << ins << endl; - if (ins >= sap->count || strcmp (sap->names[ins].name, name) != 0) - { - sz += sizeof (attr_name); - dout(3) << " realloc " << hex << ((void *) sap) << " to " - << dec << sz << endl; - sap = (stored_attrs *) realloc (sap, sz); - dout(3) << " returns " << hex << ((void *) sap) << dec << endl; - sa.release (); - sa.reset (sap); - int n = (sap->count - ins) * sizeof (attr_name); - if (n > 0) - { - dout(3) << " move " << n << " bytes from 0x" - << hex << (&sap->names[ins]) << " to 0x" - << hex << (&sap->names[ins+1]) << dec << endl; - memmove (&sap->names[ins+1], &sap->names[ins], n); - } - memset (&sap->names[ins], 0, sizeof (attr_name)); - strncpy (sap->names[ins].name, name, OSBDB_MAX_ATTR_LEN); - sap->count++; - - Dbt newAttrs_val (sap, sz); - newAttrs_val.set_ulen (sz); - newAttrs_val.set_flags (DB_DBT_USERMEM); - dout(3) << " putting " << aids << endl; - if (db->put (txn, &attrs_key, &newAttrs_val, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".putting new attributes failed" << endl; - return -EIO; - } - } - else - { - 
dout(3) << "..attribute " << name << " already exists" << endl; - } - - dout(3) << "..attributes list: " << sap << endl; - - // Add the attribute. - coll_attr_id aid = new_coll_attr_id (cid, name); - Dbt attr_key (&aid, sizeof (aid)); - Dbt attr_val ((void *) value, size); - dout(3) << " writing attribute key " << aid << endl; - if (db->put (txn, &attr_key, &attr_val, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".putting attribute failed" << endl; - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..collection setattr OK" << endl; - return 0; -} - -int OSBDB::collection_rmattr(coll_t cid, const char *name, - Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << endl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "collection_rmattr " << hex << cid << dec - << " " << name << endl; - - coll_attrs_id aids = new_coll_attrs_id (cid); - Dbt askey (&aids, sizeof_coll_attrs_id()); - Dbt asvalue; - asvalue.set_flags (DB_DBT_MALLOC); - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &askey, &asvalue, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".no attributes list" << endl; - return -ENOENT; - } - - stored_attrs *sap = (stored_attrs *) asvalue.get_data(); - auto_ptr sa (sap); - - dout(5) << "..attributes list " << sap << endl; - if (sap->count == 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".empty attributes list" << endl; - return -ENOENT; - } - - attr_name _name; - memset(&_name, 0, sizeof (_name)); - strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN); - int ins = binary_search (sap->names, sap->count, _name); - if (ins >= sap->count || strcmp (sap->names[ins].name, name) != 0) - { - if (txn != NULL) - 
txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".attribute not listed" << endl; - return -ENOENT; - } - - // Shift the later elements down by one, if needed. - int n = (sap->count - ins) * sizeof (attr_name); - if (n > 0) - { - dout(4) << "..shift down by " << n << endl; - memmove (&(sap->names[ins]), &(sap->names[ins + 1]), n); - } - sap->count--; - dout(5) << "..attributes list now " << sap << endl; - - asvalue.set_size(asvalue.get_size() - sizeof (attr_name)); - int ret; - if ((ret = db->put (txn, &askey, &asvalue, 0)) != 0) - { - derr(1) << "put stored_attrs " << db_strerror (ret) << endl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - // Remove the attribute. - coll_attr_id aid = new_coll_attr_id (cid, name); - Dbt key (&aid, sizeof (aid)); - if ((ret = db->del (txn, &key, 0)) != 0) - { - derr(1) << "deleting " << aid << ": " << db_strerror(ret) << endl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..collection rmattr OK" << endl; - return 0; -} - -int OSBDB::collection_getattr(coll_t cid, const char *name, - void *value, size_t size) -{ - if (!mounted) - return -EINVAL; - - dout(2) << "collection_getattr " << hex << cid << dec - << " " << name << endl; - - // XXX transactions/read isolation? 
- - coll_attr_id caid = new_coll_attr_id (cid, name); - Dbt key (&caid, sizeof (caid)); - Dbt val (value, size); - val.set_ulen (size); - val.set_dlen (size); - val.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - - if (db->get (NULL, &key, &val, 0) != 0) - { - derr(1) << ".no attribute entry" << endl; - return -ENOENT; - } - - dout(4) << "..collection getattr OK; returns " << val.get_size() << endl; - return val.get_size(); -} - -int OSBDB::collection_listattr(coll_t cid, char *attrs, size_t size) -{ - if (!mounted) - return -EINVAL; - - dout(2) << "collection_listattr " << hex << cid << dec << endl; - - // XXX transactions/read isolation? - - coll_attrs_id caids = new_coll_attrs_id (cid); - Dbt key (&caids, sizeof_coll_attrs_id()); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - int ret; - if ((ret = db->get (NULL, &key, &value, 0)) != 0) - { - derr(1) << "fetching " << caids << ": " << db_strerror (ret) - << endl; - return -ENOENT; - } - - stored_attrs *attrsp = (stored_attrs *) value.get_data(); - auto_ptr _attrs (attrsp); - size_t s = 0; - char *p = attrs; - for (unsigned i = 0; i < attrsp->count && s < size; i++) - { - int n = MIN (OSBDB_MAX_ATTR_LEN, - MIN (strlen (attrsp->names[i].name), size - s - 1)); - strncpy (p, attrsp->names[i].name, n); - p[n] = '\0'; - p = p + n + 1; - } - return 0; -} - - // Sync. - -void OSBDB::sync (Context *onsync) -{ - if (!mounted) - return; - - sync(); - - if (onsync != NULL) - { - g_timer.add_event_after(0.1, onsync); - } -} - -void OSBDB::sync() -{ - if (!mounted) - return; - - if (transactional) - { - env->log_flush (NULL); - env->lsn_reset (device.c_str(), 0); - } - db->sync(0); -} diff --git a/tags/20070517_before_mds_merge/osbdb/OSBDB.h b/tags/20070517_before_mds_merge/osbdb/OSBDB.h deleted file mode 100644 index f75bfc2c168ac..0000000000000 --- a/tags/20070517_before_mds_merge/osbdb/OSBDB.h +++ /dev/null @@ -1,480 +0,0 @@ -/* OSBDB.h -- ObjectStore on Berkeley DB. 
-*- c++ -*- - Copyright (C) 2007 Casey Marshall - -Ceph - scalable distributed file system - -This is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License version 2.1, as published by the Free Software -Foundation. See file COPYING. */ - - -#include -#include "osd/ObjectStore.h" - -#define OSBDB_MAGIC 0x05BDB - -/* - * Maximum length of an attribute name. - */ -#define OSBDB_MAX_ATTR_LEN 256 - -#define OSBDB_THIS_VERSION 1 - -#define OSBDB_SUPERBLOCK_KEY ((void *) "s") - -/* - * The "superblock" of the BDB object store. We store one of these in - * the DB, to store version and other information. We don't record - * anything special here, just the version number the database was - * written with. - * - * In principle, this structure is variable-length, depending on the - * software version writing the superblock. - */ -struct stored_superblock -{ - uint32_t version; -}; - -inline ostream& operator<<(ostream& out, const stored_superblock sb) -{ - out << "osbdb.super(" << sb.version << ")" << endl; - return out; -} - -/** - * An object identifier; we define this so we can have a POD object to - * work with. - */ -struct oid_t // POD -{ - char id[16]; -}; - -inline void mkoid (oid_t& id, object_t& oid) -{ - // XXX byte order? - memcpy (id.id, &oid, sizeof (oid_t)); -} - -inline ostream& operator<<(ostream& out, const oid_t id) -{ - for (int i = 0; i < 16; i++) - { - out.fill('0'); - out << setw(2) << hex << (id.id[i] & 0xFF); - if ((i & 3) == 3) - out << ':'; - } - out.unsetf(ios::right); - out << dec; - return out; -} - -/** - * An "inode" key. We map a 'stored_object' struct to this key for - * every object. - */ -struct object_inode_key // POD -{ - oid_t oid; - char tag; -}; - -/** - * "Constructor" for an object_inode_key. 
- */ -inline object_inode_key new_object_inode_key (object_t& oid) -{ - object_inode_key key; - memset(&key, 0, sizeof (object_inode_key)); - mkoid (key.oid, oid); - key.tag = 'i'; - return key; -} - -/* - * We use this, instead of sizeof(), to try and guarantee that we - * don't include the structure padding, if any. - * - * This *should* return 17: sizeof (oid_t) == 16; sizeof (char) == 1. - */ -inline size_t sizeof_object_inode_key() -{ - return offsetof(object_inode_key, tag) + sizeof (char); -} - - // Frank Poole: Unfortunately, that sounds a little - // like famous last words. - // -- 2001: A Space Odyssey - -inline ostream& operator<<(ostream& out, const object_inode_key o) -{ - out << o.tag << "/" << o.oid; - return out; -} - -/** - * A stored object. This is essentially the "inode" of the object, - * containing things like the object's length. The object itself is - * stored as-is, mapped by the 128-bit object ID. - */ -struct stored_object -{ - uint32_t length; -}; - -inline ostream& operator<<(ostream& out, const stored_object s) -{ - out << "inode(l:" << s.length << ")"; - return out; -} - -/* - * Key referencing the list of attribute names for an object. This is - * simply the object's ID, with an additional character 'a' appended. - */ -struct attrs_id // POD -{ - oid_t oid; - char tag; -}; - -/* - * "Construtor" for attrs_id. - */ -inline struct attrs_id new_attrs_id (object_t& oid) -{ - attrs_id aid; - memset (&aid, 0, sizeof (attrs_id)); - mkoid(aid.oid, oid); - aid.tag = 'a'; - return aid; -} - -/* - * See explanation for sizeof_object_inode_id. - */ -inline size_t sizeof_attrs_id() -{ - return offsetof(struct attrs_id, tag) + sizeof (char); -} - -inline ostream& operator<<(ostream& out, const attrs_id id) -{ - out << id.tag << "/" << id.oid; - return out; -} - -/* - * Encapsulation of a single attribute name. 
- */ -struct attr_name // POD -{ - char name[OSBDB_MAX_ATTR_LEN]; -}; - -inline ostream& operator<<(ostream& out, const attr_name n) -{ - out << n.name; - return out; -} - -inline bool operator<(const attr_name n1, const attr_name n2) -{ - return (strncmp (n1.name, n2.name, OSBDB_MAX_ATTR_LEN) < 0); -} - -inline bool operator>(const attr_name n1, const attr_name n2) -{ - return (strncmp (n1.name, n2.name, OSBDB_MAX_ATTR_LEN) > 0); -} - -inline bool operator==(const attr_name n1, const attr_name n2) -{ - std::cerr << n1.name << " == " << n2.name << "?" << endl; - return (strncmp (n1.name, n2.name, OSBDB_MAX_ATTR_LEN) == 0); -} - -inline bool operator!=(const attr_name n1, const attr_name n2) -{ - return !(n1 == n2); -} - -inline bool operator>=(const attr_name n1, const attr_name n2) -{ - return !(n1 < n2); -} - -inline bool operator<=(const attr_name n1, const attr_name n2) -{ - return !(n1 > n2); -} - -/* - * A list of an object or collection's attribute names. - */ -struct stored_attrs -{ - uint32_t count; - attr_name names[0]; // actually variable-length -}; - -inline ostream& operator<<(ostream& out, const stored_attrs *sa) -{ - out << sa->count << " [ "; - for (unsigned i = 0; i < sa->count; i++) - out << sa->names[i] << (i == sa->count - 1 ? " " : ", "); - out << "]"; - return out; -} - -/* - * An object attribute key. An object attribute is mapped simply by - * the object ID appended with the attribute name. Attribute names - * may not be empty, and must be less than 256 characters, in this - * implementation. - */ -struct attr_id // POD -{ - oid_t oid; - attr_name name; -}; - -inline attr_id new_attr_id (object_t& oid, const char *name) -{ - attr_id aid; - memset(&aid, 0, sizeof (attr_id)); - mkoid (aid.oid, oid); - strncpy (aid.name.name, name, OSBDB_MAX_ATTR_LEN); - return aid; -} - -inline ostream& operator<<(ostream &out, const attr_id id) -{ - out << id.oid << ":" << id.name; - return out; -} - -/* - * A key for a collection attributes list. 
- */ -struct coll_attrs_id // POD -{ - coll_t cid; - char tag; -}; - -inline coll_attrs_id new_coll_attrs_id (coll_t cid) -{ - coll_attrs_id catts; - memset(&catts, 0, sizeof (coll_attrs_id)); - catts.cid = cid; - catts.tag = 'C'; - return catts; -} - -inline size_t sizeof_coll_attrs_id() -{ - return offsetof(coll_attrs_id, tag) + sizeof (char); -} - -inline ostream& operator<<(ostream& out, coll_attrs_id id) -{ - out << id.tag << "/" << id.cid; - return out; -} - -/* - * A collection attribute key. Similar to - */ -struct coll_attr_id // POD -{ - coll_t cid; - attr_name name; -}; - -inline coll_attr_id new_coll_attr_id (coll_t cid, const char *name) -{ - coll_attr_id catt; - memset(&catt, 0, sizeof (coll_attr_id)); - catt.cid = cid; - strncpy (catt.name.name, name, OSBDB_MAX_ATTR_LEN); - return catt; -} - -inline ostream& operator<<(ostream& out, coll_attr_id id) -{ - out << id.cid << ":" << id.name; - return out; -} - -/* - * This is the key we store the master collections list under. - */ -#define COLLECTIONS_KEY ((void *) "c") - -/* - * The master list of collections. There should be one of these per - * OSD. The sole reason for this structure is to have the ability - * to enumerate all collections stored on this OSD. - */ -struct stored_colls -{ - // The number of collections. - uint32_t count; - - // The collection identifiers. This is a sorted list of coll_t - // values. - coll_t colls[0]; // actually variable-length -}; - -inline ostream& operator<<(ostream& out, stored_colls *c) -{ - out << c->count << " [ "; - for (unsigned i = 0; i < c->count; i++) - { - out << hex << c->colls[i]; - if (i < c->count - 1) - out << ", "; - } - out << " ]" << dec; - return out; -} - -/* - * A stored collection (a bag of object IDs). These are referenced by - * the bare collection identifier type, a coll_t (thus, a 32-bit - * integer). Internally this is stored as a sorted list of object IDs. 
- * - * Note, this structure places all collection items in a single - * record; this may be a memory burden for large collections. - */ -struct stored_coll -{ - // The size of this collection. - uint32_t count; - - // The object IDs in this collection. This is a sorted list of all - // object ID's in this collection. - object_t objects[0]; // actually variable-length -}; - -inline ostream& operator<<(ostream& out, stored_coll *c) -{ - out << c->count << " [ "; - for (unsigned i = 0; i < c->count; i++) - { - out << c->objects[i]; - if (i < c->count - 1) - out << ", "; - } - out << " ]"; - return out; -} - -class OSBDBException : public std::exception -{ - const char *msg; - -public: - OSBDBException(const char *msg) : msg(msg) { } - const char *what() { return msg; } -}; - -/* - * The object store interface for Berkeley DB. - */ -class OSBDB : public ObjectStore -{ - private: - Mutex lock; - DbEnv *env; - Db *db; - string device; - string env_dir; - bool mounted; - bool opened; - bool transactional; - - public: - - OSBDB(const char *dev) throw(OSBDBException) - : lock(true), env(0), db (0), device (dev), mounted(false), opened(false), - transactional(g_conf.bdbstore_transactional) - { - } - - ~OSBDB() - { - if (mounted) - { - umount(); - } - } - - int mount(); - int umount(); - int mkfs(); - - int statfs(struct statfs *buf); - - int pick_object_revision_lt(object_t& oid); - - bool exists(object_t oid); - int stat(object_t oid, struct stat *st); - - int remove(object_t oid, Context *onsafe=0); - - int truncate(object_t oid, off_t size, Context *onsafe=0); - - int read(object_t oid, off_t offset, size_t len, - bufferlist& bl); - int write(object_t oid, off_t offset, size_t len, - bufferlist& bl, Context *onsafe); - - int setattr(object_t oid, const char *name, - const void *value, size_t size, Context *onsafe=0); - int setattrs(object_t oid, map& aset, - Context *onsafe=0); - int getattr(object_t oid, const char *name, - void *value, size_t size); - int 
getattrs(object_t oid, map& aset); - int rmattr(object_t oid, const char *name, - Context *onsafe=0); - int listattr(object_t oid, char *attrs, size_t size); - - int clone(object_t oid, object_t noid); - - // Collections. - - int list_collections(list& ls); - int create_collection(coll_t c, Context *onsafe=0); - int destroy_collection(coll_t c, Context *onsafe=0); - bool collection_exists(coll_t c); - int collection_stat(coll_t c, struct stat *st); - int collection_add(coll_t c, object_t o, Context *onsafe=0); - int collection_remove(coll_t c, object_t o, Context *onsafe=0); - int collection_list(coll_t c, list& o); - - int collection_setattr(coll_t cid, const char *name, - const void *value, size_t size, - Context *onsafe=0); - int collection_rmattr(coll_t cid, const char *name, - Context *onsafe=0); - int collection_getattr(coll_t cid, const char *name, - void *value, size_t size); - int collection_listattr(coll_t cid, char *attrs, size_t size); - - void sync(Context *onsync); - void sync(); - -private: - int opendb (DBTYPE type=DB_UNKNOWN, int flags=0, bool new_env=false); - - int _setattr(object_t oid, const char *name, const void *value, - size_t size, Context *onsync, DbTxn *txn); - int _getattr(object_t oid, const char *name, void *value, size_t size); - DbEnv *getenv(); -}; diff --git a/tags/20070517_before_mds_merge/osd/Ager.cc b/tags/20070517_before_mds_merge/osd/Ager.cc deleted file mode 100644 index 038688c5cdfd5..0000000000000 --- a/tags/20070517_before_mds_merge/osd/Ager.cc +++ /dev/null @@ -1,331 +0,0 @@ - -#include "include/types.h" - -#include "Ager.h" -#include "ObjectStore.h" - -#include "config.h" -#include "common/Clock.h" - -// ick -#include "ebofs/Ebofs.h" -#include -#include -#include - -#ifdef DARWIN -#include -#include -#endif // DARWIN - - -int myrand() -{ - if (0) - return rand(); - else { - static int n = 0; - srand(n++); - return rand(); - } -} - - -object_t Ager::age_get_oid() { - if (!age_free_oids.empty()) { - object_t o = 
age_free_oids.front(); - age_free_oids.pop_front(); - return o; - } - object_t last = age_cur_oid; - ++age_cur_oid.bno; - return last; -} - -ssize_t Ager::age_pick_size() { - ssize_t max = file_size_distn.sample() * 1024; - return max/2 + (myrand() % 100) * max/200 + 1; -} - -bool start_debug = false; - -__uint64_t Ager::age_fill(float pc, utime_t until) { - int max = 1024*1024; - bufferptr bp(max); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - __uint64_t wrote = 0; - while (1) { - if (g_clock.now() > until) break; - - struct statfs st; - store->statfs(&st); - float free = 1.0 - ((float)(st.f_bfree) / (float)st.f_blocks); - float avail = 1.0 - ((float)(st.f_bavail) / (float)st.f_blocks); // to write to - //float a = (float)(st.f_bfree) / (float)st.f_blocks; - //dout(10) << "age_fill at " << a << " / " << pc << " .. " << st.f_blocks << " " << st.f_bavail << endl; - if (free >= pc) { - dout(2) << "age_fill at " << free << " / " << avail << " / " << " / " << pc << " stopping" << endl; - break; - } - - // make sure we can write to it.. - if (avail > .98 || - avail - free > .02) - store->sync(); - - object_t oid = age_get_oid(); - - int b = myrand() % 10; - age_objects[b].push_back(oid); - - ssize_t s = age_pick_size(); - wrote += (s + 4095) / 4096; - - - - - dout(2) << "age_fill at " << free << " / " << avail << " / " << pc << " creating " << hex << oid << dec << " sz " << s << endl; - - - if (false && !g_conf.ebofs_verify && start_debug && wrote > 1000000ULL) { - /* - - - 1005700 -? -1005000 -1005700 - 1005710 - 1005725ULL - 1005750ULL - 1005800 - 1006000 - -// 99 1000500 ? 
1000750 1006000 -*/ - g_conf.debug_ebofs = 30; - g_conf.ebofs_verify = true; - } - - off_t off = 0; - while (s) { - ssize_t t = MIN(s, max); - bufferlist sbl; - sbl.substr_of(bl, 0, t); - store->write(oid, off, t, sbl, false); - off += t; - s -= t; - } - oid.bno++; - } - - return wrote*4; // KB -} - -void Ager::age_empty(float pc) { - int nper = 20; - int n = nper; - - //g_conf.ebofs_verify = true; - - while (1) { - struct statfs st; - store->statfs(&st); - float free = 1.0 - ((float)(st.f_bfree) / (float)st.f_blocks); - float avail = 1.0 - ((float)(st.f_bavail) / (float)st.f_blocks); // to write to - dout(2) << "age_empty at " << free << " / " << avail << " / " << pc << endl;//" stopping" << endl; - if (free <= pc) { - dout(2) << "age_empty at " << free << " / " << avail << " / " << pc << " stopping" << endl; - break; - } - - int b = myrand() % 10; - n--; - if (n == 0 || age_objects[b].empty()) { - dout(2) << "age_empty sync" << endl; - //sync(); - //sync(); - n = nper; - continue; - } - object_t oid = age_objects[b].front(); - age_objects[b].pop_front(); - - dout(2) << "age_empty at " << free << " / " << avail << " / " << pc << " removing " << hex << oid << dec << endl; - - store->remove(oid); - age_free_oids.push_back(oid); - } - - g_conf.ebofs_verify = false; -} - -void pfrag(__uint64_t written, ObjectStore::FragmentationStat &st) -{ - cout << "#gb wr\ttotal\tn x\tavg x\tavg per\tavg j\tfree\tn fr\tavg fr\tnum<2\tsum<2\tnum<4\tsum<4\t..." - << endl; - cout << written - << "\t" << st.total - << "\t" << st.num_extent - << "\t" << st.avg_extent - << "\t" << st.avg_extent_per_object - << "\t" << st.avg_extent_jump - << "\t" << st.total_free - << "\t" << st.num_free_extent - << "\t" << st.avg_free_extent; - - int n = st.num_extent; - for (__uint64_t i=1; i <= 30; i += 1) { - cout << "\t" << st.extent_dist[i]; - cout << "\t" << st.extent_dist_sum[i]; - //cout << "\ta " << (st.extent_dist[i] ? 
(st.extent_dist_sum[i] / st.extent_dist[i]):0); - n -= st.extent_dist[i]; - if (n == 0) break; - } - cout << endl; -} - - -void Ager::age(int time, - float high_water, // fill to this % - float low_water, // then empty to this % - int count, // this many times - float final_water, // and end here ( <= low_water) - int fake_size_mb) { - - store->_fake_writes(true); - srand(0); - - utime_t start = g_clock.now(); - utime_t until = start; - until.sec_ref() += time; - - int elapsed = 0; - int freelist_inc = 60; - utime_t nextfl = start; - nextfl.sec_ref() += freelist_inc; - - while (age_objects.size() < 10) age_objects.push_back( list() ); - - if (fake_size_mb) { - int fake_bl = fake_size_mb * 256; - struct statfs st; - store->statfs(&st); - float f = (float)fake_bl / (float)st.f_blocks; - high_water = (float)high_water * f; - low_water = (float)low_water * f; - final_water = (float)final_water * f; - dout(2) << "fake " << fake_bl << " / " << st.f_blocks << " is " << f << ", high " << high_water << " low " << low_water << " final " << final_water << endl; - } - - // init size distn (once) - if (!did_distn) { - did_distn = true; - age_cur_oid = object_t(0,1); - file_size_distn.add(1, 19.0758125+0.65434375); - file_size_distn.add(512, 35.6566); - file_size_distn.add(1024, 27.7271875); - file_size_distn.add(2*1024, 16.63503125); - //file_size_distn.add(4*1024, 106.82384375); - //file_size_distn.add(8*1024, 81.493375); - //file_size_distn.add(16*1024, 14.13553125); - //file_size_distn.add(32*1024, 2.176); - //file_size_distn.add(256*1024, 0.655938); - //file_size_distn.add(512*1024, 0.1480625); - //file_size_distn.add(1*1024*1024, 0.020125); // actually 2, but 32bit - file_size_distn.normalize(); - } - - // clear - for (int i=0; i<10; i++) - age_objects[i].clear(); - - ObjectStore::FragmentationStat st; - - __uint64_t wrote = 0; - - for (int c=1; c<=count; c++) { - if (g_clock.now() > until) break; - - //if (c == 7) start_debug = true; - - dout(1) << "#age " << c << "/" << 
count << " filling to " << high_water << endl; - __uint64_t w = age_fill(high_water, until); - //dout(1) << "age wrote " << w << endl; - wrote += w; - //store->sync(); - //store->_get_frag_stat(st); - //pfrag(st); - - - if (c == count) { - dout(1) << "#age final empty to " << final_water << endl; - age_empty(final_water); - } else { - dout(1) << "#age " << c << "/" << count << " emptying to " << low_water << endl; - age_empty(low_water); - } - //store->sync(); - //store->sync(); - - // show frag state - store->_get_frag_stat(st); - pfrag(wrote / (1024ULL*1024ULL) , // GB - st); - - // dump freelist? - if (g_clock.now() > nextfl) { - elapsed += freelist_inc; - save_freelist(elapsed); - nextfl.sec_ref() += freelist_inc; - } - } - - // dump the freelist - save_freelist(0); - exit(0); // hack - - // ok! - store->_fake_writes(false); - store->sync(); - store->sync(); - dout(1) << "age finished" << endl; -} - - -void Ager::load_freelist() -{ - dout(1) << "load_freelist" << endl; - - struct stat st; - - int r = ::stat("ebofs.freelist", &st); - assert(r == 0); - - bufferptr bp(st.st_size); - bufferlist bl; - bl.push_back(bp); - int fd = ::open("ebofs.freelist", O_RDONLY); - ::read(fd, bl.c_str(), st.st_size); - ::close(fd); - - ((Ebofs*)store)->_import_freelist(bl); - store->sync(); - store->sync(); -} - -void Ager::save_freelist(int el) -{ - dout(1) << "save_freelist " << el << endl; - char s[100]; - sprintf(s, "ebofs.freelist.%d", el); - bufferlist bl; - ((Ebofs*)store)->_export_freelist(bl); - ::unlink(s); - int fd = ::open(s, O_CREAT|O_WRONLY); - ::fchmod(fd, 0644); - ::write(fd, bl.c_str(), bl.length()); - ::close(fd); -} diff --git a/tags/20070517_before_mds_merge/osd/Ager.h b/tags/20070517_before_mds_merge/osd/Ager.h deleted file mode 100644 index 864c23fce8e14..0000000000000 --- a/tags/20070517_before_mds_merge/osd/Ager.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef __AGER_H -#define __AGER_H - -#include "include/types.h" -#include "include/Distribution.h" -#include 
"ObjectStore.h" -#include "common/Clock.h" - -#include -#include -using namespace std; - -class Ager { - ObjectStore *store; - - private: - list age_free_oids; - object_t age_cur_oid; - vector< list > age_objects; - Distribution file_size_distn; //kb - bool did_distn; - - void age_empty(float pc); - __uint64_t age_fill(float pc, utime_t until); - ssize_t age_pick_size(); - object_t age_get_oid(); - - public: - Ager(ObjectStore *s) : store(s), did_distn(false) {} - - void age(int time, - float high_water, // fill to this % - float low_water, // then empty to this % - int count, // this many times - float final_water, // and end here ( <= low_water) - int fake_size_mb=0); - - void save_freelist(int); - void load_freelist(); -}; - -#endif diff --git a/tags/20070517_before_mds_merge/osd/BDBMap.h b/tags/20070517_before_mds_merge/osd/BDBMap.h deleted file mode 100644 index 203a4ca9dce8f..0000000000000 --- a/tags/20070517_before_mds_merge/osd/BDBMap.h +++ /dev/null @@ -1,136 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __BERKELEYDB_H -#define __BERKELEYDB_H - -#include -#include - -#include -using namespace std; - - -template -class BDBMap { - private: - DB *dbp; - - public: - BDBMap() : dbp(0) {} - ~BDBMap() { - close(); - } - - bool is_open() { return dbp ? 
true:false; } - - // open/close - int open(const char *fn) { - //cout << "open " << fn << endl; - - int r; - if ((r = db_create(&dbp, NULL, 0)) != 0) { - cerr << "db_create: " << db_strerror(r) << endl; - assert(0); - } - - dbp->set_errfile(dbp, stderr); - dbp->set_errpfx(dbp, "bdbmap"); - - r = dbp->open(dbp, NULL, fn, NULL, DB_BTREE, DB_CREATE, 0644); - if (r != 0) { - dbp->err(dbp, r, "%s", fn); - } - assert(r == 0); - return 0; - } - void close() { - if (dbp) { - dbp->close(dbp,0); - dbp = 0; - } - } - void remove(const char *fn) { - if (!dbp) open(fn); - if (dbp) { - dbp->remove(dbp, fn, 0, 0); - dbp = 0; - } else { - ::unlink(fn); - } - } - - // accessors - int put(K key, - D data) { - DBT k; - memset(&k, 0, sizeof(k)); - k.data = &key; - k.size = sizeof(K); - DBT d; - memset(&d, 0, sizeof(d)); - d.data = &data; - d.size = sizeof(data); - return dbp->put(dbp, NULL, &k, &d, 0); - } - - int get(K key, - D& data) { - DBT k; - memset(&k, 0, sizeof(k)); - k.data = &key; - k.size = sizeof(key); - DBT d; - memset(&d, 0, sizeof(d)); - d.data = &data; - d.size = sizeof(data); - int r = dbp->get(dbp, NULL, &k, &d, 0); - return r; - } - - int del(K key) { - DBT k; - memset(&k, 0, sizeof(k)); - k.data = &key; - k.size = sizeof(key); - return dbp->del(dbp, NULL, &k, 0); - } - - int list_keys(list& ls) { - DBC *cursor = 0; - int r = dbp->cursor(dbp, NULL, &cursor, 0); - assert(r == 0); - - DBT k,d; - memset(&k, 0, sizeof(k)); - memset(&d, 0, sizeof(d)); - - while ((r = cursor->c_get(cursor, &k, &d, DB_NEXT)) == 0) { - K key; - assert(k.size == sizeof(key)); - memcpy(&key, k.data, k.size); - ls.push_back(key); - } - if (r != DB_NOTFOUND) { - dbp->err(dbp, r, "DBcursor->get"); - assert(r == DB_NOTFOUND); - } - - cursor->c_close(cursor); - return 0; - } - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/osd/Fake.h b/tags/20070517_before_mds_merge/osd/Fake.h deleted file mode 100644 index 01fa4afcf3cb8..0000000000000 --- a/tags/20070517_before_mds_merge/osd/Fake.h +++ 
/dev/null @@ -1,249 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __FAKE_H -#define __FAKE_H - -#include "include/types.h" - -#include -#include -#include -using namespace std; -using namespace __gnu_cxx; - -class FakeStoreCollections { - private: - Mutex faker_lock; - ObjectStore *store; - hash_map > fakecollections; - - public: - FakeStoreCollections(ObjectStore *s) : store(s) {} - - // faked collections - int list_collections(list& ls) { - faker_lock.Lock(); - int r = 0; - for (hash_map< coll_t, set >::iterator p = fakecollections.begin(); - p != fakecollections.end(); - p++) { - r++; - ls.push_back(p->first); - } - faker_lock.Unlock(); - return r; - } - - int create_collection(coll_t c, - Context *onsafe=0) { - faker_lock.Lock(); - fakecollections[c].size(); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return 0; - } - - int destroy_collection(coll_t c, - Context *onsafe=0) { - int r = 0; - faker_lock.Lock(); - if (fakecollections.count(c)) { - fakecollections.erase(c); - //fakecattr.erase(c); - if (onsafe) store->sync(onsafe); - } else - r = -1; - faker_lock.Unlock(); - return r; - } - - int collection_stat(coll_t c, struct stat *st) { - return collection_exists(c) ? 
0:-1; - } - - bool collection_exists(coll_t c) { - faker_lock.Lock(); - int r = fakecollections.count(c); - faker_lock.Unlock(); - return r; - } - - int collection_add(coll_t c, object_t o, - Context *onsafe=0) { - faker_lock.Lock(); - fakecollections[c].insert(o); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return 0; - } - - int collection_remove(coll_t c, object_t o, - Context *onsafe=0) { - faker_lock.Lock(); - fakecollections[c].erase(o); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return 0; - } - - int collection_list(coll_t c, list& o) { - faker_lock.Lock(); - int r = 0; - for (set::iterator p = fakecollections[c].begin(); - p != fakecollections[c].end(); - p++) { - o.push_back(*p); - r++; - } - faker_lock.Unlock(); - return r; - } - -}; - -class FakeStoreAttrs { - private: - - class FakeAttrSet { - public: - map attrs; - - int getattr(const char *name, void *value, size_t size) { - string n = name; - if (attrs.count(n)) { - size_t l = MIN( attrs[n].length(), size ); - bufferlist bl; - bl.append(attrs[n]); - bl.copy(0, l, (char*)value); - return l; - } - return -1; - } - int getattrs(map& aset) { - aset = attrs; - return 0; - } - int setattrs(map& aset) { - attrs = aset; - return 0; - } - - int setattr(const char *name, const void *value, size_t size) { - string n = name; - bufferptr bp = buffer::copy((char*)value, size); - attrs[n] = bp; - return 0; - } - - int listattr(char *attrs, size_t size) { - assert(0); - return 0; - } - - int rmattr(const char *name) { - string n = name; - attrs.erase(n); - return 0; - } - - bool empty() { return attrs.empty(); } - }; - - Mutex faker_lock; - ObjectStore *store; - hash_map fakeoattrs; - hash_map fakecattrs; - - public: - FakeStoreAttrs(ObjectStore *s) : store(s) {} - - int setattr(object_t oid, const char *name, - const void *value, size_t size, - Context *onsafe=0) { - faker_lock.Lock(); - int r = fakeoattrs[oid].setattr(name, value, size); - if (onsafe) store->sync(onsafe); - 
faker_lock.Unlock(); - return r; - } - int setattrs(object_t oid, map& aset) { - faker_lock.Lock(); - int r = fakeoattrs[oid].setattrs(aset); - faker_lock.Unlock(); - return r; - } - int getattr(object_t oid, const char *name, - void *value, size_t size) { - faker_lock.Lock(); - int r = fakeoattrs[oid].getattr(name, value, size); - faker_lock.Unlock(); - return r; - } - int getattrs(object_t oid, map& aset) { - faker_lock.Lock(); - int r = fakeoattrs[oid].getattrs(aset); - faker_lock.Unlock(); - return r; - } - int rmattr(object_t oid, const char *name, - Context *onsafe=0) { - faker_lock.Lock(); - int r = fakeoattrs[oid].rmattr(name); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return r; - } - - int listattr(object_t oid, char *attrs, size_t size) { - faker_lock.Lock(); - int r = fakeoattrs[oid].listattr(attrs,size); - faker_lock.Unlock(); - return r; - } - - int collection_setattr(coll_t c, const char *name, - void *value, size_t size, - Context *onsafe=0) { - faker_lock.Lock(); - int r = fakecattrs[c].setattr(name, value, size); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return r; - } - int collection_rmattr(coll_t c, const char *name, - Context *onsafe=0) { - faker_lock.Lock(); - int r = fakecattrs[c].rmattr(name); - if (onsafe) store->sync(onsafe); - faker_lock.Unlock(); - return r; - } - int collection_getattr(coll_t c, const char *name, - void *value, size_t size) { - faker_lock.Lock(); - int r = fakecattrs[c].getattr(name, value, size); - faker_lock.Unlock(); - return r; - } - int collection_listattr(coll_t c, char *attrs, size_t size) { - faker_lock.Lock(); - int r = fakecattrs[c].listattr(attrs,size); - faker_lock.Unlock(); - return r; - } - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/osd/FakeStore.cc b/tags/20070517_before_mds_merge/osd/FakeStore.cc deleted file mode 100644 index 1360711f3b417..0000000000000 --- a/tags/20070517_before_mds_merge/osd/FakeStore.cc +++ /dev/null @@ -1,643 +0,0 @@ -// -*- 
mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "FakeStore.h" -#include "include/types.h" - -#include "common/Timer.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -//#include - -#ifdef DARWIN -#include -#include -#endif // DARWIN - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug) cout << g_clock.now() << " osd" << whoami << ".fakestore " -#define derr(l) if (l<=g_conf.debug) cerr << g_clock.now() << " osd" << whoami << ".fakestore " - -#include "include/buffer.h" - -#include -#include -using namespace __gnu_cxx; - -// crap-a-crap hash -#define HASH_DIRS 0x80 -#define HASH_MASK 0x7f -// end crap hash - - - - -int FakeStore::statfs(struct statfs *buf) -{ - return ::statfs(basedir.c_str(), buf); -} - - -/* - * sorry, these are sentitive to the object_t and coll_t typing. 
- */ -void FakeStore::get_oname(object_t oid, char *s) -{ - static hash H; - assert(sizeof(oid) == 16); -#ifdef __LP64__ - sprintf(s, "%s/objects/%02lx/%016lx.%016lx", basedir.c_str(), H(oid) & HASH_MASK, - *((__uint64_t*)&oid), - *(((__uint64_t*)&oid) + 1)); -#else - sprintf(s, "%s/objects/%02x/%016llx.%016llx", basedir.c_str(), H(oid) & HASH_MASK, - *((__uint64_t*)&oid), - *(((__uint64_t*)&oid) + 1)); -#endif -} - -void FakeStore::get_cdir(coll_t cid, char *s) -{ - assert(sizeof(cid) == 8); -#ifdef __LP64__ - sprintf(s, "%s/collections/%016lx", basedir.c_str(), - cid); -#else - sprintf(s, "%s/collections/%016llx", basedir.c_str(), - cid); -#endif -} - -void FakeStore::get_coname(coll_t cid, object_t oid, char *s) -{ - assert(sizeof(oid) == 16); -#ifdef __LP64__ - sprintf(s, "%s/collections/%016lx/%016lx.%016lx", basedir.c_str(), cid, - *((__uint64_t*)&oid), - *(((__uint64_t*)&oid) + 1)); -#else - sprintf(s, "%s/collections/%016llx/%016llx.%016llx", basedir.c_str(), cid, - *((__uint64_t*)&oid), - *(((__uint64_t*)&oid) + 1)); -#endif -} - - - - -int FakeStore::mkfs() -{ - char cmd[200]; - if (g_conf.fakestore_dev) { - dout(0) << "mounting" << endl; - sprintf(cmd,"mount %s", g_conf.fakestore_dev); - system(cmd); - } - - dout(1) << "mkfs in " << basedir << endl; - - // wipe - sprintf(cmd, "test -d %s && rm -r %s ; mkdir -p %s/collections && mkdir -p %s/objects", - basedir.c_str(), basedir.c_str(), basedir.c_str(), basedir.c_str()); - - dout(5) << "wipe: " << cmd << endl; - system(cmd); - - // hashed bits too - for (int i=0; i 0) bl.push_back( bptr ); // put it in the target bufferlist - } - ::flock(fd, LOCK_UN); - ::close(fd); - return got; -} - - -int FakeStore::write(object_t oid, - off_t offset, size_t len, - bufferlist& bl, - Context *onsafe) -{ - dout(20) << "write " << oid << " len " << len << " off " << offset << endl; - - char fn[200]; - get_oname(oid,fn); - - ::mknod(fn, 0644, 0); // in case it doesn't exist yet. 
- - int flags = O_WRONLY;//|O_CREAT; - int fd = ::open(fn, flags); - if (fd < 0) { - derr(0) << "write couldn't open " << fn << " flags " << flags << " errno " << errno << " " << strerror(errno) << endl; - return fd; - } - ::fchmod(fd, 0664); - ::flock(fd, LOCK_EX); // lock for safety - - // seek - off_t actual = ::lseek(fd, offset, SEEK_SET); - int did = 0; - assert(actual == offset); - - // write buffers - for (list::const_iterator it = bl.buffers().begin(); - it != bl.buffers().end(); - it++) { - int r = ::write(fd, (char*)(*it).c_str(), (*it).length()); - if (r > 0) - did += r; - else { - derr(0) << "couldn't write to " << fn << " len " << len << " off " << offset << " errno " << errno << " " << strerror(errno) << endl; - } - } - - if (did < 0) { - derr(0) << "couldn't write to " << fn << " len " << len << " off " << offset << " errno " << errno << " " << strerror(errno) << endl; - } - - ::flock(fd, LOCK_UN); - - // schedule sync - if (onsafe) sync(onsafe); - - ::close(fd); - - return did; -} - - -class C_FakeSync : public Context { - Context *c; - int *n; - Mutex *lock; - Cond *cond; - -public: - C_FakeSync(Context *c_, int *n_, Mutex *lo, Cond *co) : - c(c_), n(n_), - lock(lo), cond(co) { - lock->Lock(); - ++*n; - lock->Unlock(); - } - void finish(int r) { - c->finish(r); - - lock->Lock(); - --(*n); - if (*n == 0) cond->Signal(); - lock->Unlock(); - } -}; - -void FakeStore::sync() -{ - synclock.Lock(); - while (unsync > 0) { - dout(0) << "sync waiting for " << unsync << " items to (fake) sync" << endl; - synccond.Wait(synclock); - } - synclock.Unlock(); -} - -void FakeStore::sync(Context *onsafe) -{ - if (g_conf.fakestore_fake_sync) { - g_timer.add_event_after((float)g_conf.fakestore_fake_sync, - new C_FakeSync(onsafe, &unsync, &synclock, &synccond)); - - } else { - assert(0); // der..no implemented anymore - } -} - - -// ------------------------------- -// attributes - -// objects - -int FakeStore::setattr(object_t oid, const char *name, - const void *value, 
size_t size, - Context *onsafe) -{ - if (fake_attrs) return attrs.setattr(oid, name, value, size, onsafe); - - char fn[100]; - get_oname(oid, fn); - int r = ::setxattr(fn, name, value, size, 0); - return r; -} - -int FakeStore::setattrs(object_t oid, map& aset) -{ - if (fake_attrs) return attrs.setattrs(oid, aset); - - char fn[100]; - get_oname(oid, fn); - int r = 0; - for (map::iterator p = aset.begin(); - p != aset.end(); - ++p) { - r = ::setxattr(fn, p->first.c_str(), p->second.c_str(), p->second.length(), 0); - if (r < 0) break; - } - return r; -} - -int FakeStore::getattr(object_t oid, const char *name, - void *value, size_t size) -{ - if (fake_attrs) return attrs.getattr(oid, name, value, size); - char fn[100]; - get_oname(oid, fn); - int r = ::getxattr(fn, name, value, size); - return r; -} - -int FakeStore::getattrs(object_t oid, map& aset) -{ - if (fake_attrs) return attrs.getattrs(oid, aset); - - char fn[100]; - get_oname(oid, fn); - - char val[1000]; - char names[1000]; - int num = ::listxattr(fn, names, 1000); - - char *name = names; - for (int i=0; i& ls) -{ - if (fake_collections) return collections.list_collections(ls); - - char fn[200]; - sprintf(fn, "%s/collections", basedir.c_str()); - - DIR *dir = ::opendir(fn); - assert(dir); - - struct dirent *de; - while ((de = ::readdir(dir)) != 0) { - // parse - coll_t c = strtoll(de->d_name, 0, 16); - dout(0) << " got " << c << " errno " << errno << " on " << de->d_name << endl; - if (errno) continue; - ls.push_back(c); - } - - ::closedir(dir); - return 0; -} - -int FakeStore::create_collection(coll_t c, - Context *onsafe) -{ - if (fake_collections) return collections.create_collection(c, onsafe); - - char fn[200]; - get_cdir(c, fn); - - int r = ::mkdir(fn, 0755); - - if (onsafe) sync(onsafe); - return r; -} - -int FakeStore::destroy_collection(coll_t c, - Context *onsafe) -{ - if (fake_collections) return collections.destroy_collection(c, onsafe); - - char fn[200]; - get_cdir(c, fn); - char cmd[200]; - 
sprintf(cmd, "test -d %s && rm -r %s", fn, fn); - system(cmd); - - if (onsafe) sync(onsafe); - return 0; -} - -int FakeStore::collection_stat(coll_t c, struct stat *st) -{ - if (fake_collections) return collections.collection_stat(c, st); - - char fn[200]; - get_cdir(c, fn); - return ::lstat(fn, st); -} - -bool FakeStore::collection_exists(coll_t c) -{ - if (fake_collections) return collections.collection_exists(c); - - struct stat st; - return collection_stat(c, &st) == 0; -} - - -int FakeStore::collection_add(coll_t c, object_t o, - Context *onsafe) -{ - if (fake_collections) return collections.collection_add(c, o, onsafe); - - char cof[200]; - get_coname(c, o, cof); - char of[200]; - get_oname(o, of); - - int r = ::link(of, cof); - if (onsafe) sync(onsafe); - return r; -} - -int FakeStore::collection_remove(coll_t c, object_t o, - Context *onsafe) -{ - if (fake_collections) return collections.collection_remove(c, o, onsafe); - - char cof[200]; - get_coname(c, o, cof); - - int r = ::unlink(cof); - if (onsafe) sync(onsafe); - return r; -} - -int FakeStore::collection_list(coll_t c, list& ls) -{ - if (fake_collections) return collections.collection_list(c, ls); - - char fn[200]; - get_cdir(c, fn); - - DIR *dir = ::opendir(fn); - assert(dir); - - struct dirent *de; - while ((de = ::readdir(dir)) != 0) { - // parse - object_t o; - assert(sizeof(o) == 16); - *(((__uint64_t*)&o) + 0) = strtoll(de->d_name, 0, 16); - assert(de->d_name[16] == '.'); - *(((__uint64_t*)&o) + 1) = strtoll(de->d_name+17, 0, 16); - dout(0) << " got " << o << " errno " << errno << " on " << de->d_name << endl; - if (errno) continue; - ls.push_back(o); - } - - ::closedir(dir); - return 0; -} - -// eof. 
diff --git a/tags/20070517_before_mds_merge/osd/FakeStore.h b/tags/20070517_before_mds_merge/osd/FakeStore.h deleted file mode 100644 index 4ad2cb4a054e8..0000000000000 --- a/tags/20070517_before_mds_merge/osd/FakeStore.h +++ /dev/null @@ -1,110 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __FAKESTORE_H -#define __FAKESTORE_H - -#include "ObjectStore.h" -#include "common/ThreadPool.h" -#include "common/Mutex.h" - -#include "Fake.h" -//#include "FakeStoreBDBCollections.h" - - -#include -using namespace std; - -#include -using namespace __gnu_cxx; - - -// fake attributes in memory, if we need to. - -class FakeStore : public ObjectStore { - string basedir; - int whoami; - - Mutex synclock; - Cond synccond; - int unsync; - - // fake attrs? - FakeStoreAttrs attrs; - bool fake_attrs; - - // fake collections? 
- FakeStoreCollections collections; - bool fake_collections; - - // helper fns - void get_oname(object_t oid, char *s); - void get_cdir(coll_t cid, char *s); - void get_coname(coll_t cid, object_t oid, char *s); - - public: - FakeStore(char *base, int w) : - basedir(base), - whoami(w), - unsync(0), - attrs(this), fake_attrs(false), - collections(this), fake_collections(false) { } - - int mount(); - int umount(); - int mkfs(); - - int statfs(struct statfs *buf); - - // ------------------ - // objects - int pick_object_revision_lt(object_t& oid) { - return 0; - } - bool exists(object_t oid); - int stat(object_t oid, struct stat *st); - int remove(object_t oid, Context *onsafe); - int truncate(object_t oid, off_t size, Context *onsafe); - int read(object_t oid, off_t offset, size_t len, bufferlist& bl); - int write(object_t oid, off_t offset, size_t len, bufferlist& bl, Context *onsafe); - - void sync(); - void sync(Context *onsafe); - - // attrs - int setattr(object_t oid, const char *name, const void *value, size_t size, Context *onsafe=0); - int setattrs(object_t oid, map& aset); - int getattr(object_t oid, const char *name, void *value, size_t size); - int getattrs(object_t oid, map& aset); - int rmattr(object_t oid, const char *name, Context *onsafe=0); - //int listattr(object_t oid, char *attrs, size_t size); - int collection_setattr(coll_t c, const char *name, void *value, size_t size, Context *onsafe=0); - int collection_rmattr(coll_t c, const char *name, Context *onsafe=0); - int collection_getattr(coll_t c, const char *name, void *value, size_t size); - //int collection_listattr(coll_t c, char *attrs, size_t size); - - - // collections - int list_collections(list& ls); - int create_collection(coll_t c, Context *onsafe=0); - int destroy_collection(coll_t c, Context *onsafe=0); - int collection_stat(coll_t c, struct stat *st); - bool collection_exists(coll_t c); - int collection_add(coll_t c, object_t o, Context *onsafe=0); - int collection_remove(coll_t c, 
object_t o, Context *onsafe=0); - int collection_list(coll_t c, list& o); - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/osd/FakeStoreBDBCollections.h b/tags/20070517_before_mds_merge/osd/FakeStoreBDBCollections.h deleted file mode 100644 index 97316d2642674..0000000000000 --- a/tags/20070517_before_mds_merge/osd/FakeStoreBDBCollections.h +++ /dev/null @@ -1,168 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __FAKESTOREBDBCOLLECTIONS_H -#define __FAKESTOREBDBCOLLECTIONS_H - -#include "BDBMap.h" -#include "ObjectStore.h" -#include "common/Mutex.h" - -#define BDBHASH_DIRS 128LL -#define BDBHASH_FUNC(x) (((x) ^ ((x)>>30) ^ ((x)>>18) ^ ((x)>>45) ^ 0xdead1234) * 884811 % BDBHASH_DIRS) - -class FakeStoreBDBCollections { - private: - int whoami; - string basedir; - - Mutex bdblock; - - // collection dbs - BDBMap collections; - map*> collection_map; - - // dirs - void get_dir(string& dir) { - char s[30]; - sprintf(s, "%d", whoami); - dir = basedir + "/" + s; - } - void get_collfn(coll_t c, string &fn) { - char s[100]; - sprintf(s, "%d/%02llx/%016llx.co", whoami, BDBHASH_FUNC(c), c); - fn = basedir + "/" + s; - } - - void open_collections() { - string cfn; - get_dir(cfn); - cfn += "/collections"; - collections.open(cfn.c_str()); - list ls; - collections.list_keys(ls); - } - void close_collections() { - if (collections.is_open()) - collections.close(); - - for (map*>::iterator it = collection_map.begin(); - it != collection_map.end(); - it++) { - it->second->close(); - } - collection_map.clear(); - } - - int open_collection(coll_t c) { - if (collection_map.count(c)) - return 0; // already open. 
- - string fn; - get_collfn(c,fn); - collection_map[c] = new BDBMap; - int r = collection_map[c]->open(fn.c_str()); - if (r != 0) - collection_map.erase(c); // failed - return r; - } - - public: - FakeStoreBDBCollections(int w, string& bd) : whoami(w), basedir(bd) {} - ~FakeStoreBDBCollections() { - close_collections(); - } - - int list_collections(list& ls) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - ls.clear(); - collections.list_keys(ls); - bdblock.Unlock(); - return 0; - } - int create_collection(coll_t c) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - collections.put(c, 1); - open_collection(c); - bdblock.Unlock(); - return 0; - } - int destroy_collection(coll_t c) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - collections.del(c); - - open_collection(c); - collection_map[c]->close(); - - string fn; - get_collfn(c,fn); - collection_map[c]->remove(fn.c_str()); - delete collection_map[c]; - collection_map.erase(c); - bdblock.Unlock(); - return 0; - } - int collection_stat(coll_t c, struct stat *st) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - string fn; - get_collfn(c,fn); - int r = ::stat(fn.c_str(), st); - bdblock.Unlock(); - return r; - } - bool collection_exists(coll_t c) { - bdblock.Lock(); - struct stat st; - int r = collection_stat(c, &st) == 0; - bdblock.Unlock(); - return r; - } - int collection_add(coll_t c, object_t o) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - open_collection(c); - collection_map[c]->put(o,1); - bdblock.Unlock(); - return 0; - } - int collection_remove(coll_t c, object_t o) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - open_collection(c); - collection_map[c]->del(o); - bdblock.Unlock(); - return 0; - } - int collection_list(coll_t c, list& o) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - open_collection(c); - 
collection_map[c]->list_keys(o); - bdblock.Unlock(); - return 0; - } -}; - -#endif diff --git a/tags/20070517_before_mds_merge/osd/OBFSStore.cc b/tags/20070517_before_mds_merge/osd/OBFSStore.cc deleted file mode 100644 index e82c6f804721d..0000000000000 --- a/tags/20070517_before_mds_merge/osd/OBFSStore.cc +++ /dev/null @@ -1,244 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "OBFSStore.h" - -extern "C" { -#include "../../uofs/uofs.h" -} - -#include "common/Timer.h" - -#include "include/types.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug) cout << "osd" << whoami << ".obfs " - -OBFSStore::OBFSStore(int whoami, char *param, char *dev) -{ - this->whoami = whoami; - this->mounted = -1; - this->bdev_id = -1; - this->param[0] = 0; - this->dev[0] = 0; - if (dev) - strcpy(this->dev, dev); - if (param) - strcpy(this->param, param); -} - -int OBFSStore::mount(void) -{ - dout(0) << "OBFS init!" 
<< endl; - if ((this->bdev_id = device_open(this->dev, O_RDWR)) < 0) { - dout(0) << "device open FAILED on " << this->dev << ", errno " << errno << endl; - return -1; - } - - this->mkfs(); - this->mounted = uofs_mount(this->bdev_id, - g_conf.uofs_cache_size, - g_conf.uofs_min_flush_pages, - this->whoami); - switch (this->mounted) { - case -1: - this->mkfs(); - //retry to mount - dout(0) << "remount the OBFS" << endl; - this->mounted = uofs_mount(this->bdev_id, - g_conf.uofs_cache_size, - g_conf.uofs_min_flush_pages, - this->whoami); - assert(this->mounted >= 0); - break; - case -2: - //fsck - dout(0) << "Need fsck! Simply formatted for now!" << endl; - this->mkfs(); - this->mounted = uofs_mount(this->bdev_id, - g_conf.uofs_cache_size, - g_conf.uofs_min_flush_pages, - this->whoami); - assert(this->mounted >= 0); - break; - case 0: - //success - break; - default: - break; - } - - if (this->mounted >= 0) - dout(0) << "successfully mounted!" << endl; - else - dout(0) << "error in mounting obfsstore!" << endl; - - return 0; -} - -int OBFSStore::mkfs(void) -{ - /*int donode_size_byte = 1024, - bd_ratio = 10, - reg_size_mb = 256, - sb_size_kb = 4, - lb_size_kb = 1024, - nr_hash_table_buckets = 1023, - delay_allocation = 1, - flush_interval = 5; - FILE *param; - */ - - - if (this->mounted >= 0) - return 0; - - dout(0) << "OBFS.mkfs!" 
<< endl; - /* - if (strlen(this->param) > 0) { - param = fopen(this->param, "r"); - if (param) { - //fscanf(param, "Block Device: %s\n", this->dev); - fscanf(param, "Donode Size: %d\n", &donode_size_byte); - fscanf(param, "Block vs Donode Ratio: %d\n", &bd_ratio); - fscanf(param, "Region Size: %d MB\n", ®_size_mb); - fscanf(param, "Small Block Size: %d KB\n", &sb_size_kb); - fscanf(param, "Large Block Size: %d KB\n", &lb_size_kb); - fscanf(param, "Hash Table Buckets: %d\n", &nr_hash_table_buckets); - fscanf(param, "Delayed Allocation: %d\n", &delay_allocation); - } else { - dout(0) << "read open FAILED on "<< this->param <<", errno " << errno << endl; - dout(0) << "use default parameters" << endl; - } - } else - dout(0) << "use default parameters" << endl; - */ - - if (this->bdev_id <= 0) - if ((this->bdev_id = device_open(this->dev, O_RDWR)) < 0) { - dout(0) << "device open FAILED on "<< this->dev <<", errno " << errno << endl; - return -1; - } - - dout(0) << "start formating!" << endl; - - uofs_format(this->bdev_id, - g_conf.uofs_onode_size, - g_conf.uofs_block_meta_ratio, - g_conf.uofs_segment_size, - g_conf.uofs_small_block_size, - g_conf.uofs_large_block_size, - g_conf.uofs_nr_hash_buckets, - g_conf.uofs_delay_allocation, - 0,//g_conf.uofs_dev_force_size, - g_conf.uofs_flush_interval, - 0); - - dout(0) << "formatting complete!" << endl; - return 0; -} - -int OBFSStore::umount(void) -{ - uofs_shutdown(); - close(this->bdev_id); - - return 0; -} - -int OBFSStore::statfs(struct statfs *sfs) -{ - return 0; -} - -bool OBFSStore::exists(object_t oid) -{ - //dout(0) << "calling function exists!" << endl; - return uofs_exist(oid); -} - -int OBFSStore::stat(object_t oid, struct stat *st) -{ - dout(0) << "calling function stat!" << endl; - if (uofs_exist(oid)) return 0; - return -1; -} - -int OBFSStore::remove(object_t oid) -{ - dout(0) << "calling remove function!" 
<< endl; - return uofs_del(oid); -} - -int OBFSStore::truncate(object_t oid, off_t size) -{ - dout(0) << "calling truncate function!" << endl; - //return uofs_truncate(oid, size); - return -1; -} - -int OBFSStore::read(object_t oid, size_t len, - off_t offset, bufferlist &bl) -{ - //dout(0) << "calling read function!" << endl; - //dout(0) << oid << " 0 " << len << " " << offset << " 100" << endl; - - // FIXME: page-align this and we can avoid a memcpy... - bl.push_back(new buffer(len)); - return uofs_read(oid, bl.c_str(), offset, len); -} - -int OBFSStore::write(object_t oid, size_t len, - off_t offset, bufferlist& bl, bool fsync) -{ - int ret = 0; - - //dout(0) << "calling write function!" << endl; - //if (whoami == 0) - // dout(0) << oid << " 0 " << len << " " << offset << " 101" << endl; - - for (list::iterator p = bl.buffers().begin(); - p != bl.buffers().end(); - p++) { - ret += uofs_write(oid, (*p).c_str(), offset, len, 0); - } - - if (fsync) - ret += uofs_sync(oid); - - return ret; -} - - -int OBFSStore::write(object_t oid, size_t len, - off_t offset, bufferlist& bl, Context *onflush) -{ - int r = write(oid, len, offset, bl, false); - g_timer.add_event_after((float)g_conf.uofs_fake_sync, onflush); - return r; -} diff --git a/tags/20070517_before_mds_merge/osd/OBFSStore.h b/tags/20070517_before_mds_merge/osd/OBFSStore.h deleted file mode 100644 index cb4a6afc815d7..0000000000000 --- a/tags/20070517_before_mds_merge/osd/OBFSStore.h +++ /dev/null @@ -1,56 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef _OBFSSTORE_H_ -#define _OBFSSTORE_H_ - -#include "ObjectStore.h" -#include "Fake.h" - -class OBFSStore : public ObjectStore, - public FakeStoreAttrs, - public FakeStoreCollections { - int whoami; - int bdev_id; - int mounted; - char dev[128]; - char param[128]; - - public: - OBFSStore(int whoami, char *param, char *dev); - - int mount(void); - int umount(void); - int mkfs(void); - - int statfs(struct statfs *); - - bool exists(object_t oid); - int stat(object_t oid, struct stat *st); - - int remove(object_t oid); - int truncate(object_t oid, off_t size); - - int read(object_t oid, size_t len, - off_t offset, bufferlist& bl); - int write(object_t oid, size_t len, - off_t offset, bufferlist& bl, - bool fsync); - int write(object_t oid, size_t len, - off_t offset, bufferlist& bl, - Context *onflush); - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/osd/OSD.cc b/tags/20070517_before_mds_merge/osd/OSD.cc deleted file mode 100644 index c1e579db496fe..0000000000000 --- a/tags/20070517_before_mds_merge/osd/OSD.cc +++ /dev/null @@ -1,3494 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "include/types.h" - -#include "OSD.h" -#include "OSDMap.h" - -#ifdef USE_OBFS -# include "OBFSStore.h" -#else -# include "FakeStore.h" -#endif - -#include "ebofs/Ebofs.h" - -#ifdef USE_OSBDB -#include "osbdb/OSBDB.h" -#endif // USE_OSBDB - -#include "Ager.h" - - -#include "msg/Messenger.h" -#include "msg/Message.h" - -#include "messages/MGenericMessage.h" -#include "messages/MPing.h" -#include "messages/MPingAck.h" -#include "messages/MOSDPing.h" -#include "messages/MOSDFailure.h" -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" -#include "messages/MOSDBoot.h" -#include "messages/MOSDIn.h" -#include "messages/MOSDOut.h" - -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" -#include "messages/MOSDPGNotify.h" -#include "messages/MOSDPGQuery.h" -#include "messages/MOSDPGLog.h" -#include "messages/MOSDPGRemove.h" - -#include "common/Logger.h" -#include "common/LogType.h" -#include "common/Timer.h" -#include "common/ThreadPool.h" - -#include -#include -#include -#include - - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cout << dbeginl << g_clock.now() << " osd" << whoami << " " << (osdmap ? osdmap->get_epoch():0) << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cerr << dbeginl << g_clock.now() << " osd" << whoami << " " << (osdmap ? 
osdmap->get_epoch():0) << " " - -char *osd_base_path = "./osddata"; -char *ebofs_base_path = "./dev"; - - -object_t SUPERBLOCK_OBJECT(0,0); - - -// force remount hack for performance testing FakeStore -class C_Remount : public Context { - OSD *osd; -public: - C_Remount(OSD *o) : osd(o) {} - void finish(int) { - osd->force_remount(); - } -}; - -void OSD::force_remount() -{ - dout(0) << "forcing remount" << dendl; - osd_lock.Lock(); - { - store->umount(); - store->mount(); - } - osd_lock.Unlock(); - dout(0) << "finished remount" << dendl; -} -// - - -// cons/des - -LogType osd_logtype; - -OSD::OSD(int id, Messenger *m, MonMap *mm, char *dev) : timer(osd_lock) -{ - whoami = id; - messenger = m; - monmap = mm; - - osdmap = 0; - boot_epoch = 0; - - last_tid = 0; - num_pulling = 0; - - state = STATE_BOOTING; - - hb_stat_ops = 0; - hb_stat_qlen = 0; - - pending_ops = 0; - waiting_for_no_ops = false; - - if (g_conf.osd_remount_at) - timer.add_event_after(g_conf.osd_remount_at, new C_Remount(this)); - - - // init object store - // try in this order: - // dev/osd$num - // dev/osd.$hostname - // dev/osd.all - - if (dev) { - strcpy(dev_path,dev); - } else { - char hostname[100]; - hostname[0] = 0; - gethostname(hostname,100); - - sprintf(dev_path, "%s/osd%d", ebofs_base_path, whoami); - - struct stat sta; - if (::lstat(dev_path, &sta) != 0) - sprintf(dev_path, "%s/osd.%s", ebofs_base_path, hostname); - - if (::lstat(dev_path, &sta) != 0) - sprintf(dev_path, "%s/osd.all", ebofs_base_path); - } - - if (g_conf.ebofs) { - store = new Ebofs(dev_path); - //store->_fake_writes(true); - } -#ifdef USE_OBFS - else if (g_conf.uofs) { - store = new OBFSStore(whoami, NULL, dev_path); - } -#endif -#ifdef USE_OSBDB - else if (g_conf.bdbstore) { - store = new OSBDB(dev_path); - } -#endif // USE_OSBDB - else { - sprintf(dev_path, "osddata/osd%d", whoami); - store = new FakeStore(dev_path, whoami); - } - -} - -OSD::~OSD() -{ - if (threadpool) { delete threadpool; threadpool = 0; } - if (osdmap) 
{ delete osdmap; osdmap = 0; } - //if (monitor) { delete monitor; monitor = 0; } - if (messenger) { delete messenger; messenger = 0; } - if (logger) { delete logger; logger = 0; } - if (store) { delete store; store = 0; } -} - -int OSD::init() -{ - osd_lock.Lock(); - { - // mkfs? - if (g_conf.osd_mkfs) { - dout(2) << "mkfs" << dendl; - store->mkfs(); - - // make up a superblock - //superblock.fsid = ???; - superblock.whoami = whoami; - } - - // mount. - dout(2) << "mounting " << dev_path << dendl; - int r = store->mount(); - assert(r>=0); - - if (g_conf.osd_mkfs) { - // age? - if (g_conf.osd_age_time != 0) { - dout(2) << "age" << dendl; - Ager ager(store); - if (g_conf.osd_age_time < 0) - ager.load_freelist(); - else - ager.age(g_conf.osd_age_time, - g_conf.osd_age, - g_conf.osd_age - .05, - 50000, - g_conf.osd_age - .05); - } - } - else { - dout(2) << "boot" << dendl; - - // read superblock - read_superblock(); - - // load up pgs (as they previously existed) - load_pgs(); - - dout(2) << "superblock: i am osd" << superblock.whoami << dendl; - assert(whoami == superblock.whoami); - } - - - // log - char name[80]; - sprintf(name, "osd%02d", whoami); - logger = new Logger(name, (LogType*)&osd_logtype); - osd_logtype.add_set("opq"); - osd_logtype.add_inc("op"); - osd_logtype.add_inc("c_rd"); - osd_logtype.add_inc("c_rdb"); - osd_logtype.add_inc("c_wr"); - osd_logtype.add_inc("c_wrb"); - - osd_logtype.add_inc("r_push"); - osd_logtype.add_inc("r_pushb"); - osd_logtype.add_inc("r_wr"); - osd_logtype.add_inc("r_wrb"); - - osd_logtype.add_inc("rlnum"); - - osd_logtype.add_set("numpg"); - osd_logtype.add_set("pingset"); - - osd_logtype.add_set("buf"); - - osd_logtype.add_inc("map"); - osd_logtype.add_inc("mapi"); - osd_logtype.add_inc("mapidup"); - osd_logtype.add_inc("mapf"); - osd_logtype.add_inc("mapfdup"); - - // request thread pool - { - char name[80]; - sprintf(name,"osd%d.threadpool", whoami); - threadpool = new ThreadPool(name, g_conf.osd_maxthreads, - 
static_dequeueop, - this); - } - - // i'm ready! - messenger->set_dispatcher(this); - - // announce to monitor i exist and have booted. - int mon = monmap->pick_mon(); - messenger->send_message(new MOSDBoot(superblock), monmap->get_inst(mon)); - - // start the heart - timer.add_event_after(g_conf.osd_heartbeat_interval, new C_Heartbeat(this)); - } - osd_lock.Unlock(); - - //dout(0) << "osd_rep " << g_conf.osd_rep << dendl; - - return 0; -} - -int OSD::shutdown() -{ - dout(1) << "shutdown" << dendl; - - state = STATE_STOPPING; - - // cancel timers - timer.cancel_all(); - timer.join(); - - // finish ops - wait_for_no_ops(); - - // stop threads - delete threadpool; - threadpool = 0; - - // close pgs - for (hash_map::iterator p = pg_map.begin(); - p != pg_map.end(); - p++) { - delete p->second; - } - pg_map.clear(); - - // shut everything else down - //monitor->shutdown(); - messenger->shutdown(); - - osd_lock.Unlock(); - int r = store->umount(); - osd_lock.Lock(); - return r; -} - - - -void OSD::write_superblock(ObjectStore::Transaction& t) -{ - dout(10) << "write_superblock " << superblock << dendl; - - bufferlist bl; - bl.append((char*)&superblock, sizeof(superblock)); - t.write(SUPERBLOCK_OBJECT, 0, sizeof(superblock), bl); -} - -int OSD::read_superblock() -{ - bufferlist bl; - int r = store->read(SUPERBLOCK_OBJECT, 0, sizeof(superblock), bl); - if (bl.length() != sizeof(superblock)) { - dout(10) << "read_superblock failed, r = " << r << ", i got " << bl.length() << " bytes, not " << sizeof(superblock) << dendl; - return -1; - } - - bl.copy(0, sizeof(superblock), (char*)&superblock); - - dout(10) << "read_superblock " << superblock << dendl; - - // load up "current" osdmap - assert(!osdmap); - osdmap = new OSDMap; - bl.clear(); - get_map_bl(superblock.current_epoch, bl); - osdmap->decode(bl); - - assert(whoami == superblock.whoami); // fixme! 
- return 0; -} - - -// object locks - -PG *OSD::lock_pg(pg_t pgid) -{ - osd_lock.Lock(); - PG *pg = _lock_pg(pgid); - osd_lock.Unlock(); - return pg; -} - -PG *OSD::_lock_pg(pg_t pgid) -{ - assert(pg_map.count(pgid)); - - if (pg_lock.count(pgid)) { - Cond c; - dout(15) << "lock_pg " << pgid << " waiting as " << &c << dendl; - //cerr << "lock_pg " << pgid << " waiting as " << &c << dendl; - - list& ls = pg_lock_waiters[pgid]; // this is commit, right? - ls.push_back(&c); - - while (pg_lock.count(pgid) || - ls.front() != &c) - c.Wait(osd_lock); - - assert(ls.front() == &c); - ls.pop_front(); - if (ls.empty()) - pg_lock_waiters.erase(pgid); - } - - dout(15) << "lock_pg " << pgid << dendl; - pg_lock.insert(pgid); - - return pg_map[pgid]; -} - -void OSD::unlock_pg(pg_t pgid) -{ - osd_lock.Lock(); - _unlock_pg(pgid); - osd_lock.Unlock(); -} - -void OSD::_unlock_pg(pg_t pgid) -{ - // unlock - assert(pg_lock.count(pgid)); - pg_lock.erase(pgid); - - if (pg_lock_waiters.count(pgid)) { - // someone is in line - Cond *c = pg_lock_waiters[pgid].front(); - assert(c); - dout(15) << "unlock_pg " << pgid << " waking up next guy " << c << dendl; - c->Signal(); - } else { - // nobody waiting - dout(15) << "unlock_pg " << pgid << dendl; - } -} - -void OSD::_remove_pg(pg_t pgid) -{ - dout(10) << "_remove_pg " << pgid << dendl; - - // remove from store - list olist; - store->collection_list(pgid, olist); - - ObjectStore::Transaction t; - { - for (list::iterator p = olist.begin(); - p != olist.end(); - p++) - t.remove(*p); - t.remove_collection(pgid); - t.remove(pgid.to_object()); // log too - } - store->apply_transaction(t); - - // hose from memory - delete pg_map[pgid]; - pg_map.erase(pgid); -} - - -void OSD::activate_pg(pg_t pgid, epoch_t epoch) -{ - osd_lock.Lock(); - { - if (pg_map.count(pgid)) { - PG *pg = _lock_pg(pgid); - if (pg->is_crashed() && - pg->is_replay() && - pg->get_role() == 0 && - pg->info.history.same_primary_since <= epoch) { - ObjectStore::Transaction t; - 
pg->activate(t); - store->apply_transaction(t); - } - _unlock_pg(pgid); - } - } - - // finishers? - if (finished.empty()) { - osd_lock.Unlock(); - } else { - list waiting; - waiting.splice(waiting.begin(), finished); - - osd_lock.Unlock(); - - for (list::iterator it = waiting.begin(); - it != waiting.end(); - it++) { - dispatch(*it); - } - } -} - - -// ------------------------------------- - -void OSD::heartbeat() -{ - utime_t now = g_clock.now(); - utime_t since = now; - since.sec_ref() -= g_conf.osd_heartbeat_interval; - - // calc my stats - float avg_qlen = 0; - if (hb_stat_ops) avg_qlen = (float)hb_stat_qlen / (float)hb_stat_ops; - - dout(5) << "heartbeat " << now - << ": ops " << hb_stat_ops - << ", avg qlen " << avg_qlen - << dendl; - - // reset until next time around - hb_stat_ops = 0; - hb_stat_qlen = 0; - - // send pings - set pingset; - for (hash_map::iterator i = pg_map.begin(); - i != pg_map.end(); - i++) { - PG *pg = i->second; - - // we want to ping the primary. - if (pg->get_role() <= 0) continue; - if (pg->acting.size() < 1) continue; - - if (pg->last_heartbeat < since) { - pg->last_heartbeat = now; - pingset.insert(pg->acting[0]); - } - } - for (set::iterator i = pingset.begin(); - i != pingset.end(); - i++) { - _share_map_outgoing( osdmap->get_inst(*i) ); - messenger->send_message(new MOSDPing(osdmap->get_epoch(), avg_qlen), - osdmap->get_inst(*i)); - } - - if (logger) logger->set("pingset", pingset.size()); - - // hack: fake reorg? 
- if (osdmap && g_conf.fake_osdmap_updates) { - int mon = monmap->pick_mon(); - if ((rand() % g_conf.fake_osdmap_updates) == 0) { - //if ((rand() % (g_conf.num_osd / g_conf.fake_osdmap_updates)) == whoami / g_conf.fake_osdmap_updates) { - messenger->send_message(new MOSDIn(osdmap->get_epoch()), - monmap->get_inst(mon)); - } - /* - if (osdmap->is_out(whoami)) { - messenger->send_message(new MOSDIn(osdmap->get_epoch()), - MSG_ADDR_MON(mon), monmap->get_inst(mon)); - } - else if ((rand() % g_conf.fake_osdmap_updates) == 0) { - //messenger->send_message(new MOSDOut(osdmap->get_epoch()), - //MSG_ADDR_MON(mon), monmap->get_inst(mon)); - } - } - */ - } - - // schedule next! randomly. - float wait = .5 + ((float)(rand() % 10)/10.0) * (float)g_conf.osd_heartbeat_interval; - timer.add_event_after(wait, new C_Heartbeat(this)); -} - - - -// -------------------------------------- -// dispatch - -bool OSD::_share_map_incoming(const entity_inst_t& inst, epoch_t epoch) -{ - bool shared = false; - - // does client have old map? - if (inst.name.is_client()) { - if (epoch < osdmap->get_epoch()) { - dout(10) << inst.name << " has old map " << epoch << " < " << osdmap->get_epoch() << dendl; - send_incremental_map(epoch, inst, true); - shared = true; - } - } - - // does peer have old map? - if (inst.name.is_osd()) { - // remember - if (peer_map_epoch[inst.name] < epoch) - peer_map_epoch[inst.name] = epoch; - - // older? - if (peer_map_epoch[inst.name] < osdmap->get_epoch()) { - dout(10) << inst.name << " has old map " << epoch << " < " << osdmap->get_epoch() << dendl; - send_incremental_map(epoch, inst, true); - peer_map_epoch[inst.name] = osdmap->get_epoch(); // so we don't send it again. - shared = true; - } - } - - return shared; -} - - -void OSD::_share_map_outgoing(const entity_inst_t& inst) -{ - assert(inst.name.is_osd()); - - if (inst.name.is_osd()) { - // send map? 
- if (peer_map_epoch.count(inst.name)) { - epoch_t pe = peer_map_epoch[inst.name]; - if (pe < osdmap->get_epoch()) { - send_incremental_map(pe, inst, true); - peer_map_epoch[inst.name] = osdmap->get_epoch(); - } - } else { - // no idea about peer's epoch. - // ??? send recent ??? - // do nothing. - } - } -} - - - -void OSD::dispatch(Message *m) -{ - // lock! - osd_lock.Lock(); - - switch (m->get_type()) { - - // -- don't need lock -- - case MSG_PING: - dout(10) << "ping from " << m->get_source() << dendl; - delete m; - break; - - // -- don't need OSDMap -- - - /* - // host monitor - case MSG_PING_ACK: - case MSG_FAILURE_ACK: - monitor->proc_message(m); - break; - */ - - // map and replication - case MSG_OSD_MAP: - handle_osd_map((MOSDMap*)m); - break; - - // osd - case MSG_SHUTDOWN: - shutdown(); - delete m; - break; - - - - // -- need OSDMap -- - - default: - { - // no map? starting up? - if (!osdmap) { - dout(7) << "no OSDMap, not booted" << dendl; - waiting_for_osdmap.push_back(m); - break; - } - - // down? - if (osdmap->is_down(whoami)) { - dout(7) << "i am marked down, dropping " << *m << dendl; - delete m; - break; - } - - - - - // need OSDMap - switch (m->get_type()) { - - case MSG_OSD_PING: - // take note. - handle_osd_ping((MOSDPing*)m); - break; - - case MSG_OSD_PG_NOTIFY: - handle_pg_notify((MOSDPGNotify*)m); - break; - case MSG_OSD_PG_QUERY: - handle_pg_query((MOSDPGQuery*)m); - break; - case MSG_OSD_PG_LOG: - handle_pg_log((MOSDPGLog*)m); - break; - case MSG_OSD_PG_REMOVE: - handle_pg_remove((MOSDPGRemove*)m); - break; - - case MSG_OSD_OP: - handle_op((MOSDOp*)m); - break; - - // for replication etc. - case MSG_OSD_OPREPLY: - handle_op_reply((MOSDOpReply*)m); - break; - - - default: - dout(1) << " got unknown message " << m->get_type() << dendl; - assert(0); - } - } - } - - // finishers? 
- if (!finished.empty()) { - list waiting; - waiting.splice(waiting.begin(), finished); - - osd_lock.Unlock(); - - for (list::iterator it = waiting.begin(); - it != waiting.end(); - it++) { - dispatch(*it); - } - return; - } - - osd_lock.Unlock(); -} - - -void OSD::ms_handle_failure(Message *m, const entity_inst_t& inst) -{ - entity_name_t dest = inst.name; - - if (g_conf.ms_die_on_failure) { - dout(0) << "ms_handle_failure " << inst << " on " << *m << dendl; - exit(0); - } - - if (dest.is_osd()) { - // failed osd. drop message, report to mon. - int mon = monmap->pick_mon(); - dout(0) << "ms_handle_failure " << inst - << ", dropping and reporting to mon" << mon - << " " << *m - << dendl; - messenger->send_message(new MOSDFailure(inst, osdmap->get_epoch()), - monmap->get_inst(mon)); - delete m; - } else if (dest.is_mon()) { - // resend to a different monitor. - int mon = monmap->pick_mon(true); - dout(0) << "ms_handle_failure " << inst - << ", resending to mon" << mon - << " " << *m - << dendl; - messenger->send_message(m, monmap->get_inst(mon)); - } - else { - // client? - dout(0) << "ms_handle_failure " << inst - << ", dropping " << *m << dendl; - delete m; - } -} - - - - -void OSD::handle_osd_ping(MOSDPing *m) -{ - dout(20) << "osdping from " << m->get_source() << dendl; - _share_map_incoming(m->get_source_inst(), ((MOSDPing*)m)->map_epoch); - - int from = m->get_source().num(); - peer_qlen[from] = m->avg_qlen; - - //if (!m->ack) - //messenger->send_message(new MOSDPing(osdmap->get_epoch(), true), - //m->get_source()); - - delete m; -} - - - - -// ===================================================== -// MAP - -void OSD::wait_for_new_map(Message *m) -{ - // ask - if (waiting_for_osdmap.empty()) { - int mon = monmap->pick_mon(); - messenger->send_message(new MOSDGetMap(osdmap->get_epoch()), - monmap->get_inst(mon)); - } - - waiting_for_osdmap.push_back(m); -} - - -/** update_map - * assimilate new OSDMap(s). scan pgs, etc. 
- */ -void OSD::handle_osd_map(MOSDMap *m) -{ - wait_for_no_ops(); - - assert(osd_lock.is_locked()); - - ObjectStore::Transaction t; - - if (osdmap) { - dout(3) << "handle_osd_map epochs [" - << m->get_first() << "," << m->get_last() - << "], i have " << osdmap->get_epoch() - << dendl; - } else { - dout(3) << "handle_osd_map epochs [" - << m->get_first() << "," << m->get_last() - << "], i have none" - << dendl; - osdmap = new OSDMap; - boot_epoch = m->get_last(); // hrm...? - } - - logger->inc("mapmsg"); - - // store them? - for (map::iterator p = m->maps.begin(); - p != m->maps.end(); - p++) { - object_t oid = get_osdmap_object_name(p->first); - if (store->exists(oid)) { - dout(10) << "handle_osd_map already had full map epoch " << p->first << dendl; - logger->inc("mapfdup"); - bufferlist bl; - get_map_bl(p->first, bl); - dout(10) << " .. it is " << bl.length() << " bytes" << dendl; - continue; - } - - dout(10) << "handle_osd_map got full map epoch " << p->first << dendl; - //t.write(oid, 0, p->second.length(), p->second); - store->write(oid, 0, p->second.length(), p->second, 0); - - if (p->first > superblock.newest_map) - superblock.newest_map = p->first; - if (p->first < superblock.oldest_map || - superblock.oldest_map == 0) - superblock.oldest_map = p->first; - - logger->inc("mapf"); - } - for (map::iterator p = m->incremental_maps.begin(); - p != m->incremental_maps.end(); - p++) { - object_t oid = get_inc_osdmap_object_name(p->first); - if (store->exists(oid)) { - dout(10) << "handle_osd_map already had incremental map epoch " << p->first << dendl; - logger->inc("mapidup"); - bufferlist bl; - get_inc_map_bl(p->first, bl); - dout(10) << " .. 
it is " << bl.length() << " bytes" << dendl; - continue; - } - - dout(10) << "handle_osd_map got incremental map epoch " << p->first << dendl; - //t.write(oid, 0, p->second.length(), p->second); - store->write(oid, 0, p->second.length(), p->second, 0); - - if (p->first > superblock.newest_map) - superblock.newest_map = p->first; - if (p->first < superblock.oldest_map || - superblock.oldest_map == 0) - superblock.oldest_map = p->first; - - logger->inc("mapi"); - } - - // advance if we can - bool advanced = false; - - if (m->get_source().is_mon() && is_booting()) - advanced = true; - - epoch_t cur = superblock.current_epoch; - while (cur < superblock.newest_map) { - bufferlist bl; - if (m->incremental_maps.count(cur+1) || - store->exists(get_inc_osdmap_object_name(cur+1))) { - dout(10) << "handle_osd_map decoding inc map epoch " << cur+1 << dendl; - - bufferlist bl; - if (m->incremental_maps.count(cur+1)) - bl = m->incremental_maps[cur+1]; - else - get_inc_map_bl(cur+1, bl); - - OSDMap::Incremental inc; - int off = 0; - inc.decode(bl, off); - - osdmap->apply_incremental(inc); - - // archive the full map - bl.clear(); - osdmap->encode(bl); - t.write( get_osdmap_object_name(cur+1), 0, bl.length(), bl); - - // notify messenger - for (map::iterator i = inc.new_down.begin(); - i != inc.new_down.end(); - i++) { - int osd = i->first; - if (osd == whoami) continue; - messenger->mark_down(i->second.addr); - peer_map_epoch.erase(MSG_ADDR_OSD(osd)); - - // kick any replica ops - for (hash_map::iterator it = pg_map.begin(); - it != pg_map.end(); - it++) { - PG *pg = it->second; - - _lock_pg(pg->info.pgid); - { - list ls; // do async; repop_ack() may modify pg->repop_gather - for (map::iterator p = pg->repop_gather.begin(); - p != pg->repop_gather.end(); - p++) { - //dout(-1) << "checking repop tid " << p->first << dendl; - if (p->second->waitfor_ack.count(osd) || - p->second->waitfor_commit.count(osd)) - ls.push_back(p->second); - } - for (list::iterator p = ls.begin(); - p != 
ls.end(); - p++) - repop_ack(pg, *p, -1, true, osd); - } - _unlock_pg(pg->info.pgid); - } - } - for (map::iterator i = inc.new_up.begin(); - i != inc.new_up.end(); - i++) { - if (i->first == whoami) continue; - peer_map_epoch.erase(MSG_ADDR_OSD(i->first)); - } - } - else if (m->maps.count(cur+1) || - store->exists(get_osdmap_object_name(cur+1))) { - dout(10) << "handle_osd_map decoding full map epoch " << cur+1 << dendl; - bufferlist bl; - if (m->maps.count(cur+1)) - bl = m->maps[cur+1]; - else - get_map_bl(cur+1, bl); - osdmap->decode(bl); - - // FIXME BUG: need to notify messenger of ups/downs!! - } - else { - dout(10) << "handle_osd_map missing epoch " << cur+1 << dendl; - int mon = monmap->pick_mon(); - messenger->send_message(new MOSDGetMap(cur), monmap->get_inst(mon)); - break; - } - - cur++; - superblock.current_epoch = cur; - advance_map(t); - advanced = true; - } - - // all the way? - if (advanced && cur == superblock.newest_map) { - // yay! - activate_map(t); - - // process waiters - take_waiters(waiting_for_osdmap); - } - - // write updated pg state to store - for (hash_map::iterator i = pg_map.begin(); - i != pg_map.end(); - i++) { - pg_t pgid = i->first; - PG *pg = i->second; - t.collection_setattr( pgid, "info", &pg->info, sizeof(pg->info)); - } - - // superblock and commit - write_superblock(t); - store->apply_transaction(t); - - //if (osdmap->get_epoch() == 1) store->sync(); // in case of early death, blah - - delete m; -} - - -/** - * scan placement groups, initiate any replication - * activities. 
- */ -void OSD::advance_map(ObjectStore::Transaction& t) -{ - dout(7) << "advance_map epoch " << osdmap->get_epoch() - << " " << pg_map.size() << " pgs" - << dendl; - - if (osdmap->is_mkfs()) { - ps_t maxps = 1ULL << osdmap->get_pg_bits(); - ps_t maxlps = 1ULL << osdmap->get_localized_pg_bits(); - dout(1) << "mkfs on " << osdmap->get_pg_bits() << " bits, " << maxps << " pgs" << dendl; - assert(osdmap->get_epoch() == 1); - - //cerr << "osdmap " << osdmap->get_ctime() << " logger start " << logger->get_start() << dendl; - logger->set_start( osdmap->get_ctime() ); - - assert(g_conf.osd_mkfs); // make sure we did a mkfs! - - // create PGs - for (int nrep = 1; - nrep <= MIN(g_conf.num_osd, g_conf.osd_max_rep); // for low osd counts.. hackish bleh - nrep++) { - for (ps_t ps = 0; ps < maxps; ++ps) { - vector acting; - pg_t pgid = osdmap->ps_nrep_to_pg(ps, nrep); - int nrep = osdmap->pg_to_acting_osds(pgid, acting); - int role = osdmap->calc_pg_role(whoami, acting, nrep); - if (role < 0) continue; - - PG *pg = create_pg(pgid, t); - pg->set_role(role); - pg->acting.swap(acting); - pg->last_epoch_started_any = - pg->info.last_epoch_started = - pg->info.history.same_since = - pg->info.history.same_primary_since = - pg->info.history.same_acker_since = osdmap->get_epoch(); - pg->activate(t); - - dout(7) << "created " << *pg << dendl; - } - - for (ps_t ps = 0; ps < maxlps; ++ps) { - // local PG too - vector acting; - pg_t pgid = osdmap->ps_osd_nrep_to_pg(ps, whoami, nrep); - int nrep = osdmap->pg_to_acting_osds(pgid, acting); - int role = osdmap->calc_pg_role(whoami, acting, nrep); - - PG *pg = create_pg(pgid, t); - pg->acting.swap(acting); - pg->set_role(role); - pg->last_epoch_started_any = - pg->info.last_epoch_started = - pg->info.history.same_primary_since = - pg->info.history.same_acker_since = - pg->info.history.same_since = osdmap->get_epoch(); - pg->activate(t); - - dout(7) << "created " << *pg << dendl; - } - } - - dout(1) << "mkfs done, created " << pg_map.size() << " 
pgs" << dendl; - - } else { - // scan existing pg's - for (hash_map::iterator it = pg_map.begin(); - it != pg_map.end(); - it++) { - pg_t pgid = it->first; - PG *pg = it->second; - - // did i finish this epoch? - if (pg->is_active()) { - pg->info.last_epoch_finished = osdmap->get_epoch()-1; - } - - // get new acting set - vector tacting; - int nrep = osdmap->pg_to_acting_osds(pgid, tacting); - int role = osdmap->calc_pg_role(whoami, tacting, nrep); - - // no change? - if (tacting == pg->acting) - continue; - - // -- there was a change! -- - _lock_pg(pgid); - - int oldrole = pg->get_role(); - int oldprimary = pg->get_primary(); - int oldacker = pg->get_acker(); - vector oldacting = pg->acting; - - // update PG - pg->acting.swap(tacting); - pg->set_role(role); - - // did primary|acker change? - pg->info.history.same_since = osdmap->get_epoch(); - if (oldprimary != pg->get_primary()) { - pg->info.history.same_primary_since = osdmap->get_epoch(); - pg->cancel_recovery(); - } - if (oldacker != pg->get_acker()) { - pg->info.history.same_acker_since = osdmap->get_epoch(); - } - - // deactivate. - pg->state_clear(PG::STATE_ACTIVE); - - // reset primary state? - if (oldrole == 0 || pg->get_role() == 0) - pg->clear_primary_state(); - - // apply any repops in progress. - if (oldacker == whoami) { - // apply repops - for (map::iterator p = pg->repop_gather.begin(); - p != pg->repop_gather.end(); - p++) { - if (!p->second->applied) - apply_repop(pg, p->second); - delete p->second->op; - delete p->second; - } - pg->repop_gather.clear(); - - // and repop waiters - for (map >::iterator p = pg->waiting_for_repop.begin(); - p != pg->waiting_for_repop.end(); - p++) - for (list::iterator pm = p->second.begin(); - pm != p->second.end(); - pm++) - delete *pm; - pg->waiting_for_repop.clear(); - } - - if (role != oldrole) { - // old primary? 
- if (oldrole == 0) { - pg->state_clear(PG::STATE_CLEAN); - - // take replay queue waiters - list ls; - for (map::iterator it = pg->replay_queue.begin(); - it != pg->replay_queue.end(); - it++) - ls.push_back(it->second); - pg->replay_queue.clear(); - take_waiters(ls); - - // take active waiters - take_waiters(pg->waiting_for_active); - - // take object waiters - for (hash_map >::iterator it = pg->waiting_for_missing_object.begin(); - it != pg->waiting_for_missing_object.end(); - it++) - take_waiters(it->second); - pg->waiting_for_missing_object.clear(); - } - - // new primary? - if (role == 0) { - // i am new primary - pg->state_clear(PG::STATE_STRAY); - } else { - // i am now replica|stray. we need to send a notify. - pg->state_set(PG::STATE_STRAY); - - if (nrep == 0) { - pg->state_set(PG::STATE_CRASHED); - dout(1) << *pg << " is crashed" << dendl; - } - } - - // my role changed. - dout(10) << *pg << " " << oldacting << " -> " << pg->acting - << ", role " << oldrole << " -> " << role << dendl; - - } else { - // no role change. - // did primary change? - if (pg->get_primary() != oldprimary) { - // we need to announce - pg->state_set(PG::STATE_STRAY); - - dout(10) << *pg << " " << oldacting << " -> " << pg->acting - << ", acting primary " - << oldprimary << " -> " << pg->get_primary() - << dendl; - } else { - // primary is the same. - if (role == 0) { - // i am (still) primary. but my replica set changed. 
- pg->state_clear(PG::STATE_CLEAN); - pg->state_clear(PG::STATE_REPLAY); - - dout(10) << *pg << " " << oldacting << " -> " << pg->acting - << ", replicas changed" << dendl; - } - } - } - - - _unlock_pg(pgid); - } - } -} - -void OSD::activate_map(ObjectStore::Transaction& t) -{ - dout(7) << "activate_map version " << osdmap->get_epoch() << dendl; - - map< int, list > notify_list; // primary -> list - map< int, map > query_map; // peer -> PG -> get_summary_since - - // scan pg's - for (hash_map::iterator it = pg_map.begin(); - it != pg_map.end(); - it++) { - //pg_t pgid = it->first; - PG *pg = it->second; - - if (pg->is_active()) { - // update started counter - pg->info.last_epoch_started = osdmap->get_epoch(); - } - else if (pg->get_role() == 0 && !pg->is_active()) { - // i am (inactive) primary - pg->build_prior(); - pg->peer(t, query_map); - } - else if (pg->is_stray() && - pg->get_primary() >= 0) { - // i am residual|replica - notify_list[pg->get_primary()].push_back(pg->info); - } - - } - - if (osdmap->is_mkfs()) // hack: skip the queries/summaries if it's a mkfs - return; - - // notify? (residual|replica) - do_notifies(notify_list); - - // do queries. - do_queries(query_map); - - logger->set("numpg", pg_map.size()); -} - - -void OSD::send_incremental_map(epoch_t since, const entity_inst_t& inst, bool full) -{ - dout(10) << "send_incremental_map " << since << " -> " << osdmap->get_epoch() - << " to " << inst << dendl; - - MOSDMap *m = new MOSDMap; - - for (epoch_t e = osdmap->get_epoch(); - e > since; - e--) { - bufferlist bl; - if (get_inc_map_bl(e,bl)) { - m->incremental_maps[e].claim(bl); - } else if (get_map_bl(e,bl)) { - m->maps[e].claim(bl); - if (!full) break; - } - else { - assert(0); // we should have all maps. 
- } - } - - messenger->send_message(m, inst); -} - -bool OSD::get_map_bl(epoch_t e, bufferlist& bl) -{ - return store->read(get_osdmap_object_name(e), 0, 0, bl) >= 0; -} - -bool OSD::get_inc_map_bl(epoch_t e, bufferlist& bl) -{ - return store->read(get_inc_osdmap_object_name(e), 0, 0, bl) >= 0; -} - -void OSD::get_map(epoch_t epoch, OSDMap &m) -{ - // find a complete map - list incs; - epoch_t e; - for (e = epoch; e > 0; e--) { - bufferlist bl; - if (get_map_bl(e, bl)) { - //dout(10) << "get_map " << epoch << " full " << e << dendl; - m.decode(bl); - break; - } else { - OSDMap::Incremental inc; - bool got = get_inc_map(e, inc); - assert(got); - incs.push_front(inc); - } - } - assert(e > 0); - - // apply incrementals - for (e++; e <= epoch; e++) { - //dout(10) << "get_map " << epoch << " inc " << e << dendl; - m.apply_incremental( incs.front() ); - incs.pop_front(); - } -} - - -bool OSD::get_inc_map(epoch_t e, OSDMap::Incremental &inc) -{ - bufferlist bl; - if (!get_inc_map_bl(e, bl)) - return false; - int off = 0; - inc.decode(bl, off); - return true; -} - - - - - -bool OSD::require_current_map(Message *m, epoch_t ep) -{ - // older map? - if (ep < osdmap->get_epoch()) { - dout(7) << "require_current_map epoch " << ep << " < " << osdmap->get_epoch() << dendl; - delete m; // discard and ignore. - return false; - } - - // newer map? - if (ep > osdmap->get_epoch()) { - dout(7) << "require_current_map epoch " << ep << " > " << osdmap->get_epoch() << dendl; - wait_for_new_map(m); - return false; - } - - assert(ep == osdmap->get_epoch()); - return true; -} - - -/* - * require that we have same (or newer) map, and that - * the source is the pg primary. - */ -bool OSD::require_same_or_newer_map(Message *m, epoch_t epoch) -{ - dout(10) << "require_same_or_newer_map " << epoch << " (i am " << osdmap->get_epoch() << ")" << dendl; - - // newer map? 
- if (epoch > osdmap->get_epoch()) { - dout(7) << " from newer map epoch " << epoch << " > " << osdmap->get_epoch() << dendl; - wait_for_new_map(m); - return false; - } - - if (epoch < boot_epoch) { - dout(7) << " from pre-boot epoch " << epoch << " < " << boot_epoch << dendl; - delete m; - return false; - } - - return true; -} - - - - -// ====================================================== -// REPLICATION - -// PG - -bool OSD::pg_exists(pg_t pgid) -{ - return store->collection_exists(pgid); -} - -PG *OSD::create_pg(pg_t pgid, ObjectStore::Transaction& t) -{ - if (pg_map.count(pgid)) { - dout(0) << "create_pg on " << pgid << ", already have " << *pg_map[pgid] << dendl; - } - assert(pg_map.count(pgid) == 0); - assert(!pg_exists(pgid)); - - PG *pg = new PG(this, pgid); - pg_map[pgid] = pg; - - t.create_collection(pgid); - - return pg; -} - - - - -PG *OSD::get_pg(pg_t pgid) -{ - if (pg_map.count(pgid)) - return pg_map[pgid]; - return 0; -} - -void OSD::load_pgs() -{ - dout(10) << "load_pgs" << dendl; - assert(pg_map.empty()); - - list ls; - store->list_collections(ls); - - for (list::iterator it = ls.begin(); - it != ls.end(); - it++) { - pg_t pgid = *it; - - PG *pg = new PG(this, pgid); - pg_map[pgid] = pg; - - // read pg info - store->collection_getattr(pgid, "info", &pg->info, sizeof(pg->info)); - - // read pg log - pg->read_log(store); - - // generate state for current mapping - int nrep = osdmap->pg_to_acting_osds(pgid, pg->acting); - int role = osdmap->calc_pg_role(whoami, pg->acting, nrep); - pg->set_role(role); - - dout(10) << "load_pgs loaded " << *pg << " " << pg->log << dendl; - } -} - -/** - * check epochs starting from start to verify the pg acting set hasn't changed - * up until now - */ -void OSD::project_pg_history(pg_t pgid, PG::Info::History& h, epoch_t from) -{ - dout(15) << "project_pg_history " << pgid - << " from " << from << " to " << osdmap->get_epoch() - << ", start " << h - << dendl; - - vector last; - osdmap->pg_to_acting_osds(pgid, 
last); - - for (epoch_t e = osdmap->get_epoch()-1; - e >= from; - e--) { - // verify during intermediate epoch - OSDMap oldmap; - get_map(e, oldmap); - - vector acting; - oldmap.pg_to_acting_osds(pgid, acting); - - // acting set change? - if (acting != last && - e <= h.same_since) { - dout(15) << "project_pg_history " << pgid << " changed in " << e+1 - << " from " << acting << " -> " << last << dendl; - h.same_since = e+1; - } - - // primary change? - if (!(!acting.empty() && !last.empty() && acting[0] == last[0]) && - e <= h.same_primary_since) { - dout(15) << "project_pg_history " << pgid << " primary changed in " << e+1 << dendl; - h.same_primary_since = e+1; - - if (g_conf.osd_rep == OSD_REP_PRIMARY) - h.same_acker_since = h.same_primary_since; - } - - // acker change? - if (g_conf.osd_rep != OSD_REP_PRIMARY) { - if (!(!acting.empty() && !last.empty() && acting[acting.size()-1] == last[last.size()-1]) && - e <= h.same_acker_since) { - dout(15) << "project_pg_history " << pgid << " acker changed in " << e+1 << dendl; - h.same_acker_since = e+1; - } - } - - if (h.same_since > e && - h.same_primary_since > e && - h.same_acker_since > e) break; - } - - dout(15) << "project_pg_history end " << h << dendl; -} - - -/** do_notifies - * Send an MOSDPGNotify to a primary, with a list of PGs that I have - * content for, and they are primary for. 
- */ - -void OSD::do_notifies(map< int, list >& notify_list) -{ - for (map< int, list >::iterator it = notify_list.begin(); - it != notify_list.end(); - it++) { - if (it->first == whoami) { - dout(7) << "do_notify osd" << it->first << " is self, skipping" << dendl; - continue; - } - dout(7) << "do_notify osd" << it->first << " on " << it->second.size() << " PGs" << dendl; - MOSDPGNotify *m = new MOSDPGNotify(osdmap->get_epoch(), it->second); - _share_map_outgoing(osdmap->get_inst(it->first)); - messenger->send_message(m, osdmap->get_inst(it->first)); - } -} - - -/** do_queries - * send out pending queries for info | summaries - */ -void OSD::do_queries(map< int, map >& query_map) -{ - for (map< int, map >::iterator pit = query_map.begin(); - pit != query_map.end(); - pit++) { - int who = pit->first; - dout(7) << "do_queries querying osd" << who - << " on " << pit->second.size() << " PGs" << dendl; - - MOSDPGQuery *m = new MOSDPGQuery(osdmap->get_epoch(), - pit->second); - _share_map_outgoing(osdmap->get_inst(who)); - messenger->send_message(m, osdmap->get_inst(who)); - } -} - - - - -/** PGNotify - * from non-primary to primary - * includes PG::Info. - * NOTE: called with opqueue active. - */ -void OSD::handle_pg_notify(MOSDPGNotify *m) -{ - dout(7) << "handle_pg_notify from " << m->get_source() << dendl; - int from = m->get_source().num(); - - if (!require_same_or_newer_map(m, m->get_epoch())) return; - - ObjectStore::Transaction t; - - // look for unknown PGs i'm primary for - map< int, map > query_map; - - for (list::iterator it = m->get_pg_list().begin(); - it != m->get_pg_list().end(); - it++) { - pg_t pgid = it->pgid; - PG *pg; - - if (pg_map.count(pgid) == 0) { - // same primary? 
- PG::Info::History history = it->history; - project_pg_history(pgid, history, m->get_epoch()); - - if (m->get_epoch() < history.same_primary_since) { - dout(10) << "handle_pg_notify pg " << pgid << " dne, and primary changed in " - << history.same_primary_since << " (msg from " << m->get_epoch() << ")" << dendl; - continue; - } - - // ok, create PG! - pg = create_pg(pgid, t); - osdmap->pg_to_acting_osds(pgid, pg->acting); - pg->set_role(0); - pg->info.history = history; - - pg->last_epoch_started_any = it->last_epoch_started; - pg->build_prior(); - - t.collection_setattr(pgid, "info", (char*)&pg->info, sizeof(pg->info)); - - dout(10) << *pg << " is new" << dendl; - - // kick any waiters - if (waiting_for_pg.count(pgid)) { - take_waiters(waiting_for_pg[pgid]); - waiting_for_pg.erase(pgid); - } - - _lock_pg(pgid); - } else { - // already had it. am i (still) the primary? - pg = _lock_pg(pgid); - if (m->get_epoch() < pg->info.history.same_primary_since) { - dout(10) << *pg << " handle_pg_notify primary changed in " - << pg->info.history.same_primary_since - << " (msg from " << m->get_epoch() << ")" << dendl; - _unlock_pg(pgid); - continue; - } - } - - // ok! - - // stray? - bool acting = pg->is_acting(from); - if (!acting && (*it).last_epoch_started > 0) { - dout(10) << *pg << " osd" << from << " has stray content: " << *it << dendl; - pg->stray_set.insert(from); - pg->state_clear(PG::STATE_CLEAN); - } - - // save info. - bool had = pg->peer_info.count(from); - pg->peer_info[from] = *it; - - if (had) { - if (pg->is_active() && - (*it).is_clean() && acting) { - pg->clean_set.insert(from); - dout(10) << *pg << " osd" << from << " now clean (" << pg->clean_set - << "): " << *it << dendl; - if (pg->is_all_clean()) { - dout(10) << *pg << " now clean on all replicas" << dendl; - pg->state_set(PG::STATE_CLEAN); - pg->clean_replicas(); - } - } else { - // hmm, maybe keep an eye out for cases where we see this, but peer should happen. 
- dout(10) << *pg << " already had notify info from osd" << from << ": " << *it << dendl; - } - } else { - // adjust prior? - if (it->last_epoch_started > pg->last_epoch_started_any) - pg->adjust_prior(); - - // peer - pg->peer(t, query_map); - } - - _unlock_pg(pgid); - } - - unsigned tr = store->apply_transaction(t); - assert(tr == 0); - - do_queries(query_map); - - delete m; -} - - - -/** PGLog - * from non-primary to primary - * includes log and info - * from primary to non-primary - * includes log for use in recovery - * NOTE: called with opqueue active. - */ - -void OSD::handle_pg_log(MOSDPGLog *m) -{ - int from = m->get_source().num(); - const pg_t pgid = m->get_pgid(); - - if (!require_same_or_newer_map(m, m->get_epoch())) return; - if (pg_map.count(pgid) == 0) { - dout(10) << "handle_pg_log don't have pg " << pgid << ", dropping" << dendl; - assert(m->get_epoch() < osdmap->get_epoch()); - delete m; - return; - } - - PG *pg = _lock_pg(pgid); - assert(pg); - - if (m->get_epoch() < pg->info.history.same_since) { - dout(10) << "handle_pg_log " << *pg - << " from " << m->get_source() - << " is old, discarding" - << dendl; - delete m; - return; - } - - dout(7) << "handle_pg_log " << *pg - << " got " << m->log << " " << m->missing - << " from " << m->get_source() << dendl; - - //m->log.print(cout); - - ObjectStore::Transaction t; - - if (pg->is_primary()) { - // i am PRIMARY - assert(pg->peer_log_requested.count(from) || - pg->peer_summary_requested.count(from)); - - pg->proc_replica_log(m->log, m->missing, from); - - // peer - map< int, map > query_map; - pg->peer(t, query_map); - do_queries(query_map); - - } else { - // i am REPLICA - dout(10) << *pg << " got " << m->log << " " << m->missing << dendl; - - // merge log - pg->merge_log(m->log, m->missing, from); - pg->proc_missing(m->log, m->missing, from); - assert(pg->missing.num_lost() == 0); - - // ok activate! 
- pg->activate(t); - } - - unsigned tr = store->apply_transaction(t); - assert(tr == 0); - - _unlock_pg(pgid); - - delete m; -} - - -/** PGQuery - * from primary to replica | stray - * NOTE: called with opqueue active. - */ -void OSD::handle_pg_query(MOSDPGQuery *m) -{ - dout(7) << "handle_pg_query from " << m->get_source() << " epoch " << m->get_epoch() << dendl; - int from = m->get_source().num(); - - if (!require_same_or_newer_map(m, m->get_epoch())) return; - - map< int, list > notify_list; - - for (map::iterator it = m->pg_list.begin(); - it != m->pg_list.end(); - it++) { - pg_t pgid = it->first; - PG *pg = 0; - - if (pg_map.count(pgid) == 0) { - // same primary? - PG::Info::History history = it->second.history; - project_pg_history(pgid, history, m->get_epoch()); - - if (m->get_epoch() < history.same_since) { - dout(10) << " pg " << pgid << " dne, and pg has changed in " - << history.same_primary_since << " (msg from " << m->get_epoch() << ")" << dendl; - continue; - } - - // get active rush mapping - vector acting; - int nrep = osdmap->pg_to_acting_osds(pgid, acting); - int role = osdmap->calc_pg_role(whoami, acting, nrep); - - if (role < 0) { - dout(10) << " pg " << pgid << " dne, and i am not an active replica" << dendl; - PG::Info empty(pgid); - notify_list[from].push_back(empty); - continue; - } - assert(role > 0); - - ObjectStore::Transaction t; - pg = create_pg(pgid, t); - pg->acting.swap( acting ); - pg->set_role(role); - pg->info.history = history; - - t.collection_setattr(pgid, "info", (char*)&pg->info, sizeof(pg->info)); - store->apply_transaction(t); - - dout(10) << *pg << " dne (before), but i am role " << role << dendl; - _lock_pg(pgid); - } else { - pg = _lock_pg(pgid); - - // same primary? 
- if (m->get_epoch() < pg->info.history.same_since) { - dout(10) << *pg << " handle_pg_query primary changed in " - << pg->info.history.same_since - << " (msg from " << m->get_epoch() << ")" << dendl; - _unlock_pg(pgid); - continue; - } - } - - // ok, process query! - assert(!pg->acting.empty()); - assert(from == pg->acting[0]); - - if (it->second.type == PG::Query::INFO) { - // info - dout(10) << *pg << " sending info" << dendl; - notify_list[from].push_back(pg->info); - } else { - MOSDPGLog *m = new MOSDPGLog(osdmap->get_epoch(), pg->get_pgid()); - m->info = pg->info; - m->missing = pg->missing; - - if (it->second.type == PG::Query::LOG) { - dout(10) << *pg << " sending info+missing+log since split " << it->second.split - << " from floor " << it->second.floor - << dendl; - if (!m->log.copy_after_unless_divergent(pg->log, it->second.split, it->second.floor)) { - dout(10) << *pg << " divergent, sending backlog" << dendl; - it->second.type = PG::Query::BACKLOG; - } - } - - if (it->second.type == PG::Query::BACKLOG) { - dout(10) << *pg << " sending info+missing+backlog" << dendl; - if (pg->log.backlog) { - m->log = pg->log; - } else { - pg->generate_backlog(); - m->log = pg->log; - pg->drop_backlog(); - } - } - else if (it->second.type == PG::Query::FULLLOG) { - dout(10) << *pg << " sending info+missing+full log" << dendl; - m->log.copy_non_backlog(pg->log); - } - - dout(10) << *pg << " sending " << m->log << " " << m->missing << dendl; - //m->log.print(cout); - - _share_map_outgoing(osdmap->get_inst(from)); - messenger->send_message(m, osdmap->get_inst(from)); - } - - _unlock_pg(pgid); - } - - do_notifies(notify_list); - - delete m; -} - - -void OSD::handle_pg_remove(MOSDPGRemove *m) -{ - dout(7) << "handle_pg_remove from " << m->get_source() << dendl; - - if (!require_same_or_newer_map(m, m->get_epoch())) return; - - for (set::iterator it = m->pg_list.begin(); - it != m->pg_list.end(); - it++) { - pg_t pgid = *it; - PG *pg; - - if (pg_map.count(pgid) == 0) { - 
dout(10) << " don't have pg " << pgid << dendl; - continue; - } - - pg = _lock_pg(pgid); - - dout(10) << *pg << " removing." << dendl; - assert(pg->get_role() == -1); - - _remove_pg(pgid); - - // unlock. there shouldn't be any waiters, since we're a stray, and pg is presumably clean0. - assert(pg_lock_waiters.count(pgid) == 0); - _unlock_pg(pgid); - } - - delete m; -} - - - - - - -/*** RECOVERY ***/ - -/** pull - request object from a peer - */ -void OSD::pull(PG *pg, object_t oid) -{ - assert(pg->missing.loc.count(oid)); - eversion_t v = pg->missing.missing[oid]; - int osd = pg->missing.loc[oid]; - - dout(7) << *pg << " pull " << oid - << " v " << v - << " from osd" << osd - << dendl; - - // send op - tid_t tid = ++last_tid; - MOSDOp *op = new MOSDOp(messenger->get_myinst(), 0, tid, - oid, pg->get_pgid(), - osdmap->get_epoch(), - OSD_OP_PULL); - op->set_version(v); - messenger->send_message(op, osdmap->get_inst(osd)); - - // take note - assert(pg->objects_pulling.count(oid) == 0); - num_pulling++; - pg->objects_pulling[oid] = v; -} - - -/** push - send object to a peer - */ -void OSD::push(PG *pg, object_t oid, int dest) -{ - // read data+attrs - bufferlist bl; - eversion_t v; - int vlen = sizeof(v); - map attrset; - - ObjectStore::Transaction t; - t.read(oid, 0, 0, &bl); - t.getattr(oid, "version", &v, &vlen); - t.getattrs(oid, attrset); - unsigned tr = store->apply_transaction(t); - - assert(tr == 0); // !!! - - // ok - dout(7) << *pg << " push " << oid << " v " << v - << " size " << bl.length() - << " to osd" << dest - << dendl; - - logger->inc("r_push"); - logger->inc("r_pushb", bl.length()); - - // send - MOSDOp *op = new MOSDOp(messenger->get_myinst(), 0, ++last_tid, - oid, pg->info.pgid, osdmap->get_epoch(), - OSD_OP_PUSH); - op->set_offset(0); - op->set_length(bl.length()); - op->set_data(bl); // note: claims bl, set length above here! 
- op->set_version(v); - op->set_attrset(attrset); - - messenger->send_message(op, osdmap->get_inst(dest)); -} - - -/** op_pull - * process request to pull an entire object. - * NOTE: called from opqueue. - */ -void OSD::op_pull(MOSDOp *op, PG *pg) -{ - const object_t oid = op->get_oid(); - const eversion_t v = op->get_version(); - int from = op->get_source().num(); - - dout(7) << *pg << " op_pull " << oid << " v " << op->get_version() - << " from " << op->get_source() - << dendl; - - // is a replica asking? are they missing it? - if (pg->is_primary()) { - // primary - assert(pg->peer_missing.count(from)); // we had better know this, from the peering process. - - if (!pg->peer_missing[from].is_missing(oid)) { - dout(7) << *pg << " op_pull replica isn't actually missing it, we must have already pushed to them" << dendl; - delete op; - return; - } - - // do we have it yet? - if (waitfor_missing_object(op, pg)) - return; - } else { - // non-primary - if (pg->missing.is_missing(oid)) { - dout(7) << *pg << " op_pull not primary, and missing " << oid << ", ignoring" << dendl; - delete op; - return; - } - } - - // push it back! - push(pg, oid, op->get_source().num()); -} - - -/** op_push - * NOTE: called from opqueue. - */ -void OSD::op_push(MOSDOp *op, PG *pg) -{ - object_t oid = op->get_oid(); - eversion_t v = op->get_version(); - - if (!pg->missing.is_missing(oid)) { - dout(7) << *pg << " op_push not missing " << oid << dendl; - return; - } - - dout(7) << *pg << " op_push " - << oid - << " v " << v - << " size " << op->get_length() << " " << op->get_data().length() - << dendl; - - assert(op->get_data().length() == op->get_length()); - - // write object and add it to the PG - ObjectStore::Transaction t; - t.remove(oid); // in case old version exists - t.write(oid, 0, op->get_length(), op->get_data()); - t.setattrs(oid, op->get_attrset()); - t.collection_add(pg->info.pgid, oid); - - // close out pull op? 
- num_pulling--; - if (pg->objects_pulling.count(oid)) - pg->objects_pulling.erase(oid); - pg->missing.got(oid, v); - - - // raise last_complete? - assert(pg->log.complete_to != pg->log.log.end()); - while (pg->log.complete_to != pg->log.log.end()) { - if (pg->missing.missing.count(pg->log.complete_to->oid)) break; - if (pg->info.last_complete < pg->log.complete_to->version) - pg->info.last_complete = pg->log.complete_to->version; - pg->log.complete_to++; - } - dout(10) << *pg << " last_complete now " << pg->info.last_complete << dendl; - - - // apply to disk! - t.collection_setattr(pg->info.pgid, "info", &pg->info, sizeof(pg->info)); - unsigned r = store->apply_transaction(t); - assert(r == 0); - - - - // am i primary? are others missing this too? - if (pg->is_primary()) { - for (unsigned i=1; iacting.size(); i++) { - int peer = pg->acting[i]; - assert(pg->peer_missing.count(peer)); - if (pg->peer_missing[peer].is_missing(oid)) { - // ok, push it, and they (will) have it now. - pg->peer_missing[peer].got(oid, v); - push(pg, oid, peer); - } - } - } - - // continue recovery - pg->do_recovery(); - - // kick waiters - if (pg->waiting_for_missing_object.count(oid)) - take_waiters(pg->waiting_for_missing_object[oid]); - - delete op; -} - - - - -// op_rep_modify - -// commit (to disk) callback -class C_OSD_RepModifyCommit : public Context { -public: - OSD *osd; - MOSDOp *op; - int destosd; - - eversion_t pg_last_complete; - - Mutex lock; - Cond cond; - bool acked; - bool waiting; - - C_OSD_RepModifyCommit(OSD *o, MOSDOp *oo, int dosd, eversion_t lc) : - osd(o), op(oo), destosd(dosd), pg_last_complete(lc), - acked(false), waiting(false) { } - void finish(int r) { - lock.Lock(); - assert(!waiting); - while (!acked) { - waiting = true; - cond.Wait(lock); - } - assert(acked); - lock.Unlock(); - osd->op_rep_modify_commit(op, destosd, pg_last_complete); - } - void ack() { - lock.Lock(); - assert(!acked); - acked = true; - if (waiting) cond.Signal(); - - // discard my reference 
to buffer - op->get_data().clear(); - - lock.Unlock(); - } -}; - -void OSD::op_rep_modify_commit(MOSDOp *op, int ackerosd, eversion_t last_complete) -{ - // send commit. - dout(10) << "rep_modify_commit on op " << *op - << ", sending commit to osd" << ackerosd - << dendl; - MOSDOpReply *commit = new MOSDOpReply(op, 0, osdmap->get_epoch(), true); - commit->set_pg_complete_thru(last_complete); - messenger->send_message(commit, osdmap->get_inst(ackerosd)); - delete op; -} - -// process a modification operation - -class C_OSD_WriteCommit : public Context { -public: - OSD *osd; - pg_t pgid; - tid_t rep_tid; - eversion_t pg_last_complete; - C_OSD_WriteCommit(OSD *o, pg_t p, tid_t rt, eversion_t lc) : osd(o), pgid(p), rep_tid(rt), pg_last_complete(lc) {} - void finish(int r) { - osd->op_modify_commit(pgid, rep_tid, pg_last_complete); - } -}; - - -/** op_rep_modify - * process a replicated modify. - * NOTE: called from opqueue. - */ -void OSD::op_rep_modify(MOSDOp *op, PG *pg) -{ - object_t oid = op->get_oid(); - eversion_t nv = op->get_version(); - - const char *opname = MOSDOp::get_opname(op->get_op()); - - // check crev - objectrev_t crev = 0; - store->getattr(oid, "crev", (char*)&crev, sizeof(crev)); - - dout(10) << "op_rep_modify " << opname - << " " << oid - << " v " << nv - << " " << op->get_offset() << "~" << op->get_length() - << " in " << *pg - << dendl; - - // we better not be missing this. - assert(!pg->missing.is_missing(oid)); - - // prepare our transaction - ObjectStore::Transaction t; - - // am i acker? - PG::RepOpGather *repop = 0; - int ackerosd = pg->acting[0]; - - if ((g_conf.osd_rep == OSD_REP_CHAIN || g_conf.osd_rep == OSD_REP_SPLAY)) { - ackerosd = pg->get_acker(); - - if (pg->is_acker()) { - // i am tail acker. 
- if (pg->repop_gather.count(op->get_rep_tid())) { - repop = pg->repop_gather[ op->get_rep_tid() ]; - } else { - repop = new_repop_gather(pg, op); - } - - // infer ack from source - int fromosd = op->get_source().num(); - get_repop_gather(repop); - { - //assert(repop->waitfor_ack.count(fromosd)); // no, we may come thru here twice. - repop->waitfor_ack.erase(fromosd); - } - put_repop_gather(pg, repop); - - // prepare dest socket - //messenger->prepare_send_message(op->get_client()); - } - - // chain? forward? - if (g_conf.osd_rep == OSD_REP_CHAIN && !pg->is_acker()) { - // chain rep, not at the tail yet. - int myrank = osdmap->calc_pg_rank(whoami, pg->acting); - int next = myrank+1; - if (next == (int)pg->acting.size()) - next = 1; - issue_repop(pg, op, pg->acting[next]); - } - } - - // do op? - C_OSD_RepModifyCommit *oncommit = 0; - - logger->inc("r_wr"); - logger->inc("r_wrb", op->get_length()); - - if (repop) { - // acker. we'll apply later. - if (op->get_op() != OSD_OP_WRNOOP) { - prepare_log_transaction(repop->t, op, nv, crev, op->get_rev(), pg, op->get_pg_trim_to()); - prepare_op_transaction(repop->t, op, nv, crev, op->get_rev(), pg); - } - } else { - // middle|replica. - if (op->get_op() != OSD_OP_WRNOOP) { - prepare_log_transaction(t, op, nv, crev, op->get_rev(), pg, op->get_pg_trim_to()); - prepare_op_transaction(t, op, nv, crev, op->get_rev(), pg); - } - - oncommit = new C_OSD_RepModifyCommit(this, op, ackerosd, pg->info.last_complete); - - // apply log update. and possibly update itself. - unsigned tr = store->apply_transaction(t, oncommit); - if (tr != 0 && // no errors - tr != 2) { // or error on collection_add - cerr << "error applying transaction: r = " << tr << dendl; - assert(tr == 0); - } - } - - // ack? - if (repop) { - // (logical) local ack. this may induce the actual update. 
- get_repop_gather(repop); - { - assert(repop->waitfor_ack.count(whoami)); - repop->waitfor_ack.erase(whoami); - } - put_repop_gather(pg, repop); - } - else { - // send ack to acker? - if (g_conf.osd_rep != OSD_REP_CHAIN) { - MOSDOpReply *ack = new MOSDOpReply(op, 0, osdmap->get_epoch(), false); - messenger->send_message(ack, osdmap->get_inst(ackerosd)); - } - - // ack myself. - assert(oncommit); - oncommit->ack(); - } -} - - -// ========================================================= -// OPS - -void OSD::handle_op(MOSDOp *op) -{ - const pg_t pgid = op->get_pg(); - PG *pg = get_pg(pgid); - - - logger->set("buf", buffer_total_alloc); - - // update qlen stats - hb_stat_ops++; - hb_stat_qlen += pending_ops; - - - // require same or newer map - if (!require_same_or_newer_map(op, op->get_map_epoch())) return; - - // share our map with sender, if they're old - _share_map_incoming(op->get_source_inst(), op->get_map_epoch()); - - // what kind of op? - bool read = op->get_op() < 10; // read, stat. but not pull. - - if (!op->get_source().is_osd()) { - // REGULAR OP (non-replication) - - // note original source - op->set_client_inst( op->get_source_inst() ); - op->clear_payload(); // and hose encoded payload (in case we forward) - - // have pg? - if (!pg) { - dout(7) << "hit non-existent pg " - << pgid - << ", waiting" << dendl; - waiting_for_pg[pgid].push_back(op); - return; - } - - if (read) { - // read. am i the (same) acker? - if (//pg->get_acker() != whoami || - op->get_map_epoch() < pg->info.history.same_acker_since) { - dout(7) << "acting acker is osd" << pg->get_acker() - << " since " << pg->info.history.same_acker_since - << ", dropping" << dendl; - assert(op->get_map_epoch() < osdmap->get_epoch()); - delete op; - return; - } - } else { - // write. am i the (same) primary? 
- if (pg->get_primary() != whoami || - op->get_map_epoch() < pg->info.history.same_primary_since) { - dout(7) << "acting primary is osd" << pg->get_primary() - << " since " << pg->info.history.same_primary_since - << ", dropping" << dendl; - assert(op->get_map_epoch() < osdmap->get_epoch()); - delete op; - return; - } - } - - // must be active. - if (!pg->is_active()) { - // replay? - if (op->get_version().version > 0) { - if (op->get_version() > pg->info.last_update) { - dout(7) << *pg << " queueing replay at " << op->get_version() - << " for " << *op << dendl; - pg->replay_queue[op->get_version()] = op; - return; - } else { - dout(7) << *pg << " replay at " << op->get_version() << " <= " << pg->info.last_update - << " for " << *op - << ", will queue for WRNOOP" << dendl; - } - } - - dout(7) << *pg << " not active (yet)" << dendl; - pg->waiting_for_active.push_back(op); - return; - } - - // missing object? - if (read && op->get_oid().rev > 0) { - // versioned read. hrm. - // are we missing a revision that we might need? - object_t moid = op->get_oid(); - if (pick_missing_object_rev(moid, pg)) { - // is there a local revision we might use instead? - object_t loid = op->get_oid(); - if (store->pick_object_revision_lt(loid) && - moid <= loid) { - // we need moid. pull it. - dout(10) << "handle_op read on " << op->get_oid() - << ", have " << loid - << ", but need missing " << moid - << ", pulling" << dendl; - pull(pg, moid); - pg->waiting_for_missing_object[moid].push_back(op); - return; - } - - dout(10) << "handle_op read on " << op->get_oid() - << ", have " << loid - << ", don't need missing " << moid - << dendl; - } - } else { - // live revision. easy. - if (op->get_op() != OSD_OP_PUSH && - waitfor_missing_object(op, pg)) return; - } - - dout(7) << "handle_op " << *op << " in " << *pg << dendl; - - - // balance reads? 
- if (read && - g_conf.osd_balance_reads && - pg->get_acker() == whoami) { - // test - if (false) { - if (pg->acting.size() > 1) { - int peer = pg->acting[1]; - dout(-10) << "fwd client read op to osd" << peer << " for " << op->get_client() << " " << op->get_client_inst() << dendl; - messenger->send_message(op, osdmap->get_inst(peer)); - return; - } - } - - // am i above my average? - float my_avg = hb_stat_qlen / hb_stat_ops; - if (pending_ops > my_avg) { - // is there a peer who is below my average? - for (unsigned i=1; iacting.size(); ++i) { - int peer = pg->acting[i]; - if (peer_qlen.count(peer) && - peer_qlen[peer] < my_avg) { - // calculate a probability that we should redirect - float p = (my_avg - peer_qlen[peer]) / my_avg; // this is dumb. - - if (drand48() <= p) { - // take the first one - dout(-10) << "my qlen " << pending_ops << " > my_avg " << my_avg - << ", p=" << p - << ", fwd to peer w/ qlen " << peer_qlen[peer] - << " osd" << peer - << dendl; - messenger->send_message(op, osdmap->get_inst(peer)); - return; - } - } - } - } - } - - } else { - // REPLICATION OP (it's from another OSD) - - // have pg? - if (!pg) { - derr(-7) << "handle_rep_op " << *op - << " pgid " << pgid << " dne" << dendl; - delete op; - //assert(0); // wtf, shouldn't happen. - return; - } - - // check osd map: same set, or primary+acker? 
- if (g_conf.osd_rep == OSD_REP_CHAIN && - op->get_map_epoch() < pg->info.history.same_since) { - dout(10) << "handle_rep_op pg changed " << pg->info.history - << " after " << op->get_map_epoch() - << ", dropping" << dendl; - delete op; - return; - } - if (g_conf.osd_rep != OSD_REP_CHAIN && - (op->get_map_epoch() < pg->info.history.same_primary_since || - op->get_map_epoch() < pg->info.history.same_acker_since)) { - dout(10) << "handle_rep_op pg primary|acker changed " << pg->info.history - << " after " << op->get_map_epoch() - << ", dropping" << dendl; - delete op; - return; - } - - assert(pg->get_role() >= 0); - dout(7) << "handle_rep_op " << op << " in " << *pg << dendl; - } - - if (g_conf.osd_maxthreads < 1) { - _lock_pg(pgid); - do_op(op, pg); // do it now - _unlock_pg(pgid); - } else { - // queue for worker threads - if (read) - enqueue_op(0, op); // no locking needed for reads - else - enqueue_op(pgid, op); - } -} - -void OSD::handle_op_reply(MOSDOpReply *op) -{ - if (op->get_map_epoch() < boot_epoch) { - dout(3) << "replica op reply from before boot" << dendl; - delete op; - return; - } - - // must be a rep op. - assert(op->get_source().is_osd()); - - // make sure we have the pg - const pg_t pgid = op->get_pg(); - PG *pg = get_pg(pgid); - - // require same or newer map - if (!require_same_or_newer_map(op, op->get_map_epoch())) return; - - // share our map with sender, if they're old - _share_map_incoming(op->get_source_inst(), op->get_map_epoch()); - - if (!pg) { - // hmm. 
- delete op; - } - - if (g_conf.osd_maxthreads < 1) { - _lock_pg(pgid); - do_op(op, pg); // do it now - _unlock_pg(pgid); - } else { - enqueue_op(pgid, op); // queue for worker threads - } -} - - -/* - * enqueue called with osd_lock held - */ -void OSD::enqueue_op(pg_t pgid, Message *op) -{ - while (pending_ops > g_conf.osd_max_opq) { - dout(10) << "enqueue_op waiting for pending_ops " << pending_ops << " to drop to " << g_conf.osd_max_opq << dendl; - op_queue_cond.Wait(osd_lock); - } - - op_queue[pgid].push_back(op); - pending_ops++; - logger->set("opq", pending_ops); - - threadpool->put_op(pgid); -} - -/* - * NOTE: dequeue called in worker thread, without osd_lock - */ -void OSD::dequeue_op(pg_t pgid) -{ - Message *op = 0; - PG *pg = 0; - - osd_lock.Lock(); - { - if (pgid) { - // lock pg - pg = _lock_pg(pgid); - } - - // get pending op - list &ls = op_queue[pgid]; - assert(!ls.empty()); - op = ls.front(); - ls.pop_front(); - - if (pgid) { - dout(10) << "dequeue_op " << op << " write pg " << pgid - << ls.size() << " / " << (pending_ops-1) << " more pending" << dendl; - } else { - dout(10) << "dequeue_op " << op << " read " - << ls.size() << " / " << (pending_ops-1) << " more pending" << dendl; - } - - if (ls.empty()) - op_queue.erase(pgid); - } - osd_lock.Unlock(); - - // do it - do_op(op, pg); - - // finish - osd_lock.Lock(); - { - if (pgid) { - // unlock pg - _unlock_pg(pgid); - } - - dout(10) << "dequeue_op " << op << " finish" << dendl; - assert(pending_ops > 0); - - if (pending_ops > g_conf.osd_max_opq) - op_queue_cond.Signal(); - - pending_ops--; - logger->set("opq", pending_ops); - if (pending_ops == 0 && waiting_for_no_ops) - no_pending_ops.Signal(); - } - osd_lock.Unlock(); -} - - - -/** do_op - do an op - * object lock will be held (if multithreaded) - * osd_lock NOT held. 
- */ -void OSD::do_op(Message *m, PG *pg) -{ - //dout(15) << "do_op " << *m << dendl; - - if (m->get_type() == MSG_OSD_OP) { - MOSDOp *op = (MOSDOp*)m; - - logger->inc("op"); - - switch (op->get_op()) { - - // reads - case OSD_OP_READ: - op_read(op);//, pg); - break; - case OSD_OP_STAT: - op_stat(op);//, pg); - break; - - // rep stuff - case OSD_OP_PULL: - op_pull(op, pg); - break; - case OSD_OP_PUSH: - op_push(op, pg); - break; - - // writes - case OSD_OP_WRNOOP: - case OSD_OP_WRITE: - case OSD_OP_ZERO: - case OSD_OP_DELETE: - case OSD_OP_TRUNCATE: - case OSD_OP_WRLOCK: - case OSD_OP_WRUNLOCK: - case OSD_OP_RDLOCK: - case OSD_OP_RDUNLOCK: - case OSD_OP_UPLOCK: - case OSD_OP_DNLOCK: - if (op->get_source().is_osd()) - op_rep_modify(op, pg); - else - op_modify(op, pg); - break; - - default: - assert(0); - } - } - else if (m->get_type() == MSG_OSD_OPREPLY) { - // must be replication. - MOSDOpReply *r = (MOSDOpReply*)m; - tid_t rep_tid = r->get_rep_tid(); - - if (pg->repop_gather.count(rep_tid)) { - // oh, good. - int fromosd = r->get_source().num(); - repop_ack(pg, pg->repop_gather[rep_tid], - r->get_result(), r->get_commit(), - fromosd, - r->get_pg_complete_thru()); - delete m; - } else { - // early ack. 
- pg->waiting_for_repop[rep_tid].push_back(r); - } - - } else - assert(0); -} - - - -void OSD::wait_for_no_ops() -{ - if (pending_ops > 0) { - dout(7) << "wait_for_no_ops - waiting for " << pending_ops << dendl; - waiting_for_no_ops = true; - while (pending_ops > 0) - no_pending_ops.Wait(osd_lock); - waiting_for_no_ops = false; - assert(pending_ops == 0); - } - dout(7) << "wait_for_no_ops - none" << dendl; -} - - -// ============================== -// Object locking - -// -// If the target object of the operation op is locked for writing by another client, the function puts op to the waiting queue waiting_for_wr_unlock -// returns true if object was locked, otherwise returns false -// -bool OSD::block_if_wrlocked(MOSDOp* op) -{ - object_t oid = op->get_oid(); - - entity_name_t source; - int len = store->getattr(oid, "wrlock", &source, sizeof(entity_name_t)); - //cout << "getattr returns " << len << " on " << oid << dendl; - - if (len == sizeof(source) && - source != op->get_client()) { - //the object is locked for writing by someone else -- add the op to the waiting queue - waiting_for_wr_unlock[oid].push_back(op); - return true; - } - - return false; //the object wasn't locked, so the operation can be handled right away -} - - - -// =============================== -// OPS - -/* -int OSD::list_missing_revs(object_t oid, set& revs, PG *pg) -{ - int c = 0; - oid.rev = 0; - - map::iterator p = pg->missing.missing.lower_bound(oid); - if (p == pg->missing.missing.end()) - return 0; // clearly not - - while (p->first.ino == oid.ino && - p->first.bno == oid.bno) { - revs.insert(p->first); - c++; - } - return c; -}*/ - -bool OSD::pick_missing_object_rev(object_t& oid, PG *pg) -{ - map::iterator p = pg->missing.missing.upper_bound(oid); - if (p == pg->missing.missing.end()) - return false; // clearly no candidate - - if (p->first.ino == oid.ino && p->first.bno == oid.bno) { - oid = p->first; // yes! it's an upper bound revision for me. 
- return true; - } - return false; -} - -bool OSD::pick_object_rev(object_t& oid) -{ - object_t t = oid; - - if (!store->pick_object_revision_lt(t)) - return false; // we have no revisions of this object! - - objectrev_t crev; - int r = store->getattr(t, "crev", &crev, sizeof(crev)); - assert(r >= 0); - if (crev <= oid.rev) { - dout(10) << "pick_object_rev choosing " << t << " crev " << crev << " for " << oid << dendl; - oid = t; - return true; - } - - return false; -} - -bool OSD::waitfor_missing_object(MOSDOp *op, PG *pg) -{ - const object_t oid = op->get_oid(); - - // are we missing the object? - if (pg->missing.missing.count(oid)) { - // we don't have it (yet). - eversion_t v = pg->missing.missing[oid]; - if (pg->objects_pulling.count(oid)) { - dout(7) << "missing " - << oid - << " v " << v - << " in " << *pg - << ", already pulling" - << dendl; - } else { - dout(7) << "missing " - << oid - << " v " << v - << " in " << *pg - << ", pulling" - << dendl; - pull(pg, oid); - } - pg->waiting_for_missing_object[oid].push_back(op); - return true; - } - - return false; -} - - - - -// READ OPS - -/** op_read - * client read op - * NOTE: called from opqueue. - */ -void OSD::op_read(MOSDOp *op)//, PG *pg) -{ - object_t oid = op->get_oid(); - - // if the target object is locked for writing by another client, put 'op' to the waiting queue - // for _any_ op type -- eg only the locker can unlock! - if (block_if_wrlocked(op)) return; // op will be handled later, after the object unlocks - - dout(10) << "op_read " << oid - << " " << op->get_offset() << "~" << op->get_length() - //<< " in " << *pg - << dendl; - - long r = 0; - bufferlist bl; - - if (oid.rev && !pick_object_rev(oid)) { - // we have no revision for this request. 
- r = -EEXIST; - } else { - // read into a buffer - r = store->read(oid, - op->get_offset(), op->get_length(), - bl); - } - - // set up reply - MOSDOpReply *reply = new MOSDOpReply(op, 0, osdmap->get_epoch(), true); - if (r >= 0) { - reply->set_result(0); - reply->set_data(bl); - reply->set_length(r); - - logger->inc("c_rd"); - logger->inc("c_rdb", r); - - } else { - reply->set_result(r); // error - reply->set_length(0); - } - - dout(10) << " read got " << r << " / " << op->get_length() << " bytes from obj " << oid << dendl; - - logger->inc("rd"); - if (r >= 0) logger->inc("rdb", r); - - // send it - messenger->send_message(reply, op->get_client_inst()); - - delete op; -} - - -/** op_stat - * client stat - * NOTE: called from opqueue - */ -void OSD::op_stat(MOSDOp *op)//, PG *pg) -{ - object_t oid = op->get_oid(); - - // if the target object is locked for writing by another client, put 'op' to the waiting queue - if (block_if_wrlocked(op)) return; //read will be handled later, after the object unlocks - - struct stat st; - memset(&st, sizeof(st), 0); - int r = 0; - - if (oid.rev && !pick_object_rev(oid)) { - // we have no revision for this request. 
- r = -EEXIST; - } else { - r = store->stat(oid, &st); - } - - dout(3) << "op_stat on " << oid - << " r = " << r - << " size = " << st.st_size - //<< " in " << *pg - << dendl; - - MOSDOpReply *reply = new MOSDOpReply(op, r, osdmap->get_epoch(), true); - reply->set_object_size(st.st_size); - messenger->send_message(reply, op->get_client_inst()); - - logger->inc("stat"); - - delete op; -} - - - -/********* - * new repops - */ - -void OSD::get_repop_gather(PG::RepOpGather *repop) -{ - //repop->lock.Lock(); - dout(10) << "get_repop " << *repop << dendl; -} - -void OSD::apply_repop(PG *pg, PG::RepOpGather *repop) -{ - dout(10) << "apply_repop applying update on " << *repop << dendl; - assert(!repop->applied); - - Context *oncommit = new C_OSD_WriteCommit(this, pg->info.pgid, repop->rep_tid, repop->pg_local_last_complete); - unsigned r = store->apply_transaction(repop->t, oncommit); - if (r) - dout(-10) << "apply_repop apply transaction return " << r << " on " << *repop << dendl; - - // discard my reference to buffer - repop->op->get_data().clear(); - - repop->applied = true; -} - -void OSD::put_repop_gather(PG *pg, PG::RepOpGather *repop) -{ - dout(10) << "put_repop " << *repop << dendl; - - // commit? - if (repop->can_send_commit() && - repop->op->wants_commit()) { - // send commit. - MOSDOpReply *reply = new MOSDOpReply(repop->op, 0, osdmap->get_epoch(), true); - dout(10) << "put_repop sending commit on " << *repop << " " << reply << dendl; - messenger->send_message(reply, repop->op->get_client_inst()); - repop->sent_commit = true; - } - - // ack? 
- else if (repop->can_send_ack() && - repop->op->wants_ack()) { - // apply - apply_repop(pg, repop); - - // send ack - MOSDOpReply *reply = new MOSDOpReply(repop->op, 0, osdmap->get_epoch(), false); - dout(10) << "put_repop sending ack on " << *repop << " " << reply << dendl; - messenger->send_message(reply, repop->op->get_client_inst()); - repop->sent_ack = true; - - utime_t now = g_clock.now(); - now -= repop->start; - logger->finc("rlsum", now); - logger->inc("rlnum", 1); - } - - // done. - if (repop->can_delete()) { - // adjust peers_complete_thru - if (!repop->pg_complete_thru.empty()) { - eversion_t min = pg->info.last_complete; // hrm.... - for (unsigned i=0; iacting.size(); i++) { - if (repop->pg_complete_thru[pg->acting[i]] < min) // note: if we haven't heard, it'll be zero, which is what we want. - min = repop->pg_complete_thru[pg->acting[i]]; - } - - if (min > pg->peers_complete_thru) { - dout(10) << "put_repop peers_complete_thru " << pg->peers_complete_thru << " -> " << min << " in " << *pg << dendl; - pg->peers_complete_thru = min; - } - } - - dout(10) << "put_repop deleting " << *repop << dendl; - //repop->lock.Unlock(); - - assert(pg->repop_gather.count(repop->rep_tid)); - pg->repop_gather.erase(repop->rep_tid); - - delete repop->op; - delete repop; - - } else { - //repop->lock.Unlock(); - } -} - - -void OSD::issue_repop(PG *pg, MOSDOp *op, int osd) -{ - object_t oid = op->get_oid(); - - dout(7) << " issue_repop rep_tid " << op->get_rep_tid() - << " in " << *pg - << " o " << oid - << " to osd" << osd - << dendl; - - // forward the write/update/whatever - MOSDOp *wr = new MOSDOp(op->get_client_inst(), op->get_client_inc(), op->get_reqid().tid, - oid, - pg->get_pgid(), - osdmap->get_epoch(), - op->get_op()); - wr->get_data() = op->get_data(); // _copy_ bufferlist - wr->set_length(op->get_length()); - wr->set_offset(op->get_offset()); - wr->set_version(op->get_version()); - - wr->set_rep_tid(op->get_rep_tid()); - 
wr->set_pg_trim_to(pg->peers_complete_thru); - - messenger->send_message(wr, osdmap->get_inst(osd)); -} - -PG::RepOpGather *OSD::new_repop_gather(PG *pg, - MOSDOp *op) -{ - dout(10) << "new_repop_gather rep_tid " << op->get_rep_tid() << " on " << *op << " in " << *pg << dendl; - - PG::RepOpGather *repop = new PG::RepOpGather(op, op->get_rep_tid(), - op->get_version(), - pg->info.last_complete); - - // osds. commits all come to me. - for (unsigned i=0; iacting.size(); i++) { - int osd = pg->acting[i]; - repop->osds.insert(osd); - repop->waitfor_commit.insert(osd); - } - - // acks vary: - if (g_conf.osd_rep == OSD_REP_CHAIN) { - // chain rep. - // there's my local ack... - repop->osds.insert(whoami); - repop->waitfor_ack.insert(whoami); - repop->waitfor_commit.insert(whoami); - - // also, the previous guy will ack to me - int myrank = osdmap->calc_pg_rank(whoami, pg->acting); - if (myrank > 0) { - int osd = pg->acting[ myrank-1 ]; - repop->osds.insert(osd); - repop->waitfor_ack.insert(osd); - repop->waitfor_commit.insert(osd); - } - } else { - // primary, splay. all osds ack to me. - for (unsigned i=0; iacting.size(); i++) { - int osd = pg->acting[i]; - repop->waitfor_ack.insert(osd); - } - } - - repop->start = g_clock.now(); - - pg->repop_gather[ repop->rep_tid ] = repop; - - // anyone waiting? 
(acks that got here before the op did) - if (pg->waiting_for_repop.count(repop->rep_tid)) { - take_waiters(pg->waiting_for_repop[repop->rep_tid]); - pg->waiting_for_repop.erase(repop->rep_tid); - } - - return repop; -} - - -void OSD::repop_ack(PG *pg, PG::RepOpGather *repop, - int result, bool commit, - int fromosd, eversion_t pg_complete_thru) -{ - MOSDOp *op = repop->op; - - dout(7) << "repop_ack rep_tid " << repop->rep_tid << " op " << *op - << " result " << result << " commit " << commit << " from osd" << fromosd - << " in " << *pg - << dendl; - - get_repop_gather(repop); - { - if (commit) { - // commit - assert(repop->waitfor_commit.count(fromosd)); - repop->waitfor_commit.erase(fromosd); - repop->waitfor_ack.erase(fromosd); - repop->pg_complete_thru[fromosd] = pg_complete_thru; - } else { - // ack - repop->waitfor_ack.erase(fromosd); - } - } - put_repop_gather(pg, repop); -} - - - - - -/** op_modify_commit - * transaction commit on the acker. - */ -void OSD::op_modify_commit(pg_t pgid, tid_t rep_tid, eversion_t pg_complete_thru) -{ - PG *pg = lock_pg(pgid); - if (pg) { - if (pg->repop_gather.count(rep_tid)) { - PG::RepOpGather *repop = pg->repop_gather[rep_tid]; - - dout(10) << "op_modify_commit " << *repop->op << dendl; - get_repop_gather(repop); - { - assert(repop->waitfor_commit.count(whoami)); - repop->waitfor_commit.erase(whoami); - repop->pg_complete_thru[whoami] = pg_complete_thru; - } - put_repop_gather(pg, repop); - dout(10) << "op_modify_commit done on " << repop << dendl; - } else { - dout(10) << "op_modify_commit pg " << pgid << " rep_tid " << rep_tid << " dne" << dendl; - } - - unlock_pg(pgid); - } else { - dout(10) << "op_modify_commit pg " << pgid << " dne" << dendl; - } -} - - -/** op_modify - * process client modify op - * NOTE: called from opqueue. - */ -void OSD::op_modify(MOSDOp *op, PG *pg) -{ - object_t oid = op->get_oid(); - - const char *opname = MOSDOp::get_opname(op->get_op()); - - // are any peers missing this? 
- for (unsigned i=1; iacting.size(); i++) { - int peer = pg->acting[i]; - if (pg->peer_missing.count(peer) && - pg->peer_missing[peer].is_missing(oid)) { - // push it before this update. - // FIXME, this is probably extra much work (eg if we're about to overwrite) - pg->peer_missing[peer].got(oid); - push(pg, oid, peer); - } - } - - // dup op? - if (pg->log.logged_req(op->get_reqid())) { - dout(-3) << "op_modify " << opname << " dup op " << op->get_reqid() - << ", doing WRNOOP" << dendl; - op->set_op(OSD_OP_WRNOOP); - opname = MOSDOp::get_opname(op->get_op()); - } - - // locked by someone else? - // for _any_ op type -- eg only the locker can unlock! - if (op->get_op() != OSD_OP_WRNOOP && // except WRNOOP; we just want to flush - block_if_wrlocked(op)) - return; // op will be handled later, after the object unlocks - - - // check crev - objectrev_t crev = 0; - store->getattr(oid, "crev", (char*)&crev, sizeof(crev)); - - // assign version - eversion_t clone_version; - eversion_t nv = pg->log.top; - if (op->get_op() != OSD_OP_WRNOOP) { - nv.epoch = osdmap->get_epoch(); - nv.version++; - assert(nv > pg->info.last_update); - assert(nv > pg->log.top); - - // will clone? - if (crev && op->get_rev() && op->get_rev() > crev) { - clone_version = nv; - nv.version++; - } - - if (op->get_version().version) { - // replay! - if (nv.version < op->get_version().version) { - nv.version = op->get_version().version; - - // clone? - if (crev && op->get_rev() && op->get_rev() > crev) { - // backstep clone - clone_version = nv; - clone_version.version--; - } - } - } - } - - // set version in op, for benefit of client and our eventual reply - op->set_version(nv); - - dout(10) << "op_modify " << opname - << " " << oid - << " v " << nv - << " crev " << crev - << " rev " << op->get_rev() - << " " << op->get_offset() << "~" << op->get_length() - << dendl; - - if (op->get_op() == OSD_OP_WRITE) { - logger->inc("c_wr"); - logger->inc("c_wrb", op->get_length()); - } - - // share latest osd map? 
- osd_lock.Lock(); - { - for (unsigned i=1; iacting.size(); i++) { - int osd = pg->acting[i]; - _share_map_outgoing( osdmap->get_inst(osd) ); - } - } - osd_lock.Unlock(); - - // issue replica writes - PG::RepOpGather *repop = 0; - bool alone = (pg->acting.size() == 1); - tid_t rep_tid = ++last_tid; - op->set_rep_tid(rep_tid); - - if (g_conf.osd_rep == OSD_REP_CHAIN && !alone) { - // chain rep. send to #2 only. - int next = pg->acting[1]; - if (pg->acting.size() > 2) - next = pg->acting[2]; - issue_repop(pg, op, next); - } - else if (g_conf.osd_rep == OSD_REP_SPLAY && !alone) { - // splay rep. send to rest. - for (unsigned i=1; iacting.size(); ++i) - //for (unsigned i=pg->acting.size()-1; i>=1; --i) - issue_repop(pg, op, pg->acting[i]); - } else { - // primary rep, or alone. - repop = new_repop_gather(pg, op); - - // send to rest. - if (!alone) - for (unsigned i=1; iacting.size(); i++) - issue_repop(pg, op, pg->acting[i]); - } - - if (repop) { - // we are acker. - if (op->get_op() != OSD_OP_WRNOOP) { - // log and update later. - prepare_log_transaction(repop->t, op, nv, crev, op->get_rev(), pg, pg->peers_complete_thru); - prepare_op_transaction(repop->t, op, nv, crev, op->get_rev(), pg); - } - - // (logical) local ack. - // (if alone, this will apply the update.) - get_repop_gather(repop); - { - assert(repop->waitfor_ack.count(whoami)); - repop->waitfor_ack.erase(whoami); - } - put_repop_gather(pg, repop); - - } else { - // chain or splay. apply. 
- ObjectStore::Transaction t; - prepare_log_transaction(t, op, nv, crev, op->get_rev(), pg, pg->peers_complete_thru); - prepare_op_transaction(t, op, nv, crev, op->get_rev(), pg); - - C_OSD_RepModifyCommit *oncommit = new C_OSD_RepModifyCommit(this, op, pg->get_acker(), - pg->info.last_complete); - unsigned r = store->apply_transaction(t, oncommit); - if (r != 0 && // no errors - r != 2) { // or error on collection_add - cerr << "error applying transaction: r = " << r << dendl; - assert(r == 0); - } - - oncommit->ack(); - } -} - - - -void OSD::prepare_log_transaction(ObjectStore::Transaction& t, - MOSDOp *op, eversion_t& version, - objectrev_t crev, objectrev_t rev, - PG *pg, - eversion_t trim_to) -{ - const object_t oid = op->get_oid(); - - // clone entry? - if (crev && rev && rev > crev) { - eversion_t cv = version; - cv.version--; - PG::Log::Entry cloneentry(PG::Log::Entry::CLONE, oid, cv, op->get_reqid()); - pg->log.add(cloneentry); - - dout(10) << "prepare_log_transaction " << op->get_op() - << " " << cloneentry - << " in " << *pg << dendl; - } - - // actual op - int opcode = PG::Log::Entry::MODIFY; - if (op->get_op() == OSD_OP_DELETE) opcode = PG::Log::Entry::DELETE; - PG::Log::Entry logentry(opcode, oid, version, op->get_reqid()); - - dout(10) << "prepare_log_transaction " << op->get_op() - << " " << logentry - << " in " << *pg << dendl; - - // append to log - assert(version > pg->log.top); - pg->log.add(logentry); - assert(pg->log.top == version); - dout(10) << "prepare_log_transaction appended to " << *pg << dendl; - - // write to pg log on disk - pg->append_log(t, logentry, trim_to); -} - - -/** prepare_op_transaction - * apply an op to the store wrapped in a transaction. 
- */ -void OSD::prepare_op_transaction(ObjectStore::Transaction& t, - MOSDOp *op, eversion_t& version, - objectrev_t crev, objectrev_t rev, - PG *pg) -{ - const object_t oid = op->get_oid(); - const pg_t pgid = op->get_pg(); - - bool did_clone = false; - - dout(10) << "prepare_op_transaction " << MOSDOp::get_opname( op->get_op() ) - << " " << oid - << " v " << version - << " crev " << crev - << " rev " << rev - << " in " << *pg << dendl; - - // WRNOOP does nothing. - if (op->get_op() == OSD_OP_WRNOOP) - return; - - // raise last_complete? - if (pg->info.last_complete == pg->info.last_update) - pg->info.last_complete = version; - - // raise last_update. - assert(version > pg->info.last_update); - pg->info.last_update = version; - - // write pg info - t.collection_setattr(pgid, "info", &pg->info, sizeof(pg->info)); - - // clone? - if (crev && rev && rev > crev) { - object_t noid = oid; - noid.rev = rev; - dout(10) << "prepare_op_transaction cloning " << oid << " crev " << crev << " to " << noid << dendl; - t.clone(oid, noid); - did_clone = true; - } - - // apply the op - switch (op->get_op()) { - case OSD_OP_WRLOCK: - { // lock object - //r = store->setattr(oid, "wrlock", &op->get_asker(), sizeof(msg_addr_t), oncommit); - t.setattr(oid, "wrlock", &op->get_client(), sizeof(entity_name_t)); - } - break; - - case OSD_OP_WRUNLOCK: - { // unlock objects - //r = store->rmattr(oid, "wrlock", oncommit); - t.rmattr(oid, "wrlock"); - - // unblock all operations that were waiting for this object to become unlocked - if (waiting_for_wr_unlock.count(oid)) { - take_waiters(waiting_for_wr_unlock[oid]); - waiting_for_wr_unlock.erase(oid); - } - } - break; - - case OSD_OP_WRITE: - { // write - assert(op->get_data().length() == op->get_length()); - bufferlist bl; - bl.claim( op->get_data() ); // give buffers to store; we keep *op in memory for a long time! 
- - //if (oid < 100000000000000ULL) // hack hack-- don't write client data - t.write( oid, op->get_offset(), op->get_length(), bl ); - } - break; - - case OSD_OP_ZERO: - { - assert(0); // are you sure this is what you want? - // zero, remove, or truncate? - struct stat st; - int r = store->stat(oid, &st); - if (r >= 0) { - if (op->get_offset() + (off_t)op->get_length() >= (off_t)st.st_size) { - if (op->get_offset()) - t.truncate(oid, op->get_length() + op->get_offset()); - else - t.remove(oid); - } else { - // zero. the dumb way. FIXME. - bufferptr bp(op->get_length()); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - t.write(oid, op->get_offset(), op->get_length(), bl); - } - } else { - // noop? - dout(10) << "apply_transaction zero on " << oid << ", but dne? stat returns " << r << dendl; - } - } - break; - - case OSD_OP_TRUNCATE: - { // truncate - //r = store->truncate(oid, op->get_offset()); - t.truncate(oid, op->get_length() ); - } - break; - - case OSD_OP_DELETE: - { // delete - //r = store->remove(oid); - t.remove(oid); - } - break; - - default: - assert(0); - } - - // object collection, version - if (op->get_op() == OSD_OP_DELETE) { - // remove object from c - t.collection_remove(pgid, oid); - } else { - // add object to c - t.collection_add(pgid, oid); - - // object version - t.setattr(oid, "version", &version, sizeof(version)); - - // set object crev - if (crev == 0 || // new object - did_clone) // we cloned - t.setattr(oid, "crev", &rev, sizeof(rev)); - } -} diff --git a/tags/20070517_before_mds_merge/osd/OSD.h b/tags/20070517_before_mds_merge/osd/OSD.h deleted file mode 100644 index 5c5205a8c1aac..0000000000000 --- a/tags/20070517_before_mds_merge/osd/OSD.h +++ /dev/null @@ -1,273 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU 
Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __OSD_H -#define __OSD_H - -#include "msg/Dispatcher.h" - -#include "common/Mutex.h" -#include "common/ThreadPool.h" -#include "common/Timer.h" - -#include "mon/MonMap.h" - -#include "ObjectStore.h" -#include "PG.h" - -#include -using namespace std; -#include -#include -using namespace __gnu_cxx; - -#include "messages/MOSDOp.h" - -class Messenger; -class Message; - - - - -class OSD : public Dispatcher { -public: - - /** superblock - */ - OSDSuperblock superblock; - epoch_t boot_epoch; - - object_t get_osdmap_object_name(epoch_t epoch) { return object_t(0,epoch << 1); } - object_t get_inc_osdmap_object_name(epoch_t epoch) { return object_t(0, (epoch << 1) + 1); } - - void write_superblock(); - void write_superblock(ObjectStore::Transaction& t); - int read_superblock(); - - - /** OSD **/ - protected: - Messenger *messenger; - int whoami; - - static const int STATE_BOOTING = 1; - static const int STATE_ACTIVE = 2; - static const int STATE_STOPPING = 3; - - int state; - - bool is_booting() { return state == STATE_BOOTING; } - bool is_active() { return state == STATE_ACTIVE; } - bool is_stopping() { return state == STATE_STOPPING; } - - - MonMap *monmap; - - class Logger *logger; - - // local store - char dev_path[100]; - class ObjectStore *store; - - // heartbeat - void heartbeat(); - - class C_Heartbeat : public Context { - OSD *osd; - public: - C_Heartbeat(OSD *o) : osd(o) {} - void finish(int r) { - osd->heartbeat(); - } - }; - - // global lock - Mutex osd_lock; - SafeTimer timer; - - // -- stats -- - int hb_stat_ops; // ops since last heartbeat - int hb_stat_qlen; // cumulative queue length since last hb - - hash_map peer_qlen; - - // per-pg locking (serializing) - hash_set pg_lock; - hash_map > pg_lock_waiters; - PG *lock_pg(pg_t pgid); - PG *_lock_pg(pg_t pgid); - void unlock_pg(pg_t pgid); - void _unlock_pg(pg_t pgid); - - // 
finished waiting messages, that will go at tail of dispatch() - list finished; - void take_waiters(list& ls) { - finished.splice(finished.end(), ls); - } - - // object locking - hash_map > waiting_for_wr_unlock; /** list of operations for each object waiting for 'wrunlock' */ - - bool block_if_wrlocked(MOSDOp* op); - - // -- ops -- - class ThreadPool *threadpool; - hash_map > op_queue; - int pending_ops; - bool waiting_for_no_ops; - Cond no_pending_ops; - Cond op_queue_cond; - - void wait_for_no_ops(); - - void enqueue_op(pg_t pgid, Message *op); - void dequeue_op(pg_t pgid); - static void static_dequeueop(OSD *o, pg_t pgid) { - o->dequeue_op(pgid); - }; - - void do_op(Message *m, PG *pg); // actually do it - - void prepare_log_transaction(ObjectStore::Transaction& t, MOSDOp* op, eversion_t& version, - objectrev_t crev, objectrev_t rev, PG *pg, eversion_t trim_to); - void prepare_op_transaction(ObjectStore::Transaction& t, MOSDOp* op, eversion_t& version, - objectrev_t crev, objectrev_t rev, PG *pg); - - bool waitfor_missing_object(MOSDOp *op, PG *pg); - bool pick_missing_object_rev(object_t& oid, PG *pg); - bool pick_object_rev(object_t& oid); - - - - friend class PG; - - protected: - - // -- osd map -- - class OSDMap *osdmap; - list waiting_for_osdmap; - - hash_map peer_map_epoch; // FIXME types - bool _share_map_incoming(const entity_inst_t& inst, epoch_t epoch); - void _share_map_outgoing(const entity_inst_t& inst); - - void wait_for_new_map(Message *m); - void handle_osd_map(class MOSDMap *m); - - void advance_map(ObjectStore::Transaction& t); - void activate_map(ObjectStore::Transaction& t); - - void get_map(epoch_t e, OSDMap &m); - bool get_map_bl(epoch_t e, bufferlist& bl); - bool get_inc_map_bl(epoch_t e, bufferlist& bl); - bool get_inc_map(epoch_t e, OSDMap::Incremental &inc); - - void send_incremental_map(epoch_t since, const entity_inst_t& inst, bool full); - - - - // -- replication -- - - // PG - hash_map pg_map; - void load_pgs(); - bool 
pg_exists(pg_t pg); - PG *create_pg(pg_t pg, ObjectStore::Transaction& t); // create new PG - PG *get_pg(pg_t pg); // return existing PG, or null - void _remove_pg(pg_t pg); // remove from store and memory - - void project_pg_history(pg_t pgid, PG::Info::History& h, epoch_t from); - - void activate_pg(pg_t pgid, epoch_t epoch); - - class C_Activate : public Context { - OSD *osd; - pg_t pgid; - epoch_t epoch; - public: - C_Activate(OSD *o, pg_t p, epoch_t e) : osd(o), pgid(p), epoch(e) {} - void finish(int r) { - osd->activate_pg(pgid, epoch); - } - }; - - - tid_t last_tid; - int num_pulling; - - hash_map > waiting_for_pg; - - // replica ops - void get_repop_gather(PG::RepOpGather*); - void apply_repop(PG *pg, PG::RepOpGather *repop); - void put_repop_gather(PG *pg, PG::RepOpGather*); - void issue_repop(PG *pg, MOSDOp *op, int osd); - PG::RepOpGather *new_repop_gather(PG *pg, MOSDOp *op); - void repop_ack(PG *pg, PG::RepOpGather *repop, - int result, bool commit, - int fromosd, eversion_t pg_complete_thru=0); - - void handle_rep_op_ack(MOSDOpReply *m); - - // recovery - void do_notifies(map< int, list >& notify_list); - void do_queries(map< int, map >& query_map); - void repeer(PG *pg, map< int, map >& query_map); - - void pull(PG *pg, object_t oid); - void push(PG *pg, object_t oid, int dest); - - bool require_current_map(Message *m, epoch_t v); - bool require_same_or_newer_map(Message *m, epoch_t e); - - void handle_pg_query(class MOSDPGQuery *m); - void handle_pg_notify(class MOSDPGNotify *m); - void handle_pg_log(class MOSDPGLog *m); - void handle_pg_remove(class MOSDPGRemove *m); - - void op_pull(class MOSDOp *op, PG *pg); - void op_push(class MOSDOp *op, PG *pg); - - void op_rep_modify(class MOSDOp *op, PG *pg); // write, trucnate, delete - void op_rep_modify_commit(class MOSDOp *op, int ackerosd, - eversion_t last_complete); - friend class C_OSD_RepModifyCommit; - - - public: - OSD(int id, Messenger *m, MonMap *mm, char *dev = 0); - ~OSD(); - - // 
startup/shutdown - int init(); - int shutdown(); - - // messages - virtual void dispatch(Message *m); - virtual void ms_handle_failure(Message *m, const entity_inst_t& inst); - - void handle_osd_ping(class MOSDPing *m); - void handle_op(class MOSDOp *m); - - void op_read(class MOSDOp *m);//, PG *pg); - void op_stat(class MOSDOp *m);//, PG *pg); - void op_modify(class MOSDOp *m, PG *pg); - void op_modify_commit(pg_t pgid, tid_t rep_tid, eversion_t pg_complete_thru); - - // for replication - void handle_op_reply(class MOSDOpReply *m); - - void force_remount(); -}; - -#endif diff --git a/tags/20070517_before_mds_merge/osd/OSDMap.h b/tags/20070517_before_mds_merge/osd/OSDMap.h deleted file mode 100644 index 163c14e65ed24..0000000000000 --- a/tags/20070517_before_mds_merge/osd/OSDMap.h +++ /dev/null @@ -1,519 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __OSDMAP_H -#define __OSDMAP_H - -/* - * describe properties of the OSD cluster. 
- * disks, disk groups, total # osds, - * - */ -#include "config.h" -#include "include/types.h" -#include "osd_types.h" -#include "msg/Message.h" -#include "common/Mutex.h" -#include "common/Clock.h" - -#include "crush/crush.h" -using namespace crush; - -#include -#include -#include -#include -using namespace std; - - -/* - * some system constants - */ - -// from LSB to MSB, -#define PG_PS_BITS 16 // max bits for placement seed/group portion of PG -#define PG_REP_BITS 6 // up to 64 replicas -#define PG_TYPE_BITS 2 -#define PG_PS_MASK ((1LL< new_up; - map new_down; - list new_in; - list new_out; - map new_overload; // updated overload value - list old_overload; // no longer overload - - void encode(bufferlist& bl) { - bl.append((char*)&epoch, sizeof(epoch)); - bl.append((char*)&mon_epoch, sizeof(mon_epoch)); - bl.append((char*)&ctime, sizeof(ctime)); - ::_encode(new_up, bl); - ::_encode(new_down, bl); - ::_encode(new_in, bl); - ::_encode(new_out, bl); - ::_encode(new_overload, bl); - } - void decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - bl.copy(off, sizeof(mon_epoch), (char*)&mon_epoch); - off += sizeof(mon_epoch); - bl.copy(off, sizeof(ctime), (char*)&ctime); - off += sizeof(ctime); - ::_decode(new_up, bl, off); - ::_decode(new_down, bl, off); - ::_decode(new_in, bl, off); - ::_decode(new_out, bl, off); - ::_decode(new_overload, bl, off); - } - - Incremental(epoch_t e=0) : epoch(e), mon_epoch(0) {} - }; - -private: - epoch_t epoch; // what epoch of the osd cluster descriptor is this - epoch_t mon_epoch; // monitor epoch (election iteration) - utime_t ctime; // epoch start time - int pg_bits; // placement group bits - int localized_pg_bits; // bits for localized pgs - - set osds; // all osds - set down_osds; // list of down disks - set out_osds; // list of unmapped disks - map overload_osds; - map osd_inst; - - public: - Crush crush; // hierarchical map - - friend class OSDMonitor; - friend class MDS; - - 
public: - OSDMap() : epoch(0), mon_epoch(0), pg_bits(5), localized_pg_bits(3) {} - - // map info - epoch_t get_epoch() const { return epoch; } - void inc_epoch() { epoch++; } - - int get_pg_bits() const { return pg_bits; } - void set_pg_bits(int b) { pg_bits = b; } - int get_localized_pg_bits() const { return localized_pg_bits; } - - const utime_t& get_ctime() const { return ctime; } - - bool is_mkfs() const { return epoch == 1; } - //void set_mkfs() { assert(epoch == 1); } - - /***** cluster state *****/ - int num_osds() { return osds.size(); } - void get_all_osds(set& ls) { ls = osds; } - - const set& get_osds() { return osds; } - const set& get_down_osds() { return down_osds; } - const set& get_out_osds() { return out_osds; } - const map& get_overload_osds() { return overload_osds; } - - bool is_down(int osd) { return down_osds.count(osd); } - bool is_up(int osd) { return !is_down(osd); } - bool is_out(int osd) { return out_osds.count(osd); } - bool is_in(int osd) { return !is_out(osd); } - - const entity_inst_t& get_inst(int osd) { - assert(osd_inst.count(osd)); - return osd_inst[osd]; - } - bool get_inst(int osd, entity_inst_t& inst) { - if (osd_inst.count(osd)) { - inst = osd_inst[osd]; - return true; - } - return false; - } - - void mark_down(int o) { down_osds.insert(o); } - void mark_up(int o) { down_osds.erase(o); } - void mark_out(int o) { out_osds.insert(o); } - void mark_in(int o) { out_osds.erase(o); } - - - void apply_incremental(Incremental &inc) { - assert(inc.epoch == epoch+1); - epoch++; - mon_epoch = inc.mon_epoch; - ctime = inc.ctime; - - for (map::iterator i = inc.new_up.begin(); - i != inc.new_up.end(); - i++) { - assert(down_osds.count(i->first)); - down_osds.erase(i->first); - assert(osd_inst.count(i->first) == 0); - osd_inst[i->first] = i->second; - //cout << "epoch " << epoch << " up osd" << i->first << endl; - } - for (map::iterator i = inc.new_down.begin(); - i != inc.new_down.end(); - i++) { - assert(down_osds.count(i->first) == 0); - 
down_osds.insert(i->first); - assert(osd_inst.count(i->first) == 0 || - osd_inst[i->first] == i->second); - osd_inst.erase(i->first); - //cout << "epoch " << epoch << " down osd" << i->first << endl; - } - for (list::iterator i = inc.new_in.begin(); - i != inc.new_in.end(); - i++) { - assert(out_osds.count(*i)); - out_osds.erase(*i); - //cout << "epoch " << epoch << " in osd" << *i << endl; - } - for (list::iterator i = inc.new_out.begin(); - i != inc.new_out.end(); - i++) { - assert(out_osds.count(*i) == 0); - out_osds.insert(*i); - //cout << "epoch " << epoch << " out osd" << *i << endl; - } - for (map::iterator i = inc.new_overload.begin(); - i != inc.new_overload.end(); - i++) { - overload_osds[i->first] = i->second; - } - for (list::iterator i = inc.old_overload.begin(); - i != inc.old_overload.end(); - i++) { - assert(overload_osds.count(*i)); - overload_osds.erase(*i); - } - } - - // serialize, unserialize - void encode(bufferlist& blist) { - blist.append((char*)&epoch, sizeof(epoch)); - blist.append((char*)&mon_epoch, sizeof(mon_epoch)); - blist.append((char*)&ctime, sizeof(ctime)); - blist.append((char*)&pg_bits, sizeof(pg_bits)); - - _encode(osds, blist); - _encode(down_osds, blist); - _encode(out_osds, blist); - _encode(overload_osds, blist); - _encode(osd_inst, blist); - - crush._encode(blist); - } - - void decode(bufferlist& blist) { - int off = 0; - blist.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - blist.copy(off, sizeof(mon_epoch), (char*)&mon_epoch); - off += sizeof(mon_epoch); - blist.copy(off, sizeof(ctime), (char*)&ctime); - off += sizeof(ctime); - blist.copy(off, sizeof(pg_bits), (char*)&pg_bits); - off += sizeof(pg_bits); - - _decode(osds, blist, off); - _decode(down_osds, blist, off); - _decode(out_osds, blist, off); - _decode(overload_osds, blist, off); - _decode(osd_inst, blist, off); - - crush._decode(blist, off); - } - - - - - /**** mapping facilities ****/ - - // oid -> pg - pg_t object_to_pg(object_t oid, 
FileLayout& layout) { - static crush::Hash H(777); - - int policy = layout.object_layout; - if (policy == 0) - policy = g_conf.osd_object_layout; - - int type = PG_TYPE_RAND; - ps_t ps; - - switch (policy) { - case OBJECT_LAYOUT_LINEAR: - { - //const object_t ono = oid.bno; - //const inodeno_t ino = oid >> OID_ONO_BITS; - ps = (oid.bno + oid.ino) & PG_PS_MASK; - ps &= ((1ULL<> OID_ONO_BITS; - ps = (oid.bno + H(oid.ino)) & PG_PS_MASK; - ps &= ((1ULL<> 32) ) & PG_PS_MASK; - ps &= ((1ULL< pg - pg_t ps_nrep_to_pg(ps_t ps, int nrep) { - /*return ((pg_t)ps & ((1ULL< nrep - int pg_to_nrep(pg_t pg) { - return pg.u.fields.nrep; - //return (pg >> PG_PS_BITS) & ((1ULL << PG_REP_BITS)-1); - } - - // pg -> ps - int pg_to_ps(pg_t pg) { - //return pg & PG_PS_MASK; - return pg.u.fields.ps; - } - - // pg -> (osd list) - int pg_to_osds(pg_t pg, - vector& osds) { // list of osd addr's - pg_t ps = pg_to_ps(pg); - int num_rep = pg_to_nrep(pg); - assert(num_rep > 0); - - // map to osds[] - switch (g_conf.osd_pg_layout) { - case PG_LAYOUT_CRUSH: - { - int forcefeed = -1; - if (pg.u.fields.preferred > 0 && - out_osds.count(pg.u.fields.preferred-1) == 0) - forcefeed = pg.u.fields.preferred-1; - crush.do_rule(crush.rules[num_rep], // FIXME rule thing. - ps, - osds, - out_osds, overload_osds, - forcefeed); - } - break; - - case PG_LAYOUT_LINEAR: - for (int i=0; i 0 && - g_conf.osd_pg_layout != PG_LAYOUT_CRUSH) { - int osd = pg.u.fields.preferred-1; - - // already in there? - if (osds.empty()) { - osds.push_back(osd); - } else { - assert(num_rep > 0); - for (int i=1; i (up osd list) - int pg_to_acting_osds(pg_t pg, - vector& osds) { // list of osd addr's - // get rush list - vector raw; - pg_to_osds(pg, raw); - - osds.clear(); - for (unsigned i=0; i primary osd - int get_pg_primary(pg_t pg) { - vector group; - int nrep = pg_to_osds(pg, group); - if (nrep) - return group[0]; - return -1; // we fail! 
- } - - // pg -> acting primary osd - int get_pg_acting_primary(pg_t pg) { - vector group; - int nrep = pg_to_acting_osds(pg, group); - if (nrep > 0) - return group[0]; - return -1; // we fail! - } - int get_pg_acting_tail(pg_t pg) { - vector group; - int nrep = pg_to_acting_osds(pg, group); - if (nrep > 0) - return group[group.size()-1]; - return -1; // we fail! - } - - - /* what replica # is a given osd? 0 primary, -1 for none. */ - int calc_pg_rank(int osd, vector& acting, int nrep=0) { - if (!nrep) nrep = acting.size(); - for (int i=0; i& acting, int nrep=0) { - if (!nrep) nrep = acting.size(); - int rank = calc_pg_rank(osd, acting, nrep); - - if (rank < 0) return PG_ROLE_STRAY; - else if (rank == 0) return PG_ROLE_HEAD; - else if (rank == 1) return PG_ROLE_ACKER; - else return PG_ROLE_MIDDLE; - } - - int get_pg_role(pg_t pg, int osd) { - vector group; - int nrep = pg_to_osds(pg, group); - return calc_pg_role(osd, group, nrep); - } - - /* rank is -1 (stray), 0 (primary), 1,2,3,... (replica) */ - int get_pg_acting_rank(pg_t pg, int osd) { - vector group; - int nrep = pg_to_acting_osds(pg, group); - return calc_pg_rank(osd, group, nrep); - } - /* role is -1 (stray), 0 (primary), 1 (replica) */ - int get_pg_acting_role(pg_t pg, int osd) { - vector group; - int nrep = pg_to_acting_osds(pg, group); - return calc_pg_role(osd, group, nrep); - } - - - - -}; - - -#endif diff --git a/tags/20070517_before_mds_merge/osd/ObjectStore.cc b/tags/20070517_before_mds_merge/osd/ObjectStore.cc deleted file mode 100644 index 82af869e93775..0000000000000 --- a/tags/20070517_before_mds_merge/osd/ObjectStore.cc +++ /dev/null @@ -1,149 +0,0 @@ - -#include "ObjectStore.h" - -#include "config.h" -#include "common/Clock.h" - - -object_t ObjectStore::age_get_oid() { - if (!age_free_oids.empty()) { - object_t o = age_free_oids.front(); - age_free_oids.pop_front(); - return o; - } - return age_cur_oid++; - } - - ssize_t ObjectStore::age_pick_size() { - ssize_t max = file_size_distn.sample() 
* 1024; - return max/2 + (rand() % 100) * max/200 + 1; - } - - void ObjectStore::age_fill(float pc, utime_t until) { - bufferptr bp(1024*1024); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - while (1) { - if (g_clock.now() > until) break; - - struct statfs st; - statfs(&st); - float a = (float)(st.f_blocks-st.f_bavail) / (float)st.f_blocks; - if (a >= pc) { - dout(10) << "age_fill at " << a << " / " << pc << " stopping" << endl; - break; - } - - object_t oid = age_get_oid(); - - int b = rand() % 10; - age_objects[b].push_back(oid); - - ssize_t s = age_pick_size(); - - dout(10) << "age_fill at " << a << " / " << pc << " creating " << hex << oid << dec << " sz " << s << endl; - - off_t off = 0; - while (s) { - ssize_t t = MIN(s, 1024*1024); - write(oid, t, off, bl, false); - off += t; - s -= t; - } - oid++; - } - } - - void ObjectStore::age_empty(float pc) { - int nper = 20; - int n = nper; - while (1) { - struct statfs st; - statfs(&st); - float a = (float)(st.f_blocks-st.f_bavail) / (float)st.f_blocks; - if (a <= pc) { - dout(10) << "age_empty at " << a << " / " << pc << " stopping" << endl; - break; - } - - int b = rand() % 10; - n--; - if (n == 0 || age_objects[b].empty()) { - dout(10) << "age_empty sync" << endl; - //sync(); - sync(); - n = nper; - continue; - } - object_t oid = age_objects[b].front(); - age_objects[b].pop_front(); - - dout(10) << "age_empty at " << a << " / " << pc << " removing " << hex << oid << dec << endl; - - remove(oid); - age_free_oids.push_back(oid); - } - } - - - void ObjectStore::age(int time, - float high_water, // fill to this % - float low_water, // then empty to this % - int count, // this many times - float final_water, // and end here ( <= low_water) - int fake_size_mb) { - utime_t until = g_clock.now(); - until.sec_ref() += time; - - while (age_objects.size() < 10) age_objects.push_back( list() ); - - if (fake_size_mb) { - int fake_bl = fake_size_mb * 256; - struct statfs st; - statfs(&st); - float f = (float)fake_bl / 
(float)st.f_blocks; - high_water = (float)high_water * f; - low_water = (float)low_water * f; - final_water = (float)final_water * f; - dout(10) << "fake " << fake_bl << " / " << st.f_blocks << " is " << f << ", high " << high_water << " low " << low_water << " final " << final_water << endl; - } - - // init size distn (once) - if (!did_distn) { - did_distn = true; - age_cur_oid = 1; - file_size_distn.add(1, 19.0758125+0.65434375); - file_size_distn.add(512, 35.6566); - file_size_distn.add(1024, 27.7271875); - file_size_distn.add(2*1024, 16.63503125); - //file_size_distn.add(4*1024, 106.82384375); - //file_size_distn.add(8*1024, 81.493375); - //file_size_distn.add(16*1024, 14.13553125); - //file_size_distn.add(32*1024, 2.176); - //file_size_distn.add(256*1024, 0.655938); - //file_size_distn.add(512*1024, 0.1480625); - //file_size_distn.add(1*1024*1024, 0.020125); // actually 2, but 32bit - file_size_distn.normalize(); - } - - // clear - for (int i=0; i<10; i++) - age_objects[i].clear(); - - for (int c=1; c<=count; c++) { - if (g_clock.now() > until) break; - - dout(1) << "age " << c << "/" << count << " filling to " << high_water << endl; - age_fill(high_water, until); - if (c == count) { - dout(1) << "age final empty to " << final_water << endl; - age_empty(final_water); - } else { - dout(1) << "age " << c << "/" << count << " emptying to " << low_water << endl; - age_empty(low_water); - } - } - dout(1) << "age finished" << endl; - } - diff --git a/tags/20070517_before_mds_merge/osd/ObjectStore.h b/tags/20070517_before_mds_merge/osd/ObjectStore.h deleted file mode 100644 index 9ff94adfcae99..0000000000000 --- a/tags/20070517_before_mds_merge/osd/ObjectStore.h +++ /dev/null @@ -1,505 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser 
General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __OBJECTSTORE_H -#define __OBJECTSTORE_H - -#include "include/types.h" -#include "osd_types.h" -#include "include/Context.h" -#include "include/buffer.h" - -#include "include/Distribution.h" - -#include - -#ifdef DARWIN -#include -#else -#include /* or */ -#endif /* DARWIN */ - -#include -using namespace std; - -#ifndef MIN -# define MIN(a,b) ((a) < (b) ? (a):(b)) -#endif - -/* - * low-level interface to the local OSD file system - */ - - - -class ObjectStore { -public: - - - class FragmentationStat { - public: - int total; - int num_extent; - int avg_extent; - map extent_dist; // powers of two - map extent_dist_sum; // powers of two - - float avg_extent_per_object; - int avg_extent_jump; // avg distance bweteen consecutive extents - - int total_free; - int num_free_extent; - int avg_free_extent; - map free_extent_dist; // powers of two - map free_extent_dist_sum; // powers of two - }; - - - - /********************************* - * transaction - */ - class Transaction { - public: - static const int OP_READ = 1; // oid, offset, len, pbl - static const int OP_STAT = 2; // oid, pstat - static const int OP_GETATTR = 3; // oid, attrname, pattrval - static const int OP_GETATTRS = 4; // oid, pattrset - - static const int OP_WRITE = 10; // oid, offset, len, bl - static const int OP_TRUNCATE = 11; // oid, len - static const int OP_REMOVE = 13; // oid - static const int OP_SETATTR = 14; // oid, attrname, attrval - static const int OP_SETATTRS = 15; // oid, attrset - static const int OP_RMATTR = 16; // oid, attrname - static const int OP_CLONE = 17; // oid, newoid - - static const int OP_TRIMCACHE = 18; // oid, offset, len - - static const int OP_MKCOLL = 20; // cid - static const int OP_RMCOLL = 21; // cid - static const int OP_COLL_ADD = 22; // cid, oid - static const int OP_COLL_REMOVE = 23; // cid, oid - static const int OP_COLL_SETATTR = 24; 
// cid, attrname, attrval - static const int OP_COLL_RMATTR = 25; // cid, attrname - - list ops; - list bls; - list oids; - list cids; - list offsets; - list lengths; - list attrnames; - //list< pair > attrvals; - list attrbls; - - list pbls; - list psts; - list< pair > pattrvals; - list< map* > pattrsets; - - void read(object_t oid, off_t off, size_t len, bufferlist *pbl) { - int op = OP_READ; - ops.push_back(op); - oids.push_back(oid); - offsets.push_back(off); - lengths.push_back(len); - pbls.push_back(pbl); - } - void stat(object_t oid, struct stat *st) { - int op = OP_STAT; - ops.push_back(op); - oids.push_back(oid); - psts.push_back(st); - } - void getattr(object_t oid, const char* name, void* val, int *plen) { - int op = OP_GETATTR; - ops.push_back(op); - oids.push_back(oid); - attrnames.push_back(name); - pattrvals.push_back(pair(val,plen)); - } - void getattrs(object_t oid, map& aset) { - int op = OP_GETATTRS; - ops.push_back(op); - oids.push_back(oid); - pattrsets.push_back(&aset); - } - - void write(object_t oid, off_t off, size_t len, bufferlist& bl) { - int op = OP_WRITE; - ops.push_back(op); - oids.push_back(oid); - offsets.push_back(off); - lengths.push_back(len); - bls.push_back(bl); - } - void trim_from_cache(object_t oid, off_t off, size_t len) { - int op = OP_TRIMCACHE; - ops.push_back(op); - oids.push_back(oid); - offsets.push_back(off); - lengths.push_back(len); - } - void truncate(object_t oid, off_t off) { - int op = OP_TRUNCATE; - ops.push_back(op); - oids.push_back(oid); - offsets.push_back(off); - } - void remove(object_t oid) { - int op = OP_REMOVE; - ops.push_back(op); - oids.push_back(oid); - } - void setattr(object_t oid, const char* name, const void* val, int len) { - int op = OP_SETATTR; - ops.push_back(op); - oids.push_back(oid); - attrnames.push_back(name); - //attrvals.push_back(pair(val,len)); - bufferlist bl; - bl.append((char*)val,len); - attrbls.push_back(bl); - } - void setattrs(object_t oid, map& attrset) { - int op = 
OP_SETATTRS; - ops.push_back(op); - oids.push_back(oid); - pattrsets.push_back(&attrset); - } - void rmattr(object_t oid, const char* name) { - int op = OP_RMATTR; - ops.push_back(op); - oids.push_back(oid); - attrnames.push_back(name); - } - void clone(object_t oid, object_t noid) { - int op = OP_CLONE; - ops.push_back(op); - oids.push_back(oid); - oids.push_back(noid); - } - void create_collection(coll_t cid) { - int op = OP_MKCOLL; - ops.push_back(op); - cids.push_back(cid); - } - void remove_collection(coll_t cid) { - int op = OP_RMCOLL; - ops.push_back(op); - cids.push_back(cid); - } - void collection_add(coll_t cid, object_t oid) { - int op = OP_COLL_ADD; - ops.push_back(op); - cids.push_back(cid); - oids.push_back(oid); - } - void collection_remove(coll_t cid, object_t oid) { - int op = OP_COLL_REMOVE; - ops.push_back(op); - cids.push_back(cid); - oids.push_back(oid); - } - void collection_setattr(coll_t cid, const char* name, const void* val, int len) { - int op = OP_COLL_SETATTR; - ops.push_back(op); - cids.push_back(cid); - attrnames.push_back(name); - //attrvals.push_back(pair(val,len)); - bufferlist bl; - bl.append((char*)val, len); - attrbls.push_back(bl); - } - void collection_rmattr(coll_t cid, const char* name) { - int op = OP_COLL_RMATTR; - ops.push_back(op); - cids.push_back(cid); - attrnames.push_back(name); - } - - // etc. - }; - - - - /* this implementation is here only for naive ObjectStores that - * do not do atomic transactions natively. it is not atomic. 
- */ - virtual unsigned apply_transaction(Transaction& t, Context *onsafe=0) { - // non-atomic implementation - for (list::iterator p = t.ops.begin(); - p != t.ops.end(); - p++) { - switch (*p) { - case Transaction::OP_READ: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t offset = t.offsets.front(); t.offsets.pop_front(); - size_t len = t.lengths.front(); t.lengths.pop_front(); - bufferlist *pbl = t.pbls.front(); t.pbls.pop_front(); - read(oid, offset, len, *pbl); - } - break; - case Transaction::OP_STAT: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - struct stat *st = t.psts.front(); t.psts.pop_front(); - stat(oid, st); - } - break; - case Transaction::OP_GETATTR: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); - pair pattrval = t.pattrvals.front(); t.pattrvals.pop_front(); - *pattrval.second = getattr(oid, attrname, pattrval.first, *pattrval.second); - } - break; - case Transaction::OP_GETATTRS: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - map *pset = t.pattrsets.front(); t.pattrsets.pop_front(); - getattrs(oid, *pset); - } - break; - - case Transaction::OP_WRITE: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t offset = t.offsets.front(); t.offsets.pop_front(); - size_t len = t.lengths.front(); t.lengths.pop_front(); - bufferlist bl = t.bls.front(); t.bls.pop_front(); - write(oid, offset, len, bl, 0); - } - break; - - case Transaction::OP_TRIMCACHE: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t offset = t.offsets.front(); t.offsets.pop_front(); - size_t len = t.lengths.front(); t.lengths.pop_front(); - trim_from_cache(oid, offset, len); - } - break; - - case Transaction::OP_TRUNCATE: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t len = t.offsets.front(); t.offsets.pop_front(); - truncate(oid, len, 0); - } - break; - - case Transaction::OP_REMOVE: - { - object_t oid = t.oids.front(); 
t.oids.pop_front(); - remove(oid, 0); - } - break; - - case Transaction::OP_SETATTR: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); - //pair attrval = t.attrvals.front(); t.attrvals.pop_front(); - bufferlist bl; - bl.claim( t.attrbls.front() ); - t.attrbls.pop_front(); - setattr(oid, attrname, bl.c_str(), bl.length(), 0); - } - break; - case Transaction::OP_SETATTRS: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - map *pattrset = t.pattrsets.front(); t.pattrsets.pop_front(); - setattrs(oid, *pattrset, 0); - } - break; - - case Transaction::OP_RMATTR: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); - rmattr(oid, attrname, 0); - } - break; - - case Transaction::OP_CLONE: - { - object_t oid = t.oids.front(); t.oids.pop_front(); - object_t noid = t.oids.front(); t.oids.pop_front(); - clone(oid, noid); - } - break; - - case Transaction::OP_MKCOLL: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - create_collection(cid, 0); - } - break; - - case Transaction::OP_RMCOLL: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - destroy_collection(cid, 0); - } - break; - - case Transaction::OP_COLL_ADD: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - object_t oid = t.oids.front(); t.oids.pop_front(); - collection_add(cid, oid, 0); - } - break; - - case Transaction::OP_COLL_REMOVE: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - object_t oid = t.oids.front(); t.oids.pop_front(); - collection_remove(cid, oid, 0); - } - break; - - case Transaction::OP_COLL_SETATTR: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); - //pair attrval = t.attrvals.front(); t.attrvals.pop_front(); - bufferlist bl; - bl.claim( t.attrbls.front() ); - t.attrbls.pop_front(); - collection_setattr(cid, attrname, bl.c_str(), bl.length(), 
0); - } - break; - - case Transaction::OP_COLL_RMATTR: - { - coll_t cid = t.cids.front(); t.cids.pop_front(); - const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); - collection_rmattr(cid, attrname, 0); - } - break; - - - default: - cerr << "bad op " << *p << endl; - assert(0); - } - } - - if (onsafe) sync(onsafe); - - return 0; // FIXME count errors - } - - /*********************************************/ - - - - public: - ObjectStore() {} - virtual ~ObjectStore() {} - - // mgmt - virtual int mount() = 0; - virtual int umount() = 0; - virtual int mkfs() = 0; // wipe - - virtual int statfs(struct statfs *buf) = 0; - - // objects - virtual int pick_object_revision_lt(object_t& oid) = 0; - - virtual bool exists(object_t oid) = 0; // useful? - virtual int stat(object_t oid, struct stat *st) = 0; // struct stat? - - virtual int remove(object_t oid, - Context *onsafe=0) = 0; - - virtual int truncate(object_t oid, off_t size, - Context *onsafe=0) = 0; - - virtual int read(object_t oid, - off_t offset, size_t len, - bufferlist& bl) = 0; - - /*virtual int write(object_t oid, - off_t offset, size_t len, - bufferlist& bl, - bool fsync=true) = 0; - */ - virtual int write(object_t oid, - off_t offset, size_t len, - bufferlist& bl, - Context *onsafe) = 0;//{ return -1; } - virtual void trim_from_cache(object_t oid, - off_t offset, size_t len) { } - - virtual int setattr(object_t oid, const char *name, - const void *value, size_t size, - Context *onsafe=0) {return 0;} //= 0; - virtual int setattrs(object_t oid, map& aset, - Context *onsafe=0) {return 0;} //= 0; - virtual int getattr(object_t oid, const char *name, - void *value, size_t size) {return 0;} //= 0; - virtual int getattrs(object_t oid, map& aset) {return 0;}; - - virtual int rmattr(object_t oid, const char *name, - Context *onsafe=0) {return 0;} - - virtual int clone(object_t oid, object_t noid) { - return -1; - } - - //virtual int listattr(object_t oid, char *attrs, size_t size) {return 0;} //= 0; - - 
// collections - virtual int list_collections(list& ls) {return 0;}//= 0; - virtual int create_collection(coll_t c, - Context *onsafe=0) {return 0;}//= 0; - virtual int destroy_collection(coll_t c, - Context *onsafe=0) {return 0;}//= 0; - virtual bool collection_exists(coll_t c) {return 0;} - virtual int collection_stat(coll_t c, struct stat *st) {return 0;}//= 0; - virtual int collection_add(coll_t c, object_t o, - Context *onsafe=0) {return 0;}//= 0; - virtual int collection_remove(coll_t c, object_t o, - Context *onsafe=0) {return 0;}// = 0; - virtual int collection_list(coll_t c, list& o) {return 0;}//= 0; - - virtual int collection_setattr(coll_t cid, const char *name, - const void *value, size_t size, - Context *onsafe=0) {return 0;} //= 0; - virtual int collection_rmattr(coll_t cid, const char *name, - Context *onsafe=0) {return 0;} //= 0; - virtual int collection_getattr(coll_t cid, const char *name, - void *value, size_t size) {return 0;} //= 0; - //virtual int collection_listattr(coll_t cid, char *attrs, size_t size) {return 0;} //= 0; - - virtual void sync(Context *onsync) {} - virtual void sync() {} - - - virtual void _fake_writes(bool b) {}; - - virtual void _get_frag_stat(FragmentationStat& st) {}; - -}; - - -#endif diff --git a/tags/20070517_before_mds_merge/osd/PG.cc b/tags/20070517_before_mds_merge/osd/PG.cc deleted file mode 100644 index a3402ec0d8fac..0000000000000 --- a/tags/20070517_before_mds_merge/osd/PG.cc +++ /dev/null @@ -1,1333 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "PG.h" -#include "config.h" -#include "OSD.h" - -#include "common/Timer.h" - -#include "messages/MOSDPGNotify.h" -#include "messages/MOSDPGLog.h" -#include "messages/MOSDPGRemove.h" - -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cout << dbeginl << g_clock.now() << " osd" << osd->whoami << " " << (osd->osdmap ? osd->osdmap->get_epoch():0) << " " << *this << " " - - -/******* PGLog ********/ - -void PG::Log::copy_after(const Log &other, eversion_t v) -{ - assert(v >= other.bottom); - top = bottom = other.top; - for (list::const_reverse_iterator i = other.log.rbegin(); - i != other.log.rend(); - i++) { - if (i->version == v) break; - assert(i->version > v); - log.push_front(*i); - } - bottom = v; -} - -bool PG::Log::copy_after_unless_divergent(const Log &other, eversion_t split, eversion_t floor) -{ - assert(split >= other.bottom); - assert(floor >= other.bottom); - assert(floor <= split); - top = bottom = other.top; - - /* runs on replica. split is primary's log.top. floor is how much they want. - split tell us if the primary is divergent.. e.g.: - -> i am A, B is primary, split is 2'6, floor is 2'2. -A B C -2'2 2'2 -2'3 2'3 2'3 -2'4 2'4 2'4 -3'5 | 2'5 2'5 -3'6 | 2'6 -3'7 | -3'8 | -3'9 | - -> i return full backlog. - */ - - for (list::const_reverse_iterator i = other.log.rbegin(); - i != other.log.rend(); - i++) { - // is primary divergent? - // e.g. my 3'6 vs their 2'6 split - if (i->version.version == split.version && i->version.epoch > split.epoch) { - clear(); - return false; // divergent! - } - if (i->version == floor) break; - assert(i->version > floor); - - // e.g. 
my 2'23 > '12 - log.push_front(*i); - } - bottom = floor; - return true; -} - -void PG::Log::copy_non_backlog(const Log &other) -{ - if (other.backlog) { - top = other.top; - bottom = other.bottom; - for (list::const_reverse_iterator i = other.log.rbegin(); - i != other.log.rend(); - i++) - if (i->version > bottom) - log.push_front(*i); - else - break; - } else { - *this = other; - } -} - - - -void PG::IndexedLog::trim(ObjectStore::Transaction& t, eversion_t s) -{ - if (backlog && s < bottom) - s = bottom; - - while (!log.empty()) { - Entry &e = *log.begin(); - - if (e.version > s) break; - - assert(complete_to != log.begin()); - assert(requested_to != log.begin()); - - // remove from index, - unindex(e); - - // from log - log.pop_front(); - } - - // raise bottom? - if (backlog) backlog = false; - if (bottom < s) bottom = s; -} - - -void PG::IndexedLog::trim_write_ahead(eversion_t last_update) -{ - while (!log.empty() && - log.rbegin()->version > last_update) { - // remove from index - unindex(*log.rbegin()); - - // remove - log.pop_back(); - } -} - -void PG::trim_write_ahead() -{ - if (info.last_update < log.top) { - dout(10) << "trim_write_ahead (" << info.last_update << "," << log.top << "]" << dendl; - log.trim_write_ahead(info.last_update); - } else { - assert(info.last_update == log.top); - dout(10) << "trim_write_ahead last_update=top=" << info.last_update << dendl; - } - -} - -void PG::proc_replica_log(Log &olog, Missing& omissing, int from) -{ - dout(10) << "proc_replica_log for osd" << from << ": " << olog << " " << omissing << dendl; - assert(!is_active()); - - if (!have_master_log) { - // i'm building master log. - // note peer's missing. - peer_missing[from] = omissing; - - // merge log into our own log - merge_log(olog, omissing, from); - proc_missing(olog, omissing, from); - } else { - // i'm just building missing lists. - peer_missing[from] = omissing; - - // iterate over peer log. in reverse. 
- list::reverse_iterator pp = olog.log.rbegin(); - eversion_t lu = peer_info[from].last_update; - while (pp != olog.log.rend()) { - if (!log.objects.count(pp->oid)) { - dout(10) << " divergent " << *pp << " not in our log, generating backlog" << dendl; - generate_backlog(); - } - - if (!log.objects.count(pp->oid)) { - dout(10) << " divergent " << *pp << " dne, must have been new, ignoring" << dendl; - ++pp; - continue; - } - - if (log.objects[pp->oid]->version == pp->version) { - break; // we're no longer divergent. - //++pp; - //continue; - } - - if (log.objects[pp->oid]->version > pp->version) { - dout(10) << " divergent " << *pp - << " superceded by " << log.objects[pp->oid] - << ", ignoring" << dendl; - } else { - dout(10) << " divergent " << *pp << ", adding to missing" << dendl; - peer_missing[from].add(pp->oid, pp->version); - } - - ++pp; - if (pp != olog.log.rend()) - lu = pp->version; - else - lu = olog.bottom; - } - - if (lu < peer_info[from].last_update) { - dout(10) << " peer osd" << from << " last_update now " << lu << dendl; - peer_info[from].last_update = lu; - if (lu < oldest_update) { - dout(10) << " oldest_update now " << lu << dendl; - oldest_update = lu; - } - } - - proc_missing(olog, peer_missing[from], from); - } -} - -void PG::merge_log(Log &olog, Missing &omissing, int fromosd) -{ - dout(10) << "merge_log " << olog << " from osd" << fromosd - << " into " << log << dendl; - - //cout << "log" << dendl; - //log.print(cout); - //cout << "olog" << dendl; - //olog.print(cout); - - if (log.empty() || - (olog.bottom > log.top && olog.backlog)) { // e.g. log=(0,20] olog=(40,50]+backlog) - - // swap and index - log.log.swap(olog.log); - log.index(); - - // find split point (old log.top) in new log - // add new items to missing along the way. - for (list::reverse_iterator p = log.log.rbegin(); - p != log.log.rend(); - p++) { - if (p->version <= log.top) { - // ok, p is at split point. - - // was our old log divergent? 
- if (log.top > p->version) { - dout(10) << "merge_log i was (possibly) divergent for (" << p->version << "," << log.top << "]" << dendl; - if (p->version < oldest_update) - oldest_update = p->version; - - while (!olog.log.empty() && - olog.log.rbegin()->version > p->version) { - Log::Entry &oe = *olog.log.rbegin(); // old entry (possibly divergent) - if (log.objects.count(oe.oid)) { - if (log.objects[oe.oid]->version < oe.version) { - dout(10) << "merge_log divergent entry " << oe - << " not superceded by " << *log.objects[oe.oid] - << ", adding to missing" << dendl; - missing.add(oe.oid, oe.version); - } else { - dout(10) << "merge_log divergent entry " << oe - << " superceded by " << *log.objects[oe.oid] - << ", ignoring" << dendl; - } - } else { - dout(10) << "merge_log divergent entry " << oe << ", adding to missing" << dendl; - missing.add(oe.oid, oe.version); - } - olog.log.pop_back(); // discard divergent entry - } - } - break; - } - - if (p->is_delete()) { - dout(10) << "merge_log merging " << *p << ", not missing" << dendl; - missing.rm(p->oid, p->version); - } else { - dout(10) << "merge_log merging " << *p << ", now missing" << dendl; - missing.add(p->oid, p->version); - } - } - - info.last_update = log.top = olog.top; - info.log_bottom = log.bottom = olog.bottom; - info.log_backlog = log.backlog = olog.backlog; - } - - else { - // i can merge the two logs! - - // extend on bottom? - // FIXME: what if we have backlog, but they have lower bottom? - if (olog.bottom < log.bottom && olog.top >= log.bottom && !log.backlog) { - dout(10) << "merge_log extending bottom to " << olog.bottom - << (olog.backlog ? " +backlog":"") - << dendl; - - // ok - list::iterator from = olog.log.begin(); - list::iterator to; - for (to = from; - to != olog.log.end(); - to++) { - if (to->version > log.bottom) break; - - // update our index while we're here - log.index(*to); - - dout(15) << *to << dendl; - - // new missing object? 
- if (to->version > info.last_complete) { - if (to->is_update()) - missing.add(to->oid, to->version); - else - missing.rm(to->oid, to->version); - } - } - assert(to != olog.log.end()); - - // splice into our log. - log.log.splice(log.log.begin(), - olog.log, from, to); - - info.log_bottom = log.bottom = olog.bottom; - info.log_backlog = log.backlog = olog.backlog; - } - - // extend on top? - if (olog.top > log.top && - olog.bottom <= log.top) { - dout(10) << "merge_log extending top to " << olog.top << dendl; - - list::iterator to = olog.log.end(); - list::iterator from = olog.log.end(); - while (1) { - if (from == olog.log.begin()) break; - from--; - //dout(0) << "? " << *from << dendl; - if (from->version < log.top) { - from++; - break; - } - - log.index(*from); - dout(10) << "merge_log " << *from << dendl; - - // add to missing - if (from->is_update()) { - missing.add(from->oid, from->version); - } else - missing.rm(from->oid, from->version); - } - - // remove divergent items - while (1) { - Log::Entry *oldtail = &(*log.log.rbegin()); - if (oldtail->version.version+1 == from->version.version) break; - - // divergent! - assert(oldtail->version.version >= from->version.version); - - if (log.objects[oldtail->oid]->version == oldtail->version) { - // and significant. - dout(10) << "merge_log had divergent " << *oldtail << ", adding to missing" << dendl; - //missing.add(oldtail->oid); - assert(0); - } else { - dout(10) << "merge_log had divergent " << *oldtail << ", already missing" << dendl; - assert(missing.is_missing(oldtail->oid)); - } - log.log.pop_back(); - } - - // splice - log.log.splice(log.log.end(), - olog.log, from, to); - - info.last_update = log.top = olog.top; - } - } - - dout(10) << "merge_log result " << log << " " << missing << dendl; - //log.print(cout); - -} - -void PG::proc_missing(Log &olog, Missing &omissing, int fromosd) -{ - // found items? 
- for (map::iterator p = missing.missing.begin(); - p != missing.missing.end(); - p++) { - if (omissing.is_missing(p->first)) { - assert(omissing.is_missing(p->first, p->second)); - if (omissing.loc.count(p->first)) { - dout(10) << "proc_missing missing " << p->first << " " << p->second - << " on osd" << omissing.loc[p->first] << dendl; - missing.loc[p->first] = omissing.loc[p->first]; - } else { - dout(10) << "proc_missing missing " << p->first << " " << p->second - << " also LOST on source, osd" << fromosd << dendl; - } - } - else if (p->second <= olog.top) { - dout(10) << "proc_missing missing " << p->first << " " << p->second - << " on source, osd" << fromosd << dendl; - missing.loc[p->first] = fromosd; - } else { - dout(10) << "proc_missing " << p->first << " " << p->second - << " > olog.top " << olog.top << ", not found...." - << dendl; - } - } - - dout(10) << "proc_missing missing " << missing.missing << dendl; -} - - - -void PG::generate_backlog() -{ - dout(10) << "generate_backlog to " << log << dendl; - assert(!log.backlog); - log.backlog = true; - - list olist; - osd->store->collection_list(info.pgid, olist); - - int local = 0; - map add; - for (list::iterator it = olist.begin(); - it != olist.end(); - it++) { - local++; - - if (log.logged_object(*it)) continue; // already have it logged. - - // add entry - Log::Entry e; - e.op = Log::Entry::MODIFY; // FIXME when we do smarter op codes! 
- e.oid = *it; - osd->store->getattr(*it, - "version", - &e.version, sizeof(e.version)); - add[e.version] = e; - dout(10) << "generate_backlog found " << e << dendl; - } - - for (map::reverse_iterator i = add.rbegin(); - i != add.rend(); - i++) { - log.log.push_front(i->second); - log.index( *log.log.begin() ); // index - } - - dout(10) << local << " local objects, " - << add.size() << " objects added to backlog, " - << log.objects.size() << " in pg" << dendl; - - //log.print(cout); -} - -void PG::drop_backlog() -{ - dout(10) << "drop_backlog for " << log << dendl; - //log.print(cout); - - assert(log.backlog); - log.backlog = false; - - while (!log.log.empty()) { - Log::Entry &e = *log.log.begin(); - if (e.version > log.bottom) break; - - dout(15) << "drop_backlog trimming " << e.version << dendl; - log.unindex(e); - log.log.pop_front(); - } -} - - - - - -ostream& PG::Log::print(ostream& out) const -{ - out << *this << dendl; - for (list::const_iterator p = log.begin(); - p != log.end(); - p++) - out << *p << dendl; - return out; -} - - - - - -/******* PG ***********/ -void PG::build_prior() -{ - // build prior set. 
- prior_set.clear(); - - // current - for (unsigned i=1; iosdmap->get_epoch(); - epoch++) { - OSDMap omap; - osd->get_map(epoch, omap); - - vector acting; - omap.pg_to_acting_osds(get_pgid(), acting); - - for (unsigned i=0; iosdmap->is_up(acting[i]) && // is up now - acting[i] != osd->whoami) // and is not me - prior_set.insert(acting[i]); - } - } - - dout(10) << "build_prior built " << prior_set << dendl; -} - -void PG::adjust_prior() -{ - assert(!prior_set.empty()); - - // raise last_epoch_started_any - epoch_t max = 0; - for (map::iterator it = peer_info.begin(); - it != peer_info.end(); - it++) { - if (it->second.last_epoch_started > max) - max = it->second.last_epoch_started; - } - - dout(10) << "adjust_prior last_epoch_started_any " - << last_epoch_started_any << " -> " << max << dendl; - assert(max > last_epoch_started_any); - last_epoch_started_any = max; - - // rebuild prior set - build_prior(); -} - - -void PG::clear_primary_state() -{ - dout(10) << "clear_primary_state" << dendl; - - // clear peering state - have_master_log = false; - prior_set.clear(); - stray_set.clear(); - clean_set.clear(); - peer_info_requested.clear(); - peer_log_requested.clear(); - peer_info.clear(); - peer_missing.clear(); - - last_epoch_started_any = info.last_epoch_started; -} - -void PG::peer(ObjectStore::Transaction& t, - map< int, map >& query_map) -{ - dout(10) << "peer. acting is " << acting - << ", prior_set is " << prior_set << dendl; - - - /** GET ALL PG::Info *********/ - - // -- query info from everyone in prior_set. 
- bool missing_info = false; - for (set::iterator it = prior_set.begin(); - it != prior_set.end(); - it++) { - if (peer_info.count(*it)) { - dout(10) << " have info from osd" << *it - << ": " << peer_info[*it] - << dendl; - continue; - } - missing_info = true; - - if (peer_info_requested.count(*it)) { - dout(10) << " waiting for osd" << *it << dendl; - continue; - } - - dout(10) << " querying info from osd" << *it << dendl; - query_map[*it][info.pgid] = Query(Query::INFO, info.history); - peer_info_requested.insert(*it); - } - if (missing_info) return; - - - // -- ok, we have all (prior_set) info. (and maybe others.) - - // did we crash? - dout(10) << " last_epoch_started_any " << last_epoch_started_any << dendl; - if (last_epoch_started_any) { - OSDMap omap; - osd->get_map(last_epoch_started_any, omap); - - // start with the last active set of replicas - set last_started; - vector acting; - omap.pg_to_acting_osds(get_pgid(), acting); - for (unsigned i=0; iosdmap->get_epoch(); - e++) { - OSDMap omap; - osd->get_map(e, omap); - - set still_up; - - for (set::iterator i = last_started.begin(); - i != last_started.end(); - i++) { - //dout(10) << " down in epoch " << e << " is " << omap.get_down_osds() << dendl; - if (omap.is_up(*i)) - still_up.insert(*i); - } - - last_started.swap(still_up); - //dout(10) << " still active as of epoch " << e << ": " << last_started << dendl; - } - - if (last_started.empty()) { - dout(10) << " crashed since epoch " << last_epoch_started_any << dendl; - state_set(STATE_CRASHED); - } else { - dout(10) << " still active from last started: " << last_started << dendl; - } - } else if (osd->osdmap->get_epoch() > 1) { - dout(10) << " crashed since epoch " << last_epoch_started_any << dendl; - state_set(STATE_CRASHED); - } - - dout(10) << " peers_complete_thru " << peers_complete_thru << dendl; - - - - - /** CREATE THE MASTER PG::Log *********/ - - // who (of all priors and active) has the latest PG version? 
- eversion_t newest_update = info.last_update; - int newest_update_osd = osd->whoami; - - oldest_update = info.last_update; // only of acting (current) osd set. - peers_complete_thru = info.last_complete; - - for (map::iterator it = peer_info.begin(); - it != peer_info.end(); - it++) { - if (it->second.last_update > newest_update) { - newest_update = it->second.last_update; - newest_update_osd = it->first; - } - if (is_acting(it->first)) { - if (it->second.last_update < oldest_update) - oldest_update = it->second.last_update; - if (it->second.last_complete < peers_complete_thru) - peers_complete_thru = it->second.last_complete; - } - } - - // gather log(+missing) from that person! - if (newest_update_osd != osd->whoami) { - if (peer_log_requested.count(newest_update_osd) || - peer_summary_requested.count(newest_update_osd)) { - dout(10) << " newest update on osd" << newest_update_osd - << " v " << newest_update - << ", already queried" - << dendl; - } else { - // we'd like it back to oldest_update, but will settle for log_bottom - eversion_t since = MAX(peer_info[newest_update_osd].log_bottom, - oldest_update); - if (peer_info[newest_update_osd].log_bottom < log.top) { - dout(10) << " newest update on osd" << newest_update_osd - << " v " << newest_update - << ", querying since " << since - << dendl; - query_map[newest_update_osd][info.pgid] = Query(Query::LOG, log.top, since, info.history); - peer_log_requested.insert(newest_update_osd); - } else { - dout(10) << " newest update on osd" << newest_update_osd - << " v " << newest_update - << ", querying entire summary/backlog" - << dendl; - assert((peer_info[newest_update_osd].last_complete >= - peer_info[newest_update_osd].log_bottom) || - peer_info[newest_update_osd].log_backlog); // or else we're in trouble. 
- query_map[newest_update_osd][info.pgid] = Query(Query::BACKLOG, info.history); - peer_summary_requested.insert(newest_update_osd); - } - } - return; - } else { - dout(10) << " newest_update " << info.last_update << " (me)" << dendl; - } - - dout(10) << " oldest_update " << oldest_update << dendl; - - have_master_log = true; - - - // -- do i need to generate backlog for any of my peers? - if (oldest_update < log.bottom && !log.backlog) { - dout(10) << "generating backlog for some peers, bottom " - << log.bottom << " > " << oldest_update - << dendl; - generate_backlog(); - } - - - /** COLLECT MISSING+LOG FROM PEERS **********/ - /* - we also detect divergent replicas here by pulling the full log - from everyone. - */ - - // gather missing from peers - for (unsigned i=1; i 0) { - dout(10) << "there are still " << missing.num_lost() << " lost objects" << dendl; - - // ***** - // FIXME: i don't think this actually accomplishes anything! - // ***** - - // ok, let's get more summaries! - bool waiting = false; - for (map::iterator it = peer_info.begin(); - it != peer_info.end(); - it++) { - int peer = it->first; - - if (peer_summary_requested.count(peer)) { - dout(10) << " already requested summary/backlog from osd" << peer << dendl; - waiting = true; - continue; - } - - dout(10) << " requesting summary/backlog from osd" << peer << dendl; - query_map[peer][info.pgid] = Query(Query::BACKLOG, info.history); - peer_summary_requested.insert(peer); - waiting = true; - } - - if (!waiting) { - dout(10) << missing.num_lost() << " objects are still lost, waiting+hoping for a notify from someone else!" << dendl; - } - return; - } - - // sanity check - assert(missing.num_lost() == 0); - assert(info.last_complete >= log.bottom || log.backlog); - - - // -- crash recovery? 
- if (is_crashed()) { - dout(10) << "crashed, allowing op replay for " << g_conf.osd_replay_window << dendl; - state_set(STATE_REPLAY); - osd->timer.add_event_after(g_conf.osd_replay_window, - new OSD::C_Activate(osd, info.pgid, osd->osdmap->get_epoch())); - } - else if (!is_active()) { - // -- ok, activate! - activate(t); - } -} - - -void PG::activate(ObjectStore::Transaction& t) -{ - assert(!is_active()); - - // twiddle pg state - state_set(STATE_ACTIVE); - state_clear(STATE_STRAY); - if (is_crashed()) { - //assert(is_replay()); // HELP.. not on replica? - state_clear(STATE_CRASHED); - state_clear(STATE_REPLAY); - } - info.last_epoch_started = osd->osdmap->get_epoch(); - - if (role == 0) { // primary state - peers_complete_thru = 0; // we don't know (yet)! - } - - assert(info.last_complete >= log.bottom || log.backlog); - - // write pg info - t.collection_setattr(info.pgid, "info", (char*)&info, sizeof(info)); - - // write log - write_log(t); - - // clean up stray objects - clean_up_local(t); - - // init complete pointer - if (info.last_complete == info.last_update) { - dout(10) << "activate - complete" << dendl; - log.complete_to == log.log.end(); - log.requested_to = log.log.end(); - } - //else if (is_primary()) { - else if (true) { - dout(10) << "activate - not complete, " << missing << ", starting recovery" << dendl; - - // init complete_to - log.complete_to = log.log.begin(); - while (log.complete_to->version < info.last_complete) { - log.complete_to++; - assert(log.complete_to != log.log.end()); - } - - // start recovery - log.requested_to = log.complete_to; - do_recovery(); - } else { - dout(10) << "activate - not complete, " << missing << dendl; - } - - - // if primary.. - if (role == 0 && - osd->osdmap->get_epoch() > 1) { - // who is clean? 
- clean_set.clear(); - if (info.is_clean()) - clean_set.insert(osd->whoami); - - // start up replicas - for (unsigned i=1; iosdmap->get_epoch(), - info.pgid); - m->info = info; - - if (peer_info[peer].last_update == info.last_update) { - // empty log - } - else if (peer_info[peer].last_update < log.bottom) { - // summary/backlog - assert(log.backlog); - m->log = log; - } - else { - // incremental log - assert(peer_info[peer].last_update < info.last_update); - m->log.copy_after(log, peer_info[peer].last_update); - } - - // update local version of peer's missing list! - { - eversion_t plu = peer_info[peer].last_update; - Missing& pm = peer_missing[peer]; - for (list::iterator p = m->log.log.begin(); - p != m->log.log.end(); - p++) - if (p->version > plu) - pm.add(p->oid, p->version); - } - - dout(10) << "activate sending " << m->log << " " << m->missing - << " to osd" << peer << dendl; - //m->log.print(cout); - osd->messenger->send_message(m, osd->osdmap->get_inst(peer)); - - // update our missing - if (peer_missing[peer].num_missing() == 0) { - dout(10) << "activate peer osd" << peer << " already clean, " << peer_info[peer] << dendl; - assert(peer_info[peer].last_complete == info.last_update); - clean_set.insert(peer); - } else { - dout(10) << "activate peer osd" << peer << " " << peer_info[peer] - << " missing " << peer_missing[peer] << dendl; - } - - } - - // discard unneeded peering state - //peer_log.clear(); // actually, do this carefully, in case peer() is called again. - - // all clean? - if (is_all_clean()) { - state_set(STATE_CLEAN); - dout(10) << "activate all replicas clean" << dendl; - clean_replicas(); - } - } - - - // replay (queue them _before_ other waiting ops!) 
- if (!replay_queue.empty()) { - eversion_t c = info.last_update; - list replay; - for (map::iterator p = replay_queue.begin(); - p != replay_queue.end(); - p++) { - if (p->first <= info.last_update) { - dout(10) << "activate will WRNOOP " << p->first << " " << *p->second << dendl; - replay.push_back(p->second); - continue; - } - if (p->first.version != c.version+1) { - dout(10) << "activate replay " << p->first - << " skipping " << c.version+1 - p->first.version - << " ops" - << dendl; - } - dout(10) << "activate replay " << p->first << " " << *p->second << dendl; - replay.push_back(p->second); - c = p->first; - } - replay_queue.clear(); - osd->take_waiters(replay); - } - - // waiters - osd->take_waiters(waiting_for_active); -} - -/** clean_up_local - * remove any objects that we're storing but shouldn't. - * as determined by log. - */ -void PG::clean_up_local(ObjectStore::Transaction& t) -{ - dout(10) << "clean_up_local" << dendl; - - assert(info.last_update >= log.bottom); // otherwise we need some help! - - if (log.backlog) { - // be thorough. - list ls; - osd->store->collection_list(info.pgid, ls); - set s; - - for (list::iterator i = ls.begin(); - i != ls.end(); - i++) - s.insert(*i); - - set did; - for (list::reverse_iterator p = log.log.rbegin(); - p != log.log.rend(); - p++) { - if (did.count(p->oid)) continue; - did.insert(p->oid); - - if (p->is_delete()) { - if (s.count(p->oid)) { - dout(10) << " deleting " << p->oid - << " when " << p->version << dendl; - t.remove(p->oid); - } - s.erase(p->oid); - } else { - // just leave old objects.. they're missing or whatever - s.erase(p->oid); - } - } - - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - dout(10) << " deleting stray " << *i << dendl; - t.remove(*i); - } - - } else { - // just scan the log. 
- set did; - for (list::reverse_iterator p = log.log.rbegin(); - p != log.log.rend(); - p++) { - if (did.count(p->oid)) continue; - did.insert(p->oid); - - if (p->is_delete()) { - dout(10) << " deleting " << p->oid - << " when " << p->version << dendl; - t.remove(p->oid); - } else { - // keep old(+missing) objects, just for kicks. - } - } - } -} - - - -void PG::cancel_recovery() -{ - // forget about where missing items are, or anything we're pulling - missing.loc.clear(); - osd->num_pulling -= objects_pulling.size(); - objects_pulling.clear(); -} - -/** - * do one recovery op. - * return true if done, false if nothing left to do. - */ -bool PG::do_recovery() -{ - dout(-10) << "do_recovery pulling " << objects_pulling.size() << " in pg, " - << osd->num_pulling << "/" << g_conf.osd_max_pull << " total" - << dendl; - dout(10) << "do_recovery " << missing << dendl; - - // can we slow down on this PG? - if (osd->num_pulling >= g_conf.osd_max_pull && !objects_pulling.empty()) { - dout(-10) << "do_recovery already pulling max, waiting" << dendl; - return true; - } - - // look at log! - Log::Entry *latest = 0; - - while (log.requested_to != log.log.end()) { - assert(log.objects.count(log.requested_to->oid)); - latest = log.objects[log.requested_to->oid]; - assert(latest); - - dout(10) << "do_recovery " - << *log.requested_to - << (objects_pulling.count(latest->oid) ? " (pulling)":"") - << dendl; - - if (latest->is_update() && - !objects_pulling.count(latest->oid) && - missing.is_missing(latest->oid)) { - osd->pull(this, latest->oid); - return true; - } - - log.requested_to++; - } - - if (!objects_pulling.empty()) { - dout(7) << "do_recovery requested everything, still waiting" << dendl; - return false; - } - - // done? 
- assert(missing.num_missing() == 0); - assert(info.last_complete == info.last_update); - - if (is_primary()) { - // i am primary - dout(7) << "do_recovery complete, cleaning strays" << dendl; - clean_set.insert(osd->whoami); - if (is_all_clean()) { - state_set(PG::STATE_CLEAN); - clean_replicas(); - } - } else { - // tell primary - dout(7) << "do_recovery complete, telling primary" << dendl; - list ls; - ls.push_back(info); - osd->messenger->send_message(new MOSDPGNotify(osd->osdmap->get_epoch(), - ls), - osd->osdmap->get_inst(get_primary())); - } - - return false; -} - -void PG::do_peer_recovery() -{ - dout(10) << "do_peer_recovery" << dendl; - - for (unsigned i=0; isecond; - eversion_t v = peer_missing[peer].rmissing.begin()->first; - - osd->push(this, oid, peer); - - // do other peers need it too? - for (i++; ipush(this, oid, peer); - } - - return; - } - - // nothing to do! -} - - - -void PG::clean_replicas() -{ - dout(10) << "clean_replicas. strays are " << stray_set << dendl; - - for (set::iterator p = stray_set.begin(); - p != stray_set.end(); - p++) { - dout(10) << "sending PGRemove to osd" << *p << dendl; - set ls; - ls.insert(info.pgid); - MOSDPGRemove *m = new MOSDPGRemove(osd->osdmap->get_epoch(), ls); - osd->messenger->send_message(m, osd->osdmap->get_inst(*p)); - } - - stray_set.clear(); -} - - - -void PG::write_log(ObjectStore::Transaction& t) -{ - dout(10) << "write_log" << dendl; - - // assemble buffer - bufferlist bl; - - // build buffer - ondisklog.bottom = 0; - ondisklog.block_map.clear(); - for (list::iterator p = log.log.begin(); - p != log.log.end(); - p++) { - if (bl.length() % 4096 == 0) - ondisklog.block_map[bl.length()] = p->version; - bl.append((char*)&(*p), sizeof(*p)); - if (g_conf.osd_pad_pg_log) { // pad to 4k, until i fix ebofs reallocation crap. FIXME. 
- bufferptr bp(4096 - sizeof(*p)); - bl.push_back(bp); - } - } - ondisklog.top = bl.length(); - - // write it - t.remove( info.pgid.to_object() ); - t.write( info.pgid.to_object() , 0, bl.length(), bl); - t.collection_setattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom)); - t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); - - t.collection_setattr(info.pgid, "info", &info, sizeof(info)); -} - -void PG::trim_ondisklog_to(ObjectStore::Transaction& t, eversion_t v) -{ - dout(15) << " trim_ondisk_log_to v " << v << dendl; - - map::iterator p = ondisklog.block_map.begin(); - while (p != ondisklog.block_map.end()) { - dout(15) << " " << p->first << " -> " << p->second << dendl; - p++; - if (p == ondisklog.block_map.end() || - p->second > v) { // too far! - p--; // back up - break; - } - } - dout(15) << " * " << p->first << " -> " << p->second << dendl; - if (p == ondisklog.block_map.begin()) - return; // can't trim anything! - - // we can trim! - off_t trim = p->first; - dout(10) << " trimming ondisklog to [" << ondisklog.bottom << "," << ondisklog.top << ")" << dendl; - - ondisklog.bottom = trim; - - // adjust block_map - while (p != ondisklog.block_map.begin()) - ondisklog.block_map.erase(ondisklog.block_map.begin()); - - t.collection_setattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom)); - t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); -} - - -void PG::append_log(ObjectStore::Transaction& t, PG::Log::Entry& logentry, - eversion_t trim_to) -{ - dout(10) << "append_log " << ondisklog.top << " " << logentry << dendl; - - // write entry on disk - bufferlist bl; - bl.append( (char*)&logentry, sizeof(logentry) ); - if (g_conf.osd_pad_pg_log) { // pad to 4k, until i fix ebofs reallocation crap. FIXME. 
- bufferptr bp(4096 - sizeof(logentry)); - bl.push_back(bp); - } - t.write( info.pgid.to_object(), ondisklog.top, bl.length(), bl ); - - // update block map? - if (ondisklog.top % 4096 == 0) - ondisklog.block_map[ondisklog.top] = logentry.version; - - ondisklog.top += bl.length(); - t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); - - // trim? - if (trim_to > log.bottom) { - dout(10) << " trimming " << log << " to " << trim_to << dendl; - log.trim(t, trim_to); - info.log_bottom = log.bottom; - info.log_backlog = log.backlog; - trim_ondisklog_to(t, trim_to); - } - dout(10) << " ondisklog [" << ondisklog.bottom << "," << ondisklog.top << ")" << dendl; -} - -void PG::read_log(ObjectStore *store) -{ - int r; - // load bounds - ondisklog.bottom = ondisklog.top = 0; - r = store->collection_getattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom)); - assert(r == sizeof(ondisklog.bottom)); - r = store->collection_getattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); - assert(r == sizeof(ondisklog.top)); - - dout(10) << "read_log [" << ondisklog.bottom << "," << ondisklog.top << ")" << dendl; - - log.backlog = info.log_backlog; - log.bottom = info.log_bottom; - - if (ondisklog.top > 0) { - // read - bufferlist bl; - store->read(info.pgid.to_object(), ondisklog.bottom, ondisklog.top-ondisklog.bottom, bl); - - PG::Log::Entry e; - off_t pos = ondisklog.bottom; - assert(log.log.empty()); - while (pos < ondisklog.top) { - bl.copy(pos-ondisklog.bottom, sizeof(e), (char*)&e); - dout(10) << "read_log " << pos << " " << e << dendl; - - if (e.version > log.bottom || log.backlog) { // ignore items below log.bottom - if (pos % 4096 == 0) - ondisklog.block_map[pos] = e.version; - log.log.push_back(e); - } else { - dout(10) << "read_log ignoring entry at " << pos << dendl; - } - - if (g_conf.osd_pad_pg_log) // pad to 4k, until i fix ebofs reallocation crap. FIXME. 
- pos += 4096; - else - pos += sizeof(e); - } - } - log.top = info.last_update; - log.index(); - - // build missing - set did; - for (list::reverse_iterator i = log.log.rbegin(); - i != log.log.rend(); - i++) { - if (i->version <= info.last_complete) break; - if (did.count(i->oid)) continue; - did.insert(i->oid); - - if (i->is_delete()) continue; - - eversion_t v; - int r = osd->store->getattr(i->oid, "version", &v, sizeof(v)); - if (r < 0 || v < i->version) - missing.add(i->oid, i->version); - } -} - diff --git a/tags/20070517_before_mds_merge/osd/PG.h b/tags/20070517_before_mds_merge/osd/PG.h deleted file mode 100644 index f3b00cf935f91..0000000000000 --- a/tags/20070517_before_mds_merge/osd/PG.h +++ /dev/null @@ -1,707 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __PG_H -#define __PG_H - - -#include "include/types.h" -#include "include/buffer.h" - -#include "OSDMap.h" -#include "ObjectStore.h" -#include "msg/Messenger.h" -#include "messages/MOSDOpReply.h" - -#include "include/types.h" - -#include -using namespace std; - -#include -using namespace __gnu_cxx; - - -class OSD; - - - -/** PG - Replica Placement Group - * - */ - -class PG { -public: - - /* - * PG::Info - summary of PG statistics. - * - * some notes: - * - last_complete implies we have all objects that existed as of that - * stamp, OR a newer object, OR have already applied a later delete. - * - if last_complete >= log.bottom, then we know pg contents thru log.top. - * otherwise, we have no idea what the pg is supposed to contain. - */ - struct Info { - pg_t pgid; - eversion_t last_update; // last object version applied to store. 
- eversion_t last_complete; // last version pg was complete through. - - eversion_t log_bottom; // oldest log entry. - bool log_backlog; // do we store a complete log? - - epoch_t last_epoch_started; // last epoch started. - epoch_t last_epoch_finished; // last epoch finished. - - struct History { - epoch_t same_since; // same acting set since - epoch_t same_primary_since; // same primary at least back through this epoch. - epoch_t same_acker_since; // same acker at least back through this epoch. - History() : same_since(0), same_primary_since(0), same_acker_since(0) {} - } history; - - Info(pg_t p=0) : pgid(p), - log_backlog(false), - last_epoch_started(0), last_epoch_finished(0) {} - bool is_clean() const { return last_update == last_complete; } - bool is_empty() const { return last_update.version == 0; } - }; - - - /** - * Query - used to ask a peer for information about a pg. - * - * note: if version=0, type=LOG, then we just provide our full log. - * only if type=BACKLOG do we generate a backlog and provide that too. - */ - struct Query { - const static int INFO = 0; - const static int LOG = 1; - const static int BACKLOG = 2; - const static int FULLLOG = 3; - - int type; - eversion_t split, floor; - Info::History history; - - Query() : type(-1) {} - Query(int t, Info::History& h) : - type(t), history(h) { assert(t != LOG); } - Query(int t, eversion_t s, eversion_t f, Info::History& h) : - type(t), split(s), floor(f), history(h) { assert(t == LOG); } - }; - - - /* - * Missing - summary of missing objects. - * kept in memory, as a supplement to Log. - * also used to pass missing info in messages. - */ - class Missing { - public: - map missing; // oid -> v - map rmissing; // v -> oid - - map loc; // where i think i can get them. 
- - int num_lost() const { return missing.size() - loc.size(); } - int num_missing() const { return missing.size(); } - - bool is_missing(object_t oid) { - return missing.count(oid); - } - bool is_missing(object_t oid, eversion_t v) { - return missing.count(oid) && missing[oid] <= v; - } - void add(object_t oid) { - eversion_t z; - add(oid,z); - } - void add(object_t oid, eversion_t v) { - if (missing.count(oid)) { - if (missing[oid] > v) return; // already missing newer. - rmissing.erase(missing[oid]); - } - missing[oid] = v; - rmissing[v] = oid; - } - void rm(object_t oid, eversion_t when) { - if (missing.count(oid) && missing[oid] < when) { - rmissing.erase(missing[oid]); - missing.erase(oid); - loc.erase(oid); - } - } - void got(object_t oid, eversion_t v) { - assert(missing.count(oid)); - assert(missing[oid] <= v); - loc.erase(oid); - rmissing.erase(missing[oid]); - missing.erase(oid); - } - void got(object_t oid) { - assert(missing.count(oid)); - loc.erase(oid); - rmissing.erase(missing[oid]); - missing.erase(oid); - } - - void _encode(bufferlist& blist) { - ::_encode(missing, blist); - ::_encode(loc, blist); - } - void _decode(bufferlist& blist, int& off) { - ::_decode(missing, blist, off); - ::_decode(loc, blist, off); - - for (map::iterator it = missing.begin(); - it != missing.end(); - it++) - rmissing[it->second] = it->first; - } - }; - - - /* - * Log - incremental log of recent pg changes. - * also, serves as a recovery queue. - * - * when backlog is true, - * objects with versions <= bottom are in log. - * we do not have any deletion info before that time, however. - * log is a "summary" in that it contains all objects in the PG. - */ - class Log { - public: - /** top, bottom - * top - newest entry (update|delete) - * bottom - entry previous to oldest (update|delete) for which we have - * complete negative information. - * i.e. we can infer pg contents for any store whose last_update >= bottom. 
- */ - eversion_t top; // newest entry (update|delete) - eversion_t bottom; // version prior to oldest (update|delete) - - /** backlog - true if log is a complete summary of pg contents. - * updated will include all items in pg, but deleted will not include - * negative entries for items deleted prior to 'bottom'. - */ - bool backlog; - - /** Entry - * mapped from the eversion_t, so don't include that. - */ - class Entry { - public: - const static int LOST = 0; - const static int MODIFY = 1; - const static int CLONE = 2; - const static int DELETE = 3; - - int op; // write, zero, trunc, remove - object_t oid; - eversion_t version; - objectrev_t rev; - - reqid_t reqid; // caller+tid to uniquely identify request - - Entry() : op(0) {} - Entry(int _op, object_t _oid, const eversion_t& v, - const reqid_t& rid) : - op(_op), oid(_oid), version(v), reqid(rid) {} - - bool is_delete() const { return op == DELETE; } - bool is_clone() const { return op == CLONE; } - bool is_modify() const { return op == MODIFY; } - bool is_update() const { return is_clone() || is_modify(); } - }; - - list log; // the actual log. 
- - Log() : backlog(false) {} - - void clear() { - eversion_t z; - top = bottom = z; - backlog = false; - log.clear(); - } - bool empty() const { - return top.version == 0 && top.epoch == 0; - } - - void _encode(bufferlist& blist) const { - blist.append((char*)&top, sizeof(top)); - blist.append((char*)&bottom, sizeof(bottom)); - blist.append((char*)&backlog, sizeof(backlog)); - ::_encode(log, blist); - } - void _decode(bufferlist& blist, int& off) { - blist.copy(off, sizeof(top), (char*)&top); - off += sizeof(top); - blist.copy(off, sizeof(bottom), (char*)&bottom); - off += sizeof(bottom); - blist.copy(off, sizeof(backlog), (char*)&backlog); - off += sizeof(backlog); - - ::_decode(log, blist, off); - } - - void copy_after(const Log &other, eversion_t v); - bool copy_after_unless_divergent(const Log &other, eversion_t split, eversion_t floor); - void copy_non_backlog(const Log &other); - ostream& print(ostream& out) const; - }; - - /** - * IndexLog - adds in-memory index of the log, by oid. - * plus some methods to manipulate it all. - */ - class IndexedLog : public Log { - public: - hash_map objects; // ptrs into log. be careful! 
- hash_set caller_ops; - - // recovery pointers - list::iterator requested_to; // not inclusive of referenced item - list::iterator complete_to; // not inclusive of referenced item - - /****/ - IndexedLog() {} - - void clear() { - assert(0); - unindex(); - Log::clear(); - } - - bool logged_object(object_t oid) { - return objects.count(oid); - } - bool logged_req(const reqid_t &r) { - return caller_ops.count(r); - } - - void index() { - objects.clear(); - caller_ops.clear(); - for (list::iterator i = log.begin(); - i != log.end(); - i++) { - objects[i->oid] = &(*i); - caller_ops.insert(i->reqid); - } - } - - void index(Entry& e) { - if (objects.count(e.oid) == 0 || - objects[e.oid]->version < e.version) - objects[e.oid] = &e; - caller_ops.insert(e.reqid); - } - void unindex() { - objects.clear(); - caller_ops.clear(); - } - void unindex(Entry& e) { - // NOTE: this only works if we remove from the _bottom_ of the log! - assert(objects.count(e.oid)); - if (objects[e.oid]->version == e.version) - objects.erase(e.oid); - caller_ops.erase(e.reqid); - } - - - // accessors - Entry *is_updated(object_t oid) { - if (objects.count(oid) && objects[oid]->is_update()) return objects[oid]; - return 0; - } - Entry *is_deleted(object_t oid) { - if (objects.count(oid) && objects[oid]->is_delete()) return objects[oid]; - return 0; - } - - // actors - void add(Entry& e) { - // add to log - log.push_back(e); - assert(e.version > top); - assert(top.version == 0 || e.version.version > top.version); - top = e.version; - - // to our index - objects[e.oid] = &(log.back()); - caller_ops.insert(e.reqid); - } - - void trim(ObjectStore::Transaction &t, eversion_t s); - void trim_write_ahead(eversion_t last_update); - }; - - - /** - * OndiskLog - some info about how we store the log on disk. - */ - class OndiskLog { - public: - // ok - off_t bottom; // first byte of log. - off_t top; // byte following end of log. 
- map block_map; // block -> first stamp logged there - - OndiskLog() : bottom(0), top(0) {} - - bool trim_to(eversion_t v, ObjectStore::Transaction& t); - }; - - - /*** - */ - - class RepOpGather { - public: - class MOSDOp *op; - tid_t rep_tid; - - ObjectStore::Transaction t; - bool applied; - - set waitfor_ack; - set waitfor_commit; - - utime_t start; - - bool sent_ack, sent_commit; - - set osds; - eversion_t new_version; - - eversion_t pg_local_last_complete; - map pg_complete_thru; - - RepOpGather(MOSDOp *o, tid_t rt, eversion_t nv, eversion_t lc) : - op(o), rep_tid(rt), - applied(false), - sent_ack(false), sent_commit(false), - new_version(nv), - pg_local_last_complete(lc) { } - - bool can_send_ack() { - return !sent_ack && !sent_commit && - waitfor_ack.empty(); - } - bool can_send_commit() { - return !sent_commit && - waitfor_ack.empty() && waitfor_commit.empty(); - } - bool can_delete() { - return waitfor_ack.empty() && waitfor_commit.empty(); - } - }; - - - /*** PG ****/ -public: - // any - static const int STATE_ACTIVE = 1; // i am active. (primary: replicas too) - - // primary - static const int STATE_CLEAN = 2; // peers are complete, clean of stray replicas. - static const int STATE_CRASHED = 4; // all replicas went down. - static const int STATE_REPLAY = 8; // crashed, waiting for replay - - // non-primary - static const int STATE_STRAY = 16; // i must notify the primary i exist. - - - protected: - OSD *osd; - -public: - // pg state - Info info; - IndexedLog log; - OndiskLog ondisklog; - Missing missing; - utime_t last_heartbeat; // - -protected: - int role; // 0 = primary, 1 = replica, -1=none. - int state; // see bit defns above - - // primary state - public: - vector acting; - epoch_t last_epoch_started_any; - eversion_t last_complete_commit; - - // [primary only] content recovery state - eversion_t peers_complete_thru; - bool have_master_log; - protected: - set prior_set; // current+prior OSDs, as defined by last_epoch_started_any. 
- set stray_set; // non-acting osds that have PG data. - set clean_set; // current OSDs that are clean - eversion_t oldest_update; // lowest (valid) last_update in active set - map peer_info; // info from peers (stray or prior) - set peer_info_requested; - map peer_missing; - set peer_log_requested; // logs i've requested (and start stamps) - set peer_summary_requested; - friend class OSD; - - - // [primary|tail] - // old way - map replica_ops; - map > replica_tids_by_osd; // osd -> (tid,...) - - // new way - map repop_gather; - map > waiting_for_repop; - - - // [primary|replica] - // pg waiters - list waiting_for_active; - hash_map > waiting_for_missing_object; - map replay_queue; - - // recovery - map objects_pulling; // which objects are currently being pulled - -public: - void clear_primary_state(); - - public: - bool is_acting(int osd) const { - for (unsigned i=0; i peers_complete_thru) { - peers_complete_thru = t; - return true; - } - return false; - } - - void proc_replica_log(Log &olog, Missing& omissing, int from); - void merge_log(Log &olog, Missing& omissing, int from); - void proc_missing(Log &olog, Missing &omissing, int fromosd); - - void generate_backlog(); - void drop_backlog(); - - void trim_write_ahead(); - - void peer(ObjectStore::Transaction& t, map< int, map >& query_map); - - void activate(ObjectStore::Transaction& t); - - void cancel_recovery(); - bool do_recovery(); - void do_peer_recovery(); - - void clean_replicas(); - - off_t get_log_write_pos() { - return 0; - } - - public: - PG(OSD *o, pg_t p) : - osd(o), - info(p), - role(0), - state(0), - last_epoch_started_any(0), - last_complete_commit(0), - peers_complete_thru(0), - have_master_log(true) - { } - - pg_t get_pgid() const { return info.pgid; } - int get_nrep() const { return acting.size(); } - - int get_primary() { return acting.empty() ? -1:acting[0]; } - //int get_tail() { return acting.empty() ? 
-1:acting[ acting.size()-1 ]; } - //int get_acker() { return g_conf.osd_rep == OSD_REP_PRIMARY ? get_primary():get_tail(); } - int get_acker() { - if (g_conf.osd_rep == OSD_REP_PRIMARY || - acting.size() <= 1) - return get_primary(); - return acting[1]; - } - - int get_role() const { return role; } - void set_role(int r) { role = r; } - - bool is_primary() const { return role == PG_ROLE_HEAD; } - bool is_acker() const { return role == PG_ROLE_ACKER; } - bool is_head() const { return role == PG_ROLE_HEAD; } - bool is_middle() const { return role == PG_ROLE_MIDDLE; } - bool is_residual() const { return role == PG_ROLE_STRAY; } - - //int get_state() const { return state; } - bool state_test(int m) const { return (state & m) != 0; } - void state_set(int m) { state |= m; } - void state_clear(int m) { state &= ~m; } - - bool is_complete() const { return info.last_complete == info.last_update; } - - bool is_active() const { return state_test(STATE_ACTIVE); } - bool is_crashed() const { return state_test(STATE_CRASHED); } - bool is_replay() const { return state_test(STATE_REPLAY); } - //bool is_complete() { return state_test(STATE_COMPLETE); } - bool is_clean() const { return state_test(STATE_CLEAN); } - bool is_stray() const { return state_test(STATE_STRAY); } - - bool is_empty() const { return info.last_update == 0; } - - int num_active_ops() const { - return objects_pulling.size(); - } - - - // pg on-disk content - void clean_up_local(ObjectStore::Transaction& t); - - // pg on-disk state - void write_log(ObjectStore::Transaction& t); - void append_log(ObjectStore::Transaction& t, - PG::Log::Entry& logentry, - eversion_t trim_to); - void read_log(ObjectStore *store); - void trim_ondisklog_to(ObjectStore::Transaction& t, eversion_t v); - - - -}; - - - -inline ostream& operator<<(ostream& out, const PG::Info::History& h) -{ - return out << h.same_since << "/" << h.same_primary_since << "/" << h.same_acker_since; -} - -inline ostream& operator<<(ostream& out, const 
PG::Info& pgi) -{ - out << "pginfo(" << pgi.pgid; - if (pgi.is_empty()) - out << " empty"; - else - out << " v " << pgi.last_update << "/" << pgi.last_complete - << " (" << pgi.log_bottom << "," << pgi.last_update << "]" - << (pgi.log_backlog ? "+backlog":""); - out << " e " << pgi.last_epoch_started << "/" << pgi.last_epoch_finished - << " " << pgi.history - << ")"; - return out; -} - -inline ostream& operator<<(ostream& out, const PG::Log::Entry& e) -{ - return out << " " << e.version - << (e.is_delete() ? " - ": - (e.is_clone() ? " c ": - (e.is_modify() ? " m ": - " ? "))) - << e.oid << " by " << e.reqid; -} - -inline ostream& operator<<(ostream& out, const PG::Log& log) -{ - out << "log(" << log.bottom << "," << log.top << "]"; - if (log.backlog) out << "+backlog"; - return out; -} - -inline ostream& operator<<(ostream& out, const PG::Missing& missing) -{ - out << "missing(" << missing.num_missing(); - if (missing.num_lost()) out << ", " << missing.num_lost() << " lost"; - out << ")"; - return out; -} - -inline ostream& operator<<(ostream& out, const PG& pg) -{ - out << "pg[" << pg.info - << " r=" << pg.get_role(); - - if (pg.log.bottom != pg.info.log_bottom) - out << " (info mismatch, " << pg.log << ")"; - - if (pg.log.log.empty()) { - // shoudl it be? 
- if (pg.log.top.version - pg.log.bottom.version != 0) { - out << " (log bound mismatch, empty)"; - } - } else { - if (((pg.log.log.begin()->version.version - 1 != pg.log.bottom.version) && - !pg.log.backlog) || - (pg.log.log.rbegin()->version.version != pg.log.top.version)) { - out << " (log bound mismatch, actual=[" - << pg.log.log.begin()->version << "," - << pg.log.log.rbegin()->version << "] len=" << pg.log.log.size() << ")"; - } - } - - if (pg.get_role() == 0) out << " pct " << pg.peers_complete_thru; - if (!pg.have_master_log) out << " !hml"; - if (pg.is_active()) out << " active"; - if (pg.is_crashed()) out << " crashed"; - if (pg.is_replay()) out << " replay"; - if (pg.is_clean()) out << " clean"; - if (pg.is_stray()) out << " stray"; - //out << " (" << pg.log.bottom << "," << pg.log.top << "]"; - if (pg.missing.num_missing()) out << " m=" << pg.missing.num_missing(); - if (pg.missing.num_lost()) out << " l=" << pg.missing.num_lost(); - out << "]"; - - - return out; -} - - -inline ostream& operator<<(ostream& out, PG::RepOpGather& repop) -{ - out << "repop(" << &repop << " rep_tid=" << repop.rep_tid - << " wfack=" << repop.waitfor_ack - << " wfcommit=" << repop.waitfor_commit; - out << " pct=" << repop.pg_complete_thru; - out << " op=" << *(repop.op); - out << " repop=" << &repop; - out << ")"; - return out; -} - - -#endif diff --git a/tags/20070517_before_mds_merge/osd/osd_types.h b/tags/20070517_before_mds_merge/osd/osd_types.h deleted file mode 100644 index f8656e1f3e178..0000000000000 --- a/tags/20070517_before_mds_merge/osd/osd_types.h +++ /dev/null @@ -1,174 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __OSD_TYPES_H -#define __OSD_TYPES_H - -#include "include/reqid.h" - -#define PG_INO 1 - - -// osd types -typedef __uint64_t coll_t; // collection id - -// pg stuff -typedef __uint16_t ps_t; -typedef __uint8_t pruleset_t; - -// placement group id -struct pg_t { - union { - struct { - __uint32_t preferred:32; // 32 - ps_t ps:16; // 16 - __uint8_t nrep:8; // 8 - pruleset_t ruleset:8; // 8 - } fields; - __uint64_t val; // 64 - } u; - - pg_t() { u.val = 0; } - pg_t(const pg_t& o) { u.val = o.u.val; } - pg_t(ps_t s, int p, unsigned char n, pruleset_t r=0) { - u.fields.ps = s; - u.fields.preferred = p; - u.fields.nrep = n; - u.fields.ruleset = r; - } - pg_t(__uint64_t v) { u.val = v; } - /* - pg_t operator=(__uint64_t v) { u.val = v; return *this; } - pg_t operator&=(__uint64_t v) { u.val &= v; return *this; } - pg_t operator+=(pg_t o) { u.val += o.val; return *this; } - pg_t operator-=(pg_t o) { u.val -= o.val; return *this; } - pg_t operator++() { ++u.val; return *this; } - */ - operator __uint64_t() const { return u.val; } - - object_t to_object() const { return object_t(PG_INO, u.val >> 32, u.val & 0xffffffff); } -}; - -inline ostream& operator<<(ostream& out, pg_t pg) { - //return out << hex << pg.val << dec; - if (pg.u.fields.ruleset) - out << (int)pg.u.fields.ruleset << '.'; - out << (int)pg.u.fields.nrep << '.'; - if (pg.u.fields.preferred) - out << pg.u.fields.preferred << '.'; - out << hex << pg.u.fields.ps << dec; - out << "=" << hex << pg.u.val << dec; - out << "=" << hex << (__uint64_t)pg << dec; - return out; -} - -namespace __gnu_cxx { - template<> struct hash< pg_t > - { - size_t operator()( const pg_t& x ) const - { - static hash<__uint64_t> H; - return H(x); - } - }; -} - - - -// compound rados version type -class eversion_t { -public: - epoch_t epoch; - version_t version; - eversion_t(epoch_t e=0, version_t v=0) : epoch(e), version(v) {} -}; - -inline bool operator==(const eversion_t& l, const eversion_t& r) { - return (l.epoch == 
r.epoch) && (l.version == r.version); -} -inline bool operator!=(const eversion_t& l, const eversion_t& r) { - return (l.epoch != r.epoch) || (l.version != r.version); -} -inline bool operator<(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version < r.version):(l.epoch < r.epoch); -} -inline bool operator<=(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version <= r.version):(l.epoch <= r.epoch); -} -inline bool operator>(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version > r.version):(l.epoch > r.epoch); -} -inline bool operator>=(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version >= r.version):(l.epoch >= r.epoch); -} -inline ostream& operator<<(ostream& out, const eversion_t e) { - return out << e.epoch << "'" << e.version; -} - - - - - -// ----------------------------------------- - -class ObjectExtent { - public: - object_t oid; // object id - off_t start; // in object - size_t length; // in object - - objectrev_t rev; // which revision? - pg_t pgid; // where to find the object - - map buffer_extents; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!) - - ObjectExtent() : start(0), length(0), rev(0), pgid(0) {} - ObjectExtent(object_t o, off_t s=0, size_t l=0) : oid(o), start(s), length(l), rev(0), pgid(0) { } -}; - -inline ostream& operator<<(ostream& out, ObjectExtent &ex) -{ - return out << "extent(" - << ex.oid << " in " << hex << ex.pgid << dec - << " " << ex.start << "~" << ex.length - << ")"; -} - - - -// --------------------------------------- - -class OSDSuperblock { -public: - const static __uint64_t MAGIC = 0xeb0f505dULL; - __uint64_t magic; - __uint64_t fsid; // unique fs id (random number) - int whoami; // my role in this fs. - epoch_t current_epoch; // most recent epoch - epoch_t oldest_map, newest_map; // oldest/newest maps we have. 
- OSDSuperblock(__uint64_t f=0, int w=0) : - magic(MAGIC), fsid(f), whoami(w), - current_epoch(0), oldest_map(0), newest_map(0) {} -}; - -inline ostream& operator<<(ostream& out, OSDSuperblock& sb) -{ - return out << "sb(fsid " << sb.fsid - << " osd" << sb.whoami - << " e" << sb.current_epoch - << " [" << sb.oldest_map << "," << sb.newest_map - << "])"; -} - - -#endif diff --git a/tags/20070517_before_mds_merge/osd/rush.cc b/tags/20070517_before_mds_merge/osd/rush.cc deleted file mode 100644 index aebca7ac1a351..0000000000000 --- a/tags/20070517_before_mds_merge/osd/rush.cc +++ /dev/null @@ -1,230 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -// -// -// rush.cc -// -// $Id$ -// - -#include -#include -#include -#include "rush.h" - - -static -unsigned int -myhash (unsigned int n) -{ - unsigned int v = (n ^ 0xdead1234) * (884811920 * 3 + 1); - return (v); -} - -Rush::Rush () -{ - nClusters = 0; - totalServers = 0; -} - -//---------------------------------------------------------------------- -// -// Rush::AddCluster -// -// Add a cluster. The number of servers in the cluster and -// the weight of each server is passed. The current number of -// clusters is returned. 
-// -//---------------------------------------------------------------------- -int -Rush::AddCluster (int nServers, double weight) -{ - clusterSize[nClusters] = nServers; - clusterWeight[nClusters] = weight; - if (nClusters == 0) { - serversInPrevious[0] = 0; - totalWeightBefore[0] = 0.0; - } else { - serversInPrevious[nClusters] = serversInPrevious[nClusters-1] + - clusterSize[nClusters-1]; - totalWeightBefore[nClusters] = - totalWeightBefore[nClusters-1] + (double)clusterSize[nClusters-1] * - clusterWeight[nClusters-1]; - } - nClusters += 1; - totalServers += nServers; -#if 0 - for (int i = 0; i < nClusters; i++) { - fprintf (stderr, "size=%-3d prev=%-3d weight=%-6.2f prevWeight=%-8.2f\n", - clusterSize[i], serversInPrevious[i], clusterWeight[i], - totalWeightBefore[i]); - } -#endif - return (nClusters); -} - - -//---------------------------------------------------------------------- -// -// Rush::GetServersByKey -// -// This function returns a list of servers on which an object -// should be placed. The servers array must be large enough to -// contain the list. -// -//---------------------------------------------------------------------- -void -Rush::GetServersByKey (int key, int nReplicas, int servers[]) -{ - int replicasLeft = nReplicas; - int cluster; - int mustAssign, numberAssigned; - int i, toDraw; - int *srv = servers; - double myWeight; - RushRNG rng; - - // There may not be more replicas than servers! 
- assert (nReplicas <= totalServers); - - for (cluster = nClusters-1; (cluster >= 0) && (replicasLeft > 0); cluster--) { - if (serversInPrevious[cluster] < replicasLeft) { - mustAssign = replicasLeft - serversInPrevious[cluster]; - } else { - mustAssign = 0; - } - toDraw = replicasLeft - mustAssign; - if (toDraw > (clusterSize[cluster] - mustAssign)) { - toDraw = clusterSize[cluster] - mustAssign; - } - myWeight = (double)clusterSize[cluster] * clusterWeight[cluster]; - rng.Seed (myhash (key)^cluster, cluster^0xb90738); - numberAssigned = mustAssign + - rng.HyperGeometricWeighted (toDraw, myWeight, - totalWeightBefore[cluster] + myWeight, - clusterWeight[cluster]); - if (numberAssigned > 0) { - rng.Seed (myhash (key)^cluster ^ 11, cluster^0xfea937); - rng.DrawKofN (srv, numberAssigned, clusterSize[cluster]); - for (i = 0; i < numberAssigned; i++) { - srv[i] += serversInPrevious[cluster]; - } - replicasLeft -= numberAssigned; - srv += numberAssigned; - } - } -} - - - -//---------------------------------------------------------------------- -// -// RushRNG::HyperGeometricWeighted -// -// Use an iterative method to generate a hypergeometric random -// variable. This approach guarantees that, if the number of draws -// is reduced, the number of successes must be as well as long as -// the seed for the RNG is the same. -// -//---------------------------------------------------------------------- -int -RushRNG::HyperGeometricWeighted (int nDraws, double yesWeighted, - double totalWeighted, double weightOne) -{ - int positives = 0, i; - double curRand; - - // If the weight is too small (or is negative), choose zero objects. - if (weightOne <= 1e-9 || nDraws == 0) { - return (0); - } - - // Draw nDraws items from the "bag". For each positive, subtract off - // the weight of an object from the weight of positives remaining. For - // each draw, subtract off the weight of an object from the total weight - // remaining. 
- for (i = 0; i < nDraws; i++) { - curRand = RandomDouble (); - if (curRand < (yesWeighted / totalWeighted)) { - positives += 1; - yesWeighted -= weightOne; - } - totalWeighted -= weightOne; - } - return (positives); -} - -//---------------------------------------------------------------------- -// -// RushRNG::DrawKofN -// -//---------------------------------------------------------------------- -void -RushRNG::DrawKofN (int vals[], int nToDraw, int setSize) -{ - int deck[setSize]; - int i, pick; - - assert(nToDraw <= setSize); - - for (i = 0; i < setSize; i++) { - deck[i] = i; - } - - for (i = 0; i < nToDraw; i++) { - pick = (int)(RandomDouble () * (double)(setSize - i)); - if (pick >= setSize-i) pick = setSize-i-1; // in case - // assert(i >= 0 && i < nToDraw); - // assert(pick >= 0 && pick < setSize); - vals[i] = deck[pick]; - deck[pick] = deck[setSize-i-1]; - } -} - -#define SEED_X 521288629 -#define SEED_Y 362436069 -RushRNG::RushRNG () -{ - Seed (0, 0); -} - -void -RushRNG::Seed (unsigned int seed1, unsigned int seed2) -{ - state1 = ((seed1 == 0) ? SEED_X : seed1); - state2 = ((seed2 == 0) ? 
SEED_Y : seed2); -} - -unsigned int -RushRNG::RandomInt () -{ - const unsigned int a = 18000; - const unsigned int b = 18879; - unsigned int rndValue; - - state1 = a * (state1 & 0xffff) + (state1 >> 16); - state2 = b * (state2 & 0xffff) + (state2 >> 16); - rndValue = (state1 << 16) + (state2 & 0xffff); - return (rndValue); -} - -double -RushRNG::RandomDouble () -{ - double v; - - v = (double)RandomInt() / (65536.0*65536.0); - return (v); -} diff --git a/tags/20070517_before_mds_merge/osd/rush.h b/tags/20070517_before_mds_merge/osd/rush.h deleted file mode 100644 index 3d880a32415e0..0000000000000 --- a/tags/20070517_before_mds_merge/osd/rush.h +++ /dev/null @@ -1,60 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -// -// -// rush.h -// -// Classes and definitions for the RUSH algorithm. 
-// -// $Id$ -// -// - -#ifndef _rush_h_ -#define _rush_h_ - -#define RUSH_MAX_CLUSTERS 100 - -class RushRNG { -public: - unsigned int RandomInt (); - double RandomDouble (); - void Seed (unsigned int a, unsigned int b); - int HyperGeometricWeighted (int nDraws, double yesWeighted, - double totalWeighted, double weightOne); - void DrawKofN (int vals[], int nToDraw, int setSize); - RushRNG(); -private: - unsigned int state1, state2; -}; - -class Rush { -public: - void GetServersByKey (int key, int nReplicas, int servers[]); - int AddCluster (int nServers, double weight); - int Clusters () {return (nClusters);} - int Servers () {return (totalServers);} - Rush (); -private: - int DrawKofN (int *servers, int n, int clusterSize, RushRNG *g); - int nClusters; - int totalServers; - int clusterSize[RUSH_MAX_CLUSTERS]; - int serversInPrevious[RUSH_MAX_CLUSTERS]; - double clusterWeight[RUSH_MAX_CLUSTERS]; - double totalWeightBefore[RUSH_MAX_CLUSTERS]; -}; - -#endif /* _rush_h_ */ diff --git a/tags/20070517_before_mds_merge/osd/tp.cc b/tags/20070517_before_mds_merge/osd/tp.cc deleted file mode 100644 index c8171895beef0..0000000000000 --- a/tags/20070517_before_mds_merge/osd/tp.cc +++ /dev/null @@ -1,80 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include -#include - -using namespace std; - -#include "common/Mutex.h" -#include "common/ThreadPool.h" -// #include - -class Op { - int i; - -public: - - Op(int i) - { - this->i = i; - } - - int get() - { - return i; - } -}; - -void foop(class TP *t, class Op *o); - -class TP { -public: - - void foo(Op *o) - { - cout << "Thread "<< pthread_self() << ": " << o->get() << "\n"; - usleep(1); - - // sched_yield(); - } - - int main(int argc, char *argv) - { - ThreadPool *t = new ThreadPool(10, (void (*)(TP*, Op*))foop, this); - - for(int i = 0; i < 100; i++) { - Op *o = new Op(i); - t->put_op(o); - } - - sleep(1); - - delete(t); - - return 0; - } -}; - -void foop(class TP *t, class Op *o) { - t->foo(o); -} - -int main(int argc, char *argv) { - TP t; - - t.main(argc,argv); -} - diff --git a/tags/20070517_before_mds_merge/osdc/Blinker.h b/tags/20070517_before_mds_merge/osdc/Blinker.h deleted file mode 100644 index 231fe47fb1e31..0000000000000 --- a/tags/20070517_before_mds_merge/osdc/Blinker.h +++ /dev/null @@ -1,91 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __BLINKER_H -#define __BLINKER_H - -class Blinker { - - public: - - class Op { - int op; - static const int LOOKUP = 1; - static const int INSERT = 2; - static const int REMOVE = 3; - static const int CLEAR = 4; - Op(int o) : op(o) {} - }; - - class OpLookup : public Op { - public: - bufferptr key; - OpLookup(bufferptr& k) : Op(Op::LOOKUP), key(k) {} - }; - - class OpInsert : public Op { - bufferptr key; - bufferlist val; - OpInsert(bufferptr& k, bufferlist& v) : Op(Op::INSERT), key(k), val(v) {} - }; - - class OpRemove : public Op { - public: - bufferptr key; - OpRemove(bufferptr& k) : Op(Op::REMOVE), key(k) {} - }; - - class OpClear : public Op { - public: - OpClear() : Op(Op::CLEAR) {} - }; - - - -private: - Objecter *objecter; - - // in-flight operations. - - - // cache information about tree structure. - - - -public: - // public interface - - // simple accessors - void lookup(inode_t& inode, bufferptr& key, bufferlist *pval, Context *onfinish); - - // simple modifiers - void insert(inode_t& inode, bufferptr& key, bufferlist& val, Context *onack, Context *onsafe); - void remove(inode_t& inode, bufferptr& key, Context *onack, Context *onsafe); - void clear(inode_t& inode, Context *onack, Context *onsafe); - - // these are dangerous: the table may be large. - void listkeys(inode_t& inode, list* pkeys, Context *onfinish); - void listvals(inode_t& inode, list* pkeys, list* pvals, Context *onfinish); - - // fetch *at least* key, but also anything else that is convenient. - // include lexical bounds for which this is a complete result. 
- // (if *start and *end are empty, it's the entire table) - void prefetch(inode_t& inode, bufferptr& key, - list* pkeys, list* pvals, - bufferptr *start, bufferptr *end, - Context *onfinish); - - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/osdc/Filer.cc b/tags/20070517_before_mds_merge/osdc/Filer.cc deleted file mode 100644 index 2a2871e5b9e37..0000000000000 --- a/tags/20070517_before_mds_merge/osdc/Filer.cc +++ /dev/null @@ -1,235 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include - -#include "Filer.h" -#include "osd/OSDMap.h" - -//#include "messages/MOSDRead.h" -//#include "messages/MOSDReadReply.h" -//#include "messages/MOSDWrite.h" -//#include "messages/MOSDWriteReply.h" -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" -#include "messages/MOSDMap.h" - -#include "msg/Messenger.h" - -#include "include/Context.h" - -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_filer) cout << g_clock.now() << " " << objecter->messenger->get_myname() << ".filer " - - -class Filer::C_Probe : public Context { -public: - Filer *filer; - Probe *probe; - object_t oid; - off_t size; - C_Probe(Filer *f, Probe *p, object_t o) : filer(f), probe(p), oid(o), size(0) {} - void finish(int r) { - filer->_probed(probe, oid, size); - } -}; - -int Filer::probe_fwd(inode_t& inode, - off_t start_from, - off_t *end, - Context *onfinish) -{ - dout(10) << "probe_fwd " << hex << inode.ino << dec << " starting from " << start_from << endl; - - Probe *probe = new Probe(inode, start_from, end, onfinish); - - // period (bytes before we jump unto a new set of object(s)) - off_t 
period = inode.layout.period(); - - // start with 1+ periods. - probe->probing_len = period; - if (start_from % period) - probe->probing_len += period - (start_from % period); - - _probe(probe); - return 0; -} - -void Filer::_probe(Probe *probe) -{ - dout(10) << "_probe " << hex << probe->inode.ino << dec << " " << probe->from << "~" << probe->probing_len << endl; - - // map range onto objects - file_to_extents(probe->inode, probe->from, probe->probing_len, probe->probing); - - for (list::iterator p = probe->probing.begin(); - p != probe->probing.end(); - p++) { - dout(10) << "_probe probing " << p->oid << endl; - C_Probe *c = new C_Probe(this, probe, p->oid); - probe->ops[p->oid] = objecter->stat(p->oid, &c->size, c); - } -} - -void Filer::_probed(Probe *probe, object_t oid, off_t size) -{ - dout(10) << "_probed " << probe->inode.ino << " object " << hex << oid << dec << " has size " << size << endl; - - probe->known[oid] = size; - assert(probe->ops.count(oid)); - probe->ops.erase(oid); - - if (!probe->ops.empty()) - return; // waiting for more! - - // analyze! - off_t end = 0; - for (list::iterator p = probe->probing.begin(); - p != probe->probing.end(); - p++) { - off_t shouldbe = p->length+p->start; - dout(10) << "_probed " << probe->inode.ino << " object " << hex << p->oid << dec - << " should be " << shouldbe - << ", actual is " << probe->known[p->oid] - << endl; - - if (probe->known[p->oid] < 0) { end = -1; break; } // error! - - assert(probe->known[p->oid] <= shouldbe); - if (shouldbe == probe->known[p->oid]) continue; // keep going - - // aha, we found the end! - // calc offset into buffer_extent to get distance from probe->from. 
- off_t oleft = probe->known[p->oid] - p->start; - for (map::iterator i = p->buffer_extents.begin(); - i != p->buffer_extents.end(); - i++) { - if (oleft <= (off_t)i->second) { - end = probe->from + i->first + oleft; - dout(10) << "_probed end is in buffer_extent " << i->first << "~" << i->second << " off " << oleft - << ", from was " << probe->from << ", end is " << end - << endl; - break; - } - oleft -= i->second; - } - break; - } - - if (end == 0) { - // keep probing! - dout(10) << "_probed didn't find end, probing further" << endl; - off_t period = probe->inode.layout.object_size * probe->inode.layout.stripe_count; - probe->from += probe->probing_len; - probe->probing_len = period; - _probe(probe); - return; - } - - if (end < 0) { - dout(10) << "_probed encountered an error while probing" << endl; - *probe->end = -1; - } else { - // hooray! - dout(10) << "_probed found end at " << end << endl; - *probe->end = end; - } - - // done! finish and clean up. - probe->onfinish->finish(end > 0 ? 0:-1); - delete probe->onfinish; - delete probe; -} - - -void Filer::file_to_extents(inode_t inode, - off_t offset, size_t len, - list& extents, - objectrev_t rev) -{ - dout(10) << "file_to_extents " << offset << "~" << len - << " on " << hex << inode.ino << dec - << endl; - - /* we want only one extent per object! - * this means that each extent we read may map into different bits of the - * final read buffer.. 
hence OSDExtent.buffer_extents - */ - map< object_t, ObjectExtent > object_extents; - - assert(inode.layout.object_size >= inode.layout.stripe_size); - off_t stripes_per_object = inode.layout.object_size / inode.layout.stripe_size; - dout(20) << " stripes_per_object " << stripes_per_object << endl; - - off_t cur = offset; - off_t left = len; - while (left > 0) { - // layout into objects - off_t blockno = cur / inode.layout.stripe_size; - off_t stripeno = blockno / inode.layout.stripe_count; - off_t stripepos = blockno % inode.layout.stripe_count; - off_t objectsetno = stripeno / stripes_per_object; - off_t objectno = objectsetno * inode.layout.stripe_count + stripepos; - - // find oid, extent - ObjectExtent *ex = 0; - object_t oid( inode.ino, objectno ); - if (object_extents.count(oid)) - ex = &object_extents[oid]; - else { - ex = &object_extents[oid]; - ex->oid = oid; - ex->rev = rev; - ex->pgid = objecter->osdmap->object_to_pg( oid, inode.layout ); - } - - // map range into object - off_t block_start = (stripeno % stripes_per_object)*inode.layout.stripe_size; - off_t block_off = cur % inode.layout.stripe_size; - off_t max = inode.layout.stripe_size - block_off; - - off_t x_offset = block_start + block_off; - off_t x_len; - if (left > max) - x_len = max; - else - x_len = left; - - if (ex->start + (off_t)ex->length == x_offset) { - // add to extent - ex->length += x_len; - } else { - // new extent - assert(ex->length == 0); - assert(ex->start == 0); - ex->start = x_offset; - ex->length = x_len; - } - ex->buffer_extents[cur-offset] = x_len; - - dout(15) << "file_to_extents " << *ex << " in " << ex->pgid << endl; - //cout << "map: ino " << ino << " oid " << ex.oid << " osd " << ex.osd << " offset " << ex.offset << " len " << ex.len << " ... 
left " << left << endl; - - left -= x_len; - cur += x_len; - } - - // make final list - for (map::iterator it = object_extents.begin(); - it != object_extents.end(); - it++) { - extents.push_back(it->second); - } -} diff --git a/tags/20070517_before_mds_merge/osdc/Filer.h b/tags/20070517_before_mds_merge/osdc/Filer.h deleted file mode 100644 index 161bfec304531..0000000000000 --- a/tags/20070517_before_mds_merge/osdc/Filer.h +++ /dev/null @@ -1,158 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __FILER_H -#define __FILER_H - -/*** Filer - * - * stripe file ranges onto objects. - * build list for the objecter or objectcacher. - * - * also, provide convenience methods that call objecter for you. - * - * "files" are identified by ino. 
- */ - -#include -#include -using namespace std; - -#include -#include -using namespace __gnu_cxx; - -#include "include/types.h" - -#include "osd/OSDMap.h" -#include "Objecter.h" - -class Context; -class Messenger; -class OSDMap; - - -/**** Filer interface ***/ - -class Filer { - Objecter *objecter; - - // probes - struct Probe { - inode_t inode; - off_t from; - off_t *end; - Context *onfinish; - - list probing; - off_t probing_len; - - map known; - map ops; - - Probe(inode_t &i, off_t f, off_t *e, Context *c) : - inode(i), from(f), end(e), onfinish(c), probing_len(0) {} - }; - - class C_Probe; - //friend class C_Probe; - - void _probe(Probe *p); - void _probed(Probe *p, object_t oid, off_t size); - - public: - Filer(Objecter *o) : objecter(o) {} - ~Filer() {} - - bool is_active() { - return objecter->is_active(); // || (oc && oc->is_active()); - } - - /*** async file interface ***/ - int read(inode_t& inode, - off_t offset, - size_t len, - bufferlist *bl, // ptr to data - Context *onfinish) { - Objecter::OSDRead *rd = new Objecter::OSDRead(bl); - file_to_extents(inode, offset, len, rd->extents); - return objecter->readx(rd, onfinish) > 0 ? 0:-1; - } - - int write(inode_t& inode, - off_t offset, - size_t len, - bufferlist& bl, - int flags, - Context *onack, - Context *oncommit, - objectrev_t rev=0) { - Objecter::OSDWrite *wr = new Objecter::OSDWrite(bl); - file_to_extents(inode, offset, len, wr->extents, rev); - return objecter->modifyx(wr, onack, oncommit) > 0 ? 0:-1; - } - - int zero(inode_t& inode, - off_t offset, - size_t len, - Context *onack, - Context *oncommit) { - Objecter::OSDModify *z = new Objecter::OSDModify(OSD_OP_ZERO); - file_to_extents(inode, offset, len, z->extents); - return objecter->modifyx(z, onack, oncommit) > 0 ? 
0:-1; - } - - int remove(inode_t& inode, - off_t offset, - size_t len, - Context *onack, - Context *oncommit) { - Objecter::OSDModify *z = new Objecter::OSDModify(OSD_OP_DELETE); - file_to_extents(inode, offset, len, z->extents); - return objecter->modifyx(z, onack, oncommit) > 0 ? 0:-1; - } - - int probe_fwd(inode_t& inode, - off_t start_from, - off_t *end, - Context *onfinish); - - - /***** mapping *****/ - - /* map (ino, ono) to an object name - (to be used on any osd in the proper replica group) */ - /*object_t file_to_object(inodeno_t ino, - size_t _ono) { - __uint64_t ono = _ono; - assert(ino < (1ULL<& extents, - objectrev_t rev=0); - -}; - - - -#endif diff --git a/tags/20070517_before_mds_merge/osdc/Journaler.cc b/tags/20070517_before_mds_merge/osdc/Journaler.cc deleted file mode 100644 index dee6448b3494d..0000000000000 --- a/tags/20070517_before_mds_merge/osdc/Journaler.cc +++ /dev/null @@ -1,610 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "Journaler.h" - -#include "include/Context.h" -#include "common/Logger.h" -#include "msg/Messenger.h" - -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cout << g_clock.now() << " " << objecter->messenger->get_myname() << ".journaler " -#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cerr << g_clock.now() << " " << objecter->messenger->get_myname() << ".journaler " - - - -void Journaler::reset() -{ - dout(1) << "reset to blank journal" << endl; - state = STATE_ACTIVE; - write_pos = flush_pos = ack_pos = - read_pos = requested_pos = received_pos = - expire_pos = trimming_pos = trimmed_pos = inode.layout.period(); -} - - -/***************** HEADER *******************/ - -ostream& operator<<(ostream& out, Journaler::Header &h) -{ - return out << "loghead(trim " << h.trimmed_pos - << ", expire " << h.expire_pos - << ", read " << h.read_pos - << ", write " << h.write_pos - << ")"; -} - -class Journaler::C_ReadHead : public Context { - Journaler *ls; -public: - bufferlist bl; - C_ReadHead(Journaler *l) : ls(l) {} - void finish(int r) { - ls->_finish_read_head(r, bl); - } -}; - -class Journaler::C_ProbeEnd : public Context { - Journaler *ls; -public: - off_t end; - C_ProbeEnd(Journaler *l) : ls(l), end(-1) {} - void finish(int r) { - ls->_finish_probe_end(r, end); - } -}; - -void Journaler::recover(Context *onread) -{ - assert(state != STATE_ACTIVE); - - if (onread) - waitfor_recover.push_back(onread); - - if (state != STATE_UNDEF) { - dout(1) << "recover - already recoverying" << endl; - return; - } - - dout(1) << "read_head" << endl; - state = STATE_READHEAD; - C_ReadHead *fin = new C_ReadHead(this); - filer.read(inode, 0, sizeof(Header), &fin->bl, fin); -} - -void Journaler::_finish_read_head(int r, bufferlist& bl) -{ - assert(state == STATE_READHEAD); - - if (bl.length() == 0) { - dout(1) << "_finish_read_head r=" << r << " read 0 bytes, assuming empty log" << endl; - 
state = STATE_ACTIVE; - list ls; - ls.swap(waitfor_recover); - finish_contexts(ls, 0); - return; - } - - // unpack header - Header h; - assert(bl.length() == sizeof(h)); - bl.copy(0, sizeof(h), (char*)&h); - - write_pos = flush_pos = ack_pos = h.write_pos; - read_pos = requested_pos = received_pos = h.read_pos; - expire_pos = h.expire_pos; - trimmed_pos = trimming_pos = h.trimmed_pos; - - dout(1) << "_finish_read_head " << h << ". probing for end of log (from " << write_pos << ")..." << endl; - - // probe the log - state = STATE_PROBING; - C_ProbeEnd *fin = new C_ProbeEnd(this); - filer.probe_fwd(inode, h.write_pos, &fin->end, fin); -} - -void Journaler::_finish_probe_end(int r, off_t end) -{ - assert(state == STATE_PROBING); - - if (end == -1) { - end = write_pos; - dout(1) << "_finish_probe_end write_pos = " << end - << " (header had " << write_pos << "). log was empty. recovered." - << endl; - assert(0); // hrm. - } else { - assert(end >= write_pos); - assert(r >= 0); - dout(1) << "_finish_probe_end write_pos = " << end - << " (header had " << write_pos << "). recovered." - << endl; - } - - write_pos = flush_pos = ack_pos = end; - - // done. 
- list ls; - ls.swap(waitfor_recover); - finish_contexts(ls, 0); -} - - -// WRITING - -class Journaler::C_WriteHead : public Context { -public: - Journaler *ls; - Header h; - Context *oncommit; - C_WriteHead(Journaler *l, Header& h_, Context *c) : ls(l), h(h_), oncommit(c) {} - void finish(int r) { - ls->_finish_write_head(h, oncommit); - } -}; - -void Journaler::write_head(Context *oncommit) -{ - assert(state == STATE_ACTIVE); - last_written.trimmed_pos = trimmed_pos; - last_written.expire_pos = expire_pos; - last_written.read_pos = read_pos; - last_written.write_pos = ack_pos; //write_pos; - dout(10) << "write_head " << last_written << endl; - - last_wrote_head = g_clock.now(); - - bufferlist bl; - bl.append((char*)&last_written, sizeof(last_written)); - filer.write(inode, 0, bl.length(), bl, 0, - 0, new C_WriteHead(this, last_written, oncommit)); -} - -void Journaler::_finish_write_head(Header &wrote, Context *oncommit) -{ - dout(10) << "_finish_write_head " << wrote << endl; - last_committed = wrote; - if (oncommit) { - oncommit->finish(0); - delete oncommit; - } - - trim(); // trim? -} - - -/***************** WRITING *******************/ - -class Journaler::C_Flush : public Context { - Journaler *ls; - off_t start; -public: - C_Flush(Journaler *l, off_t s) : ls(l), start(s) {} - void finish(int r) { ls->_finish_flush(r, start); } -}; - -void Journaler::_finish_flush(int r, off_t start) -{ - assert(r>=0); - - assert(start >= ack_pos); - assert(start < flush_pos); - assert(pending_flush.count(start)); - - // calc latency? 
- if (logger) { - utime_t lat = g_clock.now(); - lat -= pending_flush[start]; - logger->finc("lsum", lat); - logger->inc("lnum"); - } - - pending_flush.erase(start); - - // adjust ack_pos - if (pending_flush.empty()) - ack_pos = flush_pos; - else - ack_pos = pending_flush.begin()->first; - - dout(10) << "_finish_flush from " << start - << ", pending_flush now " << pending_flush - << ", write positions now " << write_pos << "/" << flush_pos << "/" << ack_pos - << endl; - - // kick waiters <= ack_pos - while (!waitfor_flush.empty()) { - if (waitfor_flush.begin()->first > ack_pos) break; - finish_contexts(waitfor_flush.begin()->second); - waitfor_flush.erase(waitfor_flush.begin()); - } -} - - -off_t Journaler::append_entry(bufferlist& bl, Context *onsync) -{ - size_t s = bl.length(); - - if (!g_conf.journaler_allow_split_entries) { - // will we span a stripe boundary? - int p = inode.layout.stripe_size; - if (write_pos / p != (write_pos + (off_t)(bl.length() + sizeof(s))) / p) { - // yes. - // move write_pos forward. - off_t owp = write_pos; - write_pos += p; - write_pos -= (write_pos % p); - - // pad with zeros. - bufferptr bp(write_pos - owp); - bp.zero(); - assert(bp.length() >= 4); - write_buf.push_back(bp); - - // now flush. - flush(); - - dout(12) << "append_entry skipped " << (write_pos-owp) << " bytes to " << write_pos << " to avoid spanning stripe boundary" << endl; - } - } - - dout(10) << "append_entry len " << bl.length() << " to " << write_pos << "~" << (bl.length() + sizeof(size_t)) << endl; - - // append - write_buf.append((char*)&s, sizeof(s)); - write_buf.append(bl); - write_pos += sizeof(s) + s; - - // flush now? 
- if (onsync) - flush(onsync); - - return write_pos; -} - - -void Journaler::flush(Context *onsync) -{ - if (write_pos == flush_pos) { - assert(write_buf.length() == 0); - dout(10) << "flush nothing to flush, write pointers at " << write_pos << "/" << flush_pos << "/" << ack_pos << endl; - - if (onsync) { - onsync->finish(0); - delete onsync; - } - return; - } - - unsigned len = write_pos - flush_pos; - assert(len == write_buf.length()); - dout(10) << "flush flushing " << flush_pos << "~" << len << endl; - - // submit write for anything pending - filer.write(inode, flush_pos, len, write_buf, 0, - new C_Flush(this, flush_pos), 0); // flush _start_ pos to _finish_flush - pending_flush[flush_pos] = g_clock.now(); - - // adjust pointers - flush_pos = write_pos; - write_buf.clear(); - - dout(10) << "flush write pointers now at " << write_pos << "/" << flush_pos << "/" << ack_pos << endl; - - // queue waiter (at _new_ write_pos; will go when reached by ack_pos) - if (onsync) - waitfor_flush[write_pos].push_back(onsync); - - // write head? - if (last_wrote_head.sec() + 30 < g_clock.now().sec()) { - write_head(); - } -} - - - -/***************** READING *******************/ - - -class Journaler::C_Read : public Context { - Journaler *ls; -public: - C_Read(Journaler *l) : ls(l) {} - void finish(int r) { ls->_finish_read(r); } -}; - -class Journaler::C_RetryRead : public Context { - Journaler *ls; -public: - C_RetryRead(Journaler *l) : ls(l) {} - void finish(int r) { ls->is_readable(); } // this'll kickstart. 
-}; - -void Journaler::_finish_read(int r) -{ - assert(r>=0); - - dout(10) << "_finish_read got " << received_pos << "~" << reading_buf.length() << endl; - received_pos += reading_buf.length(); - read_buf.claim_append(reading_buf); - assert(received_pos <= requested_pos); - dout(10) << "_finish_read read_buf now " << read_pos << "~" << read_buf.length() - << ", read pointers " << read_pos << "/" << received_pos << "/" << requested_pos - << endl; - - if (is_readable()) { // NOTE: this check may read more - // readable! - dout(10) << "_finish_read now readable" << endl; - if (on_readable) { - Context *f = on_readable; - on_readable = 0; - f->finish(0); - delete f; - } - - if (read_bl) { - bool r = try_read_entry(*read_bl); - assert(r); // this should have worked. - - // clear state - Context *f = on_read_finish; - on_read_finish = 0; - read_bl = 0; - - // do callback - f->finish(0); - delete f; - } - } - - // prefetch? - _prefetch(); -} - -/* NOTE: this could be slightly smarter... we could allow - * multiple reads to be in progress. e.g., if we prefetch, but - * then discover we need even more for an especially large entry. - * i don't think that circumstance will arise particularly often. - */ -void Journaler::_issue_read(off_t len) -{ - if (_is_reading()) { - dout(10) << "_issue_read " << len << " waiting, already reading " - << received_pos << "~" << (requested_pos-received_pos) << endl; - return; - } - assert(requested_pos == received_pos); - - // stuck at ack_pos? 
- assert(requested_pos <= ack_pos); - if (requested_pos == ack_pos) { - dout(10) << "_issue_read requested_pos = ack_pos = " << ack_pos << ", waiting" << endl; - assert(write_pos > requested_pos); - if (flush_pos == ack_pos) - flush(); - assert(flush_pos > ack_pos); - waitfor_flush[flush_pos].push_back(new C_RetryRead(this)); - return; - } - - // don't read too much - if (requested_pos + len > ack_pos) { - len = ack_pos - requested_pos; - dout(10) << "_issue_read reading only up to ack_pos " << ack_pos << endl; - } - - // go. - dout(10) << "_issue_read reading " << requested_pos << "~" << len - << ", read pointers " << read_pos << "/" << received_pos << "/" << (requested_pos+len) - << endl; - - filer.read(inode, requested_pos, len, &reading_buf, - new C_Read(this)); - requested_pos += len; -} - -void Journaler::_prefetch() -{ - // prefetch? - off_t left = requested_pos - read_pos; - if (left <= prefetch_from && // should read more, - !_is_reading() && // and not reading anything right now - write_pos > requested_pos) { // there's something more to read... - dout(10) << "_prefetch only " << left << " < " << prefetch_from - << ", prefetching " << endl; - _issue_read(fetch_len); - } -} - - -void Journaler::read_entry(bufferlist *bl, Context *onfinish) -{ - // only one read at a time! - assert(read_bl == 0); - assert(on_read_finish == 0); - - if (is_readable()) { - dout(10) << "read_entry at " << read_pos << ", read_buf is " - << read_pos << "~" << read_buf.length() - << ", readable now" << endl; - - // nice, just do it now. 
- bool r = try_read_entry(*bl); - assert(r); - - // callback - onfinish->finish(0); - delete onfinish; - } else { - dout(10) << "read_entry at " << read_pos << ", read_buf is " - << read_pos << "~" << read_buf.length() - << ", not readable now" << endl; - - bl->clear(); - - // set it up - read_bl = bl; - on_read_finish = onfinish; - - // is_readable() will have already initiated a read (if it was possible) - } -} - - -/* is_readable() - * return true if next entry is ready. - * kickstart read as necessary. - */ -bool Journaler::is_readable() -{ - // anything to read? - if (read_pos == write_pos) return false; - - // have enough for entry size? - size_t s = 0; - if (read_buf.length() >= sizeof(s)) - read_buf.copy(0, sizeof(s), (char*)&s); - - // entry and payload? - if (read_buf.length() >= sizeof(s) && - read_buf.length() >= sizeof(s) + s) - return true; // yep, next entry is ready. - - // darn it! - - // partial fragment at the end? - if (received_pos == write_pos) { - dout(10) << "is_readable() detected partial entry at tail, adjusting write_pos to " << read_pos << endl; - write_pos = flush_pos = ack_pos = read_pos; - assert(write_buf.length() == 0); - - // truncate? - // FIXME: how much? - - return false; - } - - // start reading some more? - if (!_is_reading()) { - if (s) - fetch_len = MAX(fetch_len, (off_t)(sizeof(s)+s-read_buf.length())); - _issue_read(fetch_len); - } - - return false; -} - - -/* try_read_entry(bl) - * read entry into bl if it's ready. - * otherwise, do nothing. (well, we'll start fetching it for good measure.) - */ -bool Journaler::try_read_entry(bufferlist& bl) -{ - if (!is_readable()) { // this may start a read. 
- dout(10) << "try_read_entry at " << read_pos << " not readable" << endl; - return false; - } - - size_t s; - assert(read_buf.length() >= sizeof(s)); - read_buf.copy(0, sizeof(s), (char*)&s); - assert(read_buf.length() >= sizeof(s) + s); - - dout(10) << "try_read_entry at " << read_pos << " reading " - << read_pos << "~" << (sizeof(s)+s) << endl; - - // do it - assert(bl.length() == 0); - read_buf.splice(0, sizeof(s)); - read_buf.splice(0, s, &bl); - read_pos += sizeof(s) + s; - - // prefetch? - _prefetch(); - return true; -} - -void Journaler::wait_for_readable(Context *onreadable) -{ - dout(10) << "wait_for_readable at " << read_pos << " onreadable " << onreadable << endl; - assert(!is_readable()); - assert(on_readable == 0); - on_readable = onreadable; -} - - - - -/***************** TRIMMING *******************/ - - -class Journaler::C_Trim : public Context { - Journaler *ls; - off_t to; -public: - C_Trim(Journaler *l, off_t t) : ls(l), to(t) {} - void finish(int r) { - ls->_trim_finish(r, to); - } -}; - -void Journaler::trim() -{ - off_t trim_to = last_committed.expire_pos; - trim_to -= trim_to % inode.layout.period(); - dout(10) << "trim last_commited head was " << last_committed - << ", can trim to " << trim_to - << endl; - if (trim_to == 0 || trim_to == trimming_pos) { - dout(10) << "trim already trimmed/trimming to " - << trimmed_pos << "/" << trimming_pos << endl; - return; - } - - // trim - assert(trim_to <= write_pos); - assert(trim_to > trimming_pos); - dout(10) << "trim trimming to " << trim_to - << ", trimmed/trimming/expire are " - << trimmed_pos << "/" << trimming_pos << "/" << expire_pos - << endl; - - filer.remove(inode, trimming_pos, trim_to-trimming_pos, - 0, new C_Trim(this, trim_to)); - trimming_pos = trim_to; -} - -void Journaler::_trim_finish(int r, off_t to) -{ - dout(10) << "_trim_finish trimmed_pos was " << trimmed_pos - << ", trimmed/trimming/expire now " - << to << "/" << trimming_pos << "/" << expire_pos - << endl; - assert(r >= 0); - 
- assert(to <= trimming_pos); - assert(to > trimmed_pos); - trimmed_pos = to; - - // finishers? - while (!waitfor_trim.empty() && - waitfor_trim.begin()->first <= trimmed_pos) { - finish_contexts(waitfor_trim.begin()->second, 0); - waitfor_trim.erase(waitfor_trim.begin()); - } -} - - -// eof. diff --git a/tags/20070517_before_mds_merge/osdc/Journaler.h b/tags/20070517_before_mds_merge/osdc/Journaler.h deleted file mode 100644 index 0b8d7061330e8..0000000000000 --- a/tags/20070517_before_mds_merge/osdc/Journaler.h +++ /dev/null @@ -1,218 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -/* Journaler - * - * This class stripes a serial log over objects on the store. Four logical pointers: - * - * write_pos - where we're writing new entries - * read_pos - where we're reading old entires - * expire_pos - what is deemed "old" by user - * trimmed_pos - where we're expiring old items - * - * trimmed_pos <= expire_pos <= read_pos <= write_pos. - * - * Often, read_pos <= write_pos (as with MDS log). During recovery, write_pos is undefined - * until the end of the log is discovered. - * - * A "head" struct at the beginning of the log is used to store metadata at - * regular intervals. The basic invariants include: - * - * head.read_pos <= read_pos -- the head may "lag", since it's updated lazily. - * head.write_pos <= write_pos - * head.expire_pos <= expire_pos - * head.trimmed_pos <= trimmed_pos - * - * More significantly, - * - * head.expire_pos >= trimmed_pos -- this ensures we can find the "beginning" of the log - * as last recorded, before it is trimmed. 
trimming will - * block until a sufficiently current expire_pos is committed. - * - * To recover log state, we simply start at the last write_pos in the head, and probe the - * object sequence sizes until we read the end. - * - * Head struct is stored in the first object. Actual journal starts after layout.period() bytes. - * - */ - -#ifndef __JOURNALER_H -#define __JOURNALER_H - -#include "Objecter.h" -#include "Filer.h" - -#include -#include - -class Context; -class Logger; - -class Journaler { - - // this goes at the head of the log "file". - struct Header { - off_t trimmed_pos; - off_t expire_pos; - off_t read_pos; - off_t write_pos; - Header() : trimmed_pos(0), expire_pos(0), read_pos(0), write_pos(0) {} - } last_written, last_committed; - - friend ostream& operator<<(ostream& out, Header &h); - - - // me - inode_t inode; - Objecter *objecter; - Filer filer; - - Logger *logger; - - // my state - static const int STATE_UNDEF = 0; - static const int STATE_READHEAD = 1; - static const int STATE_PROBING = 2; - static const int STATE_ACTIVE = 2; - - int state; - - // header - utime_t last_wrote_head; - void _finish_write_head(Header &wrote, Context *oncommit); - class C_WriteHead; - friend class C_WriteHead; - - list waitfor_recover; - void _finish_read_head(int r, bufferlist& bl); - void _finish_probe_end(int r, off_t end); - class C_ReadHead; - friend class C_ReadHead; - class C_ProbeEnd; - friend class C_ProbeEnd; - - - - // writer - off_t write_pos; // logical write position, where next entry will go - off_t flush_pos; // where we will flush. if write_pos>flush_pos, we're buffering writes. - off_t ack_pos; // what has been acked. - bufferlist write_buf; // write buffer. flush_pos + write_buf.length() == write_pos. 
- - std::map pending_flush; // start offsets and times for pending flushes - std::map > waitfor_flush; // when flushed through given offset - - void _finish_flush(int r, off_t start); - class C_Flush; - friend class C_Flush; - - // reader - off_t read_pos; // logical read position, where next entry starts. - off_t requested_pos; // what we've requested from OSD. - off_t received_pos; // what we've received from OSD. - bufferlist read_buf; // read buffer. read_pos + read_buf.length() == prefetch_pos. - bufferlist reading_buf; // what i'm reading into - - off_t fetch_len; // how much to read at a time - off_t prefetch_from; // how far from end do we read next chunk - - // for read_entry() in-progress read - bufferlist *read_bl; - Context *on_read_finish; - // for wait_for_readable() - Context *on_readable; - - bool _is_reading() { - return requested_pos > received_pos; - } - void _finish_read(int r); // we just read some (read completion callback) - void _issue_read(off_t len); // read some more - void _prefetch(); // maybe read ahead - class C_Read; - friend class C_Read; - class C_RetryRead; - friend class C_RetryRead; - - // trimmer - off_t expire_pos; // what we're allowed to trim to - off_t trimming_pos; // what we've requested to trim through - off_t trimmed_pos; // what has been trimmed - map > waitfor_trim; - - void _trim_finish(int r, off_t to); - class C_Trim; - friend class C_Trim; - -public: - Journaler(inode_t& inode_, Objecter *obj, Logger *l, off_t fl=0, off_t pff=0) : - inode(inode_), objecter(obj), filer(objecter), logger(l), - state(STATE_UNDEF), - write_pos(0), flush_pos(0), ack_pos(0), - read_pos(0), requested_pos(0), received_pos(0), - fetch_len(fl), prefetch_from(pff), - read_bl(0), on_read_finish(0), on_readable(0), - expire_pos(0), trimming_pos(0), trimmed_pos(0) - { - // prefetch intelligently. 
- // (watch out, this is big if you use big objects or weird striping) - if (!fetch_len) - fetch_len = inode.layout.object_size*inode.layout.stripe_count; - if (!prefetch_from) - prefetch_from = fetch_len / 2; - } - - // me - //void open(Context *onopen); - //void claim(Context *onclaim, msg_addr_t from); - - /* reset - * NOTE: we assume the caller knows/has ensured that any objects - * in our sequence do not exist.. e.g. after a MKFS. this is _not_ - * an "erase" method. - */ - void reset(); - void recover(Context *onfinish); - void write_head(Context *onsave=0); - - bool is_active() { return state == STATE_ACTIVE; } - - off_t get_write_pos() const { return write_pos; } - off_t get_read_pos() const { return read_pos; } - off_t get_expire_pos() const { return expire_pos; } - off_t get_trimmed_pos() const { return trimmed_pos; } - - // write - off_t append_entry(bufferlist& bl, Context *onsync = 0); - void flush(Context *onsync = 0); - - // read - void set_read_pos(off_t p) { - assert(requested_pos == received_pos); // we can't cope w/ in-progress read right now. - assert(read_bl == 0); // ... 
- read_pos = requested_pos = received_pos = p; - read_buf.clear(); - } - bool is_readable(); - bool try_read_entry(bufferlist& bl); - void wait_for_readable(Context *onfinish); - void read_entry(bufferlist* bl, Context *onfinish); - - // trim - void set_expire_pos(off_t ep) { expire_pos = ep; } - void trim(); - //bool is_trimmable() { return trimming_pos < expire_pos; } - //void trim(off_t trim_to=0, Context *c=0); -}; - - -#endif diff --git a/tags/20070517_before_mds_merge/osdc/ObjectCacher.cc b/tags/20070517_before_mds_merge/osdc/ObjectCacher.cc deleted file mode 100644 index c406455ca5407..0000000000000 --- a/tags/20070517_before_mds_merge/osdc/ObjectCacher.cc +++ /dev/null @@ -1,1555 +0,0 @@ - -#include "msg/Messenger.h" -#include "ObjectCacher.h" -#include "Objecter.h" - - - -/*** ObjectCacher::BufferHead ***/ - - -/*** ObjectCacher::Object ***/ - -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_objectcacher) cout << g_clock.now() << " " << oc->objecter->messenger->get_myname() << ".objectcacher.object(" << oid << ") " - - -ObjectCacher::BufferHead *ObjectCacher::Object::split(BufferHead *bh, off_t off) -{ - dout(20) << "split " << *bh << " at " << off << endl; - - // split off right - ObjectCacher::BufferHead *right = new BufferHead(this); - right->last_write_tid = bh->last_write_tid; - right->set_state(bh->get_state()); - - off_t newleftlen = off - bh->start(); - right->set_start( off ); - right->set_length( bh->length() - newleftlen ); - - // shorten left - oc->bh_stat_sub(bh); - bh->set_length( newleftlen ); - oc->bh_stat_add(bh); - - // add right - oc->bh_add(this, right); - - // split buffers too - bufferlist bl; - bl.claim(bh->bl); - if (bl.length()) { - assert(bl.length() == (bh->length() + right->length())); - right->bl.substr_of(bl, bh->length(), right->length()); - bh->bl.substr_of(bl, 0, bh->length()); - } - - // move read waiters - if (!bh->waitfor_read.empty()) { - map >::iterator o, p = bh->waitfor_read.end(); - p--; - while 
(p != bh->waitfor_read.begin()) { - if (p->first < right->start()) break; - dout(0) << "split moving waiters at byte " << p->first << " to right bh" << endl; - right->waitfor_read[p->first].swap( p->second ); - o = p; - p--; - bh->waitfor_read.erase(o); - } - } - - dout(20) << "split left is " << *bh << endl; - dout(20) << "split right is " << *right << endl; - return right; -} - - -void ObjectCacher::Object::merge_left(BufferHead *left, BufferHead *right) -{ - assert(left->end() == right->start()); - assert(left->get_state() == right->get_state()); - - dout(10) << "merge_left " << *left << " + " << *right << endl; - oc->bh_remove(this, right); - oc->bh_stat_sub(left); - left->set_length( left->length() + right->length()); - oc->bh_stat_add(left); - - // data - left->bl.claim_append(right->bl); - - // version - // note: this is sorta busted, but should only be used for dirty buffers - left->last_write_tid = MAX( left->last_write_tid, right->last_write_tid ); - left->last_write = MAX( left->last_write, right->last_write ); - - // waiters - for (map >::iterator p = right->waitfor_read.begin(); - p != right->waitfor_read.end(); - p++) - left->waitfor_read[p->first].splice( left->waitfor_read[p->first].begin(), - p->second ); - - // hose right - delete right; - - dout(10) << "merge_left result " << *left << endl; -} - - - -/* - * map a range of bytes into buffer_heads. - * - create missing buffer_heads as necessary. 
- */ -int ObjectCacher::Object::map_read(Objecter::OSDRead *rd, - map& hits, - map& missing, - map& rx) -{ - for (list::iterator ex_it = rd->extents.begin(); - ex_it != rd->extents.end(); - ex_it++) { - - if (ex_it->oid != oid) continue; - - dout(10) << "map_read " << ex_it->oid - << " " << ex_it->start << "~" << ex_it->length << endl; - - map::iterator p = data.lower_bound(ex_it->start); - // p->first >= start - - off_t cur = ex_it->start; - off_t left = ex_it->length; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - - while (left > 0) { - // at end? - if (p == data.end()) { - // rest is a miss. - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( left ); - oc->bh_add(this, n); - missing[cur] = n; - dout(20) << "map_read miss " << left << " left, " << *n << endl; - cur += left; - left -= left; - assert(left == 0); - assert(cur == ex_it->start + (off_t)ex_it->length); - break; // no more. - } - - if (p->first <= cur) { - // have it (or part of it) - BufferHead *e = p->second; - - if (e->is_clean() || - e->is_dirty() || - e->is_tx()) { - hits[cur] = e; // readable! - dout(20) << "map_read hit " << *e << endl; - } - else if (e->is_rx()) { - rx[cur] = e; // missing, not readable. - dout(20) << "map_read rx " << *e << endl; - } - else assert(0); - - off_t lenfromcur = MIN(e->end() - cur, left); - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; // more? - - } else if (p->first > cur) { - // gap.. miss - off_t next = p->first; - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( MIN(next - cur, left) ); - oc->bh_add(this,n); - missing[cur] = n; - cur += MIN(left, n->length()); - left -= MIN(left, n->length()); - dout(20) << "map_read gap " << *n << endl; - continue; // more? - } - else - assert(0); - } - } - return(0); -} - -/* - * map a range of extents on an object's buffer cache. 
- * - combine any bh's we're writing into one - * - break up bufferheads that don't fall completely within the range - * //no! - return a bh that includes the write. may also include other dirty data to left and/or right. - */ -ObjectCacher::BufferHead *ObjectCacher::Object::map_write(Objecter::OSDWrite *wr) -{ - BufferHead *final = 0; - - for (list::iterator ex_it = wr->extents.begin(); - ex_it != wr->extents.end(); - ex_it++) { - - if (ex_it->oid != oid) continue; - - dout(10) << "map_write oex " << ex_it->oid - << " " << ex_it->start << "~" << ex_it->length << endl; - - map::iterator p = data.lower_bound(ex_it->start); - // p->first >= start - - off_t cur = ex_it->start; - off_t left = ex_it->length; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap or butt up! - - /*// dirty and butts up? - if (p->first + p->second->length() == cur && - p->second->is_dirty()) { - dout(10) << "map_write will append to tail of " << *p->second << endl; - final = p->second; - } - */ - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - - while (left > 0) { - off_t max = left; - - // at end ? - if (p == data.end()) { - if (final == NULL) { - final = new BufferHead(this); - final->set_start( cur ); - final->set_length( max ); - oc->bh_add(this, final); - dout(10) << "map_write adding trailing bh " << *final << endl; - } else { - final->set_length( final->length() + max ); - } - left -= max; - cur += max; - continue; - } - - dout(10) << "p is " << *p->second << endl; - - if (p->first <= cur) { - BufferHead *bh = p->second; - dout(10) << "map_write bh " << *bh << " intersected" << endl; - - /*if (bh->is_dirty()) { - // already dirty, let's use it. - final = bh; - } else { - */ - if (p->first < cur) { - assert(final == 0); - if (cur + max >= p->first + p->second->length()) { - // we want right bit (one splice) - final = split(bh, cur); // just split it, take right half. 
- p++; - assert(p->second == final); - } else { - // we want middle bit (two splices) - final = split(bh, cur); - p++; - assert(p->second == final); - split(final, cur+max); - } - } else if (p->first == cur) { - /*if (bh->is_dirty()) { - // already dirty, use it. - } - else*/ - if (p->second->length() <= max) { - // whole bufferhead, piece of cake. - } else { - // we want left bit (one splice) - split(bh, cur + max); // just split - } - if (final) - merge_left(final,bh); - else - final = bh; - } - - // keep going. - off_t lenfromcur = final->end() - cur; - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; - } else { - // gap! - off_t next = p->first; - off_t glen = MIN(next - cur, max); - dout(10) << "map_write gap " << cur << "~" << glen << endl; - if (final) { - final->set_length( final->length() + glen ); - } else { - final = new BufferHead(this); - final->set_start( cur ); - final->set_length( glen ); - oc->bh_add(this, final); - } - - cur += glen; - left -= glen; - continue; // more? - } - } - } - - // set versoin - assert(final); - dout(10) << "map_write final is " << *final << endl; - - return final; -} - - -void ObjectCacher::Object::truncate(off_t s) -{ - dout(10) << "truncate to " << s << endl; - - while (!data.empty()) { - BufferHead *bh = data.rbegin()->second; - if (bh->end() <= s) - break; - - // split bh at truncation point? - if (bh->start() < s) { - split(bh, s); - continue; - } - - // remove bh entirely - assert(bh->start() >= s); - oc->bh_remove(this, bh); - delete bh; - } -} - - -/*** ObjectCacher ***/ - -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_objectcacher) cout << g_clock.now() << " " << objecter->messenger->get_myname() << ".objectcacher " - - - -/* private */ - -void ObjectCacher::close_object(Object *ob) -{ - dout(10) << "close_object " << *ob << endl; - assert(ob->can_close()); - - // ok! 
- objects.erase(ob->get_oid()); - objects_by_ino[ob->get_ino()].erase(ob); - if (objects_by_ino[ob->get_ino()].empty()) - objects_by_ino.erase(ob->get_ino()); - delete ob; -} - - - - -void ObjectCacher::bh_read(BufferHead *bh) -{ - dout(7) << "bh_read on " << *bh << endl; - - mark_rx(bh); - - // finisher - C_ReadFinish *onfinish = new C_ReadFinish(this, bh->ob->get_oid(), bh->start(), bh->length()); - - // go - objecter->read(bh->ob->get_oid(), bh->start(), bh->length(), &onfinish->bl, - onfinish); -} - -void ObjectCacher::bh_read_finish(object_t oid, off_t start, size_t length, bufferlist &bl) -{ - //lock.Lock(); - dout(7) << "bh_read_finish " - << oid - << " " << start << "~" << length - << endl; - - if (objects.count(oid) == 0) { - dout(7) << "bh_read_finish no object cache" << endl; - } else { - Object *ob = objects[oid]; - - // apply to bh's! - off_t opos = start; - map::iterator p = ob->data.lower_bound(opos); - - while (p != ob->data.end() && - opos < start+(off_t)length) { - BufferHead *bh = p->second; - - if (bh->start() > opos) { - dout(1) << "weirdness: gap when applying read results, " - << opos << "~" << bh->start() - opos - << endl; - opos = bh->start(); - continue; - } - - if (!bh->is_rx()) { - dout(10) << "bh_read_finish skipping non-rx " << *bh << endl; - opos = bh->end(); - p++; - continue; - } - - assert(opos >= bh->start()); - assert(bh->start() == opos); // we don't merge rx bh's... yet! - assert(bh->length() <= start+(off_t)length-opos); - - bh->bl.substr_of(bl, - opos-bh->start(), - bh->length()); - mark_clean(bh); - dout(10) << "bh_read_finish read " << *bh << endl; - - opos = bh->end(); - p++; - - // finishers? - // called with lock held. 
- list ls; - for (map >::iterator p = bh->waitfor_read.begin(); - p != bh->waitfor_read.end(); - p++) - ls.splice(ls.end(), p->second); - bh->waitfor_read.clear(); - finish_contexts(ls); - } - } - //lock.Unlock(); -} - - -void ObjectCacher::bh_write(BufferHead *bh) -{ - dout(7) << "bh_write " << *bh << endl; - - // finishers - C_WriteAck *onack = new C_WriteAck(this, bh->ob->get_oid(), bh->start(), bh->length()); - C_WriteCommit *oncommit = new C_WriteCommit(this, bh->ob->get_oid(), bh->start(), bh->length()); - - // go - tid_t tid = objecter->write(bh->ob->get_oid(), bh->start(), bh->length(), bh->bl, - onack, oncommit); - - // set bh last_write_tid - onack->tid = tid; - oncommit->tid = tid; - bh->ob->last_write_tid = tid; - bh->last_write_tid = tid; - - mark_tx(bh); -} - -void ObjectCacher::lock_ack(list& oids, tid_t tid) -{ - for (list::iterator i = oids.begin(); - i != oids.end(); - i++) { - object_t oid = *i; - - if (objects.count(oid) == 0) { - dout(7) << "lock_ack no object cache" << endl; - assert(0); - } - - Object *ob = objects[oid]; - - list ls; - - assert(tid <= ob->last_write_tid); - if (ob->last_write_tid == tid) { - dout(10) << "lock_ack " << *ob - << " tid " << tid << endl; - - switch (ob->lock_state) { - case Object::LOCK_RDUNLOCKING: - case Object::LOCK_WRUNLOCKING: - ob->lock_state = Object::LOCK_NONE; - break; - case Object::LOCK_RDLOCKING: - case Object::LOCK_DOWNGRADING: - ob->lock_state = Object::LOCK_RDLOCK; - ls.splice(ls.begin(), ob->waitfor_rd); - break; - case Object::LOCK_UPGRADING: - case Object::LOCK_WRLOCKING: - ob->lock_state = Object::LOCK_WRLOCK; - ls.splice(ls.begin(), ob->waitfor_wr); - ls.splice(ls.begin(), ob->waitfor_rd); - break; - - default: - assert(0); - } - - ob->last_ack_tid = tid; - - if (ob->can_close()) - close_object(ob); - } else { - dout(10) << "lock_ack " << *ob - << " tid " << tid << " obsolete" << endl; - } - - // waiters? 
- if (ob->waitfor_ack.count(tid)) { - ls.splice(ls.end(), ob->waitfor_ack[tid]); - ob->waitfor_ack.erase(tid); - } - - finish_contexts(ls); - - } -} - -void ObjectCacher::bh_write_ack(object_t oid, off_t start, size_t length, tid_t tid) -{ - //lock.Lock(); - - dout(7) << "bh_write_ack " - << oid - << " tid " << tid - << " " << start << "~" << length - << endl; - if (objects.count(oid) == 0) { - dout(7) << "bh_write_ack no object cache" << endl; - assert(0); - } else { - Object *ob = objects[oid]; - - // apply to bh's! - for (map::iterator p = ob->data.lower_bound(start); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - - if (bh->start() > start+(off_t)length) break; - - if (bh->start() < start && - bh->end() > start+(off_t)length) { - dout(20) << "bh_write_ack skipping " << *bh << endl; - continue; - } - - // make sure bh is tx - if (!bh->is_tx()) { - dout(10) << "bh_write_ack skipping non-tx " << *bh << endl; - continue; - } - - // make sure bh tid matches - if (bh->last_write_tid != tid) { - assert(bh->last_write_tid > tid); - dout(10) << "bh_write_ack newer tid on " << *bh << endl; - continue; - } - - // ok! mark bh clean. - mark_clean(bh); - dout(10) << "bh_write_ack clean " << *bh << endl; - } - - // update object last_ack. - assert(ob->last_ack_tid < tid); - ob->last_ack_tid = tid; - - // waiters? - if (ob->waitfor_ack.count(tid)) { - list ls; - ls.splice(ls.begin(), ob->waitfor_ack[tid]); - ob->waitfor_ack.erase(tid); - finish_contexts(ls); - } - } - //lock.Unlock(); -} - -void ObjectCacher::bh_write_commit(object_t oid, off_t start, size_t length, tid_t tid) -{ - //lock.Lock(); - - // update object last_commit - dout(7) << "bh_write_commit " - << oid - << " tid " << tid - << " " << start << "~" << length - << endl; - if (objects.count(oid) == 0) { - dout(7) << "bh_write_commit no object cache" << endl; - //assert(0); - } else { - Object *ob = objects[oid]; - - // update last_commit. - ob->last_commit_tid = tid; - - // waiters? 
- if (ob->waitfor_commit.count(tid)) { - list ls; - ls.splice(ls.begin(), ob->waitfor_commit[tid]); - ob->waitfor_commit.erase(tid); - finish_contexts(ls); - } - } - - // lock.Unlock(); -} - - -void ObjectCacher::flush(off_t amount) -{ - utime_t cutoff = g_clock.now(); - //cutoff.sec_ref() -= g_conf.client_oc_max_dirty_age; - - dout(10) << "flush " << amount << endl; - - /* - * NOTE: we aren't actually pulling things off the LRU here, just looking at the - * tail item. Then we call bh_write, which moves it to the other LRU, so that we - * can call lru_dirty.lru_get_next_expire() again. - */ - off_t did = 0; - while (amount == 0 || did < amount) { - BufferHead *bh = (BufferHead*) lru_dirty.lru_get_next_expire(); - if (!bh) break; - if (bh->last_write > cutoff) break; - - did += bh->length(); - bh_write(bh); - } -} - - -void ObjectCacher::trim(off_t max) -{ - if (max < 0) - max = g_conf.client_oc_size; - - dout(10) << "trim start: max " << max - << " clean " << get_stat_clean() - << endl; - - while (get_stat_clean() > max) { - BufferHead *bh = (BufferHead*) lru_rest.lru_expire(); - if (!bh) break; - - dout(10) << "trim trimming " << *bh << endl; - assert(bh->is_clean()); - - Object *ob = bh->ob; - bh_remove(ob, bh); - delete bh; - - if (ob->can_close()) { - dout(10) << "trim trimming " << *ob << endl; - close_object(ob); - } - } - - dout(10) << "trim finish: max " << max - << " clean " << get_stat_clean() - << endl; -} - - - -/* public */ - -/* - * returns # bytes read (if in cache). 
onfinish is untouched (caller must delete it) - * returns 0 if doing async read - */ -int ObjectCacher::readx(Objecter::OSDRead *rd, inodeno_t ino, Context *onfinish) -{ - bool success = true; - list hit_ls; - map stripe_map; // final buffer offset -> substring - - for (list::iterator ex_it = rd->extents.begin(); - ex_it != rd->extents.end(); - ex_it++) { - dout(10) << "readx " << *ex_it << endl; - - // get Object cache - Object *o = get_object(ex_it->oid, ino); - - // map extent into bufferheads - map hits, missing, rx; - o->map_read(rd, hits, missing, rx); - - if (!missing.empty() || !rx.empty()) { - // read missing - for (map::iterator bh_it = missing.begin(); - bh_it != missing.end(); - bh_it++) { - bh_read(bh_it->second); - if (success) { - dout(10) << "readx missed, waiting on " << *bh_it->second - << " off " << bh_it->first << endl; - success = false; - bh_it->second->waitfor_read[bh_it->first].push_back( new C_RetryRead(this, rd, ino, onfinish) ); - } - } - - // bump rx - for (map::iterator bh_it = rx.begin(); - bh_it != rx.end(); - bh_it++) { - touch_bh(bh_it->second); // bump in lru, so we don't lose it. - if (success) { - dout(10) << "readx missed, waiting on " << *bh_it->second - << " off " << bh_it->first << endl; - success = false; - bh_it->second->waitfor_read[bh_it->first].push_back( new C_RetryRead(this, rd, ino, onfinish) ); - } - } - } else { - assert(!hits.empty()); - - // make a plain list - for (map::iterator bh_it = hits.begin(); - bh_it != hits.end(); - bh_it++) { - dout(10) << "readx hit bh " << *bh_it->second << endl; - hit_ls.push_back(bh_it->second); - } - - // create reverse map of buffer offset -> object for the eventual result. 
- // this is over a single ObjectExtent, so we know that - // - the bh's are contiguous - // - the buffer frags need not be (and almost certainly aren't) - off_t opos = ex_it->start; - map::iterator bh_it = hits.begin(); - assert(bh_it->second->start() <= opos); - size_t bhoff = opos - bh_it->second->start(); - map::iterator f_it = ex_it->buffer_extents.begin(); - size_t foff = 0; - while (1) { - BufferHead *bh = bh_it->second; - assert(opos == (off_t)(bh->start() + bhoff)); - - dout(10) << "readx rmap opos " << opos - << ": " << *bh << " +" << bhoff - << " frag " << f_it->first << "~" << f_it->second << " +" << foff - << endl; - - size_t len = MIN(f_it->second - foff, - bh->length() - bhoff); - stripe_map[f_it->first].substr_of(bh->bl, - opos - bh->start(), - len); - opos += len; - bhoff += len; - foff += len; - if (opos == bh->end()) { - bh_it++; - bhoff = 0; - } - if (foff == f_it->second) { - f_it++; - foff = 0; - } - if (bh_it == hits.end()) break; - if (f_it == ex_it->buffer_extents.end()) break; - } - assert(f_it == ex_it->buffer_extents.end()); - assert(opos == ex_it->start + (off_t)ex_it->length); - } - } - - // bump hits in lru - for (list::iterator bhit = hit_ls.begin(); - bhit != hit_ls.end(); - bhit++) - touch_bh(*bhit); - - if (!success) return 0; // wait! - - // no misses... success! do the read. - assert(!hit_ls.empty()); - dout(10) << "readx has all buffers" << endl; - - // ok, assemble into result buffer. - rd->bl->clear(); - size_t pos = 0; - for (map::iterator i = stripe_map.begin(); - i != stripe_map.end(); - i++) { - assert(pos == i->first); - dout(10) << "readx adding buffer len " << i->second.length() << " at " << pos << endl; - pos += i->second.length(); - rd->bl->claim_append(i->second); - } - dout(10) << "readx result is " << rd->bl->length() << endl; - - // done with read. 
- delete rd; - - trim(); - - return pos; -} - - -int ObjectCacher::writex(Objecter::OSDWrite *wr, inodeno_t ino) -{ - utime_t now = g_clock.now(); - - for (list::iterator ex_it = wr->extents.begin(); - ex_it != wr->extents.end(); - ex_it++) { - // get object cache - Object *o = get_object(ex_it->oid, ino); - - // map it all into a single bufferhead. - BufferHead *bh = o->map_write(wr); - - // adjust buffer pointers (ie "copy" data into my cache) - // this is over a single ObjectExtent, so we know that - // - there is one contiguous bh - // - the buffer frags need not be (and almost certainly aren't) - // note: i assume striping is monotonic... no jumps backwards, ever! - off_t opos = ex_it->start; - for (map::iterator f_it = ex_it->buffer_extents.begin(); - f_it != ex_it->buffer_extents.end(); - f_it++) { - dout(10) << "writex writing " << f_it->first << "~" << f_it->second << " into " << *bh << " at " << opos << endl; - size_t bhoff = bh->start() - opos; - assert(f_it->second <= bh->length() - bhoff); - - bufferlist frag; - frag.substr_of(wr->bl, - f_it->first, f_it->second); - - bh->bl.claim_append(frag); - opos += f_it->second; - } - - // it's dirty. - mark_dirty(bh); - touch_bh(bh); - bh->last_write = now; - - // recombine with left? - map::iterator p = o->data.find(bh->start()); - if (p != o->data.begin()) { - p--; - if (p->second->is_dirty()) { - o->merge_left(p->second,bh); - bh = p->second; - } - } - // right? - p = o->data.find(bh->start()); - p++; - if (p != o->data.end() && - p->second->is_dirty()) - o->merge_left(p->second,bh); - } - - delete wr; - - trim(); - return 0; -} - - -// blocking wait for write. 
-void ObjectCacher::wait_for_write(size_t len, Mutex& lock) -{ - while (get_stat_dirty() > g_conf.client_oc_max_dirty) { - dout(10) << "wait_for_write waiting" << endl; - flusher_cond.Signal(); - stat_waiter++; - stat_cond.Wait(lock); - stat_waiter--; - dout(10) << "wait_for_write woke up" << endl; - } -} - -void ObjectCacher::flusher_entry() -{ - dout(10) << "flusher start" << endl; - lock.Lock(); - while (!flusher_stop) { - while (!flusher_stop) { - off_t all = get_stat_tx() + get_stat_rx() + get_stat_clean() + get_stat_dirty(); - dout(11) << "flusher " - << all << " / " << g_conf.client_oc_size << ": " - << get_stat_tx() << " tx, " - << get_stat_rx() << " rx, " - << get_stat_clean() << " clean, " - << get_stat_dirty() << " / " << g_conf.client_oc_max_dirty << " dirty" - << endl; - if (get_stat_dirty() > g_conf.client_oc_max_dirty) { - // flush some dirty pages - dout(10) << "flusher " - << get_stat_dirty() << " / " << g_conf.client_oc_max_dirty << " dirty," - << " flushing some dirty bhs" << endl; - flush(get_stat_dirty() - g_conf.client_oc_max_dirty); - } - else { - // check tail of lru for old dirty items - utime_t cutoff = g_clock.now(); - cutoff.sec_ref()--; - BufferHead *bh = 0; - while ((bh = (BufferHead*)lru_dirty.lru_get_next_expire()) != 0 && - bh->last_write < cutoff) { - dout(10) << "flusher flushing aged dirty bh " << *bh << endl; - bh_write(bh); - } - break; - } - } - if (flusher_stop) break; - flusher_cond.WaitInterval(lock, utime_t(1,0)); - } - lock.Unlock(); - dout(10) << "flusher finish" << endl; -} - - - -// blocking. atomic+sync. -int ObjectCacher::atomic_sync_readx(Objecter::OSDRead *rd, inodeno_t ino, Mutex& lock) -{ - dout(10) << "atomic_sync_readx " << rd - << " in " << ino - << endl; - - if (rd->extents.size() == 1) { - // single object. - // just write synchronously. 
- Cond cond; - bool done = false; - objecter->readx(rd, new C_SafeCond(&lock, &cond, &done)); - - // block - while (!done) cond.Wait(lock); - } else { - // spans multiple objects, or is big. - - // sort by object... - map by_oid; - for (list::iterator ex_it = rd->extents.begin(); - ex_it != rd->extents.end(); - ex_it++) - by_oid[ex_it->oid] = *ex_it; - - // lock - for (map::iterator i = by_oid.begin(); - i != by_oid.end(); - i++) { - Object *o = get_object(i->first, ino); - rdlock(o); - } - - // readx will hose rd - list extents = rd->extents; - - // do the read, into our cache - Cond cond; - bool done = false; - readx(rd, ino, new C_SafeCond(&lock, &cond, &done)); - - // block - while (!done) cond.Wait(lock); - - // release the locks - for (list::iterator ex_it = extents.begin(); - ex_it != extents.end(); - ex_it++) { - assert(objects.count(ex_it->oid)); - Object *o = objects[ex_it->oid]; - rdunlock(o); - } - } - - return 0; -} - -int ObjectCacher::atomic_sync_writex(Objecter::OSDWrite *wr, inodeno_t ino, Mutex& lock) -{ - dout(10) << "atomic_sync_writex " << wr - << " in " << ino - << endl; - - if (wr->extents.size() == 1 && - wr->extents.front().length <= g_conf.client_oc_max_sync_write) { - // single object. - - // make sure we aren't already locking/locked... - object_t oid = wr->extents.front().oid; - Object *o = 0; - if (objects.count(oid)) o = get_object(oid, ino); - if (!o || - (o->lock_state != Object::LOCK_WRLOCK && - o->lock_state != Object::LOCK_WRLOCKING && - o->lock_state != Object::LOCK_UPGRADING)) { - // just write synchronously. - dout(10) << "atomic_sync_writex " << wr - << " in " << ino - << " doing sync write" - << endl; - - Cond cond; - bool done = false; - objecter->modifyx(wr, new C_SafeCond(&lock, &cond, &done), 0); - - // block - while (!done) cond.Wait(lock); - return 0; - } - } - - // spans multiple objects, or is big. - // sort by object... 
- map by_oid; - for (list::iterator ex_it = wr->extents.begin(); - ex_it != wr->extents.end(); - ex_it++) - by_oid[ex_it->oid] = *ex_it; - - // wrlock - for (map::iterator i = by_oid.begin(); - i != by_oid.end(); - i++) { - Object *o = get_object(i->first, ino); - wrlock(o); - } - - // writex will hose wr - list extents = wr->extents; - - // do the write, into our cache - writex(wr, ino); - - // flush - // ...and release the locks? - for (list::iterator ex_it = extents.begin(); - ex_it != extents.end(); - ex_it++) { - assert(objects.count(ex_it->oid)); - Object *o = objects[ex_it->oid]; - - wrunlock(o); - } - - return 0; -} - - - -// locking ----------------------------- - -void ObjectCacher::rdlock(Object *o) -{ - // lock? - if (o->lock_state == Object::LOCK_NONE || - o->lock_state == Object::LOCK_RDUNLOCKING || - o->lock_state == Object::LOCK_WRUNLOCKING) { - dout(10) << "rdlock rdlock " << *o << endl; - - o->lock_state = Object::LOCK_RDLOCKING; - - C_LockAck *ack = new C_LockAck(this, o->get_oid()); - C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); - - commit->tid = - ack->tid = - o->last_write_tid = - objecter->lock(OSD_OP_RDLOCK, o->get_oid(), ack, commit); - } - - // stake our claim. - o->rdlock_ref++; - - // wait? - if (o->lock_state == Object::LOCK_RDLOCKING || - o->lock_state == Object::LOCK_WRLOCKING) { - dout(10) << "rdlock waiting for rdlock|wrlock on " << *o << endl; - Cond cond; - bool done = false; - o->waitfor_rd.push_back(new C_SafeCond(&lock, &cond, &done)); - while (!done) cond.Wait(lock); - } - assert(o->lock_state == Object::LOCK_RDLOCK || - o->lock_state == Object::LOCK_WRLOCK || - o->lock_state == Object::LOCK_UPGRADING || - o->lock_state == Object::LOCK_DOWNGRADING); -} - -void ObjectCacher::wrlock(Object *o) -{ - // lock? 
- if (o->lock_state != Object::LOCK_WRLOCK && - o->lock_state != Object::LOCK_WRLOCKING && - o->lock_state != Object::LOCK_UPGRADING) { - dout(10) << "wrlock wrlock " << *o << endl; - - int op = 0; - if (o->lock_state == Object::LOCK_RDLOCK) { - o->lock_state = Object::LOCK_UPGRADING; - op = OSD_OP_UPLOCK; - } else { - o->lock_state = Object::LOCK_WRLOCKING; - op = OSD_OP_WRLOCK; - } - - C_LockAck *ack = new C_LockAck(this, o->get_oid()); - C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); - - commit->tid = - ack->tid = - o->last_write_tid = - objecter->lock(op, o->get_oid(), ack, commit); - } - - // stake our claim. - o->wrlock_ref++; - - // wait? - if (o->lock_state == Object::LOCK_WRLOCKING || - o->lock_state == Object::LOCK_UPGRADING) { - dout(10) << "wrlock waiting for wrlock on " << *o << endl; - Cond cond; - bool done = false; - o->waitfor_wr.push_back(new C_SafeCond(&lock, &cond, &done)); - while (!done) cond.Wait(lock); - } - assert(o->lock_state == Object::LOCK_WRLOCK); -} - - -void ObjectCacher::rdunlock(Object *o) -{ - dout(10) << "rdunlock " << *o << endl; - assert(o->lock_state == Object::LOCK_RDLOCK || - o->lock_state == Object::LOCK_WRLOCK || - o->lock_state == Object::LOCK_UPGRADING || - o->lock_state == Object::LOCK_DOWNGRADING); - - assert(o->rdlock_ref > 0); - o->rdlock_ref--; - if (o->rdlock_ref > 0 || - o->wrlock_ref > 0) { - dout(10) << "rdunlock " << *o << " still has rdlock|wrlock refs" << endl; - return; - } - - release(o); // release first - - o->lock_state = Object::LOCK_RDUNLOCKING; - - C_LockAck *lockack = new C_LockAck(this, o->get_oid()); - C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); - commit->tid = - lockack->tid = - o->last_write_tid = - objecter->lock(OSD_OP_RDUNLOCK, o->get_oid(), lockack, commit); -} - -void ObjectCacher::wrunlock(Object *o) -{ - dout(10) << "wrunlock " << *o << endl; - assert(o->lock_state == Object::LOCK_WRLOCK); - - assert(o->wrlock_ref > 0); - o->wrlock_ref--; - if 
(o->wrlock_ref > 0) { - dout(10) << "wrunlock " << *o << " still has wrlock refs" << endl; - return; - } - - flush(o); // flush first - - int op = 0; - if (o->rdlock_ref > 0) { - dout(10) << "wrunlock rdlock " << *o << endl; - op = OSD_OP_DNLOCK; - o->lock_state = Object::LOCK_DOWNGRADING; - } else { - dout(10) << "wrunlock wrunlock " << *o << endl; - op = OSD_OP_WRUNLOCK; - o->lock_state = Object::LOCK_WRUNLOCKING; - } - - C_LockAck *lockack = new C_LockAck(this, o->get_oid()); - C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); - commit->tid = - lockack->tid = - o->last_write_tid = - objecter->lock(op, o->get_oid(), lockack, commit); -} - - -// ------------------------------------------------- - - -bool ObjectCacher::set_is_cached(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) - return false; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - if (!ob->data.empty()) return true; - } - - return false; -} - -bool ObjectCacher::set_is_dirty_or_committing(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) - return false; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - for (map::iterator p = ob->data.begin(); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - if (bh->is_dirty() || bh->is_tx()) - return true; - } - } - - return false; -} - - -// purge. non-blocking. violently removes dirty buffers from cache. -void ObjectCacher::purge(Object *ob) -{ - dout(10) << "purge " << *ob << endl; - - for (map::iterator p = ob->data.begin(); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - if (!bh->is_clean()) - dout(0) << "purge forcibly removing " << *ob << " " << *bh << endl; - bh_remove(ob, bh); - delete bh; - } - - if (ob->can_close()) { - dout(10) << "trim trimming " << *ob << endl; - close_object(ob); - } -} - -// flush. non-blocking. no callback. 
-// true if clean, already flushed. -// false if we wrote something. -bool ObjectCacher::flush(Object *ob) -{ - bool clean = true; - for (map::iterator p = ob->data.begin(); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - if (bh->is_tx()) { - clean = false; - continue; - } - if (!bh->is_dirty()) continue; - - bh_write(bh); - clean = false; - } - return clean; -} - -// flush. non-blocking, takes callback. -// returns true if already flushed -bool ObjectCacher::flush_set(inodeno_t ino, Context *onfinish) -{ - if (objects_by_ino.count(ino) == 0) { - dout(10) << "flush_set on " << ino << " dne" << endl; - return true; - } - - dout(10) << "flush_set " << ino << endl; - - C_Gather *gather = 0; // we'll need to wait for all objects to flush! - - set& s = objects_by_ino[ino]; - bool safe = true; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - if (!flush(ob)) { - // we'll need to gather... - if (!gather && onfinish) - gather = new C_Gather(onfinish); - safe = false; - - dout(10) << "flush_set " << ino << " will wait for ack tid " - << ob->last_write_tid - << " on " << *ob - << endl; - if (gather) - ob->waitfor_ack[ob->last_write_tid].push_back(gather->new_sub()); - } - } - - if (safe) { - dout(10) << "flush_set " << ino << " has no dirty|tx bhs" << endl; - return true; - } - return false; -} - - -// commit. non-blocking, takes callback. -// return true if already flushed. -bool ObjectCacher::commit_set(inodeno_t ino, Context *onfinish) -{ - assert(onfinish); // doesn't make any sense otherwise. - - if (objects_by_ino.count(ino) == 0) { - dout(10) << "commit_set on " << ino << " dne" << endl; - return true; - } - - dout(10) << "commit_set " << ino << endl; - - C_Gather *gather = 0; // we'll need to wait for all objects to commit - - set& s = objects_by_ino[ino]; - bool safe = true; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - // make sure it's flushing. 
- flush_set(ino); - - if (ob->last_write_tid > ob->last_commit_tid) { - dout(10) << "commit_set " << ino << " " << *ob - << " will finish on commit tid " << ob->last_write_tid - << endl; - if (!gather && onfinish) gather = new C_Gather(onfinish); - safe = false; - if (gather) - ob->waitfor_commit[ob->last_write_tid].push_back( gather->new_sub() ); - } - } - - if (safe) { - dout(10) << "commit_set " << ino << " all committed" << endl; - return true; - } - return false; -} - -void ObjectCacher::purge_set(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) { - dout(10) << "purge_set on " << ino << " dne" << endl; - return; - } - - dout(10) << "purge_set " << ino << endl; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - purge(ob); - } -} - - -off_t ObjectCacher::release(Object *ob) -{ - list clean; - off_t o_unclean = 0; - - for (map::iterator p = ob->data.begin(); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - if (bh->is_clean()) - clean.push_back(bh); - else - o_unclean += bh->length(); - } - - for (list::iterator p = clean.begin(); - p != clean.end(); - p++) { - bh_remove(ob, *p); - delete *p; - } - - if (ob->can_close()) { - dout(10) << "trim trimming " << *ob << endl; - close_object(ob); - } - - return o_unclean; -} - -off_t ObjectCacher::release_set(inodeno_t ino) -{ - // return # bytes not clean (and thus not released). 
- off_t unclean = 0; - - if (objects_by_ino.count(ino) == 0) { - dout(10) << "release_set on " << ino << " dne" << endl; - return 0; - } - - dout(10) << "release_set " << ino << endl; - - set s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - off_t o_unclean = release(ob); - unclean += o_unclean; - - if (o_unclean) - dout(10) << "release_set " << ino << " " << *ob - << " has " << o_unclean << " bytes left" - << endl; - - } - - if (unclean) { - dout(10) << "release_set " << ino - << ", " << unclean << " bytes left" << endl; - } - - return unclean; -} - -void ObjectCacher::truncate_set(inodeno_t ino, list& exls) -{ - if (objects_by_ino.count(ino) == 0) { - dout(10) << "truncate_set on " << ino << " dne" << endl; - return; - } - - dout(10) << "truncate_set " << ino << endl; - - for (list::iterator p = exls.begin(); - p != exls.end(); - ++p) { - ObjectExtent &ex = *p; - if (objects.count(ex.oid) == 0) continue; - Object *ob = objects[ex.oid]; - - // purge or truncate? 
- if (ex.start == 0) { - dout(10) << "truncate_set purging " << *ob << endl; - purge(ob); - } else { - // hrm, truncate object - dout(10) << "truncate_set truncating " << *ob << " at " << ex.start << endl; - ob->truncate(ex.start); - - if (ob->can_close()) { - dout(10) << "truncate_set trimming " << *ob << endl; - close_object(ob); - } - } - } -} - - -void ObjectCacher::kick_sync_writers(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) { - dout(10) << "kick_sync_writers on " << ino << " dne" << endl; - return; - } - - dout(10) << "kick_sync_writers on " << ino << endl; - - list ls; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - ls.splice(ls.begin(), ob->waitfor_wr); - } - - finish_contexts(ls); -} - -void ObjectCacher::kick_sync_readers(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) { - dout(10) << "kick_sync_readers on " << ino << " dne" << endl; - return; - } - - dout(10) << "kick_sync_readers on " << ino << endl; - - list ls; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - ls.splice(ls.begin(), ob->waitfor_rd); - } - - finish_contexts(ls); -} - - - diff --git a/tags/20070517_before_mds_merge/osdc/ObjectCacher.h b/tags/20070517_before_mds_merge/osdc/ObjectCacher.h deleted file mode 100644 index dbec05fa5eee9..0000000000000 --- a/tags/20070517_before_mds_merge/osdc/ObjectCacher.h +++ /dev/null @@ -1,558 +0,0 @@ -#ifndef __OBJECTCACHER_H_ -#define __OBJECTCACHER_H_ - -#include "include/types.h" -#include "include/lru.h" -#include "include/Context.h" - -#include "common/Cond.h" -#include "common/Thread.h" - -#include "Objecter.h" -#include "Filer.h" - -class Objecter; -class Objecter::OSDRead; -class Objecter::OSDWrite; - -class ObjectCacher { - public: - - class Object; - - // ******* BufferHead ********* - class BufferHead : public LRUObject { - public: - // states - static const int STATE_MISSING = 0; - 
static const int STATE_CLEAN = 1; - static const int STATE_DIRTY = 2; - static const int STATE_RX = 3; - static const int STATE_TX = 4; - - private: - // my fields - int state; - int ref; - struct { - off_t start, length; // bh extent in object - } ex; - - public: - Object *ob; - bufferlist bl; - tid_t last_write_tid; // version of bh (if non-zero) - utime_t last_write; - - map< off_t, list > waitfor_read; - - public: - // cons - BufferHead(Object *o) : - state(STATE_MISSING), - ref(0), - ob(o), - last_write_tid(0) {} - - // extent - off_t start() { return ex.start; } - void set_start(off_t s) { ex.start = s; } - off_t length() { return ex.length; } - void set_length(off_t l) { ex.length = l; } - off_t end() { return ex.start + ex.length; } - off_t last() { return end() - 1; } - - // states - void set_state(int s) { - if (s == STATE_RX || s == STATE_TX) get(); - if (state == STATE_RX || state == STATE_TX) put(); - state = s; - } - int get_state() { return state; } - - bool is_missing() { return state == STATE_MISSING; } - bool is_dirty() { return state == STATE_DIRTY; } - bool is_clean() { return state == STATE_CLEAN; } - bool is_tx() { return state == STATE_TX; } - bool is_rx() { return state == STATE_RX; } - - // reference counting - int get() { - assert(ref >= 0); - if (ref == 0) lru_pin(); - return ++ref; - } - int put() { - assert(ref > 0); - if (ref == 1) lru_unpin(); - --ref; - return ref; - } - }; - - - // ******* Object ********* - class Object { - private: - // ObjectCacher::Object fields - ObjectCacher *oc; - object_t oid; // this _always_ is oid.rev=0 - inodeno_t ino; - objectrev_t rev; // last rev we're written - - public: - map data; - - tid_t last_write_tid; // version of bh (if non-zero) - tid_t last_ack_tid; // last update acked. - tid_t last_commit_tid; // last update commited. 
- - map< tid_t, list > waitfor_ack; - map< tid_t, list > waitfor_commit; - list waitfor_rd; - list waitfor_wr; - - // lock - static const int LOCK_NONE = 0; - static const int LOCK_WRLOCKING = 1; - static const int LOCK_WRLOCK = 2; - static const int LOCK_WRUNLOCKING = 3; - static const int LOCK_RDLOCKING = 4; - static const int LOCK_RDLOCK = 5; - static const int LOCK_RDUNLOCKING = 6; - static const int LOCK_UPGRADING = 7; // rd -> wr - static const int LOCK_DOWNGRADING = 8; // wr -> rd - int lock_state; - int wrlock_ref; // how many ppl want or are using a WRITE lock - int rdlock_ref; // how many ppl want or are using a READ lock - - public: - Object(ObjectCacher *_oc, object_t o, inodeno_t i) : - oc(_oc), - oid(o), ino(i), - last_write_tid(0), last_ack_tid(0), last_commit_tid(0), - lock_state(LOCK_NONE), wrlock_ref(0), rdlock_ref(0) - {} - ~Object() { - assert(data.empty()); - } - - object_t get_oid() { return oid; } - inodeno_t get_ino() { return ino; } - - bool can_close() { - return data.empty() && lock_state == LOCK_NONE && - waitfor_ack.empty() && waitfor_commit.empty() && - waitfor_rd.empty() && waitfor_wr.empty(); - } - - // bh - void add_bh(BufferHead *bh) { - // add to my map - assert(data.count(bh->start()) == 0); - - if (0) { // sanity check FIXME DEBUG - //cout << "add_bh " << bh->start() << "~" << bh->length() << endl; - map::iterator p = data.lower_bound(bh->start()); - if (p != data.end()) { - //cout << " after " << *p->second << endl; - //cout << " after starts at " << p->first << endl; - assert(p->first >= bh->end()); - } - if (p != data.begin()) { - p--; - //cout << " before starts at " << p->second->start() - //<< " and ends at " << p->second->end() << endl; - //cout << " before " << *p->second << endl; - assert(p->second->end() <= bh->start()); - } - } - - data[bh->start()] = bh; - } - void remove_bh(BufferHead *bh) { - assert(data.count(bh->start())); - data.erase(bh->start()); - } - bool is_empty() { return data.empty(); } - - // mid-level 
- BufferHead *split(BufferHead *bh, off_t off); - void merge_left(BufferHead *left, BufferHead *right); - void merge_right(BufferHead *left, BufferHead *right); - - int map_read(Objecter::OSDRead *rd, - map& hits, - map& missing, - map& rx); - BufferHead *map_write(Objecter::OSDWrite *wr); - - void truncate(off_t s); - }; - - // ******* ObjectCacher ********* - // ObjectCacher fields - public: - Objecter *objecter; - Filer filer; - - private: - Mutex& lock; - - hash_map objects; - hash_map > objects_by_ino; - - set dirty_bh; - LRU lru_dirty, lru_rest; - - Cond flusher_cond; - bool flusher_stop; - void flusher_entry(); - class FlusherThread : public Thread { - ObjectCacher *oc; - public: - FlusherThread(ObjectCacher *o) : oc(o) {} - void *entry() { - oc->flusher_entry(); - return 0; - } - } flusher_thread; - - - // objects - Object *get_object(object_t oid, inodeno_t ino) { - // have it? - if (objects.count(oid)) - return objects[oid]; - - // create it. - Object *o = new Object(this, oid, ino); - objects[oid] = o; - objects_by_ino[ino].insert(o); - return o; - } - void close_object(Object *ob); - - // bh stats - Cond stat_cond; - int stat_waiter; - - off_t stat_clean; - off_t stat_dirty; - off_t stat_rx; - off_t stat_tx; - off_t stat_missing; - - void bh_stat_add(BufferHead *bh) { - switch (bh->get_state()) { - case BufferHead::STATE_MISSING: stat_missing += bh->length(); break; - case BufferHead::STATE_CLEAN: stat_clean += bh->length(); break; - case BufferHead::STATE_DIRTY: stat_dirty += bh->length(); break; - case BufferHead::STATE_TX: stat_tx += bh->length(); break; - case BufferHead::STATE_RX: stat_rx += bh->length(); break; - } - if (stat_waiter) stat_cond.Signal(); - } - void bh_stat_sub(BufferHead *bh) { - switch (bh->get_state()) { - case BufferHead::STATE_MISSING: stat_missing -= bh->length(); break; - case BufferHead::STATE_CLEAN: stat_clean -= bh->length(); break; - case BufferHead::STATE_DIRTY: stat_dirty -= bh->length(); break; - case 
BufferHead::STATE_TX: stat_tx -= bh->length(); break; - case BufferHead::STATE_RX: stat_rx -= bh->length(); break; - } - } - off_t get_stat_tx() { return stat_tx; } - off_t get_stat_rx() { return stat_rx; } - off_t get_stat_dirty() { return stat_dirty; } - off_t get_stat_clean() { return stat_clean; } - - void touch_bh(BufferHead *bh) { - if (bh->is_dirty()) - lru_dirty.lru_touch(bh); - else - lru_rest.lru_touch(bh); - } - - // bh states - void bh_set_state(BufferHead *bh, int s) { - // move between lru lists? - if (s == BufferHead::STATE_DIRTY && bh->get_state() != BufferHead::STATE_DIRTY) { - lru_rest.lru_remove(bh); - lru_dirty.lru_insert_top(bh); - dirty_bh.insert(bh); - } - if (s != BufferHead::STATE_DIRTY && bh->get_state() == BufferHead::STATE_DIRTY) { - lru_dirty.lru_remove(bh); - lru_rest.lru_insert_mid(bh); - dirty_bh.erase(bh); - } - - // set state - bh_stat_sub(bh); - bh->set_state(s); - bh_stat_add(bh); - } - - void copy_bh_state(BufferHead *bh1, BufferHead *bh2) { - bh_set_state(bh2, bh1->get_state()); - } - - void mark_missing(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_MISSING); }; - void mark_clean(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_CLEAN); }; - void mark_rx(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_RX); }; - void mark_tx(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_TX); }; - void mark_dirty(BufferHead *bh) { - bh_set_state(bh, BufferHead::STATE_DIRTY); - lru_dirty.lru_touch(bh); - //bh->set_dirty_stamp(g_clock.now()); - }; - - void bh_add(Object *ob, BufferHead *bh) { - ob->add_bh(bh); - if (bh->is_dirty()) { - lru_dirty.lru_insert_top(bh); - dirty_bh.insert(bh); - } else { - lru_rest.lru_insert_top(bh); - } - bh_stat_add(bh); - } - void bh_remove(Object *ob, BufferHead *bh) { - ob->remove_bh(bh); - if (bh->is_dirty()) { - lru_dirty.lru_remove(bh); - dirty_bh.erase(bh); - } else { - lru_rest.lru_remove(bh); - } - bh_stat_sub(bh); - } - - // io - void bh_read(BufferHead *bh); - void bh_write(BufferHead 
*bh); - - void trim(off_t max=-1); - void flush(off_t amount=0); - - bool flush(Object *o); - off_t release(Object *o); - void purge(Object *o); - - void rdlock(Object *o); - void rdunlock(Object *o); - void wrlock(Object *o); - void wrunlock(Object *o); - - public: - void bh_read_finish(object_t oid, off_t offset, size_t length, bufferlist &bl); - void bh_write_ack(object_t oid, off_t offset, size_t length, tid_t t); - void bh_write_commit(object_t oid, off_t offset, size_t length, tid_t t); - void lock_ack(list& oids, tid_t tid); - - class C_ReadFinish : public Context { - ObjectCacher *oc; - object_t oid; - off_t start; - size_t length; - public: - bufferlist bl; - C_ReadFinish(ObjectCacher *c, object_t o, off_t s, size_t l) : oc(c), oid(o), start(s), length(l) {} - void finish(int r) { - oc->bh_read_finish(oid, start, length, bl); - } - }; - - class C_WriteAck : public Context { - ObjectCacher *oc; - object_t oid; - off_t start; - size_t length; - public: - tid_t tid; - C_WriteAck(ObjectCacher *c, object_t o, off_t s, size_t l) : oc(c), oid(o), start(s), length(l) {} - void finish(int r) { - oc->bh_write_ack(oid, start, length, tid); - } - }; - class C_WriteCommit : public Context { - ObjectCacher *oc; - object_t oid; - off_t start; - size_t length; - public: - tid_t tid; - C_WriteCommit(ObjectCacher *c, object_t o, off_t s, size_t l) : oc(c), oid(o), start(s), length(l) {} - void finish(int r) { - oc->bh_write_commit(oid, start, length, tid); - } - }; - - class C_LockAck : public Context { - ObjectCacher *oc; - public: - list oids; - tid_t tid; - C_LockAck(ObjectCacher *c, object_t o) : oc(c) { - oids.push_back(o); - } - void finish(int r) { - oc->lock_ack(oids, tid); - } - }; - - - - public: - ObjectCacher(Objecter *o, Mutex& l) : - objecter(o), filer(o), lock(l), - flusher_stop(false), flusher_thread(this), - stat_waiter(0), - stat_clean(0), stat_dirty(0), stat_rx(0), stat_tx(0), stat_missing(0) { - flusher_thread.create(); - } - ~ObjectCacher() { - // we 
should be empty. - assert(objects.empty()); - assert(lru_rest.lru_get_size() == 0); - assert(lru_dirty.lru_get_size() == 0); - assert(dirty_bh.empty()); - - assert(flusher_thread.is_started()); - lock.Lock(); // hmm.. watch out for deadlock! - flusher_stop = true; - flusher_cond.Signal(); - lock.Unlock(); - flusher_thread.join(); - } - - - class C_RetryRead : public Context { - ObjectCacher *oc; - Objecter::OSDRead *rd; - inodeno_t ino; - Context *onfinish; - public: - C_RetryRead(ObjectCacher *_oc, Objecter::OSDRead *r, inodeno_t i, Context *c) : oc(_oc), rd(r), ino(i), onfinish(c) {} - void finish(int) { - int r = oc->readx(rd, ino, onfinish); - if (r > 0) { - onfinish->finish(r); - delete onfinish; - } - } - }; - - // non-blocking. async. - int readx(Objecter::OSDRead *rd, inodeno_t ino, Context *onfinish); - int writex(Objecter::OSDWrite *wr, inodeno_t ino); - - // write blocking - void wait_for_write(size_t len, Mutex& lock); - - // blocking. atomic+sync. - int atomic_sync_readx(Objecter::OSDRead *rd, inodeno_t ino, Mutex& lock); - int atomic_sync_writex(Objecter::OSDWrite *wr, inodeno_t ino, Mutex& lock); - - bool set_is_cached(inodeno_t ino); - bool set_is_dirty_or_committing(inodeno_t ino); - - bool flush_set(inodeno_t ino, Context *onfinish=0); - void flush_all(Context *onfinish=0); - - bool commit_set(inodeno_t ino, Context *oncommit); - void commit_all(Context *oncommit=0); - - void purge_set(inodeno_t ino); - - off_t release_set(inodeno_t ino); // returns # of bytes not released (ie non-clean) - - void truncate_set(inodeno_t ino, list& ex); - - void kick_sync_writers(inodeno_t ino); - void kick_sync_readers(inodeno_t ino); - - - // file functions - - /*** async+caching (non-blocking) file interface ***/ - int file_read(inode_t& inode, - off_t offset, size_t len, - bufferlist *bl, - Context *onfinish) { - Objecter::OSDRead *rd = new Objecter::OSDRead(bl); - filer.file_to_extents(inode, offset, len, rd->extents); - return readx(rd, inode.ino, onfinish); - 
} - - int file_write(inode_t& inode, - off_t offset, size_t len, - bufferlist& bl, - objectrev_t rev=0) { - Objecter::OSDWrite *wr = new Objecter::OSDWrite(bl); - filer.file_to_extents(inode, offset, len, wr->extents); - return writex(wr, inode.ino); - } - - - - /*** sync+blocking file interface ***/ - - int file_atomic_sync_read(inode_t& inode, - off_t offset, size_t len, - bufferlist *bl, - Mutex &lock) { - Objecter::OSDRead *rd = new Objecter::OSDRead(bl); - filer.file_to_extents(inode, offset, len, rd->extents); - return atomic_sync_readx(rd, inode.ino, lock); - } - - int file_atomic_sync_write(inode_t& inode, - off_t offset, size_t len, - bufferlist& bl, - Mutex &lock, - objectrev_t rev=0) { - Objecter::OSDWrite *wr = new Objecter::OSDWrite(bl); - filer.file_to_extents(inode, offset, len, wr->extents); - return atomic_sync_writex(wr, inode.ino, lock); - } - -}; - - -inline ostream& operator<<(ostream& out, ObjectCacher::BufferHead &bh) -{ - out << "bh[" - << bh.start() << "~" << bh.length() - << " (" << bh.bl.length() << ")" - << " v " << bh.last_write_tid; - if (bh.is_tx()) out << " tx"; - if (bh.is_rx()) out << " rx"; - if (bh.is_dirty()) out << " dirty"; - if (bh.is_clean()) out << " clean"; - if (bh.is_missing()) out << " missing"; - out << "]"; - return out; -} - -inline ostream& operator<<(ostream& out, ObjectCacher::Object &ob) -{ - out << "object[" - << hex << ob.get_oid() << " ino " << ob.get_ino() << dec - << " wr " << ob.last_write_tid << "/" << ob.last_ack_tid << "/" << ob.last_commit_tid; - - switch (ob.lock_state) { - case ObjectCacher::Object::LOCK_WRLOCKING: out << " wrlocking"; break; - case ObjectCacher::Object::LOCK_WRLOCK: out << " wrlock"; break; - case ObjectCacher::Object::LOCK_WRUNLOCKING: out << " wrunlocking"; break; - case ObjectCacher::Object::LOCK_RDLOCKING: out << " rdlocking"; break; - case ObjectCacher::Object::LOCK_RDLOCK: out << " rdlock"; break; - case ObjectCacher::Object::LOCK_RDUNLOCKING: out << " rdunlocking"; break; - } 
- - out << "]"; - return out; -} - -#endif diff --git a/tags/20070517_before_mds_merge/osdc/Objecter.cc b/tags/20070517_before_mds_merge/osdc/Objecter.cc deleted file mode 100644 index 9e49a43ace89b..0000000000000 --- a/tags/20070517_before_mds_merge/osdc/Objecter.cc +++ /dev/null @@ -1,838 +0,0 @@ - -#include "Objecter.h" -#include "osd/OSDMap.h" -#include "mon/MonMap.h" - -#include "msg/Messenger.h" -#include "msg/Message.h" - -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" - -#include "messages/MOSDFailure.h" - -#include - -#include "config.h" -#undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cout << g_clock.now() << " " << messenger->get_myname() << ".objecter " -#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cerr << g_clock.now() << " " << messenger->get_myname() << ".objecter " - - -// messages ------------------------------ - -void Objecter::dispatch(Message *m) -{ - switch (m->get_type()) { - case MSG_OSD_OPREPLY: - handle_osd_op_reply((MOSDOpReply*)m); - break; - - case MSG_OSD_MAP: - handle_osd_map((MOSDMap*)m); - break; - - default: - dout(1) << "don't know message type " << m->get_type() << endl; - assert(0); - } -} - -void Objecter::handle_osd_map(MOSDMap *m) -{ - assert(osdmap); - - if (m->get_last() <= osdmap->get_epoch()) { - dout(3) << "handle_osd_map ignoring epochs [" - << m->get_first() << "," << m->get_last() - << "] <= " << osdmap->get_epoch() << endl; - } - else { - dout(3) << "handle_osd_map got epochs [" - << m->get_first() << "," << m->get_last() - << "] > " << osdmap->get_epoch() - << endl; - - set changed_pgs; - - for (epoch_t e = osdmap->get_epoch() + 1; - e <= m->get_last(); - e++) { - if (m->incremental_maps.count(e)) { - dout(3) << "handle_osd_map decoding incremental epoch " << e << endl; - OSDMap::Incremental inc; - int off = 0; - inc.decode(m->incremental_maps[e], off); - 
osdmap->apply_incremental(inc); - - // notify messenger - for (map::iterator i = inc.new_down.begin(); - i != inc.new_down.end(); - i++) - messenger->mark_down(i->second.addr); - - } - else if (m->maps.count(e)) { - dout(3) << "handle_osd_map decoding full epoch " << e << endl; - osdmap->decode(m->maps[e]); - } - else { - dout(3) << "handle_osd_map requesting missing epoch " << osdmap->get_epoch()+1 << endl; - int mon = monmap->pick_mon(); - messenger->send_message(new MOSDGetMap(osdmap->get_epoch()), - monmap->get_inst(mon)); - break; - } - - // scan pgs for changes - scan_pgs(changed_pgs); - - assert(e == osdmap->get_epoch()); - } - - // kick requests who might be timing out on the wrong osds - if (!changed_pgs.empty()) - kick_requests(changed_pgs); - } - - delete m; -} - -void Objecter::scan_pgs(set& changed_pgs) -{ - dout(10) << "scan_pgs" << endl; - - for (hash_map::iterator i = pg_map.begin(); - i != pg_map.end(); - i++) { - pg_t pgid = i->first; - PG& pg = i->second; - - // calc new. - vector other; - osdmap->pg_to_acting_osds(pgid, other); - - if (other == pg.acting) - continue; // no change. - - other.swap(pg.acting); - - if (g_conf.osd_rep == OSD_REP_PRIMARY) { - // same primary? - if (!other.empty() && - !pg.acting.empty() && - other[0] == pg.acting[0]) - continue; - } - else if (g_conf.osd_rep == OSD_REP_SPLAY) { - // same primary and acker? - if (!other.empty() && - !pg.acting.empty() && - other[0] == pg.acting[0] && - other[other.size() > 1 ? 1:0] == pg.acting[pg.acting.size() > 1 ? 1:0]) - continue; - } - else if (g_conf.osd_rep == OSD_REP_CHAIN) { - // any change is significant. - } - - // changed significantly. 
- dout(10) << "scan_pgs pg " << pgid - << " (" << pg.active_tids << ")" - << " " << other << " -> " << pg.acting - << endl; - changed_pgs.insert(pgid); - } -} - -void Objecter::kick_requests(set& changed_pgs) -{ - dout(10) << "kick_requests in pgs " << changed_pgs << endl; - - for (set::iterator i = changed_pgs.begin(); - i != changed_pgs.end(); - i++) { - pg_t pgid = *i; - PG& pg = pg_map[pgid]; - - // resubmit ops! - set tids; - tids.swap( pg.active_tids ); - close_pg( pgid ); // will pbly reopen, unless it's just commits we're missing - - for (set::iterator p = tids.begin(); - p != tids.end(); - p++) { - tid_t tid = *p; - - if (op_modify.count(tid)) { - OSDModify *wr = op_modify[tid]; - op_modify.erase(tid); - - // WRITE - if (wr->tid_version.count(tid)) { - if (wr->op == OSD_OP_WRITE && - !g_conf.objecter_buffer_uncommitted) { - dout(0) << "kick_requests missing commit, cannot replay: objecter_buffer_uncommitted == FALSE" << endl; - } else { - dout(0) << "kick_requests missing commit, replay write " << tid - << " v " << wr->tid_version[tid] << endl; - modifyx_submit(wr, wr->waitfor_commit[tid], tid); - } - } - else if (wr->waitfor_ack.count(tid)) { - dout(0) << "kick_requests missing ack, resub write " << tid << endl; - modifyx_submit(wr, wr->waitfor_ack[tid], tid); - } - } - - else if (op_read.count(tid)) { - // READ - OSDRead *rd = op_read[tid]; - op_read.erase(tid); - dout(0) << "kick_requests resub read " << tid << endl; - - // resubmit - readx_submit(rd, rd->ops[tid]); - rd->ops.erase(tid); - } - - else if (op_stat.count(tid)) { - OSDStat *st = op_stat[tid]; - op_stat.erase(tid); - - dout(0) << "kick_requests resub stat " << tid << endl; - - // resubmit - stat_submit(st); - } - - else - assert(0); - } - } -} - - - -void Objecter::handle_osd_op_reply(MOSDOpReply *m) -{ - // read or modify? 
- switch (m->get_op()) { - case OSD_OP_READ: - handle_osd_read_reply(m); - break; - - case OSD_OP_STAT: - handle_osd_stat_reply(m); - break; - - case OSD_OP_WRNOOP: - case OSD_OP_WRITE: - case OSD_OP_ZERO: - case OSD_OP_DELETE: - case OSD_OP_WRUNLOCK: - case OSD_OP_WRLOCK: - case OSD_OP_RDLOCK: - case OSD_OP_RDUNLOCK: - case OSD_OP_UPLOCK: - case OSD_OP_DNLOCK: - handle_osd_modify_reply(m); - break; - - default: - assert(0); - } -} - - - -// stat ----------------------------------- - -tid_t Objecter::stat(object_t oid, off_t *size, Context *onfinish, - objectrev_t rev) -{ - OSDStat *st = new OSDStat(size); - st->extents.push_back(ObjectExtent(oid, 0, 0)); - st->extents.front().pgid = osdmap->object_to_pg( oid, g_OSD_FileLayout ); - st->extents.front().rev = rev; - st->onfinish = onfinish; - - return stat_submit(st); -} - -tid_t Objecter::stat_submit(OSDStat *st) -{ - // find OSD - ObjectExtent &ex = st->extents.front(); - PG &pg = get_pg( ex.pgid ); - - // pick tid - last_tid++; - assert(client_inc >= 0); - - // add to gather set - st->tid = last_tid; - op_stat[last_tid] = st; - - pg.active_tids.insert(last_tid); - - // send? - dout(10) << "stat_submit " << st << " tid " << last_tid - << " oid " << ex.oid - << " pg " << ex.pgid - << " osd" << pg.acker() - << endl; - - if (pg.acker() >= 0) { - MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, last_tid, - ex.oid, ex.pgid, osdmap->get_epoch(), - OSD_OP_STAT); - - messenger->send_message(m, osdmap->get_inst(pg.acker())); - } - - return last_tid; -} - -void Objecter::handle_osd_stat_reply(MOSDOpReply *m) -{ - // get pio - tid_t tid = m->get_tid(); - - if (op_stat.count(tid) == 0) { - dout(7) << "handle_osd_stat_reply " << tid << " ... 
stray" << endl; - delete m; - return; - } - - dout(7) << "handle_osd_stat_reply " << tid - << " r=" << m->get_result() - << " size=" << m->get_object_size() - << endl; - OSDStat *st = op_stat[ tid ]; - op_stat.erase( tid ); - - // remove from osd/tid maps - PG& pg = get_pg( m->get_pg() ); - assert(pg.active_tids.count(tid)); - pg.active_tids.erase(tid); - if (pg.active_tids.empty()) close_pg( m->get_pg() ); - - // success? - if (m->get_result() == -EAGAIN) { - dout(7) << " got -EAGAIN, resubmitting" << endl; - stat_submit(st); - delete m; - return; - } - //assert(m->get_result() >= 0); - - // ok! - if (m->get_result() < 0) { - *st->size = -1; - } else { - *st->size = m->get_object_size(); - } - - // finish, clean up - Context *onfinish = st->onfinish; - - // done - delete st; - if (onfinish) { - onfinish->finish(m->get_result()); - delete onfinish; - } - - delete m; -} - - -// read ----------------------------------- - - -tid_t Objecter::read(object_t oid, off_t off, size_t len, bufferlist *bl, - Context *onfinish, - objectrev_t rev) -{ - OSDRead *rd = new OSDRead(bl); - rd->extents.push_back(ObjectExtent(oid, off, len)); - rd->extents.front().pgid = osdmap->object_to_pg( oid, g_OSD_FileLayout ); - rd->extents.front().rev = rev; - readx(rd, onfinish); - return last_tid; -} - - -tid_t Objecter::readx(OSDRead *rd, Context *onfinish) -{ - rd->onfinish = onfinish; - - // issue reads - for (list::iterator it = rd->extents.begin(); - it != rd->extents.end(); - it++) - readx_submit(rd, *it); - - return last_tid; -} - -tid_t Objecter::readx_submit(OSDRead *rd, ObjectExtent &ex) -{ - // find OSD - PG &pg = get_pg( ex.pgid ); - - // pick tid - last_tid++; - assert(client_inc >= 0); - - // add to gather set - rd->ops[last_tid] = ex; - op_read[last_tid] = rd; - - pg.active_tids.insert(last_tid); - - // send? 
- dout(10) << "readx_submit " << rd << " tid " << last_tid - << " oid " << ex.oid << " " << ex.start << "~" << ex.length - << " (" << ex.buffer_extents.size() << " buffer fragments)" - << " pg " << ex.pgid - << " osd" << pg.acker() - << endl; - - if (pg.acker() >= 0) { - MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, last_tid, - ex.oid, ex.pgid, osdmap->get_epoch(), - OSD_OP_READ); - m->set_length(ex.length); - m->set_offset(ex.start); - - messenger->send_message(m, osdmap->get_inst(pg.acker())); - } - - return last_tid; -} - - -void Objecter::handle_osd_read_reply(MOSDOpReply *m) -{ - // get pio - tid_t tid = m->get_tid(); - - if (op_read.count(tid) == 0) { - dout(7) << "handle_osd_read_reply " << tid << " ... stray" << endl; - delete m; - return; - } - - dout(7) << "handle_osd_read_reply " << tid << endl; - OSDRead *rd = op_read[ tid ]; - op_read.erase( tid ); - - // remove from osd/tid maps - PG& pg = get_pg( m->get_pg() ); - assert(pg.active_tids.count(tid)); - pg.active_tids.erase(tid); - if (pg.active_tids.empty()) close_pg( m->get_pg() ); - - // our op finished - rd->ops.erase(tid); - - // success? - if (m->get_result() == -EAGAIN) { - dout(7) << " got -EAGAIN, resubmitting" << endl; - readx_submit(rd, rd->ops[tid]); - delete m; - return; - } - //assert(m->get_result() >= 0); - - // what buffer offset are we? - dout(7) << " got frag from " << m->get_oid() << " " - << m->get_offset() << "~" << m->get_length() - << ", still have " << rd->ops.size() << " more ops" << endl; - - if (rd->ops.empty()) { - // all done - size_t bytes_read = 0; - - if (rd->read_data.size()) { - dout(15) << " assembling frags" << endl; - - /** FIXME This doesn't handle holes efficiently. - * It allocates zero buffers to fill whole buffer, and - * then discards trailing ones at the end. - * - * Actually, this whole thing is pretty messy with temporary bufferlist*'s all over - * the heap. - */ - - // we have other fragments, assemble them all... blech! 
- rd->read_data[m->get_oid()] = new bufferlist; - rd->read_data[m->get_oid()]->claim( m->get_data() ); - - // map extents back into buffer - map by_off; // buffer offset -> bufferlist - - // for each object extent... - for (list::iterator eit = rd->extents.begin(); - eit != rd->extents.end(); - eit++) { - bufferlist *ox_buf = rd->read_data[eit->oid]; - unsigned ox_len = ox_buf->length(); - unsigned ox_off = 0; - assert(ox_len <= eit->length); - - // for each buffer extent we're mapping into... - for (map::iterator bit = eit->buffer_extents.begin(); - bit != eit->buffer_extents.end(); - bit++) { - dout(21) << " object " << eit->oid << " extent " << eit->start << "~" << eit->length << " : ox offset " << ox_off << " -> buffer extent " << bit->first << "~" << bit->second << endl; - by_off[bit->first] = new bufferlist; - - if (ox_off + bit->second <= ox_len) { - // we got the whole bx - by_off[bit->first]->substr_of(*ox_buf, ox_off, bit->second); - if (bytes_read < bit->first + bit->second) - bytes_read = bit->first + bit->second; - } else if (ox_off + bit->second > ox_len && ox_off < ox_len) { - // we got part of this bx - by_off[bit->first]->substr_of(*ox_buf, ox_off, (ox_len-ox_off)); - if (bytes_read < bit->first + ox_len-ox_off) - bytes_read = bit->first + ox_len-ox_off; - - // zero end of bx - dout(21) << " adding some zeros to the end " << ox_off + bit->second-ox_len << endl; - bufferptr z(ox_off + bit->second - ox_len); - z.zero(); - by_off[bit->first]->append( z ); - } else { - // we got none of this bx. zero whole thing. 
- assert(ox_off >= ox_len); - dout(21) << " adding all zeros for this bit " << bit->second << endl; - bufferptr z(bit->second); - z.zero(); - by_off[bit->first]->append( z ); - } - ox_off += bit->second; - } - assert(ox_off == eit->length); - } - - // sort and string bits together - for (map::iterator it = by_off.begin(); - it != by_off.end(); - it++) { - assert(it->second->length()); - if (it->first < (off_t)bytes_read) { - dout(21) << " concat buffer frag off " << it->first << " len " << it->second->length() << endl; - rd->bl->claim_append(*(it->second)); - } else { - dout(21) << " NO concat zero buffer frag off " << it->first << " len " << it->second->length() << endl; - } - delete it->second; - } - - // trim trailing zeros? - if (rd->bl->length() > bytes_read) { - dout(10) << " trimming off trailing zeros . bytes_read=" << bytes_read - << " len=" << rd->bl->length() << endl; - rd->bl->splice(bytes_read, rd->bl->length() - bytes_read); - assert(bytes_read == rd->bl->length()); - } - - // hose p->read_data bufferlist*'s - for (map::iterator it = rd->read_data.begin(); - it != rd->read_data.end(); - it++) { - delete it->second; - } - } else { - dout(15) << " only one frag" << endl; - - // only one fragment, easy - rd->bl->claim( m->get_data() ); - bytes_read = rd->bl->length(); - } - - // finish, clean up - Context *onfinish = rd->onfinish; - - dout(7) << " " << bytes_read << " bytes " - << rd->bl->length() - << endl; - - // done - delete rd; - if (onfinish) { - onfinish->finish(bytes_read);// > 0 ? 
bytes_read:m->get_result()); - delete onfinish; - } - } else { - // store my bufferlist for later assembling - rd->read_data[m->get_oid()] = new bufferlist; - rd->read_data[m->get_oid()]->claim( m->get_data() ); - } - - delete m; -} - - - -// write ------------------------------------ - -tid_t Objecter::write(object_t oid, off_t off, size_t len, bufferlist &bl, - Context *onack, Context *oncommit, - objectrev_t rev) -{ - OSDWrite *wr = new OSDWrite(bl); - wr->extents.push_back(ObjectExtent(oid, off, len)); - wr->extents.front().pgid = osdmap->object_to_pg( oid, g_OSD_FileLayout ); - wr->extents.front().buffer_extents[0] = len; - wr->extents.front().rev = rev; - modifyx(wr, onack, oncommit); - return last_tid; -} - - -// zero - -tid_t Objecter::zero(object_t oid, off_t off, size_t len, - Context *onack, Context *oncommit, - objectrev_t rev) -{ - OSDModify *z = new OSDModify(OSD_OP_ZERO); - z->extents.push_back(ObjectExtent(oid, off, len)); - z->extents.front().pgid = osdmap->object_to_pg( oid, g_OSD_FileLayout ); - z->extents.front().rev = rev; - modifyx(z, onack, oncommit); - return last_tid; -} - - -// lock ops - -tid_t Objecter::lock(int op, object_t oid, - Context *onack, Context *oncommit) -{ - OSDModify *l = new OSDModify(op); - l->extents.push_back(ObjectExtent(oid, 0, 0)); - l->extents.front().pgid = osdmap->object_to_pg( oid, g_OSD_FileLayout ); - modifyx(l, onack, oncommit); - return last_tid; -} - - - -// generic modify ----------------------------------- - -tid_t Objecter::modifyx(OSDModify *wr, Context *onack, Context *oncommit) -{ - wr->onack = onack; - wr->oncommit = oncommit; - - // issue writes/whatevers - for (list::iterator it = wr->extents.begin(); - it != wr->extents.end(); - it++) - modifyx_submit(wr, *it); - - return last_tid; -} - - -tid_t Objecter::modifyx_submit(OSDModify *wr, ObjectExtent &ex, tid_t usetid) -{ - // find - PG &pg = get_pg( ex.pgid ); - - // pick tid - tid_t tid; - if (usetid > 0) - tid = usetid; - else - tid = ++last_tid; - 
- // add to gather set - wr->waitfor_ack[tid] = ex; - wr->waitfor_commit[tid] = ex; - op_modify[tid] = wr; - pg.active_tids.insert(tid); - - ++num_unacked; - ++num_uncommitted; - - // send? - dout(10) << "modifyx_submit " << MOSDOp::get_opname(wr->op) << " tid " << tid - << " oid " << ex.oid - << " " << ex.start << "~" << ex.length - << " pg " << ex.pgid - << " osd" << pg.primary() - << endl; - if (pg.primary() >= 0) { - MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, tid, - ex.oid, ex.pgid, osdmap->get_epoch(), - wr->op); - m->set_length(ex.length); - m->set_offset(ex.start); - m->set_rev(ex.rev); - - if (wr->tid_version.count(tid)) - m->set_version(wr->tid_version[tid]); // we're replaying this op! - - // what type of op? - switch (wr->op) { - case OSD_OP_WRITE: - { - // map buffer segments into this extent - // (may be fragmented bc of striping) - bufferlist cur; - for (map::iterator bit = ex.buffer_extents.begin(); - bit != ex.buffer_extents.end(); - bit++) { - bufferlist thisbit; - thisbit.substr_of(((OSDWrite*)wr)->bl, bit->first, bit->second); - cur.claim_append(thisbit); - } - assert(cur.length() == ex.length); - m->set_data(cur);//.claim(cur); - } - break; - } - - messenger->send_message(m, osdmap->get_inst(pg.primary())); - } - - dout(5) << num_unacked << " unacked, " << num_uncommitted << " uncommitted" << endl; - - return tid; -} - - - -void Objecter::handle_osd_modify_reply(MOSDOpReply *m) -{ - // get pio - tid_t tid = m->get_tid(); - - if (op_modify.count(tid) == 0) { - dout(7) << "handle_osd_modify_reply " << tid - << (m->get_commit() ? " commit":" ack") - << " ... stray" << endl; - delete m; - return; - } - - dout(7) << "handle_osd_modify_reply " << tid - << (m->get_commit() ? " commit":" ack") - << " v " << m->get_version() - << endl; - OSDModify *wr = op_modify[ tid ]; - - Context *onack = 0; - Context *oncommit = 0; - - PG &pg = get_pg( m->get_pg() ); - - // ignore? 
- if (pg.acker() != m->get_source().num()) { - dout(7) << " ignoring ack|commit from non-acker" << endl; - delete m; - return; - } - - assert(m->get_result() >= 0); - - // ack or commit? - if (m->get_commit()) { - //dout(15) << " handle_osd_write_reply commit on " << tid << endl; - assert(wr->tid_version.count(tid) == 0 || - m->get_version() == wr->tid_version[tid]); - - // remove from tid/osd maps - assert(pg.active_tids.count(tid)); - pg.active_tids.erase(tid); - if (pg.active_tids.empty()) close_pg( m->get_pg() ); - - // commit. - op_modify.erase( tid ); - wr->waitfor_ack.erase(tid); - wr->waitfor_commit.erase(tid); - - num_uncommitted--; - - if (wr->waitfor_commit.empty()) { - onack = wr->onack; - oncommit = wr->oncommit; - delete wr; - } - } else { - // ack. - //dout(15) << " handle_osd_write_reply ack on " << tid << endl; - assert(wr->waitfor_ack.count(tid)); - wr->waitfor_ack.erase(tid); - - num_unacked--; - - if (wr->tid_version.count(tid) && - wr->tid_version[tid].version != m->get_version().version) { - dout(-10) << "handle_osd_modify_reply WARNING: replay of tid " << tid - << " did not achieve previous ordering" << endl; - } - wr->tid_version[tid] = m->get_version(); - - if (wr->waitfor_ack.empty()) { - onack = wr->onack; - wr->onack = 0; // only do callback once - - // buffer uncommitted? - if (!g_conf.objecter_buffer_uncommitted && - wr->op == OSD_OP_WRITE) { - // discard buffer! 
- ((OSDWrite*)wr)->bl.clear(); - } - } - } - - // do callbacks - if (onack) { - onack->finish(0); - delete onack; - } - if (oncommit) { - oncommit->finish(0); - delete oncommit; - } - - delete m; -} - - - -void Objecter::ms_handle_failure(Message *m, entity_name_t dest, const entity_inst_t& inst) -{ - if (dest.is_mon()) { - // try a new mon - int mon = monmap->pick_mon(true); - dout(0) << "ms_handle_failure " << dest << " inst " << inst - << ", resending to mon" << mon - << endl; - messenger->send_message(m, monmap->get_inst(mon)); - } - else if (dest.is_osd()) { - int mon = monmap->pick_mon(); - dout(0) << "ms_handle_failure " << dest << " inst " << inst - << ", dropping and reporting to mon" << mon - << endl; - messenger->send_message(new MOSDFailure(inst, osdmap->get_epoch()), - monmap->get_inst(mon)); - delete m; - } else { - dout(0) << "ms_handle_failure " << dest << " inst " << inst - << ", dropping" << endl; - delete m; - } -} diff --git a/tags/20070517_before_mds_merge/osdc/Objecter.h b/tags/20070517_before_mds_merge/osdc/Objecter.h deleted file mode 100644 index 741db052a21ea..0000000000000 --- a/tags/20070517_before_mds_merge/osdc/Objecter.h +++ /dev/null @@ -1,197 +0,0 @@ -#ifndef __OBJECTER_H -#define __OBJECTER_H - -#include "include/types.h" -#include "include/buffer.h" - -#include "osd/OSDMap.h" -#include "messages/MOSDOp.h" - -#include -#include -#include -using namespace std; -using namespace __gnu_cxx; - -class Context; -class Messenger; -class OSDMap; -class MonMap; -class Message; - -class Objecter { - public: - Messenger *messenger; - MonMap *monmap; - OSDMap *osdmap; - - private: - tid_t last_tid; - int client_inc; - int num_unacked; - int num_uncommitted; - - /*** track pending operations ***/ - // read - public: - class OSDOp { - public: - list extents; - virtual ~OSDOp() {} - }; - - class OSDRead : public OSDOp { - public: - bufferlist *bl; - Context *onfinish; - map ops; - map read_data; // bits of data as they come back - - 
OSDRead(bufferlist *b) : bl(b), onfinish(0) { - bl->clear(); - } - }; - - class OSDStat : public OSDOp { - public: - tid_t tid; - off_t *size; // where the size goes. - Context *onfinish; - OSDStat(off_t *s) : tid(0), size(s), onfinish(0) { } - }; - - // generic modify - class OSDModify : public OSDOp { - public: - int op; - list extents; - Context *onack; - Context *oncommit; - map waitfor_ack; - map tid_version; - map waitfor_commit; - - OSDModify(int o) : op(o), onack(0), oncommit(0) {} - }; - - // write (includes the bufferlist) - class OSDWrite : public OSDModify { - public: - bufferlist bl; - OSDWrite(bufferlist &b) : OSDModify(OSD_OP_WRITE), bl(b) {} - }; - - - - private: - // pending ops - hash_map op_stat; - hash_map op_read; - hash_map op_modify; - - /** - * track pending ops by pg - * ...so we can cope with failures, map changes - */ - class PG { - public: - vector acting; - set active_tids; // active ops - - PG() {} - - // primary - where i write - int primary() { - if (acting.empty()) return -1; - return acting[0]; - } - // acker - where i read, and receive acks from - int acker() { - if (acting.empty()) return -1; - if (g_conf.osd_rep == OSD_REP_PRIMARY) - return acting[0]; - else - return acting[acting.size() > 1 ? 
1:0]; - } - }; - - hash_map pg_map; - - - PG &get_pg(pg_t pgid) { - if (!pg_map.count(pgid)) - osdmap->pg_to_acting_osds(pgid, pg_map[pgid].acting); - return pg_map[pgid]; - } - void close_pg(pg_t pgid) { - assert(pg_map.count(pgid)); - assert(pg_map[pgid].active_tids.empty()); - pg_map.erase(pgid); - } - void scan_pgs(set& chnaged_pgs); - void kick_requests(set& changed_pgs); - - - public: - Objecter(Messenger *m, MonMap *mm, OSDMap *om) : - messenger(m), monmap(mm), osdmap(om), - last_tid(0), client_inc(-1), - num_unacked(0), num_uncommitted(0) - {} - ~Objecter() { - // clean up op_* - // *** - } - - // messages - public: - void dispatch(Message *m); - void handle_osd_op_reply(class MOSDOpReply *m); - void handle_osd_stat_reply(class MOSDOpReply *m); - void handle_osd_read_reply(class MOSDOpReply *m); - void handle_osd_modify_reply(class MOSDOpReply *m); - void handle_osd_lock_reply(class MOSDOpReply *m); - void handle_osd_map(class MOSDMap *m); - - private: - tid_t readx_submit(OSDRead *rd, ObjectExtent& ex); - tid_t modifyx_submit(OSDModify *wr, ObjectExtent& ex, tid_t tid=0); - tid_t stat_submit(OSDStat *st); - - // public interface - public: - bool is_active() { - return !(op_read.empty() && op_modify.empty()); - } - - int get_client_incarnation() { return client_inc; } - void set_client_incarnation(int inc) { - client_inc = inc; - } - - // med level - tid_t readx(OSDRead *read, Context *onfinish); - tid_t modifyx(OSDModify *wr, Context *onack, Context *oncommit); - //tid_t lockx(OSDLock *l, Context *onack, Context *oncommit); - - // even lazier - tid_t read(object_t oid, off_t off, size_t len, bufferlist *bl, - Context *onfinish, - objectrev_t rev=0); - tid_t write(object_t oid, off_t off, size_t len, bufferlist &bl, - Context *onack, Context *oncommit, - objectrev_t rev=0); - tid_t zero(object_t oid, off_t off, size_t len, - Context *onack, Context *oncommit, - objectrev_t rev=0); - tid_t stat(object_t oid, off_t *size, Context *onfinish, - objectrev_t 
rev=0); - - tid_t lock(int op, object_t oid, Context *onack, Context *oncommit); - - - void ms_handle_failure(Message *m, entity_name_t dest, const entity_inst_t& inst); - -}; - -#endif diff --git a/tags/20070517_before_mds_merge/script/add_header.pl b/tags/20070517_before_mds_merge/script/add_header.pl deleted file mode 100755 index f5891cc668c45..0000000000000 --- a/tags/20070517_before_mds_merge/script/add_header.pl +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/perl - -use strict; -my $fn = shift @ARGV; -my $f = `cat $fn`; - -my $header = '// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -'; - -unless ($f =~ /Ceph - scalable distributed file system/) { - open(O, ">$fn.new"); - print O $header; - print O $f; - close O; - rename "$fn.new", $fn; -} - diff --git a/tags/20070517_before_mds_merge/script/adjusttabs.pl b/tags/20070517_before_mds_merge/script/adjusttabs.pl deleted file mode 100755 index 66edff2ac6c02..0000000000000 --- a/tags/20070517_before_mds_merge/script/adjusttabs.pl +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/perl - -my $tablen = shift @ARGV; -my $fn = shift @ARGV; - -my $tab = ' ' x $tablen; -open(I, $fn); -my $f; -my $oldtab = ' ' x 4; -while () { - if (my ($oldlen) = /\-\*\- .*tab-width:(\d)/) { - print "old length was $oldlen\n"; - $oldtab = ' ' x $oldlen; - s/tab-width:\d/tab-width:$tablen/; - } - s/\t/$oldtab/g; - $f .= $_; -} -close I; -open(O, ">$fn.new"); -print O $f; -close O; - -rename "$fn.new", $fn; diff --git a/tags/20070517_before_mds_merge/script/clean_osd_cow.sh b/tags/20070517_before_mds_merge/script/clean_osd_cow.sh deleted file mode 100755 index 1e443c95e7ebc..0000000000000 --- 
a/tags/20070517_before_mds_merge/script/clean_osd_cow.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh - -rm osddata/*/*\.* diff --git a/tags/20070517_before_mds_merge/script/clean_trace.pl b/tags/20070517_before_mds_merge/script/clean_trace.pl deleted file mode 100755 index cb02ff7abe7c2..0000000000000 --- a/tags/20070517_before_mds_merge/script/clean_trace.pl +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/perl - -my $n = 0; -while (<>) { - next unless /trace: /; - my $l = $'; $'; - print $l; -} diff --git a/tags/20070517_before_mds_merge/script/comb.pl b/tags/20070517_before_mds_merge/script/comb.pl deleted file mode 100755 index 88a4bb72a7970..0000000000000 --- a/tags/20070517_before_mds_merge/script/comb.pl +++ /dev/null @@ -1,113 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my $xaxis = shift @ARGV; -my @vars; -while (@ARGV) { - $_ = shift @ARGV; - last if ($_ eq '-'); - push(@vars, $_); -} -my @dirs; -while (@ARGV) { - $_ = shift @ARGV; - last if ($_ eq '-'); - push(@dirs, $_) if -d $_; -} -my @filt = @ARGV; -push( @filt, '.' ) unless @filt; - -print "#xaxis $xaxis -#vars @vars -#dirs @dirs -#filt @filt -"; - -sub load_sum { - my $fn = shift @_; - - open(I, "$fn"); - my $k = ; - chomp($k); - my @k = split(/\s+/,$k); - shift @k; - - my $s; - while () { - chomp; - s/^\#//; - next unless $_; - my @l = split(/\s+/,$_); - my $k = shift @l; - for my $f (@k) { - $s->{$k}->{$f} = shift @l; - } - - # clnode latency? 
- if ($fn =~ /cl/) { - $s->{$k}->{'wrlat'} = $s->{$k}->{'wrlsum'} / $s->{$k}->{'wrlnum'} if $s->{$k}->{'wrlnum'} > 0; - $s->{$k}->{'rlat'} = $s->{$k}->{'rlsum'} / $s->{$k}->{'rlnum'} if $s->{$k}->{'rlnum'} > 0; - $s->{$k}->{'lat'} = $s->{$k}->{'lsum'} / $s->{$k}->{'lnum'} if $s->{$k}->{'lnum'} > 0; - $s->{$k}->{'latw'} = $s->{$k}->{'lwsum'} / $s->{$k}->{'lwnum'} if $s->{$k}->{'lwnum'} > 0; - $s->{$k}->{'latr'} = $s->{$k}->{'lrsum'} / $s->{$k}->{'lrnum'} if $s->{$k}->{'lrnum'} > 0; - $s->{$k}->{'statlat'} = $s->{$k}->{'lstatsum'} / $s->{$k}->{'lstatnum'} if $s->{$k}->{'lstatnum'} > 0; - $s->{$k}->{'dirlat'} = $s->{$k}->{'ldirsum'} / $s->{$k}->{'ldirnum'} if $s->{$k}->{'ldirnum'} > 0; - } - } - return $s; -} - - -my %res; -my @key; -my %didkey; -for my $f (@filt) { - my @reg = split(/,/, $f); - #print "reg @reg\n"; - for my $d (@dirs) { - if ($f ne '.') { - my $r = (split(/\//,$d))[-1]; - my @db = split(/,/, $r); - #print "db @db\n"; - my $ok = 1; - for my $r (@reg) { - - $ok = 0 unless grep {$_ eq $r} @db; - } - next unless $ok; - } - #next if ($f ne '.' && $d !~ /$reg/); - #print "$d\n"; - my ($x) = $d =~ /$xaxis=(\d+)/; - - for my $v (@vars) { - my ($what, $field) = $v =~ /^(.+)\.([^\.]+)$/; - #print "$what $field .. $v .. $f.$field\n"; - my $s = &load_sum("$d/sum.$what"); - - #print "\t$v"; - if ($field =~ /^sum=/) { - #warn "SUM field $field\n"; - push( @{$res{$x}}, $s->{'sum'}->{$'} ); #'}); - } else { - #warn "avg field $field\n"; - push( @{$res{$x}}, $s->{'avgval'}->{$field} ); - } - - push( @key, "$f.$field" ) unless $didkey{"$f.$field"}; - $didkey{"$f.$field"} = 1; - - if (0 && exists $s->{'avgvaldevt'}) { - push( @{$res{$x}}, $s->{'avgvaldevt'}->{$field} ); - push( @key, "$f.$field.dev" ) unless $didkey{"$f.$field.dev"}; - $didkey{"$f.$field.dev"} = 1; - } - } - } -} - -print join("\t", "#", @key) . "\n"; -for my $x (sort {$a <=> $b} keys %res) { - print join("\t", $x, @{$res{$x}}) . 
"\n"; -} diff --git a/tags/20070517_before_mds_merge/script/find_auth_pins.pl b/tags/20070517_before_mds_merge/script/find_auth_pins.pl deleted file mode 100755 index c02c12922ed7b..0000000000000 --- a/tags/20070517_before_mds_merge/script/find_auth_pins.pl +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/perl - -my %pin; -my %hist; -my $l = 1; -my @pins; -while (<>) { - - #cdir:adjust_nested_auth_pins on [dir 163 /foo/ rep@13 | child] count now 0 + 1 - - if (/adjust_nested_auth_pins/) { - my ($what) = /\[(\w+ \d+) /; - $hist{$what} .= "$l: $_" - if defined $pin{$what}; - } - - # cinode:auth_pin on inode [1000000002625 /gnu/blah_client_created. 0x89b7700] count now 1 + 0 - - if (/auth_pin /) { - my ($what) = /\[(\w+ \d+) /; -# print "add_waiter $c $what\n"; - $pin{$what}++; - $hist{$what} .= "$l: $_"; - push( @pins, $what ) unless grep {$_ eq $what} @pins; - } - - # cinode:auth_unpin on inode [1000000002625 (dangling) 0x89b7700] count now 0 + 0 - - if (/auth_unpin/) { - my ($what) = /\[(\w+ \d+) /;# / on (.*\])/; - $pin{$what}--; - $hist{$what} .= "$l: $_"; - unless ($pin{$what}) { - delete $hist{$what}; - delete $pin{$what}; - @pins = grep {$_ ne $what} @pins; - } - } - $l++; -} - -for my $what (@pins) { - print "---- count $pin{$what} on $what -$hist{$what} -"; -} diff --git a/tags/20070517_before_mds_merge/script/find_bufferleaks.pl b/tags/20070517_before_mds_merge/script/find_bufferleaks.pl deleted file mode 100755 index 152515d5e788e..0000000000000 --- a/tags/20070517_before_mds_merge/script/find_bufferleaks.pl +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/perl - -use strict; -my %buffers; -my %bufferlists; -my %ref; -my %mal; -my $l = 1; -while (<>) { - #print "$l: $_"; - - # cinode:auth_pin on inode [1000000002625 /gnu/blah_client_created. 
0x89b7700] count now 1 + 0 - - if (/^buffer\.cons /) { - my ($x) = /(0x\S+)/; - $buffers{$x} = 1; - } - if (/^buffer\.des /) { - my ($x) = /(0x\S+)/; - die "des without cons at $l: $_" unless $buffers{$x}; - delete $buffers{$x}; - die "des with ref>0 at $l: $_" unless $ref{$x} == 0; - delete $ref{$x}; - } - - if (/^bufferlist\.cons /) { - my ($x) = /(0x\S+)/; - $bufferlists{$x} = 1; - } - if (/^bufferlist\.des /) { - my ($x) = /(0x\S+)/; - warn "des without cons at $l: $_" unless $bufferlists{$x}; - delete $bufferlists{$x}; - } - - - if (/^buffer\.malloc /) { - my ($x) = /(0x\S+)/; - $mal{$x} = 1; - } - if (/^buffer\.free /) { - my ($x) = /(0x\S+)/; - die "free with malloc at $l: $_" unless $mal{$x}; - delete $mal{$x}; - } - - if (/^buffer\.get /) { - my ($x) = /(0x\S+)/; - $ref{$x}++; - } - if (/^buffer\.get /) { - my ($x) = /(0x\S+)/; - $ref{$x}--; - } - -$l++; -} - -for my $x (keys %bufferlists) { - print "leaked bufferlist $x\n"; -} - -for my $x (keys %buffers) { - print "leaked buffer $x ref $ref{$x}\n"; -} - -for my $x (keys %mal) { - print "leaked buffer dataptr $x ref $ref{$x}\n"; -} diff --git a/tags/20070517_before_mds_merge/script/find_lost_bdev_ops.pl b/tags/20070517_before_mds_merge/script/find_lost_bdev_ops.pl deleted file mode 100755 index ac1793b42dfac..0000000000000 --- a/tags/20070517_before_mds_merge/script/find_lost_bdev_ops.pl +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/perl - -use strict; -my %op; - -my $line = 0; -while (<>) { - #print $line . 
$_ if /0x8d4f6a0/; - chomp; - $line++; - - #bdev(./ebofsdev/0)._submit_io bio(wr 269~1 usemap 0x4de33cc0) - if (my ($bio) = /_submit_io bio\(.*(0x\w+)\)/) { - $op{$bio} = $line; - } - - # cancel - #bdev(./ebofsdev/3)._cancel_io bio(wr 1525~1 bh_write 0x8a437b8) - if (my ($bio) = /_cancel_io bio\(.*(0x\w+)\)/ && - !(/FAILED/)) { - delete $op{$bio}; - } - - # finish - #bdev(./ebofsdev/3).complete_thread finishing bio(wr 1131~1 write_cnode 0x832c1f8) - if (my ($bio) = /complete_thread finishing bio\(.*(0x\w+)\)/) { - delete $op{$bio}; - } - -} - -for my $bio (keys %op) { - print "---- lost bio $bio\n"; -} diff --git a/tags/20070517_before_mds_merge/script/find_lost_commit.pl b/tags/20070517_before_mds_merge/script/find_lost_commit.pl deleted file mode 100755 index 73934248ad5c0..0000000000000 --- a/tags/20070517_before_mds_merge/script/find_lost_commit.pl +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/perl - -use strict; -my %op; - -my $line = 0; -while (<>) { - #print "$line: $_"; - $line++; - - #osd3 do_op MOSDOp(client0.933 oid 100000000000008 0x84b4480) in pg[pginfo(4020000000d v 5662/0 e 2/1) r=0 active (0,5662]] - if (my ($from, $opno, $oid, $op) = /do_op MOSDOp\((\S+) op (\d+) oid (\d+) (\w+)\)/) { -# print "$op\n"; - if ($opno == 2 || $opno == 11 || $opno == 12 || $opno == 14 || $opno == 15) { - $op{$op} = $from; - } - } - - # commits - #osd1 op_modify_commit on op MOSDOp(client1.289 oid 100000100000002 0x51a2f788) - if (my ($op) = /op_modify_commit.* (\w+)\)/) { - delete $op{$op}; - } - #osd4 rep_modify_commit on op MOSDOp(osd3.289 oid 100000000000008 0x84b0980) - if (my ($op) = /rep_modify_commit.* (\w+)\)/) { - delete $op{$op}; - } - - # forwarded? 
- if (my ($op) = /sending (\w+) to osd/) { - delete $op{$op}; - } - -} - -for my $op (keys %op) { - print "---- lost op $op $op{$op}\n"; -} diff --git a/tags/20070517_before_mds_merge/script/find_lost_objecter.pl b/tags/20070517_before_mds_merge/script/find_lost_objecter.pl deleted file mode 100755 index a0c2089140e23..0000000000000 --- a/tags/20070517_before_mds_merge/script/find_lost_objecter.pl +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/perl - -use strict; -my %ack; -my %commit; - -my $line = 0; -while (<>) { - #print "$line: $_"; - $line++; - - #client0.objecter writex_submit tid 21 osd0 oid 100000000000001 851424~100000 - if (my ($who, $tid) = /(\S+)\.objecter writex_submit tid\D+(\d+)\D+osd/) { -# print "$who.$tid\n"; - $ack{"$who.$tid"} = $line; - $commit{"$who.$tid"} = $line; - } - - #client1.objecter handle_osd_write_reply 304 commit 0 - #client1.objecter handle_osd_write_reply 777 commit 1 - if (my ($who, $tid, $commit) = /(\S+)\.objecter handle_osd_write_reply\D+(\d+)\D+commit\D+(\d)/) { -# print "$who.$tid\n"; - delete $ack{"$who.$tid"}; - delete $commit{"$who.$tid"} if $commit; - } - -} - -for my $op (keys %commit) { - print "---- lost commit $op $commit{$op}\n"; -} -for my $op (keys %ack) { - print "---- lost ack $op $commit{$op}\n"; -} diff --git a/tags/20070517_before_mds_merge/script/find_pathpins.pl b/tags/20070517_before_mds_merge/script/find_pathpins.pl deleted file mode 100755 index e4a7d81dfb7b7..0000000000000 --- a/tags/20070517_before_mds_merge/script/find_pathpins.pl +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/perl - -my %pin; -my %hist; -my $l = 1; -my @pins; -while (<>) { - - # cinode:auth_pin on inode [1000000002625 /gnu/blah_client_created. 
0x89b7700] count now 1 + 0 - - if (/path_pinned /) { - my ($dname, $dir) = /\[dentry (\S+) .* in \[dir (\d+) /; - $what = "$dname $dir"; - #print "$l pin $what\n"; - $pin{$what}++; - $hist{$what} .= "$l: $_"; - push( @pins, $what ) unless grep {$_ eq $what} @pins; - } - - # cinode:auth_unpin on inode [1000000002625 (dangling) 0x89b7700] count now 0 + 0 - - if (/path_unpinned/) { - my ($dname, $dir) = /\[dentry (\S+) .* in \[dir (\d+) /; - $what = "$dname $dir"; - #print "$l unpin $what\n"; - $pin{$what}--; - $hist{$what} .= "$l: $_"; - unless ($pin{$what}) { - delete $hist{$what}; - delete $pin{$what}; - @pins = grep {$_ ne $what} @pins; - } - } - $l++; -} - -for my $what (@pins) { - print "---- count $pin{$what} on $what -$hist{$what} -"; -} diff --git a/tags/20070517_before_mds_merge/script/find_requests.pl b/tags/20070517_before_mds_merge/script/find_requests.pl deleted file mode 100755 index 5144896249413..0000000000000 --- a/tags/20070517_before_mds_merge/script/find_requests.pl +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/perl - -my %waiting; # context => what where what is "inode ..." or "dir ..." -my %hist; # context => history since waited -my @waiting; - -my $line = 0; -while (<>) { - - #print $line . 
$_ if /0x8d4f6a0/; - $line++; - if (/request_start/) { - my ($c) = /(0x\w+)/; - my ($what) = $'; #'; - chomp $what; - #print "$line add_waiter $c $what\n" if /0x8d4f6a0/; - $waiting{$c} = $what - if $what && !$waiting{$c}; - $hist{$c} .= "$line: $_"; - unless (grep {$_ eq $c} @waiting) { - push( @waiting, $c ); - } - } - #if (/finish_waiting/) { - # my ($c) = /(0x\w+)/; - # $hist{$c} .= "$line: $_"; - #} - if (/request_finish/ || - /request_forward/) { - my ($c) = /(0x\w+)/; - #print "took\n" if /0x8d4f6a0/; - delete $waiting{$c}; - delete $hist{$c}; - @waiting = grep {$_ ne $c} @waiting; - } -} - -for my $c (@waiting) { - print "---- lost request $c $waiting{$c} -$hist{$c} -"; -} diff --git a/tags/20070517_before_mds_merge/script/find_waiters.pl b/tags/20070517_before_mds_merge/script/find_waiters.pl deleted file mode 100755 index c89d2b1a49db7..0000000000000 --- a/tags/20070517_before_mds_merge/script/find_waiters.pl +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/perl - -my %waiting; # context => what where what is "inode ..." or "dir ..." -my %hist; # context => history since waited -my @waiting; - -my $line = 0; -while (<>) { - #print $line . 
$_ if /0x8d4f6a0/; - $line++; - if (/add_waiter/) { - my ($c) = /(0x\w+)/; - my ($what) = / on (.*\])/; - #print "$line add_waiter $c $what\n" if /0x8d4f6a0/; - $waiting{$c} = $what - if $what && !$waiting{$c}; - $hist{$c} .= "$line: $_"; - unless (grep {$_ eq $c} @waiting) { - push( @waiting, $c ); - } - } - #if (/finish_waiting/) { - # my ($c) = /(0x\w+)/; - # $hist{$c} .= "$line: $_"; - #} - if (/take_waiting/) { - my ($c) = /(0x\w+)/; - if (/SKIPPING/) { - #print "skipping\n" if /0x8d4f6a0/; - $hist{$c} .= "$line: $_"; - } elsif (/took/) { - #print "took\n" if /0x8d4f6a0/; - delete $waiting{$c}; - delete $hist{$c}; - @waiting = grep {$_ ne $c} @waiting; - } else { - die "i don't understand: $_"; - } - } -} - -for my $c (@waiting) { - print "---- lost waiter $c $waiting{$c} -$hist{$c} -"; -} diff --git a/tags/20070517_before_mds_merge/script/grepblock b/tags/20070517_before_mds_merge/script/grepblock deleted file mode 100755 index f5acf95732abb..0000000000000 --- a/tags/20070517_before_mds_merge/script/grepblock +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my $block = shift ARGV; -die unless int $block; - -while (<>) { - my $yes = 0; - for my $x (/(\d+\~\d+)/) { - my ($s,$l) = split(/\~/,$x); - $yes = 1 if ($block >= $s && $block < $s+$l); - } - print if $yes; -} diff --git a/tags/20070517_before_mds_merge/script/merge_trace_rw.pl b/tags/20070517_before_mds_merge/script/merge_trace_rw.pl deleted file mode 100644 index 378d629ef43f6..0000000000000 --- a/tags/20070517_before_mds_merge/script/merge_trace_rw.pl +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my @file = <>; -sub get_op { - my @op = shift @file; - while (@file && - $file[0] !~ /^[a-z]+$/) { - push( @op, shift @file ); - } - #print "op = ( @op )\n"; - return @op; -} - -my $n = 0; -while (@file) { - my ($op, @args) = &get_op; - while ($op eq "read\n" || - $op eq "write\n") { - die unless scalar(@args) == 3; - my ($nop, @nargs) = &get_op; - if ($nop eq $op - && 
($args[0] == $nargs[0] ) - && ($args[2] + $args[1] == $nargs[2]) - ) { - die unless scalar(@nargs) == 3; - $args[1] += $nargs[1]; - $args[1] .= "\n"; - die unless scalar(@args) == 3; - #print STDOUT "combining $n $op @args\n"; - $n++; - } else { -# print STDERR "not combinging\n"; - unshift( @file, $nop, @nargs ); - die unless scalar(@args) == 3; - last; - } - } - print $op; - print join('', @args); -} diff --git a/tags/20070517_before_mds_merge/script/profonly.pl b/tags/20070517_before_mds_merge/script/profonly.pl deleted file mode 100755 index 6a05dec473ca0..0000000000000 --- a/tags/20070517_before_mds_merge/script/profonly.pl +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/perl - -my $rank = shift @ARGV; -my $args = join(' ',@ARGV); -if ($rank == $ENV{MPD_JRANK}) { - $c = "LD_PRELOAD=$ENV{'HOME'}/csl/obsd/src/pmds/gprof-helper.so ./newsyn $args"; -} else { - $c = "./newsyn.nopg $args"; -} - -#print "$rank: $c\n"; -system $c; diff --git a/tags/20070517_before_mds_merge/script/runset.pl b/tags/20070517_before_mds_merge/script/runset.pl deleted file mode 100755 index 966cf4e5100cb..0000000000000 --- a/tags/20070517_before_mds_merge/script/runset.pl +++ /dev/null @@ -1,380 +0,0 @@ -#!/usr/bin/perl - -use strict; -use Data::Dumper; - -=item sample input file - -# hi there -{ - # startup - 'n' => 30, # mpi nodes - 'sleep' => 10, # seconds between runs - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => 400,#[10, 50, 100, 200, 400], - - # parameters - 'fs' => [ 'ebofs', 'fakestore' ], - 'until' => 150, # --syn until $n ... when to stop clients - 'writefile' => 1, - 'writefile_size' => [ 4096, 65526, 256000, 1024000, 2560000 ], - 'writefile_mb' => 1000, - - 'custom' => '--tcp_skip_rank0 --osd_maxthreads 0'; - - # for final summation (script/sum.pl) - 'start' => 30, - 'end' => 120, - - '_psub' => 'alc.tp' # switch to psub mode! 
-}; - -=cut - -my $usage = "script/runset.pl [--clean] jobs/some/job blah\n"; - -my $clean; -my $use_srun; -my $nobg = '&'; -my $in = shift || die $usage; -if ($in eq '--clean') { - $clean = 1; - $in = shift || die $usage; -} -if ($in eq '--srun') { - $use_srun = 1; - $in = shift || die $usage; -} -if ($in eq '--nobg') { - $nobg = ''; - $in = shift || die $usage; -} -my $tag = shift || die $usage; -my $fake = shift; - - -my ($job) = $in =~ /^jobs\/(.*)/; -my ($jname) = $job =~ /\/(\w+)$/; -$jname ||= $job; -die "not jobs/?" unless defined $job; -my $out = "log/$job.$tag"; -my $relout = "$job.$tag"; - - -my $cwd = `/bin/pwd`; -chomp($cwd); - - - -print "# --- job $job, tag $tag ---\n"; - - -# get input -my $raw = `cat $in`; -my $sim = eval $raw; -unless (ref $sim) { - print "bad input: $in\n"; - system "perl -c $in"; - exit 1; -} - -# prep output -system "mkdir -p $out" unless -d "$out"; - -open(W, ">$out/in"); -print W $raw; -close W; - -my $comb = $sim->{'comb'}; -delete $sim->{'comb'}; -my %filters; -my @fulldirs; - - - -sub reset { - print "reset: restarting mpd in 3 seconds\n"; - system "sleep 3 && (mpiexec -l -n 32 killall newsyn ; restartmpd.sh)"; - print "reset: done\n"; -} - - -if (`hostname` =~ /alc/ && !$use_srun) { - print "# this looks like alc\n"; - $sim->{'_psub'} = 'jobs/alc.tp'; -} - - -sub iterate { - my $sim = shift @_; - my $fix = shift @_ || {}; - my $vary; - my @r; - - my $this; - for my $k (sort keys %$sim) { - next if $k =~ /^_/; - if (defined $fix->{$k}) { - $this->{$k} = $fix->{$k}; - } - elsif (ref $sim->{$k} eq 'HASH') { - # nothing - } - elsif (!(ref $sim->{$k})) { - $this->{$k} = $sim->{$k}; - } - else { - #print ref $sim->{$k}; - if (!(defined $vary)) { - $vary = $k; - } - } - } - - if ($vary) { - #print "vary $vary\n"; - for my $v (@{$sim->{$vary}}) { - $this->{$vary} = $v; - push(@r, &iterate($sim, $this)); - } - } else { - - if ($sim->{'_dep'}) { - my @s = @{$sim->{'_dep'}}; - while (@s) { - my $dv = shift @s; - my $eq = shift @s; - 
- $eq =~ s/\$(\w+)/"\$this->{'$1'}"/eg; - $this->{$dv} = eval $eq; - #print "$dv : $eq -> $this->{$dv}\n"; - } - } - - push(@r, $this); - } - return @r; -} - - - -sub run { - my $h = shift @_; - - my @fn; - my @filt; - my @vals; - for my $k (sort keys %$sim) { - next if $k =~ /^_/; - next unless ref $sim->{$k} eq 'ARRAY'; - push(@fn, "$k=$h->{$k}"); - push(@vals, $h->{$k}); - next if $comb && $k eq $comb->{'x'}; - push(@filt, "$k=$h->{$k}"); - } - my $keys = join(",", @fn); - $keys =~ s/ /_/g; - my $fn = $out . '/' . $keys; - my $name = $jname . '_' . join('_',@vals); #$tag . '_' . $keys; - - push( @fulldirs, "" . $fn ); - - - # filters - $filters{ join(',', @filt) } = 1; - - - #system "sh $fn/sh.post" if -e "$fn/sh.post";# && !(-e "$fn/.post"); - if (-e "$fn/.done") { - print "already done.\n"; - return; - } - system "rm -r $fn" if $clean && -d "$fn"; - system "mkdir $fn" unless -d "$fn"; - - my $e = './newsyn'; - #$e = './tcpsynobfs' if $h->{'fs'} eq 'obfs'; - my $c = "$e"; - $c .= " --mkfs" unless $h->{'no_mkfs'}; - $c .= " --$h->{'fs'}" if $h->{'fs'}; - $c .= " --syn until $h->{'until'}" if $h->{'until'}; - - $c .= " --syn writefile $h->{'writefile_mb'} $h->{'writefile_size'}" if $h->{'writefile'}; - $c .= " --syn rw $h->{'rw_mb'} $h->{'rw_size'}" if $h->{'rw'}; - $c .= " --syn readfile $h->{'readfile_mb'} $h->{'readfile_size'}" if $h->{'readfile'}; - $c .= " --syn makedirs $h->{'makedirs_dirs'} $h->{'makedirs_files'} $h->{'makedirs_depth'}" if $h->{'makedirs'}; - - if ($h->{'ebofs_freelist'}) { - system "cp freelist/ebofs.freelist.$h->{'ebofs_freelist'} ebofs.freelist"; - $c .= " --osd_age_time -1"; - } - - for my $k ('nummds', 'numclient', 'numosd', 'kill_after', - 'osd_maxthreads', 'osd_object_layout', 'osd_pg_layout','osd_pg_bits', - 'mds_bal_rep', 'mds_bal_interval', 'mds_bal_max','mds_decay_halflife', - 'mds_bal_hash_rd','mds_bal_hash_wr','mds_bal_unhash_rd','mds_bal_unhash_wr', - 'mds_cache_size','mds_log_max_len', - 'mds_local_osd', - 
'osd_age_time','osd_age', - 'osd_rep', - 'osd_pad_pg_log','ebofs_realloc', - 'osd_balance_reads', - 'tcp_multi_out', - 'client_cache_stat_ttl','client_cache_readdir_ttl', - 'client_oc', - 'fake_osdmap_updates', - 'bdev_el_bidir', 'ebofs_idle_commit_ms', 'ebofs_commit_ms', - 'ebofs_oc_size','ebofs_cc_size','ebofs_bc_size','ebofs_bc_max_dirty','ebofs_abp_max_alloc', - 'file_layout_ssize','file_layout_scount','file_layout_osize','file_layout_num_rep', - 'meta_dir_layout_ssize','meta_dir_layout_scount','meta_dir_layout_osize','meta_dir_layout_num_rep', - 'meta_log_layout_ssize','meta_log_layout_scount','meta_log_layout_osize','meta_log_layout_num_rep') { - $c .= " --$k $h->{$k}" if defined $h->{$k}; - } - - $c .= ' ' . $h->{'custom'} if $h->{'custom'}; - - $c .= " --log_name $relout/$keys"; - - my $post = "#!/bin/sh -script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/osd\\* > $fn/sum.osd -script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/mds? $fn/mds?? > $fn/sum.mds -script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/mds*.log > $fn/sum.mds.log -script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/clnode* > $fn/sum.cl -touch $fn/.post -"; - open(O,">$fn/sh.post"); - print O $post; - close O; - - my $killmin = 1 + int ($h->{'kill_after'} / 60); - - $c = "bash -c \"ulimit -c 0 ; $c\""; - #$c = "bash -c \"$c\""; - - my $srun = "srun --wait=600 --exclude=jobs/ltest.ignore -l -t $killmin -N $h->{'n'} -p ltest"; - my $mpiexec = "mpiexec -l -n $h->{'n'}"; - my $launch; - if ($use_srun) { - $launch = $srun; - } else { - $launch = $mpiexec; - } - - if ($sim->{'_psub'}) { - # template! 
- my $tp = `cat $sim->{'_psub'}`; - $tp =~ s/\$CWD/$cwd/g; - $tp =~ s/\$NAME/$name/g; - $tp =~ s/\$NUM/$h->{'n'}/g; - $tp =~ s/\$OUT/$fn\/o/g; - $tp =~ s/\$DONE/$fn\/.done/g; - $tp =~ s/\$CMD/$c/g; - open(O,">$out/$name"); - print O $tp; - close O; - print "\npsub $out/$name\n"; - return; - } else { - # run - my $cmd = "\n$launch $c > $fn/o && touch $fn/.done";# - #my $cmd = "\n$launch $c > $fn/o ; touch $fn/.done"; - print "$cmd $nobg\n"; - my $r = undef; - unless ($fake) { - if ($sim->{'_pre'}) { - print "pre: $launch $sim->{'_pre'}\n"; - system "$launch $sim->{'_pre'}"; - } - $r = system $cmd; - if ($sim->{'_post'}) { - print "post: $launch $sim->{'_post'}\n"; - system "$launch $sim->{'_post'}"; - } - if ($r) { - print "r = $r\n"; - #&reset; - } - system "sh $fn/sh.post"; - } - return $r; - } -} - - - -my @r = &iterate($sim); -my $n = scalar(@r); -my $c = 1; -my %r; -my $nfailed = 0; -for my $h (@r) { - my $d = `date`; - chomp($d); - $d =~ s/ P.T .*//; - print "# === $c/$n"; - print " ($nfailed failed)" if $nfailed; - print " $d: "; - my $r = &run($h); - - if (!(defined $r)) { - # already done - } else { - if ($r) { - $nfailed++; - } - print "sleep $h->{'sleep'}\n"; - sleep $h->{'sleep'}; - } - - $c++; -} -print "$nfailed failed\n"; - - -my @comb; -if ($comb) { - my $x = $comb->{'x'}; - my @vars = @{$comb->{'vars'}}; - - print "\n\n# post\n"; - for my $p (@fulldirs) { - print "sh $p/sh.post\n"; - } - - my @filters = sort keys %filters; - my $cmd = "script/comb.pl $x @vars - @fulldirs - @filters > $out/c"; - print "$cmd\n"; - open(O,">$out/comb"); - print O "$cmd\n"; - close O; - system $cmd; - - print "\n\n"; - - my $plot; - $plot .= "set data style linespoints;\n"; - my $s = 2; - for my $v (@vars) { - my $c = $s; - $s++; - my @p; - for my $f (@filters) { - my $t = $f; - if ($comb->{'maptitle'}) { - for my $a (keys %{$comb->{'maptitle'}}) { - my $b = $comb->{'maptitle'}->{$a}; - $t =~ s/$a/$b/; - } - } - push (@p, "\"$out/c\" u 1:$c t \"$t\"" ); - $c += 
scalar(@vars); - } - $plot .= "# $v\nplot " . join(", ", @p) . ";\n\n"; - } - print $plot; - open(O,">$out/plot"); - print O $plot; - close O; -} - diff --git a/tags/20070517_before_mds_merge/script/sum.pl b/tags/20070517_before_mds_merge/script/sum.pl deleted file mode 100755 index 92ef9a9b222a8..0000000000000 --- a/tags/20070517_before_mds_merge/script/sum.pl +++ /dev/null @@ -1,148 +0,0 @@ -#!/usr/bin/perl - -use strict; -my $starttime = 1; -my $endtime = -1; - -my $avgrows = 0; - -while ($ARGV[0] =~ /^-/) { - $_ = shift @ARGV; - if ($_ eq '-avg') { - $avgrows = 1; - } - elsif ($_ eq '-start') { - $starttime = shift @ARGV; - } - elsif ($_ eq '-end') { - $endtime = shift @ARGV; - } - else { - die "i don't understand arg $_"; - } -} -my @files = @ARGV; - -if (scalar(@files) == 1 && $files[0] =~ /\*/) { - my ($dir, $pat) = $files[0] =~ /^(.*)\/([^\/]+)$/; - @files = (); - $pat =~ s/\*//; -# print "dir $dir pat $pat\n"; - opendir(D,"$dir"); - for my $f (readdir(D)) { - # print "$f\n"; - next unless $f =~ /^$pat/; - push(@files, "$dir/$f"); - } - closedir(D); - -# print "files = @files\n"; -} - -my @data; -for my $f (@files) { - open(I,$f); - push( @data, ); - close I; -} - -my %sum; # time -> name -> val -my %col; # colnum -> name .. colnums start at 0 (time doesn't count) -my %min; -my %max; -my %avg; -my %tcount; -my $files; -for (@data) { - chomp; - my @r = split(/\s+/,$_); - my $r = shift @r; - - # column headings? 
- if ($r =~ /^\#/) { - my $num = 0; - while (my $name = shift @r) { - $col{$num} = $name; - $num++; - } - next; - } - - next unless int $r; - next if $r < $starttime; - next if $endtime > 0 && $r > $endtime; - - $tcount{$r}++; - $files = $tcount{$r} if $tcount{$r} > $files; - #print "$r: @r\n"; - my $i = 0; - while (@r) { - my $v = shift @r; - $sum{$r}->{$col{$i}} += $v; # if $v > 0; - - $min{$col{$i}} = $v - if ($min{$col{$i}} > $v || !(defined $min{$col{$i}})); - $max{$col{$i}} = $v - if ($max{$col{$i}} < $v); - - $avg{$col{$i}} += $v; - $i++; - } -} - -## dump -my @c = sort {$a <=> $b} keys %col; -# cols -print join("\t",'#', map { $col{$_} } @c) . "\n"; -my $n = 0; -for my $k (sort {$a <=> $b} keys %sum) { - if ($avgrows) { - print join("\t",$k, #map int, - map { $sum{$k}->{$col{$_}}/$tcount{$k} } @c ) . "\n"; - } else { - print join("\t",$k, map { $sum{$k}->{$col{$_}} } @c ) . "\n"; - } - $n++; -} - -my $rows = $n || 1; -#my $files = $tcount{$starttime}; -my %avgval; - -## devt -#warn "rows $rows, files $files\n"; -my %avgvalvart; # std dev of each col avg, over time -for my $k (keys %avg) { - my $av = $avgval{$k} = $avg{$k} / ($rows*$files); - - my $var = 0.0; - for my $t (sort {$a <=> $b} keys %sum) { - my $a = $sum{$t}->{$k} / $files; - $var += ($a - $av) * ($a - $av); - } - - $avgvalvart{$k} = $var / $rows; -} - - - - -print "\n"; -print join("\t",'#', map { $col{$_} } @c) . "\n"; -print join("\t", '#minval', map { $min{$col{$_}} } @c ) . "\n"; -print join("\t", '#maxval', map { $max{$col{$_}} } @c ) . "\n"; -print join("\t", '#rows', map { $rows } @c) . "\n"; -print join("\t", '#files', map { $files } @c) . "\n"; -print join("\t", '#sum', - map { $avg{$col{$_}} } @c ) . "\n"; -print join("\t", '#avgval', #map int, - map { $avgval{$col{$_}} } @c ) . "\n"; -# map { ($rows*$files) ? ($_ / ($rows*$files)):0 } map { $avg{$col{$_}} } @c ) . "\n"; - -print join("\t", '#avgvalvart', - map { $avgvalvart{$col{$_}} } @c ) . 
"\n"; -print join("\t", '#avgvaldevt', - map { sqrt($_) } map { $avgvalvart{$col{$_}} } @c ) . "\n"; - -print join("\t", '#avgsum', #map int, - map { $_ / $rows } map { $avg{$col{$_}} } @c ) . "\n"; diff --git a/tags/20070517_before_mds_merge/test/fakemds.cc b/tags/20070517_before_mds_merge/test/fakemds.cc deleted file mode 100644 index b75b62d58152c..0000000000000 --- a/tags/20070517_before_mds_merge/test/fakemds.cc +++ /dev/null @@ -1,104 +0,0 @@ - - -#include -#include -#include - -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "fakeclient/FakeClient.h" - -#include "mds/MDCluster.h" -#include "mds/MDCache.h" -#include "mds/MDStore.h" - -#include "msg/FakeMessenger.h" - -#include "messages/MPing.h" - -using namespace std; - -__uint64_t ino = 1; - - - -#include "config.h" -#define NUMMDS g_conf.num_mds -#define NUMOSD g_conf.num_osd -#define NUMCLIENT g_conf.num_fakeclient - -// this parses find output -int play(); - -int main(int oargc, char **oargv) { - cerr << "hi there" << endl; - - int argc; - char **argv; - parse_config_options(oargc, oargv, - argc, argv); - - MDCluster *mdc = new MDCluster(NUMMDS, NUMOSD); - - // local config settings - g_conf.num_client = g_conf.num_fakeclient; // to fool mds, hack gross - - // create osds - OSD *osd[NUMOSD]; - for (int i=0; iinit(); - } - - // create mds - MDS *mds[NUMMDS]; - for (int i=0; iinit(); - } - - - // create clients - FakeClient *client[NUMCLIENT]; - for (int i=0; iinit(); - } - - // mount clients - for (int i=0; imount(); - - // loop - fakemessenger_do_loop(); - - //mds[0]->shutdown_start(); - //fakemessenger_do_loop(); - - // - if (argc > 1 && - strcmp(argv[1], "nocheck") == 0) { - cerr << "---- nocheck" << endl; - } else { - cout << "---- check ----" << endl; - for (int i=0; imdcache->shutdown_pass(); - } - - // cleanup - cout << "cleanup" << endl; - for (int i=0; i - * Daniel Jönsson - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the Do What The 
Fuck You Want To - * Public License as published by Banlu Kemiyatorn. See - * http://sam.zoy.org/projects/COPYING.WTFPL for more details. - * - * Compilation example: - * gcc -shared -fPIC gprof-helper.c -o gprof-helper.so -lpthread -ldl - * - * Usage example: - * LD_PRELOAD=./gprof-helper.so your_program - */ - -#define _GNU_SOURCE -#include -#include -#include -#include -#include - -static void * wrapper_routine(void *); - -/* Original pthread function */ -static int (*pthread_create_orig)(pthread_t *__restrict, - __const pthread_attr_t *__restrict, - void *(*)(void *), - void *__restrict) = NULL; - -/* Library initialization function */ -void wooinit(void) __attribute__((constructor)); - -void wooinit(void) -{ - pthread_create_orig = dlsym(RTLD_NEXT, "pthread_create"); - fprintf(stderr, "pthreads: using profiling hooks for gprof\n"); - if(pthread_create_orig == NULL) - { - char *error = dlerror(); - if(error == NULL) - { - error = "pthread_create is NULL"; - } - fprintf(stderr, "%s\n", error); - exit(EXIT_FAILURE); - } -} - -/* Our data structure passed to the wrapper */ -typedef struct wrapper_s -{ - void * (*start_routine)(void *); - void * arg; - - pthread_mutex_t lock; - pthread_cond_t wait; - - struct itimerval itimer; - -} wrapper_t; - -/* The wrapper function in charge for setting the itimer value */ -static void * wrapper_routine(void * data) -{ - /* Put user data in thread-local variables */ - void * (*start_routine)(void *) = ((wrapper_t*)data)->start_routine; - void * arg = ((wrapper_t*)data)->arg; - - /* Set the profile timer value */ - setitimer(ITIMER_PROF, &((wrapper_t*)data)->itimer, NULL); - - /* Tell the calling thread that we don't need its data anymore */ - pthread_mutex_lock(&((wrapper_t*)data)->lock); - pthread_cond_signal(&((wrapper_t*)data)->wait); - pthread_mutex_unlock(&((wrapper_t*)data)->lock); - - /* Call the real function */ - return start_routine(arg); -} - -/* Our wrapper function for the real pthread_create() */ -int 
pthread_create(pthread_t *__restrict thread, - __const pthread_attr_t *__restrict attr, - void * (*start_routine)(void *), - void *__restrict arg) -{ - wrapper_t wrapper_data; - int i_return; - - /* Initialize the wrapper structure */ - wrapper_data.start_routine = start_routine; - wrapper_data.arg = arg; - getitimer(ITIMER_PROF, &wrapper_data.itimer); - pthread_cond_init(&wrapper_data.wait, NULL); - pthread_mutex_init(&wrapper_data.lock, NULL); - pthread_mutex_lock(&wrapper_data.lock); - - /* The real pthread_create call */ - i_return = pthread_create_orig(thread, - attr, - &wrapper_routine, - &wrapper_data); - - /* If the thread was successfully spawned, wait for the data - * to be released */ - if(i_return == 0) - { - pthread_cond_wait(&wrapper_data.wait, &wrapper_data.lock); - } - - pthread_mutex_unlock(&wrapper_data.lock); - pthread_mutex_destroy(&wrapper_data.lock); - pthread_cond_destroy(&wrapper_data.wait); - - return i_return; -} - diff --git a/tags/20070517_before_mds_merge/test/makedirs.cc b/tags/20070517_before_mds_merge/test/makedirs.cc deleted file mode 100644 index 8fd74d996ef9f..0000000000000 --- a/tags/20070517_before_mds_merge/test/makedirs.cc +++ /dev/null @@ -1,38 +0,0 @@ -#include -#include -using namespace std; - -int make_dirs(const char *basedir, int dirs, int files, int depth) -{ - //if (time_to_stop()) return 0; - - // make sure base dir exists - int r = mkdir(basedir, 0755); - if (r != 0) { - cout << "can't make base dir? 
" << basedir << endl; - return -1; - } - - // children - char d[500]; - cout << "make_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << endl; - for (int i=0; i -#include -#include -using namespace std; - -#include "mds/MDCluster.h" -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "fakeclient/FakeClient.h" - -#include "mds/MDCache.h" -#include "mds/MDStore.h" - -#include "msg/MPIMessenger.h" -//#include "msg/CheesySerializer.h" - -#include "messages/MPing.h" - - -__uint64_t ino = 1; - - - -#include "config.h" -#define NUMMDS g_conf.num_mds -#define NUMOSD g_conf.num_osd -#define NUMCLIENT g_conf.num_client - -// this parses find output -int play(); - -int main(int argc, char **argv) { - cout << "mpitest starting" << endl; - - int myrank = mpimessenger_init(argc, argv); - int world = mpimessenger_world(); - - - - MDCluster *mdc = new MDCluster(NUMMDS, NUMOSD); - - // create osds - OSD *osd[NUMOSD]; - for (int i=0; iinit(); - } - - // create mds - MDS *mds[NUMMDS]; - for (int i=0; iinit(); - } - - // create clients - FakeClient *client[NUMCLIENT]; - for (int i=0; iset_dispatcher(serializer); - - client[i] = new FakeClient(mdc, i, real, g_conf.fakeclient_requests); - client[i]->init(); - } - - // seed initial requests - for (int i=0; iissue_request(); - } - - mpimessenger_start(); // start message loop - mpimessenger_wait(); // wait for thread to finish - mpimessenger_shutdown(); // shutdown MPI - - // - /* - cout << "---- check ----" << endl; - for (int i=0; imdcache->shutdown_pass(); - } - */ - - // cleanup - //cout << "cleanup" << endl; - for (int i=0; i -#include "mpi.h" - -#include "messages/MClientRequest.h" -#include "msg/MTMessenger.h" -#include "include/error.h" - -#define SARG_SIZE 64 -#define SERVER_RANK 0 -#define NTHREADS 11 // number of threads per rank -#define NMESSAGES 31 // number of messages per thread - -static void server_loop(MTMessenger &msgr, int world_size) -{ - // we expect this many messages from 
clients, then we quit - // (world_size-1 since server is one of the processes). - int totmsg = NTHREADS * NMESSAGES * (world_size - 1); - int nmsg = 0; - - char buf[SARG_SIZE]; - - while(nmsg < totmsg) { - MClientRequest *req = (MClientRequest*)msgr.recvreq(); - ASSERT(req->get_type() == MSG_CLIENT_REQUEST); - - //cout << "Server acknowledging " << req->get_sarg() << endl; - - sprintf(buf, "%s reply", req->get_sarg().c_str()); - MClientRequest resp(0, 0); - resp.set_sarg(buf); - msgr.sendresp(req, &resp); - - delete req; - nmsg++; - } - - cout << "Server successful" << endl; -} - -// arguments for client thread start function (see pthread_create) -struct client_arg -{ - MTMessenger *msgr; - int rank; - int thread; -}; - -static void *client_session(void *_carg) -{ - client_arg *carg = (client_arg *)_carg; - - char buf[SARG_SIZE]; - - // repeat some number (arbitrary really) of rounds - for (int i = 0; i < NMESSAGES; i++) { - - // send the message, receive the reply and check reply is as - // expected - - MClientRequest request(0, 0); - sprintf(buf, "r%d:t%d:m%d", carg->rank, carg->thread, i); - request.set_sarg(buf); - - //cout << "Client sending " << request.get_sarg() << endl; - - MClientRequest *resp = - (MClientRequest*)carg->msgr->sendrecv(&request, SERVER_RANK); - - ASSERT(resp->get_type() == MSG_CLIENT_REQUEST); - sprintf(buf, "r%d:t%d:m%d reply", carg->rank, carg->thread, i); - ASSERT(strcmp(buf, resp->get_sarg().c_str()) == 0); - - //cout << "Client verified " << resp->get_sarg() << endl; - - delete resp; - } - - cout << "Client (" << carg->rank << "," << carg->thread - << ") successful" << endl; - - delete carg; - return NULL; -} - -static void launch_clients(MTMessenger &msgr, int rank) -{ - pthread_t tid[NTHREADS]; - - // launch some number (arbitrary really) of threads - for (int i = 0; i < NTHREADS; i++) { - - client_arg *carg = (client_arg*)malloc(sizeof(client_arg)); - ASSERT(carg); - carg->msgr = &msgr; - carg->rank = rank; - carg->thread = i; - - 
if (pthread_create(&tid[i], NULL, client_session, carg) < 0) - SYSERROR(); - } - - // we must wait for all the threads to exit before returning, - // otherwise we shutdown MPI before while the threads are - // chatting. - for (int i = 0; i < NTHREADS; i++) { - void *retval; - - if (pthread_join(tid[i], &retval) < 0) - SYSERROR(); - } -} - -int main(int argc, char **argv) -{ - MTMessenger msgr(argc, argv); - - int rank; - ASSERT(MPI_Comm_rank(MPI_COMM_WORLD, &rank) == MPI_SUCCESS); - int world_size; - ASSERT(MPI_Comm_size(MPI_COMM_WORLD, &world_size) == MPI_SUCCESS); - - if (rank == SERVER_RANK) - server_loop(msgr, world_size); - else - launch_clients(msgr, rank); - - return 0; -} diff --git a/tags/20070517_before_mds_merge/test/rushconfig b/tags/20070517_before_mds_merge/test/rushconfig deleted file mode 100644 index 40d82702ea0a5..0000000000000 --- a/tags/20070517_before_mds_merge/test/rushconfig +++ /dev/null @@ -1,7 +0,0 @@ -6 -8 10.0 -4 20.0 -7 30.0 -9 10.0 -8 15.0 -5 11.0 diff --git a/tags/20070517_before_mds_merge/test/rushtest.cc b/tags/20070517_before_mds_merge/test/rushtest.cc deleted file mode 100644 index ecff83523e0c6..0000000000000 --- a/tags/20070517_before_mds_merge/test/rushtest.cc +++ /dev/null @@ -1,49 +0,0 @@ -// -// $Id$ -// - -#include -#include -#include "../osd/rush.h" - -main (int argc, char *argv[]) -{ - Rush rush; - char buf[200]; - int i, j, k, numClusters; - int numKeys = 5; - int numReplicas = 4; - int curSize; - double curWeight; - int servers[1000]; - - if (argc > 1) { - numKeys = atoi (argv[1]); - } - if (argc > 2) { - numReplicas = atoi (argv[2]); - } - - fgets (buf, sizeof (buf) - 2, stdin); - sscanf (buf, "%d", &numClusters); - for (i = 0; i < numClusters; i++) { - fgets (buf, sizeof (buf) - 2, stdin); - sscanf (buf, "%d %lf", &curSize, &curWeight); - rush.AddCluster (curSize, curWeight); - if (rush.Servers () < numReplicas) { - fprintf (stderr, "ERROR: must have at least %d disks in the system!\n", - rush.Clusters ()); - exit 
(-1); - } - for (j = 0; j < numKeys; j++) { - rush.GetServersByKey (j, numReplicas, servers); -#if 0 - printf ("%-3d %-6d ", i, j); - for (k = 0; k < numReplicas; k++) { - printf ("%-5d ", servers[k]); - } - putchar ('\n'); -#endif - } - } -} diff --git a/tags/20070517_before_mds_merge/test/rushtest.cc~ b/tags/20070517_before_mds_merge/test/rushtest.cc~ deleted file mode 100644 index 0b9512ccd0c3d..0000000000000 --- a/tags/20070517_before_mds_merge/test/rushtest.cc~ +++ /dev/null @@ -1,49 +0,0 @@ -// -// $Id$ -// - -#include -#include -#include "rush.h" - -main (int argc, char *argv[]) -{ - Rush rush; - char buf[200]; - int i, j, k, numClusters; - int numKeys = 5; - int numReplicas = 4; - int curSize; - double curWeight; - int servers[1000]; - - if (argc > 1) { - numKeys = atoi (argv[1]); - } - if (argc > 2) { - numReplicas = atoi (argv[2]); - } - - fgets (buf, sizeof (buf) - 2, stdin); - sscanf (buf, "%d", &numClusters); - for (i = 0; i < numClusters; i++) { - fgets (buf, sizeof (buf) - 2, stdin); - sscanf (buf, "%d %lf", &curSize, &curWeight); - rush.AddCluster (curSize, curWeight); - if (rush.Servers () < numReplicas) { - fprintf (stderr, "ERROR: must have at least %d disks in the system!\n", - rush.Clusters ()); - exit (-1); - } - for (j = 0; j < numKeys; j++) { - rush.GetServersByKey (j, numReplicas, servers); -#if 0 - printf ("%-3d %-6d ", i, j); - for (k = 0; k < numReplicas; k++) { - printf ("%-5d ", servers[k]); - } - putchar ('\n'); -#endif - } - } -} diff --git a/tags/20070517_before_mds_merge/test/testbucket.cc b/tags/20070517_before_mds_merge/test/testbucket.cc deleted file mode 100644 index d8676da18faba..0000000000000 --- a/tags/20070517_before_mds_merge/test/testbucket.cc +++ /dev/null @@ -1,67 +0,0 @@ - - -#include "../crush/Bucket.h" -using namespace crush; - -#include -#include -using namespace std; - - -ostream& operator<<(ostream& out, vector& v) -{ - out << "["; - for (int i=0; i disks; - for (int i=0; i<20; i++) - disks.push_back(i); - - - /* 
- UniformBucket ub(1, 1, 0, 10, disks); - ub.make_primes(h); - cout << "primes are " << ub.primes << endl; - */ - - MixedBucket mb(2, 1); - for (int i=0;i<20;i++) - mb.add_item(i, 10); - - /* - MixedBucket b(3, 1); - b.add_item(1, ub.get_weight()); - b.add_item(2, mb.get_weight()); - */ - MixedBucket b= mb; - - vector ocount(disks.size()); - int numrep = 3; - - vector v(numrep); - for (int x=1; x<1000000; x++) { - //cout << H(x) << "\t" << h(x) << endl; - for (int i=0; i -using namespace std; - -#include "include/bufferlist.h" - - -int main() -{ - - bufferptr p1 = new buffer("123456",6); - bufferptr p2 = p1; - - cout << "it is '" << p1.c_str() << "'" << endl; - - bufferptr p3 = new buffer("abcdef",6); - - cout << "p3 is " << p3 << endl; - - bufferlist bl; - bl.push_back(p2); - bl.push_back(p1); - bl.push_back(p3); - - cout << "bl is " << bl << endl; - - cout << "len is " << bl.length() << endl; - - bufferlist took; - bl.splice(10,4,&took); - - cout << "took out " << took << "leftover is " << bl << endl; - //cout << "len is " << bl.length() << endl; - - bufferlist bl2; - bl2.substr_of(bl, 3, 5); - cout << "bl2 is " << bl2 << endl; - - -} diff --git a/tags/20070517_before_mds_merge/test/testcrush.cc b/tags/20070517_before_mds_merge/test/testcrush.cc deleted file mode 100644 index bd432b23ee95c..0000000000000 --- a/tags/20070517_before_mds_merge/test/testcrush.cc +++ /dev/null @@ -1,266 +0,0 @@ - - -#include "../crush/crush.h" -using namespace crush; - -#include - -#include -#include -using namespace std; - -/* -ostream& operator<<(ostream& out, vector& v) -{ - out << "["; - for (int i=0; i& d) -{ - d.clear(); - while (n) { - d.push_back(no); - no++; - n--; - } -} - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks, int& nbuckets) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed 
- MixedBucket *b = new MixedBucket(nbuckets--, h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks, int& nbuckets) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks, nbuckets); - return b->get_id(); -} - - - -int main() -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - vector disks; - int root = -1; - int nbuckets = -1; - int ndisks = 0; - - if (0) { - make_disks(12, ndisks, disks); - UniformBucket ub1(-1, 1, 0, 30, disks); - ub1.make_primes(h); - cout << "ub1 primes are " << ub1.primes << endl; - c.add_bucket(&ub1); - - make_disks(17, ndisks, disks); - UniformBucket ub2(-2, 1, 0, 30, disks); - ub2.make_primes(h); - cout << "ub2 primes are " << ub2.primes << endl; - c.add_bucket(&ub2); - - make_disks(4, ndisks, disks); - UniformBucket ub3(-3, 1, 0, 30, disks); - ub3.make_primes(h); - cout << "ub3 primes are " << ub3.primes << endl; - c.add_bucket(&ub3); - - make_disks(20, ndisks, disks); - MixedBucket umb1(-4, 1); - for (int i=0; i<20; i++) - umb1.add_item(disks[i], 30); - c.add_bucket(&umb1); - - MixedBucket b(-100, 1); - //b.add_item(-2, ub1.get_weight()); - b.add_item(-4, umb1.get_weight()); - //b.add_item(-2, ub2.get_weight()); - //b.add_item(-3, ub3.get_weight()); - } - - if (0) { - int bucket = -1; - MixedBucket *root = new MixedBucket(bucket--, 2); - - for (int i=0; i<5; i++) { - MixedBucket *b = new MixedBucket(bucket--, 1); - - int n = 5; - - if (1) { - // add n buckets of n disks - for (int j=0; jadd_item(disks[k], 10); - - //b->add_item(disks[j], 10); - c.add_bucket(d); - b->add_item(d->get_id(), d->get_weight()); - } - - c.add_bucket(b); - root->add_item(b->get_id(), b->get_weight()); - } else { - // add n*n disks - make_disks(n*n, ndisks, disks); - for (int k=0; kadd_item(disks[k], 10); - - c.add_bucket(b); - root->add_item(b->get_id(), b->get_weight()); - } - } - 
- c.add_bucket(root); - } - - - if (1) { - vector wid; - for (int d=0; d<5; d++) - wid.push_back(10); - root = make_hierarchy(c, wid, ndisks, nbuckets); - } - - - - // rule - int numrep = 1; - - Rule rule; - if (0) { - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, -100)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - } - if (1) { - /* - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, -4)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 2, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - */ - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 1, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - } - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - cout << ndisks << " disks, " << 1-nbuckets << " buckets" << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - - - int place = 1000000; - int times = place / numpg; - if (!times) times = 1; - - cout << "looping " << times << " times" << endl; - - float tvar = 0; - int tvarnum = 0; - - int x = 0; - for (int t=0; t v(numrep); - - for (int z=0; z -using namespace std; - -int print(string s) { - filepath fp = s; - cout << "s = " << s << " filepath = " << fp << endl; - cout << " depth " << fp.depth() << endl; - for (int i=0; i -#include -#include -using namespace std; - -#include "config.h" -#include "messages/MPing.h" -#include "common/Mutex.h" - -#include "msg/MPIMessenger.h" - -class Pinger : public Dispatcher { -public: - Messenger *messenger; - Pinger(Messenger *m) : messenger(m) { - m->set_dispatcher(this); - } - void dispatch(Message *m) { - //dout(1) << "got incoming " << m << endl; - delete m; - - } -}; - -int main(int argc, char **argv) { - int num = 1000; - - int myrank = mpimessenger_init(argc, argv); - int world = mpimessenger_world(); - - Pinger *p = new Pinger( new 
MPIMessenger(myrank) ); - - mpimessenger_start(); - - //while (1) { - for (int i=0; i<10000; i++) { - - // ping random nodes - int d = rand() % world; - if (d != myrank) { - //cout << "sending " << i << " to " << d << endl; - p->messenger->send_message(new MPing(), d); - } - - } - - - //cout << "shutting down" << endl; - //p->messenger->shutdown(); - - mpimessenger_wait(); - mpimessenger_shutdown(); // shutdown MPI -} diff --git a/tags/20070517_before_mds_merge/test/testnewbuffers.cc b/tags/20070517_before_mds_merge/test/testnewbuffers.cc deleted file mode 100644 index 0fea7571a4572..0000000000000 --- a/tags/20070517_before_mds_merge/test/testnewbuffers.cc +++ /dev/null @@ -1,91 +0,0 @@ - -#include -#include -using namespace std; - - -#include "include/newbuffer.h" -//#include "include/bufferlist.h" - -#include "common/Thread.h" - - - class Th : public Thread { - public: - bufferlist bl; - Th(bufferlist& o) : bl(o) { } - - void *entry() { - //cout << "start" << endl; - // thrash it a bit. 
- for (int n=0; n<10000; n++) { - bufferlist bl2; - unsigned off = rand() % (bl.length() -1); - unsigned len = 1 + rand() % (bl.length() - off - 1); - bl2.substr_of(bl, off, len); - bufferlist bl3; - bl3.append(bl); - bl3.append(bl2); - //cout << bl3 << endl; - bl2.clear(); - bl3.clear(); - } - //cout << "end" << endl; - } - }; - -int main() -{ - - bufferptr p1 = buffer::copy("123456",7); - //bufferptr p1 = new buffer("123456",7); - bufferptr p2 = p1; - - cout << "p1 is '" << p1.c_str() << "'" << " " << p1 << endl; - cout << "p2 is '" << p2.c_str() << "'" << " " << p2 << endl; - - bufferptr p3 = buffer::copy("abcdef",7); - //bufferptr p3 = new buffer("abcdef",7); - - cout << "p3 is " << p3.c_str() << " " << p3 << endl; - - bufferlist bl; - bl.push_back(p2); - bl.push_back(p1); - bl.push_back(p3); - - cout << "bl is " << bl << endl; - - bufferlist took; - bl.splice(10,4,&took); - - cout << "took out " << took << ", leftover is " << bl << endl; - //cout << "len is " << bl.length() << endl; - - bufferlist bl2; - bl2.substr_of(bl, 3, 5); - cout << "bl2 is " << bl2 << endl; - - - cout << "bl before " << bl << endl; - - list ls; - for (int t=0; t<40; t++) { - Th *t = new Th(bl); - cout << "create" << endl; - t->create(); - ls.push_back(t); - } - - bl.clear(); - - while (!ls.empty()) { - cout << "join" << endl; - ls.front()->join(); - delete ls.front(); - ls.pop_front(); - } - - cout << "bl after " << bl << endl; - -} diff --git a/tags/20070517_before_mds_merge/test/testos.cc b/tags/20070517_before_mds_merge/test/testos.cc deleted file mode 100644 index 24c81590d899c..0000000000000 --- a/tags/20070517_before_mds_merge/test/testos.cc +++ /dev/null @@ -1,343 +0,0 @@ -/* testos.cc -- simple ObjectStore test harness. 
- Copyright (C) 2007 Casey Marshall - -Ceph - scalable distributed file system - -This is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License version 2.1, as published by the Free Software -Foundation. See file COPYING. */ - - -#include "osd/ObjectStore.h" -#include "ebofs/Ebofs.h" -#include "osbdb/OSBDB.h" -#include "include/buffer.h" - -#include -#include -#include - -#include -#include - -using namespace std; - -static inline unsigned long long -to_usec (struct timeval &time) -{ - return (((unsigned long long) time.tv_sec * 1000000) - + ((unsigned long long) time.tv_usec)); -} - -static inline unsigned long long -to_msec (struct timeval &time) -{ - return (((unsigned long long) time.tv_sec * 1000) - + ((unsigned long long) time.tv_usec / 1000)); -} - -int main (int argc, char **argv) -{ - vector args; - char *osd_name = "ebofs"; - unsigned object_size = 1024; - unsigned object_count = 1024; - unsigned write_iter = 64; - unsigned random_seed = ::time(NULL); - char *device = "/tmp/testos"; - char *mountcmd = "mount /tmp/testos"; - char *umountcmd = "umount /tmp/testos"; - - bool ebofs_raw_device = false; - bool inhibit_remount = (getenv("TESTOS_INHIBIT_REMOUNT") != NULL); - - if (argc > 1 - && (strcmp (argv[1], "-h") == 0 - || strcmp (argv[1], "-help") == 0 - || strcmp (argv[1], "--help") == 0)) - { - cout << "usage: " << argv[0] << " [store [object-size [object-count [iterations [seed]]]]]" << endl; - cout << endl; - cout << "Where the arguments are:" << endl << endl; - cout << " store -- store type; default \"ebofs\"" << endl; - cout << " object-size -- size of objects; default 1024" << endl; - cout << " object-count -- number of objects to write; default 1024" - << endl; - cout << " iterations -- write the objects that many times; default 5" - << endl; - cout << " seed -- random seed; default current time" << endl; - exit (0); - } - - argv_to_vec (argc, argv, args); - for (vector::iterator it = 
args.begin(); it != args.end(); - it++) - cout << *it << " "; - cout << endl; - parse_config_options (args); - for (vector::iterator it = args.begin(); it != args.end(); - it++) - cout << *it << " "; - cout << endl; - - argc = args.size(); - if (argc > 0) - osd_name = args[0]; - if (argc > 1) - object_size = (unsigned) atol (args[1]); - if (argc > 2) - object_count = (unsigned) atol (args[2]); - if (argc > 3) - write_iter = (unsigned) atol (args[3]); - if (argc > 4) - random_seed = (unsigned) atol (args[4]); - - // algin object size to 'long' - object_size = ((object_size + (sizeof (long) - 1)) / sizeof (long)) * sizeof (long); - - char *osd_file = new char[32]; - strcpy (osd_file, "/tmp/testos/testos.XXXXXX"); - mktemp (osd_file); - - if (strcasecmp (osd_name, "ebofs") == 0) - { - char *dev_env = getenv ("TESTOS_EBOFS_DEV"); - if (dev_env != NULL) - { - // Assume it is a true device. - strncpy (osd_file, dev_env, 32); - inhibit_remount = true; - ebofs_raw_device = true; - } - } - - if (!inhibit_remount) - { - if (system (mountcmd) != 0) - { - cerr << "mount failed" << endl; - exit (1); - } - } - - ObjectStore *os = NULL; - if (strcasecmp (osd_name, "ebofs") == 0) - { - if (!ebofs_raw_device) - { - FILE *f = fopen (osd_file, "w"); - if (f == NULL) - { - cerr << "failed to open " << osd_file << ": " << strerror (errno) - << endl; - exit (1); - } - // 1G file. 
- fseek (f, 1024 * 1024 * 1024, SEEK_SET); - fputc ('\0', f); - fclose (f); - } - os = new Ebofs (osd_file); - } - else if (strcasecmp (osd_name, "osbdb") == 0) - { - os = new OSBDB (osd_file); - } - else if (strcasecmp (osd_name, "osbdb-btree") == 0) - { - g_conf.bdbstore_btree = true; - os = new OSBDB (osd_file); - } - else - { - cerr << "I don't know about object store \"" << osd_name << "\"" - << endl; - exit (1); - } - - cout << "Writing " << object_count << " objects of size " - << object_size << " to " << osd_name << endl; - - char *val = (char *) malloc (object_size); - char *val2 = (char *) malloc (object_size); - auto_ptr valptr (val); - auto_ptr valptr2(val2); - if (getenv ("TESTOS_UNALIGNED") != NULL) - { - val = val + 1; - val2 = val2 + 1; - } - - for (unsigned i = 0; i < object_size; i++) - { - val[i] = (char) i; - val2[i] = (char) i; - } - object_t *oids = new object_t[object_count]; - - utime_t writes[write_iter]; - utime_t total_write; - utime_t reads[write_iter]; - utime_t total_read; - for (unsigned i = 0; i < write_iter; i++) - { - cerr << "Iteration " << i << endl; - - int ret = os->mkfs(); - if (ret != 0) - { - cerr << "mkfs(" << osd_file << "): " << strerror (-ret) << endl; - exit (1); - } - ret = os->mount(); - if (ret != 0) - { - cerr << "mount(): " << strerror (-ret) << endl; - exit (1); - } - - srandom (random_seed + i); - - for (unsigned j = 0; j < object_count; j++) - { - oids[j].ino = (uint64_t) random() << 32 | random(); - oids[j].bno = random(); - } - - utime_t begin = g_clock.now(); - for (unsigned o = 0; o < object_count; o++) - { - bufferptr bp (val, object_size); - bufferlist bl; - bl.push_back (bp); - int ret; - if ((ret = os->write (oids[o], 0L, object_size, bl, NULL)) < 0) - cerr << "write " << oids[o] << " failed: " - << strerror (-ret) << endl; - } - os->sync(); - - utime_t end = g_clock.now() - begin; - - cerr << "Write finished in " << end << endl; - total_write += end; - writes[i] = end; - - os->umount(); - sync(); - - if 
(!inhibit_remount) - { - if (system (umountcmd) != 0) - { - cerr << "umount failed" << endl; - exit (1); - } - - if (system (mountcmd) != 0) - { - cerr << "mount(2) failed" << endl; - exit (1); - } - } - - os->mount(); - - // Shuffle the OIDs. - for (int j = 0; j < object_count; j++) - { - int x = random() % object_count; - if (x < 0) - x = -x; - object_t o = oids[j]; - oids[j] = oids[x]; - oids[x] = o; - } - - begin = g_clock.now(); - for (unsigned o = 0; o < object_count; o++) - { - bufferptr bp (val2, object_size); - bufferlist bl; - bl.push_back (bp); - - if (os->read (oids[o], 0L, object_size, bl) < 0) - { - cerr << "object " << oids[o] << " not found!" << endl; - } - } - end = g_clock.now() - begin; - - cerr << "Read finished in " << end << endl; - total_read += end; - reads[i] = end; - - os->umount(); - sync(); - - if (!inhibit_remount) - { - if (system (umountcmd) != 0) - { - cerr << "umount(2) failed" << endl; - exit (1); - } - - if (system (mountcmd) != 0) - { - cerr << "mount(3) failed" << endl; - exit (1); - } - } - } - - cerr << "Finished in " << (total_write + total_read) << endl; - - double write_mean = ((double) total_write) / ((double) write_iter); - double write_sd = 0.0; - for (unsigned i = 0; i < write_iter; i++) - { - double x = ((double) writes[i]) - write_mean; - write_sd += x * x; - } - write_sd = sqrt (write_sd / ((double) write_iter)); - - double read_mean = ((double) total_read) / ((double) write_iter); - double read_sd = 0.0; - for (unsigned i = 0; i < write_iter; i++) - { - double x = ((double) reads[i]) - read_mean; - write_sd += x * x; - } - read_sd = sqrt (read_sd / ((double) write_iter)); - - cout << "TESTOS: write " << osd_name << ":" << object_size << ":" - << object_count << ":" << write_iter << ":" << random_seed - << " -- " << write_mean << " " << write_sd << endl; - - cout << "TESTOS: write.raw -- "; - for (int i = 0; i < write_iter; i++) - cout << ((double) writes[i]) << " "; - cout << endl; - - cout << "TESTOS: read " << 
osd_name << ":" << object_size << ":" - << object_count << ":" << write_iter << ":" << random_seed - << " -- " << read_mean << " " << read_sd << endl; - - cout << "TESTOS: read.raw -- "; - for (int i = 0; i < write_iter; i++) - cout << ((double) reads[i]) << " "; - cout << endl; - - unlink (osd_file); - if (!inhibit_remount) - { - if (system (umountcmd) != 0) - { - cerr << "umount(3) failed" << endl; - exit (1); - } - } - exit (0); -} diff --git a/tags/20070517_before_mds_merge/test/testosbdb.cc b/tags/20070517_before_mds_merge/test/testosbdb.cc deleted file mode 100644 index 19268e7587531..0000000000000 --- a/tags/20070517_before_mds_merge/test/testosbdb.cc +++ /dev/null @@ -1,347 +0,0 @@ -/* testosbdb.cc -- test OSBDB. - Copyright (C) 2007 Casey Marshall */ - - -#include -#include "osbdb/OSBDB.h" - -using namespace std; - -int -main (int argc, char **argv) -{ - vector args; - argv_to_vec (argc, argv, args); - parse_config_options (args); - - g_conf.debug_bdbstore = 10; - //g_conf.bdbstore_btree = true; - char dbfile[256]; - strncpy (dbfile, "/tmp/testosbdb/db.XXXXXX", 256); - mktemp (dbfile); - OSBDB *os = new OSBDB(dbfile); - auto_ptr osPtr (os); - os->mkfs(); - os->mount(); - - // Put an object. - object_t oid (0xDEADBEEF00000000ULL, 0xFEEDFACE); - - cout << "sizeof oid_t is " << sizeof (oid_t) << endl; - cout << "offsetof oid_t.id " << offsetof (oid_t, id) << endl; - - cout << sizeof (object_t) << endl; - cout << sizeof (oid.ino) << endl; - cout << sizeof (oid.bno) << endl; - cout << sizeof (oid.rev) << endl; - - // Shouldn't be there. - if (os->exists (oid)) - { - cout << "FAIL: oid shouldn't be there " << oid << endl; - } - - // Write an object. - char *x = (char *) malloc (1024); - memset(x, 0xaa, 1024); - bufferptr bp (x, 1024); - bufferlist bl; - bl.push_back (bp); - - if (os->write (oid, 0L, 1024, bl, NULL) != 1024) - { - cout << "FAIL: writing object" << endl; - } - - os->sync(); - - // Should be there. 
- if (!os->exists (oid)) - { - cout << "FAIL: oid should be there: " << oid << endl; - } - - memset(x, 0, 1024); - if (os->read (oid, 0, 1024, bl) != 1024) - { - cout << "FAIL: reading object" << endl; - } - - for (int i = 0; i < 1024; i++) - { - if ((x[i] & 0xFF) != 0xaa) - { - cout << "FAIL: data read out is different" << endl; - break; - } - } - - // Set some attributes - if (os->setattr (oid, "alpha", "value", strlen ("value")) != 0) - { - cout << "FAIL: set attribute" << endl; - } - if (os->setattr (oid, "beta", "value", strlen ("value")) != 0) - { - cout << "FAIL: set attribute" << endl; - } - if (os->setattr (oid, "gamma", "value", strlen ("value")) != 0) - { - cout << "FAIL: set attribute" << endl; - } - if (os->setattr (oid, "fred", "value", strlen ("value")) != 0) - { - cout << "FAIL: set attribute" << endl; - } - - char *attrs = (char *) malloc (1024); - if (os->listattr (oid, attrs, 1024) != 0) - { - cout << "FAIL: listing attributes" << endl; - } - else - { - char *p = attrs; - if (strcmp (p, "alpha") != 0) - { - cout << "FAIL: should be \"alpha:\" \"" << p << "\"" << endl; - } - p = p + strlen (p) + 1; - if (strcmp (p, "beta") != 0) - { - cout << "FAIL: should be \"beta:\" \"" << p << "\"" << endl; - } - p = p + strlen (p) + 1; - if (strcmp (p, "fred") != 0) - { - cout << "FAIL: should be \"fred:\" \"" << p << "\"" << endl; - } - p = p + strlen (p) + 1; - if (strcmp (p, "gamma") != 0) - { - cout << "FAIL: should be \"gamma:\" \"" << p << "\"" << endl; - } - } - - char attrvalue[256]; - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "alpha", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr alpha" << endl; - } - else if (strncmp ("value", attrvalue, strlen("value")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "fred", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr fred" << endl; - } - else if (strncmp ("value", 
attrvalue, strlen("value")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "beta", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr beta" << endl; - } - else if (strncmp ("value", attrvalue, strlen("value")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "gamma", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr gamma" << endl; - } - else if (strncmp ("value", attrvalue, strlen("value")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - - if (os->setattr (oid, "alpha", "different", strlen("different")) != 0) - cout << "FAIL: setattr overwrite" << endl; - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "alpha", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr alpha" << endl; - } - else if (strncmp ("different", attrvalue, strlen("different")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - - if (os->rmattr (oid, "alpha") != 0) - { - cout << "FAIL: rmattr alpha" << endl; - } - if (os->rmattr (oid, "fred") != 0) - { - cout << "FAIL: rmattr fred" << endl; - } - if (os->rmattr (oid, "beta") != 0) - { - cout << "FAIL: rmattr beta" << endl; - } - if (os->rmattr (oid, "gamma") != 0) - { - cout << "FAIL: rmattr gamma" << endl; - } - - coll_t cid = 0xCAFEBABE; - if (os->create_collection (cid) != 0) - { - cout << "FAIL: create_collection" << endl; - } - if (os->create_collection (cid + 10) != 0) - { - cout << "FAIL: create_collection" << endl; - } - if (os->create_collection (cid + 5) != 0) - { - cout << "FAIL: create_collection" << endl; - } - if (os->create_collection (42) != 0) - { - cout << "FAIL: create_collection" << endl; - } - - if (os->collection_add (cid, oid) != 0) - { - cout << "FAIL: collection_add" << endl; - } - - list ls; - if (os->list_collections (ls) < 0) - { - cout << 
"FAIL: list_collections" << endl; - } - cout << "collections: "; - for (list::iterator it = ls.begin(); it != ls.end(); it++) - { - cout << *it << ", "; - } - cout << endl; - - if (os->destroy_collection (0xCAFEBABE + 10) != 0) - { - cout << "FAIL: destroy_collection" << endl; - } - - if (os->destroy_collection (0xCAFEBADE + 10) == 0) - { - cout << "FAIL: destroy_collection" << endl; - } - - object_t oid2 (12345, 12345); - for (int i = 0; i < 8; i++) - { - oid2.rev++; - if (os->collection_add (cid, oid2) != 0) - { - cout << "FAIL: collection_add" << endl; - } - } - for (int i = 0; i < 8; i++) - { - if (os->collection_remove (cid, oid2) != 0) - { - cout << "FAIL: collection_remove" << endl; - } - oid2.rev--; - } - - if (os->collection_setattr (cid, "alpha", "value", 5) != 0) - cout << "FAIL: collection_setattr" << endl; - if (os->collection_setattr (cid, "beta", "value", 5) != 0) - cout << "FAIL: collection_setattr" << endl; - if (os->collection_setattr (cid, "gamma", "value", 5) != 0) - cout << "FAIL: collection_setattr" << endl; - if (os->collection_setattr (cid, "fred", "value", 5) != 0) - cout << "FAIL: collection_setattr" << endl; - - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "alpha", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "beta", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "gamma", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << 
endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "fred", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - - if (os->collection_setattr (cid, "alpha", "eulavvalue", 10) != 0) - cout << "FAIL: collection setattr overwrite" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "alpha", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "eulavvalue", 10) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "beta", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "gamma", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "fred", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - - if (os->collection_rmattr (cid, "alpha") != 0) - cout << "FAIL: collection_rmattr" << endl; - if (os->collection_rmattr (cid, "fred") != 0) - cout << "FAIL: collection_rmattr" << endl; - if (os->collection_rmattr (cid, "beta") != 0) - cout << "FAIL: collection_rmattr" << endl; - if (os->collection_rmattr (cid, "gamma") != 0) - cout << "FAIL: collection_rmattr" << endl; - - if (os->collection_rmattr (cid, "alpha") == 0) - cout << "FAIL: 
collection_rmattr (nonexistent)" << endl; - - // Truncate the object. - if (os->truncate (oid, 512, NULL) != 0) - { - cout << "FAIL: truncate" << endl; - } - - // Expand the object. - if (os->truncate (oid, 1200, NULL) != 0) - { - cout << "FAIL: expand" << endl; - } - - // Delete the object. - if (os->remove (oid) != 0) - { - cout << "FAIL: could not remove object" << endl; - } - - // Shouldn't be there - if (os->exists (oid)) - { - cout << "FAIL: should not be there" << endl; - } - - os->sync(); - exit (0); -} diff --git a/tags/20070517_before_mds_merge/test/testtree.cc b/tags/20070517_before_mds_merge/test/testtree.cc deleted file mode 100644 index 2c21bcbe52e25..0000000000000 --- a/tags/20070517_before_mds_merge/test/testtree.cc +++ /dev/null @@ -1,46 +0,0 @@ - - -#include "../crush/BinaryTree.h" -using namespace crush; - -#include -#include -using namespace std; - -int main() -{ - BinaryTree t; - - vector nodes; - - for (int i=0; i<30; i++) { - cout << "adding " << i << endl; - int n = t.add_node(1); - nodes.push_back(n); - //cout << t << endl; - } - cout << t << endl; - - for (int k=0; k<10000; k++) { - if (rand() % 2) { - cout << "adding" << endl; - nodes.push_back( t.add_node(1) ); - } else { - if (!nodes.empty()) { - //for (int i=0; i -using namespace std; - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -int main(int argc, char**argv) -{ - int a = 1; - int b = 2; - - mknod("test", 0600, 0); - - cout << "setxattr " << setxattr("test", "asdf", &a, sizeof(a), 0) << endl; - cout << "errno " << errno << " " << strerror(errno) << endl; - cout << "getxattr " << getxattr("test", "asdf", &b, sizeof(b)) << endl; - cout << "errno " << errno << " " << strerror(errno) << endl; - cout << "a is " << a << " and b is " << b << endl; - return 0; -} diff --git a/tags/20070517_before_mds_merge/valgrind.supp b/tags/20070517_before_mds_merge/valgrind.supp deleted file mode 100644 index 
a6154be057544..0000000000000 --- a/tags/20070517_before_mds_merge/valgrind.supp +++ /dev/null @@ -1,25 +0,0 @@ -# some valgrind suppressions -# to load these automagically, -# cat > ~/.valgrindrc -# --suppressions=valgrind.supp -# - - -# this one makes valgrind shut up about what appears to be a bug in libc's writev. -{ - writev uninit bytes thing -sage - Memcheck:Param - writev(vector[...]) - fun:writev - fun:_ZN11BlockDevice6_writeEijjRN6buffer4listE - fun:_ZN11BlockDevice5do_ioEiRSt4listIPNS_6biovecESaIS2_EE - fun:_ZN11BlockDevice15io_thread_entryEv - fun:_ZN11BlockDevice8IOThread5entryEv - fun:_ZN6Thread11_entry_funcEPv - fun:start_thread - fun:clone - obj:* - obj:* - obj:* - obj:* -} diff --git a/trunk/ceph/COPYING b/trunk/ceph/COPYING deleted file mode 100644 index 5ab7695ab8cab..0000000000000 --- a/trunk/ceph/COPYING +++ /dev/null @@ -1,504 +0,0 @@ - GNU LESSER GENERAL PUBLIC LICENSE - Version 2.1, February 1999 - - Copyright (C) 1991, 1999 Free Software Foundation, Inc. - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - -[This is the first released version of the Lesser GPL. It also counts - as the successor of the GNU Library Public License, version 2, hence - the version number 2.1.] - - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -Licenses are intended to guarantee your freedom to share and change -free software--to make sure the software is free for all its users. - - This license, the Lesser General Public License, applies to some -specially designated software packages--typically libraries--of the -Free Software Foundation and other authors who decide to use it. 
You -can use it too, but we suggest you first think carefully about whether -this license or the ordinary General Public License is the better -strategy to use in any particular case, based on the explanations below. - - When we speak of free software, we are referring to freedom of use, -not price. Our General Public Licenses are designed to make sure that -you have the freedom to distribute copies of free software (and charge -for this service if you wish); that you receive source code or can get -it if you want it; that you can change the software and use pieces of -it in new free programs; and that you are informed that you can do -these things. - - To protect your rights, we need to make restrictions that forbid -distributors to deny you these rights or to ask you to surrender these -rights. These restrictions translate to certain responsibilities for -you if you distribute copies of the library or if you modify it. - - For example, if you distribute copies of the library, whether gratis -or for a fee, you must give the recipients all the rights that we gave -you. You must make sure that they, too, receive or can get the source -code. If you link other code with the library, you must provide -complete object files to the recipients, so that they can relink them -with the library after making changes to the library and recompiling -it. And you must show them these terms so they know their rights. - - We protect your rights with a two-step method: (1) we copyright the -library, and (2) we offer you this license, which gives you legal -permission to copy, distribute and/or modify the library. - - To protect each distributor, we want to make it very clear that -there is no warranty for the free library. Also, if the library is -modified by someone else and passed on, the recipients should know -that what they have is not the original version, so that the original -author's reputation will not be affected by problems that might be -introduced by others. 
- - Finally, software patents pose a constant threat to the existence of -any free program. We wish to make sure that a company cannot -effectively restrict the users of a free program by obtaining a -restrictive license from a patent holder. Therefore, we insist that -any patent license obtained for a version of the library must be -consistent with the full freedom of use specified in this license. - - Most GNU software, including some libraries, is covered by the -ordinary GNU General Public License. This license, the GNU Lesser -General Public License, applies to certain designated libraries, and -is quite different from the ordinary General Public License. We use -this license for certain libraries in order to permit linking those -libraries into non-free programs. - - When a program is linked with a library, whether statically or using -a shared library, the combination of the two is legally speaking a -combined work, a derivative of the original library. The ordinary -General Public License therefore permits such linking only if the -entire combination fits its criteria of freedom. The Lesser General -Public License permits more lax criteria for linking other code with -the library. - - We call this license the "Lesser" General Public License because it -does Less to protect the user's freedom than the ordinary General -Public License. It also provides other free software developers Less -of an advantage over competing non-free programs. These disadvantages -are the reason we use the ordinary General Public License for many -libraries. However, the Lesser license provides advantages in certain -special circumstances. - - For example, on rare occasions, there may be a special need to -encourage the widest possible use of a certain library, so that it becomes -a de-facto standard. To achieve this, non-free programs must be -allowed to use the library. A more frequent case is that a free -library does the same job as widely used non-free libraries. 
In this -case, there is little to gain by limiting the free library to free -software only, so we use the Lesser General Public License. - - In other cases, permission to use a particular library in non-free -programs enables a greater number of people to use a large body of -free software. For example, permission to use the GNU C Library in -non-free programs enables many more people to use the whole GNU -operating system, as well as its variant, the GNU/Linux operating -system. - - Although the Lesser General Public License is Less protective of the -users' freedom, it does ensure that the user of a program that is -linked with the Library has the freedom and the wherewithal to run -that program using a modified version of the Library. - - The precise terms and conditions for copying, distribution and -modification follow. Pay close attention to the difference between a -"work based on the library" and a "work that uses the library". The -former contains code derived from the library, whereas the latter must -be combined with the library in order to run. - - GNU LESSER GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. This License Agreement applies to any software library or other -program which contains a notice placed by the copyright holder or -other authorized party saying it may be distributed under the terms of -this Lesser General Public License (also called "this License"). -Each licensee is addressed as "you". - - A "library" means a collection of software functions and/or data -prepared so as to be conveniently linked with application programs -(which use some of those functions and data) to form executables. - - The "Library", below, refers to any such software library or work -which has been distributed under these terms. 
A "work based on the -Library" means either the Library or any derivative work under -copyright law: that is to say, a work containing the Library or a -portion of it, either verbatim or with modifications and/or translated -straightforwardly into another language. (Hereinafter, translation is -included without limitation in the term "modification".) - - "Source code" for a work means the preferred form of the work for -making modifications to it. For a library, complete source code means -all the source code for all modules it contains, plus any associated -interface definition files, plus the scripts used to control compilation -and installation of the library. - - Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. The act of -running a program using the Library is not restricted, and output from -such a program is covered only if its contents constitute a work based -on the Library (independent of the use of the Library in a tool for -writing it). Whether that is true depends on what the Library does -and what the program that uses the Library does. - - 1. You may copy and distribute verbatim copies of the Library's -complete source code as you receive it, in any medium, provided that -you conspicuously and appropriately publish on each copy an -appropriate copyright notice and disclaimer of warranty; keep intact -all the notices that refer to this License and to the absence of any -warranty; and distribute a copy of this License along with the -Library. - - You may charge a fee for the physical act of transferring a copy, -and you may at your option offer warranty protection in exchange for a -fee. - - 2. 
You may modify your copy or copies of the Library or any portion -of it, thus forming a work based on the Library, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) The modified work must itself be a software library. - - b) You must cause the files modified to carry prominent notices - stating that you changed the files and the date of any change. - - c) You must cause the whole of the work to be licensed at no - charge to all third parties under the terms of this License. - - d) If a facility in the modified Library refers to a function or a - table of data to be supplied by an application program that uses - the facility, other than as an argument passed when the facility - is invoked, then you must make a good faith effort to ensure that, - in the event an application does not supply such function or - table, the facility still operates, and performs whatever part of - its purpose remains meaningful. - - (For example, a function in a library to compute square roots has - a purpose that is entirely well-defined independent of the - application. Therefore, Subsection 2d requires that any - application-supplied function or table used by this function must - be optional: if the application does not supply it, the square - root function must still compute square roots.) - -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Library, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. 
But when you -distribute the same sections as part of a whole which is a work based -on the Library, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote -it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Library. - -In addition, mere aggregation of another work not based on the Library -with the Library (or with a work based on the Library) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. You may opt to apply the terms of the ordinary GNU General Public -License instead of this License to a given copy of the Library. To do -this, you must alter all the notices that refer to this License, so -that they refer to the ordinary GNU General Public License, version 2, -instead of to this License. (If a newer version than version 2 of the -ordinary GNU General Public License has appeared, then you can specify -that version instead if you wish.) Do not make any other change in -these notices. - - Once this change is made in a given copy, it is irreversible for -that copy, so the ordinary GNU General Public License applies to all -subsequent copies and derivative works made from that copy. - - This option is useful when you wish to copy part of the code of -the Library into a program that is not a library. - - 4. 
You may copy and distribute the Library (or a portion or -derivative of it, under Section 2) in object code or executable form -under the terms of Sections 1 and 2 above provided that you accompany -it with the complete corresponding machine-readable source code, which -must be distributed under the terms of Sections 1 and 2 above on a -medium customarily used for software interchange. - - If distribution of object code is made by offering access to copy -from a designated place, then offering equivalent access to copy the -source code from the same place satisfies the requirement to -distribute the source code, even though third parties are not -compelled to copy the source along with the object code. - - 5. A program that contains no derivative of any portion of the -Library, but is designed to work with the Library by being compiled or -linked with it, is called a "work that uses the Library". Such a -work, in isolation, is not a derivative work of the Library, and -therefore falls outside the scope of this License. - - However, linking a "work that uses the Library" with the Library -creates an executable that is a derivative of the Library (because it -contains portions of the Library), rather than a "work that uses the -library". The executable is therefore covered by this License. -Section 6 states terms for distribution of such executables. - - When a "work that uses the Library" uses material from a header file -that is part of the Library, the object code for the work may be a -derivative work of the Library even though the source code is not. -Whether this is true is especially significant if the work can be -linked without the Library, or if the work is itself a library. The -threshold for this to be true is not precisely defined by law. 
- - If such an object file uses only numerical parameters, data -structure layouts and accessors, and small macros and small inline -functions (ten lines or less in length), then the use of the object -file is unrestricted, regardless of whether it is legally a derivative -work. (Executables containing this object code plus portions of the -Library will still fall under Section 6.) - - Otherwise, if the work is a derivative of the Library, you may -distribute the object code for the work under the terms of Section 6. -Any executables containing that work also fall under Section 6, -whether or not they are linked directly with the Library itself. - - 6. As an exception to the Sections above, you may also combine or -link a "work that uses the Library" with the Library to produce a -work containing portions of the Library, and distribute that work -under terms of your choice, provided that the terms permit -modification of the work for the customer's own use and reverse -engineering for debugging such modifications. - - You must give prominent notice with each copy of the work that the -Library is used in it and that the Library and its use are covered by -this License. You must supply a copy of this License. If the work -during execution displays copyright notices, you must include the -copyright notice for the Library among them, as well as a reference -directing the user to the copy of this License. Also, you must do one -of these things: - - a) Accompany the work with the complete corresponding - machine-readable source code for the Library including whatever - changes were used in the work (which must be distributed under - Sections 1 and 2 above); and, if the work is an executable linked - with the Library, with the complete machine-readable "work that - uses the Library", as object code and/or source code, so that the - user can modify the Library and then relink to produce a modified - executable containing the modified Library. 
(It is understood - that the user who changes the contents of definitions files in the - Library will not necessarily be able to recompile the application - to use the modified definitions.) - - b) Use a suitable shared library mechanism for linking with the - Library. A suitable mechanism is one that (1) uses at run time a - copy of the library already present on the user's computer system, - rather than copying library functions into the executable, and (2) - will operate properly with a modified version of the library, if - the user installs one, as long as the modified version is - interface-compatible with the version that the work was made with. - - c) Accompany the work with a written offer, valid for at - least three years, to give the same user the materials - specified in Subsection 6a, above, for a charge no more - than the cost of performing this distribution. - - d) If distribution of the work is made by offering access to copy - from a designated place, offer equivalent access to copy the above - specified materials from the same place. - - e) Verify that the user has already received a copy of these - materials or that you have already sent this user a copy. - - For an executable, the required form of the "work that uses the -Library" must include any data and utility programs needed for -reproducing the executable from it. However, as a special exception, -the materials to be distributed need not include anything that is -normally distributed (in either source or binary form) with the major -components (compiler, kernel, and so on) of the operating system on -which the executable runs, unless that component itself accompanies -the executable. - - It may happen that this requirement contradicts the license -restrictions of other proprietary libraries that do not normally -accompany the operating system. Such a contradiction means you cannot -use both them and the Library together in an executable that you -distribute. - - 7. 
You may place library facilities that are a work based on the -Library side-by-side in a single library together with other library -facilities not covered by this License, and distribute such a combined -library, provided that the separate distribution of the work based on -the Library and of the other library facilities is otherwise -permitted, and provided that you do these two things: - - a) Accompany the combined library with a copy of the same work - based on the Library, uncombined with any other library - facilities. This must be distributed under the terms of the - Sections above. - - b) Give prominent notice with the combined library of the fact - that part of it is a work based on the Library, and explaining - where to find the accompanying uncombined form of the same work. - - 8. You may not copy, modify, sublicense, link with, or distribute -the Library except as expressly provided under this License. Any -attempt otherwise to copy, modify, sublicense, link with, or -distribute the Library is void, and will automatically terminate your -rights under this License. However, parties who have received copies, -or rights, from you under this License will not have their licenses -terminated so long as such parties remain in full compliance. - - 9. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Library or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Library (or any work based on the -Library), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Library or works based on it. - - 10. 
Each time you redistribute the Library (or any work based on the -Library), the recipient automatically receives a license from the -original licensor to copy, distribute, link with or modify the Library -subject to these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. -You are not responsible for enforcing compliance by third parties with -this License. - - 11. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Library at all. For example, if a patent -license would not permit royalty-free redistribution of the Library by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Library. - -If any portion of this section is held invalid or unenforceable under any -particular circumstance, the balance of the section is intended to apply, -and the section as a whole is intended to apply in other circumstances. - -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system which is -implemented by public license practices. 
Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 12. If the distribution and/or use of the Library is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Library under this License may add -an explicit geographical distribution limitation excluding those countries, -so that distribution is permitted only in or among countries not thus -excluded. In such case, this License incorporates the limitation as if -written in the body of this License. - - 13. The Free Software Foundation may publish revised and/or new -versions of the Lesser General Public License from time to time. -Such new versions will be similar in spirit to the present version, -but may differ in detail to address new problems or concerns. - -Each version is given a distinguishing version number. If the Library -specifies a version number of this License which applies to it and -"any later version", you have the option of following the terms and -conditions either of that version or of any later version published by -the Free Software Foundation. If the Library does not specify a -license version number, you may choose any version ever published by -the Free Software Foundation. - - 14. If you wish to incorporate parts of the Library into other free -programs whose distribution conditions are incompatible with these, -write to the author to ask for permission. For software which is -copyrighted by the Free Software Foundation, write to the Free -Software Foundation; we sometimes make exceptions for this. 
Our -decision will be guided by the two goals of preserving the free status -of all derivatives of our free software and of promoting the sharing -and reuse of software generally. - - NO WARRANTY - - 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO -WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. -EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR -OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY -KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE -LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME -THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN -WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY -AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU -FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR -CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE -LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING -RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A -FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF -SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH -DAMAGES. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Libraries - - If you develop a new library, and you want it to be of the greatest -possible use to the public, we recommend making it free software that -everyone can redistribute and change. You can do so by permitting -redistribution under these terms (or, alternatively, under the terms of the -ordinary General Public License). - - To apply these terms, attach the following notices to the library. 
It is -safest to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least the -"copyright" line and a pointer to where the full notice is found. - - - Copyright (C) - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with this library; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - -Also add information on how to contact you by electronic and paper mail. - -You should also get your employer (if you work as a programmer) or your -school, if any, to sign a "copyright disclaimer" for the library, if -necessary. Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the - library `Frob' (a library for tweaking knobs) written by James Random Hacker. - - , 1 April 1990 - Ty Coon, President of Vice - -That's all there is to it! - - diff --git a/trunk/ceph/README b/trunk/ceph/README deleted file mode 100644 index aa016817cebf0..0000000000000 --- a/trunk/ceph/README +++ /dev/null @@ -1,4 +0,0 @@ -Ceph - a scalable distributed file system ------------------------------------------ - -Please see http://ceph.sourceforge.net/ for current info. 
diff --git a/trunk/ceph/active/activemaster.cc b/trunk/ceph/active/activemaster.cc deleted file mode 100644 index b4dc742c414ab..0000000000000 --- a/trunk/ceph/active/activemaster.cc +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Startup executable for - * Ceph Active Storage. See README for details. - * - */ -#include "activemaster.h" - - -/* - * What main() must do: - * - * - start up a Ceph client - * - find the set of OSDs that the file is striped across - * - start up the Map task on each OSD, using ssh - * - eat lunch? - * - start up the Reduce task locally - */ - -int main(int argc, const char* argv[]) { - - if (argc < 4) { - usage(argv[0]); - exit(-1); - } - - const char* input_filename = argv[1]; - const char* map_command = argv[2]; - //const char* reduce_command = argv[3]; - - // fire up the client - Client* client = startCephClient(); - - // open the file as read_only - int fh = client->open(input_filename, O_RDONLY); - if (fh < 0) { - cout << "The input file " << input_filename << " could not be opened." << endl; - exit(-1); - } - - // How big is the file? 
- int filesize; - struct stat stbuf; - if (0 > client->lstat(input_filename, &stbuf)) { - cout << "Error: could not retrieve size of input file " << input_filename << endl; - exit(-1); - } - filesize = stbuf.st_size; - if (filesize < 1) { - cout << "Error: input file size is " << filesize << endl; - exit(-1); - } - - // retrieve all the object extents - list extents; - int offset = 0; - client->enumerate_layout(fh, extents, filesize, offset); - - // for each object extent, retrieve the OSD IP address and start up a Map task - list::iterator i; - map::iterator j; - int osd; - int start, length; - tcpaddr_t tcpaddr; - - for (i = extents.begin(); i != extents.end(); i++) - { - // find the primary and get its IP address - osd = client->osdmap->get_pg_primary(i->pgid); - entity_inst_t inst = client->osdmap->get_inst(osd); - entity_addr_t entity_addr = inst.addr; - entity_addr.make_addr(tcpaddr); - - // iterate through each buffer_extent in the ObjectExtent - for (j = i->buffer_extents.begin(); - j != i->buffer_extents.end(); j++) - { - // get the range of the buffer_extent - start = (*j).first; - length = (*j).second; - // fire up the Map task - start_map_task(map_command, input_filename, start, length, tcpaddr); - } - } - return 0; -} - -// Fires up the map task. -// For the moment, all it does is echo the command line, not run it. -int start_map_task(const char* command, const char* input_filename, - long start, long length, sockaddr_in ip_address) -{ - string ip_addr_string(inet_ntoa(ip_address.sin_addr)); - - - - - - cout << "ssh " << ip_addr_string << " " << command - << " " << input_filename << " " << start << " " << length << endl; - return 0; -} - - - -void usage(const char* name) { - cout << "usage: " << name << " inputfile map_task reduce_task" << endl; - cout << "inputfile must be a valid path in the running Ceph filesystem." << endl; - cout << "map_task should be given with an absolute path, and be present on "; - cout << "the REGULAR filesystem every node." 
<< endl; - cout << "reduce_task need be present on this node only." << endl; -} - - - - diff --git a/trunk/ceph/active/activemaster.h b/trunk/ceph/active/activemaster.h deleted file mode 100644 index 524138e253c7b..0000000000000 --- a/trunk/ceph/active/activemaster.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * This is the master executable to start up - * a compute task across several nodes. - * - * - */ - - -//#include -#include "utility.h" - -int start_map_task(const char* command, const char* input_filename, - long start, long length, tcpaddr_t ip_address); - -void usage(const char* name); - -//Client* startCephClient(); -//void kill_client(Client* client); diff --git a/trunk/ceph/active/activetaskd.cc b/trunk/ceph/active/activetaskd.cc deleted file mode 100644 index ec9f290543093..0000000000000 --- a/trunk/ceph/active/activetaskd.cc +++ /dev/null @@ -1,241 +0,0 @@ -/* - * This is a daemon for receiving and executing commands for compute tasks on an OSD. - * - * The daemon uses skeleton code from - * http://www.linuxprofilm.com/articles/linux-daemon-howto.html. The - * site is no longer up, but can be seen through the archive.org. - * Networking code is based off examples from Stevens' UNIX Network Programming. - */ - -#include "activetaskd.h" - - -#define SERVER - -#undef SERVER - -int main(int argc, const char* argv[]) { - - /* Our process ID and Session ID */ - pid_t pid, sid; - - /* Fork off the parent process */ - pid = fork(); - if (pid < 0) { - exit(EXIT_FAILURE); - } - /* If we got a good PID, then - we can exit the parent process. 
*/ - if (pid > 0) { - exit(EXIT_SUCCESS); - } - - /* Change the file mode mask */ - umask(0); - - /* Open any logs here */ - - /* Create a new SID for the child process */ - sid = setsid(); - if (sid < 0) { - /* Log the failure */ - exit(EXIT_FAILURE); - } - - - /* Change the current working directory */ - if ((chdir("/")) < 0) { - /* Log the failure */ - exit(EXIT_FAILURE); - } - - /* Close out the standard file descriptors */ - close(STDIN_FILENO); - close(STDOUT_FILENO); - close(STDERR_FILENO); - - /* Daemon-specific initialization goes here */ - - - - /* Set up TCP server */ - int sockfd, newsockfd, childpid; - socklen_t clilen; - struct sockaddr_in cli_addr, serv_addr; - - const char *pname = argv[0]; // process name - - // Open a TCP socket - if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) - exit(-1); - //err_dump("server: can't open stream socket"); - - // set up the port - bzero((char*) &serv_addr, sizeof(serv_addr)); - serv_addr.sin_family = AF_INET; - serv_addr.sin_addr.s_addr = htonl(INADDR_ANY); - serv_addr.sin_port = htons(SERV_TCP_PORT); - - if (bind(sockfd, (struct sockaddr *) &serv_addr, sizeof(serv_addr)) < 0) - exit(-1); - //err_dump("server: can't bind local address"); - - if(listen(sockfd, SOMAXCONN) < 0) - exit(-1); - //err_dump("server: listening error"); - - /* The Big Loop */ - while (1) { - - // wait for a message and fork off a child process to handle it - clilen = sizeof(cli_addr); - newsockfd = accept(sockfd, - (struct sockaddr *) &cli_addr, - &clilen); - - if (newsockfd < 0) - exit(-1); - //err_dump("server: accept error"); - - if ( (childpid = fork()) < 0) - exit(-1); - // err_dump("server: fork error"); - - else if (childpid == 0) { // child process - close(sockfd); - //str_echo(newsockfd); - str_run(newsockfd); - // insert code to process the request - exit(0); - } - - close (newsockfd); // parent - - //sleep(30); /* wait 30 seconds */ - } - exit(EXIT_SUCCESS); -} - - -// Echo a stream socket message back to the sender. 
- -void str_echo(int sockfd) { - - int n; - char line[MAXLINE]; - - while(true) { - - // read from the stream - n = readline(sockfd, line, MAXLINE); - - if (0 == n) - return; // connection is terminated - else if (n < 0) - //err_dump("str_echo: readline error"); - exit(-1); - - // write back to the stream - if (n != writen(sockfd, line, n)) - //err_dump("str_echo: writen error"); - exit(-1); - } -} - - -void str_ack(int sockfd) { - - int n; - char line[MAXLINE]; - char *ack = "ack"; - - while(true) { - - // read from the stream - n = readline(sockfd, line, MAXLINE); - - if (0 == n) - return; // connection is terminated - else if (n < 0) - //err_dump("str_echo: readline error"); - exit(-1); - - // write back to the stream - if (4 != writen(sockfd, "ack\n", 4)) - //err_dump("str_echo: writen error"); - exit(-1); - } -} - - - - -// Read command lines from the socket and execute them - -void str_run(int sockfd) { - - int n; - char line[MAXLINE]; - char* error_msg = "str_run: No command interpreter found\n"; - char* ack_msg = "Running command... "; - char* commit_msg = "Command executed!\n"; - - while(true) { - - // read from the stream - n = readline(sockfd, line, MAXLINE); - - if (0 == n) - return; // connection is terminated - else if (n < 0) - //err_dump("str_echo: readline error"); - exit(-1); - - if (system(NULL)) { - writen(sockfd, ack_msg, strlen(ack_msg)); - system(line); - writen(sockfd, commit_msg, strlen(commit_msg)); - } - else if (strlen(error_msg) != writen(sockfd, error_msg, strlen(error_msg))) - //err_dump("str_echo: writen error"); - exit(-1); - } -} - - -// take a filename and copy it from Ceph to a local directory - -void str_copytolocal(int sockfd) { - - int n; - char line[MAXLINE]; - char* error_msg = "str_copy: No command interpreter found\n"; - char* ack_msg = "Running command... 
"; - char* commit_msg = "Command executed!\n"; - char* temp_dir = "/tmp"; - - - while(true) { - - // read from the stream - n = readline(sockfd, line, MAXLINE); - - if (0 == n) - return; // connection is terminated - else if (n < 0) - //err_dump("str_echo: readline error"); - exit(-1); - - if (system(NULL)) { - writen(sockfd, ack_msg, strlen(ack_msg)); - system(line); - writen(sockfd, commit_msg, strlen(commit_msg)); - } - else if (strlen(error_msg) != writen(sockfd, error_msg, strlen(error_msg))) - //err_dump("str_echo: writen error"); - exit(-1); - } -} - - - diff --git a/trunk/ceph/active/activetaskd.h b/trunk/ceph/active/activetaskd.h deleted file mode 100644 index fc5cec923c4bc..0000000000000 --- a/trunk/ceph/active/activetaskd.h +++ /dev/null @@ -1,14 +0,0 @@ -#include "inet.h" -#include "common.h" -#include "utility.h" -#include "client/Client.h" - - -// The port number is "osdd" on a telephone keypad. -#define SERV_TCP_PORT 6733 - -#define MAXLINE 512 - -void str_echo(int sockfd); -void str_ack(int sockfd); -void str_run(int sockfd); diff --git a/trunk/ceph/active/client_init.cc b/trunk/ceph/active/client_init.cc deleted file mode 100644 index 8b137891791fe..0000000000000 --- a/trunk/ceph/active/client_init.cc +++ /dev/null @@ -1 +0,0 @@ - diff --git a/trunk/ceph/active/client_init.h b/trunk/ceph/active/client_init.h deleted file mode 100644 index 139597f9cb07c..0000000000000 --- a/trunk/ceph/active/client_init.h +++ /dev/null @@ -1,2 +0,0 @@ - - diff --git a/trunk/ceph/active/echotestclient.cc b/trunk/ceph/active/echotestclient.cc deleted file mode 100644 index 2b2d15e7ca5cb..0000000000000 --- a/trunk/ceph/active/echotestclient.cc +++ /dev/null @@ -1,74 +0,0 @@ -/* - * This is merely a test of an echo server; it's an early step in - * building up the Ceph distributed compute service. This is - * discardable once the next stage is up and running. - * - * Code is based off examples in Stevens' "Unix Network Programming". 
- */ - -#include "echotestclient.h" - -int main(int argc, char* argv[]) { - - int sockfd; - struct sockaddr_in serv_addr; - - char* pname = argv[0]; - - bzero((char *) &serv_addr, sizeof(serv_addr)); - serv_addr.sin_family = AF_INET; - serv_addr.sin_addr.s_addr = inet_addr(SERV_HOST_ADDR); - serv_addr.sin_port = htons(SERV_TCP_PORT); - - - // open a TCP socket - if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { - printf("client: can't open stream socket"); - exit (-1); - } - - // connect to the server. - if (connect(sockfd, (struct sockaddr *) &serv_addr, - sizeof(serv_addr)) < 0) { - printf("client: can't connect to server"); - exit (-1); - } - - // start the test echoer - str_cli(stdin, sockfd); - - - close (sockfd); - exit(0); -} - - -void str_cli(FILE *fp, int sockfd) { - - int n; - char sendline[MAXLINE], recvline[MAXLINE + 1]; - - // read from the fp and write to the socket; - // then read from the socket and write to stdout - while (fgets(sendline, MAXLINE, fp) != NULL) { - - n = strlen(sendline); - if (writen(sockfd, sendline, n) != n) { - printf("str_cli: writen error on socket"); - exit(-1); - } - n = readline(sockfd, recvline, MAXLINE); - if (n < 0) { - printf("str_cli: readline error"); - exit(-1); - } - recvline[n] = 0; - fputs(recvline, stdout); - } - - if (ferror(fp)) { - printf("str_cli: error reading file"); - exit(-1); - } - -} diff --git a/trunk/ceph/active/echotestclient.h b/trunk/ceph/active/echotestclient.h deleted file mode 100644 index 9b26416640bc2..0000000000000 --- a/trunk/ceph/active/echotestclient.h +++ /dev/null @@ -1,10 +0,0 @@ -#include "inet.h" -#include "common.h" -#include "socket_utility.h" - -#define SERV_HOST_ADDR "128.114.57.143" //issdm-8 -#define SERV_TCP_PORT 6733 -#define MAXLINE 512 - -void str_cli(FILE *fp, int sockfd); - diff --git a/trunk/ceph/active/inet.h b/trunk/ceph/active/inet.h deleted file mode 100644 index 385fa915f9dc7..0000000000000 --- a/trunk/ceph/active/inet.h +++ /dev/null @@ -1,9 +0,0 @@ -/* - * 
Generic TCP/IP definitions - */ - -#include -#include -#include -#include -#include diff --git a/trunk/ceph/active/trivial_task.cc b/trunk/ceph/active/trivial_task.cc deleted file mode 100644 index 7a72ecb277c4b..0000000000000 --- a/trunk/ceph/active/trivial_task.cc +++ /dev/null @@ -1,50 +0,0 @@ -#include "trivial_task.h" - -void start_trivial_task (const char* ceph_filename, const char* local_filename, - off_t offset, off_t length) { - // Don't bother to copy the file to disk. Read the file directly from Ceph, - // and add up all the bytes. - // Write the total to the local file as a string. - Client * client = startCephClient(); - - bufferptr bp(CHUNK); - - // get the source file's size. Sanity-check the request range. - struct stat st; - int r = client->lstat(ceph_filename, &st); - assert (r == 0); - - off_t src_total_size = st.st_size; - if (src_total_size < offset + length) { - cerr << "Error in copy ExtentToLocalFile: offset + length = " << offset << " + " << length - << " = " + (offset + length) << ", source file size is only " << src_total_size << endl; - exit(-1); - } - off_t remaining = length; - - // open the file and seek to the start position - cerr << "start_trivial_task: opening the source file and seeking " << endl; - - int fh_ceph = client->open(ceph_filename, O_RDONLY); - assert (fh_ceph > -1); - r = client->lseek(fh_ceph, offset, SEEK_SET); - assert (r == offset); - - int counter = 0; - // read through the extent and add up the bytes - cerr << "start_trivial_task: counting up bytes" << endl; - char* bp_c = bp.c_str(); - while (remaining > 0) { - off_t got = client->read(fh_ceph, bp_c, MIN(remaining,CHUNK), -1); - assert(got > 0); - remaining -= got; - for (off_t i = 0; i < got; ++i) { - counter += (unsigned int)(bp_c[i]); - } - } - cerr << "start_trivial_task: Done! 
Answer is " << counter << endl; - client->close(fh_ceph); - - //assert(0); -} - diff --git a/trunk/ceph/active/trivial_task.h b/trunk/ceph/active/trivial_task.h deleted file mode 100644 index ce9b47c82ceb6..0000000000000 --- a/trunk/ceph/active/trivial_task.h +++ /dev/null @@ -1,12 +0,0 @@ -// Shared library for the trivial task of adding up all the bytes in a file - -//#include "inet.h" -#include "common.h" -#include "utility.h" -#include "client/Client.h" - - -extern "C" void start_trivial_task (const char* ceph_filename, - const char* local_filename, - off_t offset, off_t length); - diff --git a/trunk/ceph/cfuse.cc b/trunk/ceph/cfuse.cc deleted file mode 100644 index 3c157fefadf89..0000000000000 --- a/trunk/ceph/cfuse.cc +++ /dev/null @@ -1,88 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "client/Client.h" -#include "client/fuse.h" -#include "client/fuse_ll.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - -#ifndef DARWIN -#include -#endif // DARWIN - -#include -#include -#include - -int main(int argc, char **argv, char *envp[]) { - - //cerr << "cfuse starting " << myrank << "/" << world << std::endl; - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - // args for fuse - vec_to_argv(args, argc, argv); - - // FUSE will chdir("/"); be ready. 
- g_conf.use_abspaths = true; - - if (g_conf.clock_tare) g_clock.tare(); - - // load monmap - MonMap monmap; - int r = monmap.read(".ceph_monmap"); - assert(r >= 0); - - // start up network - rank.start_rank(); - - // start client - Client *client = new Client(rank.register_entity(entity_name_t::CLIENT()), &monmap); - client->init(); - - // start up fuse - // use my argc, argv (make sure you pass a mount point!) - cout << "mounting" << std::endl; - client->mount(); - - //cerr << "starting fuse on pid " << getpid() << std::endl; - if (g_conf.fuse_ll) - ceph_fuse_ll_main(client, argc, argv); - else - ceph_fuse_main(client, argc, argv); - //cerr << "fuse finished on pid " << getpid() << std::endl; - - client->unmount(); - cout << "unmounted" << std::endl; - client->shutdown(); - - delete client; - - // wait for messenger to finish - rank.wait(); - - return 0; -} - diff --git a/trunk/ceph/client/FileCache.cc b/trunk/ceph/client/FileCache.cc deleted file mode 100644 index 1adec4aaabee7..0000000000000 --- a/trunk/ceph/client/FileCache.cc +++ /dev/null @@ -1,266 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "include/types.h" - -#include "FileCache.h" -#include "osdc/ObjectCacher.h" - -#include "msg/Messenger.h" - -#include "config.h" -#define dout(x) if (x <= g_conf.debug_client) *_dout << dbeginl << g_clock.now() << " " << oc->objecter->messenger->get_myname() << ".filecache " -#define derr(x) if (x <= g_conf.debug_client) *_derr << dbeginl << g_clock.now() << " " << oc->objecter->messenger->get_myname() << ".filecache " - - - -// flush/release/clean - -void FileCache::flush_dirty(Context *onflush) -{ - if (oc->flush_set(inode.ino, onflush)) { - onflush->finish(0); - delete onflush; - } -} - -off_t FileCache::release_clean() -{ - return oc->release_set(inode.ino); -} - -bool FileCache::is_cached() -{ - return oc->set_is_cached(inode.ino); -} - -bool FileCache::is_dirty() -{ - return oc->set_is_dirty_or_committing(inode.ino); -} - -void FileCache::empty(Context *onempty) -{ - off_t unclean = release_clean(); - bool clean = oc->flush_set(inode.ino, onempty); - assert(!unclean == clean); - - if (clean) { - onempty->finish(0); - delete onempty; - } -} - - -void FileCache::tear_down() -{ - off_t unclean = release_clean(); - if (unclean) { - dout(0) << "tear_down " << unclean << " unclean bytes, purging" << dendl; - oc->purge_set(inode.ino); - } -} - -// truncate - -void FileCache::truncate(off_t olds, off_t news) -{ - dout(5) << "truncate " << olds << " -> " << news << dendl; - - // map range to objects - list ls; - oc->filer.file_to_extents(inode, news, olds-news, ls); - oc->truncate_set(inode.ino, ls); -} - -// caps - -class C_FC_CheckCaps : public Context { - FileCache *fc; -public: - C_FC_CheckCaps(FileCache *f) : fc(f) {} - void finish(int r) { - fc->check_caps(); - } -}; - -void FileCache::set_caps(int caps, Context *onimplement) -{ - if (onimplement) { - dout(10) << "set_caps setting onimplement context for " << cap_string(caps) << dendl; - assert(latest_caps & ~caps); // we should be losing caps. 
- caps_callbacks[caps].push_back(onimplement); - } - - latest_caps = caps; - check_caps(); - - // kick waiters? (did we gain caps?) - if (can_read() && !waitfor_read.empty()) - for (set::iterator p = waitfor_read.begin(); - p != waitfor_read.end(); - ++p) - (*p)->Signal(); - if (can_write() && !waitfor_write.empty()) - for (set::iterator p = waitfor_write.begin(); - p != waitfor_write.end(); - ++p) - (*p)->Signal(); - -} - -int FileCache::get_used_caps() -{ - int used = 0; - if (num_reading) used |= CAP_FILE_RD; - if (oc->set_is_cached(inode.ino)) used |= CAP_FILE_RDCACHE; - if (num_writing) used |= CAP_FILE_WR; - if (oc->set_is_dirty_or_committing(inode.ino)) used |= CAP_FILE_WRBUFFER; - return used; -} - -void FileCache::check_caps() -{ - // calc used - int used = get_used_caps(); - dout(10) << "check_caps used was " << cap_string(used) << dendl; - - // try to implement caps? - // BUG? latest_caps, not least caps i've seen? - if ((latest_caps & CAP_FILE_RDCACHE) == 0 && - (used & CAP_FILE_RDCACHE)) - release_clean(); - if ((latest_caps & CAP_FILE_WRBUFFER) == 0 && - (used & CAP_FILE_WRBUFFER)) - flush_dirty(new C_FC_CheckCaps(this)); - - used = get_used_caps(); - dout(10) << "check_caps used now " << cap_string(used) << dendl; - - // check callbacks - map >::iterator p = caps_callbacks.begin(); - while (p != caps_callbacks.end()) { - if (used == 0 || (~(p->first) & used) == 0) { - // implemented. - dout(10) << "used is " << cap_string(used) - << ", caps " << cap_string(p->first) << " implemented, doing callback(s)" << dendl; - finish_contexts(p->second); - map >::iterator o = p; - p++; - caps_callbacks.erase(o); - } else { - dout(10) << "used is " << cap_string(used) - << ", caps " << cap_string(p->first) << " not yet implemented" << dendl; - p++; - } - } -} - - - -// read/write - -int FileCache::read(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock) -{ - int r = 0; - - // can i read? 
- while ((latest_caps & CAP_FILE_RD) == 0) { - dout(10) << "read doesn't have RD cap, blocking" << dendl; - Cond c; - waitfor_read.insert(&c); - c.Wait(client_lock); - waitfor_read.erase(&c); - } - - // inc reading counter - num_reading++; - - if (latest_caps & CAP_FILE_RDCACHE) { - // read (and block) - Cond cond; - bool done = false; - int rvalue = 0; - C_Cond *onfinish = new C_Cond(&cond, &done, &rvalue); - - r = oc->file_read(inode, offset, size, &blist, onfinish); - - if (r == 0) { - // block - while (!done) - cond.Wait(client_lock); - r = rvalue; - } else { - // it was cached. - delete onfinish; - } - } else { - r = oc->file_atomic_sync_read(inode, offset, size, &blist, client_lock); - } - - // dec reading counter - num_reading--; - - if (num_reading == 0 && !caps_callbacks.empty()) - check_caps(); - - return r; -} - -void FileCache::write(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock) -{ - // can i write - while ((latest_caps & CAP_FILE_WR) == 0) { - dout(10) << "write doesn't have WR cap, blocking" << dendl; - Cond c; - waitfor_write.insert(&c); - c.Wait(client_lock); - waitfor_write.erase(&c); - } - - // inc writing counter - num_writing++; - - if (size > 0) { - if (latest_caps & CAP_FILE_WRBUFFER) { // caps buffered write? - // wait? (this may block!) - oc->wait_for_write(size, client_lock); - - // async, caching, non-blocking. - oc->file_write(inode, offset, size, blist); - } else { - // atomic, synchronous, blocking. 
- oc->file_atomic_sync_write(inode, offset, size, blist, client_lock); - } - } - - // dec writing counter - num_writing--; - if (num_writing == 0 && !caps_callbacks.empty()) - check_caps(); -} - -bool FileCache::all_safe() -{ - return !oc->set_is_dirty_or_committing(inode.ino); -} - -void FileCache::add_safe_waiter(Context *c) -{ - bool safe = oc->commit_set(inode.ino, c); - if (safe) { - c->finish(0); - delete c; - } -} diff --git a/trunk/ceph/client/FileCache.h b/trunk/ceph/client/FileCache.h deleted file mode 100644 index 8d6e08146b508..0000000000000 --- a/trunk/ceph/client/FileCache.h +++ /dev/null @@ -1,85 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __FILECACHE_H -#define __FILECACHE_H - -#include -using std::iostream; - -#include "common/Cond.h" -#include "mds/Capability.h" - -class ObjectCacher; - -class FileCache { - ObjectCacher *oc; - inode_t inode; - - // caps - int latest_caps; - map > caps_callbacks; - - int num_reading; - int num_writing; - //int num_unsafe; - - // waiters - set waitfor_read; - set waitfor_write; - - bool waitfor_release; - - public: - FileCache(ObjectCacher *_oc, inode_t _inode) : - oc(_oc), - inode(_inode), - latest_caps(0), - num_reading(0), num_writing(0),// num_unsafe(0), - waitfor_release(false) {} - ~FileCache() { - tear_down(); - } - - // waiters/waiting - bool can_read() { return latest_caps & CAP_FILE_RD; } - bool can_write() { return latest_caps & CAP_FILE_WR; } - bool all_safe();// { return num_unsafe == 0; } - - void add_safe_waiter(Context *c); - - void truncate(off_t olds, off_t news); - - // ... 
- void flush_dirty(Context *onflush=0); - off_t release_clean(); - void empty(Context *onempty=0); - bool is_empty() { return !(is_cached() || is_dirty()); } - bool is_cached(); - bool is_dirty(); - - void tear_down(); - - int get_caps() { return latest_caps; } - int get_used_caps(); - void set_caps(int caps, Context *onimplement=0); - void check_caps(); - - int read(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock); // may block. - void write(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock); // may block. - -}; - - -#endif diff --git a/trunk/ceph/client/Trace.cc b/trunk/ceph/client/Trace.cc deleted file mode 100644 index 31bb1c4cf5c4a..0000000000000 --- a/trunk/ceph/client/Trace.cc +++ /dev/null @@ -1,83 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "Trace.h" - -#include -#include -#include -#include -using namespace __gnu_cxx; - -#include "common/Mutex.h" - -#include "config.h" - -#include -#include -#include - - - - - -void Trace::start() -{ - //cout << "start" << std::endl; - delete fs; - - fs = new ifstream(); - fs->open(filename); - if (!fs->is_open()) { - generic_dout(0) << "** unable to open trace file " << filename << dendl; - assert(0); - } - generic_dout(2) << "opened traced file '" << filename << "'" << dendl; - - // read first line - getline(*fs, line); - //cout << "first line is " << line << std::endl; - - _line = 1; -} - -const char *Trace::peek_string(char *buf, const char *prefix) -{ - //if (prefix) cout << "prefix '" << prefix << "' line '" << line << "'" << std::endl; - if (prefix && - strstr(line.c_str(), "/prefix") == line.c_str()) { - strcpy(buf, prefix); - strcpy(buf + strlen(prefix), - line.c_str() + strlen("/prefix")); - } else { - strcpy(buf, line.c_str()); - } - return buf; -} - - -const char *Trace::get_string(char *buf, const char *prefix) -{ - peek_string(buf, prefix); - - //cout << "buf is " << buf << std::endl; - // read next line (and detect eof early) - _line++; - getline(*fs, line); - //cout << "next line is " << line << std::endl; - - return buf; -} diff --git a/trunk/ceph/client/Trace.h b/trunk/ceph/client/Trace.h deleted file mode 100644 index 97821f4e95e56..0000000000000 --- a/trunk/ceph/client/Trace.h +++ /dev/null @@ -1,63 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __CLIENT_TRACE_H -#define __CLIENT_TRACE_H - -#include -#include -#include -#include -using std::list; -using std::string; -using std::ifstream; - -/* - - this class is more like an iterator over a constant tokenlist (which - is protected by a mutex, see Trace.cc) - - */ - -class Trace { - int _line; - const char *filename; - ifstream *fs; - string line; - - public: - Trace(const char* f) : filename(f), fs(0) {} - ~Trace() { - delete fs; - } - - int get_line() { return _line; } - - void start(); - - const char *peek_string(char *buf, const char *prefix); - const char *get_string(char *buf, const char *prefix); - - __int64_t get_int() { - char buf[20]; - return atoll(get_string(buf, 0)); - } - bool end() { - return !fs || fs->eof(); - //return _cur == _end; - } -}; - -#endif diff --git a/trunk/ceph/client/fuse.cc b/trunk/ceph/client/fuse.cc deleted file mode 100644 index 64198dc41df51..0000000000000 --- a/trunk/ceph/client/fuse.cc +++ /dev/null @@ -1,306 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -/* - FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi - - This program can be distributed under the terms of the GNU GPL. - See the file COPYING. 
-*/ - - -// fuse crap -#ifdef linux -/* For pread()/pwrite() */ -#define _XOPEN_SOURCE 500 -#endif - -#define FUSE_USE_VERSION 26 - -#include -#include -#include -#include -#include -#include -#include -#include - - -// ceph stuff -#include "include/types.h" - -#include "Client.h" - -#include "config.h" - -// globals -static Client *client; // the ceph client - - - -// ------ -// fuse hooks - -static int ceph_getattr(const char *path, struct stat *stbuf) -{ - return client->lstat(path, stbuf); -} - -static int ceph_readlink(const char *path, char *buf, size_t size) -{ - int res; - - res = client->readlink(path, buf, size - 1); - if (res < 0) return res; - - buf[res] = '\0'; - return 0; -} - -static int ceph_mknod(const char *path, mode_t mode, dev_t rdev) -{ - return client->mknod(path, mode); -} - -static int ceph_mkdir(const char *path, mode_t mode) -{ - return client->mkdir(path, mode); -} - -static int ceph_unlink(const char *path) -{ - return client->unlink(path); -} - -static int ceph_rmdir(const char *path) -{ - return client->rmdir(path); -} - -static int ceph_symlink(const char *from, const char *to) -{ - return client->symlink(from, to); -} - -static int ceph_rename(const char *from, const char *to) -{ - return client->rename(from, to); -} - -static int ceph_link(const char *from, const char *to) -{ - return client->link(from, to); -} - -static int ceph_chmod(const char *path, mode_t mode) -{ - return client->chmod(path, mode); -} - -static int ceph_chown(const char *path, uid_t uid, gid_t gid) -{ - return client->chown(path, uid, gid); -} - -static int ceph_truncate(const char *path, off_t size) -{ - return client->truncate(path, size); -} - -static int ceph_utime(const char *path, struct utimbuf *buf) -{ - return client->utime(path, buf); -} - - -// ------------------ -// file i/o - -static int ceph_open(const char *path, struct fuse_file_info *fi) -{ - int res; - - res = client->open(path, fi->flags, 0); - if (res < 0) return res; - fi->fh = res; - 
return 0; // fuse wants 0 onsucess -} - -static int ceph_read(const char *path, char *buf, size_t size, off_t offset, - struct fuse_file_info *fi) -{ - int fd = fi->fh; - return client->read(fd, buf, size, offset); -} - -static int ceph_write(const char *path, const char *buf, size_t size, - off_t offset, struct fuse_file_info *fi) -{ - int fd = fi->fh; - return client->write(fd, buf, size, offset); -} - -static int ceph_flush(const char *path, struct fuse_file_info *fi) -{ - //int fh = fi->fh; - //return client->flush(fh); - return 0; -} - -static int ceph_statfs(const char *path, struct statvfs *stbuf) -{ - return client->statfs(path, stbuf); -} - -static int ceph_release(const char *path, struct fuse_file_info *fi) -{ - int fd = fi->fh; - int r = client->close(fd); // close the file - return r; -} - -static int ceph_fsync(const char *path, int isdatasync, - struct fuse_file_info *fi) -{ - int fd = fi->fh; - return client->fsync(fd, isdatasync ? true:false); -} - - -// --------------------- -// directory i/o - -static int ceph_opendir(const char *path, struct fuse_file_info *fi) -{ - DIR *dirp; - int r = client->opendir(path, &dirp); - if (r < 0) return r; - fi->fh = (uint64_t)(void*)dirp; - return 0; -} - -static int ceph_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t off, fuse_file_info *fi) -{ - DIR *dirp = (DIR*)fi->fh; - - client->seekdir(dirp, off); - - int res = 0; - struct dirent de; - struct stat st; - int stmask = 0; - while (res == 0) { - int r = client->readdirplus_r(dirp, &de, &st, &stmask); - if (r != 0) break; - int stneed = STAT_MASK_INO | STAT_MASK_TYPE; - res = filler(buf, - de.d_name, - ((stmask & stneed) == stneed) ? 
&st:0, - client->telldir(dirp)); - } - return 0; -} - -static int ceph_releasedir(const char *path, struct fuse_file_info *fi) -{ - DIR *dirp = (DIR*)fi->fh; - int r = client->closedir(dirp); // close the file - return r; -} - - - - - -static struct fuse_operations ceph_oper = { - getattr: ceph_getattr, - readlink: ceph_readlink, - getdir: 0, - mknod: ceph_mknod, - mkdir: ceph_mkdir, - unlink: ceph_unlink, - rmdir: ceph_rmdir, - symlink: ceph_symlink, - rename: ceph_rename, - link: ceph_link, - chmod: ceph_chmod, - chown: ceph_chown, - truncate: ceph_truncate, - utime: ceph_utime, - open: ceph_open, - read: ceph_read, - write: ceph_write, - statfs: ceph_statfs, - flush: ceph_flush, - release: ceph_release, - fsync: ceph_fsync, - setxattr: 0, - getxattr: 0, - listxattr: 0, - removexattr: 0, - opendir: ceph_opendir, - readdir: ceph_readdir, - releasedir: ceph_releasedir -}; - - -int ceph_fuse_main(Client *c, int argc, char *argv[]) -{ - // init client - client = c; - - // set up fuse argc/argv - int newargc = 0; - char **newargv = (char **) malloc((argc + 10) * sizeof(char *)); - newargv[newargc++] = argv[0]; - - // allow other (all!) users to see my file system - // NOTE: echo user_allow_other >> /etc/fuse.conf - // NB: seems broken on Darwin -#ifndef DARWIN - newargv[newargc++] = "-o"; - newargv[newargc++] = "allow_other"; -#endif // DARWIN - - // use inos - newargv[newargc++] = "-o"; - newargv[newargc++] = "use_ino"; - - // large reads, direct_io (no kernel cachine) - //newargv[newargc++] = "-o"; - //newargv[newargc++] = "large_read"; - if (g_conf.fuse_direct_io) { - newargv[newargc++] = "-o"; - newargv[newargc++] = "direct_io"; - } - - // disable stupid fuse unlink hiding thing - newargv[newargc++] = "-o"; - newargv[newargc++] = "hard_remove"; - - // force into foreground - // -> we can watch stdout this way!! - newargv[newargc++] = "-f"; - - // copy rest of cmdline (hopefully, the mount point!) 
- for (int argctr = 1; argctr < argc; argctr++) newargv[newargc++] = argv[argctr]; - - // go fuse go - cout << "ok, calling fuse_main" << std::endl; - int r = fuse_main(newargc, newargv, &ceph_oper, 0); - return r; -} diff --git a/trunk/ceph/client/fuse.h b/trunk/ceph/client/fuse.h deleted file mode 100644 index dfacbaa4fdd85..0000000000000 --- a/trunk/ceph/client/fuse.h +++ /dev/null @@ -1,24 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -/* ceph_fuse_main - * - start up fuse glue, attached to Client* cl. - * - argc, argv should include a mount point, and - * any weird fuse options you want. by default, - * we will put fuse in the foreground so that it - * won't fork and we can see stdout. - */ -int ceph_fuse_main(Client *cl, int argc, char *argv[]); diff --git a/trunk/ceph/client/fuse_ll.cc b/trunk/ceph/client/fuse_ll.cc deleted file mode 100644 index f1f92b0cd01b3..0000000000000 --- a/trunk/ceph/client/fuse_ll.cc +++ /dev/null @@ -1,397 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#define FUSE_USE_VERSION 26 - -#include -#include -#include -#include -#include -#include -#include -#include - -// ceph -#include "include/types.h" -#include "Client.h" -#include "config.h" - -static Client *client; - - -static void ceph_ll_lookup(fuse_req_t req, fuse_ino_t parent, const char *name) -{ - struct fuse_entry_param fe; - int stmask; - - memset(&fe, 0, sizeof(fe)); - stmask = client->ll_lookup(parent, name, &fe.attr); - if (stmask >= 0) { - fe.ino = fe.attr.st_ino; - fuse_reply_entry(req, &fe); - } else { - fuse_reply_err(req, ENOENT); - } -} - -static void ceph_ll_forget(fuse_req_t req, fuse_ino_t ino, long unsigned nlookup) -{ - client->ll_forget(ino, nlookup); - fuse_reply_none(req); -} - -static void ceph_ll_getattr(fuse_req_t req, fuse_ino_t ino, - struct fuse_file_info *fi) -{ - struct stat stbuf; - - (void) fi; - - if (client->ll_getattr(ino, &stbuf) == 0) - fuse_reply_attr(req, &stbuf, 0); - else - fuse_reply_err(req, ENOENT); -} - -static void ceph_ll_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, - int to_set, struct fuse_file_info *fi) -{ - int r = client->ll_setattr(ino, attr, to_set); - if (r == 0) - fuse_reply_attr(req, attr, 0); - else - fuse_reply_err(req, -r); -} - -static void ceph_ll_opendir(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) -{ - void *dirp; - int r = client->ll_opendir(ino, &dirp); - if (r >= 0) { - fi->fh = (long)dirp; - fuse_reply_open(req, fi); - } else { - fuse_reply_err(req, -r); - } -} - -static void ceph_ll_readlink(fuse_req_t req, fuse_ino_t ino) -{ - const char *value; - int r = client->ll_readlink(ino, &value); - if (r == 0) - fuse_reply_readlink(req, value); - else - fuse_reply_err(req, -r); -} - -static void ceph_ll_mknod(fuse_req_t req, fuse_ino_t parent, const char *name, - mode_t mode, dev_t rdev) -{ - struct fuse_entry_param fe; - memset(&fe, 0, sizeof(fe)); - - int r = client->ll_mknod(parent, name, mode, rdev, &fe.attr); - if (r == 0) { - fe.ino = fe.attr.st_ino; - 
fuse_reply_entry(req, &fe); - } else { - fuse_reply_err(req, -r); - } -} - -static void ceph_ll_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name, - mode_t mode) -{ - struct fuse_entry_param fe; - memset(&fe, 0, sizeof(fe)); - - int r = client->ll_mkdir(parent, name, mode, &fe.attr); - if (r == 0) { - fe.ino = fe.attr.st_ino; - fuse_reply_entry(req, &fe); - } else { - fuse_reply_err(req, -r); - } -} - -static void ceph_ll_unlink(fuse_req_t req, fuse_ino_t parent, const char *name) -{ - int r = client->ll_unlink(parent, name); - fuse_reply_err(req, -r); -} - -static void ceph_ll_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name) -{ - int r = client->ll_rmdir(parent, name); - fuse_reply_err(req, -r); -} - -static void ceph_ll_symlink(fuse_req_t req, const char *existing, fuse_ino_t parent, const char *name) -{ - struct fuse_entry_param fe; - memset(&fe, 0, sizeof(fe)); - - int r = client->ll_symlink(parent, name, existing, &fe.attr); - if (r == 0) { - fe.ino = fe.attr.st_ino; - fuse_reply_entry(req, &fe); - } else { - fuse_reply_err(req, -r); - } -} - -static void ceph_ll_rename(fuse_req_t req, fuse_ino_t parent, const char *name, - fuse_ino_t newparent, const char *newname) -{ - int r = client->ll_rename(parent, name, newparent, newname); - fuse_reply_err(req, -r); -} - -static void ceph_ll_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t newparent, - const char *newname) -{ - struct fuse_entry_param fe; - memset(&fe, 0, sizeof(fe)); - - int r = client->ll_link(ino, newparent, newname, &fe.attr); - if (r == 0) { - fe.ino = fe.attr.st_ino; - fuse_reply_entry(req, &fe); - } else { - fuse_reply_err(req, -r); - } -} - -static void ceph_ll_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) -{ - Fh *fh; - int r = client->ll_open(ino, fi->flags, &fh); - if (r == 0) { - fi->fh = (long)fh; - fuse_reply_open(req, fi); - } else { - fuse_reply_err(req, -r); - } -} - -static void ceph_ll_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, - 
struct fuse_file_info *fi) -{ - Fh *fh = (Fh*)fi->fh; - bufferlist bl; - int r = client->ll_read(fh, off, size, &bl); - if (r >= 0) - fuse_reply_buf(req, bl.c_str(), bl.length()); - else - fuse_reply_err(req, -r); -} - -static void ceph_ll_write(fuse_req_t req, fuse_ino_t ino, const char *buf, - size_t size, off_t off, struct fuse_file_info *fi) -{ - Fh *fh = (Fh*)fi->fh; - int r = client->ll_write(fh, off, size, buf); - if (r >= 0) - fuse_reply_write(req, r); - else - fuse_reply_err(req, -r); -} - -static void ceph_ll_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) -{ - // NOOP - fuse_reply_err(req, 0); -} - -static void ceph_ll_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) -{ - Fh *fh = (Fh*)fi->fh; - int r = client->ll_release(fh); - fuse_reply_err(req, -r); -} - -static void ceph_ll_fsync(fuse_req_t req, fuse_ino_t ino, int datasync, - struct fuse_file_info *fi) -{ - -} - -static void ceph_ll_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, - off_t off, struct fuse_file_info *fi) -{ - (void) fi; - - // buffer - char *buf; - size_t pos = 0; - - buf = new char[size]; - if (!buf) { - fuse_reply_err(req, ENOMEM); - return; - } - - DIR *dirp = (DIR*)fi->fh; - client->seekdir(dirp, off); - - struct dirent de; - struct stat st; - memset(&st, 0, sizeof(st)); - - while (1) { - int r = client->readdir_r(dirp, &de); - if (r < 0) break; - st.st_ino = de.d_ino; - st.st_mode = DT_TO_MODE(de.d_type); - - off_t off = client->telldir(dirp); - size_t entrysize = fuse_add_direntry(req, buf + pos, size - pos, - de.d_name, &st, off); - - /* - cout << "ceph_ll_readdir added " << de.d_name << " at " << pos << " len " << entrysize - << " (buffer size is " << size << ")" - << " .. off = " << off - << std::endl; - */ - - if (entrysize > size - pos) - break; // didn't fit, done for now. 
- pos += entrysize; - } - - fuse_reply_buf(req, buf, pos); - delete[] buf; -} - -static void ceph_ll_releasedir(fuse_req_t req, fuse_ino_t ino, - struct fuse_file_info *fi) -{ - DIR *dirp = (DIR*)fi->fh; - client->ll_releasedir(dirp); - fuse_reply_err(req, 0); -} - -static void ceph_ll_create(fuse_req_t req, fuse_ino_t parent, const char *name, - mode_t mode, struct fuse_file_info *fi) -{ - struct fuse_entry_param fe; - memset(&fe, 0, sizeof(fe)); - Fh *fh; - int r = client->ll_create(parent, name, mode, fi->flags, &fe.attr, &fh); - if (r == 0) { - fi->fh = (long)fh; - fe.ino = fe.attr.st_ino; - fuse_reply_create(req, &fe, fi); - } else { - fuse_reply_err(req, -r); - } -} - -static void ceph_ll_statfs(fuse_req_t req, fuse_ino_t ino) -{ - struct statvfs stbuf; - int r = client->ll_statfs(ino, &stbuf); - if (r == 0) - fuse_reply_statfs(req, &stbuf); - else - fuse_reply_err(req, -r); -} - -static struct fuse_lowlevel_ops ceph_ll_oper = { - init: 0, - destroy: 0, - lookup: ceph_ll_lookup, - forget: ceph_ll_forget, - getattr: ceph_ll_getattr, - setattr: ceph_ll_setattr, - readlink: ceph_ll_readlink, - mknod: ceph_ll_mknod, - mkdir: ceph_ll_mkdir, - unlink: ceph_ll_unlink, - rmdir: ceph_ll_rmdir, - symlink: ceph_ll_symlink, - rename: ceph_ll_rename, - link: ceph_ll_link, - open: ceph_ll_open, - read: ceph_ll_read, - write: ceph_ll_write, - flush: ceph_ll_flush, - release: ceph_ll_release, - fsync: ceph_ll_fsync, - opendir: ceph_ll_opendir, - readdir: ceph_ll_readdir, - releasedir: ceph_ll_releasedir, - fsyncdir: 0, - statfs: ceph_ll_statfs, - setxattr: 0, - getxattr: 0, - listxattr: 0, - removexattr: 0, - access: 0, - create: ceph_ll_create, - getlk: 0, - setlk: 0, - bmap: 0 -}; - -int ceph_fuse_ll_main(Client *c, int argc, char *argv[]) -{ - cout << "ceph_fuse_ll_main starting fuse on pid " << getpid() << std::endl; - - client = c; - - // set up fuse argc/argv - int newargc = 0; - char **newargv = (char **) malloc((argc + 10) * sizeof(char *)); - newargv[newargc++] = 
argv[0]; - newargv[newargc++] = "-f"; // stay in foreground - - newargv[newargc++] = "-o"; - newargv[newargc++] = "allow_other"; - - for (int argctr = 1; argctr < argc; argctr++) newargv[newargc++] = argv[argctr]; - - // go go gadget fuse - struct fuse_args args = FUSE_ARGS_INIT(newargc, newargv); - struct fuse_chan *ch; - char *mountpoint; - int err = -1; - - if (fuse_parse_cmdline(&args, &mountpoint, NULL, NULL) != -1 && - (ch = fuse_mount(mountpoint, &args)) != NULL) { - struct fuse_session *se; - - // init fuse - se = fuse_lowlevel_new(&args, &ceph_ll_oper, sizeof(ceph_ll_oper), - NULL); - if (se != NULL) { - if (fuse_set_signal_handlers(se) != -1) { - fuse_session_add_chan(se, ch); - err = fuse_session_loop(se); - fuse_remove_signal_handlers(se); - fuse_session_remove_chan(ch); - } - fuse_session_destroy(se); - } - fuse_unmount(mountpoint, ch); - } - fuse_opt_free_args(&args); - - cout << "ceph_fuse_ll_main done, err=" << err << std::endl; - return err ? 1 : 0; -} - diff --git a/trunk/ceph/client/fuse_ll.h b/trunk/ceph/client/fuse_ll.h deleted file mode 100644 index 068969c4f7487..0000000000000 --- a/trunk/ceph/client/fuse_ll.h +++ /dev/null @@ -1,15 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -int ceph_fuse_ll_main(Client *c, int argc, char *argv[]); diff --git a/trunk/ceph/client/hadoop/CephFSInterface.cc b/trunk/ceph/client/hadoop/CephFSInterface.cc deleted file mode 100644 index 7aa8c133d370b..0000000000000 --- a/trunk/ceph/client/hadoop/CephFSInterface.cc +++ /dev/null @@ -1,789 +0,0 @@ -#include "CephFSInterface.h" - -using namespace std; - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_initializeClient - * Signature: ()J - * Initializes a ceph client. - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1initializeClient - (JNIEnv *, jobject) -{ - - dout(3) << "CephFSInterface: Initializing Ceph client:" << endl; - - // parse args from CEPH_ARGS - vector args; - env_to_vec(args); - parse_config_options(args); - - if (g_conf.clock_tare) g_clock.tare(); - - // be safe - g_conf.use_abspaths = true; - - // load monmap - MonMap monmap; - // int r = monmap.read(".ceph_monmap"); - int r = monmap.read("/cse/grads/eestolan/ceph/trunk/ceph/.ceph_monmap"); - if (r < 0) { - dout(0) << "CephFSInterface: could not find .ceph_monmap" << endl; - assert(0 && "could not find .ceph_monmap"); - // return 0; - } - assert(r >= 0); - - // start up network - rank.start_rank(); - - // start client - Client *client; - client = new Client(rank.register_entity(MSG_ADDR_CLIENT_NEW), &monmap); - client->init(); - - // mount - client->mount(); - - jlong clientp = *(jlong*)&client; - return clientp; -} - - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_copyFromLocalFile - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1copyFromLocalFile -(JNIEnv * env, jobject obj, jlong clientp, jstring j_local_path, jstring j_ceph_path) { - - dout(10) << "CephFSInterface: In copyFromLocalFile" << endl; - Client* client; - //client = (Client*) clientp; - client = *(Client**)&clientp; - - const char* c_local_path = 
env->GetStringUTFChars(j_local_path, 0); - const char* c_ceph_path = env->GetStringUTFChars(j_ceph_path, 0); - - dout(10) << "CephFSInterface: Local source file is "<< c_local_path << " and Ceph destination file is " << c_ceph_path << endl; - struct stat st; - int r = ::stat(c_local_path, &st); - assert (r == 0); - - // open the files - int fh_local = ::open(c_local_path, O_RDONLY); - int fh_ceph = client->open(c_ceph_path, O_WRONLY|O_CREAT|O_TRUNC); - assert (fh_local > -1); - assert (fh_ceph > -1); - dout(10) << "CephFSInterface: local fd is " << fh_local << " and Ceph fd is " << fh_ceph << endl; - - // get the source file size - off_t remaining = st.st_size; - - // copy the file a MB at a time - const int chunk = 1048576; - bufferptr bp(chunk); - - while (remaining > 0) { - off_t got = ::read(fh_local, bp.c_str(), MIN(remaining,chunk)); - assert(got > 0); - remaining -= got; - off_t wrote = client->write(fh_ceph, bp.c_str(), got, -1); - assert (got == wrote); - } - client->close(fh_ceph); - ::close(fh_local); - - env->ReleaseStringUTFChars(j_local_path, c_local_path); - env->ReleaseStringUTFChars(j_ceph_path, c_ceph_path); - - return JNI_TRUE; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_copyToLocalFile - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1copyToLocalFile -(JNIEnv *env, jobject obj, jlong clientp, jstring j_ceph_path, jstring j_local_path) -{ - Client* client; - client = *(Client**)&clientp; - const char* c_ceph_path = env->GetStringUTFChars(j_ceph_path, 0); - const char* c_local_path = env->GetStringUTFChars(j_local_path, 0); - - dout(3) << "CephFSInterface: dout(3): In copyToLocalFile, copying from Ceph file " << c_ceph_path << - " to local file " << c_local_path << endl; - - cout << "CephFSInterface: cout: In copyToLocalFile, copying from Ceph file " << c_ceph_path << - " to local file " << c_local_path << endl; - - - // get 
source file size - struct stat st; - //dout(10) << "Attempting lstat with file " << c_ceph_path << ":" << endl; - int r = client->lstat(c_ceph_path, &st); - assert (r == 0); - - dout(10) << "CephFSInterface: Opening Ceph source file for read: " << endl; - int fh_ceph = client->open(c_ceph_path, O_RDONLY); - assert (fh_ceph > -1); - - dout(10) << "CephFSInterface: Opened Ceph file! Opening local destination file: " << endl; - int fh_local = ::open(c_local_path, O_WRONLY|O_CREAT|O_TRUNC, 0644); - assert (fh_local > -1); - - // copy the file a chunk at a time - const int chunk = 1048576; - bufferptr bp(chunk); - - off_t remaining = st.st_size; - while (remaining > 0) { - off_t got = client->read(fh_ceph, bp.c_str(), MIN(remaining,chunk), -1); - assert(got > 0); - remaining -= got; - off_t wrote = ::write(fh_local, bp.c_str(), got); - assert (got == wrote); - } - client->close(fh_ceph); - ::close(fh_local); - - env->ReleaseStringUTFChars(j_local_path, c_local_path); - env->ReleaseStringUTFChars(j_ceph_path, c_ceph_path); - - return JNI_TRUE; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getcwd - * Signature: (J)Ljava/lang/String; - * Returns the current working directory. - */ -JNIEXPORT jstring JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getcwd - (JNIEnv *env, jobject obj, jlong clientp) -{ - dout(10) << "CephFSInterface: In getcwd" << endl; - - Client* client; - client = *(Client**)&clientp; - - return (env->NewStringUTF(client->getcwd().c_str())); -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_setcwd - * Signature: (JLjava/lang/String;)Z - * - * Changes the working directory. 
- */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1setcwd -(JNIEnv *env, jobject obj, jlong clientp, jstring j_path) -{ - dout(10) << "CephFSInterface: In setcwd" << endl; - - Client* client; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - return (0 <= client->chdir(c_path)) ? JNI_TRUE : JNI_FALSE; - env->ReleaseStringUTFChars(j_path, c_path); -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_rmdir - * Signature: (JLjava/lang/String;)Z - * Removes an empty directory. - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1rmdir - (JNIEnv *env, jobject, jlong clientp, jstring j_path) -{ - dout(10) << "CephFSInterface: In rmdir" << endl; - - Client* client; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - return (0 == client->rmdir(c_path)) ? JNI_TRUE : JNI_FALSE; - env->ReleaseStringUTFChars(j_path, c_path); -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_mkdir - * Signature: (JLjava/lang/String;)Z - * Creates a directory with full permissions. - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1mkdir - (JNIEnv * env, jobject, jlong clientp, jstring j_path) -{ - dout(10) << "CephFSInterface: In mkdir" << endl; - - Client* client; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - return (0 == client->mkdir(c_path, 0xFF)) ? JNI_TRUE : JNI_FALSE; - env->ReleaseStringUTFChars(j_path, c_path); -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_unlink - * Signature: (JLjava/lang/String;)Z - * Unlinks a path. 
- */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1unlink - (JNIEnv * env, jobject, jlong clientp, jstring j_path) -{ - Client* client; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - dout(10) << "CephFSInterface: In unlink for path " << c_path << ":" << endl; - - // is it a file or a directory? - struct stat stbuf; - int stat_result = client->lstat(c_path, &stbuf); - if (stat_result < 0) {// then the path doesn't even exist - dout(0) << "ceph_unlink: path " << c_path << " does not exist" << endl; - return false; - } - int result; - if (0 != S_ISDIR(stbuf.st_mode)) { // it's a directory - dout(10) << "ceph_unlink: path " << c_path << " is a directory. Calling client->rmdir()" << endl; - result = client->rmdir(c_path); - } - else if (0 != S_ISREG(stbuf.st_mode)) { // it's a file - dout(10) << "ceph_unlink: path " << c_path << " is a file. Calling client->unlink()" << endl; - result = client->unlink(c_path); - } - else { - dout(0) << "ceph_unlink: path " << c_path << " is not a file or a directory. Failing:" << endl; - result = -1; - } - - dout(10) << "In ceph_unlink for path " << c_path << - ": got result " - << result << ". Returning..."<< endl; - - env->ReleaseStringUTFChars(j_path, c_path); - return (0 == result) ? JNI_TRUE : JNI_FALSE; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_rename - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - * Renames a file. - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1rename - (JNIEnv *env, jobject, jlong clientp, jstring j_from, jstring j_to) -{ - dout(10) << "CephFSInterface: In rename" << endl; - - Client* client; - client = *(Client**)&clientp; - - const char* c_from = env->GetStringUTFChars(j_from, 0); - const char* c_to = env->GetStringUTFChars(j_to, 0); - - return (0 <= client->rename(c_from, c_to)) ? 
JNI_TRUE : JNI_FALSE; - env->ReleaseStringUTFChars(j_from, c_from); - env->ReleaseStringUTFChars(j_to, c_to); -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_exists - * Signature: (JLjava/lang/String;)Z - * Returns true if the path exists. - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1exists -(JNIEnv *env, jobject, jlong clientp, jstring j_path) -{ - - dout(10) << "CephFSInterface: In exists" << endl; - - Client* client; - struct stat stbuf; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - dout(10) << "Attempting lstat with file " << c_path << ":" ; - int result = client->lstat(c_path, &stbuf); - dout(10) << "result is " << result << endl; - env->ReleaseStringUTFChars(j_path, c_path); - if (result < 0) { - dout(10) << "Returning false (file does not exist)" << endl; - return JNI_FALSE; - } - else { - dout(10) << "Returning true (file exists)" << endl; - return JNI_TRUE; - } -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getblocksize - * Signature: (JLjava/lang/String;)J - * Returns the block size. Size is -1 if the file - * does not exist. 
- * TODO: see if Hadoop wants something more like stripe size - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getblocksize - (JNIEnv *env, jobject obj, jlong clientp, jstring j_path) -{ - dout(10) << "In getblocksize" << endl; - - Client* client; - //struct stat stbuf; - client = *(Client**)&clientp; - - jint result; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - - /* - if (0 > client->lstat(c_path, &stbuf)) - result = -1; - else - result = stbuf.st_blksize; - */ - - // we need to open the file to retrieve the stripe size - dout(10) << "CephFSInterface: getblocksize: opening file" << endl; - int fh = client->open(c_path, O_RDONLY); - if (fh < 0) - return -1; - - result = client->get_stripe_unit(fh); - - int close_result = client->close(fh); - assert (close_result > -1); - - - env->ReleaseStringUTFChars(j_path, c_path); - return result; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getfilesize - * Signature: (JLjava/lang/String;)J - * Returns the file size, or -1 on failure. 
- */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getfilesize - (JNIEnv *env, jobject, jlong clientp, jstring j_path) -{ - dout(10) << "In getfilesize" << endl; - - Client* client; - struct stat stbuf; - client = *(Client**)&clientp; - - jlong result; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - if (0 > client->lstat(c_path, &stbuf)) result = -1; - else result = stbuf.st_size; - env->ReleaseStringUTFChars(j_path, c_path); - - return result; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_isfile - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1isfile - (JNIEnv *env, jobject obj, jlong clientp, jstring j_path) -{ - dout(10) << "In isfile" << endl; - - Client* client; - struct stat stbuf; - client = *(Client**)&clientp; - - - const char* c_path = env->GetStringUTFChars(j_path, 0); - int result = client->lstat(c_path, &stbuf); - - env->ReleaseStringUTFChars(j_path, c_path); - - // if the stat call failed, it's definitely not a file... - if (0 > result) return JNI_FALSE; - - // check the stat result - return (0 == S_ISREG(stbuf.st_mode)) ? JNI_FALSE : JNI_TRUE; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_isdirectory - * Signature: (JLjava/lang/String;)Z - * Returns true if the path is a directory. - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1isdirectory - (JNIEnv *env, jobject, jlong clientp, jstring j_path) -{ - dout(10) << "In isdirectory" << endl; - - Client* client; - struct stat stbuf; - client = *(Client**)&clientp; - - const char* c_path = env->GetStringUTFChars(j_path, 0); - int result = client->lstat(c_path, &stbuf); - env->ReleaseStringUTFChars(j_path, c_path); - - // if the stat call failed, it's definitely not a directory... - if (0 > result) return JNI_FALSE; - - // check the stat result - return (0 == S_ISDIR(stbuf.st_mode)) ? 
JNI_FALSE : JNI_TRUE; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getdir - * Signature: (JLjava/lang/String;)[Ljava/lang/String; - * Returns a Java array of Strings with the directory contents - */ -JNIEXPORT jobjectArray JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getdir -(JNIEnv *env, jobject obj, jlong clientp, jstring j_path) { - - dout(10) << "In getdir" << endl; - - Client* client; - client = *(Client**)&clientp; - - // get the directory listing - map contents; - const char* c_path = env->GetStringUTFChars(j_path, 0); - int result = client->getdir(c_path, contents); - env->ReleaseStringUTFChars(j_path, c_path); - - if (result < 0) return NULL; - - dout(10) << "checking for empty dir" << endl; - jint dir_size = contents.size(); - - // Hadoop freaks out if the listing contains "." or "..". Shrink - // the listing size by two, or by one if the directory is the root. - if(('/' == c_path[0]) && (0 == c_path[1])) - dir_size -= 1; - else - dir_size -= 2; - assert (dir_size >= 0); - - // Create a Java String array of the size of the directory listing - // jstring blankString = env->NewStringUTF(""); - jclass stringClass = env->FindClass("java/lang/String"); - if (NULL == stringClass) { - dout(0) << "ERROR: java String class not found; dying a horrible, painful death" << endl; - assert(0); - } - jobjectArray dirListingStringArray = (jobjectArray) env->NewObjectArray(dir_size, stringClass, NULL); - - // populate the array with the elements of the directory list, - // omitting . and .. - int i = 0; - string dot("."); - string dotdot (".."); - for (map::iterator it = contents.begin(); - it != contents.end(); - it++) { - // is it "."? - if (it->first == dot) continue; - if (it->first == dotdot) continue; - - if (0 == dir_size) - dout(0) << "CephFSInterface: WARNING: adding stuff to an empty array." 
<< endl; - assert (i < dir_size); - env->SetObjectArrayElement(dirListingStringArray, i, - env->NewStringUTF(it->first.c_str())); - ++i; - } - - return dirListingStringArray; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_open_for_read - * Signature: (JLjava/lang/String;)I - * Open a file for reading. - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1open_1for_1read - (JNIEnv *env, jobject obj, jlong clientp, jstring j_path) - -{ - dout(10) << "In open_for_read" << endl; - - Client* client; - client = *(Client**)&clientp; - - jint result; - - // open as read-only: flag = O_RDONLY - - const char* c_path = env->GetStringUTFChars(j_path, 0); - result = client->open(c_path, O_RDONLY); - env->ReleaseStringUTFChars(j_path, c_path); - - // returns file handle, or -1 on failure - return result; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_open_for_overwrite - * Signature: (JLjava/lang/String;)I - * Opens a file for overwriting; creates it if necessary. - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1open_1for_1overwrite - (JNIEnv *env, jobject obj, jlong clientp, jstring j_path) -{ - dout(10) << "In open_for_overwrite" << endl; - - Client* client; - client = *(Client**)&clientp; - - jint result; - - - const char* c_path = env->GetStringUTFChars(j_path, 0); - result = client->open(c_path, O_WRONLY|O_CREAT|O_TRUNC); - env->ReleaseStringUTFChars(j_path, c_path); - - // returns file handle, or -1 on failure - return result; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_kill_client - * Signature: (J)Z - * - * Closes the Ceph client. 
- */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1kill_1client - (JNIEnv *env, jobject obj, jlong clientp) -{ - Client* client; - client = *(Client**)&clientp; - - client->unmount(); - client->shutdown(); - delete client; - - // wait for messenger to finish - rank.wait(); - - return true; -} - - - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_read - * Signature: (JI[BII)I - * Reads into the given byte array from the current position. - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1read - (JNIEnv *env, jobject obj, jlong clientp, jint fh, jbyteArray j_buffer, jint buffer_offset, jint length) -{ - dout(10) << "In read" << endl; - - - // IMPORTANT NOTE: Hadoop read arguments are a bit different from POSIX so we - // have to convert. The read is *always* from the current position in the file, - // and buffer_offset is the location in the *buffer* where we start writing. - - - Client* client; - client = *(Client**)&clientp; - jint result; - - // Step 1: get a pointer to the buffer. - jbyte* j_buffer_ptr = env->GetByteArrayElements(j_buffer, NULL); - char* c_buffer = (char*) j_buffer_ptr; - - // Step 2: pointer arithmetic to start in the right buffer position - c_buffer += (int)buffer_offset; - - // Step 3: do the read - result = client->read((int)fh, c_buffer, length, -1); - - // Step 4: release the pointer to the buffer - env->ReleaseByteArrayElements(j_buffer, j_buffer_ptr, 0); - - return result; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_seek_from_start - * Signature: (JIJ)J - * Seeks to the given position. 
- */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1seek_1from_1start - (JNIEnv *env, jobject obj, jlong clientp, jint fh, jlong pos) -{ - dout(10) << "In CephInputStream::seek_from_start" << endl; - - Client* client; - client = *(Client**)&clientp; - jint result; - - result = client->lseek(fh, pos, SEEK_SET); - - return result; -} - -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1getpos - (JNIEnv *env, jobject obj, jlong clientp, jint fh) -{ - dout(10) << "In CephInputStream::ceph_getpos" << endl; - - Client* client; - client = *(Client**)&clientp; - jint result; - - // seek a distance of 0 to get current offset - result = client->lseek(fh, 0, SEEK_CUR); - - return result; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_close - * Signature: (JI)I - * Closes the file. - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1close - (JNIEnv *env, jobject obj, jlong clientp, jint fh) -{ - dout(10) << "In CephInputStream::ceph_close" << endl; - - Client* client; - client = *(Client**)&clientp; - jint result; - - result = client->close(fh); - - return result; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_seek_from_start - * Signature: (JIJ)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1seek_1from_1start - (JNIEnv *env, jobject obj, jlong clientp, jint fh, jlong pos) -{ - dout(10) << "In CephOutputStream::ceph_seek_from_start" << endl; - - Client* client; - client = *(Client**)&clientp; - jint result; - - result = client->lseek(fh, pos, SEEK_SET); - - return result; -} - - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_getpos - * Signature: (JI)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1getpos - (JNIEnv *env, jobject obj, jlong clientp, jint fh) -{ - dout(10) << "In CephOutputStream::ceph_getpos" << endl; - - 
Client* client; - client = *(Client**)&clientp; - jint result; - - // seek a distance of 0 to get current offset - result = client->lseek(fh, 0, SEEK_CUR); - - return result; -} - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_close - * Signature: (JI)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1close - (JNIEnv *env, jobject obj, jlong clientp, jint fh) -{ - dout(10) << "In CephOutputStream::ceph_close" << endl; - - Client* client; - client = *(Client**)&clientp; - jint result; - - result = client->close(fh); - - return result; -} - - - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_write - * Signature: (JI[BII)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1write - (JNIEnv *env, jobject obj, jlong clientp, jint fh, jbyteArray j_buffer, jint buffer_offset, jint length) -{ - dout(10) << "In write" << endl; - - // IMPORTANT NOTE: Hadoop write arguments are a bit different from POSIX so we - // have to convert. The write is *always* from the current position in the file, - // and buffer_offset is the location in the *buffer* where we start writing. - - Client* client; - client = *(Client**)&clientp; - jint result; - - // Step 1: get a pointer to the buffer. 
- jbyte* j_buffer_ptr = env->GetByteArrayElements(j_buffer, NULL); - char* c_buffer = (char*) j_buffer_ptr; - - // Step 2: pointer arithmetic to start in the right buffer position - c_buffer += (int)buffer_offset; - - // Step 3: do the write - result = client->write((int)fh, c_buffer, length, -1); - - // Step 4: release the pointer to the buffer - env->ReleaseByteArrayElements(j_buffer, j_buffer_ptr, 0); - - return result; -} - diff --git a/trunk/ceph/client/hadoop/CephFSInterface.h b/trunk/ceph/client/hadoop/CephFSInterface.h deleted file mode 100644 index 549925aba6e64..0000000000000 --- a/trunk/ceph/client/hadoop/CephFSInterface.h +++ /dev/null @@ -1,239 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* DO NOT EDIT THIS FILE - it is machine generated */ -#include -/* Header for class org_apache_hadoop_fs_ceph_CephFileSystem */ - -#include -#include "client/Client.h" -#include "config.h" -#include "client/fuse.h" -#include "msg/SimpleMessenger.h" -#include "common/Timer.h" - -#ifndef _Included_org_apache_hadoop_fs_ceph_CephFileSystem -#define _Included_org_apache_hadoop_fs_ceph_CephFileSystem -#ifdef __cplusplus -extern "C" { -#endif - -#undef org_apache_hadoop_fs_ceph_CephFileSystem_DEFAULT_BLOCK_SIZE -#define org_apache_hadoop_fs_ceph_CephFileSystem_DEFAULT_BLOCK_SIZE 1048576LL -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_initializeClient - * Signature: ()J - * Initializes a ceph client. 
- */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1initializeClient -(JNIEnv *, jobject); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_copyFromLocalFile - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1copyFromLocalFile - (JNIEnv *, jobject, jlong, jstring, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_copyToLocalFile - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1copyToLocalFile - (JNIEnv *, jobject, jlong, jstring, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getcwd - * Signature: (J)Ljava/lang/String; - */ -JNIEXPORT jstring JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getcwd - (JNIEnv *, jobject, jlong); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_setcwd - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1setcwd - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_rmdir - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1rmdir - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_mkdir - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1mkdir - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_unlink - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1unlink - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: 
org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_rename - * Signature: (JLjava/lang/String;Ljava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1rename - (JNIEnv *, jobject, jlong, jstring, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_exists - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1exists - (JNIEnv *, jobject, jlong, jstring); - - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getblocksize - * Signature: (JLjava/lang/String;)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getblocksize - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getfilesize - * Signature: (JLjava/lang/String;)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getfilesize - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_isdirectory - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1isdirectory - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_isfile - * Signature: (JLjava/lang/String;)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1isfile - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_getdir - * Signature: (JLjava/lang/String;)[Ljava/lang/String; - */ -JNIEXPORT jobjectArray JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1getdir - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_open_for_read - * Signature: (JLjava/lang/String;)I - */ -JNIEXPORT jint JNICALL 
Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1open_1for_1read - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_open_for_overwrite - * Signature: (JLjava/lang/String;)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1open_1for_1overwrite - (JNIEnv *, jobject, jlong, jstring); - -/* - * Class: org_apache_hadoop_fs_ceph_CephFileSystem - * Method: ceph_kill_client - * Signature: (J)Z - */ -JNIEXPORT jboolean JNICALL Java_org_apache_hadoop_fs_ceph_CephFileSystem_ceph_1kill_1client - (JNIEnv *, jobject, jlong); - -#undef org_apache_hadoop_fs_ceph_CephInputStream_SKIP_BUFFER_SIZE -#define org_apache_hadoop_fs_ceph_CephInputStream_SKIP_BUFFER_SIZE 2048L -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_read - * Signature: (JI[BII)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1read - (JNIEnv *, jobject, jlong, jint, jbyteArray, jint, jint); - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_seek_from_start - * Signature: (JIJ)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1seek_1from_1start - (JNIEnv *, jobject, jlong, jint, jlong); - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_getpos - * Signature: (JI)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1getpos - (JNIEnv *, jobject, jlong, jint); - -/* - * Class: org_apache_hadoop_fs_ceph_CephInputStream - * Method: ceph_close - * Signature: (JI)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephInputStream_ceph_1close - (JNIEnv *, jobject, jlong, jint); - -/* Header for class org_apache_hadoop_fs_ceph_CephOutputStream */ - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_seek_from_start - * Signature: (JIJ)J - */ -JNIEXPORT jlong JNICALL 
Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1seek_1from_1start - (JNIEnv *, jobject, jlong, jint, jlong); - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_getpos - * Signature: (JI)J - */ -JNIEXPORT jlong JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1getpos - (JNIEnv *, jobject, jlong, jint); - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_close - * Signature: (JI)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1close - (JNIEnv *, jobject, jlong, jint); - -/* - * Class: org_apache_hadoop_fs_ceph_CephOutputStream - * Method: ceph_write - * Signature: (JI[BII)I - */ -JNIEXPORT jint JNICALL Java_org_apache_hadoop_fs_ceph_CephOutputStream_ceph_1write - (JNIEnv *, jobject, jlong, jint, jbyteArray, jint, jint); - -#ifdef __cplusplus -} -#endif -#endif diff --git a/trunk/ceph/client/ldceph.cc b/trunk/ceph/client/ldceph.cc deleted file mode 100644 index b17133ee1e6f2..0000000000000 --- a/trunk/ceph/client/ldceph.cc +++ /dev/null @@ -1,298 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include -using namespace std; - -// ceph stuff -#include "config.h" -#include "client/Client.h" -#include "msg/SimpleMessenger.h" - -// syscall fun -#include -#include -#include -//#include - -#define _FCNTL_H -#include - -#define CEPH_FD_OFF 50000 - - -/****** startup etc *******/ - -class LdCeph { -public: - // globals - bool started; - char *mount_point; - char *mount_point_parent; - int mount_point_len; - - Client *client; - - filepath fp_mount_point; - filepath cwd; - bool cwd_above_mp, cwd_in_mp; - - const char *get_ceph_path(const char *orig, char *buf) { - if (!started) return 0; - - // relative path? BUG: this won't catch "blah/../../asdf" - if (orig[0] && - orig[0] != '/' && - !(orig[0] == '.' && orig[1] == '.')) { - - if (cwd_in_mp) return orig; // inside mount point, definitely ceph - if (!cwd_above_mp) return 0; // not above mount point, definitely not ceph - - // relative, above mp. - filepath o = orig; - filepath p = cwd; - for (unsigned b = 0; b < o.depth(); b++) { - if (o[b] == "..") - p.pop_dentry(); - else - p.add_dentry(o[b]); - } - - // FIXME rewrite - if (strncmp(p.c_str(), mount_point, mount_point_len) == 0) { - if (p.c_str()[mount_point_len] == 0) - return "/"; - if (p.c_str()[mount_point_len] == '/') { - strcpy(buf, p.c_str() + mount_point_len); - return buf; - } - } - return 0; - } else { - // absolute - if (strncmp(orig, mount_point, mount_point_len) == 0) { - if (orig[mount_point_len] == 0) - return "/"; - if (orig[mount_point_len] == '/') - return orig + mount_point_len; - } - return 0; - } - } - - void refresh_cwd() { - char buf[255]; - syscall(SYS_getcwd, buf, 255); - cwd = buf; - - if (strncmp(buf, mount_point, mount_point_len) == 0 && - (buf[mount_point_len] == 0 || - buf[mount_point_len] == '/')) - cwd_in_mp = true; - else { - if (cwd.depth() > fp_mount_point.depth()) - cwd_above_mp = false; - else { - cwd_above_mp = true; - for (unsigned i=0; iget_myaddr() << endl; - - refresh_cwd(); - } - } - ~LdCeph() { - cout 
<< "ldceph fini" << endl; - if (false && client) { - client->unmount(); - client->shutdown(); - delete client; - client = 0; - tcpmessenger_wait(); - tcpmessenger_shutdown(); - } - } - -} ldceph; - - - -/****** original functions ****/ - - - -/****** captured functions ****/ - - -#define MYFD(f) ((fd) > CEPH_FD_OFF && ldceph.started) -#define TO_FD(fd) (fd > 0 ? fd+CEPH_FD_OFF:fd) -#define FROM_FD(fd) (fd - CEPH_FD_OFF) - -extern "C" { - - // open/close - //int open(const char *pathname, int flags) { - int open(const char *pathname, int flags, mode_t mode) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return TO_FD(ldceph.client->open(c, flags)); - else - return syscall(SYS_open, pathname, flags, mode); - } - - int creat(const char *pathname, mode_t mode) { - return open(pathname, O_CREAT|O_WRONLY|O_TRUNC, mode); - } - int close(int fd) { - if (MYFD(fd)) - return ldceph.client->close(FROM_FD(fd)); - else - return syscall(SYS_close, fd); - } - - - // read/write - ssize_t write(int fd, const void *buf, size_t count) { - if (MYFD(fd)) - return ldceph.client->write(FROM_FD(fd), (char*)buf, count); - else - return syscall(SYS_write, fd, buf, count); - } - - ssize_t read(int fd, void *buf, size_t count) { - if (MYFD(fd)) - return ldceph.client->read(FROM_FD(fd), (char*)buf, count); - else - return syscall(SYS_read, fd, buf, count); - } - - //int fsync(int fd); - //int fdatasync(int fd); - - - // namespace - int rmdir(const char *pathname) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return ldceph.client->rmdir(c); - else - return syscall(SYS_rmdir, pathname); - } - int mkdir(const char *pathname, mode_t mode) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return ldceph.client->mkdir(c, mode); - else - return syscall(SYS_mkdir, pathname, mode); - } - int unlink(const char *pathname) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return 
ldceph.client->unlink(c); - else - return syscall(SYS_unlink, pathname); - } - - int stat(const char *pathname, struct stat *st) { - //int __xstat64(int __ver, const char *pathname, struct stat64 *st64) { // stoopid GLIBC - //struct stat *st = (struct stat*)st64; - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) - return ldceph.client->lstat(c, st); // FIXME - else - return syscall(SYS_stat, pathname, st); - } - //int fstat(int filedes, struct stat *buf); - //int lstat(const char *file_name, struct stat *buf); - - int chdir(const char *pathname) { - char buf[255]; - if (const char *c = ldceph.get_ceph_path(pathname, buf)) { - int r = ldceph.client->chdir(c); - if (r == 0) { - if (!ldceph.cwd_in_mp) - syscall(SYS_chdir, ldceph.mount_point_parent); - ldceph.cwd_in_mp = true; - ldceph.cwd_above_mp = false; - ldceph.cwd = ldceph.mount_point; - filepath fpc = c; - ldceph.cwd.append(fpc); - } - return r; - } else { - int r = syscall(SYS_chdir, pathname); - if (r) { - ldceph.refresh_cwd(); - } - return r; - } - } - char *getcwd(char *buf, size_t size) { - strncpy(buf, ldceph.cwd.c_str(), size); - return buf; - } - //int fchdir(int fd); - - - - -} diff --git a/trunk/ceph/cmds.cc b/trunk/ceph/cmds.cc deleted file mode 100644 index 6e475ad4b588d..0000000000000 --- a/trunk/ceph/cmds.cc +++ /dev/null @@ -1,108 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include -#include - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" -#include "mds/MDS.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << std::endl; - exit(1); - } -}; - -class C_Debug : public Context { - public: - void finish(int) { - int size = &g_conf.debug_after - &g_conf.debug; - memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size); - generic_dout(0) << "debug_after flipping debug settings" << dendl; - } -}; - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - parse_config_options(args); - - if (g_conf.kill_after) - g_timer.add_event_after(g_conf.kill_after, new C_Die); - if (g_conf.debug_after) - g_timer.add_event_after(g_conf.debug_after, new C_Debug); - - // mds specific args - int whoami = -1; - bool standby = false; // by default, i'll start active. - for (unsigned i=0; i= 0); - - // start up network - rank.start_rank(); - - // start mds - Messenger *m = rank.register_entity(entity_name_t::MDS(whoami)); - assert(m); - - MDS *mds = new MDS(whoami, m, &monmap); - mds->init(standby); - - // wait - rank.wait(); - - // yuck: grab the mds lock, so we can be sure that whoever in *mds - // called shutdown finishes what they were doing. 
- mds->mds_lock.Lock(); - mds->mds_lock.Unlock(); - - // done - //delete mds; - - return 0; -} - diff --git a/trunk/ceph/cmon.cc b/trunk/ceph/cmon.cc deleted file mode 100644 index 0adac2ab5cec5..0000000000000 --- a/trunk/ceph/cmon.cc +++ /dev/null @@ -1,129 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include -#include -#include - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" -#include "mon/Monitor.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << std::endl; - exit(1); - } -}; - -class C_Debug : public Context { - public: - void finish(int) { - int size = &g_conf.debug_after - &g_conf.debug; - memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size); - generic_dout(0) << "debug_after flipping debug settings" << dendl; - } -}; - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - parse_config_options(args); - - if (g_conf.kill_after) - g_timer.add_event_after(g_conf.kill_after, new C_Die); - if (g_conf.debug_after) - g_timer.add_event_after(g_conf.debug_after, new C_Debug); - - // args - int whoami = -1; - char *monmap_fn = ".ceph_monmap"; - for (unsigned i=0; i= 0); - } else { - // i am specific monitor. 
- - // read monmap - cout << "reading monmap from .ceph_monmap" << std::endl; - int r = monmap.read(monmap_fn); - assert(r >= 0); - - // bind to a specific port - cout << "starting mon" << whoami << " at " << monmap.get_inst(whoami) << std::endl; - g_my_addr = monmap.get_inst(whoami).addr; - rank.start_rank(); - } - - // start monitor - Messenger *m = rank.register_entity(entity_name_t::MON(whoami)); - Monitor *mon = new Monitor(whoami, m, &monmap); - mon->init(); - - // wait - cout << "waiting for shutdown ..." << std::endl; - rank.wait(); - - // done - delete mon; - - return 0; -} - diff --git a/trunk/ceph/cmonctl.cc b/trunk/ceph/cmonctl.cc deleted file mode 100644 index 85f4e1dc49392..0000000000000 --- a/trunk/ceph/cmonctl.cc +++ /dev/null @@ -1,92 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" -#include "msg/SimpleMessenger.h" -#include "messages/MMonCommand.h" -#include "messages/MMonCommandAck.h" - -#include "common/Timer.h" - -#ifndef DARWIN -#include -#endif // DARWIN - -#include -#include -#include - - -Messenger *messenger = 0; - -class Admin : public Dispatcher { - void dispatch(Message *m) { - switch (m->get_type()) { - case MSG_MON_COMMAND_ACK: - generic_dout(0) << m->get_source() << " -> '" - << ((MMonCommandAck*)m)->rs << "' (" << ((MMonCommandAck*)m)->r << ")" - << dendl; - messenger->shutdown(); - break; - } - } -} dispatcher; - -int main(int argc, char **argv, char *envp[]) { - - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - // args for fuse - vec_to_argv(args, argc, argv); - - // load monmap - MonMap monmap; - int r = monmap.read(".ceph_monmap"); - assert(r >= 0); - - // start up network - rank.start_rank(); - messenger = rank.register_entity(entity_name_t::ADMIN()); - messenger->set_dispatcher(&dispatcher); - - // build command - MMonCommand *m = new MMonCommand(messenger->get_myinst()); - string cmd; - for (unsigned i=0; icmd.push_back(string(args[i])); - } - int mon = monmap.pick_mon(); - - generic_dout(0) << "mon" << mon << " <- '" << cmd << "'" << dendl; - - // send it - messenger->send_message(m, monmap.get_inst(mon)); - - // wait for messenger to finish - rank.wait(); - - return 0; -} - diff --git a/trunk/ceph/common/Clock.cc b/trunk/ceph/common/Clock.cc deleted file mode 100644 index 8b07f6d9eb15f..0000000000000 --- a/trunk/ceph/common/Clock.cc +++ /dev/null @@ -1,20 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * 
License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#include "Clock.h" - -// public -Clock g_clock; - diff --git a/trunk/ceph/common/Cond.h b/trunk/ceph/common/Cond.h deleted file mode 100644 index 4cb3d721b423f..0000000000000 --- a/trunk/ceph/common/Cond.h +++ /dev/null @@ -1,119 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __COND_H -#define __COND_H - -#include - -#include "Mutex.h" -#include "Clock.h" - -#include "include/Context.h" - -#include -#include - -class Cond { - // my bits - pthread_cond_t _c; - - // don't allow copying. - void operator=(Cond &C) {} - Cond( const Cond &C ) {} - - public: - Cond() { - int r = pthread_cond_init(&_c,NULL); - assert(r == 0); - } - virtual ~Cond() { - pthread_cond_destroy(&_c); - } - - int Wait(Mutex &mutex) { - int r = pthread_cond_wait(&_c, &mutex._m); - return r; - } - - int Wait(Mutex &mutex, char* s) { - //cout << "Wait: " << s << endl; - int r = pthread_cond_wait(&_c, &mutex._m); - return r; - } - - int WaitUntil(Mutex &mutex, utime_t when) { - struct timespec ts; - g_clock.make_timespec(when, &ts); - //cout << "timedwait for " << ts.tv_sec << " sec " << ts.tv_nsec << " nsec" << endl; - int r = pthread_cond_timedwait(&_c, &mutex._m, &ts); - return r; - } - int WaitInterval(Mutex &mutex, utime_t interval) { - utime_t when = g_clock.now(); - when += interval; - return WaitUntil(mutex, when); - } - - int Signal() { - //int r = pthread_cond_signal(&_c); - int r = pthread_cond_broadcast(&_c); - return r; - } - int SignalOne() { - int r = pthread_cond_signal(&_c); - return r; - } - int 
SignalAll() { - //int r = pthread_cond_signal(&_c); - int r = pthread_cond_broadcast(&_c); - return r; - } -}; - -class C_Cond : public Context { - Cond *cond; - bool *done; - int *rval; -public: - C_Cond(Cond *c, bool *d, int *r=0) : cond(c), done(d), rval(r) { - *done = false; - } - void finish(int r) { - if (rval) *rval = r; - *done = true; - cond->Signal(); - } -}; - -class C_SafeCond : public Context { - Mutex *lock; - Cond *cond; - bool *done; - int *rval; -public: - C_SafeCond(Mutex *l, Cond *c, bool *d, int *r=0) : lock(l), cond(c), done(d), rval(r) { - *done = false; - } - void finish(int r) { - lock->Lock(); - if (rval) *rval = r; - *done = true; - cond->Signal(); - lock->Unlock(); - } -}; - -#endif diff --git a/trunk/ceph/common/DecayCounter.h b/trunk/ceph/common/DecayCounter.h deleted file mode 100644 index f431fb2073cd7..0000000000000 --- a/trunk/ceph/common/DecayCounter.h +++ /dev/null @@ -1,138 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __DECAYCOUNTER_H -#define __DECAYCOUNTER_H - -#include -#include "Clock.h" - -#include "config.h" - -/** - * - * TODO: normalize value based on some fucntion of half_life, - * so that it can be interpreted as an approximation of a - * moving average of N seconds. currently, changing half-life - * skews the scale of the value, even at steady state. 
- * - */ - -class DecayCounter { - protected: -public: - double half_life; - double k; // k = ln(.5)/half_life - double val; // value - double delta; // delta since last decay - double vel; // recent velocity - utime_t last_decay; // time of last decay - - public: - DecayCounter() : val(0), delta(0), vel(0) { - set_halflife( g_conf.mds_decay_halflife ); - reset(); - } - DecayCounter(double hl) : val(0), delta(0), vel(0) { - set_halflife( hl ); - reset(); - } - - /** - * reading - */ - - double get() { - return get(g_clock.now()); - } - - double get(utime_t now) { - decay(now); - return val; - } - - double get_last() { - return val; - } - - double get_last_vel() { - return vel; - } - - utime_t get_last_decay() { - return last_decay; - } - - /** - * adjusting - */ - - double hit(utime_t now, double v = 1.0) { - decay(now); - delta += v; - return val+delta; - } - - void adjust(double a) { - val += a; - } - void adjust(utime_t now, double a) { - decay(now); - val += a; - } - void scale(double f) { - val *= f; - delta *= f; - vel *= f; - } - - /** - * decay etc. 
- */ - - void set_halflife(double hl) { - half_life = hl; - k = log(.5) / hl; - } - - void reset() { - reset(g_clock.now()); - } - void reset(utime_t now) { - last_decay = g_clock.now(); - val = delta = 0; - } - - void decay(utime_t now) { - utime_t el = now; - el -= last_decay; - - if (el.sec() >= 1) { - // calculate new value - double newval = (val+delta) * exp((double)el * k); - if (newval < .01) newval = 0.0; - - // calculate velocity approx - vel += (newval - val) * (double)el; - vel *= exp((double)el * k); - - val = newval; - delta = 0; - last_decay = now; - } - } -}; - - -#endif diff --git a/trunk/ceph/common/LogType.h b/trunk/ceph/common/LogType.h deleted file mode 100644 index a0889545acb6a..0000000000000 --- a/trunk/ceph/common/LogType.h +++ /dev/null @@ -1,122 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __LOGTYPE_H -#define __LOGTYPE_H - -#include "include/types.h" - -#include -#include -using std::string; -using std::ofstream; - -#include -#include -using __gnu_cxx::hash_map; -using __gnu_cxx::hash_set; - -#include "Mutex.h" - - -class LogType { - protected: - hash_map keymap; - vector keys; - set inc_keys; - vector avg; - - int version; - - // HACK to avoid the hash table as often as possible... 
- // cache recent key name lookups in a small ring buffer - const static int cache_keys = 10; - intptr_t kc_ptr[cache_keys]; - int kc_val[cache_keys]; - int kc_pos; - - friend class Logger; - - public: - LogType() { - version = 1; - - for (int i=0;i= 0) return i; - - i = keys.size(); - keys.push_back(key); - avg.push_back(false); - - intptr_t p = (intptr_t)key; - keymap[p] = i; - if (is_inc) inc_keys.insert(i); - - version++; - return i; - } - int add_inc(const char* key) { - return add_key(key, true); - } - int add_set(const char *key) { - return add_key(key, false); - } - int add_avg(const char *key) { - int i = add_key(key, true); - avg[i] = true; - return i; - } - - bool have_key(const char* key) { - return lookup_key(key) < 0; - } - - int lookup_key(const char* key) { - intptr_t p = (intptr_t)key; - - if (keymap.count(p)) - return keymap[p]; - - // try kc ringbuffer - int pos = kc_pos-1; - for (int j=0; j - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __LOGGER_H -#define __LOGGER_H - -#include "include/types.h" -#include "Clock.h" - -#include -#include -#include -using std::vector; -using std::string; -using std::ofstream; - -#include "LogType.h" - - -class Logger { - protected: - // values for this instance - vector vals; - vector fvals; - vector< vector > vals_to_avg; - - void maybe_resize(unsigned s) { - while (s >= vals.size()) { - vals.push_back(0); - fvals.push_back(0.0); - vals_to_avg.push_back(vector()); - } - } - - // my type - LogType *type; - int version; - - string filename; - ofstream out; - - // what i've written - //int last_logged; - int wrote_header; - int wrote_header_last; - - public: - Logger(string fn, LogType *type, bool append=false); - ~Logger(); - - long inc(const char *s, long v = 1); - long set(const char *s, long v); - long get(const char *s); - - double fset(const char *s, double v); - double finc(const char *s, double v); - double favg(const char *s, double v); - - //void flush(); - void _flush(); - - void set_start(utime_t s); -}; - -#endif diff --git a/trunk/ceph/common/Mutex.h b/trunk/ceph/common/Mutex.h deleted file mode 100755 index 724c4dbed2a76..0000000000000 --- a/trunk/ceph/common/Mutex.h +++ /dev/null @@ -1,83 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MUTEX_H -#define __MUTEX_H - -#include -#include - -class Mutex { -private: - pthread_mutex_t _m; - int nlock; - bool recursive; - - // don't allow copying. 
- void operator=(Mutex &M) {} - Mutex( const Mutex &M ) {} - -public: - Mutex(bool r = true) : nlock(0), recursive(r) { - if (recursive) { - pthread_mutexattr_t attr; - pthread_mutexattr_init(&attr); - pthread_mutexattr_settype(&attr,PTHREAD_MUTEX_RECURSIVE); - pthread_mutex_init(&_m,&attr); - pthread_mutexattr_destroy(&attr); - } else { - pthread_mutex_init(&_m,NULL); - } - } - virtual ~Mutex() { - assert(nlock == 0); - pthread_mutex_destroy(&_m); - } - - bool is_locked() { - return (nlock > 0); - } - - void Lock() { - int r = pthread_mutex_lock(&_m); - assert(r == 0); - nlock++; - assert(nlock == 1 || recursive); - } - - void Unlock() { - assert(nlock > 0); - --nlock; - int r = pthread_mutex_unlock(&_m); - assert(r == 0); - } - - friend class Cond; - - -public: - class Locker { - Mutex &mutex; - - public: - Locker(Mutex& m) : mutex(m) { - mutex.Lock(); - } - ~Locker() { - mutex.Unlock(); - } - }; -}; - -#endif diff --git a/trunk/ceph/common/RWLock.h b/trunk/ceph/common/RWLock.h deleted file mode 100644 index 14e158a64ab97..0000000000000 --- a/trunk/ceph/common/RWLock.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef _RWLock_Posix_ -#define _RWLock_Posix_ - -#include - -class RWLock -{ - mutable pthread_rwlock_t L; - - public: - - RWLock() { - pthread_rwlock_init(&L, NULL); - } - - virtual ~RWLock() { - pthread_rwlock_unlock(&L); - pthread_rwlock_destroy(&L); - } - - void unlock() { - pthread_rwlock_unlock(&L); - } - void get_read() { - pthread_rwlock_rdlock(&L); - } - void put_read() { unlock(); } - void get_write() { - pthread_rwlock_wrlock(&L); - } - void put_write() { unlock(); } -}; - -#endif // !_Mutex_Posix_ diff --git a/trunk/ceph/common/Semaphore.h b/trunk/ceph/common/Semaphore.h deleted file mode 100644 index bc0a9e60d7ffa..0000000000000 --- a/trunk/ceph/common/Semaphore.h +++ /dev/null @@ -1,53 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef _Sem_Posix_ -#define _Sem_Posix_ - -#include - -class Semaphore -{ - Mutex m; - Cond c; - int count; - - public: - - Semaphore() - { - count = 0; - } - - void Put() - { - m.Lock(); - count++; - c.Signal(); - m.Unlock(); - } - - void Get() - { - m.Lock(); - while(count <= 0) { - c.Wait(m); - } - count--; - m.Unlock(); - } -}; - -#endif // !_Mutex_Posix_ diff --git a/trunk/ceph/common/Thread.h b/trunk/ceph/common/Thread.h deleted file mode 100644 index 06e20047da57f..0000000000000 --- a/trunk/ceph/common/Thread.h +++ /dev/null @@ -1,81 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __THREAD_H -#define __THREAD_H - -#include -#include -#include - -class Thread { - private: - pthread_t thread_id; - - public: - Thread() : thread_id(0) {} - virtual ~Thread() {} - - protected: - virtual void *entry() = 0; - - private: - static void *_entry_func(void *arg) { - return ((Thread*)arg)->entry(); - } - - public: - pthread_t &get_thread_id() { return thread_id; } - bool is_started() { return thread_id != 0; } - bool am_self() { return (pthread_self() == thread_id); } - - int kill(int signal) { - return pthread_kill(thread_id, signal); - } - int create() { - return pthread_create( &thread_id, NULL, _entry_func, (void*)this ); - } - int join(void **prval = 0) { - if (thread_id == 0) { - generic_derr(0) << "WARNING: join on thread that was never started" << dendl; - //assert(0); - return -EINVAL; // never started. 
- } - - int status = pthread_join(thread_id, prval); - if (status != 0) { - switch (status) { - case -EINVAL: - generic_derr(0) << "thread " << thread_id << " join status = EINVAL" << dendl; - break; - case -ESRCH: - generic_derr(0) << "thread " << thread_id << " join status = ESRCH" << dendl; - assert(0); - break; - case -EDEADLK: - generic_derr(0) << "thread " << thread_id << " join status = EDEADLK" << dendl; - break; - default: - generic_derr(0) << "thread " << thread_id << " join status = " << status << dendl; - } - assert(0); // none of these should happen. - } - thread_id = 0; - return status; - } - -}; - -#endif diff --git a/trunk/ceph/common/ThreadPool.h b/trunk/ceph/common/ThreadPool.h deleted file mode 100644 index 62855a240cd0c..0000000000000 --- a/trunk/ceph/common/ThreadPool.h +++ /dev/null @@ -1,139 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef THREADPOOL -#define THREADPOOL - -#include -using std::list; - - -#include -#include -#include -#include - - -// debug output -#include "config.h" -#define tpdout(x) if (x <= g_conf.debug) *_dout << myname -#define DBLVL 15 - - -using namespace std; - -#define MAX_THREADS 1000 - -template -class ThreadPool { - - private: - list q; - Mutex q_lock; - Semaphore q_sem; - - int num_ops; - int num_threads; - vector thread; - - U u; - void (*func)(U,T); - void (*prefunc)(U,T); - string myname; - - static void *foo(void *arg) - { - ThreadPool *t = (ThreadPool *)arg; - t->do_ops(arg); - return 0; - } - - void *do_ops(void *nothing) - { - tpdout(DBLVL) << ".do_ops thread " << pthread_self() << " starting" << std::endl; - while (1) { - q_sem.Get(); - if (q.empty()) break; - - T op = get_op(); - tpdout(DBLVL) << ".func thread "<< pthread_self() << " on " << op << std::endl; - func(u, op); - } - tpdout(DBLVL) << ".do_ops thread " << pthread_self() << " exiting" << std::endl; - return 0; - } - - - T get_op() - { - T op; - q_lock.Lock(); - { - op = q.front(); - q.pop_front(); - num_ops--; - - if (prefunc && op) { - tpdout(DBLVL) << ".prefunc thread "<< pthread_self() << " on " << op << std::endl; - prefunc(u, op); - } - } - q_lock.Unlock(); - - return op; - } - - public: - - ThreadPool(char *myname, int howmany, void (*f)(U,T), U obj, void (*pf)(U,T) = 0) : - num_ops(0), num_threads(howmany), - thread(num_threads), - u(obj), - func(f), prefunc(pf), - myname(myname) { - tpdout(DBLVL) << ".cons num_threads " << num_threads << std::endl; - - // start threads - int status; - for(int i = 0; i < howmany; i++) { - status = pthread_create(&thread[i], NULL, (void*(*)(void *))&ThreadPool::foo, this); - assert(status == 0); - } - } - - ~ThreadPool() { - // bump sem to make threads exit cleanly - for(int i = 0; i < num_threads; i++) - q_sem.Put(); - - // wait for them to die - for(int i = 0; i < num_threads; i++) { - tpdout(DBLVL) << ".des joining thread " << thread[i] 
<< std::endl; - void *rval = 0; // we don't actually care - pthread_join(thread[i], &rval); - } - } - - void put_op(T op) { - tpdout(DBLVL) << ".put_op " << op << std::endl; - q_lock.Lock(); - q.push_back(op); - num_ops++; - q_sem.Put(); - q_lock.Unlock(); - } - -}; -#endif diff --git a/trunk/ceph/common/Timer.cc b/trunk/ceph/common/Timer.cc deleted file mode 100644 index 1705bc759ac9f..0000000000000 --- a/trunk/ceph/common/Timer.cc +++ /dev/null @@ -1,335 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - - -#include "Timer.h" -#include "Cond.h" - -#include "config.h" -#include "include/Context.h" - -#define dout(x) if (x <= g_conf.debug_timer) *_dout << dbeginl << g_clock.now() << " TIMER " -#define derr(x) if (x <= g_conf.debug_timer) *_derr << dbeginl << g_clock.now() << " TIMER " - -#define DBL 10 - -#include -#include -#include - -// single global instance -Timer g_timer; - - - -/**** thread solution *****/ - -bool Timer::get_next_due(utime_t& when) -{ - if (scheduled.empty()) { - dout(10) << "get_next_due - nothing scheduled" << dendl; - return false; - } else { - map< utime_t, set >::iterator it = scheduled.begin(); - when = it->first; - dout(10) << "get_next_due - " << when << dendl; - return true; - } -} - - -void Timer::timer_entry() -{ - lock.Lock(); - - while (!thread_stop) { - - // now - utime_t now = g_clock.now(); - - // any events due? 
- utime_t next; - bool next_due = get_next_due(next); - - if (next_due && now >= next) { - // move to pending list - list pending; - - map< utime_t, set >::iterator it = scheduled.begin(); - while (it != scheduled.end()) { - if (it->first > now) break; - - utime_t t = it->first; - dout(DBL) << "queueing event(s) scheduled at " << t << dendl; - - for (set::iterator cit = it->second.begin(); - cit != it->second.end(); - cit++) { - pending.push_back(*cit); - event_times.erase(*cit); - num_event--; - } - - map< utime_t, set >::iterator previt = it; - it++; - scheduled.erase(previt); - } - - if (!pending.empty()) { - sleeping = false; - lock.Unlock(); - { - // make sure we're not holding any locks while we do callbacks - // make the callbacks myself. - for (list::iterator cit = pending.begin(); - cit != pending.end(); - cit++) { - dout(DBL) << "start callback " << *cit << dendl; - (*cit)->finish(0); - dout(DBL) << "finish callback " << *cit << dendl; - delete *cit; - } - pending.clear(); - assert(pending.empty()); - } - lock.Lock(); - } - - } - else { - // sleep - if (next_due) { - dout(DBL) << "sleeping until " << next << dendl; - timed_sleep = true; - sleeping = true; - timeout_cond.WaitUntil(lock, next); // wait for waker or time - utime_t now = g_clock.now(); - dout(DBL) << "kicked or timed out at " << now << dendl; - } else { - dout(DBL) << "sleeping" << dendl; - timed_sleep = false; - sleeping = true; - sleep_cond.Wait(lock); // wait for waker - utime_t now = g_clock.now(); - dout(DBL) << "kicked at " << now << dendl; - } - } - } - - lock.Unlock(); -} - - - -/** - * Timer bits - */ - -void Timer::register_timer() -{ - if (timer_thread.is_started()) { - if (sleeping) { - dout(DBL) << "register_timer kicking thread" << dendl; - if (timed_sleep) - timeout_cond.SignalAll(); - else - sleep_cond.SignalAll(); - } else { - dout(DBL) << "register_timer doing nothing; thread is awake" << dendl; - // it's probably doing callbacks. 
- } - } else { - dout(DBL) << "register_timer starting thread" << dendl; - timer_thread.create(); - } -} - -void Timer::cancel_timer() -{ - // clear my callback pointers - if (timer_thread.is_started()) { - dout(10) << "setting thread_stop flag" << dendl; - lock.Lock(); - thread_stop = true; - if (timed_sleep) - timeout_cond.SignalAll(); - else - sleep_cond.SignalAll(); - lock.Unlock(); - - dout(10) << "waiting for thread to finish" << dendl; - void *ptr; - timer_thread.join(&ptr); - - dout(10) << "thread finished, exit code " << ptr << dendl; - } -} - - -/* - * schedule - */ - - -void Timer::add_event_after(double seconds, - Context *callback) -{ - utime_t when = g_clock.now(); - when += seconds; - add_event_at(when, callback); -} - -void Timer::add_event_at(utime_t when, - Context *callback) -{ - lock.Lock(); - - dout(DBL) << "add_event " << callback << " at " << when << dendl; - - // insert - scheduled[when].insert(callback); - assert(event_times.count(callback) == 0); - event_times[callback] = when; - - num_event++; - - // make sure i wake up on time - register_timer(); - - lock.Unlock(); -} - -bool Timer::cancel_event(Context *callback) -{ - lock.Lock(); - - dout(DBL) << "cancel_event " << callback << dendl; - - if (!event_times.count(callback)) { - dout(DBL) << "cancel_event " << callback << " isn't scheduled (probably executing)" << dendl; - lock.Unlock(); - return false; // wasn't scheduled. - } - - utime_t tp = event_times[callback]; - event_times.erase(callback); - - assert(scheduled.count(tp)); - assert(scheduled[tp].count(callback)); - scheduled[tp].erase(callback); - if (scheduled[tp].empty()) - scheduled.erase(tp); - - lock.Unlock(); - - // delete the canceled event. 
- delete callback; - - return true; -} - - -// ------------------------------- - -void SafeTimer::add_event_after(double seconds, Context *c) -{ - assert(lock.is_locked()); - Context *w = new EventWrapper(this, c); - dout(DBL) << "SafeTimer.add_event_after wrapping " << c << " with " << w << dendl; - scheduled[c] = w; - g_timer.add_event_after(seconds, w); -} - -void SafeTimer::add_event_at(utime_t when, Context *c) -{ - assert(lock.is_locked()); - Context *w = new EventWrapper(this, c); - dout(DBL) << "SafeTimer.add_event_at wrapping " << c << " with " << w << dendl; - scheduled[c] = w; - g_timer.add_event_at(when, w); -} - -void SafeTimer::EventWrapper::finish(int r) -{ - timer->lock.Lock(); - if (timer->scheduled.count(actual)) { - // still scheduled. execute. - actual->finish(r); - timer->scheduled.erase(actual); - } else { - // i was canceled. - assert(timer->canceled.count(actual)); - } - - // did i get canceled? - // (this can happen even if i just executed above. e.g., i may have canceled myself.) - if (timer->canceled.count(actual)) { - timer->canceled.erase(actual); - timer->cond.Signal(); - } - - // delete the original event - delete actual; - - timer->lock.Unlock(); -} - -void SafeTimer::cancel_event(Context *c) -{ - assert(lock.is_locked()); - assert(scheduled.count(c)); - - if (g_timer.cancel_event(scheduled[c])) { - // hosed wrapper. hose original event too. - delete c; - } else { - // clean up later. 
- canceled[c] = scheduled[c]; - } - scheduled.erase(c); -} - -void SafeTimer::cancel_all() -{ - assert(lock.is_locked()); - - while (!scheduled.empty()) - cancel_event(scheduled.begin()->first); -} - -void SafeTimer::join() -{ - assert(lock.is_locked()); - assert(scheduled.empty()); - - if (!canceled.empty()) { - while (!canceled.empty()) { - // wait - dout(2) << "SafeTimer.join waiting for " << canceled.size() << " to join: " << canceled << dendl; - cond.Wait(lock); - } - dout(2) << "SafeTimer.join done" << dendl; - } -} - -SafeTimer::~SafeTimer() -{ - if (!scheduled.empty() && !canceled.empty()) { - derr(0) << "SafeTimer.~SafeTimer " << scheduled.size() << " events scheduled, " - << canceled.size() << " canceled but unflushed" - << dendl; - } -} diff --git a/trunk/ceph/common/Timer.h b/trunk/ceph/common/Timer.h deleted file mode 100644 index 3574833c342c3..0000000000000 --- a/trunk/ceph/common/Timer.h +++ /dev/null @@ -1,175 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __TIMER_H -#define __TIMER_H - -#include "include/types.h" -#include "include/Context.h" -#include "Clock.h" - -#include "Mutex.h" -#include "Cond.h" -#include "Thread.h" - -#include -#include -using std::map; -using std::set; - -#include -using namespace __gnu_cxx; - - -/*** Timer - * schedule callbacks - */ - -//class Messenger; - - -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const Context *p) const { - static hash H; - return H((unsigned long)p); - } - }; -} - - -class Timer { - private: - map< utime_t, set > scheduled; // time -> (context ...) 
- hash_map< Context*, utime_t > event_times; // event -> time - - bool get_next_due(utime_t &when); - - void register_timer(); // make sure i get a callback - void cancel_timer(); // make sure i get a callback - - bool thread_stop; - Mutex lock; - bool timed_sleep; - bool sleeping; - Cond sleep_cond; - Cond timeout_cond; - - public: - void timer_entry(); // waiter thread (that wakes us up) - - class TimerThread : public Thread { - Timer *t; - public: - void *entry() { - t->timer_entry(); - return 0; - } - TimerThread(Timer *_t) : t(_t) {} - } timer_thread; - - - int num_event; - - - public: - Timer() : - thread_stop(false), - timed_sleep(false), - sleeping(false), - timer_thread(this), - num_event(0) - { - } - ~Timer() { - // stop. - cancel_timer(); - - // scheduled - for (map< utime_t, set >::iterator it = scheduled.begin(); - it != scheduled.end(); - it++) { - for (set::iterator sit = it->second.begin(); - sit != it->second.end(); - sit++) - delete *sit; - } - scheduled.clear(); - } - - void init() { - register_timer(); - } - void shutdown() { - cancel_timer(); - } - - // schedule events - void add_event_after(double seconds, - Context *callback); - void add_event_at(utime_t when, - Context *callback); - bool cancel_event(Context *callback); - - // execute pending events - void execute_pending(); - -}; - - -/* - * SafeTimer is a wrapper around the raw Timer (or rather, g_timer, it's global - * instantiation) that protects event execution with an existing mutex. It - * provides for, among other things, reliable event cancellation on class - * destruction. The caller just needs to cancel each event (or cancel_all()), - * and then call join() to ensure any concurrently exectuting events (in other - * threads) get flushed. 
- */ -class SafeTimer { - Mutex& lock; - Cond cond; - map scheduled; // actual -> wrapper - map canceled; - - class EventWrapper : public Context { - SafeTimer *timer; - Context *actual; - public: - EventWrapper(SafeTimer *st, Context *c) : timer(st), - actual(c) {} - void finish(int r); - }; - -public: - SafeTimer(Mutex& l) : lock(l) { } - ~SafeTimer(); - - void add_event_after(double seconds, Context *c); - void add_event_at(utime_t when, Context *c); - void cancel_event(Context *c); - void cancel_all(); - void join(); - - int get_num_scheduled() { return scheduled.size(); } - int get_num_canceled() { return canceled.size(); } -}; - - -// single global instance -extern Timer g_timer; - - - -#endif diff --git a/trunk/ceph/cosd.cc b/trunk/ceph/cosd.cc deleted file mode 100644 index e575c72836e69..0000000000000 --- a/trunk/ceph/cosd.cc +++ /dev/null @@ -1,135 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include -#include -#include - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" - -#include "osd/OSD.h" -#include "ebofs/Ebofs.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - - -class C_Die : public Context { -public: - void finish(int) { - cerr << "die" << std::endl; - exit(1); - } -}; - -class C_Debug : public Context { - public: - void finish(int) { - int size = &g_conf.debug_after - &g_conf.debug; - memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size); - cout << "debug_after flipping debug settings" << std::endl; - } -}; - - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - parse_config_options(args); - - if (g_conf.kill_after) - g_timer.add_event_after(g_conf.kill_after, new C_Die); - if (g_conf.debug_after) - g_timer.add_event_after(g_conf.debug_after, new C_Debug); - - if (g_conf.clock_tare) g_clock.tare(); - - // osd specific args - char *dev = 0; - char dev_default[20]; - int whoami = -1; - for (unsigned i=0; imount(); - int r = store->read(object_t(0,0), 0, sizeof(sb), bl); - if (r < 0) { - cerr << "couldn't read superblock object on " << dev << std::endl; - exit(0); - } - bl.copy(0, sizeof(sb), (char*)&sb); - store->umount(); - delete store; - whoami = sb.whoami; - - cout << "osd fs says i am osd" << whoami << std::endl; - } else { - cout << "command line arg says i am osd" << whoami << std::endl; - } - - // load monmap - MonMap monmap; - int r = monmap.read(".ceph_monmap"); - assert(r >= 0); - - // start up network - rank.start_rank(); - - // start osd - Messenger *m = rank.register_entity(entity_name_t::OSD(whoami)); - assert(m); - OSD *osd = new OSD(whoami, m, &monmap, dev); - osd->init(); - - // wait - rank.wait(); - - // done - delete osd; - - return 0; -} - diff --git a/trunk/ceph/crush.old/BinaryTree.h b/trunk/ceph/crush.old/BinaryTree.h deleted file mode 100644 index 7573fc02ed6dc..0000000000000 --- 
a/trunk/ceph/crush.old/BinaryTree.h +++ /dev/null @@ -1,285 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __crush_BINARYTREE_H -#define __crush_BINARYTREE_H - -#include -#include -#include -#include -using std::map; -using std::vector; - -#include "include/buffer.h" - -namespace crush { - - class BinaryTree { - private: - // tree def - int root_node; // 0 for empty tree. - int alloc; - vector node_nested; // all existing nodes in this map - vector node_weight; // and this one - vector node_complete; // only nodes with all possible children - - public: - BinaryTree() : root_node(0), alloc(0) {} - - void _encode(bufferlist& bl) { - bl.append((char*)&root_node, sizeof(root_node)); - bl.append((char*)&alloc, sizeof(alloc)); - ::_encode(node_nested, bl); - ::_encode(node_weight, bl); - ::_encode(node_complete, bl); - } - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(root_node), (char*)&root_node); - off += sizeof(root_node); - bl.copy(off, sizeof(alloc), (char*)&alloc); - off += sizeof(alloc); - ::_decode(node_nested, bl, off); - ::_decode(node_weight, bl, off); - ::_decode(node_complete, bl, off); - } - - // accessors - bool empty() const { return root_node == 0; } - bool exists(int n) const { return n < alloc && node_nested[n]; } - int nested(int n) const { return exists(n) ? node_nested[n]:0; } - float weight(int n) const { return exists(n) ? node_weight[n]:0; } - bool complete(int n) const { return exists(n) ? 
node_complete[n]:false; } - - int root() const { return root_node; } - - void realloc(int n) { - /* - while (alloc <= n) { - node_nested.push_back(0); - node_weight.push_back(0); - node_complete.push_back(0); - alloc++; - } - */ - if (alloc <= n) { - int add = n - alloc + 1; - node_nested.insert(node_nested.end(), add, 0); - node_weight.insert(node_weight.end(), add, 0); - node_complete.insert(node_complete.end(), add, 0); - alloc = n+1; - } - } - - // tree navigation - bool terminal(int n) const { return n & 1; } // odd nodes are leaves. - int height(int n) const { - assert(n); - int h = 0; - while ((n & 1) == 0) { - assert(n > 0); - h++; n = n >> 1; - } - return h; - } - int left(int n) const { - int h = height(n); - //cout << "left of " << n << " is " << (n - (1 << h)) << std::endl; - return n - (1 << (h-1)); - } - int right(int n) const { - int h = height(n); - //cout << "right of " << n << " is " << (n + (1 << h)) << std::endl; - return n + (1 << (h-1)); - } - bool on_right(int n, int h = -1) const { - if (h < 0) h = height(n); - return n & (1 << (h+1)); - } - bool on_left(int n) const { return !on_right(n); } - int parent(int n) const { - int h = height(n); - if (on_right(n, h)) - return n - (1<0; t--) out << " "; - if (tree.root() == n) - out << "root "; - else { - if (tree.on_left(n)) - out << "left "; - else - out << "right "; - } - out << n << " : nested " << tree.nested(n) << " weight " << tree.weight(n); - if (tree.complete(n)) out << " complete"; - out << std::endl; - if (!tree.terminal(n)) { - if (tree.exists(tree.left(n))) - print_binary_tree_node(out, tree, tree.left(n), i+2); - if (tree.exists(tree.right(n))) - print_binary_tree_node(out, tree, tree.right(n), i+2); - } - } - - inline ostream& operator<<(ostream& out, const BinaryTree& tree) { - if (tree.empty()) - return out << "tree is empty"; - print_binary_tree_node(out, tree, tree.root(), 0); - return out; - } - -} - -#endif diff --git a/trunk/ceph/crush.old/Bucket.h 
b/trunk/ceph/crush.old/Bucket.h deleted file mode 100644 index 81a2576697bd7..0000000000000 --- a/trunk/ceph/crush.old/Bucket.h +++ /dev/null @@ -1,632 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __crush_BUCKET_H -#define __crush_BUCKET_H - -#include "BinaryTree.h" -#include "Hash.h" - -#include -#include -#include -#include -using namespace std; - -#include - -#include "include/buffer.h" - -namespace crush { - - - const int CRUSH_BUCKET_UNIFORM = 1; - const int CRUSH_BUCKET_TREE = 2; - const int CRUSH_BUCKET_LIST = 3; - const int CRUSH_BUCKET_STRAW = 4; - - /** abstract bucket **/ - class Bucket { - protected: - int id; - int parent; - int type; - float weight; - - public: - Bucket(int _type, - float _weight) : - id(0), parent(0), - type(_type), - weight(_weight) { } - - Bucket(bufferlist& bl, int& off) { - bl.copy(off, sizeof(id), (char*)&id); - off += sizeof(id); - bl.copy(off, sizeof(parent), (char*)&parent); - off += sizeof(parent); - bl.copy(off, sizeof(type), (char*)&type); - off += sizeof(type); - bl.copy(off, sizeof(weight), (char*)&weight); - off += sizeof(weight); - } - - virtual ~Bucket() { } - - virtual const char *get_bucket_type() const = 0; - virtual bool is_uniform() const = 0; - - int get_id() const { return id; } - int get_type() const { return type; } - float get_weight() const { return weight; } - int get_parent() const { return parent; } - virtual int get_size() const = 0; - - void set_id(int i) { id = i; } - void set_parent(int p) { parent = p; } - void set_weight(float w) { weight = w; } - - virtual void get_items(vector& i) const = 0; - virtual float 
get_item_weight(int item) const = 0; - virtual void add_item(int item, float w, bool back=false) = 0; - virtual void adjust_item_weight(int item, float w) = 0; - virtual void set_item_weight(int item, float w) { - adjust_item_weight(item, w - get_item_weight(item)); - } - - virtual int choose_r(int x, int r, Hash& h) const = 0; - - virtual void _encode(bufferlist& bl) = 0; - }; - - - - - /** uniform bucket **/ - class UniformBucket : public Bucket { - protected: - public: - vector items; - int item_type; - float item_weight; - - // primes - vector primes; - - int get_prime(int j) const { - return primes[ j % primes.size() ]; - } - void make_primes() { - if (items.empty()) return; - - //cout << "make_primes " << get_id() << " " << items.size() << endl; - Hash h(123+get_id()); - primes.clear(); - - // start with odd number > num_items - unsigned x = items.size() + 1; // this is the minimum! - x += h(items.size()) % (3*items.size()); // bump it up some - x |= 1; // make it odd - - while (primes.size() < items.size()) { - unsigned j; - for (j=2; j*j<=x; j++) - if (x % j == 0) break; - if (j*j > x) { - primes.push_back(x); - //cout << "prime " << x << endl; - } - x += 2; - } - } - - public: - UniformBucket(int _type, int _item_type) : - Bucket(_type, 0), - item_type(_item_type) { } - UniformBucket(int _type, int _item_type, - float _item_weight, vector& _items) : - Bucket(_type, _item_weight*_items.size()), - item_type(_item_type), - item_weight(_item_weight) { - items = _items; - make_primes(); - } - - UniformBucket(bufferlist& bl, int& off) : Bucket(bl, off) { - bl.copy(off, sizeof(item_type), (char*)&item_type); - off += sizeof(item_type); - bl.copy(off, sizeof(item_weight), (char*)&item_weight); - off += sizeof(item_weight); - ::_decode(items, bl, off); - make_primes(); - } - - void _encode(bufferlist& bl) { - char t = CRUSH_BUCKET_UNIFORM; - bl.append((char*)&t, sizeof(t)); - bl.append((char*)&id, sizeof(id)); - bl.append((char*)&parent, sizeof(parent)); - 
bl.append((char*)&type, sizeof(type)); - bl.append((char*)&weight, sizeof(weight)); - - bl.append((char*)&item_type, sizeof(item_type)); - bl.append((char*)&item_weight, sizeof(item_weight)); - - ::_encode(items, bl); - } - - const char *get_bucket_type() const { return "uniform"; } - bool is_uniform() const { return true; } - - int get_size() const { return items.size(); } - - // items - void get_items(vector& i) const { - i = items; - } - int get_item_type() const { return item_type; } - float get_item_weight(int item) const { return item_weight; } - - void add_item(int item, float w, bool back=false) { - if (items.empty()) - item_weight = w; - items.push_back(item); - weight += item_weight; - make_primes(); - } - - void adjust_item_weight(int item, float w) { - assert(0); - } - - int choose_r(int x, int r, Hash& hash) const { - //cout << "uniformbucket.choose_r(" << x << ", " << r << ")" << endl; - //if (r >= get_size()) cout << "warning: r " << r << " >= " << get_size() << " uniformbucket.size" << endl; - - unsigned v = hash(x, get_id());// % get_size(); - unsigned p = get_prime( hash(get_id(), x) ); // choose a prime based on hash(x, get_id(), 2) - unsigned s = (x + v + (r+1)*p) % get_size(); - return items[s]; - } - - }; - - - - - - // list bucket.. 
RUSH_P sorta - - class ListBucket : public Bucket { - protected: - list items; - list item_weight; - list sum_weight; - - public: - ListBucket(int _type) : Bucket(_type, 0) { } - - ListBucket(bufferlist& bl, int& off) : Bucket(bl, off) { - ::_decode(items, bl, off); - ::_decode(item_weight, bl, off); - ::_decode(sum_weight, bl, off); - } - - void _encode(bufferlist& bl) { - char t = CRUSH_BUCKET_LIST; - bl.append((char*)&t, sizeof(t)); - bl.append((char*)&id, sizeof(id)); - bl.append((char*)&parent, sizeof(parent)); - bl.append((char*)&type, sizeof(type)); - bl.append((char*)&weight, sizeof(weight)); - - ::_encode(items, bl); - ::_encode(item_weight, bl); - ::_encode(sum_weight, bl); - } - - const char *get_bucket_type() const { return "list"; } - bool is_uniform() const { return false; } - - int get_size() const { return items.size(); } - - void get_items(vector& i) const { - for (list::const_iterator it = items.begin(); - it != items.end(); - it++) - i.push_back(*it); - } - float get_item_weight(int item) const { - list::const_iterator i = items.begin(); - list::const_iterator w = item_weight.begin(); - while (i != items.end()) { - if (*i == item) return *w; - i++; w++; - } - assert(0); - return 0; - } - - void add_item(int item, float w, bool back=false) { - if (back) { - items.push_back(item); - item_weight.push_back(w); - sum_weight.clear(); - float s = 0.0; - for (list::reverse_iterator i = item_weight.rbegin(); - i != item_weight.rend(); - i++) { - s += *i; - sum_weight.push_front(s); - } - weight += w; - assert(weight == s); - } else { - items.push_front(item); - item_weight.push_front(w); - weight += w; - sum_weight.push_front(weight); - } - } - - void adjust_item_weight(int item, float dw) { - // find it - list::iterator p = items.begin(); - list::iterator pw = item_weight.begin(); - list::iterator ps = sum_weight.begin(); - - while (*p != item) { - *ps += dw; - p++; pw++; ps++; // next! 
- assert(p != items.end()); - } - - assert(*p == item); - *pw += dw; - *ps += dw; - } - - - int choose_r(int x, int r, Hash& h) const { - //cout << "linearbucket.choose_r(" << x << ", " << r << ")" << endl; - - list::const_iterator p = items.begin(); - list::const_iterator pw = item_weight.begin(); - list::const_iterator ps = sum_weight.begin(); - - while (p != items.end()) { - const int item = *p; - const float iw = *pw; - const float tw = *ps; - const float f = (float)(h(x, item, r, get_id()) % 10000) * tw / 10000.0; - //cout << "item " << item << " iw = " << iw << " tw = " << tw << " f = " << f << endl; - if (f < iw) { - //cout << "linearbucket.choose_r(" << x << ", " << r << ") = " << item << endl; - return item; - } - p++; pw++; ps++; // next! - } - assert(0); - return 0; - } - - - }; - - - - - // mixed bucket, based on RUSH_T type binary tree - - class TreeBucket : public Bucket { - protected: - //vector item_weight; - - // public: - BinaryTree tree; - map node_item; // node id -> item - vector node_item_vec; // fast version of above - map item_node; // item -> node id - map item_weight; - - public: - TreeBucket(int _type) : Bucket(_type, 0) { } - - TreeBucket(bufferlist& bl, int& off) : Bucket(bl, off) { - tree._decode(bl, off); - - ::_decode(node_item, bl, off); - ::_decode(node_item_vec, bl, off); - ::_decode(item_node, bl, off); - ::_decode(item_weight, bl, off); - } - - void _encode(bufferlist& bl) { - char t = CRUSH_BUCKET_TREE; - bl.append((char*)&t, sizeof(t)); - bl.append((char*)&id, sizeof(id)); - bl.append((char*)&parent, sizeof(parent)); - bl.append((char*)&type, sizeof(type)); - bl.append((char*)&weight, sizeof(weight)); - - tree._encode(bl); - - ::_encode(node_item, bl); - ::_encode(node_item_vec, bl); - ::_encode(item_node, bl); - ::_encode(item_weight, bl); - } - - const char *get_bucket_type() const { return "tree"; } - bool is_uniform() const { return false; } - - int get_size() const { return node_item.size(); } - - // items - void 
get_items(vector& i) const { - for (map::const_iterator it = node_item.begin(); - it != node_item.end(); - it++) - i.push_back(it->second); - } - float get_item_weight(int i) const { - assert(item_weight.count(i)); - return ((map)item_weight)[i]; - } - - - void add_item(int item, float w, bool back=false) { - item_weight[item] = w; - weight += w; - - unsigned n = tree.add_node(w); - node_item[n] = item; - item_node[item] = n; - - while (node_item_vec.size() <= n) - node_item_vec.push_back(0); - node_item_vec[n] = item; - } - - void adjust_item_weight(int item, float dw) { - // adjust my weight - weight += dw; - item_weight[item] += dw; - - // adjust tree weights - tree.adjust_node_weight(item_node[item], dw); - } - - int choose_r(int x, int r, Hash& h) const { - //cout << "mixedbucket.choose_r(" << x << ", " << r << ")" << endl; - int n = tree.root(); - while (!tree.terminal(n)) { - // pick a point in [0,w) - float w = tree.weight(n); - float f = (float)(h(x, n, r, get_id()) % 10000) * w / 10000.0; - - // left or right? - int l = tree.left(n); - if (tree.exists(l) && - f < tree.weight(l)) - n = l; - else - n = tree.right(n); - } - //assert(node_item.count(n)); - //return ((map)node_item)[n]; - return node_item_vec[n]; - } - }; - - - - - - // straw bucket.. new thing! 
- - class StrawBucket : public Bucket { - protected: - map item_weight; - map item_straw; - - list _items; - list _straws; - - public: - StrawBucket(int _type) : Bucket(_type, 0) { } - - StrawBucket(bufferlist& bl, int& off) : Bucket(bl, off) { - ::_decode(item_weight, bl, off); - calc_straws(); - } - - void _encode(bufferlist& bl) { - char t = CRUSH_BUCKET_TREE; - bl.append((char*)&t, sizeof(t)); - bl.append((char*)&id, sizeof(id)); - bl.append((char*)&parent, sizeof(parent)); - bl.append((char*)&type, sizeof(type)); - bl.append((char*)&weight, sizeof(weight)); - - ::_encode(item_weight, bl); - } - - const char *get_bucket_type() const { return "straw"; } - bool is_uniform() const { return false; } - - int get_size() const { return item_weight.size(); } - - - // items - void get_items(vector& i) const { - for (map::const_iterator it = item_weight.begin(); - it != item_weight.end(); - it++) - i.push_back(it->first); - } - float get_item_weight(int item) const { - assert(item_weight.count(item)); - return ((map)item_weight)[item]; - } - - void add_item(int item, float w, bool back=false) { - item_weight[item] = w; - weight += w; - calc_straws(); - } - - void adjust_item_weight(int item, float dw) { - //cout << "adjust " << item << " " << dw << endl; - weight += dw; - item_weight[item] += dw; - calc_straws(); - } - - - /* calculate straw lengths. - this is kind of ugly. not sure if there's a closed form way to calculate this or not! 
- */ - void calc_straws() { - //cout << get_id() << ": calc_straws ============" << endl; - - item_straw.clear(); - _items.clear(); - _straws.clear(); - - // reverse sort by weight; skip zero weight items - map > reverse; - for (map::iterator p = item_weight.begin(); - p != item_weight.end(); - p++) { - //cout << get_id() << ":" << p->first << " " << p->second << endl; - if (p->second > 0) { - //p->second /= minw; - reverse[p->second].insert(p->first); - } - } - - /* 1:2:7 - item_straw[0] = 1.0; - item_straw[1] = item_straw[0]*sqrt(1.0/.6); - item_straw[2] = item_straw[1]*2.0; - */ - - // work from low to high weights - float straw = 1.0; - float numleft = item_weight.size(); - float wbelow = 0.0; - float lastw = 0.0; - - map >::iterator next = reverse.begin(); - //while (next != reverse.end()) { - while (1) { - //cout << "hi " << next->first << endl; - map >::iterator cur = next; - - // set straw length for this set - for (set::iterator s = cur->second.begin(); - s != cur->second.end(); - s++) { - item_straw[*s] = straw; - //cout << "straw " << *s << " w " << item_weight[*s] << " -> " << straw << endl; - _items.push_back(*s); - _straws.push_back(straw); - } - - next++; - if (next == reverse.end()) break; - - wbelow += (cur->first-lastw) * numleft; - //cout << "wbelow " << wbelow << endl; - - numleft -= 1.0 * (float)cur->second.size(); - //cout << "numleft now " << numleft << endl; - - float wnext = numleft * (next->first - cur->first); - //cout << "wnext " << wnext << endl; - - float pbelow = wbelow / (wbelow+wnext); - //cout << "pbelow " << pbelow << endl; - - straw *= pow((double)(1.0/pbelow), (double)1.0/numleft); - - lastw = cur->first; - } - //cout << "============" << endl; - } - - int choose_r(int x, int r, Hash& h) const { - //cout << "strawbucket.choose_r(" << x << ", " << r << ")" << endl; - - float high_draw = -1; - int high = 0; - - list::const_iterator pi = _items.begin(); - list::const_iterator ps = _straws.begin(); - while (pi != _items.end()) { - 
const int item = *pi; - const float rnd = (float)(h(x, item, r) % 1000000) / 1000000.0; - const float straw = *ps * rnd; - - if (high_draw < 0 || - straw > high_draw) { - high = *pi; - high_draw = straw; - } - - pi++; - ps++; - } - return high; - } - }; - - - - - - inline Bucket* decode_bucket(bufferlist& bl, int& off) { - char t; - bl.copy(off, sizeof(t), (char*)&t); - off += sizeof(t); - - switch (t) { - case CRUSH_BUCKET_UNIFORM: - return new UniformBucket(bl, off); - case CRUSH_BUCKET_LIST: - return new ListBucket(bl, off); - case CRUSH_BUCKET_TREE: - return new TreeBucket(bl, off); - case CRUSH_BUCKET_STRAW: - return new StrawBucket(bl, off); - default: - assert(0); - } - return 0; - } - - - -} - - - - - - - - -#endif diff --git a/trunk/ceph/crush.old/Hash.h b/trunk/ceph/crush.old/Hash.h deleted file mode 100644 index 2f0d9e4db918b..0000000000000 --- a/trunk/ceph/crush.old/Hash.h +++ /dev/null @@ -1,301 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -// Robert Jenkins' function for mixing 32-bit values -// http://burtleburtle.net/bob/hash/evahash.html -// a, b = random bits, c = input and output -#define hashmix(a,b,c) \ - a=a-b; a=a-c; a=a^(c>>13); \ - b=b-c; b=b-a; b=b^(a<<8); \ - c=c-a; c=c-b; c=c^(b>>13); \ - a=a-b; a=a-c; a=a^(c>>12); \ - b=b-c; b=b-a; b=b^(a<<16); \ - c=c-a; c=c-b; c=c^(b>>5); \ - a=a-b; a=a-c; a=a^(c>>3); \ - b=b-c; b=b-a; b=b^(a<<10); \ - c=c-a; c=c-b; c=c^(b>>15); - -namespace crush { - - class Hash { - int seed; - - public: - int get_seed() { return seed; } - void set_seed(int s) { seed = s; } - - Hash(int s) { - unsigned int hash = 1315423911; - int x = 231232; - int y = 1232; - hashmix(s, x, hash); - hashmix(y, s, hash); - seed = s; - } - - inline int operator()(int a) { - unsigned int hash = seed ^ a; - int b = a; - int x = 231232; - int y = 1232; - hashmix(b, x, hash); - hashmix(y, a, hash); - return (hash & 0x7FFFFFFF); - } - - inline int operator()(int a, int b) { - unsigned int hash = seed ^ a ^ b; - int x = 231232; - int y = 1232; - hashmix(a, b, hash); - hashmix(x, a, hash); - hashmix(b, y, hash); - return (hash & 0x7FFFFFFF); - } - - inline int operator()(int a, int b, int c) { - unsigned int hash = seed ^ a ^ b ^ c; - int x = 231232; - int y = 1232; - hashmix(a, b, hash); - hashmix(c, x, hash); - hashmix(y, a, hash); - hashmix(b, x, hash); - hashmix(y, c, hash); - return (hash & 0x7FFFFFFF); - } - - inline int operator()(int a, int b, int c, int d) { - unsigned int hash = seed ^a ^ b ^ c ^ d; - int x = 231232; - int y = 1232; - hashmix(a, b, hash); - hashmix(c, d, hash); - hashmix(a, x, hash); - hashmix(y, b, hash); - hashmix(c, x, hash); - hashmix(y, d, hash); - return (hash & 0x7FFFFFFF); - } - - inline int operator()(int a, int b, int c, int d, int e) { - unsigned int hash = seed ^ a ^ b ^ c ^ d ^ e; - int x = 231232; - int y = 1232; - hashmix(a, b, hash); - hashmix(c, d, hash); - hashmix(e, x, hash); - hashmix(y, a, hash); - hashmix(b, x, hash); - 
hashmix(y, c, hash); - hashmix(d, x, hash); - hashmix(y, e, hash); - return (hash & 0x7FFFFFFF); - } - }; - -} - - - -#if 0 - - - //return myhash(a) ^ seed; - return myhash(a, seed); - } - int operator()(int a, int b) { - //return myhash( myhash(a) ^ myhash(b) ^ seed ); - return myhash(a, b, seed); - } - int operator()(int a, int b, int c) { - //return myhash( myhash(a ^ seed) ^ myhash(b ^ seed) ^ myhash(c ^ seed) ^ seed ); - return myhash(a, b, c, seed); - } - int operator()(int a, int b, int c, int d) { - //return myhash( myhash(a ^ seed) ^ myhash(b ^ seed) ^ myhash(c ^ seed) ^ myhash(d ^ seed) ^ seed ); - return myhash(a, b, c, d, seed); - } - - // ethan's rush hash? - if (0) - return (n ^ 0xdead1234) * (884811920 * 3 + 1); - - if (1) { - - // before - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - - //return hash; - return (hash & 0x7FFFFFFF); - } - - // JS - // a little better than RS - // + jenkin's mixing thing (which sucks on its own but helps tons here) - // best so far - if (1) { - unsigned int hash = 1315423911; - int a = 231232; - int b = 1232; - - for(unsigned int i = 0; i < 4; i++) - { - hash ^= ((hash << 5) + (n&255) + (hash >> 2)); - hashmix(a, b, hash); - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - - // Robert jenkins' 96 bit mix - // sucks - if (0) { - int c = n; - int a = 12378912; - int b = 2982827; - a=a-b; a=a-c; a=a^(c>>13); - b=b-c; b=b-a; b=b^(a<<8); - c=c-a; c=c-b; c=c^(b>>13); - a=a-b; a=a-c; a=a^(c>>12); - b=b-c; b=b-a; b=b^(a<<16); - c=c-a; c=c-b; c=c^(b>>5); - a=a-b; a=a-c; a=a^(c>>3); - b=b-c; b=b-a; b=b^(a<<10); - c=c-a; c=c-b; c=c^(b>>15); - return c; - } - // robert jenkins 32-bit - // sucks - if (0) { - n += (n << 12); - n ^= 
(n >> 22); - n += (n << 4); - n ^= (n >> 9); - n += (n << 10); - n ^= (n >> 2); - n += (n << 7); - n ^= (n >> 12); - return n; - } - - // djb2 - if (0) { - unsigned int hash = 5381; - for (int i=0; i<4; i++) { - hash = ((hash << 5) + hash) + ((n&255) ^ 123); - n = n >> 8; - } - return hash; - } - - - // SDBM - if (1) { - unsigned int hash = 0; - - for(unsigned int i = 0; i < 4; i++) - { - hash = (n&255) + (hash << 6) + (hash << 16) - hash; - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - // PJW - // horrid - if (0) { - unsigned int BitsInUnsignedInt = (unsigned int)(sizeof(unsigned int) * 8); - unsigned int ThreeQuarters = (unsigned int)((BitsInUnsignedInt * 3) / 4); - unsigned int OneEighth = (unsigned int)(BitsInUnsignedInt / 8); - unsigned int HighBits = (unsigned int)(0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth); - unsigned int hash = 0; - unsigned int test = 0; - - for(unsigned int i = 0; i < 4; i++) - { - hash = (hash << OneEighth) + (n&255); - - if((test = hash & HighBits) != 0) - { - hash = (( hash ^ (test >> ThreeQuarters)) & (~HighBits)); - } - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - // RS Hash function, from Robert Sedgwicks Algorithms in C book, w/ some changes. - if (0) { - unsigned int b = 378551; - unsigned int a = 63689; - unsigned int hash = 0; - - for(unsigned int i=0; i<4; i++) - { - hash = hash * a + (n&0xff); - a = a * b; - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - // DJB - // worse than rs - if (0) { - unsigned int hash = 5381; - - for(unsigned int i = 0; i < 4; i++) - { - hash = ((hash << 5) + hash) + (n&255); - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - // AP - // even worse - if (1) { - unsigned int hash = 0; - - for(unsigned int i = 0; i < 4; i++) - { - hash ^= ((i & 1) == 0) ? 
( (hash << 7) ^ (n&255) ^ (hash >> 3)) : - (~((hash << 11) ^ (n&255) ^ (hash >> 5))); - n = n >> 8; - } - - return (hash & 0x7FFFFFFF); - } - - -#endif diff --git a/trunk/ceph/crush.old/crush.h b/trunk/ceph/crush.old/crush.h deleted file mode 100644 index 376e7d9b3fc86..0000000000000 --- a/trunk/ceph/crush.old/crush.h +++ /dev/null @@ -1,543 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __crush_CRUSH_H -#define __crush_CRUSH_H - -#include -#include -#include -#include -#include -using std::set; -using std::map; -using std::vector; -using std::list; -#include -#include -using namespace __gnu_cxx; - - -#include "Bucket.h" - -#include "include/buffer.h" - - -namespace crush { - - - // *** RULES *** - - class RuleStep { - public: - int cmd; - vector args; - - RuleStep(int c) : cmd(c) {} - RuleStep(int c, int a) : cmd(c) { - args.push_back(a); - } - RuleStep(int c, int a, int b) : cmd(c) { - args.push_back(a); - args.push_back(b); - } - RuleStep(int o, int a, int b, int c) : cmd(o) { - args.push_back(a); - args.push_back(b); - args.push_back(c); - } - - void _encode(bufferlist& bl) { - bl.append((char*)&cmd, sizeof(cmd)); - ::_encode(args, bl); - } - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(cmd), (char*)&cmd); - off += sizeof(cmd); - ::_decode(args, bl, off); - } - }; - - - // Rule operations - const int CRUSH_RULE_TAKE = 0; - const int CRUSH_RULE_CHOOSE = 1; // first n by default - const int CRUSH_RULE_CHOOSE_FIRSTN = 1; - const int CRUSH_RULE_CHOOSE_INDEP = 2; - const int CRUSH_RULE_EMIT = 3; - - class Rule { - public: - vector< RuleStep > steps; - - void 
_encode(bufferlist& bl) { - int n = steps.size(); - bl.append((char*)&n, sizeof(n)); - for (int i=0; i buckets; - int bucketno; - Hash h; - - hash_map parent_map; // what bucket each leaf/bucket lives in - - public: - map rules; - - //map collisions; - //map bumps; - - void _encode(bufferlist& bl) { - // buckets - int n = buckets.size(); - bl.append((char*)&n, sizeof(n)); - for (map::const_iterator it = buckets.begin(); - it != buckets.end(); - it++) { - bl.append((char*)&it->first, sizeof(it->first)); - it->second->_encode(bl); - } - bl.append((char*)&bucketno, sizeof(bucketno)); - - // hash - int s = h.get_seed(); - bl.append((char*)&s, sizeof(s)); - - //::_encode(out, bl); - //::_encode(overload, bl); - - // rules - n = rules.size(); - bl.append((char*)&n, sizeof(n)); - for(map::iterator it = rules.begin(); - it != rules.end(); - it++) { - bl.append((char*)&it->first, sizeof(it->first)); - it->second._encode(bl); - } - - } - - void _decode(bufferlist& bl, int& off) { - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i::iterator bp = buckets.begin(); - bp != buckets.end(); - ++bp) { - // index bucket items - vector items; - bp->second->get_items(items); - for (vector::iterator ip = items.begin(); - ip != items.end(); - ++ip) - parent_map[*ip] = bp->first; - } - } - - - - public: - Crush(int seed=123) : bucketno(-1), h(seed) {} - ~Crush() { - // hose buckets - for (map::iterator it = buckets.begin(); - it != buckets.end(); - it++) { - delete it->second; - } - } - - int print(ostream& out, int root, int indent=0) { - for (int i=0; iget_weight() << "\t" << b->get_id() << "\t"; - for (int i=0; iget_bucket_type() << ": "; - - vector items; - b->get_items(items); - - if (buckets.count(items[0])) { - out << std::endl; - for (unsigned i=0; iset_id(n); - buckets[n] = b; - return n; - } - - void add_item(int parent, int item, float w, bool back=false) { - // add item - assert(!buckets[parent]->is_uniform()); - Bucket *p = buckets[parent]; - 
- p->add_item(item, w, back); - - // set item's parent - Bucket *n = buckets[item]; - if (n) - n->set_parent(parent); - - // update weights - while (buckets.count(p->get_parent())) { - int child = p->get_id(); - p = buckets[p->get_parent()]; - p->adjust_item_weight(child, w); - } - } - - - /* - this is a hack, fix me! weights should be consistent throughout hierarchy! - - */ - void set_bucket_weight(int item, float w) { - Bucket *b = buckets[item]; - float adj = w - b->get_weight(); - - while (buckets.count(b->get_parent())) { - Bucket *p = buckets[b->get_parent()]; - p->adjust_item_weight(b->get_id(), adj); - b = p; - } - } - - - /* - * choose numrep distinct items of type type - */ - void choose(int x, - int numrep, - int type, - Bucket *inbucket, - vector& outvec, - bool firstn, - set& outset, map& overloadmap, - bool forcefeed=false, - int forcefeedval=-1) { - int off = outvec.size(); - - // for each replica - for (int rep=0; repis_uniform()) { - // uniform bucket; be careful! - if (firstn || numrep >= in->get_size()) { - // uniform bucket is too small; just walk thru elements - r += ftotal; // r' = r + f_total (first n) - } else { - // make sure numrep is not a multple of bucket size - int add = numrep*flocal; // r' = r + n*f_local - if (in->get_size() % numrep == 0) { - add += add/in->get_size(); // shift seq once per pass through the bucket - } - r += add; - } - } else { - // mixed bucket; just make a distinct-ish r sequence - if (firstn) - r += ftotal; // r' = r + f_total - else - r += numrep * flocal; // r' = r + n*f_local - } - - // choose - outv = in->choose_r(x, r, h); - - // did we get the type we want? - int itemtype = 0; // 0 is terminal type - Bucket *newin = 0; // remember bucket we hit - if (in->is_uniform()) { - itemtype = ((UniformBucket*)in)->get_item_type(); - } else { - if (buckets.count(outv)) { // another bucket - newin = buckets[outv]; - itemtype = newin->get_type(); - } - } - if (itemtype == type) { // this is what we want! 
- // collision? - bool collide = false; - for (int prep=0; prep overloadmap[outv]) - bad = true; - } - - if (collide || bad) { - ftotal++; - flocal++; - - if (collide && flocal < 3) - continue; // try locally a few times! - - if (ftotal >= 10) { - // ok fine, just ignore dup. FIXME. - skip_rep = true; - break; - } - - retry_rep = true; - } - - break; // ok then! - } - - // next - in = newin; - } - - if (retry_rep) continue; // try again - - break; - } - - // skip this rep? (e.g. too many collisions, we give up) - if (skip_rep) continue; - - // output this value - outvec.push_back(outv); - } // for rep - - // double check! - if (0) { - for (unsigned i=1; i& result, - set& outset, map& overloadmap, - int forcefeed=-1) { - //int numresult = 0; - result.clear(); - - // determine hierarchical context for forcefeed (if any) - list force_stack; - if (forcefeed >= 0 && parent_map.count(forcefeed)) { - int t = forcefeed; - while (1) { - force_stack.push_front(t); - //cout << "push " << t << " onto force_stack" << std::endl; - if (parent_map.count(t) == 0) break; // reached root, presumably. - //cout << " " << t << " parent is " << parent_map[t] << std::endl; - t = parent_map[t]; - } - } - - // working vector - vector w; // working variable - - // go through each statement - for (vector::iterator pc = rule.steps.begin(); - pc != rule.steps.end(); - pc++) { - // move input? 
- - // do it - switch (pc->cmd) { - case CRUSH_RULE_TAKE: - { - const int arg = pc->args[0]; - //cout << "take " << arg << std::endl; - - if (!force_stack.empty()) { - assert(force_stack.front() == arg); - force_stack.pop_front(); - } - - w.clear(); - w.push_back(arg); - } - break; - - case CRUSH_RULE_CHOOSE_FIRSTN: - case CRUSH_RULE_CHOOSE_INDEP: - { - const bool firstn = pc->cmd == CRUSH_RULE_CHOOSE_FIRSTN; - const int numrep = pc->args[0]; - const int type = pc->args[1]; - - //cout << "choose " << numrep << " of type " << type << std::endl; - - assert(!w.empty()); - - // reset output - vector out; - - // forcefeeding? - bool forcing = false; - int forceval = -1; - if (!force_stack.empty()) { - forceval = force_stack.front(); - force_stack.pop_front(); - //cout << "priming out with " << forceval << std::endl; - forcing = true; - } else if (forcefeed >= 0 && type == 0) { - //cout << "forcing context-less " << forcefeed << std::endl; - forceval = forcefeed; - forcefeed = -1; - forcing = true; - } - - // do each row independently - for (vector::iterator i = w.begin(); - i != w.end(); - i++) { - assert(buckets.count(*i)); - Bucket *b = buckets[*i]; - choose(x, numrep, type, b, out, firstn, - outset, overloadmap, - forcing, - forceval); - forcing = false; // only once - } // for inrow - - // put back into w - w.swap(out); - out.clear(); - } - break; - - case CRUSH_RULE_EMIT: - { - for (unsigned i=0; i - -#include -#include -using namespace std; - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int n, float f, int buckettype) -{ - Hash 
h(73232313); - - // crush - Crush c; - - int ndisks = 0; - - // bucket - Bucket *b; - if (buckettype == 0) - b = new TreeBucket(1); - else if (buckettype == 1 || buckettype == 2) - b = new ListBucket(1); - else if (buckettype == 3) - b = new StrawBucket(1); - else if (buckettype == 4) - b = new UniformBucket(0,0); - - for (int i=0; iadd_item(ndisks++,1); - - c.add_bucket(b); - int root = b->get_id(); - - //c.print(cout,root); - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 1000; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - - // ORIGINAL - place(c, rule, numpg, numrep, placement1); - - int olddisks = ndisks; - - // add item - if (buckettype == 2) { - // start over! 
- ndisks = 0; - b = new ListBucket(1); - for (int i=0; i<=n; i++) - b->add_item(ndisks++,1); - c.add_bucket(b); - root = b->get_id(); - - rule.steps.clear(); - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - } - else - b->add_item(ndisks++, 1); - - - // ADDED - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; x<=numpg; x++) - if (placement1[x] != placement2[x]) - for (int j=0; j - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - //Bucket *b = new MixedBucket(h+1); - Bucket *b = new StrawBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -float go(int dep) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - if (0) { - for (int d=0; dadd_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 10000, disks); - Hash h(123); - b->make_primes(h); - root = c.add_bucket(b); - } - - - - // rule - int numrep = 1; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = 
pg_per*ndisks/numrep; - - vector ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 100000; - int times = place / numpg; - if (!times) times = 1; - - cout << "#looping " << times << " times" << endl; - - float tvar = 0; - int tvarnum = 0; - - int x = 0; - for (int t=0; t v(numrep); - - for (int z=0; z - -#include -#include -using namespace std; - -int buckettype = 0; - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - //Bucket *b = new TreeBucket(h+1); - //Bucket *b = new ListBucket(h+1); - //Bucket *b = new StrawBucket(h+1); - Bucket *b; - if (buckettype == 0) - b = new TreeBucket(h+1); - else if (buckettype == 1 || buckettype == 2) - b = new ListBucket(h+1); - else if (buckettype == 3) - b = new StrawBucket(h+1); - - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != 
ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks, int add, int modifydepth) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); - for (int d=1; d > buckets; - - root = make_hierarchy(c, wid, buckets, ndisks); - - //c.print(cout,root); - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - - // ORIGINAL - place(c, rule, numpg, numrep, placement1); - - int olddisks = ndisks; - - // add disks - //cout << " adding " << add << " disks" << endl; - vector disks; - for (int i=0; imake_primes(h); - - //Bucket *o = buckets[2].back(); - Bucket *o; - if (buckettype == 2) - o = buckets[modifydepth].front(); - else - o = buckets[modifydepth].back(); - - c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - c.add_item(o->get_id(), b->get_id(), b->get_weight()); - //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); - //newbucket = b; - - - // ADDED - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; x<=numpg; x++) - if (placement1[x] != placement2[x]) - for (int j=0; j - -#include -#include -using namespace std; - - -int buckettype = 2; // 0 = mixed, 1 = linear, 2 = straw - -int big_one_skip = 255; -int big_one_size; -Bucket *big_one = 0; - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, 
int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - - int s = wid[h]; - if (big_one_skip > 0) - big_one_skip--; - if (!big_one_skip && !big_one) - s = big_one_size; - - - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks " << disks.size()<< endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - Bucket *b; - if (buckettype == 0) - b = new TreeBucket(h+1); - else if (buckettype == 1) - b = new ListBucket(h+1); - else if (buckettype == 2) - b = new StrawBucket(h+1); - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks, int add) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); - for (int d=1; d > buckets; - - big_one_size = add; - big_one = 0; - - //cout << "making tree" << endl; - root = make_hierarchy(c, wid, buckets, ndisks); - - //c.print(cout, root); - - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - 
rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - int olddisks = ndisks; - - - place(c, rule, numpg, numrep, placement1); - - if (1) { - // remove disks - assert(big_one); - c.adjust_item(big_one->get_id(), 0); - } - - int newdisks = ndisks - add; - - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; x<=numpg; x++) - if (placement1[x] != placement2[x]) - for (int j=0; j >::iterator i = r.begin(); - i != r.end(); - i++) { - cout << i->first; - for (map::iterator j = i->second.begin(); - j != i->second.end(); - j++) - cout << "\t" << j->first << "\t" << j->second; - cout << endl; - } - */ -} - diff --git a/trunk/ceph/crush.old/test/cluster_movement_rush.cc b/trunk/ceph/crush.old/test/cluster_movement_rush.cc deleted file mode 100644 index 90cc197c24f65..0000000000000 --- a/trunk/ceph/crush.old/test/cluster_movement_rush.cc +++ /dev/null @@ -1,218 +0,0 @@ - - -#include "../crush.h" -using namespace crush; - -#include - -#include -#include -using namespace std; - -int buckettype = 0; - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - //Bucket *b = new TreeBucket(h+1); - //Bucket *b = new ListBucket(h+1); - //Bucket *b = new StrawBucket(h+1); - Bucket *b; - if (buckettype == 0) - b = new TreeBucket(h+1); - else 
if (buckettype == 1 || buckettype == 2) - b = new ListBucket(h+1); - else if (buckettype == 3) - b = new StrawBucket(h+1); - - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks, int add, int modifydepth) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); - for (int d=1; d > buckets; - - root = make_hierarchy(c, wid, buckets, ndisks); - - //c.print(cout,root); - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - - // ORIGINAL - place(c, rule, numpg, numrep, placement1); - - int olddisks = ndisks; 
- - // add disks - //cout << " adding " << add << " disks" << endl; - vector disks; - for (int i=0; imake_primes(h); - - //Bucket *o = buckets[2].back(); - Bucket *o; - if (buckettype == 2) - o = buckets[modifydepth].front(); - else - o = buckets[modifydepth].back(); - - c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - c.add_item(o->get_id(), b->get_id(), b->get_weight(), buckettype == 2); - //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); - //newbucket = b; - - - // ADDED - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; x<=numpg; x++) - if (placement1[x] != placement2[x]) - for (int j=0; j - -#include -#include -using namespace std; - - -Clock g_clock; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks weight " << w << endl; - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - - -float go(int dep, int failpc) -{ - Hash h(73232313); - - //int overloadcutoff = (int)((float)10000.0 / (float)utilization); - - //cout << "util " << utilization << " cutoff " << overloadcutoff << endl; - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - for (int d=0; d ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 1000000; - int times = place / numpg; - if (!times) times = 1; - - - //cout 
<< "looping " << times << " times" << endl; - - float tavg[10]; - float tvar[10]; - for (int j=0;j<10;j++) { - tvar[j] = 0; - tavg[j] = 0; - } - int tvarnum = 0; - float trvar = 0.0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - float aslowdown = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.out.clear(); - - for (int z=0; z= ndisks) cout << "v[i] " << i << " is " << v[i] << " .. x = " << x << endl; - //assert(v[i] < ndisks); - ocount[v[i]]++; - } - } - utime_t t1b = g_clock.now(); - - // add in numf failed disks - for (int f = 0; f < numf; f++) { - int d = rand() % ndisks; - while (c.out.count(d)) d = rand() % ndisks; - c.out.insert(d); - } - - utime_t t3a = g_clock.now(); - for (int x=xs; x - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - MixedBucket *b = new MixedBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -Bucket *make_random(Crush& c, int wid, int height, int& ndisks) -{ - int w = rand() % (wid-1) + 2; - - if (height == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - int h = rand() % height + 1; - MixedBucket *b = new MixedBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return 
b; - } - -} - - -float go(int dep, int overloadcutoff) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - for (int d=0; dget_id(); - //c.print(cout, root); - } - if (0) { - MixedBucket *b = new MixedBucket(1); - for (int i=0; i<10000; i++) - b->add_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 10000, disks); - Hash h(123); - b->make_primes(h); - root = c.add_bucket(b); - } - //cout << ndisks << " disks" << endl; - - - - // rule - int numrep = 1; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 1000000; - int times = place / numpg; - if (!times) times = 1; - - - //cout << "looping " << times << " times" << endl; - - float tvar = 0; - int tvarnum = 0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.overload.clear(); - - for (int z=0; z overloadcutoff) - overloaded++; - - if (ocount[i] > 100+(overloadcutoff-100)/2) { - adjusted++; - c.overload[i] = 100.0 / (float)ocount[i]; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - //cout << overloaded << " overloaded" << endl; - overloadsum += (float)overloaded / (float)ndisks; - adjustsum += (float)adjusted / (float)ndisks; - - - for (int x=xs; x overloadcutoff) { - still++; - //c.overload[ocount[i]] = 100.0 / (float)ocount[i]; - //cout << "disk " << 
i << " has " << ocount[i] << endl; - } - } - //if (still) cout << "overload was " << overloaded << " now " << still << endl; - afteroverloadsum += (float)still / (float)ndisks; - - //cout << "collisions: " << c.collisions << endl; - //cout << "r bumps: " << c.bumps << endl; - - float avg = 0.0; - for (int i=0; i100; d -= 5) { - float var = go(3,d); - //cout << "## depth = " << d << endl; - //cout << d << "\t" << var << endl; - } -} diff --git a/trunk/ceph/crush.old/test/depth_variance.cc b/trunk/ceph/crush.old/test/depth_variance.cc deleted file mode 100644 index 7d60ebaae9501..0000000000000 --- a/trunk/ceph/crush.old/test/depth_variance.cc +++ /dev/null @@ -1,185 +0,0 @@ - - -#include "../crush.h" -using namespace crush; - -#include - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -float go(int dep) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - if (1) { - for (int d=0; d ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 100000; - int times = place / numpg; - if (!times) times = 1; - - cout << "#looping " << times << " times" << endl; - - float tvar = 0; - int tvarnum = 0; - float tavg = 0; - - int x = 0; - for (int t=0; 
t v(numrep); - - for (int z=0; z - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks weight " << w << endl; - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - //Bucket *b = new StrawBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - - -float go(int dep, int overloadcutoff) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - for (int d=0; d ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 100000; - int times = place / numpg; - if (!times) times = 1; - - - //cout << "looping " << times << " times" << endl; - - float tavg[10]; - float tvar[10]; - for (int j=0;j<10;j++) { - tvar[j] = 0; - tavg[j] = 0; - } - int tvarnum = 0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.overload.clear(); - - for (int z=0; z cutoff) - overloaded++; - - if (ocount[i] > adjoff) { - adjusted++; - c.overload[i] = (float)target / (float)ocount[i]; - //cout << "setting overload " << i << " to " << c.overload[i] << endl; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - //cout << overloaded << " overloaded" << endl; - overloadsum += (float)overloaded / (float)ndisks; - adjustsum += (float)adjusted / 
(float)ndisks; - - - - if (1) { - // second pass - for (int x=xs; x= adjoff) { - adjusted++; - if (c.overload.count(i) == 0) { - c.overload[i] = 1.0; - adjusted++; - } - //else cout << "(re)adjusting " << i << endl; - c.overload[i] *= (float)target / (float)ocount[i]; - //cout << "setting overload " << i << " to " << c.overload[i] << endl; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - } - - for (int x=xs; x cutoff) { - still++; - //c.overload[ocount[i]] = 100.0 / (float)ocount[i]; - if (c.overload.count(i)) cout << "[adjusted] "; - cout << "disk " << i << " has " << ocount[i] << endl; - } - } - //if (still) cout << "overload was " << overloaded << " now " << still << endl; - afteroverloadsum += (float)still / (float)ndisks; - - //cout << "collisions: " << c.collisions << endl; - //cout << "r bumps: " << c.bumps << endl; - - int n = ndisks/10; - float avg[10]; - float var[10]; - for (int i=0;i<10;i++) { - int s = n*i; - avg[i] = 0.0; - for (int j=0; j100; d -= 5) { - float var = go(3,d); - //cout << "## depth = " << d << endl; - //cout << d << "\t" << var << endl; - } -} diff --git a/trunk/ceph/crush.old/test/movement.cc b/trunk/ceph/crush.old/test/movement.cc deleted file mode 100644 index 2621f09457fe6..0000000000000 --- a/trunk/ceph/crush.old/test/movement.cc +++ /dev/null @@ -1,223 +0,0 @@ - - -#include "../crush.h" -using namespace crush; - -#include - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - MixedBucket *b = new MixedBucket(h+1); - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << 
b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); - for (int d=1; d > buckets; - - if (1) { - root = make_hierarchy(c, wid, buckets, ndisks); - } - if (0) { - MixedBucket *b = new MixedBucket(1); - for (int i=0; i<10000; i++) - b->add_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 1, disks); - Hash h(123); - b->make_primes(h); - root = c.add_bucket(b); - } - - - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - place(c, rule, numpg, numrep, placement1); - 
- if (1) { - // failed - - //for (int i=500; i<1000; i++) - //c.failed.insert(i); - c.failed.insert(0); - } - - int olddisks = ndisks; - - if (1) { - int n = udisks; - //cout << " adding " << n << " disks" << endl; - vector disks; - for (int i=0; imake_primes(h); - Bucket *o = buckets[1].back(); - c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - c.add_item(o->get_id(), b->get_id(), b->get_weight()); - //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); - } - - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - int moved = 0; - for (int x=1; x<=numpg; x++) { - if (placement1[x] != placement2[x]) { - for (int j=0; j v; - cout << depth; - for (int branching = 3; branching < 16; branching += 1) { - float fac = testmovement(depth, branching, udisks); - v.push_back(fac); - int n = udisks * pow((float)branching, (float)depth-1); - cout << "\t" << n; - cout << "\t" << fac; - } - //for (int i=0; i - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - MixedBucket *b = new MixedBucket(h+1); - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) -{ - vector v(numrep); - map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; 
- c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i::iterator it = ocount.begin(); - it != ocount.end(); - it++) - cout << it->first << "\t" << it->second << endl; - -} - - -float testmovement(int depth, int branching, int udisks) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - wid.push_back(udisks); - for (int d=1; d > buckets; - - if (1) { - root = make_hierarchy(c, wid, buckets, ndisks); - } - if (0) { - MixedBucket *b = new MixedBucket(1); - for (int i=0; i<10000; i++) - b->add_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 1, disks); - Hash h(123); - b->make_primes(h); - root = c.add_bucket(b); - } - - - - // rule - int numrep = 2; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - /* - cout << ndisks << " disks, " << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - */ - map > placement1, placement2; - - //c.print(cout, root); - - place(c, rule, numpg, numrep, placement1); - - float over = .5; - - if (1) { - // failed - - //for (int i=500; i<1000; i++) - //c.failed.insert(i); - //c.failed.insert(0); - c.overload[0] = over; - } - - int olddisks = ndisks; - - - - if (0) { - int n = udisks; - //cout << " adding " << n << " disks" << endl; - vector disks; - for (int i=0; imake_primes(h); - Bucket *o = buckets[1].back(); - c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - c.add_item(o->get_id(), b->get_id(), 
b->get_weight()); - //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); - } - - //c.print(cout, root); - place(c, rule, numpg, numrep, placement2); - - vector moved(ndisks); - - //int moved = 0; - for (int d=0; d::iterator it = placement1[d].begin(); - it != placement1[d].end(); - it++) { - placement2[d].erase(*it); - } - } - - float avg = 0; - for (int d=0; d v; - cout << depth; - for (int branching = 3; branching < 16; branching += 1) { - float fac = testmovement(depth, branching, udisks); - v.push_back(fac); - int n = udisks * pow((float)branching, (float)depth-1); - //cout << "\t" << n; - //cout << "\t" << fac; - } - //for (int i=0; i - -#include -#include -using namespace std; - - -Clock g_clock; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks weight " << w << endl; - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - - -float go(int dep, int utilization ) -{ - Hash h(73232313); - - int overloadcutoff = (int)((float)10000.0 / (float)utilization); - - //cout << "util " << utilization << " cutoff " << overloadcutoff << endl; - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - for (int d=0; d ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 100000; - int times = place / numpg; - if (!times) times = 1; - - - //cout << "looping " << times << " 
times" << endl; - - float tavg[10]; - float tvar[10]; - for (int j=0;j<10;j++) { - tvar[j] = 0; - tavg[j] = 0; - } - int tvarnum = 0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - float aslowdown = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.overload.clear(); - - for (int z=0; z cutoff) - overloaded++; - - if (ocount[i] > adjoff) { - adjusted++; - c.overload[i] = (float)target / (float)ocount[i]; - //cout << "setting overload " << i << " to " << c.overload[i] << endl; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - //cout << overloaded << " overloaded" << endl; - overloadsum += (float)overloaded / (float)ndisks; - adjustsum += (float)adjusted / (float)ndisks; - - - - // keep adjusting! - for (int bla=0; bla<5; bla++) { - utime_t t2a = g_clock.now(); - - // second pass - for (int x=xs; x= adjoff) { - numover++; - if (c.overload.count(i) == 0) { - c.overload[i] = 1.0; - adjusted++; - } - //else cout << "(re)adjusting " << i << endl; - c.overload[i] *= (float)target / (float)ocount[i]; - //cout << "setting overload " << i << " to " << c.overload[i] << endl; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - if (!numover) break; - cout << "readjusting" << endl; - } - - utime_t t3a = g_clock.now(); - - for (int x=xs; x cutoff) { - still++; - //c.overload[ocount[i]] = 100.0 / (float)ocount[i]; - if (c.overload.count(i)) cout << "[adjusted] "; - cout << "disk " << i << " has " << ocount[i] << endl; - } - } - //if (still) cout << "overload was " << overloaded << " now " << still << endl; - afteroverloadsum += (float)still / (float)ndisks; - - //cout << "collisions: " << c.collisions << endl; - //cout << "r bumps: " << c.bumps << endl; - - int n = ndisks/10; - float avg[10]; - float var[10]; - for (int i=0;i<10;i++) { - int s = n*i; - avg[i] = 0.0; - for (int j=0; j - -#include -#include -using namespace std; - - -Bucket 
*make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - MixedBucket *b = new MixedBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -Bucket *make_random(Crush& c, int wid, int height, int& ndisks) -{ - int w = rand() % (wid-1) + 2; - - if (height == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - int h = rand() % height + 1; - MixedBucket *b = new MixedBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } - -} - - -float go(int dep, int overloadcutoff) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - for (int d=0; dget_id(); - //c.print(cout, root); - } - if (0) { - MixedBucket *b = new MixedBucket(1); - for (int i=0; i<10000; i++) - b->add_item(ndisks++, 10); - root = c.add_bucket(b); - } - if (0) { - vector disks; - for (int i=0; i<10000; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 10000, disks); - Hash h(123); - b->make_primes(h); - root = c.add_bucket(b); - } - //cout << ndisks << " disks" << endl; - - - - // rule - int numrep = 1; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - 
rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - //cout << ndisks << " disks, " << endl; - //cout << pg_per << " pgs per disk" << endl; - // cout << numpg << " logical pgs" << endl; - //cout << "numrep is " << numrep << endl; - - - int place = 1000000; - int times = place / numpg; - if (!times) times = 1; - - - //cout << "looping " << times << " times" << endl; - - float tvar = 0; - int tvarnum = 0; - - float overloadsum = 0.0; - float adjustsum = 0.0; - float afteroverloadsum = 0.0; - int chooses = 0; - int xs = 1; - for (int t=0; t v(numrep); - - c.overload.clear(); - - for (int z=0; z overloadcutoff) - overloaded++; - - if (ocount[i] > 100+(overloadcutoff-100)/2) { - adjusted++; - c.overload[i] = 100.0 / (float)ocount[i]; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - ocount[i] = 0; - } - //cout << overloaded << " overloaded" << endl; - overloadsum += (float)overloaded / (float)ndisks; - adjustsum += (float)adjusted / (float)ndisks; - - - for (int x=xs; x overloadcutoff) { - still++; - //c.overload[ocount[i]] = 100.0 / (float)ocount[i]; - //cout << "disk " << i << " has " << ocount[i] << endl; - } - } - //if (still) cout << "overload was " << overloaded << " now " << still << endl; - afteroverloadsum += (float)still / (float)ndisks; - - //cout << "collisions: " << c.collisions << endl; - //cout << "r bumps: " << c.bumps << endl; - - float avg = 0.0; - for (int i=0; i100; d -= 5) { - float var = go(3,d); - //cout << "## depth = " << d << endl; - //cout << d << "\t" << var << endl; - } -} diff --git a/trunk/ceph/crush.old/test/sizes.cc b/trunk/ceph/crush.old/test/sizes.cc deleted file mode 100644 index cc5780218210a..0000000000000 --- a/trunk/ceph/crush.old/test/sizes.cc +++ /dev/null @@ -1,131 +0,0 @@ - -#include "include/types.h" -#include "include/Distribution.h" -#include "osd/OSDMap.h" - - -Distribution file_size_distn; //kb - - 
-list object_queue; -int max_object_size = 1024*1024*100; //kb - -off_t no; - -int get_object() //kb -{ - if (object_queue.empty()) { - int max = file_size_distn.sample(); - no++; - int filesize = max/2 + (rand() % 100) * max/200 + 1; - //cout << "file " << filesize << endl; - while (filesize > max_object_size) { - object_queue.push_back(max_object_size); - filesize -= max_object_size; - } - object_queue.push_back(filesize); - } - int s = object_queue.front(); - object_queue.pop_front(); - //cout << "object " << s << endl; - return s; -} - -void getdist(vector& v, float& avg, float& var) -{ - avg = 0.0; - for (int i=0; i pgs(n); - off_t did = 0; - - no = 0; - while (did < dist) { - off_t s = get_object(); - pgs[rand()%n] += s; - did += s; - } - while (!object_queue.empty()) - pgs[rand()%n] += get_object(); - - numo = no; - //cout << did/n << endl; - - //for (int i=0; i - -#include -#include -using namespace std; - - -Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - buckets[h].push_back(b); - return b; - } else { - // mixed - Bucket *b = new TreeBucket(h+1); - c.add_bucket(b); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - n->set_parent(b->get_id()); - } - buckets[h].push_back(b); - //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); - return b->get_id(); -} - - -void place(Crush& c, Rule& rule, int numpg, int numrep, vector& ocount) -{ - vector v(numrep); - //map ocount; - - for (int x=1; x<=numpg; x++) { - - //cout << H(x) << "\t" << h(x) << endl; - c.do_rule(rule, x, v); - //cout << "v = " << v << endl;// " " << v[0] << " " 
<< v[1] << " " << v[2] << endl; - - bool bad = false; - for (int i=0; i wid; - wid.push_back(10); - wid.push_back(2); - - map< int, list > buckets; - root = make_hierarchy(c, wid, buckets, ndisks); - - // add small bucket - vector disks; - for (int i=0; i<3; i++) - disks.push_back(ndisks++); - UniformBucket *b = new UniformBucket(1, 0, 1, disks); - b->make_primes(h); - Bucket *o = buckets[1].back(); - c.add_bucket(b); - //cout << " adding under " << o->get_id() << endl; - c.add_item(o->get_id(), b->get_id(), b->get_weight()); - - - // rule - int numrep = 6; - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - //c.overload[10] = .1; - - int pg_per = 10000; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - - c.print(cout, root); - - place(c, rule, numpg, numrep, ocount); - - for (int i=0; i - -#include -#include -using namespace std; - - -int numrep = 1; - - -double go(int n, int bucket) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - Bucket *b; - vector items; - if (bucket == 0) b = new UniformBucket(1,0,10,items); - if (bucket == 1) b = new TreeBucket(1); - if (bucket == 2) b = new ListBucket(1); - if (bucket == 3) b = new StrawBucket(1); - - for (int d=0; dadd_item(ndisks++, 1); - - //if (!bucket) ((UniformBucket*)b)->make_primes(h); - - root = c.add_bucket(b); - - // rule - Rule rule; - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - - - int place = 1000000; - - - vector v(numrep); - set out; - map overload; - - utime_t start = g_clock.now(); - - for (int x=1; x <= place; x++) - c.do_rule(rule, x, v, out, overload); - - utime_t end = g_clock.now(); - - end -= start; - double el = (double)end; - - //cout << "\t" << ndisks; - - return el; -} 
- - -int main() -{ - - for (int n=4; n<=50; n += 4) { - cout << n; - for (int b=0; b<4; b++) { - double el = go(n,b); - cout << "\t" << el; - } - cout << endl; - } -} diff --git a/trunk/ceph/crush.old/test/speed_depth.cc b/trunk/ceph/crush.old/test/speed_depth.cc deleted file mode 100644 index 32275d16d2b31..0000000000000 --- a/trunk/ceph/crush.old/test/speed_depth.cc +++ /dev/null @@ -1,174 +0,0 @@ - -#include "../../common/Clock.h" -#include "../crush.h" -using namespace crush; - - -Clock g_clock; - -#include - -#include -#include -using namespace std; - - -int uniform = 10; -int branching = 10; -int buckettype = 0; -int numrep = 1; - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - Bucket *b; - if (buckettype == 0) - b = new TreeBucket(h+1); - else if (buckettype == 1 || buckettype == 2) - b = new ListBucket(h+1); - else if (buckettype == 3) - b = new StrawBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -double go(int dep, int per) -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - if (1) { - wid.push_back(uniform); - for (int d=1; d v(numrep); - - utime_t start = g_clock.now(); - - set out; - map overload; - - for (int x=1; x <= place; x++) - c.do_rule(rule, x, v, out, overload); - - utime_t end = g_clock.now(); - - end -= start; - double el = (double)end; - - //cout << "\t" << ndisks; - - return el; -} - - -int main() -{ - uniform = branching = 8; - - cout << "// dep\tuniform\tbranch\tndisks" 
<< endl; - - for (int d=2; d<=5; d++) { - cout << d;// << "\t" << branching; - cout << "\t" << uniform; - cout << "\t" << branching; - - int n = 1; - for (int i=0; i - -#include -#include -using namespace std; - - -int branching = 10; -bool linear = false; -int numrep = 1; - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - Bucket *b; - if (linear) - b = new ListBucket(h+1); - else - b = new TreeBucket(h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); - return b->get_id(); -} - - -double go(int s) -{ - int dep = 2; - Hash h(73232313); - - // crush - Crush c; - - - // buckets - int root = -1; - int ndisks = 0; - - vector wid; - if (1) { - //for (int d=0; d v(numrep); - - utime_t start = g_clock.now(); - - for (int x=1; x <= place; x++) - c.do_rule(rule, x, v); - - utime_t end = g_clock.now(); - - end -= start; - double el = (double)end; - - cout << "\t" << ndisks; - - return el; -} - - -int main() -{ - branching = 8; - - int d = 2; - numrep = 2; - - for (int s = 64; s <= 32768; s *= 8) { - cout << "t"; - linear = false; - double el = go(s, d); - cout << "\t" << el; - - cout << "\tp"; - linear = true; - el = go(s, d); - cout << "\t" << el; - - cout << endl; - } -} diff --git a/trunk/ceph/crush.old/test/t.cc b/trunk/ceph/crush.old/test/t.cc deleted file mode 100644 index 0785ef47d6c04..0000000000000 --- a/trunk/ceph/crush.old/test/t.cc +++ /dev/null @@ -1,25 +0,0 @@ - -#include "../../common/Clock.h" -#include "../crush.h" -using namespace crush; - - -Clock g_clock; - -#include - -#include -#include 
-using namespace std; - - -int branching = 10; -bool linear = false; -int numrep = 1; - -int main() { - - Bucket *b = new UniformBucket(1, 0); - //b = new TreeBucket(1); -} - diff --git a/trunk/ceph/crush.old/test/testbucket.cc b/trunk/ceph/crush.old/test/testbucket.cc deleted file mode 100644 index 065721c2c1967..0000000000000 --- a/trunk/ceph/crush.old/test/testbucket.cc +++ /dev/null @@ -1,61 +0,0 @@ - - -#include "../Bucket.h" -using namespace crush; - -#include -#include -using namespace std; - - -ostream& operator<<(ostream& out, vector& v) -{ - out << "["; - for (int i=0; i ocount(ndisks); - - vector v(numrep); - int nplace = 0; - for (int x=1; x<1000000; x++) { - //cout << H(x) << "\t" << h(x) << endl; - for (int i=0; i -#include -using namespace std; - - -void getdist(vector& v, float& avg, float& var) -{ - avg = 0.0; - for (int i=0; i a(n); - vector b(n); - - for (int i=0; i c(n); - for (int i=0; i>13); \ - b=b-c; b=b-a; b=b^(a<<8); \ - c=c-a; c=c-b; c=c^(b>>13); \ - a=a-b; a=a-c; a=a^(c>>12); \ - b=b-c; b=b-a; b=b^(a<<16); \ - c=c-a; c=c-b; c=c^(b>>5); \ - a=a-b; a=a-c; a=a^(c>>3); \ - b=b-c; b=b-a; b=b^(a<<10); \ - c=c-a; c=c-b; c=c^(b>>15); - -#define crush_hash_seed 1315423911 - -static __inline__ unsigned crush_hash32(unsigned a) { - unsigned hash = crush_hash_seed ^ a; - unsigned b = a; - unsigned x = 231232; - unsigned y = 1232; - hashmix(b, x, hash); - hashmix(y, a, hash); - return (hash & 0xFFFFFFFF); -} - -static __inline__ unsigned crush_hash32_2(unsigned a, unsigned b) { - unsigned hash = crush_hash_seed ^ a ^ b; - unsigned x = 231232; - unsigned y = 1232; - hashmix(a, b, hash); - hashmix(x, a, hash); - hashmix(b, y, hash); - return (hash & 0xFFFFFFFF); -} - -static __inline__ unsigned crush_hash32_3(unsigned a, unsigned b, unsigned c) { - unsigned int hash = crush_hash_seed ^ a ^ b ^ c; - unsigned x = 231232; - unsigned y = 1232; - hashmix(a, b, hash); - hashmix(c, x, hash); - hashmix(y, a, hash); - hashmix(b, x, hash); - hashmix(y, c, hash); 
- return (hash & 0xFFFFFFFF); -} - -static __inline__ unsigned crush_hash32_4(unsigned a, unsigned b, unsigned c, unsigned d) { - unsigned int hash = crush_hash_seed ^a ^ b ^ c ^ d; - unsigned x = 231232; - unsigned y = 1232; - hashmix(a, b, hash); - hashmix(c, d, hash); - hashmix(a, x, hash); - hashmix(y, b, hash); - hashmix(c, x, hash); - hashmix(y, d, hash); - return (hash & 0xFFFFFFFF); -} - -static __inline__ unsigned crush_hash32_5(unsigned a, unsigned b, unsigned c, unsigned d, unsigned e) { - unsigned int hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e; - unsigned x = 231232; - unsigned y = 1232; - hashmix(a, b, hash); - hashmix(c, d, hash); - hashmix(e, x, hash); - hashmix(y, a, hash); - hashmix(b, x, hash); - hashmix(y, c, hash); - hashmix(d, x, hash); - hashmix(y, e, hash); - return (hash & 0xFFFFFFFF); -} - -#endif diff --git a/trunk/ceph/crush/mapper.h b/trunk/ceph/crush/mapper.h deleted file mode 100644 index 6c74539b634ca..0000000000000 --- a/trunk/ceph/crush/mapper.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef _CRUSH_MAPPER_H -#define _CRUSH_MAPPER_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include "crush.h" - -extern int crush_do_rule(struct crush_map *map, - int ruleno, - int x, int *result, int result_max, - int forcefeed); /* -1 for none */ - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/trunk/ceph/crush/types.h b/trunk/ceph/crush/types.h deleted file mode 100644 index ffb208b2fec01..0000000000000 --- a/trunk/ceph/crush/types.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef _CRUSH_TYPES_H -#define _CRUSH_TYPES_H - -#ifdef KERNEL -# define free(x) kfree(x) -#else -# include -#endif - - -#include /* just for int types */ - -#ifndef BUG_ON -# include -# define BUG_ON(x) assert(!(x)) -#endif - -#endif diff --git a/trunk/ceph/csyn.cc b/trunk/ceph/csyn.cc deleted file mode 100644 index 562f00e3f861b..0000000000000 --- a/trunk/ceph/csyn.cc +++ /dev/null @@ -1,87 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: 
ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "client/SyntheticClient.h" -#include "client/Client.h" -#include "client/fuse.h" - -#include "msg/SimpleMessenger.h" - -#include "common/Timer.h" - -#ifndef DARWIN -#include -#endif // DARWIN - -#include -#include -#include - -int main(int argc, char **argv, char *envp[]) { - - //cerr << "cfuse starting " << myrank << "/" << world << std::endl; - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - parse_syn_options(args); // for SyntheticClient - - // args for fuse - vec_to_argv(args, argc, argv); - - if (g_conf.clock_tare) g_clock.tare(); - - // load monmap - MonMap monmap; - int r = monmap.read(".ceph_monmap"); - assert(r >= 0); - - // start up network - rank.start_rank(); - - list clients; - list synclients; - - cout << "mounting and starting " << g_conf.num_client << " syn client(s)" << std::endl; - for (int i=0; istart_thread(); - clients.push_back(client); - synclients.push_back(syn); - } - - cout << "waiting for client(s) to finish" << std::endl; - while (!clients.empty()) { - Client *client = clients.front(); - SyntheticClient *syn = synclients.front(); - clients.pop_front(); - synclients.pop_front(); - syn->join_thread(); - delete syn; - delete client; - } - - // wait for messenger to finish - rank.wait(); - - return 0; -} - diff --git a/trunk/ceph/doc/Commitdir.txt b/trunk/ceph/doc/Commitdir.txt deleted file mode 100644 index 05c727be60ae6..0000000000000 --- a/trunk/ceph/doc/Commitdir.txt +++ /dev/null @@ -1,24 +0,0 @@ -OLD - - -How Directory Committing Works: - -Each CDir has: - version - current version of 
directory - committing_version - which version was sent to stable storage - last_committed_version - last version to be safely stored - -Each Inode has: - parent_dir_version - what dir version i was in when i was dirtied. (*) - - (*) note that if you change an inode, mark_dirty() again, even if it's already dirty! - - -How committing works: - -A call to commit_dir(dir, context) will ensure tha the _current_ version is stored safely on disk before the context is finished. - -When a commit completes, inodes in the directory are checked. If they are dirty and belonged to the _committed_ (or earlier) version, then they are marked clean. If they belong to a newer version, then they are _still dirty_. - - - diff --git a/trunk/ceph/doc/anchortable.txt b/trunk/ceph/doc/anchortable.txt deleted file mode 100644 index d9c0fefc31e08..0000000000000 --- a/trunk/ceph/doc/anchortable.txt +++ /dev/null @@ -1,54 +0,0 @@ - -ANCHOR TABLE PROTOCOL - -MDS sends an update PREPARE to the anchortable MDS. The prepare is -identified by the ino and operation type; only one for each type -(create, update, destroy) can be pending at any time. Both parties -may actually be the same local node, but for simplicity we treat that -situation the same. (That is, we act as if they may fail -independently, even if they can't.) - -The anchortable journals the proposed update, and responds with an -AGREE and a version number. This uniquely identifies the request. - -The MDS can then update the filesystem metadata however it sees fit. -When it is finished (and the results journaled), it sends a COMMIT to -the anchortable. The table journals the commit, frees any state from -the transaction, and sends an ACK. The initiating MDS should then -journal the ACK to complete the transaction. - - -ANCHOR TABLE FAILURE - -If the AT fails before journaling the PREPARE and sending the AGREE, -the initiating MDS will simply retry the request. 
- -If the AT fails after journaling PREPARE but before journaling COMMIT, -it will resend AGREE to the initiating MDS. - -If the AT fails after the COMMIT, the transaction has been closed, and it -takes no action. If it receives a COMMIT for which it has no open -transaction, it will reply with ACK. - - -INITIATING MDS FAILURE - -If the MDS fails before the metadata update has been journaled, no -action is taken, since nothing is known about the previously proposed -transaction. If an AGREE message is received and there is no -corresponding PREPARE or pending-commit state, and ROLLBACK is sent to -the anchor table. - -If the MDS fails after journaling the metadata update but before -journaling the ACK, it resends COMMIT to the anchor table. If it -receives an AGREE after resending the COMMIT, it simply ignores the -AGREE. The anchortable will respond with an ACK, allowing the -initiating MDS to journal the final ACK and close out the transaction -locally. - -On journal replay, each metadata update (EMetaBlob) encountered that -includes an anchor transaction is noted in the AnchorClient by adding -it to the pending_commit list, and each journaled ACK is removed from -that list. Journal replay may enounter ACKs with no prior metadata -update; these are ignored. When recovery finishes, a COMMIT is sent -for all outstanding transactions. diff --git a/trunk/ceph/doc/bdb.txt b/trunk/ceph/doc/bdb.txt deleted file mode 100644 index 63e647f5bb3cc..0000000000000 --- a/trunk/ceph/doc/bdb.txt +++ /dev/null @@ -1,48 +0,0 @@ -OBJECT STORE ON BERKELEY DB ---------------------------- - -OSBDB is an implementation of an object store that uses Berkeley DB as -the underlying storage. It is meant to be an alternative to EBOFS. - -BUILDING --------- - -You will need to have Berkeley DB installed, including the developent -packages. We've tested this with Berkeley DB 4.4.20 on Ubuntu 6.10. - -To compile OSBDB support, you need to pass the argument "want_bdb=yes" -to "make." 
If you don't specify this, OSBDB and all its associated -support is not included in the executables. - -RUNNING -------- - -To use OSBDB in Ceph, simply pass the --bdbstore flag to programs. You -don't need to create a "device" for OSBDB ahead of time; Berkeley DB -will take care of creating the files. You also *cannot* use a raw -device as your store -- it must be regular file. - -OSBDB additionally accepts the following flags: - - --bdbstore-btree Configures OSBDB to use the "Btree" - database type for Berkeley DB. The default - database type is "Hash". - - --bdbstore-hash-ffactor Sets the "fill factor" for the hash - database type. Takes an integer argument. - - --bdbstore-hash-nelem Sets the "nelem" parameter for the hash - database type. Takes an integer argument. - - --bdbstore-hash-pagesize Sets the page size for the hash database - type. Takes an integer argument. - - --bdbstore-cachesize Sets the cache size. Takes an integer - argument, which must be a power of two, and - no less than 20 KiB. - - --bdbstore-transactional Enable (in-memory-only) transactions for - all operations in the OSBDB store. - - --debug-bdbstore Set the debug level. Takes an integer - argument. diff --git a/trunk/ceph/doc/caching.txt b/trunk/ceph/doc/caching.txt deleted file mode 100644 index 161eaf7428a53..0000000000000 --- a/trunk/ceph/doc/caching.txt +++ /dev/null @@ -1,303 +0,0 @@ - -SPANNING TREE PROPERTY - -All metadata that exists in the cache is attached directly or -indirectly to the root inode. That is, if the /usr/bin/vi inode is in -the cache, then /usr/bin, /usr, and / are too, including the inodes, -directory objects, and dentries. - - -AUTHORITY - -The authority maintains a list of what nodes cache each inode. -Additionally, each replica is assigned a nonce (initial 0) to -disambiguate multiple replicas of the same item (see below). 
- - map replicas; // maps replicating mds# to nonce - -The cached_by set _always_ includes all nodes that cache the -partcuarly object, but may additionally include nodes that used to -cache it but no longer do. In those cases, an expire message should -be in transit. That is, we have two invariants: - - 1) the authority's replica set will always include all actual - replicas, and - - 2) cache expiration notices will be reliably delivered to the - authority. - -The second invariant is particularly important because the presence of -replicas will pin the metadata object in memory on the authority, -preventing it from being trimmed from the cache. Notification of -expiration of the replicas is required to allow previously replicated -objects from eventually being trimmed from the cache as well. - -Each metdata object has a authority bit that indicates whether it is -authoritative or a replica. - - -REPLICA NONCE - -Each replicated object maintains a "nonce" value, issued by the -authority at the time the replica was created. If the authority has -already created a replica for the given MDS, the new replica will be -issues a new (incremented) nonce. This nonce is attached -to cache expirations, and allows the authority to disambiguate -expirations when multiple replicas of the same object are created and -cache expiration is coincident with replication. That is, when an -old replica is expired from the replicating MDS at the same time that -a new replica is issued by the authority and the resulting messages -cross paths, the authority can tell that it was the old replica that -was expired and effectively ignore the expiration message. The -replica is removed from the replicas map only if the nonce matches. - - -SUBTREE PARTITION - -Authority of the file system namespace is partitioned using a -subtree-based partitioning strategy. 
This strategy effectively -separates directory inodes from directory contents, such that the -directory contents are the unit of redelegation. That is, if / is -assigned to mds0 and /usr to mds1, the inode for /usr will be managed -by mds0 (it is part of the / directory), while the contents of /usr -(and everything nested beneath it) will be managed by mds1. - -The description for this partition exists solely in the collective -memory of the MDS cluster and in the individual MDS journals. It is -not described in the regular on-disk metadata structures. This is -related to the fact that authority delegation is a property of the -{\it directory} and not the directory's {\it inode}. - -Subsequently, if an MDS is authoritative for a directory inode and does -not yet have any state associated with the directory in its cache, -then it can assume that it is also authoritative for the directory. - -Directory state consists of a data object that describes any cached -dentries contained in the directory, information about the -relationship between the cached contents and what appears on disk, and -any delegation of authority. That is, each CDir object has a dir_auth -element. Normally dir_auth has a value of AUTH_PARENT, meaning that -the authority for the directory is the same as the directory's inode. -When dir_auth specifies another metadata server, that directory is -point of authority delegation and becomes a {\it subtree root}. A -CDir is a subtree root iff its dir_auth specifies an MDS id (and is not -AUTH_PARENT). - - - A dir is a subtree root iff dir_auth != AUTH_PARENT. - - - If dir_auth = AUTH_PARENT then the inode auth == dir auth, but the - converse may not be true. - -The authority for any metadata object in the cache can be determined -by following the parent pointers toward the root until a subtree root -CDir object is reached, at which point the authority is specified by -its dir_auth. 
- -Each MDS cache maintains a subtree data structure that describes the -subtree partition for all objects currently in the cache: - - map< CDir*, set > subtrees; - - - A dir will appear in the subtree map (as a key) IFF it is a subtree - root. - -Each subtree root will have an entry in the map. The map value is a -set of all other subtree roots nested beneath that point. Nested -subtree roots effectively bound or prune a subtree. For example, if -we had the following partition: - - mds0 / - mds1 /usr - mds0 /usr/local - mds0 /home - -The subtree map on mds0 would be - - / -> (/usr, /home) - /usr/local -> () - /home -> () - -and on mds1: - - /usr -> (/usr/local) - - -AMBIGUOUS DIR_AUTH - -While metadata for a subtree is being migrated between two MDS nodes, -the dir_auth for the subtree root is allowed to be ambiguous. That -is, it will specify both the old and new MDS ids, indicating that a -migration is in progress. - -If a replicated metadata object is expired from the cache from a -subtree whose authority is ambiguous, the cache expiration is sent to -both potential authorities. This ensures that the message will be -reliably delivered, even if either of those nodes fails. A number of -alternative strategies were considered. Sending the expiration to the -old or new authority and having it forwarded if authority has been -delegated can result in message loss if the forwarding node fails. -Pinning ambiguous metadata in cache is computationally expensive for -implementation reasons, and while delaying the transmission of expiration -messages is difficult to implement because the replicating must send -the final expiration messages when the subtree authority is -disambiguated, forcing it to keep certain elements of it cache in -memory. Although duplicated expirations incurs a small communications -overhead, the implementation is much simpler. 
- - -AUTH PINS - -Most operations that modify metadata must allow some amount of time to -pass in order for the operation to be journaled or for communication -to take place between the object's authority and any replicas. For -this reason it must not only be pinned in the authority's metadata -cache, but also be locked such that the object's authority is not -allowed to change until the operation completes. This is accomplished -using {\it auth pins}, which increment a reference counter on the -object in question, as well as all parent metadata objects up to the -root of the subtree. As long as the pin is in place, it is impossible -for that subtree (or any fragment of it that contains one or more -pins) to be migrated to a different MDS node. Pins can be placed on -both inodes and directories. - -Auth pins can only exist for authoritative metadata, because they are -only created if the object is authoritative, and their presense -prevents the migration of authority. - - -FREEZING - -More specifically, auth pins prevent a subtree from being frozen. -When a subtree is frozen, all updates to metadata are forbidden. This -includes updates to the replicas map that describes which replicas -(and nonces) exist for each object. - -In order for metadata to be migrated between MDS nodes, it must first -be frozen. The root of the subtree is initially marked as {\it -freezing}. This prevents the creation of any new auth pins within the -subtree. After all existing auth pins are removed, the subtree is -then marked as {\it frozen}, at which point all updates are -forbidden. This allows metadata state to be packaged up in a message -and transmitted to the new authority, without worrying about -intervening updates. - -If the directory at the base of a freezing or frozen subtree is not -also a subtree root (that is, it has dir_auth == AUTH_PARENT), the -directory's parent inode is auth pinned. 
- - - a frozen tree root dir will auth_pin its inode IFF it is auth AND - not a subtree root. - -This prevents a parent directory from being concurrently frozen, and a -range of resulting implementation complications relating metadata -migration. - - -CACHE EXPIRATION FOR FROZEN SUBTREES - -Cache expiration messages that are received for a subtree that is -frozen are temporarily set aside instead of being processed. Only -when the subtree is unfrozen are the expirations either processed (if -the MDS is authoritative) or discarded (if it is not). Because either -the exporting or importing metadata can fail during the migration -process, the MDS cannot tell whether it will be authoritative or not -until the process completes. - -During a migration, the subtree will first be frozen on both the -exporter and importer, and then all other replicas will be informed of -a subtrees ambiguous authority. This ensures that all expirations -during migration will go to both parties, and nothing will be lost in -the event of a failure. - - - - -NORMAL MIGRATION - -The exporter begins by doing some checks in export_dir() to verify -that it is permissible to export the subtree at this time. In -particular, the cluster must not be degraded, the subtree root may not -be freezing or frozen, and the path must be pinned (\ie not conflicted -with a rename). If these conditions are met, the subtree root -directory is temporarily auth pinned, the subtree freeze is initiated, -and the exporter is committed to the subtree migration, barring an -intervening failure of the importer or itself. - -The MExportDiscover serves simply to ensure that the inode for the -base directory being exported is open on the destination node. It is -pinned by the importer to prevent it from being trimmed. This occurs -before the exporter completes the freeze of the subtree to ensure that -the importer is able to replicate the necessary metadata. 
When the -exporter receives the MDiscoverAck, it allows the freeze to proceed by -removing its temporary auth pin. - -The MExportPrep message then follows to populate the importer with a -spanning tree that includes all dirs, inodes, and dentries necessary -to reach any nested subtrees within the exported region. This -replicates metadata as well, but it is pushed out by the exporter, -avoiding deadlock with the regular discover and replication process. -The importer is responsible for opening the bounding directories from -any third parties authoritative for those subtrees before -acknowledging. This ensures that the importer has correct dir_auth -information about where authority is redelegated for all points nested -beneath the subtree being migrated. While processing the MExportPrep, -the importer freezes the entire subtree region to prevent any new -replication or cache expiration. - -A warning stage occurs only if the base subtree directory is open by -nodes other than the importer and exporter. If it is not, then this -implies that no metadata within or nested beneath the subtree is -replicated by any node other than the importer an exporter. If it is, -then a MExportWarning message informs any bystanders that the -authority for the region is temporarily ambiguous, and lists both the -exporter and importer as authoritative MDS nodes. In particular, -bystanders who are trimming items from their cache must send -MCacheExpire messages to both the old and new authorities. This is -necessary to ensure that the surviving authority reliably receives all -expirations even if the importer or exporter fails. While the subtree -is frozen (on both the importer and exporter), expirations will not be -immediately processed; instead, they will be queued until the region -is unfrozen and it can be determined that the node is or is not -authoritative. 
- -The exporter walks the subtree hierarchy and packages up an MExport -message containing all metadata and important state (\eg, information -about metadata replicas). At the same time, the expoter's metadata -objects are flagged as non-authoritative. The MExport message sends -the actual subtree metadata to the importer. Upon receipt, the -importer inserts the data into its cache, marks all objects as -authoritative, and logs a copy of all metadata in an EImportStart -journal message. Once that has safely flushed, it replies with an -MExportAck. The exporter can now log an EExport journal entry, which -ultimately specifies that the export was a success. In the presence -of failures, it is the existence of the EExport entry only that -disambiguates authority during recovery. - -Once logged, the exporter will send an MExportNotify to any -bystanders, informing them that the authority is no longer ambiguous -and cache expirations should be sent only to the new authority (the -importer). Once these are acknowledged back to the exporter, -implicitly flushing the bystander to exporter message streams of any -stray expiration notices, the exporter unfreezes the subtree, cleans -up its migration-related state, and sends a final MExportFinish to the -importer. Upon receipt, the importer logs an EImportFinish(true) -(noting locally that the export was indeed a success), unfreezes its -subtree, processes any queued cache expierations, and cleans up its -state. - - -PARTIAL FAILURE RECOVERY - - - - -RECOVERY FROM JOURNAL - - - - - - - - - diff --git a/trunk/ceph/doc/exports.txt b/trunk/ceph/doc/exports.txt deleted file mode 100644 index 8e0e146bea2fe..0000000000000 --- a/trunk/ceph/doc/exports.txt +++ /dev/null @@ -1,72 +0,0 @@ - -NORMAL MIGRATION - -The exporter begins by doing some checks in export_dir() to verify -that it is permissible to export the subtree at this time. 
In -particular, the cluster must not be degraded, the subtree root may not -be freezing or frozen (\ie already exporting, or nested beneath -something that is exporting), and the path must be pinned (\ie not -conflicted with a rename). If these conditions are met, the subtree -freeze is initiated, and the exporter is committed to the subtree -migration, barring an intervening failure of the importer or itself. - -The MExportDiscover serves simply to ensure that the base directory -being exported is open on the destination node. It is pinned by the -importer to prevent it from being trimmed. This occurs before the -exporter completes the freeze of the subtree to ensure that the -importer is able to replicate the necessary metadata. When the -exporter receives the MDiscoverAck, it allows the freeze to proceed. - -The MExportPrep message then follows to populate a spanning tree that -includes all dirs, inodes, and dentries necessary to reach any nested -exports within the exported region. This replicates metadata as well, -but it is pushed out by the exporter, avoiding deadlock with the -regular discover and replication process. The importer is responsible -for opening the bounding directories from any third parties before -acknowledging. This ensures that the importer has correct dir_auth -information about where authority is delegated for all points nested -within the subtree being migrated. While processing the MExportPrep, -the importer freezes the entire subtree region to prevent any new -replication or cache expiration. - -The warning stage occurs only if the base subtree directory is open by -nodes other than the importer and exporter. If so, then a -MExportWarning message informs any bystanders that the authority for -the region is temporarily ambiguous. In particular, bystanders who -are trimming items from their cache must send MCacheExpire messages to -both the old and new authorities. 
This is necessary to ensure that -the surviving authority reliably receives all expirations even if the -importer or exporter fails. While the subtree is frozen (on both the -importer and exporter), expirations will not be immediately processed; -instead, they will be queued until the region is unfrozen and it can -be determined that the node is or is not authoritative for the region. - -The MExport message sends the actual subtree metadata to the importer. -Upon receipt, the importer inserts the data into its cache, logs a -copy in the EImportStart, and replies with an ExportAck. The exporter -can now log an EExportFinish(true), which ultimately specifies that -the export was a success. In the presence of failures, it is the -existence (and value) of the EExportFinish that disambiguates -authority during recovery. - -Once logged, the exporter will send an MExportNotify to any -bystanders, informing them that the authority is no longer ambiguous -and cache expirations should be sent only to the new authority (the -importer). Once these are acknowledged, implicitly flushing the -bystander to exporter message streams of any stray expiration notices, -the exporter unfreezes the subtree, cleans up its state, and sends a -final MExportFinish to the importer. Upon receipt, the importer logs -an EImportFinish(true), unfreezes its subtree, and cleans up its -state. 
- - -PARTIAL FAILURE RECOVERY - - - -RECOVERY FROM JOURNAL - - - - - diff --git a/trunk/ceph/doc/header.txt b/trunk/ceph/doc/header.txt deleted file mode 100644 index bccdb81533b6f..0000000000000 --- a/trunk/ceph/doc/header.txt +++ /dev/null @@ -1,13 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ diff --git a/trunk/ceph/doc/inos.txt b/trunk/ceph/doc/inos.txt deleted file mode 100644 index b5ab1db25ca60..0000000000000 --- a/trunk/ceph/doc/inos.txt +++ /dev/null @@ -1,11 +0,0 @@ - -inodeno_t namespace - - relevant both for ino's, and for the (ino) input for Filer and object storage namespace... - -1 - root inode - -100+mds - mds log/journal -200+mds - mds ino, fh allocation tables -300+mds - mds inode files (for non-embedded inodes) - -1000+ - regular files and directories \ No newline at end of file diff --git a/trunk/ceph/doc/lazy_posix.txt b/trunk/ceph/doc/lazy_posix.txt deleted file mode 100644 index 1d226cd03d8e4..0000000000000 --- a/trunk/ceph/doc/lazy_posix.txt +++ /dev/null @@ -1,53 +0,0 @@ - -http://www.usenix.org/events/fast05/wips/slides/welch.pdf - - - --- STATLITE - statlite(const char *filename, struct statlite *buf); - fstatlite(int fd, struct statlite *buf); - lstatlite(const char *filename, struct statlite *buf); - - * file size, mtime are optionally not guaranteed to be correct - * mask field to specify which fields you need to be correct - - --- READDIR+ - - struct dirent_plus *readdirplus(DIR *dirp); - int readdirplus_r(DIR *dirp, struct dirent_plus *entry, struct dirent_plus **result); - struct dirent_lite *readdirlite(DIR *dirp); - int readdirlite_r(DIR *dirp, struct 
dirent_lite *entry, struct dirent_lite **result); - - * plus returns lstat - * lite returns lstatlite - - --- lazy i/o integrity - - O_LAZY to open(2) - - * relax data coherency - * writes may not be visible until lazyio_propagate, fsync, close - - lazyio_propagate(int fd, off_t offset, size_t count); - * my writes are safe - - lazyio_synchronize(int fd, off_t offset, size_t count); - * i will see everyone else's propagated writes - --- read/write non-serial vectors - - ssize_t readx(int fd, const struct iovec *iov, size_t iov_count, struct xtvec *xtv, size_t xtv_count); - ssize_t writex(int fd, const struct iovec *iov, size_t iov_count, struct xtvec *xtv, size_t xtv_count); - - * like readv/writev, but serial - * - - -int lockg(int fd, int cmd, lgid_t *lgid) - group locks - -int openg(char *path, int mode, fh_t *handle); - portable file handle -int sutoc(fh_t *fh); \ No newline at end of file diff --git a/trunk/ceph/doc/mds_locks.txt b/trunk/ceph/doc/mds_locks.txt deleted file mode 100644 index f41a89a9b31e5..0000000000000 --- a/trunk/ceph/doc/mds_locks.txt +++ /dev/null @@ -1,66 +0,0 @@ - -new names - dentry_read (not path_pins) - dentry_xlock - - inode_read - inode_xlock (not inode_write) - -locks are always tied to active_requests. - -read locks can be placed on any node. -xlocks must be applied at the authority. - -for multi-lock operations (link, unlink, rename), we must acquire xlocks on a remote node. lock requests are associated with a reqid. the authoritative node keeps track of which remote xlocks it holds. when forwarded/restarted, it can drop remote locks. - -when restarting, drop all locks. -on remote, drop locks and state, and notify main req node. -recover dist request state on rejoin: - - surviving op initiator will assert read or xlock - - recovering op initiator will restart requests. (from initiator's perspective, ops have either happened or they haven't, depending on whether the event is journaled.) 
- - recovering or surviving op cohort will determine lock state during rejoin, or get a commit or rollback... - - - - ---- path_pin = read lock on /some/random/path - - blocks a dentry xlock - ---- dnxlock = exclusive lock on /some/random/path - - locking: prevents subsequent path pins. - - locked: prevents dn read - - on auth - --> grab _all_ path pins at onces; hold none while waiting. --> grab xlocks in order. - ---- auth_pin = pin to authority, on *dir, *in - - prevents freezing -> frozen. - - freezing blocks new auth pins, thus blocking other local auth_pins. (hangs up local export.) - - does not block remote auth_pins, because remote side is not auth (or frozen!) until after local subtree is frozen. - --> blocking on auth_pins is dangerous. _never_ block if we are holding other auth_pins on the same node (subtree?). --> grab _all_ auth pins at once; hold none while waiting. - ---- hard/file_wrlock = exlusive lock on inode content - - prevents inode read - - on auth - --> grab locks in order. - - -ORDERING -- namespace(dentries) < inodes -- order dentries on (dirino, dname) -- order inodes on (ino); -- need to order both read and write locks, esp with dentries. so, if we need to lock /usr/bin/foo with read on usr and bin and xwrite on foo, we need to acquire all of those locks using the same ordering. - - on same host, we can be 'nice' and check lockability of all items, then lock all, and drop everything while waiting. (actually, is there any use to this?) - - on mutiple hosts, we need to use full ordering (at least as things separate across host boundaries). and if needed lock set changes (such that the order of already acquired locks changes), we need to drop those locks and start over. - -- how do auth pins fit into all this? - - auth pin on xlocks only. no need on read locks. - - pre-grab all auth pins on a node the first time it is visiting during lock acquisition. - - what if things move? 
if we find we are missing a needed auth pin when we revisit a host at any point, and the item is not still authpinnable, we back off and restart. (we cannot block.) - - - - if we find we are not authpinnable, drop all locks and wait. - - diff --git a/trunk/ceph/doc/modeline.txt b/trunk/ceph/doc/modeline.txt deleted file mode 100644 index 1b3956f4d486b..0000000000000 --- a/trunk/ceph/doc/modeline.txt +++ /dev/null @@ -1,2 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab diff --git a/trunk/ceph/doc/shared_write_states_nogo.txt b/trunk/ceph/doc/shared_write_states_nogo.txt deleted file mode 100644 index f409617d82681..0000000000000 --- a/trunk/ceph/doc/shared_write_states_nogo.txt +++ /dev/null @@ -1,39 +0,0 @@ - -// stable states // ------auth----- -----replica----- -#define LOCK_SYNC 0 // R . / . . . WB same ... for stat() -#define LOCK_LOCK 1 // R W / RC . . . . . / RC . . . ... for truncate(), fsync() -#define LOCK_RDONLY 2 // R . / RC R . . same -#define LOCK_MIXED 3 // . . / . R W . same -#define LOCK_WRONLY 4 // . . / . . W WB same - -// transition states -#define LOCK_GSYNCR 8 // R . / RC . . . same -#define LOCK_GSYNCMW 9 // . . / RC . . WB same -#define LOCK_GSYNCMW2 9 // . . / RC . . WB same - -#define LOCK_GLOCKSR 5 // R . / RC . . . . . / RC . . . -#define LOCK_GLOCKMW 7 // . . / RC . . . same - -#define LOCK_GRDONLYM 10 // . . / . R . . same -#define LOCK_GRDONLYM2 10 // --- . . / . R . . -#define LOCK_GRDONLYW 11 // . . / . . . . same -#define LOCK_GRDONLYW2 11 // --- . . / . . . . -#define LOCK_GRDONLYS 12 // R . / RC . . . same -#define LOCK_GRDONLYL 13 // R . / RC . . . --- - -#define LOCK_GMIXEDR 14 // R . / . R . . . . / . R . . -#define LOCK_GMIXEDR2 15 // --- . . / . R . . -#define LOCK_GMIXEDW 16 // . . / . . W . same -#define LOCK_GMIXEDW2 16 // --- . . / . . W . -#define LOCK_GMIXEDS 16 // R . / . . . . . . / . . . . -#define LOCK_GMIXEDS2 16 // --- . . / . . . . 
-#define LOCK_GMIXEDL 17 // R . / . . . . --- - -#define LOCK_GWRONLYR 18 // R . / . . . . same -#define LOCK_GWRONLYR2 18 // --- . . / . . . . -#define LOCK_GWRONLYM 19 // . . / . . . . same -#define LOCK_GWRONLYM2 19 // --- . . / . . . . -#define LOCK_GWRONLYS 20 // R . / . . . WB same -#define LOCK_GWRONLYS2 20 // --- . . / . . . . -#define LOCK_GWRONLYL 21 - diff --git a/trunk/ceph/ebofs/Allocator.cc b/trunk/ceph/ebofs/Allocator.cc deleted file mode 100644 index 35b0db16b84c2..0000000000000 --- a/trunk/ceph/ebofs/Allocator.cc +++ /dev/null @@ -1,693 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "Allocator.h" -#include "Ebofs.h" - - -#undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) *_dout << dbeginl << g_clock.now() << " ebofs(" << fs->dev.get_device_name() << ").allocator." 
- - -void Allocator::dump_freelist() -{ - if (1) { - interval_set free; // validate too - - block_t n = 0; - for (int b=0; b<=EBOFS_NUM_FREE_BUCKETS; b++) { - Table *tab; - if (b < EBOFS_NUM_FREE_BUCKETS) { - tab = fs->free_tab[b]; - dout(0) << "dump bucket " << b << " " << tab->get_num_keys() << dendl; - } else { - tab = fs->limbo_tab; - dout(0) << "dump limbo " << tab->get_num_keys() << dendl;; - } - - if (tab->get_num_keys() > 0) { - Table::Cursor cursor(tab); - assert(tab->find(0, cursor) >= 0); - while (1) { - dout(0) << "dump ex " << cursor.current().key << "~" << cursor.current().value << dendl; - assert(cursor.current().value > 0); - - if (b < EBOFS_NUM_FREE_BUCKETS) - n += cursor.current().value; - - if (free.contains( cursor.current().key, cursor.current().value )) - dout(0) << "dump bad " << cursor.current().key << "~" << cursor.current().value << dendl; - assert(!free.contains( cursor.current().key, cursor.current().value )); - free.insert( cursor.current().key, cursor.current().value ); - if (cursor.move_right() <= 0) break; - } - } else { - //dout(0) << " empty" << dendl; - } - } - - assert(n == fs->free_blocks); - dout(0) << "dump combined freelist is " << free << dendl; - - - // alloc_tab - if (fs->alloc_tab->get_num_keys() > 0) { - Table >::Cursor cursor(fs->alloc_tab); - assert(fs->alloc_tab->find(0, cursor) >= 0); - while (1) { - dout(0) << "alloc ex " << cursor.current().key << "~" << cursor.current().value.first << " ref " - << cursor.current().value.second - << dendl; - assert(cursor.current().value.first > 0); - - if (cursor.move_right() <= 0) break; - } - } - } -} - - -int Allocator::find(Extent& ex, int bucket, block_t num, block_t near, int dir) -{ - Table::Cursor cursor(fs->free_tab[bucket]); - bool found = false; - - if ((dir == DIR_ANY || dir == DIR_FWD) && - fs->free_tab[bucket]->find( near, cursor ) >= 0) { - // look to the right - do { - if (cursor.current().value >= num) - found = true; - } while (!found && cursor.move_right() > 0); 
- } - - if ((dir == DIR_ANY || dir == DIR_BACK) && - !found) { - // look to the left - fs->free_tab[bucket]->find( near, cursor ); - - while (!found && cursor.move_left() >= 0) - if (cursor.current().value >= num) - found = true; - } - - if (found) { - ex.start = cursor.current().key; - ex.length = cursor.current().value; - return 0; - } - - return -1; -} - -int Allocator::allocate(Extent& ex, block_t num, block_t near) -{ - //dump_freelist(); - - int dir = DIR_ANY; // no dir - if (near == NEAR_LAST_FWD) { - near = last_pos; - dir = DIR_FWD; // fwd - } - else if (near == NEAR_LAST) - near = last_pos; - - int bucket; - - while (1) { // try twice, if fwd = true - - // look for contiguous extent - for (bucket = pick_bucket(num); bucket < EBOFS_NUM_FREE_BUCKETS; bucket++) { - if (find(ex, bucket, num, near, dir) >= 0) { - // yay! - - // remove original - fs->free_tab[bucket]->remove( ex.start ); - fs->free_blocks -= ex.length; - - if (ex.length > num) { - if (ex.start < near) { - // to the left - if (ex.start + ex.length - num <= near) { - // by a lot. take right-most portion. - Extent left; - left.start = ex.start; - left.length = ex.length - num; - ex.start += left.length; - ex.length -= left.length; - assert(ex.length == num); - _release_loner(left); - } else { - // take middle part. - Extent left,right; - left.start = ex.start; - left.length = near - ex.start; - ex.start = near; - right.start = ex.start + num; - right.length = ex.length - left.length - num; - ex.length = num; - _release_loner(left); - _release_loner(right); - } - } - else { - // to the right. take left-most part. 
- Extent right; - right.start = ex.start + num; - right.length = ex.length - num; - ex.length = num; - _release_loner(right); - } - } - - dout(20) << "allocate " << ex << " near " << near << dendl; - last_pos = ex.end(); - //dump_freelist(); - if (g_conf.ebofs_cloneable) - alloc_inc(ex); - return num; - } - } - - if (dir == DIR_BACK || dir == DIR_ANY) break; - dir = DIR_BACK; - } - - // ok, find partial extent instead. - for (block_t trysize = num/2; trysize >= 1; trysize /= 2) { - int bucket = pick_bucket(trysize); - if (find(ex, bucket, trysize, near) >= 0) { - // yay! - assert(ex.length < num); - - fs->free_tab[bucket]->remove(ex.start); - fs->free_blocks -= ex.length; - last_pos = ex.end(); - dout(20) << "allocate partial " << ex << " (wanted " << num << ") near " << near << dendl; - //dump_freelist(); - if (g_conf.ebofs_cloneable) - alloc_inc(ex); - return ex.length; - } - } - - dout(1) << "allocate failed, fs completely full! " << fs->free_blocks << dendl; - assert(0); - //dump_freelist(); - return -1; -} - -int Allocator::_release_into_limbo(Extent& ex) -{ - dout(10) << "_release_into_limbo " << ex << dendl; - dout(10) << "limbo is " << limbo << dendl; - assert(ex.length > 0); - limbo.insert(ex.start, ex.length); - fs->limbo_blocks += ex.length; - return 0; -} - -int Allocator::release(Extent& ex) -{ - if (g_conf.ebofs_cloneable) - return alloc_dec(ex); - - _release_into_limbo(ex); - return 0; -} - -int Allocator::commit_limbo() -{ - dout(20) << "commit_limbo" << dendl; - for (map::iterator i = limbo.m.begin(); - i != limbo.m.end(); - i++) { - fs->limbo_tab->insert(i->first, i->second); - //fs->free_blocks += i->second; - } - limbo.clear(); - //fs->limbo_blocks = 0; - //dump_freelist(); - return 0; -} - -int Allocator::release_limbo() -{ - //dump_freelist(); - if (fs->limbo_tab->get_num_keys() > 0) { - Table::Cursor cursor(fs->limbo_tab); - fs->limbo_tab->find(0, cursor); - while (1) { - Extent ex(cursor.current().key, cursor.current().value); - dout(20) << 
"release_limbo ex " << ex << dendl; - - fs->limbo_blocks -= ex.length; - _release_merge(ex); - - if (cursor.move_right() <= 0) break; - } - } - fs->limbo_tab->clear(); - //dump_freelist(); - return 0; -} - - - -/* -int Allocator::_alloc_loner_inc(Extent& ex) -{ - Table >::Cursor cursor(fs->alloc_tab); - - if (fs->alloc_tab->find( ex.start, cursor ) - == Table >::Cursor::MATCH) { - assert(cursor.current().value.first == ex.length); - pair& v = cursor.dirty_current_value(); - v.second++; - dout(10) << "_alloc_loner_inc " << ex << " " - << (v.second-1) << " -> " << v.second - << dendl; - } else { - // insert it, @1 - fs->alloc_tab->insert(ex.start, pair(ex.length,1)); - dout(10) << "_alloc_loner_inc " << ex << " 0 -> 1" << dendl; - } - return 0; -} - -int Allocator::_alloc_loner_dec(Extent& ex) -{ - Table >::Cursor cursor(fs->alloc_tab); - - if (fs->alloc_tab->find( ex.start, cursor ) - == Table >::Cursor::MATCH) { - assert(cursor.current().value.first == ex.length); - if (cursor.current().value.second == 1) { - dout(10) << "_alloc_loner_dec " << ex << " 1 -> 0" << dendl; - fs->alloc_tab->remove( cursor.current().key ); - } else { - pair& v = cursor.dirty_current_value(); - --v.second; - dout(10) << "_alloc_loner_dec " << ex << " " - << (v.second+1) << " -> " << v.second - << dendl; - } - } else { - assert(0); - } - return 0; -} -*/ - - -int Allocator::alloc_inc(Extent ex) -{ - dout(10) << "alloc_inc " << ex << dendl; - - // empty table? - if (fs->alloc_tab->get_num_keys() == 0) { - // easy. 
- fs->alloc_tab->insert(ex.start, pair(ex.length,1)); - dout(10) << "alloc_inc + " << ex << " 0 -> 1 (first entry)" << dendl; - return 0; - } - - Table >::Cursor cursor(fs->alloc_tab); - - // try to move to left (to check for overlap) - int r = fs->alloc_tab->find( ex.start, cursor ); - if (r == Table >::Cursor::OOB || - cursor.current().key > ex.start) { - r = cursor.move_left(); - dout(10) << "alloc_inc move_left r = " << r << dendl; - } - - while (1) { - dout(10) << "alloc_inc loop at " << cursor.current().key - << "~" << cursor.current().value.first - << " ref " << cursor.current().value.second - << dendl; - - // too far left? - if (cursor.current().key < ex.start && - cursor.current().key + cursor.current().value.first <= ex.start) { - // adjacent? - bool adjacent = false; - if (cursor.current().key + cursor.current().value.first == ex.start && - cursor.current().value.second == 1) - adjacent = true; - - // no overlap. - r = cursor.move_right(); - dout(10) << "alloc_inc move_right r = " << r << dendl; - - // at end? - if (r <= 0) { - // hmm! - if (adjacent) { - // adjust previous entry - cursor.move_left(); - pair &v = cursor.dirty_current_value(); - v.first += ex.length; // yay! - dout(10) << "alloc_inc + " << ex << " 0 -> 1 (adjust at end)" << dendl; - } else { - // insert at end, finish. - int r = fs->alloc_tab->insert(ex.start, pair(ex.length,1)); - dout(10) << "alloc_inc + " << ex << " 0 -> 1 (at end) .. r = " << r << dendl; - //dump_freelist(); - } - return 0; - } - } - - if (cursor.current().key > ex.start) { - // gap. - // oooooo - // nnnnn..... 
- block_t l = MIN(ex.length, cursor.current().key - ex.start); - - fs->alloc_tab->insert(ex.start, pair(l,1)); - dout(10) << "alloc_inc + " << ex.start << "~" << l << " 0 -> 1" << dendl; - ex.start += l; - ex.length -= l; - if (ex.length == 0) break; - fs->alloc_tab->find( ex.start, cursor ); - } - else if (cursor.current().key < ex.start) { - block_t end = cursor.current().value.first + cursor.current().key; - - if (end <= ex.end()) { - // single split - // oooooo - // nnnnn - pair& v = cursor.dirty_current_value(); - v.first = ex.start - cursor.current().key; - int ref = v.second; - - block_t l = end - ex.start; - fs->alloc_tab->insert(ex.start, pair(l, 1+ref)); - - dout(10) << "alloc_inc " << ex.start << "~" << l - << " " << ref << " -> " << ref+1 - << " (right split)" << dendl; - - ex.start += l; - ex.length -= l; - if (ex.length == 0) break; - fs->alloc_tab->find( ex.start, cursor ); - - } else { - // double split, finish. - // ------------- - // ------ - pair& v = cursor.dirty_current_value(); - v.first = ex.start - cursor.current().key; - int ref = v.second; - - fs->alloc_tab->insert(ex.start, pair(ex.length, 1+ref)); - - int rl = end - ex.end(); - fs->alloc_tab->insert(ex.end(), pair(rl, ref)); - - dout(10) << "alloc_inc " << ex - << " " << ref << " -> " << ref+1 - << " (double split finish)" - << dendl; - - break; - } - } - else { - assert(cursor.current().key == ex.start); - - if (cursor.current().value.first <= ex.length) { - // inc. - // oooooo - // nnnnnnnn - pair& v = cursor.dirty_current_value(); - v.second++; - dout(10) << "alloc_inc " << ex.start << "~" << cursor.current().value.first - << " " << cursor.current().value.second-1 << " -> " - << cursor.current().value.second - << " (left split)" << dendl; - ex.start += v.first; - ex.length -= v.first; - if (ex.length == 0) break; - cursor.move_right(); - } else { - // single split, finish. 
- // oooooo - // nnn - block_t l = cursor.current().value.first - ex.length; - int ref = cursor.current().value.second; - - pair& v = cursor.dirty_current_value(); - v.first = ex.length; - v.second++; - - fs->alloc_tab->insert(ex.end(), pair(l, ref)); - - dout(10) << "alloc_inc " << ex - << " " << ref << " -> " << ref+1 - << " (left split finish)" - << dendl; - - break; - } - } - } - - return 0; -} - - -int Allocator::alloc_dec(Extent ex) -{ - dout(10) << "alloc_dec " << ex << dendl; - - assert(fs->alloc_tab->get_num_keys() >= 0); - - Table >::Cursor cursor(fs->alloc_tab); - - // try to move to left (to check for overlap) - int r = fs->alloc_tab->find( ex.start, cursor ); - dout(10) << "alloc_dec find r = " << r << dendl; - - if (r == Table >::Cursor::OOB || - cursor.current().key > ex.start) { - r = cursor.move_left(); - dout(10) << "alloc_dec move_left r = " << r << dendl; - - // too far left? - if (cursor.current().key < ex.start && - cursor.current().key + cursor.current().value.first <= ex.start) { - // no overlap. - dump_freelist(); - assert(0); - } - } - - while (1) { - dout(10) << "alloc_dec ? " << cursor.current().key - << "~" << cursor.current().value.first - << " " << cursor.current().value.second - << ", ex is " << ex - << dendl; - - assert(cursor.current().key <= ex.start); // no gap allowed. - - if (cursor.current().key < ex.start) { - block_t end = cursor.current().value.first + cursor.current().key; - - if (end <= ex.end()) { - // single split - // oooooo - // ----- - pair& v = cursor.dirty_current_value(); - v.first = ex.start - cursor.current().key; - int ref = v.second; - dout(10) << "alloc_dec s " << cursor.current().key << "~" << cursor.current().value.first - << " " << ref - << " shortened left bit of single" << dendl; - - block_t l = end - ex.start; - if (ref > 1) { - fs->alloc_tab->insert(ex.start, pair(l, ref-1)); - dout(10) << "alloc_dec . 
" << ex.start << "~" << l - << " " << ref << " -> " << ref-1 - << dendl; - } else { - Extent r(ex.start, l); - _release_into_limbo(r); - } - - ex.start += l; - ex.length -= l; - if (ex.length == 0) break; - fs->alloc_tab->find( ex.start, cursor ); - - } else { - // double split, finish. - // ooooooooooooo - // ------ - pair& v = cursor.dirty_current_value(); - v.first = ex.start - cursor.current().key; - int ref = v.second; - dout(10) << "alloc_dec s " << cursor.current().key << "~" << cursor.current().value.first - << " " << ref - << " shorted left bit of double split" << dendl; - - if (ref > 1) { - fs->alloc_tab->insert(ex.start, pair(ex.length, ref-1)); - dout(10) << "alloc_inc s " << ex - << " " << ref << " -> " << ref-1 - << " reinserted middle bit of double split" - << dendl; - } else { - _release_into_limbo(ex); - } - - int rl = end - ex.end(); - fs->alloc_tab->insert(ex.end(), pair(rl, ref)); - dout(10) << "alloc_dec s " << ex.end() << "~" << rl - << " " << ref - << " reinserted right bit of double split" << dendl; - break; - } - } - else { - assert(cursor.current().key == ex.start); - - if (cursor.current().value.first <= ex.length) { - // inc. - // oooooo - // nnnnnnnn - if (cursor.current().value.second > 1) { - pair& v = cursor.dirty_current_value(); - v.second--; - dout(10) << "alloc_dec s " << ex.start << "~" << cursor.current().value.first - << " " << cursor.current().value.second+1 << " -> " << cursor.current().value.second - << dendl; - ex.start += v.first; - ex.length -= v.first; - if (ex.length == 0) break; - cursor.move_right(); - } else { - Extent r(cursor.current().key, cursor.current().value.first); - _release_into_limbo(r); - - ex.start += cursor.current().value.first; - ex.length -= cursor.current().value.first; - cursor.remove(); - - if (ex.length == 0) break; - fs->alloc_tab->find( ex.start, cursor ); - } - } else { - // single split, finish. 
- // oooooo - // nnn - block_t l = cursor.current().value.first - ex.length; - int ref = cursor.current().value.second; - - if (ref > 1) { - pair& v = cursor.dirty_current_value(); - v.first = ex.length; - v.second--; - dout(10) << "alloc_inc . " << ex - << " " << ref << " -> " << ref-1 - << dendl; - } else { - _release_into_limbo(ex); - cursor.remove(); - } - - dout(10) << "alloc_dec s " << ex.end() << "~" << l - << " " << ref - << " reinserted right bit of single split" << dendl; - fs->alloc_tab->insert(ex.end(), pair(l, ref)); - break; - } - } - - - } - - return 0; -} - - -/* - * release extent into freelist - * WARNING: *ONLY* use this if you _know_ there are no adjacent free extents - */ -int Allocator::_release_loner(Extent& ex) -{ - assert(ex.length > 0); - int b = pick_bucket(ex.length); - fs->free_tab[b]->insert(ex.start, ex.length); - fs->free_blocks += ex.length; - return 0; -} - -/* - * release extent into freelist - * look for any adjacent extents and merge with them! - */ -int Allocator::_release_merge(Extent& orig) -{ - dout(15) << "_release_merge " << orig << dendl; - assert(orig.length > 0); - - Extent newex = orig; - - // one after us? - for (int b=0; b::Cursor cursor(fs->free_tab[b]); - - if (fs->free_tab[b]->find( newex.start+newex.length, cursor ) - == Table::Cursor::MATCH) { - // add following extent to ours - newex.length += cursor.current().value; - - // remove it - fs->free_blocks -= cursor.current().value; - fs->free_tab[b]->remove( cursor.current().key ); - break; - } - } - - // one before us? 
- for (int b=0; b::Cursor cursor(fs->free_tab[b]); - fs->free_tab[b]->find( newex.start+newex.length, cursor ); - if (cursor.move_left() >= 0 && - (cursor.current().key + cursor.current().value == newex.start)) { - // merge - newex.start = cursor.current().key; - newex.length += cursor.current().value; - - // remove it - fs->free_blocks -= cursor.current().value; - fs->free_tab[b]->remove( cursor.current().key ); - break; - } - } - - // ok, insert newex - _release_loner(newex); - return 0; -} diff --git a/trunk/ceph/ebofs/Allocator.h b/trunk/ceph/ebofs/Allocator.h deleted file mode 100644 index c1898784d50a7..0000000000000 --- a/trunk/ceph/ebofs/Allocator.h +++ /dev/null @@ -1,85 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __EBOFS_ALLOCATOR_H -#define __EBOFS_ALLOCATOR_H - -#include "types.h" - -#include "include/interval_set.h" - -class Ebofs; - -class Allocator { -public: - const static block_t NEAR_LAST = 0; - const static block_t NEAR_LAST_FWD = 1; - - const static int DIR_ANY = 0; - const static int DIR_FWD = 2; - const static int DIR_BACK = 1; - -protected: - Ebofs *fs; - block_t last_pos; - - - interval_set limbo; - - static int pick_bucket(block_t num) { - int b = 0; - while (num > 1) { - b++; - num = num >> EBOFS_FREE_BUCKET_BITS; - } - if (b >= EBOFS_NUM_FREE_BUCKETS) - b = EBOFS_NUM_FREE_BUCKETS-1; - return b; - } - - int find(Extent& ex, int bucket, block_t num, block_t near, int dir = DIR_ANY); - - void dump_freelist(); - - public: - int _release_into_limbo(Extent& ex); - - int _release_loner(Extent& ex); // release loner extent - int _release_merge(Extent& ex); // release any extent (searches for adjacent) - - //int _alloc_loner_inc(Extent& ex); - //int _alloc_loner_dec(Extent& ex); - - - public: - Allocator(Ebofs *f) : fs(f), last_pos(0) {} - - int allocate(Extent& ex, block_t num, block_t near=NEAR_LAST); - int release(Extent& ex); // alias for alloc_dec - - int alloc_inc(Extent ex); - int alloc_dec(Extent ex); - - - int unallocate(Extent& ex) { // skip limbo - return _release_merge(ex); - } - - int commit_limbo(); // limbo -> fs->limbo_tab - int release_limbo(); // fs->limbo_tab -> free_tabs - -}; - -#endif diff --git a/trunk/ceph/ebofs/FileJournal.cc b/trunk/ceph/ebofs/FileJournal.cc deleted file mode 100644 index 35a1e6f4127b6..0000000000000 --- a/trunk/ceph/ebofs/FileJournal.cc +++ /dev/null @@ -1,456 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, 
as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "FileJournal.h" -#include "Ebofs.h" - -#include -#include -#include -#include - - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug_ebofs) *_dout << dbeginl << g_clock.now() << " ebofs(" << ebofs->dev.get_device_name() << ").journal " -#define derr(x) if (x <= g_conf.debug_ebofs) *_derr << dbeginl << g_clock.now() << " ebofs(" << ebofs->dev.get_device_name() << ").journal " - - -int FileJournal::create() -{ - dout(2) << "create " << fn << dendl; - - // open/create - fd = ::open(fn.c_str(), O_RDWR|O_SYNC); - if (fd < 0) { - dout(2) << "create failed " << errno << " " << strerror(errno) << dendl; - return -errno; - } - assert(fd > 0); - - //::ftruncate(fd, 0); - //::fchmod(fd, 0644); - - // get size - struct stat st; - ::fstat(fd, &st); - dout(2) << "create " << fn << " " << st.st_size << " bytes" << dendl; - - // write empty header - memset(&header, 0, sizeof(header)); - header.clear(); - header.fsid = ebofs->get_fsid(); - header.max_size = st.st_size; - write_header(); - - // writeable. - read_pos = 0; - write_pos = queue_pos = sizeof(header); - - ::close(fd); - - return 0; -} - -int FileJournal::open() -{ - //dout(1) << "open " << fn << dendl; - - // open and file - assert(fd == 0); - fd = ::open(fn.c_str(), O_RDWR|O_SYNC); - if (fd < 0) { - dout(2) << "open failed " << errno << " " << strerror(errno) << dendl; - return -errno; - } - assert(fd > 0); - - // assume writeable, unless... - read_pos = 0; - write_pos = queue_pos = sizeof(header); - - // read header? - read_header(); - if (header.fsid != ebofs->get_fsid()) { - dout(2) << "open journal fsid doesn't match, invalid (someone else's?) 
journal" << dendl; - } - else if (header.num > 0) { - // valid header, pick an offset - for (int i=0; iget_super_epoch()) { - dout(2) << "using read_pos header pointer " - << header.epoch[i] << " at " << header.offset[i] - << dendl; - read_pos = header.offset[i]; - write_pos = queue_pos = 0; - break; - } - else if (header.epoch[i] < ebofs->get_super_epoch()) { - dout(2) << "super_epoch is " << ebofs->get_super_epoch() - << ", skipping old " << header.epoch[i] << " at " << header.offset[i] - << dendl; - } - else if (header.epoch[i] > ebofs->get_super_epoch()) { - dout(2) << "super_epoch is " << ebofs->get_super_epoch() - << ", but wtf, journal is later " << header.epoch[i] << " at " << header.offset[i] - << dendl; - break; - } - } - } - - start_writer(); - - return 0; -} - -void FileJournal::close() -{ - dout(1) << "close " << fn << dendl; - - // stop writer thread - stop_writer(); - - // close - assert(writeq.empty()); - assert(commitq.empty()); - assert(fd > 0); - ::close(fd); - fd = 0; -} - -void FileJournal::start_writer() -{ - write_stop = false; - write_thread.create(); -} - -void FileJournal::stop_writer() -{ - write_lock.Lock(); - { - write_stop = true; - write_cond.Signal(); - } - write_lock.Unlock(); - write_thread.join(); -} - - -void FileJournal::print_header() -{ - for (int i=0; i::const_iterator it = bl.buffers().begin(); - it != bl.buffers().end(); - it++) { - if ((*it).length() == 0) continue; // blank buffer. 
- ::write(fd, (char*)(*it).c_str(), (*it).length() ); - } - - ::write(fd, &h, sizeof(h)); - - // move position pointer - write_pos += 2*sizeof(entry_header_t) + bl.length(); - - if (oncommit) { - if (1) { - // queue callback - ebofs->queue_finisher(oncommit); - } else { - // callback now - oncommit->finish(0); - delete oncommit; - } - } - } - } - - write_lock.Unlock(); - dout(10) << "write_thread_entry finish" << dendl; -} - -bool FileJournal::submit_entry(bufferlist& e, Context *oncommit) -{ - assert(queue_pos != 0); // bad create(), or journal didn't replay to completion. - - // ** lock ** - Mutex::Locker locker(write_lock); - - // wrap? full? - off_t size = 2*sizeof(entry_header_t) + e.length(); - - if (full) return false; // already marked full. - - if (header.wrap) { - // we're wrapped. don't overwrite ourselves. - if (queue_pos + size >= header.offset[0]) { - dout(10) << "submit_entry JOURNAL FULL (and wrapped), " << queue_pos << "+" << size - << " >= " << header.offset[0] - << dendl; - full = true; - print_header(); - return false; - } - } else { - // we haven't wrapped. - if (queue_pos + size >= header.max_size) { - // is there room if we wrap? - if ((off_t)sizeof(header_t) + size < header.offset[0]) { - // yes! - dout(10) << "submit_entry wrapped from " << queue_pos << " to " << sizeof(header_t) << dendl; - header.wrap = queue_pos; - queue_pos = sizeof(header_t); - header.push(ebofs->get_super_epoch(), queue_pos); - } else { - // no room. 
- dout(10) << "submit_entry JOURNAL FULL (and can't wrap), " << queue_pos << "+" << size - << " >= " << header.max_size - << dendl; - full = true; - return false; - } - } - } - - dout(10) << "submit_entry " << queue_pos << " : " << e.length() - << " epoch " << ebofs->get_super_epoch() - << " " << oncommit << dendl; - - // dump on queue - writeq.push_back(pair(ebofs->get_super_epoch(), e)); - commitq.push_back(oncommit); - - queue_pos += size; - - // kick writer thread - write_cond.Signal(); - - return true; -} - - -void FileJournal::commit_epoch_start() -{ - dout(10) << "commit_epoch_start on " << ebofs->get_super_epoch()-1 - << " -- new epoch " << ebofs->get_super_epoch() - << dendl; - - Mutex::Locker locker(write_lock); - - // was full -> empty -> now usable? - if (full) { - if (header.num != 0) { - dout(1) << " journal FULL, ignoring this epoch" << dendl; - return; - } - - dout(1) << " clearing FULL flag, journal now usable" << dendl; - full = false; - } - - // note epoch boundary - header.push(ebofs->get_super_epoch(), queue_pos); // note: these entries may not yet be written. - //write_header(); // no need to write it now, though... -} - -void FileJournal::commit_epoch_finish() -{ - dout(10) << "commit_epoch_finish committed " << ebofs->get_super_epoch()-1 << dendl; - - write_lock.Lock(); - { - if (full) { - // full journal damage control. - dout(15) << " journal was FULL, contents now committed, clearing header. journal still not usable until next epoch." 
<< dendl; - header.clear(); - write_pos = queue_pos = sizeof(header_t); - } else { - // update header -- trim/discard old (committed) epochs - while (header.epoch[0] < ebofs->get_super_epoch()) - header.pop(); - } - write_header(); - - // discard any unwritten items in previous epoch, and do callbacks - epoch_t epoch = ebofs->get_super_epoch(); - list callbacks; - while (!writeq.empty() && writeq.front().first < epoch) { - dout(15) << " dropping unwritten and committed " - << write_pos << " : " << writeq.front().second.length() - << " epoch " << writeq.front().first - << dendl; - // finisher? - Context *oncommit = commitq.front(); - if (oncommit) callbacks.push_back(oncommit); - - write_pos += 2*sizeof(entry_header_t) + writeq.front().second.length(); - - // discard. - writeq.pop_front(); - commitq.pop_front(); - } - - // queue the finishers - ebofs->queue_finishers(callbacks); - } - write_lock.Unlock(); - -} - - -void FileJournal::make_writeable() -{ - if (read_pos) - write_pos = queue_pos = read_pos; - else - write_pos = queue_pos = sizeof(header_t); - read_pos = 0; -} - - -bool FileJournal::read_entry(bufferlist& bl, epoch_t& epoch) -{ - if (!read_pos) { - dout(2) << "read_entry -- not readable" << dendl; - return false; - } - - if (read_pos == header.wrap) { - // find wrap point - for (int i=1; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __EBOFS_JOURNAL_H -#define __EBOFS_JOURNAL_H - -class Ebofs; - -#include "include/buffer.h" -#include "include/Context.h" - -class Journal { -protected: - Ebofs *ebofs; - -public: - Journal(Ebofs *e) : ebofs(e) { } - virtual ~Journal() { } - - virtual int create() = 0; - virtual int open() = 0; - virtual void close() = 0; - - // writes - virtual void make_writeable() = 0; - virtual bool submit_entry(bufferlist& e, Context *oncommit) = 0;// submit an item - virtual void commit_epoch_start() = 0; // mark epoch boundary - virtual void commit_epoch_finish() = 0; // mark prior epoch as committed (we can expire) - virtual bool read_entry(bufferlist& bl, epoch_t &e) = 0; - - // reads/recovery - -}; - -#endif diff --git a/trunk/ceph/ebofs/mkfs.ebofs.cc b/trunk/ceph/ebofs/mkfs.ebofs.cc deleted file mode 100644 index d1d5975e7fd65..0000000000000 --- a/trunk/ceph/ebofs/mkfs.ebofs.cc +++ /dev/null @@ -1,349 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include -#include "ebofs/Ebofs.h" - - -int main(int argc, char **argv) -{ - // args - vector args; - argv_to_vec(argc, argv, args); - parse_config_options(args); - - if (args.size() < 1) { - cerr << "usage: mkfs.ebofs [options] " << std::endl; - return -1; - } - char *filename = args[0]; - - // mkfs - Ebofs mfs(filename); - int r = mfs.mkfs(); - if (r < 0) exit(r); - - if (args.size() > 1) { // pass an extra arg of some sort to trigger the test crapola - // test-o-rama! 
- Ebofs fs(filename); - fs.mount(); - - // zillion objects - if (1) { - char crap[1024*1024]; - memset(crap, 0, 1024*1024); - bufferlist bl; - int sz = 10000; - bl.append(crap, sz); - - int n = 100000; - utime_t start = g_clock.now(); - for (int i=0; i nsec - - while (1) { - cout << g_clock.now() << " writing " << pos << "~" << sz << std::endl; - fs.write(oid, pos, sz, bl, (Context*)0); - pos += sz; - nanosleep(&ts, 0); - } - - } - - /* - if (1) { - // partial write tests - char crap[1024*1024]; - memset(crap, 0, 1024*1024); - - bufferlist small; - small.append(crap, 10); - bufferlist med; - med.append(crap, 1000); - bufferlist big; - big.append(crap, 1024*1024); - - cout << "0" << std::endl; - fs.write(10, 0, 1024*1024, big, (Context*)0); - fs.sync(); - fs.trim_buffer_cache(); - - cout << "1" << std::endl; - fs.write(10, 10, 10, small, 0); - fs.write(10, 1, 1000, med, 0); - fs.sync(); - fs.trim_buffer_cache(); - - cout << "2" << std::endl; - fs.write(10, 10, 10, small, 0); - //fs.sync(); - fs.write(10, 1, 1000, med, 0); - fs.sync(); - fs.trim_buffer_cache(); - - cout << "3" << std::endl; - fs.write(10, 1, 1000, med, 0); - fs.write(10, 10000, 10, small, 0); - fs.truncate(10, 100, 0); - fs.sync(); - fs.trim_buffer_cache(); - - cout << "4" << std::endl; - fs.remove(10); - fs.sync(); - fs.write(10, 10, 10, small, 0); - fs.sync(); - fs.write(10, 1, 1000, med, 0); - fs.sync(); - fs.truncate(10, 100, 0); - fs.write(10, 10, 10, small, 0); - fs.trim_buffer_cache(); - - - - } - - if (0) { // onode write+read test - bufferlist bl; - char crap[1024*1024]; - memset(crap, 0, 1024*1024); - bl.append(crap, 10); - - fs.write(10, 10, 0, bl, (Context*)0); - fs.umount(); - - Ebofs fs2(filename); - fs2.mount(); - fs2.read(10, 10, 0, bl); - fs2.umount(); - - return 0; - } - - - if (0) { // small write + read test - bufferlist bl; - char crap[1024*1024]; - memset(crap, 0, 1024*1024); - - object_t oid = 10; - int n = 10000; - int l = 128; - bl.append(crap, l); - - - char *p = bl.c_str(); 
- off_t o = 0; - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include -#include -#include - -#include -#include -#include -using namespace std; - -#include "config.h" - -#include "mon/MonMap.h" - -#include "ebofs/Ebofs.h" - -#include "osd/OSD.h" -#include "mon/MonitorStore.h" - -int main(int argc, char **argv) -{ - vector args; - argv_to_vec(argc, argv, args); - - Ebofs eb("dev/osd0"); - eb.mount(); - MonitorStore ms("mondata/mon0"); - ms.mount(); - - epoch_t e = 1; - while (1) { - bufferlist bl; - object_t oid = OSD::get_osdmap_object_name(e); - eb.read(oid, 0, 0, bl); - if (bl.length() == 0) break; - cout << "saving epoch " << e << std::endl; - - bufferlist ibl; - oid = OSD::get_inc_osdmap_object_name(e); - eb.read(oid, 0, 0, ibl); - - ms.put_bl_sn(ibl, "osdmap", e); - ms.put_bl_sn(bl, "osdmap_full", e); - e++; - } - - eb.umount(); - //ms.umount(); - - return 0; -} diff --git a/trunk/ceph/include/Context.h b/trunk/ceph/include/Context.h deleted file mode 100644 index e5c74de6cb6e5..0000000000000 --- a/trunk/ceph/include/Context.h +++ /dev/null @@ -1,153 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __CONTEXT_H -#define __CONTEXT_H - -#include "config.h" - -#include -#include -#include - -#include - - -/* - * Context - abstract callback class - */ -class Context { - public: - virtual ~Context() {} // we want a virtual destructor!!! 
- virtual void finish(int r) = 0; -}; - - -/* - * finish and destroy a list of Contexts - */ -inline void finish_contexts(std::list& finished, - int result = 0) -{ - using std::cout; - using std::endl; - - list ls; - if (finished.empty()) return; - - ls.swap(finished); // swap out of place to avoid weird loops - - generic_dout(10) << ls.size() << " contexts to finish with " << result << dendl; - for (std::list::iterator it = ls.begin(); - it != ls.end(); - it++) { - Context *c = *it; - generic_dout(10) << "---- " << c << dendl; - c->finish(result); - delete c; - } -} - -class C_NoopContext : public Context { -public: - void finish(int r) { } -}; - - -/* - * C_Contexts - set of Contexts - */ -class C_Contexts : public Context { -public: - std::list contexts; - - void add(Context* c) { - contexts.push_back(c); - } - void take(std::list& ls) { - contexts.splice(contexts.end(), ls); - } - void finish(int r) { - finish_contexts(contexts, r); - } -}; - - -/* - * C_Gather - * - * BUG: does not report errors. - */ -class C_Gather : public Context { -public: - bool sub_finish(int r) { - //cout << "C_Gather sub_finish " << this << " got " << r << " of " << waitfor << endl; - assert(waitfor.count(r)); - waitfor.erase(r); - if (!waitfor.empty()) - return false; // more subs left - - // last one - onfinish->finish(0); - delete onfinish; - onfinish = 0; - return true; - } - - class C_GatherSub : public Context { - C_Gather *gather; - int num; - public: - C_GatherSub(C_Gather *g, int n) : gather(g), num(n) {} - void finish(int r) { - if (gather->sub_finish(num)) - delete gather; // last one! 
- } - }; - -private: - Context *onfinish; - std::set waitfor; - int num; - -public: - C_Gather(Context *f=0) : onfinish(f), num(0) { - //cout << "C_Gather new " << this << endl; - } - ~C_Gather() { - //cout << "C_Gather delete " << this << endl; - assert(!onfinish); - } - - void set_finisher(Context *c) { - assert(!onfinish); - onfinish = c; - } - Context *new_sub() { - num++; - waitfor.insert(num); - return new C_GatherSub(this, num); - } - - bool empty() { return num == 0; } - int get_num() { return num; } - - void finish(int r) { - assert(0); // nobody should ever call me. - } - -}; - -#endif diff --git a/trunk/ceph/include/Distribution.h b/trunk/ceph/include/Distribution.h deleted file mode 100644 index efc0795a72fcb..0000000000000 --- a/trunk/ceph/include/Distribution.h +++ /dev/null @@ -1,75 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __DISTRIBUTION_H -#define __DISTRIBUTION_H - -#include -#include -using namespace std; - -class Distribution { - vector p; - vector v; - - public: - //Distribution() { - //} - - unsigned get_width() { - return p.size(); - } - - void clear() { - p.clear(); - v.clear(); - } - void add(int val, float pr) { - p.push_back(pr); - v.push_back(val); - } - - void random() { - float sum = 0.0; - for (unsigned i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __BITMAPPER_H -#define __BITMAPPER_H - -class bitmapper { - char *_data; - int _len; - - public: - bitmapper() : _data(0), _len(0) { } - bitmapper(char *data, int len) : _data(data), _len(len) { } - - void set_data(char *data, int len) { _data = data; _len = len; } - - int bytes() const { return _len; } - int bits() const { return _len * 8; } - - bool operator[](int b) const { - return get(b); - } - bool get(int b) const { - return _data[b >> 3] & (1 << (b&7)); - } - void set(int b) { - _data[b >> 3] |= 1 << (b&7); - } - void clear(int b) { - _data[b >> 3] &= ~(1 << (b&7)); - } - void toggle(int b) { - _data[b >> 3] ^= 1 << (b&7); - } -}; - -#endif diff --git a/trunk/ceph/include/blobhash.h b/trunk/ceph/include/blobhash.h deleted file mode 100644 index a3703e46d67f5..0000000000000 --- a/trunk/ceph/include/blobhash.h +++ /dev/null @@ -1,47 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __BLOBHASH_H -#define __BLOBHASH_H - -#include "hash.h" - -/* -- this is to make some of the STL types work with 64 bit values, string hash keys, etc. -- added when i was using an old STL.. maybe try taking these out and see if things - compile now? 
-*/ - -class blobhash { -public: - size_t operator()(const char *p, unsigned len) { - static rjhash H; - size_t acc = 0; - while (len >= sizeof(size_t)) { - acc ^= *(size_t*)p; - p += sizeof(size_t); - len -= sizeof(size_t); - } - int sh = 0; - while (len) { - acc ^= (size_t)*p << sh; - sh += 8; - len--; - p++; - } - return H(acc); - } -}; - - -#endif diff --git a/trunk/ceph/include/encodable.h b/trunk/ceph/include/encodable.h deleted file mode 100644 index 321361866ec9b..0000000000000 --- a/trunk/ceph/include/encodable.h +++ /dev/null @@ -1,424 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __ENCODABLE_H -#define __ENCODABLE_H - -#include "buffer.h" - -#include -#include -#include -#include -#include -#include - - -// ================================================================== -// simple - - -// raw -template -inline void _encode_raw(const T& t, bufferlist& bl) -{ - bl.append((char*)&t, sizeof(t)); -} -template -inline void _decode_raw(T& t, bufferlist::iterator &p) -{ - p.copy(sizeof(t), (char*)&t); -} - -#include -#include -#include -#include -#include -#include - -// list -template -inline void _encode_simple(const std::list& ls, bufferlist& bl) -{ - // should i pre- or post- count? - if (!ls.empty()) { - unsigned pos = bl.length(); - uint32_t n = 0; - _encode_raw(n, bl); - for (typename std::list::const_iterator p = ls.begin(); p != ls.end(); ++p) { - n++; - _encode_simple(*p, bl); - } - bl.copy_in(pos, sizeof(n), (char*)&n); - } else { - uint32_t n = ls.size(); // FIXME: this is slow on a list. 
- _encode_raw(n, bl); - for (typename std::list::const_iterator p = ls.begin(); p != ls.end(); ++p) - _encode_simple(*p, bl); - } -} -template -inline void _decode_simple(std::list& ls, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - ls.clear(); - while (n--) { - T v; - _decode_simple(v, p); - ls.push_back(v); - } -} - -// deque -template -inline void _encode_simple(const std::deque& ls, bufferlist& bl) -{ - uint32_t n = ls.size(); - _encode_raw(n, bl); - for (typename std::deque::const_iterator p = ls.begin(); p != ls.end(); ++p) - _encode_simple(*p, bl); -} -template -inline void _decode_simple(std::deque& ls, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - ls.clear(); - while (n--) { - T v; - _decode_simple(v, p); - ls.push_back(v); - } -} - -// set -template -inline void _encode_simple(const std::set& s, bufferlist& bl) -{ - uint32_t n = s.size(); - _encode_raw(n, bl); - for (typename std::set::const_iterator p = s.begin(); p != s.end(); ++p) - _encode_simple(*p, bl); -} -template -inline void _decode_simple(std::set& s, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - s.clear(); - while (n--) { - T v; - _decode_simple(v, p); - s.insert(v); - } -} - -// vector -template -inline void _encode_simple(const std::vector& v, bufferlist& bl) -{ - uint32_t n = v.size(); - _encode_raw(n, bl); - for (typename std::vector::const_iterator p = v.begin(); p != v.end(); ++p) - _encode_simple(*p, bl); -} -template -inline void _decode_simple(std::vector& v, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - v.resize(n); - for (uint32_t i=0; i -inline void _encode_simple(const std::map& m, bufferlist& bl) -{ - uint32_t n = m.size(); - _encode_raw(n, bl); - for (typename std::map::const_iterator p = m.begin(); p != m.end(); ++p) { - _encode_simple(p->first, bl); - _encode_simple(p->second, bl); - } -} -template -inline void _decode_simple(std::map& m, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - 
m.clear(); - while (n--) { - T k; - _decode_simple(k, p); - _decode_simple(m[k], p); - } -} - -// hash_map -template -inline void _encode_simple(const __gnu_cxx::hash_map& m, bufferlist& bl) -{ - uint32_t n = m.size(); - _encode_raw(n, bl); - for (typename __gnu_cxx::hash_map::const_iterator p = m.begin(); p != m.end(); ++p) { - _encode_simple(p->first, bl); - _encode_simple(p->second, bl); - } -} -template -inline void _decode_simple(__gnu_cxx::hash_map& m, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - m.clear(); - while (n--) { - T k; - _decode_simple(k, p); - _decode_simple(m[k], p); - } -} - -// string -inline void _encode_simple(const std::string& s, bufferlist& bl) -{ - uint32_t len = s.length(); - _encode_raw(len, bl); - bl.append(s.data(), len); -} -inline void _decode_simple(std::string& s, bufferlist::iterator& p) -{ - uint32_t len; - _decode_raw(len, p); - s.clear(); - p.copy(len, s); -} - -// const char* (encode only, string compatible) -inline void _encode_simple(const char *s, bufferlist& bl) -{ - uint32_t len = strlen(s); - _encode_raw(len, bl); - bl.append(s, len); -} - -// bufferptr (encapsulated) -inline void _encode_simple(const buffer::ptr& bp, bufferlist& bl) -{ - uint32_t len = bp.length(); - _encode_raw(len, bl); - bl.append(bp); -} -inline void _decode_simple(buffer::ptr& bp, bufferlist::iterator& p) -{ - uint32_t len; - _decode_raw(len, p); - - bufferlist s; - p.copy(len, s); - - if (s.buffers().size() == 1) - bp = s.buffers().front(); - else - bp = buffer::copy(s.c_str(), s.length()); -} - -// bufferlist (encapsulated) -inline void _encode_simple(const bufferlist& s, bufferlist& bl) -{ - uint32_t len = s.length(); - _encode_raw(len, bl); - bl.append(s); -} -inline void _encode_simple_destructively(bufferlist& s, bufferlist& bl) -{ - uint32_t len = s.length(); - _encode_raw(len, bl); - bl.claim_append(s); -} -inline void _decode_simple(bufferlist& s, bufferlist::iterator& p) -{ - uint32_t len; - _decode_raw(len, p); - 
s.clear(); - p.copy(len, s); -} - -// base -template -inline void _encode_simple(const T& t, bufferlist& bl) -{ - _encode_raw(t, bl); -} -template -inline void _decode_simple(T& t, bufferlist::iterator& p) -{ - _decode_raw(t, p); -} - - - - -// ================================================================== -// complex - -// list -template -inline void _encode_complex(const std::list& ls, bufferlist& bl) -{ - uint32_t n = ls.size(); - _encode_raw(n, bl); - for (typename std::list::const_iterator p = ls.begin(); p != ls.end(); ++p) - _encode_complex(*p, bl); -} -template -inline void _decode_complex(std::list& ls, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - ls.clear(); - while (n--) { - T v; - _decode_complex(v, p); - ls.push_back(v); - } -} - -// deque -template -inline void _encode_complex(const std::deque& ls, bufferlist& bl) -{ - uint32_t n = ls.size(); - _encode_raw(n, bl); - for (typename std::deque::const_iterator p = ls.begin(); p != ls.end(); ++p) - _encode_complex(*p, bl); -} -template -inline void _decode_complex(std::deque& ls, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - ls.clear(); - while (n--) { - T v; - _decode_complex(v, p); - ls.push_back(v); - } -} - -// set -template -inline void _encode_complex(const std::set& s, bufferlist& bl) -{ - uint32_t n = s.size(); - _encode_raw(n, bl); - for (typename std::set::const_iterator p = s.begin(); p != s.end(); ++p) - _encode_complex(*p, bl); -} -template -inline void _decode_complex(std::set& s, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - s.clear(); - while (n--) { - T v; - _decode_complex(v, p); - s.insert(v); - } -} - -// vector -template -inline void _encode_complex(const std::vector& v, bufferlist& bl) -{ - uint32_t n = v.size(); - _encode_raw(n, bl); - for (typename std::vector::const_iterator p = v.begin(); p != v.end(); ++p) - _encode_complex(*p, bl); -} -template -inline void _decode_complex(std::vector& v, bufferlist::iterator& p) -{ - 
uint32_t n; - _decode_raw(n, p); - v.resize(n); - for (uint32_t i=0; i -inline void _encode_complex(const std::map& m, bufferlist& bl) -{ - uint32_t n = m.size(); - _encode_raw(n, bl); - for (typename std::map::const_iterator p = m.begin(); p != m.end(); ++p) { - _encode_simple(p->first, bl); - _encode_complex(p->second, bl); - } -} -template -inline void _decode_complex(std::map& m, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - m.clear(); - while (n--) { - T k; - _decode_simple(k, p); - _decode_complex(m[k], p); - } -} - -// hash_map -template -inline void _encode_complex(const __gnu_cxx::hash_map& m, bufferlist& bl) -{ - uint32_t n = m.size(); - _encode_raw(n, bl); - for (typename __gnu_cxx::hash_map::const_iterator p = m.begin(); p != m.end(); ++p) { - _encode_simple(p->first, bl); - _encode_complex(p->second, bl); - } -} -template -inline void _decode_complex(__gnu_cxx::hash_map& m, bufferlist::iterator& p) -{ - uint32_t n; - _decode_raw(n, p); - m.clear(); - while (n--) { - T k; - _decode_simple(k, p); - _decode_complex(m[k], p); - } -} - -// base case -template -inline void _encode_complex(const T& t, bufferlist& bl) -{ - t._encode(bl); -} -template -inline void _decode_complex(T& t, bufferlist::iterator& p) -{ - t._decode(p); -} - -#endif diff --git a/trunk/ceph/include/error.h b/trunk/ceph/include/error.h deleted file mode 100644 index a548d9756b9b8..0000000000000 --- a/trunk/ceph/include/error.h +++ /dev/null @@ -1,41 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define SYSERROR() syserror("At %s:%d", __FILE__, __LINE__) - -#define ASSERT(c) \ - ((c) || (exiterror("Assertion failed at %s:%d", __FILE__, __LINE__), 1)) - -/* print usage error message and exit */ -extern void userror(const char *use, const char *fmt, ...); - -/* print system error message and exit */ -extern void syserror(const char *fmt, ...); - -/* print error message and exit */ -extern void exiterror(const char *fmt, ...); - -/* print error message */ -extern void error(const char *fmt, ...); - -#ifdef __cplusplus -} // extern "C" -#endif diff --git a/trunk/ceph/include/filepath.h b/trunk/ceph/include/filepath.h deleted file mode 100644 index c92663049bf1f..0000000000000 --- a/trunk/ceph/include/filepath.h +++ /dev/null @@ -1,199 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __FILEPATH_H -#define __FILEPATH_H - - -/* - * BUG: /a/b/c is equivalent to a/b/c in dentry-breakdown, but not string. - * -> should it be different? how? should this[0] be "", with depth 4? - * - */ - - -#include -#include -#include -using namespace std; - -#include "buffer.h" -#include "encodable.h" - -class filepath { - /** path - */ - inodeno_t ino; // base inode - string path; // relative path - - /** bits - path segemtns - * this is ['a', 'b', 'c'] for both the aboslute and relative case. - * - * NOTE: this value is LAZILY maintained... i.e. 
it's a cache - */ - mutable vector bits; - - void rebuild_path() { - if (absolute()) - path = "/"; - else - path.clear(); - for (unsigned i=0; i 0) parse_bits(); - return bits.size(); - } - bool empty() const { return path.length() == 0; } - - bool absolute() const { return ino > 0; } - bool relative() const { return !absolute(); } - - const string& operator[](int i) const { - if (bits.empty() && path.length() > 0) parse_bits(); - return bits[i]; - } - - const string& last_dentry() const { - if (bits.empty() && path.length() > 0) parse_bits(); - return bits[ bits.size()-1 ]; - } - - filepath prefixpath(int s) const { - filepath t; - t.ino = ino; - for (int i=0; i 0) parse_bits(); - bits.pop_back(); - rebuild_path(); - } - void push_dentry(const string& s) { - if (bits.empty() && path.length() > 0) parse_bits(); - bits.push_back(s); - if (path.length() && path[path.length()-1] != '/') - path += "/"; - path += s; - } - void append(const filepath& a) { - for (unsigned i=0; i 1) - out << '#' << hex << path.get_ino() << dec; - if (path.get_ino() > 0 && path.depth()) - out << '/'; - return out << path.get_path(); -} - -#endif diff --git a/trunk/ceph/include/lru.h b/trunk/ceph/include/lru.h deleted file mode 100644 index 40dce1aa191ab..0000000000000 --- a/trunk/ceph/include/lru.h +++ /dev/null @@ -1,341 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __LRU_H -#define __LRU_H - -#include -#include - -#include "config.h" - - - -class LRUObject { - private: - LRUObject *lru_next, *lru_prev; - bool lru_pinned; - class LRU *lru; - class LRUList *lru_list; - - public: - LRUObject() { - lru_next = lru_prev = NULL; - lru_list = 0; - lru_pinned = false; - lru = 0; - } - - // pin/unpin item in cache - void lru_pin(); - void lru_unpin(); - bool lru_is_expireable() { return !lru_pinned; } - - friend class LRU; - friend class LRUList; -}; - - -class LRUList { - private: - LRUObject *head, *tail; - uint32_t len; - - public: - LRUList() { - head = tail = 0; - len = 0; - } - - uint32_t get_length() { return len; } - - LRUObject *get_head() { - return head; - } - LRUObject *get_tail() { - return tail; - } - - void clear() { - while (len > 0) { - remove(get_head()); - } - } - - void insert_head(LRUObject *o) { - o->lru_next = head; - o->lru_prev = NULL; - if (head) { - head->lru_prev = o; - } else { - tail = o; - } - head = o; - o->lru_list = this; - len++; - } - void insert_tail(LRUObject *o) { - o->lru_next = NULL; - o->lru_prev = tail; - if (tail) { - tail->lru_next = o; - } else { - head = o; - } - tail = o; - o->lru_list = this; - len++; - } - - void remove(LRUObject *o) { - assert(o->lru_list == this); - if (o->lru_next) - o->lru_next->lru_prev = o->lru_prev; - else - tail = o->lru_prev; - if (o->lru_prev) - o->lru_prev->lru_next = o->lru_next; - else - head = o->lru_next; - o->lru_next = o->lru_prev = NULL; - o->lru_list = 0; - assert(len>0); - len--; - } - -}; - - -class LRU { - protected: - LRUList lru_top, lru_bot, lru_pintail; - uint32_t lru_num, lru_num_pinned; - uint32_t lru_max; // max items - double lru_midpoint; - - friend class LRUObject; - //friend class MDCache; // hack - - public: - LRU(int max = 0) { - lru_num = 0; - lru_num_pinned = 0; - lru_midpoint = .6; - lru_max = max; - } - - uint32_t lru_get_size() { return lru_num; } - uint32_t lru_get_top() { return lru_top.get_length(); } - 
uint32_t lru_get_bot() { return lru_bot.get_length(); } - uint32_t lru_get_pintail() { return lru_pintail.get_length(); } - uint32_t lru_get_max() { return lru_max; } - uint32_t lru_get_num_pinned() { return lru_num_pinned; } - - void lru_set_max(uint32_t m) { lru_max = m; } - void lru_set_midpoint(float f) { lru_midpoint = f; } - - void lru_clear() { - lru_top.clear(); - lru_bot.clear(); - lru_pintail.clear(); - } - - // insert at top of lru - void lru_insert_top(LRUObject *o) { - //assert(!o->lru_in_lru); - //o->lru_in_lru = true; - assert(!o->lru); - o->lru = this; - lru_top.insert_head( o ); - lru_num++; - if (o->lru_pinned) lru_num_pinned++; - lru_adjust(); - } - - // insert at mid point in lru - void lru_insert_mid(LRUObject *o) { - //assert(!o->lru_in_lru); - //o->lru_in_lru = true; - assert(!o->lru); - o->lru = this; - lru_bot.insert_head(o); - lru_num++; - if (o->lru_pinned) lru_num_pinned++; - } - - // insert at bottom of lru - void lru_insert_bot(LRUObject *o) { - assert(!o->lru); - o->lru = this; - lru_bot.insert_tail(o); - lru_num++; - if (o->lru_pinned) lru_num_pinned++; - } - - /* - // insert at bottom of lru - void lru_insert_pintail(LRUObject *o) { - assert(!o->lru); - o->lru = this; - - assert(o->lru_pinned); - - lru_pintail.insert_head(o); - lru_num++; - lru_num_pinned += o->lru_pinned; - } - */ - - - - - // adjust top/bot balance, as necessary - void lru_adjust() { - if (!lru_max) return; - - unsigned toplen = lru_top.get_length(); - unsigned topwant = (unsigned)(lru_midpoint * ((double)lru_max - lru_num_pinned)); - while (toplen > 0 && - toplen > topwant) { - // remove from tail of top, stick at head of bot - // FIXME: this could be way more efficient by moving a whole chain of items. 
- - LRUObject *o = lru_top.get_tail(); - lru_top.remove(o); - lru_bot.insert_head(o); - toplen--; - } - } - - - // remove an item - LRUObject *lru_remove(LRUObject *o) { - // not in list - //assert(o->lru_in_lru); - //if (!o->lru_in_lru) return o; // might have expired and been removed that way. - if (!o->lru) return o; - - - if (o->lru_list == &lru_top) - lru_top.remove(o); - else if (o->lru_list == &lru_bot) - lru_bot.remove(o); - else if (o->lru_list == &lru_pintail) - lru_pintail.remove(o); - else - assert(0); - - lru_num--; - if (o->lru_pinned) lru_num_pinned--; - o->lru = 0; - return o; - } - - // touch item -- move to head of lru - bool lru_touch(LRUObject *o) { - lru_remove(o); - lru_insert_top(o); - return true; - } - - // touch item -- move to midpoint (unless already higher) - bool lru_midtouch(LRUObject *o) { - if (o->lru_list == &lru_top) return false; - - lru_remove(o); - lru_insert_mid(o); - return true; - } - - // touch item -- move to bottom - bool lru_bottouch(LRUObject *o) { - lru_remove(o); - lru_insert_bot(o); - return true; - } - - void lru_touch_entire_pintail() { - // promote entire pintail to the top lru - while (lru_pintail.get_length() > 0) { - LRUObject *o = lru_pintail.get_head(); - lru_pintail.remove(o); - lru_top.insert_tail(o); - } - } - - - // expire -- expire a single item - LRUObject *lru_get_next_expire() { - LRUObject *p; - - // look through tail of bot - while (lru_bot.get_length()) { - p = lru_bot.get_tail(); - if (!p->lru_pinned) return p; - - // move to pintail - lru_bot.remove(p); - lru_pintail.insert_head(p); - } - - // ok, try head then - while (lru_top.get_length()) { - p = lru_top.get_tail(); - if (!p->lru_pinned) return p; - - // move to pintail - lru_top.remove(p); - lru_pintail.insert_head(p); - } - - // no luck! 
- return NULL; - } - - LRUObject *lru_expire() { - LRUObject *p = lru_get_next_expire(); - if (p) - return lru_remove(p); - return NULL; - } - - - void lru_status() { - generic_dout(10) << "lru: " << lru_num << " items, " << lru_top.get_length() << " top, " << lru_bot.get_length() << " bot, " << lru_pintail.get_length() << " pintail" << dendl; - } - -}; - - -inline void LRUObject::lru_pin() -{ - lru_pinned = true; - if (lru) lru->lru_num_pinned++; -} -inline void LRUObject::lru_unpin() { - lru_pinned = false; - if (lru) { - lru->lru_num_pinned--; - - // move from pintail -> bot - if (lru_list == &lru->lru_pintail) { - lru->lru_pintail.remove(this); - lru->lru_bot.insert_tail(this); - } - } -} - -#endif diff --git a/trunk/ceph/include/rangeset.h b/trunk/ceph/include/rangeset.h deleted file mode 100644 index 547ea3ab72274..0000000000000 --- a/trunk/ceph/include/rangeset.h +++ /dev/null @@ -1,253 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __RANGESET_H -#define __RANGESET_H - -/* - * - * my first container with iterator! it's pretty ugly. - * - */ - -#include -#include -#include -using namespace std; - -//typedef int T; - -template -struct _rangeset_base { - map ranges; // pair(first,last) (inclusive, e.g. [first,last]) - - typedef typename map::iterator mapit; - - // get iterator for range including val. or ranges.end(). 
- mapit get_range_for(T val) { - mapit it = ranges.lower_bound(val); - if (it == ranges.end()) { - // search backwards - typename map::reverse_iterator it = ranges.rbegin(); - if (it == ranges.rend()) return ranges.end(); - if (it->first <= val && it->second >= val) - return ranges.find(it->first); - return ranges.end(); - } else { - if (it->first == val) return - it--; - if (it->first <= val && it->second >= val) - return it; - return ranges.end(); - } - } - -}; - - -template -class rangeset_iterator : - public std::iterator -{ - //typedef typename map::iterator mapit; - - map ranges; - typename map::iterator it; - T current; - -public: - // cons - rangeset_iterator() {} - - rangeset_iterator(typename map::iterator& it, map& ranges) { - this->ranges = ranges; - this->it = it; - if (this->it != ranges.end()) - current = it->first; - } - - bool operator==(rangeset_iterator rit) { - return (it == rit.it && rit.current == current); - } - bool operator!=(rangeset_iterator rit) { - return (it != rit.it) || (rit.current != current); - } - - T& operator*() { - return current; - } - - rangeset_iterator operator++(int) { - if (current < it->second) - current++; - else { - it++; - if (it != ranges.end()) - current = it->first; - } - - return *this; - } -}; - - -template -class rangeset -{ - typedef typename map::iterator map_iterator; - - _rangeset_base theset; - inodeno_t _size; - -public: - rangeset() { _size = 0; } - typedef rangeset_iterator iterator; - - iterator begin() { - map_iterator it = theset.ranges.begin(); - return iterator(it, theset.ranges); - } - - iterator end() { - map_iterator it = theset.ranges.end(); - return iterator(it, theset.ranges); - } - - map_iterator map_begin() { - return theset.ranges.begin(); - } - map_iterator map_end() { - return theset.ranges.end(); - } - int map_size() { - return theset.ranges.size(); - } - - void map_insert(T v1, T v2) { - theset.ranges.insert(pair(v1,v2)); - _size += v2 - v1+1; - } - - - // ... 
- bool contains(T val) { - if (theset.get_range_for(val) == theset.ranges.end()) return false; - assert(!empty()); - return true; - } - - void insert(T val) { - assert(!contains(val)); - - map_iterator left = theset.get_range_for(val-1); - map_iterator right = theset.get_range_for(val+1); - - if (left != theset.ranges.end() && - right != theset.ranges.end()) { - // join! - left->second = right->second; - theset.ranges.erase(right); - _size++; - return; - } - - if (left != theset.ranges.end()) { - // add to left range - left->second = val; - _size++; - return; - } - - if (right != theset.ranges.end()) { - // add to right range - theset.ranges.insert(pair(val, right->second)); - theset.ranges.erase(val+1); - _size++; - return; - } - - // new range - theset.ranges.insert(pair(val,val)); - _size++; - return; - } - - unsigned size() { - return size(); - } - - bool empty() { - if (theset.ranges.empty()) { - assert(_size == 0); - return true; - } - assert(_size>0); - return false; - } - - - T first() { - assert(!empty()); - map_iterator it = theset.ranges.begin(); - return it->first; - } - - void erase(T val) { - assert(contains(val)); - map_iterator it = theset.get_range_for(val); - assert(it != theset.ranges.end()); - - // entire range - if (val == it->first && val == it->second) { - theset.ranges.erase(it); - _size--; - return; - } - - // beginning - if (val == it->first) { - theset.ranges.insert(pair(val+1, it->second)); - theset.ranges.erase(it); - _size--; - return; - } - - // end - if (val == it->second) { - it->second = val-1; - _size--; - return; - } - - // middle split - theset.ranges.insert(pair(it->first, val-1)); - theset.ranges.insert(pair(val+1, it->second)); - theset.ranges.erase(it); - _size--; - return; - } - - void dump() { - for (typename map::iterator it = theset.ranges.begin(); - it != theset.ranges.end(); - it++) { - cout << " " << it->first << "-" << it->second << endl; - } - } - -}; - - -#endif diff --git a/trunk/ceph/include/statlite.h 
b/trunk/ceph/include/statlite.h deleted file mode 100644 index a9c0433e4a4e8..0000000000000 --- a/trunk/ceph/include/statlite.h +++ /dev/null @@ -1,72 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -#ifndef _STATLITE_H -#define _STATLITE_H - -extern "C" { - -#include -#include -#include -#include -#include - -struct statlite { - dev_t st_dev; /* device */ - ino_t st_ino; /* inode */ - mode_t st_mode; /* protection */ - nlink_t st_nlink; /* number of hard links */ - uid_t st_uid; /* user ID of owner */ - gid_t st_gid; /* group ID of owner */ - dev_t st_rdev; /* device type (if inode device)*/ - unsigned long st_litemask; /* bit mask for optional fields */ - /***************************************************************/ - /**** Remaining fields are optional according to st_litemask ***/ - off_t st_size; /* total size, in bytes */ - blksize_t st_blksize; /* blocksize for filesystem I/O */ - blkcnt_t st_blocks; /* number of blocks allocated */ - struct timespec st_atim; /* Time of last access. */ - struct timespec st_mtim; /* Time of last modification. */ - struct timespec st_ctim; /* Time of last status change. 
*/ - //time_t st_atime; /* time of last access */ - //time_t st_mtime; /* time of last modification */ - //time_t st_ctime; /* time of last change */ -}; - -#define S_STATLITE_SIZE 1 -#define S_STATLITE_BLKSIZE 2 -#define S_STATLITE_BLOCKS 4 -#define S_STATLITE_ATIME 8 -#define S_STATLITE_MTIME 16 -#define S_STATLITE_CTIME 32 - -#define S_REQUIRESIZE(m) (m | S_STATLITE_SIZE) -#define S_REQUIREBLKSIZE(m) (m | S_STATLITE_BLKSIZE) -#define S_REQUIREBLOCKS(m) (m | S_STATLITE_BLOCKS) -#define S_REQUIREATIME(m) (m | S_STATLITE_ATIME) -#define S_REQUIREMTIME(m) (m | S_STATLITE_MTIME) -#define S_REQUIRECTIME(m) (m | S_STATLITE_CTIME) - -#define S_ISVALIDSIZE(m) (m & S_STATLITE_SIZE) -#define S_ISVALIDBLKSIZE(m) (m & S_STATLITE_BLKSIZE) -#define S_ISVALIDBLOCKS(m) (m & S_STATLITE_BLOCKS) -#define S_ISVALIDATIME(m) (m & S_STATLITE_ATIME) -#define S_ISVALIDMTIME(m) (m & S_STATLITE_MTIME) -#define S_ISVALIDCTIME(m) (m & S_STATLITE_CTIME) - - -// readdirplus etc. - -struct dirent_plus { - struct dirent d_dirent; /* dirent struct for this entry */ - struct stat d_stat; /* attributes for this entry */ - int d_stat_err;/* errno for d_stat, or 0 */ -}; -struct dirent_lite { - struct dirent d_dirent; /* dirent struct for this entry */ - struct statlite d_stat; /* attributes for this entry */ - int d_stat_err;/* errno for d_stat, or 0 */ -}; - -} -#endif diff --git a/trunk/ceph/include/triple.h b/trunk/ceph/include/triple.h deleted file mode 100644 index e9f43b9315d21..0000000000000 --- a/trunk/ceph/include/triple.h +++ /dev/null @@ -1,28 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __TRIPLE_H -#define __TRIPLE_H - -template -class triple { - public: - A first; - B second; - C third; - triple() {} - triple(A f, B s, C t) : first(f), second(s), third(t) {} -}; - -#endif diff --git a/trunk/ceph/include/uofs.h b/trunk/ceph/include/uofs.h deleted file mode 100644 index a4673aaa616ea..0000000000000 --- a/trunk/ceph/include/uofs.h +++ /dev/null @@ -1,51 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -/* - * uofs.h - * - * user-level object-based file system - */ - - #ifndef _UOFS_H_ - #define _UOFS_H_ - - #include - #include - #include - - - int device_open(char *path, int xflags); - void device_findsizes(int fd, long long *sz, int *bsz); - - int uofs_format(int bdev_id, int donode_size, int bd_ratio, int reg_size, int sb_size, int lb_size, - int nr_hash_table_buckets, int delay_allocation, int flush_interval); - - int uofs_mount(int bdev_id); - void uofs_shutdown(void); - - int uofs_read(long long oid, void *buf, off_t offset, size_t count); - int uofs_write(long long oid, void *buf, off_t offset, size_t count); - int uofs_del(long long oid); - int uofs_sync(long long oid); - int uofs_exist(long long oid); - - int uofs_get_size(long long oid); - - void uofs_superblock_printout(void); - int get_large_object_pages(void); - - int uofs_buffer_size(void); - #endif diff --git a/trunk/ceph/include/xlist.h b/trunk/ceph/include/xlist.h deleted file mode 100644 index 2ea2cbec6c815..0000000000000 --- a/trunk/ceph/include/xlist.h +++ /dev/null @@ -1,123 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - 
* Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __XLIST_H -#define __XLIST_H - -template -class xlist { -public: - struct item { - T _item; - item *_prev, *_next; - xlist *_head; - - item(T i) : _item(i), _prev(0), _next(0), _head(0) {} - ~item() { - remove_myself(); - } - - xlist* get_xlist() { return _head; } - void remove_myself() { - if (_head) - _head->remove(this); - assert(_head == 0); - } - }; - -private: - item *_front, *_back; - int _size; - -public: - xlist() : _front(0), _back(0), _size(0) {} - ~xlist() { - assert(_size == 0); - assert(_front == 0); - assert(_back == 0); - } - - int size() { return _size; } - bool empty() { - assert((bool)_front == (bool)_size); - return _front == 0; - } - - void clear() { - while (_front) remove(_front); - } - - void push_back(item *item) { - if (item->_head) - item->_head->remove(item); - - item->_head = this; - item->_next = 0; - item->_prev = _back; - if (_back) - _back->_next = item; - else - _front = item; - _back = item; - _size++; - } - void remove(item *item) { - assert(item->_head == this); - - if (item->_prev) - item->_prev->_next = item->_next; - else - _front = item->_next; - if (item->_next) - item->_next->_prev = item->_prev; - else - _back = item->_prev; - _size--; - - item->_head = 0; - item->_next = item->_prev = 0; - } - - T front() { return (T)_front->_item; } - T back() { return (T)_back->_item; } - - void pop_front() { - assert(!empty()); - remove(_front); - } - void pop_back() { - assert(!empty()); - remove(_back); - } - - class iterator { - private: - item *cur; - public: - iterator(item *i = 0) : cur(i) {} - T operator*() { return (T)cur->_item; } - iterator& operator++() { - assert(cur); - cur = cur->_next; - return *this; - 
} - bool end() { return cur == 0; } - }; - - iterator begin() { return iterator(_front); } - iterator end() { return iterator(NULL); } -}; - - -#endif diff --git a/trunk/ceph/jobs/alc.tp b/trunk/ceph/jobs/alc.tp deleted file mode 100644 index c600850c54be0..0000000000000 --- a/trunk/ceph/jobs/alc.tp +++ /dev/null @@ -1,38 +0,0 @@ -#PSUB -s /bin/bash # Sets your shell in batch -#PSUB -c alc # Where to run the job - -#PSUB -eo # Send std error & std out to the same file - -#PSUB -ln $NUM # Number of nodes to use -#PSUB -g $NUM # Total Number of tasks to use -#PSUB -cpn 1 # cpus per node - -####PSUB -c 1024Mb # memory limit -#PSUB -lc 1500 # Core file size per process -#PSUB -nr # Do not automatically resubmit job -#PSUB -tM 20m # Select time limit. The default time limit - # is only 30 minutes! Time can be HH:MM:SS or HH:MM - -#PSUB -o $CWD/$OUT # filename for output - -# Put your commands here. Remember to 'cd' to the appropriate -# directory, because the job will initially be in your home directory. -# To run a parallel job, you need to use the srun. 
- - - -echo job $PSUB_JOBID nodes $NUM name $NAME - -# environment -cd $CWD -export LD_LIBRARY_PATH=/usr/lib/mpi/mpi_gnu/lib - -# create fakestore dirs -srun -l -N $NUM -ppbatch bash -c "test -d tmp/osddata || mkdir tmp/osddata || echo cant make osddata ; uptime" - -# go -srun -l -N $NUM -ppbatch $CMD && touch $DONE - -# clean up fakestore -srun -l -N $NUM -ppbatch bash -c 'uptime ; rm -r tmp/osddata/*' - diff --git a/trunk/ceph/jobs/alcdat/makedirs b/trunk/ceph/jobs/alcdat/makedirs deleted file mode 100644 index af5a098a254c9..0000000000000 --- a/trunk/ceph/jobs/alcdat/makedirs +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192], - 'nummds' => [1, 2, 8, 16, 32, 48, 64, 80, 96, 112, 128],#144, 160, 192, 208], - - 'cper' => [15,20], - '_dep' => [ 'cnode' => '$nummds',# / 4 + 1', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds > 1 ? $nummds:2', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'mds_bal_rep' => 10000, # none of that! - 'mds_decay_halflife' => 30, - - 'mds_bal_interval' => 45, - 'mds_bal_max' => [2], - - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 100, - 'end' => 300, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 4, - - # --meta_log_layout_scount 32 --meta_log_layout_ssize 256 - # --osd_pg_layout linear - 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/trunk/ceph/jobs/alcdat/makedirs.big b/trunk/ceph/jobs/alcdat/makedirs.big deleted file mode 100644 index c67b2b93dd742..0000000000000 --- a/trunk/ceph/jobs/alcdat/makedirs.big +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192], - 'nummds' => [160, 200],#[1, 2, 8, 16, 32, 48, 64, 80, 96, 112, 128],#144, 160, 192, 208], - - 'cper' => [15,20], - '_dep' => [ 'cnode' => '40',#$nummds',# / 4 + 1', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds * .8', - 'n' => '415'],#1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'mds_bal_rep' => 10000, # none of that! - 'mds_decay_halflife' => 30, - - 'mds_bal_interval' => 45, - 'mds_bal_max' => 2, - - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 100, - 'end' => 300, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 4, - - # --meta_log_layout_scount 32 --meta_log_layout_ssize 256 - # --osd_pg_layout linear - 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/trunk/ceph/jobs/alcdat/makedirs.tput b/trunk/ceph/jobs/alcdat/makedirs.tput deleted file mode 100644 index 8dd5ae4c47d8c..0000000000000 --- a/trunk/ceph/jobs/alcdat/makedirs.tput +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192], - 'nummds' => [4, 16, 64],#[1, 16, 64, 128],#144, 160, 192, 208], - - #'cper' => [2, 5, 7, 10, 13, 16, 20, 30, 40, 50, 100, 150], - 'cper' => [13, 30, 40], # just for final run... - '_dep' => [ 'cnode' => '$nummds',# / 4 + 1', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'mds_bal_rep' => 10000, # none of that! - 'mds_decay_halflife' => 30, - - 'mds_bal_interval' => 45, - 'mds_bal_max' => 2, - - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 100, - 'end' => 300, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 4, - - # --meta_log_layout_scount 32 --meta_log_layout_ssize 256 - # --osd_pg_layout linear - 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - 'comb' => { - 'x' => 'cper',#nummds', - 'vars' => [ 'mds.req', 'cl.lat' ] - } -}; diff --git a/trunk/ceph/jobs/alcdat/makefiles.shared b/trunk/ceph/jobs/alcdat/makefiles.shared deleted file mode 100644 index ab96702c73289..0000000000000 --- a/trunk/ceph/jobs/alcdat/makefiles.shared +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => [1, 2, 4, 8, 16, 32, 64, 96, 128], #2, 4, 8, 16, 32, 48, 64, 80, 96], - - 'cper' => [25, 50, 100, 150],# 100, 150, 200], - - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - - 'mds_bal_hash_wr' => 1000, - - 'until' => 180, # --syn until $n ... when to stop clients - 'kill_after' => 250, - 'start' => 30, - 'end' => 180, - - 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60 --syn makefiles 100000 1000 0', - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req', 'cl.lat' ] - } -}; diff --git a/trunk/ceph/jobs/alcdat/openshared b/trunk/ceph/jobs/alcdat/openshared deleted file mode 100644 index 5ed7ba95894b3..0000000000000 --- a/trunk/ceph/jobs/alcdat/openshared +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => [1, 4, 16, 64, 128, 192 ], - - 'cper' => [10, 50, 100, 150], - '_dep' => [ 'cnode' => '$nummds',# > 30 ? 30:$nummds', - 'numclient' => '$nummds*$cper', - 'numosd' => '$nummds > 30 ? 
30:$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - 'fs' => 'ebofs', - - 'mds_bal_interval' => 10000, - 'mds_bal_hash_wr' => 1000, - - 'until' => 120, # --syn until $n ... when to stop clients - 'kill_after' => 180, - 'start' => 10, - 'end' => 120, - - 'custom' => '--tcp_skip_rank0 --debug_mds_balancer 10 --mds_shutdown_check 60 --syn only 0 --syn createshared 10 --syn sleep 5 --syn openshared 10 10000', - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req', 'cl.lat' ] - } -}; diff --git a/trunk/ceph/jobs/alcdat/ossh.include b/trunk/ceph/jobs/alcdat/ossh.include deleted file mode 100644 index c9a368ba5c60f..0000000000000 --- a/trunk/ceph/jobs/alcdat/ossh.include +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 10, - - #'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - #'nummds' => [1, 2, 4, 6, 7], # googoo - 'nummds' => [ 2, 4, 8, 16, 32, 48, 64, 80, 96, 128 ], - - #'trace' => ['make.lib', 'make.include'], - - 'mds_bal_interval' => 45, - 'mds_bal_max' => 2,#6, #[ 2,4,6 ], - 'mds_decay_halflife' => 30, - 'mds_bal_rep' => 1500, - 'mds_bal_hash_rd' => 100000, - - 'cper' => [15, 20],#25, 50, 100], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ], - #'cper' => 125, #[30, 50, 75, 100, 125, 150], #50, #[10,50,100],# [ 50, 75, 100 ], - - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - 'custom' => '--tcp_skip_rank0 --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.include 1 --syn sleep 30 --syn trace traces/openssh/make.include 1000', - - # parameters - 'fs' => 'ebofs', - - #'until' => 500, - #'kill_after' => 600, - #'start' => 200, - #'end' => 500, - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 200, - 'end' => 300, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/trunk/ceph/jobs/alcdat/ossh.include.big b/trunk/ceph/jobs/alcdat/ossh.include.big deleted file mode 100644 index b92895a53a763..0000000000000 --- a/trunk/ceph/jobs/alcdat/ossh.include.big +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 10, - - #'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - #'nummds' => [1, 2, 4, 6, 7], # googoo - #'nummds' => [ 2, 4, 8, 16, 32, 48, 64, 80, 96, 128 ], - 'nummds' => [160,200], - - #'trace' => ['make.lib', 'make.include'], - - 'mds_bal_interval' => 45, - 'mds_bal_max' => 2,#6, #[ 2,4,6 ], - 'mds_decay_halflife' => 30, - 'mds_bal_rep' => 1500, - 'mds_bal_hash_rd' => 100000, - - 'cper' => [25, 50], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ], - #'cper' => 125, #[30, 50, 75, 100, 125, 150], #50, #[10,50,100],# [ 50, 75, 100 ], - - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds * .6', - 'n' => '415'],#1 + $cnode + $nummds + $numosd' ], - - 'custom' => '--tcp_skip_rank0 --tcp_overlay_clients --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.include 1 --syn sleep 30 --syn trace traces/openssh/make.include 1000', - - # parameters - 'fs' => 'ebofs', - - #'until' => 500, - #'kill_after' => 600, - #'start' => 200, - #'end' => 500, - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 200, - 'end' => 300, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/trunk/ceph/jobs/alcdat/ossh.lib b/trunk/ceph/jobs/alcdat/ossh.lib deleted file mode 100644 index 73372866f051f..0000000000000 --- a/trunk/ceph/jobs/alcdat/ossh.lib +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 10, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - 'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - - #'nummds' => [1, 2, 4, 6, 7], # googoo - #'trace' => ['make.lib', 'make.include'], - - 'mds_bal_interval' => 90, #[30, 60, 90], #$[60,90], #[60,90],#[30, 60, 90], - #'mds_bal_max' => [4, 10],#6,#[2,4,6,8], - - 'mds_decay_halflife' => 30, - 'mds_bal_rep' => 1500, - 'cper' => [10, 16], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ], - - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - - 'custom' => '--tcp_skip_rank0 --debug_mds_balancer 1 --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.lib 1 --syn sleep 10 --syn trace traces/openssh/make.lib 1000', - - # parameters - #'fs' => ['fakestore'], - 'fs' => 'ebofs', - - #'until' => 500, - #'kill_after' => 600, - #'start' => 200, - #'end' => 500, - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 150, - 'end' => 300, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/trunk/ceph/jobs/alcdat/ossh.lib.big b/trunk/ceph/jobs/alcdat/ossh.lib.big deleted file mode 100644 index b9e0dd1ff68cd..0000000000000 --- a/trunk/ceph/jobs/alcdat/ossh.lib.big +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 10, - - #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - #'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], - 'nummds' => [160,200], - - #'nummds' => [1, 2, 4, 6, 7], # googoo - #'trace' => ['make.lib', 'make.include'], - - 'mds_bal_interval' => 90, #[30, 60, 90], #$[60,90], #[60,90],#[30, 60, 90], - #'mds_bal_max' => [4, 10],#6,#[2,4,6,8], - - 'mds_decay_halflife' => 30, - 'mds_bal_rep' => 1500, - 'cper' => [25, 50, 100], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ], - - '_dep' => [ 'cnode' => 0,#'30', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds * .6', - 'n' => '415'],#'1 + $cnode + $nummds + $numosd' ], - - - 'custom' => '--tcp_skip_rank0 --debug_mds_balancer 1 --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.lib 1 --syn sleep 10 --syn trace traces/openssh/make.lib 1000', - - # parameters - #'fs' => ['fakestore'], - 'fs' => 'ebofs', - - #'until' => 500, - #'kill_after' => 600, - #'start' => 200, - #'end' => 500, - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - 'start' => 150, - 'end' => 300, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/trunk/ceph/jobs/alcdat/striping b/trunk/ceph/jobs/alcdat/striping deleted file mode 100644 index de71828d12bde..0000000000000 --- a/trunk/ceph/jobs/alcdat/striping +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => 10, - - 'cnode' => 10, - 'cper' => [ 10, 25, 50, 100 ], - - '_dep' => [ 'numclient' => '$cper * $cnode', - 'n' => '1 + $cnode + $nummds + $numosd', - 'file_layout_osize' => '$writefile_size' ], - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'until' => 160, # --syn until $n ... when to stop clients - 'kill_after' => 200, - 'start' => 100, - 'end' => 160, - - 'writefile' => 1, - 'writefile_size' => [ -# 4*1024*1024, - 1024*1024 ], -# 256*1024, -# 64*1024 - 'writefile_mb' => 100000, - - 'osd_pg_bits' => 10,#16, - #'osd_pg_bits' => [ 16, 20 ], - - #'osd_object_layout' => [ 'hash', 'hashino', 'linear' ], - 'osd_pg_layout' => [ 'crush', -# 'hash', - 'linear' ], - - 'custom' => '--tcp_skip_rank0 --file_layout_num_rep 1 --mds_shutdown_check 60', - - 'comb' => { - 'x' => 'cper',#writefile_size', - 'vars' => [ 'osd.c_wrb', 'osd.c_wr' ], - } -}; diff --git a/trunk/ceph/jobs/example b/trunk/ceph/jobs/example deleted file mode 100644 index 802a8b66e6332..0000000000000 --- a/trunk/ceph/jobs/example +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/perl -# hi there -{ - # startup - 'n' => 30, # number of mpi nodes - 'sleep' => 3, # seconds to sleep between runs (so you have time to control-c out) - 'nummds' => 1, - 'numosd' => 6, - 'numclient' => 100, - - 'until' => 100, # --syn until $n ... synthetic client will stop itself after this many seconds. 
- 'kill_after' => 300, # seconds before everything commits suicide (in case something hangs) - - # stuff i want to vary - # here's a simple example: - - # do --syn writefile command - 'writefile' => 1, - # and very the write size - 'writefile_size' => [ # vary -# 2048*1024, - 1024*1024, - 512*1024, - 256*1024, - 128*1024, - 64*1024, - 48*1024, - 32*1024, - 28*1024, - 24*1024, - 16*1024, - 12*1024, - 8*1024, - 4096, -# 256, -# 16, -# 1 - ], - 'writefile_mb' => 1000, # each client shoudl write 1GB (or more likely, keep going until time runs out) - - 'file_layout_num_rep'=> [1,2], # also vary the replication level - - # pass some other random things to newsyn - 'custom' => '--', - - # for final summation (script/sum.pl) - # specify time period to look at the results - 'start' => 30, # skip first 30 seconds, so that caches are full etc. - 'end' => 90, # go for 60 seconds - - # what should i parse/plot? - 'comb' => { - 'x' => 'writefile_size', - 'vars' => [ 'osd.c_wrb', 'osd.r_wrb' ], - } -}; diff --git a/trunk/ceph/jobs/mds/log_striping b/trunk/ceph/jobs/mds/log_striping deleted file mode 100644 index 46242cdda4f00..0000000000000 --- a/trunk/ceph/jobs/mds/log_striping +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - 'kill_after' => 300, - - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => 100, - 'n' => 16, - - # parameters - 'fs' => ['ebofs','fakestore'], - 'meta_log_ssize' => [ 128, 256, 1024, 1 << 15, 1 << 20 ], - 'meta_log_scount' => 4,#[ 1, 2, 4, 8 ], - - 'until' => 200, # --syn until $n ... 
when to stop clients - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 4, - - 'custom' => '--tcp_skip_rank0', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - 'start' => 100, - 'end' => 550, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/trunk/ceph/jobs/mds/makedir_lat b/trunk/ceph/jobs/mds/makedir_lat deleted file mode 100644 index 63374f52a36c0..0000000000000 --- a/trunk/ceph/jobs/mds/makedir_lat +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => [1],#, 40, 80, 160 ], - 'n' => 20, - - 'fs' => 'ebofs', - - 'start' => 20, - 'end' => 40, - 'until' => 40, - 'kill_after' => 60, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 5, - - 'mds_local_osd' => [ 0, 1 ], - 'meta_log_layout_num_rep' => [ 0, 1, 2, 3, 4], - - 'custom' => '--tcp_skip_rank0', - - 'comb' => { - 'x' => 'meta_log_layout_num_rep', - 'vars' => [ 'mds.log.lat', 'cl.lat', 'osd.rlat' ] - } -}; diff --git a/trunk/ceph/jobs/mds/makedirs b/trunk/ceph/jobs/mds/makedirs deleted file mode 100644 index 4ca42d72fa37e..0000000000000 --- a/trunk/ceph/jobs/mds/makedirs +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - '_psub' => 'jobs/alc.tp', - - 'sleep' => 3, - - 'nummds' => [1, 2, 4, 6, 8, 12, 16, 24, 32, 40, 48, 64], - - 'cper' => 50, - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$cnode * $cper', - 'numosd' => '$nummds * 2', - 'n' => '1 + $cnode + $nummds + $numosd' ], - - # parameters - #'fs' => 'ebofs', - 'fs' => 'fakestore', - - 'until' => 300, # --syn until $n ... 
when to stop clients - 'kill_after' => 400, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 3, - - 'custom' => '--tcp_skip_rank0', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - 'start' => 100, - 'end' => 550, - - 'comb' => { - 'x' => 'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/trunk/ceph/jobs/mds/opensshlib b/trunk/ceph/jobs/mds/opensshlib deleted file mode 100644 index d8b61ae52c655..0000000000000 --- a/trunk/ceph/jobs/mds/opensshlib +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => [1, 2, 4, 7], # googoo - #'nummds' => [1, 2, 4, 6, 8, 12, 16, 24, 32, 40, 48, 64], # alc - - - # parameters - 'fs' => 'ebofs', - #'fs' => 'fakestore', - - 'until' => 300, # --syn until $n ... when to stop clients - 'kill_after' => 400, - 'start' => 150, - 'end' => 300, - - 'mds_bal_interval' => 90,#[60, 90], - #'mds_bal_max' => [3,4,5], - 'mds_bal_max' => 4, - 'mds_decay_halflife' => 30,#[15, 25, 30, 45, 60], - 'mds_bal_rep' => 1500,#[1000, 1500, 2000], - - 'decay_hl' => 100,#[ 25, 50, 100, 150 ], - - 'cper' => 100, #[50, 75, 100, 125, 150, 200], - '_dep' => [ 'cnode' => '$nummds', - 'numclient' => '$nummds * $cper', - 'numosd' => '$nummds * 2', - 'n' => '1 + $cnode + $nummds + $numosd', - 'mds_bal_rep' => '$mds_decay_halflife * $decay_hl'], - - 'custom' => '--tcp_skip_rank0 --syn only 0 --syn trace traces/openssh/untar.lib 1 --syn sleep 10 --syn randomsleep 30 --syn trace traces/openssh/make.lib 100 --debug_mds_balancer 1 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - - 'comb' => { - 'x' => 'nummds',#decay_hl',#'nummds', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/trunk/ceph/jobs/meta1 b/trunk/ceph/jobs/meta1 deleted file mode 100644 index 743212f1c3009..0000000000000 --- a/trunk/ceph/jobs/meta1 +++ 
/dev/null @@ -1,19 +0,0 @@ -#!/bin/sh - -# makedirs for 300 seconds -# first bit in memory -# second bit is commiting from journal too -# then walk fs for 300 seconds -# this should all be in memory. - -JOB="meta1" -ARGS="--numosd 10 --fullmkfs --syn until 180 --syn makedirs 10 10 4 --syn until 360 --syn repeatwalk --mds_bal_max 1 --osd_fsync 0 --mds_log_max_len 200000 --mds_cache_size 500000" - -#rm core* ; make tcpsyn && mpiexec -l -n 17 ./tcpsyn $ARGS --nummds 1 --log_name $JOB/1 --numclient 25 > log/$JOB/o.1 -#rm core* ; make tcpsyn && mpiexec -l -n 18 ./tcpsyn $ARGS --nummds 2 --log_name $JOB/2 --numclient 50 > log/$JOB/o.2 -#rm core* ; make tcpsyn && mpiexec -l -n 20 ./tcpsyn $ARGS --nummds 4 --log_name $JOB/4 --numclient 100 > log/$JOB/o.4 -#rm core* ; make tcpsyn && mpiexec -l -n 24 ./tcpsyn $ARGS --nummds 8 --log_name $JOB/8 --numclient 200 > log/$JOB/o.8 -#rm core* ; make tcpsyn && mpiexec -l -n 28 ./tcpsyn $ARGS --nummds 12 --log_name $JOB/12 --numclient 300 > log/$JOB/o.12 -rm core* ; make tcpsyn && mpiexec -l -n 32 ./tcpsyn $ARGS --nummds 16 --log_name $JOB/16 --numclient 300 > log/$JOB/o.16 - - diff --git a/trunk/ceph/jobs/meta1.proc.sh b/trunk/ceph/jobs/meta1.proc.sh deleted file mode 100755 index 616acbefff619..0000000000000 --- a/trunk/ceph/jobs/meta1.proc.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/sh - -for d in 1 2 4 8 12 -do - echo $d - cd $d - ../../../script/sum.pl mds? mds?? > mds.sum - ../../../script/sum.pl -avg mds? mds?? > mds.avg - - ../../../script/sum.pl -start 90 -end 180 mds? mds?? > mds.sum.makedirs - ../../../script/sum.pl -start 200 -end 300 mds? mds?? > mds.sum.walk - - cd .. 
-done diff --git a/trunk/ceph/jobs/osd/ebofs b/trunk/ceph/jobs/osd/ebofs deleted file mode 100644 index 5d11523f6f832..0000000000000 --- a/trunk/ceph/jobs/osd/ebofs +++ /dev/null @@ -1,51 +0,0 @@ -# hi there -{ - # startup - 'n' => 30, # mpi nodes - 'sleep' => 3, # seconds between runs - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => 100,#[10, 50, 100, 200, 400], - -'kill_after' => 200, - - # parameters - 'fs' => 'ebofs',#[ -# 'obfs', -# 'fakestore', -# 'ebofs' -# ], - 'until' => 100, # --syn until $n ... when to stop clients - 'writefile' => 1, - 'writefile_size' => [ -# 2560000, - 1024000, - 262144, -# 131072, -# 98304, - 65536, -# 16384, -# 4096, - 256, -# 16, -# 1 - ], - 'writefile_mb' => 1000, - - 'ebofs_idle_commit_ms' => [ 100, 500 ], - 'ebofs_abp_max_alloc' => [ 4096*16, 4096*64 ], - -# 'custom' => '--tcp_skip_rank0',# --osd_maxthreads 0', - 'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - 'start' => 30, - 'end' => 90, - -'comb' => { - 'x' => 'writefile_size', - 'vars' => [ 'osd.c_wrb' ], -# 'maptitle' => { 'osd_object_layout=' => '', -# ',osd_pg_layout=' => ' + '} - } -}; diff --git a/trunk/ceph/jobs/osd/mds_log b/trunk/ceph/jobs/osd/mds_log deleted file mode 100644 index 0f99f6998dcfc..0000000000000 --- a/trunk/ceph/jobs/osd/mds_log +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - #'_psub' => 'jobs/alc.tp', - 'sleep' => 3, - - 'nummds' => 1, - 'numclient' => [5, 10, 15, 25, 50, 100, 200, 300, 400], - #'numclient' => [ 50, 100, 200 ], - 'numosd' => [2,4],#[ 4, 8, 12, 16, 20, 24 ], - 'n' => 12, - - # parameters - 'fs' => 'fakestore',#['ebofs', 'fakestore','obfs'], - #'fs' => 'ebofs', - #'ebofs_commit_ms' => [ 1000, 5000 ], - #'osd_maxthreads' => [ 0, 1, 2, 4, 8 ], - - 'until' => 100, # --syn until $n ... 
when to stop clients - 'kill_after' => 300, - 'start' => 20, - 'end' => 90, - - 'makedirs' => 1, - 'makedirs_dirs' => 10, - 'makedirs_files' => 10, - 'makedirs_depth' => 3, - - - #'meta_log_layout_ssize' => [256, 512, 1024, 4096, 16384, 65536, 262400], - #'meta_log_layout_scount' => [2, 4, 8], - #'meta_log_layout_num_rep' => [1, 2], - #'meta_log_layout_num_rep' => 1, - - 'custom' => '--tcp_skip_rank0 --mds_shutdown_check 60', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - 'comb' => { - 'x' => 'numclient',#'meta_log_layout_ssize', - 'vars' => [ 'mds.req' ] - } -}; diff --git a/trunk/ceph/jobs/osd/osd_threads b/trunk/ceph/jobs/osd/osd_threads deleted file mode 100644 index ef271f9e88710..0000000000000 --- a/trunk/ceph/jobs/osd/osd_threads +++ /dev/null @@ -1,33 +0,0 @@ -# hi there -{ - # startup - 'n' => 30, # mpi nodes - 'sleep' => 10, # seconds between runs - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => 50, - - # parameters - 'fs' => [ -# 'obfs', - 'fakestore', - 'ebofs' - ], - 'until' => 100, # --syn until $n ... 
when to stop clients - 'writefile' => 1, - 'writefile_size' => [ - 1024000, - 131072, - 65536, - 16 - ], - 'writefile_mb' => 1000, - - 'osd_maxthreads' => [0, 1, 2, 4, 8], - - 'custom' => '--tcp_skip_rank0', - - # for final summation (script/sum.pl) - 'start' => 30, - 'end' => 90 -}; diff --git a/trunk/ceph/jobs/osd/striping b/trunk/ceph/jobs/osd/striping deleted file mode 100644 index ea8cabe643274..0000000000000 --- a/trunk/ceph/jobs/osd/striping +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/perl -# hi there -{ - # startup - #'n' => 28, # mpi nodes - - 'sleep' => 3, # seconds between runs - 'nummds' => 1, - - 'numosd' => [2,3,4,5,6,7,8,10,12], #[6, 8, 10, 12, 16], - 'numosd' => [14], - #'cper' => [4, 5, 6, 7, 8, 9, 10, 11, 12], #[1, 4, 6, 8, 16, 32, 64], - #'cper' => [4, 6, 8, 10, 12, 16, 24, 32 ], #[1, 4, 6, 8, 16, 32, 64], - 'cper' => [30], - - '_dep' => [ 'cnode' => '$numosd', - 'numclient' => '$cnode * $cper', - 'n' => 38],#'$nummds + $numosd + $cnode'], - #'numclient' => [5, 10, 20, 50, 75, 100, 150 ], - - 'start' => 30, - 'end' => 90, - 'until' => 100, # --syn until $n ... 
when to stop clients - 'kill_after' => 260, - - # parameters - 'fs' => 'ebofs', - 'writefile' => 1, - - 'writefile_size' => [# 4096, - # 16*1024, - # 64*1024, - # 256*1024, - 1024*1024 ], -# 'writefile_size' => [ -# 2048*1024, -# 1048576, -# 512*1024, -# 262144, -# 65536, -# 16384 -# ], - 'writefile_mb' => 1000, - - 'file_layout_num_rep'=> [1,2,3], - - 'osd_pg_bits' => 12,#[6, 8, 10, 12, 14], - - 'osd_object_layout' => [ 'hashino' ],#'hash', 'hashino', 'linear' ], - 'osd_pg_layout' => [ 'crush', 'linear' ],#, 'linear'],#, 'hash' ],#, 'linear' ],#, 'hash' ], - - #'custom' => '--tcp_skip_rank0', # --osd_maxthreads 0', - #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', - - # for final summation (script/sum.pl) - - 'comb' => { - 'x' => 'numosd',#'writefile_size', - 'vars' => [ 'osd.c_wrb', 'cl.wrlat' ], -# 'maptitle' => { 'osd_object_layout=' => '', -# ',osd_pg_layout=' => ' + '} - } -}; - - -=item some googoo notes - -for 1mb 1x writes, - - with numosd=6, min cper=6 to saturate (cper_saturate) - googoo saturates at numosd=8. (osd_saturate) - - -> so, numosd=6 or 7 is a safe size! 
- - - - -=cut diff --git a/trunk/ceph/jobs/osd/wr_lat2 b/trunk/ceph/jobs/osd/wr_lat2 deleted file mode 100644 index 47053dd61f3ab..0000000000000 --- a/trunk/ceph/jobs/osd/wr_lat2 +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => [12], - 'numclient' => [1],#, 40, 80, 160 ], - 'n' => 16, - - 'fs' => 'ebofs', - - 'start' => 10, - 'end' => 40, - 'until' => 40, - 'kill_after' => 90, - - 'writefile' => 1, - 'writefile_size' => [4096, - 8*1024, - 16*1024, - 32*1024, - 64*1024, - 128*1024, - 256*1024, - 512*1024, - 1024*1024], - 'writefile_mb' => 10000, - - #'tcp_multi_out' => [0,1], - -# 'mds_local_osd' => [ 0, 1 ], - 'file_layout_num_rep' => [1,2,3],#, 2, 3, 4], - - 'client_oc' => [0,1], - - 'custom' => '--tcp_skip_rank0', - - 'comb' => { - 'x' => 'writefile_size',#'file_layout_num_rep', - 'vars' => [ 'osd.c_wrb','cl.wrlat' ] - } -}; diff --git a/trunk/ceph/jobs/osd/write_sizes b/trunk/ceph/jobs/osd/write_sizes deleted file mode 100644 index 57369f3a97c50..0000000000000 --- a/trunk/ceph/jobs/osd/write_sizes +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/perl -# hi there -{ - # startup - 'n' => 30, # mpi nodes - 'sleep' => 3, # seconds between runs - 'nummds' => 1, - 'numosd' => 6, - 'numclient' => 100,#[25,50,100,300],#100,#[10, 50, 100, 200, 400], - - 'until' => 100, # --syn until $n ... 
when to stop clients - 'kill_after' => 300, - - # parameters - 'fs' => [ -# 'obfs', - 'fakestore', -# 'ebofs' - ], - 'writefile' => 1, - 'writefile_size' => [ -# 2048*1024, - 1024*1024, - 512*1024, - 256*1024, - 128*1024, - 64*1024, - 48*1024, - 32*1024, - 28*1024, - 24*1024, - 16*1024, - 12*1024, - 8*1024, - 4096, -# 256, -# 16, -# 1 - ], - 'writefile_mb' => 1000, - - 'file_layout_num_rep'=> 1,#[1,2], - - -# 'ebofs_idle_commit_ms' => [ 100, 500 ], -# 'ebofs_abp_max_alloc' => [ 4096*16, 4096*64 ], - - 'custom' => '--debug_after 110 --debug_mds 15 --debug 5 --mds_shutdown_check 60', - - # for final summation (script/sum.pl) - 'start' => 30, - 'end' => 90, - - 'comb' => { - 'x' => 'writefile_size', - 'vars' => [ 'osd.c_wrb' ], -# 'maptitle' => { 'osd_object_layout=' => '', -# ',osd_pg_layout=' => ' + '} - } -}; diff --git a/trunk/ceph/jobs/rados/map_dist b/trunk/ceph/jobs/rados/map_dist deleted file mode 100644 index 39f16daa1cdc2..0000000000000 --- a/trunk/ceph/jobs/rados/map_dist +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'osdbits' => [6,7,8],#,9],10,11], - 'pgperbits' => [3],#,4,5],#[4,6,8], - - 'nummds' => 1, - - '_dep' => [ 'numosd' => '1 << $osdbits', - 'osd_pg_bits' => '$pgperbits + $osdbits', - 'n' => '3 + $numosd / 32'], - 'numclient' => 0, - - 'fake_osdmap_updates' => [30], - - 'fs' => 'ebofs', - - 'start' => 30, - 'end' => 300, - 'kill_after' => 300, - - 'custom' => '--bdev_lock 0 --ms_stripe_osds --osd_maxthreads 0', - #'custom' => '--tcp_skip_rank0', - - 'comb' => { - 'x' => 'osdbits', - 'vars' => [ 'osd.sum=mapi', 'osd.sum=mapidup', 'osd.numpg', 'osd.pingset' ] - } -}; diff --git a/trunk/ceph/jobs/rados/rep_lat b/trunk/ceph/jobs/rados/rep_lat deleted file mode 100644 index 3f5ab0c8a7d87..0000000000000 --- a/trunk/ceph/jobs/rados/rep_lat +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - 'sleep' => 3, - - 'nummds' => 1, - 'numosd' => 8, #[6], - 'numclient' => 1,#, 40, 80, 160 ], - 'n' => 10, - - 
'fs' => 'ebofs', - - 'start' => 10, - 'end' => 40, - 'until' => 40, - 'kill_after' => 45, - - 'writefile' => 1, - 'writefile_size' => [4096, -# 8*1024, -# 16*1024, -# 32*1024, - 64*1024, -# 128*1024, -# 256*1024, -# 512*1024, -# 1024*1024 -], - 'writefile_mb' => 10000, - - 'osd_rep' => [0,1,2], - - 'file_layout_num_rep' => [1,2,3,4,5,6],#, 2, 3, 4], - - 'osd_pg_bits' => 4, - 'custom' => '--osd_max_rep 8', - - 'comb' => { - 'x' => 'file_layout_num_rep', - 'vars' => [ 'cl.wrlat' ] - } -}; diff --git a/trunk/ceph/jobs/runjobsample b/trunk/ceph/jobs/runjobsample deleted file mode 100644 index 590be207771b2..0000000000000 --- a/trunk/ceph/jobs/runjobsample +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/perl - -# hi there -{ - '_sleep' => 3, - - 'nummds' => 1, - 'numosd' => 16, #[8],#10,14,16], - 'numclient' => 32,#,4,10,20,40], #[10*16], - '_n' => 32, - - '_start' => 15, - '_end' => 45, - '_kill_after' => 190, - - 'osd_pg_bits' => [4, 6], - 'osd_auto_weight' => [0,1], - 'file_layout_pg_size' => [1,2], - - '_custom' => '--syn createobjects 1000000 1048576 2', - - '_comb' => { - 'x' => 'osd_pg_bits', - 'vars' => [ 'osd.c_wrb' ] - } -}; diff --git a/trunk/ceph/mds/Anchor.h b/trunk/ceph/mds/Anchor.h deleted file mode 100644 index a55a07dd3068e..0000000000000 --- a/trunk/ceph/mds/Anchor.h +++ /dev/null @@ -1,108 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __ANCHOR_H -#define __ANCHOR_H - -#include -using std::string; - -#include "include/types.h" -#include "mdstypes.h" -#include "include/buffer.h" - - -// anchor ops -#define ANCHOR_OP_LOOKUP 1 -#define ANCHOR_OP_LOOKUP_REPLY -2 - -#define ANCHOR_OP_CREATE_PREPARE 11 -#define ANCHOR_OP_CREATE_AGREE -12 - -#define ANCHOR_OP_DESTROY_PREPARE 21 -#define ANCHOR_OP_DESTROY_AGREE -22 - -#define ANCHOR_OP_UPDATE_PREPARE 31 -#define ANCHOR_OP_UPDATE_AGREE -32 - -#define ANCHOR_OP_COMMIT 41 -#define ANCHOR_OP_ACK -42 -#define ANCHOR_OP_ROLLBACK 43 - - - -inline const char* get_anchor_opname(int o) { - switch (o) { - case ANCHOR_OP_LOOKUP: return "lookup"; - case ANCHOR_OP_LOOKUP_REPLY: return "lookup_reply"; - - case ANCHOR_OP_CREATE_PREPARE: return "create_prepare"; - case ANCHOR_OP_CREATE_AGREE: return "create_agree"; - case ANCHOR_OP_DESTROY_PREPARE: return "destroy_prepare"; - case ANCHOR_OP_DESTROY_AGREE: return "destroy_agree"; - case ANCHOR_OP_UPDATE_PREPARE: return "update_prepare"; - case ANCHOR_OP_UPDATE_AGREE: return "update_agree"; - - case ANCHOR_OP_COMMIT: return "commit"; - case ANCHOR_OP_ACK: return "ack"; - case ANCHOR_OP_ROLLBACK: return "rollback"; - default: assert(0); return 0; - } -} - - -// identifies a anchor table mutation - - - -// anchor type - -class Anchor { -public: - inodeno_t ino; // anchored ino - dirfrag_t dirfrag; // containing dirfrag - //string ref_dn; // referring dentry - int nref; // reference count - - Anchor() {} - Anchor(inodeno_t i, dirfrag_t df, - //string& rd, - int nr=0) : - ino(i), dirfrag(df), - //ref_dn(rd), - nref(nr) { } - - void _encode(bufferlist &bl) { - bl.append((char*)&ino, sizeof(ino)); - bl.append((char*)&dirfrag, sizeof(dirfrag)); - bl.append((char*)&nref, sizeof(nref)); - //::_encode(ref_dn, bl); - } - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - bl.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - bl.copy(off, 
sizeof(nref), (char*)&nref); - off += sizeof(nref); - //::_decode(ref_dn, bl, off); - } -}; - -inline ostream& operator<<(ostream& out, Anchor& a) -{ - return out << "a(" << a.ino << " " << a.dirfrag << " " << a.nref << ")"; -} - -#endif diff --git a/trunk/ceph/mds/AnchorClient.cc b/trunk/ceph/mds/AnchorClient.cc deleted file mode 100644 index 1cc18dc7d8fa4..0000000000000 --- a/trunk/ceph/mds/AnchorClient.cc +++ /dev/null @@ -1,365 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include -using std::cout; -using std::cerr; - -#include "Anchor.h" -#include "AnchorClient.h" -#include "MDSMap.h" - -#include "include/Context.h" -#include "msg/Messenger.h" - -#include "MDS.h" -#include "MDLog.h" -#include "LogSegment.h" - -#include "events/EAnchorClient.h" -#include "messages/MAnchor.h" - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " " << mds->messenger->get_myname() << ".anchorclient " -#define derr(x) if (x <= g_conf.debug_mds) *_derr << dbeginl << g_clock.now() << " " << mds->messenger->get_myname() << ".anchorclient " - - -void AnchorClient::dispatch(Message *m) -{ - switch (m->get_type()) { - case MSG_MDS_ANCHOR: - handle_anchor_reply((MAnchor*)m); - break; - - default: - assert(0); - } -} - -void AnchorClient::handle_anchor_reply(class MAnchor *m) -{ - inodeno_t ino = m->get_ino(); - version_t atid = m->get_atid(); - - dout(10) << "handle_anchor_reply " << *m << dendl; - - switch (m->get_op()) { - - // lookup - case ANCHOR_OP_LOOKUP_REPLY: - assert(pending_lookup.count(ino)); - { - *pending_lookup[ino].trace = m->get_trace(); - 
Context *onfinish = pending_lookup[ino].onfinish; - pending_lookup.erase(ino); - - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } - } - break; - - // prepare -> agree - case ANCHOR_OP_CREATE_AGREE: - if (pending_create_prepare.count(ino)) { - dout(10) << "got create_agree on " << ino << " atid " << atid << dendl; - Context *onfinish = pending_create_prepare[ino].onfinish; - *pending_create_prepare[ino].patid = atid; - pending_create_prepare.erase(ino); - - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } - } - else if (pending_commit.count(atid)) { - dout(10) << "stray create_agree on " << ino - << " atid " << atid - << ", already committing, resending COMMIT" - << dendl; - MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, atid); - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable())); - } - else { - dout(10) << "stray create_agree on " << ino - << " atid " << atid - << ", sending ROLLBACK" - << dendl; - MAnchor *req = new MAnchor(ANCHOR_OP_ROLLBACK, 0, atid); - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable())); - } - break; - - case ANCHOR_OP_DESTROY_AGREE: - if (pending_destroy_prepare.count(ino)) { - dout(10) << "got destroy_agree on " << ino << " atid " << atid << dendl; - Context *onfinish = pending_destroy_prepare[ino].onfinish; - *pending_destroy_prepare[ino].patid = atid; - pending_destroy_prepare.erase(ino); - - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } - } - else if (pending_commit.count(atid)) { - dout(10) << "stray destroy_agree on " << ino - << " atid " << atid - << ", already committing, resending COMMIT" - << dendl; - MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, atid); - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable())); - } - else { - dout(10) << "stray destroy_agree on " << ino - << " atid " << atid - << ", sending ROLLBACK" - << dendl; - MAnchor *req = new 
MAnchor(ANCHOR_OP_ROLLBACK, 0, atid); - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable())); - } - break; - - case ANCHOR_OP_UPDATE_AGREE: - if (pending_update_prepare.count(ino)) { - dout(10) << "got update_agree on " << ino << " atid " << atid << dendl; - Context *onfinish = pending_update_prepare[ino].onfinish; - *pending_update_prepare[ino].patid = atid; - pending_update_prepare.erase(ino); - - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } - } - else if (pending_commit.count(atid)) { - dout(10) << "stray update_agree on " << ino - << " atid " << atid - << ", already committing, resending COMMIT" - << dendl; - MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, atid); - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable())); - } - else { - dout(10) << "stray update_agree on " << ino - << " atid " << atid - << ", sending ROLLBACK" - << dendl; - MAnchor *req = new MAnchor(ANCHOR_OP_ROLLBACK, 0, atid); - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable())); - } - break; - - // commit -> ack - case ANCHOR_OP_ACK: - { - dout(10) << "got ack on atid " << atid << ", logging" << dendl; - - // remove from committing list - assert(pending_commit.count(atid)); - assert(pending_commit[atid]->pending_commit_atids.count(atid)); - - // log ACK. 
- mds->mdlog->submit_entry(new EAnchorClient(ANCHOR_OP_ACK, atid), - new C_LoggedAck(this, atid)); - } - break; - - default: - assert(0); - } - - delete m; -} - - -void AnchorClient::_logged_ack(version_t atid) -{ - dout(10) << "_logged_ack" << dendl; - - assert(pending_commit.count(atid)); - assert(pending_commit[atid]->pending_commit_atids.count(atid)); - - pending_commit[atid]->pending_commit_atids.erase(atid); - pending_commit.erase(atid); - - // kick any waiters (LogSegment trim) - if (ack_waiters.count(atid)) { - dout(15) << "kicking ack waiters on atid " << atid << dendl; - mds->queue_waiters(ack_waiters[atid]); - ack_waiters.erase(atid); - } -} - - -/* - * public async interface - */ - - -/* - * FIXME: we need to be able to resubmit messages if the anchortable mds fails. - */ - - -void AnchorClient::lookup(inodeno_t ino, vector& trace, Context *onfinish) -{ - // send message - MAnchor *req = new MAnchor(ANCHOR_OP_LOOKUP, ino); - - assert(pending_lookup.count(ino) == 0); - pending_lookup[ino].onfinish = onfinish; - pending_lookup[ino].trace = &trace; - - mds->send_message_mds(req, - mds->mdsmap->get_anchortable()); -} - - -// PREPARE - -void AnchorClient::prepare_create(inodeno_t ino, vector& trace, - version_t *patid, Context *onfinish) -{ - dout(10) << "prepare_create " << ino << " " << trace << dendl; - - // send message - MAnchor *req = new MAnchor(ANCHOR_OP_CREATE_PREPARE, ino); - req->set_trace(trace); - - pending_create_prepare[ino].trace = trace; - pending_create_prepare[ino].patid = patid; - pending_create_prepare[ino].onfinish = onfinish; - - mds->send_message_mds(req, - mds->mdsmap->get_anchortable()); -} - -void AnchorClient::prepare_destroy(inodeno_t ino, - version_t *patid, Context *onfinish) -{ - dout(10) << "prepare_destroy " << ino << dendl; - - // send message - MAnchor *req = new MAnchor(ANCHOR_OP_DESTROY_PREPARE, ino); - pending_destroy_prepare[ino].onfinish = onfinish; - pending_destroy_prepare[ino].patid = patid; - 
mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable())); -} - - -void AnchorClient::prepare_update(inodeno_t ino, vector& trace, - version_t *patid, Context *onfinish) -{ - dout(10) << "prepare_update " << ino << " " << trace << dendl; - - // send message - MAnchor *req = new MAnchor(ANCHOR_OP_UPDATE_PREPARE, ino); - req->set_trace(trace); - - pending_update_prepare[ino].trace = trace; - pending_update_prepare[ino].patid = patid; - pending_update_prepare[ino].onfinish = onfinish; - - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable())); -} - - -// COMMIT - -void AnchorClient::commit(version_t atid, LogSegment *ls) -{ - dout(10) << "commit " << atid << dendl; - - assert(pending_commit.count(atid) == 0); - pending_commit[atid] = ls; - ls->pending_commit_atids.insert(atid); - - // send message - MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, atid); - mds->messenger->send_message(req, - mds->mdsmap->get_inst(mds->mdsmap->get_anchortable())); -} - - - -// RECOVERY - -void AnchorClient::finish_recovery() -{ - dout(7) << "finish_recovery" << dendl; - - resend_commits(); -} - -void AnchorClient::resend_commits() -{ - for (map::iterator p = pending_commit.begin(); - p != pending_commit.end(); - ++p) { - dout(10) << "resending commit on " << p->first << dendl; - MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, p->first); - mds->send_message_mds(req, - mds->mdsmap->get_anchortable()); - } -} - -void AnchorClient::resend_prepares(hash_map& prepares, int op) -{ - for (hash_map::iterator p = prepares.begin(); - p != prepares.end(); - p++) { - dout(10) << "resending " << get_anchor_opname(op) << " on " << p->first << dendl; - MAnchor *req = new MAnchor(op, p->first); - req->set_trace(p->second.trace); - mds->send_message_mds(req, - mds->mdsmap->get_anchortable()); - } -} - - -void AnchorClient::handle_mds_recovery(int who) -{ - dout(7) << "handle_mds_recovery mds" << who << dendl; - - if (who != 
mds->mdsmap->get_anchortable()) - return; // do nothing. - - // resend any pending lookups. - for (hash_map::iterator p = pending_lookup.begin(); - p != pending_lookup.end(); - p++) { - dout(10) << "resending lookup on " << p->first << dendl; - mds->send_message_mds(new MAnchor(ANCHOR_OP_LOOKUP, p->first), - mds->mdsmap->get_anchortable()); - } - - // resend any pending prepares. - resend_prepares(pending_create_prepare, ANCHOR_OP_CREATE_PREPARE); - resend_prepares(pending_update_prepare, ANCHOR_OP_UPDATE_PREPARE); - resend_prepares(pending_destroy_prepare, ANCHOR_OP_DESTROY_PREPARE); - - // resend any pending commits. - resend_commits(); -} diff --git a/trunk/ceph/mds/AnchorClient.h b/trunk/ceph/mds/AnchorClient.h deleted file mode 100644 index fd790f39c399d..0000000000000 --- a/trunk/ceph/mds/AnchorClient.h +++ /dev/null @@ -1,107 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __ANCHORCLIENT_H -#define __ANCHORCLIENT_H - -#include -using std::vector; -#include -using __gnu_cxx::hash_map; - -#include "include/types.h" -#include "msg/Dispatcher.h" - -#include "Anchor.h" - -class Context; -class MDS; -class LogSegment; - -class AnchorClient : public Dispatcher { - MDS *mds; - - // lookups - struct _pending_lookup { - vector *trace; - Context *onfinish; - }; - hash_map pending_lookup; - - // prepares - struct _pending_prepare { - vector trace; - Context *onfinish; - version_t *patid; // ptr to atid - }; - hash_map pending_create_prepare; - hash_map pending_destroy_prepare; - hash_map pending_update_prepare; - - // pending commits - map pending_commit; - map > ack_waiters; - - void handle_anchor_reply(class MAnchor *m); - - class C_LoggedAck : public Context { - AnchorClient *ac; - version_t atid; - public: - C_LoggedAck(AnchorClient *a, version_t t) : ac(a), atid(t) {} - void finish(int r) { - ac->_logged_ack(atid); - } - }; - void _logged_ack(version_t atid); - -public: - AnchorClient(MDS *m) : mds(m) {} - - void dispatch(Message *m); - - // async user interface - void lookup(inodeno_t ino, vector& trace, Context *onfinish); - - void prepare_create(inodeno_t ino, vector& trace, version_t *atid, Context *onfinish); - void prepare_destroy(inodeno_t ino, version_t *atid, Context *onfinish); - void prepare_update(inodeno_t ino, vector& trace, version_t *atid, Context *onfinish); - - void commit(version_t atid, LogSegment *ls); - - // for recovery (by other nodes) - void handle_mds_recovery(int mds); // called when someone else recovers - - void resend_commits(); - void resend_prepares(hash_map& prepares, int op); - - // for recovery (by me) - void got_journaled_agree(version_t atid, LogSegment *ls) { - pending_commit[atid] = ls; - } - void got_journaled_ack(version_t atid) { - pending_commit.erase(atid); - } - bool has_committed(version_t atid) { - return pending_commit.count(atid) == 0; - } - void wait_for_ack(version_t 
atid, Context *c) { - ack_waiters[atid].push_back(c); - } - void finish_recovery(); // called when i recover and go active - - -}; - -#endif diff --git a/trunk/ceph/mds/AnchorTable.cc b/trunk/ceph/mds/AnchorTable.cc deleted file mode 100644 index f3c4fb05b772b..0000000000000 --- a/trunk/ceph/mds/AnchorTable.cc +++ /dev/null @@ -1,713 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "AnchorTable.h" -#include "MDS.h" - -#include "osdc/Filer.h" - -#include "msg/Messenger.h" -#include "messages/MAnchor.h" - -#include "common/Clock.h" - -#include "MDLog.h" -#include "events/EAnchor.h" - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " " << mds->messenger->get_myname() << ".anchortable " -#define derr(x) if (x <= g_conf.debug_mds) *_derr << dbeginl << g_clock.now() << " " << mds->messenger->get_myname() << ".anchortable " - - -void AnchorTable::dump() -{ - dout(7) << "dump v " << version << dendl; - for (hash_map::iterator it = anchor_map.begin(); - it != anchor_map.end(); - it++) - dout(15) << "dump " << it->second << dendl; -} - - -/* - * basic updates - */ - -bool AnchorTable::add(inodeno_t ino, dirfrag_t dirfrag) -{ - //dout(17) << "add " << ino << " dirfrag " << dirfrag << dendl; - - // parent should be there - assert(dirfrag.ino < MDS_INO_BASE || // system dirino - anchor_map.count(dirfrag.ino)); // have - - if (anchor_map.count(ino) == 0) { - // new item - anchor_map[ino] = Anchor(ino, dirfrag); - dout(7) << "add added " << anchor_map[ino] << dendl; - return true; - } else { - dout(7) << "add had " << anchor_map[ino] << dendl; 
- return false; - } -} - -void AnchorTable::inc(inodeno_t ino) -{ - dout(7) << "inc " << ino << dendl; - - assert(anchor_map.count(ino)); - - while (1) { - Anchor &anchor = anchor_map[ino]; - anchor.nref++; - - dout(10) << "inc now " << anchor << dendl; - ino = anchor.dirfrag.ino; - - if (ino == 0) break; - if (anchor_map.count(ino) == 0) break; - } -} - -void AnchorTable::dec(inodeno_t ino) -{ - dout(7) << "dec " << ino << dendl; - assert(anchor_map.count(ino)); - - while (true) { - Anchor &anchor = anchor_map[ino]; - anchor.nref--; - - if (anchor.nref == 0) { - dout(10) << "dec removing " << anchor << dendl; - dirfrag_t dirfrag = anchor.dirfrag; - anchor_map.erase(ino); - ino = dirfrag.ino; - } else { - dout(10) << "dec now " << anchor << dendl; - ino = anchor.dirfrag.ino; - } - - if (ino == 0) break; - if (anchor_map.count(ino) == 0) break; - } -} - - -/* - * high level - */ - - -// LOOKUP - -void AnchorTable::handle_lookup(MAnchor *req) -{ - inodeno_t curino = req->get_ino(); - dout(7) << "handle_lookup " << curino << dendl; - - vector trace; - while (true) { - assert(anchor_map.count(curino) == 1); - Anchor &anchor = anchor_map[curino]; - - dout(10) << "handle_lookup adding " << anchor << dendl; - trace.insert(trace.begin(), anchor); // lame FIXME - - if (anchor.dirfrag.ino < MDS_INO_BASE) break; - curino = anchor.dirfrag.ino; - } - - // reply - MAnchor *reply = new MAnchor(ANCHOR_OP_LOOKUP_REPLY, req->get_ino()); - reply->set_trace(trace); - mds->messenger->send_message(reply, req->get_source_inst()); - - delete req; -} - - -// MIDLEVEL - -void AnchorTable::create_prepare(inodeno_t ino, vector& trace, int reqmds) -{ - // make sure trace is in table - for (unsigned i=0; i& trace, int reqmds) -{ - version++; - pending_update[version].first = ino; - pending_update[version].second = trace; - pending_reqmds[version] = reqmds; - //dump(); -} - -void AnchorTable::commit(version_t atid) -{ - if (pending_create.count(atid)) { - dout(7) << "commit " << atid << " create 
" << pending_create[atid] << dendl; - pending_create.erase(atid); - } - - else if (pending_destroy.count(atid)) { - inodeno_t ino = pending_destroy[atid]; - dout(7) << "commit " << atid << " destroy " << ino << dendl; - - dec(ino); // destroy - - pending_destroy.erase(atid); - } - - else if (pending_update.count(atid)) { - inodeno_t ino = pending_update[atid].first; - vector &trace = pending_update[atid].second; - - dout(7) << "commit " << atid << " update " << ino << dendl; - - // remove old - dec(ino); - - // add new - for (unsigned i=0; i_create_prepare_logged(req, atid); - } -}; - -void AnchorTable::handle_create_prepare(MAnchor *req) -{ - inodeno_t ino = req->get_ino(); - vector& trace = req->get_trace(); - - dout(7) << "handle_create_prepare " << ino << dendl; - - create_prepare(ino, trace, req->get_source().num()); - - // log it - EAnchor *le = new EAnchor(ANCHOR_OP_CREATE_PREPARE, ino, version, req->get_source().num()); - le->set_trace(trace); - mds->mdlog->submit_entry(le, - new C_AT_CreatePrepare(this, req, version)); -} - -void AnchorTable::_create_prepare_logged(MAnchor *req, version_t atid) -{ - inodeno_t ino = req->get_ino(); - dout(7) << "_create_prepare_logged " << ino << " atid " << atid << dendl; - - // reply - MAnchor *reply = new MAnchor(ANCHOR_OP_CREATE_AGREE, ino, atid); - mds->messenger->send_message(reply, req->get_source_inst()); - - delete req; -} - - - - -// DESTROY - -class C_AT_DestroyPrepare : public Context { - AnchorTable *at; - MAnchor *req; - version_t atid; -public: - C_AT_DestroyPrepare(AnchorTable *a, MAnchor *r, version_t t) : - at(a), req(r), atid(t) { } - void finish(int r) { - at->_destroy_prepare_logged(req, atid); - } -}; - -void AnchorTable::handle_destroy_prepare(MAnchor *req) -{ - inodeno_t ino = req->get_ino(); - dout(7) << "handle_destroy_prepare " << ino << dendl; - - destroy_prepare(ino, req->get_source().num()); - - mds->mdlog->submit_entry(new EAnchor(ANCHOR_OP_DESTROY_PREPARE, ino, version, 
req->get_source().num()), - new C_AT_DestroyPrepare(this, req, version)); -} - -void AnchorTable::_destroy_prepare_logged(MAnchor *req, version_t atid) -{ - inodeno_t ino = req->get_ino(); - dout(7) << "_destroy_prepare_logged " << ino << " atid " << atid << dendl; - - // reply - MAnchor *reply = new MAnchor(ANCHOR_OP_DESTROY_AGREE, ino, atid); - mds->messenger->send_message(reply, req->get_source_inst()); - delete req; -} - - - -// UPDATE - -class C_AT_UpdatePrepare : public Context { - AnchorTable *at; - MAnchor *req; - version_t atid; -public: - C_AT_UpdatePrepare(AnchorTable *a, MAnchor *r, version_t t) : - at(a), req(r), atid(t) { } - void finish(int r) { - at->_update_prepare_logged(req, atid); - } -}; - -void AnchorTable::handle_update_prepare(MAnchor *req) -{ - inodeno_t ino = req->get_ino(); - vector& trace = req->get_trace(); - - dout(7) << "handle_update_prepare " << ino << dendl; - - update_prepare(ino, trace, req->get_source().num()); - - // log it - EAnchor *le = new EAnchor(ANCHOR_OP_UPDATE_PREPARE, ino, version, req->get_source().num()); - le->set_trace(trace); - mds->mdlog->submit_entry(le, - new C_AT_UpdatePrepare(this, req, version)); -} - -void AnchorTable::_update_prepare_logged(MAnchor *req, version_t atid) -{ - inodeno_t ino = req->get_ino(); - dout(7) << "_update_prepare_logged " << ino << " atid " << atid << dendl; - - // reply - MAnchor *reply = new MAnchor(ANCHOR_OP_UPDATE_AGREE, ino, atid); - mds->messenger->send_message(reply, req->get_source_inst()); - delete req; -} - - - -// COMMIT - -class C_AT_Commit : public Context { - AnchorTable *at; - MAnchor *req; -public: - C_AT_Commit(AnchorTable *a, MAnchor *r) : - at(a), req(r) { } - void finish(int r) { - at->_commit_logged(req); - } -}; - -void AnchorTable::handle_commit(MAnchor *req) -{ - version_t atid = req->get_atid(); - dout(7) << "handle_commit " << atid << dendl; - - if (pending_create.count(atid) || - pending_destroy.count(atid) || - pending_update.count(atid)) { - commit(atid); 
- mds->mdlog->submit_entry(new EAnchor(ANCHOR_OP_COMMIT, atid, version)); - } - else if (atid <= version) { - dout(0) << "got commit for atid " << atid << " <= " << version - << ", already committed, sending ack." - << dendl; - MAnchor *reply = new MAnchor(ANCHOR_OP_ACK, 0, atid); - mds->messenger->send_message(reply, req->get_source_inst()); - delete req; - return; - } - else { - // wtf. - dout(0) << "got commit for atid " << atid << " > " << version << dendl; - assert(atid <= version); - } - - // wait for it to journal - mds->mdlog->wait_for_sync(new C_AT_Commit(this, req)); -} - - -void AnchorTable::_commit_logged(MAnchor *req) -{ - dout(7) << "_commit_logged, sending ACK" << dendl; - MAnchor *reply = new MAnchor(ANCHOR_OP_ACK, req->get_ino(), req->get_atid()); - mds->messenger->send_message(reply, req->get_source_inst()); - delete req; -} - - - -// ROLLBACK - -void AnchorTable::handle_rollback(MAnchor *req) -{ - version_t atid = req->get_atid(); - dout(7) << "handle_rollback " << atid << dendl; - rollback(atid); - delete req; -} - - - -/* - * messages - */ - -void AnchorTable::dispatch(Message *m) -{ - switch (m->get_type()) { - case MSG_MDS_ANCHOR: - handle_anchor_request((MAnchor*)m); - break; - - default: - assert(0); - } -} - - -void AnchorTable::handle_anchor_request(class MAnchor *req) -{ - // make sure i'm open! 
- if (!opened) { - dout(7) << "not open yet" << dendl; - - waiting_for_open.push_back(new C_MDS_RetryMessage(mds, req)); - - if (!opening) { - opening = true; - load(0); - } - return; - } - - dout(10) << "handle_anchor_request " << *req << dendl; - - // go - switch (req->get_op()) { - - case ANCHOR_OP_LOOKUP: - handle_lookup(req); - break; - - case ANCHOR_OP_CREATE_PREPARE: - handle_create_prepare(req); - break; - case ANCHOR_OP_DESTROY_PREPARE: - handle_destroy_prepare(req); - break; - case ANCHOR_OP_UPDATE_PREPARE: - handle_update_prepare(req); - break; - - case ANCHOR_OP_COMMIT: - handle_commit(req); - break; - - case ANCHOR_OP_ROLLBACK: - handle_rollback(req); - break; - - default: - assert(0); - } - -} - - - - -// primitive load/save for now! - -// load/save entire table for now! - -class C_AT_Saved : public Context { - AnchorTable *at; - version_t version; -public: - C_AT_Saved(AnchorTable *a, version_t v) : at(a), version(v) {} - void finish(int r) { - at->_saved(version); - } -}; - -void AnchorTable::save(Context *onfinish) -{ - dout(7) << "save v " << version << dendl; - if (!opened) { - assert(!onfinish); - return; - } - - if (onfinish) - waiting_for_save[version].push_back(onfinish); - - if (committing_version == version) { - dout(7) << "save already committing v " << version << dendl; - return; - } - committing_version = version; - - // build up write - bufferlist bl; - - // version - bl.append((char*)&version, sizeof(version)); - - // # anchors - size_t size = anchor_map.size(); - bl.append((char*)&size, sizeof(size)); - - // anchors - for (hash_map::iterator it = anchor_map.begin(); - it != anchor_map.end(); - it++) { - it->second._encode(bl); - dout(15) << "save encoded " << it->second << dendl; - } - - // pending - ::_encode(pending_reqmds, bl); - ::_encode(pending_create, bl); - ::_encode(pending_destroy, bl); - - size_t s = pending_update.size(); - bl.append((char*)&s, sizeof(s)); - for (map > >::iterator p = pending_update.begin(); - p != 
pending_update.end(); - ++p) { - bl.append((char*)&p->first, sizeof(p->first)); - bl.append((char*)&p->second.first, sizeof(p->second.first)); - ::_encode(p->second.second, bl); - } - - // write! - object_t oid = object_t(MDS_INO_ANCHORTABLE+mds->get_nodeid(), 0); - mds->objecter->write(oid, - 0, bl.length(), - mds->objecter->osdmap->file_to_object_layout(oid, g_OSD_MDAnchorTableLayout), - bl, - NULL, new C_AT_Saved(this, version)); -} - -void AnchorTable::_saved(version_t v) -{ - dout(7) << "_saved v " << v << dendl; - - assert(v <= committing_version); - assert(committed_version < v); - committed_version = v; - - finish_contexts(waiting_for_save[v], 0); - waiting_for_save.erase(v); -} - - - -class C_AT_Load : public Context { - AnchorTable *at; -public: - bufferlist bl; - C_AT_Load(AnchorTable *a) : at(a) {} - void finish(int result) { - assert(result > 0); - at->_loaded(bl); - } -}; - -void AnchorTable::load(Context *onfinish) -{ - dout(7) << "load" << dendl; - assert(!opened); - - waiting_for_open.push_back(onfinish); - - C_AT_Load *fin = new C_AT_Load(this); - object_t oid = object_t(MDS_INO_ANCHORTABLE+mds->get_nodeid(), 0); - mds->objecter->read(oid, - 0, 0, - mds->objecter->osdmap->file_to_object_layout(oid, g_OSD_MDAnchorTableLayout), - &fin->bl, fin); -} - -void AnchorTable::_loaded(bufferlist& bl) -{ - dout(10) << "_loaded got " << bl.length() << " bytes" << dendl; - - int off = 0; - bl.copy(off, sizeof(version), (char*)&version); - off += sizeof(version); - - size_t size; - bl.copy(off, sizeof(size), (char*)&size); - off += sizeof(size); - - for (size_t n=0; n::iterator p = pending_reqmds.begin(); - p != pending_reqmds.end(); - p++) - resend_agree(p->first, p->second); -} - - -void AnchorTable::resend_agree(version_t v, int who) -{ - if (pending_create.count(v)) { - MAnchor *reply = new MAnchor(ANCHOR_OP_CREATE_AGREE, pending_create[v], v); - mds->send_message_mds(reply, who); - } - else if (pending_destroy.count(v)) { - MAnchor *reply = new 
MAnchor(ANCHOR_OP_DESTROY_AGREE, pending_destroy[v], v); - mds->send_message_mds(reply, who); - } - else { - assert(pending_update.count(v)); - MAnchor *reply = new MAnchor(ANCHOR_OP_UPDATE_AGREE, pending_update[v].first, v); - mds->send_message_mds(reply, who); - } -} - -void AnchorTable::handle_mds_recovery(int who) -{ - dout(7) << "handle_mds_recovery mds" << who << dendl; - - // resend agrees for recovered mds - for (map::iterator p = pending_reqmds.begin(); - p != pending_reqmds.end(); - p++) { - if (p->second != who) continue; - resend_agree(p->first, p->second); - } -} diff --git a/trunk/ceph/mds/AnchorTable.h b/trunk/ceph/mds/AnchorTable.h deleted file mode 100644 index 64a2002ba7c85..0000000000000 --- a/trunk/ceph/mds/AnchorTable.h +++ /dev/null @@ -1,127 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __ANCHORTABLE_H -#define __ANCHORTABLE_H - -#include "Anchor.h" -#include "include/Context.h" - -#include -using namespace __gnu_cxx; - -class MDS; -class MAnchor; - -class AnchorTable { - MDS *mds; - - // keep the entire table in memory. - hash_map anchor_map; - - // uncommitted operations - map pending_reqmds; - map pending_create; - map pending_destroy; - map > > pending_update; - - version_t version; // this includes anchor_map AND pending_* state. 
- version_t committing_version; - version_t committed_version; - - // load/save state - bool opening, opened; - - // waiters - list waiting_for_open; - map > waiting_for_save; - -protected: - - // basic updates - bool add(inodeno_t ino, dirfrag_t dirfrag); - void inc(inodeno_t ino); - void dec(inodeno_t ino); - - // mid-level - void create_prepare(inodeno_t ino, vector& trace, int reqmds); - void destroy_prepare(inodeno_t ino, int reqmds); - void update_prepare(inodeno_t ino, vector& trace, int reqmds); - void commit(version_t atid); - void rollback(version_t atid); - friend class EAnchor; // used for journal replay. - - // high level interface - void handle_lookup(MAnchor *req); - - void handle_create_prepare(MAnchor *req); - void _create_prepare_logged(MAnchor *req, version_t atid); - friend class C_AT_CreatePrepare; - - void handle_destroy_prepare(MAnchor *req); - void _destroy_prepare_logged(MAnchor *req, version_t atid); - friend class C_AT_DestroyPrepare; - - void handle_update_prepare(MAnchor *req); - void _update_prepare_logged(MAnchor *req, version_t atid); - friend class C_AT_UpdatePrepare; - - void handle_commit(MAnchor *req); - void _commit_logged(MAnchor *req); - friend class C_AT_Commit; - - void handle_rollback(MAnchor *req); - - // messages - void handle_anchor_request(MAnchor *m); - - void dump(); - -public: - AnchorTable(MDS *m) : - mds(m), - version(0), committing_version(0), committed_version(0), - opening(false), opened(false) { } - - void dispatch(class Message *m); - - version_t get_version() { return version; } - version_t get_committed_version() { return committed_version; } - - void create_fresh() { - // reset (i.e. on mkfs) to empty, but unsaved table. - version = 1; - opened = true; - opening = false; - anchor_map.clear(); - pending_create.clear(); - pending_destroy.clear(); - pending_update.clear(); - } - - // load/save entire table for now! 
- void save(Context *onfinish); - void _saved(version_t v); - void load(Context *onfinish); - void _loaded(bufferlist& bl); - - // recovery - void handle_mds_recovery(int who); - void finish_recovery(); - void resend_agree(version_t v, int who); - -}; - -#endif diff --git a/trunk/ceph/mds/CDentry.cc b/trunk/ceph/mds/CDentry.cc deleted file mode 100644 index ce1fd04c14af4..0000000000000 --- a/trunk/ceph/mds/CDentry.cc +++ /dev/null @@ -1,384 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "CDentry.h" -#include "CInode.h" -#include "CDir.h" -#include "Anchor.h" - -#include "MDS.h" -#include "MDCache.h" -#include "LogSegment.h" - -#include "messages/MLock.h" - -#include - -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " mds" << dir->cache->mds->get_nodeid() << ".cache.den(" << dir->dirfrag() << " " << name << ") " - - - -ostream& CDentry::print_db_line_prefix(ostream& out) -{ - return out << g_clock.now() << " mds" << dir->cache->mds->get_nodeid() << ".cache.den(" << dir->ino() << " " << name << ") "; -} - - -// CDentry - -ostream& operator<<(ostream& out, CDentry& dn) -{ - filepath path; - dn.make_path(path); - - out << "[dentry " << path; - - if (dn.is_auth()) { - out << " auth"; - if (dn.is_replicated()) - out << dn.get_replicas(); - } else { - out << " rep@" << dn.authority(); - out << "." 
<< dn.get_replica_nonce(); - assert(dn.get_replica_nonce() >= 0); - } - - if (dn.is_null()) out << " NULL"; - if (dn.is_remote()) { - out << " REMOTE("; - switch (dn.get_remote_d_type() << 12) { - case S_IFREG: out << "reg"; break; - case S_IFDIR: out << "dir"; break; - case S_IFLNK: out << "lnk"; break; - default: assert(0); - } - out << ")"; - } - - out << " " << dn.lock; - - out << " v=" << dn.get_version(); - out << " pv=" << dn.get_projected_version(); - - out << " inode=" << dn.get_inode(); - - if (dn.is_new()) out << " state=new"; - - if (dn.get_num_ref()) { - out << " |"; - dn.print_pin_set(out); - } - - out << " " << &dn; - out << "]"; - return out; -} - - -bool operator<(const CDentry& l, const CDentry& r) -{ - if (l.get_dir()->ino() < r.get_dir()->ino()) return true; - if (l.get_dir()->ino() == r.get_dir()->ino() && - l.get_name() < r.get_name()) return true; - return false; -} - - -void CDentry::print(ostream& out) -{ - out << *this; -} - - -inodeno_t CDentry::get_ino() -{ - if (inode) - return inode->ino(); - return inodeno_t(); -} - - -pair CDentry::authority() -{ - return dir->authority(); -} - - -void CDentry::add_waiter(int tag, Context *c) -{ - // wait on the directory? 
- if (tag & (WAIT_UNFREEZE|WAIT_SINGLEAUTH)) { - dir->add_waiter(tag, c); - return; - } - MDSCacheObject::add_waiter(tag, c); -} - - -version_t CDentry::pre_dirty(version_t min) -{ - projected_version = dir->pre_dirty(min); - dout(10) << " pre_dirty " << *this << dendl; - return projected_version; -} - - -void CDentry::_mark_dirty(LogSegment *ls) -{ - // state+pin - if (!state_test(STATE_DIRTY)) { - state_set(STATE_DIRTY); - dir->inc_num_dirty(); - get(PIN_DIRTY); - assert(ls); - } - if (ls) - ls->dirty_dentries.push_back(&xlist_dirty); -} - -void CDentry::mark_dirty(version_t pv, LogSegment *ls) -{ - dout(10) << " mark_dirty " << *this << dendl; - - // i now live in this new dir version - assert(pv <= projected_version); - version = pv; - _mark_dirty(ls); - - // mark dir too - dir->mark_dirty(pv, ls); -} - - -void CDentry::mark_clean() -{ - dout(10) << " mark_clean " << *this << dendl; - assert(is_dirty()); - assert(dir->get_version() == 0 || version <= dir->get_version()); // hmm? - - // state+pin - state_clear(STATE_DIRTY); - dir->dec_num_dirty(); - put(PIN_DIRTY); - - xlist_dirty.remove_myself(); - - if (state_test(STATE_NEW)) - state_clear(STATE_NEW); -} - -void CDentry::mark_new() -{ - dout(10) << " mark_new " << *this << dendl; - state_set(STATE_NEW); -} - -void CDentry::make_path_string(string& s) -{ - if (dir) { - dir->inode->make_path_string(s); - } else { - s = "???"; - } - s += "/"; - s += name; -} - -void CDentry::make_path(filepath& fp) -{ - assert(dir); - if (dir->inode->is_base()) - fp.set_ino(dir->inode->ino()); // base case - else if (dir->inode->get_parent_dn()) - dir->inode->get_parent_dn()->make_path(fp); // recurse - else - fp.set_ino(dir->inode->ino()); // relative but not base? hrm! - fp.push_dentry(name); -} - -/* -void CDentry::make_path(string& s, inodeno_t tobase) -{ - assert(dir); - - if (dir->inode->is_root()) { - s += "/"; // make it an absolute path (no matter what) if we hit the root. 
- } - else if (dir->inode->get_parent_dn() && - dir->inode->ino() != tobase) { - dir->inode->get_parent_dn()->make_path(s, tobase); - s += "/"; - } - s += name; -} -*/ - -/** make_anchor_trace - * construct an anchor trace for this dentry, as if it were linked to *in. - */ -void CDentry::make_anchor_trace(vector& trace, CInode *in) -{ - // start with parent dir inode - if (dir) - dir->inode->make_anchor_trace(trace); - - // add this inode (in my dirfrag) to the end - trace.push_back(Anchor(in->ino(), dir->dirfrag())); - dout(10) << "make_anchor_trace added " << trace.back() << dendl; -} - - - -void CDentry::link_remote(CInode *in) -{ - assert(is_remote()); - assert(in->ino() == remote_ino); - - inode = in; - in->add_remote_parent(this); -} - -void CDentry::unlink_remote() -{ - assert(is_remote()); - assert(inode); - - inode->remove_remote_parent(this); - inode = 0; -} - - -CDentryDiscover *CDentry::replicate_to(int who) -{ - int nonce = add_replica(who); - return new CDentryDiscover(this, nonce); -} - - -// ---------------------------- -// auth pins - -bool CDentry::can_auth_pin() -{ - assert(dir); - return dir->can_auth_pin(); -} - -void CDentry::auth_pin() -{ - if (auth_pins == 0) - get(PIN_AUTHPIN); - auth_pins++; - - dout(10) << "auth_pin on " << *this - << " now " << auth_pins << "+" << nested_auth_pins - << dendl; - - dir->adjust_nested_auth_pins(1); -} - -void CDentry::auth_unpin() -{ - auth_pins--; - if (auth_pins == 0) - put(PIN_AUTHPIN); - - dout(10) << "auth_unpin on " << *this - << " now " << auth_pins << "+" << nested_auth_pins - << dendl; - assert(auth_pins >= 0); - - dir->adjust_nested_auth_pins(-1); -} - -void CDentry::adjust_nested_auth_pins(int by) -{ - nested_auth_pins += by; - - dout(15) << "adjust_nested_auth_pins by " << by - << " now " << auth_pins << "+" << nested_auth_pins - << dendl; - assert(nested_auth_pins >= 0); - - dir->adjust_nested_auth_pins(by); -} - -bool CDentry::is_frozen() -{ - return dir->is_frozen(); -} - - -// 
---------------------------- -// locking - -void CDentry::set_object_info(MDSCacheObjectInfo &info) -{ - info.dirfrag = dir->dirfrag(); - info.dname = name; -} - -void CDentry::encode_lock_state(int type, bufferlist& bl) -{ - // null, ino, or remote_ino? - int c; - if (is_primary()) { - c = 1; - ::_encode(c, bl); - ::_encode(inode->inode.ino, bl); - } - else if (is_remote()) { - c = 2; - ::_encode(c, bl); - ::_encode(remote_ino, bl); - } - else if (is_null()) { - // encode nothing. - } - else assert(0); -} - -void CDentry::decode_lock_state(int type, bufferlist& bl) -{ - if (bl.length() == 0) { - // null - assert(is_null()); - return; - } - - int off = 0; - char c; - inodeno_t ino; - ::_decode(c, bl, off); - - switch (c) { - case 1: - case 2: - _decode(ino, bl, off); - // newly linked? - if (is_null() && !is_auth()) { - // force trim from cache! - dout(10) << "decode_lock_state replica dentry null -> non-null, must trim" << dendl; - //assert(get_num_ref() == 0); - } else { - // verify? - - } - break; - default: - assert(0); - } -} diff --git a/trunk/ceph/mds/CDentry.h b/trunk/ceph/mds/CDentry.h deleted file mode 100644 index b99ad9ea603d5..0000000000000 --- a/trunk/ceph/mds/CDentry.h +++ /dev/null @@ -1,325 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __CDENTRY_H -#define __CDENTRY_H - -#include -#include -#include -using namespace std; - -#include "include/types.h" -#include "include/buffer.h" -#include "include/lru.h" -#include "include/xlist.h" -#include "include/filepath.h" -#include "mdstypes.h" - -#include "SimpleLock.h" - -class CInode; -class CDir; -class MDRequest; - -class Message; -class CDentryDiscover; -class Anchor; - -class CDentry; -class LogSegment; - - -// define an ordering -bool operator<(const CDentry& l, const CDentry& r); - -// dentry -class CDentry : public MDSCacheObject, public LRUObject { - public: - // -- state -- - static const int STATE_NEW = 1; - static const int STATE_FRAGMENTING = 2; - - // -- pins -- - static const int PIN_INODEPIN = 1; // linked inode is pinned - static const int PIN_FRAGMENTING = -2; // containing dir is refragmenting - const char *pin_name(int p) { - switch (p) { - case PIN_INODEPIN: return "inodepin"; - case PIN_FRAGMENTING: return "fragmenting"; - default: return generic_pin_name(p); - } - }; - - // -- wait -- - static const int WAIT_LOCK_OFFSET = 8; - - void add_waiter(int tag, Context *c); - - static const int EXPORT_NONCE = 1; - - bool is_lt(const MDSCacheObject *r) const { - return *this < *(CDentry*)r; - } - - protected: - string name; - - inodeno_t remote_ino; // if remote dentry - unsigned char remote_d_type; - - CInode *inode; // linked inode (if any) - CDir *dir; // containing dirfrag - - version_t version; // dir version when last touched. - version_t projected_version; // what it will be when i unlock/commit. 
- - xlist::item xlist_dirty; - - off_t dir_offset; - - int auth_pins, nested_auth_pins; - - friend class Migrator; - friend class Locker; - friend class Renamer; - friend class Server; - friend class MDCache; - friend class MDS; - friend class CInode; - friend class C_MDC_XlockRequest; - - -public: - // lock - SimpleLock lock; - - - - public: - // cons - CDentry() : - remote_ino(0), remote_d_type(0), - inode(0), dir(0), - version(0), projected_version(0), - xlist_dirty(this), - dir_offset(0), - auth_pins(0), nested_auth_pins(0), - lock(this, LOCK_OTYPE_DN, WAIT_LOCK_OFFSET) { } - CDentry(const string& n, CInode *in) : - name(n), - remote_ino(0), remote_d_type(0), - inode(in), dir(0), - version(0), projected_version(0), - xlist_dirty(this), - dir_offset(0), - auth_pins(0), nested_auth_pins(0), - lock(this, LOCK_OTYPE_DN, WAIT_LOCK_OFFSET) { } - CDentry(const string& n, inodeno_t ino, unsigned char dt, CInode *in=0) : - name(n), - remote_ino(ino), remote_d_type(dt), - inode(in), dir(0), - version(0), projected_version(0), - xlist_dirty(this), - dir_offset(0), - auth_pins(0), nested_auth_pins(0), - lock(this, LOCK_OTYPE_DN, WAIT_LOCK_OFFSET) { } - - CInode *get_inode() const { return inode; } - CDir *get_dir() const { return dir; } - const string& get_name() const { return name; } - inodeno_t get_ino(); - - off_t get_dir_offset() { return dir_offset; } - void set_dir_offset(off_t o) { dir_offset = o; } - void clear_dir_offset() { dir_offset = 0; } - - inodeno_t get_remote_ino() { return remote_ino; } - unsigned char get_remote_d_type() { return remote_d_type; } - void set_remote(inodeno_t ino, unsigned char d_type) { - remote_ino = ino; - remote_d_type = d_type; - } - - // ref counts: pin ourselves in the LRU when we're pinned. 
- void first_get() { - lru_pin(); - } - void last_put() { - lru_unpin(); - } - - // auth pins - bool can_auth_pin(); - void auth_pin(); - void auth_unpin(); - void adjust_nested_auth_pins(int by); - bool is_frozen(); - - - // dentry type is primary || remote || null - // inode ptr is required for primary, optional for remote, undefined for null - bool is_primary() { return remote_ino == 0 && inode != 0; } - bool is_remote() { return remote_ino > 0; } - bool is_null() { return (remote_ino == 0 && inode == 0) ? true:false; } - - // remote links - void link_remote(CInode *in); - void unlink_remote(); - - - // copy cons - CDentry(const CDentry& m); - const CDentry& operator= (const CDentry& right); - - // misc - void make_path_string(string& s); - void make_path(filepath& fp); - void make_anchor_trace(vector& trace, CInode *in); - - // -- version -- - version_t get_version() { return version; } - void set_version(version_t v) { projected_version = version = v; } - version_t get_projected_version() { return projected_version; } - void set_projected_version(version_t v) { projected_version = v; } - - pair authority(); - - version_t pre_dirty(version_t min=0); - void _mark_dirty(LogSegment *ls); - void mark_dirty(version_t projected_dirv, LogSegment *ls); - void mark_clean(); - - void mark_new(); - bool is_new() { return state_test(STATE_NEW); } - - // -- replication - CDentryDiscover *replicate_to(int rep); - - - // -- exporting - // note: this assumes the dentry already exists. - // i.e., the name is already extracted... so we just need the other state. 
- void encode_export(bufferlist& bl) { - ::_encode_simple(state, bl); - ::_encode_simple(version, bl); - ::_encode_simple(projected_version, bl); - lock._encode(bl); - ::_encode_simple(replica_map, bl); - get(PIN_TEMPEXPORTING); - } - void finish_export() { - // twiddle - clear_replica_map(); - replica_nonce = EXPORT_NONCE; - state_clear(CDentry::STATE_AUTH); - if (is_dirty()) - mark_clean(); - put(PIN_TEMPEXPORTING); - } - void abort_export() { - put(PIN_TEMPEXPORTING); - } - void decode_import(bufferlist::iterator& blp, LogSegment *ls) { - int nstate; - ::_decode_simple(nstate, blp); - ::_decode_simple(version, blp); - ::_decode_simple(projected_version, blp); - lock._decode(blp); - ::_decode_simple(replica_map, blp); - - // twiddle - state = 0; - state_set(CDentry::STATE_AUTH); - if (nstate & STATE_DIRTY) - _mark_dirty(ls); - if (!replica_map.empty()) - get(PIN_REPLICATED); - } - - // -- locking -- - SimpleLock* get_lock(int type) { - assert(type == LOCK_OTYPE_DN); - return &lock; - } - void set_object_info(MDSCacheObjectInfo &info); - void encode_lock_state(int type, bufferlist& bl); - void decode_lock_state(int type, bufferlist& bl); - - - ostream& print_db_line_prefix(ostream& out); - void print(ostream& out); - - friend class CDir; -}; - -ostream& operator<<(ostream& out, CDentry& dn); - - - -class CDentryDiscover { - string dname; - int replica_nonce; - int lockstate; - off_t dir_offset; - inodeno_t remote_ino; - unsigned char remote_d_type; - -public: - CDentryDiscover() {} - CDentryDiscover(CDentry *dn, int nonce) : - dname(dn->get_name()), replica_nonce(nonce), - lockstate(dn->lock.get_replica_state()), - dir_offset(dn->get_dir_offset()), - remote_ino(dn->get_remote_ino()), remote_d_type(dn->get_remote_d_type()) { } - - string& get_dname() { return dname; } - int get_nonce() { return replica_nonce; } - bool is_remote() { return remote_ino ? 
true:false; } - inodeno_t get_remote_ino() { return remote_ino; } - unsigned char get_remote_d_type() { return remote_d_type; } - - void update_dentry(CDentry *dn) { - dn->set_dir_offset(dir_offset); - dn->set_replica_nonce(replica_nonce); - } - void init_dentry_lock(CDentry *dn) { - dn->lock.set_state( lockstate ); - } - - void _encode(bufferlist& bl) { - ::_encode(dname, bl); - ::_encode(dir_offset, bl); - ::_encode(remote_ino, bl); - ::_encode(remote_d_type, bl); - ::_encode(replica_nonce, bl); - ::_encode(lockstate, bl); - } - - void _decode(bufferlist& bl, int& off) { - ::_decode(dname, bl, off); - ::_decode(dir_offset, bl, off); - ::_decode(remote_ino, bl, off); - ::_decode(remote_d_type, bl, off); - ::_decode(replica_nonce, bl, off); - ::_decode(lockstate, bl, off); - } - -}; - - - -#endif diff --git a/trunk/ceph/mds/CDir.cc b/trunk/ceph/mds/CDir.cc deleted file mode 100644 index adaf5fa6c0d4f..0000000000000 --- a/trunk/ceph/mds/CDir.cc +++ /dev/null @@ -1,1676 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#include "include/types.h" - -#include "CDir.h" -#include "CDentry.h" -#include "CInode.h" - -#include "MDS.h" -#include "MDCache.h" -#include "MDSMap.h" -#include "LogSegment.h" - -#include "include/Context.h" -#include "common/Clock.h" - -#include "osdc/Objecter.h" - -#include - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache.dir(" << this->dirfrag() << ") " - - - - -// PINS -//int cdir_pins[CDIR_NUM_PINS] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; - - - -ostream& operator<<(ostream& out, CDir& dir) -{ - filepath path; - dir.get_inode()->make_path(path); - out << "[dir " << dir.dirfrag() << " " << path << "/"; - if (dir.is_auth()) { - out << " auth"; - if (dir.is_replicated()) - out << dir.get_replicas(); - - out << " pv=" << dir.get_projected_version(); - out << " v=" << dir.get_version(); - out << " cv=" << dir.get_committing_version(); - out << "/" << dir.get_committed_version(); - out << "/" << dir.get_committed_version_equivalent(); - } else { - out << " rep@" << dir.authority(); - if (dir.get_replica_nonce() > 1) - out << "." 
<< dir.get_replica_nonce(); - } - - if (dir.is_rep()) out << " REP"; - - if (dir.get_dir_auth() != CDIR_AUTH_DEFAULT) { - if (dir.get_dir_auth().second == CDIR_AUTH_UNKNOWN) - out << " dir_auth=" << dir.get_dir_auth().first; - else - out << " dir_auth=" << dir.get_dir_auth(); - } - - if (dir.get_cum_auth_pins()) - out << " ap=" << dir.get_auth_pins() << "+" << dir.get_nested_auth_pins(); - - out << " state=" << dir.get_state(); - if (dir.state_test(CDir::STATE_COMPLETE)) out << "|complete"; - if (dir.state_test(CDir::STATE_FREEZINGTREE)) out << "|freezingtree"; - if (dir.state_test(CDir::STATE_FROZENTREE)) out << "|frozentree"; - //if (dir.state_test(CDir::STATE_FROZENTREELEAF)) out << "|frozentreeleaf"; - if (dir.state_test(CDir::STATE_FROZENDIR)) out << "|frozendir"; - if (dir.state_test(CDir::STATE_FREEZINGDIR)) out << "|freezingdir"; - if (dir.state_test(CDir::STATE_EXPORTBOUND)) out << "|exportbound"; - if (dir.state_test(CDir::STATE_IMPORTBOUND)) out << "|importbound"; - - out << " sz=" << dir.get_nitems() << "+" << dir.get_nnull(); - if (dir.get_num_dirty()) - out << " dirty=" << dir.get_num_dirty(); - - - if (dir.get_num_ref()) { - out << " |"; - dir.print_pin_set(out); - } - - out << " " << &dir; - return out << "]"; -} - - -void CDir::print(ostream& out) -{ - out << *this; -} - - - - -ostream& CDir::print_db_line_prefix(ostream& out) -{ - return out << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache.dir(" << this->dirfrag() << ") "; -} - - - -// ------------------------------------------------------------------- -// CDir - -CDir::CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth) : - xlist_dirty(this) -{ - inode = in; - frag = fg; - this->cache = mdcache; - - nitems = 0; - nnull = 0; - num_dirty = 0; - - state = STATE_INITIAL; - - projected_version = version = 0; - committing_version = 0; - committed_version_equivalent = committed_version = 0; - - // dir_auth - dir_auth = CDIR_AUTH_DEFAULT; - - // auth - assert(in->is_dir()); - if 
(auth) - state |= STATE_AUTH; - - auth_pins = 0; - nested_auth_pins = 0; - request_pins = 0; - - //hack_num_accessed = -1; - - dir_rep = REP_NONE; - //dir_rep = REP_ALL; // hack: to wring out some bugs! FIXME FIXME -} - - - - -/*** - * linking fun - */ - -CDentry* CDir::add_null_dentry(const string& dname) -{ - // foreign - assert(lookup(dname) == 0); - - // create dentry - CDentry* dn = new CDentry(dname, 0); - if (is_auth()) - dn->state_set(CDentry::STATE_AUTH); - cache->lru.lru_insert_mid(dn); - - dn->dir = this; - dn->version = projected_version; - - // add to dir - assert(items.count(dn->name) == 0); - //assert(null_items.count(dn->name) == 0); - - items[dn->name] = dn; - nnull++; - - dout(12) << "add_null_dentry " << *dn << dendl; - - // pin? - if (nnull + nitems == 1) get(PIN_CHILD); - - assert(nnull + nitems == items.size()); - //assert(nnull == null_items.size()); - return dn; -} - - -CDentry* CDir::add_primary_dentry(const string& dname, CInode *in) -{ - // primary - assert(lookup(dname) == 0); - - // create dentry - CDentry* dn = new CDentry(dname, 0); - if (is_auth()) - dn->state_set(CDentry::STATE_AUTH); - cache->lru.lru_insert_mid(dn); - - dn->dir = this; - dn->version = projected_version; - - // add to dir - assert(items.count(dn->name) == 0); - //assert(null_items.count(dn->name) == 0); - - items[dn->name] = dn; - link_inode_work( dn, in ); - - dout(12) << "add_primary_dentry " << *dn << dendl; - - // pin? 
- if (nnull + nitems == 1) get(PIN_CHILD); - - assert(nnull + nitems == items.size()); - //assert(nnull == null_items.size()); - return dn; -} - -CDentry* CDir::add_remote_dentry(const string& dname, inodeno_t ino, unsigned char d_type) -{ - // foreign - assert(lookup(dname) == 0); - - // create dentry - CDentry* dn = new CDentry(dname, ino, d_type); - if (is_auth()) - dn->state_set(CDentry::STATE_AUTH); - cache->lru.lru_insert_mid(dn); - - dn->dir = this; - dn->version = projected_version; - - // add to dir - assert(items.count(dn->name) == 0); - //assert(null_items.count(dn->name) == 0); - - items[dn->name] = dn; - nitems++; - - dout(12) << "add_remote_dentry " << *dn << dendl; - - // pin? - if (nnull + nitems == 1) get(PIN_CHILD); - - assert(nnull + nitems == items.size()); - //assert(nnull == null_items.size()); - return dn; -} - - - -void CDir::remove_dentry(CDentry *dn) -{ - dout(12) << "remove_dentry " << *dn << dendl; - - if (dn->inode) { - // detach inode and dentry - unlink_inode_work(dn); - } else { - // remove from null list - //assert(null_items.count(dn->name) == 1); - //null_items.erase(dn->name); - nnull--; - } - - // remove from list - assert(items.count(dn->name) == 1); - items.erase(dn->name); - - // adjust dirty counter? - if (dn->state_test(CDentry::STATE_DIRTY)) - num_dirty--; - - cache->lru.lru_remove(dn); - delete dn; - - // unpin? 
- if (nnull + nitems == 0) put(PIN_CHILD); - - assert(nnull + nitems == items.size()); - //assert(nnull == null_items.size()); -} - -void CDir::link_remote_inode(CDentry *dn, inodeno_t ino, unsigned char d_type) -{ - dout(12) << "link_remote_inode " << *dn << " remote " << ino << dendl; - assert(dn->is_null()); - - dn->set_remote(ino, d_type); - nitems++; - dn->clear_dir_offset(); - - //assert(null_items.count(dn->name) == 1); - //null_items.erase(dn->name); - nnull--; - assert(nnull + nitems == items.size()); -} - -void CDir::link_primary_inode(CDentry *dn, CInode *in) -{ - dout(12) << "link_primary_inode " << *dn << " " << *in << dendl; - assert(dn->is_null()); - - link_inode_work(dn,in); - dn->clear_dir_offset(); - - // remove from null list - //assert(null_items.count(dn->name) == 1); - //null_items.erase(dn->name); - nnull--; - - assert(nnull + nitems == items.size()); - //assert(nnull == null_items.size()); -} - -void CDir::link_inode_work( CDentry *dn, CInode *in) -{ - assert(dn->inode == 0); - dn->inode = in; - in->set_primary_parent(dn); - - nitems++; // adjust dir size - - // set inode version - //in->inode.version = dn->get_version(); - - // pin dentry? 
- if (in->get_num_ref()) - dn->get(CDentry::PIN_INODEPIN); - - // adjust auth pin count - if (in->auth_pins + in->nested_auth_pins) - dn->adjust_nested_auth_pins(in->auth_pins + in->nested_auth_pins); -} - -void CDir::unlink_inode( CDentry *dn ) -{ - if (dn->is_remote()) { - dout(12) << "unlink_inode " << *dn << dendl; - } else { - dout(12) << "unlink_inode " << *dn << " " << *dn->inode << dendl; - } - - dn->clear_dir_offset(); - unlink_inode_work(dn); - - // add to null list - //assert(null_items.count(dn->name) == 0); - //null_items[dn->name] = dn; - nnull++; - - assert(nnull + nitems == items.size()); - //assert(nnull == null_items.size()); -} - -void CDir::try_remove_unlinked_dn(CDentry *dn) -{ - assert(dn->dir == this); - assert(dn->is_null()); - assert(dn->is_dirty()); - - /* FIXME: there is a bug in this. i think new dentries are properly - identified.. e.g. maybe a dentry exists, is committed, is removed, is now - dirty+null, then reused and mistakenly considered new.. then it is removed, - we remove it here, the dir is fetched, and the dentry exists again. - - somethign like that... - */ - return; - - - // no pins (besides dirty)? - if (dn->get_num_ref() != 1) - return; - - // was the dn new? or is the dir complete (i.e. we don't need negatives)? - if (dn->is_new() || is_complete()) { - dout(10) << "try_remove_unlinked_dn " << *dn << " in " << *this << dendl; - dn->mark_clean(); - remove_dentry(dn); - - if (version == projected_version && - committing_version == committed_version && - num_dirty == 0) { - dout(10) << "try_remove_unlinked_dn committed_equivalent now " << version - << " vs committed " << committed_version - << dendl; - committed_version_equivalent = committed_version; - } - } -} - - - -void CDir::unlink_inode_work( CDentry *dn ) -{ - CInode *in = dn->inode; - - if (dn->is_remote()) { - // remote - if (in) - dn->unlink_remote(); - - dn->set_remote(0, 0); - } else { - // primary - assert(dn->is_primary()); - - // unpin dentry? 
- if (in->get_num_ref()) - dn->put(CDentry::PIN_INODEPIN); - - // unlink auth_pin count - if (in->auth_pins + in->nested_auth_pins) - dn->adjust_nested_auth_pins(0 - (in->auth_pins + in->nested_auth_pins)); - - // detach inode - in->remove_primary_parent(dn); - dn->inode = 0; - } - - nitems--; // adjust dir size -} - -void CDir::remove_null_dentries() { - dout(12) << "remove_null_dentries " << *this << dendl; - - list dns; - for (CDir::map_t::iterator it = items.begin(); - it != items.end(); - it++) { - if (it->second->is_null()) - dns.push_back(it->second); - } - - for (list::iterator it = dns.begin(); - it != dns.end(); - it++) { - CDentry *dn = *it; - remove_dentry(dn); - } - //assert(null_items.empty()); - assert(nnull == 0); - assert(nnull + nitems == items.size()); -} - - -/** - * steal_dentry -- semi-violently move a dentry from one CDir to another - * (*) violently, in that nitems, most pins, etc. are not correctly maintained - * on the old CDir corpse; must call purge_stolen() when finished. - */ -void CDir::steal_dentry(CDentry *dn) -{ - dout(15) << "steal_dentry " << *dn << dendl; - - items[dn->name] = dn; - - dn->dir->items.erase(dn->name); - if (dn->dir->items.empty()) - dn->dir->put(PIN_CHILD); - - if (nnull + nitems == 0) - get(PIN_CHILD); - if (dn->is_null()) - nnull++; - else - nitems++; - - nested_auth_pins += dn->auth_pins + dn->nested_auth_pins; - if (dn->is_dirty()) - num_dirty++; - - dn->dir = this; -} - -void CDir::purge_stolen(list& waiters) -{ - // take waiters _before_ unfreeze... - take_waiting(WAIT_ANY, waiters); - - if (is_auth()) { - assert(is_frozen_dir()); - unfreeze_dir(); - } - - nnull = nitems = 0; - - if (is_auth()) - clear_replica_map(); - if (is_dirty()) mark_clean(); - if (state_test(STATE_IMPORTBOUND)) put(PIN_IMPORTBOUND); - if (state_test(STATE_EXPORTBOUND)) put(PIN_EXPORTBOUND); - - if (auth_pins > 0) put(PIN_AUTHPIN); - - assert(get_num_ref() == (state_test(STATE_STICKY) ? 
1:0)); -} - -void CDir::init_fragment_pins() -{ - if (!replica_map.empty()) get(PIN_REPLICATED); - if (state_test(STATE_DIRTY)) get(PIN_DIRTY); - if (state_test(STATE_EXPORTBOUND)) get(PIN_EXPORTBOUND); - if (state_test(STATE_IMPORTBOUND)) get(PIN_IMPORTBOUND); -} - -void CDir::split(int bits, list& subs, list& waiters) -{ - dout(10) << "split by " << bits << " bits on " << *this << dendl; - - if (cache->mds->logger) cache->mds->logger->inc("dir_sp"); - - assert(is_complete() || !is_auth()); - - list frags; - frag.split(bits, frags); - - vector subfrags(1 << bits); - - double fac = 1.0 / (double)(1 << bits); // for scaling load vecs - - // create subfrag dirs - int n = 0; - for (list::iterator p = frags.begin(); p != frags.end(); ++p) { - CDir *f = new CDir(inode, *p, cache, is_auth()); - f->state_set(state & MASK_STATE_FRAGMENT_KEPT); - f->replica_map = replica_map; - f->dir_auth = dir_auth; - f->init_fragment_pins(); - f->version = version; - f->projected_version = projected_version; - - f->pop_me = pop_me; - f->pop_me *= fac; - - // FIXME; this is an approximation - f->pop_nested = pop_nested; - f->pop_nested *= fac; - f->pop_auth_subtree = pop_auth_subtree; - f->pop_auth_subtree *= fac; - f->pop_auth_subtree_nested = pop_auth_subtree_nested; - f->pop_auth_subtree_nested *= fac; - - dout(10) << " subfrag " << *p << " " << *f << dendl; - subfrags[n++] = f; - subs.push_back(f); - inode->add_dirfrag(f); - } - - // repartition dentries - while (!items.empty()) { - CDir::map_t::iterator p = items.begin(); - - CDentry *dn = p->second; - frag_t subfrag = inode->pick_dirfrag(p->first); - int n = subfrag.value() >> frag.bits(); - dout(15) << " subfrag " << subfrag << " n=" << n << " for " << p->first << dendl; - CDir *f = subfrags[n]; - f->steal_dentry(dn); - } - - purge_stolen(waiters); - inode->close_dirfrag(frag); // selft deletion, watch out. 
-} - -void CDir::merge(int bits, list& waiters) -{ - dout(10) << "merge by " << bits << " bits" << dendl; - - list frags; - frag.split(bits, frags); - - for (list::iterator p = frags.begin(); p != frags.end(); ++p) { - CDir *dir = inode->get_or_open_dirfrag(cache, *p); - assert(dir->is_complete()); - dout(10) << " subfrag " << *p << " " << *dir << dendl; - - // steal dentries - while (!dir->items.empty()) - steal_dentry(dir->items.begin()->second); - - // merge replica map - for (map::iterator p = dir->replica_map.begin(); - p != dir->replica_map.end(); - ++p) - replica_map[p->first] = MAX(replica_map[p->first], p->second); - - // merge state - state_set(dir->get_state() & MASK_STATE_FRAGMENT_KEPT); - dir_auth = dir->dir_auth; - - dir->purge_stolen(waiters); - inode->close_dirfrag(dir->get_frag()); - } - - init_fragment_pins(); -} - - - - - - - -CDirDiscover *CDir::replicate_to(int mds) -{ - assert(is_auth()); - return new CDirDiscover( this, add_replica(mds) ); -} - - - - - -/**************************************** - * WAITING - */ - -void CDir::add_dentry_waiter(const string& dname, Context *c) -{ - if (waiting_on_dentry.empty()) - get(PIN_DNWAITER); - waiting_on_dentry[dname].push_back(c); - dout(10) << "add_dentry_waiter dentry " << dname << " " << c << " on " << *this << dendl; -} - -void CDir::take_dentry_waiting(const string& dname, list& ls) -{ - if (waiting_on_dentry.empty()) return; - if (waiting_on_dentry.count(dname) == 0) return; - dout(10) << "take_dentry_waiting dentry " << dname - << " x " << waiting_on_dentry[dname].size() - << " on " << *this << dendl; - ls.splice(ls.end(), waiting_on_dentry[dname]); - waiting_on_dentry.erase(dname); - if (waiting_on_dentry.empty()) - put(PIN_DNWAITER); -} - -void CDir::add_ino_waiter(inodeno_t ino, Context *c) -{ - if (waiting_on_ino.empty()) - get(PIN_INOWAITER); - waiting_on_ino[ino].push_back(c); - dout(10) << "add_ino_waiter ino " << ino << " " << c << " on " << *this << dendl; -} - -void 
CDir::take_ino_waiting(inodeno_t ino, list& ls) -{ - if (waiting_on_ino.empty()) return; - if (waiting_on_ino.count(ino) == 0) return; - dout(10) << "take_ino_waiting ino " << ino - << " x " << waiting_on_ino[ino].size() - << " on " << *this << dendl; - ls.splice(ls.end(), waiting_on_ino[ino]); - waiting_on_ino.erase(ino); - if (waiting_on_ino.empty()) - put(PIN_INOWAITER); -} - -void CDir::take_sub_waiting(list& ls) -{ - dout(10) << "take_sub_waiting" << dendl; - for (hash_map >::iterator p = waiting_on_dentry.begin(); - p != waiting_on_dentry.end(); - ++p) - ls.splice(ls.end(), p->second); - waiting_on_dentry.clear(); - for (hash_map >::iterator p = waiting_on_ino.begin(); - p != waiting_on_ino.end(); - ++p) - ls.splice(ls.end(), p->second); - waiting_on_ino.clear(); -} - - - -void CDir::add_waiter(int tag, Context *c) -{ - // hierarchical? - - // at free root? - if (tag & WAIT_ATFREEZEROOT) { - if (!(is_freezing_tree_root() || is_frozen_tree_root() || - is_freezing_dir() || is_frozen_dir())) { - // try parent - dout(10) << "add_waiter " << tag << " " << c << " should be ATFREEZEROOT, " << *this << " is not root, trying parent" << dendl; - inode->parent->dir->add_waiter(tag, c); - return; - } - } - - // at subtree root? 
- if (tag & WAIT_ATSUBTREEROOT) { - if (!is_subtree_root()) { - // try parent - dout(10) << "add_waiter " << tag << " " << c << " should be ATSUBTREEROOT, " << *this << " is not root, trying parent" << dendl; - inode->parent->dir->add_waiter(tag, c); - return; - } - } - - MDSCacheObject::add_waiter(tag, c); -} - - - -/* NOTE: this checks dentry waiters too */ -void CDir::take_waiting(int mask, list& ls) -{ - if (mask & WAIT_DENTRY) { - // take each each dentry waiter - hash_map >::iterator it = - waiting_on_dentry.begin(); - while (it != waiting_on_dentry.end()) { - take_dentry_waiting((it++)->first, ls); // not post-inc - } - } - - // waiting - MDSCacheObject::take_waiting(mask, ls); -} - - -void CDir::finish_waiting(int mask, int result) -{ - dout(11) << "finish_waiting mask " << hex << mask << dec << " result " << result << " on " << *this << dendl; - - list finished; - take_waiting(mask, finished); - if (result < 0) - finish_contexts(finished, result); - else - cache->mds->queue_waiters(finished); -} - - - -// dirty/clean - -version_t CDir::pre_dirty(version_t min) -{ - if (min > projected_version) - projected_version = min; - ++projected_version; - dout(10) << "pre_dirty " << projected_version << dendl; - return projected_version; -} - -void CDir::_mark_dirty(LogSegment *ls) -{ - if (!state_test(STATE_DIRTY)) { - state_set(STATE_DIRTY); - dout(10) << "mark_dirty (was clean) " << *this << " version " << version << dendl; - get(PIN_DIRTY); - assert(ls); - } else { - dout(10) << "mark_dirty (already dirty) " << *this << " version " << version << dendl; - } - if (ls) - ls->dirty_dirfrags.push_back(&xlist_dirty); -} - -void CDir::mark_dirty(version_t pv, LogSegment *ls) -{ - assert(version < pv); - version = pv; - _mark_dirty(ls); -} - -void CDir::mark_clean() -{ - dout(10) << "mark_clean " << *this << " version " << version << dendl; - if (state_test(STATE_DIRTY)) { - state_clear(STATE_DIRTY); - put(PIN_DIRTY); - - xlist_dirty.remove_myself(); - } -} - - - - -void 
CDir::first_get() -{ - inode->get(CInode::PIN_DIRFRAG); -} - -void CDir::last_put() -{ - inode->put(CInode::PIN_DIRFRAG); -} - - - -/****************************************************************************** - * FETCH and COMMIT - */ - -// ----------------------- -// FETCH - -class C_Dir_Fetch : public Context { - protected: - CDir *dir; - public: - bufferlist bl; - - C_Dir_Fetch(CDir *d) : dir(d) { } - void finish(int result) { - dir->_fetched(bl); - } -}; - -void CDir::fetch(Context *c, bool ignore_authpinnability) -{ - dout(10) << "fetch on " << *this << dendl; - - assert(is_auth()); - assert(!is_complete()); - - if (!can_auth_pin() && !ignore_authpinnability) { - dout(7) << "fetch waiting for authpinnable" << dendl; - add_waiter(WAIT_UNFREEZE, c); - return; - } - - if (c) add_waiter(WAIT_COMPLETE, c); - - // already fetching? - if (state_test(CDir::STATE_FETCHING)) { - dout(7) << "already fetching; waiting" << dendl; - return; - } - - auth_pin(); - state_set(CDir::STATE_FETCHING); - - if (cache->mds->logger) cache->mds->logger->inc("dir_f"); - - // start by reading the first hunk of it - C_Dir_Fetch *fin = new C_Dir_Fetch(this); - cache->mds->objecter->read( get_ondisk_object(), - 0, 0, // whole object - cache->mds->objecter->osdmap->file_to_object_layout( get_ondisk_object(), - g_OSD_MDDirLayout ), - &fin->bl, - fin ); -} - -void CDir::_fetched(bufferlist &bl) -{ - dout(10) << "_fetched " << bl.length() - << " bytes for " << *this - << dendl; - - assert(is_auth()); - assert(!is_frozen()); - - // decode. 
- int len = bl.length(); - int off = 0; - version_t got_version; - - ::_decode(got_version, bl, off); - - dout(10) << "_fetched version " << got_version - << ", " << len << " bytes" - << dendl; - - int32_t n; - ::_decode(n, bl, off); - - //int num_new_inodes_loaded = 0; - - for (int i=0; iget_inode() == 0) { - dout(12) << "_fetched had NEG dentry " << *dn << dendl; - } else { - dout(12) << "_fetched had dentry " << *dn << dendl; - } - } else { - // (remote) link - dn = add_remote_dentry(dname, ino, d_type); - - // link to inode? - CInode *in = cache->get_inode(ino); // we may or may not have it. - if (in) { - dn->link_remote(in); - dout(12) << "_fetched got remote link " << ino << " which we have " << *in << dendl; - } else { - dout(12) << "_fetched got remote link " << ino << " (dont' have it)" << dendl; - } - } - } - else if (type == 'I') { - // inode - - // parse out inode - inode_t inode; - ::_decode(inode, bl, off); - - string symlink; - if (inode.is_symlink()) - ::_decode(symlink, bl, off); - - fragtree_t fragtree; - fragtree._decode(bl, off); - - if (dn) { - if (dn->get_inode() == 0) { - dout(12) << "_fetched had NEG dentry " << *dn << dendl; - } else { - dout(12) << "_fetched had dentry " << *dn << dendl; - } - } else { - // add inode - CInode *in = 0; - if (cache->have_inode(inode.ino)) { - in = cache->get_inode(inode.ino); - dout(-12) << "_fetched got (but i already had) " << *in - << " mode " << in->inode.mode - << " mtime " << in->inode.mtime << dendl; - assert(0); // this shouldn't happen!! - } else { - // inode - in = new CInode(cache); - in->inode = inode; - - // symlink? 
- if (in->is_symlink()) - in->symlink = symlink; - - // dirfragtree - in->dirfragtree.swap(fragtree); - - // add - cache->add_inode( in ); - - // link - dn = add_primary_dentry(dname, in); - dout(12) << "_fetched got " << *dn << " " << *in << dendl; - - //in->hack_accessed = false; - //in->hack_load_stamp = g_clock.now(); - //num_new_inodes_loaded++; - } - } - } else { - dout(1) << "corrupt directory, i got tag char '" << type << "' val " << (int)(type) - << " at pos " << off << dendl; - assert(0); - } - - // make note of dentry position in the directory - dn->dir_offset = dn_offset; - - /** clean underwater item? - * Underwater item is something that is dirty in our cache from - * journal replay, but was previously flushed to disk before the - * mds failed. - * - * We only do this is committed_version == 0. that implies either - * - this is a fetch after from a clean/empty CDir is created - * (and has no effect, since the dn won't exist); or - * - this is a fetch after _recovery_, which is what we're worried - * about. Items that are marked dirty from the journal should be - * marked clean if they appear on disk. - */ - if (committed_version == 0 && - dn && - dn->get_version() <= got_version && - dn->is_dirty()) { - dout(10) << "_fetched had underwater dentry " << *dn << ", marking clean" << dendl; - dn->mark_clean(); - - if (dn->get_inode()) { - assert(dn->get_inode()->get_version() <= got_version); - dout(10) << "_fetched had underwater inode " << *dn->get_inode() << ", marking clean" << dendl; - dn->get_inode()->mark_clean(); - } - } - } - //assert(off == len); no, directories may shrink. add this back in when we properly truncate objects on write. - - // take the loaded version? - // only if we are a fresh CDir* with no prior state. 
- if (version == 0) { - assert(projected_version == 0); - assert(!state_test(STATE_COMMITTING)); - projected_version = version = committing_version = committed_version = got_version; - } - - //cache->mds->logger->inc("newin", num_new_inodes_loaded); - //hack_num_accessed = 0; - - // mark complete, !fetching - state_set(STATE_COMPLETE); - state_clear(STATE_FETCHING); - auth_unpin(); - - // kick waiters - finish_waiting(WAIT_COMPLETE, 0); -} - - - -// ----------------------- -// COMMIT - -/** - * commit - * - * @param want - min version i want committed - * @param c - callback for completion - */ -void CDir::commit(version_t want, Context *c) -{ - dout(10) << "commit want " << want << " on " << *this << dendl; - if (want == 0) want = version; - - // preconditions - assert(want <= version || version == 0); // can't commit the future - assert(want > committed_version); // the caller is stupid - assert(is_auth()); - assert(can_auth_pin()); - - // note: queue up a noop if necessary, so that we always - // get an auth_pin. - if (!c) - c = new C_NoopContext; - - // auth_pin on first waiter - if (waiting_for_commit.empty()) - auth_pin(); - waiting_for_commit[want].push_back(c); - - // ok. - _commit(want); -} - - -class C_Dir_RetryCommit : public Context { - CDir *dir; - version_t want; -public: - C_Dir_RetryCommit(CDir *d, version_t v) : - dir(d), want(v) { } - void finish(int r) { - dir->_commit(want); - } -}; - -class C_Dir_Committed : public Context { - CDir *dir; - version_t version; -public: - C_Dir_Committed(CDir *d, version_t v) : dir(d), version(v) { } - void finish(int r) { - dir->_committed(version); - } -}; - -void CDir::_commit(version_t want) -{ - dout(10) << "_commit want " << want << " on " << *this << dendl; - - // we can't commit things in the future. - // (even the projected future.) - assert(want <= version || version == 0); - - // check pre+postconditions. - assert(is_auth()); - - // already committed? 
- if (committed_version >= want) { - dout(10) << "already committed " << committed_version << " >= " << want << dendl; - return; - } - // already committing >= want? - if (committing_version >= want) { - dout(10) << "already committing " << committing_version << " >= " << want << dendl; - assert(state_test(STATE_COMMITTING)); - return; - } - - // complete? - if (!is_complete()) { - dout(7) << "commit not complete, fetching first" << dendl; - if (cache->mds->logger) cache->mds->logger->inc("dir_ffc"); - fetch(new C_Dir_RetryCommit(this, want)); - return; - } - - // commit. - committing_version = version; - - // mark committing (if not already) - if (!state_test(STATE_COMMITTING)) { - dout(10) << "marking committing" << dendl; - state_set(STATE_COMMITTING); - } - - if (cache->mds->logger) cache->mds->logger->inc("dir_c"); - - // encode - bufferlist bl; - - ::_encode(version, bl); - int32_t n = nitems; - ::_encode(n, bl); - - for (map_t::iterator it = items.begin(); - it != items.end(); - it++) { - CDentry *dn = it->second; - - if (dn->is_null()) - continue; // skip negative entries - - n--; - - // primary or remote? - if (dn->is_remote()) { - inodeno_t ino = dn->get_remote_ino(); - unsigned char d_type = dn->get_remote_d_type(); - dout(14) << " pos " << bl.length() << " dn '" << it->first << "' remote ino " << ino << dendl; - - // marker, name, ino - bl.append( "L", 1 ); // remote link - ::_encode(it->first, bl); - ::_encode(ino, bl); - ::_encode(d_type, bl); - } else { - // primary link - CInode *in = dn->get_inode(); - assert(in); - - dout(14) << " pos " << bl.length() << " dn '" << it->first << "' inode " << *in << dendl; - - // marker, name, inode, [symlink string] - bl.append( "I", 1 ); // inode - ::_encode(it->first, bl); - ::_encode(in->inode, bl); - - if (in->is_symlink()) { - // include symlink destination! 
- dout(18) << " inlcuding symlink ptr " << in->symlink << dendl; - ::_encode(in->symlink, bl); - } - - in->dirfragtree._encode(bl); - } - } - assert(n == 0); - - // write it. - cache->mds->objecter->write( get_ondisk_object(), - 0, bl.length(), - cache->mds->objecter->osdmap->file_to_object_layout( get_ondisk_object(), - g_OSD_MDDirLayout ), - bl, - NULL, new C_Dir_Committed(this, version) ); -} - - -/** - * _committed - * - * @param v version i just committed - */ -void CDir::_committed(version_t v) -{ - dout(10) << "_committed v " << v << " on " << *this << dendl; - assert(is_auth()); - - // take note. - assert(v > committed_version); - assert(v <= committing_version); - committed_version = v; - - // _all_ commits done? - if (committing_version == committed_version) - state_clear(CDir::STATE_COMMITTING); - - // dir clean? - if (committed_version == version) - mark_clean(); - - // dentries clean? - for (map_t::iterator it = items.begin(); - it != items.end(); ) { - CDentry *dn = it->second; - it++; - - // dentry - if (committed_version >= dn->get_version()) { - if (dn->is_dirty()) { - dout(15) << " dir " << committed_version << " >= dn " << dn->get_version() << " now clean " << *dn << dendl; - dn->mark_clean(); - } - } else { - dout(15) << " dir " << committed_version << " < dn " << dn->get_version() << " still dirty " << *dn << dendl; - } - - // inode? - if (dn->is_primary()) { - CInode *in = dn->get_inode(); - assert(in); - assert(in->is_auth()); - - if (committed_version >= in->get_version()) { - if (in->is_dirty()) { - dout(15) << " dir " << committed_version << " >= inode " << in->get_version() << " now clean " << *in << dendl; - in->mark_clean(); - } - } else { - dout(15) << " dir " << committed_version << " < inode " << in->get_version() << " still dirty " << *in << dendl; - assert(in->is_dirty()); - } - } - } - - // finishers? 
- bool were_waiters = !waiting_for_commit.empty(); - - map >::iterator p = waiting_for_commit.begin(); - while (p != waiting_for_commit.end()) { - map >::iterator n = p; - n++; - if (p->first > committed_version) break; // haven't committed this far yet. - cache->mds->queue_waiters(p->second); - waiting_for_commit.erase(p); - p = n; - } - - // unpin if we kicked the last waiter. - if (were_waiters && - waiting_for_commit.empty()) - auth_unpin(); -} - - - - - -// IMPORT/EXPORT - -void CDir::encode_export(bufferlist& bl) -{ - ::_encode_simple(version, bl); - ::_encode_simple(committed_version, bl); - ::_encode_simple(committed_version_equivalent, bl); - - ::_encode_simple(state, bl); - ::_encode_simple(dir_rep, bl); - - ::_encode_simple(pop_me, bl); - ::_encode_simple(pop_auth_subtree, bl); - - ::_encode_simple(dir_rep_by, bl); - ::_encode_simple(replica_map, bl); - - get(PIN_TEMPEXPORTING); -} - -void CDir::finish_export(utime_t now) -{ - pop_auth_subtree_nested -= pop_auth_subtree; - pop_me.zero(now); - pop_auth_subtree.zero(now); - put(PIN_TEMPEXPORTING); -} - -void CDir::decode_import(bufferlist::iterator& blp) -{ - ::_decode_simple(version, blp); - ::_decode_simple(committed_version, blp); - ::_decode_simple(committed_version_equivalent, blp); - committing_version = committed_version; - projected_version = version; - - unsigned s; - ::_decode_simple(s, blp); - state &= MASK_STATE_IMPORT_KEPT; - state |= (s & MASK_STATE_EXPORTED); - if (is_dirty()) get(PIN_DIRTY); - - ::_decode_simple(dir_rep, blp); - - ::_decode_simple(pop_me, blp); - ::_decode_simple(pop_auth_subtree, blp); - pop_auth_subtree_nested += pop_auth_subtree; - - ::_decode_simple(dir_rep_by, blp); - ::_decode_simple(replica_map, blp); - if (!replica_map.empty()) get(PIN_REPLICATED); - - replica_nonce = 0; // no longer defined -} - - - - -/******************************** - * AUTHORITY - */ - -/* - * if dir_auth.first == parent, auth is same as inode. 
- * unless .second != unknown, in which case that sticks. - */ -pair CDir::authority() -{ - if (is_subtree_root()) - return dir_auth; - else - return inode->authority(); -} - -/** is_subtree_root() - * true if this is an auth delegation point. - * that is, dir_auth != default (parent,unknown) - * - * some key observations: - * if i am auth: - * - any region bound will be an export, or frozen. - * - * note that this DOES heed dir_auth.pending - */ -/* -bool CDir::is_subtree_root() -{ - if (dir_auth == CDIR_AUTH_DEFAULT) { - //dout(10) << "is_subtree_root false " << dir_auth << " != " << CDIR_AUTH_DEFAULT - //<< " on " << ino() << dendl; - return false; - } else { - //dout(10) << "is_subtree_root true " << dir_auth << " != " << CDIR_AUTH_DEFAULT - //<< " on " << ino() << dendl; - return true; - } -} -*/ - -/** contains(x) - * true if we are x, or an ancestor of x - */ -bool CDir::contains(CDir *x) -{ - while (1) { - if (x == this) return true; - x = x->get_parent_dir(); - if (x == 0) return false; - } -} - - - -/** set_dir_auth - */ -void CDir::set_dir_auth(pair a) -{ - dout(10) << "setting dir_auth=" << a - << " from " << dir_auth - << " on " << *this << dendl; - - bool was_subtree = is_subtree_root(); - bool was_ambiguous = dir_auth.second >= 0; - - // set it. - dir_auth = a; - - // new subtree root? - if (!was_subtree && is_subtree_root()) { - dout(10) << " new subtree root, adjusting auth_pins" << dendl; - - // adjust nested auth pins - inode->adjust_nested_auth_pins(-get_cum_auth_pins()); - - // unpin parent of frozen dir/tree? - if (inode->is_auth() && (is_frozen_tree_root() || is_frozen_dir())) - inode->auth_unpin(); - } - if (was_subtree && !is_subtree_root()) { - dout(10) << " old subtree root, adjusting auth_pins" << dendl; - - // adjust nested auth pins - inode->adjust_nested_auth_pins(get_cum_auth_pins()); - - // pin parent of frozen dir/tree? 
- if (inode->is_auth() && (is_frozen_tree_root() || is_frozen_dir())) - inode->auth_pin(); - } - - // newly single auth? - if (was_ambiguous && dir_auth.second == CDIR_AUTH_UNKNOWN) { - list ls; - take_waiting(WAIT_SINGLEAUTH, ls); - cache->mds->queue_waiters(ls); - } -} - - -/***************************************** - * AUTH PINS and FREEZING - * - * the basic plan is that auth_pins only exist in auth regions, and they - * prevent a freeze (and subsequent auth change). - * - * however, we also need to prevent a parent from freezing if a child is frozen. - * for that reason, the parent inode of a frozen directory is auth_pinned. - * - * the oddity is when the frozen directory is a subtree root. if that's the case, - * the parent inode isn't frozen. which means that when subtree authority is adjusted - * at the bounds, inodes for any frozen bound directories need to get auth_pins at that - * time. - * - */ - -void CDir::auth_pin() -{ - if (auth_pins == 0) - get(PIN_AUTHPIN); - auth_pins++; - - dout(10) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << dendl; - - // nest pins? - if (is_subtree_root()) return; // no. - //assert(!is_import()); - - inode->adjust_nested_auth_pins(1); -} - -void CDir::auth_unpin() -{ - auth_pins--; - if (auth_pins == 0) - put(PIN_AUTHPIN); - - dout(10) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << dendl; - assert(auth_pins >= 0); - - maybe_finish_freeze(); // pending freeze? - - // nest? - if (is_subtree_root()) return; // no. - //assert(!is_import()); - - inode->adjust_nested_auth_pins(-1); -} - -void CDir::adjust_nested_auth_pins(int inc) -{ - nested_auth_pins += inc; - - dout(15) << "adjust_nested_auth_pins " << inc << " on " << *this - << " count now " << auth_pins << " + " << nested_auth_pins << dendl; - assert(nested_auth_pins >= 0); - - maybe_finish_freeze(); // pending freeze? - - // adjust my inode? - if (is_subtree_root()) - return; // no, stop. 
- - // yes. - inode->adjust_nested_auth_pins(inc); -} - - - -/***************************************************************************** - * FREEZING - */ - -// FREEZE TREE - -bool CDir::freeze_tree() -{ - assert(!is_frozen()); - assert(!is_freezing()); - - auth_pin(); - if (is_freezeable(true)) { - _freeze_tree(); - auth_unpin(); - return true; - } else { - state_set(STATE_FREEZINGTREE); - dout(10) << "freeze_tree waiting " << *this << dendl; - return false; - } -} - -void CDir::_freeze_tree() -{ - dout(10) << "_freeze_tree " << *this << dendl; - assert(is_freezeable(true)); - - // twiddle state - state_clear(STATE_FREEZINGTREE); // actually, this may get set again by next context? - state_set(STATE_FROZENTREE); - get(PIN_FROZEN); - - // auth_pin inode for duration of freeze, if we are not a subtree root. - if (is_auth() && !is_subtree_root()) - inode->auth_pin(); -} - -void CDir::unfreeze_tree() -{ - dout(10) << "unfreeze_tree " << *this << dendl; - - if (state_test(STATE_FROZENTREE)) { - // frozen. unfreeze. - state_clear(STATE_FROZENTREE); - put(PIN_FROZEN); - - // unpin (may => FREEZEABLE) FIXME: is this order good? - if (is_auth() && !is_subtree_root()) - inode->auth_unpin(); - - // waiters? - finish_waiting(WAIT_UNFREEZE); - } else { - finish_waiting(WAIT_FROZEN, -1); - - // freezing. stop it. 
- assert(state_test(STATE_FREEZINGTREE)); - state_clear(STATE_FREEZINGTREE); - auth_unpin(); - - finish_waiting(WAIT_UNFREEZE); - } -} - -bool CDir::is_freezing_tree() -{ - CDir *dir = this; - while (1) { - if (dir->is_freezing_tree_root()) return true; - if (dir->is_subtree_root()) return false; - if (dir->inode->parent) - dir = dir->inode->parent->dir; - else - return false; // root on replica - } -} - -bool CDir::is_frozen_tree() -{ - CDir *dir = this; - while (1) { - if (dir->is_frozen_tree_root()) return true; - if (dir->is_subtree_root()) return false; - if (dir->inode->parent) - dir = dir->inode->parent->dir; - else - return false; // root on replica - } -} - -CDir *CDir::get_frozen_tree_root() -{ - assert(is_frozen()); - CDir *dir = this; - while (1) { - if (dir->is_frozen_tree_root()) - return dir; - if (dir->inode->parent) - dir = dir->inode->parent->dir; - else - assert(0); - } -} - - - -// FREEZE DIR - -bool CDir::freeze_dir() -{ - assert(!is_frozen()); - assert(!is_freezing()); - - auth_pin(); - if (is_freezeable_dir(true)) { - _freeze_dir(); - auth_unpin(); - return true; - } else { - state_set(STATE_FREEZINGDIR); - dout(10) << "freeze_dir + wait " << *this << dendl; - return false; - } -} - -void CDir::_freeze_dir() -{ - dout(10) << "_freeze_dir " << *this << dendl; - assert(is_freezeable_dir(true)); - - state_clear(STATE_FREEZINGDIR); - state_set(STATE_FROZENDIR); - get(PIN_FROZEN); - - if (is_auth() && !is_subtree_root()) - inode->auth_pin(); // auth_pin for duration of freeze -} - - -void CDir::unfreeze_dir() -{ - dout(10) << "unfreeze_dir " << *this << dendl; - - if (state_test(STATE_FROZENDIR)) { - state_clear(STATE_FROZENDIR); - put(PIN_FROZEN); - - // unpin (may => FREEZEABLE) FIXME: is this order good? - if (is_auth() && !is_subtree_root()) - inode->auth_unpin(); - - finish_waiting(WAIT_UNFREEZE); - } else { - finish_waiting(WAIT_FROZEN, -1); - - // still freezing. stop. 
- assert(state_test(STATE_FREEZINGDIR)); - state_clear(STATE_FREEZINGDIR); - auth_unpin(); - - finish_waiting(WAIT_UNFREEZE); - } -} - - - - - - - - diff --git a/trunk/ceph/mds/CDir.h b/trunk/ceph/mds/CDir.h deleted file mode 100644 index 99bad3801e130..0000000000000 --- a/trunk/ceph/mds/CDir.h +++ /dev/null @@ -1,540 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __CDIR_H -#define __CDIR_H - -#include "include/types.h" -#include "include/buffer.h" -#include "mdstypes.h" -#include "config.h" -#include "common/DecayCounter.h" - -#include -#include - -#include -#include -#include -#include -using namespace std; - -#include -using __gnu_cxx::hash_map; - - -#include "CInode.h" - -class CDentry; -class MDCache; -class MDCluster; -class Context; -class CDirDiscover; - - -ostream& operator<<(ostream& out, class CDir& dir); - - -class CDir : public MDSCacheObject { - public: - // -- pins -- - static const int PIN_DNWAITER = 1; - static const int PIN_INOWAITER = 2; - static const int PIN_CHILD = 3; - static const int PIN_FROZEN = 4; - static const int PIN_SUBTREE = 5; - static const int PIN_IMPORTING = 7; - static const int PIN_IMPORTBOUND = 9; - static const int PIN_EXPORTBOUND = 10; - static const int PIN_STICKY = 11; - static const int PIN_SUBTREETEMP = 12; // used by MDCache::trim_non_auth() - const char *pin_name(int p) { - switch (p) { - case PIN_DNWAITER: return "dnwaiter"; - case PIN_INOWAITER: return "inowaiter"; - case PIN_CHILD: return "child"; - case PIN_FROZEN: return "frozen"; - case PIN_SUBTREE: return "subtree"; - case PIN_IMPORTING: return "importing"; - case 
PIN_IMPORTBOUND: return "importbound"; - case PIN_EXPORTBOUND: return "exportbound"; - case PIN_STICKY: return "sticky"; - case PIN_SUBTREETEMP: return "subtreetemp"; - default: return generic_pin_name(p); - } - } - - // -- state -- - static const unsigned STATE_COMPLETE = (1<< 1); // the complete contents are in cache - static const unsigned STATE_FROZENTREE = (1<< 2); // root of tree (bounded by exports) - static const unsigned STATE_FREEZINGTREE = (1<< 3); // in process of freezing - static const unsigned STATE_FROZENDIR = (1<< 4); - static const unsigned STATE_FREEZINGDIR = (1<< 5); - static const unsigned STATE_COMMITTING = (1<< 6); // mid-commit - static const unsigned STATE_FETCHING = (1<< 7); // currenting fetching - static const unsigned STATE_IMPORTBOUND = (1<<10); - static const unsigned STATE_EXPORTBOUND = (1<<11); - static const unsigned STATE_EXPORTING = (1<<12); - static const unsigned STATE_IMPORTING = (1<<13); - static const unsigned STATE_FRAGMENTING = (1<<14); - static const unsigned STATE_STICKY = (1<<15); // sticky pin due to inode stickydirs - static const unsigned STATE_DNPINNEDFRAG = (1<<16); // dir is refragmenting - - // common states - static const unsigned STATE_CLEAN = 0; - static const unsigned STATE_INITIAL = 0; - - // these state bits are preserved by an import/export - // ...except if the directory is hashed, in which case none of them are! 
- static const unsigned MASK_STATE_EXPORTED = - (STATE_COMPLETE|STATE_DIRTY); - static const unsigned MASK_STATE_IMPORT_KEPT = - ( - STATE_IMPORTING - |STATE_IMPORTBOUND|STATE_EXPORTBOUND - |STATE_FROZENTREE - |STATE_STICKY); - static const unsigned MASK_STATE_EXPORT_KEPT = - (STATE_EXPORTING - |STATE_IMPORTBOUND|STATE_EXPORTBOUND - |STATE_FROZENTREE - |STATE_FROZENDIR - |STATE_STICKY); - static const unsigned MASK_STATE_FRAGMENT_KEPT = - (STATE_DIRTY | - STATE_COMPLETE | - STATE_EXPORTBOUND | - STATE_IMPORTBOUND); - - // -- rep spec -- - static const int REP_NONE = 0; - static const int REP_ALL = 1; - static const int REP_LIST = 2; - - - static const int NONCE_EXPORT = 1; - - - // -- wait masks -- - static const int WAIT_DENTRY = (1<<0); // wait for item to be in cache - static const int WAIT_COMPLETE = (1<<1); // wait for complete dir contents - static const int WAIT_FROZEN = (1<<2); // auth pins removed - - static const int WAIT_DNLOCK_OFFSET = 4; - - static const int WAIT_ANY = (0xffffffff); - static const int WAIT_ATFREEZEROOT = (WAIT_UNFREEZE); - static const int WAIT_ATSUBTREEROOT = (WAIT_SINGLEAUTH); - - - - - public: - // context - MDCache *cache; - - CInode *inode; // my inode - frag_t frag; // my frag - - bool is_lt(const MDSCacheObject *r) const { - return dirfrag() < ((const CDir*)r)->dirfrag(); - } - - //int hack_num_accessed; - -public: - //typedef hash_map map_t; // there is a bug somewhere, valgrind me. 
- typedef map map_t; -protected: - // contents - map_t items; // non-null AND null - unsigned nitems; // # non-null - unsigned nnull; // # null - - int num_dirty; - - - - // state - version_t version; - version_t committing_version; - version_t committed_version; - version_t committed_version_equivalent; // in case of, e.g., temporary file - version_t projected_version; - - xlist::item xlist_dirty; - - // lock nesting, freeze - int auth_pins; - int nested_auth_pins; - int request_pins; - - // cache control (defined for authority; hints for replicas) - int dir_rep; - set dir_rep_by; // if dir_rep == REP_LIST - - // popularity - dirfrag_load_vec_t pop_me; - dirfrag_load_vec_t pop_nested; - dirfrag_load_vec_t pop_auth_subtree; - dirfrag_load_vec_t pop_auth_subtree_nested; - - utime_t last_popularity_sample; - - load_spread_t pop_spread; - - // and to provide density - int num_dentries_nested; - int num_dentries_auth_subtree; - int num_dentries_auth_subtree_nested; - - - // friends - friend class Migrator; - friend class CInode; - friend class MDCache; - friend class MDiscover; - friend class MDBalancer; - - friend class CDirDiscover; - friend class CDirExport; - - public: - CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth); - - - - // -- accessors -- - inodeno_t ino() const { return inode->ino(); } // deprecate me? 
- frag_t get_frag() const { return frag; } - dirfrag_t dirfrag() const { return dirfrag_t(inode->ino(), frag); } - - CInode *get_inode() { return inode; } - CDir *get_parent_dir() { return inode->get_parent_dir(); } - - map_t::iterator begin() { return items.begin(); } - map_t::iterator end() { return items.end(); } - unsigned get_size() { - return nitems; - } - unsigned get_nitems() { return nitems; } - unsigned get_nnull() { return nnull; } - - void inc_num_dirty() { num_dirty++; } - void dec_num_dirty() { - assert(num_dirty > 0); - num_dirty--; - } - int get_num_dirty() { - return num_dirty; - } - - - // -- dentries and inodes -- - public: - CDentry* lookup(const string& n) { - map_t::iterator iter = items.find(n); - if (iter == items.end()) - return 0; - else - return iter->second; - } - - CDentry* add_null_dentry(const string& dname); - CDentry* add_primary_dentry(const string& dname, CInode *in); - CDentry* add_remote_dentry(const string& dname, inodeno_t ino, unsigned char d_type); - void remove_dentry( CDentry *dn ); // delete dentry - void link_remote_inode( CDentry *dn, inodeno_t ino, unsigned char d_type); - void link_primary_inode( CDentry *dn, CInode *in ); - void unlink_inode( CDentry *dn ); - void try_remove_unlinked_dn(CDentry *dn); -private: - void link_inode_work( CDentry *dn, CInode *in ); - void unlink_inode_work( CDentry *dn ); - void remove_null_dentries(); - -public: - void split(int bits, list& subs, list& waiters); - void merge(int bits, list& waiters); -private: - void steal_dentry(CDentry *dn); // from another dir. used by merge/split. 
- void purge_stolen(list& waiters); - void init_fragment_pins(); - - - // -- authority -- - /* - * normal: !subtree_root - * delegation: subtree_root - * ambiguous: subtree_root - * subtree_root - */ - pair dir_auth; - - public: - pair authority(); - pair get_dir_auth() { return dir_auth; } - void set_dir_auth(pair a); - void set_dir_auth(int a) { set_dir_auth(pair(a, CDIR_AUTH_UNKNOWN)); } - bool is_ambiguous_dir_auth() { - return dir_auth.second != CDIR_AUTH_UNKNOWN; - } - bool is_full_dir_auth() { - return is_auth() && !is_ambiguous_dir_auth(); - } - bool is_full_dir_nonauth() { - return !is_auth() && !is_ambiguous_dir_auth(); - } - - bool is_subtree_root() { - return dir_auth != CDIR_AUTH_DEFAULT; - } - - bool contains(CDir *x); // true if we are x or an ancestor of x - - - // for giving to clients - void get_dist_spec(set& ls, int auth) { - if (is_rep()) { - for (map::iterator p = replicas_begin(); - p != replicas_end(); - ++p) - ls.insert(p->first); - if (!ls.empty()) - ls.insert(auth); - } - } - - CDirDiscover *replicate_to(int mds); - - - // -- state -- - bool is_complete() { return state & STATE_COMPLETE; } - bool is_exporting() { return state & STATE_EXPORTING; } - bool is_importing() { return state & STATE_IMPORTING; } - - int get_dir_rep() { return dir_rep; } - bool is_rep() { - if (dir_rep == REP_NONE) return false; - return true; - } - - // -- fetch -- - object_t get_ondisk_object() { return object_t(ino(), frag); } - void fetch(Context *c, bool ignore_authpinnability=false); - void _fetched(bufferlist &bl); - - // -- commit -- - map > waiting_for_commit; - - void commit_to(version_t want); - void commit(version_t want, Context *c); - void _commit(version_t want); - void _committed(version_t v); - void wait_for_commit(Context *c, version_t v=0); - - // -- dirtyness -- - version_t get_version() { return version; } - void set_version(version_t v) { projected_version = version = v; } - version_t get_projected_version() { return projected_version; } - 
version_t get_committing_version() { return committing_version; } - version_t get_committed_version() { return committed_version; } - version_t get_committed_version_equivalent() { return committed_version_equivalent; } - void set_committed_version(version_t v) { committed_version = v; } - - version_t pre_dirty(version_t min=0); - void _mark_dirty(LogSegment *ls); - void mark_dirty(version_t pv, LogSegment *ls); - void mark_clean(); - void mark_complete() { state_set(STATE_COMPLETE); } - - - // -- reference counting -- - void first_get(); - void last_put(); - - void request_pin_get() { - if (request_pins == 0) get(PIN_REQUEST); - request_pins++; - } - void request_pin_put() { - request_pins--; - if (request_pins == 0) put(PIN_REQUEST); - } - - - // -- waiters -- -protected: - hash_map< string, list > waiting_on_dentry; - hash_map< inodeno_t, list > waiting_on_ino; - -public: - bool is_waiting_for_dentry(const string& dn) { - return waiting_on_dentry.count(dn); - } - void add_dentry_waiter(const string& dentry, Context *c); - void take_dentry_waiting(const string& dentry, list& ls); - - bool is_waiting_for_ino(inodeno_t ino) { - return waiting_on_ino.count(ino); - } - void add_ino_waiter(inodeno_t ino, Context *c); - void take_ino_waiting(inodeno_t ino, list& ls); - - void take_sub_waiting(list& ls); // dentry or ino - - void add_waiter(int mask, Context *c); - void take_waiting(int mask, list& ls); // may include dentry waiters - void finish_waiting(int mask, int result = 0); // ditto - - - // -- import/export -- - void encode_export(bufferlist& bl); - void finish_export(utime_t now); - void abort_export() { - put(PIN_TEMPEXPORTING); - } - void decode_import(bufferlist::iterator& blp); - - - // -- auth pins -- - bool can_auth_pin() { return is_auth() && !(is_frozen() || is_freezing()); } - int is_auth_pinned() { return auth_pins; } - int get_cum_auth_pins() { return auth_pins + nested_auth_pins; } - int get_auth_pins() { return auth_pins; } - int 
get_nested_auth_pins() { return nested_auth_pins; } - void auth_pin(); - void auth_unpin(); - void adjust_nested_auth_pins(int inc); - - // -- freezing -- - bool freeze_tree(); - void _freeze_tree(); - void unfreeze_tree(); - - bool freeze_dir(); - void _freeze_dir(); - void unfreeze_dir(); - - void maybe_finish_freeze() { - if (auth_pins != 1 || nested_auth_pins != 0) - return; - if (state_test(STATE_FREEZINGTREE)) { - _freeze_tree(); - auth_unpin(); - finish_waiting(WAIT_FROZEN); - } - if (state_test(STATE_FREEZINGDIR)) { - _freeze_dir(); - auth_unpin(); - finish_waiting(WAIT_FROZEN); - } - } - - bool is_freezing() { return is_freezing_tree() || is_freezing_dir(); } - bool is_freezing_tree(); - bool is_freezing_tree_root() { return state & STATE_FREEZINGTREE; } - bool is_freezing_dir() { return state & STATE_FREEZINGDIR; } - - bool is_frozen() { return is_frozen_dir() || is_frozen_tree(); } - bool is_frozen_tree(); - bool is_frozen_tree_root() { return state & STATE_FROZENTREE; } - bool is_frozen_dir() { return state & STATE_FROZENDIR; } - - bool is_freezeable(bool freezing=false) { - // no nested auth pins. - if ((auth_pins-freezing) > 0 || nested_auth_pins > 0) - return false; - - // inode must not be frozen. - if (!is_subtree_root() && inode->is_frozen()) - return false; - - return true; - } - bool is_freezeable_dir(bool freezing=false) { - if ((auth_pins-freezing) > 0) - return false; - - // if not subtree root, inode must not be frozen (tree--frozen_dir is okay). 
- if (!is_subtree_root() && inode->is_frozen() && !inode->is_frozen_dir()) - return false; - - return true; - } - - CDir *get_frozen_tree_root(); - - - - ostream& print_db_line_prefix(ostream& out); - void print(ostream& out); -}; - - - -// -- encoded state -- - -// discover - -class CDirDiscover { - dirfrag_t dirfrag; - int nonce; - int dir_rep; - set rep_by; - - public: - CDirDiscover() {} - CDirDiscover(CDir *dir, int nonce) { - dirfrag = dir->dirfrag(); - this->nonce = nonce; - dir_rep = dir->dir_rep; - rep_by = dir->dir_rep_by; - } - - void update_dir(CDir *dir) { - assert(dir->dirfrag() == dirfrag); - assert(!dir->is_auth()); - - dir->replica_nonce = nonce; - dir->dir_rep = dir_rep; - dir->dir_rep_by = rep_by; - } - - dirfrag_t get_dirfrag() { return dirfrag; } - - void _encode(bufferlist& bl) { - bl.append((char*)&dirfrag, sizeof(dirfrag)); - bl.append((char*)&nonce, sizeof(nonce)); - bl.append((char*)&dir_rep, sizeof(dir_rep)); - ::_encode(rep_by, bl); - } - - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - bl.copy(off, sizeof(nonce), (char*)&nonce); - off += sizeof(nonce); - bl.copy(off, sizeof(dir_rep), (char*)&dir_rep); - off += sizeof(dir_rep); - ::_decode(rep_by, bl, off); - } - -}; - - - -#endif diff --git a/trunk/ceph/mds/CInode.cc b/trunk/ceph/mds/CInode.cc deleted file mode 100644 index 4320750c50eb8..0000000000000 --- a/trunk/ceph/mds/CInode.cc +++ /dev/null @@ -1,844 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#include "CInode.h" -#include "CDir.h" -#include "CDentry.h" - -#include "MDS.h" -#include "MDCache.h" -#include "AnchorTable.h" - -#include "LogSegment.h" - -#include "common/Clock.h" - -#include "messages/MLock.h" - -#include -#include - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " mds" << mdcache->mds->get_nodeid() << ".cache.ino(" << inode.ino << ") " - - -//int cinode_pins[CINODE_NUM_PINS]; // counts -ostream& CInode::print_db_line_prefix(ostream& out) -{ - return out << g_clock.now() << " mds" << mdcache->mds->get_nodeid() << ".cache.ino(" << inode.ino << ") "; -} - - - -ostream& operator<<(ostream& out, CInode& in) -{ - filepath path; - in.make_path(path); - out << "[inode " << in.inode.ino << " " << path << (in.is_dir() ? "/ ":" "); - if (in.is_auth()) { - out << "auth"; - if (in.is_replicated()) - out << in.get_replicas(); - } else { - out << "rep@" << in.authority(); - out << "." << in.get_replica_nonce(); - assert(in.get_replica_nonce() >= 0); - } - - if (in.is_symlink()) out << " symlink"; - if (in.is_dir() && !in.dirfragtree.empty()) out << " " << in.dirfragtree; - - out << " v" << in.get_version(); - - if (in.state_test(CInode::STATE_AMBIGUOUSAUTH)) out << " AMBIGAUTH"; - if (in.is_freezing_inode()) out << " FREEZING=" << in.auth_pin_freeze_allowance; - if (in.is_frozen_inode()) out << " FROZEN"; - - // locks - out << " " << in.authlock; - out << " " << in.linklock; - out << " " << in.dirfragtreelock; - out << " " << in.filelock; - out << " " << in.dirlock; - - // hack: spit out crap on which clients have caps - if (!in.get_client_caps().empty()) { - out << " caps={"; - for (map::iterator it = in.get_client_caps().begin(); - it != in.get_client_caps().end(); - it++) { - if (it != in.get_client_caps().begin()) out << ","; - out << it->first; - } - out << "}"; - } - - if (in.get_num_ref()) { - out << " |"; - in.print_pin_set(out); - } - - out << " " << ∈ - 
out << "]"; - return out; -} - - -void CInode::print(ostream& out) -{ - out << *this; -} - - -inode_t *CInode::project_inode() -{ - if (projected_inode.empty()) { - projected_inode.push_back(new inode_t(inode)); - } else { - projected_inode.push_back(new inode_t(*projected_inode.back())); - } - dout(15) << "project_inode " << projected_inode.back() << dendl; - return projected_inode.back(); -} - -void CInode::pop_and_dirty_projected_inode(LogSegment *ls) -{ - assert(!projected_inode.empty()); - dout(15) << "pop_and_dirty_projected_inode " << projected_inode.front() - << " v" << projected_inode.front()->version << dendl; - mark_dirty(projected_inode.front()->version, ls); - inode = *projected_inode.front(); - delete projected_inode.front(); - projected_inode.pop_front(); -} - - -// ====== CInode ======= - -// dirfrags - -frag_t CInode::pick_dirfrag(const string& dn) -{ - if (dirfragtree.empty()) - return frag_t(); // avoid the string hash if we can. - - static hash H; - return dirfragtree[H(dn)]; -} - -void CInode::get_dirfrags_under(frag_t fg, list& ls) -{ - list fglist; - dirfragtree.get_leaves_under(fg, fglist); - for (list::iterator p = fglist.begin(); - p != fglist.end(); - ++p) - if (dirfrags.count(*p)) - ls.push_back(dirfrags[*p]); -} - -CDir *CInode::get_approx_dirfrag(frag_t fg) -{ - CDir *dir = get_dirfrag(fg); - if (dir) return dir; - - // find a child? - list ls; - get_dirfrags_under(fg, ls); - if (!ls.empty()) - return ls.front(); - - // try parents? 
- while (1) { - fg = fg.parent(); - dir = get_dirfrag(fg); - if (dir) return dir; - } -} - -void CInode::get_dirfrags(list& ls) -{ - // all dirfrags - for (map::iterator p = dirfrags.begin(); - p != dirfrags.end(); - ++p) - ls.push_back(p->second); -} -void CInode::get_nested_dirfrags(list& ls) -{ - // dirfrags in same subtree - for (map::iterator p = dirfrags.begin(); - p != dirfrags.end(); - ++p) - if (!p->second->is_subtree_root()) - ls.push_back(p->second); -} -void CInode::get_subtree_dirfrags(list& ls) -{ - // dirfrags that are roots of new subtrees - for (map::iterator p = dirfrags.begin(); - p != dirfrags.end(); - ++p) - if (p->second->is_subtree_root()) - ls.push_back(p->second); -} - - -CDir *CInode::get_or_open_dirfrag(MDCache *mdcache, frag_t fg) -{ - assert(is_dir()); - - // have it? - CDir *dir = get_dirfrag(fg); - if (!dir) { - // create it. - assert(is_auth()); - dir = new CDir(this, fg, mdcache, true); - add_dirfrag(dir); - } - return dir; -} - -CDir *CInode::add_dirfrag(CDir *dir) -{ - assert(dirfrags.count(dir->dirfrag().frag) == 0); - dirfrags[dir->dirfrag().frag] = dir; - - if (stickydir_ref > 0) { - dir->state_set(CDir::STATE_STICKY); - dir->get(CDir::PIN_STICKY); - } - - return dir; -} - -void CInode::close_dirfrag(frag_t fg) -{ - dout(14) << "close_dirfrag " << fg << dendl; - assert(dirfrags.count(fg)); - - CDir *dir = dirfrags[fg]; - dir->remove_null_dentries(); - - // clear dirty flag - if (dir->is_dirty()) - dir->mark_clean(); - - if (stickydir_ref > 0) { - dir->state_clear(CDir::STATE_STICKY); - dir->put(CDir::PIN_STICKY); - } - - // dump any remaining dentries, for debugging purposes - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) - dout(14) << "close_dirfrag LEFTOVER dn " << *p->second << dendl; - - assert(dir->get_num_ref() == 0); - delete dir; - dirfrags.erase(fg); -} - -void CInode::close_dirfrags() -{ - while (!dirfrags.empty()) - close_dirfrag(dirfrags.begin()->first); -} - -bool 
CInode::has_subtree_root_dirfrag() -{ - for (map::iterator p = dirfrags.begin(); - p != dirfrags.end(); - ++p) - if (p->second->is_subtree_root()) - return true; - return false; -} - - -void CInode::get_stickydirs() -{ - if (stickydir_ref == 0) { - get(PIN_STICKYDIRS); - for (map::iterator p = dirfrags.begin(); - p != dirfrags.end(); - ++p) { - p->second->state_set(CDir::STATE_STICKY); - p->second->get(CDir::PIN_STICKY); - } - } - stickydir_ref++; -} - -void CInode::put_stickydirs() -{ - assert(stickydir_ref > 0); - stickydir_ref--; - if (stickydir_ref == 0) { - put(PIN_STICKYDIRS); - for (map::iterator p = dirfrags.begin(); - p != dirfrags.end(); - ++p) { - p->second->state_clear(CDir::STATE_STICKY); - p->second->put(CDir::PIN_STICKY); - } - } -} - - - - - -// pins - -void CInode::first_get() -{ - // pin my dentry? - if (parent) - parent->get(CDentry::PIN_INODEPIN); -} - -void CInode::last_put() -{ - // unpin my dentry? - if (parent) - parent->put(CDentry::PIN_INODEPIN); -} - -void CInode::add_remote_parent(CDentry *p) -{ - if (remote_parents.empty()) - get(PIN_REMOTEPARENT); - remote_parents.insert(p); -} -void CInode::remove_remote_parent(CDentry *p) -{ - remote_parents.erase(p); - if (remote_parents.empty()) - put(PIN_REMOTEPARENT); -} - - - - -CDir *CInode::get_parent_dir() -{ - if (parent) - return parent->dir; - return NULL; -} -CInode *CInode::get_parent_inode() -{ - if (parent) - return parent->dir->inode; - return NULL; -} - - - -void CInode::make_path_string(string& s) -{ - if (parent) { - parent->make_path_string(s); - } - else if (is_root()) { - s = ""; // root - } - else if (is_stray()) { - s = "~stray"; - char n[10]; - sprintf(n, "%d", (int)(ino()-MDS_INO_STRAY_OFFSET)); - s += n; - } - else { - s = "(dangling)"; // dangling - } -} - -void CInode::make_path(filepath& fp) -{ - if (parent) - parent->make_path(fp); - else - fp.set_ino(ino()); -} - -void CInode::make_anchor_trace(vector& trace) -{ - if (parent) { - 
parent->dir->inode->make_anchor_trace(trace); - trace.push_back(Anchor(ino(), parent->dir->dirfrag())); - dout(10) << "make_anchor_trace added " << trace.back() << dendl; - } - else - assert(is_root() || is_stray()); -} - -void CInode::name_stray_dentry(string& dname) -{ - char s[20]; -#ifdef __LP64__ - sprintf(s, "%lx", inode.ino.val); -#else - sprintf(s, "%llx", inode.ino.val); -#endif - dname = s; -} - - -version_t CInode::pre_dirty() -{ - assert(parent); - return parent->pre_dirty(); -} - -void CInode::_mark_dirty(LogSegment *ls) -{ - if (!state_test(STATE_DIRTY)) { - state_set(STATE_DIRTY); - get(PIN_DIRTY); - assert(ls); - } - - // move myself to this segment's dirty list - if (ls) - ls->dirty_inodes.push_back(&xlist_dirty); -} - -void CInode::mark_dirty(version_t pv, LogSegment *ls) { - - dout(10) << "mark_dirty " << *this << dendl; - - assert(parent); - - /* - NOTE: I may already be dirty, but this fn _still_ needs to be called so that - the directory is (perhaps newly) dirtied, and so that parent_dir_version is - updated below. - */ - - // only auth can get dirty. "dirty" async data in replicas is relative to - // filelock state, not the dirty flag. 
- assert(is_auth()); - - // touch my private version - assert(inode.version < pv); - inode.version = pv; - _mark_dirty(ls); - - // mark dentry too - parent->mark_dirty(pv, ls); -} - - -void CInode::mark_clean() -{ - dout(10) << " mark_clean " << *this << dendl; - if (state_test(STATE_DIRTY)) { - state_clear(STATE_DIRTY); - put(PIN_DIRTY); - - // remove myself from ls dirty list - xlist_dirty.remove_myself(); - } -} - - - -// ------------------ -// locking - -void CInode::set_object_info(MDSCacheObjectInfo &info) -{ - info.ino = ino(); -} - -void CInode::encode_lock_state(int type, bufferlist& bl) -{ - switch (type) { - case LOCK_OTYPE_IAUTH: - _encode(inode.ctime, bl); - _encode(inode.mode, bl); - _encode(inode.uid, bl); - _encode(inode.gid, bl); - break; - - case LOCK_OTYPE_ILINK: - _encode(inode.ctime, bl); - _encode(inode.nlink, bl); - _encode(inode.anchored, bl); - break; - - case LOCK_OTYPE_IDIRFRAGTREE: - { - // encode the raw tree - dirfragtree._encode(bl); - - // also specify which frags are mine - set myfrags; - list dfls; - get_dirfrags(dfls); - for (list::iterator p = dfls.begin(); p != dfls.end(); ++p) - if ((*p)->is_auth()) { - frag_t fg = (*p)->get_frag(); - myfrags.insert(fg); - } - _encode(myfrags, bl); - } - break; - - case LOCK_OTYPE_IFILE: - _encode(inode.size, bl); - _encode(inode.mtime, bl); - _encode(inode.atime, bl); - break; - - case LOCK_OTYPE_IDIR: - _encode(inode.mtime, bl); - if (0) { - map frag_sizes; - for (map::iterator p = dirfrags.begin(); - p != dirfrags.end(); - ++p) - if (p->second->is_auth()) { - //frag_t fg = (*p)->get_frag(); - //frag_sizes[f] = dirfrag_size[fg]; - } - _encode(frag_sizes, bl); - } - break; - - default: - assert(0); - } -} - -void CInode::decode_lock_state(int type, bufferlist& bl) -{ - int off = 0; - utime_t tm; - - switch (type) { - case LOCK_OTYPE_IAUTH: - _decode(tm, bl, off); - if (inode.ctime < tm) inode.ctime = tm; - _decode(inode.mode, bl, off); - _decode(inode.uid, bl, off); - _decode(inode.gid, bl, 
off); - break; - - case LOCK_OTYPE_ILINK: - _decode(tm, bl, off); - if (inode.ctime < tm) inode.ctime = tm; - _decode(inode.nlink, bl, off); - _decode(inode.anchored, bl, off); - break; - - case LOCK_OTYPE_IDIRFRAGTREE: - { - fragtree_t temp; - temp._decode(bl, off); - set authfrags; - _decode(authfrags, bl, off); - if (is_auth()) { - // auth. believe replica's auth frags only. - for (set::iterator p = authfrags.begin(); p != authfrags.end(); ++p) - dirfragtree.force_to_leaf(*p); - } else { - // replica. just take the tree. - dirfragtree.swap(temp); - } - } - break; - - case LOCK_OTYPE_IFILE: - _decode(inode.size, bl, off); - _decode(inode.mtime, bl, off); - _decode(inode.atime, bl, off); - break; - - case LOCK_OTYPE_IDIR: - //::_decode(inode.size, bl, off); - _decode(tm, bl, off); - if (inode.mtime < tm) { - inode.mtime = tm; - if (is_auth()) { - dout(10) << "decode_lock_state auth got mtime " << tm << " > my " << inode.mtime - << ", setting dirlock updated flag on " << *this - << dendl; - dirlock.set_updated(); - } - } - if (0) { - map dfsz; - ::_decode(dfsz, bl, off); - // hmm which to keep? 
- } - break; - - default: - assert(0); - } -} - -void CInode::clear_dirty_scattered(int type) -{ - dout(10) << "clear_dirty_scattered " << type << " on " << *this << dendl; - switch (type) { - case LOCK_OTYPE_IDIR: - xlist_dirty_inode_mtime.remove_myself(); - break; - default: - assert(0); - } -} - - - -// waiting - -bool CInode::is_frozen() -{ - if (is_frozen_inode()) return true; - if (parent && parent->dir->is_frozen()) return true; - return false; -} - -bool CInode::is_frozen_dir() -{ - if (parent && parent->dir->is_frozen_dir()) return true; - return false; -} - -bool CInode::is_freezing() -{ - if (is_freezing_inode()) return true; - if (parent && parent->dir->is_freezing()) return true; - return false; -} - -void CInode::add_waiter(int tag, Context *c) -{ - dout(10) << "add_waiter tag " << tag - << " !ambig " << !state_test(STATE_AMBIGUOUSAUTH) - << " !frozen " << !is_frozen_inode() - << " !freezing " << !is_freezing_inode() - << dendl; - // wait on the directory? - // make sure its not the inode that is explicitly ambiguous|freezing|frozen - if (((tag & WAIT_SINGLEAUTH) && !state_test(STATE_AMBIGUOUSAUTH)) || - ((tag & WAIT_UNFREEZE) && !is_frozen_inode() && !is_freezing_inode())) { - parent->dir->add_waiter(tag, c); - return; - } - MDSCacheObject::add_waiter(tag, c); -} - -bool CInode::freeze_inode(int auth_pin_allowance) -{ - assert(auth_pin_allowance > 0); // otherwise we need to adjust parent's nested_auth_pins - assert(auth_pins >= auth_pin_allowance); - if (auth_pins > auth_pin_allowance) { - dout(10) << "freeze_inode - waiting for auth_pins to drop to " << auth_pin_allowance << dendl; - auth_pin_freeze_allowance = auth_pin_allowance; - get(PIN_FREEZING); - state_set(STATE_FREEZING); - return false; - } - - dout(10) << "freeze_inode - frozen" << dendl; - assert(auth_pins == auth_pin_allowance); - get(PIN_FROZEN); - state_set(STATE_FROZEN); - return true; -} - -void CInode::unfreeze_inode(list& finished) -{ - dout(10) << "unfreeze_inode" << dendl; - if 
(state_test(STATE_FREEZING)) { - state_clear(STATE_FREEZING); - put(PIN_FREEZING); - } else if (state_test(STATE_FROZEN)) { - state_clear(STATE_FROZEN); - put(PIN_FROZEN); - } else - assert(0); - take_waiting(WAIT_UNFREEZE, finished); -} - - -// auth_pins -bool CInode::can_auth_pin() { - if (is_freezing_inode() || is_frozen_inode()) return false; - if (parent) - return parent->can_auth_pin(); - return true; -} - -void CInode::auth_pin() -{ - if (auth_pins == 0) - get(PIN_AUTHPIN); - auth_pins++; - - dout(10) << "auth_pin on " << *this - << " now " << auth_pins << "+" << nested_auth_pins - << dendl; - - if (parent) - parent->adjust_nested_auth_pins( 1 ); -} - -void CInode::auth_unpin() -{ - auth_pins--; - if (auth_pins == 0) - put(PIN_AUTHPIN); - - dout(10) << "auth_unpin on " << *this - << " now " << auth_pins << "+" << nested_auth_pins - << dendl; - - assert(auth_pins >= 0); - - if (parent) - parent->adjust_nested_auth_pins( -1 ); - - if (is_freezing_inode() && - auth_pins == auth_pin_freeze_allowance) { - dout(10) << "auth_unpin freezing!" << dendl; - get(PIN_FROZEN); - put(PIN_FREEZING); - state_clear(STATE_FREEZING); - state_set(STATE_FROZEN); - finish_waiting(WAIT_FROZEN); - } -} - -void CInode::adjust_nested_auth_pins(int a) -{ - if (!parent) return; - nested_auth_pins += a; - - dout(15) << "adjust_nested_auth_pins by " << a - << " now " << auth_pins << "+" << nested_auth_pins - << dendl; - assert(nested_auth_pins >= 0); - - parent->adjust_nested_auth_pins(a); -} - - - -// authority - -pair CInode::authority() -{ - if (inode_auth.first >= 0) - return inode_auth; - - if (parent) - return parent->dir->authority(); - - return CDIR_AUTH_UNDEF; -} - - -CInodeDiscover* CInode::replicate_to( int rep ) -{ - assert(is_auth()); - - // relax locks? 
- if (!is_replicated()) - replicate_relax_locks(); - - // return the thinger - int nonce = add_replica( rep ); - return new CInodeDiscover( this, nonce ); -} - - - - -// IMPORT/EXPORT - -void CInode::encode_export(bufferlist& bl) -{ - ::_encode_simple(inode, bl); - ::_encode_simple(symlink, bl); - dirfragtree._encode(bl); - - bool dirty = is_dirty(); - ::_encode_simple(dirty, bl); - - ::_encode_simple(pop, bl); - - ::_encode_simple(replica_map, bl); - - authlock._encode(bl); - linklock._encode(bl); - dirfragtreelock._encode(bl); - filelock._encode(bl); - dirlock._encode(bl); - - get(PIN_TEMPEXPORTING); -} - -void CInode::finish_export(utime_t now) -{ - pop.zero(now); - - // just in case! - dirlock.clear_updated(); - - put(PIN_TEMPEXPORTING); -} - -void CInode::decode_import(bufferlist::iterator& p, - LogSegment *ls) -{ - utime_t old_mtime = inode.mtime; - ::_decode_simple(inode, p); - if (old_mtime > inode.mtime) { - assert(dirlock.is_updated()); - inode.mtime = old_mtime; // preserve our mtime, if it is larger - } - - ::_decode_simple(symlink, p); - dirfragtree._decode(p); - - bool dirty; - ::_decode_simple(dirty, p); - if (dirty) - _mark_dirty(ls); - - ::_decode_simple(pop, p); - - ::_decode_simple(replica_map, p); - if (!replica_map.empty()) get(PIN_REPLICATED); - - authlock._decode(p); - linklock._decode(p); - dirfragtreelock._decode(p); - filelock._decode(p); - dirlock._decode(p); -} - - - diff --git a/trunk/ceph/mds/CInode.h b/trunk/ceph/mds/CInode.h deleted file mode 100644 index b6da550a8d7b4..0000000000000 --- a/trunk/ceph/mds/CInode.h +++ /dev/null @@ -1,615 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. 
See file COPYING. - * - */ - - - -#ifndef __CINODE_H -#define __CINODE_H - -#include "config.h" -#include "include/types.h" -#include "include/lru.h" - -#include "mdstypes.h" - -#include "CDentry.h" -#include "SimpleLock.h" -#include "FileLock.h" -#include "ScatterLock.h" -#include "LocalLock.h" -#include "Capability.h" - - -#include -#include -#include -#include -#include -#include -using namespace std; - -class Context; -class CDentry; -class CDir; -class Message; -class CInode; -class CInodeDiscover; -class MDCache; -class LogSegment; - -ostream& operator<<(ostream& out, CInode& in); - - -// cached inode wrapper -class CInode : public MDSCacheObject { - public: - // -- pins -- - static const int PIN_DIRFRAG = -1; - static const int PIN_CAPS = 2; // client caps - static const int PIN_IMPORTING = -4; // importing - static const int PIN_ANCHORING = 5; - static const int PIN_UNANCHORING = 6; - static const int PIN_OPENINGDIR = 7; - static const int PIN_REMOTEPARENT = 8; - static const int PIN_BATCHOPENJOURNAL = 9; - static const int PIN_SCATTERED = 10; - static const int PIN_STICKYDIRS = 11; - static const int PIN_PURGING = -12; - static const int PIN_FREEZING = 13; - static const int PIN_FROZEN = 14; - static const int PIN_IMPORTINGCAPS = 15; - - const char *pin_name(int p) { - switch (p) { - case PIN_DIRFRAG: return "dirfrag"; - case PIN_CAPS: return "caps"; - case PIN_IMPORTING: return "importing"; - case PIN_ANCHORING: return "anchoring"; - case PIN_UNANCHORING: return "unanchoring"; - case PIN_OPENINGDIR: return "openingdir"; - case PIN_REMOTEPARENT: return "remoteparent"; - case PIN_BATCHOPENJOURNAL: return "batchopenjournal"; - case PIN_SCATTERED: return "scattered"; - case PIN_STICKYDIRS: return "stickydirs"; - case PIN_PURGING: return "purging"; - case PIN_FREEZING: return "freezing"; - case PIN_FROZEN: return "frozen"; - case PIN_IMPORTINGCAPS: return "importingcaps"; - default: return generic_pin_name(p); - } - } - - // -- state -- - static const int 
STATE_EXPORTING = (1<<2); // on nonauth bystander. - static const int STATE_ANCHORING = (1<<3); - static const int STATE_UNANCHORING = (1<<4); - static const int STATE_OPENINGDIR = (1<<5); - static const int STATE_REJOINUNDEF = (1<<6); // inode contents undefined. - static const int STATE_FREEZING = (1<<7); - static const int STATE_FROZEN = (1<<8); - static const int STATE_AMBIGUOUSAUTH = (1<<9); - static const int STATE_EXPORTINGCAPS = (1<<10); - - // -- waiters -- - //static const int WAIT_SLAVEAGREE = (1<<0); - static const int WAIT_DIR = (1<<1); - static const int WAIT_ANCHORED = (1<<2); - static const int WAIT_UNANCHORED = (1<<3); - static const int WAIT_CAPS = (1<<4); - static const int WAIT_FROZEN = (1<<5); - - static const int WAIT_AUTHLOCK_OFFSET = 5; - static const int WAIT_LINKLOCK_OFFSET = 5 + SimpleLock::WAIT_BITS; - static const int WAIT_DIRFRAGTREELOCK_OFFSET = 5 + 2*SimpleLock::WAIT_BITS; - static const int WAIT_FILELOCK_OFFSET = 5 + 3*SimpleLock::WAIT_BITS; - static const int WAIT_DIRLOCK_OFFSET = 5 + 4*SimpleLock::WAIT_BITS; - static const int WAIT_VERSIONLOCK_OFFSET = 5 + 5*SimpleLock::WAIT_BITS; - - static const int WAIT_ANY = 0xffffffff; - - // misc - static const int EXPORT_NONCE = 1; // nonce given to replicas created by export - - ostream& print_db_line_prefix(ostream& out); - - public: - MDCache *mdcache; - - // inode contents proper - inode_t inode; // the inode itself - string symlink; // symlink dest, if symlink - fragtree_t dirfragtree; // dir frag tree, if any. always consistent with our dirfrag map. 
- map dirfrag_size; // size of each dirfrag - - off_t last_journaled; // log offset for the last time i was journaled - off_t last_open_journaled; // log offset for the last journaled EOpen - - //bool hack_accessed; - //utime_t hack_load_stamp; - - // projected values (only defined while dirty) - list projected_inode; - list projected_dirfragtree; - - version_t get_projected_version() { - if (projected_inode.empty()) - return inode.version; - else - return projected_inode.back()->version; - } - - inode_t *project_inode(); - void pop_and_dirty_projected_inode(LogSegment *ls); - - // -- cache infrastructure -- -private: - map dirfrags; // cached dir fragments - int stickydir_ref; - -public: - frag_t pick_dirfrag(const string &dn); - bool has_dirfrags() { return !dirfrags.empty(); } - CDir* get_dirfrag(frag_t fg) { - if (dirfrags.count(fg)) { - assert(g_conf.debug_mds < 2 || dirfragtree.is_leaf(fg)); // performance hack FIXME - return dirfrags[fg]; - } else - return 0; - } - void get_dirfrags_under(frag_t fg, list& ls); - CDir* get_approx_dirfrag(frag_t fg); - void get_dirfrags(list& ls); - void get_nested_dirfrags(list& ls); - void get_subtree_dirfrags(list& ls); - CDir *get_or_open_dirfrag(MDCache *mdcache, frag_t fg); - CDir *add_dirfrag(CDir *dir); - void close_dirfrag(frag_t fg); - void close_dirfrags(); - bool has_subtree_root_dirfrag(); - - void get_stickydirs(); - void put_stickydirs(); - - protected: - // parent dentries in cache - CDentry *parent; // primary link - set remote_parents; // if hard linked - - pair inode_auth; - - // -- distributed state -- -protected: - // file capabilities - map client_caps; // client -> caps - map mds_caps_wanted; // [auth] mds -> caps wanted - int replica_caps_wanted; // [replica] what i've requested from auth - utime_t replica_caps_wanted_keep_until; - - - // LogSegment xlists i (may) belong to - xlist::item xlist_dirty; -public: - xlist::item xlist_open_file; - xlist::item xlist_dirty_inode_mtime; - xlist::item 
xlist_purging_inode; - -private: - // auth pin - int auth_pins; - int nested_auth_pins; -public: - int auth_pin_freeze_allowance; - - public: - inode_load_vec_t pop; - - // friends - friend class Server; - friend class Locker; - friend class Migrator; - friend class MDCache; - friend class CDir; - friend class CInodeExport; - friend class CInodeDiscover; - - public: - // --------------------------- - CInode(MDCache *c, bool auth=true) : - mdcache(c), - last_journaled(0), last_open_journaled(0), - //hack_accessed(true), - stickydir_ref(0), - parent(0), inode_auth(CDIR_AUTH_DEFAULT), - replica_caps_wanted(0), - xlist_dirty(this), xlist_open_file(this), - xlist_dirty_inode_mtime(this), xlist_purging_inode(this), - auth_pins(0), nested_auth_pins(0), - versionlock(this, LOCK_OTYPE_IVERSION, WAIT_VERSIONLOCK_OFFSET), - authlock(this, LOCK_OTYPE_IAUTH, WAIT_AUTHLOCK_OFFSET), - linklock(this, LOCK_OTYPE_ILINK, WAIT_LINKLOCK_OFFSET), - dirfragtreelock(this, LOCK_OTYPE_IDIRFRAGTREE, WAIT_DIRFRAGTREELOCK_OFFSET), - filelock(this, LOCK_OTYPE_IFILE, WAIT_FILELOCK_OFFSET), - dirlock(this, LOCK_OTYPE_IDIR, WAIT_DIRLOCK_OFFSET) - { - state = 0; - if (auth) state_set(STATE_AUTH); - }; - ~CInode() { - close_dirfrags(); - } - - - // -- accessors -- - bool is_file() { return inode.is_file(); } - bool is_symlink() { return inode.is_symlink(); } - bool is_dir() { return inode.is_dir(); } - - bool is_anchored() { return inode.anchored; } - bool is_anchoring() { return state_test(STATE_ANCHORING); } - bool is_unanchoring() { return state_test(STATE_UNANCHORING); } - - bool is_root() { return inode.ino == MDS_INO_ROOT; } - bool is_stray() { return MDS_INO_IS_STRAY(inode.ino); } - bool is_base() { return inode.ino < MDS_INO_BASE; } - - // note: this overloads MDSCacheObject - bool is_ambiguous_auth() { - return state_test(STATE_AMBIGUOUSAUTH) || - MDSCacheObject::is_ambiguous_auth(); - } - - - inodeno_t ino() const { return inode.ino; } - inode_t& get_inode() { return inode; } - CDentry* 
get_parent_dn() { return parent; } - CDir *get_parent_dir(); - CInode *get_parent_inode(); - - bool is_lt(const MDSCacheObject *r) const { - return ino() < ((CInode*)r)->ino(); - } - - // -- misc -- - void make_path_string(string& s); - void make_path(filepath& s); - void make_anchor_trace(vector& trace); - void name_stray_dentry(string& dname); - - - - // -- dirtyness -- - version_t get_version() { return inode.version; } - - version_t pre_dirty(); - void _mark_dirty(LogSegment *ls); - void mark_dirty(version_t projected_dirv, LogSegment *ls); - void mark_clean(); - - - CInodeDiscover* replicate_to(int rep); - - - // -- waiting -- - void add_waiter(int tag, Context *c); - - - // -- import/export -- - void encode_export(bufferlist& bl); - void finish_export(utime_t now); - void abort_export() { - put(PIN_TEMPEXPORTING); - } - void decode_import(bufferlist::iterator& p, LogSegment *ls); - - - // -- locks -- -public: - LocalLock versionlock; - SimpleLock authlock; - SimpleLock linklock; - ScatterLock dirfragtreelock; - FileLock filelock; - ScatterLock dirlock; - - - SimpleLock* get_lock(int type) { - switch (type) { - case LOCK_OTYPE_IFILE: return &filelock; - case LOCK_OTYPE_IAUTH: return &authlock; - case LOCK_OTYPE_ILINK: return &linklock; - case LOCK_OTYPE_IDIRFRAGTREE: return &dirfragtreelock; - case LOCK_OTYPE_IDIR: return &dirlock; - default: assert(0); return 0; - } - } - void set_object_info(MDSCacheObjectInfo &info); - void encode_lock_state(int type, bufferlist& bl); - void decode_lock_state(int type, bufferlist& bl); - - void clear_dirty_scattered(int type); - - // -- caps -- (new) - // client caps - bool is_any_caps() { return !client_caps.empty(); } - map& get_client_caps() { return client_caps; } - void add_client_cap(int client, Capability& cap) { - if (client_caps.empty()) - get(PIN_CAPS); - assert(client_caps.count(client) == 0); - client_caps[client] = cap; - } - void remove_client_cap(int client) { - assert(client_caps.count(client) == 1); - 
client_caps.erase(client); - if (client_caps.empty()) - put(PIN_CAPS); - } - Capability* get_client_cap(int client) { - if (client_caps.count(client)) - return &client_caps[client]; - return 0; - } - void reconnect_cap(int client, inode_caps_reconnect_t& icr) { - Capability *cap = get_client_cap(client); - if (cap) { - cap->merge(icr.wanted, icr.issued); - } else { - Capability newcap(icr.wanted, 0); - newcap.issue(icr.issued); - add_client_cap(client, newcap); - } - inode.size = MAX(inode.size, icr.size); - inode.mtime = MAX(inode.mtime, icr.mtime); - inode.atime = MAX(inode.atime, icr.atime); - } - /* - void set_client_caps(map& cl) { - if (client_caps.empty() && !cl.empty()) - get(PIN_CAPS); - client_caps.clear(); - client_caps = cl; - } - */ - void clear_client_caps() { - if (!client_caps.empty()) - put(PIN_CAPS); - client_caps.clear(); - } - void export_client_caps(map& cl) { - for (map::iterator it = client_caps.begin(); - it != client_caps.end(); - it++) { - cl[it->first] = it->second.make_export(); - } - } - void merge_client_caps(map& cl, set& new_client_caps) { - if (client_caps.empty() && !cl.empty()) - get(PIN_CAPS); - - for (map::iterator it = cl.begin(); - it != cl.end(); - it++) { - new_client_caps.insert(it->first); - if (client_caps.count(it->first)) { - // merge - client_caps[it->first].merge(it->second); - } else { - // new - client_caps[it->first] = Capability(it->second); - } - } - } - - // caps issued, wanted - int get_caps_issued() { - int c = 0; - for (map::iterator it = client_caps.begin(); - it != client_caps.end(); - it++) - c |= it->second.issued(); - return c; - } - int get_caps_wanted() { - int w = 0; - for (map::iterator it = client_caps.begin(); - it != client_caps.end(); - it++) { - w |= it->second.wanted(); - //cout << " get_caps_wanted client " << it->first << " " << cap_string(it->second.wanted()) << endl; - } - if (is_auth()) - for (map::iterator it = mds_caps_wanted.begin(); - it != mds_caps_wanted.end(); - it++) { - w |= 
it->second; - //cout << " get_caps_wanted mds " << it->first << " " << cap_string(it->second) << endl; - } - return w; - } - - - void replicate_relax_locks() { - //dout(10) << " relaxing locks on " << *this << dendl; - assert(is_auth()); - assert(!is_replicated()); - - authlock.replicate_relax(); - linklock.replicate_relax(); - dirfragtreelock.replicate_relax(); - - if (get_caps_issued() & (CAP_FILE_WR|CAP_FILE_WRBUFFER) == 0) - filelock.replicate_relax(); - - dirlock.replicate_relax(); - } - - - // -- authority -- - pair authority(); - - - // -- auth pins -- - int is_auth_pinned() { - return auth_pins; - } - void adjust_nested_auth_pins(int a); - bool can_auth_pin(); - void auth_pin(); - void auth_unpin(); - - - // -- freeze -- - bool is_freezing_inode() { return state_test(STATE_FREEZING); } - bool is_frozen_inode() { return state_test(STATE_FROZEN); } - bool is_frozen(); - bool is_frozen_dir(); - bool is_freezing(); - - bool freeze_inode(int auth_pin_allowance=0); - void unfreeze_inode(list& finished); - - - // -- reference counting -- - void bad_put(int by) { - generic_dout(7) << " bad put " << *this << " by " << by << " " << pin_name(by) << " was " << ref << " (" << ref_set << ")" << dendl; -#ifdef MDS_REF_SET - assert(ref_set.count(by) == 1); -#endif - assert(ref > 0); - } - void bad_get(int by) { - generic_dout(7) << " bad get " << *this << " by " << by << " " << pin_name(by) << " was " << ref << " (" << ref_set << ")" << dendl; -#ifdef MDS_REF_SET - assert(ref_set.count(by) == 0); -#endif - } - void first_get(); - void last_put(); - - - // -- hierarchy stuff -- -public: - void set_primary_parent(CDentry *p) { - assert(parent == 0); - parent = p; - } - void remove_primary_parent(CDentry *dn) { - assert(dn == parent); - parent = 0; - } - void add_remote_parent(CDentry *p); - void remove_remote_parent(CDentry *p); - int num_remote_parents() { - return remote_parents.size(); - } - - - /* - // for giving to clients - void get_dist_spec(set& ls, int auth, 
timepair_t& now) { - if (( is_dir() && popularity[MDS_POP_CURDOM].get(now) > g_conf.mds_bal_replicate_threshold) || - (!is_dir() && popularity[MDS_POP_JUSTME].get(now) > g_conf.mds_bal_replicate_threshold)) { - //if (!cached_by.empty() && inode.ino > 1) dout(1) << "distributed spec for " << *this << dendl; - ls = cached_by; - } - } - */ - - void print(ostream& out); - -}; - - - - -// -- encoded state - -// discover - -class CInodeDiscover { - - inode_t inode; - string symlink; - fragtree_t dirfragtree; - - int replica_nonce; - - int authlock_state; - int linklock_state; - int dirfragtreelock_state; - int filelock_state; - int dirlock_state; - - public: - CInodeDiscover() {} - CInodeDiscover(CInode *in, int nonce) { - inode = in->inode; - symlink = in->symlink; - dirfragtree = in->dirfragtree; - - replica_nonce = nonce; - - authlock_state = in->authlock.get_replica_state(); - linklock_state = in->linklock.get_replica_state(); - dirfragtreelock_state = in->dirfragtreelock.get_replica_state(); - filelock_state = in->filelock.get_replica_state(); - dirlock_state = in->dirlock.get_replica_state(); - } - - inodeno_t get_ino() { return inode.ino; } - int get_replica_nonce() { return replica_nonce; } - - void update_inode(CInode *in) { - in->inode = inode; - in->symlink = symlink; - in->dirfragtree = dirfragtree; - in->replica_nonce = replica_nonce; - } - void init_inode_locks(CInode *in) { - in->authlock.set_state(authlock_state); - in->linklock.set_state(linklock_state); - in->dirfragtreelock.set_state(dirfragtreelock_state); - in->filelock.set_state(filelock_state); - in->dirlock.set_state(dirlock_state); - } - - void _encode(bufferlist& bl) { - ::_encode(inode, bl); - ::_encode(symlink, bl); - dirfragtree._encode(bl); - ::_encode(replica_nonce, bl); - ::_encode(authlock_state, bl); - ::_encode(linklock_state, bl); - ::_encode(dirfragtreelock_state, bl); - ::_encode(filelock_state, bl); - ::_encode(dirlock_state, bl); - } - - void _decode(bufferlist& bl, int& off) { - 
// definite caps
#define CAP_FILE_RDCACHE   1    // client can safely cache reads
#define CAP_FILE_RD        2    // client can read
#define CAP_FILE_WR        4    // client can write
#define CAP_FILE_WREXTEND  8    // client can extend file
#define CAP_FILE_WRBUFFER  16   // client can safely buffer writes
#define CAP_FILE_LAZYIO    32   // client can perform lazy io


// heuristics
//#define CAP_FILE_DELAYFLUSH 32

// Render a CAP_FILE_* bitmask as text, e.g. "[ rd wr ]".
// Returns "[ ]" when no known bit is set.  Names are emitted in the fixed
// order rdcache, rd, wr, wrbuffer, wrextend, lazyio.
inline string cap_string(int cap)
{
  string s;
  s = "[";
  if (cap & CAP_FILE_RDCACHE) s += " rdcache";
  if (cap & CAP_FILE_RD) s += " rd";
  if (cap & CAP_FILE_WR) s += " wr";
  if (cap & CAP_FILE_WRBUFFER) s += " wrbuffer";
  // was testing CAP_FILE_WRBUFFER a second time, so WREXTEND was never shown
  if (cap & CAP_FILE_WREXTEND) s += " wrextend";
  if (cap & CAP_FILE_LAZYIO) s += " lazyio";
  s += " ]";
  return s;
}
Export(int w, int i, int p) : wanted(w), issued(i), pending(p) {} - }; - -private: - int wanted_caps; // what the client wants (ideally) - - map cap_history; // seq -> cap - capseq_t last_sent, last_recv; - - bool suppress; - -public: - Capability(int want=0, capseq_t s=0) : - wanted_caps(want), - last_sent(s), - last_recv(s), - suppress(false) { - } - Capability(Export& other) : - wanted_caps(other.wanted), - last_sent(0), last_recv(0) { - // issued vs pending - if (other.issued & ~other.pending) - issue(other.issued); - issue(other.pending); - } - - bool is_suppress() { return suppress; } - void set_suppress(bool b) { suppress = b; } - - bool is_null() { return cap_history.empty() && wanted_caps == 0; } - - // most recently issued caps. - int pending() { - if (cap_history.count(last_sent)) - return cap_history[ last_sent ]; - return 0; - } - - // caps client has confirmed receipt of - int confirmed() { - if (cap_history.count(last_recv)) - return cap_history[ last_recv ]; - return 0; - } - - // caps potentially issued - int issued() { - int c = 0; - for (capseq_t seq = last_recv; seq <= last_sent; seq++) { - if (cap_history.count(seq)) { - c |= cap_history[seq]; - generic_dout(10) << " cap issued: seq " << seq << " " << cap_string(cap_history[seq]) << " -> " << cap_string(c) << dendl; - } - } - return c; - } - - // caps this client wants to hold - int wanted() { return wanted_caps; } - void set_wanted(int w) { - wanted_caps = w; - } - - // needed - static int needed(int from) { - // strip out wrbuffer, rdcache - return from & (CAP_FILE_WR|CAP_FILE_RD); - } - int needed() { return needed(wanted_caps); } - - // conflicts - static int conflicts(int from) { - int c = 0; - if (from & CAP_FILE_WRBUFFER) c |= CAP_FILE_RDCACHE|CAP_FILE_RD; - if (from & CAP_FILE_WR) c |= CAP_FILE_RDCACHE; - if (from & CAP_FILE_RD) c |= CAP_FILE_WRBUFFER; - if (from & CAP_FILE_RDCACHE) c |= CAP_FILE_WRBUFFER|CAP_FILE_WR; - return c; - } - int wanted_conflicts() { return 
// confirm receipt of a previous sent/issued seq.
//
//   seq  - sequence number the client is acknowledging
//   caps - caps the client reports holding for that seq
//
// Returns the OR of every cap value forgotten for seqs below `seq`, plus any
// bits recorded at `seq` that are absent from `caps`.
int confirm_receipt(capseq_t seq, int caps) {
  int r = 0;

  // old seqs
  // walk last_recv forward to seq, dropping each history entry on the way
  while (last_recv < seq) {
    generic_dout(10) << " cap.confirm_receipt forgetting seq " << last_recv << " " << cap_string(cap_history[last_recv]) << dendl;
    r |= cap_history[last_recv];
    cap_history.erase(last_recv);
    ++last_recv;
  }

  // release current?
  if (cap_history.count(seq) &&
      cap_history[seq] != caps) {
    generic_dout(10) << " cap.confirm_receipt revising seq " << seq << " " << cap_string(cap_history[seq]) << " -> " << cap_string(caps) << dendl;
    // note what we're releasing..
    // the recorded value must contain at least one bit the client no longer reports
    assert(cap_history[seq] & ~caps);
    r |= cap_history[seq] & ~caps;

    cap_history[seq] = caps;  // confirmed() now less than before..
  }

  // null?
  // a lone entry at seq with no caps left means nothing is held: empty the history
  if (caps == 0 &&
      cap_history.size() == 1 &&
      cap_history.count(seq)) {
    cap_history.clear();  // voila, null!
  }

  return r;
}
- * - */ - - - -#define DBLEVEL 20 - -#include "include/types.h" - -#include "MDS.h" -#include "ClientMap.h" - -#include "osdc/Filer.h" - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".clientmap " - - - -void ClientMap::init_inode() -{ - memset(&inode, 0, sizeof(inode)); - inode.ino = MDS_INO_CLIENTMAP_OFFSET + mds->get_nodeid(); - inode.layout = g_OSD_FileLayout; -} - - -// ---------------- -// LOAD - -class C_CM_Load : public Context { - ClientMap *clientmap; -public: - bufferlist bl; - C_CM_Load(ClientMap *cm) : clientmap(cm) {} - void finish(int r) { - clientmap->_load_finish(bl); - } -}; - -void ClientMap::load(Context *onload) -{ - dout(10) << "load" << dendl; - - init_inode(); - - if (onload) - waiting_for_load.push_back(onload); - - C_CM_Load *c = new C_CM_Load(this); - mds->filer->read(inode, - 0, inode.layout.fl_stripe_unit, - &c->bl, - c); - -} - -void ClientMap::_load_finish(bufferlist &bl) -{ - int off = 0; - decode(bl, off); - dout(10) << "_load_finish v " << version - << ", " << client_inst.size() << " clients, " - << bl.length() << " bytes" - << dendl; - projected = committing = committed = version; - finish_contexts(waiting_for_load); -} - - -// ---------------- -// SAVE - -class C_CM_Save : public Context { - ClientMap *clientmap; - version_t version; -public: - C_CM_Save(ClientMap *cm, version_t v) : clientmap(cm), version(v) {} - void finish(int r) { - clientmap->_save_finish(version); - } -}; - -void ClientMap::save(Context *onsave, version_t needv) -{ - dout(10) << "save needv " << needv << ", v " << version << dendl; - - if (needv && committing >= needv) { - assert(committing > committed); - commit_waiters[committing].push_back(onsave); - return; - } - - commit_waiters[version].push_back(onsave); - - bufferlist bl; - - init_inode(); - encode(bl); - committing = version; - mds->filer->write(inode, - 0, bl.length(), bl, - 0, - 0, new C_CM_Save(this, 
version)); -} - -void ClientMap::_save_finish(version_t v) -{ - dout(10) << "_save_finish v" << v << dendl; - committed = v; - - finish_contexts(commit_waiters[v]); - commit_waiters.erase(v); -} diff --git a/trunk/ceph/mds/ClientMap.h b/trunk/ceph/mds/ClientMap.h deleted file mode 100644 index 59b3dde49b6be..0000000000000 --- a/trunk/ceph/mds/ClientMap.h +++ /dev/null @@ -1,205 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __CLIENTMAP_H -#define __CLIENTMAP_H - -#include "msg/Message.h" - -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "include/Context.h" -#include "mdstypes.h" - -class MDS; - -/* - * this structure is used by the MDS purely so that - * it can remember client addresses (entity_inst_t) - * for clients with an active session. - * - * it is also used to keep track of recently completed - * operations, should the client have to resubmit them - * (after a connection failure, etc.) 
// Record a newly opened client session.
// The client (keyed by inst.name.num()) is dropped from the `opening` set,
// its address is stored in client_inst, it is added to `sessions`, and the
// map version is bumped.
void open_session(const entity_inst_t& inst) {
  opening.erase(inst.name.num());
  client_inst[inst.name.num()] = inst;
  sessions.insert(inst.name.num());
  version++;
}
// Forget completed request tids for `who` that are below `mintid`, then
// finish any trim waiters registered for those tids.
//
//   who    - client/entity whose completed-request set is trimmed
//   mintid - tids strictly below this are dropped; 0 drops everything
//
// Does nothing if `who` has no completed requests recorded.
// NOTE(review): the container template arguments on the declarations below
// look stripped in this copy of the text - confirm against the real header.
void trim_completed_requests(entity_name_t who, tid_t mintid) {  // zero means trim all!
  map >::iterator p = completed_requests.find(who);
  if (p == completed_requests.end())
    return;

  // trim
  // the loop removes from begin() while the first tid is below mintid
  while (!p->second.empty() && (mintid == 0 || *p->second.begin() < mintid))
    p->second.erase(p->second.begin());
  if (p->second.empty())
    completed_requests.erase(p);

  // kick waiters
  // collect first, finish last: waiters are run only after the map is updated
  map >::iterator q = waiting_for_trim.find(who);
  if (q != waiting_for_trim.end()) {
    list fls;
    while (!q->second.empty() &&
           (mintid == 0 || q->second.begin()->first < mintid)) {
      fls.push_back(q->second.begin()->second);
      q->second.erase(q->second.begin());
    }
    if (q->second.empty())
      waiting_for_trim.erase(q);
    finish_contexts(fls);
  }
}
list waiting_for_load; - - void init_inode(); - void load(Context *onload); - void _load_finish(bufferlist &bl); - void save(Context *onsave, version_t needv=0); - void _save_finish(version_t v); -}; - -#endif diff --git a/trunk/ceph/mds/FileLock.h b/trunk/ceph/mds/FileLock.h deleted file mode 100644 index 09868f7563fb6..0000000000000 --- a/trunk/ceph/mds/FileLock.h +++ /dev/null @@ -1,227 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __FILELOCK_H -#define __FILELOCK_H - -#include -#include -using namespace std; - -#include "include/buffer.h" - -#include "SimpleLock.h" -#include "Capability.h" - -// states and such. -// C = cache reads, R = read, W = write, A = append, B = buffer writes, L = lazyio - -// -----auth-------- ---replica------- -#define LOCK_SYNC_ 1 // AR R . / C R . . . L R . / C R . . . L stat() -#define LOCK_GSYNCL -12 // A . . / C ? . . . L loner -> sync (*) -#define LOCK_GSYNCM -13 // A . . / . R . . . L - -#define LOCK_LOCK_ 2 // AR R W / C . . . . . . . / C . . . . . truncate() -#define LOCK_GLOCKR_ -3 // AR R . / C . . . . . . . / C . . . . . -#define LOCK_GLOCKL -4 // A . . / C . . . . . loner -> lock -#define LOCK_GLOCKM -5 // A . . / . . . . . . - -#define LOCK_MIXED 6 // AR . . / . R W A . L . . / . R . . . L -#define LOCK_GMIXEDR -7 // AR R . / . R . . . L . . / . R . . . L -#define LOCK_GMIXEDL -8 // A . . / . . . . . L loner -> mixed - -#define LOCK_LONER 9 // A . . / C R W A B L (lock) -#define LOCK_GLONERR -10 // A . . / . R . . . L -#define LOCK_GLONERM -11 // A . . / . R W A . 
L - -// (*) FIXME: how to let old loner keep R, somehow, during GSYNCL - -// 4 stable -// +9 transition -// 13 total - -inline const char *get_filelock_state_name(int n) { - switch (n) { - case LOCK_SYNC: return "sync"; - case LOCK_GSYNCL: return "gsyncl"; - case LOCK_GSYNCM: return "gsyncm"; - case LOCK_LOCK: return "lock"; - case LOCK_GLOCKR: return "glockr"; - case LOCK_GLOCKL: return "glockl"; - case LOCK_GLOCKM: return "glockm"; - case LOCK_MIXED: return "mixed"; - case LOCK_GMIXEDR: return "gmixedr"; - case LOCK_GMIXEDL: return "gmixedl"; - case LOCK_LONER: return "loner"; - case LOCK_GLONERR: return "glonerr"; - case LOCK_GLONERM: return "glonerm"; - default: assert(0); return 0; - } -} - - -/* no append scenarios: - -loner + truncate(): - - loner needs to lose A (?unless it's the loner doing the truncate?) -loner + statlite(size): - - loner needs to lose A - -any + statlite(size) - - all lose A - -any + statlite(mtime) - - all lose W - --> we need to add lonerfixed and mixedfixed states (and associated transitions) - in order to efficiently support statlite(size) and truncate(). until then, - we have to LOCK. - - */ - -// -- lock... hard or file - -class MDRequest; - -class FileLock : public SimpleLock { - public: - FileLock(MDSCacheObject *o, int t, int wo) : SimpleLock(o, t, wo) { } - - int get_replica_state() { - switch (state) { - case LOCK_LOCK: - case LOCK_GLOCKM: - case LOCK_GLOCKL: - case LOCK_GLOCKR: - case LOCK_LONER: - case LOCK_GLONERR: - case LOCK_GLONERM: - return LOCK_LOCK; - case LOCK_MIXED: - case LOCK_GMIXEDR: - return LOCK_MIXED; - case LOCK_SYNC: - return LOCK_SYNC; - - // after gather auth will bc LOCK_AC_MIXED or whatever - case LOCK_GSYNCM: - return LOCK_MIXED; - case LOCK_GSYNCL: - case LOCK_GMIXEDL: // ** LOCK isn't exact right state, but works. 
- return LOCK_LOCK; - - default: - assert(0); - } - return 0; - } - void export_twiddle() { - clear_gather(); - state = get_replica_state(); - } - - // read/write access - bool can_rdlock(MDRequest *mdr) { - if (!parent->is_auth()) return (state == LOCK_SYNC); - //if (state == LOCK_LOCK && mdr && xlock_by == mdr) return true; - if (state == LOCK_LOCK && !xlock_by) return true; - return - (state == LOCK_SYNC) || - (state == LOCK_GMIXEDR) || - (state == LOCK_GLOCKR); - } - bool can_rdlock_soon() { - if (parent->is_auth()) - return (state == LOCK_GLOCKL); - else - return false; - } - bool can_xlock_soon() { - if (parent->is_auth()) - return (state == LOCK_GLOCKR) || (state == LOCK_GLOCKL) - || (state == LOCK_GLOCKM); - else - return false; - } - - // client caps allowed - int caps_allowed_ever() { - if (parent->is_auth()) - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_WRBUFFER | CAP_FILE_LAZYIO; - else - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO; - } - int caps_allowed() { - if (parent->is_auth()) - switch (state) { - case LOCK_SYNC: - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO; - case LOCK_LOCK: - case LOCK_GLOCKR: - case LOCK_GLOCKL: - return CAP_FILE_RDCACHE; - - case LOCK_GLOCKM: - return 0; - - case LOCK_MIXED: - return CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_LAZYIO; - case LOCK_GMIXEDR: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - case LOCK_GMIXEDL: - return 0; - - case LOCK_LONER: // single client writer, of course. 
- return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_WRBUFFER | CAP_FILE_LAZYIO; - case LOCK_GLONERR: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - case LOCK_GLONERM: - return CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_LAZYIO; - - case LOCK_GSYNCL: - return CAP_FILE_RDCACHE | CAP_FILE_LAZYIO; - case LOCK_GSYNCM: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - } - else - switch (state) { - case LOCK_SYNC: - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO; - case LOCK_LOCK: - case LOCK_GLOCKR: - return CAP_FILE_RDCACHE; - case LOCK_GMIXEDR: - case LOCK_MIXED: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - } - assert(0); - return 0; - } - - void print(ostream& out) { - out << "("; - out << get_lock_type_name(get_type()) << " "; - out << get_filelock_state_name(get_state()); - if (!get_gather_set().empty()) out << " g=" << get_gather_set(); - if (is_rdlocked()) - out << " r=" << get_num_rdlocks(); - if (is_xlocked()) - out << " x=" << get_xlocked_by(); - out << ")"; - } -}; - - -#endif diff --git a/trunk/ceph/mds/IdAllocator.cc b/trunk/ceph/mds/IdAllocator.cc deleted file mode 100644 index 36a36ea9eb037..0000000000000 --- a/trunk/ceph/mds/IdAllocator.cc +++ /dev/null @@ -1,205 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#define DBLEVEL 20 - -#include "IdAllocator.h" -#include "MDS.h" -#include "MDLog.h" - -#include "osdc/Filer.h" - -#include "include/types.h" - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug_mds) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".idalloc: " - - -void IdAllocator::init_inode() -{ - memset(&inode, 0, sizeof(inode)); - inode.ino = MDS_INO_IDS_OFFSET + mds->get_nodeid(); - inode.layout = g_OSD_FileLayout; -} - - -inodeno_t IdAllocator::alloc_id() -{ - assert(is_active()); - - // pick one - inodeno_t id = free.start(); - free.erase(id); - dout(10) << "idalloc " << this << ": alloc id " << id << dendl; - - version++; - - // log it - /* - if (!replay) - mds->mdlog->submit_entry(new EAlloc(IDTYPE_INO, id, EALLOC_EV_ALLOC, version)); - */ - - return id; -} - -void IdAllocator::reclaim_id(inodeno_t id) -{ - assert(is_active()); - - dout(10) << "idalloc " << this << ": reclaim id " << id << dendl; - free.insert(id); - - version++; - - /* - if (!replay) - mds->mdlog->submit_entry(new EAlloc(IDTYPE_INO, id, EALLOC_EV_FREE, version)); - */ -} - - - -class C_ID_Save : public Context { - IdAllocator *ida; - version_t version; -public: - C_ID_Save(IdAllocator *i, version_t v) : ida(i), version(v) {} - void finish(int r) { - ida->save_2(version); - } -}; - -void IdAllocator::save(Context *onfinish, version_t v) -{ - if (v > 0 && v <= committing_version) { - dout(10) << "save v " << version << " - already saving " - << committing_version << " >= needed " << v << dendl; - waitfor_save[v].push_back(onfinish); - return; - } - - dout(10) << "save v " << version << dendl; - assert(is_active()); - - bufferlist bl; - - bl.append((char*)&version, sizeof(version)); - ::_encode(free.m, bl); - - committing_version = version; - - if (onfinish) - waitfor_save[version].push_back(onfinish); - - // write (async) - mds->filer->write(inode, - 0, bl.length(), bl, - 0, - 0, new C_ID_Save(this, version)); -} - -void 
IdAllocator::save_2(version_t v) -{ - dout(10) << "save_2 v " << v << dendl; - - committed_version = v; - - list ls; - while (!waitfor_save.empty()) { - if (waitfor_save.begin()->first > v) break; - ls.splice(ls.end(), waitfor_save.begin()->second); - waitfor_save.erase(waitfor_save.begin()); - } - finish_contexts(ls,0); -} - - -void IdAllocator::reset() -{ - init_inode(); - - // use generic range. FIXME THIS IS CRAP - free.clear(); -#ifdef __LP64__ - uint64_t start = (uint64_t)(mds->get_nodeid()+1) << 40; - uint64_t end = ((uint64_t)(mds->get_nodeid()+2) << 40) - 1; -#else -# warning this looks like a 32-bit system, using small inode numbers. - uint64_t start = (uint64_t)(mds->get_nodeid()+1) << 25; - uint64_t end = ((uint64_t)(mds->get_nodeid()+2) << 25) - 1; -#endif - free.insert(start, end); - - state = STATE_ACTIVE; -} - - - -// ----------------------- - -class C_ID_Load : public Context { -public: - IdAllocator *ida; - Context *onfinish; - bufferlist bl; - C_ID_Load(IdAllocator *i, Context *o) : ida(i), onfinish(o) {} - void finish(int r) { - ida->load_2(r, bl, onfinish); - } -}; - -void IdAllocator::load(Context *onfinish) -{ - dout(10) << "load" << dendl; - - init_inode(); - - assert(is_undef()); - state = STATE_OPENING; - - C_ID_Load *c = new C_ID_Load(this, onfinish); - mds->filer->read(inode, - 0, inode.layout.fl_stripe_unit, - &c->bl, - c); -} - -void IdAllocator::load_2(int r, bufferlist& bl, Context *onfinish) -{ - assert(is_opening()); - state = STATE_ACTIVE; - - if (r > 0) { - dout(10) << "load_2 got " << bl.length() << " bytes" << dendl; - int off = 0; - bl.copy(off, sizeof(version), (char*)&version); - off += sizeof(version); - ::_decode(free.m, bl, off); - committed_version = version; - } - else { - dout(10) << "load_2 found no alloc file" << dendl; - assert(0); // this shouldn't happen if mkfs finished. 
- reset(); - } - - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } -} diff --git a/trunk/ceph/mds/IdAllocator.h b/trunk/ceph/mds/IdAllocator.h deleted file mode 100644 index 51001f2236627..0000000000000 --- a/trunk/ceph/mds/IdAllocator.h +++ /dev/null @@ -1,78 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __IDALLOCATOR_H -#define __IDALLOCATOR_H - -#include "include/types.h" -#include "include/interval_set.h" -#include "include/buffer.h" -#include "include/Context.h" - -class MDS; - -class IdAllocator { - MDS *mds; - inode_t inode; - - static const int STATE_UNDEF = 0; - static const int STATE_OPENING = 1; - static const int STATE_ACTIVE = 2; - //static const int STATE_COMMITTING = 3; - int state; - - version_t version, committing_version, committed_version; - - interval_set free; // unused ids - - map > waitfor_save; - - public: - IdAllocator(MDS *m) : - mds(m), - state(STATE_UNDEF), - version(0), committing_version(0), committed_version(0) - { - } - - void init_inode(); - - // alloc or reclaim ids - inodeno_t alloc_id(); - void reclaim_id(inodeno_t ino); - - version_t get_version() { return version; } - version_t get_committed_version() { return committed_version; } - version_t get_committing_version() { return committing_version; } - - // load/save from disk (hack) - bool is_undef() { return state == STATE_UNDEF; } - bool is_active() { return state == STATE_ACTIVE; } - bool is_opening() { return state == STATE_OPENING; } - - void reset(); - void save(Context *onfinish=0, version_t need=0); - void save_2(version_t v); - - void shutdown() { - if (is_active()) 
save(0); - } - - void load(Context *onfinish); - void load_2(int, bufferlist&, Context *onfinish); - -}; - -#endif diff --git a/trunk/ceph/mds/LocalLock.h b/trunk/ceph/mds/LocalLock.h deleted file mode 100644 index 752fdcb4d3fd1..0000000000000 --- a/trunk/ceph/mds/LocalLock.h +++ /dev/null @@ -1,61 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __LOCALLOCK_H -#define __LOCALLOCK_H - -#include "SimpleLock.h" - -class LocalLock : public SimpleLock { -protected: - int num_wrlock; - -public: - LocalLock(MDSCacheObject *o, int t, int wo) : - SimpleLock(o, t, wo), - num_wrlock(0) { - set_state(LOCK_LOCK); // always. 
- } - - bool can_wrlock() { - return !is_xlocked(); - } - void get_wrlock() { - assert(can_wrlock()); - if (num_wrlock == 0) parent->get(MDSCacheObject::PIN_LOCK); - ++num_wrlock; - } - void put_wrlock() { - --num_wrlock; - if (num_wrlock == 0) parent->put(MDSCacheObject::PIN_LOCK); - } - bool is_wrlocked() { return num_wrlock > 0; } - int get_num_wrlocks() { return num_wrlock; } - - - void print(ostream& out) { - out << "("; - out << get_lock_type_name(get_type()); - if (is_xlocked()) - out << " x=" << get_xlocked_by(); - if (is_wrlocked()) - out << " wr=" << get_num_wrlocks(); - out << ")"; - } - -}; - - -#endif diff --git a/trunk/ceph/mds/Locker.h b/trunk/ceph/mds/Locker.h deleted file mode 100644 index a69055f49449e..0000000000000 --- a/trunk/ceph/mds/Locker.h +++ /dev/null @@ -1,195 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_LOCKER_H -#define __MDS_LOCKER_H - -#include "include/types.h" - -#include -#include -#include -using std::map; -using std::list; -using std::set; - -class MDS; -class CDir; -class CInode; -class CDentry; - -class Message; - -class MDiscover; -class MDiscoverReply; -class MCacheExpire; -class MDirUpdate; -class MDentryUnlink; -class MLock; - -class MClientRequest; - -class Anchor; -class Capability; -class LogSegment; - -class SimpleLock; -class FileLock; -class ScatterLock; -class LocalLock; - -class Locker { -private: - MDS *mds; - MDCache *mdcache; - - public: - Locker(MDS *m, MDCache *c) : mds(m), mdcache(c) {} - - SimpleLock *get_lock(int lock_type, MDSCacheObjectInfo &info); - - void dispatch(Message *m); - void handle_lock(MLock *m); - - -protected: - void send_lock_message(SimpleLock *lock, int msg); - void send_lock_message(SimpleLock *lock, int msg, const bufferlist &data); - - // -- locks -- -public: - bool acquire_locks(MDRequest *mdr, - set &rdlocks, - set &wrlocks, - set &xlocks); - - void drop_locks(MDRequest *mdr); - -protected: - bool rdlock_start(SimpleLock *lock, MDRequest *mdr); - void rdlock_finish(SimpleLock *lock, MDRequest *mdr); - bool xlock_start(SimpleLock *lock, MDRequest *mdr); -public: - void xlock_finish(SimpleLock *lock, MDRequest *mdr); // public for Server's slave UNXLOCK -protected: - bool wrlock_start(SimpleLock *lock, MDRequest *mdr); - void wrlock_finish(SimpleLock *lock, MDRequest *mdr); - -public: - void rejoin_set_state(SimpleLock *lock, int s, list& waiters); - - // simple -public: - void try_simple_eval(SimpleLock *lock); - void simple_eval_gather(SimpleLock *lock); - bool simple_rdlock_try(SimpleLock *lock, Context *con); -protected: - void simple_eval(SimpleLock *lock); - void handle_simple_lock(SimpleLock *lock, MLock *m); - void simple_sync(SimpleLock *lock); - void simple_lock(SimpleLock *lock); - bool simple_rdlock_start(SimpleLock *lock, MDRequest *mdr); - void 
simple_rdlock_finish(SimpleLock *lock, MDRequest *mdr); - bool simple_xlock_start(SimpleLock *lock, MDRequest *mdr); - void simple_xlock_finish(SimpleLock *lock, MDRequest *mdr); - -public: - bool dentry_can_rdlock_trace(vector& trace); - void dentry_anon_rdlock_trace_start(vector& trace); - void dentry_anon_rdlock_trace_finish(vector& trace); - - // scatter -protected: - xlist autoscattered; - -public: - void try_scatter_eval(ScatterLock *lock); - void scatter_eval(ScatterLock *lock); // public for MDCache::adjust_subtree_auth() - void scatter_eval_gather(ScatterLock *lock); - - void scatter_unscatter_autoscattered(); - void scatter_try_unscatter(ScatterLock *lock, Context *c); - void note_autoscattered(ScatterLock *lock); - - void scatter_lock(ScatterLock *lock); // called by LogSegment::try_to_expire - -protected: - void handle_scatter_lock(ScatterLock *lock, MLock *m); - void _scatter_replica_lock(ScatterLock *lock, int auth); - void scatter_sync(ScatterLock *lock); - void scatter_scatter(ScatterLock *lock); - void scatter_tempsync(ScatterLock *lock); - bool scatter_rdlock_start(ScatterLock *lock, MDRequest *mdr); - void scatter_rdlock_finish(ScatterLock *lock, MDRequest *mdr); - bool scatter_wrlock_start(ScatterLock *lock, MDRequest *mdr); - void scatter_wrlock_finish(ScatterLock *lock, MDRequest *mdr); - - void scatter_writebehind(ScatterLock *lock); - class C_Locker_ScatterWB : public Context { - Locker *locker; - ScatterLock *lock; - LogSegment *ls; - public: - C_Locker_ScatterWB(Locker *l, ScatterLock *sl, LogSegment *s) : locker(l), lock(sl), ls(s) {} - void finish(int r) { - locker->scatter_writebehind_finish(lock, ls); - } - }; - void scatter_writebehind_finish(ScatterLock *lock, LogSegment *ls); - - // local -protected: - bool local_wrlock_start(LocalLock *lock, MDRequest *mdr); - void local_wrlock_finish(LocalLock *lock, MDRequest *mdr); - bool local_xlock_start(LocalLock *lock, MDRequest *mdr); - void local_xlock_finish(LocalLock *lock, MDRequest 
*mdr); - - - // file -public: - void file_eval_gather(FileLock *lock); - void try_file_eval(FileLock *lock); -protected: - void file_eval(FileLock *lock); - void handle_file_lock(FileLock *lock, MLock *m); - bool file_sync(FileLock *lock); - void file_lock(FileLock *lock); - void file_mixed(FileLock *lock); - void file_loner(FileLock *lock); - bool file_rdlock_try(FileLock *lock, Context *con); - bool file_rdlock_start(FileLock *lock, MDRequest *mdr); - void file_rdlock_finish(FileLock *lock, MDRequest *mdr); - bool file_xlock_start(FileLock *lock, MDRequest *mdr); - void file_xlock_finish(FileLock *lock, MDRequest *mdr); - - - - // -- file i/o -- - public: - version_t issue_file_data_version(CInode *in); - Capability* issue_new_caps(CInode *in, int mode, MClientRequest *req); - bool issue_caps(CInode *in); - - protected: - void handle_client_file_caps(class MClientFileCaps *m); - - void request_inode_file_caps(CInode *in); - void handle_inode_file_caps(class MInodeFileCaps *m); - - friend class C_MDL_RequestInodeFileCaps; - -}; - - -#endif diff --git a/trunk/ceph/mds/LogEvent.cc b/trunk/ceph/mds/LogEvent.cc deleted file mode 100644 index 65b0bb2ec1322..0000000000000 --- a/trunk/ceph/mds/LogEvent.cc +++ /dev/null @@ -1,87 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "LogEvent.h" - -#include "MDS.h" - -// events i know of -#include "events/EString.h" - -#include "events/ESubtreeMap.h" -#include "events/EExport.h" -#include "events/EImportStart.h" -#include "events/EImportFinish.h" -#include "events/EFragment.h" - -#include "events/ESession.h" -#include "events/ESessions.h" - -#include "events/EUpdate.h" -#include "events/ESlaveUpdate.h" -#include "events/EOpen.h" - -#include "events/EPurgeFinish.h" - -#include "events/EAnchor.h" -#include "events/EAnchorClient.h" - - - -LogEvent *LogEvent::decode(bufferlist& bl) -{ - // parse type, length - int off = 0; - int type; - bl.copy(off, sizeof(type), (char*)&type); - off += sizeof(type); - - int length = bl.length() - off; - generic_dout(15) << "decode_log_event type " << type << ", size " << length << dendl; - - assert(type > 0); - - // create event - LogEvent *le; - switch (type) { - case EVENT_STRING: le = new EString; break; - - case EVENT_SUBTREEMAP: le = new ESubtreeMap; break; - case EVENT_EXPORT: le = new EExport; break; - case EVENT_IMPORTSTART: le = new EImportStart; break; - case EVENT_IMPORTFINISH: le = new EImportFinish; break; - case EVENT_FRAGMENT: le = new EFragment; break; - - case EVENT_SESSION: le = new ESession; break; - case EVENT_SESSIONS: le = new ESessions; break; - - case EVENT_UPDATE: le = new EUpdate; break; - case EVENT_SLAVEUPDATE: le = new ESlaveUpdate; break; - case EVENT_OPEN: le = new EOpen; break; - - case EVENT_PURGEFINISH: le = new EPurgeFinish; break; - - case EVENT_ANCHOR: le = new EAnchor; break; - case EVENT_ANCHORCLIENT: le = new EAnchorClient; break; - default: - generic_dout(1) << "uh oh, unknown log event type " << type << dendl; - assert(0); - } - - // decode - le->decode_payload(bl, off); - - return le; -} - diff --git a/trunk/ceph/mds/LogEvent.h b/trunk/ceph/mds/LogEvent.h deleted file mode 100644 index 8d36a1d515c1c..0000000000000 --- a/trunk/ceph/mds/LogEvent.h +++ /dev/null @@ -1,97 +0,0 @@ -// -*- mode:C++; 
tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __LOGEVENT_H -#define __LOGEVENT_H - -#define EVENT_STRING 1 - -#define EVENT_SUBTREEMAP 2 -#define EVENT_EXPORT 3 -#define EVENT_IMPORTSTART 4 -#define EVENT_IMPORTFINISH 5 -#define EVENT_FRAGMENT 6 - -#define EVENT_SESSION 10 -#define EVENT_SESSIONS 11 - -#define EVENT_UPDATE 20 -#define EVENT_SLAVEUPDATE 21 -#define EVENT_OPEN 22 - -#define EVENT_PURGEFINISH 30 - -#define EVENT_ANCHOR 40 -#define EVENT_ANCHORCLIENT 41 - - - - -#include -using namespace std; - -#include "include/buffer.h" -#include "include/Context.h" - -class MDS; -class LogSegment; - -// generic log event -class LogEvent { - private: - int _type; - off_t _start_off,_end_off; - - friend class MDLog; - - public: - LogSegment *_segment; - - LogEvent(int t) : - _type(t), _start_off(0), _end_off(0), _segment(0) { } - virtual ~LogEvent() { } - - int get_type() { return _type; } - off_t get_start_off() { return _start_off; } - off_t get_end_off() { return _end_off; } - - // encoding - virtual void encode_payload(bufferlist& bl) = 0; - virtual void decode_payload(bufferlist& bl, int& off) = 0; - static LogEvent *decode(bufferlist &bl); - - - virtual void print(ostream& out) { - out << "event(" << _type << ")"; - } - - /*** live journal ***/ - /* update_segment() - adjust any state we need to in the LogSegment - */ - virtual void update_segment() { } - - /*** recovery ***/ - /* replay() - replay given event. this is idempotent. 
- */ - virtual void replay(MDS *m) { assert(0); } - - -}; - -inline ostream& operator<<(ostream& out, LogEvent& le) { - le.print(out); - return out; -} - -#endif diff --git a/trunk/ceph/mds/LogSegment.h b/trunk/ceph/mds/LogSegment.h deleted file mode 100644 index c4cf1d50897ff..0000000000000 --- a/trunk/ceph/mds/LogSegment.h +++ /dev/null @@ -1,69 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __LOGSEGMENT_H -#define __LOGSEGMENT_H - -#include "include/xlist.h" -#include "include/interval_set.h" -#include "include/Context.h" - -#include -using __gnu_cxx::hash_set; - -class CDir; -class CInode; -class CDentry; -class MDS; -class MDSlaveUpdate; - -class LogSegment { - public: - off_t offset, end; - int num_events; - - // dirty items - xlist dirty_dirfrags; - xlist dirty_inodes; - xlist dirty_dentries; - - xlist open_files; - xlist dirty_inode_mtimes; - - xlist slave_updates; - - //xlist purging_inodes; - map > purging_inodes; - - // committed anchor transactions - hash_set pending_commit_atids; - - // client request ids - map last_client_tids; - - // table version - version_t allocv; - version_t clientmapv; - version_t anchortablev; - - // try to expire - C_Gather *try_to_expire(MDS *mds); - - // cons - LogSegment(off_t off) : offset(off), end(off), num_events(0), - allocv(0), clientmapv(0), anchortablev(0) - { } -}; - -#endif diff --git a/trunk/ceph/mds/MDBalancer.h b/trunk/ceph/mds/MDBalancer.h deleted file mode 100644 index 819c69f0616c0..0000000000000 --- a/trunk/ceph/mds/MDBalancer.h +++ /dev/null @@ -1,118 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; 
indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __MDBALANCER_H -#define __MDBALANCER_H - -#include -#include -using std::list; -using std::map; - -#include -using namespace __gnu_cxx; - -#include "include/types.h" -#include "common/Clock.h" -#include "CInode.h" - - -class MDS; -class Message; -class MHeartbeat; -class CInode; -class Context; -class CDir; - -class MDBalancer { - protected: - MDS *mds; - int beat_epoch; - - int last_epoch_under; - int last_epoch_over; - - utime_t last_heartbeat; - utime_t last_fragment; - utime_t last_sample; - - - // todo - set split_queue; - - // per-epoch scatter/gathered info - hash_map mds_load; - hash_map mds_meta_load; - map > mds_import_map; - - // per-epoch state - double my_load, target_load; - map my_targets; - map imported; - map exported; - - double try_match(int ex, double& maxex, - int im, double& maxim); - double get_maxim(int im) { - return target_load - mds_meta_load[im] - imported[im]; - } - double get_maxex(int ex) { - return mds_meta_load[ex] - target_load - exported[ex]; - } - - public: - MDBalancer(MDS *m) : - mds(m), - beat_epoch(0), - last_epoch_under(0), last_epoch_over(0) { } - - mds_load_t get_load(); - - int proc_message(Message *m); - - void send_heartbeat(); - void handle_heartbeat(MHeartbeat *m); - - void tick(); - - void do_fragmenting(); - - void export_empties(); - void do_rebalance(int beat); - void find_exports(CDir *dir, - double amount, - list& exports, - double& have, - set& already_exporting, - utime_t now); - - - void subtract_export(class CDir *ex); - void add_import(class CDir *im); - - void hit_inode(utime_t now, class CInode *in, int type, int who=-1); - void 
hit_dir(utime_t now, class CDir *dir, int type, int who, double amount=1.0); - void hit_recursive(utime_t now, class CDir *dir, int type, double amount, double rd_adj); - - - void show_imports(bool external=false); - void dump_pop_map(); - -}; - - - -#endif diff --git a/trunk/ceph/mds/MDCache.h b/trunk/ceph/mds/MDCache.h deleted file mode 100644 index 35c324f150686..0000000000000 --- a/trunk/ceph/mds/MDCache.h +++ /dev/null @@ -1,726 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef __MDCACHE_H -#define __MDCACHE_H - -#include -#include -#include -#include -#include - -#include "include/types.h" -#include "include/filepath.h" - -#include "CInode.h" -#include "CDentry.h" -#include "CDir.h" -#include "include/Context.h" -#include "events/EMetaBlob.h" - -class MDS; -class Migrator; -class Renamer; - -class Logger; - -class Message; - -class MMDSResolve; -class MMDSResolveAck; -class MMDSCacheRejoin; -class MMDSCacheRejoinAck; -class MDiscover; -class MDiscoverReply; -class MCacheExpire; -class MDirUpdate; -class MDentryUnlink; -class MLock; - -class Message; -class MClientRequest; -class MMDSSlaveRequest; - -class MMDSFragmentNotify; - -class ESubtreeMap; - - -// MDCache - -//typedef const char* pchar; - - -struct PVList { - map ls; - - version_t add(MDSCacheObject* o, version_t v) { - return ls[o] = v; - } -}; - -/** active_request_t - * state we track for requests we are currently processing. - * mostly information about locks held, so that we can drop them all - * the request is finished or forwarded. see request_*(). 
- */ -struct MDRequest { - metareqid_t reqid; - - // -- i am a client (master) request - MClientRequest *client_request; // client request (if any) - - vector trace; // original path traversal. - CInode *ref; // reference inode. if there is only one, and its path is pinned. - - // -- i am a slave request - MMDSSlaveRequest *slave_request; // slave request (if one is pending; implies slave == true) - int slave_to_mds; // this is a slave request if >= 0. - - // -- misc -- - LogSegment *ls; // the log segment i'm committing to - utime_t now; - - // -- my pins and locks -- - // cache pins (so things don't expire) - set< MDSCacheObject* > pins; - set stickydirs; - - // auth pins - set< MDSCacheObject* > remote_auth_pins; - set< MDSCacheObject* > auth_pins; - - // held locks - set< SimpleLock* > rdlocks; // always local. - set< SimpleLock* > wrlocks; // always local. - set< SimpleLock* > xlocks; // local or remote. - set< SimpleLock*, SimpleLock::ptr_lt > locks; // full ordering - - // if this flag is set, do not attempt to acquire further locks. - // (useful for wrlock, which may be a moving auth target) - bool done_locking; - bool committing; - bool aborted; - - struct More { - set slaves; // mds nodes that have slave requests to me (implies client_request) - set waiting_on_slave; // peers i'm waiting for slavereq replies from. 
- - // for rename/link/unlink - set witnessed; // nodes who have journaled a RenamePrepare - map pvmap; - - // for rename - set extra_witnesses; // replica list from srcdn auth (rename) - version_t src_reanchor_atid; // src->dst - version_t dst_reanchor_atid; // dst->stray - bufferlist inode_import; - version_t inode_import_v; - CInode* destdn_was_remote_inode; - bool was_link_merge; - - map imported_client_map; - map > cap_imports; - - // called when slave commits or aborts - Context *slave_commit; - - More() : - src_reanchor_atid(0), dst_reanchor_atid(0), inode_import_v(0), - destdn_was_remote_inode(0), was_link_merge(false), - slave_commit(0) { } - } *_more; - - - // --------------------------------------------------- - MDRequest() : - client_request(0), ref(0), - slave_request(0), slave_to_mds(-1), - ls(0), - done_locking(false), committing(false), aborted(false), - _more(0) {} - MDRequest(metareqid_t ri, MClientRequest *req) : - reqid(ri), client_request(req), ref(0), - slave_request(0), slave_to_mds(-1), - ls(0), - done_locking(false), committing(false), aborted(false), - _more(0) {} - MDRequest(metareqid_t ri, int by) : - reqid(ri), client_request(0), ref(0), - slave_request(0), slave_to_mds(by), - ls(0), - done_locking(false), committing(false), aborted(false), - _more(0) {} - ~MDRequest() { - delete _more; - } - - bool is_master() { return slave_to_mds < 0; } - bool is_slave() { return slave_to_mds >= 0; } - - More* more() { - if (!_more) _more = new More(); - return _more; - } - - bool slave_did_prepare() { return more()->slave_commit; } - - - // pin items in cache - void pin(MDSCacheObject *o) { - if (pins.count(o) == 0) { - o->get(MDSCacheObject::PIN_REQUEST); - pins.insert(o); - } - } - void set_stickydirs(CInode *in) { - if (stickydirs.count(in) == 0) { - in->get_stickydirs(); - stickydirs.insert(in); - } - } - - // auth pins - bool is_auth_pinned(MDSCacheObject *object) { - return auth_pins.count(object) || remote_auth_pins.count(object); - } - void 
auth_pin(MDSCacheObject *object) { - if (!is_auth_pinned(object)) { - object->auth_pin(); - auth_pins.insert(object); - } - } - void auth_unpin(MDSCacheObject *object) { - assert(is_auth_pinned(object)); - object->auth_unpin(); - auth_pins.erase(object); - } - void drop_local_auth_pins() { - for (set::iterator it = auth_pins.begin(); - it != auth_pins.end(); - it++) { - assert((*it)->is_auth()); - (*it)->auth_unpin(); - } - auth_pins.clear(); - } -}; - -inline ostream& operator<<(ostream& out, MDRequest &mdr) -{ - out << "request(" << mdr.reqid; - //if (mdr.request) out << " " << *mdr.request; - if (mdr.is_slave()) out << " slave_to mds" << mdr.slave_to_mds; - if (mdr.client_request) out << " cr=" << mdr.client_request; - if (mdr.slave_request) out << " sr=" << mdr.slave_request; - out << ")"; - return out; -} - -struct MDSlaveUpdate { - EMetaBlob commit; - EMetaBlob rollback; - xlist::item xlistitem; - Context *waiter; - MDSlaveUpdate() : xlistitem(this), waiter(0) {} - MDSlaveUpdate(EMetaBlob c, EMetaBlob r, xlist &list) : - commit(c), rollback(r), - xlistitem(this), - waiter(0) { - list.push_back(&xlistitem); - } - ~MDSlaveUpdate() { - if (waiter) waiter->finish(0); - delete waiter; - } -}; - - -class MDCache { - public: - // my master - MDS *mds; - - // -- my cache -- - LRU lru; // dentry lru for expiring items from cache - protected: - hash_map inode_map; // map of inodes by ino - CInode *root; // root inode - CInode *stray; // my stray dir - - set base_inodes; // inodes < MDS_INO_BASE (root, stray, etc.) - - // -- discover -- - // waiters - map > > waiting_for_base_ino; - - // in process discovers, by mds. - // this is just enough info to kick any waiters in the event of a failure. - // FIXME: use pointers here instead of identifiers? 
- map > discover_dir; - map > discover_dir_sub; - - void discover_base_ino(inodeno_t want_ino, Context *onfinish, int from=-1); - void discover_dir_frag(CInode *base, frag_t approx_fg, Context *onfinish, - int from=-1); - void discover_path(CInode *base, filepath want_path, Context *onfinish, - bool want_xlocked=false, int from=-1); - void discover_path(CDir *base, filepath want_path, Context *onfinish, - bool want_xlocked=false); - void discover_ino(CDir *base, inodeno_t want_ino, Context *onfinish, - bool want_xlocked=false); - - void kick_discovers(int who); // after a failure. - - -public: - int get_num_inodes() { return inode_map.size(); } - int get_num_dentries() { return lru.lru_get_size(); } - - - // -- subtrees -- -protected: - map > subtrees; // nested bounds on subtrees. - - // adjust subtree auth specification - // dir->dir_auth - // imports/exports/nested_exports - // join/split subtrees as appropriate -public: - bool is_subtrees() { return !subtrees.empty(); } - void list_subtrees(list& ls); - void adjust_subtree_auth(CDir *root, pair auth); - void adjust_subtree_auth(CDir *root, int a, int b=CDIR_AUTH_UNKNOWN) { - adjust_subtree_auth(root, pair(a,b)); - } - void adjust_bounded_subtree_auth(CDir *dir, set& bounds, pair auth); - void adjust_bounded_subtree_auth(CDir *dir, set& bounds, int a) { - adjust_bounded_subtree_auth(dir, bounds, pair(a, CDIR_AUTH_UNKNOWN)); - } - void adjust_bounded_subtree_auth(CDir *dir, list& bounds, pair auth); - void adjust_bounded_subtree_auth(CDir *dir, list& bounds, int a) { - adjust_bounded_subtree_auth(dir, bounds, pair(a, CDIR_AUTH_UNKNOWN)); - } - void map_dirfrag_set(list& dfs, set& result); - void try_subtree_merge(CDir *root); - void try_subtree_merge_at(CDir *root); - void subtree_merge_writebehind_finish(CInode *in, LogSegment *ls); - void eval_subtree_root(CDir *dir); - CDir *get_subtree_root(CDir *dir); - void remove_subtree(CDir *dir); - void get_subtree_bounds(CDir *root, set& bounds); - void 
get_wouldbe_subtree_bounds(CDir *root, set& bounds); - void verify_subtree_bounds(CDir *root, const set& bounds); - void verify_subtree_bounds(CDir *root, const list& bounds); - - void adjust_subtree_after_rename(CInode *diri, CDir *olddir); - - void get_auth_subtrees(set& s); - void get_fullauth_subtrees(set& s); - - int num_subtrees(); - int num_subtrees_fullauth(); - int num_subtrees_fullnonauth(); - - -protected: - // delayed cache expire - map > delayed_expire; // subtree root -> expire msg - - - // -- requests -- -protected: - hash_map active_requests; - -public: - MDRequest* request_start(MClientRequest *req); - MDRequest* request_start_slave(metareqid_t rid, int by); - bool have_request(metareqid_t rid) { - return active_requests.count(rid); - } - MDRequest* request_get(metareqid_t rid); - void request_pin_ref(MDRequest *r, CInode *ref, vector& trace); - void request_finish(MDRequest *mdr); - void request_forward(MDRequest *mdr, int mds, int port=0); - void dispatch_request(MDRequest *mdr); - void request_forget_foreign_locks(MDRequest *mdr); - void request_cleanup(MDRequest *r); - - - // inode purging - map > purging; // inode -> newsize -> oldsize - map > purging_ls; - map > > waiting_for_purge; - - // -- recovery -- -protected: - set recovery_set; - -public: - void set_recovery_set(set& s); - void handle_mds_failure(int who); - void handle_mds_recovery(int who); - -protected: - // [resolve] - // from EImportStart w/o EImportFinish during journal replay - map > my_ambiguous_imports; - // from MMDSResolves - map > > other_ambiguous_imports; - - map > uncommitted_slave_updates; // for replay. - map ambiguous_slave_updates; // for log trimming. 
- map waiting_for_slave_update_commit; - friend class ESlaveUpdate; - - set wants_resolve; // nodes i need to send my resolve to - set got_resolve; // nodes i got resolves from - set need_resolve_ack; // nodes i need a resolve_ack from - - void handle_resolve(MMDSResolve *m); - void handle_resolve_ack(MMDSResolveAck *m); - void maybe_resolve_finish(); - void disambiguate_imports(); - void recalc_auth_bits(); -public: - // ambiguous imports - void add_ambiguous_import(dirfrag_t base, list& bounds); - void add_ambiguous_import(CDir *base, const set& bounds); - bool have_ambiguous_import(dirfrag_t base) { - return my_ambiguous_imports.count(base); - } - void cancel_ambiguous_import(dirfrag_t dirino); - void finish_ambiguous_import(dirfrag_t dirino); - void send_resolve(int who); - void send_resolve_now(int who); - void send_resolve_later(int who); - void maybe_send_pending_resolves(); - - ESubtreeMap *create_subtree_map(); - - -protected: - // [rejoin] - set rejoin_gather; // nodes from whom i need a rejoin - set rejoin_sent; // nodes i sent a rejoin to - set rejoin_ack_gather; // nodes from whom i need a rejoin ack - - map > cap_exports; // ino -> client -> capex - map cap_export_paths; - - map > > cap_imports; // ino -> client -> frommds -> capex - map cap_import_paths; - - set rejoin_undef_inodes; - - void rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin); - void handle_cache_rejoin(MMDSCacheRejoin *m); - void handle_cache_rejoin_weak(MMDSCacheRejoin *m); - CInode* rejoin_invent_inode(inodeno_t ino); - void handle_cache_rejoin_strong(MMDSCacheRejoin *m); - void rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack); - void handle_cache_rejoin_ack(MMDSCacheRejoin *m); - void handle_cache_rejoin_purge(MMDSCacheRejoin *m); - void handle_cache_rejoin_missing(MMDSCacheRejoin *m); - void handle_cache_rejoin_full(MMDSCacheRejoin *m); - void rejoin_send_acks(); - void rejoin_trim_undef_inodes(); -public: - void rejoin_gather_finish(); - void rejoin_send_rejoins(); - 
void rejoin_export_caps(inodeno_t ino, string& path, int client, inode_caps_reconnect_t& icr) { - cap_exports[ino][client] = icr; - cap_export_paths[ino] = path; - } - void rejoin_recovered_caps(inodeno_t ino, string& path, int client, inode_caps_reconnect_t& icr, - int frommds=-1) { - cap_imports[ino][client][frommds] = icr; - cap_import_paths[ino] = path; - } - void rejoin_import_cap(CInode *in, int client, inode_caps_reconnect_t& icr, int frommds); - - - friend class Locker; - friend class Migrator; - friend class Renamer; - friend class MDBalancer; - - - public: - - // subsystems - Migrator *migrator; - Renamer *renamer; - - public: - MDCache(MDS *m); - ~MDCache(); - - // debug - void log_stat(Logger *logger); - - // root inode - CInode *get_root() { return root; } - void set_root(CInode *r); - CInode *get_stray() { return stray; } - - // cache - void set_cache_size(size_t max) { lru.lru_set_max(max); } - size_t get_cache_size() { return lru.lru_get_size(); } - - // trimming - bool trim(int max = -1); // trim cache - void trim_dentry(CDentry *dn, map& expiremap); - void trim_dirfrag(CDir *dir, CDir *con, - map& expiremap); - void trim_inode(CDentry *dn, CInode *in, CDir *con, - map& expiremap); - void send_expire_messages(map& expiremap); - void trim_non_auth(); // trim out trimmable non-auth items - - // shutdown - void shutdown_start(); - void shutdown_check(); - bool shutdown_pass(); - bool shutdown_export_strays(); - bool shutdown_export_caps(); - bool shutdown(); // clear cache (ie at shutodwn) - - bool did_shutdown_log_cap; - - // inode_map - bool have_inode( inodeno_t ino ) { return inode_map.count(ino) ? 
true:false; } - CInode* get_inode( inodeno_t ino ) { - if (have_inode(ino)) - return inode_map[ino]; - return NULL; - } - CDir* get_dirfrag(dirfrag_t df) { - if (!have_inode(df.ino)) return NULL; - return inode_map[df.ino]->get_dirfrag(df.frag); - } - /* - void get_dirfrags_under(dirfrag_t df, list& ls) { - if (have_inode(df.ino)) - inode_map[df.ino]->get_dirfrags_under(df.frag, ls); - } - */ - - MDSCacheObject *get_object(MDSCacheObjectInfo &info); - - - - public: - CInode *create_inode(); - void add_inode(CInode *in); - - void remove_inode(CInode *in); - protected: - void touch_inode(CInode *in) { - if (in->get_parent_dn()) - touch_dentry(in->get_parent_dn()); - } - void touch_dentry(CDentry *dn) { - // touch ancestors - if (dn->get_dir()->get_inode()->get_parent_dn()) - touch_dentry(dn->get_dir()->get_inode()->get_parent_dn()); - - // touch me - if (dn->is_auth()) - lru.lru_touch(dn); - else - lru.lru_midtouch(dn); - } - - void inode_remove_replica(CInode *in, int rep); - void dentry_remove_replica(CDentry *dn, int rep); - - void rename_file(CDentry *srcdn, CDentry *destdn); - - public: - // inode purging - void purge_inode(CInode *in, off_t newsize, off_t oldsize, LogSegment *ls); - void _do_purge_inode(CInode *in, off_t newsize, off_t oldsize); - void purge_inode_finish(CInode *in, off_t newsize, off_t oldsize); - void purge_inode_finish_2(CInode *in, off_t newsize, off_t oldsize); - bool is_purging(CInode *in, off_t newsize, off_t oldsize) { - return purging.count(in) && purging[in].count(newsize); - } - void wait_for_purge(CInode *in, off_t newsize, Context *c) { - waiting_for_purge[in][newsize].push_back(c); - } - - void add_recovered_purge(CInode *in, off_t newsize, off_t oldsize, LogSegment *ls); - void remove_recovered_purge(CInode *in, off_t newsize, off_t oldsize); - void start_recovered_purges(); - - - public: - CDir *get_auth_container(CDir *in); - CDir *get_export_container(CDir *dir); - void find_nested_exports(CDir *dir, set& s); - void 
find_nested_exports_under(CDir *import, CDir *dir, set& s); - - public: - CInode *create_root_inode(); - void open_root(Context *c); - CInode *create_stray_inode(int whose=-1); - void open_local_stray(); - void open_foreign_stray(int who, Context *c); - CDentry *get_or_create_stray_dentry(CInode *in); - - Context *_get_waiter(MDRequest *mdr, Message *req); - int path_traverse(MDRequest *mdr, Message *req, filepath& path, - vector& trace, bool follow_trailing_sym, - int onfail); - bool path_is_mine(filepath& path); - bool path_is_mine(string& p) { - filepath path(p); - return path_is_mine(path); - } - CDir *path_traverse_to_dir(filepath& path); - - void open_remote_dirfrag(CInode *diri, frag_t fg, Context *fin); - CInode *get_dentry_inode(CDentry *dn, MDRequest *mdr); - void open_remote_ino(inodeno_t ino, MDRequest *mdr, Context *fin); - void open_remote_ino_2(inodeno_t ino, MDRequest *mdr, - vector& anchortrace, - Context *onfinish); - - C_Gather *parallel_fetch(map& pathmap); - - void make_trace(vector& trace, CInode *in); - - // -- anchors -- -public: - void anchor_create(MDRequest *mdr, CInode *in, Context *onfinish); - void anchor_destroy(CInode *in, Context *onfinish); -protected: - void _anchor_create_prepared(CInode *in, version_t atid); - void _anchor_create_logged(CInode *in, version_t atid, LogSegment *ls); - void _anchor_destroy_prepared(CInode *in, version_t atid); - void _anchor_destroy_logged(CInode *in, version_t atid, LogSegment *ls); - - friend class C_MDC_AnchorCreatePrepared; - friend class C_MDC_AnchorCreateLogged; - friend class C_MDC_AnchorDestroyPrepared; - friend class C_MDC_AnchorDestroyLogged; - - // -- stray -- -public: - void eval_stray(CDentry *dn); - void eval_remote(CDentry *dn); -protected: - void _purge_stray(CDentry *dn); - void _purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls); - friend class C_MDC_PurgeStray; - void reintegrate_stray(CDentry *dn, CDentry *rlink); - void migrate_stray(CDentry *dn, int src, int dest); 
- - - // == messages == - public: - void dispatch(Message *m); - - protected: - // -- replicas -- - void handle_discover(MDiscover *dis); - void handle_discover_reply(MDiscoverReply *m); - - CDir* add_replica_dir(CInode *diri, - frag_t fg, CDirDiscover& dis, - int from, - list& finished); - CDir* forge_replica_dir(CInode *diri, frag_t fg, int from); - - CDentry *add_replica_dentry(CDir *dir, CDentryDiscover &dis, list& finished); -public: // for Server::handle_slave_rename_prep - CInode *add_replica_inode(CInodeDiscover& dis, CDentry *dn, list& finished); - -public: - CDentry *add_replica_stray(bufferlist &bl, CInode *strayin, int from); -protected: - - - - // -- namespace -- - void handle_dentry_unlink(MDentryUnlink *m); - - - // -- fragmenting -- -private: - void adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits, - list& frags, list& waiters); - friend class EFragment; - -public: - void split_dir(CDir *dir, int byn); - -private: - void fragment_freeze(CInode *diri, list& startfrags, frag_t basefrag, int bits); - void fragment_mark_and_complete(CInode *diri, list& startfrags, frag_t basefrag, int bits); - void fragment_go(CInode *diri, list& startfrags, frag_t basefrag, int bits); - void fragment_stored(CInode *diri, frag_t basefrag, int bits, list& resultfrags); - void fragment_logged(CInode *diri, frag_t basefrag, int bits, list& resultfrags, vector& pvs, LogSegment *ls); - friend class C_MDC_FragmentGo; - friend class C_MDC_FragmentMarking; - friend class C_MDC_FragmentStored; - friend class C_MDC_FragmentLogged; - - void handle_fragment_notify(MMDSFragmentNotify *m); - - - // -- updates -- - //int send_inode_updates(CInode *in); - //void handle_inode_update(MInodeUpdate *m); - - int send_dir_updates(CDir *in, bool bcast=false); - void handle_dir_update(MDirUpdate *m); - - // -- cache expiration -- - void handle_cache_expire(MCacheExpire *m); - void process_delayed_expire(CDir *dir); - void discard_delayed_expire(CDir *dir); - - - // == crap fns == - 
public: - void show_cache(); - void dump_cache(); - void show_subtrees(int dbl=10); - - CInode *hack_pick_random_inode() { - assert(!inode_map.empty()); - int n = rand() % inode_map.size(); - hash_map::iterator p = inode_map.begin(); - while (n--) p++; - return p->second; - } - -}; - -class C_MDS_RetryRequest : public Context { - MDCache *cache; - MDRequest *mdr; - public: - C_MDS_RetryRequest(MDCache *c, MDRequest *r) : cache(c), mdr(r) {} - virtual void finish(int r) { - cache->dispatch_request(mdr); - } -}; - -#endif diff --git a/trunk/ceph/mds/MDLog.h b/trunk/ceph/mds/MDLog.h deleted file mode 100644 index c958585b86a48..0000000000000 --- a/trunk/ceph/mds/MDLog.h +++ /dev/null @@ -1,198 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MDLOG_H -#define __MDLOG_H - -#include "include/types.h" -#include "include/Context.h" - -#include "common/Thread.h" -#include "common/Cond.h" - -#include "LogSegment.h" - -#include - -//#include -//using __gnu_cxx::hash_mapset; - -class Journaler; -class LogEvent; -class MDS; -class LogSegment; -class ESubtreeMap; - -class Logger; - -#include -using std::map; - - -class MDLog { - protected: - MDS *mds; - int num_events; // in events - int max_events; - int max_segments; - - int unflushed; - - bool capped; - - inode_t log_inode; - Journaler *journaler; - - Logger *logger; - - - // -- replay -- - Cond replay_cond; - - class ReplayThread : public Thread { - MDLog *log; - public: - ReplayThread(MDLog *l) : log(l) {} - void* entry() { - log->_replay_thread(); - return 0; - } - } replay_thread; - - friend class ReplayThread; - friend class C_MDL_Replay; - - list waitfor_replay; - - void _replay(); // old way - void _replay_thread(); // new way - - - // -- segments -- - map segments; - set expiring_segments; - set expired_segments; - int expiring_events; - int expired_events; - - class C_MDL_WroteSubtreeMap : public Context { - MDLog *mdlog; - off_t off; - public: - C_MDL_WroteSubtreeMap(MDLog *l, off_t o) : mdlog(l), off(o) { } - void finish(int r) { - mdlog->_logged_subtree_map(off); - } - }; - void _logged_subtree_map(off_t off); - - - // -- subtreemaps -- - bool writing_subtree_map; // one is being written now - - friend class ESubtreeMap; - friend class C_MDS_WroteImportMap; - friend class MDCache; - -public: - off_t get_last_segment_offset() { - assert(!segments.empty()); - return segments.rbegin()->first; - } - - -private: - void init_journaler(); - -public: - void reopen_logger(utime_t start, bool append=false); - - // replay state - map > pending_exports; - - - -public: - MDLog(MDS *m) : mds(m), - num_events(0), - max_events(g_conf.mds_log_max_events), - max_segments(g_conf.mds_log_max_segments), - unflushed(0), - capped(false), - 
journaler(0), - logger(0), - replay_thread(this), - expiring_events(0), expired_events(0), - writing_subtree_map(false) { - } - ~MDLog(); - - - void start_new_segment(Context *onsync=0); - LogSegment *get_current_segment() { - return segments.empty() ? 0:segments.rbegin()->second; - } - - - void flush_logger(); - - size_t get_num_events() { return num_events; } - void set_max_events(int m) { max_events = m; } - size_t get_num_segments() { return segments.size(); } - void set_max_segments(int m) { max_segments = m; } - - off_t get_read_pos(); - off_t get_write_pos(); - bool empty() { return segments.empty(); } - - bool is_capped() { return capped; } - void cap(); - - void submit_entry( LogEvent *e, Context *c = 0 ); - void wait_for_sync( Context *c ); - void flush(); - bool is_flushed() { - return unflushed == 0; - } - -private: - class C_MaybeExpiredSegment : public Context { - MDLog *mdlog; - LogSegment *ls; - public: - C_MaybeExpiredSegment(MDLog *mdl, LogSegment *s) : mdlog(mdl), ls(s) {} - void finish(int res) { - mdlog->_maybe_expired(ls); - } - }; - - void try_expire(LogSegment *ls); - void _maybe_expired(LogSegment *ls); - void _expired(LogSegment *ls); - -public: - void trim(); - -private: - void write_head(Context *onfinish); - -public: - void create(Context *onfinish); // fresh, empty log! - void open(Context *onopen); // append() or replay() to follow! 
- void append(); - void replay(Context *onfinish); -}; - -#endif diff --git a/trunk/ceph/mds/Migrator.h b/trunk/ceph/mds/Migrator.h deleted file mode 100644 index b32a6a1a4f33f..0000000000000 --- a/trunk/ceph/mds/Migrator.h +++ /dev/null @@ -1,277 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_MIGRATOR_H -#define __MDS_MIGRATOR_H - -#include "include/types.h" - -#include -#include -#include -using std::map; -using std::list; -using std::set; - - -class MDS; -class CDir; -class CInode; -class CDentry; - -class MExportDirDiscover; -class MExportDirDiscoverAck; -class MExportDirCancel; -class MExportDirPrep; -class MExportDirPrepAck; -class MExportDir; -class MExportDirAck; -class MExportDirNotify; -class MExportDirNotifyAck; -class MExportDirFinish; - -class MExportCaps; -class MExportCapsAck; - -class EImportStart; - - -class Migrator { -private: - MDS *mds; - MDCache *cache; - - // -- exports -- -public: - // export stages. used to clean up intelligently if there's a failure. 
- const static int EXPORT_DISCOVERING = 1; // dest is disovering export dir - const static int EXPORT_FREEZING = 2; // we're freezing the dir tree - const static int EXPORT_PREPPING = 4; // sending dest spanning tree to export bounds - const static int EXPORT_WARNING = 5; // warning bystanders of dir_auth_pending - const static int EXPORT_EXPORTING = 6; // sent actual export, waiting for ack - const static int EXPORT_LOGGINGFINISH = 7; // logging EExportFinish - const static int EXPORT_NOTIFYING = 8; // waiting for notifyacks - const static int EXPORT_ABORTING = 9; // notifying bystanders of abort - static const char *get_export_statename(int s) { - switch (s) { - case EXPORT_DISCOVERING: return "discovering"; - case EXPORT_FREEZING: return "freezing"; - case EXPORT_PREPPING: return "prepping"; - case EXPORT_WARNING: return "warning"; - case EXPORT_EXPORTING: return "exporting"; - case EXPORT_LOGGINGFINISH: return "loggingfinish"; - case EXPORT_NOTIFYING: return "notifying"; - case EXPORT_ABORTING: return "aborting"; - default: assert(0); return 0; - } - } - -protected: - // export fun - map export_state; - map export_peer; - //map > export_data; // only during EXPORTING state - map > export_warning_ack_waiting; - map > export_notify_ack_waiting; - - map > export_finish_waiters; - - list< pair > export_queue; - - // -- imports -- -public: - const static int IMPORT_DISCOVERING = 1; // waiting for prep - const static int IMPORT_DISCOVERED = 2; // waiting for prep - const static int IMPORT_PREPPING = 3; // opening dirs on bounds - const static int IMPORT_PREPPED = 4; // opened bounds, waiting for import - const static int IMPORT_LOGGINGSTART = 5; // got import, logging EImportStart - const static int IMPORT_ACKING = 6; // logged EImportStart, sent ack, waiting for finish - const static int IMPORT_ABORTING = 8; // notifying bystanders of an abort before unfreezing - static const char *get_import_statename(int s) { - switch (s) { - case IMPORT_DISCOVERING: return 
"discovering"; - case IMPORT_DISCOVERED: return "discovered"; - case IMPORT_PREPPING: return "prepping"; - case IMPORT_PREPPED: return "prepped"; - case IMPORT_LOGGINGSTART: return "loggingstart"; - case IMPORT_ACKING: return "acking"; - case IMPORT_ABORTING: return "aborting"; - default: assert(0); return 0; - } - } - -protected: - map import_state; // FIXME make these dirfrags - map import_peer; - map > import_bystanders; - map > import_bound_ls; - map > import_updated_scatterlocks; - map > > import_caps; - - -public: - // -- cons -- - Migrator(MDS *m, MDCache *c) : mds(m), cache(c) {} - - void dispatch(Message*); - - void show_importing(); - void show_exporting(); - - // -- status -- - int is_exporting(CDir *dir) { - if (export_state.count(dir)) return export_state[dir]; - return 0; - } - bool is_exporting() { return !export_state.empty(); } - int is_importing(dirfrag_t df) { - if (import_state.count(df)) return import_state[df]; - return 0; - } - bool is_importing() { return !import_state.empty(); } - - int get_import_state(dirfrag_t df) { - assert(import_state.count(df)); - return import_state[df]; - } - int get_import_peer(dirfrag_t df) { - assert(import_peer.count(df)); - return import_peer[df]; - } - - int get_export_state(CDir *dir) { - assert(export_state.count(dir)); - return export_state[dir]; - } - // this returns true if we are export @dir, - // and are not waiting for @who to be - // be warned of ambiguous auth. - // only returns meaningful results during EXPORT_WARNING state. 
- bool export_has_warned(CDir *dir, int who) { - assert(is_exporting(dir)); - assert(export_state[dir] == EXPORT_WARNING); - return (export_warning_ack_waiting[dir].count(who) == 0); - } - - - // -- misc -- - void handle_mds_failure_or_stop(int who); - - void audit(); - - // -- import/export -- - // exporter - public: - void export_dir(CDir *dir, int dest); - void export_empty_import(CDir *dir); - - void export_dir_nicely(CDir *dir, int dest); - void maybe_do_queued_export(); - void clear_export_queue() { - export_queue.clear(); - } - - void encode_export_inode(CInode *in, bufferlist& bl, - map& exported_client_map); - void encode_export_inode_caps(CInode *in, bufferlist& bl, - map& exported_client_map); - void finish_export_inode(CInode *in, utime_t now, list& finished); - void finish_export_inode_caps(CInode *in); - - int encode_export_dir(bufferlist& exportbl, - CDir *dir, - map& exported_client_map, - utime_t now); - void finish_export_dir(CDir *dir, list& finished, utime_t now); - - void add_export_finish_waiter(CDir *dir, Context *c) { - export_finish_waiters[dir].push_back(c); - } - void clear_export_proxy_pins(CDir *dir); - - void export_caps(CInode *in); - - protected: - void handle_export_discover_ack(MExportDirDiscoverAck *m); - void export_frozen(CDir *dir); - void handle_export_prep_ack(MExportDirPrepAck *m); - void export_go(CDir *dir); - void export_go_synced(CDir *dir); - void export_reverse(CDir *dir); - void handle_export_ack(MExportDirAck *m); - void export_logged_finish(CDir *dir); - void handle_export_notify_ack(MExportDirNotifyAck *m); - void export_finish(CDir *dir); - - void handle_export_caps_ack(MExportCapsAck *m); - - - friend class C_MDC_ExportFreeze; - friend class C_MDS_ExportFinishLogged; - friend class C_M_ExportGo; - - // importer - void handle_export_discover(MExportDirDiscover *m); - void handle_export_cancel(MExportDirCancel *m); - void handle_export_prep(MExportDirPrep *m); - void handle_export_dir(MExportDir *m); - -public: - 
void decode_import_inode(CDentry *dn, bufferlist::iterator& blp, int oldauth, - LogSegment *ls, - map >& cap_imports, - list& updated_scatterlocks); - void decode_import_inode_caps(CInode *in, - bufferlist::iterator &blp, - map >& cap_imports); - void finish_import_inode_caps(CInode *in, int from, map &cap_map); - int decode_import_dir(bufferlist::iterator& blp, - int oldauth, - CDir *import_root, - EImportStart *le, - LogSegment *ls, - map >& cap_imports, - list& updated_scatterlocks); - -public: - void import_reverse(CDir *dir); -protected: - void import_remove_pins(CDir *dir, set& bounds); - void import_reverse_unfreeze(CDir *dir); - void import_reverse_final(CDir *dir); - void import_notify_abort(CDir *dir, set& bounds); - void import_logged_start(CDir *dir, int from, - map &imported_client_map); - void handle_export_finish(MExportDirFinish *m); -public: - void import_finish(CDir *dir); -protected: - - void handle_export_caps(MExportCaps *m); - void logged_import_caps(CInode *in, - int from, - map >& cap_imports); - - - friend class C_MDS_ImportDirLoggedStart; - friend class C_MDS_ImportDirLoggedFinish; - friend class C_M_LoggedImportCaps; - - // bystander - void handle_export_notify(MExportDirNotify *m); - -}; - - -#endif diff --git a/trunk/ceph/mds/ScatterLock.h b/trunk/ceph/mds/ScatterLock.h deleted file mode 100644 index 24a1361f82d68..0000000000000 --- a/trunk/ceph/mds/ScatterLock.h +++ /dev/null @@ -1,183 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __SCATTERLOCK_H -#define __SCATTERLOCK_H - -#include "SimpleLock.h" - - -// lock state machine states: -// Sync -- Lock -- sCatter -// Tempsync _/ -// auth repl -#define LOCK_SYNC__ // R . R . rdlocks allowed on auth and replicas -#define LOCK_GLOCKS -20 // r . r . waiting for replicas+rdlocks (auth), or rdlocks to release (replica) -#define LOCK_GSCATTERS -28 // r . r . - -#define LOCK_GSYNCL__ // . w LOCK on replica. -#define LOCK_LOCK__ // . W . . -#define LOCK_GTEMPSYNCL -21 // . w LOCK on replica. - -#define LOCK_GLOCKC -22 // . wp . wp waiting for replicas+wrlocks (auth), or wrlocks to release (replica) -#define LOCK_SCATTER 23 // . Wp . WP mtime updates on replicas allowed, no reads. stable here. -#define LOCK_GTEMPSYNCC -24 // . wp . wp GLOCKC|LOCK on replica - -#define LOCK_GSCATTERT -25 // r . LOCK on replica. -#define LOCK_GLOCKT -26 // r . LOCK on replica. -#define LOCK_TEMPSYNC 27 // R . LOCK on replica. - - -inline const char *get_scatterlock_state_name(int s) { - switch(s) { - case LOCK_SYNC: return "Sync"; - case LOCK_GLOCKS: return "gLockS"; - case LOCK_GSCATTERS: return "gScatterS"; - - case LOCK_GSYNCL: return "gSyncL"; - case LOCK_LOCK: return "Lock"; - case LOCK_GTEMPSYNCL: return "gTempsyncL"; - - case LOCK_GLOCKC: return "gLockC"; - case LOCK_SCATTER: return "sCatter"; - case LOCK_GTEMPSYNCC: return "gTempsyncC"; - - case LOCK_GSCATTERT: return "gsCatterT"; - case LOCK_GLOCKT: return "gLockT"; - case LOCK_TEMPSYNC: return "Tempsync"; - - default: assert(0); return 0; - } -} - -class ScatterLock : public SimpleLock { - int num_wrlock; - bool updated; - utime_t last_scatter; - -public: - xlist::item xlistitem_autoscattered; - - ScatterLock(MDSCacheObject *o, int t, int wo) : - SimpleLock(o, t, wo), - num_wrlock(0), - updated(false), - xlistitem_autoscattered(this) {} - - int get_replica_state() { - switch (state) { - case LOCK_SYNC: - return LOCK_SYNC; - - case LOCK_GSCATTERS: // hrm. 
- case LOCK_GLOCKS: - case LOCK_GSYNCL: - case LOCK_LOCK: - case LOCK_GTEMPSYNCL: - case LOCK_GLOCKC: - return LOCK_LOCK; - - case LOCK_SCATTER: - return LOCK_SCATTER; - - case LOCK_GTEMPSYNCC: - case LOCK_GSCATTERT: - case LOCK_GLOCKT: - case LOCK_TEMPSYNC: - return LOCK_LOCK; - default: - assert(0); - return 0; - } - } - - void set_updated() { - if (!updated) { - parent->get(MDSCacheObject::PIN_DIRTYSCATTERED); - updated = true; - } - } - void clear_updated() { - if (updated) { - parent->put(MDSCacheObject::PIN_DIRTYSCATTERED); - updated = false; - parent->clear_dirty_scattered(type); - } - } - bool is_updated() { return updated; } - - void set_last_scatter(utime_t t) { last_scatter = t; } - utime_t get_last_scatter() { return last_scatter; } - - void replicate_relax() { - } - - void export_twiddle() { - clear_gather(); - state = get_replica_state(); - } - - // rdlock - bool can_rdlock(MDRequest *mdr) { - return state == LOCK_SYNC || state == LOCK_TEMPSYNC; - } - bool can_rdlock_soon() { - return state == LOCK_GTEMPSYNCC; - } - - // xlock - bool can_xlock_soon() { - if (parent->is_auth()) - return (state == LOCK_GLOCKC || - state == LOCK_GLOCKS); - else - return false; - } - - // wrlock - bool can_wrlock() { - return state == LOCK_SCATTER || state == LOCK_LOCK; - } - void get_wrlock() { - assert(can_wrlock()); - if (num_wrlock == 0) parent->get(MDSCacheObject::PIN_LOCK); - ++num_wrlock; - } - void put_wrlock() { - --num_wrlock; - if (num_wrlock == 0) parent->put(MDSCacheObject::PIN_LOCK); - } - bool is_wrlocked() { return num_wrlock > 0; } - int get_num_wrlocks() { return num_wrlock; } - - void print(ostream& out) { - out << "("; - out << get_lock_type_name(get_type()) << " "; - out << get_scatterlock_state_name(get_state()); - if (!get_gather_set().empty()) out << " g=" << get_gather_set(); - if (is_rdlocked()) - out << " r=" << get_num_rdlocks(); - if (is_xlocked()) - out << " x=" << get_xlocked_by(); - if (is_wrlocked()) - out << " wr=" << get_num_wrlocks(); - 
if (updated) - out << " updated"; - out << ")"; - } - -}; - -#endif diff --git a/trunk/ceph/mds/Server.h b/trunk/ceph/mds/Server.h deleted file mode 100644 index d2252f33df7bc..0000000000000 --- a/trunk/ceph/mds/Server.h +++ /dev/null @@ -1,187 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_SERVER_H -#define __MDS_SERVER_H - -#include "MDS.h" - -class Logger; -class LogEvent; -class C_MDS_rename_finish; -class MDRequest; -class EMetaBlob; -class EUpdate; -class PVList; -class MMDSSlaveRequest; - -class Server { - MDS *mds; - MDCache *mdcache; - MDLog *mdlog; - Messenger *messenger; - Logger *logger; - -public: - Server(MDS *m) : - mds(m), - mdcache(mds->mdcache), mdlog(mds->mdlog), - messenger(mds->messenger), - logger(0) { - } - ~Server() { - delete logger; - } - - void reopen_logger(utime_t start, bool append); - - // message handler - void dispatch(Message *m); - - - // -- sessions and recovery -- - utime_t reconnect_start; - set client_reconnect_gather; // clients i need a reconnect msg from. 
- set reconnected_caps; - - void handle_client_session(class MClientSession *m); - void _session_logged(entity_inst_t ci, bool open, version_t cmapv); - void prepare_force_open_sessions(map &cm); - void finish_force_open_sessions(map &cm); - void terminate_sessions(); - void reconnect_clients(); - void handle_client_reconnect(class MClientReconnect *m); - void process_reconnect_cap(CInode *in, int from, inode_caps_reconnect_t& capinfo); - void add_reconnected_cap_inode(CInode *in) { - reconnected_caps.insert(in); - } - void process_reconnected_caps(); - void client_reconnect_failure(int from); - void reconnect_gather_finish(); - - - // -- requests -- - void handle_client_request(MClientRequest *m); - - void dispatch_client_request(MDRequest *mdr); - void reply_request(MDRequest *mdr, int r = 0, CInode *tracei = 0); - void reply_request(MDRequest *mdr, MClientReply *reply, CInode *tracei); - - void handle_slave_request(MMDSSlaveRequest *m); - void dispatch_slave_request(MDRequest *mdr); - void handle_slave_auth_pin(MDRequest *mdr); - void handle_slave_auth_pin_ack(MDRequest *mdr, MMDSSlaveRequest *ack); - - // some helpers - CDir *validate_dentry_dir(MDRequest *mdr, CInode *diri, const string& dname); - CDir *traverse_to_auth_dir(MDRequest *mdr, vector &trace, filepath refpath); - CDentry *prepare_null_dentry(MDRequest *mdr, CDir *dir, const string& dname, bool okexist=false); - CInode* prepare_new_inode(MDRequest *mdr, CDir *dir); - - CInode* rdlock_path_pin_ref(MDRequest *mdr, bool want_auth); - CDentry* rdlock_path_xlock_dentry(MDRequest *mdr, bool okexist, bool mustexist); - - CDir* try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequest *mdr); - - version_t predirty_dn_diri(MDRequest *mdr, CDentry *dn, class EMetaBlob *blob); - void dirty_dn_diri(MDRequest *mdr, CDentry *dn, version_t dirpv); - - - // requests on existing inodes. 
- void handle_client_stat(MDRequest *mdr); - void handle_client_utime(MDRequest *mdr); - void handle_client_chmod(MDRequest *mdr); - void handle_client_chown(MDRequest *mdr); - void handle_client_readdir(MDRequest *mdr); - void handle_client_truncate(MDRequest *mdr); - void handle_client_fsync(MDRequest *mdr); - - // open - void handle_client_open(MDRequest *mdr); - void handle_client_openc(MDRequest *mdr); // O_CREAT variant. - void handle_client_opent(MDRequest *mdr); // O_TRUNC variant. - void _do_open(MDRequest *mdr, CInode *ref); - - set journal_open_queue; // to be journal - list journal_open_waiters; - void queue_journal_open(CInode *in); - void add_journal_open_waiter(Context *c) { - journal_open_waiters.push_back(c); - } - void maybe_journal_opens() { - if (journal_open_queue.size() >= (unsigned)g_conf.mds_log_eopen_size) - journal_opens(); - } - void journal_opens(); - - // namespace changes - void handle_client_mknod(MDRequest *mdr); - void handle_client_mkdir(MDRequest *mdr); - void handle_client_symlink(MDRequest *mdr); - - // link - void handle_client_link(MDRequest *mdr); - void _link_local(MDRequest *mdr, CDentry *dn, CInode *targeti); - void _link_local_finish(MDRequest *mdr, - CDentry *dn, CInode *targeti, - version_t, version_t, version_t); - - void _link_remote(MDRequest *mdr, CDentry *dn, CInode *targeti); - void _link_remote_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, - version_t, version_t); - - void handle_slave_link_prep(MDRequest *mdr); - void _logged_slave_link(MDRequest *mdr, CInode *targeti, utime_t old_ctime, bool inc); - void _commit_slave_link(MDRequest *mdr, int r, CInode *targeti, - utime_t old_ctime, version_t old_version, bool inc); - void handle_slave_link_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m); - - // unlink - void handle_client_unlink(MDRequest *mdr); - bool _verify_rmdir(MDRequest *mdr, CInode *rmdiri); - void _unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn); - void _unlink_local_finish(MDRequest 
*mdr, - CDentry *dn, CDentry *straydn, - version_t, version_t); - - void _unlink_remote(MDRequest *mdr, CDentry *dn); - void _unlink_remote_finish(MDRequest *mdr, - CDentry *dn, - version_t, version_t); - - // rename - void handle_client_rename(MDRequest *mdr); - void _rename_finish(MDRequest *mdr, - CDentry *srcdn, CDentry *destdn, CDentry *straydn); - - // helpers - void _rename_prepare_witness(MDRequest *mdr, int who, - CDentry *srcdn, CDentry *destdn, CDentry *straydn); - void _rename_prepare(MDRequest *mdr, - EMetaBlob *metablob, bufferlist *client_map_bl, - CDentry *srcdn, CDentry *destdn, CDentry *straydn); - void _rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn); - - // slaving - void handle_slave_rename_prep(MDRequest *mdr); - void handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m); - void _logged_slave_rename(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn); - void _commit_slave_rename(MDRequest *mdr, int r, CDentry *srcdn, CDentry *destdn, CDentry *straydn); - -}; - - - - -#endif diff --git a/trunk/ceph/mds/SimpleLock.h b/trunk/ceph/mds/SimpleLock.h deleted file mode 100644 index e785e2c36d50c..0000000000000 --- a/trunk/ceph/mds/SimpleLock.h +++ /dev/null @@ -1,301 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __SIMPLELOCK_H -#define __SIMPLELOCK_H - -// -- lock types -- -// NOTE: this also defines the lock ordering! 
-#define LOCK_OTYPE_DN 1 - -#define LOCK_OTYPE_IVERSION 2 -#define LOCK_OTYPE_IFILE 3 -#define LOCK_OTYPE_IAUTH 4 -#define LOCK_OTYPE_ILINK 5 -#define LOCK_OTYPE_IDIRFRAGTREE 6 -#define LOCK_OTYPE_IDIR 7 - -//#define LOCK_OTYPE_DIR 7 // not used - -inline const char *get_lock_type_name(int t) { - switch (t) { - case LOCK_OTYPE_DN: return "dn"; - case LOCK_OTYPE_IVERSION: return "iversion"; - case LOCK_OTYPE_IFILE: return "ifile"; - case LOCK_OTYPE_IAUTH: return "iauth"; - case LOCK_OTYPE_ILINK: return "ilink"; - case LOCK_OTYPE_IDIRFRAGTREE: return "idft"; - case LOCK_OTYPE_IDIR: return "idir"; - default: assert(0); return 0; - } -} - -// -- lock states -- -// sync <-> lock -#define LOCK_UNDEF 0 -// auth rep -#define LOCK_SYNC 1 // AR R . R . -#define LOCK_LOCK 2 // AR R W . . -#define LOCK_GLOCKR -3 // AR R . . . -#define LOCK_REMOTEXLOCK -50 // on NON-auth - -inline const char *get_simplelock_state_name(int n) { - switch (n) { - case LOCK_UNDEF: return "UNDEF"; - case LOCK_SYNC: return "sync"; - case LOCK_LOCK: return "lock"; - case LOCK_GLOCKR: return "glockr"; - case LOCK_REMOTEXLOCK: return "remote_xlock"; - default: assert(0); return 0; - } -} - -class MDRequest; - -class SimpleLock { -public: - static const int WAIT_RD = (1<<0); // to read - static const int WAIT_WR = (1<<1); // to write - static const int WAIT_XLOCK = (1<<2); // to xlock (** dup) - static const int WAIT_STABLE = (1<<2); // for a stable state - static const int WAIT_REMOTEXLOCK = (1<<3); // for a remote xlock - static const int WAIT_BITS = 4; - static const int WAIT_ALL = ((1< gather_set; // auth - - // local state - int num_rdlock; - MDRequest *xlock_by; - -public: - SimpleLock(MDSCacheObject *o, int t, int wo) : - parent(o), type(t), wait_offset(wo), - state(LOCK_SYNC), - num_rdlock(0), xlock_by(0) { } - virtual ~SimpleLock() {} - - // parent - MDSCacheObject *get_parent() { return parent; } - int get_type() { return type; } - - struct ptr_lt { - bool operator()(const SimpleLock* l, const 
SimpleLock* r) const { - // first sort by object type (dn < inode) - if ((l->type>LOCK_OTYPE_DN) < (r->type>LOCK_OTYPE_DN)) return true; - if ((l->type>LOCK_OTYPE_DN) == (r->type>LOCK_OTYPE_DN)) { - // then sort by object - if (l->parent->is_lt(r->parent)) return true; - if (l->parent == r->parent) { - // then sort by (inode) lock type - if (l->type < r->type) return true; - } - } - return false; - } - }; - - void decode_locked_state(bufferlist& bl) { - parent->decode_lock_state(type, bl); - } - void encode_locked_state(bufferlist& bl) { - parent->encode_lock_state(type, bl); - } - void finish_waiters(int mask, int r=0) { - parent->finish_waiting(mask << wait_offset, r); - } - void take_waiting(int mask, list& ls) { - parent->take_waiting(mask << wait_offset, ls); - } - void add_waiter(int mask, Context *c) { - parent->add_waiter(mask << wait_offset, c); - } - bool is_waiter_for(int mask) { - return parent->is_waiter_for(mask << wait_offset); - } - - - - // state - int get_state() { return state; } - int set_state(int s) { - state = s; - assert(!is_stable() || gather_set.size() == 0); // gather should be empty in stable states. 
- return s; - }; - bool is_stable() { - return state >= 0; - } - - - // gather set - const set& get_gather_set() { return gather_set; } - void init_gather() { - for (map::const_iterator p = parent->replicas_begin(); - p != parent->replicas_end(); - ++p) - gather_set.insert(p->first); - } - bool is_gathering() { return !gather_set.empty(); } - bool is_gathering(int i) { - return gather_set.count(i); - } - void clear_gather() { - gather_set.clear(); - } - void remove_gather(int i) { - gather_set.erase(i); - } - - // ref counting - bool is_rdlocked() { return num_rdlock > 0; } - int get_rdlock() { - if (!num_rdlock) parent->get(MDSCacheObject::PIN_LOCK); - return ++num_rdlock; - } - int put_rdlock() { - assert(num_rdlock>0); - --num_rdlock; - if (num_rdlock == 0) parent->put(MDSCacheObject::PIN_LOCK); - return num_rdlock; - } - int get_num_rdlocks() { return num_rdlock; } - - void get_xlock(MDRequest *who) { - assert(xlock_by == 0); - parent->get(MDSCacheObject::PIN_LOCK); - xlock_by = who; - } - void put_xlock() { - assert(xlock_by); - parent->put(MDSCacheObject::PIN_LOCK); - xlock_by = 0; - } - bool is_xlocked() { return xlock_by ? true:false; } - bool is_xlocked_by_other(MDRequest *mdr) { - return is_xlocked() && xlock_by != mdr; - } - MDRequest *get_xlocked_by() { return xlock_by; } - bool is_used() { - return is_xlocked() || is_rdlocked(); - } - - // encode/decode - void _encode(bufferlist& bl) { - ::_encode_simple(state, bl); - ::_encode_simple(gather_set, bl); - } - void _decode(bufferlist::iterator& p) { - ::_decode_simple(state, p); - ::_decode_simple(gather_set, p); - } - - - // simplelock specifics - int get_replica_state() { - switch (state) { - case LOCK_LOCK: - case LOCK_GLOCKR: - return LOCK_LOCK; - case LOCK_SYNC: - return LOCK_SYNC; - default: - assert(0); - } - return 0; - } - void export_twiddle() { - clear_gather(); - state = get_replica_state(); - } - - /** replicate_relax - * called on first replica creation. 
- */ - void replicate_relax() { - assert(parent->is_auth()); - assert(!parent->is_replicated()); - if (state == LOCK_LOCK && !is_used()) - state = LOCK_SYNC; - } - bool remove_replica(int from) { - if (is_gathering(from)) { - remove_gather(from); - if (!is_gathering()) - return true; - } - return false; - } - bool do_import(int from, int to) { - if (!is_stable()) { - remove_gather(from); - remove_gather(to); - if (!is_gathering()) - return true; - } - if (!is_stable() && !is_gathering()) - return true; - return false; - } - - bool can_rdlock(MDRequest *mdr) { - //if (state == LOCK_LOCK && mdr && xlock_by == mdr) return true; // xlocked by me. (actually, is this right?) - //if (state == LOCK_LOCK && !xlock_by && parent->is_auth()) return true; - return (state == LOCK_SYNC); - } - bool can_xlock(MDRequest *mdr) { - if (mdr && xlock_by == mdr) { - assert(state == LOCK_LOCK); - return true; // auth or replica! xlocked by me. - } - if (state == LOCK_LOCK && parent->is_auth() && !xlock_by) return true; - return false; - } - bool can_xlock_soon() { - if (parent->is_auth()) - return (state == LOCK_GLOCKR); - else - return false; - } - - virtual void print(ostream& out) { - out << "("; - out << get_lock_type_name(get_type()) << " "; - out << get_simplelock_state_name(get_state()); - if (!get_gather_set().empty()) out << " g=" << get_gather_set(); - if (is_rdlocked()) - out << " r=" << get_num_rdlocks(); - if (is_xlocked()) - out << " x=" << get_xlocked_by(); - out << ")"; - } -}; - -inline ostream& operator<<(ostream& out, SimpleLock& l) -{ - l.print(out); - return out; -} - - -#endif diff --git a/trunk/ceph/mds/events/EAnchor.h b/trunk/ceph/mds/events/EAnchor.h deleted file mode 100644 index 97a21a36734be..0000000000000 --- a/trunk/ceph/mds/events/EAnchor.h +++ /dev/null @@ -1,80 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage 
Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_EANCHOR_H -#define __MDS_EANCHOR_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../LogEvent.h" -#include "../Anchor.h" - -class EAnchor : public LogEvent { -protected: - int op; - inodeno_t ino; - version_t atid; - vector trace; - version_t version; // anchor table version - int reqmds; - - public: - EAnchor() : LogEvent(EVENT_ANCHOR) { } - EAnchor(int o, inodeno_t i, version_t v, int rm) : - LogEvent(EVENT_ANCHOR), - op(o), ino(i), atid(0), version(v), reqmds(rm) { } - EAnchor(int o, version_t a, version_t v) : - LogEvent(EVENT_ANCHOR), - op(o), atid(a), version(v), reqmds(-1) { } - - void set_trace(vector& t) { trace = t; } - vector& get_trace() { return trace; } - - void encode_payload(bufferlist& bl) { - bl.append((char*)&op, sizeof(op)); - bl.append((char*)&ino, sizeof(ino)); - bl.append((char*)&atid, sizeof(atid)); - ::_encode(trace, bl); - bl.append((char*)&version, sizeof(version)); - bl.append((char*)&reqmds, sizeof(reqmds)); - } - void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(op), (char*)&op); - off += sizeof(op); - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - bl.copy(off, sizeof(atid), (char*)&atid); - off += sizeof(atid); - ::_decode(trace, bl, off); - bl.copy(off, sizeof(version), (char*)&version); - off += sizeof(version); - bl.copy(off, sizeof(reqmds), (char*)&reqmds); - off += sizeof(reqmds); - } - - void print(ostream& out) { - out << "EAnchor " << get_anchor_opname(op); - if (ino) out << " " << ino; - if (atid) out << " atid " << atid; - if (version) out << " v " << version; - if (reqmds >= 0) out << " by mds" << reqmds; - } - - void update_segment(); - void replay(MDS *mds); -}; - -#endif diff --git 
a/trunk/ceph/mds/events/EAnchorClient.h b/trunk/ceph/mds/events/EAnchorClient.h deleted file mode 100644 index 21f78369cae72..0000000000000 --- a/trunk/ceph/mds/events/EAnchorClient.h +++ /dev/null @@ -1,56 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_EANCHORCLIENT_H -#define __MDS_EANCHORCLIENT_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../LogEvent.h" -#include "../Anchor.h" - -class EAnchorClient : public LogEvent { -protected: - int op; - version_t atid; - - public: - EAnchorClient() : LogEvent(EVENT_ANCHORCLIENT) { } - EAnchorClient(int o, version_t at) : - LogEvent(EVENT_ANCHORCLIENT), - op(o), atid(at) { } - - void encode_payload(bufferlist& bl) { - bl.append((char*)&op, sizeof(op)); - bl.append((char*)&atid, sizeof(atid)); - } - void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(op), (char*)&op); - off += sizeof(op); - bl.copy(off, sizeof(atid), (char*)&atid); - off += sizeof(atid); - } - - void print(ostream& out) { - out << "EAnchorClient " << get_anchor_opname(op); - if (atid) out << " atid " << atid; - } - - void replay(MDS *mds); - -}; - -#endif diff --git a/trunk/ceph/mds/events/EExport.h b/trunk/ceph/mds/events/EExport.h deleted file mode 100644 index 89534f12b51bf..0000000000000 --- a/trunk/ceph/mds/events/EExport.h +++ /dev/null @@ -1,63 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it 
under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __EEXPORT_H -#define __EEXPORT_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../MDS.h" - -#include "EMetaBlob.h" - -class EExport : public LogEvent { -public: - EMetaBlob metablob; // exported dir -protected: - dirfrag_t base; - set bounds; - -public: - EExport() : LogEvent(EVENT_EXPORT) { } - EExport(MDLog *mdlog, CDir *dir) : - LogEvent(EVENT_EXPORT), metablob(mdlog), - base(dir->dirfrag()) { } - - set &get_bounds() { return bounds; } - - void print(ostream& out) { - out << "EExport " << base << " " << metablob; - } - - virtual void encode_payload(bufferlist& bl) { - metablob._encode(bl); - bl.append((char*)&base, sizeof(base)); - ::_encode(bounds, bl); - } - void decode_payload(bufferlist& bl, int& off) { - metablob._decode(bl, off); - bl.copy(off, sizeof(base), (char*)&base); - off += sizeof(base); - ::_decode(bounds, bl, off); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/trunk/ceph/mds/events/EFragment.h b/trunk/ceph/mds/events/EFragment.h deleted file mode 100644 index 64969111193c0..0000000000000 --- a/trunk/ceph/mds/events/EFragment.h +++ /dev/null @@ -1,54 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_EFRAGMENT_H -#define __MDS_EFRAGMENT_H - -#include "../LogEvent.h" -#include "EMetaBlob.h" - -class EFragment : public LogEvent { -public: - EMetaBlob metablob; - inodeno_t ino; - frag_t basefrag; - int bits; // positive for split (from basefrag), negative for merge (to basefrag) - - EFragment() : LogEvent(EVENT_FRAGMENT) { } - EFragment(MDLog *mdlog, inodeno_t i, frag_t bf, int b) : - LogEvent(EVENT_FRAGMENT), metablob(mdlog), - ino(i), basefrag(bf), bits(b) { } - void print(ostream& out) { - out << "EFragment " << ino << " " << basefrag << " by " << bits << " " << metablob; - } - - void encode_payload(bufferlist& bl) { - ::_encode(ino, bl); - ::_encode(basefrag, bl); - ::_encode(bits, bl); - metablob._encode(bl); - } - void decode_payload(bufferlist& bl, int& off) { - ::_decode(ino, bl, off); - ::_decode(basefrag, bl, off); - ::_decode(bits, bl, off); - metablob._decode(bl, off); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); -}; - -#endif diff --git a/trunk/ceph/mds/events/EImportFinish.h b/trunk/ceph/mds/events/EImportFinish.h deleted file mode 100644 index 0ee6d71ffdc13..0000000000000 --- a/trunk/ceph/mds/events/EImportFinish.h +++ /dev/null @@ -1,60 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __EIMPORTFINISH_H -#define __EIMPORTFINISH_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../MDS.h" - -class EImportFinish : public LogEvent { - protected: - dirfrag_t base; // imported dir - bool success; - - public: - EImportFinish(CDir *dir, bool s) : LogEvent(EVENT_IMPORTFINISH), - base(dir->dirfrag()), - success(s) { } - EImportFinish() : LogEvent(EVENT_IMPORTFINISH) { } - - void print(ostream& out) { - out << "EImportFinish " << base; - if (success) - out << " success"; - else - out << " failed"; - } - - virtual void encode_payload(bufferlist& bl) { - bl.append((char*)&base, sizeof(base)); - bl.append((char*)&success, sizeof(success)); - } - void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(base), (char*)&base); - off += sizeof(base); - bl.copy(off, sizeof(success), (char*)&success); - off += sizeof(success); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/trunk/ceph/mds/events/EImportStart.h b/trunk/ceph/mds/events/EImportStart.h deleted file mode 100644 index 5671e404298a4..0000000000000 --- a/trunk/ceph/mds/events/EImportStart.h +++ /dev/null @@ -1,67 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __EIMPORTSTART_H -#define __EIMPORTSTART_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../MDS.h" - -#include "EMetaBlob.h" - -class EImportStart : public LogEvent { -protected: - dirfrag_t base; - list bounds; - - public: - EMetaBlob metablob; - bufferlist client_map; // encoded map - version_t cmapv; - - EImportStart(dirfrag_t di, - list& b) : LogEvent(EVENT_IMPORTSTART), - base(di), bounds(b) { } - EImportStart() : LogEvent(EVENT_IMPORTSTART) { } - - void print(ostream& out) { - out << "EImportStart " << base << " " << metablob; - } - - virtual void encode_payload(bufferlist& bl) { - bl.append((char*)&base, sizeof(base)); - metablob._encode(bl); - ::_encode(bounds, bl); - ::_encode(cmapv, bl); - ::_encode(client_map, bl); - } - void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(base), (char*)&base); - off += sizeof(base); - metablob._decode(bl, off); - ::_decode(bounds, bl, off); - ::_decode(cmapv, bl, off); - ::_decode(client_map, bl, off); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/trunk/ceph/mds/events/EMetaBlob.h b/trunk/ceph/mds/events/EMetaBlob.h deleted file mode 100644 index 767521523f9fe..0000000000000 --- a/trunk/ceph/mds/events/EMetaBlob.h +++ /dev/null @@ -1,501 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_EMETABLOB_H -#define __MDS_EMETABLOB_H - -#include -#include -using std::string; - -#include "../CInode.h" -#include "../CDir.h" -#include "../CDentry.h" - -#include "include/triple.h" - -class MDS; -class MDLog; -class LogSegment; - -/* - * a bunch of metadata in the journal - */ - -/* notes: - * - * - make sure you adjust the inode.version for any modified inode you - * journal. CDir and CDentry maintain a projected_version, but CInode - * doesn't, since the journaled inode usually has to be modifed - * manually anyway (to delay the change in the MDS's cache until after - * it is journaled). - * - */ - - -class EMetaBlob { - - /* fullbit - a regular dentry + inode - */ - struct fullbit { - string dn; // dentry - version_t dnv; - inode_t inode; // if it's not - fragtree_t dirfragtree; - string symlink; - bool dirty; - - fullbit(const string& d, version_t v, inode_t& i, fragtree_t dft, bool dr) : - dn(d), dnv(v), inode(i), dirfragtree(dft), dirty(dr) { } - fullbit(const string& d, version_t v, inode_t& i, fragtree_t dft, string& sym, bool dr) : - dn(d), dnv(v), inode(i), dirfragtree(dft), symlink(sym), dirty(dr) { } - fullbit(bufferlist& bl, int& off) { _decode(bl, off); } - void _encode(bufferlist& bl) { - ::_encode(dn, bl); - ::_encode(dnv, bl); - ::_encode(inode, bl); - dirfragtree._encode(bl); - if (inode.is_symlink()) - ::_encode(symlink, bl); - ::_encode(dirty, bl); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(dn, bl, off); - ::_decode(dnv, bl, off); - ::_decode(inode, bl, off); - dirfragtree._decode(bl, off); - if (inode.is_symlink()) - ::_decode(symlink, bl, off); - ::_decode(dirty, bl, off); - } - void print(ostream& out) { - out << " fullbit dn " << dn << " dnv " << dnv - << " inode " << inode.ino - << " dirty=" << dirty << std::endl; - } - }; - - /* remotebit - a dentry + remote inode link (i.e. 
just an ino) - */ - struct remotebit { - string dn; - version_t dnv; - inodeno_t ino; - unsigned char d_type; - bool dirty; - - remotebit(const string& d, version_t v, inodeno_t i, unsigned char dt, bool dr) : - dn(d), dnv(v), ino(i), d_type(dt), dirty(dr) { } - remotebit(bufferlist& bl, int& off) { _decode(bl, off); } - void _encode(bufferlist& bl) { - ::_encode(dn, bl); - ::_encode(dnv, bl); - ::_encode(ino, bl); - ::_encode(d_type, bl); - ::_encode(dirty, bl); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(dn, bl, off); - ::_decode(dnv, bl, off); - ::_decode(ino, bl, off); - ::_decode(d_type, bl, off); - ::_decode(dirty, bl, off); - } - void print(ostream& out) { - out << " remotebit dn " << dn << " dnv " << dnv - << " ino " << ino - << " dirty=" << dirty << std::endl; - } - }; - - /* - * nullbit - a null dentry - */ - struct nullbit { - string dn; - version_t dnv; - bool dirty; - nullbit(const string& d, version_t v, bool dr) : dn(d), dnv(v), dirty(dr) { } - nullbit(bufferlist& bl, int& off) { _decode(bl, off); } - void _encode(bufferlist& bl) { - ::_encode(dn, bl); - ::_encode(dnv, bl); - ::_encode(dirty, bl); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(dn, bl, off); - ::_decode(dnv, bl, off); - ::_decode(dirty, bl, off); - } - void print(ostream& out) { - out << " nullbit dn " << dn << " dnv " << dnv - << " dirty=" << dirty << std::endl; - } - }; - - - /* dirlump - contains metadata for any dir we have contents for. - */ -public: - struct dirlump { - static const int STATE_COMPLETE = (1<<1); - static const int STATE_DIRTY = (1<<2); // dirty due to THIS journal item, that is! 
- - version_t dirv; - int state; - int nfull, nremote, nnull; - - private: - bufferlist dnbl; - bool dn_decoded; - list dfull; - list dremote; - list dnull; - - public: - dirlump() : dirv(0), state(0), nfull(0), nremote(0), nnull(0), dn_decoded(true) { } - - bool is_complete() { return state & STATE_COMPLETE; } - void mark_complete() { state |= STATE_COMPLETE; } - bool is_dirty() { return state & STATE_DIRTY; } - void mark_dirty() { state |= STATE_DIRTY; } - - list &get_dfull() { return dfull; } - list &get_dremote() { return dremote; } - list &get_dnull() { return dnull; } - - void print(dirfrag_t dirfrag, ostream& out) { - out << "dirlump " << dirfrag << " dirv " << dirv - << " state " << state - << " num " << nfull << "/" << nremote << "/" << nnull - << std::endl; - _decode_bits(); - for (list::iterator p = dfull.begin(); p != dfull.end(); ++p) - p->print(out); - for (list::iterator p = dremote.begin(); p != dremote.end(); ++p) - p->print(out); - for (list::iterator p = dnull.begin(); p != dnull.end(); ++p) - p->print(out); - } - - void _encode_bits() { - for (list::iterator p = dfull.begin(); p != dfull.end(); ++p) - p->_encode(dnbl); - for (list::iterator p = dremote.begin(); p != dremote.end(); ++p) - p->_encode(dnbl); - for (list::iterator p = dnull.begin(); p != dnull.end(); ++p) - p->_encode(dnbl); - } - void _decode_bits() { - if (dn_decoded) return; - int off = 0; - for (int i=0; i lump_order; - map lump_map; - - // anchor transactions included in this update. - list atids; - - // inode dirlocks (scatterlocks) i've touched. - map dirty_inode_mtimes; - - // ino's i've allocated - list allocated_inos; - version_t alloc_tablev; - - // inodes i've destroyed. 
- list< triple > truncated_inodes; - - // idempotent op(s) - list client_reqs; - - public: - // soft state - off_t last_subtree_map; - off_t my_offset; - - // for replay, in certain cases - LogSegment *_segment; - - EMetaBlob() : last_subtree_map(0), my_offset(0), _segment(0) { } - EMetaBlob(MDLog *mdl); // defined in journal.cc - - void print(ostream& out) { - for (list::iterator p = lump_order.begin(); - p != lump_order.end(); - ++p) { - lump_map[*p].print(*p, out); - } - } - - void add_client_req(metareqid_t r) { - client_reqs.push_back(r); - } - - void add_anchor_transaction(version_t atid) { - atids.push_back(atid); - } - - void add_dirtied_inode_mtime(inodeno_t ino, utime_t ctime) { - dirty_inode_mtimes[ino] = ctime; - } - - void add_allocated_ino(inodeno_t ino, version_t tablev) { - allocated_inos.push_back(ino); - alloc_tablev = tablev; - } - - void add_inode_truncate(inodeno_t ino, off_t newsize, off_t oldsize) { - truncated_inodes.push_back(triple(ino, newsize, oldsize)); - } - - void add_null_dentry(CDentry *dn, bool dirty) { - add_null_dentry(add_dir(dn->get_dir(), false), dn, dirty); - } - void add_null_dentry(dirlump& lump, CDentry *dn, bool dirty) { - // add the dir - lump.nnull++; - if (dirty) - lump.get_dnull().push_front(nullbit(dn->get_name(), - dn->get_projected_version(), - dirty)); - else - lump.get_dnull().push_back(nullbit(dn->get_name(), - dn->get_projected_version(), - dirty)); - } - - void add_remote_dentry(CDentry *dn, bool dirty, inodeno_t rino=0) { - add_remote_dentry(add_dir(dn->get_dir(), false), - dn, dirty, rino); - } - void add_remote_dentry(dirlump& lump, CDentry *dn, bool dirty, - inodeno_t rino=0, unsigned char rdt=0) { - if (!rino) { - rino = dn->get_remote_ino(); - rdt = dn->get_remote_d_type(); - } - lump.nremote++; - if (dirty) - lump.get_dremote().push_front(remotebit(dn->get_name(), - dn->get_projected_version(), - rino, rdt, - dirty)); - else - lump.get_dremote().push_back(remotebit(dn->get_name(), - 
dn->get_projected_version(), - rino, rdt, - dirty)); - } - - // return remote pointer to to-be-journaled inode - inode_t *add_primary_dentry(CDentry *dn, bool dirty, - CInode *in=0, inode_t *pi=0, fragtree_t *pdft=0) { - return add_primary_dentry(add_dir(dn->get_dir(), false), - dn, dirty, in, pi, pdft); - } - inode_t *add_primary_dentry(dirlump& lump, CDentry *dn, bool dirty, - CInode *in=0, inode_t *pi=0, fragtree_t *pdft=0) { - if (!in) - in = dn->get_inode(); - - // make note of where this inode was last journaled - in->last_journaled = my_offset; - //cout << "journaling " << in->inode.ino << " at " << my_offset << std::endl; - - lump.nfull++; - if (dirty) { - lump.get_dfull().push_front(fullbit(dn->get_name(), - dn->get_projected_version(), - in->inode, in->dirfragtree, in->symlink, - dirty)); - if (pi) lump.get_dfull().front().inode = *pi; - return &lump.get_dfull().front().inode; - } else { - lump.get_dfull().push_back(fullbit(dn->get_name(), - dn->get_projected_version(), - in->inode, in->dirfragtree, in->symlink, - dirty)); - if (pi) lump.get_dfull().back().inode = *pi; - return &lump.get_dfull().back().inode; - } - } - - // convenience: primary or remote? figure it out. 
- inode_t *add_dentry(CDentry *dn, bool dirty) { - dirlump& lump = add_dir(dn->get_dir(), false); - return add_dentry(lump, dn, dirty); - } - inode_t *add_dentry(dirlump& lump, CDentry *dn, bool dirty) { - // primary or remote - if (dn->is_remote()) { - add_remote_dentry(dn, dirty); - return 0; - } else if (dn->is_null()) { - add_null_dentry(dn, dirty); - return 0; - } - assert(dn->is_primary()); - return add_primary_dentry(dn, dirty); - } - - - dirlump& add_dir(CDir *dir, bool dirty, bool complete=false) { - return add_dir(dir->dirfrag(), dir->get_projected_version(), dirty, complete); - } - dirlump& add_dir(dirfrag_t df, version_t pv, bool dirty, bool complete=false) { - if (lump_map.count(df) == 0) { - lump_order.push_back(df); - lump_map[df].dirv = pv; - } - dirlump& l = lump_map[df]; - if (complete) l.mark_complete(); - if (dirty) l.mark_dirty(); - return l; - } - - static const int TO_AUTH_SUBTREE_ROOT = 0; // default. - static const int TO_ROOT = 1; - - void add_dir_context(CDir *dir, int mode = TO_AUTH_SUBTREE_ROOT) { - // already have this dir? (we must always add in order) - if (lump_map.count(dir->dirfrag())) - return; - - if (mode == TO_AUTH_SUBTREE_ROOT) { - //return; // hack: for comparison purposes.. what if NO context? - - // subtree root? - if (dir->is_subtree_root() && dir->is_auth()) - return; - - // was the inode journaled since the last subtree_map? - if (//false && // for benchmarking - last_subtree_map && - dir->inode->last_journaled >= last_subtree_map) { - /* - cout << " inode " << dir->inode->inode.ino - << " last journaled at " << dir->inode->last_journaled - << " and last_subtree_map is " << last_subtree_map - << std::endl; - */ - return; - } - } - - // stop at root/stray - CInode *diri = dir->get_inode(); - if (!diri->get_parent_dn()) - return; - - // journaled? 
- - // add parent dn - CDentry *parent = diri->get_parent_dn(); - add_dir_context(parent->get_dir(), mode); - add_dentry(parent, false); - } - - - // encoding - - void _encode(bufferlist& bl) { - int32_t n = lump_map.size(); - ::_encode(n, bl); - for (list::iterator i = lump_order.begin(); - i != lump_order.end(); - ++i) { - dirfrag_t dirfrag = *i; - ::_encode(dirfrag, bl); - lump_map[*i]._encode(bl); - } - ::_encode(atids, bl); - ::_encode(dirty_inode_mtimes, bl); - ::_encode(allocated_inos, bl); - if (!allocated_inos.empty()) - ::_encode(alloc_tablev, bl); - ::_encode(truncated_inodes, bl); - ::_encode(client_reqs, bl); - } - void _decode(bufferlist& bl, int& off) { - int32_t n; - ::_decode(n, bl, off); - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_EOPEN_H -#define __MDS_EOPEN_H - -#include "../LogEvent.h" -#include "EMetaBlob.h" - -class EOpen : public LogEvent { -public: - EMetaBlob metablob; - list inos; - - EOpen() : LogEvent(EVENT_OPEN) { } - EOpen(MDLog *mdlog) : - LogEvent(EVENT_OPEN), metablob(mdlog) { } - - void print(ostream& out) { - out << "EOpen " << metablob; - } - - void add_inode(CInode *in) { - inos.push_back(in->ino()); - metablob.add_dir_context(in->get_parent_dn()->get_dir()); - metablob.add_primary_dentry(in->get_parent_dn(), false); - } - - void encode_payload(bufferlist& bl) { - ::_encode(inos, bl); - metablob._encode(bl); - } - void decode_payload(bufferlist& bl, int& off) { - ::_decode(inos, bl, off); - metablob._decode(bl, off); - } - - void update_segment(); - void replay(MDS *mds); -}; - -#endif diff --git a/trunk/ceph/mds/events/EPurgeFinish.h b/trunk/ceph/mds/events/EPurgeFinish.h deleted file mode 100644 index dff0101b7699a..0000000000000 --- a/trunk/ceph/mds/events/EPurgeFinish.h +++ /dev/null @@ -1,54 +0,0 @@ -// -*- 
mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __EPURGE_H -#define __EPURGE_H - -#include -#include "config.h" -#include "include/types.h" - -class EPurgeFinish : public LogEvent { - protected: - inodeno_t ino; - off_t newsize, oldsize; - - public: - EPurgeFinish(inodeno_t i, off_t ns, off_t os) : - LogEvent(EVENT_PURGEFINISH), - ino(i), newsize(ns), oldsize(os) { } - EPurgeFinish() : LogEvent(EVENT_PURGEFINISH) { } - - void print(ostream& out) { - out << "purgefinish " << ino << " " << oldsize << " ->" << newsize; - } - - virtual void encode_payload(bufferlist& bl) { - bl.append((char*)&ino, sizeof(ino)); - bl.append((char*)&newsize, sizeof(newsize)); - bl.append((char*)&oldsize, sizeof(oldsize)); - } - void decode_payload(bufferlist& bl, int& off) { - ::_decode(ino, bl, off); - ::_decode(newsize, bl, off); - ::_decode(oldsize, bl, off); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void update_segment(); - void replay(MDS *mds); -}; - -#endif diff --git a/trunk/ceph/mds/events/ESession.h b/trunk/ceph/mds/events/ESession.h deleted file mode 100644 index 3aba5559aac1c..0000000000000 --- a/trunk/ceph/mds/events/ESession.h +++ /dev/null @@ -1,64 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_ESESSION_H -#define __MDS_ESESSION_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../LogEvent.h" - -class ESession : public LogEvent { - protected: - entity_inst_t client_inst; - bool open; // open or close - version_t cmapv; // client map version - - public: - ESession() : LogEvent(EVENT_SESSION) { } - ESession(entity_inst_t inst, bool o, version_t v) : - LogEvent(EVENT_SESSION), - client_inst(inst), - open(o), - cmapv(v) { - } - - void encode_payload(bufferlist& bl) { - ::_encode(client_inst, bl); - ::_encode(open, bl); - ::_encode(cmapv, bl); - } - void decode_payload(bufferlist& bl, int& off) { - ::_decode(client_inst, bl, off); - ::_decode(open, bl, off); - ::_decode(cmapv, bl, off); - } - - - void print(ostream& out) { - if (open) - out << "ESession " << client_inst << " open cmapv " << cmapv; - else - out << "ESession " << client_inst << " close cmapv " << cmapv; - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void update_segment(); - void replay(MDS *mds); -}; - -#endif diff --git a/trunk/ceph/mds/events/ESessions.h b/trunk/ceph/mds/events/ESessions.h deleted file mode 100644 index 0db175c948877..0000000000000 --- a/trunk/ceph/mds/events/ESessions.h +++ /dev/null @@ -1,55 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_ESESSIONS_H -#define __MDS_ESESSIONS_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../LogEvent.h" - -class ESessions : public LogEvent { -protected: - version_t cmapv; // client map version - -public: - map client_map; - - ESessions() : LogEvent(EVENT_SESSION) { } - ESessions(version_t v) : - LogEvent(EVENT_SESSION), - cmapv(v) { - } - - void encode_payload(bufferlist& bl) { - ::_encode(client_map, bl); - ::_encode(cmapv, bl); - } - void decode_payload(bufferlist& bl, int& off) { - ::_decode(client_map, bl, off); - ::_decode(cmapv, bl, off); - } - - - void print(ostream& out) { - out << "ESessions " << client_map.size() << " opens cmapv " << cmapv; - } - - void update_segment(); - void replay(MDS *mds); -}; - -#endif diff --git a/trunk/ceph/mds/events/ESlaveUpdate.h b/trunk/ceph/mds/events/ESlaveUpdate.h deleted file mode 100644 index 54eaef9c6a296..0000000000000 --- a/trunk/ceph/mds/events/ESlaveUpdate.h +++ /dev/null @@ -1,79 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_ESLAVEUPDATE_H -#define __MDS_ESLAVEUPDATE_H - -#include "../LogEvent.h" -#include "EMetaBlob.h" - -class ESlaveUpdate : public LogEvent { -public: - const static int OP_PREPARE = 1; - const static int OP_COMMIT = 2; - const static int OP_ROLLBACK = 3; - - /* - * we journal a rollback metablob that contains the unmodified metadata - * too, because we may be updating previously dirty metadata, which - * will allow old log segments to be trimmed. if we end of rolling back, - * those updates could be lost.. 
so we re-journal the unmodified metadata, - * and replay will apply _either_ commit or rollback. - */ - EMetaBlob commit, rollback; - string type; - metareqid_t reqid; - int master; - int op; // prepare, commit, abort - - ESlaveUpdate() : LogEvent(EVENT_SLAVEUPDATE) { } - ESlaveUpdate(MDLog *mdlog, const char *s, metareqid_t ri, int mastermds, int o) : - LogEvent(EVENT_SLAVEUPDATE), commit(mdlog), rollback(mdlog), - type(s), - reqid(ri), - master(mastermds), - op(o) { } - - void print(ostream& out) { - if (type.length()) - out << type << " "; - out << " " << op; - out << " " << reqid; - out << " for mds" << master; - out << commit << " " << rollback; - } - - void encode_payload(bufferlist& bl) { - ::_encode(type, bl); - ::_encode(reqid, bl); - ::_encode(master, bl); - ::_encode(op, bl); - commit._encode(bl); - rollback._encode(bl); - } - void decode_payload(bufferlist& bl, int& off) { - ::_decode(type, bl, off); - ::_decode(reqid, bl, off); - ::_decode(master, bl, off); - ::_decode(op, bl, off); - commit._decode(bl, off); - rollback._decode(bl, off); - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); -}; - -#endif diff --git a/trunk/ceph/mds/events/EString.h b/trunk/ceph/mds/events/EString.h deleted file mode 100644 index b292f9927d76f..0000000000000 --- a/trunk/ceph/mds/events/EString.h +++ /dev/null @@ -1,56 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __ESTRING_H -#define __ESTRING_H - -#include -#include -using namespace std; - -#include "../LogEvent.h" - -// generic log event -class EString : public LogEvent { - protected: - string event; - - public: - EString(string e) : - LogEvent(EVENT_STRING) { - event = e; - } - EString() : - LogEvent(EVENT_STRING) { - } - - void decode_payload(bufferlist& bl, int& off) { - ::_decode(event, bl, off); - } - void encode_payload(bufferlist& bl) { - ::_encode(event, bl); - } - - void print(ostream& out) { - out << '"' << event << '"'; - } - - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - -}; - -#endif diff --git a/trunk/ceph/mds/events/ESubtreeMap.h b/trunk/ceph/mds/events/ESubtreeMap.h deleted file mode 100644 index cb6feb1d92ec6..0000000000000 --- a/trunk/ceph/mds/events/ESubtreeMap.h +++ /dev/null @@ -1,47 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_ESUBTREEMAP_H -#define __MDS_ESUBTREEMAP_H - -#include "../LogEvent.h" -#include "EMetaBlob.h" - -class ESubtreeMap : public LogEvent { -public: - EMetaBlob metablob; - map > subtrees; - - ESubtreeMap() : LogEvent(EVENT_SUBTREEMAP) { } - - void print(ostream& out) { - out << "subtree_map " << subtrees.size() << " subtrees " - << metablob; - } - - void encode_payload(bufferlist& bl) { - metablob._encode(bl); - ::_encode(subtrees, bl); - } - void decode_payload(bufferlist& bl, int& off) { - metablob._decode(bl, off); - ::_decode(subtrees, bl, off); - } - - //bool has_expired(MDS *mds); - //void expire(MDS *mds, Context *c); - void replay(MDS *mds); -}; - -#endif diff --git a/trunk/ceph/mds/events/EUpdate.h b/trunk/ceph/mds/events/EUpdate.h deleted file mode 100644 index 3939527cef41c..0000000000000 --- a/trunk/ceph/mds/events/EUpdate.h +++ /dev/null @@ -1,53 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MDS_EUPDATE_H -#define __MDS_EUPDATE_H - -#include "../LogEvent.h" -#include "EMetaBlob.h" - -class EUpdate : public LogEvent { -public: - EMetaBlob metablob; - string type; - bufferlist client_map; - - EUpdate() : LogEvent(EVENT_UPDATE) { } - EUpdate(MDLog *mdlog, const char *s) : - LogEvent(EVENT_UPDATE), metablob(mdlog), - type(s) { } - - void print(ostream& out) { - if (type.length()) - out << type << " "; - out << metablob; - } - - void encode_payload(bufferlist& bl) { - ::_encode(type, bl); - metablob._encode(bl); - ::_encode(client_map, bl); - } - void decode_payload(bufferlist& bl, int& off) { - ::_decode(type, bl, off); - metablob._decode(bl, off); - ::_decode(client_map, bl, off); - } - - void update_segment(); - void replay(MDS *mds); -}; - -#endif diff --git a/trunk/ceph/mds/journal.cc b/trunk/ceph/mds/journal.cc deleted file mode 100644 index 3b39679dcd61f..0000000000000 --- a/trunk/ceph/mds/journal.cc +++ /dev/null @@ -1,1126 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "events/EString.h" -#include "events/ESubtreeMap.h" -#include "events/ESession.h" -#include "events/ESessions.h" - -#include "events/EMetaBlob.h" - -#include "events/EUpdate.h" -#include "events/ESlaveUpdate.h" -#include "events/EOpen.h" - -#include "events/EPurgeFinish.h" - -#include "events/EExport.h" -#include "events/EImportStart.h" -#include "events/EImportFinish.h" -#include "events/EFragment.h" - -#include "events/EAnchor.h" -#include "events/EAnchorClient.h" - -#include "LogSegment.h" - -#include "MDS.h" -#include "MDLog.h" -#include "MDCache.h" -#include "Server.h" -#include "Migrator.h" -#include "AnchorTable.h" -#include "AnchorClient.h" -#include "IdAllocator.h" -#include "Locker.h" - - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log || l <= g_conf.debug_mds_log_expire) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " -#define derr(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log || l <= g_conf.debug_mds_log_expire) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " - - -// ----------------------- -// LogSegment - -class C_MDL_RetryExpireSegment : public Context { -public: - MDS *mds; - LogSegment *ls; - C_MDL_RetryExpireSegment(MDS *m, LogSegment *l) : mds(m), ls(l) {} - void finish(int r) { - ls->try_to_expire(mds); - } -}; - -C_Gather *LogSegment::try_to_expire(MDS *mds) -{ - C_Gather *gather = 0; - - set commit; - - dout(6) << "LogSegment(" << offset << ").try_to_expire" << dendl; - - // commit dirs - for (xlist::iterator p = dirty_dirfrags.begin(); !p.end(); ++p) { - assert((*p)->is_auth()); - commit.insert(*p); - } - for (xlist::iterator p = dirty_dentries.begin(); !p.end(); ++p) { - assert((*p)->is_auth()); - commit.insert((*p)->get_dir()); - } - for (xlist::iterator p = dirty_inodes.begin(); !p.end(); ++p) { - assert((*p)->is_auth()); - commit.insert((*p)->get_parent_dn()->get_dir()); - } - - if (!commit.empty()) 
{ - if (!gather) gather = new C_Gather; - - for (set::iterator p = commit.begin(); - p != commit.end(); - ++p) { - CDir *dir = *p; - assert(dir->is_auth()); - if (dir->can_auth_pin()) { - dout(15) << "try_to_expire committing " << *dir << dendl; - dir->commit(0, gather->new_sub()); - } else { - dout(15) << "try_to_expire waiting for unfreeze on " << *dir << dendl; - dir->add_waiter(CDir::WAIT_UNFREEZE, gather->new_sub()); - } - } - } - - // dirty non-auth mtimes - for (xlist::iterator p = dirty_inode_mtimes.begin(); !p.end(); ++p) { - CInode *in = *p; - dout(10) << "try_to_expire waiting for dirlock mtime flush on " << *in << dendl; - if (!gather) gather = new C_Gather; - - if (in->is_ambiguous_auth()) { - dout(10) << " waiting for single auth on " << *in << dendl; - in->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, gather->new_sub()); - } else if (in->is_auth()) { - dout(10) << " i'm auth, unscattering dirlock on " << *in << dendl; - assert(in->is_replicated()); // hrm! - mds->locker->scatter_lock(&in->dirlock); - in->dirlock.add_waiter(SimpleLock::WAIT_STABLE, gather->new_sub()); - } else { - dout(10) << " i'm a replica, requesting dirlock unscatter of " << *in << dendl; - mds->locker->scatter_try_unscatter(&in->dirlock, gather->new_sub()); - } - //(*p)->dirlock.add_waiter(SimpleLock::WAIT_STABLE, gather->new_sub()); - } - - // open files - if (!open_files.empty()) { - assert(!mds->mdlog->is_capped()); // hmm FIXME - for (xlist::iterator p = open_files.begin(); !p.end(); ++p) { - dout(20) << "try_to_expire requeueing open file " << **p << dendl; - mds->server->queue_journal_open(*p); - } - if (!gather) gather = new C_Gather; - mds->server->add_journal_open_waiter(gather->new_sub()); - mds->server->maybe_journal_opens(); - dout(10) << "try_to_expire waiting for open files to rejournal" << dendl; - } - - // slave updates - for (xlist::iterator p = slave_updates.begin(); !p.end(); ++p) { - MDSlaveUpdate *su = *p; - dout(10) << "try_to_expire waiting on slave update " 
<< su << dendl; - assert(su->waiter == 0); - if (!gather) gather = new C_Gather; - su->waiter = gather->new_sub(); - } - - // idalloc - if (allocv > mds->idalloc->get_committed_version()) { - dout(10) << "try_to_expire saving idalloc table, need " << allocv - << ", committed is " << mds->idalloc->get_committed_version() - << " (" << mds->idalloc->get_committing_version() << ")" - << dendl; - if (!gather) gather = new C_Gather; - mds->idalloc->save(gather->new_sub(), allocv); - } - - // clientmap - if (clientmapv > mds->clientmap.get_committed()) { - dout(10) << "try_to_expire saving clientmap, need " << clientmapv - << ", committed is " << mds->clientmap.get_committed() - << " (" << mds->clientmap.get_committing() << ")" - << dendl; - if (!gather) gather = new C_Gather; - mds->clientmap.save(gather->new_sub(), clientmapv); - } - - // pending commit atids - for (hash_set::iterator p = pending_commit_atids.begin(); - p != pending_commit_atids.end(); - ++p) { - if (!gather) gather = new C_Gather; - assert(!mds->anchorclient->has_committed(*p)); - dout(10) << "try_to_expire anchor transaction " << *p - << " pending commit (not yet acked), waiting" << dendl; - mds->anchorclient->wait_for_ack(*p, gather->new_sub()); - } - - // anchortable - if (anchortablev > mds->anchortable->get_committed_version()) { - dout(10) << "try_to_expire waiting for anchor table to save, need " << anchortablev << dendl; - if (!gather) gather = new C_Gather; - mds->anchortable->save(gather->new_sub()); - } - - // FIXME client requests...? - // audit handling of anchor transactions? 
- - if (gather) { - dout(6) << "LogSegment(" << offset << ").try_to_expire waiting" << dendl; - } else { - dout(6) << "LogSegment(" << offset << ").try_to_expire success" << dendl; - } - return gather; -} - - - -#undef dout -#undef derr -#define dout(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " -#define derr(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " - - -// ----------------------- -// EString - -bool EString::has_expired(MDS *mds) { - dout(10) << "EString.has_expired " << event << dendl; - return true; -} -void EString::expire(MDS *mds, Context *c) -{ - dout(10) << "EString.expire " << event << dendl; -} -void EString::replay(MDS *mds) -{ - dout(10) << "EString.replay " << event << dendl; -} - - - -// ----------------------- -// EMetaBlob - -EMetaBlob::EMetaBlob(MDLog *mdlog) : - last_subtree_map(mdlog->get_last_segment_offset()), - my_offset(mdlog->get_write_pos()) -{ -} - - -/* - * we need to ensure that a journaled item has either - * - * - been safely committed to its dirslice. - * - * - has been safely exported. i.e., authority().first != us. - * in particular, auth of is not enough, we need to - * wait for . - * - * note that this check is overly conservative, in that we'll - * try to flush the dir again if we reimport the subtree, even though - * later journal entries contain the same dirty data (from the import). 
- * - */ -bool EMetaBlob::has_expired(MDS *mds) -{ -/* - // examine dirv's for my lumps - for (map::iterator lp = lump_map.begin(); - lp != lump_map.end(); - ++lp) { - CDir *dir = mds->mdcache->get_dirfrag(lp->first); - if (!dir) - continue; // we expired it - - // FIXME: check the slice only - - if (dir->authority().first != mds->get_nodeid()) { - dout(10) << "EMetaBlob.has_expired not auth, needed dirv " << lp->second.dirv - << " for " << *dir << dendl; - continue; // not our problem - } - - if (g_conf.mds_hack_log_expire_for_better_stats) { - // FIXME HACK: this makes logger stats more accurage, for journal stats, - // but is not perfectly safe. for benchmarking ONLY! - if (dir->get_committing_version() >= lp->second.dirv || // committING, not committED. - dir->get_committed_version_equivalent() >= lp->second.dirv) { - dout(10) << "EMetaBlob.has_expired have|committING (unsafe hack!) dirv " << lp->second.dirv - << " for " << *dir << dendl; - continue; // yay - } - } else { - // this is the proper (safe) way - if (dir->get_committed_version() >= lp->second.dirv || - dir->get_committed_version_equivalent() >= lp->second.dirv) { - dout(10) << "EMetaBlob.has_expired have dirv " << lp->second.dirv - << " for " << *dir << dendl; - continue; // yay - } - } - - if (dir->is_ambiguous_dir_auth()) { - CDir *ex = mds->mdcache->get_subtree_root(dir); - if (ex->is_exporting()) { - // wait until export is acked (logged on remote) and committed (logged locally) - dout(10) << "EMetaBlob.has_expired ambiguous auth for " << *dir - << ", exporting on " << *ex << dendl; - return false; - } else { - dout(10) << "EMetaBlob.has_expired ambiguous auth for " << *dir - << ", importing on " << *ex << dendl; - return false; - } - } - - if (dir->get_committed_version() < lp->second.dirv) { - dout(10) << "EMetaBlob.has_expired need dirv " << lp->second.dirv - << " for " << *dir << dendl; - return false; // not committed. 
- } - - assert(0); // i goofed the logic - } - - // have my anchortable ops committed? - for (list::iterator p = atids.begin(); - p != atids.end(); - ++p) { - if (!mds->anchorclient->has_committed(*p)) { - dout(10) << "EMetaBlob.has_expired anchor transaction " << *p - << " not yet acked" << dendl; - return false; - } - } - - if (!dirty_inode_mtimes.empty()) - for (map::iterator p = dirty_inode_mtimes.begin(); - p != dirty_inode_mtimes.end(); - ++p) { - CInode *in = mds->mdcache->get_inode(p->first); - if (in) { - if (in->inode.ctime == p->second && - in->dirlock.is_updated()) { - dout(10) << "EMetaBlob.has_expired dirty mtime dirlock hasn't flushed on " << *in << dendl; - return false; - } - } - } - - // allocated_ios - if (!allocated_inos.empty()) { - version_t cv = mds->idalloc->get_committed_version(); - if (cv < alloc_tablev) { - dout(10) << "EMetaBlob.has_expired idalloc tablev " << alloc_tablev << " > " << cv - << ", still dirty" << dendl; - return false; // still dirty - } else { - dout(10) << "EMetaBlob.has_expired idalloc tablev " << alloc_tablev << " <= " << cv - << ", already flushed" << dendl; - } - } - - - // truncated inodes - for (list< pair >::iterator p = truncated_inodes.begin(); - p != truncated_inodes.end(); - ++p) { - if (mds->mdcache->is_purging(p->first.ino, p->second)) { - dout(10) << "EMetaBlob.has_expired still purging inode " << p->first.ino - << " to " << p->second << dendl; - return false; - } - } - - // client requests - for (list::iterator p = client_reqs.begin(); - p != client_reqs.end(); - ++p) { - if (mds->clientmap.have_completed_request(*p)) { - dout(10) << "EMetaBlob.has_expired still have completed request " << *p - << dendl; - return false; - } - } - - - */ - return true; // all dirlumps expired, etc. 
-} - - -void EMetaBlob::expire(MDS *mds, Context *c) -{ -/* - map commit; // dir -> version needed - list waitfor_export; - list waitfor_import; - int ncommit = 0; - - // examine dirv's for my lumps - // make list of dir slices i need to commit - for (map::iterator lp = lump_map.begin(); - lp != lump_map.end(); - ++lp) { - CDir *dir = mds->mdcache->get_dirfrag(lp->first); - if (!dir) - continue; // we expired it - - // FIXME: check the slice only - - if (dir->authority().first != mds->get_nodeid()) { - dout(10) << "EMetaBlob.expire not auth, needed dirv " << lp->second.dirv - << " for " << *dir << dendl; - continue; // not our problem - } - if (dir->get_committed_version() >= lp->second.dirv || - dir->get_committed_version_equivalent() >= lp->second.dirv) { - dout(10) << "EMetaBlob.expire have dirv " << lp->second.dirv - << " on " << *dir << dendl; - continue; // yay - } - - if (dir->is_ambiguous_dir_auth()) { - CDir *ex = mds->mdcache->get_subtree_root(dir); - if (ex->is_exporting()) { - // wait until export is acked (logged on remote) and committed (logged locally) - dout(10) << "EMetaBlob.expire ambiguous auth for " << *dir - << ", waiting for export finish on " << *ex << dendl; - waitfor_export.push_back(ex); - continue; - } else { - dout(10) << "EMetaBlob.expire ambiguous auth for " << *dir - << ", waiting for import finish on " << *ex << dendl; - waitfor_import.push_back(ex); - continue; - } - } - - assert(dir->get_committed_version() < lp->second.dirv); - dout(10) << "EMetaBlob.expire need dirv " << lp->second.dirv - << ", committing " << *dir << dendl; - commit[dir] = MAX(commit[dir], lp->second.dirv); - ncommit++; - } - - // set up gather context - C_Gather *gather = new C_Gather(c); - - // do or wait for exports and commits - for (map::iterator p = commit.begin(); - p != commit.end(); - ++p) { - if (p->first->can_auth_pin()) - p->first->commit(p->second, gather->new_sub()); - else - // pbly about to export|split|merge. 
- // just wait for it to unfreeze, then retry - p->first->add_waiter(CDir::WAIT_UNFREEZE, gather->new_sub()); - } - for (list::iterator p = waitfor_export.begin(); - p != waitfor_export.end(); - ++p) - mds->mdcache->migrator->add_export_finish_waiter(*p, gather->new_sub()); - for (list::iterator p = waitfor_import.begin(); - p != waitfor_import.end(); - ++p) - (*p)->add_waiter(CDir::WAIT_UNFREEZE, gather->new_sub()); - - - // have my anchortable ops committed? - for (list::iterator p = atids.begin(); - p != atids.end(); - ++p) { - if (!mds->anchorclient->has_committed(*p)) { - dout(10) << "EMetaBlob.expire anchor transaction " << *p - << " not yet acked, waiting" << dendl; - mds->anchorclient->wait_for_ack(*p, gather->new_sub()); - } - } - - // dirtied inode mtimes - if (!dirty_inode_mtimes.empty()) - for (map::iterator p = dirty_inode_mtimes.begin(); - p != dirty_inode_mtimes.end(); - ++p) { - CInode *in = mds->mdcache->get_inode(p->first); - if (in) { - if (in->inode.ctime == p->second && - in->dirlock.is_updated()) { - dout(10) << "EMetaBlob.expire dirty mtime dirlock hasn't flushed, waiting on " - << *in << dendl; - in->dirlock.add_waiter(SimpleLock::WAIT_STABLE, gather->new_sub()); - } - } - } - - // allocated_inos - if (!allocated_inos.empty()) { - version_t cv = mds->idalloc->get_committed_version(); - if (cv < alloc_tablev) { - dout(10) << "EMetaBlob.expire saving idalloc table, need " << alloc_tablev << dendl; - mds->idalloc->save(gather->new_sub(), alloc_tablev); - } - } - - // truncated inodes - for (list< pair >::iterator p = truncated_inodes.begin(); - p != truncated_inodes.end(); - ++p) { - if (mds->mdcache->is_purging(p->first.ino, p->second)) { - dout(10) << "EMetaBlob.expire waiting for purge of inode " << p->first.ino - << " to " << p->second << dendl; - mds->mdcache->wait_for_purge(p->first.ino, p->second, gather->new_sub()); - } - } - - // client requests - for (list::iterator p = client_reqs.begin(); - p != client_reqs.end(); - ++p) { - if 
(mds->clientmap.have_completed_request(*p)) { - dout(10) << "EMetaBlob.expire waiting on completed request " << *p - << dendl; - mds->clientmap.add_trim_waiter(*p, gather->new_sub()); - } - } - - dout(10) << "my gather finsher is " << gather << " with " << gather->get_num() << dendl; - -*/ -} - -void EMetaBlob::update_segment(LogSegment *ls) -{ - // atids? - //for (list::iterator p = atids.begin(); p != atids.end(); ++p) - // ls->pending_commit_atids[*p] = ls; - // -> handled directly by AnchorClient - - // dirty inode mtimes - // -> handled directly by Server.cc, replay() - - // alloc table update? - if (!allocated_inos.empty()) - ls->allocv = alloc_tablev; - - // truncated inodes - // -> handled directly by Server.cc - - // client requests - // note the newest request per client - //if (!client_reqs.empty()) - // ls->last_client_tid[client_reqs.rbegin()->client] = client_reqs.rbegin()->tid); -} - -void EMetaBlob::replay(MDS *mds, LogSegment *logseg) -{ - dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps" << dendl; - - if (!logseg) logseg = _segment; - assert(logseg); - - // walk through my dirs (in order!) - for (list::iterator lp = lump_order.begin(); - lp != lump_order.end(); - ++lp) { - dout(10) << "EMetaBlob.replay dir " << *lp << dendl; - dirlump &lump = lump_map[*lp]; - - // the dir - CDir *dir = mds->mdcache->get_dirfrag(*lp); - if (!dir) { - // hmm. do i have the inode? 
- CInode *diri = mds->mdcache->get_inode((*lp).ino); - if (!diri) { - if ((*lp).ino == MDS_INO_ROOT) { - diri = mds->mdcache->create_root_inode(); - dout(10) << "EMetaBlob.replay created root " << *diri << dendl; - } else if (MDS_INO_IS_STRAY((*lp).ino)) { - int whose = (*lp).ino - MDS_INO_STRAY_OFFSET; - diri = mds->mdcache->create_stray_inode(whose); - dout(10) << "EMetaBlob.replay created stray " << *diri << dendl; - } else { - dout(0) << "EMetaBlob.replay missing dir ino " << (*lp).ino << dendl; - assert(0); - } - } - // create the dirfrag - dir = diri->get_or_open_dirfrag(mds->mdcache, (*lp).frag); - - if ((*lp).ino < MDS_INO_BASE) - mds->mdcache->adjust_subtree_auth(dir, CDIR_AUTH_UNKNOWN); - - dout(10) << "EMetaBlob.replay added dir " << *dir << dendl; - } - dir->set_version( lump.dirv ); - if (lump.is_dirty()) - dir->_mark_dirty(logseg); - if (lump.is_complete()) - dir->mark_complete(); - - // decode bits - lump._decode_bits(); - - // full dentry+inode pairs - for (list::iterator p = lump.get_dfull().begin(); - p != lump.get_dfull().end(); - p++) { - CDentry *dn = dir->lookup(p->dn); - if (!dn) { - dn = dir->add_null_dentry(p->dn); - dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(logseg); - dout(10) << "EMetaBlob.replay added " << *dn << dendl; - } else { - dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(logseg); - dout(10) << "EMetaBlob.replay had " << *dn << dendl; - } - - CInode *in = mds->mdcache->get_inode(p->inode.ino); - if (!in) { - in = new CInode(mds->mdcache); - in->inode = p->inode; - in->dirfragtree = p->dirfragtree; - if (in->inode.is_symlink()) in->symlink = p->symlink; - mds->mdcache->add_inode(in); - if (!dn->is_null()) { - if (dn->is_primary()) - dout(-10) << "EMetaBlob.replay FIXME had dentry linked to wrong inode " << *dn - << " " << *dn->get_inode() - << " should be " << p->inode.ino - << dendl; - dir->unlink_inode(dn); - //assert(0); // hrm! fallout from sloppy unlink? or? 
hmmm FIXME investigate further - } - dir->link_primary_inode(dn, in); - if (p->dirty) in->_mark_dirty(logseg); - dout(10) << "EMetaBlob.replay added " << *in << dendl; - } else { - if (dn->get_inode() != in && in->get_parent_dn()) { - dout(10) << "EMetaBlob.replay unlinking " << *in << dendl; - in->get_parent_dn()->get_dir()->unlink_inode(in->get_parent_dn()); - } - in->inode = p->inode; - in->dirfragtree = p->dirfragtree; - if (in->inode.is_symlink()) in->symlink = p->symlink; - if (p->dirty) in->_mark_dirty(logseg); - if (dn->get_inode() != in) { - dir->link_primary_inode(dn, in); - dout(10) << "EMetaBlob.replay linked " << *in << dendl; - } else { - dout(10) << "EMetaBlob.replay had " << *in << dendl; - } - } - } - - // remote dentries - for (list::iterator p = lump.get_dremote().begin(); - p != lump.get_dremote().end(); - p++) { - CDentry *dn = dir->lookup(p->dn); - if (!dn) { - dn = dir->add_remote_dentry(p->dn, p->ino, p->d_type); - dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(logseg); - dout(10) << "EMetaBlob.replay added " << *dn << dendl; - } else { - if (!dn->is_null()) { - dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl; - dir->unlink_inode(dn); - } - dn->set_remote(p->ino, p->d_type); - dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(logseg); - dout(10) << "EMetaBlob.replay had " << *dn << dendl; - } - } - - // null dentries - for (list::iterator p = lump.get_dnull().begin(); - p != lump.get_dnull().end(); - p++) { - CDentry *dn = dir->lookup(p->dn); - if (!dn) { - dn = dir->add_null_dentry(p->dn); - dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(logseg); - dout(10) << "EMetaBlob.replay added " << *dn << dendl; - } else { - if (!dn->is_null()) { - dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl; - dir->unlink_inode(dn); - } - dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(logseg); - dout(10) << "EMetaBlob.replay had " << *dn << dendl; - } - } - } - - // anchor transactions - for 
(list::iterator p = atids.begin(); - p != atids.end(); - ++p) { - dout(10) << "EMetaBlob.replay noting anchor transaction " << *p << dendl; - mds->anchorclient->got_journaled_agree(*p, logseg); - } - - // dirtied inode mtimes - if (!dirty_inode_mtimes.empty()) - for (map::iterator p = dirty_inode_mtimes.begin(); - p != dirty_inode_mtimes.end(); - ++p) { - CInode *in = mds->mdcache->get_inode(p->first); - dout(10) << "EMetaBlob.replay setting dirlock updated flag on " << *in << dendl; - in->dirlock.set_updated(); - logseg->dirty_inode_mtimes.push_back(&in->xlist_dirty_inode_mtime); - } - - // allocated_inos - if (!allocated_inos.empty()) { - if (mds->idalloc->get_version() >= alloc_tablev) { - dout(10) << "EMetaBlob.replay idalloc tablev " << alloc_tablev - << " <= table " << mds->idalloc->get_version() << dendl; - } else { - for (list::iterator p = allocated_inos.begin(); - p != allocated_inos.end(); - ++p) { - dout(10) << " EMetaBlob.replay idalloc " << *p << " tablev " << alloc_tablev - << " - 1 == table " << mds->idalloc->get_version() << dendl; - assert(alloc_tablev-1 == mds->idalloc->get_version()); - - inodeno_t ino = mds->idalloc->alloc_id(); - assert(ino == *p); // this should match. 
- } - assert(alloc_tablev == mds->idalloc->get_version()); - } - } - - // truncated inodes - for (list< triple >::iterator p = truncated_inodes.begin(); - p != truncated_inodes.end(); - ++p) { - CInode *in = mds->mdcache->get_inode(p->first); - assert(in); - dout(10) << "EMetaBlob.replay will purge truncated " - << p->third << " -> " << p->second - << " on " << *in << dendl; - mds->mdcache->add_recovered_purge(in, p->second, p->third, logseg); - } - - // client requests - for (list::iterator p = client_reqs.begin(); - p != client_reqs.end(); - ++p) - mds->clientmap.add_completed_request(*p); - - - // update segment - update_segment(logseg); -} - -// ----------------------- -// ESession - -void ESession::update_segment() -{ - _segment->clientmapv = cmapv; -} - -void ESession::replay(MDS *mds) -{ - if (mds->clientmap.get_version() >= cmapv) { - dout(10) << "ESession.replay clientmap " << mds->clientmap.get_version() - << " >= " << cmapv << ", noop" << dendl; - - // hrm, this isn't very pretty. - if (!open) - mds->clientmap.trim_completed_requests(client_inst.name, 0); - - } else { - dout(10) << "ESession.replay clientmap " << mds->clientmap.get_version() - << " < " << cmapv << dendl; - assert(mds->clientmap.get_version() + 1 == cmapv); - if (open) { - mds->clientmap.open_session(client_inst); - } else { - mds->clientmap.close_session(client_inst.name.num()); - mds->clientmap.trim_completed_requests(client_inst.name, 0); - } - mds->clientmap.reset_projected(); // make it follow version. 
- } -} - -void ESessions::update_segment() -{ - _segment->clientmapv = cmapv; -} - -void ESessions::replay(MDS *mds) -{ - if (mds->clientmap.get_version() >= cmapv) { - dout(10) << "ESessions.replay clientmap " << mds->clientmap.get_version() - << " >= " << cmapv << ", noop" << dendl; - } else { - dout(10) << "ESessions.replay clientmap " << mds->clientmap.get_version() - << " < " << cmapv << dendl; - mds->clientmap.open_sessions(client_map); - assert(mds->clientmap.get_version() == cmapv); - mds->clientmap.reset_projected(); // make it follow version. - } -} - - - -// ----------------------- -// EAnchor - -void EAnchor::update_segment() -{ - _segment->anchortablev = version; -} - -void EAnchor::replay(MDS *mds) -{ - if (mds->anchortable->get_version() >= version) { - dout(10) << "EAnchor.replay event " << version - << " <= table " << mds->anchortable->get_version() << dendl; - } else { - dout(10) << " EAnchor.replay event " << version - << " - 1 == table " << mds->anchortable->get_version() << dendl; - assert(version-1 == mds->anchortable->get_version()); - - switch (op) { - // anchortable - case ANCHOR_OP_CREATE_PREPARE: - mds->anchortable->create_prepare(ino, trace, reqmds); - break; - case ANCHOR_OP_DESTROY_PREPARE: - mds->anchortable->destroy_prepare(ino, reqmds); - break; - case ANCHOR_OP_UPDATE_PREPARE: - mds->anchortable->update_prepare(ino, trace, reqmds); - break; - case ANCHOR_OP_COMMIT: - mds->anchortable->commit(atid); - break; - - default: - assert(0); - } - - assert(version == mds->anchortable->get_version()); - } -} - - -// EAnchorClient - -void EAnchorClient::replay(MDS *mds) -{ - dout(10) << " EAnchorClient.replay op " << op << " atid " << atid << dendl; - - switch (op) { - // anchorclient - case ANCHOR_OP_ACK: - mds->anchorclient->got_journaled_ack(atid); - break; - - default: - assert(0); - } -} - - -// ----------------------- -// EUpdate - -void EUpdate::update_segment() -{ - metablob.update_segment(_segment); -} - -void EUpdate::replay(MDS 
*mds) -{ - metablob.replay(mds, _segment); -} - - -// ------------------------ -// EOpen - -void EOpen::update_segment() -{ - // ?? -} - -void EOpen::replay(MDS *mds) -{ - dout(10) << "EOpen.replay " << dendl; - metablob.replay(mds, _segment); -} - - -// ----------------------- -// ESlaveUpdate - -void ESlaveUpdate::replay(MDS *mds) -{ - switch (op) { - case ESlaveUpdate::OP_PREPARE: - // FIXME: horribly inefficient copy; EMetaBlob needs a swap() or something - dout(10) << "ESlaveUpdate.replay prepare " << reqid << " for mds" << master - << ": saving blobs for later commit" << dendl; - assert(mds->mdcache->uncommitted_slave_updates[master].count(reqid) == 0); - commit._segment = _segment; // may need this later - rollback._segment = _segment; // may need this later - mds->mdcache->uncommitted_slave_updates[master][reqid] = - MDSlaveUpdate(commit, rollback, _segment->slave_updates); - break; - - case ESlaveUpdate::OP_COMMIT: - if (mds->mdcache->uncommitted_slave_updates[master].count(reqid)) { - dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds" << master - << ": applying commit blob" << dendl; - mds->mdcache->uncommitted_slave_updates[master][reqid].commit.replay(mds, _segment); - mds->mdcache->uncommitted_slave_updates[master].erase(reqid); - } else { - dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds" << master - << ": ignoring, no previously saved blobs" << dendl; - } - break; - - case ESlaveUpdate::OP_ROLLBACK: - if (mds->mdcache->uncommitted_slave_updates[master].count(reqid)) { - dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds" << master - << ": applying rollback blob" << dendl; - assert(mds->mdcache->uncommitted_slave_updates[master].count(reqid)); - mds->mdcache->uncommitted_slave_updates[master][reqid].rollback.replay(mds, _segment); - mds->mdcache->uncommitted_slave_updates[master].erase(reqid); - } else { - dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds" << master - << ": ignoring, no previously 
saved blobs" << dendl; - } - break; - - default: - assert(0); - } -} - - -// ----------------------- -// ESubtreeMap - -void ESubtreeMap::replay(MDS *mds) -{ - // suck up the subtree map? - if (mds->mdcache->is_subtrees()) { - dout(10) << "ESubtreeMap.replay -- ignoring, already have import map" << dendl; - return; - } - - dout(10) << "ESubtreeMap.replay -- reconstructing (auth) subtree spanning tree" << dendl; - - // first, stick the spanning tree in my cache - //metablob.print(cout); - metablob.replay(mds, _segment); - - // restore import/export maps - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *dir = mds->mdcache->get_dirfrag(p->first); - mds->mdcache->adjust_bounded_subtree_auth(dir, p->second, mds->get_nodeid()); - } - - mds->mdcache->show_subtrees(); -} - - - -// ----------------------- -// EFragment - -void EFragment::replay(MDS *mds) -{ - dout(10) << "EFragment.replay " << ino << " " << basefrag << " by " << bits << dendl; - - CInode *in = mds->mdcache->get_inode(ino); - assert(in); - - list resultfrags; - list waiters; - mds->mdcache->adjust_dir_fragments(in, basefrag, bits, resultfrags, waiters); - - metablob.replay(mds, _segment); -} - - - -// ----------------------- -// EPurgeFinish - - -bool EPurgeFinish::has_expired(MDS *mds) -{ - return true; -} - -void EPurgeFinish::expire(MDS *mds, Context *c) -{ - assert(0); -} - -void EPurgeFinish::update_segment() -{ - // ** update purge lists? 
-} - -void EPurgeFinish::replay(MDS *mds) -{ - dout(10) << "EPurgeFinish.replay " << ino << " " << oldsize << " -> " << newsize << dendl; - CInode *in = mds->mdcache->get_inode(ino); - assert(in); - mds->mdcache->remove_recovered_purge(in, newsize, oldsize); -} - - - - - -// ========================================================================= - -// ----------------------- -// EExport - -bool EExport::has_expired(MDS *mds) -{ - CDir *dir = mds->mdcache->get_dirfrag(base); - if (dir && mds->mdcache->migrator->is_exporting(dir)) { - dout(10) << "EExport.has_expired still exporting " << *dir << dendl; - return false; - } - return true; -} - -void EExport::expire(MDS *mds, Context *c) -{ - CDir *dir = mds->mdcache->get_dirfrag(base); - assert(dir); - assert(mds->mdcache->migrator->is_exporting(dir)); - - dout(10) << "EExport.expire waiting for export of " << *dir << dendl; - mds->mdcache->migrator->add_export_finish_waiter(dir, c); -} - -void EExport::replay(MDS *mds) -{ - dout(10) << "EExport.replay " << base << dendl; - metablob.replay(mds, _segment); - - CDir *dir = mds->mdcache->get_dirfrag(base); - assert(dir); - - set realbounds; - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *bd = mds->mdcache->get_dirfrag(*p); - assert(bd); - realbounds.insert(bd); - } - - // adjust auth away - mds->mdcache->adjust_bounded_subtree_auth(dir, realbounds, pair(CDIR_AUTH_UNKNOWN, CDIR_AUTH_UNKNOWN)); - mds->mdcache->try_subtree_merge(dir); -} - - -// ----------------------- -// EImportStart - -void EImportStart::replay(MDS *mds) -{ - dout(10) << "EImportStart.replay " << base << dendl; - metablob.replay(mds, _segment); - - // put in ambiguous import list - mds->mdcache->add_ambiguous_import(base, bounds); - - // open client sessions? 
- if (mds->clientmap.get_version() >= cmapv) { - dout(10) << "EImportStart.replay clientmap " << mds->clientmap.get_version() - << " >= " << cmapv << ", noop" << dendl; - } else { - dout(10) << "EImportStart.replay clientmap " << mds->clientmap.get_version() - << " < " << cmapv << dendl; - map cm; - bufferlist::iterator blp = client_map.begin(); - ::_decode_simple(cm, blp); - mds->clientmap.open_sessions(cm); - assert(mds->clientmap.get_version() == cmapv); - mds->clientmap.reset_projected(); // make it follow version. - } -} - -// ----------------------- -// EImportFinish - -bool EImportFinish::has_expired(MDS *mds) -{ - return true; -} -void EImportFinish::expire(MDS *mds, Context *c) -{ - assert(0); // shouldn't ever happen -} - -void EImportFinish::replay(MDS *mds) -{ - if (mds->mdcache->have_ambiguous_import(base)) { - dout(10) << "EImportFinish.replay " << base << " success=" << success << dendl; - if (success) - mds->mdcache->finish_ambiguous_import(base); - else - mds->mdcache->cancel_ambiguous_import(base); - } else { - dout(10) << "EImportFinish.replay " << base << " success=" << success - << ", predates my subtree_map start point, ignoring" - << dendl; - // verify that? - } -} - - - - - diff --git a/trunk/ceph/messages/MAnchor.h b/trunk/ceph/messages/MAnchor.h deleted file mode 100644 index 6ceb8981244fa..0000000000000 --- a/trunk/ceph/messages/MAnchor.h +++ /dev/null @@ -1,74 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MANCHORREQUEST_H -#define __MANCHORREQUEST_H - -#include - -#include "msg/Message.h" -#include "mds/Anchor.h" - - -class MAnchor : public Message { - int op; - inodeno_t ino; - vector trace; - version_t atid; // anchor table version. - - public: - MAnchor() {} - MAnchor(int o, inodeno_t i, version_t v=0) : - Message(MSG_MDS_ANCHOR), - op(o), ino(i), atid(v) { } - - virtual char *get_type_name() { return "anchor"; } - void print(ostream& o) { - o << "anchor(" << get_anchor_opname(op); - if (ino) o << " " << ino; - if (atid) o << " atid " << atid; - if (!trace.empty()) o << ' ' << trace; - o << ")"; - } - - void set_trace(vector& trace) { - this->trace = trace; - } - - int get_op() { return op; } - inodeno_t get_ino() { return ino; } - vector& get_trace() { return trace; } - version_t get_atid() { return atid; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(op), (char*)&op); - off += sizeof(op); - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - payload.copy(off, sizeof(atid), (char*)&atid); - off += sizeof(atid); - ::_decode(trace, payload, off); - } - - virtual void encode_payload() { - payload.append((char*)&op, sizeof(op)); - payload.append((char*)&ino, sizeof(ino)); - payload.append((char*)&atid, sizeof(atid)); - ::_encode(trace, payload); - } -}; - -#endif diff --git a/trunk/ceph/messages/MCacheExpire.h b/trunk/ceph/messages/MCacheExpire.h deleted file mode 100644 index 015aa562038a7..0000000000000 --- a/trunk/ceph/messages/MCacheExpire.h +++ /dev/null @@ -1,127 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MCACHEEXPIRE_H -#define __MCACHEEXPIRE_H - -class MCacheExpire : public Message { - int from; - -public: - /* - group things by realm (auth delgation root), since that's how auth is determined. - that makes it less work to process when exports are in progress. - */ - struct realm { - map inodes; - map dirs; - map > dentries; - }; - map realms; - - int get_from() { return from; } - - MCacheExpire() {} - MCacheExpire(int f) : - Message(MSG_MDS_CACHEEXPIRE), - from(f) { } - - virtual char *get_type_name() { return "CEx";} - - void add_inode(dirfrag_t r, inodeno_t ino, int nonce) { - realms[r].inodes[ino] = nonce; - } - void add_dir(dirfrag_t r, dirfrag_t df, int nonce) { - realms[r].dirs[df] = nonce; - } - void add_dentry(dirfrag_t r, dirfrag_t df, const string& dn, int nonce) { - realms[r].dentries[df][dn] = nonce; - } - - void add_realm(dirfrag_t df, realm& r) { - realm& myr = realms[df]; - for (map::iterator p = r.inodes.begin(); - p != r.inodes.end(); - ++p) - myr.inodes[p->first] = p->second; - for (map::iterator p = r.dirs.begin(); - p != r.dirs.end(); - ++p) - myr.dirs[p->first] = p->second; - for (map >::iterator p = r.dentries.begin(); - p != r.dentries.end(); - ++p) - for (map::iterator q = p->second.begin(); - q != p->second.end(); - ++q) - myr.dentries[p->first][q->first] = q->second; - } - - void decode_payload() { - int off = 0; - - payload.copy(off, sizeof(from), (char*)&from); - off += sizeof(from); - - int nr; - payload.copy(off, sizeof(nr), (char*)&nr); - off += sizeof(nr); - - while (nr--) { - dirfrag_t r; - payload.copy(off, sizeof(r), (char*)&r); - off += sizeof(r); - - ::_decode(realms[r].inodes, payload, off); - ::_decode(realms[r].dirs, payload, off); - - int n; - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i::iterator q = realms.begin(); - q != realms.end(); - ++q) { - payload.append((char*)&q->first, sizeof(q->first)); - - ::_encode(q->second.inodes, payload); - 
::_encode(q->second.dirs, payload); - - int n = q->second.dentries.size(); - payload.append((char*)&n, sizeof(n)); - for (map >::iterator p = q->second.dentries.begin(); - p != q->second.dentries.end(); - ++p) { - payload.append((char*)&p->first, sizeof(p->first)); - ::_encode(p->second, payload); - } - } - } -}; - -#endif diff --git a/trunk/ceph/messages/MDentryUnlink.h b/trunk/ceph/messages/MDentryUnlink.h deleted file mode 100644 index 6e24d6f45410f..0000000000000 --- a/trunk/ceph/messages/MDentryUnlink.h +++ /dev/null @@ -1,82 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MDENTRYUNLINK_H -#define __MDENTRYUNLINK_H - -class MDentryUnlink : public Message { - dirfrag_t dirfrag; - string dn; - - public: - dirfrag_t get_dirfrag() { return dirfrag; } - string& get_dn() { return dn; } - - CInodeDiscover *strayin; - CDirDiscover *straydir; - CDentryDiscover *straydn; - - MDentryUnlink() : - Message(MSG_MDS_DENTRYUNLINK), - strayin(0), straydir(0), straydn(0) { } - MDentryUnlink(dirfrag_t df, string& n) : - Message(MSG_MDS_DENTRYUNLINK), - dirfrag(df), - dn(n), - strayin(0), straydir(0), straydn(0) { } - ~MDentryUnlink() { - delete strayin; - delete straydir; - delete straydn; - } - - char *get_type_name() { return "dentry_unlink";} - void print(ostream& o) { - o << "dentry_unlink(" << dirfrag << " " << dn << ")"; - } - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - ::_decode(dn, payload, off); - - bool isstray; - payload.copy(off, sizeof(isstray), (char*)&isstray); - off += sizeof(isstray); - if (isstray) 
{ - strayin = new CInodeDiscover; - strayin->_decode(payload, off); - straydir = new CDirDiscover; - straydir->_decode(payload, off); - straydn = new CDentryDiscover; - straydn->_decode(payload, off); - } - } - void encode_payload() { - payload.append((char*)&dirfrag,sizeof(dirfrag)); - ::_encode(dn, payload); - - bool isstray = strayin ? true:false; - payload.append((char*)&isstray, sizeof(isstray)); - if (isstray) { - strayin->_encode(payload); - straydir->_encode(payload); - straydn->_encode(payload); - } - } -}; - -#endif diff --git a/trunk/ceph/messages/MDirUpdate.h b/trunk/ceph/messages/MDirUpdate.h deleted file mode 100644 index 87d7e4fa7389b..0000000000000 --- a/trunk/ceph/messages/MDirUpdate.h +++ /dev/null @@ -1,74 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MDIRUPDATE_H -#define __MDIRUPDATE_H - -#include "msg/Message.h" - -class MDirUpdate : public Message { - struct { - dirfrag_t dirfrag; - int dir_rep; - int discover; - } st; - set dir_rep_by; - filepath path; - - public: - dirfrag_t get_dirfrag() { return st.dirfrag; } - int get_dir_rep() { return st.dir_rep; } - set& get_dir_rep_by() { return dir_rep_by; } - bool should_discover() { return st.discover > 0; } - filepath& get_path() { return path; } - - void tried_discover() { - if (st.discover) st.discover--; - } - - MDirUpdate() {} - MDirUpdate(dirfrag_t dirfrag, - int dir_rep, - set& dir_rep_by, - filepath& path, - bool discover = false) : - Message(MSG_MDS_DIRUPDATE) { - this->st.dirfrag = dirfrag; - this->st.dir_rep = dir_rep; - this->dir_rep_by = dir_rep_by; - if (discover) this->st.discover = 5; - this->path = path; - } - virtual char *get_type_name() { return "dir_update"; } - void print(ostream& out) { - out << "dir_update(" << get_dirfrag() << ")"; - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - ::_decode(dir_rep_by, payload, off); - path._decode(payload, off); - } - - virtual void encode_payload() { - payload.append((char*)&st, sizeof(st)); - ::_encode(dir_rep_by, payload); - path._encode(payload); - } -}; - -#endif diff --git a/trunk/ceph/messages/MDiscover.h b/trunk/ceph/messages/MDiscover.h deleted file mode 100644 index 7294bad22d796..0000000000000 --- a/trunk/ceph/messages/MDiscover.h +++ /dev/null @@ -1,108 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MDISCOVER_H -#define __MDISCOVER_H - -#include "msg/Message.h" -#include "mds/CDir.h" -#include "include/filepath.h" - -#include -#include -using namespace std; - - -class MDiscover : public Message { - int asker; - inodeno_t base_ino; // 1 -> root - frag_t base_dir_frag; - - filepath want; // ... [/]need/this/stuff - inodeno_t want_ino; - - bool want_base_dir; - bool want_xlocked; - - public: - int get_asker() { return asker; } - inodeno_t get_base_ino() { return base_ino; } - frag_t get_base_dir_frag() { return base_dir_frag; } - - filepath& get_want() { return want; } - inodeno_t get_want_ino() { return want_ino; } - const string& get_dentry(int n) { return want[n]; } - - bool wants_base_dir() { return want_base_dir; } - bool wants_xlocked() { return want_xlocked; } - - void set_base_dir_frag(frag_t f) { base_dir_frag = f; } - - MDiscover() { } - MDiscover(int asker_, - inodeno_t base_ino_, - filepath& want_, - bool want_base_dir_ = true, - bool discover_xlocks_ = false) : - Message(MSG_MDS_DISCOVER), - asker(asker_), - base_ino(base_ino_), - want(want_), - want_ino(0), - want_base_dir(want_base_dir_), - want_xlocked(discover_xlocks_) { } - MDiscover(int asker_, - dirfrag_t base_dirfrag, - inodeno_t want_ino_, - bool want_base_dir_ = true) : - Message(MSG_MDS_DISCOVER), - asker(asker_), - base_ino(base_dirfrag.ino), - base_dir_frag(base_dirfrag.frag), - want_ino(want_ino_), - want_base_dir(want_base_dir_), - want_xlocked(false) { } - - char *get_type_name() { return "Dis"; } - void print(ostream &out) { - out << "discover(" << base_ino << "." 
<< base_dir_frag - << " " << want; - if (want_ino) out << want_ino; - out << ")"; - } - - void decode_payload() { - int off = 0; - ::_decode(asker, payload, off); - ::_decode(base_ino, payload, off); - ::_decode(base_dir_frag, payload, off); - want._decode(payload, off); - ::_decode(want_ino, payload, off); - ::_decode(want_base_dir, payload, off); - ::_decode(want_xlocked, payload, off); - } - void encode_payload() { - ::_encode(asker, payload); - ::_encode(base_ino, payload); - ::_encode(base_dir_frag, payload); - want._encode(payload); - ::_encode(want_ino, payload); - ::_encode(want_base_dir, payload); - ::_encode(want_xlocked, payload); - } - -}; - -#endif diff --git a/trunk/ceph/messages/MDiscoverReply.h b/trunk/ceph/messages/MDiscoverReply.h deleted file mode 100644 index 67491049c0b8f..0000000000000 --- a/trunk/ceph/messages/MDiscoverReply.h +++ /dev/null @@ -1,300 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MDISCOVERREPLY_H -#define __MDISCOVERREPLY_H - -#include "msg/Message.h" -#include "mds/CDir.h" -#include "mds/CInode.h" -#include "include/filepath.h" - -#include -#include -using namespace std; - -#define max(a,b) ((a)>(b) ? (a):(b)) - - -/** - * MDiscoverReply - return new replicas (of inodes, dirs, dentries) - * - * we group returned items by (dir, dentry, inode). each - * item in each set shares an index (it's "depth"). - * - * we can start and end with any type. 
- * no_base_dir = true if the first group has an inode but no dir - * no_base_dentry = true if the first group has an inode but no dentry - * they are false if there is no returned data, ie the first group is empty. - * - * we also return errors: - * error_flag_dn(string) - the specified dentry dne - * error_flag_dir - the last item wasn't a dir, so we couldn't continue. - * - * and sometimes, - * dir_auth_hint - where we think the dir auth is - * - * depth() gives us the number of depth units/indices for which we have - * information. this INCLUDES those for which we have errors but no data. - * - * see MDCache::handle_discover, handle_discover_reply. - * - * - * so basically, we get - * - * dir den ino i - * x 0 - * x x x 1 - * or - * x x 0 - * x x x 1 - * or - * x x x 0 - * x x x 1 - * ...and trail off however we want. - * - * - */ - -class MDiscoverReply : public Message { - // info about original request - inodeno_t base_ino; - frag_t base_dir_frag; - bool wanted_base_dir; - bool wanted_xlocked; - inodeno_t wanted_ino; - - // and the response - bool flag_error_dn; - bool flag_error_ino; - bool flag_error_dir; - bool no_base_dir; // no base dir (but IS dentry+inode) - bool no_base_dentry; // no base dentry (but IS inode) - string error_dentry; // dentry that was not found (to trigger waiters on asker) - - int dir_auth_hint; - - vector dirs; // not inode-aligned if no_base_dir = true. 
- vector dentries; // not inode-aligned if no_base_dentry = true - vector inodes; - - - public: - // accessors - inodeno_t get_base_ino() { return base_ino; } - frag_t get_base_dir_frag() { return base_dir_frag; } - bool get_wanted_base_dir() { return wanted_base_dir; } - bool get_wanted_xlocked() { return wanted_xlocked; } - inodeno_t get_wanted_ino() { return wanted_ino; } - - int get_num_inodes() { return inodes.size(); } - int get_num_dentries() { return dentries.size(); } - int get_num_dirs() { return dirs.size(); } - - int get_last_inode() { return inodes.size(); } - int get_last_dentry() { return dentries.size() + no_base_dentry; } - int get_last_dir() { return dirs.size() + no_base_dir; } - - int get_depth() { // return depth of deepest object (in dir/dentry/inode units) - return max( inodes.size(), // at least this many - max( no_base_dentry + dentries.size() + flag_error_dn, // inode start + path + possible error - dirs.size() + no_base_dir )); // dn/inode + dirs - } - - bool has_base_dir() { return !no_base_dir && dirs.size(); } - bool has_base_dentry() { return !no_base_dentry && dentries.size(); } - bool has_base_inode() { return no_base_dir && no_base_dentry; } - - bool is_flag_error_dn() { return flag_error_dn; } - bool is_flag_error_ino() { return flag_error_ino; } - bool is_flag_error_dir() { return flag_error_dir; } - string& get_error_dentry() { return error_dentry; } - - int get_dir_auth_hint() { return dir_auth_hint; } - - - // these index _arguments_ are aligned to each ([[dir, ] dentry, ] inode) set. 
- CInodeDiscover& get_inode(int n) { return *(inodes[n]); } - CDentryDiscover& get_dentry(int n) { return *(dentries[n - no_base_dentry]); } - CDirDiscover& get_dir(int n) { return *(dirs[n - no_base_dir]); } - inodeno_t get_ino(int n) { return inodes[n]->get_ino(); } - - // cons - MDiscoverReply() {} - MDiscoverReply(MDiscover *dis) : - Message(MSG_MDS_DISCOVERREPLY), - base_ino(dis->get_base_ino()), - base_dir_frag(dis->get_base_dir_frag()), - wanted_base_dir(dis->wants_base_dir()), - wanted_xlocked(dis->wants_xlocked()), - wanted_ino(dis->get_want_ino()), - flag_error_dn(false), - flag_error_ino(false), - flag_error_dir(false), - no_base_dir(false), no_base_dentry(false), - dir_auth_hint(CDIR_AUTH_UNKNOWN) { - } - MDiscoverReply(dirfrag_t df) : - Message(MSG_MDS_DISCOVERREPLY), - base_ino(df.ino), - base_dir_frag(df.frag), - wanted_base_dir(false), - wanted_xlocked(false), - wanted_ino(inodeno_t()), - flag_error_dn(false), - flag_error_ino(false), - flag_error_dir(false), - no_base_dir(false), no_base_dentry(false), - dir_auth_hint(CDIR_AUTH_UNKNOWN) { - } - ~MDiscoverReply() { - for (vector::iterator it = dirs.begin(); - it != dirs.end(); - it++) - delete *it; - for (vector::iterator it = dentries.begin(); - it != dentries.end(); - it++) - delete *it; - for (vector::iterator it = inodes.begin(); - it != inodes.end(); - it++) - delete *it; - } - virtual char *get_type_name() { return "DisR"; } - - // builders - bool is_empty() { - return dirs.empty() && dentries.empty() && inodes.empty() && - !flag_error_dn && - !flag_error_ino && - !flag_error_dir && - dir_auth_hint == CDIR_AUTH_UNKNOWN; - } - void add_dentry(CDentryDiscover* ddis) { - if (dentries.empty() && dirs.empty()) no_base_dir = true; - dentries.push_back(ddis); - } - - void add_inode(CInodeDiscover* din) { - if (inodes.empty() && dentries.empty()) no_base_dir = no_base_dentry = true; - inodes.push_back( din ); - } - - void add_dir(CDirDiscover* dir) { - dirs.push_back( dir ); - } - - - // void 
set_flag_forward() { flag_forward = true; } - void set_flag_error_dn(const string& dn) { - flag_error_dn = true; - error_dentry = dn; - } - void set_flag_error_ino() { - flag_error_ino = true; - } - void set_flag_error_dir() { - flag_error_dir = true; - } - void set_dir_auth_hint(int a) { - dir_auth_hint = a; - } - void set_error_dentry(const string& dn) { - error_dentry = dn; - } - - - // ... - virtual void decode_payload() { - int off = 0; - ::_decode(base_ino, payload, off); - ::_decode(base_dir_frag, payload, off); - ::_decode(wanted_base_dir, payload, off); - ::_decode(wanted_xlocked, payload, off); - ::_decode(flag_error_dn, payload, off); - ::_decode(flag_error_ino, payload, off); - ::_decode(flag_error_dir, payload, off); - ::_decode(no_base_dir, payload, off); - ::_decode(no_base_dentry, payload, off); - ::_decode(error_dentry, payload, off); - ::_decode(dir_auth_hint, payload, off); - - // dirs - int n; - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - } - - // inodes - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - } - - // dentries - payload.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - } - } - void encode_payload() { - ::_encode(base_ino, payload); - ::_encode(base_dir_frag, payload); - ::_encode(wanted_base_dir, payload); - ::_encode(wanted_xlocked, payload); - ::_encode(flag_error_dn, payload); - ::_encode(flag_error_ino, payload); - ::_encode(flag_error_dir, payload); - ::_encode(no_base_dir, payload); - ::_encode(no_base_dentry, payload); - ::_encode(error_dentry, payload); - ::_encode(dir_auth_hint, payload); - - // dirs - int n = dirs.size(); - payload.append((char*)&n, sizeof(int)); - for (vector::iterator it = dirs.begin(); - it != dirs.end(); - it++) - (*it)->_encode( payload ); - - // inodes - n = inodes.size(); - payload.append((char*)&n, sizeof(int)); - for 
(vector::iterator it = inodes.begin(); - it != inodes.end(); - it++) - (*it)->_encode( payload ); - - // dentries - n = dentries.size(); - payload.append((char*)&n, sizeof(int)); - for (vector::iterator it = dentries.begin(); - it != dentries.end(); - it++) - (*it)->_encode( payload ); - } - -}; - -#endif diff --git a/trunk/ceph/messages/MExportCaps.h b/trunk/ceph/messages/MExportCaps.h deleted file mode 100644 index f2057bfb1ebc1..0000000000000 --- a/trunk/ceph/messages/MExportCaps.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MEXPORTCAPS_H -#define __MEXPORTCAPS_H - -#include "msg/Message.h" - - -class MExportCaps : public Message { - public: - inodeno_t ino; - bufferlist cap_bl; - map client_map; - - MExportCaps() : - Message(MSG_MDS_EXPORTCAPS) {} - - virtual char *get_type_name() { return "export_caps"; } - void print(ostream& o) { - o << "export_caps(" << ino << ")"; - } - - virtual void decode_payload() { - int off = 0; - ::_decode(ino, payload, off); - ::_decode(cap_bl, payload, off); - ::_decode(client_map, payload, off); - } - virtual void encode_payload() { - ::_encode(ino, payload); - ::_encode(cap_bl, payload); - ::_encode(client_map, payload); - } - -}; - -#endif diff --git a/trunk/ceph/messages/MExportCapsAck.h b/trunk/ceph/messages/MExportCapsAck.h deleted file mode 100644 index dd5e212ecfd99..0000000000000 --- a/trunk/ceph/messages/MExportCapsAck.h +++ /dev/null @@ -1,46 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * 
Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MEXPORTCAPSACK_H -#define __MEXPORTCAPSACK_H - -#include "msg/Message.h" - - -class MExportCapsAck : public Message { - public: - inodeno_t ino; - - MExportCapsAck() : - Message(MSG_MDS_EXPORTCAPSACK) {} - MExportCapsAck(inodeno_t i) : - Message(MSG_MDS_EXPORTCAPSACK), ino(i) {} - - virtual char *get_type_name() { return "export_caps_ack"; } - void print(ostream& o) { - o << "export_caps_ack(" << ino << ")"; - } - - virtual void decode_payload() { - int off = 0; - ::_decode(ino, payload, off); - } - virtual void encode_payload() { - ::_encode(ino, payload); - } - -}; - -#endif diff --git a/trunk/ceph/messages/MExportDir.h b/trunk/ceph/messages/MExportDir.h deleted file mode 100644 index 9964a7059c1d2..0000000000000 --- a/trunk/ceph/messages/MExportDir.h +++ /dev/null @@ -1,65 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MEXPORTDIR_H -#define __MEXPORTDIR_H - -#include "msg/Message.h" - - -class MExportDir : public Message { - dirfrag_t dirfrag; - - bufferlist dirstate; - list bounds; - - public: - MExportDir() {} - MExportDir(dirfrag_t df) : - Message(MSG_MDS_EXPORTDIR), - dirfrag(df) { - } - virtual char *get_type_name() { return "Ex"; } - void print(ostream& o) { - o << "export(" << dirfrag << ")"; - } - - dirfrag_t get_dirfrag() { return dirfrag; } - bufferlist& get_dirstate() { return dirstate; } - list& get_bounds() { return bounds; } - - void take_dirstate(bufferlist& bl) { - dirstate.claim(bl); - } - void add_export(dirfrag_t df) { - bounds.push_back(df); - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - ::_decode(bounds, payload, off); - ::_decode(dirstate, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - ::_encode(bounds, payload); - ::_encode(dirstate, payload); - } - -}; - -#endif diff --git a/trunk/ceph/messages/MExportDirAck.h b/trunk/ceph/messages/MExportDirAck.h deleted file mode 100644 index 1b9d683b4e36f..0000000000000 --- a/trunk/ceph/messages/MExportDirAck.h +++ /dev/null @@ -1,46 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MEXPORTDIRACK_H -#define __MEXPORTDIRACK_H - -#include "MExportDir.h" - -class MExportDirAck : public Message { - dirfrag_t dirfrag; - - public: - dirfrag_t get_dirfrag() { return dirfrag; } - - MExportDirAck() {} - MExportDirAck(dirfrag_t i) : - Message(MSG_MDS_EXPORTDIRACK), dirfrag(i) { } - - virtual char *get_type_name() { return "ExAck"; } - void print(ostream& o) { - o << "export_ack(" << dirfrag << ")"; - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - } - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - } - -}; - -#endif diff --git a/trunk/ceph/messages/MExportDirCancel.h b/trunk/ceph/messages/MExportDirCancel.h deleted file mode 100644 index f13ee1a44fa21..0000000000000 --- a/trunk/ceph/messages/MExportDirCancel.h +++ /dev/null @@ -1,49 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MEXPORTDIRCANCEL_H -#define __MEXPORTDIRCANCEL_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirCancel : public Message { - dirfrag_t dirfrag; - - public: - dirfrag_t get_dirfrag() { return dirfrag; } - - MExportDirCancel() {} - MExportDirCancel(dirfrag_t df) : - Message(MSG_MDS_EXPORTDIRCANCEL), - dirfrag(df) { } - - virtual char *get_type_name() { return "ExCancel"; } - void print(ostream& o) { - o << "export_cancel(" << dirfrag << ")"; - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - } - - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - } -}; - -#endif diff --git a/trunk/ceph/messages/MExportDirDiscover.h b/trunk/ceph/messages/MExportDirDiscover.h deleted file mode 100644 index 01c61a67648c3..0000000000000 --- a/trunk/ceph/messages/MExportDirDiscover.h +++ /dev/null @@ -1,59 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MEXPORTDIRDISCOVER_H -#define __MEXPORTDIRDISCOVER_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirDiscover : public Message { - dirfrag_t dirfrag; - filepath path; - - public: - inodeno_t get_ino() { return dirfrag.ino; } - dirfrag_t get_dirfrag() { return dirfrag; } - filepath& get_path() { return path; } - - bool started; - - MExportDirDiscover() : - Message(MSG_MDS_EXPORTDIRDISCOVER), - started(false) { } - MExportDirDiscover(CDir *dir) : - Message(MSG_MDS_EXPORTDIRDISCOVER), - started(false) { - dir->get_inode()->make_path(path); - dirfrag = dir->dirfrag(); - } - virtual char *get_type_name() { return "ExDis"; } - void print(ostream& o) { - o << "export_discover(" << dirfrag << " " << path << ")"; - } - - virtual void decode_payload() { - bufferlist::iterator p = payload.begin(); - ::_decode_simple(dirfrag, p); - path._decode(p); - } - - virtual void encode_payload() { - ::_encode_simple(dirfrag, payload); - path._encode(payload); - } -}; - -#endif diff --git a/trunk/ceph/messages/MExportDirDiscoverAck.h b/trunk/ceph/messages/MExportDirDiscoverAck.h deleted file mode 100644 index 5e1924bc57e38..0000000000000 --- a/trunk/ceph/messages/MExportDirDiscoverAck.h +++ /dev/null @@ -1,60 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MEXPORTDIRDISCOVERACK_H -#define __MEXPORTDIRDISCOVERACK_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirDiscoverAck : public Message { - dirfrag_t dirfrag; - bool success; - - public: - inodeno_t get_ino() { return dirfrag.ino; } - dirfrag_t get_dirfrag() { return dirfrag; } - bool is_success() { return success; } - - MExportDirDiscoverAck() {} - MExportDirDiscoverAck(dirfrag_t df, bool s=true) : - Message(MSG_MDS_EXPORTDIRDISCOVERACK), - dirfrag(df), - success(s) { } - - virtual char *get_type_name() { return "ExDisA"; } - void print(ostream& o) { - o << "export_discover_ack(" << dirfrag; - if (success) - o << " success)"; - else - o << " failure)"; - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - payload.copy(off, sizeof(success), (char*)&success); - off += sizeof(success); - } - - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - payload.append((char*)&success, sizeof(success)); - } -}; - -#endif diff --git a/trunk/ceph/messages/MExportDirFinish.h b/trunk/ceph/messages/MExportDirFinish.h deleted file mode 100644 index 03f5e1fcc9ef3..0000000000000 --- a/trunk/ceph/messages/MExportDirFinish.h +++ /dev/null @@ -1,46 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MEXPORTDIRFINISH_H -#define __MEXPORTDIRFINISH_H - -#include "msg/Message.h" - -class MExportDirFinish : public Message { - dirfrag_t dirfrag; - - public: - dirfrag_t get_dirfrag() { return dirfrag; } - - MExportDirFinish() {} - MExportDirFinish(dirfrag_t dirfrag) : - Message(MSG_MDS_EXPORTDIRFINISH) { - this->dirfrag = dirfrag; - } - virtual char *get_type_name() { return "ExFin"; } - void print(ostream& o) { - o << "export_finish(" << dirfrag << ")"; - } - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - } - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - } - -}; - -#endif diff --git a/trunk/ceph/messages/MExportDirNotify.h b/trunk/ceph/messages/MExportDirNotify.h deleted file mode 100644 index c7a79a64f9317..0000000000000 --- a/trunk/ceph/messages/MExportDirNotify.h +++ /dev/null @@ -1,85 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MEXPORTDIRNOTIFY_H -#define __MEXPORTDIRNOTIFY_H - -#include "msg/Message.h" -#include -using namespace std; - -class MExportDirNotify : public Message { - dirfrag_t base; - bool ack; - pair old_auth, new_auth; - list bounds; // bounds; these dirs are _not_ included (tho the dirfragdes are) - - public: - dirfrag_t get_dirfrag() { return base; } - pair get_old_auth() { return old_auth; } - pair get_new_auth() { return new_auth; } - bool wants_ack() { return ack; } - list& get_bounds() { return bounds; } - - MExportDirNotify() {} - MExportDirNotify(dirfrag_t i, bool a, pair oa, pair na) : - Message(MSG_MDS_EXPORTDIRNOTIFY), - base(i), ack(a), old_auth(oa), new_auth(na) { } - - virtual char *get_type_name() { return "ExNot"; } - void print(ostream& o) { - o << "export_notify(" << base; - o << " " << old_auth << " -> " << new_auth; - if (ack) - o << " ack)"; - else - o << " no ack)"; - } - - void copy_bounds(list& ex) { - this->bounds = ex; - } - void copy_bounds(set& ex) { - for (set::iterator i = ex.begin(); - i != ex.end(); ++i) - bounds.push_back(*i); - } - void copy_bounds(set& ex) { - for (set::iterator i = ex.begin(); - i != ex.end(); ++i) - bounds.push_back((*i)->dirfrag()); - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(base), (char*)&base); - off += sizeof(base); - payload.copy(off, sizeof(ack), (char*)&ack); - off += sizeof(ack); - payload.copy(off, sizeof(old_auth), (char*)&old_auth); - off += sizeof(old_auth); - payload.copy(off, sizeof(new_auth), (char*)&new_auth); - off += sizeof(new_auth); - ::_decode(bounds, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&base, sizeof(base)); - payload.append((char*)&ack, sizeof(ack)); - payload.append((char*)&old_auth, sizeof(old_auth)); - payload.append((char*)&new_auth, sizeof(new_auth)); - ::_encode(bounds, payload); - } -}; - -#endif diff --git a/trunk/ceph/messages/MExportDirNotifyAck.h 
b/trunk/ceph/messages/MExportDirNotifyAck.h deleted file mode 100644 index 6a41aee83b5f3..0000000000000 --- a/trunk/ceph/messages/MExportDirNotifyAck.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MEXPORTDIRNOTIFYACK_H -#define __MEXPORTDIRNOTIFYACK_H - -#include "msg/Message.h" -#include -using namespace std; - -class MExportDirNotifyAck : public Message { - dirfrag_t dirfrag; - - public: - dirfrag_t get_dirfrag() { return dirfrag; } - - MExportDirNotifyAck() {} - MExportDirNotifyAck(dirfrag_t dirfrag) : - Message(MSG_MDS_EXPORTDIRNOTIFYACK) { - this->dirfrag = dirfrag; - } - virtual char *get_type_name() { return "ExNotA"; } - void print(ostream& o) { - o << "export_notify_ack(" << dirfrag << ")"; - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - } - - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - } - -}; - -#endif diff --git a/trunk/ceph/messages/MExportDirPrep.h b/trunk/ceph/messages/MExportDirPrep.h deleted file mode 100644 index 5789e301e8b11..0000000000000 --- a/trunk/ceph/messages/MExportDirPrep.h +++ /dev/null @@ -1,205 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. 
See file COPYING. - * - */ - - -#ifndef __MEXPORTDIRPREP_H -#define __MEXPORTDIRPREP_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirPrep : public Message { - dirfrag_t dirfrag; - - /* nested export discover payload. - not all inodes will have dirs; they may require a separate discover. - dentries are the links to each inode. - dirs map includes base dir (ino) - */ - list bounds; - - list inodes; - list dentries; - map inode_dirfrag; - map inode_dentry; - - map > frags_by_ino; - map dirfrags; - - set bystanders; - - bool b_did_assim; - - public: - dirfrag_t get_dirfrag() { return dirfrag; } - list& get_bounds() { return bounds; } - list& get_inodes() { return inodes; } - list& get_dentries() { return dentries; } - list& get_inode_dirfrags(inodeno_t ino) { - return frags_by_ino[ino]; - } - dirfrag_t get_containing_dirfrag(inodeno_t ino) { - return inode_dirfrag[ino]; - } - string& get_dentry(inodeno_t ino) { - return inode_dentry[ino]; - } - bool have_dirfrag(dirfrag_t df) { - return dirfrags.count(df); - } - CDirDiscover* get_dirfrag_discover(dirfrag_t df) { - return dirfrags[df]; - } - set &get_bystanders() { return bystanders; } - - bool did_assim() { return b_did_assim; } - void mark_assim() { b_did_assim = true; } - - MExportDirPrep() { - b_did_assim = false; - } - MExportDirPrep(dirfrag_t df) : - Message(MSG_MDS_EXPORTDIRPREP), - dirfrag(df), - b_did_assim(false) { } - ~MExportDirPrep() { - for (list::iterator iit = inodes.begin(); - iit != inodes.end(); - iit++) - delete *iit; - for (list::iterator p = dentries.begin(); - p != dentries.end(); - p++) - delete *p; - for (map::iterator dit = dirfrags.begin(); - dit != dirfrags.end(); - dit++) - delete dit->second; - } - - - virtual char *get_type_name() { return "ExP"; } - void print(ostream& o) { - o << "export_prep(" << dirfrag << ")"; - } - - void add_export(dirfrag_t df) { - bounds.push_back( df ); - } - void add_inode(dirfrag_t df, const string& name, 
CDentryDiscover *dn, CInodeDiscover *in) { - inodes.push_back(in); - dentries.push_back(dn); - inode_dirfrag[in->get_ino()] = df; - inode_dentry[in->get_ino()] = name; - } - void add_dirfrag(CDirDiscover *dir) { - dirfrags[dir->get_dirfrag()] = dir; - frags_by_ino[dir->get_dirfrag().ino].push_back(dir->get_dirfrag().frag); - } - void add_bystander(int who) { - bystanders.insert(who); - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - - ::_decode(bounds, payload, off); - - // inodes - int ni; - payload.copy(off, sizeof(int), (char*)&ni); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - inodes.push_back(in); - - // dentry - CDentryDiscover *dn = new CDentryDiscover; - dn->_decode(payload, off); - dentries.push_back(dn); - - // dentry - string d; - _decode(d, payload, off); - inode_dentry[in->get_ino()] = d; - - // dir ino - dirfrag_t df; - payload.copy(off, sizeof(df), (char*)&df); - off += sizeof(df); - inode_dirfrag[in->get_ino()] = df; - - // child frags - ::_decode(frags_by_ino[in->get_ino()], payload, off); - } - - // dirs - int nd; - payload.copy(off, sizeof(int), (char*)&nd); - off += sizeof(int); - for (int i=0; i_decode(payload, off); - dirfrags[dir->get_dirfrag()] = dir; - } - - ::_decode(bystanders, payload, off); - } - - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - - ::_encode(bounds, payload); - - // inodes - int ni = inodes.size(); - payload.append((char*)&ni, sizeof(int)); - list::iterator dit = dentries.begin(); - list::iterator iit = inodes.begin(); - while (iit != inodes.end()) { - (*iit)->_encode(payload); - (*dit)->_encode(payload); - - // dentry name - _encode(inode_dentry[(*iit)->get_ino()], payload); - - // dir ino - dirfrag_t df = inode_dirfrag[(*iit)->get_ino()]; - payload.append((char*)&df, sizeof(df)); - - // child frags - ::_encode(frags_by_ino[(*iit)->get_ino()], payload); - - iit++; - dit++; - } - 
- // dirs - int nd = dirfrags.size(); - payload.append((char*)&nd, sizeof(int)); - for (map::iterator dit = dirfrags.begin(); - dit != dirfrags.end(); - dit++) - dit->second->_encode(payload); - - ::_encode(bystanders, payload); - } -}; - -#endif diff --git a/trunk/ceph/messages/MExportDirPrepAck.h b/trunk/ceph/messages/MExportDirPrepAck.h deleted file mode 100644 index 355541e9f1b5c..0000000000000 --- a/trunk/ceph/messages/MExportDirPrepAck.h +++ /dev/null @@ -1,47 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MEXPORTDIRPREPACK_H -#define __MEXPORTDIRPREPACK_H - -#include "msg/Message.h" -#include "include/types.h" - -class MExportDirPrepAck : public Message { - dirfrag_t dirfrag; - - public: - dirfrag_t get_dirfrag() { return dirfrag; } - - MExportDirPrepAck() {} - MExportDirPrepAck(dirfrag_t df) : - Message(MSG_MDS_EXPORTDIRPREPACK), - dirfrag(df) { } - - virtual char *get_type_name() { return "ExPAck"; } - void print(ostream& o) { - o << "export_prep_ack(" << dirfrag << ")"; - } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - } - virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - } -}; - -#endif diff --git a/trunk/ceph/messages/MExportDirWarning.h b/trunk/ceph/messages/MExportDirWarning.h deleted file mode 100644 index b59e2eb12251c..0000000000000 --- a/trunk/ceph/messages/MExportDirWarning.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed 
file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MEXPORTDIRWARNING_H -#define __MEXPORTDIRWARNING_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirWarning : public Message { - inodeno_t ino; - int new_dir_auth; - - public: - inodeno_t get_ino() { return ino; } - int get_new_dir_auth() { return new_dir_auth; } - - MExportDirWarning() {} - MExportDirWarning(inodeno_t i, int nda) : - Message(MSG_MDS_EXPORTDIRWARNING), - ino(i), new_dir_auth(nda) {} - - virtual char *get_type_name() { return "ExW"; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - payload.copy(off, sizeof(new_dir_auth), (char*)&new_dir_auth); - off += sizeof(new_dir_auth); - } - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - payload.append((char*)&new_dir_auth, sizeof(new_dir_auth)); - } -}; - -#endif diff --git a/trunk/ceph/messages/MExportDirWarningAck.h b/trunk/ceph/messages/MExportDirWarningAck.h deleted file mode 100644 index 7ee3078e61973..0000000000000 --- a/trunk/ceph/messages/MExportDirWarningAck.h +++ /dev/null @@ -1,45 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MEXPORTDIRWARNINGACK_H -#define __MEXPORTDIRWARNINGACK_H - -#include "msg/Message.h" -#include "mds/CInode.h" -#include "include/types.h" - -class MExportDirWarningAck : public Message { - inodeno_t ino; - - public: - inodeno_t get_ino() { return ino; } - - MExportDirWarningAck() {} - MExportDirWarningAck(inodeno_t i) : - Message(MSG_MDS_EXPORTDIRWARNINGACK), - ino(i) {} - - virtual char *get_type_name() { return "ExWAck"; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - } - virtual void encode_payload() { - payload.append((char*)&ino, sizeof(ino)); - } -}; - -#endif diff --git a/trunk/ceph/messages/MGenericMessage.h b/trunk/ceph/messages/MGenericMessage.h deleted file mode 100644 index fee4e014edaf8..0000000000000 --- a/trunk/ceph/messages/MGenericMessage.h +++ /dev/null @@ -1,45 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MGENERICMESSAGE_H -#define __MGENERICMESSAGE_H - -#include "msg/Message.h" - -class MGenericMessage : public Message { - char tname[20]; - //long pcid; - - public: - MGenericMessage(int t) : Message(t) { - sprintf(tname, "generic%d", get_type()); - } - - //void set_pcid(long pcid) { this->pcid = pcid; } - //long get_pcid() { return pcid; } - - char *get_type_name() { return tname; } - - virtual void decode_payload() { - //int off = 0; - //payload.copy(off, sizeof(pcid), (char*)&pcid); - //off += sizeof(pcid); - } - virtual void encode_payload() { - //payload.append((char*)&pcid, sizeof(pcid)); - } -}; - -#endif diff --git a/trunk/ceph/messages/MHeartbeat.h b/trunk/ceph/messages/MHeartbeat.h deleted file mode 100644 index 964f2a3bd49f2..0000000000000 --- a/trunk/ceph/messages/MHeartbeat.h +++ /dev/null @@ -1,60 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MHEARTBEAT_H -#define __MHEARTBEAT_H - -#include "include/types.h" -#include "msg/Message.h" - -class MHeartbeat : public Message { - mds_load_t load; - int beat; - map import_map; - - public: - mds_load_t& get_load() { return load; } - int get_beat() { return beat; } - - map& get_import_map() { - return import_map; - } - - MHeartbeat() {} - MHeartbeat(mds_load_t& load, int beat) : - Message(MSG_MDS_HEARTBEAT) { - this->load = load; - this->beat = beat; - } - - virtual char *get_type_name() { return "HB"; } - - virtual void decode_payload() { - int off = 0; - payload.copy(off,sizeof(load), (char*)&load); - off += sizeof(load); - payload.copy(off, sizeof(beat), (char*)&beat); - off += sizeof(beat); - ::_decode(import_map, payload, off); - } - virtual void encode_payload() { - payload.append((char*)&load, sizeof(load)); - payload.append((char*)&beat, sizeof(beat)); - ::_encode(import_map, payload); - } - -}; - -#endif diff --git a/trunk/ceph/messages/MInodeFileCaps.h b/trunk/ceph/messages/MInodeFileCaps.h deleted file mode 100644 index 05ade1094c9c8..0000000000000 --- a/trunk/ceph/messages/MInodeFileCaps.h +++ /dev/null @@ -1,57 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MINODEFILECAPS_H -#define __MINODEFILECAPS_H - -class MInodeFileCaps : public Message { - inodeno_t ino; - int from; - int caps; - - public: - inodeno_t get_ino() { return ino; } - int get_from() { return from; } - int get_caps() { return caps; } - - MInodeFileCaps() {} - // from auth - MInodeFileCaps(inodeno_t ino, int from, int caps) : - Message(MSG_MDS_INODEFILECAPS) { - - this->ino = ino; - this->from = from; - this->caps = caps; - } - - virtual char *get_type_name() { return "Icap";} - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(from), (char*)&from); - off += sizeof(from); - payload.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - payload.copy(off, sizeof(caps), (char*)&caps); - off += sizeof(caps); - } - virtual void encode_payload() { - payload.append((char*)&from, sizeof(from)); - payload.append((char*)&ino, sizeof(ino)); - payload.append((char*)&caps, sizeof(caps)); - } -}; - -#endif diff --git a/trunk/ceph/messages/MMDSBeacon.h b/trunk/ceph/messages/MMDSBeacon.h deleted file mode 100644 index 3a2a90f49152f..0000000000000 --- a/trunk/ceph/messages/MMDSBeacon.h +++ /dev/null @@ -1,67 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMDSBEACON_H -#define __MMDSBEACON_H - -#include "msg/Message.h" - -#include "include/types.h" - -#include "mds/MDSMap.h" - -class MMDSBeacon : public Message { - entity_inst_t inst; - epoch_t last_epoch_seen; // include last mdsmap epoch mds has seen to avoid race with monitor decree - int state; - version_t seq; - int want_rank; - - public: - MMDSBeacon() : Message(MSG_MDS_BEACON) {} - MMDSBeacon(entity_inst_t i, epoch_t les, int st, version_t se, int wr) : - Message(MSG_MDS_BEACON), - inst(i), last_epoch_seen(les), state(st), seq(se), want_rank(wr) { } - - entity_inst_t& get_mds_inst() { return inst; } - epoch_t get_last_epoch_seen() { return last_epoch_seen; } - int get_state() { return state; } - version_t get_seq() { return seq; } - char *get_type_name() { return "mdsbeacon"; } - int get_want_rank() { return want_rank; } - - void print(ostream& out) { - out << "mdsbeacon(" << inst - << " " << MDSMap::get_state_name(state) - << " seq " << seq << ")"; - } - - void encode_payload() { - ::_encode(inst, payload); - ::_encode(last_epoch_seen, payload); - ::_encode(state, payload); - ::_encode(seq, payload); - ::_encode(want_rank, payload); - } - void decode_payload() { - int off = 0; - ::_decode(inst, payload, off); - ::_decode(last_epoch_seen, payload, off); - ::_decode(state, payload, off); - ::_decode(seq, payload, off); - ::_decode(want_rank, payload, off); - } -}; - -#endif diff --git a/trunk/ceph/messages/MMDSBoot.h b/trunk/ceph/messages/MMDSBoot.h deleted file mode 100644 index 8529578e29d56..0000000000000 --- a/trunk/ceph/messages/MMDSBoot.h +++ /dev/null @@ -1,39 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free 
Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MMDSBOOT_H -#define __MMDSBOOT_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMDSBoot : public Message { - public: - MMDSBoot() : Message(MSG_MDS_BOOT) { - } - - char *get_type_name() { return "mdsboot"; } - - void encode_payload() { - //payload.append((char*)&sb, sizeof(sb)); - } - void decode_payload() { - //int off = 0; - //payload.copy(off, sizeof(sb), (char*)&sb); - //off += sizeof(sb); - } -}; - -#endif diff --git a/trunk/ceph/messages/MMDSCacheRejoin.h b/trunk/ceph/messages/MMDSCacheRejoin.h deleted file mode 100644 index 844ece02000ae..0000000000000 --- a/trunk/ceph/messages/MMDSCacheRejoin.h +++ /dev/null @@ -1,230 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MMDSCACHEREJOIN_H -#define __MMDSCACHEREJOIN_H - -#include "msg/Message.h" - -#include "include/types.h" -#include "include/encodable.h" - -// sent from replica to auth - -class MMDSCacheRejoin : public Message { - public: - static const int OP_WEAK = 1; // replica -> auth, i exist, + maybe open files. - static const int OP_STRONG = 2; // replica -> auth, i exist, + open files and lock state. - static const int OP_ACK = 3; // auth -> replica, here is your lock state. - //static const int OP_PURGE = 4; // auth -> replica, remove these items, they are old/obsolete. - static const int OP_MISSING = 5; // auth -> replica, i am missing these items - static const int OP_FULL = 6; // replica -> auth, here is the full object. 
- static const char *get_opname(int op) { - switch (op) { - case OP_WEAK: return "weak"; - case OP_STRONG: return "strong"; - case OP_ACK: return "ack"; - case OP_MISSING: return "missing"; - case OP_FULL: return "full"; - default: assert(0); return 0; - } - } - - // -- types -- - struct inode_strong { - int32_t caps_wanted; - int32_t nonce; - int32_t authlock; - int32_t linklock; - int32_t dirfragtreelock; - int32_t filelock; - __int32_t dirlock; - inode_strong() {} - inode_strong(int n, int cw=0, int a=0, int l=0, int dft=0, int f=0, int dl=0) : - caps_wanted(cw), - nonce(n), - authlock(a), linklock(l), dirfragtreelock(dft), filelock(f), dirlock(dl) { } - }; - struct inode_full { - inode_t inode; - string symlink; - fragtree_t dirfragtree; - inode_full() {} - inode_full(const inode_t& i, const string& s, const fragtree_t& f) : - inode(i), symlink(s), dirfragtree(f) {} - - void _decode(bufferlist::iterator& p) { - ::_decode_simple(inode, p); - ::_decode_simple(symlink, p); - dirfragtree._decode(p); - } - void _encode(bufferlist& bl) const { - ::_encode(inode, bl); - ::_encode(symlink, bl); - dirfragtree._encode(bl); - } - }; - - struct dirfrag_strong { - int32_t nonce; - int8_t dir_rep; - dirfrag_strong() {} - dirfrag_strong(int n, int dr) : nonce(n), dir_rep(dr) {} - }; - struct dn_strong { - inodeno_t ino; - inodeno_t remote_ino; - unsigned char remote_d_type; - int32_t nonce; - int32_t lock; - dn_strong() : - ino(0), remote_ino(0), remote_d_type(0), nonce(0), lock(0) {} - dn_strong(inodeno_t pi, inodeno_t ri, unsigned char rdt, int n, int l) : - ino(pi), remote_ino(ri), remote_d_type(rdt), nonce(n), lock(l) {} - bool is_primary() { return ino > 0; } - bool is_remote() { return remote_ino > 0; } - bool is_null() { return ino == 0 && remote_ino == 0; } - }; - - struct dn_weak { - inodeno_t ino; - dn_weak() : ino(0) {} - dn_weak(inodeno_t pi) : ino(pi) {} - }; - - // -- data -- - int32_t op; - - // weak - map > weak; - set weak_inodes; - - // strong - map 
strong_dirfrags; - map > strong_dentries; - map strong_inodes; - - // open - bufferlist cap_export_bl; - map > cap_exports; - map cap_export_paths; - - // full - list full_inodes; - - // authpins, xlocks - map authpinned_inodes; - map > xlocked_inodes; - map > authpinned_dentries; - map > xlocked_dentries; - - MMDSCacheRejoin() : Message(MSG_MDS_CACHEREJOIN) {} - MMDSCacheRejoin(int o) : - Message(MSG_MDS_CACHEREJOIN), - op(o) {} - - char *get_type_name() { return "cache_rejoin"; } - void print(ostream& out) { - out << "cache_rejoin " << get_opname(op); - } - - // -- builders -- - // inodes - void add_weak_inode(inodeno_t i) { - weak_inodes.insert(i); - } - void add_strong_inode(inodeno_t i, int n, int cw, int a, int l, int dft, int f, int dl) { - strong_inodes[i] = inode_strong(n, cw, a, l, dft, f, dl); - } - void add_full_inode(inode_t &i, const string& s, const fragtree_t &f) { - full_inodes.push_back(inode_full(i, s, f)); - } - void add_inode_authpin(inodeno_t ino, const metareqid_t& ri) { - authpinned_inodes[ino] = ri; - } - void add_inode_xlock(inodeno_t ino, int lt, const metareqid_t& ri) { - xlocked_inodes[ino][lt] = ri; - } - - void copy_cap_exports(bufferlist &bl) { - cap_export_bl = bl; - } - - // dirfrags - void add_weak_dirfrag(dirfrag_t df) { - weak[df]; - } - void add_weak_dirfrag(dirfrag_t df, map& dnmap) { - weak[df] = dnmap; - } - void add_strong_dirfrag(dirfrag_t df, int n, int dr) { - strong_dirfrags[df] = dirfrag_strong(n, dr); - } - - // dentries - void add_weak_dentry(dirfrag_t df, const string& dname, dn_weak& dnw) { - weak[df][dname] = dnw; - } - void add_weak_primary_dentry(dirfrag_t df, const string& dname, inodeno_t ino) { - weak[df][dname] = dn_weak(ino); - } - void add_strong_dentry(dirfrag_t df, const string& dname, inodeno_t pi, inodeno_t ri, unsigned char rdt, int n, int ls) { - strong_dentries[df][dname] = dn_strong(pi, ri, rdt, n, ls); - } - void add_dentry_authpin(dirfrag_t df, const string& dname, const metareqid_t& ri) { - 
authpinned_dentries[df][dname] = ri; - } - void add_dentry_xlock(dirfrag_t df, const string& dname, const metareqid_t& ri) { - xlocked_dentries[df][dname] = ri; - } - - // -- encoding -- - void encode_payload() { - ::_encode(op, payload); - ::_encode(strong_inodes, payload); - ::_encode_complex(full_inodes, payload); - ::_encode(authpinned_inodes, payload); - ::_encode(xlocked_inodes, payload); - ::_encode(cap_export_bl, payload); - ::_encode(strong_dirfrags, payload); - ::_encode(weak, payload); - ::_encode(weak_inodes, payload); - ::_encode(strong_dentries, payload); - ::_encode(authpinned_dentries, payload); - ::_encode(xlocked_dentries, payload); - } - void decode_payload() { - bufferlist::iterator p = payload.begin(); - ::_decode_simple(op, p); - ::_decode_simple(strong_inodes, p); - ::_decode_complex(full_inodes, p); - ::_decode_simple(authpinned_inodes, p); - ::_decode_simple(xlocked_inodes, p); - ::_decode_simple(cap_export_bl, p); - if (cap_export_bl.length()) { - bufferlist::iterator q = cap_export_bl.begin(); - ::_decode_simple(cap_exports, q); - ::_decode_simple(cap_export_paths, q); - } - ::_decode_simple(strong_dirfrags, p); - ::_decode_simple(weak, p); - ::_decode_simple(weak_inodes, p); - ::_decode_simple(strong_dentries, p); - ::_decode_simple(authpinned_dentries, p); - ::_decode_simple(xlocked_dentries, p); - } - -}; - -#endif diff --git a/trunk/ceph/messages/MMDSFragmentNotify.h b/trunk/ceph/messages/MMDSFragmentNotify.h deleted file mode 100644 index 232cce92427bb..0000000000000 --- a/trunk/ceph/messages/MMDSFragmentNotify.h +++ /dev/null @@ -1,60 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. 
See file COPYING. - * - */ - -#ifndef __MMDSFRAGMENTNOTIFY_H -#define __MMDSFRAGMENTNOTIFY_H - -#include "msg/Message.h" -#include -using namespace std; - -class MMDSFragmentNotify : public Message { - inodeno_t ino; - frag_t basefrag; - int8_t bits; - - public: - inodeno_t get_ino() { return ino; } - frag_t get_basefrag() { return basefrag; } - int get_bits() { return bits; } - - bufferlist basebl; - - MMDSFragmentNotify() {} - MMDSFragmentNotify(inodeno_t i, frag_t bf, int b) : - Message(MSG_MDS_FRAGMENTNOTIFY), - ino(i), basefrag(bf), bits(b) { } - - virtual char *get_type_name() { return "fragment_notify"; } - void print(ostream& o) { - o << "fragment_notify(" << ino << "#" << basefrag - << " " << (int)bits << ")"; - } - - virtual void decode_payload() { - int off = 0; - ::_decode(ino, payload, off); - ::_decode(basefrag, payload, off); - ::_decode(bits, payload, off); - ::_decode(basebl, payload, off); - } - virtual void encode_payload() { - ::_encode(ino, payload); - ::_encode(basefrag, payload); - ::_encode(bits, payload); - ::_encode(basebl, payload); - } -}; - -#endif diff --git a/trunk/ceph/messages/MMDSResolve.h b/trunk/ceph/messages/MMDSResolve.h deleted file mode 100644 index 2103a0115081d..0000000000000 --- a/trunk/ceph/messages/MMDSResolve.h +++ /dev/null @@ -1,66 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMDSRESOLVE_H -#define __MMDSRESOLVE_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMDSResolve : public Message { - public: - map > subtrees; - map > ambiguous_imports; - list slave_requests; - - MMDSResolve() : Message(MSG_MDS_RESOLVE) {} - - char *get_type_name() { return "mds_resolve"; } - - void print(ostream& out) { - out << "mds_resolve(" << subtrees.size() - << "+" << ambiguous_imports.size() - << " subtrees +" << slave_requests.size() << " slave requests)"; - } - - void add_subtree(dirfrag_t im) { - subtrees[im].clear(); - } - void add_subtree_bound(dirfrag_t im, dirfrag_t ex) { - subtrees[im].push_back(ex); - } - - void add_ambiguous_import(dirfrag_t im, const list& m) { - ambiguous_imports[im] = m; - } - - void add_slave_request(metareqid_t reqid) { - slave_requests.push_back(reqid); - } - - void encode_payload() { - ::_encode(subtrees, payload); - ::_encode(ambiguous_imports, payload); - ::_encode(slave_requests, payload); - } - void decode_payload() { - int off = 0; - ::_decode(subtrees, payload, off); - ::_decode(ambiguous_imports, payload, off); - ::_decode(slave_requests, payload, off); - } -}; - -#endif diff --git a/trunk/ceph/messages/MMDSResolveAck.h b/trunk/ceph/messages/MMDSResolveAck.h deleted file mode 100644 index 1870e226b4161..0000000000000 --- a/trunk/ceph/messages/MMDSResolveAck.h +++ /dev/null @@ -1,56 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMDSRESOLVEACK_H -#define __MMDSRESOLVEACK_H - -#include "msg/Message.h" - -#include "include/types.h" - - -class MMDSResolveAck : public Message { - public: - list commit; - list abort; - - MMDSResolveAck() : Message(MSG_MDS_RESOLVEACK) {} - - char *get_type_name() { return "resolve_ack"; } - /*void print(ostream& out) { - out << "resolve_ack.size() - << "+" << ambiguous_imap.size() - << " imports +" << slave_requests.size() << " slave requests)"; - } - */ - - void add_commit(metareqid_t r) { - commit.push_back(r); - } - void add_abort(metareqid_t r) { - abort.push_back(r); - } - - void encode_payload() { - ::_encode(commit, payload); - ::_encode(abort, payload); - } - void decode_payload() { - int off = 0; - ::_decode(commit, payload, off); - ::_decode(abort, payload, off); - } -}; - -#endif diff --git a/trunk/ceph/messages/MMDSSlaveRequest.h b/trunk/ceph/messages/MMDSSlaveRequest.h deleted file mode 100644 index a5c2339fd4cd5..0000000000000 --- a/trunk/ceph/messages/MMDSSlaveRequest.h +++ /dev/null @@ -1,148 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMDSSLAVEREQUEST_H -#define __MMDSSLAVEREQUEST_H - -#include "msg/Message.h" -#include "mds/mdstypes.h" -#include "include/encodable.h" - -class MMDSSlaveRequest : public Message { - public: - static const int OP_XLOCK = 1; - static const int OP_XLOCKACK = -1; - static const int OP_UNXLOCK = 2; - static const int OP_AUTHPIN = 3; - static const int OP_AUTHPINACK = -3; - - static const int OP_LINKPREP = 4; - static const int OP_UNLINKPREP = 5; - static const int OP_LINKPREPACK = -4; - - static const int OP_RENAMEPREP = 7; - static const int OP_RENAMEPREPACK = -7; - - static const int OP_FINISH = 17; - - static const int OP_ABORT = 20; // used for recovery only - //static const int OP_COMMIT = 21; // used for recovery only - - - const static char *get_opname(int o) { - switch (o) { - case OP_XLOCK: return "xlock"; - case OP_XLOCKACK: return "xlock_ack"; - case OP_UNXLOCK: return "unxlock"; - case OP_AUTHPIN: return "authpin"; - case OP_AUTHPINACK: return "authpin_ack"; - - case OP_LINKPREP: return "link_prep"; - case OP_LINKPREPACK: return "link_prep_ack"; - case OP_UNLINKPREP: return "unlink_prep"; - - case OP_RENAMEPREP: return "rename_prep"; - case OP_RENAMEPREPACK: return "rename_prep_ack"; - - case OP_FINISH: return "finish"; // commit - case OP_ABORT: return "abort"; - //case OP_COMMIT: return "commit"; - - default: assert(0); return 0; - } - } - - private: - metareqid_t reqid; - char op; - - // for locking - char lock_type; // lock object type - MDSCacheObjectInfo object_info; - - // for authpins - list authpins; - - public: - // for rename prep - filepath srcdnpath; - filepath destdnpath; - set witnesses; - bufferlist inode_export; - version_t inode_export_v; - bufferlist srci_replica; - utime_t now; - - bufferlist stray; // stray dir + dentry - -public: - metareqid_t get_reqid() { return reqid; } - int get_op() { return op; } - bool is_reply() { return op < 0; } - - int get_lock_type() { return lock_type; } - MDSCacheObjectInfo 
&get_object_info() { return object_info; } - - list& get_authpins() { return authpins; } - - void set_lock_type(int t) { lock_type = t; } - - // ---- - MMDSSlaveRequest() : Message(MSG_MDS_SLAVE_REQUEST) { } - MMDSSlaveRequest(metareqid_t ri, int o) : - Message(MSG_MDS_SLAVE_REQUEST), - reqid(ri), op(o) { } - void encode_payload() { - ::_encode(reqid, payload); - ::_encode(op, payload); - ::_encode(lock_type, payload); - object_info._encode(payload); - ::_encode_complex(authpins, payload); - srcdnpath._encode(payload); - destdnpath._encode(payload); - ::_encode(witnesses, payload); - ::_encode(now, payload); - ::_encode(inode_export, payload); - ::_encode(inode_export_v, payload); - ::_encode(srci_replica, payload); - ::_encode(stray, payload); - } - void decode_payload() { - bufferlist::iterator p = payload.begin(); - ::_decode_simple(reqid, p); - ::_decode_simple(op, p); - ::_decode_simple(lock_type, p); - object_info._decode(p); - ::_decode_complex(authpins, p); - srcdnpath._decode(p); - destdnpath._decode(p); - ::_decode_simple(witnesses, p); - ::_decode_simple(now, p); - ::_decode_simple(inode_export, p); - ::_decode_simple(inode_export_v, p); - ::_decode_simple(srci_replica, p); - ::_decode_simple(stray, p); - } - - char *get_type_name() { return "slave_request"; } - void print(ostream& out) { - out << "slave_request(" << reqid - << " " << get_opname(op) - << ")"; - } - -}; - -#endif diff --git a/trunk/ceph/messages/MMonCommand.h b/trunk/ceph/messages/MMonCommand.h deleted file mode 100644 index 19d25dd7a4d77..0000000000000 --- a/trunk/ceph/messages/MMonCommand.h +++ /dev/null @@ -1,54 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free 
Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MMONCOMMAND_H -#define __MMONCOMMAND_H - -#include "msg/Message.h" - -#include -using std::vector; - -class MMonCommand : public Message { - public: - entity_inst_t inst; - vector cmd; - - MMonCommand() : Message(MSG_MON_COMMAND) {} - MMonCommand(entity_inst_t i) : - Message(MSG_MON_COMMAND), - inst(i) { } - - virtual char *get_type_name() { return "mon_command"; } - void print(ostream& o) { - o << "mon_command("; - for (unsigned i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MMONCOMMANDACK_H -#define __MMONCOMMANDACK_H - -#include "msg/Message.h" - -class MMonCommandAck : public Message { - public: - int r; - string rs; - - MMonCommandAck() : Message(MSG_MON_COMMAND_ACK) {} - MMonCommandAck(int _r, string s) : Message(MSG_MON_COMMAND_ACK), - r(_r), rs(s) { } - - virtual char *get_type_name() { return "mon_command"; } - void print(ostream& o) { - o << "mon_command_ack(" << r << " " << rs << ")"; - } - - void encode_payload() { - payload.append((char*)&r, sizeof(r)); - ::_encode(rs, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(r), (char*)&r); - off += sizeof(r); - ::_decode(rs, payload, off); - } -}; - -#endif diff --git a/trunk/ceph/messages/MMonElection.h b/trunk/ceph/messages/MMonElection.h deleted file mode 100644 index 14a29af9140f9..0000000000000 --- a/trunk/ceph/messages/MMonElection.h +++ /dev/null @@ -1,63 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published 
by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MMONELECTION_H -#define __MMONELECTION_H - -#include "msg/Message.h" - - -class MMonElection : public Message { -public: - static const int OP_PROPOSE = 1; - static const int OP_ACK = 2; - static const int OP_NAK = 3; - static const int OP_VICTORY = 4; - static const char *get_opname(int o) { - switch (o) { - case OP_PROPOSE: return "propose"; - case OP_ACK: return "ack"; - case OP_NAK: return "nak"; - case OP_VICTORY: return "victory"; - default: assert(0); return 0; - } - } - - int32_t op; - epoch_t epoch; - - MMonElection() : Message(MSG_MON_ELECTION) {} - MMonElection(int o, epoch_t e) : - Message(MSG_MON_ELECTION), - op(o), epoch(e) {} - - char *get_type_name() { return "election"; } - void print(ostream& out) { - out << "election(" << get_opname(op) << " " << epoch << ")"; - } - - void encode_payload() { - ::_encode(op, payload); - ::_encode(epoch, payload); - } - void decode_payload() { - int off = 0; - ::_decode(op, payload, off); - ::_decode(epoch, payload, off); - } - -}; - -#endif diff --git a/trunk/ceph/messages/MMonElectionCollect.h b/trunk/ceph/messages/MMonElectionCollect.h deleted file mode 100644 index f9f0c12d1ac2e..0000000000000 --- a/trunk/ceph/messages/MMonElectionCollect.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMONELECTIONCOLLECT_H -#define __MMONELECTIONCOLLECT_H - -#include "msg/Message.h" - - -class MMonElectionCollect : public Message { - public: - int read_num; - - MMonElectionCollect() {} - MMonElectionCollect(int n) : - Message(MSG_MON_ELECTION_COLLECT), - read_num(n) {} - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(read_num), (char*)&read_num); - off += sizeof(read_num); - } - void encode_payload() { - payload.append((char*)&read_num, sizeof(read_num)); - } - - virtual char *get_type_name() { return "MonElCollect"; } -}; - -#endif diff --git a/trunk/ceph/messages/MMonElectionRefresh.h b/trunk/ceph/messages/MMonElectionRefresh.h deleted file mode 100644 index bc0337b8720dc..0000000000000 --- a/trunk/ceph/messages/MMonElectionRefresh.h +++ /dev/null @@ -1,52 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMONELECTIONREFRESH_H -#define __MMONELECTIONREFRESH_H - -#include "msg/Message.h" - -#include "mon/Elector.h" - -class MMonElectionRefresh : public Message { - public: - int p; - Elector::State state; - int refresh_num; - - MMonElectionRefresh() {} - MMonElectionRefresh(int _p, Elector::State& s, int r) : - Message(MSG_MON_ELECTION_REFRESH), - p(_p), state(s), refresh_num(r) {} - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(p), (char*)&p); - off += sizeof(p); - payload.copy(off, sizeof(state), (char*)&state); - off += sizeof(state); - payload.copy(off, sizeof(refresh_num), (char*)&refresh_num); - off += sizeof(refresh_num); - } - void encode_payload() { - payload.append((char*)&p, sizeof(p)); - payload.append((char*)&state, sizeof(state)); - payload.append((char*)&refresh_num, sizeof(refresh_num)); - } - - virtual char *get_type_name() { return "MonElRefresh"; } -}; - -#endif diff --git a/trunk/ceph/messages/MMonElectionStatus.h b/trunk/ceph/messages/MMonElectionStatus.h deleted file mode 100644 index f91e42d64b184..0000000000000 --- a/trunk/ceph/messages/MMonElectionStatus.h +++ /dev/null @@ -1,51 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMONELECTIONSTATUS_H -#define __MMONELECTIONSTATUS_H - -#include "msg/Message.h" - -#include "mon/Elector.h" - -class MMonElectionStatus : public Message { - public: - int q; - int read_num; - map registry; - - MMonElectionStatus() {} - MMonElectionStatus(int _q, int r, map reg) : - Message(MSG_MON_ELECTION_STATUS), - q(_q), read_num(r), registry(reg) {} - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(q), (char*)&q); - off += sizeof(q); - payload.copy(off, sizeof(read_num), (char*)&read_num); - off += sizeof(read_num); - ::_decode(registry, payload, off); - } - void encode_payload() { - payload.append((char*)&q, sizeof(q)); - payload.append((char*)&read_num, sizeof(read_num)); - ::_encode(registry, payload); - } - - virtual char *get_type_name() { return "MonElStatus"; } -}; - -#endif diff --git a/trunk/ceph/messages/MMonOSDMapInfo.h b/trunk/ceph/messages/MMonOSDMapInfo.h deleted file mode 100644 index 329c05e657d46..0000000000000 --- a/trunk/ceph/messages/MMonOSDMapInfo.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPINFO_H -#define __MMONOSDMAPINFO_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapInfo : public Message { - public: - epoch_t epoch; - epoch_t mon_epoch; - - epoch_t get_epoch() { return epoch; } - epoch_t get_mon_epoch() { return mon_epoch; } - - MMonOSDMapInfo(epoch_t e, epoch_t me) : - Message(MSG_MON_OSDMAP_UPDATE_PREPARE), - epoch(e), mon_epoch(me) { - } - - char *get_type_name() { return "omap_info"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - payload.append((char*)&mon_epoch, sizeof(mon_epoch)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - payload.copy(off, sizeof(mon_epoch), (char*)&mon_epoch); - off += sizeof(mon_epoch); - } -}; - -#endif diff --git a/trunk/ceph/messages/MMonOSDMapLease.h b/trunk/ceph/messages/MMonOSDMapLease.h deleted file mode 100644 index 3f4ed8ea4db85..0000000000000 --- a/trunk/ceph/messages/MMonOSDMapLease.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPLEASE_H -#define __MMONOSDMAPLEASE_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapLease : public Message { - epoch_t epoch; - utime_t lease_expire; - - public: - epoch_t get_epoch() { return epoch; } - const utime_t& get_lease_expire() { return lease_expire; } - - MMonOSDMapLease(epoch_t e, utime_t le) : - Message(MSG_MON_OSDMAP_LEASE), - epoch(e), lease_expire(le) { - } - - char *get_type_name() { return "omap_lease"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - payload.append((char*)&lease_expire, sizeof(lease_expire)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - payload.copy(off, sizeof(lease_expire), (char*)&lease_expire); - off += sizeof(lease_expire); - } -}; - -#endif diff --git a/trunk/ceph/messages/MMonOSDMapLeaseAck.h b/trunk/ceph/messages/MMonOSDMapLeaseAck.h deleted file mode 100644 index 449a0ac61a84f..0000000000000 --- a/trunk/ceph/messages/MMonOSDMapLeaseAck.h +++ /dev/null @@ -1,45 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPLEASEACK_H -#define __MMONOSDMAPLEASEACK_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapLeaseAck : public Message { - epoch_t epoch; - -public: - epoch_t get_epoch() { return epoch; } - - MMonOSDMapLeaseAck(epoch_t e) : - Message(MSG_MON_OSDMAP_LEASE_ACK), - epoch(e) { - } - - char *get_type_name() { return "omap_lease_ack"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - } -}; - -#endif diff --git a/trunk/ceph/messages/MMonOSDMapUpdateAck.h b/trunk/ceph/messages/MMonOSDMapUpdateAck.h deleted file mode 100644 index 9655548dfcb00..0000000000000 --- a/trunk/ceph/messages/MMonOSDMapUpdateAck.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPUPDATEACK_H -#define __MMONOSDMAPUPDATEACK_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapUpdateAck : public Message { -public: - epoch_t epoch; - - MMonOSDMapUpdateAck(epoch_t e) : - Message(MSG_MON_OSDMAP_UPDATE_ACK), - epoch(e) { - } - - char *get_type_name() { return "omap_update_ack"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - } -}; - -#endif diff --git a/trunk/ceph/messages/MMonOSDMapUpdateCommit.h b/trunk/ceph/messages/MMonOSDMapUpdateCommit.h deleted file mode 100644 index 8aa6929c2ed9a..0000000000000 --- a/trunk/ceph/messages/MMonOSDMapUpdateCommit.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPUPDATECOMMIT_H -#define __MMONOSDMAPUPDATECOMMIT_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapUpdateCommit : public Message { - public: - epoch_t epoch; - - MMonOSDMapUpdateCommit(epoch_t e) : - Message(MSG_MON_OSDMAP_UPDATE_COMMIT), - epoch(e) { - } - - char *get_type_name() { return "omap_update_commit"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - } -}; - -#endif diff --git a/trunk/ceph/messages/MMonOSDMapUpdatePrepare.h b/trunk/ceph/messages/MMonOSDMapUpdatePrepare.h deleted file mode 100644 index 8e908e2ed0664..0000000000000 --- a/trunk/ceph/messages/MMonOSDMapUpdatePrepare.h +++ /dev/null @@ -1,53 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MMONOSDMAPUPDATEPREPARE_H -#define __MMONOSDMAPUPDATEPREPARE_H - -#include "msg/Message.h" - -#include "include/types.h" - -class MMonOSDMapUpdatePrepare : public Message { - public: - epoch_t epoch; - bufferlist map_bl; - bufferlist inc_map_bl; - - epoch_t get_epoch() { return epoch; } - - MMonOSDMapUpdatePrepare(epoch_t e, - bufferlist& mbl, bufferlist& incmbl) : - Message(MSG_MON_OSDMAP_UPDATE_PREPARE), - epoch(e), - map_bl(mbl), inc_map_bl(incmbl) { - } - - char *get_type_name() { return "omap_update_prepare"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - ::_encode(map_bl, payload); - ::_encode(inc_map_bl, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - ::_decode(map_bl, payload, off); - ::_decode(inc_map_bl, payload, off); - } -}; - -#endif diff --git a/trunk/ceph/messages/MMonPaxos.h b/trunk/ceph/messages/MMonPaxos.h deleted file mode 100644 index 7210b179c9a42..0000000000000 --- a/trunk/ceph/messages/MMonPaxos.h +++ /dev/null @@ -1,98 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MMONPAXOS_H -#define __MMONPAXOS_H - -#include "msg/Message.h" -#include "mon/mon_types.h" - -class MMonPaxos : public Message { - public: - // op types - const static int OP_COLLECT = 1; // proposer: propose round - const static int OP_LAST = 2; // voter: accept proposed round - const static int OP_BEGIN = 3; // proposer: value proposed for this round - const static int OP_ACCEPT = 4; // voter: accept propsed value - const static int OP_COMMIT = 5; // proposer: notify learners of agreed value - const static int OP_LEASE = 6; // leader: extend peon lease - const static int OP_LEASE_ACK = 7; // peon: lease ack - const static char *get_opname(int op) { - switch (op) { - case OP_COLLECT: return "collect"; - case OP_LAST: return "last"; - case OP_BEGIN: return "begin"; - case OP_ACCEPT: return "accept"; - case OP_COMMIT: return "commit"; - case OP_LEASE: return "lease"; - case OP_LEASE_ACK: return "lease_ack"; - default: assert(0); return 0; - } - } - - epoch_t epoch; // monitor epoch - int op; // paxos op - int machine_id; // which state machine? 
- - version_t last_committed; // i've committed to - version_t pn_from; // i promise to accept after - version_t pn; // with with proposal - version_t uncommitted_pn; // previous pn, if we are a LAST with an uncommitted value - utime_t lease_expire; - - map values; - - MMonPaxos() : Message(MSG_MON_PAXOS) {} - MMonPaxos(epoch_t e, int o, int mid) : - Message(MSG_MON_PAXOS), - epoch(e), - op(o), machine_id(mid), - last_committed(0), pn_from(0), pn(0), uncommitted_pn(0) { } - - virtual char *get_type_name() { return "paxos"; } - - void print(ostream& out) { - out << "paxos(" << get_paxos_name(machine_id) - << " " << get_opname(op) << " lc " << last_committed - << " pn " << pn << " opn " << uncommitted_pn - << ")"; - } - - void encode_payload() { - ::_encode(epoch, payload); - ::_encode(op, payload); - ::_encode(machine_id, payload); - ::_encode(last_committed, payload); - ::_encode(pn_from, payload); - ::_encode(pn, payload); - ::_encode(uncommitted_pn, payload); - ::_encode(lease_expire, payload); - ::_encode(values, payload); - } - void decode_payload() { - int off = 0; - ::_decode(epoch, payload, off); - ::_decode(op, payload, off); - ::_decode(machine_id, payload, off); - ::_decode(last_committed, payload, off); - ::_decode(pn_from, payload, off); - ::_decode(pn, payload, off); - ::_decode(uncommitted_pn, payload, off); - ::_decode(lease_expire, payload, off); - ::_decode(values, payload, off); - } -}; - -#endif diff --git a/trunk/ceph/messages/MOSDBoot.h b/trunk/ceph/messages/MOSDBoot.h deleted file mode 100644 index 00c94ad1a2a80..0000000000000 --- a/trunk/ceph/messages/MOSDBoot.h +++ /dev/null @@ -1,51 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by 
the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MOSDBOOT_H -#define __MOSDBOOT_H - -#include "msg/Message.h" - -#include "include/types.h" -#include "osd/osd_types.h" - -class MOSDBoot : public Message { - public: - entity_inst_t inst; - OSDSuperblock sb; - - MOSDBoot() {} - MOSDBoot(entity_inst_t i, OSDSuperblock& s) : - Message(MSG_OSD_BOOT), - inst(i), - sb(s) { - } - - char *get_type_name() { return "osd_boot"; } - void print(ostream& out) { - out << "osd_boot(" << inst << ")"; - } - - void encode_payload() { - ::_encode(inst, payload); - ::_encode(sb, payload); - } - void decode_payload() { - int off = 0; - ::_decode(inst, payload, off); - ::_decode(sb, payload, off); - } -}; - -#endif diff --git a/trunk/ceph/messages/MOSDFailure.h b/trunk/ceph/messages/MOSDFailure.h deleted file mode 100644 index adc4e700a4f85..0000000000000 --- a/trunk/ceph/messages/MOSDFailure.h +++ /dev/null @@ -1,55 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDFAILURE_H -#define __MOSDFAILURE_H - -#include "msg/Message.h" - - -class MOSDFailure : public Message { - public: - entity_inst_t from; - entity_inst_t failed; - epoch_t epoch; - - MOSDFailure() {} - MOSDFailure(entity_inst_t fr, entity_inst_t f, epoch_t e) : - Message(MSG_OSD_FAILURE), - from(fr), failed(f), epoch(e) {} - - entity_inst_t get_from() { return from; } - entity_inst_t get_failed() { return failed; } - epoch_t get_epoch() { return epoch; } - - void decode_payload() { - int off = 0; - ::_decode(from, payload, off); - ::_decode(failed, payload, off); - ::_decode(epoch, payload, off); - } - void encode_payload() { - ::_encode(from, payload); - ::_encode(failed, payload); - ::_encode(epoch, payload); - } - - virtual char *get_type_name() { return "osd_failure"; } - /*void print(ostream& out) { - out << "osd_failure(" << failed << " e" << epoch << ")"; - }*/ -}; - -#endif diff --git a/trunk/ceph/messages/MOSDIn.h b/trunk/ceph/messages/MOSDIn.h deleted file mode 100644 index 8f8cb4b7877ae..0000000000000 --- a/trunk/ceph/messages/MOSDIn.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __MOSDIN_H -#define __MOSDIN_H - -#include "msg/Message.h" - - -class MOSDIn : public Message { - public: - epoch_t map_epoch; - - MOSDIn(epoch_t e) : Message(MSG_OSD_IN), map_epoch(e) { - } - MOSDIn() {} - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_epoch), (char*)&map_epoch); - off += sizeof(map_epoch); - } - virtual void encode_payload() { - payload.append((char*)&map_epoch, sizeof(map_epoch)); - } - - virtual char *get_type_name() { return "oin"; } -}; - -#endif diff --git a/trunk/ceph/messages/MOSDOut.h b/trunk/ceph/messages/MOSDOut.h deleted file mode 100644 index 798356f663f9e..0000000000000 --- a/trunk/ceph/messages/MOSDOut.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - - -#ifndef __MOSDOUT_H -#define __MOSDOUT_H - -#include "msg/Message.h" - - -class MOSDOut : public Message { - public: - epoch_t map_epoch; - - MOSDOut(epoch_t e) : Message(MSG_OSD_OUT), map_epoch(e) { - } - MOSDOut() {} - - virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_epoch), (char*)&map_epoch); - off += sizeof(map_epoch); - } - virtual void encode_payload() { - payload.append((char*)&map_epoch, sizeof(map_epoch)); - } - - virtual char *get_type_name() { return "oout"; } -}; - -#endif diff --git a/trunk/ceph/messages/MOSDPGActivateSet.h b/trunk/ceph/messages/MOSDPGActivateSet.h deleted file mode 100644 index cdee7996e9647..0000000000000 --- a/trunk/ceph/messages/MOSDPGActivateSet.h +++ /dev/null @@ -1,50 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGACTIVATESET_H -#define __MOSDPGACTIVATESET_H - -#include "msg/Message.h" - -class MOSDPGActivateSet : public Message { - epoch_t epoch; - -public: - list pg_info; - - epoch_t get_epoch() { return epoch; } - - MOSDPGActivateSet() {} - MOSDPGActivateSet(version_t mv) : - Message(MSG_OSD_PG_ACTIVATE_SET), - epoch(mv) { } - - char *get_type_name() { return "pg_activate_set"; } - void print(ostream& out) { - out << "pg_activate_set(" << pg_info.size() << " pgs e" << epoch << ")"; - } - - void encode_payload() { - ::_encode(epoch, payload); - ::_encode(pg_info, payload); - } - void decode_payload() { - int off = 0; - ::_decode(epoch, payload, off); - ::_decode(pg_info, payload, off); - } -}; - -#endif diff --git a/trunk/ceph/messages/MOSDPGLog.h b/trunk/ceph/messages/MOSDPGLog.h deleted file mode 100644 index 653bb9f10570c..0000000000000 --- a/trunk/ceph/messages/MOSDPGLog.h +++ /dev/null @@ -1,59 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGLOG_H -#define __MOSDPGLOG_H - -#include "msg/Message.h" - -class MOSDPGLog : public Message { - epoch_t epoch; - -public: - PG::Info info; - PG::Log log; - PG::Missing missing; - - epoch_t get_epoch() { return epoch; } - pg_t get_pgid() { return info.pgid; } - - MOSDPGLog() {} - MOSDPGLog(version_t mv, PG::Info& i) : - Message(MSG_OSD_PG_LOG), - epoch(mv), info(i) { } - - char *get_type_name() { return "PGlog"; } - void print(ostream& out) { - out << "pg_log(" << info.pgid << " e" << epoch << ")"; - } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - payload.append((char*)&info, sizeof(info)); - log._encode(payload); - missing._encode(payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - payload.copy(off, sizeof(info), (char*)&info); - off += sizeof(info); - log._decode(payload, off); - missing._decode(payload, off); - } -}; - -#endif diff --git a/trunk/ceph/messages/MOSDPGNotify.h b/trunk/ceph/messages/MOSDPGNotify.h deleted file mode 100644 index 76a984276b66b..0000000000000 --- a/trunk/ceph/messages/MOSDPGNotify.h +++ /dev/null @@ -1,55 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MOSDPGPEERNOTIFY_H -#define __MOSDPGPEERNOTIFY_H - -#include "msg/Message.h" - -#include "osd/PG.h" - -/* - * PGNotify - notify primary of my PGs and versions. 
- */ - -class MOSDPGNotify : public Message { - epoch_t epoch; - list pg_list; // pgid -> version - - public: - version_t get_epoch() { return epoch; } - list& get_pg_list() { return pg_list; } - - MOSDPGNotify() {} - MOSDPGNotify(epoch_t e, list& l) : - Message(MSG_OSD_PG_NOTIFY) { - this->epoch = e; - pg_list.splice(pg_list.begin(),l); - } - - char *get_type_name() { return "PGnot"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - _encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - _decode(pg_list, payload, off); - } -}; - -#endif diff --git a/trunk/ceph/messages/MOSDPGPeer.h b/trunk/ceph/messages/MOSDPGPeer.h deleted file mode 100644 index dd3164cdc1124..0000000000000 --- a/trunk/ceph/messages/MOSDPGPeer.h +++ /dev/null @@ -1,58 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGPEER_H -#define __MOSDPGPEER_H - -#include "msg/Message.h" - - -class MOSDPGPeer : public Message { - uint64_t map_version; - list pg_list; - - bool complete; - - public: - uint64_t get_version() { return map_version; } - list& get_pg_list() { return pg_list; } - bool get_complete() { return complete; } - - MOSDPGPeer() {} - MOSDPGPeer(uint64_t v, list& l, bool c=false) : - Message(MSG_OSD_PG_PEER) { - this->map_version = v; - this->complete = c; - pg_list.splice(pg_list.begin(), l); - } - - char *get_type_name() { return "PGPeer"; } - - void encode_payload() { - payload.append((char*)&map_version, sizeof(map_version)); - payload.append((char*)&complete, sizeof(complete)); - _encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_version), (char*)&map_version); - off += sizeof(map_version); - payload.copy(off, sizeof(complete), (char*)&complete); - off += sizeof(complete); - _decode(pg_list, payload, off); - } -}; - -#endif diff --git a/trunk/ceph/messages/MOSDPGPeerAck.h b/trunk/ceph/messages/MOSDPGPeerAck.h deleted file mode 100644 index dc4fac1a9436b..0000000000000 --- a/trunk/ceph/messages/MOSDPGPeerAck.h +++ /dev/null @@ -1,70 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MOSDPGPEERACK_H -#define __MOSDPGPEERACK_H - -#include "msg/Message.h" -#include "osd/OSD.h" - -class MOSDPGPeerAck : public Message { - version_t map_version; - - public: - list pg_dne; // pg dne - map pg_state; // state, lists, etc. 
- - version_t get_version() { return map_version; } - - MOSDPGPeerAck() {} - MOSDPGPeerAck(version_t v) : - Message(MSG_OSD_PG_PEERACK) { - this->map_version = v; - } - - char *get_type_name() { return "PGPeer"; } - - void encode_payload() { - payload.append((char*)&map_version, sizeof(map_version)); - _encode(pg_dne, payload); - - int n = pg_state.size(); - payload.append((char*)&n, sizeof(n)); - for (map::iterator it = pg_state.begin(); - it != pg_state.end(); - it++) { - payload.append((char*)&it->first, sizeof(it->first)); - it->second._encode(payload); - } - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_version), (char*)&map_version); - off += sizeof(map_version); - _decode(pg_dne, payload, off); - - int n; - payload.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPEERREQUEST_H -#define __MOSDPEERREQUEST_H - -#include "msg/Message.h" - - -class MOSDPGPeerRequest : public Message { - version_t map_version; - list pg_list; - - public: - version_t get_version() { return map_version; } - list& get_pg_list() { return pg_list; } - - MOSDPGPeerRequest() {} - MOSDPGPeerRequest(version_t v, list& l) : - Message(MSG_OSD_PG_PEERREQUEST) { - this->map_version = v; - pg_list.splice(pg_list.begin(), l); - } - - char *get_type_name() { return "PGPR"; } - - void encode_payload() { - payload.append((char*)&map_version, sizeof(map_version)); - _encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_version), (char*)&map_version); - off += sizeof(map_version); - _decode(pg_list, payload, off); - } -}; - -#endif diff --git a/trunk/ceph/messages/MOSDPGRemove.h b/trunk/ceph/messages/MOSDPGRemove.h deleted file mode 100644 index 17cb28a3c95a1..0000000000000 --- a/trunk/ceph/messages/MOSDPGRemove.h +++ /dev/null @@ -1,52 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGREMOVE_H -#define __MOSDPGREMOVE_H - -#include "msg/Message.h" - - -class MOSDPGRemove : public Message { - epoch_t epoch; - - public: - set pg_list; - - epoch_t get_epoch() { return epoch; } - - MOSDPGRemove() {} - MOSDPGRemove(epoch_t e, set& l) : - Message(MSG_OSD_PG_REMOVE) { - this->epoch = e; - pg_list = l; - } - - char *get_type_name() { return "PGrm"; } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - _encode(pg_list, payload); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - _decode(pg_list, payload, off); - } - -}; - -#endif diff --git a/trunk/ceph/messages/MOSDPGSummary.h b/trunk/ceph/messages/MOSDPGSummary.h deleted file mode 100644 index 0dcebffaf74da..0000000000000 --- a/trunk/ceph/messages/MOSDPGSummary.h +++ /dev/null @@ -1,69 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGQUERYREPLY_H -#define __MOSDPGQUERYREPLY_H - -#include "msg/Message.h" - -class MOSDPGSummary : public Message { - epoch_t epoch; - pg_t pgid; - -public: - PG::PGInfo info; - bufferlist sumbl; - - epoch_t get_epoch() { return epoch; } - - MOSDPGSummary() {} - MOSDPGSummary(version_t mv, pg_t pgid, PG::PGSummary &summary) : - Message(MSG_OSD_PG_SUMMARY) { - this->epoch = mv; - this->pgid = pgid; - summary._encode(sumbl); - } - - pg_t get_pgid() { return pgid; } - bufferlist& get_summary_bl() { - return sumbl; - } - - char *get_type_name() { return "PGsum"; } - void print(ostream& out) { - out << "pg_summary(" << pgid << " e" << epoch << ")"; - } - - void encode_payload() { - payload.append((char*)&epoch, sizeof(epoch)); - payload.append((char*)&pgid, sizeof(pgid)); - payload.append((char*)&info, sizeof(info)); - payload.claim_append(sumbl); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - payload.copy(off, sizeof(pgid), (char*)&pgid); - off += sizeof(pgid); - payload.copy(off, sizeof(info), (char*)&info); - off += sizeof(info); - - payload.splice(0, off); - sumbl.claim(payload); - } -}; - -#endif diff --git a/trunk/ceph/messages/MOSDPGUpdate.h b/trunk/ceph/messages/MOSDPGUpdate.h deleted file mode 100644 index 869c02e18c156..0000000000000 --- a/trunk/ceph/messages/MOSDPGUpdate.h +++ /dev/null @@ -1,71 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __MOSDPGUPDATE_H -#define __MOSDPGUPDATE_H - -#include "msg/Message.h" - -class MOSDPGUpdate : public Message { - version_t map_version; - pg_t pgid; - //pginfo_t info; - bool complete; - version_t last_any_complete; - - public: - version_t get_version() { return map_version; } - pg_t get_pgid() { return pgid; } - //pginfo_t& get_pginfo() { return info; } - bool is_complete() { return complete; } - version_t get_last_any_complete() { return last_any_complete; } - - MOSDPGUpdate() {} - MOSDPGUpdate(version_t mv, pg_t pgid, bool complete, version_t last_any_complete) : - Message(MSG_OSD_PG_UPDATE) { - this->map_version = mv; - this->pgid = pgid; - this->complete = complete; - this->last_any_complete = last_any_complete; - } - - char *get_type_name() { return "PGUp"; } - void print(ostream& out) { - out << "pg_update(" << pgid << " e" << map_version; - if (complete) out << " complete"; - out << " lac=" << last_any_complete; - out << ")"; - } - - void encode_payload() { - payload.append((char*)&map_version, sizeof(map_version)); - payload.append((char*)&pgid, sizeof(pgid)); - payload.append((char*)&complete, sizeof(complete)); - payload.append((char*)&last_any_complete, sizeof(last_any_complete)); - } - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(map_version), (char*)&map_version); - off += sizeof(map_version); - payload.copy(off, sizeof(pgid), (char*)&pgid); - off += sizeof(pgid); - payload.copy(off, sizeof(complete), (char*)&complete); - off += sizeof(complete); - payload.copy(off, sizeof(last_any_complete), (char*)&last_any_complete); - off += sizeof(last_any_complete); - } -}; - -#endif diff --git a/trunk/ceph/messages/MOSDPing.h b/trunk/ceph/messages/MOSDPing.h deleted file mode 100644 index 37be289c0a923..0000000000000 --- a/trunk/ceph/messages/MOSDPing.h +++ /dev/null @@ -1,49 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed 
file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MOSDPING_H -#define __MOSDPING_H - -#include "common/Clock.h" - -#include "msg/Message.h" -#include "osd/osd_types.h" - - -class MOSDPing : public Message { - public: - epoch_t map_epoch; - bool ack; - osd_peer_stat_t peer_stat; - - MOSDPing(epoch_t e, osd_peer_stat_t& ps, bool a=false) : - Message(MSG_OSD_PING), map_epoch(e), ack(a), peer_stat(ps) { } - MOSDPing() {} - - virtual void decode_payload() { - int off = 0; - ::_decode(map_epoch, payload, off); - ::_decode(ack, payload, off); - ::_decode(peer_stat, payload, off); - } - virtual void encode_payload() { - ::_encode(map_epoch, payload); - ::_encode(ack, payload); - ::_encode(peer_stat, payload); - } - - virtual char *get_type_name() { return "osd_ping"; } -}; - -#endif diff --git a/trunk/ceph/messages/MPGStats.h b/trunk/ceph/messages/MPGStats.h deleted file mode 100644 index a851eb103f07f..0000000000000 --- a/trunk/ceph/messages/MPGStats.h +++ /dev/null @@ -1,43 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MPGSTATS_H -#define __MPGSTATS_H - -#include "osd/osd_types.h" - -class MPGStats : public Message { -public: - map pg_stat; - osd_stat_t osd_stat; - - MPGStats() : Message(MSG_PGSTATS) {} - - char *get_type_name() { return "pg_stats"; } - void print(ostream& out) { - out << "pg_stats"; - } - - void encode_payload() { - ::_encode(osd_stat, payload); - ::_encode(pg_stat, payload); - } - void decode_payload() { - int off = 0; - ::_decode(osd_stat, payload, off); - ::_decode(pg_stat, payload, off); - } -}; - -#endif diff --git a/trunk/ceph/mon/Elector.h b/trunk/ceph/mon/Elector.h deleted file mode 100644 index 9bfd7cb644fc7..0000000000000 --- a/trunk/ceph/mon/Elector.h +++ /dev/null @@ -1,92 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __MON_ELECTOR_H -#define __MON_ELECTOR_H - -#include -using namespace std; - -#include "include/types.h" -#include "msg/Message.h" - -#include "include/Context.h" - -#include "common/Timer.h" - -class Monitor; - - -class Elector { - private: - Monitor *mon; - int whoami; - - Context *expire_event; - - void reset_timer(double plus=0.0); - void cancel_timer(); - - epoch_t epoch; // latest epoch we've seen. 
odd == election, even == stable, - - // electing me - bool electing_me; - utime_t start_stamp; - set acked_me; - - // electing them - int leader_acked; // who i've acked - utime_t ack_stamp; // and when - - void bump_epoch(epoch_t e=0); // i just saw a larger epoch - - class C_ElectionExpire : public Context { - Elector *elector; - public: - C_ElectionExpire(Elector *e) : elector(e) { } - void finish(int r) { - elector->expire(); - } - }; - - void start(); // start an electing me - void defer(int who); - void expire(); // timer goes off - void victory(); - - void handle_propose(class MMonElection *m); - void handle_ack(class MMonElection *m); - void handle_victory(class MMonElection *m); - - public: - Elector(Monitor *m, int w) : mon(m), whoami(w), - expire_event(0), - epoch(0), - electing_me(false), - leader_acked(-1) { } - - void init(); - void shutdown(); - - void dispatch(Message *m); - - void call_election() { - start(); - } - -}; - - -#endif diff --git a/trunk/ceph/mon/MonitorStore.h b/trunk/ceph/mon/MonitorStore.h deleted file mode 100644 index 485bf972551c4..0000000000000 --- a/trunk/ceph/mon/MonitorStore.h +++ /dev/null @@ -1,82 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MON_MONITORSTORE_H -#define __MON_MONITORSTORE_H - -#include "include/types.h" -#include "include/buffer.h" - -#include - -class MonitorStore { - string dir; - -public: - MonitorStore(char *d) : dir(d) { - } - ~MonitorStore() { - } - - void mkfs(); // wipe - void mount(); - - // ints (stored as ascii) - version_t get_int(const char *a, const char *b=0); - void put_int(version_t v, const char *a, const char *b=0); - - // buffers - // ss and sn varieties. - bool exists_bl_ss(const char *a, const char *b=0); - int get_bl_ss(bufferlist& bl, const char *a, const char *b); - int put_bl_ss(bufferlist& bl, const char *a, const char *b); - bool exists_bl_sn(const char *a, version_t b) { - char bs[20]; -#ifdef __LP64__ - sprintf(bs, "%lu", b); -#else - sprintf(bs, "%llu", b); -#endif - return exists_bl_ss(a, bs); - } - int get_bl_sn(bufferlist& bl, const char *a, version_t b) { - char bs[20]; -#ifdef __LP64__ - sprintf(bs, "%lu", b); -#else - sprintf(bs, "%llu", b); -#endif - return get_bl_ss(bl, a, bs); - } - int put_bl_sn(bufferlist& bl, const char *a, version_t b) { - char bs[20]; -#ifdef __LP64__ - sprintf(bs, "%lu", b); -#else - sprintf(bs, "%llu", b); -#endif - return put_bl_ss(bl, a, bs); - } - - /* - version_t get_incarnation() { return get_int("incarnation"); } - void set_incarnation(version_t i) { set_int(i, "incarnation"); } - - version_t get_last_proposal() { return get_int("last_proposal"); } - void set_last_proposal(version_t i) { set_int(i, "last_proposal"); } - */ -}; - - -#endif diff --git a/trunk/ceph/mon/PGMap.h b/trunk/ceph/mon/PGMap.h deleted file mode 100644 index b915c28cbd755..0000000000000 --- a/trunk/ceph/mon/PGMap.h +++ /dev/null @@ -1,103 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of 
the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __PGMAP_H -#define __PGMAP_H - -#include "osd/osd_types.h" - -class PGMap { -public: - // the map - version_t version; - hash_map pg_stat; - hash_map osd_stat; - - class Incremental { - public: - version_t version; - map pg_stat_updates; - map osd_stat_updates; - - void _encode(bufferlist &bl) { - ::_encode(version, bl); - ::_encode(pg_stat_updates, bl); - ::_encode(osd_stat_updates, bl); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(version, bl, off); - ::_decode(pg_stat_updates, bl, off); - ::_decode(osd_stat_updates, bl, off); - } - - Incremental() : version(0) {} - }; - - void apply_incremental(Incremental& inc) { - assert(inc.version == version+1); - version++; - for (map::iterator p = inc.pg_stat_updates.begin(); - p != inc.pg_stat_updates.end(); - ++p) { - if (pg_stat.count(p->first)) - stat_sub(pg_stat[p->first]); - pg_stat[p->first] = p->second; - stat_add(p->second); - } - } - - // aggregate stats (soft state) - hash_map num_pg_by_state; - int64_t num_pg; - int64_t total_size; - int64_t total_num_blocks; - - void stat_zero() { - num_pg = 0; - num_pg_by_state.clear(); - total_size = 0; - total_num_blocks = 0; - } - void stat_add(pg_stat_t &s) { - num_pg++; - num_pg_by_state[s.state]++; - total_size += s.size; - total_num_blocks += s.num_blocks; - } - void stat_sub(pg_stat_t &s) { - num_pg--; - num_pg_by_state[s.state]--; - total_size -= s.size; - total_num_blocks -= s.num_blocks; - } - - PGMap() : version(0), - num_pg(0), total_size(0), total_num_blocks(0) {} - - void _encode(bufferlist &bl) { - ::_encode(version, bl); - ::_encode(pg_stat, bl); - } - void _decode(bufferlist& bl, int& off) { - ::_decode(version, bl, off); - ::_decode(pg_stat, bl, off); - stat_zero(); - for (hash_map::iterator p = pg_stat.begin(); - p != pg_stat.end(); - ++p) - stat_add(p->second); - } -}; - -#endif diff --git 
a/trunk/ceph/mon/PGMonitor.h b/trunk/ceph/mon/PGMonitor.h deleted file mode 100644 index 7b6d44f814fd2..0000000000000 --- a/trunk/ceph/mon/PGMonitor.h +++ /dev/null @@ -1,58 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __PGMONITOR_H -#define __PGMONITOR_H - -#include -#include -using namespace std; - -#include "include/types.h" -#include "msg/Messenger.h" -#include "PaxosService.h" - -#include "PGMap.h" - -class MPGStats; -class MStatfs; - -class PGMonitor : public PaxosService { -public: - -private: - PGMap pg_map; - PGMap::Incremental pending_inc; - - void create_initial(); - bool update_from_paxos(); - void create_pending(); // prepare a new pending - void encode_pending(bufferlist &bl); // propose pending update to peers - - void committed(); - - bool preprocess_query(Message *m); // true if processed. 
- bool prepare_update(Message *m); - - void handle_statfs(MStatfs *statfs); - bool handle_pg_stats(MPGStats *stats); - - public: - PGMonitor(Monitor *mn, Paxos *p) : PaxosService(mn, p) { } - - //void tick(); // check state, take actions - -}; - -#endif diff --git a/trunk/ceph/mon/PaxosService.cc b/trunk/ceph/mon/PaxosService.cc deleted file mode 100644 index 7b0eed20972a0..0000000000000 --- a/trunk/ceph/mon/PaxosService.cc +++ /dev/null @@ -1,172 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#include "PaxosService.h" -#include "common/Clock.h" -#include "Monitor.h" - - - -#include "config.h" - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_paxos) *_dout << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".paxosservice(" << get_paxos_name(paxos->machine_id) << ") " - - - - -void PaxosService::dispatch(Message *m) -{ - dout(10) << "dispatch " << *m << " from " << m->get_source_inst() << dendl; - - // make sure our map is readable and up to date - if (!paxos->is_readable()) { - dout(10) << " waiting for paxos -> readable" << dendl; - paxos->wait_for_readable(new C_RetryMessage(this, m)); - return; - } - - // make sure service has latest from paxos. - update_from_paxos(); - - // preprocess - if (preprocess_query(m)) - return; // easy! - - // leader? 
- if (!mon->is_leader()) { - // fw to leader - dout(10) << " fw to leader mon" << mon->get_leader() << dendl; - mon->messenger->send_message(m, mon->monmap->get_inst(mon->get_leader())); - return; - } - - // writeable? - if (!paxos->is_writeable()) { - dout(10) << " waiting for paxos -> writeable" << dendl; - paxos->wait_for_writeable(new C_RetryMessage(this, m)); - return; - } - - // update - if (prepare_update(m)) { - double delay; - if (should_propose(delay)) { - if (delay == 0.0) { - propose_pending(); - } else { - // delay a bit - if (!proposal_timer) { - dout(10) << " setting propose timer with dealy of " << delay << dendl; - proposal_timer = new C_Propose(this); - mon->timer.add_event_after(delay, proposal_timer); - } else { - dout(10) << " propose timer already set" << dendl; - } - } - } else { - dout(10) << " not proposing" << dendl; - } - } -} - -bool PaxosService::should_propose(double& delay) -{ - // simple default policy: quick startup, then some damping. - if (paxos->last_committed <= 1) - delay = 0.0; - else - delay = g_conf.paxos_propose_interval; - return true; -} - -void PaxosService::_commit() -{ - dout(7) << "_commit" << dendl; - update_from_paxos(); // notify service of new paxos state - - if (mon->is_leader()) { - dout(7) << "_commit creating new pending" << dendl; - assert(have_pending == false); - create_pending(); - have_pending = true; - - committed(); - } -} - - -void PaxosService::propose_pending() -{ - dout(10) << "propose_pending" << dendl; - assert(have_pending); - - if (proposal_timer) { - mon->timer.cancel_event(proposal_timer); - proposal_timer = 0; - } - - // finish and encode - bufferlist bl; - encode_pending(bl); - have_pending = false; - - // apply to paxos - paxos->wait_for_commit_front(new C_Commit(this)); - paxos->propose_new_value(bl); -} - - - - -void PaxosService::election_finished() -{ - dout(10) << "election_finished" << dendl; - - if (have_pending && - !mon->is_leader()) { - discard_pending(); - have_pending = false; - 
} - - // make sure we update our state - if (paxos->is_active()) - _active(); - else - paxos->wait_for_active(new C_Active(this)); -} - -void PaxosService::_active() -{ - dout(10) << "_active" << dendl; - assert(paxos->is_active()); - - // pull latest from paxos - update_from_paxos(); - - // create pending state? - if (mon->is_leader()) { - if (!have_pending) { - create_pending(); - have_pending = true; - } - - if (g_conf.mkfs && - paxos->get_version() == 0) { - create_initial(); - propose_pending(); - } - } -} - - diff --git a/trunk/ceph/mon/mon_types.h b/trunk/ceph/mon/mon_types.h deleted file mode 100644 index 8d1ac92822356..0000000000000 --- a/trunk/ceph/mon/mon_types.h +++ /dev/null @@ -1,35 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __MON_TYPES_H -#define __MON_TYPES_H - -#define PAXOS_TEST 0 -#define PAXOS_MDSMAP 1 -#define PAXOS_OSDMAP 2 -#define PAXOS_CLIENTMAP 3 -#define PAXOS_PGMAP 4 - -inline const char *get_paxos_name(int p) { - switch (p) { - case PAXOS_TEST: return "test"; - case PAXOS_MDSMAP: return "mdsmap"; - case PAXOS_OSDMAP: return "osdmap"; - case PAXOS_CLIENTMAP: return "clientmap"; - case PAXOS_PGMAP: return "pgmap"; - default: assert(0); return 0; - } -} - -#endif diff --git a/trunk/ceph/msg/Dispatcher.cc b/trunk/ceph/msg/Dispatcher.cc deleted file mode 100644 index 4fa04d7d4c92a..0000000000000 --- a/trunk/ceph/msg/Dispatcher.cc +++ /dev/null @@ -1,28 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "Dispatcher.h" -#include "Messenger.h" - -#include "mds/MDS.h" - -/* -int Dispatcher::send_message(Message *m, msg_addr_t dest, int dest_port) -{ - assert(0); - //return dis_messenger->send_message(m, dest, dest_port, MDS_PORT_SERVER); // on my port! -} -*/ diff --git a/trunk/ceph/msg/Dispatcher.h b/trunk/ceph/msg/Dispatcher.h deleted file mode 100644 index 0a77de3d20369..0000000000000 --- a/trunk/ceph/msg/Dispatcher.h +++ /dev/null @@ -1,34 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __DISPATCHER_H -#define __DISPATCHER_H - -#include "Message.h" - -class Messenger; - -class Dispatcher { - public: - virtual ~Dispatcher() { } - - // how i receive messages - virtual void dispatch(Message *m) = 0; - - // how i deal with transmission failures. - virtual void ms_handle_failure(Message *m, const entity_inst_t& inst) { delete m; } -}; - -#endif diff --git a/trunk/ceph/msg/Messenger.cc b/trunk/ceph/msg/Messenger.cc deleted file mode 100644 index 5af83462b2995..0000000000000 --- a/trunk/ceph/msg/Messenger.cc +++ /dev/null @@ -1,39 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include -#include "include/types.h" - -#include "Message.h" -#include "Messenger.h" -#include "messages/MGenericMessage.h" - -#include -#include -using namespace std; - - -// --------- -// incoming messages - -void Messenger::dispatch(Message *m) -{ - assert(dispatcher); - dispatcher->dispatch(m); -} - - - diff --git a/trunk/ceph/msg/tcp.cc b/trunk/ceph/msg/tcp.cc deleted file mode 100644 index 43fd27ab372ff..0000000000000 --- a/trunk/ceph/msg/tcp.cc +++ /dev/null @@ -1,93 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "tcp.h" - -/****************** - * tcp crap - */ - -/* -inlined, see tcp.h - - -bool tcp_read(int sd, char *buf, int len) -{ - while (len > 0) { - int got = ::recv( sd, buf, len, 0 ); - if (got == 0) { - generic_dout(18) << "tcp_read socket " << sd << " closed" << dendl; - return false; - } - if (got < 0) { - generic_dout(18) << "tcp_read bailing with " << got << dendl; - return false; - } - 
assert(got >= 0); - len -= got; - buf += got; - //generic_dout(DBL) << "tcp_read got " << got << ", " << len << " left" << dendl; - } - return true; -} - -int tcp_write(int sd, char *buf, int len) -{ - //generic_dout(DBL) << "tcp_write writing " << len << dendl; - assert(len > 0); - while (len > 0) { - int did = ::send( sd, buf, len, 0 ); - if (did < 0) { - generic_dout(1) << "tcp_write error did = " << did << " errno " << errno << " " << strerror(errno) << dendl; - //derr(0) << "tcp_write error did = " << did << " errno " << errno << " " << strerror(errno) << dendl; - } - //assert(did >= 0); - if (did < 0) return did; - len -= did; - buf += did; - //generic_dout(DBL) << "tcp_write did " << did << ", " << len << " left" << dendl; - } - return 0; -} -*/ - -int tcp_hostlookup(char *str, sockaddr_in& ta) -{ - char *host = str; - char *port = 0; - - for (int i=0; str[i]; i++) { - if (str[i] == ':') { - port = str+i+1; - str[i] = 0; - break; - } - } - if (!port) { - cerr << "addr '" << str << "' doesn't look like 'host:port'" << std::endl; - return -1; - } - //cout << "host '" << host << "' port '" << port << "'" << std::endl; - - int iport = atoi(port); - - struct hostent *myhostname = gethostbyname( host ); - if (!myhostname) { - cerr << "host " << host << " not found" << std::endl; - return -1; - } - - memset(&ta, 0, sizeof(ta)); - - //cout << "addrtype " << myhostname->h_addrtype << " len " << myhostname->h_length << std::endl; - - ta.sin_family = myhostname->h_addrtype; - memcpy((char *)&ta.sin_addr, - myhostname->h_addr, - myhostname->h_length); - ta.sin_port = iport; - - cout << "lookup '" << host << ":" << port << "' -> " << ta << std::endl; - - return 0; -} diff --git a/trunk/ceph/osbdb/OSBDB.cc b/trunk/ceph/osbdb/OSBDB.cc deleted file mode 100644 index a5ed2bf0ece33..0000000000000 --- a/trunk/ceph/osbdb/OSBDB.cc +++ /dev/null @@ -1,2169 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* OSBDB.cc -- 
ObjectStore on top of Berkeley DB. - Copyright (C) 2007 Casey Marshall - -Ceph - scalable distributed file system - -This is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License version 2.1, as published by the Free Software -Foundation. See file COPYING. */ - - -#include -#include -#include -#include "OSBDB.h" -#include "common/Timer.h" - -using namespace std; - -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_bdbstore) *_dout << dbeginl << "bdbstore(" << device << ")@" << __LINE__ << "." -#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_bdbstore) *_derr << dbeginl << "bdbstore(" << device << ")@" << __LINE__ << "." - -#define CLEANUP(onsafe) do { \ - dout(6) << "DELETE " << hex << onsafe << dec << dendl; \ - delete onsafe; \ - } while (0) -#define COMMIT(onsafe) do { \ - dout(6) << "COMMIT " << hex << onsafe << dec << dendl; \ - sync(onsafe); \ - } while (0) - - // Have a lock, already. - -class scoped_lock -{ -private: - Mutex *m; -public: - scoped_lock(Mutex *m) : m(m) { m->Lock(); } - ~scoped_lock() { m->Unlock(); } -}; - - // Utilities. - -// Starting off with my own bsearch; mail reader to follow... - -// Perform a binary search on a sorted array, returning the insertion -// point for key, or key if it is exactly found. In other words, this -// will return a pointer to the element that will come after key if -// key were to be inserted into the sorted array. -// -// Requires that T have < and > operators defined. -template -uint32_t binary_search (T *array, size_t size, T key) -{ - int low = 0; - int high = size; - int p = (low + high) / 2; - - while (low < high - 1) - { - if (array[p] > key) - { - high = p; - } - else if (array[p] < key) - { - low = p; - } - else - return p; - - p = (low + high) / 2; - } - - if (array[p] < key) - p++; - else if (array[p] > key && p > 0) - p--; - return p; -} - - // Management. 
- -DbEnv *OSBDB::getenv () -{ - DbEnv *envp = new DbEnv (DB_CXX_NO_EXCEPTIONS); - if (g_conf.debug > 1 || g_conf.debug_bdbstore > 1) - envp->set_error_stream (&std::cerr); - if (g_conf.debug > 2 || g_conf.debug_bdbstore > 2) - envp->set_message_stream (&std::cout); - envp->set_flags (DB_LOG_INMEMORY, 1); - //env->set_flags (DB_DIRECT_DB, 1); - int env_flags = (DB_CREATE - | DB_THREAD - //| DB_INIT_LOCK - | DB_INIT_MPOOL - //| DB_INIT_TXN - //| DB_INIT_LOG - | DB_PRIVATE); - if (envp->open (NULL, env_flags, 0) != 0) - { - std::cerr << "failed to open environment " << std::endl; - assert(0); - } - return envp; -} - -int OSBDB::opendb(DBTYPE type, int flags, bool new_env) -{ - env = getenv(); - db = new Db(env, 0); - db->set_error_stream (&std::cerr); - db->set_message_stream (&std::cout); - db->set_flags (0); - if (!g_conf.bdbstore_btree) - { - if (g_conf.bdbstore_pagesize > 0) - db->set_pagesize (g_conf.bdbstore_pagesize); - if (g_conf.bdbstore_ffactor > 0 && g_conf.bdbstore_nelem > 0) - { - db->set_h_ffactor (g_conf.bdbstore_ffactor); - db->set_h_nelem (g_conf.bdbstore_nelem); - } - } - if (g_conf.bdbstore_cachesize > 0) - { - db->set_cachesize (0, g_conf.bdbstore_cachesize, 0); - } - - flags = flags | DB_THREAD; - if (transactional) - flags = flags | DB_AUTO_COMMIT; - - int ret; - if ((ret = db->open (NULL, device.c_str(), NULL, type, flags, 0)) != 0) - { - derr(1) << "failed to open database: " << device << ": " - << db_strerror(ret) << std::dendl; - return -EINVAL; - } - opened = true; - return 0; -} - -int OSBDB::mount() -{ - dout(2) << "mount " << device << dendl; - - if (mounted) - { - dout(4) << "..already mounted" << dendl; - return 0; - } - - if (!opened) - { - int ret; - if ((ret = opendb ()) != 0) - { - dout(4) << "..returns " << ret << dendl; - return ret; - } - } - - // XXX Do we want anything else in the superblock? 
- - Dbt key (OSBDB_SUPERBLOCK_KEY, 1); - stored_superblock super; - Dbt value (&super, sizeof (super)); - value.set_dlen (sizeof (super)); - value.set_ulen (sizeof (super)); - value.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - - if (db->get (NULL, &key, &value, 0) != 0) - { - dout(4) << "..get superblock fails" << dendl; - return -EINVAL; // XXX how to say "badly formed fs?" - } - - dout(3) << ".mount " << super << dendl; - - if (super.version != OSBDB_THIS_VERSION) - { - dout(4) << "version mismatch (" << super.version << ")" << dendl; - return -EINVAL; - } - - DBTYPE t; - db->get_type (&t); - - if (t == DB_BTREE) - { - u_int32_t minkey; - u_int32_t flags; - db->get_bt_minkey (&minkey); - db->get_flags (&flags); - dout(1) << "mounted version " << OSBDB_THIS_VERSION << "; Btree; " - << "min keys per page: " << minkey << "; flags: " - << hex << flags << dec << dendl; - cout << dec; - } - else - { - u_int32_t ffactor; - u_int32_t nelem; - u_int32_t flags; - db->get_h_ffactor (&ffactor); - db->get_h_nelem (&nelem); - db->get_flags (&flags); - dout(1) << "mounted version " << OSBDB_THIS_VERSION << "; Hash; " - << "fill factor: " << ffactor - << " table size: " << nelem << "; flags: " - << hex << flags << dec << dendl; - cout << dec; - } - - mounted = true; - dout(4) << "..mounted" << dendl; - return 0; -} - -int OSBDB::umount() -{ - if (!mounted) - return -EINVAL; - - dout(2) << "umount" << dendl; - - int ret; - if (opened) - { - if (transactional) - { - env->log_flush (NULL); - if ((ret = env->lsn_reset (device.c_str(), 0)) != 0) - { - derr(1) << "lsn_reset: " << db_strerror (ret) << dendl; - } - } - - db->sync (0); - - if ((ret = db->close (0)) != 0) - { - derr(1) << "close: " << db_strerror(ret) << dendl; - return -EINVAL; - } - delete db; - db = NULL; - - if (env) - { - env->close (0); - delete env; - env = NULL; - } - } - mounted = false; - opened = false; - dout(4) << "..unmounted" << dendl; - return 0; -} - -int OSBDB::mkfs() -{ - if (mounted) - return 
-EINVAL; - - dout(2) << "mkfs" << dendl; - - string d = env_dir; - d += device; - unlink (d.c_str()); - - int ret; - if ((ret = opendb((g_conf.bdbstore_btree ? DB_BTREE : DB_HASH), - DB_CREATE, true)) != 0) - { - derr(1) << "failed to open database: " << device << ": " - << db_strerror(ret) << std::dendl; - return -EINVAL; - } - opened = true; - dout(3) << "..opened " << device << dendl; - - uint32_t c; - ret = db->truncate (NULL, &c, 0); - if (ret != 0) - { - derr(1) << "db truncate failed: " << db_strerror (ret) << dendl; - return -EIO; // ??? - } - - Dbt key (OSBDB_SUPERBLOCK_KEY, 1); - struct stored_superblock sb; - sb.version = OSBDB_THIS_VERSION; - Dbt value (&sb, sizeof (sb)); - - dout(3) << "..writing superblock" << dendl; - if ((ret = db->put (NULL, &key, &value, 0)) != 0) - { - derr(1) << "failed to write superblock: " << db_strerror (ret) - << dendl; - return -EIO; - } - dout(3) << "..wrote superblock" << dendl; - dout(4) << "..mkfs done" << dendl; - return 0; -} - - // Objects. - -int OSBDB::pick_object_revision_lt(object_t& oid) -{ - // Not really needed. - dout(0) << "pick_object_revision_lt " << oid << dendl; - return -ENOSYS; -} - -bool OSBDB::exists(object_t oid) -{ - dout(2) << "exists " << oid << dendl; - struct stat st; - bool ret = (stat (oid, &st) == 0); - dout(4) << "..returns " << ret << dendl; - return ret; -} - -int OSBDB::statfs (struct statfs *st) -{ - // Hacky? - if (::statfs (device.c_str(), st) != 0) - { - int ret = -errno; - derr(1) << "statfs returns " << ret << dendl; - return ret; - } - st->f_type = OSBDB_MAGIC; - dout(4) << "..statfs OK" << dendl; - return 0; -} - -int OSBDB::stat(object_t oid, struct stat *st) -{ - if (!mounted) - { - dout(4) << "not mounted!" 
<< dendl; - return -EINVAL; - } - - dout(2) << "stat " << oid << dendl; - - object_inode_key ikey = new_object_inode_key(oid); - stored_object obj; - Dbt key (&ikey, sizeof_object_inode_key()); - Dbt value (&obj, sizeof (obj)); - value.set_flags (DB_DBT_USERMEM); - value.set_ulen (sizeof (obj)); - - dout(3) << " lookup " << ikey << dendl; - int ret; - if ((ret = db->get (NULL, &key, &value, 0)) != 0) - { - derr(1) << " get returned " << ret << dendl; - return -ENOENT; - } - - st->st_size = obj.length; - dout(3) << "stat length:" << obj.length << dendl; - dout(4) << "..stat OK" << dendl; - return 0; -} - -int OSBDB::remove(object_t oid, Context *onsafe) -{ - if (!mounted) - { - derr(1) << "not mounted!" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - dout(6) << "Context " << hex << onsafe << dec << dendl; - scoped_lock __lock(&lock); - dout(2) << "remove " << oid << dendl; - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - oid_t id; - mkoid(id, oid); - Dbt key (&id, sizeof (oid_t)); - int ret; - if ((ret = db->del (txn, &key, 0)) != 0) - { - derr(1) << ".del returned error: " << db_strerror (ret) << dendl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - object_inode_key _ikey = new_object_inode_key (oid); - Dbt ikey (&_ikey, sizeof_object_inode_key()); - if ((ret = db->del (txn, &ikey, 0)) != 0) - { - derr(1) << ".del returned error: " << db_strerror (ret) << dendl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - attrs_id aids = new_attrs_id (oid); - Dbt askey (&aids, sizeof_attrs_id()); - Dbt asval; - asval.set_flags (DB_DBT_MALLOC); - if (db->get (txn, &askey, &asval, 0) == 0) - { - // We have attributes; remove them. 
- stored_attrs *sap = (stored_attrs *) asval.get_data(); - auto_ptr sa (sap); - for (unsigned i = 0; i < sap->count; i++) - { - attr_id aid = new_attr_id (oid, sap->names[i].name); - Dbt akey (&aid, sizeof (aid)); - if ((ret = db->del (txn, &akey, 0)) != 0) - { - derr(1) << ".del returns error: " << db_strerror (ret) << dendl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - } - if ((ret = db->del (txn, &askey, 0)) != 0) - { - derr(1) << ".del returns error: " << db_strerror (ret) << dendl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - } - - // XXX check del return value - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - dout(4) << "..remove OK" << dendl; - return 0; -} - -int OSBDB::truncate(object_t oid, off_t size, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - - if (!mounted) - { - derr(1) << "not mounted!" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "truncate " << size << dendl; - - if (size > 0xFFFFFFFF) - { - derr(1) << "object size too big!" 
<< dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -ENOSPC; - } - - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - object_inode_key ikey = new_object_inode_key (oid); - stored_object obj; - Dbt key (&ikey, sizeof_object_inode_key()); - Dbt value (&obj, sizeof (obj)); - value.set_dlen (sizeof (obj)); - value.set_ulen (sizeof (obj)); - value.set_flags (DB_DBT_USERMEM); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - dout(4) << "..returns -ENOENT" << dendl; - return -ENOENT; - } - - if (obj.length < size) - { - oid_t id; - mkoid (id, oid); - Dbt okey (&id, sizeof (oid_t)); - char b[] = { '\0' }; - Dbt newVal (b, 1); - newVal.set_doff ((size_t) size); - newVal.set_dlen (1); - newVal.set_ulen (1); - newVal.set_flags (DB_DBT_PARTIAL); - if (db->put (txn, &okey, &newVal, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".updating object failed" << dendl; - return -EIO; - } - - obj.length = size; - value.set_ulen (sizeof (obj)); - if (db->put (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".updating object info failed" << dendl; - return -EIO; - } - } - else if (obj.length > size) - { - obj.length = size; - Dbt tval (&obj, sizeof (obj)); - tval.set_ulen (sizeof (obj)); - tval.set_flags (DB_DBT_USERMEM); - if (db->put (txn, &key, &tval, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".updating object info failed" << dendl; - return -EIO; - } - if (size == 0) - { - char x[1]; - oid_t id; - mkoid (id, oid); - Dbt okey (&id, sizeof (oid_t)); - Dbt oval (&x, 0); - if (db->put (txn, &okey, &oval, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".updating object failed" << dendl; - return -EIO; - } - } - else - { - oid_t id; - mkoid (id, oid); - Dbt okey (&id, sizeof 
(oid_t)); - Dbt oval; - oval.set_flags (DB_DBT_MALLOC); - if (db->get (txn, &okey, &oval, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".getting old object failed" << dendl; - return -EIO; - } - auto_ptr ovalPtr ((char *) oval.get_data()); - oval.set_size ((size_t) size); - oval.set_ulen ((size_t) size); - if (db->put (txn, &okey, &oval, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".putting new object failed" << dendl; - return -EIO; - } - } - } - - if (txn) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..truncate OK" << dendl; - return 0; -} - -int OSBDB::read(object_t oid, off_t offset, size_t len, bufferlist& bl) -{ - if (!mounted) - { - derr(1) << "not mounted!" << dendl; - return -EINVAL; - } - - dout(2) << "read " << oid << " " << offset << " " - << len << dendl; - - if (bl.length() < len) - { - int remain = len - bl.length(); - bufferptr ptr (remain); - bl.push_back(ptr); - } - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - object_inode_key _ikey = new_object_inode_key (oid); - stored_object obj; - Dbt ikey (&_ikey, sizeof_object_inode_key()); - Dbt ival (&obj, sizeof (obj)); - ival.set_flags (DB_DBT_USERMEM); - ival.set_ulen (sizeof(obj)); - - dout(3) << "..get " << _ikey << dendl; - int ret; - if ((ret = db->get (txn, &ikey, &ival, 0)) != 0) - { - if (txn) - txn->abort(); - derr(1) << "get returned " << db_strerror (ret) << dendl; - return -ENOENT; - } - - dout(3) << "..object has size " << obj.length << dendl; - - if (offset == 0 && len >= obj.length) - { - len = obj.length; - dout(3) << "..doing full read of " << len << dendl; - oid_t id; - mkoid (id, oid); - Dbt key (&id, sizeof (oid_t)); - Dbt value (bl.c_str(), len); - value.set_ulen (len); - value.set_flags (DB_DBT_USERMEM); - dout(3) << "..getting " << oid << dendl; - if ((ret = db->get (txn, &key, &value, 0)) != 0) - { - derr(1) << ".get 
returned " << db_strerror (ret) << dendl; - if (txn) - txn->abort(); - return -EIO; - } - } - else - { - if (offset > obj.length) - { - dout(2) << "..offset out of range" << dendl; - return 0; - } - if (offset + len > obj.length) - len = obj.length - (size_t) offset; - dout(3) << "..doing partial read of " << len << dendl; - oid_t id; - mkoid (id, oid); - Dbt key (&id, sizeof (oid)); - Dbt value; - char *data = bl.c_str(); - dout(3) << ".bufferlist c_str returned " << ((void*) data) << dendl; - value.set_data (data); - value.set_doff ((size_t) offset); - value.set_dlen (len); - value.set_ulen (len); - value.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - dout(3) << "..getting " << oid << dendl; - if ((ret = db->get (txn, &key, &value, 0)) != 0) - { - derr(1) << ".get returned " << db_strerror (ret) << dendl; - if (txn) - txn->abort(); - return -EIO; - } - } - - if (txn) - txn->commit (0); - dout(4) << "..read OK, returning " << len << dendl; - return len; -} - -int OSBDB::write(object_t oid, off_t offset, size_t len, - bufferlist& bl, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - derr(1) << "not mounted!" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "write " << oid << " " << offset << " " - << len << dendl; - - if (offset > 0xFFFFFFFFL || offset + len > 0xFFFFFFFFL) - { - derr(1) << "object too big" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -ENOSPC; - } - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (txn, &txn, 0); - - object_inode_key _ikey = new_object_inode_key (oid); - stored_object obj; - Dbt ikey (&_ikey, sizeof_object_inode_key()); - Dbt ival (&obj, sizeof (obj)); - ival.set_ulen (sizeof (obj)); - ival.set_flags (DB_DBT_USERMEM); - - int ret; - dout(3) << "..getting " << _ikey << dendl; - if (db->get (txn, &ikey, &ival, 0) != 0) - { - dout(3) << "..writing new object" << dendl; - - // New object. 
- obj.length = (size_t) offset + len; - dout(3) << "..mapping " << _ikey << " => " - << obj << dendl; - if ((ret = db->put (txn, &ikey, &ival, 0)) != 0) - { - derr(1) << "..put returned " << db_strerror (ret) << dendl; - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - oid_t id; - mkoid (id, oid); - Dbt key (&id, sizeof (oid_t)); - Dbt value (bl.c_str(), len); - if (offset == 0) // whole object - { - value.set_flags (DB_DBT_USERMEM); - value.set_ulen (len); - } - else - { - value.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - value.set_ulen (len); - value.set_doff ((size_t) offset); - value.set_dlen (len); - } - dout(3) << "..mapping " << oid << " => (" - << obj.length << " bytes)" << dendl; - if ((ret = db->put (txn, &key, &value, 0)) != 0) - { - derr(1) << "..put returned " << db_strerror (ret) << dendl; - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..write OK, returning " << len << dendl; - return len; - } - - if (offset == 0 && len >= obj.length) - { - if (len != obj.length) - { - obj.length = len; - if ((ret = db->put (txn, &ikey, &ival, 0)) != 0) - { - derr(1) << " put returned " << db_strerror (ret) << dendl; - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - } - oid_t id; - mkoid(id, oid); - Dbt key (&id, sizeof (oid_t)); - Dbt value (bl.c_str(), len); - if (db->put (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "..writing object failed!" << dendl; - return -EIO; - } - } - else - { - if (offset + len > obj.length) - { - obj.length = (size_t) offset + len; - if (db->put (txn, &ikey, &ival, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "..writing object info failed!" 
<< dendl; - return -EIO; - } - } - oid_t id; - mkoid(id, oid); - Dbt key (&id, sizeof (oid_t)); - Dbt value (bl.c_str(), len); - value.set_doff ((size_t) offset); - value.set_dlen (len); - value.set_ulen (len); - value.set_flags (DB_DBT_PARTIAL); - if (db->put (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "..writing object failed!" << dendl; - return -EIO; - } - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..write OK, returning " << len << dendl; - return len; -} - -int OSBDB::clone(object_t oid, object_t noid) -{ - if (!mounted) - { - derr(1) << "not mounted!" << dendl; - return -EINVAL; - } - - dout(2) << "clone " << oid << ", " << noid << dendl; - - if (exists (noid)) - { - dout(4) << "..target exists; returning -EEXIST" << dendl; - return -EEXIST; - } - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - object_inode_key _ikey = new_object_inode_key (oid); - object_inode_key _nikey = new_object_inode_key (noid); - stored_object obj; - Dbt ikey (&_ikey, sizeof_object_inode_key()); - Dbt ival (&obj, sizeof (obj)); - Dbt nikey (&_nikey, sizeof_object_inode_key()); - ival.set_ulen (sizeof (obj)); - ival.set_flags (DB_DBT_USERMEM); - - oid_t id, nid; - mkoid(id, oid); - mkoid(nid, noid); - Dbt key (&id, sizeof (oid_t)); - Dbt nkey (&oid, sizeof (oid_t)); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - if (db->get (txn, &ikey, &ival, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << "..getting object info failed!" 
<< dendl; - return -ENOENT; - } - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << "..getting original object failed" << dendl; - return -ENOENT; - } - auto_ptr valueptr ((char *) value.get_data()); - - if (db->put (txn, &nikey, &ival, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << "..putting object info failed" << dendl; - return -EIO; - } - if (db->put (txn, &nkey, &value, 0) != 0) - { - if (txn) - txn->abort(); - derr(1) << "..putting new object failed" << dendl; - return -EIO; - } - - if (txn) - txn->commit (0); - - dout(4) << "..clone OK" << dendl; - return 0; -} - - // Collections - -int OSBDB::list_collections(list& ls) -{ - if (!mounted) - { - derr(1) << "not mounted!" << dendl; - return -EINVAL; - } - - dout(2) << "list_collections" << dendl; - - Dbt key (COLLECTIONS_KEY, 1); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - if (db->get (NULL, &key, &value, 0) != 0) - { - dout(4) << "..no collections" << dendl; - return 0; // no collections. 
- } - - auto_ptr sc ((stored_colls *) value.get_data()); - stored_colls *scp = sc.get(); - for (uint32_t i = 0; i < sc->count; i++) - ls.push_back (scp->colls[i]); - - dout(4) << "..list_collections returns " << scp->count << dendl; - return scp->count; -} - -int OSBDB::create_collection(coll_t c, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - derr(1) << "not mounted" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "create_collection " << hex << c << dec << dendl; - - Dbt key (COLLECTIONS_KEY, 1); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - stored_colls *scp = NULL; - size_t sz = 0; - bool created = false; - if (db->get (txn, &key, &value, 0) != 0) - { - sz = sizeof (stored_colls) + sizeof (coll_t); - scp = (stored_colls *) malloc (sz); - scp->count = 0; - created = true; - } - else - { - scp = (stored_colls *) value.get_data(); - sz = value.get_size(); - } - - auto_ptr sc (scp); - int ins = 0; - if (scp->count > 0) - ins = binary_search (scp->colls, scp->count, c); - if (ins < scp->count && scp->colls[ins] == c) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".collection " << c << " already exists " << dendl; - return -EEXIST; - } - - dout(3) << "..insertion point: " << ins << dendl; - - // Make room for a new collection ID. 
- if (!created) - { - sz += sizeof (coll_t); - dout(3) << "..increase size to " << sz << dendl; - stored_colls *scp2 = (stored_colls *) realloc (scp, sz); - sc.release (); - sc.reset (scp2); - scp = scp2; - } - - int n = (scp->count - ins) * sizeof (coll_t); - if (n > 0) - { - dout(3) << "..moving " << n << " bytes up" << dendl; - memmove (&scp->colls[ins + 1], &scp->colls[ins], n); - } - scp->count++; - scp->colls[ins] = c; - - dout(3) << "..collections: " << scp << dendl; - - // Put the modified collection list back. - { - Dbt value2 (scp, sz); - if (db->put (txn, &key, &value2, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".writing new collections list failed" << dendl; - return -EIO; - } - } - - // Create the new collection. - { - stored_coll new_coll; - new_coll.count = 0; - Dbt coll_key (&c, sizeof (coll_t)); - Dbt coll_value (&new_coll, sizeof (stored_coll)); - if (db->put (txn, &coll_key, &coll_value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".writing new collection failed" << dendl; - return -EIO; - } - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..create_collection OK" << dendl; - return 0; -} - -int OSBDB::destroy_collection(coll_t c, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - derr(1) << "not mounted" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "destroy_collection " << hex << c << dec << dendl; - - Dbt key (COLLECTIONS_KEY, 1); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".collection list doesn't exist" << dendl; - return -ENOENT; // XXX - } 
- - stored_colls *scp = (stored_colls *) value.get_data(); - auto_ptr valueBuf (scp); - if (scp->count == 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".collection " << c << " not listed" << dendl; - return -ENOENT; - } - uint32_t ins = binary_search (scp->colls, scp->count, c); - dout(4) << "..insertion point is " << ins << dendl; - if (ins >= scp->count || scp->colls[ins] != c) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".collection " << c << " not listed" << dendl; - return -ENOENT; - } - - dout(4) << "..collections list is " << scp << dendl; - - // Move the rest of the list down in memory, if needed. - if (ins < scp->count) - { - size_t n = scp->count - ins - 1; - dout(4) << "..shift list down " << n << dendl; - memmove (&scp->colls[ins], &scp->colls[ins + 1], n); - } - - dout(4) << "..collections list is " << scp << dendl; - - // Modify the record size to be one less. - Dbt nvalue (scp, value.get_size() - sizeof (coll_t)); - nvalue.set_flags (DB_DBT_USERMEM); - if (db->put (txn, &key, &nvalue, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".putting modified collection list failed" << dendl; - return -EIO; - } - - // Delete the collection. 
- Dbt collKey (&c, sizeof (coll_t)); - if (db->del (txn, &collKey, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".deleting collection failed" << dendl; - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - dout(4) << "..destroy_collection OK" << dendl; - return 0; -} - -bool OSBDB::collection_exists(coll_t c) -{ - if (!mounted) - { - derr(1) << "not mounted" << dendl; - return -EINVAL; - } - - dout(2) << "collection_exists " << hex << c << dec << dendl; - - /*Dbt key (COLLECTIONS_KEY, 1); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - if (db->get (NULL, &key, &value, 0) != 0) - { - dout(4) << "..no collection list; return false" << dendl; - return false; - } - - stored_colls *scp = (stored_colls *) value.get_data(); - auto_ptr sc (scp); - dout(5) << "..collection list is " << scp << dendl; - if (scp->count == 0) - { - dout(4) << "..empty collection list; return false" << dendl; - return false; - } - uint32_t ins = binary_search (scp->colls, scp->count, c); - dout(4) << "..insertion point is " << ins << dendl; - - int ret = (scp->colls[ins] == c); - dout(4) << "..returns " << ret << dendl; - return ret;*/ - - Dbt key (&c, sizeof (coll_t)); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - if (db->get (NULL, &key, &value, 0) != 0) - { - dout(4) << "..no collection, return false" << dendl; - return false; - } - void *val = value.get_data(); - free (val); - dout(4) << "..collection exists; return true" << dendl; - return true; -} - -int OSBDB::collection_stat(coll_t c, struct stat *st) -{ - if (!mounted) - { - derr(1) << "not mounted" << dendl; - return -EINVAL; - } - - dout(2) << "collection_stat " << c << dendl; - // XXX is this needed? 
- return -ENOSYS; -} - -int OSBDB::collection_add(coll_t c, object_t o, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - dout(2) << "not mounted" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "collection_add " << hex << c << dec << " " << o << dendl; - - Dbt key (&c, sizeof (coll_t)); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "failed to find collection" << dendl; - return -ENOENT; - } - - size_t sz = value.get_size(); - stored_coll *scp = (stored_coll *) value.get_data(); - auto_ptr sc (scp); - - // Find the insertion point for the new object ID. - uint32_t ins = 0; - if (scp->count > 0) - { - ins = binary_search (scp->objects, scp->count, o); - // Already there? - if (ins < scp->count && scp->objects[ins] == o) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "collection already has object" << dendl; - return -EEXIST; - } - } - - // Make room for the new value, and add it. 
- sz += sizeof (object_t); - scp = (stored_coll *) realloc (scp, sz); - sc.release(); - sc.reset (scp); - dout(3) << "..current collection: " << scp << dendl; - if (ins < scp->count - 1) - { - size_t n = (scp->count - ins) * sizeof (object_t); - dout(3) << "..move up " << n << " bytes" << dendl; - memmove (&scp->objects[ins + 1], &scp->objects[ins], n); - } - scp->count++; - scp->objects[ins] = o; - - dout(3) << "..collection: " << scp << dendl; - - Dbt nvalue (scp, sz); - if (db->put (txn, &key, &nvalue, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "..putting modified collection failed" << dendl; - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - dout(4) << "..collection add OK" << dendl; - return 0; -} - -int OSBDB::collection_remove(coll_t c, object_t o, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - derr(1) << "not mounted" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "collection_remove " << hex << c << dec << " " << o << dendl; - - Dbt key (&c, sizeof (coll_t)); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - dout(1) << "..collection doesn't exist" << dendl; - return -ENOENT; - } - - stored_coll *scp = (stored_coll *) value.get_data(); - auto_ptr sc (scp); - - dout(5) << "..collection is " << scp << dendl; - if (scp->count == 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - dout(1) << "..collection is empty" << dendl; - return -ENOENT; - } - uint32_t ins = binary_search (scp->objects, scp->count, o); - dout(4) << "..insertion point is " << ins << dendl; - if (ins >= scp->count || 
scp->objects[ins] != o) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - dout(1) << "..object not in collection" << dendl; - return -ENOENT; - } - - if (ins < scp->count - 1) - { - size_t n = (scp->count - ins - 1) * sizeof (object_t); - dout(5) << "..moving " << n << " bytes down" << dendl; - memmove (&scp->objects[ins], &scp->objects[ins + 1], n); - } - scp->count--; - - dout(3) << "..collection " << scp << dendl; - - Dbt nval (scp, value.get_size() - sizeof (object_t)); - if (db->put (txn, &key, &nval, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << "..putting modified collection failed" << dendl; - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - dout(4) << "..collection remove OK" << dendl; - return 0; -} - -int OSBDB::collection_list(coll_t c, list& o) -{ - if (!mounted) - { - derr(1) << "not mounted" << dendl; - return -EINVAL; - } - - Dbt key (&c, sizeof (coll_t)); - Dbt value; - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &key, &value, 0) != 0) - { - if (txn != NULL) - txn->abort(); - return -ENOENT; - } - - stored_coll *scp = (stored_coll *) value.get_data(); - auto_ptr sc (scp); - for (uint32_t i = 0; i < scp->count; i++) - o.push_back (scp->objects[i]); - - if (txn != NULL) - txn->commit (0); - return 0; -} - - // Attributes - -int OSBDB::_setattr(object_t oid, const char *name, - const void *value, size_t size, Context *onsafe, - DbTxn *txn) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - if (strlen (name) >= OSBDB_MAX_ATTR_LEN) - { - derr(1) << "name too long: " << name << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -ENAMETOOLONG; - } - - scoped_lock __lock(&lock); - - // Add name to attribute list, if needed. 
- attrs_id aids = new_attrs_id (oid); - Dbt attrs_key (&aids, sizeof_attrs_id()); - Dbt attrs_val; - attrs_val.set_flags (DB_DBT_MALLOC); - stored_attrs *sap = NULL; - size_t sz = 0; - - dout(3) << " getting " << aids << dendl; - if (db->get (txn, &attrs_key, &attrs_val, 0) != 0) - { - dout(2) << " first attribute" << dendl; - sz = sizeof (stored_attrs); - sap = (stored_attrs *) malloc(sz); - sap->count = 0; - } - else - { - sz = attrs_val.get_size(); - sap = (stored_attrs *) attrs_val.get_data(); - dout(2) << "..add to list of " << sap->count << " attrs" << dendl; - } - auto_ptr sa (sap); - - attr_name _name; - strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN); - - int ins = 0; - if (sap->count > 0) - ins = binary_search (sap->names, sap->count, _name); - dout(3) << "..insertion point is " << ins << dendl; - if (sap->count == 0 || - (ins >= sap->count || strcmp (sap->names[ins].name, name) != 0)) - { - sz += sizeof (attr_name); - dout(3) << "..realloc " << ((void *) sap) << " to " - << dec << sz << dendl; - sap = (stored_attrs *) realloc (sap, sz); - dout(3) << "..returns " << ((void *) sap) << dendl; - sa.release (); - sa.reset (sap); - int n = (sap->count - ins) * sizeof (attr_name); - if (n > 0) - { - dout(3) << "..move " << n << " bytes from 0x" - << hex << (&sap->names[ins]) << " to 0x" - << hex << (&sap->names[ins+1]) << dec << dendl; - memmove (&sap->names[ins+1], &sap->names[ins], n); - } - memset (&sap->names[ins], 0, sizeof (attr_name)); - strncpy (sap->names[ins].name, name, OSBDB_MAX_ATTR_LEN); - sap->count++; - - Dbt newAttrs_val (sap, sz); - newAttrs_val.set_ulen (sz); - newAttrs_val.set_flags (DB_DBT_USERMEM); - dout(3) << "..putting " << aids << dendl; - if (db->put (txn, &attrs_key, &newAttrs_val, 0) != 0) - { - derr(1) << ".writing attributes list failed" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - } - else - { - dout(3) << "..attribute " << name << " already exists" << dendl; - } - - dout(5) << "..attributes list: " << 
sap << dendl; - - // Add the attribute. - attr_id aid = new_attr_id (oid, name); - Dbt attr_key (&aid, sizeof (aid)); - Dbt attr_val ((void *) value, size); - dout(3) << "..writing attribute key " << aid << dendl; - if (db->put (txn, &attr_key, &attr_val, 0) != 0) - { - derr(1) << ".writing attribute key failed" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - dout(4) << "..setattr OK" << dendl; - if (onsafe != NULL) - COMMIT(onsafe); - return 0; -} - -int OSBDB::setattr(object_t oid, const char *name, - const void *value, size_t size, - Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - dout(2) << "setattr " << oid << ":" << name << " => (" - << size << " bytes)" << dendl; - int ret = _setattr (oid, name, value, size, onsafe, txn); - if (ret == 0) - { - if (txn != NULL) - txn->commit (0); - } - else - { - if (txn != NULL) - txn->abort(); - } - return ret; -} - -int OSBDB::setattrs(object_t oid, map& aset, - Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - map::iterator it; - for (it = aset.begin(); it != aset.end(); it++) - { - string name = it->first; - bufferptr value = it->second; - int ret = _setattr (oid, name.c_str(), value.c_str(), - value.length(), onsafe, txn); - if (ret != 0) - { - if (txn != NULL) - txn->abort(); - return ret; - } - } - - if (txn != NULL) - txn->commit (0); - return 0; -} - -int OSBDB::_getattr (object_t oid, const char *name, void *value, size_t size) -{ - if (!mounted) - return -EINVAL; - - dout(2) << "_getattr " << oid << " " << name << " " << size << dendl; - - attr_id aid = new_attr_id (oid, name); - Dbt key (&aid, sizeof 
(aid)); - Dbt val (value, size); - val.set_ulen (size); - val.set_doff (0); - val.set_dlen (size); - val.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - - int ret; - if ((ret = db->get (NULL, &key, &val, 0)) != 0) - { - derr(1) << ".getting value failed: " << db_strerror (ret) << dendl; - return -ENOENT; - } - - dout(4) << ".._getattr OK; returns " << val.get_size() << dendl; - return val.get_size(); -} - -int OSBDB::getattr(object_t oid, const char *name, void *value, size_t size) -{ - if (!mounted) - return -EINVAL; - - return _getattr (oid, name, value, size); -} - -int OSBDB::getattrs(object_t oid, map& aset) -{ - if (!mounted) - return -EINVAL; - - for (map::iterator it = aset.begin(); - it != aset.end(); it++) - { - int ret = _getattr (oid, (*it).first.c_str(), - (*it).second.c_str(), - (*it).second.length()); - if (ret < 0) - return ret; - } - return 0; -} - -int OSBDB::rmattr(object_t oid, const char *name, Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "rmattr " << oid << " " << name << dendl; - - attrs_id aids = new_attrs_id (oid); - Dbt askey (&aids, sizeof_attrs_id()); - Dbt asvalue; - asvalue.set_flags (DB_DBT_MALLOC); - - DbTxn *txn = NULL; - - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &askey, &asvalue, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -ENOENT; - } - - stored_attrs *sap = (stored_attrs *) asvalue.get_data(); - auto_ptr sa (sap); - - dout(5) << "..attributes list " << sap << dendl; - - if (sap->count == 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".empty attribute list" << dendl; - return -ENOENT; - } - - attr_name _name; - memset(&_name, 0, sizeof (_name)); - strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN); - int ins = binary_search (sap->names, 
sap->count, _name); - dout(4) << "..insertion point is " << ins << dendl; - if (ins >= sap->count || strcmp (sap->names[ins].name, name) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".attribute not found in list" << dendl; - return -ENOENT; - } - - // Shift the later elements down by one, if needed. - int n = (sap->count - ins) * sizeof (attr_name); - if (n > 0) - { - dout(4) << "..shift down by " << n << dendl; - memmove (&(sap->names[ins]), &(sap->names[ins + 1]), n); - } - sap->count--; - dout(5) << "..attributes list now " << sap << dendl; - - asvalue.set_size(asvalue.get_size() - sizeof (attr_name)); - int ret; - if ((ret = db->put (txn, &askey, &asvalue, 0)) != 0) - { - derr(1) << "put stored_attrs " << db_strerror (ret) << dendl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - // Remove the attribute. - attr_id aid = new_attr_id (oid, name); - Dbt key (&aid, sizeof (aid)); - if ((ret = db->del (txn, &key, 0)) != 0) - { - derr(1) << "deleting " << aid << ": " << db_strerror(ret) << dendl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - dout(4) << "..rmattr OK" << dendl; - return 0; -} - -int OSBDB::listattr(object_t oid, char *attrs, size_t size) -{ - if (!mounted) - return -EINVAL; - - dout(2) << "listattr " << oid << dendl; - - attrs_id aids = new_attrs_id (oid); - Dbt key (&aids, sizeof_attrs_id()); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - // XXX Transactions for read atomicity??? 
- - int ret; - if ((ret = db->get (NULL, &key, &value, 0)) != 0) - { - derr(1) << "fetching " << aids << ": " << db_strerror (ret) - << dendl; - return -ENOENT; - } - - stored_attrs *attrsp = (stored_attrs *) value.get_data(); - auto_ptr _attrs (attrsp); - size_t s = 0; - char *p = attrs; - for (unsigned i = 0; i < attrsp->count && s < size; i++) - { - int n = MIN (OSBDB_MAX_ATTR_LEN, - MIN (strlen (attrsp->names[i].name), size - s - 1)); - strncpy (p, attrsp->names[i].name, n); - p[n] = '\0'; - p = p + n + 1; - } - - dout(4) << "listattr OK" << dendl; - return 0; -} - - // Collection attributes. - -int OSBDB::collection_setattr(coll_t cid, const char *name, - const void *value, size_t size, - Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "collection_setattr " << hex << cid << dec << " " << name - << " (" << size << " bytes)" << dendl; - if (strlen (name) >= OSBDB_MAX_ATTR_LEN) - { - derr(1) << "name too long" << dendl; - if (onsafe != NULL) - CLEANUP(onsafe); - return -ENAMETOOLONG; - } - - // Add name to attribute list, if needed. 
- coll_attrs_id aids = new_coll_attrs_id (cid); - Dbt attrs_key (&aids, sizeof_coll_attrs_id()); - Dbt attrs_val; - attrs_val.set_flags (DB_DBT_MALLOC); - stored_attrs *sap = NULL; - size_t sz = 0; - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - dout(3) << " getting " << aids << dendl; - if (db->get (txn, &attrs_key, &attrs_val, 0) != 0) - { - dout(2) << " first attribute" << dendl; - sz = sizeof (stored_attrs); - sap = (stored_attrs *) malloc(sz); - sap->count = 0; - } - else - { - sz = attrs_val.get_size(); - sap = (stored_attrs *) attrs_val.get_data(); - dout(2) << " add to list of " << sap->count << " attrs" << dendl; - } - auto_ptr sa (sap); - - attr_name _name; - strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN); - - int ins = 0; - if (sap->count > 0) - ins = binary_search (sap->names, sap->count, _name); - dout(3) << " insertion point is " << ins << dendl; - if (ins >= sap->count || strcmp (sap->names[ins].name, name) != 0) - { - sz += sizeof (attr_name); - dout(3) << " realloc " << hex << ((void *) sap) << " to " - << dec << sz << dendl; - sap = (stored_attrs *) realloc (sap, sz); - dout(3) << " returns " << hex << ((void *) sap) << dec << dendl; - sa.release (); - sa.reset (sap); - int n = (sap->count - ins) * sizeof (attr_name); - if (n > 0) - { - dout(3) << " move " << n << " bytes from 0x" - << hex << (&sap->names[ins]) << " to 0x" - << hex << (&sap->names[ins+1]) << dec << dendl; - memmove (&sap->names[ins+1], &sap->names[ins], n); - } - memset (&sap->names[ins], 0, sizeof (attr_name)); - strncpy (sap->names[ins].name, name, OSBDB_MAX_ATTR_LEN); - sap->count++; - - Dbt newAttrs_val (sap, sz); - newAttrs_val.set_ulen (sz); - newAttrs_val.set_flags (DB_DBT_USERMEM); - dout(3) << " putting " << aids << dendl; - if (db->put (txn, &attrs_key, &newAttrs_val, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".putting new attributes failed" << dendl; - return -EIO; - } - } - 
else - { - dout(3) << "..attribute " << name << " already exists" << dendl; - } - - dout(3) << "..attributes list: " << sap << dendl; - - // Add the attribute. - coll_attr_id aid = new_coll_attr_id (cid, name); - Dbt attr_key (&aid, sizeof (aid)); - Dbt attr_val ((void *) value, size); - dout(3) << " writing attribute key " << aid << dendl; - if (db->put (txn, &attr_key, &attr_val, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".putting attribute failed" << dendl; - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..collection setattr OK" << dendl; - return 0; -} - -int OSBDB::collection_rmattr(coll_t cid, const char *name, - Context *onsafe) -{ - dout(6) << "Context " << hex << onsafe << dec << dendl; - if (!mounted) - { - if (onsafe != NULL) - CLEANUP(onsafe); - return -EINVAL; - } - - scoped_lock __lock(&lock); - dout(2) << "collection_rmattr " << hex << cid << dec - << " " << name << dendl; - - coll_attrs_id aids = new_coll_attrs_id (cid); - Dbt askey (&aids, sizeof_coll_attrs_id()); - Dbt asvalue; - asvalue.set_flags (DB_DBT_MALLOC); - - DbTxn *txn = NULL; - if (transactional) - env->txn_begin (NULL, &txn, 0); - - if (db->get (txn, &askey, &asvalue, 0) != 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".no attributes list" << dendl; - return -ENOENT; - } - - stored_attrs *sap = (stored_attrs *) asvalue.get_data(); - auto_ptr sa (sap); - - dout(5) << "..attributes list " << sap << dendl; - if (sap->count == 0) - { - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".empty attributes list" << dendl; - return -ENOENT; - } - - attr_name _name; - memset(&_name, 0, sizeof (_name)); - strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN); - int ins = binary_search (sap->names, sap->count, _name); - if (ins >= sap->count || strcmp (sap->names[ins].name, name) != 0) - { - 
if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - derr(1) << ".attribute not listed" << dendl; - return -ENOENT; - } - - // Shift the later elements down by one, if needed. - int n = (sap->count - ins) * sizeof (attr_name); - if (n > 0) - { - dout(4) << "..shift down by " << n << dendl; - memmove (&(sap->names[ins]), &(sap->names[ins + 1]), n); - } - sap->count--; - dout(5) << "..attributes list now " << sap << dendl; - - asvalue.set_size(asvalue.get_size() - sizeof (attr_name)); - int ret; - if ((ret = db->put (txn, &askey, &asvalue, 0)) != 0) - { - derr(1) << "put stored_attrs " << db_strerror (ret) << dendl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - // Remove the attribute. - coll_attr_id aid = new_coll_attr_id (cid, name); - Dbt key (&aid, sizeof (aid)); - if ((ret = db->del (txn, &key, 0)) != 0) - { - derr(1) << "deleting " << aid << ": " << db_strerror(ret) << dendl; - if (txn != NULL) - txn->abort(); - if (onsafe != NULL) - CLEANUP(onsafe); - return -EIO; - } - - if (txn != NULL) - txn->commit (0); - if (onsafe != NULL) - COMMIT(onsafe); - - dout(4) << "..collection rmattr OK" << dendl; - return 0; -} - -int OSBDB::collection_getattr(coll_t cid, const char *name, - void *value, size_t size) -{ - if (!mounted) - return -EINVAL; - - dout(2) << "collection_getattr " << hex << cid << dec - << " " << name << dendl; - - // XXX transactions/read isolation? 
- - coll_attr_id caid = new_coll_attr_id (cid, name); - Dbt key (&caid, sizeof (caid)); - Dbt val (value, size); - val.set_ulen (size); - val.set_dlen (size); - val.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); - - if (db->get (NULL, &key, &val, 0) != 0) - { - derr(1) << ".no attribute entry" << dendl; - return -ENOENT; - } - - dout(4) << "..collection getattr OK; returns " << val.get_size() << dendl; - return val.get_size(); -} - -int OSBDB::collection_listattr(coll_t cid, char *attrs, size_t size) -{ - if (!mounted) - return -EINVAL; - - dout(2) << "collection_listattr " << hex << cid << dec << dendl; - - // XXX transactions/read isolation? - - coll_attrs_id caids = new_coll_attrs_id (cid); - Dbt key (&caids, sizeof_coll_attrs_id()); - Dbt value; - value.set_flags (DB_DBT_MALLOC); - - int ret; - if ((ret = db->get (NULL, &key, &value, 0)) != 0) - { - derr(1) << "fetching " << caids << ": " << db_strerror (ret) - << dendl; - return -ENOENT; - } - - stored_attrs *attrsp = (stored_attrs *) value.get_data(); - auto_ptr _attrs (attrsp); - size_t s = 0; - char *p = attrs; - for (unsigned i = 0; i < attrsp->count && s < size; i++) - { - int n = MIN (OSBDB_MAX_ATTR_LEN, - MIN (strlen (attrsp->names[i].name), size - s - 1)); - strncpy (p, attrsp->names[i].name, n); - p[n] = '\0'; - p = p + n + 1; - } - return 0; -} - - // Sync. - -void OSBDB::sync (Context *onsync) -{ - if (!mounted) - return; - - sync(); - - if (onsync != NULL) - { - g_timer.add_event_after(0.1, onsync); - } -} - -void OSBDB::sync() -{ - if (!mounted) - return; - - if (transactional) - { - env->log_flush (NULL); - env->lsn_reset (device.c_str(), 0); - } - db->sync(0); -} diff --git a/trunk/ceph/osbdb/OSBDB.h b/trunk/ceph/osbdb/OSBDB.h deleted file mode 100644 index 8eb2004d3903f..0000000000000 --- a/trunk/ceph/osbdb/OSBDB.h +++ /dev/null @@ -1,482 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* OSBDB.h -- ObjectStore on Berkeley DB. 
-*- c++ -*- - Copyright (C) 2007 Casey Marshall - -Ceph - scalable distributed file system - -This is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License version 2.1, as published by the Free Software -Foundation. See file COPYING. */ - - -#include -#include "osd/ObjectStore.h" - -#define OSBDB_MAGIC 0x05BDB - -/* - * Maximum length of an attribute name. - */ -#define OSBDB_MAX_ATTR_LEN 256 - -#define OSBDB_THIS_VERSION 1 - -#define OSBDB_SUPERBLOCK_KEY ((void *) "s") - -/* - * The "superblock" of the BDB object store. We store one of these in - * the DB, to store version and other information. We don't record - * anything special here, just the version number the database was - * written with. - * - * In principle, this structure is variable-length, depending on the - * software version writing the superblock. - */ -struct stored_superblock -{ - uint32_t version; -}; - -inline ostream& operator<<(ostream& out, const stored_superblock sb) -{ - out << "osbdb.super(" << sb.version << ")" << endl; - return out; -} - -/** - * An object identifier; we define this so we can have a POD object to - * work with. - */ -struct oid_t // POD -{ - char id[16]; -}; - -inline void mkoid (oid_t& id, object_t& oid) -{ - // XXX byte order? - memcpy (id.id, &oid, sizeof (oid_t)); -} - -inline ostream& operator<<(ostream& out, const oid_t id) -{ - for (int i = 0; i < 16; i++) - { - out.fill('0'); - out << setw(2) << hex << (id.id[i] & 0xFF); - if ((i & 3) == 3) - out << ':'; - } - out.unsetf(ios::right); - out << dec; - return out; -} - -/** - * An "inode" key. We map a 'stored_object' struct to this key for - * every object. - */ -struct object_inode_key // POD -{ - oid_t oid; - char tag; -}; - -/** - * "Constructor" for an object_inode_key. 
- */ -inline object_inode_key new_object_inode_key (object_t& oid) -{ - object_inode_key key; - memset(&key, 0, sizeof (object_inode_key)); - mkoid (key.oid, oid); - key.tag = 'i'; - return key; -} - -/* - * We use this, instead of sizeof(), to try and guarantee that we - * don't include the structure padding, if any. - * - * This *should* return 17: sizeof (oid_t) == 16; sizeof (char) == 1. - */ -inline size_t sizeof_object_inode_key() -{ - return offsetof(object_inode_key, tag) + sizeof (char); -} - - // Frank Poole: Unfortunately, that sounds a little - // like famous last words. - // -- 2001: A Space Odyssey - -inline ostream& operator<<(ostream& out, const object_inode_key o) -{ - out << o.tag << "/" << o.oid; - return out; -} - -/** - * A stored object. This is essentially the "inode" of the object, - * containing things like the object's length. The object itself is - * stored as-is, mapped by the 128-bit object ID. - */ -struct stored_object -{ - uint32_t length; -}; - -inline ostream& operator<<(ostream& out, const stored_object s) -{ - out << "inode(l:" << s.length << ")"; - return out; -} - -/* - * Key referencing the list of attribute names for an object. This is - * simply the object's ID, with an additional character 'a' appended. - */ -struct attrs_id // POD -{ - oid_t oid; - char tag; -}; - -/* - * "Construtor" for attrs_id. - */ -inline struct attrs_id new_attrs_id (object_t& oid) -{ - attrs_id aid; - memset (&aid, 0, sizeof (attrs_id)); - mkoid(aid.oid, oid); - aid.tag = 'a'; - return aid; -} - -/* - * See explanation for sizeof_object_inode_id. - */ -inline size_t sizeof_attrs_id() -{ - return offsetof(struct attrs_id, tag) + sizeof (char); -} - -inline ostream& operator<<(ostream& out, const attrs_id id) -{ - out << id.tag << "/" << id.oid; - return out; -} - -/* - * Encapsulation of a single attribute name. 
- */ -struct attr_name // POD -{ - char name[OSBDB_MAX_ATTR_LEN]; -}; - -inline ostream& operator<<(ostream& out, const attr_name n) -{ - out << n.name; - return out; -} - -inline bool operator<(const attr_name n1, const attr_name n2) -{ - return (strncmp (n1.name, n2.name, OSBDB_MAX_ATTR_LEN) < 0); -} - -inline bool operator>(const attr_name n1, const attr_name n2) -{ - return (strncmp (n1.name, n2.name, OSBDB_MAX_ATTR_LEN) > 0); -} - -inline bool operator==(const attr_name n1, const attr_name n2) -{ - std::cerr << n1.name << " == " << n2.name << "?" << endl; - return (strncmp (n1.name, n2.name, OSBDB_MAX_ATTR_LEN) == 0); -} - -inline bool operator!=(const attr_name n1, const attr_name n2) -{ - return !(n1 == n2); -} - -inline bool operator>=(const attr_name n1, const attr_name n2) -{ - return !(n1 < n2); -} - -inline bool operator<=(const attr_name n1, const attr_name n2) -{ - return !(n1 > n2); -} - -/* - * A list of an object or collection's attribute names. - */ -struct stored_attrs -{ - uint32_t count; - attr_name names[0]; // actually variable-length -}; - -inline ostream& operator<<(ostream& out, const stored_attrs *sa) -{ - out << sa->count << " [ "; - for (unsigned i = 0; i < sa->count; i++) - out << sa->names[i] << (i == sa->count - 1 ? " " : ", "); - out << "]"; - return out; -} - -/* - * An object attribute key. An object attribute is mapped simply by - * the object ID appended with the attribute name. Attribute names - * may not be empty, and must be less than 256 characters, in this - * implementation. - */ -struct attr_id // POD -{ - oid_t oid; - attr_name name; -}; - -inline attr_id new_attr_id (object_t& oid, const char *name) -{ - attr_id aid; - memset(&aid, 0, sizeof (attr_id)); - mkoid (aid.oid, oid); - strncpy (aid.name.name, name, OSBDB_MAX_ATTR_LEN); - return aid; -} - -inline ostream& operator<<(ostream &out, const attr_id id) -{ - out << id.oid << ":" << id.name; - return out; -} - -/* - * A key for a collection attributes list. 
- */ -struct coll_attrs_id // POD -{ - coll_t cid; - char tag; -}; - -inline coll_attrs_id new_coll_attrs_id (coll_t cid) -{ - coll_attrs_id catts; - memset(&catts, 0, sizeof (coll_attrs_id)); - catts.cid = cid; - catts.tag = 'C'; - return catts; -} - -inline size_t sizeof_coll_attrs_id() -{ - return offsetof(coll_attrs_id, tag) + sizeof (char); -} - -inline ostream& operator<<(ostream& out, coll_attrs_id id) -{ - out << id.tag << "/" << id.cid; - return out; -} - -/* - * A collection attribute key. Similar to - */ -struct coll_attr_id // POD -{ - coll_t cid; - attr_name name; -}; - -inline coll_attr_id new_coll_attr_id (coll_t cid, const char *name) -{ - coll_attr_id catt; - memset(&catt, 0, sizeof (coll_attr_id)); - catt.cid = cid; - strncpy (catt.name.name, name, OSBDB_MAX_ATTR_LEN); - return catt; -} - -inline ostream& operator<<(ostream& out, coll_attr_id id) -{ - out << id.cid << ":" << id.name; - return out; -} - -/* - * This is the key we store the master collections list under. - */ -#define COLLECTIONS_KEY ((void *) "c") - -/* - * The master list of collections. There should be one of these per - * OSD. The sole reason for this structure is to have the ability - * to enumerate all collections stored on this OSD. - */ -struct stored_colls -{ - // The number of collections. - uint32_t count; - - // The collection identifiers. This is a sorted list of coll_t - // values. - coll_t colls[0]; // actually variable-length -}; - -inline ostream& operator<<(ostream& out, stored_colls *c) -{ - out << c->count << " [ "; - for (unsigned i = 0; i < c->count; i++) - { - out << hex << c->colls[i]; - if (i < c->count - 1) - out << ", "; - } - out << " ]" << dec; - return out; -} - -/* - * A stored collection (a bag of object IDs). These are referenced by - * the bare collection identifier type, a coll_t (thus, a 32-bit - * integer). Internally this is stored as a sorted list of object IDs. 
- * - * Note, this structure places all collection items in a single - * record; this may be a memory burden for large collections. - */ -struct stored_coll -{ - // The size of this collection. - uint32_t count; - - // The object IDs in this collection. This is a sorted list of all - // object ID's in this collection. - object_t objects[0]; // actually variable-length -}; - -inline ostream& operator<<(ostream& out, stored_coll *c) -{ - out << c->count << " [ "; - for (unsigned i = 0; i < c->count; i++) - { - out << c->objects[i]; - if (i < c->count - 1) - out << ", "; - } - out << " ]"; - return out; -} - -class OSBDBException : public std::exception -{ - const char *msg; - -public: - OSBDBException(const char *msg) : msg(msg) { } - const char *what() { return msg; } -}; - -/* - * The object store interface for Berkeley DB. - */ -class OSBDB : public ObjectStore -{ - private: - Mutex lock; - DbEnv *env; - Db *db; - string device; - string env_dir; - bool mounted; - bool opened; - bool transactional; - - public: - - OSBDB(const char *dev) throw(OSBDBException) - : lock(true), env(0), db (0), device (dev), mounted(false), opened(false), - transactional(g_conf.bdbstore_transactional) - { - } - - ~OSBDB() - { - if (mounted) - { - umount(); - } - } - - int mount(); - int umount(); - int mkfs(); - - int statfs(struct statfs *buf); - - int pick_object_revision_lt(object_t& oid); - - bool exists(object_t oid); - int stat(object_t oid, struct stat *st); - - int remove(object_t oid, Context *onsafe=0); - - int truncate(object_t oid, off_t size, Context *onsafe=0); - - int read(object_t oid, off_t offset, size_t len, - bufferlist& bl); - int write(object_t oid, off_t offset, size_t len, - bufferlist& bl, Context *onsafe); - - int setattr(object_t oid, const char *name, - const void *value, size_t size, Context *onsafe=0); - int setattrs(object_t oid, map& aset, - Context *onsafe=0); - int getattr(object_t oid, const char *name, - void *value, size_t size); - int 
getattrs(object_t oid, map& aset); - int rmattr(object_t oid, const char *name, - Context *onsafe=0); - int listattr(object_t oid, char *attrs, size_t size); - - int clone(object_t oid, object_t noid); - - // Collections. - - int list_collections(list& ls); - int create_collection(coll_t c, Context *onsafe=0); - int destroy_collection(coll_t c, Context *onsafe=0); - bool collection_exists(coll_t c); - int collection_stat(coll_t c, struct stat *st); - int collection_add(coll_t c, object_t o, Context *onsafe=0); - int collection_remove(coll_t c, object_t o, Context *onsafe=0); - int collection_list(coll_t c, list& o); - - int collection_setattr(coll_t cid, const char *name, - const void *value, size_t size, - Context *onsafe=0); - int collection_rmattr(coll_t cid, const char *name, - Context *onsafe=0); - int collection_getattr(coll_t cid, const char *name, - void *value, size_t size); - int collection_listattr(coll_t cid, char *attrs, size_t size); - - void sync(Context *onsync); - void sync(); - -private: - int opendb (DBTYPE type=DB_UNKNOWN, int flags=0, bool new_env=false); - - int _setattr(object_t oid, const char *name, const void *value, - size_t size, Context *onsync, DbTxn *txn); - int _getattr(object_t oid, const char *name, void *value, size_t size); - DbEnv *getenv(); -}; diff --git a/trunk/ceph/osd/Ager.cc b/trunk/ceph/osd/Ager.cc deleted file mode 100644 index fb777238da8fb..0000000000000 --- a/trunk/ceph/osd/Ager.cc +++ /dev/null @@ -1,333 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "include/types.h" - -#include "Ager.h" -#include "ObjectStore.h" - -#include "config.h" -#include "common/Clock.h" - -// ick -#include "ebofs/Ebofs.h" -#include -#include -#include - -#ifdef DARWIN -#include -#include -#endif // DARWIN - - -int myrand() -{ - if (0) - return rand(); - else { - static int n = 0; - srand(n++); - return rand(); - } -} - - -object_t Ager::age_get_oid() { - if 
(!age_free_oids.empty()) { - object_t o = age_free_oids.front(); - age_free_oids.pop_front(); - return o; - } - object_t last = age_cur_oid; - ++age_cur_oid.bno; - return last; -} - -ssize_t Ager::age_pick_size() { - ssize_t max = file_size_distn.sample() * 1024; - return max/2 + (myrand() % 100) * max/200 + 1; -} - -bool start_debug = false; - -uint64_t Ager::age_fill(float pc, utime_t until) { - int max = 1024*1024; - bufferptr bp(max); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - uint64_t wrote = 0; - while (1) { - if (g_clock.now() > until) break; - - struct statfs st; - store->statfs(&st); - float free = 1.0 - ((float)(st.f_bfree) / (float)st.f_blocks); - float avail = 1.0 - ((float)(st.f_bavail) / (float)st.f_blocks); // to write to - //float a = (float)(st.f_bfree) / (float)st.f_blocks; - //dout(10) << "age_fill at " << a << " / " << pc << " .. " << st.f_blocks << " " << st.f_bavail << dendl; - if (free >= pc) { - generic_dout(2) << "age_fill at " << free << " / " << avail << " / " << " / " << pc << " stopping" << dendl; - break; - } - - // make sure we can write to it.. - if (avail > .98 || - avail - free > .02) - store->sync(); - - object_t oid = age_get_oid(); - - int b = myrand() % 10; - age_objects[b].push_back(oid); - - ssize_t s = age_pick_size(); - wrote += (s + 4095) / 4096; - - - - - generic_dout(2) << "age_fill at " << free << " / " << avail << " / " << pc << " creating " << hex << oid << dec << " sz " << s << dendl; - - - if (false && !g_conf.ebofs_verify && start_debug && wrote > 1000000ULL) { - /* - - - 1005700 -? -1005000 -1005700 - 1005710 - 1005725ULL - 1005750ULL - 1005800 - 1006000 - -// 99 1000500 ? 
1000750 1006000 -*/ - g_conf.debug_ebofs = 30; - g_conf.ebofs_verify = true; - } - - off_t off = 0; - while (s) { - ssize_t t = MIN(s, max); - bufferlist sbl; - sbl.substr_of(bl, 0, t); - store->write(oid, off, t, sbl, false); - off += t; - s -= t; - } - oid.bno++; - } - - return wrote*4; // KB -} - -void Ager::age_empty(float pc) { - int nper = 20; - int n = nper; - - //g_conf.ebofs_verify = true; - - while (1) { - struct statfs st; - store->statfs(&st); - float free = 1.0 - ((float)(st.f_bfree) / (float)st.f_blocks); - float avail = 1.0 - ((float)(st.f_bavail) / (float)st.f_blocks); // to write to - generic_dout(2) << "age_empty at " << free << " / " << avail << " / " << pc << dendl;//" stopping" << dendl; - if (free <= pc) { - generic_dout(2) << "age_empty at " << free << " / " << avail << " / " << pc << " stopping" << dendl; - break; - } - - int b = myrand() % 10; - n--; - if (n == 0 || age_objects[b].empty()) { - generic_dout(2) << "age_empty sync" << dendl; - //sync(); - //sync(); - n = nper; - continue; - } - object_t oid = age_objects[b].front(); - age_objects[b].pop_front(); - - generic_dout(2) << "age_empty at " << free << " / " << avail << " / " << pc << " removing " << hex << oid << dec << dendl; - - store->remove(oid); - age_free_oids.push_back(oid); - } - - g_conf.ebofs_verify = false; -} - -void pfrag(uint64_t written, ObjectStore::FragmentationStat &st) -{ - cout << "#gb wr\ttotal\tn x\tavg x\tavg per\tavg j\tfree\tn fr\tavg fr\tnum<2\tsum<2\tnum<4\tsum<4\t..." - << std::endl; - cout << written - << "\t" << st.total - << "\t" << st.num_extent - << "\t" << st.avg_extent - << "\t" << st.avg_extent_per_object - << "\t" << st.avg_extent_jump - << "\t" << st.total_free - << "\t" << st.num_free_extent - << "\t" << st.avg_free_extent; - - int n = st.num_extent; - for (uint64_t i=1; i <= 30; i += 1) { - cout << "\t" << st.extent_dist[i]; - cout << "\t" << st.extent_dist_sum[i]; - //cout << "\ta " << (st.extent_dist[i] ? 
(st.extent_dist_sum[i] / st.extent_dist[i]):0); - n -= st.extent_dist[i]; - if (n == 0) break; - } - cout << std::endl; -} - - -void Ager::age(int time, - float high_water, // fill to this % - float low_water, // then empty to this % - int count, // this many times - float final_water, // and end here ( <= low_water) - int fake_size_mb) { - - store->_fake_writes(true); - srand(0); - - utime_t start = g_clock.now(); - utime_t until = start; - until.sec_ref() += time; - - int elapsed = 0; - int freelist_inc = 60; - utime_t nextfl = start; - nextfl.sec_ref() += freelist_inc; - - while (age_objects.size() < 10) age_objects.push_back( list() ); - - if (fake_size_mb) { - int fake_bl = fake_size_mb * 256; - struct statfs st; - store->statfs(&st); - float f = (float)fake_bl / (float)st.f_blocks; - high_water = (float)high_water * f; - low_water = (float)low_water * f; - final_water = (float)final_water * f; - generic_dout(2) << "fake " << fake_bl << " / " << st.f_blocks << " is " << f << ", high " << high_water << " low " << low_water << " final " << final_water << dendl; - } - - // init size distn (once) - if (!did_distn) { - did_distn = true; - age_cur_oid = object_t(0,1); - file_size_distn.add(1, 19.0758125+0.65434375); - file_size_distn.add(512, 35.6566); - file_size_distn.add(1024, 27.7271875); - file_size_distn.add(2*1024, 16.63503125); - //file_size_distn.add(4*1024, 106.82384375); - //file_size_distn.add(8*1024, 81.493375); - //file_size_distn.add(16*1024, 14.13553125); - //file_size_distn.add(32*1024, 2.176); - //file_size_distn.add(256*1024, 0.655938); - //file_size_distn.add(512*1024, 0.1480625); - //file_size_distn.add(1*1024*1024, 0.020125); // actually 2, but 32bit - file_size_distn.normalize(); - } - - // clear - for (int i=0; i<10; i++) - age_objects[i].clear(); - - ObjectStore::FragmentationStat st; - - uint64_t wrote = 0; - - for (int c=1; c<=count; c++) { - if (g_clock.now() > until) break; - - //if (c == 7) start_debug = true; - - generic_dout(1) << 
"#age " << c << "/" << count << " filling to " << high_water << dendl; - uint64_t w = age_fill(high_water, until); - //dout(1) << "age wrote " << w << dendl; - wrote += w; - //store->sync(); - //store->_get_frag_stat(st); - //pfrag(st); - - - if (c == count) { - generic_dout(1) << "#age final empty to " << final_water << dendl; - age_empty(final_water); - } else { - generic_dout(1) << "#age " << c << "/" << count << " emptying to " << low_water << dendl; - age_empty(low_water); - } - //store->sync(); - //store->sync(); - - // show frag state - store->_get_frag_stat(st); - pfrag(wrote / (1024ULL*1024ULL) , // GB - st); - - // dump freelist? - if (g_clock.now() > nextfl) { - elapsed += freelist_inc; - save_freelist(elapsed); - nextfl.sec_ref() += freelist_inc; - } - } - - // dump the freelist - save_freelist(0); - exit(0); // hack - - // ok! - store->_fake_writes(false); - store->sync(); - store->sync(); - generic_dout(1) << "age finished" << dendl; -} - - -void Ager::load_freelist() -{ - generic_dout(1) << "load_freelist" << dendl; - - struct stat st; - - int r = ::stat("ebofs.freelist", &st); - assert(r == 0); - - bufferptr bp(st.st_size); - bufferlist bl; - bl.push_back(bp); - int fd = ::open("ebofs.freelist", O_RDONLY); - ::read(fd, bl.c_str(), st.st_size); - ::close(fd); - - ((Ebofs*)store)->_import_freelist(bl); - store->sync(); - store->sync(); -} - -void Ager::save_freelist(int el) -{ - generic_dout(1) << "save_freelist " << el << dendl; - char s[100]; - sprintf(s, "ebofs.freelist.%d", el); - bufferlist bl; - ((Ebofs*)store)->_export_freelist(bl); - ::unlink(s); - int fd = ::open(s, O_CREAT|O_WRONLY); - ::fchmod(fd, 0644); - ::write(fd, bl.c_str(), bl.length()); - ::close(fd); -} diff --git a/trunk/ceph/osd/Ager.h b/trunk/ceph/osd/Ager.h deleted file mode 100644 index ad160c0e9f9ff..0000000000000 --- a/trunk/ceph/osd/Ager.h +++ /dev/null @@ -1,44 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab 
-#ifndef __AGER_H -#define __AGER_H - -#include "include/types.h" -#include "include/Distribution.h" -#include "ObjectStore.h" -#include "common/Clock.h" - -#include -#include -using namespace std; - -class Ager { - ObjectStore *store; - - private: - list age_free_oids; - object_t age_cur_oid; - vector< list > age_objects; - Distribution file_size_distn; //kb - bool did_distn; - - void age_empty(float pc); - uint64_t age_fill(float pc, utime_t until); - ssize_t age_pick_size(); - object_t age_get_oid(); - - public: - Ager(ObjectStore *s) : store(s), did_distn(false) {} - - void age(int time, - float high_water, // fill to this % - float low_water, // then empty to this % - int count, // this many times - float final_water, // and end here ( <= low_water) - int fake_size_mb=0); - - void save_freelist(int); - void load_freelist(); -}; - -#endif diff --git a/trunk/ceph/osd/BDBMap.h b/trunk/ceph/osd/BDBMap.h deleted file mode 100644 index a8e96a8a192f7..0000000000000 --- a/trunk/ceph/osd/BDBMap.h +++ /dev/null @@ -1,137 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __BERKELEYDB_H -#define __BERKELEYDB_H - -#include -#include - -#include -using namespace std; - - -template -class BDBMap { - private: - DB *dbp; - - public: - BDBMap() : dbp(0) {} - ~BDBMap() { - close(); - } - - bool is_open() { return dbp ? 
true:false; } - - // open/close - int open(const char *fn) { - //cout << "open " << fn << endl; - - int r; - if ((r = db_create(&dbp, NULL, 0)) != 0) { - cerr << "db_create: " << db_strerror(r) << endl; - assert(0); - } - - dbp->set_errfile(dbp, stderr); - dbp->set_errpfx(dbp, "bdbmap"); - - r = dbp->open(dbp, NULL, fn, NULL, DB_BTREE, DB_CREATE, 0644); - if (r != 0) { - dbp->err(dbp, r, "%s", fn); - } - assert(r == 0); - return 0; - } - void close() { - if (dbp) { - dbp->close(dbp,0); - dbp = 0; - } - } - void remove(const char *fn) { - if (!dbp) open(fn); - if (dbp) { - dbp->remove(dbp, fn, 0, 0); - dbp = 0; - } else { - ::unlink(fn); - } - } - - // accessors - int put(K key, - D data) { - DBT k; - memset(&k, 0, sizeof(k)); - k.data = &key; - k.size = sizeof(K); - DBT d; - memset(&d, 0, sizeof(d)); - d.data = &data; - d.size = sizeof(data); - return dbp->put(dbp, NULL, &k, &d, 0); - } - - int get(K key, - D& data) { - DBT k; - memset(&k, 0, sizeof(k)); - k.data = &key; - k.size = sizeof(key); - DBT d; - memset(&d, 0, sizeof(d)); - d.data = &data; - d.size = sizeof(data); - int r = dbp->get(dbp, NULL, &k, &d, 0); - return r; - } - - int del(K key) { - DBT k; - memset(&k, 0, sizeof(k)); - k.data = &key; - k.size = sizeof(key); - return dbp->del(dbp, NULL, &k, 0); - } - - int list_keys(list& ls) { - DBC *cursor = 0; - int r = dbp->cursor(dbp, NULL, &cursor, 0); - assert(r == 0); - - DBT k,d; - memset(&k, 0, sizeof(k)); - memset(&d, 0, sizeof(d)); - - while ((r = cursor->c_get(cursor, &k, &d, DB_NEXT)) == 0) { - K key; - assert(k.size == sizeof(key)); - memcpy(&key, k.data, k.size); - ls.push_back(key); - } - if (r != DB_NOTFOUND) { - dbp->err(dbp, r, "DBcursor->get"); - assert(r == DB_NOTFOUND); - } - - cursor->c_close(cursor); - return 0; - } - -}; - -#endif diff --git a/trunk/ceph/osd/FakeStoreBDBCollections.h b/trunk/ceph/osd/FakeStoreBDBCollections.h deleted file mode 100644 index a779a2a57972c..0000000000000 --- a/trunk/ceph/osd/FakeStoreBDBCollections.h +++ 
/dev/null @@ -1,169 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __FAKESTOREBDBCOLLECTIONS_H -#define __FAKESTOREBDBCOLLECTIONS_H - -#include "BDBMap.h" -#include "ObjectStore.h" -#include "common/Mutex.h" - -#define BDBHASH_DIRS 128LL -#define BDBHASH_FUNC(x) (((x) ^ ((x)>>30) ^ ((x)>>18) ^ ((x)>>45) ^ 0xdead1234) * 884811 % BDBHASH_DIRS) - -class FakeStoreBDBCollections { - private: - int whoami; - string basedir; - - Mutex bdblock; - - // collection dbs - BDBMap collections; - map*> collection_map; - - // dirs - void get_dir(string& dir) { - char s[30]; - sprintf(s, "%d", whoami); - dir = basedir + "/" + s; - } - void get_collfn(coll_t c, string &fn) { - char s[100]; - sprintf(s, "%d/%02llx/%016llx.co", whoami, BDBHASH_FUNC(c), c); - fn = basedir + "/" + s; - } - - void open_collections() { - string cfn; - get_dir(cfn); - cfn += "/collections"; - collections.open(cfn.c_str()); - list ls; - collections.list_keys(ls); - } - void close_collections() { - if (collections.is_open()) - collections.close(); - - for (map*>::iterator it = collection_map.begin(); - it != collection_map.end(); - it++) { - it->second->close(); - } - collection_map.clear(); - } - - int open_collection(coll_t c) { - if (collection_map.count(c)) - return 0; // already open. 
- - string fn; - get_collfn(c,fn); - collection_map[c] = new BDBMap; - int r = collection_map[c]->open(fn.c_str()); - if (r != 0) - collection_map.erase(c); // failed - return r; - } - - public: - FakeStoreBDBCollections(int w, string& bd) : whoami(w), basedir(bd) {} - ~FakeStoreBDBCollections() { - close_collections(); - } - - int list_collections(list& ls) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - ls.clear(); - collections.list_keys(ls); - bdblock.Unlock(); - return 0; - } - int create_collection(coll_t c) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - collections.put(c, 1); - open_collection(c); - bdblock.Unlock(); - return 0; - } - int destroy_collection(coll_t c) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - collections.del(c); - - open_collection(c); - collection_map[c]->close(); - - string fn; - get_collfn(c,fn); - collection_map[c]->remove(fn.c_str()); - delete collection_map[c]; - collection_map.erase(c); - bdblock.Unlock(); - return 0; - } - int collection_stat(coll_t c, struct stat *st) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - string fn; - get_collfn(c,fn); - int r = ::stat(fn.c_str(), st); - bdblock.Unlock(); - return r; - } - bool collection_exists(coll_t c) { - bdblock.Lock(); - struct stat st; - int r = collection_stat(c, &st) == 0; - bdblock.Unlock(); - return r; - } - int collection_add(coll_t c, object_t o) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - open_collection(c); - collection_map[c]->put(o,1); - bdblock.Unlock(); - return 0; - } - int collection_remove(coll_t c, object_t o) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - open_collection(c); - collection_map[c]->del(o); - bdblock.Unlock(); - return 0; - } - int collection_list(coll_t c, list& o) { - bdblock.Lock(); - if (!collections.is_open()) open_collections(); - - open_collection(c); - 
collection_map[c]->list_keys(o); - bdblock.Unlock(); - return 0; - } -}; - -#endif diff --git a/trunk/ceph/osd/ObjectStore.cc b/trunk/ceph/osd/ObjectStore.cc deleted file mode 100644 index 7aeab1d063d4d..0000000000000 --- a/trunk/ceph/osd/ObjectStore.cc +++ /dev/null @@ -1,152 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "ObjectStore.h" - -#include "config.h" -#include "common/Clock.h" - -#define dout(x) if (x < g_conf.debug) *_dout << dbeginl << g_clock.now() << " ager: " - -object_t ObjectStore::age_get_oid() { - if (!age_free_oids.empty()) { - object_t o = age_free_oids.front(); - age_free_oids.pop_front(); - return o; - } - return age_cur_oid++; - } - - ssize_t ObjectStore::age_pick_size() { - ssize_t max = file_size_distn.sample() * 1024; - return max/2 + (rand() % 100) * max/200 + 1; - } - - void ObjectStore::age_fill(float pc, utime_t until) { - bufferptr bp(1024*1024); - bp.zero(); - bufferlist bl; - bl.push_back(bp); - while (1) { - if (g_clock.now() > until) break; - - struct statfs st; - statfs(&st); - float a = (float)(st.f_blocks-st.f_bavail) / (float)st.f_blocks; - if (a >= pc) { - dout(10) << "age_fill at " << a << " / " << pc << " stopping" << dendl; - break; - } - - object_t oid = age_get_oid(); - - int b = rand() % 10; - age_objects[b].push_back(oid); - - ssize_t s = age_pick_size(); - - dout(10) << "age_fill at " << a << " / " << pc << " creating " << hex << oid << dec << " sz " << s << dendl; - - off_t off = 0; - while (s) { - ssize_t t = MIN(s, 1024*1024); - write(oid, t, off, bl, false); - off += t; - s -= t; - } - oid++; - } - } - - void ObjectStore::age_empty(float pc) { - int nper = 20; - int n = nper; - while (1) { - struct statfs st; - statfs(&st); - float a = (float)(st.f_blocks-st.f_bavail) / (float)st.f_blocks; - if (a <= pc) { - dout(10) << "age_empty at " << a << " / " << pc << " stopping" << dendl; - break; - } - - int b = rand() % 10; - n--; - if (n == 0 || 
age_objects[b].empty()) { - dout(10) << "age_empty sync" << dendl; - //sync(); - sync(); - n = nper; - continue; - } - object_t oid = age_objects[b].front(); - age_objects[b].pop_front(); - - dout(10) << "age_empty at " << a << " / " << pc << " removing " << hex << oid << dec << dendl; - - remove(oid); - age_free_oids.push_back(oid); - } - } - - - void ObjectStore::age(int time, - float high_water, // fill to this % - float low_water, // then empty to this % - int count, // this many times - float final_water, // and end here ( <= low_water) - int fake_size_mb) { - utime_t until = g_clock.now(); - until.sec_ref() += time; - - while (age_objects.size() < 10) age_objects.push_back( list() ); - - if (fake_size_mb) { - int fake_bl = fake_size_mb * 256; - struct statfs st; - statfs(&st); - float f = (float)fake_bl / (float)st.f_blocks; - high_water = (float)high_water * f; - low_water = (float)low_water * f; - final_water = (float)final_water * f; - dout(10) << "fake " << fake_bl << " / " << st.f_blocks << " is " << f << ", high " << high_water << " low " << low_water << " final " << final_water << dendl; - } - - // init size distn (once) - if (!did_distn) { - did_distn = true; - age_cur_oid = 1; - file_size_distn.add(1, 19.0758125+0.65434375); - file_size_distn.add(512, 35.6566); - file_size_distn.add(1024, 27.7271875); - file_size_distn.add(2*1024, 16.63503125); - //file_size_distn.add(4*1024, 106.82384375); - //file_size_distn.add(8*1024, 81.493375); - //file_size_distn.add(16*1024, 14.13553125); - //file_size_distn.add(32*1024, 2.176); - //file_size_distn.add(256*1024, 0.655938); - //file_size_distn.add(512*1024, 0.1480625); - //file_size_distn.add(1*1024*1024, 0.020125); // actually 2, but 32bit - file_size_distn.normalize(); - } - - // clear - for (int i=0; i<10; i++) - age_objects[i].clear(); - - for (int c=1; c<=count; c++) { - if (g_clock.now() > until) break; - - dout(1) << "age " << c << "/" << count << " filling to " << high_water << dendl; - 
age_fill(high_water, until); - if (c == count) { - dout(1) << "age final empty to " << final_water << dendl; - age_empty(final_water); - } else { - dout(1) << "age " << c << "/" << count << " emptying to " << low_water << dendl; - age_empty(low_water); - } - } - dout(1) << "age finished" << dendl; - } - diff --git a/trunk/ceph/osdc/Blinker.h b/trunk/ceph/osdc/Blinker.h deleted file mode 100644 index e59c9629725ce..0000000000000 --- a/trunk/ceph/osdc/Blinker.h +++ /dev/null @@ -1,92 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __BLINKER_H -#define __BLINKER_H - -class Blinker { - - public: - - class Op { - int op; - static const int LOOKUP = 1; - static const int INSERT = 2; - static const int REMOVE = 3; - static const int CLEAR = 4; - Op(int o) : op(o) {} - }; - - class OpLookup : public Op { - public: - bufferptr key; - OpLookup(bufferptr& k) : Op(Op::LOOKUP), key(k) {} - }; - - class OpInsert : public Op { - bufferptr key; - bufferlist val; - OpInsert(bufferptr& k, bufferlist& v) : Op(Op::INSERT), key(k), val(v) {} - }; - - class OpRemove : public Op { - public: - bufferptr key; - OpRemove(bufferptr& k) : Op(Op::REMOVE), key(k) {} - }; - - class OpClear : public Op { - public: - OpClear() : Op(Op::CLEAR) {} - }; - - - -private: - Objecter *objecter; - - // in-flight operations. - - - // cache information about tree structure. 
- - - -public: - // public interface - - // simple accessors - void lookup(inode_t& inode, bufferptr& key, bufferlist *pval, Context *onfinish); - - // simple modifiers - void insert(inode_t& inode, bufferptr& key, bufferlist& val, Context *onack, Context *onsafe); - void remove(inode_t& inode, bufferptr& key, Context *onack, Context *onsafe); - void clear(inode_t& inode, Context *onack, Context *onsafe); - - // these are dangerous: the table may be large. - void listkeys(inode_t& inode, list* pkeys, Context *onfinish); - void listvals(inode_t& inode, list* pkeys, list* pvals, Context *onfinish); - - // fetch *at least* key, but also anything else that is convenient. - // include lexical bounds for which this is a complete result. - // (if *start and *end are empty, it's the entire table) - void prefetch(inode_t& inode, bufferptr& key, - list* pkeys, list* pvals, - bufferptr *start, bufferptr *end, - Context *onfinish); - - -}; - -#endif diff --git a/trunk/ceph/osdc/Filer.cc b/trunk/ceph/osdc/Filer.cc deleted file mode 100644 index 193089d3915b1..0000000000000 --- a/trunk/ceph/osdc/Filer.cc +++ /dev/null @@ -1,235 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#include - -#include "Filer.h" -#include "osd/OSDMap.h" - -//#include "messages/MOSDRead.h" -//#include "messages/MOSDReadReply.h" -//#include "messages/MOSDWrite.h" -//#include "messages/MOSDWriteReply.h" -#include "messages/MOSDOp.h" -#include "messages/MOSDOpReply.h" -#include "messages/MOSDMap.h" - -#include "msg/Messenger.h" - -#include "include/Context.h" - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_filer) *_dout << dbeginl << g_clock.now() << " " << objecter->messenger->get_myname() << ".filer " - - -class Filer::C_Probe : public Context { -public: - Filer *filer; - Probe *probe; - object_t oid; - off_t size; - C_Probe(Filer *f, Probe *p, object_t o) : filer(f), probe(p), oid(o), size(0) {} - void finish(int r) { - filer->_probed(probe, oid, size); - } -}; - -int Filer::probe_fwd(inode_t& inode, - off_t start_from, - off_t *end, - Context *onfinish) -{ - dout(10) << "probe_fwd " << hex << inode.ino << dec << " starting from " << start_from << dendl; - - Probe *probe = new Probe(inode, start_from, end, onfinish); - - // period (bytes before we jump unto a new set of object(s)) - off_t period = ceph_file_layout_period(inode.layout); - - // start with 1+ periods. 
- probe->probing_len = period; - if (start_from % period) - probe->probing_len += period - (start_from % period); - - _probe(probe); - return 0; -} - -void Filer::_probe(Probe *probe) -{ - dout(10) << "_probe " << hex << probe->inode.ino << dec << " " << probe->from << "~" << probe->probing_len << dendl; - - // map range onto objects - file_to_extents(probe->inode, probe->from, probe->probing_len, probe->probing); - - for (list::iterator p = probe->probing.begin(); - p != probe->probing.end(); - p++) { - dout(10) << "_probe probing " << p->oid << dendl; - C_Probe *c = new C_Probe(this, probe, p->oid); - probe->ops[p->oid] = objecter->stat(p->oid, &c->size, p->layout, c); - } -} - -void Filer::_probed(Probe *probe, object_t oid, off_t size) -{ - dout(10) << "_probed " << probe->inode.ino << " object " << hex << oid << dec << " has size " << size << dendl; - - probe->known[oid] = size; - assert(probe->ops.count(oid)); - probe->ops.erase(oid); - - if (!probe->ops.empty()) - return; // waiting for more! - - // analyze! - off_t end = 0; - for (list::iterator p = probe->probing.begin(); - p != probe->probing.end(); - p++) { - off_t shouldbe = p->length+p->start; - dout(10) << "_probed " << probe->inode.ino << " object " << hex << p->oid << dec - << " should be " << shouldbe - << ", actual is " << probe->known[p->oid] - << dendl; - - if (probe->known[p->oid] < 0) { end = -1; break; } // error! - - assert(probe->known[p->oid] <= shouldbe); - if (shouldbe == probe->known[p->oid]) continue; // keep going - - // aha, we found the end! - // calc offset into buffer_extent to get distance from probe->from. 
- off_t oleft = probe->known[p->oid] - p->start; - for (map::iterator i = p->buffer_extents.begin(); - i != p->buffer_extents.end(); - i++) { - if (oleft <= (off_t)i->second) { - end = probe->from + i->first + oleft; - dout(10) << "_probed end is in buffer_extent " << i->first << "~" << i->second << " off " << oleft - << ", from was " << probe->from << ", end is " << end - << dendl; - break; - } - oleft -= i->second; - } - break; - } - - if (end == 0) { - // keep probing! - dout(10) << "_probed didn't find end, probing further" << dendl; - off_t period = probe->inode.layout.fl_object_size * probe->inode.layout.fl_stripe_count; - probe->from += probe->probing_len; - probe->probing_len = period; - _probe(probe); - return; - } - - if (end < 0) { - dout(10) << "_probed encountered an error while probing" << dendl; - *probe->end = -1; - } else { - // hooray! - dout(10) << "_probed found end at " << end << dendl; - *probe->end = end; - } - - // done! finish and clean up. - probe->onfinish->finish(end > 0 ? 0:-1); - delete probe->onfinish; - delete probe; -} - - -void Filer::file_to_extents(inode_t inode, - off_t offset, size_t len, - list& extents, - objectrev_t rev) -{ - dout(10) << "file_to_extents " << offset << "~" << len - << " on " << hex << inode.ino << dec - << dendl; - - /* we want only one extent per object! - * this means that each extent we read may map into different bits of the - * final read buffer.. 
hence OSDExtent.buffer_extents - */ - map< object_t, ObjectExtent > object_extents; - - assert(inode.layout.fl_object_size >= inode.layout.fl_stripe_unit); - off_t stripes_per_object = inode.layout.fl_object_size / inode.layout.fl_stripe_unit; - dout(20) << " stripes_per_object " << stripes_per_object << dendl; - - off_t cur = offset; - off_t left = len; - while (left > 0) { - // layout into objects - off_t blockno = cur / inode.layout.fl_stripe_unit; // which block - off_t stripeno = blockno / inode.layout.fl_stripe_count; // which horizontal stripe (Y) - off_t stripepos = blockno % inode.layout.fl_stripe_count; // which object in the object set (X) - off_t objectsetno = stripeno / stripes_per_object; // which object set - off_t objectno = objectsetno * inode.layout.fl_stripe_count + stripepos; // object id - - // find oid, extent - ObjectExtent *ex = 0; - object_t oid( inode.ino, objectno, rev ); - if (object_extents.count(oid)) - ex = &object_extents[oid]; - else { - ex = &object_extents[oid]; - ex->oid = oid; - ex->layout = objecter->osdmap->file_to_object_layout( oid, inode.layout ); - } - - // map range into object - off_t block_start = (stripeno % stripes_per_object)*inode.layout.fl_stripe_unit; - off_t block_off = cur % inode.layout.fl_stripe_unit; - off_t max = inode.layout.fl_stripe_unit - block_off; - - off_t x_offset = block_start + block_off; - off_t x_len; - if (left > max) - x_len = max; - else - x_len = left; - - if (ex->start + (off_t)ex->length == x_offset) { - // add to extent - ex->length += x_len; - } else { - // new extent - assert(ex->length == 0); - assert(ex->start == 0); - ex->start = x_offset; - ex->length = x_len; - } - ex->buffer_extents[cur-offset] = x_len; - - dout(15) << "file_to_extents " << *ex << " in " << ex->layout << dendl; - //dout(0) << "map: ino " << ino << " oid " << ex.oid << " osd " << ex.osd << " offset " << ex.offset << " len " << ex.len << " ... 
left " << left << dendl; - - left -= x_len; - cur += x_len; - } - - // make final list - for (map::iterator it = object_extents.begin(); - it != object_extents.end(); - it++) { - extents.push_back(it->second); - } -} diff --git a/trunk/ceph/osdc/Filer.h b/trunk/ceph/osdc/Filer.h deleted file mode 100644 index 0679a9b6ffef3..0000000000000 --- a/trunk/ceph/osdc/Filer.h +++ /dev/null @@ -1,165 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __FILER_H -#define __FILER_H - -/*** Filer - * - * stripe file ranges onto objects. - * build list for the objecter or objectcacher. - * - * also, provide convenience methods that call objecter for you. - * - * "files" are identified by ino. 
- */ - -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - -#include "include/types.h" - -#include "osd/OSDMap.h" -#include "Objecter.h" - -class Context; -class Messenger; -class OSDMap; - - -/**** Filer interface ***/ - -class Filer { - Objecter *objecter; - - // probes - struct Probe { - inode_t inode; - off_t from; - off_t *end; - Context *onfinish; - - list probing; - off_t probing_len; - - map known; - map ops; - - Probe(inode_t &i, off_t f, off_t *e, Context *c) : - inode(i), from(f), end(e), onfinish(c), probing_len(0) {} - }; - - class C_Probe; - //friend class C_Probe; - - void _probe(Probe *p); - void _probed(Probe *p, object_t oid, off_t size); - - public: - Filer(Objecter *o) : objecter(o) {} - ~Filer() {} - - bool is_active() { - return objecter->is_active(); // || (oc && oc->is_active()); - } - - /*** async file interface ***/ - Objecter::OSDRead *prepare_read(inode_t& inode, - off_t offset, - size_t len, - bufferlist *bl) { - Objecter::OSDRead *rd = new Objecter::OSDRead(bl); - file_to_extents(inode, offset, len, rd->extents); - return rd; - } - int read(inode_t& inode, - off_t offset, - size_t len, - bufferlist *bl, // ptr to data - Context *onfinish) { - Objecter::OSDRead *rd = prepare_read(inode, offset, len, bl); - return objecter->readx(rd, onfinish) > 0 ? 0:-1; - } - - int write(inode_t& inode, - off_t offset, - size_t len, - bufferlist& bl, - int flags, - Context *onack, - Context *oncommit, - objectrev_t rev=0) { - Objecter::OSDWrite *wr = new Objecter::OSDWrite(bl); - file_to_extents(inode, offset, len, wr->extents, rev); - return objecter->modifyx(wr, onack, oncommit) > 0 ? 0:-1; - } - - int zero(inode_t& inode, - off_t offset, - size_t len, - Context *onack, - Context *oncommit) { - Objecter::OSDModify *z = new Objecter::OSDModify(OSD_OP_ZERO); - file_to_extents(inode, offset, len, z->extents); - return objecter->modifyx(z, onack, oncommit) > 0 ? 
0:-1; - } - - int remove(inode_t& inode, - off_t offset, - size_t len, - Context *onack, - Context *oncommit) { - Objecter::OSDModify *z = new Objecter::OSDModify(OSD_OP_DELETE); - file_to_extents(inode, offset, len, z->extents); - return objecter->modifyx(z, onack, oncommit) > 0 ? 0:-1; - } - - int probe_fwd(inode_t& inode, - off_t start_from, - off_t *end, - Context *onfinish); - - - /***** mapping *****/ - - /* map (ino, ono) to an object name - (to be used on any osd in the proper replica group) */ - /*object_t file_to_object(inodeno_t ino, - size_t _ono) { - uint64_t ono = _ono; - assert(ino < (1ULL<& extents, - objectrev_t rev=0); - -}; - - - -#endif diff --git a/trunk/ceph/osdc/Journaler.cc b/trunk/ceph/osdc/Journaler.cc deleted file mode 100644 index 363b7c60de9aa..0000000000000 --- a/trunk/ceph/osdc/Journaler.cc +++ /dev/null @@ -1,666 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "Journaler.h" - -#include "include/Context.h" -#include "common/Logger.h" -#include "msg/Messenger.h" - -#include "config.h" - -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_journaler) *_dout << dbeginl << g_clock.now() << " " << objecter->messenger->get_myname() << ".journaler " -#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_journaler) *_derr << dbeginl << g_clock.now() << " " << objecter->messenger->get_myname() << ".journaler " - - - -void Journaler::reset() -{ - dout(1) << "reset to blank journal" << dendl; - state = STATE_ACTIVE; - write_pos = flush_pos = ack_pos = - read_pos = requested_pos = received_pos = - expire_pos = trimming_pos = trimmed_pos = ceph_file_layout_period(inode.layout); -} - - -/***************** HEADER *******************/ - -ostream& operator<<(ostream& out, Journaler::Header &h) -{ - return out << "loghead(trim " << h.trimmed_pos - << ", expire " << h.expire_pos - << ", read " << h.read_pos - << ", write " << h.write_pos - << ")"; -} - -class Journaler::C_ReadHead : public Context { - Journaler *ls; -public: - bufferlist bl; - C_ReadHead(Journaler *l) : ls(l) {} - void finish(int r) { - ls->_finish_read_head(r, bl); - } -}; - -class Journaler::C_ProbeEnd : public Context { - Journaler *ls; -public: - off_t end; - C_ProbeEnd(Journaler *l) : ls(l), end(-1) {} - void finish(int r) { - ls->_finish_probe_end(r, end); - } -}; - -void Journaler::recover(Context *onread) -{ - assert(state != STATE_ACTIVE); - - if (onread) - waitfor_recover.push_back(onread); - - if (state != STATE_UNDEF) { - dout(1) << "recover - already recoverying" << dendl; - return; - } - - dout(1) << "read_head" << dendl; - state = STATE_READHEAD; - C_ReadHead *fin = new C_ReadHead(this); - filer.read(inode, 0, sizeof(Header), &fin->bl, fin); -} - -void Journaler::_finish_read_head(int r, bufferlist& bl) -{ - assert(state == STATE_READHEAD); - - if (bl.length() == 0) { - dout(1) << "_finish_read_head r=" << r << " read 0 bytes, 
assuming empty log" << dendl; - state = STATE_ACTIVE; - list ls; - ls.swap(waitfor_recover); - finish_contexts(ls, 0); - return; - } - - // unpack header - Header h; - assert(bl.length() == sizeof(h)); - bl.copy(0, sizeof(h), (char*)&h); - - write_pos = flush_pos = ack_pos = h.write_pos; - read_pos = requested_pos = received_pos = h.read_pos; - expire_pos = h.expire_pos; - trimmed_pos = trimming_pos = h.trimmed_pos; - - dout(1) << "_finish_read_head " << h << ". probing for end of log (from " << write_pos << ")..." << dendl; - - // probe the log - state = STATE_PROBING; - C_ProbeEnd *fin = new C_ProbeEnd(this); - filer.probe_fwd(inode, h.write_pos, &fin->end, fin); -} - -void Journaler::_finish_probe_end(int r, off_t end) -{ - assert(state == STATE_PROBING); - - if (end == -1) { - end = write_pos; - dout(1) << "_finish_probe_end write_pos = " << end - << " (header had " << write_pos << "). log was empty. recovered." - << dendl; - assert(0); // hrm. - } else { - assert(end >= write_pos); - assert(r >= 0); - dout(1) << "_finish_probe_end write_pos = " << end - << " (header had " << write_pos << "). recovered." - << dendl; - } - - write_pos = flush_pos = ack_pos = end; - - // done. 
- list ls; - ls.swap(waitfor_recover); - finish_contexts(ls, 0); -} - - -// WRITING - -class Journaler::C_WriteHead : public Context { -public: - Journaler *ls; - Header h; - Context *oncommit; - C_WriteHead(Journaler *l, Header& h_, Context *c) : ls(l), h(h_), oncommit(c) {} - void finish(int r) { - ls->_finish_write_head(h, oncommit); - } -}; - -void Journaler::write_head(Context *oncommit) -{ - assert(state == STATE_ACTIVE); - last_written.trimmed_pos = trimmed_pos; - last_written.expire_pos = expire_pos; - last_written.read_pos = read_pos; - last_written.write_pos = ack_pos; //write_pos; - dout(10) << "write_head " << last_written << dendl; - - last_wrote_head = g_clock.now(); - - bufferlist bl; - bl.append((char*)&last_written, sizeof(last_written)); - filer.write(inode, 0, bl.length(), bl, 0, - 0, - new C_WriteHead(this, last_written, oncommit)); -} - -void Journaler::_finish_write_head(Header &wrote, Context *oncommit) -{ - dout(10) << "_finish_write_head " << wrote << dendl; - last_committed = wrote; - if (oncommit) { - oncommit->finish(0); - delete oncommit; - } - - trim(); // trim? -} - - -/***************** WRITING *******************/ - -class Journaler::C_Flush : public Context { - Journaler *ls; - off_t start; -public: - C_Flush(Journaler *l, off_t s) : ls(l), start(s) {} - void finish(int r) { ls->_finish_flush(r, start); } -}; - -void Journaler::_finish_flush(int r, off_t start) -{ - assert(r>=0); - - assert(start >= ack_pos); - assert(start < flush_pos); - assert(pending_flush.count(start)); - - // calc latency? 
- if (logger) { - utime_t lat = g_clock.now(); - lat -= pending_flush[start]; - logger->favg("jlat", lat); - } - - pending_flush.erase(start); - - // adjust ack_pos - if (pending_flush.empty()) - ack_pos = flush_pos; - else - ack_pos = pending_flush.begin()->first; - - dout(10) << "_finish_flush from " << start - << ", pending_flush now " << pending_flush - << ", write positions now " << write_pos << "/" << flush_pos << "/" << ack_pos - << dendl; - - // kick waiters <= ack_pos - while (!waitfor_flush.empty()) { - if (waitfor_flush.begin()->first > ack_pos) break; - finish_contexts(waitfor_flush.begin()->second); - waitfor_flush.erase(waitfor_flush.begin()); - } -} - - -off_t Journaler::append_entry(bufferlist& bl, Context *onsync) -{ - uint32_t s = bl.length(); - - if (!g_conf.journaler_allow_split_entries) { - // will we span a stripe boundary? - int p = inode.layout.fl_stripe_unit; - if (write_pos / p != (write_pos + (off_t)(bl.length() + sizeof(s))) / p) { - // yes. - // move write_pos forward. - off_t owp = write_pos; - write_pos += p; - write_pos -= (write_pos % p); - - // pad with zeros. - bufferptr bp(write_pos - owp); - bp.zero(); - assert(bp.length() >= 4); - write_buf.push_back(bp); - - // now flush. - flush(); - - dout(12) << "append_entry skipped " << (write_pos-owp) << " bytes to " << write_pos << " to avoid spanning stripe boundary" << dendl; - } - } - - dout(10) << "append_entry len " << bl.length() << " to " << write_pos << "~" << (bl.length() + sizeof(uint32_t)) << dendl; - - // cache? - // NOTE: this is a dumb thing to do; this is used for a benchmarking - // purposes only. 
- if (g_conf.journaler_cache && - write_pos == read_pos + read_buf.length()) { - dout(10) << "append_entry caching in read_buf too" << dendl; - assert(requested_pos == received_pos); - assert(requested_pos == read_pos + read_buf.length()); - read_buf.append((char*)&s, sizeof(s)); - read_buf.append(bl); - requested_pos = received_pos = write_pos + sizeof(s) + s; - } - - // append - write_buf.append((char*)&s, sizeof(s)); - write_buf.claim_append(bl); - write_pos += sizeof(s) + s; - - // flush now? - if (onsync) - flush(onsync); - - return write_pos; -} - - -void Journaler::_do_flush() -{ - if (write_pos == flush_pos) return; - assert(write_pos > flush_pos); - - // flush - unsigned len = write_pos - flush_pos; - assert(len == write_buf.length()); - dout(10) << "_do_flush flushing " << flush_pos << "~" << len << dendl; - - // submit write for anything pending - // flush _start_ pos to _finish_flush - filer.write(inode, flush_pos, len, write_buf, 0, - g_conf.journaler_safe ? 0:new C_Flush(this, flush_pos), // on ACK - g_conf.journaler_safe ? new C_Flush(this, flush_pos):0); // on COMMIT - pending_flush[flush_pos] = g_clock.now(); - - // adjust pointers - flush_pos = write_pos; - write_buf.clear(); - - dout(10) << "_do_flush write pointers now at " << write_pos << "/" << flush_pos << "/" << ack_pos << dendl; -} - - - -void Journaler::flush(Context *onsync) -{ - // all flushed and acked? - if (write_pos == ack_pos) { - assert(write_buf.length() == 0); - dout(10) << "flush nothing to flush, write pointers at " << write_pos << "/" << flush_pos << "/" << ack_pos << dendl; - if (onsync) { - onsync->finish(0); - delete onsync; - } - return; - } - - if (write_pos == flush_pos) { - assert(write_buf.length() == 0); - dout(10) << "flush nothing to flush, write pointers at " << write_pos << "/" << flush_pos << "/" << ack_pos << dendl; - } else { - if (1) { - // maybe buffer - if (write_buf.length() < g_conf.journaler_batch_max) { - // delay! schedule an event. 
- dout(20) << "flush delaying flush" << dendl; - if (delay_flush_event) timer.cancel_event(delay_flush_event); - delay_flush_event = new C_DelayFlush(this); - timer.add_event_after(g_conf.journaler_batch_interval, delay_flush_event); - } else { - dout(20) << "flush not delaying flush" << dendl; - _do_flush(); - } - } else { - // always flush - _do_flush(); - } - } - - // queue waiter (at _new_ write_pos; will go when reached by ack_pos) - if (onsync) - waitfor_flush[write_pos].push_back(onsync); - - // write head? - if (last_wrote_head.sec() + g_conf.journaler_write_head_interval < g_clock.now().sec()) { - write_head(); - } -} - - - -/***************** READING *******************/ - - -class Journaler::C_Read : public Context { - Journaler *ls; -public: - C_Read(Journaler *l) : ls(l) {} - void finish(int r) { ls->_finish_read(r); } -}; - -class Journaler::C_RetryRead : public Context { - Journaler *ls; -public: - C_RetryRead(Journaler *l) : ls(l) {} - void finish(int r) { ls->is_readable(); } // this'll kickstart. -}; - -void Journaler::_finish_read(int r) -{ - assert(r>=0); - - dout(10) << "_finish_read got " << received_pos << "~" << reading_buf.length() << dendl; - received_pos += reading_buf.length(); - read_buf.claim_append(reading_buf); - assert(received_pos <= requested_pos); - dout(10) << "_finish_read read_buf now " << read_pos << "~" << read_buf.length() - << ", read pointers " << read_pos << "/" << received_pos << "/" << requested_pos - << dendl; - - if (is_readable()) { // NOTE: this check may read more - // readable! - dout(10) << "_finish_read now readable" << dendl; - if (on_readable) { - Context *f = on_readable; - on_readable = 0; - f->finish(0); - delete f; - } - - if (read_bl) { - bool r = try_read_entry(*read_bl); - assert(r); // this should have worked. - - // clear state - Context *f = on_read_finish; - on_read_finish = 0; - read_bl = 0; - - // do callback - f->finish(0); - delete f; - } - } - - // prefetch? 
- _prefetch(); -} - -/* NOTE: this could be slightly smarter... we could allow - * multiple reads to be in progress. e.g., if we prefetch, but - * then discover we need even more for an especially large entry. - * i don't think that circumstance will arise particularly often. - */ -void Journaler::_issue_read(off_t len) -{ - // make sure we're fully flushed - _do_flush(); - - if (_is_reading()) { - dout(10) << "_issue_read " << len << " waiting, already reading " - << received_pos << "~" << (requested_pos-received_pos) << dendl; - return; - } - assert(requested_pos == received_pos); - - // stuck at ack_pos? - assert(requested_pos <= ack_pos); - if (requested_pos == ack_pos) { - dout(10) << "_issue_read requested_pos = ack_pos = " << ack_pos << ", waiting" << dendl; - assert(write_pos > requested_pos); - if (flush_pos == ack_pos) - flush(); - assert(flush_pos > ack_pos); - waitfor_flush[flush_pos].push_back(new C_RetryRead(this)); - return; - } - - // don't read too much - if (requested_pos + len > ack_pos) { - len = ack_pos - requested_pos; - dout(10) << "_issue_read reading only up to ack_pos " << ack_pos << dendl; - } - - // go. - dout(10) << "_issue_read reading " << requested_pos << "~" << len - << ", read pointers " << read_pos << "/" << received_pos << "/" << (requested_pos+len) - << dendl; - - filer.read(inode, requested_pos, len, &reading_buf, - new C_Read(this)); - requested_pos += len; -} - -void Journaler::_prefetch() -{ - // prefetch? - off_t left = requested_pos - read_pos; - if (left <= prefetch_from && // should read more, - !_is_reading() && // and not reading anything right now - write_pos > requested_pos) { // there's something more to read... - dout(10) << "_prefetch only " << left << " < " << prefetch_from - << ", prefetching " << dendl; - _issue_read(fetch_len); - } -} - - -void Journaler::read_entry(bufferlist *bl, Context *onfinish) -{ - // only one read at a time! 
- assert(read_bl == 0); - assert(on_read_finish == 0); - - if (is_readable()) { - dout(10) << "read_entry at " << read_pos << ", read_buf is " - << read_pos << "~" << read_buf.length() - << ", readable now" << dendl; - - // nice, just do it now. - bool r = try_read_entry(*bl); - assert(r); - - // callback - onfinish->finish(0); - delete onfinish; - } else { - dout(10) << "read_entry at " << read_pos << ", read_buf is " - << read_pos << "~" << read_buf.length() - << ", not readable now" << dendl; - - bl->clear(); - - // set it up - read_bl = bl; - on_read_finish = onfinish; - - // is_readable() will have already initiated a read (if it was possible) - } -} - - -/* is_readable() - * return true if next entry is ready. - * kickstart read as necessary. - */ -bool Journaler::is_readable() -{ - // anything to read? - if (read_pos == write_pos) return false; - - // have enough for entry size? - uint32_t s = 0; - if (read_buf.length() >= sizeof(s)) - read_buf.copy(0, sizeof(s), (char*)&s); - - // entry and payload? - if (read_buf.length() >= sizeof(s) && - read_buf.length() >= sizeof(s) + s) - return true; // yep, next entry is ready. - - // darn it! - - // partial fragment at the end? - if (received_pos == write_pos) { - dout(10) << "is_readable() detected partial entry at tail, adjusting write_pos to " << read_pos << dendl; - write_pos = flush_pos = ack_pos = read_pos; - assert(write_buf.length() == 0); - - // truncate? - // FIXME: how much? - - return false; - } - - // start reading some more? - if (!_is_reading()) { - if (s) - fetch_len = MAX(fetch_len, (off_t)(sizeof(s)+s-read_buf.length())); - _issue_read(fetch_len); - } - - return false; -} - - -/* try_read_entry(bl) - * read entry into bl if it's ready. - * otherwise, do nothing. (well, we'll start fetching it for good measure.) - */ -bool Journaler::try_read_entry(bufferlist& bl) -{ - if (!is_readable()) { // this may start a read. 
- dout(10) << "try_read_entry at " << read_pos << " not readable" << dendl; - return false; - } - - uint32_t s; - assert(read_buf.length() >= sizeof(s)); - read_buf.copy(0, sizeof(s), (char*)&s); - assert(read_buf.length() >= sizeof(s) + s); - - dout(10) << "try_read_entry at " << read_pos << " reading " - << read_pos << "~" << (sizeof(s)+s) << dendl; - - // do it - assert(bl.length() == 0); - read_buf.splice(0, sizeof(s)); - read_buf.splice(0, s, &bl); - read_pos += sizeof(s) + s; - - // prefetch? - _prefetch(); - return true; -} - -void Journaler::wait_for_readable(Context *onreadable) -{ - dout(10) << "wait_for_readable at " << read_pos << " onreadable " << onreadable << dendl; - assert(!is_readable()); - assert(on_readable == 0); - on_readable = onreadable; -} - - - - -/***************** TRIMMING *******************/ - - -class Journaler::C_Trim : public Context { - Journaler *ls; - off_t to; -public: - C_Trim(Journaler *l, off_t t) : ls(l), to(t) {} - void finish(int r) { - ls->_trim_finish(r, to); - } -}; - -void Journaler::trim() -{ - off_t trim_to = last_committed.expire_pos; - trim_to -= trim_to % ceph_file_layout_period(inode.layout); - dout(10) << "trim last_commited head was " << last_committed - << ", can trim to " << trim_to - << dendl; - if (trim_to == 0 || trim_to == trimming_pos) { - dout(10) << "trim already trimmed/trimming to " - << trimmed_pos << "/" << trimming_pos << dendl; - return; - } - - if (trimming_pos > trimmed_pos) { - dout(10) << "trim already trimming atm, try again later. 
trimmed/trimming is " - << trimmed_pos << "/" << trimming_pos << dendl; - return; - } - - // trim - assert(trim_to <= write_pos); - assert(trim_to > trimming_pos); - dout(10) << "trim trimming to " << trim_to - << ", trimmed/trimming/expire are " - << trimmed_pos << "/" << trimming_pos << "/" << expire_pos - << dendl; - - filer.remove(inode, trimming_pos, trim_to-trimming_pos, - 0, new C_Trim(this, trim_to)); - trimming_pos = trim_to; -} - -void Journaler::_trim_finish(int r, off_t to) -{ - dout(10) << "_trim_finish trimmed_pos was " << trimmed_pos - << ", trimmed/trimming/expire now " - << to << "/" << trimming_pos << "/" << expire_pos - << dendl; - assert(r >= 0); - - assert(to <= trimming_pos); - assert(to > trimmed_pos); - trimmed_pos = to; - - // finishers? - while (!waitfor_trim.empty() && - waitfor_trim.begin()->first <= trimmed_pos) { - finish_contexts(waitfor_trim.begin()->second, 0); - waitfor_trim.erase(waitfor_trim.begin()); - } -} - - -// eof. diff --git a/trunk/ceph/osdc/Journaler.h b/trunk/ceph/osdc/Journaler.h deleted file mode 100644 index 7f7a5753ad708..0000000000000 --- a/trunk/ceph/osdc/Journaler.h +++ /dev/null @@ -1,237 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -/* Journaler - * - * This class stripes a serial log over objects on the store. Four logical pointers: - * - * write_pos - where we're writing new entries - * read_pos - where we're reading old entires - * expire_pos - what is deemed "old" by user - * trimmed_pos - where we're expiring old items - * - * trimmed_pos <= expire_pos <= read_pos <= write_pos. 
- * - * Often, read_pos <= write_pos (as with MDS log). During recovery, write_pos is undefined - * until the end of the log is discovered. - * - * A "head" struct at the beginning of the log is used to store metadata at - * regular intervals. The basic invariants include: - * - * head.read_pos <= read_pos -- the head may "lag", since it's updated lazily. - * head.write_pos <= write_pos - * head.expire_pos <= expire_pos - * head.trimmed_pos <= trimmed_pos - * - * More significantly, - * - * head.expire_pos >= trimmed_pos -- this ensures we can find the "beginning" of the log - * as last recorded, before it is trimmed. trimming will - * block until a sufficiently current expire_pos is committed. - * - * To recover log state, we simply start at the last write_pos in the head, and probe the - * object sequence sizes until we read the end. - * - * Head struct is stored in the first object. Actual journal starts after layout.period() bytes. - * - */ - -#ifndef __JOURNALER_H -#define __JOURNALER_H - -#include "Objecter.h" -#include "Filer.h" - -#include -#include - -class Context; -class Logger; - -class Journaler { - - // this goes at the head of the log "file". 
- struct Header { - off_t trimmed_pos; - off_t expire_pos; - off_t read_pos; - off_t write_pos; - Header() : trimmed_pos(0), expire_pos(0), read_pos(0), write_pos(0) {} - } last_written, last_committed; - - friend ostream& operator<<(ostream& out, Header &h); - - - // me - inode_t inode; - Objecter *objecter; - Filer filer; - - Logger *logger; - - Mutex *lock; - SafeTimer timer; - - class C_DelayFlush : public Context { - Journaler *journaler; - public: - C_DelayFlush(Journaler *j) : journaler(j) {} - void finish(int r) { - journaler->delay_flush_event = 0; - journaler->_do_flush(); - } - } *delay_flush_event; - - - // my state - static const int STATE_UNDEF = 0; - static const int STATE_READHEAD = 1; - static const int STATE_PROBING = 2; - static const int STATE_ACTIVE = 2; - - int state; - - // header - utime_t last_wrote_head; - void _finish_write_head(Header &wrote, Context *oncommit); - class C_WriteHead; - friend class C_WriteHead; - - list waitfor_recover; - void _finish_read_head(int r, bufferlist& bl); - void _finish_probe_end(int r, off_t end); - class C_ReadHead; - friend class C_ReadHead; - class C_ProbeEnd; - friend class C_ProbeEnd; - - - - // writer - off_t write_pos; // logical write position, where next entry will go - off_t flush_pos; // where we will flush. if write_pos>flush_pos, we're buffering writes. - off_t ack_pos; // what has been acked. - bufferlist write_buf; // write buffer. flush_pos + write_buf.length() == write_pos. - - std::map pending_flush; // start offsets and times for pending flushes - std::map > waitfor_flush; // when flushed through given offset - - void _do_flush(); - void _finish_flush(int r, off_t start); - class C_Flush; - friend class C_Flush; - - // reader - off_t read_pos; // logical read position, where next entry starts. - off_t requested_pos; // what we've requested from OSD. - off_t received_pos; // what we've received from OSD. - bufferlist read_buf; // read buffer. read_pos + read_buf.length() == prefetch_pos. 
- bufferlist reading_buf; // what i'm reading into - - off_t fetch_len; // how much to read at a time - off_t prefetch_from; // how far from end do we read next chunk - - // for read_entry() in-progress read - bufferlist *read_bl; - Context *on_read_finish; - // for wait_for_readable() - Context *on_readable; - - bool _is_reading() { - return requested_pos > received_pos; - } - void _finish_read(int r); // we just read some (read completion callback) - void _issue_read(off_t len); // read some more - void _prefetch(); // maybe read ahead - class C_Read; - friend class C_Read; - class C_RetryRead; - friend class C_RetryRead; - - // trimmer - off_t expire_pos; // what we're allowed to trim to - off_t trimming_pos; // what we've requested to trim through - off_t trimmed_pos; // what has been trimmed - map > waitfor_trim; - - void _trim_finish(int r, off_t to); - class C_Trim; - friend class C_Trim; - -public: - Journaler(inode_t& inode_, Objecter *obj, Logger *l, Mutex *lk, off_t fl=0, off_t pff=0) : - inode(inode_), objecter(obj), filer(objecter), logger(l), - lock(lk), timer(*lk), delay_flush_event(0), - state(STATE_UNDEF), - write_pos(0), flush_pos(0), ack_pos(0), - read_pos(0), requested_pos(0), received_pos(0), - fetch_len(fl), prefetch_from(pff), - read_bl(0), on_read_finish(0), on_readable(0), - expire_pos(0), trimming_pos(0), trimmed_pos(0) - { - // prefetch intelligently. - // (watch out, this is big if you use big objects or weird striping) - if (!fetch_len) - fetch_len = inode.layout.fl_object_size*inode.layout.fl_stripe_count * - g_conf.journaler_prefetch_periods; - if (!prefetch_from) - prefetch_from = fetch_len / 2; - } - - // me - //void open(Context *onopen); - //void claim(Context *onclaim, msg_addr_t from); - - /* reset - * NOTE: we assume the caller knows/has ensured that any objects - * in our sequence do not exist.. e.g. after a MKFS. this is _not_ - * an "erase" method. 
- */ - void reset(); - void recover(Context *onfinish); - void write_head(Context *onsave=0); - - bool is_active() { return state == STATE_ACTIVE; } - - off_t get_write_pos() const { return write_pos; } - off_t get_write_ack_pos() const { return ack_pos; } - off_t get_read_pos() const { return read_pos; } - off_t get_expire_pos() const { return expire_pos; } - off_t get_trimmed_pos() const { return trimmed_pos; } - - // write - off_t append_entry(bufferlist& bl, Context *onsync = 0); - void flush(Context *onsync = 0); - - // read - void set_read_pos(off_t p) { - assert(requested_pos == received_pos); // we can't cope w/ in-progress read right now. - assert(read_bl == 0); // ... - read_pos = requested_pos = received_pos = p; - read_buf.clear(); - } - bool is_readable(); - bool try_read_entry(bufferlist& bl); - void wait_for_readable(Context *onfinish); - void read_entry(bufferlist* bl, Context *onfinish); - - // trim - void set_expire_pos(off_t ep) { expire_pos = ep; } - void trim(); - //bool is_trimmable() { return trimming_pos < expire_pos; } - //void trim(off_t trim_to=0, Context *c=0); -}; - - -#endif diff --git a/trunk/ceph/osdc/ObjectCacher.cc b/trunk/ceph/osdc/ObjectCacher.cc deleted file mode 100644 index d5f347d3863cb..0000000000000 --- a/trunk/ceph/osdc/ObjectCacher.cc +++ /dev/null @@ -1,1587 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - -#include "msg/Messenger.h" -#include "ObjectCacher.h" -#include "Objecter.h" - - - -/*** ObjectCacher::BufferHead ***/ - - -/*** ObjectCacher::Object ***/ - -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_objectcacher) *_dout << dbeginl << g_clock.now() << " " << oc->objecter->messenger->get_myname() << ".objectcacher.object(" << oid << ") " - - -ObjectCacher::BufferHead *ObjectCacher::Object::split(BufferHead *left, off_t off) -{ - dout(20) << "split " << *left << " at " << off << dendl; - - // split off right - ObjectCacher::BufferHead *right = new 
BufferHead(this); - right->last_write_tid = left->last_write_tid; - right->set_state(left->get_state()); - - off_t newleftlen = off - left->start(); - right->set_start(off); - right->set_length(left->length() - newleftlen); - - // shorten left - oc->bh_stat_sub(left); - left->set_length(newleftlen); - oc->bh_stat_add(left); - - // add right - oc->bh_add(this, right); - - // split buffers too - bufferlist bl; - bl.claim(left->bl); - if (bl.length()) { - assert(bl.length() == (left->length() + right->length())); - right->bl.substr_of(bl, left->length(), right->length()); - left->bl.substr_of(bl, 0, left->length()); - } - - // move read waiters - if (!left->waitfor_read.empty()) { - map >::iterator o, p = left->waitfor_read.end(); - p--; - while (p != left->waitfor_read.begin()) { - if (p->first < right->start()) break; - dout(0) << "split moving waiters at byte " << p->first << " to right bh" << dendl; - right->waitfor_read[p->first].swap( p->second ); - o = p; - p--; - left->waitfor_read.erase(o); - } - } - - dout(20) << "split left is " << *left << dendl; - dout(20) << "split right is " << *right << dendl; - return right; -} - - -void ObjectCacher::Object::merge_left(BufferHead *left, BufferHead *right) -{ - assert(left->end() == right->start()); - assert(left->get_state() == right->get_state()); - - dout(10) << "merge_left " << *left << " + " << *right << dendl; - oc->bh_remove(this, right); - oc->bh_stat_sub(left); - left->set_length( left->length() + right->length()); - oc->bh_stat_add(left); - - // data - left->bl.claim_append(right->bl); - - // version - // note: this is sorta busted, but should only be used for dirty buffers - left->last_write_tid = MAX( left->last_write_tid, right->last_write_tid ); - left->last_write = MAX( left->last_write, right->last_write ); - - // waiters - for (map >::iterator p = right->waitfor_read.begin(); - p != right->waitfor_read.end(); - p++) - left->waitfor_read[p->first].splice( left->waitfor_read[p->first].begin(), - 
p->second ); - - // hose right - delete right; - - dout(10) << "merge_left result " << *left << dendl; -} - -void ObjectCacher::Object::try_merge_bh(BufferHead *bh) -{ - dout(10) << "try_merge_bh " << *bh << dendl; - - // to the left? - map::iterator p = data.find(bh->start()); - assert(p->second == bh); - if (p != data.begin()) { - p--; - if (p->second->end() == bh->start() && - p->second->get_state() == bh->get_state()) { - merge_left(p->second, bh); - bh = p->second; - } else - p++; - } - // to the right? - assert(p->second == bh); - p++; - if (p != data.end() && - p->second->start() == bh->end() && - p->second->get_state() == bh->get_state()) - merge_left(bh, p->second); -} - - -/* - * map a range of bytes into buffer_heads. - * - create missing buffer_heads as necessary. - */ -int ObjectCacher::Object::map_read(Objecter::OSDRead *rd, - map& hits, - map& missing, - map& rx) -{ - for (list::iterator ex_it = rd->extents.begin(); - ex_it != rd->extents.end(); - ex_it++) { - - if (ex_it->oid != oid) continue; - - dout(10) << "map_read " << ex_it->oid - << " " << ex_it->start << "~" << ex_it->length << dendl; - - map::iterator p = data.lower_bound(ex_it->start); - // p->first >= start - - off_t cur = ex_it->start; - off_t left = ex_it->length; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap! - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - - while (left > 0) { - // at end? - if (p == data.end()) { - // rest is a miss. - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( left ); - oc->bh_add(this, n); - missing[cur] = n; - dout(20) << "map_read miss " << left << " left, " << *n << dendl; - cur += left; - left -= left; - assert(left == 0); - assert(cur == ex_it->start + (off_t)ex_it->length); - break; // no more. 
- } - - if (p->first <= cur) { - // have it (or part of it) - BufferHead *e = p->second; - - if (e->is_clean() || - e->is_dirty() || - e->is_tx()) { - hits[cur] = e; // readable! - dout(20) << "map_read hit " << *e << dendl; - } - else if (e->is_rx()) { - rx[cur] = e; // missing, not readable. - dout(20) << "map_read rx " << *e << dendl; - } - else assert(0); - - off_t lenfromcur = MIN(e->end() - cur, left); - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; // more? - - } else if (p->first > cur) { - // gap.. miss - off_t next = p->first; - BufferHead *n = new BufferHead(this); - n->set_start( cur ); - n->set_length( MIN(next - cur, left) ); - oc->bh_add(this,n); - missing[cur] = n; - cur += MIN(left, n->length()); - left -= MIN(left, n->length()); - dout(20) << "map_read gap " << *n << dendl; - continue; // more? - } - else - assert(0); - } - } - return(0); -} - -/* - * map a range of extents on an object's buffer cache. - * - combine any bh's we're writing into one - * - break up bufferheads that don't fall completely within the range - * //no! - return a bh that includes the write. may also include other dirty data to left and/or right. - */ -ObjectCacher::BufferHead *ObjectCacher::Object::map_write(Objecter::OSDWrite *wr) -{ - BufferHead *final = 0; - - for (list::iterator ex_it = wr->extents.begin(); - ex_it != wr->extents.end(); - ex_it++) { - - if (ex_it->oid != oid) continue; - - dout(10) << "map_write oex " << ex_it->oid - << " " << ex_it->start << "~" << ex_it->length << dendl; - - map::iterator p = data.lower_bound(ex_it->start); - // p->first >= start - - off_t cur = ex_it->start; - off_t left = ex_it->length; - - if (p != data.begin() && - (p == data.end() || p->first > cur)) { - p--; // might overlap or butt up! - - /*// dirty and butts up? 
- if (p->first + p->second->length() == cur && - p->second->is_dirty()) { - dout(10) << "map_write will append to tail of " << *p->second << dendl; - final = p->second; - } - */ - if (p->first + p->second->length() <= cur) - p++; // doesn't overlap. - } - - while (left > 0) { - off_t max = left; - - // at end ? - if (p == data.end()) { - if (final == NULL) { - final = new BufferHead(this); - final->set_start( cur ); - final->set_length( max ); - oc->bh_add(this, final); - dout(10) << "map_write adding trailing bh " << *final << dendl; - } else { - final->set_length( final->length() + max ); - } - left -= max; - cur += max; - continue; - } - - dout(10) << "p is " << *p->second << dendl; - - if (p->first <= cur) { - BufferHead *bh = p->second; - dout(10) << "map_write bh " << *bh << " intersected" << dendl; - - if (p->first < cur) { - assert(final == 0); - if (cur + max >= p->first + p->second->length()) { - // we want right bit (one splice) - final = split(bh, cur); // just split it, take right half. - p++; - assert(p->second == final); - } else { - // we want middle bit (two splices) - final = split(bh, cur); - p++; - assert(p->second == final); - split(final, cur+max); - } - } else if (p->first == cur) { - if (p->second->length() <= max) { - // whole bufferhead, piece of cake. - } else { - // we want left bit (one splice) - split(bh, cur + max); // just split - } - if (final) - merge_left(final, bh); - else - final = bh; - } - - // keep going. - off_t lenfromcur = final->end() - cur; - cur += lenfromcur; - left -= lenfromcur; - p++; - continue; - } else { - // gap! - off_t next = p->first; - off_t glen = MIN(next - cur, max); - dout(10) << "map_write gap " << cur << "~" << glen << dendl; - if (final) { - final->set_length( final->length() + glen ); - } else { - final = new BufferHead(this); - final->set_start( cur ); - final->set_length( glen ); - oc->bh_add(this, final); - } - - cur += glen; - left -= glen; - continue; // more? 
- } - } - } - - // set versoin - assert(final); - dout(10) << "map_write final is " << *final << dendl; - - return final; -} - - -void ObjectCacher::Object::truncate(off_t s) -{ - dout(10) << "truncate to " << s << dendl; - - while (!data.empty()) { - BufferHead *bh = data.rbegin()->second; - if (bh->end() <= s) - break; - - // split bh at truncation point? - if (bh->start() < s) { - split(bh, s); - continue; - } - - // remove bh entirely - assert(bh->start() >= s); - oc->bh_remove(this, bh); - delete bh; - } -} - - - - - -/*** ObjectCacher ***/ - -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_objectcacher) *_dout << dbeginl << g_clock.now() << " " << objecter->messenger->get_myname() << ".objectcacher " - - - -/* private */ - -void ObjectCacher::close_object(Object *ob) -{ - dout(10) << "close_object " << *ob << dendl; - assert(ob->can_close()); - - // ok! - objects.erase(ob->get_oid()); - objects_by_ino[ob->get_ino()].erase(ob); - if (objects_by_ino[ob->get_ino()].empty()) - objects_by_ino.erase(ob->get_ino()); - delete ob; -} - - - - -void ObjectCacher::bh_read(BufferHead *bh) -{ - dout(7) << "bh_read on " << *bh << dendl; - - mark_rx(bh); - - // finisher - C_ReadFinish *onfinish = new C_ReadFinish(this, bh->ob->get_oid(), bh->start(), bh->length()); - - // go - objecter->read(bh->ob->get_oid(), bh->start(), bh->length(), bh->ob->get_layout(), &onfinish->bl, - onfinish); -} - -void ObjectCacher::bh_read_finish(object_t oid, off_t start, size_t length, bufferlist &bl) -{ - //lock.Lock(); - dout(7) << "bh_read_finish " - << oid - << " " << start << "~" << length - << " (bl is " << bl.length() << ")" - << dendl; - - if (bl.length() < length) { - bufferptr bp(length - bl.length()); - bp.zero(); - dout(7) << "bh_read_finish " << oid << " padding " << start << "~" << length - << " with " << bp.length() << " bytes of zeroes" << dendl; - bl.push_back(bp); - } - - if (objects.count(oid) == 0) { - dout(7) << "bh_read_finish no object cache" << dendl; 
- } else { - Object *ob = objects[oid]; - - // apply to bh's! - off_t opos = start; - map::iterator p = ob->data.lower_bound(opos); - - while (p != ob->data.end() && - opos < start+(off_t)length) { - BufferHead *bh = p->second; - - if (bh->start() > opos) { - dout(1) << "weirdness: gap when applying read results, " - << opos << "~" << bh->start() - opos - << dendl; - opos = bh->start(); - continue; - } - - if (!bh->is_rx()) { - dout(10) << "bh_read_finish skipping non-rx " << *bh << dendl; - opos = bh->end(); - p++; - continue; - } - - assert(opos >= bh->start()); - assert(bh->start() == opos); // we don't merge rx bh's... yet! - assert(bh->length() <= start+(off_t)length-opos); - - bh->bl.substr_of(bl, - opos-bh->start(), - bh->length()); - mark_clean(bh); - dout(10) << "bh_read_finish read " << *bh << dendl; - - opos = bh->end(); - p++; - - // finishers? - // called with lock held. - list ls; - for (map >::iterator p = bh->waitfor_read.begin(); - p != bh->waitfor_read.end(); - p++) - ls.splice(ls.end(), p->second); - bh->waitfor_read.clear(); - finish_contexts(ls); - - // clean up? 
- ob->try_merge_bh(bh); - } - } - //lock.Unlock(); -} - - -void ObjectCacher::bh_write(BufferHead *bh) -{ - dout(7) << "bh_write " << *bh << dendl; - - // finishers - C_WriteAck *onack = new C_WriteAck(this, bh->ob->get_oid(), bh->start(), bh->length()); - C_WriteCommit *oncommit = new C_WriteCommit(this, bh->ob->get_oid(), bh->start(), bh->length()); - - // go - tid_t tid = objecter->write(bh->ob->get_oid(), bh->start(), bh->length(), bh->ob->get_layout(), bh->bl, - onack, oncommit); - - // set bh last_write_tid - onack->tid = tid; - oncommit->tid = tid; - bh->ob->last_write_tid = tid; - bh->last_write_tid = tid; - - mark_tx(bh); -} - -void ObjectCacher::lock_ack(list& oids, tid_t tid) -{ - for (list::iterator i = oids.begin(); - i != oids.end(); - i++) { - object_t oid = *i; - - if (objects.count(oid) == 0) { - dout(7) << "lock_ack no object cache" << dendl; - assert(0); - } - - Object *ob = objects[oid]; - - list ls; - - assert(tid <= ob->last_write_tid); - if (ob->last_write_tid == tid) { - dout(10) << "lock_ack " << *ob - << " tid " << tid << dendl; - - switch (ob->lock_state) { - case Object::LOCK_RDUNLOCKING: - case Object::LOCK_WRUNLOCKING: - ob->lock_state = Object::LOCK_NONE; - break; - case Object::LOCK_RDLOCKING: - case Object::LOCK_DOWNGRADING: - ob->lock_state = Object::LOCK_RDLOCK; - ls.splice(ls.begin(), ob->waitfor_rd); - break; - case Object::LOCK_UPGRADING: - case Object::LOCK_WRLOCKING: - ob->lock_state = Object::LOCK_WRLOCK; - ls.splice(ls.begin(), ob->waitfor_wr); - ls.splice(ls.begin(), ob->waitfor_rd); - break; - - default: - assert(0); - } - - ob->last_ack_tid = tid; - - if (ob->can_close()) - close_object(ob); - } else { - dout(10) << "lock_ack " << *ob - << " tid " << tid << " obsolete" << dendl; - } - - // waiters? 
- if (ob->waitfor_ack.count(tid)) { - ls.splice(ls.end(), ob->waitfor_ack[tid]); - ob->waitfor_ack.erase(tid); - } - - finish_contexts(ls); - - } -} - -void ObjectCacher::bh_write_ack(object_t oid, off_t start, size_t length, tid_t tid) -{ - //lock.Lock(); - - dout(7) << "bh_write_ack " - << oid - << " tid " << tid - << " " << start << "~" << length - << dendl; - if (objects.count(oid) == 0) { - dout(7) << "bh_write_ack no object cache" << dendl; - assert(0); - } else { - Object *ob = objects[oid]; - - // apply to bh's! - for (map::iterator p = ob->data.lower_bound(start); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - - if (bh->start() > start+(off_t)length) break; - - if (bh->start() < start && - bh->end() > start+(off_t)length) { - dout(20) << "bh_write_ack skipping " << *bh << dendl; - continue; - } - - // make sure bh is tx - if (!bh->is_tx()) { - dout(10) << "bh_write_ack skipping non-tx " << *bh << dendl; - continue; - } - - // make sure bh tid matches - if (bh->last_write_tid != tid) { - assert(bh->last_write_tid > tid); - dout(10) << "bh_write_ack newer tid on " << *bh << dendl; - continue; - } - - // ok! mark bh clean. - mark_clean(bh); - dout(10) << "bh_write_ack clean " << *bh << dendl; - } - - // update object last_ack. - assert(ob->last_ack_tid < tid); - ob->last_ack_tid = tid; - - // waiters? - if (ob->waitfor_ack.count(tid)) { - list ls; - ls.splice(ls.begin(), ob->waitfor_ack[tid]); - ob->waitfor_ack.erase(tid); - finish_contexts(ls); - } - } - //lock.Unlock(); -} - -void ObjectCacher::bh_write_commit(object_t oid, off_t start, size_t length, tid_t tid) -{ - //lock.Lock(); - - // update object last_commit - dout(7) << "bh_write_commit " - << oid - << " tid " << tid - << " " << start << "~" << length - << dendl; - if (objects.count(oid) == 0) { - dout(7) << "bh_write_commit no object cache" << dendl; - //assert(0); - } else { - Object *ob = objects[oid]; - - // update last_commit. - ob->last_commit_tid = tid; - - // waiters? 
- if (ob->waitfor_commit.count(tid)) { - list ls; - ls.splice(ls.begin(), ob->waitfor_commit[tid]); - ob->waitfor_commit.erase(tid); - finish_contexts(ls); - } - } - - // lock.Unlock(); -} - - -void ObjectCacher::flush(off_t amount) -{ - utime_t cutoff = g_clock.now(); - //cutoff.sec_ref() -= g_conf.client_oc_max_dirty_age; - - dout(10) << "flush " << amount << dendl; - - /* - * NOTE: we aren't actually pulling things off the LRU here, just looking at the - * tail item. Then we call bh_write, which moves it to the other LRU, so that we - * can call lru_dirty.lru_get_next_expire() again. - */ - off_t did = 0; - while (amount == 0 || did < amount) { - BufferHead *bh = (BufferHead*) lru_dirty.lru_get_next_expire(); - if (!bh) break; - if (bh->last_write > cutoff) break; - - did += bh->length(); - bh_write(bh); - } -} - - -void ObjectCacher::trim(off_t max) -{ - if (max < 0) - max = g_conf.client_oc_size; - - dout(10) << "trim start: max " << max - << " clean " << get_stat_clean() - << dendl; - - while (get_stat_clean() > max) { - BufferHead *bh = (BufferHead*) lru_rest.lru_expire(); - if (!bh) break; - - dout(10) << "trim trimming " << *bh << dendl; - assert(bh->is_clean()); - - Object *ob = bh->ob; - bh_remove(ob, bh); - delete bh; - - if (ob->can_close()) { - dout(10) << "trim trimming " << *ob << dendl; - close_object(ob); - } - } - - dout(10) << "trim finish: max " << max - << " clean " << get_stat_clean() - << dendl; -} - - - -/* public */ - -/* - * returns # bytes read (if in cache). 
onfinish is untouched (caller must delete it) - * returns 0 if doing async read - */ -int ObjectCacher::readx(Objecter::OSDRead *rd, inodeno_t ino, Context *onfinish) -{ - bool success = true; - list hit_ls; - map stripe_map; // final buffer offset -> substring - - for (list::iterator ex_it = rd->extents.begin(); - ex_it != rd->extents.end(); - ex_it++) { - dout(10) << "readx " << *ex_it << dendl; - - // get Object cache - Object *o = get_object(ex_it->oid, ino, ex_it->layout); - - // map extent into bufferheads - map hits, missing, rx; - o->map_read(rd, hits, missing, rx); - - if (!missing.empty() || !rx.empty()) { - // read missing - for (map::iterator bh_it = missing.begin(); - bh_it != missing.end(); - bh_it++) { - bh_read(bh_it->second); - if (success) { - dout(10) << "readx missed, waiting on " << *bh_it->second - << " off " << bh_it->first << dendl; - success = false; - bh_it->second->waitfor_read[bh_it->first].push_back( new C_RetryRead(this, rd, ino, onfinish) ); - } - } - - // bump rx - for (map::iterator bh_it = rx.begin(); - bh_it != rx.end(); - bh_it++) { - touch_bh(bh_it->second); // bump in lru, so we don't lose it. - if (success) { - dout(10) << "readx missed, waiting on " << *bh_it->second - << " off " << bh_it->first << dendl; - success = false; - bh_it->second->waitfor_read[bh_it->first].push_back( new C_RetryRead(this, rd, ino, onfinish) ); - } - } - } else { - assert(!hits.empty()); - - // make a plain list - for (map::iterator bh_it = hits.begin(); - bh_it != hits.end(); - bh_it++) { - dout(10) << "readx hit bh " << *bh_it->second << dendl; - hit_ls.push_back(bh_it->second); - } - - // create reverse map of buffer offset -> object for the eventual result. 
- // this is over a single ObjectExtent, so we know that - // - the bh's are contiguous - // - the buffer frags need not be (and almost certainly aren't) - off_t opos = ex_it->start; - map::iterator bh_it = hits.begin(); - assert(bh_it->second->start() <= opos); - size_t bhoff = opos - bh_it->second->start(); - map::iterator f_it = ex_it->buffer_extents.begin(); - size_t foff = 0; - while (1) { - BufferHead *bh = bh_it->second; - assert(opos == (off_t)(bh->start() + bhoff)); - - dout(10) << "readx rmap opos " << opos - << ": " << *bh << " +" << bhoff - << " frag " << f_it->first << "~" << f_it->second << " +" << foff - << dendl; - - size_t len = MIN(f_it->second - foff, - bh->length() - bhoff); - bufferlist bit; // put substr here first, since substr_of clobbers, and - // we may get multiple bh's at this stripe_map position - bit.substr_of(bh->bl, - opos - bh->start(), - len); - stripe_map[f_it->first].claim_append(bit); - - opos += len; - bhoff += len; - foff += len; - if (opos == bh->end()) { - bh_it++; - bhoff = 0; - } - if (foff == f_it->second) { - f_it++; - foff = 0; - } - if (bh_it == hits.end()) break; - if (f_it == ex_it->buffer_extents.end()) break; - } - assert(f_it == ex_it->buffer_extents.end()); - assert(opos == ex_it->start + (off_t)ex_it->length); - } - } - - // bump hits in lru - for (list::iterator bhit = hit_ls.begin(); - bhit != hit_ls.end(); - bhit++) - touch_bh(*bhit); - - if (!success) return 0; // wait! - - // no misses... success! do the read. - assert(!hit_ls.empty()); - dout(10) << "readx has all buffers" << dendl; - - // ok, assemble into result buffer. 
- rd->bl->clear(); - size_t pos = 0; - for (map::iterator i = stripe_map.begin(); - i != stripe_map.end(); - i++) { - assert(pos == i->first); - dout(10) << "readx adding buffer len " << i->second.length() << " at " << pos << dendl; - pos += i->second.length(); - rd->bl->claim_append(i->second); - assert(rd->bl->length() == pos); - } - dout(10) << "readx result is " << rd->bl->length() << dendl; - - // done with read. - delete rd; - - trim(); - - return pos; -} - - -int ObjectCacher::writex(Objecter::OSDWrite *wr, inodeno_t ino) -{ - utime_t now = g_clock.now(); - - for (list::iterator ex_it = wr->extents.begin(); - ex_it != wr->extents.end(); - ex_it++) { - // get object cache - Object *o = get_object(ex_it->oid, ino, ex_it->layout); - - // map it all into a single bufferhead. - BufferHead *bh = o->map_write(wr); - - // adjust buffer pointers (ie "copy" data into my cache) - // this is over a single ObjectExtent, so we know that - // - there is one contiguous bh - // - the buffer frags need not be (and almost certainly aren't) - // note: i assume striping is monotonic... no jumps backwards, ever! - off_t opos = ex_it->start; - for (map::iterator f_it = ex_it->buffer_extents.begin(); - f_it != ex_it->buffer_extents.end(); - f_it++) { - dout(10) << "writex writing " << f_it->first << "~" << f_it->second << " into " << *bh << " at " << opos << dendl; - size_t bhoff = bh->start() - opos; - assert(f_it->second <= bh->length() - bhoff); - - // get the frag we're mapping in - bufferlist frag; - frag.substr_of(wr->bl, - f_it->first, f_it->second); - - // keep anything left of bhoff - bufferlist newbl; - if (bhoff) - newbl.substr_of(bh->bl, 0, bhoff); - newbl.claim_append(frag); - bh->bl.swap(newbl); - - opos += f_it->second; - } - - // ok, now bh is dirty. - mark_dirty(bh); - touch_bh(bh); - bh->last_write = now; - - o->try_merge_bh(bh); - } - - delete wr; - - trim(); - return 0; -} - - -// blocking wait for write. 
-void ObjectCacher::wait_for_write(size_t len, Mutex& lock) -{ - while (get_stat_dirty() + get_stat_tx() >= g_conf.client_oc_max_dirty) { - dout(10) << "wait_for_write waiting on " << len << ", dirty|tx " - << (get_stat_dirty() + get_stat_tx()) - << " >= " << g_conf.client_oc_max_dirty - << dendl; - flusher_cond.Signal(); - stat_waiter++; - stat_cond.Wait(lock); - stat_waiter--; - dout(10) << "wait_for_write woke up" << dendl; - } -} - -void ObjectCacher::flusher_entry() -{ - dout(10) << "flusher start" << dendl; - lock.Lock(); - while (!flusher_stop) { - while (!flusher_stop) { - off_t all = get_stat_tx() + get_stat_rx() + get_stat_clean() + get_stat_dirty(); - dout(11) << "flusher " - << all << " / " << g_conf.client_oc_size << ": " - << get_stat_tx() << " tx, " - << get_stat_rx() << " rx, " - << get_stat_clean() << " clean, " - << get_stat_dirty() << " / " << g_conf.client_oc_max_dirty << " dirty" - << dendl; - if (get_stat_dirty() > g_conf.client_oc_max_dirty) { - // flush some dirty pages - dout(10) << "flusher " - << get_stat_dirty() << " / " << g_conf.client_oc_max_dirty << " dirty," - << " flushing some dirty bhs" << dendl; - flush(get_stat_dirty() - g_conf.client_oc_max_dirty); - } - else { - // check tail of lru for old dirty items - utime_t cutoff = g_clock.now(); - cutoff.sec_ref()--; - BufferHead *bh = 0; - while ((bh = (BufferHead*)lru_dirty.lru_get_next_expire()) != 0 && - bh->last_write < cutoff) { - dout(10) << "flusher flushing aged dirty bh " << *bh << dendl; - bh_write(bh); - } - break; - } - } - if (flusher_stop) break; - flusher_cond.WaitInterval(lock, utime_t(1,0)); - } - lock.Unlock(); - dout(10) << "flusher finish" << dendl; -} - - - -// blocking. atomic+sync. -int ObjectCacher::atomic_sync_readx(Objecter::OSDRead *rd, inodeno_t ino, Mutex& lock) -{ - dout(10) << "atomic_sync_readx " << rd - << " in " << ino - << dendl; - - if (rd->extents.size() == 1) { - // single object. - // just write synchronously. 
- Cond cond; - bool done = false; - objecter->readx(rd, new C_SafeCond(&lock, &cond, &done)); - - // block - while (!done) cond.Wait(lock); - } else { - // spans multiple objects, or is big. - - // sort by object... - map by_oid; - for (list::iterator ex_it = rd->extents.begin(); - ex_it != rd->extents.end(); - ex_it++) - by_oid[ex_it->oid] = *ex_it; - - // lock - for (map::iterator i = by_oid.begin(); - i != by_oid.end(); - i++) { - Object *o = get_object(i->first, ino, i->second.layout); - rdlock(o); - } - - // readx will hose rd - list extents = rd->extents; - - // do the read, into our cache - Cond cond; - bool done = false; - readx(rd, ino, new C_SafeCond(&lock, &cond, &done)); - - // block - while (!done) cond.Wait(lock); - - // release the locks - for (list::iterator ex_it = extents.begin(); - ex_it != extents.end(); - ex_it++) { - assert(objects.count(ex_it->oid)); - Object *o = objects[ex_it->oid]; - rdunlock(o); - } - } - - return 0; -} - -int ObjectCacher::atomic_sync_writex(Objecter::OSDWrite *wr, inodeno_t ino, Mutex& lock) -{ - dout(10) << "atomic_sync_writex " << wr - << " in " << ino - << dendl; - - if (wr->extents.size() == 1 && - wr->extents.front().length <= g_conf.client_oc_max_sync_write) { - // single object. - - // make sure we aren't already locking/locked... - object_t oid = wr->extents.front().oid; - Object *o = 0; - if (objects.count(oid)) o = get_object(oid, ino, wr->extents.front().layout); - if (!o || - (o->lock_state != Object::LOCK_WRLOCK && - o->lock_state != Object::LOCK_WRLOCKING && - o->lock_state != Object::LOCK_UPGRADING)) { - // just write synchronously. - dout(10) << "atomic_sync_writex " << wr - << " in " << ino - << " doing sync write" - << dendl; - - Cond cond; - bool done = false; - objecter->modifyx(wr, new C_SafeCond(&lock, &cond, &done), 0); - - // block - while (!done) cond.Wait(lock); - return 0; - } - } - - // spans multiple objects, or is big. - // sort by object... 
- map by_oid; - for (list::iterator ex_it = wr->extents.begin(); - ex_it != wr->extents.end(); - ex_it++) - by_oid[ex_it->oid] = *ex_it; - - // wrlock - for (map::iterator i = by_oid.begin(); - i != by_oid.end(); - i++) { - Object *o = get_object(i->first, ino, i->second.layout); - wrlock(o); - } - - // writex will hose wr - list extents = wr->extents; - - // do the write, into our cache - writex(wr, ino); - - // flush - // ...and release the locks? - for (list::iterator ex_it = extents.begin(); - ex_it != extents.end(); - ex_it++) { - assert(objects.count(ex_it->oid)); - Object *o = objects[ex_it->oid]; - - wrunlock(o); - } - - return 0; -} - - - -// locking ----------------------------- - -void ObjectCacher::rdlock(Object *o) -{ - // lock? - if (o->lock_state == Object::LOCK_NONE || - o->lock_state == Object::LOCK_RDUNLOCKING || - o->lock_state == Object::LOCK_WRUNLOCKING) { - dout(10) << "rdlock rdlock " << *o << dendl; - - o->lock_state = Object::LOCK_RDLOCKING; - - C_LockAck *ack = new C_LockAck(this, o->get_oid()); - C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); - - commit->tid = - ack->tid = - o->last_write_tid = - objecter->lock(OSD_OP_RDLOCK, o->get_oid(), o->get_layout(), ack, commit); - } - - // stake our claim. - o->rdlock_ref++; - - // wait? - if (o->lock_state == Object::LOCK_RDLOCKING || - o->lock_state == Object::LOCK_WRLOCKING) { - dout(10) << "rdlock waiting for rdlock|wrlock on " << *o << dendl; - Cond cond; - bool done = false; - o->waitfor_rd.push_back(new C_SafeCond(&lock, &cond, &done)); - while (!done) cond.Wait(lock); - } - assert(o->lock_state == Object::LOCK_RDLOCK || - o->lock_state == Object::LOCK_WRLOCK || - o->lock_state == Object::LOCK_UPGRADING || - o->lock_state == Object::LOCK_DOWNGRADING); -} - -void ObjectCacher::wrlock(Object *o) -{ - // lock? 
- if (o->lock_state != Object::LOCK_WRLOCK && - o->lock_state != Object::LOCK_WRLOCKING && - o->lock_state != Object::LOCK_UPGRADING) { - dout(10) << "wrlock wrlock " << *o << dendl; - - int op = 0; - if (o->lock_state == Object::LOCK_RDLOCK) { - o->lock_state = Object::LOCK_UPGRADING; - op = OSD_OP_UPLOCK; - } else { - o->lock_state = Object::LOCK_WRLOCKING; - op = OSD_OP_WRLOCK; - } - - C_LockAck *ack = new C_LockAck(this, o->get_oid()); - C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); - - commit->tid = - ack->tid = - o->last_write_tid = - objecter->lock(op, o->get_oid(), o->get_layout(), ack, commit); - } - - // stake our claim. - o->wrlock_ref++; - - // wait? - if (o->lock_state == Object::LOCK_WRLOCKING || - o->lock_state == Object::LOCK_UPGRADING) { - dout(10) << "wrlock waiting for wrlock on " << *o << dendl; - Cond cond; - bool done = false; - o->waitfor_wr.push_back(new C_SafeCond(&lock, &cond, &done)); - while (!done) cond.Wait(lock); - } - assert(o->lock_state == Object::LOCK_WRLOCK); -} - - -void ObjectCacher::rdunlock(Object *o) -{ - dout(10) << "rdunlock " << *o << dendl; - assert(o->lock_state == Object::LOCK_RDLOCK || - o->lock_state == Object::LOCK_WRLOCK || - o->lock_state == Object::LOCK_UPGRADING || - o->lock_state == Object::LOCK_DOWNGRADING); - - assert(o->rdlock_ref > 0); - o->rdlock_ref--; - if (o->rdlock_ref > 0 || - o->wrlock_ref > 0) { - dout(10) << "rdunlock " << *o << " still has rdlock|wrlock refs" << dendl; - return; - } - - release(o); // release first - - o->lock_state = Object::LOCK_RDUNLOCKING; - - C_LockAck *lockack = new C_LockAck(this, o->get_oid()); - C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); - commit->tid = - lockack->tid = - o->last_write_tid = - objecter->lock(OSD_OP_RDUNLOCK, o->get_oid(), o->get_layout(), lockack, commit); -} - -void ObjectCacher::wrunlock(Object *o) -{ - dout(10) << "wrunlock " << *o << dendl; - assert(o->lock_state == Object::LOCK_WRLOCK); - - 
assert(o->wrlock_ref > 0); - o->wrlock_ref--; - if (o->wrlock_ref > 0) { - dout(10) << "wrunlock " << *o << " still has wrlock refs" << dendl; - return; - } - - flush(o); // flush first - - int op = 0; - if (o->rdlock_ref > 0) { - dout(10) << "wrunlock rdlock " << *o << dendl; - op = OSD_OP_DNLOCK; - o->lock_state = Object::LOCK_DOWNGRADING; - } else { - dout(10) << "wrunlock wrunlock " << *o << dendl; - op = OSD_OP_WRUNLOCK; - o->lock_state = Object::LOCK_WRUNLOCKING; - } - - C_LockAck *lockack = new C_LockAck(this, o->get_oid()); - C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); - commit->tid = - lockack->tid = - o->last_write_tid = - objecter->lock(op, o->get_oid(), o->get_layout(), lockack, commit); -} - - -// ------------------------------------------------- - - -bool ObjectCacher::set_is_cached(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) - return false; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - if (!ob->data.empty()) return true; - } - - return false; -} - -bool ObjectCacher::set_is_dirty_or_committing(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) - return false; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - for (map::iterator p = ob->data.begin(); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - if (bh->is_dirty() || bh->is_tx()) - return true; - } - } - - return false; -} - - -// purge. non-blocking. violently removes dirty buffers from cache. 
-void ObjectCacher::purge(Object *ob) -{ - dout(10) << "purge " << *ob << dendl; - - for (map::iterator p = ob->data.begin(); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - if (!bh->is_clean()) - dout(0) << "purge forcibly removing " << *ob << " " << *bh << dendl; - bh_remove(ob, bh); - delete bh; - } - - if (ob->can_close()) { - dout(10) << "trim trimming " << *ob << dendl; - close_object(ob); - } -} - -// flush. non-blocking. no callback. -// true if clean, already flushed. -// false if we wrote something. -bool ObjectCacher::flush(Object *ob) -{ - bool clean = true; - for (map::iterator p = ob->data.begin(); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - if (bh->is_tx()) { - clean = false; - continue; - } - if (!bh->is_dirty()) continue; - - bh_write(bh); - clean = false; - } - return clean; -} - -// flush. non-blocking, takes callback. -// returns true if already flushed -bool ObjectCacher::flush_set(inodeno_t ino, Context *onfinish) -{ - if (objects_by_ino.count(ino) == 0) { - dout(10) << "flush_set on " << ino << " dne" << dendl; - return true; - } - - dout(10) << "flush_set " << ino << dendl; - - C_Gather *gather = 0; // we'll need to wait for all objects to flush! - - set& s = objects_by_ino[ino]; - bool safe = true; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - if (!flush(ob)) { - // we'll need to gather... - if (!gather && onfinish) - gather = new C_Gather(onfinish); - safe = false; - - dout(10) << "flush_set " << ino << " will wait for ack tid " - << ob->last_write_tid - << " on " << *ob - << dendl; - if (gather) - ob->waitfor_ack[ob->last_write_tid].push_back(gather->new_sub()); - } - } - - if (safe) { - dout(10) << "flush_set " << ino << " has no dirty|tx bhs" << dendl; - return true; - } - return false; -} - - -// commit. non-blocking, takes callback. -// return true if already flushed. 
-bool ObjectCacher::commit_set(inodeno_t ino, Context *onfinish) -{ - assert(onfinish); // doesn't make any sense otherwise. - - if (objects_by_ino.count(ino) == 0) { - dout(10) << "commit_set on " << ino << " dne" << dendl; - return true; - } - - dout(10) << "commit_set " << ino << dendl; - - C_Gather *gather = 0; // we'll need to wait for all objects to commit - - set& s = objects_by_ino[ino]; - bool safe = true; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - // make sure it's flushing. - flush_set(ino); - - if (ob->last_write_tid > ob->last_commit_tid) { - dout(10) << "commit_set " << ino << " " << *ob - << " will finish on commit tid " << ob->last_write_tid - << dendl; - if (!gather && onfinish) gather = new C_Gather(onfinish); - safe = false; - if (gather) - ob->waitfor_commit[ob->last_write_tid].push_back( gather->new_sub() ); - } - } - - if (safe) { - dout(10) << "commit_set " << ino << " all committed" << dendl; - return true; - } - return false; -} - -void ObjectCacher::purge_set(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) { - dout(10) << "purge_set on " << ino << " dne" << dendl; - return; - } - - dout(10) << "purge_set " << ino << dendl; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - purge(ob); - } -} - - -off_t ObjectCacher::release(Object *ob) -{ - list clean; - off_t o_unclean = 0; - - for (map::iterator p = ob->data.begin(); - p != ob->data.end(); - p++) { - BufferHead *bh = p->second; - if (bh->is_clean()) - clean.push_back(bh); - else - o_unclean += bh->length(); - } - - for (list::iterator p = clean.begin(); - p != clean.end(); - p++) { - bh_remove(ob, *p); - delete *p; - } - - if (ob->can_close()) { - dout(10) << "trim trimming " << *ob << dendl; - close_object(ob); - } - - return o_unclean; -} - -off_t ObjectCacher::release_set(inodeno_t ino) -{ - // return # bytes not clean (and thus not released). 
- off_t unclean = 0; - - if (objects_by_ino.count(ino) == 0) { - dout(10) << "release_set on " << ino << " dne" << dendl; - return 0; - } - - dout(10) << "release_set " << ino << dendl; - - set s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - off_t o_unclean = release(ob); - unclean += o_unclean; - - if (o_unclean) - dout(10) << "release_set " << ino << " " << *ob - << " has " << o_unclean << " bytes left" - << dendl; - - } - - if (unclean) { - dout(10) << "release_set " << ino - << ", " << unclean << " bytes left" << dendl; - } - - return unclean; -} - -void ObjectCacher::truncate_set(inodeno_t ino, list& exls) -{ - if (objects_by_ino.count(ino) == 0) { - dout(10) << "truncate_set on " << ino << " dne" << dendl; - return; - } - - dout(10) << "truncate_set " << ino << dendl; - - for (list::iterator p = exls.begin(); - p != exls.end(); - ++p) { - ObjectExtent &ex = *p; - if (objects.count(ex.oid) == 0) continue; - Object *ob = objects[ex.oid]; - - // purge or truncate? 
- if (ex.start == 0) { - dout(10) << "truncate_set purging " << *ob << dendl; - purge(ob); - } else { - // hrm, truncate object - dout(10) << "truncate_set truncating " << *ob << " at " << ex.start << dendl; - ob->truncate(ex.start); - - if (ob->can_close()) { - dout(10) << "truncate_set trimming " << *ob << dendl; - close_object(ob); - } - } - } -} - - -void ObjectCacher::kick_sync_writers(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) { - dout(10) << "kick_sync_writers on " << ino << " dne" << dendl; - return; - } - - dout(10) << "kick_sync_writers on " << ino << dendl; - - list ls; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - ls.splice(ls.begin(), ob->waitfor_wr); - } - - finish_contexts(ls); -} - -void ObjectCacher::kick_sync_readers(inodeno_t ino) -{ - if (objects_by_ino.count(ino) == 0) { - dout(10) << "kick_sync_readers on " << ino << " dne" << dendl; - return; - } - - dout(10) << "kick_sync_readers on " << ino << dendl; - - list ls; - - set& s = objects_by_ino[ino]; - for (set::iterator i = s.begin(); - i != s.end(); - i++) { - Object *ob = *i; - - ls.splice(ls.begin(), ob->waitfor_rd); - } - - finish_contexts(ls); -} - - - diff --git a/trunk/ceph/osdc/ObjectCacher.h b/trunk/ceph/osdc/ObjectCacher.h deleted file mode 100644 index f1d057beef99c..0000000000000 --- a/trunk/ceph/osdc/ObjectCacher.h +++ /dev/null @@ -1,566 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -#ifndef __OBJECTCACHER_H_ -#define __OBJECTCACHER_H_ - -#include "include/types.h" -#include "include/lru.h" -#include "include/Context.h" - -#include "common/Cond.h" -#include "common/Thread.h" - -#include "Objecter.h" -#include "Filer.h" - -class Objecter; -class Objecter::OSDRead; -class Objecter::OSDWrite; - -class ObjectCacher { - public: - - class Object; - - // ******* BufferHead ********* - class BufferHead : public LRUObject { - public: - // 
states - static const int STATE_MISSING = 0; - static const int STATE_CLEAN = 1; - static const int STATE_DIRTY = 2; - static const int STATE_RX = 3; - static const int STATE_TX = 4; - - private: - // my fields - int state; - int ref; - struct { - off_t start, length; // bh extent in object - } ex; - - public: - Object *ob; - bufferlist bl; - tid_t last_write_tid; // version of bh (if non-zero) - utime_t last_write; - - map< off_t, list > waitfor_read; - - public: - // cons - BufferHead(Object *o) : - state(STATE_MISSING), - ref(0), - ob(o), - last_write_tid(0) {} - - // extent - off_t start() { return ex.start; } - void set_start(off_t s) { ex.start = s; } - off_t length() { return ex.length; } - void set_length(off_t l) { ex.length = l; } - off_t end() { return ex.start + ex.length; } - off_t last() { return end() - 1; } - - // states - void set_state(int s) { - if (s == STATE_RX || s == STATE_TX) get(); - if (state == STATE_RX || state == STATE_TX) put(); - state = s; - } - int get_state() { return state; } - - bool is_missing() { return state == STATE_MISSING; } - bool is_dirty() { return state == STATE_DIRTY; } - bool is_clean() { return state == STATE_CLEAN; } - bool is_tx() { return state == STATE_TX; } - bool is_rx() { return state == STATE_RX; } - - // reference counting - int get() { - assert(ref >= 0); - if (ref == 0) lru_pin(); - return ++ref; - } - int put() { - assert(ref > 0); - if (ref == 1) lru_unpin(); - --ref; - return ref; - } - }; - - - // ******* Object ********* - class Object { - private: - // ObjectCacher::Object fields - ObjectCacher *oc; - object_t oid; // this _always_ is oid.rev=0 - inodeno_t ino; - objectrev_t rev; // last rev we're written - ObjectLayout layout; - - public: - map data; - - tid_t last_write_tid; // version of bh (if non-zero) - tid_t last_ack_tid; // last update acked. - tid_t last_commit_tid; // last update commited. 
- - map< tid_t, list > waitfor_ack; - map< tid_t, list > waitfor_commit; - list waitfor_rd; - list waitfor_wr; - - // lock - static const int LOCK_NONE = 0; - static const int LOCK_WRLOCKING = 1; - static const int LOCK_WRLOCK = 2; - static const int LOCK_WRUNLOCKING = 3; - static const int LOCK_RDLOCKING = 4; - static const int LOCK_RDLOCK = 5; - static const int LOCK_RDUNLOCKING = 6; - static const int LOCK_UPGRADING = 7; // rd -> wr - static const int LOCK_DOWNGRADING = 8; // wr -> rd - int lock_state; - int wrlock_ref; // how many ppl want or are using a WRITE lock - int rdlock_ref; // how many ppl want or are using a READ lock - - public: - Object(ObjectCacher *_oc, object_t o, inodeno_t i, ObjectLayout& l) : - oc(_oc), - oid(o), ino(i), layout(l), - last_write_tid(0), last_ack_tid(0), last_commit_tid(0), - lock_state(LOCK_NONE), wrlock_ref(0), rdlock_ref(0) - {} - ~Object() { - assert(data.empty()); - } - - object_t get_oid() { return oid; } - inodeno_t get_ino() { return ino; } - - ObjectLayout& get_layout() { return layout; } - void set_layout(ObjectLayout& l) { layout = l; } - - bool can_close() { - return data.empty() && lock_state == LOCK_NONE && - waitfor_ack.empty() && waitfor_commit.empty() && - waitfor_rd.empty() && waitfor_wr.empty(); - } - - // bh - void add_bh(BufferHead *bh) { - // add to my map - assert(data.count(bh->start()) == 0); - - if (0) { // sanity check FIXME DEBUG - //cout << "add_bh " << bh->start() << "~" << bh->length() << endl; - map::iterator p = data.lower_bound(bh->start()); - if (p != data.end()) { - //cout << " after " << *p->second << endl; - //cout << " after starts at " << p->first << endl; - assert(p->first >= bh->end()); - } - if (p != data.begin()) { - p--; - //cout << " before starts at " << p->second->start() - //<< " and ends at " << p->second->end() << endl; - //cout << " before " << *p->second << endl; - assert(p->second->end() <= bh->start()); - } - } - - data[bh->start()] = bh; - } - void remove_bh(BufferHead *bh) 
{ - assert(data.count(bh->start())); - data.erase(bh->start()); - } - bool is_empty() { return data.empty(); } - - // mid-level - BufferHead *split(BufferHead *bh, off_t off); - void merge_left(BufferHead *left, BufferHead *right); - void try_merge_bh(BufferHead *bh); - - int map_read(Objecter::OSDRead *rd, - map& hits, - map& missing, - map& rx); - BufferHead *map_write(Objecter::OSDWrite *wr); - - void truncate(off_t s); - - }; - - // ******* ObjectCacher ********* - // ObjectCacher fields - public: - Objecter *objecter; - Filer filer; - - private: - Mutex& lock; - - hash_map objects; - hash_map > objects_by_ino; - - set dirty_bh; - LRU lru_dirty, lru_rest; - - Cond flusher_cond; - bool flusher_stop; - void flusher_entry(); - class FlusherThread : public Thread { - ObjectCacher *oc; - public: - FlusherThread(ObjectCacher *o) : oc(o) {} - void *entry() { - oc->flusher_entry(); - return 0; - } - } flusher_thread; - - - // objects - Object *get_object(object_t oid, inodeno_t ino, ObjectLayout &l) { - // have it? - if (objects.count(oid)) - return objects[oid]; - - // create it. 
- Object *o = new Object(this, oid, ino, l); - objects[oid] = o; - objects_by_ino[ino].insert(o); - return o; - } - void close_object(Object *ob); - - // bh stats - Cond stat_cond; - int stat_waiter; - - off_t stat_clean; - off_t stat_dirty; - off_t stat_rx; - off_t stat_tx; - off_t stat_missing; - - void bh_stat_add(BufferHead *bh) { - switch (bh->get_state()) { - case BufferHead::STATE_MISSING: stat_missing += bh->length(); break; - case BufferHead::STATE_CLEAN: stat_clean += bh->length(); break; - case BufferHead::STATE_DIRTY: stat_dirty += bh->length(); break; - case BufferHead::STATE_TX: stat_tx += bh->length(); break; - case BufferHead::STATE_RX: stat_rx += bh->length(); break; - } - if (stat_waiter) stat_cond.Signal(); - } - void bh_stat_sub(BufferHead *bh) { - switch (bh->get_state()) { - case BufferHead::STATE_MISSING: stat_missing -= bh->length(); break; - case BufferHead::STATE_CLEAN: stat_clean -= bh->length(); break; - case BufferHead::STATE_DIRTY: stat_dirty -= bh->length(); break; - case BufferHead::STATE_TX: stat_tx -= bh->length(); break; - case BufferHead::STATE_RX: stat_rx -= bh->length(); break; - } - } - off_t get_stat_tx() { return stat_tx; } - off_t get_stat_rx() { return stat_rx; } - off_t get_stat_dirty() { return stat_dirty; } - off_t get_stat_clean() { return stat_clean; } - - void touch_bh(BufferHead *bh) { - if (bh->is_dirty()) - lru_dirty.lru_touch(bh); - else - lru_rest.lru_touch(bh); - } - - // bh states - void bh_set_state(BufferHead *bh, int s) { - // move between lru lists? 
- if (s == BufferHead::STATE_DIRTY && bh->get_state() != BufferHead::STATE_DIRTY) { - lru_rest.lru_remove(bh); - lru_dirty.lru_insert_top(bh); - dirty_bh.insert(bh); - } - if (s != BufferHead::STATE_DIRTY && bh->get_state() == BufferHead::STATE_DIRTY) { - lru_dirty.lru_remove(bh); - lru_rest.lru_insert_mid(bh); - dirty_bh.erase(bh); - } - - // set state - bh_stat_sub(bh); - bh->set_state(s); - bh_stat_add(bh); - } - - void copy_bh_state(BufferHead *bh1, BufferHead *bh2) { - bh_set_state(bh2, bh1->get_state()); - } - - void mark_missing(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_MISSING); }; - void mark_clean(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_CLEAN); }; - void mark_rx(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_RX); }; - void mark_tx(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_TX); }; - void mark_dirty(BufferHead *bh) { - bh_set_state(bh, BufferHead::STATE_DIRTY); - lru_dirty.lru_touch(bh); - //bh->set_dirty_stamp(g_clock.now()); - }; - - void bh_add(Object *ob, BufferHead *bh) { - ob->add_bh(bh); - if (bh->is_dirty()) { - lru_dirty.lru_insert_top(bh); - dirty_bh.insert(bh); - } else { - lru_rest.lru_insert_top(bh); - } - bh_stat_add(bh); - } - void bh_remove(Object *ob, BufferHead *bh) { - ob->remove_bh(bh); - if (bh->is_dirty()) { - lru_dirty.lru_remove(bh); - dirty_bh.erase(bh); - } else { - lru_rest.lru_remove(bh); - } - bh_stat_sub(bh); - } - - // io - void bh_read(BufferHead *bh); - void bh_write(BufferHead *bh); - - void trim(off_t max=-1); - void flush(off_t amount=0); - - bool flush(Object *o); - off_t release(Object *o); - void purge(Object *o); - - void rdlock(Object *o); - void rdunlock(Object *o); - void wrlock(Object *o); - void wrunlock(Object *o); - - public: - void bh_read_finish(object_t oid, off_t offset, size_t length, bufferlist &bl); - void bh_write_ack(object_t oid, off_t offset, size_t length, tid_t t); - void bh_write_commit(object_t oid, off_t offset, size_t length, tid_t t); - void 
lock_ack(list& oids, tid_t tid); - - class C_ReadFinish : public Context { - ObjectCacher *oc; - object_t oid; - off_t start; - size_t length; - public: - bufferlist bl; - C_ReadFinish(ObjectCacher *c, object_t o, off_t s, size_t l) : oc(c), oid(o), start(s), length(l) {} - void finish(int r) { - oc->bh_read_finish(oid, start, length, bl); - } - }; - - class C_WriteAck : public Context { - ObjectCacher *oc; - object_t oid; - off_t start; - size_t length; - public: - tid_t tid; - C_WriteAck(ObjectCacher *c, object_t o, off_t s, size_t l) : oc(c), oid(o), start(s), length(l) {} - void finish(int r) { - oc->bh_write_ack(oid, start, length, tid); - } - }; - class C_WriteCommit : public Context { - ObjectCacher *oc; - object_t oid; - off_t start; - size_t length; - public: - tid_t tid; - C_WriteCommit(ObjectCacher *c, object_t o, off_t s, size_t l) : oc(c), oid(o), start(s), length(l) {} - void finish(int r) { - oc->bh_write_commit(oid, start, length, tid); - } - }; - - class C_LockAck : public Context { - ObjectCacher *oc; - public: - list oids; - tid_t tid; - C_LockAck(ObjectCacher *c, object_t o) : oc(c) { - oids.push_back(o); - } - void finish(int r) { - oc->lock_ack(oids, tid); - } - }; - - - - public: - ObjectCacher(Objecter *o, Mutex& l) : - objecter(o), filer(o), lock(l), - flusher_stop(false), flusher_thread(this), - stat_waiter(0), - stat_clean(0), stat_dirty(0), stat_rx(0), stat_tx(0), stat_missing(0) { - flusher_thread.create(); - } - ~ObjectCacher() { - // we should be empty. - assert(objects.empty()); - assert(lru_rest.lru_get_size() == 0); - assert(lru_dirty.lru_get_size() == 0); - assert(dirty_bh.empty()); - - assert(flusher_thread.is_started()); - lock.Lock(); // hmm.. watch out for deadlock! 
- flusher_stop = true; - flusher_cond.Signal(); - lock.Unlock(); - flusher_thread.join(); - } - - - class C_RetryRead : public Context { - ObjectCacher *oc; - Objecter::OSDRead *rd; - inodeno_t ino; - Context *onfinish; - public: - C_RetryRead(ObjectCacher *_oc, Objecter::OSDRead *r, inodeno_t i, Context *c) : oc(_oc), rd(r), ino(i), onfinish(c) {} - void finish(int) { - int r = oc->readx(rd, ino, onfinish); - if (r > 0) { - onfinish->finish(r); - delete onfinish; - } - } - }; - - // non-blocking. async. - int readx(Objecter::OSDRead *rd, inodeno_t ino, Context *onfinish); - int writex(Objecter::OSDWrite *wr, inodeno_t ino); - - // write blocking - void wait_for_write(size_t len, Mutex& lock); - - // blocking. atomic+sync. - int atomic_sync_readx(Objecter::OSDRead *rd, inodeno_t ino, Mutex& lock); - int atomic_sync_writex(Objecter::OSDWrite *wr, inodeno_t ino, Mutex& lock); - - bool set_is_cached(inodeno_t ino); - bool set_is_dirty_or_committing(inodeno_t ino); - - bool flush_set(inodeno_t ino, Context *onfinish=0); - void flush_all(Context *onfinish=0); - - bool commit_set(inodeno_t ino, Context *oncommit); - void commit_all(Context *oncommit=0); - - void purge_set(inodeno_t ino); - - off_t release_set(inodeno_t ino); // returns # of bytes not released (ie non-clean) - - void truncate_set(inodeno_t ino, list& ex); - - void kick_sync_writers(inodeno_t ino); - void kick_sync_readers(inodeno_t ino); - - - // file functions - - /*** async+caching (non-blocking) file interface ***/ - int file_read(inode_t& inode, - off_t offset, size_t len, - bufferlist *bl, - Context *onfinish) { - Objecter::OSDRead *rd = new Objecter::OSDRead(bl); - filer.file_to_extents(inode, offset, len, rd->extents); - return readx(rd, inode.ino, onfinish); - } - - int file_write(inode_t& inode, - off_t offset, size_t len, - bufferlist& bl, - objectrev_t rev=0) { - Objecter::OSDWrite *wr = new Objecter::OSDWrite(bl); - filer.file_to_extents(inode, offset, len, wr->extents); - return writex(wr, 
inode.ino); - } - - - - /*** sync+blocking file interface ***/ - - int file_atomic_sync_read(inode_t& inode, - off_t offset, size_t len, - bufferlist *bl, - Mutex &lock) { - Objecter::OSDRead *rd = new Objecter::OSDRead(bl); - filer.file_to_extents(inode, offset, len, rd->extents); - return atomic_sync_readx(rd, inode.ino, lock); - } - - int file_atomic_sync_write(inode_t& inode, - off_t offset, size_t len, - bufferlist& bl, - Mutex &lock, - objectrev_t rev=0) { - Objecter::OSDWrite *wr = new Objecter::OSDWrite(bl); - filer.file_to_extents(inode, offset, len, wr->extents); - return atomic_sync_writex(wr, inode.ino, lock); - } - -}; - - -inline ostream& operator<<(ostream& out, ObjectCacher::BufferHead &bh) -{ - out << "bh[" - << bh.start() << "~" << bh.length() - << " (" << bh.bl.length() << ")" - << " v " << bh.last_write_tid; - if (bh.is_tx()) out << " tx"; - if (bh.is_rx()) out << " rx"; - if (bh.is_dirty()) out << " dirty"; - if (bh.is_clean()) out << " clean"; - if (bh.is_missing()) out << " missing"; - if (bh.bl.length() > 0) out << " firstbyte=" << (int)bh.bl[0]; - out << "]"; - return out; -} - -inline ostream& operator<<(ostream& out, ObjectCacher::Object &ob) -{ - out << "object[" - << hex << ob.get_oid() << " ino " << ob.get_ino() << dec - << " wr " << ob.last_write_tid << "/" << ob.last_ack_tid << "/" << ob.last_commit_tid; - - switch (ob.lock_state) { - case ObjectCacher::Object::LOCK_WRLOCKING: out << " wrlocking"; break; - case ObjectCacher::Object::LOCK_WRLOCK: out << " wrlock"; break; - case ObjectCacher::Object::LOCK_WRUNLOCKING: out << " wrunlocking"; break; - case ObjectCacher::Object::LOCK_RDLOCKING: out << " rdlocking"; break; - case ObjectCacher::Object::LOCK_RDLOCK: out << " rdlock"; break; - case ObjectCacher::Object::LOCK_RDUNLOCKING: out << " rdunlocking"; break; - } - - out << "]"; - return out; -} - -#endif diff --git a/trunk/ceph/script/add_header.pl b/trunk/ceph/script/add_header.pl deleted file mode 100755 index 
023c06e455fd1..0000000000000 --- a/trunk/ceph/script/add_header.pl +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/perl - -use strict; -my $fn = shift @ARGV; -my $old = `cat $fn`; - -my $header = `cat doc/header.txt`; - -# strip existing header -my $new = $old; -if ($new =~ /^(.*)\* Ceph - scalable distributed file system/s) { - my ($a,@b) = split(/\*\/\n/, $new); - $new = join("*/\n",@b); -} -$new = $header . $new; - -if ($new ne $old) { - open(O, ">$fn.new"); - print O $new; - close O; - system "diff $fn $fn.new"; - rename "$fn.new", $fn; - #unlink "$fn.new"; - -} - diff --git a/trunk/ceph/script/adjusttabs.pl b/trunk/ceph/script/adjusttabs.pl deleted file mode 100755 index 66edff2ac6c02..0000000000000 --- a/trunk/ceph/script/adjusttabs.pl +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/perl - -my $tablen = shift @ARGV; -my $fn = shift @ARGV; - -my $tab = ' ' x $tablen; -open(I, $fn); -my $f; -my $oldtab = ' ' x 4; -while () { - if (my ($oldlen) = /\-\*\- .*tab-width:(\d)/) { - print "old length was $oldlen\n"; - $oldtab = ' ' x $oldlen; - s/tab-width:\d/tab-width:$tablen/; - } - s/\t/$oldtab/g; - $f .= $_; -} -close I; -open(O, ">$fn.new"); -print O $f; -close O; - -rename "$fn.new", $fn; diff --git a/trunk/ceph/script/check_cache_dumps.pl b/trunk/ceph/script/check_cache_dumps.pl deleted file mode 100755 index 95bd28a474991..0000000000000 --- a/trunk/ceph/script/check_cache_dumps.pl +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/perl - -my $epoch = shift || die "specify epoch"; - -my %auth; # mds -> id -> replica -> nonce -my %replica; # mds -> id -> auth -> nonce - -print "reading\n"; -for (my $i=0; -e "cachedump.$epoch.mds$i"; $i++) { - open(O,"cachedump.$epoch.mds$i"); - while () { - my ($name,$s); - ($name,$s) = /^\[(inode \d+) \S+ (\S+)/; - ($name,$s) = /^\[(dir \d+) \S+ (\S+)/ unless $name; - ($name,$s) = /^\[dentry (\S+) (\S+)/ unless $name; - if ($name) { - if ($s =~ /^auth/) { - $auth{$i}->{$name} = {}; - my ($rl) = $s =~ /\{(.*)\}/; - for my $r (split(/,/,$rl)) { - 
my ($who,$nonce) = $r =~ /(\d+)\=(\d+)/; - $auth{$i}->{$name}->{$who} = $nonce; - #print "auth $name rep by $who $nonce $s\n"; - } - } - else { - my ($a,$b,$n) = $s =~ /rep@(\d+)\,([\-\d]+)\.(\d+)/; - die $_ unless $a >= 0; - $replica{$i}->{$name}->{$a} = $n; - if ($b >= 0) { - $replica{$i}->{$name}->{$b} = $n; - } - } - } - } -} - -print "verifying replicas\n"; -for my $mds (keys %replica) { - for my $name (keys %{$replica{$mds}}) { - for my $auth (keys %{$replica{$mds}->{$name}}) { - if ($auth{$auth}->{$name}->{$mds}) { - if ($auth{$auth}->{$name}->{$mds} < $replica{$mds}->{$name}->{$auth}) { - print "problem: mds$mds has $name from mds$auth nonce $replica{$mds}->{$name}->{$auth}, auth has nonce $auth{$auth}->{$name}->{$mds}\n"; - } else { - print "ok: mds$mds has $name from mds$auth nonce $replica{$mds}->{$name}->{$auth}, auth has nonce $auth{$auth}->{$name}->{$mds}\n"; - } - } else { - print "??: mds$mds has $name from mds$auth nonce $replica{$mds}->{$name}->{$auth}, auth has no nonce\n"; - } - - } - } -} - - diff --git a/trunk/ceph/script/clean_osd_cow.sh b/trunk/ceph/script/clean_osd_cow.sh deleted file mode 100755 index 1e443c95e7ebc..0000000000000 --- a/trunk/ceph/script/clean_osd_cow.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh - -rm osddata/*/*\.* diff --git a/trunk/ceph/script/clean_trace.pl b/trunk/ceph/script/clean_trace.pl deleted file mode 100755 index cb02ff7abe7c2..0000000000000 --- a/trunk/ceph/script/clean_trace.pl +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/perl - -my $n = 0; -while (<>) { - next unless /trace: /; - my $l = $'; $'; - print $l; -} diff --git a/trunk/ceph/script/comb.pl b/trunk/ceph/script/comb.pl deleted file mode 100755 index 1a0d4dcbe6c07..0000000000000 --- a/trunk/ceph/script/comb.pl +++ /dev/null @@ -1,113 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my $xaxis = shift @ARGV; -my @vars; -while (@ARGV) { - $_ = shift @ARGV; - last if ($_ eq '-'); - push(@vars, $_); -} -my @dirs; -while (@ARGV) { - $_ = shift @ARGV; - last if ($_ eq 
'-'); - push(@dirs, $_) if -d $_; -} -my @filt = @ARGV; -push( @filt, '.' ) unless @filt; - -print "#xaxis $xaxis -#vars @vars -#dirs @dirs -#filt @filt -"; - -sub load_sum { - my $fn = shift @_; - - open(I, "$fn"); - my $k = ; - chomp($k); - my @k = split(/\s+/,$k); - shift @k; - - my $s; - while () { - chomp; - s/^\#//; - next unless $_; - my @l = split(/\s+/,$_); - my $k = shift @l; - for my $f (@k) { - $s->{$k}->{$f} = shift @l; - } - - # clnode latency? - if ($fn =~ /cl/) { - $s->{$k}->{'wrlat'} = $s->{$k}->{'wrlsum'} / $s->{$k}->{'wrlnum'} if $s->{$k}->{'wrlnum'} > 0; - $s->{$k}->{'rlat'} = $s->{$k}->{'rlsum'} / $s->{$k}->{'rlnum'} if $s->{$k}->{'rlnum'} > 0; - $s->{$k}->{'lat'} = $s->{$k}->{'lsum'} / $s->{$k}->{'lnum'} if $s->{$k}->{'lnum'} > 0; - $s->{$k}->{'latw'} = $s->{$k}->{'lwsum'} / $s->{$k}->{'lwnum'} if $s->{$k}->{'lwnum'} > 0; - $s->{$k}->{'latr'} = $s->{$k}->{'lrsum'} / $s->{$k}->{'lrnum'} if $s->{$k}->{'lrnum'} > 0; - $s->{$k}->{'statlat'} = $s->{$k}->{'lstatsum'} / $s->{$k}->{'lstatnum'} if $s->{$k}->{'lstatnum'} > 0; - $s->{$k}->{'dirlat'} = $s->{$k}->{'ldirsum'} / $s->{$k}->{'ldirnum'} if $s->{$k}->{'ldirnum'} > 0; - } - } - return $s; -} - - -my %res; -my @key; -my %didkey; -for my $f (@filt) { - my @reg = split(/,/, $f); - #print "reg @reg\n"; - for my $d (@dirs) { - if ($f ne '.') { - my $r = (split(/\//,$d))[-1]; - my @db = split(/,/, $r); - #print "db @db\n"; - my $ok = 1; - for my $r (@reg) { - - $ok = 0 unless grep {$_ eq $r} @db; - } - next unless $ok; - } - #next if ($f ne '.' && $d !~ /$reg/); - #print "$d\n"; - my ($x) = $d =~ /$xaxis=([\d\.]+)/; - - for my $v (@vars) { - my ($what, $field) = $v =~ /^(.+)\.([^\.]+)$/; - #print "$what $field .. $v .. 
$f.$field\n"; - my $s = &load_sum("$d/sum.$what"); - - #print "\t$v"; - if ($field =~ /^sum=/) { - #warn "SUM field $field\n"; - push( @{$res{$x}}, $s->{'sum'}->{$'} ); #'}); - } else { - #warn "avg field $field\n"; - push( @{$res{$x}}, $s->{'avgval'}->{$field} ); - } - - push( @key, "$f.$field" ) unless $didkey{"$f.$field"}; - $didkey{"$f.$field"} = 1; - - if (0 && exists $s->{'avgvaldevt'}) { - push( @{$res{$x}}, $s->{'avgvaldevt'}->{$field} ); - push( @key, "$f.$field.dev" ) unless $didkey{"$f.$field.dev"}; - $didkey{"$f.$field.dev"} = 1; - } - } - } -} - -print join("\t", "#", @key) . "\n"; -for my $x (sort {$a <=> $b} keys %res) { - print join("\t", $x, @{$res{$x}}) . "\n"; -} diff --git a/trunk/ceph/script/convert_soe_trace.pl b/trunk/ceph/script/convert_soe_trace.pl deleted file mode 100755 index a6ec80312d0fe..0000000000000 --- a/trunk/ceph/script/convert_soe_trace.pl +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/perl - -# this reads in one of kristal's anonymized static traces from -# soe and makes it look like output from -# -# find . -exec ls -dilsn --time-style=+%s \{\} \; -# -# (which is what SyntheticClient likes to "import", and -# study_static.pl likes to analyze for hardlinks, dirsizes, etc.) - -while (<>) { - chomp; - my ($file, $ino, $size, $actime, $ctime, $mtime, $uid, $gid, $omode, $nlink) = split(/ /,substr($_,1)); - $file = '.' . $file; - my $nmode = oct($omode); - my $mode = '-...'; - $mode = 'd...' if (($nmode & 0170000) == 0040000); - $mode = 'f...' if (($nmode & 0170000) == 0100000); - $size = hex($size); - $mtime = hex($mtime); - $uid = hex($uid); - $gid = hex($gid); - print "$ino ? $mode ? $nlink $uid $gid $size $mtime $file\n"; -} - -__END__ - -soe format is -0. a space -1. full path of file name (MD5-ed and in base 64) -2. inode number -3. size of file in bytes (hex) -4. atime (hex) -5. ctime (hex) -6. mtime (hex) -7. uid (hex) -8. gid (hex) -9. mode (octal) -10. 
number of links diff --git a/trunk/ceph/script/find_auth_pins.pl b/trunk/ceph/script/find_auth_pins.pl deleted file mode 100755 index d37fb109a48da..0000000000000 --- a/trunk/ceph/script/find_auth_pins.pl +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/perl - -my %pin; -my %hist; -my $l = 1; -my @pins; -while (<>) { - - #cdir:adjust_nested_auth_pins on [dir 163 /foo/ rep@13 | child] count now 0 + 1 - - if (/adjust_nested_auth_pins/) { - my ($what) = / (\w+)\]/; - $what =~ s/ 0x/ /; - $hist{$what} .= "$l: $_" - if defined $pin{$what}; - } - - # cinode:auth_pin on inode [1000000002625 /gnu/blah_client_created. 0x89b7700] count now 1 + 0 - - elsif (/auth_pin / && !/waiting/) { - #my ($what) = /\[(\w+ \w+) /; - my ($what) = / (\w+)\]/; - $what =~ s/ 0x/ /; - #print "$_ add_waiter $c $what\n"; - $pin{$what}++; - $hist{$what} .= "$l: $_"; - push( @pins, $what ) unless grep {$_ eq $what} @pins; - } - - # cinode:auth_unpin on inode [1000000002625 (dangling) 0x89b7700] count now 0 + 0 - - elsif (/auth_unpin/) { - #my ($what) = /\[(\w+ \w+) /;# / on (.*\])/; - my ($what) = / (\w+)\]/; - $what =~ s/ 0x/ /; - $pin{$what}--; - $hist{$what} .= "$l: $_"; - unless ($pin{$what}) { - delete $hist{$what}; - delete $pin{$what}; - @pins = grep {$_ ne $what} @pins; - } - } - $l++; -} - -for my $what (@pins) { - print "---- count $pin{$what} on $what -$hist{$what} -"; -} diff --git a/trunk/ceph/script/find_bufferleaks.pl b/trunk/ceph/script/find_bufferleaks.pl deleted file mode 100755 index 152515d5e788e..0000000000000 --- a/trunk/ceph/script/find_bufferleaks.pl +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/perl - -use strict; -my %buffers; -my %bufferlists; -my %ref; -my %mal; -my $l = 1; -while (<>) { - #print "$l: $_"; - - # cinode:auth_pin on inode [1000000002625 /gnu/blah_client_created. 
0x89b7700] count now 1 + 0 - - if (/^buffer\.cons /) { - my ($x) = /(0x\S+)/; - $buffers{$x} = 1; - } - if (/^buffer\.des /) { - my ($x) = /(0x\S+)/; - die "des without cons at $l: $_" unless $buffers{$x}; - delete $buffers{$x}; - die "des with ref>0 at $l: $_" unless $ref{$x} == 0; - delete $ref{$x}; - } - - if (/^bufferlist\.cons /) { - my ($x) = /(0x\S+)/; - $bufferlists{$x} = 1; - } - if (/^bufferlist\.des /) { - my ($x) = /(0x\S+)/; - warn "des without cons at $l: $_" unless $bufferlists{$x}; - delete $bufferlists{$x}; - } - - - if (/^buffer\.malloc /) { - my ($x) = /(0x\S+)/; - $mal{$x} = 1; - } - if (/^buffer\.free /) { - my ($x) = /(0x\S+)/; - die "free with malloc at $l: $_" unless $mal{$x}; - delete $mal{$x}; - } - - if (/^buffer\.get /) { - my ($x) = /(0x\S+)/; - $ref{$x}++; - } - if (/^buffer\.get /) { - my ($x) = /(0x\S+)/; - $ref{$x}--; - } - -$l++; -} - -for my $x (keys %bufferlists) { - print "leaked bufferlist $x\n"; -} - -for my $x (keys %buffers) { - print "leaked buffer $x ref $ref{$x}\n"; -} - -for my $x (keys %mal) { - print "leaked buffer dataptr $x ref $ref{$x}\n"; -} diff --git a/trunk/ceph/script/find_lost_bdev_ops.pl b/trunk/ceph/script/find_lost_bdev_ops.pl deleted file mode 100755 index ac1793b42dfac..0000000000000 --- a/trunk/ceph/script/find_lost_bdev_ops.pl +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/perl - -use strict; -my %op; - -my $line = 0; -while (<>) { - #print $line . 
$_ if /0x8d4f6a0/; - chomp; - $line++; - - #bdev(./ebofsdev/0)._submit_io bio(wr 269~1 usemap 0x4de33cc0) - if (my ($bio) = /_submit_io bio\(.*(0x\w+)\)/) { - $op{$bio} = $line; - } - - # cancel - #bdev(./ebofsdev/3)._cancel_io bio(wr 1525~1 bh_write 0x8a437b8) - if (my ($bio) = /_cancel_io bio\(.*(0x\w+)\)/ && - !(/FAILED/)) { - delete $op{$bio}; - } - - # finish - #bdev(./ebofsdev/3).complete_thread finishing bio(wr 1131~1 write_cnode 0x832c1f8) - if (my ($bio) = /complete_thread finishing bio\(.*(0x\w+)\)/) { - delete $op{$bio}; - } - -} - -for my $bio (keys %op) { - print "---- lost bio $bio\n"; -} diff --git a/trunk/ceph/script/find_lost_commit.pl b/trunk/ceph/script/find_lost_commit.pl deleted file mode 100755 index 73934248ad5c0..0000000000000 --- a/trunk/ceph/script/find_lost_commit.pl +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/perl - -use strict; -my %op; - -my $line = 0; -while (<>) { - #print "$line: $_"; - $line++; - - #osd3 do_op MOSDOp(client0.933 oid 100000000000008 0x84b4480) in pg[pginfo(4020000000d v 5662/0 e 2/1) r=0 active (0,5662]] - if (my ($from, $opno, $oid, $op) = /do_op MOSDOp\((\S+) op (\d+) oid (\d+) (\w+)\)/) { -# print "$op\n"; - if ($opno == 2 || $opno == 11 || $opno == 12 || $opno == 14 || $opno == 15) { - $op{$op} = $from; - } - } - - # commits - #osd1 op_modify_commit on op MOSDOp(client1.289 oid 100000100000002 0x51a2f788) - if (my ($op) = /op_modify_commit.* (\w+)\)/) { - delete $op{$op}; - } - #osd4 rep_modify_commit on op MOSDOp(osd3.289 oid 100000000000008 0x84b0980) - if (my ($op) = /rep_modify_commit.* (\w+)\)/) { - delete $op{$op}; - } - - # forwarded? 
- if (my ($op) = /sending (\w+) to osd/) { - delete $op{$op}; - } - -} - -for my $op (keys %op) { - print "---- lost op $op $op{$op}\n"; -} diff --git a/trunk/ceph/script/find_lost_objecter.pl b/trunk/ceph/script/find_lost_objecter.pl deleted file mode 100755 index a0c2089140e23..0000000000000 --- a/trunk/ceph/script/find_lost_objecter.pl +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/perl - -use strict; -my %ack; -my %commit; - -my $line = 0; -while (<>) { - #print "$line: $_"; - $line++; - - #client0.objecter writex_submit tid 21 osd0 oid 100000000000001 851424~100000 - if (my ($who, $tid) = /(\S+)\.objecter writex_submit tid\D+(\d+)\D+osd/) { -# print "$who.$tid\n"; - $ack{"$who.$tid"} = $line; - $commit{"$who.$tid"} = $line; - } - - #client1.objecter handle_osd_write_reply 304 commit 0 - #client1.objecter handle_osd_write_reply 777 commit 1 - if (my ($who, $tid, $commit) = /(\S+)\.objecter handle_osd_write_reply\D+(\d+)\D+commit\D+(\d)/) { -# print "$who.$tid\n"; - delete $ack{"$who.$tid"}; - delete $commit{"$who.$tid"} if $commit; - } - -} - -for my $op (keys %commit) { - print "---- lost commit $op $commit{$op}\n"; -} -for my $op (keys %ack) { - print "---- lost ack $op $commit{$op}\n"; -} diff --git a/trunk/ceph/script/find_pathpins.pl b/trunk/ceph/script/find_pathpins.pl deleted file mode 100755 index e4a7d81dfb7b7..0000000000000 --- a/trunk/ceph/script/find_pathpins.pl +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/perl - -my %pin; -my %hist; -my $l = 1; -my @pins; -while (<>) { - - # cinode:auth_pin on inode [1000000002625 /gnu/blah_client_created. 
0x89b7700] count now 1 + 0 - - if (/path_pinned /) { - my ($dname, $dir) = /\[dentry (\S+) .* in \[dir (\d+) /; - $what = "$dname $dir"; - #print "$l pin $what\n"; - $pin{$what}++; - $hist{$what} .= "$l: $_"; - push( @pins, $what ) unless grep {$_ eq $what} @pins; - } - - # cinode:auth_unpin on inode [1000000002625 (dangling) 0x89b7700] count now 0 + 0 - - if (/path_unpinned/) { - my ($dname, $dir) = /\[dentry (\S+) .* in \[dir (\d+) /; - $what = "$dname $dir"; - #print "$l unpin $what\n"; - $pin{$what}--; - $hist{$what} .= "$l: $_"; - unless ($pin{$what}) { - delete $hist{$what}; - delete $pin{$what}; - @pins = grep {$_ ne $what} @pins; - } - } - $l++; -} - -for my $what (@pins) { - print "---- count $pin{$what} on $what -$hist{$what} -"; -} diff --git a/trunk/ceph/script/find_requests.pl b/trunk/ceph/script/find_requests.pl deleted file mode 100755 index 5144896249413..0000000000000 --- a/trunk/ceph/script/find_requests.pl +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/perl - -my %waiting; # context => what where what is "inode ..." or "dir ..." -my %hist; # context => history since waited -my @waiting; - -my $line = 0; -while (<>) { - - #print $line . 
$_ if /0x8d4f6a0/; - $line++; - if (/request_start/) { - my ($c) = /(0x\w+)/; - my ($what) = $'; #'; - chomp $what; - #print "$line add_waiter $c $what\n" if /0x8d4f6a0/; - $waiting{$c} = $what - if $what && !$waiting{$c}; - $hist{$c} .= "$line: $_"; - unless (grep {$_ eq $c} @waiting) { - push( @waiting, $c ); - } - } - #if (/finish_waiting/) { - # my ($c) = /(0x\w+)/; - # $hist{$c} .= "$line: $_"; - #} - if (/request_finish/ || - /request_forward/) { - my ($c) = /(0x\w+)/; - #print "took\n" if /0x8d4f6a0/; - delete $waiting{$c}; - delete $hist{$c}; - @waiting = grep {$_ ne $c} @waiting; - } -} - -for my $c (@waiting) { - print "---- lost request $c $waiting{$c} -$hist{$c} -"; -} diff --git a/trunk/ceph/script/find_waiters.pl b/trunk/ceph/script/find_waiters.pl deleted file mode 100755 index c89d2b1a49db7..0000000000000 --- a/trunk/ceph/script/find_waiters.pl +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/perl - -my %waiting; # context => what where what is "inode ..." or "dir ..." -my %hist; # context => history since waited -my @waiting; - -my $line = 0; -while (<>) { - #print $line . 
$_ if /0x8d4f6a0/; - $line++; - if (/add_waiter/) { - my ($c) = /(0x\w+)/; - my ($what) = / on (.*\])/; - #print "$line add_waiter $c $what\n" if /0x8d4f6a0/; - $waiting{$c} = $what - if $what && !$waiting{$c}; - $hist{$c} .= "$line: $_"; - unless (grep {$_ eq $c} @waiting) { - push( @waiting, $c ); - } - } - #if (/finish_waiting/) { - # my ($c) = /(0x\w+)/; - # $hist{$c} .= "$line: $_"; - #} - if (/take_waiting/) { - my ($c) = /(0x\w+)/; - if (/SKIPPING/) { - #print "skipping\n" if /0x8d4f6a0/; - $hist{$c} .= "$line: $_"; - } elsif (/took/) { - #print "took\n" if /0x8d4f6a0/; - delete $waiting{$c}; - delete $hist{$c}; - @waiting = grep {$_ ne $c} @waiting; - } else { - die "i don't understand: $_"; - } - } -} - -for my $c (@waiting) { - print "---- lost waiter $c $waiting{$c} -$hist{$c} -"; -} diff --git a/trunk/ceph/script/fix_modeline.pl b/trunk/ceph/script/fix_modeline.pl deleted file mode 100755 index 8eadde9b54e56..0000000000000 --- a/trunk/ceph/script/fix_modeline.pl +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/perl - -use strict; -my $fn = shift @ARGV; -my $old = `cat $fn`; -my $header = `cat doc/modeline.txt`; - -# strip existing modeline -my $new = $old; -$new =~ s/^\/\/ \-\*\- ([^\n]+) \-\*\-([^\n]*)\n//s; # emacs -$new =~ s/^\/\/ vim: ([^\n]*)\n//s; # vim; -$new =~ s/^\/\/ \-\*\- ([^\n]+) \-\*\-([^\n]*)\n//s; # emacs -$new =~ s/^\/\/ vim: ([^\n]*)\n//s; # vim; -$new =~ s/^\/\/ \-\*\- ([^\n]+) \-\*\-([^\n]*)\n//s; # emacs -$new =~ s/^\/\/ vim: ([^\n]*)\n//s; # vim; - -# add correct header -$new = $header . 
$new; - -if ($new ne $old) { - print "$fn\n"; - open(O, ">$fn.new"); - print O $new; - close O; - system "diff $fn $fn.new"; - rename "$fn.new", $fn; - #unlink "$fn.new"; -} - diff --git a/trunk/ceph/script/gprofnewsyn b/trunk/ceph/script/gprofnewsyn deleted file mode 100755 index 5d352e4e9e52c..0000000000000 --- a/trunk/ceph/script/gprofnewsyn +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/perl - -my @ranks = @ARGV; -unless (@ranks) { - @ranks = split(/\n/,`/bin/ls gmon`); -} -print "will do @ranks\n"; -for my $r (@ranks) { - print "$r\n"; - system "test -e gmon.out && rm gmon.out ; ln -f gmon/$r/gmon.out ; gprof newsyn > gmon/$r/o"; -} - diff --git a/trunk/ceph/script/grepblock b/trunk/ceph/script/grepblock deleted file mode 100755 index f5acf95732abb..0000000000000 --- a/trunk/ceph/script/grepblock +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my $block = shift ARGV; -die unless int $block; - -while (<>) { - my $yes = 0; - for my $x (/(\d+\~\d+)/) { - my ($s,$l) = split(/\~/,$x); - $yes = 1 if ($block >= $s && $block < $s+$l); - } - print if $yes; -} diff --git a/trunk/ceph/script/merge_cdfs.pl b/trunk/ceph/script/merge_cdfs.pl deleted file mode 100755 index 98c22764fc8b3..0000000000000 --- a/trunk/ceph/script/merge_cdfs.pl +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bin/perl - -my %rows; # val -> [ count1, count2, ... ] - -my $filen = 0; -for my $file (@ARGV) { - open(I,"$file"); - while () { - next if /^\#/; - chomp; - my ($v, $c) = split(/\t/,$_); - $rows{$v}->[$filen] = $c; - } - $filen++; -} - -for my $v (sort {$a <=> $b} keys %rows) { - print "$v"; - for (my $i=0; $i < $filen; $i++) { - print "\t" . int($rows{$v}->[$i]); - } - print "\n"; - #print join("\t", $v, @{$rows{$v}}) . 
"\n"; -} diff --git a/trunk/ceph/script/merge_trace_rw.pl b/trunk/ceph/script/merge_trace_rw.pl deleted file mode 100644 index 378d629ef43f6..0000000000000 --- a/trunk/ceph/script/merge_trace_rw.pl +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my @file = <>; -sub get_op { - my @op = shift @file; - while (@file && - $file[0] !~ /^[a-z]+$/) { - push( @op, shift @file ); - } - #print "op = ( @op )\n"; - return @op; -} - -my $n = 0; -while (@file) { - my ($op, @args) = &get_op; - while ($op eq "read\n" || - $op eq "write\n") { - die unless scalar(@args) == 3; - my ($nop, @nargs) = &get_op; - if ($nop eq $op - && ($args[0] == $nargs[0] ) - && ($args[2] + $args[1] == $nargs[2]) - ) { - die unless scalar(@nargs) == 3; - $args[1] += $nargs[1]; - $args[1] .= "\n"; - die unless scalar(@args) == 3; - #print STDOUT "combining $n $op @args\n"; - $n++; - } else { -# print STDERR "not combinging\n"; - unshift( @file, $nop, @nargs ); - die unless scalar(@args) == 3; - last; - } - } - print $op; - print join('', @args); -} diff --git a/trunk/ceph/script/plot.pl b/trunk/ceph/script/plot.pl deleted file mode 100755 index 2d4e3002bbd4d..0000000000000 --- a/trunk/ceph/script/plot.pl +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my $dir = shift @ARGV; -my ($type,$subtype) = split(/\./, shift @ARGV); -$subtype = '.' . $subtype if $subtype; - -# list files -my @files; -my %fields; -for my $f (`ls $dir/$type*$subtype`) { - chomp $f; - next unless $f =~ /$type(\d+)$subtype$/; - push(@files, $f); - unless (%fields) { - open(I,$f); - while () { - next unless /^\#/; - my @f = split(/\t/,$_); - for (my $n=1; @f; $n++) { - my $f = shift @f; - $fields{$f} = $n; - #print "$f = $n\n"; - } - last; - } - close I; - } -} -#print "#files @files\n"; - -# get field names -my $var = shift @ARGV; -my $rest = join(' ', @ARGV); - -print "set style data lines\nset grid\n"; -print "set title \"$dir .. 
$var\"\n"; -if (scalar(@files) > 30) { print "set key off\n"; } -#for my $var (@ARGV) { - my @p; - for my $f (@files) { - my ($lastbit) = $f =~ /\/([^\/]+)$/; - push(@p, "\"$f\" u 1:$fields{$var} $rest t \"$lastbit\""); - } - print "plot " . join(',', @p) . "\n"; -#} -print "pause 60000\n"; diff --git a/trunk/ceph/script/profonly.pl b/trunk/ceph/script/profonly.pl deleted file mode 100755 index 6a05dec473ca0..0000000000000 --- a/trunk/ceph/script/profonly.pl +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/perl - -my $rank = shift @ARGV; -my $args = join(' ',@ARGV); -if ($rank == $ENV{MPD_JRANK}) { - $c = "LD_PRELOAD=$ENV{'HOME'}/csl/obsd/src/pmds/gprof-helper.so ./newsyn $args"; -} else { - $c = "./newsyn.nopg $args"; -} - -#print "$rank: $c\n"; -system $c; diff --git a/trunk/ceph/script/runjob.pl b/trunk/ceph/script/runjob.pl deleted file mode 100755 index c432675d33830..0000000000000 --- a/trunk/ceph/script/runjob.pl +++ /dev/null @@ -1,341 +0,0 @@ -#!/usr/bin/perl - -use strict; -use Data::Dumper; - - -my $usage = "script/runset.pl [--clean] jobs/some/job blah\n"; - -my $clean; -my $use_srun = 0; -my $nobg = '&'; -my $in = shift || die $usage; -if ($in eq '--clean') { - $clean = 1; - $in = shift || die $usage; -} -if ($in eq '--srun') { - $use_srun = 1; - $in = shift || die $usage; -} -if ($in eq '--nobg') { - $nobg = ''; - $in = shift || die $usage; -} -my $tag = shift || die $usage; -my $fake = shift; - - -my ($job) = $in =~ /^jobs\/(.*)/; -my ($jname) = $job =~ /\/(\w+)$/; -$jname ||= $job; -die "not jobs/?" 
unless defined $job; -my $out = "log/$job.$tag"; -my $relout = "$job.$tag"; - - -my $cwd = `/bin/pwd`; -chomp($cwd); - - - -print "# --- job $job, tag $tag ---\n"; - - -# get input -my $raw = `cat $in`; -my $sim = eval $raw; -unless (ref $sim) { - print "bad input: $in\n"; - system "perl -c $in"; - exit 1; -} - -# prep output -system "mkdir -p $out" unless -d "$out"; - -open(W, ">$out/in"); -print W $raw; -close W; - -my $comb = $sim->{'_comb'}; -delete $sim->{'_comb'}; -my %filters; -my @fulldirs; - - - -sub reset { - print "reset: restarting mpd in 3 seconds\n"; - system "sleep 3 && (mpiexec -l -n 32 killall newsyn ; restartmpd.sh)"; - print "reset: done\n"; -} - - -if (`hostname` =~ /alc/ && !$use_srun) { - print "# this looks like alc\n"; - $sim->{'_psub'} = 'jobs/alc.tp'; -} - - -sub iterate { - my $sim = shift @_; - my $fix = shift @_ || {}; - my $vary; - my @r; - - my $this; - for my $k (sort keys %$sim) { - #next if $k =~ /^_/; - if (defined $fix->{$k}) { - $this->{$k} = $fix->{$k}; - } - elsif (ref $sim->{$k} eq 'HASH') { - # nothing - } - elsif ($k =~ /^_/ || !(ref $sim->{$k})) { - $this->{$k} = $sim->{$k}; - } - else { - #print ref $sim->{$k}; - if (!(defined $vary)) { - $vary = $k; - } - } - } - - if ($vary) { - #print "vary $vary\n"; - for my $v (@{$sim->{$vary}}) { - $this->{$vary} = $v; - push(@r, &iterate($sim, $this)); - } - } else { - - if ($sim->{'_dep'}) { - my @s = @{$sim->{'_dep'}}; - while (@s) { - my $dv = shift @s; - my $eq = shift @s; - - $eq =~ s/\$(\w+)/"\$this->{'$1'}"/eg; - $this->{$dv} = eval $eq; - #print "$dv : $eq -> $this->{$dv}\n"; - } - } - - push(@r, $this); - } - return @r; -} - - - -sub run { - my $h = shift @_; - - my @fn; - my @filt; - my @vals; - for my $k (sort keys %$sim) { - next if $k =~ /^_/; - next unless ref $sim->{$k} eq 'ARRAY'; - push(@fn, "$k=$h->{$k}"); - push(@vals, $h->{$k}); - next if $comb && $k eq $comb->{'x'}; - push(@filt, "$k=$h->{$k}"); - } - my $keys = join(",", @fn); - $keys =~ s/ /_/g; - my $fn = 
$out . '/' . $keys; - my $name = $jname . '_' . join('_',@vals); #$tag . '_' . $keys; - - push( @fulldirs, "" . $fn ); - - - # filters - $filters{ join(',', @filt) } = 1; - - - #system "sh $fn/sh.post" if -e "$fn/sh.post";# && !(-e "$fn/.post"); - - if (-e "$fn/.done") { - print "already done.\n"; - return; - } - system "rm -r $fn" if $clean && -d "$fn"; - system "mkdir $fn" unless -d "$fn"; - system "mkdir $fn/out" unless -d "$fn/out"; - - my $e = './newsyn'; - #$e = './tcpsynobfs' if $h->{'fs'} eq 'obfs'; - my $c = "$e"; - $c .= " --mkfs" unless $h->{'_no_mkfs'}; - - for my $k (keys %$h) { - next if $k =~ /^_/; - next if $h->{'_noarg'} && grep {$k eq $_} @{$h->{'_noarg'}}; - next if $h->{'_subst'} && grep {$k eq $_} @{$h->{'_subst'}}; - $c .= " --$k $h->{$k}"; - } - - if ($h->{'_custom'}) { - if ($h->{'_subst'}) { - for my $var (@{$h->{'_subst'}}) { - $h->{'_custom'} =~ s/\$$var/$h->{$var}/g; - } - } - $c .= ' ' . $h->{'_custom'}; - } - - $c .= " --log_name $relout/$keys"; - $c .= " --doutdir log/$relout/$keys/out"; - - my $post = "#!/bin/sh -script/sum.pl -start $h->{'_start'} -end $h->{'_end'} $fn/osd\\* > $fn/sum.osd -script/sum.pl -start $h->{'_start'} -end $h->{'_end'} $fn/mds? $fn/mds?? > $fn/sum.mds -script/sum.pl -start $h->{'_start'} -end $h->{'_end'} $fn/mds*.log > $fn/sum.mds.log -script/sum.pl -start $h->{'_start'} -end $h->{'_end'} $fn/clnode* > $fn/sum.cl -touch $fn/.post -"; - open(O,">$fn/sh.post"); - print O $post; - close O; - - my $killmin; - if ($h->{'_kill_after'}) { - $killmin = 1 + int ($h->{'_kill_after'} / 60); - $killmin = "-t $killmin"; - } - - $c = "bash -c \"ulimit -c 0 ; $c\""; - #$c = "bash -c \"$c\""; - - #print "h keys are " . join(' ', sort keys %$h) . "\n"; - - my $srun = "srun --wait=600 -x jobs/ltest.ignore -l $killmin -N $h->{'_n'} -p ltest"; - my $mpiexec = "mpiexec -l -n $h->{'_n'}"; - my $launch; - if ($use_srun) { - $launch = $srun; - } else { - $launch = $mpiexec; - } - - if ($sim->{'_psub'}) { - # template! 
- my $tp = `cat $sim->{'_psub'}`; - $tp =~ s/\$CWD/$cwd/g; - $tp =~ s/\$NAME/$name/g; - $tp =~ s/\$NUM/$h->{'_n'}/g; - $tp =~ s/\$OUT/$fn\/o/g; - $tp =~ s/\$DONE/$fn\/.done/g; - $tp =~ s/\$CMD/$c/g; - open(O,">$out/$name"); - print O $tp; - close O; - print "\npsub $out/$name\n"; - return; - } else { - # run - my $cmd = "\n$launch $c > $fn/o && touch $fn/.done";# - #my $cmd = "\n$launch $c > $fn/o ; touch $fn/.done"; - print "$cmd $nobg\n"; - my $r = undef; - unless ($fake) { - if ($sim->{'_pre'}) { - print "pre: $launch $sim->{'_pre'}\n"; - system "$launch $sim->{'_pre'}"; - } - $r = system $cmd; - if ($sim->{'_post'}) { - print "post: $launch $sim->{'_post'}\n"; - system "$launch $sim->{'_post'}"; - } - if ($r) { - print "r = $r\n"; - #&reset; - } - system "sh $fn/sh.post"; - } - return $r; - } -} - - - -my @r = &iterate($sim); -my $n = scalar(@r); -my $c = 1; -my %r; -my $nfailed = 0; -for my $h (@r) { - my $d = `date`; - chomp($d); - $d =~ s/ P.T .*//; - print "# === $c/$n"; - print " ($nfailed failed)" if $nfailed; - print " $d: "; - my $r = &run($h); - - if (!(defined $r)) { - # already done - } else { - if ($r) { - $nfailed++; - } - print "sleep $h->{'_sleep'}\n"; - sleep $h->{'_sleep'}; - } - - $c++; -} -print "$nfailed failed\n"; - - -my @comb; -if ($comb) { - my $x = $comb->{'x'}; - my @vars = @{$comb->{'vars'}}; - - print "\n\n# post\n"; - for my $p (@fulldirs) { - print "sh $p/sh.post\n"; - } - - my @filters = sort keys %filters; - my $cmd = "script/comb.pl $x @vars - @fulldirs - @filters > $out/c"; - print "$cmd\n"; - open(O,">$out/comb"); - print O "$cmd\n"; - close O; - system $cmd; - - print "\n\n"; - - my $plot; - $plot .= "set data style linespoints;\n"; - my $s = 2; - for my $v (@vars) { - my $c = $s; - $s++; - my @p; - for my $f (@filters) { - my $t = $f; - if ($comb->{'maptitle'}) { - for my $a (keys %{$comb->{'maptitle'}}) { - my $b = $comb->{'maptitle'}->{$a}; - $t =~ s/$a/$b/; - } - } - push (@p, "\"$out/c\" u 1:$c t \"$t\"" ); - $c += 
scalar(@vars); - } - $plot .= "# $v\nplot " . join(", ", @p) . ";\n\n"; - } - print $plot; - open(O,">$out/plot"); - print O $plot; - close O; -} - diff --git a/trunk/ceph/script/runset.pl b/trunk/ceph/script/runset.pl deleted file mode 100755 index 966cf4e5100cb..0000000000000 --- a/trunk/ceph/script/runset.pl +++ /dev/null @@ -1,380 +0,0 @@ -#!/usr/bin/perl - -use strict; -use Data::Dumper; - -=item sample input file - -# hi there -{ - # startup - 'n' => 30, # mpi nodes - 'sleep' => 10, # seconds between runs - 'nummds' => 1, - 'numosd' => 8, - 'numclient' => 400,#[10, 50, 100, 200, 400], - - # parameters - 'fs' => [ 'ebofs', 'fakestore' ], - 'until' => 150, # --syn until $n ... when to stop clients - 'writefile' => 1, - 'writefile_size' => [ 4096, 65526, 256000, 1024000, 2560000 ], - 'writefile_mb' => 1000, - - 'custom' => '--tcp_skip_rank0 --osd_maxthreads 0'; - - # for final summation (script/sum.pl) - 'start' => 30, - 'end' => 120, - - '_psub' => 'alc.tp' # switch to psub mode! -}; - -=cut - -my $usage = "script/runset.pl [--clean] jobs/some/job blah\n"; - -my $clean; -my $use_srun; -my $nobg = '&'; -my $in = shift || die $usage; -if ($in eq '--clean') { - $clean = 1; - $in = shift || die $usage; -} -if ($in eq '--srun') { - $use_srun = 1; - $in = shift || die $usage; -} -if ($in eq '--nobg') { - $nobg = ''; - $in = shift || die $usage; -} -my $tag = shift || die $usage; -my $fake = shift; - - -my ($job) = $in =~ /^jobs\/(.*)/; -my ($jname) = $job =~ /\/(\w+)$/; -$jname ||= $job; -die "not jobs/?" 
unless defined $job; -my $out = "log/$job.$tag"; -my $relout = "$job.$tag"; - - -my $cwd = `/bin/pwd`; -chomp($cwd); - - - -print "# --- job $job, tag $tag ---\n"; - - -# get input -my $raw = `cat $in`; -my $sim = eval $raw; -unless (ref $sim) { - print "bad input: $in\n"; - system "perl -c $in"; - exit 1; -} - -# prep output -system "mkdir -p $out" unless -d "$out"; - -open(W, ">$out/in"); -print W $raw; -close W; - -my $comb = $sim->{'comb'}; -delete $sim->{'comb'}; -my %filters; -my @fulldirs; - - - -sub reset { - print "reset: restarting mpd in 3 seconds\n"; - system "sleep 3 && (mpiexec -l -n 32 killall newsyn ; restartmpd.sh)"; - print "reset: done\n"; -} - - -if (`hostname` =~ /alc/ && !$use_srun) { - print "# this looks like alc\n"; - $sim->{'_psub'} = 'jobs/alc.tp'; -} - - -sub iterate { - my $sim = shift @_; - my $fix = shift @_ || {}; - my $vary; - my @r; - - my $this; - for my $k (sort keys %$sim) { - next if $k =~ /^_/; - if (defined $fix->{$k}) { - $this->{$k} = $fix->{$k}; - } - elsif (ref $sim->{$k} eq 'HASH') { - # nothing - } - elsif (!(ref $sim->{$k})) { - $this->{$k} = $sim->{$k}; - } - else { - #print ref $sim->{$k}; - if (!(defined $vary)) { - $vary = $k; - } - } - } - - if ($vary) { - #print "vary $vary\n"; - for my $v (@{$sim->{$vary}}) { - $this->{$vary} = $v; - push(@r, &iterate($sim, $this)); - } - } else { - - if ($sim->{'_dep'}) { - my @s = @{$sim->{'_dep'}}; - while (@s) { - my $dv = shift @s; - my $eq = shift @s; - - $eq =~ s/\$(\w+)/"\$this->{'$1'}"/eg; - $this->{$dv} = eval $eq; - #print "$dv : $eq -> $this->{$dv}\n"; - } - } - - push(@r, $this); - } - return @r; -} - - - -sub run { - my $h = shift @_; - - my @fn; - my @filt; - my @vals; - for my $k (sort keys %$sim) { - next if $k =~ /^_/; - next unless ref $sim->{$k} eq 'ARRAY'; - push(@fn, "$k=$h->{$k}"); - push(@vals, $h->{$k}); - next if $comb && $k eq $comb->{'x'}; - push(@filt, "$k=$h->{$k}"); - } - my $keys = join(",", @fn); - $keys =~ s/ /_/g; - my $fn = $out . '/' . 
$keys; - my $name = $jname . '_' . join('_',@vals); #$tag . '_' . $keys; - - push( @fulldirs, "" . $fn ); - - - # filters - $filters{ join(',', @filt) } = 1; - - - #system "sh $fn/sh.post" if -e "$fn/sh.post";# && !(-e "$fn/.post"); - if (-e "$fn/.done") { - print "already done.\n"; - return; - } - system "rm -r $fn" if $clean && -d "$fn"; - system "mkdir $fn" unless -d "$fn"; - - my $e = './newsyn'; - #$e = './tcpsynobfs' if $h->{'fs'} eq 'obfs'; - my $c = "$e"; - $c .= " --mkfs" unless $h->{'no_mkfs'}; - $c .= " --$h->{'fs'}" if $h->{'fs'}; - $c .= " --syn until $h->{'until'}" if $h->{'until'}; - - $c .= " --syn writefile $h->{'writefile_mb'} $h->{'writefile_size'}" if $h->{'writefile'}; - $c .= " --syn rw $h->{'rw_mb'} $h->{'rw_size'}" if $h->{'rw'}; - $c .= " --syn readfile $h->{'readfile_mb'} $h->{'readfile_size'}" if $h->{'readfile'}; - $c .= " --syn makedirs $h->{'makedirs_dirs'} $h->{'makedirs_files'} $h->{'makedirs_depth'}" if $h->{'makedirs'}; - - if ($h->{'ebofs_freelist'}) { - system "cp freelist/ebofs.freelist.$h->{'ebofs_freelist'} ebofs.freelist"; - $c .= " --osd_age_time -1"; - } - - for my $k ('nummds', 'numclient', 'numosd', 'kill_after', - 'osd_maxthreads', 'osd_object_layout', 'osd_pg_layout','osd_pg_bits', - 'mds_bal_rep', 'mds_bal_interval', 'mds_bal_max','mds_decay_halflife', - 'mds_bal_hash_rd','mds_bal_hash_wr','mds_bal_unhash_rd','mds_bal_unhash_wr', - 'mds_cache_size','mds_log_max_len', - 'mds_local_osd', - 'osd_age_time','osd_age', - 'osd_rep', - 'osd_pad_pg_log','ebofs_realloc', - 'osd_balance_reads', - 'tcp_multi_out', - 'client_cache_stat_ttl','client_cache_readdir_ttl', - 'client_oc', - 'fake_osdmap_updates', - 'bdev_el_bidir', 'ebofs_idle_commit_ms', 'ebofs_commit_ms', - 'ebofs_oc_size','ebofs_cc_size','ebofs_bc_size','ebofs_bc_max_dirty','ebofs_abp_max_alloc', - 'file_layout_ssize','file_layout_scount','file_layout_osize','file_layout_num_rep', - 
'meta_dir_layout_ssize','meta_dir_layout_scount','meta_dir_layout_osize','meta_dir_layout_num_rep', - 'meta_log_layout_ssize','meta_log_layout_scount','meta_log_layout_osize','meta_log_layout_num_rep') { - $c .= " --$k $h->{$k}" if defined $h->{$k}; - } - - $c .= ' ' . $h->{'custom'} if $h->{'custom'}; - - $c .= " --log_name $relout/$keys"; - - my $post = "#!/bin/sh -script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/osd\\* > $fn/sum.osd -script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/mds? $fn/mds?? > $fn/sum.mds -script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/mds*.log > $fn/sum.mds.log -script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/clnode* > $fn/sum.cl -touch $fn/.post -"; - open(O,">$fn/sh.post"); - print O $post; - close O; - - my $killmin = 1 + int ($h->{'kill_after'} / 60); - - $c = "bash -c \"ulimit -c 0 ; $c\""; - #$c = "bash -c \"$c\""; - - my $srun = "srun --wait=600 --exclude=jobs/ltest.ignore -l -t $killmin -N $h->{'n'} -p ltest"; - my $mpiexec = "mpiexec -l -n $h->{'n'}"; - my $launch; - if ($use_srun) { - $launch = $srun; - } else { - $launch = $mpiexec; - } - - if ($sim->{'_psub'}) { - # template! 
- my $tp = `cat $sim->{'_psub'}`; - $tp =~ s/\$CWD/$cwd/g; - $tp =~ s/\$NAME/$name/g; - $tp =~ s/\$NUM/$h->{'n'}/g; - $tp =~ s/\$OUT/$fn\/o/g; - $tp =~ s/\$DONE/$fn\/.done/g; - $tp =~ s/\$CMD/$c/g; - open(O,">$out/$name"); - print O $tp; - close O; - print "\npsub $out/$name\n"; - return; - } else { - # run - my $cmd = "\n$launch $c > $fn/o && touch $fn/.done";# - #my $cmd = "\n$launch $c > $fn/o ; touch $fn/.done"; - print "$cmd $nobg\n"; - my $r = undef; - unless ($fake) { - if ($sim->{'_pre'}) { - print "pre: $launch $sim->{'_pre'}\n"; - system "$launch $sim->{'_pre'}"; - } - $r = system $cmd; - if ($sim->{'_post'}) { - print "post: $launch $sim->{'_post'}\n"; - system "$launch $sim->{'_post'}"; - } - if ($r) { - print "r = $r\n"; - #&reset; - } - system "sh $fn/sh.post"; - } - return $r; - } -} - - - -my @r = &iterate($sim); -my $n = scalar(@r); -my $c = 1; -my %r; -my $nfailed = 0; -for my $h (@r) { - my $d = `date`; - chomp($d); - $d =~ s/ P.T .*//; - print "# === $c/$n"; - print " ($nfailed failed)" if $nfailed; - print " $d: "; - my $r = &run($h); - - if (!(defined $r)) { - # already done - } else { - if ($r) { - $nfailed++; - } - print "sleep $h->{'sleep'}\n"; - sleep $h->{'sleep'}; - } - - $c++; -} -print "$nfailed failed\n"; - - -my @comb; -if ($comb) { - my $x = $comb->{'x'}; - my @vars = @{$comb->{'vars'}}; - - print "\n\n# post\n"; - for my $p (@fulldirs) { - print "sh $p/sh.post\n"; - } - - my @filters = sort keys %filters; - my $cmd = "script/comb.pl $x @vars - @fulldirs - @filters > $out/c"; - print "$cmd\n"; - open(O,">$out/comb"); - print O "$cmd\n"; - close O; - system $cmd; - - print "\n\n"; - - my $plot; - $plot .= "set data style linespoints;\n"; - my $s = 2; - for my $v (@vars) { - my $c = $s; - $s++; - my @p; - for my $f (@filters) { - my $t = $f; - if ($comb->{'maptitle'}) { - for my $a (keys %{$comb->{'maptitle'}}) { - my $b = $comb->{'maptitle'}->{$a}; - $t =~ s/$a/$b/; - } - } - push (@p, "\"$out/c\" u 1:$c t \"$t\"" ); - $c += 
scalar(@vars); - } - $plot .= "# $v\nplot " . join(", ", @p) . ";\n\n"; - } - print $plot; - open(O,">$out/plot"); - print O $plot; - close O; -} - diff --git a/trunk/ceph/script/smooth.pl b/trunk/ceph/script/smooth.pl deleted file mode 100755 index 6cfbaf60ff921..0000000000000 --- a/trunk/ceph/script/smooth.pl +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/perl - -my $n = shift @ARGV || 2; - -my %v; # t -> [..] -while (<>) { - chomp; - my @l = split(/\t/,$_); - my $t = shift @l; - if (int $t) { - $v{$t} = \@l; - } else { - print "$_\n"; - } -} - -for my $t (sort {$a <=> $b} keys %v) { - my $s = $t - $n/2; - my @v; - my $c = 0; - for (my $a=0; $a < $n; $a++) { - my $x = $t + $a; - next unless ($v{$x}); - my @o = @{$v{$x}}; - #print "$t: $x o @o\n"; - if (@v) { - for (my $y=0; $y<=$#o; $y++) { - $v[$y] += $o[$y]; - } - } else { - @v = @o; - } - #print "$t: $x v @v\n"; - $c++; - } - print "$t"; - for my $sum (@v) { - print "\t" . ($sum / $c); - } - print "\n"; -} diff --git a/trunk/ceph/script/study_find.pl b/trunk/ceph/script/study_find.pl deleted file mode 100755 index 6e6cccdf37c89..0000000000000 --- a/trunk/ceph/script/study_find.pl +++ /dev/null @@ -1,224 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my $name = shift @ARGV || die; - -my $nfiles = 0; -my $ndirs = 0; -my $nreg = 0; -my $nhardlinks = 0; -my %nlinks; -my %ino_nlinks; -my %names; -my %dirsize; - -my %fnlen; - -my %hdepth; - -my $bytes; -my $ebytes; - -# -# output generated with -# -# find . -path ./.snapshot -prune -o -exec ls -dilsn --time-style=+%s \{\} \; -# -# find output looks like this: -#4495744 4 drwxrwxrwx 24 0 0 4096 1187290970 . 
-#2996320 8 drwxr-xr-x 189 0 1000 8192 1186594257 ./jangle -#28378499 4 drwxr-x--x 4 1068885 52673 4096 1162938122 ./jangle/cymcruise -#28378500 4 drwx--S--- 5 1068885 52673 4096 1162938122 ./jangle/cymcruise/Maildir -#28378501 4 drwx------ 2 1068885 52673 4096 1162938122 ./jangle/cymcruise/Maildir/tmp -#28378502 4 drwx------ 2 1068885 52673 4096 1162938122 ./jangle/cymcruise/Maildir/new -#28378503 4 drwx------ 2 1068885 52673 4096 1162938122 ./jangle/cymcruise/Maildir/cur -#28378504 4 -rw-r--r-- 1 1068885 52673 260 943743700 ./jangle/cymcruise/.alias -#999425 4 drwxr-xr-x 92 1125 100 4096 1186523060 . -#999426 0 lrwxrwxrwx 1 0 0 5 1177701093 ./root -> /root -#1015809 4 drwxr-xr-x 4 1289 1000 4096 1174584949 ./andrea -#541007 4 drwxr-xr-x 3 0 0 4096 1173111449 ./andrea/lux -#5014055 4 drwx--S--- 11 70228 51207 4096 1172250346 ./andrea/lux/Maildir - -# dirs we're currently counting in -my %numindir; - -sub finish_dir { - my $curdir = shift @_; - #print "finish_dir $numindir{$curdir} in $curdir\n"; - $dirsize{$numindir{$curdir}}++; - $ndirs++; - delete $numindir{$curdir}; -} - -my $curdir; -while (<>) { - #print; - chomp; - my ($ino, $blah, $mode, $nlink, $uid, $gid, $size, $mtime, @path) = split(/[ ]+/,$_); - my $file = join(' ',@path); - ($file) = split(/ \-\> /, $file); # ignore symlink dest - my @bits = split(/\//, $file); - my $depth = scalar(@bits); - my $f = pop @bits; - my $dir = join('/', @bits); - #print "file = '$file', dir = '$dir', curdir = '$curdir'\n"; - - if ($dir ne $curdir) { - for my $d (keys %numindir) { - #print "? $d vs $dir\n"; - &finish_dir($d) if ($d ne substr($dir, 0, length($d))); - } - $curdir = $dir; - } - - my $esize = 0; - $esize = int (($size-1)/4096)*4096 + 4096 if $size > 0; - $esize += 160; # for the inode? 
- $bytes += $size; - $ebytes += $esize; - - $nfiles++; - $numindir{$dir}++; - - $hdepth{$depth}++; - - my $fnlen = length($f); - $fnlen{$fnlen}++; - - if ($mode =~ /^d/) { - # find does depth-first search, so assume we descend, so that on empty dir we "back out" above and &finish_dir. - $numindir{$file} = 0; - $curdir = $file; - } else { - $nreg++ if $mode =~ /^f/; - if ($nlink > 1) { - #system "ls -aldi $file"; - $nhardlinks++; - $nlinks{$nlink}++; - $ino_nlinks{$ino} = $nlink; - push(@{$names{$ino}->{$dir}}, $file); - } - } -} -for my $d (keys %numindir) { - &finish_dir($d); -} - - - -my $nsamedir = 0; -open(LOG, ">$name.log"); -my %dirmap; # from dir -> to dir -for my $ino (keys %names) { - print LOG "# $ino\n"; - my @dirs = keys %{$names{$ino}}; - my $insamedir = 1 if scalar(@dirs) == 1; - for my $dir (@dirs) { - print LOG "#\t$dir\n"; - for my $fn (@{$names{$ino}->{$dir}}) { - print LOG "#\t\t$fn\n"; - $nsamedir++ if $insamedir; - } - } - - # stick in dirmap - for (my $i=0; $i<$#dirs; $i++) { - for (my $j=1; $j <= $#dirs; $j++) { - print LOG "# $dirs[$i] <-> $dirs[$j]\n"; - push(@{$dirmap{$dirs[$i]}->{$dirs[$j]}}, $ino); - push(@{$dirmap{$dirs[$j]}->{$dirs[$i]}}, $ino); - } - } -} - - -my $notherinsamedir = 0; -my $notherinsamedirs = 0; -for my $ino (keys %names) { - my @dirs = keys %{$names{$ino}}; - next unless (scalar(@dirs) > 1); - my $n = 0; - my $np = 0; - for (my $i=0; $i<$#dirs; $i++) { - for (my $j=$i+1; $j <= $#dirs; $j++) { - $np++; - if (scalar(@{$dirmap{$dirs[$i]}->{$dirs[$j]}}) > 1 || - scalar(@{$dirmap{$dirs[$j]}->{$dirs[$i]}}) > 1) { - $n++; - #print LOG "# $ino is not alone between $dirs[$i] and $dirs[$j] : @{$dirmap{$dirs[$j]}->{$dirs[$i]}}\n"; - } - } - } - if ($n) { - print LOG "# $ino\tfor $n / $np dir pairs, there is another hl between the same pair of dirs\n"; - $notherinsamedir += $ino_nlinks{$ino}; - $notherinsamedirs += ($n / $np) * $ino_nlinks{$ino}; - } else { - print LOG "# $ino is ALL ALONE\n"; - } -} -close LOG; 
-$notherinsamedirs = sprintf("%.1f",$notherinsamedirs); - - -sub do_cdf { - my $hash = shift @_; - my $num = shift @_; - my $fn = shift @_; - - open(CDF, ">$fn") if $fn; - print CDF "# $name\n"; - - my $median; - my $sum = 0; - my $p = 0; - my $lastv = 0; - for my $v (sort {$a <=> $b} keys %$hash) { - print CDF "$v\t$hash->{$v}\n"; - $p += $hash->{$v}; - $sum += $hash->{$v} * $v; - if (!(defined $median) && - $p >= ($num/2)) { - $median = $v; - } - } - if ($p != $num) { - warn "uh oh, BUG, $p != $num in cdf/median calculation\n"; - } - my $avg = sprintf("%.2f", $sum/$num); - print CDF "# avg $avg, median $median, sum $sum, num $num\n"; - return ($avg, $median); -} -close DSLOG; - - -# do cdfs -my ($avgdirsize, $mediandirsize) = &do_cdf(\%dirsize, $ndirs, "$name.ds"); -my ($avgfnlen, $medianfnlen) = &do_cdf(\%fnlen, $nfiles, "$name.fnlen"); -my ($avgdepth, $mediandepth) = &do_cdf(\%hdepth, $nfiles, "$name.hdepth"); - - -# stat fs -#my $df = `df $base`; -#my $line = (split(/\n/,$df))[1]; # second line -#my ($kb) = $df =~ /\s+\d+\s+(\d+)/; -my $gb = sprintf("%.1f",($ebytes / 1024 / 1024 / 1024)); - -open(O, ">$name.sum"); - -# final line -my $pad = '# ' . 
(' ' x (length($name)-2)); -print O "$pad\tgb\tfiles\tdirs\tdsavg\tdsmed\tfnavg\tfnmed\treg\tnl>1\tsmdr\tothers\totherss\tnlink=2\t=3\t=4\t...\n"; -print O "$name\t$gb\t$nfiles\t$ndirs\t$avgdirsize\t$mediandirsize\t$avgfnlen\t$medianfnlen\t$nreg\t$nhardlinks\t$nsamedir\t$notherinsamedir\t$notherinsamedirs"; -my $i = 2; -for (sort {$a <=> $b} keys %nlinks) { - while ($_ < $i) { - print O "\t0"; - } - print O "\t$nlinks{$_}"; - $i = $_ + 1; -} -print O "\n"; - -close O; diff --git a/trunk/ceph/script/study_hardlink_lifetimes.pl b/trunk/ceph/script/study_hardlink_lifetimes.pl deleted file mode 100755 index 012ef6009bb43..0000000000000 --- a/trunk/ceph/script/study_hardlink_lifetimes.pl +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my %ns; # parent -> fn -> ino -my %nlink; # num links to each ino -my %since; # when it got its second link - -my @ignore = ('ll_getattr','ll_setattr','ll_forget','ll_fsync','ll_readlink','ll_statfs','ll_opendir','ll_releasedir','ll_flush','ll_release','ll_open','ll_read','ll_write'); - -my $when; - -my $sumage; -my $numage; - -sub unlink { - my ($p,$n) = @_; - my $i = $ns{$p}->{$n}; - my $new = --$nlink{$i}; - if ($new == 1) { - my $age = $when - $since{$i}; - #print "$since{$i} to $when on $i\t$age\n"; - delete $since{$i}; - - $numage++; - $sumage += $age; - - } elsif ($new == 0) { - delete $nlink{$i}; - } - delete $ns{$p}->{$n}; -} - - -my ($sec, $usec, $cmd); -$_ = <>; -while (1) { - # read trace record - chomp; - last unless $_ eq '@'; - - chomp(my $sec = <>); - chomp(my $usec = <>); - $when = sprintf("%d.%06d",$sec,$usec);# + ($usec / 1000000); - #$when = "$sec.$usec"; - - chomp($cmd = <>); - - #print "cmd $cmd\n"; - - if ($cmd eq 'll_lookup') { - chomp(my $p = <>); - chomp(my $n = <>); - chomp(my $r = <>); - $ns{$p}->{$n} = $r; - } - - elsif ($cmd eq 'll_create') { - chomp(my $p = <>); - chomp(my $n = <>); - <>; <>; <>; - chomp(my $r = <>); - $ns{$p}->{$n} = $r; - $nlink{$r} = 1; - } - elsif ($cmd eq 'll_mknod') { 
- chomp(my $p = <>); - chomp(my $n = <>); - <>; <>; - chomp(my $r = <>); - $ns{$p}->{$n} = $r; - $nlink{$r} = 1; - } - elsif ($cmd eq 'll_mkdir') { - chomp(my $p = <>); - chomp(my $n = <>); - <>; - chomp(my $r = <>); - $ns{$p}->{$n} = $r; - $nlink{$r} = 1; - } - elsif ($cmd eq 'll_symlink') { - chomp(my $p = <>); - chomp(my $n = <>); - <>; - chomp(my $r = <>); - $ns{$p}->{$n} = $r; - $nlink{$r} = 1; - } - elsif ($cmd eq 'll_link') { - chomp(my $i = <>); - chomp(my $p = <>); - chomp(my $n = <>); - $ns{$p}->{$n} = $i; - if (++$nlink{$i} == 2) { - $since{$i} = $when; - } - } - elsif ($cmd eq 'll_unlink' || - $cmd eq 'll_rmdir') { - chomp(my $p = <>); - chomp(my $n = <>); - &unlink($p, $n); - } - elsif ($cmd eq 'll_rename') { - chomp(my $p = <>); - chomp(my $n = <>); - chomp(my $np = <>); - chomp(my $nn = <>); - if ($ns{$np}->{$nn}) { - &unlink($np, $nn); - } - $ns{$np}->{$nn} = $ns{$p}->{$n}; - delete $ns{$p}->{$n}; - } - - # skip to @ - while (<>) { - last if $_ eq "@\n"; - print "$cmd: $_" - unless grep {$_ eq $cmd} @ignore; - } -} - -print "num $numage .. sum $sumage .. avg lifetime " . ($sumage / $numage) . 
"\n"; - -# dump hard link inos -for my $ino (keys %nlink) { - next if $nlink{$ino} < 2; - print "$ino\t$nlink{$ino}\n"; -} diff --git a/trunk/ceph/script/study_lookups.pl b/trunk/ceph/script/study_lookups.pl deleted file mode 100644 index 7a0784f3210a4..0000000000000 --- a/trunk/ceph/script/study_lookups.pl +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/perl - -use strict; - -my @buckets = (1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096); - -my %dir_miss; # dir(ino) -> last lookup miss -my %dir_has; # ino -> dentries we have - - -my %ns; # parent -> fn -> ino -my %nlink; # num links to each ino -my %since; # when it got its second link - -my @ignore = ('ll_getattr','ll_setattr','ll_forget','ll_fsync','ll_readlink','ll_statfs','ll_opendir','ll_releasedir','ll_flush','ll_release','ll_open','ll_read','ll_write'); - -my $when; - -my $sumage; -my $numage; - -sub unlink { - my ($p,$n) = @_; - my $i = $ns{$p}->{$n}; - my $new = --$nlink{$i}; - if ($new == 1) { - my $age = $when - $since{$i}; - #print "$since{$i} to $when on $i\t$age\n"; - delete $since{$i}; - - $numage++; - $sumage += $age; - - } elsif ($new == 0) { - delete $nlink{$i}; - } - delete $ns{$p}->{$n}; -} - - -my ($sec, $usec, $cmd); -$_ = <>; -while (1) { - # read trace record - chomp; - last unless $_ eq '@'; - - chomp(my $sec = <>); - chomp(my $usec = <>); - $when = sprintf("%d.%06d",$sec,$usec);# + ($usec / 1000000); - #$when = "$sec.$usec"; - - chomp($cmd = <>); - - #print "cmd $cmd\n"; - - if ($cmd eq 'll_lookup') { - chomp(my $p = <>); - chomp(my $n = <>); - chomp(my $r = <>); - $ns{$p}->{$n} = $r; - } - - elsif ($cmd eq 'll_create') { - chomp(my $p = <>); - chomp(my $n = <>); - <>; <>; <>; - chomp(my $r = <>); - $ns{$p}->{$n} = $r; - $nlink{$r} = 1; - } - elsif ($cmd eq 'll_mknod') { - chomp(my $p = <>); - chomp(my $n = <>); - <>; <>; - chomp(my $r = <>); - $ns{$p}->{$n} = $r; - $nlink{$r} = 1; - } - elsif ($cmd eq 'll_mkdir') { - chomp(my $p = <>); - chomp(my $n = <>); - <>; - chomp(my $r = <>); 
- $ns{$p}->{$n} = $r; - $nlink{$r} = 1; - } - elsif ($cmd eq 'll_symlink') { - chomp(my $p = <>); - chomp(my $n = <>); - <>; - chomp(my $r = <>); - $ns{$p}->{$n} = $r; - $nlink{$r} = 1; - } - elsif ($cmd eq 'll_link') { - chomp(my $i = <>); - chomp(my $p = <>); - chomp(my $n = <>); - $ns{$p}->{$n} = $i; - if (++$nlink{$i} == 2) { - $since{$i} = $when; - } - } - elsif ($cmd eq 'll_unlink' || - $cmd eq 'll_rmdir') { - chomp(my $p = <>); - chomp(my $n = <>); - &unlink($p, $n); - } - elsif ($cmd eq 'll_rename') { - chomp(my $p = <>); - chomp(my $n = <>); - chomp(my $np = <>); - chomp(my $nn = <>); - if ($ns{$np}->{$nn}) { - &unlink($np, $nn); - } - $ns{$np}->{$nn} = $ns{$p}->{$n}; - delete $ns{$p}->{$n}; - } - - # skip to @ - while (<>) { - last if $_ eq "@\n"; - print "$cmd: $_" - unless grep {$_ eq $cmd} @ignore; - } -} - -print "num $numage .. sum $sumage .. avg lifetime " . ($sumage / $numage) . "\n"; - -# dump hard link inos -for my $ino (keys %nlink) { - next if $nlink{$ino} < 2; - print "$ino\t$nlink{$ino}\n"; -} diff --git a/trunk/ceph/script/sum.pl b/trunk/ceph/script/sum.pl deleted file mode 100755 index 92ef9a9b222a8..0000000000000 --- a/trunk/ceph/script/sum.pl +++ /dev/null @@ -1,148 +0,0 @@ -#!/usr/bin/perl - -use strict; -my $starttime = 1; -my $endtime = -1; - -my $avgrows = 0; - -while ($ARGV[0] =~ /^-/) { - $_ = shift @ARGV; - if ($_ eq '-avg') { - $avgrows = 1; - } - elsif ($_ eq '-start') { - $starttime = shift @ARGV; - } - elsif ($_ eq '-end') { - $endtime = shift @ARGV; - } - else { - die "i don't understand arg $_"; - } -} -my @files = @ARGV; - -if (scalar(@files) == 1 && $files[0] =~ /\*/) { - my ($dir, $pat) = $files[0] =~ /^(.*)\/([^\/]+)$/; - @files = (); - $pat =~ s/\*//; -# print "dir $dir pat $pat\n"; - opendir(D,"$dir"); - for my $f (readdir(D)) { - # print "$f\n"; - next unless $f =~ /^$pat/; - push(@files, "$dir/$f"); - } - closedir(D); - -# print "files = @files\n"; -} - -my @data; -for my $f (@files) { - open(I,$f); - push( @data, ); 
- close I; -} - -my %sum; # time -> name -> val -my %col; # colnum -> name .. colnums start at 0 (time doesn't count) -my %min; -my %max; -my %avg; -my %tcount; -my $files; -for (@data) { - chomp; - my @r = split(/\s+/,$_); - my $r = shift @r; - - # column headings? - if ($r =~ /^\#/) { - my $num = 0; - while (my $name = shift @r) { - $col{$num} = $name; - $num++; - } - next; - } - - next unless int $r; - next if $r < $starttime; - next if $endtime > 0 && $r > $endtime; - - $tcount{$r}++; - $files = $tcount{$r} if $tcount{$r} > $files; - #print "$r: @r\n"; - my $i = 0; - while (@r) { - my $v = shift @r; - $sum{$r}->{$col{$i}} += $v; # if $v > 0; - - $min{$col{$i}} = $v - if ($min{$col{$i}} > $v || !(defined $min{$col{$i}})); - $max{$col{$i}} = $v - if ($max{$col{$i}} < $v); - - $avg{$col{$i}} += $v; - $i++; - } -} - -## dump -my @c = sort {$a <=> $b} keys %col; -# cols -print join("\t",'#', map { $col{$_} } @c) . "\n"; -my $n = 0; -for my $k (sort {$a <=> $b} keys %sum) { - if ($avgrows) { - print join("\t",$k, #map int, - map { $sum{$k}->{$col{$_}}/$tcount{$k} } @c ) . "\n"; - } else { - print join("\t",$k, map { $sum{$k}->{$col{$_}} } @c ) . "\n"; - } - $n++; -} - -my $rows = $n || 1; -#my $files = $tcount{$starttime}; -my %avgval; - -## devt -#warn "rows $rows, files $files\n"; -my %avgvalvart; # std dev of each col avg, over time -for my $k (keys %avg) { - my $av = $avgval{$k} = $avg{$k} / ($rows*$files); - - my $var = 0.0; - for my $t (sort {$a <=> $b} keys %sum) { - my $a = $sum{$t}->{$k} / $files; - $var += ($a - $av) * ($a - $av); - } - - $avgvalvart{$k} = $var / $rows; -} - - - - -print "\n"; -print join("\t",'#', map { $col{$_} } @c) . "\n"; -print join("\t", '#minval', map { $min{$col{$_}} } @c ) . "\n"; -print join("\t", '#maxval', map { $max{$col{$_}} } @c ) . "\n"; -print join("\t", '#rows', map { $rows } @c) . "\n"; -print join("\t", '#files', map { $files } @c) . "\n"; -print join("\t", '#sum', - map { $avg{$col{$_}} } @c ) . 
"\n"; -print join("\t", '#avgval', #map int, - map { $avgval{$col{$_}} } @c ) . "\n"; -# map { ($rows*$files) ? ($_ / ($rows*$files)):0 } map { $avg{$col{$_}} } @c ) . "\n"; - -print join("\t", '#avgvalvart', - map { $avgvalvart{$col{$_}} } @c ) . "\n"; -print join("\t", '#avgvaldevt', - map { sqrt($_) } map { $avgvalvart{$col{$_}} } @c ) . "\n"; - -print join("\t", '#avgsum', #map int, - map { $_ / $rows } map { $avg{$col{$_}} } @c ) . "\n"; diff --git a/trunk/ceph/test/fakemds.cc b/trunk/ceph/test/fakemds.cc deleted file mode 100644 index b75b62d58152c..0000000000000 --- a/trunk/ceph/test/fakemds.cc +++ /dev/null @@ -1,104 +0,0 @@ - - -#include -#include -#include - -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "fakeclient/FakeClient.h" - -#include "mds/MDCluster.h" -#include "mds/MDCache.h" -#include "mds/MDStore.h" - -#include "msg/FakeMessenger.h" - -#include "messages/MPing.h" - -using namespace std; - -__uint64_t ino = 1; - - - -#include "config.h" -#define NUMMDS g_conf.num_mds -#define NUMOSD g_conf.num_osd -#define NUMCLIENT g_conf.num_fakeclient - -// this parses find output -int play(); - -int main(int oargc, char **oargv) { - cerr << "hi there" << endl; - - int argc; - char **argv; - parse_config_options(oargc, oargv, - argc, argv); - - MDCluster *mdc = new MDCluster(NUMMDS, NUMOSD); - - // local config settings - g_conf.num_client = g_conf.num_fakeclient; // to fool mds, hack gross - - // create osds - OSD *osd[NUMOSD]; - for (int i=0; iinit(); - } - - // create mds - MDS *mds[NUMMDS]; - for (int i=0; iinit(); - } - - - // create clients - FakeClient *client[NUMCLIENT]; - for (int i=0; iinit(); - } - - // mount clients - for (int i=0; imount(); - - // loop - fakemessenger_do_loop(); - - //mds[0]->shutdown_start(); - //fakemessenger_do_loop(); - - // - if (argc > 1 && - strcmp(argv[1], "nocheck") == 0) { - cerr << "---- nocheck" << endl; - } else { - cout << "---- check ----" << endl; - for (int i=0; imdcache->shutdown_pass(); - } - - // 
cleanup - cout << "cleanup" << endl; - for (int i=0; i - * Daniel Jönsson - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the Do What The Fuck You Want To - * Public License as published by Banlu Kemiyatorn. See - * http://sam.zoy.org/projects/COPYING.WTFPL for more details. - * - * Compilation example: - * gcc -shared -fPIC gprof-helper.c -o gprof-helper.so -lpthread -ldl - * - * Usage example: - * LD_PRELOAD=./gprof-helper.so your_program - */ - -#define _GNU_SOURCE -#include -#include -#include -#include -#include - -static void * wrapper_routine(void *); - -/* Original pthread function */ -static int (*pthread_create_orig)(pthread_t *__restrict, - __const pthread_attr_t *__restrict, - void *(*)(void *), - void *__restrict) = NULL; - -/* Library initialization function */ -void wooinit(void) __attribute__((constructor)); - -void wooinit(void) -{ - pthread_create_orig = dlsym(RTLD_NEXT, "pthread_create"); - fprintf(stderr, "pthreads: using profiling hooks for gprof\n"); - if(pthread_create_orig == NULL) - { - char *error = dlerror(); - if(error == NULL) - { - error = "pthread_create is NULL"; - } - fprintf(stderr, "%s\n", error); - exit(EXIT_FAILURE); - } -} - -/* Our data structure passed to the wrapper */ -typedef struct wrapper_s -{ - void * (*start_routine)(void *); - void * arg; - - pthread_mutex_t lock; - pthread_cond_t wait; - - struct itimerval itimer; - -} wrapper_t; - -/* The wrapper function in charge for setting the itimer value */ -static void * wrapper_routine(void * data) -{ - /* Put user data in thread-local variables */ - void * (*start_routine)(void *) = ((wrapper_t*)data)->start_routine; - void * arg = ((wrapper_t*)data)->arg; - - /* Set the profile timer value */ - setitimer(ITIMER_PROF, &((wrapper_t*)data)->itimer, NULL); - - /* Tell the calling thread that we don't need its data anymore */ - pthread_mutex_lock(&((wrapper_t*)data)->lock); - pthread_cond_signal(&((wrapper_t*)data)->wait); 
- pthread_mutex_unlock(&((wrapper_t*)data)->lock); - - /* Call the real function */ - return start_routine(arg); -} - -/* Our wrapper function for the real pthread_create() */ -int pthread_create(pthread_t *__restrict thread, - __const pthread_attr_t *__restrict attr, - void * (*start_routine)(void *), - void *__restrict arg) -{ - wrapper_t wrapper_data; - int i_return; - - /* Initialize the wrapper structure */ - wrapper_data.start_routine = start_routine; - wrapper_data.arg = arg; - getitimer(ITIMER_PROF, &wrapper_data.itimer); - pthread_cond_init(&wrapper_data.wait, NULL); - pthread_mutex_init(&wrapper_data.lock, NULL); - pthread_mutex_lock(&wrapper_data.lock); - - /* The real pthread_create call */ - i_return = pthread_create_orig(thread, - attr, - &wrapper_routine, - &wrapper_data); - - /* If the thread was successfully spawned, wait for the data - * to be released */ - if(i_return == 0) - { - pthread_cond_wait(&wrapper_data.wait, &wrapper_data.lock); - } - - pthread_mutex_unlock(&wrapper_data.lock); - pthread_mutex_destroy(&wrapper_data.lock); - pthread_cond_destroy(&wrapper_data.wait); - - return i_return; -} - diff --git a/trunk/ceph/test/makedirs.cc b/trunk/ceph/test/makedirs.cc deleted file mode 100644 index 8fd74d996ef9f..0000000000000 --- a/trunk/ceph/test/makedirs.cc +++ /dev/null @@ -1,38 +0,0 @@ -#include -#include -using namespace std; - -int make_dirs(const char *basedir, int dirs, int files, int depth) -{ - //if (time_to_stop()) return 0; - - // make sure base dir exists - int r = mkdir(basedir, 0755); - if (r != 0) { - cout << "can't make base dir? 
" << basedir << endl; - return -1; - } - - // children - char d[500]; - cout << "make_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << endl; - for (int i=0; i -#include -#include -using namespace std; - -#include "mds/MDCluster.h" -#include "mds/MDS.h" -#include "osd/OSD.h" -#include "fakeclient/FakeClient.h" - -#include "mds/MDCache.h" -#include "mds/MDStore.h" - -#include "msg/MPIMessenger.h" -//#include "msg/CheesySerializer.h" - -#include "messages/MPing.h" - - -__uint64_t ino = 1; - - - -#include "config.h" -#define NUMMDS g_conf.num_mds -#define NUMOSD g_conf.num_osd -#define NUMCLIENT g_conf.num_client - -// this parses find output -int play(); - -int main(int argc, char **argv) { - cout << "mpitest starting" << endl; - - int myrank = mpimessenger_init(argc, argv); - int world = mpimessenger_world(); - - - - MDCluster *mdc = new MDCluster(NUMMDS, NUMOSD); - - // create osds - OSD *osd[NUMOSD]; - for (int i=0; iinit(); - } - - // create mds - MDS *mds[NUMMDS]; - for (int i=0; iinit(); - } - - // create clients - FakeClient *client[NUMCLIENT]; - for (int i=0; iset_dispatcher(serializer); - - client[i] = new FakeClient(mdc, i, real, g_conf.fakeclient_requests); - client[i]->init(); - } - - // seed initial requests - for (int i=0; iissue_request(); - } - - mpimessenger_start(); // start message loop - mpimessenger_wait(); // wait for thread to finish - mpimessenger_shutdown(); // shutdown MPI - - // - /* - cout << "---- check ----" << endl; - for (int i=0; imdcache->shutdown_pass(); - } - */ - - // cleanup - //cout << "cleanup" << endl; - for (int i=0; i -#include "mpi.h" - -#include "messages/MClientRequest.h" -#include "msg/MTMessenger.h" -#include "include/error.h" - -#define SARG_SIZE 64 -#define SERVER_RANK 0 -#define NTHREADS 11 // number of threads per rank -#define NMESSAGES 31 // number of messages per thread - -static void server_loop(MTMessenger &msgr, int world_size) -{ - // we expect this many messages from 
clients, then we quit - // (world_size-1 since server is one of the processes). - int totmsg = NTHREADS * NMESSAGES * (world_size - 1); - int nmsg = 0; - - char buf[SARG_SIZE]; - - while(nmsg < totmsg) { - MClientRequest *req = (MClientRequest*)msgr.recvreq(); - ASSERT(req->get_type() == MSG_CLIENT_REQUEST); - - //cout << "Server acknowledging " << req->get_sarg() << endl; - - sprintf(buf, "%s reply", req->get_sarg().c_str()); - MClientRequest resp(0, 0); - resp.set_sarg(buf); - msgr.sendresp(req, &resp); - - delete req; - nmsg++; - } - - cout << "Server successful" << endl; -} - -// arguments for client thread start function (see pthread_create) -struct client_arg -{ - MTMessenger *msgr; - int rank; - int thread; -}; - -static void *client_session(void *_carg) -{ - client_arg *carg = (client_arg *)_carg; - - char buf[SARG_SIZE]; - - // repeat some number (arbitrary really) of rounds - for (int i = 0; i < NMESSAGES; i++) { - - // send the message, receive the reply and check reply is as - // expected - - MClientRequest request(0, 0); - sprintf(buf, "r%d:t%d:m%d", carg->rank, carg->thread, i); - request.set_sarg(buf); - - //cout << "Client sending " << request.get_sarg() << endl; - - MClientRequest *resp = - (MClientRequest*)carg->msgr->sendrecv(&request, SERVER_RANK); - - ASSERT(resp->get_type() == MSG_CLIENT_REQUEST); - sprintf(buf, "r%d:t%d:m%d reply", carg->rank, carg->thread, i); - ASSERT(strcmp(buf, resp->get_sarg().c_str()) == 0); - - //cout << "Client verified " << resp->get_sarg() << endl; - - delete resp; - } - - cout << "Client (" << carg->rank << "," << carg->thread - << ") successful" << endl; - - delete carg; - return NULL; -} - -static void launch_clients(MTMessenger &msgr, int rank) -{ - pthread_t tid[NTHREADS]; - - // launch some number (arbitrary really) of threads - for (int i = 0; i < NTHREADS; i++) { - - client_arg *carg = (client_arg*)malloc(sizeof(client_arg)); - ASSERT(carg); - carg->msgr = &msgr; - carg->rank = rank; - carg->thread = i; - - 
if (pthread_create(&tid[i], NULL, client_session, carg) < 0) - SYSERROR(); - } - - // we must wait for all the threads to exit before returning, - // otherwise we shutdown MPI before while the threads are - // chatting. - for (int i = 0; i < NTHREADS; i++) { - void *retval; - - if (pthread_join(tid[i], &retval) < 0) - SYSERROR(); - } -} - -int main(int argc, char **argv) -{ - MTMessenger msgr(argc, argv); - - int rank; - ASSERT(MPI_Comm_rank(MPI_COMM_WORLD, &rank) == MPI_SUCCESS); - int world_size; - ASSERT(MPI_Comm_size(MPI_COMM_WORLD, &world_size) == MPI_SUCCESS); - - if (rank == SERVER_RANK) - server_loop(msgr, world_size); - else - launch_clients(msgr, rank); - - return 0; -} diff --git a/trunk/ceph/test/rushconfig b/trunk/ceph/test/rushconfig deleted file mode 100644 index 40d82702ea0a5..0000000000000 --- a/trunk/ceph/test/rushconfig +++ /dev/null @@ -1,7 +0,0 @@ -6 -8 10.0 -4 20.0 -7 30.0 -9 10.0 -8 15.0 -5 11.0 diff --git a/trunk/ceph/test/rushtest.cc b/trunk/ceph/test/rushtest.cc deleted file mode 100644 index ecff83523e0c6..0000000000000 --- a/trunk/ceph/test/rushtest.cc +++ /dev/null @@ -1,49 +0,0 @@ -// -// $Id$ -// - -#include -#include -#include "../osd/rush.h" - -main (int argc, char *argv[]) -{ - Rush rush; - char buf[200]; - int i, j, k, numClusters; - int numKeys = 5; - int numReplicas = 4; - int curSize; - double curWeight; - int servers[1000]; - - if (argc > 1) { - numKeys = atoi (argv[1]); - } - if (argc > 2) { - numReplicas = atoi (argv[2]); - } - - fgets (buf, sizeof (buf) - 2, stdin); - sscanf (buf, "%d", &numClusters); - for (i = 0; i < numClusters; i++) { - fgets (buf, sizeof (buf) - 2, stdin); - sscanf (buf, "%d %lf", &curSize, &curWeight); - rush.AddCluster (curSize, curWeight); - if (rush.Servers () < numReplicas) { - fprintf (stderr, "ERROR: must have at least %d disks in the system!\n", - rush.Clusters ()); - exit (-1); - } - for (j = 0; j < numKeys; j++) { - rush.GetServersByKey (j, numReplicas, servers); -#if 0 - printf ("%-3d %-6d 
", i, j); - for (k = 0; k < numReplicas; k++) { - printf ("%-5d ", servers[k]); - } - putchar ('\n'); -#endif - } - } -} diff --git a/trunk/ceph/test/rushtest.cc~ b/trunk/ceph/test/rushtest.cc~ deleted file mode 100644 index 0b9512ccd0c3d..0000000000000 --- a/trunk/ceph/test/rushtest.cc~ +++ /dev/null @@ -1,49 +0,0 @@ -// -// $Id$ -// - -#include -#include -#include "rush.h" - -main (int argc, char *argv[]) -{ - Rush rush; - char buf[200]; - int i, j, k, numClusters; - int numKeys = 5; - int numReplicas = 4; - int curSize; - double curWeight; - int servers[1000]; - - if (argc > 1) { - numKeys = atoi (argv[1]); - } - if (argc > 2) { - numReplicas = atoi (argv[2]); - } - - fgets (buf, sizeof (buf) - 2, stdin); - sscanf (buf, "%d", &numClusters); - for (i = 0; i < numClusters; i++) { - fgets (buf, sizeof (buf) - 2, stdin); - sscanf (buf, "%d %lf", &curSize, &curWeight); - rush.AddCluster (curSize, curWeight); - if (rush.Servers () < numReplicas) { - fprintf (stderr, "ERROR: must have at least %d disks in the system!\n", - rush.Clusters ()); - exit (-1); - } - for (j = 0; j < numKeys; j++) { - rush.GetServersByKey (j, numReplicas, servers); -#if 0 - printf ("%-3d %-6d ", i, j); - for (k = 0; k < numReplicas; k++) { - printf ("%-5d ", servers[k]); - } - putchar ('\n'); -#endif - } - } -} diff --git a/trunk/ceph/test/test_disk_bw.cc b/trunk/ceph/test/test_disk_bw.cc deleted file mode 100644 index fc36da74fadb2..0000000000000 --- a/trunk/ceph/test/test_disk_bw.cc +++ /dev/null @@ -1,59 +0,0 @@ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "common/Clock.h" - -#include -using namespace std; - -int main(int argc, char **argv) -{ - void *buf; - int fd, count, loop = 0, ret; - - if (argc != 4) { - fprintf(stderr, "Usage: %s device bsize count\n", argv[0]); - exit (0); - } - - int bsize = atoi(argv[2]); - count = atoi(argv[3]); - - posix_memalign(&buf, sysconf(_SC_PAGESIZE), bsize); - - //if ((fd = 
open(argv[1], O_SYNC|O_RDWR)) < 0) { - if ((fd = open(argv[1], O_DIRECT|O_RDWR)) < 0) { - - fprintf(stderr, "Can't open device %s\n", argv[1]); - exit (4); - } - - - utime_t start = g_clock.now(); - while (loop++ < count) { - ret = ::write(fd, buf, bsize); - //if ((loop % 100) == 0) - //fprintf(stderr, "."); - } - ::fsync(fd); - ::close(fd); - utime_t end = g_clock.now(); - end -= start; - - - char hostname[80]; - gethostname(hostname, 80); - - double mb = bsize*count/1024/1024; - - cout << hostname << "\t" << mb << " MB\t" << end << " seconds\t" << (mb / (double)end) << " MB/sec" << std::endl; -} diff --git a/trunk/ceph/test/testbucket.cc b/trunk/ceph/test/testbucket.cc deleted file mode 100644 index d8676da18faba..0000000000000 --- a/trunk/ceph/test/testbucket.cc +++ /dev/null @@ -1,67 +0,0 @@ - - -#include "../crush/Bucket.h" -using namespace crush; - -#include -#include -using namespace std; - - -ostream& operator<<(ostream& out, vector& v) -{ - out << "["; - for (int i=0; i disks; - for (int i=0; i<20; i++) - disks.push_back(i); - - - /* - UniformBucket ub(1, 1, 0, 10, disks); - ub.make_primes(h); - cout << "primes are " << ub.primes << endl; - */ - - MixedBucket mb(2, 1); - for (int i=0;i<20;i++) - mb.add_item(i, 10); - - /* - MixedBucket b(3, 1); - b.add_item(1, ub.get_weight()); - b.add_item(2, mb.get_weight()); - */ - MixedBucket b= mb; - - vector ocount(disks.size()); - int numrep = 3; - - vector v(numrep); - for (int x=1; x<1000000; x++) { - //cout << H(x) << "\t" << h(x) << endl; - for (int i=0; i -using namespace std; - -#include "include/bufferlist.h" - - -int main() -{ - - bufferptr p1 = new buffer("123456",6); - bufferptr p2 = p1; - - cout << "it is '" << p1.c_str() << "'" << endl; - - bufferptr p3 = new buffer("abcdef",6); - - cout << "p3 is " << p3 << endl; - - bufferlist bl; - bl.push_back(p2); - bl.push_back(p1); - bl.push_back(p3); - - cout << "bl is " << bl << endl; - - cout << "len is " << bl.length() << endl; - - bufferlist took; - 
bl.splice(10,4,&took); - - cout << "took out " << took << "leftover is " << bl << endl; - //cout << "len is " << bl.length() << endl; - - bufferlist bl2; - bl2.substr_of(bl, 3, 5); - cout << "bl2 is " << bl2 << endl; - - -} diff --git a/trunk/ceph/test/testcounter.cc b/trunk/ceph/test/testcounter.cc deleted file mode 100644 index a3194489e4886..0000000000000 --- a/trunk/ceph/test/testcounter.cc +++ /dev/null @@ -1,70 +0,0 @@ - -#include "common/DecayCounter.h" - -#include -using namespace std; - -struct RealCounter { -public: - list hits; - - void hit(int ms) { - hits.push_back(ms); - } - - int get(double hl, int now) { - trim(now-hl); - return hits.size(); - } - - void trim(int to) { - while (!hits.empty() && - hits.front() < to) - hits.pop_front(); - } - - -}; - -int main(int argc, char **argv) -{ - int target; - double hl = atof(argv[1]); - cerr << "halflife " << hl << endl; - - DecayCounter dc(hl); - RealCounter rc; - - utime_t now = g_clock.now(); - - for (int ms=0; ms < 300*1000; ms++) { - if (ms % 30000 == 0) { - target = 1 + (rand() % 10) * 10; - if (ms > 200000) target = 0; - } - - if (target && - (rand() % (1000/target) == 0)) { - dc.hit(); - rc.hit(ms); - } - - if (ms % 500 == 0) dc.get(now); - if (ms % 100 == 0) { - //dc.get(now); - DecayCounter o = dc; - cout << ms << "\t" - << target*hl << "\t" - << rc.get(hl*1000, ms) << "\t" - << o.get(now) << "\t" - << dc.val << "\t" - // << dc.delta << "\t" - << o.get_last_vel() << "\t" - << o.get_last() + o.get_last_vel() << "\t" - << endl; - } - - now += .001; - } - -} diff --git a/trunk/ceph/test/testcrush.cc b/trunk/ceph/test/testcrush.cc deleted file mode 100644 index bd432b23ee95c..0000000000000 --- a/trunk/ceph/test/testcrush.cc +++ /dev/null @@ -1,266 +0,0 @@ - - -#include "../crush/crush.h" -using namespace crush; - -#include - -#include -#include -using namespace std; - -/* -ostream& operator<<(ostream& out, vector& v) -{ - out << "["; - for (int i=0; i& d) -{ - d.clear(); - while (n) { - 
d.push_back(no); - no++; - n--; - } -} - - -Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks, int& nbuckets) -{ - if (h == 0) { - // uniform - Hash hash(123); - vector disks; - for (int i=0; imake_primes(hash); - c.add_bucket(b); - //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; - return b; - } else { - // mixed - MixedBucket *b = new MixedBucket(nbuckets--, h+1); - for (int i=0; iadd_item(n->get_id(), n->get_weight()); - } - c.add_bucket(b); - //cout << h << " mixedbucket with " << wid[h] << endl; - return b; - } -} - -int make_hierarchy(Crush& c, vector& wid, int& ndisks, int& nbuckets) -{ - Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks, nbuckets); - return b->get_id(); -} - - - -int main() -{ - Hash h(73232313); - - // crush - Crush c; - - - // buckets - vector disks; - int root = -1; - int nbuckets = -1; - int ndisks = 0; - - if (0) { - make_disks(12, ndisks, disks); - UniformBucket ub1(-1, 1, 0, 30, disks); - ub1.make_primes(h); - cout << "ub1 primes are " << ub1.primes << endl; - c.add_bucket(&ub1); - - make_disks(17, ndisks, disks); - UniformBucket ub2(-2, 1, 0, 30, disks); - ub2.make_primes(h); - cout << "ub2 primes are " << ub2.primes << endl; - c.add_bucket(&ub2); - - make_disks(4, ndisks, disks); - UniformBucket ub3(-3, 1, 0, 30, disks); - ub3.make_primes(h); - cout << "ub3 primes are " << ub3.primes << endl; - c.add_bucket(&ub3); - - make_disks(20, ndisks, disks); - MixedBucket umb1(-4, 1); - for (int i=0; i<20; i++) - umb1.add_item(disks[i], 30); - c.add_bucket(&umb1); - - MixedBucket b(-100, 1); - //b.add_item(-2, ub1.get_weight()); - b.add_item(-4, umb1.get_weight()); - //b.add_item(-2, ub2.get_weight()); - //b.add_item(-3, ub3.get_weight()); - } - - if (0) { - int bucket = -1; - MixedBucket *root = new MixedBucket(bucket--, 2); - - for (int i=0; i<5; i++) { - MixedBucket *b = new MixedBucket(bucket--, 1); - - int n = 5; - - if (1) { - // add n buckets of n disks - for (int j=0; jadd_item(disks[k], 10); - - 
//b->add_item(disks[j], 10); - c.add_bucket(d); - b->add_item(d->get_id(), d->get_weight()); - } - - c.add_bucket(b); - root->add_item(b->get_id(), b->get_weight()); - } else { - // add n*n disks - make_disks(n*n, ndisks, disks); - for (int k=0; kadd_item(disks[k], 10); - - c.add_bucket(b); - root->add_item(b->get_id(), b->get_weight()); - } - } - - c.add_bucket(root); - } - - - if (1) { - vector wid; - for (int d=0; d<5; d++) - wid.push_back(10); - root = make_hierarchy(c, wid, ndisks, nbuckets); - } - - - - // rule - int numrep = 1; - - Rule rule; - if (0) { - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, -100)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); - } - if (1) { - /* - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, -4)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 2, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - */ - rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 1, 0)); - rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - } - - //c.overload[10] = .1; - - - int pg_per = 100; - int numpg = pg_per*ndisks/numrep; - - vector ocount(ndisks); - cout << ndisks << " disks, " << 1-nbuckets << " buckets" << endl; - cout << pg_per << " pgs per disk" << endl; - cout << numpg << " logical pgs" << endl; - cout << "numrep is " << numrep << endl; - - - int place = 1000000; - int times = place / numpg; - if (!times) times = 1; - - cout << "looping " << times << " times" << endl; - - float tvar = 0; - int tvarnum = 0; - - int x = 0; - for (int t=0; t v(numrep); - - for (int z=0; z -using namespace std; - -int print(string s) { - filepath fp = s; - cout << "s = " << s << " filepath = " << fp << endl; - cout << " depth " << fp.depth() << endl; - for (int i=0; i -#include -#include -using namespace std; - -#include "config.h" -#include "messages/MPing.h" -#include "common/Mutex.h" - -#include "msg/MPIMessenger.h" - -class Pinger : public Dispatcher { -public: - Messenger 
*messenger; - Pinger(Messenger *m) : messenger(m) { - m->set_dispatcher(this); - } - void dispatch(Message *m) { - //dout(1) << "got incoming " << m << endl; - delete m; - - } -}; - -int main(int argc, char **argv) { - int num = 1000; - - int myrank = mpimessenger_init(argc, argv); - int world = mpimessenger_world(); - - Pinger *p = new Pinger( new MPIMessenger(myrank) ); - - mpimessenger_start(); - - //while (1) { - for (int i=0; i<10000; i++) { - - // ping random nodes - int d = rand() % world; - if (d != myrank) { - //cout << "sending " << i << " to " << d << endl; - p->messenger->send_message(new MPing(), d); - } - - } - - - //cout << "shutting down" << endl; - //p->messenger->shutdown(); - - mpimessenger_wait(); - mpimessenger_shutdown(); // shutdown MPI -} diff --git a/trunk/ceph/test/testnewbuffers.cc b/trunk/ceph/test/testnewbuffers.cc deleted file mode 100644 index 0fea7571a4572..0000000000000 --- a/trunk/ceph/test/testnewbuffers.cc +++ /dev/null @@ -1,91 +0,0 @@ - -#include -#include -using namespace std; - - -#include "include/newbuffer.h" -//#include "include/bufferlist.h" - -#include "common/Thread.h" - - - class Th : public Thread { - public: - bufferlist bl; - Th(bufferlist& o) : bl(o) { } - - void *entry() { - //cout << "start" << endl; - // thrash it a bit. 
- for (int n=0; n<10000; n++) { - bufferlist bl2; - unsigned off = rand() % (bl.length() -1); - unsigned len = 1 + rand() % (bl.length() - off - 1); - bl2.substr_of(bl, off, len); - bufferlist bl3; - bl3.append(bl); - bl3.append(bl2); - //cout << bl3 << endl; - bl2.clear(); - bl3.clear(); - } - //cout << "end" << endl; - } - }; - -int main() -{ - - bufferptr p1 = buffer::copy("123456",7); - //bufferptr p1 = new buffer("123456",7); - bufferptr p2 = p1; - - cout << "p1 is '" << p1.c_str() << "'" << " " << p1 << endl; - cout << "p2 is '" << p2.c_str() << "'" << " " << p2 << endl; - - bufferptr p3 = buffer::copy("abcdef",7); - //bufferptr p3 = new buffer("abcdef",7); - - cout << "p3 is " << p3.c_str() << " " << p3 << endl; - - bufferlist bl; - bl.push_back(p2); - bl.push_back(p1); - bl.push_back(p3); - - cout << "bl is " << bl << endl; - - bufferlist took; - bl.splice(10,4,&took); - - cout << "took out " << took << ", leftover is " << bl << endl; - //cout << "len is " << bl.length() << endl; - - bufferlist bl2; - bl2.substr_of(bl, 3, 5); - cout << "bl2 is " << bl2 << endl; - - - cout << "bl before " << bl << endl; - - list ls; - for (int t=0; t<40; t++) { - Th *t = new Th(bl); - cout << "create" << endl; - t->create(); - ls.push_back(t); - } - - bl.clear(); - - while (!ls.empty()) { - cout << "join" << endl; - ls.front()->join(); - delete ls.front(); - ls.pop_front(); - } - - cout << "bl after " << bl << endl; - -} diff --git a/trunk/ceph/test/testos.cc b/trunk/ceph/test/testos.cc deleted file mode 100644 index 24c81590d899c..0000000000000 --- a/trunk/ceph/test/testos.cc +++ /dev/null @@ -1,343 +0,0 @@ -/* testos.cc -- simple ObjectStore test harness. - Copyright (C) 2007 Casey Marshall - -Ceph - scalable distributed file system - -This is free software; you can redistribute it and/or -modify it under the terms of the GNU Lesser General Public -License version 2.1, as published by the Free Software -Foundation. See file COPYING. 
*/ - - -#include "osd/ObjectStore.h" -#include "ebofs/Ebofs.h" -#include "osbdb/OSBDB.h" -#include "include/buffer.h" - -#include -#include -#include - -#include -#include - -using namespace std; - -static inline unsigned long long -to_usec (struct timeval &time) -{ - return (((unsigned long long) time.tv_sec * 1000000) - + ((unsigned long long) time.tv_usec)); -} - -static inline unsigned long long -to_msec (struct timeval &time) -{ - return (((unsigned long long) time.tv_sec * 1000) - + ((unsigned long long) time.tv_usec / 1000)); -} - -int main (int argc, char **argv) -{ - vector args; - char *osd_name = "ebofs"; - unsigned object_size = 1024; - unsigned object_count = 1024; - unsigned write_iter = 64; - unsigned random_seed = ::time(NULL); - char *device = "/tmp/testos"; - char *mountcmd = "mount /tmp/testos"; - char *umountcmd = "umount /tmp/testos"; - - bool ebofs_raw_device = false; - bool inhibit_remount = (getenv("TESTOS_INHIBIT_REMOUNT") != NULL); - - if (argc > 1 - && (strcmp (argv[1], "-h") == 0 - || strcmp (argv[1], "-help") == 0 - || strcmp (argv[1], "--help") == 0)) - { - cout << "usage: " << argv[0] << " [store [object-size [object-count [iterations [seed]]]]]" << endl; - cout << endl; - cout << "Where the arguments are:" << endl << endl; - cout << " store -- store type; default \"ebofs\"" << endl; - cout << " object-size -- size of objects; default 1024" << endl; - cout << " object-count -- number of objects to write; default 1024" - << endl; - cout << " iterations -- write the objects that many times; default 5" - << endl; - cout << " seed -- random seed; default current time" << endl; - exit (0); - } - - argv_to_vec (argc, argv, args); - for (vector::iterator it = args.begin(); it != args.end(); - it++) - cout << *it << " "; - cout << endl; - parse_config_options (args); - for (vector::iterator it = args.begin(); it != args.end(); - it++) - cout << *it << " "; - cout << endl; - - argc = args.size(); - if (argc > 0) - osd_name = args[0]; - if 
(argc > 1) - object_size = (unsigned) atol (args[1]); - if (argc > 2) - object_count = (unsigned) atol (args[2]); - if (argc > 3) - write_iter = (unsigned) atol (args[3]); - if (argc > 4) - random_seed = (unsigned) atol (args[4]); - - // algin object size to 'long' - object_size = ((object_size + (sizeof (long) - 1)) / sizeof (long)) * sizeof (long); - - char *osd_file = new char[32]; - strcpy (osd_file, "/tmp/testos/testos.XXXXXX"); - mktemp (osd_file); - - if (strcasecmp (osd_name, "ebofs") == 0) - { - char *dev_env = getenv ("TESTOS_EBOFS_DEV"); - if (dev_env != NULL) - { - // Assume it is a true device. - strncpy (osd_file, dev_env, 32); - inhibit_remount = true; - ebofs_raw_device = true; - } - } - - if (!inhibit_remount) - { - if (system (mountcmd) != 0) - { - cerr << "mount failed" << endl; - exit (1); - } - } - - ObjectStore *os = NULL; - if (strcasecmp (osd_name, "ebofs") == 0) - { - if (!ebofs_raw_device) - { - FILE *f = fopen (osd_file, "w"); - if (f == NULL) - { - cerr << "failed to open " << osd_file << ": " << strerror (errno) - << endl; - exit (1); - } - // 1G file. 
- fseek (f, 1024 * 1024 * 1024, SEEK_SET); - fputc ('\0', f); - fclose (f); - } - os = new Ebofs (osd_file); - } - else if (strcasecmp (osd_name, "osbdb") == 0) - { - os = new OSBDB (osd_file); - } - else if (strcasecmp (osd_name, "osbdb-btree") == 0) - { - g_conf.bdbstore_btree = true; - os = new OSBDB (osd_file); - } - else - { - cerr << "I don't know about object store \"" << osd_name << "\"" - << endl; - exit (1); - } - - cout << "Writing " << object_count << " objects of size " - << object_size << " to " << osd_name << endl; - - char *val = (char *) malloc (object_size); - char *val2 = (char *) malloc (object_size); - auto_ptr valptr (val); - auto_ptr valptr2(val2); - if (getenv ("TESTOS_UNALIGNED") != NULL) - { - val = val + 1; - val2 = val2 + 1; - } - - for (unsigned i = 0; i < object_size; i++) - { - val[i] = (char) i; - val2[i] = (char) i; - } - object_t *oids = new object_t[object_count]; - - utime_t writes[write_iter]; - utime_t total_write; - utime_t reads[write_iter]; - utime_t total_read; - for (unsigned i = 0; i < write_iter; i++) - { - cerr << "Iteration " << i << endl; - - int ret = os->mkfs(); - if (ret != 0) - { - cerr << "mkfs(" << osd_file << "): " << strerror (-ret) << endl; - exit (1); - } - ret = os->mount(); - if (ret != 0) - { - cerr << "mount(): " << strerror (-ret) << endl; - exit (1); - } - - srandom (random_seed + i); - - for (unsigned j = 0; j < object_count; j++) - { - oids[j].ino = (uint64_t) random() << 32 | random(); - oids[j].bno = random(); - } - - utime_t begin = g_clock.now(); - for (unsigned o = 0; o < object_count; o++) - { - bufferptr bp (val, object_size); - bufferlist bl; - bl.push_back (bp); - int ret; - if ((ret = os->write (oids[o], 0L, object_size, bl, NULL)) < 0) - cerr << "write " << oids[o] << " failed: " - << strerror (-ret) << endl; - } - os->sync(); - - utime_t end = g_clock.now() - begin; - - cerr << "Write finished in " << end << endl; - total_write += end; - writes[i] = end; - - os->umount(); - sync(); - - if 
(!inhibit_remount) - { - if (system (umountcmd) != 0) - { - cerr << "umount failed" << endl; - exit (1); - } - - if (system (mountcmd) != 0) - { - cerr << "mount(2) failed" << endl; - exit (1); - } - } - - os->mount(); - - // Shuffle the OIDs. - for (int j = 0; j < object_count; j++) - { - int x = random() % object_count; - if (x < 0) - x = -x; - object_t o = oids[j]; - oids[j] = oids[x]; - oids[x] = o; - } - - begin = g_clock.now(); - for (unsigned o = 0; o < object_count; o++) - { - bufferptr bp (val2, object_size); - bufferlist bl; - bl.push_back (bp); - - if (os->read (oids[o], 0L, object_size, bl) < 0) - { - cerr << "object " << oids[o] << " not found!" << endl; - } - } - end = g_clock.now() - begin; - - cerr << "Read finished in " << end << endl; - total_read += end; - reads[i] = end; - - os->umount(); - sync(); - - if (!inhibit_remount) - { - if (system (umountcmd) != 0) - { - cerr << "umount(2) failed" << endl; - exit (1); - } - - if (system (mountcmd) != 0) - { - cerr << "mount(3) failed" << endl; - exit (1); - } - } - } - - cerr << "Finished in " << (total_write + total_read) << endl; - - double write_mean = ((double) total_write) / ((double) write_iter); - double write_sd = 0.0; - for (unsigned i = 0; i < write_iter; i++) - { - double x = ((double) writes[i]) - write_mean; - write_sd += x * x; - } - write_sd = sqrt (write_sd / ((double) write_iter)); - - double read_mean = ((double) total_read) / ((double) write_iter); - double read_sd = 0.0; - for (unsigned i = 0; i < write_iter; i++) - { - double x = ((double) reads[i]) - read_mean; - write_sd += x * x; - } - read_sd = sqrt (read_sd / ((double) write_iter)); - - cout << "TESTOS: write " << osd_name << ":" << object_size << ":" - << object_count << ":" << write_iter << ":" << random_seed - << " -- " << write_mean << " " << write_sd << endl; - - cout << "TESTOS: write.raw -- "; - for (int i = 0; i < write_iter; i++) - cout << ((double) writes[i]) << " "; - cout << endl; - - cout << "TESTOS: read " << 
osd_name << ":" << object_size << ":" - << object_count << ":" << write_iter << ":" << random_seed - << " -- " << read_mean << " " << read_sd << endl; - - cout << "TESTOS: read.raw -- "; - for (int i = 0; i < write_iter; i++) - cout << ((double) reads[i]) << " "; - cout << endl; - - unlink (osd_file); - if (!inhibit_remount) - { - if (system (umountcmd) != 0) - { - cerr << "umount(3) failed" << endl; - exit (1); - } - } - exit (0); -} diff --git a/trunk/ceph/test/testosbdb.cc b/trunk/ceph/test/testosbdb.cc deleted file mode 100644 index 19268e7587531..0000000000000 --- a/trunk/ceph/test/testosbdb.cc +++ /dev/null @@ -1,347 +0,0 @@ -/* testosbdb.cc -- test OSBDB. - Copyright (C) 2007 Casey Marshall */ - - -#include -#include "osbdb/OSBDB.h" - -using namespace std; - -int -main (int argc, char **argv) -{ - vector args; - argv_to_vec (argc, argv, args); - parse_config_options (args); - - g_conf.debug_bdbstore = 10; - //g_conf.bdbstore_btree = true; - char dbfile[256]; - strncpy (dbfile, "/tmp/testosbdb/db.XXXXXX", 256); - mktemp (dbfile); - OSBDB *os = new OSBDB(dbfile); - auto_ptr osPtr (os); - os->mkfs(); - os->mount(); - - // Put an object. - object_t oid (0xDEADBEEF00000000ULL, 0xFEEDFACE); - - cout << "sizeof oid_t is " << sizeof (oid_t) << endl; - cout << "offsetof oid_t.id " << offsetof (oid_t, id) << endl; - - cout << sizeof (object_t) << endl; - cout << sizeof (oid.ino) << endl; - cout << sizeof (oid.bno) << endl; - cout << sizeof (oid.rev) << endl; - - // Shouldn't be there. - if (os->exists (oid)) - { - cout << "FAIL: oid shouldn't be there " << oid << endl; - } - - // Write an object. - char *x = (char *) malloc (1024); - memset(x, 0xaa, 1024); - bufferptr bp (x, 1024); - bufferlist bl; - bl.push_back (bp); - - if (os->write (oid, 0L, 1024, bl, NULL) != 1024) - { - cout << "FAIL: writing object" << endl; - } - - os->sync(); - - // Should be there. 
- if (!os->exists (oid)) - { - cout << "FAIL: oid should be there: " << oid << endl; - } - - memset(x, 0, 1024); - if (os->read (oid, 0, 1024, bl) != 1024) - { - cout << "FAIL: reading object" << endl; - } - - for (int i = 0; i < 1024; i++) - { - if ((x[i] & 0xFF) != 0xaa) - { - cout << "FAIL: data read out is different" << endl; - break; - } - } - - // Set some attributes - if (os->setattr (oid, "alpha", "value", strlen ("value")) != 0) - { - cout << "FAIL: set attribute" << endl; - } - if (os->setattr (oid, "beta", "value", strlen ("value")) != 0) - { - cout << "FAIL: set attribute" << endl; - } - if (os->setattr (oid, "gamma", "value", strlen ("value")) != 0) - { - cout << "FAIL: set attribute" << endl; - } - if (os->setattr (oid, "fred", "value", strlen ("value")) != 0) - { - cout << "FAIL: set attribute" << endl; - } - - char *attrs = (char *) malloc (1024); - if (os->listattr (oid, attrs, 1024) != 0) - { - cout << "FAIL: listing attributes" << endl; - } - else - { - char *p = attrs; - if (strcmp (p, "alpha") != 0) - { - cout << "FAIL: should be \"alpha:\" \"" << p << "\"" << endl; - } - p = p + strlen (p) + 1; - if (strcmp (p, "beta") != 0) - { - cout << "FAIL: should be \"beta:\" \"" << p << "\"" << endl; - } - p = p + strlen (p) + 1; - if (strcmp (p, "fred") != 0) - { - cout << "FAIL: should be \"fred:\" \"" << p << "\"" << endl; - } - p = p + strlen (p) + 1; - if (strcmp (p, "gamma") != 0) - { - cout << "FAIL: should be \"gamma:\" \"" << p << "\"" << endl; - } - } - - char attrvalue[256]; - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "alpha", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr alpha" << endl; - } - else if (strncmp ("value", attrvalue, strlen("value")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "fred", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr fred" << endl; - } - else if (strncmp ("value", 
attrvalue, strlen("value")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "beta", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr beta" << endl; - } - else if (strncmp ("value", attrvalue, strlen("value")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "gamma", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr gamma" << endl; - } - else if (strncmp ("value", attrvalue, strlen("value")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - - if (os->setattr (oid, "alpha", "different", strlen("different")) != 0) - cout << "FAIL: setattr overwrite" << endl; - memset(attrvalue, 0, sizeof (attrvalue)); - if (os->getattr (oid, "alpha", attrvalue, sizeof(attrvalue)) < 0) - { - cout << "FAIL: getattr alpha" << endl; - } - else if (strncmp ("different", attrvalue, strlen("different")) != 0) - { - cout << "FAIL: read attribute value differs" << endl; - } - - if (os->rmattr (oid, "alpha") != 0) - { - cout << "FAIL: rmattr alpha" << endl; - } - if (os->rmattr (oid, "fred") != 0) - { - cout << "FAIL: rmattr fred" << endl; - } - if (os->rmattr (oid, "beta") != 0) - { - cout << "FAIL: rmattr beta" << endl; - } - if (os->rmattr (oid, "gamma") != 0) - { - cout << "FAIL: rmattr gamma" << endl; - } - - coll_t cid = 0xCAFEBABE; - if (os->create_collection (cid) != 0) - { - cout << "FAIL: create_collection" << endl; - } - if (os->create_collection (cid + 10) != 0) - { - cout << "FAIL: create_collection" << endl; - } - if (os->create_collection (cid + 5) != 0) - { - cout << "FAIL: create_collection" << endl; - } - if (os->create_collection (42) != 0) - { - cout << "FAIL: create_collection" << endl; - } - - if (os->collection_add (cid, oid) != 0) - { - cout << "FAIL: collection_add" << endl; - } - - list ls; - if (os->list_collections (ls) < 0) - { - cout << 
"FAIL: list_collections" << endl; - } - cout << "collections: "; - for (list::iterator it = ls.begin(); it != ls.end(); it++) - { - cout << *it << ", "; - } - cout << endl; - - if (os->destroy_collection (0xCAFEBABE + 10) != 0) - { - cout << "FAIL: destroy_collection" << endl; - } - - if (os->destroy_collection (0xCAFEBADE + 10) == 0) - { - cout << "FAIL: destroy_collection" << endl; - } - - object_t oid2 (12345, 12345); - for (int i = 0; i < 8; i++) - { - oid2.rev++; - if (os->collection_add (cid, oid2) != 0) - { - cout << "FAIL: collection_add" << endl; - } - } - for (int i = 0; i < 8; i++) - { - if (os->collection_remove (cid, oid2) != 0) - { - cout << "FAIL: collection_remove" << endl; - } - oid2.rev--; - } - - if (os->collection_setattr (cid, "alpha", "value", 5) != 0) - cout << "FAIL: collection_setattr" << endl; - if (os->collection_setattr (cid, "beta", "value", 5) != 0) - cout << "FAIL: collection_setattr" << endl; - if (os->collection_setattr (cid, "gamma", "value", 5) != 0) - cout << "FAIL: collection_setattr" << endl; - if (os->collection_setattr (cid, "fred", "value", 5) != 0) - cout << "FAIL: collection_setattr" << endl; - - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "alpha", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "beta", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "gamma", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << 
endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "fred", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - - if (os->collection_setattr (cid, "alpha", "eulavvalue", 10) != 0) - cout << "FAIL: collection setattr overwrite" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "alpha", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "eulavvalue", 10) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "beta", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "gamma", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - memset (attrvalue, 0, sizeof (attrvalue)); - if (os->collection_getattr (cid, "fred", attrvalue, sizeof (attrvalue)) < 0) - cout << "FAIL: collection_getattr" << endl; - else if (strncmp (attrvalue, "value", 5) != 0) - cout << "FAIL: collection attribute value different" << endl; - - if (os->collection_rmattr (cid, "alpha") != 0) - cout << "FAIL: collection_rmattr" << endl; - if (os->collection_rmattr (cid, "fred") != 0) - cout << "FAIL: collection_rmattr" << endl; - if (os->collection_rmattr (cid, "beta") != 0) - cout << "FAIL: collection_rmattr" << endl; - if (os->collection_rmattr (cid, "gamma") != 0) - cout << "FAIL: collection_rmattr" << endl; - - if (os->collection_rmattr (cid, "alpha") == 0) - cout << "FAIL: 
collection_rmattr (nonexistent)" << endl; - - // Truncate the object. - if (os->truncate (oid, 512, NULL) != 0) - { - cout << "FAIL: truncate" << endl; - } - - // Expand the object. - if (os->truncate (oid, 1200, NULL) != 0) - { - cout << "FAIL: expand" << endl; - } - - // Delete the object. - if (os->remove (oid) != 0) - { - cout << "FAIL: could not remove object" << endl; - } - - // Shouldn't be there - if (os->exists (oid)) - { - cout << "FAIL: should not be there" << endl; - } - - os->sync(); - exit (0); -} diff --git a/trunk/ceph/test/testtree.cc b/trunk/ceph/test/testtree.cc deleted file mode 100644 index 2c21bcbe52e25..0000000000000 --- a/trunk/ceph/test/testtree.cc +++ /dev/null @@ -1,46 +0,0 @@ - - -#include "../crush/BinaryTree.h" -using namespace crush; - -#include -#include -using namespace std; - -int main() -{ - BinaryTree t; - - vector nodes; - - for (int i=0; i<30; i++) { - cout << "adding " << i << endl; - int n = t.add_node(1); - nodes.push_back(n); - //cout << t << endl; - } - cout << t << endl; - - for (int k=0; k<10000; k++) { - if (rand() % 2) { - cout << "adding" << endl; - nodes.push_back( t.add_node(1) ); - } else { - if (!nodes.empty()) { - //for (int i=0; i -using namespace std; - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -int main(int argc, char**argv) -{ - int a = 1; - int b = 2; - - mknod("test", 0600, 0); - - cout << "setxattr " << setxattr("test", "asdf", &a, sizeof(a), 0) << endl; - cout << "errno " << errno << " " << strerror(errno) << endl; - cout << "getxattr " << getxattr("test", "asdf", &b, sizeof(b)) << endl; - cout << "errno " << errno << " " << strerror(errno) << endl; - cout << "a is " << a << " and b is " << b << endl; - return 0; -} diff --git a/trunk/web/Makefile b/web/Makefile similarity index 100% rename from trunk/web/Makefile rename to web/Makefile diff --git a/trunk/web/ceph.css b/web/ceph.css similarity index 100% rename from trunk/web/ceph.css 
rename to web/ceph.css diff --git a/trunk/web/gen.pl b/web/gen.pl similarity index 100% rename from trunk/web/gen.pl rename to web/gen.pl diff --git a/trunk/web/images/ceph-architecture.png b/web/images/ceph-architecture.png similarity index 100% rename from trunk/web/images/ceph-architecture.png rename to web/images/ceph-architecture.png diff --git a/trunk/web/images/ceph-logo1.jpg b/web/images/ceph-logo1.jpg similarity index 100% rename from trunk/web/images/ceph-logo1.jpg rename to web/images/ceph-logo1.jpg diff --git a/trunk/web/index.body b/web/index.body similarity index 100% rename from trunk/web/index.body rename to web/index.body diff --git a/trunk/web/overview.body b/web/overview.body similarity index 100% rename from trunk/web/overview.body rename to web/overview.body diff --git a/trunk/web/publications.body b/web/publications.body similarity index 100% rename from trunk/web/publications.body rename to web/publications.body diff --git a/trunk/web/source.body b/web/source.body similarity index 100% rename from trunk/web/source.body rename to web/source.body diff --git a/trunk/web/tasks.body b/web/tasks.body similarity index 100% rename from trunk/web/tasks.body rename to web/tasks.body diff --git a/trunk/web/template.html b/web/template.html similarity index 100% rename from trunk/web/template.html rename to web/template.html